From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Fri, 19 Apr 2024 02:47:55 +0200
Subject: Adding upstream version 124.0.1.

Signed-off-by: Daniel Baumann
---
 third_party/aom/.clang-format | 9 + third_party/aom/.cmake-format.py | 102 + third_party/aom/.mailmap | 113 + third_party/aom/AUTHORS | 319 + third_party/aom/CHANGELOG | 828 ++ third_party/aom/CMakeLists.txt | 1035 ++ third_party/aom/LICENSE | 27 + third_party/aom/PATENTS | 108 + third_party/aom/README.md | 677 + third_party/aom/Sample.cfg | 35 + third_party/aom/aom/aom.h | 127 + third_party/aom/aom/aom_codec.h | 577 + third_party/aom/aom/aom_decoder.h | 257 + third_party/aom/aom/aom_encoder.h | 1144 ++ third_party/aom/aom/aom_external_partition.h | 452 + third_party/aom/aom/aom_frame_buffer.h | 84 + third_party/aom/aom/aom_image.h | 448 + third_party/aom/aom/aom_integer.h | 68 + third_party/aom/aom/aomcx.h | 2205 ++++ third_party/aom/aom/aomdx.h | 604 + third_party/aom/aom/exports_com | 42 + third_party/aom/aom/exports_dec | 8 + third_party/aom/aom/exports_enc | 17 + third_party/aom/aom/exports_test | 4 + third_party/aom/aom/internal/aom_codec_internal.h | 418 + third_party/aom/aom/internal/aom_image_internal.h | 93 + third_party/aom/aom/src/aom_codec.c | 199 + third_party/aom/aom/src/aom_decoder.c | 137 + third_party/aom/aom/src/aom_encoder.c | 333 + third_party/aom/aom/src/aom_image.c | 401 + third_party/aom/aom/src/aom_integer.c | 105 + third_party/aom/aom_dsp/aom_convolve.c | 261 + third_party/aom/aom_dsp/aom_dsp.cmake | 510 + third_party/aom/aom_dsp/aom_dsp_common.h | 99 + third_party/aom/aom_dsp/aom_dsp_rtcd.c | 18 + third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 1798 +++ third_party/aom/aom_dsp/aom_filter.h | 56 + third_party/aom/aom_dsp/aom_simd.h | 36 + third_party/aom/aom_dsp/aom_simd_inline.h | 24 + third_party/aom/aom_dsp/arm/aom_convolve8_neon.c | 349 + .../aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c | 460 + .../aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c | 408 + .../aom/aom_dsp/arm/aom_convolve_copy_neon.c | 154 + third_party/aom/aom_dsp/arm/avg_neon.c | 309 + third_party/aom/aom_dsp/arm/avg_pred_neon.c | 221 + third_party/aom/aom_dsp/arm/avg_sve.c | 62 + third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c | 492 + third_party/aom/aom_dsp/arm/blend_neon.h | 125 + third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c | 124 + third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c | 106 + third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h | 65 + third_party/aom/aom_dsp/arm/dot_sve.h | 42 + third_party/aom/aom_dsp/arm/fwd_txfm_neon.c | 304 + third_party/aom/aom_dsp/arm/hadamard_neon.c | 325 + third_party/aom/aom_dsp/arm/highbd_avg_neon.c | 125 + third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c | 190 + .../aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c | 97 + .../aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c | 473 + .../aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c | 105 + .../aom/aom_dsp/arm/highbd_convolve8_neon.c | 363 + third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c | 213 + .../aom/aom_dsp/arm/highbd_intrapred_neon.c | 2730 ++++ .../aom/aom_dsp/arm/highbd_loopfilter_neon.c | 1265 ++ .../aom/aom_dsp/arm/highbd_masked_sad_neon.c | 354 + third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c | 211 + .../aom/aom_dsp/arm/highbd_obmc_variance_neon.c | 369 + third_party/aom/aom_dsp/arm/highbd_quantize_neon.c | 431 + third_party/aom/aom_dsp/arm/highbd_sad_neon.c | 509 + third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c | 617 + third_party/aom/aom_dsp/arm/highbd_sse_neon.c | 284 +
third_party/aom/aom_dsp/arm/highbd_sse_sve.c | 215 + .../aom/aom_dsp/arm/highbd_subpel_variance_neon.c | 1497 +++ third_party/aom/aom_dsp/arm/highbd_variance_neon.c | 502 + .../aom/aom_dsp/arm/highbd_variance_neon_dotprod.c | 92 + third_party/aom/aom_dsp/arm/highbd_variance_sve.c | 430 + third_party/aom/aom_dsp/arm/intrapred_neon.c | 3110 +++++ third_party/aom/aom_dsp/arm/loopfilter_neon.c | 1045 ++ third_party/aom/aom_dsp/arm/masked_sad4d_neon.c | 562 + third_party/aom/aom_dsp/arm/masked_sad_neon.c | 244 + third_party/aom/aom_dsp/arm/mem_neon.h | 1253 ++ third_party/aom/aom_dsp/arm/obmc_sad_neon.c | 250 + third_party/aom/aom_dsp/arm/obmc_variance_neon.c | 290 + third_party/aom/aom_dsp/arm/reinterpret_neon.h | 33 + third_party/aom/aom_dsp/arm/sad_neon.c | 873 ++ third_party/aom/aom_dsp/arm/sad_neon_dotprod.c | 530 + third_party/aom/aom_dsp/arm/sadxd_neon.c | 514 + third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c | 289 + third_party/aom/aom_dsp/arm/sse_neon.c | 210 + third_party/aom/aom_dsp/arm/sse_neon_dotprod.c | 223 + third_party/aom/aom_dsp/arm/subpel_variance_neon.c | 1103 ++ third_party/aom/aom_dsp/arm/subtract_neon.c | 166 + third_party/aom/aom_dsp/arm/sum_neon.h | 311 + third_party/aom/aom_dsp/arm/sum_squares_neon.c | 574 + .../aom/aom_dsp/arm/sum_squares_neon_dotprod.c | 154 + third_party/aom/aom_dsp/arm/sum_squares_sve.c | 402 + third_party/aom/aom_dsp/arm/transpose_neon.h | 1263 ++ third_party/aom/aom_dsp/arm/variance_neon.c | 470 + .../aom/aom_dsp/arm/variance_neon_dotprod.c | 314 + third_party/aom/aom_dsp/avg.c | 573 + third_party/aom/aom_dsp/binary_codes_reader.c | 55 + third_party/aom/aom_dsp/binary_codes_reader.h | 44 + third_party/aom/aom_dsp/binary_codes_writer.c | 137 + third_party/aom/aom_dsp/binary_codes_writer.h | 65 + third_party/aom/aom_dsp/bitreader.c | 41 + third_party/aom/aom_dsp/bitreader.h | 232 + third_party/aom/aom_dsp/bitreader_buffer.c | 116 + third_party/aom/aom_dsp/bitreader_buffer.h | 53 + third_party/aom/aom_dsp/bitwriter.c | 40 + third_party/aom/aom_dsp/bitwriter.h | 110 + third_party/aom/aom_dsp/bitwriter_buffer.c | 141 + third_party/aom/aom_dsp/bitwriter_buffer.h | 55 + third_party/aom/aom_dsp/blend.h | 45 + third_party/aom/aom_dsp/blend_a64_hmask.c | 71 + third_party/aom/aom_dsp/blend_a64_mask.c | 349 + third_party/aom/aom_dsp/blend_a64_vmask.c | 73 + third_party/aom/aom_dsp/blk_sse_sum.c | 26 + third_party/aom/aom_dsp/butteraugli.c | 109 + third_party/aom/aom_dsp/butteraugli.h | 23 + third_party/aom/aom_dsp/entcode.c | 49 + third_party/aom/aom_dsp/entcode.h | 41 + third_party/aom/aom_dsp/entdec.c | 247 + third_party/aom/aom_dsp/entdec.h | 81 + third_party/aom/aom_dsp/entenc.c | 374 + third_party/aom/aom_dsp/entenc.h | 108 + third_party/aom/aom_dsp/fastssim.c | 488 + third_party/aom/aom_dsp/fft.c | 220 + third_party/aom/aom_dsp/fft_common.h | 1056 ++ .../aom/aom_dsp/flow_estimation/arm/disflow_neon.c | 368 + .../aom/aom_dsp/flow_estimation/corner_detect.c | 167 + .../aom/aom_dsp/flow_estimation/corner_detect.h | 80 + .../aom/aom_dsp/flow_estimation/corner_match.c | 259 + .../aom/aom_dsp/flow_estimation/corner_match.h | 41 + third_party/aom/aom_dsp/flow_estimation/disflow.c | 823 ++ third_party/aom/aom_dsp/flow_estimation/disflow.h | 106 + .../aom/aom_dsp/flow_estimation/flow_estimation.c | 60 + .../aom/aom_dsp/flow_estimation/flow_estimation.h | 95 + third_party/aom/aom_dsp/flow_estimation/ransac.c | 484 + third_party/aom/aom_dsp/flow_estimation/ransac.h | 35 + .../flow_estimation/x86/corner_match_avx2.c | 80 + 
.../flow_estimation/x86/corner_match_sse4.c | 104 + .../aom/aom_dsp/flow_estimation/x86/disflow_sse4.c | 558 + third_party/aom/aom_dsp/fwd_txfm.c | 245 + third_party/aom/aom_dsp/grain_params.h | 158 + third_party/aom/aom_dsp/grain_table.c | 358 + third_party/aom/aom_dsp/grain_table.h | 102 + third_party/aom/aom_dsp/intrapred.c | 793 ++ third_party/aom/aom_dsp/intrapred_common.h | 59 + third_party/aom/aom_dsp/loopfilter.c | 997 ++ third_party/aom/aom_dsp/mathutils.h | 145 + third_party/aom/aom_dsp/noise_model.c | 1692 +++ third_party/aom/aom_dsp/noise_model.h | 328 + third_party/aom/aom_dsp/noise_util.c | 225 + third_party/aom/aom_dsp/noise_util.h | 68 + third_party/aom/aom_dsp/odintrin.c | 541 + third_party/aom/aom_dsp/odintrin.h | 81 + third_party/aom/aom_dsp/prob.h | 144 + third_party/aom/aom_dsp/psnr.c | 454 + third_party/aom/aom_dsp/psnr.h | 96 + third_party/aom/aom_dsp/psnrhvs.c | 282 + third_party/aom/aom_dsp/pyramid.c | 414 + third_party/aom/aom_dsp/pyramid.h | 127 + third_party/aom/aom_dsp/quantize.c | 472 + third_party/aom/aom_dsp/quantize.h | 127 + third_party/aom/aom_dsp/recenter.h | 61 + third_party/aom/aom_dsp/rect.h | 35 + third_party/aom/aom_dsp/sad.c | 389 + third_party/aom/aom_dsp/sad_av1.c | 266 + third_party/aom/aom_dsp/simd/v128_intrinsics.h | 346 + third_party/aom/aom_dsp/simd/v128_intrinsics_c.h | 898 ++ third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h | 659 + third_party/aom/aom_dsp/simd/v256_intrinsics.h | 377 + third_party/aom/aom_dsp/simd/v256_intrinsics_c.h | 963 ++ .../aom/aom_dsp/simd/v256_intrinsics_v128.h | 806 ++ third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h | 754 ++ third_party/aom/aom_dsp/simd/v64_intrinsics.h | 234 + third_party/aom/aom_dsp/simd/v64_intrinsics_c.h | 966 ++ third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h | 489 + third_party/aom/aom_dsp/sse.c | 59 + third_party/aom/aom_dsp/ssim.c | 481 + third_party/aom/aom_dsp/ssim.h | 104 + third_party/aom/aom_dsp/subtract.c | 54 + third_party/aom/aom_dsp/sum_squares.c | 90 + third_party/aom/aom_dsp/txfm_common.h | 155 + third_party/aom/aom_dsp/variance.c | 1234 ++ third_party/aom/aom_dsp/variance.h | 127 + third_party/aom/aom_dsp/vmaf.c | 192 + third_party/aom/aom_dsp/vmaf.h | 41 + .../aom/aom_dsp/x86/adaptive_quantize_avx2.c | 244 + .../aom/aom_dsp/x86/adaptive_quantize_sse2.c | 633 + third_party/aom/aom_dsp/x86/aom_asm_stubs.c | 95 + .../aom/aom_dsp/x86/aom_convolve_copy_avx2.c | 256 + .../aom/aom_dsp/x86/aom_convolve_copy_sse2.c | 308 + .../aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 613 + .../x86/aom_high_subpixel_bilinear_sse2.asm | 367 + third_party/aom/aom_dsp/x86/aom_quantize_avx.c | 282 + .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 1441 +++ .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c | 569 + .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 847 ++ .../aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm | 615 + .../aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 870 ++ .../aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm | 295 + .../aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm | 267 + third_party/aom/aom_dsp/x86/avg_intrin_avx2.c | 897 ++ third_party/aom/aom_dsp/x86/avg_intrin_sse2.c | 700 + third_party/aom/aom_dsp/x86/avg_intrin_sse4.c | 59 + .../aom/aom_dsp/x86/bitdepth_conversion_avx2.h | 32 + .../aom/aom_dsp/x86/bitdepth_conversion_sse2.h | 49 + third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c | 36 + third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c | 1374 ++ third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c | 1560 +++ third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c | 285 + 
third_party/aom/aom_dsp/x86/blend_mask_sse4.h | 237 + third_party/aom/aom_dsp/x86/blend_sse4.h | 191 + third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c | 185 + third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c | 138 + third_party/aom/aom_dsp/x86/common_avx2.h | 147 + third_party/aom/aom_dsp/x86/convolve.h | 204 + third_party/aom/aom_dsp/x86/convolve_avx2.h | 922 ++ .../aom/aom_dsp/x86/convolve_common_intrin.h | 102 + third_party/aom/aom_dsp/x86/convolve_sse2.h | 122 + third_party/aom/aom_dsp/x86/convolve_sse4_1.h | 53 + third_party/aom/aom_dsp/x86/convolve_ssse3.h | 50 + third_party/aom/aom_dsp/x86/fft_avx2.c | 74 + third_party/aom/aom_dsp/x86/fft_sse2.c | 173 + third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h | 529 + third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c | 39 + third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h | 160 + .../aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 379 + .../aom_dsp/x86/highbd_adaptive_quantize_avx2.c | 456 + .../aom_dsp/x86/highbd_adaptive_quantize_sse2.c | 732 ++ third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c | 1248 ++ third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c | 351 + .../aom/aom_dsp/x86/highbd_convolve_ssse3.c | 439 + .../aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm | 259 + .../aom/aom_dsp/x86/highbd_intrapred_sse2.c | 984 ++ .../aom/aom_dsp/x86/highbd_loopfilter_avx2.c | 66 + .../aom/aom_dsp/x86/highbd_loopfilter_sse2.c | 1698 +++ .../aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c | 294 + .../aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 208 + third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm | 344 + third_party/aom/aom_dsp/x86/highbd_sad_avx2.c | 720 ++ third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm | 524 + .../x86/highbd_subpel_variance_impl_sse2.asm | 1024 ++ third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c | 266 + third_party/aom/aom_dsp/x86/highbd_variance_avx2.c | 904 ++ .../aom/aom_dsp/x86/highbd_variance_impl_sse2.asm | 318 + third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 735 ++ third_party/aom/aom_dsp/x86/highbd_variance_sse4.c | 216 + third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm | 608 + third_party/aom/aom_dsp/x86/intrapred_avx2.c | 4707 +++++++ third_party/aom/aom_dsp/x86/intrapred_sse2.c | 1411 ++ third_party/aom/aom_dsp/x86/intrapred_sse4.c | 1307 ++ third_party/aom/aom_dsp/x86/intrapred_ssse3.c | 2997 +++++ third_party/aom/aom_dsp/x86/intrapred_utils.h | 205 + third_party/aom/aom_dsp/x86/intrapred_x86.h | 38 + third_party/aom/aom_dsp/x86/inv_wht_sse2.asm | 107 + third_party/aom/aom_dsp/x86/jnt_sad_sse2.c | 238 + third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c | 161 + third_party/aom/aom_dsp/x86/loopfilter_avx2.c | 1016 ++ third_party/aom/aom_dsp/x86/loopfilter_sse2.c | 2973 +++++ third_party/aom/aom_dsp/x86/lpf_common_sse2.h | 721 ++ third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c | 266 + .../aom/aom_dsp/x86/masked_sad_intrin_avx2.c | 389 + .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 400 + .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.h | 33 + .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 1067 ++ .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.h | 92 + third_party/aom/aom_dsp/x86/mem_sse2.h | 167 + third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h | 58 + third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h | 54 + third_party/aom/aom_dsp/x86/obmc_sad_avx2.c | 271 + third_party/aom/aom_dsp/x86/obmc_sad_sse4.c | 269 + third_party/aom/aom_dsp/x86/obmc_variance_avx2.c | 191 + third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 382 + third_party/aom/aom_dsp/x86/quantize_avx2.c | 274 + 
third_party/aom/aom_dsp/x86/quantize_sse2.c | 125 + third_party/aom/aom_dsp/x86/quantize_ssse3.c | 192 + .../aom/aom_dsp/x86/quantize_ssse3_x86_64.asm | 302 + third_party/aom/aom_dsp/x86/quantize_x86.h | 202 + third_party/aom/aom_dsp/x86/sad4d_avx2.c | 326 + third_party/aom/aom_dsp/x86/sad4d_sse2.asm | 437 + third_party/aom/aom_dsp/x86/sad_avx2.c | 219 + third_party/aom/aom_dsp/x86/sad_impl_avx2.c | 181 + third_party/aom/aom_dsp/x86/sad_sse2.asm | 432 + third_party/aom/aom_dsp/x86/sse_avx2.c | 389 + third_party/aom/aom_dsp/x86/sse_sse4.c | 355 + third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm | 222 + .../aom/aom_dsp/x86/subpel_variance_sse2.asm | 1470 +++ third_party/aom/aom_dsp/x86/subtract_avx2.c | 109 + third_party/aom/aom_dsp/x86/subtract_sse2.asm | 147 + third_party/aom/aom_dsp/x86/sum_squares_avx2.c | 326 + third_party/aom/aom_dsp/x86/sum_squares_sse2.c | 478 + third_party/aom/aom_dsp/x86/sum_squares_sse2.h | 28 + third_party/aom/aom_dsp/x86/synonyms.h | 134 + third_party/aom/aom_dsp/x86/synonyms_avx2.h | 79 + third_party/aom/aom_dsp/x86/transpose_sse2.h | 424 + third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 357 + third_party/aom/aom_dsp/x86/txfm_common_sse2.h | 33 + third_party/aom/aom_dsp/x86/variance_avx2.c | 961 ++ third_party/aom/aom_dsp/x86/variance_impl_avx2.c | 924 ++ third_party/aom/aom_dsp/x86/variance_impl_ssse3.c | 129 + third_party/aom/aom_dsp/x86/variance_sse2.c | 802 ++ third_party/aom/aom_mem/aom_mem.c | 82 + third_party/aom/aom_mem/aom_mem.cmake | 29 + third_party/aom/aom_mem/aom_mem.h | 80 + third_party/aom/aom_mem/include/aom_mem_intrnl.h | 29 + third_party/aom/aom_ports/aarch32_cpudetect.c | 86 + third_party/aom/aom_ports/aarch64_cpudetect.c | 188 + third_party/aom/aom_ports/aom_once.h | 83 + third_party/aom/aom_ports/aom_ports.cmake | 96 + third_party/aom/aom_ports/aom_timer.h | 113 + third_party/aom/aom_ports/arm.h | 45 + third_party/aom/aom_ports/arm_cpudetect.h | 54 + third_party/aom/aom_ports/bitops.h | 122 + third_party/aom/aom_ports/emmintrin_compat.h | 56 + third_party/aom/aom_ports/float.asm | 33 + third_party/aom/aom_ports/mem.h | 102 + third_party/aom/aom_ports/mem_ops.h | 228 + third_party/aom/aom_ports/mem_ops_aligned.h | 173 + third_party/aom/aom_ports/msvc.h | 75 + third_party/aom/aom_ports/ppc.h | 30 + third_party/aom/aom_ports/ppc_cpudetect.c | 82 + third_party/aom/aom_ports/sanitizer.h | 38 + third_party/aom/aom_ports/x86.h | 402 + third_party/aom/aom_ports/x86_abi_support.asm | 416 + third_party/aom/aom_scale/aom_scale.cmake | 37 + third_party/aom/aom_scale/aom_scale.h | 23 + third_party/aom/aom_scale/aom_scale_rtcd.c | 18 + third_party/aom/aom_scale/aom_scale_rtcd.pl | 55 + third_party/aom/aom_scale/generic/aom_scale.c | 506 + third_party/aom/aom_scale/generic/gen_scalers.c | 201 + third_party/aom/aom_scale/generic/yv12config.c | 312 + third_party/aom/aom_scale/generic/yv12extend.c | 517 + third_party/aom/aom_scale/yv12config.h | 214 + third_party/aom/aom_util/aom_thread.c | 240 + third_party/aom/aom_util/aom_thread.h | 236 + third_party/aom/aom_util/aom_util.cmake | 34 + third_party/aom/aom_util/debug_util.c | 293 + third_party/aom/aom_util/debug_util.h | 69 + third_party/aom/aom_util/endian_inl.h | 109 + third_party/aom/aomedia_logo_200.png | Bin 0 -> 7052 bytes third_party/aom/apps/aomdec.c | 1088 ++ third_party/aom/apps/aomenc.c | 2688 ++++ third_party/aom/apps/aomenc.h | 59 + third_party/aom/av1/arg_defs.c | 702 + third_party/aom/av1/arg_defs.h | 246 + third_party/aom/av1/av1.cmake | 715 + third_party/aom/av1/av1_cx_iface.c | 4712 
+++++++ third_party/aom/av1/av1_cx_iface.h | 37 + third_party/aom/av1/av1_dx_iface.c | 1777 +++ third_party/aom/av1/av1_iface_common.h | 148 + third_party/aom/av1/common/alloccommon.c | 506 + third_party/aom/av1/common/alloccommon.h | 65 + third_party/aom/av1/common/arm/av1_inv_txfm_neon.c | 4217 ++++++ third_party/aom/av1/common/arm/av1_inv_txfm_neon.h | 154 + third_party/aom/av1/common/arm/av1_txfm_neon.c | 30 + .../aom/av1/common/arm/blend_a64_hmask_neon.c | 102 + .../aom/av1/common/arm/blend_a64_vmask_neon.c | 112 + third_party/aom/av1/common/arm/cdef_block_neon.c | 1355 ++ third_party/aom/av1/common/arm/cfl_neon.c | 589 + .../aom/av1/common/arm/compound_convolve_neon.c | 2719 ++++ .../aom/av1/common/arm/compound_convolve_neon.h | 1164 ++ .../common/arm/compound_convolve_neon_dotprod.c | 675 + .../av1/common/arm/compound_convolve_neon_i8mm.c | 614 + third_party/aom/av1/common/arm/convolve_neon.c | 1659 +++ third_party/aom/av1/common/arm/convolve_neon.h | 538 + .../aom/av1/common/arm/convolve_neon_dotprod.c | 793 ++ .../aom/av1/common/arm/convolve_neon_i8mm.c | 702 + .../av1/common/arm/highbd_compound_convolve_neon.c | 2031 +++ .../av1/common/arm/highbd_convolve_horiz_rs_neon.c | 273 + .../aom/av1/common/arm/highbd_convolve_neon.c | 2120 +++ .../aom/av1/common/arm/highbd_convolve_neon.h | 148 + .../av1/common/arm/highbd_convolve_scale_neon.c | 552 + .../aom/av1/common/arm/highbd_inv_txfm_neon.c | 5994 +++++++++ .../aom/av1/common/arm/highbd_reconinter_neon.c | 327 + .../aom/av1/common/arm/highbd_reconintra_neon.c | 241 + .../aom/av1/common/arm/highbd_warp_plane_neon.c | 317 + .../aom/av1/common/arm/highbd_warp_plane_neon.h | 424 + .../av1/common/arm/highbd_wiener_convolve_neon.c | 403 + third_party/aom/av1/common/arm/reconinter_neon.c | 217 + third_party/aom/av1/common/arm/reconintra_neon.c | 392 + third_party/aom/av1/common/arm/resize_neon.c | 1178 ++ third_party/aom/av1/common/arm/selfguided_neon.c | 1595 +++ third_party/aom/av1/common/arm/warp_plane_neon.c | 276 + third_party/aom/av1/common/arm/warp_plane_neon.h | 367 + .../aom/av1/common/arm/warp_plane_neon_i8mm.c | 291 + third_party/aom/av1/common/arm/warp_plane_sve.c | 284 + .../aom/av1/common/arm/wiener_convolve_neon.c | 348 + third_party/aom/av1/common/av1_common_int.h | 1882 +++ third_party/aom/av1/common/av1_inv_txfm1d.c | 1841 +++ third_party/aom/av1/common/av1_inv_txfm1d.h | 61 + third_party/aom/av1/common/av1_inv_txfm1d_cfg.h | 45 + third_party/aom/av1/common/av1_inv_txfm2d.c | 484 + third_party/aom/av1/common/av1_loopfilter.c | 2099 +++ third_party/aom/av1/common/av1_loopfilter.h | 150 + third_party/aom/av1/common/av1_rtcd.c | 18 + third_party/aom/av1/common/av1_rtcd_defs.pl | 655 + third_party/aom/av1/common/av1_txfm.c | 278 + third_party/aom/av1/common/av1_txfm.h | 256 + third_party/aom/av1/common/blockd.c | 100 + third_party/aom/av1/common/blockd.h | 1612 +++ third_party/aom/av1/common/cdef.c | 466 + third_party/aom/av1/common/cdef.h | 112 + third_party/aom/av1/common/cdef_block.c | 426 + third_party/aom/av1/common/cdef_block.h | 65 + third_party/aom/av1/common/cdef_block_simd.h | 844 ++ third_party/aom/av1/common/cfl.c | 434 + third_party/aom/av1/common/cfl.h | 294 + third_party/aom/av1/common/common.h | 61 + third_party/aom/av1/common/common_data.c | 43 + third_party/aom/av1/common/common_data.h | 432 + third_party/aom/av1/common/convolve.c | 1508 +++ third_party/aom/av1/common/convolve.h | 132 + third_party/aom/av1/common/debugmodes.c | 113 + third_party/aom/av1/common/entropy.c | 178 + 
third_party/aom/av1/common/entropy.h | 182 + third_party/aom/av1/common/entropymode.c | 1094 ++ third_party/aom/av1/common/entropymode.h | 218 + third_party/aom/av1/common/entropymv.c | 67 + third_party/aom/av1/common/entropymv.h | 104 + third_party/aom/av1/common/enums.h | 651 + third_party/aom/av1/common/filter.h | 320 + third_party/aom/av1/common/frame_buffers.c | 98 + third_party/aom/av1/common/frame_buffers.h | 60 + third_party/aom/av1/common/idct.c | 322 + third_party/aom/av1/common/idct.h | 51 + third_party/aom/av1/common/mv.h | 337 + third_party/aom/av1/common/mvref_common.c | 1501 +++ third_party/aom/av1/common/mvref_common.h | 342 + third_party/aom/av1/common/obmc.h | 89 + third_party/aom/av1/common/obu_util.c | 133 + third_party/aom/av1/common/obu_util.h | 47 + third_party/aom/av1/common/ppc/cfl_ppc.c | 152 + third_party/aom/av1/common/pred_common.c | 501 + third_party/aom/av1/common/pred_common.h | 377 + third_party/aom/av1/common/quant_common.c | 12876 +++++++++++++++++++ third_party/aom/av1/common/quant_common.h | 84 + third_party/aom/av1/common/reconinter.c | 1169 ++ third_party/aom/av1/common/reconinter.h | 489 + third_party/aom/av1/common/reconinter_template.inc | 267 + third_party/aom/av1/common/reconintra.c | 1798 +++ third_party/aom/av1/common/reconintra.h | 158 + third_party/aom/av1/common/resize.c | 1452 +++ third_party/aom/av1/common/resize.h | 146 + third_party/aom/av1/common/restoration.c | 1494 +++ third_party/aom/av1/common/restoration.h | 471 + third_party/aom/av1/common/scale.c | 57 + third_party/aom/av1/common/scale.h | 87 + third_party/aom/av1/common/scan.c | 2038 +++ third_party/aom/av1/common/scan.h | 54 + third_party/aom/av1/common/seg_common.c | 91 + third_party/aom/av1/common/seg_common.h | 113 + third_party/aom/av1/common/thread_common.c | 1250 ++ third_party/aom/av1/common/thread_common.h | 345 + third_party/aom/av1/common/tile_common.c | 249 + third_party/aom/av1/common/tile_common.h | 75 + third_party/aom/av1/common/timing.c | 92 + third_party/aom/av1/common/timing.h | 55 + third_party/aom/av1/common/token_cdfs.h | 3555 +++++ third_party/aom/av1/common/txb_common.c | 364 + third_party/aom/av1/common/txb_common.h | 463 + third_party/aom/av1/common/warped_motion.c | 918 ++ third_party/aom/av1/common/warped_motion.h | 97 + .../av1/common/x86/av1_convolve_horiz_rs_sse4.c | 228 + .../aom/av1/common/x86/av1_convolve_scale_sse4.c | 498 + third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c | 2254 ++++ third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h | 71 + .../aom/av1/common/x86/av1_inv_txfm_ssse3.c | 2904 +++++ .../aom/av1/common/x86/av1_inv_txfm_ssse3.h | 247 + third_party/aom/av1/common/x86/av1_txfm_sse2.h | 321 + third_party/aom/av1/common/x86/av1_txfm_sse4.c | 22 + third_party/aom/av1/common/x86/av1_txfm_sse4.h | 72 + third_party/aom/av1/common/x86/cdef_block_avx2.c | 357 + third_party/aom/av1/common/x86/cdef_block_sse2.c | 40 + third_party/aom/av1/common/x86/cdef_block_sse4.c | 40 + third_party/aom/av1/common/x86/cdef_block_ssse3.c | 40 + third_party/aom/av1/common/x86/cfl_avx2.c | 495 + third_party/aom/av1/common/x86/cfl_simd.h | 246 + third_party/aom/av1/common/x86/cfl_sse2.c | 89 + third_party/aom/av1/common/x86/cfl_ssse3.c | 397 + third_party/aom/av1/common/x86/convolve_2d_avx2.c | 161 + third_party/aom/av1/common/x86/convolve_2d_sse2.c | 547 + third_party/aom/av1/common/x86/convolve_avx2.c | 916 ++ third_party/aom/av1/common/x86/convolve_sse2.c | 500 + third_party/aom/av1/common/x86/filterintra_sse4.c | 350 + 
.../aom/av1/common/x86/highbd_convolve_2d_avx2.c | 200 + .../aom/av1/common/x86/highbd_convolve_2d_sse4.c | 421 + .../aom/av1/common/x86/highbd_convolve_2d_ssse3.c | 414 + .../aom/av1/common/x86/highbd_inv_txfm_avx2.c | 4239 ++++++ .../aom/av1/common/x86/highbd_inv_txfm_sse4.c | 5830 +++++++++ .../aom/av1/common/x86/highbd_jnt_convolve_avx2.c | 849 ++ .../aom/av1/common/x86/highbd_jnt_convolve_sse4.c | 381 + .../aom/av1/common/x86/highbd_txfm_utility_sse4.h | 132 + .../aom/av1/common/x86/highbd_warp_affine_avx2.c | 656 + .../aom/av1/common/x86/highbd_warp_plane_sse4.c | 636 + .../av1/common/x86/highbd_wiener_convolve_avx2.c | 245 + .../av1/common/x86/highbd_wiener_convolve_ssse3.c | 202 + third_party/aom/av1/common/x86/intra_edge_sse4.c | 322 + third_party/aom/av1/common/x86/jnt_convolve_avx2.c | 1124 ++ third_party/aom/av1/common/x86/jnt_convolve_sse2.c | 606 + .../aom/av1/common/x86/jnt_convolve_ssse3.c | 230 + third_party/aom/av1/common/x86/reconinter_avx2.c | 624 + third_party/aom/av1/common/x86/reconinter_sse4.c | 154 + third_party/aom/av1/common/x86/reconinter_ssse3.c | 120 + third_party/aom/av1/common/x86/resize_ssse3.c | 974 ++ third_party/aom/av1/common/x86/selfguided_avx2.c | 724 ++ third_party/aom/av1/common/x86/selfguided_sse4.c | 662 + third_party/aom/av1/common/x86/warp_plane_avx2.c | 1210 ++ third_party/aom/av1/common/x86/warp_plane_sse4.c | 908 ++ .../aom/av1/common/x86/wiener_convolve_avx2.c | 242 + .../aom/av1/common/x86/wiener_convolve_sse2.c | 199 + third_party/aom/av1/decoder/accounting.c | 140 + third_party/aom/av1/decoder/accounting.h | 82 + third_party/aom/av1/decoder/decodeframe.c | 5369 ++++++++ third_party/aom/av1/decoder/decodeframe.h | 84 + third_party/aom/av1/decoder/decodemv.c | 1586 +++ third_party/aom/av1/decoder/decodemv.h | 33 + third_party/aom/av1/decoder/decoder.c | 538 + third_party/aom/av1/decoder/decoder.h | 452 + third_party/aom/av1/decoder/decodetxb.c | 381 + third_party/aom/av1/decoder/decodetxb.h | 34 + third_party/aom/av1/decoder/detokenize.c | 78 + third_party/aom/av1/decoder/detokenize.h | 29 + third_party/aom/av1/decoder/dthread.h | 51 + third_party/aom/av1/decoder/grain_synthesis.c | 1461 +++ third_party/aom/av1/decoder/grain_synthesis.h | 66 + third_party/aom/av1/decoder/inspection.c | 162 + third_party/aom/av1/decoder/inspection.h | 91 + third_party/aom/av1/decoder/obu.c | 1101 ++ third_party/aom/av1/decoder/obu.h | 31 + third_party/aom/av1/encoder/allintra_vis.c | 1055 ++ third_party/aom/av1/encoder/allintra_vis.h | 46 + third_party/aom/av1/encoder/aq_complexity.c | 175 + third_party/aom/av1/encoder/aq_complexity.h | 37 + third_party/aom/av1/encoder/aq_cyclicrefresh.c | 657 + third_party/aom/av1/encoder/aq_cyclicrefresh.h | 332 + third_party/aom/av1/encoder/aq_variance.c | 220 + third_party/aom/av1/encoder/aq_variance.h | 35 + .../aom/av1/encoder/arm/crc32/hash_arm_crc32.c | 61 + .../aom/av1/encoder/arm/neon/av1_error_neon.c | 95 + .../aom/av1/encoder/arm/neon/av1_error_sve.c | 109 + .../aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c | 3090 +++++ .../encoder/arm/neon/av1_highbd_quantize_neon.c | 146 + .../aom/av1/encoder/arm/neon/av1_k_means_neon.c | 115 + .../encoder/arm/neon/av1_temporal_denoiser_neon.c | 360 + third_party/aom/av1/encoder/arm/neon/cnn_neon.c | 1144 ++ .../aom/av1/encoder/arm/neon/encodetxb_neon.c | 646 + .../av1/encoder/arm/neon/highbd_fwd_txfm_neon.c | 2619 ++++ .../aom/av1/encoder/arm/neon/highbd_pickrst_neon.c | 1207 ++ .../aom/av1/encoder/arm/neon/highbd_rdopt_neon.c | 49 + 
.../encoder/arm/neon/highbd_temporal_filter_neon.c | 562 + .../av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c | 73 + third_party/aom/av1/encoder/arm/neon/ml_neon.c | 339 + .../aom/av1/encoder/arm/neon/pickrst_neon.c | 1217 ++ .../aom/av1/encoder/arm/neon/pickrst_neon.h | 188 + .../aom/av1/encoder/arm/neon/quantize_neon.c | 928 ++ third_party/aom/av1/encoder/arm/neon/rdopt_neon.c | 459 + .../aom/av1/encoder/arm/neon/reconinter_enc_neon.c | 288 + third_party/aom/av1/encoder/arm/neon/shift_neon.h | 49 + .../av1/encoder/arm/neon/temporal_filter_neon.c | 548 + .../arm/neon/temporal_filter_neon_dotprod.c | 299 + third_party/aom/av1/encoder/arm/neon/txfm_neon.h | 26 + .../aom/av1/encoder/arm/neon/wedge_utils_neon.c | 131 + third_party/aom/av1/encoder/av1_fwd_txfm1d.c | 1885 +++ third_party/aom/av1/encoder/av1_fwd_txfm1d.h | 49 + third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h | 19 + third_party/aom/av1/encoder/av1_fwd_txfm2d.c | 423 + .../aom/av1/encoder/av1_ml_partition_models.h | 179 + third_party/aom/av1/encoder/av1_noise_estimate.c | 296 + third_party/aom/av1/encoder/av1_noise_estimate.h | 50 + third_party/aom/av1/encoder/av1_quantize.c | 917 ++ third_party/aom/av1/encoder/av1_quantize.h | 224 + .../aom/av1/encoder/av1_temporal_denoiser.c | 805 ++ .../aom/av1/encoder/av1_temporal_denoiser.h | 134 + third_party/aom/av1/encoder/bitstream.c | 4248 ++++++ third_party/aom/av1/encoder/bitstream.h | 137 + third_party/aom/av1/encoder/block.h | 1515 +++ third_party/aom/av1/encoder/blockiness.c | 140 + third_party/aom/av1/encoder/cnn.c | 1189 ++ third_party/aom/av1/encoder/cnn.h | 191 + third_party/aom/av1/encoder/compound_type.c | 1678 +++ third_party/aom/av1/encoder/compound_type.h | 52 + third_party/aom/av1/encoder/context_tree.c | 311 + third_party/aom/av1/encoder/context_tree.h | 142 + third_party/aom/av1/encoder/cost.c | 46 + third_party/aom/av1/encoder/cost.h | 51 + third_party/aom/av1/encoder/deltaq4_model.c | 7776 +++++++++++ third_party/aom/av1/encoder/dwt.c | 146 + third_party/aom/av1/encoder/dwt.h | 27 + third_party/aom/av1/encoder/enc_enums.h | 268 + third_party/aom/av1/encoder/encode_strategy.c | 1767 +++ third_party/aom/av1/encoder/encode_strategy.h | 138 + third_party/aom/av1/encoder/encodeframe.c | 2408 ++++ third_party/aom/av1/encoder/encodeframe.h | 55 + third_party/aom/av1/encoder/encodeframe_utils.c | 1775 +++ third_party/aom/av1/encoder/encodeframe_utils.h | 595 + third_party/aom/av1/encoder/encodemb.c | 866 ++ third_party/aom/av1/encoder/encodemb.h | 180 + third_party/aom/av1/encoder/encodemv.c | 345 + third_party/aom/av1/encoder/encodemv.h | 110 + third_party/aom/av1/encoder/encoder.c | 5409 ++++++++ third_party/aom/av1/encoder/encoder.h | 4512 +++++++ third_party/aom/av1/encoder/encoder_alloc.h | 531 + third_party/aom/av1/encoder/encoder_utils.c | 1503 +++ third_party/aom/av1/encoder/encoder_utils.h | 1141 ++ third_party/aom/av1/encoder/encodetxb.c | 886 ++ third_party/aom/av1/encoder/encodetxb.h | 276 + third_party/aom/av1/encoder/ethread.c | 3469 +++++ third_party/aom/av1/encoder/ethread.h | 133 + third_party/aom/av1/encoder/extend.c | 163 + third_party/aom/av1/encoder/extend.h | 29 + third_party/aom/av1/encoder/external_partition.c | 98 + third_party/aom/av1/encoder/external_partition.h | 58 + third_party/aom/av1/encoder/firstpass.c | 1600 +++ third_party/aom/av1/encoder/firstpass.h | 603 + third_party/aom/av1/encoder/global_motion.c | 575 + third_party/aom/av1/encoder/global_motion.h | 157 + third_party/aom/av1/encoder/global_motion_facade.c | 450 + 
third_party/aom/av1/encoder/global_motion_facade.h | 58 + third_party/aom/av1/encoder/gop_structure.c | 867 ++ third_party/aom/av1/encoder/gop_structure.h | 95 + third_party/aom/av1/encoder/grain_test_vectors.h | 781 ++ third_party/aom/av1/encoder/hash.c | 126 + third_party/aom/av1/encoder/hash.h | 53 + third_party/aom/av1/encoder/hash_motion.c | 503 + third_party/aom/av1/encoder/hash_motion.h | 103 + third_party/aom/av1/encoder/hybrid_fwd_txfm.c | 370 + third_party/aom/av1/encoder/hybrid_fwd_txfm.h | 40 + third_party/aom/av1/encoder/interp_search.c | 801 ++ third_party/aom/av1/encoder/interp_search.h | 205 + third_party/aom/av1/encoder/intra_mode_search.c | 1739 +++ third_party/aom/av1/encoder/intra_mode_search.h | 329 + .../aom/av1/encoder/intra_mode_search_utils.h | 690 + third_party/aom/av1/encoder/k_means_template.h | 151 + third_party/aom/av1/encoder/level.c | 1397 ++ third_party/aom/av1/encoder/level.h | 221 + third_party/aom/av1/encoder/lookahead.c | 222 + third_party/aom/av1/encoder/lookahead.h | 138 + third_party/aom/av1/encoder/mcomp.c | 3998 ++++++ third_party/aom/av1/encoder/mcomp.h | 398 + third_party/aom/av1/encoder/mcomp_structs.h | 109 + third_party/aom/av1/encoder/misc_model_weights.h | 696 + third_party/aom/av1/encoder/ml.c | 171 + third_party/aom/av1/encoder/ml.h | 85 + .../aom/av1/encoder/mode_prune_model_weights.h | 185 + third_party/aom/av1/encoder/model_rd.h | 270 + third_party/aom/av1/encoder/motion_search_facade.c | 1071 ++ third_party/aom/av1/encoder/motion_search_facade.h | 145 + third_party/aom/av1/encoder/mv_prec.c | 429 + third_party/aom/av1/encoder/mv_prec.h | 52 + third_party/aom/av1/encoder/nonrd_opt.c | 933 ++ third_party/aom/av1/encoder/nonrd_opt.h | 575 + third_party/aom/av1/encoder/nonrd_pickmode.c | 3537 +++++ third_party/aom/av1/encoder/optical_flow.c | 1113 ++ third_party/aom/av1/encoder/optical_flow.h | 76 + third_party/aom/av1/encoder/palette.c | 975 ++ third_party/aom/av1/encoder/palette.h | 215 + .../aom/av1/encoder/partition_cnn_weights.h | 2139 +++ .../aom/av1/encoder/partition_model_weights.h | 5646 ++++++++ third_party/aom/av1/encoder/partition_search.c | 6263 +++++++++ third_party/aom/av1/encoder/partition_search.h | 81 + third_party/aom/av1/encoder/partition_strategy.c | 2573 ++++ third_party/aom/av1/encoder/partition_strategy.h | 265 + third_party/aom/av1/encoder/pass2_strategy.c | 4488 +++++++ third_party/aom/av1/encoder/pass2_strategy.h | 149 + third_party/aom/av1/encoder/pickcdef.c | 958 ++ third_party/aom/av1/encoder/pickcdef.h | 261 + third_party/aom/av1/encoder/picklpf.c | 339 + third_party/aom/av1/encoder/picklpf.h | 165 + third_party/aom/av1/encoder/pickrst.c | 2217 ++++ third_party/aom/av1/encoder/pickrst.h | 126 + third_party/aom/av1/encoder/pustats.h | 198 + third_party/aom/av1/encoder/random.h | 85 + third_party/aom/av1/encoder/ratectrl.c | 3587 ++++++ third_party/aom/av1/encoder/ratectrl.h | 864 ++ third_party/aom/av1/encoder/rc_utils.h | 469 + third_party/aom/av1/encoder/rd.c | 1580 +++ third_party/aom/av1/encoder/rd.h | 390 + third_party/aom/av1/encoder/rdopt.c | 6598 ++++++++++ third_party/aom/av1/encoder/rdopt.h | 327 + third_party/aom/av1/encoder/rdopt_data_defs.h | 294 + third_party/aom/av1/encoder/rdopt_utils.h | 797 ++ third_party/aom/av1/encoder/reconinter_enc.c | 701 + third_party/aom/av1/encoder/reconinter_enc.h | 94 + third_party/aom/av1/encoder/saliency_map.c | 1414 ++ third_party/aom/av1/encoder/saliency_map.h | 28 + third_party/aom/av1/encoder/segmentation.c | 54 + third_party/aom/av1/encoder/segmentation.h | 
38 + third_party/aom/av1/encoder/sorting_network.h | 140 + third_party/aom/av1/encoder/sparse_linear_solver.c | 472 + third_party/aom/av1/encoder/sparse_linear_solver.h | 67 + third_party/aom/av1/encoder/speed_features.c | 2715 ++++ third_party/aom/av1/encoder/speed_features.h | 2025 +++ third_party/aom/av1/encoder/superres_scale.c | 423 + third_party/aom/av1/encoder/superres_scale.h | 28 + third_party/aom/av1/encoder/svc_layercontext.c | 701 + third_party/aom/av1/encoder/svc_layercontext.h | 325 + third_party/aom/av1/encoder/temporal_filter.c | 1520 +++ third_party/aom/av1/encoder/temporal_filter.h | 458 + third_party/aom/av1/encoder/thirdpass.c | 877 ++ third_party/aom/av1/encoder/thirdpass.h | 197 + third_party/aom/av1/encoder/tokenize.c | 396 + third_party/aom/av1/encoder/tokenize.h | 159 + third_party/aom/av1/encoder/tpl_model.c | 2511 ++++ third_party/aom/av1/encoder/tpl_model.h | 794 ++ third_party/aom/av1/encoder/tune_butteraugli.c | 313 + third_party/aom/av1/encoder/tune_butteraugli.h | 45 + third_party/aom/av1/encoder/tune_vmaf.c | 1112 ++ third_party/aom/av1/encoder/tune_vmaf.h | 63 + .../aom/av1/encoder/tx_prune_model_weights.h | 3422 +++++ third_party/aom/av1/encoder/tx_search.c | 3830 ++++++ third_party/aom/av1/encoder/tx_search.h | 226 + third_party/aom/av1/encoder/txb_rdopt.c | 659 + third_party/aom/av1/encoder/txb_rdopt.h | 160 + third_party/aom/av1/encoder/txb_rdopt_utils.h | 236 + third_party/aom/av1/encoder/var_based_part.c | 1914 +++ third_party/aom/av1/encoder/var_based_part.h | 104 + third_party/aom/av1/encoder/wedge_utils.c | 125 + .../aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 1409 ++ .../aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c | 3010 +++++ .../aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 336 + .../aom/av1/encoder/x86/av1_fwd_txfm_avx2.h | 96 + .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.c | 2673 ++++ .../aom/av1/encoder/x86/av1_fwd_txfm_sse2.h | 253 + .../aom/av1/encoder/x86/av1_highbd_quantize_avx2.c | 137 + .../aom/av1/encoder/x86/av1_highbd_quantize_sse4.c | 195 + third_party/aom/av1/encoder/x86/av1_k_means_avx2.c | 132 + third_party/aom/av1/encoder/x86/av1_k_means_sse2.c | 124 + .../aom/av1/encoder/x86/av1_quantize_avx2.c | 414 + .../aom/av1/encoder/x86/av1_quantize_sse2.c | 289 + .../av1/encoder/x86/av1_quantize_ssse3_x86_64.asm | 204 + .../aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm | 222 + .../av1/encoder/x86/av1_temporal_denoiser_sse2.c | 328 + third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h | 144 + third_party/aom/av1/encoder/x86/cnn_avx2.c | 532 + third_party/aom/av1/encoder/x86/dct_sse2.asm | 82 + third_party/aom/av1/encoder/x86/encodetxb_avx2.c | 122 + third_party/aom/av1/encoder/x86/encodetxb_sse2.c | 505 + third_party/aom/av1/encoder/x86/encodetxb_sse4.c | 84 + .../aom/av1/encoder/x86/error_intrin_avx2.c | 210 + .../aom/av1/encoder/x86/error_intrin_sse2.c | 75 + third_party/aom/av1/encoder/x86/error_sse2.asm | 88 + third_party/aom/av1/encoder/x86/hash_sse42.c | 53 + .../encoder/x86/highbd_block_error_intrin_avx2.c | 64 + .../encoder/x86/highbd_block_error_intrin_sse2.c | 74 + .../aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c | 3132 +++++ .../aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 2629 ++++ .../av1/encoder/x86/highbd_temporal_filter_avx2.c | 466 + .../av1/encoder/x86/highbd_temporal_filter_sse2.c | 341 + third_party/aom/av1/encoder/x86/ml_avx2.c | 240 + third_party/aom/av1/encoder/x86/ml_sse3.c | 336 + third_party/aom/av1/encoder/x86/ml_sse3.h | 29 + third_party/aom/av1/encoder/x86/pickrst_avx2.c | 2348 ++++ 
third_party/aom/av1/encoder/x86/pickrst_sse4.c | 1483 +++ third_party/aom/av1/encoder/x86/rdopt_avx2.c | 254 + third_party/aom/av1/encoder/x86/rdopt_sse4.c | 272 + .../aom/av1/encoder/x86/reconinter_enc_sse2.c | 347 + .../aom/av1/encoder/x86/reconinter_enc_ssse3.c | 67 + .../aom/av1/encoder/x86/temporal_filter_avx2.c | 647 + .../aom/av1/encoder/x86/temporal_filter_sse2.c | 320 + third_party/aom/av1/encoder/x86/wedge_utils_avx2.c | 215 + third_party/aom/av1/encoder/x86/wedge_utils_sse2.c | 254 + third_party/aom/av1/exports_com | 2 + third_party/aom/av1/exports_dec | 3 + third_party/aom/av1/exports_enc | 2 + third_party/aom/av1/exports_ident | 2 + third_party/aom/av1/exports_test | 2 + third_party/aom/av1/ratectrl_rtc.cc | 375 + third_party/aom/av1/ratectrl_rtc.h | 127 + third_party/aom/build/cmake/aom_config.c.template | 13 + .../aom/build/cmake/aom_config_defaults.cmake | 235 + third_party/aom/build/cmake/aom_configure.cmake | 489 + .../aom/build/cmake/aom_experiment_deps.cmake | 24 + third_party/aom/build/cmake/aom_install.cmake | 98 + third_party/aom/build/cmake/aom_optimization.cmake | 279 + third_party/aom/build/cmake/compiler_flags.cmake | 385 + third_party/aom/build/cmake/compiler_tests.cmake | 179 + third_party/aom/build/cmake/cpu.cmake | 108 + third_party/aom/build/cmake/dist.cmake | 64 + third_party/aom/build/cmake/exports.cmake | 76 + third_party/aom/build/cmake/exports_sources.cmake | 35 + .../cmake/generate_aom_config_templates.cmake | 92 + third_party/aom/build/cmake/generate_exports.cmake | 69 + third_party/aom/build/cmake/pkg_config.cmake | 69 + third_party/aom/build/cmake/rtcd.pl | 430 + third_party/aom/build/cmake/sanitizers.cmake | 46 + .../aom/build/cmake/toolchains/android.cmake | 53 + .../build/cmake/toolchains/arm-ios-common.cmake | 24 + .../aom/build/cmake/toolchains/arm64-ios.cmake | 23 + .../build/cmake/toolchains/arm64-linux-clang.cmake | 30 + .../build/cmake/toolchains/arm64-linux-gcc.cmake | 40 + .../aom/build/cmake/toolchains/arm64-macos.cmake | 16 + .../build/cmake/toolchains/arm64-mingw-gcc.cmake | 36 + .../aom/build/cmake/toolchains/armv7-ios.cmake | 31 + .../build/cmake/toolchains/armv7-linux-gcc.cmake | 46 + .../build/cmake/toolchains/armv7-mingw-gcc.cmake | 39 + .../aom/build/cmake/toolchains/armv7s-ios.cmake | 31 + .../build/cmake/toolchains/i686-linux-gcc.cmake | 34 + .../cmake/toolchains/ios-simulator-common.cmake | 24 + .../aom/build/cmake/toolchains/ppc-linux-gcc.cmake | 36 + .../build/cmake/toolchains/riscv-linux-gcc.cmake | 36 + .../build/cmake/toolchains/x86-ios-simulator.cmake | 28 + .../aom/build/cmake/toolchains/x86-linux.cmake | 20 + .../aom/build/cmake/toolchains/x86-macos.cmake | 19 + .../aom/build/cmake/toolchains/x86-mingw-gcc.cmake | 40 + .../cmake/toolchains/x86_64-ios-simulator.cmake | 25 + .../aom/build/cmake/toolchains/x86_64-macos.cmake | 16 + .../build/cmake/toolchains/x86_64-mingw-gcc.cmake | 37 + third_party/aom/build/cmake/util.cmake | 173 + third_party/aom/build/cmake/version.cmake | 67 + third_party/aom/build/cmake/version.pl | 114 + third_party/aom/codereview.settings | 4 + third_party/aom/common/args.c | 248 + third_party/aom/common/args.h | 40 + third_party/aom/common/args_helper.c | 221 + third_party/aom/common/args_helper.h | 79 + third_party/aom/common/av1_config.c | 511 + third_party/aom/common/av1_config.h | 86 + third_party/aom/common/ivf_dec.cmake | 28 + third_party/aom/common/ivfdec.c | 112 + third_party/aom/common/ivfdec.h | 30 + third_party/aom/common/ivfenc.c | 52 + third_party/aom/common/ivfenc.h | 34 + 
third_party/aom/common/md5_utils.c | 257 + third_party/aom/common/md5_utils.h | 49 + third_party/aom/common/obudec.c | 512 + third_party/aom/common/obudec.h | 48 + third_party/aom/common/rawenc.c | 99 + third_party/aom/common/rawenc.h | 32 + third_party/aom/common/tools_common.c | 636 + third_party/aom/common/tools_common.h | 208 + third_party/aom/common/video_common.h | 25 + third_party/aom/common/video_reader.c | 135 + third_party/aom/common/video_reader.h | 60 + third_party/aom/common/video_writer.c | 83 + third_party/aom/common/video_writer.h | 47 + third_party/aom/common/warnings.c | 97 + third_party/aom/common/warnings.h | 34 + third_party/aom/common/webmdec.cc | 248 + third_party/aom/common/webmdec.h | 71 + third_party/aom/common/webmenc.cc | 242 + third_party/aom/common/webmenc.h | 71 + third_party/aom/common/y4menc.c | 108 + third_party/aom/common/y4menc.h | 39 + third_party/aom/common/y4minput.c | 1222 ++ third_party/aom/common/y4minput.h | 82 + third_party/aom/doc/AlgorithmDescription.md | 799 ++ third_party/aom/doc/dev_guide/av1_decoder.dox | 11 + third_party/aom/doc/dev_guide/av1_encoder.dox | 1617 +++ third_party/aom/doc/dev_guide/av1encoderflow.png | Bin 0 -> 97167 bytes third_party/aom/doc/dev_guide/av1partitions.png | Bin 0 -> 115004 bytes third_party/aom/doc/dev_guide/coeff_coding.png | Bin 0 -> 17955 bytes third_party/aom/doc/dev_guide/filter_flow.png | Bin 0 -> 30616 bytes third_party/aom/doc/dev_guide/filter_thr.png | Bin 0 -> 12969 bytes third_party/aom/doc/dev_guide/genericcodecflow.png | Bin 0 -> 46815 bytes third_party/aom/doc/dev_guide/gf_group.png | Bin 0 -> 121402 bytes third_party/aom/doc/dev_guide/partition.png | Bin 0 -> 32428 bytes .../aom/doc/dev_guide/tplgfgroupdiagram.png | Bin 0 -> 31598 bytes third_party/aom/doc/img/edge_direction.svg | 6319 +++++++++ third_party/aom/doc/img/equ_dir_search.svg | 206 + third_party/aom/doc/img/equ_dual_self_guided.svg | 71 + third_party/aom/doc/img/equ_dual_self_para.svg | 69 + third_party/aom/doc/img/equ_edge_direction.svg | 121 + third_party/aom/doc/img/equ_guided_filter.svg | 53 + third_party/aom/doc/img/equ_wiener_filter.svg | 51 + third_party/aom/doc/img/inter_motion_field.svg | 219 + third_party/aom/doc/img/inter_obmc.svg | 61 + third_party/aom/doc/img/inter_spatial_mvp.svg | 215 + third_party/aom/doc/img/inter_tmvp_positions.svg | 99 + third_party/aom/doc/img/inter_tx_partition.svg | 87 + third_party/aom/doc/img/intra_cfl.svg | 193 + third_party/aom/doc/img/intra_directional.svg | 192 + third_party/aom/doc/img/intra_paeth.svg | 181 + third_party/aom/doc/img/intra_recursive.svg | 710 + third_party/aom/doc/img/intra_tx_partition.svg | 142 + third_party/aom/doc/img/loop_restoration.svg | 114 + third_party/aom/doc/img/partition_codingblock.svg | 225 + third_party/aom/doc/img/primary_tap.svg | 1589 +++ third_party/aom/doc/img/quant_ac.svg | 1 + third_party/aom/doc/img/quant_dc.svg | 1 + third_party/aom/doc/img/scc_intrabc.svg | 348 + third_party/aom/doc/img/secondary_tap.svg | 857 ++ third_party/aom/doc/img/tx_basis.svg | 1 + third_party/aom/doc/img/tx_cands_large.svg | 1 + third_party/aom/doc/img/tx_cands_small.svg | 1 + third_party/aom/doc/img/tx_chroma.svg | 1 + third_party/aom/doc/img/tx_partition.svg | 1 + third_party/aom/doc/img/tx_set.svg | 1 + third_party/aom/docs.cmake | 345 + third_party/aom/examples/analyzer.cc | 722 ++ third_party/aom/examples/aom_cx_set_ref.c | 392 + third_party/aom/examples/av1_dec_fuzzer.cc | 67 + third_party/aom/examples/av1_dec_fuzzer.dict | 5 + 
third_party/aom/examples/build_av1_dec_fuzzer.sh | 70 + third_party/aom/examples/decode_to_md5.c | 130 + third_party/aom/examples/decode_with_drops.c | 144 + third_party/aom/examples/encoder_util.c | 136 + third_party/aom/examples/encoder_util.h | 40 + third_party/aom/examples/inspect.c | 963 ++ .../aom/examples/lightfield_bitstream_parsing.c | 415 + third_party/aom/examples/lightfield_decoder.c | 381 + third_party/aom/examples/lightfield_encoder.c | 525 + .../aom/examples/lightfield_tile_list_decoder.c | 232 + third_party/aom/examples/lossless_encoder.c | 137 + third_party/aom/examples/noise_model.c | 434 + third_party/aom/examples/photon_noise_table.c | 398 + third_party/aom/examples/scalable_decoder.c | 184 + third_party/aom/examples/scalable_encoder.c | 288 + third_party/aom/examples/set_maps.c | 219 + third_party/aom/examples/simple_decoder.c | 145 + third_party/aom/examples/simple_encoder.c | 259 + third_party/aom/examples/svc_encoder_rtc.cc | 2062 +++ third_party/aom/examples/twopass_encoder.c | 254 + third_party/aom/keywords.dox | 51 + third_party/aom/libs.doxy_template | 2447 ++++ third_party/aom/mainpage.dox | 68 + third_party/aom/stats/aomstats.c | 112 + third_party/aom/stats/aomstats.h | 44 + third_party/aom/stats/rate_hist.c | 301 + third_party/aom/stats/rate_hist.h | 41 + third_party/aom/test/accounting_test.cc | 75 + third_party/aom/test/acm_random.h | 88 + third_party/aom/test/active_map_test.cc | 97 + third_party/aom/test/allintra_end_to_end_test.cc | 145 + third_party/aom/test/altref_test.cc | 215 + third_party/aom/test/aom_image_test.cc | 62 + third_party/aom/test/aom_integer_test.cc | 177 + third_party/aom/test/aom_mem_test.cc | 34 + third_party/aom/test/aomcx_set_ref.sh | 58 + third_party/aom/test/aomdec.sh | 219 + third_party/aom/test/aomenc.sh | 306 + third_party/aom/test/aq_segment_test.cc | 110 + third_party/aom/test/arf_freq_test.cc | 218 + third_party/aom/test/av1_c_vs_simd_encode.sh | 566 + third_party/aom/test/av1_common_int_test.cc | 22 + third_party/aom/test/av1_config_test.cc | 164 + third_party/aom/test/av1_convolve_scale_test.cc | 561 + third_party/aom/test/av1_convolve_test.cc | 2447 ++++ .../aom/test/av1_encoder_parms_get_to_decoder.cc | 160 + third_party/aom/test/av1_ext_tile_test.cc | 212 + .../aom/test/av1_external_partition_test.cc | 702 + third_party/aom/test/av1_fwd_txfm1d_test.cc | 108 + third_party/aom/test/av1_fwd_txfm2d_test.cc | 692 + third_party/aom/test/av1_highbd_iht_test.cc | 376 + .../aom/test/av1_horz_only_frame_superres_test.cc | 385 + third_party/aom/test/av1_inv_txfm1d_test.cc | 157 + third_party/aom/test/av1_inv_txfm2d_test.cc | 406 + third_party/aom/test/av1_k_means_test.cc | 295 + third_party/aom/test/av1_key_value_api_test.cc | 133 + third_party/aom/test/av1_nn_predict_test.cc | 228 + third_party/aom/test/av1_quantize_test.cc | 264 + third_party/aom/test/av1_round_shift_array_test.cc | 131 + third_party/aom/test/av1_softmax_test.cc | 122 + third_party/aom/test/av1_temporal_denoiser_test.cc | 140 + third_party/aom/test/av1_txfm_test.cc | 398 + third_party/aom/test/av1_txfm_test.h | 161 + third_party/aom/test/av1_wedge_utils_test.cc | 411 + third_party/aom/test/avg_test.cc | 1150 ++ third_party/aom/test/avif_progressive_test.cc | 279 + third_party/aom/test/best_encode.sh | 101 + third_party/aom/test/binary_codes_test.cc | 83 + third_party/aom/test/blend_a64_mask_1d_test.cc | 342 + third_party/aom/test/blend_a64_mask_test.cc | 649 + third_party/aom/test/block_test.cc | 209 + third_party/aom/test/boolcoder_test.cc | 173 + 
third_party/aom/test/borders_test.cc | 82 + third_party/aom/test/cdef_test.cc | 962 ++ third_party/aom/test/cfl_test.cc | 597 + third_party/aom/test/cnn_test.cc | 2661 ++++ third_party/aom/test/codec_factory.h | 178 + third_party/aom/test/coding_path_sync.cc | 212 + third_party/aom/test/comp_avg_pred_test.cc | 249 + third_party/aom/test/comp_avg_pred_test.h | 757 ++ third_party/aom/test/comp_mask_pred_test.cc | 856 ++ third_party/aom/test/convolve_test.cc | 922 ++ third_party/aom/test/corner_match_test.cc | 145 + third_party/aom/test/cpu_speed_test.cc | 175 + third_party/aom/test/cpu_used_firstpass_test.cc | 129 + third_party/aom/test/datarate_test.cc | 712 + third_party/aom/test/datarate_test.h | 223 + third_party/aom/test/decode_api_test.cc | 62 + third_party/aom/test/decode_multithreaded_test.cc | 182 + third_party/aom/test/decode_perf_test.cc | 246 + third_party/aom/test/decode_scalability_test.cc | 121 + third_party/aom/test/decode_test_driver.cc | 114 + third_party/aom/test/decode_test_driver.h | 165 + third_party/aom/test/decode_to_md5.sh | 77 + third_party/aom/test/decode_with_drops.sh | 68 + third_party/aom/test/deltaq_mode_test.cc | 209 + third_party/aom/test/disflow_test.cc | 122 + third_party/aom/test/divu_small_test.cc | 41 + third_party/aom/test/dr_prediction_test.cc | 542 + third_party/aom/test/dropframe_encode_test.cc | 62 + third_party/aom/test/dump_obu.sh | 77 + third_party/aom/test/ec_test.cc | 154 + third_party/aom/test/encode_api_test.cc | 659 + third_party/aom/test/encode_perf_test.cc | 183 + .../aom/test/encode_small_width_height_test.cc | 246 + third_party/aom/test/encode_test_driver.cc | 302 + third_party/aom/test/encode_test_driver.h | 286 + third_party/aom/test/encodemb_test.cc | 245 + third_party/aom/test/encodetxb_test.cc | 289 + third_party/aom/test/end_to_end_psnr_test.cc | 212 + third_party/aom/test/end_to_end_qmpsnr_test.cc | 193 + third_party/aom/test/end_to_end_ssim_test.cc | 189 + third_party/aom/test/error_block_test.cc | 319 + third_party/aom/test/error_resilience_test.cc | 465 + third_party/aom/test/ethread_test.cc | 577 + third_party/aom/test/examples.sh | 37 + third_party/aom/test/external_frame_buffer_test.cc | 547 + third_party/aom/test/fdct4x4_test.cc | 124 + third_party/aom/test/fft_test.cc | 268 + third_party/aom/test/film_grain_table_test.cc | 381 + third_party/aom/test/filterintra_test.cc | 197 + third_party/aom/test/firstpass_test.cc | 166 + third_party/aom/test/force_key_frame_test.cc | 93 + .../aom/test/forced_max_frame_width_height_test.cc | 280 + third_party/aom/test/frame_parallel_enc_test.cc | 197 + third_party/aom/test/frame_size_tests.cc | 388 + third_party/aom/test/function_equivalence_test.h | 68 + third_party/aom/test/fwht4x4_test.cc | 223 + third_party/aom/test/gf_pyr_height_test.cc | 155 + third_party/aom/test/gviz_api.py | 1087 ++ third_party/aom/test/hadamard_test.cc | 547 + third_party/aom/test/hash_test.cc | 141 + third_party/aom/test/hbd_metrics_test.cc | 239 + third_party/aom/test/hiprec_convolve_test.cc | 76 + third_party/aom/test/hiprec_convolve_test_util.cc | 380 + third_party/aom/test/hiprec_convolve_test_util.h | 90 + third_party/aom/test/horver_correlation_test.cc | 154 + third_party/aom/test/horz_superres_test.cc | 409 + third_party/aom/test/i420_video_source.h | 34 + third_party/aom/test/intra_edge_test.cc | 351 + third_party/aom/test/intrabc_test.cc | 172 + third_party/aom/test/intrapred_test.cc | 488 + third_party/aom/test/invalid_file_test.cc | 169 + third_party/aom/test/ivf_video_source.h | 114 + 
third_party/aom/test/kf_test.cc | 401 + third_party/aom/test/level_test.cc | 188 + third_party/aom/test/lightfield_test.sh | 115 + third_party/aom/test/log2_test.cc | 51 + third_party/aom/test/loopfilter_control_test.cc | 198 + third_party/aom/test/lossless_test.cc | 230 + third_party/aom/test/lpf_test.cc | 824 ++ third_party/aom/test/masked_sad_test.cc | 617 + third_party/aom/test/masked_variance_test.cc | 712 + third_party/aom/test/md5_helper.h | 76 + third_party/aom/test/metadata_test.cc | 332 + third_party/aom/test/metrics_template.html | 422 + third_party/aom/test/minmax_test.cc | 244 + third_party/aom/test/monochrome_test.cc | 213 + third_party/aom/test/motion_vector_test.cc | 103 + third_party/aom/test/mv_cost_test.cc | 125 + third_party/aom/test/noise_model_test.cc | 1372 ++ third_party/aom/test/obmc_sad_test.cc | 333 + third_party/aom/test/obmc_variance_test.cc | 571 + third_party/aom/test/pickrst_test.cc | 750 ++ third_party/aom/test/postproc_filters_test.cc | 140 + third_party/aom/test/quant_test.cc | 188 + third_party/aom/test/quantize_func_test.cc | 795 ++ third_party/aom/test/ratectrl_rtc_test.cc | 505 + third_party/aom/test/ratectrl_test.cc | 39 + third_party/aom/test/rd_test.cc | 87 + third_party/aom/test/reconinter_test.cc | 372 + third_party/aom/test/register_state_check.h | 136 + third_party/aom/test/resize_test.cc | 1136 ++ third_party/aom/test/rt_end_to_end_test.cc | 208 + third_party/aom/test/run_encodes.sh | 39 + third_party/aom/test/sad_test.cc | 3353 +++++ third_party/aom/test/sb_multipass_test.cc | 152 + third_party/aom/test/sb_qp_sweep_test.cc | 147 + third_party/aom/test/scalability_test.cc | 81 + third_party/aom/test/scan_test.cc | 133 + third_party/aom/test/screen_content_test.cc | 135 + third_party/aom/test/segment_binarization_sync.cc | 61 + third_party/aom/test/selfguided_filter_test.cc | 435 + third_party/aom/test/set_maps.sh | 52 + third_party/aom/test/sharpness_test.cc | 143 + third_party/aom/test/simd_avx2_test.cc | 15 + third_party/aom/test/simd_cmp_avx2.cc | 15 + third_party/aom/test/simd_cmp_impl.h | 2175 ++++ third_party/aom/test/simd_cmp_sse2.cc | 18 + third_party/aom/test/simd_cmp_sse4.cc | 18 + third_party/aom/test/simd_cmp_ssse3.cc | 18 + third_party/aom/test/simd_impl.h | 1140 ++ third_party/aom/test/simd_sse2_test.cc | 18 + third_party/aom/test/simd_sse4_test.cc | 18 + third_party/aom/test/simd_ssse3_test.cc | 18 + third_party/aom/test/simple_decoder.sh | 58 + third_party/aom/test/simple_encoder.sh | 53 + third_party/aom/test/sse_sum_test.cc | 182 + third_party/aom/test/still_picture_test.cc | 95 + third_party/aom/test/subtract_test.cc | 292 + third_party/aom/test/sum_squares_test.cc | 928 ++ third_party/aom/test/svc_datarate_test.cc | 2675 ++++ third_party/aom/test/svc_encoder_rtc.sh | 85 + third_party/aom/test/temporal_filter_test.cc | 788 ++ third_party/aom/test/test-data.sha1 | 575 + third_party/aom/test/test.cmake | 647 + third_party/aom/test/test_aom_rc.cc | 17 + .../aom/test/test_data_download_worker.cmake | 46 + third_party/aom/test/test_data_util.cmake | 665 + third_party/aom/test/test_intra_pred_speed.cc | 1742 +++ third_party/aom/test/test_libaom.cc | 91 + third_party/aom/test/test_runner.cmake | 28 + third_party/aom/test/test_vector_test.cc | 173 + third_party/aom/test/test_vectors.cc | 268 + third_party/aom/test/test_vectors.h | 26 + third_party/aom/test/tile_config_test.cc | 363 + third_party/aom/test/tile_independence_test.cc | 170 + third_party/aom/test/time_stamp_test.cc | 107 + third_party/aom/test/tools_common.sh | 520 + 
third_party/aom/test/tpl_model_test.cc | 529 + third_party/aom/test/transform_test_base.h | 368 + third_party/aom/test/twopass_encoder.sh | 54 + third_party/aom/test/util.h | 60 + third_party/aom/test/variance_test.cc | 4370 +++++++ third_party/aom/test/video_source.h | 282 + third_party/aom/test/visual_metrics.py | 466 + third_party/aom/test/warp_filter_test.cc | 93 + third_party/aom/test/warp_filter_test_util.cc | 505 + third_party/aom/test/warp_filter_test_util.h | 102 + third_party/aom/test/webm_video_source.h | 107 + third_party/aom/test/webmenc_test.cc | 69 + third_party/aom/test/wiener_test.cc | 1390 ++ third_party/aom/test/y4m_test.cc | 287 + third_party/aom/test/y4m_video_source.h | 125 + third_party/aom/test/yuv_video_source.h | 126 + .../aom/third_party/SVT-AV1/EbMemory_AVX2.h | 110 + .../aom/third_party/SVT-AV1/EbMemory_SSE4_1.h | 38 + third_party/aom/third_party/SVT-AV1/LICENSE.md | 32 + third_party/aom/third_party/SVT-AV1/PATENTS.md | 107 + third_party/aom/third_party/SVT-AV1/README.libaom | 14 + .../aom/third_party/SVT-AV1/convolve_2d_avx2.h | 1199 ++ .../aom/third_party/SVT-AV1/convolve_avx2.h | 3335 +++++ third_party/aom/third_party/SVT-AV1/synonyms.h | 31 + third_party/aom/third_party/fastfeat/LICENSE | 30 + third_party/aom/third_party/fastfeat/README.libaom | 44 + third_party/aom/third_party/fastfeat/fast.c | 67 + third_party/aom/third_party/fastfeat/fast.h | 56 + third_party/aom/third_party/fastfeat/fast_9.c | 5947 +++++++++ third_party/aom/third_party/fastfeat/nonmax.c | 174 + .../aom/third_party/googletest/README.libaom | 38 + .../aom/third_party/googletest/src/.clang-format | 4 + .../aom/third_party/googletest/src/CMakeLists.txt | 34 + .../aom/third_party/googletest/src/CONTRIBUTORS | 65 + third_party/aom/third_party/googletest/src/LICENSE | 28 + .../aom/third_party/googletest/src/README.md | 141 + .../googletest/src/googlemock/CMakeLists.txt | 218 + .../googletest/src/googlemock/README.md | 40 + .../googletest/src/googlemock/cmake/gmock.pc.in | 10 + .../src/googlemock/cmake/gmock_main.pc.in | 10 + .../src/googlemock/include/gmock/gmock-actions.h | 2298 ++++ .../googlemock/include/gmock/gmock-cardinalities.h | 159 + .../include/gmock/gmock-function-mocker.h | 514 + .../src/googlemock/include/gmock/gmock-matchers.h | 5610 ++++++++ .../googlemock/include/gmock/gmock-more-actions.h | 662 + .../googlemock/include/gmock/gmock-more-matchers.h | 91 + .../googlemock/include/gmock/gmock-nice-strict.h | 277 + .../googlemock/include/gmock/gmock-spec-builders.h | 2083 +++ .../src/googlemock/include/gmock/gmock.h | 96 + .../include/gmock/internal/custom/README.md | 18 + .../internal/custom/gmock-generated-actions.h | 7 + .../include/gmock/internal/custom/gmock-matchers.h | 37 + .../include/gmock/internal/custom/gmock-port.h | 40 + .../include/gmock/internal/gmock-internal-utils.h | 476 + .../googlemock/include/gmock/internal/gmock-port.h | 139 + .../googlemock/include/gmock/internal/gmock-pp.h | 279 + .../googletest/src/googlemock/src/gmock-all.cc | 46 + .../src/googlemock/src/gmock-cardinalities.cc | 155 + .../src/googlemock/src/gmock-internal-utils.cc | 250 + .../src/googlemock/src/gmock-matchers.cc | 462 + .../src/googlemock/src/gmock-spec-builders.cc | 781 ++ .../googletest/src/googlemock/src/gmock.cc | 223 + .../googletest/src/googlemock/src/gmock_main.cc | 72 + .../googletest/src/googletest/CMakeLists.txt | 322 + .../googletest/src/googletest/README.md | 217 + .../src/googletest/cmake/Config.cmake.in | 9 + .../googletest/src/googletest/cmake/gtest.pc.in | 9 + 
.../src/googletest/cmake/gtest_main.pc.in | 10 + .../src/googletest/cmake/internal_utils.cmake | 342 + .../googletest/src/googletest/cmake/libgtest.la.in | 21 + .../include/gtest/gtest-assertion-result.h | 237 + .../googletest/include/gtest/gtest-death-test.h | 345 + .../src/googletest/include/gtest/gtest-matchers.h | 956 ++ .../src/googletest/include/gtest/gtest-message.h | 218 + .../googletest/include/gtest/gtest-param-test.h | 510 + .../src/googletest/include/gtest/gtest-printers.h | 1048 ++ .../src/googletest/include/gtest/gtest-spi.h | 248 + .../src/googletest/include/gtest/gtest-test-part.h | 190 + .../googletest/include/gtest/gtest-typed-test.h | 331 + .../src/googletest/include/gtest/gtest.h | 2297 ++++ .../src/googletest/include/gtest/gtest_pred_impl.h | 279 + .../src/googletest/include/gtest/gtest_prod.h | 60 + .../include/gtest/internal/custom/README.md | 44 + .../include/gtest/internal/custom/gtest-port.h | 68 + .../include/gtest/internal/custom/gtest-printers.h | 42 + .../include/gtest/internal/custom/gtest.h | 37 + .../gtest/internal/gtest-death-test-internal.h | 306 + .../include/gtest/internal/gtest-filepath.h | 210 + .../include/gtest/internal/gtest-internal.h | 1570 +++ .../include/gtest/internal/gtest-param-util.h | 956 ++ .../include/gtest/internal/gtest-port-arch.h | 116 + .../googletest/include/gtest/internal/gtest-port.h | 2413 ++++ .../include/gtest/internal/gtest-string.h | 177 + .../include/gtest/internal/gtest-type-util.h | 186 + .../googletest/src/googletest/src/gtest-all.cc | 49 + .../src/googletest/src/gtest-assertion-result.cc | 77 + .../src/googletest/src/gtest-death-test.cc | 1620 +++ .../src/googletest/src/gtest-filepath.cc | 367 + .../src/googletest/src/gtest-internal-inl.h | 1212 ++ .../src/googletest/src/gtest-matchers.cc | 98 + .../googletest/src/googletest/src/gtest-port.cc | 1394 ++ .../src/googletest/src/gtest-printers.cc | 553 + .../src/googletest/src/gtest-test-part.cc | 105 + .../src/googletest/src/gtest-typed-test.cc | 104 + .../googletest/src/googletest/src/gtest.cc | 6795 ++++++++++ .../googletest/src/googletest/src/gtest_main.cc | 53 + third_party/aom/third_party/libwebm/AUTHORS.TXT | 5 + third_party/aom/third_party/libwebm/Android.mk | 23 + third_party/aom/third_party/libwebm/LICENSE.TXT | 30 + third_party/aom/third_party/libwebm/PATENTS.TXT | 23 + third_party/aom/third_party/libwebm/README.libaom | 20 + .../aom/third_party/libwebm/common/file_util.cc | 93 + .../aom/third_party/libwebm/common/file_util.h | 44 + .../aom/third_party/libwebm/common/hdr_util.cc | 220 + .../aom/third_party/libwebm/common/hdr_util.h | 71 + .../aom/third_party/libwebm/common/webmids.h | 193 + .../aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc | 4230 ++++++ .../aom/third_party/libwebm/mkvmuxer/mkvmuxer.h | 1924 +++ .../third_party/libwebm/mkvmuxer/mkvmuxertypes.h | 28 + .../third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 743 ++ .../third_party/libwebm/mkvmuxer/mkvmuxerutil.h | 115 + .../aom/third_party/libwebm/mkvmuxer/mkvwriter.cc | 92 + .../aom/third_party/libwebm/mkvmuxer/mkvwriter.h | 51 + .../aom/third_party/libwebm/mkvparser/mkvparser.cc | 8100 ++++++++++++ .../aom/third_party/libwebm/mkvparser/mkvparser.h | 1147 ++ .../aom/third_party/libwebm/mkvparser/mkvreader.cc | 135 + .../aom/third_party/libwebm/mkvparser/mkvreader.h | 45 + third_party/aom/third_party/libyuv/LICENSE | 29 + third_party/aom/third_party/libyuv/README.libaom | 37 + .../libyuv/include/libyuv/basic_types.h | 68 + .../third_party/libyuv/include/libyuv/compare.h | 111 + 
.../third_party/libyuv/include/libyuv/convert.h | 526 + .../libyuv/include/libyuv/convert_argb.h | 1611 +++ .../libyuv/include/libyuv/convert_from.h | 185 + .../libyuv/include/libyuv/convert_from_argb.h | 311 + .../aom/third_party/libyuv/include/libyuv/cpu_id.h | 122 + .../libyuv/include/libyuv/mjpeg_decoder.h | 195 + .../libyuv/include/libyuv/planar_functions.h | 900 ++ .../aom/third_party/libyuv/include/libyuv/rotate.h | 182 + .../libyuv/include/libyuv/rotate_argb.h | 37 + .../third_party/libyuv/include/libyuv/rotate_row.h | 223 + .../aom/third_party/libyuv/include/libyuv/row.h | 4384 +++++++ .../aom/third_party/libyuv/include/libyuv/scale.h | 204 + .../third_party/libyuv/include/libyuv/scale_argb.h | 76 + .../third_party/libyuv/include/libyuv/scale_row.h | 1367 ++ .../third_party/libyuv/include/libyuv/scale_uv.h | 38 + .../third_party/libyuv/include/libyuv/version.h | 16 + .../libyuv/include/libyuv/video_common.h | 206 + .../aom/third_party/libyuv/source/compare.cc | 440 + .../third_party/libyuv/source/compare_common.cc | 104 + .../aom/third_party/libyuv/source/compare_gcc.cc | 360 + .../aom/third_party/libyuv/source/compare_neon.cc | 96 + .../third_party/libyuv/source/compare_neon64.cc | 94 + .../aom/third_party/libyuv/source/compare_win.cc | 241 + .../aom/third_party/libyuv/source/convert.cc | 2514 ++++ .../aom/third_party/libyuv/source/convert_argb.cc | 4125 ++++++ .../aom/third_party/libyuv/source/convert_from.cc | 713 + .../third_party/libyuv/source/convert_from_argb.cc | 2163 ++++ .../aom/third_party/libyuv/source/convert_jpeg.cc | 602 + .../third_party/libyuv/source/convert_to_argb.cc | 382 + .../third_party/libyuv/source/convert_to_i420.cc | 272 + .../aom/third_party/libyuv/source/cpu_id.cc | 280 + .../aom/third_party/libyuv/source/mjpeg_decoder.cc | 585 + .../third_party/libyuv/source/mjpeg_validate.cc | 71 + .../third_party/libyuv/source/planar_functions.cc | 4107 ++++++ .../aom/third_party/libyuv/source/rotate.cc | 609 + .../aom/third_party/libyuv/source/rotate_any.cc | 79 + .../aom/third_party/libyuv/source/rotate_argb.cc | 243 + .../aom/third_party/libyuv/source/rotate_common.cc | 106 + .../aom/third_party/libyuv/source/rotate_gcc.cc | 374 + .../aom/third_party/libyuv/source/rotate_mips.cc | 484 + .../aom/third_party/libyuv/source/rotate_neon.cc | 418 + .../aom/third_party/libyuv/source/rotate_neon64.cc | 443 + .../aom/third_party/libyuv/source/rotate_win.cc | 252 + .../aom/third_party/libyuv/source/row_any.cc | 1562 +++ .../aom/third_party/libyuv/source/row_common.cc | 3849 ++++++ .../aom/third_party/libyuv/source/row_gcc.cc | 7175 +++++++++++ .../aom/third_party/libyuv/source/row_mips.cc | 911 ++ .../aom/third_party/libyuv/source/row_neon.cc | 3039 +++++ .../aom/third_party/libyuv/source/row_neon64.cc | 3387 +++++ .../aom/third_party/libyuv/source/row_win.cc | 6237 +++++++++ third_party/aom/third_party/libyuv/source/scale.cc | 1935 +++ .../aom/third_party/libyuv/source/scale_any.cc | 615 + .../aom/third_party/libyuv/source/scale_argb.cc | 1091 ++ .../aom/third_party/libyuv/source/scale_common.cc | 1564 +++ .../aom/third_party/libyuv/source/scale_gcc.cc | 1464 +++ .../aom/third_party/libyuv/source/scale_mips.cc | 654 + .../aom/third_party/libyuv/source/scale_neon.cc | 1016 ++ .../aom/third_party/libyuv/source/scale_neon64.cc | 1152 ++ .../aom/third_party/libyuv/source/scale_uv.cc | 891 ++ .../aom/third_party/libyuv/source/scale_win.cc | 1391 ++ .../aom/third_party/libyuv/source/video_common.cc | 62 + third_party/aom/third_party/vector/LICENSE | 19 + 
third_party/aom/third_party/vector/README.libaom | 16 + third_party/aom/third_party/vector/vector.c | 540 + third_party/aom/third_party/vector/vector.h | 138 + third_party/aom/third_party/x86inc/LICENSE | 18 + third_party/aom/third_party/x86inc/README.libaom | 19 + third_party/aom/third_party/x86inc/x86inc.asm | 1923 +++ third_party/aom/tools/aggregate_entropy_stats.py | 39 + third_party/aom/tools/aom_entropy_optimizer.c | 761 ++ .../aom/tools/auto_refactor/auto_refactor.py | 919 ++ .../aom/tools/auto_refactor/av1_preprocess.py | 113 + .../tools/auto_refactor/c_files/decl_status_code.c | 31 + .../aom/tools/auto_refactor/c_files/func_in_out.c | 208 + .../tools/auto_refactor/c_files/global_variable.c | 27 + .../aom/tools/auto_refactor/c_files/parse_lvalue.c | 46 + .../aom/tools/auto_refactor/c_files/simple_code.c | 64 + .../aom/tools/auto_refactor/c_files/struct_code.c | 49 + .../aom/tools/auto_refactor/test_auto_refactor.py | 675 + third_party/aom/tools/cpplint.py | 6244 +++++++++ third_party/aom/tools/diff.py | 132 + third_party/aom/tools/dump_obu.cc | 168 + .../aom/tools/frame_size_variation_analyzer.py | 74 + third_party/aom/tools/gen_authors.sh | 10 + third_party/aom/tools/gen_constrained_tokenset.py | 120 + third_party/aom/tools/gop_bitrate/analyze_data.py | 18 + .../aom/tools/gop_bitrate/encode_all_script.sh | 13 + .../tools/gop_bitrate/python/bitrate_accuracy.py | 185 + third_party/aom/tools/inspect-cli.js | 39 + third_party/aom/tools/inspect-post.js | 1 + third_party/aom/tools/intersect-diffs.py | 78 + third_party/aom/tools/lint-hunks.py | 150 + third_party/aom/tools/obu_parser.cc | 190 + third_party/aom/tools/obu_parser.h | 27 + .../ratectrl_log_analyzer/analyze_ratectrl_log.py | 154 + .../aom/tools/txfm_analyzer/txfm_gen_code.cc | 580 + third_party/aom/tools/txfm_analyzer/txfm_graph.cc | 943 ++ third_party/aom/tools/txfm_analyzer/txfm_graph.h | 160 + third_party/aom/tools/wrap-commit-msg.py | 72 + third_party/aom/usage.dox | 109 + third_party/aom/usage_cx.dox | 9 + third_party/aom/usage_dx.dox | 22 + 1345 files changed, 730066 insertions(+) create mode 100644 third_party/aom/.clang-format create mode 100644 third_party/aom/.cmake-format.py create mode 100644 third_party/aom/.mailmap create mode 100644 third_party/aom/AUTHORS create mode 100644 third_party/aom/CHANGELOG create mode 100644 third_party/aom/CMakeLists.txt create mode 100644 third_party/aom/LICENSE create mode 100644 third_party/aom/PATENTS create mode 100644 third_party/aom/README.md create mode 100644 third_party/aom/Sample.cfg create mode 100644 third_party/aom/aom/aom.h create mode 100644 third_party/aom/aom/aom_codec.h create mode 100644 third_party/aom/aom/aom_decoder.h create mode 100644 third_party/aom/aom/aom_encoder.h create mode 100644 third_party/aom/aom/aom_external_partition.h create mode 100644 third_party/aom/aom/aom_frame_buffer.h create mode 100644 third_party/aom/aom/aom_image.h create mode 100644 third_party/aom/aom/aom_integer.h create mode 100644 third_party/aom/aom/aomcx.h create mode 100644 third_party/aom/aom/aomdx.h create mode 100644 third_party/aom/aom/exports_com create mode 100644 third_party/aom/aom/exports_dec create mode 100644 third_party/aom/aom/exports_enc create mode 100644 third_party/aom/aom/exports_test create mode 100644 third_party/aom/aom/internal/aom_codec_internal.h create mode 100644 third_party/aom/aom/internal/aom_image_internal.h create mode 100644 third_party/aom/aom/src/aom_codec.c create mode 100644 third_party/aom/aom/src/aom_decoder.c create mode 100644 
third_party/aom/aom/src/aom_encoder.c create mode 100644 third_party/aom/aom/src/aom_image.c create mode 100644 third_party/aom/aom/src/aom_integer.c create mode 100644 third_party/aom/aom_dsp/aom_convolve.c create mode 100644 third_party/aom/aom_dsp/aom_dsp.cmake create mode 100644 third_party/aom/aom_dsp/aom_dsp_common.h create mode 100644 third_party/aom/aom_dsp/aom_dsp_rtcd.c create mode 100755 third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl create mode 100644 third_party/aom/aom_dsp/aom_filter.h create mode 100644 third_party/aom/aom_dsp/aom_simd.h create mode 100644 third_party/aom/aom_dsp/aom_simd_inline.h create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_neon.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c create mode 100644 third_party/aom/aom_dsp/arm/avg_neon.c create mode 100644 third_party/aom/aom_dsp/arm/avg_pred_neon.c create mode 100644 third_party/aom/aom_dsp/arm/avg_sve.c create mode 100644 third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c create mode 100644 third_party/aom/aom_dsp/arm/blend_neon.h create mode 100644 third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c create mode 100644 third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c create mode 100644 third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h create mode 100644 third_party/aom/aom_dsp/arm/dot_sve.h create mode 100644 third_party/aom/aom_dsp/arm/fwd_txfm_neon.c create mode 100644 third_party/aom/aom_dsp/arm/hadamard_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_avg_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_quantize_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_sad_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_sse_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_sse_sve.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_variance_neon.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c create mode 100644 third_party/aom/aom_dsp/arm/highbd_variance_sve.c create mode 100644 third_party/aom/aom_dsp/arm/intrapred_neon.c create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_neon.c create mode 100644 third_party/aom/aom_dsp/arm/masked_sad4d_neon.c create mode 100644 third_party/aom/aom_dsp/arm/masked_sad_neon.c create mode 100644 third_party/aom/aom_dsp/arm/mem_neon.h create mode 100644 third_party/aom/aom_dsp/arm/obmc_sad_neon.c create mode 100644 third_party/aom/aom_dsp/arm/obmc_variance_neon.c create mode 100644 
third_party/aom/aom_dsp/arm/reinterpret_neon.h create mode 100644 third_party/aom/aom_dsp/arm/sad_neon.c create mode 100644 third_party/aom/aom_dsp/arm/sad_neon_dotprod.c create mode 100644 third_party/aom/aom_dsp/arm/sadxd_neon.c create mode 100644 third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c create mode 100644 third_party/aom/aom_dsp/arm/sse_neon.c create mode 100644 third_party/aom/aom_dsp/arm/sse_neon_dotprod.c create mode 100644 third_party/aom/aom_dsp/arm/subpel_variance_neon.c create mode 100644 third_party/aom/aom_dsp/arm/subtract_neon.c create mode 100644 third_party/aom/aom_dsp/arm/sum_neon.h create mode 100644 third_party/aom/aom_dsp/arm/sum_squares_neon.c create mode 100644 third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c create mode 100644 third_party/aom/aom_dsp/arm/sum_squares_sve.c create mode 100644 third_party/aom/aom_dsp/arm/transpose_neon.h create mode 100644 third_party/aom/aom_dsp/arm/variance_neon.c create mode 100644 third_party/aom/aom_dsp/arm/variance_neon_dotprod.c create mode 100644 third_party/aom/aom_dsp/avg.c create mode 100644 third_party/aom/aom_dsp/binary_codes_reader.c create mode 100644 third_party/aom/aom_dsp/binary_codes_reader.h create mode 100644 third_party/aom/aom_dsp/binary_codes_writer.c create mode 100644 third_party/aom/aom_dsp/binary_codes_writer.h create mode 100644 third_party/aom/aom_dsp/bitreader.c create mode 100644 third_party/aom/aom_dsp/bitreader.h create mode 100644 third_party/aom/aom_dsp/bitreader_buffer.c create mode 100644 third_party/aom/aom_dsp/bitreader_buffer.h create mode 100644 third_party/aom/aom_dsp/bitwriter.c create mode 100644 third_party/aom/aom_dsp/bitwriter.h create mode 100644 third_party/aom/aom_dsp/bitwriter_buffer.c create mode 100644 third_party/aom/aom_dsp/bitwriter_buffer.h create mode 100644 third_party/aom/aom_dsp/blend.h create mode 100644 third_party/aom/aom_dsp/blend_a64_hmask.c create mode 100644 third_party/aom/aom_dsp/blend_a64_mask.c create mode 100644 third_party/aom/aom_dsp/blend_a64_vmask.c create mode 100644 third_party/aom/aom_dsp/blk_sse_sum.c create mode 100644 third_party/aom/aom_dsp/butteraugli.c create mode 100644 third_party/aom/aom_dsp/butteraugli.h create mode 100644 third_party/aom/aom_dsp/entcode.c create mode 100644 third_party/aom/aom_dsp/entcode.h create mode 100644 third_party/aom/aom_dsp/entdec.c create mode 100644 third_party/aom/aom_dsp/entdec.h create mode 100644 third_party/aom/aom_dsp/entenc.c create mode 100644 third_party/aom/aom_dsp/entenc.h create mode 100644 third_party/aom/aom_dsp/fastssim.c create mode 100644 third_party/aom/aom_dsp/fft.c create mode 100644 third_party/aom/aom_dsp/fft_common.h create mode 100644 third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c create mode 100644 third_party/aom/aom_dsp/flow_estimation/corner_detect.c create mode 100644 third_party/aom/aom_dsp/flow_estimation/corner_detect.h create mode 100644 third_party/aom/aom_dsp/flow_estimation/corner_match.c create mode 100644 third_party/aom/aom_dsp/flow_estimation/corner_match.h create mode 100644 third_party/aom/aom_dsp/flow_estimation/disflow.c create mode 100644 third_party/aom/aom_dsp/flow_estimation/disflow.h create mode 100644 third_party/aom/aom_dsp/flow_estimation/flow_estimation.c create mode 100644 third_party/aom/aom_dsp/flow_estimation/flow_estimation.h create mode 100644 third_party/aom/aom_dsp/flow_estimation/ransac.c create mode 100644 third_party/aom/aom_dsp/flow_estimation/ransac.h create mode 100644 
third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c create mode 100644 third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c create mode 100644 third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c create mode 100644 third_party/aom/aom_dsp/fwd_txfm.c create mode 100644 third_party/aom/aom_dsp/grain_params.h create mode 100644 third_party/aom/aom_dsp/grain_table.c create mode 100644 third_party/aom/aom_dsp/grain_table.h create mode 100644 third_party/aom/aom_dsp/intrapred.c create mode 100644 third_party/aom/aom_dsp/intrapred_common.h create mode 100644 third_party/aom/aom_dsp/loopfilter.c create mode 100644 third_party/aom/aom_dsp/mathutils.h create mode 100644 third_party/aom/aom_dsp/noise_model.c create mode 100644 third_party/aom/aom_dsp/noise_model.h create mode 100644 third_party/aom/aom_dsp/noise_util.c create mode 100644 third_party/aom/aom_dsp/noise_util.h create mode 100644 third_party/aom/aom_dsp/odintrin.c create mode 100644 third_party/aom/aom_dsp/odintrin.h create mode 100644 third_party/aom/aom_dsp/prob.h create mode 100644 third_party/aom/aom_dsp/psnr.c create mode 100644 third_party/aom/aom_dsp/psnr.h create mode 100644 third_party/aom/aom_dsp/psnrhvs.c create mode 100644 third_party/aom/aom_dsp/pyramid.c create mode 100644 third_party/aom/aom_dsp/pyramid.h create mode 100644 third_party/aom/aom_dsp/quantize.c create mode 100644 third_party/aom/aom_dsp/quantize.h create mode 100644 third_party/aom/aom_dsp/recenter.h create mode 100644 third_party/aom/aom_dsp/rect.h create mode 100644 third_party/aom/aom_dsp/sad.c create mode 100644 third_party/aom/aom_dsp/sad_av1.c create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics.h create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_c.h create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_c.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_c.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h create mode 100644 third_party/aom/aom_dsp/sse.c create mode 100644 third_party/aom/aom_dsp/ssim.c create mode 100644 third_party/aom/aom_dsp/ssim.h create mode 100644 third_party/aom/aom_dsp/subtract.c create mode 100644 third_party/aom/aom_dsp/sum_squares.c create mode 100644 third_party/aom/aom_dsp/txfm_common.h create mode 100644 third_party/aom/aom_dsp/variance.c create mode 100644 third_party/aom/aom_dsp/variance.h create mode 100644 third_party/aom/aom_dsp/vmaf.c create mode 100644 third_party/aom/aom_dsp/vmaf.h create mode 100644 third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/aom_asm_stubs.c create mode 100644 third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_quantize_avx.c create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c create mode 100644 
third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/avg_intrin_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/avg_intrin_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/avg_intrin_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/blend_mask_sse4.h create mode 100644 third_party/aom/aom_dsp/x86/blend_sse4.h create mode 100644 third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/common_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/convolve.h create mode 100644 third_party/aom/aom_dsp/x86/convolve_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/convolve_common_intrin.h create mode 100644 third_party/aom/aom_dsp/x86/convolve_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/convolve_sse4_1.h create mode 100644 third_party/aom/aom_dsp/x86/convolve_ssse3.h create mode 100644 third_party/aom/aom_dsp/x86/fft_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/fft_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_sad_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_sse2.c create mode 100644 
third_party/aom/aom_dsp/x86/highbd_variance_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/intrapred_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/intrapred_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/intrapred_utils.h create mode 100644 third_party/aom/aom_dsp/x86/intrapred_x86.h create mode 100644 third_party/aom/aom_dsp/x86/inv_wht_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/jnt_sad_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/loopfilter_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/loopfilter_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/lpf_common_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h create mode 100644 third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h create mode 100644 third_party/aom/aom_dsp/x86/mem_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h create mode 100644 third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h create mode 100644 third_party/aom/aom_dsp/x86/obmc_sad_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/obmc_sad_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/obmc_variance_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/obmc_variance_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/quantize_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/quantize_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/quantize_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/quantize_x86.h create mode 100644 third_party/aom/aom_dsp/x86/sad4d_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad4d_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/sad_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad_impl_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/sse_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sse_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/subtract_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/subtract_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/sum_squares_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sum_squares_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/sum_squares_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/synonyms.h create mode 100644 third_party/aom/aom_dsp/x86/synonyms_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/transpose_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/txfm_common_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/txfm_common_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/variance_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/variance_impl_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/variance_impl_ssse3.c create mode 100644 
third_party/aom/aom_dsp/x86/variance_sse2.c create mode 100644 third_party/aom/aom_mem/aom_mem.c create mode 100644 third_party/aom/aom_mem/aom_mem.cmake create mode 100644 third_party/aom/aom_mem/aom_mem.h create mode 100644 third_party/aom/aom_mem/include/aom_mem_intrnl.h create mode 100644 third_party/aom/aom_ports/aarch32_cpudetect.c create mode 100644 third_party/aom/aom_ports/aarch64_cpudetect.c create mode 100644 third_party/aom/aom_ports/aom_once.h create mode 100644 third_party/aom/aom_ports/aom_ports.cmake create mode 100644 third_party/aom/aom_ports/aom_timer.h create mode 100644 third_party/aom/aom_ports/arm.h create mode 100644 third_party/aom/aom_ports/arm_cpudetect.h create mode 100644 third_party/aom/aom_ports/bitops.h create mode 100644 third_party/aom/aom_ports/emmintrin_compat.h create mode 100644 third_party/aom/aom_ports/float.asm create mode 100644 third_party/aom/aom_ports/mem.h create mode 100644 third_party/aom/aom_ports/mem_ops.h create mode 100644 third_party/aom/aom_ports/mem_ops_aligned.h create mode 100644 third_party/aom/aom_ports/msvc.h create mode 100644 third_party/aom/aom_ports/ppc.h create mode 100644 third_party/aom/aom_ports/ppc_cpudetect.c create mode 100644 third_party/aom/aom_ports/sanitizer.h create mode 100644 third_party/aom/aom_ports/x86.h create mode 100644 third_party/aom/aom_ports/x86_abi_support.asm create mode 100644 third_party/aom/aom_scale/aom_scale.cmake create mode 100644 third_party/aom/aom_scale/aom_scale.h create mode 100644 third_party/aom/aom_scale/aom_scale_rtcd.c create mode 100644 third_party/aom/aom_scale/aom_scale_rtcd.pl create mode 100644 third_party/aom/aom_scale/generic/aom_scale.c create mode 100644 third_party/aom/aom_scale/generic/gen_scalers.c create mode 100644 third_party/aom/aom_scale/generic/yv12config.c create mode 100644 third_party/aom/aom_scale/generic/yv12extend.c create mode 100644 third_party/aom/aom_scale/yv12config.h create mode 100644 third_party/aom/aom_util/aom_thread.c create mode 100644 third_party/aom/aom_util/aom_thread.h create mode 100644 third_party/aom/aom_util/aom_util.cmake create mode 100644 third_party/aom/aom_util/debug_util.c create mode 100644 third_party/aom/aom_util/debug_util.h create mode 100644 third_party/aom/aom_util/endian_inl.h create mode 100644 third_party/aom/aomedia_logo_200.png create mode 100644 third_party/aom/apps/aomdec.c create mode 100644 third_party/aom/apps/aomenc.c create mode 100644 third_party/aom/apps/aomenc.h create mode 100644 third_party/aom/av1/arg_defs.c create mode 100644 third_party/aom/av1/arg_defs.h create mode 100644 third_party/aom/av1/av1.cmake create mode 100644 third_party/aom/av1/av1_cx_iface.c create mode 100644 third_party/aom/av1/av1_cx_iface.h create mode 100644 third_party/aom/av1/av1_dx_iface.c create mode 100644 third_party/aom/av1/av1_iface_common.h create mode 100644 third_party/aom/av1/common/alloccommon.c create mode 100644 third_party/aom/av1/common/alloccommon.h create mode 100644 third_party/aom/av1/common/arm/av1_inv_txfm_neon.c create mode 100644 third_party/aom/av1/common/arm/av1_inv_txfm_neon.h create mode 100644 third_party/aom/av1/common/arm/av1_txfm_neon.c create mode 100644 third_party/aom/av1/common/arm/blend_a64_hmask_neon.c create mode 100644 third_party/aom/av1/common/arm/blend_a64_vmask_neon.c create mode 100644 third_party/aom/av1/common/arm/cdef_block_neon.c create mode 100644 third_party/aom/av1/common/arm/cfl_neon.c create mode 100644 third_party/aom/av1/common/arm/compound_convolve_neon.c create mode 100644 
third_party/aom/av1/common/arm/compound_convolve_neon.h create mode 100644 third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c create mode 100644 third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c create mode 100644 third_party/aom/av1/common/arm/convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/convolve_neon.h create mode 100644 third_party/aom/av1/common/arm/convolve_neon_dotprod.c create mode 100644 third_party/aom/av1/common/arm/convolve_neon_i8mm.c create mode 100644 third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_convolve_neon.h create mode 100644 third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_reconinter_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_reconintra_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_warp_plane_neon.c create mode 100644 third_party/aom/av1/common/arm/highbd_warp_plane_neon.h create mode 100644 third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c create mode 100644 third_party/aom/av1/common/arm/reconinter_neon.c create mode 100644 third_party/aom/av1/common/arm/reconintra_neon.c create mode 100644 third_party/aom/av1/common/arm/resize_neon.c create mode 100644 third_party/aom/av1/common/arm/selfguided_neon.c create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon.c create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon.h create mode 100644 third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c create mode 100644 third_party/aom/av1/common/arm/warp_plane_sve.c create mode 100644 third_party/aom/av1/common/arm/wiener_convolve_neon.c create mode 100644 third_party/aom/av1/common/av1_common_int.h create mode 100644 third_party/aom/av1/common/av1_inv_txfm1d.c create mode 100644 third_party/aom/av1/common/av1_inv_txfm1d.h create mode 100644 third_party/aom/av1/common/av1_inv_txfm1d_cfg.h create mode 100644 third_party/aom/av1/common/av1_inv_txfm2d.c create mode 100644 third_party/aom/av1/common/av1_loopfilter.c create mode 100644 third_party/aom/av1/common/av1_loopfilter.h create mode 100644 third_party/aom/av1/common/av1_rtcd.c create mode 100644 third_party/aom/av1/common/av1_rtcd_defs.pl create mode 100644 third_party/aom/av1/common/av1_txfm.c create mode 100644 third_party/aom/av1/common/av1_txfm.h create mode 100644 third_party/aom/av1/common/blockd.c create mode 100644 third_party/aom/av1/common/blockd.h create mode 100644 third_party/aom/av1/common/cdef.c create mode 100644 third_party/aom/av1/common/cdef.h create mode 100644 third_party/aom/av1/common/cdef_block.c create mode 100644 third_party/aom/av1/common/cdef_block.h create mode 100644 third_party/aom/av1/common/cdef_block_simd.h create mode 100644 third_party/aom/av1/common/cfl.c create mode 100644 third_party/aom/av1/common/cfl.h create mode 100644 third_party/aom/av1/common/common.h create mode 100644 third_party/aom/av1/common/common_data.c create mode 100644 third_party/aom/av1/common/common_data.h create mode 100644 third_party/aom/av1/common/convolve.c create mode 100644 third_party/aom/av1/common/convolve.h create mode 100644 third_party/aom/av1/common/debugmodes.c create mode 100644 third_party/aom/av1/common/entropy.c create mode 
100644 third_party/aom/av1/common/entropy.h create mode 100644 third_party/aom/av1/common/entropymode.c create mode 100644 third_party/aom/av1/common/entropymode.h create mode 100644 third_party/aom/av1/common/entropymv.c create mode 100644 third_party/aom/av1/common/entropymv.h create mode 100644 third_party/aom/av1/common/enums.h create mode 100644 third_party/aom/av1/common/filter.h create mode 100644 third_party/aom/av1/common/frame_buffers.c create mode 100644 third_party/aom/av1/common/frame_buffers.h create mode 100644 third_party/aom/av1/common/idct.c create mode 100644 third_party/aom/av1/common/idct.h create mode 100644 third_party/aom/av1/common/mv.h create mode 100644 third_party/aom/av1/common/mvref_common.c create mode 100644 third_party/aom/av1/common/mvref_common.h create mode 100644 third_party/aom/av1/common/obmc.h create mode 100644 third_party/aom/av1/common/obu_util.c create mode 100644 third_party/aom/av1/common/obu_util.h create mode 100644 third_party/aom/av1/common/ppc/cfl_ppc.c create mode 100644 third_party/aom/av1/common/pred_common.c create mode 100644 third_party/aom/av1/common/pred_common.h create mode 100644 third_party/aom/av1/common/quant_common.c create mode 100644 third_party/aom/av1/common/quant_common.h create mode 100644 third_party/aom/av1/common/reconinter.c create mode 100644 third_party/aom/av1/common/reconinter.h create mode 100644 third_party/aom/av1/common/reconinter_template.inc create mode 100644 third_party/aom/av1/common/reconintra.c create mode 100644 third_party/aom/av1/common/reconintra.h create mode 100644 third_party/aom/av1/common/resize.c create mode 100644 third_party/aom/av1/common/resize.h create mode 100644 third_party/aom/av1/common/restoration.c create mode 100644 third_party/aom/av1/common/restoration.h create mode 100644 third_party/aom/av1/common/scale.c create mode 100644 third_party/aom/av1/common/scale.h create mode 100644 third_party/aom/av1/common/scan.c create mode 100644 third_party/aom/av1/common/scan.h create mode 100644 third_party/aom/av1/common/seg_common.c create mode 100644 third_party/aom/av1/common/seg_common.h create mode 100644 third_party/aom/av1/common/thread_common.c create mode 100644 third_party/aom/av1/common/thread_common.h create mode 100644 third_party/aom/av1/common/tile_common.c create mode 100644 third_party/aom/av1/common/tile_common.h create mode 100644 third_party/aom/av1/common/timing.c create mode 100644 third_party/aom/av1/common/timing.h create mode 100644 third_party/aom/av1/common/token_cdfs.h create mode 100644 third_party/aom/av1/common/txb_common.c create mode 100644 third_party/aom/av1/common/txb_common.h create mode 100644 third_party/aom/av1/common/warped_motion.c create mode 100644 third_party/aom/av1/common/warped_motion.h create mode 100644 third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c create mode 100644 third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c create mode 100644 third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h create mode 100644 third_party/aom/av1/common/x86/av1_txfm_sse2.h create mode 100644 third_party/aom/av1/common/x86/av1_txfm_sse4.c create mode 100644 third_party/aom/av1/common/x86/av1_txfm_sse4.h create mode 100644 third_party/aom/av1/common/x86/cdef_block_avx2.c create mode 100644 third_party/aom/av1/common/x86/cdef_block_sse2.c 
create mode 100644 third_party/aom/av1/common/x86/cdef_block_sse4.c create mode 100644 third_party/aom/av1/common/x86/cdef_block_ssse3.c create mode 100644 third_party/aom/av1/common/x86/cfl_avx2.c create mode 100644 third_party/aom/av1/common/x86/cfl_simd.h create mode 100644 third_party/aom/av1/common/x86/cfl_sse2.c create mode 100644 third_party/aom/av1/common/x86/cfl_ssse3.c create mode 100644 third_party/aom/av1/common/x86/convolve_2d_avx2.c create mode 100644 third_party/aom/av1/common/x86/convolve_2d_sse2.c create mode 100644 third_party/aom/av1/common/x86/convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/convolve_sse2.c create mode 100644 third_party/aom/av1/common/x86/filterintra_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c create mode 100644 third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h create mode 100644 third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c create mode 100644 third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c create mode 100644 third_party/aom/av1/common/x86/intra_edge_sse4.c create mode 100644 third_party/aom/av1/common/x86/jnt_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/jnt_convolve_sse2.c create mode 100644 third_party/aom/av1/common/x86/jnt_convolve_ssse3.c create mode 100644 third_party/aom/av1/common/x86/reconinter_avx2.c create mode 100644 third_party/aom/av1/common/x86/reconinter_sse4.c create mode 100644 third_party/aom/av1/common/x86/reconinter_ssse3.c create mode 100644 third_party/aom/av1/common/x86/resize_ssse3.c create mode 100644 third_party/aom/av1/common/x86/selfguided_avx2.c create mode 100644 third_party/aom/av1/common/x86/selfguided_sse4.c create mode 100644 third_party/aom/av1/common/x86/warp_plane_avx2.c create mode 100644 third_party/aom/av1/common/x86/warp_plane_sse4.c create mode 100644 third_party/aom/av1/common/x86/wiener_convolve_avx2.c create mode 100644 third_party/aom/av1/common/x86/wiener_convolve_sse2.c create mode 100644 third_party/aom/av1/decoder/accounting.c create mode 100644 third_party/aom/av1/decoder/accounting.h create mode 100644 third_party/aom/av1/decoder/decodeframe.c create mode 100644 third_party/aom/av1/decoder/decodeframe.h create mode 100644 third_party/aom/av1/decoder/decodemv.c create mode 100644 third_party/aom/av1/decoder/decodemv.h create mode 100644 third_party/aom/av1/decoder/decoder.c create mode 100644 third_party/aom/av1/decoder/decoder.h create mode 100644 third_party/aom/av1/decoder/decodetxb.c create mode 100644 third_party/aom/av1/decoder/decodetxb.h create mode 100644 third_party/aom/av1/decoder/detokenize.c create mode 100644 third_party/aom/av1/decoder/detokenize.h create mode 100644 third_party/aom/av1/decoder/dthread.h create mode 100644 third_party/aom/av1/decoder/grain_synthesis.c create mode 100644 third_party/aom/av1/decoder/grain_synthesis.h create mode 100644 
third_party/aom/av1/decoder/inspection.c create mode 100644 third_party/aom/av1/decoder/inspection.h create mode 100644 third_party/aom/av1/decoder/obu.c create mode 100644 third_party/aom/av1/decoder/obu.h create mode 100644 third_party/aom/av1/encoder/allintra_vis.c create mode 100644 third_party/aom/av1/encoder/allintra_vis.h create mode 100644 third_party/aom/av1/encoder/aq_complexity.c create mode 100644 third_party/aom/av1/encoder/aq_complexity.h create mode 100644 third_party/aom/av1/encoder/aq_cyclicrefresh.c create mode 100644 third_party/aom/av1/encoder/aq_cyclicrefresh.h create mode 100644 third_party/aom/av1/encoder/aq_variance.c create mode 100644 third_party/aom/av1/encoder/aq_variance.h create mode 100644 third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_error_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_error_sve.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/cnn_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/ml_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/pickrst_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/pickrst_neon.h create mode 100644 third_party/aom/av1/encoder/arm/neon/quantize_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/rdopt_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/shift_neon.h create mode 100644 third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c create mode 100644 third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c create mode 100644 third_party/aom/av1/encoder/arm/neon/txfm_neon.h create mode 100644 third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm1d.c create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm1d.h create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h create mode 100644 third_party/aom/av1/encoder/av1_fwd_txfm2d.c create mode 100644 third_party/aom/av1/encoder/av1_ml_partition_models.h create mode 100644 third_party/aom/av1/encoder/av1_noise_estimate.c create mode 100644 third_party/aom/av1/encoder/av1_noise_estimate.h create mode 100644 third_party/aom/av1/encoder/av1_quantize.c create mode 100644 third_party/aom/av1/encoder/av1_quantize.h create mode 100644 third_party/aom/av1/encoder/av1_temporal_denoiser.c create mode 100644 third_party/aom/av1/encoder/av1_temporal_denoiser.h create mode 100644 third_party/aom/av1/encoder/bitstream.c create mode 100644 third_party/aom/av1/encoder/bitstream.h create mode 100644 third_party/aom/av1/encoder/block.h create mode 100644 
third_party/aom/av1/encoder/blockiness.c create mode 100644 third_party/aom/av1/encoder/cnn.c create mode 100644 third_party/aom/av1/encoder/cnn.h create mode 100644 third_party/aom/av1/encoder/compound_type.c create mode 100644 third_party/aom/av1/encoder/compound_type.h create mode 100644 third_party/aom/av1/encoder/context_tree.c create mode 100644 third_party/aom/av1/encoder/context_tree.h create mode 100644 third_party/aom/av1/encoder/cost.c create mode 100644 third_party/aom/av1/encoder/cost.h create mode 100644 third_party/aom/av1/encoder/deltaq4_model.c create mode 100644 third_party/aom/av1/encoder/dwt.c create mode 100644 third_party/aom/av1/encoder/dwt.h create mode 100644 third_party/aom/av1/encoder/enc_enums.h create mode 100644 third_party/aom/av1/encoder/encode_strategy.c create mode 100644 third_party/aom/av1/encoder/encode_strategy.h create mode 100644 third_party/aom/av1/encoder/encodeframe.c create mode 100644 third_party/aom/av1/encoder/encodeframe.h create mode 100644 third_party/aom/av1/encoder/encodeframe_utils.c create mode 100644 third_party/aom/av1/encoder/encodeframe_utils.h create mode 100644 third_party/aom/av1/encoder/encodemb.c create mode 100644 third_party/aom/av1/encoder/encodemb.h create mode 100644 third_party/aom/av1/encoder/encodemv.c create mode 100644 third_party/aom/av1/encoder/encodemv.h create mode 100644 third_party/aom/av1/encoder/encoder.c create mode 100644 third_party/aom/av1/encoder/encoder.h create mode 100644 third_party/aom/av1/encoder/encoder_alloc.h create mode 100644 third_party/aom/av1/encoder/encoder_utils.c create mode 100644 third_party/aom/av1/encoder/encoder_utils.h create mode 100644 third_party/aom/av1/encoder/encodetxb.c create mode 100644 third_party/aom/av1/encoder/encodetxb.h create mode 100644 third_party/aom/av1/encoder/ethread.c create mode 100644 third_party/aom/av1/encoder/ethread.h create mode 100644 third_party/aom/av1/encoder/extend.c create mode 100644 third_party/aom/av1/encoder/extend.h create mode 100644 third_party/aom/av1/encoder/external_partition.c create mode 100644 third_party/aom/av1/encoder/external_partition.h create mode 100644 third_party/aom/av1/encoder/firstpass.c create mode 100644 third_party/aom/av1/encoder/firstpass.h create mode 100644 third_party/aom/av1/encoder/global_motion.c create mode 100644 third_party/aom/av1/encoder/global_motion.h create mode 100644 third_party/aom/av1/encoder/global_motion_facade.c create mode 100644 third_party/aom/av1/encoder/global_motion_facade.h create mode 100644 third_party/aom/av1/encoder/gop_structure.c create mode 100644 third_party/aom/av1/encoder/gop_structure.h create mode 100644 third_party/aom/av1/encoder/grain_test_vectors.h create mode 100644 third_party/aom/av1/encoder/hash.c create mode 100644 third_party/aom/av1/encoder/hash.h create mode 100644 third_party/aom/av1/encoder/hash_motion.c create mode 100644 third_party/aom/av1/encoder/hash_motion.h create mode 100644 third_party/aom/av1/encoder/hybrid_fwd_txfm.c create mode 100644 third_party/aom/av1/encoder/hybrid_fwd_txfm.h create mode 100644 third_party/aom/av1/encoder/interp_search.c create mode 100644 third_party/aom/av1/encoder/interp_search.h create mode 100644 third_party/aom/av1/encoder/intra_mode_search.c create mode 100644 third_party/aom/av1/encoder/intra_mode_search.h create mode 100644 third_party/aom/av1/encoder/intra_mode_search_utils.h create mode 100644 third_party/aom/av1/encoder/k_means_template.h create mode 100644 third_party/aom/av1/encoder/level.c create mode 100644 
third_party/aom/av1/encoder/level.h create mode 100644 third_party/aom/av1/encoder/lookahead.c create mode 100644 third_party/aom/av1/encoder/lookahead.h create mode 100644 third_party/aom/av1/encoder/mcomp.c create mode 100644 third_party/aom/av1/encoder/mcomp.h create mode 100644 third_party/aom/av1/encoder/mcomp_structs.h create mode 100644 third_party/aom/av1/encoder/misc_model_weights.h create mode 100644 third_party/aom/av1/encoder/ml.c create mode 100644 third_party/aom/av1/encoder/ml.h create mode 100644 third_party/aom/av1/encoder/mode_prune_model_weights.h create mode 100644 third_party/aom/av1/encoder/model_rd.h create mode 100644 third_party/aom/av1/encoder/motion_search_facade.c create mode 100644 third_party/aom/av1/encoder/motion_search_facade.h create mode 100644 third_party/aom/av1/encoder/mv_prec.c create mode 100644 third_party/aom/av1/encoder/mv_prec.h create mode 100644 third_party/aom/av1/encoder/nonrd_opt.c create mode 100644 third_party/aom/av1/encoder/nonrd_opt.h create mode 100644 third_party/aom/av1/encoder/nonrd_pickmode.c create mode 100644 third_party/aom/av1/encoder/optical_flow.c create mode 100644 third_party/aom/av1/encoder/optical_flow.h create mode 100644 third_party/aom/av1/encoder/palette.c create mode 100644 third_party/aom/av1/encoder/palette.h create mode 100644 third_party/aom/av1/encoder/partition_cnn_weights.h create mode 100644 third_party/aom/av1/encoder/partition_model_weights.h create mode 100644 third_party/aom/av1/encoder/partition_search.c create mode 100644 third_party/aom/av1/encoder/partition_search.h create mode 100644 third_party/aom/av1/encoder/partition_strategy.c create mode 100644 third_party/aom/av1/encoder/partition_strategy.h create mode 100644 third_party/aom/av1/encoder/pass2_strategy.c create mode 100644 third_party/aom/av1/encoder/pass2_strategy.h create mode 100644 third_party/aom/av1/encoder/pickcdef.c create mode 100644 third_party/aom/av1/encoder/pickcdef.h create mode 100644 third_party/aom/av1/encoder/picklpf.c create mode 100644 third_party/aom/av1/encoder/picklpf.h create mode 100644 third_party/aom/av1/encoder/pickrst.c create mode 100644 third_party/aom/av1/encoder/pickrst.h create mode 100644 third_party/aom/av1/encoder/pustats.h create mode 100644 third_party/aom/av1/encoder/random.h create mode 100644 third_party/aom/av1/encoder/ratectrl.c create mode 100644 third_party/aom/av1/encoder/ratectrl.h create mode 100644 third_party/aom/av1/encoder/rc_utils.h create mode 100644 third_party/aom/av1/encoder/rd.c create mode 100644 third_party/aom/av1/encoder/rd.h create mode 100644 third_party/aom/av1/encoder/rdopt.c create mode 100644 third_party/aom/av1/encoder/rdopt.h create mode 100644 third_party/aom/av1/encoder/rdopt_data_defs.h create mode 100644 third_party/aom/av1/encoder/rdopt_utils.h create mode 100644 third_party/aom/av1/encoder/reconinter_enc.c create mode 100644 third_party/aom/av1/encoder/reconinter_enc.h create mode 100644 third_party/aom/av1/encoder/saliency_map.c create mode 100644 third_party/aom/av1/encoder/saliency_map.h create mode 100644 third_party/aom/av1/encoder/segmentation.c create mode 100644 third_party/aom/av1/encoder/segmentation.h create mode 100644 third_party/aom/av1/encoder/sorting_network.h create mode 100644 third_party/aom/av1/encoder/sparse_linear_solver.c create mode 100644 third_party/aom/av1/encoder/sparse_linear_solver.h create mode 100644 third_party/aom/av1/encoder/speed_features.c create mode 100644 third_party/aom/av1/encoder/speed_features.h create mode 100644 
third_party/aom/av1/encoder/superres_scale.c create mode 100644 third_party/aom/av1/encoder/superres_scale.h create mode 100644 third_party/aom/av1/encoder/svc_layercontext.c create mode 100644 third_party/aom/av1/encoder/svc_layercontext.h create mode 100644 third_party/aom/av1/encoder/temporal_filter.c create mode 100644 third_party/aom/av1/encoder/temporal_filter.h create mode 100644 third_party/aom/av1/encoder/thirdpass.c create mode 100644 third_party/aom/av1/encoder/thirdpass.h create mode 100644 third_party/aom/av1/encoder/tokenize.c create mode 100644 third_party/aom/av1/encoder/tokenize.h create mode 100644 third_party/aom/av1/encoder/tpl_model.c create mode 100644 third_party/aom/av1/encoder/tpl_model.h create mode 100644 third_party/aom/av1/encoder/tune_butteraugli.c create mode 100644 third_party/aom/av1/encoder/tune_butteraugli.h create mode 100644 third_party/aom/av1/encoder/tune_vmaf.c create mode 100644 third_party/aom/av1/encoder/tune_vmaf.h create mode 100644 third_party/aom/av1/encoder/tx_prune_model_weights.h create mode 100644 third_party/aom/av1/encoder/tx_search.c create mode 100644 third_party/aom/av1/encoder/tx_search.h create mode 100644 third_party/aom/av1/encoder/txb_rdopt.c create mode 100644 third_party/aom/av1/encoder/txb_rdopt.h create mode 100644 third_party/aom/av1/encoder/txb_rdopt_utils.h create mode 100644 third_party/aom/av1/encoder/var_based_part.c create mode 100644 third_party/aom/av1/encoder/var_based_part.h create mode 100644 third_party/aom/av1/encoder/wedge_utils.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h create mode 100644 third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/av1_k_means_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_k_means_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_quantize_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_quantize_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm create mode 100644 third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm create mode 100644 third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h create mode 100644 third_party/aom/av1/encoder/x86/cnn_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/dct_sse2.asm create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/encodetxb_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/error_intrin_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/error_intrin_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/error_sse2.asm create mode 100644 third_party/aom/av1/encoder/x86/hash_sse42.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c 
create mode 100644 third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/ml_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/ml_sse3.c create mode 100644 third_party/aom/av1/encoder/x86/ml_sse3.h create mode 100644 third_party/aom/av1/encoder/x86/pickrst_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/pickrst_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/rdopt_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/rdopt_sse4.c create mode 100644 third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c create mode 100644 third_party/aom/av1/encoder/x86/temporal_filter_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/temporal_filter_sse2.c create mode 100644 third_party/aom/av1/encoder/x86/wedge_utils_avx2.c create mode 100644 third_party/aom/av1/encoder/x86/wedge_utils_sse2.c create mode 100644 third_party/aom/av1/exports_com create mode 100644 third_party/aom/av1/exports_dec create mode 100644 third_party/aom/av1/exports_enc create mode 100644 third_party/aom/av1/exports_ident create mode 100644 third_party/aom/av1/exports_test create mode 100644 third_party/aom/av1/ratectrl_rtc.cc create mode 100644 third_party/aom/av1/ratectrl_rtc.h create mode 100644 third_party/aom/build/cmake/aom_config.c.template create mode 100644 third_party/aom/build/cmake/aom_config_defaults.cmake create mode 100644 third_party/aom/build/cmake/aom_configure.cmake create mode 100644 third_party/aom/build/cmake/aom_experiment_deps.cmake create mode 100644 third_party/aom/build/cmake/aom_install.cmake create mode 100644 third_party/aom/build/cmake/aom_optimization.cmake create mode 100644 third_party/aom/build/cmake/compiler_flags.cmake create mode 100644 third_party/aom/build/cmake/compiler_tests.cmake create mode 100644 third_party/aom/build/cmake/cpu.cmake create mode 100644 third_party/aom/build/cmake/dist.cmake create mode 100644 third_party/aom/build/cmake/exports.cmake create mode 100644 third_party/aom/build/cmake/exports_sources.cmake create mode 100644 third_party/aom/build/cmake/generate_aom_config_templates.cmake create mode 100644 third_party/aom/build/cmake/generate_exports.cmake create mode 100644 third_party/aom/build/cmake/pkg_config.cmake create mode 100755 third_party/aom/build/cmake/rtcd.pl create mode 100644 third_party/aom/build/cmake/sanitizers.cmake create mode 100644 third_party/aom/build/cmake/toolchains/android.cmake create mode 100644 third_party/aom/build/cmake/toolchains/arm-ios-common.cmake create mode 100644 third_party/aom/build/cmake/toolchains/arm64-ios.cmake create mode 100644 third_party/aom/build/cmake/toolchains/arm64-linux-clang.cmake create mode 100644 third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake create mode 100644 third_party/aom/build/cmake/toolchains/arm64-macos.cmake create mode 100644 third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake create mode 100644 third_party/aom/build/cmake/toolchains/armv7-ios.cmake create mode 100644 third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake create mode 100644 third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake create mode 100644 third_party/aom/build/cmake/toolchains/armv7s-ios.cmake create mode 100644 third_party/aom/build/cmake/toolchains/i686-linux-gcc.cmake create 
mode 100644 third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake create mode 100644 third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake create mode 100644 third_party/aom/build/cmake/toolchains/riscv-linux-gcc.cmake create mode 100644 third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake create mode 100644 third_party/aom/build/cmake/toolchains/x86-linux.cmake create mode 100644 third_party/aom/build/cmake/toolchains/x86-macos.cmake create mode 100644 third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake create mode 100644 third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake create mode 100644 third_party/aom/build/cmake/toolchains/x86_64-macos.cmake create mode 100644 third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake create mode 100644 third_party/aom/build/cmake/util.cmake create mode 100644 third_party/aom/build/cmake/version.cmake create mode 100755 third_party/aom/build/cmake/version.pl create mode 100644 third_party/aom/codereview.settings create mode 100644 third_party/aom/common/args.c create mode 100644 third_party/aom/common/args.h create mode 100644 third_party/aom/common/args_helper.c create mode 100644 third_party/aom/common/args_helper.h create mode 100644 third_party/aom/common/av1_config.c create mode 100644 third_party/aom/common/av1_config.h create mode 100644 third_party/aom/common/ivf_dec.cmake create mode 100644 third_party/aom/common/ivfdec.c create mode 100644 third_party/aom/common/ivfdec.h create mode 100644 third_party/aom/common/ivfenc.c create mode 100644 third_party/aom/common/ivfenc.h create mode 100644 third_party/aom/common/md5_utils.c create mode 100644 third_party/aom/common/md5_utils.h create mode 100644 third_party/aom/common/obudec.c create mode 100644 third_party/aom/common/obudec.h create mode 100644 third_party/aom/common/rawenc.c create mode 100644 third_party/aom/common/rawenc.h create mode 100644 third_party/aom/common/tools_common.c create mode 100644 third_party/aom/common/tools_common.h create mode 100644 third_party/aom/common/video_common.h create mode 100644 third_party/aom/common/video_reader.c create mode 100644 third_party/aom/common/video_reader.h create mode 100644 third_party/aom/common/video_writer.c create mode 100644 third_party/aom/common/video_writer.h create mode 100644 third_party/aom/common/warnings.c create mode 100644 third_party/aom/common/warnings.h create mode 100644 third_party/aom/common/webmdec.cc create mode 100644 third_party/aom/common/webmdec.h create mode 100644 third_party/aom/common/webmenc.cc create mode 100644 third_party/aom/common/webmenc.h create mode 100644 third_party/aom/common/y4menc.c create mode 100644 third_party/aom/common/y4menc.h create mode 100644 third_party/aom/common/y4minput.c create mode 100644 third_party/aom/common/y4minput.h create mode 100644 third_party/aom/doc/AlgorithmDescription.md create mode 100644 third_party/aom/doc/dev_guide/av1_decoder.dox create mode 100644 third_party/aom/doc/dev_guide/av1_encoder.dox create mode 100644 third_party/aom/doc/dev_guide/av1encoderflow.png create mode 100644 third_party/aom/doc/dev_guide/av1partitions.png create mode 100644 third_party/aom/doc/dev_guide/coeff_coding.png create mode 100644 third_party/aom/doc/dev_guide/filter_flow.png create mode 100644 third_party/aom/doc/dev_guide/filter_thr.png create mode 100644 third_party/aom/doc/dev_guide/genericcodecflow.png create mode 100644 third_party/aom/doc/dev_guide/gf_group.png create mode 100644 third_party/aom/doc/dev_guide/partition.png 
create mode 100644 third_party/aom/doc/dev_guide/tplgfgroupdiagram.png create mode 100644 third_party/aom/doc/img/edge_direction.svg create mode 100644 third_party/aom/doc/img/equ_dir_search.svg create mode 100644 third_party/aom/doc/img/equ_dual_self_guided.svg create mode 100644 third_party/aom/doc/img/equ_dual_self_para.svg create mode 100644 third_party/aom/doc/img/equ_edge_direction.svg create mode 100644 third_party/aom/doc/img/equ_guided_filter.svg create mode 100644 third_party/aom/doc/img/equ_wiener_filter.svg create mode 100644 third_party/aom/doc/img/inter_motion_field.svg create mode 100644 third_party/aom/doc/img/inter_obmc.svg create mode 100644 third_party/aom/doc/img/inter_spatial_mvp.svg create mode 100644 third_party/aom/doc/img/inter_tmvp_positions.svg create mode 100644 third_party/aom/doc/img/inter_tx_partition.svg create mode 100644 third_party/aom/doc/img/intra_cfl.svg create mode 100644 third_party/aom/doc/img/intra_directional.svg create mode 100644 third_party/aom/doc/img/intra_paeth.svg create mode 100644 third_party/aom/doc/img/intra_recursive.svg create mode 100644 third_party/aom/doc/img/intra_tx_partition.svg create mode 100644 third_party/aom/doc/img/loop_restoration.svg create mode 100644 third_party/aom/doc/img/partition_codingblock.svg create mode 100644 third_party/aom/doc/img/primary_tap.svg create mode 100644 third_party/aom/doc/img/quant_ac.svg create mode 100644 third_party/aom/doc/img/quant_dc.svg create mode 100644 third_party/aom/doc/img/scc_intrabc.svg create mode 100644 third_party/aom/doc/img/secondary_tap.svg create mode 100644 third_party/aom/doc/img/tx_basis.svg create mode 100644 third_party/aom/doc/img/tx_cands_large.svg create mode 100644 third_party/aom/doc/img/tx_cands_small.svg create mode 100644 third_party/aom/doc/img/tx_chroma.svg create mode 100644 third_party/aom/doc/img/tx_partition.svg create mode 100644 third_party/aom/doc/img/tx_set.svg create mode 100644 third_party/aom/docs.cmake create mode 100644 third_party/aom/examples/analyzer.cc create mode 100644 third_party/aom/examples/aom_cx_set_ref.c create mode 100644 third_party/aom/examples/av1_dec_fuzzer.cc create mode 100644 third_party/aom/examples/av1_dec_fuzzer.dict create mode 100755 third_party/aom/examples/build_av1_dec_fuzzer.sh create mode 100644 third_party/aom/examples/decode_to_md5.c create mode 100644 third_party/aom/examples/decode_with_drops.c create mode 100644 third_party/aom/examples/encoder_util.c create mode 100644 third_party/aom/examples/encoder_util.h create mode 100644 third_party/aom/examples/inspect.c create mode 100644 third_party/aom/examples/lightfield_bitstream_parsing.c create mode 100644 third_party/aom/examples/lightfield_decoder.c create mode 100644 third_party/aom/examples/lightfield_encoder.c create mode 100644 third_party/aom/examples/lightfield_tile_list_decoder.c create mode 100644 third_party/aom/examples/lossless_encoder.c create mode 100644 third_party/aom/examples/noise_model.c create mode 100644 third_party/aom/examples/photon_noise_table.c create mode 100644 third_party/aom/examples/scalable_decoder.c create mode 100644 third_party/aom/examples/scalable_encoder.c create mode 100644 third_party/aom/examples/set_maps.c create mode 100644 third_party/aom/examples/simple_decoder.c create mode 100644 third_party/aom/examples/simple_encoder.c create mode 100644 third_party/aom/examples/svc_encoder_rtc.cc create mode 100644 third_party/aom/examples/twopass_encoder.c create mode 100644 third_party/aom/keywords.dox create mode 100644 
third_party/aom/libs.doxy_template create mode 100644 third_party/aom/mainpage.dox create mode 100644 third_party/aom/stats/aomstats.c create mode 100644 third_party/aom/stats/aomstats.h create mode 100644 third_party/aom/stats/rate_hist.c create mode 100644 third_party/aom/stats/rate_hist.h create mode 100644 third_party/aom/test/accounting_test.cc create mode 100644 third_party/aom/test/acm_random.h create mode 100644 third_party/aom/test/active_map_test.cc create mode 100644 third_party/aom/test/allintra_end_to_end_test.cc create mode 100644 third_party/aom/test/altref_test.cc create mode 100644 third_party/aom/test/aom_image_test.cc create mode 100644 third_party/aom/test/aom_integer_test.cc create mode 100644 third_party/aom/test/aom_mem_test.cc create mode 100755 third_party/aom/test/aomcx_set_ref.sh create mode 100755 third_party/aom/test/aomdec.sh create mode 100755 third_party/aom/test/aomenc.sh create mode 100644 third_party/aom/test/aq_segment_test.cc create mode 100644 third_party/aom/test/arf_freq_test.cc create mode 100755 third_party/aom/test/av1_c_vs_simd_encode.sh create mode 100644 third_party/aom/test/av1_common_int_test.cc create mode 100644 third_party/aom/test/av1_config_test.cc create mode 100644 third_party/aom/test/av1_convolve_scale_test.cc create mode 100644 third_party/aom/test/av1_convolve_test.cc create mode 100644 third_party/aom/test/av1_encoder_parms_get_to_decoder.cc create mode 100644 third_party/aom/test/av1_ext_tile_test.cc create mode 100644 third_party/aom/test/av1_external_partition_test.cc create mode 100644 third_party/aom/test/av1_fwd_txfm1d_test.cc create mode 100644 third_party/aom/test/av1_fwd_txfm2d_test.cc create mode 100644 third_party/aom/test/av1_highbd_iht_test.cc create mode 100644 third_party/aom/test/av1_horz_only_frame_superres_test.cc create mode 100644 third_party/aom/test/av1_inv_txfm1d_test.cc create mode 100644 third_party/aom/test/av1_inv_txfm2d_test.cc create mode 100644 third_party/aom/test/av1_k_means_test.cc create mode 100644 third_party/aom/test/av1_key_value_api_test.cc create mode 100644 third_party/aom/test/av1_nn_predict_test.cc create mode 100644 third_party/aom/test/av1_quantize_test.cc create mode 100644 third_party/aom/test/av1_round_shift_array_test.cc create mode 100644 third_party/aom/test/av1_softmax_test.cc create mode 100644 third_party/aom/test/av1_temporal_denoiser_test.cc create mode 100644 third_party/aom/test/av1_txfm_test.cc create mode 100644 third_party/aom/test/av1_txfm_test.h create mode 100644 third_party/aom/test/av1_wedge_utils_test.cc create mode 100644 third_party/aom/test/avg_test.cc create mode 100644 third_party/aom/test/avif_progressive_test.cc create mode 100755 third_party/aom/test/best_encode.sh create mode 100644 third_party/aom/test/binary_codes_test.cc create mode 100644 third_party/aom/test/blend_a64_mask_1d_test.cc create mode 100644 third_party/aom/test/blend_a64_mask_test.cc create mode 100644 third_party/aom/test/block_test.cc create mode 100644 third_party/aom/test/boolcoder_test.cc create mode 100644 third_party/aom/test/borders_test.cc create mode 100644 third_party/aom/test/cdef_test.cc create mode 100644 third_party/aom/test/cfl_test.cc create mode 100644 third_party/aom/test/cnn_test.cc create mode 100644 third_party/aom/test/codec_factory.h create mode 100644 third_party/aom/test/coding_path_sync.cc create mode 100644 third_party/aom/test/comp_avg_pred_test.cc create mode 100644 third_party/aom/test/comp_avg_pred_test.h create mode 100644 
third_party/aom/test/comp_mask_pred_test.cc create mode 100644 third_party/aom/test/convolve_test.cc create mode 100644 third_party/aom/test/corner_match_test.cc create mode 100644 third_party/aom/test/cpu_speed_test.cc create mode 100644 third_party/aom/test/cpu_used_firstpass_test.cc create mode 100644 third_party/aom/test/datarate_test.cc create mode 100644 third_party/aom/test/datarate_test.h create mode 100644 third_party/aom/test/decode_api_test.cc create mode 100644 third_party/aom/test/decode_multithreaded_test.cc create mode 100644 third_party/aom/test/decode_perf_test.cc create mode 100644 third_party/aom/test/decode_scalability_test.cc create mode 100644 third_party/aom/test/decode_test_driver.cc create mode 100644 third_party/aom/test/decode_test_driver.h create mode 100755 third_party/aom/test/decode_to_md5.sh create mode 100755 third_party/aom/test/decode_with_drops.sh create mode 100644 third_party/aom/test/deltaq_mode_test.cc create mode 100644 third_party/aom/test/disflow_test.cc create mode 100644 third_party/aom/test/divu_small_test.cc create mode 100644 third_party/aom/test/dr_prediction_test.cc create mode 100644 third_party/aom/test/dropframe_encode_test.cc create mode 100755 third_party/aom/test/dump_obu.sh create mode 100644 third_party/aom/test/ec_test.cc create mode 100644 third_party/aom/test/encode_api_test.cc create mode 100644 third_party/aom/test/encode_perf_test.cc create mode 100644 third_party/aom/test/encode_small_width_height_test.cc create mode 100644 third_party/aom/test/encode_test_driver.cc create mode 100644 third_party/aom/test/encode_test_driver.h create mode 100644 third_party/aom/test/encodemb_test.cc create mode 100644 third_party/aom/test/encodetxb_test.cc create mode 100644 third_party/aom/test/end_to_end_psnr_test.cc create mode 100644 third_party/aom/test/end_to_end_qmpsnr_test.cc create mode 100644 third_party/aom/test/end_to_end_ssim_test.cc create mode 100644 third_party/aom/test/error_block_test.cc create mode 100644 third_party/aom/test/error_resilience_test.cc create mode 100644 third_party/aom/test/ethread_test.cc create mode 100755 third_party/aom/test/examples.sh create mode 100644 third_party/aom/test/external_frame_buffer_test.cc create mode 100644 third_party/aom/test/fdct4x4_test.cc create mode 100644 third_party/aom/test/fft_test.cc create mode 100644 third_party/aom/test/film_grain_table_test.cc create mode 100644 third_party/aom/test/filterintra_test.cc create mode 100644 third_party/aom/test/firstpass_test.cc create mode 100644 third_party/aom/test/force_key_frame_test.cc create mode 100644 third_party/aom/test/forced_max_frame_width_height_test.cc create mode 100644 third_party/aom/test/frame_parallel_enc_test.cc create mode 100644 third_party/aom/test/frame_size_tests.cc create mode 100644 third_party/aom/test/function_equivalence_test.h create mode 100644 third_party/aom/test/fwht4x4_test.cc create mode 100644 third_party/aom/test/gf_pyr_height_test.cc create mode 100755 third_party/aom/test/gviz_api.py create mode 100644 third_party/aom/test/hadamard_test.cc create mode 100644 third_party/aom/test/hash_test.cc create mode 100644 third_party/aom/test/hbd_metrics_test.cc create mode 100644 third_party/aom/test/hiprec_convolve_test.cc create mode 100644 third_party/aom/test/hiprec_convolve_test_util.cc create mode 100644 third_party/aom/test/hiprec_convolve_test_util.h create mode 100644 third_party/aom/test/horver_correlation_test.cc create mode 100644 third_party/aom/test/horz_superres_test.cc create mode 100644 
third_party/aom/test/i420_video_source.h create mode 100644 third_party/aom/test/intra_edge_test.cc create mode 100644 third_party/aom/test/intrabc_test.cc create mode 100644 third_party/aom/test/intrapred_test.cc create mode 100644 third_party/aom/test/invalid_file_test.cc create mode 100644 third_party/aom/test/ivf_video_source.h create mode 100644 third_party/aom/test/kf_test.cc create mode 100644 third_party/aom/test/level_test.cc create mode 100755 third_party/aom/test/lightfield_test.sh create mode 100644 third_party/aom/test/log2_test.cc create mode 100644 third_party/aom/test/loopfilter_control_test.cc create mode 100644 third_party/aom/test/lossless_test.cc create mode 100644 third_party/aom/test/lpf_test.cc create mode 100644 third_party/aom/test/masked_sad_test.cc create mode 100644 third_party/aom/test/masked_variance_test.cc create mode 100644 third_party/aom/test/md5_helper.h create mode 100644 third_party/aom/test/metadata_test.cc create mode 100644 third_party/aom/test/metrics_template.html create mode 100644 third_party/aom/test/minmax_test.cc create mode 100644 third_party/aom/test/monochrome_test.cc create mode 100644 third_party/aom/test/motion_vector_test.cc create mode 100644 third_party/aom/test/mv_cost_test.cc create mode 100644 third_party/aom/test/noise_model_test.cc create mode 100644 third_party/aom/test/obmc_sad_test.cc create mode 100644 third_party/aom/test/obmc_variance_test.cc create mode 100644 third_party/aom/test/pickrst_test.cc create mode 100644 third_party/aom/test/postproc_filters_test.cc create mode 100644 third_party/aom/test/quant_test.cc create mode 100644 third_party/aom/test/quantize_func_test.cc create mode 100644 third_party/aom/test/ratectrl_rtc_test.cc create mode 100644 third_party/aom/test/ratectrl_test.cc create mode 100644 third_party/aom/test/rd_test.cc create mode 100644 third_party/aom/test/reconinter_test.cc create mode 100644 third_party/aom/test/register_state_check.h create mode 100644 third_party/aom/test/resize_test.cc create mode 100644 third_party/aom/test/rt_end_to_end_test.cc create mode 100755 third_party/aom/test/run_encodes.sh create mode 100644 third_party/aom/test/sad_test.cc create mode 100644 third_party/aom/test/sb_multipass_test.cc create mode 100644 third_party/aom/test/sb_qp_sweep_test.cc create mode 100644 third_party/aom/test/scalability_test.cc create mode 100644 third_party/aom/test/scan_test.cc create mode 100644 third_party/aom/test/screen_content_test.cc create mode 100644 third_party/aom/test/segment_binarization_sync.cc create mode 100644 third_party/aom/test/selfguided_filter_test.cc create mode 100755 third_party/aom/test/set_maps.sh create mode 100644 third_party/aom/test/sharpness_test.cc create mode 100644 third_party/aom/test/simd_avx2_test.cc create mode 100644 third_party/aom/test/simd_cmp_avx2.cc create mode 100644 third_party/aom/test/simd_cmp_impl.h create mode 100644 third_party/aom/test/simd_cmp_sse2.cc create mode 100644 third_party/aom/test/simd_cmp_sse4.cc create mode 100644 third_party/aom/test/simd_cmp_ssse3.cc create mode 100644 third_party/aom/test/simd_impl.h create mode 100644 third_party/aom/test/simd_sse2_test.cc create mode 100644 third_party/aom/test/simd_sse4_test.cc create mode 100644 third_party/aom/test/simd_ssse3_test.cc create mode 100755 third_party/aom/test/simple_decoder.sh create mode 100755 third_party/aom/test/simple_encoder.sh create mode 100644 third_party/aom/test/sse_sum_test.cc create mode 100644 third_party/aom/test/still_picture_test.cc create mode 100644 
third_party/aom/test/subtract_test.cc create mode 100644 third_party/aom/test/sum_squares_test.cc create mode 100644 third_party/aom/test/svc_datarate_test.cc create mode 100644 third_party/aom/test/svc_encoder_rtc.sh create mode 100644 third_party/aom/test/temporal_filter_test.cc create mode 100644 third_party/aom/test/test-data.sha1 create mode 100644 third_party/aom/test/test.cmake create mode 100644 third_party/aom/test/test_aom_rc.cc create mode 100644 third_party/aom/test/test_data_download_worker.cmake create mode 100644 third_party/aom/test/test_data_util.cmake create mode 100644 third_party/aom/test/test_intra_pred_speed.cc create mode 100644 third_party/aom/test/test_libaom.cc create mode 100644 third_party/aom/test/test_runner.cmake create mode 100644 third_party/aom/test/test_vector_test.cc create mode 100644 third_party/aom/test/test_vectors.cc create mode 100644 third_party/aom/test/test_vectors.h create mode 100644 third_party/aom/test/tile_config_test.cc create mode 100644 third_party/aom/test/tile_independence_test.cc create mode 100644 third_party/aom/test/time_stamp_test.cc create mode 100755 third_party/aom/test/tools_common.sh create mode 100644 third_party/aom/test/tpl_model_test.cc create mode 100644 third_party/aom/test/transform_test_base.h create mode 100755 third_party/aom/test/twopass_encoder.sh create mode 100644 third_party/aom/test/util.h create mode 100644 third_party/aom/test/variance_test.cc create mode 100644 third_party/aom/test/video_source.h create mode 100755 third_party/aom/test/visual_metrics.py create mode 100644 third_party/aom/test/warp_filter_test.cc create mode 100644 third_party/aom/test/warp_filter_test_util.cc create mode 100644 third_party/aom/test/warp_filter_test_util.h create mode 100644 third_party/aom/test/webm_video_source.h create mode 100644 third_party/aom/test/webmenc_test.cc create mode 100644 third_party/aom/test/wiener_test.cc create mode 100644 third_party/aom/test/y4m_test.cc create mode 100644 third_party/aom/test/y4m_video_source.h create mode 100644 third_party/aom/test/yuv_video_source.h create mode 100644 third_party/aom/third_party/SVT-AV1/EbMemory_AVX2.h create mode 100644 third_party/aom/third_party/SVT-AV1/EbMemory_SSE4_1.h create mode 100644 third_party/aom/third_party/SVT-AV1/LICENSE.md create mode 100644 third_party/aom/third_party/SVT-AV1/PATENTS.md create mode 100644 third_party/aom/third_party/SVT-AV1/README.libaom create mode 100644 third_party/aom/third_party/SVT-AV1/convolve_2d_avx2.h create mode 100644 third_party/aom/third_party/SVT-AV1/convolve_avx2.h create mode 100644 third_party/aom/third_party/SVT-AV1/synonyms.h create mode 100644 third_party/aom/third_party/fastfeat/LICENSE create mode 100644 third_party/aom/third_party/fastfeat/README.libaom create mode 100644 third_party/aom/third_party/fastfeat/fast.c create mode 100644 third_party/aom/third_party/fastfeat/fast.h create mode 100644 third_party/aom/third_party/fastfeat/fast_9.c create mode 100644 third_party/aom/third_party/fastfeat/nonmax.c create mode 100644 third_party/aom/third_party/googletest/README.libaom create mode 100644 third_party/aom/third_party/googletest/src/.clang-format create mode 100644 third_party/aom/third_party/googletest/src/CMakeLists.txt create mode 100644 third_party/aom/third_party/googletest/src/CONTRIBUTORS create mode 100644 third_party/aom/third_party/googletest/src/LICENSE create mode 100644 third_party/aom/third_party/googletest/src/README.md create mode 100644 
third_party/aom/third_party/googletest/src/googlemock/CMakeLists.txt create mode 100644 third_party/aom/third_party/googletest/src/googlemock/README.md create mode 100644 third_party/aom/third_party/googletest/src/googlemock/cmake/gmock.pc.in create mode 100644 third_party/aom/third_party/googletest/src/googlemock/cmake/gmock_main.pc.in create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-actions.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-cardinalities.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-function-mocker.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-matchers.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-actions.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-more-matchers.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-nice-strict.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock-spec-builders.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/gmock.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/README.md create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-generated-actions.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-matchers.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/custom/gmock-port.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-internal-utils.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-port.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/include/gmock/internal/gmock-pp.h create mode 100644 third_party/aom/third_party/googletest/src/googlemock/src/gmock-all.cc create mode 100644 third_party/aom/third_party/googletest/src/googlemock/src/gmock-cardinalities.cc create mode 100644 third_party/aom/third_party/googletest/src/googlemock/src/gmock-internal-utils.cc create mode 100644 third_party/aom/third_party/googletest/src/googlemock/src/gmock-matchers.cc create mode 100644 third_party/aom/third_party/googletest/src/googlemock/src/gmock-spec-builders.cc create mode 100644 third_party/aom/third_party/googletest/src/googlemock/src/gmock.cc create mode 100644 third_party/aom/third_party/googletest/src/googlemock/src/gmock_main.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/CMakeLists.txt create mode 100644 third_party/aom/third_party/googletest/src/googletest/README.md create mode 100644 third_party/aom/third_party/googletest/src/googletest/cmake/Config.cmake.in create mode 100644 third_party/aom/third_party/googletest/src/googletest/cmake/gtest.pc.in create mode 100644 third_party/aom/third_party/googletest/src/googletest/cmake/gtest_main.pc.in create mode 100644 third_party/aom/third_party/googletest/src/googletest/cmake/internal_utils.cmake create mode 100644 third_party/aom/third_party/googletest/src/googletest/cmake/libgtest.la.in create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-assertion-result.h create mode 100644 
third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-message.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-all.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-assertion-result.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-death-test.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-filepath.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-internal-inl.h create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-matchers.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-port.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-printers.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-test-part.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest-typed-test.cc create mode 100644 third_party/aom/third_party/googletest/src/googletest/src/gtest.cc create mode 100644 
third_party/aom/third_party/googletest/src/googletest/src/gtest_main.cc create mode 100644 third_party/aom/third_party/libwebm/AUTHORS.TXT create mode 100644 third_party/aom/third_party/libwebm/Android.mk create mode 100644 third_party/aom/third_party/libwebm/LICENSE.TXT create mode 100644 third_party/aom/third_party/libwebm/PATENTS.TXT create mode 100644 third_party/aom/third_party/libwebm/README.libaom create mode 100644 third_party/aom/third_party/libwebm/common/file_util.cc create mode 100644 third_party/aom/third_party/libwebm/common/file_util.h create mode 100644 third_party/aom/third_party/libwebm/common/hdr_util.cc create mode 100644 third_party/aom/third_party/libwebm/common/hdr_util.h create mode 100644 third_party/aom/third_party/libwebm/common/webmids.h create mode 100644 third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc create mode 100644 third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.h create mode 100644 third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxertypes.h create mode 100644 third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc create mode 100644 third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h create mode 100644 third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc create mode 100644 third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.h create mode 100644 third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc create mode 100644 third_party/aom/third_party/libwebm/mkvparser/mkvparser.h create mode 100644 third_party/aom/third_party/libwebm/mkvparser/mkvreader.cc create mode 100644 third_party/aom/third_party/libwebm/mkvparser/mkvreader.h create mode 100644 third_party/aom/third_party/libyuv/LICENSE create mode 100644 third_party/aom/third_party/libyuv/README.libaom create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/basic_types.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/compare.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/convert.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/convert_argb.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/convert_from.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/convert_from_argb.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/cpu_id.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/mjpeg_decoder.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/planar_functions.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/rotate.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/rotate_argb.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/rotate_row.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/row.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/scale.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/scale_argb.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/scale_row.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/scale_uv.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/version.h create mode 100644 third_party/aom/third_party/libyuv/include/libyuv/video_common.h create mode 100644 third_party/aom/third_party/libyuv/source/compare.cc create mode 100644 third_party/aom/third_party/libyuv/source/compare_common.cc create mode 100644 third_party/aom/third_party/libyuv/source/compare_gcc.cc create mode 100644 
third_party/aom/third_party/libyuv/source/compare_neon.cc create mode 100644 third_party/aom/third_party/libyuv/source/compare_neon64.cc create mode 100644 third_party/aom/third_party/libyuv/source/compare_win.cc create mode 100644 third_party/aom/third_party/libyuv/source/convert.cc create mode 100644 third_party/aom/third_party/libyuv/source/convert_argb.cc create mode 100644 third_party/aom/third_party/libyuv/source/convert_from.cc create mode 100644 third_party/aom/third_party/libyuv/source/convert_from_argb.cc create mode 100644 third_party/aom/third_party/libyuv/source/convert_jpeg.cc create mode 100644 third_party/aom/third_party/libyuv/source/convert_to_argb.cc create mode 100644 third_party/aom/third_party/libyuv/source/convert_to_i420.cc create mode 100644 third_party/aom/third_party/libyuv/source/cpu_id.cc create mode 100644 third_party/aom/third_party/libyuv/source/mjpeg_decoder.cc create mode 100644 third_party/aom/third_party/libyuv/source/mjpeg_validate.cc create mode 100644 third_party/aom/third_party/libyuv/source/planar_functions.cc create mode 100644 third_party/aom/third_party/libyuv/source/rotate.cc create mode 100644 third_party/aom/third_party/libyuv/source/rotate_any.cc create mode 100644 third_party/aom/third_party/libyuv/source/rotate_argb.cc create mode 100644 third_party/aom/third_party/libyuv/source/rotate_common.cc create mode 100644 third_party/aom/third_party/libyuv/source/rotate_gcc.cc create mode 100644 third_party/aom/third_party/libyuv/source/rotate_mips.cc create mode 100644 third_party/aom/third_party/libyuv/source/rotate_neon.cc create mode 100644 third_party/aom/third_party/libyuv/source/rotate_neon64.cc create mode 100644 third_party/aom/third_party/libyuv/source/rotate_win.cc create mode 100644 third_party/aom/third_party/libyuv/source/row_any.cc create mode 100644 third_party/aom/third_party/libyuv/source/row_common.cc create mode 100644 third_party/aom/third_party/libyuv/source/row_gcc.cc create mode 100644 third_party/aom/third_party/libyuv/source/row_mips.cc create mode 100644 third_party/aom/third_party/libyuv/source/row_neon.cc create mode 100644 third_party/aom/third_party/libyuv/source/row_neon64.cc create mode 100644 third_party/aom/third_party/libyuv/source/row_win.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale_any.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale_argb.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale_common.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale_gcc.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale_mips.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale_neon.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale_neon64.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale_uv.cc create mode 100644 third_party/aom/third_party/libyuv/source/scale_win.cc create mode 100644 third_party/aom/third_party/libyuv/source/video_common.cc create mode 100644 third_party/aom/third_party/vector/LICENSE create mode 100644 third_party/aom/third_party/vector/README.libaom create mode 100644 third_party/aom/third_party/vector/vector.c create mode 100644 third_party/aom/third_party/vector/vector.h create mode 100644 third_party/aom/third_party/x86inc/LICENSE create mode 100644 third_party/aom/third_party/x86inc/README.libaom create mode 100644 third_party/aom/third_party/x86inc/x86inc.asm create mode 100644 
third_party/aom/tools/aggregate_entropy_stats.py create mode 100644 third_party/aom/tools/aom_entropy_optimizer.c create mode 100644 third_party/aom/tools/auto_refactor/auto_refactor.py create mode 100644 third_party/aom/tools/auto_refactor/av1_preprocess.py create mode 100644 third_party/aom/tools/auto_refactor/c_files/decl_status_code.c create mode 100644 third_party/aom/tools/auto_refactor/c_files/func_in_out.c create mode 100644 third_party/aom/tools/auto_refactor/c_files/global_variable.c create mode 100644 third_party/aom/tools/auto_refactor/c_files/parse_lvalue.c create mode 100644 third_party/aom/tools/auto_refactor/c_files/simple_code.c create mode 100644 third_party/aom/tools/auto_refactor/c_files/struct_code.c create mode 100644 third_party/aom/tools/auto_refactor/test_auto_refactor.py create mode 100755 third_party/aom/tools/cpplint.py create mode 100644 third_party/aom/tools/diff.py create mode 100644 third_party/aom/tools/dump_obu.cc create mode 100644 third_party/aom/tools/frame_size_variation_analyzer.py create mode 100755 third_party/aom/tools/gen_authors.sh create mode 100755 third_party/aom/tools/gen_constrained_tokenset.py create mode 100644 third_party/aom/tools/gop_bitrate/analyze_data.py create mode 100755 third_party/aom/tools/gop_bitrate/encode_all_script.sh create mode 100644 third_party/aom/tools/gop_bitrate/python/bitrate_accuracy.py create mode 100644 third_party/aom/tools/inspect-cli.js create mode 100644 third_party/aom/tools/inspect-post.js create mode 100755 third_party/aom/tools/intersect-diffs.py create mode 100755 third_party/aom/tools/lint-hunks.py create mode 100644 third_party/aom/tools/obu_parser.cc create mode 100644 third_party/aom/tools/obu_parser.h create mode 100644 third_party/aom/tools/ratectrl_log_analyzer/analyze_ratectrl_log.py create mode 100644 third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc create mode 100644 third_party/aom/tools/txfm_analyzer/txfm_graph.cc create mode 100644 third_party/aom/tools/txfm_analyzer/txfm_graph.h create mode 100755 third_party/aom/tools/wrap-commit-msg.py create mode 100644 third_party/aom/usage.dox create mode 100644 third_party/aom/usage_cx.dox create mode 100644 third_party/aom/usage_dx.dox (limited to 'third_party/aom') diff --git a/third_party/aom/.clang-format b/third_party/aom/.clang-format new file mode 100644 index 0000000000..a8bc4967c3 --- /dev/null +++ b/third_party/aom/.clang-format @@ -0,0 +1,9 @@ +--- +Language: Cpp +BasedOnStyle: Google +AllowShortCaseLabelsOnASingleLine: true +ConstructorInitializerAllOnOneLineOrOnePerLine: false +Cpp11BracedListStyle: false +DerivePointerAlignment: false +PointerAlignment: Right +SortIncludes: false diff --git a/third_party/aom/.cmake-format.py b/third_party/aom/.cmake-format.py new file mode 100644 index 0000000000..c79a6ad604 --- /dev/null +++ b/third_party/aom/.cmake-format.py @@ -0,0 +1,102 @@ +# Generated with cmake-format 0.5.1 +# How wide to allow formatted cmake files +line_width = 80 + +# How many spaces to tab for indent +tab_size = 2 + +# If arglists are longer than this, break them always +max_subargs_per_line = 10 + +# If true, separate flow control names from their parentheses with a space +separate_ctrl_name_with_space = False + +# If true, separate function names from parentheses with a space +separate_fn_name_with_space = False + +# If a statement is wrapped to more than one line, than dangle the closing +# parenthesis on it's own line +dangle_parens = False + +# What character to use for bulleted lists +bullet_char = '*' + +# What 
character to use as punctuation after numerals in an enumerated list +enum_char = '.' + +# What style line endings to use in the output. +line_ending = u'unix' + +# Format command names consistently as 'lower' or 'upper' case +command_case = u'lower' + +# Format keywords consistently as 'lower' or 'upper' case +keyword_case = u'unchanged' + +# Specify structure for custom cmake functions +additional_commands = { + "foo": { + "flags": [ + "BAR", + "BAZ" + ], + "kwargs": { + "HEADERS": "*", + "DEPENDS": "*", + "SOURCES": "*" + } + } +} + +# A list of command names which should always be wrapped +always_wrap = [] + +# Specify the order of wrapping algorithms during successive reflow attempts +algorithm_order = [0, 1, 2, 3, 4] + +# If true, the argument lists which are known to be sortable will be sorted +# lexicographicall +autosort = False + +# enable comment markup parsing and reflow +enable_markup = True + +# If comment markup is enabled, don't reflow the first comment block in +# eachlistfile. Use this to preserve formatting of your +# copyright/licensestatements. +first_comment_is_literal = True + +# If comment markup is enabled, don't reflow any comment block which matchesthis +# (regex) pattern. Default is `None` (disabled). +literal_comment_pattern = None + +# Regular expression to match preformat fences in comments +# default=r'^\s*([`~]{3}[`~]*)(.*)$' +fence_pattern = u'^\\s*([`~]{3}[`~]*)(.*)$' + +# Regular expression to match rulers in comments +# default=r'^\s*[^\w\s]{3}.*[^\w\s]{3}$' +ruler_pattern = u'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$' + +# If true, emit the unicode byte-order mark (BOM) at the start of the file +emit_byteorder_mark = False + +# If a comment line starts with at least this many consecutive hash characters, +# then don't lstrip() them off. This allows for lazy hash rulers where the first +# hash char is not separated by space +hashruler_min_length = 10 + +# If true, then insert a space between the first hash char and remaining hash +# chars in a hash ruler, and normalize it's length to fill the column +canonicalize_hashrulers = True + +# Specify the encoding of the input file. Defaults to utf-8. +input_encoding = u'utf-8' + +# Specify the encoding of the output file. Defaults to utf-8. Note that cmake +# only claims to support utf-8 so be careful when using anything else +output_encoding = u'utf-8' + +# A dictionary containing any per-command configuration overrides. Currently +# only `command_case` is supported. 
+per_command = {} diff --git a/third_party/aom/.mailmap b/third_party/aom/.mailmap new file mode 100644 index 0000000000..6d6e6302bc --- /dev/null +++ b/third_party/aom/.mailmap @@ -0,0 +1,113 @@ +Aasaipriya Chandran +Aasaipriya Chandran Aasaipriya C <100778@ittiam.com> +Adrian Grange +Adrian Grange +Alexander Bokov +Alexis Ballier +Alpha Lam +Andrey Norkin +Angie Chiang +Arild Fuldseth +Arild Fuldseth +Aℓex Converse +Aℓex Converse +Aasaipriya Chandran +Aasaipriya Chandran Aasaipriya C <100778@ittiam.com> +Apurve Pandey +Apurve Kumar Pandey Apurve Pandey +Bohan Li +Changjun Yang +Chi Yo Tsai +Chi Yo Tsai +Chm +Damon Shen +Daniele Castagna +Deb Mukherjee +Elliott Karpilovsky +Emil Keyder +Erik Niemeyer +Frederic Barbier +Fyodor Kyslov +Grant Hsu +Guillaume Martres +Guillaume Martres +Guillaume Martres +Guillaume Martres +Hangyu Kuang +Hangyu Kuang +Hui Su +Iole Moccagatta +Jacky Chen +James Zern +Jean-Marc Valin +Jim Bankoski +Johann Koenig +Johann Koenig +Johann Koenig +Johann Koenig +Johann Koenig +John Koleszar +Joshua Litt +Kyle Siefring +Kyle Siefring +Lin Zheng +Logan Goldberg +Lokeshwar Reddy B +Luc Trudeau +Luc Trudeau +Marco Paniconi +Marco Paniconi +Michael Bebenita +Michael Horowitz +Mingliang Chen +Monty Montgomery +Mudassir Galaganath +Narayan Kalaburgi +Mudassir Galaganath Mudassir Galagnath +Nathan E. Egge +Nathan E. Egge +Onur Guleryuz +Pascal Massimino +Pascal Massimino +Paul Wilkins +Peng Bin +Peng Bin +Peter de Rivaz +Rachel Barker David Barker +Ralph Giles +Ralph Giles +Remya Prakasan +Roger Zhou +Ronald S. Bultje +Ryan Lei +Ryan Lei +Ryan Lei +Sachin Kumar Garg +Sai Deng +Sami Pietilä +Sarah Parker +Susanna D'Souza +Tamar Levy +Tamar Levy +Tero Rintaluoma +Thomas Davies Thomas +Timothy B. Terriberry +Timothy B. Terriberry +Timothy B. Terriberry Tim Terriberry +Tom Finegan +Tom Finegan +Tristan Matthews +Venkat Sanampudi +Vitalii Dziumenko +Wei-Ting Lin +Wei-Ting Lin +Wenyao Liu +Will Bresnahan +Yaowu Xu +Yaowu Xu +Yaowu Xu +Yaowu Xu +Yaowu Xu +Yaowu Xu +Zhipin Deng +Zoe Liu diff --git a/third_party/aom/AUTHORS b/third_party/aom/AUTHORS new file mode 100644 index 0000000000..ade7a1a5d0 --- /dev/null +++ b/third_party/aom/AUTHORS @@ -0,0 +1,319 @@ +# This file is automatically generated from the git commit history +# by tools/gen_authors.sh. 
+ +Aamir Anis +Aaron Watry +Aasaipriya Chandran +Abo Talib Mahfoodh +Adrian Grange +Ahmad Sharif +Akshata Jadhav +Alexander Bokov +Alexander Voronov +Aℓex Converse +Alexis Ballier +Alex Peterson +Alok Ahuja +Alpha Lam +A.Mahfoodh +Ami Fischman +Andoni Morales Alastruey +Andres Mejia +Andrew Russell +Andrey Norkin +Angie Chiang +Aniket Dhok +Aniket Wanare +Ankur Saxena +Anupam Pandey +Apurve Kumar Pandey +Arild Fuldseth +Aron Rosenberg +Arpad Panyik +Arun Singh Negi +Attila Nagy +Balaji Anandapadmanaban +Bohan Li +Brennan Shacklett +Brion Vibber +Bruno Berthier +Casey Smalley +Changjun Yang +Charles 'Buck' Krasic +Cheng Chen +Cherma Rajan A +Chethan Kumar R E +Chi Yo Tsai +Chm +Christian Duvivier +Christopher Degawa +Cyril Concolato +Dake He +Damon Shen +Dandan Ding +Daniele Castagna +Daniel Kang +Daniel Max Valenzuela +Danil Chapovalov +David Major +David Michael Barr +David Turner +Deb Mukherjee +Deepa K G +Di Chen +Diksha Singh +Dim Temp +Dmitry Kovalev +Dominic Symes +Dragan Mrdjan +Ed Baker +Edward Hervey +Ehsan Akhgari +Elliott Karpilovsky +Emil Keyder +Erik Niemeyer +Fabio Pedretti +Fangwen Fu +Fergus Simpson +Frank Bossen +Frank Galligan +Frederic Barbier +Fredrik Söderquist +Fritz Koenig +Fyodor Kyslov +Gaute Strokkenes +George Steed +Gerda Zsejke More +Geza Lore +Ghislain MARY +Giuseppe Scrivano +Gordana Cmiljanovic +Grant Hsu +Guillaume Martres +Guillermo Ballester Valor +Hamsalekha S +Hangyu Kuang +Hanno Böck +Harish Mahendrakar +Henrik Lundin +Hien Ho +Hirokazu Honda +Hui Su +Ilie Halip +Ilya Brailovskiy +Imdad Sardharwalla +Iole Moccagatta +Ivan Krasin +Ivan Maltz +Ivan Rosales +Jacek Caban +Jack Haughton +Jacky Chen +James Berry +James Yu +James Zern +Jan Gerber +Jan Kratochvil +Janne Salonen +Jayasanker J +Jayashri Murugan +Jean-Marc Valin +Jean-Yves Avenard +Jeff Faust +Jeff Muizelaar +Jeff Petkau +Jerome Jiang +Jia Jia +Jian Zhou +Jim Bankoski +Jingning Han +Joe Young +Joey Parrish +Johann Koenig +John Koleszar +Johnny Klonaris +John Stark +Jonathan Matthews +Jonathan Wright +Joshua Bleecher Snyder +Joshua Litt +Josh Verdejo +Julia Robson +Justin Clift +Justin Lebar +Katsuhisa Yuasa +Kavi Ramamurthy +KO Myung-Hun +Konstantinos Margaritis +Krishna Malladi +Kwanghoon Son +Kyle Siefring +Larisa Markeeva +Lauren Partin +Lawrence Velázquez +leolzhao +Leon Kollar +L. E. Segovia +Lester Lu +liang zhao +Linfeng Zhang +Link.Meng +Lin Zheng +Logan Goldberg +Lokeshwar Reddy B +Lou Quillio +Luca Barbato +Luca Versari +Luc Trudeau +Madhu Peringassery Krishnan +Makoto Kato +Mans Rullgard +Marco Paniconi +Mark Horvath +Mark Mentovai +Mark Wachsler +Martin Ettl +Martin Storsjo +Maryla +Matthew Heaney +Matthieu Vaudano +Mattias Hansson +Maxym Dmytrychenko +Michael Bebenita +Michael Horowitz +Michael Kohler +Michelle Findlay-Olynyk +Mike Frysinger +Mike Hommey +Mikhal Shemer +Minghai Shang +Mingliang Chen +Mirko Bonadei +Monty Montgomery +Morton Jonuschat +Mudassir Galaganath +Mufaddal Chakera +Narayan Kalaburgi +Nathan E. Egge +Neeraj Gadgil +Neha Mary Thomas +Neil Birkbeck +Nico Weber +Nithya V S +Ola Hugosson +Oleg Nalivayko +Onur Guleryuz +Parag Salasakar +Pascal Massimino +Patrik Westin +Paul Wilkins +Pavel Frolov +Pavol Rusnak +Paweł Hajdan +Peng Bin +Pengchong Jin +Peter Boström +Peter de Rivaz +Peter Kasting +Philip Jägenstedt +Priit Laes +Qiu Jianlin +Rachel Barker +Rafael Ávila de Espíndola +Rafaël Carré +Ralph Giles +Ranjit Kumar Tulabandu +Ravi Chaudhary +Remya Prakasan +Remy Foray +Rob Bradford +Robert-André Mauchin +Robert Chin +Roger Zhou +Rohit Athavale +Ronald S. 
Bultje +Rostislav Pehlivanov +Ruiling Song +Rui Ueyama +Rupert Swarbrick +Ryan Lei +Ryan Overbeck +Sachin Kumar Garg +Sai Deng +Salome Thirot +Sami Boukortt +Sami Pietilä +Samuel Thibault +Samuthirika S +Sarah Parker +Sasi Inguva +Satheesh Kumar +Satish Kumar Suman +Scott Graham +Scott LaVarnway +Sean DuBois +Sean McGovern +Sean Purser-Haskell +Sebastien Alaiwan +Sergey Kolomenkin +Sergey Ulanov +S Hamsalekha +Shimon Doodkin +Shunyao Li +SmilingWolf +Soo-Chul Han +Stanislav Vitvitskyy +Stefan Holmer +Steinar Midtskogen +Steve Lhomme +Suman Sunkara +Susanna D'Souza +Taekhyun Kim +Takanori MATSUURA +Tamar Levy +Tao Bai +Tarek AMARA +Tarundeep Singh +Tero Rintaluoma +Thijs Vermeir +Thomas Daede +Thomas Davies Thomas +Tim Kopp +Timothy B. Terriberry +Timo Witte +Todd Nguyen +Tom Anderson +Tom Finegan +Tristan Matthews +Umang Saini +Urvang Joshi +Venkat Sanampudi +Victoria Zhislina +Vignesh Venkatasubramanian +Vikas Prasad +Vincent Rabaud +Vishesh +Vishnu Teja Manyam +Vitalii Dziumenko +Wan-Teh Chang +Wei-Ting Lin +Wenyao Liu +Will Bresnahan +Xiaoqing Zhu +Xing Jin +Xin Zhao +Yannis Guyon +Yaowu Xu +Yeqing Wu +Yi Luo +Yingying Ma +Yongzhe Wang +Yuan Tong +Yu-Chen (Eric) Sun +Yue Chen +Yunqing Wang +Yury Gitman +Yushin Cho +Zhijie Yang +Zhipin Deng +Zoe Liu diff --git a/third_party/aom/CHANGELOG b/third_party/aom/CHANGELOG new file mode 100644 index 0000000000..b243837d3c --- /dev/null +++ b/third_party/aom/CHANGELOG @@ -0,0 +1,828 @@ +2023-11-30 v3.8.0 + This release includes new codec interfaces, compression efficiency and + perceptual improvements, speedup and memory optimizations and many bug + fixes. This release is ABI compatible with the last release. + + - New Features + * New codec controls: + * AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR: Set the maximum number of + consecutive frame drops allowed for the frame dropper in 1 pass + CBR mode. + * Run-time CPU feature detection for all Arm platforms: + CRC, DotProd, I8MM and SVE CPU feature presence is detected at run + time and code paths making use of these features are selected + dynamically. These code paths provide meaningful performance gains + for standard bitdepth RTC and VoD encoding: up to 10% and 20% + respectively, over the Armv8.0-A baseline build. + * RTC: Frame-dropper support added to the rate control library. + * RTC Rate control improvements for low bitrate and for SVC. + + - Compression Efficiency Improvements + * Improved accuracy of cost estimation for loop restoration and + global motion. + * Improved selection of loop restoration unit size - full search up + to (non-realtime) speed 2, retuned static selection at higher + speeds. + * RTC Screen content mode: 3-5% bdrate gains across speeds 7 - 10. + * Good-quality mode: 0.2 - 0.5% bdrate gains across speeds 1 - 4. + + - Perceptual Quality Improvements + * RTC Screen: Improved visual quality for scrolling. + * RTC: Improved color quality for both screen and video mode. + + - Speedup and Memory Optimizations + * Good-quality, single-thread encoder speedups: + o 15% improvement for speed 5. + o 12% improvement for speed 6. + * Arm standard bitdepth VoD (--good): + o 8% speedup for speeds 0 and 1. + o 20% speedup for speed 2. + o 27% speedup for speed 3. + o 30% speedup for speed 4. + o 38% speedup for speeds 5 and 6. + * Arm high bitdepth VoD (--good): + o 206% speedup for speeds 0 and 1. + o 180% speedup for speed 2. + o 51% speedup for speeds 3 and 4. + o 68% speedup for speed 5. + o 72% speedup for speed 6. + * RTC Screen content: 2-6% speedup across speeds 7-10. 
+ * RTC: 2-3% speedup for temporal layers. + * RTC: Speedups to reference scaling in nonrd pickmode. + * Good-quality mode: Simplified global motion estimation, saving + ~1200 lines of code and 1KB of tables while improving quality. + + - Bug Fixes + * Fixes to improve libaom stability in case of memory allocation + failures. + * Fixes to SIMD functions (x86 AVX2/SSE2 and ARM Neon). + * b/310457427, b/310766628: Bug fixes to only use rec_sse in CBR + mode. + +2023-11-17 v3.7.1 + This release includes several bug fixes. This release is ABI + compatible with the last release. See + https://aomedia.googlesource.com/aom/+log/v3.7.0..v3.7.1 for all the + commits in this release. + + - Bug Fixes + * aomedia:3349: heap overflow when increasing resolution + * aomedia:3478: GCC 12.2.0 emits a -Wstringop-overflow warning on + aom/av1/encoder/motion_search_facade.c + * aomedia:3489: Detect encoder and image high bit depth mismatch + * aomedia:3491: heap-buffer-overflow on frame size change (CVE-2023-6879) + * b/303023614: Segfault at encoding time for high bit depth images + +2023-08-10 v3.7.0 + This release includes new codec interfaces, compression efficiency and + perceptual improvements, speedup and memory optimizations and many bug fixes. + This release is ABI compatible with the last release. + + - New Features + * New codec controls: + * AV1E_SET_QUANTIZER_ONE_PASS: Set quantizer for each frame. + * AV1E_ENABLE_RATE_GUIDE_DELTAQ: enable the rate distribution guided delta + quantization in all intra mode. The "enable-rate-guide-deltaq" option is + added for this control. + * AV1E_SET_RATE_DISTRIBUTION_INFO: set the input file for rate + distribution used in all intra mode. The "rate-distribution-info" option + is added for this control. + * AV1E_GET_LUMA_CDEF_STRENGTH + * AV1E_SET_BITRATE_ONE_PASS_CBR + * AOM_SCALING_MODE is extended to include 2/3 and 1/3 scaling. + * aom_tune_metric is extended to include AOM_TUNE_VMAF_SALIENCY_MAP. + The "tune" option is extended to include "vmaf_saliency_map". + * SVC example encoder svc_encoder_rtc is able to use the rate control + library. + * Loopfilter level and CDEF filter level is supported by RTC rate control + library. + * New speed (--cpu-used) 11, intended for RTC screen sharing, added for + faster encoding with ~3% bdrate loss with 16% IC (instruction count) + speedup compared to speed 10. + + - Compression Efficiency Improvements + * Improved VoD encoding performance + * 0.1-0.6% BDrate gains for encoding speeds 2 to 6 + * Rate control accuracy improvement in VBR mode + * RTC encoding improvements + * Screen content mode: 10-19% BDrate gains for speeds 6 - 10 + * Temporal layers video mode, for speed 10: + * 2 temporal layers on low resolutions: 13-15% BDrate gain + * 3 temporal layers on VGA/HD: 3-4% BDrate gain + + - Perceptual Quality Improvements + * Fixed multiple block and color artifacts for RTC screen content by + * Incorporating color into RD cost for IDTX + * Reducing thresholds for palette mode in non RD mode + * Allowing more palette mode testing + * Improved color sensitivity for altref in non-RD mode. + * Reduced video flickering for temporal layer encoding. 
+ + - Speedup and Memory Optimizations + * Speed up the VoD encoder + * 2-5% for encoding speed 2 to 4 + * 9-15% for encoding speed 5 to 6 + * ARM + * Standard bitdepth + * speed 5: +31% + * speed 4: +2% + * speed 3: +9% + * speed 2: +157% + * High bitdepth + * speed 5: +85% + * RTC speedups + * Screen content mode + * 15% IC speedup for speeds 6-8 + * ARM: 7% for speed 9, 3% for speed 10 + * Temporal layers video mode + * 7% speedup for 3 temporal layers on VGA/HD, for speed 10 + * Single layer video + * x86: 2% IC speedup for speeds 7-10 + * ARM: 2-4% speedup across speeds 5-10 + + - Other improvements + * VoD: Major improvements to global motion estimation, now enabled up to + speed 4 + * RTC + * Fixes to make lossless coding work. + * Fixes to make frame dropper (--drop_frames) work for single and temporal + layers. + * Improvements to RPS (reference picture selection) recovery frames. + * Improvements to rate control for temporal layers. + * libwebm is updated to libwebm-1.0.0.29-9-g1930e3c + + - Bug Fixes + * aomedia:3261 Assertion failed when encoding av1 with film grain and + '--monochrome' flag + * aomedia:3276 ensure all allocations are checked (partial fix) + * aomedia:3451 The libaom library calls exit() + * aomedia:3450 enable -Wshadow for C++ sources + * aomedia:3449 Test Seg Faults After + b459af3e345be402db052a143fcc5383d4b74cbd + * aomedia:3416 prune unused symbols / restrict symbol visibility + * aomedia:3443 Jenkins failure: + UninstantiatedParameterizedTestSuite + * aomedia:3434 realtime failures with CONFIG_BITSTREAM_DEBUG=1 + * aomedia:3433 DeltaqModeTest crash w/row_mt=0 + * aomedia:3429 Encoder crash when turn on both ExternalResize and + g_threads > 2 + * aomedia:3438 Build failure with + `-DSANITIZE=address -DBUILD_SHARED_LIBS=ON` when using clang. + * aomedia:3435 Block artifacts when scrolling with AV1 in screen sharing + scenarios + * aomedia:3170 vmaf tune presets produce extreme glitches in one scene + * aomedia:3401 Building shared libaom with MSVC results in a race condition + with the export library + * aomedia:3420 Floating point exception in av1_tpl_get_frame_importance() + * aomedia:3424 heap-buffer-overflow in ScaleFilterCols_16_C() (SIGABRT) + * aomedia:3417 examples/svc_encoder_rtc.c is using internal macros and + functions + * aomedia:3372 SEGV in assign_frame_buffer_p av1_common_int.h + * aomedia:3130 'cpu-features.h' file not found on Android NDK 22 + * aomedia:3415 Encoder/decoder mismatch for svc_encoder_rtc running + 1 SL 3 TL + * aomedia:3412 Lossless Mode Fails Loopback Bit Test + * aomedia:3409 The use of AV1_VAR_OFFS in av1/encoder/var_based_part.c is + incorrect for high bit depths + * aomedia:3403 test_libaom fails with error message + "feenableexcept() failed" on Linux arm + * aomedia:3370 Random color block at fast motion area + * aomedia:3393 Assertion failure in av1_convolve_2d_sr_c() + * aomedia:3392 Strong artifacting for high bit-depth real-time + * aomedia:3376 aomenc --threads=10 --deltaq-mode=3 crashes after + "Allintra: multi-threading of calculating differential contrast" + * aomedia:3380 Crashes and ASan and TSan errors in deltaq-mode=3 + multithreading code + * chromium:1410766 heap-buffer-overflow in aom_yv12_copy_v_c + * Cannot set level via AV1E_SET_TARGET_SEQ_LEVEL_IDX + * Encoding failure due to the use of loop restoration with unintended use of + lossless mode. 
+ * Signed integer overflow in scan_past_frames + * Signed integer overflow in update_a_sep_sym + * Flickering in AV1 1440p/2160p HDR transcodes + * Fixed artifacts with screen share at encoder speed 10 + * Fixed prediction setup for IDTX + +2023-05-08 v3.6.1 + This release includes several bug fixes. This release is ABI + compatible with the last release. See + https://aomedia.googlesource.com/aom/+log/v3.6.0..v3.6.1 for all the + commits in this release. + + - Bug Fixes + * aomedia:2871: Guard the support of the 7.x and 8.x levels for AV1 + under the CONFIG_CWG_C013 config flag, and only output the 7.x and + 8.x levels when explicitly requested. + * aomedia:3382: Choose sb_size by ppi instead of svc. + * aomedia:3384: Fix fullpel search limits. + * aomedia:3388: Replace left shift of xq_active by multiplication. + * aomedia:3389: Fix MV clamping in av1_mv_pred. + * aomedia:3390: set_ld_layer_depth: cap max_layer_depth to + MAX_ARF_LAYERS. + * aomedia:3418: Fix MV clamping in av1_int_pro_motion_estimation. + * aomedia:3429: Move lpf thread data init to lpf_pipeline_mt_init(). + * b:266719111: Fix undefined behavior in Arm Neon code. + * b:269840681: nonrd_opt: align scan tables. + * rtc: Fix is_key_frame setting in variance partition. + * Build: Fix build with clang-cl and Visual Studio. + * Build: Fix module definition file for MinGW/MSYS. + +2023-02-03 v3.6.0 + This release includes compression efficiency and perceptual quality + improvements, speedup and memory optimizations, and some new features. + This release is ABI compatible with the last release. + + - New Features + * New values 20-27 (corresponding to levels 7.0-7.3 and 8.0-8.3) for + the encoder control AV1E_SET_TARGET_SEQ_LEVEL_IDX (note that the + proposal to add the new levels are still in draft status). The + original special value 24 (keep level stats only for level + monitoring) is renumbered as 32. + * New encoder control AV1E_SET_SKIP_POSTPROC_FILTERING to skip the + application of post-processing filters on reconstructed frame in + all intra mode. + * New encoder option "kf-max-pyr-height": Maximum height of pyramid + structure used for the GOP starting with a key frame (-1 to 5). + * Make SVC work for screen content. + * Rate control improvements to reduce frame-size spikes for screen + content coding. + * RISC-V architecture support with gcc toolchain. + + - Compression Efficiency Improvements + * Peak compression efficiency in VOD setting is improved by 1%. + * 0.7% - 2.2% RTC encoding BDrate gains for real time speed 8 to 10. + * 15% RTC encoding BDrate gains for screen content speed 10. + + - Perceptual Quality Improvements + * Resolved a visual quality issue that was reported for high + resolution clips (2K) for speed 4 and above in VOD use case. + * Visual quality improvements to screen content coding. + * Quality improvements to temporal layer RTC coding. + + - Speedup and Memory Optimizations + * RTC single-thread encoder speedup: + o ~6% instruction count reduction for speed 5 and 6. + o ~15% instruction count reduction for speed 7. + o ~10% instruction count reduction for speed 8 to 10 (>=360p + resolutions). + * RTC multi-thread encoder speedup (beyond single-thread speedup): + o 5-8% encode time reduction for speed 7 to 10. + * RTC screen-content encoder speedup: + o 11% instruction count reduction for speed 9 and 10 (>=720p + resolutions). + * ~5% reduction in heap memory requirements for RTC, speed 6 to 10. + * AVIF: + o 4-5% speedup for speed 9 in still-picture encoding mode. 
+ o 3-4% heap memory reduction in still-picture encoding mode for + 360p-720p resolutions with multiple threads. + + - Bug Fixes + * Added a workaround for an AV1 specification bug which makes + TRANSLATION type global motion models unusable. + * Fixed AddressSanitizer global-buffer-overflow errors in + av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c. + * Fixed AddressSanitizer heap-buffer-overflow error in + av1_wiener_convolve_add_src_neon(). + * chromium:1393384 Avoid scene detection on spatial resize. + * aomedia:3308 Remove color artifacts under high motion. + * aomedia:3310 Avoid out of memory failures with Visual Studio 2017, + 2019, and 2022 for Win32 x86 builds. + * aomedia:3346 Make SVC work properly for screen content. + * aomedia:3348 Fix a bug where an uninitialized search_site is used. + * aomedia:3365 Work around what seems like a Visual Studio 2022 + compiler optimization bug. + * aomedia:3369 Incorrect PSNR values reported by libaom for 12-bit + encode. + +2022-08-31 v3.5.0 + This release is ABI compatible with the last one, including speedup and memory + optimizations, and new APIs and features. + + - New Features + * Support for frame parallel encode for larger number of threads. --fp-mt + flag is available for all build configurations. + * New codec control AV1E_GET_NUM_OPERATING_POINTS + + - Speedup and Memory Optimizations + * Speed-up multithreaded encoding for good quality mode for larger number of + threads through frame parallel encoding: + o 30-34% encode time reduction for 1080p, 16 threads, 1x1 tile + configuration (tile_rows x tile_columns) + o 18-28% encode time reduction for 1080p, 16 threads, 2x4 tile + configuration + o 18-20% encode time reduction for 2160p, 32 threads, 2x4 tile + configuration + * 16-20% speed-up for speed=6 to 8 in still-picture encoding mode + * 5-6% heap memory reduction for speed=6 to 10 in real-time encoding mode + * Improvements to the speed for speed=7, 8 in real-time encoding mode + * Improvements to the speed for speed=9, 10 in real-time screen encoding + mode + * Optimizations to improve multi-thread efficiency in real-time encoding + mode + * 10-15% speed up for SVC with temporal layers + * SIMD optimizations: + o Improve av1_quantize_fp_32x32_neon() 1.05x to 1.24x faster + o Add aom_highbd_quantize_b{,_32x32,_64x64}_adaptive_neon() 3.15x to 5.6x + faster than "C" + o Improve av1_quantize_fp_64x64_neon() 1.17x to 1.66x faster + o Add aom_quantize_b_avx2() 1.4x to 1.7x faster than aom_quantize_b_avx() + o Add aom_quantize_b_32x32_avx2() 1.4x to 2.3x faster than + aom_quantize_b_32x32_avx() + o Add aom_quantize_b_64x64_avx2() 2.0x to 2.4x faster than + aom_quantize_b_64x64_ssse3() + o Add aom_highbd_quantize_b_32x32_avx2() 9.0x to 10.5x faster than + aom_highbd_quantize_b_32x32_c() + o Add aom_highbd_quantize_b_64x64_avx2() 7.3x to 9.7x faster than + aom_highbd_quantize_b_64x64_c() + o Improve aom_highbd_quantize_b_avx2() 1.07x to 1.20x faster + o Improve av1_quantize_fp_avx2() 1.13x to 1.49x faster + o Improve av1_quantize_fp_32x32_avx2() 1.07x to 1.54x faster + o Improve av1_quantize_fp_64x64_avx2() 1.03x to 1.25x faster + o Improve av1_quantize_lp_avx2() 1.07x to 1.16x faster + + - Bug fixes including but not limited to + * aomedia:3206 Assert that skip_width > 0 for deconvolve function + * aomedia:3278 row_mt enc: Delay top-right sync when intraBC is enabled + * aomedia:3282 blend_a64_*_neon: fix bus error in armv7 + * aomedia:3283 FRAME_PARALLEL: Propagate border size to all cpis + * aomedia:3283 RESIZE_MODE: Fix incorrect 
strides being used for motion + search + * aomedia:3286 rtc-svc: Fix to dynamic_enable spatial layers + * aomedia:3289 rtc-screen: Fix to skipping inter-mode test in nonrd + * aomedia:3289 rtc-screen: Fix for skip newmv on flat blocks + * aomedia:3299 Fix build failure with CONFIG_TUNE_VMAF=1 + * aomedia:3296 Fix the conflict --enable-tx-size-search=0 with nonrd mode + --enable-tx-size-search will be ignored in non-rd pick mode + * aomedia:3304 Fix off-by-one error of max w/h in validate_config + * aomedia:3306 Do not use pthread_setname_np on GNU/Hurd + * aomedia:3325 row-multithreading produces invalid bitstream in some cases + * chromium:1346938, chromium:1338114 + * compiler_flags.cmake: fix flag detection w/cmake 3.17-3.18.2 + * tools/*.py: update to python3 + * aom_configure.cmake: detect PIE and set CONFIG_PIC + * test/simd_cmp_impl: use explicit types w/CompareSimd* + * rtc: Fix to disable segm for aq-mode=3 + * rtc: Fix to color_sensitivity in variance partition + * rtc-screen: Fix bsize in model rd computation for intra chroma + * Fixes to ensure the correct behavior of the encoder algorithms (like + segmentation, computation of statistics, etc.) + +2022-06-17 v3.4.0 + This release includes compression efficiency and perceptual quality + improvements, speedup and memory optimizations, and some new features. + There are no ABI or API breaking changes in this release. + + - New Features + * New --dist-metric flag with "qm-psnr" value to use quantization + matrices in the distortion computation for RD search. The default + value is "psnr". + * New command line option "--auto-intra-tools-off=1" to make + all-intra encoding faster for high bit rate under + "--deltaq-mode=3" mode. + * New rate control library aom_av1_rc for real-time hardware + encoders. Supports CBR for both one spatial layer and SVC. + * New image format AOM_IMG_FMT_NV12 can be used as input to the + encoder. The presence of AOM_IMG_FMT_NV12 can be detected at + compile time by checking if the macro AOM_HAVE_IMG_FMT_NV12 is + defined. + * New codec controls for the encoder: + o AV1E_SET_AUTO_INTRA_TOOLS_OFF. Only in effect if + --deltaq-mode=3. + o AV1E_SET_RTC_EXTERNAL_RC + o AV1E_SET_FP_MT. Only supported if libaom is built with + -DCONFIG_FRAME_PARALLEL_ENCODE=1. + o AV1E_GET_TARGET_SEQ_LEVEL_IDX + * New key-value pairs for the key-value API: + o --auto-intra-tools-off=0 (default) or 1. Only in effect if + --deltaq-mode=3. + o --strict-level-conformance=0 (default) or 1 + o --fp-mt=0 (default) or 1. Only supported if libaom is built + with -DCONFIG_FRAME_PARALLEL_ENCODE=1. + * New aomenc options (not supported by the key-value API): + o --nv12 + + - Compression Efficiency Improvements + * Correctly calculate SSE for high bitdepth in skip mode, 0.2% to + 0.6% coding gain. + * RTC at speed 9/10: BD-rate gain of ~4/5% + * RTC screen content coding: many improvements for real-time screen + at speed 10 (quality, speedup, and rate control), up to high + resolutions (1080p). + * RTC-SVC: fixes to make intra-only frames work for spatial layers. + * RTC-SVC: quality improvements for temporal layers. + * AV1 RT: A new passive rate control strategy for screen content, an + average of 7.5% coding gain, with some clips of 20+%. The feature + is turned off by default due to higher bit rate variation. 
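+
+   The compile-time NV12 detection described under "New Features" above can
+   be used like this (a hedged sketch; the 640x360 size, the 32-byte
+   alignment and the I420 fallback are illustrative assumptions):
+
+     #include <stddef.h>
+     #include <aom/aom_image.h>
+
+     static aom_image_t *alloc_encoder_input(void) {
+     #if defined(AOM_HAVE_IMG_FMT_NV12)
+       /* This libaom build accepts NV12 input directly. */
+       return aom_img_alloc(NULL, AOM_IMG_FMT_NV12, 640, 360, 32);
+     #else
+       /* Older builds: fall back to planar I420 input. */
+       return aom_img_alloc(NULL, AOM_IMG_FMT_I420, 640, 360, 32);
+     #endif
+     }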
+ + - Perceptual Quality Improvements + * RTC: Visual quality improvements for high speeds (9/10) + * Improvements in coding quality for all intra mode + + - Speedup and Memory Optimizations + * ~10% speedup in good quality mode encoding. + * ~7% heap memory reduction in good quality encoding mode for speed + 5 and 6. + * Ongoing improvements to intra-frame encoding performance on Arm + * Faster encoding speed for "--deltaq-mode=3" mode. + * ~10% speedup for speed 5/6, ~15% speedup for speed 7/8, and + ~10% speedup for speed 9/10 in real time encoding mode + * ~20% heap memory reduction in still-picture encoding mode for + 360p-720p resolutions with multiple threads + * ~13% speedup for speed 6 and ~12% speedup for speed 9 in + still-picture encoding mode. + * Optimizations to improve multi-thread efficiency for still-picture + encoding mode. + + - Bug Fixes + * b/204460717: README.md: replace master with main + * b/210677928: libaom disable_order is surprising for + max_reference_frames=3 + * b/222461449: -DCONFIG_TUNE_BUTTERAUGLI=1 broken + * b/227207606: write_greyscale writes incorrect chroma in highbd + mode + * b/229955363: Integer-overflow in linsolve_wiener + * https://crbug.com/aomedia/2032 + * https://crbug.com/aomedia/2397 + * https://crbug.com/aomedia/2563 + * https://crbug.com/aomedia/2815 + * https://crbug.com/aomedia/3009 + * https://crbug.com/aomedia/3018 + * https://crbug.com/aomedia/3045 + * https://crbug.com/aomedia/3101 + * https://crbug.com/aomedia/3130 + * https://crbug.com/aomedia/3173 + * https://crbug.com/aomedia/3184 + * https://crbug.com/aomedia/3187 + * https://crbug.com/aomedia/3190 + * https://crbug.com/aomedia/3195 + * https://crbug.com/aomedia/3197 + * https://crbug.com/aomedia/3201 + * https://crbug.com/aomedia/3202 + * https://crbug.com/aomedia/3204 + * https://crbug.com/aomedia/3205 + * https://crbug.com/aomedia/3207 + * https://crbug.com/aomedia/3208 + * https://crbug.com/aomedia/3209 + * https://crbug.com/aomedia/3213 + * https://crbug.com/aomedia/3214 + * https://crbug.com/aomedia/3219 + * https://crbug.com/aomedia/3222 + * https://crbug.com/aomedia/3223 + * https://crbug.com/aomedia/3225 + * https://crbug.com/aomedia/3226 + * https://crbug.com/aomedia/3228 + * https://crbug.com/aomedia/3232 + * https://crbug.com/aomedia/3236 + * https://crbug.com/aomedia/3237 + * https://crbug.com/aomedia/3238 + * https://crbug.com/aomedia/3240 + * https://crbug.com/aomedia/3243 + * https://crbug.com/aomedia/3244 + * https://crbug.com/aomedia/3246 + * https://crbug.com/aomedia/3248 + * https://crbug.com/aomedia/3250 + * https://crbug.com/aomedia/3251 + * https://crbug.com/aomedia/3252 + * https://crbug.com/aomedia/3255 + * https://crbug.com/aomedia/3257 + * https://crbug.com/aomedia/3259 + * https://crbug.com/aomedia/3260 + * https://crbug.com/aomedia/3267 + * https://crbug.com/aomedia/3268 + * https://crbug.com/aomedia/3269 + * https://crbug.com/aomedia/3276 + * https://crbug.com/aomedia/3278 + * https://crbug.com/chromium/1290068 + * https://crbug.com/chromium/1303237 + * https://crbug.com/chromium/1304990 + * https://crbug.com/chromium/1321141 + * https://crbug.com/chromium/1321388 + * https://crbug.com/oss-fuzz/44846 + * https://crbug.com/oss-fuzz/44856 + * https://crbug.com/oss-fuzz/44862 + * https://crbug.com/oss-fuzz/44904 + * https://crbug.com/oss-fuzz/45056 + +2022-01-28 v3.3.0 + This release includes compression efficiency and perceptual quality + improvements, speedup and memory optimizations, some new features, and + several bug fixes. 
+ + - New Features + * AV1 RT: Introducing CDEF search level 5 + * Changed real time speed 4 to behave the same as real time speed 5 + * Add --deltaq-strength + * rtc: Allow scene-change and overshoot detection for svc + * rtc: Intra-only frame for svc + * AV1 RT: Option 2 for codec control AV1E_SET_ENABLE_CDEF to disable + CDEF on non-ref frames + * New codec controls AV1E_SET_LOOPFILTER_CONTROL and + AOME_GET_LOOPFILTER_LEVEL + * Improvements to three pass encoding + + - Compression Efficiency Improvements + * Overall compression gains: 0.6% + + - Perceptual Quality Improvements + * Improves the perceptual quality of high QP encoding for delta-q mode 4 + * Auto select noise synthesis level for all intra + + - Speedup and Memory Optimizations + * Added many SSE2 optimizations. + * Good quality 2-pass encoder speedups: + o Speed 2: 9% + o Speed 3: 12.5% + o Speed 4: 8% + o Speed 5: 3% + o Speed 6: 4% + * Real time mode encoder speedups: + o Speed 5: 2.6% BDRate gain, 4% speedup + o Speed 6: 3.5% BDRate gain, 4% speedup + o Speed 9: 1% BDRate gain, 3% speedup + o Speed 10: 3% BDRate gain, neutral speedup + * All intra encoding speedups (AVIF): + o Single thread - speed 6: 8% + o Single thread - speed 9: 15% + o Multi thread(8) - speed 6: 14% + o Multi thread(8) - speed 9: 34% + + - Bug Fixes + * Issue 3163: Segmentation fault when using --enable-keyframe-filtering=2 + * Issue 2436: Integer overflow in av1_warp_affine_c() + * Issue 3226: armv7 build failure due to gcc-11 + * Issue 3195: Bug report on libaom (AddressSanitizer: heap-buffer-overflow) + * Issue 3191: Bug report on libaom (AddressSanitizer: SEGV on unknown + address) + * Issue 3176: Some SSE2/SADx4AvgTest.* tests fail on Windows + * Issue 3175: Some SSE2/SADSkipTest.* tests fail on Windows + +2021-10-13 v3.2.0 + This release includes compression efficiency and perceptual quality + improvements, speedup and memory optimizations, as well as some new + features. + + - New Features + * Introduced speeds 7, 8, and 9 for all intra mode. + * Introduced speed 10 for real time mode. + * Introduced an API that allows external partition decisions. + * SVC: added support for compound prediction. + * SVC: added support for fixed SVC modes. + + - Compression Efficiency Improvements + * Intra-mode search improvement. + * Improved real time (RT) mode BDrate savings by ~5% (RT speed 5) + and ~12% (RT speed 6). The improvement was measured on the video + conference set. + * Improved real time mode for nonrd path (speed 7, 8, 9): BDrate + gains of ~3-5%. + * Rate control and RD adjustments based on ML research in VP9. + Gains of ~0.5-1.0% for HD. + + - Perceptual Quality Improvements + * Added a new mode --deltaq-mode=3 to improve perceptual quality + based on a differential contrast model for still images. + * Added a new mode --deltaq-mode=4 to improve perceptual quality + based on user rated cq_level data set for still images. + * Weighting of some intra mode and partition size choices to better + manage and retain texture. + + - Speedup and Memory Optimizations + * Further improved 2-pass good quality encoder speed: + o Speed 2 speedup: 18% + o Speed 3 speedup: 22% + o Speed 4 speedup: 37% + o Speed 5 speedup: 30% + o Speed 6 speedup: 20% + * Optimized the real time encoder (measured on the video conference + set): + o RT speed 5 speedup: 110% + o RT speed 6 speedup: 77% + + - Bug Fixes + * Issue 3069: Fix one-pass mode keyframe placement off-by-one error. + * Issue 3156: Fix a bug in av1_quantize_lp AVX2 optimization. 
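+
+   The --deltaq-mode values described above can also be set through the
+   key-value codec option interface (a hedged sketch, assuming the option
+   key matches the aomenc flag name, as the other key-value pairs in this
+   changelog do, and that `codec` is an already-initialised encoder
+   context):
+
+     #include <aom/aom_codec.h>
+
+     static aom_codec_err_t
+     enable_still_image_deltaq(aom_codec_ctx_t *codec) {
+       /* "3": perceptual delta-q for still images, per the notes above. */
+       return aom_codec_set_option(codec, "deltaq-mode", "3");
+     }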
+ +2021-09-29 v3.1.3 + This release includes several bug fixes. + + - Bug fixes: + The following four cmake changes should help the people building + libaom using MSVC. + 1. exports: use CMAKE_SHARED_LIBRARY_PREFIX to determine lib name + https://aomedia-review.googlesource.com/c/aom/+/142342 + 2. aom_install: Install lib dlls to bindir + https://aomedia-review.googlesource.com/c/aom/+/146546 + 3. aom_install: use relpath for install + https://aomedia-review.googlesource.com/c/aom/+/146550 + 4. aom_install: don't exclude msvc from install + https://aomedia-review.googlesource.com/c/aom/+/146547 + + aom/aom_encoder.h: remove configure option reference + https://aomedia-review.googlesource.com/c/aom/+/146743 + + Issue 3113: Tests for detecting chroma subsampling in + av1_copy_and_extend_frame() do not work when y_width or y_height is + 1 + + Issue 3115: image2yuvconfig() should calculate uv_crop_width and + uv_crop_height from y_crop_width and y_crop_height + + Issue 3140: rc_overshoot_pct is documented as having a range of + 0-1000, but is range checked against 0-100 + + Issue 3147: Build failure on Apple M1 arm64 + +2021-07-20 v3.1.2 + This release includes several bug fixes. + + - Bug fixes: + exports.cmake: use APPLE and WIN32 and use def for mingw-w64 + https://aomedia-review.googlesource.com/c/aom/+/139882 + + Issue 2993: Incorrect spatial_id when decoding base layer of + multi-layer stream + + Issue 3080: Chroma Resampling by Encoder on Y4M Inputs Files Tagged + as C420mpeg2 + + Issue 3081: Use of uninitialized value $version_extra in + concatenation (.) or string at aom/build/cmake/version.pl line 88. + +2021-06-08 v3.1.1 + This release includes several bug fixes. + + - Bug fixes: + Issue 2965: Cherry-picked the following four commits for the + tune=butteraugli mode. + 1. Add libjxl to pkg_config if enabled: + https://aomedia-review.googlesource.com/c/aom/+/136044 + 2. Declare set_mb_butteraugli_rdmult_scaling static: + https://aomedia-review.googlesource.com/c/aom/+/134506 + 3. Add color range detection in tune=butteraugli mode: + https://aomedia-review.googlesource.com/c/aom/+/135521 + 4. Enable tune=butteraugli in all-intra mode: + https://aomedia-review.googlesource.com/c/aom/+/136082 + + Issue 3021: Fix vmaf model initialization error when not set to + tune=vmaf + + Issue 3050: Compilation fails with -DCONFIG_TUNE_VMAF=1 + + Issue 3054: Consistent crash on near-static screen content, keyframe + related + +2021-05-03 v3.1.0 + This release adds an "all intra" mode to the encoder, which significantly + speeds up the encoding of AVIF still images at speed 6. + + - Upgrading: + All intra mode for encoding AVIF still images and AV1 all intra videos: + AOM_USAGE_ALL_INTRA (2) can be passed as the 'usage' argument to + aom_codec_enc_config_default(). + + New encoder control IDs added: + - AV1E_SET_ENABLE_DIAGONAL_INTRA: Enable diagonal (D45 to D203) intra + prediction modes (0: false, 1: true (default)). Also available as + "enable-diagonal-intra" for the aom_codec_set_option() function. + + New aom_tune_metric enum value: AOM_TUNE_BUTTERAUGLI. The new aomenc option + --tune=butteraugli was added to optimize the encoder's perceptual quality by + optimizing the Butteraugli metric. Install libjxl (JPEG XL) and then pass + -DCONFIG_TUNE_BUTTERAUGLI=1 to the cmake command to enable it. + + Addition of support for libvmaf 2.x. + + - Enhancements: + Heap memory consumption for encoding AVIF still images is significantly + reduced. 
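+
+   Putting the upgrade notes above together, a minimal all-intra encoder
+   setup might look as follows (a hedged sketch: the 1280x720 frame size,
+   the single-image limit and the choice to disable diagonal intra
+   prediction are illustrative assumptions, not recommendations):
+
+     #include <aom/aom_encoder.h>
+     #include <aom/aomcx.h>
+
+     static int init_all_intra_encoder(aom_codec_ctx_t *codec) {
+       aom_codec_iface_t *iface = aom_codec_av1_cx();
+       aom_codec_enc_cfg_t cfg;
+       /* AOM_USAGE_ALL_INTRA (2) selects the all-intra defaults. */
+       if (aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA))
+         return -1;
+       cfg.g_w = 1280;
+       cfg.g_h = 720;
+       cfg.g_limit = 1; /* a single still image, as for AVIF */
+       if (aom_codec_enc_init(codec, iface, &cfg, 0)) return -1;
+       /* Option name as quoted in the upgrade notes above. */
+       aom_codec_set_option(codec, "enable-diagonal-intra", "0");
+       return 0;
+     }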
+ + - Bug fixes: + Issue 2601: third_party/libaom fails licensecheck + + Issue 2950: Conditional expression for rc->this_key_frame_forced is always + true in find_next_key_frame() + + Issue 2988: "make install" installs the aom.h header twice + + Issue 2992: Incorrectly printing the temporal_id twice in dump_obu tool + + Issue 2998: + + Issue 2999: + + Issue 3000: + +2021-02-24 v3.0.0 + This release includes compression efficiency improvement, speed improvement + for realtime mode, as well as some new APIs. + + - Upgrading: + Support for PSNR calculation based on stream bit-depth. + + New encoder control IDs added: + - AV1E_SET_ENABLE_RECT_TX + - AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP + - AV1E_GET_BASELINE_GF_INTERVAL + - AV1E_SET_ENABLE_DNL_DENOISING + + New decoder control IDs added: + - AOMD_GET_FWD_KF_PRESENT + - AOMD_GET_FRAME_FLAGS + - AOMD_GET_ALTREF_PRESENT + - AOMD_GET_TILE_INFO + - AOMD_GET_SCREEN_CONTENT_TOOLS_INFO + - AOMD_GET_STILL_PICTURE + - AOMD_GET_SB_SIZE + - AOMD_GET_SHOW_EXISTING_FRAME_FLAG + - AOMD_GET_S_FRAME_INFO + + New aom_tune_content enum value: AOM_CONTENT_FILM + + New aom_tune_metric enum value: AOM_TUNE_VMAF_NEG_MAX_GAIN + + Coefficient and mode update can be turned off via + AV1E_SET_{COEFF/MODE}_COST_UPD_FREQ. + + New key & value API added, available with aom_codec_set_option() function. + + Scaling API expanded to include 1/4, 3/4 and 1/8. + + - Enhancements: + Better multithreading performance with realtime mode. + + New speed 9 setting for faster realtime encoding. + + Smaller binary size with low bitdepth and realtime only build. + + Temporal denoiser and its optimizations on x86 and Neon. + + Optimizations for scaling. + + Faster encoding with speed settings 2 to 6 for good encoding mode. + + Improved documentation throughout the library, with function level + documentation, tree view and support for the dot tool. + + - Bug fixes: + Aside from those mentioned in v2.0.1 and v2.0.2, this release includes the + following bug fixes: + + Issue 2940: Segfault when encoding with --use-16bit-internal and --limit > 1 + + Issue 2941: Decoder mismatch with --rt --bit-depth=10 and --cpu-used=8 + + Issue 2895: mingw-w64 i686 gcc fails to build + + Issue 2874: Separate ssse3 functions from sse2 file. + +2021-02-09 v2.0.2 + This release includes several bug fixes. + + - Bug fixes: + Issue 2643: Modify the assertion in temporal filter intrinsics. + + Issue 2648: Fix unit test ThreadTestLarge.EncoderResultTest/49 + assertion failure. + + Issue 2869: Add -Wimplicit-function-declaration as C flag only. + + Issue 2878: Avoid memset in the av1_filter_intra_predictor module + functions. + + Issue 2903: Fix a typo bug in apply_temporal_filter_planewise. + + Call av1_setup_frame_size() when dropping a frame in the + encode_frame_to_data_rate() function in av1/encoder/encoder.c. + +2020-11-25 v2.0.1 + This release includes two bug fixes. + + - Bug fixes: + Issue 2723: Fix crash in chroma_check() when generating a monochrome + encoded stream in real-time mode. + + Issue 2833: Fix crash on some input when reduced still picture header is + used in real-time mode and speed >=7. + +2020-05-07 v2.0.0 "Applejack" + First official release of libaom. + This release includes new real-time mode and SVC support. + + - Upgrading: + AOM_SET_POSTPROC, AOM_CODEC_CAP_POSTPROC and AOM_CODEC_USE_POSTPROC are + removed. + + AOM_SET_DBG_* is removed. + + Multi-resolution encoding is removed. + + put_frame and put_slice callbacks are removed. 
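+
+   For reference, the AOMD_GET_* decoder controls added in v3.0.0 (listed
+   further up) are read back through aom_codec_control(); a hedged sketch,
+   assuming an int output type for AOMD_GET_FWD_KF_PRESENT (aom/aomdx.h is
+   authoritative) and that `decoder` is an already-initialised decoder
+   context:
+
+     #include <aom/aom_decoder.h>
+     #include <aom/aomdx.h>
+
+     static int forward_keyframes_present(aom_codec_ctx_t *decoder) {
+       int fwd_kf = 0; /* output type assumed to be int */
+       if (aom_codec_control(decoder, AOMD_GET_FWD_KF_PRESENT, &fwd_kf) !=
+           AOM_CODEC_OK)
+         return -1;
+       return fwd_kf;
+     }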
+ + - Enhancements: + Full-sweep document update for codec controls. + +2018-06-28 v1.0.0 + AOMedia Codec Workgroup Approved version 1.0 + +2016-04-07 v0.1.0 "AOMedia Codec 1" + This release is the first Alliance for Open Media codec. diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt new file mode 100644 index 0000000000..76944e6917 --- /dev/null +++ b/third_party/aom/CMakeLists.txt @@ -0,0 +1,1035 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(CONFIG_TFLITE) + cmake_minimum_required(VERSION 3.11) +else() + cmake_minimum_required(VERSION 3.9) +endif() + +set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") +set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}") +if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}") + message( + FATAL_ERROR "Building from within the aom source tree is not supported.\n" + "Hint: Run these commands\n" + "$ rm -rf CMakeCache.txt CMakeFiles\n" + "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n" + "And re-run CMake from the aom_build directory.") +endif() + +project(AOM C CXX) + +# GENERATED source property global visibility. +if(POLICY CMP0118) + cmake_policy(SET CMP0118 NEW) +endif() + +if(NOT EMSCRIPTEN) + if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE + "Release" + CACHE STRING "Build type: Debug, Release, RelWithDebInfo or MinSizeRel" + FORCE) + endif() +endif() + +if(MSVC AND MSVC_VERSION LESS 1920) + message( + WARNING + "MSVC versions prior to 2019 (v16) are not supported and may generate" + " incorrect code!") +endif() + +# Library version info. Update LT_CURRENT, LT_REVISION and LT_AGE when making a +# public release by following the guidelines in the libtool document: +# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info +# +# c=, r=, a= +# +# libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is +# passed to libtool. +# +# We set SO_FILE_VERSION = [c-a].a.r +set(LT_CURRENT 11) +set(LT_REVISION 0) +set(LT_AGE 8) +math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}") +set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}") +unset(LT_CURRENT) +unset(LT_REVISION) +unset(LT_AGE) + +# Enable generators like Xcode and Visual Studio to place projects in folders. 
+set_property(GLOBAL PROPERTY USE_FOLDERS TRUE) + +include("${AOM_ROOT}/build/cmake/aom_configure.cmake") +if(CONFIG_THREE_PASS) + include("${AOM_ROOT}/common/ivf_dec.cmake") +endif() +include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake") +include("${AOM_ROOT}/aom_mem/aom_mem.cmake") +include("${AOM_ROOT}/aom_ports/aom_ports.cmake") +include("${AOM_ROOT}/aom_scale/aom_scale.cmake") +include("${AOM_ROOT}/aom_util/aom_util.cmake") +include("${AOM_ROOT}/av1/av1.cmake") +include("${AOM_ROOT}/build/cmake/aom_install.cmake") +include("${AOM_ROOT}/build/cmake/sanitizers.cmake") +include("${AOM_ROOT}/build/cmake/util.cmake") +include("${AOM_ROOT}/test/test.cmake") + +list(APPEND AOM_RTCD_SOURCES + "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" + "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" + "${AOM_CONFIG_DIR}/config/av1_rtcd.h" + "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" + "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" + "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl" + "${AOM_ROOT}/av1/common/av1_rtcd.c" + "${AOM_ROOT}/build/cmake/rtcd.pl") + +list(APPEND AOM_LIBWEBM_SOURCES + "${AOM_ROOT}/third_party/libwebm/common/hdr_util.cc" + "${AOM_ROOT}/third_party/libwebm/common/hdr_util.h" + "${AOM_ROOT}/third_party/libwebm/common/webmids.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.cc" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxertypes.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.cc" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.h" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.cc" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.h" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.cc" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.h") + +list(APPEND AOM_LIBYUV_SOURCES + "${AOM_ROOT}/third_party/libyuv/include/libyuv/basic_types.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_argb.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_from.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/cpu_id.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/planar_functions.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/rotate.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/row.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale_row.h" + "${AOM_ROOT}/third_party/libyuv/source/convert_argb.cc" + "${AOM_ROOT}/third_party/libyuv/source/cpu_id.cc" + "${AOM_ROOT}/third_party/libyuv/source/planar_functions.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_any.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_common.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_gcc.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_mips.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_neon.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_neon64.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_win.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_any.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_common.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_gcc.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_mips.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_neon.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_neon64.cc" + 
"${AOM_ROOT}/third_party/libyuv/source/scale_win.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_uv.cc") + +list(APPEND AOM_SOURCES + "${AOM_CONFIG_DIR}/config/aom_config.c" + "${AOM_CONFIG_DIR}/config/aom_config.h" + "${AOM_ROOT}/aom/aom.h" + "${AOM_ROOT}/aom/aom_codec.h" + "${AOM_ROOT}/aom/aom_decoder.h" + "${AOM_ROOT}/aom/aom_encoder.h" + "${AOM_ROOT}/aom/aom_external_partition.h" + "${AOM_ROOT}/aom/aom_frame_buffer.h" + "${AOM_ROOT}/aom/aom_image.h" + "${AOM_ROOT}/aom/aom_integer.h" + "${AOM_ROOT}/aom/aomcx.h" + "${AOM_ROOT}/aom/aomdx.h" + "${AOM_ROOT}/aom/internal/aom_codec_internal.h" + "${AOM_ROOT}/aom/internal/aom_image_internal.h" + "${AOM_ROOT}/aom/src/aom_codec.c" + "${AOM_ROOT}/aom/src/aom_decoder.c" + "${AOM_ROOT}/aom/src/aom_encoder.c" + "${AOM_ROOT}/aom/src/aom_image.c" + "${AOM_ROOT}/aom/src/aom_integer.c") + +list(APPEND AOM_COMMON_APP_UTIL_SOURCES + "${AOM_ROOT}/av1/arg_defs.c" + "${AOM_ROOT}/av1/arg_defs.h" + "${AOM_ROOT}/common/args_helper.c" + "${AOM_ROOT}/common/args_helper.h" + "${AOM_ROOT}/common/args.c" + "${AOM_ROOT}/common/args.h" + "${AOM_ROOT}/common/av1_config.c" + "${AOM_ROOT}/common/av1_config.h" + "${AOM_ROOT}/common/md5_utils.c" + "${AOM_ROOT}/common/md5_utils.h" + "${AOM_ROOT}/common/tools_common.c" + "${AOM_ROOT}/common/tools_common.h" + "${AOM_ROOT}/common/video_common.h" + "${AOM_ROOT}/common/rawenc.c" + "${AOM_ROOT}/common/rawenc.h" + "${AOM_ROOT}/common/y4menc.c" + "${AOM_ROOT}/common/y4menc.h" + "${AOM_ROOT}/common/ivfdec.c" + "${AOM_ROOT}/common/ivfdec.h") + +list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/obudec.c" + "${AOM_ROOT}/common/obudec.h" "${AOM_ROOT}/common/video_reader.c" + "${AOM_ROOT}/common/video_reader.h") + +list(APPEND AOM_ENCODER_APP_UTIL_SOURCES + "${AOM_ROOT}/common/ivfenc.c" + "${AOM_ROOT}/common/ivfenc.h" + "${AOM_ROOT}/common/video_writer.c" + "${AOM_ROOT}/common/video_writer.h" + "${AOM_ROOT}/common/warnings.c" + "${AOM_ROOT}/common/warnings.h" + "${AOM_ROOT}/common/y4minput.c" + "${AOM_ROOT}/common/y4minput.h" + "${AOM_ROOT}/examples/encoder_util.h" + "${AOM_ROOT}/examples/encoder_util.c") + +list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c" + "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c" + "${AOM_ROOT}/stats/rate_hist.h") + +list(APPEND AOM_VERSION_SOURCES "${AOM_CONFIG_DIR}/config/aom_version.h") + +list(APPEND AOM_WEBM_DECODER_SOURCES "${AOM_ROOT}/common/webmdec.cc" + "${AOM_ROOT}/common/webmdec.h") + +list(APPEND AOM_WEBM_ENCODER_SOURCES "${AOM_ROOT}/common/webmenc.cc" + "${AOM_ROOT}/common/webmenc.h") + +include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR} ${AOM_ROOT}/apps + ${AOM_ROOT}/common ${AOM_ROOT}/examples ${AOM_ROOT}/stats) + +# Targets +add_library(aom_version ${AOM_VERSION_SOURCES}) +add_no_op_source_file_to_target(aom_version c) +add_custom_command(OUTPUT "${AOM_CONFIG_DIR}/config/aom_version.h" + COMMAND ${CMAKE_COMMAND} ARGS + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake" + COMMENT "Writing aom_version.h" + VERBATIM) + +add_custom_target(aom_version_check + COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake" + COMMENT "Updating version info if necessary." 
+ VERBATIM) + +if(BUILD_SHARED_LIBS AND NOT MSVC) + # Generate version file immediately for non-MSVC shared builds: The version + # string is needed for the aom target. + execute_process(COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake") +endif() + +add_dependencies(aom_version aom_version_check) + +# TODO(tomfinegan): Move rtcd target setup where it belongs for each rtcd +# source. +add_rtcd_build_step("${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" + "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" + "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" "aom_dsp_rtcd") +add_rtcd_build_step("${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" + "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" "aom_scale_rtcd") +add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl" + "${AOM_CONFIG_DIR}/config/av1_rtcd.h" + "${AOM_ROOT}/av1/common/av1_rtcd.c" "av1_rtcd") + +add_library(aom_rtcd OBJECT ${AOM_RTCD_SOURCES}) +add_dependencies(aom_rtcd aom_version) + +if(ENABLE_EXAMPLES) + add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_encoder_stats) +endif() + +# Xcode generator cannot take a library composed solely of objects. See +# https://gitlab.kitware.com/cmake/cmake/-/issues/17500 +if(XCODE) + set(target_objs_aom ${AOM_SOURCES}) +else() + add_library(aom_obj OBJECT ${AOM_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_obj) + set(target_objs_aom $) +endif() +add_library(aom ${target_objs_aom} $) + +if(BUILD_SHARED_LIBS) + add_library(aom_static STATIC ${target_objs_aom} $) + set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom) + if(MSVC OR (WIN32 AND NOT MINGW)) + # Fix race condition between the import library and the static library. + # Affects MSVC in all three flavors (stock, clang-cl, LLVM -- the latter + # sets MSVC and MINGW both to FALSE). + set_target_properties(aom PROPERTIES ARCHIVE_OUTPUT_NAME "aom_dll") + endif() + + if(NOT MSVC) + # Extract version string and set VERSION/SOVERSION for the aom target. + extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" + aom_version_triple) + + # Strip any trailing version information, if present. + string(FIND "${aom_version_triple}" "-" dash_pos) + if(NOT dash_pos EQUAL -1) + string(SUBSTRING "${aom_version_triple}" 0 ${dash_pos} aom_version_triple) + endif() + + # cmake-format: off + # VERSION is embedded in the .so file name. + # libaom.so -> libaom.so.SOVERSION + # libaom.so.SOVERSION -> libaom.so.VERSION + # libaom.so.VERSION + # cmake-format: on + set_target_properties(aom PROPERTIES SOVERSION ${SO_VERSION}) + set_target_properties(aom PROPERTIES VERSION ${SO_FILE_VERSION}) + endif() +endif() + +if(NOT WIN32 AND NOT APPLE) + target_link_libraries(aom ${AOM_LIB_LINK_TYPE} m) + if(BUILD_SHARED_LIBS) + target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} m) + endif() +endif() + +if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS) + list(APPEND AOM_AV1_RC_SOURCES "${AOM_ROOT}/av1/ratectrl_rtc.h" + "${AOM_ROOT}/av1/ratectrl_rtc.cc") + add_library(aom_av1_rc ${AOM_AV1_RC_SOURCES}) + target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} aom) + if(NOT WIN32 AND NOT APPLE) + target_link_libraries(aom_av1_rc ${AOM_LIB_LINK_TYPE} m) + endif() + set_target_properties(aom_av1_rc PROPERTIES LINKER_LANGUAGE CXX) +endif() + +# List of object and static library targets. 
+set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom) +if(CONFIG_AV1_ENCODER AND NOT BUILD_SHARED_LIBS) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_rc) +endif() +if(BUILD_SHARED_LIBS) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_static) +endif() + +# Setup dependencies. +if(CONFIG_THREE_PASS) + setup_ivf_dec_targets() +endif() +setup_aom_dsp_targets() +setup_aom_mem_targets() +setup_aom_ports_targets() +setup_aom_util_targets() +setup_aom_scale_targets() +setup_av1_targets() + +# Make all library targets depend on aom_rtcd to make sure it builds first. +foreach(aom_lib ${AOM_LIB_TARGETS}) + if(NOT "${aom_lib}" STREQUAL "aom_rtcd") + add_dependencies(${aom_lib} aom_rtcd) + endif() +endforeach() + +# Generate a C file containing the function usage_exit(). Users of the +# aom_common_app_util library must define this function. This is a convenience +# to allow omission of the function from applications that might want to use +# other pieces of the util support without defining usage_exit(). +file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.c" + "#include \n\n#include \"common/tools_common.h\"\n\n" + "void usage_exit(void) { exit(EXIT_FAILURE); }\n") + +# +# Application and application support targets. +# +if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) + add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES}) + set_property(TARGET ${example} PROPERTY FOLDER examples) + if(CONFIG_AV1_DECODER) + add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES}) + set_property(TARGET ${example} PROPERTY FOLDER examples) + # obudec depends on internal headers that require *rtcd.h + add_dependencies(aom_decoder_app_util aom_rtcd) + endif() + if(CONFIG_AV1_ENCODER) + add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES}) + set_property(TARGET ${example} PROPERTY FOLDER examples) + endif() +endif() + +if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES) + add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c" + $ + $) + add_executable(decode_to_md5 "${AOM_ROOT}/examples/decode_to_md5.c" + $ + $) + add_executable(decode_with_drops "${AOM_ROOT}/examples/decode_with_drops.c" + $ + $) + add_executable(simple_decoder "${AOM_ROOT}/examples/simple_decoder.c" + $ + $) + add_executable(scalable_decoder "${AOM_ROOT}/examples/scalable_decoder.c" + $ + $) + + if(CONFIG_ANALYZER) + add_executable(analyzer "${AOM_ROOT}/examples/analyzer.cc" + $ + $) + target_link_libraries(analyzer ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES}) + list(APPEND AOM_APP_TARGETS analyzer) + list(APPEND AOM_DECODER_EXAMPLE_TARGETS analyzer) + endif() + + if(CONFIG_INSPECTION) + add_executable(inspect "${AOM_ROOT}/examples/inspect.c" + $ + $) + list(APPEND AOM_DECODER_EXAMPLE_TARGETS inspect) + + if(EMSCRIPTEN) + add_preproc_definition(_POSIX_SOURCE) + append_link_flag_to_target("inspect" "--emrun") + append_link_flag_to_target("inspect" "-s USE_PTHREADS=0") + append_link_flag_to_target("inspect" "-s WASM=1") + append_link_flag_to_target("inspect" "-s MODULARIZE=1") + append_link_flag_to_target("inspect" "-s ALLOW_MEMORY_GROWTH=1") + append_link_flag_to_target( + "inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'") + append_link_flag_to_target("inspect" + "-s EXPORT_NAME=\"\'DecoderModule\'\"") + append_link_flag_to_target("inspect" "--memory-init-file 0") + + if("${CMAKE_BUILD_TYPE}" STREQUAL "") + + # Default to -O3 when no build type is specified. 
+ append_compiler_flag("-O3") + endif() + + em_link_post_js(inspect "${AOM_ROOT}/tools/inspect-post.js") + endif() + endif() + + # Maintain a list of decoder example targets. + list(APPEND AOM_DECODER_EXAMPLE_TARGETS aomdec decode_to_md5 decode_with_drops + scalable_decoder simple_decoder) + + # Add decoder examples to the app targets list. + list(APPEND AOM_APP_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS}) +endif() + +if(CONFIG_LIBYUV OR CONFIG_TUNE_BUTTERAUGLI) + add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES}) + if(NOT MSVC) + target_compile_options(yuv PRIVATE -Wno-shadow) + endif() + include_directories("${AOM_ROOT}/third_party/libyuv/include") +endif() + +if(CONFIG_AV1_ENCODER) + if(ENABLE_EXAMPLES) + add_executable(aomenc "${AOM_ROOT}/apps/aomenc.c" + $ + $ + $) + add_executable(lossless_encoder "${AOM_ROOT}/examples/lossless_encoder.c" + $ + $) + add_executable(set_maps "${AOM_ROOT}/examples/set_maps.c" + $ + $) + add_executable(simple_encoder "${AOM_ROOT}/examples/simple_encoder.c" + $ + $) + add_executable(twopass_encoder "${AOM_ROOT}/examples/twopass_encoder.c" + $ + $) + add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c" + $ + $) + add_executable(photon_noise_table + "${AOM_ROOT}/examples/photon_noise_table.c" + $ + $) + add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c" + $ + $) + + # Maintain a list of encoder example targets. + list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model + photon_noise_table set_maps simple_encoder scalable_encoder + twopass_encoder) + + if(NOT BUILD_SHARED_LIBS) + add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.cc" + $ + $) + target_link_libraries(svc_encoder_rtc ${AOM_LIB_LINK_TYPE} aom_av1_rc) + list(APPEND AOM_ENCODER_EXAMPLE_TARGETS svc_encoder_rtc) + endif() + endif() + + if(ENABLE_TOOLS) + if(CONFIG_ENTROPY_STATS AND NOT BUILD_SHARED_LIBS) + + # TODO(tomfinegan): Sort out why a simple link command with + # aom_entropy_optimizer.c won't work on macos, but dragging in all the + # helper machinery allows the link to succeed. + add_executable(aom_entropy_optimizer + "${AOM_GEN_SRC_DIR}/usage_exit.c" + "${AOM_ROOT}/tools/aom_entropy_optimizer.c" + $ + $) + + # Maintain a list of encoder tool targets. + list(APPEND AOM_ENCODER_TOOL_TARGETS aom_entropy_optimizer) + endif() + endif() + + # Add encoder examples and tools to the targets list. + list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS} + ${AOM_ENCODER_TOOL_TARGETS}) + + if(CONFIG_TUNE_BUTTERAUGLI) + find_package(PkgConfig) + # Use find_library() with STATIC_LINK_JXL for static build since + # pkg_check_modules() with LIBJXL_STATIC is not working. 
+ if(STATIC_LINK_JXL OR NOT PKG_CONFIG_FOUND) + find_library(LIBJXL_LIBRARIES libjxl.a) + find_library(LIBHWY_LIBRARIES libhwy.a) + find_library(LIBSKCMS_LIBRARIES libskcms.a) + find_library(LIBBROTLICOMMON_LIBRARIES libbrotlicommon-static.a) + find_library(LIBBROTLIENC_LIBRARIES libbrotlienc-static.a) + find_library(LIBBROTLIDEC_LIBRARIES libbrotlidec-static.a) + find_path(LIBJXL_INCLUDE_DIRS butteraugli.h PATH_SUFFIXES jxl) + if(LIBJXL_LIBRARIES + AND LIBHWY_LIBRARIES + AND LIBSKCMS_LIBRARIES + AND LIBBROTLICOMMON_LIBRARIES + AND LIBBROTLIENC_LIBRARIES + AND LIBBROTLIDEC_LIBRARIES + AND LIBJXL_INCLUDE_DIRS) + message(STATUS "Found JXL library: ${LIBJXL_LIBRARIES} " + "${LIBHWY_LIBRARIES} ${LIBSKCMS_LIBRARIES} " + "${LIBBROTLICOMMON_LIBRARIES} ${LIBBROTLIENC_LIBRARIES}" + "${LIBBROTLIDEC_LIBRARIES}") + message(STATUS "Found JXL include: ${LIBJXL_INCLUDE_DIRS}") + else() + message(FATAL_ERROR "JXL library not found.") + endif() + target_link_libraries(aom + PRIVATE ${LIBJXL_LIBRARIES} ${LIBHWY_LIBRARIES} + ${LIBSKCMS_LIBRARIES} + ${LIBBROTLIENC_LIBRARIES} + ${LIBBROTLIDEC_LIBRARIES} + ${LIBBROTLICOMMON_LIBRARIES}) + target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS}) + else() + pkg_check_modules(LIBJXL REQUIRED libjxl) + target_link_libraries(aom PRIVATE ${LIBJXL_LDFLAGS} ${LIBJXL_LIBRARIES}) + target_include_directories(aom_dsp_encoder PRIVATE ${LIBJXL_INCLUDE_DIRS}) + if(LIBJXL_CFLAGS) + append_compiler_flag("${LIBJXL_CFLAGS}") + endif() + pkg_check_modules(LIBHWY REQUIRED libhwy) + target_link_libraries(aom PRIVATE ${LIBHWY_LDFLAGS} ${LIBHWY_LIBRARIES}) + target_include_directories(aom_dsp_encoder + PRIVATE ${LIBLIBHWY_INCLUDE_DIRS}) + if(LIBHWY_CFLAGS) + append_compiler_flag("${LIBHWY_CFLAGS}") + endif() + endif() + + set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX) + if(BUILD_SHARED_LIBS) + set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX) + endif() + + list(APPEND AOM_LIB_TARGETS yuv) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + endif() + + if(CONFIG_TFLITE) + include(FetchContent) + + set(TFLITE_TAG "v2.6.1") + + message(STATUS "Fetching TFLite ${TFLITE_TAG}...") + + # static linking makes life with TFLite much easier + set(TFLITE_C_BUILD_SHARED_LIBS OFF) + + # We don't care about comparing against these delegates (yet), and disabling + # it reduces compile time meaningfully + set(TFLITE_ENABLE_RUY OFF) + set(TFLITE_ENABLE_XNNPACK OFF) + + fetchcontent_declare(tflite + GIT_REPOSITORY https://github.com/tensorflow/tensorflow + GIT_TAG ${TFLITE_TAG} + GIT_SHALLOW TRUE) + + fetchcontent_getproperties(tflite) + if(NOT tflite_POPULATED) + fetchcontent_populate(tflite) + # Some of the subprojects (e.g. Eigen) are very noisy and emit status + # messages all the time. Temporary ignore status messages while adding + # this to silence it. Ugly but effective. 
+ set(OLD_CMAKE_MESSAGE_LOG_LEVEL ${CMAKE_MESSAGE_LOG_LEVEL}) + set(CMAKE_MESSAGE_LOG_LEVEL WARNING) + add_subdirectory(${tflite_SOURCE_DIR}/tensorflow/lite/c + ${tflite_BINARY_DIR}) + set(CMAKE_MESSAGE_LOG_LEVEL ${OLD_CMAKE_MESSAGE_LOG_LEVEL}) + endif() + + # Disable some noisy warnings in tflite + target_compile_options(tensorflow-lite PRIVATE -w) + + # tensorflowlite_c is implicitly declared by this FetchContent + include_directories(${tflite_SOURCE_DIR}) + target_link_libraries(aom PRIVATE tensorflow-lite) + endif() + + if(CONFIG_TUNE_VMAF) + find_package(PkgConfig) + if(PKG_CONFIG_FOUND) + pkg_check_modules(VMAF REQUIRED libvmaf) + if(BUILD_SHARED_LIBS) + target_link_libraries(aom_static + PRIVATE ${VMAF_LDFLAGS} ${VMAF_LIBRARIES}) + endif() + target_link_libraries(aom PRIVATE ${VMAF_LDFLAGS} ${VMAF_LIBRARIES}) + target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS}) + if(VMAF_CFLAGS) + foreach(flag "${VMAF_CFLAGS}") + append_compiler_flag("${flag}") + endforeach() + endif() + else() + message(FATAL_ERROR "CONFIG_TUNE_VMAF error: pkg-config not found.") + endif() + set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX) + if(BUILD_SHARED_LIBS) + set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX) + endif() + endif() +endif() + +if(ENABLE_EXAMPLES) + + # Maintain a separate variable listing only the examples to facilitate + # installation of example programs into an examples sub directory of + # $AOM_DIST_DIR/bin when building the dist target. + list(APPEND AOM_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS} + ${AOM_ENCODER_EXAMPLE_TARGETS}) +endif() + +if(ENABLE_TOOLS) + if(CONFIG_AV1_DECODER) + add_executable(dump_obu "${AOM_GEN_SRC_DIR}/usage_exit.c" + "${AOM_ROOT}/tools/dump_obu.cc" + "${AOM_ROOT}/tools/obu_parser.cc" + "${AOM_ROOT}/tools/obu_parser.h" + $ + $) + + list(APPEND AOM_TOOL_TARGETS dump_obu) + list(APPEND AOM_APP_TARGETS dump_obu) + + # Maintain a separate variable listing only the examples to facilitate + # installation of example programs into an tools sub directory of + # $AOM_DIST_DIR/bin when building the dist target. 
+ list(APPEND AOM_TOOL_TARGETS ${AOM_DECODER_TOOL_TARGETS} + ${AOM_ENCODER_TOOL_TARGETS}) + endif() +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + add_executable(aom_cx_set_ref "${AOM_ROOT}/examples/aom_cx_set_ref.c" + $ + $) + list(APPEND AOM_EXAMPLE_TARGETS aom_cx_set_ref) + list(APPEND AOM_APP_TARGETS aom_cx_set_ref) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER) + add_executable(lightfield_encoder "${AOM_ROOT}/examples/lightfield_encoder.c" + $ + $) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_encoder) + list(APPEND AOM_APP_TARGETS lightfield_encoder) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER) + add_executable(lightfield_tile_list_decoder + "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c" + $ + $) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_tile_list_decoder) + list(APPEND AOM_APP_TARGETS lightfield_tile_list_decoder) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER) + add_executable(lightfield_decoder "${AOM_ROOT}/examples/lightfield_decoder.c" + $ + $) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_decoder) + list(APPEND AOM_APP_TARGETS lightfield_decoder) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER AND CONFIG_AV1_DECODER) + add_executable(lightfield_bitstream_parsing + "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c" + $ + $ + $) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_bitstream_parsing) + list(APPEND AOM_APP_TARGETS lightfield_bitstream_parsing) +endif() + +foreach(aom_app ${AOM_APP_TARGETS}) + target_link_libraries(${aom_app} ${AOM_LIB_LINK_TYPE} aom) +endforeach() + +if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) + if(CONFIG_LIBYUV) + # Add to existing targets. + foreach(aom_app ${AOM_APP_TARGETS}) + target_sources(${aom_app} PRIVATE $) + set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX) + endforeach() + endif() + + if(CONFIG_WEBM_IO) + add_library(webm OBJECT ${AOM_LIBWEBM_SOURCES}) + include_directories("${AOM_ROOT}/third_party/libwebm") + target_compile_definitions(webm PRIVATE __STDC_CONSTANT_MACROS) + target_compile_definitions(webm PRIVATE __STDC_LIMIT_MACROS) + + if(NOT MSVC) + target_compile_options(webm PRIVATE -Wno-shadow) + endif() + + # Add to existing targets. + if(CONFIG_AV1_DECODER) + target_sources(aom_decoder_app_util PRIVATE ${AOM_WEBM_DECODER_SOURCES}) + endif() + + if(CONFIG_AV1_ENCODER) + target_sources(aom_encoder_app_util PRIVATE ${AOM_WEBM_ENCODER_SOURCES}) + endif() + + foreach(aom_app ${AOM_APP_TARGETS}) + target_sources(${aom_app} PRIVATE $) + set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX) + endforeach() + endif() +endif() + +if(ENABLE_TESTS) + + # Create test_libaom target and the targets it depends on. + setup_aom_test_targets() +endif() + +if(HAVE_PTHREAD_H AND CONFIG_MULTITHREAD) + find_package(Threads) + target_link_libraries(aom ${AOM_LIB_LINK_TYPE} Threads::Threads) + if(BUILD_SHARED_LIBS) + target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} Threads::Threads) + endif() +endif() + +if(XCODE) + + # TODO(tomfinegan): Make sure target has no C++ files before doing this as + # it's not necessary in that case. + if(CONFIG_LIBYUV OR CONFIG_WEBM_IO) + + # The Xcode generator does not obey LINKER_LANGUAGE. Because of the issue + # what looks like a C++ file needs to be in any target that Xcode will link + # when the target contains a C++ dependency. Without this Xcode will try to + # link with the C linker, which always ends badly when a dependency actually + # includes C++. 
+ + # Note: LINKER_LANGUAGE is explicitly set to C++ for all targets touched + # here, it really is the Xcode generator's fault, or just a deficiency in + # Xcode itself. + foreach(aom_app ${AOM_APP_TARGETS}) + add_no_op_source_file_to_target("${aom_app}" "cc") + endforeach() + endif() +endif() + +if(ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$") + + # For historical purposes place the example binaries in the example directory. + file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/examples") + + foreach(target ${AOM_EXAMPLE_TARGETS}) + if(NOT "${target}" MATCHES "aomdec\|aomenc") + set_target_properties(${target} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${AOM_CONFIG_DIR}/examples") + endif() + endforeach() + + if(ENABLE_TOOLS AND AOM_TOOL_TARGETS) + + # The same expectation is true for tool targets. + file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/tools") + set_target_properties(${AOM_TOOL_TARGETS} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${AOM_CONFIG_DIR}/tools") + endif() +endif() + +if(BUILD_SHARED_LIBS) + # Don't use -Wl,-z,defs with Clang's sanitizers. + # + # Clang's AddressSanitizer documentation says "When linking shared libraries, + # the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link + # errors (don't use it with AddressSanitizer)." See + # https://clang.llvm.org/docs/AddressSanitizer.html#usage. + if(NOT WIN32 + AND NOT APPLE + AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE)) + # The -z defs linker option reports unresolved symbol references from object + # files when building a shared library. + if("${CMAKE_VERSION}" VERSION_LESS "3.13") + # target_link_options() is not available before CMake 3.13. + target_link_libraries(aom PRIVATE -Wl,-z,defs) + else() + target_link_options(aom PRIVATE LINKER:-z,defs) + endif() + endif() + + include("${AOM_ROOT}/build/cmake/exports.cmake") + setup_exports_target() +endif() + +# Do not allow implicit vector type conversions on Clang builds (this is already +# the default on GCC builds). +if(CMAKE_C_COMPILER_ID MATCHES "Clang") + append_compiler_flag("-flax-vector-conversions=none") +endif() + +# Handle user supplied compile and link flags last to ensure they're obeyed. +set_user_flags() + +# Aomedia documentation rule. +set(DOXYGEN_VERSION_VALUE 0) +if(ENABLE_DOCS) + find_package(Doxygen) + if(DOXYGEN_FOUND) + # Check if Doxygen version is >= minimum required version(i.e. 1.8.10). + set(MINIMUM_DOXYGEN_VERSION 1008010) + + if(DOXYGEN_VERSION) + # Strip SHA1 from version string if present. + string(REGEX + REPLACE "^([0-9]+\\.[0-9]+\\.[0-9]+).*" "\\1" DOXYGEN_VERSION + ${DOXYGEN_VERSION}) + # Replace dots with semicolons to create a list. + string(REGEX REPLACE "\\." ";" DOXYGEN_VERSION_LIST ${DOXYGEN_VERSION}) + # Parse version components from the list. + list(GET DOXYGEN_VERSION_LIST 0 DOXYGEN_MAJOR) + list(GET DOXYGEN_VERSION_LIST 1 DOXYGEN_MINOR) + list(GET DOXYGEN_VERSION_LIST 2 DOXYGEN_PATCH) + endif() + + # Construct a version value for comparison. + math(EXPR DOXYGEN_MAJOR "${DOXYGEN_MAJOR}*1000000") + math(EXPR DOXYGEN_MINOR "${DOXYGEN_MINOR}*1000") + math(EXPR DOXYGEN_VERSION_VALUE + "${DOXYGEN_MAJOR} + ${DOXYGEN_MINOR} + ${DOXYGEN_PATCH}") + + if(${DOXYGEN_VERSION_VALUE} LESS ${MINIMUM_DOXYGEN_VERSION}) + set(DOXYGEN_FOUND NO) + endif() + endif() + + if(DOXYGEN_FOUND) + include("${AOM_ROOT}/docs.cmake") + setup_documentation_targets() + else() + message( + "--- Cannot find doxygen(version 1.8.10 or newer), ENABLE_DOCS turned off." + ) + set(ENABLE_DOCS OFF) + endif() +endif() + +# Aomedia dist rule. 
+if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES) + list(APPEND AOM_DIST_APPS $) +endif() +if(CONFIG_AV1_ENCODER AND ENABLE_EXAMPLES) + list(APPEND AOM_DIST_APPS $) +endif() + +if(ENABLE_EXAMPLES) + foreach(example ${AOM_EXAMPLE_TARGETS}) + list(APPEND AOM_DIST_EXAMPLES $) + set_property(TARGET ${example} PROPERTY FOLDER examples) + endforeach() +endif() + +if(ENABLE_TOOLS) + foreach(tool ${AOM_TOOL_TARGETS}) + list(APPEND AOM_DIST_TOOLS $) + set_property(TARGET ${tool} PROPERTY FOLDER tools) + endforeach() +endif() + +if(NOT AOM_DIST_DIR) + set(AOM_DIST_DIR "${AOM_CONFIG_DIR}/dist") +endif() + +add_custom_target(dist + COMMAND ${CMAKE_COMMAND} + -DAOM_ROOT=${AOM_ROOT} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_DIST_DIR=${AOM_DIST_DIR} + -DAOM_DIST_APPS="${AOM_DIST_APPS}" + -DAOM_DIST_EXAMPLES="${AOM_DIST_EXAMPLES}" + -DAOM_DIST_TOOLS="${AOM_DIST_TOOLS}" + -DAOM_DIST_INCLUDES="${AOM_INSTALL_INCS}" + -DAOM_DIST_LIBS=$ + -DENABLE_DOCS=${ENABLE_DOCS} -P + "${AOM_ROOT}/build/cmake/dist.cmake" + DEPENDS ${AOM_INSTALL_BINS} ${AOM_INSTALL_LIBS} + ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS} + ${AOM_TOOL_TARGETS}) + +if(ENABLE_DOCS) + add_dependencies(dist docs) +endif() + +# Collect all variables containing libaom source files. +get_cmake_property(all_cmake_vars VARIABLES) +foreach(var ${all_cmake_vars}) + if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_" + AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST" + AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER") + list(APPEND aom_source_vars ${var}) + endif() +endforeach() + +if(NOT CONFIG_AV1_DECODER) + list(FILTER aom_source_vars EXCLUDE REGEX "_DECODER_") +endif() + +# Libaom_srcs.txt generation. +set(libaom_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_srcs.txt") +file(WRITE "${libaom_srcs_txt_file}" "# This file is generated. DO NOT EDIT.\n") + +# Static source file list first. +foreach(aom_source_var ${aom_source_vars}) + foreach(file ${${aom_source_var}}) + if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_ROOT}/" "" file "${file}") + if(NOT CONFIG_AV1_DECODER AND "${file}" MATCHES "aom_decoder") + continue() + endif() + file(APPEND "${libaom_srcs_txt_file}" "${file}\n") + endif() + endforeach() +endforeach() + +file(APPEND "${libaom_srcs_txt_file}" + "# Files below this line are generated by the libaom build system.\n") +foreach(aom_source_var ${aom_source_vars}) + foreach(file ${${aom_source_var}}) + if("${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_CONFIG_DIR}/" "" file "${file}") + file(APPEND "${libaom_srcs_txt_file}" "${file}\n") + endif() + endforeach() +endforeach() + +# Libaom_srcs.gni generation. +set(libaom_srcs_gni_file "${AOM_CONFIG_DIR}/libaom_srcs.gni") +file(WRITE "${libaom_srcs_gni_file}" "# This file is generated. 
DO NOT EDIT.\n") + +foreach(aom_source_var ${aom_source_vars}) + if("${${aom_source_var}}" MATCHES "${AOM_ROOT}") + string(TOLOWER ${aom_source_var} aom_source_var_lowercase) + file(APPEND "${libaom_srcs_gni_file}" "\n${aom_source_var_lowercase} = [\n") + endif() + + foreach(file ${${aom_source_var}}) + if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_ROOT}" "//third_party/libaom/source/libaom" file + "${file}") + if(NOT CONFIG_AV1_DECODER AND "${file}" MATCHES "aom_decoder") + continue() + endif() + file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n") + endif() + endforeach() + + if("${${aom_source_var}}" MATCHES "${AOM_ROOT}") + file(APPEND "${libaom_srcs_gni_file}" "]\n") + endif() +endforeach() + +file(APPEND "${libaom_srcs_gni_file}" + "\n# Files below this line are generated by the libaom build system.\n") + +foreach(aom_source_var ${aom_source_vars}) + if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}") + string(TOLOWER ${aom_source_var} aom_source_var_lowercase) + file(APPEND "${libaom_srcs_gni_file}" + "\n${aom_source_var_lowercase}_gen = [\n") + endif() + foreach(file ${${aom_source_var}}) + if(NOT "${file}" MATCHES "${AOM_ROOT}") + string(REPLACE "${AOM_CONFIG_DIR}" "//third_party/libaom/source/libaom" + file "${file}") + file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n") + endif() + endforeach() + + if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}") + file(APPEND "${libaom_srcs_gni_file}" "]\n") + endif() +endforeach() + +# Generate aom.pc and setup install rule. +setup_aom_install_targets() diff --git a/third_party/aom/LICENSE b/third_party/aom/LICENSE new file mode 100644 index 0000000000..fc340c3764 --- /dev/null +++ b/third_party/aom/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2016, Alliance for Open Media. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + diff --git a/third_party/aom/PATENTS b/third_party/aom/PATENTS new file mode 100644 index 0000000000..fc4de9edf8 --- /dev/null +++ b/third_party/aom/PATENTS @@ -0,0 +1,108 @@ +Alliance for Open Media Patent License 1.0 + +1. License Terms. + +1.1. Patent License. 
Subject to the terms and conditions of this License, each + Licensor, on behalf of itself and successors in interest and assigns, + grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive, + no-charge, royalty-free, irrevocable (except as expressly stated in this + License) patent license to its Necessary Claims to make, use, sell, offer + for sale, import or distribute any Implementation. + +1.2. Conditions. + +1.2.1. Availability. As a condition to the grant of rights to Licensee to make, + sell, offer for sale, import or distribute an Implementation under + Section 1.1, Licensee must make its Necessary Claims available under + this License, and must reproduce this License with any Implementation + as follows: + + a. For distribution in source code, by including this License in the + root directory of the source code with its Implementation. + + b. For distribution in any other form (including binary, object form, + and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist, + GDSII, etc.)), by including this License in the documentation, legal + notices, and/or other written materials provided with the + Implementation. + +1.2.2. Additional Conditions. This license is directly from Licensor to + Licensee. Licensee acknowledges as a condition of benefiting from it + that no rights from Licensor are received from suppliers, distributors, + or otherwise in connection with this License. + +1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents + initiates patent litigation or files, maintains, or voluntarily + participates in a lawsuit against another entity or any person asserting + that any Implementation infringes Necessary Claims, any patent licenses + granted under this License directly to the Licensee are immediately + terminated as of the date of the initiation of action unless 1) that suit + was in response to a corresponding suit regarding an Implementation first + brought against an initiating entity, or 2) that suit was brought to + enforce the terms of this License (including intervention in a third-party + action by a Licensee). + +1.4. Disclaimers. The Reference Implementation and Specification are provided + "AS IS" and without warranty. The entire risk as to implementing or + otherwise using the Reference Implementation or Specification is assumed + by the implementer and user. Licensor expressly disclaims any warranties + (express, implied, or otherwise), including implied warranties of + merchantability, non-infringement, fitness for a particular purpose, or + title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO + ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL, + INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF + ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH + OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR + NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +2. Definitions. + +2.1. Affiliate. "Affiliate" means an entity that directly or indirectly + Controls, is Controlled by, or is under common Control of that party. + +2.2. Control. "Control" means direct or indirect control of more than 50% of + the voting power to elect directors of that corporation, or for any other + entity, the power to direct management of such entity. + +2.3. Decoder. "Decoder" means any decoder that conforms fully with all + non-optional portions of the Specification. + +2.4. Encoder. 
"Encoder" means any encoder that produces a bitstream that can + be decoded by a Decoder only to the extent it produces such a bitstream. + +2.5. Final Deliverable. "Final Deliverable" means the final version of a + deliverable approved by the Alliance for Open Media as a Final + Deliverable. + +2.6. Implementation. "Implementation" means any implementation, including the + Reference Implementation, that is an Encoder and/or a Decoder. An + Implementation also includes components of an Implementation only to the + extent they are used as part of an Implementation. + +2.7. License. "License" means this license. + +2.8. Licensee. "Licensee" means any person or entity who exercises patent + rights granted under this License. + +2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers + for sale, imports or distributes any Implementation, or (ii) a person + or entity that has a licensing obligation to the Implementation as a + result of its membership and/or participation in the Alliance for Open + Media working group that developed the Specification. + +2.10. Necessary Claims. "Necessary Claims" means all claims of patents or + patent applications, (a) that currently or at any time in the future, + are owned or controlled by the Licensor, and (b) (i) would be an + Essential Claim as defined by the W3C Policy as of February 5, 2004 + (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential) + as if the Specification was a W3C Recommendation; or (ii) are infringed + by the Reference Implementation. + +2.11. Reference Implementation. "Reference Implementation" means an Encoder + and/or Decoder released by the Alliance for Open Media as a Final + Deliverable. + +2.12. Specification. "Specification" means the specification designated by + the Alliance for Open Media as a Final Deliverable for which this + License was issued. + diff --git a/third_party/aom/README.md b/third_party/aom/README.md new file mode 100644 index 0000000000..4e2eb2756c --- /dev/null +++ b/third_party/aom/README.md @@ -0,0 +1,677 @@ +README.md {#LREADME} +========= +# AV1 Codec Library + +## Contents +1. [Building the lib and applications](#building-the-library-and-applications) + - [Prerequisites](#prerequisites) + - [Get the code](#get-the-code) + - [Basics](#basic-build) + - [Configuration options](#configuration-options) + - [Dylib builds](#dylib-builds) + - [Debugging](#debugging) + - [Cross compiling](#cross-compiling) + - [Sanitizer support](#sanitizers) + - [MSVC builds](#microsoft-visual-studio-builds) + - [Xcode builds](#xcode-builds) + - [Emscripten builds](#emscripten-builds) + - [Extra Build Flags](#extra-build-flags) + - [Build with VMAF support](#build-with-vmaf) +2. [Testing the library](#testing-the-av1-codec) + - [Basics](#testing-basics) + - [Unit tests](#unit-tests) + - [Example tests](#example-tests) + - [Encoder tests](#encoder-tests) + - [IDE hosted tests](#ide-hosted-tests) + - [Downloading test data](#downloading-the-test-data) + - [Adding a new test data file](#adding-a-new-test-data-file) + - [Additional test data](#additional-test-data) + - [Sharded testing](#sharded-testing) + - [Running tests directly](#running-test_libaom-directly) + - [Running tests via CMake](#running-the-tests-via-the-cmake-build) +3. [Coding style](#coding-style) +4. 
[Submitting patches](#submitting-patches) + - [Login cookie](#login-cookie) + - [Contributor agreement](#contributor-agreement) + - [Testing your code](#testing-your-code) + - [Commit message hook](#commit-message-hook) + - [Upload your change](#upload-your-change) + - [Incorporating Reviewer Comments](#incorporating-reviewer-comments) + - [Submitting your change](#submitting-your-change) + - [Viewing change status](#viewing-the-status-of-uploaded-changes) +5. [Support](#support) +6. [Bug reports](#bug-reports) + +## Building the library and applications {#building-the-library-and-applications} + +### Prerequisites {#prerequisites} + + 1. [CMake](https://cmake.org). See CMakeLists.txt for the minimum version + required. + 2. [Git](https://git-scm.com/). + 3. [Perl](https://www.perl.org/). + 4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a + recent version of [nasm](http://www.nasm.us/). If you download yasm with + the intention to work with Visual Studio, please download win32.exe or + win64.exe and rename it into yasm.exe. DO NOT download or use vsyasm.exe. + 5. Building the documentation requires + [doxygen version 1.8.10 or newer](http://doxygen.org). + 6. Emscripten builds require the portable + [EMSDK](https://kripken.github.io/emscripten-site/index.html). + +### Get the code {#get-the-code} + +The AV1 library source code is stored in the Alliance for Open Media Git +repository: + +~~~ + $ git clone https://aomedia.googlesource.com/aom + # By default, the above command stores the source in the aom directory: + $ cd aom +~~~ + +### Basic build {#basic-build} + +CMake replaces the configure step typical of many projects. Running CMake will +produce configuration and build files for the currently selected CMake +generator. For most systems the default generator is Unix Makefiles. The basic +form of a makefile build is the following: + +~~~ + $ cmake path/to/aom + $ make +~~~ + +The above will generate a makefile build that produces the AV1 library and +applications for the current host system after the make step completes +successfully. The compiler chosen varies by host platform, but a general rule +applies: On systems where cc and c++ are present in $PATH at the time CMake is +run the generated build will use cc and c++ by default. + +### Configuration options {#configuration-options} + +The AV1 codec library has a great many configuration options. These come in two +varieties: + + 1. Build system configuration options. These have the form `ENABLE_FEATURE`. + 2. AV1 codec configuration options. These have the form `CONFIG_FEATURE`. + +Both types of options are set at the time CMake is run. The following example +enables ccache and disables the AV1 encoder: + +~~~ + $ cmake path/to/aom -DENABLE_CCACHE=1 -DCONFIG_AV1_ENCODER=0 + $ make +~~~ + +The available configuration options are too numerous to list here. Build system +configuration options can be found at the top of the CMakeLists.txt file found +in the root of the AV1 repository, and AV1 codec configuration options can +currently be found in the file `build/cmake/aom_config_defaults.cmake`. + +### Dylib builds {#dylib-builds} + +A dylib (shared object) build of the AV1 codec library can be enabled via the +CMake built in variable `BUILD_SHARED_LIBS`: + +~~~ + $ cmake path/to/aom -DBUILD_SHARED_LIBS=1 + $ make +~~~ + +This is currently only supported on non-Windows targets. + +### Debugging {#debugging} + +Depending on the generator used there are multiple ways of going about +debugging AV1 components. 
For single configuration generators like the Unix +Makefiles generator, setting `CMAKE_BUILD_TYPE` to Debug is sufficient: + +~~~ + $ cmake path/to/aom -DCMAKE_BUILD_TYPE=Debug +~~~ + +For Xcode, mainly because configuration controls for Xcode builds are buried two +configuration windows deep and must be set for each subproject within the Xcode +IDE individually, `CMAKE_CONFIGURATION_TYPES` should be set to Debug: + +~~~ + $ cmake path/to/aom -G Xcode -DCMAKE_CONFIGURATION_TYPES=Debug +~~~ + +For Visual Studio the in-IDE configuration controls should be used. Simply set +the IDE project configuration to Debug to allow for stepping through the code. + +In addition to the above it can sometimes be useful to debug only C and C++ +code. To disable all assembly code and intrinsics set `AOM_TARGET_CPU` to +generic at generation time: + +~~~ + $ cmake path/to/aom -DAOM_TARGET_CPU=generic +~~~ + +### Cross compiling {#cross-compiling} + +For the purposes of building the AV1 codec and applications and relative to the +scope of this guide, all builds for architectures differing from the native host +architecture will be considered cross compiles. The AV1 CMake build handles +cross compiling via the use of toolchain files included in the AV1 repository. +The toolchain files available at the time of this writing are: + + - arm64-ios.cmake + - arm64-linux-clang.cmake + - arm64-linux-gcc.cmake + - arm64-mingw-gcc.cmake + - armv7-ios.cmake + - armv7-linux-gcc.cmake + - armv7-mingw-gcc.cmake + - armv7s-ios.cmake + - ppc-linux-gcc.cmake + - riscv-linux-gcc.cmake + - x86-ios-simulator.cmake + - x86-linux.cmake + - x86-macos.cmake + - x86-mingw-gcc.cmake + - x86\_64-ios-simulator.cmake + - x86\_64-mingw-gcc.cmake + +The following example demonstrates use of the x86-macos.cmake toolchain file on +a x86\_64 MacOS host: + +~~~ + $ cmake path/to/aom \ + -DCMAKE_TOOLCHAIN_FILE=path/to/aom/build/cmake/toolchains/x86-macos.cmake + $ make +~~~ + +To build for an unlisted target creation of a new toolchain file is the best +solution. The existing toolchain files can be used a starting point for a new +toolchain file since each one exposes the basic requirements for toolchain files +as used in the AV1 codec build. + +As a temporary work around an unoptimized AV1 configuration that builds only C +and C++ sources can be produced using the following commands: + +~~~ + $ cmake path/to/aom -DAOM_TARGET_CPU=generic + $ make +~~~ + +In addition to the above it's important to note that the toolchain files +suffixed with gcc behave differently than the others. These toolchain files +attempt to obey the $CROSS environment variable. + +### Sanitizers {#sanitizers} + +Sanitizer integration is built-in to the CMake build system. To enable a +sanitizer, add `-DSANITIZE=` to the CMake command line. For example, to +enable address sanitizer: + +~~~ + $ cmake path/to/aom -DSANITIZE=address + $ make +~~~ + +Sanitizers available vary by platform, target, and compiler. Consult your +compiler documentation to determine which, if any, are available. + +### Microsoft Visual Studio builds {#microsoft-visual-studio-builds} + +Building the AV1 codec library in Microsoft Visual Studio is supported. Visual +Studio 2019 (16.0) or later is required. The following example demonstrates +generating projects and a solution for the Microsoft IDE: + +~~~ + # This does not require a bash shell; Command Prompt (cmd.exe) is fine. + # This assumes the build host is a Windows x64 computer. 
+ + # To create a Visual Studio 2022 solution for the x64 target: + $ cmake path/to/aom -G "Visual Studio 17 2022" + + # To create a Visual Studio 2022 solution for the 32-bit x86 target: + $ cmake path/to/aom -G "Visual Studio 17 2022" -A Win32 + + # To create a Visual Studio 2019 solution for the x64 target: + $ cmake path/to/aom -G "Visual Studio 16 2019" + + # To create a Visual Studio 2019 solution for the 32-bit x86 target: + $ cmake path/to/aom -G "Visual Studio 16 2019" -A Win32 + + # To build the solution: + $ cmake --build . +~~~ + +NOTE: The build system targets Windows 7 or later by compiling files with +`-D_WIN32_WINNT=0x0601`. + +### Xcode builds {#xcode-builds} + +Building the AV1 codec library in Xcode is supported. The following example +demonstrates generating an Xcode project: + +~~~ + $ cmake path/to/aom -G Xcode +~~~ + +### Emscripten builds {#emscripten-builds} + +Building the AV1 codec library with Emscripten is supported. Typically this is +used to hook into the AOMAnalyzer GUI application. These instructions focus on +using the inspector with AOMAnalyzer, but all tools can be built with +Emscripten. + +It is assumed here that you have already downloaded and installed the EMSDK, +installed and activated at least one toolchain, and setup your environment +appropriately using the emsdk\_env script. + +1. Build [AOM Analyzer](https://github.com/xiph/aomanalyzer). + +2. Configure the build: + +~~~ + $ cmake path/to/aom \ + -DENABLE_CCACHE=1 \ + -DAOM_TARGET_CPU=generic \ + -DENABLE_DOCS=0 \ + -DENABLE_TESTS=0 \ + -DCONFIG_ACCOUNTING=1 \ + -DCONFIG_INSPECTION=1 \ + -DCONFIG_MULTITHREAD=0 \ + -DCONFIG_RUNTIME_CPU_DETECT=0 \ + -DCONFIG_WEBM_IO=0 \ + -DCMAKE_TOOLCHAIN_FILE=path/to/emsdk-portable/.../Emscripten.cmake +~~~ + +3. Build it: run make if that's your generator of choice: + +~~~ + $ make inspect +~~~ + +4. Run the analyzer: + +~~~ + # inspect.js is in the examples sub directory of the directory in which you + # executed cmake. + $ path/to/AOMAnalyzer path/to/examples/inspect.js path/to/av1/input/file +~~~ + +### Extra build flags {#extra-build-flags} + +Three variables allow for passing of additional flags to the build system. + +- AOM\_EXTRA\_C\_FLAGS +- AOM\_EXTRA\_CXX\_FLAGS +- AOM\_EXTRA\_EXE\_LINKER\_FLAGS + +The build system attempts to ensure the flags passed through the above variables +are passed to tools last in order to allow for override of default behavior. +These flags can be used, for example, to enable asserts in a release build: + +~~~ + $ cmake path/to/aom \ + -DCMAKE_BUILD_TYPE=Release \ + -DAOM_EXTRA_C_FLAGS=-UNDEBUG \ + -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG +~~~ + +### Build with VMAF support {#build-with-vmaf} + +After installing +[libvmaf.a](https://github.com/Netflix/vmaf/tree/master/libvmaf), +you can use it with the encoder: + +~~~ + $ cmake path/to/aom -DCONFIG_TUNE_VMAF=1 +~~~ + +Please note that the default VMAF model +("/usr/local/share/model/vmaf_v0.6.1.json") +will be used unless you set the following flag when running the encoder: + +~~~ + # --vmaf-model-path=path/to/model +~~~ + +## Testing the AV1 codec {#testing-the-av1-codec} + +### Testing basics {#testing-basics} + +There are several methods of testing the AV1 codec. All of these methods require +the presence of the AV1 source code and a working build of the AV1 library and +applications. + +#### 1. 
Unit tests: {#unit-tests} + +The unit tests can be run at build time: + +~~~ + # Before running the make command the LIBAOM_TEST_DATA_PATH environment + # variable should be set to avoid downloading the test files to the + # cmake build configuration directory. + $ cmake path/to/aom + # Note: The AV1 CMake build creates many test targets. Running make + # with multiple jobs will speed up the test run significantly. + $ make runtests +~~~ + +#### 2. Example tests: {#example-tests} + +The example tests require a bash shell and can be run in the following manner: + +~~~ + # See the note above about LIBAOM_TEST_DATA_PATH above. + $ cmake path/to/aom + $ make + # It's best to build the testdata target using many make jobs. + # Running it like this will verify and download (if necessary) + # one at a time, which takes a while. + $ make testdata + $ path/to/aom/test/examples.sh --bin-path examples +~~~ + +#### 3. Encoder tests: {#encoder-tests} + +When making a change to the encoder run encoder tests to confirm that your +change has a positive or negligible impact on encode quality. When running these +tests the build configuration should be changed to enable internal encoder +statistics: + +~~~ + $ cmake path/to/aom -DCONFIG_INTERNAL_STATS=1 + $ make +~~~ + +The repository contains scripts intended to make running these tests as simple +as possible. The following example demonstrates creating a set of baseline clips +for comparison to results produced after making your change to libaom: + +~~~ + # This will encode all Y4M files in the current directory using the + # settings specified to create the encoder baseline statistical data: + $ cd path/to/test/inputs + # This command line assumes that run_encodes.sh, its helper script + # best_encode.sh, and the aomenc you intend to test are all within a + # directory in your PATH. + $ run_encodes.sh 200 500 50 baseline +~~~ + +After making your change and creating the baseline clips, you'll need to run +encodes that include your change(s) to confirm that things are working as +intended: + +~~~ + # This will encode all Y4M files in the current directory using the + # settings specified to create the statistical data for your change: + $ cd path/to/test/inputs + # This command line assumes that run_encodes.sh, its helper script + # best_encode.sh, and the aomenc you intend to test are all within a + # directory in your PATH. + $ run_encodes.sh 200 500 50 mytweak +~~~ + +After creating both data sets you can use `test/visual_metrics.py` to generate a +report that can be viewed in a web browser: + +~~~ + $ visual_metrics.py metrics_template.html "*stt" baseline mytweak \ + > mytweak.html +~~~ + +You can view the report by opening mytweak.html in a web browser. + + +### IDE hosted tests {#ide-hosted-tests} + +By default the generated projects files created by CMake will not include the +runtests and testdata rules when generating for IDEs like Microsoft Visual +Studio and Xcode. This is done to avoid intolerably long build cycles in the +IDEs-- IDE behavior is to build all targets when selecting the build project +options in MSVS and Xcode. To enable the test rules in IDEs the +`ENABLE_IDE_TEST_HOSTING` variable must be enabled at CMake generation time: + +~~~ + # This example uses Xcode. To get a list of the generators + # available, run cmake with the -G argument missing its + # value. 
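+  # For instance (illustrative), the following invocation fails with an
+  # error and prints the list of available generators, because -G is left
+  # without a value:
+  #   $ cmake path/to/aom -G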
+ $ cmake path/to/aom -DENABLE_IDE_TEST_HOSTING=1 -G Xcode +~~~ + +### Downloading the test data {#downloading-the-test-data} + +The fastest and easiest way to obtain the test data is to use CMake to generate +a build using the Unix Makefiles generator, and then to build only the testdata +rule. By default the test files will be downloaded to the current directory. The +`LIBAOM_TEST_DATA_PATH` environment variable can be used to set a +custom one. + +~~~ + $ cmake path/to/aom -G "Unix Makefiles" + # 28 is used because there are 28 test files as of this writing. + $ make -j28 testdata +~~~ + +The above make command will only download and verify the test data. + +### Adding a new test data file {#adding-a-new-test-data-file} + +First, add the new test data file to the `aom-test-data` bucket of the +`aomedia-testing` project on Google Cloud Platform. You may need to ask someone +with the necessary access permissions to do this for you. + +NOTE: When a new test data file is added to the `aom-test-data` bucket, its +"Public access" is initially "Not public". We need to change its +"Public access" to "Public" by using the following +[`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) command: +~~~ + $ gsutil acl ch -g all:R gs://aom-test-data/test-data-file-name +~~~ +This command grants the `AllUsers` group READ access to the file named +"test-data-file-name" in the `aom-test-data` bucket. + +Once the new test data file has been added to `aom-test-data`, create a CL to +add the name of the new test data file to `test/test_data_util.cmake` and add +the SHA1 checksum of the new test data file to `test/test-data.sha1`. (The SHA1 +checksum of a file can be calculated by running the `sha1sum` command on the +file.) + +### Additional test data {#additional-test-data} + +The test data mentioned above is strictly intended for unit testing. + +Additional input data for testing the encoder can be obtained from: +https://media.xiph.org/video/derf/ + +### Sharded testing {#sharded-testing} + +The AV1 codec library unit tests are built upon gtest which supports sharding of +test jobs. Sharded test runs can be achieved in a couple of ways. + +#### 1. Running test\_libaom directly: {#running-test_libaom-directly} + +~~~ + # Set the environment variable GTEST_TOTAL_SHARDS to control the number of + # shards. + $ export GTEST_TOTAL_SHARDS=10 + # (GTEST shard indexing is 0 based). + $ seq 0 $(( $GTEST_TOTAL_SHARDS - 1 )) \ + | xargs -n 1 -P 0 -I{} env GTEST_SHARD_INDEX={} ./test_libaom +~~~ + +To create a test shard for each CPU core available on the current system set +`GTEST_TOTAL_SHARDS` to the number of CPU cores on your system minus one. + +#### 2. Running the tests via the CMake build: {#running-the-tests-via-the-cmake-build} + +~~~ + # For IDE based builds, ENABLE_IDE_TEST_HOSTING must be enabled. See + # the IDE hosted tests section above for more information. If the IDE + # supports building targets concurrently tests will be sharded by default. + + # For make and ninja builds the -j parameter controls the number of shards + # at test run time. This example will run the tests using 10 shards via + # make. + $ make -j10 runtests +~~~ + +The maximum number of test targets that can run concurrently is determined by +the number of CPUs on the system where the build is configured as detected by +CMake. A system with 24 cores can run 24 test shards using a value of 24 with +the `-j` parameter. When CMake is unable to detect the number of cores 10 shards +is the default maximum value. 
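+
+When a sharded run reports a failure it can be useful to re-run just the
+affected shard, optionally narrowed with a gtest filter. The following is a
+minimal sketch using the standard gtest environment variables and the
+test_libaom binary described above; the shard index and filter pattern are
+only examples:
+
+~~~
+  # Re-run only shard 3 of a 10 shard split (GTEST shard indexing is 0 based).
+  $ GTEST_TOTAL_SHARDS=10 GTEST_SHARD_INDEX=3 ./test_libaom
+
+  # The same shard restricted to tests whose names match a gtest filter.
+  $ GTEST_TOTAL_SHARDS=10 GTEST_SHARD_INDEX=3 ./test_libaom --gtest_filter='AV1*'
+~~~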
+ +## Coding style {#coding-style} + +We are using the Google C Coding Style defined by the +[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). + +The coding style used by this project is enforced with clang-format using the +configuration contained in the +[.clang-format](https://chromium.googlesource.com/webm/aom/+/main/.clang-format) +file in the root of the repository. + +You can download clang-format using your system's package manager, or directly +from [llvm.org](http://llvm.org/releases/download.html). You can also view the +[documentation](https://clang.llvm.org/docs/ClangFormat.html) on llvm.org. +Output from clang-format varies by clang-format version, for best results your +version should match the one used on Jenkins. You can find the clang-format +version by reading the comment in the `.clang-format` file linked above. + +Before pushing changes for review you can format your code with: + +~~~ + # Apply clang-format to modified .c, .h and .cc files + $ clang-format -i --style=file \ + $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc') +~~~ + +Check the .clang-format file for the version used to generate it if there is any +difference between your local formatting and the review system. + +Some Git installations have clang-format integration. Here are some examples: + +~~~ + # Apply clang-format to all staged changes: + $ git clang-format + + # Clang format all staged and unstaged changes: + $ git clang-format -f + + # Clang format all staged and unstaged changes interactively: + $ git clang-format -f -p +~~~ + +## Submitting patches {#submitting-patches} + +We manage the submission of patches using the +[Gerrit](https://www.gerritcodereview.com/) code review tool. This tool +implements a workflow on top of the Git version control system to ensure that +all changes get peer reviewed and tested prior to their distribution. + +### Login cookie {#login-cookie} + +Browse to [AOMedia Git index](https://aomedia.googlesource.com/) and login with +your account (Gmail credentials, for example). Next, follow the +`Generate Password` Password link at the top of the page. You’ll be given +instructions for creating a cookie to use with our Git repos. + +You must also have a Gerrit account associated with your Google account. To do +this visit the [Gerrit review server](https://aomedia-review.googlesource.com) +and click "Sign in" (top right). + +### Contributor agreement {#contributor-agreement} + +You will be required to execute a +[contributor agreement](http://aomedia.org/license) to ensure that the AOMedia +Project has the right to distribute your changes. + +Note: If you are pushing changes on behalf of an Alliance for Open Media member +organization this step is not necessary. + +### Testing your code {#testing-your-code} + +The testing basics are covered in the [testing section](#testing-the-av1-codec) +above. + +In addition to the local tests, many more (e.g. asan, tsan, valgrind) will run +through Jenkins instances upon upload to gerrit. + +### Commit message hook {#commit-message-hook} + +Gerrit requires that each submission include a unique Change-Id. You can assign +one manually using git commit --amend, but it’s easier to automate it with the +commit-msg hook provided by Gerrit. + +Copy commit-msg to the `.git/hooks` directory of your local repo. 
Here's an +example: + +~~~ + $ curl -Lo aom/.git/hooks/commit-msg https://chromium-review.googlesource.com/tools/hooks/commit-msg + + # Next, ensure that the downloaded commit-msg script is executable: + $ chmod u+x aom/.git/hooks/commit-msg +~~~ + +See the Gerrit +[documentation](https://gerrit-review.googlesource.com/Documentation/user-changeid.html) +for more information. + +### Upload your change {#upload-your-change} + +The command line to upload your patch looks like this: + +~~~ + $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/main +~~~ + +### Incorporating reviewer comments {#incorporating-reviewer-comments} + +If you previously uploaded a change to Gerrit and the Approver has asked for +changes, follow these steps: + +1. Edit the files to make the changes the reviewer has requested. +2. Recommit your edits using the --amend flag, for example: + +~~~ + $ git commit -a --amend +~~~ + +3. Use the same git push command as above to upload to Gerrit again for another + review cycle. + +In general, you should not rebase your changes when doing updates in response to +review. Doing so can make it harder to follow the evolution of your change in +the diff view. + +### Submitting your change {#submitting-your-change} + +Once your change has been Approved and Verified, you can “submit” it through the +Gerrit UI. This will usually automatically rebase your change onto the branch +specified. + +Sometimes this can’t be done automatically. If you run into this problem, you +must rebase your changes manually: + +~~~ + $ git fetch + $ git rebase origin/branchname +~~~ + +If there are any conflicts, resolve them as you normally would with Git. When +you’re done, reupload your change. + +### Viewing the status of uploaded changes {#viewing-the-status-of-uploaded-changes} + +To check the status of a change that you uploaded, open +[Gerrit](https://aomedia-review.googlesource.com/), sign in, and click My > +Changes. + +## Support {#support} + +This library is an open source project supported by its community. Please +please email aomediacodec@jointdevelopment.kavi.com for help. + +## Bug reports {#bug-reports} + +Bug reports can be filed in the Alliance for Open Media +[issue tracker](https://bugs.chromium.org/p/aomedia/issues/list). diff --git a/third_party/aom/Sample.cfg b/third_party/aom/Sample.cfg new file mode 100644 index 0000000000..d5dbe66415 --- /dev/null +++ b/third_party/aom/Sample.cfg @@ -0,0 +1,35 @@ +#sample config file +super_block_size = 128 # super block size. 
0, 64 or 128 +max_partition_size = 128 # max partition size(8, 16, 32, 64, 128) +min_partition_size = 4 # min partition size(4, 8, 16, 32, 64) +disable_rect_partition_type = 0 # disable rectangle partition type +disable_ab_partition_type = 0 # disable AB partition type +disable_1to4_partition_type = 0 # disable 1 to 4 and 4 to 1 partition type +disable_intra_angle_delta = 0 # disable intra angle delta +disable_paeth_intra = 0 # disable paeth intra +disable_smooth_intra = 0 # disable intra smooth mode +disable_intra_edge_filter = 0 # disable intra edge filter +disable_filter_intra = 0 # disable filter intra +disable_intrabc = 0 # disable Intra Block Copy +disable_cfl = 0 # disable chroma from luma prediction +disable_palette = 0 # disable Palette +disable_flip_idtx = 0 # disable flip and identity transform +disable_tx_64x64 = 0 # disable 64x64 transform +reduced_tx_type_set = 0 # use reduced transform type set +reduced_reference_set = 0 # use reduced reference frame set +disable_obmc = 0 # disable OBMC +disable_warp_motion = 0 # disable Warped Motion +disable_global_motion = 0 # disable global motion +disable_ref_frame_mv = 0 # disable ref mv +disable_dual_filter = 0 # disable dual interpolation filter +disable_one_sided_comp = 0 # disable one sided compound mode +disable_masked_comp = 0 # disable masked compound prediction +disable_diff_wtd_comp = 0 # disable difference weighted compound mode +disable_inter_inter_wedge = 0 # disable inter/inter wedge comp +disable_dist_wtd_comp = 0 # disable distant weighted compound mode +disable_inter_intra_comp = 0 # disable inter/intra compound mode. +disable_inter_intra_wedge = 0 # disable inter/intra wedge comp +disable_smooth_inter_intra = 0 # disable smooth inter/intra +disable_cdef = 0 # disable CDEF filter +disable_lr = 0 # disable Loop Restoration Filter +disable_trellis_quant = 0 # disable trellis quantization \ No newline at end of file diff --git a/third_party/aom/aom/aom.h b/third_party/aom/aom/aom.h new file mode 100644 index 0000000000..0650a11f6b --- /dev/null +++ b/third_party/aom/aom/aom.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\defgroup aom AOM + * \ingroup codecs + * AOM is aom's newest video compression algorithm that uses motion + * compensated prediction, Discrete Cosine Transform (DCT) coding of the + * prediction error signal and context dependent entropy coding techniques + * based on arithmetic principles. It features: + * - YUV 4:2:0 image format + * - Macro-block based coding (16x16 luma plus two 8x8 chroma) + * - 1/4 (1/8) pixel accuracy motion compensated prediction + * - 4x4 DCT transform + * - 128 level linear quantizer + * - In loop deblocking filter + * - Context-based entropy coding + * + * @{ + */ +/*!\file + * \brief Provides controls common to both the AOM encoder and decoder. 
+ */ +#ifndef AOM_AOM_AOM_H_ +#define AOM_AOM_AOM_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Control functions + * + * The set of macros define the control functions of AOM interface + * The range for common control IDs is 230-255(max). + */ +enum aom_com_control_id { + /*!\brief Codec control function to get a pointer to a reference frame + * + * av1_ref_frame_t* parameter + */ + AV1_GET_REFERENCE = 230, + + /*!\brief Codec control function to write a frame into a reference buffer + * + * av1_ref_frame_t* parameter + */ + AV1_SET_REFERENCE = 231, + + /*!\brief Codec control function to get a copy of reference frame from the + * decoder + * + * av1_ref_frame_t* parameter + */ + AV1_COPY_REFERENCE = 232, + + /*!\brief Codec control function to get a pointer to the new frame + * + * aom_image_t* parameter + */ + AV1_GET_NEW_FRAME_IMAGE = 233, + + /*!\brief Codec control function to copy the new frame to an external buffer + * + * aom_image_t* parameter + */ + AV1_COPY_NEW_FRAME_IMAGE = 234, + + /*!\brief Start point of control IDs for aom_dec_control_id. + * Any new common control IDs should be added above. + */ + AOM_DECODER_CTRL_ID_START = 256 + // No common control IDs should be added after AOM_DECODER_CTRL_ID_START. +}; + +/*!\brief AV1 specific reference frame data struct + * + * Define the data struct to access av1 reference frames. + */ +typedef struct av1_ref_frame { + int idx; /**< frame index to get (input) */ + int use_external_ref; /**< Directly use external ref buffer(decoder only) */ + aom_image_t img; /**< img structure to populate (output) */ +} av1_ref_frame_t; + +/*!\cond */ +/*!\brief aom decoder control function parameter type + * + * Defines the data type for each of AOM decoder control function requires. + * + * \note For each control ID "X", a macro-define of + * AOM_CTRL_X is provided. It is used at compile time to determine + * if the control ID is supported by the libaom library available, + * when the libaom version cannot be controlled. + */ +AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *) +#define AOM_CTRL_AV1_GET_REFERENCE + +AOM_CTRL_USE_TYPE(AV1_SET_REFERENCE, av1_ref_frame_t *) +#define AOM_CTRL_AV1_SET_REFERENCE + +AOM_CTRL_USE_TYPE(AV1_COPY_REFERENCE, av1_ref_frame_t *) +#define AOM_CTRL_AV1_COPY_REFERENCE + +AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *) +#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE + +AOM_CTRL_USE_TYPE(AV1_COPY_NEW_FRAME_IMAGE, aom_image_t *) +#define AOM_CTRL_AV1_COPY_NEW_FRAME_IMAGE + +/*!\endcond */ +/*! @} - end defgroup aom */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_H_ diff --git a/third_party/aom/aom/aom_codec.h b/third_party/aom/aom/aom_codec.h new file mode 100644 index 0000000000..d5b8790a98 --- /dev/null +++ b/third_party/aom/aom/aom_codec.h @@ -0,0 +1,577 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +/////////////////////////////////////////////////////////////////////////////// +// Internal implementation details +/////////////////////////////////////////////////////////////////////////////// +// +// There are two levels of interfaces used to access the AOM codec: the +// aom_codec_iface and the aom_codec_ctx. +// +// 1. aom_codec_iface_t +// (Related files: aom/aom_codec.h, aom/src/aom_codec.c, +// aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c, +// av1/av1_dx_iface.c) +// +// Used to initialize the codec context, which contains the configuration for +// for modifying the encoder/decoder during run-time. See the other +// documentation in this header file for more details. For the most part, +// users will call helper functions, such as aom_codec_iface_name, +// aom_codec_get_caps, etc., to interact with it. +// +// The main purpose of the aom_codec_iface_t is to provide a way to generate +// a default codec config, find out what capabilities the implementation has, +// and create an aom_codec_ctx_t (which is actually used to interact with the +// codec). +// +// Note that the implementations for the AV1 algorithm are located in +// av1/av1_cx_iface.c and av1/av1_dx_iface.c +// +// +// 2. aom_codec_ctx_t +// (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c, +// aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c) +// +// The actual interface between user code and the codec. It stores the name +// of the codec, a pointer back to the aom_codec_iface_t that initialized it, +// initialization flags, a config for either encoder or the decoder, and a +// pointer to internal data. +// +// The codec is configured / queried through calls to aom_codec_control, +// which takes a control ID (listed in aomcx.h and aomdx.h) and a parameter. +// In the case of "getter" control IDs, the parameter is modified to have +// the requested value; in the case of "setter" control IDs, the codec's +// configuration is changed based on the parameter. Note that a aom_codec_err_t +// is returned, which indicates if the operation was successful or not. +// +// Note that for the encoder, the aom_codec_alg_priv_t points to the +// the aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder, +// the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored +// here and also used in the core algorithm. +// +// At the end, aom_codec_destroy should be called for each initialized +// aom_codec_ctx_t. + +/*!\defgroup codec Common Algorithm Interface + * This abstraction allows applications to easily support multiple video + * formats with minimal code duplication. This section describes the interface + * common to all codecs (both encoders and decoders). + * @{ + */ + +/*!\file + * \brief Describes the codec algorithm interface to applications. + * + * This file describes the interface between an application and a + * video codec algorithm. + * + * An application instantiates a specific codec instance by using + * aom_codec_dec_init() or aom_codec_enc_init() and a pointer to the + * algorithm's interface structure: + *
+ *     <pre>
+ *     my_app.c:
+ *       extern aom_codec_iface_t my_codec;
+ *       {
+ *           aom_codec_ctx_t algo;
+ *           int threads = 4;
+ *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
+ *       }
+ *     </pre>
+ * + * Once initialized, the instance is managed using other functions from + * the aom_codec_* family. + */ +#ifndef AOM_AOM_AOM_CODEC_H_ +#define AOM_AOM_AOM_CODEC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_image.h" +#include "aom/aom_integer.h" + +/*!\brief Decorator indicating a function is deprecated */ +#ifndef AOM_DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define AOM_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define AOM_DEPRECATED +#else +#define AOM_DEPRECATED +#endif +#endif /* AOM_DEPRECATED */ + +#ifndef AOM_DECLSPEC_DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */ +#elif defined(_MSC_VER) +/*!\brief \copydoc #AOM_DEPRECATED */ +#define AOM_DECLSPEC_DEPRECATED __declspec(deprecated) +#else +#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */ +#endif +#endif /* AOM_DECLSPEC_DEPRECATED */ + +/*!\brief Decorator indicating a function is potentially unused */ +#ifdef AOM_UNUSED +#elif defined(__GNUC__) || defined(__clang__) +#define AOM_UNUSED __attribute__((unused)) +#else +#define AOM_UNUSED +#endif + +/*!\brief Decorator indicating that given struct/union/enum is packed */ +#ifndef ATTRIBUTE_PACKED +#if defined(__GNUC__) && __GNUC__ +#define ATTRIBUTE_PACKED __attribute__((packed)) +#elif defined(_MSC_VER) +#define ATTRIBUTE_PACKED +#else +#define ATTRIBUTE_PACKED +#endif +#endif /* ATTRIBUTE_PACKED */ + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_CODEC_ABI_VERSION (7 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/ + +/*!\brief Algorithm return codes */ +typedef enum { + /*!\brief Operation completed without error */ + AOM_CODEC_OK, + + /*!\brief Unspecified error */ + AOM_CODEC_ERROR, + + /*!\brief Memory operation failed */ + AOM_CODEC_MEM_ERROR, + + /*!\brief ABI version mismatch */ + AOM_CODEC_ABI_MISMATCH, + + /*!\brief Algorithm does not have required capability */ + AOM_CODEC_INCAPABLE, + + /*!\brief The given bitstream is not supported. + * + * The bitstream was unable to be parsed at the highest level. The decoder + * is unable to proceed. This error \ref SHOULD be treated as fatal to the + * stream. */ + AOM_CODEC_UNSUP_BITSTREAM, + + /*!\brief Encoded bitstream uses an unsupported feature + * + * The decoder does not implement a feature required by the encoder. This + * return code should only be used for features that prevent future + * pictures from being properly decoded. This error \ref MAY be treated as + * fatal to the stream or \ref MAY be treated as fatal to the current GOP. + */ + AOM_CODEC_UNSUP_FEATURE, + + /*!\brief The coded data for this stream is corrupt or incomplete + * + * There was a problem decoding the current frame. This return code + * should only be used for failures that prevent future pictures from + * being properly decoded. This error \ref MAY be treated as fatal to the + * stream or \ref MAY be treated as fatal to the current GOP. If decoding + * is continued for the current GOP, artifacts may be present. + */ + AOM_CODEC_CORRUPT_FRAME, + + /*!\brief An application-supplied parameter is not valid. + * + */ + AOM_CODEC_INVALID_PARAM, + + /*!\brief An iterator reached the end of list. + * + */ + AOM_CODEC_LIST_END + +} aom_codec_err_t; + +/*! 
\brief Codec capabilities bitfield + * + * Each codec advertises the capabilities it supports as part of its + * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported. + * + * The available flags are specified by AOM_CODEC_CAP_* defines. + */ +typedef long aom_codec_caps_t; +#define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ +#define AOM_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by AOM_CODEC_USE_* defines. + */ +typedef long aom_codec_flags_t; + +/*!\brief Time Stamp Type + * + * An integer, which when multiplied by the stream's time base, provides + * the absolute time of a sample. + */ +typedef int64_t aom_codec_pts_t; + +/*!\brief Codec interface structure. + * + * Contains function pointers and other data private to the codec + * implementation. This structure is opaque to the application. Common + * functions used with this structure: + * - aom_codec_iface_name(aom_codec_iface_t *iface): get the + * name of the codec + * - aom_codec_get_caps(aom_codec_iface_t *iface): returns + * the capabilities of the codec + * - aom_codec_enc_config_default: generate the default config for + * initializing the encoder (see documentation in aom_encoder.h) + * - aom_codec_dec_init, aom_codec_enc_init: initialize the codec context + * structure (see documentation on aom_codec_ctx). + * + * To get access to the AV1 encoder and decoder, use aom_codec_av1_cx() and + * aom_codec_av1_dx(). + */ +typedef const struct aom_codec_iface aom_codec_iface_t; + +/*!\brief Codec private data structure. + * + * Contains data private to the codec implementation. This structure is opaque + * to the application. + */ +typedef struct aom_codec_priv aom_codec_priv_t; + +/*!\brief Compressed Frame Flags + * + * This type represents a bitfield containing information about a compressed + * frame that may be useful to an application. The most significant 16 bits + * can be used by an algorithm to provide additional detail, for example to + * support frame types that are codec specific (MPEG-1 D-frames for example) + */ +typedef uint32_t aom_codec_frame_flags_t; +#define AOM_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */ +/*!\brief frame can be dropped without affecting the stream (no future frame + * depends on this one) */ +#define AOM_FRAME_IS_DROPPABLE 0x2u +/*!\brief this is an INTRA_ONLY frame */ +#define AOM_FRAME_IS_INTRAONLY 0x10u +/*!\brief this is an S-frame */ +#define AOM_FRAME_IS_SWITCH 0x20u +/*!\brief this is an error-resilient frame */ +#define AOM_FRAME_IS_ERROR_RESILIENT 0x40u +/*!\brief this is a key-frame dependent recovery-point frame */ +#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80u + +/*!\brief Iterator + * + * Opaque storage used for iterating over lists. + */ +typedef const void *aom_codec_iter_t; + +/*!\brief Codec context structure + * + * All codecs \ref MUST support this context structure fully. In general, + * this data should be considered private to the codec algorithm, and + * not be manipulated or examined by the calling application. Applications + * may reference the 'name' member to get a printable description of the + * algorithm. 
+ */ +typedef struct aom_codec_ctx { + const char *name; /**< Printable interface name */ + aom_codec_iface_t *iface; /**< Interface pointers */ + aom_codec_err_t err; /**< Last returned error */ + const char *err_detail; /**< Detailed info, if available */ + aom_codec_flags_t init_flags; /**< Flags passed at init time */ + union { + /**< Decoder Configuration Pointer */ + const struct aom_codec_dec_cfg *dec; + /**< Encoder Configuration Pointer */ + const struct aom_codec_enc_cfg *enc; + const void *raw; + } config; /**< Configuration pointer aliasing union */ + aom_codec_priv_t *priv; /**< Algorithm private storage */ +} aom_codec_ctx_t; + +/*!\brief Bit depth for codec + * * + * This enumeration determines the bit depth of the codec. + */ +typedef enum aom_bit_depth { + AOM_BITS_8 = 8, /**< 8 bits */ + AOM_BITS_10 = 10, /**< 10 bits */ + AOM_BITS_12 = 12, /**< 12 bits */ +} aom_bit_depth_t; + +/*!\brief Superblock size selection. + * + * Defines the superblock size used for encoding. The superblock size can + * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically + * selected by the encoder for each frame. + */ +typedef enum aom_superblock_size { + AOM_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. */ + AOM_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */ + AOM_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */ +} aom_superblock_size_t; + +/* + * Library Version Number Interface + * + * For example, see the following sample return values: + * aom_codec_version() (1<<16 | 2<<8 | 3) + * aom_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" + * aom_codec_version_extra_str() "rc1-16-gec6a1ba" + */ + +/*!\brief Return the version information (as an integer) + * + * Returns a packed encoding of the library version number. This will only + * include the major.minor.patch component of the version number. Note that this + * encoded value should be accessed through the macros provided, as the encoding + * may change in the future. + * + */ +int aom_codec_version(void); + +/*!\brief Return the major version number */ +#define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff) + +/*!\brief Return the minor version number */ +#define aom_codec_version_minor() ((aom_codec_version() >> 8) & 0xff) + +/*!\brief Return the patch version number */ +#define aom_codec_version_patch() ((aom_codec_version() >> 0) & 0xff) + +/*!\brief Return the version information (as a string) + * + * Returns a printable string containing the full library version number. This + * may contain additional text following the three digit version number, as to + * indicate release candidates, pre-release versions, etc. + * + */ +const char *aom_codec_version_str(void); + +/*!\brief Return the version information (as a string) + * + * Returns a printable "extra string". This is the component of the string + * returned by aom_codec_version_str() following the three digit version number. + * + */ +const char *aom_codec_version_extra_str(void); + +/*!\brief Return the build configuration + * + * Returns a printable string containing an encoded version of the build + * configuration. This may be useful to aom support. + * + */ +const char *aom_codec_build_config(void); + +/*!\brief Return the name for a given interface + * + * Returns a human readable string for name of the given codec interface. 
+ * + * \param[in] iface Interface pointer + * + */ +const char *aom_codec_iface_name(aom_codec_iface_t *iface); + +/*!\brief Convert error number to printable string + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] err Error number. + * + */ +const char *aom_codec_err_to_string(aom_codec_err_t err); + +/*!\brief Retrieve error synopsis for codec context + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] ctx Pointer to this instance's context. + * + */ +const char *aom_codec_error(const aom_codec_ctx_t *ctx); + +/*!\brief Retrieve detailed error information for codec context + * + * Returns a human readable string providing detailed information about + * the last error. The returned string is only valid until the next + * aom_codec_* function call (except aom_codec_error and + * aom_codec_error_detail) on the codec context. + * + * \param[in] ctx Pointer to this instance's context. + * + * \retval NULL + * No detailed information is available. + */ +const char *aom_codec_error_detail(const aom_codec_ctx_t *ctx); + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all codecs. + * They represent the base case functionality expected of all codecs. + */ + +/*!\brief Destroy a codec instance + * + * Destroys a codec context, freeing any associated memory buffers. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval #AOM_CODEC_OK + * The codec instance has been destroyed. + * \retval #AOM_CODEC_INVALID_PARAM + * ctx is a null pointer. + * \retval #AOM_CODEC_ERROR + * Codec context not initialized. + */ +aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx); + +/*!\brief Get the capabilities of an algorithm. + * + * Retrieves the capabilities bitfield from the algorithm's interface. + * + * \param[in] iface Pointer to the algorithm interface + * + */ +aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface); + +/*!\name Codec Control + * + * The aom_codec_control function exchanges algorithm specific data with the + * codec instance. Additionally, the macro AOM_CODEC_CONTROL_TYPECHECKED is + * provided, which will type-check the parameter against the control ID before + * calling aom_codec_control - note that this macro requires the control ID + * to be directly encoded in it, e.g., + * AOM_CODEC_CONTROL_TYPECHECKED(&ctx, AOME_SET_CPUUSED, 8). + * + * The codec control IDs can be found in aom.h, aomcx.h, and aomdx.h + * (defined as aom_com_control_id, aome_enc_control_id, and aom_dec_control_id). + * @{ + */ +/*!\brief Algorithm Control + * + * aom_codec_control takes a context, a control ID, and a third parameter + * (with varying type). If the context is non-null and an error occurs, + * ctx->err will be set to the same value as the return value. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] ctrl_id Algorithm specific control identifier. + * Must be nonzero. + * + * \retval #AOM_CODEC_OK + * The control request was processed. + * \retval #AOM_CODEC_ERROR + * The control request was not processed. + * \retval #AOM_CODEC_INVALID_PARAM + * The control ID was zero, or the data was not valid. 
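+ *
+ * A minimal usage sketch (illustrative; assumes ctx is an initialized
+ * encoder context):
+ *   aom_codec_err_t res = aom_codec_control(&ctx, AOME_SET_CPUUSED, 4);
+ *   if (res != AOM_CODEC_OK) {
+ *     // see aom_codec_error_detail(&ctx) for more information
+ *   }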
+ */ +aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...); + +/*!\brief Key & Value API + * + * aom_codec_set_option() takes a context, a key (option name) and a value. If + * the context is non-null and an error occurs, ctx->err will be set to the same + * value as the return value. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] name The name of the option (key) + * \param[in] value The value of the option + * + * \retval #AOM_CODEC_OK + * The value of the option was set. + * \retval #AOM_CODEC_INVALID_PARAM + * The data was not valid. + * \retval #AOM_CODEC_ERROR + * The option was not successfully set. + */ +aom_codec_err_t aom_codec_set_option(aom_codec_ctx_t *ctx, const char *name, + const char *value); + +/*!\brief aom_codec_control wrapper macro (adds type-checking, less flexible) + * + * This macro allows for type safe conversions across the variadic parameter + * to aom_codec_control(). However, it requires the explicit control ID + * be passed in (it cannot be passed in via a variable) -- otherwise a compiler + * error will occur. After the type checking, it calls aom_codec_control. + */ +#define AOM_CODEC_CONTROL_TYPECHECKED(ctx, id, data) \ + aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/ + +/*!\brief Creates type checking mechanisms for aom_codec_control + * + * It defines a static function with the correctly typed arguments as a wrapper + * to the type-unsafe aom_codec_control function. It also creates a typedef + * for each type. + */ +#define AOM_CTRL_USE_TYPE(id, typ) \ + static aom_codec_err_t aom_codec_control_typechecked_##id( \ + aom_codec_ctx_t *, int, typ) AOM_UNUSED; \ + static aom_codec_err_t aom_codec_control_typechecked_##id( \ + aom_codec_ctx_t *ctx, int ctrl, typ data) { \ + return aom_codec_control(ctx, ctrl, data); \ + } /**<\hideinitializer*/ \ + typedef typ aom_codec_control_type_##id; +/*!@} end Codec Control group */ + +/*!\brief OBU types. */ +typedef enum ATTRIBUTE_PACKED { + OBU_SEQUENCE_HEADER = 1, + OBU_TEMPORAL_DELIMITER = 2, + OBU_FRAME_HEADER = 3, + OBU_TILE_GROUP = 4, + OBU_METADATA = 5, + OBU_FRAME = 6, + OBU_REDUNDANT_FRAME_HEADER = 7, + OBU_TILE_LIST = 8, + OBU_PADDING = 15, +} OBU_TYPE; + +/*!\brief OBU metadata types. */ +typedef enum { + OBU_METADATA_TYPE_AOM_RESERVED_0 = 0, + OBU_METADATA_TYPE_HDR_CLL = 1, + OBU_METADATA_TYPE_HDR_MDCV = 2, + OBU_METADATA_TYPE_SCALABILITY = 3, + OBU_METADATA_TYPE_ITUT_T35 = 4, + OBU_METADATA_TYPE_TIMECODE = 5, +} OBU_METADATA_TYPE; + +/*!\brief Returns string representation of OBU_TYPE. + * + * \param[in] type The OBU_TYPE to convert to string. + */ +const char *aom_obu_type_to_string(OBU_TYPE type); + +/*!@} - end defgroup codec*/ +#ifdef __cplusplus +} +#endif +#endif // AOM_AOM_AOM_CODEC_H_ diff --git a/third_party/aom/aom/aom_decoder.h b/third_party/aom/aom/aom_decoder.h new file mode 100644 index 0000000000..229cf7358f --- /dev/null +++ b/third_party/aom/aom/aom_decoder.h @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AOM_AOM_DECODER_H_ +#define AOM_AOM_AOM_DECODER_H_ + +/*!\defgroup decoder Decoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this decoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all decoders. + * @{ + */ + +/*!\file + * \brief Describes the decoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video decoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_codec.h" // IWYU pragma: export +#include "aom/aom_frame_buffer.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_DECODER_ABI_VERSION \ + (6 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/ + +/*! \brief Decoder capabilities bitfield + * + * Each decoder advertises the capabilities it supports as part of its + * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported by a decoder. + * + * The available flags are specified by AOM_CODEC_CAP_* defines. + */ +/*!brief Can support external frame buffers */ +#define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x200000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by AOM_CODEC_USE_* defines. + */ + +/*!\brief Stream properties + * + * This structure is used to query or set properties of the decoded + * stream. + */ +typedef struct aom_codec_stream_info { + unsigned int w; /**< Width (or 0 for unknown/default) */ + unsigned int h; /**< Height (or 0 for unknown/default) */ + unsigned int is_kf; /**< Current frame is a keyframe */ + unsigned int number_spatial_layers; /**< Number of spatial layers */ + unsigned int number_temporal_layers; /**< Number of temporal layers */ + unsigned int is_annexb; /**< Is Bitstream in Annex-B format */ +} aom_codec_stream_info_t; + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all decoders. + * They represent the base case functionality expected of all decoders. + */ + +/*!\brief Initialization Configurations + * + * This structure is used to pass init time configuration options to the + * decoder. + */ +typedef struct aom_codec_dec_cfg { + unsigned int threads; /**< Maximum number of threads to use, default 1 */ + unsigned int w; /**< Width */ + unsigned int h; /**< Height */ + unsigned int allow_lowbitdepth; /**< Allow use of low-bitdepth coding path */ +} aom_codec_dec_cfg_t; /**< alias for struct aom_codec_dec_cfg */ + +/*!\brief Initialize a decoder instance + * + * Initializes a decoder context using the given interface. Applications + * should call the aom_codec_dec_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with cmake -DCONFIG_MULTITHREAD=0, this + * call is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * \param[in] ctx Pointer to this instance's context. 
+ * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of AOM_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * AOM_DECODER_ABI_VERSION + * \retval #AOM_CODEC_OK + * The decoder algorithm has been initialized. + * \retval #AOM_CODEC_MEM_ERROR + * Memory allocation failed. + */ +aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx, + aom_codec_iface_t *iface, + const aom_codec_dec_cfg_t *cfg, + aom_codec_flags_t flags, int ver); + +/*!\brief Convenience macro for aom_codec_dec_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define aom_codec_dec_init(ctx, iface, cfg, flags) \ + aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION) + +/*!\brief Parse stream info from a buffer + * + * Performs high level parsing of the bitstream. Construction of a decoder + * context is not necessary. Can be used to determine if the bitstream is + * of the proper format, and to extract information from the stream. + * + * \param[in] iface Pointer to the algorithm interface + * \param[in] data Pointer to a block of data to parse + * \param[in] data_sz Size of the data buffer + * \param[in,out] si Pointer to stream info to update. The is_annexb + * member \ref MUST be properly initialized. This + * function sets the rest of the members. + * + * \retval #AOM_CODEC_OK + * Bitstream is parsable and stream information updated. + * \retval #AOM_CODEC_INVALID_PARAM + * One of the arguments is invalid, for example a NULL pointer. + * \retval #AOM_CODEC_UNSUP_BITSTREAM + * The decoder didn't recognize the coded data, or the + * buffer was too short. + */ +aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface, + const uint8_t *data, size_t data_sz, + aom_codec_stream_info_t *si); + +/*!\brief Return information about the current stream. + * + * Returns information about the stream that has been parsed during decoding. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] si Pointer to stream info to update. + * + * \retval #AOM_CODEC_OK + * Bitstream is parsable and stream information updated. + * \retval #AOM_CODEC_INVALID_PARAM + * One of the arguments is invalid, for example a NULL pointer. + * \retval #AOM_CODEC_UNSUP_BITSTREAM + * The decoder couldn't parse the submitted data. + */ +aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx, + aom_codec_stream_info_t *si); + +/*!\brief Decode data + * + * Processes a buffer of coded data. Encoded data \ref MUST be passed in DTS + * (decode time stamp) order. Frames produced will always be in PTS + * (presentation time stamp) order. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] data Pointer to this block of new coded data. + * \param[in] data_sz Size of the coded data, in bytes. + * \param[in] user_priv Application specific data to associate with + * this frame. + * + * \return Returns #AOM_CODEC_OK if the coded data was processed completely + * and future pictures can be decoded without error. Otherwise, + * see the descriptions of the other error codes in ::aom_codec_err_t + * for recoverability capabilities. + */ +aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data, + size_t data_sz, void *user_priv); + +/*!\brief Decoded frames iterator + * + * Iterates over a list of the frames available for display. The iterator + * storage should be initialized to NULL to start the iteration. 
Iteration is + * complete when this function returns NULL. + * + * The list of available frames becomes valid upon completion of the + * aom_codec_decode call, and remains valid until the next call to + * aom_codec_decode. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. + */ +aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter); + +/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions + * + * The following function is required to be implemented for all decoders + * that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. + * Calling this function for codecs that don't advertise this capability + * will result in an error code being returned, usually AOM_CODEC_INCAPABLE. + * @{ + */ + +/*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libaom needs a frame buffer + * to decode the current frame and a function to be called when libaom does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libaom will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #AOM_CODEC_OK + * External frame buffers will be used by libaom. + * \retval #AOM_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #AOM_CODEC_ERROR + * Decoder context not initialized. + * \retval #AOM_CODEC_INCAPABLE + * Algorithm not capable of using external frame buffers. + * + * \note + * When decoding AV1, the application may be required to pass in at least + * #AOM_MAXIMUM_WORK_BUFFERS external frame buffers. + */ +aom_codec_err_t aom_codec_set_frame_buffer_functions( + aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, + aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); + +/*!@} - end defgroup cap_external_frame_buffer */ + +/*!@} - end defgroup decoder*/ +#ifdef __cplusplus +} +#endif +#endif // AOM_AOM_AOM_DECODER_H_ diff --git a/third_party/aom/aom/aom_encoder.h b/third_party/aom/aom/aom_encoder.h new file mode 100644 index 0000000000..6a6254dafe --- /dev/null +++ b/third_party/aom/aom/aom_encoder.h @@ -0,0 +1,1144 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOM_ENCODER_H_ +#define AOM_AOM_AOM_ENCODER_H_ + +/*!\defgroup encoder Encoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this encoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all encoders. 
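+ *
+ * A typical application initializes an encoder context, submits raw images
+ * with aom_codec_encode(), drains output with aom_codec_get_cx_data(), and
+ * flushes at end of stream by encoding a NULL image until no more packets
+ * are returned. A per-frame sketch (context setup, timestamps and error
+ * handling omitted; read_frame() is a hypothetical input helper):
+ * \code
+ *   aom_image_t *img = read_frame();
+ *   aom_codec_encode(&ctx, img, pts, duration, 0);
+ *   aom_codec_iter_t iter = NULL;
+ *   const aom_codec_cx_pkt_t *pkt;
+ *   while ((pkt = aom_codec_get_cx_data(&ctx, &iter)) != NULL) {
+ *     // forward AOM_CODEC_CX_FRAME_PKT packets to the muxer
+ *   }
+ * \endcode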
+ * @{ + */ + +/*!\file + * \brief Describes the encoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video encoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_codec.h" // IWYU pragma: export +#include "aom/aom_external_partition.h" + +/*!\brief Current ABI version number + * + * \hideinitializer + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + * + * Note: In the definition of AOM_ENCODER_ABI_VERSION, 3 is the value of + * AOM_EXT_PART_ABI_VERSION in libaom v3.2.0. The old value of + * AOM_EXT_PART_ABI_VERSION is used so as to not break the ABI version check in + * aom_codec_enc_init_ver() when an application compiled against libaom v3.2.0 + * passes the old value of AOM_ENCODER_ABI_VERSION to aom_codec_enc_init_ver(). + * The external partition API is still experimental. When it is declared stable, + * we will replace 3 with AOM_EXT_PART_ABI_VERSION in the definition of + * AOM_ENCODER_ABI_VERSION. + */ +#define AOM_ENCODER_ABI_VERSION \ + (10 + AOM_CODEC_ABI_VERSION + /*AOM_EXT_PART_ABI_VERSION=*/3) + +/*! \brief Encoder capabilities bitfield + * + * Each encoder advertises the capabilities it supports as part of its + * ::aom_codec_iface_t interface structure. Capabilities are extra + * interfaces or functionality, and are not required to be supported + * by an encoder. + * + * The available flags are specified by AOM_CODEC_CAP_* defines. + */ +#define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ + +/*! Can support input images at greater than 8 bitdepth. + */ +#define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow + * for proper memory allocation. + * + * The available flags are specified by AOM_CODEC_USE_* defines. + */ +#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ +#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ + +/*!\brief Generic fixed size buffer structure + * + * This structure is able to hold a reference to any fixed size buffer. + */ +typedef struct aom_fixed_buf { + void *buf; /**< Pointer to the data. Does NOT own the data! */ + size_t sz; /**< Length of the buffer, in chars */ +} aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */ + +/*!\brief Error Resilient flags + * + * These flags define which error resilient features to enable in the + * encoder. The flags are specified through the + * aom_codec_enc_cfg::g_error_resilient variable. + */ +typedef uint32_t aom_codec_er_flags_t; +/*!\brief Improve resiliency against losses of whole frames */ +#define AOM_ERROR_RESILIENT_DEFAULT 0x1 + +/*!\brief Encoder output packet variants + * + * This enumeration lists the different kinds of data packets that can be + * returned by calls to aom_codec_get_cx_data(). Algorithms \ref MAY + * extend this list to provide additional functionality. 
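+ *
+ * For example, a muxing application typically writes AOM_CODEC_CX_FRAME_PKT
+ * payloads to its container, and a first-pass run appends AOM_CODEC_STATS_PKT
+ * payloads for later use as rc_twopass_stats_in. A dispatch sketch
+ * (mux_frame() and append_stats() are hypothetical helpers):
+ * \code
+ *   switch (pkt->kind) {
+ *     case AOM_CODEC_CX_FRAME_PKT:
+ *       mux_frame(pkt->data.frame.buf, pkt->data.frame.sz);
+ *       break;
+ *     case AOM_CODEC_STATS_PKT:
+ *       append_stats(pkt->data.twopass_stats.buf, pkt->data.twopass_stats.sz);
+ *       break;
+ *     default:
+ *       break;
+ *   }
+ * \endcode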
+ */ +enum aom_codec_cx_pkt_kind { + AOM_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + AOM_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + AOM_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + AOM_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ + AOM_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ +}; + +/*!\brief Encoder output packet + * + * This structure contains the different kinds of output data the encoder + * may produce while compressing a frame. + */ +typedef struct aom_codec_cx_pkt { + enum aom_codec_cx_pkt_kind kind; /**< packet variant */ + union { + struct { + void *buf; /**< compressed data buffer */ + size_t sz; /**< length of compressed data */ + /*!\brief time stamp to show frame (in timebase units) */ + aom_codec_pts_t pts; + /*!\brief duration to show frame (in timebase units) */ + unsigned long duration; + aom_codec_frame_flags_t flags; /**< flags for this frame */ + /*!\brief the partition id defines the decoding order of the partitions. + * Only applicable when "output partition" mode is enabled. First + * partition has id 0.*/ + int partition_id; + /*!\brief size of the visible frame in this packet */ + size_t vis_frame_size; + } frame; /**< data for compressed frame packet */ + aom_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ + struct aom_psnr_pkt { + unsigned int samples[4]; /**< Number of samples, total/y/u/v */ + uint64_t sse[4]; /**< sum squared error, total/y/u/v */ + double psnr[4]; /**< PSNR, total/y/u/v */ + /*!\brief Number of samples, total/y/u/v when + * input bit-depth < stream bit-depth.*/ + unsigned int samples_hbd[4]; + /*!\brief sum squared error, total/y/u/v when + * input bit-depth < stream bit-depth.*/ + uint64_t sse_hbd[4]; + /*!\brief PSNR, total/y/u/v when + * input bit-depth < stream bit-depth.*/ + double psnr_hbd[4]; + } psnr; /**< data for PSNR packet */ + aom_fixed_buf_t raw; /**< data for arbitrary packets */ + } data; /**< packet data */ +} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */ + +/*!\brief Rational Number + * + * This structure holds a fractional value. + */ +typedef struct aom_rational { + int num; /**< fraction numerator */ + int den; /**< fraction denominator */ +} aom_rational_t; /**< alias for struct aom_rational */ + +/*!\brief Multi-pass Encoding Pass + * + * AOM_RC_LAST_PASS is kept for backward compatibility. + * If passes is not given and pass==2, the codec will assume passes=2. + * For new code, it is recommended to use AOM_RC_SECOND_PASS and set + * the "passes" member to 2 via the key & val API for two-pass encoding. + */ +enum aom_enc_pass { + AOM_RC_ONE_PASS = 0, /**< Single pass mode */ + AOM_RC_FIRST_PASS = 1, /**< First pass of multi-pass mode */ + AOM_RC_SECOND_PASS = 2, /**< Second pass of multi-pass mode */ + AOM_RC_THIRD_PASS = 3, /**< Third pass of multi-pass mode */ + AOM_RC_LAST_PASS = 2, /**< Final pass of two-pass mode */ +}; + +/*!\brief Rate control mode */ +enum aom_rc_mode { + AOM_VBR, /**< Variable Bit Rate (VBR) mode */ + AOM_CBR, /**< Constant Bit Rate (CBR) mode */ + AOM_CQ, /**< Constrained Quality (CQ) mode */ + AOM_Q, /**< Constant Quality (Q) mode */ +}; + +/*!\brief Keyframe placement mode. + * + * This enumeration determines whether keyframes are placed automatically by + * the encoder or whether this behavior is disabled. Older releases of this + * SDK were implemented such that AOM_KF_FIXED meant keyframes were disabled. 
+ * This name is confusing for this behavior, so the new symbols to be used + * are AOM_KF_AUTO and AOM_KF_DISABLED. + */ +enum aom_kf_mode { + AOM_KF_FIXED, /**< deprecated, implies AOM_KF_DISABLED */ + AOM_KF_AUTO, /**< Encoder determines optimal placement automatically */ + AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ +}; + +/*!\brief Frame super-resolution mode. */ +typedef enum { + /**< Frame super-resolution is disabled for all frames. */ + AOM_SUPERRES_NONE, + /**< All frames are coded at the specified scale and super-resolved. */ + AOM_SUPERRES_FIXED, + /**< All frames are coded at a random scale and super-resolved. */ + AOM_SUPERRES_RANDOM, + /**< Super-resolution scale for each frame is determined based on the q index + of that frame. */ + AOM_SUPERRES_QTHRESH, + /**< Full-resolution or super-resolution and the scale (in case of + super-resolution) are automatically selected for each frame. */ + AOM_SUPERRES_AUTO, +} aom_superres_mode; + +/*!\brief Encoder Config Options + * + * This type allows to enumerate and control flags defined for encoder control + * via config file at runtime. + */ +typedef struct cfg_options { + /*!\brief Indicate init by cfg file + * 0 or 1 + */ + unsigned int init_by_cfg_file; + /*!\brief Superblock size + * 0, 64 or 128 + */ + unsigned int super_block_size; + /*!\brief max partition size + * 8, 16, 32, 64, 128 + */ + unsigned int max_partition_size; + /*!\brief min partition size + * 8, 16, 32, 64, 128 + */ + unsigned int min_partition_size; + /*!\brief disable AB Shape partition type + * + */ + unsigned int disable_ab_partition_type; + /*!\brief disable rectangular partition type + * + */ + unsigned int disable_rect_partition_type; + /*!\brief disable 1:4/4:1 partition type + * + */ + unsigned int disable_1to4_partition_type; + /*!\brief disable flip and identity transform type + * + */ + unsigned int disable_flip_idtx; + /*!\brief disable CDEF filter + * + */ + unsigned int disable_cdef; + /*!\brief disable Loop Restoration Filter + * + */ + unsigned int disable_lr; + /*!\brief disable OBMC + * + */ + unsigned int disable_obmc; + /*!\brief disable Warped Motion + * + */ + unsigned int disable_warp_motion; + /*!\brief disable global motion + * + */ + unsigned int disable_global_motion; + /*!\brief disable dist weighted compound + * + */ + unsigned int disable_dist_wtd_comp; + /*!\brief disable diff weighted compound + * + */ + unsigned int disable_diff_wtd_comp; + /*!\brief disable inter/intra compound + * + */ + unsigned int disable_inter_intra_comp; + /*!\brief disable masked compound + * + */ + unsigned int disable_masked_comp; + /*!\brief disable one sided compound + * + */ + unsigned int disable_one_sided_comp; + /*!\brief disable Palette + * + */ + unsigned int disable_palette; + /*!\brief disable Intra Block Copy + * + */ + unsigned int disable_intrabc; + /*!\brief disable chroma from luma + * + */ + unsigned int disable_cfl; + /*!\brief disable intra smooth mode + * + */ + unsigned int disable_smooth_intra; + /*!\brief disable filter intra + * + */ + unsigned int disable_filter_intra; + /*!\brief disable dual filter + * + */ + unsigned int disable_dual_filter; + /*!\brief disable intra angle delta + * + */ + unsigned int disable_intra_angle_delta; + /*!\brief disable intra edge filter + * + */ + unsigned int disable_intra_edge_filter; + /*!\brief disable 64x64 transform + * + */ + unsigned int disable_tx_64x64; + /*!\brief disable smooth inter/intra + * + */ + unsigned int disable_smooth_inter_intra; + /*!\brief disable 
inter/inter wedge comp + * + */ + unsigned int disable_inter_inter_wedge; + /*!\brief disable inter/intra wedge comp + * + */ + unsigned int disable_inter_intra_wedge; + /*!\brief disable paeth intra + * + */ + unsigned int disable_paeth_intra; + /*!\brief disable trellis quantization + * + */ + unsigned int disable_trellis_quant; + /*!\brief disable ref frame MV + * + */ + unsigned int disable_ref_frame_mv; + /*!\brief use reduced reference frame set + * + */ + unsigned int reduced_reference_set; + /*!\brief use reduced transform type set + * + */ + unsigned int reduced_tx_type_set; +} cfg_options_t; + +/*!\brief Encoded Frame Flags + * + * This type indicates a bitfield to be passed to aom_codec_encode(), defining + * per-frame boolean values. By convention, bits common to all codecs will be + * named AOM_EFLAG_*, and bits specific to an algorithm will be named + * /algo/_eflag_*. The lower order 16 bits are reserved for common use. + */ +typedef long aom_enc_frame_flags_t; +/*!\brief Force this frame to be a keyframe */ +#define AOM_EFLAG_FORCE_KF (1 << 0) + +/*!\brief Encoder configuration structure + * + * This structure contains the encoder settings that have common representations + * across all codecs. This doesn't imply that all codecs support all features, + * however. + */ +typedef struct aom_codec_enc_cfg { + /* + * generic settings (g) + */ + + /*!\brief Algorithm specific "usage" value + * + * Algorithms may define multiple values for usage, which may convey the + * intent of how the application intends to use the stream. If this value + * is non-zero, consult the documentation for the codec to determine its + * meaning. + */ + unsigned int g_usage; + + /*!\brief Maximum number of threads to use + * + * For multi-threaded implementations, use no more than this number of + * threads. The codec may use fewer threads than allowed. The value + * 0 is equivalent to the value 1. + */ + unsigned int g_threads; + + /*!\brief Bitstream profile to use + * + * Some codecs support a notion of multiple bitstream profiles. Typically + * this maps to a set of features that are turned on or off. Often the + * profile to use is determined by the features of the intended decoder. + * Consult the documentation for the codec to determine the valid values + * for this parameter, or set to zero for a sane default. + */ + unsigned int g_profile; /**< profile of bitstream to use */ + + /*!\brief Width of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_w; + + /*!\brief Height of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_h; + + /*!\brief Max number of frames to encode + * + * If force video mode is off (the default) and g_limit is 1, the encoder + * will encode a still picture (still_picture is set to 1 in the sequence + * header OBU). If in addition full_still_picture_hdr is 0 (the default), + * the encoder will use a reduced header (reduced_still_picture_header is + * set to 1 in the sequence header OBU) for the still picture. 
+ */ + unsigned int g_limit; + + /*!\brief Forced maximum width of the frame + * + * If this value is non-zero then it is used to force the maximum frame + * width written in write_sequence_header(). + */ + unsigned int g_forced_max_frame_width; + + /*!\brief Forced maximum height of the frame + * + * If this value is non-zero then it is used to force the maximum frame + * height written in write_sequence_header(). + */ + unsigned int g_forced_max_frame_height; + + /*!\brief Bit-depth of the codec + * + * This value identifies the bit_depth of the codec, + * Only certain bit-depths are supported as identified in the + * aom_bit_depth_t enum. + */ + aom_bit_depth_t g_bit_depth; + + /*!\brief Bit-depth of the input frames + * + * This value identifies the bit_depth of the input frames in bits. + * Note that the frames passed as input to the encoder must have + * this bit-depth. + */ + unsigned int g_input_bit_depth; + + /*!\brief Stream timebase units + * + * Indicates the smallest interval of time, in seconds, used by the stream. + * For fixed frame rate material, or variable frame rate material where + * frames are timed at a multiple of a given clock (ex: video capture), + * the \ref RECOMMENDED method is to set the timebase to the reciprocal + * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the + * pts to correspond to the frame number, which can be handy. For + * re-encoding video from containers with absolute time timestamps, the + * \ref RECOMMENDED method is to set the timebase to that of the parent + * container or multimedia framework (ex: 1/1000 for ms, as in FLV). + */ + struct aom_rational g_timebase; + + /*!\brief Enable error resilient modes. + * + * The error resilient bitfield indicates to the encoder which features + * it should enable to take measures for streaming over lossy or noisy + * links. + */ + aom_codec_er_flags_t g_error_resilient; + + /*!\brief Multi-pass Encoding Mode + * + * This value should be set to the current phase for multi-pass encoding. + * For single pass, set to #AOM_RC_ONE_PASS. + */ + enum aom_enc_pass g_pass; + + /*!\brief Allow lagged encoding + * + * If set, this value allows the encoder to consume a number of input + * frames before producing output frames. This allows the encoder to + * base decisions for the current frame on future frames. This does + * increase the latency of the encoding pipeline, so it is not appropriate + * in all situations (ex: realtime encoding). + * + * Note that this is a maximum value -- the encoder may produce frames + * sooner than the given limit. Set this value to 0 to disable this + * feature. + */ + unsigned int g_lag_in_frames; + + /* + * rate control settings (rc) + */ + + /*!\brief Temporal resampling configuration, if supported by the codec. + * + * Temporal resampling allows the codec to "drop" frames as a strategy to + * meet its target data rate. This can cause temporal discontinuities in + * the encoded video, which may appear as stuttering during playback. This + * trade-off is often acceptable, but for many applications is not. It can + * be disabled in these cases. + * + * Note that not all codecs support this feature. All aom AVx codecs do. + * For other codecs, consult the documentation for that algorithm. + * + * This threshold is described as a percentage of the target data buffer. + * When the data buffer falls below this percentage of fullness, a + * dropped frame is indicated. Set the threshold to zero (0) to disable + * this feature. 
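+ *
+ * Example (the value 30 is illustrative only):
+ * \code
+ *   cfg.rc_dropframe_thresh = 30;  // allow drops below 30% buffer fullness
+ * \endcode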
+ */ + unsigned int rc_dropframe_thresh; + + /*!\brief Mode for spatial resampling, if supported by the codec. + * + * Spatial resampling allows the codec to compress a lower resolution + * version of the frame, which is then upscaled by the decoder to the + * correct presentation resolution. This increases visual quality at + * low data rates, at the expense of CPU time on the encoder/decoder. + */ + unsigned int rc_resize_mode; + + /*!\brief Frame resize denominator. + * + * The denominator for resize to use, assuming 8 as the numerator. + * + * Valid denominators are 8 - 16 for now. + */ + unsigned int rc_resize_denominator; + + /*!\brief Keyframe resize denominator. + * + * The denominator for resize to use, assuming 8 as the numerator. + * + * Valid denominators are 8 - 16 for now. + */ + unsigned int rc_resize_kf_denominator; + + /*!\brief Frame super-resolution scaling mode. + * + * Similar to spatial resampling, frame super-resolution integrates + * upscaling after the encode/decode process. Taking control of upscaling and + * using restoration filters should allow it to outperform normal resizing. + */ + aom_superres_mode rc_superres_mode; + + /*!\brief Frame super-resolution denominator. + * + * The denominator for superres to use. If fixed it will only change if the + * cumulative scale change over resizing and superres is greater than 1/2; + * this forces superres to reduce scaling. + * + * Valid denominators are 8 to 16. + * + * Used only by AOM_SUPERRES_FIXED. + */ + unsigned int rc_superres_denominator; + + /*!\brief Keyframe super-resolution denominator. + * + * The denominator for superres to use. If fixed it will only change if the + * cumulative scale change over resizing and superres is greater than 1/2; + * this forces superres to reduce scaling. + * + * Valid denominators are 8 - 16 for now. + */ + unsigned int rc_superres_kf_denominator; + + /*!\brief Frame super-resolution q threshold. + * + * The q level threshold after which superres is used. + * Valid values are 1 to 63. + * + * Used only by AOM_SUPERRES_QTHRESH + */ + unsigned int rc_superres_qthresh; + + /*!\brief Keyframe super-resolution q threshold. + * + * The q level threshold after which superres is used for key frames. + * Valid values are 1 to 63. + * + * Used only by AOM_SUPERRES_QTHRESH + */ + unsigned int rc_superres_kf_qthresh; + + /*!\brief Rate control algorithm to use. + * + * Indicates whether the end usage of this stream is to be streamed over + * a bandwidth constrained link, indicating that Constant Bit Rate (CBR) + * mode should be used, or whether it will be played back on a high + * bandwidth link, as from a local disk, where higher variations in + * bitrate are acceptable. + */ + enum aom_rc_mode rc_end_usage; + + /*!\brief Two-pass stats buffer. + * + * A buffer containing all of the stats packets produced in the first + * pass, concatenated. + */ + aom_fixed_buf_t rc_twopass_stats_in; + + /*!\brief first pass mb stats buffer. + * + * A buffer containing all of the first pass mb stats packets produced + * in the first pass, concatenated. + */ + aom_fixed_buf_t rc_firstpass_mb_stats_in; + + /*!\brief Target data rate + * + * Target bitrate to use for this stream, in kilobits per second. + */ + unsigned int rc_target_bitrate; + + /* + * quantizer settings + */ + + /*!\brief Minimum (Best Quality) Quantizer + * + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. 
Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * aom_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_min_quantizer; + + /*!\brief Maximum (Worst Quality) Quantizer + * + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * aom_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_max_quantizer; + + /* + * bitrate tolerance + */ + + /*!\brief Rate control adaptation undershoot control + * + * This value, controls the tolerance of the VBR algorithm to undershoot + * and is used as a trigger threshold for more aggressive adaptation of Q. + * + * Valid values in the range 0-100. + */ + unsigned int rc_undershoot_pct; + + /*!\brief Rate control adaptation overshoot control + * + * This value, controls the tolerance of the VBR algorithm to overshoot + * and is used as a trigger threshold for more aggressive adaptation of Q. + * + * Valid values in the range 0-100. + */ + unsigned int rc_overshoot_pct; + + /* + * decoder buffer model parameters + */ + + /*!\brief Decoder Buffer Size + * + * This value indicates the amount of data that may be buffered by the + * decoding application. Note that this value is expressed in units of + * time (milliseconds). For example, a value of 5000 indicates that the + * client will buffer (at least) 5000ms worth of encoded data. Use the + * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if + * necessary. + */ + unsigned int rc_buf_sz; + + /*!\brief Decoder Buffer Initial Size + * + * This value indicates the amount of data that will be buffered by the + * decoding application prior to beginning playback. This value is + * expressed in units of time (milliseconds). Use the target bitrate + * (#rc_target_bitrate) to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_initial_sz; + + /*!\brief Decoder Buffer Optimal Size + * + * This value indicates the amount of data that the encoder should try + * to maintain in the decoder's buffer. This value is expressed in units + * of time (milliseconds). Use the target bitrate (#rc_target_bitrate) + * to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_optimal_sz; + + /* + * 2 pass rate control parameters + */ + + /*!\brief Two-pass mode CBR/VBR bias + * + * Bias, expressed on a scale of 0 to 100, for determining target size + * for the current frame. The value 0 indicates the optimal CBR mode + * value should be used. The value 100 indicates the optimal VBR mode + * value should be used. Values in between indicate which way the + * encoder should "lean." 
+ */ + unsigned int rc_2pass_vbr_bias_pct; + + /*!\brief Two-pass mode per-GOP minimum bitrate + * + * This value, expressed as a percentage of the target bitrate, indicates + * the minimum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_minsection_pct; + + /*!\brief Two-pass mode per-GOP maximum bitrate + * + * This value, expressed as a percentage of the target bitrate, indicates + * the maximum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_maxsection_pct; + + /* + * keyframing settings (kf) + */ + + /*!\brief Option to enable forward reference key frame + * + */ + int fwd_kf_enabled; + + /*!\brief Keyframe placement mode + * + * This value indicates whether the encoder should place keyframes at a + * fixed interval, or determine the optimal placement automatically + * (as governed by the #kf_min_dist and #kf_max_dist parameters) + */ + enum aom_kf_mode kf_mode; + + /*!\brief Keyframe minimum interval + * + * This value, expressed as a number of frames, prevents the encoder from + * placing a keyframe nearer than kf_min_dist to the previous keyframe. At + * least kf_min_dist frames non-keyframes will be coded before the next + * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval. + */ + unsigned int kf_min_dist; + + /*!\brief Keyframe maximum interval + * + * This value, expressed as a number of frames, forces the encoder to code + * a keyframe if one has not been coded in the last kf_max_dist frames. + * A value of 0 implies all frames will be keyframes. Set kf_min_dist + * equal to kf_max_dist for a fixed interval. + */ + unsigned int kf_max_dist; + + /*!\brief sframe interval + * + * This value, expressed as a number of frames, forces the encoder to code + * an S-Frame every sframe_dist frames. + */ + unsigned int sframe_dist; + + /*!\brief sframe insertion mode + * + * This value must be set to 1 or 2, and tells the encoder how to insert + * S-Frames. It will only have an effect if sframe_dist != 0. + * + * If altref is enabled: + * - if sframe_mode == 1, the considered frame will be made into an + * S-Frame only if it is an altref frame + * - if sframe_mode == 2, the next altref frame will be made into an + * S-Frame. + * + * Otherwise: the considered frame will be made into an S-Frame. + */ + unsigned int sframe_mode; + + /*!\brief Tile coding mode + * + * This value indicates the tile coding mode. + * A value of 0 implies a normal non-large-scale tile coding. A value of 1 + * implies a large-scale tile coding. + */ + unsigned int large_scale_tile; + + /*!\brief Monochrome mode + * + * If this is nonzero, the encoder will generate a monochrome stream + * with no chroma planes. + */ + unsigned int monochrome; + + /*!\brief full_still_picture_hdr + * + * If this is nonzero, the encoder will generate a full header + * (reduced_still_picture_header is set to 0 in the sequence header OBU) even + * for still picture encoding. If this is zero (the default), a reduced + * header (reduced_still_picture_header is set to 1 in the sequence header + * OBU) is used for still picture encoding. This flag has no effect when a + * regular video with more than a single frame is encoded. + */ + unsigned int full_still_picture_hdr; + + /*!\brief Bitstream syntax mode + * + * This value indicates the bitstream syntax mode. + * A value of 0 indicates bitstream is saved as Section 5 bitstream. 
A value + * of 1 indicates the bitstream is saved in Annex-B format + */ + unsigned int save_as_annexb; + + /*!\brief Number of explicit tile widths specified + * + * This value indicates the number of tile widths specified + * A value of 0 implies no tile widths are specified. + * Tile widths are given in the array tile_widths[] + */ + int tile_width_count; + + /*!\brief Number of explicit tile heights specified + * + * This value indicates the number of tile heights specified + * A value of 0 implies no tile heights are specified. + * Tile heights are given in the array tile_heights[] + */ + int tile_height_count; + +/*!\brief Maximum number of tile widths in tile widths array + * + * This define gives the maximum number of elements in the tile_widths array. + */ +#define MAX_TILE_WIDTHS 64 // maximum tile width array length + + /*!\brief Array of specified tile widths + * + * This array specifies tile widths (and may be empty) + * The number of widths specified is given by tile_width_count + */ + int tile_widths[MAX_TILE_WIDTHS]; + +/*!\brief Maximum number of tile heights in tile heights array. + * + * This define gives the maximum number of elements in the tile_heights array. + */ +#define MAX_TILE_HEIGHTS 64 // maximum tile height array length + + /*!\brief Array of specified tile heights + * + * This array specifies tile heights (and may be empty) + * The number of heights specified is given by tile_height_count + */ + int tile_heights[MAX_TILE_HEIGHTS]; + + /*!\brief Whether encoder should use fixed QP offsets. + * + * If a value of 1 is provided, encoder will use fixed QP offsets for frames + * at different levels of the pyramid. + * If a value of 0 is provided, encoder will NOT use fixed QP offsets. + * Note: This option is only relevant for --end-usage=q. + */ + unsigned int use_fixed_qp_offsets; + + /*!\brief Deprecated and ignored. DO NOT USE. + * + * TODO(aomedia:3269): Remove fixed_qp_offsets in libaom v4.0.0. + */ + int fixed_qp_offsets[5]; + + /*!\brief Options defined per config file + * + */ + cfg_options_t encoder_cfg; +} aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */ + +/*!\brief Initialize an encoder instance + * + * Initializes an encoder context using the given interface. Applications + * should call the aom_codec_enc_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with -DCONFIG_MULTITHREAD=0, this call + * is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * If aom_codec_enc_init_ver() fails, it is not necessary to call + * aom_codec_destroy() on the encoder context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. + * \param[in] flags Bitfield of AOM_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * AOM_ENCODER_ABI_VERSION + * \retval #AOM_CODEC_OK + * The encoder algorithm has been initialized. + * \retval #AOM_CODEC_MEM_ERROR + * Memory allocation failed. + */ +aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx, + aom_codec_iface_t *iface, + const aom_codec_enc_cfg_t *cfg, + aom_codec_flags_t flags, int ver); + +/*!\brief Convenience macro for aom_codec_enc_init_ver() + * + * Ensures the ABI version parameter is properly set. 
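+ *
+ * Initialization sketch (uses the AV1 encoder interface aom_codec_av1_cx()
+ * from aomcx.h; resolution values are illustrative, error handling omitted):
+ * \code
+ *   aom_codec_iface_t *iface = aom_codec_av1_cx();
+ *   aom_codec_enc_cfg_t cfg;
+ *   aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY);
+ *   cfg.g_w = 640;
+ *   cfg.g_h = 480;
+ *   aom_codec_ctx_t ctx;
+ *   aom_codec_enc_init(&ctx, iface, &cfg, 0);
+ * \endcode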
+ */ +#define aom_codec_enc_init(ctx, iface, cfg, flags) \ + aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION) + +/*!\brief Get the default configuration for a usage. + * + * Initializes an encoder configuration structure with default values. Supports + * the notion of "usages" so that an algorithm may offer different default + * settings depending on the user's intended goal. This function \ref SHOULD + * be called by all applications to initialize the configuration structure + * before specializing the configuration with application specific values. + * + * \param[in] iface Pointer to the algorithm interface to use. + * \param[out] cfg Configuration buffer to populate. + * \param[in] usage Algorithm specific usage value. For AV1, must be + * set to AOM_USAGE_GOOD_QUALITY (0), + * AOM_USAGE_REALTIME (1), or AOM_USAGE_ALL_INTRA (2). + * + * \retval #AOM_CODEC_OK + * The configuration was populated. + * \retval #AOM_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface, + aom_codec_enc_cfg_t *cfg, + unsigned int usage); + +/*!\brief Set or change configuration + * + * Reconfigures an encoder instance according to the given configuration. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cfg Configuration buffer to use + * + * \retval #AOM_CODEC_OK + * The configuration was populated. + * \retval #AOM_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx, + const aom_codec_enc_cfg_t *cfg); + +/*!\brief Get global stream headers + * + * Retrieves a stream level global header packet, if supported by the codec. + * Calls to this function should be deferred until all configuration information + * has been passed to libaom. Otherwise the global header data may be + * invalidated by additional configuration changes. + * + * The AV1 implementation of this function returns an OBU. The OBU returned is + * in Low Overhead Bitstream Format. Specifically, the obu_has_size_field bit is + * set, and the buffer contains the obu_size field for the returned OBU. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval NULL + * Encoder does not support global header, or an error occurred while + * generating the global header. + * + * \retval Non-NULL + * Pointer to buffer containing global header packet. The caller owns the + * memory associated with this buffer, and must free the 'buf' member of the + * aom_fixed_buf_t as well as the aom_fixed_buf_t pointer. Memory returned + * must be freed via call to free(). + */ +aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx); + +/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */ +#define AOM_USAGE_GOOD_QUALITY 0u +/*!\brief usage parameter analogous to AV1 REALTIME mode. */ +#define AOM_USAGE_REALTIME 1u +/*!\brief usage parameter analogous to AV1 all intra mode. */ +#define AOM_USAGE_ALL_INTRA 2u + +/*!\brief Encode a frame + * + * Encodes a video frame at the given "presentation time." The presentation + * time stamp (PTS) \ref MUST be strictly increasing. + * + * When the last frame has been passed to the encoder, this function should + * continue to be called in a loop, with the img parameter set to NULL. 
This + * will signal the end-of-stream condition to the encoder and allow it to + * encode any held buffers. Encoding is complete when aom_codec_encode() is + * called with img set to NULL and aom_codec_get_cx_data() returns no data. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] img Image data to encode, NULL to flush. + * Encoding sample values outside the range + * [0..(1<bit_depth)-1] is undefined behavior. + * Note: Although img is declared as a const pointer, + * if AV1E_SET_DENOISE_NOISE_LEVEL is set to a nonzero + * value aom_codec_encode() modifies (denoises) the + * samples in img->planes[i] . + * \param[in] pts Presentation time stamp, in timebase units. If img + * is NULL, pts is ignored. + * \param[in] duration Duration to show frame, in timebase units. If img + * is not NULL, duration must be nonzero. If img is + * NULL, duration is ignored. + * \param[in] flags Flags to use for encoding this frame. + * + * \retval #AOM_CODEC_OK + * The configuration was populated. + * \retval #AOM_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. + */ +aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, + aom_codec_pts_t pts, unsigned long duration, + aom_enc_frame_flags_t flags); + +/*!\brief Set compressed data output buffer + * + * Sets the buffer that the codec should output the compressed data + * into. This call effectively sets the buffer pointer returned in the + * next AOM_CODEC_CX_FRAME_PKT packet. Subsequent packets will be + * appended into this buffer. The buffer is preserved across frames, + * so applications must periodically call this function after flushing + * the accumulated compressed data to disk or to the network to reset + * the pointer to the buffer's head. + * + * `pad_before` bytes will be skipped before writing the compressed + * data, and `pad_after` bytes will be appended to the packet. The size + * of the packet will be the sum of the size of the actual compressed + * data, pad_before, and pad_after. The padding bytes will be preserved + * (not overwritten). + * + * Note that calling this function does not guarantee that the returned + * compressed data will be placed into the specified buffer. In the + * event that the encoded data will not fit into the buffer provided, + * the returned packet \ref MAY point to an internal buffer, as it would + * if this call were never used. In this event, the output packet will + * NOT have any padding, and the application must free space and copy it + * to the proper place. This is of particular note in configurations + * that may output multiple packets for a single encoded frame (e.g., lagged + * encoding) or if the application does not reset the buffer periodically. + * + * Applications may restore the default behavior of the codec providing + * the compressed data buffer by calling this function with a NULL + * buffer. + * + * Applications \ref MUSTNOT call this function during iteration of + * aom_codec_get_cx_data(). + * + * \param[in] ctx Pointer to this instance's context + * \param[in] buf Buffer to store compressed data into + * \param[in] pad_before Bytes to skip before writing compressed data + * \param[in] pad_after Bytes to skip after writing compressed data + * + * \retval #AOM_CODEC_OK + * The buffer was set successfully. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. 
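+ *
+ * Usage sketch (the static buffer and its size are illustrative;
+ * aom_fixed_buf_t is defined earlier in this header):
+ * \code
+ *   static unsigned char out[1024 * 1024];
+ *   aom_fixed_buf_t out_buf = { out, sizeof(out) };
+ *   aom_codec_set_cx_data_buf(&ctx, &out_buf, 0, 0);
+ * \endcode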
+ */ +aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx, + const aom_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after); + +/*!\brief Encoded data iterator + * + * Iterates over a list of data packets to be passed from the encoder to the + * application. The different kinds of packets available are enumerated in + * #aom_codec_cx_pkt_kind. + * + * #AOM_CODEC_CX_FRAME_PKT packets should be passed to the application's + * muxer. Multiple compressed frames may be in the list. + * #AOM_CODEC_STATS_PKT packets should be appended to a global buffer. + * + * The application \ref MUST silently ignore any packet kinds that it does + * not recognize or support. + * + * The data buffers returned from this function are only guaranteed to be + * valid until the application makes another call to any aom_codec_* function. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an output data packet (compressed frame data, + * two-pass statistics, etc.) or NULL to signal end-of-list. + * + */ +const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx, + aom_codec_iter_t *iter); + +/*!\brief Get Preview Frame + * + * Returns an image that can be used as a preview. Shows the image as it would + * exist at the decompressor. The application \ref MUST NOT write into this + * image buffer. + * + * \param[in] ctx Pointer to this instance's context + * + * \return Returns a pointer to a preview image, or NULL if no image is + * available. + * + */ +const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx); + +/*!@} - end defgroup encoder*/ +#ifdef __cplusplus +} +#endif +#endif // AOM_AOM_AOM_ENCODER_H_ diff --git a/third_party/aom/aom/aom_external_partition.h b/third_party/aom/aom/aom_external_partition.h new file mode 100644 index 0000000000..c381f6e5e9 --- /dev/null +++ b/third_party/aom/aom/aom_external_partition.h @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOM_EXTERNAL_PARTITION_H_ +#define AOM_AOM_AOM_EXTERNAL_PARTITION_H_ + +/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder + * \ingroup aom + * + * @{ + */ +#include + +/*!\file + * \brief Provides function pointer definitions for the external partition. + * + * \note The external partition API should be considered experimental. Until the + * external partition API is declared stable, breaking changes may be made to + * this API in a future libaom release. + */ + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures. 
+ */ +#define AOM_EXT_PART_ABI_VERSION 8 + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Abstract external partition model handler + */ +typedef void *aom_ext_part_model_t; + +/*!\brief Number of features to determine whether to skip partition none and + * do partition split directly. The same as "FEATURE_SIZE_SMS_SPLIT". + */ +#define AOM_EXT_PART_SIZE_DIRECT_SPLIT 17 + +/*!\brief Number of features to use simple motion search to prune out + * rectangular partition in some direction. The same as + * "FEATURE_SIZE_SMS_PRUNE_PART". + */ +#define AOM_EXT_PART_SIZE_PRUNE_PART 25 + +/*!\brief Number of features to prune split and rectangular partition + * after PARTITION_NONE. + */ +#define AOM_EXT_PART_SIZE_PRUNE_NONE 4 + +/*!\brief Number of features to terminates partition after partition none using + * simple_motion_search features and the rate, distortion, and rdcost of + * PARTITION_NONE. The same as "FEATURE_SIZE_SMS_TERM_NONE". + */ +#define AOM_EXT_PART_SIZE_TERM_NONE 28 + +/*!\brief Number of features to terminates partition after partition split. + */ +#define AOM_EXT_PART_SIZE_TERM_SPLIT 31 + +/*!\brief Number of features to prune rectangular partition using stats + * collected after partition split. + */ +#define AOM_EXT_PART_SIZE_PRUNE_RECT 9 + +/*!\brief Number of features to prune AB partition using stats + * collected after rectangular partition.. + */ +#define AOM_EXT_PART_SIZE_PRUNE_AB 10 + +/*!\brief Number of features to prune 4-way partition using stats + * collected after AB partition. + */ +#define AOM_EXT_PART_SIZE_PRUNE_4_WAY 18 + +/*!\brief Decision mode of the external partition model. + * AOM_EXT_PART_WHOLE_TREE: the external partition model should provide the + * whole partition tree for the superblock. + * + * AOM_EXT_PART_RECURSIVE: the external partition model provides the partition + * decision of the current block only. The decision process starts from + * the superblock size, down to the smallest block size (4x4) recursively. + */ +typedef enum aom_ext_part_decision_mode { + AOM_EXT_PART_WHOLE_TREE = 0, + AOM_EXT_PART_RECURSIVE = 1, +} aom_ext_part_decision_mode_t; + +/*!\brief Config information sent to the external partition model. + * + * For example, the maximum superblock size determined by the sequence header. + */ +typedef struct aom_ext_part_config { + int superblock_size; ///< super block size (either 64x64 or 128x128) +} aom_ext_part_config_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected before NONE partition. + * Features "f" are used to determine: + * partition_none_allowed, partition_horz_allowed, partition_vert_allowed, + * do_rectangular_split, do_square_split + * Features "f_part2" are used to determine: + * prune_horz, prune_vert. + */ +typedef struct aom_partition_features_before_none { + /*! features to determine whether skip partition none and do split directly */ + float f[AOM_EXT_PART_SIZE_DIRECT_SPLIT]; + /*! features to determine whether to prune rectangular partition */ + float f_part2[AOM_EXT_PART_SIZE_PRUNE_PART]; +} aom_partition_features_before_none_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected after NONE partition. + */ +typedef struct aom_partition_features_none { + /*! features to prune split and rectangular partition */ + float f[AOM_EXT_PART_SIZE_PRUNE_NONE]; + /*! 
features to determine termination of partition */ + float f_terminate[AOM_EXT_PART_SIZE_TERM_NONE]; +} aom_partition_features_none_t; + +/*!\brief Features passed to the external model to make partition decisions. + * Specifically, features collected after SPLIT partition. + */ +typedef struct aom_partition_features_split { + /*! features to determine termination of partition */ + float f_terminate[AOM_EXT_PART_SIZE_TERM_SPLIT]; + /*! features to determine pruning rect partition */ + float f_prune_rect[AOM_EXT_PART_SIZE_PRUNE_RECT]; +} aom_partition_features_split_t; + +/*!\brief Features passed to the external model to make partition decisions. + * Specifically, features collected after RECTANGULAR partition. + */ +typedef struct aom_partition_features_rect { + /*! features to determine pruning AB partition */ + float f[AOM_EXT_PART_SIZE_PRUNE_AB]; +} aom_partition_features_rect_t; + +/*!\brief Features passed to the external model to make partition decisions. + * Specifically, features collected after AB partition: HORZ_A, HORZ_B, VERT_A, + * VERT_B. + */ +typedef struct aom_partition_features_ab { + /*! features to determine pruning 4-way partition */ + float f[AOM_EXT_PART_SIZE_PRUNE_4_WAY]; +} aom_partition_features_ab_t; + +/*!\brief Feature id to tell the external model the current stage in partition + * pruning and what features to use to make decisions accordingly. + */ +typedef enum { + AOM_EXT_PART_FEATURE_BEFORE_NONE, + AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2, + AOM_EXT_PART_FEATURE_AFTER_NONE, + AOM_EXT_PART_FEATURE_AFTER_NONE_PART2, + AOM_EXT_PART_FEATURE_AFTER_SPLIT, + AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2, + AOM_EXT_PART_FEATURE_AFTER_RECT, + AOM_EXT_PART_FEATURE_AFTER_AB +} AOM_EXT_PART_FEATURE_ID; + +/*!\brief Features collected from the tpl process. + * + * The tpl process collects information that helps measure the inter-frame + * dependency. + * The tpl process is computed in the unit of tpl_bsize_1d (16x16). + * Therefore, the max number of units inside a superblock is + * 128x128 / (16x16) = 64. Change it if the tpl process changes. + */ +typedef struct aom_sb_tpl_features { + int available; ///< If tpl stats are available + int tpl_unit_length; ///< The block length of tpl process + int num_units; ///< The number of units inside the current superblock + int64_t intra_cost[64]; ///< The intra cost of each unit + int64_t inter_cost[64]; ///< The inter cost of each unit + int64_t mc_dep_cost[64]; ///< The motion compensated dependency cost +} aom_sb_tpl_features_t; + +/*!\brief Features collected from the simple motion process. + * + * The simple motion process collects information by applying motion compensated + * prediction on each block. + * The block size is 16x16, which could be changed. If it is changed, update + * comments and the array size here. + */ +typedef struct aom_sb_simple_motion_features { + int unit_length; ///< The block length of the simple motion process + int num_units; ///< The number of units inside the current superblock + int block_sse[64]; ///< Sum of squared error of each unit + int block_var[64]; ///< Variance of each unit +} aom_sb_simple_motion_features_t; + +/*!\brief Features of each super block. + * + * Features collected for each super block before partition search. + */ +typedef struct aom_sb_features { + /*! Features from motion search */ + aom_sb_simple_motion_features_t motion_features; + /*!
Features from tpl process */ + aom_sb_tpl_features_t tpl_features; +} aom_sb_features_t; + +/*!\brief Features pass to the external model to make partition decisions. + * + * The encoder sends these features to the external model through + * "func()" defined in ..... + * + * NOTE: new member variables may be added to this structure in the future. + * Once new features are finalized, bump the major version of libaom. + */ +typedef struct aom_partition_features { + // Features for the current supervised multi-stage ML model. + /*! Feature ID to indicate active features */ + AOM_EXT_PART_FEATURE_ID id; + /*! Features collected before NONE partition */ + aom_partition_features_before_none_t before_part_none; + /*! Features collected after NONE partition */ + aom_partition_features_none_t after_part_none; + /*! Features collected after SPLIT partition */ + aom_partition_features_split_t after_part_split; + /*! Features collected after RECTANGULAR partition */ + aom_partition_features_rect_t after_part_rect; + /*! Features collected after AB partition */ + aom_partition_features_ab_t after_part_ab; + + // Features for a new ML model. + aom_sb_features_t sb_features; ///< Features collected for the super block + int mi_row; ///< Mi_row position of the block + int mi_col; ///< Mi_col position of the block + int frame_width; ///< Frame width + int frame_height; ///< Frame height + int block_size; ///< As "BLOCK_SIZE" in av1/common/enums.h + /*! + * Valid partition types. A bitmask is used. "1" represents the + * corresponding type is valid. The bitmask follows the enum order for + * PARTITION_TYPE in "enums.h" to represent one partition type at a bit. + * For example, 0x01 stands for only PARTITION_NONE is valid, + * 0x09 (00...001001) stands for PARTITION_NONE and PARTITION_SPLIT are valid. + */ + int valid_partition_types; + int update_type; ///< Frame update type, defined in ratectrl.h + int qindex; ///< Quantization index, range: [0, 255] + int rdmult; ///< Rate-distortion multiplier + int pyramid_level; ///< The level of this frame in the hierarchical structure + int has_above_block; ///< Has above neighbor block + int above_block_width; ///< Width of the above block, -1 if not exist + int above_block_height; ///< Height of the above block, -1 if not exist + int has_left_block; ///< Has left neighbor block + int left_block_width; ///< Width of the left block, -1 if not exist + int left_block_height; ///< Height of the left block, -1 if not exist + /*! + * The following parameters are collected from applying simple motion search. + * Sum of squared error (SSE) and variance of motion compensated residual + * are good indicators of block partitioning. + * If a block is a square, we also apply motion search for its 4 sub blocks. + * If not a square, their values are -1. + * If a block is able to split horizontally, we apply motion search and get + * stats for horizontal blocks. If not, their values are -1. + * If a block is able to split vertically, we apply motion search and get + * stats for vertical blocks. If not, their values are -1. + */ + unsigned int block_sse; ///< SSE of motion compensated residual + unsigned int block_var; ///< Variance of motion compensated residual + unsigned int sub_block_sse[4]; ///< SSE of sub blocks. + unsigned int sub_block_var[4]; ///< Variance of sub blocks. 
+ unsigned int horz_block_sse[2]; ///< SSE of horz sub blocks + unsigned int horz_block_var[2]; ///< Variance of horz sub blocks + unsigned int vert_block_sse[2]; ///< SSE of vert sub blocks + unsigned int vert_block_var[2]; ///< Variance of vert sub blocks + /*! + * The following parameters are calculated from tpl model. + * If tpl model is not available, their values are -1. + */ + int64_t tpl_intra_cost; ///< Intra cost, ref to "TplDepStats" in tpl_model.h + int64_t tpl_inter_cost; ///< Inter cost in tpl model + int64_t tpl_mc_dep_cost; ///< Motion compensated dependency cost in tpl model +} aom_partition_features_t; + +/*!\brief Partition decisions received from the external model. + * + * The encoder receives partition decisions and encodes the superblock + * with the given partition type. + * The encoder receives it from "func()" define in .... + * + * NOTE: new member variables may be added to this structure in the future. + * Once new features are finalized, bump the major version of libaom. + */ +typedef struct aom_partition_decision { + // Decisions for directly set partition types + int is_final_decision; ///< The flag whether it's the final decision + int num_nodes; ///< The number of leaf nodes + int partition_decision[2048]; ///< Partition decisions + int current_decision; ///< Partition decision for the current block + + // Decisions for partition type pruning + int terminate_partition_search; ///< Terminate further partition search + int partition_none_allowed; ///< Allow partition none type + int partition_rect_allowed[2]; ///< Allow rectangular partitions + int do_rectangular_split; ///< Try rectangular split partition + int do_square_split; ///< Try square split partition + int prune_rect_part[2]; ///< Prune rectangular partition + int horza_partition_allowed; ///< Allow HORZ_A partition + int horzb_partition_allowed; ///< Allow HORZ_B partition + int verta_partition_allowed; ///< Allow VERT_A partition + int vertb_partition_allowed; ///< Allow VERT_B partition + int partition_horz4_allowed; ///< Allow HORZ4 partition + int partition_vert4_allowed; ///< Allow VERT4 partition +} aom_partition_decision_t; + +/*!\brief Encoding stats for the given partition decision. + * + * The encoding stats collected by encoding the superblock with the + * given partition types. + * The encoder sends the stats to the external model for training + * or inference through "func()" defined in .... + */ +typedef struct aom_partition_stats { + int rate; ///< Rate cost of the block + int64_t dist; ///< Distortion of the block + int64_t rdcost; ///< Rate-distortion cost of the block +} aom_partition_stats_t; + +/*!\brief Enum for return status. + */ +typedef enum aom_ext_part_status { + AOM_EXT_PART_OK = 0, ///< Status of success + AOM_EXT_PART_ERROR = 1, ///< Status of failure + AOM_EXT_PART_TEST = 2, ///< Status used for tests +} aom_ext_part_status_t; + +/*!\brief Callback of creating an external partition model. + * + * The callback is invoked by the encoder to create an external partition + * model. + * + * \param[in] priv Callback's private data + * \param[in] part_config Config information pointer for model creation + * \param[out] ext_part_model Pointer to the model + */ +typedef aom_ext_part_status_t (*aom_ext_part_create_model_fn_t)( + void *priv, const aom_ext_part_config_t *part_config, + aom_ext_part_model_t *ext_part_model); + +/*!\brief Callback of sending features to the external partition model. 
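A rough sketch of the decision interface above (not upstream code): a model-creation callback matching the typedef just defined, plus a helper that fills an aom_partition_decision_t with a neutral decision that leaves the search to the encoder. The type my_model_t is invented for the example, and <stdlib.h>/<string.h> are assumed.

/* Sketch only: a do-nothing external partition model. my_model_t is a
 * hypothetical private type; requires <stdlib.h> and <string.h>. */
typedef struct my_model { int frames_seen; } my_model_t;

static aom_ext_part_status_t my_create_model(
    void *priv, const aom_ext_part_config_t *part_config,
    aom_ext_part_model_t *ext_part_model) {
  (void)priv;
  (void)part_config; /* part_config->superblock_size is 64 or 128 */
  my_model_t *model = calloc(1, sizeof(*model));
  if (model == NULL) return AOM_EXT_PART_ERROR;
  *ext_part_model = model;
  return AOM_EXT_PART_OK;
}

/* One plausible "neutral" decision: force nothing, allow everything. */
static void leave_search_to_encoder(aom_partition_decision_t *decision) {
  memset(decision, 0, sizeof(*decision));
  decision->is_final_decision = 0;
  decision->partition_none_allowed = 1;
  decision->partition_rect_allowed[0] = 1;
  decision->partition_rect_allowed[1] = 1;
  decision->do_rectangular_split = 1;
  decision->do_square_split = 1;
}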
+ * + * The callback is invoked by the encoder to send features to the external + * partition model. + * + * \param[in] ext_part_model The external model + * \param[in] part_features Pointer to the features + */ +typedef aom_ext_part_status_t (*aom_ext_part_send_features_fn_t)( + aom_ext_part_model_t ext_part_model, + const aom_partition_features_t *part_features); + +/*!\brief Callback of receiving partition decisions from the external + * partition model. + * + * The callback is invoked by the encoder to receive partition decisions from + * the external partition model. + * + * \param[in] ext_part_model The external model + * \param[in] ext_part_decision Pointer to the partition decisions + */ +typedef aom_ext_part_status_t (*aom_ext_part_get_decision_fn_t)( + aom_ext_part_model_t ext_part_model, + aom_partition_decision_t *ext_part_decision); + +/*!\brief Callback of sending stats to the external partition model. + * + * The callback is invoked by the encoder to send encoding stats to + * the external partition model. + * + * \param[in] ext_part_model The external model + * \param[in] ext_part_stats Pointer to the encoding stats + */ +typedef aom_ext_part_status_t (*aom_ext_part_send_partition_stats_fn_t)( + aom_ext_part_model_t ext_part_model, + const aom_partition_stats_t *ext_part_stats); + +/*!\brief Callback of deleting the external partition model. + * + * The callback is invoked by the encoder to delete the external partition + * model. + * + * \param[in] ext_part_model The external model + */ +typedef aom_ext_part_status_t (*aom_ext_part_delete_model_fn_t)( + aom_ext_part_model_t ext_part_model); + +/*!\brief Callback function set for external partition model. + * + * Uses can enable external partition model by registering a set of + * callback functions with the flag: AV1E_SET_EXTERNAL_PARTITION_MODEL + */ +typedef struct aom_ext_part_funcs { + /*! + * Create an external partition model. + */ + aom_ext_part_create_model_fn_t create_model; + + /*! + * Send features to the external partition model to make partition decisions. + */ + aom_ext_part_send_features_fn_t send_features; + + /*! + * Get partition decisions from the external partition model. + */ + aom_ext_part_get_decision_fn_t get_partition_decision; + + /*! + * Send stats of the current partition to the external model. + */ + aom_ext_part_send_partition_stats_fn_t send_partition_stats; + + /*! + * Delete the external partition model. + */ + aom_ext_part_delete_model_fn_t delete_model; + + /*! + * The decision mode of the model. + */ + aom_ext_part_decision_mode_t decision_mode; + + /*! + * Private data for the external partition model. + */ + void *priv; +} aom_ext_part_funcs_t; + +/*!@} - end defgroup aom_encoder*/ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_EXTERNAL_PARTITION_H_ diff --git a/third_party/aom/aom/aom_frame_buffer.h b/third_party/aom/aom/aom_frame_buffer.h new file mode 100644 index 0000000000..0e80373ddd --- /dev/null +++ b/third_party/aom/aom/aom_frame_buffer.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_AOM_FRAME_BUFFER_H_ +#define AOM_AOM_AOM_FRAME_BUFFER_H_ + +/*!\file + * \brief Describes the decoder external frame buffer interface. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_integer.h" + +/*!\brief The maximum number of work buffers used by libaom. + * Support maximum 4 threads to decode video in parallel. + * Each thread will use one work buffer. + * TODO(hkuang): Add support to set number of worker threads dynamically. + */ +#define AOM_MAXIMUM_WORK_BUFFERS 8 + +/*!\brief The maximum number of reference buffers that a AV1 encoder may use. + */ +#define AOM_MAXIMUM_REF_BUFFERS 8 + +/*!\brief External frame buffer + * + * This structure holds allocated frame buffers used by the decoder. + */ +typedef struct aom_codec_frame_buffer { + uint8_t *data; /**< Pointer to the data buffer */ + size_t size; /**< Size of data in bytes */ + void *priv; /**< Frame's private data */ +} aom_codec_frame_buffer_t; + +/*!\brief get frame buffer callback prototype + * + * This callback is invoked by the decoder to retrieve data for the frame + * buffer in order for the decode call to complete. The callback must + * allocate at least min_size in bytes and assign it to fb->data. The callback + * must zero out all the data allocated. Then the callback must set fb->size + * to the allocated size. The application does not need to align the allocated + * data. The callback is triggered when the decoder needs a frame buffer to + * decode a compressed image into. This function may be called more than once + * for every call to aom_codec_decode. The application may set fb->priv to + * some data which will be passed back in the aom_image_t and the release + * function call. |fb| is guaranteed to not be NULL. On success the callback + * must return 0. Any failure the callback must return a value less than 0. + * + * \param[in] priv Callback's private data + * \param[in] min_size Size in bytes needed by the buffer + * \param[in,out] fb Pointer to aom_codec_frame_buffer_t + */ +typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, + aom_codec_frame_buffer_t *fb); + +/*!\brief release frame buffer callback prototype + * + * This callback is invoked by the decoder when the frame buffer is not + * referenced by any other buffers. |fb| is guaranteed to not be NULL. On + * success the callback must return 0. Any failure the callback must return + * a value less than 0. + * + * \param[in] priv Callback's private data + * \param[in] fb Pointer to aom_codec_frame_buffer_t + */ +typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv, + aom_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_FRAME_BUFFER_H_ diff --git a/third_party/aom/aom/aom_image.h b/third_party/aom/aom/aom_image.h new file mode 100644 index 0000000000..d5f0c087e6 --- /dev/null +++ b/third_party/aom/aom/aom_image.h @@ -0,0 +1,448 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes the aom image descriptor and associated operations + * + */ +#ifndef AOM_AOM_AOM_IMAGE_H_ +#define AOM_AOM_AOM_IMAGE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_integer.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_IMAGE_ABI_VERSION (9) /**<\hideinitializer*/ + +#define AOM_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ +#define AOM_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ +/** 0x400 used to signal alpha channel, skipping for backwards compatibility. */ +#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */ + +/*!\brief List of supported image formats */ +typedef enum aom_img_fmt { + AOM_IMG_FMT_NONE, + AOM_IMG_FMT_YV12 = + AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ + AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2, + AOM_IMG_FMT_AOMYV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | + 3, /** < planar 4:2:0 format with aom color space */ + AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4, + AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5, + AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6, +/*!\brief Allows detection of the presence of AOM_IMG_FMT_NV12 at compile time. + */ +#define AOM_HAVE_IMG_FMT_NV12 1 + AOM_IMG_FMT_NV12 = + AOM_IMG_FMT_PLANAR | 7, /**< 4:2:0 with U and V interleaved */ + AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH, + AOM_IMG_FMT_YV1216 = AOM_IMG_FMT_YV12 | AOM_IMG_FMT_HIGHBITDEPTH, + AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH, + AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH, +} aom_img_fmt_t; /**< alias for enum aom_img_fmt */ + +/*!\brief List of supported color primaries */ +typedef enum aom_color_primaries { + AOM_CICP_CP_RESERVED_0 = 0, /**< For future use */ + AOM_CICP_CP_BT_709 = 1, /**< BT.709 */ + AOM_CICP_CP_UNSPECIFIED = 2, /**< Unspecified */ + AOM_CICP_CP_RESERVED_3 = 3, /**< For future use */ + AOM_CICP_CP_BT_470_M = 4, /**< BT.470 System M (historical) */ + AOM_CICP_CP_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */ + AOM_CICP_CP_BT_601 = 6, /**< BT.601 */ + AOM_CICP_CP_SMPTE_240 = 7, /**< SMPTE 240 */ + AOM_CICP_CP_GENERIC_FILM = + 8, /**< Generic film (color filters using illuminant C) */ + AOM_CICP_CP_BT_2020 = 9, /**< BT.2020, BT.2100 */ + AOM_CICP_CP_XYZ = 10, /**< SMPTE 428 (CIE 1921 XYZ) */ + AOM_CICP_CP_SMPTE_431 = 11, /**< SMPTE RP 431-2 */ + AOM_CICP_CP_SMPTE_432 = 12, /**< SMPTE EG 432-1 */ + AOM_CICP_CP_RESERVED_13 = 13, /**< For future use (values 13 - 21) */ + AOM_CICP_CP_EBU_3213 = 22, /**< EBU Tech. 
3213-E */ + AOM_CICP_CP_RESERVED_23 = 23 /**< For future use (values 23 - 255) */ +} aom_color_primaries_t; /**< alias for enum aom_color_primaries */ + +/*!\brief List of supported transfer functions */ +typedef enum aom_transfer_characteristics { + AOM_CICP_TC_RESERVED_0 = 0, /**< For future use */ + AOM_CICP_TC_BT_709 = 1, /**< BT.709 */ + AOM_CICP_TC_UNSPECIFIED = 2, /**< Unspecified */ + AOM_CICP_TC_RESERVED_3 = 3, /**< For future use */ + AOM_CICP_TC_BT_470_M = 4, /**< BT.470 System M (historical) */ + AOM_CICP_TC_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */ + AOM_CICP_TC_BT_601 = 6, /**< BT.601 */ + AOM_CICP_TC_SMPTE_240 = 7, /**< SMPTE 240 M */ + AOM_CICP_TC_LINEAR = 8, /**< Linear */ + AOM_CICP_TC_LOG_100 = 9, /**< Logarithmic (100 : 1 range) */ + AOM_CICP_TC_LOG_100_SQRT10 = + 10, /**< Logarithmic (100 * Sqrt(10) : 1 range) */ + AOM_CICP_TC_IEC_61966 = 11, /**< IEC 61966-2-4 */ + AOM_CICP_TC_BT_1361 = 12, /**< BT.1361 */ + AOM_CICP_TC_SRGB = 13, /**< sRGB or sYCC*/ + AOM_CICP_TC_BT_2020_10_BIT = 14, /**< BT.2020 10-bit systems */ + AOM_CICP_TC_BT_2020_12_BIT = 15, /**< BT.2020 12-bit systems */ + AOM_CICP_TC_SMPTE_2084 = 16, /**< SMPTE ST 2084, ITU BT.2100 PQ */ + AOM_CICP_TC_SMPTE_428 = 17, /**< SMPTE ST 428 */ + AOM_CICP_TC_HLG = 18, /**< BT.2100 HLG, ARIB STD-B67 */ + AOM_CICP_TC_RESERVED_19 = 19 /**< For future use (values 19-255) */ +} aom_transfer_characteristics_t; /**< alias for enum aom_transfer_function */ + +/*!\brief List of supported matrix coefficients */ +typedef enum aom_matrix_coefficients { + AOM_CICP_MC_IDENTITY = 0, /**< Identity matrix */ + AOM_CICP_MC_BT_709 = 1, /**< BT.709 */ + AOM_CICP_MC_UNSPECIFIED = 2, /**< Unspecified */ + AOM_CICP_MC_RESERVED_3 = 3, /**< For future use */ + AOM_CICP_MC_FCC = 4, /**< US FCC 73.628 */ + AOM_CICP_MC_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */ + AOM_CICP_MC_BT_601 = 6, /**< BT.601 */ + AOM_CICP_MC_SMPTE_240 = 7, /**< SMPTE 240 M */ + AOM_CICP_MC_SMPTE_YCGCO = 8, /**< YCgCo */ + AOM_CICP_MC_BT_2020_NCL = + 9, /**< BT.2020 non-constant luminance, BT.2100 YCbCr */ + AOM_CICP_MC_BT_2020_CL = 10, /**< BT.2020 constant luminance */ + AOM_CICP_MC_SMPTE_2085 = 11, /**< SMPTE ST 2085 YDzDx */ + AOM_CICP_MC_CHROMAT_NCL = + 12, /**< Chromaticity-derived non-constant luminance */ + AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */ + AOM_CICP_MC_ICTCP = 14, /**< BT.2100 ICtCp */ + AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */ +} aom_matrix_coefficients_t; + +/*!\brief List of supported color range */ +typedef enum aom_color_range { + AOM_CR_STUDIO_RANGE = 0, /**<- Y [16..235], UV [16..240] (bit depth 8) */ + /**<- Y [64..940], UV [64..960] (bit depth 10) */ + /**<- Y [256..3760], UV [256..3840] (bit depth 12) */ + AOM_CR_FULL_RANGE = 1 /**<- YUV/RGB [0..255] (bit depth 8) */ + /**<- YUV/RGB [0..1023] (bit depth 10) */ + /**<- YUV/RGB [0..4095] (bit depth 12) */ +} aom_color_range_t; /**< alias for enum aom_color_range */ + +/*!\brief List of chroma sample positions */ +typedef enum aom_chroma_sample_position { + AOM_CSP_UNKNOWN = 0, /**< Unknown */ + AOM_CSP_VERTICAL = 1, /**< Horizontally co-located with luma(0, 0)*/ + /**< sample, between two vertical samples */ + AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */ + AOM_CSP_RESERVED = 3 /**< Reserved value */ +} aom_chroma_sample_position_t; /**< alias for enum aom_transfer_function */ + +/*!\brief List of insert flags for Metadata + * + * These flags control how the library treats metadata during 
encode. + * + * While encoding, when metadata is added to an aom_image via + * aom_img_add_metadata(), the flag passed along with the metadata will + * determine where the metadata OBU will be placed in the encoded OBU stream. + * Metadata will be emitted into the output stream within the next temporal unit + * if it satisfies the specified insertion flag. + * + * During decoding, when the library encounters a metadata OBU, it is always + * flagged as AOM_MIF_ANY_FRAME and emitted with the next output aom_image. + */ +typedef enum aom_metadata_insert_flags { + AOM_MIF_NON_KEY_FRAME = 0, /**< Adds metadata if it's not keyframe */ + AOM_MIF_KEY_FRAME = 1, /**< Adds metadata only if it's a keyframe */ + AOM_MIF_ANY_FRAME = 2 /**< Adds metadata to any type of frame */ +} aom_metadata_insert_flags_t; + +/*!\brief Array of aom_metadata structs for an image. */ +typedef struct aom_metadata_array aom_metadata_array_t; + +/*!\brief Metadata payload. */ +typedef struct aom_metadata { + uint32_t type; /**< Metadata type */ + uint8_t *payload; /**< Metadata payload data */ + size_t sz; /**< Metadata payload size */ + aom_metadata_insert_flags_t insert_flag; /**< Metadata insertion flag */ +} aom_metadata_t; + +/**\brief Image Descriptor */ +typedef struct aom_image { + aom_img_fmt_t fmt; /**< Image Format */ + aom_color_primaries_t cp; /**< CICP Color Primaries */ + aom_transfer_characteristics_t tc; /**< CICP Transfer Characteristics */ + aom_matrix_coefficients_t mc; /**< CICP Matrix Coefficients */ + int monochrome; /**< Whether image is monochrome */ + aom_chroma_sample_position_t csp; /**< chroma sample position */ + aom_color_range_t range; /**< Color Range */ + + /* Image storage dimensions */ + unsigned int w; /**< Stored image width */ + unsigned int h; /**< Stored image height */ + unsigned int bit_depth; /**< Stored image bit-depth */ + + /* Image display dimensions */ + unsigned int d_w; /**< Displayed image width */ + unsigned int d_h; /**< Displayed image height */ + + /* Image intended rendering dimensions */ + unsigned int r_w; /**< Intended rendering image width */ + unsigned int r_h; /**< Intended rendering image height */ + + /* Chroma subsampling info */ + unsigned int x_chroma_shift; /**< subsampling order, X */ + unsigned int y_chroma_shift; /**< subsampling order, Y */ + +/* Image data pointers. */ +#define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */ +#define AOM_PLANE_Y 0 /**< Y (Luminance) plane */ +#define AOM_PLANE_U 1 /**< U (Chroma) plane */ +#define AOM_PLANE_V 2 /**< V (Chroma) plane */ + /* planes[AOM_PLANE_V] = NULL and stride[AOM_PLANE_V] = 0 when fmt == + * AOM_IMG_FMT_NV12 */ + unsigned char *planes[3]; /**< pointer to the top left pixel for each plane */ + int stride[3]; /**< stride between rows for each plane */ + size_t sz; /**< data size */ + + int bps; /**< bits per sample (for packed formats) */ + + int temporal_id; /**< Temporal layer Id of image */ + int spatial_id; /**< Spatial layer Id of image */ + + /*!\brief The following member may be set by the application to associate + * data with this image. + */ + void *user_priv; + + /* The following members should be treated as private. */ + unsigned char *img_data; /**< private */ + int img_data_owner; /**< private */ + int self_allocd; /**< private */ + + aom_metadata_array_t + *metadata; /**< Metadata payloads associated with the image. */ + + void *fb_priv; /**< Frame buffer data associated with the image. 
*/ +} aom_image_t; /**< alias for struct aom_image */ + +/*!\brief Open a descriptor, allocating storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for the image is allocated on the heap. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image (stride). + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align); + +/*!\brief Open a descriptor, using existing storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for the image has been allocated elsewhere, and a descriptor is + * desired to "wrap" that storage. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of each row in the image + * (stride). + * \param[in] img_data Storage to use for the image + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int align, + unsigned char *img_data); + +/*!\brief Open a descriptor, allocating storage for the underlying image with a + * border + * + * Returns a descriptor for storing an image of the given format and its + * borders. The storage for the image is allocated on the heap. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image (stride). + * \param[in] size_align Alignment, in pixels, of the image width and height. + * \param[in] border A border that is padded on four sides of the image. + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + unsigned int size_align, + unsigned int border); + +/*!\brief Set the rectangle identifying the displayed portion of the image + * + * Updates the displayed rectangle (aka viewport) on the image surface to + * match the specified coordinates and size. Specifically, sets img->d_w, + * img->d_h, and elements of the img->planes[] array. + * + * \param[in] img Image descriptor + * \param[in] x leftmost column + * \param[in] y topmost row + * \param[in] w width + * \param[in] h height + * \param[in] border A border that is padded on four sides of the image. 
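An illustrative sketch (not upstream code) for the allocation routine above: allocate an 8-bit I420 image, write into the luma plane through planes[]/stride[], then release it with aom_img_free(), which is declared a little further below; <string.h> is assumed for memset().

/* Sketch only: heap-allocated 640x480 I420 image, 32-byte alignment. */
aom_image_t *img = aom_img_alloc(NULL, AOM_IMG_FMT_I420, 640, 480, 32);
if (img != NULL) {
  for (unsigned int row = 0; row < img->d_h; ++row) {
    unsigned char *y_row =
        img->planes[AOM_PLANE_Y] + (size_t)row * img->stride[AOM_PLANE_Y];
    memset(y_row, 0x80, img->d_w); /* arbitrary mid-grey luma value */
  }
  aom_img_free(img); /* declared below */
}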
+ * + * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise. + */ +int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h, unsigned int border); + +/*!\brief Flip the image vertically (top for bottom) + * + * Adjusts the image descriptor's pointers and strides to make the image + * be referenced upside-down. + * + * \param[in] img Image descriptor + */ +void aom_img_flip(aom_image_t *img); + +/*!\brief Close an image descriptor + * + * Frees all allocated storage associated with an image descriptor. + * + * \param[in] img Image descriptor + */ +void aom_img_free(aom_image_t *img); + +/*!\brief Get the width of a plane + * + * Get the width of a plane of an image + * + * \param[in] img Image descriptor + * \param[in] plane Plane index + */ +int aom_img_plane_width(const aom_image_t *img, int plane); + +/*!\brief Get the height of a plane + * + * Get the height of a plane of an image + * + * \param[in] img Image descriptor + * \param[in] plane Plane index + */ +int aom_img_plane_height(const aom_image_t *img, int plane); + +/*!\brief Add metadata to image. + * + * Adds metadata to aom_image_t. + * Function makes a copy of the provided data parameter. + * Metadata insertion point is controlled by insert_flag. + * + * \param[in] img Image descriptor + * \param[in] type Metadata type + * \param[in] data Metadata contents + * \param[in] sz Metadata contents size + * \param[in] insert_flag Metadata insert flag + * + * \return Returns 0 on success. If img or data is NULL, sz is 0, or memory + * allocation fails, it returns -1. + */ +int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data, + size_t sz, aom_metadata_insert_flags_t insert_flag); + +/*!\brief Return a metadata payload stored within the image metadata array. + * + * Gets the metadata (aom_metadata_t) at the indicated index in the image + * metadata array. + * + * \param[in] img Pointer to image descriptor to get metadata from + * \param[in] index Metadata index to get from metadata array + * + * \return Returns a const pointer to the selected metadata, if img and/or index + * is invalid, it returns NULL. + */ +const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img, + size_t index); + +/*!\brief Return the number of metadata blocks within the image. + * + * Gets the number of metadata blocks contained within the provided image + * metadata array. + * + * \param[in] img Pointer to image descriptor to get metadata number + * from. + * + * \return Returns the size of the metadata array. If img or metadata is NULL, + * it returns 0. + */ +size_t aom_img_num_metadata(const aom_image_t *img); + +/*!\brief Remove metadata from image. + * + * Removes all metadata in image metadata list and sets metadata list pointer + * to NULL. + * + * \param[in] img Image descriptor + */ +void aom_img_remove_metadata(aom_image_t *img); + +/*!\brief Allocate memory for aom_metadata struct. + * + * Allocates storage for the metadata payload, sets its type and copies the + * payload data into the aom_metadata struct. A metadata payload buffer of size + * sz is allocated and sz bytes are copied from data into the payload buffer. + * + * \param[in] type Metadata type + * \param[in] data Metadata data pointer + * \param[in] sz Metadata size + * \param[in] insert_flag Metadata insert flag + * + * \return Returns the newly allocated aom_metadata struct. If data is NULL, + * sz is 0, or memory allocation fails, it returns NULL. 
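A small sketch (not upstream code) of attaching metadata with the functions above; the payload bytes are arbitrary, and the numeric type 4 corresponds to ITU-T T.35 metadata in the AV1 specification.

/* Sketch only: img is an aom_image_t * obtained elsewhere; <stdio.h> is
 * assumed for the diagnostic printf(). */
const uint8_t t35_payload[] = { 0xB5, 0x00, 0x3C }; /* arbitrary bytes */
if (aom_img_add_metadata(img, 4 /* ITU-T T.35 */, t35_payload,
                         sizeof(t35_payload), AOM_MIF_ANY_FRAME) != 0) {
  /* NULL img/data, zero size, or allocation failure */
}
printf("metadata blocks attached: %zu\n", aom_img_num_metadata(img));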
+ */ +aom_metadata_t *aom_img_metadata_alloc(uint32_t type, const uint8_t *data, + size_t sz, + aom_metadata_insert_flags_t insert_flag); + +/*!\brief Free metadata struct. + * + * Free metadata struct and its buffer. + * + * \param[in] metadata Metadata struct pointer + */ +void aom_img_metadata_free(aom_metadata_t *metadata); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_IMAGE_H_ diff --git a/third_party/aom/aom/aom_integer.h b/third_party/aom/aom/aom_integer.h new file mode 100644 index 0000000000..ce65e98452 --- /dev/null +++ b/third_party/aom/aom/aom_integer.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOM_INTEGER_H_ +#define AOM_AOM_AOM_INTEGER_H_ + +/* get ptrdiff_t, size_t, wchar_t, NULL */ +#include // IWYU pragma: export + +#if defined(_MSC_VER) +#define AOM_FORCE_INLINE __forceinline +#define AOM_INLINE __inline +#else +#define AOM_FORCE_INLINE __inline__ __attribute__((always_inline)) +#define AOM_INLINE inline +#endif + +/* Assume platforms have the C99 standard integer types. */ + +#if defined(__cplusplus) +#if !defined(__STDC_FORMAT_MACROS) +#define __STDC_FORMAT_MACROS +#endif +#if !defined(__STDC_LIMIT_MACROS) +#define __STDC_LIMIT_MACROS +#endif +#endif // __cplusplus + +#include // IWYU pragma: export +#include // IWYU pragma: export + +#if defined(__cplusplus) +extern "C" { +#endif // __cplusplus + +// Returns size of uint64_t when encoded using LEB128. +size_t aom_uleb_size_in_bytes(uint64_t value); + +// Returns 0 on success, -1 on decode failure. +// On success, 'value' stores the decoded LEB128 value and 'length' stores +// the number of bytes decoded. +int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value, + size_t *length); + +// Encodes LEB128 integer. Returns 0 when successful, and -1 upon failure. +int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value, + size_t *coded_size); + +// Encodes LEB128 integer to size specified. Returns 0 when successful, and -1 +// upon failure. +// Note: This will write exactly pad_to_size bytes; if the value cannot be +// encoded in this many bytes, then this will fail. +int aom_uleb_encode_fixed_size(uint64_t value, size_t available, + size_t pad_to_size, uint8_t *coded_value, + size_t *coded_size); + +#if defined(__cplusplus) +} // extern "C" +#endif // __cplusplus + +#endif // AOM_AOM_AOM_INTEGER_H_ diff --git a/third_party/aom/aom/aomcx.h b/third_party/aom/aom/aomcx.h new file mode 100644 index 0000000000..edd8cd5e7c --- /dev/null +++ b/third_party/aom/aom/aomcx.h @@ -0,0 +1,2205 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOMCX_H_ +#define AOM_AOM_AOMCX_H_ + +/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder + * \ingroup aom + * + * @{ + */ +#include "aom/aom.h" +#include "aom/aom_encoder.h" +#include "aom/aom_external_partition.h" + +/*!\file + * \brief Provides definitions for using AOM or AV1 encoder algorithm within the + * aom Codec Interface. + * + * Several interfaces are excluded with CONFIG_REALTIME_ONLY build: + * Global motion + * Warped motion + * OBMC + * TPL model + * Loop restoration + * + * The following features are also disabled with CONFIG_REALTIME_ONLY: + * AV1E_SET_QUANT_B_ADAPT + * CNN + * 4X rectangular blocks + * 4X rectangular transform in intra prediction + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\name Algorithm interface for AV1 + * + * This interface provides the capability to encode raw AV1 streams. + *@{ + */ + +/*!\brief A single instance of the AV1 encoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer aom_codec_av1_cx(). + */ +extern aom_codec_iface_t aom_codec_av1_cx_algo; + +/*!\brief The interface to the AV1 encoder. + */ +extern aom_codec_iface_t *aom_codec_av1_cx(void); +/*!@} - end algorithm interface member group */ + +/* + * Algorithm Flags + */ + +/*!\brief Don't reference the last frame + * + * When this flag is set, the encoder will not use the last frame as a + * predictor. When not set, the encoder will choose whether to use the + * last frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_LAST (1 << 16) +/*!\brief Don't reference the last2 frame + * + * When this flag is set, the encoder will not use the last2 frame as a + * predictor. When not set, the encoder will choose whether to use the + * last2 frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_LAST2 (1 << 17) +/*!\brief Don't reference the last3 frame + * + * When this flag is set, the encoder will not use the last3 frame as a + * predictor. When not set, the encoder will choose whether to use the + * last3 frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_LAST3 (1 << 18) +/*!\brief Don't reference the golden frame + * + * When this flag is set, the encoder will not use the golden frame as a + * predictor. When not set, the encoder will choose whether to use the + * golden frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_GF (1 << 19) + +/*!\brief Don't reference the alternate reference frame + * + * When this flag is set, the encoder will not use the alt ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * alt ref frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_ARF (1 << 20) +/*!\brief Don't reference the bwd reference frame + * + * When this flag is set, the encoder will not use the bwd ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * bwd ref frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_BWD (1 << 21) +/*!\brief Don't reference the alt2 reference frame + * + * When this flag is set, the encoder will not use the alt2 ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * alt2 ref frame or not automatically. 
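For illustration (a sketch, not upstream code), flags such as the ones above are passed per frame through aom_codec_encode(); ctx, img and pts are assumed to come from the surrounding application.

/* Sketch only: encode one frame while forbidding the golden and ARF buffers
 * as prediction references for this frame. */
const aom_enc_frame_flags_t frame_flags =
    AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF;
if (aom_codec_encode(&ctx, img, pts, 1 /* duration */, frame_flags) !=
    AOM_CODEC_OK) {
  /* inspect aom_codec_error_detail(&ctx) */
}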
+ */ +#define AOM_EFLAG_NO_REF_ARF2 (1 << 22) + +/*!\brief Don't update the last frame + * + * When this flag is set, the encoder will not update the last frame with + * the contents of the current frame. + */ +#define AOM_EFLAG_NO_UPD_LAST (1 << 23) + +/*!\brief Don't update the golden frame + * + * When this flag is set, the encoder will not update the golden frame with + * the contents of the current frame. + */ +#define AOM_EFLAG_NO_UPD_GF (1 << 24) + +/*!\brief Don't update the alternate reference frame + * + * When this flag is set, the encoder will not update the alt ref frame with + * the contents of the current frame. + */ +#define AOM_EFLAG_NO_UPD_ARF (1 << 25) +/*!\brief Disable entropy update + * + * When this flag is set, the encoder will not update its internal entropy + * model based on the entropy of this frame. + */ +#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 26) +/*!\brief Disable ref frame mvs + * + * When this flag is set, the encoder will not allow frames to + * be encoded using mfmv. + */ +#define AOM_EFLAG_NO_REF_FRAME_MVS (1 << 27) +/*!\brief Enable error resilient frame + * + * When this flag is set, the encoder will code frames as error + * resilient. + */ +#define AOM_EFLAG_ERROR_RESILIENT (1 << 28) +/*!\brief Enable s frame mode + * + * When this flag is set, the encoder will code frames as an + * s frame. + */ +#define AOM_EFLAG_SET_S_FRAME (1 << 29) +/*!\brief Force primary_ref_frame to PRIMARY_REF_NONE + * + * When this flag is set, the encoder will set a frame's primary_ref_frame + * to PRIMARY_REF_NONE + */ +#define AOM_EFLAG_SET_PRIMARY_REF_NONE (1 << 30) + +/*!\brief AVx encoder control functions + * + * This set of macros define the control functions available for AVx + * encoder interface. + * The range of encode control ID is 7-229(max). + * + * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) + */ +enum aome_enc_control_id { + /*!\brief Codec control function to set which reference frame encoder can use, + * int parameter. + */ + AOME_USE_REFERENCE = 7, + + /*!\brief Codec control function to pass an ROI map to encoder, aom_roi_map_t* + * parameter. + */ + AOME_SET_ROI_MAP = 8, + + /*!\brief Codec control function to pass an Active map to encoder, + * aom_active_map_t* parameter. + */ + AOME_SET_ACTIVEMAP = 9, + + /* NOTE: enum 10 unused */ + + /*!\brief Codec control function to set encoder scaling mode for the next + * frame to be coded, aom_scaling_mode_t* parameter. + */ + AOME_SET_SCALEMODE = 11, + + /*!\brief Codec control function to set encoder spatial layer id, int + * parameter. + */ + AOME_SET_SPATIAL_LAYER_ID = 12, + + /*!\brief Codec control function to set encoder internal speed settings, + * int parameter + * + * Changes in this value influences the complexity of algorithms used in + * encoding process, values greater than 0 will increase encoder speed at + * the expense of quality. + * + * Valid range: 0..11. 0 runs the slowest, and 11 runs the fastest; + * quality improves as speed decreases (since more compression + * possibilities are explored). + * + * NOTE: 10 and 11 are only allowed in AOM_USAGE_REALTIME. In + * AOM_USAGE_GOOD_QUALITY and AOM_USAGE_ALL_INTRA, 9 is the highest allowed + * value. However, AOM_USAGE_GOOD_QUALITY treats 7..9 the same as 6. Also, + * AOM_USAGE_REALTIME treats 0..4 the same as 5. 
+ */ + AOME_SET_CPUUSED = 13, + + /*!\brief Codec control function to enable automatic set and use alf frames, + * unsigned int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AOME_SET_ENABLEAUTOALTREF = 14, + + /* NOTE: enum 15 unused */ + + /*!\brief Codec control function to set the sharpness parameter, + * unsigned int parameter. + * + * This parameter controls the level at which rate-distortion optimization of + * transform coefficients favours sharpness in the block. + * + * Valid range: 0..7. The default is 0. Values 1-7 will avoid eob and skip + * block optimization and will change rdmult in favour of block sharpness. + */ + AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2, // 16 + + /*!\brief Codec control function to set the threshold for MBs treated static, + * unsigned int parameter + */ + AOME_SET_STATIC_THRESHOLD = 17, + + /* NOTE: enum 18 unused */ + + /*!\brief Codec control function to get last quantizer chosen by the encoder, + * int* parameter + * + * Return value uses internal quantizer scale defined by the codec. + */ + AOME_GET_LAST_QUANTIZER = AOME_SET_STATIC_THRESHOLD + 2, // 19 + + /*!\brief Codec control function to get last quantizer chosen by the encoder, + * int* parameter + * + * Return value uses the 0..63 scale as used by the rc_*_quantizer config + * parameters. + */ + AOME_GET_LAST_QUANTIZER_64 = 20, + + /*!\brief Codec control function to set the max no of frames to create arf, + * unsigned int parameter + */ + AOME_SET_ARNR_MAXFRAMES = 21, + + /*!\brief Codec control function to set the filter strength for the arf, + * unsigned int parameter + */ + AOME_SET_ARNR_STRENGTH = 22, + + /* NOTE: enum 23 unused */ + + /*!\brief Codec control function to set visual tuning, aom_tune_metric (int) + * parameter + * + * The default is AOM_TUNE_PSNR. + */ + AOME_SET_TUNING = AOME_SET_ARNR_STRENGTH + 2, // 24 + + /*!\brief Codec control function to set constrained / constant quality level, + * unsigned int parameter + * + * Valid range: 0..63 + * + * \attention For this value to be used aom_codec_enc_cfg_t::rc_end_usage + * must be set to #AOM_CQ or #AOM_Q. + */ + AOME_SET_CQ_LEVEL = 25, + + /*!\brief Codec control function to set max data rate for intra frames, + * unsigned int parameter + * + * This value controls additional clamping on the maximum size of a + * keyframe. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * unlimited, or no additional clamping beyond the codec's built-in + * algorithm. + * + * For example, to allocate no more than 4.5 frames worth of bitrate + * to a keyframe, set this to 450. + */ + AOME_SET_MAX_INTRA_BITRATE_PCT = 26, + + /*!\brief Codec control function to set number of spatial layers, int + * parameter + */ + AOME_SET_NUMBER_SPATIAL_LAYERS = 27, + + /*!\brief Codec control function to set max data rate for inter frames, + * unsigned int parameter + * + * This value controls additional clamping on the maximum size of an + * inter frame. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * unlimited, or no additional clamping beyond the codec's built-in + * algorithm. + * + * For example, to allow no more than 4.5 frames worth of bitrate + * to an inter frame, set this to 450. 
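A minimal sketch (not upstream code) of the control calls described above, made after aom_codec_enc_init(); the CQ level only takes effect if rc_end_usage was set to AOM_Q or AOM_CQ in the encoder configuration.

/* Sketch only: speed 6, constant-quality level 30. */
if (aom_codec_control(&ctx, AOME_SET_CPUUSED, 6) != AOM_CODEC_OK ||
    aom_codec_control(&ctx, AOME_SET_CQ_LEVEL, 30) != AOM_CODEC_OK) {
  /* handle the control error */
}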
+ */ + AV1E_SET_MAX_INTER_BITRATE_PCT = AOME_SET_MAX_INTRA_BITRATE_PCT + 2, // 28 + + /*!\brief Boost percentage for Golden Frame in CBR mode, unsigned int + * parameter + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * than average frame, set this to 100. + */ + AV1E_SET_GF_CBR_BOOST_PCT = 29, + + /* NOTE: enum 30 unused */ + + /*!\brief Codec control function to set lossless encoding mode, unsigned int + * parameter + * + * AV1 can operate in lossless encoding mode, in which the bitstream + * produced will be able to decode and reconstruct a perfect copy of + * input source. + * + * - 0 = normal coding mode, may be lossy (default) + * - 1 = lossless coding mode + */ + AV1E_SET_LOSSLESS = AV1E_SET_GF_CBR_BOOST_PCT + 2, // 31 + + /*!\brief Codec control function to enable the row based multi-threading + * of the encoder, unsigned int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ROW_MT = 32, + + /*!\brief Codec control function to set number of tile columns. unsigned int + * parameter + * + * In encoding and decoding, AV1 allows an input image frame be partitioned + * into separate vertical tile columns, which can be encoded or decoded + * independently. This enables easy implementation of parallel encoding and + * decoding. The parameter for this control describes the number of tile + * columns (in log2 units), which has a valid range of [0, 6]: + * \verbatim + 0 = 1 tile column + 1 = 2 tile columns + 2 = 4 tile columns + ..... + n = 2**n tile columns + \endverbatim + * By default, the value is 0, i.e. one single column tile for entire image. + */ + AV1E_SET_TILE_COLUMNS = 33, + + /*!\brief Codec control function to set number of tile rows, unsigned int + * parameter + * + * In encoding and decoding, AV1 allows an input image frame be partitioned + * into separate horizontal tile rows, which can be encoded or decoded + * independently. The parameter for this control describes the number of tile + * rows (in log2 units), which has a valid range of [0, 6]: + * \verbatim + 0 = 1 tile row + 1 = 2 tile rows + 2 = 4 tile rows + ..... + n = 2**n tile rows + \endverbatim + * By default, the value is 0, i.e. one single row tile for entire image. + */ + AV1E_SET_TILE_ROWS = 34, + + /*!\brief Codec control function to enable RDO modulated by frame temporal + * dependency, unsigned int parameter + * + * - 0 = disable + * - 1 = enable (default) + * + * \note Excluded from CONFIG_REALTIME_ONLY build. + */ + AV1E_SET_ENABLE_TPL_MODEL = 35, + + /*!\brief Codec control function to enable temporal filtering on key frame, + * unsigned int parameter + * + * - 0 = disable + * - 1 = enable without overlay (default) + * - 2 = enable with overlay + */ + AV1E_SET_ENABLE_KEYFRAME_FILTERING = 36, + + /*!\brief Codec control function to enable frame parallel decoding feature, + * unsigned int parameter + * + * AV1 has a bitstream feature to reduce decoding dependency between frames + * by turning off backward update of probability context used in encoding + * and decoding. This allows staged parallel processing of more than one + * video frames in the decoder. 
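As a sketch of the tile controls above (not upstream code), note that both take log2 values:

/* Sketch only: 2**2 = 4 tile columns, 2**1 = 2 tile rows, with row-based
 * multi-threading left enabled. */
aom_codec_control(&ctx, AV1E_SET_TILE_COLUMNS, 2);
aom_codec_control(&ctx, AV1E_SET_TILE_ROWS, 1);
aom_codec_control(&ctx, AV1E_SET_ROW_MT, 1);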
This control function provides a means to + * turn this feature on or off for bitstreams produced by encoder. + * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_FRAME_PARALLEL_DECODING = 37, + + /*!\brief Codec control function to enable error_resilient_mode, int parameter + * + * AV1 has a bitstream feature to guarantee parsability of a frame + * by turning on the error_resilient_decoding mode, even though the + * reference buffers are unreliable or not received. + * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_ERROR_RESILIENT_MODE = 38, + + /*!\brief Codec control function to enable s_frame_mode, int parameter + * + * AV1 has a bitstream feature to designate certain frames as S-frames, + * from where we can switch to a different stream, + * even though the reference buffers may not be exactly identical. + * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_S_FRAME_MODE = 39, + + /*!\brief Codec control function to set adaptive quantization mode, unsigned + * int parameter + * + * AV1 has a segment based feature that allows encoder to adaptively change + * quantization parameter for each segment within a frame to improve the + * subjective quality. This control makes encoder operate in one of the + * several AQ modes supported. + * + * - 0 = disable (default) + * - 1 = variance + * - 2 = complexity + * - 3 = cyclic refresh + */ + AV1E_SET_AQ_MODE = 40, + + /*!\brief Codec control function to enable/disable periodic Q boost, unsigned + * int parameter + * + * One AV1 encoder speed feature is to enable quality boost by lowering + * frame level Q periodically. This control function provides a means to + * turn on/off this feature. + * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_FRAME_PERIODIC_BOOST = 41, + + /*!\brief Codec control function to set noise sensitivity, unsigned int + * parameter + * + * - 0 = disable (default) + * - 1 = enable (Y only) + */ + AV1E_SET_NOISE_SENSITIVITY = 42, + + /*!\brief Codec control function to set content type, aom_tune_content + * parameter + * + * - AOM_CONTENT_DEFAULT = Regular video content (default) + * - AOM_CONTENT_SCREEN = Screen capture content + * - AOM_CONTENT_FILM = Film content + */ + AV1E_SET_TUNE_CONTENT = 43, + + /*!\brief Codec control function to set CDF update mode, unsigned int + * parameter + * + * - 0: no update + * - 1: update on every frame (default) + * - 2: selectively update + */ + AV1E_SET_CDF_UPDATE_MODE = 44, + + /*!\brief Codec control function to set color space info, int parameter + * + * - 0 = For future use + * - 1 = BT.709 + * - 2 = Unspecified (default) + * - 3 = For future use + * - 4 = BT.470 System M (historical) + * - 5 = BT.470 System B, G (historical) + * - 6 = BT.601 + * - 7 = SMPTE 240 + * - 8 = Generic film (color filters using illuminant C) + * - 9 = BT.2020, BT.2100 + * - 10 = SMPTE 428 (CIE 1921 XYZ) + * - 11 = SMPTE RP 431-2 + * - 12 = SMPTE EG 432-1 + * - 13..21 = For future use + * - 22 = EBU Tech. 
3213-E + * - 23 = For future use + */ + AV1E_SET_COLOR_PRIMARIES = 45, + + /*!\brief Codec control function to set transfer function info, int parameter + * + * - 0 = For future use + * - 1 = BT.709 + * - 2 = Unspecified (default) + * - 3 = For future use + * - 4 = BT.470 System M (historical) + * - 5 = BT.470 System B, G (historical) + * - 6 = BT.601 + * - 7 = SMPTE 240 M + * - 8 = Linear + * - 9 = Logarithmic (100 : 1 range) + * - 10 = Logarithmic (100 * Sqrt(10) : 1 range) + * - 11 = IEC 61966-2-4 + * - 12 = BT.1361 + * - 13 = sRGB or sYCC + * - 14 = BT.2020 10-bit systems + * - 15 = BT.2020 12-bit systems + * - 16 = SMPTE ST 2084, ITU BT.2100 PQ + * - 17 = SMPTE ST 428 + * - 18 = BT.2100 HLG, ARIB STD-B67 + * - 19 = For future use + */ + AV1E_SET_TRANSFER_CHARACTERISTICS = 46, + + /*!\brief Codec control function to set transfer function info, int parameter + * + * - 0 = Identity matrix + * - 1 = BT.709 + * - 2 = Unspecified (default) + * - 3 = For future use + * - 4 = US FCC 73.628 + * - 5 = BT.470 System B, G (historical) + * - 6 = BT.601 + * - 7 = SMPTE 240 M + * - 8 = YCgCo + * - 9 = BT.2020 non-constant luminance, BT.2100 YCbCr + * - 10 = BT.2020 constant luminance + * - 11 = SMPTE ST 2085 YDzDx + * - 12 = Chromaticity-derived non-constant luminance + * - 13 = Chromaticity-derived constant luminance + * - 14 = BT.2100 ICtCp + * - 15 = For future use + */ + AV1E_SET_MATRIX_COEFFICIENTS = 47, + + /*!\brief Codec control function to set chroma 4:2:0 sample position info, + * aom_chroma_sample_position_t parameter + * + * AOM_CSP_UNKNOWN is default + */ + AV1E_SET_CHROMA_SAMPLE_POSITION = 48, + + /*!\brief Codec control function to set minimum interval between GF/ARF + * frames, unsigned int parameter + * + * By default the value is set as 4. + */ + AV1E_SET_MIN_GF_INTERVAL = 49, + + /*!\brief Codec control function to set minimum interval between GF/ARF + * frames, unsigned int parameter + * + * By default the value is set as 16. + */ + AV1E_SET_MAX_GF_INTERVAL = 50, + + /*!\brief Codec control function to get an active map back from the encoder, + aom_active_map_t* parameter + */ + AV1E_GET_ACTIVEMAP = 51, + + /*!\brief Codec control function to set color range bit, int parameter + * + * - 0 = Limited range, 16..235 or HBD equivalent (default) + * - 1 = Full range, 0..255 or HBD equivalent + */ + AV1E_SET_COLOR_RANGE = 52, + + /*!\brief Codec control function to set intended rendering image size, + * int32_t[2] parameter + * + * By default, this is identical to the image size in pixels. + */ + AV1E_SET_RENDER_SIZE = 53, + + /*!\brief Control to set target sequence level index for a certain operating + * point (OP), int parameter + * Possible values are in the form of "ABxy". + * - AB: OP index. + * - xy: Target level index for the OP. Possible values are: + * + 0~27: corresponding to level 2.0 ~ 8.3. Note: + * > Levels 2.2 (2), 2.3 (3), 3.2 (6), 3.3 (7), 4.2 (10) & 4.3 (11) are + * undefined. + * > Levels 7.x and 8.x (20~27) are in draft status, available under the + * config flag CONFIG_CWG_C013. + * + 31: maximum parameters level, no level-based constraints. + * + 32: keep level stats only for level monitoring. + * + * E.g.: + * - "0" means target level index 0 (2.0) for the 0th OP; + * - "109" means target level index 9 (4.1) for the 1st OP; + * - "1019" means target level index 19 (6.3) for the 10th OP. + * + * If the target level is not specified for an OP, the maximum parameters + * level of 31 is used as default. 
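A short sketch (not upstream code) combining the color-description and level controls described above; the value 109 follows the "ABxy" encoding, i.e. level index 9 (4.1) for operating point 1.

/* Sketch only: declare full-range BT.709 content and a level target. */
aom_codec_control(&ctx, AV1E_SET_COLOR_PRIMARIES, AOM_CICP_CP_BT_709);
aom_codec_control(&ctx, AV1E_SET_TRANSFER_CHARACTERISTICS, AOM_CICP_TC_BT_709);
aom_codec_control(&ctx, AV1E_SET_MATRIX_COEFFICIENTS, AOM_CICP_MC_BT_709);
aom_codec_control(&ctx, AV1E_SET_COLOR_RANGE, AOM_CR_FULL_RANGE);
aom_codec_control(&ctx, AV1E_SET_TARGET_SEQ_LEVEL_IDX, 109); /* OP 1, 4.1 */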
+ */ + AV1E_SET_TARGET_SEQ_LEVEL_IDX = 54, + + /*!\brief Codec control function to get sequence level index for each + * operating point. int* parameter. There can be at most 32 operating points. + * The results will be written into a provided integer array of sufficient + * size. + */ + AV1E_GET_SEQ_LEVEL_IDX = 55, + + /*!\brief Codec control function to set intended superblock size, unsigned int + * parameter + * + * By default, the superblock size is determined separately for each + * frame by the encoder. + */ + AV1E_SET_SUPERBLOCK_SIZE = 56, + + /*!\brief Codec control function to enable automatic set and use of + * bwd-pred frames, unsigned int parameter + * + * - 0 = disable (default) + * - 1 = enable + */ + AOME_SET_ENABLEAUTOBWDREF = 57, + + /*!\brief Codec control function to encode with CDEF, unsigned int parameter + * + * CDEF is the constrained directional enhancement filter which is an + * in-loop filter aiming to remove coding artifacts + * + * - 0 = disable + * - 1 = enable for all frames (default) + * - 2 = disable for non-reference frames + */ + AV1E_SET_ENABLE_CDEF = 58, + + /*!\brief Codec control function to encode with Loop Restoration Filter, + * unsigned int parameter + * + * - 0 = disable + * - 1 = enable (default) + * + * \note Excluded from CONFIG_REALTIME_ONLY build. + */ + AV1E_SET_ENABLE_RESTORATION = 59, + + /*!\brief Codec control function to force video mode, unsigned int parameter + * + * - 0 = do not force video mode (default) + * - 1 = force video mode even for a single frame + */ + AV1E_SET_FORCE_VIDEO_MODE = 60, + + /*!\brief Codec control function to predict with OBMC mode, unsigned int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + * + * \note Excluded from CONFIG_REALTIME_ONLY build. + */ + AV1E_SET_ENABLE_OBMC = 61, + + /*!\brief Codec control function to encode without trellis quantization, + * unsigned int parameter + * + * - 0 = apply trellis quantization (default) + * - 1 = do not apply trellis quantization + * - 2 = disable trellis quantization in rd search + * - 3 = disable trellis quantization in estimate yrd + */ + AV1E_SET_DISABLE_TRELLIS_QUANT = 62, + + /*!\brief Codec control function to encode with quantisation matrices, + * unsigned int parameter + * + * AOM can operate with default quantisation matrices dependent on + * quantisation level and block type. + * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_ENABLE_QM = 63, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the minimum level of flatness from which the matrices + * are determined. + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_MIN = 64, + + /*!\brief Codec control function to set the max quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the maximum level of flatness possible. + * + * By default, the encoder sets this maximum at the top of the + * available range. + */ + AV1E_SET_QM_MAX = 65, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. 
This + * control sets the flatness for luma (Y). + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_Y = 66, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the flatness for chroma (U). + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_U = 67, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the flatness for chrome (V). + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_V = 68, + + /* NOTE: enum 69 unused */ + + /*!\brief Codec control function to set a maximum number of tile groups, + * unsigned int parameter + * + * This will set the maximum number of tile groups. This will be + * overridden if an MTU size is set. The default value is 1. + */ + AV1E_SET_NUM_TG = 70, + + /*!\brief Codec control function to set an MTU size for a tile group, unsigned + * int parameter + * + * This will set the maximum number of bytes in a tile group. This can be + * exceeded only if a single tile is larger than this amount. + * + * By default, the value is 0, in which case a fixed number of tile groups + * is used. + */ + AV1E_SET_MTU = 71, + + /* NOTE: enum 72 unused */ + + /*!\brief Codec control function to enable/disable rectangular partitions, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_RECT_PARTITIONS = 73, + + /*!\brief Codec control function to enable/disable AB partitions, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_AB_PARTITIONS = 74, + + /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_1TO4_PARTITIONS = 75, + + /*!\brief Codec control function to set min partition size, int parameter + * + * min_partition_size is applied to both width and height of the partition. + * i.e, both width and height of a partition can not be smaller than + * the min_partition_size, except the partition at the picture boundary. + * + * Valid values: [4, 8, 16, 32, 64, 128]. The default value is 4 for + * 4x4. + */ + AV1E_SET_MIN_PARTITION_SIZE = 76, + + /*!\brief Codec control function to set max partition size, int parameter + * + * max_partition_size is applied to both width and height of the partition. + * i.e, both width and height of a partition can not be larger than + * the max_partition_size. + * + * Valid values:[4, 8, 16, 32, 64, 128] The default value is 128 for + * 128x128. + */ + AV1E_SET_MAX_PARTITION_SIZE = 77, + + /*!\brief Codec control function to turn on / off intra edge filter + * at sequence level, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTRA_EDGE_FILTER = 78, + + /*!\brief Codec control function to turn on / off frame order hint (int + * parameter). 
Affects: joint compound mode, motion field motion vector, + * ref frame sign bias + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_ORDER_HINT = 79, + + /*!\brief Codec control function to turn on / off 64-length transforms, int + * parameter + * + * This will enable or disable usage of length 64 transforms in any + * direction. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_TX64 = 80, + + /*!\brief Codec control function to turn on / off flip and identity + * transforms, int parameter + * + * This will enable or disable usage of flip and identity transform + * types in any direction. If enabled, this includes: + * - FLIPADST_DCT + * - DCT_FLIPADST + * - FLIPADST_FLIPADST + * - ADST_FLIPADST + * - FLIPADST_ADST + * - IDTX + * - V_DCT + * - H_DCT + * - V_ADST + * - H_ADST + * - V_FLIPADST + * - H_FLIPADST + * + * Valid values: + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_FLIP_IDTX = 81, + + /*!\brief Codec control function to turn on / off rectangular transforms, int + * parameter + * + * This will enable or disable usage of rectangular transforms. NOTE: + * Rectangular transforms only enabled when corresponding rectangular + * partitions are. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_RECT_TX = 82, + + /*!\brief Codec control function to turn on / off dist-wtd compound mode + * at sequence level, int parameter + * + * This will enable or disable distance-weighted compound mode. + * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced + * to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_DIST_WTD_COMP = 83, + + /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage + * at sequence level, int parameter + * + * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced + * to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_REF_FRAME_MVS = 84, + + /*!\brief Codec control function to set temporal mv prediction + * enabling/disabling at frame level, int parameter + * + * \attention If AV1E_SET_ENABLE_REF_FRAME_MVS is 0, then this flag is + * forced to 0. 
+ * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ALLOW_REF_FRAME_MVS = 85, + + /*!\brief Codec control function to turn on / off dual interpolation filter + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable + */ + AV1E_SET_ENABLE_DUAL_FILTER = 86, + + /*!\brief Codec control function to turn on / off delta quantization in chroma + * planes for a sequence, int parameter + * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_ENABLE_CHROMA_DELTAQ = 87, + + /*!\brief Codec control function to turn on / off masked compound usage + * (wedge and diff-wtd compound modes) for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_MASKED_COMP = 88, + + /*!\brief Codec control function to turn on / off one sided compound usage + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_ONESIDED_COMP = 89, + + /*!\brief Codec control function to turn on / off interintra compound + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTERINTRA_COMP = 90, + + /*!\brief Codec control function to turn on / off smooth inter-intra + * mode for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_SMOOTH_INTERINTRA = 91, + + /*!\brief Codec control function to turn on / off difference weighted + * compound, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_DIFF_WTD_COMP = 92, + + /*!\brief Codec control function to turn on / off interinter wedge + * compound, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTERINTER_WEDGE = 93, + + /*!\brief Codec control function to turn on / off interintra wedge + * compound, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTERINTRA_WEDGE = 94, + + /*!\brief Codec control function to turn on / off global motion usage + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + * + * \note Excluded from CONFIG_REALTIME_ONLY build. + */ + AV1E_SET_ENABLE_GLOBAL_MOTION = 95, + + /*!\brief Codec control function to turn on / off warped motion usage + * at sequence level, int parameter + * + * - 0 = disable + * - 1 = enable (default) + * + * \note Excluded from CONFIG_REALTIME_ONLY build. + */ + AV1E_SET_ENABLE_WARPED_MOTION = 96, + + /*!\brief Codec control function to turn on / off warped motion usage + * at frame level, int parameter + * + * \attention If AV1E_SET_ENABLE_WARPED_MOTION is 0, then this flag is + * forced to 0. + * + * - 0 = disable + * - 1 = enable (default) + * + * \note Excluded from CONFIG_REALTIME_ONLY build. + */ + AV1E_SET_ALLOW_WARPED_MOTION = 97, + + /*!\brief Codec control function to turn on / off filter intra usage at + * sequence level, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_FILTER_INTRA = 98, + + /*!\brief Codec control function to turn on / off smooth intra modes usage, + * int parameter + * + * This will enable or disable usage of smooth, smooth_h and smooth_v intra + * modes. 
+ * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_SMOOTH_INTRA = 99, + + /*!\brief Codec control function to turn on / off Paeth intra mode usage, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_PAETH_INTRA = 100, + + /*!\brief Codec control function to turn on / off CFL uv intra mode usage, int + * parameter + * + * This will enable or disable usage of chroma-from-luma intra mode. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_CFL_INTRA = 101, + + /*!\brief Codec control function to turn on / off frame superresolution, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_SUPERRES = 102, + + /*!\brief Codec control function to turn on / off overlay frames for + * filtered ALTREF frames, int parameter + * + * This will enable or disable coding of overlay frames for filtered ALTREF + * frames. When set to 0, overlay frames are not used but show existing frame + * is used to display the filtered ALTREF frame as is. As a result the decoded + * frame rate remains the same as the display frame rate. The default is 1. + */ + AV1E_SET_ENABLE_OVERLAY = 103, + + /*!\brief Codec control function to turn on/off palette mode, int parameter */ + AV1E_SET_ENABLE_PALETTE = 104, + + /*!\brief Codec control function to turn on/off intra block copy mode, int + parameter */ + AV1E_SET_ENABLE_INTRABC = 105, + + /*!\brief Codec control function to turn on/off intra angle delta, int + parameter */ + AV1E_SET_ENABLE_ANGLE_DELTA = 106, + + /*!\brief Codec control function to set the delta q mode, unsigned int + * parameter + * + * AV1 supports a delta q mode feature, that allows modulating q per + * superblock. + * + * - 0 = deltaq signaling off + * - 1 = use modulation to maximize objective quality (default) + * - 2 = use modulation for local test + * - 3 = use modulation for key frame perceptual quality optimization + * - 4 = use modulation for user rating based perceptual quality optimization + */ + AV1E_SET_DELTAQ_MODE = 107, + + /*!\brief Codec control function to turn on/off loopfilter modulation + * when delta q modulation is enabled, unsigned int parameter. + * + * \attention AV1 only supports loopfilter modulation when delta q + * modulation is enabled as well. + */ + AV1E_SET_DELTALF_MODE = 108, + + /*!\brief Codec control function to set the single tile decoding mode, + * unsigned int parameter + * + * \attention Only applicable if large scale tiling is on. + * + * - 0 = single tile decoding is off + * - 1 = single tile decoding is on (default) + */ + AV1E_SET_SINGLE_TILE_DECODING = 109, + + /*!\brief Codec control function to enable the extreme motion vector unit + * test, unsigned int parameter + * + * - 0 = off + * - 1 = MAX_EXTREME_MV + * - 2 = MIN_EXTREME_MV + * + * \note This is only used in motion vector unit test. + */ + AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST = 110, + + /*!\brief Codec control function to signal picture timing info in the + * bitstream, aom_timing_info_type_t parameter. Default is + * AOM_TIMING_UNSPECIFIED. 
+ */ + AV1E_SET_TIMING_INFO_TYPE = 111, + + /*!\brief Codec control function to add film grain parameters (one of several + * preset types) info in the bitstream, int parameter + * + Valid range: 0..16, 0 is unknown, 1..16 are test vectors + */ + AV1E_SET_FILM_GRAIN_TEST_VECTOR = 112, + + /*!\brief Codec control function to set the path to the film grain parameters, + * const char* parameter + */ + AV1E_SET_FILM_GRAIN_TABLE = 113, + + /*!\brief Sets the noise level, int parameter */ + AV1E_SET_DENOISE_NOISE_LEVEL = 114, + + /*!\brief Sets the denoisers block size, unsigned int parameter */ + AV1E_SET_DENOISE_BLOCK_SIZE = 115, + + /*!\brief Sets the chroma subsampling x value, unsigned int parameter */ + AV1E_SET_CHROMA_SUBSAMPLING_X = 116, + + /*!\brief Sets the chroma subsampling y value, unsigned int parameter */ + AV1E_SET_CHROMA_SUBSAMPLING_Y = 117, + + /*!\brief Control to use a reduced tx type set, int parameter */ + AV1E_SET_REDUCED_TX_TYPE_SET = 118, + + /*!\brief Control to use dct only for intra modes, int parameter */ + AV1E_SET_INTRA_DCT_ONLY = 119, + + /*!\brief Control to use dct only for inter modes, int parameter */ + AV1E_SET_INTER_DCT_ONLY = 120, + + /*!\brief Control to use default tx type only for intra modes, int parameter + */ + AV1E_SET_INTRA_DEFAULT_TX_ONLY = 121, + + /*!\brief Control to use adaptive quantize_b, int parameter */ + AV1E_SET_QUANT_B_ADAPT = 122, + + /*!\brief Control to select maximum height for the GF group pyramid structure, + * unsigned int parameter + * + * Valid range: 0..5 + */ + AV1E_SET_GF_MAX_PYRAMID_HEIGHT = 123, + + /*!\brief Control to select maximum reference frames allowed per frame, int + * parameter + * + * Valid range: 3..7 + */ + AV1E_SET_MAX_REFERENCE_FRAMES = 124, + + /*!\brief Control to use reduced set of single and compound references, int + parameter */ + AV1E_SET_REDUCED_REFERENCE_SET = 125, + + /*!\brief Control to set frequency of the cost updates for coefficients, + * unsigned int parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_COEFF_COST_UPD_FREQ = 126, + + /*!\brief Control to set frequency of the cost updates for mode, unsigned int + * parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_MODE_COST_UPD_FREQ = 127, + + /*!\brief Control to set frequency of the cost updates for motion vectors, + * unsigned int parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_MV_COST_UPD_FREQ = 128, + + /*!\brief Control to set bit mask that specifies which tier each of the 32 + * possible operating points conforms to, unsigned int parameter + * + * - 0 = main tier (default) + * - 1 = high tier + */ + AV1E_SET_TIER_MASK = 129, + + /*!\brief Control to set minimum compression ratio, unsigned int parameter + * Take integer values. If non-zero, encoder will try to keep the compression + * ratio of each frame to be higher than the given value divided by 100. + * E.g. 850 means minimum compression ratio of 8.5. 
+ */ + AV1E_SET_MIN_CR = 130, + + /* NOTE: enums 145-149 unused */ + + /*!\brief Codec control function to set the layer id, aom_svc_layer_id_t* + * parameter + */ + AV1E_SET_SVC_LAYER_ID = 131, + + /*!\brief Codec control function to set SVC parameters, aom_svc_params_t* + * parameter + */ + AV1E_SET_SVC_PARAMS = 132, + + /*!\brief Codec control function to set reference frame config: + * the ref_idx and the refresh flags for each buffer slot. + * aom_svc_ref_frame_config_t* parameter + */ + AV1E_SET_SVC_REF_FRAME_CONFIG = 133, + + /*!\brief Codec control function to set the path to the VMAF model used when + * tuning the encoder for VMAF, const char* parameter + */ + AV1E_SET_VMAF_MODEL_PATH = 134, + + /*!\brief Codec control function to enable EXT_TILE_DEBUG in AV1 encoder, + * unsigned int parameter + * + * - 0 = disable (default) + * - 1 = enable + * + * \note This is only used in lightfield example test. + */ + AV1E_ENABLE_EXT_TILE_DEBUG = 135, + + /*!\brief Codec control function to enable the superblock multipass unit test + * in AV1 to ensure that the encoder does not leak state between different + * passes. unsigned int parameter. + * + * - 0 = disable (default) + * - 1 = enable + * + * \note This is only used in sb_multipass unit test. + */ + AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 136, + + /*!\brief Control to select minimum height for the GF group pyramid structure, + * unsigned int parameter + * + * Valid values: 0..5 + */ + AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 137, + + /*!\brief Control to set average complexity of the corpus in the case of + * single pass vbr based on LAP, unsigned int parameter + */ + AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP = 138, + + /*!\brief Control to get baseline gf interval + */ + AV1E_GET_BASELINE_GF_INTERVAL = 139, + + /*\brief Control to set encoding the denoised frame from denoise-noise-level + * + * - 0 = disabled/encode the original frame + * - 1 = enabled/encode the denoised frame (default) + */ + AV1E_SET_ENABLE_DNL_DENOISING = 140, + + /*!\brief Codec control function to turn on / off D45 to D203 intra mode + * usage, int parameter + * + * This will enable or disable usage of D45 to D203 intra modes, which are a + * subset of directional modes. This control has no effect if directional + * modes are disabled (AV1E_SET_ENABLE_DIRECTIONAL_INTRA set to 0). + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_DIAGONAL_INTRA = 141, + + /*!\brief Control to set frequency of the cost updates for intrabc motion + * vectors, unsigned int parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_DV_COST_UPD_FREQ = 142, + + /*!\brief Codec control to set the path for partition stats read and write. + * const char * parameter. + */ + AV1E_SET_PARTITION_INFO_PATH = 143, + + /*!\brief Codec control to use an external partition model + * A set of callback functions is passed through this control + * to let the encoder encode with given partitions. + */ + AV1E_SET_EXTERNAL_PARTITION = 144, + + /*!\brief Codec control function to turn on / off directional intra mode + * usage, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_DIRECTIONAL_INTRA = 145, + + /*!\brief Control to turn on / off transform size search. + * Note: it can not work with non RD pick mode in real-time encoding, + * where the max transform size is only 16x16. + * It will be ignored if non RD pick mode is set. 
+ * + * - 0 = disable, transforms always have the largest possible size + * - 1 = enable, search for the best transform size for each block (default) + */ + AV1E_SET_ENABLE_TX_SIZE_SEARCH = 146, + + /*!\brief Codec control function to set reference frame compound prediction. + * aom_svc_ref_frame_comp_pred_t* parameter + */ + AV1E_SET_SVC_REF_FRAME_COMP_PRED = 147, + + /*!\brief Set --deltaq-mode strength. + * + * Valid range: [0, 1000] + */ + AV1E_SET_DELTAQ_STRENGTH = 148, + + /*!\brief Codec control to control loop filter + * + * - 0 = Loop filter is disabled for all frames + * - 1 = Loop filter is enabled for all frames + * - 2 = Loop filter is disabled for non-reference frames + * - 3 = Loop filter is disabled for the frames with low motion + */ + AV1E_SET_LOOPFILTER_CONTROL = 149, + + /*!\brief Codec control function to get the loopfilter chosen by the encoder, + * int* parameter + */ + AOME_GET_LOOPFILTER_LEVEL = 150, + + /*!\brief Codec control to automatically turn off several intra coding tools, + * unsigned int parameter + * - 0 = do not use the feature + * - 1 = enable the automatic decision to turn off several intra tools + */ + AV1E_SET_AUTO_INTRA_TOOLS_OFF = 151, + + /*!\brief Codec control function to set flag for rate control used by external + * encoders. + * - 1 = Enable rate control for external encoders. This will disable content + * dependency in rate control and cyclic refresh. + * - 0 = Default. Disable rate control for external encoders. + */ + AV1E_SET_RTC_EXTERNAL_RC = 152, + + /*!\brief Codec control function to enable frame parallel multi-threading + * of the encoder, unsigned int parameter + * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_FP_MT = 153, + + /*!\brief Codec control to enable actual frame parallel encode or + * simulation of frame parallel encode in FPMT unit test, unsigned int + * parameter + * + * - 0 = simulate frame parallel encode + * - 1 = actual frame parallel encode (default) + * + * \note This is only used in FPMT unit test. + */ + AV1E_SET_FP_MT_UNIT_TEST = 154, + + /*!\brief Codec control function to get the target sequence level index for + * each operating point. int* parameter. There can be at most 32 operating + * points. The results will be written into a provided integer array of + * sufficient size. If a target level is not set, the result will be 31. + * Please refer to https://aomediacodec.github.io/av1-spec/#levels for more + * details on level definitions and indices. + */ + AV1E_GET_TARGET_SEQ_LEVEL_IDX = 155, + + /*!\brief Codec control function to get the number of operating points. int* + * parameter. + */ + AV1E_GET_NUM_OPERATING_POINTS = 156, + + /*!\brief Codec control function to skip the application of post-processing + * filters on reconstructed frame, unsigned int parameter + * + * - 0 = disable (default) + * - 1 = enable + * + * \attention For this value to be used aom_codec_enc_cfg_t::g_usage + * must be set to AOM_USAGE_ALL_INTRA. + */ + AV1E_SET_SKIP_POSTPROC_FILTERING = 157, + + /*!\brief Codec control function to enable the superblock level + * qp sweep in AV1 to ensure that end-to-end test runs well, + * unsigned int parameter. + * + * - 0 = disable (default) + * - 1 = enable + * + * \note This is only used in sb_qp_sweep unit test. + */ + AV1E_ENABLE_SB_QP_SWEEP = 158, + + /*!\brief Codec control to set quantizer for the next frame, int parameter. + * + * - Valid range [0, 63] + * + * This will turn off cyclic refresh. Only applicable to 1-pass. 
+ */ + AV1E_SET_QUANTIZER_ONE_PASS = 159, + + /*!\brief Codec control to enable the rate distribution guided delta + * quantization in all intra mode, unsigned int parameter + * + * - 0 = disable (default) + * - 1 = enable + * + * \attention This feature requires --deltaq-mode=3, also an input file + * which contains rate distribution for each 16x16 block, + * passed in by --rate-distribution-info=rate_distribution.txt. + */ + AV1E_ENABLE_RATE_GUIDE_DELTAQ = 160, + + /*!\brief Codec control to set the input file for rate distribution used + * in all intra mode, const char * parameter + * The input should be the name of a text file, which + * contains (rows x cols) float values separated by space. + * Each float value represent the number of bits for each 16x16 block. + * rows = (frame_height + 15) / 16 + * cols = (frame_width + 15) / 16 + * + * \attention This feature requires --enable-rate-guide-deltaq=1. + */ + AV1E_SET_RATE_DISTRIBUTION_INFO = 161, + + /*!\brief Codec control to get the CDEF strength for Y / luma plane, + * int * parameter. + * Returns an integer array of CDEF_MAX_STRENGTHS elements. + */ + AV1E_GET_LUMA_CDEF_STRENGTH = 162, + + /*!\brief Codec control to set the target bitrate in kilobits per second, + * unsigned int parameter. For 1 pass CBR mode, single layer encoding. + * This controls replaces the call aom_codec_enc_config_set(&codec, &cfg) + * when only target bitrate is changed, and so is much cheaper as it + * bypasses a lot of unneeded code checks. + */ + AV1E_SET_BITRATE_ONE_PASS_CBR = 163, + + /*!\brief Codec control to set the maximum number of consecutive frame drops + * allowed for the frame dropper in 1 pass CBR mode, int parameter. Value of + * zero has no effect. + */ + AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164, + + /*!\brief Codec control to set the frame drop mode for SVC, + * unsigned int parameter. The valid values are constants of the + * AOM_SVC_FRAME_DROP_MODE enum: AOM_LAYER_DROP or AOM_FULL_SUPERFRAME_DROP. + */ + AV1E_SET_SVC_FRAME_DROP_MODE = 165, + + // Any new encoder control IDs should be added above. + // Maximum allowed encoder control ID is 229. + // No encoder control ID should be added below. +}; + +/*!\brief aom 1-D scaling mode + * + * This set of constants define 1-D aom scaling modes + */ +typedef enum aom_scaling_mode_1d { + AOME_NORMAL = 0, + AOME_FOURFIVE = 1, + AOME_THREEFIVE = 2, + AOME_THREEFOUR = 3, + AOME_ONEFOUR = 4, + AOME_ONEEIGHT = 5, + AOME_ONETWO = 6, + AOME_TWOTHREE = 7, + AOME_ONETHREE = 8 +} AOM_SCALING_MODE; + +/*!\brief Max number of segments + * + * This is the limit of number of segments allowed within a frame. + * + * Currently same as "MAX_SEGMENTS" in AV1, the maximum that AV1 supports. + * + */ +#define AOM_MAX_SEGMENTS 8 + +/*!\brief aom region of interest map + * + * These defines the data structures for the region of interest map + * + * TODO(yaowu): create a unit test for ROI map related APIs + * + */ +typedef struct aom_roi_map { + /*! An id between 0 and 7 for each 8x8 region within a frame. */ + unsigned char *roi_map; + unsigned int rows; /**< Number of rows. */ + unsigned int cols; /**< Number of columns. */ + int delta_q[AOM_MAX_SEGMENTS]; /**< Quantizer deltas. */ + int delta_lf[AOM_MAX_SEGMENTS]; /**< Loop filter deltas. */ + /*! Static breakout threshold for each segment. 
*/ + unsigned int static_threshold[AOM_MAX_SEGMENTS]; +} aom_roi_map_t; + +/*!\brief aom active region map + * + * These defines the data structures for active region map + * + */ + +typedef struct aom_active_map { + /*!\brief specify an on (1) or off (0) each 16x16 region within a frame */ + unsigned char *active_map; + unsigned int rows; /**< number of rows */ + unsigned int cols; /**< number of cols */ +} aom_active_map_t; + +/*!\brief aom image scaling mode + * + * This defines the data structure for image scaling mode + * + */ +typedef struct aom_scaling_mode { + AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */ + AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */ +} aom_scaling_mode_t; + +/*!brief AV1 encoder content type */ +typedef enum { + AOM_CONTENT_DEFAULT, + AOM_CONTENT_SCREEN, + AOM_CONTENT_FILM, + AOM_CONTENT_INVALID +} aom_tune_content; + +/*!brief AV1 encoder timing info type signaling */ +typedef enum { + AOM_TIMING_UNSPECIFIED, + AOM_TIMING_EQUAL, + AOM_TIMING_DEC_MODEL +} aom_timing_info_type_t; + +/*!\brief Model tuning parameters + * + * Changes the encoder to tune for certain types of input material. + * + */ +typedef enum { + AOM_TUNE_PSNR = 0, + AOM_TUNE_SSIM = 1, + /* NOTE: enums 2 and 3 unused */ + AOM_TUNE_VMAF_WITH_PREPROCESSING = 4, + AOM_TUNE_VMAF_WITHOUT_PREPROCESSING = 5, + AOM_TUNE_VMAF_MAX_GAIN = 6, + AOM_TUNE_VMAF_NEG_MAX_GAIN = 7, + AOM_TUNE_BUTTERAUGLI = 8, + AOM_TUNE_VMAF_SALIENCY_MAP = 9, +} aom_tune_metric; + +/*!\brief Distortion metric to use for RD optimization. + * + * Changes the encoder to use a different distortion metric for RD search. Note + * that this value operates on a "lower level" compared to aom_tune_metric - it + * affects the distortion metric inside a block, while aom_tune_metric only + * affects RD across blocks. + * + */ +typedef enum { + // Use PSNR for in-block rate-distortion optimization. + AOM_DIST_METRIC_PSNR, + // Use quantization matrix-weighted PSNR for in-block rate-distortion + // optimization. If --enable-qm=1 is not specified, this falls back to + // behaving in the same way as AOM_DIST_METRIC_PSNR. + AOM_DIST_METRIC_QM_PSNR, +} aom_dist_metric; + +#define AOM_MAX_LAYERS 32 /**< Max number of layers */ +#define AOM_MAX_SS_LAYERS 4 /**< Max number of spatial layers */ +#define AOM_MAX_TS_LAYERS 8 /**< Max number of temporal layers */ + +/*!brief Struct for spatial and temporal layer ID */ +typedef struct aom_svc_layer_id { + int spatial_layer_id; /**< Spatial layer ID */ + int temporal_layer_id; /**< Temporal layer ID */ +} aom_svc_layer_id_t; + +/*!brief Parameter type for SVC + * + * In the arrays of size AOM_MAX_LAYERS, the index for spatial layer `sl` and + * temporal layer `tl` is sl * number_temporal_layers + tl. + * + */ +typedef struct aom_svc_params { + int number_spatial_layers; /**< Number of spatial layers */ + int number_temporal_layers; /**< Number of temporal layers */ + int max_quantizers[AOM_MAX_LAYERS]; /**< Max Q for each layer */ + int min_quantizers[AOM_MAX_LAYERS]; /**< Min Q for each layer */ + int scaling_factor_num[AOM_MAX_SS_LAYERS]; /**< Scaling factor-numerator */ + int scaling_factor_den[AOM_MAX_SS_LAYERS]; /**< Scaling factor-denominator */ + /*! Target bitrate for each layer, in kilobits per second */ + int layer_target_bitrate[AOM_MAX_LAYERS]; + /*! 
Frame rate factor for each temporal layer */ + int framerate_factor[AOM_MAX_TS_LAYERS]; +} aom_svc_params_t; + +/*!brief Parameters for setting ref frame config */ +typedef struct aom_svc_ref_frame_config { + // 7 references: The index 0 - 6 refers to the references: + // last(0), last2(1), last3(2), golden(3), bwdref(4), altref2(5), altref(6). + int reference[7]; /**< Reference flag for each of the 7 references. */ + /*! Buffer slot index for each of 7 references indexed above. */ + int ref_idx[7]; + int refresh[8]; /**< Refresh flag for each of the 8 slots. */ +} aom_svc_ref_frame_config_t; + +/*!brief Parameters for setting ref frame compound prediction */ +typedef struct aom_svc_ref_frame_comp_pred { + // Use compound prediction for the ref_frame pairs GOLDEN_LAST (0), + // LAST2_LAST (1), and ALTREF_LAST (2). + int use_comp_pred[3]; /**= 256. + * + * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) + */ +enum aom_dec_control_id { + /*!\brief Codec control function to get info on which reference frames were + * updated by the last decode, int* parameter + */ + AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START, + + /*!\brief Codec control function to check if the indicated frame is + corrupted, int* parameter + */ + AOMD_GET_FRAME_CORRUPTED, + + /*!\brief Codec control function to get info on which reference frames were + * used by the last decode, int* parameter + */ + AOMD_GET_LAST_REF_USED, + + /*!\brief Codec control function to get the dimensions that the current + * frame is decoded at, int* parameter + * + * This may be different to the intended display size for the frame as + * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). + */ + AV1D_GET_FRAME_SIZE, + + /*!\brief Codec control function to get the current frame's intended display + * dimensions (as specified in the wrapper or frame header), int* parameter + * + * This may be different to the decoded dimensions of this frame (see + * AV1D_GET_FRAME_SIZE). + */ + AV1D_GET_DISPLAY_SIZE, + + /*!\brief Codec control function to get the bit depth of the stream, + * unsigned int* parameter + */ + AV1D_GET_BIT_DEPTH, + + /*!\brief Codec control function to get the image format of the stream, + * aom_img_fmt_t* parameter + */ + AV1D_GET_IMG_FORMAT, + + /*!\brief Codec control function to get the size of the tile, unsigned int* + * parameter + */ + AV1D_GET_TILE_SIZE, + + /*!\brief Codec control function to get the tile count in a tile list, + * unsigned int* parameter + */ + AV1D_GET_TILE_COUNT, + + /*!\brief Codec control function to set the byte alignment of the planes in + * the reference buffers, int parameter + * + * Valid values are power of 2, from 32 to 1024. A value of 0 sets + * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly + * follows Y plane, and V plane directly follows U plane. Default value is 0. + */ + AV1_SET_BYTE_ALIGNMENT, + + /*!\brief Codec control function to invert the decoding order to from right to + * left, int parameter + * + * The function is used in a test to confirm the decoding independence of tile + * columns. The function may be used in application where this order + * of decoding is desired. int parameter + * + * TODO(yaowu): Rework the unit test that uses this control, and in a future + * release, this test-only control shall be removed. + */ + AV1_INVERT_TILE_DECODE_ORDER, + + /*!\brief Codec control function to set the skip loop filter flag, int + * parameter + * + * Valid values are integers. 
The decoder will skip the loop filter + * when its value is set to nonzero. If the loop filter is skipped the + * decoder may accumulate decode artifacts. The default value is 0. + */ + AV1_SET_SKIP_LOOP_FILTER, + + /*!\brief Codec control function to retrieve a pointer to the Accounting + * struct, takes Accounting** as parameter + * + * If called before a frame has been decoded, this returns AOM_CODEC_ERROR. + * The caller should ensure that AOM_CODEC_OK is returned before attempting + * to dereference the Accounting pointer. + * + * \attention When configured with -DCONFIG_ACCOUNTING=0, the default, this + * returns AOM_CODEC_INCAPABLE. + */ + AV1_GET_ACCOUNTING, + + /*!\brief Codec control function to get last decoded frame quantizer, + * int* parameter + * + * Returned value uses internal quantizer scale defined by the codec. + */ + AOMD_GET_LAST_QUANTIZER, + + /*!\brief Codec control function to set the range of tile decoding, int + * parameter + * + * A value that is greater and equal to zero indicates only the specific + * row/column is decoded. A value that is -1 indicates the whole row/column + * is decoded. A special case is both values are -1 that means the whole + * frame is decoded. + */ + AV1_SET_DECODE_TILE_ROW, + AV1_SET_DECODE_TILE_COL, + + /*!\brief Codec control function to set the tile coding mode, unsigned int + * parameter + * + * - 0 = tiles are coded in normal tile mode + * - 1 = tiles are coded in large-scale tile mode + */ + AV1_SET_TILE_MODE, + + /*!\brief Codec control function to get the frame header information of an + * encoded frame, aom_tile_data* parameter + */ + AV1D_GET_FRAME_HEADER_INFO, + + /*!\brief Codec control function to get the start address and size of a + * tile in the coded bitstream, aom_tile_data* parameter. + */ + AV1D_GET_TILE_DATA, + + /*!\brief Codec control function to set the external references' pointers in + * the decoder, av1_ext_ref_frame_t* parameter. + * + * This is used while decoding the tile list OBU in large-scale tile coding + * mode. + */ + AV1D_SET_EXT_REF_PTR, + + /*!\brief Codec control function to enable the ext-tile software debug and + * testing code in the decoder, unsigned int parameter + */ + AV1D_EXT_TILE_DEBUG, + + /*!\brief Codec control function to enable the row based multi-threading of + * decoding, unsigned int parameter + * + * - 0 = disabled + * - 1 = enabled (default) + */ + AV1D_SET_ROW_MT, + + /*!\brief Codec control function to indicate whether bitstream is in + * Annex-B format, unsigned int parameter + */ + AV1D_SET_IS_ANNEXB, + + /*!\brief Codec control function to indicate which operating point to use, + * int parameter + * + * A scalable stream may define multiple operating points, each of which + * defines a set of temporal and spatial layers to be processed. The + * operating point index may take a value between 0 and + * operating_points_cnt_minus_1 (which is at most 31). + */ + AV1D_SET_OPERATING_POINT, + + /*!\brief Codec control function to indicate whether to output one frame per + * temporal unit (the default), or one frame per spatial layer, int parameter + * + * In a scalable stream, each temporal unit corresponds to a single "frame" + * of video, and within a temporal unit there may be multiple spatial layers + * with different versions of that frame. + * For video playback, only the highest-quality version (within the + * selected operating point) is needed, but for some use cases it is useful + * to have access to multiple versions of a frame when they are available. 
+ */ + AV1D_SET_OUTPUT_ALL_LAYERS, + + /*!\brief Codec control function to set an aom_inspect_cb callback that is + * invoked each time a frame is decoded, aom_inspect_init* parameter + * + * \attention When configured with -DCONFIG_INSPECTION=0, the default, this + * returns AOM_CODEC_INCAPABLE. + */ + AV1_SET_INSPECTION_CALLBACK, + + /*!\brief Codec control function to set the skip film grain flag, int + * parameter + * + * Valid values are integers. The decoder will skip the film grain when its + * value is set to nonzero. The default value is 0. + */ + AV1D_SET_SKIP_FILM_GRAIN, + + /*!\brief Codec control function to check the presence of forward key frames, + * int* parameter + */ + AOMD_GET_FWD_KF_PRESENT, + + /*!\brief Codec control function to get the frame flags of the previous frame + * decoded, int* parameter + * + * This will return a flag of type aom_codec_frame_flags_t. + */ + AOMD_GET_FRAME_FLAGS, + + /*!\brief Codec control function to check the presence of altref frames, int* + * parameter + */ + AOMD_GET_ALTREF_PRESENT, + + /*!\brief Codec control function to get tile information of the previous frame + * decoded, aom_tile_info* parameter + * + * This will return a struct of type aom_tile_info. + */ + AOMD_GET_TILE_INFO, + + /*!\brief Codec control function to get screen content tools information, + * aom_screen_content_tools_info* parameter + * + * It returns a struct of type aom_screen_content_tools_info, which contains + * the header flags allow_screen_content_tools, allow_intrabc, and + * force_integer_mv. + */ + AOMD_GET_SCREEN_CONTENT_TOOLS_INFO, + + /*!\brief Codec control function to get the still picture coding information, + * aom_still_picture_info* parameter + */ + AOMD_GET_STILL_PICTURE, + + /*!\brief Codec control function to get superblock size, + * aom_superblock_size_t* parameter + * + * It returns an enum, indicating the superblock size read from the sequence + * header(0 for BLOCK_64X64 and 1 for BLOCK_128X128) + */ + AOMD_GET_SB_SIZE, + + /*!\brief Codec control function to check if the previous frame + * decoded has show existing frame flag set, int* parameter + */ + AOMD_GET_SHOW_EXISTING_FRAME_FLAG, + + /*!\brief Codec control function to get the S_FRAME coding information, + * aom_s_frame_info* parameter + */ + AOMD_GET_S_FRAME_INFO, + + /*!\brief Codec control function to get the show frame flag, int* parameter + */ + AOMD_GET_SHOW_FRAME_FLAG, + + /*!\brief Codec control function to get the base q index of a frame, int* + * parameter + */ + AOMD_GET_BASE_Q_IDX, + + /*!\brief Codec control function to get the order hint of a frame, unsigned + * int* parameter + */ + AOMD_GET_ORDER_HINT, + + /*!\brief Codec control function to get the info of a 4x4 block. + * Parameters: int mi_row, int mi_col, and MB_MODE_INFO*. + * + * \note This only returns a shallow copy, so all pointer members should not + * be used. + */ + AV1D_GET_MI_INFO, +}; + +/*!\cond */ +/*!\brief AOM decoder control function parameter type + * + * Defines the data types that AOMD control functions take. + * + * \note Additional common controls are defined in aom.h. + * + * \note For each control ID "X", a macro-define of + * AOM_CTRL_X is provided. It is used at compile time to determine + * if the control ID is supported by the libaom library available, + * when the libaom version cannot be controlled. 
+ */ +AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *) +#define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES + +AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *) +#define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED + +AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *) +#define AOM_CTRL_AOMD_GET_LAST_REF_USED + +AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *) +#define AOM_CTRL_AV1D_GET_FRAME_SIZE + +AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *) +#define AOM_CTRL_AV1D_GET_DISPLAY_SIZE + +AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *) +#define AOM_CTRL_AV1D_GET_BIT_DEPTH + +AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *) +#define AOM_CTRL_AV1D_GET_IMG_FORMAT + +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *) +#define AOM_CTRL_AV1D_GET_TILE_SIZE + +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_COUNT, unsigned int *) +#define AOM_CTRL_AV1D_GET_TILE_COUNT + +AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int) +#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER + +AOM_CTRL_USE_TYPE(AV1_SET_SKIP_LOOP_FILTER, int) +#define AOM_CTRL_AV1_SET_SKIP_LOOP_FILTER + +AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **) +#define AOM_CTRL_AV1_GET_ACCOUNTING + +AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *) +#define AOM_CTRL_AOMD_GET_LAST_QUANTIZER + +AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int) +#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW + +AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int) +#define AOM_CTRL_AV1_SET_DECODE_TILE_COL + +AOM_CTRL_USE_TYPE(AV1_SET_TILE_MODE, unsigned int) +#define AOM_CTRL_AV1_SET_TILE_MODE + +AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_HEADER_INFO, aom_tile_data *) +#define AOM_CTRL_AV1D_GET_FRAME_HEADER_INFO + +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_DATA, aom_tile_data *) +#define AOM_CTRL_AV1D_GET_TILE_DATA + +AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *) +#define AOM_CTRL_AV1D_SET_EXT_REF_PTR + +AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int) +#define AOM_CTRL_AV1D_EXT_TILE_DEBUG + +AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int) +#define AOM_CTRL_AV1D_SET_ROW_MT + +AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int) +#define AOM_CTRL_AV1D_SET_IS_ANNEXB + +AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int) +#define AOM_CTRL_AV1D_SET_OPERATING_POINT + +AOM_CTRL_USE_TYPE(AV1D_SET_OUTPUT_ALL_LAYERS, int) +#define AOM_CTRL_AV1D_SET_OUTPUT_ALL_LAYERS + +AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *) +#define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK + +AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int) +#define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN + +AOM_CTRL_USE_TYPE(AOMD_GET_FWD_KF_PRESENT, int *) +#define AOM_CTRL_AOMD_GET_FWD_KF_PRESENT + +AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_FLAGS, int *) +#define AOM_CTRL_AOMD_GET_FRAME_FLAGS + +AOM_CTRL_USE_TYPE(AOMD_GET_ALTREF_PRESENT, int *) +#define AOM_CTRL_AOMD_GET_ALTREF_PRESENT + +AOM_CTRL_USE_TYPE(AOMD_GET_TILE_INFO, aom_tile_info *) +#define AOM_CTRL_AOMD_GET_TILE_INFO + +AOM_CTRL_USE_TYPE(AOMD_GET_SCREEN_CONTENT_TOOLS_INFO, + aom_screen_content_tools_info *) +#define AOM_CTRL_AOMD_GET_SCREEN_CONTENT_TOOLS_INFO + +AOM_CTRL_USE_TYPE(AOMD_GET_STILL_PICTURE, aom_still_picture_info *) +#define AOM_CTRL_AOMD_GET_STILL_PICTURE + +AOM_CTRL_USE_TYPE(AOMD_GET_SB_SIZE, aom_superblock_size_t *) +#define AOMD_CTRL_AOMD_GET_SB_SIZE + +AOM_CTRL_USE_TYPE(AOMD_GET_SHOW_EXISTING_FRAME_FLAG, int *) +#define AOMD_CTRL_AOMD_GET_SHOW_EXISTING_FRAME_FLAG + +AOM_CTRL_USE_TYPE(AOMD_GET_S_FRAME_INFO, aom_s_frame_info *) +#define AOMD_CTRL_AOMD_GET_S_FRAME_INFO + +AOM_CTRL_USE_TYPE(AOMD_GET_SHOW_FRAME_FLAG, int *) +#define AOM_CTRL_AOMD_GET_SHOW_FRAME_FLAG + 
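The AOM_CTRL_X macros defined above exist so that applications can check at compile time whether a control is available in the libaom headers they build against; a minimal sketch of that idiom (helper name illustrative):

#include "aom/aomdx.h"

/* Illustrative sketch: only issue the control when the corresponding
 * AOM_CTRL_X macro is defined, falling back to -1 otherwise. */
static int last_quantizer_or_default(aom_codec_ctx_t *dec) {
  int q = -1;
#ifdef AOM_CTRL_AOMD_GET_LAST_QUANTIZER
  aom_codec_control(dec, AOMD_GET_LAST_QUANTIZER, &q);
#endif
  return q;
}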
+AOM_CTRL_USE_TYPE(AOMD_GET_BASE_Q_IDX, int *) +#define AOM_CTRL_AOMD_GET_BASE_Q_IDX + +AOM_CTRL_USE_TYPE(AOMD_GET_ORDER_HINT, unsigned int *) +#define AOM_CTRL_AOMD_GET_ORDER_HINT + +// The AOM_CTRL_USE_TYPE macro can't be used with AV1D_GET_MI_INFO because +// AV1D_GET_MI_INFO takes more than one parameter. +#define AOM_CTRL_AV1D_GET_MI_INFO +/*!\endcond */ +/*! @} - end defgroup aom_decoder */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOMDX_H_ diff --git a/third_party/aom/aom/exports_com b/third_party/aom/aom/exports_com new file mode 100644 index 0000000000..266e2943a3 --- /dev/null +++ b/third_party/aom/aom/exports_com @@ -0,0 +1,42 @@ +text aom_codec_build_config +text aom_codec_control +text aom_codec_destroy +text aom_codec_err_to_string +text aom_codec_error +text aom_codec_error_detail +text aom_codec_get_caps +text aom_codec_iface_name +text aom_codec_set_option +text aom_codec_version +text aom_codec_version_extra_str +text aom_codec_version_str +text aom_free +text aom_img_add_metadata +text aom_img_alloc +text aom_img_alloc_with_border +text aom_img_flip +text aom_img_free +text aom_img_get_metadata +text aom_img_metadata_array_free +text aom_img_metadata_array_alloc +text aom_img_metadata_free +text aom_img_metadata_alloc +text aom_img_num_metadata +text aom_img_plane_height +text aom_img_plane_width +text aom_img_remove_metadata +text aom_img_set_rect +text aom_img_wrap +text aom_malloc +text aom_rb_bytes_read +text aom_rb_read_bit +text aom_rb_read_literal +text aom_rb_read_uvlc +text aom_uleb_decode +text aom_uleb_encode +text aom_uleb_encode_fixed_size +text aom_uleb_size_in_bytes +text aom_wb_bytes_written +text aom_wb_write_bit +text aom_wb_write_literal +text aom_wb_write_unsigned_literal diff --git a/third_party/aom/aom/exports_dec b/third_party/aom/aom/exports_dec new file mode 100644 index 0000000000..ffff023ddd --- /dev/null +++ b/third_party/aom/aom/exports_dec @@ -0,0 +1,8 @@ +text aom_codec_dec_init_ver +text aom_codec_decode +text aom_codec_get_frame +text aom_codec_get_stream_info +text aom_codec_peek_stream_info +text aom_codec_set_frame_buffer_functions +text aom_obu_type_to_string +text aom_read_obu_header diff --git a/third_party/aom/aom/exports_enc b/third_party/aom/aom/exports_enc new file mode 100644 index 0000000000..1473d9d2b5 --- /dev/null +++ b/third_party/aom/aom/exports_enc @@ -0,0 +1,17 @@ +text aom_codec_enc_config_default +text aom_codec_enc_config_set +text aom_codec_enc_init_ver +text aom_codec_encode +text aom_codec_get_cx_data +text aom_codec_get_global_headers +text aom_codec_get_preview_frame +text aom_codec_set_cx_data_buf +text aom_film_grain_table_append +text aom_film_grain_table_free +text aom_film_grain_table_write +text aom_flat_block_finder_init +text aom_flat_block_finder_run +text aom_noise_model_init +text aom_noise_model_get_grain_parameters +text aom_noise_model_save_latest +text aom_noise_model_update diff --git a/third_party/aom/aom/exports_test b/third_party/aom/aom/exports_test new file mode 100644 index 0000000000..452a532ce6 --- /dev/null +++ b/third_party/aom/aom/exports_test @@ -0,0 +1,4 @@ +text aom_copy_metadata_to_frame_buffer +text aom_dsp_rtcd +text aom_remove_metadata_from_frame_buffer +text aom_scale_rtcd diff --git a/third_party/aom/aom/internal/aom_codec_internal.h b/third_party/aom/aom/internal/aom_codec_internal.h new file mode 100644 index 0000000000..b854a889e0 --- /dev/null +++ b/third_party/aom/aom/internal/aom_codec_internal.h @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2016, 
Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes the decoder algorithm interface for algorithm + * implementations. + * + * This file defines the private structures and data types that are only + * relevant to implementing an algorithm, as opposed to using it. + * + * To create a decoder algorithm class, an interface structure is put + * into the global namespace: + *
+ *     my_codec.c:
+ *       aom_codec_iface_t my_codec = {
+ *           "My Codec v1.0",
+ *           AOM_CODEC_ALG_ABI_VERSION,
+ *           ...
+ *       };
+ *     
+ *
+ * An application instantiates a specific decoder instance by using
+ * aom_codec_dec_init() and a pointer to the algorithm's interface structure:
+ *     <pre>
+ *     my_app.c:
+ *       extern aom_codec_iface_t my_codec;
+ *       {
+ *           aom_codec_ctx_t algo;
+ *           int threads = 4;
+ *           aom_codec_dec_cfg_t cfg = { threads, 0, 0, 1 };
+ *           res = aom_codec_dec_init(&algo, &my_codec, &cfg, 0);
+ *       }
+ *     </pre>
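Continuing the my_app.c fragment above, a minimal sketch of the decode loop that typically follows initialisation, using the public aom_codec_decode() / aom_codec_get_frame() pair (function and variable names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include "aom/aom_decoder.h"

/* Illustrative sketch: feed one coded buffer to the initialised context and
 * walk the list of decoded images it produces. */
static aom_codec_err_t decode_one(aom_codec_ctx_t *algo, const uint8_t *buf,
                                  size_t buf_sz) {
  aom_codec_err_t res = aom_codec_decode(algo, buf, buf_sz, NULL);
  if (res != AOM_CODEC_OK) return res;
  aom_codec_iter_t iter = NULL;
  const aom_image_t *img;
  while ((img = aom_codec_get_frame(algo, &iter)) != NULL) {
    (void)img; /* consume img, e.g. hand it to the display pipeline */
  }
  return AOM_CODEC_OK;
}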
+ * + * Once initialized, the instance is managed using other functions from + * the aom_codec_* family. + */ +#ifndef AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_ +#define AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_ +#include "../aom_decoder.h" +#include "../aom_encoder.h" +#include "common/args_helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_CODEC_INTERNAL_ABI_VERSION (7) /**<\hideinitializer*/ + +typedef struct aom_codec_alg_priv aom_codec_alg_priv_t; + +/*!\brief init function pointer prototype + * + * Performs algorithm-specific initialization of the decoder context. This + * function is called by aom_codec_dec_init() and aom_codec_enc_init(), so + * plugins implementing this interface may trust the input parameters to be + * properly initialized. + * + * \param[in] ctx Pointer to this instance's context + * \retval #AOM_CODEC_OK + * The input stream was recognized and decoder initialized. + * \retval #AOM_CODEC_MEM_ERROR + * Memory operation failed. + */ +typedef aom_codec_err_t (*aom_codec_init_fn_t)(aom_codec_ctx_t *ctx); + +/*!\brief destroy function pointer prototype + * + * Performs algorithm-specific destruction of the decoder context. This + * function is called by the generic aom_codec_destroy() wrapper function, + * so plugins implementing this interface may trust the input parameters + * to be properly initialized. + * + * \param[in] ctx Pointer to this instance's context + * \retval #AOM_CODEC_OK + * The input stream was recognized and decoder initialized. + * \retval #AOM_CODEC_MEM_ERROR + * Memory operation failed. + */ +typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx); + +/*!\brief parse stream info function pointer prototype + * + * Performs high level parsing of the bitstream. This function is called by the + * generic aom_codec_peek_stream_info() wrapper function, so plugins + * implementing this interface may trust the input parameters to be properly + * initialized. + * + * \param[in] data Pointer to a block of data to parse + * \param[in] data_sz Size of the data buffer + * \param[in,out] si Pointer to stream info to update. The is_annexb + * member \ref MUST be properly initialized. This + * function sets the rest of the members. + * + * \retval #AOM_CODEC_OK + * Bitstream is parsable and stream information updated + */ +typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data, + size_t data_sz, + aom_codec_stream_info_t *si); + +/*!\brief Return information about the current stream. + * + * Returns information about the stream that has been parsed during decoding. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] si Pointer to stream info to update + * + * \retval #AOM_CODEC_OK + * Bitstream is parsable and stream information updated + */ +typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx, + aom_codec_stream_info_t *si); + +/*!\brief control function pointer prototype + * + * This function is used to exchange algorithm specific data with the decoder + * instance. This can be used to implement features specific to a particular + * algorithm. 
+ * + * This function is called by the generic aom_codec_control() wrapper + * function, so plugins implementing this interface may trust the input + * parameters to be properly initialized. However, this interface does not + * provide type safety for the exchanged data or assign meanings to the + * control IDs. Those details should be specified in the algorithm's + * header file. In particular, the ctrl_id parameter is guaranteed to exist + * in the algorithm's control mapping table, and the data parameter may be NULL. + * + * + * \param[in] ctx Pointer to this instance's context + * \param[in] ctrl_id Algorithm specific control identifier + * \param[in,out] data Data to exchange with algorithm instance. + * + * \retval #AOM_CODEC_OK + * The internal state data was deserialized. + */ +typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx, + va_list ap); + +/*!\brief codec option setter function pointer prototype + * This function is used to set a codec option using a key (option name) & value + * pair. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] name A string of the option's name (key) + * \param[in] value A string of the value to be set to + * + * \retval #AOM_CODEC_OK + * The option is successfully set to the value + * \retval #AOM_CODEC_INVALID_PARAM + * The data was not valid. + */ +typedef aom_codec_err_t (*aom_codec_set_option_fn_t)(aom_codec_alg_priv_t *ctx, + const char *name, + const char *value); + +/*!\brief control function pointer mapping + * + * This structure stores the mapping between control identifiers and + * implementing functions. Each algorithm provides a list of these + * mappings. This list is searched by the aom_codec_control() + * function to determine which function to invoke. The special + * value defined by CTRL_MAP_END is used to indicate end-of-list, and must be + * present. It can be tested with the at_ctrl_map_end function. Note that + * ctrl_id values \ref MUST be non-zero. + */ +typedef const struct aom_codec_ctrl_fn_map { + int ctrl_id; + aom_codec_control_fn_t fn; +} aom_codec_ctrl_fn_map_t; + +#define CTRL_MAP_END \ + { 0, NULL } + +static AOM_INLINE int at_ctrl_map_end(aom_codec_ctrl_fn_map_t *e) { + return e->ctrl_id == 0 && e->fn == NULL; +} + +/*!\brief decode data function pointer prototype + * + * Processes a buffer of coded data. This function is called by the generic + * aom_codec_decode() wrapper function, so plugins implementing this interface + * may trust the input parameters to be properly initialized. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] data Pointer to this block of new coded data. + * \param[in] data_sz Size of the coded data, in bytes. + * + * \return Returns #AOM_CODEC_OK if the coded data was processed completely + * and future pictures can be decoded without error. Otherwise, + * see the descriptions of the other error codes in ::aom_codec_err_t + * for recoverability capabilities. + */ +typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx, + const uint8_t *data, + size_t data_sz, + void *user_priv); + +/*!\brief Decoded frames iterator + * + * Iterates over a list of the frames available for display. The iterator + * storage should be initialized to NULL to start the iteration. Iteration is + * complete when this function returns NULL. + * + * The list of available frames becomes valid upon completion of the + * aom_codec_decode call, and remains valid until the next call to + * aom_codec_decode. 
+ * + * \param[in] ctx Pointer to this instance's context + * \param[in out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. + */ +typedef aom_image_t *(*aom_codec_get_frame_fn_t)(aom_codec_alg_priv_t *ctx, + aom_codec_iter_t *iter); + +/*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libaom needs a frame buffer + * to decode the current frame and a function to be called when libaom does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libaom will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #AOM_CODEC_OK + * External frame buffers will be used by libaom. + * \retval #AOM_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #AOM_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * using external frame buffers. + * + * \note + * When decoding AV1, the application may be required to pass in at least + * #AOM_MAXIMUM_WORK_BUFFERS external frame + * buffers. + */ +typedef aom_codec_err_t (*aom_codec_set_fb_fn_t)( + aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, + aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); + +typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx, + const aom_image_t *img, + aom_codec_pts_t pts, + unsigned long duration, + aom_enc_frame_flags_t flags); +typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)( + aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter); + +typedef aom_codec_err_t (*aom_codec_enc_config_set_fn_t)( + aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg); +typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)( + aom_codec_alg_priv_t *ctx); + +typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)( + aom_codec_alg_priv_t *ctx); + +/*!\brief Decoder algorithm interface + * + * All decoders \ref MUST expose a variable of this type. 
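+ *
+ * For illustration only (an editor-added sketch; real codecs define this in
+ * their own interface files), such a definition has roughly this shape,
+ * where every my_* symbol is hypothetical:
+ *
+ *   aom_codec_iface_t my_decoder_iface = {
+ *     "my-decoder",                    // name
+ *     AOM_CODEC_INTERNAL_ABI_VERSION,  // abi_version
+ *     AOM_CODEC_CAP_DECODER,           // caps
+ *     my_init,                         // init
+ *     my_destroy,                      // destroy
+ *     my_ctrl_maps,                    // ctrl_maps
+ *     { my_peek_si, my_get_si, my_decode, my_get_frame, my_set_fb },  // dec
+ *     { 0 },                           // enc (unused by a pure decoder)
+ *     NULL,                            // set_option
+ *   };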
+ */
+struct aom_codec_iface {
+  const char *name;      /**< Identification String */
+  int abi_version;       /**< Implemented ABI version */
+  aom_codec_caps_t caps; /**< Decoder capabilities */
+  aom_codec_init_fn_t init;           /**< \copydoc ::aom_codec_init_fn_t */
+  aom_codec_destroy_fn_t destroy;     /**< \copydoc ::aom_codec_destroy_fn_t */
+  aom_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::aom_codec_ctrl_fn_map_t */
+  struct aom_codec_dec_iface {
+    aom_codec_peek_si_fn_t peek_si; /**< \copydoc ::aom_codec_peek_si_fn_t */
+    aom_codec_get_si_fn_t get_si;   /**< \copydoc ::aom_codec_get_si_fn_t */
+    aom_codec_decode_fn_t decode;   /**< \copydoc ::aom_codec_decode_fn_t */
+    aom_codec_get_frame_fn_t
+        get_frame;                   /**< \copydoc ::aom_codec_get_frame_fn_t */
+    aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */
+  } dec;
+  struct aom_codec_enc_iface {
+    int cfg_count;
+    const aom_codec_enc_cfg_t *cfgs; /**< \copydoc ::aom_codec_enc_cfg_t */
+    aom_codec_encode_fn_t encode;    /**< \copydoc ::aom_codec_encode_fn_t */
+    aom_codec_get_cx_data_fn_t
+        get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */
+    aom_codec_enc_config_set_fn_t
+        cfg_set; /**< \copydoc ::aom_codec_enc_config_set_fn_t */
+    aom_codec_get_global_headers_fn_t
+        get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */
+    aom_codec_get_preview_frame_fn_t
+        get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
+  } enc;
+  aom_codec_set_option_fn_t set_option;
+};
+
+/*!\brief Instance private storage
+ *
+ * This structure is allocated by the algorithm's init function. It can be
+ * extended in one of two ways. First, a second, algorithm specific structure
+ * can be allocated and the priv member pointed to it. Alternatively, this
+ * structure can be made the first member of the algorithm specific structure,
+ * and the pointer cast to the proper type.
+ */
+struct aom_codec_priv {
+  const char *err_detail;
+  aom_codec_flags_t init_flags;
+  struct {
+    aom_fixed_buf_t cx_data_dst_buf;
+    unsigned int cx_data_pad_before;
+    unsigned int cx_data_pad_after;
+    aom_codec_cx_pkt_t cx_data_pkt;
+  } enc;
+};
+
+#define CAST(id, arg) va_arg((arg), aom_codec_control_type_##id)
+
+/* Internal Utility Functions
+ *
+ * The following functions are intended to be used inside algorithms as
+ * utilities for manipulating aom_codec_* data structures.
+ */
+struct aom_codec_pkt_list {
+  unsigned int cnt;
+  unsigned int max;
+  struct aom_codec_cx_pkt pkts[1];
+};
+
+#define aom_codec_pkt_list_decl(n)     \
+  union {                              \
+    struct aom_codec_pkt_list head;    \
+    struct {                           \
+      struct aom_codec_pkt_list head;  \
+      struct aom_codec_cx_pkt pkts[n]; \
+    } alloc;                           \
+  }
+
+#define aom_codec_pkt_list_init(m) \
+  (m)->alloc.head.cnt = 0,         \
+  (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0])
+
+int aom_codec_pkt_list_add(struct aom_codec_pkt_list *,
+                           const struct aom_codec_cx_pkt *);
+
+const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
+    struct aom_codec_pkt_list *list, aom_codec_iter_t *iter);
+
+#include <stdio.h>
+#include <setjmp.h>
+
+struct aom_internal_error_info {
+  aom_codec_err_t error_code;
+  int has_detail;
+  char detail[ARG_ERR_MSG_MAX_LEN];
+  int setjmp;  // Boolean: whether 'jmp' is valid.
+ jmp_buf jmp; +}; + +#define CLANG_ANALYZER_NORETURN +#if defined(__has_feature) +#if __has_feature(attribute_analyzer_noreturn) +#undef CLANG_ANALYZER_NORETURN +#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn)) +#endif +#endif + +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define LIBAOM_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef LIBAOM_FORMAT_PRINTF +#define LIBAOM_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + +// Records the error code and error message. Does not call longjmp(). +void aom_set_error(struct aom_internal_error_info *info, aom_codec_err_t error, + const char *fmt, ...) LIBAOM_FORMAT_PRINTF(3, 4); + +void aom_internal_error(struct aom_internal_error_info *info, + aom_codec_err_t error, const char *fmt, ...) + LIBAOM_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN; + +// Calls aom_internal_error() with the error code and error message in `src`. +// `info` and `src` must not point to the same struct, i.e., self copy is +// prohibited. +void aom_internal_error_copy(struct aom_internal_error_info *info, + const struct aom_internal_error_info *src) + CLANG_ANALYZER_NORETURN; + +void aom_merge_corrupted_flag(int *corrupted, int value); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_ diff --git a/third_party/aom/aom/internal/aom_image_internal.h b/third_party/aom/aom/internal/aom_image_internal.h new file mode 100644 index 0000000000..1b04c9ec3f --- /dev/null +++ b/third_party/aom/aom/internal/aom_image_internal.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes the internal functions associated with the aom image + * descriptor. + * + */ +#ifndef AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ +#define AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ + +#include "aom/aom_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Array of aom_metadata structs for an image. */ +struct aom_metadata_array { + size_t sz; /* Number of metadata structs in the list */ + aom_metadata_t **metadata_array; /* Array of metadata structs */ +}; + +/*!\brief Alloc memory for aom_metadata_array struct. + * + * Allocate memory for aom_metadata_array struct. + * If sz is 0 the aom_metadata_array struct's internal buffer list will be + * NULL, but the aom_metadata_array struct itself will still be allocated. + * Returns a pointer to the allocated struct or NULL on failure. + * + * \param[in] sz Size of internal metadata list buffer + */ +aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz); + +/*!\brief Free metadata array struct. + * + * Free metadata array struct and all metadata structs inside. 
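+ *
+ * Usage sketch (editor-added illustration): the array owns its entries, so
+ * one call releases both the aom_metadata_t structs and the array itself:
+ *
+ *   aom_metadata_array_t *arr = aom_img_metadata_array_alloc(2);
+ *   if (arr) {
+ *     // ... populate arr->metadata_array[0] and [1] with structs from
+ *     //     aom_img_metadata_alloc() ...
+ *     aom_img_metadata_array_free(arr);
+ *   }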
+ * + * \param[in] arr Metadata array struct pointer + */ +void aom_img_metadata_array_free(aom_metadata_array_t *arr); + +typedef void *(*aom_alloc_img_data_cb_fn_t)(void *priv, size_t size); + +/*!\brief Open a descriptor, allocating storage for the underlying image by + * using the provided callback function. + * + * Returns a descriptor for storing an image of the given format. The storage + * for the image is allocated by using the provided callback function. Unlike + * aom_img_alloc(), the returned descriptor does not own the storage for the + * image. The caller is responsible for freeing the storage for the image. + * + * Note: If the callback function is invoked and succeeds, + * aom_img_alloc_with_cb() is guaranteed to succeed. Therefore, if + * aom_img_alloc_with_cb() fails, the caller is assured that no storage was + * allocated. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image (stride). + * \param[in] alloc_cb Callback function used to allocate storage for the + * image. + * \param[in] cb_priv The first argument ('priv') for the callback + * function. + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + aom_alloc_img_data_cb_fn_t alloc_cb, + void *cb_priv); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ diff --git a/third_party/aom/aom/src/aom_codec.c b/third_party/aom/aom/src/aom_codec.c new file mode 100644 index 0000000000..512fd28196 --- /dev/null +++ b/third_party/aom/aom/src/aom_codec.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Provides the high level interface to wrap decoder algorithms. + * + */ +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_version.h" + +#include "aom/aom_integer.h" +#include "aom/internal/aom_codec_internal.h" + +int aom_codec_version(void) { return VERSION_PACKED; } + +const char *aom_codec_version_str(void) { return VERSION_STRING_NOSP; } + +const char *aom_codec_version_extra_str(void) { return VERSION_EXTRA; } + +const char *aom_codec_iface_name(aom_codec_iface_t *iface) { + return iface ? 
iface->name : ""; +} + +const char *aom_codec_err_to_string(aom_codec_err_t err) { + switch (err) { + case AOM_CODEC_OK: return "Success"; + case AOM_CODEC_ERROR: return "Unspecified internal error"; + case AOM_CODEC_MEM_ERROR: return "Memory allocation error"; + case AOM_CODEC_ABI_MISMATCH: return "ABI version mismatch"; + case AOM_CODEC_INCAPABLE: + return "Codec does not implement requested capability"; + case AOM_CODEC_UNSUP_BITSTREAM: + return "Bitstream not supported by this decoder"; + case AOM_CODEC_UNSUP_FEATURE: + return "Bitstream required feature not supported by this decoder"; + case AOM_CODEC_CORRUPT_FRAME: return "Corrupt frame detected"; + case AOM_CODEC_INVALID_PARAM: return "Invalid parameter"; + case AOM_CODEC_LIST_END: return "End of iterated list"; + } + + return "Unrecognized error code"; +} + +const char *aom_codec_error(const aom_codec_ctx_t *ctx) { + return (ctx) ? aom_codec_err_to_string(ctx->err) + : aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM); +} + +const char *aom_codec_error_detail(const aom_codec_ctx_t *ctx) { + if (ctx && ctx->err) + return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; + + return NULL; +} + +aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) { + if (!ctx) { + return AOM_CODEC_INVALID_PARAM; + } + if (!ctx->iface || !ctx->priv) { + ctx->err = AOM_CODEC_ERROR; + return AOM_CODEC_ERROR; + } + ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv); + ctx->iface = NULL; + ctx->name = NULL; + ctx->priv = NULL; + ctx->err = AOM_CODEC_OK; + return AOM_CODEC_OK; +} + +aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) { + return iface ? iface->caps : 0; +} + +aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) { + if (!ctx) { + return AOM_CODEC_INVALID_PARAM; + } + // Control ID must be non-zero. + if (!ctrl_id) { + ctx->err = AOM_CODEC_INVALID_PARAM; + return AOM_CODEC_INVALID_PARAM; + } + if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps) { + ctx->err = AOM_CODEC_ERROR; + return AOM_CODEC_ERROR; + } + + // "ctrl_maps" is an array of (control ID, function pointer) elements, + // with CTRL_MAP_END as a sentinel. + for (aom_codec_ctrl_fn_map_t *entry = ctx->iface->ctrl_maps; + !at_ctrl_map_end(entry); ++entry) { + if (entry->ctrl_id == ctrl_id) { + va_list ap; + va_start(ap, ctrl_id); + ctx->err = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap); + va_end(ap); + return ctx->err; + } + } + ctx->err = AOM_CODEC_ERROR; + ctx->priv->err_detail = "Invalid control ID"; + return AOM_CODEC_ERROR; +} + +aom_codec_err_t aom_codec_set_option(aom_codec_ctx_t *ctx, const char *name, + const char *value) { + if (!ctx) { + return AOM_CODEC_INVALID_PARAM; + } + if (!ctx->iface || !ctx->priv || !ctx->iface->set_option) { + ctx->err = AOM_CODEC_ERROR; + return AOM_CODEC_ERROR; + } + ctx->err = + ctx->iface->set_option((aom_codec_alg_priv_t *)ctx->priv, name, value); + return ctx->err; +} + +LIBAOM_FORMAT_PRINTF(3, 0) +static void set_error(struct aom_internal_error_info *info, + aom_codec_err_t error, const char *fmt, va_list ap) { + info->error_code = error; + info->has_detail = 0; + + if (fmt) { + size_t sz = sizeof(info->detail); + + info->has_detail = 1; + vsnprintf(info->detail, sz - 1, fmt, ap); + info->detail[sz - 1] = '\0'; + } +} + +void aom_set_error(struct aom_internal_error_info *info, aom_codec_err_t error, + const char *fmt, ...) 
{ + va_list ap; + + va_start(ap, fmt); + set_error(info, error, fmt, ap); + va_end(ap); + + assert(!info->setjmp); +} + +void aom_internal_error(struct aom_internal_error_info *info, + aom_codec_err_t error, const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); + set_error(info, error, fmt, ap); + va_end(ap); + + if (info->setjmp) longjmp(info->jmp, info->error_code); +} + +void aom_internal_error_copy(struct aom_internal_error_info *info, + const struct aom_internal_error_info *src) { + assert(info != src); + + if (!src->has_detail) { + aom_internal_error(info, src->error_code, NULL); + } else { + aom_internal_error(info, src->error_code, "%s", src->detail); + } +} + +void aom_merge_corrupted_flag(int *corrupted, int value) { + *corrupted |= value; +} + +const char *aom_obu_type_to_string(OBU_TYPE type) { + switch (type) { + case OBU_SEQUENCE_HEADER: return "OBU_SEQUENCE_HEADER"; + case OBU_TEMPORAL_DELIMITER: return "OBU_TEMPORAL_DELIMITER"; + case OBU_FRAME_HEADER: return "OBU_FRAME_HEADER"; + case OBU_REDUNDANT_FRAME_HEADER: return "OBU_REDUNDANT_FRAME_HEADER"; + case OBU_FRAME: return "OBU_FRAME"; + case OBU_TILE_GROUP: return "OBU_TILE_GROUP"; + case OBU_METADATA: return "OBU_METADATA"; + case OBU_TILE_LIST: return "OBU_TILE_LIST"; + case OBU_PADDING: return "OBU_PADDING"; + default: break; + } + return ""; +} diff --git a/third_party/aom/aom/src/aom_decoder.c b/third_party/aom/aom/src/aom_decoder.c new file mode 100644 index 0000000000..49fff26352 --- /dev/null +++ b/third_party/aom/aom/src/aom_decoder.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Provides the high level interface to wrap decoder algorithms. + * + */ +#include +#include "aom/internal/aom_codec_internal.h" + +#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) + +static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) { + return (aom_codec_alg_priv_t *)ctx->priv; +} + +aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx, + aom_codec_iface_t *iface, + const aom_codec_dec_cfg_t *cfg, + aom_codec_flags_t flags, int ver) { + aom_codec_err_t res; + + if (ver != AOM_DECODER_ABI_VERSION) + res = AOM_CODEC_ABI_MISMATCH; + else if (!ctx || !iface) + res = AOM_CODEC_INVALID_PARAM; + else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) + res = AOM_CODEC_ABI_MISMATCH; + else if (!(iface->caps & AOM_CODEC_CAP_DECODER)) + res = AOM_CODEC_INCAPABLE; + else { + memset(ctx, 0, sizeof(*ctx)); + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.dec = cfg; + + res = ctx->iface->init(ctx); + if (res) { + ctx->err_detail = ctx->priv ? 
ctx->priv->err_detail : NULL; + aom_codec_destroy(ctx); + } + } + + return SAVE_STATUS(ctx, res); +} + +aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface, + const uint8_t *data, size_t data_sz, + aom_codec_stream_info_t *si) { + aom_codec_err_t res; + + if (!iface || !data || !data_sz || !si) { + res = AOM_CODEC_INVALID_PARAM; + } else { + /* Set default/unknown values */ + si->w = 0; + si->h = 0; + + res = iface->dec.peek_si(data, data_sz, si); + } + + return res; +} + +aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx, + aom_codec_stream_info_t *si) { + aom_codec_err_t res; + + if (!ctx || !si) { + res = AOM_CODEC_INVALID_PARAM; + } else if (!ctx->iface || !ctx->priv) { + res = AOM_CODEC_ERROR; + } else { + /* Set default/unknown values */ + si->w = 0; + si->h = 0; + + res = ctx->iface->dec.get_si(get_alg_priv(ctx), si); + } + + return SAVE_STATUS(ctx, res); +} + +aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data, + size_t data_sz, void *user_priv) { + aom_codec_err_t res; + + if (!ctx) + res = AOM_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = AOM_CODEC_ERROR; + else { + res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv); + } + + return SAVE_STATUS(ctx, res); +} + +aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) { + aom_image_t *img; + + if (!ctx || !iter || !ctx->iface || !ctx->priv) + img = NULL; + else + img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter); + + return img; +} + +aom_codec_err_t aom_codec_set_frame_buffer_functions( + aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, + aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + aom_codec_err_t res; + + if (!ctx || !cb_get || !cb_release) { + res = AOM_CODEC_INVALID_PARAM; + } else if (!ctx->iface || !ctx->priv) { + res = AOM_CODEC_ERROR; + } else if (!(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) { + res = AOM_CODEC_INCAPABLE; + } else { + res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release, + cb_priv); + } + + return SAVE_STATUS(ctx, res); +} diff --git a/third_party/aom/aom/src/aom_encoder.c b/third_party/aom/aom/src/aom_encoder.c new file mode 100644 index 0000000000..70e0b75bcd --- /dev/null +++ b/third_party/aom/aom/src/aom_encoder.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Provides the high level interface to wrap encoder algorithms. + * + */ +#include "config/aom_config.h" + +#if HAVE_FEXCEPT +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#endif + +#include +#include + +#include "aom/aom_encoder.h" +#include "aom/internal/aom_codec_internal.h" + +#define SAVE_STATUS(ctx, var) (ctx ? 
(ctx->err = var) : var) + +static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) { + return (aom_codec_alg_priv_t *)ctx->priv; +} + +aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx, + aom_codec_iface_t *iface, + const aom_codec_enc_cfg_t *cfg, + aom_codec_flags_t flags, int ver) { + aom_codec_err_t res; + // The value of AOM_ENCODER_ABI_VERSION in libaom v3.0.0 and v3.1.0 - v3.1.3. + // + // We are compatible with these older libaom releases. AOM_ENCODER_ABI_VERSION + // was incremented after these releases for two reasons: + // 1. AOM_ENCODER_ABI_VERSION takes contribution from + // AOM_EXT_PART_ABI_VERSION. The external partition API is still + // experimental, so it should not be considered as part of the stable ABI. + // fd9ed8366 External partition: Define APIs + // https://aomedia-review.googlesource.com/c/aom/+/135663 + // 2. As a way to detect the presence of speeds 7-9 in all-intra mode. I (wtc) + // suggested this change because I misunderstood how + // AOM_ENCODER_ABI_VERSION was used. + // bbdfa68d1 AllIntra: Redefine all-intra mode speed features for speed 7+ + // https://aomedia-review.googlesource.com/c/aom/+/140624 + const int aom_encoder_abi_version_25 = 25; + + // TODO(bug aomedia:3228): Remove the check for aom_encoder_abi_version_25 in + // libaom v4.0.0. + if (ver != AOM_ENCODER_ABI_VERSION && ver != aom_encoder_abi_version_25) + res = AOM_CODEC_ABI_MISMATCH; + else if (!ctx || !iface || !cfg) + res = AOM_CODEC_INVALID_PARAM; + else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) + res = AOM_CODEC_ABI_MISMATCH; + else if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR)) + res = AOM_CODEC_INCAPABLE; + else if ((flags & AOM_CODEC_USE_HIGHBITDEPTH) && + !(iface->caps & AOM_CODEC_CAP_HIGHBITDEPTH)) { + res = AOM_CODEC_INCAPABLE; + } else if (cfg->g_bit_depth > 8 && + (flags & AOM_CODEC_USE_HIGHBITDEPTH) == 0) { + res = AOM_CODEC_INVALID_PARAM; + ctx->err_detail = + "High bit-depth used without the AOM_CODEC_USE_HIGHBITDEPTH flag."; + } else { + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx); + + if (res) { + // IMPORTANT: ctx->priv->err_detail must be null or point to a string + // that remains valid after ctx->priv is destroyed, such as a C string + // literal. This makes it safe to call aom_codec_error_detail() after + // aom_codec_enc_init_ver() failed. + ctx->err_detail = ctx->priv ? 
ctx->priv->err_detail : NULL; + aom_codec_destroy(ctx); + } + } + + return SAVE_STATUS(ctx, res); +} + +aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface, + aom_codec_enc_cfg_t *cfg, + unsigned int usage) { + aom_codec_err_t res; + + if (!iface || !cfg) + res = AOM_CODEC_INVALID_PARAM; + else if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else { + res = AOM_CODEC_INVALID_PARAM; + + for (int i = 0; i < iface->enc.cfg_count; ++i) { + if (iface->enc.cfgs[i].g_usage == usage) { + *cfg = iface->enc.cfgs[i]; + res = AOM_CODEC_OK; + /* default values */ + memset(&cfg->encoder_cfg, 0, sizeof(cfg->encoder_cfg)); + cfg->encoder_cfg.super_block_size = 0; // Dynamic + cfg->encoder_cfg.max_partition_size = 128; + cfg->encoder_cfg.min_partition_size = 4; + cfg->encoder_cfg.disable_trellis_quant = 3; + break; + } + } + } + return res; +} + +#if AOM_ARCH_X86 || AOM_ARCH_X86_64 +/* On X86, disable the x87 unit's internal 80 bit precision for better + * consistency with the SSE unit's 64 bit precision. + */ +#include "aom_ports/x86.h" +#define FLOATING_POINT_SET_PRECISION \ + unsigned short x87_orig_mode = x87_set_double_precision(); +#define FLOATING_POINT_RESTORE_PRECISION x87_set_control_word(x87_orig_mode); +#else +#define FLOATING_POINT_SET_PRECISION +#define FLOATING_POINT_RESTORE_PRECISION +#endif // AOM_ARCH_X86 || AOM_ARCH_X86_64 + +#if HAVE_FEXCEPT && CONFIG_DEBUG +#define FLOATING_POINT_SET_EXCEPTIONS \ + const int float_excepts = \ + feenableexcept(FE_DIVBYZERO | FE_UNDERFLOW | FE_OVERFLOW); +#define FLOATING_POINT_RESTORE_EXCEPTIONS \ + if (float_excepts != -1) { \ + fedisableexcept(FE_ALL_EXCEPT); \ + feenableexcept(float_excepts); \ + } +#else +#define FLOATING_POINT_SET_EXCEPTIONS +#define FLOATING_POINT_RESTORE_EXCEPTIONS +#endif // HAVE_FEXCEPT && CONFIG_DEBUG + +/* clang-format off */ +#define FLOATING_POINT_INIT \ + do { \ + FLOATING_POINT_SET_PRECISION \ + FLOATING_POINT_SET_EXCEPTIONS + +#define FLOATING_POINT_RESTORE \ + FLOATING_POINT_RESTORE_EXCEPTIONS \ + FLOATING_POINT_RESTORE_PRECISION \ + } while (0); +/* clang-format on */ + +aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, + aom_codec_pts_t pts, unsigned long duration, + aom_enc_frame_flags_t flags) { + aom_codec_err_t res = AOM_CODEC_OK; + + if (!ctx || (img && !duration)) + res = AOM_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else if (img && ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) != 0) != + ((ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) != 0)) { + res = AOM_CODEC_INVALID_PARAM; + } else { + /* Execute in a normalized floating point environment, if the platform + * requires it. 
+ */ + FLOATING_POINT_INIT + res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags); + FLOATING_POINT_RESTORE + } + + return SAVE_STATUS(ctx, res); +} + +const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx, + aom_codec_iter_t *iter) { + const aom_codec_cx_pkt_t *pkt = NULL; + + if (ctx) { + if (!iter) + ctx->err = AOM_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + ctx->err = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + ctx->err = AOM_CODEC_INCAPABLE; + else + pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter); + } + + if (pkt && pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + // If the application has specified a destination area for the + // compressed data, and the codec has not placed the data there, + // and it fits, copy it. + aom_codec_priv_t *const priv = ctx->priv; + char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf; + + if (dst_buf && pkt->data.raw.buf != dst_buf && + pkt->data.raw.sz + priv->enc.cx_data_pad_before + + priv->enc.cx_data_pad_after <= + priv->enc.cx_data_dst_buf.sz) { + aom_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt; + + memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf, + pkt->data.raw.sz); + *modified_pkt = *pkt; + modified_pkt->data.raw.buf = dst_buf; + modified_pkt->data.raw.sz += + priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after; + pkt = modified_pkt; + } + + if (dst_buf == pkt->data.raw.buf) { + priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz; + priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz; + } + } + + return pkt; +} + +aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx, + const aom_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after) { + if (!ctx || !ctx->priv) return AOM_CODEC_INVALID_PARAM; + + if (buf) { + ctx->priv->enc.cx_data_dst_buf = *buf; + ctx->priv->enc.cx_data_pad_before = pad_before; + ctx->priv->enc.cx_data_pad_after = pad_after; + } else { + ctx->priv->enc.cx_data_dst_buf.buf = NULL; + ctx->priv->enc.cx_data_dst_buf.sz = 0; + ctx->priv->enc.cx_data_pad_before = 0; + ctx->priv->enc.cx_data_pad_after = 0; + } + + return AOM_CODEC_OK; +} + +const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx) { + aom_image_t *img = NULL; + + if (ctx) { + if (!ctx->iface || !ctx->priv) + ctx->err = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + ctx->err = AOM_CODEC_INCAPABLE; + else if (!ctx->iface->enc.get_preview) + ctx->err = AOM_CODEC_INCAPABLE; + else + img = ctx->iface->enc.get_preview(get_alg_priv(ctx)); + } + + return img; +} + +aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx) { + aom_fixed_buf_t *buf = NULL; + + if (ctx) { + if (!ctx->iface || !ctx->priv) + ctx->err = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + ctx->err = AOM_CODEC_INCAPABLE; + else if (!ctx->iface->enc.get_glob_hdrs) + ctx->err = AOM_CODEC_INCAPABLE; + else + buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx)); + } + + return buf; +} + +aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx, + const aom_codec_enc_cfg_t *cfg) { + aom_codec_err_t res; + + if (!ctx || !ctx->iface || !ctx->priv || !cfg) + res = AOM_CODEC_INVALID_PARAM; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else + res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg); + + return SAVE_STATUS(ctx, res); +} + +int aom_codec_pkt_list_add(struct aom_codec_pkt_list *list, + const struct aom_codec_cx_pkt *pkt) { 
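+  // Usage sketch (editor-added, not upstream text): callers reserve storage
+  // with aom_codec_pkt_list_decl(n), reset it with aom_codec_pkt_list_init(),
+  // append packets here, and later walk them with aom_codec_pkt_list_get()
+  // using a NULL-initialized aom_codec_iter_t. Returns 0 on success and 1
+  // once the fixed-size list is already full.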
+ if (list->cnt < list->max) { + list->pkts[list->cnt++] = *pkt; + return 0; + } + + return 1; +} + +const aom_codec_cx_pkt_t *aom_codec_pkt_list_get( + struct aom_codec_pkt_list *list, aom_codec_iter_t *iter) { + const aom_codec_cx_pkt_t *pkt; + + if (!(*iter)) { + *iter = list->pkts; + } + + pkt = (const aom_codec_cx_pkt_t *)*iter; + + if ((size_t)(pkt - list->pkts) < list->cnt) + *iter = pkt + 1; + else + pkt = NULL; + + return pkt; +} diff --git a/third_party/aom/aom/src/aom_image.c b/third_party/aom/aom/src/aom_image.c new file mode 100644 index 0000000000..8e94d5dd4f --- /dev/null +++ b/third_party/aom/aom/src/aom_image.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom/aom_image.h" +#include "aom/aom_integer.h" +#include "aom/internal/aom_image_internal.h" +#include "aom_mem/aom_mem.h" + +static INLINE unsigned int align_image_dimension(unsigned int d, + unsigned int subsampling, + unsigned int size_align) { + unsigned int align; + + align = (1 << subsampling) - 1; + align = (size_align - 1 > align) ? (size_align - 1) : align; + return ((d + align) & ~align); +} + +static aom_image_t *img_alloc_helper( + aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, + unsigned int buf_align, unsigned int stride_align, unsigned int size_align, + unsigned int border, unsigned char *img_data, + aom_alloc_img_data_cb_fn_t alloc_cb, void *cb_priv) { + /* NOTE: In this function, bit_depth is either 8 or 16 (if + * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12. + */ + unsigned int h, w, s, xcs, ycs, bps, bit_depth; + unsigned int stride_in_bytes; + + if (img != NULL) memset(img, 0, sizeof(aom_image_t)); + + /* Treat align==0 like align==1 */ + if (!buf_align) buf_align = 1; + + /* Validate alignment (must be power of 2) */ + if (buf_align & (buf_align - 1)) goto fail; + + /* Treat align==0 like align==1 */ + if (!stride_align) stride_align = 1; + + /* Validate alignment (must be power of 2) */ + if (stride_align & (stride_align - 1)) goto fail; + + /* Treat align==0 like align==1 */ + if (!size_align) size_align = 1; + + /* Validate alignment (must be power of 2) */ + if (size_align & (size_align - 1)) goto fail; + + /* Get sample size for this format */ + switch (fmt) { + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_NV12: + case AOM_IMG_FMT_AOMI420: + case AOM_IMG_FMT_AOMYV12: bps = 12; break; + case AOM_IMG_FMT_I422: bps = 16; break; + case AOM_IMG_FMT_I444: bps = 24; break; + case AOM_IMG_FMT_YV1216: + case AOM_IMG_FMT_I42016: bps = 24; break; + case AOM_IMG_FMT_I42216: bps = 32; break; + case AOM_IMG_FMT_I44416: bps = 48; break; + default: bps = 16; break; + } + + bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
16 : 8; + + /* Get chroma shift values for this format */ + switch (fmt) { + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_NV12: + case AOM_IMG_FMT_AOMI420: + case AOM_IMG_FMT_AOMYV12: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I42016: + case AOM_IMG_FMT_YV1216: + case AOM_IMG_FMT_I42216: xcs = 1; break; + default: xcs = 0; break; + } + + switch (fmt) { + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_NV12: + case AOM_IMG_FMT_AOMI420: + case AOM_IMG_FMT_AOMYV12: + case AOM_IMG_FMT_YV1216: + case AOM_IMG_FMT_I42016: ycs = 1; break; + default: ycs = 0; break; + } + + /* Calculate storage sizes given the chroma subsampling */ + w = align_image_dimension(d_w, xcs, size_align); + h = align_image_dimension(d_h, ycs, size_align); + + s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / bit_depth; + s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1); + stride_in_bytes = s * bit_depth / 8; + + /* Allocate the new image */ + if (!img) { + img = (aom_image_t *)calloc(1, sizeof(aom_image_t)); + + if (!img) goto fail; + + img->self_allocd = 1; + } + + img->img_data = img_data; + + if (!img_data) { + const uint64_t alloc_size = + (fmt & AOM_IMG_FMT_PLANAR) + ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / bit_depth + : (uint64_t)(h + 2 * border) * stride_in_bytes; + + if (alloc_size != (size_t)alloc_size) goto fail; + + if (alloc_cb) { + const size_t padded_alloc_size = (size_t)alloc_size + buf_align - 1; + img->img_data = (uint8_t *)alloc_cb(cb_priv, padded_alloc_size); + if (img->img_data) { + img->img_data = (uint8_t *)aom_align_addr(img->img_data, buf_align); + } + img->img_data_owner = 0; + } else { + img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size); + img->img_data_owner = 1; + } + img->sz = (size_t)alloc_size; + } + + if (!img->img_data) goto fail; + + img->fmt = fmt; + img->bit_depth = bit_depth; + // aligned width and aligned height + img->w = w; + img->h = h; + img->x_chroma_shift = xcs; + img->y_chroma_shift = ycs; + img->bps = bps; + + /* Calculate strides */ + img->stride[AOM_PLANE_Y] = stride_in_bytes; + img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs; + + if (fmt == AOM_IMG_FMT_NV12) { + // Each row is a row of U and a row of V interleaved, so the stride is twice + // as long. + img->stride[AOM_PLANE_U] *= 2; + img->stride[AOM_PLANE_V] = 0; + } + + /* Default viewport to entire image. (This aom_img_set_rect call always + * succeeds.) */ + aom_img_set_rect(img, 0, 0, d_w, d_h, border); + return img; + +fail: + aom_img_free(img); + return NULL; +} + +aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, NULL, + NULL); +} + +aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + aom_alloc_img_data_cb_fn_t alloc_cb, + void *cb_priv) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, + alloc_cb, cb_priv); +} + +aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int stride_align, + unsigned char *img_data) { + /* Set buf_align = 1. It is ignored by img_alloc_helper because img_data is + * not NULL. 
*/ + return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, 1, 0, img_data, + NULL, NULL); +} + +aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + unsigned int size_align, + unsigned int border) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, size_align, border, + NULL, NULL, NULL); +} + +int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h, unsigned int border) { + if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h && + y + h <= img->h) { + img->d_w = w; + img->d_h = h; + + x += border; + y += border; + + /* Calculate plane pointers */ + if (!(img->fmt & AOM_IMG_FMT_PLANAR)) { + img->planes[AOM_PLANE_PACKED] = + img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED]; + } else { + const int bytes_per_sample = + (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + unsigned char *data = img->img_data; + + img->planes[AOM_PLANE_Y] = + data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y]; + data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y]; + + unsigned int uv_border_h = border >> img->y_chroma_shift; + unsigned int uv_x = x >> img->x_chroma_shift; + unsigned int uv_y = y >> img->y_chroma_shift; + if (img->fmt == AOM_IMG_FMT_NV12) { + img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample * 2 + + uv_y * img->stride[AOM_PLANE_U]; + img->planes[AOM_PLANE_V] = NULL; + } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { + img->planes[AOM_PLANE_U] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; + data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + img->stride[AOM_PLANE_U]; + img->planes[AOM_PLANE_V] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; + } else { + img->planes[AOM_PLANE_V] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; + data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + img->stride[AOM_PLANE_V]; + img->planes[AOM_PLANE_U] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; + } + } + return 0; + } + return -1; +} + +void aom_img_flip(aom_image_t *img) { + /* Note: In the calculation pointer adjustment calculation, we want the + * rhs to be promoted to a signed type. Section 6.3.1.8 of the ISO C99 + * standard indicates that if the adjustment parameter is unsigned, the + * stride parameter will be promoted to unsigned, causing errors when + * the lhs is a larger type than the rhs. 
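+ *
+ * Concretely (editor-added illustration): once a stride has been negated by
+ * a previous flip, an unsigned (d_h - 1) would drag the negative stride into
+ * unsigned arithmetic, and the wrapped 32-bit product would not sign extend
+ * when added to the wider plane pointer; the (signed) casts below keep the
+ * adjustment negative.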
+ */ + img->planes[AOM_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[AOM_PLANE_Y]; + img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y]; + + img->planes[AOM_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[AOM_PLANE_U]; + img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U]; + + img->planes[AOM_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[AOM_PLANE_V]; + img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V]; +} + +void aom_img_free(aom_image_t *img) { + if (img) { + aom_img_remove_metadata(img); + if (img->img_data && img->img_data_owner) aom_free(img->img_data); + + if (img->self_allocd) free(img); + } +} + +int aom_img_plane_width(const aom_image_t *img, int plane) { + if (plane > 0 && img->x_chroma_shift > 0) + return (img->d_w + 1) >> img->x_chroma_shift; + else + return img->d_w; +} + +int aom_img_plane_height(const aom_image_t *img, int plane) { + if (plane > 0 && img->y_chroma_shift > 0) + return (img->d_h + 1) >> img->y_chroma_shift; + else + return img->d_h; +} + +aom_metadata_t *aom_img_metadata_alloc( + uint32_t type, const uint8_t *data, size_t sz, + aom_metadata_insert_flags_t insert_flag) { + if (!data || sz == 0) return NULL; + aom_metadata_t *metadata = (aom_metadata_t *)malloc(sizeof(aom_metadata_t)); + if (!metadata) return NULL; + metadata->type = type; + metadata->payload = (uint8_t *)malloc(sz); + if (!metadata->payload) { + free(metadata); + return NULL; + } + memcpy(metadata->payload, data, sz); + metadata->sz = sz; + metadata->insert_flag = insert_flag; + return metadata; +} + +void aom_img_metadata_free(aom_metadata_t *metadata) { + if (metadata) { + if (metadata->payload) free(metadata->payload); + free(metadata); + } +} + +aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz) { + aom_metadata_array_t *arr = + (aom_metadata_array_t *)calloc(1, sizeof(aom_metadata_array_t)); + if (!arr) return NULL; + if (sz > 0) { + arr->metadata_array = + (aom_metadata_t **)calloc(sz, sizeof(aom_metadata_t *)); + if (!arr->metadata_array) { + aom_img_metadata_array_free(arr); + return NULL; + } + arr->sz = sz; + } + return arr; +} + +void aom_img_metadata_array_free(aom_metadata_array_t *arr) { + if (arr) { + if (arr->metadata_array) { + for (size_t i = 0; i < arr->sz; i++) { + aom_img_metadata_free(arr->metadata_array[i]); + } + free(arr->metadata_array); + } + free(arr); + } +} + +int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data, + size_t sz, aom_metadata_insert_flags_t insert_flag) { + if (!img) return -1; + if (!img->metadata) { + img->metadata = aom_img_metadata_array_alloc(0); + if (!img->metadata) return -1; + } + aom_metadata_t *metadata = + aom_img_metadata_alloc(type, data, sz, insert_flag); + if (!metadata) return -1; + aom_metadata_t **metadata_array = + (aom_metadata_t **)realloc(img->metadata->metadata_array, + (img->metadata->sz + 1) * sizeof(metadata)); + if (!metadata_array) { + aom_img_metadata_free(metadata); + return -1; + } + img->metadata->metadata_array = metadata_array; + img->metadata->metadata_array[img->metadata->sz] = metadata; + img->metadata->sz++; + return 0; +} + +void aom_img_remove_metadata(aom_image_t *img) { + if (img && img->metadata) { + aom_img_metadata_array_free(img->metadata); + img->metadata = NULL; + } +} + +const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img, + size_t index) { + if (!img) return NULL; + const aom_metadata_array_t *array = img->metadata; + if (array && index < array->sz) { + return 
array->metadata_array[index]; + } + return NULL; +} + +size_t aom_img_num_metadata(const aom_image_t *img) { + if (!img || !img->metadata) return 0; + return img->metadata->sz; +} diff --git a/third_party/aom/aom/src/aom_integer.c b/third_party/aom/aom/src/aom_integer.c new file mode 100644 index 0000000000..7edfd0de87 --- /dev/null +++ b/third_party/aom/aom/src/aom_integer.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include + +#include "aom/aom_integer.h" + +static const size_t kMaximumLeb128Size = 8; +static const uint8_t kLeb128ByteMask = 0x7f; // Binary: 01111111 + +// Disallow values larger than 32-bits to ensure consistent behavior on 32 and +// 64 bit targets: value is typically used to determine buffer allocation size +// when decoded. +static const uint64_t kMaximumLeb128Value = UINT32_MAX; + +size_t aom_uleb_size_in_bytes(uint64_t value) { + size_t size = 0; + do { + ++size; + } while ((value >>= 7) != 0); + return size; +} + +int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value, + size_t *length) { + if (buffer && value) { + *value = 0; + for (size_t i = 0; i < kMaximumLeb128Size && i < available; ++i) { + const uint8_t decoded_byte = *(buffer + i) & kLeb128ByteMask; + *value |= ((uint64_t)decoded_byte) << (i * 7); + if ((*(buffer + i) >> 7) == 0) { + if (length) { + *length = i + 1; + } + + // Fail on values larger than 32-bits to ensure consistent behavior on + // 32 and 64 bit targets: value is typically used to determine buffer + // allocation size. + if (*value > UINT32_MAX) return -1; + + return 0; + } + } + } + + // If we get here, either the buffer/value pointers were invalid, + // or we ran over the available space + return -1; +} + +int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value, + size_t *coded_size) { + const size_t leb_size = aom_uleb_size_in_bytes(value); + if (value > kMaximumLeb128Value || leb_size > kMaximumLeb128Size || + leb_size > available || !coded_value || !coded_size) { + return -1; + } + + for (size_t i = 0; i < leb_size; ++i) { + uint8_t byte = value & 0x7f; + value >>= 7; + + if (value != 0) byte |= 0x80; // Signal that more bytes follow. + + *(coded_value + i) = byte; + } + + *coded_size = leb_size; + return 0; +} + +int aom_uleb_encode_fixed_size(uint64_t value, size_t available, + size_t pad_to_size, uint8_t *coded_value, + size_t *coded_size) { + if (value > kMaximumLeb128Value || !coded_value || !coded_size || + available < pad_to_size || pad_to_size > kMaximumLeb128Size) { + return -1; + } + const uint64_t limit = 1ULL << (7 * pad_to_size); + if (value >= limit) { + // Can't encode 'value' within 'pad_to_size' bytes + return -1; + } + + for (size_t i = 0; i < pad_to_size; ++i) { + uint8_t byte = value & 0x7f; + value >>= 7; + + if (i < pad_to_size - 1) byte |= 0x80; // Signal that more bytes follow. 
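+
+    // Worked example (editor-added): encoding value 0x284 with
+    // pad_to_size == 3 emits the bytes 0x84, 0x85, 0x00 -- the low seven
+    // bits of each byte carry payload, and the top bit stays set on every
+    // byte except the last so a decoder consumes the full padded length.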
+ + *(coded_value + i) = byte; + } + + assert(value == 0); + + *coded_size = pad_to_size; + return 0; +} diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c new file mode 100644 index 0000000000..254f6401c7 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_convolve.c @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; +} + +static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + return sum; +} + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int sum = horz_scalar_product(src_x, x_filter); + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int sum = vert_scalar_product(src_y, src_stride, y_filter); + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. 
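+  // Editor-added explanation: each kernel table holds 16 sub-pixel phases of
+  // SUBPEL_TAPS (8) int16_t coefficients, i.e. 16 * 8 * 2 = 256 bytes, so
+  // clearing the low eight address bits of a pointer anywhere inside the
+  // table recovers its start; get_filter_offset() below then converts the
+  // remaining difference into the 0..15 phase index by pointer subtraction.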
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + w, h); +} + +void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, + w, h); +} + +void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
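+  // Editor-added arithmetic check: the assert below allows y_step_q4 up to
+  // 64 only when h <= 32, and then ((32 - 1) * 64 + 15) / 16 + 8 = 132
+  // intermediate rows, which still fits within the 135-row temp buffer.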
+ uint8_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); +} + +void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + aom_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} + +void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h) { + for (int r = h; r > 0; --r) { + memmove(dst, src, w); + src += src_stride; + dst += dst_stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int highbd_vert_scalar_product(const uint16_t *a, + ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + return sum; +} + +static INLINE int highbd_horz_scalar_product(const uint16_t *a, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; +} + +static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int sum = highbd_horz_scalar_product(src_x, x_filter); + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter); + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t 
*dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); +} + +void aom_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, int w, + int h) { + for (int y = 0; y < h; ++y) { + memmove(dst, src, w * sizeof(src[0])); + src += src_stride; + dst += dst_stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake new file mode 100644 index 0000000000..653f690741 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -0,0 +1,510 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_AOM_DSP_AOM_DSP_CMAKE_) + return() +endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_ +set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1) + +list(APPEND AOM_DSP_COMMON_SOURCES + "${AOM_ROOT}/aom_dsp/aom_convolve.c" + "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" + "${AOM_ROOT}/aom_dsp/aom_filter.h" + "${AOM_ROOT}/aom_dsp/aom_simd.h" + "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" + "${AOM_ROOT}/aom_dsp/blend.h" + "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" + "${AOM_ROOT}/aom_dsp/entcode.c" + "${AOM_ROOT}/aom_dsp/entcode.h" + "${AOM_ROOT}/aom_dsp/fft.c" + "${AOM_ROOT}/aom_dsp/fft_common.h" + "${AOM_ROOT}/aom_dsp/grain_params.h" + "${AOM_ROOT}/aom_dsp/intrapred.c" + "${AOM_ROOT}/aom_dsp/intrapred_common.h" + "${AOM_ROOT}/aom_dsp/loopfilter.c" + "${AOM_ROOT}/aom_dsp/odintrin.c" + "${AOM_ROOT}/aom_dsp/odintrin.h" + "${AOM_ROOT}/aom_dsp/prob.h" + "${AOM_ROOT}/aom_dsp/recenter.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/subtract.c" + "${AOM_ROOT}/aom_dsp/txfm_common.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h") + +list(APPEND AOM_DSP_COMMON_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c" + 
"${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" + "${AOM_ROOT}/aom_dsp/x86/convolve.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h" + "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_sse2.h") + +list(APPEND AOM_DSP_COMMON_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/convolve_ssse3.h" + "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_utils.h") + +list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h" + "${AOM_ROOT}/third_party/SVT-AV1/convolve_2d_avx2.h" + "${AOM_ROOT}/third_party/SVT-AV1/convolve_avx2.h" + "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_AVX2.h" + "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_SSE4_1.h" + "${AOM_ROOT}/third_party/SVT-AV1/synonyms.h") + +list(APPEND AOM_DSP_COMMON_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" + "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" + "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c" + "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c") + +if(CONFIG_AV1_HIGHBITDEPTH) + list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") + + list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c") + + list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c") + + list(APPEND AOM_DSP_COMMON_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_hmask_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_mask_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_vmask_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c") +endif() + +if(CONFIG_AV1_DECODER) 
+ list(APPEND AOM_DSP_DECODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" + "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" + "${AOM_ROOT}/aom_dsp/bitreader.c" + "${AOM_ROOT}/aom_dsp/bitreader.h" "${AOM_ROOT}/aom_dsp/entdec.c" + "${AOM_ROOT}/aom_dsp/entdec.h") +endif() + +if(CONFIG_AV1_ENCODER) + list(APPEND AOM_DSP_ENCODER_SOURCES + "${AOM_ROOT}/aom_dsp/avg.c" + "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" + "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" + "${AOM_ROOT}/aom_dsp/bitwriter.c" + "${AOM_ROOT}/aom_dsp/bitwriter.h" + "${AOM_ROOT}/aom_dsp/blk_sse_sum.c" + "${AOM_ROOT}/aom_dsp/entenc.c" + "${AOM_ROOT}/aom_dsp/entenc.h" + "${AOM_ROOT}/aom_dsp/fwd_txfm.c" + "${AOM_ROOT}/aom_dsp/grain_table.c" + "${AOM_ROOT}/aom_dsp/grain_table.h" + "${AOM_ROOT}/aom_dsp/noise_model.c" + "${AOM_ROOT}/aom_dsp/noise_model.h" + "${AOM_ROOT}/aom_dsp/noise_util.c" + "${AOM_ROOT}/aom_dsp/noise_util.h" + "${AOM_ROOT}/aom_dsp/psnr.c" + "${AOM_ROOT}/aom_dsp/psnr.h" + "${AOM_ROOT}/aom_dsp/quantize.c" + "${AOM_ROOT}/aom_dsp/quantize.h" + "${AOM_ROOT}/aom_dsp/sad.c" + "${AOM_ROOT}/aom_dsp/sad_av1.c" + "${AOM_ROOT}/aom_dsp/sse.c" + "${AOM_ROOT}/aom_dsp/ssim.c" + "${AOM_ROOT}/aom_dsp/ssim.h" + "${AOM_ROOT}/aom_dsp/sum_squares.c" + "${AOM_ROOT}/aom_dsp/variance.c" + "${AOM_ROOT}/aom_dsp/variance.h") + + # Flow estimation library + if(NOT CONFIG_REALTIME_ONLY) + list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/pyramid.c" + "${AOM_ROOT}/aom_dsp/flow_estimation/corner_detect.c" + "${AOM_ROOT}/aom_dsp/flow_estimation/corner_match.c" + "${AOM_ROOT}/aom_dsp/flow_estimation/disflow.c" + "${AOM_ROOT}/aom_dsp/flow_estimation/flow_estimation.c" + "${AOM_ROOT}/aom_dsp/flow_estimation/ransac.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_sse4.c" + "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_sse4.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c") + endif() + + list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm") + + list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64 + "${AOM_ROOT}/aom_dsp/x86/ssim_sse2_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h" + "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/jnt_sad_sse2.c") + + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_avx2.c" + 
"${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_AVX + "${AOM_ROOT}/aom_dsp/x86/aom_quantize_avx.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h" + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_sad4d_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/sadxd_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/masked_sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/masked_sad4d_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c" + "${AOM_ROOT}/aom_dsp/arm/avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sse_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c" + "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_neon.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD + "${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c" + "${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c" + "${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c" + "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c" + "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE "${AOM_ROOT}/aom_dsp/arm/avg_sve.c" + "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_sve.c" + "${AOM_ROOT}/aom_dsp/arm/sum_squares_sve.c") + + if(CONFIG_AV1_HIGHBITDEPTH) + list(APPEND AOM_DSP_ENCODER_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/highbd_avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_avg_pred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_hadamard_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_masked_sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c" + 
"${AOM_ROOT}/aom_dsp/arm/highbd_sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_sadxd_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_sse_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_subpel_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD + "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/arm/highbd_sse_sve.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_variance_sve.c") + endif() + + if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c" + "${AOM_ROOT}/aom_dsp/psnrhvs.c") + endif() + + if(CONFIG_TUNE_VMAF) + list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/vmaf.c" + "${AOM_ROOT}/aom_dsp/vmaf.h") + endif() + + if(CONFIG_TUNE_BUTTERAUGLI) + list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/butteraugli.c" + "${AOM_ROOT}/aom_dsp/butteraugli.h") + endif() + + if(CONFIG_REALTIME_ONLY) + list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c") + + list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") + + list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c") + + list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c") + endif() +endif() + +# Creates aom_dsp build targets. Must not be called until after libaom target +# has been created. +function(setup_aom_dsp_targets) + add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_dsp_common) + create_no_op_source_file("aom_av1" "c" "no_op_source_file") + add_library(aom_dsp OBJECT "${no_op_source_file}") + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + list(APPEND AOM_LIB_TARGETS aom_dsp) + + # Not all generators support libraries consisting only of object files. Add a + # source file to the aom_dsp target. 
+ add_no_op_source_file_to_target("aom_dsp" "c") + + if(CONFIG_AV1_DECODER) + add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_dsp_decoder) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + endif() + + if(CONFIG_AV1_ENCODER) + add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_dsp_encoder) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + if(CONFIG_TUNE_VMAF) + target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS}) + endif() + endif() + + if(HAVE_SSE2) + add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE2") + + if(CONFIG_AV1_ENCODER) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + list(APPEND AOM_DSP_ENCODER_ASM_SSE2 ${AOM_DSP_ENCODER_ASM_SSE2_X86_64}) + endif() + add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE2") + endif() + endif() + + if(HAVE_SSSE3) + add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSSE3") + + if(CONFIG_AV1_ENCODER) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 + ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) + endif() + add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSSE3") + endif() + endif() + + if(HAVE_SSE4_1) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE4_1") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE4_1") + endif() + endif() + + if(HAVE_AVX) + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-mavx" "avx" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_AVX") + endif() + endif() + + if(HAVE_AVX2) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_AVX2") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_AVX2") + endif() + endif() + + if(HAVE_NEON) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_NEON") + endif() + endif() + + if(HAVE_NEON_DOTPROD) + add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod" + "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_NEON_DOTPROD") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod" + "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD") + endif() + endif() + + if(HAVE_NEON_I8MM) + add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm" + "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_NEON_I8MM") + endif() + + if(HAVE_SVE) + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SVE") + endif() + endif() + + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + + # Pass the new lib targets up to the parent scope instance of 
+ # $AOM_LIB_TARGETS. + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h new file mode 100644 index 0000000000..85dc0052e2 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_ +#define AOM_AOM_DSP_AOM_DSP_COMMON_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define PI 3.141592653589793238462643383279502884 + +#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) +#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) +#define AOMSIGN(x) ((x) < 0 ? -1 : 0) + +#define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0])) + +#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') + +#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) + +/* Left shifting a negative value became undefined behavior in C99 (downgraded + from merely implementation-defined in C89). This should still compile to the + correct thing on any two's-complement machine, but avoid ubsan warnings.*/ +#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift))) + +// These can be used to give a hint about branch outcomes. +// This can have an effect, even if your target processor has a +// good branch predictor, as these hints can affect basic block +// ordering by the compiler. +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +typedef uint8_t qm_val_t; +#define AOM_QM_BITS 5 + +// Note: +// tran_low_t is the datatype used for final transform coefficients. +// tran_high_t is the datatype used for intermediate transform stages. +typedef int64_t tran_high_t; +typedef int32_t tran_low_t; + +static INLINE uint8_t clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} + +static INLINE int clamp(int value, int low, int high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE double fclamp(double value, double low, double high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE uint16_t clip_pixel_highbd(int val, int bd) { + switch (bd) { + case 8: + default: return (uint16_t)clamp(val, 0, 255); + case 10: return (uint16_t)clamp(val, 0, 1023); + case 12: return (uint16_t)clamp(val, 0, 4095); + } +} + +// The result of this branchless code is equivalent to (value < 0 ? 0 : value) +// or max(0, value) and might be faster in some cases. 
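+// For example, assuming a 32-bit int and an arithmetic right shift of
+// negative values (the usual two's-complement behavior), value >> 31 is all
+// ones for negative inputs and zero otherwise, so negative_to_zero(-5) is
+// (-5 & ~-1) == 0 while negative_to_zero(7) is (7 & ~0) == 7.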
+// Care should be taken since right shifting a negative value of a signed
+// type is implementation-defined by the C standards.
+static INLINE unsigned int negative_to_zero(int value) {
+  return value & ~(value >> (sizeof(value) * 8 - 1));
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_AOM_DSP_COMMON_H_
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
new file mode 100644
index 0000000000..0265dd1ee5
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+void aom_dsp_rtcd(void) { aom_once(setup_rtcd_internal); }
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
new file mode 100755
index 0000000000..4b49605e53
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -0,0 +1,1798 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+## +sub aom_dsp_forward_decls() { +print <=4 && $h >=4 && ($w == 2*$h || $h == 2*$w)); + push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w)); + } +} + +@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/; + +# +# Intra prediction +# + +foreach (@tx_sizes) { + ($w, $h) = @$_; + foreach $pred_name (@pred_names) { + add_proto "void", "aom_${pred_name}_predictor_${w}x${h}", + "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}", + "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + } + } +} + +specialize qw/aom_dc_top_predictor_4x4 neon sse2/; +specialize qw/aom_dc_top_predictor_4x8 neon sse2/; +specialize qw/aom_dc_top_predictor_4x16 neon sse2/; +specialize qw/aom_dc_top_predictor_8x4 neon sse2/; +specialize qw/aom_dc_top_predictor_8x8 neon sse2/; +specialize qw/aom_dc_top_predictor_8x16 neon sse2/; +specialize qw/aom_dc_top_predictor_8x32 neon sse2/; +specialize qw/aom_dc_top_predictor_16x4 neon sse2/; +specialize qw/aom_dc_top_predictor_16x8 neon sse2/; +specialize qw/aom_dc_top_predictor_16x16 neon sse2/; +specialize qw/aom_dc_top_predictor_16x32 neon sse2/; +specialize qw/aom_dc_top_predictor_16x64 neon sse2/; +specialize qw/aom_dc_top_predictor_32x8 neon sse2/; +specialize qw/aom_dc_top_predictor_32x16 neon sse2 avx2/; +specialize qw/aom_dc_top_predictor_32x32 neon sse2 avx2/; +specialize qw/aom_dc_top_predictor_32x64 neon sse2 avx2/; +specialize qw/aom_dc_top_predictor_64x16 neon sse2 avx2/; +specialize qw/aom_dc_top_predictor_64x32 neon sse2 avx2/; +specialize qw/aom_dc_top_predictor_64x64 neon sse2 avx2/; + +specialize qw/aom_dc_left_predictor_4x4 neon sse2/; +specialize qw/aom_dc_left_predictor_4x8 neon sse2/; +specialize qw/aom_dc_left_predictor_4x16 neon sse2/; +specialize qw/aom_dc_left_predictor_8x4 neon sse2/; +specialize qw/aom_dc_left_predictor_8x8 neon sse2/; +specialize qw/aom_dc_left_predictor_8x16 neon sse2/; +specialize qw/aom_dc_left_predictor_8x32 neon sse2/; +specialize qw/aom_dc_left_predictor_16x4 neon sse2/; +specialize qw/aom_dc_left_predictor_16x8 neon sse2/; +specialize qw/aom_dc_left_predictor_16x16 neon sse2/; +specialize qw/aom_dc_left_predictor_16x32 neon sse2/; +specialize qw/aom_dc_left_predictor_16x64 neon sse2/; +specialize qw/aom_dc_left_predictor_32x8 neon sse2/; +specialize qw/aom_dc_left_predictor_32x16 neon sse2 avx2/; +specialize qw/aom_dc_left_predictor_32x32 neon sse2 avx2/; +specialize qw/aom_dc_left_predictor_32x64 neon sse2 avx2/; +specialize qw/aom_dc_left_predictor_64x16 neon sse2 avx2/; +specialize qw/aom_dc_left_predictor_64x32 neon sse2 avx2/; +specialize qw/aom_dc_left_predictor_64x64 neon sse2 avx2/; + +specialize qw/aom_dc_128_predictor_4x4 neon sse2/; +specialize qw/aom_dc_128_predictor_4x8 neon sse2/; +specialize qw/aom_dc_128_predictor_4x16 neon sse2/; +specialize qw/aom_dc_128_predictor_8x4 neon sse2/; +specialize qw/aom_dc_128_predictor_8x8 neon sse2/; +specialize qw/aom_dc_128_predictor_8x16 neon sse2/; +specialize qw/aom_dc_128_predictor_8x32 neon sse2/; +specialize qw/aom_dc_128_predictor_16x4 neon sse2/; +specialize qw/aom_dc_128_predictor_16x8 neon sse2/; +specialize qw/aom_dc_128_predictor_16x16 neon sse2/; +specialize qw/aom_dc_128_predictor_16x32 neon sse2/; +specialize qw/aom_dc_128_predictor_16x64 neon sse2/; +specialize qw/aom_dc_128_predictor_32x8 neon sse2/; +specialize 
qw/aom_dc_128_predictor_32x16 neon sse2 avx2/; +specialize qw/aom_dc_128_predictor_32x32 neon sse2 avx2/; +specialize qw/aom_dc_128_predictor_32x64 neon sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x16 neon sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x32 neon sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x64 neon sse2 avx2/; + +specialize qw/aom_v_predictor_4x4 neon sse2/; +specialize qw/aom_v_predictor_4x8 neon sse2/; +specialize qw/aom_v_predictor_4x16 neon sse2/; +specialize qw/aom_v_predictor_8x4 neon sse2/; +specialize qw/aom_v_predictor_8x8 neon sse2/; +specialize qw/aom_v_predictor_8x16 neon sse2/; +specialize qw/aom_v_predictor_8x32 neon sse2/; +specialize qw/aom_v_predictor_16x4 neon sse2/; +specialize qw/aom_v_predictor_16x8 neon sse2/; +specialize qw/aom_v_predictor_16x16 neon sse2/; +specialize qw/aom_v_predictor_16x32 neon sse2/; +specialize qw/aom_v_predictor_16x64 neon sse2/; +specialize qw/aom_v_predictor_32x8 neon sse2/; +specialize qw/aom_v_predictor_32x16 neon sse2 avx2/; +specialize qw/aom_v_predictor_32x32 neon sse2 avx2/; +specialize qw/aom_v_predictor_32x64 neon sse2 avx2/; +specialize qw/aom_v_predictor_64x16 neon sse2 avx2/; +specialize qw/aom_v_predictor_64x32 neon sse2 avx2/; +specialize qw/aom_v_predictor_64x64 neon sse2 avx2/; + +specialize qw/aom_h_predictor_4x4 neon sse2/; +specialize qw/aom_h_predictor_4x8 neon sse2/; +specialize qw/aom_h_predictor_4x16 neon sse2/; +specialize qw/aom_h_predictor_8x4 neon sse2/; +specialize qw/aom_h_predictor_8x8 neon sse2/; +specialize qw/aom_h_predictor_8x16 neon sse2/; +specialize qw/aom_h_predictor_8x32 neon sse2/; +specialize qw/aom_h_predictor_16x4 neon sse2/; +specialize qw/aom_h_predictor_16x8 neon sse2/; +specialize qw/aom_h_predictor_16x16 neon sse2/; +specialize qw/aom_h_predictor_16x32 neon sse2/; +specialize qw/aom_h_predictor_16x64 neon sse2/; +specialize qw/aom_h_predictor_32x8 neon sse2/; +specialize qw/aom_h_predictor_32x16 neon sse2/; +specialize qw/aom_h_predictor_32x32 neon sse2 avx2/; +specialize qw/aom_h_predictor_32x64 neon sse2/; +specialize qw/aom_h_predictor_64x16 neon sse2/; +specialize qw/aom_h_predictor_64x32 neon sse2/; +specialize qw/aom_h_predictor_64x64 neon sse2/; + +specialize qw/aom_paeth_predictor_4x4 ssse3 neon/; +specialize qw/aom_paeth_predictor_4x8 ssse3 neon/; +specialize qw/aom_paeth_predictor_4x16 ssse3 neon/; +specialize qw/aom_paeth_predictor_8x4 ssse3 neon/; +specialize qw/aom_paeth_predictor_8x8 ssse3 neon/; +specialize qw/aom_paeth_predictor_8x16 ssse3 neon/; +specialize qw/aom_paeth_predictor_8x32 ssse3 neon/; +specialize qw/aom_paeth_predictor_16x4 ssse3 neon/; +specialize qw/aom_paeth_predictor_16x8 ssse3 avx2 neon/; +specialize qw/aom_paeth_predictor_16x16 ssse3 avx2 neon/; +specialize qw/aom_paeth_predictor_16x32 ssse3 avx2 neon/; +specialize qw/aom_paeth_predictor_16x64 ssse3 avx2 neon/; +specialize qw/aom_paeth_predictor_32x8 ssse3 neon/; +specialize qw/aom_paeth_predictor_32x16 ssse3 avx2 neon/; +specialize qw/aom_paeth_predictor_32x32 ssse3 avx2 neon/; +specialize qw/aom_paeth_predictor_32x64 ssse3 avx2 neon/; +specialize qw/aom_paeth_predictor_64x16 ssse3 avx2 neon/; +specialize qw/aom_paeth_predictor_64x32 ssse3 avx2 neon/; +specialize qw/aom_paeth_predictor_64x64 ssse3 avx2 neon/; + +specialize qw/aom_smooth_predictor_4x4 neon ssse3/; +specialize qw/aom_smooth_predictor_4x8 neon ssse3/; +specialize qw/aom_smooth_predictor_4x16 neon ssse3/; +specialize qw/aom_smooth_predictor_8x4 neon ssse3/; +specialize qw/aom_smooth_predictor_8x8 neon ssse3/; 
+specialize qw/aom_smooth_predictor_8x16 neon ssse3/; +specialize qw/aom_smooth_predictor_8x32 neon ssse3/; +specialize qw/aom_smooth_predictor_16x4 neon ssse3/; +specialize qw/aom_smooth_predictor_16x8 neon ssse3/; +specialize qw/aom_smooth_predictor_16x16 neon ssse3/; +specialize qw/aom_smooth_predictor_16x32 neon ssse3/; +specialize qw/aom_smooth_predictor_16x64 neon ssse3/; +specialize qw/aom_smooth_predictor_32x8 neon ssse3/; +specialize qw/aom_smooth_predictor_32x16 neon ssse3/; +specialize qw/aom_smooth_predictor_32x32 neon ssse3/; +specialize qw/aom_smooth_predictor_32x64 neon ssse3/; +specialize qw/aom_smooth_predictor_64x16 neon ssse3/; +specialize qw/aom_smooth_predictor_64x32 neon ssse3/; +specialize qw/aom_smooth_predictor_64x64 neon ssse3/; + +specialize qw/aom_smooth_v_predictor_4x4 neon ssse3/; +specialize qw/aom_smooth_v_predictor_4x8 neon ssse3/; +specialize qw/aom_smooth_v_predictor_4x16 neon ssse3/; +specialize qw/aom_smooth_v_predictor_8x4 neon ssse3/; +specialize qw/aom_smooth_v_predictor_8x8 neon ssse3/; +specialize qw/aom_smooth_v_predictor_8x16 neon ssse3/; +specialize qw/aom_smooth_v_predictor_8x32 neon ssse3/; +specialize qw/aom_smooth_v_predictor_16x4 neon ssse3/; +specialize qw/aom_smooth_v_predictor_16x8 neon ssse3/; +specialize qw/aom_smooth_v_predictor_16x16 neon ssse3/; +specialize qw/aom_smooth_v_predictor_16x32 neon ssse3/; +specialize qw/aom_smooth_v_predictor_16x64 neon ssse3/; +specialize qw/aom_smooth_v_predictor_32x8 neon ssse3/; +specialize qw/aom_smooth_v_predictor_32x16 neon ssse3/; +specialize qw/aom_smooth_v_predictor_32x32 neon ssse3/; +specialize qw/aom_smooth_v_predictor_32x64 neon ssse3/; +specialize qw/aom_smooth_v_predictor_64x16 neon ssse3/; +specialize qw/aom_smooth_v_predictor_64x32 neon ssse3/; +specialize qw/aom_smooth_v_predictor_64x64 neon ssse3/; + +specialize qw/aom_smooth_h_predictor_4x4 neon ssse3/; +specialize qw/aom_smooth_h_predictor_4x8 neon ssse3/; +specialize qw/aom_smooth_h_predictor_4x16 neon ssse3/; +specialize qw/aom_smooth_h_predictor_8x4 neon ssse3/; +specialize qw/aom_smooth_h_predictor_8x8 neon ssse3/; +specialize qw/aom_smooth_h_predictor_8x16 neon ssse3/; +specialize qw/aom_smooth_h_predictor_8x32 neon ssse3/; +specialize qw/aom_smooth_h_predictor_16x4 neon ssse3/; +specialize qw/aom_smooth_h_predictor_16x8 neon ssse3/; +specialize qw/aom_smooth_h_predictor_16x16 neon ssse3/; +specialize qw/aom_smooth_h_predictor_16x32 neon ssse3/; +specialize qw/aom_smooth_h_predictor_16x64 neon ssse3/; +specialize qw/aom_smooth_h_predictor_32x8 neon ssse3/; +specialize qw/aom_smooth_h_predictor_32x16 neon ssse3/; +specialize qw/aom_smooth_h_predictor_32x32 neon ssse3/; +specialize qw/aom_smooth_h_predictor_32x64 neon ssse3/; +specialize qw/aom_smooth_h_predictor_64x16 neon ssse3/; +specialize qw/aom_smooth_h_predictor_64x32 neon ssse3/; +specialize qw/aom_smooth_h_predictor_64x64 neon ssse3/; + +# TODO(yunqingwang): optimize rectangular DC_PRED to replace division +# by multiply and shift. 
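+# As a sketch of that optimization (not something this file implements), a
+# division by a fixed sample count N can be folded into a multiply and a
+# shift: x / N == (x * M) >> S with M = ceil(2^S / N), as long as x is small
+# enough. For the 4x8 block the DC divisor is 12, and with S = 17, M = 10923,
+# (x * 10923) >> 17 matches x / 12 for every 8-bit sum that can occur there.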
+specialize qw/aom_dc_predictor_4x4 neon sse2/; +specialize qw/aom_dc_predictor_4x8 neon sse2/; +specialize qw/aom_dc_predictor_4x16 neon sse2/; +specialize qw/aom_dc_predictor_8x4 neon sse2/; +specialize qw/aom_dc_predictor_8x8 neon sse2/; +specialize qw/aom_dc_predictor_8x16 neon sse2/; +specialize qw/aom_dc_predictor_8x32 neon sse2/; +specialize qw/aom_dc_predictor_16x4 neon sse2/; +specialize qw/aom_dc_predictor_16x8 neon sse2/; +specialize qw/aom_dc_predictor_16x16 neon sse2/; +specialize qw/aom_dc_predictor_16x32 neon sse2/; +specialize qw/aom_dc_predictor_16x64 neon sse2/; +specialize qw/aom_dc_predictor_32x8 neon sse2/; +specialize qw/aom_dc_predictor_32x16 neon sse2 avx2/; +specialize qw/aom_dc_predictor_32x32 neon sse2 avx2/; +specialize qw/aom_dc_predictor_32x64 neon sse2 avx2/; +specialize qw/aom_dc_predictor_64x64 neon sse2 avx2/; +specialize qw/aom_dc_predictor_64x32 neon sse2 avx2/; +specialize qw/aom_dc_predictor_64x16 neon sse2 avx2/; +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + specialize qw/aom_highbd_v_predictor_4x4 sse2 neon/; + specialize qw/aom_highbd_v_predictor_4x8 sse2 neon/; + specialize qw/aom_highbd_v_predictor_4x16 neon/; + specialize qw/aom_highbd_v_predictor_8x4 sse2 neon/; + specialize qw/aom_highbd_v_predictor_8x8 sse2 neon/; + specialize qw/aom_highbd_v_predictor_8x16 sse2 neon/; + specialize qw/aom_highbd_v_predictor_8x32 neon/; + specialize qw/aom_highbd_v_predictor_16x4 neon/; + specialize qw/aom_highbd_v_predictor_16x8 sse2 neon/; + specialize qw/aom_highbd_v_predictor_16x16 sse2 neon/; + specialize qw/aom_highbd_v_predictor_16x32 sse2 neon/; + specialize qw/aom_highbd_v_predictor_16x64 neon/; + specialize qw/aom_highbd_v_predictor_32x8 neon/; + specialize qw/aom_highbd_v_predictor_32x16 sse2 neon/; + specialize qw/aom_highbd_v_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_v_predictor_32x64 neon/; + specialize qw/aom_highbd_v_predictor_64x16 neon/; + specialize qw/aom_highbd_v_predictor_64x32 neon/; + specialize qw/aom_highbd_v_predictor_64x64 neon/; + + # TODO(yunqingwang): optimize rectangular DC_PRED to replace division + # by multiply and shift. 
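+  # Each specialize line below feeds the generated config/aom_dsp_rtcd.h: with
+  # runtime CPU detection the generator emits a function pointer per prototype
+  # and a setup routine that picks the best available variant, roughly (a
+  # sketch, not a verbatim excerpt of the generated header):
+  #   aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_c;
+  #   if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_sse2;
+  # with an analogous HAS_NEON check on Arm builds.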
+ specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_4x8 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_4x16 neon/; + specialize qw/aom_highbd_dc_predictor_8x4 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_8x16 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_8x32 neon/; + specialize qw/aom_highbd_dc_predictor_16x4 neon/; + specialize qw/aom_highbd_dc_predictor_16x8 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_16x32 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_16x64 neon/; + specialize qw/aom_highbd_dc_predictor_32x8 neon/; + specialize qw/aom_highbd_dc_predictor_32x16 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_32x64 neon/; + specialize qw/aom_highbd_dc_predictor_64x16 neon/; + specialize qw/aom_highbd_dc_predictor_64x32 neon/; + specialize qw/aom_highbd_dc_predictor_64x64 neon/; + + specialize qw/aom_highbd_h_predictor_4x4 sse2 neon/; + specialize qw/aom_highbd_h_predictor_4x8 sse2 neon/; + specialize qw/aom_highbd_h_predictor_4x16 neon/; + specialize qw/aom_highbd_h_predictor_8x4 sse2 neon/; + specialize qw/aom_highbd_h_predictor_8x8 sse2 neon/; + specialize qw/aom_highbd_h_predictor_8x16 sse2 neon/; + specialize qw/aom_highbd_h_predictor_8x32 neon/; + specialize qw/aom_highbd_h_predictor_16x4 neon/; + specialize qw/aom_highbd_h_predictor_16x8 sse2 neon/; + specialize qw/aom_highbd_h_predictor_16x16 sse2 neon/; + specialize qw/aom_highbd_h_predictor_16x32 sse2 neon/; + specialize qw/aom_highbd_h_predictor_16x64 neon/; + specialize qw/aom_highbd_h_predictor_32x8 neon/; + specialize qw/aom_highbd_h_predictor_32x16 sse2 neon/; + specialize qw/aom_highbd_h_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_h_predictor_32x64 neon/; + specialize qw/aom_highbd_h_predictor_64x16 neon/; + specialize qw/aom_highbd_h_predictor_64x32 neon/; + specialize qw/aom_highbd_h_predictor_64x64 neon/; + + specialize qw/aom_highbd_dc_128_predictor_4x4 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_4x8 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_4x16 neon/; + specialize qw/aom_highbd_dc_128_predictor_8x4 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_8x8 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_8x16 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_8x32 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x4 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x8 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x16 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x32 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_16x64 neon/; + specialize qw/aom_highbd_dc_128_predictor_32x8 neon/; + specialize qw/aom_highbd_dc_128_predictor_32x16 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_dc_128_predictor_32x64 neon/; + specialize qw/aom_highbd_dc_128_predictor_64x16 neon/; + specialize qw/aom_highbd_dc_128_predictor_64x32 neon/; + specialize qw/aom_highbd_dc_128_predictor_64x64 neon/; + + specialize qw/aom_highbd_dc_left_predictor_4x4 sse2 neon/; + specialize qw/aom_highbd_dc_left_predictor_4x8 sse2 neon/; + specialize qw/aom_highbd_dc_left_predictor_4x16 neon/; + specialize qw/aom_highbd_dc_left_predictor_8x4 sse2 neon/; + specialize qw/aom_highbd_dc_left_predictor_8x8 sse2 neon/; + specialize qw/aom_highbd_dc_left_predictor_8x16 sse2 neon/; 
+ specialize qw/aom_highbd_dc_left_predictor_8x32 neon/; + specialize qw/aom_highbd_dc_left_predictor_16x4 neon/; + specialize qw/aom_highbd_dc_left_predictor_16x8 sse2 neon/; + specialize qw/aom_highbd_dc_left_predictor_16x16 sse2 neon/; + specialize qw/aom_highbd_dc_left_predictor_16x32 sse2 neon/; + specialize qw/aom_highbd_dc_left_predictor_16x64 neon/; + specialize qw/aom_highbd_dc_left_predictor_32x8 neon/; + specialize qw/aom_highbd_dc_left_predictor_32x16 sse2 neon/; + specialize qw/aom_highbd_dc_left_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_dc_left_predictor_32x64 neon/; + specialize qw/aom_highbd_dc_left_predictor_64x16 neon/; + specialize qw/aom_highbd_dc_left_predictor_64x32 neon/; + specialize qw/aom_highbd_dc_left_predictor_64x64 neon/; + + specialize qw/aom_highbd_dc_top_predictor_4x4 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_4x8 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_4x16 neon/; + specialize qw/aom_highbd_dc_top_predictor_8x4 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_8x8 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_8x16 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_8x32 neon/; + specialize qw/aom_highbd_dc_top_predictor_16x4 neon/; + specialize qw/aom_highbd_dc_top_predictor_16x8 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_16x16 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_16x32 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_16x64 neon/; + specialize qw/aom_highbd_dc_top_predictor_32x8 neon/; + specialize qw/aom_highbd_dc_top_predictor_32x16 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_dc_top_predictor_32x64 neon/; + specialize qw/aom_highbd_dc_top_predictor_64x16 neon/; + specialize qw/aom_highbd_dc_top_predictor_64x32 neon/; + specialize qw/aom_highbd_dc_top_predictor_64x64 neon/; + + specialize qw/aom_highbd_paeth_predictor_4x4 neon/; + specialize qw/aom_highbd_paeth_predictor_4x8 neon/; + specialize qw/aom_highbd_paeth_predictor_4x16 neon/; + specialize qw/aom_highbd_paeth_predictor_8x4 neon/; + specialize qw/aom_highbd_paeth_predictor_8x8 neon/; + specialize qw/aom_highbd_paeth_predictor_8x16 neon/; + specialize qw/aom_highbd_paeth_predictor_8x32 neon/; + specialize qw/aom_highbd_paeth_predictor_16x4 neon/; + specialize qw/aom_highbd_paeth_predictor_16x8 neon/; + specialize qw/aom_highbd_paeth_predictor_16x16 neon/; + specialize qw/aom_highbd_paeth_predictor_16x32 neon/; + specialize qw/aom_highbd_paeth_predictor_16x64 neon/; + specialize qw/aom_highbd_paeth_predictor_32x8 neon/; + specialize qw/aom_highbd_paeth_predictor_32x16 neon/; + specialize qw/aom_highbd_paeth_predictor_32x32 neon/; + specialize qw/aom_highbd_paeth_predictor_32x64 neon/; + specialize qw/aom_highbd_paeth_predictor_64x16 neon/; + specialize qw/aom_highbd_paeth_predictor_64x32 neon/; + specialize qw/aom_highbd_paeth_predictor_64x64 neon/; + + specialize qw/aom_highbd_smooth_predictor_4x4 neon/; + specialize qw/aom_highbd_smooth_predictor_4x8 neon/; + specialize qw/aom_highbd_smooth_predictor_4x16 neon/; + specialize qw/aom_highbd_smooth_predictor_8x4 neon/; + specialize qw/aom_highbd_smooth_predictor_8x8 neon/; + specialize qw/aom_highbd_smooth_predictor_8x16 neon/; + specialize qw/aom_highbd_smooth_predictor_8x32 neon/; + specialize qw/aom_highbd_smooth_predictor_16x4 neon/; + specialize qw/aom_highbd_smooth_predictor_16x8 neon/; + specialize qw/aom_highbd_smooth_predictor_16x16 neon/; + specialize 
qw/aom_highbd_smooth_predictor_16x32 neon/; + specialize qw/aom_highbd_smooth_predictor_16x64 neon/; + specialize qw/aom_highbd_smooth_predictor_32x8 neon/; + specialize qw/aom_highbd_smooth_predictor_32x16 neon/; + specialize qw/aom_highbd_smooth_predictor_32x32 neon/; + specialize qw/aom_highbd_smooth_predictor_32x64 neon/; + specialize qw/aom_highbd_smooth_predictor_64x16 neon/; + specialize qw/aom_highbd_smooth_predictor_64x32 neon/; + specialize qw/aom_highbd_smooth_predictor_64x64 neon/; + + specialize qw/aom_highbd_smooth_v_predictor_4x4 neon/; + specialize qw/aom_highbd_smooth_v_predictor_4x8 neon/; + specialize qw/aom_highbd_smooth_v_predictor_4x16 neon/; + specialize qw/aom_highbd_smooth_v_predictor_8x4 neon/; + specialize qw/aom_highbd_smooth_v_predictor_8x8 neon/; + specialize qw/aom_highbd_smooth_v_predictor_8x16 neon/; + specialize qw/aom_highbd_smooth_v_predictor_8x32 neon/; + specialize qw/aom_highbd_smooth_v_predictor_16x4 neon/; + specialize qw/aom_highbd_smooth_v_predictor_16x8 neon/; + specialize qw/aom_highbd_smooth_v_predictor_16x16 neon/; + specialize qw/aom_highbd_smooth_v_predictor_16x32 neon/; + specialize qw/aom_highbd_smooth_v_predictor_16x64 neon/; + specialize qw/aom_highbd_smooth_v_predictor_32x8 neon/; + specialize qw/aom_highbd_smooth_v_predictor_32x16 neon/; + specialize qw/aom_highbd_smooth_v_predictor_32x32 neon/; + specialize qw/aom_highbd_smooth_v_predictor_32x64 neon/; + specialize qw/aom_highbd_smooth_v_predictor_64x16 neon/; + specialize qw/aom_highbd_smooth_v_predictor_64x32 neon/; + specialize qw/aom_highbd_smooth_v_predictor_64x64 neon/; + + specialize qw/aom_highbd_smooth_h_predictor_4x4 neon/; + specialize qw/aom_highbd_smooth_h_predictor_4x8 neon/; + specialize qw/aom_highbd_smooth_h_predictor_4x16 neon/; + specialize qw/aom_highbd_smooth_h_predictor_8x4 neon/; + specialize qw/aom_highbd_smooth_h_predictor_8x8 neon/; + specialize qw/aom_highbd_smooth_h_predictor_8x16 neon/; + specialize qw/aom_highbd_smooth_h_predictor_8x32 neon/; + specialize qw/aom_highbd_smooth_h_predictor_16x4 neon/; + specialize qw/aom_highbd_smooth_h_predictor_16x8 neon/; + specialize qw/aom_highbd_smooth_h_predictor_16x16 neon/; + specialize qw/aom_highbd_smooth_h_predictor_16x32 neon/; + specialize qw/aom_highbd_smooth_h_predictor_16x64 neon/; + specialize qw/aom_highbd_smooth_h_predictor_32x8 neon/; + specialize qw/aom_highbd_smooth_h_predictor_32x16 neon/; + specialize qw/aom_highbd_smooth_h_predictor_32x32 neon/; + specialize qw/aom_highbd_smooth_h_predictor_32x64 neon/; + specialize qw/aom_highbd_smooth_h_predictor_64x16 neon/; + specialize qw/aom_highbd_smooth_h_predictor_64x32 neon/; + specialize qw/aom_highbd_smooth_h_predictor_64x64 neon/; +} +# +# Sub Pixel Filters +# +add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h"; +add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + +specialize qw/aom_convolve_copy 
neon sse2 avx2/; +specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3"; +specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3"; + +add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +specialize qw/aom_scaled_2d ssse3 neon/; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h"; + specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/; + + add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; + specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/; + + add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; + specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/; +} + +# +# Loopfilter +# +add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_14 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_14_dual sse2 neon/; + +add_proto qw/void aom_lpf_vertical_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; +specialize qw/aom_lpf_vertical_14_quad avx2 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_6 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_8 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_8_dual sse2 neon/; + +add_proto qw/void aom_lpf_vertical_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; +specialize qw/aom_lpf_vertical_8_quad sse2 neon/; + +add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_4 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_4_dual sse2 neon/; + +add_proto qw/void aom_lpf_vertical_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; +specialize qw/aom_lpf_vertical_4_quad sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const 
uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_14 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_14_dual sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; +specialize qw/aom_lpf_horizontal_14_quad sse2 avx2 neon/; + +add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_6 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_6_dual sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; +specialize qw/aom_lpf_horizontal_6_quad sse2 avx2 neon/; + +add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_8 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_8_dual sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; +specialize qw/aom_lpf_horizontal_8_quad sse2 avx2 neon/; + +add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_4 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_4_dual sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; +specialize qw/aom_lpf_horizontal_4_quad sse2 neon/; + +add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_6_dual sse2 neon/; + +add_proto qw/void aom_lpf_vertical_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0"; +specialize qw/aom_lpf_vertical_6_quad sse2 neon/; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_14 neon sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_14_dual neon sse2 avx2/; + + add_proto qw/void 
aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_8 neon sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_8_dual neon sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_6 neon sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_6_dual neon sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_4 neon sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_4_dual neon sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_14 neon sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd"; + specialize qw/aom_highbd_lpf_horizontal_14_dual neon sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_6 neon sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_horizontal_6_dual neon sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_8 neon sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_horizontal_8_dual neon sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_4 neon sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_horizontal_4_dual neon sse2 avx2/; +} + +# +# Encoder functions. 
+# + +# +# Forward transform +# +if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){ + add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/aom_fdct4x4 neon sse2/; + + add_proto qw/void aom_fdct4x4_lp/, "const int16_t *input, int16_t *output, int stride"; + specialize qw/aom_fdct4x4_lp neon sse2/; + + if (aom_config("CONFIG_INTERNAL_STATS") eq "yes"){ + # 8x8 DCT transform for psnr-hvs. Unlike other transforms isn't compatible + # with av1 scan orders, because it does two transposes. + add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/aom_fdct8x8 neon sse2/, "$ssse3_x86_64"; + # High bit depth + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/aom_highbd_fdct8x8 sse2/; + } + } + # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation) + add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output"; + + add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft4x4_float sse2/; + + add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft8x8_float avx2 sse2/; + + add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft16x16_float avx2 sse2/; + + add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft32x32_float avx2 sse2/; + + add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output"; + + add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft4x4_float sse2/; + + add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft8x8_float avx2 sse2/; + + add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft16x16_float avx2 sse2/; + + add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft32x32_float avx2 sse2/; +} # CONFIG_AV1_ENCODER + +# +# Quantization +# +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b sse2 neon avx avx2/, "$ssse3_x86_64"; + + add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_32x32 neon avx avx2/, "$ssse3_x86_64"; + + add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize 
qw/aom_quantize_b_64x64 neon ssse3 avx2/; + + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_adaptive sse2 avx2/; + + add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_32x32_adaptive sse2/; + + add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_64x64_adaptive sse2/; + } +} # CONFIG_AV1_ENCODER + +if (aom_config("CONFIG_AV1_ENCODER") eq "yes" && aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b sse2 avx2 neon/; + + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32 sse2 avx2 neon/; + + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_64x64 sse2 avx2 neon/; + + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_adaptive sse2 avx2 neon/; + + add_proto qw/void aom_highbd_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32_adaptive sse2 avx2 neon/; + + 
add_proto qw/void aom_highbd_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_64x64_adaptive sse2 neon/; + } +} # CONFIG_AV1_ENCODER + +# +# Alpha blending with mask +# +add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params"; +specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/; +add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh"; +add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; +add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; +specialize "aom_blend_a64_mask", qw/sse4_1 neon avx2/; +specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; +specialize "aom_blend_a64_vmask", qw/sse4_1 neon/; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd"; + add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; + add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; + add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd"; + specialize "aom_highbd_blend_a64_mask", qw/sse4_1 neon/; + specialize "aom_highbd_blend_a64_hmask", qw/sse4_1 neon/; + specialize "aom_highbd_blend_a64_vmask", qw/sse4_1 neon/; + specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 neon avx2/; +} + +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + # + # Block subtraction + # + add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; + specialize qw/aom_subtract_block neon sse2 avx2/; + + add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; + specialize qw/aom_sse sse4_1 avx2 neon neon_dotprod/; + + add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int 
*x_sum, int64_t *x2_sum"; + specialize qw/aom_get_blk_sse_sum sse2 avx2 neon sve/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; + specialize qw/aom_highbd_subtract_block sse2 neon/; + + add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; + specialize qw/aom_highbd_sse sse4_1 avx2 neon sve/; + } + + # + # Sum of Squares + # + add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; + specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon sve/; + + add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; + specialize qw/aom_sum_squares_i16 sse2 neon sve/; + + add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height"; + specialize qw/aom_var_2d_u8 sse2 avx2 neon neon_dotprod/; + + add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height"; + specialize qw/aom_var_2d_u16 sse2 avx2 neon sve/; + + # + # Single block SAD / Single block Avg SAD + # + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param"; + } + + add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum"; + specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2 sve/; + specialize qw/aom_sad128x128 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad128x64 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x128 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x64 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x32 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x64 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x32 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x16 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x32 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x16 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x8 sse2 neon neon_dotprod/; + specialize qw/aom_sad8x16 sse2 neon/; + specialize qw/aom_sad8x8 sse2 neon/; + specialize qw/aom_sad8x4 sse2 neon/; + specialize qw/aom_sad4x8 sse2 neon/; + specialize qw/aom_sad4x4 sse2 neon/; + + specialize qw/aom_sad4x16 sse2 neon/; + specialize qw/aom_sad16x4 sse2 neon neon_dotprod/; + specialize qw/aom_sad8x32 sse2 neon/; + specialize qw/aom_sad32x8 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x64 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x16 sse2 neon neon_dotprod/; + + specialize qw/aom_sad_skip_128x128 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_128x64 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_64x128 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_64x64 avx2 sse2 neon 
neon_dotprod/; + specialize qw/aom_sad_skip_64x32 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_32x64 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_32x32 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_32x16 avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_16x32 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_16x16 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_16x8 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_8x16 sse2 neon/; + specialize qw/aom_sad_skip_8x8 sse2 neon/; + specialize qw/aom_sad_skip_8x4 neon/; + specialize qw/aom_sad_skip_4x8 sse2 neon/; + specialize qw/aom_sad_skip_4x4 neon/; + + specialize qw/aom_sad_skip_4x16 sse2 neon/; + specialize qw/aom_sad_skip_16x4 neon neon_dotprod/; + specialize qw/aom_sad_skip_8x32 sse2 neon/; + specialize qw/aom_sad_skip_32x8 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_16x64 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_64x16 sse2 neon neon_dotprod/; + + specialize qw/aom_sad128x128_avg avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad128x64_avg avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x128_avg avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x64_avg avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x32_avg avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x64_avg avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x32_avg avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x16_avg avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x32_avg sse2 neon neon_dotprod/; + specialize qw/aom_sad16x16_avg sse2 neon neon_dotprod/; + specialize qw/aom_sad16x8_avg sse2 neon neon_dotprod/; + specialize qw/aom_sad8x16_avg sse2 neon/; + specialize qw/aom_sad8x8_avg sse2 neon/; + specialize qw/aom_sad8x4_avg sse2 neon/; + specialize qw/aom_sad4x8_avg sse2 neon/; + specialize qw/aom_sad4x4_avg sse2 neon/; + + specialize qw/aom_sad4x16_avg sse2 neon/; + specialize qw/aom_sad16x4_avg sse2 neon neon_dotprod/; + specialize qw/aom_sad8x32_avg sse2 neon/; + specialize qw/aom_sad32x8_avg sse2 neon neon_dotprod/; + specialize qw/aom_sad16x64_avg sse2 neon neon_dotprod/; + specialize qw/aom_sad64x16_avg sse2 neon neon_dotprod/; + + specialize qw/aom_dist_wtd_sad128x128_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad128x64_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad64x128_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad64x64_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad64x32_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad32x64_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad32x32_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad32x16_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad16x32_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad16x16_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad16x8_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad8x16_avg sse2 neon/; + specialize qw/aom_dist_wtd_sad8x8_avg sse2 neon/; + specialize qw/aom_dist_wtd_sad8x4_avg sse2 neon/; + specialize qw/aom_dist_wtd_sad4x8_avg sse2 neon/; + specialize qw/aom_dist_wtd_sad4x4_avg sse2 neon/; + + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + specialize qw/aom_dist_wtd_sad4x16_avg sse2 neon/; + specialize qw/aom_dist_wtd_sad16x4_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad8x32_avg sse2 neon/; + specialize qw/aom_dist_wtd_sad32x8_avg sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad16x64_avg 
sse2 neon neon_dotprod/; + specialize qw/aom_dist_wtd_sad64x16_avg sse2 neon neon_dotprod/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_highbd_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_sad${w}x${h}", qw/sse2/; + specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; + } + add_proto qw/unsigned int/, "aom_highbd_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param"; + } + specialize qw/aom_highbd_sad128x128 avx2 neon/; + specialize qw/aom_highbd_sad128x64 avx2 neon/; + specialize qw/aom_highbd_sad64x128 avx2 neon/; + specialize qw/aom_highbd_sad64x64 avx2 sse2 neon/; + specialize qw/aom_highbd_sad64x32 avx2 sse2 neon/; + specialize qw/aom_highbd_sad32x64 avx2 sse2 neon/; + specialize qw/aom_highbd_sad32x32 avx2 sse2 neon/; + specialize qw/aom_highbd_sad32x16 avx2 sse2 neon/; + specialize qw/aom_highbd_sad16x32 avx2 sse2 neon/; + specialize qw/aom_highbd_sad16x16 avx2 sse2 neon/; + specialize qw/aom_highbd_sad16x8 avx2 sse2 neon/; + specialize qw/aom_highbd_sad8x16 sse2 neon/; + specialize qw/aom_highbd_sad8x8 sse2 neon/; + specialize qw/aom_highbd_sad8x4 sse2 neon/; + specialize qw/aom_highbd_sad4x8 sse2 neon/; + specialize qw/aom_highbd_sad4x4 sse2 neon/; + + specialize qw/aom_highbd_sad4x16 sse2 neon/; + specialize qw/aom_highbd_sad16x4 avx2 sse2 neon/; + specialize qw/aom_highbd_sad8x32 sse2 neon/; + specialize qw/aom_highbd_sad32x8 avx2 sse2 neon/; + specialize qw/aom_highbd_sad16x64 avx2 sse2 neon/; + specialize qw/aom_highbd_sad64x16 avx2 sse2 neon/; + + specialize qw/aom_highbd_sad_skip_128x128 avx2 neon/; + specialize qw/aom_highbd_sad_skip_128x64 avx2 neon/; + specialize qw/aom_highbd_sad_skip_64x128 avx2 neon/; + specialize qw/aom_highbd_sad_skip_64x64 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_64x32 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_32x64 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_32x32 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_32x16 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x32 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x16 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x8 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x4 neon/; + specialize qw/aom_highbd_sad_skip_8x16 sse2 neon/; + specialize qw/aom_highbd_sad_skip_8x4 neon/; + specialize qw/aom_highbd_sad_skip_8x8 sse2 neon/; + specialize qw/aom_highbd_sad_skip_4x8 sse2 neon/; + specialize qw/aom_highbd_sad_skip_4x4 neon/; + + specialize qw/aom_highbd_sad_skip_4x16 sse2 neon/; + specialize qw/aom_highbd_sad_skip_8x32 sse2 neon/; + specialize qw/aom_highbd_sad_skip_32x8 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x64 avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_64x16 avx2 sse2 neon/; + + specialize qw/aom_highbd_sad128x128_avg avx2 neon/; + specialize qw/aom_highbd_sad128x64_avg avx2 neon/; + specialize qw/aom_highbd_sad64x128_avg avx2 neon/; + specialize 
qw/aom_highbd_sad64x64_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad64x32_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad32x64_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad32x32_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad32x16_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad16x32_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad16x16_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad16x8_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad8x16_avg neon/; + specialize qw/aom_highbd_sad8x8_avg neon/; + specialize qw/aom_highbd_sad8x4_avg sse2 neon/; + specialize qw/aom_highbd_sad4x8_avg sse2 neon/; + specialize qw/aom_highbd_sad4x4_avg sse2 neon/; + + specialize qw/aom_highbd_sad4x16_avg sse2 neon/; + specialize qw/aom_highbd_sad8x32_avg sse2 neon/; + specialize qw/aom_highbd_sad16x4_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad16x64_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad32x8_avg avx2 sse2 neon/; + specialize qw/aom_highbd_sad64x16_avg avx2 sse2 neon/; + } + # + # Masked SAD + # + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; + specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2 neon/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask"; + specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2 neon/; + } + } + + # + # OBMC SAD + # + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/; + } + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! 
(($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/; + } + } + } + } + + # + # Multi-block SAD, comparing a reference to N independent blocks + # + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void/, "aom_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]"; + } + + specialize qw/aom_sad128x128x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad128x64x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x128x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x64x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad64x32x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x64x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x32x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x16x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x32x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x16x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x8x4d avx2 sse2 neon neon_dotprod/; + + specialize qw/aom_sad8x16x4d sse2 neon/; + specialize qw/aom_sad8x8x4d sse2 neon/; + specialize qw/aom_sad8x4x4d sse2 neon/; + specialize qw/aom_sad4x8x4d sse2 neon/; + specialize qw/aom_sad4x4x4d sse2 neon/; + + specialize qw/aom_sad64x16x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad32x8x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x64x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad16x4x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad8x32x4d sse2 neon/; + specialize qw/aom_sad4x16x4d sse2 neon/; + + specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_128x64x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_64x128x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_64x64x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_64x32x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_64x16x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_32x64x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_32x32x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_32x8x4d avx2 sse2 neon neon_dotprod/; + + specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon neon_dotprod/; + specialize qw/aom_sad_skip_16x4x4d neon neon_dotprod/; + specialize qw/aom_sad_skip_8x32x4d sse2 neon/; + specialize qw/aom_sad_skip_8x16x4d sse2 neon/; + specialize qw/aom_sad_skip_8x8x4d sse2 neon/; + specialize qw/aom_sad_skip_8x4x4d neon/; + specialize qw/aom_sad_skip_4x16x4d sse2 neon/; + specialize qw/aom_sad_skip_4x8x4d sse2 neon/; + specialize 
qw/aom_sad_skip_4x4x4d neon/; + + specialize qw/aom_sad128x128x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad128x64x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad64x128x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad64x64x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad64x32x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad32x64x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad32x32x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad32x16x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad16x32x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad16x16x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad16x8x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad8x16x3d neon/; + specialize qw/aom_sad8x8x3d neon/; + specialize qw/aom_sad8x4x3d neon/; + specialize qw/aom_sad4x8x3d neon/; + specialize qw/aom_sad4x4x3d neon/; + + specialize qw/aom_sad64x16x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad32x8x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad16x64x3d avx2 neon neon_dotprod/; + specialize qw/aom_sad16x4x3d neon neon_dotprod/; + specialize qw/aom_sad8x32x3d neon/; + specialize qw/aom_sad4x16x3d neon/; + + specialize qw/aom_masked_sad128x128x4d ssse3 neon/; + specialize qw/aom_masked_sad128x64x4d ssse3 neon/; + specialize qw/aom_masked_sad64x128x4d ssse3 neon/; + specialize qw/aom_masked_sad64x64x4d ssse3 neon/; + specialize qw/aom_masked_sad64x32x4d ssse3 neon/; + specialize qw/aom_masked_sad64x16x4d ssse3 neon/; + specialize qw/aom_masked_sad32x64x4d ssse3 neon/; + specialize qw/aom_masked_sad32x32x4d ssse3 neon/; + specialize qw/aom_masked_sad32x16x4d ssse3 neon/; + specialize qw/aom_masked_sad32x8x4d ssse3 neon/; + specialize qw/aom_masked_sad16x64x4d ssse3 neon/; + specialize qw/aom_masked_sad16x32x4d ssse3 neon/; + specialize qw/aom_masked_sad16x16x4d ssse3 neon/; + specialize qw/aom_masked_sad16x8x4d ssse3 neon/; + + specialize qw/aom_masked_sad8x16x4d ssse3 neon/; + specialize qw/aom_masked_sad8x8x4d ssse3 neon/; + specialize qw/aom_masked_sad8x4x4d ssse3 neon/; + specialize qw/aom_masked_sad4x16x4d ssse3 neon/; + specialize qw/aom_masked_sad4x8x4d ssse3 neon/; + specialize qw/aom_masked_sad4x4x4d ssse3 neon/; + + specialize qw/aom_masked_sad4x16x4d ssse3 neon/; + specialize qw/aom_masked_sad16x4x4d ssse3 neon/; + specialize qw/aom_masked_sad8x32x4d ssse3 neon/; + specialize qw/aom_masked_sad32x8x4d ssse3 neon/; + specialize qw/aom_masked_sad64x16x4d ssse3 neon/; + # + # Multi-block SAD, comparing a reference to N independent blocks + # + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]"; + if ($w != 128 && $h != 128) { + specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; + } + } + specialize qw/aom_highbd_sad128x128x4d avx2 neon/; + specialize qw/aom_highbd_sad128x64x4d avx2 neon/; + specialize qw/aom_highbd_sad64x128x4d avx2 neon/; + specialize qw/aom_highbd_sad64x64x4d sse2 avx2 neon/; + specialize qw/aom_highbd_sad64x32x4d sse2 avx2 neon/; + specialize qw/aom_highbd_sad32x64x4d sse2 avx2 neon/; + 
specialize qw/aom_highbd_sad32x32x4d sse2 avx2 neon/; + specialize qw/aom_highbd_sad32x16x4d sse2 avx2 neon/; + specialize qw/aom_highbd_sad16x32x4d sse2 avx2 neon/; + specialize qw/aom_highbd_sad16x16x4d sse2 avx2 neon/; + specialize qw/aom_highbd_sad16x8x4d sse2 avx2 neon/; + specialize qw/aom_highbd_sad8x16x4d sse2 neon/; + specialize qw/aom_highbd_sad8x8x4d sse2 neon/; + specialize qw/aom_highbd_sad8x4x4d sse2 neon/; + specialize qw/aom_highbd_sad4x8x4d sse2 neon/; + specialize qw/aom_highbd_sad4x4x4d sse2 neon/; + + specialize qw/aom_highbd_sad4x16x4d sse2 neon/; + specialize qw/aom_highbd_sad16x4x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad8x32x4d sse2 neon/; + specialize qw/aom_highbd_sad32x8x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad16x64x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad64x16x4d avx2 sse2 neon/; + + specialize qw/aom_highbd_sad_skip_128x128x4d avx2 neon/; + specialize qw/aom_highbd_sad_skip_128x64x4d avx2 neon/; + specialize qw/aom_highbd_sad_skip_64x128x4d avx2 neon/; + specialize qw/aom_highbd_sad_skip_64x64x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_64x32x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_32x64x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_32x32x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_32x16x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x32x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x16x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x8x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x4x4d neon/; + specialize qw/aom_highbd_sad_skip_8x16x4d sse2 neon/; + specialize qw/aom_highbd_sad_skip_8x8x4d sse2 neon/; + specialize qw/aom_highbd_sad_skip_8x4x4d neon/; + specialize qw/aom_highbd_sad_skip_4x8x4d sse2 neon/; + specialize qw/aom_highbd_sad_skip_4x4x4d neon/; + + specialize qw/aom_highbd_sad_skip_4x16x4d sse2 neon/; + specialize qw/aom_highbd_sad_skip_8x32x4d sse2 neon/; + specialize qw/aom_highbd_sad_skip_32x8x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_16x64x4d avx2 sse2 neon/; + specialize qw/aom_highbd_sad_skip_64x16x4d avx2 sse2 neon/; + + specialize qw/aom_highbd_sad128x128x3d avx2 neon/; + specialize qw/aom_highbd_sad128x64x3d avx2 neon/; + specialize qw/aom_highbd_sad64x128x3d avx2 neon/; + specialize qw/aom_highbd_sad64x64x3d avx2 neon/; + specialize qw/aom_highbd_sad64x32x3d avx2 neon/; + specialize qw/aom_highbd_sad32x64x3d avx2 neon/; + specialize qw/aom_highbd_sad32x32x3d avx2 neon/; + specialize qw/aom_highbd_sad32x16x3d avx2 neon/; + specialize qw/aom_highbd_sad16x32x3d avx2 neon/; + specialize qw/aom_highbd_sad16x16x3d avx2 neon/; + specialize qw/aom_highbd_sad16x8x3d avx2 neon/; + specialize qw/aom_highbd_sad8x16x3d neon/; + specialize qw/aom_highbd_sad8x8x3d neon/; + specialize qw/aom_highbd_sad8x4x3d neon/; + specialize qw/aom_highbd_sad4x8x3d neon/; + specialize qw/aom_highbd_sad4x4x3d neon/; + + specialize qw/aom_highbd_sad64x16x3d avx2 neon/; + specialize qw/aom_highbd_sad32x8x3d avx2 neon/; + specialize qw/aom_highbd_sad16x64x3d avx2 neon/; + specialize qw/aom_highbd_sad16x4x3d avx2 neon/; + specialize qw/aom_highbd_sad8x32x3d neon/; + specialize qw/aom_highbd_sad4x16x3d neon/; + } + # + # Avg + # + add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p"; + specialize qw/aom_avg_8x8 sse2 neon/; + + add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p"; + specialize qw/aom_avg_4x4 sse2 neon/; + + add_proto qw/void aom_avg_8x8_quad/, "const uint8_t *s, int p, int x16_idx, int y16_idx, int 
*avg"; + specialize qw/aom_avg_8x8_quad avx2 sse2 neon/; + + add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + specialize qw/aom_minmax_8x8 sse2 neon/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p"; + specialize qw/aom_highbd_avg_8x8 neon/; + add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p"; + specialize qw/aom_highbd_avg_4x4 neon/; + add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + specialize qw/aom_highbd_minmax_8x8 neon/; + } + + add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor"; + specialize qw/aom_int_pro_row avx2 sse2 neon/; + + add_proto qw/void aom_int_pro_col/, "int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor"; + specialize qw/aom_int_pro_col avx2 sse2 neon/; + + add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl"; + specialize qw/aom_vector_var avx2 sse4_1 neon sve/; + # TODO(kyslov@) bring back SSE2 by extending it to 128 block size + #specialize qw/aom_vector_var neon sse2/; + + # + # hamadard transform and satd for implmenting temporal dependency model + # + add_proto qw/void aom_hadamard_4x4/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_4x4 sse2 neon/; + + add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_8x8 sse2 neon/; + + add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_16x16 avx2 sse2 neon/; + + add_proto qw/void aom_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_32x32 avx2 sse2 neon/; + + add_proto qw/void aom_hadamard_lp_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_lp_8x8 sse2 neon/; + + add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_lp_16x16 sse2 avx2 neon/; + + add_proto qw/void aom_hadamard_lp_8x8_dual/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_lp_8x8_dual sse2 avx2 neon/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_highbd_hadamard_8x8 avx2 neon/; + + add_proto qw/void aom_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_highbd_hadamard_16x16 avx2 neon/; + + add_proto qw/void aom_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_highbd_hadamard_32x32 avx2 neon/; + } + add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length"; + specialize qw/aom_satd neon sse2 avx2/; + + add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length"; + specialize qw/aom_satd_lp sse2 avx2 neon/; + + + # + # Structured Similarity (SSIM) + # + add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, 
uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64"; + + if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") { + add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64"; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + } +} # CONFIG_AV1_ENCODER + +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + + # + # Specialty Variance + # + add_proto qw/void aom_get_var_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8"; + specialize qw/aom_get_var_sse_sum_8x8_quad avx2 sse2 neon neon_dotprod/; + + add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16"; + specialize qw/aom_get_var_sse_sum_16x16_dual avx2 sse2 neon neon_dotprod/; + + add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + + specialize qw/aom_mse16x16 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_mse16x8 sse2 neon neon_dotprod/; + specialize qw/aom_mse8x16 sse2 neon neon_dotprod/; + specialize qw/aom_mse8x8 sse2 neon neon_dotprod/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + + specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; + } + + specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/; + specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/; + specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/; + specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/; + } + + # + # + # + add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; + specialize qw/aom_get_mb_ss sse2 neon/; + + # + # Variance / Subpixel Variance / Subpixel Avg Variance + # + add_proto 
qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int sstride, int w, int h"; + specialize qw/aom_mse_wxh_16bit sse2 avx2 neon/; + + add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int w, int h"; + specialize qw/aom_mse_16xh_16bit sse2 avx2 neon/; + + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param"; + } + specialize qw/aom_variance128x128 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance128x64 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance64x128 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance64x64 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance64x32 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance32x64 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance32x32 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance32x16 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance16x32 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance16x16 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance16x8 sse2 avx2 neon neon_dotprod/; + specialize qw/aom_variance8x16 sse2 neon neon_dotprod/; + specialize qw/aom_variance8x8 sse2 neon neon_dotprod/; + specialize qw/aom_variance8x4 sse2 neon neon_dotprod/; + specialize qw/aom_variance4x8 sse2 neon neon_dotprod/; + specialize qw/aom_variance4x4 sse2 neon neon_dotprod/; + + specialize qw/aom_sub_pixel_variance128x128 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance128x64 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x128 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x64 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x32 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x64 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x32 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x16 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x32 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x16 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x8 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x16 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x8 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x4 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x8 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x4 neon sse2 ssse3/; + + specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon sse2 ssse3/; + specialize 
qw/aom_sub_pixel_avg_variance64x32 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x8 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 neon sse2 ssse3/; + + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + specialize qw/aom_variance4x16 neon neon_dotprod sse2/; + specialize qw/aom_variance16x4 neon neon_dotprod sse2 avx2/; + specialize qw/aom_variance8x32 neon neon_dotprod sse2/; + specialize qw/aom_variance32x8 neon neon_dotprod sse2 avx2/; + specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/; + specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/; + + specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 neon ssse3/; + } + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 neon ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 neon ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 neon ssse3/; + + if 
(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param"; + } + } + + specialize qw/aom_highbd_12_variance128x128 sse2 neon sve/; + specialize qw/aom_highbd_12_variance128x64 sse2 neon sve/; + specialize qw/aom_highbd_12_variance64x128 sse2 neon sve/; + specialize qw/aom_highbd_12_variance64x64 sse2 neon sve/; + specialize qw/aom_highbd_12_variance64x32 sse2 neon sve/; + specialize qw/aom_highbd_12_variance32x64 sse2 neon sve/; + specialize qw/aom_highbd_12_variance32x32 sse2 neon sve/; + specialize qw/aom_highbd_12_variance32x16 sse2 neon sve/; + specialize qw/aom_highbd_12_variance16x32 sse2 neon sve/; + specialize qw/aom_highbd_12_variance16x16 sse2 neon sve/; + specialize qw/aom_highbd_12_variance16x8 sse2 neon sve/; + specialize qw/aom_highbd_12_variance8x16 sse2 neon sve/; + specialize qw/aom_highbd_12_variance8x8 sse2 neon sve/; + specialize qw/aom_highbd_12_variance8x4 neon sve/; + specialize qw/aom_highbd_12_variance4x8 neon sve/; + specialize qw/aom_highbd_12_variance4x4 sse4_1 neon sve/; + + specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance8x4 neon sve/; + specialize qw/aom_highbd_10_variance4x8 neon sve/; + specialize qw/aom_highbd_10_variance4x4 sse4_1 neon sve/; + + specialize qw/aom_highbd_8_variance128x128 sse2 neon sve/; + specialize qw/aom_highbd_8_variance128x64 sse2 neon sve/; + specialize qw/aom_highbd_8_variance64x128 sse2 neon sve/; + specialize qw/aom_highbd_8_variance64x64 sse2 neon sve/; + specialize qw/aom_highbd_8_variance64x32 sse2 neon sve/; + specialize qw/aom_highbd_8_variance32x64 sse2 neon sve/; + specialize qw/aom_highbd_8_variance32x32 sse2 neon sve/; + specialize qw/aom_highbd_8_variance32x16 sse2 neon sve/; + specialize qw/aom_highbd_8_variance16x32 sse2 neon sve/; + specialize 
qw/aom_highbd_8_variance16x16 sse2 neon sve/; + specialize qw/aom_highbd_8_variance16x8 sse2 neon sve/; + specialize qw/aom_highbd_8_variance8x16 sse2 neon sve/; + specialize qw/aom_highbd_8_variance8x8 sse2 neon sve/; + specialize qw/aom_highbd_8_variance8x4 neon sve/; + specialize qw/aom_highbd_8_variance4x8 neon sve/; + specialize qw/aom_highbd_8_variance4x4 sse4_1 neon sve/; + + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + foreach $bd (8, 10, 12) { + my $avx2 = ($bd == 10) ? "avx2" : ""; + specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance16x4" , qw/neon sve/; + specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance4x16" , qw/neon sve/; + } + } + + specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance4x8 neon/; + specialize qw/aom_highbd_12_sub_pixel_variance4x4 sse4_1 neon/; + + specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance4x8 neon/; + specialize qw/aom_highbd_10_sub_pixel_variance4x4 sse4_1 neon/; + + specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2 neon/; + 
specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance4x8 neon/; + specialize qw/aom_highbd_8_sub_pixel_variance4x4 sse4_1 neon/; + + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + foreach $bd (8, 10, 12) { + specialize "aom_highbd_${bd}_sub_pixel_variance64x16" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_variance32x8" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_variance16x64" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_variance16x4" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_variance8x32" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_variance4x16" , qw/neon/; + } + } + + specialize qw/aom_highbd_12_sub_pixel_avg_variance128x128 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance128x64 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x128 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance4x8 neon/; + specialize qw/aom_highbd_12_sub_pixel_avg_variance4x4 sse4_1 neon/; + + specialize qw/aom_highbd_10_sub_pixel_avg_variance128x128 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance128x64 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x128 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance4x8 neon/; + specialize qw/aom_highbd_10_sub_pixel_avg_variance4x4 sse4_1 neon/; + + specialize qw/aom_highbd_8_sub_pixel_avg_variance128x128 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance128x64 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x128 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/; + 
specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance4x8 neon/; + specialize qw/aom_highbd_8_sub_pixel_avg_variance4x4 sse4_1 neon/; + + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + foreach $bd (8, 10, 12) { + specialize "aom_highbd_${bd}_sub_pixel_avg_variance64x16" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance32x8" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x64" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x4" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance8x32" , qw/sse2 neon/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance4x16" , qw/neon/; + } + } + + foreach $bd (8, 10, 12) { + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x128", qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x64" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x128" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x64" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x32" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x64" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x32" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x16" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x32" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x16" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x8" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x16" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x8" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x4" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x8" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x4" , qw/neon/; + } + + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + foreach $bd (8, 10, 12) { + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x16", qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x8" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x64", qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x4" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x32" , qw/neon/; + specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x16" , qw/neon/; + } + } + } + # + # Masked Variance / Masked Subpixel Variance + # + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t 
*second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; + specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd ("_8_", "_10_", "_12_") { + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; + specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/; + } + } + } + + # + # OBMC Variance / OBMC Subpixel Variance + # + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2 neon/; + specialize "aom_obmc_sub_pixel_variance${w}x${h}", qw/sse4_1 neon/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd ("_8_", "_10_", "_12_") { + foreach (@encoder_block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1 neon/; + specialize "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", qw/neon/; + } + } + } + } + + # + # Comp Avg + # + add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + specialize qw/aom_comp_avg_pred avx2 neon/; + + add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param"; + specialize qw/aom_dist_wtd_comp_avg_pred ssse3 neon/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + specialize qw/aom_highbd_comp_avg_pred neon/; + + add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param"; + specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2 neon/; + + add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h"; + specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon sve/; + } + + add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + specialize qw/aom_comp_mask_pred ssse3 avx2 neon/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void 
aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + specialize qw/aom_highbd_comp_mask_pred sse2 avx2 neon/; + } + + # Flow estimation library + if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { + add_proto qw/double av1_compute_cross_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2"; + specialize qw/av1_compute_cross_correlation sse4_1 avx2/; + + add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v"; + specialize qw/aom_compute_flow_at_point sse4_1 neon/; + } + +} # CONFIG_AV1_ENCODER + +1; diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h new file mode 100644 index 0000000000..00686ac388 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_filter.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_AOM_FILTER_H_ +#define AOM_AOM_DSP_AOM_FILTER_H_ + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 + +#define SUBPEL_BITS 4 +#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) +#define SUBPEL_SHIFTS (1 << SUBPEL_BITS) +#define SUBPEL_TAPS 8 + +#define SCALE_SUBPEL_BITS 10 +#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS) +#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1) +#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS) +#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2) + +#define RS_SUBPEL_BITS 6 +#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1) +#define RS_SCALE_SUBPEL_BITS 14 +#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1) +#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS) +#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1)) + +typedef int16_t InterpKernel[SUBPEL_TAPS]; + +#define BIL_SUBPEL_BITS 3 +#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS) + +// 2 tap bilinear filters +static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_AOM_FILTER_H_ diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h new file mode 100644 index 0000000000..69da8f21b4 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_simd.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_AOM_SIMD_H_ +#define AOM_AOM_DSP_AOM_SIMD_H_ + +#include + +#if defined(_WIN32) +#include +#endif + +#include "config/aom_config.h" + +#include "aom_dsp/aom_simd_inline.h" + +#define SIMD_CHECK 1 // Sanity checks in C equivalents + +// VS compiling for 32 bit targets does not support vector types in +// structs as arguments, which makes the v256 type of the intrinsics +// hard to support, so optimizations for this target are disabled. +#if HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)) +#include "simd/v256_intrinsics_x86.h" +#else +#include "simd/v256_intrinsics.h" +#endif + +#endif // AOM_AOM_DSP_AOM_SIMD_H_ diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h new file mode 100644 index 0000000000..b4b1b35637 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_simd_inline.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_ +#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_ + +#include "aom/aom_integer.h" + +#ifndef SIMD_INLINE +#define SIMD_INLINE static AOM_FORCE_INLINE +#endif + +#define SIMD_CLAMP(value, min, max) \ + ((value) > (max) ? (max) : (value) < (min) ? (min) : (value)) + +#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_ diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c new file mode 100644 index 0000000000..7441108b01 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + int16x4_t sum; + + sum = vmul_lane_s16(s0, filter_lo, 0); + sum = vmla_lane_s16(sum, s1, filter_lo, 1); + sum = vmla_lane_s16(sum, s2, filter_lo, 2); + sum = vmla_lane_s16(sum, s5, filter_hi, 1); + sum = vmla_lane_s16(sum, s6, filter_hi, 2); + sum = vmla_lane_s16(sum, s7, filter_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + int16x8_t sum; + + sum = vmulq_lane_s16(s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const int16x8_t filter = vld1q_s16(filter_x); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)x_step_q4; + (void)filter_y; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1); + + if (h == 4) { + uint8x8_t t0, t1, t2, t3, d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + src += 7; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + d01 = 
vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + transpose_elems_inplace_u8_4x4(&d01, &d23); + + store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w != 0); + } else { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + + if (w == 4) { + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, + &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + + transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3); + + store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0); + store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1); + store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2); + store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3); + + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } else { + uint8x8_t d4, d5, d6, d7; + int16x8_t s11, s12, s13, s14; + int width; + const uint8_t *s; + uint8_t *d; + + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d = dst; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + d6 = convolve8_8(s6, s7, 
s8, s9, s10, s11, s12, s13, filter); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); + + transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } +} + +void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const int16x8_t filter = vld1q_s16(filter_y); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)filter_x; + (void)x_step_q4; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + + src += 7 * src_stride; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int height; + const uint8_t *s; + uint8_t *d; + + do { + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + height = h; + s = src + 7 * src_stride; + d = dst; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + d3 = convolve8_8(s3, 
s4, s5, s6, s7, s8, s9, s10, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c new file mode 100644 index 0000000000..ac0a6efd00 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. */ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, + const int8x8_t filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x2_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[2]; + int32x4_t sum; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); + sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1); + + /* Further narrowing and packing is performed by the caller. 
*/ + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, + const int8x8_t filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); + sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1); + /* Second 4 output values. */ + sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0); + sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)x_step_q4; + (void)filter_y; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1); + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 
8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + +static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int32x4_t correction, + const int8x8_t filter) { + /* Sample range-clamping and permutation are performed by the caller. */ + int32x4_t sum; + + /* Accumulate dot product into 'correction' to account for range clamp. */ + sum = vdotq_lane_s32(correction, samples_lo, filter, 0); + sum = vdotq_lane_s32(sum, samples_hi, filter, 1); + + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int32x4_t correction, + const int8x8_t filter) { + /* Sample range-clamping and permutation are performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Accumulate dot product into 'correction' to account for range clamp. */ + /* First 4 output values. */ + sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1); + /* Second 4 output values. */ + sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1); + + /* Narrow and re-pack. 
*/ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)filter_x; + (void)x_step_q4; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filter); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filter); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filter); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c new file mode 100644 index 0000000000..c314c0a192 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. 
*/ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, + const int8x8_t filter, + const uint8x16x2_t permute_tbl) { + uint8x16_t permuted_samples[2]; + int32x4_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + + sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1); + + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, + const int8x8_t filter, + const uint8x16x3_t permute_tbl) { + uint8x16_t permuted_samples[3]; + int32x4_t sum0, sum1; + int16x8_t sum; + + /* Permute samples ready for dot product. */ + /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */ + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */ + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1); + + /* Narrow and re-pack. 
*/ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x)); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)x_step_q4; + (void)filter_y; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1); + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filter, perm_tbl); + t1 = convolve8_4_usdot(s1, filter, perm_tbl); + t2 = convolve8_4_usdot(s2, filter, perm_tbl); + t3 = convolve8_4_usdot(s3, filter, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filter, perm_tbl); + d1 = convolve8_8_usdot(s1, filter, perm_tbl); + d2 = convolve8_8_usdot(s2, filter, perm_tbl); + d3 = convolve8_8_usdot(s3, filter, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + } +} + +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. 
+ */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filter) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum; + + sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filter, 1); + + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filter) { + /* Sample permutation is performed by the caller. */ + int32x4_t sum0, sum1; + int16x8_t sum; + + /* First 4 output values. */ + sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1); + /* Second 4 output values. */ + sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1); + + /* Narrow and re-pack. */ + sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + (void)filter_x; + (void)x_step_q4; + (void)y_step_q4; + + src -= ((SUBPEL_TAPS / 2) - 1) * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
*/ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filter); + d1 = convolve8_4_usdot_partial(s1234, s5678, filter); + d2 = convolve8_4_usdot_partial(s2345, s6789, filter); + d3 = convolve8_4_usdot_partial(s3456, s78910, filter); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filter); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filter); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filter); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c new file mode 100644 index 0000000000..325d6f29ff --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { + const uint8_t *src1; + uint8_t *dst1; + int y; + + if (!(w & 0x0F)) { + for (y = 0; y < h; ++y) { + src1 = src; + dst1 = dst; + for (int x = 0; x < (w >> 4); ++x) { + vst1q_u8(dst1, vld1q_u8(src1)); + src1 += 16; + dst1 += 16; + } + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x07)) { + for (y = 0; y < h; ++y) { + vst1_u8(dst, vld1_u8(src)); + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x03)) { + for (y = 0; y < h; ++y) { + memcpy(dst, src, sizeof(uint32_t)); + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x01)) { + for (y = 0; y < h; ++y) { + memcpy(dst, src, sizeof(uint16_t)); + src += src_stride; + dst += dst_stride; + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, int w, + int h) { + if (w < 8) { // copy4 + uint16x4_t s0, s1; + do { + s0 = vld1_u16(src); + src += src_stride; + s1 = vld1_u16(src); + src += src_stride; + + vst1_u16(dst, s0); + dst += dst_stride; + vst1_u16(dst, s1); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 8) { // copy8 + uint16x8_t s0, s1; + do { + s0 = vld1q_u16(src); + src += src_stride; + s1 = vld1q_u16(src); + src += src_stride; + + vst1q_u16(dst, s0); + dst += dst_stride; + vst1q_u16(dst, s1); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w < 32) { // copy16 + uint16x8_t s0, s1, s2, s3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + src += src_stride; + s2 = vld1q_u16(src); + s3 = vld1q_u16(src + 8); + src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + dst += dst_stride; + vst1q_u16(dst, s2); + vst1q_u16(dst + 8, s3); + dst += dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 32) { // copy32 + uint16x8_t s0, s1, s2, s3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); + dst += dst_stride; + } while (--h != 0); + } else { // copy64 + uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + const uint16_t *s = src; + uint16_t *d = dst; + int width = w; + do { + s0 = vld1q_u16(s); + s1 = vld1q_u16(s + 8); + s2 = vld1q_u16(s + 16); + s3 = vld1q_u16(s + 24); + 
s4 = vld1q_u16(s + 32); + s5 = vld1q_u16(s + 40); + s6 = vld1q_u16(s + 48); + s7 = vld1q_u16(s + 56); + + vst1q_u16(d, s0); + vst1q_u16(d + 8, s1); + vst1q_u16(d + 16, s2); + vst1q_u16(d + 24, s3); + vst1q_u16(d + 32, s4); + vst1q_u16(d + 40, s5); + vst1q_u16(d + 48, s6); + vst1q_u16(d + 56, s7); + s += 64; + d += 64; + width -= 64; + } while (width > 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } +} + +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c new file mode 100644 index 0000000000..2e79b2ef69 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/avg_neon.c @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" + +unsigned int aom_avg_4x4_neon(const uint8_t *p, int stride) { + const uint8x8_t s0 = load_unaligned_u8(p, stride); + const uint8x8_t s1 = load_unaligned_u8(p + 2 * stride, stride); + + const uint32_t sum = horizontal_add_u16x8(vaddl_u8(s0, s1)); + return (sum + (1 << 3)) >> 4; +} + +unsigned int aom_avg_8x8_neon(const uint8_t *p, int stride) { + uint8x8_t s0 = vld1_u8(p); + p += stride; + uint8x8_t s1 = vld1_u8(p); + p += stride; + uint16x8_t acc = vaddl_u8(s0, s1); + + int i = 0; + do { + const uint8x8_t si = vld1_u8(p); + p += stride; + acc = vaddw_u8(acc, si); + } while (++i < 6); + + const uint32_t sum = horizontal_add_u16x8(acc); + return (sum + (1 << 5)) >> 6; +} + +void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx, + int *avg) { + avg[0] = aom_avg_8x8_neon(s + y16_idx * p + x16_idx, p); + avg[1] = aom_avg_8x8_neon(s + y16_idx * p + (x16_idx + 8), p); + avg[2] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + x16_idx, p); + avg[3] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + (x16_idx + 8), p); +} + +int aom_satd_lp_neon(const int16_t *coeff, int length) { + int16x8_t s0 = vld1q_s16(coeff); + int16x8_t s1 = vld1q_s16(coeff + 8); + + int16x8_t abs0 = vabsq_s16(s0); + int16x8_t abs1 = vabsq_s16(s1); + + int32x4_t acc0 = vpaddlq_s16(abs0); + int32x4_t acc1 = vpaddlq_s16(abs1); + + length -= 16; + coeff += 16; + + while (length != 0) { + s0 = vld1q_s16(coeff); + s1 = vld1q_s16(coeff + 8); + + abs0 = vabsq_s16(s0); + abs1 = vabsq_s16(s1); + + acc0 = vpadalq_s16(acc0, abs0); + acc1 = vpadalq_s16(acc1, abs1); + + length -= 16; + coeff += 16; + } + + int32x4_t accum = vaddq_s32(acc0, acc1); + return horizontal_add_s32x4(accum); +} + +void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, + const int ref_stride, const int width, + const int height, int norm_factor) { + assert(width % 16 == 0); + assert(height % 4 == 0); + + const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); + uint16x8_t sum_lo[2], sum_hi[2]; + + int w = 0; + do { + const uint8_t *r = ref + w; + uint8x16_t r0 = vld1q_u8(r + 0 * ref_stride); + uint8x16_t r1 = vld1q_u8(r + 1 * ref_stride); + uint8x16_t r2 = vld1q_u8(r + 2 * ref_stride); + uint8x16_t r3 
= vld1q_u8(r + 3 * ref_stride); + + sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + r += 4 * ref_stride; + + for (int h = height - 4; h != 0; h -= 4) { + r0 = vld1q_u8(r + 0 * ref_stride); + r1 = vld1q_u8(r + 1 * ref_stride); + r2 = vld1q_u8(r + 2 * ref_stride); + r3 = vld1q_u8(r + 3 * ref_stride); + + uint16x8_t tmp0_lo = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + uint16x8_t tmp0_hi = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + uint16x8_t tmp1_lo = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + uint16x8_t tmp1_hi = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); + + sum_lo[0] = vaddq_u16(sum_lo[0], tmp0_lo); + sum_hi[0] = vaddq_u16(sum_hi[0], tmp0_hi); + sum_lo[1] = vaddq_u16(sum_lo[1], tmp1_lo); + sum_hi[1] = vaddq_u16(sum_hi[1], tmp1_hi); + + r += 4 * ref_stride; + } + + sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); + sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); + + const int16x8_t avg0 = + vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); + const int16x8_t avg1 = + vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); + + vst1q_s16(hbuf + w, avg0); + vst1q_s16(hbuf + w + 8, avg1); + w += 16; + } while (w < width); +} + +void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref, + const int ref_stride, const int width, + const int height, int norm_factor) { + assert(width % 16 == 0); + assert(height % 4 == 0); + + const int16x4_t neg_norm_factor = vdup_n_s16(-norm_factor); + uint16x8_t sum[4]; + + int h = 0; + do { + sum[0] = vpaddlq_u8(vld1q_u8(ref + 0 * ref_stride)); + sum[1] = vpaddlq_u8(vld1q_u8(ref + 1 * ref_stride)); + sum[2] = vpaddlq_u8(vld1q_u8(ref + 2 * ref_stride)); + sum[3] = vpaddlq_u8(vld1q_u8(ref + 3 * ref_stride)); + + for (int w = 16; w < width; w += 16) { + sum[0] = vpadalq_u8(sum[0], vld1q_u8(ref + 0 * ref_stride + w)); + sum[1] = vpadalq_u8(sum[1], vld1q_u8(ref + 1 * ref_stride + w)); + sum[2] = vpadalq_u8(sum[2], vld1q_u8(ref + 2 * ref_stride + w)); + sum[3] = vpadalq_u8(sum[3], vld1q_u8(ref + 3 * ref_stride + w)); + } + + uint16x4_t sum_4d = vmovn_u32(horizontal_add_4d_u16x8(sum)); + int16x4_t avg = vshl_s16(vreinterpret_s16_u16(sum_4d), neg_norm_factor); + vst1_s16(vbuf + h, avg); + + ref += 4 * ref_stride; + h += 4; + } while (h < height); +} + +// coeff: 20 bits, dynamic range [-524287, 524287]. +// length: value range {16, 32, 64, 128, 256, 512, 1024}. 
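+// The absolute values are accumulated in two independent int32x4_t
+// accumulators using vabaq_s32 against a zero vector (acc += |x - 0|); the
+// two accumulators are only combined for the final horizontal add, which
+// helps keep the accumulation dependency chains short.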
+int aom_satd_neon(const tran_low_t *coeff, int length) { + const int32x4_t zero = vdupq_n_s32(0); + + int32x4_t s0 = vld1q_s32(&coeff[0]); + int32x4_t s1 = vld1q_s32(&coeff[4]); + int32x4_t s2 = vld1q_s32(&coeff[8]); + int32x4_t s3 = vld1q_s32(&coeff[12]); + + int32x4_t accum0 = vabsq_s32(s0); + int32x4_t accum1 = vabsq_s32(s2); + accum0 = vabaq_s32(accum0, s1, zero); + accum1 = vabaq_s32(accum1, s3, zero); + + length -= 16; + coeff += 16; + + while (length != 0) { + s0 = vld1q_s32(&coeff[0]); + s1 = vld1q_s32(&coeff[4]); + s2 = vld1q_s32(&coeff[8]); + s3 = vld1q_s32(&coeff[12]); + + accum0 = vabaq_s32(accum0, s0, zero); + accum1 = vabaq_s32(accum1, s1, zero); + accum0 = vabaq_s32(accum0, s2, zero); + accum1 = vabaq_s32(accum1, s3, zero); + + length -= 16; + coeff += 16; + } + + // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024] + return horizontal_add_s32x4(vaddq_s32(accum0, accum1)); +} + +int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) { + assert(bwl >= 2 && bwl <= 5); + int width = 4 << bwl; + + int16x8_t r = vld1q_s16(ref); + int16x8_t s = vld1q_s16(src); + + // diff: dynamic range [-510, 510] 10 (signed) bits. + int16x8_t diff = vsubq_s16(r, s); + // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits. + int16x8_t v_mean = diff; + // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits. + int32x4_t v_sse[2]; + v_sse[0] = vmull_s16(vget_low_s16(diff), vget_low_s16(diff)); + v_sse[1] = vmull_s16(vget_high_s16(diff), vget_high_s16(diff)); + + ref += 8; + src += 8; + width -= 8; + + do { + r = vld1q_s16(ref); + s = vld1q_s16(src); + + diff = vsubq_s16(r, s); + v_mean = vaddq_s16(v_mean, diff); + + v_sse[0] = vmlal_s16(v_sse[0], vget_low_s16(diff), vget_low_s16(diff)); + v_sse[1] = vmlal_s16(v_sse[1], vget_high_s16(diff), vget_high_s16(diff)); + + ref += 8; + src += 8; + width -= 8; + } while (width != 0); + + // Dynamic range [0, 65280], 16 (unsigned) bits. + const uint32_t mean_abs = abs(horizontal_add_s16x8(v_mean)); + const int32_t sse = horizontal_add_s32x4(vaddq_s32(v_sse[0], v_sse[1])); + + // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits. + return sse - ((mean_abs * mean_abs) >> (bwl + 2)); +} + +void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min, int *max) { + // Load and concatenate. + const uint8x16_t a01 = load_u8_8x2(a + 0 * a_stride, a_stride); + const uint8x16_t a23 = load_u8_8x2(a + 2 * a_stride, a_stride); + const uint8x16_t a45 = load_u8_8x2(a + 4 * a_stride, a_stride); + const uint8x16_t a67 = load_u8_8x2(a + 6 * a_stride, a_stride); + + const uint8x16_t b01 = load_u8_8x2(b + 0 * b_stride, b_stride); + const uint8x16_t b23 = load_u8_8x2(b + 2 * b_stride, b_stride); + const uint8x16_t b45 = load_u8_8x2(b + 4 * b_stride, b_stride); + const uint8x16_t b67 = load_u8_8x2(b + 6 * b_stride, b_stride); + + // Absolute difference. + const uint8x16_t ab01_diff = vabdq_u8(a01, b01); + const uint8x16_t ab23_diff = vabdq_u8(a23, b23); + const uint8x16_t ab45_diff = vabdq_u8(a45, b45); + const uint8x16_t ab67_diff = vabdq_u8(a67, b67); + + // Max values between the Q vectors. 
+ const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff); + const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff); + const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff); + const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff); + + const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); + const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); + +#if AOM_ARCH_AARCH64 + *min = *max = 0; // Clear high bits + *((uint8_t *)max) = vmaxvq_u8(ab07_max); + *((uint8_t *)min) = vminvq_u8(ab07_min); +#else + // Split into 64-bit vectors and execute pairwise min/max. + uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); + uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); + + // Enough runs of vpmax/min propagate the max/min values to every position. + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u8((uint8_t *)max, ab_max, 0); + vst1_lane_u8((uint8_t *)min, ab_min, 0); +#endif +} diff --git a/third_party/aom/aom_dsp/arm/avg_pred_neon.c b/third_party/aom/aom_dsp/arm/avg_pred_neon.c new file mode 100644 index 0000000000..b17f7fca7f --- /dev/null +++ b/third_party/aom/aom_dsp/arm/avg_pred_neon.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/dist_wtd_avg_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/blend.h" + +void aom_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + if (width > 8) { + do { + const uint8_t *pred_ptr = pred; + const uint8_t *ref_ptr = ref; + uint8_t *comp_pred_ptr = comp_pred; + int w = width; + + do { + const uint8x16_t p = vld1q_u8(pred_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + const uint8x16_t avg = vrhaddq_u8(p, r); + + vst1q_u8(comp_pred_ptr, avg); + + ref_ptr += 16; + pred_ptr += 16; + comp_pred_ptr += 16; + w -= 16; + } while (w != 0); + + ref += ref_stride; + pred += width; + comp_pred += width; + } while (--height != 0); + } else if (width == 8) { + int h = height / 2; + + do { + const uint8x16_t p = vld1q_u8(pred); + const uint8x16_t r = load_u8_8x2(ref, ref_stride); + const uint8x16_t avg = vrhaddq_u8(p, r); + + vst1q_u8(comp_pred, avg); + + ref += 2 * ref_stride; + pred += 16; + comp_pred += 16; + } while (--h != 0); + } else { + int h = height / 4; + assert(width == 4); + + do { + const uint8x16_t p = vld1q_u8(pred); + const uint8x16_t r = load_unaligned_u8q(ref, ref_stride); + const uint8x16_t avg = vrhaddq_u8(p, r); + + vst1q_u8(comp_pred, avg); + + ref += 4 * ref_stride; + pred += 16; + comp_pred += 16; + } while (--h != 0); + } +} + +void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + + if (width > 8) { + do { + const uint8_t *pred_ptr = pred; + const uint8_t *ref_ptr = ref; + uint8_t *comp_pred_ptr = comp_pred; + int w = width; + + do { + const uint8x16_t p = vld1q_u8(pred_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t wtd_avg = + dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset); + + vst1q_u8(comp_pred_ptr, wtd_avg); + + ref_ptr += 16; + pred_ptr += 16; + comp_pred_ptr += 16; + w -= 16; + } while (w != 0); + + ref += ref_stride; + pred += width; + comp_pred += width; + } while (--height != 0); + } else if (width == 8) { + int h = height / 2; + + do { + const uint8x16_t p = vld1q_u8(pred); + const uint8x16_t r = load_u8_8x2(ref, ref_stride); + + const uint8x16_t wtd_avg = + dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset); + + vst1q_u8(comp_pred, wtd_avg); + + ref += 2 * ref_stride; + pred += 16; + comp_pred += 16; + } while (--h != 0); + } else { + int h = height / 2; + assert(width == 4); + + do { + const uint8x8_t p = vld1_u8(pred); + const uint8x8_t r = load_unaligned_u8_4x2(ref, ref_stride); + + const uint8x8_t wtd_avg = dist_wtd_avg_u8x8(r, p, vget_low_u8(fwd_offset), + vget_low_u8(bck_offset)); + + vst1_u8(comp_pred, wtd_avg); + + ref += 2 * ref_stride; + pred += 8; + comp_pred += 8; + } while (--h != 0); + } +} + +void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int src_stride0 = invert_mask ? width : ref_stride; + const int src_stride1 = invert_mask ? 
ref_stride : width; + + if (width > 8) { + do { + const uint8_t *src0_ptr = src0; + const uint8_t *src1_ptr = src1; + const uint8_t *mask_ptr = mask; + uint8_t *comp_pred_ptr = comp_pred; + int w = width; + + do { + const uint8x16_t s0 = vld1q_u8(src0_ptr); + const uint8x16_t s1 = vld1q_u8(src1_ptr); + const uint8x16_t m0 = vld1q_u8(mask_ptr); + + uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, s0, s1); + + vst1q_u8(comp_pred_ptr, blend_u8); + + src0_ptr += 16; + src1_ptr += 16; + mask_ptr += 16; + comp_pred_ptr += 16; + w -= 16; + } while (w != 0); + + src0 += src_stride0; + src1 += src_stride1; + mask += mask_stride; + comp_pred += width; + } while (--height != 0); + } else if (width == 8) { + do { + const uint8x8_t s0 = vld1_u8(src0); + const uint8x8_t s1 = vld1_u8(src1); + const uint8x8_t m0 = vld1_u8(mask); + + uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1); + + vst1_u8(comp_pred, blend_u8); + + src0 += src_stride0; + src1 += src_stride1; + mask += mask_stride; + comp_pred += 8; + } while (--height != 0); + } else { + int h = height / 2; + assert(width == 4); + + do { + const uint8x8_t s0 = load_unaligned_u8(src0, src_stride0); + const uint8x8_t s1 = load_unaligned_u8(src1, src_stride1); + const uint8x8_t m0 = load_unaligned_u8(mask, mask_stride); + + uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1); + + vst1_u8(comp_pred, blend_u8); + + src0 += 2 * src_stride0; + src1 += 2 * src_stride1; + mask += 2 * mask_stride; + comp_pred += 8; + } while (--h != 0); + } +} diff --git a/third_party/aom/aom_dsp/arm/avg_sve.c b/third_party/aom/aom_dsp/arm/avg_sve.c new file mode 100644 index 0000000000..bbf5a9447c --- /dev/null +++ b/third_party/aom/aom_dsp/arm/avg_sve.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" + +int aom_vector_var_sve(const int16_t *ref, const int16_t *src, int bwl) { + assert(bwl >= 2 && bwl <= 5); + int width = 4 << bwl; + + int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int16x8_t v_mean[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + + do { + int16x8_t r0 = vld1q_s16(ref); + int16x8_t s0 = vld1q_s16(src); + + // diff: dynamic range [-510, 510] 10 (signed) bits. + int16x8_t diff0 = vsubq_s16(r0, s0); + // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits. + v_mean[0] = vaddq_s16(v_mean[0], diff0); + + // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits. + sse_s64[0] = aom_sdotq_s16(sse_s64[0], diff0, diff0); + + int16x8_t r1 = vld1q_s16(ref + 8); + int16x8_t s1 = vld1q_s16(src + 8); + + // diff: dynamic range [-510, 510] 10 (signed) bits. + int16x8_t diff1 = vsubq_s16(r1, s1); + // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits. + v_mean[1] = vaddq_s16(v_mean[1], diff1); + + // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits. 
+ sse_s64[1] = aom_sdotq_s16(sse_s64[1], diff1, diff1); + + ref += 16; + src += 16; + width -= 16; + } while (width != 0); + + // Dynamic range [0, 65280], 16 (unsigned) bits. + const uint32_t mean_abs = abs(vaddlvq_s16(vaddq_s16(v_mean[0], v_mean[1]))); + const int64_t sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1])); + + // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits. + return (int)(sse - ((mean_abs * mean_abs) >> (bwl + 2))); +} diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c new file mode 100644 index 0000000000..1bc3b80310 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/blend.h" + +uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b, + uint16x8_t round_offset) { + const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); + + uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a)); + uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a)); + + blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b)); + blend_u32_hi = + vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b)); + + uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS); + uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS); + + uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi); + + res = vqsubq_u16(res, round_offset); + + return vqrshrn_n_u16(res, + 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); +} + +void aom_lowbd_blend_a64_d16_mask_neon( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + (void)conv_params; + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const uint16x8_t offset_vec = vdupq_n_u16(round_offset); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + if (w >= 8) { + do { + int i = 0; + do { + uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec); + + vst1_u8(dst + i, blend); + i += 8; + } while (i < w); + + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + 
dst += dst_stride; + } while (--h != 0); + } else { + do { + uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + mask += 2 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } else if (subw == 1 && subh == 1) { + if (w >= 8) { + do { + int i = 0; + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i); + uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8); + uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8); + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint16x8_t m_avg = + vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); + + uint8x8_t blend = + alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); + + vst1_u8(dst + i, blend); + i += 8; + } while (i < w); + + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); + uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); + uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); + uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + mask += 4 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } else if (subw == 1 && subh == 0) { + if (w >= 8) { + do { + int i = 0; + do { + uint8x8_t m0 = vld1_u8(mask + 2 * i); + uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); + uint8x8_t blend = + alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); + + vst1_u8(dst + i, blend); + i += 8; + } while (i < w); + + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); + uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + mask += 2 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } else { + if (w >= 8) { + do { + int i = 0; + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); + uint8x8_t blend = + alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); + + vst1_u8(dst + i, blend); + i += 8; + } while (i < w); + + mask += 2 * mask_stride; + src0 += src0_stride; + src1 
+= src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0_2 = + load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); + uint8x8_t m1_3 = + load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); + uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + mask += 4 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } +} + +void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if ((subw | subh) == 0) { + if (w > 8) { + do { + int i = 0; + do { + uint8x16_t m0 = vld1q_u8(mask + i); + uint8x16_t s0 = vld1q_u8(src0 + i); + uint8x16_t s1 = vld1q_u8(src1 + i); + + uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1); + + vst1q_u8(dst + i, blend); + i += 16; + } while (i < w); + + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8x8_t m0 = vld1_u8(mask); + uint8x8_t s0 = vld1_u8(src0); + uint8x8_t s1 = vld1_u8(src1); + + uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); + + vst1_u8(dst, blend); + + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride); + uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); + + uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + mask += 2 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } else if ((subw & subh) == 1) { + if (w > 8) { + do { + int i = 0; + do { + uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i); + uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i); + uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16); + uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16); + uint8x16_t s0 = vld1q_u8(src0 + i); + uint8x16_t s1 = vld1q_u8(src1 + i); + + uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3); + uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1); + + vst1q_u8(dst + i, blend); + + i += 16; + } while (i < w); + + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); + uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8); + uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8); + uint8x8_t s0 = vld1_u8(src0); + uint8x8_t s1 = vld1_u8(src1); + + uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3); + uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); + + vst1_u8(dst, blend); + + mask += 2 * mask_stride; + src0 += src0_stride; + 
src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); + uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); + uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); + uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); + + uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3); + uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + mask += 4 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } else if (subw == 1 && subh == 0) { + if (w > 8) { + do { + int i = 0; + + do { + uint8x16_t m0 = vld1q_u8(mask + 2 * i); + uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16); + uint8x16_t s0 = vld1q_u8(src0 + i); + uint8x16_t s1 = vld1q_u8(src1 + i); + + uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1); + uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1); + + vst1q_u8(dst + i, blend); + + i += 16; + } while (i < w); + + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8x8_t m0 = vld1_u8(mask); + uint8x8_t m1 = vld1_u8(mask + 8); + uint8x8_t s0 = vld1_u8(src0); + uint8x8_t s1 = vld1_u8(src1); + + uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1); + uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); + + vst1_u8(dst, blend); + + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); + uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); + + uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1); + uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + mask += 2 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } else { + if (w > 8) { + do { + int i = 0; + do { + uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i); + uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i); + uint8x16_t s0 = vld1q_u8(src0 + i); + uint8x16_t s1 = vld1q_u8(src1 + i); + + uint8x16_t m_avg = avg_blend_u8x16(m0, m1); + uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1); + + vst1q_u8(dst + i, blend); + + i += 16; + } while (i < w); + + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); + uint8x8_t s0 = vld1_u8(src0); + uint8x8_t s1 = vld1_u8(src1); + + uint8x8_t m_avg = avg_blend_u8x8(m0, m1); + uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); + + vst1_u8(dst, blend); + + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0_2 = + load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); + uint8x8_t m1_3 = + load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); + uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); + + uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3); + uint8x8_t blend = 
alpha_blend_a64_u8x8(m_avg, s0, s1); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + mask += 4 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } +} diff --git a/third_party/aom/aom_dsp/arm/blend_neon.h b/third_party/aom/aom_dsp/arm/blend_neon.h new file mode 100644 index 0000000000..c8a03224e4 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/blend_neon.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_ARM_BLEND_NEON_H_ +#define AOM_AOM_DSP_ARM_BLEND_NEON_H_ + +#include + +#include "aom_dsp/blend.h" + +static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a, + uint8x16_t b) { + const uint8x16_t m_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m); + + uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m), vget_low_u8(a)); + uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m), vget_high_u8(a)); + + blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m_inv), vget_low_u8(b)); + blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m_inv), vget_high_u8(b)); + + uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS); + uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS); + + return vcombine_u8(blend_u8_lo, blend_u8_hi); +} + +static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a, + uint8x8_t b) { + const uint8x8_t m_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m); + + uint16x8_t blend_u16 = vmull_u8(m, a); + + blend_u16 = vmlal_u8(blend_u16, m_inv, b); + + return vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a, + uint16x8_t b) { + uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); + + uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(a), vget_low_u16(m)); + uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(a), vget_high_u16(m)); + + blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(b), vget_low_u16(m_inv)); + blend_u32_hi = + vmlal_u16(blend_u32_hi, vget_high_u16(b), vget_high_u16(m_inv)); + + uint16x4_t blend_u16_lo = + vrshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS); + uint16x4_t blend_u16_hi = + vrshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS); + + return vcombine_u16(blend_u16_lo, blend_u16_hi); +} + +static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a, + uint16x4_t b) { + const uint16x4_t m_inv = vsub_u16(vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); + + uint32x4_t blend_u16 = vmull_u16(m, a); + + blend_u16 = vmlal_u16(blend_u16, m_inv, b); + + return vrshrn_n_u32(blend_u16, AOM_BLEND_A64_ROUND_BITS); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) { + return vrhadd_u8(a, b); +} + +static INLINE uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) { + return vrhaddq_u8(a, b); +} + +static INLINE uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) { + return vrshr_n_u8(vpadd_u8(a, 
b), 1); +} + +static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) { +#if AOM_ARCH_AARCH64 + return vrshrq_n_u8(vpaddq_u8(a, b), 1); +#else + uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); + uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b)); + return vrshrq_n_u8(vcombine_u8(sum_pairwise_a, sum_pairwise_b), 1); +#endif // AOM_ARCH_AARCH64 +} + +static INLINE uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b, + uint8x8_t c, uint8x8_t d) { + uint8x8_t a_c = vpadd_u8(a, c); + uint8x8_t b_d = vpadd_u8(b, d); + return vrshr_n_u8(vqadd_u8(a_c, b_d), 2); +} + +static INLINE uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b, + uint8x16_t c, + uint8x16_t d) { +#if AOM_ARCH_AARCH64 + uint8x16_t a_c = vpaddq_u8(a, c); + uint8x16_t b_d = vpaddq_u8(b, d); + return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2); +#else + uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); + uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b)); + uint8x8_t sum_pairwise_c = vpadd_u8(vget_low_u8(c), vget_high_u8(c)); + uint8x8_t sum_pairwise_d = vpadd_u8(vget_low_u8(d), vget_high_u8(d)); + uint8x16_t a_c = vcombine_u8(sum_pairwise_a, sum_pairwise_c); + uint8x16_t b_d = vcombine_u8(sum_pairwise_b, sum_pairwise_d); + return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2); +#endif // AOM_ARCH_AARCH64 +} + +#endif // AOM_AOM_DSP_ARM_BLEND_NEON_H_ diff --git a/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c b/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c new file mode 100644 index 0000000000..f2ada93e95 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride, + int bh, int *x_sum, + int64_t *x2_sum) { + int i = bh; + int32x4_t sum = vdupq_n_s32(0); + int32x4_t sse = vdupq_n_s32(0); + + do { + int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride)); + + sum = vpadalq_s16(sum, d); + + sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d)); + sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d)); + + data += 2 * stride; + i -= 2; + } while (i != 0); + + *x_sum = horizontal_add_s32x4(sum); + *x2_sum = horizontal_long_add_s32x4(sse); +} + +static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride, + int bh, int *x_sum, + int64_t *x2_sum) { + int i = bh; + int32x4_t sum = vdupq_n_s32(0); + int32x4_t sse = vdupq_n_s32(0); + + // Input is 12-bit wide, so we can add up to 127 squared elements in a signed + // 32-bits element. Since we're accumulating into an int32x4_t and the maximum + // value for bh is 32, we don't have to worry about sse overflowing. 
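+ // Concretely, each 8-wide row adds two squared elements to each 32-bit lane
+ // of sse, so bh = 32 rows give at most 64 squared elements per lane, well
+ // under the limit of 127.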
+ + do { + int16x8_t d = vld1q_s16(data); + + sum = vpadalq_s16(sum, d); + + sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d)); + sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d)); + + data += stride; + } while (--i != 0); + + *x_sum = horizontal_add_s32x4(sum); + *x2_sum = horizontal_long_add_s32x4(sse); +} + +static INLINE void get_blk_sse_sum_large_neon(const int16_t *data, int stride, + int bw, int bh, int *x_sum, + int64_t *x2_sum) { + int32x4_t sum = vdupq_n_s32(0); + int64x2_t sse = vdupq_n_s64(0); + + // Input is 12-bit wide, so we can add up to 127 squared elements in a signed + // 32-bits element. Since we're accumulating into an int32x4_t vector that + // means we can process up to (127*4)/bw rows before we need to widen to + // 64 bits. + + int i_limit = (127 * 4) / bw; + int i_tmp = bh > i_limit ? i_limit : bh; + + int i = 0; + do { + int32x4_t sse_s32 = vdupq_n_s32(0); + do { + int j = bw; + const int16_t *data_ptr = data; + do { + int16x8_t d = vld1q_s16(data_ptr); + + sum = vpadalq_s16(sum, d); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(d), vget_low_s16(d)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(d), vget_high_s16(d)); + + data_ptr += 8; + j -= 8; + } while (j != 0); + + data += stride; + i++; + } while (i < i_tmp && i < bh); + + sse = vpadalq_s32(sse, sse_s32); + i_tmp += i_limit; + } while (i < bh); + + *x_sum = horizontal_add_s32x4(sum); + *x2_sum = horizontal_add_s64x2(sse); +} + +void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + if (bw == 4) { + get_blk_sse_sum_4xh_neon(data, stride, bh, x_sum, x2_sum); + } else if (bw == 8) { + get_blk_sse_sum_8xh_neon(data, stride, bh, x_sum, x2_sum); + } else { + assert(bw % 8 == 0); + get_blk_sse_sum_large_neon(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c new file mode 100644 index 0000000000..18bdc5dbfe --- /dev/null +++ b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/mem_neon.h" + +static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride, + int bh, int *x_sum, + int64_t *x2_sum) { + int32x4_t sum = vdupq_n_s32(0); + int64x2_t sse = vdupq_n_s64(0); + + do { + int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride)); + + sum = vpadalq_s16(sum, d); + + sse = aom_sdotq_s16(sse, d, d); + + data += 2 * stride; + bh -= 2; + } while (bh != 0); + + *x_sum = vaddvq_s32(sum); + *x2_sum = vaddvq_s64(sse); +} + +static INLINE void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride, + int bh, int *x_sum, + int64_t *x2_sum) { + int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int16x8_t d0 = vld1q_s16(data); + int16x8_t d1 = vld1q_s16(data + stride); + + sum[0] = vpadalq_s16(sum[0], d0); + sum[1] = vpadalq_s16(sum[1], d1); + + sse[0] = aom_sdotq_s16(sse[0], d0, d0); + sse[1] = aom_sdotq_s16(sse[1], d1, d1); + + data += 2 * stride; + bh -= 2; + } while (bh != 0); + + *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1])); + *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +static INLINE void get_blk_sse_sum_large_sve(const int16_t *data, int stride, + int bw, int bh, int *x_sum, + int64_t *x2_sum) { + int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int j = bw; + const int16_t *data_ptr = data; + do { + int16x8_t d0 = vld1q_s16(data_ptr); + int16x8_t d1 = vld1q_s16(data_ptr + 8); + + sum[0] = vpadalq_s16(sum[0], d0); + sum[1] = vpadalq_s16(sum[1], d1); + + sse[0] = aom_sdotq_s16(sse[0], d0, d0); + sse[1] = aom_sdotq_s16(sse[1], d1, d1); + + data_ptr += 16; + j -= 16; + } while (j != 0); + + data += stride; + } while (--bh != 0); + + *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1])); + *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +void aom_get_blk_sse_sum_sve(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + if (bw == 4) { + get_blk_sse_sum_4xh_sve(data, stride, bh, x_sum, x2_sum); + } else if (bw == 8) { + get_blk_sse_sum_8xh_sve(data, stride, bh, x_sum, x2_sum); + } else { + assert(bw % 16 == 0); + get_blk_sse_sum_large_sve(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h b/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h new file mode 100644 index 0000000000..19c9b04c57 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_ +#define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_ + +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "av1/common/enums.h" + +static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b, + uint8x8_t wta, uint8x8_t wtb) { + uint16x8_t wtd_sum = vmull_u8(a, wta); + + wtd_sum = vmlal_u8(wtd_sum, b, wtb); + + return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS); +} + +static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b, + uint16x4_t wta, uint16x4_t wtb) { + uint32x4_t wtd_sum = vmull_u16(a, wta); + + wtd_sum = vmlal_u16(wtd_sum, b, wtb); + + return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS); +} + +static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b, + uint8x16_t wta, uint8x16_t wtb) { + uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta)); + uint16x8_t wtd_sum_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta)); + + wtd_sum_lo = vmlal_u8(wtd_sum_lo, vget_low_u8(b), vget_low_u8(wtb)); + wtd_sum_hi = vmlal_u8(wtd_sum_hi, vget_high_u8(b), vget_high_u8(wtb)); + + uint8x8_t wtd_avg_lo = vrshrn_n_u16(wtd_sum_lo, DIST_PRECISION_BITS); + uint8x8_t wtd_avg_hi = vrshrn_n_u16(wtd_sum_hi, DIST_PRECISION_BITS); + + return vcombine_u8(wtd_avg_lo, wtd_avg_hi); +} + +static INLINE uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b, + uint16x8_t wta, uint16x8_t wtb) { + uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta)); + uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta)); + + wtd_sum_lo = vmlal_u16(wtd_sum_lo, vget_low_u16(b), vget_low_u16(wtb)); + wtd_sum_hi = vmlal_u16(wtd_sum_hi, vget_high_u16(b), vget_high_u16(wtb)); + + uint16x4_t wtd_avg_lo = vrshrn_n_u32(wtd_sum_lo, DIST_PRECISION_BITS); + uint16x4_t wtd_avg_hi = vrshrn_n_u32(wtd_sum_hi, DIST_PRECISION_BITS); + + return vcombine_u16(wtd_avg_lo, wtd_avg_hi); +} + +#endif // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_ diff --git a/third_party/aom/aom_dsp/arm/dot_sve.h b/third_party/aom/aom_dsp/arm/dot_sve.h new file mode 100644 index 0000000000..cf49f23606 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/dot_sve.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_DOT_SVE_H_ +#define AOM_AOM_DSP_ARM_DOT_SVE_H_ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +// Dot product instructions operating on 16-bit input elements are exclusive to +// the SVE instruction set. However, we can access these instructions from a +// predominantly Neon context by making use of the Neon-SVE bridge intrinsics +// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE +// vector (if it's longer than 128 bits) being "don't care". + +// While sub-optimal on machines that have SVE vector length > 128-bit - as the +// remainder of the vector is unused - this approach is still beneficial when +// compared to a Neon-only solution. 
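+//
+// Typical usage (sketch, with illustrative variable names): accumulating a
+// 64-bit sum of squares of int16 elements, as done in avg_sve.c and
+// blk_sse_sum_sve.c:
+//   int64x2_t acc = vdupq_n_s64(0);
+//   int16x8_t d = vld1q_s16(data);
+//   acc = aom_sdotq_s16(acc, d, d);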
+ +static INLINE uint64x2_t aom_udotq_u16(uint64x2_t acc, uint16x8_t x, + uint16x8_t y) { + return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc), + svset_neonq_u16(svundef_u16(), x), + svset_neonq_u16(svundef_u16(), y))); +} + +static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { + return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), + svset_neonq_s16(svundef_s16(), x), + svset_neonq_s16(svundef_s16(), y))); +} + +#endif // AOM_AOM_DSP_ARM_DOT_SVE_H_ diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c new file mode 100644 index 0000000000..a4d6322f24 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" + +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" + +static void aom_fdct4x4_helper(const int16_t *input, int stride, + int16x4_t *input_0, int16x4_t *input_1, + int16x4_t *input_2, int16x4_t *input_3) { + *input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + *input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + *input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + *input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + // If the very first value != 0, then add 1. + if (input[0] != 0) { + const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); + *input_0 = vadd_s16(*input_0, one); + } + + for (int i = 0; i < 2; ++i) { + const int16x8_t input_01 = vcombine_s16(*input_0, *input_1); + const int16x8_t input_32 = vcombine_s16(*input_3, *input_2); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // (s_0 +/- s_1) * cospi_16_64 + // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. 
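+ // The sums and differences are therefore widened with vaddl_s16/vsubl_s16
+ // so that the multiply by cospi_16_64 and the rounding narrow
+ // (vrshrn_n_s32) are carried out at 32-bit precision.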
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); + const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); + const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int32_t)cospi_16_64); + const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int32_t)cospi_16_64); + + // fdct_round_shift + int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); + int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int32_t)cospi_8_64); + const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int32_t)cospi_24_64); + + const int32x4_t temp3 = + vmlal_n_s16(s_3_cospi_8_64, s_2, (int32_t)cospi_24_64); + const int32x4_t temp4 = + vmlsl_n_s16(s_3_cospi_24_64, s_2, (int32_t)cospi_8_64); + + // fdct_round_shift + int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); + int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); + + // Only transpose the first pass + if (i == 0) { + transpose_elems_inplace_s16_4x4(&out_0, &out_1, &out_2, &out_3); + } + + *input_0 = out_0; + *input_1 = out_1; + *input_2 = out_2; + *input_3 = out_3; + } +} + +void aom_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // input[M * stride] * 16 + int16x4_t input_0, input_1, input_2, input_3; + + aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3); + + // Not quite a rounding shift. Only add 1 despite shifting by 2. + const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(input_0, input_1); + int16x8_t out_23 = vcombine_s16(input_2, input_3); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + store_s16q_to_tran_low(final_output + 0 * 8, out_01); + store_s16q_to_tran_low(final_output + 1 * 8, out_23); +} + +void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output, + int stride) { + // input[M * stride] * 16 + int16x4_t input_0, input_1, input_2, input_3; + + aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3); + + // Not quite a rounding shift. Only add 1 despite shifting by 2. 
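+ // i.e. out = (in + 1) >> 2 rather than the rounded (in + 2) >> 2.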
+ const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(input_0, input_1); + int16x8_t out_23 = vcombine_s16(input_2, input_3); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + vst1q_s16(final_output + 0 * 8, out_01); + vst1q_s16(final_output + 1 * 8, out_23); +} + +void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (int i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, 
h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t 
r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from aom_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c new file mode 100644 index 0000000000..d0f59227db --- /dev/null +++ b/third_party/aom/aom_dsp/arm/hadamard_neon.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" + +static INLINE void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3) { + const int16x4_t b0 = vhadd_s16(*a0, *a1); + const int16x4_t b1 = vhsub_s16(*a0, *a1); + const int16x4_t b2 = vhadd_s16(*a2, *a3); + const int16x4_t b3 = vhsub_s16(*a2, *a3); + + *a0 = vadd_s16(b0, b2); + *a1 = vadd_s16(b1, b3); + *a2 = vsub_s16(b0, b2); + *a3 = vsub_s16(b1, b3); +} + +void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x4_t a0 = vld1_s16(src_diff); + int16x4_t a1 = vld1_s16(src_diff + src_stride); + int16x4_t a2 = vld1_s16(src_diff + 2 * src_stride); + int16x4_t a3 = vld1_s16(src_diff + 3 * src_stride); + + hadamard_4x4_one_pass(&a0, &a1, &a2, &a3); + + transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3); + + hadamard_4x4_one_pass(&a0, &a1, &a2, &a3); + + store_s16_to_tran_low(coeff, a0); + store_s16_to_tran_low(coeff + 4, a1); + store_s16_to_tran_low(coeff + 8, a2); + store_s16_to_tran_low(coeff + 12, a3); +} + +static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + const int16x8_t b0 = vaddq_s16(*a0, *a1); + const int16x8_t b1 = vsubq_s16(*a0, *a1); + const int16x8_t b2 = vaddq_s16(*a2, *a3); + const int16x8_t b3 = vsubq_s16(*a2, *a3); + const int16x8_t b4 = vaddq_s16(*a4, *a5); + const int16x8_t b5 = vsubq_s16(*a4, *a5); + const int16x8_t b6 = vaddq_s16(*a6, *a7); + const int16x8_t b7 = vsubq_s16(*a6, *a7); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + const int16x8_t c4 = vaddq_s16(b4, b6); + const int16x8_t c5 = vaddq_s16(b5, b7); + const int16x8_t c6 = vsubq_s16(b4, b6); + const int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a1 = vsubq_s16(c2, c6); + *a2 = vsubq_s16(c0, c4); + *a3 = vaddq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); + *a6 = vsubq_s16(c1, c5); + *a7 = vaddq_s16(c1, c5); +} + +void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x8_t a0 = vld1q_s16(src_diff); + int16x8_t a1 = vld1q_s16(src_diff + src_stride); + int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + // Skip the second transpose because it is not required. 
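+  // The two butterfly passes above, with the transpose between them, apply
+  // the 1-D Hadamard transform along both dimensions. Each output is a
+  // +/-1-weighted sum of all 64 residuals, so for 8-bit content
+  // (|src_diff| <= 255) every coefficient is bounded by 255 * 64 = 16320 and
+  // fits in int16 without saturation; the values are only widened to
+  // tran_low_t by the stores below.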
+ + store_s16q_to_tran_low(coeff + 0, a0); + store_s16q_to_tran_low(coeff + 8, a1); + store_s16q_to_tran_low(coeff + 16, a2); + store_s16q_to_tran_low(coeff + 24, a3); + store_s16q_to_tran_low(coeff + 32, a4); + store_s16q_to_tran_low(coeff + 40, a5); + store_s16q_to_tran_low(coeff + 48, a6); + store_s16q_to_tran_low(coeff + 56, a7); +} + +void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16x8_t a0 = vld1q_s16(src_diff); + int16x8_t a1 = vld1q_s16(src_diff + src_stride); + int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + // Skip the second transpose because it is not required. + + vst1q_s16(coeff + 0, a0); + vst1q_s16(coeff + 8, a1); + vst1q_s16(coeff + 16, a2); + vst1q_s16(coeff + 24, a3); + vst1q_s16(coeff + 32, a4); + vst1q_s16(coeff + 40, a5); + vst1q_s16(coeff + 48, a6); + vst1q_s16(coeff + 56, a7); +} + +void aom_hadamard_lp_8x8_dual_neon(const int16_t *src_diff, + ptrdiff_t src_stride, int16_t *coeff) { + for (int i = 0; i < 2; i++) { + aom_hadamard_lp_8x8_neon(src_diff + (i * 8), src_stride, coeff + (i * 64)); + } +} + +void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, + coeff + 0); + /* Top right. */ + aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, + coeff + 64); + /* Bottom left. */ + aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, + coeff + 128); + /* Bottom right. */ + aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, + coeff + 192); + + for (int i = 0; i < 64; i += 8) { + const int16x8_t a0 = vld1q_s16(coeff + 0); + const int16x8_t a1 = vld1q_s16(coeff + 64); + const int16x8_t a2 = vld1q_s16(coeff + 128); + const int16x8_t a3 = vld1q_s16(coeff + 192); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + vst1q_s16(coeff + 0, c0); + vst1q_s16(coeff + 64, c1); + vst1q_s16(coeff + 128, c2); + vst1q_s16(coeff + 192, c3); + + coeff += 8; + } +} + +void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. 
*/ + aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + // Each iteration of the loop operates on entire rows (16 samples each) + // because we need to swap the second and third quarters of every row in the + // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for + // loop at the end of aom_hadamard_16x16_c. + for (int i = 0; i < 64; i += 16) { + const int32x4_t a00 = vld1q_s32(coeff + 0); + const int32x4_t a01 = vld1q_s32(coeff + 64); + const int32x4_t a02 = vld1q_s32(coeff + 128); + const int32x4_t a03 = vld1q_s32(coeff + 192); + + const int32x4_t b00 = vhaddq_s32(a00, a01); + const int32x4_t b01 = vhsubq_s32(a00, a01); + const int32x4_t b02 = vhaddq_s32(a02, a03); + const int32x4_t b03 = vhsubq_s32(a02, a03); + + const int32x4_t c00 = vaddq_s32(b00, b02); + const int32x4_t c01 = vaddq_s32(b01, b03); + const int32x4_t c02 = vsubq_s32(b00, b02); + const int32x4_t c03 = vsubq_s32(b01, b03); + + const int32x4_t a10 = vld1q_s32(coeff + 4 + 0); + const int32x4_t a11 = vld1q_s32(coeff + 4 + 64); + const int32x4_t a12 = vld1q_s32(coeff + 4 + 128); + const int32x4_t a13 = vld1q_s32(coeff + 4 + 192); + + const int32x4_t b10 = vhaddq_s32(a10, a11); + const int32x4_t b11 = vhsubq_s32(a10, a11); + const int32x4_t b12 = vhaddq_s32(a12, a13); + const int32x4_t b13 = vhsubq_s32(a12, a13); + + const int32x4_t c10 = vaddq_s32(b10, b12); + const int32x4_t c11 = vaddq_s32(b11, b13); + const int32x4_t c12 = vsubq_s32(b10, b12); + const int32x4_t c13 = vsubq_s32(b11, b13); + + const int32x4_t a20 = vld1q_s32(coeff + 8 + 0); + const int32x4_t a21 = vld1q_s32(coeff + 8 + 64); + const int32x4_t a22 = vld1q_s32(coeff + 8 + 128); + const int32x4_t a23 = vld1q_s32(coeff + 8 + 192); + + const int32x4_t b20 = vhaddq_s32(a20, a21); + const int32x4_t b21 = vhsubq_s32(a20, a21); + const int32x4_t b22 = vhaddq_s32(a22, a23); + const int32x4_t b23 = vhsubq_s32(a22, a23); + + const int32x4_t c20 = vaddq_s32(b20, b22); + const int32x4_t c21 = vaddq_s32(b21, b23); + const int32x4_t c22 = vsubq_s32(b20, b22); + const int32x4_t c23 = vsubq_s32(b21, b23); + + const int32x4_t a30 = vld1q_s32(coeff + 12 + 0); + const int32x4_t a31 = vld1q_s32(coeff + 12 + 64); + const int32x4_t a32 = vld1q_s32(coeff + 12 + 128); + const int32x4_t a33 = vld1q_s32(coeff + 12 + 192); + + const int32x4_t b30 = vhaddq_s32(a30, a31); + const int32x4_t b31 = vhsubq_s32(a30, a31); + const int32x4_t b32 = vhaddq_s32(a32, a33); + const int32x4_t b33 = vhsubq_s32(a32, a33); + + const int32x4_t c30 = vaddq_s32(b30, b32); + const int32x4_t c31 = vaddq_s32(b31, b33); + const int32x4_t c32 = vsubq_s32(b30, b32); + const int32x4_t c33 = vsubq_s32(b31, b33); + + vst1q_s32(coeff + 0 + 0, c00); + vst1q_s32(coeff + 0 + 4, c20); + vst1q_s32(coeff + 0 + 8, c10); + vst1q_s32(coeff + 0 + 12, c30); + + vst1q_s32(coeff + 64 + 0, c01); + vst1q_s32(coeff + 64 + 4, c21); + vst1q_s32(coeff + 64 + 8, c11); + vst1q_s32(coeff + 64 + 12, c31); + + vst1q_s32(coeff + 128 + 0, c02); + vst1q_s32(coeff + 128 + 4, c22); + vst1q_s32(coeff + 128 + 8, c12); + vst1q_s32(coeff + 128 + 12, c32); + + vst1q_s32(coeff + 192 + 0, c03); + vst1q_s32(coeff + 192 + 4, c23); + vst1q_s32(coeff + 192 + 8, c13); + vst1q_s32(coeff + 192 + 12, c33); + + coeff += 16; + } +} + +void aom_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + /* Top left first. */ + aom_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. 
*/ + aom_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride, + coeff + 256); + /* Bottom left. */ + aom_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride, + coeff + 512); + /* Bottom right. */ + aom_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride, + coeff + 768); + + for (int i = 0; i < 256; i += 4) { + const int32x4_t a0 = vld1q_s32(coeff); + const int32x4_t a1 = vld1q_s32(coeff + 256); + const int32x4_t a2 = vld1q_s32(coeff + 512); + const int32x4_t a3 = vld1q_s32(coeff + 768); + + const int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); + const int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); + const int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); + const int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); + + const int32x4_t c0 = vaddq_s32(b0, b2); + const int32x4_t c1 = vaddq_s32(b1, b3); + const int32x4_t c2 = vsubq_s32(b0, b2); + const int32x4_t c3 = vsubq_s32(b1, b3); + + vst1q_s32(coeff + 0, c0); + vst1q_s32(coeff + 256, c1); + vst1q_s32(coeff + 512, c2); + vst1q_s32(coeff + 768, c3); + + coeff += 4; + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_neon.c new file mode 100644 index 0000000000..47d5dae012 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_avg_neon.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_ports/mem.h" + +uint32_t aom_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + uint16x4_t sum, a0, a1, a2, a3; + + load_u16_4x4(a_ptr, a_stride, &a0, &a1, &a2, &a3); + + sum = vadd_u16(a0, a1); + sum = vadd_u16(sum, a2); + sum = vadd_u16(sum, a3); + + return (horizontal_add_u16x4(sum) + (1 << 3)) >> 4; +} + +uint32_t aom_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; + + load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + sum = vaddq_u16(a0, a1); + sum = vaddq_u16(sum, a2); + sum = vaddq_u16(sum, a3); + sum = vaddq_u16(sum, a4); + sum = vaddq_u16(sum, a5); + sum = vaddq_u16(sum, a6); + sum = vaddq_u16(sum, a7); + + return (horizontal_add_u16x8(sum) + (1 << 5)) >> 6; +} + +void aom_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp); + + const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); + const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); + const uint16x8_t abs_diff2 = vabdq_u16(a2, b2); + const uint16x8_t abs_diff3 = vabdq_u16(a3, b3); + const uint16x8_t abs_diff4 = vabdq_u16(a4, b4); + const uint16x8_t abs_diff5 = vabdq_u16(a5, b5); + const uint16x8_t abs_diff6 = vabdq_u16(a6, b6); + const uint16x8_t abs_diff7 = vabdq_u16(a7, b7); + + const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1); + const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3); + const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5); + const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7); + + const uint16x8_t max0123 = vmaxq_u16(max01, max23); + const uint16x8_t max4567 = vmaxq_u16(max45, max67); + const uint16x8_t max07 = vmaxq_u16(max0123, max4567); + + const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1); + const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3); + const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5); + const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7); + + const uint16x8_t min0123 = vminq_u16(min01, min23); + const uint16x8_t min4567 = vminq_u16(min45, min67); + const uint16x8_t min07 = vminq_u16(min0123, min4567); + +#if AOM_ARCH_AARCH64 + *max = (int)vmaxvq_u16(max07); + *min = (int)vminvq_u16(min07); +#else + // Split into 64-bit vectors and execute pairwise min/max. 
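+  // Each vpmax/vpmin of a vector with itself replaces adjacent lane pairs
+  // with their pairwise max/min, halving the number of distinct candidates
+  // per round; after the rounds below every lane of ab_max/ab_min holds the
+  // block-wide maximum/minimum of the absolute differences computed above.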
+ uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07)); + uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07)); + + // Enough runs of vpmax/min propagate the max/min values to every position. + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u16((uint16_t *)max, ab_max, 0); + vst1_lane_u16((uint16_t *)min, ab_min, 0); +#endif +} diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c new file mode 100644 index 0000000000..531309b025 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/dist_wtd_avg_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/blend.h" + +void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + int i = height; + if (width > 8) { + do { + int j = 0; + do { + const uint16x8_t p = vld1q_u16(pred + j); + const uint16x8_t r = vld1q_u16(ref + j); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred + j, avg); + + j += 8; + } while (j < width); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else if (width == 8) { + do { + const uint16x8_t p = vld1q_u16(pred); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else { + assert(width == 4); + do { + const uint16x4_t p = vld1_u16(pred); + const uint16x4_t r = vld1_u16(ref); + + uint16x4_t avg = vrhadd_u16(p, r); + vst1_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } +} + +void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int src_stride0 = invert_mask ? width : ref_stride; + const int src_stride1 = invert_mask ? 
ref_stride : width; + + if (width >= 8) { + do { + int j = 0; + + do { + const uint16x8_t s0 = vld1q_u16(src0 + j); + const uint16x8_t s1 = vld1q_u16(src1 + j); + const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j)); + + uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1); + + vst1q_u16(comp_pred + j, blend_u16); + + j += 8; + } while (j < width); + + src0 += src_stride0; + src1 += src_stride1; + mask += mask_stride; + comp_pred += width; + } while (--height != 0); + } else { + assert(width == 4); + + do { + const uint16x4_t s0 = vld1_u16(src0); + const uint16x4_t s1 = vld1_u16(src1); + const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask))); + + uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1); + + vst1_u16(comp_pred, blend_u16); + + src0 += src_stride0; + src1 += src_stride1; + mask += mask_stride; + comp_pred += 4; + } while (--height != 0); + } +} + +void aom_highbd_dist_wtd_comp_avg_pred_neon( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint16x8_t fwd_offset_u16 = vdupq_n_u16(jcp_param->fwd_offset); + const uint16x8_t bck_offset_u16 = vdupq_n_u16(jcp_param->bck_offset); + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + if (width > 8) { + do { + int j = 0; + do { + const uint16x8_t p = vld1q_u16(pred + j); + const uint16x8_t r = vld1q_u16(ref + j); + + const uint16x8_t avg = + dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16); + + vst1q_u16(comp_pred + j, avg); + + j += 8; + } while (j < width); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--height != 0); + } else if (width == 8) { + do { + const uint16x8_t p = vld1q_u16(pred); + const uint16x8_t r = vld1q_u16(ref); + + const uint16x8_t avg = + dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16); + + vst1q_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--height != 0); + } else { + assert(width == 4); + do { + const uint16x4_t p = vld1_u16(pred); + const uint16x4_t r = vld1_u16(ref); + + const uint16x4_t avg = dist_wtd_avg_u16x4( + r, p, vget_low_u16(fwd_offset_u16), vget_low_u16(bck_offset_u16)); + + vst1_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--height != 0); + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c new file mode 100644 index 0000000000..8b03e91ac3 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/blend.h" + +void aom_highbd_blend_a64_hmask_neon(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + int w, int h, int bd) { + (void)bd; + + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (w >= 8) { + do { + int i = 0; + do { + uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); + + vst1q_u16(dst + i, blend); + i += 8; + } while (i < w); + + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 4) { + const uint16x8_t m0 = vmovl_u8(load_unaligned_dup_u8_4x2(mask)); + do { + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); + + store_u16x4_strided_x2(dst, dst_stride, blend); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 2 && h >= 8) { + const uint16x4_t m0 = + vget_low_u16(vmovl_u8(load_unaligned_dup_u8_2x4(mask))); + do { + uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride); + uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride); + + uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1); + + store_u16x2_strided_x2(dst, dst_stride, blend); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + aom_highbd_blend_a64_hmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, w, h, bd); + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c new file mode 100644 index 0000000000..90b44fcc5e --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c @@ -0,0 +1,473 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/blend.h" + +#define HBD_BLEND_A64_D16_MASK(bd, round0_bits) \ + static INLINE uint16x8_t alpha_##bd##_blend_a64_d16_u16x8( \ + uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) { \ + const uint16x8_t m_inv = \ + vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); \ + \ + uint32x4_t blend_u32_lo = vmlal_u16(vreinterpretq_u32_s32(round_offset), \ + vget_low_u16(m), vget_low_u16(a)); \ + uint32x4_t blend_u32_hi = vmlal_u16(vreinterpretq_u32_s32(round_offset), \ + vget_high_u16(m), vget_high_u16(a)); \ + \ + blend_u32_lo = \ + vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b)); \ + blend_u32_hi = \ + vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b)); \ + \ + uint16x4_t blend_u16_lo = \ + vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_lo), \ + AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \ + round0_bits - COMPOUND_ROUND1_BITS); \ + uint16x4_t blend_u16_hi = \ + vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_hi), \ + AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \ + round0_bits - COMPOUND_ROUND1_BITS); \ + \ + uint16x8_t blend_u16 = vcombine_u16(blend_u16_lo, blend_u16_hi); \ + blend_u16 = vminq_u16(blend_u16, vdupq_n_u16((1 << bd) - 1)); \ + \ + return blend_u16; \ + } \ + \ + static INLINE void highbd_##bd##_blend_a64_d16_mask_neon( \ + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, \ + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, \ + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, \ + int subh) { \ + const int offset_bits = bd + 2 * FILTER_BITS - round0_bits; \ + int32_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + \ + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); \ + int32x4_t offset = \ + vdupq_n_s32(-(round_offset << AOM_BLEND_A64_ROUND_BITS)); \ + \ + if ((subw | subh) == 0) { \ + if (w >= 8) { \ + do { \ + int i = 0; \ + do { \ + uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); \ + uint16x8_t s0 = vld1q_u16(src0 + i); \ + uint16x8_t s1 = vld1q_u16(src1 + i); \ + \ + uint16x8_t blend = \ + alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \ + \ + vst1q_u16(dst + i, blend); \ + i += 8; \ + } while (i < w); \ + \ + mask += mask_stride; \ + src0 += src0_stride; \ + src1 += src1_stride; \ + dst += dst_stride; \ + } while (--h != 0); \ + } else { \ + do { \ + uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); \ + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \ + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \ + \ + uint16x8_t blend = \ + alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \ + \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ + \ + mask += 2 * mask_stride; \ + src0 += 2 * src0_stride; \ + src1 += 2 * src1_stride; \ + dst += 2 * dst_stride; \ + h -= 2; \ + } while (h != 0); \ + } \ + } else if ((subw & subh) == 1) { \ + if (w >= 8) { \ + do { \ + int i = 0; \ + do { \ + uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i); \ + uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i); \ + uint16x8_t s0 = vld1q_u16(src0 + i); \ + uint16x8_t s1 = vld1q_u16(src1 + i); \ + \ + uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4( \ + vget_low_u8(m0), vget_low_u8(m1), vget_high_u8(m0), \ + vget_high_u8(m1))); \ + uint16x8_t blend = \ + alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ + \ + vst1q_u16(dst + i, blend); \ + 
i += 8; \ + } while (i < w); \ + \ + mask += 2 * mask_stride; \ + src0 += src0_stride; \ + src1 += src1_stride; \ + dst += dst_stride; \ + } while (--h != 0); \ + } else { \ + do { \ + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \ + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \ + uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); \ + uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); \ + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \ + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \ + \ + uint16x8_t m_avg = \ + vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); \ + uint16x8_t blend = \ + alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ + \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ + \ + mask += 4 * mask_stride; \ + src0 += 2 * src0_stride; \ + src1 += 2 * src1_stride; \ + dst += 2 * dst_stride; \ + h -= 2; \ + } while (h != 0); \ + } \ + } else if (subw == 1 && subh == 0) { \ + if (w >= 8) { \ + do { \ + int i = 0; \ + do { \ + uint8x8_t m0 = vld1_u8(mask + 2 * i); \ + uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); \ + uint16x8_t s0 = vld1q_u16(src0 + i); \ + uint16x8_t s1 = vld1q_u16(src1 + i); \ + \ + uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \ + uint16x8_t blend = \ + alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ + \ + vst1q_u16(dst + i, blend); \ + i += 8; \ + } while (i < w); \ + \ + mask += mask_stride; \ + src0 += src0_stride; \ + src1 += src1_stride; \ + dst += dst_stride; \ + } while (--h != 0); \ + } else { \ + do { \ + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \ + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \ + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \ + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \ + \ + uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \ + uint16x8_t blend = \ + alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ + \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ + \ + mask += 2 * mask_stride; \ + src0 += 2 * src0_stride; \ + src1 += 2 * src1_stride; \ + dst += 2 * dst_stride; \ + h -= 2; \ + } while (h != 0); \ + } \ + } else { \ + if (w >= 8) { \ + do { \ + int i = 0; \ + do { \ + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); \ + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); \ + uint16x8_t s0 = vld1q_u16(src0 + i); \ + uint16x8_t s1 = vld1q_u16(src1 + i); \ + \ + uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); \ + uint16x8_t blend = \ + alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ + \ + vst1q_u16(dst + i, blend); \ + i += 8; \ + } while (i < w); \ + \ + mask += 2 * mask_stride; \ + src0 += src0_stride; \ + src1 += src1_stride; \ + dst += dst_stride; \ + } while (--h != 0); \ + } else { \ + do { \ + uint8x8_t m0_2 = \ + load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); \ + uint8x8_t m1_3 = \ + load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); \ + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \ + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \ + \ + uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); \ + uint16x8_t blend = \ + alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ + \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ + \ + mask += 4 * mask_stride; \ + src0 += 2 * src0_stride; \ + src1 += 2 * src1_stride; \ + dst += 2 * dst_stride; \ + h -= 2; \ + } while (h != 0); \ + } \ + } \ + } + +// 12 bitdepth +HBD_BLEND_A64_D16_MASK(12, (ROUND0_BITS + 2)) +// 10 bitdepth +HBD_BLEND_A64_D16_MASK(10, ROUND0_BITS) +// 
8 bitdepth +HBD_BLEND_A64_D16_MASK(8, ROUND0_BITS) + +void aom_highbd_blend_a64_d16_mask_neon( + uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + (void)conv_params; + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + if (bd == 12) { + highbd_12_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + subw, subh); + } else if (bd == 10) { + highbd_10_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + subw, subh); + } else { + highbd_8_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh); + } +} + +void aom_highbd_blend_a64_mask_neon(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h, int subw, int subh, int bd) { + (void)bd; + + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if ((subw | subh) == 0) { + if (w >= 8) { + do { + int i = 0; + do { + uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); + + vst1q_u16(dst + i, blend); + i += 8; + } while (i < w); + + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); + + store_u16x4_strided_x2(dst, dst_stride, blend); + + mask += 2 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } else if ((subw & subh) == 1) { + if (w >= 8) { + do { + int i = 0; + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i); + uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8); + uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8); + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint16x8_t m_avg = + vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); + + uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); + + vst1q_u16(dst + i, blend); + + i += 8; + } while (i < w); + + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); + uint8x8_t m2 = vld1_u8(mask + 2 
* mask_stride); + uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); + uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); + + store_u16x4_strided_x2(dst, dst_stride, blend); + + mask += 4 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } else if (subw == 1 && subh == 0) { + if (w >= 8) { + do { + int i = 0; + + do { + uint8x8_t m0 = vld1_u8(mask + 2 * i); + uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); + uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); + + vst1q_u16(dst + i, blend); + + i += 8; + } while (i < w); + + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); + uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); + + store_u16x4_strided_x2(dst, dst_stride, blend); + + mask += 2 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } else { + if (w >= 8) { + do { + int i = 0; + do { + uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); + uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); + uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); + + vst1q_u16(dst + i, blend); + + i += 8; + } while (i < w); + + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint8x8_t m0_2 = + load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); + uint8x8_t m1_3 = + load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); + uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); + + store_u16x4_strided_x2(dst, dst_stride, blend); + + mask += 4 * mask_stride; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c new file mode 100644 index 0000000000..1292e20342 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/blend.h" + +void aom_highbd_blend_a64_vmask_neon(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + int w, int h, int bd) { + (void)bd; + + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (w >= 8) { + do { + uint16x8_t m = vmovl_u8(vdup_n_u8(mask[0])); + int i = 0; + do { + uint16x8_t s0 = vld1q_u16(src0 + i); + uint16x8_t s1 = vld1q_u16(src1 + i); + + uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1); + + vst1q_u16(dst + i, blend); + i += 8; + } while (i < w); + + mask += 1; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 4) { + do { + uint16x4_t m1 = vdup_n_u16((uint16_t)mask[0]); + uint16x4_t m2 = vdup_n_u16((uint16_t)mask[1]); + uint16x8_t m = vcombine_u16(m1, m2); + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1); + + store_u16x4_strided_x2(dst, dst_stride, blend); + + mask += 2; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 2 && h >= 8) { + do { + uint16x4_t m0 = vdup_n_u16(0); + m0 = vld1_lane_u16((uint16_t *)mask, m0, 0); + uint8x8_t m0_zip = + vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0]; + m0 = vget_low_u16(vmovl_u8(m0_zip)); + uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride); + uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride); + + uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1); + + store_u16x2_strided_x2(dst, dst_stride, blend); + + mask += 2; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, w, h, bd); + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c b/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c new file mode 100644 index 0000000000..e25438c9b4 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" + +static INLINE int32x4_t highbd_convolve8_4_s32( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); + + return sum; +} + +static INLINE uint16x4_t highbd_convolve8_4_s32_s16( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { + int32x4_t sum = + highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + return vqrshrun_n_s32(sum, FILTER_BITS); +} + +static INLINE int32x4_t highbd_convolve8_horiz4_s32( + const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) { + const int16x8_t s2 = vextq_s16(s0, s1, 1); + const int16x8_t s3 = vextq_s16(s0, s1, 2); + const int16x8_t s4 = vextq_s16(s0, s1, 3); + const int16x4_t s0_lo = vget_low_s16(s0); + const int16x4_t s1_lo = vget_low_s16(s2); + const int16x4_t s2_lo = vget_low_s16(s3); + const int16x4_t s3_lo = vget_low_s16(s4); + const int16x4_t s4_lo = vget_high_s16(s0); + const int16x4_t s5_lo = vget_high_s16(s2); + const int16x4_t s6_lo = vget_high_s16(s3); + const int16x4_t s7_lo = vget_high_s16(s4); + + return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo, + s7_lo, x_filter_0_7); +} + +static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16( + const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) { + int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7); + + return vqrshrun_n_s32(sum, FILTER_BITS); +} + +static INLINE void highbd_convolve8_8_s32( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + int32x4_t *sum0, int32x4_t *sum1) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + *sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3); + + *sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), 
y_filter_lo, 2); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3); +} + +static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0, + const int16x8_t s0_hi, + const int16x8_t x_filter_0_7, + int32x4_t *sum0, + int32x4_t *sum1) { + const int16x8_t s1 = vextq_s16(s0, s0_hi, 1); + const int16x8_t s2 = vextq_s16(s0, s0_hi, 2); + const int16x8_t s3 = vextq_s16(s0, s0_hi, 3); + const int16x8_t s4 = vextq_s16(s0, s0_hi, 4); + const int16x8_t s5 = vextq_s16(s0, s0_hi, 5); + const int16x8_t s6 = vextq_s16(s0, s0_hi, 6); + const int16x8_t s7 = vextq_s16(s0, s0_hi, 7); + + highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0, + sum1); +} + +static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16( + const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) { + int32x4_t sum0, sum1; + highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1); + + return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); +} + +static INLINE uint16x8_t highbd_convolve8_8_s32_s16( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) { + int32x4_t sum0; + int32x4_t sum1; + highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0, + &sum1); + + return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); +} + +static void highbd_convolve_horiz_neon(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, + const int16_t *x_filter_ptr, + int x_step_q4, int w, int h, int bd) { + assert(w >= 4 && h >= 4); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x2(s, src_stride, &s0, &s2); + load_s16_8x2(s + 8, src_stride, &s1, &s3); + + uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter); + uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter); + + uint16x8_t d01 = vcombine_u16(d0, d1); + d01 = vminq_u16(d01, max); + + vst1_u16(d + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(d + 1 * dst_stride, vget_high_u16(d01)); + + s += 2 * src_stride; + d += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + int x_q4 = 0; + + const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS]; + int16x8_t s0, s2, s4, s6; + load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6); + src_x += 8; + + do { + int16x8_t s1, s3, s5, s7; + load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7); + + uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter); + uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter); + uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter); + uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s1; + s2 = s3; + s4 = s5; 
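+        // s1/s3/s5/s7 (the next eight samples of each row) slide into the
+        // current-window registers, so each pass of the inner loop needs only
+        // one fresh 8-sample load per row before src_x advances by 8.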
+ s6 = s7; + src_x += 8; + d += 8; + width -= 8; + x_q4 += 8 * x_step_q4; + } while (width > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + aom_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); + } else { + (void)filter_y; + (void)y_step_q4; + + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= SUBPEL_TAPS / 2 - 1; + highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, w, h, bd); + } +} + +static void highbd_convolve_vert_neon(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, + const int16_t *y_filter_ptr, int w, int h, + int bd) { + assert(w >= 4 && h >= 4); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + uint16x4_t d1 = + highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + uint16x4_t d2 = + highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + uint16x4_t d3 = + highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + uint16x8_t d01 = vcombine_u16(d0, d1); + uint16x8_t d23 = vcombine_u16(d2, d3); + + d01 = vminq_u16(d01, max); + d23 = vminq_u16(d23, max); + + vst1_u16(d + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(d + 1 * dst_stride, vget_high_u16(d01)); + vst1_u16(d + 2 * dst_stride, vget_low_u16(d23)); + vst1_u16(d + 3 * dst_stride, vget_high_u16(d23)); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6, + s7, y_filter); + uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7, + s8, y_filter); + uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8, + s9, y_filter); + uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9, + s10, y_filter); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } +} + +void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, 
ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + aom_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); + } else { + (void)filter_x; + (void)x_step_q4; + + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride; + highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h, + bd); + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c b/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c new file mode 100644 index 0000000000..d28617c67e --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_ports/mem.h" + +static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, + int16x8_t *a7) { + int16x8_t b0 = vaddq_s16(*a0, *a1); + int16x8_t b1 = vsubq_s16(*a0, *a1); + int16x8_t b2 = vaddq_s16(*a2, *a3); + int16x8_t b3 = vsubq_s16(*a2, *a3); + int16x8_t b4 = vaddq_s16(*a4, *a5); + int16x8_t b5 = vsubq_s16(*a4, *a5); + int16x8_t b6 = vaddq_s16(*a6, *a7); + int16x8_t b7 = vsubq_s16(*a6, *a7); + + int16x8_t c0 = vaddq_s16(b0, b2); + int16x8_t c2 = vsubq_s16(b0, b2); + int16x8_t c1 = vaddq_s16(b1, b3); + int16x8_t c3 = vsubq_s16(b1, b3); + int16x8_t c4 = vaddq_s16(b4, b6); + int16x8_t c6 = vsubq_s16(b4, b6); + int16x8_t c5 = vaddq_s16(b5, b7); + int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a2 = vsubq_s16(c0, c4); + *a7 = vaddq_s16(c1, c5); + *a6 = vsubq_s16(c1, c5); + *a3 = vaddq_s16(c2, c6); + *a1 = vsubq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); +} + +static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1, + int16x4_t a2, int16x4_t a3, + int16x4_t a4, int16x4_t a5, + int16x4_t a6, int16x4_t a7, + tran_low_t *coeff) { + int32x4_t b0 = vaddl_s16(a0, a1); + int32x4_t b1 = vsubl_s16(a0, a1); + int32x4_t b2 = vaddl_s16(a2, a3); + int32x4_t b3 = vsubl_s16(a2, a3); + int32x4_t b4 = vaddl_s16(a4, a5); + int32x4_t b5 = vsubl_s16(a4, a5); + int32x4_t b6 = vaddl_s16(a6, a7); + int32x4_t b7 = vsubl_s16(a6, a7); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c3 = vsubq_s32(b1, b3); + int32x4_t c4 = vaddq_s32(b4, b6); + int32x4_t c6 = vsubq_s32(b4, b6); + int32x4_t c5 = vaddq_s32(b5, b7); + int32x4_t c7 = vsubq_s32(b5, b7); + + int32x4_t d0 = vaddq_s32(c0, c4); + int32x4_t d2 = vsubq_s32(c0, c4); + int32x4_t d7 = vaddq_s32(c1, c5); + int32x4_t d6 = vsubq_s32(c1, 
c5); + int32x4_t d3 = vaddq_s32(c2, c6); + int32x4_t d1 = vsubq_s32(c2, c6); + int32x4_t d4 = vaddq_s32(c3, c7); + int32x4_t d5 = vsubq_s32(c3, c7); + + vst1q_s32(coeff + 0, d0); + vst1q_s32(coeff + 4, d1); + vst1q_s32(coeff + 8, d2); + vst1q_s32(coeff + 12, d3); + vst1q_s32(coeff + 16, d4); + vst1q_s32(coeff + 20, d5); + vst1q_s32(coeff + 24, d6); + vst1q_s32(coeff + 28, d7); +} + +void aom_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x4_t b0, b1, b2, b3, b4, b5, b6, b7; + + int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride); + int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride); + int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride); + + // For the first pass we can stay in 16-bit elements (4095*8 = 32760). + hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + // For the second pass we need to widen to 32-bit elements, so we're + // processing 4 columns at a time. + // Skip the second transpose because it is not required. + + b0 = vget_low_s16(s0); + b1 = vget_low_s16(s1); + b2 = vget_low_s16(s2); + b3 = vget_low_s16(s3); + b4 = vget_low_s16(s4); + b5 = vget_low_s16(s5); + b6 = vget_low_s16(s6); + b7 = vget_low_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff); + + b0 = vget_high_s16(s0); + b1 = vget_high_s16(s1); + b2 = vget_high_s16(s2); + b3 = vget_high_s16(s3); + b4 = vget_high_s16(s4); + b5 = vget_high_s16(s5); + b6 = vget_high_s16(s6); + b7 = vget_high_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); +} + +void aom_highbd_hadamard_16x16_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + // Rearrange 16x16 to 8x32 and remove stride. + // Top left first. + aom_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff); + // Top right. + aom_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64); + // Bottom left. + aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, + coeff + 128); + // Bottom right. + aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride, + coeff + 192); + + for (int i = 0; i < 16; i++) { + int32x4_t a0 = vld1q_s32(coeff + 4 * i); + int32x4_t a1 = vld1q_s32(coeff + 4 * i + 64); + int32x4_t a2 = vld1q_s32(coeff + 4 * i + 128); + int32x4_t a3 = vld1q_s32(coeff + 4 * i + 192); + + int32x4_t b0 = vhaddq_s32(a0, a1); + int32x4_t b1 = vhsubq_s32(a0, a1); + int32x4_t b2 = vhaddq_s32(a2, a3); + int32x4_t b3 = vhsubq_s32(a2, a3); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + vst1q_s32(coeff + 4 * i, c0); + vst1q_s32(coeff + 4 * i + 64, c1); + vst1q_s32(coeff + 4 * i + 128, c2); + vst1q_s32(coeff + 4 * i + 192, c3); + } +} + +void aom_highbd_hadamard_32x32_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + // Rearrange 32x32 to 16x64 and remove stride. + // Top left first. + aom_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff); + // Top right. + aom_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256); + // Bottom left. 
+ aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride, + coeff + 512); + // Bottom right. + aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride, + coeff + 768); + + for (int i = 0; i < 64; i++) { + int32x4_t a0 = vld1q_s32(coeff + 4 * i); + int32x4_t a1 = vld1q_s32(coeff + 4 * i + 256); + int32x4_t a2 = vld1q_s32(coeff + 4 * i + 512); + int32x4_t a3 = vld1q_s32(coeff + 4 * i + 768); + + int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); + int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); + int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); + int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + vst1q_s32(coeff + 4 * i, c0); + vst1q_s32(coeff + 4 * i + 256, c1); + vst1q_s32(coeff + 4 * i + 512, c2); + vst1q_s32(coeff + 4 * i + 768, c3); + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c b/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c new file mode 100644 index 0000000000..dc47974c68 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c @@ -0,0 +1,2730 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_dsp/intrapred_common.h" + +// ----------------------------------------------------------------------------- +// DC + +static INLINE void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h, + uint16x4_t dc) { + for (int i = 0; i < h; ++i) { + vst1_u16(dst + i * stride, dc); + } +} + +static INLINE void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h, + uint16x8_t dc) { + for (int i = 0; i < h; ++i) { + vst1q_u16(dst + i * stride, dc); + } +} + +static INLINE void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h, + uint16x8_t dc) { + for (int i = 0; i < h; ++i) { + vst1q_u16(dst + i * stride, dc); + vst1q_u16(dst + i * stride + 8, dc); + } +} + +static INLINE void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h, + uint16x8_t dc) { + for (int i = 0; i < h; ++i) { + vst1q_u16(dst + i * stride, dc); + vst1q_u16(dst + i * stride + 8, dc); + vst1q_u16(dst + i * stride + 16, dc); + vst1q_u16(dst + i * stride + 24, dc); + } +} + +static INLINE void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h, + uint16x8_t dc) { + for (int i = 0; i < h; ++i) { + vst1q_u16(dst + i * stride, dc); + vst1q_u16(dst + i * stride + 8, dc); + vst1q_u16(dst + i * stride + 16, dc); + vst1q_u16(dst + i * stride + 24, dc); + vst1q_u16(dst + i * stride + 32, dc); + vst1q_u16(dst + i * stride + 40, dc); + vst1q_u16(dst + i * stride + 48, dc); + vst1q_u16(dst + i * stride + 56, dc); + } +} + +static INLINE uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) { + // Need to assume input is up to 16 bits wide from dc 64x64 partial
sum, so + // promote first. + const uint32x4_t b = vpaddlq_u16(a); +#if AOM_ARCH_AARCH64 + const uint32x4_t c = vpaddq_u32(b, b); + return vpaddq_u32(c, c); +#else + const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b)); + const uint32x2_t d = vpadd_u32(c, c); + return vcombine_u32(d, d); +#endif +} + +static INLINE uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) { + // Nothing to do since sum is already one vector, but saves needing to + // special case w=4 or h=4 cases. The combine will be zero cost for a sane + // compiler since vld1 already sets the top half of a vector to zero as part + // of the operation. + return vcombine_u16(vld1_u16(left), vdup_n_u16(0)); +} + +static INLINE uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) { + // Nothing to do since sum is already one vector, but saves needing to + // special case w=8 or h=8 cases. + return vld1q_u16(left); +} + +static INLINE uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) { + const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits + const uint16x8_t a1 = vld1q_u16(left + 8); + return vaddq_u16(a0, a1); // up to 13 bits +} + +static INLINE uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) { + const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits + const uint16x8_t a1 = vld1q_u16(left + 8); + const uint16x8_t a2 = vld1q_u16(left + 16); + const uint16x8_t a3 = vld1q_u16(left + 24); + const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits + const uint16x8_t b1 = vaddq_u16(a2, a3); + return vaddq_u16(b0, b1); // up to 14 bits +} + +static INLINE uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) { + const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits + const uint16x8_t a1 = vld1q_u16(left + 8); + const uint16x8_t a2 = vld1q_u16(left + 16); + const uint16x8_t a3 = vld1q_u16(left + 24); + const uint16x8_t a4 = vld1q_u16(left + 32); + const uint16x8_t a5 = vld1q_u16(left + 40); + const uint16x8_t a6 = vld1q_u16(left + 48); + const uint16x8_t a7 = vld1q_u16(left + 56); + const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits + const uint16x8_t b1 = vaddq_u16(a2, a3); + const uint16x8_t b2 = vaddq_u16(a4, a5); + const uint16x8_t b3 = vaddq_u16(a6, a7); + const uint16x8_t c0 = vaddq_u16(b0, b1); // up to 14 bits + const uint16x8_t c1 = vaddq_u16(b2, b3); + return vaddq_u16(c0, c1); // up to 15 bits +} + +#define HIGHBD_DC_PREDICTOR(w, h, shift) \ + void aom_highbd_dc_predictor_##w##x##h##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + const uint16x8_t a = highbd_dc_load_partial_sum_##w(above); \ + const uint16x8_t l = highbd_dc_load_partial_sum_##h(left); \ + const uint32x4_t sum = \ + horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l)); \ + const uint16x4_t dc0 = vrshrn_n_u32(sum, shift); \ + highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \ + } + +void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // In the rectangular cases we simply extend the shorter vector to uint16x8 + // in order to accumulate, however in the 4x4 case there is no shorter vector + // to extend so it is beneficial to do the whole calculation in uint16x4 + // instead. 
+ (void)bd; + const uint16x4_t a = vld1_u16(above); // up to 12 bits + const uint16x4_t l = vld1_u16(left); + uint16x4_t sum = vpadd_u16(a, l); // up to 13 bits + sum = vpadd_u16(sum, sum); // up to 14 bits + sum = vpadd_u16(sum, sum); + const uint16x4_t dc = vrshr_n_u16(sum, 3); + highbd_dc_store_4xh(dst, stride, 4, dc); +} + +HIGHBD_DC_PREDICTOR(8, 8, 4) +HIGHBD_DC_PREDICTOR(16, 16, 5) +HIGHBD_DC_PREDICTOR(32, 32, 6) +HIGHBD_DC_PREDICTOR(64, 64, 7) + +#undef HIGHBD_DC_PREDICTOR + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier, int shift2) { + const int interm = num >> shift1; + return interm * multiplier >> shift2; +} + +#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB +#define HIGHBD_DC_MULTIPLIER_1X4 0x6667 +#define HIGHBD_DC_SHIFT2 17 + +static INLINE int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1, + uint32_t multiplier) { + return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier, + HIGHBD_DC_SHIFT2); +} + +#undef HIGHBD_DC_SHIFT2 + +#define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult) \ + void aom_highbd_dc_predictor_##w##x##h##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above); \ + uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left); \ + uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above); \ + int sum = horizontal_add_u16x8(sum_vec); \ + int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \ + highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0)); \ + } + +HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2) +HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4) +HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2) +HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2) +HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4) +HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4) +HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2) +HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2) +HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4) +HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4) +HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2) +HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2) +HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4) +HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2) + +#undef HIGHBD_DC_PREDICTOR_RECT +#undef HIGHBD_DC_MULTIPLIER_1X2 +#undef HIGHBD_DC_MULTIPLIER_1X4 + +// ----------------------------------------------------------------------------- +// DC_128 + +#define HIGHBD_DC_PREDICTOR_128(w, h, q) \ + void aom_highbd_dc_128_predictor_##w##x##h##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)above; \ + (void)bd; \ + (void)left; \ + highbd_dc_store_##w##xh(dst, stride, (h), \ + vdup##q##_n_u16(0x80 << (bd - 8))); \ + } + +HIGHBD_DC_PREDICTOR_128(4, 4, ) +HIGHBD_DC_PREDICTOR_128(4, 8, ) +HIGHBD_DC_PREDICTOR_128(4, 16, ) +HIGHBD_DC_PREDICTOR_128(8, 4, q) +HIGHBD_DC_PREDICTOR_128(8, 8, q) +HIGHBD_DC_PREDICTOR_128(8, 16, q) +HIGHBD_DC_PREDICTOR_128(8, 32, q) +HIGHBD_DC_PREDICTOR_128(16, 4, q) +HIGHBD_DC_PREDICTOR_128(16, 8, q) +HIGHBD_DC_PREDICTOR_128(16, 16, q) +HIGHBD_DC_PREDICTOR_128(16, 32, q) +HIGHBD_DC_PREDICTOR_128(16, 64, q) +HIGHBD_DC_PREDICTOR_128(32, 8, q) +HIGHBD_DC_PREDICTOR_128(32, 16, q) 
+HIGHBD_DC_PREDICTOR_128(32, 32, q) +HIGHBD_DC_PREDICTOR_128(32, 64, q) +HIGHBD_DC_PREDICTOR_128(64, 16, q) +HIGHBD_DC_PREDICTOR_128(64, 32, q) +HIGHBD_DC_PREDICTOR_128(64, 64, q) + +#undef HIGHBD_DC_PREDICTOR_128 + +// ----------------------------------------------------------------------------- +// DC_LEFT + +static INLINE uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) { + const uint16x4_t a = vld1_u16(left); // up to 12 bits + const uint16x4_t b = vpadd_u16(a, a); // up to 13 bits + return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0)); +} + +static INLINE uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) { + return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left)); +} + +static INLINE uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) { + return horizontal_add_and_broadcast_long_u16x8( + highbd_dc_load_partial_sum_16(left)); +} + +static INLINE uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) { + return horizontal_add_and_broadcast_long_u16x8( + highbd_dc_load_partial_sum_32(left)); +} + +static INLINE uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) { + return horizontal_add_and_broadcast_long_u16x8( + highbd_dc_load_partial_sum_64(left)); +} + +#define DC_PREDICTOR_LEFT(w, h, shift, q) \ + void aom_highbd_dc_left_predictor_##w##x##h##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)above; \ + (void)bd; \ + const uint32x4_t sum = highbd_dc_load_sum_##h(left); \ + const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \ + highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \ + } + +DC_PREDICTOR_LEFT(4, 4, 2, ) +DC_PREDICTOR_LEFT(4, 8, 3, ) +DC_PREDICTOR_LEFT(4, 16, 4, ) +DC_PREDICTOR_LEFT(8, 4, 2, q) +DC_PREDICTOR_LEFT(8, 8, 3, q) +DC_PREDICTOR_LEFT(8, 16, 4, q) +DC_PREDICTOR_LEFT(8, 32, 5, q) +DC_PREDICTOR_LEFT(16, 4, 2, q) +DC_PREDICTOR_LEFT(16, 8, 3, q) +DC_PREDICTOR_LEFT(16, 16, 4, q) +DC_PREDICTOR_LEFT(16, 32, 5, q) +DC_PREDICTOR_LEFT(16, 64, 6, q) +DC_PREDICTOR_LEFT(32, 8, 3, q) +DC_PREDICTOR_LEFT(32, 16, 4, q) +DC_PREDICTOR_LEFT(32, 32, 5, q) +DC_PREDICTOR_LEFT(32, 64, 6, q) +DC_PREDICTOR_LEFT(64, 16, 4, q) +DC_PREDICTOR_LEFT(64, 32, 5, q) +DC_PREDICTOR_LEFT(64, 64, 6, q) + +#undef DC_PREDICTOR_LEFT + +// ----------------------------------------------------------------------------- +// DC_TOP + +#define DC_PREDICTOR_TOP(w, h, shift, q) \ + void aom_highbd_dc_top_predictor_##w##x##h##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + (void)left; \ + const uint32x4_t sum = highbd_dc_load_sum_##w(above); \ + const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \ + highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \ + } + +DC_PREDICTOR_TOP(4, 4, 2, ) +DC_PREDICTOR_TOP(4, 8, 2, ) +DC_PREDICTOR_TOP(4, 16, 2, ) +DC_PREDICTOR_TOP(8, 4, 3, q) +DC_PREDICTOR_TOP(8, 8, 3, q) +DC_PREDICTOR_TOP(8, 16, 3, q) +DC_PREDICTOR_TOP(8, 32, 3, q) +DC_PREDICTOR_TOP(16, 4, 4, q) +DC_PREDICTOR_TOP(16, 8, 4, q) +DC_PREDICTOR_TOP(16, 16, 4, q) +DC_PREDICTOR_TOP(16, 32, 4, q) +DC_PREDICTOR_TOP(16, 64, 4, q) +DC_PREDICTOR_TOP(32, 8, 5, q) +DC_PREDICTOR_TOP(32, 16, 5, q) +DC_PREDICTOR_TOP(32, 32, 5, q) +DC_PREDICTOR_TOP(32, 64, 5, q) +DC_PREDICTOR_TOP(64, 16, 6, q) +DC_PREDICTOR_TOP(64, 32, 6, q) +DC_PREDICTOR_TOP(64, 64, 6, q) + +#undef DC_PREDICTOR_TOP + +// ----------------------------------------------------------------------------- +// V_PRED + +#define HIGHBD_V_NXM(W, H) \ + void 
aom_highbd_v_predictor_##W##x##H##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)left; \ + (void)bd; \ + vertical##W##xh_neon(dst, stride, above, H); \ + } + +static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) { + uint16x8x2_t x; + // Clang/gcc uses ldp here. + x.val[0] = vld1q_u16(ptr); + x.val[1] = vld1q_u16(ptr + 8); + return x; +} + +static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) { + vst1q_u16(ptr, x.val[0]); + vst1q_u16(ptr + 8, x.val[1]); +} + +static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const above, int height) { + const uint16x4_t row = vld1_u16(above); + int y = height; + do { + vst1_u16(dst, row); + vst1_u16(dst + stride, row); + dst += stride << 1; + y -= 2; + } while (y != 0); +} + +static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const above, int height) { + const uint16x8_t row = vld1q_u16(above); + int y = height; + do { + vst1q_u16(dst, row); + vst1q_u16(dst + stride, row); + dst += stride << 1; + y -= 2; + } while (y != 0); +} + +static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const above, int height) { + const uint16x8x2_t row = load_uint16x8x2(above); + int y = height; + do { + store_uint16x8x2(dst, row); + store_uint16x8x2(dst + stride, row); + dst += stride << 1; + y -= 2; + } while (y != 0); +} + +static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) { + uint16x8x4_t x; + // Clang/gcc uses ldp here. + x.val[0] = vld1q_u16(ptr); + x.val[1] = vld1q_u16(ptr + 8); + x.val[2] = vld1q_u16(ptr + 16); + x.val[3] = vld1q_u16(ptr + 24); + return x; +} + +static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) { + vst1q_u16(ptr, x.val[0]); + vst1q_u16(ptr + 8, x.val[1]); + vst1q_u16(ptr + 16, x.val[2]); + vst1q_u16(ptr + 24, x.val[3]); +} + +static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const above, int height) { + const uint16x8x4_t row = load_uint16x8x4(above); + int y = height; + do { + store_uint16x8x4(dst, row); + store_uint16x8x4(dst + stride, row); + dst += stride << 1; + y -= 2; + } while (y != 0); +} + +static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const above, int height) { + uint16_t *dst32 = dst + 32; + const uint16x8x4_t row = load_uint16x8x4(above); + const uint16x8x4_t row32 = load_uint16x8x4(above + 32); + int y = height; + do { + store_uint16x8x4(dst, row); + store_uint16x8x4(dst32, row32); + store_uint16x8x4(dst + stride, row); + store_uint16x8x4(dst32 + stride, row32); + dst += stride << 1; + dst32 += stride << 1; + y -= 2; + } while (y != 0); +} + +HIGHBD_V_NXM(4, 4) +HIGHBD_V_NXM(4, 8) +HIGHBD_V_NXM(4, 16) + +HIGHBD_V_NXM(8, 4) +HIGHBD_V_NXM(8, 8) +HIGHBD_V_NXM(8, 16) +HIGHBD_V_NXM(8, 32) + +HIGHBD_V_NXM(16, 4) +HIGHBD_V_NXM(16, 8) +HIGHBD_V_NXM(16, 16) +HIGHBD_V_NXM(16, 32) +HIGHBD_V_NXM(16, 64) + +HIGHBD_V_NXM(32, 8) +HIGHBD_V_NXM(32, 16) +HIGHBD_V_NXM(32, 32) +HIGHBD_V_NXM(32, 64) + +HIGHBD_V_NXM(64, 16) +HIGHBD_V_NXM(64, 32) +HIGHBD_V_NXM(64, 64) + +// ----------------------------------------------------------------------------- +// H_PRED + +static INLINE void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride, + uint16x4_t left) { + vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0)); + vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1)); + vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2)); + vst1_u16(dst + 3 * stride, 
vdup_lane_u16(left, 3)); +} + +static INLINE void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride, + uint16x4_t left) { + vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0)); + vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1)); + vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2)); + vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3)); +} + +static INLINE void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) { + vst1q_u16(dst + 0, left); + vst1q_u16(dst + 8, left); +} + +static INLINE void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride, + uint16x4_t left) { + highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0)); + highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1)); + highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2)); + highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3)); +} + +static INLINE void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) { + vst1q_u16(dst + 0, left); + vst1q_u16(dst + 8, left); + vst1q_u16(dst + 16, left); + vst1q_u16(dst + 24, left); +} + +static INLINE void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride, + uint16x4_t left) { + highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0)); + highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1)); + highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2)); + highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3)); +} + +static INLINE void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) { + vst1q_u16(dst + 0, left); + vst1q_u16(dst + 8, left); + vst1q_u16(dst + 16, left); + vst1q_u16(dst + 24, left); + vst1q_u16(dst + 32, left); + vst1q_u16(dst + 40, left); + vst1q_u16(dst + 48, left); + vst1q_u16(dst + 56, left); +} + +static INLINE void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride, + uint16x4_t left) { + highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0)); + highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1)); + highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2)); + highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3)); +} + +void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + highbd_h_store_4x4(dst, stride, vld1_u16(left)); +} + +void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + uint16x8_t l = vld1q_u16(left); + highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l)); + highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l)); +} + +void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + highbd_h_store_8x4(dst, stride, vld1_u16(left)); +} + +void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + uint16x8_t l = vld1q_u16(left); + highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l)); + highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l)); +} + +void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + highbd_h_store_16x4(dst, stride, vld1_u16(left)); +} + +void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + uint16x8_t l = 
vld1q_u16(left); + highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l)); + highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l)); +} + +void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + uint16x8_t l = vld1q_u16(left); + highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l)); + highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l)); +} + +// For cases where height >= 16 we use pairs of loads to get LDP instructions. +#define HIGHBD_H_WXH_LARGE(w, h) \ + void aom_highbd_h_predictor_##w##x##h##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)above; \ + (void)bd; \ + for (int i = 0; i < (h) / 16; ++i) { \ + uint16x8_t l0 = vld1q_u16(left + 0); \ + uint16x8_t l1 = vld1q_u16(left + 8); \ + highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0)); \ + highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0)); \ + highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1)); \ + highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \ + left += 16; \ + dst += 16 * stride; \ + } \ + } + +HIGHBD_H_WXH_LARGE(4, 16) +HIGHBD_H_WXH_LARGE(8, 16) +HIGHBD_H_WXH_LARGE(8, 32) +HIGHBD_H_WXH_LARGE(16, 16) +HIGHBD_H_WXH_LARGE(16, 32) +HIGHBD_H_WXH_LARGE(16, 64) +HIGHBD_H_WXH_LARGE(32, 16) +HIGHBD_H_WXH_LARGE(32, 32) +HIGHBD_H_WXH_LARGE(32, 64) +HIGHBD_H_WXH_LARGE(64, 16) +HIGHBD_H_WXH_LARGE(64, 32) +HIGHBD_H_WXH_LARGE(64, 64) + +#undef HIGHBD_H_WXH_LARGE + +// ----------------------------------------------------------------------------- +// PAETH + +static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride, + const uint16_t *const top_row, + const uint16_t *const left_column, + int width, int height) { + const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); + const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); + uint16x8_t top; + if (width == 4) { + top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0)); + } else { // width == 8 + top = vld1q_u16(top_row); + } + + for (int y = 0; y < height; ++y) { + const uint16x8_t left = vdupq_n_u16(left_column[y]); + + const uint16x8_t left_dist = vabdq_u16(top, top_left); + const uint16x8_t top_dist = vabdq_u16(left, top_left); + const uint16x8_t top_left_dist = + vabdq_u16(vaddq_u16(top, left), top_left_x2); + + const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); + const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); + const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); + + // if (left_dist <= top_dist && left_dist <= top_left_dist) + const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left); + // dest[x] = left_column[y]; + // Fill all the unused spaces with 'top'. They will be overwritten when + // the positions for top_left are known. + uint16x8_t result = vbslq_u16(left_mask, left, top); + // else if (top_dist <= top_left_dist) + // dest[x] = top_row[x]; + // Add these values to the mask. They were already set. 
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left); + // else + // dest[x] = top_left; + result = vbslq_u16(left_or_top_mask, result, top_left); + + if (width == 4) { + vst1_u16(dest, vget_low_u16(result)); + } else { // width == 8 + vst1q_u16(dest, result); + } + dest += stride; + } +} + +#define HIGHBD_PAETH_NXM(W, H) \ + void aom_highbd_paeth_predictor_##W##x##H##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \ + } + +HIGHBD_PAETH_NXM(4, 4) +HIGHBD_PAETH_NXM(4, 8) +HIGHBD_PAETH_NXM(4, 16) +HIGHBD_PAETH_NXM(8, 4) +HIGHBD_PAETH_NXM(8, 8) +HIGHBD_PAETH_NXM(8, 16) +HIGHBD_PAETH_NXM(8, 32) + +// Select the closest values and collect them. +static INLINE uint16x8_t select_paeth(const uint16x8_t top, + const uint16x8_t left, + const uint16x8_t top_left, + const uint16x8_t left_le_top, + const uint16x8_t left_le_top_left, + const uint16x8_t top_le_top_left) { + // if (left_dist <= top_dist && left_dist <= top_left_dist) + const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left); + // dest[x] = left_column[y]; + // Fill all the unused spaces with 'top'. They will be overwritten when + // the positions for top_left are known. + const uint16x8_t result = vbslq_u16(left_mask, left, top); + // else if (top_dist <= top_left_dist) + // dest[x] = top_row[x]; + // Add these values to the mask. They were already set. + const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left); + // else + // dest[x] = top_left; + return vbslq_u16(left_or_top_mask, result, top_left); +} + +#define PAETH_PREDICTOR(num) \ + do { \ + const uint16x8_t left_dist = vabdq_u16(top[num], top_left); \ + const uint16x8_t top_left_dist = \ + vabdq_u16(vaddq_u16(top[num], left), top_left_x2); \ + const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); \ + const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); \ + const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); \ + const uint16x8_t result = \ + select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \ + top_le_top_left); \ + vst1q_u16(dest + (num * 8), result); \ + } while (0) + +#define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8)) + +static INLINE void highbd_paeth16_plus_x_h_neon( + uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row, + const uint16_t *const left_column, int width, int height) { + const uint16x8_t top_left = vdupq_n_u16(top_row[-1]); + const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); + uint16x8_t top[8]; + top[0] = LOAD_TOP_ROW(0); + top[1] = LOAD_TOP_ROW(1); + if (width > 16) { + top[2] = LOAD_TOP_ROW(2); + top[3] = LOAD_TOP_ROW(3); + if (width == 64) { + top[4] = LOAD_TOP_ROW(4); + top[5] = LOAD_TOP_ROW(5); + top[6] = LOAD_TOP_ROW(6); + top[7] = LOAD_TOP_ROW(7); + } + } + + for (int y = 0; y < height; ++y) { + const uint16x8_t left = vdupq_n_u16(left_column[y]); + const uint16x8_t top_dist = vabdq_u16(left, top_left); + PAETH_PREDICTOR(0); + PAETH_PREDICTOR(1); + if (width > 16) { + PAETH_PREDICTOR(2); + PAETH_PREDICTOR(3); + if (width == 64) { + PAETH_PREDICTOR(4); + PAETH_PREDICTOR(5); + PAETH_PREDICTOR(6); + PAETH_PREDICTOR(7); + } + } + dest += stride; + } +} + +#define HIGHBD_PAETH_NXM_WIDE(W, H) \ + void aom_highbd_paeth_predictor_##W##x##H##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + 
highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \ + } + +HIGHBD_PAETH_NXM_WIDE(16, 4) +HIGHBD_PAETH_NXM_WIDE(16, 8) +HIGHBD_PAETH_NXM_WIDE(16, 16) +HIGHBD_PAETH_NXM_WIDE(16, 32) +HIGHBD_PAETH_NXM_WIDE(16, 64) +HIGHBD_PAETH_NXM_WIDE(32, 8) +HIGHBD_PAETH_NXM_WIDE(32, 16) +HIGHBD_PAETH_NXM_WIDE(32, 32) +HIGHBD_PAETH_NXM_WIDE(32, 64) +HIGHBD_PAETH_NXM_WIDE(64, 16) +HIGHBD_PAETH_NXM_WIDE(64, 32) +HIGHBD_PAETH_NXM_WIDE(64, 64) + +// ----------------------------------------------------------------------------- +// SMOOTH + +// 256 - v = vneg_s8(v) +static INLINE uint16x4_t negate_s8(const uint16x4_t v) { + return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v))); +} + +static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const top_row, + const uint16_t *const left_column, + const int height) { + const uint16_t top_right = top_row[3]; + const uint16_t bottom_left = left_column[height - 1]; + const uint16_t *const weights_y = smooth_weights_u16 + height - 4; + + const uint16x4_t top_v = vld1_u16(top_row); + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16); + const uint16x4_t scaled_weights_x = negate_s8(weights_x_v); + const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); + + for (int y = 0; y < height; ++y) { + // Each variable in the running summation is named for the last item to be + // accumulated. + const uint32x4_t weighted_top = + vmlal_n_u16(weighted_tr, top_v, weights_y[y]); + const uint32x4_t weighted_left = + vmlal_n_u16(weighted_top, weights_x_v, left_column[y]); + const uint32x4_t weighted_bl = + vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]); + + const uint16x4_t pred = + vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1); + vst1_u16(dst, pred); + dst += stride; + } +} + +// Common code between 8xH and [16|32|64]xH. +static INLINE void highbd_calculate_pred8( + uint16_t *dst, const uint32x4_t weighted_corners_low, + const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals, + const uint16x4x2_t weights_x, const uint16_t left_y, + const uint16_t weight_y) { + // Each variable in the running summation is named for the last item to be + // accumulated. 
+ const uint32x4_t weighted_top_low = + vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y); + const uint32x4_t weighted_edges_low = + vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y); + + const uint16x4_t pred_low = + vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1); + vst1_u16(dst, pred_low); + + const uint32x4_t weighted_top_high = + vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y); + const uint32x4_t weighted_edges_high = + vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y); + + const uint16x4_t pred_high = + vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1); + vst1_u16(dst + 4, pred_high); +} + +static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const top_row, + const uint16_t *const left_column, + const int height) { + const uint16_t top_right = top_row[7]; + const uint16_t bottom_left = left_column[height - 1]; + const uint16_t *const weights_y = smooth_weights_u16 + height - 4; + + const uint16x4x2_t top_vals = { { vld1_u16(top_row), + vld1_u16(top_row + 4) } }; + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4), + vld1_u16(smooth_weights_u16 + 8) } }; + const uint32x4_t weighted_tr_low = + vmull_n_u16(negate_s8(weights_x.val[0]), top_right); + const uint32x4_t weighted_tr_high = + vmull_n_u16(negate_s8(weights_x.val[1]), top_right); + + for (int y = 0; y < height; ++y) { + const uint32x4_t weighted_bl = + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); + const uint32x4_t weighted_corners_low = + vaddq_u32(weighted_bl, weighted_tr_low); + const uint32x4_t weighted_corners_high = + vaddq_u32(weighted_bl, weighted_tr_high); + highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high, + top_vals, weights_x, left_column[y], weights_y[y]); + dst += stride; + } +} + +#define HIGHBD_SMOOTH_NXM(W, H) \ + void aom_highbd_smooth_predictor_##W##x##H##_neon( \ + uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +HIGHBD_SMOOTH_NXM(4, 4) +HIGHBD_SMOOTH_NXM(4, 8) +HIGHBD_SMOOTH_NXM(8, 4) +HIGHBD_SMOOTH_NXM(8, 8) +HIGHBD_SMOOTH_NXM(4, 16) +HIGHBD_SMOOTH_NXM(8, 16) +HIGHBD_SMOOTH_NXM(8, 32) + +#undef HIGHBD_SMOOTH_NXM + +// For width 16 and above. +#define HIGHBD_SMOOTH_PREDICTOR(W) \ + static void highbd_smooth_##W##xh_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \ + const uint16_t *const left_column, const int height) { \ + const uint16_t top_right = top_row[(W)-1]; \ + const uint16_t bottom_left = left_column[height - 1]; \ + const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \ + \ + /* Precompute weighted values that don't vary with |y|. 
*/ \ + uint32x4_t weighted_tr_low[(W) >> 3]; \ + uint32x4_t weighted_tr_high[(W) >> 3]; \ + for (int i = 0; i < (W) >> 3; ++i) { \ + const int x = i << 3; \ + const uint16x4_t weights_x_low = \ + vld1_u16(smooth_weights_u16 + (W)-4 + x); \ + weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right); \ + const uint16x4_t weights_x_high = \ + vld1_u16(smooth_weights_u16 + (W) + x); \ + weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \ + } \ + \ + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \ + for (int y = 0; y < height; ++y) { \ + const uint32x4_t weighted_bl = \ + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \ + uint16_t *dst_x = dst; \ + for (int i = 0; i < (W) >> 3; ++i) { \ + const int x = i << 3; \ + const uint16x4x2_t top_vals = { { vld1_u16(top_row + x), \ + vld1_u16(top_row + x + 4) } }; \ + const uint32x4_t weighted_corners_low = \ + vaddq_u32(weighted_bl, weighted_tr_low[i]); \ + const uint32x4_t weighted_corners_high = \ + vaddq_u32(weighted_bl, weighted_tr_high[i]); \ + /* Accumulate weighted edge values and store. */ \ + const uint16x4x2_t weights_x = { \ + { vld1_u16(smooth_weights_u16 + (W)-4 + x), \ + vld1_u16(smooth_weights_u16 + (W) + x) } \ + }; \ + highbd_calculate_pred8(dst_x, weighted_corners_low, \ + weighted_corners_high, top_vals, weights_x, \ + left_column[y], weights_y[y]); \ + dst_x += 8; \ + } \ + dst += stride; \ + } \ + } + +HIGHBD_SMOOTH_PREDICTOR(16) +HIGHBD_SMOOTH_PREDICTOR(32) +HIGHBD_SMOOTH_PREDICTOR(64) + +#undef HIGHBD_SMOOTH_PREDICTOR + +#define HIGHBD_SMOOTH_NXM_WIDE(W, H) \ + void aom_highbd_smooth_predictor_##W##x##H##_neon( \ + uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +HIGHBD_SMOOTH_NXM_WIDE(16, 4) +HIGHBD_SMOOTH_NXM_WIDE(16, 8) +HIGHBD_SMOOTH_NXM_WIDE(16, 16) +HIGHBD_SMOOTH_NXM_WIDE(16, 32) +HIGHBD_SMOOTH_NXM_WIDE(16, 64) +HIGHBD_SMOOTH_NXM_WIDE(32, 8) +HIGHBD_SMOOTH_NXM_WIDE(32, 16) +HIGHBD_SMOOTH_NXM_WIDE(32, 32) +HIGHBD_SMOOTH_NXM_WIDE(32, 64) +HIGHBD_SMOOTH_NXM_WIDE(64, 16) +HIGHBD_SMOOTH_NXM_WIDE(64, 32) +HIGHBD_SMOOTH_NXM_WIDE(64, 64) + +#undef HIGHBD_SMOOTH_NXM_WIDE + +static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const top_row, + const uint16_t *const left_column, + const int height) { + const uint16_t bottom_left = left_column[height - 1]; + const uint16_t *const weights_y = smooth_weights_u16 + height - 4; + + const uint16x4_t top_v = vld1_u16(top_row); + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + + for (int y = 0; y < height; ++y) { + const uint32x4_t weighted_bl = + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); + const uint32x4_t weighted_top = + vmlal_n_u16(weighted_bl, top_v, weights_y[y]); + vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE)); + + dst += stride; + } +} + +static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride, + const uint16_t *const top_row, + const uint16_t *const left_column, + const int height) { + const uint16_t bottom_left = left_column[height - 1]; + const uint16_t *const weights_y = smooth_weights_u16 + height - 4; + + const uint16x4_t top_low = vld1_u16(top_row); + const uint16x4_t top_high = vld1_u16(top_row + 4); + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); + + for (int y = 0; y < height; ++y) { + const uint32x4_t weighted_bl = + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); + + const uint32x4_t 
weighted_top_low = + vmlal_n_u16(weighted_bl, top_low, weights_y[y]); + vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); + + const uint32x4_t weighted_top_high = + vmlal_n_u16(weighted_bl, top_high, weights_y[y]); + vst1_u16(dst + 4, + vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); + dst += stride; + } +} + +#define HIGHBD_SMOOTH_V_NXM(W, H) \ + void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \ + uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +HIGHBD_SMOOTH_V_NXM(4, 4) +HIGHBD_SMOOTH_V_NXM(4, 8) +HIGHBD_SMOOTH_V_NXM(4, 16) +HIGHBD_SMOOTH_V_NXM(8, 4) +HIGHBD_SMOOTH_V_NXM(8, 8) +HIGHBD_SMOOTH_V_NXM(8, 16) +HIGHBD_SMOOTH_V_NXM(8, 32) + +#undef HIGHBD_SMOOTH_V_NXM + +// For width 16 and above. +#define HIGHBD_SMOOTH_V_PREDICTOR(W) \ + static void highbd_smooth_v_##W##xh_neon( \ + uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, \ + const uint16_t *const left_column, const int height) { \ + const uint16_t bottom_left = left_column[height - 1]; \ + const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \ + \ + uint16x4x2_t top_vals[(W) >> 3]; \ + for (int i = 0; i < (W) >> 3; ++i) { \ + const int x = i << 3; \ + top_vals[i].val[0] = vld1_u16(top_row + x); \ + top_vals[i].val[1] = vld1_u16(top_row + x + 4); \ + } \ + \ + const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \ + for (int y = 0; y < height; ++y) { \ + const uint32x4_t weighted_bl = \ + vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \ + \ + uint16_t *dst_x = dst; \ + for (int i = 0; i < (W) >> 3; ++i) { \ + const uint32x4_t weighted_top_low = \ + vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]); \ + vst1_u16(dst_x, \ + vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); \ + \ + const uint32x4_t weighted_top_high = \ + vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]); \ + vst1_u16(dst_x + 4, \ + vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \ + dst_x += 8; \ + } \ + dst += stride; \ + } \ + } + +HIGHBD_SMOOTH_V_PREDICTOR(16) +HIGHBD_SMOOTH_V_PREDICTOR(32) +HIGHBD_SMOOTH_V_PREDICTOR(64) + +#undef HIGHBD_SMOOTH_V_PREDICTOR + +#define HIGHBD_SMOOTH_V_NXM_WIDE(W, H) \ + void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \ + uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +HIGHBD_SMOOTH_V_NXM_WIDE(16, 4) +HIGHBD_SMOOTH_V_NXM_WIDE(16, 8) +HIGHBD_SMOOTH_V_NXM_WIDE(16, 16) +HIGHBD_SMOOTH_V_NXM_WIDE(16, 32) +HIGHBD_SMOOTH_V_NXM_WIDE(16, 64) +HIGHBD_SMOOTH_V_NXM_WIDE(32, 8) +HIGHBD_SMOOTH_V_NXM_WIDE(32, 16) +HIGHBD_SMOOTH_V_NXM_WIDE(32, 32) +HIGHBD_SMOOTH_V_NXM_WIDE(32, 64) +HIGHBD_SMOOTH_V_NXM_WIDE(64, 16) +HIGHBD_SMOOTH_V_NXM_WIDE(64, 32) +HIGHBD_SMOOTH_V_NXM_WIDE(64, 64) + +#undef HIGHBD_SMOOTH_V_NXM_WIDE + +static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const top_row, + const uint16_t *const left_column, + const int height) { + const uint16_t top_right = top_row[3]; + + const uint16x4_t weights_x = vld1_u16(smooth_weights_u16); + const uint16x4_t scaled_weights_x = negate_s8(weights_x); + + const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right); + for (int y = 0; y < height; ++y) { + const uint32x4_t weighted_left = + vmlal_n_u16(weighted_tr, weights_x, left_column[y]); + vst1_u16(dst, 
vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE)); + dst += stride; + } +} + +static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *const top_row, + const uint16_t *const left_column, + const int height) { + const uint16_t top_right = top_row[7]; + + const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4), + vld1_u16(smooth_weights_u16 + 8) } }; + + const uint32x4_t weighted_tr_low = + vmull_n_u16(negate_s8(weights_x.val[0]), top_right); + const uint32x4_t weighted_tr_high = + vmull_n_u16(negate_s8(weights_x.val[1]), top_right); + + for (int y = 0; y < height; ++y) { + const uint16_t left_y = left_column[y]; + const uint32x4_t weighted_left_low = + vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y); + vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); + + const uint32x4_t weighted_left_high = + vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y); + vst1_u16(dst + 4, + vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); + dst += stride; + } +} + +#define HIGHBD_SMOOTH_H_NXM(W, H) \ + void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \ + uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +HIGHBD_SMOOTH_H_NXM(4, 4) +HIGHBD_SMOOTH_H_NXM(4, 8) +HIGHBD_SMOOTH_H_NXM(4, 16) +HIGHBD_SMOOTH_H_NXM(8, 4) +HIGHBD_SMOOTH_H_NXM(8, 8) +HIGHBD_SMOOTH_H_NXM(8, 16) +HIGHBD_SMOOTH_H_NXM(8, 32) + +#undef HIGHBD_SMOOTH_H_NXM + +// For width 16 and above. +#define HIGHBD_SMOOTH_H_PREDICTOR(W) \ + void highbd_smooth_h_##W##xh_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \ + const uint16_t *const left_column, const int height) { \ + const uint16_t top_right = top_row[(W)-1]; \ + \ + uint16x4_t weights_x_low[(W) >> 3]; \ + uint16x4_t weights_x_high[(W) >> 3]; \ + uint32x4_t weighted_tr_low[(W) >> 3]; \ + uint32x4_t weighted_tr_high[(W) >> 3]; \ + for (int i = 0; i < (W) >> 3; ++i) { \ + const int x = i << 3; \ + weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x); \ + weighted_tr_low[i] = \ + vmull_n_u16(negate_s8(weights_x_low[i]), top_right); \ + weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x); \ + weighted_tr_high[i] = \ + vmull_n_u16(negate_s8(weights_x_high[i]), top_right); \ + } \ + \ + for (int y = 0; y < height; ++y) { \ + uint16_t *dst_x = dst; \ + const uint16_t left_y = left_column[y]; \ + for (int i = 0; i < (W) >> 3; ++i) { \ + const uint32x4_t weighted_left_low = \ + vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y); \ + vst1_u16(dst_x, \ + vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); \ + \ + const uint32x4_t weighted_left_high = \ + vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y); \ + vst1_u16(dst_x + 4, \ + vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \ + dst_x += 8; \ + } \ + dst += stride; \ + } \ + } + +HIGHBD_SMOOTH_H_PREDICTOR(16) +HIGHBD_SMOOTH_H_PREDICTOR(32) +HIGHBD_SMOOTH_H_PREDICTOR(64) + +#undef HIGHBD_SMOOTH_H_PREDICTOR + +#define HIGHBD_SMOOTH_H_NXM_WIDE(W, H) \ + void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \ + uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +HIGHBD_SMOOTH_H_NXM_WIDE(16, 4) +HIGHBD_SMOOTH_H_NXM_WIDE(16, 8) +HIGHBD_SMOOTH_H_NXM_WIDE(16, 16) +HIGHBD_SMOOTH_H_NXM_WIDE(16, 32) +HIGHBD_SMOOTH_H_NXM_WIDE(16, 64) 
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 8) +HIGHBD_SMOOTH_H_NXM_WIDE(32, 16) +HIGHBD_SMOOTH_H_NXM_WIDE(32, 32) +HIGHBD_SMOOTH_H_NXM_WIDE(32, 64) +HIGHBD_SMOOTH_H_NXM_WIDE(64, 16) +HIGHBD_SMOOTH_H_NXM_WIDE(64, 32) +HIGHBD_SMOOTH_H_NXM_WIDE(64, 64) + +#undef HIGHBD_SMOOTH_H_NXM_WIDE + +// ----------------------------------------------------------------------------- +// Z1 + +static int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; +static int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 }; + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0, + uint16x4_t a1, + int shift) { + // The C implementation of the z1 predictor uses (32 - shift) and a right + // shift by 5, however we instead double shift to avoid an unnecessary right + // shift by 1. + uint32x4_t res = vmull_n_u16(a1, shift); + res = vmlal_n_u16(res, a0, 64 - shift); + return vrshrn_n_u32(res, 6); +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0, + uint16x8_t a1, + int shift) { + return vcombine_u16( + highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift), + highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift)); +} + +static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, + const uint16_t *above, + int dx) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dx > 0); + + const int max_base_x = (bw + bh) - 1; + const int above_max = above[max_base_x]; + + const int16x8_t iota1x8 = vld1q_s16(iota1_s16); + const int16x4_t iota1x4 = vget_low_s16(iota1x8); + + int x = dx; + int r = 0; + do { + const int base = x >> 6; + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + aom_memset16(dst, above_max, bw); + dst += stride; + } + return; + } + + // The C implementation of the z1 predictor when not upsampling uses: + // ((x & 0x3f) >> 1) + // The right shift is unnecessary here since we instead shift by +1 later, + // so adjust the mask to 0x3e to ensure we don't consider the extra bit. 
+ const int shift = x & 0x3e; + + if (bw == 4) { + const uint16x4_t a0 = vld1_u16(&above[base]); + const uint16x4_t a1 = vld1_u16(&above[base + 1]); + const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift); + const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4); + const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max)); + vst1_u16(dst, res); + } else { + int c = 0; + do { + const uint16x8_t a0 = vld1q_u16(&above[base + c]); + const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]); + const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift); + const uint16x8_t cmp = + vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8); + const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max)); + vst1q_u16(dst + c, res); + c += 8; + } while (c < bw); + } + + dst += stride; + x += dx; + } while (++r < bh); +} + +static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, + const uint16_t *above, + int dx) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << 1; + const int above_max = above[max_base_x]; + + const int16x8_t iota2x8 = vld1q_s16(iota2_s16); + const int16x4_t iota2x4 = vget_low_s16(iota2x8); + + int x = dx; + int r = 0; + do { + const int base = x >> 5; + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + aom_memset16(dst, above_max, bw); + dst += stride; + } + return; + } + + // The C implementation of the z1 predictor when upsampling uses: + // (((x << 1) & 0x3f) >> 1) + // The right shift is unnecessary here since we instead shift by +1 later, + // so adjust the mask to 0x3e to ensure we don't consider the extra bit. + const int shift = (x << 1) & 0x3e; + + if (bw == 4) { + const uint16x4x2_t a01 = vld2_u16(&above[base]); + const uint16x4_t val = + highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift); + const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4); + const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max)); + vst1_u16(dst, res); + } else { + int c = 0; + do { + const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]); + const uint16x8_t val = + highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift); + const uint16x8_t cmp = + vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8); + const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max)); + vst1q_u16(dst + c, res); + c += 8; + } while (c < bw); + } + + dst += stride; + x += dx; + } while (++r < bh); +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + (void)left; + (void)dy; + (void)bd; + assert(dy == 1); + + if (upsample_above) { + highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx); + } else { + highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx); + } +} + +// ----------------------------------------------------------------------------- +// Z2 + +#if AOM_ARCH_AARCH64 +// Incrementally shift more elements from `above` into the result, merging with +// existing `left` elements. 
+// X0, X1, X2, X3 +// Y0, X0, X1, X2 +// Y0, Y1, X0, X1 +// Y0, Y1, Y2, X0 +// Y0, Y1, Y2, Y3 +// clang-format off +static const uint8_t z2_merge_shuffles_u16x4[5][8] = { + { 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 8, 9, 10, 11, 12, 13 }, + { 0, 1, 2, 3, 8, 9, 10, 11 }, + { 0, 1, 2, 3, 4, 5, 8, 9 }, + { 0, 1, 2, 3, 4, 5, 6, 7 }, +}; +// clang-format on + +// Incrementally shift more elements from `above` into the result, merging with +// existing `left` elements. +// X0, X1, X2, X3, X4, X5, X6, X7 +// Y0, X0, X1, X2, X3, X4, X5, X6 +// Y0, Y1, X0, X1, X2, X3, X4, X5 +// Y0, Y1, Y2, X0, X1, X2, X3, X4 +// Y0, Y1, Y2, Y3, X0, X1, X2, X3 +// Y0, Y1, Y2, Y3, Y4, X0, X1, X2 +// Y0, Y1, Y2, Y3, Y4, Y5, X0, X1 +// Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0 +// Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 +// clang-format off +static const uint8_t z2_merge_shuffles_u16x8[9][16] = { + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 0, 1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 }, + { 0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }, + { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, +}; +// clang-format on + +// clang-format off +static const uint16_t z2_y_iter_masks_u16x4[5][4] = { + { 0U, 0U, 0U, 0U }, + { 0xffffU, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU }, +}; +// clang-format on + +// clang-format off +static const uint16_t z2_y_iter_masks_u16x8[9][8] = { + { 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU }, +}; +// clang-format on + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8( + const uint16x8_t left_data, const int16x4_t indices, int base, int n) { + // Need to adjust indices to operate on 0-based indices rather than + // `base`-based indices and then adjust from uint16x4 indices to uint8x8 + // indices so we can use a tbl instruction (which only operates on bytes). + uint8x8_t left_indices = + vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base))); + left_indices = vtrn1_u8(left_indices, left_indices); + left_indices = vadd_u8(left_indices, left_indices); + left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100))); + const uint16x4_t ret = vreinterpret_u16_u8( + vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices)); + return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n])); +} + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16( + const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) { + // Need to adjust indices to operate on 0-based indices rather than + // `base`-based indices and then adjust from uint16x4 indices to uint8x8 + // indices so we can use a tbl instruction (which only operates on bytes). 
+ uint8x8_t left_indices = + vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base))); + left_indices = vtrn1_u8(left_indices, left_indices); + left_indices = vadd_u8(left_indices, left_indices); + left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100))); + uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]), + vreinterpretq_u8_u16(left_data.val[1]) } }; + const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices)); + return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n])); +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8( + const uint16x8_t left_data, const int16x8_t indices, int base, int n) { + // Need to adjust indices to operate on 0-based indices rather than + // `base`-based indices and then adjust from uint16x4 indices to uint8x8 + // indices so we can use a tbl instruction (which only operates on bytes). + uint8x16_t left_indices = + vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base))); + left_indices = vtrn1q_u8(left_indices, left_indices); + left_indices = vaddq_u8(left_indices, left_indices); + left_indices = + vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100))); + const uint16x8_t ret = vreinterpretq_u16_u8( + vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices)); + return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n])); +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16( + const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) { + // Need to adjust indices to operate on 0-based indices rather than + // `base`-based indices and then adjust from uint16x4 indices to uint8x8 + // indices so we can use a tbl instruction (which only operates on bytes). + uint8x16_t left_indices = + vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base))); + left_indices = vtrn1q_u8(left_indices, left_indices); + left_indices = vaddq_u8(left_indices, left_indices); + left_indices = + vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100))); + uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]), + vreinterpretq_u8_u16(left_data.val[1]) } }; + const uint16x8_t ret = + vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices)); + return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n])); +} +#endif // AOM_ARCH_AARCH64 + +static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4( + const uint16_t *left, const int16x4_t indices, int n) { + assert(n > 0); + assert(n <= 4); + // Load two elements at a time and then uzp them into separate vectors, to + // reduce the number of memory accesses. + uint32x2_t ret0_u32 = vdup_n_u32(0); + uint32x2_t ret1_u32 = vdup_n_u32(0); + + // Use a single vget_lane_u64 to minimize vector to general purpose register + // transfers and then mask off the bits we actually want. + const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0); + const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU); + const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU); + const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU); + const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU); + + // At time of writing both Clang and GCC produced better code with these + // nested if-statements compared to a switch statement with fallthrough. 
+ ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0); + if (n > 1) { + ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1); + if (n > 2) { + ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx2), ret1_u32, 0); + if (n > 3) { + ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx3), ret1_u32, 1); + } + } + } + return vuzp_u16(vreinterpret_u16_u32(ret0_u32), + vreinterpret_u16_u32(ret1_u32)); +} + +static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8( + const uint16_t *left, const int16x8_t indices, int n) { + assert(n > 0); + assert(n <= 8); + // Load two elements at a time and then uzp them into separate vectors, to + // reduce the number of memory accesses. + uint32x4_t ret0_u32 = vdupq_n_u32(0); + uint32x4_t ret1_u32 = vdupq_n_u32(0); + + // Use a pair of vget_lane_u64 to minimize vector to general purpose register + // transfers and then mask off the bits we actually want. + const uint64_t indices0123 = + vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0); + const uint64_t indices4567 = + vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1); + const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU); + const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU); + const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU); + const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU); + const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU); + const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU); + const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU); + const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU); + + // At time of writing both Clang and GCC produced better code with these + // nested if-statements compared to a switch statement with fallthrough. + ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0); + if (n > 1) { + ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1); + if (n > 2) { + ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx2), ret0_u32, 2); + if (n > 3) { + ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx3), ret0_u32, 3); + if (n > 4) { + ret1_u32 = + vld1q_lane_u32((const uint32_t *)(left + idx4), ret1_u32, 0); + if (n > 5) { + ret1_u32 = + vld1q_lane_u32((const uint32_t *)(left + idx5), ret1_u32, 1); + if (n > 6) { + ret1_u32 = + vld1q_lane_u32((const uint32_t *)(left + idx6), ret1_u32, 2); + if (n > 7) { + ret1_u32 = vld1q_lane_u32((const uint32_t *)(left + idx7), + ret1_u32, 3); + } + } + } + } + } + } + } + return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32), + vreinterpretq_u16_u32(ret1_u32)); +} + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4( + uint16x4_t out_x, uint16x4_t out_y, int base_shift) { + assert(base_shift >= 0); + assert(base_shift <= 4); + // On AArch64 we can permute the data from the `above` and `left` vectors + // into a single vector in a single load (of the permute vector) + tbl. 
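+ // For example, with base_shift == 2 the shuffle row { 0, 1, 2, 3, 8, 9, 10,
+ // 11 } keeps the first two 16-bit lanes of `out_y` (table bytes 0-7) and then
+ // the first two lanes of `out_x` (table bytes 8-15), producing
+ // { Y0, Y1, X0, X1 }.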
+#if AOM_ARCH_AARCH64 + const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y), + vreinterpret_u8_u16(out_x) } }; + return vreinterpret_u16_u8( + vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift]))); +#else + uint16x4_t out = out_y; + for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) { + out[c2] = out_x[x_idx]; + } + return out; +#endif +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8( + uint16x8_t out_x, uint16x8_t out_y, int base_shift) { + assert(base_shift >= 0); + assert(base_shift <= 8); + // On AArch64 we can permute the data from the `above` and `left` vectors + // into a single vector in a single load (of the permute vector) + tbl. +#if AOM_ARCH_AARCH64 + const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y), + vreinterpretq_u8_u16(out_x) } }; + return vreinterpretq_u16_u8( + vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift]))); +#else + uint16x8_t out = out_y; + for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) { + out[c2] = out_x[x_idx]; + } + return out; +#endif +} + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4( + uint16x4_t a0, uint16x4_t a1, int16x4_t shift) { + uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift)); + res = + vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift))); + return vrshrn_n_u32(res, 5); +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8( + uint16x8_t a0, uint16x8_t a1, int16x8_t shift) { + return vcombine_u16( + highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), + vget_low_s16(shift)), + highbd_dr_prediction_z2_apply_shift_x4( + vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift))); +} + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4( + const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1, + const uint16_t *left, int dx, int dy, int r, int c) { + const int16x4_t iota = vld1_s16(iota1_s16); + + const int x0 = (c << 6) - (r + 1) * dx; + const int y0 = (r << 6) - (c + 1) * dy; + + const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6)); + const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy)); + const int16x4_t shift_x0123 = + vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1); + const int16x4_t shift_y0123 = + vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1); + const int16x4_t base_y0123 = vshr_n_s16(y0123, 6); + + const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c; + + // Based on the value of `base_shift` there are three possible cases to + // compute the result: + // 1) base_shift <= 0: We can load and operate entirely on data from the + // `above` input vector. + // 2) base_shift < vl: We can load from `above[-1]` and shift + // `vl - base_shift` elements across to the end of the + // vector, then compute the remainder from `left`. + // 3) base_shift >= vl: We can load and operate entirely on data from the + // `left` input vector. + + if (base_shift <= 0) { + const int base_x = x0 >> 6; + const uint16x4_t a0 = vld1_u16(above + base_x); + const uint16x4_t a1 = vld1_u16(above + base_x + 1); + return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); + } else if (base_shift < 4) { + const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4( + left + 1, base_y0123, base_shift); + const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4( + l01.val[0], l01.val[1], shift_y0123); + + // No need to reload from above in the loop, just use pre-loaded constants. 
+ const uint16x4_t out16_x = + highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123); + + return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift); + } else { + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4); + return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1], + shift_y0123); + } +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8( + const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1, + const uint16_t *left, int dx, int dy, int r, int c) { + const int16x8_t iota = vld1q_s16(iota1_s16); + + const int x0 = (c << 6) - (r + 1) * dx; + const int y0 = (r << 6) - (c + 1) * dy; + + const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6)); + const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy)); + const int16x8_t shift_x01234567 = + vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1); + const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6); + + const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c; + + // Based on the value of `base_shift` there are three possible cases to + // compute the result: + // 1) base_shift <= 0: We can load and operate entirely on data from the + // `above` input vector. + // 2) base_shift < vl: We can load from `above[-1]` and shift + // `vl - base_shift` elements across to the end of the + // vector, then compute the remainder from `left`. + // 3) base_shift >= vl: We can load and operate entirely on data from the + // `left` input vector. + + if (base_shift <= 0) { + const int base_x = x0 >> 6; + const uint16x8_t a0 = vld1q_u16(above + base_x); + const uint16x8_t a1 = vld1q_u16(above + base_x + 1); + return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); + } else if (base_shift < 8) { + const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8( + left + 1, base_y01234567, base_shift); + const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8( + l01.val[0], l01.val[1], shift_y01234567); + + // No need to reload from above in the loop, just use pre-loaded constants. + const uint16x8_t out16_x = + highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567); + + return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift); + } else { + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8); + return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1], + shift_y01234567); + } +} + +// Left array is accessed from -1 through `bh - 1` inclusive. +// Above array is accessed from -1 through `bw - 1` inclusive. 
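+// Each instantiation of the macro below defines one block-size-specific
+// predictor; for example, HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8) expands to
+// highbd_dr_prediction_z2_16x8_neon(), which copies left[-1 .. bh - 1] into a
+// local array once and then emits 8 output pixels per
+// highbd_dr_prediction_z2_step_x8() call, row by row.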
+#define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh) \ + static void highbd_dr_prediction_z2_##bw##x##bh##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int upsample_above, int upsample_left, int dx, \ + int dy, int bd) { \ + (void)bd; \ + (void)upsample_above; \ + (void)upsample_left; \ + assert(!upsample_above); \ + assert(!upsample_left); \ + assert(bw % 4 == 0); \ + assert(bh % 4 == 0); \ + assert(dx > 0); \ + assert(dy > 0); \ + \ + uint16_t left_data[bh + 1]; \ + memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t)); \ + \ + uint16x8_t a0, a1; \ + if (bw == 4) { \ + a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0)); \ + a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0)); \ + } else { \ + a0 = vld1q_u16(above - 1); \ + a1 = vld1q_u16(above + 0); \ + } \ + \ + int r = 0; \ + do { \ + if (bw == 4) { \ + vst1_u16(dst, highbd_dr_prediction_z2_step_x4( \ + above, vget_low_u16(a0), vget_low_u16(a1), \ + left_data, dx, dy, r, 0)); \ + } else { \ + int c = 0; \ + do { \ + vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8( \ + above, a0, a1, left_data, dx, dy, r, c)); \ + c += 8; \ + } while (c < bw); \ + } \ + dst += stride; \ + } while (++r < bh); \ + } + +HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64) +HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8) +HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32) +HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64) +HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32) +HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64) + +#undef HIGHBD_DR_PREDICTOR_Z2_WXH + +typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd); + +static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + const int min_base_x = -(1 << (upsample_above + frac_bits_x)); + + // if `upsample_left` then we need -2 through 6 inclusive from `left`. + // else we only need -1 through 3 inclusive. 
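+ // The two preloads below hold the pair of interpolation taps: left_data0
+ // starts at left[-2] (or left[-1] without upsampling) and left_data1 one
+ // element later, so the tbl helpers can fetch both taps per row without
+ // further loads.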
+ +#if AOM_ARCH_AARCH64 + uint16x8_t left_data0, left_data1; + if (upsample_left) { + left_data0 = vld1q_u16(left - 2); + left_data1 = vld1q_u16(left - 1); + } else { + left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0)); + left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0)); + } +#endif + + const int16x4_t iota0123 = vld1_s16(iota1_s16); + const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1); + + for (int r = 0; r < 4; ++r) { + const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; + const int x0 = (r + 1) * dx; + const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0)); + const int base_x0 = (-x0) >> frac_bits_x; + if (base_shift <= 0) { + uint16x4_t a0, a1; + int16x4_t shift_x0123; + if (upsample_above) { + const uint16x4x2_t a01 = vld2_u16(above + base_x0); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); + } else { + a0 = vld1_u16(above + base_x0); + a1 = vld1_u16(above + base_x0 + 1); + shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1); + } + vst1_u16(dst, + highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123)); + } else if (base_shift < 4) { + // Calculate Y component from `left`. + const int y_iters = base_shift; + const int16x4_t y0123 = + vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); + const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); + const int16x4_t shift_y0123 = vshr_n_s16( + vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); + uint16x4_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123, + left_data_base, y_iters); + l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123, + left_data_base, y_iters); +#else + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + const uint16x4_t out_y = + highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123); + + // Calculate X component from `above`. + const int16x4_t shift_x0123 = vshr_n_s16( + vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)), + 1); + uint16x4_t a0, a1; + if (upsample_above) { + const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); + a0 = a01.val[0]; + a1 = a01.val[1]; + } else { + a0 = vld1_u16(above - 1); + a1 = vld1_u16(above + 0); + } + const uint16x4_t out_x = + highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); + + // Combine X and Y vectors. + const uint16x4_t out = + highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift); + vst1_u16(dst, out); + } else { + const int16x4_t y0123 = + vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); + const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); + const int16x4_t shift_y0123 = vshr_n_s16( + vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); + uint16x4_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? 
-2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123, + left_data_base, 4); + l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123, + left_data_base, 4); +#else + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + vst1_u16(dst, + highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123)); + } + dst += stride; + } +} + +static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + const int min_base_x = -(1 << (upsample_above + frac_bits_x)); + + // if `upsample_left` then we need -2 through 14 inclusive from `left`. + // else we only need -1 through 6 inclusive. + +#if AOM_ARCH_AARCH64 + uint16x8x2_t left_data0, left_data1; + if (upsample_left) { + left_data0 = vld1q_u16_x2(left - 2); + left_data1 = vld1q_u16_x2(left - 1); + } else { + left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } }; + left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } }; + } +#endif + + const int16x4_t iota0123 = vld1_s16(iota1_s16); + const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1); + + for (int r = 0; r < 8; ++r) { + const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; + const int x0 = (r + 1) * dx; + const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0)); + const int base_x0 = (-x0) >> frac_bits_x; + if (base_shift <= 0) { + uint16x4_t a0, a1; + int16x4_t shift_x0123; + if (upsample_above) { + const uint16x4x2_t a01 = vld2_u16(above + base_x0); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); + } else { + a0 = vld1_u16(above + base_x0); + a1 = vld1_u16(above + base_x0 + 1); + shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F)); + } + vst1_u16(dst, + highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123)); + } else if (base_shift < 4) { + // Calculate Y component from `left`. + const int y_iters = base_shift; + const int16x4_t y0123 = + vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); + const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); + const int16x4_t shift_y0123 = vshr_n_s16( + vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); + + uint16x4_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16( + left_data0, base_y0123, left_data_base, y_iters); + l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16( + left_data1, base_y0123, left_data_base, y_iters); +#else + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + const uint16x4_t out_y = + highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123); + + // Calculate X component from `above`. + uint16x4_t a0, a1; + int16x4_t shift_x0123; + if (upsample_above) { + const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? 
-2 : -1)); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); + } else { + a0 = vld1_u16(above - 1); + a1 = vld1_u16(above + 0); + shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F)); + } + const uint16x4_t out_x = + highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); + + // Combine X and Y vectors. + const uint16x4_t out = + highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift); + vst1_u16(dst, out); + } else { + const int16x4_t y0123 = + vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); + const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); + const int16x4_t shift_y0123 = vshr_n_s16( + vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); + + uint16x4_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123, + left_data_base, 4); + l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123, + left_data_base, 4); +#else + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + vst1_u16(dst, + highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123)); + } + dst += stride; + } +} + +static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + const int min_base_x = -(1 << (upsample_above + frac_bits_x)); + + // if `upsample_left` then we need -2 through 6 inclusive from `left`. + // else we only need -1 through 3 inclusive. + +#if AOM_ARCH_AARCH64 + uint16x8_t left_data0, left_data1; + if (upsample_left) { + left_data0 = vld1q_u16(left - 2); + left_data1 = vld1q_u16(left - 1); + } else { + left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0)); + left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0)); + } +#endif + + const int16x8_t iota01234567 = vld1q_s16(iota1_s16); + const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1); + + for (int r = 0; r < 4; ++r) { + const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; + const int x0 = (r + 1) * dx; + const int16x8_t x01234567 = + vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0)); + const int base_x0 = (-x0) >> frac_bits_x; + if (base_shift <= 0) { + uint16x8_t a0, a1; + int16x8_t shift_x01234567; + if (upsample_above) { + const uint16x8x2_t a01 = vld2q_u16(above + base_x0); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); + } else { + a0 = vld1q_u16(above + base_x0); + a1 = vld1q_u16(above + base_x0 + 1); + shift_x01234567 = + vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); + } + vst1q_u16( + dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567)); + } else if (base_shift < 8) { + // Calculate Y component from `left`. + const int y_iters = base_shift; + const int16x8_t y01234567 = + vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); + const int16x8_t base_y01234567 = + vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), + vdupq_n_s16(0x3F)), + 1); + + uint16x8_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? 
-2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( + left_data0, base_y01234567, left_data_base, y_iters); + l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( + left_data1, base_y01234567, left_data_base, y_iters); +#else + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + const uint16x8_t out_y = + highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567); + + // Calculate X component from `above`. + uint16x8_t a0, a1; + int16x8_t shift_x01234567; + if (upsample_above) { + const uint16x8x2_t a01 = + vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); + } else { + a0 = vld1q_u16(above - 1); + a1 = vld1q_u16(above + 0); + shift_x01234567 = + vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); + } + const uint16x8_t out_x = + highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); + + // Combine X and Y vectors. + const uint16x8_t out = + highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift); + vst1q_u16(dst, out); + } else { + const int16x8_t y01234567 = + vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); + const int16x8_t base_y01234567 = + vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), + vdupq_n_s16(0x3F)), + 1); + + uint16x8_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( + left_data0, base_y01234567, left_data_base, 8); + l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( + left_data1, base_y01234567, left_data_base, 8); +#else + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + vst1q_u16( + dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567)); + } + dst += stride; + } +} + +static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + const int min_base_x = -(1 << (upsample_above + frac_bits_x)); + + // if `upsample_left` then we need -2 through 14 inclusive from `left`. + // else we only need -1 through 6 inclusive. 
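+ // That wider range no longer fits in one q-register, hence the uint16x8x2_t
+ // preloads and the *_from_x16 tbl helpers below (a 16-element table lookup
+ // rather than the 8-element one used by the shorter blocks).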
+ +#if AOM_ARCH_AARCH64 + uint16x8x2_t left_data0, left_data1; + if (upsample_left) { + left_data0 = vld1q_u16_x2(left - 2); + left_data1 = vld1q_u16_x2(left - 1); + } else { + left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } }; + left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } }; + } +#endif + + const int16x8_t iota01234567 = vld1q_s16(iota1_s16); + const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1); + + for (int r = 0; r < 8; ++r) { + const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; + const int x0 = (r + 1) * dx; + const int16x8_t x01234567 = + vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0)); + const int base_x0 = (-x0) >> frac_bits_x; + if (base_shift <= 0) { + uint16x8_t a0, a1; + int16x8_t shift_x01234567; + if (upsample_above) { + const uint16x8x2_t a01 = vld2q_u16(above + base_x0); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); + } else { + a0 = vld1q_u16(above + base_x0); + a1 = vld1q_u16(above + base_x0 + 1); + shift_x01234567 = + vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); + } + vst1q_u16( + dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567)); + } else if (base_shift < 8) { + // Calculate Y component from `left`. + const int y_iters = base_shift; + const int16x8_t y01234567 = + vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); + const int16x8_t base_y01234567 = + vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), + vdupq_n_s16(0x3F)), + 1); + + uint16x8_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( + left_data0, base_y01234567, left_data_base, y_iters); + l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( + left_data1, base_y01234567, left_data_base, y_iters); +#else + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + const uint16x8_t out_y = + highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567); + + // Calculate X component from `above`. + uint16x8_t a0, a1; + int16x8_t shift_x01234567; + if (upsample_above) { + const uint16x8x2_t a01 = + vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); + } else { + a0 = vld1q_u16(above - 1); + a1 = vld1q_u16(above + 0); + shift_x01234567 = + vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); + } + const uint16x8_t out_x = + highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); + + // Combine X and Y vectors. + const uint16x8_t out = + highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift); + vst1q_u16(dst, out); + } else { + const int16x8_t y01234567 = + vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); + const int16x8_t base_y01234567 = + vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), + vdupq_n_s16(0x3F)), + 1); + + uint16x8_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? 
-2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( + left_data0, base_y01234567, left_data_base, 8); + l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( + left_data1, base_y01234567, left_data_base, 8); +#else + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + vst1q_u16( + dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567)); + } + dst += stride; + } +} + +static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = { + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon, + &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL, + NULL }, + { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon, + &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon, + &highbd_dr_prediction_z2_8x32_neon, NULL }, + { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon, + &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon, + &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon }, + { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon, + &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon, + &highbd_dr_prediction_z2_32x64_neon }, + { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon, + &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon }, +}; + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + highbd_dr_prediction_z2_ptr f = + dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)]; + assert(f != NULL); + f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd); +} + +// ----------------------------------------------------------------------------- +// Z3 + +// Both the lane to use and the shift amount must be immediates.
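+// In scalar form each step macro below computes roughly (sketch):
+//   res[i] = (in0[i] * s0[lane] + in1[i] * s1[lane] + (1 << (shift - 1))) >> shift;
+//   out[i] = (base + iota[i] < max_base_y) ? res[i] : left_max;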
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \ + lane, shift) \ + do { \ + uint32x4_t val = vmull_lane_u16((in0), (s0), (lane)); \ + val = vmlal_lane_u16(val, (in1), (s1), (lane)); \ + const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base)); \ + const uint16x4_t res = vrshrn_n_u32(val, (shift)); \ + *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res, \ + vdup_n_u16(left_max)); \ + } while (0) + +#define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \ + lane, shift) \ + do { \ + uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane)); \ + val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \ + uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \ + val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \ + const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base)); \ + const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \ + vrshrn_n_u32(val_hi, (shift))); \ + *(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res, \ + vdupq_n_u16(left_max)); \ + } while (0) + +static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, const uint16_t *left, + int dy) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dy > 0); + + // Factor out left + 1 to give the compiler a better chance of recognising + // that the offsets used for the loads from left and left + 1 are otherwise + // identical. + const uint16_t *left1 = left + 1; + + const int max_base_y = (bw + bh - 1); + const int left_max = left[max_base_y]; + const int frac_bits = 6; + + const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16)); + const uint16x4_t iota1x4 = vget_low_u16(iota1x8); + + // The C implementation of the z3 predictor when not upsampling uses: + // ((y & 0x3f) >> 1) + // The right shift is unnecessary here since we instead shift by +1 later, + // so adjust the mask to 0x3e to ensure we don't consider the extra bit. + const uint16x4_t shift_mask = vdup_n_u16(0x3e); + + if (bh == 4) { + int y = dy; + int c = 0; + do { + // Fully unroll the 4x4 block to allow us to use immediate lane-indexed + // multiply instructions. 
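+ // Note that shifts1 below is twice the C reference shift (mask 0x3e rather
+ // than (y & 0x3f) >> 1); the step macro compensates by rounding-shifting by
+ // 6 instead of 5, since
+ //   (a0 * (64 - 2s) + a1 * 2s + 32) >> 6 == (a0 * (32 - s) + a1 * s + 16) >> 5.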
+ const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1); + const int base0 = (y + 0 * dy) >> frac_bits; + const int base1 = (y + 1 * dy) >> frac_bits; + const int base2 = (y + 2 * dy) >> frac_bits; + const int base3 = (y + 3 * dy) >> frac_bits; + uint16x4_t out[4]; + if (base0 >= max_base_y) { + out[0] = vdup_n_u16(left_max); + } else { + const uint16x4_t l00 = vld1_u16(left + base0); + const uint16x4_t l01 = vld1_u16(left1 + base0); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01, + shifts0, shifts1, 0, 6); + } + if (base1 >= max_base_y) { + out[1] = vdup_n_u16(left_max); + } else { + const uint16x4_t l10 = vld1_u16(left + base1); + const uint16x4_t l11 = vld1_u16(left1 + base1); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11, + shifts0, shifts1, 1, 6); + } + if (base2 >= max_base_y) { + out[2] = vdup_n_u16(left_max); + } else { + const uint16x4_t l20 = vld1_u16(left + base2); + const uint16x4_t l21 = vld1_u16(left1 + base2); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21, + shifts0, shifts1, 2, 6); + } + if (base3 >= max_base_y) { + out[3] = vdup_n_u16(left_max); + } else { + const uint16x4_t l30 = vld1_u16(left + base3); + const uint16x4_t l31 = vld1_u16(left1 + base3); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31, + shifts0, shifts1, 3, 6); + } + transpose_array_inplace_u16_4x4(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + r2 * stride + c, out[r2]); + } + y += 4 * dy; + c += 4; + } while (c < bw); + } else { + int y = dy; + int c = 0; + do { + int r = 0; + do { + // Fully unroll the 4x4 block to allow us to use immediate lane-indexed + // multiply instructions. 
+ const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1); + const int base0 = ((y + 0 * dy) >> frac_bits) + r; + const int base1 = ((y + 1 * dy) >> frac_bits) + r; + const int base2 = ((y + 2 * dy) >> frac_bits) + r; + const int base3 = ((y + 3 * dy) >> frac_bits) + r; + uint16x8_t out[4]; + if (base0 >= max_base_y) { + out[0] = vdupq_n_u16(left_max); + } else { + const uint16x8_t l00 = vld1q_u16(left + base0); + const uint16x8_t l01 = vld1q_u16(left1 + base0); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01, + shifts0, shifts1, 0, 6); + } + if (base1 >= max_base_y) { + out[1] = vdupq_n_u16(left_max); + } else { + const uint16x8_t l10 = vld1q_u16(left + base1); + const uint16x8_t l11 = vld1q_u16(left1 + base1); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11, + shifts0, shifts1, 1, 6); + } + if (base2 >= max_base_y) { + out[2] = vdupq_n_u16(left_max); + } else { + const uint16x8_t l20 = vld1q_u16(left + base2); + const uint16x8_t l21 = vld1q_u16(left1 + base2); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21, + shifts0, shifts1, 2, 6); + } + if (base3 >= max_base_y) { + out[3] = vdupq_n_u16(left_max); + } else { + const uint16x8_t l30 = vld1q_u16(left + base3); + const uint16x8_t l31 = vld1q_u16(left1 + base3); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31, + shifts0, shifts1, 3, 6); + } + transpose_array_inplace_u16_4x8(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2])); + } + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2])); + } + r += 8; + } while (r < bh); + y += 4 * dy; + c += 4; + } while (c < bw); + } +} + +static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, const uint16_t *left, + int dy) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << 1; + const int left_max = left[max_base_y]; + const int frac_bits = 5; + + const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16)); + const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16)); + const uint16x4_t iota2x4 = vget_low_u16(iota2x8); + + // The C implementation of the z3 predictor when upsampling uses: + // (((x << 1) & 0x3f) >> 1) + // The two shifts are unnecessary here since the lowest bit is guaranteed to + // be zero when the mask is applied, so adjust the mask to 0x1f to avoid + // needing the shifts at all. + const uint16x4_t shift_mask = vdup_n_u16(0x1F); + + if (bh == 4) { + int y = dy; + int c = 0; + do { + // Fully unroll the 4x4 block to allow us to use immediate lane-indexed + // multiply instructions. 
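+ // With upsampling the two interpolation taps are the even/odd elements of
+ // `left`, so the vld2 de-interleaving loads below provide them directly, and
+ // the weights use the 0x1f mask (sum to 32, rounding shift by 5) described
+ // above.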
+ const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1); + const int base0 = (y + 0 * dy) >> frac_bits; + const int base1 = (y + 1 * dy) >> frac_bits; + const int base2 = (y + 2 * dy) >> frac_bits; + const int base3 = (y + 3 * dy) >> frac_bits; + const uint16x4x2_t l0 = vld2_u16(left + base0); + const uint16x4x2_t l1 = vld2_u16(left + base1); + const uint16x4x2_t l2 = vld2_u16(left + base2); + const uint16x4x2_t l3 = vld2_u16(left + base3); + uint16x4_t out[4]; + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0], + l0.val[1], shifts0, shifts1, 0, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0], + l1.val[1], shifts0, shifts1, 1, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0], + l2.val[1], shifts0, shifts1, 2, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0], + l3.val[1], shifts0, shifts1, 3, 5); + transpose_array_inplace_u16_4x4(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + r2 * stride + c, out[r2]); + } + y += 4 * dy; + c += 4; + } while (c < bw); + } else { + assert(bh % 8 == 0); + + int y = dy; + int c = 0; + do { + int r = 0; + do { + // Fully unroll the 4x8 block to allow us to use immediate lane-indexed + // multiply instructions. + const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1); + const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2); + const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2); + const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2); + const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2); + const uint16x8x2_t l0 = vld2q_u16(left + base0); + const uint16x8x2_t l1 = vld2q_u16(left + base1); + const uint16x8x2_t l2 = vld2q_u16(left + base2); + const uint16x8x2_t l3 = vld2q_u16(left + base3); + uint16x8_t out[4]; + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0], + l0.val[1], shifts0, shifts1, 0, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0], + l1.val[1], shifts0, shifts1, 1, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0], + l2.val[1], shifts0, shifts1, 2, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0], + l3.val[1], shifts0, shifts1, 3, 5); + transpose_array_inplace_u16_4x8(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2])); + } + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2])); + } + r += 8; + } while (r < bh); + y += 4 * dy; + c += 4; + } while (c < bw); + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + (void)above; + (void)dx; + (void)bd; + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dx == 1); + assert(dy > 0); + + if (upsample_left) { + highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy); + } else { + highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy); + } +} + +#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4 +#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8 diff --git a/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c new file mode 100644 index 0000000000..77727b7665 --- /dev/null +++ 
b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c @@ -0,0 +1,1265 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/transpose_neon.h" + +static INLINE int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low, + const int16x4_t high) { + return vmin_s16(vmax_s16(val, low), high); +} + +static INLINE uint16x8_t convert_to_unsigned_pixel_u16(int16x8_t val, + int bitdepth) { + const int16x8_t low = vdupq_n_s16(0); + const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1); + + return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high); +} + +// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) +static INLINE uint16x4_t hev(const uint16x8_t abd_p0p1_q0q1, + const uint16_t thresh) { + const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh)); + return vorr_u16(vget_low_u16(a), vget_high_u16(a)); +} + +// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh +static INLINE uint16x4_t outer_threshold(const uint16x4_t p1, + const uint16x4_t p0, + const uint16x4_t q0, + const uint16x4_t q1, + const uint16_t outer_thresh) { + const uint16x4_t abd_p0q0 = vabd_u16(p0, q0); + const uint16x4_t abd_p1q1 = vabd_u16(p1, q1); + const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1); + const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1); + const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half); + return vcle_u16(sum, vdup_n_u16(outer_thresh)); +} + +// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && +// outer_threshold() +static INLINE uint16x4_t needs_filter4(const uint16x8_t abd_p0p1_q0q1, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a)); + return vand_u16(inner_mask, outer_mask); +} + +// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh && +// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh && +// outer_threshold() +static INLINE uint16x4_t needs_filter6(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p1p2_q1q2, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); + const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b)); + return vand_u16(inner_mask, outer_mask); +} + +// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh && +// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh && +// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh && +// outer_threshold() +static INLINE uint16x4_t needs_filter8(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p1p2_q1q2, + const uint16x8_t abd_p2p3_q2q3, + const uint16_t inner_thresh, + const uint16x4_t outer_mask) { + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2); + const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
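+ // b now holds the element-wise maximum of the three absolute-difference
+ // vectors above (p-side lanes in the low half, q-side lanes in the high
+ // half), so the single compare below covers all six conditions once the two
+ // halves are and-ed together.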
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh)); + const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c)); + return vand_u16(inner_mask, outer_mask); +} + +// ----------------------------------------------------------------------------- +// filterN_masks functions. + +static INLINE void filter4_masks(const uint16x8_t p0q0, const uint16x8_t p1q1, + const uint16_t hev_thresh, + const uint16x4_t outer_mask, + const uint16_t inner_thresh, + uint16x4_t *const hev_mask, + uint16x4_t *const needs_filter4_mask) { + const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1); + // This includes cases where needs_filter4() is not true and so filter2() will + // not be applied. + const uint16x4_t hev_tmp_mask = hev(p0p1_q0q1, hev_thresh); + + *needs_filter4_mask = needs_filter4(p0p1_q0q1, inner_thresh, outer_mask); + + // filter2() will only be applied if both needs_filter4() and hev() are true. + *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask); +} + +// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh && +// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh +// |flat_thresh| == 4 for 10 bit decode. +static INLINE uint16x4_t is_flat3(const uint16x8_t abd_p0p1_q0q1, + const uint16x8_t abd_p0p2_q0q2, + const int bitdepth) { + const int flat_thresh = 1 << (bitdepth - 8); + const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2); + const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh)); + return vand_u16(vget_low_u16(b), vget_high_u16(b)); +} + +static INLINE void filter6_masks( + const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0, + const uint16_t hev_thresh, const uint16x4_t outer_mask, + const uint16_t inner_thresh, const int bitdepth, + uint16x4_t *const needs_filter6_mask, uint16x4_t *const is_flat3_mask, + uint16x4_t *const hev_mask) { + const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); + *hev_mask = hev(abd_p0p1_q0q1, hev_thresh); + *is_flat3_mask = is_flat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), bitdepth); + *needs_filter6_mask = needs_filter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), + inner_thresh, outer_mask); +} + +// is_flat4 uses N=1, IsFlatOuter4 uses N=4. +// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh && +// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh && +// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh +// |flat_thresh| == 4 for 10 bit decode.
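+// For example, flat_thresh == 1 << (bitdepth - 8): 1 for 8-bit, 4 for 10-bit
+// and 16 for 12-bit input, so the flatness test scales with the pixel range.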
+static INLINE uint16x4_t is_flat4(const uint16x8_t abd_pnp0_qnq0, + const uint16x8_t abd_pn1p0_qn1q0, + const uint16x8_t abd_pn2p0_qn2q0, + const int bitdepth) { + const int flat_thresh = 1 << (bitdepth - 8); + const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0); + const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0); + const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh)); + return vand_u16(vget_low_u16(c), vget_high_u16(c)); +} + +static INLINE void filter8_masks( + const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, const uint16_t hev_thresh, + const uint16x4_t outer_mask, const uint16_t inner_thresh, + const int bitdepth, uint16x4_t *const needs_filter8_mask, + uint16x4_t *const is_flat4_mask, uint16x4_t *const hev_mask) { + const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1); + *hev_mask = hev(abd_p0p1_q0q1, hev_thresh); + const uint16x4_t v_is_flat4 = is_flat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), + vabdq_u16(p0q0, p3q3), bitdepth); + *needs_filter8_mask = + needs_filter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3), + inner_thresh, outer_mask); + // |is_flat4_mask| is used to decide where to use the result of filter8. + // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false, + // overriding the question of whether to use filter8. Because filter4 doesn't + // apply to p2q2, |is_flat4_mask| chooses directly between filter8 and the + // source value. To be correct, the mask must account for this override. + *is_flat4_mask = vand_u16(v_is_flat4, *needs_filter8_mask); +} + +// ----------------------------------------------------------------------------- +// filterN functions. + +// Calculate filter4() or filter2() based on |hev_mask|. +static INLINE void filter4(const uint16x8_t p0q0, const uint16x8_t p0q1, + const uint16x8_t p1q1, const uint16x4_t hev_mask, + int bitdepth, uint16x8_t *const p1q1_result, + uint16x8_t *const p0q0_result) { + const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4); + // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val); + // q0mp0 means "q0 minus p0". + const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1)); + const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3); + + // If this is for filter2() then include |p1mq1|. Otherwise zero it. + const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (bitdepth - 1))); + const int16x4_t max_signed_pixel = vdup_n_s16((1 << (bitdepth - 1)) - 1); + const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1); + const int16x4_t p1mq1_saturated = + clip3_s16(p1mq1, min_signed_pixel, max_signed_pixel); + const int16x4_t hev_option = + vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated); + + const int16x4_t a = vadd_s16(q0mp0_3, hev_option); + + // Need to figure out what's going on here because there are some unnecessary + // tricks to accommodate 8x8 as smallest 8bpp vector + + // We can not shift with rounding because the clamp comes *before* the + // shifting. 
a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3; a2 = + // Clip3(a + 3, min_signed_val, max_signed_val) >> 3; + const int16x4_t plus_four = + clip3_s16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel); + const int16x4_t plus_three = + clip3_s16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel); + const int16x4_t a1 = vshr_n_s16(plus_four, 3); + const int16x4_t a2 = vshr_n_s16(plus_three, 3); + + // a3 = (a1 + 1) >> 1; + const int16x4_t a3 = vrshr_n_s16(a1, 1); + + const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3)); + const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3); + + // Need to shift the second term or we end up with a2_ma2. + const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1)); + const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1); + *p1q1_result = convert_to_unsigned_pixel_u16(p1q1_a3, bitdepth); + *p0q0_result = convert_to_unsigned_pixel_u16(p0q0_a, bitdepth); +} + +void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16_t *const dst_p1 = (uint16_t *)(s - 2 * pitch); + uint16_t *const dst_p0 = (uint16_t *)(s - pitch); + uint16_t *const dst_q0 = (uint16_t *)(s); + uint16_t *const dst_q1 = (uint16_t *)(s + pitch); + + const uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), + vld1_u16(dst_q0), vld1_u16(dst_q1) }; + + // Adjust thresholds to bitdepth. + const int outer_thresh = *blimit << (bd - 8); + const int inner_thresh = *limit << (bd - 8); + const int hev_thresh = *thresh << (bd - 8); + const uint16x4_t outer_mask = + outer_threshold(src[0], src[1], src[2], src[3], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); + const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); + filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, + &needs_filter4_mask); + +#if AOM_ARCH_AARCH64 + if (vaddv_u16(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // AOM_ARCH_AARCH64 + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter4_mask_8 = + vcombine_u16(needs_filter4_mask, needs_filter4_mask); + + uint16x8_t f_p1q1; + uint16x8_t f_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0); + + // Already integrated the hev mask when calculating the filtered values. + const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); + + // p1/q1 are unmodified if only hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. 
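+ // Since hev_mask was already and-ed with needs_filter4_mask, the XOR below is
+ // equivalent to (sketch): p1q1_mask[i] = needs_filter4_mask[i] & ~hev_mask[i].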
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); + const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); + + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); +} + +void aom_highbd_lpf_horizontal_4_dual_neon( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + // Offset by 2 uint16_t values to load from first p1 position. + uint16_t *dst = s - 2; + uint16_t *dst_p1 = dst; + uint16_t *dst_p0 = dst + pitch; + uint16_t *dst_q0 = dst + pitch * 2; + uint16_t *dst_q1 = dst + pitch * 3; + + uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1) }; + transpose_array_inplace_u16_4x4(src); + + // Adjust thresholds to bitdepth. + const int outer_thresh = *blimit << (bd - 8); + const int inner_thresh = *limit << (bd - 8); + const int hev_thresh = *thresh << (bd - 8); + const uint16x4_t outer_mask = + outer_threshold(src[0], src[1], src[2], src[3], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]); + const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]); + filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask, + &needs_filter4_mask); + +#if AOM_ARCH_AARCH64 + if (vaddv_u16(needs_filter4_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // AOM_ARCH_AARCH64 + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter4_mask_8 = + vcombine_u16(needs_filter4_mask, needs_filter4_mask); + + uint16x8_t f_p1q1; + uint16x8_t f_p0q0; + const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0); + + // Already integrated the hev mask when calculating the filtered values. + const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0); + + // p1/q1 are unmodified if only hev() is true. This works because it was and'd + // with |needs_filter4_mask| previously. 
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8); + const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1); + + uint16x4_t output[4] = { + vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), + }; + transpose_array_inplace_u16_4x4(output); + + vst1_u16(dst_p1, output[0]); + vst1_u16(dst_p0, output[1]); + vst1_u16(dst_q0, output[2]); + vst1_u16(dst_q1, output[3]); +} + +void aom_highbd_lpf_vertical_4_dual_neon( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, uint16x8_t *const p1q1_output, + uint16x8_t *const p0q0_output) { + // Sum p1 and q1 output from opposite directions. + // The formula is regrouped to allow 3 doubling operations to be combined. + // + // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0 + // ^^^^^^^^ + // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2) + // ^^^^^^^^ + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^^^^^^ + uint16x8_t sum = vaddq_u16(p2q2, p1q1); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^ + sum = vaddq_u16(sum, p0q0); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^ + sum = vshlq_n_u16(sum, 1); + + // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0 + // ^^^^^^ ^^^^^^ + // Should dual issue with the left shift. + const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4); + const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0); + sum = vaddq_u16(sum, outer_sum); + + *p1q1_output = vrshrq_n_u16(sum, 3); + + // Convert to p0 and q0 output: + // p0 = p1 - (2 * p2) + q0 + q1 + // q0 = q1 - (2 * q2) + p0 + p1 + // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 + // ^^^^^^^^ + const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1); + // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1 + // ^^^^^^^^ + sum = vsubq_u16(sum, p2q2_double); + const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4); + sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1)); + + *p0q0_output = vrshrq_n_u16(sum, 3); +} + +void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16_t *const dst_p2 = s - 3 * pitch; + uint16_t *const dst_p1 = s - 2 * pitch; + uint16_t *const dst_p0 = s - pitch; + uint16_t *const dst_q0 = s; + uint16_t *const dst_q1 = s + pitch; + uint16_t *const dst_q2 = s + 2 * pitch; + + const uint16x4_t src[6] = { vld1_u16(dst_p2), vld1_u16(dst_p1), + vld1_u16(dst_p0), vld1_u16(dst_q0), + vld1_u16(dst_q1), vld1_u16(dst_q2) }; + + // Adjust thresholds to bitdepth. 
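+ // For example, with bd == 10 the 8-bit limits are shifted left by 2 so they
+ // scale with the wider pixel range (a blimit of 16 becomes 64).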
+ const int outer_thresh = *blimit << (bd - 8); + const int inner_thresh = *limit << (bd - 8); + const int hev_thresh = *thresh << (bd - 8); + const uint16x4_t outer_mask = + outer_threshold(src[1], src[2], src[3], src[4], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat3_mask; + const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); + const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); + const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); + filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, + &needs_filter_mask, &is_flat3_mask, &hev_mask); + +#if AOM_ARCH_AARCH64 + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // AOM_ARCH_AARCH64 + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or + // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6 + // output is not used. + uint16x8_t f6_p1q1, f6_p0q0; + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // filter6() does not apply, but filter4() applies to one or more values. + p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); +} + +void aom_highbd_lpf_horizontal_6_dual_neon( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + // Left side of the filter window. + uint16_t *const dst = s - 3; + uint16_t *const dst_0 = dst; + uint16_t *const dst_1 = dst + pitch; + uint16_t *const dst_2 = dst + 2 * pitch; + uint16_t *const dst_3 = dst + 3 * pitch; + + // Overread by 2 values. These overreads become the high halves of src_raw[2] + // and src_raw[3] after transpose. 
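+  // The overread lanes are never referenced below, and only four filtered
+  // values per row (p1, p0, q0, q1) are written back at the end.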
+ uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), + vld1q_u16(dst_2), vld1q_u16(dst_3) }; + transpose_array_inplace_u16_4x8(src_raw); + // p2, p1, p0, q0, q1, q2 + const uint16x4_t src[6] = { + vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]), + vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]), + vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]), + }; + + // Adjust thresholds to bitdepth. + const int outer_thresh = *blimit << (bd - 8); + const int inner_thresh = *limit << (bd - 8); + const int hev_thresh = *thresh << (bd - 8); + const uint16x4_t outer_mask = + outer_threshold(src[1], src[2], src[3], src[4], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat3_mask; + const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]); + const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]); + const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]); + filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd, + &needs_filter_mask, &is_flat3_mask, &hev_mask); + +#if AOM_ARCH_AARCH64 + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // AOM_ARCH_AARCH64 + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or + // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6 + // output is not used. + uint16x8_t f6_p1q1, f6_p0q0; + const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask); + if (vget_lane_u64(need_filter6, 0) == 0) { + // filter6() does not apply, but filter4() applies to one or more values. + p0q0_output = p0q0; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0); + p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x4_t output[4] = { + vget_low_u16(p1q1_output), + vget_low_u16(p0q0_output), + vget_high_u16(p0q0_output), + vget_high_u16(p1q1_output), + }; + transpose_array_inplace_u16_4x4(output); + + // dst_n starts at p2, so adjust to p1. 
+ vst1_u16(dst_0 + 1, output[0]); + vst1_u16(dst_1 + 1, output[1]); + vst1_u16(dst_2 + 1, output[2]); + vst1_u16(dst_3 + 1, output[3]); +} + +void aom_highbd_lpf_vertical_6_dual_neon( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2, + const uint16x8_t p1q1, const uint16x8_t p0q0, + uint16x8_t *const p2q2_output, + uint16x8_t *const p1q1_output, + uint16x8_t *const p0q0_output) { + // Sum p2 and q2 output from opposite directions. + // The formula is regrouped to allow 2 doubling operations to be combined. + // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0 + // ^^^^^^^^ + // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3) + // ^^^^^^^^ + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^ + const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^ + uint16x8_t sum = vshlq_n_u16(p23q23, 1); + + // Add two other terms to make dual issue with shift more likely. + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^ + const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^^^^^^^^ + sum = vaddq_u16(sum, p01q01); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^ + sum = vaddq_u16(sum, p3q3); + + // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0 + // ^^^^^^ + const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4); + sum = vaddq_u16(sum, q0p0); + + *p2q2_output = vrshrq_n_u16(sum, 3); + + // Convert to p1 and q1 output: + // p1 = p2 - p3 - p2 + p1 + q1 + // q1 = q2 - q3 - q2 + q0 + p1 + sum = vsubq_u16(sum, p23q23); + const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4); + sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1)); + + *p1q1_output = vrshrq_n_u16(sum, 3); + + // Convert to p0 and q0 output: + // p0 = p1 - p3 - p1 + p0 + q2 + // q0 = q1 - q3 - q1 + q0 + p2 + sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1)); + const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4); + sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2)); + + *p0q0_output = vrshrq_n_u16(sum, 3); +} + +void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16_t *const dst_p3 = s - 4 * pitch; + uint16_t *const dst_p2 = s - 3 * pitch; + uint16_t *const dst_p1 = s - 2 * pitch; + uint16_t *const dst_p0 = s - pitch; + uint16_t *const dst_q0 = s; + uint16_t *const dst_q1 = s + pitch; + uint16_t *const dst_q2 = s + 2 * pitch; + uint16_t *const dst_q3 = s + 3 * pitch; + + const uint16x4_t src[8] = { vld1_u16(dst_p3), vld1_u16(dst_p2), + vld1_u16(dst_p1), vld1_u16(dst_p0), + vld1_u16(dst_q0), vld1_u16(dst_q1), + vld1_u16(dst_q2), vld1_u16(dst_q3) }; + + // Adjust thresholds to bitdepth. 
+ const int outer_thresh = *blimit << (bd - 8); + const int inner_thresh = *limit << (bd - 8); + const int hev_thresh = *thresh << (bd - 8); + const uint16x4_t outer_mask = + outer_threshold(src[2], src[3], src[4], src[5], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]); + const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]); + const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]); + const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]); + filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if AOM_ARCH_AARCH64 + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // AOM_ARCH_AARCH64 + + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() does not apply, but filter4() applies to one or more values. 
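+    // filter4() only modifies p1, p0, q0 and q1, so p2/q2 pass through
+    // unchanged in this branch.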
+ p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + vst1_u16(dst_p2, vget_low_u16(p2q2_output)); + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + vst1_u16(dst_q2, vget_high_u16(p2q2_output)); +} + +void aom_highbd_lpf_horizontal_8_dual_neon( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); +} + +static INLINE uint16x8_t reverse_low_half(const uint16x8_t a) { + return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a)); +} + +void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16_t *const dst = s - 4; + uint16_t *const dst_0 = dst; + uint16_t *const dst_1 = dst + pitch; + uint16_t *const dst_2 = dst + 2 * pitch; + uint16_t *const dst_3 = dst + 3 * pitch; + + // src_raw[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n. + // To get desired pairs after transpose, one half should be reversed. + uint16x8_t src[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3) }; + + // src[0] = p0q0 + // src[1] = p1q1 + // src[2] = p2q2 + // src[3] = p3q3 + loop_filter_transpose_u16_4x8q(src); + + // Adjust thresholds to bitdepth. + const int outer_thresh = *blimit << (bd - 8); + const int inner_thresh = *limit << (bd - 8); + const int hev_thresh = *thresh << (bd - 8); + const uint16x4_t outer_mask = outer_threshold( + vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]), + vget_high_u16(src[1]), outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = src[0]; + const uint16x8_t p1q1 = src[1]; + const uint16x8_t p2q2 = src[2]; + const uint16x8_t p3q3 = src[3]; + filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if AOM_ARCH_AARCH64 + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // AOM_ARCH_AARCH64 + + // Copy the masks to the high bits for packed comparisons later. 
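+  // Each pXqX vector holds the p value in its low half and the q value in its
+  // high half, so duplicating the 4-lane masks into both halves lets a single
+  // VBSL select both sides at once.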
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 + // output is not used. + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() does not apply, but filter4() applies to one or more values. + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t is_flat4_mask_8 = + vcombine_u16(is_flat4_mask, is_flat4_mask); + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + + uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 }; + // After transpose, |output| will contain rows of the form: + // p0 p1 p2 p3 q0 q1 q2 q3 + transpose_array_inplace_u16_4x8(output); + + // Reverse p values to produce original order: + // p3 p2 p1 p0 q0 q1 q2 q3 + vst1q_u16(dst_0, reverse_low_half(output[0])); + vst1q_u16(dst_1, reverse_low_half(output[1])); + vst1q_u16(dst_2, reverse_low_half(output[2])); + vst1q_u16(dst_3, reverse_low_half(output[3])); +} + +void aom_highbd_lpf_vertical_8_dual_neon( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void filter14( + const uint16x8_t p6q6, const uint16x8_t p5q5, const uint16x8_t p4q4, + const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1, + const uint16x8_t p0q0, uint16x8_t *const p5q5_output, + uint16x8_t *const p4q4_output, uint16x8_t *const p3q3_output, + uint16x8_t *const p2q2_output, uint16x8_t *const p1q1_output, + uint16x8_t *const p0q0_output) { + // Sum p5 and q5 output from opposite directions. 
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^ + const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^^^^^^^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^^^^^^^^^^^^^ + uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1); + sum = vaddq_u16(sum, p6q6_x7); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^^^^^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^^^^^^ + sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum); + + // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0 + // ^^ + // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6) + // ^^ + const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4); + sum = vaddq_u16(sum, q0p0); + + *p5q5_output = vrshrq_n_u16(sum, 4); + + // Convert to p4 and q4 output: + // p4 = p5 - (2 * p6) + p3 + q1 + // q4 = q5 - (2 * q6) + q3 + p1 + sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1)); + const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4); + sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum); + + *p4q4_output = vrshrq_n_u16(sum, 4); + + // Convert to p3 and q3 output: + // p3 = p4 - p6 - p5 + p2 + q2 + // q3 = q4 - q6 - q5 + q2 + p2 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5)); + const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4); + sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum); + + *p3q3_output = vrshrq_n_u16(sum, 4); + + // Convert to p2 and q2 output: + // p2 = p3 - p6 - p4 + p1 + q3 + // q2 = q3 - q6 - q4 + q1 + p3 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4)); + const uint16x8_t q3p3 = vextq_u16(p3q3, p3q3, 4); + sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum); + + *p2q2_output = vrshrq_n_u16(sum, 4); + + // Convert to p1 and q1 output: + // p1 = p2 - p6 - p3 + p0 + q4 + // q1 = q2 - q6 - q3 + q0 + p4 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3)); + const uint16x8_t q4p4 = vextq_u16(p4q4, p4q4, 4); + sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum); + + *p1q1_output = vrshrq_n_u16(sum, 4); + + // Convert to p0 and q0 output: + // p0 = p1 - p6 - p2 + q0 + q5 + // q0 = q1 - q6 - q2 + p0 + p5 + sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2)); + const uint16x8_t q5p5 = vextq_u16(p5q5, p5q5, 4); + sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum); + + *p0q0_output = vrshrq_n_u16(sum, 4); +} + +void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16_t *const dst_p6 = s - 7 * pitch; + uint16_t *const dst_p5 = s - 6 * pitch; + uint16_t *const dst_p4 = s - 5 * pitch; + uint16_t *const dst_p3 = s - 4 * pitch; + uint16_t *const dst_p2 = s - 3 * pitch; + uint16_t *const dst_p1 = s - 2 * pitch; + uint16_t *const dst_p0 = s - pitch; + uint16_t *const dst_q0 = s; + uint16_t *const dst_q1 = s + pitch; + uint16_t *const dst_q2 = s + 2 * pitch; + uint16_t *const dst_q3 = s + 3 * pitch; + uint16_t *const dst_q4 = s + 4 * pitch; + uint16_t *const dst_q5 = s + 5 * pitch; + uint16_t *const dst_q6 = s + 6 * pitch; + + const uint16x4_t src[14] = { + vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3), + vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0), + 
vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4), + vld1_u16(dst_q5), vld1_u16(dst_q6) + }; + + // Adjust thresholds to bitdepth. + const int outer_thresh = *blimit << (bd - 8); + const int inner_thresh = *limit << (bd - 8); + const int hev_thresh = *thresh << (bd - 8); + const uint16x4_t outer_mask = + outer_threshold(src[5], src[6], src[7], src[8], outer_thresh); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]); + const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]); + const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]); + const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]); + filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if AOM_ARCH_AARCH64 + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // AOM_ARCH_AARCH64 + const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]); + const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]); + const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]); + // Mask to choose between the outputs of filter8 and filter14. + // As with the derivation of |is_flat4_mask|, the question of whether to use + // filter14 is only raised where |is_flat4_mask| is true. + const uint16x4_t is_flat4_outer_mask = vand_u16( + is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), + vabdq_u16(p0q0, p6q6), bd)); + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + // ZIP1 p0q0, p1q1 may perform better here. + const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, + p5q5_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() and filter14() do not apply, but filter4() applies to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // filter14() does not apply, but filter8() and filter4() apply to one or + // more values. 
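+      // filter8() modifies at most p2..q2, so p5/p4/p3 (and q3/q4/q5) pass
+      // through unchanged in this branch.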
+ p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. + const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + } + + vst1_u16(dst_p5, vget_low_u16(p5q5_output)); + vst1_u16(dst_p4, vget_low_u16(p4q4_output)); + vst1_u16(dst_p3, vget_low_u16(p3q3_output)); + vst1_u16(dst_p2, vget_low_u16(p2q2_output)); + vst1_u16(dst_p1, vget_low_u16(p1q1_output)); + vst1_u16(dst_p0, vget_low_u16(p0q0_output)); + vst1_u16(dst_q0, vget_high_u16(p0q0_output)); + vst1_u16(dst_q1, vget_high_u16(p1q1_output)); + vst1_u16(dst_q2, vget_high_u16(p2q2_output)); + vst1_u16(dst_q3, vget_high_u16(p3q3_output)); + vst1_u16(dst_q4, vget_high_u16(p4q4_output)); + vst1_u16(dst_q5, vget_high_u16(p5q5_output)); +} + +void aom_highbd_lpf_horizontal_14_dual_neon( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1, bd); +} + +static INLINE uint16x8x2_t permute_acdb64(const uint16x8_t ab, + const uint16x8_t cd) { + uint16x8x2_t acdb; +#if AOM_ARCH_AARCH64 + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd))); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab))); +#else + // a[b] <- [c]d + acdb.val[0] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0), + vreinterpretq_u64_u16(ab), 1)); + // [a]b <- c[d] + acdb.val[1] = vreinterpretq_u16_u64( + vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1), + vreinterpretq_u64_u16(ab), 0)); +#endif // AOM_ARCH_AARCH64 + return acdb; +} + +void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16_t *const dst = s - 8; + uint16_t *const dst_0 = dst; + uint16_t *const dst_1 = dst + pitch; + uint16_t *const dst_2 = dst + 2 * pitch; + uint16_t 
*const dst_3 = dst + 3 * pitch; + + // Low halves: p7 p6 p5 p4 + // High halves: p3 p2 p1 p0 + uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2), + vld1q_u16(dst_3) }; + // p7 will be the low half of src_p[0]. Not used until the end. + transpose_array_inplace_u16_4x8(src_p); + + // Low halves: q0 q1 q2 q3 + // High halves: q4 q5 q6 q7 + uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8), + vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) }; + // q7 will be the high half of src_q[3]. Not used until the end. + transpose_array_inplace_u16_4x8(src_q); + + // Adjust thresholds to bitdepth. + const int outer_thresh = *blimit << (bd - 8); + const int inner_thresh = *limit << (bd - 8); + const int hev_thresh = *thresh << (bd - 8); + const uint16x4_t outer_mask = outer_threshold( + vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]), + vget_low_u16(src_q[1]), outer_thresh); + const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4); + const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4); + const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4); + const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4); + uint16x4_t hev_mask; + uint16x4_t needs_filter_mask; + uint16x4_t is_flat4_mask; + filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, + bd, &needs_filter_mask, &is_flat4_mask, &hev_mask); + +#if AOM_ARCH_AARCH64 + if (vaddv_u16(needs_filter_mask) == 0) { + // None of the values will be filtered. + return; + } +#endif // AOM_ARCH_AARCH64 + const uint16x8_t p4q4 = + vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0])); + const uint16x8_t p5q5 = + vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1])); + const uint16x8_t p6q6 = + vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2])); + const uint16x8_t p7q7 = + vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3])); + // Mask to choose between the outputs of filter8 and filter14. + // As with the derivation of |is_flat4_mask|, the question of whether to use + // filter14 is only raised where |is_flat4_mask| is true. + const uint16x4_t is_flat4_outer_mask = vand_u16( + is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5), + vabdq_u16(p0q0, p6q6), bd)); + // Copy the masks to the high bits for packed comparisons later. + const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask); + const uint16x8_t needs_filter_mask_8 = + vcombine_u16(needs_filter_mask, needs_filter_mask); + + uint16x8_t f4_p1q1; + uint16x8_t f4_p0q0; + const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1)); + filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0); + f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1); + + uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output, + p5q5_output; + // Because we did not return after testing |needs_filter_mask| we know it is + // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or + // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8 + // output is not used. + uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0; + const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask); + if (vget_lane_u64(need_filter8, 0) == 0) { + // filter8() and filter14() do not apply, but filter4() applies to one or + // more values. 
+ p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = p2q2; + p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1); + p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0); + } else { + const uint16x8_t use_filter8_mask = + vcombine_u16(is_flat4_mask, is_flat4_mask); + filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0); + const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask); + if (vget_lane_u64(need_filter14, 0) == 0) { + // filter14() does not apply, but filter8() and filter4() apply to one or + // more values. + p5q5_output = p5q5; + p4q4_output = p4q4; + p3q3_output = p3q3; + p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2); + p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } else { + // All filters may contribute values to final outputs. + const uint16x8_t use_filter14_mask = + vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask); + uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0; + filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4, + &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0); + p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5); + p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4); + p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3); + p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2); + p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2); + p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2); + p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1); + p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1); + p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1); + p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0); + p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0); + p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0); + } + } + // To get the correctly ordered rows from the transpose, we need: + // p7p3 p6p2 p5p1 p4p0 + // q0q4 q1q5 q2q6 q3q7 + const uint16x8x2_t p7p3_q3q7 = permute_acdb64(p7q7, p3q3_output); + const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output); + const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output); + const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output); + uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0], + p5p1_q1q5.val[0], p4p0_q0q4.val[0] }; + transpose_array_inplace_u16_4x8(output_p); + uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1], + p6p2_q2q6.val[1], p7p3_q3q7.val[1] }; + transpose_array_inplace_u16_4x8(output_q); + + // Reverse p values to produce original order: + // p3 p2 p1 p0 q0 q1 q2 q3 + vst1q_u16(dst_0, output_p[0]); + vst1q_u16(dst_0 + 8, output_q[0]); + vst1q_u16(dst_1, output_p[1]); + vst1q_u16(dst_1 + 8, output_q[1]); + vst1q_u16(dst_2, output_p[2]); + vst1q_u16(dst_2 + 8, output_q[2]); + vst1q_u16(dst_3, output_p[3]); + vst1q_u16(dst_3 + 8, output_q[3]); +} + +void aom_highbd_lpf_vertical_14_dual_neon( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_14_neon(s + 4 * pitch, pitch, 
blimit1, limit1, + thresh1, bd); +} diff --git a/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c new file mode 100644 index 0000000000..9262d818e9 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/blend.h" + +static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, + const uint16_t *src, + const uint16_t *a, + const uint16_t *b, + const uint8_t *m) { + const uint16x8_t s0 = vld1q_u16(src); + const uint16x8_t a0 = vld1q_u16(a); + const uint16x8_t b0 = vld1q_u16(b); + const uint16x8_t m0 = vmovl_u8(vld1_u8(m)); + + uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, a0, b0); + + return vaddq_u16(sad, vabdq_u16(blend_u16, s0)); +} + +static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad, + const uint16_t *src, + const uint16_t *a, + const uint16_t *b, + const uint8_t *m) { + sad = masked_sad_8x1_neon(sad, src, a, b, m); + return masked_sad_8x1_neon(sad, &src[8], &a[8], &b[8], &m[8]); +} + +static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad, + const uint16_t *src, + const uint16_t *a, + const uint16_t *b, + const uint8_t *m) { + sad = masked_sad_16x1_neon(sad, src, a, b, m); + return masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]); +} + +static INLINE unsigned int masked_sad_128xh_large_neon( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, + int height) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + do { + uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + for (int h = 0; h < 4; ++h) { + sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m); + sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]); + sad[2] = masked_sad_32x1_neon(sad[2], &src[64], &a[64], &b[64], &m[64]); + sad[3] = masked_sad_32x1_neon(sad[3], &src[96], &a[96], &b[96], &m[96]); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } + + sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]); + sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]); + sad_u32[2] = vpadalq_u16(sad_u32[2], sad[2]); + sad_u32[3] = vpadalq_u16(sad_u32[3], sad[3]); + height -= 4; + } while (height != 0); + + sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[1]); + sad_u32[2] = vaddq_u32(sad_u32[2], sad_u32[3]); + sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[2]); + + return horizontal_add_u32x4(sad_u32[0]); +} + +static INLINE unsigned int masked_sad_64xh_large_neon( + const uint8_t *src8, int src_stride, const uint8_t *a8, int 
a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, + int height) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + do { + uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + for (int h = 0; h < 4; ++h) { + sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m); + sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } + + sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]); + sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]); + height -= 4; + } while (height != 0); + + return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1])); +} + +static INLINE unsigned int masked_sad_32xh_large_neon( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, + int height) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint32x4_t sad_u32 = vdupq_n_u32(0); + + do { + uint16x8_t sad = vdupq_n_u16(0); + for (int h = 0; h < 4; ++h) { + sad = masked_sad_32x1_neon(sad, src, a, b, m); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } + + sad_u32 = vpadalq_u16(sad_u32, sad); + height -= 4; + } while (height != 0); + + return horizontal_add_u32x4(sad_u32); +} + +static INLINE unsigned int masked_sad_16xh_large_neon( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, + int height) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint32x4_t sad_u32 = vdupq_n_u32(0); + + do { + uint16x8_t sad_u16 = vdupq_n_u16(0); + + for (int h = 0; h < 8; ++h) { + sad_u16 = masked_sad_16x1_neon(sad_u16, src, a, b, m); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } + + sad_u32 = vpadalq_u16(sad_u32, sad_u16); + height -= 8; + } while (height != 0); + + return horizontal_add_u32x4(sad_u32); +} + +#if !CONFIG_REALTIME_ONLY +static INLINE unsigned int masked_sad_8xh_large_neon( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, + int height) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint32x4_t sad_u32 = vdupq_n_u32(0); + + do { + uint16x8_t sad_u16 = vdupq_n_u16(0); + + for (int h = 0; h < 16; ++h) { + sad_u16 = masked_sad_8x1_neon(sad_u16, src, a, b, m); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } + + sad_u32 = vpadalq_u16(sad_u32, sad_u16); + height -= 16; + } while (height != 0); + + return horizontal_add_u32x4(sad_u32); +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE unsigned int masked_sad_16xh_small_neon( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, + int height) { + // For 12-bit data, we can only accumulate up to 128 elements in the + // uint16x8_t type sad accumulator, so we can only process up to 8 rows + // before we have to accumulate into 32-bit elements. 
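+  // With width 16 each of the 8 accumulator lanes gathers 2 absolute
+  // differences per row; over 8 rows that is 16 values of at most 4095,
+  // i.e. 16 * 4095 = 65520, which still fits in 16 bits.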
+ assert(height <= 8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint16x8_t sad = vdupq_n_u16(0); + + do { + sad = masked_sad_16x1_neon(sad, src, a, b, m); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } while (--height != 0); + + return horizontal_add_u16x8(sad); +} + +static INLINE unsigned int masked_sad_8xh_small_neon( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, + int height) { + // For 12-bit data, we can only accumulate up to 128 elements in the + // uint16x8_t type sad accumulator, so we can only process up to 16 rows + // before we have to accumulate into 32-bit elements. + assert(height <= 16); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint16x8_t sad = vdupq_n_u16(0); + + do { + sad = masked_sad_8x1_neon(sad, src, a, b, m); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } while (--height != 0); + + return horizontal_add_u16x8(sad); +} + +static INLINE unsigned int masked_sad_4xh_small_neon( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, + int height) { + // For 12-bit data, we can only accumulate up to 64 elements in the + // uint16x4_t type sad accumulator, so we can only process up to 16 rows + // before we have to accumulate into 32-bit elements. + assert(height <= 16); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + uint16x4_t sad = vdup_n_u16(0); + do { + uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(m))); + uint16x4_t a0 = load_unaligned_u16_4x1(a); + uint16x4_t b0 = load_unaligned_u16_4x1(b); + uint16x4_t s0 = load_unaligned_u16_4x1(src); + + uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, a0, b0); + + sad = vadd_u16(sad, vabd_u16(blend_u16, s0)); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } while (--height != 0); + + return horizontal_add_u16x4(sad); +} + +#define HIGHBD_MASKED_SAD_WXH_SMALL_NEON(w, h) \ + unsigned int aom_highbd_masked_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad_##w##xh_small_neon(src, src_stride, ref, ref_stride, \ + second_pred, w, msk, msk_stride, \ + h); \ + else \ + return masked_sad_##w##xh_small_neon(src, src_stride, second_pred, w, \ + ref, ref_stride, msk, msk_stride, \ + h); \ + } + +#define HIGHBD_MASKED_SAD_WXH_LARGE_NEON(w, h) \ + unsigned int aom_highbd_masked_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad_##w##xh_large_neon(src, src_stride, ref, ref_stride, \ + second_pred, w, msk, msk_stride, \ + h); \ + else \ + return masked_sad_##w##xh_large_neon(src, src_stride, second_pred, w, \ + ref, ref_stride, msk, msk_stride, \ + h); \ + } + +HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 4) +HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 8) + +HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 4) 
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 8) +HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 16) + +HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 8) +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 16) +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 32) + +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 16) +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 32) +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 64) + +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 32) +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 64) +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 128) + +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 64) +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 16) + +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(8, 32) + +HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 4) +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 64) + +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 8) + +HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c new file mode 100644 index 0000000000..28699e6f41 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref, + const int32_t *mask, + const int32_t *wsrc, + uint32x4_t *sum) { + int16x8_t ref_s16 = vreinterpretq_s16_u16(ref); + + int32x4_t wsrc_lo = vld1q_s32(wsrc); + int32x4_t wsrc_hi = vld1q_s32(wsrc + 4); + + int32x4_t mask_lo = vld1q_s32(mask); + int32x4_t mask_hi = vld1q_s32(mask + 4); + + int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi)); + + int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16)); + int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16)); + + uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo)); + uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi)); + + *sum = vrsraq_n_u32(*sum, abs_lo, 12); + *sum = vrsraq_n_u32(*sum, abs_hi, 12); +} + +static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, + int height) { + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + uint32x4_t sum = vdupq_n_u32(0); + + int h = height / 2; + do { + uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride); + + highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum); + + ref_ptr += 2 * ref_stride; + wsrc += 8; + mask += 8; + } while (--h != 0); + + return horizontal_add_u32x4(sum); +} + +static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, + int height) { + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + uint32x4_t sum = vdupq_n_u32(0); + + do { + uint16x8_t r = vld1q_u16(ref_ptr); + + highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum); + + ref_ptr += 
ref_stride; + wsrc += 8; + mask += 8; + } while (--height != 0); + + return horizontal_add_u32x4(sum); +} + +static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, + int width, int height) { + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + do { + int i = 0; + do { + uint16x8_t r0 = vld1q_u16(ref_ptr + i); + highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]); + + uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8); + highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]); + + wsrc += 16; + mask += 16; + i += 16; + } while (i < width); + + ref_ptr += ref_stride; + } while (--height != 0); + + return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, + int h) { + return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h); +} + +static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, + int height) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + + do { + uint16x8_t r0 = vld1q_u16(ref_ptr); + uint16x8_t r1 = vld1q_u16(ref_ptr + 8); + uint16x8_t r2 = vld1q_u16(ref_ptr + 16); + uint16x8_t r3 = vld1q_u16(ref_ptr + 24); + + highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]); + highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]); + highbd_obmc_sad_8x1_s16_neon(r2, mask + 16, wsrc + 16, &sum[2]); + highbd_obmc_sad_8x1_s16_neon(r3, mask + 24, wsrc + 24, &sum[3]); + + wsrc += 32; + mask += 32; + ref_ptr += ref_stride; + } while (--height != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + + return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2])); +} + +static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, + int h) { + return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h); +} + +static INLINE unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, + int h) { + return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h); +} + +#define HIGHBD_OBMC_SAD_WXH_NEON(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_neon( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \ + } + +HIGHBD_OBMC_SAD_WXH_NEON(4, 4) +HIGHBD_OBMC_SAD_WXH_NEON(4, 8) + +HIGHBD_OBMC_SAD_WXH_NEON(8, 4) +HIGHBD_OBMC_SAD_WXH_NEON(8, 8) +HIGHBD_OBMC_SAD_WXH_NEON(8, 16) + +HIGHBD_OBMC_SAD_WXH_NEON(16, 8) +HIGHBD_OBMC_SAD_WXH_NEON(16, 16) +HIGHBD_OBMC_SAD_WXH_NEON(16, 32) + +HIGHBD_OBMC_SAD_WXH_NEON(32, 16) +HIGHBD_OBMC_SAD_WXH_NEON(32, 32) +HIGHBD_OBMC_SAD_WXH_NEON(32, 64) + +HIGHBD_OBMC_SAD_WXH_NEON(64, 32) +HIGHBD_OBMC_SAD_WXH_NEON(64, 64) +HIGHBD_OBMC_SAD_WXH_NEON(64, 128) + +HIGHBD_OBMC_SAD_WXH_NEON(128, 64) +HIGHBD_OBMC_SAD_WXH_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +HIGHBD_OBMC_SAD_WXH_NEON(4, 16) + +HIGHBD_OBMC_SAD_WXH_NEON(8, 32) + +HIGHBD_OBMC_SAD_WXH_NEON(16, 4) +HIGHBD_OBMC_SAD_WXH_NEON(16, 64) + +HIGHBD_OBMC_SAD_WXH_NEON(32, 8) + +HIGHBD_OBMC_SAD_WXH_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY diff --git 
a/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c new file mode 100644 index 0000000000..d59224619b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre, + const int32_t *wsrc, + const int32_t *mask, + uint32x4_t *sse, + int32x4_t *sum) { + int16x8_t pre_s16 = vreinterpretq_s16_u16(pre); + int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]); + int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]); + + int32x4_t mask_lo = vld1q_s32(&mask[0]); + int32x4_t mask_hi = vld1q_s32(&mask[4]); + + int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi)); + + int32x4_t diff_lo = vmull_s16(vget_low_s16(pre_s16), vget_low_s16(mask_s16)); + int32x4_t diff_hi = + vmull_s16(vget_high_s16(pre_s16), vget_high_s16(mask_s16)); + + diff_lo = vsubq_s32(wsrc_lo, diff_lo); + diff_hi = vsubq_s32(wsrc_hi, diff_hi); + + // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away + // from zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. + // This difference only affects the bit patterns at the rounding breakpoints + // exactly, so we can add -1 to all negative numbers to move the breakpoint + // one value across and into the correct rounding region. + diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31); + diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31); + int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12); + int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12); + + *sum = vaddq_s32(*sum, round_lo); + *sum = vaddq_s32(*sum, round_hi); + *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_lo), + vreinterpretq_u32_s32(round_lo)); + *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_hi), + vreinterpretq_u32_s32(round_hi)); +} + +// For 12-bit data, we can only accumulate up to 256 elements in the unsigned +// 32-bit elements (4095*4095*256 = 4292870400) before we have to accumulate +// into 64-bit elements. Therefore blocks of size 32x64, 64x32, 64x64, 64x128, +// 128x64, 128x128 are processed in a different helper function. +static INLINE void highbd_obmc_variance_xlarge_neon( + const uint8_t *pre, int pre_stride, const int32_t *wsrc, + const int32_t *mask, int width, int h, int h_limit, uint64_t *sse, + int64_t *sum) { + uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); + int32x4_t sum_s32 = vdupq_n_s32(0); + uint64x2_t sse_u64 = vdupq_n_u64(0); + + // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit + // accumulator overflows. After hitting this limit we accumulate into 64-bit + // elements. + int h_tmp = h > h_limit ? 
h_limit : h; + + do { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + int j = 0; + + do { + int i = 0; + + do { + uint16x8_t pre0 = vld1q_u16(pre_ptr + i); + highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32[0], + &sum_s32); + + uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8); + highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32[1], + &sum_s32); + + i += 16; + wsrc += 16; + mask += 16; + } while (i < width); + + pre_ptr += pre_stride; + j++; + } while (j < h_tmp); + + sse_u64 = vpadalq_u32(sse_u64, sse_u32[0]); + sse_u64 = vpadalq_u32(sse_u64, sse_u32[1]); + h -= h_tmp; + } while (h != 0); + + *sse = horizontal_add_u64x2(sse_u64); + *sum = horizontal_long_add_s32x4(sum_s32); +} + +static INLINE void highbd_obmc_variance_xlarge_neon_128xh( + const uint8_t *pre, int pre_stride, const int32_t *wsrc, + const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { + highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 128, h, 16, sse, + sum); +} + +static INLINE void highbd_obmc_variance_xlarge_neon_64xh( + const uint8_t *pre, int pre_stride, const int32_t *wsrc, + const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { + highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 64, h, 32, sse, + sum); +} + +static INLINE void highbd_obmc_variance_xlarge_neon_32xh( + const uint8_t *pre, int pre_stride, const int32_t *wsrc, + const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { + highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 32, h, 64, sse, + sum); +} + +static INLINE void highbd_obmc_variance_large_neon( + const uint8_t *pre, int pre_stride, const int32_t *wsrc, + const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) { + uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); + uint32x4_t sse_u32 = vdupq_n_u32(0); + int32x4_t sum_s32 = vdupq_n_s32(0); + + do { + int i = 0; + do { + uint16x8_t pre0 = vld1q_u16(pre_ptr + i); + highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32, &sum_s32); + + uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8); + highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32, + &sum_s32); + + i += 16; + wsrc += 16; + mask += 16; + } while (i < width); + + pre_ptr += pre_stride; + } while (--h != 0); + + *sse = horizontal_long_add_u32x4(sse_u32); + *sum = horizontal_long_add_s32x4(sum_s32); +} + +static INLINE void highbd_obmc_variance_neon_128xh( + const uint8_t *pre, int pre_stride, const int32_t *wsrc, + const int32_t *mask, int h, uint64_t *sse, int64_t *sum) { + highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse, + sum); +} + +static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre, + int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + uint64_t *sse, int64_t *sum) { + highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum); +} + +static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre, + int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + uint64_t *sse, int64_t *sum) { + highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum); +} + +static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre, + int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + uint64_t *sse, int64_t *sum) { + highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum); +} + +static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8, + int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + uint64_t *sse, 
int64_t *sum) { + uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + uint32x4_t sse_u32 = vdupq_n_u32(0); + int32x4_t sum_s32 = vdupq_n_s32(0); + + do { + uint16x8_t pre_u16 = vld1q_u16(pre); + + highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32); + + pre += pre_stride; + wsrc += 8; + mask += 8; + } while (--h != 0); + + *sse = horizontal_long_add_u32x4(sse_u32); + *sum = horizontal_long_add_s32x4(sum_s32); +} + +static INLINE void highbd_obmc_variance_neon_4xh(const uint8_t *pre8, + int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + uint64_t *sse, int64_t *sum) { + assert(h % 2 == 0); + uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + uint32x4_t sse_u32 = vdupq_n_u32(0); + int32x4_t sum_s32 = vdupq_n_s32(0); + + do { + uint16x8_t pre_u16 = load_unaligned_u16_4x2(pre, pre_stride); + + highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32); + + pre += 2 * pre_stride; + wsrc += 8; + mask += 8; + h -= 2; + } while (h != 0); + + *sse = horizontal_long_add_u32x4(sse_u32); + *sum = horizontal_long_add_s32x4(sum_s32); +} + +static INLINE void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64, + int *sum, unsigned int *sse) { + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64, + int *sum, unsigned int *sse) { + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64, + int *sum, unsigned int *sse) { + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HIGHBD_OBMC_VARIANCE_WXH_NEON(w, h, bitdepth) \ + unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t sum64; \ + uint64_t sse64; \ + highbd_obmc_variance_neon_##w##xh(pre, pre_stride, wsrc, mask, h, &sse64, \ + &sum64); \ + highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \ + } + +#define HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(w, h, bitdepth) \ + unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t sum64; \ + uint64_t sse64; \ + highbd_obmc_variance_xlarge_neon_##w##xh(pre, pre_stride, wsrc, mask, h, \ + &sse64, &sum64); \ + highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \ + } + +// 8-bit +HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 8) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 8) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 8) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 8) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 8) 
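+// Note: each HIGHBD_OBMC_VARIANCE_WXH_NEON expansion above applies the usual
+// variance identity to the widened accumulators:
+//   variance = sse - (unsigned int)(((int64_t)sum * sum) / (w * h))
+// For 10-bit input the cast helper first rounds sum by 2 bits and sse by 4;
+// for 12-bit input by 4 and 8 bits, so the result stays on the 8-bit scale.
+// (Illustrative summary of the code above, not additional behaviour.)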
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 8) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 8) +HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 8) + +// 10-bit +HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 10) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 10) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 10) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 10) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 10) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 10) +HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 10) + +// 12-bit +HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 12) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 12) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 12) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 12) +HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 12) +HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(32, 64, 12) + +HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 12) +HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 32, 12) +HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 64, 12) +HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 128, 12) + +HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 64, 12) +HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 128, 12) diff --git a/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c b/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c new file mode 100644 index 0000000000..6149c9f13e --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> +#include <assert.h> +#include <string.h> + +#include "config/aom_config.h" + +#include "aom_dsp/quantize.h" + +static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) { +#if AOM_ARCH_AARCH64 + return vaddvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b)); + return (uint32_t)vget_lane_u64(c, 0); +#endif +} + +static INLINE uint16x4_t +quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, int32x4_t v_round_s32, int32x4_t v_zbin_s32, + int32x4_t v_quant_shift_s32, int log_scale) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + // if (abs_coeff < zbins[rc != 0]), + const uint32x4_t v_zbin_mask = vcgeq_s32(v_abs_coeff, v_zbin_s32); + const int32x4_t v_log_scale = vdupq_n_s32(log_scale); + // const int64_t tmp = (int64_t)abs_coeff + log_scaled_round; + const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32); + // const int32_t tmpw32 = tmp * wt; + const int32x4_t v_tmpw32 = vmulq_s32(v_tmp, vdupq_n_s32((1 << AOM_QM_BITS))); + // const int32_t tmp2 = (int32_t)((tmpw32 * quant64) >> 16); + const int32x4_t v_tmp2 = vqdmulhq_s32(v_tmpw32, v_quant_s32); + // const int32_t tmp3 = + // ((((tmp2 + tmpw32)<< log_scale) * (int64_t)(quant_shift << 15)) >> 32); + const int32x4_t v_tmp3 = vqdmulhq_s32( + vshlq_s32(vaddq_s32(v_tmp2, v_tmpw32), v_log_scale), v_quant_shift_s32); + // const int abs_qcoeff = vmask ? (int)tmp3 >> AOM_QM_BITS : 0; + const int32x4_t v_abs_qcoeff = vandq_s32(vreinterpretq_s32_u32(v_zbin_mask), + vshrq_n_s32(v_tmp3, AOM_QM_BITS)); + // const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant_iwt) >> log_scale; + // vshlq_s32 will shift right if shift value is negative. + const int32x4_t v_abs_dqcoeff = + vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale)); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Used to find eob.
+ const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0)); + return vmovn_u32(nz_qcoeff_mask); +} + +static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, + int16x8_t v_eobmax, + uint16x8_t v_mask) { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); + const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0)); + return vmaxq_s16(v_eobmax, v_nz_iscan); +} + +#if !CONFIG_REALTIME_ONLY +static INLINE void get_min_max_lane_eob(const int16_t *iscan, + int16x8_t *v_eobmin, + int16x8_t *v_eobmax, uint16x8_t v_mask, + intptr_t n_coeffs) { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_nz_iscan_max = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1)); +#if SKIP_EOB_FACTOR_ADJUST + const int16x8_t v_nz_iscan_min = + vbslq_s16(v_mask, v_iscan, vdupq_n_s16((int16_t)n_coeffs)); + *v_eobmin = vminq_s16(*v_eobmin, v_nz_iscan_min); +#else + (void)v_eobmin; +#endif + *v_eobmax = vmaxq_s16(*v_eobmax, v_nz_iscan_max); +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +#if AOM_ARCH_AARCH64 + return (uint16_t)vmaxvq_s16(v_eobmax); +#else + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + return (uint16_t)vget_lane_s16(v_eobmax_final, 0); +#endif +} + +#if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY +static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) { +#if AOM_ARCH_AARCH64 + return (uint16_t)vminvq_s16(v_eobmin); +#else + const int16x4_t v_eobmin_3210 = + vmin_s16(vget_low_s16(v_eobmin), vget_high_s16(v_eobmin)); + const int64x1_t v_eobmin_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmin_3210), 32); + const int16x4_t v_eobmin_tmp = + vmin_s16(v_eobmin_3210, vreinterpret_s16_s64(v_eobmin_xx32)); + const int64x1_t v_eobmin_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmin_tmp), 16); + const int16x4_t v_eobmin_final = + vmin_s16(v_eobmin_tmp, vreinterpret_s16_s64(v_eobmin_xxx3)); + return (uint16_t)vget_lane_s16(v_eobmin_final, 0); +#endif +} +#endif // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY + +static void highbd_quantize_b_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const int log_scale) { + (void)scan; + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_zero = vdup_n_s16(0); + const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero); + const int16x4_t v_round_no_scale = vld1_s16(round_ptr); + const int16x4_t v_round_log_scale = + vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); + const int16x4_t v_round = + vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale); + const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr); + const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr); + const int16x4_t v_zbin_log_scale = + 
vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale))); + const int16x4_t v_zbin = + vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale); + int32x4_t v_round_s32 = vmovl_s16(v_round); + int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15); + int32x4_t v_dequant_s32 = vmovl_s16(v_dequant); + int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15); + int32x4_t v_zbin_s32 = vmovl_s16(v_zbin); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + + intptr_t non_zero_count = n_coeffs; + + assert(n_coeffs > 8); + // Pre-scan pass + const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1); + intptr_t i = n_coeffs; + do { + const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4); + const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8); + const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a); + const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b); + const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_s32x); + const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_s32x); + // If the coefficient is in the base ZBIN range, then discard. + if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) { + non_zero_count -= 8; + } else { + break; + } + i -= 8; + } while (i > 0); + + const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count; + memset(qcoeff_ptr + non_zero_count, 0, + remaining_zcoeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr + non_zero_count, 0, + remaining_zcoeffs * sizeof(*dqcoeff_ptr)); + + // DC and first 3 AC + v_mask_lo = + quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32, + v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1); + v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1); + + // 4 more AC + v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, + v_quant_shift_s32, log_scale); + + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + intptr_t count = non_zero_count - 8; + for (; count > 0; count -= 8) { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, + v_dequant_s32, v_round_s32, v_zbin_s32, + v_quant_shift_s32, log_scale); + v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, + v_quant_shift_s32, log_scale); + // Find the max lane eob for 8 coeffs. 
+ v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + } + + *eob_ptr = get_max_eob(v_eobmax); +} + +void aom_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, 0); +} + +void aom_highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, 1); +} + +void aom_highbd_quantize_b_64x64_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, 2); +} + +#if !CONFIG_REALTIME_ONLY +static void highbd_quantize_b_adaptive_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const int log_scale) { + (void)scan; + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_zero = vdup_n_s16(0); + const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero); + const int16x4_t v_round_no_scale = vld1_s16(round_ptr); + const int16x4_t v_round_log_scale = + vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); + const int16x4_t v_round = + vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale); + const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr); + const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr); + const int16x4_t v_zbin_log_scale = + vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale))); + const int16x4_t v_zbin = + vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale); + int32x4_t v_round_s32 = vmovl_s16(v_round); + int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15); + int32x4_t v_dequant_s32 = vmovl_s16(v_dequant); + int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15); + int32x4_t v_zbin_s32 = vmovl_s16(v_zbin); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs); + + assert(n_coeffs > 8); + // Pre-scan pass + const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1); + const int prescan_add_1 = + ROUND_POWER_OF_TWO(dequant_ptr[1] * EOB_FACTOR, 7 + AOM_QM_BITS); + const int32x4_t v_zbin_prescan = + 
vaddq_s32(v_zbin_s32x, vdupq_n_s32(prescan_add_1)); + intptr_t non_zero_count = n_coeffs; + intptr_t i = n_coeffs; + do { + const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4); + const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8); + const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a); + const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b); + const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_prescan); + const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_prescan); + // If the coefficient is in the base ZBIN range, then discard. + if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) { + non_zero_count -= 8; + } else { + break; + } + i -= 8; + } while (i > 0); + + const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count; + memset(qcoeff_ptr + non_zero_count, 0, + remaining_zcoeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr + non_zero_count, 0, + remaining_zcoeffs * sizeof(*dqcoeff_ptr)); + + // DC and first 3 AC + v_mask_lo = + quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32, + v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1); + v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1); + + // 4 more AC + v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, + v_quant_shift_s32, log_scale); + + get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax, + vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs); + + intptr_t count = non_zero_count - 8; + for (; count > 0; count -= 8) { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, + v_dequant_s32, v_round_s32, v_zbin_s32, + v_quant_shift_s32, log_scale); + v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32, + v_quant_shift_s32, log_scale); + + get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax, + vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs); + } + + int eob = get_max_eob(v_eobmax); + +#if SKIP_EOB_FACTOR_ADJUST + const int first = get_min_eob(v_eobmin); + if (eob >= 0 && first == eob) { + const int rc = scan[eob]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const qm_val_t wt = (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + eob = -1; + } + } + } +#endif // SKIP_EOB_FACTOR_ADJUST + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_adaptive_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t 
*scan, const int16_t *iscan) { + highbd_quantize_b_adaptive_neon( + coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); +} + +void aom_highbd_quantize_b_32x32_adaptive_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + highbd_quantize_b_adaptive_neon( + coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); +} + +void aom_highbd_quantize_b_64x64_adaptive_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + highbd_quantize_b_adaptive_neon( + coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 2); +} +#endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/aom/aom_dsp/arm/highbd_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_sad_neon.c new file mode 100644 index 0000000000..d51f639de6 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_sad_neon.c @@ -0,0 +1,509 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + sum = vabal_u16(sum, s, r); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(sum); +} + +static INLINE uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + sum = vabaq_u16(sum, s, r); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_u16x8(sum); +} + +#if !CONFIG_REALTIME_ONLY +static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t sum_u16 = vabdq_u16(s, r); + sum_u32 = vpadalq_u16(sum_u32, sum_u16); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(sum_u32); +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0 = vld1q_u16(src16_ptr); + uint16x8_t r0 = vld1q_u16(ref16_ptr); + uint16x8_t diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + uint16x8_t s1 = vld1q_u16(src16_ptr + 8); + uint16x8_t r1 = vld1q_u16(ref16_ptr + 8); + uint16x8_t diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_u32x4(sum[0]); +} + +static INLINE uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src16_ptr + j); + uint16x8_t r0 = vld1q_u16(ref16_ptr + j); + uint16x8_t diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + uint16x8_t s1 = vld1q_u16(src16_ptr + j + 8); + uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8); + uint16x8_t diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + 
uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16); + uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16); + uint16x8_t diff2 = vabdq_u16(s2, r2); + sum[2] = vpadalq_u16(sum[2], diff2); + + uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24); + uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24); + uint16x8_t diff3 = vabdq_u16(s3, r3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_u32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, + h); +} + +static INLINE unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, + h); +} + +static INLINE unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, + h); +} + +#define HBD_SAD_WXH_SMALL_NEON(w, h) \ + unsigned int aom_highbd_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad##w##xh_small_neon(src, src_stride, ref, ref_stride, \ + (h)); \ + } + +#define HBD_SAD_WXH_LARGE_NEON(w, h) \ + unsigned int aom_highbd_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad##w##xh_large_neon(src, src_stride, ref, ref_stride, \ + (h)); \ + } + +HBD_SAD_WXH_SMALL_NEON(4, 4) +HBD_SAD_WXH_SMALL_NEON(4, 8) + +HBD_SAD_WXH_SMALL_NEON(8, 4) +HBD_SAD_WXH_SMALL_NEON(8, 8) +HBD_SAD_WXH_SMALL_NEON(8, 16) + +HBD_SAD_WXH_LARGE_NEON(16, 8) +HBD_SAD_WXH_LARGE_NEON(16, 16) +HBD_SAD_WXH_LARGE_NEON(16, 32) + +HBD_SAD_WXH_LARGE_NEON(32, 16) +HBD_SAD_WXH_LARGE_NEON(32, 32) +HBD_SAD_WXH_LARGE_NEON(32, 64) + +HBD_SAD_WXH_LARGE_NEON(64, 32) +HBD_SAD_WXH_LARGE_NEON(64, 64) +HBD_SAD_WXH_LARGE_NEON(64, 128) + +HBD_SAD_WXH_LARGE_NEON(128, 64) +HBD_SAD_WXH_LARGE_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SAD_WXH_SMALL_NEON(4, 16) + +HBD_SAD_WXH_LARGE_NEON(8, 32) + +HBD_SAD_WXH_LARGE_NEON(16, 4) +HBD_SAD_WXH_LARGE_NEON(16, 64) + +HBD_SAD_WXH_LARGE_NEON(32, 8) + +HBD_SAD_WXH_LARGE_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#define HBD_SAD_SKIP_WXH_SMALL_NEON(w, h) \ + unsigned int aom_highbd_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad##w##xh_small_neon(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +#define HBD_SAD_SKIP_WXH_LARGE_NEON(w, h) \ + unsigned int aom_highbd_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad##w##xh_large_neon(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +HBD_SAD_SKIP_WXH_SMALL_NEON(4, 4) +HBD_SAD_SKIP_WXH_SMALL_NEON(4, 8) + +HBD_SAD_SKIP_WXH_SMALL_NEON(8, 4) +HBD_SAD_SKIP_WXH_SMALL_NEON(8, 8) +HBD_SAD_SKIP_WXH_SMALL_NEON(8, 16) + +HBD_SAD_SKIP_WXH_LARGE_NEON(16, 8) +HBD_SAD_SKIP_WXH_LARGE_NEON(16, 16) +HBD_SAD_SKIP_WXH_LARGE_NEON(16, 32) + +HBD_SAD_SKIP_WXH_LARGE_NEON(32, 16) 
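+// The _skip_ wrappers above estimate the full-block SAD from the even rows
+// only: the row stride is doubled, half the rows are visited, and the result
+// is scaled by two. Equivalent scalar sketch (sad_wxh is a notional helper,
+// illustrative only):
+//   return 2 * sad_wxh(src, 2 * src_stride, ref, 2 * ref_stride, h / 2);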
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 32) +HBD_SAD_SKIP_WXH_LARGE_NEON(32, 64) + +HBD_SAD_SKIP_WXH_LARGE_NEON(64, 32) +HBD_SAD_SKIP_WXH_LARGE_NEON(64, 64) +HBD_SAD_SKIP_WXH_LARGE_NEON(64, 128) + +HBD_SAD_SKIP_WXH_LARGE_NEON(128, 64) +HBD_SAD_SKIP_WXH_LARGE_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SAD_SKIP_WXH_SMALL_NEON(4, 16) + +HBD_SAD_SKIP_WXH_SMALL_NEON(8, 32) + +HBD_SAD_SKIP_WXH_LARGE_NEON(16, 4) +HBD_SAD_SKIP_WXH_LARGE_NEON(16, 64) + +HBD_SAD_SKIP_WXH_LARGE_NEON(32, 8) + +HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + uint16x4_t p = vld1_u16(pred16_ptr); + + uint16x4_t avg = vrhadd_u16(r, p); + sum = vabal_u16(sum, s, avg); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 4; + } while (--i != 0); + + return horizontal_add_u32x4(sum); +} + +static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t p = vld1q_u16(pred16_ptr); + + uint16x8_t avg = vrhaddq_u16(r, p); + uint16x8_t diff = vabdq_u16(s, avg); + sum = vpadalq_u16(sum, diff); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 8; + } while (--i != 0); + + return horizontal_add_u32x4(sum); +} + +static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1, p0, p1; + uint16x8_t avg0, avg1, diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + p0 = vld1q_u16(pred16_ptr); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + p1 = vld1q_u16(pred16_ptr + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 16; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_u32x4(sum[0]); +} + +static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[4] = 
{ vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + p0 = vld1q_u16(pred16_ptr + j); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + p1 = vld1q_u16(pred16_ptr + j + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + p2 = vld1q_u16(pred16_ptr + j + 16); + avg2 = vrhaddq_u16(r2, p2); + diff2 = vabdq_u16(s2, avg2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + p3 = vld1q_u16(pred16_ptr + j + 24); + avg3 = vrhaddq_u16(r3, p3); + diff3 = vabdq_u16(s3, avg3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += w; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_u32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad128xh_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128, + h, second_pred); +} + +static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred); +} + +static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +#define HBD_SAD_WXH_AVG_NEON(w, h) \ + uint32_t aom_highbd_sad##w##x##h##_avg_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +HBD_SAD_WXH_AVG_NEON(4, 4) +HBD_SAD_WXH_AVG_NEON(4, 8) + +HBD_SAD_WXH_AVG_NEON(8, 4) +HBD_SAD_WXH_AVG_NEON(8, 8) +HBD_SAD_WXH_AVG_NEON(8, 16) + +HBD_SAD_WXH_AVG_NEON(16, 8) +HBD_SAD_WXH_AVG_NEON(16, 16) +HBD_SAD_WXH_AVG_NEON(16, 32) + +HBD_SAD_WXH_AVG_NEON(32, 16) +HBD_SAD_WXH_AVG_NEON(32, 32) +HBD_SAD_WXH_AVG_NEON(32, 64) + +HBD_SAD_WXH_AVG_NEON(64, 32) +HBD_SAD_WXH_AVG_NEON(64, 64) +HBD_SAD_WXH_AVG_NEON(64, 128) + +HBD_SAD_WXH_AVG_NEON(128, 64) +HBD_SAD_WXH_AVG_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SAD_WXH_AVG_NEON(4, 16) + +HBD_SAD_WXH_AVG_NEON(8, 32) + +HBD_SAD_WXH_AVG_NEON(16, 4) +HBD_SAD_WXH_AVG_NEON(16, 64) + +HBD_SAD_WXH_AVG_NEON(32, 8) + +HBD_SAD_WXH_AVG_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c b/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c new file mode 100644 index 0000000000..85ca6732a8 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c @@ -0,0 +1,617 @@ +/* + * Copyright (c) 2023 The WebM project authors. 
All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); + uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); + uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); + uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); + uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride); + + sum[0] = vabal_u16(sum[0], s, r0); + sum[1] = vabal_u16(sum[1], s, r1); + sum[2] = vabal_u16(sum[2], s, r2); + sum[3] = vabal_u16(sum[3], s, r3); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum)); +} + +static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + + sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); + sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); + sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); + sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride)); + + } while (++i < h); + + sum_u32[0] = vpaddlq_u16(sum[0]); + sum_u32[1] = vpaddlq_u16(sum[1]); + sum_u32[2] = vpaddlq_u16(sum[2]); + sum_u32[3] = vpaddlq_u16(sum[3]); + vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); +} + +#if !CONFIG_REALTIME_ONLY +static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = 
CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]); + sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]); + sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]); + sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum)); +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]); + + uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum)); +} + +static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int w, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]); + + uint16x8_t s1 
= vld1q_u16(src16_ptr + i * src_stride + j + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]); + + uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); + sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), + &sum_lo[0]); + sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), + &sum_lo[1]); + sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), + &sum_lo[2]); + sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16), + &sum_lo[3]); + + uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); + sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), + &sum_hi[0]); + sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), + &sum_hi[1]); + sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), + &sum_hi[2]); + sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24), + &sum_hi[3]); + + j += 32; + } while (j < w); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum)); +} + +static INLINE void highbd_sad128xhx4d_large_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], int h) { + highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, + 128, h); +} + +static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, + h); +} + +static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, + h); +} + +#define HBD_SAD_WXH_4D_SMALL_NEON(w, h) \ + void aom_highbd_sad##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +#define HBD_SAD_WXH_4D_LARGE_NEON(w, h) \ + void aom_highbd_sad##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +HBD_SAD_WXH_4D_SMALL_NEON(4, 4) +HBD_SAD_WXH_4D_SMALL_NEON(4, 8) + +HBD_SAD_WXH_4D_SMALL_NEON(8, 4) +HBD_SAD_WXH_4D_SMALL_NEON(8, 8) +HBD_SAD_WXH_4D_SMALL_NEON(8, 16) + +HBD_SAD_WXH_4D_LARGE_NEON(16, 8) +HBD_SAD_WXH_4D_LARGE_NEON(16, 16) +HBD_SAD_WXH_4D_LARGE_NEON(16, 32) + +HBD_SAD_WXH_4D_LARGE_NEON(32, 16) +HBD_SAD_WXH_4D_LARGE_NEON(32, 32) +HBD_SAD_WXH_4D_LARGE_NEON(32, 64) + +HBD_SAD_WXH_4D_LARGE_NEON(64, 32) +HBD_SAD_WXH_4D_LARGE_NEON(64, 64) +HBD_SAD_WXH_4D_LARGE_NEON(64, 128) + +HBD_SAD_WXH_4D_LARGE_NEON(128, 64) +HBD_SAD_WXH_4D_LARGE_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SAD_WXH_4D_SMALL_NEON(4, 16) + +HBD_SAD_WXH_4D_LARGE_NEON(8, 32) + +HBD_SAD_WXH_4D_LARGE_NEON(16, 4) 
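+// Each x4d kernel above evaluates four candidate references in a single pass:
+// every source row is loaded once and accumulated against ref_array[0..3] in
+// four independent accumulators. Equivalent scalar sketch (sad_wxh is a
+// notional helper, illustrative only):
+//   for (int k = 0; k < 4; k++)
+//     sad_array[k] = sad_wxh(src, src_stride, ref_array[k], ref_stride, h);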
+HBD_SAD_WXH_4D_LARGE_NEON(16, 64) + +HBD_SAD_WXH_4D_LARGE_NEON(32, 8) + +HBD_SAD_WXH_4D_LARGE_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h) \ + void aom_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array, \ + 2 * ref_stride, sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h) \ + void aom_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array, \ + 2 * ref_stride, sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4) +HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8) + +HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4) +HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8) +HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16) + +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8) +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16) +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32) + +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16) +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32) +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64) + +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32) +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64) +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128) + +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64) +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16) + +HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32) + +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4) +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64) + +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8) + +HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +static INLINE void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + + uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); + uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); + uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); + uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); + + sum[0] = vabal_u16(sum[0], s, r0); + sum[1] = vabal_u16(sum[1], s, r1); + sum[2] = vabal_u16(sum[2], s, r2); + + } while (++i < h); + + res[0] = horizontal_add_u32x4(sum[0]); + res[1] = horizontal_add_u32x4(sum[1]); + res[2] = horizontal_add_u32x4(sum[2]); +} + +static INLINE void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + + uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + + 
sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); + sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); + sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); + + } while (++i < h); + + res[0] = horizontal_add_u32x4(vpaddlq_u16(sum[0])); + res[1] = horizontal_add_u32x4(vpaddlq_u16(sum[1])); + res[2] = horizontal_add_u32x4(vpaddlq_u16(sum[2])); +} + +#if !CONFIG_REALTIME_ONLY +static INLINE void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + + uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + uint16x8_t r0 = vld1q_u16(ref16_ptr0 + i * ref_stride); + uint16x8_t r1 = vld1q_u16(ref16_ptr1 + i * ref_stride); + uint16x8_t r2 = vld1q_u16(ref16_ptr2 + i * ref_stride); + + sad8_neon(s, r0, &sum[0]); + sad8_neon(s, r1, &sum[1]); + sad8_neon(s, r2, &sum[2]); + + } while (++i < h); + + res[0] = horizontal_add_u32x4(sum[0]); + res[1] = horizontal_add_u32x4(sum[1]); + res[2] = horizontal_add_u32x4(sum[2]); +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + + uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); + + uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); + + } while (++i < h); + + res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0])); + res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1])); + res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2])); +} + +static INLINE void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int w, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + + uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + uint32x4_t sum[3]; + + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j); + 
sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); + + uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); + + uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); + sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), + &sum_lo[0]); + sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), + &sum_lo[1]); + sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), + &sum_lo[2]); + + uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); + sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), + &sum_hi[0]); + sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), + &sum_hi[1]); + sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), + &sum_hi[2]); + + j += 32; + } while (j < w); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + + res[0] = horizontal_add_u32x4(sum[0]); + res[1] = horizontal_add_u32x4(sum[1]); + res[2] = horizontal_add_u32x4(sum[2]); +} + +static INLINE void highbd_sad128xhx3d_large_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], int h) { + highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, + 128, h); +} + +static INLINE void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, + h); +} + +static INLINE void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, + h); +} + +#define HBD_SAD_WXH_3D_SMALL_NEON(w, h) \ + void aom_highbd_sad##w##x##h##x3d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +#define HBD_SAD_WXH_3D_LARGE_NEON(w, h) \ + void aom_highbd_sad##w##x##h##x3d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +HBD_SAD_WXH_3D_SMALL_NEON(4, 4) +HBD_SAD_WXH_3D_SMALL_NEON(4, 8) + +HBD_SAD_WXH_3D_SMALL_NEON(8, 4) +HBD_SAD_WXH_3D_SMALL_NEON(8, 8) +HBD_SAD_WXH_3D_SMALL_NEON(8, 16) + +HBD_SAD_WXH_3D_LARGE_NEON(16, 8) +HBD_SAD_WXH_3D_LARGE_NEON(16, 16) +HBD_SAD_WXH_3D_LARGE_NEON(16, 32) + +HBD_SAD_WXH_3D_LARGE_NEON(32, 16) +HBD_SAD_WXH_3D_LARGE_NEON(32, 32) +HBD_SAD_WXH_3D_LARGE_NEON(32, 64) + +HBD_SAD_WXH_3D_LARGE_NEON(64, 32) +HBD_SAD_WXH_3D_LARGE_NEON(64, 64) +HBD_SAD_WXH_3D_LARGE_NEON(64, 128) + +HBD_SAD_WXH_3D_LARGE_NEON(128, 64) +HBD_SAD_WXH_3D_LARGE_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SAD_WXH_3D_SMALL_NEON(4, 16) + +HBD_SAD_WXH_3D_LARGE_NEON(8, 32) + 
+HBD_SAD_WXH_3D_LARGE_NEON(16, 4) +HBD_SAD_WXH_3D_LARGE_NEON(16, 64) + +HBD_SAD_WXH_3D_LARGE_NEON(32, 8) + +HBD_SAD_WXH_3D_LARGE_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/aom/aom_dsp/arm/highbd_sse_neon.c b/third_party/aom/aom_dsp/arm/highbd_sse_neon.c new file mode 100644 index 0000000000..184e9f9bef --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_sse_neon.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src, + const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi); +} + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); +} + +static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[16]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); + highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]); +
highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]); + highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]); + highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]); + highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]); + highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_u32x4_x16(sse); +} + +static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_u32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_u32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[4]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + 
highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_u32x4_x4(sse); +} + +static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2]; + highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_u32x4_x2(sse); +} + +static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + // Peel the first loop iteration. + uint16x4_t s = vld1_u16(src); + uint16x4_t r = vld1_u16(ref); + + uint16x4_t abs_diff = vabd_u16(s, r); + uint32x4_t sse = vmull_u16(abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + s = vld1_u16(src); + r = vld1_u16(ref); + + abs_diff = vabd_u16(s, r); + sse = vmlal_u16(sse, abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_u32x4(sse); +} + +static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + // { 0, 1, 2, 3, 4, 5, 6, 7 } + uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100)); + uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7)); + uint64_t sse = 0; + + do { + int w = width; + int offset = 0; + + do { + uint16x8_t s = vld1q_u16(src + offset); + uint16x8_t r = vld1q_u16(ref + offset); + + if (w < 8) { + // Mask out-of-range elements. + s = vandq_u16(s, remainder_mask); + r = vandq_u16(r, remainder_mask); + } + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + uint32x4_t sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo); + sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi); + + sse += horizontal_long_add_u32x4(sse_u32); + + offset += 8; + w -= 8; + } while (w > 0); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return sse; +} + +int64_t aom_highbd_sse_neon(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: + return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: + return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); + case 128: + return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_sse_sve.c b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c new file mode 100644 index 0000000000..b267da5cfb --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint64x2_t *sse) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + + *sse = aom_udotq_u16(*sse, abs_diff, abs_diff); +} + +static INLINE int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[3]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + sse[0] = vaddq_u64(sse[0], sse[1]); + sse[2] = vaddq_u64(sse[2], sse[3]); + sse[0] = vaddq_u64(sse[0], sse[2]); + return vaddvq_u64(sse[0]); +} + +static INLINE int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + sse[0] = vaddq_u64(sse[0], sse[1]); + sse[2] = vaddq_u64(sse[2], sse[3]); + sse[0] = vaddq_u64(sse[0], sse[2]); + return vaddvq_u64(sse[0]); +} + +static INLINE int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); + + src += src_stride; + ref +=
ref_stride; + } while (--height != 0); + + sse[0] = vaddq_u64(sse[0], sse[1]); + sse[2] = vaddq_u64(sse[2], sse[3]); + sse[0] = vaddq_u64(sse[0], sse[2]); + return vaddvq_u64(sse[0]); +} + +static INLINE int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return vaddvq_u64(vaddq_u64(sse[0], sse[1])); +} + +static INLINE int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * src_stride, ref + 0 * ref_stride, &sse[0]); + highbd_sse_8x1_neon(src + 1 * src_stride, ref + 1 * ref_stride, &sse[1]); + + src += 2 * src_stride; + ref += 2 * ref_stride; + height -= 2; + } while (height != 0); + + return vaddvq_u64(vaddq_u64(sse[0], sse[1])); +} + +static INLINE int64_t highbd_sse_4xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse = vdupq_n_u64(0); + + do { + uint16x8_t s = load_unaligned_u16_4x2(src, src_stride); + uint16x8_t r = load_unaligned_u16_4x2(ref, ref_stride); + + uint16x8_t abs_diff = vabdq_u16(s, r); + sse = aom_udotq_u16(sse, abs_diff, abs_diff); + + src += 2 * src_stride; + ref += 2 * ref_stride; + height -= 2; + } while (height != 0); + + return vaddvq_u64(sse); +} + +static INLINE int64_t highbd_sse_wxh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + svuint64_t sse = svdup_n_u64(0); + uint64_t step = svcnth(); + + do { + int w = 0; + const uint16_t *src_ptr = src; + const uint16_t *ref_ptr = ref; + + do { + svbool_t pred = svwhilelt_b16_u32(w, width); + svuint16_t s = svld1_u16(pred, src_ptr); + svuint16_t r = svld1_u16(pred, ref_ptr); + + svuint16_t abs_diff = svabd_u16_z(pred, s, r); + + sse = svdot_u64(sse, abs_diff, abs_diff); + + src_ptr += step; + ref_ptr += step; + w += step; + } while (w < width); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return svaddv_u64(svptrue_b64(), sse); +} + +int64_t aom_highbd_sse_sve(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: return highbd_sse_4xh_sve(src, src_stride, ref, ref_stride, height); + case 8: return highbd_sse_8xh_sve(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_sve(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_sve(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_sve(src, src_stride, ref, ref_stride, height); + case 128: + return highbd_sse_128xh_sve(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_sve(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c new file mode 100644 index 0000000000..686fa5f226 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c @@ -0,0 +1,1497 @@ +/* + * Copyright (c) 2023 The WebM project authors. 
All Rights Reserved. + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/dist_wtd_avg_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/variance.h" + +// The bilinear filters look like this: +// +// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, +// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }} +// +// We can factor out the highest common divisor, such that the sum of both +// weights will be 8 instead of 128. The benefits of this are two-fold: +// +// 1) We can infer the filter values from the filter_offset parameter in the +// bilinear filter functions below - we don't have to actually load the values +// from memory: +// f0 = 8 - filter_offset +// f1 = filter_offset +// +// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on +// 16-bit data types at all times, rather than widening out to 32-bit and +// requiring double the number of data processing instructions. (12-bit * 8 = +// 15-bit.) + +// Process a block exactly 4 wide and any height. +static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr); + uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, blend); + + src_ptr += src_stride; + dst_ptr += 4; + } while (--i != 0); +} + +// Process a block which is a multiple of 8 and any height.
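// Editorial sketch, not part of the upstream patch: a scalar reference model of
// the factored bilinear blend used by the 4-wide helper above and the
// multiple-of-8 helpers below. The name scalar_bilinear_ref and the assumption
// that filter_offset is the 3-bit sub-pel phase in [0, 8) are illustrative only.
static uint16_t scalar_bilinear_ref(uint16_t s0, uint16_t s1,
                                    int filter_offset) {
  const int f0 = 8 - filter_offset;  // e.g. phase 3 -> f0 = 5 (i.e. 80 / 16)
  const int f1 = filter_offset;      //                 f1 = 3 (i.e. 48 / 16)
  // The rounding shift by 3 mirrors vrshr_n_u16(blend, 3):
  // round((f0 * s0 + f1 * s1) / 8).
  return (uint16_t)((s0 * f0 + s1 * f1 + 4) >> 3);
}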
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, blend); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 8, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 16, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 32, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 64, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 128, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
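// Editorial note, not part of the upstream patch: this helper serves the
// half-pel case. With filter_offset == 4 the factored weights are equal
// (f0 == f1 == 4), so round((4 * s0 + 4 * s1) / 8) reduces to the rounding
// average (s0 + s1 + 1) >> 1, which is exactly what vrhaddq_u16 computes below.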
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + vst1q_u16(dst_ptr + j, avg); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ + h); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ + src_stride, h, yoffset); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ + xoffset); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + 
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +// 10-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16) + +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +// 12-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16) + 
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +// Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having +// width 4. +static void highbd_avg_pred_var_filter_block2d_bil_w4( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr); + uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step); + uint16x4_t p = vld1_u16(second_pred); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, vrhadd_u16(blend, p)); + + src_ptr += src_stride; + dst_ptr += 4; + second_pred += 4; + } while (--i != 0); +} + +// Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks. +static void highbd_avg_pred_var_filter_block2d_bil_large( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p)); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w8( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 8, dst_height, + filter_offset, second_pred); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w16( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w32( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w64( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w128( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, 
int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 128, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with aom_highbd_comp_avg_pred. +static void highbd_avg_pred_var_filter_block2d_avg( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + + uint16x8_t p = vld1q_u16(second_pred); + avg = vrhaddq_u16(avg, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16. +static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, + int src_stride, int dst_width, int dst_height, + const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t avg = vrhaddq_u16(s, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + uint16_t tmp[w * h]; \ + if (yoffset == 0) { \ + highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp, source_stride, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp, source_stride, source_stride, h, yoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return 
aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp0, source_stride, 1, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp0, source_stride, 1, h, xoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +// 10-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) 
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +// 12-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int \ + aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * (h + 1)]; \ + uint16_t tmp2[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \ + h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp2), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int \ + 
aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + if (xoffset == 0) { \ + uint16_t tmp0[w * h]; \ + if (yoffset == 0) { \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \ + w, h, src, src_stride, msk, msk_stride, \ + invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride, \ + w, h); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ + w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, \ + src_stride, h, yoffset); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ + w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ + w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + uint16_t tmp2[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ + w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + uint16_t tmp2[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ + w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ + } \ + } else { \ + if (yoffset == 0) { \ + uint16_t tmp0[w * h]; \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ + xoffset); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ + w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t tmp2[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ 
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ + w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * (h + 1)]; \ + uint16_t tmp2[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ + w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16) + +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +// 10-bit +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16) + +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +// 12-bit +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) 
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16) + +HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4) +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8) + +HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#if !CONFIG_REALTIME_ONLY +#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int \ + aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \ + xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \ + } + +#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int \ + aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + pre, pre_stride, wsrc, mask, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \ + h); \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \ + } else { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride, \ + pre_stride, h, yoffset); \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h); \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + 
CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h, \ + xoffset); \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \ + h + 1, xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \ + h + 1, xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \ + } \ + } \ + } + +// 8-bit +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16) + +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32) + +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64) + +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) + +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128) + +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128) + +// 10-bit +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16) + +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32) + +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64) + +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) + +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) 
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128) + +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128) + +// 12-bit +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16) + +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32) + +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4) +HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64) + +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) + +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128) + +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64) +SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128) +#endif // !CONFIG_REALTIME_ONLY + +static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, + int src_stride, int dst_width, + int dst_height, + const uint16_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + // We only specialise on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset); + const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t avg = dist_wtd_avg_u16x8(s, p, fwd_offset, bck_offset); + + vst1q_u16(dst_ptr + j, avg); + + second_pred += 8; + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint16_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + // We only specialise on the filter values for large block sizes (>= 16x16.) 
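+  // In scalar terms each output pixel is, roughly (a sketch; the exact
+  // rounding is that of vrhaddq_u16 and dist_wtd_avg_u16x8):
+  //   avg    = ROUND_POWER_OF_TWO(src[i] + src[i + pixel_step], 1);
+  //   dst[i] = (avg * fwd_offset + second_pred[i] * bck_offset) >>
+  //            DIST_PRECISION_BITS;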
+ assert(dst_width >= 16 && dst_width % 16 == 0); + const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset); + const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + uint16x8_t avg = vrhaddq_u16(s0, s1); + avg = dist_wtd_avg_u16x8(avg, p, fwd_offset, bck_offset); + + vst1q_u16(dst_ptr + j, avg); + + second_pred += 8; + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint16x4_t fwd_offset = vdup_n_u16(jcp_param->fwd_offset); + const uint16x4_t bck_offset = vdup_n_u16(jcp_param->bck_offset); + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr); + uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step); + uint16x4_t p = vld1_u16(second_pred); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + uint16x4_t avg = dist_wtd_avg_u16x4(blend, p, fwd_offset, bck_offset); + + vst1_u16(dst_ptr, avg); + + src_ptr += src_stride; + dst_ptr += 4; + second_pred += 4; + } while (--i != 0); +} + +// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks. +static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset); + const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset); + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + uint16x8_t avg = dist_wtd_avg_u16x8(blend, p, fwd_offset, bck_offset); + + vst1q_u16(dst_ptr + j, avg); + + second_pred += 8; + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large( + src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset, + second_pred, jcp_param); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16. 
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large( + src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset, + second_pred, jcp_param); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32. +static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large( + src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset, + second_pred, jcp_param); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64. +static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large( + src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset, + second_pred, jcp_param); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128. +static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large( + src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset, + second_pred, jcp_param); +} + +#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int \ + aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \ + xoffset); \ + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \ + } + +#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int \ + aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \ + if (xoffset == 0) { \ + uint16_t tmp[w * h]; \ + if (yoffset == 0) { \ + highbd_dist_wtd_avg_pred(src, tmp, source_stride, w, h, second, \ + jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \ + } else if (yoffset == 4) { \ + 
highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \ + src, tmp, source_stride, source_stride, w, h, second, jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \ + } else { \ + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp, source_stride, source_stride, h, yoffset, second, \ + jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \ + src, tmp0, source_stride, 1, w, h, second, jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \ + highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \ + h, second, jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \ + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp0, source_stride, 1, h, xoffset, second, jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \ + xoffset); \ + highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \ + h, second, jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \ + xoffset); \ + highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64) 
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16) + +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +// 10-bit +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16) + +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16) +#endif // !CONFIG_REALTIME_ONLY + +// 12-bit +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128) + +#if !CONFIG_REALTIME_ONLY +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16) + +HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4) +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8) + +SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16) +#endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_neon.c 
b/third_party/aom/aom_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 0000000000..18b8efff4c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            uint64_t *sse, int64_t *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_s32 = vdupq_n_s32(0);
+
+  int i = h;
+  do {
+    const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride);
+    const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+    int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+    sum_s16 = vaddq_s16(sum_s16, diff);
+
+    sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+    sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  *sum = horizontal_add_s16x8(sum_s16);
+  *sse = horizontal_add_s32x4(sse_s32);
+}
+
+// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all
+// block sizes can be processed in 32-bit elements (1023*1023*128*32 =
+// 4286582784 for a 128x128 block).
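+// Per int32 lane the worst case is 1023 * 1023 * (128 * 128 / 8) =
+// 2143291392, which still fits in a signed 32-bit lane; the final reduction
+// below then widens the per-lane totals to 64 bits.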
+static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + int i = h; + do { + int j = 0; + do { + const uint16x8_t s = vld1q_u16(src_ptr + j); + const uint16x8_t r = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_s32x4(sum_s32); + *sse = horizontal_long_add_u32x4(vaddq_u32( + vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1]))); +} + +static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum); +} + +static INLINE void highbd_variance_16xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum); +} + +static INLINE void highbd_variance_32xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +static INLINE void highbd_variance_128xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 128, h, sse, + sum); +} + +// For 12-bit data, we can only accumulate up to 128 elements in the sum of +// squares (4095*4095*128 = 2146435200), and because we're using two int32x4 +// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128) +// or 16 64-element rows before we have to accumulate into 64-bit elements. +// Therefore blocks of size 32x64, 64x32, 64x64, 64x128, 128x64, 128x128 are +// processed in a different helper function. + +// Process a block of any size where the width is divisible by 8, with +// accumulation into 64-bit elements. +static INLINE void highbd_variance_xlarge_neon( + const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, + int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit + // accumulator overflows. After hitting this limit we accumulate into 64-bit + // elements. + int h_tmp = h > h_limit ? 
h_limit : h; + + int i = 0; + do { + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + do { + int j = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + j); + const uint16x8_t r0 = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]); + sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]); + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_s32x4(sum_s32); + *sse = (uint64_t)horizontal_add_s64x2(sse_s64); +} + +static INLINE void highbd_variance_32xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse, + sum); +} + +static INLINE void highbd_variance_64xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse, + sum); +} + +static INLINE void highbd_variance_128xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 128, h, 8, sse, + sum); +} + +#define HBD_VARIANCE_WXH_8_NEON(w, h) \ + uint32_t aom_highbd_8_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } + +#define HBD_VARIANCE_WXH_10_NEON(w, h) \ + uint32_t aom_highbd_10_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +#define HBD_VARIANCE_WXH_12_NEON(w, h) \ + uint32_t aom_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \ + uint32_t aom_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_NEON(4, 4) +HBD_VARIANCE_WXH_8_NEON(4, 8) + +HBD_VARIANCE_WXH_8_NEON(8, 4) +HBD_VARIANCE_WXH_8_NEON(8, 8) +HBD_VARIANCE_WXH_8_NEON(8, 16) + +HBD_VARIANCE_WXH_8_NEON(16, 8) +HBD_VARIANCE_WXH_8_NEON(16, 16) +HBD_VARIANCE_WXH_8_NEON(16, 32) + +HBD_VARIANCE_WXH_8_NEON(32, 16) +HBD_VARIANCE_WXH_8_NEON(32, 32) +HBD_VARIANCE_WXH_8_NEON(32, 64) + +HBD_VARIANCE_WXH_8_NEON(64, 32) +HBD_VARIANCE_WXH_8_NEON(64, 64) +HBD_VARIANCE_WXH_8_NEON(64, 128) + +HBD_VARIANCE_WXH_8_NEON(128, 64) +HBD_VARIANCE_WXH_8_NEON(128, 128) + +// 10-bit +HBD_VARIANCE_WXH_10_NEON(4, 4) +HBD_VARIANCE_WXH_10_NEON(4, 8) + +HBD_VARIANCE_WXH_10_NEON(8, 4) +HBD_VARIANCE_WXH_10_NEON(8, 8) +HBD_VARIANCE_WXH_10_NEON(8, 16) + +HBD_VARIANCE_WXH_10_NEON(16, 8) +HBD_VARIANCE_WXH_10_NEON(16, 16) +HBD_VARIANCE_WXH_10_NEON(16, 32) + +HBD_VARIANCE_WXH_10_NEON(32, 16) +HBD_VARIANCE_WXH_10_NEON(32, 32) +HBD_VARIANCE_WXH_10_NEON(32, 64) + +HBD_VARIANCE_WXH_10_NEON(64, 32) +HBD_VARIANCE_WXH_10_NEON(64, 64) +HBD_VARIANCE_WXH_10_NEON(64, 128) + +HBD_VARIANCE_WXH_10_NEON(128, 64) +HBD_VARIANCE_WXH_10_NEON(128, 128) + +// 12-bit +HBD_VARIANCE_WXH_12_NEON(4, 4) +HBD_VARIANCE_WXH_12_NEON(4, 8) + +HBD_VARIANCE_WXH_12_NEON(8, 4) +HBD_VARIANCE_WXH_12_NEON(8, 8) +HBD_VARIANCE_WXH_12_NEON(8, 16) + +HBD_VARIANCE_WXH_12_NEON(16, 8) +HBD_VARIANCE_WXH_12_NEON(16, 16) +HBD_VARIANCE_WXH_12_NEON(16, 32) + +HBD_VARIANCE_WXH_12_NEON(32, 16) +HBD_VARIANCE_WXH_12_NEON(32, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64) + +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 128) + +HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 64) +HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +// 8-bit +HBD_VARIANCE_WXH_8_NEON(4, 16) + +HBD_VARIANCE_WXH_8_NEON(8, 32) + +HBD_VARIANCE_WXH_8_NEON(16, 4) +HBD_VARIANCE_WXH_8_NEON(16, 64) + +HBD_VARIANCE_WXH_8_NEON(32, 8) + +HBD_VARIANCE_WXH_8_NEON(64, 16) + +// 10-bit +HBD_VARIANCE_WXH_10_NEON(4, 16) + +HBD_VARIANCE_WXH_10_NEON(8, 32) + +HBD_VARIANCE_WXH_10_NEON(16, 4) +HBD_VARIANCE_WXH_10_NEON(16, 64) + 
+HBD_VARIANCE_WXH_10_NEON(32, 8) + +HBD_VARIANCE_WXH_10_NEON(64, 16) + +// 12-bit +HBD_VARIANCE_WXH_12_NEON(4, 16) + +HBD_VARIANCE_WXH_12_NEON(8, 32) + +HBD_VARIANCE_WXH_12_NEON(16, 4) +HBD_VARIANCE_WXH_12_NEON(16, 64) + +HBD_VARIANCE_WXH_12_NEON(32, 8) + +HBD_VARIANCE_WXH_12_NEON(64, 16) + +#endif // !CONFIG_REALTIME_ONLY + +static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + unsigned int *sse) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u32[0] = + vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff)); + sse_u32[1] = + vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return *sse; +} + +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t aom_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON(16, 16) +HIGHBD_MSE_WXH_NEON(16, 8) +HIGHBD_MSE_WXH_NEON(8, 16) +HIGHBD_MSE_WXH_NEON(8, 8) + +#undef HIGHBD_MSE_WXH_NEON + +static INLINE uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0, + uint16x8_t s1, uint16x8_t d0, + uint16x8_t d1) { + uint16x8_t e0 = vabdq_u16(s0, d0); + uint16x8_t e1 = vabdq_u16(s1, d1); + + uint32x4_t mse = vmull_u16(vget_low_u16(e0), vget_low_u16(e0)); + mse = vmlal_u16(mse, vget_high_u16(e0), vget_high_u16(e0)); + mse = vmlal_u16(mse, vget_low_u16(e1), vget_low_u16(e1)); + mse = vmlal_u16(mse, vget_high_u16(e1), vget_high_u16(e1)); + + return vpadalq_u32(sum, mse); +} + +uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int w, + int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4)); + + uint64x2_t sum = vdupq_n_u64(0); + + if (w == 8) { + do { + uint16x8_t d0 = vld1q_u16(dst + 0 * dstride); + uint16x8_t d1 = vld1q_u16(dst + 1 * dstride); + uint16x8_t s0 = vld1q_u16(src + 0 * sstride); + uint16x8_t s1 = vld1q_u16(src + 1 * sstride); + + sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1); + + dst += 2 * dstride; + src += 2 * sstride; + h -= 2; + } while (h != 0); + } else { // w == 4 + do { + uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride); + uint16x8_t d1 = 
load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+      uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+      uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+      sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+      dst += 4 * dstride;
+      src += 4 * sstride;
+      h -= 4;
+    } while (h != 0);
+  }
+
+  return horizontal_add_u64x2(sum);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c b/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000000..d56ae97571
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+                                                    int src_stride,
+                                                    const uint16_t *ref_ptr,
+                                                    int ref_stride, int h,
+                                                    unsigned int *sse) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h / 2;
+  do {
+    uint16x8_t s0 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    uint16x8_t s1 = vld1q_u16(src_ptr);
+    src_ptr += src_stride;
+    uint16x8_t r0 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+    uint16x8_t r1 = vld1q_u16(ref_ptr);
+    ref_ptr += ref_stride;
+
+    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    uint8x16_t diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+  } while (--i != 0);
+
+  *sse = horizontal_add_u32x4(sse_u32);
+  return *sse;
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+                                                     int src_stride,
+                                                     const uint16_t *ref_ptr,
+                                                     int ref_stride, int h,
+                                                     unsigned int *sse) {
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint16x8_t s0 = vld1q_u16(src_ptr);
+    uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+    uint16x8_t r0 = vld1q_u16(ref_ptr);
+    uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+    uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+    uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+    uint8x16_t diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  *sse = horizontal_add_u32x4(sse_u32);
+  return *sse;
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h)                                  \
+  uint32_t aom_highbd_8_mse##w##x##h##_neon_dotprod(                       \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,      \
+      int ref_stride, uint32_t *sse) {                                     \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);                          \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);                          \
+    highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h,  \
+                                     sse);                                 \
+    return *sse;                                                           \
+  }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
new file mode 100644
index 0000000000..d0058bfa90
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/variance.h"
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int h,
+                                           uint64_t *sse, int64_t *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  do {
+    const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride);
+    const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+    int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+    sum_s16 = vaddq_s16(sum_s16, diff);
+
+    sse_s64 = aom_sdotq_s16(sse_s64, diff, diff);
+
+    src_ptr += 2 * src_stride;
+    ref_ptr += 2 * ref_stride;
+    h -= 2;
+  } while (h != 0);
+
+  *sum = vaddlvq_s16(sum_s16);
+  *sse = vaddvq_s64(sse_s64);
+}
+
+static INLINE void variance_8x1_sve(const uint16_t *src, const uint16_t *ref,
+                                    int32x4_t *sum, int64x2_t *sse) {
+  const uint16x8_t s = vld1q_u16(src);
+  const uint16x8_t r = vld1q_u16(ref);
+
+  const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+  *sum = vpadalq_s16(*sum, diff);
+
+  *sse = aom_sdotq_s16(*sse, diff, diff);
+}
+
+static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+                                           int src_stride,
+                                           const uint16_t *ref_ptr,
+                                           int ref_stride, int h,
+                                           uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32 = vdupq_n_s32(0);
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  do {
+    variance_8x1_sve(src_ptr, ref_ptr, &sum_s32, &sse_s64);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  *sum = vaddlvq_s32(sum_s32);
+  *sse = vaddvq_s64(sse_s64);
+}
+
+static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+                                            int src_stride,
+                                            const uint16_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+  int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+  do {
+    variance_8x1_sve(src_ptr, ref_ptr, &sum_s32[0], &sse_s64[0]);
+    variance_8x1_sve(src_ptr + 8, ref_ptr + 8, &sum_s32[1], &sse_s64[1]);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--h != 0);
+
+  *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[1]));
+  *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1]));
+}
+
+static INLINE void highbd_variance_large_sve(const uint16_t *src_ptr,
+                                             int src_stride,
+                                             const uint16_t *ref_ptr,
+                                             int ref_stride, int w, int h,
+                                             uint64_t *sse, int64_t *sum) {
+  int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+                           vdupq_n_s32(0) };
+  int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+                           vdupq_n_s64(0) };
+
+  do {
+    int j = 0;
+    do {
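+      // Process 32 pixels per inner iteration (w is a multiple of 32 for all
+      // callers), spread across four independent accumulator pairs.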
variance_8x1_sve(src_ptr + j, ref_ptr + j, &sum_s32[0], &sse_s64[0]); + variance_8x1_sve(src_ptr + j + 8, ref_ptr + j + 8, &sum_s32[1], + &sse_s64[1]); + variance_8x1_sve(src_ptr + j + 16, ref_ptr + j + 16, &sum_s32[2], + &sse_s64[2]); + variance_8x1_sve(src_ptr + j + 24, ref_ptr + j + 24, &sum_s32[3], + &sse_s64[3]); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); + sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]); + *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[2])); + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); + sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]); + *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[2])); +} + +static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +static INLINE void highbd_variance_128xh_sve(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_sve(src, src_stride, ref, ref_stride, 128, h, sse, sum); +} + +#define HBD_VARIANCE_WXH_8_SVE(w, h) \ + uint32_t aom_highbd_8_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } + +#define HBD_VARIANCE_WXH_10_SVE(w, h) \ + uint32_t aom_highbd_10_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HBD_VARIANCE_WXH_12_SVE(w, h) \ + uint32_t aom_highbd_12_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_SVE(4, 4) +HBD_VARIANCE_WXH_8_SVE(4, 8) + +HBD_VARIANCE_WXH_8_SVE(8, 4) +HBD_VARIANCE_WXH_8_SVE(8, 8) +HBD_VARIANCE_WXH_8_SVE(8, 16) + +HBD_VARIANCE_WXH_8_SVE(16, 8) +HBD_VARIANCE_WXH_8_SVE(16, 16) +HBD_VARIANCE_WXH_8_SVE(16, 32) + +HBD_VARIANCE_WXH_8_SVE(32, 16) +HBD_VARIANCE_WXH_8_SVE(32, 32) +HBD_VARIANCE_WXH_8_SVE(32, 64) + +HBD_VARIANCE_WXH_8_SVE(64, 32) +HBD_VARIANCE_WXH_8_SVE(64, 64) +HBD_VARIANCE_WXH_8_SVE(64, 128) + +HBD_VARIANCE_WXH_8_SVE(128, 64) +HBD_VARIANCE_WXH_8_SVE(128, 128) + +// 10-bit +HBD_VARIANCE_WXH_10_SVE(4, 4) +HBD_VARIANCE_WXH_10_SVE(4, 8) + +HBD_VARIANCE_WXH_10_SVE(8, 4) +HBD_VARIANCE_WXH_10_SVE(8, 8) +HBD_VARIANCE_WXH_10_SVE(8, 16) + +HBD_VARIANCE_WXH_10_SVE(16, 8) +HBD_VARIANCE_WXH_10_SVE(16, 16) +HBD_VARIANCE_WXH_10_SVE(16, 32) + +HBD_VARIANCE_WXH_10_SVE(32, 16) +HBD_VARIANCE_WXH_10_SVE(32, 32) +HBD_VARIANCE_WXH_10_SVE(32, 64) + +HBD_VARIANCE_WXH_10_SVE(64, 32) +HBD_VARIANCE_WXH_10_SVE(64, 64) +HBD_VARIANCE_WXH_10_SVE(64, 128) + +HBD_VARIANCE_WXH_10_SVE(128, 64) +HBD_VARIANCE_WXH_10_SVE(128, 128) + +// 12-bit +HBD_VARIANCE_WXH_12_SVE(4, 4) +HBD_VARIANCE_WXH_12_SVE(4, 8) + +HBD_VARIANCE_WXH_12_SVE(8, 4) +HBD_VARIANCE_WXH_12_SVE(8, 8) +HBD_VARIANCE_WXH_12_SVE(8, 16) + +HBD_VARIANCE_WXH_12_SVE(16, 8) +HBD_VARIANCE_WXH_12_SVE(16, 16) +HBD_VARIANCE_WXH_12_SVE(16, 32) + +HBD_VARIANCE_WXH_12_SVE(32, 16) +HBD_VARIANCE_WXH_12_SVE(32, 32) +HBD_VARIANCE_WXH_12_SVE(32, 64) + +HBD_VARIANCE_WXH_12_SVE(64, 32) +HBD_VARIANCE_WXH_12_SVE(64, 64) +HBD_VARIANCE_WXH_12_SVE(64, 128) + +HBD_VARIANCE_WXH_12_SVE(128, 64) +HBD_VARIANCE_WXH_12_SVE(128, 128) + +#if !CONFIG_REALTIME_ONLY +// 8-bit +HBD_VARIANCE_WXH_8_SVE(4, 16) + +HBD_VARIANCE_WXH_8_SVE(8, 32) + +HBD_VARIANCE_WXH_8_SVE(16, 4) +HBD_VARIANCE_WXH_8_SVE(16, 64) + +HBD_VARIANCE_WXH_8_SVE(32, 8) + +HBD_VARIANCE_WXH_8_SVE(64, 16) + +// 10-bit +HBD_VARIANCE_WXH_10_SVE(4, 16) + +HBD_VARIANCE_WXH_10_SVE(8, 32) + +HBD_VARIANCE_WXH_10_SVE(16, 4) +HBD_VARIANCE_WXH_10_SVE(16, 64) + +HBD_VARIANCE_WXH_10_SVE(32, 8) + +HBD_VARIANCE_WXH_10_SVE(64, 16) + +// 12-bit +HBD_VARIANCE_WXH_12_SVE(4, 16) + +HBD_VARIANCE_WXH_12_SVE(8, 32) + +HBD_VARIANCE_WXH_12_SVE(16, 4) +HBD_VARIANCE_WXH_12_SVE(16, 64) + +HBD_VARIANCE_WXH_12_SVE(32, 8) + +HBD_VARIANCE_WXH_12_SVE(64, 16) + +#endif // !CONFIG_REALTIME_ONLY + +#undef HBD_VARIANCE_WXH_8_SVE +#undef HBD_VARIANCE_WXH_10_SVE +#undef HBD_VARIANCE_WXH_12_SVE + +static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + unsigned int *sse) { + uint64x2_t sse_u64 = vdupq_n_u64(0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u64 = aom_udotq_u16(sse_u64, diff, diff); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sse = (uint32_t)vaddvq_u64(sse_u64); + return *sse; +} + +#define HIGHBD_MSE_WXH_SVE(w, h) \ + uint32_t aom_highbd_8_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_10_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + 
uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_SVE(16, 16) +HIGHBD_MSE_WXH_SVE(16, 8) +HIGHBD_MSE_WXH_SVE(8, 16) +HIGHBD_MSE_WXH_SVE(8, 8) + +#undef HIGHBD_MSE_WXH_SVE + +uint64_t aom_mse_wxh_16bit_highbd_sve(uint16_t *dst, int dstride, uint16_t *src, + int sstride, int w, int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4)); + + uint64x2_t sum = vdupq_n_u64(0); + + if (w == 8) { + do { + uint16x8_t d0 = vld1q_u16(dst + 0 * dstride); + uint16x8_t d1 = vld1q_u16(dst + 1 * dstride); + uint16x8_t s0 = vld1q_u16(src + 0 * sstride); + uint16x8_t s1 = vld1q_u16(src + 1 * sstride); + + uint16x8_t abs_diff0 = vabdq_u16(s0, d0); + uint16x8_t abs_diff1 = vabdq_u16(s1, d1); + + sum = aom_udotq_u16(sum, abs_diff0, abs_diff0); + sum = aom_udotq_u16(sum, abs_diff1, abs_diff1); + + dst += 2 * dstride; + src += 2 * sstride; + h -= 2; + } while (h != 0); + } else { // w == 4 + do { + uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride); + uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride); + uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride); + uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride); + + uint16x8_t abs_diff0 = vabdq_u16(s0, d0); + uint16x8_t abs_diff1 = vabdq_u16(s1, d1); + + sum = aom_udotq_u16(sum, abs_diff0, abs_diff0); + sum = aom_udotq_u16(sum, abs_diff1, abs_diff1); + + dst += 4 * dstride; + src += 4 * sstride; + h -= 4; + } while (h != 0); + } + + return vaddvq_u64(sum); +} diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c new file mode 100644 index 0000000000..d8dc60c1fe --- /dev/null +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -0,0 +1,3110 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/reinterpret_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/intrapred_common.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
+  const uint8x8_t a = load_u8_4x1(in);
+  const uint16x4_t p0 = vpaddl_u8(a);
+  const uint16x4_t p1 = vpadd_u16(p0, p0);
+  return vcombine_u16(p1, vdup_n_u16(0));
+}
+
+static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+                                uint8x8_t dc) {
+  for (int i = 0; i < h; ++i) {
+    store_u8_4x1(dst + i * stride, dc);
+  }
+}
+
+void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t sum_top = dc_load_sum_4(above);
+  const uint16x8_t sum_left = dc_load_sum_4(left);
+  const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
+  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t sum_left = dc_load_sum_4(left);
+  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
+  (void)above;
+  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const uint16x8_t sum_top = dc_load_sum_4(above);
+  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
+  (void)left;
+  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  const uint8x8_t dc0 = vdup_n_u8(0x80);
+  (void)above;
+  (void)left;
+  dc_store_4xh(dst, stride, 4, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE uint16x8_t dc_load_sum_8(const uint8_t *in) {
+  // This isn't used in the case where we want to load both above and left
+  // vectors, since we want to avoid performing the reduction twice.
+  const uint8x8_t a = vld1_u8(in);
+  const uint16x4_t p0 = vpaddl_u8(a);
+  const uint16x4_t p1 = vpadd_u16(p0, p0);
+  const uint16x4_t p2 = vpadd_u16(p1, p1);
+  return vcombine_u16(p2, vdup_n_u16(0));
+}
+
+static INLINE uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
+#if AOM_ARCH_AARCH64
+  // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
+  // instruction, however the addv instruction is usually slightly more
+  // expensive than a pairwise addition, so the need for immediately
+  // broadcasting the result again seems to negate any benefit.
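+  // Three pairwise additions take the lanes from individual values, to sums
+  // of pairs, to sums of fours, and finally to the full total replicated in
+  // every lane.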
+ const uint16x8_t b = vpaddq_u16(a, a); + const uint16x8_t c = vpaddq_u16(b, b); + return vpaddq_u16(c, c); +#else + const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a)); + const uint16x4_t c = vpadd_u16(b, b); + const uint16x4_t d = vpadd_u16(c, c); + return vcombine_u16(d, d); +#endif +} + +static INLINE void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h, + uint8x8_t dc) { + for (int i = 0; i < h; ++i) { + vst1_u8(dst + i * stride, dc); + } +} + +void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t sum_top = vld1_u8(above); + const uint8x8_t sum_left = vld1_u8(left); + uint16x8_t sum = vaddl_u8(sum_left, sum_top); + sum = horizontal_add_and_broadcast_u16x8(sum); + const uint8x8_t dc0 = vrshrn_n_u16(sum, 4); + dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0)); +} + +void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint16x8_t sum_left = dc_load_sum_8(left); + const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3); + (void)above; + dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0)); +} + +void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint16x8_t sum_top = dc_load_sum_8(above); + const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3); + (void)left; + dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0)); +} + +void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc0 = vdup_n_u8(0x80); + (void)above; + (void)left; + dc_store_8xh(dst, stride, 8, dc0); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in) { + const uint8x16_t a = vld1q_u8(in); + // delay the remainder of the reduction until + // horizontal_add_and_broadcast_u16x8, since we want to do it once rather + // than twice in the case we are loading both above and left. 
+ return vpaddlq_u8(a); +} + +static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) { + return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in)); +} + +static INLINE void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h, + uint8x16_t dc) { + for (int i = 0; i < h; ++i) { + vst1q_u8(dst + i * stride, dc); + } +} + +void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint16x8_t sum_top = dc_load_partial_sum_16(above); + const uint16x8_t sum_left = dc_load_partial_sum_16(left); + uint16x8_t sum = vaddq_u16(sum_left, sum_top); + sum = horizontal_add_and_broadcast_u16x8(sum); + const uint8x8_t dc0 = vrshrn_n_u16(sum, 5); + dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0)); +} + +void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16x8_t sum_left = dc_load_sum_16(left); + const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4); + (void)above; + dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0)); +} + +void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16x8_t sum_top = dc_load_sum_16(above); + const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4); + (void)left; + dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0)); +} + +void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint8x16_t dc0 = vdupq_n_u8(0x80); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 16, dc0); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in) { + const uint8x16_t a0 = vld1q_u8(in); + const uint8x16_t a1 = vld1q_u8(in + 16); + // delay the remainder of the reduction until + // horizontal_add_and_broadcast_u16x8, since we want to do it once rather + // than twice in the case we are loading both above and left. 
+ return vpadalq_u8(vpaddlq_u8(a0), a1); +} + +static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) { + return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in)); +} + +static INLINE void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h, + uint8x16_t dc) { + for (int i = 0; i < h; ++i) { + vst1q_u8(dst + i * stride, dc); + vst1q_u8(dst + i * stride + 16, dc); + } +} + +void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint16x8_t sum_top = dc_load_partial_sum_32(above); + const uint16x8_t sum_left = dc_load_partial_sum_32(left); + uint16x8_t sum = vaddq_u16(sum_left, sum_top); + sum = horizontal_add_and_broadcast_u16x8(sum); + const uint8x8_t dc0 = vrshrn_n_u16(sum, 6); + dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0)); +} + +void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16x8_t sum_left = dc_load_sum_32(left); + const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5); + (void)above; + dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0)); +} + +void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16x8_t sum_top = dc_load_sum_32(above); + const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5); + (void)left; + dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0)); +} + +void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint8x16_t dc0 = vdupq_n_u8(0x80); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 32, dc0); +} + +//------------------------------------------------------------------------------ +// DC 64x64 + +static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in) { + const uint8x16_t a0 = vld1q_u8(in); + const uint8x16_t a1 = vld1q_u8(in + 16); + const uint8x16_t a2 = vld1q_u8(in + 32); + const uint8x16_t a3 = vld1q_u8(in + 48); + const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1); + const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3); + // delay the remainder of the reduction until + // horizontal_add_and_broadcast_u16x8, since we want to do it once rather + // than twice in the case we are loading both above and left. 
+ return vaddq_u16(p01, p23); +} + +static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) { + return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in)); +} + +static INLINE void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h, + uint8x16_t dc) { + for (int i = 0; i < h; ++i) { + vst1q_u8(dst + i * stride, dc); + vst1q_u8(dst + i * stride + 16, dc); + vst1q_u8(dst + i * stride + 32, dc); + vst1q_u8(dst + i * stride + 48, dc); + } +} + +void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint16x8_t sum_top = dc_load_partial_sum_64(above); + const uint16x8_t sum_left = dc_load_partial_sum_64(left); + uint16x8_t sum = vaddq_u16(sum_left, sum_top); + sum = horizontal_add_and_broadcast_u16x8(sum); + const uint8x8_t dc0 = vrshrn_n_u16(sum, 7); + dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0)); +} + +void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16x8_t sum_left = dc_load_sum_64(left); + const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6); + (void)above; + dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0)); +} + +void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint16x8_t sum_top = dc_load_sum_64(above); + const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6); + (void)left; + dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0)); +} + +void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const uint8x16_t dc0 = vdupq_n_u8(0x80); + (void)above; + (void)left; + dc_store_64xh(dst, stride, 64, dc0); +} + +//------------------------------------------------------------------------------ +// DC rectangular cases + +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier, int shift2) { + const int interm = num >> shift1; + return interm * multiplier >> shift2; +} + +static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum, + int shift1, int multiplier) { + const int expected_dc = divide_using_multiply_shift( + sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2); + assert(expected_dc < (1 << 8)); + return expected_dc; +} + +#undef DC_SHIFT2 + +void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a = load_u8_4x1(above); + uint8x8_t l = vld1_u8(left); + uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l)); + uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2); + dc_store_4xh(dst, stride, 8, vdup_n_u8(dc)); +} + +void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a = vld1_u8(above); + uint8x8_t l = load_u8_4x1(left); + uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l)); + uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2); + dc_store_8xh(dst, stride, 4, vdup_n_u8(dc)); +} + +void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a = load_u8_4x1(above); + uint8x16_t l = vld1q_u8(left); + uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4); + dc_store_4xh(dst, stride, 16, vdup_n_u8(dc)); +} + +void 
aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a = vld1q_u8(above); + uint8x8_t l = load_u8_4x1(left); + uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4); + dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc)); +} + +void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a = vld1_u8(above); + uint8x16_t l = vld1q_u8(left); + uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2); + dc_store_8xh(dst, stride, 16, vdup_n_u8(dc)); +} + +void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a = vld1q_u8(above); + uint8x8_t l = vld1_u8(left); + uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2); + dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc)); +} + +void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a = vld1_u8(above); + uint16x8_t sum_left = dc_load_partial_sum_32(left); + uint16x8_t sum_al = vaddw_u8(sum_left, a); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4); + dc_store_8xh(dst, stride, 32, vdup_n_u8(dc)); +} + +void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint16x8_t sum_top = dc_load_partial_sum_32(above); + uint8x8_t l = vld1_u8(left); + uint16x8_t sum_al = vaddw_u8(sum_top, l); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4); + dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc)); +} + +void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint16x8_t sum_above = dc_load_partial_sum_16(above); + uint16x8_t sum_left = dc_load_partial_sum_32(left); + uint16x8_t sum_al = vaddq_u16(sum_left, sum_above); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2); + dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc)); +} + +void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint16x8_t sum_above = dc_load_partial_sum_32(above); + uint16x8_t sum_left = dc_load_partial_sum_16(left); + uint16x8_t sum_al = vaddq_u16(sum_left, sum_above); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2); + dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc)); +} + +void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint16x8_t sum_above = dc_load_partial_sum_16(above); + uint16x8_t sum_left = dc_load_partial_sum_64(left); + uint16x8_t sum_al = vaddq_u16(sum_left, sum_above); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4); + dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc)); +} + +void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint16x8_t sum_above = 
dc_load_partial_sum_64(above); + uint16x8_t sum_left = dc_load_partial_sum_16(left); + uint16x8_t sum_al = vaddq_u16(sum_above, sum_left); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4); + dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc)); +} + +void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint16x8_t sum_above = dc_load_partial_sum_32(above); + uint16x8_t sum_left = dc_load_partial_sum_64(left); + uint16x8_t sum_al = vaddq_u16(sum_above, sum_left); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2); + dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc)); +} + +void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint16x8_t sum_above = dc_load_partial_sum_64(above); + uint16x8_t sum_left = dc_load_partial_sum_32(left); + uint16x8_t sum_al = vaddq_u16(sum_above, sum_left); + uint32_t sum = horizontal_add_u16x8(sum_al); + uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2); + dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc)); +} + +#undef DC_MULTIPLIER_1X2 +#undef DC_MULTIPLIER_1X4 + +#define DC_PREDICTOR_128(w, h, q) \ + void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \ + const uint8_t *above, \ + const uint8_t *left) { \ + (void)above; \ + (void)left; \ + dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80)); \ + } + +DC_PREDICTOR_128(4, 8, ) +DC_PREDICTOR_128(4, 16, ) +DC_PREDICTOR_128(8, 4, ) +DC_PREDICTOR_128(8, 16, ) +DC_PREDICTOR_128(8, 32, ) +DC_PREDICTOR_128(16, 4, q) +DC_PREDICTOR_128(16, 8, q) +DC_PREDICTOR_128(16, 32, q) +DC_PREDICTOR_128(16, 64, q) +DC_PREDICTOR_128(32, 8, q) +DC_PREDICTOR_128(32, 16, q) +DC_PREDICTOR_128(32, 64, q) +DC_PREDICTOR_128(64, 32, q) +DC_PREDICTOR_128(64, 16, q) + +#undef DC_PREDICTOR_128 + +#define DC_PREDICTOR_LEFT(w, h, shift, q) \ + void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \ + const uint8_t *above, \ + const uint8_t *left) { \ + (void)above; \ + const uint16x8_t sum = dc_load_sum_##h(left); \ + const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \ + dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \ + } + +DC_PREDICTOR_LEFT(4, 8, 3, ) +DC_PREDICTOR_LEFT(8, 4, 2, ) +DC_PREDICTOR_LEFT(8, 16, 4, ) +DC_PREDICTOR_LEFT(16, 8, 3, q) +DC_PREDICTOR_LEFT(16, 32, 5, q) +DC_PREDICTOR_LEFT(32, 16, 4, q) +DC_PREDICTOR_LEFT(32, 64, 6, q) +DC_PREDICTOR_LEFT(64, 32, 5, q) +DC_PREDICTOR_LEFT(4, 16, 4, ) +DC_PREDICTOR_LEFT(16, 4, 2, q) +DC_PREDICTOR_LEFT(8, 32, 5, ) +DC_PREDICTOR_LEFT(32, 8, 3, q) +DC_PREDICTOR_LEFT(16, 64, 6, q) +DC_PREDICTOR_LEFT(64, 16, 4, q) + +#undef DC_PREDICTOR_LEFT + +#define DC_PREDICTOR_TOP(w, h, shift, q) \ + void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \ + const uint8_t *above, \ + const uint8_t *left) { \ + (void)left; \ + const uint16x8_t sum = dc_load_sum_##w(above); \ + const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \ + dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \ + } + +DC_PREDICTOR_TOP(4, 8, 2, ) +DC_PREDICTOR_TOP(4, 16, 2, ) +DC_PREDICTOR_TOP(8, 4, 3, ) +DC_PREDICTOR_TOP(8, 16, 3, ) +DC_PREDICTOR_TOP(8, 32, 3, ) +DC_PREDICTOR_TOP(16, 4, 4, q) +DC_PREDICTOR_TOP(16, 8, 4, q) +DC_PREDICTOR_TOP(16, 32, 4, q) +DC_PREDICTOR_TOP(16, 64, 4, q) +DC_PREDICTOR_TOP(32, 8, 5, q) +DC_PREDICTOR_TOP(32, 16, 5, q) +DC_PREDICTOR_TOP(32, 64, 5, q) 
+DC_PREDICTOR_TOP(64, 16, 6, q) +DC_PREDICTOR_TOP(64, 32, 6, q) + +#undef DC_PREDICTOR_TOP + +// ----------------------------------------------------------------------------- + +static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h, + uint8x8_t d0) { + for (int i = 0; i < h; ++i) { + store_u8_4x1(dst + i * stride, d0); + } +} + +static INLINE void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h, + uint8x8_t d0) { + for (int i = 0; i < h; ++i) { + vst1_u8(dst + i * stride, d0); + } +} + +static INLINE void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h, + uint8x16_t d0) { + for (int i = 0; i < h; ++i) { + vst1q_u8(dst + i * stride, d0); + } +} + +static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h, + uint8x16_t d0, uint8x16_t d1) { + for (int i = 0; i < h; ++i) { + vst1q_u8(dst + 0, d0); + vst1q_u8(dst + 16, d1); + dst += stride; + } +} + +static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h, + uint8x16_t d0, uint8x16_t d1, uint8x16_t d2, + uint8x16_t d3) { + for (int i = 0; i < h; ++i) { + vst1q_u8(dst + 0, d0); + vst1q_u8(dst + 16, d1); + vst1q_u8(dst + 32, d2); + vst1q_u8(dst + 48, d3); + dst += stride; + } +} + +void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_4xh(dst, stride, 4, load_u8_4x1(above)); +} + +void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_8xh(dst, stride, 8, vld1_u8(above)); +} + +void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_16xh(dst, stride, 16, vld1q_u8(above)); +} + +void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); + (void)left; + v_store_32xh(dst, stride, 32, d0, d1); +} + +void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_4xh(dst, stride, 8, load_u8_4x1(above)); +} + +void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_4xh(dst, stride, 16, load_u8_4x1(above)); +} + +void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_8xh(dst, stride, 4, vld1_u8(above)); +} + +void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_8xh(dst, stride, 16, vld1_u8(above)); +} + +void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_8xh(dst, stride, 32, vld1_u8(above)); +} + +void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_16xh(dst, stride, 4, vld1q_u8(above)); +} + +void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_16xh(dst, stride, 8, vld1q_u8(above)); +} + +void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_store_16xh(dst, stride, 32, vld1q_u8(above)); +} + +void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + 
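// V prediction: copy the 16 above pixels unchanged into each of the 64 rows. +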
v_store_16xh(dst, stride, 64, vld1q_u8(above)); +} + +void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); + (void)left; + v_store_32xh(dst, stride, 8, d0, d1); +} + +void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); + (void)left; + v_store_32xh(dst, stride, 16, d0, d1); +} + +void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); + (void)left; + v_store_32xh(dst, stride, 64, d0, d1); +} + +void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); + const uint8x16_t d2 = vld1q_u8(above + 32); + const uint8x16_t d3 = vld1q_u8(above + 48); + (void)left; + v_store_64xh(dst, stride, 16, d0, d1, d2, d3); +} + +void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); + const uint8x16_t d2 = vld1q_u8(above + 32); + const uint8x16_t d3 = vld1q_u8(above + 48); + (void)left; + v_store_64xh(dst, stride, 32, d0, d1, d2, d3); +} + +void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); + const uint8x16_t d2 = vld1q_u8(above + 32); + const uint8x16_t d3 = vld1q_u8(above + 48); + (void)left; + v_store_64xh(dst, stride, 64, d0, d1, d2, d3); +} + +// ----------------------------------------------------------------------------- + +static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { + store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0)); + store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1)); + store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2)); + store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3)); + store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4)); + store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5)); + store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6)); + store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7)); +} + +static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { + vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0)); + vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1)); + vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2)); + vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3)); + vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4)); + vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5)); + vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6)); + vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7)); +} + +static INLINE void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { + vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0)); + vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1)); + vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2)); + vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3)); + vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4)); + vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5)); + vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6)); + vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7)); +} + +static INLINE void h_store_32x8(uint8_t *dst, 
ptrdiff_t stride, uint8x8_t d0) { + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7)); +} + +static INLINE void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0)); + vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0)); + vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1)); + vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1)); + vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2)); + vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2)); + vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3)); + vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3)); + vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4)); + vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4)); + vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5)); + vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5)); + vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6)); + vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6)); + vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6)); + dst += stride; + vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7)); + vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7)); + vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7)); + vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7)); +} + +void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t d0 = load_u8_4x1(left); + (void)above; + store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0)); + store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1)); + store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2)); + store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3)); +} + +void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t d0 = vld1_u8(left); + (void)above; + h_store_8x8(dst, stride, d0); +} + +void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left); + (void)above; + h_store_16x8(dst, stride, vget_low_u8(d0)); + h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0)); +} + +void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left); + const uint8x16_t d1 = vld1q_u8(left + 16); + (void)above; + 
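// H prediction: each left pixel is broadcast across a full 32-wide row; each h_store_32x8 call writes eight rows, one per lane of the uint8x8_t half it is given. +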
h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0)); + h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1)); + h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1)); +} + +void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t d0 = vld1_u8(left); + (void)above; + h_store_4x8(dst, stride, d0); +} + +void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left); + (void)above; + h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0)); +} + +void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t d0 = load_u8_4x1(left); + (void)above; + vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0)); + vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1)); + vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2)); + vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3)); +} + +void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left); + (void)above; + h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0)); +} + +void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left); + const uint8x16_t d1 = vld1q_u8(left + 16); + (void)above; + h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0)); + h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1)); + h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1)); +} + +void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t d0 = load_u8_4x1(left); + (void)above; + vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0)); + vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1)); + vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2)); + vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3)); +} + +void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t d0 = vld1_u8(left); + (void)above; + h_store_16x8(dst, stride, d0); +} + +void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left); + const uint8x16_t d1 = vld1q_u8(left + 16); + (void)above; + h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0)); + h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1)); + h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1)); +} + +void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left); + const uint8x16_t d1 = vld1q_u8(left + 16); + const uint8x16_t d2 = vld1q_u8(left + 32); + const uint8x16_t d3 = vld1q_u8(left + 48); + (void)above; + h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0)); + h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1)); + h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1)); + h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2)); + h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2)); + 
h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3)); + h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3)); +} + +void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t d0 = vld1_u8(left); + (void)above; + h_store_32x8(dst, stride, d0); +} + +void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left); + (void)above; + h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0)); +} + +void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left + 0); + const uint8x16_t d1 = vld1q_u8(left + 16); + const uint8x16_t d2 = vld1q_u8(left + 32); + const uint8x16_t d3 = vld1q_u8(left + 48); + (void)above; + h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0)); + h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1)); + h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1)); + h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2)); + h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2)); + h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3)); + h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3)); +} + +void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(left); + (void)above; + h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0)); +} + +void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + for (int i = 0; i < 2; ++i) { + const uint8x16_t d0 = vld1q_u8(left); + h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0)); + left += 16; + dst += 16 * stride; + } +} + +void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + for (int i = 0; i < 4; ++i) { + const uint8x16_t d0 = vld1q_u8(left); + h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0)); + h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0)); + left += 16; + dst += 16 * stride; + } +} + +/* ---------------------P R E D I C T I O N Z 1--------------------------- */ + +// Low bit depth functions +static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, +}; + +static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64( + int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above, + int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((W + H) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + + const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]); + + int x = dx; + for (int r = 0; r < W; r++) { + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if (base_max_diff <= 0) { + for (int i = r; i < W; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + if (base_max_diff > H) base_max_diff = H; + + uint8x8x2_t a01_128; + uint16x8_t shift; + if (upsample_above) { + a01_128 = vld2_u8(above + base); + shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1); + } else { + a01_128.val[0] = vld1_u8(above + base); + a01_128.val[1] = vld1_u8(above + base + 1); + shift = vdupq_n_u16((x & 0x3f) >> 1); + } + uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]); + uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32)); + uint16x8_t res = vmlaq_u16(a32, diff, shift); + + uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]); + dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x); + + x += dx; + } +} + +static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + uint8x8_t dstvec[16]; + + dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above, + dx); + for (int i = 0; i < N; i++) { + vst1_lane_u32((uint32_t *)(dst + stride * i), + vreinterpret_u32_u8(dstvec[i]), 0); + } +} + +static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + uint8x8_t dstvec[32]; + + dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above, + dx); + for (int i = 0; i < N; i++) { + vst1_u8(dst + stride * i, dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon( + int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above, + int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((W + H) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + + const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); + + int x = dx; + for (int r = 0; r < W; r++) { + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if (base_max_diff <= 0) { + for (int i = r; i < W; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + if 
(base_max_diff > H) base_max_diff = H; + + uint16x8_t shift; + uint8x16_t a0_128, a1_128; + if (upsample_above) { + uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base); + a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]); + a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8); + shift = vdupq_n_u16(x & 0x1f); + } else { + a0_128 = vld1q_u8(above + base); + a1_128 = vld1q_u8(above + base + 1); + shift = vdupq_n_u16((x & 0x3f) >> 1); + } + uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); + uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); + uint16x8_t a32_lo = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); + uint16x8_t a32_hi = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); + uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); + uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); + uint8x16_t v_temp = + vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); + + uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]); + dst[r] = vbslq_u8(mask, v_temp, a_mbase_x); + + x += dx; + } +} + +static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + uint8x16_t dstvec[64]; + + dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + vst1q_u8(dst + stride * i, dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon( + int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) { + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + + const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); + + int x = dx; + for (int r = 0; r < N; r++) { + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base); + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dstvec[i].val[0] = a_mbase_x; // save 32 values + dstvec[i].val[1] = a_mbase_x; + } + return; + } + if (base_max_diff > 32) base_max_diff = 32; + + uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1); + + uint8x16_t res16[2]; + for (int j = 0, jj = 0; j < 32; j += 16, jj++) { + int mdiff = base_max_diff - j; + if (mdiff <= 0) { + res16[jj] = a_mbase_x; + } else { + uint8x16_t a0_128 = vld1q_u8(above + base + j); + uint8x16_t a1_128 = vld1q_u8(above + base + j + 1); + uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); + uint16x8_t diff_hi = + vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); + uint16x8_t a32_lo = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); + uint16x8_t a32_hi = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); + uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); + uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); + + res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); + } + } + + uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]); + uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16); + dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x); + dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x); + x += dx; + } +} + +static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int dx) { + uint8x16x2_t dstvec[64]; + + dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx); + for (int i = 
0; i < N; i++) { + vst1q_u8(dst + stride * i, dstvec[i].val[0]); + vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]); + } +} + +static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int dx) { + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + + const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); + const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + vst1q_u8(dst, a_mbase_x); + vst1q_u8(dst + 16, a_mbase_x); + vst1q_u8(dst + 32, a_mbase_x); + vst1q_u8(dst + 48, a_mbase_x); + dst += stride; + } + return; + } + + uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1); + uint8x16_t base_inc128 = + vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100), + vcreate_u8(0x0F0E0D0C0B0A0908))); + + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + vst1q_u8(dst + j, a_mbase_x); + } else { + uint8x16_t a0_128 = vld1q_u8(above + base + j); + uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j); + uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); + uint16x8_t diff_hi = + vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); + uint16x8_t a32_lo = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); + uint16x8_t a32_hi = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); + uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); + uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); + uint8x16_t v_temp = + vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); + + uint8x16_t mask128 = + vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0)); + uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x); + vst1q_u8(dst + j, res128); + + base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16)); + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + (void)left; + (void)dy; + + switch (bw) { + case 4: + dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx); + break; + case 8: + dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx); + break; + case 16: + dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx); + break; + case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break; + case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break; + default: break; + } +} + +/* ---------------------P R E D I C T I O N Z 2--------------------------- */ + +#if !AOM_ARCH_AARCH64 +static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = { + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff } +}; +#endif // !AOM_ARCH_AARCH64 + +static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon( + const uint8_t *above, int upsample_above, int dx, 
int base_x, int y, + uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) { + uint16x4_t r6 = vcreate_u16(0x00C0008000400000); + uint16x4_t ydx = vdup_n_u16(y * dx); + if (upsample_above) { + // Cannot use LD2 here since we only want to load eight bytes, but LD2 can + // only load either 16 or 32. + uint8x8_t v_tmp = vld1_u8(above + base_x); + *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0]; + *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1]; + *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f)); + } else { + *a0_x = load_u8_4x1(above + base_x); + *a1_x = load_u8_4x1(above + base_x + 1); + *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f)); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon( +#if AOM_ARCH_AARCH64 + uint8x16x2_t left_vals, +#else + const uint8_t *left, +#endif + int upsample_left, int dy, int r, int min_base_y, int frac_bits_y, + uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) { + int16x4_t dy64 = vdup_n_s16(dy); + int16x4_t v_1234 = vcreate_s16(0x0004000300020001); + int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y); + int16x4_t min_base_y64 = vdup_n_s16(min_base_y); + int16x4_t v_r6 = vdup_n_s16(r << 6); + int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64); + int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y); + + // Values in base_y_c64 range from -2 through 14 inclusive. + base_y_c64 = vmax_s16(base_y_c64, min_base_y64); + +#if AOM_ARCH_AARCH64 + uint8x8_t left_idx0 = + vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16] + uint8x8_t left_idx1 = + vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17] + + *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0)); + *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1)); +#else // !AOM_ARCH_AARCH64 + DECLARE_ALIGNED(32, int16_t, base_y_c[4]); + + vst1_s16(base_y_c, base_y_c64); + uint8x8_t a0_y_u8 = vdup_n_u8(0); + a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0); + a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2); + a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4); + a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6); + + base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1)); + vst1_s16(base_y_c, base_y_c64); + uint8x8_t a1_y_u8 = vdup_n_u8(0); + a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0); + a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2); + a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4); + a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6); + + *a0_y = vreinterpret_u16_u8(a0_y_u8); + *a1_y = vreinterpret_u16_u8(a1_y_u8); +#endif // AOM_ARCH_AARCH64 + + if (upsample_left) { + *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f)); + } else { + *shift1 = + vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f)); + } +} + +static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon( + const uint8_t *above, int upsample_above, int dx, int base_x, int y) { + uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), + vcreate_u16(0x0008000700060005)); + uint16x8_t ydx = vdupq_n_u16(y * dx); + uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6); + + uint16x8_t shift0; + uint8x8_t a0_x0; + uint8x8_t a1_x0; + if (upsample_above) { + uint8x8x2_t v_tmp = vld2_u8(above + base_x); + a0_x0 = v_tmp.val[0]; + a1_x0 = v_tmp.val[1]; + shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); + } else { + a0_x0 = vld1_u8(above + base_x); + a1_x0 = vld1_u8(above + base_x + 1); + shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); + } + + uint16x8_t diff0 = 
vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x] + uint16x8_t a32 = + vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16 + uint16x8_t res = vmlaq_u16(a32, diff0, shift0); + return vshrn_n_u16(res, 5); +} + +static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon( +#if AOM_ARCH_AARCH64 + uint8x16x3_t left_vals, +#else + const uint8_t *left, +#endif + int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) { + int16x8_t v_r6 = vdupq_n_s16(r << 6); + int16x8_t dy128 = vdupq_n_s16(dy); + int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y); + int16x8_t min_base_y128 = vdupq_n_s16(min_base_y); + + uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), + vcreate_u16(0x0008000700060005)); + int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128); + int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y); + + // Values in base_y_c128 range from -2 through 31 inclusive. + base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128); + +#if AOM_ARCH_AARCH64 + uint8x16_t left_idx0 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33] + uint8x16_t left_idx1 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34] + uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); + + uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01); + uint8x8_t a0_x1 = vget_low_u8(a01_x); + uint8x8_t a1_x1 = vget_high_u8(a01_x); +#else // !AOM_ARCH_AARCH64 + uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128); + uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128); +#endif // AOM_ARCH_AARCH64 + + uint16x8_t shift1; + if (upsample_left) { + shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f)); + } else { + shift1 = vshrq_n_u16( + vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1); + } + + uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1); + uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32)); + uint16x8_t res = vmlaq_u16(a32, diff1, shift1); + return vshrn_n_u16(res, 5); +} + +static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon( + const uint8_t *above, int dx, int base_x, int y, int j) { + uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000), + vcreate_u16(0x0007000600050004)), + vcombine_u16(vcreate_u16(0x000B000A00090008), + vcreate_u16(0x000F000E000D000C)) } }; + uint16x8_t j256 = vdupq_n_u16(j); + uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx)); + + const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j); + const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1); + uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6); + uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6); + uint16x8_t shift0 = + vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1); + uint16x8_t shift1 = + vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1); + // a[x+1] - a[x] + uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128)); + uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128)); + // a[x] * 32 + 16 + uint16x8_t a32_0 = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32)); + uint16x8_t a32_1 = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32)); + uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0); + uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1); + return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5)); +} + +static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon( +#if AOM_ARCH_AARCH64 + uint8x16x4_t left_vals0, uint8x16x4_t 
left_vals1, +#else + const uint8_t *left, +#endif + int dy, int r, int j) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_y = -1; + + int16x8_t min_base_y256 = vdupq_n_s16(min_base_y); + int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1); + int16x8_t dy256 = vdupq_n_s16(dy); + uint16x8_t j256 = vdupq_n_u16(j); + + uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000), + vcreate_u16(0x0007000600050004)), + vcombine_u16(vcreate_u16(0x000B000A00090008), + vcreate_u16(0x000F000E000D000C)) } }; + uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)), + vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } }; + + int16x8_t v_r6 = vdupq_n_s16(r << 6); + + int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0])); + int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1])); + int16x8_t mul16_lo = vreinterpretq_s16_u16( + vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)), + vreinterpretq_u16_s16(half_min_base_y256))); + int16x8_t mul16_hi = vreinterpretq_s16_u16( + vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)), + vreinterpretq_u16_s16(half_min_base_y256))); + int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo); + int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi); + + int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6); + int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6); + + base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo); + base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi); + +#if !AOM_ARCH_AARCH64 + int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7); + int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0); + int16_t offset_diff = max_y - min_y; + + uint8x8_t a0_y0; + uint8x8_t a0_y1; + uint8x8_t a1_y0; + uint8x8_t a1_y1; + if (offset_diff < 16) { + // Avoid gathers where the data we want is close together in memory. + // We don't need this for AArch64 since we can already use TBL to cover the + // full range of possible values. 
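+    // The fallback loads 16 contiguous bytes starting at left[min_y], masks
+    // off lanes beyond the range actually needed with LoadMaskz2, and uses
+    // VTBL with the per-lane offsets (base_y - min_y) to emulate the gather.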
+ assert(offset_diff >= 0); + int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3); + + int16x8x2_t base_y_offset; + base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256); + base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256); + + int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]), + vqmovn_s16(base_y_offset.val[1])); + + uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]); + uint8x16_t a0_y128 = vld1q_u8(left + min_y); + uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1); + a0_y128 = vandq_u8(a0_y128, v_loadmaskz2); + a1_y128 = vandq_u8(a1_y128, v_loadmaskz2); + + uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128)); + uint8x8_t v_index_high = + vget_high_u8(vreinterpretq_u8_s8(base_y_offset128)); + uint8x8x2_t v_tmp, v_res; + v_tmp.val[0] = vget_low_u8(a0_y128); + v_tmp.val[1] = vget_high_u8(a0_y128); + v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); + v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); + a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); + v_tmp.val[0] = vget_low_u8(a1_y128); + v_tmp.val[1] = vget_high_u8(a1_y128); + v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); + v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); + a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); + + a0_y0 = vget_low_u8(a0_y128); + a0_y1 = vget_high_u8(a0_y128); + a1_y0 = vget_low_u8(a1_y128); + a1_y1 = vget_high_u8(a1_y128); + } else { + a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo); + a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi); + a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo); + a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi); + } +#else + // Values in left_idx{0,1} range from 0 through 63 inclusive. + uint8x16_t left_idx0 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1))); + uint8x16_t left_idx1 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1))); + uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); + + uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01); + uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01); + + uint8x8_t a0_y0 = vget_low_u8(a0_y01); + uint8x8_t a0_y1 = vget_high_u8(a0_y01); + uint8x8_t a1_y0 = vget_low_u8(a1_y01); + uint8x8_t a1_y1 = vget_high_u8(a1_y01); +#endif // !AOM_ARCH_AARCH64 + + uint16x8_t shifty_lo = vshrq_n_u16( + vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1); + uint16x8_t shifty_hi = vshrq_n_u16( + vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1); + + // a[x+1] - a[x] + uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0); + uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1); + // a[x] * 32 + 16 + uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32)); + uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32)); + + uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo); + uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi); + + return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5)); +} + +static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - 
above[x]) * shift) >> 5 + +#if AOM_ARCH_AARCH64 + // Use ext rather than loading left + 14 directly to avoid over-read. + const uint8x16_t left_m2 = vld1q_u8(left - 2); + const uint8x16_t left_0 = vld1q_u8(left); + const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14); + const uint8x16x2_t left_vals = { { left_m2, left_14 } }; +#define LEFT left_vals +#else // !AOM_ARCH_AARCH64 +#define LEFT left +#endif // AOM_ARCH_AARCH64 + + for (int r = 0; r < N; r++) { + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + const int base_min_diff = + (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >> + upsample_above; + + if (base_min_diff <= 0) { + uint8x8_t a0_x_u8, a1_x_u8; + uint16x4_t shift0; + dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y, + &a0_x_u8, &a1_x_u8, &shift0); + uint8x8_t a0_x = a0_x_u8; + uint8x8_t a1_x = a1_x_u8; + + uint16x8_t diff = vsubl_u8(a1_x, a0_x); // a[x+1] - a[x] + uint16x8_t a32 = + vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32)); // a[x] * 32 + 16 + uint16x8_t res = + vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0))); + uint8x8_t resx = vshrn_n_u16(res, 5); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0); + } else if (base_min_diff < 4) { + uint8x8_t a0_x_u8, a1_x_u8; + uint16x4_t shift0; + dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y, + &a0_x_u8, &a1_x_u8, &shift0); + uint16x8_t a0_x = vmovl_u8(a0_x_u8); + uint16x8_t a1_x = vmovl_u8(a1_x_u8); + + uint16x4_t a0_y; + uint16x4_t a1_y; + uint16x4_t shift1; + dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y, + frac_bits_y, &a0_y, &a1_y, &shift1); + a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y); + a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y); + + uint16x8_t shift = vcombine_u16(shift0, shift1); + uint16x8_t diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x] + uint16x8_t a32 = + vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32); // a[x] * 32 + 16 + uint16x8_t res = vmlaq_u16(a32, diff, shift); + uint8x8_t resx = vshrn_n_u16(res, 5); + uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4); + + uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); + uint8x8_t v_resxy = vbsl_u8(mask, resy, resx); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0); + } else { + uint16x4_t a0_y, a1_y; + uint16x4_t shift1; + dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y, + frac_bits_y, &a0_y, &a1_y, &shift1); + uint16x4_t diff = vsub_u16(a1_y, a0_y); // a[x+1] - a[x] + uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32); // a[x] * 32 + 16 + uint16x4_t res = vmla_u16(a32, diff, shift1); + uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0); + } + + dst += stride; + } +#undef LEFT +} + +static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + +#if AOM_ARCH_AARCH64 + // Use ext rather than loading left + 30 directly to avoid over-read. 
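+  // The table starts at left - 2 so that the +2/+3 index bias applied in
+  // dr_prediction_z2_Nx8_left_neon maps base_y values as low as min_base_y
+  // (-2 when upsample_left is set) to valid TBL lanes.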
+ const uint8x16_t left_m2 = vld1q_u8(left - 2); + const uint8x16_t left_0 = vld1q_u8(left + 0); + const uint8x16_t left_16 = vld1q_u8(left + 16); + const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14); + const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14); + const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } }; +#define LEFT left_vals +#else // !AOM_ARCH_AARCH64 +#define LEFT left +#endif // AOM_ARCH_AARCH64 + + for (int r = 0; r < N; r++) { + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + + if (base_min_diff <= 0) { + uint8x8_t resx = + dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y); + vst1_u8(dst, resx); + } else if (base_min_diff < 8) { + uint8x8_t resx = + dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y); + uint8x8_t resy = dr_prediction_z2_Nx8_left_neon( + LEFT, upsample_left, dy, r, min_base_y, frac_bits_y); + uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); + uint8x8_t resxy = vbsl_u8(mask, resy, resx); + vst1_u8(dst, resxy); + } else { + uint8x8_t resy = dr_prediction_z2_Nx8_left_neon( + LEFT, upsample_left, dy, r, min_base_y, frac_bits_y); + vst1_u8(dst, resy); + } + + dst += stride; + } +#undef LEFT +} + +static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst, + ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int dx, int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + +#if AOM_ARCH_AARCH64 + const uint8x16_t left_m1 = vld1q_u8(left - 1); + const uint8x16_t left_0 = vld1q_u8(left + 0); + const uint8x16_t left_16 = vld1q_u8(left + 16); + const uint8x16_t left_32 = vld1q_u8(left + 32); + const uint8x16_t left_48 = vld1q_u8(left + 48); + const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15); + const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15); + const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15); + const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } }; + const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } }; +#define LEFT left_vals0, left_vals1 +#else // !AOM_ARCH_AARCH64 +#define LEFT left +#endif // AOM_ARCH_AARCH64 + + for (int r = 0; r < H; r++) { + int y = r + 1; + int base_x = (-y * dx) >> 6; + for (int j = 0; j < W; j += 16) { + const int base_min_diff = min_base_x - base_x - j; + + if (base_min_diff <= 0) { + uint8x16_t resx = + dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j); + vst1q_u8(dst + j, resx); + } else if (base_min_diff < 16) { + uint8x16_t resx = + dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j); + uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j); + uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]); + uint8x16_t resxy = vbslq_u8(mask, resy, resx); + vst1q_u8(dst + j, resxy); + } else { + uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j); + vst1q_u8(dst + j, resy); + } + } // for j + dst += stride; + } +#undef LEFT +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + + switch (bw) { + case 4: + dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + case 8: + dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, 
upsample_above, + upsample_left, dx, dy); + break; + default: + dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy); + break; + } +} + +/* ---------------------P R E D I C T I O N Z 3--------------------------- */ + +static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x, + uint8x16x2_t *d) { + uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); + uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); + + d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), + vreinterpretq_u16_u8(w1.val[0]))); + d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), + vreinterpretq_u16_u8(w1.val[1]))); +} + +static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x, + uint8x8x2_t *d) { + uint8x8x2_t w0 = vzip_u8(x[0], x[1]); + uint8x8x2_t w1 = vzip_u8(x[2], x[3]); + + *d = aom_reinterpret_u8_u16_x2( + vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]))); +} + +static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x, + uint8x8x2_t *d) { + uint8x8x2_t w0 = vzip_u8(x[0], x[1]); + uint8x8x2_t w1 = vzip_u8(x[2], x[3]); + + d[0] = aom_reinterpret_u8_u16_x2( + vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]))); + d[1] = aom_reinterpret_u8_u16_x2( + vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]))); +} + +static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc, + uint8_t *dst, ptrdiff_t pitchDst) { + // The same as the normal transposes in transpose_neon.h, but with a stride + // between consecutive vectors of elements. + uint8x16_t r[16]; + uint8x16_t d[16]; + for (int i = 0; i < 16; i++) { + r[i] = vld1q_u8(src + i * pitchSrc); + } + transpose_arrays_u8_16x16(r, d); + for (int i = 0; i < 16; i++) { + vst1q_u8(dst + i * pitchDst, d[i]); + } +} + +static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src, + ptrdiff_t pitchSrc, uint8_t *dst, + ptrdiff_t pitchDst, int width, + int height) { + for (int j = 0; j < height; j += 16) { + for (int i = 0; i < width; i += 16) { + z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); + } + } +} + +static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x8_t dstvec[4]; + uint8x8x2_t dest; + + dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy); + z3_transpose_arrays_u8_4x4(dstvec, &dest); + store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]); + store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]); +} + +static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x8_t dstvec[8]; + uint8x8_t d[8]; + + dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy); + transpose_arrays_u8_8x8(dstvec, d); + store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]); +} + +static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x8_t dstvec[4]; + uint8x8x2_t d[2]; + + dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy); + z3_transpose_arrays_u8_8x4(dstvec, d); + store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]); + store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]); + store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]); + store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]); +} + +static void 
dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x8_t dstvec[8]; + uint8x8_t d[8]; + + dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy); + transpose_arrays_u8_8x8(dstvec, d); + store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]); +} + +static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x16_t dstvec[8]; + uint8x8_t d[16]; + + dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy); + transpose_arrays_u8_16x8(dstvec, d); + for (int i = 0; i < 16; i++) { + vst1_u8(dst + i * stride, d[i]); + } +} + +static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x8_t dstvec[16]; + uint8x16_t d[8]; + + dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy); + transpose_arrays_u8_8x16(dstvec, d); + for (int i = 0; i < 8; i++) { + vst1q_u8(dst + i * stride, d[i]); + } +} + +static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x16_t dstvec[4]; + uint8x16x2_t d[2]; + + dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy); + z3_transpose_arrays_u8_16x4(dstvec, d); + store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]); + store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]); + store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]); + store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]); +} + +static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x8_t dstvec[16]; + uint8x16_t d[8]; + + dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy); + transpose_arrays_u8_8x16(dstvec, d); + for (int i = 0; i < 4; i++) { + vst1q_u8(dst + i * stride, d[i]); + } +} + +static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + (void)upsample_left; + uint8x16x2_t dstvec[16]; + uint8x16_t d[32]; + uint8x16_t v_zero = vdupq_n_u8(0); + + dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy); + for (int i = 8; i < 16; i++) { + dstvec[i].val[0] = v_zero; + dstvec[i].val[1] = v_zero; + } + transpose_arrays_u8_32x16(dstvec, d); + for (int i = 0; i < 32; i++) { + vst1_u8(dst + i * stride, vget_low_u8(d[i])); + } +} + +static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x8_t dstvec[32]; + uint8x16_t d[16]; + + dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy); + transpose_arrays_u8_8x16(dstvec, d); + transpose_arrays_u8_8x16(dstvec + 16, d + 8); + for (int i = 0; i < 8; i++) { + vst1q_u8(dst + i * stride, d[i]); + vst1q_u8(dst + i * stride + 16, d[i + 8]); + } +} + +static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x16_t dstvec[16]; + uint8x16_t d[16]; + + dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy); + transpose_arrays_u8_16x16(dstvec, d); + for (int i = 0; i < 16; i++) { + vst1q_u8(dst + i * stride, d[i]); + } +} + +static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + (void)upsample_left; + uint8x16x2_t dstvec[32]; + uint8x16_t d[64]; + + 
dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy); + transpose_arrays_u8_32x16(dstvec, d); + transpose_arrays_u8_32x16(dstvec + 16, d + 32); + for (int i = 0; i < 32; i++) { + vst1q_u8(dst + i * stride, d[i]); + vst1q_u8(dst + i * stride + 16, d[i + 32]); + } +} + +static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + (void)upsample_left; + DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); + + dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64); +} + +static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + (void)upsample_left; + uint8x16x2_t dstvec[16]; + uint8x16_t d[32]; + + dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy); + transpose_arrays_u8_32x16(dstvec, d); + for (int i = 0; i < 16; i++) { + vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]); + vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]); + } +} + +static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x16_t dstvec[32]; + + dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy); + for (int i = 0; i < 32; i += 16) { + uint8x16_t d[16]; + transpose_arrays_u8_16x16(dstvec + i, d); + for (int j = 0; j < 16; j++) { + vst1q_u8(dst + j * stride + i, d[j]); + } + } +} + +static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + (void)upsample_left; + uint8_t dstT[64 * 32]; + + dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64); +} + +static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + (void)upsample_left; + uint8_t dstT[32 * 64]; + + dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32); +} + +static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + (void)upsample_left; + uint8_t dstT[64 * 16]; + + dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64); +} + +static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8x16_t dstvec[64]; + + dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy); + for (int i = 0; i < 64; i += 16) { + uint8x16_t d[16]; + transpose_arrays_u8_16x16(dstvec + i, d); + for (int j = 0; j < 16; ++j) { + vst1q_u8(dst + j * stride + i, d[j]); + } + } +} + +typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy); + +static dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = { + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon, + dr_prediction_z3_4x16_neon, NULL, NULL }, + { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon, + dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL }, + { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon, + dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon, + dr_prediction_z3_16x64_neon }, + { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, 
dr_prediction_z3_32x16_neon, + dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon }, + { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon, + dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon }, +}; + +void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + (void)above; + (void)dx; + assert(dx == 1); + assert(dy > 0); + + dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)]; + assert(f != NULL); + f(dst, stride, left, upsample_left, dy); +} + +// ----------------------------------------------------------------------------- +// SMOOTH_PRED + +// 256 - v = vneg_s8(v) +static INLINE uint8x8_t negate_s8(const uint8x8_t v) { + return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v))); +} + +static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *const top_row, + const uint8_t *const left_column, + const int height) { + const uint8_t top_right = top_row[3]; + const uint8_t bottom_left = left_column[height - 1]; + const uint8_t *const weights_y = smooth_weights + height - 4; + + uint8x8_t top_v = load_u8_4x1(top_row); + const uint8x8_t top_right_v = vdup_n_u8(top_right); + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); + uint8x8_t weights_x_v = load_u8_4x1(smooth_weights); + const uint8x8_t scaled_weights_x = negate_s8(weights_x_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); + + assert(height > 0); + int y = 0; + do { + const uint8x8_t left_v = vdup_n_u8(left_column[y]); + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); + const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); + const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_bl, weights_y_v, top_v); + const uint16x8_t weighted_left_tr = + vmlal_u8(weighted_tr, weights_x_v, left_v); + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); + const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE); + + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0); + dst += stride; + } while (++y != height); +} + +static INLINE uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl, + const uint16x8_t weighted_left_tr) { + // Maximum value of each parameter: 0xFF00 + const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr); + return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE); +} + +static INLINE uint8x8_t calculate_weights_and_pred( + const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr, + const uint8x8_t bottom_left, const uint8x8_t weights_x, + const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) { + const uint16x8_t weighted_top = vmull_u8(weights_y, top); + const uint16x8_t weighted_top_bl = + vmlal_u8(weighted_top, scaled_weights_y, bottom_left); + const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left); + return calculate_pred(weighted_top_bl, weighted_left_tr); +} + +static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *const top_row, + const uint8_t *const left_column, + const int height) { + const uint8_t top_right = top_row[7]; + const uint8_t bottom_left = left_column[height - 1]; + const uint8_t *const weights_y = smooth_weights + height - 4; + + const uint8x8_t top_v = vld1_u8(top_row); + const uint8x8_t top_right_v = vdup_n_u8(top_right); + const uint8x8_t bottom_left_v = 
vdup_n_u8(bottom_left); + const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4); + const uint8x8_t scaled_weights_x = negate_s8(weights_x_v); + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); + + assert(height > 0); + int y = 0; + do { + const uint8x8_t left_v = vdup_n_u8(left_column[y]); + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); + const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); + const uint8x8_t result = + calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v, + weights_x_v, scaled_weights_y, weights_y_v); + + vst1_u8(dst, result); + dst += stride; + } while (++y != height); +} + +#define SMOOTH_NXM(W, H) \ + void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \ + const uint8_t *above, \ + const uint8_t *left) { \ + smooth_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +SMOOTH_NXM(4, 4) +SMOOTH_NXM(4, 8) +SMOOTH_NXM(8, 4) +SMOOTH_NXM(8, 8) +SMOOTH_NXM(4, 16) +SMOOTH_NXM(8, 16) +SMOOTH_NXM(8, 32) + +#undef SMOOTH_NXM + +static INLINE uint8x16_t calculate_weights_and_predq( + const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right, + const uint8x8_t weights_y, const uint8x16_t weights_x, + const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) { + const uint16x8_t weighted_top_bl_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); + const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t result_low = + calculate_pred(weighted_top_bl_low, weighted_left_tr_low); + + const uint16x8_t weighted_top_bl_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); + const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); + const uint8x8_t result_high = + calculate_pred(weighted_top_bl_high, weighted_left_tr_high); + + return vcombine_u8(result_low, result_high); +} + +// 256 - v = vneg_s8(v) +static INLINE uint8x16_t negate_s8q(const uint8x16_t v) { + return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v))); +} + +// For width 16 and above. 
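// A scalar sketch of the blend the SMOOTH predictors below vectorize
// (illustrative only: the helper name is not part of this file, and it
// assumes the weights sum to 256, i.e. SMOOTH_WEIGHT_LOG2_SCALE == 8, so
// 256 - w fits a uint8_t as (uint8_t)-w, which is what negate_s8() and
// negate_s8q() above compute).
static uint8_t smooth_blend_sketch(uint8_t top, uint8_t left,
                                   uint8_t bottom_left, uint8_t top_right,
                                   uint8_t w_y, uint8_t w_x) {
  // Vertical interpolation between top[c] and the bottom-left pixel.
  const uint32_t vert = w_y * top + (256 - w_y) * bottom_left;
  // Horizontal interpolation between left[r] and the top-right pixel.
  const uint32_t horz = w_x * left + (256 - w_x) * top_right;
  // Each sum is at most 255 * 256 = 0xFF00, so adding them can overflow 16
  // bits; the NEON code below therefore halves first (vhaddq_u16) and then
  // does a rounding narrow by 8, which is equivalent to the single shift by
  // 9 used here.
  return (uint8_t)((vert + horz + 256) >> 9);
}
// The SMOOTH_PREDICTOR macro below applies this 16 output pixels at a time.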
+#define SMOOTH_PREDICTOR(W) \ + static void smooth_##W##xh_neon( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ + const uint8_t *const left_column, const int height) { \ + const uint8_t top_right = top_row[(W)-1]; \ + const uint8_t bottom_left = left_column[height - 1]; \ + const uint8_t *const weights_y = smooth_weights + height - 4; \ + \ + uint8x16_t top_v[4]; \ + top_v[0] = vld1q_u8(top_row); \ + if ((W) > 16) { \ + top_v[1] = vld1q_u8(top_row + 16); \ + if ((W) == 64) { \ + top_v[2] = vld1q_u8(top_row + 32); \ + top_v[3] = vld1q_u8(top_row + 48); \ + } \ + } \ + \ + const uint8x8_t top_right_v = vdup_n_u8(top_right); \ + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \ + \ + uint8x16_t weights_x_v[4]; \ + weights_x_v[0] = vld1q_u8(smooth_weights + (W)-4); \ + if ((W) > 16) { \ + weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \ + if ((W) == 64) { \ + weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \ + weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \ + } \ + } \ + \ + uint8x16_t scaled_weights_x[4]; \ + scaled_weights_x[0] = negate_s8q(weights_x_v[0]); \ + if ((W) > 16) { \ + scaled_weights_x[1] = negate_s8q(weights_x_v[1]); \ + if ((W) == 64) { \ + scaled_weights_x[2] = negate_s8q(weights_x_v[2]); \ + scaled_weights_x[3] = negate_s8q(weights_x_v[3]); \ + } \ + } \ + \ + for (int y = 0; y < height; ++y) { \ + const uint8x8_t left_v = vdup_n_u8(left_column[y]); \ + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \ + const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \ + const uint16x8_t weighted_bl = \ + vmull_u8(scaled_weights_y, bottom_left_v); \ + \ + vst1q_u8(dst, calculate_weights_and_predq( \ + top_v[0], left_v, top_right_v, weights_y_v, \ + weights_x_v[0], scaled_weights_x[0], weighted_bl)); \ + \ + if ((W) > 16) { \ + vst1q_u8(dst + 16, \ + calculate_weights_and_predq( \ + top_v[1], left_v, top_right_v, weights_y_v, \ + weights_x_v[1], scaled_weights_x[1], weighted_bl)); \ + if ((W) == 64) { \ + vst1q_u8(dst + 32, \ + calculate_weights_and_predq( \ + top_v[2], left_v, top_right_v, weights_y_v, \ + weights_x_v[2], scaled_weights_x[2], weighted_bl)); \ + vst1q_u8(dst + 48, \ + calculate_weights_and_predq( \ + top_v[3], left_v, top_right_v, weights_y_v, \ + weights_x_v[3], scaled_weights_x[3], weighted_bl)); \ + } \ + } \ + \ + dst += stride; \ + } \ + } + +SMOOTH_PREDICTOR(16) +SMOOTH_PREDICTOR(32) +SMOOTH_PREDICTOR(64) + +#undef SMOOTH_PREDICTOR + +#define SMOOTH_NXM_WIDE(W, H) \ + void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \ + const uint8_t *above, \ + const uint8_t *left) { \ + smooth_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +SMOOTH_NXM_WIDE(16, 4) +SMOOTH_NXM_WIDE(16, 8) +SMOOTH_NXM_WIDE(16, 16) +SMOOTH_NXM_WIDE(16, 32) +SMOOTH_NXM_WIDE(16, 64) +SMOOTH_NXM_WIDE(32, 8) +SMOOTH_NXM_WIDE(32, 16) +SMOOTH_NXM_WIDE(32, 32) +SMOOTH_NXM_WIDE(32, 64) +SMOOTH_NXM_WIDE(64, 16) +SMOOTH_NXM_WIDE(64, 32) +SMOOTH_NXM_WIDE(64, 64) + +#undef SMOOTH_NXM_WIDE + +// ----------------------------------------------------------------------------- +// SMOOTH_V_PRED + +// For widths 4 and 8. 
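// Scalar sketch of the SMOOTH_V blend implemented below (illustrative only,
// with the same 256-weight assumption as the sketch above): only the
// vertical interpolation against the bottom-left pixel remains, i.e.
//   pred = (w_y * top[c] + (256 - w_y) * bottom_left + 128) >> 8;
// which is what the vmull_u8/vmlal_u8 pair followed by vrshrn_n_u16 compute
// per lane.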
+#define SMOOTH_V_PREDICTOR(W) \ + static void smooth_v_##W##xh_neon( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ + const uint8_t *const left_column, const int height) { \ + const uint8_t bottom_left = left_column[height - 1]; \ + const uint8_t *const weights_y = smooth_weights + height - 4; \ + \ + uint8x8_t top_v; \ + if ((W) == 4) { \ + top_v = load_u8_4x1(top_row); \ + } else { /* width == 8 */ \ + top_v = vld1_u8(top_row); \ + } \ + \ + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \ + \ + assert(height > 0); \ + int y = 0; \ + do { \ + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \ + const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \ + \ + const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); \ + const uint16x8_t weighted_top_bl = \ + vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); \ + const uint8x8_t pred = \ + vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE); \ + \ + if ((W) == 4) { \ + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \ + } else { /* width == 8 */ \ + vst1_u8(dst, pred); \ + } \ + dst += stride; \ + } while (++y != height); \ + } + +SMOOTH_V_PREDICTOR(4) +SMOOTH_V_PREDICTOR(8) + +#undef SMOOTH_V_PREDICTOR + +#define SMOOTH_V_NXM(W, H) \ + void aom_smooth_v_predictor_##W##x##H##_neon( \ + uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \ + const uint8_t *left) { \ + smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +SMOOTH_V_NXM(4, 4) +SMOOTH_V_NXM(4, 8) +SMOOTH_V_NXM(4, 16) +SMOOTH_V_NXM(8, 4) +SMOOTH_V_NXM(8, 8) +SMOOTH_V_NXM(8, 16) +SMOOTH_V_NXM(8, 32) + +#undef SMOOTH_V_NXM + +static INLINE uint8x16_t calculate_vertical_weights_and_pred( + const uint8x16_t top, const uint8x8_t weights_y, + const uint16x8_t weighted_bl) { + const uint16x8_t pred_low = + vmlal_u8(weighted_bl, weights_y, vget_low_u8(top)); + const uint16x8_t pred_high = + vmlal_u8(weighted_bl, weights_y, vget_high_u8(top)); + const uint8x8_t pred_scaled_low = + vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE); + const uint8x8_t pred_scaled_high = + vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE); + return vcombine_u8(pred_scaled_low, pred_scaled_high); +} + +// For width 16 and above. 
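// Indexing note: smooth_weights appears to store the per-size weight tables
// back to back (4, 8, 16, 32 and 64 entries), so the table for dimension n
// starts at offset 4 + 8 + ... + n/2 == n - 4. That is why the lookups here
// take the form smooth_weights + (W) - 4 (plus 16, 32 or 48 for the later
// vectors of wide blocks) and smooth_weights + height - 4.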
+#define SMOOTH_V_PREDICTOR(W) \ + static void smooth_v_##W##xh_neon( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ + const uint8_t *const left_column, const int height) { \ + const uint8_t bottom_left = left_column[height - 1]; \ + const uint8_t *const weights_y = smooth_weights + height - 4; \ + \ + uint8x16_t top_v[4]; \ + top_v[0] = vld1q_u8(top_row); \ + if ((W) > 16) { \ + top_v[1] = vld1q_u8(top_row + 16); \ + if ((W) == 64) { \ + top_v[2] = vld1q_u8(top_row + 32); \ + top_v[3] = vld1q_u8(top_row + 48); \ + } \ + } \ + \ + const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \ + \ + assert(height > 0); \ + int y = 0; \ + do { \ + const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \ + const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \ + const uint16x8_t weighted_bl = \ + vmull_u8(scaled_weights_y, bottom_left_v); \ + \ + const uint8x16_t pred_0 = calculate_vertical_weights_and_pred( \ + top_v[0], weights_y_v, weighted_bl); \ + vst1q_u8(dst, pred_0); \ + \ + if ((W) > 16) { \ + const uint8x16_t pred_1 = calculate_vertical_weights_and_pred( \ + top_v[1], weights_y_v, weighted_bl); \ + vst1q_u8(dst + 16, pred_1); \ + \ + if ((W) == 64) { \ + const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \ + top_v[2], weights_y_v, weighted_bl); \ + vst1q_u8(dst + 32, pred_2); \ + \ + const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \ + top_v[3], weights_y_v, weighted_bl); \ + vst1q_u8(dst + 48, pred_3); \ + } \ + } \ + \ + dst += stride; \ + } while (++y != height); \ + } + +SMOOTH_V_PREDICTOR(16) +SMOOTH_V_PREDICTOR(32) +SMOOTH_V_PREDICTOR(64) + +#undef SMOOTH_V_PREDICTOR + +#define SMOOTH_V_NXM_WIDE(W, H) \ + void aom_smooth_v_predictor_##W##x##H##_neon( \ + uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \ + const uint8_t *left) { \ + smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +SMOOTH_V_NXM_WIDE(16, 4) +SMOOTH_V_NXM_WIDE(16, 8) +SMOOTH_V_NXM_WIDE(16, 16) +SMOOTH_V_NXM_WIDE(16, 32) +SMOOTH_V_NXM_WIDE(16, 64) +SMOOTH_V_NXM_WIDE(32, 8) +SMOOTH_V_NXM_WIDE(32, 16) +SMOOTH_V_NXM_WIDE(32, 32) +SMOOTH_V_NXM_WIDE(32, 64) +SMOOTH_V_NXM_WIDE(64, 16) +SMOOTH_V_NXM_WIDE(64, 32) +SMOOTH_V_NXM_WIDE(64, 64) + +#undef SMOOTH_V_NXM_WIDE + +// ----------------------------------------------------------------------------- +// SMOOTH_H_PRED + +// For widths 4 and 8. +#define SMOOTH_H_PREDICTOR(W) \ + static void smooth_h_##W##xh_neon( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ + const uint8_t *const left_column, const int height) { \ + const uint8_t top_right = top_row[(W)-1]; \ + \ + const uint8x8_t top_right_v = vdup_n_u8(top_right); \ + /* Over-reads for 4xN but still within the array. 
*/ \ + const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4); \ + const uint8x8_t scaled_weights_x = negate_s8(weights_x); \ + const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \ + \ + assert(height > 0); \ + int y = 0; \ + do { \ + const uint8x8_t left_v = vdup_n_u8(left_column[y]); \ + const uint16x8_t weighted_left_tr = \ + vmlal_u8(weighted_tr, weights_x, left_v); \ + const uint8x8_t pred = \ + vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE); \ + \ + if ((W) == 4) { \ + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \ + } else { /* width == 8 */ \ + vst1_u8(dst, pred); \ + } \ + dst += stride; \ + } while (++y != height); \ + } + +SMOOTH_H_PREDICTOR(4) +SMOOTH_H_PREDICTOR(8) + +#undef SMOOTH_H_PREDICTOR + +#define SMOOTH_H_NXM(W, H) \ + void aom_smooth_h_predictor_##W##x##H##_neon( \ + uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \ + const uint8_t *left) { \ + smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +SMOOTH_H_NXM(4, 4) +SMOOTH_H_NXM(4, 8) +SMOOTH_H_NXM(4, 16) +SMOOTH_H_NXM(8, 4) +SMOOTH_H_NXM(8, 8) +SMOOTH_H_NXM(8, 16) +SMOOTH_H_NXM(8, 32) + +#undef SMOOTH_H_NXM + +static INLINE uint8x16_t calculate_horizontal_weights_and_pred( + const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x, + const uint8x16_t scaled_weights_x) { + const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left); + const uint16x8_t weighted_left_tr_low = + vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right); + const uint8x8_t pred_scaled_low = + vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE); + + const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left); + const uint16x8_t weighted_left_tr_high = + vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right); + const uint8x8_t pred_scaled_high = + vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE); + + return vcombine_u8(pred_scaled_low, pred_scaled_high); +} + +// For width 16 and above. 
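// Scalar sketch of the SMOOTH_H blend (illustrative only, same 256-weight
// assumption as above): only the horizontal interpolation against the
// top-right pixel remains, i.e.
//   pred = (w_x * left[r] + (256 - w_x) * top_right + 128) >> 8;
// again computed per lane by vmull_u8/vmlal_u8 followed by vrshrn_n_u16.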
+#define SMOOTH_H_PREDICTOR(W) \ + static void smooth_h_##W##xh_neon( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \ + const uint8_t *const left_column, const int height) { \ + const uint8_t top_right = top_row[(W)-1]; \ + \ + const uint8x8_t top_right_v = vdup_n_u8(top_right); \ + \ + uint8x16_t weights_x[4]; \ + weights_x[0] = vld1q_u8(smooth_weights + (W)-4); \ + if ((W) > 16) { \ + weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \ + if ((W) == 64) { \ + weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \ + weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \ + } \ + } \ + \ + uint8x16_t scaled_weights_x[4]; \ + scaled_weights_x[0] = negate_s8q(weights_x[0]); \ + if ((W) > 16) { \ + scaled_weights_x[1] = negate_s8q(weights_x[1]); \ + if ((W) == 64) { \ + scaled_weights_x[2] = negate_s8q(weights_x[2]); \ + scaled_weights_x[3] = negate_s8q(weights_x[3]); \ + } \ + } \ + \ + assert(height > 0); \ + int y = 0; \ + do { \ + const uint8x8_t left_v = vdup_n_u8(left_column[y]); \ + \ + const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred( \ + left_v, top_right_v, weights_x[0], scaled_weights_x[0]); \ + vst1q_u8(dst, pred_0); \ + \ + if ((W) > 16) { \ + const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred( \ + left_v, top_right_v, weights_x[1], scaled_weights_x[1]); \ + vst1q_u8(dst + 16, pred_1); \ + \ + if ((W) == 64) { \ + const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \ + left_v, top_right_v, weights_x[2], scaled_weights_x[2]); \ + vst1q_u8(dst + 32, pred_2); \ + \ + const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \ + left_v, top_right_v, weights_x[3], scaled_weights_x[3]); \ + vst1q_u8(dst + 48, pred_3); \ + } \ + } \ + dst += stride; \ + } while (++y != height); \ + } + +SMOOTH_H_PREDICTOR(16) +SMOOTH_H_PREDICTOR(32) +SMOOTH_H_PREDICTOR(64) + +#undef SMOOTH_H_PREDICTOR + +#define SMOOTH_H_NXM_WIDE(W, H) \ + void aom_smooth_h_predictor_##W##x##H##_neon( \ + uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \ + const uint8_t *left) { \ + smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \ + } + +SMOOTH_H_NXM_WIDE(16, 4) +SMOOTH_H_NXM_WIDE(16, 8) +SMOOTH_H_NXM_WIDE(16, 16) +SMOOTH_H_NXM_WIDE(16, 32) +SMOOTH_H_NXM_WIDE(16, 64) +SMOOTH_H_NXM_WIDE(32, 8) +SMOOTH_H_NXM_WIDE(32, 16) +SMOOTH_H_NXM_WIDE(32, 32) +SMOOTH_H_NXM_WIDE(32, 64) +SMOOTH_H_NXM_WIDE(64, 16) +SMOOTH_H_NXM_WIDE(64, 32) +SMOOTH_H_NXM_WIDE(64, 64) + +#undef SMOOTH_H_NXM_WIDE + +// ----------------------------------------------------------------------------- +// PAETH + +static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride, + const uint8_t *const top_row, + const uint8_t *const left_column, + int width, int height) { + const uint8x8_t top_left = vdup_n_u8(top_row[-1]); + const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); + uint8x8_t top; + if (width == 4) { + top = load_u8_4x1(top_row); + } else { // width == 8 + top = vld1_u8(top_row); + } + + assert(height > 0); + int y = 0; + do { + const uint8x8_t left = vdup_n_u8(left_column[y]); + + const uint8x8_t left_dist = vabd_u8(top, top_left); + const uint8x8_t top_dist = vabd_u8(left, top_left); + const uint16x8_t top_left_dist = + vabdq_u16(vaddl_u8(top, left), top_left_x2); + + const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist); + const uint8x8_t left_le_top_left = + vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist)); + const uint8x8_t top_le_top_left = + vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist)); + + // if 
(left_dist <= top_dist && left_dist <= top_left_dist) + const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left); + // dest[x] = left_column[y]; + // Fill all the unused spaces with 'top'. They will be overwritten when + // the positions for top_left are known. + uint8x8_t result = vbsl_u8(left_mask, left, top); + // else if (top_dist <= top_left_dist) + // dest[x] = top_row[x]; + // Add these values to the mask. They were already set. + const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left); + // else + // dest[x] = top_left; + result = vbsl_u8(left_or_top_mask, result, top_left); + + if (width == 4) { + store_u8_4x1(dest, result); + } else { // width == 8 + vst1_u8(dest, result); + } + dest += stride; + } while (++y != height); +} + +#define PAETH_NXM(W, H) \ + void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \ + const uint8_t *above, \ + const uint8_t *left) { \ + paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \ + } + +PAETH_NXM(4, 4) +PAETH_NXM(4, 8) +PAETH_NXM(8, 4) +PAETH_NXM(8, 8) +PAETH_NXM(8, 16) + +PAETH_NXM(4, 16) +PAETH_NXM(8, 32) + +// Calculate X distance <= TopLeft distance and pack the resulting mask into +// uint8x8_t. +static INLINE uint8x16_t x_le_top_left(const uint8x16_t x_dist, + const uint16x8_t top_left_dist_low, + const uint16x8_t top_left_dist_high) { + const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low), + vqmovn_u16(top_left_dist_high)); + return vcleq_u8(x_dist, top_left_dist); +} + +// Select the closest values and collect them. +static INLINE uint8x16_t select_paeth(const uint8x16_t top, + const uint8x16_t left, + const uint8x16_t top_left, + const uint8x16_t left_le_top, + const uint8x16_t left_le_top_left, + const uint8x16_t top_le_top_left) { + // if (left_dist <= top_dist && left_dist <= top_left_dist) + const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left); + // dest[x] = left_column[y]; + // Fill all the unused spaces with 'top'. They will be overwritten when + // the positions for top_left are known. + uint8x16_t result = vbslq_u8(left_mask, left, top); + // else if (top_dist <= top_left_dist) + // dest[x] = top_row[x]; + // Add these values to the mask. They were already set. + const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left); + // else + // dest[x] = top_left; + return vbslq_u8(left_or_top_mask, result, top_left); +} + +// Generate numbered and high/low versions of top_left_dist. +#define TOP_LEFT_DIST(num) \ + const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \ + vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \ + const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \ + vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2) + +// Generate numbered versions of XLeTopLeft with x = left. +#define LEFT_LE_TOP_LEFT(num) \ + const uint8x16_t left_le_top_left_##num = \ + x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \ + top_left_##num##_dist_high) + +// Generate numbered versions of XLeTopLeft with x = top. 
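// Scalar sketch of the selection select_paeth() above performs (illustrative
// only; this helper is not used by the code here). The three distances are
// the distances of left, top and top_left from the Paeth base value
// base = left + top - top_left, rewritten so they stay non-negative:
static uint8_t paeth_select_sketch(uint8_t top, uint8_t left,
                                   uint8_t top_left) {
  // |base - left| == |top - top_left|
  const int left_dist = top > top_left ? top - top_left : top_left - top;
  // |base - top| == |left - top_left|
  const int top_dist = left > top_left ? left - top_left : top_left - left;
  // |base - top_left| == |top + left - 2 * top_left|
  const int top_left_dist = top + left > 2 * top_left
                                ? top + left - 2 * top_left
                                : 2 * top_left - top - left;
  if (left_dist <= top_dist && left_dist <= top_left_dist) return left;
  if (top_dist <= top_left_dist) return top;
  return top_left;
}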
+#define TOP_LE_TOP_LEFT(num) \ + const uint8x16_t top_le_top_left_##num = x_le_top_left( \ + top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high) + +static INLINE void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride, + const uint8_t *const top_row, + const uint8_t *const left_column, + int width, int height) { + const uint8x16_t top_left = vdupq_n_u8(top_row[-1]); + const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]); + uint8x16_t top[4]; + top[0] = vld1q_u8(top_row); + if (width > 16) { + top[1] = vld1q_u8(top_row + 16); + if (width == 64) { + top[2] = vld1q_u8(top_row + 32); + top[3] = vld1q_u8(top_row + 48); + } + } + + assert(height > 0); + int y = 0; + do { + const uint8x16_t left = vdupq_n_u8(left_column[y]); + + const uint8x16_t top_dist = vabdq_u8(left, top_left); + + const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left); + TOP_LEFT_DIST(0); + const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist); + LEFT_LE_TOP_LEFT(0); + TOP_LE_TOP_LEFT(0); + + const uint8x16_t result_0 = + select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0, + top_le_top_left_0); + vst1q_u8(dest, result_0); + + if (width > 16) { + const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left); + TOP_LEFT_DIST(1); + const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist); + LEFT_LE_TOP_LEFT(1); + TOP_LE_TOP_LEFT(1); + + const uint8x16_t result_1 = + select_paeth(top[1], left, top_left, left_1_le_top, + left_le_top_left_1, top_le_top_left_1); + vst1q_u8(dest + 16, result_1); + + if (width == 64) { + const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left); + TOP_LEFT_DIST(2); + const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist); + LEFT_LE_TOP_LEFT(2); + TOP_LE_TOP_LEFT(2); + + const uint8x16_t result_2 = + select_paeth(top[2], left, top_left, left_2_le_top, + left_le_top_left_2, top_le_top_left_2); + vst1q_u8(dest + 32, result_2); + + const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left); + TOP_LEFT_DIST(3); + const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist); + LEFT_LE_TOP_LEFT(3); + TOP_LE_TOP_LEFT(3); + + const uint8x16_t result_3 = + select_paeth(top[3], left, top_left, left_3_le_top, + left_le_top_left_3, top_le_top_left_3); + vst1q_u8(dest + 48, result_3); + } + } + + dest += stride; + } while (++y != height); +} + +#define PAETH_NXM_WIDE(W, H) \ + void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \ + const uint8_t *above, \ + const uint8_t *left) { \ + paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \ + } + +PAETH_NXM_WIDE(16, 8) +PAETH_NXM_WIDE(16, 16) +PAETH_NXM_WIDE(16, 32) +PAETH_NXM_WIDE(32, 16) +PAETH_NXM_WIDE(32, 32) +PAETH_NXM_WIDE(32, 64) +PAETH_NXM_WIDE(64, 32) +PAETH_NXM_WIDE(64, 64) + +PAETH_NXM_WIDE(16, 4) +PAETH_NXM_WIDE(16, 64) +PAETH_NXM_WIDE(32, 8) +PAETH_NXM_WIDE(64, 16) diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c new file mode 100644 index 0000000000..7c64be1253 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c @@ -0,0 +1,1045 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
+                                 uint8x8_t p0q0, const uint8_t blimit,
+                                 const uint8_t limit) {
+  // Calculate mask values for four samples
+  uint32x2x2_t p0q0_p1q1;
+  uint16x8_t temp_16x8;
+  uint16x4_t temp0_16x4, temp1_16x4;
+  uint8x8_t mask_8x8, temp_8x8;
+  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+  const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+
+  mask_8x8 = vabd_u8(p3q3, p2q2);
+  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1));
+  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+  temp_16x8 = vmovl_u8(temp_8x8);
+  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  return mask_8x8;
+}
+
+static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
+                                  const uint8_t blimit, const uint8_t limit) {
+  uint32x2x2_t p0q0_p1q1;
+  uint16x8_t temp_16x8;
+  uint16x4_t temp0_16x4, temp1_16x4;
+  const uint16x4_t blimit_16x4 = vdup_n_u16(blimit);
+  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+  uint8x8_t mask_8x8, temp_8x8;
+
+  mask_8x8 = vabd_u8(p1q1, p0q0);
+  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+  temp_16x8 = vmovl_u8(temp_8x8);
+  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+  return mask_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
+                                       uint8x8_t p1q1, uint8x8_t p0q0) {
+  const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
+  uint8x8_t flat_8x8, temp_8x8;
+
+  flat_8x8 = vabd_u8(p1q1, p0q0);
+  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0));
+  flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+
+  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+  flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+  return flat_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
+                                       uint8x8_t p0q0) {
+  const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
+  uint8x8_t flat_8x8, temp_8x8;
+
+  flat_8x8 = vabd_u8(p1q1, p0q0);
+  flat_8x8 =
vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); + flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); + flat_8x8 = vand_u8(flat_8x8, temp_8x8); + + return flat_8x8; +} + +static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0, const uint8_t blimit, + const uint8_t limit) { + // Calculate mask3 values for four samples + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + uint8x8_t mask_8x8, temp_8x8; + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); + + mask_8x8 = vabd_u8(p2q2, p1q1); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4, + uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, + out_f14_pq5; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8, flat2_8x8; + uint8x8_t q0p0, q1p1, q2p2; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, 
temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + // reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + // filter 14 + uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5; + uint16x8_t p6q6_2, p6q6_temp, qp_sum; + uint8x8_t qp_rev; + + out = vaddw_u8(out, *p4q4); + out = vaddw_u8(out, *p5q5); + out = vaddw_u8(out, *p6q6); + + out_pq5 = vaddw_u8(out, *p4q4); + out_pq4 = vaddw_u8(out_pq5, *p3q3); + out_pq3 = vaddw_u8(out_pq4, *p2q2); + + out_pq5 = vaddw_u8(out_pq5, *p5q5); + out_pq4 = vaddw_u8(out_pq4, *p5q5); + + out_pq0 = vaddw_u8(out, *p1q1); + out_pq1 = vaddw_u8(out_pq0, *p2q2); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + + out_pq0 = vaddw_u8(out_pq0, *p0q0); + out_pq1 = vaddw_u8(out_pq1, *p0q0); + + out_pq1 = vaddw_u8(out_pq1, *p6q6); + p6q6_2 = vaddl_u8(*p6q6, *p6q6); + out_pq2 = vaddq_u16(out_pq2, p6q6_2); + p6q6_temp = vaddw_u8(p6q6_2, *p6q6); + out_pq3 = vaddq_u16(out_pq3, p6q6_temp); + p6q6_temp = vaddw_u8(p6q6_temp, *p6q6); + out_pq4 = vaddq_u16(out_pq4, p6q6_temp); + p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2); + out_pq5 = vaddq_u16(out_pq5, p6q6_temp); + + out_pq4 = vaddw_u8(out_pq4, q1p1); + + qp_sum = vaddl_u8(q2p2, q1p1); + out_pq3 = vaddq_u16(out_pq3, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq2 = vaddq_u16(out_pq2, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq1 = vaddq_u16(out_pq1, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq0 = vaddq_u16(out_pq0, qp_sum); + + out_pq0 = vaddw_u8(out_pq0, q0p0); + + out_f14_pq0 = vrshrn_n_u16(out_pq0, 4); + out_f14_pq1 = vrshrn_n_u16(out_pq1, 4); + out_f14_pq2 = vrshrn_n_u16(out_pq2, 4); + out_f14_pq3 = vrshrn_n_u16(out_pq3, 4); + out_f14_pq4 = vrshrn_n_u16(out_pq4, 4); + out_f14_pq5 = vrshrn_n_u16(out_pq5, 4); + } + { + uint8x8_t filter4_cond, filter8_cond, filter14_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + 
filter4_cond = vmvn_u8(filter8_cond); + filter14_cond = vand_u8(filter8_cond, flat2_8x8); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + + // filter14 outputs + *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); + *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); + *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); + *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); + *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); + *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); + } +} + +static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + uint8x8_t q0p0, q1p1, q2p2; + + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + // 
reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + uint8x8_t filter4_cond, filter8_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter8_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + } +} + +static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, + const uint8_t blimit, const uint8_t limit, + const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f6_pq0, out_f6_pq1; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + 
out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 6 + uint16x8_t out_pq0, out_pq1; + uint8x8_t pq_rev; + + out = vaddl_u8(*p0q0, *p1q1); + out = vaddq_u16(out, out); + out = vaddw_u8(out, *p2q2); + + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + out = vaddw_u8(out, pq_rev); + + out_pq0 = vaddw_u8(out, pq_rev); + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + out_pq0 = vaddw_u8(out_pq0, pq_rev); + + out_pq1 = vaddw_u8(out, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p2q2); + + out_f6_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f6_pq1 = vrshrn_n_u16(out_pq1, 3); + } + { + uint8x8_t filter4_cond, filter6_cond; + filter6_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter6_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter6 outputs + *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); + *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); + } +} + +static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t mask_8x8, temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + // Calculate filter mask + mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); +} + +void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x16_t row0, 
row1, row2, row3; + uint8x8_t pxp3, p6p2, p5p1, p4p0; + uint8x8_t q0q4, q1q5, q2q6, q3qy; + uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3; + uint32x2_t pq_rev; + uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6; + + // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + load_u8_16x4(src - 8, stride, &row0, &row1, &row2, &row3); + + pxp3 = vget_low_u8(row0); + p6p2 = vget_low_u8(row1); + p5p1 = vget_low_u8(row2); + p4p0 = vget_low_u8(row3); + transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + q0q4 = vget_high_u8(row0); + q1q5 = vget_high_u8(row1); + q2q6 = vget_high_u8(row2); + q3qy = vget_high_u8(row3); + transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy)); + pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev); + + p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2)); + + pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]); + p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]); + p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]); + p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]); + + q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]); + p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + row0 = vcombine_u8(pxp3, q0q4); + row1 = vcombine_u8(p6p2, q1q5); + row2 = vcombine_u8(p5p1, q2q6); + row3 = vcombine_u8(p4p0, q3qy); + + store_u8_16x4(src - 8, stride, row0, row1, row2, row3); +} + +void aom_lpf_vertical_14_dual_neon( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_14_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_14_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_vertical_14_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, + 
thresh); + aom_lpf_vertical_14_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit, + thresh, blimit, limit, thresh); +} + +void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, p3q3_p0q0; + uint32x2_t pq_rev; + uint8x8_t p3q0, p2q1, p1q2, p0q3; + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + // row0: p3 p2 p1 p0 | q0 q1 q2 q3 + // row1: p3 p2 p1 p0 | q0 q1 q2 q3 + // row2: p3 p2 p1 p0 | q0 q1 q2 q3 + // row3: p3 p2 p1 p0 | q0 q1 q2 q3 + load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3); + + transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + + lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3); +} + +void aom_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + aom_lpf_vertical_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, + thresh); + aom_lpf_vertical_8_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit, + thresh, blimit, limit, thresh); +} + +void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, pxqy_p0q0; + uint32x2_t pq_rev; + uint8x8_t pxq0, p2q1, p1q2, p0qy; + uint8x8_t p0q0, p1q1, p2q2, pxqy; + + // row0: px p2 p1 p0 | q0 q1 q2 qy + // row1: px p2 p1 p0 | q0 q1 q2 qy + // row2: px p2 p1 p0 | q0 q1 q2 qy + // row3: px p2 p1 p0 | q0 q1 q2 qy + load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy); + + transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy)); + pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + pxqy_p0q0 = 
vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy); +} + +void aom_lpf_vertical_6_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_6_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + aom_lpf_vertical_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, + thresh); + aom_lpf_vertical_6_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit, + thresh, blimit, limit, thresh); +} + +void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0; + uint32x2_t pq_rev; + uint8x8_t p1p0, q0q1; + uint8x8_t p0q0, p1q1; + + // row0: p1 p0 | q0 q1 + // row1: p1 p0 | q0 q1 + // row2: p1 p0 | q0 q1 + // row3: p1 p0 | q0 q1 + load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1); + + transpose_elems_inplace_u8_4x4(&p1p0, &q0q1); + + p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1)); + + pq_rev = vrev64_u32(p1q0_p0q1.val[1]); + p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev); + + p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]); + p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0)); + + p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]); + q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1])); + + transpose_elems_inplace_u8_4x4(&p1p0, &q0q1); + + store_u8x4_strided_x2(src - 2, 2 * stride, p1p0); + store_u8x4_strided_x2(src + stride - 2, 2 * stride, q0q1); +} + +void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + aom_lpf_vertical_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, + thresh); + aom_lpf_vertical_4_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit, + thresh, blimit, limit, thresh); +} + +void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p6q6 = load_u8_4x2(src - 7 * stride, 13 * stride); + uint8x8_t p5q5 = load_u8_4x2(src - 6 * stride, 11 * stride); + uint8x8_t p4q4 = load_u8_4x2(src - 5 * stride, 9 * stride); + uint8x8_t p3q3 = load_u8_4x2(src - 4 * stride, 7 * stride); + uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride); + uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride); + uint8x8_t p0q0 = 
load_u8_4x2(src - 1 * stride, 1 * stride); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0); + store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1); + store_u8x4_strided_x2(src - 3 * stride, 5 * stride, p2q2); + store_u8x4_strided_x2(src - 4 * stride, 7 * stride, p3q3); + store_u8x4_strided_x2(src - 5 * stride, 9 * stride, p4q4); + store_u8x4_strided_x2(src - 6 * stride, 11 * stride, p5q5); +} + +void aom_lpf_horizontal_14_dual_neon( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0); + aom_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1); +} + +// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed +// up. +void aom_lpf_horizontal_14_quad_neon(uint8_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_horizontal_14_dual_neon(s, pitch, blimit, limit, thresh, blimit, + limit, thresh); + aom_lpf_horizontal_14_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh, + blimit, limit, thresh); +} + +void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride))); + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride), + vreinterpret_u32_u8(p3q3), 1)); + + lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0); + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); + vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1); +} + +void aom_lpf_horizontal_8_dual_neon( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1); +} + +// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed +// up. 
+void aom_lpf_horizontal_8_quad_neon(uint8_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_horizontal_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, + thresh); + aom_lpf_horizontal_8_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh, + blimit, limit, thresh); +} + +void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2; + + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); +} + +void aom_lpf_horizontal_6_dual_neon( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0); + aom_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1); +} + +// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed +// up. +void aom_lpf_horizontal_6_quad_neon(uint8_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_horizontal_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit, + thresh); + aom_lpf_horizontal_6_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh, + blimit, limit, thresh); +} + +void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride); + uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0); + store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1); +} + +void aom_lpf_horizontal_4_dual_neon( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1); +} + +// TODO(any): Rewrite in NEON (similar to quad SSE2 functions) for better speed +// up. 
+void aom_lpf_horizontal_4_quad_neon(uint8_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh) {
+  aom_lpf_horizontal_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+                                 thresh);
+  aom_lpf_horizontal_4_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+                                 blimit, limit, thresh);
+}
diff --git a/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c b/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c
new file mode 100644
index 0000000000..8f65b805ec
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+                                              const uint8x16_t s0,
+                                              const uint8x16_t a0,
+                                              const uint8x16_t b0,
+                                              const uint8x16_t m0) {
+  uint8x16_t m0_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
+  uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m0), vget_low_u8(a0));
+  uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m0), vget_high_u8(a0));
+  blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m0_inv), vget_low_u8(b0));
+  blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m0_inv), vget_high_u8(b0));
+
+  uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
+  uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+  uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+  return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
+}
+
+static INLINE void masked_inv_sadwxhx4d_large_neon(
+    const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+    int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+    int mask_stride, uint32_t res[4], int width, int height, int h_overflow) {
+  uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+                        vdupq_n_u32(0) };
+  int h_limit = height > h_overflow ?
h_overflow : height; + + int ref_offset = 0; + int i = 0; + do { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src + j); + uint8x16_t p0 = vld1q_u8(second_pred + j); + uint8x16_t m0 = vld1q_u8(mask + j); + sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0, + vld1q_u8(ref[0] + ref_offset + j), m0); + sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0, + vld1q_u8(ref[1] + ref_offset + j), m0); + sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0, + vld1q_u8(ref[2] + ref_offset + j), m0); + sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0, + vld1q_u8(ref[3] + ref_offset + j), m0); + + uint8x16_t s1 = vld1q_u8(src + j + 16); + uint8x16_t p1 = vld1q_u8(second_pred + j + 16); + uint8x16_t m1 = vld1q_u8(mask + j + 16); + sum_hi[0] = masked_sad_16x1_neon( + sum_hi[0], s1, p1, vld1q_u8(ref[0] + ref_offset + j + 16), m1); + sum_hi[1] = masked_sad_16x1_neon( + sum_hi[1], s1, p1, vld1q_u8(ref[1] + ref_offset + j + 16), m1); + sum_hi[2] = masked_sad_16x1_neon( + sum_hi[2], s1, p1, vld1q_u8(ref[2] + ref_offset + j + 16), m1); + sum_hi[3] = masked_sad_16x1_neon( + sum_hi[3], s1, p1, vld1q_u8(ref[3] + ref_offset + j + 16), m1); + + j += 32; + } while (j < width); + + src += src_stride; + ref_offset += ref_stride; + second_pred += width; + mask += mask_stride; + } while (++i < h_limit); + + sum[0] = vpadalq_u16(sum[0], sum_lo[0]); + sum[0] = vpadalq_u16(sum[0], sum_hi[0]); + sum[1] = vpadalq_u16(sum[1], sum_lo[1]); + sum[1] = vpadalq_u16(sum[1], sum_hi[1]); + sum[2] = vpadalq_u16(sum[2], sum_lo[2]); + sum[2] = vpadalq_u16(sum[2], sum_hi[2]); + sum[3] = vpadalq_u16(sum[3], sum_lo[3]); + sum[3] = vpadalq_u16(sum[3], sum_hi[3]); + + h_limit += h_overflow; + } while (i < height); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum)); +} + +static INLINE void masked_inv_sad128xhx4d_neon( + const uint8_t *src, int src_stride, const uint8_t *const ref[4], + int ref_stride, const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, uint32_t res[4], int h) { + masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred, + mask, mask_stride, res, 128, h, 32); +} + +static INLINE void masked_inv_sad64xhx4d_neon( + const uint8_t *src, int src_stride, const uint8_t *const ref[4], + int ref_stride, const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, uint32_t res[4], int h) { + masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred, + mask, mask_stride, res, 64, h, 64); +} + +static INLINE void masked_sadwxhx4d_large_neon( + const uint8_t *src, int src_stride, const uint8_t *const ref[4], + int ref_stride, const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, uint32_t res[4], int width, int height, int h_overflow) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + int h_limit = height > h_overflow ? 
h_overflow : height; + + int ref_offset = 0; + int i = 0; + do { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src + j); + uint8x16_t p0 = vld1q_u8(second_pred + j); + uint8x16_t m0 = vld1q_u8(mask + j); + sum_lo[0] = masked_sad_16x1_neon( + sum_lo[0], s0, vld1q_u8(ref[0] + ref_offset + j), p0, m0); + sum_lo[1] = masked_sad_16x1_neon( + sum_lo[1], s0, vld1q_u8(ref[1] + ref_offset + j), p0, m0); + sum_lo[2] = masked_sad_16x1_neon( + sum_lo[2], s0, vld1q_u8(ref[2] + ref_offset + j), p0, m0); + sum_lo[3] = masked_sad_16x1_neon( + sum_lo[3], s0, vld1q_u8(ref[3] + ref_offset + j), p0, m0); + + uint8x16_t s1 = vld1q_u8(src + j + 16); + uint8x16_t p1 = vld1q_u8(second_pred + j + 16); + uint8x16_t m1 = vld1q_u8(mask + j + 16); + sum_hi[0] = masked_sad_16x1_neon( + sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + j + 16), p1, m1); + sum_hi[1] = masked_sad_16x1_neon( + sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + j + 16), p1, m1); + sum_hi[2] = masked_sad_16x1_neon( + sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + j + 16), p1, m1); + sum_hi[3] = masked_sad_16x1_neon( + sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + j + 16), p1, m1); + + j += 32; + } while (j < width); + + src += src_stride; + ref_offset += ref_stride; + second_pred += width; + mask += mask_stride; + } while (++i < h_limit); + + sum[0] = vpadalq_u16(sum[0], sum_lo[0]); + sum[0] = vpadalq_u16(sum[0], sum_hi[0]); + sum[1] = vpadalq_u16(sum[1], sum_lo[1]); + sum[1] = vpadalq_u16(sum[1], sum_hi[1]); + sum[2] = vpadalq_u16(sum[2], sum_lo[2]); + sum[2] = vpadalq_u16(sum[2], sum_hi[2]); + sum[3] = vpadalq_u16(sum[3], sum_lo[3]); + sum[3] = vpadalq_u16(sum[3], sum_hi[3]); + + h_limit += h_overflow; + } while (i < height); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum)); +} + +static INLINE void masked_sad128xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + uint32_t res[4], int h) { + masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred, + mask, mask_stride, res, 128, h, 32); +} + +static INLINE void masked_sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + uint32_t res[4], int h) { + masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred, + mask, mask_stride, res, 64, h, 64); +} + +static INLINE void masked_inv_sad32xhx4d_neon( + const uint8_t *src, int src_stride, const uint8_t *const ref[4], + int ref_stride, const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int ref_offset = 0; + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t m0 = vld1q_u8(mask); + sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0, + vld1q_u8(ref[0] + ref_offset), m0); + sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0, + vld1q_u8(ref[1] + ref_offset), m0); + sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0, + vld1q_u8(ref[2] + ref_offset), m0); + sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0, + 
vld1q_u8(ref[3] + ref_offset), m0); + + uint8x16_t s1 = vld1q_u8(src + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t m1 = vld1q_u8(mask + 16); + sum_hi[0] = masked_sad_16x1_neon(sum_hi[0], s1, p1, + vld1q_u8(ref[0] + ref_offset + 16), m1); + sum_hi[1] = masked_sad_16x1_neon(sum_hi[1], s1, p1, + vld1q_u8(ref[1] + ref_offset + 16), m1); + sum_hi[2] = masked_sad_16x1_neon(sum_hi[2], s1, p1, + vld1q_u8(ref[2] + ref_offset + 16), m1); + sum_hi[3] = masked_sad_16x1_neon(sum_hi[3], s1, p1, + vld1q_u8(ref[3] + ref_offset + 16), m1); + + src += src_stride; + ref_offset += ref_stride; + second_pred += 32; + mask += mask_stride; + } while (--i != 0); + + vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi)); +} + +static INLINE void masked_sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int ref_offset = 0; + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t m0 = vld1q_u8(mask); + sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, + vld1q_u8(ref[0] + ref_offset), p0, m0); + sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, + vld1q_u8(ref[1] + ref_offset), p0, m0); + sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, + vld1q_u8(ref[2] + ref_offset), p0, m0); + sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, + vld1q_u8(ref[3] + ref_offset), p0, m0); + + uint8x16_t s1 = vld1q_u8(src + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t m1 = vld1q_u8(mask + 16); + sum_hi[0] = masked_sad_16x1_neon( + sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + 16), p1, m1); + sum_hi[1] = masked_sad_16x1_neon( + sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + 16), p1, m1); + sum_hi[2] = masked_sad_16x1_neon( + sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + 16), p1, m1); + sum_hi[3] = masked_sad_16x1_neon( + sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + 16), p1, m1); + + src += src_stride; + ref_offset += ref_stride; + second_pred += 32; + mask += mask_stride; + } while (--i != 0); + + vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi)); +} + +static INLINE void masked_inv_sad16xhx4d_neon( + const uint8_t *src, int src_stride, const uint8_t *const ref[4], + int ref_stride, const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, uint32_t res[4], int h) { + uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; + + int ref_offset = 0; + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t m0 = vld1q_u8(mask); + sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0, p0, + vld1q_u8(ref[0] + ref_offset), m0); + sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0, p0, + vld1q_u8(ref[1] + ref_offset), m0); + sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0, p0, + vld1q_u8(ref[2] + ref_offset), m0); + sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0, p0, + vld1q_u8(ref[3] + ref_offset), m0); + + src += src_stride; + ref_offset += ref_stride; + second_pred += 16; + mask += mask_stride; + } while (--i != 0); + + sum_u32[0] = vpaddlq_u16(sum_u16[0]); + sum_u32[1] = vpaddlq_u16(sum_u16[1]); + sum_u32[2] = vpaddlq_u16(sum_u16[2]); + sum_u32[3] = vpaddlq_u16(sum_u16[3]); + + vst1q_u32(res, 
horizontal_add_4d_u32x4(sum_u32)); +} + +static INLINE void masked_sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + uint32_t res[4], int h) { + uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; + + int ref_offset = 0; + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t m0 = vld1q_u8(mask); + sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0, + vld1q_u8(ref[0] + ref_offset), p0, m0); + sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0, + vld1q_u8(ref[1] + ref_offset), p0, m0); + sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0, + vld1q_u8(ref[2] + ref_offset), p0, m0); + sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0, + vld1q_u8(ref[3] + ref_offset), p0, m0); + + src += src_stride; + ref_offset += ref_stride; + second_pred += 16; + mask += mask_stride; + } while (--i != 0); + + sum_u32[0] = vpaddlq_u16(sum_u16[0]); + sum_u32[1] = vpaddlq_u16(sum_u16[1]); + sum_u32[2] = vpaddlq_u16(sum_u16[2]); + sum_u32[3] = vpaddlq_u16(sum_u16[3]); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); +} + +static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, const uint8x8_t s0, + const uint8x8_t a0, + const uint8x8_t b0, + const uint8x8_t m0) { + uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0); + uint16x8_t blend_u16 = vmull_u8(m0, a0); + blend_u16 = vmlal_u8(blend_u16, m0_inv, b0); + + uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS); + return vabal_u8(sad, blend_u8, s0); +} + +static INLINE void masked_inv_sad8xhx4d_neon( + const uint8_t *src, int src_stride, const uint8_t *const ref[4], + int ref_stride, const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int ref_offset = 0; + int i = h; + do { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t p0 = vld1_u8(second_pred); + uint8x8_t m0 = vld1_u8(mask); + sum[0] = + masked_sad_8x1_neon(sum[0], s0, p0, vld1_u8(ref[0] + ref_offset), m0); + sum[1] = + masked_sad_8x1_neon(sum[1], s0, p0, vld1_u8(ref[1] + ref_offset), m0); + sum[2] = + masked_sad_8x1_neon(sum[2], s0, p0, vld1_u8(ref[2] + ref_offset), m0); + sum[3] = + masked_sad_8x1_neon(sum[3], s0, p0, vld1_u8(ref[3] + ref_offset), m0); + + src += src_stride; + ref_offset += ref_stride; + second_pred += 8; + mask += mask_stride; + } while (--i != 0); + + vst1q_u32(res, horizontal_add_4d_u16x8(sum)); +} + +static INLINE void masked_sad8xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int ref_offset = 0; + int i = h; + do { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t p0 = vld1_u8(second_pred); + uint8x8_t m0 = vld1_u8(mask); + + sum[0] = + masked_sad_8x1_neon(sum[0], s0, vld1_u8(ref[0] + ref_offset), p0, m0); + sum[1] = + masked_sad_8x1_neon(sum[1], s0, vld1_u8(ref[1] + ref_offset), p0, m0); + sum[2] = + masked_sad_8x1_neon(sum[2], s0, vld1_u8(ref[2] + ref_offset), p0, m0); + sum[3] = + masked_sad_8x1_neon(sum[3], s0, vld1_u8(ref[3] + ref_offset), p0, m0); + + src += src_stride; + ref_offset += ref_stride; + second_pred += 8; + mask += mask_stride; + } 
while (--i != 0); + + vst1q_u32(res, horizontal_add_4d_u16x8(sum)); +} + +static INLINE void masked_inv_sad4xhx4d_neon( + const uint8_t *src, int src_stride, const uint8_t *const ref[4], + int ref_stride, const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int ref_offset = 0; + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride); + uint8x8_t p0 = vld1_u8(second_pred); + uint8x8_t m0 = load_unaligned_u8(mask, mask_stride); + + sum[0] = masked_sad_8x1_neon(sum[0], s, p0, r0, m0); + sum[1] = masked_sad_8x1_neon(sum[1], s, p0, r1, m0); + sum[2] = masked_sad_8x1_neon(sum[2], s, p0, r2, m0); + sum[3] = masked_sad_8x1_neon(sum[3], s, p0, r3, m0); + + src += 2 * src_stride; + ref_offset += 2 * ref_stride; + second_pred += 2 * 4; + mask += 2 * mask_stride; + } while (--i != 0); + + vst1q_u32(res, horizontal_add_4d_u16x8(sum)); +} + +static INLINE void masked_sad4xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int ref_offset = 0; + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride); + uint8x8_t p0 = vld1_u8(second_pred); + uint8x8_t m0 = load_unaligned_u8(mask, mask_stride); + + sum[0] = masked_sad_8x1_neon(sum[0], s, r0, p0, m0); + sum[1] = masked_sad_8x1_neon(sum[1], s, r1, p0, m0); + sum[2] = masked_sad_8x1_neon(sum[2], s, r2, p0, m0); + sum[3] = masked_sad_8x1_neon(sum[3], s, r3, p0, m0); + + src += 2 * src_stride; + ref_offset += 2 * ref_stride; + second_pred += 2 * 4; + mask += 2 * mask_stride; + } while (--i != 0); + + vst1q_u32(res, horizontal_add_4d_u16x8(sum)); +} + +#define MASKED_SAD4D_WXH_NEON(w, h) \ + void aom_masked_sad##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int invert_mask, uint32_t res[4]) { \ + if (invert_mask) { \ + masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \ + second_pred, msk, msk_stride, res, h); \ + } else { \ + masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, second_pred, \ + msk, msk_stride, res, h); \ + } \ + } + +MASKED_SAD4D_WXH_NEON(4, 8) +MASKED_SAD4D_WXH_NEON(4, 4) + +MASKED_SAD4D_WXH_NEON(8, 16) +MASKED_SAD4D_WXH_NEON(8, 8) +MASKED_SAD4D_WXH_NEON(8, 4) + +MASKED_SAD4D_WXH_NEON(16, 32) +MASKED_SAD4D_WXH_NEON(16, 16) +MASKED_SAD4D_WXH_NEON(16, 8) + +MASKED_SAD4D_WXH_NEON(32, 64) +MASKED_SAD4D_WXH_NEON(32, 32) +MASKED_SAD4D_WXH_NEON(32, 16) + +MASKED_SAD4D_WXH_NEON(64, 128) +MASKED_SAD4D_WXH_NEON(64, 64) +MASKED_SAD4D_WXH_NEON(64, 32) + +MASKED_SAD4D_WXH_NEON(128, 128) +MASKED_SAD4D_WXH_NEON(128, 64) + +#if !CONFIG_REALTIME_ONLY 
+MASKED_SAD4D_WXH_NEON(4, 16)
+MASKED_SAD4D_WXH_NEON(16, 4)
+MASKED_SAD4D_WXH_NEON(8, 32)
+MASKED_SAD4D_WXH_NEON(32, 8)
+MASKED_SAD4D_WXH_NEON(16, 64)
+MASKED_SAD4D_WXH_NEON(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/arm/masked_sad_neon.c b/third_party/aom/aom_dsp/arm/masked_sad_neon.c
new file mode 100644
index 0000000000..9d263105e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/masked_sad_neon.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+                                              const uint8_t *src,
+                                              const uint8_t *a,
+                                              const uint8_t *b,
+                                              const uint8_t *m) {
+  uint8x16_t m0 = vld1q_u8(m);
+  uint8x16_t a0 = vld1q_u8(a);
+  uint8x16_t b0 = vld1q_u8(b);
+  uint8x16_t s0 = vld1q_u8(src);
+
+  uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, a0, b0);
+
+  return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
+}
+
+static INLINE unsigned masked_sad_128xh_neon(const uint8_t *src, int src_stride,
+                                             const uint8_t *a, int a_stride,
+                                             const uint8_t *b, int b_stride,
+                                             const uint8_t *m, int m_stride,
+                                             int height) {
+  // Eight accumulator vectors are required to avoid overflow in the 128x128
+  // case.
+  assert(height <= 128);
+  uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                       vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+                       vdupq_n_u16(0), vdupq_n_u16(0) };
+
+  do {
+    sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]);
+    sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]);
+    sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]);
+    sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]);
+    sad[4] = masked_sad_16x1_neon(sad[4], &src[64], &a[64], &b[64], &m[64]);
+    sad[5] = masked_sad_16x1_neon(sad[5], &src[80], &a[80], &b[80], &m[80]);
+    sad[6] = masked_sad_16x1_neon(sad[6], &src[96], &a[96], &b[96], &m[96]);
+    sad[7] = masked_sad_16x1_neon(sad[7], &src[112], &a[112], &b[112], &m[112]);
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+    height--;
+  } while (height != 0);
+
+  return horizontal_long_add_u16x8(sad[0], sad[1]) +
+         horizontal_long_add_u16x8(sad[2], sad[3]) +
+         horizontal_long_add_u16x8(sad[4], sad[5]) +
+         horizontal_long_add_u16x8(sad[6], sad[7]);
+}
+
+static INLINE unsigned masked_sad_64xh_neon(const uint8_t *src, int src_stride,
+                                            const uint8_t *a, int a_stride,
+                                            const uint8_t *b, int b_stride,
+                                            const uint8_t *m, int m_stride,
+                                            int height) {
+  // Four accumulator vectors are required to avoid overflow in the 64x128 case.
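+  // (Each accumulator covers one 16-pixel column strip; vpadalq_u8 adds at
+  // most 2 * 255 = 510 to a uint16 lane per row, so 128 rows reach at most
+  // 65280, which still fits in 16 bits.)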
+ assert(height <= 128); + uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + do { + sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]); + sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]); + sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]); + sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + height--; + } while (height != 0); + + return horizontal_long_add_u16x8(sad[0], sad[1]) + + horizontal_long_add_u16x8(sad[2], sad[3]); +} + +static INLINE unsigned masked_sad_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, + int height) { + // We could use a single accumulator up to height=64 without overflow. + assert(height <= 64); + uint16x8_t sad = vdupq_n_u16(0); + + do { + sad = masked_sad_16x1_neon(sad, &src[0], &a[0], &b[0], &m[0]); + sad = masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + height--; + } while (height != 0); + + return horizontal_add_u16x8(sad); +} + +static INLINE unsigned masked_sad_16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, + int height) { + // We could use a single accumulator up to height=128 without overflow. + assert(height <= 128); + uint16x8_t sad = vdupq_n_u16(0); + + do { + sad = masked_sad_16x1_neon(sad, src, a, b, m); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + height--; + } while (height != 0); + + return horizontal_add_u16x8(sad); +} + +static INLINE unsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, + int height) { + // We could use a single accumulator up to height=128 without overflow. + assert(height <= 128); + uint16x4_t sad = vdup_n_u16(0); + + do { + uint8x8_t m0 = vld1_u8(m); + uint8x8_t a0 = vld1_u8(a); + uint8x8_t b0 = vld1_u8(b); + uint8x8_t s0 = vld1_u8(src); + + uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0); + + sad = vpadal_u8(sad, vabd_u8(blend_u8, s0)); + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + height--; + } while (height != 0); + + return horizontal_add_u16x4(sad); +} + +static INLINE unsigned masked_sad_4xh_neon(const uint8_t *src, int src_stride, + const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, + int height) { + // Process two rows per loop iteration. + assert(height % 2 == 0); + + // We could use a single accumulator up to height=256 without overflow. 
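+  // (Two rows are processed per iteration, so each uint16 lane gains at most
+  // 2 * 255 = 510 per iteration via vpadal_u8; 128 iterations, i.e. 256 rows,
+  // keep every lane at or below 65280.)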
+  assert(height <= 256);
+  uint16x4_t sad = vdup_n_u16(0);
+
+  do {
+    uint8x8_t m0 = load_unaligned_u8(m, m_stride);
+    uint8x8_t a0 = load_unaligned_u8(a, a_stride);
+    uint8x8_t b0 = load_unaligned_u8(b, b_stride);
+    uint8x8_t s0 = load_unaligned_u8(src, src_stride);
+
+    uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
+
+    sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
+
+    src += 2 * src_stride;
+    a += 2 * a_stride;
+    b += 2 * b_stride;
+    m += 2 * m_stride;
+    height -= 2;
+  } while (height != 0);
+
+  return horizontal_add_u16x4(sad);
+}
+
+#define MASKED_SAD_WXH_NEON(width, height)                                     \
+  unsigned aom_masked_sad##width##x##height##_neon(                           \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
+      int invert_mask) {                                                       \
+    if (!invert_mask)                                                          \
+      return masked_sad_##width##xh_neon(src, src_stride, ref, ref_stride,    \
+                                         second_pred, width, msk, msk_stride, \
+                                         height);                             \
+    else                                                                       \
+      return masked_sad_##width##xh_neon(src, src_stride, second_pred, width, \
+                                         ref, ref_stride, msk, msk_stride,    \
+                                         height);                             \
+  }
+
+MASKED_SAD_WXH_NEON(4, 4)
+MASKED_SAD_WXH_NEON(4, 8)
+MASKED_SAD_WXH_NEON(8, 4)
+MASKED_SAD_WXH_NEON(8, 8)
+MASKED_SAD_WXH_NEON(8, 16)
+MASKED_SAD_WXH_NEON(16, 8)
+MASKED_SAD_WXH_NEON(16, 16)
+MASKED_SAD_WXH_NEON(16, 32)
+MASKED_SAD_WXH_NEON(32, 16)
+MASKED_SAD_WXH_NEON(32, 32)
+MASKED_SAD_WXH_NEON(32, 64)
+MASKED_SAD_WXH_NEON(64, 32)
+MASKED_SAD_WXH_NEON(64, 64)
+MASKED_SAD_WXH_NEON(64, 128)
+MASKED_SAD_WXH_NEON(128, 64)
+MASKED_SAD_WXH_NEON(128, 128)
+#if !CONFIG_REALTIME_ONLY
+MASKED_SAD_WXH_NEON(4, 16)
+MASKED_SAD_WXH_NEON(16, 4)
+MASKED_SAD_WXH_NEON(8, 32)
+MASKED_SAD_WXH_NEON(32, 8)
+MASKED_SAD_WXH_NEON(16, 64)
+MASKED_SAD_WXH_NEON(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/arm/mem_neon.h b/third_party/aom/aom_dsp/arm/mem_neon.h
new file mode 100644
index 0000000000..52c7a34e3e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/mem_neon.h
@@ -0,0 +1,1253 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
+#define AOM_AOM_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
+
+// Support for xN Neon intrinsics is lacking in some compilers.
+#if defined(__arm__) || defined(_M_ARM)
+#define ARM_32_BIT
+#endif
+
+// DEFICIENT_CLANG_32_BIT includes clang-cl.
+#if defined(__clang__) && defined(ARM_32_BIT) && \
+    (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
+#define DEFICIENT_CLANG_32_BIT  // This includes clang-cl.
+#endif + +#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT) +#define GCC_32_BIT +#endif + +#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT) + +static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) { + uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16), + vld1q_u8(ptr + 2 * 16) } }; + return res; +} + +static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) { + uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; + return res; +} + +static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) { + uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } }; + return res; +} + +static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { + uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8), + vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } }; + return res; +} + +#elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit. +#if __GNUC__ < 8 + +static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) { + uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; + return res; +} + +static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) { + uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8), + vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } }; + return res; +} +#endif // __GNUC__ < 8 + +#if __GNUC__ < 9 +static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) { + uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16), + vld1q_u8(ptr + 2 * 16) } }; + return res; +} +#endif // __GNUC__ < 9 +#endif // defined(__GNUC__) && !defined(__clang__) + +static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, + const uint8x8_t s1) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; +} + +static INLINE uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) { + return vcombine_u8(vld1_u8(s), vld1_u8(s + p)); +} + +// Load four bytes into the low half of a uint8x8_t, zero the upper half. 
+static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) { + uint8x8_t ret = vdup_n_u8(0); + ret = vreinterpret_u8_u32( + vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0)); + return ret; +} + +static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) { + uint8x8_t ret = vdup_n_u8(0); + ret = vreinterpret_u8_u32( + vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0)); + p += stride; + ret = vreinterpret_u8_u32( + vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1)); + return ret; +} + +static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) { + uint16x4_t ret = vdup_n_u16(0); + ret = vreinterpret_u16_u32( + vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0)); + p += stride; + ret = vreinterpret_u16_u32( + vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1)); + return ret; +} + +static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void load_u8_8x7(const uint8_t *s, ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); +} + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; +} + +static INLINE void load_u16_4x7(const uint16_t *s, ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3, + uint16x4_t *const s4, uint16x4_t *const s5, + uint16x4_t *const s6) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; + *s4 = vld1_u16(s); + s += p; + *s5 = vld1_u16(s); + s += p; + *s6 = vld1_u16(s); +} + +static INLINE void load_s16_8x2(const int16_t *s, const ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); +} + +static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); +} + +static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; +} + +static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t 
*const s2, int16x4_t *const s3, + int16x4_t *const s4, int16x4_t *const s5, + int16x4_t *const s6, int16x4_t *const s7, + int16x4_t *const s8, int16x4_t *const s9, + int16x4_t *const s10, int16x4_t *const s11) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); + s += p; + *s8 = vld1_s16(s); + s += p; + *s9 = vld1_s16(s); + s += p; + *s10 = vld1_s16(s); + s += p; + *s11 = vld1_s16(s); +} + +static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3, + int16x4_t *const s4, int16x4_t *const s5, + int16x4_t *const s6, int16x4_t *const s7, + int16x4_t *const s8, int16x4_t *const s9, + int16x4_t *const s10) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); + s += p; + *s8 = vld1_s16(s); + s += p; + *s9 = vld1_s16(s); + s += p; + *s10 = vld1_s16(s); +} + +static INLINE void load_u16_4x11(const uint16_t *s, ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3, + uint16x4_t *const s4, uint16x4_t *const s5, + uint16x4_t *const s6, uint16x4_t *const s7, + uint16x4_t *const s8, uint16x4_t *const s9, + uint16x4_t *const s10) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; + *s4 = vld1_u16(s); + s += p; + *s5 = vld1_u16(s); + s += p; + *s6 = vld1_u16(s); + s += p; + *s7 = vld1_u16(s); + s += p; + *s8 = vld1_u16(s); + s += p; + *s9 = vld1_u16(s); + s += p; + *s10 = vld1_u16(s); +} + +static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3, + int16x4_t *const s4, int16x4_t *const s5, + int16x4_t *const s6, int16x4_t *const s7) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); +} + +static INLINE void load_s16_4x7(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3, + int16x4_t *const s4, int16x4_t *const s5, + int16x4_t *const s6) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); +} + +static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3, + int16x4_t *const s4, int16x4_t *const s5) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); +} + +static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3, + int16x4_t *const s4) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); +} + +static INLINE void load_u16_4x5(const 
uint16_t *s, const ptrdiff_t p, + uint16x4_t *const s0, uint16x4_t *const s1, + uint16x4_t *const s2, uint16x4_t *const s3, + uint16x4_t *const s4) { + *s0 = vld1_u16(s); + s += p; + *s1 = vld1_u16(s); + s += p; + *s2 = vld1_u16(s); + s += p; + *s3 = vld1_u16(s); + s += p; + *s4 = vld1_u16(s); + s += p; +} + +static INLINE void load_u8_8x5(const uint8_t *s, ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); +} + +static INLINE void load_u16_8x5(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3, + uint16x8_t *const s4) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; +} + +static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3, const uint8x8_t s4, + const uint8x8_t s5, const uint8x8_t s6, + const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0, + const uint8x16_t s1, const uint8x16_t s2, + const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3, + const uint16x8_t s4, const uint16x8_t s5, + const uint16x8_t s6, const uint16x8_t s7) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); + s += dst_stride; + vst1q_u16(s, s3); + s += dst_stride; + vst1q_u16(s, s4); + s += dst_stride; + vst1q_u16(s, s5); + s += dst_stride; + vst1q_u16(s, s6); + s += dst_stride; + vst1q_u16(s, s7); +} + +static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); + s += dst_stride; + vst1_u16(s, s3); +} + +static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); +} + +static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); + s += dst_stride; + vst1q_u16(s, s3); +} + +static INLINE 
void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride, + const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); + s += dst_stride; + vst1q_s16(s, s2); + s += dst_stride; + vst1q_s16(s, s3); + s += dst_stride; + vst1q_s16(s, s4); + s += dst_stride; + vst1q_s16(s, s5); + s += dst_stride; + vst1q_s16(s, s6); + s += dst_stride; + vst1q_s16(s, s7); +} + +static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride, + const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3) { + vst1_s16(s, s0); + s += dst_stride; + vst1_s16(s, s1); + s += dst_stride; + vst1_s16(s, s2); + s += dst_stride; + vst1_s16(s, s3); +} + +static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride, + const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); + s += dst_stride; + vst1q_s16(s, s2); + s += dst_stride; + vst1q_s16(s, s3); +} + +static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7, + uint8x8_t *const s8, uint8x8_t *const s9, + uint8x8_t *const s10) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); + s += p; + *s8 = vld1_u8(s); + s += p; + *s9 = vld1_u8(s); + s += p; + *s10 = vld1_u8(s); +} + +static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7, + int16x8_t *const s8, int16x8_t *const s9) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); + s += p; + *s8 = vld1q_s16(s); + s += p; + *s9 = vld1q_s16(s); +} + +static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7, + int16x8_t *const s8, int16x8_t *const s9, + int16x8_t *const s10) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); + s += p; + *s8 = vld1q_s16(s); + s += p; + *s9 = vld1q_s16(s); + s += p; + *s10 = vld1q_s16(s); +} + +static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7, + int16x8_t *const s8, int16x8_t *const s9, + int16x8_t *const s10, int16x8_t *const s11) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + 
s += p; + *s7 = vld1q_s16(s); + s += p; + *s8 = vld1q_s16(s); + s += p; + *s9 = vld1q_s16(s); + s += p; + *s10 = vld1q_s16(s); + s += p; + *s11 = vld1q_s16(s); +} + +static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3, + uint16x8_t *const s4, uint16x8_t *const s5, + uint16x8_t *const s6, uint16x8_t *const s7, + uint16x8_t *const s8, uint16x8_t *const s9, + uint16x8_t *const s10) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); + s += p; + *s8 = vld1q_u16(s); + s += p; + *s9 = vld1q_u16(s); + s += p; + *s10 = vld1q_u16(s); +} + +static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + +static INLINE void load_u16_8x7(const uint16_t *s, ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3, + uint16x8_t *const s4, uint16x8_t *const s5, + uint16x8_t *const s6) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); +} + +static INLINE void load_s16_8x7(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); +} + +static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); +} + +static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); +} + +static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); +} + +// Load 2 sets of 4 bytes when alignment is not guaranteed. 
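The load_unaligned_* helpers that follow all use the same idiom: rather than casting to a wider pointer type (which risks undefined behaviour when the address is not suitably aligned), the bytes are first copied into a scalar with memcpy, which compilers fold into a single 32-bit load, and the scalar is then moved into a Neon lane. A minimal self-contained sketch of the idiom; the helper name below is illustrative and not part of this patch:

#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

// Load 4 pixels from a possibly unaligned address without undefined
// behaviour: memcpy into a scalar, then broadcast it into a Neon D register.
static inline uint8x8_t load_4_pixels_unaligned(const uint8_t *buf) {
  uint32_t a;
  memcpy(&a, buf, 4);
  return vreinterpret_u8_u32(vdup_n_u32(a));
}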
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { + uint32_t a; + memcpy(&a, buf, 4); + buf += stride; + uint32x2_t a_u32 = vdup_n_u32(a); + memcpy(&a, buf, 4); + a_u32 = vset_lane_u32(a, a_u32, 1); + return vreinterpret_u8_u32(a_u32); +} + +// Load 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { + uint32_t a; + uint32x4_t a_u32; + if (stride == 4) return vld1q_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vdupq_n_u32(a); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 1); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 2); + memcpy(&a, buf, 4); + a_u32 = vsetq_lane_u32(a, a_u32, 3); + return vreinterpretq_u8_u32(a_u32); +} + +static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) { + uint16_t a; + uint16x4_t a_u16; + + memcpy(&a, buf, 2); + buf += stride; + a_u16 = vdup_n_u16(a); + memcpy(&a, buf, 2); + a_u16 = vset_lane_u16(a, a_u16, 1); + return vreinterpret_u8_u16(a_u16); +} + +static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { + uint32_t a; + uint32x2_t a_u32; + + memcpy(&a, buf, 4); + a_u32 = vdup_n_u32(0); + a_u32 = vset_lane_u32(a, a_u32, 0); + return vreinterpret_u8_u32(a_u32); +} + +static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) { + uint32_t a; + uint32x2_t a_u32; + + memcpy(&a, buf, 4); + a_u32 = vdup_n_u32(a); + return vreinterpret_u8_u32(a_u32); +} + +static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) { + uint16_t a; + uint16x4_t a_u32; + + memcpy(&a, buf, 2); + a_u32 = vdup_n_u16(a); + return vreinterpret_u8_u16(a_u32); +} + +static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) { + uint32_t a; + uint32x2_t a_u32; + + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vdup_n_u32(a); + memcpy(&a, buf, 4); + a_u32 = vset_lane_u32(a, a_u32, 1); + return vreinterpret_u8_u32(a_u32); +} + +static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride, + uint8x8_t *tu0, uint8x8_t *tu1) { + *tu0 = load_unaligned_u8_4x2(buf, stride); + buf += 2 * stride; + *tu1 = load_unaligned_u8_4x2(buf, stride); +} + +static INLINE void load_unaligned_u8_3x8(const uint8_t *buf, int stride, + uint8x8_t *tu0, uint8x8_t *tu1, + uint8x8_t *tu2) { + load_unaligned_u8_4x4(buf, stride, tu0, tu1); + buf += 4 * stride; + *tu2 = load_unaligned_u8_4x2(buf, stride); +} + +static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride, + uint8x8_t *tu0, uint8x8_t *tu1, + uint8x8_t *tu2, uint8x8_t *tu3) { + load_unaligned_u8_4x4(buf, stride, tu0, tu1); + buf += 4 * stride; + load_unaligned_u8_4x4(buf, stride, tu2, tu3); +} + +static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void load_u16_8x8(const 
uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, + uint16x8_t *s6, uint16x8_t *s7) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); +} + +static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3, + uint16x8_t *const s4, uint16x8_t *const s5, + uint16x8_t *const s6, uint16x8_t *const s7) { + *s0 = vld1q_u16(s); + *s1 = vld1q_u16(s + 8); + s += p; + *s2 = vld1q_u16(s); + *s3 = vld1q_u16(s + 8); + s += p; + *s4 = vld1q_u16(s); + *s5 = vld1q_u16(s + 8); + s += p; + *s6 = vld1q_u16(s); + *s7 = vld1q_u16(s + 8); +} + +static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf, + int stride) { + uint32_t a; + uint32x2_t a_u32; + + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vdup_n_u32(a); + memcpy(&a, buf, 4); + a_u32 = vset_lane_u32(a, a_u32, 1); + return vreinterpret_u16_u32(a_u32); +} + +static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) { + uint64_t a; + uint64x1_t a_u64 = vdup_n_u64(0); + memcpy(&a, buf, 8); + a_u64 = vset_lane_u64(a, a_u64, 0); + return vreinterpret_u16_u64(a_u64); +} + +static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf, + uint32_t stride) { + uint64_t a; + uint64x2_t a_u64; + + memcpy(&a, buf, 8); + buf += stride; + a_u64 = vdupq_n_u64(0); + a_u64 = vsetq_lane_u64(a, a_u64, 0); + memcpy(&a, buf, 8); + buf += stride; + a_u64 = vsetq_lane_u64(a, a_u64, 1); + return vreinterpretq_u16_u64(a_u64); +} + +static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride, + uint16x8_t *tu0, uint16x8_t *tu1) { + *tu0 = load_unaligned_u16_4x2(buf, stride); + buf += 2 * stride; + *tu1 = load_unaligned_u16_4x2(buf, stride); +} + +static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1, + int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) { + *s1 = vld1q_s32(s); + s += p; + *s2 = vld1q_s32(s); + s += p; + *s3 = vld1q_s32(s); + s += p; + *s4 = vld1q_s32(s); +} + +static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1, + int32x4_t s2, int32x4_t s3, int32x4_t s4) { + vst1q_s32(s, s1); + s += p; + vst1q_s32(s, s2); + s += p; + vst1q_s32(s, s3); + s += p; + vst1q_s32(s, s4); +} + +static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1, + uint32x4_t *s2, uint32x4_t *s3, + uint32x4_t *s4) { + *s1 = vld1q_u32(s); + s += p; + *s2 = vld1q_u32(s); + s += p; + *s3 = vld1q_u32(s); + s += p; + *s4 = vld1q_u32(s); +} + +static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, + uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) { + vst1q_u32(s, s1); + s += p; + vst1q_u32(s, s2); + s += p; + vst1q_u32(s, s3); + s += p; + vst1q_u32(s, s4); +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +} + +static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) { + const 
int32x4_t v0 = vmovl_s16(a); + vst1q_s32(buf, v0); +} + +static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src, + int16x8_t indices) { + // Recent Clang and GCC versions correctly identify that this zero-broadcast + // is redundant. Alternatively we could load and broadcast the zeroth element + // and then replace the other lanes, however this is slower than loading a + // single element without broadcast on some micro-architectures. + uint8x8_t ret = vdup_n_u8(0); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7); + return ret; +} + +// The `lane` parameter here must be an immediate. +#define store_u8_2x1_lane(dst, src, lane) \ + do { \ + uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \ + memcpy(dst, &a, 2); \ + } while (0) + +#define store_u8_4x1_lane(dst, src, lane) \ + do { \ + uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + +#define store_u16_2x1_lane(dst, src, lane) \ + do { \ + uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + +#define store_u16_4x1_lane(dst, src, lane) \ + do { \ + uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \ + memcpy(dst, &a, 8); \ + } while (0) + +// Store the low 16-bits from a single vector. +static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) { + store_u8_2x1_lane(dst, src, 0); +} + +// Store the low 32-bits from a single vector. +static INLINE void store_u8_4x1(uint8_t *dst, const uint8x8_t src) { + store_u8_4x1_lane(dst, src, 0); +} + +// Store two blocks of 16-bits from a single vector. +static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride, + uint8x8_t src) { + store_u8_2x1_lane(dst, src, 0); + dst += dst_stride; + store_u8_2x1_lane(dst, src, 1); +} + +// Store two blocks of 32-bits from a single vector. +static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride, + uint8x8_t src) { + store_u8_4x1_lane(dst, src, 0); + dst += stride; + store_u8_4x1_lane(dst, src, 1); +} + +// Store four blocks of 32-bits from a single vector. +static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride, + uint8x16_t src) { + store_u8_4x1_lane(dst, vget_low_u8(src), 0); + dst += stride; + store_u8_4x1_lane(dst, vget_low_u8(src), 1); + dst += stride; + store_u8_4x1_lane(dst, vget_high_u8(src), 0); + dst += stride; + store_u8_4x1_lane(dst, vget_high_u8(src), 1); +} + +// Store the low 32-bits from a single vector. +static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) { + store_u16_2x1_lane(dst, src, 0); +} + +// Store two blocks of 32-bits from a single vector. +static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride, + uint16x4_t src) { + store_u16_2x1_lane(dst, src, 0); + dst += dst_stride; + store_u16_2x1_lane(dst, src, 1); +} + +// Store two blocks of 64-bits from a single vector. 
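The store_*x1_lane helpers above are written as macros rather than functions because the lane argument of the Neon lane-extract intrinsics must be a compile-time constant; a wrapper taking lane as a runtime parameter would not compile. As a rough sketch, this is what store_u8_4x1_lane expands to for lane 0 (the function name is illustrative only):

#include <arm_neon.h>
#include <stdint.h>
#include <string.h>

// Extract the low 32 bits of a D register into a scalar and memcpy it out,
// so the destination pointer does not need to be 4-byte aligned.
static inline void store_low_4_bytes(uint8_t *dst, uint8x8_t src) {
  uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), 0);
  memcpy(dst, &a, 4);
}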
+static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
+                                          uint16x8_t src) {
+  store_u16_4x1_lane(dst, src, 0);
+  dst += dst_stride;
+  store_u16_4x1_lane(dst, src, 1);
+}
+
+#undef store_u8_2x1_lane
+#undef store_u8_4x1_lane
+#undef store_u16_2x1_lane
+#undef store_u16_4x1_lane
+
+#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/obmc_sad_neon.c b/third_party/aom/aom_dsp/arm/obmc_sad_neon.c
new file mode 100644
index 0000000000..a692cbb388
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/obmc_sad_neon.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
+static INLINE void obmc_sad_8x1_s16_neon(int16x8_t ref_s16, const int32_t *mask,
+                                         const int32_t *wsrc, uint32x4_t *sum) {
+  int32x4_t wsrc_lo = vld1q_s32(wsrc);
+  int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+
+  int32x4_t mask_lo = vld1q_s32(mask);
+  int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+  int16x8_t mask_s16 =
+      vuzpq_s16(vreinterpretq_s16_s32(mask_lo), vreinterpretq_s16_s32(mask_hi))
+          .val[0];
+
+  int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
+  int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
+
+  uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+  uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+  *sum = vrsraq_n_u32(*sum, abs_lo, 12);
+  *sum = vrsraq_n_u32(*sum, abs_hi, 12);
+}
+
+#if AOM_ARCH_AARCH64
+
+// Use tbl for doing a double-width zero extension from 8->32 bits since we can
+// do this in one instruction rather than two (indices out of range (255 here)
+// are set to zero by tbl).
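For comparison with the comment above, a minimal sketch of the conventional alternative that the TBL lookup replaces: zero-extending 8-bit lanes to 32 bits with widening moves takes two instructions per group of four lanes. The function name is illustrative and not part of this patch:

#include <arm_neon.h>

// Zero-extend the lowest four 8-bit lanes of a Q register to 32 bits using
// two widening moves instead of a single TBL permute.
static inline uint32x4_t widen_low4_u8_to_u32(uint8x16_t v) {
  uint16x8_t v_u16 = vmovl_u8(vget_low_u8(v));  // 8-bit -> 16-bit
  return vmovl_u16(vget_low_u16(v_u16));        // 16-bit -> 32-bit, lanes 0..3
}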
+DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = { + 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, + 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255, + 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255, + 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255 +}; + +static INLINE void obmc_sad_8x1_s32_neon(uint32x4_t ref_u32_lo, + uint32x4_t ref_u32_hi, + const int32_t *mask, + const int32_t *wsrc, + uint32x4_t sum[2]) { + int32x4_t wsrc_lo = vld1q_s32(wsrc); + int32x4_t wsrc_hi = vld1q_s32(wsrc + 4); + int32x4_t mask_lo = vld1q_s32(mask); + int32x4_t mask_hi = vld1q_s32(mask + 4); + + int32x4_t pre_lo = vmulq_s32(vreinterpretq_s32_u32(ref_u32_lo), mask_lo); + int32x4_t pre_hi = vmulq_s32(vreinterpretq_s32_u32(ref_u32_hi), mask_hi); + + uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo)); + uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi)); + + sum[0] = vrsraq_n_u32(sum[0], abs_lo, 12); + sum[1] = vrsraq_n_u32(sum[1], abs_hi, 12); +} + +static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, int width, + int height) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + // Use tbl for doing a double-width zero extension from 8->32 bits since we + // can do this in one instruction rather than two. + uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]); + uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]); + uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]); + uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]); + + int h = height; + do { + int w = width; + const uint8_t *ref_ptr = ref; + do { + uint8x16_t r = vld1q_u8(ref_ptr); + + uint32x4_t ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx0)); + uint32x4_t ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx1)); + obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask, wsrc, sum); + + ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx2)); + ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx3)); + obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask + 8, wsrc + 8, sum); + + ref_ptr += 16; + wsrc += 16; + mask += 16; + w -= 16; + } while (w != 0); + + ref += ref_stride; + } while (--h != 0); + + return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); +} + +#else // !AOM_ARCH_AARCH64 + +static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, int width, + int height) { + uint32x4_t sum = vdupq_n_u32(0); + + int h = height; + do { + int w = width; + const uint8_t *ref_ptr = ref; + do { + uint8x16_t r = vld1q_u8(ref_ptr); + + int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r))); + obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum); + + ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r))); + obmc_sad_8x1_s16_neon(ref_s16, mask + 8, wsrc + 8, &sum); + + ref_ptr += 16; + wsrc += 16; + mask += 16; + w -= 16; + } while (w != 0); + + ref += ref_stride; + } while (--h != 0); + + return horizontal_add_u32x4(sum); +} + +#endif // AOM_ARCH_AARCH64 + +static INLINE unsigned int obmc_sad_128xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, int h) { + return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h); +} + +static INLINE unsigned int obmc_sad_64xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + 
const int32_t *mask, int h) { + return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h); +} + +static INLINE unsigned int obmc_sad_32xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, int h) { + return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 32, h); +} + +static INLINE unsigned int obmc_sad_16xh_neon(const uint8_t *ref, + int ref_stride, + const int32_t *wsrc, + const int32_t *mask, int h) { + return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h); +} + +static INLINE unsigned int obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride, + const int32_t *wsrc, + const int32_t *mask, int height) { + uint32x4_t sum = vdupq_n_u32(0); + + int h = height; + do { + uint8x8_t r = vld1_u8(ref); + + int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r)); + obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum); + + ref += ref_stride; + wsrc += 8; + mask += 8; + } while (--h != 0); + + return horizontal_add_u32x4(sum); +} + +static INLINE unsigned int obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride, + const int32_t *wsrc, + const int32_t *mask, int height) { + uint32x4_t sum = vdupq_n_u32(0); + + int h = height / 2; + do { + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r)); + obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum); + + ref += 2 * ref_stride; + wsrc += 8; + mask += 8; + } while (--h != 0); + + return horizontal_add_u32x4(sum); +} + +#define OBMC_SAD_WXH_NEON(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_neon( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \ + } + +OBMC_SAD_WXH_NEON(4, 4) +OBMC_SAD_WXH_NEON(4, 8) +OBMC_SAD_WXH_NEON(4, 16) + +OBMC_SAD_WXH_NEON(8, 4) +OBMC_SAD_WXH_NEON(8, 8) +OBMC_SAD_WXH_NEON(8, 16) +OBMC_SAD_WXH_NEON(8, 32) + +OBMC_SAD_WXH_NEON(16, 4) +OBMC_SAD_WXH_NEON(16, 8) +OBMC_SAD_WXH_NEON(16, 16) +OBMC_SAD_WXH_NEON(16, 32) +OBMC_SAD_WXH_NEON(16, 64) + +OBMC_SAD_WXH_NEON(32, 8) +OBMC_SAD_WXH_NEON(32, 16) +OBMC_SAD_WXH_NEON(32, 32) +OBMC_SAD_WXH_NEON(32, 64) + +OBMC_SAD_WXH_NEON(64, 16) +OBMC_SAD_WXH_NEON(64, 32) +OBMC_SAD_WXH_NEON(64, 64) +OBMC_SAD_WXH_NEON(64, 128) + +OBMC_SAD_WXH_NEON(128, 64) +OBMC_SAD_WXH_NEON(128, 128) diff --git a/third_party/aom/aom_dsp/arm/obmc_variance_neon.c b/third_party/aom/aom_dsp/arm/obmc_variance_neon.c new file mode 100644 index 0000000000..50cd5f3b6a --- /dev/null +++ b/third_party/aom/aom_dsp/arm/obmc_variance_neon.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "mem_neon.h" +#include "sum_neon.h" + +static INLINE void obmc_variance_8x1_s16_neon(int16x8_t pre_s16, + const int32_t *wsrc, + const int32_t *mask, + int32x4_t *ssev, + int32x4_t *sumv) { + // For 4xh and 8xh we observe it is faster to avoid the double-widening of + // pre. 
Instead we do a single widening step and narrow the mask to 16-bits + // to allow us to perform a widening multiply. Widening multiply + // instructions have better throughput on some micro-architectures but for + // the larger block sizes this benefit is outweighed by the additional + // instruction needed to first narrow the mask vectors. + + int32x4_t wsrc_s32_lo = vld1q_s32(&wsrc[0]); + int32x4_t wsrc_s32_hi = vld1q_s32(&wsrc[4]); + int16x8_t mask_s16 = vuzpq_s16(vreinterpretq_s16_s32(vld1q_s32(&mask[0])), + vreinterpretq_s16_s32(vld1q_s32(&mask[4]))) + .val[0]; + + int32x4_t diff_s32_lo = + vmlsl_s16(wsrc_s32_lo, vget_low_s16(pre_s16), vget_low_s16(mask_s16)); + int32x4_t diff_s32_hi = + vmlsl_s16(wsrc_s32_hi, vget_high_s16(pre_s16), vget_high_s16(mask_s16)); + + // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away + // from zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. + // This difference only affects the bit patterns at the rounding breakpoints + // exactly, so we can add -1 to all negative numbers to move the breakpoint + // one value across and into the correct rounding region. + diff_s32_lo = vsraq_n_s32(diff_s32_lo, diff_s32_lo, 31); + diff_s32_hi = vsraq_n_s32(diff_s32_hi, diff_s32_hi, 31); + int32x4_t round_s32_lo = vrshrq_n_s32(diff_s32_lo, 12); + int32x4_t round_s32_hi = vrshrq_n_s32(diff_s32_hi, 12); + + *sumv = vrsraq_n_s32(*sumv, diff_s32_lo, 12); + *sumv = vrsraq_n_s32(*sumv, diff_s32_hi, 12); + *ssev = vmlaq_s32(*ssev, round_s32_lo, round_s32_lo); + *ssev = vmlaq_s32(*ssev, round_s32_hi, round_s32_hi); +} + +#if AOM_ARCH_AARCH64 + +// Use tbl for doing a double-width zero extension from 8->32 bits since we can +// do this in one instruction rather than two (indices out of range (255 here) +// are set to zero by tbl). +DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = { + 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, + 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255, + 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255, + 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255 +}; + +static INLINE void obmc_variance_8x1_s32_neon( + int32x4_t pre_lo, int32x4_t pre_hi, const int32_t *wsrc, + const int32_t *mask, int32x4_t *ssev, int32x4_t *sumv) { + int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]); + int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]); + int32x4_t mask_lo = vld1q_s32(&mask[0]); + int32x4_t mask_hi = vld1q_s32(&mask[4]); + + int32x4_t diff_lo = vmlsq_s32(wsrc_lo, pre_lo, mask_lo); + int32x4_t diff_hi = vmlsq_s32(wsrc_hi, pre_hi, mask_hi); + + // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away from + // zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. This + // difference only affects the bit patterns at the rounding breakpoints + // exactly, so we can add -1 to all negative numbers to move the breakpoint + // one value across and into the correct rounding region. 
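A scalar sketch of the adjustment described above, assuming ROUND_POWER_OF_TWO_SIGNED(v, 12) means rounding v / 4096 to the nearest integer with ties away from zero, and an arithmetic right shift for negative values (as on the targets here). Adding v >> 31 subtracts one from negative inputs only, after which a round-half-up shift (the behaviour of vrshrq_n_s32) produces the reference result at the tie points as well:

#include <stdint.h>

// Reference rounding: nearest integer, ties away from zero.
// Example tie point: v = -2048. A plain round-half-up shift would give 0,
// but the reference result is -1; after the adjustment v becomes -2049 and
// (-2049 + 2048) >> 12 == -1, as required.
static inline int32_t round_power_of_two_signed_12(int32_t v) {
  v += v >> 31;             // -1 for negative v, 0 otherwise
  return (v + 2048) >> 12;  // round-half-up, matching vrshrq_n_s32
}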
+ diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31); + diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31); + int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12); + int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12); + + *sumv = vrsraq_n_s32(*sumv, diff_lo, 12); + *sumv = vrsraq_n_s32(*sumv, diff_hi, 12); + *ssev = vmlaq_s32(*ssev, round_lo, round_lo); + *ssev = vmlaq_s32(*ssev, round_hi, round_hi); +} + +static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int width, + int height, unsigned *sse, + int *sum) { + assert(width % 16 == 0); + + // Use tbl for doing a double-width zero extension from 8->32 bits since we + // can do this in one instruction rather than two. + uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]); + uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]); + uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]); + uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]); + + int32x4_t ssev = vdupq_n_s32(0); + int32x4_t sumv = vdupq_n_s32(0); + + int h = height; + do { + int w = width; + do { + uint8x16_t pre_u8 = vld1q_u8(pre); + + int32x4_t pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx0)); + int32x4_t pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx1)); + obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[0], &mask[0], + &ssev, &sumv); + + pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx2)); + pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx3)); + obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[8], &mask[8], + &ssev, &sumv); + + wsrc += 16; + mask += 16; + pre += 16; + w -= 16; + } while (w != 0); + + pre += pre_stride - width; + } while (--h != 0); + + *sse = horizontal_add_s32x4(ssev); + *sum = horizontal_add_s32x4(sumv); +} + +#else // !AOM_ARCH_AARCH64 + +static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int width, + int height, unsigned *sse, + int *sum) { + // Non-aarch64 targets do not have a 128-bit tbl instruction, so use the + // widening version of the core kernel instead. 
+ + assert(width % 16 == 0); + + int32x4_t ssev = vdupq_n_s32(0); + int32x4_t sumv = vdupq_n_s32(0); + + int h = height; + do { + int w = width; + do { + uint8x16_t pre_u8 = vld1q_u8(pre); + + int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pre_u8))); + obmc_variance_8x1_s16_neon(pre_s16, &wsrc[0], &mask[0], &ssev, &sumv); + + pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pre_u8))); + obmc_variance_8x1_s16_neon(pre_s16, &wsrc[8], &mask[8], &ssev, &sumv); + + wsrc += 16; + mask += 16; + pre += 16; + w -= 16; + } while (w != 0); + + pre += pre_stride - width; + } while (--h != 0); + + *sse = horizontal_add_s32x4(ssev); + *sum = horizontal_add_s32x4(sumv); +} + +#endif // AOM_ARCH_AARCH64 + +static INLINE void obmc_variance_neon_128xh(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + unsigned *sse, int *sum) { + obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse, sum); +} + +static INLINE void obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + unsigned *sse, int *sum) { + obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum); +} + +static INLINE void obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + unsigned *sse, int *sum) { + obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum); +} + +static INLINE void obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + unsigned *sse, int *sum) { + obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum); +} + +static INLINE void obmc_variance_neon_8xh(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + unsigned *sse, int *sum) { + int32x4_t ssev = vdupq_n_s32(0); + int32x4_t sumv = vdupq_n_s32(0); + + do { + uint8x8_t pre_u8 = vld1_u8(pre); + int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8)); + + obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv); + + pre += pre_stride; + wsrc += 8; + mask += 8; + } while (--h != 0); + + *sse = horizontal_add_s32x4(ssev); + *sum = horizontal_add_s32x4(sumv); +} + +static INLINE void obmc_variance_neon_4xh(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int h, + unsigned *sse, int *sum) { + assert(h % 2 == 0); + + int32x4_t ssev = vdupq_n_s32(0); + int32x4_t sumv = vdupq_n_s32(0); + + do { + uint8x8_t pre_u8 = load_unaligned_u8(pre, pre_stride); + int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8)); + + obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv); + + pre += 2 * pre_stride; + wsrc += 8; + mask += 8; + h -= 2; + } while (h != 0); + + *sse = horizontal_add_s32x4(ssev); + *sum = horizontal_add_s32x4(sumv); +} + +#define OBMC_VARIANCE_WXH_NEON(W, H) \ + unsigned aom_obmc_variance##W##x##H##_neon( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned *sse) { \ + int sum; \ + obmc_variance_neon_##W##xh(pre, pre_stride, wsrc, mask, H, sse, &sum); \ + return *sse - (unsigned)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMC_VARIANCE_WXH_NEON(4, 4) +OBMC_VARIANCE_WXH_NEON(4, 8) +OBMC_VARIANCE_WXH_NEON(8, 4) +OBMC_VARIANCE_WXH_NEON(8, 8) +OBMC_VARIANCE_WXH_NEON(8, 16) +OBMC_VARIANCE_WXH_NEON(16, 8) +OBMC_VARIANCE_WXH_NEON(16, 16) +OBMC_VARIANCE_WXH_NEON(16, 32) +OBMC_VARIANCE_WXH_NEON(32, 16) +OBMC_VARIANCE_WXH_NEON(32, 32) +OBMC_VARIANCE_WXH_NEON(32, 64) 
+OBMC_VARIANCE_WXH_NEON(64, 32) +OBMC_VARIANCE_WXH_NEON(64, 64) +OBMC_VARIANCE_WXH_NEON(64, 128) +OBMC_VARIANCE_WXH_NEON(128, 64) +OBMC_VARIANCE_WXH_NEON(128, 128) +OBMC_VARIANCE_WXH_NEON(4, 16) +OBMC_VARIANCE_WXH_NEON(16, 4) +OBMC_VARIANCE_WXH_NEON(8, 32) +OBMC_VARIANCE_WXH_NEON(32, 8) +OBMC_VARIANCE_WXH_NEON(16, 64) +OBMC_VARIANCE_WXH_NEON(64, 16) diff --git a/third_party/aom/aom_dsp/arm/reinterpret_neon.h b/third_party/aom/aom_dsp/arm/reinterpret_neon.h new file mode 100644 index 0000000000..f9702513ad --- /dev/null +++ b/third_party/aom/aom_dsp/arm/reinterpret_neon.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ +#define AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ + +#include + +#include "aom/aom_integer.h" // For AOM_FORCE_INLINE. +#include "config/aom_config.h" + +#define REINTERPRET_NEON(u, to_sz, to_count, from_sz, from_count, n, q) \ + static AOM_FORCE_INLINE u##int##to_sz##x##to_count##x##n##_t \ + aom_reinterpret##q##_##u##to_sz##_##u##from_sz##_x##n( \ + const u##int##from_sz##x##from_count##x##n##_t src) { \ + u##int##to_sz##x##to_count##x##n##_t ret; \ + for (int i = 0; i < (n); ++i) { \ + ret.val[i] = vreinterpret##q##_##u##to_sz##_##u##from_sz(src.val[i]); \ + } \ + return ret; \ + } + +REINTERPRET_NEON(u, 8, 8, 16, 4, 2, ) // uint8x8x2_t from uint16x4x2_t +REINTERPRET_NEON(u, 8, 16, 16, 8, 2, q) // uint8x16x2_t from uint16x8x2_t + +#endif // AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c new file mode 100644 index 0000000000..46a1666331 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sad_neon.c @@ -0,0 +1,873 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/dist_wtd_avg_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + // We use 8 accumulators to prevent overflow for large values of 'h', as well + // as enabling optimal UADALP instruction throughput on CPUs that have either + // 2 or 4 Neon pipes. 
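As a quick worked bound on why 16-bit accumulators are safe here: vpadalq_u8 adds a pair of 8-bit absolute differences into each uint16 lane, so one row contributes at most 2 * 255 = 510 per lane of a 16-byte strip. Even at the maximum block height of 128 rows that is 510 * 128 = 65280, which still fits in a 16-bit lane (65535), so the eight per-strip accumulators cannot overflow before they are widened into sum_u32 after the loop.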
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0) }; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7; + uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + diff2 = vabdq_u8(s2, r2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + diff3 = vabdq_u8(s3, r3); + sum[3] = vpadalq_u8(sum[3], diff3); + + s4 = vld1q_u8(src_ptr + 64); + r4 = vld1q_u8(ref_ptr + 64); + diff4 = vabdq_u8(s4, r4); + sum[4] = vpadalq_u8(sum[4], diff4); + + s5 = vld1q_u8(src_ptr + 80); + r5 = vld1q_u8(ref_ptr + 80); + diff5 = vabdq_u8(s5, r5); + sum[5] = vpadalq_u8(sum[5], diff5); + + s6 = vld1q_u8(src_ptr + 96); + r6 = vld1q_u8(ref_ptr + 96); + diff6 = vabdq_u8(s6, r6); + sum[6] = vpadalq_u8(sum[6], diff6); + + s7 = vld1q_u8(src_ptr + 112); + r7 = vld1q_u8(ref_ptr + 112); + diff7 = vabdq_u8(s7, r7); + sum[7] = vpadalq_u8(sum[7], diff7); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + sum_u32 = vpadalq_u16(sum_u32, sum[4]); + sum_u32 = vpadalq_u16(sum_u32, sum[5]); + sum_u32 = vpadalq_u16(sum_u32, sum[6]); + sum_u32 = vpadalq_u16(sum_u32, sum[7]); + + return horizontal_add_u32x4(sum_u32); +} + +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3; + uint8x16_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + diff2 = vabdq_u8(s2, r2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + diff3 = vabdq_u8(s3, r3); + sum[3] = vpadalq_u8(sum[3], diff3); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_u32x4(sum_u32); +} + +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } 
while (--i != 0); + + return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1])); +} + +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + + uint8x16_t diff = vabdq_u8(s, r); + sum = vpadalq_u8(sum, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_u16x8(sum); +} + +static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + + sum = vabal_u8(sum, s, r); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_u16x8(sum); +} + +static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + + sum = vabal_u8(sum, s, r); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + } while (--i != 0); + + return horizontal_add_u16x8(sum); +} + +#define SAD_WXH_NEON(w, h) \ + unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ + } + +SAD_WXH_NEON(4, 4) +SAD_WXH_NEON(4, 8) + +SAD_WXH_NEON(8, 4) +SAD_WXH_NEON(8, 8) +SAD_WXH_NEON(8, 16) + +SAD_WXH_NEON(16, 8) +SAD_WXH_NEON(16, 16) +SAD_WXH_NEON(16, 32) + +SAD_WXH_NEON(32, 16) +SAD_WXH_NEON(32, 32) +SAD_WXH_NEON(32, 64) + +SAD_WXH_NEON(64, 32) +SAD_WXH_NEON(64, 64) +SAD_WXH_NEON(64, 128) + +SAD_WXH_NEON(128, 64) +SAD_WXH_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_WXH_NEON(4, 16) +SAD_WXH_NEON(8, 32) +SAD_WXH_NEON(16, 4) +SAD_WXH_NEON(16, 64) +SAD_WXH_NEON(32, 8) +SAD_WXH_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_WXH_NEON + +#define SAD_SKIP_WXH_NEON(w, h) \ + unsigned int aom_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON(4, 4) +SAD_SKIP_WXH_NEON(4, 8) + +SAD_SKIP_WXH_NEON(8, 4) +SAD_SKIP_WXH_NEON(8, 8) +SAD_SKIP_WXH_NEON(8, 16) + +SAD_SKIP_WXH_NEON(16, 8) +SAD_SKIP_WXH_NEON(16, 16) +SAD_SKIP_WXH_NEON(16, 32) + +SAD_SKIP_WXH_NEON(32, 16) +SAD_SKIP_WXH_NEON(32, 32) +SAD_SKIP_WXH_NEON(32, 64) + +SAD_SKIP_WXH_NEON(64, 32) +SAD_SKIP_WXH_NEON(64, 64) +SAD_SKIP_WXH_NEON(64, 128) + +SAD_SKIP_WXH_NEON(128, 64) +SAD_SKIP_WXH_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_SKIP_WXH_NEON(4, 16) +SAD_SKIP_WXH_NEON(8, 32) +SAD_SKIP_WXH_NEON(16, 4) +SAD_SKIP_WXH_NEON(16, 64) +SAD_SKIP_WXH_NEON(32, 8) +SAD_SKIP_WXH_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_SKIP_WXH_NEON + +static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + // We use 8 accumulators to prevent overflow for large values of 'h', as well + // as enabling optimal UADALP instruction throughput on CPUs that have either + // 2 or 4 Neon pipes. 
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0) }; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7; + uint8x16_t p0, p1, p2, p3, p4, p5, p6, p7; + uint8x16_t avg0, avg1, avg2, avg3, avg4, avg5, avg6, avg7; + uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + p2 = vld1q_u8(second_pred + 32); + avg2 = vrhaddq_u8(r2, p2); + diff2 = vabdq_u8(s2, avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + p3 = vld1q_u8(second_pred + 48); + avg3 = vrhaddq_u8(r3, p3); + diff3 = vabdq_u8(s3, avg3); + sum[3] = vpadalq_u8(sum[3], diff3); + + s4 = vld1q_u8(src_ptr + 64); + r4 = vld1q_u8(ref_ptr + 64); + p4 = vld1q_u8(second_pred + 64); + avg4 = vrhaddq_u8(r4, p4); + diff4 = vabdq_u8(s4, avg4); + sum[4] = vpadalq_u8(sum[4], diff4); + + s5 = vld1q_u8(src_ptr + 80); + r5 = vld1q_u8(ref_ptr + 80); + p5 = vld1q_u8(second_pred + 80); + avg5 = vrhaddq_u8(r5, p5); + diff5 = vabdq_u8(s5, avg5); + sum[5] = vpadalq_u8(sum[5], diff5); + + s6 = vld1q_u8(src_ptr + 96); + r6 = vld1q_u8(ref_ptr + 96); + p6 = vld1q_u8(second_pred + 96); + avg6 = vrhaddq_u8(r6, p6); + diff6 = vabdq_u8(s6, avg6); + sum[6] = vpadalq_u8(sum[6], diff6); + + s7 = vld1q_u8(src_ptr + 112); + r7 = vld1q_u8(ref_ptr + 112); + p7 = vld1q_u8(second_pred + 112); + avg7 = vrhaddq_u8(r7, p7); + diff7 = vabdq_u8(s7, avg7); + sum[7] = vpadalq_u8(sum[7], diff7); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 128; + } while (--i != 0); + + uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + sum_u32 = vpadalq_u16(sum_u32, sum[4]); + sum_u32 = vpadalq_u16(sum_u32, sum[5]); + sum_u32 = vpadalq_u16(sum_u32, sum[6]); + sum_u32 = vpadalq_u16(sum_u32, sum[7]); + + return horizontal_add_u32x4(sum_u32); +} + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + p2 = vld1q_u8(second_pred + 32); + avg2 = vrhaddq_u8(r2, p2); + diff2 = vabdq_u8(s2, avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + p3 = 
vld1q_u8(second_pred + 48); + avg3 = vrhaddq_u8(r3, p3); + diff3 = vabdq_u8(s3, avg3); + sum[3] = vpadalq_u8(sum[3], diff3); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; + } while (--i != 0); + + uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_u32x4(sum_u32); +} + +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t avg0 = vrhaddq_u8(r0, p0); + uint8x16_t diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t avg1 = vrhaddq_u8(r1, p1); + uint8x16_t diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; + } while (--i != 0); + + return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1])); +} + +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(r, p); + uint8x16_t diff = vabdq_u8(s, avg); + sum = vpadalq_u8(sum, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_u16x8(sum); +} + +static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_u16x8(sum); +} + +static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_u16x8(sum); +} + +#define SAD_WXH_AVG_NEON(w, h) \ + unsigned int aom_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +SAD_WXH_AVG_NEON(4, 4) +SAD_WXH_AVG_NEON(4, 8) + +SAD_WXH_AVG_NEON(8, 4) +SAD_WXH_AVG_NEON(8, 8) +SAD_WXH_AVG_NEON(8, 16) + +SAD_WXH_AVG_NEON(16, 8) +SAD_WXH_AVG_NEON(16, 16) +SAD_WXH_AVG_NEON(16, 32) + 
+SAD_WXH_AVG_NEON(32, 16) +SAD_WXH_AVG_NEON(32, 32) +SAD_WXH_AVG_NEON(32, 64) + +SAD_WXH_AVG_NEON(64, 32) +SAD_WXH_AVG_NEON(64, 64) +SAD_WXH_AVG_NEON(64, 128) + +SAD_WXH_AVG_NEON(128, 64) +SAD_WXH_AVG_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_WXH_AVG_NEON(4, 16) +SAD_WXH_AVG_NEON(8, 32) +SAD_WXH_AVG_NEON(16, 4) +SAD_WXH_AVG_NEON(16, 64) +SAD_WXH_AVG_NEON(32, 8) +SAD_WXH_AVG_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_WXH_AVG_NEON + +static INLINE unsigned int dist_wtd_sad128xh_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + // We use 8 accumulators to prevent overflow for large values of 'h', as well + // as enabling optimal UADALP instruction throughput on CPUs that have either + // 2 or 4 Neon pipes. + uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0) }; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset); + uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset); + uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + uint8x16_t s2 = vld1q_u8(src_ptr + 32); + uint8x16_t r2 = vld1q_u8(ref_ptr + 32); + uint8x16_t p2 = vld1q_u8(second_pred + 32); + uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset); + uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + uint8x16_t s3 = vld1q_u8(src_ptr + 48); + uint8x16_t r3 = vld1q_u8(ref_ptr + 48); + uint8x16_t p3 = vld1q_u8(second_pred + 48); + uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset); + uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3); + sum[3] = vpadalq_u8(sum[3], diff3); + + uint8x16_t s4 = vld1q_u8(src_ptr + 64); + uint8x16_t r4 = vld1q_u8(ref_ptr + 64); + uint8x16_t p4 = vld1q_u8(second_pred + 64); + uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset); + uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4); + sum[4] = vpadalq_u8(sum[4], diff4); + + uint8x16_t s5 = vld1q_u8(src_ptr + 80); + uint8x16_t r5 = vld1q_u8(ref_ptr + 80); + uint8x16_t p5 = vld1q_u8(second_pred + 80); + uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset); + uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5); + sum[5] = vpadalq_u8(sum[5], diff5); + + uint8x16_t s6 = vld1q_u8(src_ptr + 96); + uint8x16_t r6 = vld1q_u8(ref_ptr + 96); + uint8x16_t p6 = vld1q_u8(second_pred + 96); + uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset); + uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6); + sum[6] = vpadalq_u8(sum[6], diff6); + + uint8x16_t s7 = vld1q_u8(src_ptr + 112); + uint8x16_t r7 = vld1q_u8(ref_ptr + 112); + uint8x16_t p7 = vld1q_u8(second_pred + 112); + uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset); + uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7); + sum[7] = vpadalq_u8(sum[7], diff7); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 128; + } 
while (--h != 0); + + uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + sum_u32 = vpadalq_u16(sum_u32, sum[4]); + sum_u32 = vpadalq_u16(sum_u32, sum[5]); + sum_u32 = vpadalq_u16(sum_u32, sum[6]); + sum_u32 = vpadalq_u16(sum_u32, sum[7]); + + return horizontal_add_u32x4(sum_u32); +} + +static INLINE unsigned int dist_wtd_sad64xh_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset); + uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset); + uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + uint8x16_t s2 = vld1q_u8(src_ptr + 32); + uint8x16_t r2 = vld1q_u8(ref_ptr + 32); + uint8x16_t p2 = vld1q_u8(second_pred + 32); + uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset); + uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + uint8x16_t s3 = vld1q_u8(src_ptr + 48); + uint8x16_t r3 = vld1q_u8(ref_ptr + 48); + uint8x16_t p3 = vld1q_u8(second_pred + 48); + uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset); + uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3); + sum[3] = vpadalq_u8(sum[3], diff3); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; + } while (--h != 0); + + uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_u32x4(sum_u32); +} + +static INLINE unsigned int dist_wtd_sad32xh_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset); + uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset); + uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; + } while (--h != 0); + + return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1])); +} + +static INLINE unsigned int dist_wtd_sad16xh_avg_neon( + const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + uint16x8_t sum = vdupq_n_u16(0); + + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t wtd_avg = dist_wtd_avg_u8x16(p, r, bck_offset, fwd_offset); + uint8x16_t diff = vabdq_u8(s, wtd_avg); + sum = vpadalq_u8(sum, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--h != 0); + + return horizontal_add_u16x8(sum); +} + +static INLINE unsigned int dist_wtd_sad8xh_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset); + const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset); + uint16x8_t sum = vdupq_n_u16(0); + + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset); + sum = vabal_u8(sum, s, wtd_avg); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; + } while (--h != 0); + + return horizontal_add_u16x8(sum); +} + +static INLINE unsigned int dist_wtd_sad4xh_avg_neon( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset); + const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset); + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset); + sum = vabal_u8(sum, s, wtd_avg); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_u16x8(sum); +} + +#define DIST_WTD_SAD_WXH_AVG_NEON(w, h) \ + unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return dist_wtd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred, jcp_param); \ + } + +DIST_WTD_SAD_WXH_AVG_NEON(4, 4) +DIST_WTD_SAD_WXH_AVG_NEON(4, 8) + +DIST_WTD_SAD_WXH_AVG_NEON(8, 4) +DIST_WTD_SAD_WXH_AVG_NEON(8, 8) +DIST_WTD_SAD_WXH_AVG_NEON(8, 16) + +DIST_WTD_SAD_WXH_AVG_NEON(16, 8) +DIST_WTD_SAD_WXH_AVG_NEON(16, 16) +DIST_WTD_SAD_WXH_AVG_NEON(16, 32) + +DIST_WTD_SAD_WXH_AVG_NEON(32, 16) +DIST_WTD_SAD_WXH_AVG_NEON(32, 32) +DIST_WTD_SAD_WXH_AVG_NEON(32, 64) + +DIST_WTD_SAD_WXH_AVG_NEON(64, 32) +DIST_WTD_SAD_WXH_AVG_NEON(64, 64) +DIST_WTD_SAD_WXH_AVG_NEON(64, 128) + +DIST_WTD_SAD_WXH_AVG_NEON(128, 64) +DIST_WTD_SAD_WXH_AVG_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +DIST_WTD_SAD_WXH_AVG_NEON(4, 16) +DIST_WTD_SAD_WXH_AVG_NEON(8, 32) +DIST_WTD_SAD_WXH_AVG_NEON(16, 4) +DIST_WTD_SAD_WXH_AVG_NEON(16, 64) +DIST_WTD_SAD_WXH_AVG_NEON(32, 8) +DIST_WTD_SAD_WXH_AVG_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef DIST_WTD_SAD_WXH_AVG_NEON diff --git a/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c 
b/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c new file mode 100644 index 0000000000..5504c6838e --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c @@ -0,0 +1,530 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/dist_wtd_avg_neon.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h); +} + +static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_NEON_DOTPROD(w, h) \ + unsigned int aom_sad##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \ + } + +SAD_WXH_NEON_DOTPROD(16, 8) 
+SAD_WXH_NEON_DOTPROD(16, 16) +SAD_WXH_NEON_DOTPROD(16, 32) + +SAD_WXH_NEON_DOTPROD(32, 16) +SAD_WXH_NEON_DOTPROD(32, 32) +SAD_WXH_NEON_DOTPROD(32, 64) + +SAD_WXH_NEON_DOTPROD(64, 32) +SAD_WXH_NEON_DOTPROD(64, 64) +SAD_WXH_NEON_DOTPROD(64, 128) + +SAD_WXH_NEON_DOTPROD(128, 64) +SAD_WXH_NEON_DOTPROD(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_WXH_NEON_DOTPROD(16, 4) +SAD_WXH_NEON_DOTPROD(16, 64) +SAD_WXH_NEON_DOTPROD(32, 8) +SAD_WXH_NEON_DOTPROD(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_WXH_NEON_DOTPROD + +#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \ + unsigned int aom_sad_skip_##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_NEON_DOTPROD(64, 64) +SAD_SKIP_WXH_NEON_DOTPROD(64, 128) + +SAD_SKIP_WXH_NEON_DOTPROD(128, 64) +SAD_SKIP_WXH_NEON_DOTPROD(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_SKIP_WXH_NEON_DOTPROD(16, 4) +SAD_SKIP_WXH_NEON_DOTPROD(16, 64) +SAD_SKIP_WXH_NEON_DOTPROD(32, 8) +SAD_SKIP_WXH_NEON_DOTPROD(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_SKIP_WXH_NEON_DOTPROD + +static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad128xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, + h, second_pred); +} + +static INLINE unsigned int sad64xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, + h, second_pred); +} + +static INLINE unsigned int sad32xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, + h, second_pred); +} + +static INLINE unsigned int sad16xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + 
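+  // Process two rows per loop iteration, giving each row its own
+  // accumulator so the two ABD, UDOT sequences have no dependency on a
+  // single loop-carried register.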
uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + p1 = vld1q_u8(second_pred); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \ + unsigned int aom_sad##w##x##h##_avg_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +SAD_WXH_AVG_NEON_DOTPROD(16, 8) +SAD_WXH_AVG_NEON_DOTPROD(16, 16) +SAD_WXH_AVG_NEON_DOTPROD(16, 32) + +SAD_WXH_AVG_NEON_DOTPROD(32, 16) +SAD_WXH_AVG_NEON_DOTPROD(32, 32) +SAD_WXH_AVG_NEON_DOTPROD(32, 64) + +SAD_WXH_AVG_NEON_DOTPROD(64, 32) +SAD_WXH_AVG_NEON_DOTPROD(64, 64) +SAD_WXH_AVG_NEON_DOTPROD(64, 128) + +SAD_WXH_AVG_NEON_DOTPROD(128, 64) +SAD_WXH_AVG_NEON_DOTPROD(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_WXH_AVG_NEON_DOTPROD(16, 4) +SAD_WXH_AVG_NEON_DOTPROD(16, 64) +SAD_WXH_AVG_NEON_DOTPROD(32, 8) +SAD_WXH_AVG_NEON_DOTPROD(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_WXH_AVG_NEON_DOTPROD + +static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + // We use 8 accumulators to minimize the accumulation and loop carried + // dependencies for better instruction throughput. 
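+  // Each UDOT accumulates directly into 32-bit lanes, so unlike the
+  // non-dotprod Neon (UADALP) implementation no intermediate 16-bit sums
+  // are required and accumulator overflow is not a concern for any
+  // supported block height.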
+ uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0), vdupq_n_u32(0) }; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset); + uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset); + uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + uint8x16_t s2 = vld1q_u8(src_ptr + 32); + uint8x16_t r2 = vld1q_u8(ref_ptr + 32); + uint8x16_t p2 = vld1q_u8(second_pred + 32); + uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset); + uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2); + sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1)); + + uint8x16_t s3 = vld1q_u8(src_ptr + 48); + uint8x16_t r3 = vld1q_u8(ref_ptr + 48); + uint8x16_t p3 = vld1q_u8(second_pred + 48); + uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset); + uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3); + sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1)); + + uint8x16_t s4 = vld1q_u8(src_ptr + 64); + uint8x16_t r4 = vld1q_u8(ref_ptr + 64); + uint8x16_t p4 = vld1q_u8(second_pred + 64); + uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset); + uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4); + sum[4] = vdotq_u32(sum[4], diff4, vdupq_n_u8(1)); + + uint8x16_t s5 = vld1q_u8(src_ptr + 80); + uint8x16_t r5 = vld1q_u8(ref_ptr + 80); + uint8x16_t p5 = vld1q_u8(second_pred + 80); + uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset); + uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5); + sum[5] = vdotq_u32(sum[5], diff5, vdupq_n_u8(1)); + + uint8x16_t s6 = vld1q_u8(src_ptr + 96); + uint8x16_t r6 = vld1q_u8(ref_ptr + 96); + uint8x16_t p6 = vld1q_u8(second_pred + 96); + uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset); + uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6); + sum[6] = vdotq_u32(sum[6], diff6, vdupq_n_u8(1)); + + uint8x16_t s7 = vld1q_u8(src_ptr + 112); + uint8x16_t r7 = vld1q_u8(ref_ptr + 112); + uint8x16_t p7 = vld1q_u8(second_pred + 112); + uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset); + uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7); + sum[7] = vdotq_u32(sum[7], diff7, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 128; + } while (--h != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[4] = vaddq_u32(sum[4], sum[5]); + sum[6] = vaddq_u32(sum[6], sum[7]); + sum[0] = vaddq_u32(sum[0], sum[2]); + sum[4] = vaddq_u32(sum[4], sum[6]); + sum[0] = vaddq_u32(sum[0], sum[4]); + return horizontal_add_u32x4(sum[0]); +} + +static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = 
vld1q_u8(second_pred); + uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset); + uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset); + uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + uint8x16_t s2 = vld1q_u8(src_ptr + 32); + uint8x16_t r2 = vld1q_u8(ref_ptr + 32); + uint8x16_t p2 = vld1q_u8(second_pred + 32); + uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset); + uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2); + sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1)); + + uint8x16_t s3 = vld1q_u8(src_ptr + 48); + uint8x16_t r3 = vld1q_u8(ref_ptr + 48); + uint8x16_t p3 = vld1q_u8(second_pred + 48); + uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset); + uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3); + sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; + } while (--h != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + return horizontal_add_u32x4(sum[0]); +} + +static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset); + uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset); + uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; + } while (--h != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_u32x4(sum[0]); +} + +static INLINE unsigned int dist_wtd_sad16xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset); + uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + + uint8x16_t s1 = vld1q_u8(src_ptr); + uint8x16_t r1 = vld1q_u8(ref_ptr); + uint8x16_t p1 = vld1q_u8(second_pred); + uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, 
fwd_offset); + uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_u32x4(sum[0]); +} + +#define DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(w, h) \ + unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return dist_wtd_sad##w##xh_avg_neon_dotprod( \ + src, src_stride, ref, ref_stride, (h), second_pred, jcp_param); \ + } + +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 8) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 16) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 32) + +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 16) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 32) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 64) + +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 32) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 64) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 128) + +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 64) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 128) + +#if !CONFIG_REALTIME_ONLY +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 4) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 64) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 8) +DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD diff --git a/third_party/aom/aom_dsp/arm/sadxd_neon.c b/third_party/aom/aom_dsp/arm/sadxd_neon.c new file mode 100644 index 0000000000..e89e1c5a73 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sadxd_neon.c @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint16x8_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vpadalq_u8(*sad_sum, abs_diff); +} + +static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[3], + int ref_stride, uint32_t res[3], int w, + int h, int h_overflow) { + uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + int h_limit = h > h_overflow ? 
h_overflow : h; + + int ref_offset = 0; + int i = 0; + do { + uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; + uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; + + do { + int j = 0; + do { + const uint8x16_t s0 = vld1q_u8(src + j); + sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); + + const uint8x16_t s1 = vld1q_u8(src + j + 16); + sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); + + j += 32; + } while (j < w); + + src += src_stride; + ref_offset += ref_stride; + } while (++i < h_limit); + + sum[0] = vpadalq_u16(sum[0], sum_lo[0]); + sum[0] = vpadalq_u16(sum[0], sum_hi[0]); + sum[1] = vpadalq_u16(sum[1], sum_lo[1]); + sum[1] = vpadalq_u16(sum[1], sum_hi[1]); + sum[2] = vpadalq_u16(sum[2], sum_lo[2]); + sum[2] = vpadalq_u16(sum[2], sum_hi[2]); + + h_limit += h_overflow; + } while (i < h); + + res[0] = horizontal_add_u32x4(sum[0]); + res[1] = horizontal_add_u32x4(sum[1]); + res[2] = horizontal_add_u32x4(sum[2]); +} + +static INLINE void sad128xhx3d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[3], int ref_stride, + uint32_t res[3], int h) { + sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32); +} + +static INLINE void sad64xhx3d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[3], int ref_stride, + uint32_t res[3], int h) { + sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64); +} + +static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[3], int ref_stride, + uint32_t res[3], int h) { + uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; + uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; + + int ref_offset = 0; + int i = h; + do { + const uint8x16_t s0 = vld1q_u8(src); + sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]); + + const uint8x16_t s1 = vld1q_u8(src + 16); + sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + res[0] = horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]); + res[1] = horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]); + res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]); +} + +static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[3], int ref_stride, + uint32_t res[3], int h) { + uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; + + int ref_offset = 0; + int i = h; + do { + const uint8x16_t s = vld1q_u8(src); + sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + res[0] = horizontal_add_u16x8(sum[0]); + res[1] = horizontal_add_u16x8(sum[1]); + res[2] = horizontal_add_u16x8(sum[2]); +} + +static INLINE void sad8xhx3d_neon(const uint8_t *src, int 
src_stride, + const uint8_t *const ref[3], int ref_stride, + uint32_t res[3], int h) { + uint16x8_t sum[3]; + + uint8x8_t s = vld1_u8(src); + sum[0] = vabdl_u8(s, vld1_u8(ref[0])); + sum[1] = vabdl_u8(s, vld1_u8(ref[1])); + sum[2] = vabdl_u8(s, vld1_u8(ref[2])); + + src += src_stride; + int ref_offset = ref_stride; + int i = h - 1; + do { + s = vld1_u8(src); + sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset)); + sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset)); + sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset)); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + res[0] = horizontal_add_u16x8(sum[0]); + res[1] = horizontal_add_u16x8(sum[1]); + res[2] = horizontal_add_u16x8(sum[2]); +} + +static INLINE void sad4xhx3d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[3], int ref_stride, + uint32_t res[3], int h) { + assert(h % 2 == 0); + uint16x8_t sum[3]; + + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride); + + sum[0] = vabdl_u8(s, r0); + sum[1] = vabdl_u8(s, r1); + sum[2] = vabdl_u8(s, r2); + + src += 2 * src_stride; + int ref_offset = 2 * ref_stride; + int i = (h / 2) - 1; + do { + s = load_unaligned_u8(src, src_stride); + r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); + r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); + r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); + + sum[0] = vabal_u8(sum[0], s, r0); + sum[1] = vabal_u8(sum[1], s, r1); + sum[2] = vabal_u8(sum[2], s, r2); + + src += 2 * src_stride; + ref_offset += 2 * ref_stride; + } while (--i != 0); + + res[0] = horizontal_add_u16x8(sum[0]); + res[1] = horizontal_add_u16x8(sum[1]); + res[2] = horizontal_add_u16x8(sum[2]); +} + +#define SAD_WXH_3D_NEON(w, h) \ + void aom_sad##w##x##h##x3d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad##w##xhx3d_neon(src, src_stride, ref, ref_stride, res, (h)); \ + } + +SAD_WXH_3D_NEON(4, 4) +SAD_WXH_3D_NEON(4, 8) + +SAD_WXH_3D_NEON(8, 4) +SAD_WXH_3D_NEON(8, 8) +SAD_WXH_3D_NEON(8, 16) + +SAD_WXH_3D_NEON(16, 8) +SAD_WXH_3D_NEON(16, 16) +SAD_WXH_3D_NEON(16, 32) + +SAD_WXH_3D_NEON(32, 16) +SAD_WXH_3D_NEON(32, 32) +SAD_WXH_3D_NEON(32, 64) + +SAD_WXH_3D_NEON(64, 32) +SAD_WXH_3D_NEON(64, 64) +SAD_WXH_3D_NEON(64, 128) + +SAD_WXH_3D_NEON(128, 64) +SAD_WXH_3D_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_WXH_3D_NEON(4, 16) +SAD_WXH_3D_NEON(8, 32) +SAD_WXH_3D_NEON(16, 4) +SAD_WXH_3D_NEON(16, 64) +SAD_WXH_3D_NEON(32, 8) +SAD_WXH_3D_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_WXH_3D_NEON + +static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], int w, + int h, int h_overflow) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + int h_limit = h > h_overflow ? 
h_overflow : h; + + int ref_offset = 0; + int i = 0; + do { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + do { + int j = 0; + do { + const uint8x16_t s0 = vld1q_u8(src + j); + sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]); + + const uint8x16_t s1 = vld1q_u8(src + j + 16); + sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]); + + j += 32; + } while (j < w); + + src += src_stride; + ref_offset += ref_stride; + } while (++i < h_limit); + + sum[0] = vpadalq_u16(sum[0], sum_lo[0]); + sum[0] = vpadalq_u16(sum[0], sum_hi[0]); + sum[1] = vpadalq_u16(sum[1], sum_lo[1]); + sum[1] = vpadalq_u16(sum[1], sum_hi[1]); + sum[2] = vpadalq_u16(sum[2], sum_lo[2]); + sum[2] = vpadalq_u16(sum[2], sum_hi[2]); + sum[3] = vpadalq_u16(sum[3], sum_lo[3]); + sum[3] = vpadalq_u16(sum[3], sum_hi[3]); + + h_limit += h_overflow; + } while (i < h); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum)); +} + +static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32); +} + +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64); +} + +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int ref_offset = 0; + int i = h; + do { + const uint8x16_t s0 = vld1q_u8(src); + sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + ref_offset), &sum_lo[3]); + + const uint8x16_t s1 = vld1q_u8(src + 16); + sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + 16), &sum_hi[3]); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi)); +} + +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; + + int ref_offset = 0; + int i = h; + do { + const uint8x16_t s = vld1q_u8(src); + sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum_u16[0]); + sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum_u16[1]); + sad16_neon(s, vld1q_u8(ref[2] + 
ref_offset), &sum_u16[2]); + sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum_u16[3]); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + sum_u32[0] = vpaddlq_u16(sum_u16[0]); + sum_u32[1] = vpaddlq_u16(sum_u16[1]); + sum_u32[2] = vpaddlq_u16(sum_u16[2]); + sum_u32[3] = vpaddlq_u16(sum_u16[3]); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32)); +} + +static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4]; + + uint8x8_t s = vld1_u8(src); + sum[0] = vabdl_u8(s, vld1_u8(ref[0])); + sum[1] = vabdl_u8(s, vld1_u8(ref[1])); + sum[2] = vabdl_u8(s, vld1_u8(ref[2])); + sum[3] = vabdl_u8(s, vld1_u8(ref[3])); + + src += src_stride; + int ref_offset = ref_stride; + int i = h - 1; + do { + s = vld1_u8(src); + sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset)); + sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset)); + sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset)); + sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset)); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + vst1q_u32(res, horizontal_add_4d_u16x8(sum)); +} + +static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4]; + + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride); + + sum[0] = vabdl_u8(s, r0); + sum[1] = vabdl_u8(s, r1); + sum[2] = vabdl_u8(s, r2); + sum[3] = vabdl_u8(s, r3); + + src += 2 * src_stride; + int ref_offset = 2 * ref_stride; + int i = h / 2; + while (--i != 0) { + s = load_unaligned_u8(src, src_stride); + r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride); + r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride); + r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride); + r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride); + + sum[0] = vabal_u8(sum[0], s, r0); + sum[1] = vabal_u8(sum[1], s, r1); + sum[2] = vabal_u8(sum[2], s, r2); + sum[3] = vabal_u8(sum[3], s, r3); + + src += 2 * src_stride; + ref_offset += 2 * ref_stride; + } + + vst1q_u32(res, horizontal_add_4d_u16x8(sum)); +} + +#define SAD_WXH_4D_NEON(w, h) \ + void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ + } + +SAD_WXH_4D_NEON(4, 4) +SAD_WXH_4D_NEON(4, 8) + +SAD_WXH_4D_NEON(8, 4) +SAD_WXH_4D_NEON(8, 8) +SAD_WXH_4D_NEON(8, 16) + +SAD_WXH_4D_NEON(16, 8) +SAD_WXH_4D_NEON(16, 16) +SAD_WXH_4D_NEON(16, 32) + +SAD_WXH_4D_NEON(32, 16) +SAD_WXH_4D_NEON(32, 32) +SAD_WXH_4D_NEON(32, 64) + +SAD_WXH_4D_NEON(64, 32) +SAD_WXH_4D_NEON(64, 64) +SAD_WXH_4D_NEON(64, 128) + +SAD_WXH_4D_NEON(128, 64) +SAD_WXH_4D_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_WXH_4D_NEON(4, 16) +SAD_WXH_4D_NEON(8, 32) +SAD_WXH_4D_NEON(16, 4) +SAD_WXH_4D_NEON(16, 64) +SAD_WXH_4D_NEON(32, 8) +SAD_WXH_4D_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_WXH_4D_NEON + +#define SAD_SKIP_WXH_4D_NEON(w, h) \ + void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * 
ref_stride, res, \ + ((h) >> 1)); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SAD_SKIP_WXH_4D_NEON(4, 4) +SAD_SKIP_WXH_4D_NEON(4, 8) + +SAD_SKIP_WXH_4D_NEON(8, 4) +SAD_SKIP_WXH_4D_NEON(8, 8) +SAD_SKIP_WXH_4D_NEON(8, 16) + +SAD_SKIP_WXH_4D_NEON(16, 8) +SAD_SKIP_WXH_4D_NEON(16, 16) +SAD_SKIP_WXH_4D_NEON(16, 32) + +SAD_SKIP_WXH_4D_NEON(32, 16) +SAD_SKIP_WXH_4D_NEON(32, 32) +SAD_SKIP_WXH_4D_NEON(32, 64) + +SAD_SKIP_WXH_4D_NEON(64, 32) +SAD_SKIP_WXH_4D_NEON(64, 64) +SAD_SKIP_WXH_4D_NEON(64, 128) + +SAD_SKIP_WXH_4D_NEON(128, 64) +SAD_SKIP_WXH_4D_NEON(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_SKIP_WXH_4D_NEON(4, 16) +SAD_SKIP_WXH_4D_NEON(8, 32) +SAD_SKIP_WXH_4D_NEON(16, 4) +SAD_SKIP_WXH_4D_NEON(16, 64) +SAD_SKIP_WXH_4D_NEON(32, 8) +SAD_SKIP_WXH_4D_NEON(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_SKIP_WXH_4D_NEON diff --git a/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c new file mode 100644 index 0000000000..3d11d1cb96 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint32x4_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); +} + +static INLINE void sadwxhx3d_large_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int w, int h) { + uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + + int ref_offset = 0; + int i = h; + do { + int j = 0; + do { + const uint8x16_t s0 = vld1q_u8(src + j); + sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); + + const uint8x16_t s1 = vld1q_u8(src + j + 16); + sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); + + j += 32; + } while (j < w); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0])); + res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1])); + res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2])); +} + +static INLINE void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h); +} + +static INLINE void sad64xhx3d_neon_dotprod(const uint8_t *src, int 
src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h); +} + +static INLINE void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h); +} + +static INLINE void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; + + int ref_offset = 0; + int i = h; + do { + const uint8x16_t s = vld1q_u8(src); + sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + res[0] = horizontal_add_u32x4(sum[0]); + res[1] = horizontal_add_u32x4(sum[1]); + res[2] = horizontal_add_u32x4(sum[2]); +} + +#define SAD_WXH_3D_NEON_DOTPROD(w, h) \ + void aom_sad##w##x##h##x3d_neon_dotprod(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad##w##xhx3d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \ + } + +SAD_WXH_3D_NEON_DOTPROD(16, 8) +SAD_WXH_3D_NEON_DOTPROD(16, 16) +SAD_WXH_3D_NEON_DOTPROD(16, 32) + +SAD_WXH_3D_NEON_DOTPROD(32, 16) +SAD_WXH_3D_NEON_DOTPROD(32, 32) +SAD_WXH_3D_NEON_DOTPROD(32, 64) + +SAD_WXH_3D_NEON_DOTPROD(64, 32) +SAD_WXH_3D_NEON_DOTPROD(64, 64) +SAD_WXH_3D_NEON_DOTPROD(64, 128) + +SAD_WXH_3D_NEON_DOTPROD(128, 64) +SAD_WXH_3D_NEON_DOTPROD(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_WXH_3D_NEON_DOTPROD(16, 4) +SAD_WXH_3D_NEON_DOTPROD(16, 64) +SAD_WXH_3D_NEON_DOTPROD(32, 8) +SAD_WXH_3D_NEON_DOTPROD(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_WXH_3D_NEON_DOTPROD + +static INLINE void sadwxhx4d_large_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int w, int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int ref_offset = 0; + int i = h; + do { + int j = 0; + do { + const uint8x16_t s0 = vld1q_u8(src + j); + sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]); + + const uint8x16_t s1 = vld1q_u8(src + j + 16); + sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]); + + j += 32; + } while (j < w); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum)); +} + +static INLINE void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + 
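+  // Widths of 32 and above share sadwxhx4d_large_neon_dotprod, which
+  // processes each row 32 bytes at a time.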
sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h); +} + +static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h); +} + +static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h); +} + +static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int ref_offset = 0; + int i = h; + do { + const uint8x16_t s = vld1q_u8(src); + sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]); + + src += src_stride; + ref_offset += ref_stride; + } while (--i != 0); + + vst1q_u32(res, horizontal_add_4d_u32x4(sum)); +} + +#define SAD_WXH_4D_NEON_DOTPROD(w, h) \ + void aom_sad##w##x##h##x4d_neon_dotprod(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad##w##xhx4d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \ + } + +SAD_WXH_4D_NEON_DOTPROD(16, 8) +SAD_WXH_4D_NEON_DOTPROD(16, 16) +SAD_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_WXH_4D_NEON_DOTPROD(32, 16) +SAD_WXH_4D_NEON_DOTPROD(32, 32) +SAD_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_WXH_4D_NEON_DOTPROD(64, 32) +SAD_WXH_4D_NEON_DOTPROD(64, 64) +SAD_WXH_4D_NEON_DOTPROD(64, 128) + +SAD_WXH_4D_NEON_DOTPROD(128, 64) +SAD_WXH_4D_NEON_DOTPROD(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_WXH_4D_NEON_DOTPROD(16, 4) +SAD_WXH_4D_NEON_DOTPROD(16, 64) +SAD_WXH_4D_NEON_DOTPROD(32, 8) +SAD_WXH_4D_NEON_DOTPROD(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_WXH_4D_NEON_DOTPROD + +#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \ + void aom_sad_skip_##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + sad##w##xhx4d_neon_dotprod(src, 2 * src_stride, ref, 2 * ref_stride, res, \ + ((h) >> 1)); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64) +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 128) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 64) +SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 128) + +#if !CONFIG_REALTIME_ONLY +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 4) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 64) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 8) +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#undef SAD_SKIP_WXH_4D_NEON_DOTPROD diff --git a/third_party/aom/aom_dsp/arm/sse_neon.c b/third_party/aom/aom_dsp/arm/sse_neon.c new file mode 100644 index 0000000000..ec8f0ee183 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sse_neon.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + uint8x8_t abs_diff_lo = vget_low_u8(abs_diff); + uint8x8_t abs_diff_hi = vget_high_u8(abs_diff); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo)); + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi)); +} + +static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x4_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x4_t sse = vdupq_n_u32(0); + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + j += 8; + } while (j < width); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + } + return horizontal_add_u32x4(sse); +} + +static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + sse_16x1_neon(src + 64, ref + 64, &sse[0]); + sse_16x1_neon(src + 80, ref + 80, &sse[1]); + sse_16x1_neon(src + 96, ref + 96, &sse[0]); + sse_16x1_neon(src + 112, ref + 112, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon(const 
uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_8x1_neon(src, ref, &sse); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(sse); +} + +static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_4x2_neon(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_u32x4(sse); +} + +int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int width, int height) { + switch (width) { + case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); + case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); + } +} diff --git a/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c new file mode 100644 index 0000000000..979049780b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + *sse = vdotq_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x2_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x2_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j < width); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } + return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]); + sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]); + sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]); + sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, 
ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = height; + do { + sse_8x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_8x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse = vdup_n_u32(0); + + int i = height; + do { + sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_u32x2(sse); +} + +int64_t aom_sse_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int width, + int height) { + switch (width) { + case 4: + return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 8: + return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 16: + return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 32: + return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 64: + return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 128: + return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c new file mode 100644 index 0000000000..2e6e738853 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c @@ -0,0 +1,1103 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/variance.h" +#include "aom_dsp/arm/dist_wtd_avg_neon.h" +#include "aom_dsp/arm/mem_neon.h" + +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmull_u8(s0, f0); + blend = vmlal_u8(blend, s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + i -= 2; + } while (i != 0); +} + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmull_u8(s0, f0); + blend = vmlal_u8(blend, s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += src_stride; + dst_ptr += 8; + } while (--i != 0); +} + +static void var_filter_block2d_bil_large(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0); + blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1); + uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0); + blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1); + uint8x16_t blend_u8 = + vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); + vst1q_u8(dst_ptr + j, blend_u8); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, + dst_height, filter_offset); +} + +static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, + dst_height, filter_offset); +} + +static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, + dst_height, filter_offset); +} + +static void var_filter_block2d_bil_w128(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128, + dst_height, filter_offset); +} + +static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int 
pixel_step, + int dst_width, int dst_height) { + // We only specialise on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + vst1q_u8(dst_ptr + j, avg); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int aom_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } + +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int aom_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + +SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + 
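The SUBPEL_VARIANCE_WXH_NEON and SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON instantiations above and below all wrap the same two-stage pipeline: a 3-bit bilinear filter applied horizontally (pixel_step 1), the same filter applied vertically over the intermediate buffer (pixel_step w), then a plain variance of the filtered block against the reference; the SPECIALIZED_ variants merely shortcut the zero and half-pel offsets with copies and pairwise averages. The scalar model below is an illustrative sketch of that pipeline only, not part of the upstream patch: the function name is made up, the buffers are sized for the largest AV1 block, and the extra padding row(s) that the NEON 4-wide paths allocate (they process two rows per iteration) are not modelled.

#include <stdint.h>

// Scalar sketch of: horizontal bilinear -> vertical bilinear -> variance.
// Offsets are in 1/8-pel units (0..7), matching filter_offset above.
static unsigned int subpel_variance_scalar_model(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *ref, int ref_stride, int w, int h, uint32_t *sse) {
  static uint8_t tmp0[128 * 129];  // w x (h + 1) intermediate rows
  static uint8_t tmp1[128 * 128];  // w x h filtered block

  // Stage 1: horizontal bilinear filter over h + 1 rows.
  for (int r = 0; r < h + 1; ++r) {
    for (int c = 0; c < w; ++c) {
      const int p0 = src[r * src_stride + c];
      const int p1 = src[r * src_stride + c + 1];
      tmp0[r * w + c] = (uint8_t)((p0 * (8 - xoffset) + p1 * xoffset + 4) >> 3);
    }
  }

  // Stage 2: vertical bilinear filter on the intermediate buffer.
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const int p0 = tmp0[r * w + c];
      const int p1 = tmp0[(r + 1) * w + c];
      tmp1[r * w + c] = (uint8_t)((p0 * (8 - yoffset) + p1 * yoffset + 4) >> 3);
    }
  }

  // Stage 3: variance = SSE - sum^2 / N of the filtered block vs. the ref.
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const int d = tmp1[r * w + c] - ref[r * ref_stride + c];
      sum += d;
      sse64 += (uint32_t)(d * d);
    }
  }
  *sse = (uint32_t)sse64;
  return (unsigned int)(sse64 - (uint64_t)(((int64_t)sum * sum) / (w * h)));
}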
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1) + +// Realtime mode doesn't use 4x rectangular blocks. +#if !CONFIG_REALTIME_ONLY + +SUBPEL_VARIANCE_WXH_NEON(4, 16, 2) + +SUBPEL_VARIANCE_WXH_NEON(8, 32, 1) + +SUBPEL_VARIANCE_WXH_NEON(16, 4, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1) + +#endif // !CONFIG_REALTIME_ONLY + +#undef SUBPEL_VARIANCE_WXH_NEON +#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 4. +static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmull_u8(s0, f0); + blend = vmlal_u8(blend, s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having +// width 4. +static void dist_wtd_avg_pred_var_filter_block2d_bil_w4( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset); + const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset); + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint8x8_t p = vld1_u8(second_pred); + uint16x8_t blend = vmull_u8(s0, f0); + blend = vmlal_u8(blend, s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset); + + vst1_u8(dst_ptr, avg); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 8. 
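// (As with the width-4 version above, the blend below computes
//  dst = (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
//  via vmull_u8/vmlal_u8 and a rounding narrowing shift, and the compound
//  average against second_pred is (dst + pred + 1) >> 1 via vrhadd_u8.)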
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmull_u8(s0, f0); + blend = vmlal_u8(blend, s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += src_stride; + dst_ptr += 8; + second_pred += 8; + } while (--i > 0); +} + +// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having +// width 8. +static void dist_wtd_avg_pred_var_filter_block2d_bil_w8( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset); + const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset); + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint8x8_t p = vld1_u8(second_pred); + uint16x8_t blend = vmull_u8(s0, f0); + blend = vmlal_u8(blend, s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset); + + vst1_u8(dst_ptr, avg); + + src_ptr += src_stride; + dst_ptr += 8; + second_pred += 8; + } while (--i > 0); +} + +// Combine bilinear filter with aom_comp_avg_pred for large blocks. +static void avg_pred_var_filter_block2d_bil_large( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0); + blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1); + uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0); + blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1); + uint8x16_t blend_u8 = + vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); + + uint8x16_t p = vld1q_u8(second_pred); + uint8x16_t avg = vrhaddq_u8(blend_u8, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks. 
+static void dist_wtd_avg_pred_var_filter_block2d_bil_large( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0); + blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1); + uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0); + blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1); + uint8x16_t blend_u8 = + vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); + + uint8x16_t p = vld1q_u8(second_pred); + uint8x16_t avg = dist_wtd_avg_u8x16(blend_u8, p, fwd_offset, bck_offset); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16. +static void avg_pred_var_filter_block2d_bil_w16( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32. +static void avg_pred_var_filter_block2d_bil_w32( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64. +static void avg_pred_var_filter_block2d_bil_w64( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128. +static void avg_pred_var_filter_block2d_bil_w128( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 128, dst_height, + filter_offset, second_pred); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16. +static void dist_wtd_avg_pred_var_filter_block2d_bil_w16( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + dist_wtd_avg_pred_var_filter_block2d_bil_large( + src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset, + second_pred, jcp_param); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32. 
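// (Note for the dist_wtd_* wrappers that follow: the compound average is
//  weighted rather than equal. dist_wtd_avg_u8x8/_u8x16 from
//  dist_wtd_avg_neon.h blend the filtered block with second_pred using
//  jcp_param->fwd_offset and bck_offset, which in AV1's distance-weighted
//  compound sum to 1 << DIST_PRECISION_BITS, giving roughly
//  (blend * fwd_offset + pred * bck_offset) >> DIST_PRECISION_BITS with
//  rounding.)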
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w32( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + dist_wtd_avg_pred_var_filter_block2d_bil_large( + src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset, + second_pred, jcp_param); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64. +static void dist_wtd_avg_pred_var_filter_block2d_bil_w64( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + dist_wtd_avg_pred_var_filter_block2d_bil_large( + src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset, + second_pred, jcp_param); +} + +// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128. +static void dist_wtd_avg_pred_var_filter_block2d_bil_w128( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + dist_wtd_avg_pred_var_filter_block2d_bil_large( + src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset, + second_pred, jcp_param); +} + +// Combine averaging subpel filter with aom_comp_avg_pred. +static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, + const uint8_t *second_pred) { + // We only specialise on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + + uint8x16_t p = vld1q_u8(second_pred); + avg = vrhaddq_u8(avg, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Combine averaging subpel filter with aom_dist_wtd_comp_avg_pred. +static void dist_wtd_avg_pred_var_filter_block2d_avg( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + // We only specialise on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t p = vld1q_u8(second_pred); + uint8x16_t avg = vrhaddq_u8(s0, s1); + avg = dist_wtd_avg_u8x16(avg, p, fwd_offset, bck_offset); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of aom_comp_avg_pred for blocks having width >= 16. +static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, + int dst_width, int dst_height, + const uint8_t *second_pred) { + // We only specialise on the filter values for large block sizes (>= 16x16.) 
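// Note: second_pred is consumed as a contiguous dst_width x dst_height
// block; the pointer advances by 16 per inner iteration and is never
// re-adjusted per row, so its implicit stride is dst_width. The average
// itself is the usual rounding (src + second_pred + 1) >> 1 via vrhaddq_u8.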
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src_ptr + j); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(s, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of aom_dist_wtd_comp_avg_pred for blocks having width >= 16. +static void dist_wtd_avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int dst_width, int dst_height, + const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param) { + // We only specialise on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset); + const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src_ptr + j); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = dist_wtd_avg_u8x16(s, p, fwd_offset, bck_offset); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \ + xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } + +#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + if (xoffset == 0) { \ + uint8_t tmp[w * h]; \ + if (yoffset == 0) { \ + avg_pred(src, tmp, source_stride, w, h, second_pred); \ + return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ + source_stride, w, h, second_pred); \ + return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ + return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ + second_pred); \ + return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return aom_variance##w##x##h(tmp1, w, ref, 
ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ + xoffset, second_pred); \ + return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1) + +#if !CONFIG_REALTIME_ONLY + +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2) + +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1) + +SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1) + +#endif // !CONFIG_REALTIME_ONLY + +#undef SUBPEL_AVG_VARIANCE_WXH_NEON +#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON + +#define DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \ + xoffset); \ + dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } + +#define SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + if (xoffset == 0) { \ + uint8_t tmp[w * h]; \ + if (yoffset == 0) { \ + dist_wtd_avg_pred(src, tmp, source_stride, w, h, second_pred, \ + jcp_param); \ + return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + dist_wtd_avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ + source_stride, w, h, \ + second_pred, jcp_param); \ + return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + 
dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp, source_stride, source_stride, h, yoffset, second_pred, \ + jcp_param); \ + return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + dist_wtd_avg_pred_var_filter_block2d_avg( \ + src, tmp0, source_stride, 1, w, h, second_pred, jcp_param); \ + return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \ + second_pred, jcp_param); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp0, source_stride, 1, h, xoffset, second_pred, jcp_param); \ + return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \ + second_pred, jcp_param); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + +DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) +DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) + +DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) +DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) +DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) + +DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1) + +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1) +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1) + +#if !CONFIG_REALTIME_ONLY + +DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2) + +DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1) + +DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1) +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1) + +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1) + +SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1) + +#endif // !CONFIG_REALTIME_ONLY + +#undef DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON +#undef SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON + +#if !CONFIG_REALTIME_ONLY + +#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int 
aom_obmc_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_obmc_variance##w##x##h(tmp1, w, wsrc, mask, sse); \ + } + +#define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc, mask, \ + sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(pre, tmp, pre_stride, pre_stride, w, h); \ + return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(pre, tmp, pre_stride, pre_stride, h, \ + yoffset); \ + return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h); \ + return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h, xoffset); \ + return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \ + } \ + } \ + } + +OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) +OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2) + +OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) +OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1) + +OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1) +OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1) + +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 
1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1) + +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1) +SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1) + +#undef OBMC_SUBPEL_VARIANCE_WXH_NEON +#undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON +#endif // !CONFIG_REALTIME_ONLY + +#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + uint8_t tmp2[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, msk_stride, \ + invert_mask); \ + return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ + } + +#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + if (xoffset == 0) { \ + uint8_t tmp0[w * h]; \ + if (yoffset == 0) { \ + aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, msk, \ + msk_stride, invert_mask); \ + return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h); \ + aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \ + msk_stride, invert_mask); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h, \ + yoffset); \ + aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \ + msk_stride, invert_mask); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \ + msk_stride, invert_mask); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + uint8_t tmp2[w * h]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \ + msk_stride, invert_mask); \ + return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + uint8_t tmp2[w * h]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \ + msk_stride, invert_mask); \ + return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ + } \ + } else { \ + if (yoffset == 0) { \ + uint8_t tmp0[w * h]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + 
aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \ + msk_stride, invert_mask); \ + return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + uint8_t tmp2[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \ + msk_stride, invert_mask); \ + return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * (h + padding)]; \ + uint8_t tmp2[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \ + msk_stride, invert_mask); \ + return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \ + } \ + } \ + } + +MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1) + +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1) + +// Realtime mode doesn't use 4x rectangular blocks. +#if !CONFIG_REALTIME_ONLY +MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2) +MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1) +MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1) +SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1) +#endif // !CONFIG_REALTIME_ONLY + +#undef MASKED_SUBPEL_VARIANCE_WXH_NEON +#undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c new file mode 100644 index 0000000000..a195c40d19 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/subtract_neon.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +void aom_subtract_block_neon(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + if (cols > 16) { + int r = rows; + do { + int c = 0; + do { + const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); + const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); + const uint16x8_t v_diff_lo_00 = + vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = + vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = + vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = + vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + c += 32; + } while (c < cols); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r != 0); + } else if (cols > 8) { + int r = rows; + do { + const uint8x16_t v_src = vld1q_u8(&src[0]); + const uint8x16_t v_pred = vld1q_u8(&pred[0]); + const uint16x8_t v_diff_lo = + vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = + vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r != 0); + } else if (cols > 4) { + int r = rows; + do { + const uint8x8_t v_src = vld1_u8(&src[0]); + const uint8x8_t v_pred = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r != 0); + } else { + int r = rows; + do { + int c = 0; + do { + diff[c] = src[c] - pred[c]; + } while (++c < cols); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r != 0); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_subtract_block_neon(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + + if (cols > 16) { + int r = rows; + do { + int c = 0; + do { + const uint16x8_t v_src_00 = vld1q_u16(&src[c + 0]); + const uint16x8_t v_pred_00 = vld1q_u16(&pred[c + 0]); + const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00); + const uint16x8_t v_src_08 = vld1q_u16(&src[c + 8]); + const uint16x8_t v_pred_08 = vld1q_u16(&pred[c + 8]); + const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_08)); + c += 16; + } while (c < cols); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r != 0); + } else if (cols > 8) { + int r = rows; + do { + const uint16x8_t v_src_00 = vld1q_u16(&src[0]); + const uint16x8_t v_pred_00 = vld1q_u16(&pred[0]); + const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, 
v_pred_00); + const uint16x8_t v_src_08 = vld1q_u16(&src[8]); + const uint16x8_t v_pred_08 = vld1q_u16(&pred[8]); + const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_00)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_08)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r != 0); + } else if (cols > 4) { + int r = rows; + do { + const uint16x8_t v_src_r0 = vld1q_u16(&src[0]); + const uint16x8_t v_src_r1 = vld1q_u16(&src[src_stride]); + const uint16x8_t v_pred_r0 = vld1q_u16(&pred[0]); + const uint16x8_t v_pred_r1 = vld1q_u16(&pred[pred_stride]); + const uint16x8_t v_diff_r0 = vsubq_u16(v_src_r0, v_pred_r0); + const uint16x8_t v_diff_r1 = vsubq_u16(v_src_r1, v_pred_r1); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_r0)); + vst1q_s16(&diff[diff_stride], vreinterpretq_s16_u16(v_diff_r1)); + diff += diff_stride << 1; + pred += pred_stride << 1; + src += src_stride << 1; + r -= 2; + } while (r != 0); + } else { + int r = rows; + do { + const uint16x4_t v_src_r0 = vld1_u16(&src[0]); + const uint16x4_t v_src_r1 = vld1_u16(&src[src_stride]); + const uint16x4_t v_pred_r0 = vld1_u16(&pred[0]); + const uint16x4_t v_pred_r1 = vld1_u16(&pred[pred_stride]); + const uint16x4_t v_diff_r0 = vsub_u16(v_src_r0, v_pred_r0); + const uint16x4_t v_diff_r1 = vsub_u16(v_src_r1, v_pred_r1); + vst1_s16(&diff[0], vreinterpret_s16_u16(v_diff_r0)); + vst1_s16(&diff[diff_stride], vreinterpret_s16_u16(v_diff_r1)); + diff += diff_stride << 1; + pred += pred_stride << 1; + src += src_stride << 1; + r -= 2; + } while (r != 0); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/arm/sum_neon.h b/third_party/aom/aom_dsp/arm/sum_neon.h new file mode 100644 index 0000000000..30a108e70a --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sum_neon.h @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_ +#define AOM_AOM_DSP_ARM_SUM_NEON_H_ + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +static INLINE int horizontal_add_u8x8(const uint8x8_t a) { +#if AOM_ARCH_AARCH64 + return vaddlv_u8(a); +#else + uint16x4_t b = vpaddl_u8(a); + uint32x2_t c = vpaddl_u16(b); + return vget_lane_u32(c, 0) + vget_lane_u32(c, 1); +#endif +} + +static INLINE int horizontal_add_s16x8(const int16x8_t a) { +#if AOM_ARCH_AARCH64 + return vaddlvq_s16(a); +#else + const int32x4_t b = vpaddlq_s16(a); + const int64x2_t c = vpaddlq_s32(b); + const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), + vreinterpret_s32_s64(vget_high_s64(c))); + return vget_lane_s32(d, 0); +#endif +} + +static INLINE int horizontal_add_s32x4(const int32x4_t a) { +#if AOM_ARCH_AARCH64 + return vaddvq_s32(a); +#else + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +#endif +} + +static INLINE int64_t horizontal_add_s64x2(const int64x2_t a) { +#if AOM_ARCH_AARCH64 + return vaddvq_s64(a); +#else + return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_add_u64x2(const uint64x2_t a) { +#if AOM_ARCH_AARCH64 + return vaddvq_u64(a); +#else + return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_long_add_u32x4(const uint32x4_t a) { +#if AOM_ARCH_AARCH64 + return vaddlvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1); +#endif +} + +static INLINE int64_t horizontal_long_add_s32x4(const int32x4_t a) { +#if AOM_ARCH_AARCH64 + return vaddlvq_s32(a); +#else + const int64x2_t b = vpaddlq_s32(a); + return vgetq_lane_s64(b, 0) + vgetq_lane_s64(b, 1); +#endif +} + +static INLINE uint32_t horizontal_add_u32x4(const uint32x4_t a) { +#if AOM_ARCH_AARCH64 + return vaddvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + +static INLINE uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) { +#if AOM_ARCH_AARCH64 + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_u32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_u32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_u32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_u32x4(sum[3]), res, 3); + return res; +#endif +} + +static INLINE int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) { +#if AOM_ARCH_AARCH64 + int32x4_t res01 = vpaddq_s32(sum[0], sum[1]); + int32x4_t res23 = vpaddq_s32(sum[2], sum[3]); + return vpaddq_s32(res01, res23); +#else + int32x4_t res = vdupq_n_s32(0); + res = vsetq_lane_s32(horizontal_add_s32x4(sum[0]), res, 0); + res = vsetq_lane_s32(horizontal_add_s32x4(sum[1]), res, 1); + res = vsetq_lane_s32(horizontal_add_s32x4(sum[2]), res, 2); + res = vsetq_lane_s32(horizontal_add_s32x4(sum[3]), res, 3); + return res; +#endif +} + +static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { +#if AOM_ARCH_AARCH64 + return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); +#else + const uint32x4_t 
vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + +static INLINE uint32x4_t horizontal_long_add_4d_u16x8( + const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) { + const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]); + const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]); + const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]); + const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]); + const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]); + const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); + const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); + const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); +#if AOM_ARCH_AARCH64 + const uint32x4_t c0 = vpaddq_u32(b0, b1); + const uint32x4_t c1 = vpaddq_u32(b2, b3); + return vpaddq_u32(c0, c1); +#else + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + return vcombine_u32(d0, d1); +#endif +} + +static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) { +#if AOM_ARCH_AARCH64 + return vaddlvq_u16(a); +#else + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + return vget_lane_u32(d, 0); +#endif +} + +static INLINE uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) { +#if AOM_ARCH_AARCH64 + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint16x8_t b0 = vpaddq_u16(a0, a1); + return vpaddlq_u16(b0); +#else + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + return vpaddlq_u16(vcombine_u16(b0, b1)); +#endif +} + +static INLINE int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) { +#if AOM_ARCH_AARCH64 + const int16x8_t a0 = vpaddq_s16(sum[0], sum[1]); + const int16x8_t a1 = vpaddq_s16(sum[2], sum[3]); + const int16x8_t b0 = vpaddq_s16(a0, a1); + return vpaddlq_s16(b0); +#else + const int16x4_t a0 = vadd_s16(vget_low_s16(sum[0]), vget_high_s16(sum[0])); + const int16x4_t a1 = vadd_s16(vget_low_s16(sum[1]), vget_high_s16(sum[1])); + const int16x4_t a2 = vadd_s16(vget_low_s16(sum[2]), vget_high_s16(sum[2])); + const int16x4_t a3 = vadd_s16(vget_low_s16(sum[3]), vget_high_s16(sum[3])); + const int16x4_t b0 = vpadd_s16(a0, a1); + const int16x4_t b1 = vpadd_s16(a2, a3); + return vpaddlq_s16(vcombine_s16(b0, b1)); +#endif +} + +static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) { +#if AOM_ARCH_AARCH64 + return vaddv_u32(a); +#else + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); +#endif +} + +static INLINE uint64_t 
horizontal_long_add_u32x2(const uint32x2_t a) { +#if AOM_ARCH_AARCH64 + return vaddlv_u32(a); +#else + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u64(b, 0); +#endif +} + +static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) { +#if AOM_ARCH_AARCH64 + return vaddlv_u16(a); +#else + const uint32x2_t b = vpaddl_u16(a); + const uint64x1_t c = vpaddl_u32(b); + return vget_lane_u32(vreinterpret_u32_u64(c), 0); +#endif +} + +static INLINE int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) { +#if AOM_ARCH_AARCH64 + return vpaddq_s32(a, b); +#else + const int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); + const int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); + return vcombine_s32(a0, b0); +#endif +} + +static INLINE int32x2_t add_pairwise_s32x4(int32x4_t a) { +#if AOM_ARCH_AARCH64 + return vget_low_s32(vpaddq_s32(a, a)); +#else + return vpadd_s32(vget_low_s32(a), vget_high_s32(a)); +#endif +} + +static INLINE uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) { + return horizontal_long_add_u32x4(a[0]) + horizontal_long_add_u32x4(a[1]); +} + +static INLINE uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) { + uint64x2_t sum = vpaddlq_u32(a[0]); + sum = vpadalq_u32(sum, a[1]); + sum = vpadalq_u32(sum, a[2]); + sum = vpadalq_u32(sum, a[3]); + + return horizontal_add_u64x2(sum); +} + +static INLINE uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + + return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1])); +} + +static INLINE uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + sum[0] = vpadalq_u32(sum[0], a[8]); + sum[1] = vpadalq_u32(sum[1], a[9]); + sum[0] = vpadalq_u32(sum[0], a[10]); + sum[1] = vpadalq_u32(sum[1], a[11]); + sum[0] = vpadalq_u32(sum[0], a[12]); + sum[1] = vpadalq_u32(sum[1], a[13]); + sum[0] = vpadalq_u32(sum[0], a[14]); + sum[1] = vpadalq_u32(sum[1], a[15]); + + return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1])); +} + +#endif // AOM_AOM_DSP_ARM_SUM_NEON_H_ diff --git a/third_party/aom/aom_dsp/arm/sum_squares_neon.c b/third_party/aom/aom_dsp/arm/sum_squares_neon.c new file mode 100644 index 0000000000..424b2b4445 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sum_squares_neon.c @@ -0,0 +1,574 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+#include
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src,
+                                                       int stride) {
+  int16x4_t s0 = vld1_s16(src + 0 * stride);
+  int16x4_t s1 = vld1_s16(src + 1 * stride);
+  int16x4_t s2 = vld1_s16(src + 2 * stride);
+  int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+  int32x4_t sum_squares = vmull_s16(s0, s0);
+  sum_squares = vmlal_s16(sum_squares, s1, s1);
+  sum_squares = vmlal_s16(sum_squares, s2, s2);
+  sum_squares = vmlal_s16(sum_squares, s3, s3);
+
+  return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sum_squares));
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src,
+                                                       int stride, int height) {
+  int32x4_t sum_squares[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  int h = height;
+  do {
+    int16x4_t s0 = vld1_s16(src + 0 * stride);
+    int16x4_t s1 = vld1_s16(src + 1 * stride);
+    int16x4_t s2 = vld1_s16(src + 2 * stride);
+    int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+    sum_squares[0] = vmlal_s16(sum_squares[0], s0, s0);
+    sum_squares[0] = vmlal_s16(sum_squares[0], s1, s1);
+    sum_squares[1] = vmlal_s16(sum_squares[1], s2, s2);
+    sum_squares[1] = vmlal_s16(sum_squares[1], s3, s3);
+
+    src += 4 * stride;
+    h -= 4;
+  } while (h != 0);
+
+  return horizontal_long_add_u32x4(
+      vreinterpretq_u32_s32(vaddq_s32(sum_squares[0], sum_squares[1])));
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src,
+                                                       int stride, int width,
+                                                       int height) {
+  uint64x2_t sum_squares = vdupq_n_u64(0);
+
+  int h = height;
+  do {
+    int32x4_t ss_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+    int w = 0;
+    do {
+      const int16_t *s = src + w;
+      int16x8_t s0 = vld1q_s16(s + 0 * stride);
+      int16x8_t s1 = vld1q_s16(s + 1 * stride);
+      int16x8_t s2 = vld1q_s16(s + 2 * stride);
+      int16x8_t s3 = vld1q_s16(s + 3 * stride);
+
+      ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s0), vget_low_s16(s0));
+      ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s1), vget_low_s16(s1));
+      ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s2), vget_low_s16(s2));
+      ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s3), vget_low_s16(s3));
+      ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s0), vget_high_s16(s0));
+      ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s1), vget_high_s16(s1));
+      ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s2), vget_high_s16(s2));
+      ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s3), vget_high_s16(s3));
+      w += 8;
+    } while (w < width);
+
+    sum_squares = vpadalq_u32(
+        sum_squares, vreinterpretq_u32_s32(vaddq_s32(ss_row[0], ss_row[1])));
+
+    src += 4 * stride;
+    h -= 4;
+  } while (h != 0);
+
+  return horizontal_add_u64x2(sum_squares);
+}
+
+uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
+                                     int height) {
+  // 4 elements per row only requires half a SIMD register, so this
+  // must be a special case, but also note that over 75% of all calls
+  // are with size == 4, so it is also the common case.
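+  // For reference, a minimal scalar sketch (illustrative only, not part of
+  // the upstream code) of what every special case below must compute:
+  //   uint64_t ss = 0;
+  //   for (int r = 0; r < height; ++r)
+  //     for (int c = 0; c < width; ++c)
+  //       ss += (uint64_t)((int32_t)src[r * stride + c] * src[r * stride + c]);
+  //   return ss;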
+ if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_neon(src, stride); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_neon(src, stride, height); + } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { + // Generic case + return aom_sum_squares_2d_i16_nxn_neon(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +static INLINE uint64_t aom_sum_sse_2d_i16_4x4_neon(const int16_t *src, + int stride, int *sum) { + int16x4_t s0 = vld1_s16(src + 0 * stride); + int16x4_t s1 = vld1_s16(src + 1 * stride); + int16x4_t s2 = vld1_s16(src + 2 * stride); + int16x4_t s3 = vld1_s16(src + 3 * stride); + + int32x4_t sse = vmull_s16(s0, s0); + sse = vmlal_s16(sse, s1, s1); + sse = vmlal_s16(sse, s2, s2); + sse = vmlal_s16(sse, s3, s3); + + int32x4_t sum_01 = vaddl_s16(s0, s1); + int32x4_t sum_23 = vaddl_s16(s2, s3); + *sum += horizontal_add_s32x4(vaddq_s32(sum_01, sum_23)); + + return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sse)); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_4xn_neon(const int16_t *src, + int stride, int height, + int *sum) { + int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int32x2_t sum_acc[2] = { vdup_n_s32(0), vdup_n_s32(0) }; + + int h = height; + do { + int16x4_t s0 = vld1_s16(src + 0 * stride); + int16x4_t s1 = vld1_s16(src + 1 * stride); + int16x4_t s2 = vld1_s16(src + 2 * stride); + int16x4_t s3 = vld1_s16(src + 3 * stride); + + sse[0] = vmlal_s16(sse[0], s0, s0); + sse[0] = vmlal_s16(sse[0], s1, s1); + sse[1] = vmlal_s16(sse[1], s2, s2); + sse[1] = vmlal_s16(sse[1], s3, s3); + + sum_acc[0] = vpadal_s16(sum_acc[0], s0); + sum_acc[0] = vpadal_s16(sum_acc[0], s1); + sum_acc[1] = vpadal_s16(sum_acc[1], s2); + sum_acc[1] = vpadal_s16(sum_acc[1], s3); + + src += 4 * stride; + h -= 4; + } while (h != 0); + + *sum += horizontal_add_s32x4(vcombine_s32(sum_acc[0], sum_acc[1])); + return horizontal_long_add_u32x4( + vreinterpretq_u32_s32(vaddq_s32(sse[0], sse[1]))); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_nxn_neon(const int16_t *src, + int stride, int width, + int height, int *sum) { + uint64x2_t sse = vdupq_n_u64(0); + int32x4_t sum_acc = vdupq_n_s32(0); + + int h = height; + do { + int32x4_t sse_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int w = 0; + do { + const int16_t *s = src + w; + int16x8_t s0 = vld1q_s16(s + 0 * stride); + int16x8_t s1 = vld1q_s16(s + 1 * stride); + int16x8_t s2 = vld1q_s16(s + 2 * stride); + int16x8_t s3 = vld1q_s16(s + 3 * stride); + + sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s0), vget_low_s16(s0)); + sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s1), vget_low_s16(s1)); + sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s2), vget_low_s16(s2)); + sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s3), vget_low_s16(s3)); + sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s0), vget_high_s16(s0)); + sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s1), vget_high_s16(s1)); + sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s2), vget_high_s16(s2)); + sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s3), vget_high_s16(s3)); + + sum_acc = vpadalq_s16(sum_acc, s0); + sum_acc = vpadalq_s16(sum_acc, s1); + sum_acc = vpadalq_s16(sum_acc, s2); + sum_acc = vpadalq_s16(sum_acc, s3); + + w += 8; + } while (w < width); + + sse = vpadalq_u32(sse, + vreinterpretq_u32_s32(vaddq_s32(sse_row[0], sse_row[1]))); + + src += 4 * stride; + h -= 4; + } while (h != 0); + + *sum += horizontal_add_s32x4(sum_acc); + return 
horizontal_add_u64x2(sse); +} + +uint64_t aom_sum_sse_2d_i16_neon(const int16_t *src, int stride, int width, + int height, int *sum) { + uint64_t sse; + + if (LIKELY(width == 4 && height == 4)) { + sse = aom_sum_sse_2d_i16_4x4_neon(src, stride, sum); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + // width = 4, height is a multiple of 4. + sse = aom_sum_sse_2d_i16_4xn_neon(src, stride, height, sum); + } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { + // Generic case - width is multiple of 8, height is multiple of 4. + sse = aom_sum_sse_2d_i16_nxn_neon(src, stride, width, height, sum); + } else { + sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum); + } + + return sse; +} + +static INLINE uint64_t aom_sum_squares_i16_4xn_neon(const int16_t *src, + uint32_t n) { + uint64x2_t sum_u64 = vdupq_n_u64(0); + + int i = n; + do { + uint32x4_t sum; + int16x4_t s0 = vld1_s16(src); + + sum = vreinterpretq_u32_s32(vmull_s16(s0, s0)); + + sum_u64 = vpadalq_u32(sum_u64, sum); + + src += 4; + i -= 4; + } while (i >= 4); + + if (i > 0) { + return horizontal_add_u64x2(sum_u64) + aom_sum_squares_i16_c(src, i); + } + return horizontal_add_u64x2(sum_u64); +} + +static INLINE uint64_t aom_sum_squares_i16_8xn_neon(const int16_t *src, + uint32_t n) { + uint64x2_t sum_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + int i = n; + do { + uint32x4_t sum[2]; + int16x8_t s0 = vld1q_s16(src); + + sum[0] = + vreinterpretq_u32_s32(vmull_s16(vget_low_s16(s0), vget_low_s16(s0))); + sum[1] = + vreinterpretq_u32_s32(vmull_s16(vget_high_s16(s0), vget_high_s16(s0))); + + sum_u64[0] = vpadalq_u32(sum_u64[0], sum[0]); + sum_u64[1] = vpadalq_u32(sum_u64[1], sum[1]); + + src += 8; + i -= 8; + } while (i >= 8); + + if (i > 0) { + return horizontal_add_u64x2(vaddq_u64(sum_u64[0], sum_u64[1])) + + aom_sum_squares_i16_c(src, i); + } + return horizontal_add_u64x2(vaddq_u64(sum_u64[0], sum_u64[1])); +} + +uint64_t aom_sum_squares_i16_neon(const int16_t *src, uint32_t n) { + // This function seems to be called only for values of N >= 64. See + // av1/encoder/compound_type.c. + if (LIKELY(n >= 8)) { + return aom_sum_squares_i16_8xn_neon(src, n); + } + if (n >= 4) { + return aom_sum_squares_i16_4xn_neon(src, n); + } + return aom_sum_squares_i16_c(src, n); +} + +static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride, + int width, int height) { + uint64_t sum = 0; + uint64_t sse = 0; + uint32x2_t sum_u32 = vdup_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a 16-bit + // element before we need to accumulate to 32-bit elements. Since we're + // accumulating in uint16x4_t vectors, this means we can accumulate up to 4 + // rows of 256 elements. Therefore the limit can be computed as: h_limit = (4 + // * 256) / width. + int h_limit = (4 * 256) / width; + int h_tmp = height > h_limit ? h_limit : height; + + int h = 0; + do { + uint16x4_t sum_u16 = vdup_n_u16(0); + do { + uint8_t *src_ptr = src; + int w = width; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + + sum_u16 = vpadal_u8(sum_u16, s0); + + uint16x8_t sse_u16 = vmull_u8(s0, s0); + + sse_u32 = vpadalq_u16(sse_u32, sse_u16); + + src_ptr += 8; + w -= 8; + } while (w >= 8); + + // Process remaining columns in the row using C. 
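+      // Any leftover columns go straight into the 64-bit sum/sse totals
+      // below; the vector accumulators are reduced and added to the same
+      // totals after the loops, before the final
+      // sse - sum * sum / (width * height) return value is formed.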
+ while (w > 0) { + int idx = width - w; + const uint8_t v = src[idx]; + sum += v; + sse += v * v; + w--; + } + + src += 2 * src_stride; + h += 2; + } while (h < h_tmp && h < height); + + sum_u32 = vpadal_u16(sum_u32, sum_u16); + h_tmp += h_limit; + } while (h < height); + + sum += horizontal_long_add_u32x2(sum_u32); + sse += horizontal_long_add_u32x4(sse_u32); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride, + int width, int height) { + uint64_t sum = 0; + uint64_t sse = 0; + uint32x2_t sum_u32 = vdup_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a 16-bit + // element before we need to accumulate to 32-bit elements. Since we're + // accumulating in uint16x4_t vectors, this means we can accumulate up to 4 + // rows of 256 elements. Therefore the limit can be computed as: h_limit = (4 + // * 256) / width. + int h_limit = (4 * 256) / width; + int h_tmp = height > h_limit ? h_limit : height; + + int h = 0; + do { + uint16x4_t sum_u16 = vdup_n_u16(0); + do { + uint8_t *src_ptr = src; + int w = width; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + + sum_u16 = vpadal_u8(sum_u16, s0); + + uint16x8_t sse_u16 = vmull_u8(s0, s0); + + sse_u32 = vpadalq_u16(sse_u32, sse_u16); + + src_ptr += 8; + w -= 8; + } while (w >= 8); + + // Process remaining columns in the row using C. + while (w > 0) { + int idx = width - w; + const uint8_t v = src[idx]; + sum += v; + sse += v * v; + w--; + } + + src += src_stride; + ++h; + } while (h < h_tmp && h < height); + + sum_u32 = vpadal_u16(sum_u32, sum_u16); + h_tmp += h_limit; + } while (h < height); + + sum += horizontal_long_add_u32x2(sum_u32); + sse += horizontal_long_add_u32x4(sse_u32); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride, + int width, int height) { + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32 = vdupq_n_u32(0); + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a 16-bit + // element before we need to accumulate to 32-bit elements. Since we're + // accumulating in uint16x8_t vectors, this means we can accumulate up to 8 + // rows of 256 elements. Therefore the limit can be computed as: h_limit = (8 + // * 256) / width. + int h_limit = (8 * 256) / width; + int h_tmp = height > h_limit ? h_limit : height; + + int h = 0; + do { + uint16x8_t sum_u16 = vdupq_n_u16(0); + do { + int w = width; + uint8_t *src_ptr = src; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + + sum_u16 = vpadalq_u8(sum_u16, s0); + + uint16x8_t sse_u16_lo = vmull_u8(vget_low_u8(s0), vget_low_u8(s0)); + uint16x8_t sse_u16_hi = vmull_u8(vget_high_u8(s0), vget_high_u8(s0)); + + sse_u32[0] = vpadalq_u16(sse_u32[0], sse_u16_lo); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse_u16_hi); + + src_ptr += 16; + w -= 16; + } while (w >= 16); + + // Process remaining columns in the row using C. 
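+      // (For example, with width == 16 the overflow bound discussed at the
+      // top of this function gives h_limit = (8 * 256) / 16 = 128, i.e. the
+      // 16-bit accumulator is flushed into sum_u32 after at most 128 rows.)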
+ while (w > 0) { + int idx = width - w; + const uint8_t v = src[idx]; + sum += v; + sse += v * v; + w--; + } + + src += src_stride; + ++h; + } while (h < h_tmp && h < height); + + sum_u32 = vpadalq_u16(sum_u32, sum_u16); + h_tmp += h_limit; + } while (h < height); + + sum += horizontal_long_add_u32x4(sum_u32); + sse += horizontal_long_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + + return sse - sum * sum / (width * height); +} + +uint64_t aom_var_2d_u8_neon(uint8_t *src, int src_stride, int width, + int height) { + if (width >= 16) { + return aom_var_2d_u8_16xh_neon(src, src_stride, width, height); + } + if (width >= 8) { + return aom_var_2d_u8_8xh_neon(src, src_stride, width, height); + } + if (width >= 4 && height % 2 == 0) { + return aom_var_2d_u8_4xh_neon(src, src_stride, width, height); + } + return aom_var_2d_u8_c(src, src_stride, width, height); +} + +static INLINE uint64_t aom_var_2d_u16_4xh_neon(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x2_t sum_u32 = vdup_n_u32(0); + uint64x2_t sse_u64 = vdupq_n_u64(0); + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x4_t s0 = vld1_u16(src_ptr); + + sum_u32 = vpadal_u16(sum_u32, s0); + + uint32x4_t sse_u32 = vmull_u16(s0, s0); + + sse_u64 = vpadalq_u32(sse_u64, sse_u32); + + src_ptr += 4; + w -= 4; + } while (w >= 4); + + // Process remaining columns in the row using C. + while (w > 0) { + int idx = width - w; + const uint16_t v = src_u16[idx]; + sum += v; + sse += v * v; + w--; + } + + src_u16 += src_stride; + } while (--h != 0); + + sum += horizontal_long_add_u32x2(sum_u32); + sse += horizontal_add_u64x2(sse_u64); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_8xh_neon(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32 = vdupq_n_u32(0); + uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + + sum_u32 = vpadalq_u16(sum_u32, s0); + + uint32x4_t sse_u32_lo = vmull_u16(vget_low_u16(s0), vget_low_u16(s0)); + uint32x4_t sse_u32_hi = vmull_u16(vget_high_u16(s0), vget_high_u16(s0)); + + sse_u64[0] = vpadalq_u32(sse_u64[0], sse_u32_lo); + sse_u64[1] = vpadalq_u32(sse_u64[1], sse_u32_hi); + + src_ptr += 8; + w -= 8; + } while (w >= 8); + + // Process remaining columns in the row using C. 
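+      // Unlike the 8-bit variants above, no h_limit row-chunking is needed
+      // for the squares here: each vmull_u16 result is a full 32-bit value
+      // that is immediately widened into the 64-bit sse_u64 accumulator via
+      // vpadalq_u32, so it cannot overflow before the final reduction.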
+ while (w > 0) { + int idx = width - w; + const uint16_t v = src_u16[idx]; + sum += v; + sse += v * v; + w--; + } + + src_u16 += src_stride; + } while (--h != 0); + + sum += horizontal_long_add_u32x4(sum_u32); + sse += horizontal_add_u64x2(vaddq_u64(sse_u64[0], sse_u64[1])); + + return sse - sum * sum / (width * height); +} + +uint64_t aom_var_2d_u16_neon(uint8_t *src, int src_stride, int width, + int height) { + if (width >= 8) { + return aom_var_2d_u16_8xh_neon(src, src_stride, width, height); + } + if (width >= 4) { + return aom_var_2d_u16_4xh_neon(src, src_stride, width, height); + } + return aom_var_2d_u16_c(src, src_stride, width, height); +} diff --git a/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c new file mode 100644 index 0000000000..44462a693c --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src, + int src_stride, int width, + int height) { + uint64_t sum = 0; + uint64_t sse = 0; + uint32x2_t sum_u32 = vdup_n_u32(0); + uint32x2_t sse_u32 = vdup_n_u32(0); + + int h = height / 2; + do { + int w = width; + uint8_t *src_ptr = src; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + + sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1)); + + sse_u32 = vdot_u32(sse_u32, s0, s0); + + src_ptr += 8; + w -= 8; + } while (w >= 8); + + // Process remaining columns in the row using C. + while (w > 0) { + int idx = width - w; + const uint8_t v = src[idx]; + sum += v; + sse += v * v; + w--; + } + + src += 2 * src_stride; + } while (--h != 0); + + sum += horizontal_long_add_u32x2(sum_u32); + sse += horizontal_long_add_u32x2(sse_u32); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src, + int src_stride, int width, + int height) { + uint64_t sum = 0; + uint64_t sse = 0; + uint32x2_t sum_u32 = vdup_n_u32(0); + uint32x2_t sse_u32 = vdup_n_u32(0); + + int h = height; + do { + int w = width; + uint8_t *src_ptr = src; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + + sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1)); + + sse_u32 = vdot_u32(sse_u32, s0, s0); + + src_ptr += 8; + w -= 8; + } while (w >= 8); + + // Process remaining columns in the row using C. 
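+      // The vector loop above leans on the Armv8.2 dot-product extension:
+      // vdot_u32(acc, s0, vdup_n_u8(1)) accumulates the plain pixel sum and
+      // vdot_u32(acc, s0, s0) the sum of squares, each widening four u8
+      // products directly into a 32-bit lane; the scalar loop below only
+      // mops up any leftover columns.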
+ while (w > 0) { + int idx = width - w; + const uint8_t v = src[idx]; + sum += v; + sse += v * v; + w--; + } + + src += src_stride; + } while (--h != 0); + + sum += horizontal_long_add_u32x2(sum_u32); + sse += horizontal_long_add_u32x2(sse_u32); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u8_16xh_neon_dotprod(uint8_t *src, + int src_stride, + int width, int height) { + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32 = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int h = height; + do { + int w = width; + uint8_t *src_ptr = src; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + + sum_u32 = vdotq_u32(sum_u32, s0, vdupq_n_u8(1)); + + sse_u32 = vdotq_u32(sse_u32, s0, s0); + + src_ptr += 16; + w -= 16; + } while (w >= 16); + + // Process remaining columns in the row using C. + while (w > 0) { + int idx = width - w; + const uint8_t v = src[idx]; + sum += v; + sse += v * v; + w--; + } + + src += src_stride; + } while (--h != 0); + + sum += horizontal_long_add_u32x4(sum_u32); + sse += horizontal_long_add_u32x4(sse_u32); + + return sse - sum * sum / (width * height); +} + +uint64_t aom_var_2d_u8_neon_dotprod(uint8_t *src, int src_stride, int width, + int height) { + if (width >= 16) { + return aom_var_2d_u8_16xh_neon_dotprod(src, src_stride, width, height); + } + if (width >= 8) { + return aom_var_2d_u8_8xh_neon_dotprod(src, src_stride, width, height); + } + if (width >= 4 && height % 2 == 0) { + return aom_var_2d_u8_4xh_neon_dotprod(src, src_stride, width, height); + } + return aom_var_2d_u8_c(src, src_stride, width, height); +} diff --git a/third_party/aom/aom_dsp/arm/sum_squares_sve.c b/third_party/aom/aom_dsp/arm/sum_squares_sve.c new file mode 100644 index 0000000000..724e43859e --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sum_squares_sve.c @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src, + int stride, int height) { + int64x2_t sum_squares = vdupq_n_s64(0); + + do { + int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride)); + + sum_squares = aom_sdotq_s16(sum_squares, s, s); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + return (uint64_t)vaddvq_s64(sum_squares); +} + +static INLINE uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src, + int stride, int height) { + int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int16x8_t s0 = vld1q_s16(src + 0 * stride); + int16x8_t s1 = vld1q_s16(src + 1 * stride); + + sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0); + sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]); + return (uint64_t)vaddvq_s64(sum_squares[0]); +} + +static INLINE uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src, + int stride, int width, + int height) { + int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + const int16_t *src_ptr = src; + int w = width; + do { + int16x8_t s0 = vld1q_s16(src_ptr); + int16x8_t s1 = vld1q_s16(src_ptr + 8); + + sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0); + sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1); + + src_ptr += 16; + w -= 16; + } while (w != 0); + + src += stride; + } while (--height != 0); + + sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]); + return (uint64_t)vaddvq_s64(sum_squares[0]); +} + +static INLINE uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src, + int stride, int width, + int height) { + svint64_t sum_squares = svdup_n_s64(0); + uint64_t step = svcnth(); + + do { + const int16_t *src_ptr = src; + int w = 0; + do { + svbool_t pred = svwhilelt_b16_u32(w, width); + svint16_t s0 = svld1_s16(pred, src_ptr); + + sum_squares = svdot_s64(sum_squares, s0, s0); + + src_ptr += step; + w += step; + } while (w < width); + + src += stride; + } while (--height != 0); + + return (uint64_t)svaddv_s64(svptrue_b64(), sum_squares); +} + +uint64_t aom_sum_squares_2d_i16_sve(const int16_t *src, int stride, int width, + int height) { + if (width == 4) { + return aom_sum_squares_2d_i16_4xh_sve(src, stride, height); + } + if (width == 8) { + return aom_sum_squares_2d_i16_8xh_sve(src, stride, height); + } + if (width % 16 == 0) { + return aom_sum_squares_2d_i16_large_sve(src, stride, width, height); + } + return aom_sum_squares_2d_i16_wxh_sve(src, stride, width, height); +} + +uint64_t aom_sum_squares_i16_sve(const int16_t *src, uint32_t n) { + // This function seems to be called only for values of N >= 64. See + // av1/encoder/compound_type.c. Additionally, because N = width x height for + // width and height between the standard block sizes, N will also be a + // multiple of 64. 
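+  // The fast path below therefore consumes 32 elements per iteration through
+  // four independent accumulators. aom_sdotq_s16() (the helper from
+  // aom_dsp/arm/dot_sve.h, assumed here to wrap the SVE SDOT instruction)
+  // multiply-accumulates eight int16 x int16 products into two int64 lanes
+  // per call, so no intermediate widening is needed before the final
+  // pairwise reduction.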
+ if (LIKELY(n % 64 == 0)) { + int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + int16x8_t s0 = vld1q_s16(src); + int16x8_t s1 = vld1q_s16(src + 8); + int16x8_t s2 = vld1q_s16(src + 16); + int16x8_t s3 = vld1q_s16(src + 24); + + sum[0] = aom_sdotq_s16(sum[0], s0, s0); + sum[1] = aom_sdotq_s16(sum[1], s1, s1); + sum[2] = aom_sdotq_s16(sum[2], s2, s2); + sum[3] = aom_sdotq_s16(sum[3], s3, s3); + + src += 32; + n -= 32; + } while (n != 0); + + sum[0] = vaddq_s64(sum[0], sum[1]); + sum[2] = vaddq_s64(sum[2], sum[3]); + sum[0] = vaddq_s64(sum[0], sum[2]); + return vaddvq_s64(sum[0]); + } + return aom_sum_squares_i16_c(src, n); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src, + int stride, int height, + int *sum) { + int64x2_t sse = vdupq_n_s64(0); + int32x4_t sum_s32 = vdupq_n_s32(0); + + do { + int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride)); + + sse = aom_sdotq_s16(sse, s, s); + + sum_s32 = vpadalq_s16(sum_s32, s); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + *sum += vaddvq_s32(sum_s32); + return vaddvq_s64(sse); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src, + int stride, int height, + int *sum) { + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + do { + int16x8_t s0 = vld1q_s16(src); + int16x8_t s1 = vld1q_s16(src + stride); + + sse[0] = aom_sdotq_s16(sse[0], s0, s0); + sse[1] = aom_sdotq_s16(sse[1], s1, s1); + + sum_acc[0] = vpadalq_s16(sum_acc[0], s0); + sum_acc[1] = vpadalq_s16(sum_acc[1], s1); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1])); + return vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src, + int stride, int width, + int height, int *sum) { + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + do { + int w = 0; + do { + int16x8_t s0 = vld1q_s16(src + w); + int16x8_t s1 = vld1q_s16(src + w + 8); + + sse[0] = aom_sdotq_s16(sse[0], s0, s0); + sse[1] = aom_sdotq_s16(sse[1], s1, s1); + + sum_acc[0] = vpadalq_s16(sum_acc[0], s0); + sum_acc[1] = vpadalq_s16(sum_acc[1], s1); + + w += 16; + } while (w < width); + + src += stride; + } while (--height != 0); + + *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1])); + return vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +uint64_t aom_sum_sse_2d_i16_sve(const int16_t *src, int stride, int width, + int height, int *sum) { + uint64_t sse; + + if (width == 4) { + sse = aom_sum_sse_2d_i16_4xh_sve(src, stride, height, sum); + } else if (width == 8) { + sse = aom_sum_sse_2d_i16_8xh_sve(src, stride, height, sum); + } else if (width % 16 == 0) { + sse = aom_sum_sse_2d_i16_16xh_sve(src, stride, width, height, sum); + } else { + sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum); + } + + return sse; +} + +static INLINE uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32 = vdupq_n_u32(0); + uint64x2_t sse_u64 = vdupq_n_u64(0); + + int h = height; + do { + uint16x8_t s0 = + vcombine_u16(vld1_u16(src_u16), vld1_u16(src_u16 + src_stride)); + + sum_u32 = vpadalq_u16(sum_u32, s0); + + sse_u64 = aom_udotq_u16(sse_u64, s0, s0); + + src_u16 += 2 * src_stride; + h -= 2; + } while (h != 0); + + 
sum += vaddlvq_u32(sum_u32); + sse += vaddvq_u64(sse_u64); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32 = vdupq_n_u32(0); + uint64x2_t sse_u64 = vdupq_n_u64(0); + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + + sum_u32 = vpadalq_u16(sum_u32, s0); + + sse_u64 = aom_udotq_u16(sse_u64, s0, s0); + + src_ptr += 8; + w -= 8; + } while (w != 0); + + src_u16 += src_stride; + } while (--h != 0); + + sum += vaddlvq_u32(sum_u32); + sse += vaddvq_u64(sse_u64); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 8); + + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + sum_u32[1] = vpadalq_u16(sum_u32[1], s1); + + sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0); + sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1); + + src_ptr += 16; + w -= 16; + } while (w != 0); + + src_u16 += src_stride; + } while (--h != 0); + + sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]); + sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]); + + sum += vaddlvq_u32(sum_u32[0]); + sse += vaddvq_u64(sse_u64[0]); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint64x2_t sse_u64[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 8); + uint16x8_t s2 = vld1q_u16(src_ptr + 16); + uint16x8_t s3 = vld1q_u16(src_ptr + 24); + + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + sum_u32[1] = vpadalq_u16(sum_u32[1], s1); + sum_u32[2] = vpadalq_u16(sum_u32[2], s2); + sum_u32[3] = vpadalq_u16(sum_u32[3], s3); + + sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0); + sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1); + sse_u64[2] = aom_udotq_u16(sse_u64[2], s2, s2); + sse_u64[3] = aom_udotq_u16(sse_u64[3], s3, s3); + + src_ptr += 32; + w -= 32; + } while (w != 0); + + src_u16 += src_stride; + } while (--h != 0); + + sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]); + sum_u32[2] = vaddq_u32(sum_u32[2], sum_u32[3]); + sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[2]); + sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]); + sse_u64[2] = vaddq_u64(sse_u64[2], sse_u64[3]); + sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[2]); + + sum += vaddlvq_u32(sum_u32[0]); + sse += vaddvq_u64(sse_u64[0]); + + return sse - sum * sum / (width * height); +} + +uint64_t aom_var_2d_u16_sve(uint8_t *src, int src_stride, int width, + int height) { + if (width == 4) { + return aom_var_2d_u16_4xh_sve(src, src_stride, width, height); + } + if (width == 8) { + return 
aom_var_2d_u16_8xh_sve(src, src_stride, width, height); + } + if (width == 16) { + return aom_var_2d_u16_16xh_sve(src, src_stride, width, height); + } + if (width % 32 == 0) { + return aom_var_2d_u16_large_sve(src, src_stride, width, height); + } + return aom_var_2d_u16_neon(src, src_stride, width, height); +} diff --git a/third_party/aom/aom_dsp/arm/transpose_neon.h b/third_party/aom/aom_dsp/arm/transpose_neon.h new file mode 100644 index 0000000000..8027018235 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/transpose_neon.h @@ -0,0 +1,1263 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_ +#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_ + +#include + +#include "aom/aom_integer.h" // For AOM_FORCE_INLINE. +#include "config/aom_config.h" + +static INLINE void transpose_elems_u8_8x8( + uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4, + uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1, + uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6, + uint8x8_t *o7) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 + // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 + // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 + // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 + + const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5)); + const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 + // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 + // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 + // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 + + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + + // Unzip 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c1.val[0])); + const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c1.val[1])); + + *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); + *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); + *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); + *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *o6 = 
vreinterpret_u8_u32(vget_low_u32(d1.val[1])); + *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); +} + +static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, + uint8x8_t *a2, uint8x8_t *a3, + uint8x8_t *a4, uint8x8_t *a5, + uint8x8_t *a6, + uint8x8_t *a7) { + transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3, + a4, a5, a6, a7); +} + +static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in, + uint8x8_t *out) { + transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], + &out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); +} + +static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x, + uint8x16_t *d) { + uint8x8x2_t w0 = vzip_u8(x[0], x[1]); + uint8x8x2_t w1 = vzip_u8(x[2], x[3]); + uint8x8x2_t w2 = vzip_u8(x[4], x[5]); + uint8x8x2_t w3 = vzip_u8(x[6], x[7]); + + uint8x8x2_t w8 = vzip_u8(x[8], x[9]); + uint8x8x2_t w9 = vzip_u8(x[10], x[11]); + uint8x8x2_t w10 = vzip_u8(x[12], x[13]); + uint8x8x2_t w11 = vzip_u8(x[14], x[15]); + + uint16x4x2_t w4 = + vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); + uint16x4x2_t w5 = + vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); + uint16x4x2_t w12 = + vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0])); + uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]), + vreinterpret_u16_u8(w11.val[0])); + + uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), + vreinterpret_u32_u16(w5.val[0])); + uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), + vreinterpret_u32_u16(w5.val[1])); + uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), + vreinterpret_u32_u16(w13.val[0])); + uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), + vreinterpret_u32_u16(w13.val[1])); + + // Store first 4-line result + d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0])); + d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1])); + d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0])); + d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1])); + + w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); + w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1])); + w12 = + vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1])); + w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]), + vreinterpret_u16_u8(w11.val[1])); + + w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), + vreinterpret_u32_u16(w5.val[0])); + w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), + vreinterpret_u32_u16(w5.val[1])); + w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), + vreinterpret_u32_u16(w13.val[0])); + w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), + vreinterpret_u32_u16(w13.val[1])); + + // Store second 4-line result + d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0])); + d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1])); + d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0])); + d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1])); +} + +static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x, + uint8x8_t *d) { + uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); + uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); + uint8x16x2_t w2 = vzipq_u8(x[4], x[5]); + uint8x16x2_t w3 = vzipq_u8(x[6], x[7]); + + uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), + vreinterpretq_u16_u8(w1.val[0])); + uint16x8x2_t w5 = 
vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), + vreinterpretq_u16_u8(w3.val[0])); + uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), + vreinterpretq_u16_u8(w1.val[1])); + uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), + vreinterpretq_u16_u8(w3.val[1])); + + uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), + vreinterpretq_u32_u16(w5.val[0])); + uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]), + vreinterpretq_u32_u16(w7.val[0])); + uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), + vreinterpretq_u32_u16(w5.val[1])); + uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]), + vreinterpretq_u32_u16(w7.val[1])); + + d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0])); + d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0])); + d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1])); + d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1])); + d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0])); + d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0])); + d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1])); + d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1])); + d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0])); + d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0])); + d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1])); + d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1])); + d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0])); + d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0])); + d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1])); + d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1])); +} + +static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { + uint16x8x2_t b0; +#if AOM_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else + b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), + vreinterpret_u16_u32(vget_low_u32(a1))); + b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), + vreinterpret_u16_u32(vget_high_u32(a1))); +#endif + return b0; +} + +static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x, + uint8x16_t *d) { + uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); + uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); + uint8x16x2_t w2 = vzipq_u8(x[4], x[5]); + uint8x16x2_t w3 = vzipq_u8(x[6], x[7]); + + uint8x16x2_t w4 = vzipq_u8(x[8], x[9]); + uint8x16x2_t w5 = vzipq_u8(x[10], x[11]); + uint8x16x2_t w6 = vzipq_u8(x[12], x[13]); + uint8x16x2_t w7 = vzipq_u8(x[14], x[15]); + + uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), + vreinterpretq_u16_u8(w1.val[0])); + uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), + vreinterpretq_u16_u8(w3.val[0])); + uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]), + vreinterpretq_u16_u8(w5.val[0])); + uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]), + vreinterpretq_u16_u8(w7.val[0])); + + uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), + vreinterpretq_u32_u16(w9.val[0])); + uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), + vreinterpretq_u32_u16(w11.val[0])); + uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), + vreinterpretq_u32_u16(w9.val[1])); + uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), + vreinterpretq_u32_u16(w11.val[1])); + + uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]); + d[0] = vreinterpretq_u8_u16(d01.val[0]); + d[1] = 
vreinterpretq_u8_u16(d01.val[1]); + uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]); + d[2] = vreinterpretq_u8_u16(d23.val[0]); + d[3] = vreinterpretq_u8_u16(d23.val[1]); + uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]); + d[4] = vreinterpretq_u8_u16(d45.val[0]); + d[5] = vreinterpretq_u8_u16(d45.val[1]); + uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]); + d[6] = vreinterpretq_u8_u16(d67.val[0]); + d[7] = vreinterpretq_u8_u16(d67.val[1]); + + // upper half + w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), + vreinterpretq_u16_u8(w1.val[1])); + w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), + vreinterpretq_u16_u8(w3.val[1])); + w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]), + vreinterpretq_u16_u8(w5.val[1])); + w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]), + vreinterpretq_u16_u8(w7.val[1])); + + w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), + vreinterpretq_u32_u16(w9.val[0])); + w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), + vreinterpretq_u32_u16(w11.val[0])); + w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), + vreinterpretq_u32_u16(w9.val[1])); + w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), + vreinterpretq_u32_u16(w11.val[1])); + + d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]); + d[8] = vreinterpretq_u8_u16(d01.val[0]); + d[9] = vreinterpretq_u8_u16(d01.val[1]); + d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]); + d[10] = vreinterpretq_u8_u16(d23.val[0]); + d[11] = vreinterpretq_u8_u16(d23.val[1]); + d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]); + d[12] = vreinterpretq_u8_u16(d45.val[0]); + d[13] = vreinterpretq_u8_u16(d45.val[1]); + d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]); + d[14] = vreinterpretq_u8_u16(d67.val[0]); + d[15] = vreinterpretq_u8_u16(d67.val[1]); +} + +static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x, + uint8x16_t *d) { + uint8x16_t x2[32]; + for (int i = 0; i < 16; ++i) { + x2[i] = x[i].val[0]; + x2[i + 16] = x[i].val[1]; + } + transpose_arrays_u8_16x16(x2, d); + transpose_arrays_u8_16x16(x2 + 16, d + 16); +} + +static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, + uint8x8_t *a2, + uint8x8_t *a3) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint8x8x2_t b0 = vtrn_u8(*a0, *a1); + const uint8x8x2_t b1 = vtrn_u8(*a2, *a3); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint16x4x2_t c0 = + vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); + const uint16x4x2_t c1 = + vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); + + *a0 = vreinterpret_u8_u16(c0.val[0]); + *a1 = vreinterpret_u8_u16(c1.val[0]); + *a2 = vreinterpret_u8_u16(c0.val[1]); + *a3 = vreinterpret_u8_u16(c1.val[1]); +} + +static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0, + uint8x8_t *a1) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint16x4x2_t b0 = + vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b0.val[1])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x8_t a4, uint8x8_t a5, + uint8x8_t a6, uint8x8_t a7, + uint8x8_t *o0, uint8x8_t *o1, + uint8x8_t *o2, uint8x8_t *o3) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *o0 = d0.val[0]; + *o1 = d0.val[1]; + *o2 = d1.val[0]; + *o3 = d1.val[1]; +} + +static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) { + // Input: + // 00 01 02 03 + // 10 11 12 13 + // 20 21 22 23 + // 30 31 32 33 + + // b: + // 00 10 02 12 + // 01 11 03 13 + const uint16x4x2_t b = vtrn_u16(a[0], a[1]); + // c: + // 20 30 22 32 + // 21 31 23 33 + const uint16x4x2_t c = vtrn_u16(a[2], a[3]); + // d: + // 00 10 20 30 + // 02 12 22 32 + const uint32x2x2_t d = + vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0])); + // e: + // 01 11 21 31 + // 03 13 23 33 + const uint32x2x2_t e = + vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1])); + + // Output: + // 00 10 20 30 + // 01 11 21 31 + // 02 12 22 32 + // 03 13 23 33 + a[0] = vreinterpret_u16_u32(d.val[0]); + a[1] = vreinterpret_u16_u32(e.val[0]); + a[2] = vreinterpret_u16_u32(d.val[1]); + a[3] = 
vreinterpret_u16_u32(e.val[1]); +} + +static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) { + // 4x8 Input: + // a[0]: 00 01 02 03 04 05 06 07 + // a[1]: 10 11 12 13 14 15 16 17 + // a[2]: 20 21 22 23 24 25 26 27 + // a[3]: 30 31 32 33 34 35 36 37 + + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]); + const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]); + + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + + // 8x4 Output: + // a[0]: 00 10 20 30 04 14 24 34 + // a[1]: 01 11 21 31 05 15 25 35 + // a[2]: 02 12 22 32 06 16 26 36 + // a[3]: 03 13 23 33 07 17 27 37 + a[0] = vreinterpretq_u16_u32(c0.val[0]); + a[1] = vreinterpretq_u16_u32(c1.val[0]); + a[2] = vreinterpretq_u16_u32(c0.val[1]); + a[3] = vreinterpretq_u16_u32(c1.val[1]); +} + +// Special transpose for loop filter. +// 4x8 Input: +// p_q: p3 p2 p1 p0 q0 q1 q2 q3 +// a[0]: 00 01 02 03 04 05 06 07 +// a[1]: 10 11 12 13 14 15 16 17 +// a[2]: 20 21 22 23 24 25 26 27 +// a[3]: 30 31 32 33 34 35 36 37 +// 8x4 Output: +// a[0]: 03 13 23 33 04 14 24 34 p0q0 +// a[1]: 02 12 22 32 05 15 25 35 p1q1 +// a[2]: 01 11 21 31 06 16 26 36 p2q2 +// a[3]: 00 10 20 30 07 17 27 37 p3q3 +// Direct reapplication of the function will reset the high halves, but +// reverse the low halves: +// p_q: p0 p1 p2 p3 q0 q1 q2 q3 +// a[0]: 33 32 31 30 04 05 06 07 +// a[1]: 23 22 21 20 14 15 16 17 +// a[2]: 13 12 11 10 24 25 26 27 +// a[3]: 03 02 01 00 34 35 36 37 +// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but +// reverse the high halves. +// The standard transpose_u16_4x8q will produce the same reversals, but with the +// order of the low halves also restored relative to the high halves. This is +// preferable because it puts all values from the same source row back together, +// but some post-processing is inevitable. +static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) { + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]); + const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]); + + // Reverse odd vectors to bring the appropriate items to the front of zips. + // b0.val[0]: 00 10 02 12 04 14 06 16 + // r0 : 03 13 01 11 07 17 05 15 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // r1 : 23 33 21 31 27 37 25 35 + const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1])); + const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1])); + + // Zip to complete the halves. 
+ // c0.val[0]: 00 10 20 30 02 12 22 32 p3p1 + // c0.val[1]: 04 14 24 34 06 16 26 36 q0q2 + // c1.val[0]: 03 13 23 33 01 11 21 31 p0p2 + // c1.val[1]: 07 17 27 37 05 15 25 35 q3q1 + const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vzipq_u32(r0, r1); + + // d0.val[0]: 00 10 20 30 07 17 27 37 p3q3 + // d0.val[1]: 02 12 22 32 05 15 25 35 p1q1 + // d1.val[0]: 03 13 23 33 04 14 24 34 p0q0 + // d1.val[1]: 01 11 21 31 06 16 26 36 p2q2 + const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]); + // The third row of c comes first here to swap p2 with q0. + const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]); + + // 8x4 Output: + // a[0]: 03 13 23 33 04 14 24 34 p0q0 + // a[1]: 02 12 22 32 05 15 25 35 p1q1 + // a[2]: 01 11 21 31 06 16 26 36 p2q2 + // a[3]: 00 10 20 30 07 17 27 37 p3q3 + a[0] = d1.val[0]; // p0q0 + a[1] = d0.val[1]; // p1q1 + a[2] = d1.val[1]; // p2q2 + a[3] = d0.val[0]; // p3q3 +} + +static INLINE void transpose_elems_u16_4x8( + const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2, + const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5, + const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1, + uint16x8_t *o2, uint16x8_t *o3) { + // Combine rows. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0: 00 01 02 03 40 41 42 43 + // b1: 10 11 12 13 50 51 52 53 + // b2: 20 21 22 23 60 61 62 63 + // b3: 30 31 32 33 70 71 72 73 + + const uint16x8_t b0 = vcombine_u16(a0, a4); + const uint16x8_t b1 = vcombine_u16(a1, a5); + const uint16x8_t b2 = vcombine_u16(a2, a6); + const uint16x8_t b3 = vcombine_u16(a3, a7); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 02 12 40 50 42 52 + // c0.val[1]: 01 11 03 13 41 51 43 53 + // c1.val[0]: 20 30 22 32 60 70 62 72 + // c1.val[1]: 21 31 23 33 61 71 63 73 + + const uint16x8x2_t c0 = vtrnq_u16(b0, b1); + const uint16x8x2_t c1 = vtrnq_u16(b2, b3); + + // Swap 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 02 12 22 32 42 52 62 72 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c1.val[0])); + const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c1.val[1])); + + *o0 = vreinterpretq_u16_u32(d0.val[0]); + *o1 = vreinterpretq_u16_u32(d1.val[0]); + *o2 = vreinterpretq_u16_u32(d0.val[1]); + *o3 = vreinterpretq_u16_u32(d1.val[1]); +} + +static INLINE void transpose_elems_s16_4x8( + const int16x4_t a0, const int16x4_t a1, const int16x4_t a2, + const int16x4_t a3, const int16x4_t a4, const int16x4_t a5, + const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1, + int16x8_t *o2, int16x8_t *o3) { + // Combine rows. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0: 00 01 02 03 40 41 42 43 + // b1: 10 11 12 13 50 51 52 53 + // b2: 20 21 22 23 60 61 62 63 + // b3: 30 31 32 33 70 71 72 73 + + const int16x8_t b0 = vcombine_s16(a0, a4); + const int16x8_t b1 = vcombine_s16(a1, a5); + const int16x8_t b2 = vcombine_s16(a2, a6); + const int16x8_t b3 = vcombine_s16(a3, a7); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 02 12 40 50 42 52 + // c0.val[1]: 01 11 03 13 41 51 43 53 + // c1.val[0]: 20 30 22 32 60 70 62 72 + // c1.val[1]: 21 31 23 33 61 71 63 73 + + const int16x8x2_t c0 = vtrnq_s16(b0, b1); + const int16x8x2_t c1 = vtrnq_s16(b2, b3); + + // Swap 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 02 12 22 32 42 52 62 72 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + + *o0 = vreinterpretq_s16_s32(d0.val[0]); + *o1 = vreinterpretq_s16_s32(d1.val[0]); + *o2 = vreinterpretq_s16_s32(d0.val[1]); + *o3 = vreinterpretq_s16_s32(d1.val[1]); +} + +static INLINE void transpose_elems_inplace_u16_8x8( + uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3, + uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5); + const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), + vreinterpretq_u32_u16(b3.val[0])); + const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), + vreinterpretq_u32_u16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + 
const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); + const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); + const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); + const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]); + + *a0 = d0.val[0]; + *a1 = d1.val[0]; + *a2 = d2.val[0]; + *a3 = d3.val[0]; + *a4 = d0.val[1]; + *a5 = d1.val[1]; + *a6 = d2.val[1]; + *a7 = d3.val[1]; +} + +static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { + int16x8x2_t b0; +#if AOM_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s16_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s16_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else + b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), + vreinterpret_s16_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), + vreinterpret_s16_s32(vget_high_s32(a1))); +#endif + return b0; +} + +static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, + int16x8_t *a7) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); + const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); + const int16x8x2_t b2 = vtrnq_s16(*a4, *a5); + const int16x8x2_t b3 = vtrnq_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + *a0 = d0.val[0]; + *a1 = d1.val[0]; + *a2 = d2.val[0]; + *a3 = d3.val[0]; + *a4 = d0.val[1]; + *a5 = 
d1.val[1]; + *a6 = d2.val[1]; + *a7 = d3.val[1]; +} + +static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a, + int16x8_t *out) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; +} + +static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0, + uint16x4_t *a1, + uint16x4_t *a2, + uint16x4_t *a3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const uint16x4x2_t b0 = vtrn_u16(*a0, *a1); + const uint16x4x2_t b1 = vtrn_u16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b1.val[0])); + const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]), + vreinterpret_u32_u16(b1.val[1])); + + *a0 = vreinterpret_u16_u32(c0.val[0]); + *a1 = vreinterpret_u16_u32(c1.val[0]); + *a2 = vreinterpret_u16_u32(c0.val[1]); + *a3 = vreinterpret_u16_u32(c1.val[1]); +} + +static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, + int16x4_t *a3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int16x4x2_t b0 = vtrn_s16(*a0, *a1); + const int16x4x2_t b1 = vtrn_s16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + + *a0 = vreinterpret_s16_s32(c0.val[0]); + *a1 = vreinterpret_s16_s32(c1.val[0]); + *a2 = vreinterpret_s16_s32(c0.val[1]); + *a3 = vreinterpret_s16_s32(c1.val[1]); +} + +static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { + int32x4x2_t b0; +#if AOM_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s32_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s32_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else + b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); + b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); +#endif + return b0; +} + +static INLINE void transpose_elems_s32_4x4(const int32x4_t a0, + const int32x4_t a1, + const int32x4_t a2, + const int32x4_t a3, int32x4_t *o0, + int32x4_t *o1, int32x4_t *o2, + int32x4_t *o3) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int32x4x2_t b0 = vtrnq_s32(a0, a1); + const int32x4x2_t b1 = vtrnq_s32(a2, a3); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + + *o0 = c0.val[0]; + *o1 = c1.val[0]; + *o2 = c0.val[1]; + *o3 = c1.val[1]; +} + +static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1, + int32x4_t *a2, + int32x4_t *a3) { + transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3); +} + +static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in, + int32x4_t *out) { + transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2], + &out[3]); +} + +static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in, + int32x4_t *out, + const int width, + const int height) { + const int h = height >> 2; + const int w = width >> 2; + for (int j = 0; j < w; j++) { + for (int i = 0; i < h; i++) { + transpose_arrays_s32_4x4(in + j * height + i * 4, + out + i * width + j * 4); + } + } +} + +#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h) \ + static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \ + const int32x4_t *in, int32x4_t *out) { \ + transpose_arrays_s32_4nx4n(in, out, w, h); \ + } + +TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8) +TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16) +TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4) +TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8) +TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16) +TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32) +TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8) +TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16) +TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32) +TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64) +TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8) +TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16) +TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32) +TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64) +TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16) +TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32) + +#undef TRANSPOSE_ARRAYS_S32_WXH_NEON + +static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) { +#if AOM_ARCH_AARCH64 + return vtrn1q_s64(a, b); +#else + return vcombine_s64(vget_low_s64(a), vget_low_s64(b)); +#endif +} + +static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) { +#if AOM_ARCH_AARCH64 + return vtrn2q_s64(a, b); +#else + return vcombine_s64(vget_high_s64(a), vget_high_s64(b)); +#endif +} + +static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1, + int32x4_t a2, int32x4_t a3, + int32x4_t a4, int32x4_t a5, + int32x4_t a6, int32x4_t a7, + int32x4x2_t *o0, int32x4x2_t *o1, + int32x4x2_t *o2, int32x4x2_t *o3) { + // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4 + // matrix transpose implementation: + // [ A ]^T => [ A^T B^T ] + // [ B ] + + transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3); // A^T + transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7); // B^T + + o0->val[0] = a0; + o1->val[0] = a1; + o2->val[0] = a2; + o3->val[0] = a3; + + o0->val[1] = a4; + o1->val[1] = a5; + o2->val[1] = a6; + o3->val[1] = a7; +} + +static INLINE void transpose_elems_inplace_s32_8x8( + int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3, + int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) { + // Perform an 8 x 8 
matrix transpose by building on top of the existing 4 x 4 + // matrix transpose implementation: + // [ A B ]^T => [ A^T C^T ] + // [ C D ] [ B^T D^T ] + + int32x4_t q0_v1 = a0->val[0]; + int32x4_t q0_v2 = a1->val[0]; + int32x4_t q0_v3 = a2->val[0]; + int32x4_t q0_v4 = a3->val[0]; + + int32x4_t q1_v1 = a0->val[1]; + int32x4_t q1_v2 = a1->val[1]; + int32x4_t q1_v3 = a2->val[1]; + int32x4_t q1_v4 = a3->val[1]; + + int32x4_t q2_v1 = a4->val[0]; + int32x4_t q2_v2 = a5->val[0]; + int32x4_t q2_v3 = a6->val[0]; + int32x4_t q2_v4 = a7->val[0]; + + int32x4_t q3_v1 = a4->val[1]; + int32x4_t q3_v2 = a5->val[1]; + int32x4_t q3_v3 = a6->val[1]; + int32x4_t q3_v4 = a7->val[1]; + + transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4); // A^T + transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4); // B^T + transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4); // C^T + transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4); // D^T + + a0->val[0] = q0_v1; + a1->val[0] = q0_v2; + a2->val[0] = q0_v3; + a3->val[0] = q0_v4; + + a0->val[1] = q2_v1; + a1->val[1] = q2_v2; + a2->val[1] = q2_v3; + a3->val[1] = q2_v4; + + a4->val[0] = q1_v1; + a5->val[0] = q1_v2; + a6->val[0] = q1_v3; + a7->val[0] = q1_v4; + + a4->val[1] = q3_v1; + a5->val[1] = q3_v2; + a6->val[1] = q3_v3; + a7->val[1] = q3_v4; +} + +static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in, + int16x4_t *const out) { + int16x4_t a0 = in[0]; + int16x4_t a1 = in[1]; + int16x4_t a2 = in[2]; + int16x4_t a3 = in[3]; + + transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3); + + out[0] = a0; + out[1] = a1; + out[2] = a2; + out[3] = a3; +} + +static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in, + int16x8_t *const out) { +#if AOM_ARCH_AARCH64 + const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)), + vcombine_s16(in[1], vdup_n_s16(0))); + const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)), + vcombine_s16(in[3], vdup_n_s16(0))); + const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)), + vcombine_s16(in[5], vdup_n_s16(0))); + const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)), + vcombine_s16(in[7], vdup_n_s16(0))); +#else + int16x4x2_t temp; + temp = vzip_s16(in[0], in[1]); + const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]); + temp = vzip_s16(in[2], in[3]); + const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]); + temp = vzip_s16(in[4], in[5]); + const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]); + temp = vzip_s16(in[6], in[7]); + const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]); +#endif + + const int32x4x2_t b02 = + vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1)); + const int32x4x2_t b13 = + vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3)); + +#if AOM_ARCH_AARCH64 + out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]), + vreinterpretq_s64_s32(b13.val[0]))); + out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]), + vreinterpretq_s64_s32(b13.val[0]))); + out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]), + vreinterpretq_s64_s32(b13.val[1]))); + out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]), + vreinterpretq_s64_s32(b13.val[1]))); +#else + out[0] = vreinterpretq_s16_s32( + vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2)); + out[2] = vreinterpretq_s16_s32( + vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2)); + out[1] = vreinterpretq_s16_s32( 
+      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
+  out[3] = vreinterpretq_s16_s32(
+      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
+#endif
+}
+
+static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in,
+                                            int16x4_t *const out) {
+  // Swap 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03 04 05 06 07
+  // in[1]: 10 11 12 13 14 15 16 17
+  // in[2]: 20 21 22 23 24 25 26 27
+  // in[3]: 30 31 32 33 34 35 36 37
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+
+  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 02 12 22 32 06 16 26 36
+  // c1.val[0]: 01 11 21 31 05 15 25 35
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+
+  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
+                                    vreinterpretq_u32_s16(b1.val[0]));
+  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
+                                    vreinterpretq_u32_s16(b1.val[1]));
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+
+  out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
+  out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
+  out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
+  out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
+  out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
+  out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
+  out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
+  out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
+}
+
+#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
new file mode 100644
index 0000000000..9e4e8c0cf0
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
+                                     const uint8_t *ref, int ref_stride, int h,
+                                     uint32_t *sse, int *sum) {
+  int16x8_t sum_s16 = vdupq_n_s16(0);
+  int32x4_t sse_s32 = vdupq_n_s32(0);
+
+  // Number of rows we can process before 'sum_s16' overflows:
+  // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows.
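+  // Each loop iteration below covers two 4-wide rows and adds at most one
+  // 255-magnitude difference to each of the 8 lanes, hence the limit of 128
+  // iterations (256 rows) asserted here.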
+ assert(h <= 256); + + int i = h; + do { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_s16x8(sum_s16); + *sse = (uint32_t)horizontal_add_s32x4(sse_s32); +} + +static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128 + assert(h <= 128); + + int i = h; + do { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_s16x8(sum_s16); + *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // Number of rows we can process before 'sum_s16' accumulators overflow: + // 32767 / 255 ~= 128, so 128 16-wide rows. + assert(h <= 128); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_s16x8(vaddq_s16(sum_s16[0], sum_s16[1])); + *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +static INLINE void variance_large_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int w, int h, int h_limit, uint32_t *sse, + int *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit + // accumulator overflows. After hitting this limit we accumulate into 32-bit + // elements. + int h_tmp = h > h_limit ? 
h_limit : h; + + int i = 0; + do { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src + j); + uint8x16_t r = vld1q_u8(ref + j); + + int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + j += 16; + } while (j < w); + + src += src_stride; + ref += ref_stride; + i++; + } while (i < h_tmp); + + sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]); + sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]); + + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_s32x4(sum_s32); + *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); +} + +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); +} + +static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int h, uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 128, h, 16, sse, sum); +} + +#define VARIANCE_WXH_NEON(w, h, shift) \ + unsigned int aom_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } + +VARIANCE_WXH_NEON(4, 4, 4) +VARIANCE_WXH_NEON(4, 8, 5) +VARIANCE_WXH_NEON(4, 16, 6) + +VARIANCE_WXH_NEON(8, 4, 5) +VARIANCE_WXH_NEON(8, 8, 6) +VARIANCE_WXH_NEON(8, 16, 7) +VARIANCE_WXH_NEON(8, 32, 8) + +VARIANCE_WXH_NEON(16, 4, 6) +VARIANCE_WXH_NEON(16, 8, 7) +VARIANCE_WXH_NEON(16, 16, 8) +VARIANCE_WXH_NEON(16, 32, 9) +VARIANCE_WXH_NEON(16, 64, 10) + +VARIANCE_WXH_NEON(32, 8, 8) +VARIANCE_WXH_NEON(32, 16, 9) +VARIANCE_WXH_NEON(32, 32, 10) +VARIANCE_WXH_NEON(32, 64, 11) + +VARIANCE_WXH_NEON(64, 16, 10) +VARIANCE_WXH_NEON(64, 32, 11) +VARIANCE_WXH_NEON(64, 64, 12) +VARIANCE_WXH_NEON(64, 128, 13) + +VARIANCE_WXH_NEON(128, 64, 13) +VARIANCE_WXH_NEON(128, 128, 14) + +#undef VARIANCE_WXH_NEON + +// TODO(yunqingwang): Perform variance of two/four 8x8 blocks similar to that of +// AVX2. Also, implement the NEON for variance computation present in this +// function. +void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32_t *sse8x8, int *sum8x8, + unsigned int *tot_sse, int *tot_sum, + uint32_t *var8x8) { + // Loop over four 8x8 blocks. Process one 8x32 block. 
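+  // Each sub-block k starts 8 pixels further along the same rows. Its SSE and
+  // sum land in sse8x8[k] / sum8x8[k] and are combined below using
+  // var = sse - sum^2 / 64; e.g. a constant difference of 16 everywhere gives
+  // sse = 64 * 256 = 16384, sum = 1024 and therefore var = 0.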
+ for (int k = 0; k < 4; k++) { + variance_8xh_neon(src + (k * 8), src_stride, ref + (k * 8), ref_stride, 8, + &sse8x8[k], &sum8x8[k]); + } + + *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3]; + *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3]; + for (int i = 0; i < 4; i++) { + var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6); + } +} + +void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32_t *sse16x16, + unsigned int *tot_sse, int *tot_sum, + uint32_t *var16x16) { + int sum16x16[2] = { 0 }; + // Loop over two 16x16 blocks. Process one 16x32 block. + for (int k = 0; k < 2; k++) { + variance_16xh_neon(src + (k * 16), src_stride, ref + (k * 16), ref_stride, + 16, &sse16x16[k], &sum16x16[k]); + } + + *tot_sse += sse16x16[0] + sse16x16[1]; + *tot_sum += sum16x16[0] + sum16x16[1]; + for (int i = 0; i < 2; i++) { + var16x16[i] = + sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8); + } +} + +static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int h) { + uint8x8_t s[2], r[2]; + int16x4_t diff_lo[2], diff_hi[2]; + uint16x8_t diff[2]; + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + int i = h; + do { + s[0] = vld1_u8(src); + src += src_stride; + s[1] = vld1_u8(src); + src += src_stride; + r[0] = vld1_u8(ref); + ref += ref_stride; + r[1] = vld1_u8(ref); + ref += ref_stride; + + diff[0] = vsubl_u8(s[0], r[0]); + diff[1] = vsubl_u8(s[1], r[1]); + + diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); + diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); + sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]); + sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]); + + diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); + diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); + sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]); + sse_s32[1] = vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]); + + i -= 2; + } while (i != 0); + + sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]); + + *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0])); + return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0])); +} + +static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int h) { + uint8x16_t s[2], r[2]; + int16x4_t diff_lo[4], diff_hi[4]; + uint16x8_t diff[4]; + int32x4_t sse_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + int i = h; + do { + s[0] = vld1q_u8(src); + src += src_stride; + s[1] = vld1q_u8(src); + src += src_stride; + r[0] = vld1q_u8(ref); + ref += ref_stride; + r[1] = vld1q_u8(ref); + ref += ref_stride; + + diff[0] = vsubl_u8(vget_low_u8(s[0]), vget_low_u8(r[0])); + diff[1] = vsubl_u8(vget_high_u8(s[0]), vget_high_u8(r[0])); + diff[2] = vsubl_u8(vget_low_u8(s[1]), vget_low_u8(r[1])); + diff[3] = vsubl_u8(vget_high_u8(s[1]), vget_high_u8(r[1])); + + diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); + diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); + sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]); + sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]); + + diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2])); + diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3])); + sse_s32[2] = vmlal_s16(sse_s32[2], diff_lo[2], diff_lo[2]); + sse_s32[3] = vmlal_s16(sse_s32[3], diff_lo[3], 
diff_lo[3]); + + diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); + diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); + sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]); + sse_s32[1] = vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]); + + diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2])); + diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3])); + sse_s32[2] = vmlal_s16(sse_s32[2], diff_hi[2], diff_hi[2]); + sse_s32[3] = vmlal_s16(sse_s32[3], diff_hi[3], diff_hi[3]); + + i -= 2; + } while (i != 0); + + sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]); + sse_s32[2] = vaddq_s32(sse_s32[2], sse_s32[3]); + sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[2]); + + *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0])); + return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0])); +} + +#define MSE_WXH_NEON(w, h) \ + unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + return mse##w##xh_neon(src, src_stride, ref, ref_stride, sse, h); \ + } + +MSE_WXH_NEON(8, 8) +MSE_WXH_NEON(8, 16) + +MSE_WXH_NEON(16, 8) +MSE_WXH_NEON(16, 16) + +#undef MSE_WXH_NEON + +static INLINE uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum, + uint16x8_t s0, uint16x8_t s1, + uint8x8_t d0, uint8x8_t d1) { + int16x8_t e0 = vreinterpretq_s16_u16(vsubw_u8(s0, d0)); + int16x8_t e1 = vreinterpretq_s16_u16(vsubw_u8(s1, d1)); + + int32x4_t mse = vmull_s16(vget_low_s16(e0), vget_low_s16(e0)); + mse = vmlal_s16(mse, vget_high_s16(e0), vget_high_s16(e0)); + mse = vmlal_s16(mse, vget_low_s16(e1), vget_low_s16(e1)); + mse = vmlal_s16(mse, vget_high_s16(e1), vget_high_s16(e1)); + + return vpadalq_u32(sum, vreinterpretq_u32_s32(mse)); +} + +static uint64x2_t mse_wxh_16bit(uint8_t *dst, int dstride, const uint16_t *src, + int sstride, int w, int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4)); + + uint64x2_t sum = vdupq_n_u64(0); + + if (w == 8) { + do { + uint8x8_t d0 = vld1_u8(dst + 0 * dstride); + uint8x8_t d1 = vld1_u8(dst + 1 * dstride); + uint16x8_t s0 = vld1q_u16(src + 0 * sstride); + uint16x8_t s1 = vld1q_u16(src + 1 * sstride); + + sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1); + + dst += 2 * dstride; + src += 2 * sstride; + h -= 2; + } while (h != 0); + } else { + do { + uint8x8_t d0 = load_unaligned_u8_4x2(dst + 0 * dstride, dstride); + uint8x8_t d1 = load_unaligned_u8_4x2(dst + 2 * dstride, dstride); + uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride); + uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride); + + sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1); + + dst += 4 * dstride; + src += 4 * sstride; + h -= 4; + } while (h != 0); + } + + return sum; +} + +// Computes mse for a given block size. This function gets called for specific +// block sizes, which are 8x8, 8x4, 4x8 and 4x4. 
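+// 'dst' holds 8-bit pixels and 'src' holds 16-bit pixels; mse_wxh_16bit()
+// above accumulates the squared differences into a pair of 64-bit lanes that
+// is reduced to a single scalar here.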
+uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+                                int sstride, int w, int h) {
+  return horizontal_add_u64x2(mse_wxh_16bit(dst, dstride, src, sstride, w, h));
+}
+
+uint32_t aom_get_mb_ss_neon(const int16_t *a) {
+  int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+  for (int i = 0; i < 256; i = i + 8) {
+    int16x8_t a_s16 = vld1q_s16(a + i);
+
+    sse[0] = vmlal_s16(sse[0], vget_low_s16(a_s16), vget_low_s16(a_s16));
+    sse[1] = vmlal_s16(sse[1], vget_high_s16(a_s16), vget_high_s16(a_s16));
+  }
+
+  return horizontal_add_s32x4(vaddq_s32(sse[0], sse[1]));
+}
+
+uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+                                 int w, int h) {
+  uint64x2_t sum = vdupq_n_u64(0);
+
+  int num_blks = 16 / w;
+  do {
+    sum = vaddq_u64(sum, mse_wxh_16bit(dst, dstride, src, w, w, h));
+    dst += w;
+    src += w * h;
+  } while (--num_blks != 0);
+
+  return horizontal_add_u64x2(sum);
+}
diff --git a/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c b/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c
new file mode 100644
index 0000000000..9fb52e1df7
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void variance_4xh_neon_dotprod(const uint8_t *src,
+                                             int src_stride,
+                                             const uint8_t *ref,
+                                             int ref_stride, int h,
+                                             uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint8x16_t s = load_unaligned_u8q(src, src_stride);
+    uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src += 4 * src_stride;
+    ref += 4 * ref_stride;
+    i -= 4;
+  } while (i != 0);
+
+  int32x4_t sum_diff = vsubq_s32(vreinterpretq_s32_u32(src_sum),
+                                 vreinterpretq_s32_u32(ref_sum));
+  *sum = horizontal_add_s32x4(sum_diff);
+  *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_8xh_neon_dotprod(const uint8_t *src,
+                                             int src_stride,
+                                             const uint8_t *ref,
+                                             int ref_stride, int h,
+                                             uint32_t *sse, int *sum) {
+  uint32x4_t src_sum = vdupq_n_u32(0);
+  uint32x4_t ref_sum = vdupq_n_u32(0);
+  uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  int i = h;
+  do {
+    uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+    uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+    src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+    ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+    uint8x16_t abs_diff = vabdq_u8(s, r);
+    sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+    i -= 2;
+  } while (i != 0);
+
+  int32x4_t sum_diff =
+
vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum)); + *sum = horizontal_add_s32x4(sum_diff); + *sse = horizontal_add_u32x4(sse_u32); +} + +static INLINE void variance_16xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + int32x4_t sum_diff = + vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum)); + *sum = horizontal_add_s32x4(sum_diff); + *sse = horizontal_add_u32x4(sse_u32); +} + +static INLINE void variance_large_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src + j); + uint8x16_t r = vld1q_u8(ref + j); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + j += 16; + } while (j < w); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + int32x4_t sum_diff = + vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum)); + *sum = horizontal_add_s32x4(sum_diff); + *sse = horizontal_add_u32x4(sse_u32); +} + +static INLINE void variance_32xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse, + sum); +} + +static INLINE void variance_64xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse, + sum); +} + +static INLINE void variance_128xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 128, h, sse, + sum); +} + +#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \ + unsigned int aom_variance##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } + +VARIANCE_WXH_NEON_DOTPROD(4, 4, 4) +VARIANCE_WXH_NEON_DOTPROD(4, 8, 5) +VARIANCE_WXH_NEON_DOTPROD(4, 16, 6) + +VARIANCE_WXH_NEON_DOTPROD(8, 4, 5) +VARIANCE_WXH_NEON_DOTPROD(8, 8, 6) +VARIANCE_WXH_NEON_DOTPROD(8, 16, 7) +VARIANCE_WXH_NEON_DOTPROD(8, 32, 8) + +VARIANCE_WXH_NEON_DOTPROD(16, 4, 6) +VARIANCE_WXH_NEON_DOTPROD(16, 8, 7) +VARIANCE_WXH_NEON_DOTPROD(16, 16, 8) +VARIANCE_WXH_NEON_DOTPROD(16, 32, 9) +VARIANCE_WXH_NEON_DOTPROD(16, 64, 10) + +VARIANCE_WXH_NEON_DOTPROD(32, 8, 8) +VARIANCE_WXH_NEON_DOTPROD(32, 16, 9) +VARIANCE_WXH_NEON_DOTPROD(32, 32, 10) 
+VARIANCE_WXH_NEON_DOTPROD(32, 64, 11) + +VARIANCE_WXH_NEON_DOTPROD(64, 16, 10) +VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) +VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) +VARIANCE_WXH_NEON_DOTPROD(64, 128, 13) + +VARIANCE_WXH_NEON_DOTPROD(128, 64, 13) +VARIANCE_WXH_NEON_DOTPROD(128, 128, 14) + +#undef VARIANCE_WXH_NEON_DOTPROD + +void aom_get_var_sse_sum_8x8_quad_neon_dotprod( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, + uint32_t *var8x8) { + // Loop over four 8x8 blocks. Process one 8x32 block. + for (int k = 0; k < 4; k++) { + variance_8xh_neon_dotprod(src + (k * 8), src_stride, ref + (k * 8), + ref_stride, 8, &sse8x8[k], &sum8x8[k]); + } + + *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3]; + *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3]; + for (int i = 0; i < 4; i++) { + var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6); + } +} + +void aom_get_var_sse_sum_16x16_dual_neon_dotprod( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, + uint32_t *var16x16) { + int sum16x16[2] = { 0 }; + // Loop over two 16x16 blocks. Process one 16x32 block. + for (int k = 0; k < 2; k++) { + variance_16xh_neon_dotprod(src + (k * 16), src_stride, ref + (k * 16), + ref_stride, 16, &sse16x16[k], &sum16x16[k]); + } + + *tot_sse += sse16x16[0] + sse16x16[1]; + *tot_sum += sum16x16[0] + sum16x16[1]; + for (int i = 0; i < 2; i++) { + var16x16[i] = + sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8); + } +} + +static INLINE unsigned int mse8xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, + unsigned int *sse, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride)); + uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride)); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sse = horizontal_add_u32x4(sse_u32); + return horizontal_add_u32x4(sse_u32); +} + +static INLINE unsigned int mse16xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, + unsigned int *sse, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src); + uint8x16_t s1 = vld1q_u8(src + src_stride); + uint8x16_t r0 = vld1q_u8(ref); + uint8x16_t r1 = vld1q_u8(ref + ref_stride); + + uint8x16_t abs_diff0 = vabdq_u8(s0, r0); + uint8x16_t abs_diff1 = vabdq_u8(s1, r1); + + sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0); + sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1])); + return horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +#define MSE_WXH_NEON_DOTPROD(w, h) \ + unsigned int aom_mse##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + return mse##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, sse, h); \ + } + +MSE_WXH_NEON_DOTPROD(8, 8) +MSE_WXH_NEON_DOTPROD(8, 16) + +MSE_WXH_NEON_DOTPROD(16, 8) +MSE_WXH_NEON_DOTPROD(16, 16) + +#undef MSE_WXH_NEON_DOTPROD diff --git 
a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c
new file mode 100644
index 0000000000..893f9c2f65
--- /dev/null
+++ b/third_party/aom/aom_dsp/avg.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+
+void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+                      int *min, int *max) {
+  int i, j;
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j] - d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+
+unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 4; ++i, s += p)
+    for (j = 0; j < 4; sum += s[j], ++j) {
+    }
+
+  return (sum + 8) >> 4;
+}
+
+unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 8; ++i, s += p)
+    for (j = 0; j < 8; sum += s[j], ++j) {
+    }
+
+  return (sum + 32) >> 6;
+}
+
+void aom_avg_8x8_quad_c(const uint8_t *s, int p, int x16_idx, int y16_idx,
+                        int *avg) {
+  for (int k = 0; k < 4; k++) {
+    const int x8_idx = x16_idx + ((k & 1) << 3);
+    const int y8_idx = y16_idx + ((k >> 1) << 3);
+    const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
+    avg[k] = aom_avg_8x8_c(s_tmp, p);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 8; ++i, s += p)
+    for (j = 0; j < 8; sum += s[j], ++j) {
+    }
+
+  return (sum + 32) >> 6;
+}
+
+unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 4; ++i, s += p)
+    for (j = 0; j < 4; sum += s[j], ++j) {
+    }
+
+  return (sum + 8) >> 4;
+}
+
+void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+                             int dp, int *min, int *max) {
+  int i, j;
+  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
+  *min = 65535;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j] - d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ?
diff : *max; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void hadamard_col4(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = (src_diff[0 * src_stride] + src_diff[1 * src_stride]) >> 1; + int16_t b1 = (src_diff[0 * src_stride] - src_diff[1 * src_stride]) >> 1; + int16_t b2 = (src_diff[2 * src_stride] + src_diff[3 * src_stride]) >> 1; + int16_t b3 = (src_diff[2 * src_stride] - src_diff[3 * src_stride]) >> 1; + + coeff[0] = b0 + b2; + coeff[1] = b1 + b3; + coeff[2] = b0 - b2; + coeff[3] = b1 - b3; +} + +void aom_hadamard_4x4_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[16]; + int16_t buffer2[16]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 4; ++idx) { + hadamard_col4(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] + tmp_buf += 4; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 4; ++idx) { + hadamard_col4(tmp_buf, 4, buffer2 + 4 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] + ++tmp_buf; + } + + // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_4x4_sse2). + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + coeff[i * 4 + j] = (tran_low_t)buffer2[j * 4 + i]; + } + } +} + +// src_diff: first pass, 9 bit, dynamic range [-255, 255] +// second pass, 12 bit, dynamic range [-2040, 2040] +static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int16_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] + ++tmp_buf; + } + + // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_8x8_sse2). 
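+  // i.e. coeff[i][j] = buffer2[j][i] with both viewed as 8x8 row-major
+  // arrays.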
+ for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + coeff[i * 8 + j] = (tran_low_t)buffer2[j * 8 + i]; + } + } +} + +void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t buffer[64]; + int16_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (int idx = 0; idx < 8; ++idx) { + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (int idx = 0; idx < 8; ++idx) { + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] + ++tmp_buf; + } + + for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx]; + + // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_lp_8x8_sse2). + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + coeff[i * 8 + j] = buffer2[j * 8 + i]; + } + } +} + +void aom_hadamard_lp_8x8_dual_c(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + for (int i = 0; i < 2; i++) { + aom_hadamard_lp_8x8_c(src_diff + (i * 8), src_stride, + (int16_t *)coeff + (i * 64)); + } +} + +// In place 16x16 2D Hadamard transform +void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 1; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } + + coeff -= 64; + // Extra shift to match AVX2 output (i.e., aom_hadamard_16x16_avx2). + // Note that to match SSE2 output, it does not need this step. 
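+  // The loop below swaps coefficient groups 4..7 and 8..11 within every run
+  // of 16 coefficients.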
+ for (int i = 0; i < 16; i++) { + for (int j = 0; j < 4; j++) { + tran_low_t temp = coeff[i * 16 + 4 + j]; + coeff[i * 16 + 4 + j] = coeff[i * 16 + 8 + j]; + coeff[i * 16 + 8 + j] = temp; + } + } +} + +void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + for (int idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + for (int idx = 0; idx < 64; ++idx) { + int16_t a0 = coeff[0]; + int16_t a1 = coeff[64]; + int16_t a2 = coeff[128]; + int16_t a3 = coeff[192]; + + int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + int16_t b3 = (a2 - a3) >> 1; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 16 bit, dynamic range [-32768, 32767] + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535] + tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383] + tran_low_t b3 = (a2 - a3) >> 2; + + coeff[0] = b0 + b2; // 16 bit, [-32768, 32767] + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void hadamard_highbd_col8_first_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// src_diff: 16 bit, dynamic range [-32760, 32760] +// coeff: 19 bit +static void hadamard_highbd_col8_second_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int32_t *coeff) { + int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * 
src_stride]; + int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int32_t c0 = b0 + b2; + int32_t c1 = b1 + b3; + int32_t c2 = b0 - b2; + int32_t c3 = b1 - b3; + int32_t c4 = b4 + b6; + int32_t c5 = b5 + b7; + int32_t c6 = b4 - b6; + int32_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. +void aom_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int32_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // src_diff: 13 bit + // buffer: 16 bit, dynamic range [-32760, 32760] + hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf); + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // buffer: 16 bit + // buffer2: 19 bit, dynamic range [-262080, 262080] + hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx); + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void aom_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 19 bit, dynamic range [-262080, 262080] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; + tran_low_t b1 = (a0 - a1) >> 1; + tran_low_t b2 = (a2 + a3) >> 1; + tran_low_t b3 = (a2 - a3) >> 1; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 20 bit + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; + tran_low_t b1 = (a0 - a1) >> 2; + tran_low_t b2 = (a2 + a3) >> 2; + tran_low_t b3 = (a2 - a3) >> 2; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// coeff: 20 bits, dynamic range [-524287, 524287]. +// length: value range {16, 32, 64, 128, 256, 512, 1024}. 
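+// The accumulated sum therefore needs at most 30 bits
+// (1024 * 524287 < 2^30), so the int accumulator below cannot overflow.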
+int aom_satd_c(const tran_low_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024] + return satd; +} + +int aom_satd_lp_c(const int16_t *coeff, int length) { + int satd = 0; + for (int i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + return satd; +} + +// Integer projection onto row vectors. +// height: value range {16, 32, 64, 128}. +void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, + const int width, const int height, int norm_factor) { + assert(height >= 2); + for (int idx = 0; idx < width; ++idx) { + hbuf[idx] = 0; + // hbuf[idx]: 14 bit, dynamic range [0, 32640]. + for (int i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; + // hbuf[idx]: 9 bit, dynamic range [0, 1020]. + hbuf[idx] >>= norm_factor; + ++ref; + } +} + +// width: value range {16, 32, 64, 128}. +void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride, + const int width, const int height, int norm_factor) { + for (int ht = 0; ht < height; ++ht) { + int16_t sum = 0; + // sum: 14 bit, dynamic range [0, 32640] + for (int idx = 0; idx < width; ++idx) sum += ref[idx]; + vbuf[ht] = sum >> norm_factor; + ref += ref_stride; + } +} + +// ref: [0 - 510] +// src: [0 - 510] +// bwl: {2, 3, 4, 5} +int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) { + int i; + int width = 4 << bwl; + int sse = 0, mean = 0, var; + + for (i = 0; i < width; ++i) { + int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. + mean += diff; // mean: dynamic range 16 bits. + sse += diff * diff; // sse: dynamic range 26 bits. + } + + // (mean * mean): dynamic range 31 bits. + // If width == 128, the mean can be 510 * 128 = 65280, and log2(65280 ** 2) ~= + // 31.99, so it needs to be casted to unsigned int to compute its square. + const unsigned int mean_abs = abs(mean); + var = sse - ((mean_abs * mean_abs) >> (bwl + 2)); + return var; +} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c new file mode 100644 index 0000000000..ee0ce62278 --- /dev/null +++ b/third_party/aom/aom_dsp/binary_codes_reader.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/binary_codes_reader.h" +#include "aom_dsp/recenter.h" + +uint16_t aom_read_primitive_quniform_(aom_reader *r, + uint16_t n ACCT_STR_PARAM) { + if (n <= 1) return 0; + const int l = get_msb(n) + 1; + const int m = (1 << l) - n; + const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME); + return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); +} + +// Decode finite subexponential code that for a symbol v in [0, n-1] with +// parameter k +uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, + uint16_t k ACCT_STR_PARAM) { + int i = 0; + int mk = 0; + + while (1) { + int b = (i ? 
k + i - 1 : k); + int a = (1 << b); + + if (n <= mk + 3 * a) { + return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; + } + + if (!aom_read_bit(r, ACCT_STR_NAME)) { + return aom_read_literal(r, b, ACCT_STR_NAME) + mk; + } + + i = i + 1; + mk += a; + } + + assert(0); + return 0; +} + +uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, + uint16_t ref ACCT_STR_PARAM) { + return inv_recenter_finite_nonneg( + n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); +} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h new file mode 100644 index 0000000000..d218f0619f --- /dev/null +++ b/third_party/aom/aom_dsp/binary_codes_reader.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_ +#define AOM_AOM_DSP_BINARY_CODES_READER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/bitreader.h" +#include "aom_dsp/bitreader_buffer.h" + +#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \ + aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \ + aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \ + aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME)) + +uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM); +uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, + uint16_t k ACCT_STR_PARAM); +uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, + uint16_t ref ACCT_STR_PARAM); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BINARY_CODES_READER_H_ diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c new file mode 100644 index 0000000000..55ce8429d7 --- /dev/null +++ b/third_party/aom/aom_dsp/binary_codes_writer.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_dsp/recenter.h" +#include "aom_ports/bitops.h" + +// Codes a symbol v in [-2^mag_bits, 2^mag_bits]. +// mag_bits is number of bits for magnitude. 
The alphabet is of size +// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to +// indicate 0 or non-zero, mag_bits bits are used to indicate magnitide +// and 1 more bit for the sign if non-zero. +void aom_write_primitive_symmetric(aom_writer *w, int16_t v, + unsigned int abs_bits) { + if (v == 0) { + aom_write_bit(w, 0); + } else { + const int x = abs(v); + const int s = v < 0; + aom_write_bit(w, 1); + aom_write_bit(w, s); + aom_write_literal(w, x - 1, abs_bits); + } +} + +int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) { + return (v == 0 ? 1 : abs_bits + 2); +} + +// Encodes a value v in [0, n-1] quasi-uniformly +void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { + if (n <= 1) return; + const int l = get_msb(n) + 1; + const int m = (1 << l) - n; + if (v < m) { + aom_write_literal(w, v, l - 1); + } else { + aom_write_literal(w, m + ((v - m) >> 1), l - 1); + aom_write_bit(w, (v - m) & 1); + } +} + +int aom_count_primitive_quniform(uint16_t n, uint16_t v) { + if (n <= 1) return 0; + const int l = get_msb(n) + 1; + const int m = (1 << l) - n; + return v < m ? l - 1 : l; +} + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t v) { + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + aom_write_primitive_quniform(w, n - mk, v - mk); + break; + } else { + int t = (v >= mk + a); + aom_write_bit(w, t); + if (t) { + i = i + 1; + mk += a; + } else { + aom_write_literal(w, v - mk, b); + break; + } + } + } +} + +int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { + int count = 0; + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + count += aom_count_primitive_quniform(n - mk, v - mk); + break; + } else { + int t = (v >= mk + a); + count++; + if (t) { + i = i + 1; + mk += a; + } else { + count += b; + break; + } + } + } + return count; +} + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +// based on a reference ref also in [0, n-1]. +// Recenters symbol around r first and then uses a finite subexponential code. +void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t ref, uint16_t v) { + aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v)); +} + +void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, + uint16_t k, int16_t ref, + int16_t v) { + ref += n - 1; + v += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v); +} + +int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, + uint16_t v) { + return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v)); +} + +int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, + int16_t v) { + ref += n - 1; + v += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v); +} diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h new file mode 100644 index 0000000000..5ec8662139 --- /dev/null +++ b/third_party/aom/aom_dsp/binary_codes_writer.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_ +#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/bitwriter_buffer.h" + +// Codes a symbol v in [-2^mag_bits, 2^mag_bits] +// mag_bits is number of bits for magnitude. The alphabet is of size +// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to +// indicate 0 or non-zero, mag_bits bits are used to indicate magnitide +// and 1 more bit for the sign if non-zero. +void aom_write_primitive_symmetric(aom_writer *w, int16_t v, + unsigned int mag_bits); + +// Encodes a value v in [0, n-1] quasi-uniformly +void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v); + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t v); + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +// based on a reference ref also in [0, n-1]. +void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t ref, uint16_t v); + +// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with +// parameter k based on a reference ref also in [-(n-1), n-1]. +void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, + uint16_t k, int16_t ref, + int16_t v); + +// Functions that counts bits for the above primitives +int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits); +int aom_count_primitive_quniform(uint16_t n, uint16_t v); +int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v); +int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, + uint16_t v); +int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, + int16_t v); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_ diff --git a/third_party/aom/aom_dsp/bitreader.c b/third_party/aom/aom_dsp/bitreader.c new file mode 100644 index 0000000000..4c70a91712 --- /dev/null +++ b/third_party/aom/aom_dsp/bitreader.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/bitreader.h" + +int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size) { + if (size && !buffer) { + return 1; + } + r->buffer_end = buffer + size; + r->buffer = buffer; + od_ec_dec_init(&r->ec, buffer, (uint32_t)size); +#if CONFIG_ACCOUNTING + r->accounting = NULL; +#endif + return 0; +} + +const uint8_t *aom_reader_find_begin(aom_reader *r) { return r->buffer; } + +const uint8_t *aom_reader_find_end(aom_reader *r) { return r->buffer_end; } + +uint32_t aom_reader_tell(const aom_reader *r) { return od_ec_dec_tell(&r->ec); } + +uint32_t aom_reader_tell_frac(const aom_reader *r) { + return od_ec_dec_tell_frac(&r->ec); +} + +int aom_reader_has_overflowed(const aom_reader *r) { + const uint32_t tell_bits = aom_reader_tell(r); + const uint32_t tell_bytes = (tell_bits + 7) >> 3; + return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer); +} diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h new file mode 100644 index 0000000000..29321f916e --- /dev/null +++ b/third_party/aom/aom_dsp/bitreader.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BITREADER_H_ +#define AOM_AOM_DSP_BITREADER_H_ + +#include +#include + +#include "config/aom_config.h" + +#include "aom/aomdx.h" +#include "aom/aom_integer.h" +#include "aom_dsp/entdec.h" +#include "aom_dsp/odintrin.h" +#include "aom_dsp/prob.h" + +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#define ACCT_STR_NAME acct_str +#define ACCT_STR_PARAM , const char *ACCT_STR_NAME +#define ACCT_STR_ARG(s) , s +#else +#define ACCT_STR_PARAM +#define ACCT_STR_ARG(s) +#endif + +#define aom_read(r, prob, ACCT_STR_NAME) \ + aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_bit(r, ACCT_STR_NAME) \ + aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \ + aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_literal(r, bits, ACCT_STR_NAME) \ + aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \ + aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \ + aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_reader { + const uint8_t *buffer; + const uint8_t *buffer_end; + od_ec_dec ec; +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif + uint8_t allow_update_cdf; +}; + +typedef struct aom_reader aom_reader; + +int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size); + +const uint8_t *aom_reader_find_begin(aom_reader *r); + +const uint8_t *aom_reader_find_end(aom_reader *r); + +// Returns true if the bit reader has tried to decode more data from the buffer +// than was actually provided. 
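+// The check is based on aom_reader_tell(): the number of whole bytes implied
+// by the current bit position is compared against the size of the buffer
+// given to aom_reader_init().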
+int aom_reader_has_overflowed(const aom_reader *r); + +// Returns the position in the bit reader in bits. +uint32_t aom_reader_tell(const aom_reader *r); + +// Returns the position in the bit reader in 1/8th bits. +uint32_t aom_reader_tell_frac(const aom_reader *r); + +#if CONFIG_ACCOUNTING +static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { + if (r->accounting != NULL) { + uint32_t tell_frac; + tell_frac = aom_reader_tell_frac(r); + aom_accounting_record(r->accounting, ACCT_STR_NAME, + tell_frac - r->accounting->last_tell_frac); + r->accounting->last_tell_frac = tell_frac; + } +} + +static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) { + if (r->accounting != NULL) { + r->accounting->syms.num_multi_syms += !is_binary; + r->accounting->syms.num_binary_syms += !!is_binary; + } +} +#endif + +static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { + int p = (0x7FFFFF - (prob << 15) + prob) >> 8; + int bit = od_ec_decode_bool_q15(&r->ec, p); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int ref_bit, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = aom_bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs); + if (ref_nsymbs != 2) { + fprintf(stderr, + "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs " + "%d queue_r %d\n", + frame_idx, 2, ref_nsymbs, queue_r); + assert(0); + } + if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) || + (ref_cdf[1] != 32767)) { + fprintf(stderr, + "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d", + frame_idx, p, 32767, ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + fprintf(stderr, "} queue_r %d\n", queue_r); + assert(0); + } + if (bit != ref_bit) { + fprintf(stderr, + "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d " + "queue_r %d\n", + frame_idx, bit, ref_bit, queue_r); + assert(0); + } + } +#endif + +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); + aom_update_symb_counts(r, 1); +#endif + return bit; +} + +static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { + int ret; + ret = aom_read(r, 128, NULL); // aom_prob_half +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return ret; +} + +static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { + int literal = 0, bit; + + for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit; +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return literal; +} + +static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int symb; + assert(cdf != NULL); + symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int cdf_error = 0; + int ref_symb, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = aom_bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs); + if (nsymbs != ref_nsymbs) { + fprintf(stderr, + "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d " + "queue_r %d\n", + frame_idx, nsymbs, ref_nsymbs, queue_r); + cdf_error = 0; + assert(0); + } else { + for (i = 0; i < nsymbs; ++i) + if (cdf[i] != ref_cdf[i]) cdf_error = 1; + } + if (cdf_error) { + fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", 
frame_idx, + cdf[0]); + for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]); + fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + fprintf(stderr, "} queue_r %d\n", queue_r); + assert(0); + } + if (symb != ref_symb) { + fprintf( + stderr, + "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n", + frame_idx, symb, ref_symb, queue_r); + assert(0); + } + } +#endif + +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); + aom_update_symb_counts(r, (nsymbs == 2)); +#endif + return symb; +} + +static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int ret; + ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); + if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs); + return ret; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BITREADER_H_ diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c new file mode 100644 index 0000000000..d79feea6a3 --- /dev/null +++ b/third_party/aom/aom_dsp/bitreader_buffer.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" + +#include "aom_dsp/bitreader_buffer.h" +#include "aom_dsp/recenter.h" +#include "aom_ports/bitops.h" + +size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) { + return (rb->bit_offset + 7) >> 3; +} + +int aom_rb_read_bit(struct aom_read_bit_buffer *rb) { + const uint32_t off = rb->bit_offset; + const uint32_t p = off >> 3; + const int q = 7 - (int)(off & 0x7); + if (rb->bit_buffer + p < rb->bit_buffer_end) { + const int bit = (rb->bit_buffer[p] >> q) & 1; + rb->bit_offset = off + 1; + return bit; + } else { + if (rb->error_handler) rb->error_handler(rb->error_handler_data); + return 0; + } +} + +int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) { + assert(bits <= 31); + int value = 0, bit; + for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit; + return value; +} + +uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, + int bits) { + assert(bits <= 32); + uint32_t value = 0; + int bit; + for (bit = bits - 1; bit >= 0; bit--) + value |= (uint32_t)aom_rb_read_bit(rb) << bit; + return value; +} + +int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) { + const int nbits = sizeof(unsigned) * 8 - bits - 1; + const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits; + return ((int)value) >> nbits; +} + +uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) { + int leading_zeros = 0; + while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros; + // Maximum 32 bits. 
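+  // A run of 32 leading zeros signals the maximum value. Otherwise the
+  // decoded value is ((1 << leading_zeros) - 1) plus leading_zeros further
+  // literal bits.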
+ if (leading_zeros == 32) return UINT32_MAX; + const uint32_t base = (1u << leading_zeros) - 1; + const uint32_t value = aom_rb_read_literal(rb, leading_zeros); + return base + value; +} + +static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb, + uint16_t n) { + if (n <= 1) return 0; + const int l = get_msb(n) + 1; + const int m = (1 << l) - n; + const int v = aom_rb_read_literal(rb, l - 1); + return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb); +} + +static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb, + uint16_t n, uint16_t k) { + int i = 0; + int mk = 0; + + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + + if (n <= mk + 3 * a) { + return aom_rb_read_primitive_quniform(rb, n - mk) + mk; + } + + if (!aom_rb_read_bit(rb)) { + return aom_rb_read_literal(rb, b) + mk; + } + + i = i + 1; + mk += a; + } + + assert(0); + return 0; +} + +static uint16_t aom_rb_read_primitive_refsubexpfin( + struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) { + return inv_recenter_finite_nonneg(n, ref, + aom_rb_read_primitive_subexpfin(rb, n, k)); +} + +int16_t aom_rb_read_signed_primitive_refsubexpfin( + struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) { + ref += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1; +} diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h new file mode 100644 index 0000000000..359fbe5194 --- /dev/null +++ b/third_party/aom/aom_dsp/bitreader_buffer.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_ +#define AOM_AOM_DSP_BITREADER_BUFFER_H_ + +#include + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*aom_rb_error_handler)(void *data); + +struct aom_read_bit_buffer { + const uint8_t *bit_buffer; + const uint8_t *bit_buffer_end; + uint32_t bit_offset; + + void *error_handler_data; + aom_rb_error_handler error_handler; +}; + +size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb); + +int aom_rb_read_bit(struct aom_read_bit_buffer *rb); + +int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits); + +uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits); + +int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits); + +uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb); + +int16_t aom_rb_read_signed_primitive_refsubexpfin( + struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BITREADER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter.c b/third_party/aom/aom_dsp/bitwriter.c new file mode 100644 index 0000000000..4c27bb1fc3 --- /dev/null +++ b/third_party/aom/aom_dsp/bitwriter.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "aom_dsp/bitwriter.h" + +void aom_start_encode(aom_writer *w, uint8_t *source) { + w->buffer = source; + w->pos = 0; + od_ec_enc_init(&w->ec, 62025); +} + +int aom_stop_encode(aom_writer *w) { + int nb_bits; + uint32_t bytes; + unsigned char *data; + data = od_ec_enc_done(&w->ec, &bytes); + if (!data) { + od_ec_enc_clear(&w->ec); + return -1; + } + nb_bits = od_ec_enc_tell(&w->ec); + memcpy(w->buffer, data, bytes); + w->pos = bytes; + od_ec_enc_clear(&w->ec); + return nb_bits; +} + +int aom_tell_size(aom_writer *w) { + const int nb_bits = od_ec_enc_tell(&w->ec); + return nb_bits; +} diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h new file mode 100644 index 0000000000..6aedd8ceb9 --- /dev/null +++ b/third_party/aom/aom_dsp/bitwriter.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BITWRITER_H_ +#define AOM_AOM_DSP_BITWRITER_H_ + +#include + +#include "config/aom_config.h" + +#include "aom_dsp/entenc.h" +#include "aom_dsp/prob.h" + +#if CONFIG_RD_DEBUG +#include "av1/common/blockd.h" +#include "av1/encoder/cost.h" +#endif + +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_writer { + unsigned int pos; + uint8_t *buffer; + od_ec_enc ec; + uint8_t allow_update_cdf; +}; + +typedef struct aom_writer aom_writer; + +typedef struct TOKEN_STATS { + int cost; +#if CONFIG_RD_DEBUG + int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE]; +#endif +} TOKEN_STATS; + +static INLINE void init_token_stats(TOKEN_STATS *token_stats) { +#if CONFIG_RD_DEBUG + int r, c; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + token_stats->txb_coeff_cost_map[r][c] = 0; + } + } +#endif + token_stats->cost = 0; +} + +void aom_start_encode(aom_writer *w, uint8_t *buffer); + +// Returns a negative number on error. Caller must check the return value and +// handle error. 
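+// On success the return value is the number of bits written (as reported by
+// od_ec_enc_tell()).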
+int aom_stop_encode(aom_writer *w); + +int aom_tell_size(aom_writer *w); + +static INLINE void aom_write(aom_writer *w, int bit, int probability) { + int p = (0x7FFFFF - (probability << 15) + probability) >> 8; +#if CONFIG_BITSTREAM_DEBUG + aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 }; + bitstream_queue_push(bit, cdf, 2); +#endif + + od_ec_encode_bool_q15(&w->ec, bit, p); +} + +static INLINE void aom_write_bit(aom_writer *w, int bit) { + aom_write(w, bit, 128); // aom_prob_half +} + +static INLINE void aom_write_literal(aom_writer *w, int data, int bits) { + int bit; + + for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit)); +} + +static INLINE void aom_write_cdf(aom_writer *w, int symb, + const aom_cdf_prob *cdf, int nsymbs) { +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_push(symb, cdf, nsymbs); +#endif + + od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs); +} + +static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, + int nsymbs) { + aom_write_cdf(w, symb, cdf, nsymbs); + if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BITWRITER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c new file mode 100644 index 0000000000..7d0ab9486a --- /dev/null +++ b/third_party/aom/aom_dsp/bitwriter_buffer.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/bitwriter_buffer.h" +#include "aom_dsp/recenter.h" +#include "aom_ports/bitops.h" + +int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) { + return (wb->bit_offset % CHAR_BIT == 0); +} + +uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) { + return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); +} + +void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) { + const int off = (int)wb->bit_offset; + const int p = off / CHAR_BIT; + const int q = CHAR_BIT - 1 - off % CHAR_BIT; + if (q == CHAR_BIT - 1) { + // Zero next char and write bit + wb->bit_buffer[p] = bit << q; + } else { + wb->bit_buffer[p] &= ~(1 << q); + wb->bit_buffer[p] |= bit << q; + } + wb->bit_offset = off + 1; +} + +void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) { + // Do not zero bytes but overwrite exisiting values + const int off = (int)wb->bit_offset; + const int p = off / CHAR_BIT; + const int q = CHAR_BIT - 1 - off % CHAR_BIT; + wb->bit_buffer[p] &= ~(1 << q); + wb->bit_buffer[p] |= bit << q; + wb->bit_offset = off + 1; +} + +void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) { + assert(bits <= 31); + int bit; + for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); +} + +void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, + uint32_t data, int bits) { + assert(bits <= 32); + int bit; + for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); +} + +void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, + int bits) { + int bit; + for (bit = bits - 1; bit >= 0; bit--) + aom_wb_overwrite_bit(wb, (data >> bit) & 1); +} + +void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, + int bits) { + aom_wb_write_literal(wb, data, bits + 1); +} + +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) { + int64_t shift_val = ++v; + int leading_zeroes = 1; + + assert(shift_val > 0); + + while (shift_val >>= 1) leading_zeroes += 2; + + aom_wb_write_literal(wb, 0, leading_zeroes >> 1); + aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1); +} + +static void wb_write_primitive_quniform(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t v) { + if (n <= 1) return; + const int l = get_msb(n) + 1; + const int m = (1 << l) - n; + if (v < m) { + aom_wb_write_literal(wb, v, l - 1); + } else { + aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); + aom_wb_write_bit(wb, (v - m) & 1); + } +} + +static void wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, uint16_t v) { + int i = 0; + int mk = 0; + while (1) { + int b = (i ? 
k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + wb_write_primitive_quniform(wb, n - mk, v - mk); + break; + } else { + int t = (v >= mk + a); + aom_wb_write_bit(wb, t); + if (t) { + i = i + 1; + mk += a; + } else { + aom_wb_write_literal(wb, v - mk, b); + break; + } + } + } +} + +static void wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + uint16_t ref, uint16_t v) { + wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v)); +} + +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v) { + ref += n - 1; + v += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v); +} diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h new file mode 100644 index 0000000000..fd10e01bb7 --- /dev/null +++ b/third_party/aom/aom_dsp/bitwriter_buffer.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_ +#define AOM_AOM_DSP_BITWRITER_BUFFER_H_ + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_write_bit_buffer { + uint8_t *bit_buffer; + uint32_t bit_offset; +}; + +int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb); + +uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb); + +void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits); + +void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, + uint32_t data, int bits); + +void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); + +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h new file mode 100644 index 0000000000..fd87dc1810 --- /dev/null +++ b/third_party/aom/aom_dsp/blend.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_BLEND_H_ +#define AOM_AOM_DSP_BLEND_H_ + +#include "aom_ports/mem.h" + +// Various blending functions and macros. +// See also the aom_blend_* functions in aom_dsp_rtcd.h + +// Alpha blending with alpha values from the range [0, 64], where 64 +// means use the first input and 0 means use the second input. + +#define AOM_BLEND_A64_ROUND_BITS 6 +#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64 + +#define AOM_BLEND_A64(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A64_ROUND_BITS) + +// Alpha blending with alpha values from the range [0, 256], where 256 +// means use the first input and 0 means use the second input. +#define AOM_BLEND_A256_ROUND_BITS 8 +#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256 + +#define AOM_BLEND_A256(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A256_ROUND_BITS) + +// Blending by averaging. +#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) + +#define DIFF_FACTOR_LOG2 4 +#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2) + +#endif // AOM_AOM_DSP_BLEND_H_ diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c new file mode 100644 index 0000000000..e9e38ef969 --- /dev/null +++ b/third_party/aom/aom_dsp/blend_a64_hmask.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "config/aom_dsp_rtcd.h" + +void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); + } + } +} +#endif diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c new file mode 100644 index 0000000000..35017fd737 --- /dev/null +++ b/third_party/aom/aom_dsp/blend_a64_mask.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/aom_dsp_common.h" + +#include "config/aom_dsp_rtcd.h" + +// Blending with alpha mask. Mask values come from the range [0, 64], +// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can +// be the same as dst, or dst can be different from both sources. + +// NOTE(rachelbarker): The input and output of aom_blend_a64_d16_mask_c() are +// in a higher intermediate precision, and will later be rounded down to pixel +// precision. +// Thus, in order to avoid double-rounding, we want to use normal right shifts +// within this function, not ROUND_POWER_OF_TWO. +// This works because of the identity: +// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z) +// +// In contrast, the output of the non-d16 functions will not be further rounded, +// so we *should* use ROUND_POWER_OF_TWO there. 
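+//
+// Roughly, each output pixel below is computed as
+//   res = (m * src0 + (AOM_BLEND_A64_MAX_ALPHA - m) * src1) >>
+//         AOM_BLEND_A64_ROUND_BITS;
+//   res -= round_offset;
+//   dst  = clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+// where m is the mask value for that pixel, averaged over two or four
+// neighbouring mask samples when the mask is subsampled (subw/subh).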
+ +void aom_lowbd_blend_a64_d16_mask_c( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + int i, j; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = mask[i * mask_stride + j]; + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_d16_mask_c( + uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, 
src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + // excerpt from clip_pixel_highbd() + // set saturation_value to (1 << bd) - 1 + unsigned int saturation_value; + switch (bd) { + case 8: + default: saturation_value = 255; break; + case 10: saturation_value = 1023; break; + case 12: saturation_value = 4095; break; + } + + if (subw == 0 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = mask[j]; + res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (subw == 1 && subh == 1) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = ROUND_POWER_OF_TWO( + mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] + + mask[mask_stride + 2 * j + 1], + 2); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (subw == 1 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Blending with alpha mask. Mask values come from the range [0, 64], +// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can +// be the same as dst, or dst can be different from both sources. 
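+// subw and subh select how the mask is resampled: with subw == 0 and
+// subh == 0 the mask is used directly; with subw == 1 and subh == 1 a 2x2
+// neighbourhood of mask values is averaged; with only one of them set, two
+// horizontally or vertically adjacent mask values are averaged.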
+ +void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h, int subw, int subh, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) 
* mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c new file mode 100644 index 0000000000..c938bb33af --- /dev/null +++ b/third_party/aom/aom_dsp/blend_a64_vmask.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "config/aom_dsp_rtcd.h" + +void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} +#endif diff --git a/third_party/aom/aom_dsp/blk_sse_sum.c b/third_party/aom/aom_dsp/blk_sse_sum.c new file mode 100644 index 0000000000..d76c3f87b9 --- /dev/null +++ b/third_party/aom/aom_dsp/blk_sse_sum.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
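All four subsampling branches above reduce to the same per-pixel operation. A minimal sketch of that operation, assuming the usual aom_dsp/blend.h convention (mask weights in [0, 64], AOM_BLEND_A64 a rounded 6-bit weighted average); this is an illustration, not part of the patch:

#include <stdint.h>

/* Illustrative restatement of the blend used above, assuming
 * AOM_BLEND_A64(m, a, b) == (m * a + (64 - m) * b + 32) >> 6. */
static uint8_t blend_a64_pixel(int m, uint8_t a, uint8_t b) {
  return (uint8_t)((m * a + (64 - m) * b + 32) >> 6);
}

/* Example: m == 64 returns a, m == 0 returns b, and
 * blend_a64_pixel(16, 200, 40) == 80 (a 25%/75% mix). */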
+ */ + +#include "config/aom_dsp_rtcd.h" + +void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + for (int i = 0; i < bh; ++i) { + for (int j = 0; j < bw; ++j) { + const int val = data[j]; + *x_sum += val; + *x2_sum += val * val; + } + data += stride; + } +} diff --git a/third_party/aom/aom_dsp/butteraugli.c b/third_party/aom/aom_dsp/butteraugli.c new file mode 100644 index 0000000000..8d2a29f7a3 --- /dev/null +++ b/third_party/aom/aom_dsp/butteraugli.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom_dsp/butteraugli.h" +#include "aom_mem/aom_mem.h" +#include "third_party/libyuv/include/libyuv/convert_argb.h" + +int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + aom_matrix_coefficients_t matrix_coefficients, + aom_color_range_t color_range, float *dist_map) { + (void)bit_depth; + assert(bit_depth == 8); + const int width = source->y_crop_width; + const int height = source->y_crop_height; + const int ss_x = source->subsampling_x; + const int ss_y = source->subsampling_y; + + const struct YuvConstants *yuv_constants; + if (matrix_coefficients == AOM_CICP_MC_BT_709) { + if (color_range == AOM_CR_FULL_RANGE) return 0; + yuv_constants = &kYuvH709Constants; + } else { + yuv_constants = color_range == AOM_CR_FULL_RANGE ? 
&kYuvJPEGConstants + : &kYuvI601Constants; + } + + const int stride_argb = width * 4; + const size_t buffer_size = (size_t)height * stride_argb; + uint8_t *src_argb = (uint8_t *)aom_malloc(buffer_size); + uint8_t *distorted_argb = (uint8_t *)aom_malloc(buffer_size); + if (!src_argb || !distorted_argb) { + aom_free(src_argb); + aom_free(distorted_argb); + return 0; + } + + if (ss_x == 1 && ss_y == 1) { + I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, + source->uv_stride, source->v_buffer, source->uv_stride, + src_argb, stride_argb, yuv_constants, width, height); + I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride, + distorted->u_buffer, distorted->uv_stride, + distorted->v_buffer, distorted->uv_stride, distorted_argb, + stride_argb, yuv_constants, width, height); + } else if (ss_x == 1 && ss_y == 0) { + I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, + source->uv_stride, source->v_buffer, source->uv_stride, + src_argb, stride_argb, yuv_constants, width, height); + I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride, + distorted->u_buffer, distorted->uv_stride, + distorted->v_buffer, distorted->uv_stride, distorted_argb, + stride_argb, yuv_constants, width, height); + } else if (ss_x == 0 && ss_y == 0) { + I444ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, + source->uv_stride, source->v_buffer, source->uv_stride, + src_argb, stride_argb, yuv_constants, width, height); + I444ToARGBMatrix(distorted->y_buffer, distorted->y_stride, + distorted->u_buffer, distorted->uv_stride, + distorted->v_buffer, distorted->uv_stride, distorted_argb, + stride_argb, yuv_constants, width, height); + } else { + aom_free(src_argb); + aom_free(distorted_argb); + return 0; + } + + JxlPixelFormat pixel_format = { 4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 }; + JxlButteraugliApi *api = JxlButteraugliApiCreate(NULL); + JxlButteraugliApiSetHFAsymmetry(api, 0.8f); + + JxlButteraugliResult *result = JxlButteraugliCompute( + api, width, height, &pixel_format, src_argb, buffer_size, &pixel_format, + distorted_argb, buffer_size); + + const float *distmap = NULL; + uint32_t row_stride; + JxlButteraugliResultGetDistmap(result, &distmap, &row_stride); + if (distmap == NULL) { + JxlButteraugliApiDestroy(api); + JxlButteraugliResultDestroy(result); + aom_free(src_argb); + aom_free(distorted_argb); + return 0; + } + + for (int j = 0; j < height; ++j) { + for (int i = 0; i < width; ++i) { + dist_map[j * width + i] = distmap[j * row_stride + i]; + } + } + + JxlButteraugliApiDestroy(api); + JxlButteraugliResultDestroy(result); + aom_free(src_argb); + aom_free(distorted_argb); + return 1; +} diff --git a/third_party/aom/aom_dsp/butteraugli.h b/third_party/aom/aom_dsp/butteraugli.h new file mode 100644 index 0000000000..5304092ccb --- /dev/null +++ b/third_party/aom/aom_dsp/butteraugli.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
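A hedged caller-side sketch of aom_calc_butteraugli() above: the caller owns the width x height float map and must treat a zero return as failure. The matrix-coefficient and color-range constants are assumed to be the usual aom_image.h enum values; this is not code from the patch.

#include <stdlib.h>
#include "aom_dsp/butteraugli.h"

static float *compute_dist_map(const YV12_BUFFER_CONFIG *src,
                               const YV12_BUFFER_CONFIG *dst) {
  const int w = src->y_crop_width;
  const int h = src->y_crop_height;
  float *dist_map = (float *)malloc(sizeof(*dist_map) * w * h);
  if (!dist_map) return NULL;
  /* 8-bit, BT.601, studio range: a combination the code above accepts. */
  if (!aom_calc_butteraugli(src, dst, 8, AOM_CICP_MC_BT_601,
                            AOM_CR_STUDIO_RANGE, dist_map)) {
    free(dist_map);  /* conversion or libjxl computation failed */
    return NULL;
  }
  return dist_map;
}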
+ */ + +#ifndef AOM_AOM_DSP_BUTTERAUGLI_H_ +#define AOM_AOM_DSP_BUTTERAUGLI_H_ + +#include "aom_scale/yv12config.h" + +// Returns a boolean that indicates success/failure. +int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + aom_matrix_coefficients_t matrix_coefficients, + aom_color_range_t color_range, float *dist_map); + +#endif // AOM_AOM_DSP_BUTTERAUGLI_H_ diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c new file mode 100644 index 0000000000..aad96c6fc6 --- /dev/null +++ b/third_party/aom/aom_dsp/entcode.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/entcode.h" + +/*Given the current total integer number of bits used and the current value of + rng, computes the fraction number of bits used to OD_BITRES precision. + This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac(). + nbits_total: The number of whole bits currently used, i.e., the value + returned by od_ec_enc_tell() or od_ec_dec_tell(). + rng: The current value of rng from either the encoder or decoder state. + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) { + uint32_t nbits; + int l; + int i; + /*To handle the non-integral number of bits still left in the encoder/decoder + state, we compute the worst-case number of bits of val that must be + encoded to ensure that the value is inside the range for any possible + subsequent bits. + The computation here is independent of val itself (the decoder does not + even track that value), even though the real number of bits used after + od_ec_enc_done() may be 1 smaller if rng is a power of two and the + corresponding trailing bits of val are all zeros. + If we did try to track that special case, then coding a value with a + probability of 1/(1 << n) might sometimes appear to use more than n bits. + This may help explain the surprising result that a newly initialized + encoder or decoder claims to have used 1 bit.*/ + nbits = nbits_total << OD_BITRES; + l = 0; + for (i = OD_BITRES; i-- > 0;) { + int b; + rng = rng * rng >> 15; + b = (int)(rng >> 16); + l = l << 1 | b; + rng >>= b; + } + return nbits - l; +} diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h new file mode 100644 index 0000000000..526ca598d3 --- /dev/null +++ b/third_party/aom/aom_dsp/entcode.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
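Since OD_BITRES is 3 (defined just below), the fractional "tell" values returned by od_ec_tell_frac() are in units of 1/8 bit, so rate comparisons can be made directly in that fixed-point domain or converted as in this small sketch (illustrative only):

#include <stdint.h>

static double tell_frac_to_bits(uint32_t tell_frac) {
  /* tell_frac is Q(OD_BITRES); with OD_BITRES == 3 this divides by 8. */
  return tell_frac / 8.0;
}

/* Example: a return value of 203 corresponds to 25.375 bits and, as
 * documented above, never under-reports the true cost. */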
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_ENTCODE_H_ +#define AOM_AOM_DSP_ENTCODE_H_ + +#include +#include +#include "aom_dsp/odintrin.h" +#include "aom_dsp/prob.h" + +#define EC_PROB_SHIFT 6 +#define EC_MIN_PROB 4 // must be <= (1< 1/8th bits.*/ +#define OD_BITRES (3) + +#define OD_ICDF AOM_ICDF + +/*See entcode.c for further documentation.*/ + +OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total, + uint32_t rng); + +#endif // AOM_AOM_DSP_ENTCODE_H_ diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c new file mode 100644 index 0000000000..5bbcddae08 --- /dev/null +++ b/third_party/aom/aom_dsp/entdec.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "aom_dsp/entdec.h" +#include "aom_dsp/prob.h" + +/*A range decoder. + This is an entropy decoder based upon \cite{Mar79}, which is itself a + rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}. + It is very similar to arithmetic encoding, except that encoding is done with + digits in any base, instead of with bits, and so it is faster when using + larger bases (i.e.: a byte). + The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$ + is the base, longer than the theoretical optimum, but to my knowledge there + is no published justification for this claim. + This only seems true when using near-infinite precision arithmetic so that + the process is carried out with no rounding errors. + + An excellent description of implementation details is available at + http://www.arturocampos.com/ac_range.html + A recent work \cite{MNW98} which proposes several changes to arithmetic + encoding for efficiency actually re-discovers many of the principles + behind range encoding, and presents a good theoretical analysis of them. + + End of stream is handled by writing out the smallest number of bits that + ensures that the stream will be correctly decoded regardless of the value of + any subsequent bits. + od_ec_dec_tell() can be used to determine how many bits were needed to decode + all the symbols thus far; other data can be packed in the remaining bits of + the input buffer. + @PHDTHESIS{Pas76, + author="Richard Clark Pasco", + title="Source coding algorithms for fast data compression", + school="Dept. of Electrical Engineering, Stanford University", + address="Stanford, CA", + month=May, + year=1976, + URL="http://www.richpasco.org/scaffdc.pdf" + } + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video & Data Recording Conference", + year=1979, + address="Southampton", + month=Jul, + URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. 
Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" + }*/ + +/*This is meant to be a large, positive constant that can still be efficiently + loaded as an immediate (on platforms like ARM, for example). + Even relatively modest values like 100 would work fine.*/ +#define OD_EC_LOTS_OF_BITS (0x4000) + +/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill + call.*/ +static void od_ec_dec_refill(od_ec_dec *dec) { + int s; + od_ec_window dif; + int16_t cnt; + const unsigned char *bptr; + const unsigned char *end; + dif = dec->dif; + cnt = dec->cnt; + bptr = dec->bptr; + end = dec->end; + s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15); + for (; s >= 0 && bptr < end; s -= 8, bptr++) { + /*Each time a byte is inserted into the window (dif), bptr advances and cnt + is incremented by 8, so the total number of consumed bits (the return + value of od_ec_dec_tell) does not change.*/ + assert(s <= OD_EC_WINDOW_SIZE - 8); + dif ^= (od_ec_window)bptr[0] << s; + cnt += 8; + } + if (bptr >= end) { + /*We've reached the end of the buffer. It is perfectly valid for us to need + to fill the window with additional bits past the end of the buffer (and + this happens in normal operation). These bits should all just be taken + as zero. But we cannot increment bptr past 'end' (this is undefined + behavior), so we start to increment dec->tell_offs. We also don't want + to keep testing bptr against 'end', so we set cnt to OD_EC_LOTS_OF_BITS + and adjust dec->tell_offs so that the total number of unconsumed bits in + the window (dec->cnt - dec->tell_offs) does not change. This effectively + puts lots of zero bits into the window, and means we won't try to refill + it from the buffer for a very long time (at which point we'll put lots + of zero bits into the window again).*/ + dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt; + cnt = OD_EC_LOTS_OF_BITS; + } + dec->dif = dif; + dec->cnt = cnt; + dec->bptr = bptr; +} + +/*Takes updated dif and range values, renormalizes them so that + 32768 <= rng < 65536 (reading more bytes from the stream into dif if + necessary), and stores them back in the decoder context. + dif: The new value of dif. + rng: The new value of the range. + ret: The value to return. + Return: ret. + This allows the compiler to jump to this function via a tail-call.*/ +static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, + int ret) { + int d; + assert(rng <= 65535U); + /*The number of leading zeros in the 16-bit binary representation of rng.*/ + d = 16 - OD_ILOG_NZ(rng); + /*d bits in dec->dif are consumed.*/ + dec->cnt -= d; + /*This is equivalent to shifting in 1's instead of 0's.*/ + dec->dif = ((dif + 1) << d) - 1; + dec->rng = rng << d; + if (dec->cnt < 0) od_ec_dec_refill(dec); + return ret; +} + +/*Initializes the decoder. + buf: The input buffer to use. + storage: The size in bytes of the input buffer.*/ +void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, + uint32_t storage) { + dec->buf = buf; + dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8); + dec->end = buf + storage; + dec->bptr = buf; + dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1; + dec->rng = 0x8000; + dec->cnt = -15; + od_ec_dec_refill(dec); +} + +/*Decode a single binary value. + f: The probability that the bit is one, scaled by 32768. 
+ Return: The value decoded (0 or 1).*/ +int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { + od_ec_window dif; + od_ec_window vw; + unsigned r; + unsigned r_new; + unsigned v; + int ret; + assert(0 < f); + assert(f < 32768U); + dif = dec->dif; + r = dec->rng; + assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + assert(32768U <= r); + v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); + v += EC_MIN_PROB; + vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + ret = 1; + r_new = v; + if (dif >= vw) { + r_new = r - v; + dif -= vw; + ret = 0; + } + return od_ec_dec_normalize(dec, dif, r_new, ret); +} + +/*Decodes a symbol given an inverse cumulative distribution function (CDF) + table in Q15. + icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range + [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]). + The values must be monotonically non-increasing, and icdf[nsyms - 1] + must be 0. + nsyms: The number of symbols in the alphabet. + This should be at most 16. + Return: The decoded symbol s.*/ +int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) { + od_ec_window dif; + unsigned r; + unsigned c; + unsigned u; + unsigned v; + int ret; + (void)nsyms; + dif = dec->dif; + r = dec->rng; + const int N = nsyms - 1; + + assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); + assert(32768U <= r); + assert(7 - EC_PROB_SHIFT >= 0); + c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); + v = r; + ret = -1; + do { + u = v; + v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT)); + v += EC_MIN_PROB * (N - ret); + } while (c < v); + assert(v < u); + assert(u <= r); + r = u - v; + dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + return od_ec_dec_normalize(dec, dif, r, ret); +} + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_dec_tell(const od_ec_dec *dec) { + /*There is a window of bits stored in dec->dif. The difference + (dec->bptr - dec->buf) tells us how many bytes have been read into this + window. The difference (dec->cnt - dec->tell_offs) tells us how many of + the bits in that window remain unconsumed.*/ + return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs); +} + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) { + return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng); +} diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h new file mode 100644 index 0000000000..c746167775 --- /dev/null +++ b/third_party/aom/aom_dsp/entdec.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
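The decoder API above is intentionally small: initialize over a byte buffer, pull symbols, and query the running cost. A minimal usage sketch, assuming an aom build environment so that the header below resolves:

#include <stdio.h>
#include "aom_dsp/entdec.h"

static void decode_three_flags(const unsigned char *buf, uint32_t size) {
  od_ec_dec dec;
  od_ec_dec_init(&dec, buf, size);
  for (int i = 0; i < 3; ++i) {
    /* 16384/32768 == 0.5, i.e. each flag is treated as equiprobable here. */
    const int flag = od_ec_decode_bool_q15(&dec, 16384);
    printf("flag %d = %d\n", i, flag);
  }
  /* Slightly over-counts, as documented for od_ec_dec_tell() above. */
  printf("bits used so far: %d\n", od_ec_dec_tell(&dec));
}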
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_ENTDEC_H_ +#define AOM_AOM_DSP_ENTDEC_H_ +#include +#include "aom_dsp/entcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct od_ec_dec od_ec_dec; + +#if defined(OD_ACCOUNTING) && OD_ACCOUNTING +#define OD_ACC_STR , char *acc_str +#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str) +#else +#define OD_ACC_STR +#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb) +#endif + +/*The entropy decoder context.*/ +struct od_ec_dec { + /*The start of the current input buffer.*/ + const unsigned char *buf; + /*An offset used to keep track of tell after reaching the end of the stream. + This is constant throughout most of the decoding process, but becomes + important once we hit the end of the buffer and stop incrementing bptr + (and instead pretend cnt has lots of bits).*/ + int32_t tell_offs; + /*The end of the current input buffer.*/ + const unsigned char *end; + /*The read pointer for the entropy-coded bits.*/ + const unsigned char *bptr; + /*The difference between the high end of the current range, (low + rng), and + the coded value, minus 1. + This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the + decoder only uses the top 16 bits of the window to decode the next symbol. + As we shift up during renormalization, if we don't have enough bits left in + the window to fill the top 16, we'll read in more bits of the coded + value.*/ + od_ec_window dif; + /*The number of values in the current range.*/ + uint16_t rng; + /*The number of bits of data in the current value.*/ + int16_t cnt; +}; + +/*See entdec.c for further documentation.*/ + +void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec, + const uint16_t *cdf, int nsyms) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) + OD_ARG_NONNULL(1); + +OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) + OD_ARG_NONNULL(1); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_ENTDEC_H_ diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c new file mode 100644 index 0000000000..591e0ad214 --- /dev/null +++ b/third_party/aom/aom_dsp/entenc.c @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include +#include "aom_dsp/entenc.h" +#include "aom_dsp/prob.h" + +#if OD_MEASURE_EC_OVERHEAD +#if !defined(M_LOG2E) +#define M_LOG2E (1.4426950408889634073599246810019) +#endif +#define OD_LOG2(x) (M_LOG2E * log(x)) +#endif // OD_MEASURE_EC_OVERHEAD + +/*A range encoder. + See entdec.c and the references for implementation details \cite{Mar79,MNW98}. + + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video \& Data Recording Conference", + year=1979, + address="Southampton", + month=Jul, + URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" + }*/ + +/*Takes updated low and range values, renormalizes them so that + 32768 <= rng < 65536 (flushing bytes from low to the output buffer if + necessary), and stores them back in the encoder context. + low: The new value of low. + rng: The new value of the range.*/ +static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_enc_window low, + unsigned rng) { + int d; + int c; + int s; + if (enc->error) return; + c = enc->cnt; + assert(rng <= 65535U); + /*The number of leading zeros in the 16-bit binary representation of rng.*/ + d = 16 - OD_ILOG_NZ(rng); + s = c + d; + + /* We flush every time "low" cannot safely and efficiently accommodate any + more data. Overall, c must not exceed 63 at the time of byte flush out. To + facilitate this, "s" cannot exceed 56-bits because we have to keep 1 byte + for carry. Also, we need to subtract 16 because we want to keep room for + the next symbol worth "d"-bits (max 15). An alternate condition would be if + (e < d), where e = number of leading zeros in "low", indicating there is + not enough rooom to accommodate "rng" worth of "d"-bits in "low". However, + this approach needs additional computations: (i) compute "e", (ii) push + the leading 0x00's as a special case. + */ + if (s >= 40) { // 56 - 16 + unsigned char *out = enc->buf; + uint32_t storage = enc->storage; + uint32_t offs = enc->offs; + if (offs + 8 > storage) { + storage = 2 * storage + 8; + out = (unsigned char *)realloc(out, sizeof(*out) * storage); + if (out == NULL) { + enc->error = -1; + return; + } + enc->buf = out; + enc->storage = storage; + } + // Need to add 1 byte here since enc->cnt always counts 1 byte less + // (enc->cnt = -9) to ensure correct operation + uint8_t num_bytes_ready = (s >> 3) + 1; + + // Update "c" to contain the number of non-ready bits in "low". Since "low" + // has 64-bit capacity, we need to add the (64 - 40) cushion bits and take + // off the number of ready bits. 
+ c += 24 - (num_bytes_ready << 3); + + // Prepare "output" and update "low" + uint64_t output = low >> c; + low = low & (((uint64_t)1 << c) - 1); + + // Prepare data and carry mask + uint64_t mask = (uint64_t)1 << (num_bytes_ready << 3); + uint64_t carry = output & mask; + + mask = mask - 0x01; + output = output & mask; + + // Write data in a single operation + write_enc_data_to_out_buf(out, offs, output, carry, &enc->offs, + num_bytes_ready); + + // Update state of the encoder: enc->cnt to contain the number of residual + // bits + s = c + d - 24; + } + enc->low = low << d; + enc->rng = rng << d; + enc->cnt = s; +} + +/*Initializes the encoder. + size: The initial size of the buffer, in bytes.*/ +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) { + od_ec_enc_reset(enc); + enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size); + enc->storage = size; + if (size > 0 && enc->buf == NULL) { + enc->storage = 0; + enc->error = -1; + } +} + +/*Reinitializes the encoder.*/ +void od_ec_enc_reset(od_ec_enc *enc) { + enc->offs = 0; + enc->low = 0; + enc->rng = 0x8000; + /*This is initialized to -9 so that it crosses zero after we've accumulated + one byte + one carry bit.*/ + enc->cnt = -9; + enc->error = 0; +#if OD_MEASURE_EC_OVERHEAD + enc->entropy = 0; + enc->nb_symbols = 0; +#endif +} + +/*Frees the buffers used by the encoder.*/ +void od_ec_enc_clear(od_ec_enc *enc) { free(enc->buf); } + +/*Encodes a symbol given its frequency in Q15. + fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come + before the one to be encoded. + fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and + including the one to be encoded.*/ +static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s, + int nsyms) { + od_ec_enc_window l; + unsigned r; + unsigned u; + unsigned v; + l = enc->low; + r = enc->rng; + assert(32768U <= r); + assert(fh <= fl); + assert(fl <= 32768U); + assert(7 - EC_PROB_SHIFT >= 0); + const int N = nsyms - 1; + if (fl < CDF_PROB_TOP) { + u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + + EC_MIN_PROB * (N - (s - 1)); + v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + + EC_MIN_PROB * (N - (s + 0)); + l += r - u; + r = u - v; + } else { + r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + + EC_MIN_PROB * (N - (s + 0)); + } + od_ec_enc_normalize(enc, l, r); +#if OD_MEASURE_EC_OVERHEAD + enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP.); + enc->nb_symbols++; +#endif +} + +/*Encode a single binary value. + val: The value to encode (0 or 1). + f: The probability that the val is one, scaled by 32768.*/ +void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { + od_ec_enc_window l; + unsigned r; + unsigned v; + assert(0 < f); + assert(f < 32768U); + l = enc->low; + r = enc->rng; + assert(32768U <= r); + v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); + v += EC_MIN_PROB; + if (val) l += r - v; + r = val ? v : r - v; + od_ec_enc_normalize(enc, l, r); +#if OD_MEASURE_EC_OVERHEAD + enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.); + enc->nb_symbols++; +#endif +} + +/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15. + s: The index of the symbol to encode. + icdf: 32768 minus the CDF, such that symbol s falls in the range + [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). + The values must be monotonically decreasing, and icdf[nsyms - 1] must + be 0. 
+ nsyms: The number of symbols in the alphabet. + This should be at most 16.*/ +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf, + int nsyms) { + (void)nsyms; + assert(s >= 0); + assert(s < nsyms); + assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); + od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms); +} + +/*Overwrites a few bits at the very start of an existing stream, after they + have already been encoded. + This makes it possible to have a few flags up front, where it is easy for + decoders to access them without parsing the whole stream, even if their + values are not determined until late in the encoding process, without having + to buffer all the intermediate symbols in the encoder. + In order for this to work, at least nbits bits must have already been encoded + using probabilities that are an exact power of two. + The encoder can verify the number of encoded bits is sufficient, but cannot + check this latter condition. + val: The bits to encode (in the least nbits significant bits). + They will be decoded in order from most-significant to least. + nbits: The number of bits to overwrite. + This must be no more than 8.*/ +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) { + int shift; + unsigned mask; + assert(nbits >= 0); + assert(nbits <= 8); + assert(val < 1U << nbits); + shift = 8 - nbits; + mask = ((1U << nbits) - 1) << shift; + if (enc->offs > 0) { + /*The first byte has been finalized.*/ + enc->buf[0] = (unsigned char)((enc->buf[0] & ~mask) | val << shift); + } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) { + /*The first byte has yet to be output.*/ + enc->low = (enc->low & ~((od_ec_enc_window)mask << (16 + enc->cnt))) | + (od_ec_enc_window)val << (16 + enc->cnt + shift); + } else { + /*The encoder hasn't even encoded _nbits of data yet.*/ + enc->error = -1; + } +} + +#if OD_MEASURE_EC_OVERHEAD +#include +#endif + +/*Indicates that there are no more symbols to encode. + All remaining output bytes are flushed to the output buffer. + od_ec_enc_reset() should be called before using the encoder again. + bytes: Returns the size of the encoded data in the returned buffer. + Return: A pointer to the start of the final buffer, or NULL if there was an + encoding error.*/ +unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { + unsigned char *out; + uint32_t storage; + uint32_t offs; + od_ec_enc_window m; + od_ec_enc_window e; + od_ec_enc_window l; + int c; + int s; + if (enc->error) return NULL; +#if OD_MEASURE_EC_OVERHEAD + { + uint32_t tell; + /* Don't count the 1 bit we lose to raw bits as overhead. 
*/ + tell = od_ec_enc_tell(enc) - 1; + fprintf(stderr, "overhead: %f%%\n", + 100 * (tell - enc->entropy) / enc->entropy); + fprintf(stderr, "efficiency: %f bits/symbol\n", + (double)tell / enc->nb_symbols); + } +#endif + + l = enc->low; + c = enc->cnt; + s = 10; + m = 0x3FFF; + e = ((l + m) & ~m) | (m + 1); + s += c; + offs = enc->offs; + + /*Make sure there's enough room for the entropy-coded bits.*/ + out = enc->buf; + storage = enc->storage; + const int s_bits = (s + 7) >> 3; + int b = OD_MAXI(s_bits, 0); + if (offs + b > storage) { + storage = offs + b; + out = (unsigned char *)realloc(out, sizeof(*out) * storage); + if (out == NULL) { + enc->error = -1; + return NULL; + } + enc->buf = out; + enc->storage = storage; + } + + /*We output the minimum number of bits that ensures that the symbols encoded + thus far will be decoded correctly regardless of the bits that follow.*/ + if (s > 0) { + uint64_t n; + n = ((uint64_t)1 << (c + 16)) - 1; + do { + assert(offs < storage); + uint16_t val = (uint16_t)(e >> (c + 16)); + out[offs] = (unsigned char)(val & 0x00FF); + if (val & 0x0100) { + assert(offs > 0); + propagate_carry_bwd(out, offs - 1); + } + offs++; + + e &= n; + s -= 8; + c -= 8; + n >>= 8; + } while (s > 0); + } + *nbytes = offs; + + return out; +} + +/*Returns the number of bits "used" by the encoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Warning: The value returned by this function can decrease compared to an + earlier call, even after encoding more data, if there is an encoding error + (i.e., a failure to allocate enough space for the output buffer). + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_enc_tell(const od_ec_enc *enc) { + /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra + bit, which we reserve for terminating the stream.*/ + return (enc->cnt + 10) + enc->offs * 8; +} + +/*Returns the number of bits "used" by the encoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Warning: The value returned by this function can decrease compared to an + earlier call, even after encoding more data, if there is an encoding error + (i.e., a failure to allocate enough space for the output buffer). + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) { + return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng); +} diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h new file mode 100644 index 0000000000..1a38affb4f --- /dev/null +++ b/third_party/aom/aom_dsp/entenc.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
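The encoder side mirrors this: buffer symbols, then flush the minimal terminating bits with od_ec_enc_done(). A hedged usage sketch (again assuming an aom build environment); note that the returned pointer aliases the encoder's internal buffer, which od_ec_enc_clear() frees:

#include <stdio.h>
#include "aom_dsp/entenc.h"

static void encode_three_flags(void) {
  od_ec_enc enc;
  od_ec_enc_init(&enc, 64);  /* small initial buffer; it grows on demand */
  od_ec_encode_bool_q15(&enc, 1, 16384);
  od_ec_encode_bool_q15(&enc, 0, 16384);
  od_ec_encode_bool_q15(&enc, 1, 16384);
  uint32_t nbytes;
  const unsigned char *out = od_ec_enc_done(&enc, &nbytes);
  if (out != NULL) printf("coded %u byte(s)\n", nbytes);
  /* Copy the bytes out if they are still needed after this point. */
  od_ec_enc_clear(&enc);
}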
+ */ + +#ifndef AOM_AOM_DSP_ENTENC_H_ +#define AOM_AOM_DSP_ENTENC_H_ +#include +#include "aom_dsp/entcode.h" +#include "aom_util/endian_inl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint64_t od_ec_enc_window; + +typedef struct od_ec_enc od_ec_enc; + +#define OD_MEASURE_EC_OVERHEAD (0) + +/*The entropy encoder context.*/ +struct od_ec_enc { + /*Buffered output. + This contains only the raw bits until the final call to od_ec_enc_done(), + where all the arithmetic-coded data gets prepended to it.*/ + unsigned char *buf; + /*The size of the buffer.*/ + uint32_t storage; + /*The offset at which the next entropy-coded byte will be written.*/ + uint32_t offs; + /*The low end of the current range.*/ + od_ec_enc_window low; + /*The number of values in the current range.*/ + uint16_t rng; + /*The number of bits of data in the current value.*/ + int16_t cnt; + /*Nonzero if an error occurred.*/ + int error; +#if OD_MEASURE_EC_OVERHEAD + double entropy; + int nb_symbols; +#endif +}; + +/*See entenc.c for further documentation.*/ + +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1); +void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1); +void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1); + +void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15) + OD_ARG_NONNULL(1); +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(3); + +void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) + OD_ARG_NONNULL(1); + +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc, + uint32_t *nbytes) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) + OD_ARG_NONNULL(1); + +// buf is the frame bitbuffer, offs is where carry to be added +static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) { + uint16_t sum, carry = 1; + do { + sum = (uint16_t)buf[offs] + 1; + buf[offs--] = (unsigned char)sum; + carry = sum >> 8; + } while (carry); +} + +// Convert to big-endian byte order and write data to buffer adding the +// carry-bit +static AOM_INLINE void write_enc_data_to_out_buf(unsigned char *out, + uint32_t offs, uint64_t output, + uint64_t carry, + uint32_t *enc_offs, + uint8_t num_bytes_ready) { + const uint64_t reg = HToBE64(output << ((8 - num_bytes_ready) << 3)); + memcpy(&out[offs], ®, 8); + // Propagate carry backwards if exists + if (carry) { + assert(offs > 0); + propagate_carry_bwd(out, offs - 1); + } + *enc_offs = offs + num_bytes_ready; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_ENTENC_H_ diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c new file mode 100644 index 0000000000..0ef0590e89 --- /dev/null +++ b/third_party/aom/aom_dsp/fastssim.c @@ -0,0 +1,488 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
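propagate_carry_bwd() above is what keeps the already-flushed bytes correct when a later chunk carries out of its top bit. A small standalone sketch of the same ripple (illustrative only, not aom code):

#include <stdio.h>

int main(void) {
  unsigned char buf[3] = { 0x12, 0xFF, 0xFF };
  unsigned offs = 2;
  unsigned carry = 1;
  while (carry) {  /* the same backward ripple as propagate_carry_bwd() */
    const unsigned sum = buf[offs] + 1;
    buf[offs--] = (unsigned char)sum;
    carry = sum >> 8;
  }
  printf("%02X %02X %02X\n", buf[0], buf[1], buf[2]);  /* prints 13 00 00 */
  return 0;
}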
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Nathan E. Egge, at the Daala + * project. + */ +#include +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/ssim.h" + +typedef struct fs_level fs_level; +typedef struct fs_ctx fs_ctx; + +#define SSIM_C1 (255 * 255 * 0.01 * 0.01) +#define SSIM_C2 (255 * 255 * 0.03 * 0.03) +#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01) +#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) +#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) +#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) +#define MAX_SSIM_DB 100.0 + +#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) +#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b)) + +struct fs_level { + uint32_t *im1; + uint32_t *im2; + double *ssim; + int w; + int h; +}; + +struct fs_ctx { + fs_level *level; + int nlevels; + unsigned *col_buf; +}; + +static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { + unsigned char *data; + size_t data_size; + int lw; + int lh; + int l; + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + data_size = + _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size += im_size; + level_size *= sizeof(*_ctx->level[l].ssim); + data_size += level_size; + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + data = (unsigned char *)malloc(data_size); + if (!data) return -1; + _ctx->level = (fs_level *)data; + _ctx->nlevels = _nlevels; + data += _nlevels * sizeof(*_ctx->level); + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + _ctx->level[l].w = lw; + _ctx->level[l].h = lh; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size *= sizeof(*_ctx->level[l].ssim); + _ctx->level[l].im1 = (uint32_t *)data; + _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; + data += level_size; + _ctx->level[l].ssim = (double *)data; + data += im_size * sizeof(*_ctx->level[l].ssim); + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + _ctx->col_buf = (unsigned *)data; + return 0; +} + +static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } + +static void fs_downsample_level(fs_ctx *_ctx, int _l) { + const uint32_t *src1; + const uint32_t *src2; + uint32_t *dst1; + uint32_t *dst2; + int w2; + int h2; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + dst1 = _ctx->level[_l].im1; + dst2 = _ctx->level[_l].im2; + w2 = _ctx->level[_l - 1].w; + h2 = _ctx->level[_l - 1].h; + src1 = _ctx->level[_l - 1].im1; + src2 = _ctx->level[_l - 1].im2; + for (j = 0; j < h; j++) { + int j0offs; + int j1offs; + j0offs = 2 * j * w2; + j1offs = FS_MINI(2 * j + 1, h2) * w2; + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, w2); + dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]; + dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]; + } + } +} + +static void 
fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, + int _s1ystride, const uint8_t *_src2, + int _s2ystride, int _w, int _h, uint32_t shift, + int buf_is_hbd) { + uint32_t *dst1; + uint32_t *dst2; + int w; + int h; + int i; + int j; + w = _ctx->level[0].w; + h = _ctx->level[0].h; + dst1 = _ctx->level[0].im1; + dst2 = _ctx->level[0].im2; + for (j = 0; j < h; j++) { + int j0; + int j1; + j0 = 2 * j; + j1 = FS_MINI(j0 + 1, _h); + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, _w); + if (!buf_is_hbd) { + dst1[j * w + i] = + _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; + dst2[j * w + i] = + _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + + _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; + } else { + uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); + uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2); + dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + + (src1s[j0 * _s1ystride + i1] >> shift) + + (src1s[j1 * _s1ystride + i0] >> shift) + + (src1s[j1 * _s1ystride + i1] >> shift); + dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + + (src2s[j0 * _s2ystride + i1] >> shift) + + (src2s[j1 * _s2ystride + i0] >> shift) + + (src2s[j1 * _s2ystride + i1] >> shift); + } + } + } +} + +static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { + unsigned *col_sums_x; + unsigned *col_sums_y; + uint32_t *im1; + uint32_t *im2; + double *ssim; + double c1; + int w; + int h; + int j0offs; + int j1offs; + int i; + int j; + double ssim_c1 = SSIM_C1; + + if (bit_depth == 10) ssim_c1 = SSIM_C1_10; + if (bit_depth == 12) ssim_c1 = SSIM_C1_12; + + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + col_sums_x = _ctx->col_buf; + col_sums_y = col_sums_x + w; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; + for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; + for (j = 1; j < 4; j++) { + j1offs = FS_MINI(j, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + ssim = _ctx->level[_l].ssim; + c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); + for (j = 0; j < h; j++) { + unsigned mux; + unsigned muy; + int i0; + int i1; + mux = 5 * col_sums_x[0]; + muy = 5 * col_sums_y[0]; + for (i = 1; i < 4; i++) { + i1 = FS_MINI(i, w - 1); + mux += col_sums_x[i1]; + muy += col_sums_y[i1]; + } + for (i = 0; i < w; i++) { + ssim[j * w + i] *= (2 * mux * (double)muy + c1) / + (mux * (double)mux + muy * (double)muy + c1); + if (i + 1 < w) { + i0 = FS_MAXI(0, i - 4); + i1 = FS_MINI(i + 4, w - 1); + mux += col_sums_x[i1] - col_sums_x[i0]; + muy += col_sums_x[i1] - col_sums_x[i0]; + } + } + if (j + 1 < h) { + j0offs = FS_MAXI(0, j - 4) * w; + for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; + j1offs = FS_MINI(j + 4, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + } +} + +#define FS_COL_SET(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] = gx * (double)gx; \ + col_sums_gy2[(_col)] = gy * (double)gy; \ + col_sums_gxgy[(_col)] = gx * (double)gy; \ + } while (0) + +#define FS_COL_ADD(_col, _joffs, _ioffs) \ + do { \ + unsigned 
gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] += gx * (double)gx; \ + col_sums_gy2[(_col)] += gy * (double)gy; \ + col_sums_gxgy[(_col)] += gx * (double)gy; \ + } while (0) + +#define FS_COL_SUB(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] -= gx * (double)gx; \ + col_sums_gy2[(_col)] -= gy * (double)gy; \ + col_sums_gxgy[(_col)] -= gx * (double)gy; \ + } while (0) + +#define FS_COL_COPY(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ + } while (0) + +#define FS_COL_HALVE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ + } while (0) + +#define FS_COL_DOUBLE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ + } while (0) + +static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { + uint32_t *im1; + uint32_t *im2; + unsigned *gx_buf; + unsigned *gy_buf; + double *ssim; + double col_sums_gx2[8]; + double col_sums_gy2[8]; + double col_sums_gxgy[8]; + double c2; + int stride; + int w; + int h; + int i; + int j; + double ssim_c2 = SSIM_C2; + if (bit_depth == 10) ssim_c2 = SSIM_C2_10; + if (bit_depth == 12) ssim_c2 = SSIM_C2_12; + + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + ssim = _ctx->level[_l].ssim; + gx_buf = _ctx->col_buf; + stride = w + 8; + gy_buf = gx_buf + 8 * stride; + memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf)); + c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104; + for (j = 0; j < h + 4; j++) { + if (j < h - 1) { + for (i = 0; i < w - 1; i++) { + unsigned g1; + unsigned g2; + unsigned gx; + unsigned gy; + g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); + g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); + gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); + g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); + gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + gx_buf[(j & 7) * stride + i + 4] = gx; + gy_buf[(j & 7) * stride + i + 4] = gy; + } + } else { + memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); + memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf)); + } + if (j >= 4) { + int k; + col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0; + col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0; + col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] = + col_sums_gxgy[0] = 0; + for (i = 4; i < 8; i++) { + FS_COL_SET(i, -1, 0); + FS_COL_ADD(i, 0, 0); + for (k = 1; k < 8 - i; k++) { + FS_COL_DOUBLE(i, i); + FS_COL_ADD(i, -k - 1, 0); + FS_COL_ADD(i, k, 0); + } + } + for (i = 0; i < w; i++) { + double mugx2; + double mugy2; + double mugxgy; + mugx2 = col_sums_gx2[0]; + for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; + mugy2 = col_sums_gy2[0]; + for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; + mugxgy = col_sums_gxgy[0]; + for (k = 1; k < 8; k++) 
mugxgy += col_sums_gxgy[k]; + ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); + if (i + 1 < w) { + FS_COL_SET(0, -1, 1); + FS_COL_ADD(0, 0, 1); + FS_COL_SUB(2, -3, 2); + FS_COL_SUB(2, 2, 2); + FS_COL_HALVE(1, 2); + FS_COL_SUB(3, -4, 3); + FS_COL_SUB(3, 3, 3); + FS_COL_HALVE(2, 3); + FS_COL_COPY(3, 4); + FS_COL_DOUBLE(4, 5); + FS_COL_ADD(4, -4, 5); + FS_COL_ADD(4, 3, 5); + FS_COL_DOUBLE(5, 6); + FS_COL_ADD(5, -3, 6); + FS_COL_ADD(5, 2, 6); + FS_COL_DOUBLE(6, 7); + FS_COL_ADD(6, -2, 7); + FS_COL_ADD(6, 1, 7); + FS_COL_SET(7, -1, 8); + FS_COL_ADD(7, 0, 8); + } + } + } + } +} + +#define FS_NLEVELS (4) + +/*These weights were derived from the default weights found in Wang's original + Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}. + We drop the finest scale and renormalize the rest to sum to 1.*/ + +static const double FS_WEIGHTS[FS_NLEVELS] = { + 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 +}; + +static double fs_average(fs_ctx *_ctx, int _l) { + double *ssim; + double ret; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + ssim = _ctx->level[_l].ssim; + ret = 0; + for (j = 0; j < h; j++) + for (i = 0; i < w; i++) ret += ssim[j * w + i]; + return pow(ret / (w * h), FS_WEIGHTS[_l]); +} + +static double convert_ssim_db(double _ssim, double _weight) { + assert(_weight >= _ssim); + if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; + return 10 * (log10(_weight) - log10(_weight - _ssim)); +} + +static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, + int _dystride, int _w, int _h, uint32_t _bd, + uint32_t _shift, int buf_is_hbd) { + fs_ctx ctx; + double ret; + int l; + ret = 1; + if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0; + fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift, + buf_is_hbd); + for (l = 0; l < FS_NLEVELS - 1; l++) { + fs_calc_structure(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_downsample_level(&ctx, l + 1); + } + fs_calc_structure(&ctx, l, _bd); + fs_apply_luminance(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_ctx_clear(&ctx); + return ret; +} + +double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd) { + double ssimv; + uint32_t bd_shift = 0; + assert(bd >= in_bd); + assert(source->flags == dest->flags); + int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH; + bd_shift = bd - in_bd; + + *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, bd_shift, buf_is_hbd); + *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); + *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); + ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)); + return convert_ssim_db(ssimv, 1.0); +} diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c new file mode 100644 index 0000000000..a44dbf77b1 --- /dev/null +++ b/third_party/aom/aom_dsp/fft.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
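The final score combines the per-plane values with fixed 0.8/0.1/0.1 weights and is reported in dB via convert_ssim_db(). A worked sketch of that last step (illustrative only):

#include <math.h>
#include <stdio.h>

int main(void) {
  const double ssim_y = 0.99, ssim_u = 0.94, ssim_v = 0.96;
  const double ssimv = ssim_y * .8 + .1 * (ssim_u + ssim_v);  /* 0.982 */
  /* Same mapping as convert_ssim_db() with a weight of 1.0. */
  const double db = 10 * (log10(1.0) - log10(1.0 - ssimv));
  printf("fastssim = %.2f dB\n", db);  /* about 17.4 dB */
  return 0;
}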
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void simple_transpose(const float *A, float *B, int n) { + for (int y = 0; y < n; y++) { + for (int x = 0; x < n; x++) { + B[y * n + x] = A[x * n + y]; + } + } +} + +// The 1d transform is real to complex and packs the complex results in +// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real +// components, followed by the n/2 - 1 imaginary components). After the +// transform is done on the rows, the first n/2 + 1 columns are real, and +// the remaining are the imaginary components. After the transform on the +// columns, the region of [0, n/2]x[0, n/2] contains the real part of +// fft of the real columns. The real part of the 2d fft also includes the +// imaginary part of transformed imaginary columns. This function assembles +// the correct outputs while putting the real and imaginary components +// next to each other. +static INLINE void unpack_2d_output(const float *col_fft, float *output, + int n) { + for (int y = 0; y <= n / 2; ++y) { + const int y2 = y + n / 2; + const int y_extra = y2 > n / 2 && y2 < n; + + for (int x = 0; x <= n / 2; ++x) { + const int x2 = x + n / 2; + const int x_extra = x2 > n / 2 && x2 < n; + output[2 * (y * n + x)] = + col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); + output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) + + (x_extra ? col_fft[y * n + x2] : 0); + if (y_extra) { + output[2 * ((n - y) * n + x)] = + col_fft[y * n + x] + + (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); + output[2 * ((n - y) * n + x) + 1] = + -(y_extra ? col_fft[y2 * n + x] : 0) + + (x_extra ? 
col_fft[y * n + x2] : 0); + } + } + } +} + +void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, + aom_fft_unpack_func_t unpack, int vec_size) { + for (int x = 0; x < n; x += vec_size) { + tform(input + x, output + x, n); + } + transpose(output, temp, n); + + for (int x = 0; x < n; x += vec_size) { + tform(temp + x, output + x, n); + } + transpose(output, temp, n); + + unpack(temp, output, n); +} + +static INLINE void store_float(float *output, float input) { *output = input; } +static INLINE float add_float(float a, float b) { return a + b; } +static INLINE float sub_float(float a, float b) { return a - b; } +static INLINE float mul_float(float a, float b) { return a * b; } + +GEN_FFT_2(void, float, float, float, *, store_float) +GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, + sub_float) +GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float) +GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float) +GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float) + +void aom_fft2x2_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft4x4_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft8x8_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft16x16_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft32x32_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, + aom_fft_1d_func_t ifft_multi, + aom_fft_transpose_func_t transpose, int vec_size) { + // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft + // and get real outputs. + for (int y = 0; y <= n / 2; ++y) { + output[y * n] = input[2 * y * n]; + output[y * n + 1] = input[2 * (y * n + n / 2)]; + } + for (int y = n / 2 + 1; y < n; ++y) { + output[y * n] = input[2 * (y - n / 2) * n + 1]; + output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1]; + } + + for (int i = 0; i < 2; i += vec_size) { + ifft_multi(output + i, temp + i, n); + } + + // For the other columns, since we don't have a full ifft for complex inputs + // we have to split them into the real and imaginary counterparts. + // Pack the real component, then the imaginary components. 
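+  // (Columns 0 and 1 of the working buffer were used above for the two
+  // purely-real columns -- input columns 0 and n/2 -- and have already been
+  // transformed, so the packing below starts at column 2: the real parts of
+  // input columns 1..n/2-1 go to working columns 2..n/2 and their imaginary
+  // parts to working columns n/2+1..n-1.)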
+ for (int y = 0; y < n; ++y) { + for (int x = 1; x < n / 2; ++x) { + output[y * n + (x + 1)] = input[2 * (y * n + x)]; + } + for (int x = 1; x < n / 2; ++x) { + output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1]; + } + } + for (int y = 2; y < vec_size; y++) { + fft_single(output + y, temp + y, n); + } + // This is the part that can be sped up with SIMD + for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) { + fft_multi(output + y, temp + y, n); + } + + // Put the 0 and n/2 th results in the correct place. + for (int x = 0; x < n; ++x) { + output[x] = temp[x * n]; + output[(n / 2) * n + x] = temp[x * n + 1]; + } + // This rearranges and transposes. + for (int y = 1; y < n / 2; ++y) { + // Fill in the real columns + for (int x = 0; x <= n / 2; ++x) { + output[x + y * n] = + temp[(y + 1) + x * n] + + ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0); + } + for (int x = n / 2 + 1; x < n; ++x) { + output[x + y * n] = temp[(y + 1) + (n - x) * n] - + temp[(y + n / 2) + ((n - x) + n / 2) * n]; + } + // Fill in the imag columns + for (int x = 0; x <= n / 2; ++x) { + output[x + (y + n / 2) * n] = + temp[(y + n / 2) + x * n] - + ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0); + } + for (int x = n / 2 + 1; x < n; ++x) { + output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] + + temp[(y + n / 2) + (n - x) * n]; + } + } + for (int y = 0; y < n; y += vec_size) { + ifft_multi(output + y, temp + y, n); + } + transpose(temp, output, n); +} + +GEN_IFFT_2(void, float, float, float, *, store_float) +GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float, + sub_float) +GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float) +GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float) +GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float) + +void aom_ifft2x2_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float, + aom_ifft1d_2_float, simple_transpose, 1); +} + +void aom_ifft4x4_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float, + aom_ifft1d_4_float, simple_transpose, 1); +} + +void aom_ifft8x8_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float, + aom_ifft1d_8_float, simple_transpose, 1); +} + +void aom_ifft16x16_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1); +} + +void aom_ifft32x32_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1); +} diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h new file mode 100644 index 0000000000..3de1a045ee --- /dev/null +++ b/third_party/aom/aom_dsp/fft_common.h @@ -0,0 +1,1056 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_FFT_COMMON_H_ +#define AOM_AOM_DSP_FFT_COMMON_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief A function pointer for computing 1d fft and ifft. + * + * The function will point to an implementation for a specific transform size, + * and may perform the transforms using vectorized instructions. + * + * For a non-vectorized forward transforms of size n, the input and output + * buffers will be size n. The output takes advantage of conjugate symmetry and + * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where + * (r_{j}, i_{j}) is the complex output for index j. + * + * An inverse transform will assume that the complex "input" is packed + * similarly. Its output will be real. + * + * Non-vectorized transforms (e.g., on a single row) would use a stride = 1. + * + * Vectorized implementations are parallelized along the columns so that the fft + * can be performed on multiple columns at a time. In such cases the data block + * for input and output is typically square (n x n) and the stride will + * correspond to the spacing between rows. At minimum, the input size must be + * n x simd_vector_length. + * + * \param[in] input Input buffer. See above for size restrictions. + * \param[out] output Output buffer. See above for size restrictions. + * \param[in] stride The spacing in number of elements between rows + * (or elements) + */ +typedef void (*aom_fft_1d_func_t)(const float *input, float *output, + int stride); + +// Declare some of the forward non-vectorized transforms which are used in some +// of the vectorized implementations +void aom_fft1d_2_float(const float *input, float *output, int stride); +void aom_fft1d_4_float(const float *input, float *output, int stride); +void aom_fft1d_8_float(const float *input, float *output, int stride); +void aom_fft1d_16_float(const float *input, float *output, int stride); +void aom_fft1d_32_float(const float *input, float *output, int stride); +void aom_ifft1d_2_float(const float *input, float *output, int stride); +void aom_ifft1d_4_float(const float *input, float *output, int stride); +void aom_ifft1d_8_float(const float *input, float *output, int stride); +void aom_ifft1d_16_float(const float *input, float *output, int stride); +void aom_ifft1d_32_float(const float *input, float *output, int stride); + +/**\!brief Function pointer for transposing a matrix of floats. + * + * \param[in] input Input buffer (size n x n) + * \param[out] output Output buffer (size n x n) + * \param[in] n Extent of one dimension of the square matrix. + */ +typedef void (*aom_fft_transpose_func_t)(const float *input, float *output, + int n); + +/**\!brief Function pointer for re-arranging intermediate 2d transform results. + * + * After re-arrangement, the real and imaginary components will be packed + * tightly next to each other. + * + * \param[in] input Input buffer (size n x n) + * \param[out] output Output buffer (size 2 x n x n) + * \param[in] n Extent of one dimension of the square matrix. + */ +typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n); + +/*!\brief Performs a 2d fft with the given functions. 
+ * + * This generator function allows for multiple different implementations of 2d + * fft with different vector operations, without having to redefine the main + * body multiple times. + * + * \param[in] input Input buffer to run the transform on (size n x n) + * \param[out] temp Working buffer for computing the transform (size n x n) + * \param[out] output Output buffer (size 2 x n x n) + * \param[in] tform Forward transform function + * \param[in] transpose Transpose function (for n x n matrix) + * \param[in] unpack Unpack function used to massage outputs to correct form + * \param[in] vec_size Vector size (the transform is done vec_size units at + * a time) + */ +void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, + aom_fft_unpack_func_t unpack, int vec_size); + +/*!\brief Perform a 2d inverse fft with the given helper functions + * + * \param[in] input Input buffer to run the transform on (size 2 x n x n) + * \param[out] temp Working buffer for computations (size 2 x n x n) + * \param[out] output Output buffer (size n x n) + * \param[in] fft_single Forward transform function (non vectorized) + * \param[in] fft_multi Forward transform function (vectorized) + * \param[in] ifft_multi Inverse transform function (vectorized) + * \param[in] transpose Transpose function (for n x n matrix) + * \param[in] vec_size Vector size (the transform is done vec_size + * units at a time) + */ +void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, + aom_fft_1d_func_t ifft_multi, + aom_fft_transpose_func_t transpose, int vec_size); +#ifdef __cplusplus +} +#endif + +// The macros below define 1D fft/ifft for different data types and for +// different simd vector intrinsic types. 
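+//
+// For example, fft.c instantiates them for plain scalar floats as
+// GEN_FFT_2(void, float, float, float, *, store_float), which expands to
+// (roughly):
+//
+//   void aom_fft1d_2_float(const float *input, float *output, int stride) {
+//     const float i0 = *(input + 0 * stride);
+//     const float i1 = *(input + 1 * stride);
+//     store_float(output + 0 * stride, i0 + i1);
+//     store_float(output + 1 * stride, i0 - i1);
+//   }
+//
+// A vectorized build supplies vector load/store/add/sub/mul operations and a
+// vector type for T_VEC instead, so the same butterfly structure processes
+// several columns per call.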
+ +#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \ + ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + store(output + 0 * stride, i0 + i1); \ + store(output + 1 * stride, i0 - i1); \ + } + +#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ + ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC w0 = add(i0, i2); \ + const T_VEC w1 = sub(i0, i2); \ + const T_VEC w2 = add(i1, i3); \ + const T_VEC w3 = sub(i1, i3); \ + store(output + 0 * stride, add(w0, w2)); \ + store(output + 1 * stride, w1); \ + store(output + 2 * stride, sub(w0, w2)); \ + store(output + 3 * stride, sub(kWeight0, w3)); \ + } + +#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \ + ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC w0 = add(i0, i4); \ + const T_VEC w1 = sub(i0, i4); \ + const T_VEC w2 = add(i2, i6); \ + const T_VEC w3 = sub(i2, i6); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i1, i5); \ + const T_VEC w8 = sub(i1, i5); \ + const T_VEC w9 = add(i3, i7); \ + const T_VEC w10 = sub(i3, i7); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + store(output + 0 * stride, add(w4, w11)); \ + store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \ + store(output + 2 * stride, w5); \ + store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \ + store(output + 4 * stride, sub(w4, w11)); \ + store(output + 5 * stride, \ + sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \ + store(output + 6 * stride, sub(kWeight0, w12)); \ + store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \ + } + +#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = 
load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC w0 = add(i0, i8); \ + const T_VEC w1 = sub(i0, i8); \ + const T_VEC w2 = add(i4, i12); \ + const T_VEC w3 = sub(i4, i12); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i2, i10); \ + const T_VEC w8 = sub(i2, i10); \ + const T_VEC w9 = add(i6, i14); \ + const T_VEC w10 = sub(i6, i14); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + const T_VEC w14 = add(w4, w11); \ + const T_VEC w15 = sub(w4, w11); \ + const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ + sub(sub(kWeight0, w3), \ + mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ + sub(w3, mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w19 = add(i1, i9); \ + const T_VEC w20 = sub(i1, i9); \ + const T_VEC w21 = add(i5, i13); \ + const T_VEC w22 = sub(i5, i13); \ + const T_VEC w23 = add(w19, w21); \ + const T_VEC w24 = sub(w19, w21); \ + const T_VEC w26 = add(i3, i11); \ + const T_VEC w27 = sub(i3, i11); \ + const T_VEC w28 = add(i7, i15); \ + const T_VEC w29 = sub(i7, i15); \ + const T_VEC w30 = add(w26, w28); \ + const T_VEC w31 = sub(w26, w28); \ + const T_VEC w33 = add(w23, w30); \ + const T_VEC w34 = sub(w23, w30); \ + const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ + sub(sub(kWeight0, w22), \ + mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ + sub(w22, mul(kWeight2, add(w29, w27))) }; \ + store(output + 0 * stride, add(w14, w33)); \ + store(output + 1 * stride, \ + add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \ + store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \ + store(output + 3 * stride, \ + add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \ + store(output + 4 * stride, w15); \ + store(output + 5 * stride, \ + add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \ + mul(kWeight3, w37[1])))); \ + store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \ + store(output + 7 * stride, \ + add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \ + mul(kWeight4, w35[1])))); \ + store(output + 8 * stride, sub(w14, w33)); \ + store(output + 9 * stride, \ + add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \ + store(output + 10 * stride, \ + sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \ + store(output + 11 * stride, \ + add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \ + store(output + 12 * stride, sub(kWeight0, w34)); \ + store(output + 13 * stride, \ + sub(sub(kWeight0, w18[1]), \ + sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \ + store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \ + store(output + 15 * stride, \ + sub(sub(kWeight0, w16[1]), \ + sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \ + } + +#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC kWeight5 = constant(0.980785f); \ + const T_VEC kWeight6 = constant(0.19509f); \ + const T_VEC kWeight7 = constant(0.83147f); \ + const T_VEC kWeight8 = constant(0.55557f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * 
stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC i16 = load(input + 16 * stride); \ + const T_VEC i17 = load(input + 17 * stride); \ + const T_VEC i18 = load(input + 18 * stride); \ + const T_VEC i19 = load(input + 19 * stride); \ + const T_VEC i20 = load(input + 20 * stride); \ + const T_VEC i21 = load(input + 21 * stride); \ + const T_VEC i22 = load(input + 22 * stride); \ + const T_VEC i23 = load(input + 23 * stride); \ + const T_VEC i24 = load(input + 24 * stride); \ + const T_VEC i25 = load(input + 25 * stride); \ + const T_VEC i26 = load(input + 26 * stride); \ + const T_VEC i27 = load(input + 27 * stride); \ + const T_VEC i28 = load(input + 28 * stride); \ + const T_VEC i29 = load(input + 29 * stride); \ + const T_VEC i30 = load(input + 30 * stride); \ + const T_VEC i31 = load(input + 31 * stride); \ + const T_VEC w0 = add(i0, i16); \ + const T_VEC w1 = sub(i0, i16); \ + const T_VEC w2 = add(i8, i24); \ + const T_VEC w3 = sub(i8, i24); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i4, i20); \ + const T_VEC w8 = sub(i4, i20); \ + const T_VEC w9 = add(i12, i28); \ + const T_VEC w10 = sub(i12, i28); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + const T_VEC w14 = add(w4, w11); \ + const T_VEC w15 = sub(w4, w11); \ + const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ + sub(sub(kWeight0, w3), \ + mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ + sub(w3, mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w19 = add(i2, i18); \ + const T_VEC w20 = sub(i2, i18); \ + const T_VEC w21 = add(i10, i26); \ + const T_VEC w22 = sub(i10, i26); \ + const T_VEC w23 = add(w19, w21); \ + const T_VEC w24 = sub(w19, w21); \ + const T_VEC w26 = add(i6, i22); \ + const T_VEC w27 = sub(i6, i22); \ + const T_VEC w28 = add(i14, i30); \ + const T_VEC w29 = sub(i14, i30); \ + const T_VEC w30 = add(w26, w28); \ + const T_VEC w31 = sub(w26, w28); \ + const T_VEC w33 = add(w23, w30); \ + const T_VEC w34 = sub(w23, w30); \ + const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ + sub(sub(kWeight0, w22), \ + mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ + sub(w22, mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w38 = add(w14, w33); \ + const T_VEC w39 = sub(w14, w33); \ + const T_VEC w40[2] = { \ + add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \ + add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \ + }; \ + const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \ + sub(sub(kWeight0, w12), \ + mul(kWeight2, add(w31, w24))) }; \ + const T_VEC w42[2] = { \ + add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \ + add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \ + }; \ + const T_VEC w44[2] = { \ + add(w18[0], \ + 
sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \ + sub(sub(kWeight0, w18[1]), \ + sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \ + }; \ + const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \ + sub(w12, mul(kWeight2, add(w31, w24))) }; \ + const T_VEC w46[2] = { \ + add(w16[0], \ + sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \ + sub(sub(kWeight0, w16[1]), \ + sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \ + }; \ + const T_VEC w47 = add(i1, i17); \ + const T_VEC w48 = sub(i1, i17); \ + const T_VEC w49 = add(i9, i25); \ + const T_VEC w50 = sub(i9, i25); \ + const T_VEC w51 = add(w47, w49); \ + const T_VEC w52 = sub(w47, w49); \ + const T_VEC w54 = add(i5, i21); \ + const T_VEC w55 = sub(i5, i21); \ + const T_VEC w56 = add(i13, i29); \ + const T_VEC w57 = sub(i13, i29); \ + const T_VEC w58 = add(w54, w56); \ + const T_VEC w59 = sub(w54, w56); \ + const T_VEC w61 = add(w51, w58); \ + const T_VEC w62 = sub(w51, w58); \ + const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \ + sub(sub(kWeight0, w50), \ + mul(kWeight2, add(w57, w55))) }; \ + const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \ + sub(w50, mul(kWeight2, add(w57, w55))) }; \ + const T_VEC w66 = add(i3, i19); \ + const T_VEC w67 = sub(i3, i19); \ + const T_VEC w68 = add(i11, i27); \ + const T_VEC w69 = sub(i11, i27); \ + const T_VEC w70 = add(w66, w68); \ + const T_VEC w71 = sub(w66, w68); \ + const T_VEC w73 = add(i7, i23); \ + const T_VEC w74 = sub(i7, i23); \ + const T_VEC w75 = add(i15, i31); \ + const T_VEC w76 = sub(i15, i31); \ + const T_VEC w77 = add(w73, w75); \ + const T_VEC w78 = sub(w73, w75); \ + const T_VEC w80 = add(w70, w77); \ + const T_VEC w81 = sub(w70, w77); \ + const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \ + sub(sub(kWeight0, w69), \ + mul(kWeight2, add(w76, w74))) }; \ + const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \ + sub(w69, mul(kWeight2, add(w76, w74))) }; \ + const T_VEC w85 = add(w61, w80); \ + const T_VEC w86 = sub(w61, w80); \ + const T_VEC w87[2] = { \ + add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \ + add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \ + }; \ + const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \ + sub(sub(kWeight0, w59), \ + mul(kWeight2, add(w78, w71))) }; \ + const T_VEC w89[2] = { \ + add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \ + add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \ + }; \ + const T_VEC w91[2] = { \ + add(w65[0], \ + sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \ + sub(sub(kWeight0, w65[1]), \ + sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \ + }; \ + const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \ + sub(w59, mul(kWeight2, add(w78, w71))) }; \ + const T_VEC w93[2] = { \ + add(w63[0], \ + sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \ + sub(sub(kWeight0, w63[1]), \ + sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \ + }; \ + store(output + 0 * stride, add(w38, w85)); \ + store(output + 1 * stride, \ + add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \ + store(output + 2 * stride, \ + add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \ + store(output + 3 * stride, \ + add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \ + store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \ + store(output + 5 * stride, \ + add(w44[0], add(mul(kWeight8, 
w91[0]), mul(kWeight7, w91[1])))); \ + store(output + 6 * stride, \ + add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \ + store(output + 7 * stride, \ + add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \ + store(output + 8 * stride, w39); \ + store(output + 9 * stride, \ + add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \ + mul(kWeight5, w93[1])))); \ + store(output + 10 * stride, \ + add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \ + mul(kWeight3, w92[1])))); \ + store(output + 11 * stride, \ + add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \ + mul(kWeight7, w91[1])))); \ + store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \ + store(output + 13 * stride, \ + add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \ + mul(kWeight8, w89[1])))); \ + store(output + 14 * stride, \ + add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \ + mul(kWeight4, w88[1])))); \ + store(output + 15 * stride, \ + add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \ + mul(kWeight6, w87[1])))); \ + store(output + 16 * stride, sub(w38, w85)); \ + store(output + 17 * stride, \ + add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \ + store(output + 18 * stride, \ + add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \ + store(output + 19 * stride, \ + add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \ + store(output + 20 * stride, \ + sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \ + store(output + 21 * stride, \ + add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \ + store(output + 22 * stride, \ + add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \ + store(output + 23 * stride, \ + add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \ + store(output + 24 * stride, sub(kWeight0, w86)); \ + store(output + 25 * stride, \ + sub(sub(kWeight0, w46[1]), \ + sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \ + store(output + 26 * stride, \ + sub(sub(kWeight0, w45[1]), \ + sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \ + store(output + 27 * stride, \ + sub(sub(kWeight0, w44[1]), \ + sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \ + store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \ + store(output + 29 * stride, \ + sub(sub(kWeight0, w42[1]), \ + sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \ + store(output + 30 * stride, \ + sub(sub(kWeight0, w41[1]), \ + sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \ + store(output + 31 * stride, \ + sub(sub(kWeight0, w40[1]), \ + sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \ + } + +#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \ + ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + store(output + 0 * stride, i0 + i1); \ + store(output + 1 * stride, i0 - i1); \ + } + +#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ + ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC w2 = add(i0, i2); \ + const T_VEC w3 = sub(i0, i2); \ + const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \ + const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \ + 
store(output + 0 * stride, add(w2, w4[0])); \ + store(output + 1 * stride, add(w3, w5[1])); \ + store(output + 2 * stride, sub(w2, w4[0])); \ + store(output + 3 * stride, sub(w3, w5[1])); \ + } + +#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC w6 = add(i0, i4); \ + const T_VEC w7 = sub(i0, i4); \ + const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \ + const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \ + const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \ + const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \ + const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \ + const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \ + const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \ + const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \ + const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \ + const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \ + const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \ + const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \ + const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \ + const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \ + store(output + 0 * stride, add(w10[0], w18[0])); \ + store(output + 1 * stride, \ + add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \ + store(output + 2 * stride, add(w11[0], w19[1])); \ + store(output + 3 * stride, \ + sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ + store(output + 4 * stride, sub(w10[0], w18[0])); \ + store(output + 5 * stride, \ + add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \ + mul(kWeight2, w20[1])))); \ + store(output + 6 * stride, sub(w11[0], w19[1])); \ + store(output + 7 * stride, \ + add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ + } + +#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC w14 = add(i0, i8); \ + const T_VEC w15 = sub(i0, i8); 
\ + const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \ + const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \ + const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \ + const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \ + const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \ + const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \ + const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \ + const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \ + const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \ + const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \ + const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \ + const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \ + const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \ + const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \ + const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \ + const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \ + const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \ + add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \ + const T_VEC w33[2] = { add(w20[0], \ + sub(sub(kWeight0, mul(kWeight2, w28[0])), \ + mul(kWeight2, w28[1]))), \ + add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \ + const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \ + const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \ + const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ + sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ + const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ + add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ + const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \ + const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \ + const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \ + const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \ + const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ + const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ + const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ + const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ + const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \ + const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \ + const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \ + const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \ + const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \ + const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \ + const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \ + const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \ + const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \ + const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \ + const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \ + add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \ + const T_VEC w57[2] = { add(w44[0], \ + sub(sub(kWeight0, mul(kWeight2, w52[0])), \ + mul(kWeight2, w52[1]))), \ + add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \ + const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \ + const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \ + const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ + sub(w45[1], mul(kWeight2, add(w53[1], 
w53[0]))) }; \ + const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ + add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ + store(output + 0 * stride, add(w30[0], w54[0])); \ + store(output + 1 * stride, \ + add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \ + store(output + 2 * stride, \ + add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \ + store(output + 3 * stride, \ + add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \ + store(output + 4 * stride, add(w31[0], w55[1])); \ + store(output + 5 * stride, \ + sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ + store(output + 6 * stride, \ + sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ + store(output + 7 * stride, \ + sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ + store(output + 8 * stride, sub(w30[0], w54[0])); \ + store(output + 9 * stride, \ + add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \ + mul(kWeight4, w56[1])))); \ + store(output + 10 * stride, \ + add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \ + mul(kWeight2, w58[1])))); \ + store(output + 11 * stride, \ + add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \ + mul(kWeight3, w60[1])))); \ + store(output + 12 * stride, sub(w31[0], w55[1])); \ + store(output + 13 * stride, \ + add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ + store(output + 14 * stride, \ + add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ + store(output + 15 * stride, \ + add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ + } +#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC kWeight5 = constant(0.980785f); \ + const T_VEC kWeight6 = constant(0.19509f); \ + const T_VEC kWeight7 = constant(0.83147f); \ + const T_VEC kWeight8 = constant(0.55557f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC i16 = load(input + 16 * stride); \ + const T_VEC i17 = load(input + 17 * stride); \ + const T_VEC i18 = load(input + 18 * stride); \ + const T_VEC i19 = load(input + 19 * stride); \ + const T_VEC i20 = load(input + 20 * stride); \ + const T_VEC i21 = load(input + 21 * stride); \ + const T_VEC i22 = load(input + 22 * stride); \ + const T_VEC i23 = load(input + 23 * stride); \ + const T_VEC i24 = load(input + 24 * stride); \ + const T_VEC i25 = load(input + 25 * stride); \ + const T_VEC i26 = load(input + 26 * stride); \ + const T_VEC i27 = load(input + 27 * stride); \ + const T_VEC i28 = load(input + 28 * stride); \ + const T_VEC i29 = load(input + 29 * 
stride); \ + const T_VEC i30 = load(input + 30 * stride); \ + const T_VEC i31 = load(input + 31 * stride); \ + const T_VEC w30 = add(i0, i16); \ + const T_VEC w31 = sub(i0, i16); \ + const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \ + const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \ + const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \ + const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \ + const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \ + const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \ + const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \ + const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \ + const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \ + const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \ + const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ + const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ + const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ + const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ + const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \ + const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \ + const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \ + add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \ + const T_VEC w49[2] = { add(w36[0], \ + sub(sub(kWeight0, mul(kWeight2, w44[0])), \ + mul(kWeight2, w44[1]))), \ + add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \ + const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \ + const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \ + const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ + sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ + const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ + add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ + const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \ + const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \ + const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \ + const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \ + const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \ + const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \ + const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \ + const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \ + const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \ + const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \ + const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \ + const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \ + const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \ + const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \ + const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \ + const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \ + const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \ + const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \ + const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \ + add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \ + const T_VEC w73[2] = { add(w60[0], \ + sub(sub(kWeight0, mul(kWeight2, w68[0])), \ + mul(kWeight2, w68[1]))), \ + add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \ + const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; 
\ + const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \ + const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ + sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ + const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ + add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ + const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \ + const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \ + const T_VEC w80[2] = { \ + add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \ + add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \ + }; \ + const T_VEC w81[2] = { \ + add(w48[0], \ + sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \ + add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \ + }; \ + const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \ + add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \ + const T_VEC w83[2] = { add(w50[0], \ + sub(sub(kWeight0, mul(kWeight2, w74[0])), \ + mul(kWeight2, w74[1]))), \ + add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \ + const T_VEC w84[2] = { \ + add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \ + add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \ + }; \ + const T_VEC w85[2] = { \ + add(w52[0], \ + sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \ + add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \ + }; \ + const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \ + const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \ + const T_VEC w88[2] = { \ + sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ + add(w49[1], \ + sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \ + }; \ + const T_VEC w89[2] = { \ + add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ + add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \ + }; \ + const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ + sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ + const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ + add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ + const T_VEC w92[2] = { \ + sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ + add(w53[1], \ + sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \ + }; \ + const T_VEC w93[2] = { \ + add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ + add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \ + }; \ + const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \ + const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \ + const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \ + const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \ + const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \ + const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \ + const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \ + const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \ + const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \ + const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \ + const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \ + const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \ + const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \ + const T_VEC w107[2] = { sub(w102[0], w104[0]), 
sub(w102[1], w104[1]) }; \ + const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \ + const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \ + const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \ + const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \ + const T_VEC w112[2] = { \ + add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \ + add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \ + }; \ + const T_VEC w113[2] = { \ + add(w100[0], \ + sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \ + add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \ + }; \ + const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \ + const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \ + const T_VEC w116[2] = { \ + sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ + sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ + }; \ + const T_VEC w117[2] = { \ + add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ + add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ + }; \ + const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \ + const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \ + const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \ + const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \ + const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \ + const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \ + const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \ + const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \ + const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \ + const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \ + const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \ + const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \ + const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \ + const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \ + const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \ + const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \ + const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \ + const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \ + const T_VEC w136[2] = { \ + add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \ + add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \ + }; \ + const T_VEC w137[2] = { \ + add(w124[0], \ + sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \ + add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \ + }; \ + const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \ + const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \ + const T_VEC w140[2] = { \ + sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ + sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ + }; \ + const T_VEC w141[2] = { \ + add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ + add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ + }; \ + const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \ + const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \ + const T_VEC w144[2] = { \ + add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \ + add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \ + }; \ + const T_VEC w145[2] = { \ + add(w112[0], \ + sub(sub(kWeight0, 
mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \ + add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \ + }; \ + const T_VEC w146[2] = { \ + add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \ + add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \ + }; \ + const T_VEC w147[2] = { \ + add(w114[0], \ + sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \ + add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \ + }; \ + const T_VEC w148[2] = { \ + add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \ + add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \ + }; \ + const T_VEC w149[2] = { \ + add(w116[0], \ + sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \ + add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \ + }; \ + const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \ + const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \ + const T_VEC w152[2] = { \ + sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ + add(w113[1], \ + sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \ + }; \ + const T_VEC w153[2] = { \ + add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ + add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \ + }; \ + const T_VEC w154[2] = { \ + sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ + sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ + }; \ + const T_VEC w155[2] = { \ + add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ + add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ + }; \ + const T_VEC w156[2] = { \ + sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ + add(w117[1], \ + sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \ + }; \ + const T_VEC w157[2] = { \ + add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ + add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \ + }; \ + store(output + 0 * stride, add(w78[0], w142[0])); \ + store(output + 1 * stride, \ + add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \ + store(output + 2 * stride, \ + add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \ + store(output + 3 * stride, \ + add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \ + store(output + 4 * stride, \ + add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \ + store(output + 5 * stride, \ + add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \ + store(output + 6 * stride, \ + add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \ + store(output + 7 * stride, \ + add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \ + store(output + 8 * stride, add(w79[0], w143[1])); \ + store(output + 9 * stride, \ + sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ + store(output + 10 * stride, \ + sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ + store(output + 11 * stride, \ + sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ + store(output + 12 * stride, \ + sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ + store(output + 13 * stride, \ + sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ + store(output + 14 * stride, \ + sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ + store(output + 15 * stride, \ + sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ + store(output + 16 * stride, 
sub(w78[0], w142[0])); \ + store(output + 17 * stride, \ + add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \ + mul(kWeight6, w144[1])))); \ + store(output + 18 * stride, \ + add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \ + mul(kWeight4, w146[1])))); \ + store(output + 19 * stride, \ + add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \ + mul(kWeight8, w148[1])))); \ + store(output + 20 * stride, \ + add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \ + mul(kWeight2, w150[1])))); \ + store(output + 21 * stride, \ + add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \ + mul(kWeight7, w152[1])))); \ + store(output + 22 * stride, \ + add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \ + mul(kWeight3, w154[1])))); \ + store(output + 23 * stride, \ + add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \ + mul(kWeight5, w156[1])))); \ + store(output + 24 * stride, sub(w79[0], w143[1])); \ + store(output + 25 * stride, \ + add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ + store(output + 26 * stride, \ + add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ + store(output + 27 * stride, \ + add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ + store(output + 28 * stride, \ + add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ + store(output + 29 * stride, \ + add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ + store(output + 30 * stride, \ + add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ + store(output + 31 * stride, \ + add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ + } + +#endif // AOM_AOM_DSP_FFT_COMMON_H_ diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c new file mode 100644 index 0000000000..ee42be7393 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/flow_estimation/disflow.h" + +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { + // Check that the fractional position is in range. + // + // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. + // Mathematically, this implies that 0 <= x < 1. However, in practice it is + // possible to have x == 1 due to floating point rounding. This is fine, + // and we still interpolate correctly if we allow x = 1. 
+ assert(0 <= x && x <= 1); + + double x2 = x * x; + double x3 = x2 * x; + kernel[0] = -0.5 * x + x2 - 0.5 * x3; + kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; + kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; + kernel[3] = -0.5 * x2 + 0.5 * x3; +} + +static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { + double kernel_dbl[4]; + get_cubic_kernel_dbl(x, kernel_dbl); + + kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); + kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); + kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); + kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); +} + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. +static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, int16_t *dt) { + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + int h_kernel[4]; + int v_kernel[4]; + get_cubic_kernel_int(u_frac, h_kernel); + get_cubic_kernel_int(v_frac, v_kernel); + + int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). + const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution. + const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1); + int16x4_t h_filter = vmovn_s32(vld1q_s32(h_kernel)); + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) { + uint8x16_t r = vld1q_u8(ref_start + i * stride); + uint16x8_t r0 = vmovl_u8(vget_low_u8(r)); + uint16x8_t r1 = vmovl_u8(vget_high_u8(r)); + + int16x8_t s0 = vreinterpretq_s16_u16(r0); + int16x8_t s1 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 1)); + int16x8_t s2 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 2)); + int16x8_t s3 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 3)); + + int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(s0), h_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s1), h_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), h_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), h_filter, 3); + + int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(s0), h_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s1), h_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), h_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), h_filter, 3); + + // 6 is the maximum allowable number of extra bits which will avoid + // the intermediate values overflowing an int16_t. 
The most extreme + // intermediate value occurs when: + // * The input pixels are [0, 255, 255, 0] + // * u_frac = 0.5 + // In this case, the un-scaled output is 255 * 1.125 = 286.875. + // As an integer with 6 fractional bits, that is 18360, which fits + // in an int16_t. But with 7 fractional bits it would be 36720, + // which is too large. + + int16x8_t sum = vcombine_s16(vrshrn_n_s32(sum_lo, DISFLOW_INTERP_BITS - 6), + vrshrn_n_s32(sum_hi, DISFLOW_INTERP_BITS - 6)); + vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, sum); + } + + // Vertical convolution. + int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel)); + int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { + int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE); + int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE); + int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE); + int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE); + + int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3); + + int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3); + + uint8x8_t s = vld1_u8(src + (i + y) * stride + x); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3)); + + // This time, we have to round off the 6 extra bits which were kept + // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits + // of precision to match the scale of the dx and dy arrays. + sum_lo = vrshrq_n_s32(sum_lo, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + sum_hi = vrshrq_n_s32(sum_hi, + DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2); + int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16)); + int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16)); + vst1q_s16(dt + i * DISFLOW_PATCH_SIZE, + vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi))); + } +} + +static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 0, -1}. + const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2)); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff); + } + + // Vertical filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution, one of + // which can be reused for the next line. 
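  // Scalar equivalent of the decomposition above (illustrative): for output
  // row i,
  //   dst[i][x] = tmp[i][x] + 2 * tmp[i+1][x] + tmp[i+2][x]
  //             = (tmp[i][x] + tmp[i+1][x]) + (tmp[i+1][x] + tmp[i+2][x])
  // The second pair-sum of row i is the first pair-sum of row i + 1, so in
  // steady state each row needs only two adds instead of three.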
+ int16x8_t s0 = vld1q_s16(tmp); + int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE); + int16x8_t sum01 = vaddq_s16(s0, s1); + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE); + + int16x8_t sum12 = vaddq_s16(s1, s2); + int16x8_t sum = vaddq_s16(sum01, sum12); + + vst1q_s16(dst + i * dst_stride, sum); + + sum01 = sum12; + s1 = s2; + } +} + +static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + + // Horizontal filter, using kernel {1, 2, 1}. + // This kernel can be split into two 2-taps kernels of value {1, 1}. + // That way we need only 3 add operations to perform the convolution. + const uint8_t *src_start = src - 1 * src_stride - 1; + + for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) { + uint8x16_t s = vld1q_u8(src_start + i * src_stride); + uint8x8_t s0 = vget_low_u8(s); + uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1)); + uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2)); + + uint16x8_t sum01 = vaddl_u8(s0, s1); + uint16x8_t sum12 = vaddl_u8(s1, s2); + uint16x8_t sum = vaddq_u16(sum01, sum12); + + vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum)); + } + + // Vertical filter, using kernel {1, 0, -1}. + // Load the whole block at once to avoid redundant loads during convolution. + int16x8_t t[10]; + load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4], + &t[5], &t[6], &t[7], &t[8], &t[9]); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + // Given that the kernel is {1, 0, -1} the convolution is a simple + // subtraction. + int16x8_t diff = vsubq_s16(t[i], t[i + 2]); + + vst1q_s16(dst + i * dst_stride, diff); + } +} + +// Computes the components of the system of equations used to solve for +// a flow vector. +// +// The flow equations are a least-squares system, derived as follows: +// +// For each pixel in the patch, we calculate the current error `dt`, +// and the x and y gradients `dx` and `dy` of the source patch. +// This means that, to first order, the squared error for this pixel is +// +// (dt + u * dx + v * dy)^2 +// +// where (u, v) are the incremental changes to the flow vector. +// +// We then want to find the values of u and v which minimize the sum +// of the squared error across all pixels. Conveniently, this fits exactly +// into the form of a least squares problem, with one equation +// +// u * dx + v * dy = -dt +// +// for each pixel. 
+// +// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE, +// and absorbing the - sign elsewhere, this results in the least squares system +// +// M = |sum(dx * dx) sum(dx * dy)| +// |sum(dx * dy) sum(dy * dy)| +// +// b = |sum(dx * dt)| +// |sum(dy * dt)| +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M_inv) { + int32x4_t sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t x = vld1q_s16(dx + i * dx_stride); + int16x8_t y = vld1q_s16(dy + i * dy_stride); + sum[0] = vmlal_s16(sum[0], vget_low_s16(x), vget_low_s16(x)); + sum[0] = vmlal_s16(sum[0], vget_high_s16(x), vget_high_s16(x)); + + sum[1] = vmlal_s16(sum[1], vget_low_s16(x), vget_low_s16(y)); + sum[1] = vmlal_s16(sum[1], vget_high_s16(x), vget_high_s16(y)); + + sum[3] = vmlal_s16(sum[3], vget_low_s16(y), vget_low_s16(y)); + sum[3] = vmlal_s16(sum[3], vget_high_s16(y), vget_high_s16(y)); + } + sum[2] = sum[1]; + + int32x4_t res = horizontal_add_4d_s32x4(sum); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). + // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. + + double M0 = (double)vgetq_lane_s32(res, 0) + 1; + double M1 = (double)vgetq_lane_s32(res, 1); + double M2 = (double)vgetq_lane_s32(res, 2); + double M3 = (double)vgetq_lane_s32(res, 3) + 1; + + // Invert matrix M. 
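  // Closed form used below (illustrative): for M = | M0 M1 |
  //                                                | M2 M3 |
  //   M^-1 = 1/det * |  M3 -M1 |,   det = M0 * M3 - M1 * M2.
  //                  | -M2  M0 |
  // Quick numeric check: M = {3, 1, 1, 2} gives det = 5 and
  // M^-1 = {0.4, -0.2, -0.2, 0.6}, and M * M^-1 is the identity.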
+ double det = (M0 * M3) - (M1 * M2); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M3 * det_inv; + M_inv[1] = -M1 * det_inv; + M_inv[2] = -M2 * det_inv; + M_inv[3] = M0 * det_inv; +} + +static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + const int16_t *dt, int dt_stride, + int *b) { + int32x4_t b_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + int16x8_t dx16 = vld1q_s16(dx + i * dx_stride); + int16x8_t dy16 = vld1q_s16(dy + i * dy_stride); + int16x8_t dt16 = vld1q_s16(dt + i * dt_stride); + + b_s32[0] = vmlal_s16(b_s32[0], vget_low_s16(dx16), vget_low_s16(dt16)); + b_s32[0] = vmlal_s16(b_s32[0], vget_high_s16(dx16), vget_high_s16(dt16)); + + b_s32[1] = vmlal_s16(b_s32[1], vget_low_s16(dy16), vget_low_s16(dt16)); + b_s32[1] = vmlal_s16(b_s32[1], vget_high_s16(dy16), vget_high_s16(dt16)); + } + + int32x4_t b_red = horizontal_add_2d_s32(b_s32[0], b_s32[1]); + vst1_s32(b, add_pairwise_s32x4(b_red)); +} + +void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref, + int x, int y, int width, int height, + int stride, double *u, double *v) { + double M_inv[4]; + int b[2]; + int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); + sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt); + compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt, + DISFLOW_PATCH_SIZE, b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +} diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.c b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c new file mode 100644 index 0000000000..284d1bd7b8 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include +#include + +#include "third_party/fastfeat/fast.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/common.h" + +#define FAST_BARRIER 18 + +size_t av1_get_corner_list_size(void) { return sizeof(CornerList); } + +CornerList *av1_alloc_corner_list(void) { + CornerList *corners = (CornerList *)aom_calloc(1, sizeof(*corners)); + if (!corners) { + return NULL; + } + + corners->valid = false; +#if CONFIG_MULTITHREAD + pthread_mutex_init(&corners->mutex, NULL); +#endif // CONFIG_MULTITHREAD + return corners; +} + +static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { + const uint8_t *buf = pyr->layers[0].buffer; + int width = pyr->layers[0].width; + int height = pyr->layers[0].height; + int stride = pyr->layers[0].stride; + + int *scores = NULL; + int num_corners; + xy *const frame_corners_xy = aom_fast9_detect_nonmax( + buf, width, height, stride, FAST_BARRIER, &scores, &num_corners); + if (num_corners < 0) return false; + + if (num_corners <= MAX_CORNERS) { + // Use all detected corners + if (num_corners != 0) { + memcpy(corners->corners, frame_corners_xy, + sizeof(*frame_corners_xy) * num_corners); + } + corners->num_corners = num_corners; + } else { + // There are more than MAX_CORNERS corners avilable, so pick out a subset + // of the sharpest corners, as these will be the most useful for flow + // estimation + int histogram[256]; + av1_zero(histogram); + for (int i = 0; i < num_corners; i++) { + assert(FAST_BARRIER <= scores[i] && scores[i] <= 255); + histogram[scores[i]] += 1; + } + + int threshold = -1; + int found_corners = 0; + for (int bucket = 255; bucket >= 0; bucket--) { + if (found_corners + histogram[bucket] > MAX_CORNERS) { + // Set threshold here + threshold = bucket; + break; + } + found_corners += histogram[bucket]; + } + assert(threshold != -1 && "Failed to select a valid threshold"); + + int copied_corners = 0; + for (int i = 0; i < num_corners; i++) { + if (scores[i] > threshold) { + assert(copied_corners < MAX_CORNERS); + corners->corners[2 * copied_corners + 0] = frame_corners_xy[i].x; + corners->corners[2 * copied_corners + 1] = frame_corners_xy[i].y; + copied_corners += 1; + } + } + assert(copied_corners == found_corners); + corners->num_corners = copied_corners; + } + + free(scores); + free(frame_corners_xy); + return true; +} + +bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { + assert(corners); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&corners->mutex); +#endif // CONFIG_MULTITHREAD + + if (!corners->valid) { + corners->valid = compute_corner_list(pyr, corners); + } + bool valid = corners->valid; + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&corners->mutex); +#endif // CONFIG_MULTITHREAD + return valid; +} + +#ifndef NDEBUG +// Check if a corner list has already been computed. 
+// This is mostly a debug helper - as it is necessary to hold corners->mutex +// while reading the valid flag, we cannot just write: +// assert(corners->valid); +// This function allows the check to be correctly written as: +// assert(aom_is_corner_list_valid(corners)); +bool aom_is_corner_list_valid(CornerList *corners) { + assert(corners); + + // Per the comments in the CornerList struct, we must take this mutex + // before reading or writing the "valid" flag, and hold it while computing + // the pyramid, to ensure proper behaviour if multiple threads call this + // function simultaneously +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&corners->mutex); +#endif // CONFIG_MULTITHREAD + + bool valid = corners->valid; + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&corners->mutex); +#endif // CONFIG_MULTITHREAD + + return valid; +} +#endif + +void av1_invalidate_corner_list(CornerList *corners) { + if (corners) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&corners->mutex); +#endif // CONFIG_MULTITHREAD + corners->valid = false; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&corners->mutex); +#endif // CONFIG_MULTITHREAD + } +} + +void av1_free_corner_list(CornerList *corners) { + if (corners) { +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&corners->mutex); +#endif // CONFIG_MULTITHREAD + aom_free(corners); + } +} diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.h b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h new file mode 100644 index 0000000000..d05846ce5d --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_ +#define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_ + +#include +#include +#include +#include + +#include "aom_dsp/pyramid.h" +#include "aom_util/aom_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_CORNERS 4096 + +typedef struct corner_list { +#if CONFIG_MULTITHREAD + // Mutex which is used to prevent the corner list from being computed twice + // at the same time + // + // Semantics: + // * This mutex must be held whenever reading or writing the `valid` flag + // + // * This mutex must also be held while computing the image pyramid, + // to ensure that only one thread may do so at a time. + // + // * However, once you have read the valid flag and seen a true value, + // it is safe to drop the mutex and read from the remaining fields. + // This is because, once the image pyramid is computed, its contents + // will not be changed until the parent frame buffer is recycled, + // which will not happen until there are no more outstanding references + // to the frame buffer. 
+ pthread_mutex_t mutex; +#endif // CONFIG_MULTITHREAD + // Flag indicating whether the corner list contains valid data + bool valid; + // Number of corners found + int num_corners; + // (x, y) coordinates of each corner + int corners[2 * MAX_CORNERS]; +} CornerList; + +size_t av1_get_corner_list_size(void); + +CornerList *av1_alloc_corner_list(void); + +bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners); + +#ifndef NDEBUG +// Check if a corner list has already been computed. +// This is mostly a debug helper - as it is necessary to hold corners->mutex +// while reading the valid flag, we cannot just write: +// assert(corners->valid); +// This function allows the check to be correctly written as: +// assert(aom_is_corner_list_valid(corners)); +bool aom_is_corner_list_valid(CornerList *corners); +#endif + +void av1_invalidate_corner_list(CornerList *corners); + +void av1_free_corner_list(CornerList *corners); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_ diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.c b/third_party/aom/aom_dsp/flow_estimation/corner_match.c new file mode 100644 index 0000000000..cef719b68d --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_dsp/flow_estimation/corner_match.h" +#include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_dsp/flow_estimation/ransac.h" +#include "aom_dsp/pyramid.h" +#include "aom_scale/yv12config.h" + +#define SEARCH_SZ 9 +#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2) + +#define THRESHOLD_NCC 0.75 + +/* Compute var(frame) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of frame, + centered at (x, y). +*/ +static double compute_variance(const unsigned char *frame, int stride, int x, + int y) { + int sum = 0; + int sumsq = 0; + int var; + int i, j; + for (i = 0; i < MATCH_SZ; ++i) + for (j = 0; j < MATCH_SZ; ++j) { + sum += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; + sumsq += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * + frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; + } + var = sumsq * MATCH_SZ_SQ - sum * sum; + return (double)var; +} + +/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the + correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows + of each image, centered at (x1, y1) and (x2, y2) respectively. 
+*/ +double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, + int x1, int y1, + const unsigned char *frame2, int stride2, + int x2, int y2) { + int v1, v2; + int sum1 = 0; + int sum2 = 0; + int sumsq2 = 0; + int cross = 0; + int var2, cov; + int i, j; + for (i = 0; i < MATCH_SZ; ++i) + for (j = 0; j < MATCH_SZ; ++j) { + v1 = frame1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; + v2 = frame2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; + sum1 += v1; + sum2 += v2; + sumsq2 += v2 * v2; + cross += v1 * v2; + } + var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + cov = cross * MATCH_SZ_SQ - sum1 * sum2; + return cov / sqrt((double)var2); +} + +static int is_eligible_point(int pointx, int pointy, int width, int height) { + return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 && + pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height); +} + +static int is_eligible_distance(int point1x, int point1y, int point2x, + int point2y, int width, int height) { + const int thresh = (width < height ? height : width) >> 4; + return ((point1x - point2x) * (point1x - point2x) + + (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; +} + +static void improve_correspondence(const unsigned char *src, + const unsigned char *ref, int width, + int height, int src_stride, int ref_stride, + Correspondence *correspondences, + int num_correspondences) { + int i; + for (i = 0; i < num_correspondences; ++i) { + int x, y, best_x = 0, best_y = 0; + double best_match_ncc = 0.0; + // For this algorithm, all points have integer coordinates. + // It's a little more efficient to convert them to ints once, + // before the inner loops + int x0 = (int)correspondences[i].x; + int y0 = (int)correspondences[i].y; + int rx0 = (int)correspondences[i].rx; + int ry0 = (int)correspondences[i].ry; + for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) { + for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { + double match_ncc; + if (!is_eligible_point(rx0 + x, ry0 + y, width, height)) continue; + if (!is_eligible_distance(x0, y0, rx0 + x, ry0 + y, width, height)) + continue; + match_ncc = av1_compute_cross_correlation(src, src_stride, x0, y0, ref, + ref_stride, rx0 + x, ry0 + y); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_y = y; + best_x = x; + } + } + } + correspondences[i].rx += best_x; + correspondences[i].ry += best_y; + } + for (i = 0; i < num_correspondences; ++i) { + int x, y, best_x = 0, best_y = 0; + double best_match_ncc = 0.0; + int x0 = (int)correspondences[i].x; + int y0 = (int)correspondences[i].y; + int rx0 = (int)correspondences[i].rx; + int ry0 = (int)correspondences[i].ry; + for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) + for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { + double match_ncc; + if (!is_eligible_point(x0 + x, y0 + y, width, height)) continue; + if (!is_eligible_distance(x0 + x, y0 + y, rx0, ry0, width, height)) + continue; + match_ncc = av1_compute_cross_correlation( + ref, ref_stride, rx0, ry0, src, src_stride, x0 + x, y0 + y); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_y = y; + best_x = x; + } + } + correspondences[i].x += best_x; + correspondences[i].y += best_y; + } +} + +static int determine_correspondence(const unsigned char *src, + const int *src_corners, int num_src_corners, + const unsigned char *ref, + const int *ref_corners, int num_ref_corners, + int width, int height, int src_stride, + int ref_stride, + Correspondence *correspondences) { + // TODO(sarahparker) 
Improve this to include 2-way match + int i, j; + int num_correspondences = 0; + for (i = 0; i < num_src_corners; ++i) { + double best_match_ncc = 0.0; + double template_norm; + int best_match_j = -1; + if (!is_eligible_point(src_corners[2 * i], src_corners[2 * i + 1], width, + height)) + continue; + for (j = 0; j < num_ref_corners; ++j) { + double match_ncc; + if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width, + height)) + continue; + if (!is_eligible_distance(src_corners[2 * i], src_corners[2 * i + 1], + ref_corners[2 * j], ref_corners[2 * j + 1], + width, height)) + continue; + match_ncc = av1_compute_cross_correlation( + src, src_stride, src_corners[2 * i], src_corners[2 * i + 1], ref, + ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_match_j = j; + } + } + // Note: We want to test if the best correlation is >= THRESHOLD_NCC, + // but need to account for the normalization in + // av1_compute_cross_correlation. + template_norm = compute_variance(src, src_stride, src_corners[2 * i], + src_corners[2 * i + 1]); + if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) { + correspondences[num_correspondences].x = src_corners[2 * i]; + correspondences[num_correspondences].y = src_corners[2 * i + 1]; + correspondences[num_correspondences].rx = ref_corners[2 * best_match_j]; + correspondences[num_correspondences].ry = + ref_corners[2 * best_match_j + 1]; + num_correspondences++; + } + } + improve_correspondence(src, ref, width, height, src_stride, ref_stride, + correspondences, num_correspondences); + return num_correspondences; +} + +bool av1_compute_global_motion_feature_match( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, MotionModel *motion_models, int num_motion_models, + bool *mem_alloc_failed) { + int num_correspondences; + Correspondence *correspondences; + ImagePyramid *src_pyramid = src->y_pyramid; + CornerList *src_corners = src->corners; + ImagePyramid *ref_pyramid = ref->y_pyramid; + CornerList *ref_corners = ref->corners; + + // Precompute information we will need about each frame + if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) { + *mem_alloc_failed = true; + return false; + } + if (!av1_compute_corner_list(src_pyramid, src_corners)) { + *mem_alloc_failed = true; + return false; + } + if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) { + *mem_alloc_failed = true; + return false; + } + if (!av1_compute_corner_list(src_pyramid, src_corners)) { + *mem_alloc_failed = true; + return false; + } + + const uint8_t *src_buffer = src_pyramid->layers[0].buffer; + const int src_width = src_pyramid->layers[0].width; + const int src_height = src_pyramid->layers[0].height; + const int src_stride = src_pyramid->layers[0].stride; + + const uint8_t *ref_buffer = ref_pyramid->layers[0].buffer; + assert(ref_pyramid->layers[0].width == src_width); + assert(ref_pyramid->layers[0].height == src_height); + const int ref_stride = ref_pyramid->layers[0].stride; + + // find correspondences between the two images + correspondences = (Correspondence *)aom_malloc(src_corners->num_corners * + sizeof(*correspondences)); + if (!correspondences) { + *mem_alloc_failed = true; + return false; + } + num_correspondences = determine_correspondence( + src_buffer, src_corners->corners, src_corners->num_corners, ref_buffer, + ref_corners->corners, ref_corners->num_corners, src_width, src_height, + src_stride, ref_stride, correspondences); + + bool result 
= ransac(correspondences, num_correspondences, type, + motion_models, num_motion_models, mem_alloc_failed); + + aom_free(correspondences); + return result; +} diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.h b/third_party/aom/aom_dsp/flow_estimation/corner_match.h new file mode 100644 index 0000000000..4435d2c767 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_ +#define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_ + +#include +#include +#include +#include + +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MATCH_SZ 13 +#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2) +#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ) + +bool av1_compute_global_motion_feature_match( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, MotionModel *motion_models, int num_motion_models, + bool *mem_alloc_failed); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_ diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.c b/third_party/aom/aom_dsp/flow_estimation/disflow.c new file mode 100644 index 0000000000..147a8ab3b3 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/disflow.c @@ -0,0 +1,823 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Dense Inverse Search flow algorithm +// Paper: https://arxiv.org/abs/1603.03590 + +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_dsp/flow_estimation/disflow.h" +#include "aom_dsp/flow_estimation/ransac.h" +#include "aom_dsp/pyramid.h" +#include "aom_mem/aom_mem.h" + +#include "config/aom_dsp_rtcd.h" + +// Amount to downsample the flow field by. +// eg. DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate +// one flow point for each 4x4 pixel region of the frame +// Must be a power of 2 +#define DOWNSAMPLE_SHIFT 3 +#define DOWNSAMPLE_FACTOR (1 << DOWNSAMPLE_SHIFT) + +// Filters used when upscaling the flow field from one pyramid level +// to another. 
See upscale_flow_component for details on kernel selection +#define FLOW_UPSCALE_TAPS 4 + +// Number of outermost flow field entries (on each edge) which can't be +// computed, because the patch they correspond to extends outside of the +// frame +// The border is (DISFLOW_PATCH_SIZE >> 1) pixels, which is +// (DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT many flow field entries +#define FLOW_BORDER_INNER ((DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT) + +// Number of extra padding entries on each side of the flow field. +// These samples are added so that we do not need to apply clamping when +// interpolating or upsampling the flow field +#define FLOW_BORDER_OUTER (FLOW_UPSCALE_TAPS / 2) + +// When downsampling the flow field, each flow field entry covers a square +// region of pixels in the image pyramid. This value is equal to the position +// of the center of that region, as an offset from the top/left edge. +// +// Note: Using ((DOWNSAMPLE_FACTOR - 1) / 2) is equivalent to the more +// natural expression ((DOWNSAMPLE_FACTOR / 2) - 1), +// unless DOWNSAMPLE_FACTOR == 1 (ie, no downsampling), in which case +// this gives the correct offset of 0 instead of -1. +#define UPSAMPLE_CENTER_OFFSET ((DOWNSAMPLE_FACTOR - 1) / 2) + +static double flow_upscale_filter[2][FLOW_UPSCALE_TAPS] = { + // Cubic interpolation kernels for phase=0.75 and phase=0.25, respectively + { -3 / 128., 29 / 128., 111 / 128., -9 / 128. }, + { -9 / 128., 111 / 128., 29 / 128., -3 / 128. } +}; + +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { + // Check that the fractional position is in range. + // + // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. + // Mathematically, this implies that 0 <= x < 1. However, in practice it is + // possible to have x == 1 due to floating point rounding. This is fine, + // and we still interpolate correctly if we allow x = 1. 
+ assert(0 <= x && x <= 1); + + double x2 = x * x; + double x3 = x2 * x; + kernel[0] = -0.5 * x + x2 - 0.5 * x3; + kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; + kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; + kernel[3] = -0.5 * x2 + 0.5 * x3; +} + +static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { + double kernel_dbl[4]; + get_cubic_kernel_dbl(x, kernel_dbl); + + kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); + kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); + kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); + kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); +} + +static INLINE double get_cubic_value_dbl(const double *p, + const double kernel[4]) { + return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + + kernel[3] * p[3]; +} + +static INLINE int get_cubic_value_int(const int *p, const int kernel[4]) { + return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + + kernel[3] * p[3]; +} + +static INLINE double bicubic_interp_one(const double *arr, int stride, + const double h_kernel[4], + const double v_kernel[4]) { + double tmp[1 * 4]; + + // Horizontal convolution + for (int i = -1; i < 3; ++i) { + tmp[i + 1] = get_cubic_value_dbl(&arr[i * stride - 1], h_kernel); + } + + // Vertical convolution + return get_cubic_value_dbl(tmp, v_kernel); +} + +static int determine_disflow_correspondence(const ImagePyramid *src_pyr, + const ImagePyramid *ref_pyr, + CornerList *corners, + const FlowField *flow, + Correspondence *correspondences) { + const int width = flow->width; + const int height = flow->height; + const int stride = flow->stride; + + int num_correspondences = 0; + for (int i = 0; i < corners->num_corners; ++i) { + const int x0 = corners->corners[2 * i]; + const int y0 = corners->corners[2 * i + 1]; + + // Offset points, to compensate for the fact that (say) a flow field entry + // at horizontal index i, is nominally associated with the pixel at + // horizontal coordinate (i << DOWNSAMPLE_FACTOR) + UPSAMPLE_CENTER_OFFSET + // This offset must be applied before we split the coordinate into integer + // and fractional parts, in order for the interpolation to be correct. + const int x = x0 - UPSAMPLE_CENTER_OFFSET; + const int y = y0 - UPSAMPLE_CENTER_OFFSET; + + // Split the pixel coordinates into integer flow field coordinates and + // an offset for interpolation + const int flow_x = x >> DOWNSAMPLE_SHIFT; + const double flow_sub_x = + (x & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR; + const int flow_y = y >> DOWNSAMPLE_SHIFT; + const double flow_sub_y = + (y & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR; + + // Exclude points which would sample from the outer border of the flow + // field, as this would give lower-quality results. + // + // Note: As we never read from the border region at pyramid level 0, we + // can skip filling it in. If the conditions here are removed, or any + // other logic is added which reads from this border region, then + // compute_flow_field() will need to be modified to call + // fill_flow_field_borders() at pyramid level 0 to set up the correct + // border data. 
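    // Worked example (illustrative, using DOWNSAMPLE_SHIFT == 3, so
    // DOWNSAMPLE_FACTOR == 8 and UPSAMPLE_CENTER_OFFSET == 3): a corner at
    // x0 = 37 gives x = 34, flow_x = 34 >> 3 = 4, and
    // flow_sub_x = (34 & 7) / 8.0 = 0.25. Flow entry 4 is nominally centered
    // on pixel (4 << 3) + 3 = 35, so the corner sits 2 pixels (0.25 of a
    // flow cell) to its right, matching the fractional offset.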
+ if (flow_x < 1 || (flow_x + 2) >= width) continue; + if (flow_y < 1 || (flow_y + 2) >= height) continue; + + double h_kernel[4]; + double v_kernel[4]; + get_cubic_kernel_dbl(flow_sub_x, h_kernel); + get_cubic_kernel_dbl(flow_sub_y, v_kernel); + + double flow_u = bicubic_interp_one(&flow->u[flow_y * stride + flow_x], + stride, h_kernel, v_kernel); + double flow_v = bicubic_interp_one(&flow->v[flow_y * stride + flow_x], + stride, h_kernel, v_kernel); + + // Refine the interpolated flow vector one last time + const int patch_tl_x = x0 - DISFLOW_PATCH_CENTER; + const int patch_tl_y = y0 - DISFLOW_PATCH_CENTER; + aom_compute_flow_at_point( + src_pyr->layers[0].buffer, ref_pyr->layers[0].buffer, patch_tl_x, + patch_tl_y, src_pyr->layers[0].width, src_pyr->layers[0].height, + src_pyr->layers[0].stride, &flow_u, &flow_v); + + // Use original points (without offsets) when filling in correspondence + // array + correspondences[num_correspondences].x = x0; + correspondences[num_correspondences].y = y0; + correspondences[num_correspondences].rx = x0 + flow_u; + correspondences[num_correspondences].ry = y0 + flow_v; + num_correspondences++; + } + return num_correspondences; +} + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. +static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, + const int16_t *dx, const int16_t *dy, + int *b) { + memset(b, 0, 2 * sizeof(*b)); + + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + int h_kernel[4]; + int v_kernel[4]; + get_cubic_kernel_int(u_frac, h_kernel); + get_cubic_kernel_int(v_frac, v_kernel); + + // Storage for intermediate values between the two convolution directions + int tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + int *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). + const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution + for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; ++i) { + const int y_w = y0 + i; + for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { + const int x_w = x0 + j; + int arr[4]; + + arr[0] = (int)ref[y_w * stride + (x_w - 1)]; + arr[1] = (int)ref[y_w * stride + (x_w + 0)]; + arr[2] = (int)ref[y_w * stride + (x_w + 1)]; + arr[3] = (int)ref[y_w * stride + (x_w + 2)]; + + // Apply kernel and round, keeping 6 extra bits of precision. + // + // 6 is the maximum allowable number of extra bits which will avoid + // the intermediate values overflowing an int16_t. 
The most extreme + // intermediate value occurs when: + // * The input pixels are [0, 255, 255, 0] + // * u_frac = 0.5 + // In this case, the un-scaled output is 255 * 1.125 = 286.875. + // As an integer with 6 fractional bits, that is 18360, which fits + // in an int16_t. But with 7 fractional bits it would be 36720, + // which is too large. + tmp[i * DISFLOW_PATCH_SIZE + j] = ROUND_POWER_OF_TWO( + get_cubic_value_int(arr, h_kernel), DISFLOW_INTERP_BITS - 6); + } + } + + // Vertical convolution + for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { + for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { + const int *p = &tmp[i * DISFLOW_PATCH_SIZE + j]; + const int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE], + p[2 * DISFLOW_PATCH_SIZE] }; + const int result = get_cubic_value_int(arr, v_kernel); + + // Apply kernel and round. + // This time, we have to round off the 6 extra bits which were kept + // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits + // of precision to match the scale of the dx and dy arrays. + const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; + const int warped = ROUND_POWER_OF_TWO(result, round_bits); + const int src_px = src[(x + j) + (y + i) * stride] << 3; + const int dt = warped - src_px; + b[0] += dx[i * DISFLOW_PATCH_SIZE + j] * dt; + b[1] += dy[i * DISFLOW_PATCH_SIZE + j] * dt; + } + } +} + +static INLINE void sobel_filter(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride, int dir) { + int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; + + // Sobel filter kernel + // This must have an overall scale factor equal to DISFLOW_DERIV_SCALE, + // in order to produce correctly scaled outputs. + // To work out the scale factor, we multiply two factors: + // + // * For the derivative filter (sobel_a), comparing our filter + // image[x - 1] - image[x + 1] + // to the standard form + // d/dx image[x] = image[x+1] - image[x] + // tells us that we're actually calculating -2 * d/dx image[2] + // + // * For the smoothing filter (sobel_b), all coefficients are positive + // so the scale factor is just the sum of the coefficients + // + // Thus we need to make sure that DISFLOW_DERIV_SCALE = 2 * sum(sobel_b) + // (and take care of the - sign from sobel_a elsewhere) + static const int16_t sobel_a[3] = { 1, 0, -1 }; + static const int16_t sobel_b[3] = { 1, 2, 1 }; + const int taps = 3; + + // horizontal filter + const int16_t *h_kernel = dir ? sobel_a : sobel_b; + + for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { + for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { + int sum = 0; + for (int k = 0; k < taps; ++k) { + sum += h_kernel[k] * src[y * src_stride + (x + k - 1)]; + } + tmp[y * DISFLOW_PATCH_SIZE + x] = sum; + } + } + + // vertical filter + const int16_t *v_kernel = dir ? sobel_b : sobel_a; + + for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { + for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { + int sum = 0; + for (int k = 0; k < taps; ++k) { + sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; + } + dst[y * dst_stride + x] = sum; + } + } +} + +// Computes the components of the system of equations used to solve for +// a flow vector. +// +// The flow equations are a least-squares system, derived as follows: +// +// For each pixel in the patch, we calculate the current error `dt`, +// and the x and y gradients `dx` and `dy` of the source patch. 
+// This means that, to first order, the squared error for this pixel is +// +// (dt + u * dx + v * dy)^2 +// +// where (u, v) are the incremental changes to the flow vector. +// +// We then want to find the values of u and v which minimize the sum +// of the squared error across all pixels. Conveniently, this fits exactly +// into the form of a least squares problem, with one equation +// +// u * dx + v * dy = -dt +// +// for each pixel. +// +// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE, +// and absorbing the - sign elsewhere, this results in the least squares system +// +// M = |sum(dx * dx) sum(dx * dy)| +// |sum(dx * dy) sum(dy * dy)| +// +// b = |sum(dx * dt)| +// |sum(dy * dt)| +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M) { + int tmp[4] = { 0 }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) { + tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; + tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; + // Don't compute tmp[2], as it should be equal to tmp[1] + tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; + } + } + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). + // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. + tmp[0] += 1; + tmp[3] += 1; + + tmp[2] = tmp[1]; + + M[0] = (double)tmp[0]; + M[1] = (double)tmp[1]; + M[2] = (double)tmp[2]; + M[3] = (double)tmp[3]; +} + +// Try to invert the matrix M +// Note: Due to the nature of how a least-squares matrix is constructed, all of +// the eigenvalues will be >= 0, and therefore det M >= 0 as well. +// The regularization term `+ k * I` further ensures that det M >= k^2. +// As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. +// So we don't have to worry about non-invertible matrices here. 
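// Illustrative sketch (hypothetical helper, not used by the code below):
// solving M * step = b directly with the closed-form 2x2 inverse is
// equivalent to calling invert_2x2() and then forming step = M_inv * b as
// the caller does. As a concrete check of the det >= 1 claim above,
// dx == dy == 1 over the whole 8x8 patch gives the regularized matrix
// M = {65, 64, 64, 65} with det = 65 * 65 - 64 * 64 = 129, whereas the
// unregularized {64, 64, 64, 64} would be singular.
static INLINE void solve_2x2_sketch(const double *M, const int *b,
                                    double *step) {
  const double det = M[0] * M[3] - M[1] * M[2];  // >= 1 after regularization
  step[0] = (M[3] * b[0] - M[1] * b[1]) / det;
  step[1] = (-M[2] * b[0] + M[0] * b[1]) / det;
}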
+static INLINE void invert_2x2(const double *M, double *M_inv) { + double det = (M[0] * M[3]) - (M[1] * M[2]); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M[3] * det_inv; + M_inv[1] = -M[1] * det_inv; + M_inv[2] = -M[2] * det_inv; + M_inv[3] = M[0] * det_inv; +} + +void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x, + int y, int width, int height, int stride, + double *u, double *v) { + double M[4]; + double M_inv[4]; + int b[2]; + int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter(src_patch, stride, dx, DISFLOW_PATCH_SIZE, 1); + sobel_filter(src_patch, stride, dy, DISFLOW_PATCH_SIZE, 0); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); + invert_2x2(M, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy, + b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +} + +static void fill_flow_field_borders(double *flow, int width, int height, + int stride) { + // Calculate the bounds of the rectangle which was filled in by + // compute_flow_field() before calling this function. + // These indices are inclusive on both ends. + const int left_index = FLOW_BORDER_INNER; + const int right_index = (width - FLOW_BORDER_INNER - 1); + const int top_index = FLOW_BORDER_INNER; + const int bottom_index = (height - FLOW_BORDER_INNER - 1); + + // Left area + for (int i = top_index; i <= bottom_index; i += 1) { + double *row = flow + i * stride; + const double left = row[left_index]; + for (int j = -FLOW_BORDER_OUTER; j < left_index; j++) { + row[j] = left; + } + } + + // Right area + for (int i = top_index; i <= bottom_index; i += 1) { + double *row = flow + i * stride; + const double right = row[right_index]; + for (int j = right_index + 1; j < width + FLOW_BORDER_OUTER; j++) { + row[j] = right; + } + } + + // Top area + const double *top_row = flow + top_index * stride - FLOW_BORDER_OUTER; + for (int i = -FLOW_BORDER_OUTER; i < top_index; i++) { + double *row = flow + i * stride - FLOW_BORDER_OUTER; + size_t length = width + 2 * FLOW_BORDER_OUTER; + memcpy(row, top_row, length * sizeof(*row)); + } + + // Bottom area + const double *bottom_row = flow + bottom_index * stride - FLOW_BORDER_OUTER; + for (int i = bottom_index + 1; i < height + FLOW_BORDER_OUTER; i++) { + double *row = flow + i * stride - FLOW_BORDER_OUTER; + size_t length = width + 2 * FLOW_BORDER_OUTER; + memcpy(row, bottom_row, length * sizeof(*row)); + } +} + +// Upscale one component of the flow field, from a size of +// cur_width x cur_height to a size of (2*cur_width) x (2*cur_height), storing +// the result back into the same buffer. This function also scales the flow +// vector by 2, so that when we move to the next pyramid level down, the implied +// motion vector is the same. 
+// +// The temporary buffer tmpbuf must be large enough to hold an intermediate +// array of size stride * cur_height, *plus* FLOW_BORDER_OUTER rows above and +// below. In other words, indices from -FLOW_BORDER_OUTER * stride to +// (cur_height + FLOW_BORDER_OUTER) * stride - 1 must be valid. +// +// Note that the same stride is used for u before and after upscaling +// and for the temporary buffer, for simplicity. +// +// A note on phasing: +// +// The flow fields at two adjacent pyramid levels are offset from each other, +// and we need to account for this in the construction of the interpolation +// kernels. +// +// Consider an 8x8 pixel patch at pyramid level n. This is split into four +// patches at pyramid level n-1. Bringing these patches back up to pyramid level +// n, each sub-patch covers 4x4 pixels, and between them they cover the same +// 8x8 region. +// +// Therefore, at pyramid level n, two adjacent patches look like this: +// +// + - - - - - - - + - - - - - - - + +// | | | +// | x x | x x | +// | | | +// | # | # | +// | | | +// | x x | x x | +// | | | +// + - - - - - - - + - - - - - - - + +// +// where # marks the center of a patch at pyramid level n (the input to this +// function), and x marks the center of a patch at pyramid level n-1 (the output +// of this function). +// +// By counting pixels (marked by +, -, and |), we can see that the flow vectors +// at pyramid level n-1 are offset relative to the flow vectors at pyramid +// level n, by 1/4 of the larger (input) patch size. Therefore, our +// interpolation kernels need to have phases of 0.25 and 0.75. +// +// In addition, in order to handle the frame edges correctly, we need to +// generate one output vector to the left and one to the right of each input +// vector, even though these must be interpolated using different source points. 
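// A quick property check of flow_upscale_filter (illustrative): each row
// sums to (-3 + 29 + 111 - 9) / 128 = 1, so a constant flow field is
// reproduced exactly; the only change is the deliberate doubling applied in
// the horizontal pass, which converts the vectors to the finer level's
// pixel units.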
+static void upscale_flow_component(double *flow, int cur_width, int cur_height, + int stride, double *tmpbuf) { + const int half_len = FLOW_UPSCALE_TAPS / 2; + + // Check that the outer border is large enough to avoid needing to clamp + // the source locations + assert(half_len <= FLOW_BORDER_OUTER); + + // Horizontal upscale and multiply by 2 + for (int i = 0; i < cur_height; i++) { + for (int j = 0; j < cur_width; j++) { + double left = 0; + for (int k = -half_len; k < half_len; k++) { + left += + flow[i * stride + (j + k)] * flow_upscale_filter[0][k + half_len]; + } + tmpbuf[i * stride + (2 * j + 0)] = 2.0 * left; + + // Right output pixel is 0.25 units to the right of the input pixel + double right = 0; + for (int k = -(half_len - 1); k < (half_len + 1); k++) { + right += flow[i * stride + (j + k)] * + flow_upscale_filter[1][k + (half_len - 1)]; + } + tmpbuf[i * stride + (2 * j + 1)] = 2.0 * right; + } + } + + // Fill in top and bottom borders of tmpbuf + const double *top_row = &tmpbuf[0]; + for (int i = -FLOW_BORDER_OUTER; i < 0; i++) { + double *row = &tmpbuf[i * stride]; + memcpy(row, top_row, 2 * cur_width * sizeof(*row)); + } + + const double *bottom_row = &tmpbuf[(cur_height - 1) * stride]; + for (int i = cur_height; i < cur_height + FLOW_BORDER_OUTER; i++) { + double *row = &tmpbuf[i * stride]; + memcpy(row, bottom_row, 2 * cur_width * sizeof(*row)); + } + + // Vertical upscale + int upscaled_width = cur_width * 2; + for (int i = 0; i < cur_height; i++) { + for (int j = 0; j < upscaled_width; j++) { + double top = 0; + for (int k = -half_len; k < half_len; k++) { + top += + tmpbuf[(i + k) * stride + j] * flow_upscale_filter[0][k + half_len]; + } + flow[(2 * i) * stride + j] = top; + + double bottom = 0; + for (int k = -(half_len - 1); k < (half_len + 1); k++) { + bottom += tmpbuf[(i + k) * stride + j] * + flow_upscale_filter[1][k + (half_len - 1)]; + } + flow[(2 * i + 1) * stride + j] = bottom; + } + } +} + +// make sure flow_u and flow_v start at 0 +static bool compute_flow_field(const ImagePyramid *src_pyr, + const ImagePyramid *ref_pyr, FlowField *flow) { + bool mem_status = true; + assert(src_pyr->n_levels == ref_pyr->n_levels); + + double *flow_u = flow->u; + double *flow_v = flow->v; + + double *tmpbuf0; + double *tmpbuf; + + if (src_pyr->n_levels < 2) { + // tmpbuf not needed + tmpbuf0 = NULL; + tmpbuf = NULL; + } else { + // This line must match the calculation of cur_flow_height below + const int layer1_height = src_pyr->layers[1].height >> DOWNSAMPLE_SHIFT; + + const size_t tmpbuf_size = + (layer1_height + 2 * FLOW_BORDER_OUTER) * flow->stride; + tmpbuf0 = aom_malloc(tmpbuf_size * sizeof(*tmpbuf0)); + if (!tmpbuf0) { + mem_status = false; + goto free_tmpbuf; + } + tmpbuf = tmpbuf0 + FLOW_BORDER_OUTER * flow->stride; + } + + // Compute flow field from coarsest to finest level of the pyramid + // + // Note: We stop after refining pyramid level 1 and interpolating it to + // generate an initial flow field at level 0. We do *not* refine the dense + // flow field at level 0. Instead, we wait until we have generated + // correspondences by interpolating this flow field, and then refine the + // correspondences themselves. This is both faster and gives better output + // compared to refining the flow field at level 0 and then interpolating. 
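  // Worked sizes (illustrative, assuming a 1920x1080 source and the defines
  // above): pyramid level 1 is 960x540, so the flow field refined on that
  // level has (960 >> 3) x (540 >> 3) = 120 x 67 entries. Entry (i, j) is
  // refined over the 8x8 patch whose top-left pixel is (j << 3, i << 3),
  // since the patch center (j << 3) + 3 minus DISFLOW_PATCH_CENTER == 3
  // lands back on (j << 3).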
+ for (int level = src_pyr->n_levels - 1; level >= 1; --level) { + const PyramidLayer *cur_layer = &src_pyr->layers[level]; + const int cur_width = cur_layer->width; + const int cur_height = cur_layer->height; + const int cur_stride = cur_layer->stride; + + const uint8_t *src_buffer = cur_layer->buffer; + const uint8_t *ref_buffer = ref_pyr->layers[level].buffer; + + const int cur_flow_width = cur_width >> DOWNSAMPLE_SHIFT; + const int cur_flow_height = cur_height >> DOWNSAMPLE_SHIFT; + const int cur_flow_stride = flow->stride; + + for (int i = FLOW_BORDER_INNER; i < cur_flow_height - FLOW_BORDER_INNER; + i += 1) { + for (int j = FLOW_BORDER_INNER; j < cur_flow_width - FLOW_BORDER_INNER; + j += 1) { + const int flow_field_idx = i * cur_flow_stride + j; + + // Calculate the position of a patch of size DISFLOW_PATCH_SIZE pixels, + // which is centered on the region covered by this flow field entry + const int patch_center_x = + (j << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET; // In pixels + const int patch_center_y = + (i << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET; // In pixels + const int patch_tl_x = patch_center_x - DISFLOW_PATCH_CENTER; + const int patch_tl_y = patch_center_y - DISFLOW_PATCH_CENTER; + assert(patch_tl_x >= 0); + assert(patch_tl_y >= 0); + + aom_compute_flow_at_point(src_buffer, ref_buffer, patch_tl_x, + patch_tl_y, cur_width, cur_height, cur_stride, + &flow_u[flow_field_idx], + &flow_v[flow_field_idx]); + } + } + + // Fill in the areas which we haven't explicitly computed, with copies + // of the outermost values which we did compute + fill_flow_field_borders(flow_u, cur_flow_width, cur_flow_height, + cur_flow_stride); + fill_flow_field_borders(flow_v, cur_flow_width, cur_flow_height, + cur_flow_stride); + + if (level > 0) { + const int upscale_flow_width = cur_flow_width << 1; + const int upscale_flow_height = cur_flow_height << 1; + const int upscale_stride = flow->stride; + + upscale_flow_component(flow_u, cur_flow_width, cur_flow_height, + cur_flow_stride, tmpbuf); + upscale_flow_component(flow_v, cur_flow_width, cur_flow_height, + cur_flow_stride, tmpbuf); + + // If we didn't fill in the rightmost column or bottommost row during + // upsampling (in order to keep the ratio to exactly 2), fill them + // in here by copying the next closest column/row + const PyramidLayer *next_layer = &src_pyr->layers[level - 1]; + const int next_flow_width = next_layer->width >> DOWNSAMPLE_SHIFT; + const int next_flow_height = next_layer->height >> DOWNSAMPLE_SHIFT; + + // Rightmost column + if (next_flow_width > upscale_flow_width) { + assert(next_flow_width == upscale_flow_width + 1); + for (int i = 0; i < upscale_flow_height; i++) { + const int index = i * upscale_stride + upscale_flow_width; + flow_u[index] = flow_u[index - 1]; + flow_v[index] = flow_v[index - 1]; + } + } + + // Bottommost row + if (next_flow_height > upscale_flow_height) { + assert(next_flow_height == upscale_flow_height + 1); + for (int j = 0; j < next_flow_width; j++) { + const int index = upscale_flow_height * upscale_stride + j; + flow_u[index] = flow_u[index - upscale_stride]; + flow_v[index] = flow_v[index - upscale_stride]; + } + } + } + } + +free_tmpbuf: + aom_free(tmpbuf0); + return mem_status; +} + +static FlowField *alloc_flow_field(int frame_width, int frame_height) { + FlowField *flow = (FlowField *)aom_malloc(sizeof(FlowField)); + if (flow == NULL) return NULL; + + // Calculate the size of the bottom (largest) layer of the flow pyramid + flow->width = frame_width >> DOWNSAMPLE_SHIFT; + 
flow->height = frame_height >> DOWNSAMPLE_SHIFT; + flow->stride = flow->width + 2 * FLOW_BORDER_OUTER; + + const size_t flow_size = + flow->stride * (size_t)(flow->height + 2 * FLOW_BORDER_OUTER); + + flow->buf0 = aom_calloc(2 * flow_size, sizeof(*flow->buf0)); + if (!flow->buf0) { + aom_free(flow); + return NULL; + } + + flow->u = flow->buf0 + FLOW_BORDER_OUTER * flow->stride + FLOW_BORDER_OUTER; + flow->v = flow->u + flow_size; + + return flow; +} + +static void free_flow_field(FlowField *flow) { + aom_free(flow->buf0); + aom_free(flow); +} + +// Compute flow field between `src` and `ref`, and then use that flow to +// compute a global motion model relating the two frames. +// +// Following the convention in flow_estimation.h, the flow vectors are computed +// at fixed points in `src` and point to the corresponding locations in `ref`, +// regardless of the temporal ordering of the frames. +bool av1_compute_global_motion_disflow(TransformationType type, + YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *ref, int bit_depth, + MotionModel *motion_models, + int num_motion_models, + bool *mem_alloc_failed) { + // Precompute information we will need about each frame + ImagePyramid *src_pyramid = src->y_pyramid; + CornerList *src_corners = src->corners; + ImagePyramid *ref_pyramid = ref->y_pyramid; + if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) { + *mem_alloc_failed = true; + return false; + } + if (!av1_compute_corner_list(src_pyramid, src_corners)) { + *mem_alloc_failed = true; + return false; + } + if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) { + *mem_alloc_failed = true; + return false; + } + + const int src_width = src_pyramid->layers[0].width; + const int src_height = src_pyramid->layers[0].height; + assert(ref_pyramid->layers[0].width == src_width); + assert(ref_pyramid->layers[0].height == src_height); + + FlowField *flow = alloc_flow_field(src_width, src_height); + if (!flow) { + *mem_alloc_failed = true; + return false; + } + + if (!compute_flow_field(src_pyramid, ref_pyramid, flow)) { + *mem_alloc_failed = true; + free_flow_field(flow); + return false; + } + + // find correspondences between the two images using the flow field + Correspondence *correspondences = + aom_malloc(src_corners->num_corners * sizeof(*correspondences)); + if (!correspondences) { + *mem_alloc_failed = true; + free_flow_field(flow); + return false; + } + + const int num_correspondences = determine_disflow_correspondence( + src_pyramid, ref_pyramid, src_corners, flow, correspondences); + + bool result = ransac(correspondences, num_correspondences, type, + motion_models, num_motion_models, mem_alloc_failed); + + aom_free(correspondences); + free_flow_field(flow); + return result; +} diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.h b/third_party/aom/aom_dsp/flow_estimation/disflow.h new file mode 100644 index 0000000000..ef877b638c --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/disflow.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_ +#define AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_ + +#include + +#include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_dsp/rect.h" +#include "aom_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Number of pyramid levels in disflow computation +#define DISFLOW_PYRAMID_LEVELS 12 + +// Size of square patches in the disflow dense grid +// Must be a power of 2 +#define DISFLOW_PATCH_SIZE_LOG2 3 +#define DISFLOW_PATCH_SIZE (1 << DISFLOW_PATCH_SIZE_LOG2) +// Center point of square patch +#define DISFLOW_PATCH_CENTER ((DISFLOW_PATCH_SIZE / 2) - 1) + +// Overall scale of the `dx`, `dy` and `dt` arrays in the disflow code +// In other words, the various derivatives are calculated with an internal +// precision of (8 + DISFLOW_DERIV_SCALE_LOG2) bits, from an 8-bit input. +// +// This must be carefully synchronized with the code in sobel_filter() +// (which fills the dx and dy arrays) and compute_flow_error() (which +// fills dt); see the comments in those functions for more details +#define DISFLOW_DERIV_SCALE_LOG2 3 +#define DISFLOW_DERIV_SCALE (1 << DISFLOW_DERIV_SCALE_LOG2) + +// Scale factor applied to each step in the main refinement loop +// +// This should be <= 1.0 to avoid overshoot. Values below 1.0 +// may help in some cases, but slow convergence overall, so +// will require careful tuning. +// TODO(rachelbarker): Tune this value +#define DISFLOW_STEP_SIZE 1.0 + +// Step size at which we should terminate iteration +// The idea here is that, if we take a step which is much smaller than 1px in +// size, then the values won't change much from iteration to iteration, so +// many future steps will also be small, and that won't have much effect +// on the ultimate result. So we can terminate early. +// +// To look at it another way, when we take a small step, that means that +// either we're near to convergence (so can stop), or we're stuck in a +// shallow valley and will take many iterations to get unstuck. +// +// Solving the latter properly requires fancier methods, such as "gradient +// descent with momentum". For now, we terminate to avoid wasting a ton of +// time on points which are either nearly-converged or stuck. +// +// Terminating at 1/8 px seems to give good results for global motion estimation +#define DISFLOW_STEP_SIZE_THRESOLD (1. / 8.) + +// Max number of iterations if warp convergence is not found +#define DISFLOW_MAX_ITR 4 + +// Internal precision of cubic interpolation filters +// The limiting factor here is that: +// * Before integerizing, the maximum value of any kernel tap is 1.0 +// * After integerizing, each tap must fit into an int16_t. +// Thus the largest multiplier we can get away with is 2^14 = 16384, +// as 2^15 = 32768 is too large to fit in an int16_t. 
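+//
+// As a worked example (values taken from the cubic kernels used in disflow):
+// for a fractional offset of 0.5 the kernel is
+//   { -1/16, 9/16, 9/16, -1/16 } = { -0.0625, 0.5625, 0.5625, -0.0625 },
+// which integerizes to { -1024, 9216, 9216, -1024 } at 14 bits of precision.
+// The largest possible tap, 1.0 (at a fractional offset of 0), becomes 16384,
+// which still fits comfortably in an int16_t.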
+#define DISFLOW_INTERP_BITS 14 + +typedef struct { + // Start of allocation for u and v buffers + double *buf0; + + // x and y directions of flow, per patch + double *u; + double *v; + + // Sizes of the above arrays + int width; + int height; + int stride; +} FlowField; + +bool av1_compute_global_motion_disflow(TransformationType type, + YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *ref, int bit_depth, + MotionModel *motion_models, + int num_motion_models, + bool *mem_alloc_failed); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_ diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c new file mode 100644 index 0000000000..0f47f86f55 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_dsp/flow_estimation/corner_match.h" +#include "aom_dsp/flow_estimation/disflow.h" +#include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" + +// For each global motion method, how many pyramid levels should we allocate? +// Note that this is a maximum, and fewer levels will be allocated if the frame +// is not large enough to need all of the specified levels +const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS] = { + 1, // GLOBAL_MOTION_METHOD_FEATURE_MATCH + 16, // GLOBAL_MOTION_METHOD_DISFLOW +}; + +// clang-format off +const double kIdentityParams[MAX_PARAMDIM] = { + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0 +}; +// clang-format on + +// Compute a global motion model between the given source and ref frames. +// +// As is standard for video codecs, the resulting model maps from (x, y) +// coordinates in `src` to the corresponding points in `ref`, regardless +// of the temporal order of the two frames. +// +// Returns true if global motion estimation succeeded, false if not. +// The output models should only be used if this function succeeds. 
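+//
+// Illustrative usage (a sketch, not part of the library API surface): `src`,
+// `ref` and `bit_depth` are assumed to come from the encoder, and each
+// MotionModel must have its `inliers` array allocated by the caller, large
+// enough for two ints per candidate correspondence (`max_correspondences` is
+// a hypothetical bound chosen by the caller):
+//
+//   MotionModel models[2] = { 0 };
+//   for (int i = 0; i < 2; i++) {
+//     models[i].inliers =
+//         aom_malloc(2 * max_correspondences * sizeof(*models[i].inliers));
+//   }
+//   bool mem_alloc_failed = false;
+//   if (aom_compute_global_motion(ROTZOOM, src, ref, bit_depth,
+//                                 GLOBAL_MOTION_METHOD_DISFLOW, models, 2,
+//                                 &mem_alloc_failed)) {
+//     // models[0].params now maps (x, y) in `src` to the matching point in
+//     // `ref`; models[i].num_inliers indicates how well each model is
+//     // supported by the correspondences.
+//   }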
+bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *ref, int bit_depth, + GlobalMotionMethod gm_method, + MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed) { + switch (gm_method) { + case GLOBAL_MOTION_METHOD_FEATURE_MATCH: + return av1_compute_global_motion_feature_match( + type, src, ref, bit_depth, motion_models, num_motion_models, + mem_alloc_failed); + case GLOBAL_MOTION_METHOD_DISFLOW: + return av1_compute_global_motion_disflow(type, src, ref, bit_depth, + motion_models, num_motion_models, + mem_alloc_failed); + default: assert(0 && "Unknown global motion estimation type"); + } + return false; +} diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h new file mode 100644 index 0000000000..2dfae24980 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_H_ +#define AOM_AOM_DSP_FLOW_ESTIMATION_H_ + +#include "aom_dsp/pyramid.h" +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_PARAMDIM 6 +#define MIN_INLIER_PROB 0.1 + +/* clang-format off */ +enum { + IDENTITY = 0, // identity transformation, 0-parameter + TRANSLATION = 1, // translational motion 2-parameter + ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter + AFFINE = 3, // affine, 6-parameter + TRANS_TYPES, +} UENUM1BYTE(TransformationType); +/* clang-format on */ + +// number of parameters used by each transformation in TransformationTypes +static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; + +// Available methods which can be used for global motion estimation +typedef enum { + GLOBAL_MOTION_METHOD_FEATURE_MATCH, + GLOBAL_MOTION_METHOD_DISFLOW, + GLOBAL_MOTION_METHOD_LAST = GLOBAL_MOTION_METHOD_DISFLOW, + GLOBAL_MOTION_METHODS +} GlobalMotionMethod; + +typedef struct { + double params[MAX_PARAMDIM]; + int *inliers; + int num_inliers; +} MotionModel; + +// Data structure to store a single correspondence point during global +// motion search. +// +// A correspondence (x, y) -> (rx, ry) means that point (x, y) in the +// source frame corresponds to point (rx, ry) in the ref frame. +typedef struct { + double x, y; + double rx, ry; +} Correspondence; + +// For each global motion method, how many pyramid levels should we allocate? +// Note that this is a maximum, and fewer levels will be allocated if the frame +// is not large enough to need all of the specified levels +extern const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS]; + +// Which global motion method should we use in practice? 
+// Disflow is both faster and gives better results than feature matching in +// practically all cases, so we use disflow by default +static const GlobalMotionMethod default_global_motion_method = + GLOBAL_MOTION_METHOD_DISFLOW; + +extern const double kIdentityParams[MAX_PARAMDIM]; + +// Compute a global motion model between the given source and ref frames. +// +// As is standard for video codecs, the resulting model maps from (x, y) +// coordinates in `src` to the corresponding points in `ref`, regardless +// of the temporal order of the two frames. +// +// Returns true if global motion estimation succeeded, false if not. +// The output models should only be used if this function succeeds. +bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *ref, int bit_depth, + GlobalMotionMethod gm_method, + MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_DSP_FLOW_ESTIMATION_H_ diff --git a/third_party/aom/aom_dsp/flow_estimation/ransac.c b/third_party/aom/aom_dsp/flow_estimation/ransac.c new file mode 100644 index 0000000000..b88a07b023 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/ransac.c @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "aom_dsp/flow_estimation/ransac.h" +#include "aom_dsp/mathutils.h" +#include "aom_mem/aom_mem.h" + +// TODO(rachelbarker): Remove dependence on code in av1/encoder/ +#include "av1/encoder/random.h" + +#define MAX_MINPTS 4 +#define MINPTS_MULTIPLIER 5 + +#define INLIER_THRESHOLD 1.25 +#define INLIER_THRESHOLD_SQUARED (INLIER_THRESHOLD * INLIER_THRESHOLD) +#define NUM_TRIALS 20 + +// Flag to enable functions for finding TRANSLATION type models. +// +// These modes are not considered currently due to a spec bug (see comments +// in gm_get_motion_vector() in av1/common/mv.h). Thus we don't need to compile +// the corresponding search functions, but it is nice to keep the source around +// but disabled, for completeness. 
+#define ALLOW_TRANSLATION_MODELS 0 + +//////////////////////////////////////////////////////////////////////////////// +// ransac +typedef bool (*IsDegenerateFunc)(double *p); +typedef bool (*FindTransformationFunc)(int points, const double *points1, + const double *points2, double *params); +typedef void (*ProjectPointsFunc)(const double *mat, const double *points, + double *proj, int n, int stride_points, + int stride_proj); + +// vtable-like structure which stores all of the information needed by RANSAC +// for a particular model type +typedef struct { + IsDegenerateFunc is_degenerate; + FindTransformationFunc find_transformation; + ProjectPointsFunc project_points; + int minpts; +} RansacModelInfo; + +#if ALLOW_TRANSLATION_MODELS +static void project_points_translation(const double *mat, const double *points, + double *proj, int n, int stride_points, + int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = x + mat[0]; + *(proj++) = y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} +#endif // ALLOW_TRANSLATION_MODELS + +static void project_points_affine(const double *mat, const double *points, + double *proj, int n, int stride_points, + int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = mat[2] * x + mat[3] * y + mat[0]; + *(proj++) = mat[4] * x + mat[5] * y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +#if ALLOW_TRANSLATION_MODELS +static bool find_translation(int np, const double *pts1, const double *pts2, + double *params) { + double sumx = 0; + double sumy = 0; + + for (int i = 0; i < np; ++i) { + double dx = *(pts2++); + double dy = *(pts2++); + double sx = *(pts1++); + double sy = *(pts1++); + + sumx += dx - sx; + sumy += dy - sy; + } + + params[0] = sumx / np; + params[1] = sumy / np; + params[2] = 1; + params[3] = 0; + params[4] = 0; + params[5] = 1; + return true; +} +#endif // ALLOW_TRANSLATION_MODELS + +static bool find_rotzoom(int np, const double *pts1, const double *pts2, + double *params) { + const int n = 4; // Size of least-squares problem + double mat[4 * 4]; // Accumulator for A'A + double y[4]; // Accumulator for A'b + double a[4]; // Single row of A + double b; // Single element of b + + least_squares_init(mat, y, n); + for (int i = 0; i < np; ++i) { + double dx = *(pts2++); + double dy = *(pts2++); + double sx = *(pts1++); + double sy = *(pts1++); + + a[0] = 1; + a[1] = 0; + a[2] = sx; + a[3] = sy; + b = dx; + least_squares_accumulate(mat, y, a, b, n); + + a[0] = 0; + a[1] = 1; + a[2] = sy; + a[3] = -sx; + b = dy; + least_squares_accumulate(mat, y, a, b, n); + } + + // Fill in params[0] .. params[3] with output model + if (!least_squares_solve(mat, y, params, n)) { + return false; + } + + // Fill in remaining parameters + params[4] = -params[3]; + params[5] = params[2]; + + return true; +} + +static bool find_affine(int np, const double *pts1, const double *pts2, + double *params) { + // Note: The least squares problem for affine models is 6-dimensional, + // but it splits into two independent 3-dimensional subproblems. + // Solving these two subproblems separately and recombining at the end + // results in less total computation than solving the 6-dimensional + // problem directly. + // + // The two subproblems correspond to all the parameters which contribute + // to the x output of the model, and all the parameters which contribute + // to the y output, respectively. 
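+  //
+  // Concretely, with the parameterization used by project_points_affine()
+  // (x' = p2 * x + p3 * y + p0,  y' = p4 * x + p5 * y + p1), each
+  // correspondence (sx, sy) -> (dx, dy) contributes the row a = (1, sx, sy)
+  // to two independent 3x3 normal-equation systems:
+  //
+  //   (A^T A) (p0, p2, p3)^T = A^T dx    and    (A^T A) (p1, p4, p5)^T = A^T dy
+  //
+  // Both systems are built from the same rows a; only the right-hand sides
+  // (dx vs. dy) differ.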
+ + const int n = 3; // Size of each least-squares problem + double mat[2][3 * 3]; // Accumulator for A'A + double y[2][3]; // Accumulator for A'b + double x[2][3]; // Output vector + double a[2][3]; // Single row of A + double b[2]; // Single element of b + + least_squares_init(mat[0], y[0], n); + least_squares_init(mat[1], y[1], n); + for (int i = 0; i < np; ++i) { + double dx = *(pts2++); + double dy = *(pts2++); + double sx = *(pts1++); + double sy = *(pts1++); + + a[0][0] = 1; + a[0][1] = sx; + a[0][2] = sy; + b[0] = dx; + least_squares_accumulate(mat[0], y[0], a[0], b[0], n); + + a[1][0] = 1; + a[1][1] = sx; + a[1][2] = sy; + b[1] = dy; + least_squares_accumulate(mat[1], y[1], a[1], b[1], n); + } + + if (!least_squares_solve(mat[0], y[0], x[0], n)) { + return false; + } + if (!least_squares_solve(mat[1], y[1], x[1], n)) { + return false; + } + + // Rearrange least squares result to form output model + params[0] = x[0][0]; + params[1] = x[1][0]; + params[2] = x[0][1]; + params[3] = x[0][2]; + params[4] = x[1][1]; + params[5] = x[1][2]; + + return true; +} + +typedef struct { + int num_inliers; + double sse; // Sum of squared errors of inliers + int *inlier_indices; +} RANSAC_MOTION; + +// Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise. +static int compare_motions(const void *arg_a, const void *arg_b) { + const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a; + const RANSAC_MOTION *motion_b = (RANSAC_MOTION *)arg_b; + + if (motion_a->num_inliers > motion_b->num_inliers) return -1; + if (motion_a->num_inliers < motion_b->num_inliers) return 1; + if (motion_a->sse < motion_b->sse) return -1; + if (motion_a->sse > motion_b->sse) return 1; + return 0; +} + +static bool is_better_motion(const RANSAC_MOTION *motion_a, + const RANSAC_MOTION *motion_b) { + return compare_motions(motion_a, motion_b) < 0; +} + +static void copy_points_at_indices(double *dest, const double *src, + const int *indices, int num_points) { + for (int i = 0; i < num_points; ++i) { + const int index = indices[i]; + dest[i * 2] = src[index * 2]; + dest[i * 2 + 1] = src[index * 2 + 1]; + } +} + +// Returns true on success, false on error +static bool ransac_internal(const Correspondence *matched_points, int npoints, + MotionModel *motion_models, int num_desired_motions, + const RansacModelInfo *model_info, + bool *mem_alloc_failed) { + assert(npoints >= 0); + int i = 0; + int minpts = model_info->minpts; + bool ret_val = true; + + unsigned int seed = (unsigned int)npoints; + + int indices[MAX_MINPTS] = { 0 }; + + double *points1, *points2; + double *corners1, *corners2; + double *projected_corners; + + // Store information for the num_desired_motions best transformations found + // and the worst motion among them, as well as the motion currently under + // consideration. + RANSAC_MOTION *motions, *worst_kept_motion = NULL; + RANSAC_MOTION current_motion; + + // Store the parameters and the indices of the inlier points for the motion + // currently under consideration. 
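+  // A note on the thresholds applied below: for the ROTZOOM and AFFINE models
+  // minpts is 3 (see ransac_model_info), so at least
+  // 3 * MINPTS_MULTIPLIER = 15 correspondences are required before RANSAC is
+  // attempted at all, and a fitted model is only kept if at least
+  // max(MIN_INLIER_PROB * npoints, 3) points reproject to within
+  // INLIER_THRESHOLD = 1.25 pixels of their matched position.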
+  double params_this_motion[MAX_PARAMDIM];
+
+  if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) {
+    return false;
+  }
+
+  int min_inliers = AOMMAX((int)(MIN_INLIER_PROB * npoints), minpts);
+
+  points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2);
+  points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2);
+  corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2);
+  corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2);
+  projected_corners =
+      (double *)aom_malloc(sizeof(*projected_corners) * npoints * 2);
+  motions =
+      (RANSAC_MOTION *)aom_calloc(num_desired_motions, sizeof(RANSAC_MOTION));
+
+  // Allocate one large buffer which will be carved up to store the inlier
+  // indices for the current motion plus the num_desired_motions many
+  // output models
+  // This allows us to keep the allocation/deallocation logic simple, without
+  // having to (for example) check that `motions` is non-null before allocating
+  // the inlier arrays
+  int *inlier_buffer = (int *)aom_malloc(sizeof(*inlier_buffer) * npoints *
+                                         (num_desired_motions + 1));
+
+  if (!(points1 && points2 && corners1 && corners2 && projected_corners &&
+        motions && inlier_buffer)) {
+    ret_val = false;
+    *mem_alloc_failed = true;
+    goto finish_ransac;
+  }
+
+  // Once all our allocations are known-good, we can fill in our structures
+  worst_kept_motion = motions;
+
+  for (i = 0; i < num_desired_motions; ++i) {
+    motions[i].inlier_indices = inlier_buffer + i * npoints;
+  }
+  memset(&current_motion, 0, sizeof(current_motion));
+  current_motion.inlier_indices =
+      inlier_buffer + num_desired_motions * npoints;
+
+  for (i = 0; i < npoints; ++i) {
+    corners1[2 * i + 0] = matched_points[i].x;
+    corners1[2 * i + 1] = matched_points[i].y;
+    corners2[2 * i + 0] = matched_points[i].rx;
+    corners2[2 * i + 1] = matched_points[i].ry;
+  }
+
+  for (int trial_count = 0; trial_count < NUM_TRIALS; trial_count++) {
+    lcg_pick(npoints, minpts, indices, &seed);
+
+    copy_points_at_indices(points1, corners1, indices, minpts);
+    copy_points_at_indices(points2, corners2, indices, minpts);
+
+    if (model_info->is_degenerate(points1)) {
+      continue;
+    }
+
+    if (!model_info->find_transformation(minpts, points1, points2,
+                                         params_this_motion)) {
+      continue;
+    }
+
+    model_info->project_points(params_this_motion, corners1, projected_corners,
+                               npoints, 2, 2);
+
+    current_motion.num_inliers = 0;
+    double sse = 0.0;
+    for (i = 0; i < npoints; ++i) {
+      double dx = projected_corners[i * 2] - corners2[i * 2];
+      double dy = projected_corners[i * 2 + 1] - corners2[i * 2 + 1];
+      double squared_error = dx * dx + dy * dy;
+
+      if (squared_error < INLIER_THRESHOLD_SQUARED) {
+        current_motion.inlier_indices[current_motion.num_inliers++] = i;
+        sse += squared_error;
+      }
+    }
+
+    if (current_motion.num_inliers < min_inliers) {
+      // Reject models with too few inliers
+      continue;
+    }
+
+    current_motion.sse = sse;
+    if (is_better_motion(&current_motion, worst_kept_motion)) {
+      // This motion is better than the worst currently kept motion. Remember
+      // the inlier points and sse. The parameters for each kept motion
+      // will be recomputed later using only the inliers.
+      worst_kept_motion->num_inliers = current_motion.num_inliers;
+      worst_kept_motion->sse = current_motion.sse;
+
+      // Rather than copying the (potentially many) inlier indices from
+      // current_motion.inlier_indices to worst_kept_motion->inlier_indices,
+      // we can swap the underlying pointers.
+ // + // This is okay because the next time current_motion.inlier_indices + // is used will be in the next trial, where we ignore its previous + // contents anyway. And both arrays will be deallocated together at the + // end of this function, so there are no lifetime issues. + int *tmp = worst_kept_motion->inlier_indices; + worst_kept_motion->inlier_indices = current_motion.inlier_indices; + current_motion.inlier_indices = tmp; + + // Determine the new worst kept motion and its num_inliers and sse. + for (i = 0; i < num_desired_motions; ++i) { + if (is_better_motion(worst_kept_motion, &motions[i])) { + worst_kept_motion = &motions[i]; + } + } + } + } + + // Sort the motions, best first. + qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); + + // Recompute the motions using only the inliers. + for (i = 0; i < num_desired_motions; ++i) { + int num_inliers = motions[i].num_inliers; + if (num_inliers > 0) { + assert(num_inliers >= minpts); + + copy_points_at_indices(points1, corners1, motions[i].inlier_indices, + num_inliers); + copy_points_at_indices(points2, corners2, motions[i].inlier_indices, + num_inliers); + + if (!model_info->find_transformation(num_inliers, points1, points2, + motion_models[i].params)) { + // In the unlikely event that this model fitting fails, + // we don't have a good fallback. So just clear the output + // model and move on + memcpy(motion_models[i].params, kIdentityParams, + MAX_PARAMDIM * sizeof(*(motion_models[i].params))); + motion_models[i].num_inliers = 0; + continue; + } + + // Populate inliers array + for (int j = 0; j < num_inliers; j++) { + int index = motions[i].inlier_indices[j]; + const Correspondence *corr = &matched_points[index]; + motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); + motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); + } + motion_models[i].num_inliers = num_inliers; + } else { + memcpy(motion_models[i].params, kIdentityParams, + MAX_PARAMDIM * sizeof(*(motion_models[i].params))); + motion_models[i].num_inliers = 0; + } + } + +finish_ransac: + aom_free(inlier_buffer); + aom_free(motions); + aom_free(projected_corners); + aom_free(corners2); + aom_free(corners1); + aom_free(points2); + aom_free(points1); + + return ret_val; +} + +static bool is_collinear3(double *p1, double *p2, double *p3) { + static const double collinear_eps = 1e-3; + const double v = + (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]); + return fabs(v) < collinear_eps; +} + +#if ALLOW_TRANSLATION_MODELS +static bool is_degenerate_translation(double *p) { + return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2; +} +#endif // ALLOW_TRANSLATION_MODELS + +static bool is_degenerate_affine(double *p) { + return is_collinear3(p, p + 2, p + 4); +} + +static const RansacModelInfo ransac_model_info[TRANS_TYPES] = { + // IDENTITY + { NULL, NULL, NULL, 0 }, +// TRANSLATION +#if ALLOW_TRANSLATION_MODELS + { is_degenerate_translation, find_translation, project_points_translation, + 3 }, +#else + { NULL, NULL, NULL, 0 }, +#endif + // ROTZOOM + { is_degenerate_affine, find_rotzoom, project_points_affine, 3 }, + // AFFINE + { is_degenerate_affine, find_affine, project_points_affine, 3 }, +}; + +// Returns true on success, false on error +bool ransac(const Correspondence *matched_points, int npoints, + TransformationType type, MotionModel *motion_models, + int num_desired_motions, bool *mem_alloc_failed) { +#if ALLOW_TRANSLATION_MODELS + assert(type > IDENTITY && type < TRANS_TYPES); +#else + 
assert(type > TRANSLATION && type < TRANS_TYPES); +#endif // ALLOW_TRANSLATION_MODELS + + return ransac_internal(matched_points, npoints, motion_models, + num_desired_motions, &ransac_model_info[type], + mem_alloc_failed); +} diff --git a/third_party/aom/aom_dsp/flow_estimation/ransac.h b/third_party/aom/aom_dsp/flow_estimation/ransac.h new file mode 100644 index 0000000000..0529b6e13c --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/ransac.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_ +#define AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_ + +#include +#include +#include +#include +#include + +#include "aom_dsp/flow_estimation/flow_estimation.h" + +#ifdef __cplusplus +extern "C" { +#endif + +bool ransac(const Correspondence *matched_points, int npoints, + TransformationType type, MotionModel *motion_models, + int num_desired_motions, bool *mem_alloc_failed); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_ diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c new file mode 100644 index 0000000000..87c76fa13b --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/flow_estimation/corner_match.h" + +DECLARE_ALIGNED(16, static const uint8_t, + byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0, 0, 0 }; +#if MATCH_SZ != 13 +#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +#endif + +/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the +correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows +of each image, centered at (x1, y1) and (x2, y2) respectively. 
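+
+For reference, the value returned by this kernel (and by the matching SSE4.1
+kernel in corner_match_sse4.c) corresponds to the following scalar sketch;
+this helper is hypothetical and only illustrates which sums are accumulated:
+
+  int sum1 = 0, sum2 = 0, sumsq2 = 0, cross = 0;
+  for (int i = 0; i < MATCH_SZ; ++i) {
+    for (int j = 0; j < MATCH_SZ; ++j) {
+      const int v1 =
+          frame1[(y1 - MATCH_SZ_BY2 + i) * stride1 + (x1 - MATCH_SZ_BY2 + j)];
+      const int v2 =
+          frame2[(y2 - MATCH_SZ_BY2 + i) * stride2 + (x2 - MATCH_SZ_BY2 + j)];
+      sum1 += v1;
+      sum2 += v2;
+      sumsq2 += v2 * v2;
+      cross += v1 * v2;
+    }
+  }
+  const int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+  const int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+  return cov / sqrt((double)var2);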
+*/ +double av1_compute_cross_correlation_avx2(const unsigned char *frame1, + int stride1, int x1, int y1, + const unsigned char *frame2, + int stride2, int x2, int y2) { + int i, stride1_i = 0, stride2_i = 0; + __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1; + const __m128i mask = _mm_load_si128((__m128i *)byte_mask); + const __m256i zero = _mm256_setzero_si256(); + __m128i v1, v2; + + sum_vec = zero; + sumsq2_vec = zero; + cross_vec = zero; + + frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); + frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); + + for (i = 0; i < MATCH_SZ; ++i) { + v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[stride1_i]), mask); + v1_1 = _mm256_cvtepu8_epi16(v1); + v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[stride2_i]), mask); + v2_1 = _mm256_cvtepu8_epi16(v2); + + v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1); + sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1)); + + sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero)); + cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1)); + stride1_i += stride1; + stride2_i += stride2; + } + __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8); + sum_vec = _mm256_add_epi32(sum_vec, sum_vec1); + int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec)); + int sum2_acc = _mm256_extract_epi32(sum_vec, 4); + + __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec); + __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec); + temp1 = _mm256_add_epi32(unp_low, unp_hig); + + __m128i low_sumsq = _mm256_castsi256_si128(temp1); + low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1)); + low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32)); + int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq); + int cross_acc = _mm_extract_epi32(low_sumsq, 2); + + int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc; + int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc; + return cov / sqrt((double)var2); +} diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c new file mode 100644 index 0000000000..b3cb5bc5fd --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/flow_estimation/corner_match.h" + +DECLARE_ALIGNED(16, static const uint8_t, + byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0, 0, 0 }; +#if MATCH_SZ != 13 +#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +#endif + +/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the + correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows + of each image, centered at (x1, y1) and (x2, y2) respectively. 
+*/ +double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, + int stride1, int x1, int y1, + const unsigned char *frame2, + int stride2, int x2, int y2) { + int i; + // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0, + // 2) + __m128i sum1_vec = _mm_setzero_si128(); + __m128i sum2_vec = _mm_setzero_si128(); + // 4 32-bit partial sums of squares + __m128i sumsq2_vec = _mm_setzero_si128(); + __m128i cross_vec = _mm_setzero_si128(); + + const __m128i mask = _mm_load_si128((__m128i *)byte_mask); + const __m128i zero = _mm_setzero_si128(); + + frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); + frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); + + for (i = 0; i < MATCH_SZ; ++i) { + const __m128i v1 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[i * stride1]), mask); + const __m128i v2 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[i * stride2]), mask); + + // Using the 'sad' intrinsic here is a bit faster than adding + // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit + // conversion step later, for a net speedup of ~10% + sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero)); + sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero)); + + const __m128i v1_l = _mm_cvtepu8_epi16(v1); + const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); + const __m128i v2_l = _mm_cvtepu8_epi16(v2); + const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); + + sumsq2_vec = _mm_add_epi32( + sumsq2_vec, + _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r))); + cross_vec = _mm_add_epi32( + cross_vec, + _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r))); + } + + // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec, + // cross_vec) + // as holding 4 32-bit elements each, which we want to sum horizontally. + // We do this by transposing and then summing vertically. + __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec); + __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec); + __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec); + __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec); + + __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2); + __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2); + __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3); + __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3); + + __m128i res = + _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7)); + + int sum1 = _mm_extract_epi32(res, 0); + int sum2 = _mm_extract_epi32(res, 1); + int sumsq2 = _mm_extract_epi32(res, 2); + int cross = _mm_extract_epi32(res, 3); + + int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + int cov = cross * MATCH_SZ_SQ - sum1 * sum2; + return cov / sqrt((double)var2); +} diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c new file mode 100644 index 0000000000..d2b04c1973 --- /dev/null +++ b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c @@ -0,0 +1,558 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 3-Clause Clear License + * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear + * License was not distributed with this source code in the LICENSE file, you + * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. 
If the + * Alliance for Open Media Patent License 1.0 was not distributed with this + * source code in the PATENTS file, you can obtain it at + * aomedia.org/license/patent-license/. + */ + +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/flow_estimation/disflow.h" +#include "aom_dsp/x86/synonyms.h" + +#include "config/aom_dsp_rtcd.h" + +// Internal cross-check against C code +// If you set this to 1 and compile in debug mode, then the outputs of the two +// convolution stages will be checked against the plain C version of the code, +// and an assertion will be fired if the results differ. +#define CHECK_RESULTS 0 + +// Note: Max sum(+ve coefficients) = 1.125 * scale +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { + // Check that the fractional position is in range. + // + // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. + // Mathematically, this implies that 0 <= x < 1. However, in practice it is + // possible to have x == 1 due to floating point rounding. This is fine, + // and we still interpolate correctly if we allow x = 1. + assert(0 <= x && x <= 1); + + double x2 = x * x; + double x3 = x2 * x; + kernel[0] = -0.5 * x + x2 - 0.5 * x3; + kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; + kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; + kernel[3] = -0.5 * x2 + 0.5 * x3; +} + +static INLINE void get_cubic_kernel_int(double x, int16_t kernel[4]) { + double kernel_dbl[4]; + get_cubic_kernel_dbl(x, kernel_dbl); + + kernel[0] = (int16_t)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); + kernel[1] = (int16_t)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); + kernel[2] = (int16_t)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); + kernel[3] = (int16_t)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); +} + +#if CHECK_RESULTS +static INLINE int get_cubic_value_int(const int *p, const int16_t kernel[4]) { + return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + + kernel[3] * p[3]; +} +#endif // CHECK_RESULTS + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. +// +// TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation +// instad of bicubic interpolation +static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, + const int16_t *dx, const int16_t *dy, + int *b) { + // This function is written to do 8x8 convolutions only + assert(DISFLOW_PATCH_SIZE == 8); + + // Accumulate 4 32-bit partial sums for each element of b + // These will be flattened at the end. 
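+  //
+  // In scalar terms (this is what the CHECK_RESULTS path below verifies), the
+  // two outputs are
+  //   b[0] = sum over the 8x8 patch of dx(i, j) * dt(i, j)
+  //   b[1] = sum over the 8x8 patch of dy(i, j) * dt(i, j)
+  // where dt is the difference between the bicubically interpolated patch of
+  // `ref` at (x + u, y + v) and the corresponding patch of `src`, and dx, dy
+  // are the Sobel derivative arrays passed in by the caller.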
+ __m128i b0_acc = _mm_setzero_si128(); + __m128i b1_acc = _mm_setzero_si128(); +#if CHECK_RESULTS + // Also keep a running sum using the C algorithm, for cross-checking + int c_result[2] = { 0 }; +#endif // CHECK_RESULTS + + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + int16_t h_kernel[4]; + int16_t v_kernel[4]; + get_cubic_kernel_int(u_frac, h_kernel); + get_cubic_kernel_int(v_frac, v_kernel); + + // Storage for intermediate values between the two convolution directions + int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). + const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution + + // Prepare the kernel vectors + // We split the kernel into two vectors with kernel indices: + // 0, 1, 0, 1, 0, 1, 0, 1, and + // 2, 3, 2, 3, 2, 3, 2, 3 + __m128i h_kernel_01 = xx_set2_epi16(h_kernel[0], h_kernel[1]); + __m128i h_kernel_23 = xx_set2_epi16(h_kernel[2], h_kernel[3]); + + __m128i round_const_h = _mm_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); + + for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; ++i) { + const int y_w = y0 + i; + const uint8_t *ref_row = &ref[y_w * stride + (x0 - 1)]; + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load this row of pixels. + // For an 8x8 patch, we need to load the 8 image pixels + 3 extras, + // for a total of 11 pixels. Here we load 16 pixels, but only use + // the first 11. + __m128i row = _mm_loadu_si128((__m128i *)ref_row); + + // Expand pixels to int16s + __m128i px_0to7_i16 = _mm_cvtepu8_epi16(row); + __m128i px_4to10_i16 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 4)); + + // Relevant multiply instruction + // This multiplies pointwise, then sums in pairs. 
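+    // For example, _mm_madd_epi16({a0, ..., a7}, {b0, ..., b7}) returns the
+    // four 32-bit values {a0*b0 + a1*b1, a2*b2 + a3*b3, a4*b4 + a5*b5,
+    // a6*b6 + a7*b7}, which is exactly the pairing needed for a 4-tap filter
+    // split across two 2-tap kernel vectors.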
+ //_mm_madd_epi16(); + + // Compute first four outputs + // input pixels 0, 1, 1, 2, 2, 3, 3, 4 + // * kernel 0, 1, 0, 1, 0, 1, 0, 1 + __m128i px0 = + _mm_unpacklo_epi16(px_0to7_i16, _mm_srli_si128(px_0to7_i16, 2)); + // input pixels 2, 3, 3, 4, 4, 5, 5, 6 + // * kernel 2, 3, 2, 3, 2, 3, 2, 3 + __m128i px1 = _mm_unpacklo_epi16(_mm_srli_si128(px_0to7_i16, 4), + _mm_srli_si128(px_0to7_i16, 6)); + // Convolve with kernel and sum 2x2 boxes to form first 4 outputs + __m128i sum0 = _mm_add_epi32(_mm_madd_epi16(px0, h_kernel_01), + _mm_madd_epi16(px1, h_kernel_23)); + + __m128i out0 = _mm_srai_epi32(_mm_add_epi32(sum0, round_const_h), + DISFLOW_INTERP_BITS - 6); + + // Compute second four outputs + __m128i px2 = + _mm_unpacklo_epi16(px_4to10_i16, _mm_srli_si128(px_4to10_i16, 2)); + __m128i px3 = _mm_unpacklo_epi16(_mm_srli_si128(px_4to10_i16, 4), + _mm_srli_si128(px_4to10_i16, 6)); + __m128i sum1 = _mm_add_epi32(_mm_madd_epi16(px2, h_kernel_01), + _mm_madd_epi16(px3, h_kernel_23)); + + // Round by just enough bits that the result is + // guaranteed to fit into an i16. Then the next stage can use 16 x 16 -> 32 + // bit multiplies, which should be a fair bit faster than 32 x 32 -> 32 + // as it does now + // This means shifting down so we have 6 extra bits, for a maximum value + // of +18360, which can occur if u_frac == 0.5 and the input pixels are + // {0, 255, 255, 0}. + __m128i out1 = _mm_srai_epi32(_mm_add_epi32(sum1, round_const_h), + DISFLOW_INTERP_BITS - 6); + + _mm_storeu_si128((__m128i *)tmp_row, _mm_packs_epi32(out0, out1)); + +#if CHECK_RESULTS && !defined(NDEBUG) + // Cross-check + for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { + const int x_w = x0 + j; + int arr[4]; + + arr[0] = (int)ref[y_w * stride + (x_w - 1)]; + arr[1] = (int)ref[y_w * stride + (x_w + 0)]; + arr[2] = (int)ref[y_w * stride + (x_w + 1)]; + arr[3] = (int)ref[y_w * stride + (x_w + 2)]; + + // Apply kernel and round, keeping 6 extra bits of precision. + // + // 6 is the maximum allowable number of extra bits which will avoid + // the intermediate values overflowing an int16_t. The most extreme + // intermediate value occurs when: + // * The input pixels are [0, 255, 255, 0] + // * u_frac = 0.5 + // In this case, the un-scaled output is 255 * 1.125 = 286.875. + // As an integer with 6 fractional bits, that is 18360, which fits + // in an int16_t. But with 7 fractional bits it would be 36720, + // which is too large. + const int c_value = ROUND_POWER_OF_TWO(get_cubic_value_int(arr, h_kernel), + DISFLOW_INTERP_BITS - 6); + (void)c_value; // Suppress warnings + assert(tmp_row[j] == c_value); + } +#endif // CHECK_RESULTS + } + + // Vertical convolution + const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; + __m128i round_const_v = _mm_set1_epi32(1 << (round_bits - 1)); + + __m128i v_kernel_01 = xx_set2_epi16(v_kernel[0], v_kernel[1]); + __m128i v_kernel_23 = xx_set2_epi16(v_kernel[2], v_kernel[3]); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load 4 rows of 8 x 16-bit values + __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); + __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row); + __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); + __m128i px3 = + _mm_loadu_si128((__m128i *)(tmp_row + 2 * DISFLOW_PATCH_SIZE)); + + // We want to calculate px0 * v_kernel[0] + px1 * v_kernel[1] + ... , + // but each multiply expands its output to 32 bits. 
So we need to be + // a little clever about how we do this + __m128i sum0 = _mm_add_epi32( + _mm_madd_epi16(_mm_unpacklo_epi16(px0, px1), v_kernel_01), + _mm_madd_epi16(_mm_unpacklo_epi16(px2, px3), v_kernel_23)); + __m128i sum1 = _mm_add_epi32( + _mm_madd_epi16(_mm_unpackhi_epi16(px0, px1), v_kernel_01), + _mm_madd_epi16(_mm_unpackhi_epi16(px2, px3), v_kernel_23)); + + __m128i sum0_rounded = + _mm_srai_epi32(_mm_add_epi32(sum0, round_const_v), round_bits); + __m128i sum1_rounded = + _mm_srai_epi32(_mm_add_epi32(sum1, round_const_v), round_bits); + + __m128i warped = _mm_packs_epi32(sum0_rounded, sum1_rounded); + __m128i src_pixels_u8 = + _mm_loadl_epi64((__m128i *)&src[(y + i) * stride + x]); + __m128i src_pixels = _mm_slli_epi16(_mm_cvtepu8_epi16(src_pixels_u8), 3); + + // Calculate delta from the target patch + __m128i dt = _mm_sub_epi16(warped, src_pixels); + + // Load 8 elements each of dx and dt, to pair with the 8 elements of dt + // that we have just computed. Then compute 8 partial sums of dx * dt + // and dy * dt, implicitly sum to give 4 partial sums of each, and + // accumulate. + __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * DISFLOW_PATCH_SIZE]); + __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]); + b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt)); + b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt)); + +#if CHECK_RESULTS + int16_t dt_arr[8]; + memcpy(dt_arr, &dt, 8 * sizeof(*dt_arr)); + for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { + int16_t *p = &tmp[i * DISFLOW_PATCH_SIZE + j]; + int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE], + p[2 * DISFLOW_PATCH_SIZE] }; + const int result = get_cubic_value_int(arr, v_kernel); + + // Apply kernel and round. + // This time, we have to round off the 6 extra bits which were kept + // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits + // of precision to match the scale of the dx and dy arrays. + const int c_warped = ROUND_POWER_OF_TWO(result, round_bits); + const int c_src_px = src[(x + j) + (y + i) * stride] << 3; + const int c_dt = c_warped - c_src_px; + + assert(dt_arr[j] == c_dt); + + c_result[0] += dx[i * DISFLOW_PATCH_SIZE + j] * c_dt; + c_result[1] += dy[i * DISFLOW_PATCH_SIZE + j] * c_dt; + } +#endif // CHECK_RESULTS + } + + // Flatten the two sets of partial sums to find the final value of b + // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc). + // We need to do 6 additions in total; a `hadd` instruction can take care + // of four of them, leaving two scalar additions. 
+ __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc); + b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); + b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); + +#if CHECK_RESULTS + assert(b[0] == c_result[0]); + assert(b[1] == c_result[1]); +#endif // CHECK_RESULTS +} + +static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; +#if CHECK_RESULTS + const int taps = 3; +#endif // CHECK_RESULTS + + // Horizontal filter + // As the kernel is simply {1, 0, -1}, we implement this as simply + // out[x] = image[x-1] - image[x+1] + // rather than doing a "proper" convolution operation + for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { + const uint8_t *src_row = src + y * src_stride; + int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; + + // Load pixels and expand to 16 bits + __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); + __m128i px0 = _mm_cvtepu8_epi16(row); + __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); + + __m128i out = _mm_sub_epi16(px0, px2); + + // Store to intermediate array + _mm_storeu_si128((__m128i *)tmp_row, out); + +#if CHECK_RESULTS + // Cross-check + static const int16_t h_kernel[3] = { 1, 0, -1 }; + for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { + int sum = 0; + for (int k = 0; k < taps; ++k) { + sum += h_kernel[k] * src_row[x + k - 1]; + } + (void)sum; + assert(tmp_row[x] == sum); + } +#endif // CHECK_RESULTS + } + + // Vertical filter + // Here the kernel is {1, 2, 1}, which can be implemented + // with simple sums rather than multiplies and adds. + // In order to minimize dependency chains, we evaluate in the order + // (image[y - 1] + image[y + 1]) + (image[y] << 1) + // This way, the first addition and the shift can happen in parallel + for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { + const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; + int16_t *dst_row = dst + y * dst_stride; + + __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); + __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row); + __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); + + __m128i out = + _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); + + _mm_storeu_si128((__m128i *)dst_row, out); + +#if CHECK_RESULTS + static const int16_t v_kernel[3] = { 1, 2, 1 }; + for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { + int sum = 0; + for (int k = 0; k < taps; ++k) { + sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; + } + (void)sum; + assert(dst_row[x] == sum); + } +#endif // CHECK_RESULTS + } +} + +static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, + int16_t *dst, int dst_stride) { + int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; + int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; +#if CHECK_RESULTS + const int taps = 3; +#endif // CHECK_RESULTS + + // Horizontal filter + // Here the kernel is {1, 2, 1}, which can be implemented + // with simple sums rather than multiplies and adds. 
+ // In order to minimize dependency chains, we evaluate in the order + // (image[y - 1] + image[y + 1]) + (image[y] << 1) + // This way, the first addition and the shift can happen in parallel + for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { + const uint8_t *src_row = src + y * src_stride; + int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; + + // Load pixels and expand to 16 bits + __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); + __m128i px0 = _mm_cvtepu8_epi16(row); + __m128i px1 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); + __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); + + __m128i out = + _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); + + // Store to intermediate array + _mm_storeu_si128((__m128i *)tmp_row, out); + +#if CHECK_RESULTS + // Cross-check + static const int16_t h_kernel[3] = { 1, 2, 1 }; + for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { + int sum = 0; + for (int k = 0; k < taps; ++k) { + sum += h_kernel[k] * src_row[x + k - 1]; + } + (void)sum; + assert(tmp_row[x] == sum); + } +#endif // CHECK_RESULTS + } + + // Vertical filter + // As the kernel is simply {1, 0, -1}, we implement this as simply + // out[x] = image[x-1] - image[x+1] + // rather than doing a "proper" convolution operation + for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { + const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; + int16_t *dst_row = dst + y * dst_stride; + + __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); + __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); + + __m128i out = _mm_sub_epi16(px0, px2); + + _mm_storeu_si128((__m128i *)dst_row, out); + +#if CHECK_RESULTS + static const int16_t v_kernel[3] = { 1, 0, -1 }; + for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { + int sum = 0; + for (int k = 0; k < taps; ++k) { + sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; + } + (void)sum; + assert(dst_row[x] == sum); + } +#endif // CHECK_RESULTS + } +} + +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M) { + __m128i acc[4] = { 0 }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * dx_stride]); + __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * dy_stride]); + + acc[0] = _mm_add_epi32(acc[0], _mm_madd_epi16(dx_row, dx_row)); + acc[1] = _mm_add_epi32(acc[1], _mm_madd_epi16(dx_row, dy_row)); + // Don't compute acc[2], as it should be equal to acc[1] + acc[3] = _mm_add_epi32(acc[3], _mm_madd_epi16(dy_row, dy_row)); + } + + // Condense sums + __m128i partial_sum_0 = _mm_hadd_epi32(acc[0], acc[1]); + __m128i partial_sum_1 = _mm_hadd_epi32(acc[1], acc[3]); + __m128i result = _mm_hadd_epi32(partial_sum_0, partial_sum_1); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). + // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. 
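+  //
+  // Taken together with compute_flow_vector(), each refinement iteration in
+  // aom_compute_flow_at_point_sse4_1() solves the 2x2 normal equations of a
+  // Lucas-Kanade style step:
+  //
+  //   M = [ sum dx*dx   sum dx*dy ] + I,    b = [ sum dx*dt ]
+  //       [ sum dx*dy   sum dy*dy ]             [ sum dy*dt ]
+  //
+  //   (step_u, step_v)^T = M^-1 * b
+  //
+  // with the step scaled by DISFLOW_STEP_SIZE and clamped before being added
+  // to the current flow estimate.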
+ result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); + +#if CHECK_RESULTS + int tmp[4] = { 0 }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) { + tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; + tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; + // Don't compute tmp[2], as it should be equal to tmp[1] + tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; + } + } + + // Apply regularization + tmp[0] += 1; + tmp[3] += 1; + + tmp[2] = tmp[1]; + + assert(tmp[0] == _mm_extract_epi32(result, 0)); + assert(tmp[1] == _mm_extract_epi32(result, 1)); + assert(tmp[2] == _mm_extract_epi32(result, 2)); + assert(tmp[3] == _mm_extract_epi32(result, 3)); +#endif // CHECK_RESULTS + + // Convert results to doubles and store + _mm_storeu_pd(M, _mm_cvtepi32_pd(result)); + _mm_storeu_pd(M + 2, _mm_cvtepi32_pd(_mm_srli_si128(result, 8))); +} + +// Try to invert the matrix M +// Note: Due to the nature of how a least-squares matrix is constructed, all of +// the eigenvalues will be >= 0, and therefore det M >= 0 as well. +// The regularization term `+ k * I` further ensures that det M >= k^2. +// As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. +// So we don't have to worry about non-invertible matrices here. +static INLINE void invert_2x2(const double *M, double *M_inv) { + double det = (M[0] * M[3]) - (M[1] * M[2]); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M[3] * det_inv; + M_inv[1] = -M[1] * det_inv; + M_inv[2] = -M[2] * det_inv; + M_inv[3] = M[0] * det_inv; +} + +void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, + int x, int y, int width, int height, + int stride, double *u, double *v) { + double M[4]; + double M_inv[4]; + int b[2]; + int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); + sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); + invert_2x2(M, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy, + b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +} diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c new file mode 100644 index 0000000000..5503501d62 --- /dev/null +++ b/third_party/aom/aom_dsp/fwd_txfm.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "aom_dsp/txfm_common.h" +#include "config/aom_dsp_rtcd.h" + +void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. + // We need an intermediate buffer between passes. + tran_low_t intermediate[4 * 4]; + const tran_low_t *in_low = NULL; + tran_low_t *out = intermediate; + // Do the two transform passes + for (int pass = 0; pass < 2; ++pass) { + tran_high_t in_high[4]; // canbe16 + tran_high_t step[4]; // canbe16 + tran_low_t temp[4]; + for (int i = 0; i < 4; ++i) { + // Load inputs. + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; + } + ++input; // Next column + } else { + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; // Next column (which is a transposed row) + } + // Transform. + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; + temp[0] = (tran_low_t)fdct_round_shift((step[0] + step[1]) * cospi_16_64); + temp[2] = (tran_low_t)fdct_round_shift((step[0] - step[1]) * cospi_16_64); + temp[1] = (tran_low_t)fdct_round_shift(step[2] * cospi_24_64 + + step[3] * cospi_8_64); + temp[3] = (tran_low_t)fdct_round_shift(-step[2] * cospi_8_64 + + step[3] * cospi_24_64); + // Only transpose the first pass. + if (pass == 0) { + out[0] = temp[0]; + out[1] = temp[1]; + out[2] = temp[2]; + out[3] = temp[3]; + out += 4; + } else { + out[0 * 4] = temp[0]; + out[1 * 4] = temp[1]; + out[2 * 4] = temp[2]; + out[3 * 4] = temp[3]; + ++out; + } + } + // Setup in/out for next pass. + in_low = intermediate; + out = output; + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) + output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + } +} + +void aom_fdct4x4_lp_c(const int16_t *input, int16_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. + // We need an intermediate buffer between passes. + int16_t intermediate[4 * 4]; + const int16_t *in_low = NULL; + int16_t *out = intermediate; + // Do the two transform passes + for (int pass = 0; pass < 2; ++pass) { + int32_t in_high[4]; // canbe16 + int32_t step[4]; // canbe16 + int16_t temp[4]; + for (int i = 0; i < 4; ++i) { + // Load inputs. + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + ++input; + if (i == 0 && in_high[0]) { + ++in_high[0]; + } + } else { + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; + } + // Transform. 
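Each pass above runs the same 1-D 4-point transform over one column (pass 0) or one transposed row (pass 1). A scalar reference for that single pass (editor's sketch, not part of the upstream file; the cospi_*_64 values are the usual 14-bit fixed-point cosines from aom_dsp/txfm_common.h, and the final shift matches fdct_round_shift()):

static void fdct4_1d_sketch(const int32_t in[4], int32_t out[4]) {
  const int32_t cospi_8_64 = 15137;   // round(2^14 * cos(8 * pi / 64))
  const int32_t cospi_16_64 = 11585;  // round(2^14 * cos(16 * pi / 64))
  const int32_t cospi_24_64 = 6270;   // round(2^14 * cos(24 * pi / 64))
  const int32_t s0 = in[0] + in[3], s1 = in[1] + in[2];  // butterflies
  const int32_t s2 = in[1] - in[2], s3 = in[0] - in[3];
  out[0] = (s0 + s1) * cospi_16_64;  // DC term
  out[2] = (s0 - s1) * cospi_16_64;
  out[1] = s2 * cospi_24_64 + s3 * cospi_8_64;
  out[3] = -s2 * cospi_8_64 + s3 * cospi_24_64;
  for (int i = 0; i < 4; i++)
    out[i] = (out[i] + (1 << 13)) >> 14;  // drop the 14 fractional bits
}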
+ step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; + temp[0] = (int16_t)fdct_round_shift((step[0] + step[1]) * cospi_16_64); + temp[2] = (int16_t)fdct_round_shift((step[0] - step[1]) * cospi_16_64); + temp[1] = (int16_t)fdct_round_shift(step[2] * cospi_24_64 + + step[3] * cospi_8_64); + temp[3] = (int16_t)fdct_round_shift(-step[2] * cospi_8_64 + + step[3] * cospi_24_64); + // Only transpose the first pass. + if (pass == 0) { + out[0] = temp[0]; + out[1] = temp[1]; + out[2] = temp[2]; + out[3] = temp[3]; + out += 4; + } else { + out[0 * 4] = temp[0]; + out[1 * 4] = temp[1]; + out[2 * 4] = temp[2]; + out[3 * 4] = temp[3]; + ++out; + } + } + // Setup in/out for next pass. + in_low = intermediate; + out = output; + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) + output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + } +} + +#if CONFIG_INTERNAL_STATS +void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { + int i, j; + tran_low_t intermediate[64]; + int pass; + tran_low_t *output = intermediate; + const tran_low_t *in = NULL; + + // Transform columns + for (pass = 0; pass < 2; ++pass) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + for (i = 0; i < 8; i++) { + // stage 1 + if (pass == 0) { + s0 = (input[0 * stride] + input[7 * stride]) * 4; + s1 = (input[1 * stride] + input[6 * stride]) * 4; + s2 = (input[2 * stride] + input[5 * stride]) * 4; + s3 = (input[3 * stride] + input[4 * stride]) * 4; + s4 = (input[3 * stride] - input[4 * stride]) * 4; + s5 = (input[2 * stride] - input[5 * stride]) * 4; + s6 = (input[1 * stride] - input[6 * stride]) * 4; + s7 = (input[0 * stride] - input[7 * stride]) * 4; + ++input; + } else { + s0 = in[0 * 8] + in[7 * 8]; + s1 = in[1 * 8] + in[6 * 8]; + s2 = in[2 * 8] + in[5 * 8]; + s3 = in[3 * 8] + in[4 * 8]; + s4 = in[3 * 8] - in[4 * 8]; + s5 = in[2 * 8] - in[5 * 8]; + s6 = in[1 * 8] - in[6 * 8]; + s7 = in[0 * 8] - in[7 * 8]; + ++in; + } + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0] = (tran_low_t)fdct_round_shift(t0); + output[2] = (tran_low_t)fdct_round_shift(t2); + output[4] = (tran_low_t)fdct_round_shift(t1); + output[6] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + output[1] = (tran_low_t)fdct_round_shift(t0); + output[3] = (tran_low_t)fdct_round_shift(t2); + output[5] = (tran_low_t)fdct_round_shift(t1); + output[7] = (tran_low_t)fdct_round_shift(t3); + output += 8; + } + in = intermediate; + output = final_output; + } + + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; + } +} +#endif // CONFIG_INTERNAL_STATS + +#if CONFIG_AV1_HIGHBITDEPTH && CONFIG_INTERNAL_STATS +void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, + int stride) { + aom_fdct8x8_c(input, final_output, 
stride); +} +#endif diff --git a/third_party/aom/aom_dsp/grain_params.h b/third_party/aom/aom_dsp/grain_params.h new file mode 100644 index 0000000000..5a28afc2a1 --- /dev/null +++ b/third_party/aom/aom_dsp/grain_params.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes film grain parameters + * + */ +#ifndef AOM_AOM_DSP_GRAIN_PARAMS_H_ +#define AOM_AOM_DSP_GRAIN_PARAMS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#include "config/aom_config.h" + +/*!\brief Structure containing film grain synthesis parameters for a frame + * + * This structure contains input parameters for film grain synthesis + */ +typedef struct { + // This structure is compared element-by-element in the function + // aom_check_grain_params_equiv: this function must be updated if any changes + // are made to this structure. + int apply_grain; + + int update_parameters; + + // 8 bit values + int scaling_points_y[14][2]; + int num_y_points; // value: 0..14 + + // 8 bit values + int scaling_points_cb[10][2]; + int num_cb_points; // value: 0..10 + + // 8 bit values + int scaling_points_cr[10][2]; + int num_cr_points; // value: 0..10 + + int scaling_shift; // values : 8..11 + + int ar_coeff_lag; // values: 0..3 + + // 8 bit values + int ar_coeffs_y[24]; + int ar_coeffs_cb[25]; + int ar_coeffs_cr[25]; + + // Shift value: AR coeffs range + // 6: [-2, 2) + // 7: [-1, 1) + // 8: [-0.5, 0.5) + // 9: [-0.25, 0.25) + int ar_coeff_shift; // values : 6..9 + + int cb_mult; // 8 bits + int cb_luma_mult; // 8 bits + int cb_offset; // 9 bits + + int cr_mult; // 8 bits + int cr_luma_mult; // 8 bits + int cr_offset; // 9 bits + + int overlap_flag; + + int clip_to_restricted_range; + + unsigned int bit_depth; // video bit depth + + int chroma_scaling_from_luma; + + int grain_scale_shift; + + uint16_t random_seed; + // This structure is compared element-by-element in the function + // aom_check_grain_params_equiv: this function must be updated if any changes + // are made to this structure. +} aom_film_grain_t; + +/*!\brief Check if two film grain parameters structs are equivalent + * + * Check if two film grain parameters are equal, except for the + * update_parameters and random_seed elements which are ignored. 
+ * + * \param[in] pa The first set of parameters to compare + * \param[in] pb The second set of parameters to compare + * \return Returns 1 if the params are equivalent, 0 otherwise + */ +static INLINE int aom_check_grain_params_equiv( + const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) { + if (pa->apply_grain != pb->apply_grain) return 0; + // Don't compare update_parameters + + if (pa->num_y_points != pb->num_y_points) return 0; + if (memcmp(pa->scaling_points_y, pb->scaling_points_y, + pa->num_y_points * 2 * sizeof(*pa->scaling_points_y)) != 0) + return 0; + + if (pa->num_cb_points != pb->num_cb_points) return 0; + if (memcmp(pa->scaling_points_cb, pb->scaling_points_cb, + pa->num_cb_points * 2 * sizeof(*pa->scaling_points_cb)) != 0) + return 0; + + if (pa->num_cr_points != pb->num_cr_points) return 0; + if (memcmp(pa->scaling_points_cr, pb->scaling_points_cr, + pa->num_cr_points * 2 * sizeof(*pa->scaling_points_cr)) != 0) + return 0; + + if (pa->scaling_shift != pb->scaling_shift) return 0; + if (pa->ar_coeff_lag != pb->ar_coeff_lag) return 0; + + const int num_pos = 2 * pa->ar_coeff_lag * (pa->ar_coeff_lag + 1); + if (memcmp(pa->ar_coeffs_y, pb->ar_coeffs_y, + num_pos * sizeof(*pa->ar_coeffs_y)) != 0) + return 0; + if (memcmp(pa->ar_coeffs_cb, pb->ar_coeffs_cb, + num_pos * sizeof(*pa->ar_coeffs_cb)) != 0) + return 0; + if (memcmp(pa->ar_coeffs_cr, pb->ar_coeffs_cr, + num_pos * sizeof(*pa->ar_coeffs_cr)) != 0) + return 0; + + if (pa->ar_coeff_shift != pb->ar_coeff_shift) return 0; + + if (pa->cb_mult != pb->cb_mult) return 0; + if (pa->cb_luma_mult != pb->cb_luma_mult) return 0; + if (pa->cb_offset != pb->cb_offset) return 0; + + if (pa->cr_mult != pb->cr_mult) return 0; + if (pa->cr_luma_mult != pb->cr_luma_mult) return 0; + if (pa->cr_offset != pb->cr_offset) return 0; + + if (pa->overlap_flag != pb->overlap_flag) return 0; + if (pa->clip_to_restricted_range != pb->clip_to_restricted_range) return 0; + if (pa->bit_depth != pb->bit_depth) return 0; + if (pa->chroma_scaling_from_luma != pb->chroma_scaling_from_luma) return 0; + if (pa->grain_scale_shift != pb->grain_scale_shift) return 0; + + return 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_GRAIN_PARAMS_H_ diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c new file mode 100644 index 0000000000..3505f9f2c8 --- /dev/null +++ b/third_party/aom/aom_dsp/grain_table.c @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief This file has the implementation details of the grain table. + * + * The file format is an ascii representation for readability and + * editability. Array parameters are separated from the non-array + * parameters and prefixed with a few characters to make for easy + * localization with a parameter set. Each entry is prefixed with "E" + * and the other parameters are only specified if "update-parms" is + * non-zero. + * + * filmgrn1 + * E + * p ... + * sY ... + * sCb ... + * sCr ... + * cY .... + * cCb .... 
+ * cCr .... + * E ... + */ +#include +#include +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/grain_table.h" +#include "aom_mem/aom_mem.h" + +static const char kFileMagic[8] = "filmgrn1"; + +static void grain_table_entry_read(FILE *file, + struct aom_internal_error_info *error_info, + aom_film_grain_table_entry_t *entry) { + aom_film_grain_t *pars = &entry->params; + int num_read = + fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time, + &entry->end_time, &pars->apply_grain, &pars->random_seed, + &pars->update_parameters); + if (num_read == 0 && feof(file)) return; + if (num_read != 5) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read entry header. Read %d != 5", num_read); + return; + } + if (pars->update_parameters) { + num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n", + &pars->ar_coeff_lag, &pars->ar_coeff_shift, + &pars->grain_scale_shift, &pars->scaling_shift, + &pars->chroma_scaling_from_luma, &pars->overlap_flag, + &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset, + &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset); + if (num_read != 12) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read entry params. Read %d != 12", + num_read); + return; + } + if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num y points"); + return; + } + for (int i = 0; i < pars->num_y_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0], + &pars->scaling_points_y[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read y scaling points"); + return; + } + } + if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num cb points"); + return; + } + for (int i = 0; i < pars->num_cb_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0], + &pars->scaling_points_cb[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read cb scaling points"); + return; + } + } + if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num cr points"); + return; + } + for (int i = 0; i < pars->num_cr_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0], + &pars->scaling_points_cr[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read cr scaling points"); + return; + } + } + + if (fscanf(file, "\n\tcY")) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Y coeffs header (cY)"); + return; + } + const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + for (int i = 0; i < n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Y coeffs"); + return; + } + } + if (fscanf(file, "\n\tcCb")) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Cb coeffs header (cCb)"); + return; + } + for (int i = 0; i <= n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Cb coeffs"); + return; + } + } + if (fscanf(file, "\n\tcCr")) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable read to Cr coeffs header (cCr)"); + return; + } + for (int i = 0; i <= n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Cr coeffs"); + return; 
+ } + } + (void)fscanf(file, "\n"); + } +} + +static void grain_table_entry_write(FILE *file, + aom_film_grain_table_entry_t *entry) { + const aom_film_grain_t *pars = &entry->params; + fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time, + entry->end_time, pars->apply_grain, pars->random_seed, + pars->update_parameters); + if (pars->update_parameters) { + fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n", + pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift, + pars->scaling_shift, pars->chroma_scaling_from_luma, + pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult, + pars->cb_offset, pars->cr_mult, pars->cr_luma_mult, + pars->cr_offset); + fprintf(file, "\tsY %d ", pars->num_y_points); + for (int i = 0; i < pars->num_y_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_y[i][0], + pars->scaling_points_y[i][1]); + } + fprintf(file, "\n\tsCb %d", pars->num_cb_points); + for (int i = 0; i < pars->num_cb_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_cb[i][0], + pars->scaling_points_cb[i][1]); + } + fprintf(file, "\n\tsCr %d", pars->num_cr_points); + for (int i = 0; i < pars->num_cr_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_cr[i][0], + pars->scaling_points_cr[i][1]); + } + fprintf(file, "\n\tcY"); + const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + for (int i = 0; i < n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_y[i]); + } + fprintf(file, "\n\tcCb"); + for (int i = 0; i <= n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_cb[i]); + } + fprintf(file, "\n\tcCr"); + for (int i = 0; i <= n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_cr[i]); + } + fprintf(file, "\n"); + } +} + +// TODO(https://crbug.com/aomedia/3228): Update this function to return an +// integer status. +void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp, + int64_t end_time, + const aom_film_grain_t *grain) { + if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) { + aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail)); + if (!new_tail) return; + memset(new_tail, 0, sizeof(*new_tail)); + if (t->tail) t->tail->next = new_tail; + if (!t->head) t->head = new_tail; + t->tail = new_tail; + + new_tail->start_time = time_stamp; + new_tail->end_time = end_time; + new_tail->params = *grain; + } else { + t->tail->end_time = AOMMAX(t->tail->end_time, end_time); + t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp); + } +} + +int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, + int64_t end_time, int erase, + aom_film_grain_t *grain) { + aom_film_grain_table_entry_t *entry = t->head; + aom_film_grain_table_entry_t *prev_entry = NULL; + uint16_t random_seed = grain ? 
grain->random_seed : 0; + if (grain) memset(grain, 0, sizeof(*grain)); + + while (entry) { + aom_film_grain_table_entry_t *next = entry->next; + if (time_stamp >= entry->start_time && time_stamp < entry->end_time) { + if (grain) { + *grain = entry->params; + if (time_stamp != 0) grain->random_seed = random_seed; + } + if (!erase) return 1; + + const int64_t entry_end_time = entry->end_time; + if (time_stamp <= entry->start_time && end_time >= entry->end_time) { + if (t->tail == entry) t->tail = prev_entry; + if (prev_entry) { + prev_entry->next = entry->next; + } else { + t->head = entry->next; + } + aom_free(entry); + } else if (time_stamp <= entry->start_time && + end_time < entry->end_time) { + entry->start_time = end_time; + } else if (time_stamp > entry->start_time && + end_time >= entry->end_time) { + entry->end_time = time_stamp; + } else { + aom_film_grain_table_entry_t *new_entry = + aom_malloc(sizeof(*new_entry)); + if (!new_entry) return 0; + new_entry->next = entry->next; + new_entry->start_time = end_time; + new_entry->end_time = entry->end_time; + new_entry->params = entry->params; + entry->next = new_entry; + entry->end_time = time_stamp; + if (t->tail == entry) t->tail = new_entry; + } + // If segments aren't aligned, delete from the beginning of subsequent + // segments + if (end_time > entry_end_time) { + // Ignoring the return value here is safe since we're erasing from the + // beginning of subsequent entries. + aom_film_grain_table_lookup(t, entry_end_time, end_time, /*erase=*/1, + NULL); + } + return 1; + } + prev_entry = entry; + entry = next; + } + return 0; +} + +aom_codec_err_t aom_film_grain_table_read( + aom_film_grain_table_t *t, const char *filename, + struct aom_internal_error_info *error_info) { + FILE *file = fopen(filename, "rb"); + if (!file) { + aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s", + filename); + return error_info->error_code; + } + error_info->error_code = AOM_CODEC_OK; + + // Read in one extra character as there should be white space after + // the header. 
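  // Note (editorial): the table writer emits the 8-byte magic "filmgrn1"
  // (kFileMagic is not NUL-terminated) followed by a single newline, so nine
  // bytes are consumed here but only the first eight are compared.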
+ char magic[9]; + if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read (or invalid) file magic"); + fclose(file); + return error_info->error_code; + } + + aom_film_grain_table_entry_t *prev_entry = NULL; + while (!feof(file)) { + aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry)); + if (!entry) { + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Unable to allocate grain table entry"); + break; + } + memset(entry, 0, sizeof(*entry)); + grain_table_entry_read(file, error_info, entry); + entry->next = NULL; + + if (prev_entry) prev_entry->next = entry; + if (!t->head) t->head = entry; + t->tail = entry; + prev_entry = entry; + + if (error_info->error_code != AOM_CODEC_OK) break; + } + + fclose(file); + return error_info->error_code; +} + +aom_codec_err_t aom_film_grain_table_write( + const aom_film_grain_table_t *t, const char *filename, + struct aom_internal_error_info *error_info) { + error_info->error_code = AOM_CODEC_OK; + + FILE *file = fopen(filename, "wb"); + if (!file) { + aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s", + filename); + return error_info->error_code; + } + + if (!fwrite(kFileMagic, 8, 1, file)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to write file magic"); + fclose(file); + return error_info->error_code; + } + + fprintf(file, "\n"); + aom_film_grain_table_entry_t *entry = t->head; + while (entry) { + grain_table_entry_write(file, entry); + entry = entry->next; + } + fclose(file); + return error_info->error_code; +} + +void aom_film_grain_table_free(aom_film_grain_table_t *t) { + aom_film_grain_table_entry_t *entry = t->head; + while (entry) { + aom_film_grain_table_entry_t *next = entry->next; + aom_free(entry); + entry = next; + } + memset(t, 0, sizeof(*t)); +} diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h new file mode 100644 index 0000000000..49e84980ee --- /dev/null +++ b/third_party/aom/aom_dsp/grain_table.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief A table mapping from time to corresponding film grain parameters. + * + * In order to apply grain synthesis in the decoder, the film grain parameters + * need to be signalled in the encoder. The film grain parameters are time + * varying, and for two-pass encoding (and denoiser implementation flexibility) + * it is common to denoise the video and do parameter estimation before encoding + * the denoised video. + * + * The film grain table is used to provide this flexibility and is used as a + * parameter that is passed to the encoder. + * + * Further, if regraining is to be done in say a single pass mode, or in two + * pass within the encoder (before frames are added to the lookahead buffer), + * this data structure can be used to keep track of on-the-fly estimated grain + * parameters, that are then extracted from the table before the encoded frame + * is written. 
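 *
 * A typical encoder-side sequence (editor's illustrative sketch, not part of
 * the upstream header; the pts_* values, grain_params, error_info and the
 * output filename are placeholders) might be:
 *
 *   aom_film_grain_table_t table = { 0 };
 *   aom_film_grain_table_append(&table, pts_start, pts_end, &grain_params);
 *
 *   aom_film_grain_t frame_grain;
 *   if (aom_film_grain_table_lookup(&table, pts_start, pts_end, 1, // erase
 *                                   &frame_grain)) {
 *     // attach frame_grain to the frame being written
 *   }
 *
 *   aom_film_grain_table_write(&table, "grain.tbl", &error_info);
 *   aom_film_grain_table_free(&table);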
+ */ +#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_ +#define AOM_AOM_DSP_GRAIN_TABLE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom_dsp/grain_params.h" +#include "aom/internal/aom_codec_internal.h" + +typedef struct aom_film_grain_table_entry_t { + aom_film_grain_t params; + int64_t start_time; + int64_t end_time; + struct aom_film_grain_table_entry_t *next; +} aom_film_grain_table_entry_t; + +typedef struct { + aom_film_grain_table_entry_t *head; + aom_film_grain_table_entry_t *tail; +} aom_film_grain_table_t; + +/*!\brief Add a mapping from [time_stamp, end_time) to the given grain + * parameters + * + * \param[in,out] table The grain table + * \param[in] time_stamp The start time stamp + * \param[in] end_stamp The end time_stamp + * \param[in] grain The grain parameters + */ +void aom_film_grain_table_append(aom_film_grain_table_t *table, + int64_t time_stamp, int64_t end_time, + const aom_film_grain_t *grain); + +/*!\brief Look-up (and optionally erase) the grain parameters for the given time + * + * \param[in] table The grain table + * \param[in] time_stamp The start time stamp + * \param[in] end_stamp The end time_stamp + * \param[in] erase Whether the time segment can be deleted + * \param[out] grain The output grain parameters + */ +int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, + int64_t end_time, int erase, + aom_film_grain_t *grain); + +/*!\brief Reads the grain table from a file. + * + * \param[out] table The grain table + * \param[in] filename The file to read from + * \param[in] error_info Error info for tracking errors + */ +aom_codec_err_t aom_film_grain_table_read( + aom_film_grain_table_t *table, const char *filename, + struct aom_internal_error_info *error_info); + +/*!\brief Writes the grain table from a file. + * + * \param[out] table The grain table + * \param[in] filename The file to read from + * \param[in] error_info Error info for tracking errors + */ +aom_codec_err_t aom_film_grain_table_write( + const aom_film_grain_table_t *t, const char *filename, + struct aom_internal_error_info *error_info); + +void aom_film_grain_table_free(aom_film_grain_table_t *t); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_DSP_GRAIN_TABLE_H_ diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c new file mode 100644 index 0000000000..6ec091f5f3 --- /dev/null +++ b/third_party/aom/aom_dsp/intrapred.c @@ -0,0 +1,793 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/intrapred_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" + +static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left) { + int r; + (void)left; + + for (r = 0; r < bh; r++) { + memcpy(dst, above, bw); + dst += stride; + } +} + +static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left) { + int r; + (void)above; + + for (r = 0; r < bh; r++) { + memset(dst, left[r], bw); + dst += stride; + } +} + +static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; } + +static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top, + uint16_t top_left) { + const int base = top + left - top_left; + const int p_left = abs_diff(base, left); + const int p_top = abs_diff(base, top); + const int p_top_left = abs_diff(base, top_left); + + // Return nearest to base of left, top and top_left. + return (p_left <= p_top && p_left <= p_top_left) ? left + : (p_top <= p_top_left) ? top + : top_left; +} + +static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int r, c; + const uint8_t ytop_left = above[-1]; + + for (r = 0; r < bh; r++) { + for (c = 0; c < bw; c++) + dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left); + dst += stride; + } +} + +// Some basic checks on weights for smooth predictor. +#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \ + pred_scale) \ + assert(weights_w[0] < weights_scale); \ + assert(weights_h[0] < weights_scale); \ + assert(weights_scale - weights_w[bw - 1] < weights_scale); \ + assert(weights_scale - weights_h[bh - 1] < weights_scale); \ + assert(pred_scale < 31) // ensures no overflow when calculating predictor. 
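To make the Paeth selection rule above concrete, a short worked example (editor's illustration, not from the upstream source):

// With left = 60, top = 68, top_left = 62:
//   base              = top + left - top_left = 66
//   |base - left|     = 6
//   |base - top|      = 2   <- smallest
//   |base - top_left| = 4
// so paeth_predictor_single(60, 68, 62) returns top, i.e. 68.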
+ +#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) + +static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights_w = smooth_weights + bw - 4; + const uint8_t *const sm_weights_h = smooth_weights + bh - 4; + // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE + const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE; + const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); + sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, + log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bh; ++r) { + int c; + for (c = 0; c < bw; ++c) { + const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], + sm_weights_w[c], scale - sm_weights_w[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = smooth_weights + bh - 4; + // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE + const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE; + const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint8_t pixels[] = { above[c], below_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[r]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = smooth_weights + bw - 4; + // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE + const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE; + const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint8_t pixels[] = { left[r], right_pred }; + const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[c]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bh; r++) { + memset(dst, 128, bw); + dst += stride; + } +} + +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)above; 
+ + for (i = 0; i < bh; i++) sum += left[i]; + expected_dc = (sum + (bh >> 1)) / bh; + + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)left; + + for (i = 0; i < bw; i++) sum += above[i]; + expected_dc = (sum + (bw >> 1)) / bw; + + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + const int count = bw + bh; + + for (i = 0; i < bw; i++) { + sum += above[i]; + } + for (i = 0; i < bh; i++) { + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier, int shift2) { + const int interm = num >> shift1; + return interm * multiplier >> shift2; +} + +// The constants (multiplier and shifts) for a given block size are obtained +// as follows: +// - Let sum_w_h = block width + block height. +// - Shift 'sum_w_h' right until we reach an odd number. Let the number of +// shifts for that block size be called 'shift1' (see the parameter in +// dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2 +// possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect +// block]. +// - Find multipliers for (i) dividing by 3, and (ii) dividing by 5, +// using the "Algorithm 1" in: +// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632 +// by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd +// shift will be 16, regardless of the block size. + +// Note: For low bitdepth, assembly code may be optimized by using smaller +// constants for smaller block sizes, where the range of the 'sum' is +// restricted to fewer bits. 
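As a concrete check of the multiply-and-shift scheme described above (editor's illustration): for a 4x8 block, bw + bh = 12, which becomes the odd value d = 3 after shift1 = 2 right-shifts, so the 1:2 multiplier 0x5556 (21846 ~= 2^16 / 3) is used.

// Example: all 12 neighbour pixels equal 100, so the true DC is 100.
//   sum             = 12 * 100 = 1200
//   (sum + 6) >> 2  = 301        // add (bw + bh) / 2 for rounding
//   301 * 0x5556    = 6575646
//   6575646 >> 16   = 100        // expected_dc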
+ +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left, int shift1, + int multiplier) { + int sum = 0; + + for (int i = 0; i < bw; i++) { + sum += above[i]; + } + for (int i = 0; i < bh; i++) { + sum += left[i]; + } + + const int expected_dc = divide_using_multiply_shift( + sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2); + assert(expected_dc < (1 << 8)); + + for (int r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +#undef DC_SHIFT2 + +void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2); +} + +#undef DC_MULTIPLIER_1X2 +#undef DC_MULTIPLIER_1X4 + +static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)left; + (void)bd; 
+ for (r = 0; r < bh; r++) { + memcpy(dst, above, bw * sizeof(uint16_t)); + dst += stride; + } +} + +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)bd; + for (r = 0; r < bh; r++) { + aom_memset16(dst, left[r], bw); + dst += stride; + } +} + +static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + const uint16_t ytop_left = above[-1]; + (void)bd; + + for (r = 0; r < bh; r++) { + for (c = 0; c < bw; c++) + dst[c] = paeth_predictor_single(left[r], above[c], ytop_left); + dst += stride; + } +} + +static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights_w = smooth_weights + bw - 4; + const uint8_t *const sm_weights_h = smooth_weights + bh - 4; + // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE + const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE; + const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); + sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, + log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bh; ++r) { + int c; + for (c = 0; c < bw; ++c) { + const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], + sm_weights_w[c], scale - sm_weights_w[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = smooth_weights + bh - 4; + // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE + const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE; + const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint16_t pixels[] = { above[c], below_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[r]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = smooth_weights + bw - 4; + // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE + const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE; + const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint16_t 
pixels[] = { left[r], right_pred }; + const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[c]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, 128 << (bd - 8), bw); + dst += stride; + } +} + +static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)above; + (void)bd; + + for (i = 0; i < bh; i++) sum += left[i]; + expected_dc = (sum + (bh >> 1)) / bh; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)left; + (void)bd; + + for (i = 0; i < bw; i++) sum += above[i]; + expected_dc = (sum + (bw >> 1)) / bw; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + const int count = bw + bh; + (void)bd; + + for (i = 0; i < bw; i++) { + sum += above[i]; + } + for (i = 0; i < bh; i++) { + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but +// assume 2nd shift of 17 bits instead of 16. +// Note: Strictly speaking, 2nd shift needs to be 17 only when: +// - bit depth == 12, and +// - bw + bh is divisible by 5 (as opposed to divisible by 3). +// All other cases can use half the multipliers with a shift of 16 instead. +// This special optimization can be used when writing assembly code. +#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB +// Note: This constant is odd, but a smaller even constant (0x199a) with the +// appropriate shift should work for neon in 8/10-bit. 
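// Editorial note: with the 17-bit second shift, 0xAAAB (43691) ~= 2^17 / 3 and
// 0x6667 (26215) ~= 2^17 / 5, the high-bitdepth analogues of the 16-bit
// constants 0x5556 and 0x3334 used above.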
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667 + +#define HIGHBD_DC_SHIFT2 17 + +static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd, + int shift1, uint32_t multiplier) { + int sum = 0; + (void)bd; + + for (int i = 0; i < bw; i++) { + sum += above[i]; + } + for (int i = 0; i < bh; i++) { + sum += left[i]; + } + + const int expected_dc = divide_using_multiply_shift( + sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2); + assert(expected_dc < (1 << bd)); + + for (int r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +#undef HIGHBD_DC_SHIFT2 + +void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const 
uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5, + HIGHBD_DC_MULTIPLIER_1X2); +} + +#undef HIGHBD_DC_MULTIPLIER_1X2 +#undef HIGHBD_DC_MULTIPLIER_1X4 + +// This serves as a wrapper function, so that all the prediction functions +// can be unified and accessed as a pointer array. Note that the boundary +// above and left are not necessarily used all the time. +#define intra_pred_sized(type, width, height) \ + void aom_##type##_predictor_##width##x##height##_c( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ + const uint8_t *left) { \ + type##_predictor(dst, stride, width, height, above, left); \ + } + +#define intra_pred_highbd_sized(type, width, height) \ + void aom_highbd_##type##_predictor_##width##x##height##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \ + } + +/* clang-format off */ +#define intra_pred_rectangular(type) \ + intra_pred_sized(type, 4, 8) \ + intra_pred_sized(type, 8, 4) \ + intra_pred_sized(type, 8, 16) \ + intra_pred_sized(type, 16, 8) \ + intra_pred_sized(type, 16, 32) \ + intra_pred_sized(type, 32, 16) \ + intra_pred_sized(type, 32, 64) \ + intra_pred_sized(type, 64, 32) \ + intra_pred_sized(type, 4, 16) \ + intra_pred_sized(type, 16, 4) \ + intra_pred_sized(type, 8, 32) \ + intra_pred_sized(type, 32, 8) \ + intra_pred_sized(type, 16, 64) \ + intra_pred_sized(type, 64, 16) \ + intra_pred_highbd_sized(type, 4, 8) \ + intra_pred_highbd_sized(type, 8, 4) \ + intra_pred_highbd_sized(type, 8, 16) \ + intra_pred_highbd_sized(type, 16, 8) \ + intra_pred_highbd_sized(type, 16, 32) \ + intra_pred_highbd_sized(type, 32, 16) \ + intra_pred_highbd_sized(type, 32, 64) \ + intra_pred_highbd_sized(type, 64, 32) \ + intra_pred_highbd_sized(type, 4, 16) \ + intra_pred_highbd_sized(type, 16, 4) \ + intra_pred_highbd_sized(type, 8, 32) \ + intra_pred_highbd_sized(type, 32, 8) \ + intra_pred_highbd_sized(type, 16, 64) \ + intra_pred_highbd_sized(type, 64, 16) + +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_sized(type, 64, 64) \ + intra_pred_highbd_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 8, 8) \ + intra_pred_highbd_sized(type, 16, 16) \ + intra_pred_highbd_sized(type, 32, 32) \ + intra_pred_highbd_sized(type, 64, 64) \ + intra_pred_rectangular(type) +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_above_4x4(type) +#define intra_pred_square(type) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_sized(type, 64, 64) \ + intra_pred_highbd_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 8, 8) \ + intra_pred_highbd_sized(type, 16, 16) \ + intra_pred_highbd_sized(type, 32, 32) \ + intra_pred_highbd_sized(type, 64, 64) + +intra_pred_allsizes(v) +intra_pred_allsizes(h) +intra_pred_allsizes(smooth) +intra_pred_allsizes(smooth_v) +intra_pred_allsizes(smooth_h) +intra_pred_allsizes(paeth) +intra_pred_allsizes(dc_128) +intra_pred_allsizes(dc_left) +intra_pred_allsizes(dc_top) +intra_pred_square(dc) +/* clang-format on */ +#undef 
intra_pred_allsizes diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h new file mode 100644 index 0000000000..6172224be1 --- /dev/null +++ b/third_party/aom/aom_dsp/intrapred_common.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_ +#define AOM_AOM_DSP_INTRAPRED_COMMON_H_ + +#include "config/aom_config.h" + +// Weights are quadratic from '1' to '1 / block_size', scaled by +// 2^SMOOTH_WEIGHT_LOG2_SCALE. +#define SMOOTH_WEIGHT_LOG2_SCALE 8 + +// Note these arrays are aligned to ensure NEON loads using a cast to uint32_t* +// have sufficient alignment. Using 8 preserves the potential for an alignment +// hint in load_weight_w8(). For that case, this could be increased to 16 to +// allow an aligned load in x86. +DECLARE_ALIGNED(8, static const uint8_t, smooth_weights[]) = { + // bs = 4 + 255, 149, 85, 64, + // bs = 8 + 255, 197, 146, 105, 73, 50, 37, 32, + // bs = 16 + 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, + // bs = 32 + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, + // bs = 64 + 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, + 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, + 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, + 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4 +}; + +DECLARE_ALIGNED(8, static const uint16_t, smooth_weights_u16[]) = { + // block dimension = 4 + 255, 149, 85, 64, + // block dimension = 8 + 255, 197, 146, 105, 73, 50, 37, 32, + // block dimension = 16 + 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, + // block dimension = 32 + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, + // block dimension = 64 + 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, + 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, + 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, + 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4 +}; + +#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_ diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c new file mode 100644 index 0000000000..075f13689c --- /dev/null +++ b/third_party/aom/aom_dsp/loopfilter.c @@ -0,0 +1,997 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +static INLINE int8_t signed_char_clamp(int t) { + return (int8_t)clamp(t, -128, 127); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int16_t signed_char_clamp_high(int t, int bd) { + switch (bd) { + case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); + case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); + case 8: + default: return (int16_t)clamp(t, -128, 128 - 1); + } +} +#endif + +// should we apply any filter at all: 11111111 yes, 00000000 no +static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, + uint8_t p0, uint8_t q0, uint8_t q1) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} + +static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p3 - p2) > limit) * -1; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(q3 - q2) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} + +static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit, + uint8_t p2, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, uint8_t q2) { + int8_t mask = 0; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, + uint8_t p0, uint8_t q0, uint8_t q1, + uint8_t q2) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + mask |= (abs(p3 - p0) > thresh) * -1; + mask |= (abs(q3 - q0) > thresh) * -1; + return ~mask; +} + +// is there high edge variance internal edge: 11111111 yes, 00000000 no +static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { + int8_t hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { + int8_t filter1, filter2; + + const int8_t ps1 = (int8_t)(*op1 ^ 0x80); + const int8_t ps0 = (int8_t)(*op0 ^ 0x80); + const int8_t qs0 = (int8_t)(*oq0 ^ 0x80); + const int8_t qs1 = (int8_t)(*oq1 ^ 0x80); + const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); + + // add outer taps if we have high edge variance + int8_t filter = signed_char_clamp(ps1 - qs1) & hev; + + // inner taps + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; + + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll 
set to adjust by -1 to account for the fact + // we'd round 3 the other way + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80); + *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80); + + // outer tap adjustments + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80); + *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80); +} + +void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); + filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } +} + +void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1); +} + +void aom_lpf_horizontal_4_quad_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0) { + aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_c(s + 4, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_c(s + 8, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_c(s + 12, p, blimit0, limit0, thresh0); +} + +void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
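+  // Illustrative note (not part of the upstream source): a worked example of
+  // the +4/+3 rounding described inside filter4() above. With filter = 5,
+  // filter1 = (5 + 4) >> 3 = 1 and filter2 = (5 + 3) >> 3 = 1, so q0 moves
+  // down by 1 and p0 up by 1. With filter = 4, filter1 = 1 but filter2 = 0,
+  // so only q0 is adjusted; the subsequent outer-tap update then uses
+  // ROUND_POWER_OF_TWO(filter1, 1) = (filter1 + 1) >> 1 when hev is 0.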
+ for (i = 0; i < count; ++i) { + const uint8_t p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); + s += pitch; + } +} + +void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0) { + aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_4_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0); +} + +static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) { + if (flat && mask) { + const uint8_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { + if (flat && mask) { + const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
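+  // Illustrative note (not part of the upstream source): in filter6() and
+  // filter8() above, the tap weights of each output sum to 8, so
+  // ROUND_POWER_OF_TWO(sum, 3) = (sum + 4) >> 3 yields a rounded weighted
+  // average. For example, with p2 = p1 = p0 = q0 = 100 the *op1 tap of
+  // filter6() gives (300 + 200 + 200 + 100 + 4) >> 3 = 100, so an already
+  // flat segment is left unchanged.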
+ for (i = 0; i < count; ++i) { + const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; + + const int8_t mask = + filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p); + ++s; + } +} + +void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1); +} + +void aom_lpf_horizontal_6_quad_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0) { + aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_6_c(s + 4, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_6_c(s + 8, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_6_c(s + 12, p, blimit0, limit0, thresh0); +} + +void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p); + ++s; + } +} + +void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1); +} + +void aom_lpf_horizontal_8_quad_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0) { + aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_c(s + 4, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_c(s + 8, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_c(s + 12, p, blimit0, limit0, thresh0); +} + +void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2]; + const int8_t mask = + filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2); + s += pitch; + } +} + +void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_6_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t 
*thresh0) { + aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_6_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_6_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0); +} + +void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, + s + 3); + s += pitch; + } +} + +void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0) { + aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_8_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0); +} + +static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op6, uint8_t *op5, + uint8_t *op4, uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, uint8_t *oq0, + uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) { + if (flat2 && flat && mask) { + const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, + p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} + +static void mb_lpf_horizontal_edge_w(uint8_t *s, 
int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count) { + int i; + int step = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < step * count; ++i) { + const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p], + p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p], + q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p); + ++s; + } +} + +void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1); + mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1); +} + +void aom_lpf_horizontal_14_quad_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0) { + mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1); + mb_lpf_horizontal_edge_w(s + 4, p, blimit0, limit0, thresh0, 1); + mb_lpf_horizontal_edge_w(s + 8, p, blimit0, limit0, thresh0, 1); + mb_lpf_horizontal_edge_w(s + 12, p, blimit0, limit0, thresh0, 1); +} + +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + int i; + + for (i = 0; i < count; ++i) { + const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3], + p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4], + q5 = s[5], q6 = s[6]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3, + s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6); + s += p; + } +} + +void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4); +} + +void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4); + mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4); +} + +void aom_lpf_vertical_14_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0) { + mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4); + mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit0, limit0, thresh0, 4); + mb_lpf_vertical_edge_w(s + 8 * pitch, pitch, blimit0, limit0, thresh0, 4); + mb_lpf_vertical_edge_w(s + 
12 * pitch, pitch, blimit0, limit0, thresh0, 4); +} + +#if CONFIG_AV1_HIGHBITDEPTH +// Should we apply any filter at all: 11111111 yes, 00000000 no ? +static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +// Should we apply any filter at all: 11111111 yes, 00000000 no ? +static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, + uint16_t p3, uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, uint16_t q1, + uint16_t q2, uint16_t q3, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p3 - p2) > limit16) * -1; + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(q3 - q2) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, + uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, + int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, + uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, + uint16_t q2, int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, uint16_t q3, + int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + mask |= (abs(p3 - p0) > thresh16) * -1; + mask |= (abs(q3 - q0) > thresh16) * -1; + return ~mask; +} + +// Is there high edge variance internal edge: +// 11111111_11111111 yes, 00000000_00000000 no ? +static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, int bd) { + int16_t hev = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + hev |= (abs(p1 - p0) > thresh16) * -1; + hev |= (abs(q1 - q0) > thresh16) * -1; + return hev; +} + +static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + int bd) { + int16_t filter1, filter2; + // ^0x80 equivalent to subtracting 0x80 from the values to turn them + // into -128 to +127 instead of 0 to 255. 
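+  // Illustrative note (not part of the upstream source): for bd = 10 the bias
+  // below is 0x80 << 2 = 512, so a pixel value of 700 becomes ps/qs = 188,
+  // and signed_char_clamp_high() keeps intermediate results within
+  // [-512, 511] before the bias is added back. For bd = 8 this reduces to the
+  // byte-domain xor-with-0x80 trick used by the low-bit-depth filter4().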
+ int shift = bd - 8; + const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); + const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); + const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); + const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); + const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); + + // Add outer taps if we have high edge variance. + int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; + + // Inner taps. + filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; + + // Save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way. + filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; + filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; + + *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); + *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); + + // Outer tap adjustments. + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); + *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); +} + +void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); + highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_4_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
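+  // Illustrative note (not part of the upstream source): blimit/limit/thresh
+  // are 8-bit values; highbd_filter_mask2() above scales them by (bd - 8),
+  // e.g. limit = 7 at bd = 10 becomes 7 << 2 = 28, so the filtering decision
+  // is made at the same relative strength for every bit depth.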
+ for (i = 0; i < count; ++i) { + const uint16_t p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); + highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_4_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + int bd) { + if (flat && mask) { + const uint16_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, + uint16_t *op3, uint16_t *op2, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + uint16_t *oq2, uint16_t *oq3, int bd) { + if (flat && mask) { + const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, + s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
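+  // Illustrative note (not part of the upstream source): the flat mask below
+  // uses thresh = 1, which highbd_flat_mask3_chroma() scales to 1 << (bd - 8).
+  // The 5-tap smoothing in highbd_filter6() is applied only when p1 and p2
+  // lie within that distance of p0, and q1 and q2 within that distance of q0;
+  // otherwise it falls back to highbd_filter4().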
+ for (i = 0; i < count; ++i) { + const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; + + const int8_t mask = + highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_6_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2]; + const int8_t mask = + highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2, + bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_6_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, + s + 2, s + 3, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_8_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op6, uint16_t *op5, + uint16_t *op4, uint16_t *op3, uint16_t *op2, + uint16_t *op1, uint16_t *op0, uint16_t *oq0, + uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, + uint16_t *oq4, uint16_t *oq5, uint16_t *oq6, + int bd) { + if (flat2 && flat && mask) { + const uint16_t p6 = *op6; + const 
uint16_t p5 = *op5; + const uint16_t p4 = *op4; + const uint16_t p3 = *op3; + const uint16_t p2 = *op2; + const uint16_t p1 = *op1; + const uint16_t p0 = *op0; + const uint16_t q0 = *oq0; + const uint16_t q1 = *oq1; + const uint16_t q2 = *oq2; + const uint16_t q3 = *oq3; + const uint16_t q4 = *oq4; + const uint16_t q5 = *oq5; + const uint16_t q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + bd); + } +} + +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + int step = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
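+  // Illustrative note (not part of the upstream source): flat2 below repeats
+  // the flatness test on the outer samples p6, p5, p4 and q4, q5, q6 against
+  // p0/q0. Only when mask, flat and flat2 all pass does highbd_filter14()
+  // apply the 13-tap smoothing, whose weights sum to 16 (hence
+  // ROUND_POWER_OF_TWO(sum, 4)); otherwise it falls back to highbd_filter8(),
+  // which may in turn fall back to highbd_filter4().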
+ for (i = 0; i < step * count; ++i) { + const uint16_t p3 = s[-4 * p]; + const uint16_t p2 = s[-3 * p]; + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const uint16_t q2 = s[2 * p]; + const uint16_t q3 = s[3 * p]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + + const int8_t flat2 = + highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], + s[5 * p], s[6 * p], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); +} + +void aom_highbd_lpf_horizontal_14_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd); + highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd); +} + +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4]; + const uint16_t p2 = s[-3]; + const uint16_t p1 = s[-2]; + const uint16_t p0 = s[-1]; + const uint16_t q0 = s[0]; + const uint16_t q1 = s[1]; + const uint16_t q2 = s[2]; + const uint16_t q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = + highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, + s + 6, bd); + s += p; + } +} + +void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd); +} + +void aom_highbd_lpf_vertical_14_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd); + highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + 4, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/mathutils.h b/third_party/aom/aom_dsp/mathutils.h new file mode 100644 index 0000000000..cbb6cf491f --- /dev/null +++ b/third_party/aom/aom_dsp/mathutils.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MATHUTILS_H_
+#define AOM_AOM_DSP_MATHUTILS_H_
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+
+static const double TINY_NEAR_ZERO = 1.0E-16;
+
+// Solves Ax = b, where x and b are column vectors of size nx1 and A is nxn
+static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) {
+  int i, j, k;
+  double c;
+  // Forward elimination
+  for (k = 0; k < n - 1; k++) {
+    // Bring the largest magnitude to the diagonal position
+    for (i = n - 1; i > k; i--) {
+      if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
+        for (j = 0; j < n; j++) {
+          c = A[i * stride + j];
+          A[i * stride + j] = A[(i - 1) * stride + j];
+          A[(i - 1) * stride + j] = c;
+        }
+        c = b[i];
+        b[i] = b[i - 1];
+        b[i - 1] = c;
+      }
+    }
+    for (i = k; i < n - 1; i++) {
+      if (fabs(A[k * stride + k]) < TINY_NEAR_ZERO) return 0;
+      c = A[(i + 1) * stride + k] / A[k * stride + k];
+      for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j];
+      b[i + 1] -= c * b[k];
+    }
+  }
+  // Backward substitution
+  for (i = n - 1; i >= 0; i--) {
+    if (fabs(A[i * stride + i]) < TINY_NEAR_ZERO) return 0;
+    c = 0;
+    for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j];
+    x[i] = (b[i] - c) / A[i * stride + i];
+  }
+
+  return 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Least-squares
+// Solves for n-dim x in a least squares sense to minimize |Ax - b|^2
+// The solution is simply x = (A'A)^-1 A'b or simply the solution for
+// the system: A'A x = A'b
+//
+// This process is split into three steps in order to avoid needing to
+// explicitly allocate the A matrix, which may be very large if there
+// are many equations to solve.
+//
+// The process for using this is (in pseudocode):
+//
+//   Allocate mat (size n*n), y (size n), a (size n), x (size n)
+//   least_squares_init(mat, y, n)
+//   for each equation a .
x = b { +// least_squares_accumulate(mat, y, a, b, n) +// } +// least_squares_solve(mat, y, x, n) +// +// where: +// * mat, y are accumulators for the values A'A and A'b respectively, +// * a, b are the coefficients of each individual equation, +// * x is the result vector +// * and n is the problem size +static INLINE void least_squares_init(double *mat, double *y, int n) { + memset(mat, 0, n * n * sizeof(double)); + memset(y, 0, n * sizeof(double)); +} + +// Round the given positive value to nearest integer +static AOM_FORCE_INLINE int iroundpf(float x) { + assert(x >= 0.0); + return (int)(x + 0.5f); +} + +static INLINE void least_squares_accumulate(double *mat, double *y, + const double *a, double b, int n) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + mat[i * n + j] += a[i] * a[j]; + } + } + for (int i = 0; i < n; i++) { + y[i] += a[i] * b; + } +} + +static INLINE int least_squares_solve(double *mat, double *y, double *x, + int n) { + return linsolve(n, mat, n, y, x); +} + +// Matrix multiply +static INLINE void multiply_mat(const double *m1, const double *m2, double *res, + const int m1_rows, const int inner_dim, + const int m2_cols) { + double sum; + + int row, col, inner; + for (row = 0; row < m1_rows; ++row) { + for (col = 0; col < m2_cols; ++col) { + sum = 0; + for (inner = 0; inner < inner_dim; ++inner) + sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col]; + *(res++) = sum; + } + } +} + +static AOM_INLINE float approx_exp(float y) { +#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) +#define B \ + 127 // Offset for the exponent according to IEEE floating point standard. +#define C 60801 // Magic number controls the accuracy of approximation + union { + float as_float; + int32_t as_int32; + } container; + container.as_int32 = ((int32_t)(y * A)) + ((B << 23) - C); + return container.as_float; +#undef A +#undef B +#undef C +} +#endif // AOM_AOM_DSP_MATHUTILS_H_ diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c new file mode 100644 index 0000000000..065ec9a106 --- /dev/null +++ b/third_party/aom/aom_dsp/noise_model.c @@ -0,0 +1,1692 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/mathutils.h" +#include "aom_dsp/noise_model.h" +#include "aom_dsp/noise_util.h" +#include "aom_mem/aom_mem.h" + +#define kLowPolyNumParams 3 + +static const int kMaxLag = 4; + +// Defines a function that can be used to obtain the mean of a block for the +// provided data type (uint8_t, or uint16_t) +#define GET_BLOCK_MEAN(INT_TYPE, suffix) \ + static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \ + int stride, int x_o, int y_o, \ + int block_size) { \ + const int max_h = AOMMIN(h - y_o, block_size); \ + const int max_w = AOMMIN(w - x_o, block_size); \ + double block_mean = 0; \ + for (int y = 0; y < max_h; ++y) { \ + for (int x = 0; x < max_w; ++x) { \ + block_mean += data[(y_o + y) * stride + x_o + x]; \ + } \ + } \ + return block_mean / (max_w * max_h); \ + } + +GET_BLOCK_MEAN(uint8_t, lowbd) +GET_BLOCK_MEAN(uint16_t, highbd) + +static INLINE double get_block_mean(const uint8_t *data, int w, int h, + int stride, int x_o, int y_o, + int block_size, int use_highbd) { + if (use_highbd) + return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o, + block_size); + return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size); +} + +// Defines a function that can be used to obtain the variance of a block +// for the provided data type (uint8_t, or uint16_t) +#define GET_NOISE_VAR(INT_TYPE, suffix) \ + static double get_noise_var_##suffix( \ + const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \ + int h, int x_o, int y_o, int block_size_x, int block_size_y) { \ + const int max_h = AOMMIN(h - y_o, block_size_y); \ + const int max_w = AOMMIN(w - x_o, block_size_x); \ + double noise_var = 0; \ + double noise_mean = 0; \ + for (int y = 0; y < max_h; ++y) { \ + for (int x = 0; x < max_w; ++x) { \ + double noise = (double)data[(y_o + y) * stride + x_o + x] - \ + denoised[(y_o + y) * stride + x_o + x]; \ + noise_mean += noise; \ + noise_var += noise * noise; \ + } \ + } \ + noise_mean /= (max_w * max_h); \ + return noise_var / (max_w * max_h) - noise_mean * noise_mean; \ + } + +GET_NOISE_VAR(uint8_t, lowbd) +GET_NOISE_VAR(uint16_t, highbd) + +static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised, + int w, int h, int stride, int x_o, int y_o, + int block_size_x, int block_size_y, + int use_highbd) { + if (use_highbd) + return get_noise_var_highbd((const uint16_t *)data, + (const uint16_t *)denoised, w, h, stride, x_o, + y_o, block_size_x, block_size_y); + return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o, + block_size_x, block_size_y); +} + +static void equation_system_clear(aom_equation_system_t *eqns) { + const int n = eqns->n; + memset(eqns->A, 0, sizeof(*eqns->A) * n * n); + memset(eqns->x, 0, sizeof(*eqns->x) * n); + memset(eqns->b, 0, sizeof(*eqns->b) * n); +} + +static void equation_system_copy(aom_equation_system_t *dst, + const aom_equation_system_t *src) { + const int n = dst->n; + memcpy(dst->A, src->A, sizeof(*dst->A) * n * n); + memcpy(dst->x, src->x, sizeof(*dst->x) * n); + memcpy(dst->b, src->b, sizeof(*dst->b) * n); +} + +static int equation_system_init(aom_equation_system_t *eqns, int n) { + eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n); + eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n); + eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n); + eqns->n = n; + if (!eqns->A || !eqns->b || !eqns->x) { + fprintf(stderr, "Failed to allocate system of equations of size 
%d\n", n); + aom_free(eqns->A); + aom_free(eqns->b); + aom_free(eqns->x); + memset(eqns, 0, sizeof(*eqns)); + return 0; + } + equation_system_clear(eqns); + return 1; +} + +static int equation_system_solve(aom_equation_system_t *eqns) { + const int n = eqns->n; + double *b = (double *)aom_malloc(sizeof(*b) * n); + double *A = (double *)aom_malloc(sizeof(*A) * n * n); + int ret = 0; + if (A == NULL || b == NULL) { + fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n); + aom_free(b); + aom_free(A); + return 0; + } + memcpy(A, eqns->A, sizeof(*eqns->A) * n * n); + memcpy(b, eqns->b, sizeof(*eqns->b) * n); + ret = linsolve(n, A, eqns->n, b, eqns->x); + aom_free(b); + aom_free(A); + + if (ret == 0) { + return 0; + } + return 1; +} + +static void equation_system_add(aom_equation_system_t *dest, + aom_equation_system_t *src) { + const int n = dest->n; + int i, j; + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + dest->A[i * n + j] += src->A[i * n + j]; + } + dest->b[i] += src->b[i]; + } +} + +static void equation_system_free(aom_equation_system_t *eqns) { + if (!eqns) return; + aom_free(eqns->A); + aom_free(eqns->b); + aom_free(eqns->x); + memset(eqns, 0, sizeof(*eqns)); +} + +static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) { + equation_system_clear(&solver->eqns); + solver->num_equations = 0; + solver->total = 0; +} + +static void noise_strength_solver_add(aom_noise_strength_solver_t *dest, + aom_noise_strength_solver_t *src) { + equation_system_add(&dest->eqns, &src->eqns); + dest->num_equations += src->num_equations; + dest->total += src->total; +} + +// Return the number of coefficients required for the given parameters +static int num_coeffs(const aom_noise_model_params_t params) { + const int n = 2 * params.lag + 1; + switch (params.shape) { + case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1); + case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2; + } + return 0; +} + +static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) { + const int kNumBins = 20; + if (!equation_system_init(&state->eqns, n)) { + fprintf(stderr, "Failed initialization noise state with size %d\n", n); + return 0; + } + state->ar_gain = 1.0; + state->num_observations = 0; + return aom_noise_strength_solver_init(&state->strength_solver, kNumBins, + bit_depth); +} + +static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) { + const double kTolerance = 1e-6; + const int last = eqns->n - 1; + // Set all of the AR coefficients to zero, but try to solve for correlation + // with the luma channel + memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n); + if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) { + eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last]; + } +} + +int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) { + if (!lut) return 0; + if (num_points <= 0) return 0; + lut->num_points = 0; + lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points)); + if (!lut->points) return 0; + lut->num_points = num_points; + memset(lut->points, 0, sizeof(*lut->points) * num_points); + return 1; +} + +void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) { + if (!lut) return; + aom_free(lut->points); + memset(lut, 0, sizeof(*lut)); +} + +double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, + double x) { + int i = 0; + // Constant extrapolation for x < x_0. 
+ if (x < lut->points[0][0]) return lut->points[0][1]; + for (i = 0; i < lut->num_points - 1; ++i) { + if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) { + const double a = + (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]); + return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a); + } + } + // Constant extrapolation for x > x_{n-1} + return lut->points[lut->num_points - 1][1]; +} + +static double noise_strength_solver_get_bin_index( + const aom_noise_strength_solver_t *solver, double value) { + const double val = + fclamp(value, solver->min_intensity, solver->max_intensity); + const double range = solver->max_intensity - solver->min_intensity; + return (solver->num_bins - 1) * (val - solver->min_intensity) / range; +} + +static double noise_strength_solver_get_value( + const aom_noise_strength_solver_t *solver, double x) { + const double bin = noise_strength_solver_get_bin_index(solver, x); + const int bin_i0 = (int)floor(bin); + const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); + const double a = bin - bin_i0; + return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1]; +} + +void aom_noise_strength_solver_add_measurement( + aom_noise_strength_solver_t *solver, double block_mean, double noise_std) { + const double bin = noise_strength_solver_get_bin_index(solver, block_mean); + const int bin_i0 = (int)floor(bin); + const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); + const double a = bin - bin_i0; + const int n = solver->num_bins; + solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a); + solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a); + solver->eqns.A[bin_i1 * n + bin_i1] += a * a; + solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a); + solver->eqns.b[bin_i0] += (1.0 - a) * noise_std; + solver->eqns.b[bin_i1] += a * noise_std; + solver->total += noise_std; + solver->num_equations++; +} + +int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) { + // Add regularization proportional to the number of constraints + const int n = solver->num_bins; + const double kAlpha = 2.0 * (double)(solver->num_equations) / n; + int result = 0; + double mean = 0; + + // Do this in a non-destructive manner so it is not confusing to the caller + double *old_A = solver->eqns.A; + double *A = (double *)aom_malloc(sizeof(*A) * n * n); + if (!A) { + fprintf(stderr, "Unable to allocate copy of A\n"); + return 0; + } + memcpy(A, old_A, sizeof(*A) * n * n); + + for (int i = 0; i < n; ++i) { + const int i_lo = AOMMAX(0, i - 1); + const int i_hi = AOMMIN(n - 1, i + 1); + A[i * n + i_lo] -= kAlpha; + A[i * n + i] += 2 * kAlpha; + A[i * n + i_hi] -= kAlpha; + } + + // Small regularization to give average noise strength + mean = solver->total / solver->num_equations; + for (int i = 0; i < n; ++i) { + A[i * n + i] += 1.0 / 8192.; + solver->eqns.b[i] += mean / 8192.; + } + solver->eqns.A = A; + result = equation_system_solve(&solver->eqns); + solver->eqns.A = old_A; + + aom_free(A); + return result; +} + +int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver, + int num_bins, int bit_depth) { + if (!solver) return 0; + memset(solver, 0, sizeof(*solver)); + solver->num_bins = num_bins; + solver->min_intensity = 0; + solver->max_intensity = (1 << bit_depth) - 1; + solver->total = 0; + solver->num_equations = 0; + return equation_system_init(&solver->eqns, num_bins); +} + +void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) { + if (!solver) return; + 
equation_system_free(&solver->eqns); +} + +double aom_noise_strength_solver_get_center( + const aom_noise_strength_solver_t *solver, int i) { + const double range = solver->max_intensity - solver->min_intensity; + const int n = solver->num_bins; + return ((double)i) / (n - 1) * range + solver->min_intensity; +} + +// Computes the residual if a point were to be removed from the lut. This is +// calculated as the area between the output of the solver and the line segment +// that would be formed between [x_{i - 1}, x_{i + 1}). +static void update_piecewise_linear_residual( + const aom_noise_strength_solver_t *solver, + const aom_noise_strength_lut_t *lut, double *residual, int start, int end) { + const double dx = 255. / solver->num_bins; + for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) { + const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index( + solver, lut->points[i - 1][0]))); + const int upper = AOMMIN(solver->num_bins - 1, + (int)ceil(noise_strength_solver_get_bin_index( + solver, lut->points[i + 1][0]))); + double r = 0; + for (int j = lower; j <= upper; ++j) { + const double x = aom_noise_strength_solver_get_center(solver, j); + if (x < lut->points[i - 1][0]) continue; + if (x >= lut->points[i + 1][0]) continue; + const double y = solver->eqns.x[j]; + const double a = (x - lut->points[i - 1][0]) / + (lut->points[i + 1][0] - lut->points[i - 1][0]); + const double estimate_y = + lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a; + r += fabs(y - estimate_y); + } + residual[i] = r * dx; + } +} + +int aom_noise_strength_solver_fit_piecewise( + const aom_noise_strength_solver_t *solver, int max_output_points, + aom_noise_strength_lut_t *lut) { + // The tolerance is normalized to be give consistent results between + // different bit-depths. 
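+  // Illustrative note (not part of the upstream source): with an 8-bit
+  // solver, max_intensity = 255 and kTolerance below is 255 * 0.00625 / 255 =
+  // 0.00625; at 10 bits, max_intensity = 1023 gives about 0.0251, i.e. the
+  // same tolerance relative to the intensity range.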
+ const double kTolerance = solver->max_intensity * 0.00625 / 255.0; + if (!aom_noise_strength_lut_init(lut, solver->num_bins)) { + fprintf(stderr, "Failed to init lut\n"); + return 0; + } + for (int i = 0; i < solver->num_bins; ++i) { + lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i); + lut->points[i][1] = solver->eqns.x[i]; + } + if (max_output_points < 0) { + max_output_points = solver->num_bins; + } + + double *residual = (double *)aom_malloc(solver->num_bins * sizeof(*residual)); + if (!residual) { + aom_noise_strength_lut_free(lut); + return 0; + } + memset(residual, 0, sizeof(*residual) * solver->num_bins); + + update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins); + + // Greedily remove points if there are too many or if it doesn't hurt local + // approximation (never remove the end points) + while (lut->num_points > 2) { + int min_index = 1; + for (int j = 1; j < lut->num_points - 1; ++j) { + if (residual[j] < residual[min_index]) { + min_index = j; + } + } + const double dx = + lut->points[min_index + 1][0] - lut->points[min_index - 1][0]; + const double avg_residual = residual[min_index] / dx; + if (lut->num_points <= max_output_points && avg_residual > kTolerance) { + break; + } + + const int num_remaining = lut->num_points - min_index - 1; + memmove(lut->points + min_index, lut->points + min_index + 1, + sizeof(lut->points[0]) * num_remaining); + lut->num_points--; + + update_piecewise_linear_residual(solver, lut, residual, min_index - 1, + min_index + 1); + } + aom_free(residual); + return 1; +} + +int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, + int block_size, int bit_depth, int use_highbd) { + const int n = block_size * block_size; + aom_equation_system_t eqns; + double *AtA_inv = 0; + double *A = 0; + int x = 0, y = 0, i = 0, j = 0; + block_finder->A = NULL; + block_finder->AtA_inv = NULL; + + if (!equation_system_init(&eqns, kLowPolyNumParams)) { + fprintf(stderr, "Failed to init equation system for block_size=%d\n", + block_size); + return 0; + } + + AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams * + sizeof(*AtA_inv)); + A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A)); + if (AtA_inv == NULL || A == NULL) { + fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n", + block_size); + aom_free(AtA_inv); + aom_free(A); + equation_system_free(&eqns); + return 0; + } + + block_finder->A = A; + block_finder->AtA_inv = AtA_inv; + block_finder->block_size = block_size; + block_finder->normalization = (1 << bit_depth) - 1; + block_finder->use_highbd = use_highbd; + + for (y = 0; y < block_size; ++y) { + const double yd = ((double)y - block_size / 2.) / (block_size / 2.); + for (x = 0; x < block_size; ++x) { + const double xd = ((double)x - block_size / 2.) / (block_size / 2.); + const double coords[3] = { yd, xd, 1 }; + const int row = y * block_size + x; + A[kLowPolyNumParams * row + 0] = yd; + A[kLowPolyNumParams * row + 1] = xd; + A[kLowPolyNumParams * row + 2] = 1; + + for (i = 0; i < kLowPolyNumParams; ++i) { + for (j = 0; j < kLowPolyNumParams; ++j) { + eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j]; + } + } + } + } + + // Lazy inverse using existing equation solver. 
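+  // Illustrative note (not part of the upstream source): the loop below
+  // recovers (A'A)^-1 one column at a time by solving A'A * x = e_i for each
+  // unit vector e_i (eqns.b is zeroed with b[i] = 1); the solution x is then
+  // stored as column i of AtA_inv. For the 3-parameter plane fit this is just
+  // three 3x3 solves, which avoids a dedicated matrix-inverse routine.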
+ for (i = 0; i < kLowPolyNumParams; ++i) { + memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams); + eqns.b[i] = 1; + equation_system_solve(&eqns); + + for (j = 0; j < kLowPolyNumParams; ++j) { + AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j]; + } + } + equation_system_free(&eqns); + return 1; +} + +void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) { + if (!block_finder) return; + aom_free(block_finder->A); + aom_free(block_finder->AtA_inv); + memset(block_finder, 0, sizeof(*block_finder)); +} + +void aom_flat_block_finder_extract_block( + const aom_flat_block_finder_t *block_finder, const uint8_t *const data, + int w, int h, int stride, int offsx, int offsy, double *plane, + double *block) { + const int block_size = block_finder->block_size; + const int n = block_size * block_size; + const double *A = block_finder->A; + const double *AtA_inv = block_finder->AtA_inv; + double plane_coords[kLowPolyNumParams]; + double AtA_inv_b[kLowPolyNumParams]; + int xi, yi, i; + + if (block_finder->use_highbd) { + const uint16_t *const data16 = (const uint16_t *const)data; + for (yi = 0; yi < block_size; ++yi) { + const int y = clamp(offsy + yi, 0, h - 1); + for (xi = 0; xi < block_size; ++xi) { + const int x = clamp(offsx + xi, 0, w - 1); + block[yi * block_size + xi] = + ((double)data16[y * stride + x]) / block_finder->normalization; + } + } + } else { + for (yi = 0; yi < block_size; ++yi) { + const int y = clamp(offsy + yi, 0, h - 1); + for (xi = 0; xi < block_size; ++xi) { + const int x = clamp(offsx + xi, 0, w - 1); + block[yi * block_size + xi] = + ((double)data[y * stride + x]) / block_finder->normalization; + } + } + } + multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams); + multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams, + kLowPolyNumParams, 1); + multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1); + + for (i = 0; i < n; ++i) { + block[i] -= plane[i]; + } +} + +typedef struct { + int index; + float score; +} index_and_score_t; + +static int compare_scores(const void *a, const void *b) { + const float diff = + ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score; + if (diff < 0) + return -1; + else if (diff > 0) + return 1; + return 0; +} + +int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, + const uint8_t *const data, int w, int h, + int stride, uint8_t *flat_blocks) { + // The gradient-based features used in this code are based on: + // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise + // correlation for improved video denoising," 2012 19th, ICIP. + // The thresholds are more lenient to allow for correct grain modeling + // if extreme cases. 
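+  // Illustrative note (not part of the upstream source): the loop below forms
+  // the 2x2 gradient structure tensor G = [[Gxx, Gxy], [Gxy, Gyy]] for each
+  // block. Its eigenvalues satisfy e = (trace +/- sqrt(trace^2 - 4 * det)) / 2
+  // with trace = Gxx + Gyy and det = Gxx * Gyy - Gxy * Gxy. A block is
+  // declared flat when the gradients are weak (small trace and spectral norm
+  // e1), roughly isotropic (small e1/e2 ratio), and there is still enough
+  // pixel variance for a meaningful noise estimate.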
+ const int block_size = block_finder->block_size; + const int n = block_size * block_size; + const double kTraceThreshold = 0.15 / (32 * 32); + const double kRatioThreshold = 1.25; + const double kNormThreshold = 0.08 / (32 * 32); + const double kVarThreshold = 0.005 / (double)n; + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + int num_flat = 0; + double *plane = (double *)aom_malloc(n * sizeof(*plane)); + double *block = (double *)aom_malloc(n * sizeof(*block)); + index_and_score_t *scores = (index_and_score_t *)aom_malloc( + num_blocks_w * num_blocks_h * sizeof(*scores)); + if (plane == NULL || block == NULL || scores == NULL) { + fprintf(stderr, "Failed to allocate memory for block of size %d\n", n); + aom_free(plane); + aom_free(block); + aom_free(scores); + return -1; + } + +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "score = ["); +#endif + for (int by = 0; by < num_blocks_h; ++by) { + for (int bx = 0; bx < num_blocks_w; ++bx) { + // Compute gradient covariance matrix. + aom_flat_block_finder_extract_block(block_finder, data, w, h, stride, + bx * block_size, by * block_size, + plane, block); + double Gxx = 0, Gxy = 0, Gyy = 0; + double mean = 0; + double var = 0; + + for (int yi = 1; yi < block_size - 1; ++yi) { + for (int xi = 1; xi < block_size - 1; ++xi) { + const double gx = (block[yi * block_size + xi + 1] - + block[yi * block_size + xi - 1]) / + 2; + const double gy = (block[yi * block_size + xi + block_size] - + block[yi * block_size + xi - block_size]) / + 2; + Gxx += gx * gx; + Gxy += gx * gy; + Gyy += gy * gy; + + const double value = block[yi * block_size + xi]; + mean += value; + var += value * value; + } + } + mean /= (block_size - 2) * (block_size - 2); + + // Normalize gradients by block_size. + Gxx /= ((block_size - 2) * (block_size - 2)); + Gxy /= ((block_size - 2) * (block_size - 2)); + Gyy /= ((block_size - 2) * (block_size - 2)); + var = var / ((block_size - 2) * (block_size - 2)) - mean * mean; + + { + const double trace = Gxx + Gyy; + const double det = Gxx * Gyy - Gxy * Gxy; + const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.; + const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.; + const double norm = e1; // Spectral norm + const double ratio = (e1 / AOMMAX(e2, 1e-6)); + const int is_flat = (trace < kTraceThreshold) && + (ratio < kRatioThreshold) && + (norm < kNormThreshold) && (var > kVarThreshold); + // The following weights are used to combine the above features to give + // a sigmoid score for flatness. If the input was normalized to [0,100] + // the magnitude of these values would be close to 1 (e.g., weights + // corresponding to variance would be a factor of 10000x smaller). + // The weights are given in the following order: + // [{var}, {ratio}, {trace}, {norm}, offset] + // with one of the most discriminative being simply the variance. + const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 }; + double sum_weights = weights[0] * var + weights[1] * ratio + + weights[2] * trace + weights[3] * norm + + weights[4]; + // clamp the value to [-25.0, 100.0] to prevent overflow + sum_weights = fclamp(sum_weights, -25.0, 100.0); + const float score = (float)(1.0 / (1 + exp(-sum_weights))); + flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0; + scores[by * num_blocks_w + bx].score = var > kVarThreshold ? 
score : 0; + scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx; +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm, + is_flat); +#endif + num_flat += is_flat; + } + } +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "\n"); +#endif + } +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "];\n"); +#endif + // Find the top-scored blocks (most likely to be flat) and set the flat blocks + // be the union of the thresholded results and the top 10th percentile of the + // scored results. + qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores); + const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100; + const float score_threshold = scores[top_nth_percentile].score; + for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) { + if (scores[i].score >= score_threshold) { + num_flat += flat_blocks[scores[i].index] == 0; + flat_blocks[scores[i].index] |= 1; + } + } + aom_free(block); + aom_free(plane); + aom_free(scores); + return num_flat; +} + +int aom_noise_model_init(aom_noise_model_t *model, + const aom_noise_model_params_t params) { + const int n = num_coeffs(params); + const int lag = params.lag; + const int bit_depth = params.bit_depth; + int x = 0, y = 0, i = 0, c = 0; + + memset(model, 0, sizeof(*model)); + if (params.lag < 1) { + fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag); + return 0; + } + if (params.lag > kMaxLag) { + fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag, + kMaxLag); + return 0; + } + if (!(params.bit_depth == 8 || params.bit_depth == 10 || + params.bit_depth == 12)) { + return 0; + } + + memcpy(&model->params, ¶ms, sizeof(params)); + for (c = 0; c < 3; ++c) { + if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) { + fprintf(stderr, "Failed to allocate noise state for channel %d\n", c); + aom_noise_model_free(model); + return 0; + } + if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) { + fprintf(stderr, "Failed to allocate noise state for channel %d\n", c); + aom_noise_model_free(model); + return 0; + } + } + model->n = n; + model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n); + if (!model->coords) { + aom_noise_model_free(model); + return 0; + } + + for (y = -lag; y <= 0; ++y) { + const int max_x = y == 0 ? -1 : lag; + for (x = -lag; x <= max_x; ++x) { + switch (params.shape) { + case AOM_NOISE_SHAPE_DIAMOND: + if (abs(x) <= y + lag) { + model->coords[i][0] = x; + model->coords[i][1] = y; + ++i; + } + break; + case AOM_NOISE_SHAPE_SQUARE: + model->coords[i][0] = x; + model->coords[i][1] = y; + ++i; + break; + default: + fprintf(stderr, "Invalid shape\n"); + aom_noise_model_free(model); + return 0; + } + } + } + assert(i == n); + return 1; +} + +void aom_noise_model_free(aom_noise_model_t *model) { + int c = 0; + if (!model) return; + + aom_free(model->coords); + for (c = 0; c < 3; ++c) { + equation_system_free(&model->latest_state[c].eqns); + equation_system_free(&model->combined_state[c].eqns); + + equation_system_free(&model->latest_state[c].strength_solver.eqns); + equation_system_free(&model->combined_state[c].strength_solver.eqns); + } + memset(model, 0, sizeof(*model)); +} + +// Extracts the neighborhood defined by coords around point (x, y) from +// the difference between the data and denoised images. Also extracts the +// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma). 
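+// On return, buffer[0..num_coords-1] holds the residual (data minus denoised)
+// at each neighbor offset, buffer[num_coords] holds the average co-located
+// luma residual when alt_data/alt_denoised are given, and the return value is
+// the residual at (x, y) itself.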
+#define EXTRACT_AR_ROW(INT_TYPE, suffix) \ + static double extract_ar_row_##suffix( \ + int(*coords)[2], int num_coords, const INT_TYPE *const data, \ + const INT_TYPE *const denoised, int stride, int sub_log2[2], \ + const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised, \ + int alt_stride, int x, int y, double *buffer) { \ + for (int i = 0; i < num_coords; ++i) { \ + const int x_i = x + coords[i][0], y_i = y + coords[i][1]; \ + buffer[i] = \ + (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \ + } \ + const double val = \ + (double)data[y * stride + x] - denoised[y * stride + x]; \ + \ + if (alt_data && alt_denoised) { \ + double avg_data = 0, avg_denoised = 0; \ + int num_samples = 0; \ + for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) { \ + const int y_up = (y << sub_log2[1]) + dy_i; \ + for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) { \ + const int x_up = (x << sub_log2[0]) + dx_i; \ + avg_data += alt_data[y_up * alt_stride + x_up]; \ + avg_denoised += alt_denoised[y_up * alt_stride + x_up]; \ + num_samples++; \ + } \ + } \ + buffer[num_coords] = (avg_data - avg_denoised) / num_samples; \ + } \ + return val; \ + } + +EXTRACT_AR_ROW(uint8_t, lowbd) +EXTRACT_AR_ROW(uint16_t, highbd) + +static int add_block_observations( + aom_noise_model_t *noise_model, int c, const uint8_t *const data, + const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2], + const uint8_t *const alt_data, const uint8_t *const alt_denoised, + int alt_stride, const uint8_t *const flat_blocks, int block_size, + int num_blocks_w, int num_blocks_h) { + const int lag = noise_model->params.lag; + const int num_coords = noise_model->n; + const double normalization = (1 << noise_model->params.bit_depth) - 1; + double *A = noise_model->latest_state[c].eqns.A; + double *b = noise_model->latest_state[c].eqns.b; + double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1)); + const int n = noise_model->latest_state[c].eqns.n; + + if (!buffer) { + fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1); + return 0; + } + for (int by = 0; by < num_blocks_h; ++by) { + const int y_o = by * (block_size >> sub_log2[1]); + for (int bx = 0; bx < num_blocks_w; ++bx) { + const int x_o = bx * (block_size >> sub_log2[0]); + if (!flat_blocks[by * num_blocks_w + bx]) { + continue; + } + int y_start = + (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag; + int x_start = + (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag; + int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), + block_size >> sub_log2[1]); + int x_end = AOMMIN( + (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag, + (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1]) + ? (block_size >> sub_log2[0]) + : ((block_size >> sub_log2[0]) - lag)); + for (int y = y_start; y < y_end; ++y) { + for (int x = x_start; x < x_end; ++x) { + const double val = + noise_model->params.use_highbd + ? 
extract_ar_row_highbd(noise_model->coords, num_coords, + (const uint16_t *const)data, + (const uint16_t *const)denoised, + stride, sub_log2, + (const uint16_t *const)alt_data, + (const uint16_t *const)alt_denoised, + alt_stride, x + x_o, y + y_o, buffer) + : extract_ar_row_lowbd(noise_model->coords, num_coords, data, + denoised, stride, sub_log2, alt_data, + alt_denoised, alt_stride, x + x_o, + y + y_o, buffer); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + A[i * n + j] += + (buffer[i] * buffer[j]) / (normalization * normalization); + } + b[i] += (buffer[i] * val) / (normalization * normalization); + } + noise_model->latest_state[c].num_observations++; + } + } + } + } + aom_free(buffer); + return 1; +} + +static void add_noise_std_observations( + aom_noise_model_t *noise_model, int c, const double *coeffs, + const uint8_t *const data, const uint8_t *const denoised, int w, int h, + int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride, + const uint8_t *const flat_blocks, int block_size, int num_blocks_w, + int num_blocks_h) { + const int num_coords = noise_model->n; + aom_noise_strength_solver_t *noise_strength_solver = + &noise_model->latest_state[c].strength_solver; + + const aom_noise_strength_solver_t *noise_strength_luma = + &noise_model->latest_state[0].strength_solver; + const double luma_gain = noise_model->latest_state[0].ar_gain; + const double noise_gain = noise_model->latest_state[c].ar_gain; + for (int by = 0; by < num_blocks_h; ++by) { + const int y_o = by * (block_size >> sub_log2[1]); + for (int bx = 0; bx < num_blocks_w; ++bx) { + const int x_o = bx * (block_size >> sub_log2[0]); + if (!flat_blocks[by * num_blocks_w + bx]) { + continue; + } + const int num_samples_h = + AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), + block_size >> sub_log2[1]); + const int num_samples_w = + AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]), + (block_size >> sub_log2[0])); + // Make sure that we have a reasonable amount of samples to consider the + // block + if (num_samples_w * num_samples_h > block_size) { + const double block_mean = get_block_mean( + alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride, + x_o << sub_log2[0], y_o << sub_log2[1], block_size, + noise_model->params.use_highbd); + const double noise_var = get_noise_var( + data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o, + y_o, block_size >> sub_log2[0], block_size >> sub_log2[1], + noise_model->params.use_highbd); + // We want to remove the part of the noise that came from being + // correlated with luma. Note that the noise solver for luma must + // have already been run. + const double luma_strength = + c > 0 ? luma_gain * noise_strength_solver_get_value( + noise_strength_luma, block_mean) + : 0; + const double corr = c > 0 ? coeffs[num_coords] : 0; + // Chroma noise: + // N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2) + // The uncorrelated component: + // uncorr_var = noise_var - (corr * luma_strength)^2 + // But don't allow fully correlated noise (hence the max), since the + // synthesis cannot model it. + const double uncorr_std = sqrt( + AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2))); + // After we've removed correlation with luma, undo the gain that will + // come from running the IIR filter. 
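+ // That is, the measured block strength is roughly ar_gain times the
+ // innovation strength that the solver should store, hence the division by
+ // noise_gain below.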
+ const double adjusted_strength = uncorr_std / noise_gain; + aom_noise_strength_solver_add_measurement( + noise_strength_solver, block_mean, adjusted_strength); + } + } + } +} + +// Return true if the noise estimate appears to be different from the combined +// (multi-frame) estimate. The difference is measured by checking whether the +// AR coefficients have diverged (using a threshold on normalized cross +// correlation), or whether the noise strength has changed. +static int is_noise_model_different(aom_noise_model_t *const noise_model) { + // These thresholds are kind of arbitrary and will likely need further tuning + // (or exported as parameters). The threshold on noise strength is a weighted + // difference between the noise strength histograms + const double kCoeffThreshold = 0.9; + const double kStrengthThreshold = + 0.005 * (1 << (noise_model->params.bit_depth - 8)); + for (int c = 0; c < 1; ++c) { + const double corr = + aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x, + noise_model->combined_state[c].eqns.x, + noise_model->combined_state[c].eqns.n); + if (corr < kCoeffThreshold) return 1; + + const double dx = + 1.0 / noise_model->latest_state[c].strength_solver.num_bins; + + const aom_equation_system_t *latest_eqns = + &noise_model->latest_state[c].strength_solver.eqns; + const aom_equation_system_t *combined_eqns = + &noise_model->combined_state[c].strength_solver.eqns; + double diff = 0; + double total_weight = 0; + for (int j = 0; j < latest_eqns->n; ++j) { + double weight = 0; + for (int i = 0; i < latest_eqns->n; ++i) { + weight += latest_eqns->A[i * latest_eqns->n + j]; + } + weight = sqrt(weight); + diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]); + total_weight += weight; + } + if (diff * dx / total_weight > kStrengthThreshold) return 1; + } + return 0; +} + +static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) { + const int ret = equation_system_solve(&state->eqns); + state->ar_gain = 1.0; + if (!ret) return ret; + + // Update the AR gain from the equation system as it will be used to fit + // the noise strength as a function of intensity. In the Yule-Walker + // equations, the diagonal should be the variance of the correlated noise. + // In the case of the least squares estimate, there will be some variability + // in the diagonal. So use the mean of the diagonal as the estimate of + // overall variance (this works for least squares or Yule-Walker formulation). + double var = 0; + const int n = state->eqns.n; + for (int i = 0; i < (state->eqns.n - is_chroma); ++i) { + var += state->eqns.A[i * n + i] / state->num_observations; + } + var /= (n - is_chroma); + + // Keep track of E(Y^2) = + E(X^2) + // In the case that we are using chroma and have an estimate of correlation + // with luma we adjust that estimate slightly to remove the correlated bits by + // subtracting out the last column of a scaled by our correlation estimate + // from b. E(y^2) = + double sum_covar = 0; + for (int i = 0; i < state->eqns.n - is_chroma; ++i) { + double bi = state->eqns.b[i]; + if (is_chroma) { + bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1]; + } + sum_covar += (bi * state->eqns.x[i]) / state->num_observations; + } + // Now, get an estimate of the variance of uncorrelated noise signal and use + // it to determine the gain of the AR filter. 
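+ // var is the total noise variance E[Y^2] and sum_covar is the part explained
+ // by the AR prediction, so the innovation variance is (var - sum_covar) and
+ // the filter gain (output std over innovation std) is sqrt(var / (var -
+ // sum_covar)), floored at 1.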
+ const double noise_var = AOMMAX(var - sum_covar, 1e-6); + state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6))); + return ret; +} + +aom_noise_status_t aom_noise_model_update( + aom_noise_model_t *const noise_model, const uint8_t *const data[3], + const uint8_t *const denoised[3], int w, int h, int stride[3], + int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) { + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + int y_model_different = 0; + int num_blocks = 0; + int i = 0, channel = 0; + + if (block_size <= 1) { + fprintf(stderr, "block_size = %d must be > 1\n", block_size); + return AOM_NOISE_STATUS_INVALID_ARGUMENT; + } + + if (block_size < noise_model->params.lag * 2 + 1) { + fprintf(stderr, "block_size = %d must be >= %d\n", block_size, + noise_model->params.lag * 2 + 1); + return AOM_NOISE_STATUS_INVALID_ARGUMENT; + } + + // Clear the latest equation system + for (i = 0; i < 3; ++i) { + equation_system_clear(&noise_model->latest_state[i].eqns); + noise_model->latest_state[i].num_observations = 0; + noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver); + } + + // Check that we have enough flat blocks + for (i = 0; i < num_blocks_h * num_blocks_w; ++i) { + if (flat_blocks[i]) { + num_blocks++; + } + } + + if (num_blocks <= 1) { + fprintf(stderr, "Not enough flat blocks to update noise estimate\n"); + return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS; + } + + for (channel = 0; channel < 3; ++channel) { + int no_subsampling[2] = { 0, 0 }; + const uint8_t *alt_data = channel > 0 ? data[0] : 0; + const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0; + int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling; + const int is_chroma = channel != 0; + if (!data[channel] || !denoised[channel]) break; + if (!add_block_observations(noise_model, channel, data[channel], + denoised[channel], w, h, stride[channel], sub, + alt_data, alt_denoised, stride[0], flat_blocks, + block_size, num_blocks_w, num_blocks_h)) { + fprintf(stderr, "Adding block observation failed\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + + if (!ar_equation_system_solve(&noise_model->latest_state[channel], + is_chroma)) { + if (is_chroma) { + set_chroma_coefficient_fallback_soln( + &noise_model->latest_state[channel].eqns); + } else { + fprintf(stderr, "Solving latest noise equation system failed %d!\n", + channel); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + add_noise_std_observations( + noise_model, channel, noise_model->latest_state[channel].eqns.x, + data[channel], denoised[channel], w, h, stride[channel], sub, alt_data, + stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h); + + if (!aom_noise_strength_solver_solve( + &noise_model->latest_state[channel].strength_solver)) { + fprintf(stderr, "Solving latest noise strength failed!\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + + // Check noise characteristics and return if error. + if (channel == 0 && + noise_model->combined_state[channel].strength_solver.num_equations > + 0 && + is_noise_model_different(noise_model)) { + y_model_different = 1; + } + + // Don't update the combined stats if the y model is different. 
+ if (y_model_different) continue; + + noise_model->combined_state[channel].num_observations += + noise_model->latest_state[channel].num_observations; + equation_system_add(&noise_model->combined_state[channel].eqns, + &noise_model->latest_state[channel].eqns); + if (!ar_equation_system_solve(&noise_model->combined_state[channel], + is_chroma)) { + if (is_chroma) { + set_chroma_coefficient_fallback_soln( + &noise_model->combined_state[channel].eqns); + } else { + fprintf(stderr, "Solving combined noise equation system failed %d!\n", + channel); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + noise_strength_solver_add( + &noise_model->combined_state[channel].strength_solver, + &noise_model->latest_state[channel].strength_solver); + + if (!aom_noise_strength_solver_solve( + &noise_model->combined_state[channel].strength_solver)) { + fprintf(stderr, "Solving combined noise strength failed!\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE + : AOM_NOISE_STATUS_OK; +} + +void aom_noise_model_save_latest(aom_noise_model_t *noise_model) { + for (int c = 0; c < 3; c++) { + equation_system_copy(&noise_model->combined_state[c].eqns, + &noise_model->latest_state[c].eqns); + equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns, + &noise_model->latest_state[c].strength_solver.eqns); + noise_model->combined_state[c].strength_solver.num_equations = + noise_model->latest_state[c].strength_solver.num_equations; + noise_model->combined_state[c].num_observations = + noise_model->latest_state[c].num_observations; + noise_model->combined_state[c].ar_gain = + noise_model->latest_state[c].ar_gain; + } +} + +int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, + aom_film_grain_t *film_grain) { + if (noise_model->params.lag > 3) { + fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag); + return 0; + } + uint16_t random_seed = film_grain->random_seed; + memset(film_grain, 0, sizeof(*film_grain)); + film_grain->random_seed = random_seed; + + film_grain->apply_grain = 1; + film_grain->update_parameters = 1; + + film_grain->ar_coeff_lag = noise_model->params.lag; + + // Convert the scaling functions to 8 bit values + aom_noise_strength_lut_t scaling_points[3]; + if (!aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[0].strength_solver, 14, + scaling_points + 0)) { + return 0; + } + if (!aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[1].strength_solver, 10, + scaling_points + 1)) { + aom_noise_strength_lut_free(scaling_points + 0); + return 0; + } + if (!aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[2].strength_solver, 10, + scaling_points + 2)) { + aom_noise_strength_lut_free(scaling_points + 0); + aom_noise_strength_lut_free(scaling_points + 1); + return 0; + } + + // Both the domain and the range of the scaling functions in the film_grain + // are normalized to 8-bit (e.g., they are implicitly scaled during grain + // synthesis). 
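+ // For example, with 10-bit input strength_divisor is 4, mapping both the
+ // intensity axis and the strength values back onto the 8-bit [0, 255] range
+ // used by the bitstream syntax.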
+ const double strength_divisor = 1 << (noise_model->params.bit_depth - 8); + double max_scaling_value = 1e-4; + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < scaling_points[c].num_points; ++i) { + scaling_points[c].points[i][0] = + AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor); + scaling_points[c].points[i][1] = + AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor); + max_scaling_value = + AOMMAX(scaling_points[c].points[i][1], max_scaling_value); + } + } + + // Scaling_shift values are in the range [8,11] + const int max_scaling_value_log2 = + clamp((int)floor(log2(max_scaling_value) + 1), 2, 5); + film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2); + + const double scale_factor = 1 << (8 - max_scaling_value_log2); + film_grain->num_y_points = scaling_points[0].num_points; + film_grain->num_cb_points = scaling_points[1].num_points; + film_grain->num_cr_points = scaling_points[2].num_points; + + int(*film_grain_scaling[3])[2] = { + film_grain->scaling_points_y, + film_grain->scaling_points_cb, + film_grain->scaling_points_cr, + }; + for (int c = 0; c < 3; c++) { + for (int i = 0; i < scaling_points[c].num_points; ++i) { + film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5); + film_grain_scaling[c][i][1] = clamp( + (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255); + } + } + aom_noise_strength_lut_free(scaling_points + 0); + aom_noise_strength_lut_free(scaling_points + 1); + aom_noise_strength_lut_free(scaling_points + 2); + + // Convert the ar_coeffs into 8-bit values + const int n_coeff = noise_model->combined_state[0].eqns.n; + double max_coeff = 1e-4, min_coeff = -1e-4; + double y_corr[2] = { 0, 0 }; + double avg_luma_strength = 0; + for (int c = 0; c < 3; c++) { + aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; + for (int i = 0; i < n_coeff; ++i) { + max_coeff = AOMMAX(max_coeff, eqns->x[i]); + min_coeff = AOMMIN(min_coeff, eqns->x[i]); + } + // Since the correlation between luma/chroma was computed in an already + // scaled space, we adjust it in the un-scaled space. + aom_noise_strength_solver_t *solver = + &noise_model->combined_state[c].strength_solver; + // Compute a weighted average of the strength for the channel. 
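+ // The weight for bin i is the square root of row i's sum in the solver's
+ // normal-equation matrix, which grows with the number of observations that
+ // fell near that intensity bin.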
+ double average_strength = 0, total_weight = 0; + for (int i = 0; i < solver->eqns.n; ++i) { + double w = 0; + for (int j = 0; j < solver->eqns.n; ++j) { + w += solver->eqns.A[i * solver->eqns.n + j]; + } + w = sqrt(w); + average_strength += solver->eqns.x[i] * w; + total_weight += w; + } + if (total_weight == 0) + average_strength = 1; + else + average_strength /= total_weight; + if (c == 0) { + avg_luma_strength = average_strength; + } else { + y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength; + max_coeff = AOMMAX(max_coeff, y_corr[c - 1]); + min_coeff = AOMMIN(min_coeff, y_corr[c - 1]); + } + } + // Shift value: AR coeffs range (values 6-9) + // 6: [-2, 2), 7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25) + film_grain->ar_coeff_shift = + clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))), + 6, 9); + double scale_ar_coeff = 1 << film_grain->ar_coeff_shift; + int *ar_coeffs[3] = { + film_grain->ar_coeffs_y, + film_grain->ar_coeffs_cb, + film_grain->ar_coeffs_cr, + }; + for (int c = 0; c < 3; ++c) { + aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; + for (int i = 0; i < n_coeff; ++i) { + ar_coeffs[c][i] = + clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127); + } + if (c > 0) { + ar_coeffs[c][n_coeff] = + clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127); + } + } + + // At the moment, the noise modeling code assumes that the chroma scaling + // functions are a function of luma. + film_grain->cb_mult = 128; // 8 bits + film_grain->cb_luma_mult = 192; // 8 bits + film_grain->cb_offset = 256; // 9 bits + + film_grain->cr_mult = 128; // 8 bits + film_grain->cr_luma_mult = 192; // 8 bits + film_grain->cr_offset = 256; // 9 bits + + film_grain->chroma_scaling_from_luma = 0; + film_grain->grain_scale_shift = 0; + film_grain->overlap_flag = 1; + return 1; +} + +static void pointwise_multiply(const float *a, float *b, int n) { + for (int i = 0; i < n; ++i) { + b[i] *= a[i]; + } +} + +static float *get_half_cos_window(int block_size) { + float *window_function = + (float *)aom_malloc(block_size * block_size * sizeof(*window_function)); + if (!window_function) return NULL; + for (int y = 0; y < block_size; ++y) { + const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2); + for (int x = 0; x < block_size; ++x) { + const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2); + window_function[y * block_size + x] = (float)(cos_yd * cos_xd); + } + } + return window_function; +} + +#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \ + static void dither_and_quantize_##suffix( \ + float *result, int result_stride, INT_TYPE *denoised, int w, int h, \ + int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \ + float block_normalization) { \ + for (int y = 0; y < (h >> chroma_sub_h); ++y) { \ + for (int x = 0; x < (w >> chroma_sub_w); ++x) { \ + const int result_idx = \ + (y + (block_size >> chroma_sub_h)) * result_stride + x + \ + (block_size >> chroma_sub_w); \ + INT_TYPE new_val = (INT_TYPE)AOMMIN( \ + AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \ + block_normalization); \ + const float err = \ + -(((float)new_val) / block_normalization - result[result_idx]); \ + denoised[y * stride + x] = new_val; \ + if (x + 1 < (w >> chroma_sub_w)) { \ + result[result_idx + 1] += err * 7.0f / 16.0f; \ + } \ + if (y + 1 < (h >> chroma_sub_h)) { \ + if (x > 0) { \ + result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \ + } \ + result[result_idx + result_stride] += err * 5.0f / 16.0f; \ + if (x + 1 < (w 
>> chroma_sub_w)) { \ + result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \ + } \ + } \ + } \ + } \ + } + +DITHER_AND_QUANTIZE(uint8_t, lowbd) +DITHER_AND_QUANTIZE(uint16_t, highbd) + +int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], + int w, int h, int stride[3], int chroma_sub[2], + float *noise_psd[3], int block_size, int bit_depth, + int use_highbd) { + float *plane = NULL, *block = NULL, *window_full = NULL, + *window_chroma = NULL; + double *block_d = NULL, *plane_d = NULL; + struct aom_noise_tx_t *tx_full = NULL; + struct aom_noise_tx_t *tx_chroma = NULL; + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + const int result_stride = (num_blocks_w + 2) * block_size; + const int result_height = (num_blocks_h + 2) * block_size; + float *result = NULL; + int init_success = 1; + aom_flat_block_finder_t block_finder_full; + aom_flat_block_finder_t block_finder_chroma; + const float kBlockNormalization = (float)((1 << bit_depth) - 1); + if (chroma_sub[0] != chroma_sub[1]) { + fprintf(stderr, + "aom_wiener_denoise_2d doesn't handle different chroma " + "subsampling\n"); + return 0; + } + init_success &= aom_flat_block_finder_init(&block_finder_full, block_size, + bit_depth, use_highbd); + result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride * + sizeof(*result)); + plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane)); + block = + (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block)); + block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d)); + plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d)); + window_full = get_half_cos_window(block_size); + tx_full = aom_noise_tx_malloc(block_size); + + if (chroma_sub[0] != 0) { + init_success &= aom_flat_block_finder_init(&block_finder_chroma, + block_size >> chroma_sub[0], + bit_depth, use_highbd); + window_chroma = get_half_cos_window(block_size >> chroma_sub[0]); + tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]); + } else { + window_chroma = window_full; + tx_chroma = tx_full; + } + + init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) && + (plane_d != NULL) && (block != NULL) && (block_d != NULL) && + (window_full != NULL) && (window_chroma != NULL) && + (result != NULL); + for (int c = init_success ? 0 : 3; c < 3; ++c) { + float *window_function = c == 0 ? window_full : window_chroma; + aom_flat_block_finder_t *block_finder = &block_finder_full; + const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0; + const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0; + struct aom_noise_tx_t *tx = + (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full; + if (!data[c] || !denoised[c]) continue; + if (c > 0 && chroma_sub[0] != 0) { + block_finder = &block_finder_chroma; + } + memset(result, 0, sizeof(*result) * result_stride * result_height); + // Do overlapped block processing (half overlapped). The block rows can + // easily be done in parallel + for (int offsy = 0; offsy < (block_size >> chroma_sub_h); + offsy += (block_size >> chroma_sub_h) / 2) { + for (int offsx = 0; offsx < (block_size >> chroma_sub_w); + offsx += (block_size >> chroma_sub_w) / 2) { + // Pad the boundary when processing each block-set. 
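+ // Blocks are indexed from -1 so the half-overlapped windows also cover the
+ // top/left borders; out-of-range pixels are handled by the coordinate
+ // clamping inside aom_flat_block_finder_extract_block, and results are
+ // written with a one-block offset into the padded result buffer.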
+ for (int by = -1; by < num_blocks_h; ++by) { + for (int bx = -1; bx < num_blocks_w; ++bx) { + const int pixels_per_block = + (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h); + aom_flat_block_finder_extract_block( + block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h, + stride[c], bx * (block_size >> chroma_sub_w) + offsx, + by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d); + for (int j = 0; j < pixels_per_block; ++j) { + block[j] = (float)block_d[j]; + plane[j] = (float)plane_d[j]; + } + pointwise_multiply(window_function, block, pixels_per_block); + aom_noise_tx_forward(tx, block); + aom_noise_tx_filter(tx, noise_psd[c]); + aom_noise_tx_inverse(tx, block); + + // Apply window function to the plane approximation (we will apply + // it to the sum of plane + block when composing the results). + pointwise_multiply(window_function, plane, pixels_per_block); + + for (int y = 0; y < (block_size >> chroma_sub_h); ++y) { + const int y_result = + y + (by + 1) * (block_size >> chroma_sub_h) + offsy; + for (int x = 0; x < (block_size >> chroma_sub_w); ++x) { + const int x_result = + x + (bx + 1) * (block_size >> chroma_sub_w) + offsx; + result[y_result * result_stride + x_result] += + (block[y * (block_size >> chroma_sub_w) + x] + + plane[y * (block_size >> chroma_sub_w) + x]) * + window_function[y * (block_size >> chroma_sub_w) + x]; + } + } + } + } + } + } + if (use_highbd) { + dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c], + w, h, stride[c], chroma_sub_w, chroma_sub_h, + block_size, kBlockNormalization); + } else { + dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h, + stride[c], chroma_sub_w, chroma_sub_h, + block_size, kBlockNormalization); + } + } + aom_free(result); + aom_free(plane); + aom_free(block); + aom_free(plane_d); + aom_free(block_d); + aom_free(window_full); + + aom_noise_tx_free(tx_full); + + aom_flat_block_finder_free(&block_finder_full); + if (chroma_sub[0] != 0) { + aom_flat_block_finder_free(&block_finder_chroma); + aom_free(window_chroma); + aom_noise_tx_free(tx_chroma); + } + return init_success; +} + +struct aom_denoise_and_model_t { + int block_size; + int bit_depth; + float noise_level; + + // Size of current denoised buffer and flat_block buffer + int width; + int height; + int y_stride; + int uv_stride; + int num_blocks_w; + int num_blocks_h; + + // Buffers for image and noise_psd allocated on the fly + float *noise_psd[3]; + uint8_t *denoised[3]; + uint8_t *flat_blocks; + + aom_flat_block_finder_t flat_block_finder; + aom_noise_model_t noise_model; +}; + +struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, + int block_size, + float noise_level) { + struct aom_denoise_and_model_t *ctx = + (struct aom_denoise_and_model_t *)aom_malloc( + sizeof(struct aom_denoise_and_model_t)); + if (!ctx) { + fprintf(stderr, "Unable to allocate denoise_and_model struct\n"); + return NULL; + } + memset(ctx, 0, sizeof(*ctx)); + + ctx->block_size = block_size; + ctx->noise_level = noise_level; + ctx->bit_depth = bit_depth; + + ctx->noise_psd[0] = + (float *)aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size); + ctx->noise_psd[1] = + (float *)aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size); + ctx->noise_psd[2] = + (float *)aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size); + if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) { + fprintf(stderr, "Unable to allocate noise PSD buffers\n"); + aom_denoise_and_model_free(ctx); + return 
NULL; + } + return ctx; +} + +void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) { + aom_free(ctx->flat_blocks); + for (int i = 0; i < 3; ++i) { + aom_free(ctx->denoised[i]); + aom_free(ctx->noise_psd[i]); + } + aom_noise_model_free(&ctx->noise_model); + aom_flat_block_finder_free(&ctx->flat_block_finder); + aom_free(ctx); +} + +static int denoise_and_model_realloc_if_necessary( + struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) { + if (ctx->width == sd->y_width && ctx->height == sd->y_height && + ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride) + return 1; + const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; + const int block_size = ctx->block_size; + + ctx->width = sd->y_width; + ctx->height = sd->y_height; + ctx->y_stride = sd->y_stride; + ctx->uv_stride = sd->uv_stride; + + for (int i = 0; i < 3; ++i) { + aom_free(ctx->denoised[i]); + ctx->denoised[i] = NULL; + } + aom_free(ctx->flat_blocks); + ctx->flat_blocks = NULL; + + ctx->denoised[0] = + (uint8_t *)aom_malloc((sd->y_stride * sd->y_height) << use_highbd); + ctx->denoised[1] = + (uint8_t *)aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); + ctx->denoised[2] = + (uint8_t *)aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); + if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) { + fprintf(stderr, "Unable to allocate denoise buffers\n"); + return 0; + } + ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size; + ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size; + ctx->flat_blocks = + (uint8_t *)aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h); + if (!ctx->flat_blocks) { + fprintf(stderr, "Unable to allocate flat_blocks buffer\n"); + return 0; + } + + aom_flat_block_finder_free(&ctx->flat_block_finder); + if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size, + ctx->bit_depth, use_highbd)) { + fprintf(stderr, "Unable to init flat block finder\n"); + return 0; + } + + const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, + ctx->bit_depth, use_highbd }; + aom_noise_model_free(&ctx->noise_model); + if (!aom_noise_model_init(&ctx->noise_model, params)) { + fprintf(stderr, "Unable to init noise model\n"); + return 0; + } + + // Simply use a flat PSD (although we could use the flat blocks to estimate + // PSD) those to estimate an actual noise PSD) + const float y_noise_level = + aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level); + const float uv_noise_level = aom_noise_psd_get_default_value( + ctx->block_size >> sd->subsampling_x, ctx->noise_level); + for (int i = 0; i < block_size * block_size; ++i) { + ctx->noise_psd[0][i] = y_noise_level; + ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level; + } + return 1; +} + +// TODO(aomedia:3151): Handle a monochrome image (sd->u_buffer and sd->v_buffer +// are null pointers) correctly. +int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, + YV12_BUFFER_CONFIG *sd, + aom_film_grain_t *film_grain, int apply_denoise) { + const int block_size = ctx->block_size; + const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; + uint8_t *raw_data[3] = { + use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer, + use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer, + use_highbd ? 
(uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer, + }; + const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] }; + int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride }; + int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y }; + + if (!denoise_and_model_realloc_if_necessary(ctx, sd)) { + fprintf(stderr, "Unable to realloc buffers\n"); + return 0; + } + + aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width, + sd->y_height, strides[0], ctx->flat_blocks); + + if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height, + strides, chroma_sub_log2, ctx->noise_psd, + block_size, ctx->bit_depth, use_highbd)) { + fprintf(stderr, "Unable to denoise image\n"); + return 0; + } + + const aom_noise_status_t status = aom_noise_model_update( + &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised, + sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks, + block_size); + int have_noise_estimate = 0; + if (status == AOM_NOISE_STATUS_OK) { + have_noise_estimate = 1; + } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) { + aom_noise_model_save_latest(&ctx->noise_model); + have_noise_estimate = 1; + } else { + // Unable to update noise model; proceed if we have a previous estimate. + have_noise_estimate = + (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0); + } + + film_grain->apply_grain = 0; + if (have_noise_estimate) { + if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) { + fprintf(stderr, "Unable to get grain parameters.\n"); + return 0; + } + if (!film_grain->random_seed) { + film_grain->random_seed = 7391; + } + if (apply_denoise) { + memcpy(raw_data[0], ctx->denoised[0], + (strides[0] * sd->y_height) << use_highbd); + if (!sd->monochrome) { + memcpy(raw_data[1], ctx->denoised[1], + (strides[1] * sd->uv_height) << use_highbd); + memcpy(raw_data[2], ctx->denoised[2], + (strides[2] * sd->uv_height) << use_highbd); + } + } + } + return 1; +} diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h new file mode 100644 index 0000000000..8228aeacfc --- /dev/null +++ b/third_party/aom/aom_dsp/noise_model.h @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_NOISE_MODEL_H_ +#define AOM_AOM_DSP_NOISE_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#include +#include "aom_dsp/grain_params.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" + +/*!\brief Wrapper of data required to represent linear system of eqns and soln. + */ +typedef struct { + double *A; + double *b; + double *x; + int n; +} aom_equation_system_t; + +/*!\brief Representation of a piecewise linear curve + * + * Holds n points as (x, y) pairs, that store the curve. 
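+ *
+ * For example, the two points {0, 4} and {255, 4} describe a constant noise
+ * strength of 4 across the whole intensity range; evaluation between points
+ * uses linear interpolation.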
+ */ +typedef struct { + double (*points)[2]; + int num_points; +} aom_noise_strength_lut_t; + +/*!\brief Init the noise strength lut with the given number of points*/ +int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points); + +/*!\brief Frees the noise strength lut. */ +void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut); + +/*!\brief Evaluate the lut at the point x. + * + * \param[in] lut The lut data. + * \param[in] x The coordinate to evaluate the lut. + */ +double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, + double x); + +/*!\brief Helper struct to model noise strength as a function of intensity. + * + * Internally, this structure holds a representation of a linear system + * of equations that models noise strength (standard deviation) as a + * function of intensity. The mapping is initially stored using a + * piecewise representation with evenly spaced bins that cover the entire + * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a + * constraint of the form: + * y_{i} (1 - a) + y_{i+1} a = y + * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and + * a = x/(x_{i+1} - x{i}). The equation system holds the corresponding + * normal equations. + * + * As there may be missing data, the solution is regularized to get a + * complete set of values for the bins. A reduced representation after + * solving can be obtained by getting the corresponding noise_strength_lut_t. + */ +typedef struct { + aom_equation_system_t eqns; + double min_intensity; + double max_intensity; + int num_bins; + int num_equations; + double total; +} aom_noise_strength_solver_t; + +/*!\brief Initializes the noise solver with the given number of bins. + * + * Returns 0 if initialization fails. + * + * \param[in] solver The noise solver to be initialized. + * \param[in] num_bins Number of bins to use in the internal representation. + * \param[in] bit_depth The bit depth used to derive {min,max}_intensity. + */ +int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver, + int num_bins, int bit_depth); +void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver); + +/*!\brief Gets the x coordinate of bin i. + * + * \param[in] i The bin whose coordinate to query. + */ +double aom_noise_strength_solver_get_center( + const aom_noise_strength_solver_t *solver, int i); + +/*!\brief Add an observation of the block mean intensity to its noise strength. + * + * \param[in] block_mean The average block intensity, + * \param[in] noise_std The observed noise strength. + */ +void aom_noise_strength_solver_add_measurement( + aom_noise_strength_solver_t *solver, double block_mean, double noise_std); + +/*!\brief Solves the current set of equations for the noise strength. */ +int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver); + +/*!\brief Fits a reduced piecewise linear lut to the internal solution + * + * \param[in] max_num_points The maximum number of output points + * \param[out] lut The output piecewise linear lut. + */ +int aom_noise_strength_solver_fit_piecewise( + const aom_noise_strength_solver_t *solver, int max_num_points, + aom_noise_strength_lut_t *lut); + +/*!\brief Helper for holding precomputed data for finding flat blocks. + * + * Internally a block is modeled with a low-order polynomial model. A + * planar model would be a bunch of equations like: + * <[y_i x_i 1], [a_1, a_2, a_3]> = b_i + * for each point in the block. 
The system matrix A with row i as [y_i x_i 1] + * is maintained as is the inverse, inv(A'*A), so that the plane parameters + * can be fit for each block. + */ +typedef struct { + double *AtA_inv; + double *A; + int num_params; // The number of parameters used for internal low-order model + int block_size; // The block size the finder was initialized with + double normalization; // Normalization factor (1 / (2^(bit_depth) - 1)) + int use_highbd; // Whether input data should be interpreted as uint16 +} aom_flat_block_finder_t; + +/*!\brief Init the block_finder with the given block size, bit_depth */ +int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, + int block_size, int bit_depth, int use_highbd); +void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder); + +/*!\brief Helper to extract a block and low order "planar" model. */ +void aom_flat_block_finder_extract_block( + const aom_flat_block_finder_t *block_finder, const uint8_t *const data, + int w, int h, int stride, int offsx, int offsy, double *plane, + double *block); + +/*!\brief Runs the flat block finder on the input data. + * + * Find flat blocks in the input image data. Returns a map of + * flat_blocks, where the value of flat_blocks map will be non-zero + * when a block is determined to be flat. A higher value indicates a bigger + * confidence in the decision. + */ +int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, + const uint8_t *const data, int w, int h, + int stride, uint8_t *flat_blocks); + +// The noise shape indicates the allowed coefficients in the AR model. +enum { + AOM_NOISE_SHAPE_DIAMOND = 0, + AOM_NOISE_SHAPE_SQUARE = 1 +} UENUM1BYTE(aom_noise_shape); + +// The parameters of the noise model include the shape type, lag, the +// bit depth of the input images provided, and whether the input images +// will be using uint16 (or uint8) representation. +typedef struct { + aom_noise_shape shape; + int lag; + int bit_depth; + int use_highbd; +} aom_noise_model_params_t; + +/*!\brief State of a noise model estimate for a single channel. + * + * This contains a system of equations that can be used to solve + * for the auto-regressive coefficients as well as a noise strength + * solver that can be used to model noise strength as a function of + * intensity. + */ +typedef struct { + aom_equation_system_t eqns; + aom_noise_strength_solver_t strength_solver; + int num_observations; // The number of observations in the eqn system + double ar_gain; // The gain of the current AR filter +} aom_noise_state_t; + +/*!\brief Complete model of noise for a planar video + * + * This includes a noise model for the latest frame and an aggregated + * estimate over all previous frames that had similar parameters. + */ +typedef struct { + aom_noise_model_params_t params; + aom_noise_state_t combined_state[3]; // Combined state per channel + aom_noise_state_t latest_state[3]; // Latest state per channel + int (*coords)[2]; // Offsets (x,y) of the coefficient samples + int n; // Number of parameters (size of coords) + int bit_depth; +} aom_noise_model_t; + +/*!\brief Result of a noise model update. */ +enum { + AOM_NOISE_STATUS_OK = 0, + AOM_NOISE_STATUS_INVALID_ARGUMENT, + AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS, + AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, + AOM_NOISE_STATUS_INTERNAL_ERROR, +} UENUM1BYTE(aom_noise_status_t); + +/*!\brief Initializes a noise model with the given parameters. + * + * Returns 0 on failure. 
+ */ +int aom_noise_model_init(aom_noise_model_t *model, + const aom_noise_model_params_t params); +void aom_noise_model_free(aom_noise_model_t *model); + +/*!\brief Updates the noise model with a new frame observation. + * + * Updates the noise model with measurements from the given input frame and a + * denoised variant of it. Noise is sampled from flat blocks using the flat + * block map. + * + * Returns a noise_status indicating if the update was successful. If the + * Update was successful, the combined_state is updated with measurements from + * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest noise + * state will be updated with measurements from the provided frame. + * + * \param[in,out] noise_model The noise model to be updated + * \param[in] data Raw frame data + * \param[in] denoised Denoised frame data. + * \param[in] w Frame width + * \param[in] h Frame height + * \param[in] strides Stride of the planes + * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0. + * \param[in] flat_blocks A map to blocks that have been determined flat + * \param[in] block_size The size of blocks. + */ +aom_noise_status_t aom_noise_model_update( + aom_noise_model_t *const noise_model, const uint8_t *const data[3], + const uint8_t *const denoised[3], int w, int h, int strides[3], + int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size); + +/*\brief Save the "latest" estimate into the "combined" estimate. + * + * This is meant to be called when the noise modeling detected a change + * in parameters (or for example, if a user wanted to reset estimation at + * a shot boundary). + */ +void aom_noise_model_save_latest(aom_noise_model_t *noise_model); + +/*!\brief Converts the noise_model parameters to the corresponding + * grain_parameters. + * + * The noise structs in this file are suitable for estimation (e.g., using + * floats), but the grain parameters in the bitstream are quantized. This + * function does the conversion by selecting the correct quantization levels. + */ +int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, + aom_film_grain_t *film_grain); + +/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd. + * + * \param[in] data Raw frame data + * \param[out] denoised Denoised frame data + * \param[in] w Frame width + * \param[in] h Frame height + * \param[in] stride Stride of the planes + * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0. + * \param[in] noise_psd The power spectral density of the noise + * \param[in] block_size The size of blocks + * \param[in] bit_depth Bit depth of the image + * \param[in] use_highbd If true, uint8 pointers are interpreted as + * uint16 and stride is measured in uint16. + * This must be true when bit_depth >= 10. + */ +int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], + int w, int h, int stride[3], int chroma_sub_log2[2], + float *noise_psd[3], int block_size, int bit_depth, + int use_highbd); + +struct aom_denoise_and_model_t; + +/*!\brief Denoise the buffer and model the residual noise. + * + * This is meant to be called sequentially on input frames. The input buffer + * is denoised and the residual noise is modelled. The current noise estimate + * is populated in film_grain. Returns true on success. The grain.apply_grain + * parameter will be true when the input buffer was successfully denoised and + * grain was modelled. Returns false on error. 
+ * + * \param[in] ctx Struct allocated with + * aom_denoise_and_model_alloc that holds some + * buffers for denoising and the current noise + * estimate. + * \param[in,out] buf The raw input buffer to be denoised. + * \param[out] grain Output film grain parameters + * \param[in] apply_denoise Whether or not to apply the denoising to the + * frame that will be encoded + */ +int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, + YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain, + int apply_denoise); + +/*!\brief Allocates a context that can be used for denoising and noise modeling. + * + * \param[in] bit_depth Bit depth of buffers this will be run on. + * \param[in] block_size Block size for noise modeling and flat block + * estimation + * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for + * higher levels of noise) + */ +struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, + int block_size, + float noise_level); + +/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc + */ +void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_AOM_DSP_NOISE_MODEL_H_ diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c new file mode 100644 index 0000000000..3ded8cb099 --- /dev/null +++ b/third_party/aom/aom_dsp/noise_util.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include +#include +#include + +#include "aom_dsp/noise_util.h" +#include "aom_dsp/fft_common.h" +#include "aom_mem/aom_mem.h" +#include "config/aom_dsp_rtcd.h" + +float aom_noise_psd_get_default_value(int block_size, float factor) { + return (factor * factor / 10000) * block_size * block_size / 8; +} + +// Internal representation of noise transform. It keeps track of the +// transformed data and a temporary working buffer to use during the +// transform. 
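+// The tx_block and temp buffers hold block_size * block_size complex
+// coefficients stored as interleaved (real, imaginary) float pairs, which is
+// why the allocations below carry a factor of 2.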
+struct aom_noise_tx_t { + float *tx_block; + float *temp; + int block_size; + void (*fft)(const float *, float *, float *); + void (*ifft)(const float *, float *, float *); +}; + +struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) { + struct aom_noise_tx_t *noise_tx = + (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t)); + if (!noise_tx) return NULL; + memset(noise_tx, 0, sizeof(*noise_tx)); + switch (block_size) { + case 2: + noise_tx->fft = aom_fft2x2_float; + noise_tx->ifft = aom_ifft2x2_float; + break; + case 4: + noise_tx->fft = aom_fft4x4_float; + noise_tx->ifft = aom_ifft4x4_float; + break; + case 8: + noise_tx->fft = aom_fft8x8_float; + noise_tx->ifft = aom_ifft8x8_float; + break; + case 16: + noise_tx->fft = aom_fft16x16_float; + noise_tx->ifft = aom_ifft16x16_float; + break; + case 32: + noise_tx->fft = aom_fft32x32_float; + noise_tx->ifft = aom_ifft32x32_float; + break; + default: + aom_free(noise_tx); + fprintf(stderr, "Unsupported block size %d\n", block_size); + return NULL; + } + noise_tx->block_size = block_size; + noise_tx->tx_block = (float *)aom_memalign( + 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); + noise_tx->temp = (float *)aom_memalign( + 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size); + if (!noise_tx->tx_block || !noise_tx->temp) { + aom_noise_tx_free(noise_tx); + return NULL; + } + // Clear the buffers up front. Some outputs of the forward transform are + // real only (the imaginary component will never be touched) + memset(noise_tx->tx_block, 0, + 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); + memset(noise_tx->temp, 0, + 2 * sizeof(*noise_tx->temp) * block_size * block_size); + return noise_tx; +} + +void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) { + noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block); +} + +void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) { + const int block_size = noise_tx->block_size; + const float kBeta = 1.1f; + const float kEps = 1e-6f; + for (int y = 0; y < block_size; ++y) { + for (int x = 0; x < block_size; ++x) { + int i = y * block_size + x; + float *c = noise_tx->tx_block + 2 * i; + const float c0 = AOMMAX((float)fabs(c[0]), 1e-8f); + const float c1 = AOMMAX((float)fabs(c[1]), 1e-8f); + const float p = c0 * c0 + c1 * c1; + if (p > kBeta * psd[i] && p > 1e-6) { + noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps); + noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps); + } else { + noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta; + noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta; + } + } + } +} + +void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) { + const int n = noise_tx->block_size * noise_tx->block_size; + noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data); + for (int i = 0; i < n; ++i) { + data[i] /= n; + } +} + +void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx, + float *psd) { + const int block_size = noise_tx->block_size; + for (int yb = 0; yb < block_size; ++yb) { + for (int xb = 0; xb <= block_size / 2; ++xb) { + float *c = noise_tx->tx_block + 2 * (yb * block_size + xb); + psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1]; + } + } +} + +void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) { + if (!noise_tx) return; + aom_free(noise_tx->tx_block); + aom_free(noise_tx->temp); + aom_free(noise_tx); +} + +double aom_normalized_cross_correlation(const double *a, const double *b, + int n) { + double c = 0; + 
double a_len = 0; + double b_len = 0; + for (int i = 0; i < n; ++i) { + a_len += a[i] * a[i]; + b_len += b[i] * b[i]; + c += a[i] * b[i]; + } + return c / (sqrt(a_len) * sqrt(b_len)); +} + +int aom_noise_data_validate(const double *data, int w, int h) { + const double kVarianceThreshold = 2; + const double kMeanThreshold = 2; + + int x = 0, y = 0; + int ret_value = 1; + double var = 0, mean = 0; + double *mean_x, *mean_y, *var_x, *var_y; + + // Check that noise variance is not increasing in x or y + // and that the data is zero mean. + mean_x = (double *)aom_calloc(w, sizeof(*mean_x)); + var_x = (double *)aom_calloc(w, sizeof(*var_x)); + mean_y = (double *)aom_calloc(h, sizeof(*mean_x)); + var_y = (double *)aom_calloc(h, sizeof(*var_y)); + if (!(mean_x && var_x && mean_y && var_y)) { + aom_free(mean_x); + aom_free(mean_y); + aom_free(var_x); + aom_free(var_y); + return 0; + } + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + const double d = data[y * w + x]; + var_x[x] += d * d; + var_y[y] += d * d; + mean_x[x] += d; + mean_y[y] += d; + var += d * d; + mean += d; + } + } + mean /= (w * h); + var = var / (w * h) - mean * mean; + + for (y = 0; y < h; ++y) { + mean_y[y] /= h; + var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y]; + if (fabs(var_y[y] - var) >= kVarianceThreshold) { + fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var); + ret_value = 0; + break; + } + if (fabs(mean_y[y] - mean) >= kMeanThreshold) { + fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean); + ret_value = 0; + break; + } + } + + for (x = 0; x < w; ++x) { + mean_x[x] /= w; + var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x]; + if (fabs(var_x[x] - var) >= kVarianceThreshold) { + fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var); + ret_value = 0; + break; + } + if (fabs(mean_x[x] - mean) >= kMeanThreshold) { + fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean); + ret_value = 0; + break; + } + } + + aom_free(mean_x); + aom_free(mean_y); + aom_free(var_x); + aom_free(var_y); + + return ret_value; +} diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h new file mode 100644 index 0000000000..2284a171a4 --- /dev/null +++ b/third_party/aom/aom_dsp/noise_util.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_NOISE_UTIL_H_ +#define AOM_AOM_DSP_NOISE_UTIL_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// aom_noise_tx_t is an abstraction of a transform that is used for denoising. +// It is meant to be lightweight and does hold the transformed data (as +// the user should not be manipulating the transformed data directly). +struct aom_noise_tx_t; + +// Allocates and returns a aom_noise_tx_t useful for denoising the given +// block_size. The resulting aom_noise_tx_t should be free'd with +// aom_noise_tx_free. 
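
The prototypes that follow make up the whole denoising-transform API, so a short usage sketch may help. The sketch below is not part of the patch: the caller name denoise_block, the 32x32 block size, the factor 2.5 and the flat default PSD are illustrative assumptions, while every aom_noise_tx_* call and aom_noise_psd_get_default_value() come from the declarations below.

    /* Hypothetical caller, sketched against the declarations below: denoise one
     * 32x32 block of float samples in place.  `noisy` must hold 32*32 floats and,
     * per the comments below, be 32-byte aligned. */
    #include <stdlib.h>
    #include "aom_dsp/noise_util.h"

    static int denoise_block(float *noisy) {
      const int block_size = 32;
      struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
      float *psd = (float *)malloc(sizeof(*psd) * block_size * block_size);
      if (!tx || !psd) {
        aom_noise_tx_free(tx);  /* safe on NULL */
        free(psd);
        return 0;
      }
      /* Constant PSD from the library default; estimates accumulated with
       * aom_noise_tx_add_energy() could be used instead. */
      const float flat = aom_noise_psd_get_default_value(block_size, 2.5f);
      for (int i = 0; i < block_size * block_size; ++i) psd[i] = flat;
      aom_noise_tx_forward(tx, noisy);  /* forward transform of the noisy block */
      aom_noise_tx_filter(tx, psd);     /* frequency-domain shrinkage against the PSD */
      aom_noise_tx_inverse(tx, noisy);  /* back to the pixel domain */
      aom_noise_tx_free(tx);
      free(psd);
      return 1;
    }
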
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size); +void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx); + +// Transforms the internal data and holds it in the aom_noise_tx's internal +// buffer. For compatibility with existing SIMD implementations, "data" must +// be 32-byte aligned. +void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx, + const float *data); + +// Filters aom_noise_tx's internal data using the provided noise power spectral +// density. The PSD must be at least block_size * block_size and should be +// populated with a constant or via estimates taken from +// aom_noise_tx_add_energy. +void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd); + +// Performs an inverse transform using the internal transform data. +// For compatibility with existing SIMD implementations, "data" must be 32-byte +// aligned. +void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data); + +// Aggregates the power of the buffered transform data into the psd buffer. +void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx, + float *psd); + +// Returns a default value suitable for denosing a transform of the given +// block_size. The noise "factor" determines the strength of the noise to +// be removed. A value of about 2.5 can be used for moderate denoising, +// where a value of 5.0 can be used for a high level of denoising. +float aom_noise_psd_get_default_value(int block_size, float factor); + +// Computes normalized cross correlation of two vectors a and b of length n. +double aom_normalized_cross_correlation(const double *a, const double *b, + int n); + +// Validates the correlated noise in the data buffer of size (w, h). +int aom_noise_data_validate(const double *data, int w, int h); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // AOM_AOM_DSP_NOISE_UTIL_H_ diff --git a/third_party/aom/aom_dsp/odintrin.c b/third_party/aom/aom_dsp/odintrin.c new file mode 100644 index 0000000000..eb6d8d8771 --- /dev/null +++ b/third_party/aom/aom_dsp/odintrin.c @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* clang-format off */ + +#include "aom_dsp/odintrin.h" + +/*Constants for use with OD_DIVU_SMALL(). + See \cite{Rob05} for details on computing these constants. + @INPROCEEDINGS{Rob05, + author="Arch D. Robison", + title="{N}-bit Unsigned Division via {N}-bit Multiply-Add", + booktitle="Proc. 
of the 17th IEEE Symposium on Computer Arithmetic + (ARITH'05)", + pages="131--139", + address="Cape Cod, MA", + month=Jun, + year=2005 + }*/ +uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2] = { + { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xAAAAAAAB, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xCCCCCCCD, 0 }, { 0xAAAAAAAB, 0 }, + { 0x92492492, 0x92492492 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xE38E38E4, 0 }, { 0xCCCCCCCD, 0 }, + { 0xBA2E8BA3, 0 }, { 0xAAAAAAAB, 0 }, + { 0x9D89D89E, 0 }, { 0x92492492, 0x92492492 }, + { 0x88888889, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xF0F0F0F1, 0 }, { 0xE38E38E4, 0 }, + { 0xD79435E5, 0xD79435E5 }, { 0xCCCCCCCD, 0 }, + { 0xC30C30C3, 0xC30C30C3 }, { 0xBA2E8BA3, 0 }, + { 0xB21642C9, 0 }, { 0xAAAAAAAB, 0 }, + { 0xA3D70A3E, 0 }, { 0x9D89D89E, 0 }, + { 0x97B425ED, 0x97B425ED }, { 0x92492492, 0x92492492 }, + { 0x8D3DCB09, 0 }, { 0x88888889, 0 }, + { 0x84210842, 0x84210842 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xF83E0F84, 0 }, { 0xF0F0F0F1, 0 }, + { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE38E38E4, 0 }, + { 0xDD67C8A6, 0xDD67C8A6 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD20D20D2, 0xD20D20D2 }, { 0xCCCCCCCD, 0 }, + { 0xC7CE0C7D, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xBE82FA0C, 0 }, { 0xBA2E8BA3, 0 }, + { 0xB60B60B6, 0xB60B60B6 }, { 0xB21642C9, 0 }, + { 0xAE4C415D, 0 }, { 0xAAAAAAAB, 0 }, + { 0xA72F053A, 0 }, { 0xA3D70A3E, 0 }, + { 0xA0A0A0A1, 0 }, { 0x9D89D89E, 0 }, + { 0x9A90E7D9, 0x9A90E7D9 }, { 0x97B425ED, 0x97B425ED }, + { 0x94F2094F, 0x94F2094F }, { 0x92492492, 0x92492492 }, + { 0x8FB823EE, 0x8FB823EE }, { 0x8D3DCB09, 0 }, + { 0x8AD8F2FC, 0 }, { 0x88888889, 0 }, + { 0x864B8A7E, 0 }, { 0x84210842, 0x84210842 }, + { 0x82082082, 0x82082082 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFC0FC0FD, 0 }, { 0xF83E0F84, 0 }, + { 0xF4898D60, 0 }, { 0xF0F0F0F1, 0 }, + { 0xED7303B6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE6C2B449, 0 }, { 0xE38E38E4, 0 }, + { 0xE070381C, 0xE070381C }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDA740DA8, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD4C77B04, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xCF6474A9, 0 }, { 0xCCCCCCCD, 0 }, + { 0xCA4587E7, 0 }, { 0xC7CE0C7D, 0 }, + { 0xC565C87C, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC0C0C0C1, 0 }, { 0xBE82FA0C, 0 }, + { 0xBC52640C, 0 }, { 0xBA2E8BA3, 0 }, + { 0xB81702E1, 0 }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB40B40B4, 0xB40B40B4 }, { 0xB21642C9, 0 }, + { 0xB02C0B03, 0 }, { 0xAE4C415D, 0 }, + { 0xAC769184, 0xAC769184 }, { 0xAAAAAAAB, 0 }, + { 0xA8E83F57, 0xA8E83F57 }, { 0xA72F053A, 0 }, + { 0xA57EB503, 0 }, { 0xA3D70A3E, 0 }, + { 0xA237C32B, 0xA237C32B }, { 0xA0A0A0A1, 0 }, + { 0x9F1165E7, 0x9F1165E7 }, { 0x9D89D89E, 0 }, + { 0x9C09C09C, 0x9C09C09C }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x991F1A51, 0x991F1A51 }, { 0x97B425ED, 0x97B425ED }, + { 0x964FDA6C, 0x964FDA6C }, { 0x94F2094F, 0x94F2094F }, + { 0x939A85C4, 0x939A85C4 }, { 0x92492492, 0x92492492 }, + { 0x90FDBC09, 0x90FDBC09 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8E78356D, 0x8E78356D }, { 0x8D3DCB09, 0 }, + { 0x8C08C08C, 0x8C08C08C }, { 0x8AD8F2FC, 0 }, + { 0x89AE408A, 0 }, { 0x88888889, 0 }, + { 0x8767AB5F, 0x8767AB5F }, { 0x864B8A7E, 0 }, + { 0x85340853, 0x85340853 }, { 0x84210842, 0x84210842 }, + { 0x83126E98, 0 }, { 0x82082082, 0x82082082 }, + { 0x81020408, 0x81020408 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFE03F810, 0 }, { 0xFC0FC0FD, 0 }, + { 0xFA232CF3, 0 }, { 0xF83E0F84, 0 }, + { 0xF6603D99, 0 }, { 0xF4898D60, 0 }, + { 0xF2B9D649, 0 }, { 0xF0F0F0F1, 0 }, + { 0xEF2EB720, 0 }, { 0xED7303B6, 0 }, + { 0xEBBDB2A6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE865AC7C, 0 }, { 0xE6C2B449, 0 
}, + { 0xE525982B, 0 }, { 0xE38E38E4, 0 }, + { 0xE1FC780F, 0 }, { 0xE070381C, 0xE070381C }, + { 0xDEE95C4D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDBEB61EF, 0 }, { 0xDA740DA8, 0 }, + { 0xD901B204, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD62B80D7, 0 }, { 0xD4C77B04, 0 }, + { 0xD3680D37, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xD0B69FCC, 0 }, { 0xCF6474A9, 0 }, + { 0xCE168A77, 0xCE168A77 }, { 0xCCCCCCCD, 0 }, + { 0xCB8727C1, 0 }, { 0xCA4587E7, 0 }, + { 0xC907DA4F, 0 }, { 0xC7CE0C7D, 0 }, + { 0xC6980C6A, 0 }, { 0xC565C87C, 0 }, + { 0xC4372F86, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC1E4BBD6, 0 }, { 0xC0C0C0C1, 0 }, + { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBE82FA0C, 0 }, + { 0xBD691047, 0xBD691047 }, { 0xBC52640C, 0 }, + { 0xBB3EE722, 0 }, { 0xBA2E8BA3, 0 }, + { 0xB92143FA, 0xB92143FA }, { 0xB81702E1, 0 }, + { 0xB70FBB5A, 0xB70FBB5A }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB509E68B, 0 }, { 0xB40B40B4, 0xB40B40B4 }, + { 0xB30F6353, 0 }, { 0xB21642C9, 0 }, + { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB02C0B03, 0 }, + { 0xAF3ADDC7, 0 }, { 0xAE4C415D, 0 }, + { 0xAD602B58, 0xAD602B58 }, { 0xAC769184, 0xAC769184 }, + { 0xAB8F69E3, 0 }, { 0xAAAAAAAB, 0 }, + { 0xA9C84A48, 0 }, { 0xA8E83F57, 0xA8E83F57 }, + { 0xA80A80A8, 0xA80A80A8 }, { 0xA72F053A, 0 }, + { 0xA655C439, 0xA655C439 }, { 0xA57EB503, 0 }, + { 0xA4A9CF1E, 0 }, { 0xA3D70A3E, 0 }, + { 0xA3065E40, 0 }, { 0xA237C32B, 0xA237C32B }, + { 0xA16B312F, 0 }, { 0xA0A0A0A1, 0 }, + { 0x9FD809FE, 0 }, { 0x9F1165E7, 0x9F1165E7 }, + { 0x9E4CAD24, 0 }, { 0x9D89D89E, 0 }, + { 0x9CC8E161, 0 }, { 0x9C09C09C, 0x9C09C09C }, + { 0x9B4C6F9F, 0 }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x99D722DB, 0 }, { 0x991F1A51, 0x991F1A51 }, + { 0x9868C80A, 0 }, { 0x97B425ED, 0x97B425ED }, + { 0x97012E02, 0x97012E02 }, { 0x964FDA6C, 0x964FDA6C }, + { 0x95A02568, 0x95A02568 }, { 0x94F2094F, 0x94F2094F }, + { 0x94458094, 0x94458094 }, { 0x939A85C4, 0x939A85C4 }, + { 0x92F11384, 0x92F11384 }, { 0x92492492, 0x92492492 }, + { 0x91A2B3C5, 0 }, { 0x90FDBC09, 0x90FDBC09 }, + { 0x905A3863, 0x905A3863 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8F1779DA, 0 }, { 0x8E78356D, 0x8E78356D }, + { 0x8DDA5202, 0x8DDA5202 }, { 0x8D3DCB09, 0 }, + { 0x8CA29C04, 0x8CA29C04 }, { 0x8C08C08C, 0x8C08C08C }, + { 0x8B70344A, 0x8B70344A }, { 0x8AD8F2FC, 0 }, + { 0x8A42F870, 0x8A42F870 }, { 0x89AE408A, 0 }, + { 0x891AC73B, 0 }, { 0x88888889, 0 }, + { 0x87F78088, 0 }, { 0x8767AB5F, 0x8767AB5F }, + { 0x86D90545, 0 }, { 0x864B8A7E, 0 }, + { 0x85BF3761, 0x85BF3761 }, { 0x85340853, 0x85340853 }, + { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84210842, 0x84210842 }, + { 0x83993052, 0x83993052 }, { 0x83126E98, 0 }, + { 0x828CBFBF, 0 }, { 0x82082082, 0x82082082 }, + { 0x81848DA9, 0 }, { 0x81020408, 0x81020408 }, + { 0x80808081, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFF00FF01, 0 }, { 0xFE03F810, 0 }, + { 0xFD08E551, 0 }, { 0xFC0FC0FD, 0 }, + { 0xFB188566, 0 }, { 0xFA232CF3, 0 }, + { 0xF92FB222, 0 }, { 0xF83E0F84, 0 }, + { 0xF74E3FC3, 0 }, { 0xF6603D99, 0 }, + { 0xF57403D6, 0 }, { 0xF4898D60, 0 }, + { 0xF3A0D52D, 0 }, { 0xF2B9D649, 0 }, + { 0xF1D48BCF, 0 }, { 0xF0F0F0F1, 0 }, + { 0xF00F00F0, 0xF00F00F0 }, { 0xEF2EB720, 0 }, + { 0xEE500EE5, 0xEE500EE5 }, { 0xED7303B6, 0 }, + { 0xEC979119, 0 }, { 0xEBBDB2A6, 0 }, + { 0xEAE56404, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE9396520, 0 }, { 0xE865AC7C, 0 }, + { 0xE79372E3, 0 }, { 0xE6C2B449, 0 }, + { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE525982B, 0 }, + { 0xE45932D8, 0 }, { 0xE38E38E4, 0 }, + { 0xE2C4A689, 0 }, { 0xE1FC780F, 0 }, + { 0xE135A9CA, 0 }, { 0xE070381C, 0xE070381C }, + { 0xDFAC1F75, 0 }, { 0xDEE95C4D, 0 }, + { 0xDE27EB2D, 
0 }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDCA8F159, 0 }, { 0xDBEB61EF, 0 }, + { 0xDB2F171E, 0 }, { 0xDA740DA8, 0 }, + { 0xD9BA4257, 0 }, { 0xD901B204, 0 }, + { 0xD84A598F, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD6DF43FD, 0 }, { 0xD62B80D7, 0 }, + { 0xD578E97D, 0 }, { 0xD4C77B04, 0 }, + { 0xD417328A, 0 }, { 0xD3680D37, 0 }, + { 0xD2BA083C, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xD161543E, 0xD161543E }, { 0xD0B69FCC, 0 }, + { 0xD00D00D0, 0xD00D00D0 }, { 0xCF6474A9, 0 }, + { 0xCEBCF8BC, 0 }, { 0xCE168A77, 0xCE168A77 }, + { 0xCD712753, 0 }, { 0xCCCCCCCD, 0 }, + { 0xCC29786D, 0 }, { 0xCB8727C1, 0 }, + { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA4587E7, 0 }, + { 0xC9A633FD, 0 }, { 0xC907DA4F, 0 }, + { 0xC86A7890, 0xC86A7890 }, { 0xC7CE0C7D, 0 }, + { 0xC73293D8, 0 }, { 0xC6980C6A, 0 }, + { 0xC5FE7403, 0xC5FE7403 }, { 0xC565C87C, 0 }, + { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4372F86, 0 }, + { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC2780614, 0 }, { 0xC1E4BBD6, 0 }, + { 0xC152500C, 0xC152500C }, { 0xC0C0C0C1, 0 }, + { 0xC0300C03, 0xC0300C03 }, { 0xBFA02FE8, 0xBFA02FE8 }, + { 0xBF112A8B, 0 }, { 0xBE82FA0C, 0 }, + { 0xBDF59C92, 0 }, { 0xBD691047, 0xBD691047 }, + { 0xBCDD535E, 0 }, { 0xBC52640C, 0 }, + { 0xBBC8408D, 0 }, { 0xBB3EE722, 0 }, + { 0xBAB65610, 0xBAB65610 }, { 0xBA2E8BA3, 0 }, + { 0xB9A7862A, 0xB9A7862A }, { 0xB92143FA, 0xB92143FA }, + { 0xB89BC36D, 0 }, { 0xB81702E1, 0 }, + { 0xB79300B8, 0 }, { 0xB70FBB5A, 0xB70FBB5A }, + { 0xB68D3134, 0xB68D3134 }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB58A4855, 0xB58A4855 }, { 0xB509E68B, 0 }, + { 0xB48A39D4, 0xB48A39D4 }, { 0xB40B40B4, 0xB40B40B4 }, + { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB30F6353, 0 }, + { 0xB2927C2A, 0 }, { 0xB21642C9, 0 }, + { 0xB19AB5C5, 0 }, { 0xB11FD3B8, 0xB11FD3B8 }, + { 0xB0A59B42, 0 }, { 0xB02C0B03, 0 }, + { 0xAFB321A1, 0xAFB321A1 }, { 0xAF3ADDC7, 0 }, + { 0xAEC33E20, 0 }, { 0xAE4C415D, 0 }, + { 0xADD5E632, 0xADD5E632 }, { 0xAD602B58, 0xAD602B58 }, + { 0xACEB0F89, 0xACEB0F89 }, { 0xAC769184, 0xAC769184 }, + { 0xAC02B00B, 0 }, { 0xAB8F69E3, 0 }, + { 0xAB1CBDD4, 0 }, { 0xAAAAAAAB, 0 }, + { 0xAA392F36, 0 }, { 0xA9C84A48, 0 }, + { 0xA957FAB5, 0xA957FAB5 }, { 0xA8E83F57, 0xA8E83F57 }, + { 0xA8791709, 0 }, { 0xA80A80A8, 0xA80A80A8 }, + { 0xA79C7B17, 0 }, { 0xA72F053A, 0 }, + { 0xA6C21DF7, 0 }, { 0xA655C439, 0xA655C439 }, + { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA57EB503, 0 }, + { 0xA513FD6C, 0 }, { 0xA4A9CF1E, 0 }, + { 0xA4402910, 0xA4402910 }, { 0xA3D70A3E, 0 }, + { 0xA36E71A3, 0 }, { 0xA3065E40, 0 }, + { 0xA29ECF16, 0xA29ECF16 }, { 0xA237C32B, 0xA237C32B }, + { 0xA1D13986, 0 }, { 0xA16B312F, 0 }, + { 0xA105A933, 0 }, { 0xA0A0A0A1, 0 }, + { 0xA03C1689, 0 }, { 0x9FD809FE, 0 }, + { 0x9F747A15, 0x9F747A15 }, { 0x9F1165E7, 0x9F1165E7 }, + { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E4CAD24, 0 }, + { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9D89D89E, 0 }, + { 0x9D2921C4, 0 }, { 0x9CC8E161, 0 }, + { 0x9C69169B, 0x9C69169B }, { 0x9C09C09C, 0x9C09C09C }, + { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B4C6F9F, 0 }, + { 0x9AEE72FD, 0 }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x9A33CD67, 0x9A33CD67 }, { 0x99D722DB, 0 }, + { 0x997AE76B, 0x997AE76B }, { 0x991F1A51, 0x991F1A51 }, + { 0x98C3BAC7, 0x98C3BAC7 }, { 0x9868C80A, 0 }, + { 0x980E4156, 0x980E4156 }, { 0x97B425ED, 0x97B425ED }, + { 0x975A7510, 0 }, { 0x97012E02, 0x97012E02 }, + { 0x96A8500A, 0 }, { 0x964FDA6C, 0x964FDA6C }, + { 0x95F7CC73, 0 }, { 0x95A02568, 0x95A02568 }, + { 0x9548E498, 0 }, { 0x94F2094F, 0x94F2094F }, + { 0x949B92DE, 0 }, { 0x94458094, 0x94458094 }, + { 0x93EFD1C5, 0x93EFD1C5 }, { 0x939A85C4, 0x939A85C4 }, + { 0x93459BE7, 0 }, 
{ 0x92F11384, 0x92F11384 }, + { 0x929CEBF5, 0 }, { 0x92492492, 0x92492492 }, + { 0x91F5BCB9, 0 }, { 0x91A2B3C5, 0 }, + { 0x91500915, 0x91500915 }, { 0x90FDBC09, 0x90FDBC09 }, + { 0x90ABCC02, 0x90ABCC02 }, { 0x905A3863, 0x905A3863 }, + { 0x90090090, 0x90090090 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8F67A1E4, 0 }, { 0x8F1779DA, 0 }, + { 0x8EC7AB3A, 0 }, { 0x8E78356D, 0x8E78356D }, + { 0x8E2917E1, 0 }, { 0x8DDA5202, 0x8DDA5202 }, + { 0x8D8BE340, 0 }, { 0x8D3DCB09, 0 }, + { 0x8CF008CF, 0x8CF008CF }, { 0x8CA29C04, 0x8CA29C04 }, + { 0x8C55841D, 0 }, { 0x8C08C08C, 0x8C08C08C }, + { 0x8BBC50C9, 0 }, { 0x8B70344A, 0x8B70344A }, + { 0x8B246A88, 0 }, { 0x8AD8F2FC, 0 }, + { 0x8A8DCD20, 0 }, { 0x8A42F870, 0x8A42F870 }, + { 0x89F8746A, 0 }, { 0x89AE408A, 0 }, + { 0x89645C4F, 0x89645C4F }, { 0x891AC73B, 0 }, + { 0x88D180CD, 0x88D180CD }, { 0x88888889, 0 }, + { 0x883FDDF0, 0x883FDDF0 }, { 0x87F78088, 0 }, + { 0x87AF6FD6, 0 }, { 0x8767AB5F, 0x8767AB5F }, + { 0x872032AC, 0x872032AC }, { 0x86D90545, 0 }, + { 0x869222B2, 0 }, { 0x864B8A7E, 0 }, + { 0x86053C34, 0x86053C34 }, { 0x85BF3761, 0x85BF3761 }, + { 0x85797B91, 0x85797B91 }, { 0x85340853, 0x85340853 }, + { 0x84EEDD36, 0 }, { 0x84A9F9C8, 0x84A9F9C8 }, + { 0x84655D9C, 0 }, { 0x84210842, 0x84210842 }, + { 0x83DCF94E, 0 }, { 0x83993052, 0x83993052 }, + { 0x8355ACE4, 0 }, { 0x83126E98, 0 }, + { 0x82CF7504, 0 }, { 0x828CBFBF, 0 }, + { 0x824A4E61, 0 }, { 0x82082082, 0x82082082 }, + { 0x81C635BC, 0x81C635BC }, { 0x81848DA9, 0 }, + { 0x814327E4, 0 }, { 0x81020408, 0x81020408 }, + { 0x80C121B3, 0 }, { 0x80808081, 0 }, + { 0x80402010, 0x80402010 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFF803FE1, 0 }, { 0xFF00FF01, 0 }, + { 0xFE823CA6, 0 }, { 0xFE03F810, 0 }, + { 0xFD863087, 0 }, { 0xFD08E551, 0 }, + { 0xFC8C15B5, 0 }, { 0xFC0FC0FD, 0 }, + { 0xFB93E673, 0 }, { 0xFB188566, 0 }, + { 0xFA9D9D20, 0 }, { 0xFA232CF3, 0 }, + { 0xF9A9342D, 0 }, { 0xF92FB222, 0 }, + { 0xF8B6A622, 0xF8B6A622 }, { 0xF83E0F84, 0 }, + { 0xF7C5ED9D, 0 }, { 0xF74E3FC3, 0 }, + { 0xF6D7054E, 0 }, { 0xF6603D99, 0 }, + { 0xF5E9E7FD, 0 }, { 0xF57403D6, 0 }, + { 0xF4FE9083, 0 }, { 0xF4898D60, 0 }, + { 0xF414F9CE, 0 }, { 0xF3A0D52D, 0 }, + { 0xF32D1EE0, 0 }, { 0xF2B9D649, 0 }, + { 0xF246FACC, 0 }, { 0xF1D48BCF, 0 }, + { 0xF16288B9, 0 }, { 0xF0F0F0F1, 0 }, + { 0xF07FC3E0, 0xF07FC3E0 }, { 0xF00F00F0, 0xF00F00F0 }, + { 0xEF9EA78C, 0 }, { 0xEF2EB720, 0 }, + { 0xEEBF2F19, 0 }, { 0xEE500EE5, 0xEE500EE5 }, + { 0xEDE155F4, 0 }, { 0xED7303B6, 0 }, + { 0xED05179C, 0xED05179C }, { 0xEC979119, 0 }, + { 0xEC2A6FA0, 0xEC2A6FA0 }, { 0xEBBDB2A6, 0 }, + { 0xEB5159A0, 0 }, { 0xEAE56404, 0 }, + { 0xEA79D14A, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE9A3D25E, 0xE9A3D25E }, { 0xE9396520, 0 }, + { 0xE8CF58AB, 0 }, { 0xE865AC7C, 0 }, + { 0xE7FC600F, 0 }, { 0xE79372E3, 0 }, + { 0xE72AE476, 0 }, { 0xE6C2B449, 0 }, + { 0xE65AE1DC, 0 }, { 0xE5F36CB0, 0xE5F36CB0 }, + { 0xE58C544A, 0 }, { 0xE525982B, 0 }, + { 0xE4BF37D9, 0 }, { 0xE45932D8, 0 }, + { 0xE3F388AF, 0 }, { 0xE38E38E4, 0 }, + { 0xE32942FF, 0 }, { 0xE2C4A689, 0 }, + { 0xE260630B, 0 }, { 0xE1FC780F, 0 }, + { 0xE198E520, 0 }, { 0xE135A9CA, 0 }, + { 0xE0D2C59A, 0 }, { 0xE070381C, 0xE070381C }, + { 0xE00E00E0, 0xE00E00E0 }, { 0xDFAC1F75, 0 }, + { 0xDF4A9369, 0 }, { 0xDEE95C4D, 0 }, + { 0xDE8879B3, 0 }, { 0xDE27EB2D, 0 }, + { 0xDDC7B04D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDD0833CE, 0 }, { 0xDCA8F159, 0 }, + { 0xDC4A00DD, 0 }, { 0xDBEB61EF, 0 }, + { 0xDB8D1428, 0 }, { 0xDB2F171E, 0 }, + { 0xDAD16A6B, 0 }, { 0xDA740DA8, 0 }, + { 0xDA17006D, 0xDA17006D }, { 0xD9BA4257, 0 }, + { 
0xD95DD300, 0 }, { 0xD901B204, 0 }, + { 0xD8A5DEFF, 0 }, { 0xD84A598F, 0 }, + { 0xD7EF2152, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD73996E9, 0 }, { 0xD6DF43FD, 0 }, + { 0xD6853CC1, 0 }, { 0xD62B80D7, 0 }, + { 0xD5D20FDF, 0 }, { 0xD578E97D, 0 }, + { 0xD5200D52, 0xD5200D52 }, { 0xD4C77B04, 0 }, + { 0xD46F3235, 0 }, { 0xD417328A, 0 }, + { 0xD3BF7BA9, 0 }, { 0xD3680D37, 0 }, + { 0xD310E6DB, 0 }, { 0xD2BA083C, 0 }, + { 0xD2637101, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xD1B71759, 0 }, { 0xD161543E, 0xD161543E }, + { 0xD10BD72C, 0 }, { 0xD0B69FCC, 0 }, + { 0xD061ADCA, 0 }, { 0xD00D00D0, 0xD00D00D0 }, + { 0xCFB8988C, 0 }, { 0xCF6474A9, 0 }, + { 0xCF1094D4, 0 }, { 0xCEBCF8BC, 0 }, + { 0xCE69A00D, 0 }, { 0xCE168A77, 0xCE168A77 }, + { 0xCDC3B7A9, 0xCDC3B7A9 }, { 0xCD712753, 0 }, + { 0xCD1ED924, 0 }, { 0xCCCCCCCD, 0 }, + { 0xCC7B0200, 0 }, { 0xCC29786D, 0 }, + { 0xCBD82FC7, 0 }, { 0xCB8727C1, 0 }, + { 0xCB36600D, 0 }, { 0xCAE5D85F, 0xCAE5D85F }, + { 0xCA95906C, 0 }, { 0xCA4587E7, 0 }, + { 0xC9F5BE86, 0 }, { 0xC9A633FD, 0 }, + { 0xC956E803, 0xC956E803 }, { 0xC907DA4F, 0 }, + { 0xC8B90A96, 0 }, { 0xC86A7890, 0xC86A7890 }, + { 0xC81C23F5, 0xC81C23F5 }, { 0xC7CE0C7D, 0 }, + { 0xC78031E0, 0xC78031E0 }, { 0xC73293D8, 0 }, + { 0xC6E5321D, 0 }, { 0xC6980C6A, 0 }, + { 0xC64B2278, 0xC64B2278 }, { 0xC5FE7403, 0xC5FE7403 }, + { 0xC5B200C6, 0 }, { 0xC565C87C, 0 }, + { 0xC519CAE0, 0xC519CAE0 }, { 0xC4CE07B0, 0xC4CE07B0 }, + { 0xC4827EA8, 0xC4827EA8 }, { 0xC4372F86, 0 }, + { 0xC3EC1A06, 0 }, { 0xC3A13DE6, 0xC3A13DE6 }, + { 0xC3569AE6, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC2C1FF3E, 0 }, { 0xC2780614, 0 }, + { 0xC22E4507, 0 }, { 0xC1E4BBD6, 0 }, + { 0xC19B6A42, 0 }, { 0xC152500C, 0xC152500C }, + { 0xC1096CF6, 0 }, { 0xC0C0C0C1, 0 }, + { 0xC0784B2F, 0 }, { 0xC0300C03, 0xC0300C03 }, + { 0xBFE80300, 0 }, { 0xBFA02FE8, 0xBFA02FE8 }, + { 0xBF589280, 0 }, { 0xBF112A8B, 0 }, + { 0xBEC9F7CE, 0 }, { 0xBE82FA0C, 0 }, + { 0xBE3C310C, 0 }, { 0xBDF59C92, 0 }, + { 0xBDAF3C64, 0 }, { 0xBD691047, 0xBD691047 }, + { 0xBD231803, 0 }, { 0xBCDD535E, 0 }, + { 0xBC97C21E, 0xBC97C21E }, { 0xBC52640C, 0 }, + { 0xBC0D38EE, 0xBC0D38EE }, { 0xBBC8408D, 0 }, + { 0xBB837AB1, 0 }, { 0xBB3EE722, 0 }, + { 0xBAFA85A9, 0xBAFA85A9 }, { 0xBAB65610, 0xBAB65610 }, + { 0xBA725820, 0xBA725820 }, { 0xBA2E8BA3, 0 }, + { 0xB9EAF063, 0 }, { 0xB9A7862A, 0xB9A7862A }, + { 0xB9644CC4, 0 }, { 0xB92143FA, 0xB92143FA }, + { 0xB8DE6B9A, 0 }, { 0xB89BC36D, 0 }, + { 0xB8594B41, 0 }, { 0xB81702E1, 0 }, + { 0xB7D4EA19, 0xB7D4EA19 }, { 0xB79300B8, 0 }, + { 0xB7514689, 0 }, { 0xB70FBB5A, 0xB70FBB5A }, + { 0xB6CE5EF9, 0xB6CE5EF9 }, { 0xB68D3134, 0xB68D3134 }, + { 0xB64C31D9, 0 }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB5CABD9B, 0 }, { 0xB58A4855, 0xB58A4855 }, + { 0xB54A00B5, 0xB54A00B5 }, { 0xB509E68B, 0 }, + { 0xB4C9F9A5, 0 }, { 0xB48A39D4, 0xB48A39D4 }, + { 0xB44AA6E9, 0xB44AA6E9 }, { 0xB40B40B4, 0xB40B40B4 }, + { 0xB3CC0706, 0 }, { 0xB38CF9B0, 0xB38CF9B0 }, + { 0xB34E1884, 0 }, { 0xB30F6353, 0 }, + { 0xB2D0D9EF, 0 }, { 0xB2927C2A, 0 }, + { 0xB25449D7, 0 }, { 0xB21642C9, 0 }, + { 0xB1D866D1, 0xB1D866D1 }, { 0xB19AB5C5, 0 }, + { 0xB15D2F76, 0 }, { 0xB11FD3B8, 0xB11FD3B8 }, + { 0xB0E2A260, 0xB0E2A260 }, { 0xB0A59B42, 0 }, + { 0xB068BE31, 0 }, { 0xB02C0B03, 0 }, + { 0xAFEF818C, 0 }, { 0xAFB321A1, 0xAFB321A1 }, + { 0xAF76EB19, 0 }, { 0xAF3ADDC7, 0 }, + { 0xAEFEF982, 0 }, { 0xAEC33E20, 0 }, + { 0xAE87AB76, 0xAE87AB76 }, { 0xAE4C415D, 0 }, + { 0xAE10FFA9, 0 }, { 0xADD5E632, 0xADD5E632 }, + { 0xAD9AF4D0, 0 }, { 0xAD602B58, 0xAD602B58 }, + { 0xAD2589A4, 0 }, { 0xACEB0F89, 0xACEB0F89 
}, + { 0xACB0BCE1, 0xACB0BCE1 }, { 0xAC769184, 0xAC769184 }, + { 0xAC3C8D4A, 0 }, { 0xAC02B00B, 0 }, + { 0xABC8F9A0, 0xABC8F9A0 }, { 0xAB8F69E3, 0 }, + { 0xAB5600AC, 0 }, { 0xAB1CBDD4, 0 }, + { 0xAAE3A136, 0 }, { 0xAAAAAAAB, 0 }, + { 0xAA71DA0D, 0 }, { 0xAA392F36, 0 }, + { 0xAA00AA01, 0 }, { 0xA9C84A48, 0 }, + { 0xA9900FE6, 0 }, { 0xA957FAB5, 0xA957FAB5 }, + { 0xA9200A92, 0xA9200A92 }, { 0xA8E83F57, 0xA8E83F57 }, + { 0xA8B098E0, 0xA8B098E0 }, { 0xA8791709, 0 }, + { 0xA841B9AD, 0 }, { 0xA80A80A8, 0xA80A80A8 }, + { 0xA7D36BD8, 0 }, { 0xA79C7B17, 0 }, + { 0xA765AE44, 0 }, { 0xA72F053A, 0 }, + { 0xA6F87FD6, 0xA6F87FD6 }, { 0xA6C21DF7, 0 }, + { 0xA68BDF79, 0 }, { 0xA655C439, 0xA655C439 }, + { 0xA61FCC16, 0xA61FCC16 }, { 0xA5E9F6ED, 0xA5E9F6ED }, + { 0xA5B4449D, 0 }, { 0xA57EB503, 0 }, + { 0xA54947FE, 0 }, { 0xA513FD6C, 0 }, + { 0xA4DED52C, 0xA4DED52C }, { 0xA4A9CF1E, 0 }, + { 0xA474EB1F, 0xA474EB1F }, { 0xA4402910, 0xA4402910 }, + { 0xA40B88D0, 0 }, { 0xA3D70A3E, 0 }, + { 0xA3A2AD39, 0xA3A2AD39 }, { 0xA36E71A3, 0 }, + { 0xA33A575A, 0xA33A575A }, { 0xA3065E40, 0 }, + { 0xA2D28634, 0 }, { 0xA29ECF16, 0xA29ECF16 }, + { 0xA26B38C9, 0 }, { 0xA237C32B, 0xA237C32B }, + { 0xA2046E1F, 0xA2046E1F }, { 0xA1D13986, 0 }, + { 0xA19E2540, 0 }, { 0xA16B312F, 0 }, + { 0xA1385D35, 0 }, { 0xA105A933, 0 }, + { 0xA0D3150C, 0 }, { 0xA0A0A0A1, 0 }, + { 0xA06E4BD4, 0xA06E4BD4 }, { 0xA03C1689, 0 }, + { 0xA00A00A0, 0xA00A00A0 }, { 0x9FD809FE, 0 }, + { 0x9FA63284, 0 }, { 0x9F747A15, 0x9F747A15 }, + { 0x9F42E095, 0x9F42E095 }, { 0x9F1165E7, 0x9F1165E7 }, + { 0x9EE009EE, 0x9EE009EE }, { 0x9EAECC8D, 0x9EAECC8D }, + { 0x9E7DADA9, 0 }, { 0x9E4CAD24, 0 }, + { 0x9E1BCAE3, 0 }, { 0x9DEB06C9, 0x9DEB06C9 }, + { 0x9DBA60BB, 0x9DBA60BB }, { 0x9D89D89E, 0 }, + { 0x9D596E54, 0x9D596E54 }, { 0x9D2921C4, 0 }, + { 0x9CF8F2D1, 0x9CF8F2D1 }, { 0x9CC8E161, 0 }, + { 0x9C98ED58, 0 }, { 0x9C69169B, 0x9C69169B }, + { 0x9C395D10, 0x9C395D10 }, { 0x9C09C09C, 0x9C09C09C }, + { 0x9BDA4124, 0x9BDA4124 }, { 0x9BAADE8E, 0x9BAADE8E }, + { 0x9B7B98C0, 0 }, { 0x9B4C6F9F, 0 }, + { 0x9B1D6311, 0x9B1D6311 }, { 0x9AEE72FD, 0 }, + { 0x9ABF9F48, 0x9ABF9F48 }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x9A624C97, 0 }, { 0x9A33CD67, 0x9A33CD67 }, + { 0x9A056A31, 0 }, { 0x99D722DB, 0 }, + { 0x99A8F74C, 0 }, { 0x997AE76B, 0x997AE76B }, + { 0x994CF320, 0x994CF320 }, { 0x991F1A51, 0x991F1A51 }, + { 0x98F15CE7, 0 }, { 0x98C3BAC7, 0x98C3BAC7 }, + { 0x989633DB, 0x989633DB }, { 0x9868C80A, 0 }, + { 0x983B773B, 0 }, { 0x980E4156, 0x980E4156 }, + { 0x97E12644, 0x97E12644 }, { 0x97B425ED, 0x97B425ED }, + { 0x97874039, 0 }, { 0x975A7510, 0 }, + { 0x972DC45B, 0 }, { 0x97012E02, 0x97012E02 }, + { 0x96D4B1EF, 0 }, { 0x96A8500A, 0 }, + { 0x967C083B, 0 }, { 0x964FDA6C, 0x964FDA6C }, + { 0x9623C686, 0x9623C686 }, { 0x95F7CC73, 0 }, + { 0x95CBEC1B, 0 }, { 0x95A02568, 0x95A02568 }, + { 0x95747844, 0 }, { 0x9548E498, 0 }, + { 0x951D6A4E, 0 }, { 0x94F2094F, 0x94F2094F }, + { 0x94C6C187, 0 }, { 0x949B92DE, 0 }, + { 0x94707D3F, 0 }, { 0x94458094, 0x94458094 }, + { 0x941A9CC8, 0x941A9CC8 }, { 0x93EFD1C5, 0x93EFD1C5 }, + { 0x93C51F76, 0 }, { 0x939A85C4, 0x939A85C4 }, + { 0x9370049C, 0 }, { 0x93459BE7, 0 }, + { 0x931B4B91, 0 }, { 0x92F11384, 0x92F11384 }, + { 0x92C6F3AC, 0x92C6F3AC }, { 0x929CEBF5, 0 }, + { 0x9272FC48, 0x9272FC48 }, { 0x92492492, 0x92492492 }, + { 0x921F64BF, 0 }, { 0x91F5BCB9, 0 }, + { 0x91CC2C6C, 0x91CC2C6C }, { 0x91A2B3C5, 0 }, + { 0x917952AF, 0 }, { 0x91500915, 0x91500915 }, + { 0x9126D6E5, 0 }, { 0x90FDBC09, 0x90FDBC09 }, + { 0x90D4B86F, 0 }, { 0x90ABCC02, 0x90ABCC02 }, + { 
0x9082F6B0, 0 }, { 0x905A3863, 0x905A3863 }, + { 0x9031910A, 0 }, { 0x90090090, 0x90090090 }, + { 0x8FE086E3, 0 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8F8FD7A0, 0 }, { 0x8F67A1E4, 0 }, + { 0x8F3F82A8, 0x8F3F82A8 }, { 0x8F1779DA, 0 }, + { 0x8EEF8766, 0 }, { 0x8EC7AB3A, 0 }, + { 0x8E9FE542, 0x8E9FE542 }, { 0x8E78356D, 0x8E78356D }, + { 0x8E509BA8, 0x8E509BA8 }, { 0x8E2917E1, 0 }, + { 0x8E01AA05, 0 }, { 0x8DDA5202, 0x8DDA5202 }, + { 0x8DB30FC6, 0x8DB30FC6 }, { 0x8D8BE340, 0 }, + { 0x8D64CC5C, 0 }, { 0x8D3DCB09, 0 }, + { 0x8D16DF35, 0x8D16DF35 }, { 0x8CF008CF, 0x8CF008CF }, + { 0x8CC947C5, 0 }, { 0x8CA29C04, 0x8CA29C04 }, + { 0x8C7C057D, 0 }, { 0x8C55841D, 0 }, + { 0x8C2F17D2, 0x8C2F17D2 }, { 0x8C08C08C, 0x8C08C08C }, + { 0x8BE27E39, 0x8BE27E39 }, { 0x8BBC50C9, 0 }, + { 0x8B963829, 0x8B963829 }, { 0x8B70344A, 0x8B70344A }, + { 0x8B4A451A, 0 }, { 0x8B246A88, 0 }, + { 0x8AFEA483, 0x8AFEA483 }, { 0x8AD8F2FC, 0 }, + { 0x8AB355E0, 0x8AB355E0 }, { 0x8A8DCD20, 0 }, + { 0x8A6858AB, 0 }, { 0x8A42F870, 0x8A42F870 }, + { 0x8A1DAC60, 0x8A1DAC60 }, { 0x89F8746A, 0 }, + { 0x89D3507D, 0 }, { 0x89AE408A, 0 }, + { 0x89894480, 0 }, { 0x89645C4F, 0x89645C4F }, + { 0x893F87E8, 0x893F87E8 }, { 0x891AC73B, 0 }, + { 0x88F61A37, 0x88F61A37 }, { 0x88D180CD, 0x88D180CD }, + { 0x88ACFAEE, 0 }, { 0x88888889, 0 }, + { 0x8864298F, 0 }, { 0x883FDDF0, 0x883FDDF0 }, + { 0x881BA59E, 0 }, { 0x87F78088, 0 }, + { 0x87D36EA0, 0 }, { 0x87AF6FD6, 0 }, + { 0x878B841B, 0 }, { 0x8767AB5F, 0x8767AB5F }, + { 0x8743E595, 0 }, { 0x872032AC, 0x872032AC }, + { 0x86FC9296, 0x86FC9296 }, { 0x86D90545, 0 }, + { 0x86B58AA8, 0 }, { 0x869222B2, 0 }, + { 0x866ECD53, 0x866ECD53 }, { 0x864B8A7E, 0 }, + { 0x86285A23, 0x86285A23 }, { 0x86053C34, 0x86053C34 }, + { 0x85E230A3, 0x85E230A3 }, { 0x85BF3761, 0x85BF3761 }, + { 0x859C5060, 0x859C5060 }, { 0x85797B91, 0x85797B91 }, + { 0x8556B8E7, 0x8556B8E7 }, { 0x85340853, 0x85340853 }, + { 0x851169C7, 0x851169C7 }, { 0x84EEDD36, 0 }, + { 0x84CC6290, 0 }, { 0x84A9F9C8, 0x84A9F9C8 }, + { 0x8487A2D1, 0 }, { 0x84655D9C, 0 }, + { 0x84432A1B, 0x84432A1B }, { 0x84210842, 0x84210842 }, + { 0x83FEF802, 0x83FEF802 }, { 0x83DCF94E, 0 }, + { 0x83BB0C18, 0 }, { 0x83993052, 0x83993052 }, + { 0x837765F0, 0x837765F0 }, { 0x8355ACE4, 0 }, + { 0x83340520, 0x83340520 }, { 0x83126E98, 0 }, + { 0x82F0E93D, 0x82F0E93D }, { 0x82CF7504, 0 }, + { 0x82AE11DE, 0 }, { 0x828CBFBF, 0 }, + { 0x826B7E99, 0x826B7E99 }, { 0x824A4E61, 0 }, + { 0x82292F08, 0 }, { 0x82082082, 0x82082082 }, + { 0x81E722C2, 0x81E722C2 }, { 0x81C635BC, 0x81C635BC }, + { 0x81A55963, 0 }, { 0x81848DA9, 0 }, + { 0x8163D283, 0 }, { 0x814327E4, 0 }, + { 0x81228DBF, 0 }, { 0x81020408, 0x81020408 }, + { 0x80E18AB3, 0 }, { 0x80C121B3, 0 }, + { 0x80A0C8FB, 0x80A0C8FB }, { 0x80808081, 0 }, + { 0x80604836, 0x80604836 }, { 0x80402010, 0x80402010 }, + { 0x80200802, 0x80200802 }, { 0xFFFFFFFF, 0xFFFFFFFF } +}; diff --git a/third_party/aom/aom_dsp/odintrin.h b/third_party/aom/aom_dsp/odintrin.h new file mode 100644 index 0000000000..9e4ba5029a --- /dev/null +++ b/third_party/aom/aom_dsp/odintrin.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* clang-format off */ + +#ifndef AOM_AOM_DSP_ODINTRIN_H_ +#define AOM_AOM_DSP_ODINTRIN_H_ + +#include +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/bitops.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int od_coeff; + +#define OD_DIVU_DMAX (1024) + +extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2]; + +/*Translate unsigned division by small divisors into multiplications.*/ +#define OD_DIVU_SMALL(_x, _d) \ + ((uint32_t)((OD_DIVU_SMALL_CONSTS[(_d)-1][0] * (uint64_t)(_x) + \ + OD_DIVU_SMALL_CONSTS[(_d)-1][1]) >> \ + 32) >> \ + (OD_ILOG_NZ(_d) - 1)) + +#define OD_DIVU(_x, _d) \ + (((_d) < OD_DIVU_DMAX) ? (OD_DIVU_SMALL((_x), (_d))) : ((_x) / (_d))) + +#define OD_MINI AOMMIN +#define OD_MAXI AOMMAX +#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max))) + +/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer. + OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/ +#define OD_ILOG_NZ(x) (1 + get_msb(x)) + +/*Enable special features for gcc and compatible compilers.*/ +#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__) +#define OD_GNUC_PREREQ(maj, min, pat) \ + ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ >= \ + ((maj) << 16) + ((min) << 8) + pat) // NOLINT +#else +#define OD_GNUC_PREREQ(maj, min, pat) (0) +#endif + +#if OD_GNUC_PREREQ(3, 4, 0) +#define OD_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) +#else +#define OD_WARN_UNUSED_RESULT +#endif + +#if OD_GNUC_PREREQ(3, 4, 0) +#define OD_ARG_NONNULL(x) __attribute__((__nonnull__(x))) +#else +#define OD_ARG_NONNULL(x) +#endif + +/*All of these macros should expect floats as arguments.*/ +# define OD_SIGNMASK(a) (-((a) < 0)) +# define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b)) + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_ODINTRIN_H_ diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h new file mode 100644 index 0000000000..5711a40a40 --- /dev/null +++ b/third_party/aom/aom_dsp/prob.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_PROB_H_ +#define AOM_AOM_DSP_PROB_H_ + +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/entcode.h" +#include "aom_ports/bitops.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint16_t aom_cdf_prob; + +#define CDF_SIZE(x) ((x) + 1) +#define CDF_PROB_BITS 15 +#define CDF_PROB_TOP (1 << CDF_PROB_BITS) +/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative + probability (an "inverse" CDF). 
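
As a quick sanity check on the OD_DIVU_SMALL() mechanism defined above, the standalone snippet below reproduces the multiply-and-shift by hand for the divisor 3, whose table entry { 0xAAAAAAAB, 0 } appears in OD_DIVU_SMALL_CONSTS in odintrin.c. The helper name divu3 and the sampled test values are illustrative only.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* x / 3 via the multiply-add scheme: ((m * x + add) >> 32) >> (OD_ILOG_NZ(3) - 1),
     * with m = 0xAAAAAAAB and add = 0 taken from the d == 3 row of the table. */
    static uint32_t divu3(uint32_t x) {
      const uint64_t m = 0xAAAAAAABu;
      const uint64_t add = 0;
      return (uint32_t)(((m * (uint64_t)x + add) >> 32) >> 1);
    }

    int main(void) {
      const uint32_t samples[] = { 0u, 1u, 2u, 3u, 1000000u, 0xFFFFFFFFu };
      for (int i = 0; i < (int)(sizeof(samples) / sizeof(samples[0])); ++i) {
        assert(divu3(samples[i]) == samples[i] / 3u);
      }
      printf("multiply-shift division by 3 matches x / 3\n");
      return 0;
    }
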
+ This function converts from one representation to the other (and is its own + inverse).*/ +#define AOM_ICDF(x) (CDF_PROB_TOP - (x)) + +#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF4(a0, a1, a2) \ + AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF5(a0, a1, a2, a3) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF6(a0, a1, a2, a3, a4) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ + a14) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \ + AOM_ICDF(CDF_PROB_TOP), 0 + +static INLINE uint8_t get_prob(unsigned int num, unsigned int den) { + assert(den != 0); + { + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 
1 : p; + const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); + return (uint8_t)clipped_prob; + } +} + +static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) { + assert(nsymbs < 17); + const int count = cdf[nsymbs]; + + // rate is computed in the spec as: + // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2) + // In this case cdf[N] is |count|. + // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all + // nsymbs > 3. So the equation becomes: + // 4 + (count > 15) + (count > 31) + (nsymbs > 3). + // Note that the largest value for count is 32 (it is not incremented beyond + // 32). So using that information: + // count >> 4 is 0 for count from 0 to 15. + // count >> 4 is 1 for count from 16 to 31. + // count >> 4 is 2 for count == 31. + // Now, the equation becomes: + // 4 + (count >> 4) + (nsymbs > 3). + const int rate = 4 + (count >> 4) + (nsymbs > 3); + + int i = 0; + do { + if (i < val) { + cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate; + } else { + cdf[i] -= cdf[i] >> rate; + } + } while (++i < nsymbs - 1); + cdf[nsymbs] += (count < 32); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_PROB_H_ diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c new file mode 100644 index 0000000000..cf0de29945 --- /dev/null +++ b/third_party/aom/aom_dsp/psnr.c @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/psnr.h" +#include "aom_scale/yv12config.h" + +double aom_sse_to_psnr(double samples, double peak, double sse) { + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > MAX_PSNR ? 
MAX_PSNR : psnr; + } else { + return MAX_PSNR; + } +} + +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { + int i, j; + int64_t sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t sse = 0; + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + const int diff = a[j] - b[j]; + sse += diff * diff; + } + a += a_stride; + b += b_stride; + } + return sse; +} + +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + const int dw = width % 16; + const int dh = height % 16; + int64_t total_sse = 0; + int x, y; + + if (dw > 0) { + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); + } + + if (dh > 0) { + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); + } + + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + total_sse += aom_sse(pa, a_stride, pb, b_stride, 16, 16); + + pa += 16; + pb += 16; + } + + a += 16 * a_stride; + b += 16 * b_stride; + } + + return total_sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height, unsigned int input_shift) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t total_sse = 0; + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + int64_t diff; + diff = (a[x] >> input_shift) - (b[x] >> input_shift); + total_sse += diff * diff; + } + a += a_stride; + b += b_stride; + } + return total_sse; +} + +static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int64_t total_sse = 0; + int x, y; + const int dw = width % 16; + const int dh = height % 16; + + if (dw > 0) { + total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); + } + if (dh > 0) { + total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); + } + + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + total_sse += aom_highbd_sse(pa, a_stride, pb, b_stride, 16, 16); + pa += 16; + pb += 16; + } + a += 16 * a_stride; + b += 16 * b_stride; + } + return total_sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height) { + return aom_var_2d_u8(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + width, height) / + (width * height); +} + +uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height) { + return aom_var_2d_u8(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int 
height) { + return aom_var_2d_u8(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, + width, height); +} + +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + + return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +#if CONFIG_AV1_HIGHBITDEPTH +uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height) { + return aom_var_2d_u16(a->y_buffer + vstart * a->y_stride + hstart, + a->y_stride, width, height) / + (width * height); +} + +uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height) { + return aom_var_2d_u16(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height) { + return aom_var_2d_u16(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse( + a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height); +} + +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return 
highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + switch (plane) { + case 0: return aom_highbd_get_y_sse(a, b); + case 1: return aom_highbd_get_u_sse(a, b); + case 2: return aom_highbd_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } + } else { + switch (plane) { + case 0: return aom_get_y_sse(a, b); + case 1: return aom_get_u_sse(a, b); + case 2: return aom_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } + } +#else + (void)highbd; + switch (plane) { + case 0: return aom_get_y_sse(a, b); + case 1: return aom_get_u_sse(a, b); + case 2: return aom_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } +#endif +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + uint32_t bit_depth, uint32_t in_bit_depth) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; +#if CONFIG_LIBVMAF_PSNR_PEAK + double peak = (double)(255 << (in_bit_depth - 8)); +#else + double peak = (double)((1 << in_bit_depth) - 1); +#endif // CONFIG_LIBVMAF_PSNR_PEAK + const unsigned int input_shift = bit_depth - in_bit_depth; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + uint64_t sse; + if (a->flags & YV12_FLAG_HIGHBITDEPTH) { + if (input_shift) { + sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i], + b_strides[i], w, h, input_shift); + } else { + sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i], + b_strides[i], w, h); 
+ } + } else { + sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, + h); + } + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); + + // Compute PSNR based on stream bit depth + if ((a->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) { +#if CONFIG_LIBVMAF_PSNR_PEAK + peak = (double)(255 << (bit_depth - 8)); +#else + peak = (double)((1 << bit_depth) - 1); +#endif // CONFIG_LIBVMAF_PSNR_PEAK + total_sse = 0; + total_samples = 0; + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + uint64_t sse; + sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i], + b_strides[i], w, h); + psnr->sse_hbd[1 + i] = sse; + psnr->samples_hbd[1 + i] = samples; + psnr->psnr_hbd[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + total_sse += sse; + total_samples += samples; + } + + psnr->sse_hbd[0] = total_sse; + psnr->samples_hbd[0] = total_samples; + psnr->psnr_hbd[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); + } +} +#endif + +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + static const double peak = 255.0; + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + const uint64_t sse = + get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h new file mode 100644 index 0000000000..afe6e08856 --- /dev/null +++ b/third_party/aom/aom_dsp/psnr.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
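
Returning to psnr.c above: aom_sse_to_psnr() is the textbook PSNR formula with a 100 dB cap, and a tiny standalone check makes the numbers concrete. The helper below mirrors that formula; the 8-bit 16x16 block and the SSE value of 256 are made up for illustration, not taken from the library.

    #include <math.h>
    #include <stdio.h>

    /* PSNR = 10 * log10(samples * peak^2 / sse), capped at 100 dB (MAX_PSNR). */
    static double sse_to_psnr(double samples, double peak, double sse) {
      if (sse <= 0.0) return 100.0;
      const double psnr = 10.0 * log10(samples * peak * peak / sse);
      return psnr > 100.0 ? 100.0 : psnr;
    }

    int main(void) {
      /* An 8-bit 16x16 block with mean squared error 1.0 (sse == 256):
       * 10 * log10(255 * 255) is roughly 48.13 dB. */
      printf("%.2f dB\n", sse_to_psnr(256.0, 255.0, 256.0));
      return 0;
    }
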
+ */ + +#ifndef AOM_AOM_DSP_PSNR_H_ +#define AOM_AOM_DSP_PSNR_H_ + +#include "aom_scale/yv12config.h" + +#define MAX_PSNR 100.0 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double psnr[4]; // total/y/u/v + uint64_t sse[4]; // total/y/u/v + uint32_t samples[4]; // total/y/u/v + double psnr_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth + uint64_t sse_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth + uint32_t samples_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth +} PSNR_STATS; + +/*!\brief Converts SSE to PSNR + * + * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR). + * + * \param[in] samples Number of samples + * \param[in] peak Max sample value + * \param[in] sse Sum of squared errors + */ +double aom_sse_to_psnr(double samples, double peak, double sse); +uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height); +uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height); +uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height); +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd); +#if CONFIG_AV1_HIGHBITDEPTH +uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height); +uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height); +uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + unsigned int bit_depth, unsigned int in_bit_depth); +#endif +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr); + +double aom_psnrhvs(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *phvs_y, + double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AOM_DSP_PSNR_H_ diff --git a/third_party/aom/aom_dsp/psnrhvs.c 
b/third_party/aom/aom_dsp/psnrhvs.c new file mode 100644 index 0000000000..966ba007ed --- /dev/null +++ b/third_party/aom/aom_dsp/psnrhvs.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Gregory Maxwell, at the Daala + * project. + */ + +#include +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/psnr.h" +#include "aom_dsp/ssim.h" + +static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_highbd_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +/* Normalized inverse quantization matrix for 8x8 DCT at the point of + * transparency. This is not the JPEG based matrix from the paper, + this one gives a slightly higher MOS agreement.*/ +static const double csf_y[8][8] = { + { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, + 0.678296995242, 0.466224900598, 0.3265091542 }, + { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, + 0.868920337363, 0.61280991668, 0.436405793551 }, + { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, + 0.670882927016, 0.501731932449, 0.372504254596 }, + { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, + 0.48309405692, 0.380429446972, 0.295774038565 }, + { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, + 0.352889268808, 0.283006984131, 0.226951348204 }, + { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, + 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, + { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, + 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 }, + { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, + 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } +}; +static const double csf_cb420[8][8] = { + { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, + 0.898018824055, 0.74725392039, 0.615105596242 }, + { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, + 1.17428548929, 0.996404342439, 0.830890433625 }, + { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, + 0.960060382087, 0.849823426169, 0.731221236837 }, + { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, + 0.751437590932, 0.685398513368, 0.608694761374 }, + { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, + 0.605503172737, 0.55002013668, 0.495804539034 }, + { 
0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, + 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, + { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, + 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, + { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, + 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } +}; +static const double csf_cr420[8][8] = { + { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, + 0.867069376285, 0.721500455585, 0.593906509971 }, + { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, + 1.13381474809, 0.962064122248, 0.802254508198 }, + { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, + 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, + { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, + 0.725539939514, 0.661776842059, 0.587716619023 }, + { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, + 0.584635025748, 0.531064164893, 0.478717061273 }, + { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, + 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 }, + { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, + 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, + { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, + 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } +}; + +static double convert_score_db(double _score, double _weight, int16_t pix_max) { + assert(_score * _weight >= 0.0); + + if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR; + return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); +} + +static double calc_psnrhvs(const unsigned char *src, int _systride, + const unsigned char *dst, int _dystride, double _par, + int _w, int _h, int _step, const double _csf[8][8], + uint32_t _shift, int buf_is_hbd, int16_t pix_max, + int luma) { + double ret; + const uint8_t *_src8 = src; + const uint8_t *_dst8 = dst; + const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); + DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); + DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); + double mask[8][8]; + int pixels; + int x; + int y; + float sum1; + float sum2; + float delt; + (void)_par; + ret = pixels = 0; + sum1 = sum2 = delt = 0.0f; + for (y = 0; y < _h; y++) { + for (x = 0; x < _w; x++) { + if (!buf_is_hbd) { + sum1 += _src8[y * _systride + x]; + sum2 += _dst8[y * _dystride + x]; + } else { + sum1 += _src16[y * _systride + x] >> _shift; + sum2 += _dst16[y * _dystride + x] >> _shift; + } + } + } + if (luma) delt = (sum1 - sum2) / (_w * _h); + /*In the PSNR-HVS-M paper[1] the authors describe the construction of + their masking table as "we have used the quantization table for the + color component Y of JPEG [6] that has been also obtained on the + basis of CSF. Note that the values in quantization table JPEG have + been normalized and then squared." Their CSF matrix (from PSNR-HVS) + was also constructed from the JPEG matrices. I can not find any obvious + scheme of normalizing to produce their table, but if I multiply their + CSF by 0.3885746225901003 and square the result I get their masking table. + I have no idea where this constant comes from, but deviating from it + too greatly hurts MOS agreement. 
+ + [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli, + Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking + of DCT basis functions", CD-ROM Proceedings of the Third + International Workshop on Video Processing and Quality Metrics for Consumer + Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p. + + Suggested in aomedia issue#2363: + 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509) + of the old JPEG based matrix from the paper. Since you are not using that, + divide by actual maximum coefficient. */ + for (x = 0; x < 8; x++) + for (y = 0; y < 8; y++) + mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]); + for (y = 0; y < _h - 7; y += _step) { + for (x = 0; x < _w - 7; x += _step) { + int i; + int j; + int n = 0; + double s_gx = 0; + double s_gy = 0; + double g = 0; + double s_gmean = 0; + double s_gvar = 0; + double s_mask = 0; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + if (!buf_is_hbd) { + dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)]; + dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)]; + } else { + dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift; + dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift; + } + dct_d[i * 8 + j] += (int)(delt + 0.5f); + } + } + for (i = 1; i < 7; i++) { + for (j = 1; j < 7; j++) { + s_gx = (dct_s[(i - 1) * 8 + j - 1] * 3 - + dct_s[(i - 1) * 8 + j + 1] * 3 + dct_s[i * 8 + j - 1] * 10 - + dct_s[i * 8 + j + 1] * 10 + dct_s[(i + 1) * 8 + j - 1] * 3 - + dct_s[(i + 1) * 8 + j + 1] * 3) / + (pix_max * 16.f); + s_gy = (dct_s[(i - 1) * 8 + j - 1] * 3 - + dct_s[(i + 1) * 8 + j - 1] * 3 + dct_s[(i - 1) * 8 + j] * 10 - + dct_s[(i + 1) * 8 + j] * 10 + dct_s[(i - 1) * 8 + j + 1] * 3 - + dct_s[(i + 1) * 8 + j + 1] * 3) / + (pix_max * 16.f); + g = sqrt(s_gx * s_gx + s_gy * s_gy); + if (g > 0.1f) n++; + s_gmean += g; + } + } + s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f; +#if CONFIG_AV1_HIGHBITDEPTH + if (!buf_is_hbd) { + od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } else { + hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } +#else + od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); +#endif // CONFIG_AV1_HIGHBITDEPTH + for (i = 0; i < 8; i++) + for (j = (i == 0); j < 8; j++) + s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; + s_mask = sqrt(s_mask * s_gvar) / 8.f; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + double err; + err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j])); + if (i != 0 || j != 0) + err = err < s_mask / mask[i][j] ? 
0 : err - s_mask / mask[i][j]; + ret += (err * _csf[i][j]) * (err * _csf[i][j]); + pixels++; + } + } + } + } + if (pixels <= 0) return 0; + ret /= pixels; + ret += 0.04 * delt * delt; + return ret; +} + +double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst, + double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs, + uint32_t bd, uint32_t in_bd) { + double psnrhvs; + const double par = 1.0; + const int step = 7; + uint32_t bd_shift = 0; + assert(bd == 8 || bd == 10 || bd == 12); + assert(bd >= in_bd); + assert(src->flags == dst->flags); + const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH; + + int16_t pix_max = 255; + if (in_bd == 10) + pix_max = 1023; + else if (in_bd == 12) + pix_max = 4095; + + bd_shift = bd - in_bd; + + *y_psnrhvs = + calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, + par, src->y_crop_width, src->y_crop_height, step, csf_y, + bd_shift, buf_is_hbd, pix_max, 1); + *u_psnrhvs = + calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + par, src->uv_crop_width, src->uv_crop_height, step, + csf_cb420, bd_shift, buf_is_hbd, pix_max, 0); + *v_psnrhvs = + calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + par, src->uv_crop_width, src->uv_crop_height, step, + csf_cr420, bd_shift, buf_is_hbd, pix_max, 0); + psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); + return convert_score_db(psnrhvs, 1.0, pix_max); +} diff --git a/third_party/aom/aom_dsp/pyramid.c b/third_party/aom/aom_dsp/pyramid.c new file mode 100644 index 0000000000..324a18baea --- /dev/null +++ b/third_party/aom/aom_dsp/pyramid.c @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/pyramid.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" +#include "aom_util/aom_thread.h" + +// TODO(rachelbarker): Move needed code from av1/ to aom_dsp/ +#include "av1/common/resize.h" + +#include +#include + +// Lifecycle: +// * Frame buffer alloc code calls aom_get_pyramid_alloc_size() +// to work out how much space is needed for a given number of pyramid +// levels. This is counted in the size checked against the max allocation +// limit +// * Then calls aom_alloc_pyramid() to actually create the pyramid +// * Pyramid is initially marked as invalid (no data) +// * Whenever pyramid is needed, we check the valid flag. If set, use existing +// data. 
If not set, compute full pyramid +// * Whenever frame buffer is reused, clear the valid flag +// * Whenever frame buffer is resized, reallocate pyramid + +size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels, + bool image_is_16bit) { + // Limit number of levels on small frames + const int msb = get_msb(AOMMIN(width, height)); + const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + n_levels = AOMMIN(n_levels, max_levels); + + size_t alloc_size = 0; + alloc_size += sizeof(ImagePyramid); + alloc_size += n_levels * sizeof(PyramidLayer); + + // Calculate how much memory is needed for downscaled frame buffers + size_t buffer_size = 0; + + // Work out if we need to allocate a few extra bytes for alignment. + // aom_memalign() will ensure that the start of the allocation is aligned + // to a multiple of PYRAMID_ALIGNMENT. But we want the first image pixel + // to be aligned, not the first byte of the allocation. + // + // In the loop below, we ensure that the stride of every image is a multiple + // of PYRAMID_ALIGNMENT. Thus the allocated size of each pyramid level will + // also be a multiple of PYRAMID_ALIGNMENT. Thus, as long as we can get the + // first pixel in the first pyramid layer aligned properly, that will + // automatically mean that the first pixel of every row of every layer is + // properly aligned too. + // + // Thus all we need to consider is the first pixel in the first layer. + // This is located at offset + // extra_bytes + level_stride * PYRAMID_PADDING + PYRAMID_PADDING + // bytes into the buffer. Since level_stride is a multiple of + // PYRAMID_ALIGNMENT, we can ignore that. So we need + // extra_bytes + PYRAMID_PADDING = multiple of PYRAMID_ALIGNMENT + // + // To solve this, we can round PYRAMID_PADDING up to the next multiple + // of PYRAMID_ALIGNMENT, then subtract the orginal value to calculate + // how many extra bytes are needed. + size_t first_px_offset = + (PYRAMID_PADDING + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1); + size_t extra_bytes = first_px_offset - PYRAMID_PADDING; + buffer_size += extra_bytes; + + // If the original image is stored in an 8-bit buffer, then we can point the + // lowest pyramid level at that buffer rather than allocating a new one. + int first_allocated_level = image_is_16bit ? 0 : 1; + + for (int level = first_allocated_level; level < n_levels; level++) { + int level_width = width >> level; + int level_height = height >> level; + + // Allocate padding for each layer + int padded_width = level_width + 2 * PYRAMID_PADDING; + int padded_height = level_height + 2 * PYRAMID_PADDING; + + // Align the layer stride to be a multiple of PYRAMID_ALIGNMENT + // This ensures that, as long as the top-left pixel in this pyramid level is + // properly aligned, then so will the leftmost pixel in every row of the + // pyramid level. 
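+    // For example, with PYRAMID_ALIGNMENT == 32, a padded width of 50 would
+    // round up to a stride of 64: (50 + 31) & ~31 == 64.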
+ int level_stride = + (padded_width + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1); + + buffer_size += level_stride * padded_height; + } + + alloc_size += buffer_size; + + return alloc_size; +} + +ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, + bool image_is_16bit) { + // Limit number of levels on small frames + const int msb = get_msb(AOMMIN(width, height)); + const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); + n_levels = AOMMIN(n_levels, max_levels); + + ImagePyramid *pyr = aom_calloc(1, sizeof(*pyr)); + if (!pyr) { + return NULL; + } + + pyr->layers = aom_calloc(n_levels, sizeof(*pyr->layers)); + if (!pyr->layers) { + aom_free(pyr); + return NULL; + } + + pyr->valid = false; + pyr->n_levels = n_levels; + + // Compute sizes and offsets for each pyramid level + // These are gathered up first, so that we can allocate all pyramid levels + // in a single buffer + size_t buffer_size = 0; + size_t *layer_offsets = aom_calloc(n_levels, sizeof(*layer_offsets)); + if (!layer_offsets) { + aom_free(pyr->layers); + aom_free(pyr); + return NULL; + } + + // Work out if we need to allocate a few extra bytes for alignment. + // aom_memalign() will ensure that the start of the allocation is aligned + // to a multiple of PYRAMID_ALIGNMENT. But we want the first image pixel + // to be aligned, not the first byte of the allocation. + // + // In the loop below, we ensure that the stride of every image is a multiple + // of PYRAMID_ALIGNMENT. Thus the allocated size of each pyramid level will + // also be a multiple of PYRAMID_ALIGNMENT. Thus, as long as we can get the + // first pixel in the first pyramid layer aligned properly, that will + // automatically mean that the first pixel of every row of every layer is + // properly aligned too. + // + // Thus all we need to consider is the first pixel in the first layer. + // This is located at offset + // extra_bytes + level_stride * PYRAMID_PADDING + PYRAMID_PADDING + // bytes into the buffer. Since level_stride is a multiple of + // PYRAMID_ALIGNMENT, we can ignore that. So we need + // extra_bytes + PYRAMID_PADDING = multiple of PYRAMID_ALIGNMENT + // + // To solve this, we can round PYRAMID_PADDING up to the next multiple + // of PYRAMID_ALIGNMENT, then subtract the orginal value to calculate + // how many extra bytes are needed. + size_t first_px_offset = + (PYRAMID_PADDING + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1); + size_t extra_bytes = first_px_offset - PYRAMID_PADDING; + buffer_size += extra_bytes; + + // If the original image is stored in an 8-bit buffer, then we can point the + // lowest pyramid level at that buffer rather than allocating a new one. + int first_allocated_level = image_is_16bit ? 0 : 1; + + for (int level = first_allocated_level; level < n_levels; level++) { + PyramidLayer *layer = &pyr->layers[level]; + + int level_width = width >> level; + int level_height = height >> level; + + // Allocate padding for each layer + int padded_width = level_width + 2 * PYRAMID_PADDING; + int padded_height = level_height + 2 * PYRAMID_PADDING; + + // Align the layer stride to be a multiple of PYRAMID_ALIGNMENT + // This ensures that, as long as the top-left pixel in this pyramid level is + // properly aligned, then so will the leftmost pixel in every row of the + // pyramid level. 
+ int level_stride = + (padded_width + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1); + + size_t level_alloc_start = buffer_size; + size_t level_start = + level_alloc_start + PYRAMID_PADDING * level_stride + PYRAMID_PADDING; + + buffer_size += level_stride * padded_height; + + layer_offsets[level] = level_start; + layer->width = level_width; + layer->height = level_height; + layer->stride = level_stride; + } + + pyr->buffer_alloc = + aom_memalign(PYRAMID_ALIGNMENT, buffer_size * sizeof(*pyr->buffer_alloc)); + if (!pyr->buffer_alloc) { + aom_free(pyr->layers); + aom_free(pyr); + aom_free(layer_offsets); + return NULL; + } + + // Fill in pointers for each level + // If image is 8-bit, then the lowest level is left unconfigured for now, + // and will be set up properly when the pyramid is filled in + for (int level = first_allocated_level; level < n_levels; level++) { + PyramidLayer *layer = &pyr->layers[level]; + layer->buffer = pyr->buffer_alloc + layer_offsets[level]; + } + +#if CONFIG_MULTITHREAD + pthread_mutex_init(&pyr->mutex, NULL); +#endif // CONFIG_MULTITHREAD + + aom_free(layer_offsets); + return pyr; +} + +// Fill the border region of a pyramid frame. +// This must be called after the main image area is filled out. +// `img_buf` should point to the first pixel in the image area, +// ie. it should be pyr->level_buffer + pyr->level_loc[level]. +static INLINE void fill_border(uint8_t *img_buf, const int width, + const int height, const int stride) { + // Fill left and right areas + for (int row = 0; row < height; row++) { + uint8_t *row_start = &img_buf[row * stride]; + uint8_t left_pixel = row_start[0]; + memset(row_start - PYRAMID_PADDING, left_pixel, PYRAMID_PADDING); + uint8_t right_pixel = row_start[width - 1]; + memset(row_start + width, right_pixel, PYRAMID_PADDING); + } + + // Fill top area + for (int row = -PYRAMID_PADDING; row < 0; row++) { + uint8_t *row_start = &img_buf[row * stride]; + memcpy(row_start - PYRAMID_PADDING, img_buf - PYRAMID_PADDING, + width + 2 * PYRAMID_PADDING); + } + + // Fill bottom area + uint8_t *last_row_start = &img_buf[(height - 1) * stride]; + for (int row = height; row < height + PYRAMID_PADDING; row++) { + uint8_t *row_start = &img_buf[row * stride]; + memcpy(row_start - PYRAMID_PADDING, last_row_start - PYRAMID_PADDING, + width + 2 * PYRAMID_PADDING); + } +} + +// Compute coarse to fine pyramids for a frame +// This must only be called while holding frame_pyr->mutex +static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + ImagePyramid *frame_pyr) { + int n_levels = frame_pyr->n_levels; + const int frame_width = frame->y_crop_width; + const int frame_height = frame->y_crop_height; + const int frame_stride = frame->y_stride; + assert((frame_width >> n_levels) >= 0); + assert((frame_height >> n_levels) >= 0); + + PyramidLayer *first_layer = &frame_pyr->layers[0]; + if (frame->flags & YV12_FLAG_HIGHBITDEPTH) { + // For frames stored in a 16-bit buffer, we need to downconvert to 8 bits + assert(first_layer->width == frame_width); + assert(first_layer->height == frame_height); + + uint16_t *frame_buffer = CONVERT_TO_SHORTPTR(frame->y_buffer); + uint8_t *pyr_buffer = first_layer->buffer; + int pyr_stride = first_layer->stride; + for (int y = 0; y < frame_height; y++) { + uint16_t *frame_row = frame_buffer + y * frame_stride; + uint8_t *pyr_row = pyr_buffer + y * pyr_stride; + for (int x = 0; x < frame_width; x++) { + pyr_row[x] = frame_row[x] >> (bit_depth - 8); + } + } + + fill_border(pyr_buffer, frame_width, 
frame_height, pyr_stride); + } else { + // For frames stored in an 8-bit buffer, we need to configure the first + // pyramid layer to point at the original image buffer + first_layer->buffer = frame->y_buffer; + first_layer->width = frame_width; + first_layer->height = frame_height; + first_layer->stride = frame_stride; + } + + // Fill in the remaining levels through progressive downsampling + for (int level = 1; level < n_levels; ++level) { + PyramidLayer *prev_layer = &frame_pyr->layers[level - 1]; + uint8_t *prev_buffer = prev_layer->buffer; + int prev_stride = prev_layer->stride; + + PyramidLayer *this_layer = &frame_pyr->layers[level]; + uint8_t *this_buffer = this_layer->buffer; + int this_width = this_layer->width; + int this_height = this_layer->height; + int this_stride = this_layer->stride; + + // Compute the this pyramid level by downsampling the current level. + // + // We downsample by a factor of exactly 2, clipping the rightmost and + // bottommost pixel off of the current level if needed. We do this for + // two main reasons: + // + // 1) In the disflow code, when stepping from a higher pyramid level to a + // lower pyramid level, we need to not just interpolate the flow field + // but also to scale each flow vector by the upsampling ratio. + // So it is much more convenient if this ratio is simply 2. + // + // 2) Up/downsampling by a factor of 2 can be implemented much more + // efficiently than up/downsampling by a generic ratio. + // TODO(rachelbarker): Use optimized downsample-by-2 function + if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1, + prev_stride, this_buffer, this_height, this_width, + this_stride)) + return false; + fill_border(this_buffer, this_width, this_height, this_stride); + } + return true; +} + +// Fill out a downsampling pyramid for a given frame. +// +// The top level (index 0) will always be an 8-bit copy of the input frame, +// regardless of the input bit depth. Additional levels are then downscaled +// by powers of 2. +// +// For small input frames, the number of levels actually constructed +// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE +// pixels along each side. +// +// However, if the input frame has a side of length < MIN_PYRAMID_SIZE, +// we will still construct the top level. +bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + ImagePyramid *pyr) { + assert(pyr); + + // Per the comments in the ImagePyramid struct, we must take this mutex + // before reading or writing the "valid" flag, and hold it while computing + // the pyramid, to ensure proper behaviour if multiple threads call this + // function simultaneously +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&pyr->mutex); +#endif // CONFIG_MULTITHREAD + + if (!pyr->valid) { + pyr->valid = fill_pyramid(frame, bit_depth, pyr); + } + bool valid = pyr->valid; + + // At this point, the pyramid is guaranteed to be valid, and can be safely + // read from without holding the mutex any more + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&pyr->mutex); +#endif // CONFIG_MULTITHREAD + return valid; +} + +#ifndef NDEBUG +// Check if a pyramid has already been computed. 
+// This is mostly a debug helper - as it is necessary to hold pyr->mutex +// while reading the valid flag, we cannot just write: +// assert(pyr->valid); +// This function allows the check to be correctly written as: +// assert(aom_is_pyramid_valid(pyr)); +bool aom_is_pyramid_valid(ImagePyramid *pyr) { + assert(pyr); + + // Per the comments in the ImagePyramid struct, we must take this mutex + // before reading or writing the "valid" flag, and hold it while computing + // the pyramid, to ensure proper behaviour if multiple threads call this + // function simultaneously +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&pyr->mutex); +#endif // CONFIG_MULTITHREAD + + bool valid = pyr->valid; + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&pyr->mutex); +#endif // CONFIG_MULTITHREAD + + return valid; +} +#endif + +// Mark a pyramid as no longer containing valid data. +// This must be done whenever the corresponding frame buffer is reused +void aom_invalidate_pyramid(ImagePyramid *pyr) { + if (pyr) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&pyr->mutex); +#endif // CONFIG_MULTITHREAD + pyr->valid = false; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&pyr->mutex); +#endif // CONFIG_MULTITHREAD + } +} + +// Release the memory associated with a pyramid +void aom_free_pyramid(ImagePyramid *pyr) { + if (pyr) { +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&pyr->mutex); +#endif // CONFIG_MULTITHREAD + aom_free(pyr->buffer_alloc); + aom_free(pyr->layers); + aom_free(pyr); + } +} diff --git a/third_party/aom/aom_dsp/pyramid.h b/third_party/aom/aom_dsp/pyramid.h new file mode 100644 index 0000000000..9442a1ff08 --- /dev/null +++ b/third_party/aom/aom_dsp/pyramid.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_PYRAMID_H_ +#define AOM_AOM_DSP_PYRAMID_H_ + +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_scale/yv12config.h" +#include "aom_util/aom_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Minimum dimensions of a downsampled image +#define MIN_PYRAMID_SIZE_LOG2 3 +#define MIN_PYRAMID_SIZE (1 << MIN_PYRAMID_SIZE_LOG2) + +// Size of border around each pyramid image, in pixels +// Similarly to the border around regular image buffers, this border is filled +// with copies of the outermost pixels of the frame, to allow for more efficient +// convolution code +// TODO(rachelbarker): How many pixels do we actually need here? +// I think we only need 9 for disflow, but how many for corner matching? +#define PYRAMID_PADDING 16 + +// Byte alignment of each line within the image pyramids. +// That is, the first pixel inside the image (ie, not in the border region), +// on each row of each pyramid level, is aligned to this byte alignment. +// This value must be a power of 2. 
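+// (Keeping this a power of 2 lets the allocation code in pyramid.c round each
+// stride up with the usual mask trick:
+//   stride = (padded_width + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1).)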
+#define PYRAMID_ALIGNMENT 32 + +typedef struct { + uint8_t *buffer; + int width; + int height; + int stride; +} PyramidLayer; + +// Struct for an image pyramid +typedef struct image_pyramid { +#if CONFIG_MULTITHREAD + // Mutex which is used to prevent the pyramid being computed twice at the + // same time + // + // Semantics: + // * This mutex must be held whenever reading or writing the `valid` flag + // + // * This mutex must also be held while computing the image pyramid, + // to ensure that only one thread may do so at a time. + // + // * However, once you have read the valid flag and seen a true value, + // it is safe to drop the mutex and read from the remaining fields. + // This is because, once the image pyramid is computed, its contents + // will not be changed until the parent frame buffer is recycled, + // which will not happen until there are no more outstanding references + // to the frame buffer. + pthread_mutex_t mutex; +#endif + // Flag indicating whether the pyramid contains valid data + bool valid; + // Number of allocated/filled levels in this pyramid + int n_levels; + // Pointer to allocated buffer + uint8_t *buffer_alloc; + // Data for each level + // The `buffer` pointers inside this array point into the region which + // is stored in the `buffer_alloc` field here + PyramidLayer *layers; +} ImagePyramid; + +size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels, + bool image_is_16bit); + +ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, + bool image_is_16bit); + +// Fill out a downsampling pyramid for a given frame. +// +// The top level (index 0) will always be an 8-bit copy of the input frame, +// regardless of the input bit depth. Additional levels are then downscaled +// by powers of 2. +// +// For small input frames, the number of levels actually constructed +// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE +// pixels along each side. +// +// However, if the input frame has a side of length < MIN_PYRAMID_SIZE, +// we will still construct the top level. +bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + ImagePyramid *pyr); + +#ifndef NDEBUG +// Check if a pyramid has already been computed. +// This is mostly a debug helper - as it is necessary to hold pyr->mutex +// while reading the valid flag, we cannot just write: +// assert(pyr->valid); +// This function allows the check to be correctly written as: +// assert(aom_is_pyramid_valid(pyr)); +bool aom_is_pyramid_valid(ImagePyramid *pyr); +#endif + +// Mark a pyramid as no longer containing valid data. +// This must be done whenever the corresponding frame buffer is reused +void aom_invalidate_pyramid(ImagePyramid *pyr); + +// Release the memory associated with a pyramid +void aom_free_pyramid(ImagePyramid *pyr); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_DSP_PYRAMID_H_ diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c new file mode 100644 index 0000000000..e5c960b826 --- /dev/null +++ b/third_party/aom/aom_dsp/quantize.c @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "config/aom_dsp_rtcd.h" + +#if !CONFIG_REALTIME_ONLY +void aom_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int i, non_zero_count = (int)n_coeffs, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + int prescan_add[2]; + for (i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int prescan_add_val = prescan_add[rc != 0]; + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif // SKIP_EOB_FACTOR_ADJUST + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + + if (tmp32) { + eob = i; +#if SKIP_EOB_FACTOR_ADJUST + if (first == -1) first = i; +#endif // SKIP_EOB_FACTOR_ADJUST + } + } + } +#if SKIP_EOB_FACTOR_ADJUST + if (eob >= 0 && first == eob) { + const int rc = scan[eob]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = qm_ptr != NULL ? 
qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + eob = -1; + } + } + } +#endif // SKIP_EOB_FACTOR_ADJUST + *eob_ptr = eob + 1; +} +#endif // !CONFIG_REALTIME_ONLY + +void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, + const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int i, non_zero_count = (int)n_coeffs, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const int iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + + if (tmp32) eob = i; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_AV1_HIGHBITDEPTH +#if !CONFIG_REALTIME_ONLY +void aom_highbd_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + (void)iscan; + int i, non_zero_count = (int)n_coeffs, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + int prescan_add[2]; + for (i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int prescan_add_val = prescan_add[rc != 0]; + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif // SKIP_EOB_FACTOR_ADJUST + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) { + eob = i; +#if SKIP_EOB_FACTOR_ADJUST + if (first == -1) first = eob; +#endif // SKIP_EOB_FACTOR_ADJUST + } + } + } +#if SKIP_EOB_FACTOR_ADJUST + if (eob >= 0 && first == eob) { + const int rc = scan[eob]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = qm_ptr != NULL ? 
qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + eob = -1; + } + } + } +#endif // SKIP_EOB_FACTOR_ADJUST + *eob_ptr = eob + 1; +} +#endif // !CONFIG_REALTIME_ONLY + +void aom_highbd_quantize_b_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + int i, eob = -1; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int dequant; + int idx_arr[4096]; + (void)iscan; + int idx = 0; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || + coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = idx_arr[i]; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if !CONFIG_REALTIME_ONLY +/* These functions should only be called when quantisation matrices + are not used. 
*/ +void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 0); +} + +void aom_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 1); +} + +void aom_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 2); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void aom_highbd_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void aom_highbd_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // !CONFIG_REALTIME_ONLY + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + 
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 0); +} + +void aom_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 1); +} + +void aom_highbd_quantize_b_64x64_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 2); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h new file mode 100644 index 0000000000..efe253ddb9 --- /dev/null +++ b/third_party/aom/aom_dsp/quantize.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_QUANTIZE_H_ +#define AOM_AOM_DSP_QUANTIZE_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define EOB_FACTOR 325 +#define SKIP_EOB_FACTOR_ADJUST 200 + +void aom_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_highbd_quantize_b_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_highbd_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_highbd_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t 
*iscan); +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, + const int log_scale); + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); +#endif // CONFIG_AV1_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_QUANTIZE_H_ diff --git a/third_party/aom/aom_dsp/recenter.h b/third_party/aom/aom_dsp/recenter.h new file mode 100644 index 0000000000..b3fd412907 --- /dev/null +++ b/third_party/aom/aom_dsp/recenter.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_RECENTER_H_
+#define AOM_AOM_DSP_RECENTER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// Inverse recenters a non-negative literal v around a reference r
+static INLINE uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
+  if (v > (r << 1))
+    return v;
+  else if ((v & 1) == 0)
+    return (v >> 1) + r;
+  else
+    return r - ((v + 1) >> 1);
+}
+
+// Inverse recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static INLINE uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r,
+                                                  uint16_t v) {
+  if ((r << 1) <= n) {
+    return inv_recenter_nonneg(r, v);
+  } else {
+    return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
+  }
+}
+
+// Recenters a non-negative literal v around a reference r
+static INLINE uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
+  if (v > (r << 1))
+    return v;
+  else if (v >= r)
+    return ((v - r) << 1);
+  else
+    return ((r - v) << 1) - 1;
+}
+
+// Recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static INLINE uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r,
+                                              uint16_t v) {
+  if ((r << 1) <= n) {
+    return recenter_nonneg(r, v);
+  } else {
+    return recenter_nonneg(n - 1 - r, n - 1 - v);
+  }
+}
+
+#endif  // AOM_AOM_DSP_RECENTER_H_
diff --git a/third_party/aom/aom_dsp/rect.h b/third_party/aom/aom_dsp/rect.h
new file mode 100644
index 0000000000..11bdaca979
--- /dev/null
+++ b/third_party/aom/aom_dsp/rect.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_RECT_H_
+#define AOM_AOM_DSP_RECT_H_
+
+#include "config/aom_config.h"
+
+#include <stdbool.h>
+
+// Struct representing a rectangle of pixels.
+// The axes are inclusive-exclusive, ie. the point (top, left) is included
+// in the rectangle but (bottom, right) is not.
+typedef struct {
+  int left, right, top, bottom;
+} PixelRect;
+
+static INLINE int rect_width(const PixelRect *r) { return r->right - r->left; }
+
+static INLINE int rect_height(const PixelRect *r) { return r->bottom - r->top; }
+
+static INLINE bool is_inside_rect(const int x, const int y,
+                                  const PixelRect *r) {
+  return (r->left <= x && x < r->right) && (r->top <= y && y < r->bottom);
+}
+
+#endif  // AOM_AOM_DSP_RECT_H_
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
new file mode 100644
index 0000000000..8d69e3bf1c
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+/* Sum the difference between every corresponding element of the buffers. */
+static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
+                               int b_stride, int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      sad += abs(a[x] - b[x]);
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sad;
+}
+
+#define SADMXN(m, n) \
+  unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                    const uint8_t *ref, int ref_stride) { \
+    return sad(src, src_stride, ref, ref_stride, m, n); \
+  } \
+  unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+                                        const uint8_t *ref, int ref_stride, \
+                                        const uint8_t *second_pred) { \
+    uint8_t comp_pred[m * n]; \
+    aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+    return sad(src, src_stride, comp_pred, m, m, n); \
+  } \
+  unsigned int aom_dist_wtd_sad##m##x##n##_avg_c( \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    uint8_t comp_pred[m * n]; \
+    aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, \
+                                 ref_stride, jcp_param); \
+    return sad(src, src_stride, comp_pred, m, m, n); \
+  } \
+  unsigned int aom_sad_skip_##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref, \
+                                          int ref_stride) { \
+    return 2 * sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
+  }
+
+// Calculate sad against 4 reference locations and store each in sad_array
+#define SAD_MXNX4D(m, n) \
+  void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                               const uint8_t *const ref_array[4], \
+                               int ref_stride, uint32_t sad_array[4]) { \
+    int i; \
+    for (i = 0; i < 4; ++i) { \
+      sad_array[i] = \
+          aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+    } \
+  } \
+  void aom_sad_skip_##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                                     const uint8_t *const ref_array[4], \
+                                     int ref_stride, uint32_t sad_array[4]) { \
+    int i; \
+    for (i = 0; i < 4; ++i) { \
+      sad_array[i] = 2 * sad(src, 2 * src_stride, ref_array[i], \
+                             2 * ref_stride, (m), (n / 2)); \
+    } \
+  }
+// Call SIMD version of aom_sad_mxnx4d if the 3d version is unavailable.
+#define SAD_MXNX3D(m, n) \ + void aom_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + aom_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, sad_array); \ + } + +// 128x128 +SADMXN(128, 128) +SAD_MXNX4D(128, 128) +SAD_MXNX3D(128, 128) + +// 128x64 +SADMXN(128, 64) +SAD_MXNX4D(128, 64) +SAD_MXNX3D(128, 64) + +// 64x128 +SADMXN(64, 128) +SAD_MXNX4D(64, 128) +SAD_MXNX3D(64, 128) + +// 64x64 +SADMXN(64, 64) +SAD_MXNX4D(64, 64) +SAD_MXNX3D(64, 64) + +// 64x32 +SADMXN(64, 32) +SAD_MXNX4D(64, 32) +SAD_MXNX3D(64, 32) + +// 32x64 +SADMXN(32, 64) +SAD_MXNX4D(32, 64) +SAD_MXNX3D(32, 64) + +// 32x32 +SADMXN(32, 32) +SAD_MXNX4D(32, 32) +SAD_MXNX3D(32, 32) + +// 32x16 +SADMXN(32, 16) +SAD_MXNX4D(32, 16) +SAD_MXNX3D(32, 16) + +// 16x32 +SADMXN(16, 32) +SAD_MXNX4D(16, 32) +SAD_MXNX3D(16, 32) + +// 16x16 +SADMXN(16, 16) +SAD_MXNX4D(16, 16) +SAD_MXNX3D(16, 16) + +// 16x8 +SADMXN(16, 8) +SAD_MXNX4D(16, 8) +SAD_MXNX3D(16, 8) + +// 8x16 +SADMXN(8, 16) +SAD_MXNX4D(8, 16) +SAD_MXNX3D(8, 16) + +// 8x8 +SADMXN(8, 8) +SAD_MXNX4D(8, 8) +SAD_MXNX3D(8, 8) + +// 8x4 +SADMXN(8, 4) +SAD_MXNX4D(8, 4) +SAD_MXNX3D(8, 4) + +// 4x8 +SADMXN(4, 8) +SAD_MXNX4D(4, 8) +SAD_MXNX3D(4, 8) + +// 4x4 +SADMXN(4, 4) +SAD_MXNX4D(4, 4) +SAD_MXNX3D(4, 4) + +#if !CONFIG_REALTIME_ONLY +SADMXN(4, 16) +SAD_MXNX4D(4, 16) +SADMXN(16, 4) +SAD_MXNX4D(16, 4) +SADMXN(8, 32) +SAD_MXNX4D(8, 32) +SADMXN(32, 8) +SAD_MXNX4D(32, 8) +SADMXN(16, 64) +SAD_MXNX4D(16, 64) +SADMXN(64, 16) +SAD_MXNX4D(64, 16) +SAD_MXNX3D(4, 16) +SAD_MXNX3D(16, 4) +SAD_MXNX3D(8, 32) +SAD_MXNX3D(32, 8) +SAD_MXNX3D(16, 64) +SAD_MXNX3D(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + sad += abs(a[x] - b[x]); + } + + a += a_stride; + b += b_stride; + } + return sad; +} + +static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + sad += abs(a[x] - b[x]); + } + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define HIGHBD_SADMXN(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_highbd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint16_t comp_pred[m * n]; \ + uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \ + aom_highbd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, ref_stride); \ + return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \ + } \ + unsigned int aom_highbd_dist_wtd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t comp_pred[m * n]; \ + uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \ + 
aom_highbd_dist_wtd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, \ + ref_stride, jcp_param); \ + return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \ + } \ + unsigned int aom_highbd_sad_skip_##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ + } + +#define HIGHBD_SAD_MXNX4D(m, n) \ + void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } \ + void aom_highbd_sad_skip_##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i], \ + 2 * ref_stride, (m), (n / 2)); \ + } \ + } +// Call SIMD version of aom_highbd_sad_mxnx4d if the 3d version is unavailable. +#define HIGHBD_SAD_MXNX3D(m, n) \ + void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, \ + sad_array); \ + } + +// 128x128 +HIGHBD_SADMXN(128, 128) +HIGHBD_SAD_MXNX4D(128, 128) +HIGHBD_SAD_MXNX3D(128, 128) + +// 128x64 +HIGHBD_SADMXN(128, 64) +HIGHBD_SAD_MXNX4D(128, 64) +HIGHBD_SAD_MXNX3D(128, 64) + +// 64x128 +HIGHBD_SADMXN(64, 128) +HIGHBD_SAD_MXNX4D(64, 128) +HIGHBD_SAD_MXNX3D(64, 128) + +// 64x64 +HIGHBD_SADMXN(64, 64) +HIGHBD_SAD_MXNX4D(64, 64) +HIGHBD_SAD_MXNX3D(64, 64) + +// 64x32 +HIGHBD_SADMXN(64, 32) +HIGHBD_SAD_MXNX4D(64, 32) +HIGHBD_SAD_MXNX3D(64, 32) + +// 32x64 +HIGHBD_SADMXN(32, 64) +HIGHBD_SAD_MXNX4D(32, 64) +HIGHBD_SAD_MXNX3D(32, 64) + +// 32x32 +HIGHBD_SADMXN(32, 32) +HIGHBD_SAD_MXNX4D(32, 32) +HIGHBD_SAD_MXNX3D(32, 32) + +// 32x16 +HIGHBD_SADMXN(32, 16) +HIGHBD_SAD_MXNX4D(32, 16) +HIGHBD_SAD_MXNX3D(32, 16) + +// 16x32 +HIGHBD_SADMXN(16, 32) +HIGHBD_SAD_MXNX4D(16, 32) +HIGHBD_SAD_MXNX3D(16, 32) + +// 16x16 +HIGHBD_SADMXN(16, 16) +HIGHBD_SAD_MXNX4D(16, 16) +HIGHBD_SAD_MXNX3D(16, 16) + +// 16x8 +HIGHBD_SADMXN(16, 8) +HIGHBD_SAD_MXNX4D(16, 8) +HIGHBD_SAD_MXNX3D(16, 8) + +// 8x16 +HIGHBD_SADMXN(8, 16) +HIGHBD_SAD_MXNX4D(8, 16) +HIGHBD_SAD_MXNX3D(8, 16) + +// 8x8 +HIGHBD_SADMXN(8, 8) +HIGHBD_SAD_MXNX4D(8, 8) +HIGHBD_SAD_MXNX3D(8, 8) + +// 8x4 +HIGHBD_SADMXN(8, 4) +HIGHBD_SAD_MXNX4D(8, 4) +HIGHBD_SAD_MXNX3D(8, 4) + +// 4x8 +HIGHBD_SADMXN(4, 8) +HIGHBD_SAD_MXNX4D(4, 8) +HIGHBD_SAD_MXNX3D(4, 8) + +// 4x4 +HIGHBD_SADMXN(4, 4) +HIGHBD_SAD_MXNX4D(4, 4) +HIGHBD_SAD_MXNX3D(4, 4) + +HIGHBD_SADMXN(4, 16) +HIGHBD_SAD_MXNX4D(4, 16) +HIGHBD_SADMXN(16, 4) +HIGHBD_SAD_MXNX4D(16, 4) +HIGHBD_SADMXN(8, 32) +HIGHBD_SAD_MXNX4D(8, 32) +HIGHBD_SADMXN(32, 8) +HIGHBD_SAD_MXNX4D(32, 8) +HIGHBD_SADMXN(16, 64) +HIGHBD_SAD_MXNX4D(16, 64) +HIGHBD_SADMXN(64, 16) +HIGHBD_SAD_MXNX4D(64, 16) + +#if !CONFIG_REALTIME_ONLY +HIGHBD_SAD_MXNX3D(4, 16) +HIGHBD_SAD_MXNX3D(16, 4) +HIGHBD_SAD_MXNX3D(8, 32) +HIGHBD_SAD_MXNX3D(32, 8) +HIGHBD_SAD_MXNX3D(16, 64) +HIGHBD_SAD_MXNX3D(64, 16) +#endif // !CONFIG_REALTIME_ONLY +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c new file mode 100644 index 0000000000..f3d5847bd5 --- /dev/null +++ 
b/third_party/aom/aom_dsp/sad_av1.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/blend.h" + +static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride, + const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int width, + int height) { + int y, x; + unsigned int sad = 0; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); + sad += abs(pred - src[x]); + } + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } + return sad; +} + +#define MASKSADMxN(m, n) \ + unsigned int aom_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \ + msk_stride, m, n); \ + else \ + return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } \ + void aom_masked_sad##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int invert_mask, unsigned sads[4]) { \ + if (!invert_mask) \ + for (int i = 0; i < 4; i++) { \ + sads[i] = masked_sad(src, src_stride, ref[i], ref_stride, second_pred, \ + m, msk, msk_stride, m, n); \ + } \ + else \ + for (int i = 0; i < 4; i++) { \ + sads[i] = masked_sad(src, src_stride, second_pred, m, ref[i], \ + ref_stride, msk, msk_stride, m, n); \ + } \ + } + +/* clang-format off */ +MASKSADMxN(128, 128) +MASKSADMxN(128, 64) +MASKSADMxN(64, 128) +MASKSADMxN(64, 64) +MASKSADMxN(64, 32) +MASKSADMxN(32, 64) +MASKSADMxN(32, 32) +MASKSADMxN(32, 16) +MASKSADMxN(16, 32) +MASKSADMxN(16, 16) +MASKSADMxN(16, 8) +MASKSADMxN(8, 16) +MASKSADMxN(8, 8) +MASKSADMxN(8, 4) +MASKSADMxN(4, 8) +MASKSADMxN(4, 4) +MASKSADMxN(4, 16) +MASKSADMxN(16, 4) +MASKSADMxN(8, 32) +MASKSADMxN(32, 8) +MASKSADMxN(16, 64) +MASKSADMxN(64, 16) +/* clang-format on */ + +#if CONFIG_AV1_HIGHBITDEPTH + static INLINE + unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int width, + int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); + sad += abs(pred - src[x]); + } + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } + + return sad; +} + +#define HIGHBD_MASKSADMXN(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_c( \ + const uint8_t 
*src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ + } + +HIGHBD_MASKSADMXN(128, 128) +HIGHBD_MASKSADMXN(128, 64) +HIGHBD_MASKSADMXN(64, 128) +HIGHBD_MASKSADMXN(64, 64) +HIGHBD_MASKSADMXN(64, 32) +HIGHBD_MASKSADMXN(32, 64) +HIGHBD_MASKSADMXN(32, 32) +HIGHBD_MASKSADMXN(32, 16) +HIGHBD_MASKSADMXN(16, 32) +HIGHBD_MASKSADMXN(16, 16) +HIGHBD_MASKSADMXN(16, 8) +HIGHBD_MASKSADMXN(8, 16) +HIGHBD_MASKSADMXN(8, 8) +HIGHBD_MASKSADMXN(8, 4) +HIGHBD_MASKSADMXN(4, 8) +HIGHBD_MASKSADMXN(4, 4) +HIGHBD_MASKSADMXN(4, 16) +HIGHBD_MASKSADMXN(16, 4) +HIGHBD_MASKSADMXN(8, 32) +HIGHBD_MASKSADMXN(32, 8) +HIGHBD_MASKSADMXN(16, 64) +HIGHBD_MASKSADMXN(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if !CONFIG_REALTIME_ONLY +// pre: predictor being evaluated +// wsrc: target weighted prediction (has been *4096 to keep precision) +// mask: 2d weights (scaled by 4096) +static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define OBMCSADMxN(m, n) \ + unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *mask) { \ + return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +OBMCSADMxN(128, 128) +OBMCSADMxN(128, 64) +OBMCSADMxN(64, 128) +OBMCSADMxN(64, 64) +OBMCSADMxN(64, 32) +OBMCSADMxN(32, 64) +OBMCSADMxN(32, 32) +OBMCSADMxN(32, 16) +OBMCSADMxN(16, 32) +OBMCSADMxN(16, 16) +OBMCSADMxN(16, 8) +OBMCSADMxN(8, 16) +OBMCSADMxN(8, 8) +OBMCSADMxN(8, 4) +OBMCSADMxN(4, 8) +OBMCSADMxN(4, 4) +OBMCSADMxN(4, 16) +OBMCSADMxN(16, 4) +OBMCSADMxN(8, 32) +OBMCSADMxN(32, 8) +OBMCSADMxN(16, 64) +OBMCSADMxN(64, 16) +/* clang-format on */ + +#if CONFIG_AV1_HIGHBITDEPTH + static INLINE + unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define HIGHBD_OBMCSADMXN(m, n) \ + unsigned int aom_highbd_obmc_sad##m##x##n##_c( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +HIGHBD_OBMCSADMXN(128, 128) +HIGHBD_OBMCSADMXN(128, 64) +HIGHBD_OBMCSADMXN(64, 128) +HIGHBD_OBMCSADMXN(64, 64) +HIGHBD_OBMCSADMXN(64, 32) +HIGHBD_OBMCSADMXN(32, 64) +HIGHBD_OBMCSADMXN(32, 32) +HIGHBD_OBMCSADMXN(32, 16) +HIGHBD_OBMCSADMXN(16, 32) +HIGHBD_OBMCSADMXN(16, 16) +HIGHBD_OBMCSADMXN(16, 8) +HIGHBD_OBMCSADMXN(8, 16) +HIGHBD_OBMCSADMXN(8, 8) +HIGHBD_OBMCSADMXN(8, 4) +HIGHBD_OBMCSADMXN(4, 8) +HIGHBD_OBMCSADMXN(4, 4) +HIGHBD_OBMCSADMXN(4, 16) +HIGHBD_OBMCSADMXN(16, 4) +HIGHBD_OBMCSADMXN(8, 32) 
+HIGHBD_OBMCSADMXN(32, 8) +HIGHBD_OBMCSADMXN(16, 64) +HIGHBD_OBMCSADMXN(64, 16) +/* clang-format on */ +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h new file mode 100644 index 0000000000..218a7a6186 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ + +#include +#include +#include + +#include "aom_dsp/simd/v128_intrinsics_c.h" +#include "aom_dsp/simd/v64_intrinsics.h" + +/* Fallback to plain, unoptimised C. */ + +typedef c_v128 v128; + +SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); } +SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); } +SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); } +SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) { + return c_v128_from_64(hi, lo); +} +SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) { + return c_v128_from_v64(hi, lo); +} +SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return c_v128_from_32(a, b, c, d); +} + +SIMD_INLINE v128 v128_load_unaligned(const void *p) { + return c_v128_load_unaligned(p); +} +SIMD_INLINE v128 v128_load_aligned(const void *p) { + return c_v128_load_aligned(p); +} + +SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { + c_v128_store_unaligned(p, a); +} +SIMD_INLINE void v128_store_aligned(void *p, v128 a) { + c_v128_store_aligned(p, a); +} + +SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { + return c_v128_align(a, b, c); +} + +SIMD_INLINE v128 v128_zero(void) { return c_v128_zero(); } +SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); } +SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); } +SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); } +SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); } + +SIMD_INLINE c_sad128_internal v128_sad_u8_init(void) { + return c_v128_sad_u8_init(); +} +SIMD_INLINE c_sad128_internal v128_sad_u8(c_sad128_internal s, v128 a, v128 b) { + return c_v128_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v128_sad_u8_sum(c_sad128_internal s) { + return c_v128_sad_u8_sum(s); +} +SIMD_INLINE c_ssd128_internal v128_ssd_u8_init(void) { + return c_v128_ssd_u8_init(); +} +SIMD_INLINE c_ssd128_internal v128_ssd_u8(c_ssd128_internal s, v128 a, v128 b) { + return c_v128_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v128_ssd_u8_sum(c_ssd128_internal s) { + return c_v128_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { + return c_v128_dotp_su8(a, b); +} +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + return c_v128_dotp_s16(a, b); +} +SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { + return c_v128_dotp_s32(a, b); +} +SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); } + +SIMD_INLINE v128 v128_or(v128 a, v128 b) { 
return c_v128_or(a, b); } +SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); } +SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); } +SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); } + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); } +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); } +SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); } +SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); } +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); } +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); } +SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); } +SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); } +SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); } +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); } +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); } +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); } +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); } +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); } +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); } +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); } +SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); } +SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); } +SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); } + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); } +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return c_v128_mullo_s16(a, b); +} +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return c_v128_mulhi_s16(a, b); +} +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { + return c_v128_mullo_s32(a, b); +} +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); } +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); } + +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); } +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { + return c_v128_blend_8(a, b, c); +} + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); } +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); } +SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { + return c_v128_rdavg_u16(a, b); +} +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); } +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); } +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); } +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); } +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); } +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); } +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); } +SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); } +SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); } + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); } +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); } +SIMD_INLINE v128 v128_ziplo_16(v128 a, 
v128 b) { return c_v128_ziplo_16(a, b); } +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); } +SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); } +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); } +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); } +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); } +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); } +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); } +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); } +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { + return c_v128_unziplo_8(a, b); +} +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return c_v128_unziphi_8(a, b); +} +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { + return c_v128_unziplo_16(a, b); +} +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return c_v128_unziphi_16(a, b); +} +SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { + return c_v128_unziplo_32(a, b); +} +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return c_v128_unziphi_32(a, b); +} +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return c_v128_unpacklo_u8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return c_v128_unpackhi_u8_s16(a); +} +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return c_v128_unpacklo_s8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return c_v128_unpackhi_s8_s16(a); +} +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return c_v128_pack_s32_s16(a, b); +} +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { + return c_v128_pack_s32_u16(a, b); +} +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return c_v128_pack_s16_u8(a, b); +} +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return c_v128_pack_s16_s8(a, b); +} +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); } +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); } +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return c_v128_unpacklo_u16_s32(a); +} +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return c_v128_unpacklo_s16_s32(a); +} +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return c_v128_unpackhi_u16_s32(a); +} +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return c_v128_unpackhi_s16_s32(a); +} +SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) { + return c_v128_shuffle_8(a, pattern); +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); } +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); } +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); } +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return c_v128_cmpgt_s16(a, b); +} +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return c_v128_cmplt_s16(a, b); +} +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { + return c_v128_cmpgt_s32(a, b); +} +SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { + return c_v128_cmplt_s32(a, b); +} +SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return c_v128_shl_8(a, 
c); +} +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return c_v128_shr_u8(a, c); +} +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + return c_v128_shr_s8(a, c); +} +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return c_v128_shl_16(a, c); +} +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return c_v128_shr_u16(a, c); +} +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return c_v128_shr_s16(a, c); +} +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return c_v128_shl_32(a, c); +} +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return c_v128_shr_u32(a, c); +} +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return c_v128_shr_s32(a, c); +} +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return c_v128_shl_64(a, c); +} +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return c_v128_shr_u64(a, c); +} +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + return c_v128_shr_s64(a, c); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + return c_v128_shr_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + return c_v128_shl_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) { + return c_v128_shl_n_8(a, n); +} +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) { + return c_v128_shl_n_16(a, n); +} +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) { + return c_v128_shl_n_32(a, n); +} +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) { + return c_v128_shl_n_64(a, n); +} +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) { + return c_v128_shr_n_u8(a, n); +} +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) { + return c_v128_shr_n_u16(a, n); +} +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) { + return c_v128_shr_n_u32(a, n); +} +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) { + return c_v128_shr_n_u64(a, n); +} +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) { + return c_v128_shr_n_s8(a, n); +} +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) { + return c_v128_shr_n_s16(a, n); +} +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) { + return c_v128_shr_n_s32(a, n); +} +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) { + return c_v128_shr_n_s64(a, n); +} + +typedef uint32_t sad128_internal_u16; +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { + return c_v128_sad_u16_init(); +} +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { + return c_v128_sad_u16(s, a, b); +} +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + return c_v128_sad_u16_sum(s); +} + +typedef uint64_t ssd128_internal_s16; +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { + return c_v128_ssd_s16_init(); +} +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, + v128 b) { + return c_v128_ssd_s16(s, a, b); +} +SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { + return c_v128_ssd_s16_sum(s); +} + +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h new file mode 100644 index 0000000000..f5ca817fb6 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h @@ -0,0 +1,898 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ + +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/simd/v64_intrinsics_c.h" + +typedef union { + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + int8_t s8[16]; + int16_t s16[8]; + int32_t s32[4]; + int64_t s64[2]; + c_v64 v64[2]; +} c_v128; + +SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; } + +SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; } + +SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; } + +SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) { + c_v128 t; + t.u64[1] = hi; + t.u64[0] = lo; + return t; +} + +SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) { + c_v128 t; + t.v64[1] = hi; + t.v64[0] = lo; + return t; +} + +SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c, + uint32_t d) { + c_v128 t; + t.u32[3] = a; + t.u32[2] = b; + t.u32[1] = c; + t.u32[0] = d; + return t; +} + +SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) { + c_v128 t; + memcpy(&t, p, 16); + return t; +} + +SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 15) { + fprintf(stderr, "Error: unaligned v128 load at %p\n", p); + abort(); + } + return c_v128_load_unaligned(p); +} + +SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) { + memcpy(p, &a, 16); +} + +SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) { + if (SIMD_CHECK && (uintptr_t)p & 15) { + fprintf(stderr, "Error: unaligned v128 store at %p\n", p); + abort(); + } + c_v128_store_unaligned(p, a); +} + +SIMD_INLINE c_v128 c_v128_zero(void) { + c_v128 t; + t.u64[1] = t.u64[0] = 0; + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) { + c_v128 t; + t.v64[1] = t.v64[0] = c_v64_dup_8(x); + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) { + c_v128 t; + t.v64[1] = t.v64[0] = c_v64_dup_16(x); + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) { + c_v128 t; + t.v64[1] = t.v64[0] = c_v64_dup_32(x); + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) { + c_v128 t; + t.u64[1] = t.u64[0] = x; + return t; +} + +SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) { + return c_v64_dotp_su8(a.v64[1], b.v64[1]) + + c_v64_dotp_su8(a.v64[0], b.v64[0]); +} + +SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) { + return c_v64_dotp_s16(a.v64[1], b.v64[1]) + + c_v64_dotp_s16(a.v64[0], b.v64[0]); +} + +SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) { + // 32 bit products, 64 bit sum + return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) + + (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) + + (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) + + (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]); +} + +SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) { + return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]); +} + +typedef struct { + uint32_t val; + int count; +} c_sad128_internal; + +SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) { + c_sad128_internal t; + t.val = t.count = 0; + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u8_sum(). 
The result for more than 32 v128_sad_u8() calls is + * undefined. */ +SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) + s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + s.count++; + if (SIMD_CHECK && s.count > 32) { + fprintf(stderr, + "Error: sad called 32 times returning an undefined result\n"); + abort(); + } + return s; +} + +SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; } + +typedef uint32_t c_ssd128_internal; + +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_u8_sum(). */ +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; } + +SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]), + c_v64_or(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]), + c_v64_xor(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]), + c_v64_and(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]), + c_v64_andn(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]), + c_v64_add_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]), + c_v64_add_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]), + c_v64_sadd_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]), + c_v64_sadd_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]), + c_v64_sadd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]), + c_v64_add_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) { + // Two complement overflow (silences sanitizers) + return c_v128_from_64( + a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1 + : a.v64[1].u64 + b.v64[1].u64, + a.v64[0].u64 > ~b.v64[0].u64 ? 
a.v64[0].u64 - ~b.v64[0].u64 - 1 + : a.v64[0].u64 + b.v64[0].u64); +} + +SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) { + c_v128 t; + t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; + t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; + t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; + t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; + return t; +} + +SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) { + c_v128 t; + t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1]; + t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3]; + t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5]; + t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7]; + t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9]; + t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11]; + t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13]; + t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15]; + return t; +} + +SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]), + c_v64_sub_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]), + c_v64_ssub_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]), + c_v64_ssub_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]), + c_v64_sub_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]), + c_v64_ssub_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]), + c_v64_ssub_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]), + c_v64_sub_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) { + // Two complement underflow (silences sanitizers) + return c_v128_from_64( + a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1 + : a.v64[1].u64 - b.v64[1].u64, + a.v64[0].u64 < b.v64[0].u64 ? 
a.v64[0].u64 + ~b.v64[0].u64 + 1 + : a.v64[0].u64 - b.v64[0].u64); +} + +SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) { + c_v64 lo_bits = c_v64_mullo_s16(a, b); + c_v64 hi_bits = c_v64_mulhi_s16(a, b); + return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits), + c_v64_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]), + c_v64_mullo_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]), + c_v64_mulhi_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]), + c_v64_mullo_s32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]), + c_v64_madd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]), + c_v64_madd_us8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]), + c_v64_avg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]), + c_v64_rdavg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]), + c_v64_rdavg_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]), + c_v64_avg_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]), + c_v64_min_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]), + c_v64_max_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]), + c_v64_min_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) { + return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | + ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | + ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | + ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | + ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | + ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | + ((a.s8[0] < 0) << 0); +} + +SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) { + c_v128 t; + for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? 
b.u8[i] : a.u8[i]; + return t; +} + +SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]), + c_v64_max_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]), + c_v64_min_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]), + c_v64_max_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c]; + return t; +} + +SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c]; + return t; +} + +SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]), + c_v64_ziplo_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]), + c_v64_ziplo_8(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]), + c_v64_ziplo_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]), + c_v64_ziplo_16(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]), + c_v64_ziplo_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]), + c_v64_ziplo_32(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[0], b.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[1], b.v64[1]); +} + +SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b)); +} + +SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u8[15] = b.u8[15]; + t.u8[14] = b.u8[13]; + t.u8[13] = b.u8[11]; + t.u8[12] = b.u8[9]; + t.u8[11] = b.u8[7]; + t.u8[10] = b.u8[5]; + t.u8[9] = b.u8[3]; + t.u8[8] = b.u8[1]; + t.u8[7] = a.u8[15]; + t.u8[6] = a.u8[13]; + t.u8[5] = a.u8[11]; + t.u8[4] = a.u8[9]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[15] = a.u8[14]; + t.u8[14] = a.u8[12]; + t.u8[13] = a.u8[10]; + t.u8[12] = a.u8[8]; + t.u8[11] = a.u8[6]; + t.u8[10] = a.u8[4]; + t.u8[9] = a.u8[2]; + t.u8[8] = a.u8[0]; + t.u8[7] = b.u8[14]; + t.u8[6] = b.u8[12]; + t.u8[5] = b.u8[10]; + t.u8[4] = b.u8[8]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1) + : _c_v128_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v128_unzip_8(b, a, 0) + : _c_v128_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u16[7] = b.u16[7]; + t.u16[6] = b.u16[5]; + t.u16[5] = b.u16[3]; + t.u16[4] = b.u16[1]; + t.u16[3] = a.u16[7]; + t.u16[2] = a.u16[5]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[7] = a.u16[6]; + t.u16[6] = a.u16[4]; + t.u16[5] = a.u16[2]; + t.u16[4] = a.u16[0]; + t.u16[3] = b.u16[6]; + t.u16[2] = b.u16[4]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1) + : _c_v128_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0) + : _c_v128_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u32[3] = b.u32[3]; + t.u32[2] = b.u32[1]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[3] = a.u32[2]; + t.u32[2] = a.u32[0]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1) + : _c_v128_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0) + : _c_v128_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]), + c_v64_unpacklo_u8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]), + c_v64_unpacklo_u8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]), + c_v64_unpacklo_s8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]), + c_v64_unpacklo_s8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]), + c_v64_pack_s32_s16(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]), + c_v64_pack_s32_u16(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]), + c_v64_pack_s16_u8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]), + c_v64_pack_s16_s8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]), + c_v64_unpacklo_u16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) { + return 
c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]), + c_v64_unpacklo_s16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]), + c_v64_unpacklo_u16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]), + c_v64_unpacklo_s16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) { + c_v128 t; + int c; + for (c = 0; c < 16; c++) + t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15) + : pattern.u8[c] & 15]; + + return t; +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]), + c_v64_cmpgt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]), + c_v64_cmplt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]), + c_v64_cmpeq_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]), + c_v64_cmpgt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]), + c_v64_cmplt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]), + c_v64_cmpeq_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) { + if (n == 0) return a; + if (n < 8) + return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n), + c_v64_shr_n_byte(a.v64[0], 8 - n)), + c_v64_shl_n_byte(a.v64[0], n)); + else + return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero()); +} + +SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) { + if (n == 0) return a; + if (n < 8) + return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n), + c_v64_or(c_v64_shr_n_byte(a.v64[0], n), + c_v64_shl_n_byte(a.v64[1], 8 - n))); + else + return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8)); +} + +SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) { + if (SIMD_CHECK && c > 15) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? 
c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c)) + : b; +} + +SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c), + c_v64_shr_u16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c), + c_v64_shr_s16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c), + c_v64_shr_u32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c), + c_v64_shr_s32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) { + a.v64[1].u64 <<= c; + a.v64[0].u64 <<= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) { + a.v64[1].u64 >>= c; + a.v64[0].u64 >>= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) { + a.v64[1].s64 >>= c; + a.v64[0].s64 >>= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) { + return c_v128_shl_8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) { + return c_v128_shl_16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) { + return c_v128_shl_32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) { + return c_v128_shl_64(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) { + return c_v128_shr_u8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) { + return c_v128_shr_u16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) { + return c_v128_shr_u32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) { + return c_v128_shr_u64(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) { + return c_v128_shr_s8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) { + return c_v128_shr_s16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) { + return c_v128_shr_s32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) { + return c_v128_shr_s64(a, n); +} + +typedef uint32_t c_sad128_internal_u16; + +SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). 
*/ +SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s, + c_v128 a, c_v128 b) { + int c; + for (c = 0; c < 8; c++) + s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; + return s; +} + +SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; } + +typedef uint64_t c_ssd128_internal_s16; + +SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_s16_sum(). */ +SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s, + c_v128 a, c_v128 b) { + int c; + for (c = 0; c < 8; c++) + s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * + (int32_t)(int16_t)(a.s16[c] - b.s16[c]); + return s; +} + +SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; } + +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h new file mode 100644 index 0000000000..d20f979dd9 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h @@ -0,0 +1,659 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ + +#include +#include "aom_dsp/simd/v64_intrinsics_x86.h" + +typedef __m128i v128; + +SIMD_INLINE uint32_t v128_low_u32(v128 a) { + return (uint32_t)_mm_cvtsi128_si32(a); +} + +SIMD_INLINE v64 v128_low_v64(v128 a) { + return _mm_unpacklo_epi64(a, v64_zero()); +} + +SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); } + +SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { + return _mm_unpacklo_epi64(b, a); +} + +SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { + return v128_from_v64(v64_from_64(a), v64_from_64(b)); +} + +SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_set_epi32((int)a, (int)b, (int)c, (int)d); +} + +SIMD_INLINE v128 v128_load_aligned(const void *p) { + return _mm_load_si128((__m128i *)p); +} + +SIMD_INLINE v128 v128_load_unaligned(const void *p) { +#if defined(__SSSE3__) + return _mm_lddqu_si128((__m128i *)p); +#else + return _mm_loadu_si128((__m128i *)p); +#endif +} + +SIMD_INLINE void v128_store_aligned(void *p, v128 a) { + _mm_store_si128((__m128i *)p, a); +} + +SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { + _mm_storeu_si128((__m128i *)p, a); +} + +// The following function requires an immediate. +// Some compilers will check this during optimisation, others wont. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) +#if defined(__SSSE3__) +SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) { + return c ? _mm_alignr_epi8(a, b, c) : b; +} +#else +#define v128_align(a, b, c) \ + ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) +#endif +#else +#if defined(__SSSE3__) +#define v128_align(a, b, c) ((c) ? 
_mm_alignr_epi8(a, b, (uint8_t)(c)) : (b)) +#else +#define v128_align(a, b, c) \ + ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) +#endif +#endif + +SIMD_INLINE v128 v128_zero(void) { return _mm_setzero_si128(); } + +SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); } + +SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); } + +SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); } + +SIMD_INLINE v128 v128_dup_64(uint64_t x) { + // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers + return _mm_set_epi32((int32_t)(x >> 32), (int32_t)x, (int32_t)(x >> 32), + (int32_t)x); +} + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); } + +SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); } + +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); } + +SIMD_INLINE v128 v128_padd_s16(v128 a) { + return _mm_madd_epi16(a, _mm_set1_epi16(1)); +} + +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); } + +SIMD_INLINE v128 v128_abs_s16(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v128 v128_abs_s8(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { + return _mm_unpacklo_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { + return _mm_unpackhi_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { + return _mm_unpacklo_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { + return _mm_unpackhi_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { + return _mm_unpacklo_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { + return _mm_unpackhi_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { + return _mm_unpacklo_epi64(b, a); +} + +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { + return _mm_unpackhi_epi64(b, a); +} + +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)); +} + +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { +#if defined(__SSSE3__) 
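+  // The byte-shuffle mask below is 0, 2, 4, ..., 14: it gathers the
+  // even-indexed byte of each 16-bit pair into the low 64 bits, after which
+  // the two shuffled halves are merged with _mm_unpacklo_epi64.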
+#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL); +#else + v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)); +} + +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { +#if defined(__SSSE3__) +#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL); +#else + v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1))); +} + +SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0))); +} + +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return _mm_unpackhi_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return _mm_packs_epi32(b, a); +} + +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(b, a); +#else + return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)), + v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b))); +#endif +} + +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return _mm_packus_epi16(b, a); +} + +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return _mm_packs_epi16(b, a); +} + +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return _mm_unpackhi_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v128 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + unsigned char *selected = (unsigned char *)&output; + int counter; + + for (counter = 0; counter < 16; counter++) { + selected[counter] = input[index[counter] & 15]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { + v128 t1 = 
_mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b)); + v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b)); + v128 t = v128_add_32(t1, t2); + t = v128_add_32(t, _mm_srli_si128(t, 8)); + t = v128_add_32(t, _mm_srli_si128(t, 4)); + return (int32_t)v128_low_u32(t); +} + +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + v128 r = _mm_madd_epi16(a, b); +#if defined(__SSE4_1__) && defined(__x86_64__) + v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r), + _mm_cvtepi32_epi64(_mm_srli_si128(r, 8))); + return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8))); +#else + return (int64_t)_mm_cvtsi128_si32(r) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); +#endif +} + +SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { + v128 t = _mm_sad_epu8(a, _mm_setzero_si128()); + return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t)); +} + +typedef v128 sad128_internal; + +SIMD_INLINE sad128_internal v128_sad_u8_init(void) { + return _mm_setzero_si128(); +} + +/* Implementation dependent return value. Result must be finalised with + v128_sad_sum(). + The result for more than 32 v128_sad_u8() calls is undefined. */ +SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { + return _mm_add_epi64(s, _mm_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { + return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); +} + +typedef int32_t ssd128_internal; + +SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_sum(). */ +SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { + v128 z = _mm_setzero_si128(); + v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z)); + v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z)); + v128 rl = _mm_madd_epi16(l, l); + v128 rh = _mm_madd_epi16(h, h); + v128 r = _mm_add_epi32(rl, rh); + r = _mm_add_epi32(r, _mm_srli_si128(r, 8)); + r = _mm_add_epi32(r, _mm_srli_si128(r, 4)); + return s + _mm_cvtsi128_si32(r); +} + +SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; } + +SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); } + +SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); } + +SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); } + +SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); } + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { + v64 lo_bits = v64_mullo_s16(a, b); + v64 hi_bits = v64_mulhi_s16(a, b); + return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits), + v64_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return _mm_mullo_epi16(a, b); +} + +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return _mm_mulhi_epi16(a, b); +} + +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#else + return _mm_unpacklo_epi32( + _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8), + _mm_shuffle_epi32( + _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8)); +#endif +} + +SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { + v128 r = v128_mullo_s32(a, b); + return (int64_t)_mm_cvtsi128_si32(r) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + + 
(int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); +} + +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); } + +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, b); +#else + return _mm_packs_epi32( + _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)), + _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8))); +#endif +} + +SIMD_INLINE v128 v128_padd_u8(v128 a) { + return v128_madd_us8(a, _mm_set1_epi8(1)); +} + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1))); +} + +SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { + return _mm_sub_epi16(_mm_avg_epu16(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1))); +} + +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); } + +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(a, b, c); +#else + c = _mm_cmplt_epi8(c, v128_zero()); + return v128_or(v128_and(b, c), v128_andn(a, c)); +#endif +} + +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_min_epi32(a, b); +#else + v128 mask = _mm_cmplt_epi32(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_max_epi32(a, b); +#else + v128 mask = _mm_cmplt_epi32(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return _mm_cmpgt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return _mm_cmplt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { + return _mm_cmpgt_epi32(a, b); +} + +SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { + return _mm_cmplt_epi32(a, b); +} + +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)), + _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c))); 
+} + +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)), + _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c))); +} + +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128((int)(c + 8)); + return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x), + _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return _mm_sll_epi64(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + // _mm_sra_epi64 is missing in gcc? + return v128_from_64((uint64_t)((int64_t)v64_u64(v128_high_v64(a)) >> c), + (uint64_t)((int64_t)v64_u64(v128_low_v64(a)) >> c)); + // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127) +#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127) +#define v128_shl_n_8(a, c) \ + _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c)) +#define v128_shr_n_u8(a, c) \ + _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c)) +#define v128_shr_n_s8(a, c) \ + _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \ + _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8)) +#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c) +#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c) +#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c) +#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c) +#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c) +#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c) +#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c) +#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c) +#define v128_shr_n_s64(a, c) \ + v128_shr_s64(a, c) // _mm_srai_epi64 missing in gcc? + +typedef v128 sad128_internal_u16; + +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). 
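+ * Without SSE4.1 the unsigned 16-bit max/min pair is emulated below by
+ * biasing both inputs with 0x8000 and using a signed compare to build the
+ * select mask.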
*/ +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { +#if defined(__SSE4_1__) + v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b)); +#else + v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)), + v128_xor(b, v128_dup_16(32768))); + t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)), + v128_or(v128_and(a, t), v128_andn(b, t))); +#endif + return v128_add_32( + s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t))); +} + +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) + + v128_low_u32(v128_shr_n_byte(s, 8)) + + v128_low_u32(v128_shr_n_byte(s, 12)); +} + +typedef v128 ssd128_internal_s16; + +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_s16_sum(). */ +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, + v128 b) { + v128 d = v128_sub_16(a, b); + d = v128_madd_s16(d, d); + return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()), + _mm_unpacklo_epi32(d, v128_zero()))); +} + +SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { + return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); +} + +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h new file mode 100644 index 0000000000..17e36eed61 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ + +#include +#include +#include + +#include "aom_dsp/simd/v256_intrinsics_c.h" +#include "aom_dsp/simd/v128_intrinsics.h" +#include "aom_dsp/simd/v64_intrinsics.h" + +/* Fallback to plain, unoptimised C. 
*/ + +typedef c_v256 v256; + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); } +SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); } +SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); } +SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); } +SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); } +SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { + return c_v256_from_v128(hi, lo); +} +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return c_v256_from_64(a, b, c, d); +} +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return c_v256_from_v64(a, b, c, d); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return c_v256_load_unaligned(p); +} +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return c_v256_load_aligned(p); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + c_v256_store_unaligned(p, a); +} +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + c_v256_store_aligned(p, a); +} + +SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) { + return c_v256_align(a, b, c); +} + +SIMD_INLINE v256 v256_zero(void) { return c_v256_zero(); } +SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); } +SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); } +SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); } +SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); } + +SIMD_INLINE c_sad256_internal v256_sad_u8_init(void) { + return c_v256_sad_u8_init(); +} +SIMD_INLINE c_sad256_internal v256_sad_u8(c_sad256_internal s, v256 a, v256 b) { + return c_v256_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v256_sad_u8_sum(c_sad256_internal s) { + return c_v256_sad_u8_sum(s); +} +SIMD_INLINE c_ssd256_internal v256_ssd_u8_init(void) { + return c_v256_ssd_u8_init(); +} +SIMD_INLINE c_ssd256_internal v256_ssd_u8(c_ssd256_internal s, v256 a, v256 b) { + return c_v256_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v256_ssd_u8_sum(c_ssd256_internal s) { + return c_v256_ssd_u8_sum(s); +} + +SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16_init(void) { + return c_v256_ssd_s16_init(); +} +SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16(c_ssd256_internal_s16 s, v256 a, + v256 b) { + return c_v256_ssd_s16(s, a, b); +} +SIMD_INLINE uint64_t v256_ssd_s16_sum(c_ssd256_internal_s16 s) { + return c_v256_ssd_s16_sum(s); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + return c_v256_dotp_su8(a, b); +} +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return c_v256_dotp_s16(a, b); +} +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + return c_v256_dotp_s32(a, b); +} +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); } + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); } +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); } +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); } +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); } + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); } +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); } +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); } +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); } +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); } +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); } 
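+/* Usage sketch (illustrative only, not part of the upstream header): since
+ * every v256_* call here simply forwards to the plain-C c_v256_* fallback,
+ * caller code written against the generic API needs no changes. Assuming
+ * 32-byte-aligned uint32_t arrays out[8], x[8] and y[8], an elementwise
+ * 32-bit add could be written as:
+ *
+ *   v256_store_aligned(out, v256_add_32(v256_load_aligned(x),
+ *                                       v256_load_aligned(y)));
+ */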
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); } +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); } +SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); } +SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); } +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); } +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); } +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); } +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); } +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); } +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); } +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); } +SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); } +SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); } + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); } +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return c_v256_mullo_s16(a, b); +} +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return c_v256_mulhi_s16(a, b); +} +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return c_v256_mullo_s32(a, b); +} +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); } +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); } + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); } +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return c_v256_blend_8(a, b, c); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); } +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); } +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return c_v256_rdavg_u16(a, b); +} +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); } +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); } +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); } +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); } +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); } +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); } +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); } +SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); } +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); } + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); } +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); } +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); } +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); } +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); } +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); } +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); } +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); } +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return c_v256_ziplo_128(a, b); +} +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return c_v256_ziphi_128(a, b); +} +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); } +SIMD_INLINE v256 
v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); } +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); } +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return c_v256_unziplo_8(a, b); +} +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return c_v256_unziphi_8(a, b); +} +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return c_v256_unziplo_16(a, b); +} +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return c_v256_unziphi_16(a, b); +} +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return c_v256_unziplo_32(a, b); +} +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return c_v256_unziphi_32(a, b); +} +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { + return c_v256_unziplo_64(a, b); +} +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { + return c_v256_unziphi_64(a, b); +} +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return c_v256_unpacklo_u8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return c_v256_unpackhi_u8_s16(a); +} +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return c_v256_unpacklo_s8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return c_v256_unpackhi_s8_s16(a); +} +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return c_v256_pack_s32_s16(a, b); +} +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return c_v256_pack_s32_u16(a, b); +} +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return c_v256_pack_s16_u8(a, b); +} +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return c_v256_pack_s16_s8(a, b); +} +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return c_v256_unpack_u16_s32(a); +} +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return c_v256_unpack_s16_s32(a); +} +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return c_v256_unpacklo_u16_s32(a); +} +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return c_v256_unpacklo_s16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return c_v256_unpackhi_u16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return c_v256_unpackhi_s16_s32(a); +} +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + return c_v256_shuffle_8(a, pattern); +} +SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { + return c_v256_wideshuffle_8(a, b, pattern); +} +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return c_v256_pshuffle_8(a, pattern); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); } +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); } +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); } +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return c_v256_cmpgt_s16(a, b); +} +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return c_v256_cmplt_s16(a, b); +} +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); } +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); } + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return c_v256_cmpgt_s32(a, b); +} +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return c_v256_cmplt_s32(a, b); +} +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return c_v256_shl_8(a, c); +} +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return c_v256_shr_u8(a, c); +} +SIMD_INLINE v256 v256_shr_s8(v256 a, 
unsigned int c) { + return c_v256_shr_s8(a, c); +} +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return c_v256_shl_16(a, c); +} +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return c_v256_shr_u16(a, c); +} +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return c_v256_shr_s16(a, c); +} +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return c_v256_shl_32(a, c); +} +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return c_v256_shr_u32(a, c); +} +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return c_v256_shr_s32(a, c); +} +SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { + return c_v256_shl_64(a, c); +} +SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { + return c_v256_shr_u64(a, c); +} +SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { + return c_v256_shr_s64(a, c); +} + +SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) { + return c_v256_shr_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) { + return c_v256_shl_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) { + return c_v256_shl_n_8(a, n); +} +SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) { + return c_v256_shl_n_16(a, n); +} +SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) { + return c_v256_shl_n_32(a, n); +} +SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) { + return c_v256_shl_n_64(a, n); +} +SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) { + return c_v256_shr_n_u8(a, n); +} +SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) { + return c_v256_shr_n_u16(a, n); +} +SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) { + return c_v256_shr_n_u32(a, n); +} +SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) { + return c_v256_shr_n_u64(a, n); +} +SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) { + return c_v256_shr_n_s8(a, n); +} +SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) { + return c_v256_shr_n_s16(a, n); +} +SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) { + return c_v256_shr_n_s32(a, n); +} +SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) { + return c_v256_shr_n_s64(a, n); +} + +SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) { + return c_v256_shr_n_word(a, n); +} +SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) { + return c_v256_shl_n_word(a, n); +} + +typedef uint32_t sad256_internal_u16; +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { + return c_v256_sad_u16_init(); +} +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { + return c_v256_sad_u16(s, a, b); +} +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + return c_v256_sad_u16_sum(s); +} + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h new file mode 100644 index 0000000000..60d0d53f6f --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h @@ -0,0 +1,963 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ + +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/simd/v128_intrinsics_c.h" + +typedef union { + uint8_t u8[32]; + uint16_t u16[16]; + uint32_t u32[8]; + uint64_t u64[4]; + int8_t s8[32]; + int16_t s16[16]; + int32_t s32[8]; + int64_t s64[4]; + c_v64 v64[4]; + c_v128 v128[2]; +} c_v256; + +SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; } + +SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; } + +SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; } + +SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; } + +SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; } + +SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) { + c_v256 t; + t.v128[1] = hi; + t.v128[0] = lo; + return t; +} + +SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c, + uint64_t d) { + c_v256 t; + t.u64[3] = a; + t.u64[2] = b; + t.u64[1] = c; + t.u64[0] = d; + return t; +} + +SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) { + c_v256 t; + t.u64[3] = a.u64; + t.u64[2] = b.u64; + t.u64[1] = c.u64; + t.u64[0] = d.u64; + return t; +} + +SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) { + c_v256 t; + memcpy(&t, p, 32); + return t; +} + +SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 31) { + fprintf(stderr, "Error: unaligned v256 load at %p\n", p); + abort(); + } + return c_v256_load_unaligned(p); +} + +SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) { + memcpy(p, &a, 32); +} + +SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) { + if (SIMD_CHECK && (uintptr_t)p & 31) { + fprintf(stderr, "Error: unaligned v256 store at %p\n", p); + abort(); + } + c_v256_store_unaligned(p, a); +} + +SIMD_INLINE c_v256 c_v256_zero(void) { + c_v256 t; + t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0; + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) { + c_v256 t; + t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x); + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) { + c_v256 t; + t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x); + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) { + c_v256 t; + t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x); + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) { + c_v256 t; + t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x; + return t; +} + +SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) { + return c_v128_dotp_su8(a.v128[1], b.v128[1]) + + c_v128_dotp_su8(a.v128[0], b.v128[0]); +} + +SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) { + return c_v128_dotp_s16(a.v128[1], b.v128[1]) + + c_v128_dotp_s16(a.v128[0], b.v128[0]); +} + +SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) { + return c_v128_dotp_s32(a.v128[1], b.v128[1]) + + c_v128_dotp_s32(a.v128[0], b.v128[0]); +} + +SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) { + return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]); +} + +typedef struct { + uint32_t val; + int count; +} c_sad256_internal; + +SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) { + c_sad256_internal t; + t.val = t.count = 0; + return t; +} + +/* Implementation dependent return value. 
Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. */ +SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) + s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + s.count++; + if (SIMD_CHECK && s.count > 32) { + fprintf(stderr, + "Error: sad called 32 times returning an undefined result\n"); + abort(); + } + return s; +} + +SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; } + +typedef uint32_t c_ssd256_internal; + +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; } + +SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]), + c_v128_or(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]), + c_v128_xor(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]), + c_v128_and(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]), + c_v128_andn(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]), + c_v128_add_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]), + c_v128_add_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]), + c_v128_sadd_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]), + c_v128_sadd_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]), + c_v128_sadd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]), + c_v128_add_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]), + c_v128_add_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]), + c_v128_sub_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) { + c_v256 t; + for (int i = 0; i < 16; i++) + t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1]; + return t; +} + +SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) { + c_v256 t; + t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; + t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; + t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; + t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; + t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9]; + t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11]; + t.s32[6] = (int32_t)a.s16[12] 
+ (int32_t)a.s16[13]; + t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15]; + return t; +} + +SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]), + c_v128_sub_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]), + c_v128_ssub_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]), + c_v128_ssub_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]), + c_v128_sub_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]), + c_v128_ssub_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]), + c_v128_ssub_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]), + c_v128_sub_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) { + c_v128 lo_bits = c_v128_mullo_s16(a, b); + c_v128 hi_bits = c_v128_mulhi_s16(a, b); + return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits), + c_v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]), + c_v128_mullo_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]), + c_v128_mulhi_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]), + c_v128_mullo_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]), + c_v128_madd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]), + c_v128_madd_us8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]), + c_v128_avg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]), + c_v128_rdavg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]), + c_v128_rdavg_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]), + c_v128_avg_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]), + c_v128_min_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) { + return 
c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]), + c_v128_max_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]), + c_v128_min_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) { + return ((uint32_t)(a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) | + ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) | + ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) | + ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) | + ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) | + ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) | + ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) | + ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) | + ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | + ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | + ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | + ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | + ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | + ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | + ((a.s8[0] < 0) << 0); +} + +SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) { + c_v256 t; + for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i]; + return t; +} + +SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]), + c_v128_max_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]), + c_v128_min_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]), + c_v128_max_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]), + c_v128_min_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]), + c_v128_max_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]), + c_v128_ziplo_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]), + c_v128_ziplo_8(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]), + c_v128_ziplo_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]), + c_v128_ziplo_16(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]), + c_v128_ziplo_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]), + c_v128_ziplo_32(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]), + c_v128_ziplo_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]), + c_v128_ziplo_64(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, 
c_v256 b) { + return c_v256_from_v128(a.v128[0], b.v128[0]); +} + +SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) { + return c_v256_from_v128(a.v128[1], b.v128[1]); +} + +SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b)); +} + +SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 16; i++) { + t.u8[i] = a.u8[i * 2 + 1]; + t.u8[i + 16] = b.u8[i * 2 + 1]; + } + } else { + for (i = 0; i < 16; i++) { + t.u8[i] = b.u8[i * 2]; + t.u8[i + 16] = a.u8[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1) + : _c_v256_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0) + : _c_v256_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 8; i++) { + t.u16[i] = a.u16[i * 2 + 1]; + t.u16[i + 8] = b.u16[i * 2 + 1]; + } + } else { + for (i = 0; i < 8; i++) { + t.u16[i] = b.u16[i * 2]; + t.u16[i + 8] = a.u16[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1) + : _c_v256_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0) + : _c_v256_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) { + c_v256 t; + if (mode) { + t.u32[7] = b.u32[7]; + t.u32[6] = b.u32[5]; + t.u32[5] = b.u32[3]; + t.u32[4] = b.u32[1]; + t.u32[3] = a.u32[7]; + t.u32[2] = a.u32[5]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[7] = a.u32[6]; + t.u32[6] = a.u32[4]; + t.u32[5] = a.u32[2]; + t.u32[4] = a.u32[0]; + t.u32[3] = b.u32[6]; + t.u32[2] = b.u32[4]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1) + : _c_v256_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0) + : _c_v256_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) { + c_v256 t; + if (mode) { + t.u64[3] = b.u64[3]; + t.u64[2] = b.u64[1]; + t.u64[1] = a.u64[3]; + t.u64[0] = a.u64[1]; + } else { + t.u64[3] = a.u64[2]; + t.u64[2] = a.u64[0]; + t.u64[1] = b.u64[2]; + t.u64[0] = b.u64[0]; + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1) + : _c_v256_unzip_64(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v256_unzip_64(b, a, 0) + : _c_v256_unzip_64(b, a, 1); +} + +SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]), + c_v128_unpacklo_u8_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]), + c_v128_unpacklo_u8_s16(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]), + c_v128_unpacklo_s8_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]), + c_v128_unpacklo_s8_s16(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]), + c_v128_pack_s32_s16(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]), + c_v128_pack_s32_u16(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]), + c_v128_pack_s16_u8(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]), + c_v128_pack_s16_s8(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_u16_s32(a), + c_v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_s16_s32(a), + c_v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]), + c_v128_unpacklo_u16_s32(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]), + c_v128_unpacklo_s16_s32(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]), + c_v128_unpacklo_u16_s32(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]), + c_v128_unpacklo_s16_s32(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) { + c_v256 t; + int c; + for (c = 0; c < 32; c++) + t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) + : pattern.u8[c] & 31]; + + return t; +} + +SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) { + c_v256 t; + int c; + for (c = 0; c < 32; c++) + t.u8[c] = (pattern.u8[c] < 32 + ? b.u8 + : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) + : pattern.u8[c] & 31]; + return t; +} + +// Pairwise / dual-lane shuffle: shuffle two 128 bit lates. 
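+// Each 128-bit lane is shuffled independently: indices are taken from the
+// same lane of the pattern, and only their low four bits (0..15) are used.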
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) { + return c_v256_from_v128( + c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)), + c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern))); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]), + c_v128_cmpgt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]), + c_v128_cmplt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]), + c_v128_cmpeq_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]), + c_v128_cmpgt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]), + c_v128_cmplt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]), + c_v128_cmpeq_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]), + c_v128_cmpgt_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]), + c_v128_cmplt_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]), + c_v128_cmpeq_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) { + if (n == 0) return a; + if (n < 16) + return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n), + c_v128_shr_n_byte(a.v128[0], 16 - n)), + c_v128_shl_n_byte(a.v128[0], n)); + else if (n > 16) + return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16), + c_v128_zero()); + else + return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero()); +} + +SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) { + if (n == 0) return a; + if (n < 16) + return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n), + c_v128_or(c_v128_shr_n_byte(a.v128[0], n), + c_v128_shl_n_byte(a.v128[1], 16 - n))); + else if (n > 16) + return c_v256_from_v128(c_v128_zero(), + c_v128_shr_n_byte(a.v128[1], n - 16)); + else + return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a)); +} + +SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) { + if (SIMD_CHECK && c > 31) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? 
c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c)) + : b; +} + +SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shl_8(a.v128[1], c), + c_v128_shl_8(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c), + c_v128_shr_u8(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c), + c_v128_shr_s8(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shl_16(a.v128[1], c), + c_v128_shl_16(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c), + c_v128_shr_u16(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c), + c_v128_shr_s16(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shl_32(a.v128[1], c), + c_v128_shl_32(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c), + c_v128_shr_u32(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c), + c_v128_shr_s32(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) { + c_v256 t; + if (SIMD_CHECK && n > 63) { + fprintf(stderr, "Error: undefined s64 shift right %d\n", n); + abort(); + } + t.s64[3] = a.s64[3] >> n; + t.s64[2] = a.s64[2] >> n; + t.s64[1] = a.s64[1] >> n; + t.s64[0] = a.s64[0] >> n; + return t; +} + +SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) { + c_v256 t; + if (SIMD_CHECK && n > 63) { + fprintf(stderr, "Error: undefined s64 shift right %d\n", n); + abort(); + } + t.u64[3] = a.u64[3] >> n; + t.u64[2] = a.u64[2] >> n; + t.u64[1] = a.u64[1] >> n; + t.u64[0] = a.u64[0] >> n; + return t; +} + +SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) { + c_v256 t; + if (SIMD_CHECK && n > 63) { + fprintf(stderr, "Error: undefined s64 shift right %d\n", n); + abort(); + } + t.u64[3] = a.u64[3] << n; + t.u64[2] = a.u64[2] << n; + t.u64[1] = a.u64[1] << n; + t.u64[0] = a.u64[0] << n; + return t; +} + +SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) { + return c_v256_shl_8(a, n); +} + +SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) { + return c_v256_shl_16(a, n); +} + +SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) { + return c_v256_shl_32(a, n); +} + +SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) { + return c_v256_shl_64(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) { + return c_v256_shr_u8(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) { + return c_v256_shr_u16(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) { + return c_v256_shr_u32(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) { + return c_v256_shr_u64(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) { + return c_v256_shr_s8(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) { + return c_v256_shr_s16(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) { + return c_v256_shr_s32(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, 
unsigned int n) { + return c_v256_shr_s64(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) { + return c_v256_shr_n_byte(a, 2 * n); +} +SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) { + return c_v256_shl_n_byte(a, 2 * n); +} + +typedef uint32_t c_sad256_internal_u16; + +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u16_sum(). */ +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s, + c_v256 a, c_v256 b) { + int c; + for (c = 0; c < 16; c++) + s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; + return s; +} + +SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; } + +typedef uint64_t c_ssd256_internal_s16; + +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s, + c_v256 a, c_v256 b) { + int c; + for (c = 0; c < 16; c++) + s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * + (int32_t)(int16_t)(a.s16[c] - b.s16[c]); + return s; +} + +SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; } + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h new file mode 100644 index 0000000000..493130df83 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h @@ -0,0 +1,806 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ + +#include "config/aom_config.h" + +#if HAVE_NEON +#error "Do not use this file for Neon" +#endif + +#if HAVE_SSE2 +#include "aom_dsp/simd/v128_intrinsics_x86.h" +#else +#include "aom_dsp/simd/v128_intrinsics.h" +#endif + +typedef struct { + v128 val[2]; +} v256; + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); } + +SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); } + +SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } + +SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; } + +SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; } + +SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { + v256 t; + t.val[1] = hi; + t.val[0] = lo; + return t; +} + +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); +} + +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16), + v128_load_unaligned(p)); +} + +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return v256_from_v128(v128_load_aligned((uint8_t *)p + 16), + v128_load_aligned(p)); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + v128_store_unaligned(p, a.val[0]); + v128_store_unaligned((uint8_t *)p + 16, a.val[1]); +} + +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + v128_store_aligned(p, a.val[0]); + v128_store_aligned((uint8_t *)p + 16, a.val[1]); +} + +SIMD_INLINE v256 v256_zero(void) { + return v256_from_v128(v128_zero(), v128_zero()); +} + +SIMD_INLINE v256 v256_dup_8(uint8_t x) { + v128 t = v128_dup_8(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_16(uint16_t x) { + v128 t = v128_dup_16(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_32(uint32_t x) { + v128 t = v128_dup_32(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_64(uint64_t x) { + v128 t = v128_dup_64(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]); +} + +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]); +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]); +} + +typedef struct { + sad128_internal val[2]; +} sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init(void) { + sad256_internal t; + t.val[1] = v128_sad_u8_init(); + t.val[0] = v128_sad_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. 
*/ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + sad256_internal t; + t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]); +} + +typedef struct { + ssd128_internal val[2]; +} ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) { + ssd256_internal t; + t.val[1] = v128_ssd_u8_init(); + t.val[0] = v128_ssd_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + ssd256_internal t; + t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { + return v256_from_v128(v128_or(a.val[1], b.val[1]), + v128_or(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { + return v256_from_v128(v128_xor(a.val[1], b.val[1]), + v128_xor(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { + return v256_from_v128(v128_and(a.val[1], b.val[1]), + v128_and(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { + return v256_from_v128(v128_andn(a.val[1], b.val[1]), + v128_andn(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { + return v256_from_v128(v128_add_8(a.val[1], b.val[1]), + v128_add_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { + return v256_from_v128(v128_add_16(a.val[1], b.val[1]), + v128_add_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { + return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]), + v128_sadd_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { + return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]), + v128_sadd_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]), + v128_sadd_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { + return v256_from_v128(v128_add_32(a.val[1], b.val[1]), + v128_add_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { + return v256_from_v128(v128_add_64(a.val[1], b.val[1]), + v128_add_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_padd_u8(v256 a) { + return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0])); +} + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { + return v256_from_v128(v128_sub_8(a.val[1], b.val[1]), + v128_sub_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]), + v128_ssub_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]), + v128_ssub_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { + return v256_from_v128(v128_sub_16(a.val[1], b.val[1]), + v128_sub_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]), 
+ v128_ssub_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]), + v128_ssub_u16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { + return v256_from_v128(v128_sub_32(a.val[1], b.val[1]), + v128_sub_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { + return v256_from_v128(v128_sub_64(a.val[1], b.val[1]), + v128_sub_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_abs_s16(v256 a) { + return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_abs_s8(v256 a) { + return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0])); +} + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]), + v128_mullo_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]), + v128_mulhi_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]), + v128_mullo_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]), + v128_madd_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]), + v128_madd_us8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { + return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]), + v128_avg_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]), + v128_rdavg_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]), + v128_rdavg_u16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { + return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]), + v128_avg_u16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { + return v256_from_v128(v128_min_u8(a.val[1], b.val[1]), + v128_min_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { + return v256_from_v128(v128_max_u8(a.val[1], b.val[1]), + v128_max_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { + return v256_from_v128(v128_min_s8(a.val[1], b.val[1]), + v128_min_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { + return (v128_movemask_8(v256_high_v128(a)) << 16) | + v128_movemask_8(v256_low_v128(a)); +} + +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]), + v128_blend_8(a.val[0], b.val[0], c.val[0])); +} + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { + return v256_from_v128(v128_max_s8(a.val[1], b.val[1]), + v128_max_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { + return v256_from_v128(v128_min_s16(a.val[1], b.val[1]), + v128_min_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { + return v256_from_v128(v128_max_s16(a.val[1], b.val[1]), + v128_max_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 
v256_min_s32(v256 a, v256 b) { + return v256_from_v128(v128_min_s32(a.val[1], b.val[1]), + v128_min_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { + return v256_from_v128(v128_max_s32(a.val[1], b.val[1]), + v128_max_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]), + v128_ziplo_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]), + v128_ziplo_8(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]), + v128_ziplo_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]), + v128_ziplo_16(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]), + v128_ziplo_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]), + v128_ziplo_32(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]), + v128_ziplo_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]), + v128_ziplo_64(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return v256_from_v128(a.val[0], b.val[0]); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return v256_from_v128(a.val[1], b.val[1]); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]), + v128_unziplo_8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]), + v128_unziphi_8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]), + v128_unziplo_16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]), + v128_unziphi_16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]), + v128_unziplo_32(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]), + v128_unziphi_32(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { +#if HAVE_SSE2 + return v256_from_v128( + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), + _mm_castsi128_pd(a.val[1]), 0)), + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), + _mm_castsi128_pd(b.val[1]), 0))); +#else + return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]), + v128_low_v64(b.val[1]), v128_low_v64(b.val[0])); +#endif +} + +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { +#if HAVE_SSE2 + return v256_from_v128( + 
_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), + _mm_castsi128_pd(a.val[1]), 3)), + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), + _mm_castsi128_pd(b.val[1]), 3))); +#else + return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]), + v128_high_v64(b.val[1]), v128_high_v64(b.val[0])); +#endif +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]), + v128_unpacklo_u8_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]), + v128_unpacklo_u8_s16(a.val[1])); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]), + v128_unpacklo_s8_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]), + v128_unpacklo_s8_s16(a.val[1])); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]), + v128_pack_s32_s16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]), + v128_pack_s32_u16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]), + v128_pack_s16_u8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]), + v128_pack_s16_s8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]), + v128_unpacklo_u16_s32(a.val[0])); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]), + v128_unpacklo_s16_s32(a.val[0])); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]), + v128_unpacklo_u16_s32(a.val[1])); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]), + v128_unpacklo_s16_s32(a.val[1])); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]), + v128_cmpgt_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]), + v128_cmplt_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]), + v128_cmpeq_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]), + v128_cmpgt_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]), + v128_cmplt_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]), + 
v128_cmpeq_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]), + v128_cmpgt_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]), + v128_cmplt_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]), + v128_cmpeq_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) { + v128 c16 = v128_dup_8(16); + v128 maskhi = v128_cmplt_s8(pattern.val[1], c16); + v128 masklo = v128_cmplt_s8(pattern.val[0], c16); + return v256_from_v128( + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)), + v128_shuffle_8(x.val[0], pattern.val[1]), maskhi), + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)), + v128_shuffle_8(x.val[0], pattern.val[0]), masklo)); +} + +SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) { + v128 c16 = v128_dup_8(16); + v128 c32 = v128_dup_8(32); + v128 c48 = v128_dup_8(48); + v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]); + v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]); + v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]); + v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]); + v256 r1 = v256_from_v128( + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)), + v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)), + maskhi48), + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)), + v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)), + masklo48)); + v256 r2 = v256_from_v128( + v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)), + v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16), + v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)), + v128_shuffle_8(y.val[0], pattern.val[0]), masklo16)); + return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern)); +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return v256_from_v128( + v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), + v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); +} + +SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], 
c)); +} + +SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \ + v128_shr_n_byte(a.val[0], 16 - (n))), \ + v128_shl_n_byte(a.val[0], (n))) \ + : v256_from_v128( \ + (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \ + v128_zero())) + +#define v256_shr_n_byte(a, n) \ + (n == 0 \ + ? a \ + : ((n) < 16 \ + ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \ + v128_or(v128_shr_n_byte(a.val[0], n), \ + v128_shl_n_byte(a.val[1], 16 - (n)))) \ + : v256_from_v128( \ + v128_zero(), \ + (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))) + +#define v256_align(a, b, c) \ + ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) + +#define v256_shl_n_8(a, n) \ + v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n)) +#define v256_shl_n_16(a, n) \ + v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n)) +#define v256_shl_n_32(a, n) \ + v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n)) +#define v256_shl_n_64(a, n) \ + v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n)) +#define v256_shr_n_u8(a, n) \ + v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n)) +#define v256_shr_n_u16(a, n) \ + v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n)) +#define v256_shr_n_u32(a, n) \ + v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n)) +#define v256_shr_n_u64(a, n) \ + v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n)) +#define v256_shr_n_s8(a, n) \ + v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n)) +#define v256_shr_n_s16(a, n) \ + v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n)) +#define v256_shr_n_s32(a, n) \ + v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n)) +#define v256_shr_n_s64(a, n) \ + v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n)) + +#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) +#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) + +typedef struct { + sad128_internal_u16 val[2]; +} sad256_internal_u16; + +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { + sad256_internal_u16 t; + t.val[1] = v128_sad_u16_init(); + t.val[0] = v128_sad_u16_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u16_sum(). + The result for more than 16 v256_sad_u16() calls is undefined. 
*/ +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { + sad256_internal_u16 t; + t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]); +} + +typedef struct { + ssd128_internal_s16 val[2]; +} ssd256_internal_s16; + +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { + ssd256_internal_s16 t; + t.val[1] = v128_ssd_s16_init(); + t.val[0] = v128_ssd_s16_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, + v256 b) { + ssd256_internal_s16 t; + t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { + return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]); +} + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h new file mode 100644 index 0000000000..894ddee167 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h @@ -0,0 +1,754 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ + +#if !defined(__AVX2__) + +#include "aom_dsp/simd/v256_intrinsics_v128.h" + +#else + +// The _m256i type seems to cause problems for g++'s mangling prior to +// version 5, but adding -fabi-version=0 fixes this. 
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \ + defined(__AVX2__) && defined(__cplusplus) +#pragma GCC optimize "-fabi-version=0" +#endif + +#include + +#include "aom_dsp/simd/v128_intrinsics_x86.h" + +typedef __m256i v256; + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { + return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0)); +} + +SIMD_INLINE v64 v256_low_v64(v256 a) { + return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero()); +} + +SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } + +SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); } + +SIMD_INLINE v128 v256_high_v128(v256 a) { + return _mm256_extracti128_si256(a, 1); +} + +SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) { + // gcc seems to be missing _mm256_set_m128i() + return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1); +} + +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); +} + +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return _mm256_set_epi64x((int64_t)a, (int64_t)b, (int64_t)c, (int64_t)d); +} + +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return _mm256_load_si256((const __m256i *)p); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return _mm256_loadu_si256((const __m256i *)p); +} + +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + _mm256_store_si256((__m256i *)p, a); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + _mm256_storeu_si256((__m256i *)p, a); +} + +SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); } + +SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8((char)x); } + +SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16((short)x); } + +SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32((int)x); } + +SIMD_INLINE v256 v256_dup_64(uint64_t x) { + return _mm256_set1_epi64x((int64_t)x); +} + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); } + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); } + +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); } + +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); } + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return _mm256_adds_epi16(a, b); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); } + +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); } + +SIMD_INLINE v256 v256_padd_u8(v256 a) { + return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1)); +} + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return _mm256_madd_epi16(a, _mm256_set1_epi16(1)); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); } + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); } + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); } + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); } + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return _mm256_subs_epi16(a, b); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return _mm256_subs_epu16(a, b); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); } + +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); } + +SIMD_INLINE v256 v256_abs_s16(v256 a) { return 
_mm256_abs_epi16(a); } + +SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); } + +// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit +// lanes of lower or upper halves of a 256bit vector because the +// unpack/pack intrinsics operate on the 256 bit input vector as 2 +// independent 128 bit vectors. +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return _mm256_unpacklo_epi8( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return _mm256_unpackhi_epi8( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return _mm256_unpacklo_epi16( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return _mm256_unpackhi_epi16( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return _mm256_unpacklo_epi32( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return _mm256_unpackhi_epi32( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return _mm256_unpacklo_epi64( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return _mm256_unpackhi_epi64( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return _mm256_permute2x128_si256(a, b, 0x02); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return _mm256_permute2x128_si256(a, b, 0x13); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1)); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2)); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), + _mm256_castsi256_ps(a), + _MM_SHUFFLE(3, 1, 3, 1))), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), + 
_mm256_castsi256_ps(a), + _MM_SHUFFLE(2, 0, 2, 0))), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b), + _mm256_castsi256_pd(a), 15)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castpd_si256( + _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); } + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return _mm256_unpacklo_epi8( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return _mm256_unpackhi_epi8( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return _mm256_srai_epi16( + _mm256_unpacklo_epi8( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 8); +} + +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return _mm256_srai_epi16( + _mm256_unpackhi_epi8( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 8); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return _mm256_cvtepu16_epi32(a); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return _mm256_cvtepi16_epi32(a); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return _mm256_unpacklo_epi16( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return _mm256_srai_epi32( + _mm256_unpacklo_epi16( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 16); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return _mm256_unpackhi_epi16( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return _mm256_srai_epi32( + _mm256_unpackhi_epi16( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 16); +} + +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + return _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern), + _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); +} + +SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { + v256 c32 = v256_dup_8(32); + v256 p32 = v256_sub_8(pattern, c32); + v256 r1 = _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32), + 
_mm256_cmpgt_epi8(v256_dup_8(48), pattern)); + v256 r2 = _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern), + _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); + return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern)); +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return _mm256_shuffle_epi8(a, pattern); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b)); + v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b)); + t1 = _mm256_add_epi32(t1, t2); + v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0), + _mm256_extracti128_si256(t1, 1)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); + return (int32_t)v128_low_u32(t); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + v256 r = _mm256_madd_epi16(a, b); +#if defined(__x86_64__) + v128 t; + r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), + _mm256_cvtepi32_epi64(v256_low_v128(r))); + t = v256_low_v128(_mm256_add_epi64( + r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); + return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); +#else + v128 l = v256_low_v128(r); + v128 h = v256_high_v128(r); + return (int64_t)_mm_cvtsi128_si32(l) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + + (int64_t)_mm_cvtsi128_si32(h) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); +#endif +} + +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + v256 r = _mm256_mullo_epi32(a, b); +#if defined(__x86_64__) + v128 t; + r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), + _mm256_cvtepi32_epi64(v256_low_v128(r))); + t = v256_low_v128(_mm256_add_epi64( + r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); + return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); +#else + v128 l = v256_low_v128(r); + v128 h = v256_high_v128(r); + return (int64_t)_mm_cvtsi128_si32(l) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + + (int64_t)_mm_cvtsi128_si32(h) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); +#endif +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256()); + v128 lo = v256_low_v128(t); + v128 hi = v256_high_v128(t); + lo = v128_add_32(lo, hi); + return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo)); +} + +typedef v256 sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init(void) { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 32 v256_sad_u8() calls is undefined. 
*/ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + return _mm256_add_epi64(s, _mm256_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +typedef v256 ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), + _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); + v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(b, _mm256_setzero_si256())); + v256 rl = _mm256_madd_epi16(l, l); + v256 rh = _mm256_madd_epi16(h, h); + v128 c = _mm_cvtsi32_si128(32); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8)); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4)); + return _mm256_add_epi64( + s, + _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c)); +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); } + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); } + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); } + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); } + +SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return _mm256_mullo_epi16(a, b); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return _mm256_mulhi_epi16(a, b); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return _mm256_mullo_epi32(a, b); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return _mm256_madd_epi16(a, b); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return _mm256_maddubs_epi16(a, b); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); } + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return _mm256_sub_epi8( + _mm256_avg_epu8(a, b), + _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1))); +} + +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return _mm256_sub_epi16( + _mm256_avg_epu16(a, b), + _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1))); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); } + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); } + +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); } + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); } + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { + return (uint32_t)_mm256_movemask_epi8(a); +} + +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return _mm256_blendv_epi8(a, b, c); +} + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return 
_mm256_max_epi8(a, b); } + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); } + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); } + +SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); } + +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); } + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return _mm256_cmpgt_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { + return _mm256_cmpgt_epi8(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { + return _mm256_cmpeq_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return _mm256_cmpgt_epi16(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return _mm256_cmpgt_epi16(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return _mm256_cmpeq_epi16(a, b); +} + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return _mm256_cmpgt_epi32(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return _mm256_cmpgt_epi32(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { + return _mm256_cmpeq_epi32(a, b); +} + +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8((char)(0xff << c)), + _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c))); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)), + _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c))); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128((int)(c + 8)); + return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x), + _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return _mm256_sra_epi16(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return _mm256_sll_epi32(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return _mm256_srl_epi32(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return _mm256_sra_epi32(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { + return _mm256_sll_epi64(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { + return _mm256_srl_epi64(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { +#if defined(__AVX512VL__) + return _mm256_sra_epi64(a, _mm_cvtsi32_si128((int)c)); +#else + return v256_from_v128(v128_shr_s64(v256_high_v128(a), c), + v128_shr_s64(v256_low_v128(a), c)); +#endif +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +// _mm256_slli_si256 works on 128 bit lanes and can't be used +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128( \ + v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \ + v128_shl_n_byte(v256_low_v128(a), n)) \ + : _mm256_inserti128_si256( \ + _mm256_setzero_si256(), \ + v128_shl_n_byte(v256_low_v128(a), (n)-16), 1)) + +// _mm256_srli_si256 works on 128 bit lanes and can't be used +#define v256_shr_n_byte(a, n) \ + ((n) < 16 \ + ? 
_mm256_alignr_epi8( \ + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \ + : ((n) == 16 ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \ + : _mm256_inserti128_si256( \ + _mm256_setzero_si256(), \ + v128_shr_n_byte(v256_high_v128(a), (n)-16), 0))) + +// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used +#define v256_align(a, b, c) \ + ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) + +#define v256_shl_n_8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8((char)(0xff << (c))), \ + _mm256_slli_epi16(a, c)) +#define v256_shr_n_u8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> (c))), \ + _mm256_srli_epi16(a, c)) +#define v256_shr_n_s8(a, c) \ + _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \ + _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8)) +#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c) +#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c) +#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c) +#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c) +#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c) +#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c) +#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c) +#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c) +#define v256_shr_n_s64(a, c) \ + v256_shr_s64((a), (c)) // _mm256_srai_epi64 broken in gcc? +#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) +#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) + +typedef v256 sad256_internal_u16; + +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v256_sad_u16_sum(). */ +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { +#if defined(__SSE4_1__) + v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b)); +#else + v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)), + v256_xor(b, v256_dup_16(32768))); + t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)), + v256_or(v256_and(a, t), v256_andn(b, t))); +#endif + return v256_add_32( + s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t))); +} + +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s)); + return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) + + v128_low_u32(v128_shr_n_byte(t, 8)) + + v128_low_u32(v128_shr_n_byte(t, 12)); +} + +typedef v256 ssd256_internal_s16; + +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, + v256 b) { + v256 d = v256_sub_16(a, b); + d = v256_madd_s16(d, d); + return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()), + _mm256_unpacklo_epi32(d, v256_zero()))); +} + +SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { + v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s)); + return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t)); +} + +#endif + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h new file mode 100644 index 0000000000..7079949cd8 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ + +#include +#include + +#include "aom_dsp/simd/v64_intrinsics_c.h" + +/* Fallback to plain, unoptimised C. */ + +typedef c_v64 v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); } +SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); } +SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); } +SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); } +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return c_v64_from_32(x, y); +} +SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); } +SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); } +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return c_v64_from_16(a, b, c, d); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return c_u32_load_unaligned(p); +} +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return c_u32_load_aligned(p); +} +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { + c_u32_store_unaligned(p, a); +} +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + c_u32_store_aligned(p, a); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return c_v64_load_unaligned(p); +} +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return c_v64_load_aligned(p); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { + c_v64_store_unaligned(p, a); +} +SIMD_INLINE void v64_store_aligned(void *p, v64 a) { + c_v64_store_aligned(p, a); +} + +SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { + return c_v64_align(a, b, c); +} + +SIMD_INLINE v64 v64_zero(void) { return c_v64_zero(); } +SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); } +SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); } +SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); } + +SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); } +SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); } +SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); } +SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); } +SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); } +SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); } +SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); } +SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); } +SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); } +SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); } +SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); } +SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); } +SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); } +SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); } +SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); } + +SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 
b) { return c_v64_ziplo_8(a, b); } +SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); } +SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); } +SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); } +SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); } +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); } +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); } +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); } +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); } +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); } +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); } +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); } +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + return c_v64_pack_s32_s16(a, b); +} +SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { + return c_v64_pack_s32_u16(a, b); +} +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + return c_v64_pack_s16_u8(a, b); +} +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + return c_v64_pack_s16_s8(a, b); +} +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return c_v64_unpacklo_u16_s32(a); +} +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return c_v64_unpacklo_s16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return c_v64_unpackhi_u16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return c_v64_unpackhi_s16_s32(a); +} +SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) { + return c_v64_shuffle_8(a, pattern); +} + +SIMD_INLINE c_sad64_internal v64_sad_u8_init(void) { + return c_v64_sad_u8_init(); +} +SIMD_INLINE c_sad64_internal v64_sad_u8(c_sad64_internal s, v64 a, v64 b) { + return c_v64_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_sad_u8_sum(c_sad64_internal s) { + return c_v64_sad_u8_sum(s); +} +SIMD_INLINE c_ssd64_internal v64_ssd_u8_init(void) { + return c_v64_ssd_u8_init(); +} +SIMD_INLINE c_ssd64_internal v64_ssd_u8(c_ssd64_internal s, v64 a, v64 b) { + return c_v64_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_ssd_u8_sum(c_ssd64_internal s) { + return c_v64_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); } +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); } +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); } +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); } +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); } +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); } +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); } +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); } +SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); } +SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); } +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); } + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); } +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); } 
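/* Editorial usage sketch (not part of the upstream header), showing how the
 * SAD accumulator wrappers declared above are meant to be driven: init once,
 * accumulate one v64 (8-byte) row per call, then finalise with the sum. The
 * function name, buffers and strides are hypothetical; eight v64_sad_u8()
 * calls stay well under the 32-call limit documented for the C backend. */
SIMD_INLINE uint32_t example_sad_8x8(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride) {
  c_sad64_internal acc = v64_sad_u8_init();
  int row;
  for (row = 0; row < 8; row++)
    acc = v64_sad_u8(acc, v64_load_unaligned(src + row * src_stride),
                     v64_load_unaligned(ref + row * ref_stride));
  return v64_sad_u8_sum(acc);
}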
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); } +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); } +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); } +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); } +SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); } +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); } +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); } +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); } +SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); } +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); } +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); } +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); } +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); } +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); } +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); } +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); } +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) { + return c_v64_shr_u16(a, n); +} +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) { + return c_v64_shr_s16(a, n); +} +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); } +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) { + return c_v64_shr_u32(a, n); +} +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) { + return c_v64_shr_s32(a, n); +} +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) { + return c_v64_shr_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) { + return c_v64_shl_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { + return c_v64_shl_n_8(a, c); +} +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { + return c_v64_shr_n_u8(a, c); +} +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { + return c_v64_shr_n_s8(a, c); +} +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { + return c_v64_shl_n_16(a, c); +} +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return c_v64_shr_n_u16(a, c); +} +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return c_v64_shr_n_s16(a, c); +} +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { + return c_v64_shl_n_32(a, c); +} +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return c_v64_shr_n_u32(a, c); +} +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return c_v64_shr_n_s32(a, c); +} + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h new file mode 100644 index 0000000000..bfd6fe0710 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h @@ -0,0 +1,966 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ + +/* Note: This implements the intrinsics in plain, unoptimised C. + Intended for reference, porting or debugging. */ + +#include +#include + +#include "config/aom_config.h" + +typedef union { + uint8_t u8[8]; + uint16_t u16[4]; + uint32_t u32[2]; + uint64_t u64; + int8_t s8[8]; + int16_t s16[4]; + int32_t s32[2]; + int64_t s64; +} c_v64; + +SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { + return a.u32[!!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) { + return a.u32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { + return a.s32[!!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) { + return a.s32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) { + c_v64 t; + t.u32[!CONFIG_BIG_ENDIAN] = x; + t.u32[!!CONFIG_BIG_ENDIAN] = y; + return t; +} + +SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) { + c_v64 t; + t.u64 = x; + return t; +} + +SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; } + +SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c, + uint16_t d) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + t.u16[0] = a; + t.u16[1] = b; + t.u16[2] = c; + t.u16[3] = d; + } else { + t.u16[3] = a; + t.u16[2] = b; + t.u16[1] = c; + t.u16[0] = d; + } + return t; +} + +SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) { + uint32_t t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 4; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) { + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&a; + int c; + for (c = 0; c < 4; c++) pp[c] = q[c]; +} + +SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 load at %p\n", p); + abort(); + } + return c_u32_load_unaligned(p); +} + +SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 store at %p\n", p); + abort(); + } + c_u32_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) { + c_v64 t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 8; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p); + abort(); + } + return c_v64_load_unaligned(p); +} + +SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) { + uint8_t *q = (uint8_t *)p; + uint8_t *r = (uint8_t *)&a; + int c; + for (c = 0; c < 8; c++) q[c] = r[c]; +} + +SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p); + abort(); + } + c_v64_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_zero(void) { + c_v64 t; + t.u64 = 0; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) { + c_v64 t; + t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] = + t.u8[7] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) { + c_v64 t; + t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) { + 
c_v64 t; + t.u32[0] = t.u32[1] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] + b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] + b.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.u8[c] = SIMD_CLAMP((int16_t)a.u8[c] + (int16_t)b.u8[c], 0, 255); + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.s8[c] = SIMD_CLAMP((int16_t)a.s8[c] + (int16_t)b.s8[c], -128, 127); + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] + (int32_t)b.s16[c], -32768, 32767); + return t; +} + +SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]); + t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] - b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c]; + t.s8[c] = SIMD_CLAMP(d, -128, 127); + } + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] - b.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] - (int32_t)b.s16[c], -32768, 32767); + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = + (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]); + t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = (uint16_t)((int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.u8[c] = (uint8_t)((int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]); + return t; +} + +SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = a.u8[7]; + t.u8[6] = b.u8[7]; + t.u8[5] = a.u8[6]; + t.u8[4] = b.u8[6]; + t.u8[3] = a.u8[5]; + t.u8[2] = b.u8[5]; + t.u8[1] = a.u8[4]; + t.u8[0] = b.u8[4]; + } else { + t.u8[7] = a.u8[3]; + t.u8[6] = b.u8[3]; + t.u8[5] = a.u8[2]; + t.u8[4] = b.u8[2]; + t.u8[3] = a.u8[1]; + t.u8[2] = b.u8[1]; + t.u8[1] = a.u8[0]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = a.u16[3]; + t.u16[2] = b.u16[3]; + t.u16[1] = a.u16[2]; + t.u16[0] = b.u16[2]; + } else { + t.u16[3] = a.u16[1]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[0]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u32[1] = a.u32[1]; + t.u32[0] = b.u32[1]; + } else { + t.u32[1] = a.u32[0]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = b.u8[7]; + t.u8[6] = b.u8[5]; + t.u8[5] = b.u8[3]; + t.u8[4] = b.u8[1]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[7] = a.u8[6]; + t.u8[6] = a.u8[4]; + t.u8[5] = a.u8[2]; + t.u8[4] = a.u8[0]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = b.u16[3]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[3] = a.u16[2]; + t.u16[2] = a.u16[0]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1) + : _c_v64_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v64_unzip_16(b, a, 0) + : _c_v64_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[3 + endian]; + t.s16[2] = (int16_t)a.u8[2 + endian]; + t.s16[1] = (int16_t)a.u8[1 + endian]; + t.s16[0] = (int16_t)a.u8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[7 - endian]; + t.s16[2] = (int16_t)a.u8[6 - endian]; + t.s16[1] = (int16_t)a.u8[5 - endian]; + t.s16[0] = (int16_t)a.u8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[3 + endian]; + t.s16[2] = (int16_t)a.s8[2 + endian]; + t.s16[1] = (int16_t)a.s8[1 + endian]; + t.s16[0] = (int16_t)a.s8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[7 - endian]; + t.s16[2] = (int16_t)a.s8[6 - endian]; + t.s16[1] = (int16_t)a.s8[5 - endian]; + t.s16[0] = (int16_t)a.s8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.s16[3] = SIMD_CLAMP(a.s32[1], -32768, 32767); + t.s16[2] = SIMD_CLAMP(a.s32[0], -32768, 32767); + t.s16[1] = SIMD_CLAMP(b.s32[1], -32768, 32767); + t.s16[0] = SIMD_CLAMP(b.s32[0], -32768, 32767); + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u16[3] = SIMD_CLAMP(a.s32[1], 0, 65535); + t.u16[2] = SIMD_CLAMP(a.s32[0], 0, 65535); + t.u16[1] = SIMD_CLAMP(b.s32[1], 0, 65535); + t.u16[0] = SIMD_CLAMP(b.s32[0], 0, 65535); + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u8[7] = SIMD_CLAMP(a.s16[3], 0, 255); + t.u8[6] = SIMD_CLAMP(a.s16[2], 0, 255); + t.u8[5] = SIMD_CLAMP(a.s16[1], 0, 255); + t.u8[4] = SIMD_CLAMP(a.s16[0], 0, 255); + t.u8[3] = SIMD_CLAMP(b.s16[3], 0, 255); + t.u8[2] = SIMD_CLAMP(b.s16[2], 0, 255); + t.u8[1] = SIMD_CLAMP(b.s16[1], 0, 255); + t.u8[0] = SIMD_CLAMP(b.s16[0], 0, 255); + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.s8[7] = SIMD_CLAMP(a.s16[3], -128, 127); + t.s8[6] = SIMD_CLAMP(a.s16[2], -128, 127); + t.s8[5] = SIMD_CLAMP(a.s16[1], -128, 127); + t.s8[4] = SIMD_CLAMP(a.s16[0], -128, 127); + t.s8[3] = SIMD_CLAMP(b.s16[3], -128, 127); + t.s8[2] = SIMD_CLAMP(b.s16[2], -128, 127); + t.s8[1] = SIMD_CLAMP(b.s16[1], -128, 127); + t.s8[0] = SIMD_CLAMP(b.s16[0], -128, 127); + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2]; + 
return t; +} + +SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + if (SIMD_CHECK && (pattern.u8[c] & ~7)) { + fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n", + pattern.u8[c], c); + abort(); + } + t.u8[c] = + a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7]; + } + return t; +} + +SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) { + return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] + + a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] + + a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0]; +} + +SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) { + return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) + + (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]); +} + +SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) { + return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] + + a.u8[0]; +} + +SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) { + return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0]; +} + +typedef struct { + uint32_t val; + int count; +} c_sad64_internal; + +SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) { + c_sad64_internal t; + t.val = t.count = 0; + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is + undefined. */ +SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) + s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + s.count++; + if (SIMD_CHECK && s.count > 32) { + fprintf(stderr, + "Error: sad called 32 times returning an undefined result\n"); + abort(); + } + return s; +} + +SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; } + +typedef uint32_t c_ssd64_internal; + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). 
*/ +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; } + +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; } + +SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 | b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 ^ b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & ~b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]); + t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1]; + t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3]; + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) { + c_v64 t; + int32_t u; + u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1]; + t.s16[0] = SIMD_CLAMP(u, -32768, 32767); + u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3]; + t.s16[1] = SIMD_CLAMP(u, -32768, 32767); + u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5]; + t.s16[2] = SIMD_CLAMP(u, -32768, 32767); + u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7]; + t.s16[3] = SIMD_CLAMP(u, -32768, 32767); + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? 
a.s16[c] : b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift left %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n); + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined s8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift left %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n); + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: undefined s16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift left %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] << n; + t.u32[0] = a.u32[0] << n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift right %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] >> n; + t.u32[0] = a.u32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined s32 shift right %d\n", n); + abort(); + } + t.s32[1] = a.s32[1] >> n; + t.s32[0] = a.s32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 >> i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 << i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) { + if 
(SIMD_CHECK && c > 7) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b; +} + +SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) { + return c_v64_shl_8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) { + return c_v64_shr_u8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) { + return c_v64_shr_s8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) { + return c_v64_shl_16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) { + return c_v64_shr_u16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) { + return c_v64_shr_s16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) { + return c_v64_shl_32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) { + return c_v64_shr_u32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) { + return c_v64_shr_s32(a, c); +} + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h new file mode 100644 index 0000000000..ec27a6bf42 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h @@ -0,0 +1,489 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ + +#include <emmintrin.h> +#if defined(__SSSE3__) +#include <tmmintrin.h> +#endif +#if defined(__SSE4_1__) +#include <smmintrin.h> +#endif + +typedef __m128i v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(a); +} + +SIMD_INLINE uint32_t v64_high_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); } + +SIMD_INLINE int32_t v64_high_s32(v64 a) { + return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return _mm_packs_epi32( + _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d), + _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return _mm_set_epi32(0, 0, (int32_t)x, (int32_t)y); +} + +SIMD_INLINE v64 v64_from_64(uint64_t x) { +#ifdef __x86_64__ + return _mm_cvtsi64_si128((int64_t)x); +#else + return _mm_set_epi32(0, 0, (int32_t)(x >> 32), (int32_t)x); +#endif +} + +SIMD_INLINE uint64_t v64_u64(v64 x) { + return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32); +} + +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE void v64_store_aligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) +#define v64_align(a, b, c) \ + ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b) +#else +#define v64_align(a, b, c) \ + ((c) ? 
v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \ + : (b)) +#endif + +SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); } + +SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); } + +SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); } + +SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); } + +SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); } + +SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); } + +SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v64 v64_abs_s16(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v64 v64_abs_s8(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8); +} + +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi32(t, t); +} + +SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { +#if defined(__SSE4_1__) + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packus_epi32(t, t); +#else + const int32_t ah = SIMD_CLAMP(v64_high_s32(a), 0, 65535); + const int32_t al = SIMD_CLAMP(v64_low_s32(a), 0, 65535); + const int32_t bh = SIMD_CLAMP(v64_high_s32(b), 0, 65535); + const int32_t bl = SIMD_CLAMP(v64_low_s32(b), 0, 65535); + return v64_from_16(ah, al, bh, bl); +#endif +} + +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packus_epi16(t, t); +} + +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi16(t, t); +} + +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0d0b0907050301LL)); +#else + return _mm_packus_epi16( + _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return 
_mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0e0c0a0806040200LL)); +#else + return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0e0b0a07060302LL)); +#else + return _mm_packs_epi32( + _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0d0c090805040100LL)); +#else + return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { + return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { + return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8); +} + +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16); +} + +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return _mm_srli_si128( + _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8); +} + +SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v64 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + unsigned char *selected = (unsigned char *)&output; + int counter; + + for (counter = 0; counter < 8; counter++) { + selected[counter] = input[index[counter]]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { + __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), + _mm_unpacklo_epi8(b, _mm_setzero_si128())); + t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); + return (int32_t)v64_low_u32(t); +} + +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { + __m128i r = _mm_madd_epi16(a, b); +#if defined(__SSE4_1__) && defined(__x86_64__) + __m128i x = _mm_cvtepi32_epi64(r); + return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8))); +#else + return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(r); +#endif +} + +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { + return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128())); +} + +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { + return v64_dotp_s16(a, v64_dup_16(1)); +} + +typedef v64 sad64_internal; + +SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). + The result for more than 32 v64_sad_u8() calls is undefined. 
*/ +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return _mm_add_epi64(s, _mm_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +typedef v64 ssd64_internal; + +SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). */ +SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b)); + v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b)); + v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h)); + return _mm_add_epi64( + s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4)))); +} + +SIMD_INLINE uint32_t v64_ssd_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); } + +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); } + +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); } + +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); } + +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); } + +SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#else + return _mm_unpacklo_epi32( + _mm_mul_epu32(a, b), + _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4))); +#endif +} + +SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); } + +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, b); +#else + __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)); + return _mm_packs_epi32(t, t); +#endif +} + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1))); +} + +SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { + return _mm_sub_epi16(_mm_avg_epu16(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1))); +} + +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); } + +SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); } + +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); } 
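/* Editorial sketch (not part of the upstream header): the comparison
 * intrinsics above return full-lane masks (all ones in each lane where the
 * predicate holds), so they compose with the logic ops into branchless
 * per-lane selects. The hypothetical helper below picks the larger signed
 * 16-bit lane and is equivalent to v64_max_s16(a, b). */
SIMD_INLINE v64 example_select_max_s16(v64 a, v64 b) {
  v64 gt = v64_cmpgt_s16(a, b); /* 0xffff where a > b, 0 elsewhere */
  return v64_or(v64_and(a, gt), v64_andn(b, gt));
}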
+ +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)), + _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c))); +} + +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)), + _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c))); +} + +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { + return _mm_packs_epi16( + _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128((int)(c + 8))), + a); +} + +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c)); +} + +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c) +#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8) +#define v64_shl_n_8(a, c) \ + _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c)) +#define v64_shr_n_u8(a, c) \ + _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c)) +#define v64_shr_n_s8(a, c) \ + _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a) +#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c) +#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c) +#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c) +#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c) +#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c) +#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c) + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c new file mode 100644 index 0000000000..bfe76edc39 --- /dev/null +++ b/third_party/aom/aom_dsp/sse.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* + * Sum the square of the difference between every corresponding element of the + * buffers. 
+ */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} +#endif diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c new file mode 100644 index 0000000000..35d493b038 --- /dev/null +++ b/third_party/aom/aom_dsp/ssim.c @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/ssim.h" +#include "aom_ports/mem.h" + +#if CONFIG_INTERNAL_STATS +void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 16; i++, s += sp, r += rp) { + for (j = 0; j < 16; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +#endif // CONFIG_INTERNAL_STATS + +void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) { + double ssim_n, ssim_d; + int64_t c1 = 0, c2 = 0; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + 
assert(0); + // Return similarity as zero for unsupported bit-depth values. + return 0; + } + + ssim_n = (2.0 * sum_s * sum_r + c1) * + (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2); + + ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) * + ((double)count * sum_sq_s - (double)sum_s * sum_s + + (double)count * sum_sq_r - (double)sum_r * sum_r + c2); + + return ssim_n / ssim_d; +} + +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +#if CONFIG_INTERNAL_STATS +void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + double *fast_ssim) { + double abc[3]; + for (int i = 0; i < 3; ++i) { + const int is_uv = i > 0; + abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i], + source->strides[is_uv], dest->strides[is_uv], + source->crop_widths[is_uv], source->crop_heights[is_uv]); + } + + *weight = 1; + *fast_ssim = abc[0] * .8 + .1 * (abc[1] + abc[2]); +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) +// +// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) +// +// ssim(x,y) = +// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * +// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ +// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) +// +// factoring out n*n +// +// ssim(x,y) = +// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * +// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) +// +// Replace c1 with n*n * c1 for the final step that leads to this code: +// The final step scales by 12 bits so we don't lose precision in the constants. + +static double ssimv_similarity(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + + // Since these variables are unsigned sums, convert to double so + // math is done in double arithmetic. 
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} + +// The first term of the ssim metric is a luminance factor. +// +// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) +// +// This luminance factor is super sensitive to the dark side of luminance +// values and completely insensitive on the white side. check out 2 sets +// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 +// 2*250*252/ (250^2+252^2) => .99999997 +// +// As a result in this tweaked version of the calculation in which the +// luminance is taken as percentage off from peak possible. +// +// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count +// +static double ssimv_similarity2(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; + const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); + + // Since these variables are unsigned, sums convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} +static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, Ssimv *sv) { + aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); +} + +double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { + double dssim_total = 0; + double ssim_total = 0; + double ssim2_total = 0; + double inconsistency_total = 0; + int i, j; + int c = 0; + double norm; + double old_ssim_total = 0; + // We can sample points as frequently as we like start with 1 per 4x4. + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4, ++c) { + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; + double ssim; + double ssim2; + double dssim; + uint32_t var_new; + uint32_t var_old; + uint32_t mean_new; + uint32_t mean_old; + double ssim_new; + double ssim_old; + + // Not sure there's a great way to handle the edge pixels + // in ssim when using a window. Seems biased against edge pixels + // however you handle this. This uses only samples that are + // fully in the frame. + if (j + 8 <= width && i + 8 <= height) { + ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); + } + + ssim = ssimv_similarity(&sv, 64); + ssim2 = ssimv_similarity2(&sv, 64); + + sv.ssim = ssim2; + + // dssim is calculated to use as an actual error metric and + // is scaled up to the same range as sum square error. + // Since we are subsampling every 16th point maybe this should be + // *16 ? + dssim = 255 * 255 * (1 - ssim2) / 2; + + // Here I introduce a new error metric: consistency-weighted + // SSIM-inconsistency. This metric isolates frames where the + // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much + // sharper or blurrier than the others. Higher values indicate a + // temporally inconsistent SSIM. 
There are two ideas at work: + // + // 1) 'SSIM-inconsistency': the total inconsistency value + // reflects how much SSIM values are changing between this + // source / reference frame pair and the previous pair. + // + // 2) 'consistency-weighted': weights de-emphasize areas in the + // frame where the scene content has changed. Changes in scene + // content are detected via changes in local variance and local + // mean. + // + // Thus the overall measure reflects how inconsistent the SSIM + // values are, over consistent regions of the frame. + // + // The metric has three terms: + // + // term 1 -> uses change in scene Variance to weight error score + // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term 2 -> uses change in local scene luminance to weight error + // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term3 -> measures inconsistency in ssim scores between frames + // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). + // + // This term compares the ssim score for the same location in 2 + // subsequent frames. + var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; + var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; + mean_new = sv.sum_s; + mean_old = sv2[c].sum_s; + ssim_new = sv.ssim; + ssim_old = sv2[c].ssim; + + if (do_inconsistency) { + // We do the metric once for every 4x4 block in the image. Since + // we are scaling the error to SSE for use in a psnr calculation + // 1.0 = 4x4x255x255 the worst error we can possibly have. + static const double kScaling = 4. * 4 * 255 * 255; + + // The constants have to be non 0 to avoid potential divide by 0 + // issues other than that they affect kind of a weighting between + // the terms. No testing of what the right terms should be has been + // done. + static const double c1 = 1, c2 = 1, c3 = 1; + + // This measures how much consistent variance is in two consecutive + // source frames. 1.0 means they have exactly the same variance. + const double variance_term = + (2.0 * var_old * var_new + c1) / + (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); + + // This measures how consistent the local mean are between two + // consecutive frames. 1.0 means they have exactly the same mean. + const double mean_term = + (2.0 * mean_old * mean_new + c2) / + (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); + + // This measures how consistent the ssims of two + // consecutive frames is. 1.0 means they are exactly the same. + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); + + double this_inconsistency; + + // Floating point math sometimes makes this > 1 by a tiny bit. + // We want the metric to scale between 0 and 1.0 so we can convert + // it to an snr scaled value. + if (ssim_term > 1) ssim_term = 1; + + // This converts the consistency metric to an inconsistency metric + // ( so we can scale it like psnr to something like sum square error. + // The reason for the variance and mean terms is the assumption that + // if there are big changes in the source we shouldn't penalize + // inconsistency in ssim scores a bit less as it will be less visible + // to the user. 
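+        // variance_term and mean_term lie in (0, 1] (their inputs are
+        // non-negative sums), so they can only shrink the error, and
+        // (1 - ssim_term) is 0 when the ssim scores of the two frames at
+        // this location are identical.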
+ this_inconsistency = (1 - ssim_term) * variance_term * mean_term; + + this_inconsistency *= kScaling; + inconsistency_total += this_inconsistency; + } + sv2[c] = sv; + ssim_total += ssim; + ssim2_total += ssim2; + dssim_total += dssim; + + old_ssim_total += ssim_old; + } + old_ssim_total += 0; + } + + norm = 1. / (width / 4) / (height / 4); + ssim_total *= norm; + ssim2_total *= norm; + m->ssim2 = ssim2_total; + m->ssim = ssim_total; + if (old_ssim_total == 0) inconsistency_total = 0; + + m->ssimc = inconsistency_total; + + m->dssim = dssim_total; + return inconsistency_total; +} +#endif // CONFIG_INTERNAL_STATS + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} + +double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, int height, + uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +#if CONFIG_INTERNAL_STATS +void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd, double *fast_ssim) { + assert(bd >= in_bd); + uint32_t shift = bd - in_bd; + + double abc[3]; + for (int i = 0; i < 3; ++i) { + const int is_uv = i > 0; + abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i], + source->strides[is_uv], dest->strides[is_uv], + source->crop_widths[is_uv], + source->crop_heights[is_uv], in_bd, shift); + } + + weight[0] = 1; + fast_ssim[0] = abc[0] * .8 + .1 * (abc[1] + abc[2]); + + if (bd > in_bd) { + // Compute SSIM based on stream bit depth + shift = 0; + for (int i = 0; i < 3; ++i) { + const int is_uv = i > 0; + abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i], + source->strides[is_uv], dest->strides[is_uv], + source->crop_widths[is_uv], + source->crop_heights[is_uv], bd, shift); + } + + weight[1] = 1; + fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]); + } +} +#endif // CONFIG_INTERNAL_STATS +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if CONFIG_INTERNAL_STATS +void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig, + const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth, + const uint32_t in_bit_depth, int is_hbd, double *weight, + double *frame_ssim2) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth, + frame_ssim2); + return; + } +#else + 
(void)bit_depth; + (void)in_bit_depth; + (void)is_hbd; +#endif // CONFIG_AV1_HIGHBITDEPTH + aom_lowbd_calc_ssim(orig, recon, weight, frame_ssim2); +} +#endif // CONFIG_INTERNAL_STATS diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h new file mode 100644 index 0000000000..fb92556a8c --- /dev/null +++ b/third_party/aom/aom_dsp/ssim.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SSIM_H_ +#define AOM_AOM_DSP_SSIM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "config/aom_config.h" + +#if CONFIG_INTERNAL_STATS +#include "aom_scale/yv12config.h" + +// metrics used for calculating ssim, ssim2, dssim, and ssimc +typedef struct { + // source sum ( over 8x8 region ) + uint32_t sum_s; + + // reference sum (over 8x8 region ) + uint32_t sum_r; + + // source sum squared ( over 8x8 region ) + uint32_t sum_sq_s; + + // reference sum squared (over 8x8 region ) + uint32_t sum_sq_r; + + // sum of source times reference (over 8x8 region) + uint32_t sum_sxr; + + // calculated ssim score between source and reference + double ssim; +} Ssimv; + +// metrics collected on a frame basis +typedef struct { + // ssim consistency error metric ( see code for explanation ) + double ssimc; + + // standard ssim + double ssim; + + // revised ssim ( see code for explanation) + double ssim2; + + // ssim restated as an error metric like sse + double dssim; + + // dssim converted to decibels + double dssimd; + + // ssimc converted to decibels + double ssimcd; +} Metrics; + +double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency); + +void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + double *fast_ssim); + +double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd); + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd, double *fast_ssim); +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig, + const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth, + const uint32_t in_bit_depth, int is_hbd, double *weight, + double *frame_ssim2); +#endif // CONFIG_INTERNAL_STATS + +double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height); + +#if CONFIG_AV1_HIGHBITDEPTH +double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, int height, + uint32_t bd, uint32_t shift); +#endif // CONFIG_AV1_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_SSIM_H_ diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c new file mode 100644 index 0000000000..4f47e553d4 --- /dev/null +++ 
b/third_party/aom/aom_dsp/subtract.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +void aom_subtract_block_c(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r, c; + + for (r = 0; r < rows; r++) { + for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride) { + int r, c; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + + for (r = 0; r < rows; r++) { + for (c = 0; c < cols; c++) { + diff[c] = src[c] - pred[c]; + } + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } +} +#endif diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c new file mode 100644 index 0000000000..f58defaa11 --- /dev/null +++ b/third_party/aom/aom_dsp/sum_squares.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" + +uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width, + int height) { + int r, c; + uint64_t ss = 0; + + for (r = 0; r < height; r++) { + for (c = 0; c < width; c++) { + const int16_t v = src[c]; + ss += v * v; + } + src += src_stride; + } + + return ss; +} + +uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) { + uint64_t ss = 0; + do { + const int16_t v = *src++; + ss += v * v; + } while (--n); + + return ss; +} + +uint64_t aom_var_2d_u8_c(uint8_t *src, int src_stride, int width, int height) { + int r, c; + uint64_t ss = 0, s = 0; + + for (r = 0; r < height; r++) { + for (c = 0; c < width; c++) { + const uint8_t v = src[c]; + ss += v * v; + s += v; + } + src += src_stride; + } + + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_c(uint8_t *src, int src_stride, int width, int height) { + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + int r, c; + uint64_t ss = 0, s = 0; + + for (r = 0; r < height; r++) { + for (c = 0; c < width; c++) { + const uint16_t v = srcp[c]; + ss += v * v; + s += v; + } + srcp += src_stride; + } + + return (ss - s * s / (width * height)); +} + +uint64_t aom_sum_sse_2d_i16_c(const int16_t *src, int src_stride, int width, + int height, int *sum) { + int r, c; + int16_t *srcp = (int16_t *)src; + int64_t ss = 0; + + for (r = 0; r < height; r++) { + for (c = 0; c < width; c++) { + const int16_t v = srcp[c]; + ss += v * v; + *sum += v; + } + srcp += src_stride; + } + return ss; +} diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h new file mode 100644 index 0000000000..67d9e90ca9 --- /dev/null +++ b/third_party/aom/aom_dsp/txfm_common.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_TXFM_COMMON_H_ +#define AOM_AOM_DSP_TXFM_COMMON_H_ + +#include "aom_dsp/aom_dsp_common.h" + +// Constants and Macros used by all idct/dct functions +#define DCT_CONST_BITS 14 +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) + +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) + +// block transform size +enum { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_64X64, // 64x64 transform + TX_4X8, // 4x8 transform + TX_8X4, // 8x4 transform + TX_8X16, // 8x16 transform + TX_16X8, // 16x8 transform + TX_16X32, // 16x32 transform + TX_32X16, // 32x16 transform + TX_32X64, // 32x64 transform + TX_64X32, // 64x32 transform + TX_4X16, // 4x16 transform + TX_16X4, // 16x4 transform + TX_8X32, // 8x32 transform + TX_32X8, // 32x8 transform + TX_16X64, // 16x64 transform + TX_64X16, // 64x16 transform + TX_SIZES_ALL, // Includes rectangular transforms + TX_SIZES = TX_4X8, // Does NOT include rectangular transforms + TX_SIZES_LARGEST = TX_64X64, + TX_INVALID = 255 // Invalid transform size +} UENUM1BYTE(TX_SIZE); + +enum { + DCT_DCT, // DCT in both horizontal and vertical + ADST_DCT, // ADST in vertical, DCT in horizontal + DCT_ADST, // DCT in vertical, ADST in horizontal + ADST_ADST, // ADST in both directions + FLIPADST_DCT, // FLIPADST in vertical, DCT in horizontal + DCT_FLIPADST, // DCT in vertical, FLIPADST in horizontal + FLIPADST_FLIPADST, // FLIPADST in both directions + ADST_FLIPADST, // ADST in vertical, FLIPADST in horizontal + FLIPADST_ADST, // FLIPADST in vertical, ADST in horizontal + IDTX, // Identity in both directions + V_DCT, // DCT in vertical, identity in horizontal + H_DCT, // Identity in vertical, DCT in horizontal + V_ADST, // ADST in vertical, identity in horizontal + H_ADST, // Identity in vertical, ADST in horizontal + V_FLIPADST, // FLIPADST in vertical, identity in horizontal + H_FLIPADST, // Identity in vertical, FLIPADST in horizontal + TX_TYPES, + DCT_ADST_TX_MASK = 0x000F, // Either DCT or ADST in each direction + TX_TYPE_INVALID = 255, // Invalid transform type +} UENUM1BYTE(TX_TYPE); + +enum { + // DCT only + EXT_TX_SET_DCTONLY, + // DCT + Identity only + EXT_TX_SET_DCT_IDTX, + // Discrete Trig transforms w/o flip (4) + Identity (1) + EXT_TX_SET_DTT4_IDTX, + // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2) + EXT_TX_SET_DTT4_IDTX_1DDCT, + // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2) + EXT_TX_SET_DTT9_IDTX_1DDCT, + // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6) + EXT_TX_SET_ALL16, + EXT_TX_SET_TYPES +} UENUM1BYTE(TxSetType); + +typedef struct txfm_param { + // for both forward and inverse transforms + TX_TYPE tx_type; + TX_SIZE tx_size; + int lossless; + int bd; + // are the pixel buffers octets or shorts? This should collapse to + // bd==8 implies !is_hbd, but that's not certain right now. 
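+  // Nonzero when the pixel buffers hold 16-bit samples; tracked separately
+  // from the nominal bit depth in bd above for now.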
+ int is_hbd; + TxSetType tx_set_type; + // for inverse transforms only + int eob; +} TxfmParam; + +// Constants: +// for (int i = 1; i< 32; ++i) +// printf("static const int cospi_%d_64 = %.0f;\n", i, +// round(16384 * cos(i*PI/64))); +// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) +static const tran_high_t cospi_1_64 = 16364; +static const tran_high_t cospi_2_64 = 16305; +static const tran_high_t cospi_3_64 = 16207; +static const tran_high_t cospi_4_64 = 16069; +static const tran_high_t cospi_5_64 = 15893; +static const tran_high_t cospi_6_64 = 15679; +static const tran_high_t cospi_7_64 = 15426; +static const tran_high_t cospi_8_64 = 15137; +static const tran_high_t cospi_9_64 = 14811; +static const tran_high_t cospi_10_64 = 14449; +static const tran_high_t cospi_11_64 = 14053; +static const tran_high_t cospi_12_64 = 13623; +static const tran_high_t cospi_13_64 = 13160; +static const tran_high_t cospi_14_64 = 12665; +static const tran_high_t cospi_15_64 = 12140; +static const tran_high_t cospi_16_64 = 11585; +static const tran_high_t cospi_17_64 = 11003; +static const tran_high_t cospi_18_64 = 10394; +static const tran_high_t cospi_19_64 = 9760; +static const tran_high_t cospi_20_64 = 9102; +static const tran_high_t cospi_21_64 = 8423; +static const tran_high_t cospi_22_64 = 7723; +static const tran_high_t cospi_23_64 = 7005; +static const tran_high_t cospi_24_64 = 6270; +static const tran_high_t cospi_25_64 = 5520; +static const tran_high_t cospi_26_64 = 4756; +static const tran_high_t cospi_27_64 = 3981; +static const tran_high_t cospi_28_64 = 3196; +static const tran_high_t cospi_29_64 = 2404; +static const tran_high_t cospi_30_64 = 1606; +static const tran_high_t cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const tran_high_t sinpi_1_9 = 5283; +static const tran_high_t sinpi_2_9 = 9929; +static const tran_high_t sinpi_3_9 = 13377; +static const tran_high_t sinpi_4_9 = 15212; + +// 16384 * sqrt(2) +static const tran_high_t Sqrt2 = 23170; +static const tran_high_t InvSqrt2 = 11585; + +static INLINE tran_high_t fdct_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return rv; +} + +#endif // AOM_AOM_DSP_TXFM_COMMON_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c new file mode 100644 index 0000000000..f02c3077ae --- /dev/null +++ b/third_party/aom/aom_dsp/variance.c @@ -0,0 +1,1234 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/variance.h" + +#include "av1/common/filter.h" +#include "av1/common/reconinter.h" + +uint32_t aom_get_mb_ss_c(const int16_t *a) { + unsigned int i, sum = 0; + + for (i = 0; i < 256; ++i) { + sum += a[i] * a[i]; + } + + return sum; +} + +static void variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { + uint32_t sse; + int sum; + variance(a, a_stride, b, b_stride, w, h, &sse, &sum); + return sse; +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). +// It defines the offset required to move from one input to the next. +void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. 
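+//
+// The two passes are normally chained, e.g. for a W x H block at subpel
+// offsets (xoffset, yoffset), exactly as the SUBPIX_VAR macros below do:
+//
+//   uint16_t mid[(H + 1) * W];
+//   uint8_t out[H * W];
+//   aom_var_filter_block2d_bil_first_pass_c(src, mid, src_stride, 1, H + 1,
+//                                           W, bilinear_filters_2t[xoffset]);
+//   aom_var_filter_block2d_bil_second_pass_c(mid, out, W, W, H, W,
+//                                            bilinear_filters_2t[yoffset]);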
+void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +#define VAR(W, H) \ + uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define SUBPIX_VAR(W, H) \ + uint32_t aom_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ + } + +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + \ + return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ + } \ + uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ + } + +void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse8x8, int *sum8x8, + unsigned int *tot_sse, int *tot_sum, + uint32_t *var8x8) { + // Loop over 4 8x8 blocks. Process one 8x32 block. + for (int k = 0; k < 4; k++) { + variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse8x8[k], + &sum8x8[k]); + } + + // Calculate variance at 8x8 level and total sse, sum of 8x32 block. 
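+  // Each var8x8[i] below is sse - sum^2 / 64, i.e. the sum of squared
+  // deviations from that 8x8 block's mean; the division by 64 is the >> 6.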
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3]; + *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3]; + for (int i = 0; i < 4; i++) + var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6); +} + +void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse16x16, unsigned int *tot_sse, + int *tot_sum, uint32_t *var16x16) { + int sum16x16[2] = { 0 }; + // Loop over two consecutive 16x16 blocks and process as one 16x32 block. + for (int k = 0; k < 2; k++) { + variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride, + 16, 16, &sse16x16[k], &sum16x16[k]); + } + + // Calculate variance at 16x16 level and total sse, sum of 16x32 block. + *tot_sse += sse16x16[0] + sse16x16[1]; + *tot_sum += sum16x16[0] + sum16x16[1]; + for (int i = 0; i < 2; i++) + var16x16[i] = + sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8); +} + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * variable. + */ +#define MSE(W, H) \ + uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define VARIANCES(W, H) \ + VAR(W, H) \ + SUBPIX_VAR(W, H) \ + SUBPIX_AVG_VAR(W, H) + +VARIANCES(128, 128) +VARIANCES(128, 64) +VARIANCES(64, 128) +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) + +// Realtime mode doesn't use rectangular blocks. 
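+// Only the 1:4 / 4:1 aspect-ratio sizes below are compiled out; the square
+// and 2:1 sizes above are always built.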
+#if !CONFIG_REALTIME_ONLY +VARIANCES(4, 16) +VARIANCES(16, 4) +VARIANCES(8, 32) +VARIANCES(32, 8) +VARIANCES(16, 64) +VARIANCES(64, 16) +#endif + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) + +void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint8_t)tmp; + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t tsum = 0; + uint64_t tsse = 0; + for (int i = 0; i < h; ++i) { + int32_t lsum = 0; + for (int j = 0; j < w; ++j) { + const int diff = a[j] - b[j]; + lsum += diff; + tsse += (uint32_t)(diff * diff); + } + tsum += lsum; + a += a_stride; + b += b_stride; + } + *sum = tsum; + *sse = tsse; +} + +uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int w, int h) { + uint64_t sse; + int64_t sum; + highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum); + return sse; +} + +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)sse_long; + *sum = (int)sum_long; +} + +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ + uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ 
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_MSE(W, H) \ + uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } + +void aom_highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + + ++src_ptr; + } + + // Next row... 
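+      // Skip the unread tail of the input row; output rows are packed
+      // back-to-back (stride == output_width).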
+ src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +void aom_highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t 
*second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + 
aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +HIGHBD_VARIANCES(128, 128) +HIGHBD_VARIANCES(128, 64) +HIGHBD_VARIANCES(64, 128) +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) + +// Realtime mode doesn't use 4x rectangular blocks. +#if !CONFIG_REALTIME_ONLY +HIGHBD_VARIANCES(4, 16) +HIGHBD_VARIANCES(16, 4) +HIGHBD_VARIANCES(8, 32) +HIGHBD_VARIANCES(32, 8) +HIGHBD_VARIANCES(16, 64) +HIGHBD_VARIANCES(64, 16) +#endif + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) + +void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +void aom_highbd_dist_wtd_comp_avg_pred_c( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint16_t)tmp; + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i, j; + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]); + } + comp_pred += width; + src0 += stride0; + src1 += stride1; + mask += mask_stride; + } +} + +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ + invert_mask); \ + return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ + } + +MASK_SUBPIX_VAR(4, 4) +MASK_SUBPIX_VAR(4, 8) +MASK_SUBPIX_VAR(8, 4) +MASK_SUBPIX_VAR(8, 8) +MASK_SUBPIX_VAR(8, 16) +MASK_SUBPIX_VAR(16, 8) +MASK_SUBPIX_VAR(16, 16) +MASK_SUBPIX_VAR(16, 32) +MASK_SUBPIX_VAR(32, 16) +MASK_SUBPIX_VAR(32, 32) +MASK_SUBPIX_VAR(32, 64) +MASK_SUBPIX_VAR(64, 32) +MASK_SUBPIX_VAR(64, 64) +MASK_SUBPIX_VAR(64, 128) +MASK_SUBPIX_VAR(128, 64) +MASK_SUBPIX_VAR(128, 128) + +// Realtime mode doesn't use 4x rectangular blocks. +#if !CONFIG_REALTIME_ONLY +MASK_SUBPIX_VAR(4, 16) +MASK_SUBPIX_VAR(16, 4) +MASK_SUBPIX_VAR(8, 32) +MASK_SUBPIX_VAR(32, 8) +MASK_SUBPIX_VAR(16, 64) +MASK_SUBPIX_VAR(64, 16) +#endif + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + if (!invert_mask) + comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]); + else + comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]); + } + comp_pred += width; + pred += width; + ref += ref_stride; + mask += mask_stride; + } +} + +#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } \ + \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t 
*msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } \ + \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } + +HIGHBD_MASK_SUBPIX_VAR(4, 4) +HIGHBD_MASK_SUBPIX_VAR(4, 8) +HIGHBD_MASK_SUBPIX_VAR(8, 4) +HIGHBD_MASK_SUBPIX_VAR(8, 8) +HIGHBD_MASK_SUBPIX_VAR(8, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 8) +HIGHBD_MASK_SUBPIX_VAR(16, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 16) +HIGHBD_MASK_SUBPIX_VAR(32, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 32) +HIGHBD_MASK_SUBPIX_VAR(64, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 128) +HIGHBD_MASK_SUBPIX_VAR(128, 64) +HIGHBD_MASK_SUBPIX_VAR(128, 128) +#if !CONFIG_REALTIME_ONLY +HIGHBD_MASK_SUBPIX_VAR(4, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 4) +HIGHBD_MASK_SUBPIX_VAR(8, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 8) +HIGHBD_MASK_SUBPIX_VAR(16, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 16) +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if !CONFIG_REALTIME_ONLY +static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +#define OBMC_VAR(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \ + W, bilinear_filters_2t[xoffset]); \ + 
aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ + } + +OBMC_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 4) + +OBMC_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 8) + +OBMC_VAR(8, 4) +OBMC_SUBPIX_VAR(8, 4) + +OBMC_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 8) + +OBMC_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 16) + +OBMC_VAR(16, 8) +OBMC_SUBPIX_VAR(16, 8) + +OBMC_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 16) + +OBMC_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 32) + +OBMC_VAR(32, 16) +OBMC_SUBPIX_VAR(32, 16) + +OBMC_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 32) + +OBMC_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 64) + +OBMC_VAR(64, 32) +OBMC_SUBPIX_VAR(64, 32) + +OBMC_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 64) + +OBMC_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 128) + +OBMC_VAR(128, 64) +OBMC_SUBPIX_VAR(128, 64) + +OBMC_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 128) + +OBMC_VAR(4, 16) +OBMC_SUBPIX_VAR(4, 16) +OBMC_VAR(16, 4) +OBMC_SUBPIX_VAR(16, 4) +OBMC_VAR(8, 32) +OBMC_SUBPIX_VAR(8, 32) +OBMC_VAR(32, 8) +OBMC_SUBPIX_VAR(32, 8) +OBMC_VAR(16, 64) +OBMC_SUBPIX_VAR(16, 64) +OBMC_VAR(64, 16) +OBMC_SUBPIX_VAR(64, 16) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + uint64_t *sse, int64_t *sum) { + int i, j; + uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HIGHBD_OBMC_VAR(W, H) \ + unsigned int aom_highbd_8_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } + +HIGHBD_OBMC_VAR(4, 4) +HIGHBD_OBMC_SUBPIX_VAR(4, 4) + +HIGHBD_OBMC_VAR(4, 8) +HIGHBD_OBMC_SUBPIX_VAR(4, 8) + +HIGHBD_OBMC_VAR(8, 4) +HIGHBD_OBMC_SUBPIX_VAR(8, 4) + +HIGHBD_OBMC_VAR(8, 8) +HIGHBD_OBMC_SUBPIX_VAR(8, 8) + +HIGHBD_OBMC_VAR(8, 16) +HIGHBD_OBMC_SUBPIX_VAR(8, 16) + +HIGHBD_OBMC_VAR(16, 8) +HIGHBD_OBMC_SUBPIX_VAR(16, 8) + +HIGHBD_OBMC_VAR(16, 16) +HIGHBD_OBMC_SUBPIX_VAR(16, 16) + +HIGHBD_OBMC_VAR(16, 32) +HIGHBD_OBMC_SUBPIX_VAR(16, 32) + +HIGHBD_OBMC_VAR(32, 16) +HIGHBD_OBMC_SUBPIX_VAR(32, 16) + +HIGHBD_OBMC_VAR(32, 32) +HIGHBD_OBMC_SUBPIX_VAR(32, 32) + +HIGHBD_OBMC_VAR(32, 64) +HIGHBD_OBMC_SUBPIX_VAR(32, 64) + +HIGHBD_OBMC_VAR(64, 32) +HIGHBD_OBMC_SUBPIX_VAR(64, 32) + +HIGHBD_OBMC_VAR(64, 64) +HIGHBD_OBMC_SUBPIX_VAR(64, 64) + +HIGHBD_OBMC_VAR(64, 128) +HIGHBD_OBMC_SUBPIX_VAR(64, 128) + +HIGHBD_OBMC_VAR(128, 64) +HIGHBD_OBMC_SUBPIX_VAR(128, 64) + +HIGHBD_OBMC_VAR(128, 128) +HIGHBD_OBMC_SUBPIX_VAR(128, 128) + +HIGHBD_OBMC_VAR(4, 16) +HIGHBD_OBMC_SUBPIX_VAR(4, 16) +HIGHBD_OBMC_VAR(16, 4) +HIGHBD_OBMC_SUBPIX_VAR(16, 4) +HIGHBD_OBMC_VAR(8, 32) +HIGHBD_OBMC_SUBPIX_VAR(8, 32) +HIGHBD_OBMC_VAR(32, 8) +HIGHBD_OBMC_SUBPIX_VAR(32, 8) +HIGHBD_OBMC_VAR(16, 64) +HIGHBD_OBMC_SUBPIX_VAR(16, 64) +HIGHBD_OBMC_VAR(64, 16) 
+HIGHBD_OBMC_SUBPIX_VAR(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // !CONFIG_REALTIME_ONLY + +uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int w, int h) { + uint64_t sum = 0; + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j++) { + int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j]; + sum += e * e; + } + } + return sum; +} + +uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w, + int h) { + uint16_t *src_temp = src; + uint8_t *dst_temp = dst; + const int num_blks = 16 / w; + int64_t sum = 0; + for (int i = 0; i < num_blks; i++) { + sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h); + dst_temp += w; + src_temp += (w * h); + } + return sum; +} + +uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src, + int sstride, int w, int h) { + uint64_t sum = 0; + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j++) { + int e = dst[i * dstride + j] - src[i * sstride + j]; + sum += e * e; + } + } + return sum; +} diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h new file mode 100644 index 0000000000..6603d312b8 --- /dev/null +++ b/third_party/aom/aom_dsp/variance.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_VARIANCE_H_ +#define AOM_AOM_DSP_VARIANCE_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 +#define FILTER_WEIGHT 128 + +typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); + +typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *second_pred); + +typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, + int b_stride, int n); + +typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *const b_array[], + int b_stride, unsigned int *sad_array); + +typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred); + +typedef unsigned int (*aom_dist_wtd_sad_avg_fn_t)( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef unsigned int (*aom_dist_wtd_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, + int invert_mask); +typedef unsigned int (*aom_masked_subpixvariance_fn_t)( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); + +typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, + const int32_t *wsrc, + const int32_t *msk); +typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred, + int pred_stride, + const int32_t *wsrc, + const int32_t *msk, + unsigned int *sse); +typedef unsigned int (*aom_obmc_subpixvariance_fn_t)( + const uint8_t *pred, int pred_stride, int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *msk, unsigned int *sse); + +typedef struct aom_variance_vtable { + aom_sad_fn_t sdf; + // Same as normal sad, but downsample the rows by a factor of 2. + aom_sad_fn_t sdsf; + aom_sad_avg_fn_t sdaf; + aom_variance_fn_t vf; + aom_subpixvariance_fn_t svf; + aom_subp_avg_variance_fn_t svaf; + aom_sad_multi_d_fn_t sdx4df; + aom_sad_multi_d_fn_t sdx3df; + // Same as sadx4, but downsample the rows by a factor of 2. 
+ aom_sad_multi_d_fn_t sdsx4df; + aom_masked_sad_fn_t msdf; + aom_masked_subpixvariance_fn_t msvf; + aom_obmc_sad_fn_t osdf; + aom_obmc_variance_fn_t ovf; + aom_obmc_subpixvariance_fn_t osvf; + aom_dist_wtd_sad_avg_fn_t jsdaf; + aom_dist_wtd_subp_avg_variance_fn_t jsvaf; +} aom_variance_fn_ptr_t; + +void aom_highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter); + +void aom_highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter); + +uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h); + +uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int w, int h); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_VARIANCE_H_ diff --git a/third_party/aom/aom_dsp/vmaf.c b/third_party/aom/aom_dsp/vmaf.c new file mode 100644 index 0000000000..a40e00cb23 --- /dev/null +++ b/third_party/aom/aom_dsp/vmaf.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/vmaf.h" + +#include +#include +#include +#include +#ifdef _WIN32 +#include +#else +#include +#endif + +#include "aom_dsp/blend.h" + +static void vmaf_fatal_error(const char *message) { + fprintf(stderr, "Fatal error: %s\n", message); + exit(EXIT_FAILURE); +} + +void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path) { + if (*vmaf_model != NULL) return; + VmafModelConfig model_cfg; + model_cfg.flags = VMAF_MODEL_FLAG_DISABLE_CLIP; + model_cfg.name = "vmaf"; + + if (vmaf_model_load_from_path(vmaf_model, &model_cfg, model_path)) { + vmaf_fatal_error("Failed to load VMAF model."); + } +} + +void aom_close_vmaf_model(VmafModel *vmaf_model) { + vmaf_model_destroy(vmaf_model); +} + +static void copy_picture(const int bit_depth, const YV12_BUFFER_CONFIG *src, + VmafPicture *dst) { + const int width = src->y_width; + const int height = src->y_height; + + if (bit_depth > 8) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src->y_buffer); + uint16_t *dst_ptr = dst->data[0]; + + for (int row = 0; row < height; ++row) { + memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0])); + src_ptr += src->y_stride; + dst_ptr += dst->stride[0] / 2; + } + } else { + uint8_t *src_ptr = src->y_buffer; + uint8_t *dst_ptr = (uint8_t *)dst->data[0]; + + for (int row = 0; row < height; ++row) { + memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0])); + src_ptr += src->y_stride; + dst_ptr += dst->stride[0]; + } + } +} + +void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model, + bool cal_vmaf_neg) { + // TODO(sdeng): make them CLI arguments. 
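+  // The fields below are set explicitly rather than left to libvmaf
+  // defaults: logging is silenced and the thread / subsample / cpumask
+  // options all stay at 0. A typical calling sequence for these helpers
+  // (an illustrative sketch only, not part of this change) would be:
+  //   VmafModel *model = NULL;
+  //   VmafContext *ctx = NULL;
+  //   aom_init_vmaf_model(&model, "path/to/model.json");   // example path
+  //   aom_init_vmaf_context(&ctx, model, /*cal_vmaf_neg=*/false);
+  //   ... aom_read_vmaf_image() per frame, then aom_flush_vmaf_context(ctx) ...
+  //   double score = aom_calc_vmaf_at_index(ctx, model, frame_index);
+  //   aom_close_vmaf_context(ctx);
+  //   aom_close_vmaf_model(model);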
+ VmafConfiguration cfg; + cfg.log_level = VMAF_LOG_LEVEL_NONE; + cfg.n_threads = 0; + cfg.n_subsample = 0; + cfg.cpumask = 0; + + if (vmaf_init(vmaf_context, cfg)) { + vmaf_fatal_error("Failed to init VMAF context."); + } + + if (cal_vmaf_neg) { + VmafFeatureDictionary *vif_feature = NULL; + if (vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit", + "1.0")) { + vmaf_fatal_error("Failed to set vif_enhn_gain_limit."); + } + if (vmaf_model_feature_overload(vmaf_model, "float_vif", vif_feature)) { + vmaf_fatal_error("Failed to use feature float_vif."); + } + + VmafFeatureDictionary *adm_feature = NULL; + if (vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit", + "1.0")) { + vmaf_fatal_error("Failed to set adm_enhn_gain_limit."); + } + if (vmaf_model_feature_overload(vmaf_model, "adm", adm_feature)) { + vmaf_fatal_error("Failed to use feature float_adm."); + } + } + + VmafFeatureDictionary *motion_force_zero = NULL; + if (vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero", + "1")) { + vmaf_fatal_error("Failed to set motion_force_zero."); + } + if (vmaf_model_feature_overload(vmaf_model, "float_motion", + motion_force_zero)) { + vmaf_fatal_error("Failed to use feature float_motion."); + } + + if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) { + vmaf_fatal_error("Failed to load feature extractors from VMAF model."); + } +} + +void aom_close_vmaf_context(VmafContext *vmaf_context) { + if (vmaf_close(vmaf_context)) { + vmaf_fatal_error("Failed to close VMAF context."); + } +} + +void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + bool cal_vmaf_neg, double *vmaf) { + VmafContext *vmaf_context; + aom_init_vmaf_context(&vmaf_context, vmaf_model, cal_vmaf_neg); + const int frame_index = 0; + VmafPicture ref, dist; + if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, + source->y_height) || + vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth, + source->y_width, source->y_height)) { + vmaf_fatal_error("Failed to alloc VMAF pictures."); + } + copy_picture(bit_depth, source, &ref); + copy_picture(bit_depth, distorted, &dist); + if (vmaf_read_pictures(vmaf_context, &ref, &dist, + /*picture index=*/frame_index)) { + vmaf_fatal_error("Failed to read VMAF pictures."); + } + + if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) { + vmaf_fatal_error("Failed to flush context."); + } + + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); + + vmaf_score_at_index(vmaf_context, vmaf_model, vmaf, frame_index); + aom_close_vmaf_context(vmaf_context); +} + +void aom_read_vmaf_image(VmafContext *vmaf_context, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + int frame_index) { + VmafPicture ref, dist; + if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, + source->y_height) || + vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth, + source->y_width, source->y_height)) { + vmaf_fatal_error("Failed to alloc VMAF pictures."); + } + copy_picture(bit_depth, source, &ref); + copy_picture(bit_depth, distorted, &dist); + if (vmaf_read_pictures(vmaf_context, &ref, &dist, + /*picture index=*/frame_index)) { + vmaf_fatal_error("Failed to read VMAF pictures."); + } + + vmaf_picture_unref(&ref); + vmaf_picture_unref(&dist); +} + +double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model, + int frame_index) { + double vmaf; + if (vmaf_score_at_index(vmaf_context, 
vmaf_model, &vmaf, frame_index)) { + vmaf_fatal_error("Failed to calc VMAF scores."); + } + return vmaf; +} + +void aom_flush_vmaf_context(VmafContext *vmaf_context) { + if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) { + vmaf_fatal_error("Failed to flush context."); + } +} diff --git a/third_party/aom/aom_dsp/vmaf.h b/third_party/aom/aom_dsp/vmaf.h new file mode 100644 index 0000000000..b539cf8b76 --- /dev/null +++ b/third_party/aom/aom_dsp/vmaf.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_VMAF_H_ +#define AOM_AOM_DSP_VMAF_H_ + +#include +#include + +#include "aom_scale/yv12config.h" + +void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model, + bool cal_vmaf_neg); +void aom_close_vmaf_context(VmafContext *vmaf_context); + +void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path); +void aom_close_vmaf_model(VmafModel *vmaf_model); + +void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + bool cal_vmaf_neg, double *vmaf); + +void aom_read_vmaf_image(VmafContext *vmaf_context, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + int frame_index); + +double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model, + int frame_index); + +void aom_flush_vmaf_context(VmafContext *vmaf_context); + +#endif // AOM_AOM_DSP_VMAF_H_ diff --git a/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c b/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c new file mode 100644 index 0000000000..b3dede75d5 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/quantize.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, + const int16_t *round_ptr, __m256i *round, + const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, + __m256i *dequant, + const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { + const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr)); + const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +} + +static INLINE void update_mask1_avx2(__m256i *cmp_mask, + const int16_t *iscan_ptr, int *is_found, + __m256i *mask) { + __m256i temp_mask = _mm256_setzero_si256(); + if (_mm256_movemask_epi8(*cmp_mask)) { + __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); + temp_mask = _mm256_and_si256(*cmp_mask, iscan); + *is_found = 1; + } + *mask = _mm256_max_epi16(temp_mask, *mask); +} + +static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold, + const int16_t *iscan_ptr, int *is_found, + __m256i *mask) { + __m256i zero = _mm256_setzero_si256(); + __m256i coeff[2], cmp_mask0, cmp_mask1; + coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero); + coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero); + coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS); + cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS); + cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); + cmp_mask0 = + _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); + update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, + const __m256i *quant, + const __m256i *shift) { + __m256i tmp, qcoeff; + qcoeff = _mm256_adds_epi16(*coeff, *round); + tmp = _mm256_mulhi_epi16(qcoeff, *quant); + qcoeff = _mm256_add_epi16(tmp, qcoeff); + *coeff = _mm256_mulhi_epi16(qcoeff, *shift); +} + +static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) { + return _mm256_mullo_epi16(qcoeff, dequant); +} + +static INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +} + +void aom_quantize_b_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t 
*quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff, qcoeff; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff = load_coefficients_avx2(coeff_ptr); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0); + __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + // Reinsert signs + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + // Mask out zbin threshold coeffs + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr); + } + + // AC only loop. 
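+  // Two running masks are maintained while quantizing: mask0 marks scan
+  // positions whose scaled magnitude (|coeff| << AOM_QM_BITS) exceeds the
+  // prescan threshold zbin * wt + ROUND_POWER_OF_TWO(dequant * EOB_FACTOR, 7)
+  // - 1, while mask1 marks positions that merely clear the zbin. After the
+  // loop, coefficients lying between the last mask0 hit and the last mask1
+  // hit in scan order are zeroed again, so the stricter threshold decides
+  // the effective end of block. A rough scalar equivalent of the
+  // per-coefficient prescan test (illustrative only):
+  //   const int prescan = ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * EOB_FACTOR, 7);
+  //   const int keeps_eob = (abs(coeff_ptr[rc]) << AOM_QM_BITS) >
+  //                         zbin_ptr[rc != 0] * wt + prescan - 1;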
+ while (index < n_coeffs) { + coeff = load_coefficients_avx2(coeff_ptr + index); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr + index); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff0 = qcoeff_ptr[rc]; + if (qcoeff0) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff0 = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff0); + const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c b/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c new file mode 100644 index 0000000000..503b9b4682 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/quantize.h" +#include "aom_dsp/x86/quantize_x86.h" + +void aom_quantize_b_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + 
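+    // Only the low lane of dequant holds the DC value; duplicate the high
+    // half so every lane carries the AC dequant for the second group of
+    // eight and for the AC-only loop below (the same DC-to-AC switch done
+    // for zbin above).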
dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t 
*round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 1; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. 
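+  // The zbin/round adjustment above is the vector form of the scalar
+  // ROUND_POWER_OF_TWO(x, log_scale) used for zbins[]: add log_scale (which
+  // equals half of 1 << log_scale for log_scale 1 or 2) and shift right by
+  // log_scale. The matching downscale of the dequantized output is
+  // presumably applied inside calculate_dqcoeff_and_store_log_scale(), as
+  // the helper's name suggests.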
+ coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. 
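+  // Each group of 16 AC coefficients repeats the pattern of the first
+  // group: absolute value, prescan/zbin mask updates, then an early out
+  // that simply stores zeros for the whole group when no coefficient clears
+  // the zbin, keeping the common all-zero high-frequency case cheap.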
+ while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const 
int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 2; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + 
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + 
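+      // The block's only surviving coefficient quantized to +/-1 and its
+      // weighted magnitude is still below zbin plus the enlarged prescan
+      // margin (EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST), so drop it and signal
+      // an all-zero block.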
qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c new file mode 100644 index 0000000000..b08ec2546b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" + +#if HAVE_SSE2 +filter8_1dfunction aom_filter_block1d16_v8_sse2; +filter8_1dfunction aom_filter_block1d16_h8_sse2; +filter8_1dfunction aom_filter_block1d8_v8_sse2; +filter8_1dfunction aom_filter_block1d8_h8_sse2; +filter8_1dfunction aom_filter_block1d4_v8_sse2; +filter8_1dfunction aom_filter_block1d4_h8_sse2; +filter8_1dfunction aom_filter_block1d16_v4_sse2; +filter8_1dfunction aom_filter_block1d16_h4_sse2; + +filter8_1dfunction aom_filter_block1d8_h4_sse2; +filter8_1dfunction aom_filter_block1d8_v4_sse2; +filter8_1dfunction aom_filter_block1d4_h4_sse2; +filter8_1dfunction aom_filter_block1d4_v4_sse2; + +filter8_1dfunction aom_filter_block1d16_v2_sse2; +filter8_1dfunction aom_filter_block1d16_h2_sse2; +filter8_1dfunction aom_filter_block1d8_v2_sse2; +filter8_1dfunction aom_filter_block1d8_h2_sse2; +filter8_1dfunction aom_filter_block1d4_v2_sse2; +filter8_1dfunction aom_filter_block1d4_h2_sse2; + +// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) + +#if CONFIG_AV1_HIGHBITDEPTH +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; + +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2; + +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction 
aom_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; + +// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) +#endif +#endif // HAVE_SSE2 diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c b/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c new file mode 100644 index 0000000000..a1043828fe --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32)); + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]); +} + +void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m256i s[2]; + s[0] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + s[1] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, s[0]); + dst += 
dst_stride; + _mm256_storeu_si256((__m256i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + src += src_stride; + s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + src += src_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH + +static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); +} + +static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) { + __m256i s[8]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); + s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16)); + s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16)); + s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16)); + s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16)); + + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); + _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]); + _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]); + _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]); + _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]); +} + +void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, int w, + int h) { + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + 
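+    // 16 high-bitdepth pixels are 32 bytes, i.e. exactly one 256-bit
+    // register per row; two rows are copied per iteration.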
do { + __m256i s[2]; + s[0] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + s[1] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, s[0]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + src += src_stride; + s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + highbd_copy_64(src, dst); + src += src_stride; + dst += dst_stride; + highbd_copy_64(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else { + assert(w == 128); + do { + highbd_copy_128(src, dst); + src += src_stride; + dst += dst_stride; + highbd_copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} + +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c new file mode 100644 index 0000000000..e78845e97c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" + +static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16)); + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]); +} + +void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m128i s[4]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + src += src_stride; + s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + src += src_stride; + s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 16), 
s[3]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]); + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} + +static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); +} + +static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) { + __m128i s[16]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); + s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8)); + s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8)); + s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8)); + s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8)); + s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8)); + s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8)); + s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8)); + s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8)); + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); + _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]); + _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]); + _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]); + _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]); + _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]); + _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]); + _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]); + _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]); +} + +void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, int w, + int h) { + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + __m128i s = _mm_loadl_epi64((__m128i *)src); + *(int *)dst = _mm_cvtsi128_si32(s); + src += src_stride; + dst += dst_stride; + s = _mm_loadl_epi64((__m128i 
*)src); + *(int *)dst = _mm_cvtsi128_si32(s); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[4]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + src += src_stride; + s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + src += src_stride; + s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + highbd_copy_64(src, dst); + src += src_stride; + dst += dst_stride; + highbd_copy_64(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + highbd_copy_128(src, dst); + src += src_stride; + dst += dst_stride; + highbd_copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm new file mode 100644 index 0000000000..d392225906 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -0,0 +1,613 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
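All of the copy kernels above (SSE2 and AVX2, 8-bit and high bit depth) implement the same operation: a plain two-dimensional block copy with no filtering, specialised per width and unrolled two rows per iteration (h is assumed even, and dst must be 16-byte aligned for w >= 16, as the asserts state). As a point of reference, a minimal scalar sketch of that operation is shown below; the helper name is illustrative and this is not the upstream C fallback.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Illustrative scalar equivalent of the copy kernels: copy w samples per row,
// for h rows, honouring independent source and destination strides.
static void convolve_copy_ref(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride, int w,
                              int h) {
  for (int r = 0; r < h; ++r) {
    memcpy(dst, src, (size_t)w * sizeof(*src));
    src += src_stride;
    dst += dst_stride;
  }
}

The high-bit-depth variants are identical except that the element type is uint16_t, which is why their SIMD versions move half as many pixels per register.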
+; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro HIGH_GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm6 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + punpcklwd xmm1, xmm7 + + movdqa k0k6, xmm0 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + movdqa k1k7, xmm1 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) + +%endm + +%macro HIGH_APPLY_FILTER_4 1 + punpcklwd xmm0, xmm6 ;two row in one register + punpcklwd xmm1, xmm7 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + + pmaddwd xmm0, k0k6 ;multiply the filter factors + pmaddwd xmm1, k1k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm3, k3k4 + + paddd xmm0, xmm1 ;sum + paddd xmm0, xmm2 + paddd xmm0, xmm3 + + paddd xmm0, krd ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movq [rdi], xmm0 +%endm + +%macro HIGH_GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + punpcklwd xmm0, xmm1 + punpckhwd xmm6, xmm7 + punpckhwd xmm2, xmm5 + punpckhwd xmm3, xmm4 + + movdqa k0k1, xmm0 ;store filter factors on stack + movdqa k6k7, xmm6 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) +%endm + +%macro LOAD_VERT_8 1 + movdqu xmm0, [rsi + %1] ;0 + movdqu xmm1, [rsi + rax + %1] ;1 + movdqu xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movdqu xmm7, [rsi + rdx * 2 + %1] ;7 + movdqu xmm2, [rsi + rax + %1] ;2 + movdqu xmm3, [rsi + rax * 2 + %1] ;3 + movdqu xmm4, [rsi + rdx + %1] ;4 + movdqu xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro HIGH_APPLY_FILTER_8 2 + movdqu temp, xmm4 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm1, xmm6 + punpcklwd xmm6, xmm7 + punpckhwd xmm1, xmm7 + movdqa xmm7, xmm2 + punpcklwd xmm2, xmm5 + punpckhwd xmm7, xmm5 + + movdqu xmm5, temp + movdqu temp, xmm4 + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 + movdqu xmm5, temp + + pmaddwd xmm0, k0k1 + pmaddwd xmm5, 
k0k1 + pmaddwd xmm6, k6k7 + pmaddwd xmm1, k6k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm7, k2k5 + pmaddwd xmm3, k3k4 + pmaddwd xmm4, k3k4 + + paddd xmm0, xmm6 + paddd xmm0, xmm2 + paddd xmm0, xmm3 + paddd xmm5, xmm1 + paddd xmm5, xmm7 + paddd xmm5, xmm4 + + paddd xmm0, krd ;rounding + paddd xmm5, krd + psrad xmm0, 7 ;shift + psrad xmm5, 7 + packssdw xmm0, xmm5 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi + %2] + pavgw xmm0, xmm1 +%endif + movdqu [rdi + %2], xmm0 +%endm + +SECTION .text + +;void aom_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_highbd_filter_block1d4_v8_sse2) +sym(aom_highbd_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_highbd_filter_block1d8_v8_sse2) +sym(aom_highbd_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_highbd_filter_block1d16_v8_sse2) +sym(aom_highbd_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + 
ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 0, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_highbd_filter_block1d4_h8_sse2) +sym(aom_highbd_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_highbd_filter_block1d8_h8_sse2) +sym(aom_highbd_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + 
UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_highbd_filter_block1d16_h8_sse2) +sym(aom_highbd_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 0, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm new file mode 100644 index 0000000000..db4cad9bcb --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -0,0 +1,367 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
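Stripping away the register scheduling, each output sample of the 8-tap high-bit-depth kernels above is the usual subpel convolution: the eight products are accumulated at 32-bit precision (the taps are paired for pmaddwd, with the k3/k4 pair added last to avoid overflow, per the note at the top of the file), the rounding constant krd = 64 is added, the sum is shifted right by 7, and the result is clamped to the valid range for the bit depth. A hedged scalar sketch, with a hypothetical helper name:

#include <stdint.h>

// Illustrative scalar form of one output sample. src points at the first of
// the eight samples covered by the filter window.
static uint16_t highbd_filter8_ref(const uint16_t *src, const int16_t *filter,
                                   int bd) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += (int32_t)filter[k] * src[k];
  sum = (sum + 64) >> 7;              // krd rounding constant, then the 7-bit shift
  const int32_t max = (1 << bd) - 1;  // same clamp as the max/min registers
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return (uint16_t)sum;
}

The averaging variants (the %if %1 paths in the macros) additionally pavgw this result with the sample already stored at the destination.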
+; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro HIGH_GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklwd xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm5, rdx + movq xmm2, rcx + pshufd xmm5, xmm5, 0b + movdqa xmm1, xmm5 + psllw xmm5, xmm2 + psubw xmm5, xmm1 ;max value (for clamping) + pxor xmm2, xmm2 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_4 1 + + punpcklwd xmm0, xmm1 ;two row in one register + pmaddwd xmm0, xmm4 ;multiply the filter factors + + paddd xmm0, xmm3 ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, xmm5 + pmaxsw xmm0, xmm2 + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + + movq [rdi], xmm0 + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm6, [rdx] ;load filters + + pshuflw xmm7, xmm6, 11111111b ;k3 + pshufhw xmm6, xmm6, 0b ;k4 + psrldq xmm6, 8 + punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm3, rdx + movq xmm5, rcx + pshufd xmm3, xmm3, 0b + movdqa xmm1, xmm3 + psllw xmm3, xmm5 + psubw xmm3, xmm1 ;max value (for clamping) + pxor xmm5, xmm5 ;min value (for clamping) + + movdqa max, xmm3 + movdqa min, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_8 1 + movdqa xmm6, xmm0 + punpckhwd xmm6, xmm1 + punpcklwd xmm0, xmm1 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + + paddd xmm6, xmm4 ;rounding + paddd xmm0, xmm4 ;rounding + psrad xmm6, 7 ;shift + psrad xmm0, 7 ;shift + packssdw xmm0, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_APPLY_FILTER_16 1 + movdqa xmm5, xmm0 + movdqa xmm6, xmm2 + punpckhwd xmm5, xmm1 + punpckhwd xmm6, xmm3 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + pmaddwd xmm5, xmm7 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + pmaddwd xmm2, xmm7 + + paddd xmm5, xmm4 ;rounding + paddd xmm6, xmm4 + paddd xmm0, xmm4 + paddd xmm2, xmm4 + + psrad xmm5, 7 ;shift + psrad xmm6, 7 + psrad xmm0, 7 + psrad xmm2, 7 + + packssdw xmm0, xmm5 ;pack back to word + packssdw xmm2, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + pminsw xmm2, max + pmaxsw xmm2, min + +%if %1 + movdqu xmm1, [rdi] + movdqu xmm3, [rdi + 16] + pavgw xmm0, xmm1 + pavgw xmm2, xmm3 +%endif + movdqu [rdi], xmm0 ;store the result + movdqu [rdi + 16], xmm2 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +SECTION .text + +globalsym(aom_highbd_filter_block1d4_v2_sse2) +sym(aom_highbd_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + 
HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_highbd_filter_block1d8_v2_sse2) +sym(aom_highbd_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_highbd_filter_block1d16_v2_sse2) +sym(aom_highbd_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm2, [rsi + 16] + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_highbd_filter_block1d4_h2_sse2) +sym(aom_highbd_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_highbd_filter_block1d8_h2_sse2) +sym(aom_highbd_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_highbd_filter_block1d16_h2_sse2) +sym(aom_highbd_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_quantize_avx.c b/third_party/aom/aom_dsp/x86/aom_quantize_avx.c new file mode 100644 index 0000000000..b2d6d4b76d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_quantize_avx.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
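The bilinear (2-tap) kernels above follow the same pattern using only the k3/k4 taps: two horizontally adjacent samples, or two vertically adjacent rows, are combined, rounded by 64, shifted by 7, and clamped to the bit-depth range. A minimal scalar sketch (illustrative only):

#include <stdint.h>

// Illustrative scalar form of the 2-tap path; a and b are the two source
// samples selected by HIGH_APPLY_FILTER_* (adjacent pixels or adjacent rows).
static uint16_t highbd_bilinear_ref(uint16_t a, uint16_t b, int16_t k3,
                                    int16_t k4, int bd) {
  int32_t sum = (int32_t)a * k3 + (int32_t)b * k4;
  sum = (sum + 64) >> 7;
  const int32_t max = (1 << bd) - 1;
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return (uint16_t)sum;
}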
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/bitdepth_conversion_sse2.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + tran_low_t *dqcoeff) { + const __m128i low = _mm_mullo_epi16(qcoeff, dequant); + const __m128i high = _mm_mulhi_epi16(qcoeff, dequant); + + const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + +void aom_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan; + + *eob_ptr = 0; + + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); + + if (n_coeffs == 16) return; + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. 
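calculate_dqcoeff_and_store() above reconstructs a full 32-bit product from 16-bit SSE multiplies: _mm_mullo_epi16 and _mm_mulhi_epi16 supply the low and high halves of qcoeff * dequant, and the unpacks interleave them into 32-bit lanes before the store. Its scalar meaning is simply the following (hypothetical helper name); the AC-only loop that follows then repeats the whole quantize/dequantize sequence for each remaining group of 16 coefficients.

#include <stdint.h>

// Illustrative scalar equivalent of calculate_dqcoeff_and_store() for one
// group of coefficients: widen the quantized value and multiply by the step.
static void dequant_store_ref(const int16_t *qcoeff, int16_t dequant,
                              int32_t *dqcoeff, int n) {
  for (int i = 0; i < n; ++i) {
    // mullo gives the low 16 bits, mulhi the signed high 16 bits; together
    // they are exactly this signed 32-bit product.
    dqcoeff[i] = (int32_t)qcoeff[i] * dequant;
  }
}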
+ for (index = 16; index < n_coeffs; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} + +void aom_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + const int log_scale = 1; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan; + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. 
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. + for (index = 16; index < n_coeffs; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); + continue; + } + + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c new file mode 100644 index 0000000000..22f2e696d3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -0,0 +1,1441 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
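Both quantizers above vectorize the standard libaom quantize_b recipe, with log_scale = 0 for aom_quantize_b_avx and log_scale = 1 for the 32x32 variant (hence the "shift with rounding" of zbin and round, and the extra subtraction of 1 so a strict greater-than can stand in for greater-or-equal). A hedged scalar sketch of the per-coefficient math follows; names, clamping details and the eob bookkeeping are illustrative rather than the upstream C reference.

#include <stdint.h>
#include <stdlib.h>

// Illustrative per-coefficient quantize/dequantize, following the structure of
// the AVX kernels above (index 0 of each parameter table is DC, index 1 is AC).
static void quantize_b_ref(const int16_t *coeff, int n, const int16_t *zbin,
                           const int16_t *round, const int16_t *quant,
                           const int16_t *quant_shift, const int16_t *dequant,
                           int log_scale, int16_t *qcoeff, int32_t *dqcoeff) {
  for (int i = 0; i < n; ++i) {
    const int ac = (i != 0);
    const int abs_c = abs(coeff[i]);
    // "Shift with rounding": halve zbin/round for the 32x32 (log_scale == 1) case.
    const int zbin_i = (zbin[ac] + ((1 << log_scale) >> 1)) >> log_scale;
    const int rnd = (round[ac] + ((1 << log_scale) >> 1)) >> log_scale;
    if (abs_c < zbin_i) {  // dead zone: the cmpgt mask against zbin in the SIMD code
      qcoeff[i] = 0;
      dqcoeff[i] = 0;
      continue;
    }
    int tmp = abs_c + rnd;
    tmp += (tmp * quant[ac]) >> 16;  // first multiply stage (mulhi + add)
    const int q = (tmp * quant_shift[ac]) >> (16 - log_scale);
    qcoeff[i] = (int16_t)(coeff[i] < 0 ? -q : q);  // reinsert the sign
    const int abs_dq = (q * dequant[ac]) >> log_scale;
    dqcoeff[i] = coeff[i] < 0 ? -abs_dq : abs_dq;
  }
  // eob (not shown) is 1 + the highest scan position holding a nonzero qcoeff,
  // which is what scan_for_eob()/accumulate_eob() compute from iscan.
}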
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_ports/mem.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + *((int *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); + *((int *)(output_ptr + stride)) = + _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_storel_epi64((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_store2_mi128(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_store_si128((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static void aom_filter_block1d4_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg; + __m256i firstFilters, secondFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 32 bits + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d16_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
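+  // This is the 4-tap variant: only the middle coefficients (taps 2..5,
+  // broadcast below as secondFilters and thirdFilters) contribute, so the
+  // outer taps of the 8-tap filter are assumed to be zero.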
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
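+  // (for the single remaining row, one 32-byte load is permuted below so
+  // that bytes 8..23 occupy the upper 128-bit lane; a single 256-bit
+  // shuffle/madd pass then produces all 16 output pixels.)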
+ // process only 16 bytes + if (i > 0) { + __m256i srcReg1, srcReg12; + __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; + + srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); + srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); + + // filter the source buffer + srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); + srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); + srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); + srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt1_1)); + } +} + +static void aom_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
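+  // Each filt*Reg mask loaded below interleaves adjacent source bytes so
+  // that one _mm256_maddubs_epi16 per tap pair produces the two-tap partial
+  // sums for eight output pixels in each 128-bit lane.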
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 
= _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = + _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + srcRegFilt2_1 
= + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i resReg23_34_lo, resReg45_56_lo; + __m256i resReglo, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += 
src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + +static void aom_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = 
_mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b11 = srcReg32b2; + srcReg32b2 = srcReg32b4; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1); + } +} + +static void aom_filter_block1d16_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi; + __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi; + __m256i resReglo, resReghi, resReg; + __m256i secondFilters, thirdFilters; + unsigned 
int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters); + resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters); + + // add and saturate the results together + resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm256_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + resReghi = _mm256_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReghi); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg23_34_hi = srcReg45_56_hi; + srcReg4x = srcReg6x; + } +} + +static void aom_filter_block1d16_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t 
out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and 
add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = + _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + 
_mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + } +} + +static void aom_filter_block1d4_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i srcReg2345_3456_lo; + __m256i resReglo, resReg; + __m256i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + + resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 
6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve
+    // result
+    resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+    src_ptr += src_stride;
+
+    xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for next strides
+    srcReg23_34_lo = srcReg45_56_lo;
+    srcReg4x = srcReg6x;
+  }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
+#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
+#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
+#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
+#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
+#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
+#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
+// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
+
+#endif  // HAVE_AVX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
new file mode 100644
index 0000000000..5c36b68727
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
@@ -0,0 +1,569 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_ports/mem.h"
+
+void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
+                                  ptrdiff_t src_pixels_per_line,
+                                  uint8_t *output_ptr, ptrdiff_t output_pitch,
+                                  uint32_t output_height,
+                                  const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i addFilterReg32;
+  __m128i secondFilters, thirdFilters;
+  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1,
+      srcRegFilt32b2_2;
+  __m128i srcReg32b1, srcReg32b2;
+  unsigned int i;
+  src_ptr -= 3;
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+  // coeffs 0 1 0 1 2 3 2 3
+  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+  // coeffs 4 5 4 5 6 7 6 7
+  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
+  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
+
+  for (i = output_height; i > 0; i -= 1) {
+    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+    __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+    __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+    __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+    __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
+    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+    __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3);
+    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5);
+    __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
+    __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+    d1 = _mm_madd_epi16(ss_1_2, secondFilters);
+    d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
+    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+
+    __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+    __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
+
+    // reading stride of the next 16 bytes
+    // (part of it was being read by earlier read)
+    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+    ss_2 = _mm_srli_si128(srcReg32b2, 2);
+    ss_4 = _mm_srli_si128(srcReg32b2, 4);
+    ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+    ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+    d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+    d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
+    srcRegFilt32b2_1 = _mm_add_epi32(d1, d2);
+
+    ss_1 = _mm_srli_si128(srcReg32b2, 3);
+    ss_3 = _mm_srli_si128(srcReg32b2, 5);
+    ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
+    ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+    d1 = _mm_madd_epi16(ss_1_2, secondFilters);
+    d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
+    srcRegFilt32b2_2 = _mm_add_epi32(d1, d2);
+
+    res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
+    res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
+    srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi);
+
+    // shift by 6 bit each 16 bit
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink to 8 bit each 16 bits, the first lane contain the first
+    // convolve result and the second lane contain the second convolve result
+
srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_pixels_per_line; + + _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128()); + __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128()); + __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = 
_mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters); + resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters); + resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128()); + __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters); + resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128()); + __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters); + resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); + resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); + resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg23_hi_1 = resReg45_hi_1; + resReg23_hi_2 = resReg45_hi_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + resReg34_hi_1 = resReg56_hi_1; + resReg34_hi_2 = resReg56_hi_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i 
srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + __m128i d1 = _mm_madd_epi16(ss_2, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg34_lo; + __m128i srcReg45_lo, srcReg56_lo; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_45_lo, resReg34_56_lo; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, 
_mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128()); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128()); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + 
secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + + __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3); + __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5); + + __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23, srcReg34, srcReg45, srcReg56; + __m128i resReg23_34, resReg45_56; + __m128i resReg23_34_45_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128()); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + tmp_0 = _mm_madd_epi16(resReg23, secondFilters); + tmp_1 = 
_mm_madd_epi16(resReg34, secondFilters); + resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128()); + __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128()); + + tmp_0 = _mm_madd_epi16(resReg45, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56, thirdFilters); + resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56); + + // shift by 6 bit each 16 bit + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32); + resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_34_45_56 = + _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128()); + + src_ptr += src_stride; + + *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56); + *((int *)(output_ptr + out_pitch)) = + _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23 = resReg45; + resReg34 = resReg56; + srcReg4 = srcReg6; + } +} diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c new file mode 100644 index 0000000000..245fda1e94 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,847 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_ssse3.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + +static void aom_filter_block1d4_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + filt1Reg = _mm_load_si128((__m128i const *)(filtd4)); + + for (i = output_height; i > 0; i -= 1) { + // load the 2 strides of source + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d4_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, + srcReg6, srcReg56; + __m128i srcReg23_34_lo, srcReg45_56_lo; + __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; + __m128i resReglo, resReghi; + __m128i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads on the same 256 register + srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); + + srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); + + // merge every two consecutive registers + srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); + + resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); + resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + resReglo = _mm_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm_srai_epi16(resReglo, 6); + resReghi = _mm_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReglo = _mm_packus_epi16(resReglo, resReglo); + resReghi = _mm_packus_epi16(resReghi, resReghi); + + src_ptr += src_stride; + + *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); + *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4 = srcReg6; + } +} + +static void aom_filter_block1d8_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt2Reg, filt3Reg; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); + filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d8_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23, srcReg34, srcReg45, srcReg56; + __m128i resReg23, resReg34, resReg45, resReg56; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads on the same 256 register + srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters); + resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters); + resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters); + resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters); + + // add and saturate the results together + resReg23_45 = _mm_adds_epi16(resReg23, resReg45); + resReg34_56 = _mm_adds_epi16(resReg34, resReg56); + + // shift by 6 bit each 16 bit + resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32); + resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32); + resReg23_45 = _mm_srai_epi16(resReg23_45, 6); + resReg34_56 = _mm_srai_epi16(resReg34_56, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128()); + resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128()); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23 = srcReg45; + srcReg34 = srcReg56; + srcReg4 = srcReg6; + } +} + +static void aom_filter_block1d16_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt2Reg, filt3Reg; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); + filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading stride of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + // filter the source buffer + srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_pixels_per_line; + + _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d16_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads on the same 256 register + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters); + resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters); + resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters); + resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters); + + // add and saturate the results together + resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); + resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); + resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg34_lo = srcReg56_lo; + srcReg23_hi = 
srcReg45_hi; + srcReg34_hi = srcReg56_hi; + srcReg4 = srcReg6; + } +} + +static INLINE __m128i shuffle_filter_convolve8_8_ssse3( + const __m128i *const s, const int16_t *const filter) { + __m128i f[4]; + shuffle_filter_ssse3(filter, f); + return convolve8_8_ssse3(s, f); +} + +static void filter_horiz_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const x_filter) { + __m128i s[8], ss[4], temp; + + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, ss); + temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)dst, temp); +} + +static void transpose8x8_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[8]; + + load_8bit_8x8(src, src_stride, s); + transpose_8bit_8x8(s, s); + store_8bit_8x8(s, dst, dst_stride); +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 8) { + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + transpose8x8_to_dst(temp, 8, dst + x, dst_stride); + } + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void filter_horiz_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter) { + __m128i s[4]; + __m128i temp; + + load_8bit_8x4(src, src_stride, s); + transpose_16bit_4x4(s, s); + + temp = shuffle_filter_convolve8_8_ssse3(s, filter); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void transpose4x4_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[4]; + + load_8bit_4x4(src, src_stride, s); + s[0] = transpose_8bit_4x4(s); + s[1] = _mm_srli_si128(s[0], 4); + s[2] = _mm_srli_si128(s[0], 8); + s[3] = _mm_srli_si128(s[0], 12); + store_8bit_4x4(s, dst, dst_stride); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int 
x_q4 = x0_q4; + for (x = 0; x < w; x += 4) { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + transpose4x4_to_dst(temp, 4, dst + x, dst_stride); + } + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static __m128i filter_vert_kernel(const __m128i *const s, + const int16_t *const filter) { + __m128i ss[4]; + __m128i temp; + + // 00 10 01 11 02 12 03 13 + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + // 20 30 21 31 22 32 23 33 + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + // 40 50 41 51 42 52 43 53 + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + // 60 70 61 71 62 72 63 73 + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + + temp = shuffle_filter_convolve8_8_ssse3(ss, filter); + // shrink to 8 bit each 16 bits + return _mm_packus_epi16(temp, temp); +} + +static void filter_vert_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8]; + __m128i temp; + + load_8bit_4x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + + y_q4 += y_step_q4; + } +} + +static void filter_vert_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8], temp; + + load_8bit_8x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)dst, temp); +} + +static void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +static void filter_vert_w16_ssse3(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter, const int w) { + int i; + __m128i f[4]; + shuffle_filter_ssse3(filter, f); + + for (i = 0; i < w; i += 16) { + __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; + + loadu_8bit_16x8(src, src_stride, s); + + // merge 
the result together + s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); + s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); + s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); + s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); + s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); + s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); + s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); + s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); + temp_lo = convolve8_8_ssse3(s_lo, f); + temp_hi = convolve8_8_ssse3(s_hi, f); + + // shrink to 8 bit each 16 bits, the first lane contain the first convolve + // result and the second lane contain the second convolve result + temp_hi = _mm_packus_epi16(temp_lo, temp_hi); + src += 16; + // save 16 bytes convolve result + _mm_store_si128((__m128i *)&dst[i], temp_hi); + } +} + +static void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
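+ // temp uses a fixed row stride of 64 bytes: the horizontal pass below
+ // writes the intermediate rows into it and the vertical pass reads them
+ // back with the same 64-byte stride.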
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} + +filter8_1dfunction aom_filter_block1d16_v8_ssse3; +filter8_1dfunction aom_filter_block1d16_h8_ssse3; +filter8_1dfunction aom_filter_block1d8_v8_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_ssse3; +filter8_1dfunction aom_filter_block1d4_v8_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_ssse3; + +filter8_1dfunction aom_filter_block1d16_v2_ssse3; +filter8_1dfunction aom_filter_block1d16_h2_ssse3; +filter8_1dfunction aom_filter_block1d8_v2_ssse3; +filter8_1dfunction aom_filter_block1d8_h2_ssse3; +filter8_1dfunction aom_filter_block1d4_v2_ssse3; +filter8_1dfunction aom_filter_block1d4_h2_ssse3; + +// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3) +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3) diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm new file mode 100644 index 0000000000..640c5b2416 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm @@ -0,0 +1,615 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. 
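+;(In the APPLY_FILTER_4/APPLY_FILTER_8 macros below, the products that
+;involve taps 3 and 4 are therefore the last terms folded into the
+;saturating sums.)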
+ +%macro GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + punpcklqdq xmm5, xmm4 + punpcklqdq xmm6, xmm7 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm2 + movdqa k5k4, xmm5 + movdqa k6k7, xmm6 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro APPLY_FILTER_4 1 + punpckldq xmm0, xmm1 ;two row in one register + punpckldq xmm6, xmm7 + punpckldq xmm2, xmm3 + punpckldq xmm5, xmm4 + + punpcklbw xmm0, zero ;unpack to word + punpcklbw xmm6, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + + pmullw xmm0, k0k1 ;multiply the filter factors + pmullw xmm6, k6k7 + pmullw xmm2, k2k3 + pmullw xmm5, k5k4 + + paddsw xmm0, xmm6 ;sum + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + paddsw xmm0, xmm2 + psrldq xmm2, 8 + paddsw xmm0, xmm5 + psrldq xmm5, 8 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 +%endm + +%macro GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm0 + punpcklwd xmm1, xmm1 + punpcklwd xmm2, xmm2 + punpcklwd xmm3, xmm3 + punpckhwd xmm4, xmm4 + punpckhwd xmm5, xmm5 + punpckhwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movdqa k0, xmm0 ;store filter factors on stack + movdqa k1, xmm1 + movdqa k2, xmm2 + movdqa k3, xmm3 + movdqa k4, xmm4 + movdqa k5, xmm5 + movdqa k6, xmm6 + movdqa k7, xmm7 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro LOAD_VERT_8 1 + movq xmm0, [rsi + %1] ;0 + movq xmm1, [rsi + rax + %1] ;1 + movq xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2 + %1] ;7 + movq xmm2, [rsi + rax + %1] ;2 + movq xmm3, [rsi + rax * 2 + %1] ;3 + movq xmm4, [rsi + rdx + %1] ;4 + movq xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro APPLY_FILTER_8 2 + punpcklbw xmm0, zero + punpcklbw xmm1, zero + punpcklbw xmm6, zero + punpcklbw xmm7, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + punpcklbw xmm3, zero + punpcklbw xmm4, zero + + pmullw xmm0, k0 + pmullw xmm1, k1 + pmullw xmm6, k6 + pmullw xmm7, k7 + pmullw xmm2, k2 + pmullw xmm5, k5 + pmullw xmm3, k3 + pmullw xmm4, k4 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm6 + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm3 + paddsw xmm0, xmm4 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi + %2] + pavgb xmm0, xmm1 +%endif + movq [rdi + %2], xmm0 +%endm + +SECTION .text + +;void aom_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int 
output_height, +; short *filter +;) +globalsym(aom_filter_block1d4_v8_sse2) +sym(aom_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_filter_block1d8_v8_sse2) +sym(aom_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_filter_block1d16_v8_sse2) +sym(aom_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 0, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; 
unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_filter_block1d4_h8_sse2) +sym(aom_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_filter_block1d8_h8_sse2) +sym(aom_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +globalsym(aom_filter_block1d16_h8_sse2) +sym(aom_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + 
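+    ;the copies below are byte-shifted by 1..7 so that every tap of the 8-tap
+    ;filter sees its correctly aligned source bytes before APPLY_FILTER_8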
movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm new file mode 100644 index 0000000000..e5fafb0302 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm @@ -0,0 +1,870 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_64: times 8 dw 64 +even_byte_mask: times 8 dw 0x00ff + +; %define USE_PMULHRSW +; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss +; when using this instruction. +; +; The add order below (based on ffav1) must be followed to prevent outranges. +; x = k0k1 + k4k5 +; y = k2k3 + k6k7 +; z = signed SAT(x + y) + +SECTION .text +%define LOCAL_VARS_SIZE 16*6 + +%macro SETUP_LOCAL_VARS 0 + ; TODO(slavarnway): using xmm registers for these on AOM_ARCH_X86_64 + + ; pmaddubsw has a higher latency on some platforms, this might be eased by + ; interleaving the instructions. + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + packsswb m4, m4 + ; TODO(slavarnway): multiple pshufb instructions had a higher latency on + ; some platforms. 
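+  ; each k0k1..k6k7 slot below holds one pair of byte taps broadcast across
+  ; the register, ready for pmaddubsw against interleaved source bytes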
+ pshuflw m0, m4, 0b ;k0_k1 + pshuflw m1, m4, 01010101b ;k2_k3 + pshuflw m2, m4, 10101010b ;k4_k5 + pshuflw m3, m4, 11111111b ;k6_k7 + punpcklqdq m0, m0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + mova k0k1, m0 + mova k2k3, m1 + mova k4k5, m2 + mova k6k7, m3 +%if AOM_ARCH_X86_64 + %define krd m12 + %define tmp0 [rsp + 16*4] + %define tmp1 [rsp + 16*5] + mova krd, [GLOBAL(pw_64)] +%else + %define krd [rsp + 16*4] +%if CONFIG_PIC=0 + mova m6, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m6, m6 ;all ones + psrlw m6, 15 + psllw m6, 6 ;aka pw_64 +%endif + mova krd, m6 +%endif +%endm + +;------------------------------------------------------------------------------- +%if AOM_ARCH_X86_64 + %define LOCAL_VARS_SIZE_H4 0 +%else + %define LOCAL_VARS_SIZE_H4 16*4 +%endif + +%macro SUBPIX_HFILTER4 1 +cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + packsswb m4, m4 +%if AOM_ARCH_X86_64 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + mova krd, [GLOBAL(pw_64)] + pshuflw k0k1k4k5, m4, 0b ;k0_k1 + pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 + pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 + pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 +%else + %define k0k1k4k5 [rsp + 16*0] + %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + pshuflw m6, m4, 0b ;k0_k1 + pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 + pshuflw m7, m4, 01010101b ;k2_k3 + pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 +%if CONFIG_PIC=0 + mova m1, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m1, m1 ;all ones + psrlw m1, 15 + psllw m1, 6 ;aka pw_64 +%endif + mova k0k1k4k5, m6 + mova k2k3k6k7, m7 + mova krd, m1 +%endif + dec heightd + +.loop: + ;Do two rows at once + movu m4, [srcq - 3] + movu m5, [srcq + sstrideq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + punpckhbw m3, m5, m5 + punpcklbw m5, m5 + palignr m0, m1, m4, 1 + pmaddubsw m0, k0k1k4k5 + palignr m1, m4, 5 + pmaddubsw m1, k2k3k6k7 + palignr m2, m3, m5, 1 + pmaddubsw m2, k0k1k4k5 + palignr m3, m5, 5 + pmaddubsw m3, k2k3k6k7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m1, m3 + paddsw m0, m4 + paddsw m1, m5 +%ifidn %1, h8_avg + movd m4, [dstq] + movd m5, [dstq + dstrideq] +%endif + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2 + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 + psrldq m1, m0, 4 + +%ifidn %1, h8_avg + pavgb m0, m4 + pavgb m1, m5 +%endif + movd [dstq], m0 + movd [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m4, [srcq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + palignr m0, m1, m4, 1 + palignr m1, m4, 5 + pmaddubsw m0, k0k1k4k5 + pmaddubsw m1, k2k3k6k7 + psrldq m2, m0, 8 + psrldq m3, m1, 8 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, h8_avg + movd m4, [dstq] + pavgb m0, m4 +%endif + movd [dstq], m0 +.done: + REP_RET +%endm + 
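+;The %1 argument selects the variant generated by these SUBPIX_* macros:
+;plain h8, h8_avg (average the result with dst) or h8_add_src (add the
+;unfiltered source back in before packing).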
+;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER8 1 +cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + dec heightd + +.loop: + ;Do two rows at once + movu m0, [srcq - 3] + movu m4, [srcq + sstrideq - 3] + punpckhbw m1, m0, m0 + punpcklbw m0, m0 + palignr m5, m1, m0, 13 + pmaddubsw m5, k6k7 + palignr m2, m1, m0, 5 + palignr m3, m1, m0, 9 + palignr m1, m0, 1 + pmaddubsw m1, k0k1 + punpckhbw m6, m4, m4 + punpcklbw m4, m4 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + + palignr m7, m6, m4, 13 + palignr m0, m6, m4, 5 + pmaddubsw m7, k6k7 + paddsw m1, m3 + paddsw m2, m5 + paddsw m1, m2 +%ifidn %1, h8_avg + movh m2, [dstq] + movhps m2, [dstq + dstrideq] +%endif + palignr m5, m6, m4, 9 + palignr m6, m4, 1 + pmaddubsw m0, k2k3 + pmaddubsw m6, k0k1 + paddsw m1, krd + pmaddubsw m5, k4k5 + psraw m1, 7 + paddsw m0, m7 + paddsw m6, m5 + paddsw m6, m0 + paddsw m6, krd + psraw m6, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpcklbw m4, m3 + punpcklbw m5, m3 + paddsw m1, m4 + paddsw m6, m5 +%endif + packuswb m1, m6 +%ifidn %1, h8_avg + pavgb m1, m2 +%endif + movh [dstq], m1 + movhps [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m0, [srcq - 3] + punpckhbw m3, m0, m0 + punpcklbw m0, m0 + palignr m1, m3, m0, 1 + palignr m2, m3, m0, 5 + palignr m4, m3, m0, 13 + palignr m3, m0, 9 + pmaddubsw m1, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + pmaddubsw m4, k6k7 + paddsw m1, m3 + paddsw m4, m2 + paddsw m1, m4 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, h8_add_src + pxor m6, m6 + movu m5, [srcq] + punpcklbw m5, m6 + paddsw m1, m5 +%endif + packuswb m1, m1 +%ifidn %1, h8_avg + movh m0, [dstq] + pavgb m1, m0 +%endif + movh [dstq], m1 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER16 1 +cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +.loop: + prefetcht0 [srcq + 2 * sstrideq -3] + + movu m0, [srcq - 3] + movu m4, [srcq - 2] + pmaddubsw m0, k0k1 + pmaddubsw m4, k0k1 + movu m1, [srcq - 1] + movu m5, [srcq + 0] + pmaddubsw m1, k2k3 + pmaddubsw m5, k2k3 + movu m2, [srcq + 1] + movu m6, [srcq + 2] + pmaddubsw m2, k4k5 + pmaddubsw m6, k4k5 + movu m3, [srcq + 3] + movu m7, [srcq + 4] + pmaddubsw m3, k6k7 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m4, m6 + paddsw m5, m7 + paddsw m4, m5 + paddsw m0, krd + paddsw m4, krd + psraw m0, 7 + psraw m4, 7 +%ifidn %1, h8_add_src +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 + pcmpeqb m2, m2 ;all ones + psrlw m2, 8 ;even_byte_mask +%else + mova m2, [GLOBAL(even_byte_mask)] +%endif + movu m5, [srcq] + mova m7, m5 + pand m5, m2 + psrlw m7, 8 + paddsw m0, m5 + paddsw m4, m7 +%endif + packuswb m0, m0 + packuswb m4, m4 + punpcklbw m0, m4 +%ifidn %1, h8_avg + pavgb m0, [dstq] +%endif + lea srcq, [srcq + sstrideq] + mova [dstq], m0 + lea dstq, [dstq + dstrideq] + dec heightd + jnz .loop + REP_RET +%endm + +INIT_XMM ssse3 +SUBPIX_HFILTER16 h8 +SUBPIX_HFILTER8 h8 +SUBPIX_HFILTER4 h8 + 
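+;These instantiations provide the SSSE3 horizontal 8-tap kernels; they should
+;correspond to the aom_filter_block1d{16,8,4}_h8_ssse3 prototypes declared in
+;aom_subpixel_8t_intrin_ssse3.c.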
+;------------------------------------------------------------------------------- + +; TODO(Linfeng): Detect cpu type and choose the code with better performance. +%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 + +%if AOM_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + %define NUM_GENERAL_REG_USED 9 +%else + %define NUM_GENERAL_REG_USED 6 +%endif + +%macro SUBPIX_VFILTER 2 +cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%ifidn %2, 8 + %define movx movh +%else + %define movx movd +%endif + + dec heightd + +%if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if AOM_ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + ;Do two rows at once + movx m0, [srcq ] ;A + movx m1, [src1q ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + paddsw m0, m4 + paddsw m2, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m5 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 + + paddsw m3, m7 + paddsw m1, m3 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, v8_add_src + movu m4, [src1q] + punpcklbw m4, m6 + paddsw m1, m4 +%endif + lea srcq, [srcq + sstrideq * 2 ] + lea src1q, [src1q + sstrideq * 2] + packuswb m1, m1 + +%ifidn %1, v8_avg + movx m2, [dstq] + pavgb m0, m2 +%endif + movx [dstq], m0 + add dstq, dst_stride +%ifidn %1, v8_avg + movx m3, [dstq] + pavgb m1, m3 +%endif + movx [dstq], m1 + add dstq, dst_stride + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m2, m6 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%else + ; AOM_ARCH_X86_64 + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2 ] + movx m2, [srcq] ;C + movx m3, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2 ] + movx m4, [srcq] ;E + movx m5, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2 ] + movx m6, [srcq] ;G + 
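+  ; Rows A..G are now loaded; the interleaves below pair adjacent rows so
+  ; that each pmaddubsw in the loop consumes one (row n, row n+1) byte pair
+  ; per tap.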
punpcklbw m0, m1 ;A B + punpcklbw m1, m2 ;A B next iter + punpcklbw m2, m3 ;C D + punpcklbw m3, m4 ;C D next iter + punpcklbw m4, m5 ;E F + punpcklbw m5, m6 ;E F next iter + +.loop: + ;Do two rows at once + movx m7, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2 ] + movx m14, [srcq] ;H next iter + punpcklbw m6, m7 ;G H + punpcklbw m7, m14 ;G H next iter + pmaddubsw m8, m0, k0k1 + pmaddubsw m9, m1, k0k1 + mova m0, m2 + mova m1, m3 + pmaddubsw m10, m2, k2k3 + pmaddubsw m11, m3, k2k3 + mova m2, m4 + mova m3, m5 + pmaddubsw m4, k4k5 + pmaddubsw m5, k4k5 + paddsw m8, m4 + paddsw m9, m5 + mova m4, m6 + mova m5, m7 + pmaddubsw m6, k6k7 + pmaddubsw m7, k6k7 + paddsw m10, m6 + paddsw m11, m7 + paddsw m8, m10 + paddsw m9, m11 + mova m6, m14 + paddsw m8, krd + paddsw m9, krd + psraw m8, 7 + psraw m9, 7 +%ifidn %2, 4 + packuswb m8, m8 + packuswb m9, m9 +%else + packuswb m8, m9 +%endif + +%ifidn %1, v8_avg + movx m7, [dstq] +%ifidn %2, 4 + movx m10, [dstq + dstrideq] + pavgb m9, m10 +%else + movhpd m7, [dstq + dstrideq] +%endif + pavgb m8, m7 +%endif + movx [dstq], m8 +%ifidn %2, 4 + movx [dstq + dstrideq], m9 +%else + movhpd [dstq + dstrideq], m8 +%endif + + lea dstq, [dstq + dstrideq * 2 ] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m7, [srcq + sstrideq] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + pmaddubsw m6, k6k7 + paddsw m0, m4 + paddsw m2, m6 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%endif ; AOM_ARCH_X86_64 + +.done: + REP_RET + +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER16 1 +cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if AOM_ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + lea src1q, [srcq + sstrideq] + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + movh m0, [srcq ] ;A + movh m1, [src1q ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + movh m5, [src1q + sstrideq * 2 + 8] ;D + punpcklbw m7, m5 ;C D + paddsw m2, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + pmaddubsw m2, k6k7 + paddsw m7, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down + mova m5, m4 + punpcklbw m4, m6 + punpckhbw m5, m6 + paddsw m0, m4 
+ paddsw m3, m5 +%endif + packuswb m0, m3 + + add srcq, sstrideq + add src1q, sstrideq +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dst_stride + dec heightd + jnz .loop + REP_RET + +%else + ; AOM_ARCH_X86_64 + dec heightd + + movu m1, [srcq ] ;A + movu m3, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2] + punpcklbw m0, m1, m3 ;A B + punpckhbw m1, m3 ;A B + movu m5, [srcq] ;C + punpcklbw m2, m3, m5 ;A B next iter + punpckhbw m3, m5 ;A B next iter + mova tmp0, m2 ;store to stack + mova tmp1, m3 ;store to stack + movu m7, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2] + punpcklbw m4, m5, m7 ;C D + punpckhbw m5, m7 ;C D + movu m9, [srcq] ;E + punpcklbw m6, m7, m9 ;C D next iter + punpckhbw m7, m9 ;C D next iter + movu m11, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2] + punpcklbw m8, m9, m11 ;E F + punpckhbw m9, m11 ;E F + movu m2, [srcq] ;G + punpcklbw m10, m11, m2 ;E F next iter + punpckhbw m11, m2 ;E F next iter + +.loop: + ;Do two rows at once + pmaddubsw m13, m0, k0k1 + mova m0, m4 + pmaddubsw m14, m8, k4k5 + pmaddubsw m15, m4, k2k3 + mova m4, m8 + paddsw m13, m14 + movu m3, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2] + punpcklbw m14, m2, m3 ;G H + mova m8, m14 + pmaddubsw m14, k6k7 + paddsw m15, m14 + paddsw m13, m15 + paddsw m13, krd + psraw m13, 7 + + pmaddubsw m14, m1, k0k1 + pmaddubsw m1, m9, k4k5 + pmaddubsw m15, m5, k2k3 + paddsw m14, m1 + mova m1, m5 + mova m5, m9 + punpckhbw m2, m3 ;G H + mova m9, m2 + pmaddubsw m2, k6k7 + paddsw m15, m2 + paddsw m14, m15 + paddsw m14, krd + psraw m14, 7 + packuswb m13, m14 +%ifidn %1, v8_avg + pavgb m13, [dstq] +%endif + mova [dstq], m13 + + ; next iter + pmaddubsw m15, tmp0, k0k1 + pmaddubsw m14, m10, k4k5 + pmaddubsw m13, m6, k2k3 + paddsw m15, m14 + mova tmp0, m6 + mova m6, m10 + movu m2, [srcq] ;G next iter + punpcklbw m14, m3, m2 ;G H next iter + mova m10, m14 + pmaddubsw m14, k6k7 + paddsw m13, m14 + paddsw m15, m13 + paddsw m15, krd + psraw m15, 7 + + pmaddubsw m14, tmp1, k0k1 + mova tmp1, m7 + pmaddubsw m13, m7, k2k3 + mova m7, m11 + pmaddubsw m11, k4k5 + paddsw m14, m11 + punpckhbw m3, m2 ;G H next iter + mova m11, m3 + pmaddubsw m3, k6k7 + paddsw m13, m3 + paddsw m14, m13 + paddsw m14, krd + psraw m14, 7 + packuswb m15, m14 +%ifidn %1, v8_avg + pavgb m15, [dstq + dstrideq] +%endif + mova [dstq + dstrideq], m15 + lea dstq, [dstq + dstrideq * 2] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m3, [srcq + sstrideq] ;H + punpcklbw m6, m2, m3 ;G H + punpckhbw m2, m3 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m1, k0k1 + pmaddubsw m4, k2k3 + pmaddubsw m5, k2k3 + pmaddubsw m8, k4k5 + pmaddubsw m9, k4k5 + pmaddubsw m6, k6k7 + pmaddubsw m2, k6k7 + paddsw m0, m8 + paddsw m1, m9 + paddsw m4, m6 + paddsw m5, m2 + paddsw m0, m4 + paddsw m1, m5 + paddsw m0, krd + paddsw m1, krd + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + +.done: + REP_RET + +%endif ; AOM_ARCH_X86_64 + +%endm + +INIT_XMM ssse3 +SUBPIX_VFILTER16 v8 +SUBPIX_VFILTER v8, 8 +SUBPIX_VFILTER v8, 4 diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm new file mode 100644 index 0000000000..90dd55a4be --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm @@ -0,0 +1,295 @@ +; +; Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +globalsym(aom_filter_block1d4_v2_sse2) +sym(aom_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d8_v2_sse2) +sym(aom_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; 
begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d16_v2_sse2) +sym(aom_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d4_h2_sse2) +sym(aom_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d8_h2_sse2) +sym(aom_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d16_h2_sse2) +sym(aom_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm new file mode 100644 index 0000000000..253bc26d38 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm @@ -0,0 +1,267 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movd xmm2, ecx ;rounding_shift + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movd xmm6, ecx ;rounding_shift + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + pmulhrsw xmm2, xmm6 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +globalsym(aom_filter_block1d4_v2_ssse3) +sym(aom_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d8_v2_ssse3) +sym(aom_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d16_v2_ssse3) +sym(aom_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d4_h2_ssse3) +sym(aom_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d8_h2_ssse3) +sym(aom_filter_block1d8_h2_ssse3): + 
push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +globalsym(aom_filter_block1d16_h2_ssse3) +sym(aom_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c b/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c new file mode 100644 index 0000000000..49fcd72098 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c @@ -0,0 +1,897 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/bitdepth_conversion_avx2.h" +#include "aom_ports/mem.h" + +static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, + __m256i *out_lo, + __m256i *out_hi) { + const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); + *out_lo = _mm256_unpacklo_epi16(in, sign_bits); + *out_hi = _mm256_unpackhi_epi16(in, sign_bits); +} + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, a3); + b5 = 
_mm256_unpacklo_epi32(a6, a7); + b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +void aom_hadamard_lp_8x8_dual_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + aom_hadamard_lp_8x8_dual_avx2(src_ptr, src_stride, + t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + if (is_final) { + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + 
store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + coeff += 16; + } else { + _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); + coeff16 += 16; + } + t_coeff += 16; + } +} + +void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t *t_coeff = coeff; + for (int idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + aom_hadamard_lp_8x8_dual_avx2(src_ptr, src_stride, + t_coeff + (idx * 64 * 2)); + } + + for (int idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3)); + coeff += 16; + t_coeff += 16; + } +} + +void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m256i b0, b1, b2, b3; + const __m256i zero = _mm256_setzero_si256(); + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_avx2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + // Sign extend 16 bit to 32 bit. 
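+    // Combining the 16x16 outputs in 32-bit precision keeps the adds and
+    // subtracts below from overflowing before the >> 2 and the pack back to
+    // 16 bits.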
+ sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm256_srai_epi32(b0_lo, 2); + b1_lo = _mm256_srai_epi32(b1_lo, 2); + b2_lo = _mm256_srai_epi32(b2_lo, 2); + b3_lo = _mm256_srai_epi32(b3_lo, 2); + + b0_hi = _mm256_srai_epi32(b0_hi, 2); + b1_hi = _mm256_srai_epi32(b1_hi, 2); + b2_hi = _mm256_srai_epi32(b2_hi, 2); + b3_hi = _mm256_srai_epi32(b3_hi, 2); + + b0 = _mm256_packs_epi32(b0_lo, b0_hi); + b1 = _mm256_packs_epi32(b1_lo, b1_hi); + b2 = _mm256_packs_epi32(b2_lo, b2_hi); + b3 = _mm256_packs_epi32(b3_lo, b3_hi); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); + + coeff += 16; + t_coeff += 16; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_hadamard_col8_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi32(a0, a1); + __m256i b1 = _mm256_sub_epi32(a0, a1); + __m256i b2 = _mm256_add_epi32(a2, a3); + __m256i b3 = _mm256_sub_epi32(a2, a3); + __m256i b4 = _mm256_add_epi32(a4, a5); + __m256i b5 = _mm256_sub_epi32(a4, a5); + __m256i b6 = _mm256_add_epi32(a6, a7); + __m256i b7 = _mm256_sub_epi32(a6, a7); + + a0 = _mm256_add_epi32(b0, b2); + a1 = _mm256_add_epi32(b1, b3); + a2 = _mm256_sub_epi32(b0, b2); + a3 = _mm256_sub_epi32(b1, b3); + a4 = _mm256_add_epi32(b4, b6); + a5 = _mm256_add_epi32(b5, b7); + a6 = _mm256_sub_epi32(b4, b6); + a7 = _mm256_sub_epi32(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi32(a0, a4); + b7 = _mm256_add_epi32(a1, a5); + b3 = _mm256_add_epi32(a2, a6); + b4 = _mm256_add_epi32(a3, a7); + b2 = _mm256_sub_epi32(a0, a4); + b6 = _mm256_sub_epi32(a1, a5); + b1 = _mm256_sub_epi32(a2, a6); + b5 = _mm256_sub_epi32(a3, a7); + + a0 = _mm256_unpacklo_epi32(b0, b1); + a1 = _mm256_unpacklo_epi32(b2, b3); + a2 = _mm256_unpackhi_epi32(b0, b1); + a3 = _mm256_unpackhi_epi32(b2, b3); + a4 = _mm256_unpacklo_epi32(b4, b5); + a5 = _mm256_unpacklo_epi32(b6, b7); + a6 = _mm256_unpackhi_epi32(b4, b5); + a7 = _mm256_unpackhi_epi32(b6, b7); + + b0 = _mm256_unpacklo_epi64(a0, a1); + b1 = _mm256_unpacklo_epi64(a4, a5); + b2 = _mm256_unpackhi_epi64(a0, a1); + b3 = _mm256_unpackhi_epi64(a4, a5); + b4 = _mm256_unpacklo_epi64(a2, a3); + b5 = _mm256_unpacklo_epi64(a6, a7); + b6 = _mm256_unpackhi_epi64(a2, a3); + b7 = _mm256_unpackhi_epi64(a6, a7); + + in[0] = _mm256_permute2x128_si256(b0, b1, 0x20); + in[1] = _mm256_permute2x128_si256(b0, b1, 0x31); + in[2] = _mm256_permute2x128_si256(b2, b3, 0x20); + in[3] = _mm256_permute2x128_si256(b2, b3, 0x31); + in[4] = _mm256_permute2x128_si256(b4, b5, 0x20); + in[5] = _mm256_permute2x128_si256(b4, b5, 0x31); + in[6] = 
_mm256_permute2x128_si256(b6, b7, 0x20); + in[7] = _mm256_permute2x128_si256(b6, b7, 0x31); + } else { + in[0] = _mm256_add_epi32(a0, a4); + in[7] = _mm256_add_epi32(a1, a5); + in[3] = _mm256_add_epi32(a2, a6); + in[4] = _mm256_add_epi32(a3, a7); + in[2] = _mm256_sub_epi32(a0, a4); + in[6] = _mm256_sub_epi32(a1, a5); + in[1] = _mm256_sub_epi32(a2, a6); + in[5] = _mm256_sub_epi32(a3, a7); + } +} + +void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + __m128i src16[8]; + __m256i src32[8]; + + src16[0] = _mm_loadu_si128((const __m128i *)src_diff); + src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride)); + + src32[0] = _mm256_cvtepi16_epi32(src16[0]); + src32[1] = _mm256_cvtepi16_epi32(src16[1]); + src32[2] = _mm256_cvtepi16_epi32(src16[2]); + src32[3] = _mm256_cvtepi16_epi32(src16[3]); + src32[4] = _mm256_cvtepi16_epi32(src16[4]); + src32[5] = _mm256_cvtepi16_epi32(src16[5]); + src32[6] = _mm256_cvtepi16_epi32(src16[6]); + src32[7] = _mm256_cvtepi16_epi32(src16[7]); + + highbd_hadamard_col8_avx2(src32, 0); + highbd_hadamard_col8_avx2(src32, 1); + + _mm256_storeu_si256((__m256i *)coeff, src32[0]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[1]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[2]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[3]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[4]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[5]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[6]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[7]); +} + +void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 1); + b1 = _mm256_srai_epi32(b1, 1); + b2 = _mm256_srai_epi32(b2, 1); + b3 = _mm256_srai_epi32(b3, 1); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); + + coeff += 8; + t_coeff += 8; + } +} + +void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff, 
+ ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 2); + b1 = _mm256_srai_epi32(b1, 2); + b2 = _mm256_srai_epi32(b2, 2); + b3 = _mm256_srai_epi32(b3, 2); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); + + coeff += 8; + t_coeff += 8; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int aom_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} + +int aom_satd_lp_avx2(const int16_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + + for (int i = 0; i < length; i += 16) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} + +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + +void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx, + int *avg) { + const uint8_t *s_y0 = s + y16_idx * p + x16_idx; + const uint8_t *s_y1 = s_y0 + 8 * p; + __m256i sum0, sum1, s0, s1, s2, s3, u0; + u0 = _mm256_setzero_si256(); + s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1, s_y0), u0); + s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + p, s_y0 + p), u0); + s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 2 * p, 
s_y0 + 2 * p), u0); + s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 3 * p, s_y0 + 3 * p), u0); + sum0 = _mm256_add_epi16(s0, s1); + sum1 = _mm256_add_epi16(s2, s3); + s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 4 * p, s_y0 + 4 * p), u0); + s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 5 * p, s_y0 + 5 * p), u0); + s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 6 * p, s_y0 + 6 * p), u0); + s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 7 * p, s_y0 + 7 * p), u0); + sum0 = _mm256_add_epi16(sum0, _mm256_add_epi16(s0, s1)); + sum1 = _mm256_add_epi16(sum1, _mm256_add_epi16(s2, s3)); + sum0 = _mm256_add_epi16(sum0, sum1); + + // (avg + 32) >> 6 + __m256i rounding = _mm256_set1_epi32(32); + sum0 = _mm256_add_epi32(sum0, rounding); + sum0 = _mm256_srli_epi32(sum0, 6); + __m128i lo = _mm256_castsi256_si128(sum0); + __m128i hi = _mm256_extracti128_si256(sum0, 1); + avg[0] = _mm_cvtsi128_si32(lo); + avg[1] = _mm_extract_epi32(lo, 2); + avg[2] = _mm_cvtsi128_si32(hi); + avg[3] = _mm_extract_epi32(hi, 2); +} + +void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref, + const int ref_stride, const int width, + const int height, int norm_factor) { + // SIMD implementation assumes width and height to be multiple of 16 and 2 + // respectively. For any odd width or height, SIMD support needs to be added. + assert(width % 16 == 0 && height % 2 == 0); + + if (width % 32 == 0) { + const __m256i zero = _mm256_setzero_si256(); + for (int wd = 0; wd < width; wd += 32) { + const uint8_t *ref_tmp = ref + wd; + int16_t *hbuf_tmp = hbuf + wd; + __m256i s0 = zero; + __m256i s1 = zero; + int idx = 0; + do { + __m256i src_line = _mm256_loadu_si256((const __m256i *)ref_tmp); + __m256i t0 = _mm256_unpacklo_epi8(src_line, zero); + __m256i t1 = _mm256_unpackhi_epi8(src_line, zero); + s0 = _mm256_add_epi16(s0, t0); + s1 = _mm256_add_epi16(s1, t1); + ref_tmp += ref_stride; + + src_line = _mm256_loadu_si256((const __m256i *)ref_tmp); + t0 = _mm256_unpacklo_epi8(src_line, zero); + t1 = _mm256_unpackhi_epi8(src_line, zero); + s0 = _mm256_add_epi16(s0, t0); + s1 = _mm256_add_epi16(s1, t1); + ref_tmp += ref_stride; + idx += 2; + } while (idx < height); + s0 = _mm256_srai_epi16(s0, norm_factor); + s1 = _mm256_srai_epi16(s1, norm_factor); + _mm_storeu_si128((__m128i *)(hbuf_tmp), _mm256_castsi256_si128(s0)); + _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), _mm256_castsi256_si128(s1)); + _mm_storeu_si128((__m128i *)(hbuf_tmp + 16), + _mm256_extractf128_si256(s0, 1)); + _mm_storeu_si128((__m128i *)(hbuf_tmp + 24), + _mm256_extractf128_si256(s1, 1)); + } + } else if (width % 16 == 0) { + aom_int_pro_row_sse2(hbuf, ref, ref_stride, width, height, norm_factor); + } +} + +static INLINE void load_from_src_buf(const uint8_t *ref1, __m256i *src, + const int stride) { + src[0] = _mm256_loadu_si256((const __m256i *)ref1); + src[1] = _mm256_loadu_si256((const __m256i *)(ref1 + stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(ref1 + (2 * stride))); + src[3] = _mm256_loadu_si256((const __m256i *)(ref1 + (3 * stride))); +} + +#define CALC_TOT_SAD_AND_STORE \ + /* r00 r10 x x r01 r11 x x | r02 r12 x x r03 r13 x x */ \ + const __m256i r01 = _mm256_add_epi16(_mm256_slli_si256(r1, 2), r0); \ + /* r00 r10 r20 x r01 r11 r21 x | r02 r12 r22 x r03 r13 r23 x */ \ + const __m256i r012 = _mm256_add_epi16(_mm256_slli_si256(r2, 4), r01); \ + /* r00 r10 r20 r30 r01 r11 r21 r31 | r02 r12 r22 r32 r03 r13 r23 r33 */ \ + const __m256i result0 = _mm256_add_epi16(_mm256_slli_si256(r3, 6), r012); \ + \ + const __m128i results0 = _mm_add_epi16( \ + 
_mm256_castsi256_si128(result0), _mm256_extractf128_si256(result0, 1)); \ + const __m128i results1 = \ + _mm_add_epi16(results0, _mm_srli_si128(results0, 8)); \ + _mm_storel_epi64((__m128i *)vbuf, _mm_srli_epi16(results1, norm_factor)); + +static INLINE void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref, + const int ref_stride, + const int height, + int norm_factor) { + const __m256i zero = _mm256_setzero_si256(); + int ht = 0; + // Post sad operation, the data is present in lower 16-bit of each 64-bit lane + // and higher 16-bits are Zero. Here, we are processing 8 rows at a time to + // utilize the higher 16-bits efficiently. + do { + __m256i src_00 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref))); + src_00 = _mm256_inserti128_si256( + src_00, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 4)), 1); + __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + ref_stride * 1))); + src_01 = _mm256_inserti128_si256( + src_01, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 5)), 1); + __m256i src_10 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + ref_stride * 2))); + src_10 = _mm256_inserti128_si256( + src_10, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 6)), 1); + __m256i src_11 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + ref_stride * 3))); + src_11 = _mm256_inserti128_si256( + src_11, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 7)), 1); + + // s00 x x x s01 x x x | s40 x x x s41 x x x + const __m256i s0 = _mm256_sad_epu8(src_00, zero); + // s10 x x x s11 x x x | s50 x x x s51 x x x + const __m256i s1 = _mm256_sad_epu8(src_01, zero); + // s20 x x x s21 x x x | s60 x x x s61 x x x + const __m256i s2 = _mm256_sad_epu8(src_10, zero); + // s30 x x x s31 x x x | s70 x x x s71 x x x + const __m256i s3 = _mm256_sad_epu8(src_11, zero); + + // s00 s10 x x x x x x | s40 s50 x x x x x x + const __m256i s0_lo = _mm256_unpacklo_epi16(s0, s1); + // s01 s11 x x x x x x | s41 s51 x x x x x x + const __m256i s0_hi = _mm256_unpackhi_epi16(s0, s1); + // s20 s30 x x x x x x | s60 s70 x x x x x x + const __m256i s1_lo = _mm256_unpacklo_epi16(s2, s3); + // s21 s31 x x x x x x | s61 s71 x x x x x x + const __m256i s1_hi = _mm256_unpackhi_epi16(s2, s3); + + // s0 s1 x x x x x x | s4 s5 x x x x x x + const __m256i s0_add = _mm256_add_epi16(s0_lo, s0_hi); + // s2 s3 x x x x x x | s6 s7 x x x x x x + const __m256i s1_add = _mm256_add_epi16(s1_lo, s1_hi); + + // s1 s1 s2 s3 s4 s5 s6 s7 + const __m128i results = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(s0_add, s1_add), 0x08)); + _mm_storeu_si128((__m128i *)vbuf, _mm_srli_epi16(results, norm_factor)); + vbuf += 8; + ref += (ref_stride << 3); + ht += 8; + } while (ht < height); +} + +void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref, + const int ref_stride, const int width, + const int height, int norm_factor) { + assert(width % 16 == 0); + if (width == 128) { + const __m256i zero = _mm256_setzero_si256(); + for (int ht = 0; ht < height; ht += 4) { + __m256i src[16]; + // Load source data. 
+ load_from_src_buf(ref, &src[0], ref_stride); + load_from_src_buf(ref + 32, &src[4], ref_stride); + load_from_src_buf(ref + 64, &src[8], ref_stride); + load_from_src_buf(ref + 96, &src[12], ref_stride); + + // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x + const __m256i s0 = _mm256_add_epi16(_mm256_sad_epu8(src[0], zero), + _mm256_sad_epu8(src[4], zero)); + const __m256i s1 = _mm256_add_epi16(_mm256_sad_epu8(src[8], zero), + _mm256_sad_epu8(src[12], zero)); + const __m256i r0 = _mm256_add_epi16(s0, s1); + // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x + const __m256i s2 = _mm256_add_epi16(_mm256_sad_epu8(src[1], zero), + _mm256_sad_epu8(src[5], zero)); + const __m256i s3 = _mm256_add_epi16(_mm256_sad_epu8(src[9], zero), + _mm256_sad_epu8(src[13], zero)); + const __m256i r1 = _mm256_add_epi16(s2, s3); + // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x + const __m256i s4 = _mm256_add_epi16(_mm256_sad_epu8(src[2], zero), + _mm256_sad_epu8(src[6], zero)); + const __m256i s5 = _mm256_add_epi16(_mm256_sad_epu8(src[10], zero), + _mm256_sad_epu8(src[14], zero)); + const __m256i r2 = _mm256_add_epi16(s4, s5); + // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x + const __m256i s6 = _mm256_add_epi16(_mm256_sad_epu8(src[3], zero), + _mm256_sad_epu8(src[7], zero)); + const __m256i s7 = _mm256_add_epi16(_mm256_sad_epu8(src[11], zero), + _mm256_sad_epu8(src[15], zero)); + const __m256i r3 = _mm256_add_epi16(s6, s7); + + CALC_TOT_SAD_AND_STORE + vbuf += 4; + ref += ref_stride << 2; + } + } else if (width == 64) { + const __m256i zero = _mm256_setzero_si256(); + for (int ht = 0; ht < height; ht += 4) { + __m256i src[8]; + // Load source data. + load_from_src_buf(ref, &src[0], ref_stride); + load_from_src_buf(ref + 32, &src[4], ref_stride); + + // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x + const __m256i s0 = _mm256_sad_epu8(src[0], zero); + const __m256i s1 = _mm256_sad_epu8(src[4], zero); + const __m256i r0 = _mm256_add_epi16(s0, s1); + // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x + const __m256i s2 = _mm256_sad_epu8(src[1], zero); + const __m256i s3 = _mm256_sad_epu8(src[5], zero); + const __m256i r1 = _mm256_add_epi16(s2, s3); + // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x + const __m256i s4 = _mm256_sad_epu8(src[2], zero); + const __m256i s5 = _mm256_sad_epu8(src[6], zero); + const __m256i r2 = _mm256_add_epi16(s4, s5); + // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x + const __m256i s6 = _mm256_sad_epu8(src[3], zero); + const __m256i s7 = _mm256_sad_epu8(src[7], zero); + const __m256i r3 = _mm256_add_epi16(s6, s7); + + CALC_TOT_SAD_AND_STORE + vbuf += 4; + ref += ref_stride << 2; + } + } else if (width == 32) { + assert(height % 2 == 0); + const __m256i zero = _mm256_setzero_si256(); + for (int ht = 0; ht < height; ht += 4) { + __m256i src[4]; + // Load source data. 
+ load_from_src_buf(ref, &src[0], ref_stride); + + // s00 x x x s01 x x x s02 x x x s03 x x x + const __m256i r0 = _mm256_sad_epu8(src[0], zero); + // s10 x x x s11 x x x s12 x x x s13 x x x + const __m256i r1 = _mm256_sad_epu8(src[1], zero); + // s20 x x x s21 x x x s22 x x x s23 x x x + const __m256i r2 = _mm256_sad_epu8(src[2], zero); + // s30 x x x s31 x x x s32 x x x s33 x x x + const __m256i r3 = _mm256_sad_epu8(src[3], zero); + + CALC_TOT_SAD_AND_STORE + vbuf += 4; + ref += ref_stride << 2; + } + } else if (width == 16) { + aom_int_pro_col_16wd_avx2(vbuf, ref, ref_stride, height, norm_factor); + } +} + +static inline void calc_vector_mean_sse_64wd(const int16_t *ref, + const int16_t *src, __m256i *mean, + __m256i *sse) { + const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i src_line2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i src_line3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i ref_line2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i ref_line3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + + const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0); + const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1); + const __m256i diff2 = _mm256_sub_epi16(ref_line2, src_line2); + const __m256i diff3 = _mm256_sub_epi16(ref_line3, src_line3); + const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0); + const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1); + const __m256i diff_sqr2 = _mm256_madd_epi16(diff2, diff2); + const __m256i diff_sqr3 = _mm256_madd_epi16(diff3, diff3); + + *mean = _mm256_add_epi16(*mean, _mm256_add_epi16(diff0, diff1)); + *mean = _mm256_add_epi16(*mean, diff2); + *mean = _mm256_add_epi16(*mean, diff3); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(diff_sqr0, diff_sqr1)); + *sse = _mm256_add_epi32(*sse, diff_sqr2); + *sse = _mm256_add_epi32(*sse, diff_sqr3); +} + +#define CALC_VAR_FROM_MEAN_SSE(mean, sse) \ + { \ + mean = _mm256_madd_epi16(mean, _mm256_set1_epi16(1)); \ + mean = _mm256_hadd_epi32(mean, sse); \ + mean = _mm256_add_epi32(mean, _mm256_bsrli_epi128(mean, 4)); \ + const __m128i result = _mm_add_epi32(_mm256_castsi256_si128(mean), \ + _mm256_extractf128_si256(mean, 1)); \ + /*(mean * mean): dynamic range 31 bits.*/ \ + const int mean_int = _mm_extract_epi32(result, 0); \ + const int sse_int = _mm_extract_epi32(result, 2); \ + const unsigned int mean_abs = abs(mean_int); \ + var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2)); \ + } + +// ref: [0 - 510] +// src: [0 - 510] +// bwl: {2, 3, 4, 5} +int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl) { + const int width = 4 << bwl; + assert(width % 16 == 0 && width <= 128); + int var = 0; + + // Instead of having a loop over width 16, considered loop unrolling to avoid + // some addition operations. 
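+  // Since width == 4 << bwl, dividing mean * mean by width is the same as
+  // shifting right by (bwl + 2), which is what CALC_VAR_FROM_MEAN_SSE does:
+  //   var = sse - ((mean * mean) >> (bwl + 2)).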
+ if (width == 128) { + __m256i mean = _mm256_setzero_si256(); + __m256i sse = _mm256_setzero_si256(); + + calc_vector_mean_sse_64wd(src, ref, &mean, &sse); + calc_vector_mean_sse_64wd(src + 64, ref + 64, &mean, &sse); + CALC_VAR_FROM_MEAN_SSE(mean, sse) + } else if (width == 64) { + __m256i mean = _mm256_setzero_si256(); + __m256i sse = _mm256_setzero_si256(); + + calc_vector_mean_sse_64wd(src, ref, &mean, &sse); + CALC_VAR_FROM_MEAN_SSE(mean, sse) + } else if (width == 32) { + const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + + const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0); + const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1); + const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0); + const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1); + const __m256i sse = _mm256_add_epi32(diff_sqr0, diff_sqr1); + __m256i mean = _mm256_add_epi16(diff0, diff1); + + CALC_VAR_FROM_MEAN_SSE(mean, sse) + } else if (width == 16) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)src); + const __m256i ref_line = _mm256_loadu_si256((const __m256i *)ref); + __m256i mean = _mm256_sub_epi16(ref_line, src_line); + const __m256i sse = _mm256_madd_epi16(mean, mean); + + CALC_VAR_FROM_MEAN_SSE(mean, sse) + } + return var; +} diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c new file mode 100644 index 0000000000..9ab9143eee --- /dev/null +++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c @@ -0,0 +1,700 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/bitdepth_conversion_sse2.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_ports/mem.h" + +static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, + __m128i *out_lo, + __m128i *out_hi) { + const __m128i sign_bits = _mm_cmplt_epi16(in, zero); + *out_lo = _mm_unpacklo_epi16(in, sign_bits); + *out_hi = _mm_unpackhi_epi16(in, sign_bits); +} + +static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi32(a, sign); +} + +void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; + u0 = _mm_setzero_si128(); + // Row 0 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff0 = _mm_max_epi16(diff, negdiff); + // Row 1 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(absdiff0, absdiff); + minabsdiff = _mm_min_epi16(absdiff0, absdiff); + // Row 2 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 3 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 4 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 5 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 6 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 7 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); 
+ negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); + *max = _mm_extract_epi16(maxabsdiff, 0); + + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); + *min = _mm_extract_epi16(minabsdiff, 0); +} + +unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { + __m128i sum0, sum1, s0, s1, s2, s3, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = loadh_epi64((const __m128i *)(s + p), + _mm_loadl_epi64((const __m128i *)(s))); + s1 = loadh_epi64((const __m128i *)(s + 3 * p), + _mm_loadl_epi64((const __m128i *)(s + 2 * p))); + s2 = loadh_epi64((const __m128i *)(s + 5 * p), + _mm_loadl_epi64((const __m128i *)(s + 4 * p))); + s3 = loadh_epi64((const __m128i *)(s + 7 * p), + _mm_loadl_epi64((const __m128i *)(s + 6 * p))); + s0 = _mm_sad_epu8(s0, u0); + s1 = _mm_sad_epu8(s1, u0); + s2 = _mm_sad_epu8(s2, u0); + s3 = _mm_sad_epu8(s3, u0); + + sum0 = _mm_add_epi16(s0, s1); + sum1 = _mm_add_epi16(s2, s3); + sum0 = _mm_add_epi16(sum0, sum1); + sum0 = _mm_add_epi16(sum0, _mm_srli_si128(sum0, 8)); + avg = _mm_cvtsi128_si32(sum0); + return (avg + 32) >> 6; +} + +void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { + __m128i sum0, sum1, s0, s1, s2, s3, u0; + u0 = _mm_setzero_si128(); + s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0); + s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + p)), u0); + s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 2 * p)), u0); + s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 3 * p)), u0); + sum0 = _mm_add_epi16(s0, s1); + sum1 = _mm_add_epi16(s2, s3); + s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 4 * p)), u0); + s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 5 * p)), u0); + s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 6 * p)), u0); + s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 7 * p)), u0); + sum0 = _mm_add_epi16(sum0, _mm_add_epi16(s0, s1)); + sum1 = _mm_add_epi16(sum1, _mm_add_epi16(s2, s3)); + sum0 = _mm_add_epi16(sum0, sum1); + + // (avg + 32) >> 6 + __m128i rounding = _mm_set1_epi32(32); + sum0 = _mm_add_epi32(sum0, rounding); + sum0 = _mm_srli_epi32(sum0, 6); + avg[0] = _mm_cvtsi128_si32(sum0); + avg[1] = _mm_extract_epi16(sum0, 4); +} + +void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx, + int *avg) { + const uint8_t *s_ptr = s + y16_idx * p + x16_idx; + for (int k = 0; k < 2; k++) { + calc_avg_8x8_dual_sse2(s_ptr, p, avg + k * 2); + s_ptr += 8 * p; + } +} + +unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)), + _mm_cvtsi32_si128(*(const int *)(s + p))); + s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)), + _mm_cvtsi32_si128(*(const int *)(s + p * 3))); + s0 = _mm_sad_epu8(s0, u0); + s1 = _mm_sad_epu8(s1, u0); + s0 = _mm_add_epi16(s0, s1); + avg = _mm_cvtsi128_si32(s0); + return (avg + 8) >> 4; +} + +static INLINE void hadamard_col4_sse2(__m128i *in, int iter) { + const 
__m128i a0 = in[0]; + const __m128i a1 = in[1]; + const __m128i a2 = in[2]; + const __m128i a3 = in[3]; + const __m128i b0 = _mm_srai_epi16(_mm_add_epi16(a0, a1), 1); + const __m128i b1 = _mm_srai_epi16(_mm_sub_epi16(a0, a1), 1); + const __m128i b2 = _mm_srai_epi16(_mm_add_epi16(a2, a3), 1); + const __m128i b3 = _mm_srai_epi16(_mm_sub_epi16(a2, a3), 1); + in[0] = _mm_add_epi16(b0, b2); + in[1] = _mm_add_epi16(b1, b3); + in[2] = _mm_sub_epi16(b0, b2); + in[3] = _mm_sub_epi16(b1, b3); + + if (iter == 0) { + const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc); + const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc); + in[0] = dcba_lo; + in[1] = _mm_srli_si128(dcba_lo, 8); + in[2] = dcba_hi; + in[3] = _mm_srli_si128(dcba_hi, 8); + } +} + +void aom_hadamard_4x4_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + __m128i src[4]; + src[0] = _mm_loadl_epi64((const __m128i *)src_diff); + src[1] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_loadl_epi64((const __m128i *)(src_diff + src_stride)); + + hadamard_col4_sse2(src, 0); + hadamard_col4_sse2(src, 1); + + store_tran_low(_mm_unpacklo_epi64(src[0], src[1]), coeff); + coeff += 8; + store_tran_low(_mm_unpacklo_epi64(src[2], src[3]), coeff); +} + +static INLINE void hadamard_col8_sse2(__m128i *in, int iter) { + __m128i a0 = in[0]; + __m128i a1 = in[1]; + __m128i a2 = in[2]; + __m128i a3 = in[3]; + __m128i a4 = in[4]; + __m128i a5 = in[5]; + __m128i a6 = in[6]; + __m128i a7 = in[7]; + + __m128i b0 = _mm_add_epi16(a0, a1); + __m128i b1 = _mm_sub_epi16(a0, a1); + __m128i b2 = _mm_add_epi16(a2, a3); + __m128i b3 = _mm_sub_epi16(a2, a3); + __m128i b4 = _mm_add_epi16(a4, a5); + __m128i b5 = _mm_sub_epi16(a4, a5); + __m128i b6 = _mm_add_epi16(a6, a7); + __m128i b7 = _mm_sub_epi16(a6, a7); + + a0 = _mm_add_epi16(b0, b2); + a1 = _mm_add_epi16(b1, b3); + a2 = _mm_sub_epi16(b0, b2); + a3 = _mm_sub_epi16(b1, b3); + a4 = _mm_add_epi16(b4, b6); + a5 = _mm_add_epi16(b5, b7); + a6 = _mm_sub_epi16(b4, b6); + a7 = _mm_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm_add_epi16(a0, a4); + b7 = _mm_add_epi16(a1, a5); + b3 = _mm_add_epi16(a2, a6); + b4 = _mm_add_epi16(a3, a7); + b2 = _mm_sub_epi16(a0, a4); + b6 = _mm_sub_epi16(a1, a5); + b1 = _mm_sub_epi16(a2, a6); + b5 = _mm_sub_epi16(a3, a7); + + a0 = _mm_unpacklo_epi16(b0, b1); + a1 = _mm_unpacklo_epi16(b2, b3); + a2 = _mm_unpackhi_epi16(b0, b1); + a3 = _mm_unpackhi_epi16(b2, b3); + a4 = _mm_unpacklo_epi16(b4, b5); + a5 = _mm_unpacklo_epi16(b6, b7); + a6 = _mm_unpackhi_epi16(b4, b5); + a7 = _mm_unpackhi_epi16(b6, b7); + + b0 = _mm_unpacklo_epi32(a0, a1); + b1 = _mm_unpacklo_epi32(a4, a5); + b2 = _mm_unpackhi_epi32(a0, a1); + b3 = _mm_unpackhi_epi32(a4, a5); + b4 = _mm_unpacklo_epi32(a2, a3); + b5 = _mm_unpacklo_epi32(a6, a7); + b6 = _mm_unpackhi_epi32(a2, a3); + b7 = _mm_unpackhi_epi32(a6, a7); + + in[0] = _mm_unpacklo_epi64(b0, b1); + in[1] = _mm_unpackhi_epi64(b0, b1); + in[2] = _mm_unpacklo_epi64(b2, b3); + in[3] = _mm_unpackhi_epi64(b2, b3); + in[4] = _mm_unpacklo_epi64(b4, b5); + in[5] = _mm_unpackhi_epi64(b4, b5); + in[6] = _mm_unpacklo_epi64(b6, b7); + in[7] = _mm_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm_add_epi16(a0, a4); + in[7] = _mm_add_epi16(a1, a5); + in[3] = _mm_add_epi16(a2, a6); + in[4] = _mm_add_epi16(a3, a7); + in[2] = _mm_sub_epi16(a0, a4); + in[6] = 
_mm_sub_epi16(a1, a5); + in[1] = _mm_sub_epi16(a2, a6); + in[5] = _mm_sub_epi16(a3, a7); + } +} + +static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + if (is_final) { + store_tran_low(src[0], coeff); + coeff += 8; + store_tran_low(src[1], coeff); + coeff += 8; + store_tran_low(src[2], coeff); + coeff += 8; + store_tran_low(src[3], coeff); + coeff += 8; + store_tran_low(src[4], coeff); + coeff += 8; + store_tran_low(src[5], coeff); + coeff += 8; + store_tran_low(src[6], coeff); + coeff += 8; + store_tran_low(src[7], coeff); + } else { + int16_t *coeff16 = (int16_t *)coeff; + _mm_store_si128((__m128i *)coeff16, src[0]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[1]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[2]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[3]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[4]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[5]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[6]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[7]); + } +} + +void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); +} + +static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, int16_t *coeff) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + _mm_store_si128((__m128i *)coeff, src[0]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[1]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[2]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[3]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[4]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[5]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[6]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[7]); +} + +void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + hadamard_lp_8x8_sse2(src_diff, src_stride, coeff); +} + +void aom_hadamard_lp_8x8_dual_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, int16_t *coeff) { + for (int i = 0; i < 2; i++) { + hadamard_lp_8x8_sse2(src_diff + (i * 8), src_stride, coeff + (i * 64)); + } +} + +void aom_hadamard_lp_16x16_sse2(const int16_t 
*src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + for (int idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); + } + + int16_t *t_coeff = coeff; + for (int idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + + _mm_store_si128((__m128i *)t_coeff, coeff0); + _mm_store_si128((__m128i *)(t_coeff + 64), coeff1); + _mm_store_si128((__m128i *)(t_coeff + 128), coeff2); + _mm_store_si128((__m128i *)(t_coeff + 192), coeff3); + + t_coeff += 8; + } +} + +static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), + 0); + } + + for (idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + + if (is_final) { + store_tran_low_offset_4(coeff0, coeff); + store_tran_low_offset_4(coeff1, coeff + 64); + store_tran_low_offset_4(coeff2, coeff + 128); + store_tran_low_offset_4(coeff3, coeff + 192); + coeff += 4; + } else { + _mm_store_si128((__m128i *)coeff16, coeff0); + _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); + _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); + _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); + coeff16 += 8; + } + + t_coeff += 8; + // Increment the pointer additionally by 0 and 8 in alternate + // iterations(instead of 8) to ensure the coherency with the implementation + // of store_tran_low_offset_4() + coeff += (((idx >> 3) & 1) << 3); + } +} + +void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + 
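+  // is_final = 1: the combined 16x16 coefficients are written out as
+  // tran_low_t via store_tran_low_offset_4() in the last stage.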
hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m128i b0, b1, b2, b3; + const __m128i zero = _mm_setzero_si128(); + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_sse2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); + + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm_srai_epi32(b0_lo, 2); + b1_lo = _mm_srai_epi32(b1_lo, 2); + b2_lo = _mm_srai_epi32(b2_lo, 2); + b3_lo = _mm_srai_epi32(b3_lo, 2); + + b0_hi = _mm_srai_epi32(b0_hi, 2); + b1_hi = _mm_srai_epi32(b1_hi, 2); + b2_hi = _mm_srai_epi32(b2_hi, 2); + b3_hi = _mm_srai_epi32(b3_hi, 2); + + b0 = _mm_packs_epi32(b0_lo, b0_hi); + b1 = _mm_packs_epi32(b1_lo, b1_hi); + b2 = _mm_packs_epi32(b2_lo, b2_hi); + b3 = _mm_packs_epi32(b3_lo, b3_hi); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + store_tran_low_offset_4(coeff0, coeff); + store_tran_low_offset_4(coeff1, coeff + 256); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + store_tran_low_offset_4(coeff2, coeff + 512); + store_tran_low_offset_4(coeff3, coeff + 768); + + // Increment the pointer by 4 and 12 in alternate iterations(instead of 8) + // to ensure the coherency with the implementation of + // store_tran_low_offset_4() + coeff += (4 + (((idx >> 3) & 1) << 3)); + t_coeff += 8; + } +} + +int aom_satd_sse2(const tran_low_t *coeff, int length) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i accum = zero; + + for (i = 0; i < length; i += 4) { + const __m128i src_line = _mm_load_si128((const __m128i *)coeff); + const __m128i coeff_sign = _mm_srai_epi32(src_line, 31); + const __m128i abs_coeff = invert_sign_32_sse2(src_line, coeff_sign); + accum = _mm_add_epi32(accum, abs_coeff); + coeff += 4; + } + + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 
32); + accum = _mm_add_epi32(accum, hi); + } + + return _mm_cvtsi128_si32(accum); +} + +int aom_satd_lp_sse2(const int16_t *coeff, int length) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + __m128i accum = zero; + + for (int i = 0; i < length; i += 16) { + const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff); + const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8)); + const __m128i inv0 = _mm_sub_epi16(zero, src_line0); + const __m128i inv1 = _mm_sub_epi16(zero, src_line1); + const __m128i abs0 = _mm_max_epi16(src_line0, inv0); // abs(src_line) + const __m128i abs1 = _mm_max_epi16(src_line1, inv1); // abs(src_line) + const __m128i sum0 = _mm_madd_epi16(abs0, one); + const __m128i sum1 = _mm_madd_epi16(abs1, one); + accum = _mm_add_epi32(accum, sum0); + accum = _mm_add_epi32(accum, sum1); + coeff += 16; + } + + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 32); + accum = _mm_add_epi32(accum, hi); + } + + return _mm_cvtsi128_si32(accum); +} + +void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, + const int ref_stride, const int width, + const int height, int norm_factor) { + // SIMD implementation assumes width and height to be multiple of 16 and 2 + // respectively. For any odd width or height, SIMD support needs to be added. + assert(width % 16 == 0 && height % 2 == 0); + __m128i zero = _mm_setzero_si128(); + + for (int wd = 0; wd < width; wd += 16) { + const uint8_t *ref_tmp = ref + wd; + int16_t *hbuf_tmp = hbuf + wd; + __m128i s0 = zero; + __m128i s1 = zero; + int idx = 0; + do { + __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp); + __m128i t0 = _mm_unpacklo_epi8(src_line, zero); + __m128i t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_add_epi16(s0, t0); + s1 = _mm_add_epi16(s1, t1); + ref_tmp += ref_stride; + + src_line = _mm_loadu_si128((const __m128i *)ref_tmp); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_add_epi16(s0, t0); + s1 = _mm_add_epi16(s1, t1); + ref_tmp += ref_stride; + idx += 2; + } while (idx < height); + + s0 = _mm_srai_epi16(s0, norm_factor); + s1 = _mm_srai_epi16(s1, norm_factor); + _mm_storeu_si128((__m128i *)(hbuf_tmp), s0); + _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), s1); + } +} + +void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref, + const int ref_stride, const int width, + const int height, int norm_factor) { + // SIMD implementation assumes width to be multiple of 16. + assert(width % 16 == 0); + + for (int ht = 0; ht < height; ht++) { + const uint8_t *ref_tmp = ref + (ht * ref_stride); + __m128i zero = _mm_setzero_si128(); + __m128i s0 = zero; + __m128i s1, src_line; + for (int i = 0; i < width; i += 16) { + src_line = _mm_loadu_si128((const __m128i *)ref_tmp); + s1 = _mm_sad_epu8(src_line, zero); + s0 = _mm_add_epi16(s0, s1); + ref_tmp += 16; + } + + s1 = _mm_srli_si128(s0, 8); + s0 = _mm_add_epi16(s0, s1); + vbuf[ht] = _mm_cvtsi128_si32(s0) >> norm_factor; + } +} diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c new file mode 100644 index 0000000000..b83b43122a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl) {
+  const int width = 4 << bwl;
+  assert(width % 16 == 0);
+
+  const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+  __m128i mean = _mm_setzero_si128();
+  __m128i sse = _mm_setzero_si128();
+
+  for (int i = 0; i < width; i += 16) {
+    const __m128i src_line = _mm_loadu_si128((const __m128i *)src);
+    const __m128i ref_line = _mm_loadu_si128((const __m128i *)ref);
+    const __m128i src_line2 = _mm_loadu_si128((const __m128i *)(src + 8));
+    const __m128i ref_line2 = _mm_loadu_si128((const __m128i *)(ref + 8));
+    __m128i diff = _mm_sub_epi16(ref_line, src_line);
+    const __m128i diff2 = _mm_sub_epi16(ref_line2, src_line2);
+    __m128i diff_sqr = _mm_madd_epi16(diff, diff);
+    const __m128i diff_sqr2 = _mm_madd_epi16(diff2, diff2);
+
+    diff = _mm_add_epi16(diff, diff2);
+    diff_sqr = _mm_add_epi32(diff_sqr, diff_sqr2);
+    sse = _mm_add_epi32(sse, diff_sqr);
+    mean = _mm_add_epi16(mean, diff);
+
+    src += 16;
+    ref += 16;
+  }
+
+  // m0 m1 m2 m3
+  mean = _mm_madd_epi16(mean, k_one_epi16);
+  // m0+m1 m2+m3 s0+s1 s2+s3
+  __m128i result = _mm_hadd_epi32(mean, sse);
+  // m0+m1+m2+m3 s0+s1+s2+s3 x x
+  result = _mm_add_epi32(result, _mm_bsrli_si128(result, 4));
+
+  // (mean * mean): dynamic range 31 bits.
+  const int mean_int = _mm_extract_epi32(result, 0);
+  const int sse_int = _mm_extract_epi32(result, 2);
+  const unsigned int mean_abs = abs(mean_int);
+  const int var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2));
+  return var;
+}
diff --git a/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h b/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 0000000000..85896e2768
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+  const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+  const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+  return _mm256_packs_epi32(a_low, a_high);
+}
+
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+  const __m256i a_lo = _mm256_mullo_epi16(a, one);
+  const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+  const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+  _mm256_storeu_si256((__m256i *)b, a_1);
+  _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+}
diff --git a/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h b/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h
new file mode 100644
index 0000000000..ff77760b6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+  const __m128i a_low = _mm_load_si128((const __m128i *)a);
+  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+}
+
+static INLINE void unpack_trans(__m128i a, __m128i *a_1, __m128i *a_2) {
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i a_hi = _mm_mulhi_epi16(a, one);
+  const __m128i a_lo = _mm_mullo_epi16(a, one);
+  *a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+  *a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+  __m128i a_1, a_2;
+  unpack_trans(a, &a_1, &a_2);
+  _mm_store_si128((__m128i *)(b), a_1);
+  _mm_store_si128((__m128i *)(b + 4), a_2);
+}
+// Stores the second result at an offset of 8 (instead of 4) so that the
+// output matches that of the AVX2 implementation; otherwise this is the same
+// as store_tran_low().
+static INLINE void store_tran_low_offset_4(__m128i a, tran_low_t *b) {
+  __m128i a_1, a_2;
+  unpack_trans(a, &a_1, &a_2);
+  _mm_store_si128((__m128i *)(b), a_1);
+  _mm_store_si128((__m128i *)(b + 8), a_2);
+}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 0000000000..e0289abe12
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom/aom_integer.h" + +#include "config/aom_dsp_rtcd.h" + +// To start out, just dispatch to the function using the 2D mask and +// pass mask stride as 0. This can be improved upon if necessary. + +void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, 0, w, h, 0, 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_hmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, + src1_8, src1_stride, mask, 0, w, h, 0, 0, + bd); +} +#endif diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c new file mode 100644 index 0000000000..dfbab324d0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c @@ -0,0 +1,1374 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <smmintrin.h>  // SSE4.1
+#include <immintrin.h>  // AVX2
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w16_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+    int shift) {
+  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+  const __m256i s0_0 = yy_loadu_256(src0);
+  const __m256i s1_0 = yy_loadu_256(src1);
+  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
+  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
+  res0_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+  res0_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+  __m256i res = _mm256_packus_epi16(res0, res0);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
+}
+
+static INLINE void blend_a64_d16_mask_w32_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+    const __m256i *v_maxval, int shift) {
+  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+  const __m256i s0_0 = yy_loadu_256(src0);
+  const __m256i s0_1 = yy_loadu_256(src0 + 16);
+  const __m256i s1_0 = yy_loadu_256(src1);
+  const __m256i s1_1 = yy_loadu_256(src1 + 16);
+  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
+  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
+  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
+                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
+  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
+                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
+  res0_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+  res0_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+  res1_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
+  res1_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
+  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
+  __m256i res = _mm256_packus_epi16(res0, res1);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  _mm256_storeu_si256((__m256i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int i = 0; i < h; ++i) {
+    const __m128i m = xx_loadu_128(mask);
+    const __m256i m0 = _mm256_cvtepu8_epi16(m);
+
+    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+                                shift);
+    mask +=
mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m = yy_loadu_256(mask + j); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)); + const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m256i m_i00 = yy_loadu_256(mask); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j); + const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t 
mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(m_ac); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + j); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j); + + const __m256i m_ac = + _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac)); + const __m256i m1 = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1)); + + 
blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const __m256i y_round_offset = _mm256_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + 
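+        // w == 4 and w == 8 blocks reuse the narrower SSE4.1 kernels; only
+        // w >= 16 is handled by the AVX2 paths below.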
aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } +} + +static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); + const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); + const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); + const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); + const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); + return v_res; +} + +static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = yy_loadu_256(src0); + const __m256i v_s1_b = yy_loadu_256(src1); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m256i v_p1_w = + _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), + _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); + const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +static INLINE void blend_a64_mask_sx_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m256i v_ral_b = yy_loadu_256(mask); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + + const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const 
uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); + const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); + const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rvsbh_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i 
v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sx_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xff); + do { + const __m256i v_rl_b = yy_loadu_256(mask); + const __m256i v_al_b = + _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); + + const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); + const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m256i v_al_b = + _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); + const __m256i v_ah_b = + _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); + + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, 
v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ra_b = yy_loadu_256(mask + c); + const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride); + const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += 
src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +static INLINE void blend_a64_mask_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_m0_b = yy_loadu_256(mask + c); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + do { + const __m128i v_m0_b = xx_loadu_128(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + default: + blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if 
(UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subw, subh); + } else { + if (subw & subh) { + blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subw) { + blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subh) { + blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else { + blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_avx2() +////////////////////////////////////////////////////////////////////////////// + +static INLINE void highbd_blend_a64_d16_mask_w4_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + // Load 4x u16 pixels from each of 4 rows from each source + const __m256i s0 = _mm256_set_epi64x(*(int64_t *)(src0 + 3 * src0_stride), + *(int64_t *)(src0 + 2 * src0_stride), + *(int64_t *)(src0 + 1 * src0_stride), + *(int64_t *)(src0 + 0 * src0_stride)); + const __m256i s1 = _mm256_set_epi64x(*(int64_t *)(src1 + 3 * src1_stride), + *(int64_t *)(src1 + 2 * src1_stride), + *(int64_t *)(src1 + 1 * src1_stride), + *(int64_t *)(src1 + 0 * src1_stride)); + // Generate the inverse mask + const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0); + + // Multiply each mask by the respective source + const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0); + const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0); + const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs); + const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes Later, packs does the same again which cancels this out with no need + // for a permute. 
The intermediate values being reordered makes no difference + + const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1); + const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1); + const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs); + const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m256i sumh = _mm256_add_epi32(mul0h, mul1h); + const __m256i suml = _mm256_add_epi32(mul0l, mul1l); + + const __m256i roundh = + _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift); + const __m256i roundl = + _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift); + + const __m256i pack = _mm256_packs_epi32(roundl, roundh); + const __m256i clip = + _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high); + + // _mm256_extract_epi64 doesn't exist on x86, so do it the old-fashioned way: + const __m128i cliph = _mm256_extracti128_si256(clip, 1); + xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8)); + xx_storel_64(dst + 2 * dst_stride, cliph); + const __m128i clipl = _mm256_castsi256_si128(clip); + xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8)); + xx_storel_64(dst + 0 * dst_stride, clipl); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + do { + // Load 8x u8 pixels from each of 4 rows of the mask, pad each to u16 + const __m128i mask08 = _mm_set_epi32(*(int32_t *)(mask + 3 * mask_stride), + *(int32_t *)(mask + 2 * mask_stride), + *(int32_t *)(mask + 1 * mask_stride), + *(int32_t *)(mask + 0 * mask_stride)); + const __m256i mask0 = _mm256_cvtepu8_epi16(mask08); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, &mask0, round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m0246 = + _mm256_set_epi64x(*(int64_t *)(mask + 6 * mask_stride), + *(int64_t *)(mask + 4 * mask_stride), + *(int64_t *)(mask + 2 * mask_stride), + *(int64_t *)(mask + 0 * mask_stride)); + const __m256i m1357 = + _mm256_set_epi64x(*(int64_t *)(mask + 7 * mask_stride), + *(int64_t *)(mask + 5 * mask_stride), + *(int64_t *)(mask + 3 * mask_stride), + *(int64_t *)(mask + 1 * mask_stride)); + const __m256i addrows = _mm256_adds_epu8(m0246, m1357); + const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b); + const __m256i mask0 = + _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, &mask0, 
round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 8x u16 pixels from each of 4 rows from each source + const __m256i s0a = + yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride); + const __m256i s0b = + yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride); + const __m256i s1a = + yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride); + const __m256i s1b = + yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride); + + // Generate inverse masks + const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); + const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply sources by respective masks + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes Later, packs does the same again which cancels this out with no need + // for a permute. The intermediate values being reordered makes no difference + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i sumal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl); + + // Divide down each result, with rounding + const __m256i roundah = + _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift); + const __m256i roundal = + _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift); + const __m256i roundbh = + _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift); + const __m256i roundbl = + _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift); + + // Pack each i32 down to an i16 with saturation, then clip to valid range + const __m256i packa = _mm256_packs_epi32(roundal, roundah); + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i packb = _mm256_packs_epi32(roundbl, roundbh); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store 8x u16 pixels to each 
of 4 rows in the destination + yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa); + yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + do { + // Load 8x u8 pixels from each of 4 rows in the mask + const __m128i mask0a8 = + _mm_set_epi64x(*(int64_t *)mask, *(uint64_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi64x(*(int64_t *)(mask + 2 * mask_stride), + *(int64_t *)(mask + 3 * mask_stride)); + const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 16x u8 pixels from each of 8 rows in the mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m02 = + yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride); + const __m256i m13 = + yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride); + const __m256i m0123 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b); + const __m256i mask_0a = + _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2); + const __m256i m46 = + yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride); + const __m256i m57 = + yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride); + const __m256i m4567 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b); + const __m256i mask_0b = + _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 16x pixels from each of 2 rows from each source + const __m256i s0a = yy_loadu_256(src0); + const __m256i s0b = yy_loadu_256(src0 + src0_stride); + const __m256i s1a = yy_loadu_256(src1); + const __m256i s1b = yy_loadu_256(src1 + src1_stride); + + // Calculate inverse masks + const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); + const 
__m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply each source by appropriate mask + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes Later, packs does the same again which cancels this out with no need + // for a permute. The intermediate values being reordered makes no difference + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i mulal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl); + + const __m256i resah = + _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift); + const __m256i resal = + _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift); + const __m256i resbh = + _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift); + const __m256i resbl = + _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m256i packa = _mm256_packs_epi32(resal, resah); + const __m256i packb = _mm256_packs_epi32(resbl, resbh); + + // Clip the values to the valid range + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store 16 pixels + yy_storeu_256(dst, clipa); + yy_storeu_256(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values from each of two rows and pad to u16 + const __m128i masks_a8 = xx_loadu_128(mask + j); + const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j); + const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride 
* 2; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 32x u8 alpha-mask values from each of four rows + // (saturating) add pairs of rows, then use madd to add adjacent values + // Finally, divide down each result with rounding + const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j); + const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j); + const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j); + const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j); + + const __m256i m01_8 = _mm256_adds_epu8(m0, m1); + const __m256i m23_8 = _mm256_adds_epu8(m2, m3); + + const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b); + const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b); + + const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2); + const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } +} + +void aom_highbd_blend_a64_d16_mask_avx2( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m256i v_round_offset = _mm256_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m256i clip_low = _mm256_setzero_si256(); + const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1); + const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + } else if (subw == 1 && subh == 1) { + switch 
(w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c new file mode 100644 index 0000000000..58a7345ec2 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c @@ -0,0 +1,1560 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" + +#include "aom_ports/mem.h" + +#include "aom_dsp/aom_dsp_common.h" + +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" + +#include "aom_dsp/x86/blend_sse4.h" + +#include "aom_dsp/x86/blend_mask_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h) { + (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h) { + (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_m0_b = xx_loadu_128(mask + c); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = 
_mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); + const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); 
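+ // With 2:1 vertical sub-sampling, each output row consumes two mask rows: the mask pointer below advances by 2 * mask_stride while dst, src0 and src1 advance by a single row.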
+ + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ra_b = xx_loadu_128(mask + c); + const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + 
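+ // 2x2 mask down-sampling, one output row at a time: the two mask rows are added bytewise (mask values never exceed AOM_BLEND_A64_MAX_ALPHA == 64, so the 8-bit add cannot overflow), the shuffle gathers even- and odd-indexed samples into the two 64-bit halves, and the widened halves are summed and rounded so each output mask value is (a0 + a1 + b0 + b1 + 2) >> 2 for horizontally adjacent samples a0, a1 and b0, b1 from the two rows.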
const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + (void)w; + + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ral_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c); + const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16); + const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b); + const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); + const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); + const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); + const __m128i v_rvsbl_w = + _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); + const __m128i v_rvsbh_w = + _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); + const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); + const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + typedef void (*blend_fn)( + uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h); + + // Dimensions are: width_index X subx X suby + static const blend_fn 
blend[3][2][2] = { + { // w % 16 == 0 + { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, + { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, + { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, + { // w == 8 + { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, + { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subw, subh); + } else { + blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0, + src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b10); +} + +static void blend_a64_mask_b12_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_m0_b = xx_loadl_64(mask + c); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, 
const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_r_b = xx_loadu_128(mask + 2 * c); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t 
src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadl_64(mask + c); + const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + 
src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 
+ c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// +void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, + int subw, int subh, int bd) { + typedef void (*blend_fn)( + uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h); + + // Dimensions are: bd_index X width_index X subw X subh + static const blend_fn blend[2][2][2][2] = { + { // bd == 8 or 10 + { // w % 8 == 0 + { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, + { blend_a64_mask_b10_sx_w8n_sse4_1, + blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, + { blend_a64_mask_b10_sx_w4_sse4_1, + blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, + { // bd == 12 + { // w % 8 == 0 + { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, + { blend_a64_mask_b12_sx_w8n_sse4_1, + blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, + { blend_a64_mask_b12_sx_w4_sse4_1, + blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, mask_stride, w, h, subw, + subh, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0]( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, w, h); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void blend_a64_d16_mask_w16_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, + const __m128i *v_maxval, int shift) { + const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); + const 
__m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); + const __m128i s0_0 = xx_loadu_128(src0); + const __m128i s0_1 = xx_loadu_128(src0 + 8); + const __m128i s1_0 = xx_loadu_128(src1); + const __m128i s1_1 = xx_loadu_128(src1 + 8); + __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), + _mm_unpacklo_epi16(*m0, max_minus_m0)); + __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), + _mm_unpackhi_epi16(*m0, max_minus_m0)); + __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), + _mm_unpacklo_epi16(*m1, max_minus_m1)); + __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), + _mm_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); + const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); + const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); + const __m128i res = _mm_packus_epi16(res0, res1); + + _mm_storeu_si128((__m128i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m = xx_loadu_128(mask + j); + const __m128i m0 = _mm_cvtepu8_epi16(m); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t 
src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); + const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); + const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); + const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m128i m0 = _mm_cvtepu8_epi16(m_ac); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 
+ mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } +} + +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_sse4_1() +////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const __m128i *mask_max) { + // Load 4 pixels from each of 4 rows from each source + const __m128i s0a = + _mm_set_epi64x(*(int64_t *)src0, *(int64_t *)(src0 + src0_stride)); + const __m128i s0b = _mm_set_epi64x(*(int64_t *)(src0 + 2 * src0_stride), + *(int64_t *)(src0 + 3 * src0_stride)); + const __m128i s1a = + _mm_set_epi64x(*(int64_t *)(src1), *(int64_t *)(src1 + src1_stride)); + const __m128i s1b = _mm_set_epi64x(*(int64_t *)(src1 + 2 * src1_stride), + *(int64_t *)(src1 + 3 * src1_stride)); + + // Generate the inverse masks + const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b); + + // Multiply each mask by the respective source + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); + const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, 
mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storel_64(dst, _mm_srli_si128(clipa, 8)); + xx_storel_64(dst + dst_stride, clipa); + xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8)); + xx_storel_64(dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + do { + const __m128i mask0a8 = + _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride), + *(int32_t *)(mask + 3 * mask_stride)); + const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8); + const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8); + + highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask), + *(int64_t *)(mask + 2 * mask_stride)); + const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride), + *(int64_t *)(mask + 3 * mask_stride)); + const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b); + const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2); + const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride), + *(int64_t *)(mask + 6 * 
mask_stride)); + const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride), + *(int64_t *)(mask + 7 * mask_stride)); + const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b); + const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const __m128i *max_mask) { + // Load 8x pixels from each of 2 rows from each source + const __m128i s0a = xx_loadu_128(src0); + const __m128i s0b = xx_loadu_128(src0 + src0_stride); + const __m128i s1a = xx_loadu_128(src1); + const __m128i s1b = xx_loadu_128(src1 + src1_stride); + + // Generate inverse masks + const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b); + + // Multiply sources by respective masks + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); + const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storeu_128(dst, clipa); + xx_storeu_128(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE 
*src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + do { + const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask)); + const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride)); + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 2; + } while (h -= 2); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + const __m128i mask_thisrowa = xx_loadu_128(mask); + const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride); + const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride); + const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride); + const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa); + const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb); + const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b); + const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b); + const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2); + const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2); + + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa, + &mask_sb, round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } while (h -= 2); +} + +static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1( + uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *round_offset, int shift, const __m128i *mask0l, + const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high, + const __m128i *mask_max) { + // Load 16x u16 pixels for this row from each src + const __m128i s0l = xx_loadu_128(src0); + const __m128i s0h = xx_loadu_128(src0 + 8); + const __m128i s1l = xx_loadu_128(src1); + const __m128i s1h = xx_loadu_128(src1 + 8); + + // Calculate inverse masks + const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h); + const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l); + + const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h); + const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h); + const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs); + const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs); + + const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h); + const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h); + const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs); + const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m128i mulhh = _mm_add_epi32(mul0h, mul1h); + const __m128i mulhl = _mm_add_epi32(mul0l, mul1l); + + const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l); + const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l); + 
const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs); + const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs); + + const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l); + const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l); + const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs); + const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs); + + const __m128i mullh = _mm_add_epi32(mul2h, mul3h); + const __m128i mulll = _mm_add_epi32(mul2l, mul3l); + + const __m128i reshh = + _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift); + const __m128i reshl = + _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift); + const __m128i reslh = + _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift); + const __m128i resll = + _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m128i packh = _mm_packs_epi32(reshl, reshh); + const __m128i packl = _mm_packs_epi32(resll, reslh); + + // Clip the values to the valid range + const __m128i cliph = + _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high); + const __m128i clipl = + _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high); + + // Store 16 pixels + xx_storeu_128(dst, clipl); + xx_storeu_128(dst + 8, cliph); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values and pad to u16 + const __m128i masks_u8 = xx_loadu_128(mask + j); + const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8); + const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8)); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h, + clip_low, clip_high, mask_max); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h, + clip_low, clip_high, mask_max); + } + dst 
+= dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride * 2; + } +} + +void aom_highbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m128i clip_low = _mm_setzero_si128(); + const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1); + const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c new file mode 100644 index 0000000000..75fb1c5a94 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE4.1 + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, + uint32_t src0_stride, + const uint8_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 16) { + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h); + + // Dimension: width_index + static const 
blend_fn blend[9] = { + blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 + aom_blend_a64_vmask_c, // w == 1 + aom_blend_a64_vmask_c, // w == 2 + NULL, // INVALID + blend_a64_vmask_w4_sse4_1, // w == 4 + NULL, // INVALID + NULL, // INVALID + NULL, // INVALID + blend_a64_vmask_w8_sse4_1, // w == 8 + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, + h); +} + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_vmask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b10); +} + +static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b12); +} + +static INLINE void blend_a64_vmask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 8) { + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h, blend_8_b10); +} + +static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h, blend_8_b12); +} + 
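/*
 * Editorial sketch (not part of the upstream patch): every b10/b12 kernel in
 * this block implements the same per-row alpha blend, with a single 6-bit
 * mask value weighting an entire row. A minimal scalar equivalent is shown
 * below; the helper name is hypothetical, and AOM_BLEND_A64_MAX_ALPHA (64),
 * AOM_BLEND_A64_ROUND_BITS (6) and ROUND_POWER_OF_TWO() come from the
 * aom_dsp headers already included above.
 */
static void blend_a64_vmask_scalar_sketch(uint16_t *dst, uint32_t dst_stride,
                                          const uint16_t *src0,
                                          uint32_t src0_stride,
                                          const uint16_t *src1,
                                          uint32_t src1_stride,
                                          const uint8_t *mask, int w, int h) {
  for (int i = 0; i < h; ++i) {
    const int m0 = mask[i];                       // one mask value per row
    const int m1 = AOM_BLEND_A64_MAX_ALPHA - m0;  // complementary weight
    for (int j = 0; j < w; ++j) {
      // Weighted average with rounding: (m0 * s0 + m1 * s1 + 32) >> 6.
      dst[i * dst_stride + j] = (uint16_t)ROUND_POWER_OF_TWO(
          m0 * src0[i * src0_stride + j] + m1 * src1[i * src1_stride + j],
          AOM_BLEND_A64_ROUND_BITS);
    }
  }
}
/*
 * The b10/b12 split above exists because 10-bit samples keep the weighted sum
 * within 16 bits (64 * 1023 = 65472), so the b10 kernels can stay on
 * _mm_mullo_epi16/_mm_add_epi16, whereas 12-bit samples overflow 16 bits and
 * the b12 kernels widen to 32 bits with _mm_madd_epi16 before rounding.
 */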
+////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_highbd_blend_a64_vmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h); + + // Dimensions are: bd_index X width_index + static const blend_fn blend[2][2] = { + { + // bd == 8 or 10 + blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b10_w4_sse4_1, // w == 4 + }, + { + // bd == 12 + blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b12_w4_sse4_1, // w == 4 + } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, w, h, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h new file mode 100644 index 0000000000..c071fdcfc4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#include // SSE4.1 + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w4_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadl_64(src0); + const __m128i s1 = xx_loadl_64(src1); + const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); + const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); + const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); + const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); + const __m128i res_d = _mm_srai_epi32(res_c, shift); + const __m128i res_e = _mm_packs_epi32(res_d, res_d); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + xx_storel_32(dst, res); +} + +static INLINE void blend_a64_d16_mask_w8_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadu_128(src0); + const __m128i s1 = xx_loadu_128(src1); + __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), + _mm_unpacklo_epi16(*m, max_minus_m)); + __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), + _mm_unpackhi_epi16(*m, max_minus_m)); + res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); + res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); + const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + _mm_storel_epi64((__m128i *)(dst), res); +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_32(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_64(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const 
uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_i1 = xx_loadu_128(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h new file mode 100644 index 0000000000..8d9b325101 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_sse4.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_SSE4_H_ + +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +static const uint8_t g_blend_a64_mask_shuffle[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, +}; + +////////////////////////////////////////////////////////////////////////////// +// Common kernels +////////////////////////////////////////////////////////////////////////////// + +static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_w, const __m128i *v_m1_w) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_w, const __m128i *v_m1_w) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadu_128(src0); + const __m128i v_s1_b = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), + _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w); + +static INLINE 
__m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + // Interleave + const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); + + // Scale + const __m128i v_ssum_d = + _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + // Interleave + const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); + const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); + const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); + + // Scale + const __m128i v_ssuml_d = + _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssumh_d = + _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c b/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c new file mode 100644 index 0000000000..fdf7de3f4c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" + +static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum, + int *x_sum, int64_t *x2_sum) { + __m256i sum_buffer, sse_buffer; + __m128i out_buffer; + + // Accumulate the various elements of register into first element. + sum_buffer = _mm256_permute2f128_si256(regx_sum, regx_sum, 1); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 8)); + regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 4)); + + sse_buffer = _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + regx2_sum = _mm256_add_epi64(regx2_sum, _mm256_srli_si256(regx2_sum, 8)); + + out_buffer = _mm256_castsi256_si128(regx_sum); + *x_sum += _mm_cvtsi128_si32(out_buffer); + out_buffer = _mm256_castsi256_si128(regx2_sum); +#if AOM_ARCH_X86_64 + *x2_sum += _mm_cvtsi128_si64(out_buffer); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, out_buffer); + *x2_sum += tmp; + } +#endif +} + +static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum) { + __m128i row1, row2, row3; + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int j = 0; j < (bh >> 2); ++j) { + // Load 4 rows at a time. + row1 = _mm_loadl_epi64((__m128i const *)(data_tmp)); + row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + stride)); + row1 = _mm_unpacklo_epi64(row1, row2); + row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + 2 * stride)); + row3 = _mm_loadl_epi64((__m128i const *)(data_tmp + 3 * stride)); + row2 = _mm_unpacklo_epi64(row2, row3); + load_pixels = + _mm256_insertf128_si256(_mm256_castsi128_si256(row1), row2, 1); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += 4 * stride; + } + + // To prevent 32-bit variable overflow, unpack the elements to 64-bit. + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum) { + __m128i load_128bit, load_next_128bit; + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int j = 0; j < (bh >> 1); ++j) { + // Load 2 rows at a time. 
+ load_128bit = _mm_loadu_si128((__m128i const *)(data_tmp)); + load_next_128bit = _mm_loadu_si128((__m128i const *)(data_tmp + stride)); + load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(load_128bit), + load_next_128bit, 1); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += 2 * stride; + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum, + int loop_count) { + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int i = 0; i < loop_count; ++i) { + data_tmp = data + 16 * i; + for (int j = 0; j < bh; ++j) { + load_pixels = _mm256_lddqu_si256((__m256i const *)(data_tmp)); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += stride; + } + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + + if ((bh & 3) == 0) { + switch (bw) { + // For smaller block widths, compute multiple rows simultaneously. + case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break; + case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break; + case 16: + case 32: + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + break; + case 64: + // 32-bit variables will overflow for 64 rows at a single time, so + // compute 32 rows at a time. + if (bh <= 32) { + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + } else { + sse_sum_wd16_avx2(data, stride, 32, x_sum, x2_sum, bw >> 4); + sse_sum_wd16_avx2(data + 32 * stride, stride, 32, x_sum, x2_sum, + bw >> 4); + } + break; + + default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } + } else { + aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c b/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c new file mode 100644 index 0000000000..bf89427872 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum) { + const int16_t *data_tmp = data; + __m128i temp_buffer1, temp_buffer2; + __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer; + __m128i one = _mm_set1_epi16(1); + __m128i regx_sum = _mm_setzero_si128(); + __m128i regx2_sum = regx_sum; + + for (int j = 0; j < (bh >> 1); ++j) { + // Load 2 rows (8 pixels) at a time. + load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp)); + load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride)); + load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi); + sum_buffer = _mm_madd_epi16(load_pixels_low, one); + sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low); + regx_sum = _mm_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum); + data_tmp += 2 * stride; + } + + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8)); + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4)); + *x_sum = _mm_cvtsi128_si32(regx_sum); + temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128()); + temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128()); + regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2); + regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8)); +#if AOM_ARCH_X86_64 + *x2_sum += _mm_cvtsi128_si64(regx2_sum); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, regx2_sum); + *x2_sum += tmp; + } +#endif +} + +static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum, + int loop_cycles) { + const int16_t *data_tmp; + __m128i temp_buffer1, temp_buffer2; + __m128i one = _mm_set1_epi16(1); + __m128i regx_sum = _mm_setzero_si128(); + __m128i regx2_sum = regx_sum; + __m128i load_pixels, sum_buffer, sse_buffer; + + for (int i = 0; i < loop_cycles; ++i) { + data_tmp = data + (8 * i); + for (int j = 0; j < bh; ++j) { + // Load 1 row (8-pixels) at a time. 
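The tail of sse_sum_wd4_sse2 above uses the standard shift-and-add horizontal reduction on its 32-bit accumulator. A small self-contained illustration of that pattern (SSE2 only, illustrative rather than part of the patch):

#include <emmintrin.h>
#include <stdio.h>

// Fold the upper half onto the lower half twice, then read lane 0 --
// the same reduction applied to regx_sum above.
static int hsum_epi32(__m128i v) {
  v = _mm_add_epi32(v, _mm_srli_si128(v, 8));  // {v0+v2, v1+v3, v2, v3}
  v = _mm_add_epi32(v, _mm_srli_si128(v, 4));  // lane 0 = v0+v1+v2+v3
  return _mm_cvtsi128_si32(v);
}

int main(void) {
  printf("%d\n", hsum_epi32(_mm_set_epi32(4, 3, 2, 1)));  // prints 10
  return 0;
}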
+ load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp)); + sum_buffer = _mm_madd_epi16(load_pixels, one); + sse_buffer = _mm_madd_epi16(load_pixels, load_pixels); + regx_sum = _mm_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum); + data_tmp += stride; + } + } + + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8)); + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4)); + *x_sum += _mm_cvtsi128_si32(regx_sum); + temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128()); + temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128()); + regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2); + regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8)); +#if AOM_ARCH_X86_64 + *x2_sum += _mm_cvtsi128_si64(regx2_sum); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, regx2_sum); + *x2_sum += tmp; + } +#endif +} + +// This functions adds SSE2 Support for the functions 'get_blk_sse_sum_c' +void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + + if ((bh & 3) == 0) { + switch (bw) { + case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break; + case 8: + case 16: + sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); + break; + // For widths 32 and 64, the registers may overflow. So compute + // partial widths at a time. + case 32: + if (bh <= 32) { + sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); + break; + } else { + sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3); + sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum, + bw >> 3); + break; + } + + case 64: + if (bh <= 16) { + sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); + break; + } else { + for (int i = 0; i < bh; i += 16) + sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum, + bw >> 3); + break; + } + + default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } + } else { + aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h new file mode 100644 index 0000000000..96fe4ebb67 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/common_avx2.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
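A minimal consistency check for the SSE2 path above could look like the following. The symbol names match the definition in this file and the usual rtcd declaration of the C fallback; the harness assumes both are available to link against and is a sketch only.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh,
                           int *x_sum, int64_t *x2_sum);
void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
                              int *x_sum, int64_t *x2_sum);

int main(void) {
  enum { BW = 16, BH = 16, STRIDE = 32 };
  static int16_t data[BH * STRIDE];
  for (int i = 0; i < BH * STRIDE; ++i)
    data[i] = (int16_t)(rand() % 1024 - 512);  // small signed residuals
  int sum_c, sum_sse2;
  int64_t sse_c, sse_sse2;
  aom_get_blk_sse_sum_c(data, STRIDE, BW, BH, &sum_c, &sse_c);
  aom_get_blk_sse_sum_sse2(data, STRIDE, BW, BH, &sum_sse2, &sse_sse2);
  printf("match: %d\n", sum_c == sum_sse2 && sse_c == sse_sse2);
  return 0;
}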
+ */ + +#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_ +#define AOM_AOM_DSP_X86_COMMON_AVX2_H_ + +#include + +#include "config/aom_config.h" + +// Note: in and out could have the same value +static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { + __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); + __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); + __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); + __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); + __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); + __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); + __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); + __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); + + __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); + __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); + __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); + __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); + __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); + __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); + __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); + __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); + + // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b + // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f + // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b + // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f + // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b + // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f + // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b + // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f + + // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b + // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f + // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb + // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf + // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db + // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df + // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb + // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff + + __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); + __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); + __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); + __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); + __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); + __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); + __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); + __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); + + __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); + __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); + __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); + __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); + __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); + __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); + __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); + __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); + + // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 + // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b + // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d + // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f + // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 + // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b + // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d + // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f + + // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 + // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb + // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd + // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf + // c0 d0 e0 f0 c1 d1 e1 
f1 c8 d8 e8 f8 c9 d9 e9 f9 + // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb + // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd + // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff + + tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + + tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); + tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); + tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); + tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); + tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); + tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); + tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); + tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); + + // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a + // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b + // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c + // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d + // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e + // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f + + // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 + // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 + // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa + // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb + // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc + // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd + // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe + // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff + + out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 + out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 + out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); + out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); + out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); + out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); + out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); + out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); + + out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); + out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); + out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); + out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); + out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); + out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); + out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); + out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); +} +#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h new file mode 100644 index 0000000000..4ca214f469 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_H_ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, const int16_t *filter); + +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void aom_convolve8_##name##_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + (void)filter_x; \ + (void)x_step_q4; \ + (void)filter_y; \ + (void)y_step_q4; \ + assert((-128 <= filter[3]) && (filter[3] <= 127)); \ + assert(step_q4 == 16); \ + if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \ + (filter[2] | filter[5])) { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + if (w) { \ + aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ + x_step_q4, filter_y, y_step_q4, w, h); \ + } \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, int bd); + +#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void aom_highbd_convolve8_##name##_##opt( \ + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (((filter[0] | 
filter[1] | filter[6] | filter[7]) == 0) && \ + (filter[2] | filter[5])) { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + aom_highbd_convolve8_##name##_c( \ + CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \ + dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } \ + } +#endif // CONFIG_AV1_HIGHBITDEPTH + +#endif // AOM_AOM_DSP_X86_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h new file mode 100644 index 0000000000..f5a382ce4e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h @@ -0,0 +1,922 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
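The tap-selection logic inside FUN_CONV_1D and HIGH_FUN_CONV_1D above reduces to a simple classification of the 8-entry subpel filter. A standalone sketch of just that decision, mirroring the conditionals in the macros (illustrative, not part of the patch):

#include <stdint.h>

// Outer taps all zero but an inner tap set -> the 4-tap kernels suffice;
// any of taps 0..2 set -> full 8-tap kernels; otherwise only the centre
// pair matters and the 2-tap kernels are picked.
static int effective_taps(const int16_t filter[8]) {
  if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&
      (filter[2] | filter[5]))
    return 4;
  if (filter[0] | filter[1] | filter[2]) return 8;
  return 2;
}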
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ + +#include + +#include "aom_ports/mem.h" + +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +// filters for 16 +DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, + 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, + 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(32, static const uint8_t, + filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \ + for (i = 0; i < (im_h - 2); i += 2) { \ + __m256i data = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + data = _mm256_inserti128_si256( \ + data, \ + _mm_loadu_si128( \ + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ + 1); \ + __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } \ + __m256i data_1 = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); \ + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + +#define CONVOLVE_SR_VERTICAL_FILTER_4TAP \ + __m256i s[6]; \ + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + \ + s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ + s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ + s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ + s[4] = 
_mm256_unpackhi_epi16(src_2, src_3); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ + const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ + s[2] = _mm256_unpacklo_epi16(s4, s5); \ + s[5] = _mm256_unpackhi_epi16(s4, s5); \ + \ + __m256i res_a = convolve_4tap(s, coeffs_v + 1); \ + __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); \ + \ + res_a = \ + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ + res_b = \ + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ + \ + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ + if (w - j > 4) { \ + _mm_storel_epi64(p_0, res_0); \ + _mm_storel_epi64(p_1, res_1); \ + } else if (w == 4) { \ + xx_storel_32(p_0, res_0); \ + xx_storel_32(p_1, res_1); \ + } else { \ + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[3] = s[4]; \ + s[4] = s[5]; \ + } + +#define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \ + for (i = 0; i < (im_h - 2); i += 2) { \ + __m256i data = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + data = _mm256_inserti128_si256( \ + data, \ + _mm_loadu_si128( \ + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ + 1); \ + \ + __m256i res = convolve_lowbd_x_6tap(data, coeffs_h, filt); \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } \ + \ + __m256i data_1 = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + \ + __m256i res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt); \ + \ + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + +#define CONVOLVE_SR_VERTICAL_FILTER_6TAP \ + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + \ + __m256i s[8]; \ + s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ + s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ + \ + s[3] = _mm256_unpackhi_epi16(src_0, src_1); \ + s[4] = _mm256_unpackhi_epi16(src_2, src_3); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \ + \ + s[2] = _mm256_unpacklo_epi16(s6, s7); \ + s[5] = _mm256_unpackhi_epi16(s6, s7); \ + \ + __m256i res_a = convolve_6tap(s, coeffs_v); \ + __m256i res_b = 
convolve_6tap(s + 3, coeffs_v); \ + \ + res_a = \ + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ + res_b = \ + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ + \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + \ + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ + \ + const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ + \ + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ + if (w - j > 4) { \ + _mm_storel_epi64(p_0, res_0); \ + _mm_storel_epi64(p_1, res_1); \ + } else if (w == 4) { \ + xx_storel_32(p_0, res_0); \ + xx_storel_32(p_1, res_1); \ + } else { \ + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + \ + s[3] = s[4]; \ + s[4] = s[5]; \ + } + +#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < (im_h - 2); i += 2) { \ + __m256i data = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + data = _mm256_inserti128_si256( \ + data, \ + _mm_loadu_si128( \ + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ + 1); \ + \ + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } \ + \ + __m256i data_1 = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + \ + __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \ + \ + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + +#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + __m256i s[8]; \ + s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ + s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ + s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ + \ + s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ + s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ + s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + __m256i res_a = convolve(s, coeffs_v); \ + __m256i res_b = convolve(s + 4, coeffs_v); \ + \ + res_a = \ + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ + res_b = \ + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), 
sum_shift_v); \ + \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + \ + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ + \ + const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ + \ + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ + if (w - j > 4) { \ + _mm_storel_epi64(p_0, res_0); \ + _mm_storel_epi64(p_1, res_1); \ + } else if (w == 4) { \ + xx_storel_32(p_0, res_0); \ + xx_storel_32(p_1, res_1); \ + } else { \ + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } + +#define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP \ + const __m256i v_zero = _mm256_setzero_si256(); \ + __m256i s[12]; \ + if (w <= 4) { \ + for (i = 0; i < im_h; i += 2) { \ + const __m256i data = _mm256_permute2x128_si256( \ + _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ + _mm256_castsi128_si256(_mm_loadu_si128( \ + (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))), \ + 0x20); \ + const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ + const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \ + const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ + const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ + \ + const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ + const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ + \ + s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ + s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ + s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ + s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ + s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ + s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ + \ + const __m256i res_lo = convolve_12taps(s, coeffs_h); \ + \ + __m256i res_32b_lo = _mm256_sra_epi32( \ + _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \ + __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ + const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0); \ + const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1); \ + if (w > 2) { \ + _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0); \ + _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride], \ + res_1); \ + } else { \ + uint32_t horiz_2; \ + horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0); \ + im_block[i * im_stride] = (uint16_t)horiz_2; \ + im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ + horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1); \ + im_block[i * im_stride + im_stride] = (uint16_t)horiz_2; \ + im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16); \ + } \ + } \ + } else { \ + for (i = 0; i < im_h; i++) { \ + const __m256i data = _mm256_permute2x128_si256( \ + _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \ + _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \ + 0x20); \ + const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \ + const __m256i s_16hi = 
_mm256_unpackhi_epi8(data, v_zero); \ + \ + const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \ + const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \ + \ + const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \ + const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \ + \ + s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \ + s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \ + s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \ + s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \ + s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \ + s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \ + \ + const __m256i res_lo = convolve_12taps(s, coeffs_h); \ + \ + __m256i res_32b_lo = _mm256_sra_epi32( \ + _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \ + \ + __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \ + _mm_store_si128((__m128i *)&im_block[i * im_stride], \ + _mm256_extracti128_si256( \ + _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0)); \ + } \ + } + +#define CONVOLVE_SR_VERTICAL_FILTER_12TAP \ + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \ + __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \ + __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \ + __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \ + \ + s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ + s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ + s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ + s[3] = _mm256_unpacklo_epi16(src_6, src_7); \ + s[4] = _mm256_unpacklo_epi16(src_8, src_9); \ + \ + s[6] = _mm256_unpackhi_epi16(src_0, src_1); \ + s[7] = _mm256_unpackhi_epi16(src_2, src_3); \ + s[8] = _mm256_unpackhi_epi16(src_4, src_5); \ + s[9] = _mm256_unpackhi_epi16(src_6, src_7); \ + s[10] = _mm256_unpackhi_epi16(src_8, src_9); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \ + \ + s[5] = _mm256_unpacklo_epi16(s6, s7); \ + s[11] = _mm256_unpackhi_epi16(s6, s7); \ + \ + __m256i res_a = convolve_12taps(s, coeffs_v); \ + __m256i res_b = convolve_12taps(s + 6, coeffs_v); \ + \ + res_a = \ + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ + res_b = \ + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ + \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + \ + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ + \ + const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ + \ + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ + __m128i *const p_1 = 
(__m128i *)&dst[i * dst_stride + j + dst_stride]; \ + if (w - j > 4) { \ + _mm_storel_epi64(p_0, res_0); \ + _mm_storel_epi64(p_1, res_1); \ + } else if (w == 4) { \ + xx_storel_32(p_0, res_0); \ + xx_storel_32(p_1, res_1); \ + } else { \ + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + s[3] = s[4]; \ + s[4] = s[5]; \ + \ + s[6] = s[7]; \ + s[7] = s[8]; \ + s[8] = s[9]; \ + s[9] = s[10]; \ + s[10] = s[11]; \ + } + +#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ + do { \ + for (i = 0; i < im_h; i += 2) { \ + __m256i data = \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ + if (i + 1 < im_h) \ + data = _mm256_inserti128_si256( \ + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ + src_h += (src_stride << 1); \ + __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ + \ + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \ + round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } \ + } while (0) + +#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ + do { \ + __m256i s[8]; \ + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + s[0] = _mm256_unpacklo_epi16(s0, s1); \ + s[1] = _mm256_unpacklo_epi16(s2, s3); \ + s[2] = _mm256_unpacklo_epi16(s4, s5); \ + \ + s[4] = _mm256_unpackhi_epi16(s0, s1); \ + s[5] = _mm256_unpackhi_epi16(s2, s3); \ + s[6] = _mm256_unpackhi_epi16(s4, s5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = \ + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = \ + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + const __m256i res_a = convolve(s, coeffs_y); \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + \ + if (w - j > 4) { \ + const __m256i res_b = convolve(s + 4, coeffs_y); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = \ + load_line2_avx2(&dst[i * dst_stride + j], \ + &dst[i * dst_stride + j + dst_stride]); \ + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ + &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = \ + _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ + _mm_storel_epi64( \ + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ + } else { \ + const __m128i res_0 = 
_mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } else { \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = \ + load_line2_avx2(&dst[i * dst_stride + j], \ + &dst[i * dst_stride + j + dst_stride]); \ + \ + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \ + &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = \ + _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ + *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ + _mm_cvtsi128_si32(res_1); \ + \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } \ + } while (0) + +static INLINE void prepare_coeffs_lowbd( + const InterpFilterParams *const filter_params, const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *const filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); + + // right shift all filter co-efficients by 1 to reduce the bits required. + // This extra right shift will be taken care of at the end while rounding + // the result. + // Since all filter co-efficients are even, this change will not affect the + // end result + assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), + _mm_set1_epi16((short)0xffff))); + + const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE void prepare_coeffs_6t_lowbd( + const InterpFilterParams *const filter_params, const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *const filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); + + // right shift all filter co-efficients by 1 to reduce the bits required. + // This extra right shift will be taken care of at the end while rounding + // the result. 
+ // Since all filter co-efficients are even, this change will not affect the + // end result + assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), + _mm_set1_epi16((int16_t)0xffff))); + + const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); + + // coeffs 1 2 1 2 1 2 1 2 + coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u)); + // coeffs 3 4 3 4 3 4 3 4 + coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u)); + // coeffs 5 6 5 6 5 6 5 6 + coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au)); +} + +static INLINE void prepare_coeffs_6t( + const InterpFilterParams *const filter_params, const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + + const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1)); + const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); + + // coeffs 1 2 1 2 1 2 1 2 + coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); + // coeffs 3 4 3 4 3 4 3 4 + coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 5 6 5 6 5 6 5 6 + coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); +} + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + + const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); +} + +static INLINE void prepare_coeffs_12taps( + const InterpFilterParams *const filter_params, const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + + __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); + __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); + // coeffs 8 9 10 11 0 0 0 0 + coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8)); + coeff = _mm256_broadcastq_epi64(coeff_8); + coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 8 9 8 9 8 9 8 9 + coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11.. 
10 11 +} + +static INLINE __m256i convolve_lowbd(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), + _mm256_add_epi16(res_23, res_67)); + + return res; +} + +static INLINE __m256i convolve_lowbd_6tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = + _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23); + + return res; +} + +static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(res_45, res_23); + + return res; +} + +static INLINE __m256i convolve_6tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); + + const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2); + + return res; +} + +static INLINE __m256i convolve_12taps(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); + const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]); + const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]); + + const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), + _mm256_add_epi32(res_2, res_3)); + const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1); + + return res; +} + +static INLINE __m256i convolve(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); + + const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), + _mm256_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m256i convolve_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); + + const __m256i res = _mm256_add_epi32(res_1, res_2); + return res; +} + +static INLINE __m256i convolve_lowbd_x(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[4]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + s[2] = _mm256_shuffle_epi8(data, filt[2]); + s[3] = _mm256_shuffle_epi8(data, filt[3]); + + return convolve_lowbd(s, coeffs); +} + +static INLINE __m256i convolve_lowbd_x_6tap(const 
__m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[4]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + s[2] = _mm256_shuffle_epi8(data, filt[2]); + + return convolve_lowbd_6tap(s, coeffs); +} + +static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[2]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + + return convolve_lowbd_4tap(s, coeffs); +} + +static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, + const __m256i *const res, + const int do_average) { + __m256i d; + if (do_average) { + d = _mm256_load_si256((__m256i *)dst); + d = _mm256_add_epi32(d, *res); + d = _mm256_srai_epi32(d, 1); + } else { + d = *res; + } + _mm256_store_si256((__m256i *)dst, d); +} + +static INLINE __m256i comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt, + const int use_dist_wtd_comp_avg) { + __m256i res; + if (use_dist_wtd_comp_avg) { + const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); + const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); + + const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm256_packs_epi32(res_lo, res_hi); + } else { + const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned, + const __m256i *const offset_const, + const __m256i *const round_const, + const int round_shift) { + const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi16( + _mm256_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt0, + const __m256i *const wt1, + const int use_dist_wtd_comp_avg) { + __m256i res; + if (use_dist_wtd_comp_avg) { + const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); + const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); + const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); + res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi32(wt_res, 1); + } + return res; +} + +static INLINE __m256i highbd_convolve_rounding( + const __m256i *const res_unsigned, const __m256i *const offset_const, + const __m256i *const round_const, const int round_shift) { + const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi32( + _mm256_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h new file mode 100644 index 0000000000..9e8662af46 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(d, *res); + d = _mm_srai_epi32(d, 1); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +static INLINE void prepare_coeffs_12tap(const InterpFilterParams *filter_params, + int subpel_q4, + __m128i *coeffs /* [6] */) { + const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + + __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + coeffs[0] = _mm_shuffle_epi32(coeffs_y, 0); // coeffs 0 1 0 1 0 1 0 1 + coeffs[1] = _mm_shuffle_epi32(coeffs_y, 85); // coeffs 2 3 2 3 2 3 2 3 + coeffs[2] = _mm_shuffle_epi32(coeffs_y, 170); // coeffs 4 5 4 5 4 5 4 5 + coeffs[3] = _mm_shuffle_epi32(coeffs_y, 255); // coeffs 6 7 6 7 6 7 6 7 + + coeffs_y = _mm_loadl_epi64((__m128i *)(y_filter + 8)); + + coeffs[4] = _mm_shuffle_epi32(coeffs_y, 0); // coeffs 8 9 8 9 8 9 8 9 + coeffs[5] = + _mm_shuffle_epi32(coeffs_y, 85); // coeffs 10 11 10 11 10 11 10 11 +} + +static INLINE __m128i convolve_12tap(const __m128i *s, const __m128i *coeffs) { + const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]); + const __m128i d4 = _mm_madd_epi16(s[4], coeffs[4]); + const __m128i d5 = _mm_madd_epi16(s[5], coeffs[5]); + const __m128i d_0123 = + _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3)); + const __m128i d = _mm_add_epi32(_mm_add_epi32(d4, d5), d_0123); + return d; +} + +static INLINE __m128i convolve_lo_x_12tap(const __m128i *s, + const __m128i *coeffs, + const __m128i zero) { + __m128i ss[6]; + ss[0] = _mm_unpacklo_epi8(s[0], zero); // 0 1 1 2 2 3 3 4 + ss[1] = _mm_unpacklo_epi8(s[1], zero); // 2 3 3 4 4 5 5 6 + ss[2] = _mm_unpacklo_epi8(s[2], zero); // 4 5 5 6 6 7 7 8 + ss[3] = _mm_unpacklo_epi8(s[3], zero); // 6 7 7 8 8 9 9 10 + ss[4] = _mm_unpackhi_epi8(s[2], zero); // 8 9 9 10 10 11 11 12 + ss[5] = _mm_unpackhi_epi8(s[3], zero); // 10 11 11 12 12 13 13 14 + return convolve_12tap(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y_12tap(const __m128i *s, + const __m128i *coeffs) { + __m128i ss[6]; + const __m128i zero = _mm_setzero_si128(); + ss[0] = _mm_unpacklo_epi8(s[0], zero); + ss[1] = _mm_unpacklo_epi8(s[2], zero); + ss[2] = _mm_unpacklo_epi8(s[4], zero); + ss[3] = _mm_unpacklo_epi8(s[6], zero); + ss[4] = _mm_unpacklo_epi8(s[8], zero); + ss[5] = _mm_unpacklo_epi8(s[10], zero); + return convolve_12tap(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y_12tap(const __m128i *s, + const __m128i *coeffs) { + __m128i ss[6]; + const __m128i zero = _mm_setzero_si128(); + ss[0] = _mm_unpackhi_epi8(s[0], zero); + 
ss[1] = _mm_unpackhi_epi8(s[2], zero); + ss[2] = _mm_unpackhi_epi8(s[4], zero); + ss[3] = _mm_unpackhi_epi8(s[6], zero); + ss[4] = _mm_unpackhi_epi8(s[8], zero); + ss[5] = _mm_unpackhi_epi8(s[10], zero); + return convolve_12tap(ss, coeffs); +} +#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h new file mode 100644 index 0000000000..36b7d62b98 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ + +#include "config/aom_scale_rtcd.h" + +// Note: +// This header file should be put below any x86 intrinsics head file +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeff = _mm_loadu_si128((__m128i *)filter); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]); + + const __m128i res = + _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i comp_avg(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const 
__m128i *const wt, + const int use_dist_wtd_avg) { + __m128i res; + if (use_dist_wtd_avg) { + const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt); + const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt); + + const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm_packs_epi32(res_lo, res_hi); + } else { + const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned); + res = _mm_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned, + const __m128i *const offset_const, + const __m128i *const round_const, + const int round_shift) { + const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m128i highbd_convolve_rounding_sse2( + const __m128i *const res_unsigned, const __m128i *const offset_const, + const __m128i *const round_const, const int round_shift) { + const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h new file mode 100644 index 0000000000..b1a3bb4664 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void mult_add_store(CONV_BUF_TYPE *const dst, + const __m128i *const res, + const __m128i *const wt0, + const __m128i *const wt1, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1)); + d = _mm_srai_epi32(d, DIST_PRECISION_BITS); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt0, + const __m128i *const wt1, + const int use_dist_wtd_avg) { + __m128i res; + if (use_dist_wtd_avg) { + const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); + const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); + + const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res); + res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned); + res = _mm_srai_epi32(wt_res, 1); + } + return res; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_ssse3.h b/third_party/aom/aom_dsp/x86/convolve_ssse3.h new file mode 100644 index 0000000000..b1abead146 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_ssse3.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_ + +#include // SSSE3 + +static INLINE void shuffle_filter_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +} + +static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c new file mode 100644 index 0000000000..3f5a9bbeff --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fft_avx2.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+ int n);
+
+// Generate the 1d forward transforms for float using _mm256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+ aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+ aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+ aom_transpose_float_sse2, 8);
+}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 0000000000..bdd235bcd3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +static INLINE void transpose4x4(const float *A, float *B, const int lda, + const int ldb) { + __m128 row1 = _mm_load_ps(&A[0 * lda]); + __m128 row2 = _mm_load_ps(&A[1 * lda]); + __m128 row3 = _mm_load_ps(&A[2 * lda]); + __m128 row4 = _mm_load_ps(&A[3 * lda]); + _MM_TRANSPOSE4_PS(row1, row2, row3, row4); + _mm_store_ps(&B[0 * ldb], row1); + _mm_store_ps(&B[1 * ldb], row2); + _mm_store_ps(&B[2 * ldb], row3); + _mm_store_ps(&B[3 * ldb], row4); +} + +// Referenced by fft_avx2.c. +void aom_transpose_float_sse2(const float *A, float *B, int n); + +void aom_transpose_float_sse2(const float *A, float *B, int n) { + for (int y = 0; y < n; y += 4) { + for (int x = 0; x < n; x += 4) { + transpose4x4(A + y * n + x, B + x * n + y, n, n); + } + } +} + +// Referenced by fft_avx2.c. +void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n); + +void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { + const int n2 = n / 2; + output[0] = packed[0]; + output[1] = 0; + output[2 * (n2 * n)] = packed[n2 * n]; + output[2 * (n2 * n) + 1] = 0; + + output[2 * n2] = packed[n2]; + output[2 * n2 + 1] = 0; + output[2 * (n2 * n + n2)] = packed[n2 * n + n2]; + output[2 * (n2 * n + n2) + 1] = 0; + + for (int c = 1; c < n2; ++c) { + output[2 * (0 * n + c)] = packed[c]; + output[2 * (0 * n + c) + 1] = packed[c + n2]; + output[2 * (n2 * n + c) + 0] = packed[n2 * n + c]; + output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2]; + } + for (int r = 1; r < n2; ++r) { + output[2 * (r * n + 0)] = packed[r * n]; + output[2 * (r * n + 0) + 1] = packed[(r + n2) * n]; + output[2 * (r * n + n2) + 0] = packed[r * n + n2]; + output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2]; + + for (int c = 1; c < AOMMIN(n2, 4); ++c) { + output[2 * (r * n + c)] = + packed[r * n + c] - packed[(r + n2) * n + c + n2]; + output[2 * (r * n + c) + 1] = + packed[(r + n2) * n + c] + packed[r * n + c + n2]; + } + + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r * n + c); + __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r * n + c + n2); + real1 = _mm_sub_ps(real1, real2); + imag1 = _mm_add_ps(imag1, imag2); + _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1)); + } + + int r2 = r + n2; + int r3 = n - r2; + output[2 * (r2 * n + 0)] = packed[r3 * n]; + output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n]; + output[2 * (r2 * n + n2)] = packed[r3 * n + n2]; + output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2]; + for (int c = 1; c < AOMMIN(4, n2); ++c) { + output[2 * (r2 * n + c)] = + packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2]; + output[2 * (r2 * n + c) + 1] = + -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2]; + } + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r3 * n + c); + __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2); + real1 = _mm_add_ps(real1, real2); + imag1 = _mm_sub_ps(imag2, imag1); + _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r2 * n + c + 2), + _mm_unpackhi_ps(real1, imag1)); + } + } +} + +// Generate definitions for 1d 
transforms using float and __mm128 +GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps) +GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) +GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) +GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) + +void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +// Generate definitions for 1d inverse transforms using float and mm128 +GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps) +GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) +GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) +GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps) + +void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2, + aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2, + aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_sse2, aom_ifft1d_16_sse2, + aom_transpose_float_sse2, 4); +} + +void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_sse2, aom_ifft1d_32_sse2, + aom_transpose_float_sse2, 4); +} diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h new file mode 100644 index 0000000000..7ee8ba330e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -0,0 +1,529 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" + +// TODO(jingning) The high bit-depth functions need rework for performance. +// After we properly fix the high bit-depth function implementations, this +// file's dependency should be substantially simplified. +#if DCT_HIGH_BIT_DEPTH +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 + +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif + +static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, + __m128i *in1) { + // Constants + // These are the coefficients used for the multiplies. + // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), + // where cospi_N_64 = cos(N pi /64) + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); + + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // This second rounding constant saves doing some extra adds at the end + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + + // Load inputs. 
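+ // Note the packing used below: rows 0 and 3 of the 4x4 block end up in in0
+ // and rows 1 and 2 in in1, so each 128-bit register carries two rows (see
+ // the i0..iF layout in the comments that follow).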
+ *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + *in1 = _mm_unpacklo_epi64( + *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + *in0 = _mm_unpacklo_epi64( + *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + // multiply by 16 to give some extra precision + *in0 = _mm_slli_epi16(*in0, 4); + *in1 = _mm_slli_epi16(*in1, 4); + // if (i == 0 && input[0]) input[0] += 1; + // add 1 to the upper left pixel if it is non-zero, which helps reduce + // the round-trip error + { + // The mask will only contain whether the first value is zero, all + // other comparison will fail as something shifted by 4 (above << 4) + // can never be equal to one. To increment in the non-zero case, we + // add the mask and one for the first element: + // - if zero, mask = -1, v = v - 1 + 1 = v + // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 + __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a); + *in0 = _mm_add_epi16(*in0, mask); + *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b); + } + // There are 4 total stages, alternating between an add/subtract stage + // followed by an multiply-and-add stage. + { + // Stage 1: Add/subtract + + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1); + // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] + // r1 = [iC i8 iD i9 iE iA iF iB] + const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); + const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); + // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] + // r3 = [iC i8 iD i9 iF iB iE iA] + + const __m128i t0 = _mm_add_epi16(r2, r3); + const __m128i t1 = _mm_sub_epi16(r2, r3); + // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] + // t1 = [aC a8 aD a9 aF aB aE aA] + + // Stage 2: multiply by constants (which gets us into 32 bits). + // The constants needed here are: + // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] + // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] + // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] + // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); + // Then add and right-shift to get back to 16-bit range + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // w0 = [b0 b1 b7 b6] + // w1 = [b8 b9 bF bE] + // w2 = [b4 b5 b3 b2] + // w3 = [bC bD bB bA] + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); + + // x0 = [b0 b1 b7 b6 b8 b9 bF bE] + // x1 = [b4 b5 b3 b2 bC bD bB bA] + *in0 = _mm_shuffle_epi32(x0, 0xD8); + *in1 = _mm_shuffle_epi32(x1, 0x8D); + // in0 = [b0 b1 b8 b9 b7 b6 bF bE] + // in1 = [b3 b2 bB bA b4 b5 bC bD] + } + { + // vertical DCTs finished. Now we do the horizontal DCTs. 
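+ // The rounding below folds the C code's final (v + 1) >> 2 post-scaling
+ // into the dct_const_round_shift: with R = 1 << (DCT_CONST_BITS - 1),
+ // (((x + R) >> DCT_CONST_BITS) + 1) >> 2 == (x + 3 * R) >> (DCT_CONST_BITS + 2),
+ // which is exactly k__DCT_CONST_ROUNDING2 and DCT_CONST_BITS2 defined above.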
+ // Stage 3: Add/subtract + + const __m128i t0 = ADD_EPI16(*in0, *in1); + const __m128i t1 = SUB_EPI16(*in0, *in1); + + // Stage 4: multiply by constants (which gets us into 32 bits). + { + // The constants needed here are: + // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] + // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] + // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] + // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); + const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); + const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); + // Then add and right-shift to get back to 16-bit range + // but this combines the final right-shift as well to save operations + // This unusual rounding operations is to maintain bit-accurate + // compatibility with the c version of this function which has two + // rounding steps in a row. + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); + *in0 = _mm_packs_epi32(w0, w2); + *in1 = _mm_packs_epi32(w1, w3); + } + } +} + +void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { + // This 2D transform implements 4 vertical 1D transforms followed + // by 4 horizontal 1D transforms. The multiplies and adds are as given + // by Chen, Smith and Fralick ('77). The commands for moving the data + // around have been minimized by hand. + // For the purposes of the comments, the 16 inputs are referred to at i0 + // through iF (in raster order), intermediate variables are a0, b0, c0 + // through f, and correspond to the in-place computations mapped to input + // locations. The outputs, o0 through oF are labeled according to the + // output locations. + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + + // Post-condition (v + 1) >> 2 is now incorporated into previous + // add and right-shift commands. Only 2 store instructions needed + // because we are using the fact that 1/3 are stored just after 0/2. + storeu_output(&in0, output + 0 * 4); + storeu_output(&in1, output + 2 * 4); +} + +void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); + _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); +} + +#if CONFIG_INTERNAL_STATS +void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. 
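+ // Each pair constant, e.g. pair_set_epi16(a, b), repeats the 16-bit pair
+ // (a, b) in every 32-bit lane, so _mm_madd_epi16 against an input
+ // interleaved as [x0 y0 x1 y1 ...] yields a * xN + b * yN per 32-bit lane,
+ // i.e. one butterfly rotation output per lane.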
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + // Load input + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. 
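+ // The even-indexed outputs (res0/res4/res2/res6) are produced by the first
+ // butterfly group below and the odd-indexed ones (res1/res7/res5/res3) by
+ // the second, so the assignments appear out of numerical order.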
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = ADD_EPI16(in0, in7); + const __m128i q1 = ADD_EPI16(in1, in6); + const __m128i q2 = ADD_EPI16(in2, in5); + const __m128i q3 = ADD_EPI16(in3, in4); + const __m128i q4 = SUB_EPI16(in3, in4); + const __m128i q5 = SUB_EPI16(in2, in5); + const __m128i q6 = SUB_EPI16(in1, in6); + const __m128i q7 = SUB_EPI16(in0, in7); +#if DCT_HIGH_BIT_DEPTH + if (pass == 1) { + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = 
_mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 = _mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + // Transpose the 8x8. 
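+ // The transpose below is the standard three-step SSE2 approach: interleave
+ // 16-bit lanes, then 32-bit lanes, then 64-bit lanes; the digit grids in
+ // the comments track where each element sits after each step.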
+ { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6, sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); + in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + // store results + store_output(&in0, (output + 0 * 8)); + store_output(&in1, (output + 1 * 8)); + store_output(&in2, (output + 2 * 8)); + store_output(&in3, (output + 3 * 8)); + store_output(&in4, (output + 4 * 
8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 0000000000..0e4fb80468
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+
+#define DCT_HIGH_BIT_DEPTH 0
+#define FDCT4x4_2D_HELPER fdct4x4_helper
+#define FDCT4x4_2D aom_fdct4x4_sse2
+#define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2
+#define FDCT8x8_2D aom_fdct8x8_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef FDCT4x4_2D_HELPER
+#undef FDCT4x4_2D
+#undef FDCT4x4_2D_LP
+#undef FDCT8x8_2D
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+#undef DCT_HIGH_BIT_DEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
+#undef FDCT8x8_2D
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 0000000000..78ea98522e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { + __m128i buf0, buf1; + buf0 = _mm_mul_epu32(a, b); + a = _mm_srli_epi64(a, 32); + b = _mm_srli_epi64(b, 32); + buf1 = _mm_mul_epu32(a, b); + return _mm_add_epi64(buf0, buf1); +} + +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { + __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm_unpacklo_epi64(buf0, buf1); +} + +static INLINE int check_epi16_overflow_x2(const __m128i *preg0, + const __m128i *preg1) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16((short)0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + cmp0 = _mm_or_si128(cmp0, cmp1); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16((short)0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), + _mm_cmpeq_epi16(*preg2, min_overflow)); + __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), + _mm_cmpeq_epi16(*preg3, min_overflow)); + cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x12( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) res1 = 
check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + } + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) { + res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + if (!res0) { + res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); + if (!res1) { + res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); + if (!res0) { + res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); + if (!res1) + res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); + } + } + } + } + } + return res0 + res1; +} + +static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); +} + +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm new file mode 100644 index 0000000000..06879040b0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -0,0 +1,379 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 +%endmacro + +TRANSFORM_COEFFS 11585, 11585 +TRANSFORM_COEFFS 15137, 6270 +TRANSFORM_COEFFS 16069, 3196 +TRANSFORM_COEFFS 9102, 13623 + +%macro STORE_OUTPUT 2 ; index, result + ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + ; _mm_store_si128((__m128i *)(dst_ptr), out0); + ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + pxor m11, m11 + pcmpgtw m11, m%2 + movdqa m12, m%2 + punpcklwd m%2, m11 + punpckhwd m12, m11 + mova [outputq + 4*%1 + 0], m%2 + mova [outputq + 4*%1 + 16], m12 +%endmacro + +SECTION .text + +%if AOM_ARCH_X86_64 +INIT_XMM ssse3 +cglobal fdct8x8, 3, 5, 13, input, output, stride + + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] + + lea r3, [2 * strideq] + lea r4, [4 * strideq] + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + ; left shift by 2 to increase forward transformation precision + psllw m0, 2 + psllw m1, 2 + psllw m2, 2 + psllw m3, 2 + psllw m4, 2 + psllw m5, 2 + psllw m6, 2 + psllw m7, 2 + + ; column transform + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 + + paddw m9, m1, m6 + psubw m1, m6 + + paddw m7, m2, m5 + psubw m2, m5 + + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + 
punpckhdq m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 + psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + STORE_OUTPUT 0, 4 + STORE_OUTPUT 8, 5 + STORE_OUTPUT 16, 3 + STORE_OUTPUT 24, 0 + STORE_OUTPUT 32, 10 + STORE_OUTPUT 40, 9 + STORE_OUTPUT 48, 7 + STORE_OUTPUT 56, 2 + + RET +%endif diff --git a/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c new file mode 100644 index 0000000000..05c87bcff9 --- /dev/null +++ 
b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/quantize.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void highbd_load_b_values_avx2( + const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, + __m256i *round, const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1)); + *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + *dequant = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr)); +} + +static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i temp_mask = _mm256_setzero_si256(); + if (_mm256_movemask_epi8(*cmp_mask)) { + __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); + temp_mask = _mm256_and_si256(*cmp_mask, iscan); + *is_found = 1; + } + *mask = _mm256_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, + __m256i *threshold, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i coeff[2], cmp_mask0, cmp_mask1; + coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); + cmp_mask0 = + _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, + __m256i *p, const int shift) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, shift); + prod_hi = _mm256_srli_epi64(prod_hi, shift); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa); +} + +static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff, + const __m256i *round, + const __m256i *quant, + const __m256i *shift, + const int *log_scale) { + __m256i tmp, qcoeff; + qcoeff = _mm256_add_epi32(*coeff, *round); + highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm256_add_epi32(tmp, qcoeff); + highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff, + __m256i dequant) { + return _mm256_mullo_epi32(qcoeff, dequant); +} + +static INLINE __m256i 
highbd_calculate_dqcoeff_log_scale_avx2( + __m256i qcoeff, __m256i dequant, const int log_scale) { + __m256i abs_coeff = _mm256_abs_epi32(qcoeff); + highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return _mm256_sign_epi32(abs_coeff, qcoeff); +} + +static INLINE void highbd_store_coefficients_avx2(__m256i coeff0, + __m256i coeff1, + tran_low_t *coeff_ptr) { + _mm256_store_si256((__m256i *)(coeff_ptr), coeff0); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1); +} + +void aom_highbd_quantize_b_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const int log_scale = 0; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, + &quant, dequant_ptr, &dequant, quant_shift_ptr, + &shift); + + // Do DC and first 15 AC. 
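+  // Scalar sketch (illustrative only, names are not from this file) of what
+  // the vector code below computes per coefficient, with log_scale = 0 here
+  // and rc != 0 selecting the AC entry of each table:
+  //   tmp     = abs(coeff[rc]) + round[rc != 0];
+  //   tmp    += (tmp * quant[rc != 0]) >> 16;
+  //   abs_q   = (tmp * quant_shift[rc != 0]) >> (16 - log_scale);
+  //   qcoeff  = sign(coeff[rc]) * abs_q;   // zeroed below the zbin threshold
+  //   dqcoeff = qcoeff * dequant[rc != 0];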
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. 
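+  // Note: mask1 records (via iscan) the coefficients that clear the plain
+  // zbin, while mask0 records those that also clear the larger prescan
+  // threshold (zbin * wt + prescan_add).  After this loop, trailing
+  // coefficients that only cleared the zbin (scan positions between
+  // non_zero_count and non_zero_count_prescan_add_zero) are zeroed, which is
+  // the "adaptive" trimming of the end-of-block position.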
+ while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; 
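+  // Note: this 32x32 variant follows the same flow as the function above but
+  // with log_scale = 1: zbin and round are pre-shifted with rounding, and
+  // dequantization goes through highbd_calculate_dqcoeff_log_scale_avx2,
+  // i.e. (abs(qcoeff) * dequant) >> log_scale with the sign reapplied.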
+ int is_found0 = 0, is_found1 = 0; + int eob = -1; + const int log_scale = 1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + const __m256i one = _mm256_set1_epi32(1); + const __m256i log_scale_vec = _mm256_set1_epi32(log_scale); + int prescan_add[2]; + int thresh[2]; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + shift = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr)); + + // Shift with rounding. + zbin = _mm256_add_epi32(zbin, log_scale_vec); + round = _mm256_add_epi32(round, log_scale_vec); + zbin = _mm256_srli_epi32(zbin, log_scale); + round = _mm256_srli_epi32(round, log_scale); + zbin = _mm256_sub_epi32(zbin, one); + + // Do DC and first 15 AC. + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, 
log_scale); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c new file mode 100644 
index 0000000000..ae31116e9d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/quantize.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi64(a, sign); +} + +static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, + __m128i *p, const int shift) { + __m128i sign = _mm_srai_epi32(*y, 31); + __m128i sign_lo = _mm_unpacklo_epi32(sign, sign); + __m128i sign_hi = _mm_unpackhi_epi32(sign, sign); + __m128i abs_y = invert_sign_32_sse2(*y, sign); + __m128i prod_lo = _mm_mul_epu32(*x, abs_y); + __m128i prod_hi = _mm_srli_epi64(*x, 32); + const __m128i mult_hi = _mm_srli_epi64(abs_y, 32); + prod_hi = _mm_mul_epu32(prod_hi, mult_hi); + prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo); + prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi); + + prod_lo = _mm_srli_epi64(prod_lo, shift); + const __m128i mask = _mm_set_epi32(0, -1, 0, -1); + prod_lo = _mm_and_si128(prod_lo, mask); + prod_hi = _mm_srli_epi64(prod_hi, shift); + + prod_hi = _mm_slli_epi64(prod_hi, 32); + *p = _mm_or_si128(prod_lo, prod_hi); +} + +static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, + const __m128i *quant, + const __m128i *shift, + const int *log_scale) { + __m128i tmp, qcoeff; + qcoeff = _mm_add_epi32(*coeff, *round); + highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm_add_epi32(tmp, qcoeff); + highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE void highbd_update_mask1(__m128i *cmp_mask0, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i temp_mask = _mm_setzero_si128(); + if (_mm_movemask_epi8(*cmp_mask0)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + temp_mask = mask0; + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i coeff[2], cmp_mask0, cmp_mask1; + + coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + + cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); + + highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant, + const int log_scale) { + __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31); + __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign); + highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return invert_sign_32_sse2(abs_coeff, coeff_sign); +} + +void 
aom_highbd_quantize_b_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 0; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 15 AC. 
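+  // Note: this 128-bit path quantizes 8 coefficients per pass, i.e. DC plus
+  // the first 7 AC.  SSE2 lacks pabsd (SSSE3) and pmuldq (SSE4.1), so
+  // magnitudes come from the xor/subtract trick in invert_sign_32_sse2 and
+  // highbd_mul_shift_sse2 performs the 32x32 -> 64-bit multiply with
+  // _mm_mul_epu32 on the even and odd lanes before restoring the sign.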
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 1; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 15 AC. 
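+  // Note: with log_scale = 1 the zbin/round values were halved with rounding
+  // above ((x + 1) >> 1), the quantizer's final shift becomes >> (16 - 1),
+  // and dequantization is (abs(qcoeff) * dequant) >> 1 with the sign
+  // reapplied.  As in the path above, each pass covers 8 coefficients.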
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 2; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 15 AC. 
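+  // Note: identical to the 32x32 variant above except log_scale = 2, so
+  // zbin/round are rounded and shifted right by 2 and the dequantized
+  // magnitude is also shifted right by 2.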
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 0000000000..11e45778c0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,1248 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" + +// ----------------------------------------------------------------------------- +// Copy and average + +static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; +static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params, int bd); +void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, int bd); + +void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, int bd) { + if (filter_params_y->taps == 12) { + av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + return; + } + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + + __m256i s[8], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + 
res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32(&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + if (filter_params_x->taps == 12) { + av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, + bd); + return; + } + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[4], coeffs_x[4]; + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } 
else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else { + xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res)); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } + } + } +} + +#define CONV8_ROUNDING_BITS (7) + +// ----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = 
_mm256_set1_epi32(0x03020100); + const __m256i p1 = _mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void pack_filters_4tap(const int16_t *filter, + __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(h); + + // coeffs 2 3 2 3 2 3 2 3 + f[0] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + f[1] = _mm256_shuffle_epi32(coeff, 0xaa); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } +} + +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void aom_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void aom_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, 
&res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void aom_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + for (i = 0; i <= (height - 2); i += 2) { + __m256i row0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + __m256i row1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2])); + + s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + s[1] = _mm256_alignr_epi8(s[0], s[0], 4); + + s[0] = _mm256_shuffle_epi8(s[0], mask); + s[1] = _mm256_shuffle_epi8(s[1], mask); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + const __m256i row0_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6])); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + } +} + +static void aom_highbd_filter_block1d8_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i = 0; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + + /* Horizontal filter */ + + for (i = 0; i <= (height - 2); i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]); + + const __m256i r0 = + _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = r0; + s[1] = _mm256_alignr_epi8(r1, r0, 4); + + __m256i res_even = convolve_4tap(s, ff); + res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding), + CONV8_ROUNDING_BITS); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + + __m256i res_odd = convolve_4tap(s, ff); + res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding), + CONV8_ROUNDING_BITS); + + __m256i res = _mm256_packs_epi32(res_even, res_odd); + res = _mm256_shuffle_epi8(res, mask); + + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + const __m256i row0_1 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4], + _mm256_extracti128_si256(res, 1)); + } +} + +static void aom_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + +// ----------------------------------------------------------------------------- +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples 
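+// (pack_8x2_2t_pixels() passes two consecutive rows as s0/s1, while
+// pack_16x1_2t_pixels() passes the same row at src and src + 8)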
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void aom_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void aom_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i 
ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static void aom_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const 
__m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } +} + +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static void update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void aom_highbd_filter_block1d16_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + 
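+  // pack_filters() broadcasts the eight 16-bit taps as duplicated pairs:
+  // ff[0] = taps 0/1, ff[1] = taps 2/3, ff[2] = taps 4/5, ff[3] = taps 6/7,
+  // repeated in every 32-bit lane so they can feed _mm256_madd_epi16().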
pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void aom_highbd_filter_block1d4_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + uint32_t i; + __m256i s[2], ff[2]; + + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel); + res_16bit = _mm256_max_epi32(res_16bit, zero); + res_16bit = _mm256_packs_epi32(res_16bit, res_16bit); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d8_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + __m256i s[4], ff[2]; + uint32_t i; + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpackhi_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + s[3] = _mm256_unpackhi_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + const __m256i res_b = convolve_4tap(s + 2, ff); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s[2] = s[3]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d16_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + + aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + +// ----------------------------------------------------------------------------- +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void aom_highbd_filter_block1d16_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + 
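+// The 8-wide 2-tap vertical path below stays in 128-bit registers: each
+// _mm_madd_epi16() pairs a pixel with the one from the next row, using only
+// coefficients 3 and 4 of the 8-tap array (see pack_8x1_2t_filter()), then
+// applies the usual +64 / >> CONV8_ROUNDING_BITS rounding.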
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void aom_highbd_filter_block1d8_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2 +#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2 +#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2 +#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2 + +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2) +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2) + +#undef HIGHBD_FUNC diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c new file mode 100644 index 0000000000..a2bb283222 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+
+// -----------------------------------------------------------------------------
+
+void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_pitch, uint32_t height,
+                                        const int16_t *filter, int bd) {
+  __m128i filtersReg;
+  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+  __m128i srcReg23_lo, srcReg34_lo;
+  __m128i srcReg45_lo, srcReg56_lo;
+  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+  __m128i resReg23_45_lo, resReg34_56_lo;
+  __m128i resReg23_45, resReg34_56;
+  __m128i addFilterReg64, secondFilters, thirdFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+  addFilterReg64 = _mm_set1_epi32(64);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+  // coeffs 0 1 0 1 2 3 2 3
+  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+  // coeffs 4 5 4 5 6 7 6 7
+  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
+  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
+
+  // multiply the size of the source and destination stride by two
+  src_stride = src_pitch << 1;
+  dst_stride = dst_pitch << 1;
+
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
+
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
+
+  for (i = height; i > 1; i -= 2) {
+    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+    srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
+
+    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+    srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+
+    resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
+    resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
+    resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
+    resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
+
+    resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
+    resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
+
+    // add the rounding offset and shift each 32-bit result right by 7 bits
+    resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
+    resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
+    resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
+    resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
+
+    // shrink each 32-bit value to 16 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+    resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128());
+    resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128());
+
+    resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());
+    resReg23_45 = _mm_min_epi16(resReg23_45, max);
+    resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
+    resReg34_56 =
_mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg34_lo = srcReg56_lo; + srcReg4 = srcReg6; + } +} + +void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1); + __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1); + + ss_23 = _mm_madd_epi16(ss_23, secondFilters); + ss_45 = _mm_madd_epi16(ss_45, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45); + + // shift by 7 bit each 32 bit + srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64); + srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg64, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = 
dst_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4); + + for (i = height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); + + resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters); + resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters); + resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters); + resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters); + + resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi); + + // shift by 7 bit each 32 bit + resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); + resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); + resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64); + resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64); + resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); + resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); + resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7); + resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7); + + // shrink to 16 bit each 32 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi); + + resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); + resReg23_45 = _mm_min_epi16(resReg23_45, max); + resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); + resReg34_56 = _mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)dst_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg23_hi = srcReg45_hi; + srcReg34_lo = srcReg56_lo; + srcReg34_hi = srcReg56_hi; + srcReg4 = srcReg6; + } +} + +void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + 
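+  // 64 is the rounding offset (1 << 6) added before the arithmetic shift
+  // right by 7 below.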
filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6)); + + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4); + __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2); + + __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2); + __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6); + __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2); + __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2); + + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + + // shift by 7 bit each 32 bit + res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64); + res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64); + res_lo_1 = _mm_srai_epi32(res_lo_1, 7); + res_hi_1 = _mm_srai_epi32(res_hi_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1); + + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} + +void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c new file mode 100644 index 0000000000..31c3c31b3c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    const InterpFilterParams *filter_params_y,
+                                    const int subpel_y_qn, int bd) {
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  const int bits = FILTER_BITS;
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+  if (filter_params_y->taps == 12) {
+    __m128i s[24], coeffs_y[6];
+
+    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);
+
+    for (j = 0; j < w; j += 8) {
+      const uint16_t *data = &src_ptr[j];
+      /* Vertical filter */
+      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+      __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+      __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+      __m128i s9 = _mm_loadu_si128((__m128i *)(data + 9 * src_stride));
+      __m128i s10 = _mm_loadu_si128((__m128i *)(data + 10 * src_stride));
+
+      s[0] = _mm_unpacklo_epi16(s0, s1);
+      s[1] = _mm_unpacklo_epi16(s2, s3);
+      s[2] = _mm_unpacklo_epi16(s4, s5);
+      s[3] = _mm_unpacklo_epi16(s6, s7);
+      s[4] = _mm_unpacklo_epi16(s8, s9);
+
+      s[6] = _mm_unpackhi_epi16(s0, s1);
+      s[7] = _mm_unpackhi_epi16(s2, s3);
+      s[8] = _mm_unpackhi_epi16(s4, s5);
+      s[9] = _mm_unpackhi_epi16(s6, s7);
+      s[10] = _mm_unpackhi_epi16(s8, s9);
+
+      s[12] = _mm_unpacklo_epi16(s1, s2);
+      s[13] = _mm_unpacklo_epi16(s3, s4);
+      s[14] = _mm_unpacklo_epi16(s5, s6);
+      s[15] = _mm_unpacklo_epi16(s7, s8);
+      s[16] = _mm_unpacklo_epi16(s9, s10);
+
+      s[18] = _mm_unpackhi_epi16(s1, s2);
+      s[19] = _mm_unpackhi_epi16(s3, s4);
+      s[20] = _mm_unpackhi_epi16(s5, s6);
+      s[21] = _mm_unpackhi_epi16(s7, s8);
+      s[22] = _mm_unpackhi_epi16(s9, s10);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+
+        __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * src_stride));
+        __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * src_stride));
+
+        s[5] = _mm_unpacklo_epi16(s10, s11);
+        s[11] = _mm_unpackhi_epi16(s10, s11);
+
+        s[17] = _mm_unpacklo_epi16(s11, s12);
+        s[23] = _mm_unpackhi_epi16(s11, s12);
+
+        const __m128i res_a0 = convolve_12tap(s, coeffs_y);
+        __m128i res_a_round0 = _mm_sra_epi32(
+            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+        const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
+        __m128i res_a_round1 = _mm_sra_epi32(
+            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {
+          const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
+          __m128i res_b_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_b0, round_const_bits),
round_shift_bits); + + const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((int *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((int *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + + s[6] = s[7]; + s[7] = s[8]; + s[8] = s[9]; + s[9] = s[10]; + s[10] = s[11]; + + s[12] = s[13]; + s[13] = s[14]; + s[14] = s[15]; + s[15] = s[16]; + s[16] = s[17]; + + s[18] = s[19]; + s[19] = s[20]; + s[20] = s[21]; + s[21] = s[22]; + s[22] = s[23]; + + s10 = s12; + } + } + } else { + __m128i s[16], coeffs_y[4]; + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = 
_mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_bits), round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((int *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((int *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } + } +} + +void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + if (filter_params_x->taps == 12) { + __m128i s[6], coeffs_x[6]; + + prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + const __m128i row02 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + s[4] = _mm_alignr_epi8(row02, row01, 0); + s[5] = _mm_alignr_epi8(row02, row01, 4); + + __m128i res_even = convolve_12tap(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), + round_shift_bits); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + s[4] = _mm_alignr_epi8(row02, row01, 2); + s[5] = _mm_alignr_epi8(row02, row01, 6); + + __m128i res_odd = convolve_12tap(s, coeffs_x); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), + round_shift_x); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + res = _mm_min_epi16(res, clip_pixel); + res = _mm_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); + } else { + *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); + } + } + } + } + } else { + __m128i s[4], coeffs_x[4]; + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + res = _mm_min_epi16(res, clip_pixel); + res = _mm_max_epi16(res, zero); + + if (w - j > 4) { + 
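+            // more than 4 output pixels remain in this block of 8: store all
+            // eight filtered pixels for this row.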
_mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); + } else { + *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); + } + } + } + } + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm new file mode 100644 index 0000000000..91b3d126ca --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm @@ -0,0 +1,259 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_XMM sse2 +cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + paddw m0, m2 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova 
m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c new file mode 100644 index 0000000000..6a2e915ed7 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c @@ -0,0 +1,984 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); + dst += stride << 2; + left += 4; + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); +} + +void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); +} + +void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + 
_mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); + dst += stride << 3; + left += 8; + aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + h_predictor_16x8(dst, stride, left); +} + +void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i 
*)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP, DC_LEFT, DC_128 + +// 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 4x8 + +static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +// Shared with DC 8xh +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i 
*)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sum = dc_sum_8(left); + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x8(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 8xh + +static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, const uint16_t *above) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + dc_store_8xh(dst, stride, height, &dc); +} + +void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 4, above); +} + +void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 8, above); +} + +void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 16, above); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 4, &dc); +} + +void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 8, &dc); +} + +// Shared 
with DC 16xh +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 16, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + dc_store_8xh(dst, stride, height, &dc_dup); +} + +void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 4, bd); +} + +void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 8, bd); +} + +void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 16, bd); +} + +// ----------------------------------------------------------------------------- +// 16xh + +static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +// Shared with 32xh +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + 
dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 8, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 32xh + +static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + 
dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); + int i; + for (i = 0; i < 2; ++i) { + _mm_storel_epi64((__m128i *)dst, above_u16); + _mm_storel_epi64((__m128i *)(dst + stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); +} + +void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), 
above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_above = dc_sum_4(above); + const __m128i sum_left = dc_sum_8(left); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((int16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_left = dc_sum_4(left); + const __m128i sum_above = dc_sum_8(above); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 
+= 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((int16_t)sum32); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); +} + +void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((int16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((int16_t)sum32); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((int16_t)sum32); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_32(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum); + sum32 += 
24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((int16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c new file mode 100644 index 0000000000..c954da94e5 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/common_avx2.h" +#include "aom_dsp/x86/lpf_common_sse2.h" +#include "aom/aom_integer.h" + +void aom_highbd_lpf_horizontal_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0, + blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void 
aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c new file mode 100644 index 0000000000..ea7dc6a9e5 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -0,0 +1,1698 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/lpf_common_sse2.h" + +static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max, + __m128i *pixel) { + *pixel = _mm_min_epi16(*pixel, *max); + *pixel = _mm_max_epi16(*pixel, *min); +} + +static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); +} + +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m128i *blt, + __m128i *lt, __m128i *thr, __m128i *t80_out) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + *blt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + *lt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + *thr = _mm_slli_epi16(x, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); +} + +static INLINE void get_limit_dual( + const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, + const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, + int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out, + __m128i *t80_out) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero); + __m128i x1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *blt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *lt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *thr_out = _mm_slli_epi16(x0, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m128i *p, __m128i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); + } +} + +static INLINE void 
highbd_filter_mask_dual(const __m128i *p, const __m128i *q, + const __m128i *l, const __m128i *bl, + __m128i *mask) { + __m128i abs_p0q0 = abs_diff16(p[0], q[0]); + __m128i abs_p1q1 = abs_diff16(p[1], q[1]); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16((short)0xFFFF); + + __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1])); + } + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // return ~mask +} + +static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, + __m128i *p1p0, __m128i *q1q0, + __m128i *abs_p1p0, __m128i *l, + __m128i *bl, __m128i *t, + __m128i *hev, __m128i *mask) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16((short)0xFFFF); + __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0; + __m128i max, max01, h; + + *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]); + *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]); + + abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0); + abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2 + + max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + *abs_p1p0 = abs_diff16(pq[0], pq[1]); + abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8); + max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0); + // mask |= (abs(*p1 - *p0) > limit) * -1; + // mask |= (abs(*q1 - *q0) > limit) * -1; + h = _mm_subs_epu16(max01, *t); + + *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + // replicate for the further "merged variables" usage + *hev = _mm_unpacklo_epi64(*hev, *hev); + + max = _mm_max_epi16(max, max01); + int i; + for (i = 2; i < x; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // ~mask +} + +static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq, + int start, int end, __m128i *flat) { + int i; + __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]), + abs_diff16(pq[start + 1], pq[0])); + + for (i = start + 2; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + __m128i ft; + ft = _mm_subs_epu16(max, *th); + + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} + +static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p, + const __m128i *q, int start, int end, + __m128i *flat) { + int i; + __m128i max = + _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0])); + + for (i = start + 1; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(p[i], p[0])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[0])); + } + + __m128i ft; + ft = _mm_subs_epu16(max, *th); + 
+ const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} + +static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, + __m128i *flat2, int bd) { + // check the distance 1,2,3 against 0 + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal(&th, pq, 1, 4, flat); + flat_mask_internal(&th, pq, 4, 7, flat2); +} + +static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p, + const __m128i *q, __m128i *flat, + __m128i *flat2, int bd) { + // check the distance 1,2,3 against 0 + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal_dual(&th, p, q, 1, 4, flat); + flat_mask_internal_dual(&th, p, q, 4, 7, flat2); +} + +static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0, __m128i *t80, + int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + const __m128i pmin = _mm_subs_epi16(zero, *t80); + + const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4); + __m128i ps1ps0_work, qs1qs0_work, work; + __m128i filt, filter2filter1, filter2filt, filter1filt; + + ps1ps0_work = _mm_subs_epi16(*p1p0, *t80); + qs1qs0_work = _mm_subs_epi16(*q1q0, *t80); + + work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work); + pixel_clamp(&pmin, &pmax, &work); + filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + // (aom_filter + 3 * (qs0 - ps0)) & mask + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm_and_si128(filt, *mask); + filt = _mm_unpacklo_epi64(filt, filt); + + filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */ + pixel_clamp(&pmin, &pmax, &filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */ + + filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1); + + // filt >> 1 + filt = _mm_adds_epi16(filt, one); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(*hev, filt); + + filter2filt = _mm_unpackhi_epi64(filter2filter1, filt); + filter1filt = _mm_unpacklo_epi64(filter2filter1, filt); + + qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt); + ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt); + + pixel_clamp(&pmin, &pmax, &qs1qs0_work); + pixel_clamp(&pmin, &pmax, &ps1ps0_work); + + *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80); + *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80); +} + +static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps, + __m128i *qs, const __m128i *mask, + const __m128i *th, int bd, + __m128i *t80) { + __m128i ps0 = _mm_subs_epi16(p[0], *t80); + __m128i ps1 = _mm_subs_epi16(p[1], *t80); + __m128i qs0 = _mm_subs_epi16(q[0], *t80); + __m128i qs1 = _mm_subs_epi16(q[1], *t80); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + + const __m128i zero = _mm_setzero_si128(); + const __m128i pmin = _mm_subs_epi16(zero, *t80); + __m128i filter = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); + + // hev_filter + __m128i hev; + const __m128i abs_p1p0 = abs_diff16(p[1], p[0]); + const __m128i abs_q1q0 = abs_diff16(q[1], q[0]); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *th); + const __m128i ffff = _mm_cmpeq_epi16(h, h); + hev = 
_mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + + filter = _mm_and_si128(filter, hev); + + const __m128i x = _mm_subs_epi16(qs0, ps0); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm_and_si128(filter, *mask); + const __m128i t3 = _mm_set1_epi16(3); + const __m128i t4 = _mm_set1_epi16(4); + __m128i filter1 = _mm_adds_epi16(filter, t4); + __m128i filter2 = _mm_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm_srai_epi16(filter1, 3); + filter2 = _mm_srai_epi16(filter2, 3); + qs0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); + qs[0] = _mm_adds_epi16(qs0, *t80); + ps[0] = _mm_adds_epi16(ps0, *t80); + filter = _mm_adds_epi16(filter1, one); + filter = _mm_srai_epi16(filter, 1); + filter = _mm_andnot_si128(hev, filter); + qs1 = _mm_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); + qs[1] = _mm_adds_epi16(qs1, *t80); + ps[1] = _mm_adds_epi16(ps1, *t80); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( + __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, + const unsigned char *lt, const unsigned char *thr, int bd) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i blimit, limit, thresh; + __m128i t80; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); + + for (i = 0; i < 7; i++) { + pq[i] = _mm_unpacklo_epi64(p[i], q[i]); + } + __m128i mask, hevhev; + __m128i p1p0, q1q0, abs_p1p0; + + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hevhev, &mask); + + __m128i ps0ps1, qs0qs1; + // filter4 + highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd); + + __m128i flat, flat2; + highbd_flat_mask4_sse2(pq, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + // flat and wide flat calculations + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; + __m128i sum_p6, sum_p3; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); + __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); + + sum_p6 = _mm_add_epi16(pq[6], pq[6]); + sum_p3 = _mm_add_epi16(pq[3], pq[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq[5]); + sum_p = _mm_sub_epi16(sum_p_0, q[5]); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + work0_1 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], 
_mm_add_epi16(pq[2], pq[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + + work0 = _mm_add_epi16(sum_p3, pq[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, pq[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq[3]); + work0 = _mm_add_epi16(sum_p3, pq[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0])); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, pq[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, pq[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, pq[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } // flat2 + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if (flat2_mask) { + for (i = 0; i < 6; i++) { + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } + } + } else { 
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + } +} + +void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + __m128i p[7], q[7], pq[7]; + int i; + + for (i = 0; i < 7; i++) { + p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); + } + + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); + + for (i = 0; i < 6; i++) { + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8)); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( + __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0, + const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, + const uint8_t *thr1, int bd) { + __m128i blimit, limit, thresh, t80; + const __m128i zero = _mm_setzero_si128(); + + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, + &t80); + __m128i mask; + highbd_filter_mask_dual(p, q, &limit, &blimit, &mask); + __m128i flat, flat2; + highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + __m128i ps[2], qs[2]; + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); + // flat and wide flat calculations + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); + __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); + sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); + __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); + sum_q = _mm_add_epi16(sum_q, sum_lq); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + flat_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); + flat_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); + __m128i sum_p6 = _mm_add_epi16(p[6], p[6]); + __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); + __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); + __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); + + sum_q = _mm_sub_epi16(sum_p_0, p[5]); + __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); + + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + flat_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); + + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); + flat_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = 
_mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); + + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); + } + // highbd_filter8 + int i; + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + p[2] = _mm_andnot_si128(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm_and_si128(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + // highbd_filter16 + if (flat2_mask) { + for (i = 0; i < 6; i++) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = 
_mm_or_si128(q[i], flat2_q[i]); + } + } + } else { + p[0] = ps[0]; + q[0] = qs[0]; + p[1] = ps[1]; + q[1] = qs[1]; + } +} + +void aom_highbd_lpf_horizontal_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p[7], q[7]; + int i; + load_highbd_pixel(s, 7, pitch, p, q); + + highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + for (i = 0; i < 6; i++) { + _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit, + const uint8_t *_limit, const uint8_t *_thresh, int bd) { + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i pq[3]; + __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0; + __m128i flat_p1p0, flat_q0q1; + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); + __m128i t80; + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + + // flat_mask + flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + flat_p1p0 = _mm_srli_epi16(workp_b, 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), + pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), + pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(*q2, *q2); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + flat_q0q1 = _mm_srli_epi16(workp_a, 3); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + 
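+    // Merge: keep the filter4 result where flat is zero and the 5-tap result
+    // where flat is all ones.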
*q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0, + const unsigned char *_thresh0, const unsigned char *_blimit1, + const unsigned char *_limit1, const unsigned char *_thresh1, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat, work; + __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1; + __m128i op1, op0, oq0, oq1; + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p2p1 = abs_diff16(*p2, *p1); + abs_p1p0 = abs_diff16(*p1, *p0); + abs_q1q0 = abs_diff16(*q1, *q0); + abs_q2q1 = abs_diff16(*q2, *q1); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + mask = _mm_max_epi16(abs_q2q1, mask); + work = _mm_max_epi16(abs_p1p0, abs_q1q0); + mask = _mm_max_epi16(work, mask); + mask = _mm_max_epi16(mask, abs_p2p1); + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + + // flat_mask + flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); + flat = _mm_max_epi16(flat, work); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), + _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + op1 = _mm_srli_epi16(workp_shft0, 3); + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 + workp_a = + _mm_add_epi16(workp_a, + workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 + op0 = _mm_srli_epi16(workp_a, 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), + *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 + workp_b = _mm_add_epi16(*q1, *q2); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + oq0 = _mm_srli_epi16(workp_shft0, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), + *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_b = _mm_add_epi16(*q2, 
*q2); + workp_shft1 = _mm_add_epi16( + workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + oq1 = _mm_srli_epi16(workp_shft1, 3); + + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } +} + +void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out, + _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8)); +} + +void aom_highbd_lpf_horizontal_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p2, p1, p0, q0, q1, q2; + + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i pq[4]; + __m128i p1p0, q1q0, ps1ps0, qs1qs0; + __m128i work_a, opq2, flat_p1p0, flat_q0q1; + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + pq[3] = _mm_unpacklo_epi64(*p3, *q3); + + __m128i abs_p1p0; + + const __m128i four = _mm_set1_epi16(4); + __m128i t80; + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + + // flat_mask4 + flat = _mm_max_epi16(abs_diff16(pq[2], 
pq[0]), abs_diff16(pq[3], pq[0])); + flat = _mm_max_epi16(abs_p1p0, flat); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + workp_c = _mm_add_epi16(workp_a, workp_c); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + workp_a = _mm_add_epi16(workp_a, workp_b); + opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + + work_a = _mm_andnot_si128(flat, pq[2]); + *p2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_a, *p2); + *q2 = _mm_srli_si128(*p2, 8); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0, + const unsigned char *_limit0, const unsigned char *_thresh0, + const unsigned char *_blimit1, const unsigned char *_limit1, + const unsigned char *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat; + __m128i work_a, op2, oq2, op1, op0, oq0, oq1; + __m128i abs_p1q1, abs_p0q0, work0, work1, work2; + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1; + + // So taking maximums continues to work: + 
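+  // The scalar logic being vectorized is roughly:
+  //   mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) ? limit + 1 : 0;
+  // Clamping the "fail" value to limit + 1 means the max/subs/cmpeq sequence
+  // against limit below still yields all-zero lanes wherever any edge or
+  // flatness threshold is exceeded.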
mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1)); + work1 = + _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0)); // tbu 4 flat + work0 = _mm_max_epi16(work0, work1); + work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3)); + work2 = _mm_max_epi16(work2, work0); + mask = _mm_max_epi16(work2, mask); + + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); + flat = _mm_max_epi16(work1, flat); + work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); + flat = _mm_max_epi16(work0, flat); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); + + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } +} + +void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2 = 
_mm_loadl_epi64((__m128i *)(s + 2 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, + &p1p0, _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +} + +void aom_highbd_lpf_horizontal_8_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, + _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out, + __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i blimit, limit, thresh; + __m128i mask, hev; + __m128i p1p0, q1q0; + __m128i pq[2]; + + __m128i abs_p1p0; + + __m128i t80; + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + + highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps, + __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i mask, flat; + __m128i p[2], q[2]; + + const __m128i zero = _mm_setzero_si128(); + __m128i abs_p0q0 = abs_diff16(*q0, *p0); + __m128i abs_p1q1 = abs_diff16(*q1, *p1); + + __m128i abs_p1p0 = abs_diff16(*p1, *p0); + __m128i abs_q1q0 = abs_diff16(*q1, *q0); + + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + const __m128i one = _mm_set1_epi16(1); + + __m128i t80; + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + // filter_mask and hev_mask + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + + mask = 
_mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + mask = _mm_max_epi16(flat, mask); + + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); +} + +void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p1p0, q1q0; + __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit, + _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +void aom_highbd_lpf_horizontal_4_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i ps[2], qs[2]; + + highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0, + _thresh0, _blimit1, _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]); + _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]); + _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]); + _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]); +} + +void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i x0, x1, x2, x3, d0, d1, d2, d3; + __m128i p1p0, q1q0; + __m128i p1, q1; + + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + + highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3); + + highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit, + thresh, bd); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + // transpose from 8x4 to 4x8 + highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_4_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i ps[2], qs[2]; + + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = 
_mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p)); + + highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3); + + highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2, + &d3, &d4, &d5, &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} + +void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x3, x2, x1, x0, p0, q0; + __m128i p1p0, q1q0; + + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit, + limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0, p1, q1, p2, q2; + + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1, + &p0, &q0, &q1, &q2, &d6, &d7); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), 
d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} + +void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i p2, p1, p0, p3, q0; + __m128i q1q0, p1p0; + + p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p)); + p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p)); + p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p)); + p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + // Loop filtering + highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, + &p1p0, blimit, limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, + &d1, &d2, &d3); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_8_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3, &d4, &d5, &d6, &d7); + + highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, + blimit0, limit0, thresh0, blimit1, limit1, + thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1, + &x2, &x3, &x4, &x5, &x6, &x7); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3); + _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4); + _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5); + _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6); + _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7); +} + +void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + __m128i q[7], p[7], pq[7]; + __m128i p6, p5, p4, p3; + __m128i p6_2, p5_2, p4_2, p3_2; + __m128i d0, d1, d2, d3; + __m128i d0_2, d1_2, d2_2, d3_2, d7_2; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + + highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4], + &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + + highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, 
&p3_2, &q[0], &q[1], &q[2], + &q[3], &q[4], &q[5], &q[6], &d7_2); + + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); + + highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2], + &pq[1], &pq[0], &d0, &d1, &d2, &d3); + + q[0] = _mm_srli_si128(pq[0], 8); + q[1] = _mm_srli_si128(pq[1], 8); + q[2] = _mm_srli_si128(pq[2], 8); + q[3] = _mm_srli_si128(pq[3], 8); + q[4] = _mm_srli_si128(pq[4], 8); + q[5] = _mm_srli_si128(pq[5], 8); + + highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], + &d7_2, &d0_2, &d1_2, &d2_2, &d3_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2); +} + +void aom_highbd_lpf_vertical_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i q[7], p[7]; + __m128i p6, p5, p4, p3, p2, p1, p0, q0; + __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2; + __m128i d0, d7; + __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch)); + p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch)); + p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch)); + q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6], + &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, + &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5], + &q[6], &d7); + + highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0], + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out); + + highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7, + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + 
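+  // Write the re-transposed q-side half back to the 8 rows starting at s.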
_mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c new file mode 100644 index 0000000000..950465cf46 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i sign = _mm_srai_epi16(*p, 15); + const __m128i dc = _mm_unpacklo_epi16(*p, sign); + const __m128i ac = _mm_unpackhi_epi16(*p, sign); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static INLINE void update_qp(__m256i *qp) { + int i; + for (i = 0; i < 5; ++i) { + qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); + } +} + +static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *dequant_ptr, + const int16_t *quant_shift_ptr, __m256i *qp, + int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); + const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + init_one_qp(&zbin, &qp[0]); + init_one_qp(&round, &qp[1]); + init_one_qp(&quant, &qp[2]); + init_one_qp(&dequant, &qp[3]); + init_one_qp(&quant_shift, &qp[4]); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1))); + qp[0] = _mm256_add_epi32(qp[0], rnd); + qp[0] = _mm256_srai_epi32(qp[0], log_scale); + + qp[1] = _mm256_add_epi32(qp[1], rnd); + qp[1] = _mm256_srai_epi32(qp[1], log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1)); +} + +// Note: +// *x is vector multiplied by *y which is 16 int32_t parallel multiplication +// and right shift 16. The output, 16 int32_t is save in *p. 
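+// A rough scalar equivalent, per 32-bit lane (illustrative only; an __m256i
+// holds eight int32_t lanes):
+//   p[i] = (int32_t)(((int64_t)x[i] * y[i]) >> 16);
+// _mm256_mul_epi32() only multiplies the even lanes, so the odd lanes are
+// shifted down, multiplied separately, and the two halves recombined.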
+static INLINE __m256i mm256_mul_shift_epi32(const __m256i *x, + const __m256i *y) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, 16); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, + __m256i eobmax, + __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +// Get the max eob from the lower 128 bits. +static AOM_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + return (uint16_t)_mm256_extract_epi16(eob, 0); +} + +static AOM_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static AOM_FORCE_INLINE void quantize_logscale( + const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob, int log_scale) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + // (16 - log_scale + AOM_QM_BITS)); + const __m256i abs_q = + mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], log_scale); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), log_scale); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = 
_mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +static AOM_FORCE_INLINE void quantize(const __m256i *qp, + const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + const int step = 8; + + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} + +void aom_highbd_quantize_b_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + const unsigned int step = 8; + + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + + quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} + +void aom_highbd_quantize_b_64x64_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + 
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + const int step = 8; + + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 2); + + quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c new file mode 100644 index 0000000000..3b0c42c4f5 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "config/aom_dsp_rtcd.h" + +void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + __m128i zbins[2]; + __m128i nzbins[2]; + + zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], + (int)zbin_ptr[0]); + zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + (void)scan; + + memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } + + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = 
_mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = + (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; + } + } + } + *eob_ptr = eob_i + 1; +} + +void aom_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_64x64_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm new file mode 100644 index 0000000000..03839b493c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -0,0 +1,344 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_4x2x4 5-6 0 + movh m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + movhps m0, [srcq +%4*2] + movhps m4, [ref1q+%5*2] + movhps m5, [ref2q+%5*2] + movhps m6, [ref3q+%5*2] + movhps m7, [ref4q+%5*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + movu m2, [ref1q+%3*2] + movhps m0, [srcq +%4*2] + movhps m2, [ref1q+%5*2] + mova m3, m0 + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m4, m2 + + movu m2, [ref2q+%3*2] + mova m3, m0 + movhps m2, [ref2q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m5, m2 + + movu m2, [ref3q+%3*2] + mova m3, m0 + movhps m2, [ref3q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m6, m2 + + movu m2, [ref4q+%3*2] + mova m3, m0 + movhps m2, [ref4q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_8x2x4 5-6 0 + ; 1st 8 px + mova m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + mova m3, m0 + movu m2, [ref1q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif + + ; 2nd 8 px + mova m0, [srcq +(%4)*2] + mova m3, m0 + movu m2, [ref1q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endmacro + +; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_16x2x4 5-6 0 + HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) + HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 
+ 8), %6 +%endmacro + +; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_32x2x4 5-6 0 + HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) + HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 +%endmacro + +; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_64x2x4 5-6 0 + HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) + HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 +%endmacro + +; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad +%if AOM_ARCH_X86_64 +cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; AOM_ARCH_X86_64 +%else ; %3 == 2, downsample +%if AOM_ARCH_X86_64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; AOM_ARCH_X86_64 +%endif ; sad/avg/skip + +; set m1 + push srcq + mov srcd, 0x00010001 + movd m1, srcd + pshufd m1, m1, 0x0 + pop srcq + +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + +; convert byte pointers to short pointers + shl srcq, 1 + shl ref2q, 1 + shl ref3q, 1 + shl ref4q, 1 + shl ref1q, 1 + + HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep +%undef rep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + ; N.B. HIGH_PROCESS outputs dwords (32 bits) + ; so in high bit depth even the smallest width (4) needs 128bits i.e. 
XMM + movhlps m0, m4 + movhlps m1, m5 + movhlps m2, m6 + movhlps m3, m7 + paddd m4, m0 + paddd m5, m1 + paddd m6, m2 + paddd m7, m3 + punpckldq m4, m5 + punpckldq m6, m7 + movhlps m0, m4 + movhlps m1, m6 + paddd m4, m0 + paddd m6, m1 + punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif + movifnidn r4, r4mp + movu [r4], m4 + RET +%endmacro + + +INIT_XMM sse2 +HIGH_SADNXN4D 64, 64 +HIGH_SADNXN4D 64, 32 +HIGH_SADNXN4D 32, 64 +HIGH_SADNXN4D 32, 32 +HIGH_SADNXN4D 32, 16 +HIGH_SADNXN4D 16, 32 +HIGH_SADNXN4D 16, 16 +HIGH_SADNXN4D 16, 8 +HIGH_SADNXN4D 8, 16 +HIGH_SADNXN4D 8, 8 +HIGH_SADNXN4D 8, 4 +HIGH_SADNXN4D 4, 8 +HIGH_SADNXN4D 4, 4 +HIGH_SADNXN4D 4, 16 +HIGH_SADNXN4D 16, 4 +HIGH_SADNXN4D 8, 32 +HIGH_SADNXN4D 32, 8 +HIGH_SADNXN4D 16, 64 +HIGH_SADNXN4D 64, 16 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 +HIGH_SADNXN4D 4, 16, 2 +HIGH_SADNXN4D 8, 32, 2 +HIGH_SADNXN4D 32, 8, 2 +HIGH_SADNXN4D 16, 64, 2 +HIGH_SADNXN4D 64, 16, 2 + +; Current code cannot handle the case when the height is downsampled to 2 +; HIGH_SADNXN4D 16, 4, 2 +; HIGH_SADNXN4D 8, 4, 2 +; HIGH_SADNXN4D 4, 4, 2 diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c b/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c new file mode 100644 index 0000000000..6c78eeeefb --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c @@ -0,0 +1,720 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_ports/mem.h" + +// SAD +static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) { + // input 8 32-bit summation + __m128i lo128, hi128; + __m256i u = _mm256_srli_si256(*v, 8); + u = _mm256_add_epi32(u, *v); + + // 4 32-bit summation + hi128 = _mm256_extracti128_si256(u, 1); + lo128 = _mm256_castsi256_si128(u); + lo128 = _mm_add_epi32(hi128, lo128); + + // 2 32-bit summation + hi128 = _mm_srli_si128(lo128, 4); + lo128 = _mm_add_epi32(lo128, hi128); + + return (unsigned int)_mm_cvtsi128_si32(lo128); +} + +static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r, + __m256i *sad_acc) { + const __m256i zero = _mm256_setzero_si256(); + int i; + for (i = 0; i < 4; i++) { + s[i] = _mm256_sub_epi16(s[i], r[i]); + s[i] = _mm256_abs_epi16(s[i]); + } + + s[0] = _mm256_add_epi16(s[0], s[1]); + s[0] = _mm256_add_epi16(s[0], s[2]); + s[0] = _mm256_add_epi16(s[0], s[3]); + + r[0] = _mm256_unpacklo_epi16(s[0], zero); + r[1] = _mm256_unpackhi_epi16(s[0], zero); + + r[0] = _mm256_add_epi32(r[0], r[1]); + *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); +} + +// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. 
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + if (sec_ptr) { + r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + } + highbd_sad16x4_core_avx2(s, r, sad_acc); +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + int i; + __m256i sad = _mm256_setzero_si256(); + for (i = 0; i < N; i += 4) { + sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad); + src_ptr += src_stride << 2; + ref_ptr += ref_stride << 2; + } + return (unsigned int)get_sad_from_mm256_epi32(&sad); +} + +static void sad32x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + int row_sections = 0; + + while (row_sections < 2) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 32 << 1; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + + row_sections += 1; + src_ptr += src_stride << 1; + ref_ptr += ref_stride << 1; + } +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 2; + int i; + + for (i = 0; i < N; i += 4) { + sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + } + return get_sad_from_mm256_epi32(&sad); +} + +static void sad64x2(const uint16_t *src_ptr, int src_stride, + 
const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 1; + int i; + for (i = 0; i < N; i += 2) { + sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + } + return get_sad_from_mm256_epi32(&sad); +} + +static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += 64; + ref_ptr += 64; + } +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + int row = 0; + while (row < N) { + sad128x1(srcp, refp, NULL, &sad); + srcp += src_stride; + refp += ref_stride; + row++; + } + return get_sad_from_mm256_epi32(&sad); +} + +#define HIGHBD_SADMXN_AVX2(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \ + } + +#define HIGHBD_SAD_SKIP_MXN_AVX2(m, n) \ + unsigned int 
aom_highbd_sad_skip_##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * aom_highbd_sad##m##xN_avx2((n / 2), src, 2 * src_stride, ref, \ + 2 * ref_stride); \ + } + +HIGHBD_SADMXN_AVX2(16, 4) +HIGHBD_SADMXN_AVX2(16, 8) +HIGHBD_SADMXN_AVX2(16, 16) +HIGHBD_SADMXN_AVX2(16, 32) +HIGHBD_SADMXN_AVX2(16, 64) + +HIGHBD_SADMXN_AVX2(32, 8) +HIGHBD_SADMXN_AVX2(32, 16) +HIGHBD_SADMXN_AVX2(32, 32) +HIGHBD_SADMXN_AVX2(32, 64) + +HIGHBD_SADMXN_AVX2(64, 16) +HIGHBD_SADMXN_AVX2(64, 32) +HIGHBD_SADMXN_AVX2(64, 64) +HIGHBD_SADMXN_AVX2(64, 128) + +HIGHBD_SADMXN_AVX2(128, 64) +HIGHBD_SADMXN_AVX2(128, 128) + +HIGHBD_SAD_SKIP_MXN_AVX2(16, 8) +HIGHBD_SAD_SKIP_MXN_AVX2(16, 16) +HIGHBD_SAD_SKIP_MXN_AVX2(16, 32) +HIGHBD_SAD_SKIP_MXN_AVX2(16, 64) + +HIGHBD_SAD_SKIP_MXN_AVX2(32, 8) +HIGHBD_SAD_SKIP_MXN_AVX2(32, 16) +HIGHBD_SAD_SKIP_MXN_AVX2(32, 32) +HIGHBD_SAD_SKIP_MXN_AVX2(32, 64) + +HIGHBD_SAD_SKIP_MXN_AVX2(64, 16) +HIGHBD_SAD_SKIP_MXN_AVX2(64, 32) +HIGHBD_SAD_SKIP_MXN_AVX2(64, 64) +HIGHBD_SAD_SKIP_MXN_AVX2(64, 128) + +HIGHBD_SAD_SKIP_MXN_AVX2(128, 64) +HIGHBD_SAD_SKIP_MXN_AVX2(128, 128) + +unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + + // Next 4 rows + srcp += src_stride << 2; + refp += ref_stride << 2; + secp += 64; + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 3; + uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x32_avg_avx2(src, 
src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 2) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 4) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 8) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 16) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + 
} + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 6; + uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + int row = 0; + while (row < 64) { + sad128x1(srcp, refp, secp, &sad); + srcp += src_stride; + refp += ref_stride; + secp += 16 << 3; + row += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + unsigned int sum; + const int left_shift = 6; + + sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 128 << left_shift; + sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +// SAD 4D +// Combine 4 __m256i input vectors v to uint32_t result[4] +static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, + uint32_t *res) { + __m256i u0, u1, u2, u3; + const __m256i mask = yy_set1_64_from_32i(~0); + __m128i sad; + + // 8 32-bit summation + u0 = _mm256_srli_si256(v[0], 4); + u1 = _mm256_srli_si256(v[1], 4); + u2 = _mm256_srli_si256(v[2], 4); + u3 = _mm256_srli_si256(v[3], 4); + + u0 = _mm256_add_epi32(u0, v[0]); + u1 = _mm256_add_epi32(u1, v[1]); + u2 = _mm256_add_epi32(u2, v[2]); + u3 = _mm256_add_epi32(u3, v[3]); + + u0 = _mm256_and_si256(u0, mask); + u1 = _mm256_and_si256(u1, mask); + u2 = _mm256_and_si256(u2, mask); + u3 = _mm256_and_si256(u3, mask); + // 4 32-bit summation, evenly positioned + + u1 = _mm256_slli_si256(u1, 4); + u3 = _mm256_slli_si256(u3, 4); + + u0 = _mm256_or_si256(u0, u1); + u2 = _mm256_or_si256(u2, u3); + // 8 32-bit summation, interleaved + + u1 = _mm256_unpacklo_epi64(u0, u2); + u3 = _mm256_unpackhi_epi64(u0, u2); + + u0 = _mm256_add_epi32(u1, u3); + sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1), + _mm256_castsi256_si128(u0)); + _mm_storeu_si128((__m128i *)res, sad); +} + +static void convert_pointers(const uint8_t *const ref8[], + const uint16_t *ref[]) { + ref[0] = CONVERT_TO_SHORTPTR(ref8[0]); + ref[1] = CONVERT_TO_SHORTPTR(ref8[1]); + ref[2] = CONVERT_TO_SHORTPTR(ref8[2]); + ref[3] = CONVERT_TO_SHORTPTR(ref8[3]); +} + +static void init_sad(__m256i *s) { + s[0] = _mm256_setzero_si256(); + s[1] = 
_mm256_setzero_si256(); + s[2] = _mm256_setzero_si256(); + s[3] = _mm256_setzero_si256(); +} + +static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2( + int M, int N, int D, const uint8_t *src, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_rows = (M < 128) + (M < 64); + const int row_units = 1 << shift_for_rows; + int i, r; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < D; ++i) { + srcp = keep; + for (r = 0; r < N; r += row_units) { + if (M == 128) { + sad128x1(srcp, refp[i], NULL, &sad_vec[i]); + } else if (M == 64) { + sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); + } else if (M == 32) { + sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + } else if (M == 16) { + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + } else { + assert(0); + } + srcp += src_stride << shift_for_rows; + refp[i] += ref_stride << shift_for_rows; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +#define HIGHBD_SAD_MXNX4D_AVX2(m, n) \ + void aom_highbd_sad##m##x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \ + sad_array); \ + } +#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n) \ + void aom_highbd_sad_skip_##m##x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \ + 2 * ref_stride, sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } +#define HIGHBD_SAD_MXNX3D_AVX2(m, n) \ + void aom_highbd_sad##m##x##n##x3d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \ + sad_array); \ + } + +HIGHBD_SAD_MXNX4D_AVX2(16, 4) +HIGHBD_SAD_MXNX4D_AVX2(16, 8) +HIGHBD_SAD_MXNX4D_AVX2(16, 16) +HIGHBD_SAD_MXNX4D_AVX2(16, 32) +HIGHBD_SAD_MXNX4D_AVX2(16, 64) + +HIGHBD_SAD_MXNX4D_AVX2(32, 8) +HIGHBD_SAD_MXNX4D_AVX2(32, 16) +HIGHBD_SAD_MXNX4D_AVX2(32, 32) +HIGHBD_SAD_MXNX4D_AVX2(32, 64) + +HIGHBD_SAD_MXNX4D_AVX2(64, 16) +HIGHBD_SAD_MXNX4D_AVX2(64, 32) +HIGHBD_SAD_MXNX4D_AVX2(64, 64) +HIGHBD_SAD_MXNX4D_AVX2(64, 128) + +HIGHBD_SAD_MXNX4D_AVX2(128, 64) +HIGHBD_SAD_MXNX4D_AVX2(128, 128) + +HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 8) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 16) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 32) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64) + +HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 8) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 16) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 32) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 64) + +HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 32) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 64) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128) + +HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64) +HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128) + +HIGHBD_SAD_MXNX3D_AVX2(16, 4) +HIGHBD_SAD_MXNX3D_AVX2(16, 8) +HIGHBD_SAD_MXNX3D_AVX2(16, 16) +HIGHBD_SAD_MXNX3D_AVX2(16, 32) +HIGHBD_SAD_MXNX3D_AVX2(16, 64) + +HIGHBD_SAD_MXNX3D_AVX2(32, 8) +HIGHBD_SAD_MXNX3D_AVX2(32, 16) +HIGHBD_SAD_MXNX3D_AVX2(32, 32) +HIGHBD_SAD_MXNX3D_AVX2(32, 64) + +HIGHBD_SAD_MXNX3D_AVX2(64, 16) 
+HIGHBD_SAD_MXNX3D_AVX2(64, 32) +HIGHBD_SAD_MXNX3D_AVX2(64, 64) +HIGHBD_SAD_MXNX3D_AVX2(64, 128) + +HIGHBD_SAD_MXNX3D_AVX2(128, 64) +HIGHBD_SAD_MXNX3D_AVX2(128, 128) diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm new file mode 100644 index 0000000000..3dc4e4e0a2 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm @@ -0,0 +1,524 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows +; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7 +%macro HIGH_SAD_FN 4-5 7 +%if %4 == 0 +%if %3 == 5 +cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%elif %4 == 1 ; avg +%if %3 == 5 +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if AOM_ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +; convert src, ref & second_pred to short ptrs (from byte ptrs) + shl srcq, 1 + shl refq, 1 +%if %4 == 1 + shl second_predq, 1 +%endif +%endmacro + +; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD64XN 1-2 0 + HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else + mov n_rowsd, %1 +%endif + pxor m0, m0 + pxor m6, m6 + +.loop: + ; first half of each row + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, 
m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + paddd m0, m1 + paddd m0, m3 + ; second half of each row + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq+64] + psubusw m5, m1 + psubusw m1, [srcq+64] + por m1, m5 + mova m5, [srcq+80] + psubusw m5, m2 + psubusw m2, [srcq+80] + por m2, m5 + mova m5, [srcq+96] + psubusw m5, m3 + psubusw m3, [srcq+96] + por m3, m5 + mova m5, [srcq+112] + psubusw m5, m4 + psubusw m4, [srcq+112] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 +HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 +HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2 +HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 +HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 +HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2 + +; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD32XN 1-2 0 + HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else + mov n_rowsd, %1 +%endif + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 +HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 +HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 +HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2 +HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 +HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 +HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 +HIGH_SAD32XN 8, 2 ; highbd_sad_skip_32x8_sse2 + +; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD16XN 1-2 0 + HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else + mov n_rowsd, %1/2 +%endif + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_strideq*2+16] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+16] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*2+16] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*2] + por m3, m5 + mova m5, [srcq+src_strideq*2+16] + psubusw m5, m4 + psubusw m4, [srcq+src_strideq*2+16] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2 +HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 +HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 +HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 +HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 +HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2 +HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 +HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 +HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 +HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 +HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2 +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 +; Current code fails there are only 2 rows +; HIGH_SAD16XN 4, 2 ; highbd_sad_skip_16x4_sse2 + +; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD8XN 1-2 0 + HIGH_SAD_FN 8, %1, 7, %2, 8 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else + mov n_rowsd, %1/4 +%endif + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq*2] + movu m3, [refq+ref_strideq*4] + movu m4, [refq+ref_stride3q*2] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m7, m1 + movu m5, [srcq] + psubusw m1, m5 + psubusw m5, m7 + por m1, m5 + + mova m7, m2 + movu m5, [srcq+src_strideq*2] + psubusw m2, m5 + psubusw m5, m7 + por m2, m5 + + mova m7, m3 + movu m5, [srcq+src_strideq*4] + psubusw m3, m5 + psubusw m5, m7 + por m3, m5 + + mova m7, m4 + movu m5, [srcq+src_stride3q*2] + psubusw m4, m5 + psubusw m5, m7 + por m4, m5 + + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m3 + dec 
n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2 +HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 +HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 +HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 +HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2 +HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 +HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 +HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 +; Current code fails there are only 2 rows +; HIGH_SAD8XN 4, 2 ; highbd_sad8x4_avg_sse2 + +; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD4XN 1-2 0 + HIGH_SAD_FN 4, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else + mov n_rowsd, %1/4 +%endif + pxor m0, m0 + pxor m6, m6 + +.loop: + movq m1, [refq] + movq m2, [refq+ref_strideq*2] + movq m3, [refq+ref_strideq*4] + movq m4, [refq+ref_stride3q*2] + punpcklwd m1, m3 + punpcklwd m2, m4 +%if %2 == 1 + movq m3, [second_predq+8*0] + movq m5, [second_predq+8*2] + punpcklwd m3, m5 + movq m4, [second_predq+8*1] + movq m5, [second_predq+8*3] + punpcklwd m4, m5 + lea second_predq, [second_predq+8*4] + pavgw m1, m3 + pavgw m2, m4 +%endif + movq m5, [srcq] + movq m3, [srcq+src_strideq*4] + punpcklwd m5, m3 + movdqa m3, m1 + psubusw m1, m5 + psubusw m5, m3 + por m1, m5 + movq m5, [srcq+src_strideq*2] + movq m4, [srcq+src_stride3q*2] + punpcklwd m5, m4 + movdqa m4, m2 + psubusw m2, m5 + psubusw m5, m4 + por m2, m5 + paddw m1, m2 + movdqa m2, m1 + punpcklwd m1, m6 + punpckhwd m2, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD4XN 16 ; highbd_sad4x16_sse2 +HIGH_SAD4XN 8 ; highbd_sad4x8_sse2 +HIGH_SAD4XN 4 ; highbd_sad4x4_sse2 +HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2 +HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2 +HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2 +HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2 +HIGH_SAD4XN 8, 2 ; highbd_sad_skip_4x8_sse2 +; Current code fails there are only 2 rows +; HIGH_SAD4XN 4, 2 ; highbd_sad_skip_4x4_sse2 diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm new file mode 100644 index 0000000000..c0ccc182b4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -0,0 +1,1024 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + mova %4, %3 ; make copies to manipulate to calc sum + mova %2, %1 ; use originals for calc sse + pmaddwd %3, %3 + paddw %4, %2 + pmaddwd %1, %1 + movhlps %2, %4 + paddd %6, %3 + paddw %4, %2 + pxor %2, %2 + pcmpgtw %2, %4 ; mask for 0 > %4 (sum) + punpcklwd %4, %2 ; sign-extend word to dword + paddd %6, %1 + paddd %5, %4 + +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + movhlps m3, m7 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + pshufd m4, m6, 0x1 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + movd [r1], m7 ; store sse + movd eax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp + add srcq, src_stridemp +%else + lea srcq, [srcq + src_strideq*2] +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 + + +%if AOM_ARCH_X86_64 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse + %define block_height 
heightd + %endif + + %define bilin_filter bilin_filter_m + %endif +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + +%if %1 < 16 + sar block_height, 1 +%endif +%if %2 == 1 ; avg + shl sec_str, 1 +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq + 16] + mova m1, [dstq] + mova m3, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+16] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq + src_strideq*2] + mova m1, [dstq] + mova m3, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pavgw m0, m1 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. 
It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m4, m1 + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + mova m2, [dstq] + mova m3, [dstq + 16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + mova m2, [dstq] + mova m3, [dstq + dst_strideq*2] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m1, m3 + mova m4, [dstq] + mova m5, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m2, m3 + mova m4, [dstq] + mova m5, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea 
srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86_32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m1, filter_rnd + paddw m1, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m1, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m4, filter_rnd + paddw m4, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m4, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0. We can reuse y_offset reg. 
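+; Note (editorial): in PIC builds the filter table address is not usable as an
+; absolute GLOBAL() reference here, so it is taken from g_bilin_filterm:
+; adding it to the already-scaled x_offset turns x_offsetq into a direct
+; pointer to the selected tap pair, and the freed y_offset register (tempq)
+; is pointed at the pw_8 rounding constant via g_pw_8m.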
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + mova m4, [dstq] + mova m5, [dstq+16] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m2, [srcq+2] + movu m3, [srcq+src_strideq*2+2] + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+16] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m1, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m2, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +; loading filter - this is same as in 8-bit depth +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [bilin_filter+y_offsetq] + mova m11, [bilin_filter+y_offsetq+16] + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. 
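+; Note (editorial): both offsets are scaled by filter_idx_shift (5), so each
+; selects a 32-byte table entry: eight copies of the first tap at [offsetq]
+; and eight of the second tap at [offsetq+16], consistent with the pw_8 round
+; and >>4 shift in the loop below (the taps sum to 16). Because src_strideq is
+; hijacked as the scratch pointer in this branch, the loop advances srcq with
+; INC_SRC_BY_SRC_STRIDE, which re-reads the stride from the stack, instead of
+; an lea using src_strideq.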
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif +; end of load filter + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq+2] + movu m1, [srcq+16] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, [srcq+16] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m1, filter_rnd + mova m2, [dstq] + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + INC_SRC_BY_SRC_STRIDE + movu m3, [srcq] + movu m5, [srcq+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m4, filter_rnd + mova m2, [dstq] + paddw m4, m3 + psrlw m0, 4 + psrlw m4, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c new file mode 100644 index 0000000000..3c3253bdf9 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, + ptrdiff_t pred_stride); + +static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); +} + +static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadl_epi64((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadl_epi64((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadl_epi64((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadl_epi64((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = _mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, 
v7); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); + store_diff = (int64_t *)(diff + 4 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x4); + store_diff = (int64_t *)(diff + 5 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x5); + store_diff = (int64_t *)(diff + 6 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x6); + store_diff = (int64_t *)(diff + 7 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x7); +} + +static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); +} + +static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = _mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, v7); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + 
_mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); + _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); + _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6); + _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); +} + +#define STACK_V(h, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \ + pred + pred_stride * h, pred_stride); \ + } while (0) + +#define STACK_H(w, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \ + } while (0) + +#define SUBTRACT_FUN(size) \ + static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \ + const uint16_t *src, ptrdiff_t src_stride, \ + const uint16_t *pred, ptrdiff_t pred_stride) + +SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); } +SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); } +SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); } +SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); } +SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } +SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } +SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } +SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } +SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } +SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } +SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } +SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } +SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } +SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } +SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } +SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } +SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } +SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } + +static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { + if (rows == 4) { + if (cols == 4) return subtract_4x4; + if (cols == 8) return subtract_8x4; + if (cols == 16) return subtract_16x4; + } + if (rows == 8) { + if (cols == 4) return subtract_4x8; + if (cols == 8) return subtract_8x8; + if (cols == 16) return subtract_16x8; + if (cols == 32) return subtract_32x8; + } + if (rows == 16) { + if (cols == 4) return subtract_4x16; + if (cols == 8) return subtract_8x16; + if (cols == 16) return subtract_16x16; + if (cols == 32) return subtract_32x16; + if (cols == 64) return subtract_64x16; + } + if (rows == 32) { + if (cols == 8) return subtract_8x32; + if (cols == 16) return subtract_16x32; + if (cols == 32) return subtract_32x32; + if (cols == 64) return subtract_64x32; + } + if (rows == 64) { + if (cols == 16) return subtract_16x64; + if (cols == 32) return subtract_32x64; + if (cols == 64) return subtract_64x64; + if (cols == 128) return subtract_128x64; + } + if (rows == 128) { + if (cols == 64) return subtract_64x128; + if (cols == 128) return subtract_128x128; + } + assert(0); + return NULL; +} + +void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + SubtractWxHFuncType func; + + func = getSubtractFunc(rows, cols); + func(diff, diff_stride, src, src_stride, pred, pred_stride); 
+} diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c new file mode 100644 index 0000000000..b4ff91d856 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c @@ -0,0 +1,904 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // AVX2 + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" + +typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +static uint32_t aom_highbd_var_filter_block2d_bil_avx2( + const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8, + int dst_stride, uint32_t *sse) { + const __m256i filter1 = + _mm256_set1_epi32((int)(bilinear_filters_2t[xoffset][1] << 16) | + bilinear_filters_2t[xoffset][0]); + const __m256i filter2 = + _mm256_set1_epi32((int)(bilinear_filters_2t[yoffset][1] << 16) | + bilinear_filters_2t[yoffset][0]); + const __m256i one = _mm256_set1_epi16(1); + const int bitshift = 0x40; + (void)pixel_step; + unsigned int i, j, prev = 0, curr = 2; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8); + uint16_t *src_ptr_ref = src_ptr; + uint16_t *dst_ptr_ref = dst_ptr; + int64_t sum_long = 0; + uint64_t sse_long = 0; + unsigned int rshift = 0, inc = 1; + __m256i rbias = _mm256_set1_epi32(bitshift); + __m256i opointer[8]; + unsigned int range; + if (xoffset == 0) { + if (yoffset == 0) { // xoffset==0 && yoffset==0 + range = output_width / 16; + if (output_height == 8) inc = 2; + if (output_height == 4) inc = 4; + for (j = 0; j < range * output_height * inc / 16; j++) { + if (j % (output_height * inc / 16) == 0) { + src_ptr = src_ptr_ref; + src_ptr_ref += 16; + dst_ptr = dst_ptr_ref; + dst_ptr_ref += 16; + } + __m256i sum1 = _mm256_setzero_si256(); + __m256i sse1 = _mm256_setzero_si256(); + for (i = 0; i < 16 / inc; ++i) { + __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr); + src_ptr += src_pixels_per_line; + __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); + dst_ptr += dst_stride; + + __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); + __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); + + sum1 = _mm256_add_epi16(sum1, V_R_SUB); + sse1 = _mm256_add_epi32(sse1, V_R_MAD); + } + + __m256i v_sum0 = _mm256_madd_epi16(sum1, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + sum_long += _mm_extract_epi32(v_d, 0); + sse_long += _mm_extract_epi32(v_d, 1); + } + + rshift = 
get_msb(output_height) + get_msb(output_width); + + } else if (yoffset == 4) { // xoffset==0 && yoffset==4 + range = output_width / 16; + if (output_height == 8) inc = 2; + if (output_height == 4) inc = 4; + for (j = 0; j < range * output_height * inc / 16; j++) { + if (j % (output_height * inc / 16) == 0) { + src_ptr = src_ptr_ref; + src_ptr_ref += 16; + dst_ptr = dst_ptr_ref; + dst_ptr_ref += 16; + + opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + src_ptr += src_pixels_per_line; + curr = 0; + } + + __m256i sum1 = _mm256_setzero_si256(); + __m256i sse1 = _mm256_setzero_si256(); + + for (i = 0; i < 16 / inc; ++i) { + prev = curr; + curr = (curr == 0) ? 1 : 0; + opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr); + src_ptr += src_pixels_per_line; + + __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]); + + __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); + dst_ptr += dst_stride; + __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); + __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); + sum1 = _mm256_add_epi16(sum1, V_R_SUB); + sse1 = _mm256_add_epi32(sse1, V_R_MAD); + } + + __m256i v_sum0 = _mm256_madd_epi16(sum1, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + sum_long += _mm_extract_epi32(v_d, 0); + sse_long += _mm_extract_epi32(v_d, 1); + } + + rshift = get_msb(output_height) + get_msb(output_width); + + } else { // xoffset==0 && yoffset==1,2,3,5,6,7 + range = output_width / 16; + if (output_height == 8) inc = 2; + if (output_height == 4) inc = 4; + for (j = 0; j < range * output_height * inc / 16; j++) { + if (j % (output_height * inc / 16) == 0) { + src_ptr = src_ptr_ref; + src_ptr_ref += 16; + dst_ptr = dst_ptr_ref; + dst_ptr_ref += 16; + + opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + src_ptr += src_pixels_per_line; + curr = 0; + } + + __m256i sum1 = _mm256_setzero_si256(); + __m256i sse1 = _mm256_setzero_si256(); + + for (i = 0; i < 16 / inc; ++i) { + prev = curr; + curr = (curr == 0) ? 
1 : 0; + opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr); + src_ptr += src_pixels_per_line; + + __m256i V_S_M1 = + _mm256_unpacklo_epi16(opointer[prev], opointer[curr]); + __m256i V_S_M2 = + _mm256_unpackhi_epi16(opointer[prev], opointer[curr]); + + __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2); + __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2); + + __m256i V_S_S1 = + _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7); + __m256i V_S_S2 = + _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7); + + __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2); + + __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); + dst_ptr += dst_stride; + + __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); + __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); + + sum1 = _mm256_add_epi16(sum1, V_R_SUB); + sse1 = _mm256_add_epi32(sse1, V_R_MAD); + } + + __m256i v_sum0 = _mm256_madd_epi16(sum1, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + sum_long += _mm_extract_epi32(v_d, 0); + sse_long += _mm_extract_epi32(v_d, 1); + } + + rshift = get_msb(output_height) + get_msb(output_width); + } + } else if (xoffset == 4) { + if (yoffset == 0) { // xoffset==4 && yoffset==0 + range = output_width / 16; + if (output_height == 8) inc = 2; + if (output_height == 4) inc = 4; + for (j = 0; j < range * output_height * inc / 16; j++) { + if (j % (output_height * inc / 16) == 0) { + src_ptr = src_ptr_ref; + src_ptr_ref += 16; + dst_ptr = dst_ptr_ref; + dst_ptr_ref += 16; + __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + + opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2); + + curr = 0; + } + + __m256i sum1 = _mm256_setzero_si256(); + __m256i sse1 = _mm256_setzero_si256(); + + for (i = 0; i < 16 / inc; ++i) { + prev = curr; + curr = (curr == 0) ? 
1 : 0; + __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + + opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2); + + __m256i V_S_M1 = + _mm256_unpacklo_epi16(opointer[prev], opointer[curr]); + __m256i V_S_M2 = + _mm256_unpackhi_epi16(opointer[prev], opointer[curr]); + + __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2); + __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2); + + __m256i V_S_S1 = + _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7); + __m256i V_S_S2 = + _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7); + + __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2); + + __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); + dst_ptr += dst_stride; + + __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); + __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); + + sum1 = _mm256_add_epi16(sum1, V_R_SUB); + sse1 = _mm256_add_epi32(sse1, V_R_MAD); + } + + __m256i v_sum0 = _mm256_madd_epi16(sum1, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + sum_long += _mm_extract_epi32(v_d, 0); + sse_long += _mm_extract_epi32(v_d, 1); + } + + rshift = get_msb(output_height) + get_msb(output_width); + + } else if (yoffset == 4) { // xoffset==4 && yoffset==4 + range = output_width / 16; + if (output_height == 8) inc = 2; + if (output_height == 4) inc = 4; + for (j = 0; j < range * output_height * inc / 16; j++) { + if (j % (output_height * inc / 16) == 0) { + src_ptr = src_ptr_ref; + src_ptr_ref += 16; + dst_ptr = dst_ptr_ref; + dst_ptr_ref += 16; + + __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2); + curr = 0; + } + + __m256i sum1 = _mm256_setzero_si256(); + __m256i sse1 = _mm256_setzero_si256(); + + for (i = 0; i < 16 / inc; ++i) { + prev = curr; + curr = (curr == 0) ? 
1 : 0; + __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2); + __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]); + + __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); + dst_ptr += dst_stride; + __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); + __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); + sum1 = _mm256_add_epi16(sum1, V_R_SUB); + sse1 = _mm256_add_epi32(sse1, V_R_MAD); + } + + __m256i v_sum0 = _mm256_madd_epi16(sum1, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + sum_long += _mm_extract_epi32(v_d, 0); + sse_long += _mm_extract_epi32(v_d, 1); + } + + rshift = get_msb(output_height) + get_msb(output_width); + + } else { // xoffset==4 && yoffset==1,2,3,5,6,7 + range = output_width / 16; + if (output_height == 8) inc = 2; + if (output_height == 4) inc = 4; + for (j = 0; j < range * output_height * inc / 16; j++) { + if (j % (output_height * inc / 16) == 0) { + src_ptr = src_ptr_ref; + src_ptr_ref += 16; + dst_ptr = dst_ptr_ref; + dst_ptr_ref += 16; + + __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2); + curr = 0; + } + + __m256i sum1 = _mm256_setzero_si256(); + __m256i sse1 = _mm256_setzero_si256(); + + for (i = 0; i < 16 / inc; ++i) { + prev = curr; + curr = (curr == 0) ? 
1 : 0; + __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2); + + __m256i V_S_M1 = + _mm256_unpacklo_epi16(opointer[prev], opointer[curr]); + __m256i V_S_M2 = + _mm256_unpackhi_epi16(opointer[prev], opointer[curr]); + + __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2); + __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2); + + __m256i V_S_S1 = + _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7); + __m256i V_S_S2 = + _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7); + + __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2); + + __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); + dst_ptr += dst_stride; + + __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); + __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); + + sum1 = _mm256_add_epi16(sum1, V_R_SUB); + sse1 = _mm256_add_epi32(sse1, V_R_MAD); + } + + __m256i v_sum0 = _mm256_madd_epi16(sum1, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + sum_long += _mm_extract_epi32(v_d, 0); + sse_long += _mm_extract_epi32(v_d, 1); + } + + rshift = get_msb(output_height) + get_msb(output_width); + } + } else if (yoffset == 0) { // xoffset==1,2,3,5,6,7 && yoffset==0 + range = output_width / 16; + if (output_height == 8) inc = 2; + if (output_height == 4) inc = 4; + for (j = 0; j < range * output_height * inc / 16; j++) { + if (j % (output_height * inc / 16) == 0) { + src_ptr = src_ptr_ref; + src_ptr_ref += 16; + dst_ptr = dst_ptr_ref; + dst_ptr_ref += 16; + + curr = 0; + } + + __m256i sum1 = _mm256_setzero_si256(); + __m256i sse1 = _mm256_setzero_si256(); + + for (i = 0; i < 16 / inc; ++i) { + __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2); + __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2); + __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1); + __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1); + __m256i V_V_S1 = + _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7); + __m256i V_V_S2 = + _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7); + opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2); + + __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); + dst_ptr += dst_stride; + __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST); + __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); + + sum1 = _mm256_add_epi16(sum1, V_R_SUB); + sse1 = _mm256_add_epi32(sse1, V_R_MAD); + } + + __m256i v_sum0 = _mm256_madd_epi16(sum1, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + sum_long += _mm_extract_epi32(v_d, 0); + sse_long += _mm_extract_epi32(v_d, 1); + } + + rshift = 
get_msb(output_height) + get_msb(output_width); + + } else if (yoffset == 4) { // xoffset==1,2,3,5,6,7 && yoffset==4 + + range = output_width / 16; + if (output_height == 8) inc = 2; + if (output_height == 4) inc = 4; + for (j = 0; j < range * output_height * inc / 16; j++) { + if (j % (output_height * inc / 16) == 0) { + src_ptr = src_ptr_ref; + src_ptr_ref += 16; + dst_ptr = dst_ptr_ref; + dst_ptr_ref += 16; + + __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + + __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2); + __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2); + + __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1); + __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1); + + __m256i V_H_S1 = + _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7); + __m256i V_H_S2 = + _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7); + + opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2); + + curr = 0; + } + + __m256i sum1 = _mm256_setzero_si256(); + __m256i sse1 = _mm256_setzero_si256(); + + for (i = 0; i < 16 / inc; ++i) { + prev = curr; + curr = (curr == 0) ? 1 : 0; + __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2); + __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2); + __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1); + __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1); + __m256i V_V_S1 = + _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7); + __m256i V_V_S2 = + _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7); + opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2); + + __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]); + + __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); + dst_ptr += dst_stride; + + __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); + __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); + + sum1 = _mm256_add_epi16(sum1, V_R_SUB); + sse1 = _mm256_add_epi32(sse1, V_R_MAD); + } + + __m256i v_sum0 = _mm256_madd_epi16(sum1, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + sum_long += _mm_extract_epi32(v_d, 0); + sse_long += _mm_extract_epi32(v_d, 1); + } + + rshift = get_msb(output_height) + get_msb(output_width); + + } else { // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7 + range = output_width / 16; + if (output_height == 8) inc = 2; + if (output_height == 4) inc = 4; + unsigned int nloop = 16 / inc; + for (j = 0; j < range * output_height * inc / 16; j++) { + if (j % (output_height * inc / 16) == 0) { + src_ptr = src_ptr_ref; + src_ptr_ref += 16; + dst_ptr = dst_ptr_ref; + dst_ptr_ref += 16; + + __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + + __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2); + __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2); + + __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1); + __m256i V_H_MAD2 = 
_mm256_madd_epi16(V_H_M2, filter1); + + __m256i V_H_S1 = + _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7); + __m256i V_H_S2 = + _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7); + + opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2); + + curr = 0; + } + + __m256i sum1 = _mm256_setzero_si256(); + __m256i sse1 = _mm256_setzero_si256(); + + for (i = 0; i < nloop; ++i) { + prev = curr; + curr = !curr; + __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1)); + src_ptr += src_pixels_per_line; + __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2); + __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2); + __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1); + __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1); + __m256i V_V_S1 = + _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7); + __m256i V_V_S2 = + _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7); + opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2); + + __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]); + __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]); + + __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2); + __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2); + + __m256i V_S_S1 = + _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7); + __m256i V_S_S2 = + _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7); + + __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2); + + __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr); + dst_ptr += dst_stride; + + __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST); + __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB); + + sum1 = _mm256_add_epi16(sum1, V_R_SUB); + sse1 = _mm256_add_epi32(sse1, V_R_MAD); + } + + __m256i v_sum0 = _mm256_madd_epi16(sum1, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + sum_long += _mm_extract_epi32(v_d, 0); + sse_long += _mm_extract_epi32(v_d, 1); + } + + rshift = get_msb(output_height) + get_msb(output_width); + } + + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); + + int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift); + + return (var > 0) ? 
var : 0; +} + +void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + for (int i = 0; i < 8; i += 2) { + const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src); + const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref); + const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride)); + __m256i v_p_a = _mm256_castsi128_si256(v_p_a0); + __m256i v_p_b = _mm256_castsi128_si256(v_p_b0); + v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1); + v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride * 2; + ref += ref_stride * 2; + } + __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d)); + __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1)); + __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + for (int i = 0; i < 16; ++i) { + const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src); + const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride; + ref += ref_stride; + } + __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + 
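+/* VAR_FN stamps out the 10-bit variance kernels: the block is tiled into
+ * block_size x block_size calls of the AVX2 helpers above, the accumulated
+ * sum and SSE are pre-normalized for 10-bit input in highbd_10_variance_avx2
+ * (sum >> 2, sse >> 4), and the variance is then sse - sum^2 / (w * h), with
+ * the division written as a right shift by 'shift' = log2(w * h)
+ * (e.g. 128x128 -> 14). */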
+#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(128, 128, 16, 14) +VAR_FN(128, 64, 16, 13) +VAR_FN(64, 128, 16, 13) +VAR_FN(64, 64, 16, 12) +VAR_FN(64, 32, 16, 11) +VAR_FN(32, 64, 16, 11) +VAR_FN(32, 32, 16, 10) +VAR_FN(32, 16, 16, 9) +VAR_FN(16, 32, 16, 9) +VAR_FN(16, 16, 16, 8) +VAR_FN(16, 8, 8, 7) +VAR_FN(8, 16, 8, 7) +VAR_FN(8, 8, 8, 6) + +#if !CONFIG_REALTIME_ONLY +VAR_FN(16, 64, 16, 10) +VAR_FN(32, 8, 8, 8) +VAR_FN(64, 16, 16, 10) +VAR_FN(8, 32, 8, 8) +#endif // !CONFIG_REALTIME_ONLY + +#undef VAR_FN + +#define SSE2_HEIGHT(H) \ + uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr); + +SSE2_HEIGHT(8) +SSE2_HEIGHT(16) + +#undef SSE2_Height + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + if (W == 8 && H == 16) \ + return aom_highbd_10_sub_pixel_variance8x16_sse2( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \ + else if (W == 8 && H == 8) \ + return aom_highbd_10_sub_pixel_variance8x8_sse2( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \ + else \ + return aom_highbd_var_filter_block2d_bil_avx2( \ + src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \ + } + +HIGHBD_SUBPIX_VAR(128, 128) +HIGHBD_SUBPIX_VAR(128, 64) +HIGHBD_SUBPIX_VAR(64, 128) +HIGHBD_SUBPIX_VAR(64, 64) +HIGHBD_SUBPIX_VAR(64, 32) +HIGHBD_SUBPIX_VAR(32, 64) +HIGHBD_SUBPIX_VAR(32, 32) +HIGHBD_SUBPIX_VAR(32, 16) +HIGHBD_SUBPIX_VAR(16, 32) +HIGHBD_SUBPIX_VAR(16, 16) +HIGHBD_SUBPIX_VAR(16, 8) +HIGHBD_SUBPIX_VAR(8, 16) +HIGHBD_SUBPIX_VAR(8, 8) + +#undef HIGHBD_SUBPIX_VAR + +uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { + uint64_t sum = 0; + __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16; + __m256i src0_8x16, src1_8x16, src_16x16; + __m256i dst0_8x16, dst1_8x16, dst_16x16; + __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64; + __m256i sub_result; + const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + for (int i = 0; i < h; i += 4) { + reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride])); + reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride])); + reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride])); + reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride])); + dst0_8x16 = + _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16)); + dst1_8x16 = + _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16)); + dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20); + + reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride])); + reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 
1) * sstride])); + reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride])); + reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride])); + src0_8x16 = + _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16)); + src1_8x16 = + _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16)); + src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20); + + sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16)); + + src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros); + dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros); + + src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16); + dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16); + + res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros); + res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros); + res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros); + res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros); + + square_result = _mm256_add_epi64( + square_result, + _mm256_add_epi64( + _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64), + res3_4x64)); + } + const __m128i sum_2x64 = + _mm_add_epi64(_mm256_castsi256_si128(square_result), + _mm256_extracti128_si256(square_result, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { + uint64_t sum = 0; + __m256i src0_8x16, src1_8x16, src_16x16; + __m256i dst0_8x16, dst1_8x16, dst_16x16; + __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64; + __m256i sub_result; + const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + + for (int i = 0; i < h; i += 2) { + dst0_8x16 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride])); + dst1_8x16 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride])); + dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20); + + src0_8x16 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride])); + src1_8x16 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride])); + src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20); + + sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16)); + + src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros); + dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros); + + src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16); + dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16); + + res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros); + res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros); + res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros); + res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros); + + square_result = _mm256_add_epi64( + square_result, + _mm256_add_epi64( + _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64), + res3_4x64)); + } + + const __m128i sum_2x64 = + _mm_add_epi64(_mm256_castsi256_si128(square_result), + _mm256_extracti128_si256(square_result, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int w, + int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4) && + "w=8/4 and h=8/4 must satisfy"); + switch (w) { + case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, 
dstride, src, sstride, h); + case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); + default: assert(0 && "unsupported width"); return -1; + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm new file mode 100644 index 0000000000..ec6c7e9fa7 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm @@ -0,0 +1,318 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +SECTION .text + +;unsigned int aom_highbd_calc16x16var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +globalsym(aom_highbd_calc16x16var_sse2) +sym(aom_highbd_calc16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+16] + prefetcht0 [rsi+rax] + prefetcht0 [rsi+rax+16] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + + prefetcht0 [rdi] + prefetcht0 [rdi+16] + prefetcht0 [rdi+rdx] + prefetcht0 [rdi+rdx+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +.var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 2 + jnz .var16loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + 
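+        ; fold xmm7 (the signed sum of differences) the same way xmm6 (sse)
+        ; was just folded: interleave with zero, add the high and low halves,
+        ; then collapse both accumulators to their low dword for the movd
+        ; stores into [SSE] and [Sum] below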
punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int aom_highbd_calc8x8var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +globalsym(aom_highbd_calc8x8var_sse2) +sym(aom_highbd_calc8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+rax] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + + prefetcht0 [rdi] + prefetcht0 [rdi+rdx] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 8 + +.var8loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rbx+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rdi+rdx*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + lea rbx, [rbx+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 4 + jnz .var8loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 0000000000..e897aab645 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,735 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" + +#include "av1/common/filter.h" +#include "av1/common/reconinter.h" + +typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); +} + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } \ + \ + uint32_t aom_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int 
ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(128, 128, 16, 14) +VAR_FN(128, 64, 16, 13) +VAR_FN(64, 128, 16, 13) +VAR_FN(64, 64, 16, 12) +VAR_FN(64, 32, 16, 11) +VAR_FN(32, 64, 16, 11) +VAR_FN(32, 32, 16, 10) +VAR_FN(32, 16, 16, 9) +VAR_FN(16, 32, 16, 9) +VAR_FN(16, 16, 16, 8) +VAR_FN(16, 8, 8, 7) +VAR_FN(8, 16, 8, 7) +VAR_FN(8, 8, 8, 6) +VAR_FN(8, 32, 8, 8) +VAR_FN(32, 8, 8, 8) +VAR_FN(16, 64, 16, 10) +VAR_FN(64, 16, 16, 10) + +#undef VAR_FN + +unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + aom_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + aom_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, 
sse, &sum, + aom_highbd_calc8x8var_sse2, 8); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in +// highbd_subpel_variance_impl_sse2.asm +#define DECL(w, opt) \ + int aom_highbd_sub_pixel_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + unsigned int *sse, void *unused0, void *unused); +#define DECLS(opt) \ + DECL(8, opt) \ + DECL(16, opt) + +DECLS(sse2) + +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = 0; \ + unsigned int sse = 0; \ + unsigned int sse2; \ + int row_rep = (w > 64) ? 2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \ + NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int64_t var; \ + uint32_t sse; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = 0; \ + int row_rep = (w > 64) ? 2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + se += se2; \ + long_sse += sse; \ + if (w > wf) { \ + uint32_t sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + int64_t var; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int row_rep = (w > 64) ? 2 : 1; \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + uint16_t *src_tmp = src + (start_row * src_stride); \ + uint16_t *dst_tmp = dst + (start_row * dst_stride); \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src_tmp += wd_64 * 64; \ + dst_tmp += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf, \ + dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 2 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 3 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int64_t)) \ + FN(8, 16, 8, 3, 4, opt, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) \ + FN(16, 4, 16, 4, 2, opt, (int64_t)) \ + FN(8, 32, 8, 3, 5, opt, (int64_t)) \ + FN(32, 8, 16, 5, 3, opt, (int64_t)) \ + FN(16, 64, 16, 4, 6, opt, (int64_t)) \ + FN(64, 16, 16, 6, 4, opt, (int64_t)) + +FNS(sse2) + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. 
+#define DECL(w, opt) \ + int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused); +#define DECLS(opt) \ + DECL(16, opt) \ + DECL(8, opt) + +DECLS(sse2) +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int64_t var; \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + int64_t var; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + wf + (start_row * dst_stride), dst_stride, \ + sec + wf + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 2 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride, \ + sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 3 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride, \ + sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int64_t)) \ + FN(8, 16, 8, 3, 4, opt, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) \ + FN(16, 4, 16, 4, 2, opt, (int64_t)) \ + FN(8, 32, 8, 3, 5, opt, (int64_t)) \ + FN(32, 8, 16, 5, 3, opt, (int64_t)) \ + FN(16, 64, 16, 4, 6, opt, (int64_t)) \ + FN(64, 16, 16, 6, 4, opt, (int64_t)) + +FNS(sse2) + +#undef FNS +#undef FN + +static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { + assert(DIST_PRECISION_BITS <= 4); + __m128i mult0 = _mm_mullo_epi16(*p0, *w0); + __m128i mult1 = _mm_mullo_epi16(*p1, *w1); + __m128i sum = _mm_adds_epu16(mult0, mult1); + __m128i round = _mm_adds_epu16(sum, *r); + __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS); + + xx_storeu_128(result, shift); +} + +void aom_highbd_dist_wtd_comp_avg_pred_sse2( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i; + const int16_t wt0 = (int16_t)jcp_param->fwd_offset; + const int16_t wt1 = (int16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set1_epi16(wt0); + const __m128i w1 = _mm_set1_epi16(wt1); + const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = _mm_set1_epi16(round); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + if (width >= 8) { + // Read 8 pixels one row at a time + assert(!(width & 7)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 8) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + // Read 4 pixels two rows at a time + assert(!(width & 3)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 2 * ref_stride; + } + } +} + +uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { + uint64_t sum = 0; + __m128i reg0_4x16, reg1_4x16; + __m128i src_8x16; + __m128i dst_8x16; + __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64; + __m128i sub_result_8x16; + const __m128i zeros = _mm_setzero_si128(); + __m128i square_result = _mm_setzero_si128(); + for (int i = 0; i < h; i += 2) { + reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride])); + reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride])); + dst_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16); + + reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride])); + reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride])); + src_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16); + + sub_result_8x16 = 
_mm_sub_epi16(src_8x16, dst_8x16); + + res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros); + res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros); + + res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32); + res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32); + + res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros); + res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros); + res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros); + res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros); + + square_result = _mm_add_epi64( + square_result, + _mm_add_epi64( + _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64), + res3_4x64)); + } + + const __m128i sum_1x64 = + _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { + uint64_t sum = 0; + __m128i src_8x16; + __m128i dst_8x16; + __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64; + __m128i sub_result_8x16; + const __m128i zeros = _mm_setzero_si128(); + __m128i square_result = _mm_setzero_si128(); + + for (int i = 0; i < h; i++) { + dst_8x16 = _mm_loadu_si128((__m128i *)&dst[i * dstride]); + src_8x16 = _mm_loadu_si128((__m128i *)&src[i * sstride]); + + sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16); + + res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros); + res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros); + + res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32); + res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32); + + res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros); + res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros); + res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros); + res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros); + + square_result = _mm_add_epi64( + square_result, + _mm_add_epi64( + _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64), + res3_4x64)); + } + + const __m128i sum_1x64 = + _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int w, + int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4) && + "w=8/4 and h=8/4 must satisfy"); + switch (w) { + case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + default: assert(0 && "unsupported width"); return -1; + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c new file mode 100644 index 0000000000..df5449a9df --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include /* SSE4.1 */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/variance.h" +#include "aom_dsp/aom_filter.h" + +static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + uint64_t *sse, int64_t *sum) { + __m128i u0, u1, u2, u3; + __m128i s0, s1, s2, s3; + __m128i t0, t1, x0, y0; + __m128i a0, a1, a2, a3; + __m128i b0, b1, b2, b3; + __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride)); + a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride)); + a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride)); + a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride)); + + b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride)); + b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride)); + b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride)); + b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride)); + + u0 = _mm_unpacklo_epi16(a0, a1); + u1 = _mm_unpacklo_epi16(a2, a3); + u2 = _mm_unpacklo_epi16(b0, b1); + u3 = _mm_unpacklo_epi16(b2, b3); + + s0 = _mm_sub_epi16(u0, u2); + s1 = _mm_sub_epi16(u1, u3); + + t0 = _mm_madd_epi16(s0, k_one_epi16); + t1 = _mm_madd_epi16(s1, k_one_epi16); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + y0 = _mm_hadd_epi32(s3, s3); + + t0 = _mm_madd_epi16(s0, s0); + t1 = _mm_madd_epi16(s1, s1); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + x0 = _mm_hadd_epi32(s3, s3); + + *sse = (uint64_t)_mm_extract_epi32(x0, 0); + *sum = (int64_t)_mm_extract_epi32(y0, 0); +} + +uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)local_sse; + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); + sum = ROUND_POWER_OF_TWO(sum, 2); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); + sum = ROUND_POWER_OF_TWO(sum, 4); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return diff >= 0 ? 
(uint32_t)diff : 0; +} + +// Sub-pixel +uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +// Sub-pixel average + +uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + 
aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm new file mode 100644 index 0000000000..0eb632326b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm @@ -0,0 +1,608 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pb_1: times 16 db 1 +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +dc_128: times 16 db 128 +pw2_4: times 8 dw 2 +pw2_8: times 8 dw 4 +pw2_16: times 8 dw 8 +pw2_32: times 8 dw 16 + +SECTION .text + +INIT_XMM sse2 +cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + movd m2, [leftq] + movd m0, [aboveq] + pxor m1, m1 + punpckldq m0, m2 + psadbw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [GLOBAL(pw_8)] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw 
m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movd m0, [GLOBAL(dc_128)] + movd [dstq ], m0 + movd [dstq+strideq ], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq m0, [GLOBAL(dc_128)] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_16)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + + +INIT_XMM sse2 +cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_16x16, 4, 5, 3, 
dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_32)] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + mova m2, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, 
stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left + movifnidn leftq, leftmp + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 + pshufd m1, m0, 0x1 + movd [dstq ], m0 + movd [dstq+strideq], m1 + pshufd m2, m0, 0x2 + lea dstq, [dstq+strideq*2] + pshufd m3, m0, 0x3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -2 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] + movq m0, [leftq ] + punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8 +.loop: + pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 + pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 + movq [dstq ], m1 + movq [dstq+strideq], m2 + pshuflw m1, m0, 0xaa + pshuflw m2, m0, 0xff + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 + inc lineq + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -4 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+strideq ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -8 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+16 ], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16 ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2 ], m1 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c 
b/third_party/aom/aom_dsp/x86/intrapred_avx2.c new file mode 100644 index 0000000000..242a548df9 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c @@ -0,0 +1,4707 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" +#include "aom_dsp/x86/intrapred_x86.h" +#include "aom_dsp/x86/intrapred_utils.h" +#include "aom_dsp/x86/lpf_common_sse2.h" + +static INLINE __m256i dc_sum_64(const uint8_t *ref) { + const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i zero = _mm256_setzero_si256(); + __m256i y0 = _mm256_sad_epu8(x0, zero); + __m256i y1 = _mm256_sad_epu8(x1, zero); + y0 = _mm256_add_epi64(y0, y1); + __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1); + y0 = _mm256_add_epi64(u0, y0); + u0 = _mm256_unpackhi_epi64(y0, y0); + return _mm256_add_epi16(y0, u0); +} + +static INLINE __m256i dc_sum_32(const uint8_t *ref) { + const __m256i x = _mm256_loadu_si256((const __m256i *)ref); + const __m256i zero = _mm256_setzero_si256(); + __m256i y = _mm256_sad_epu8(x, zero); + __m256i u = _mm256_permute2x128_si256(y, y, 1); + y = _mm256_add_epi64(u, y); + u = _mm256_unpackhi_epi64(y, y); + return _mm256_add_epi16(y, u); +} + +static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + dst += stride; + } +} + +static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1, + int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r0); + _mm256_storeu_si256((__m256i *)(dst + 32), *r1); + dst += stride; + } +} + +static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + _mm256_storeu_si256((__m256i *)(dst + 32), *r); + dst += stride; + } +} + +static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }, + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 } +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, + 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, + { 0, 1, 2, 3, 6, 7, 10, 
11, 14, 15, 18, 19, 22, 23, 26, 27, + 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, + 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, + 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, + 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 } +}; + +static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, + 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, + 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, + 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff } +}; + +static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + + r0 = _mm_unpacklo_epi16(x[0], x[1]); + r1 = _mm_unpacklo_epi16(x[2], x[3]); + r2 = _mm_unpacklo_epi16(x[4], x[5]); + r3 = _mm_unpacklo_epi16(x[6], x[7]); + + r4 = _mm_unpacklo_epi16(x[8], x[9]); + r5 = _mm_unpacklo_epi16(x[10], x[11]); + r6 = _mm_unpacklo_epi16(x[12], x[13]); + r7 = _mm_unpacklo_epi16(x[14], x[15]); + + r8 = _mm_unpacklo_epi32(r0, r1); + r9 = _mm_unpackhi_epi32(r0, r1); + r10 = _mm_unpacklo_epi32(r2, r3); + r11 = _mm_unpackhi_epi32(r2, r3); + + r12 = _mm_unpacklo_epi32(r4, r5); + r13 = _mm_unpackhi_epi32(r4, r5); + r14 = _mm_unpacklo_epi32(r6, r7); + r15 = _mm_unpackhi_epi32(r6, r7); + + r0 = _mm_unpacklo_epi64(r8, r9); + r1 = _mm_unpackhi_epi64(r8, r9); + r2 = _mm_unpacklo_epi64(r10, r11); + r3 = _mm_unpackhi_epi64(r10, r11); + + r4 = _mm_unpacklo_epi64(r12, r13); + r5 = _mm_unpackhi_epi64(r12, r13); + r6 = _mm_unpacklo_epi64(r14, r15); + r7 = _mm_unpackhi_epi64(r14, r15); 
+ + d[0] = _mm_unpacklo_epi64(r0, r2); + d[1] = _mm_unpacklo_epi64(r4, r6); + d[2] = _mm_unpacklo_epi64(r1, r3); + d[3] = _mm_unpacklo_epi64(r5, r7); + + d[4] = _mm_unpackhi_epi64(r0, r2); + d[5] = _mm_unpackhi_epi64(r4, r6); + d[6] = _mm_unpackhi_epi64(r1, r3); + d[7] = _mm_unpackhi_epi64(r5, r7); +} + +static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17 + w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37 + w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57 + w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + __m256i dd[16]; + w0 = _mm256_unpacklo_epi16(x[0], x[1]); + w1 = _mm256_unpacklo_epi16(x[2], x[3]); + w2 = _mm256_unpacklo_epi16(x[4], x[5]); + w3 = _mm256_unpacklo_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[0] = _mm256_unpacklo_epi64(ww0, ww1); + dd[1] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = 
_mm256_unpackhi_epi32(w2, w3); // + + dd[2] = _mm256_unpacklo_epi64(ww0, ww1); + dd[3] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); + w1 = _mm256_unpackhi_epi16(x[2], x[3]); + w2 = _mm256_unpackhi_epi16(x[4], x[5]); + w3 = _mm256_unpackhi_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[4] = _mm256_unpacklo_epi64(ww0, ww1); + dd[5] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = _mm256_unpackhi_epi32(w2, w3); // + + dd[6] = _mm256_unpacklo_epi64(ww0, ww1); + dd[7] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpacklo_epi16(x[8], x[9]); + w1 = _mm256_unpacklo_epi16(x[10], x[11]); + w2 = _mm256_unpacklo_epi16(x[12], x[13]); + w3 = _mm256_unpacklo_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[8] = _mm256_unpacklo_epi64(ww0, ww1); + dd[9] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[10] = _mm256_unpacklo_epi64(ww0, ww1); + dd[11] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[8], x[9]); + w1 = _mm256_unpackhi_epi16(x[10], x[11]); + w2 = _mm256_unpackhi_epi16(x[12], x[13]); + w3 = _mm256_unpackhi_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[12] = _mm256_unpacklo_epi64(ww0, ww1); + dd[13] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[14] = _mm256_unpacklo_epi64(ww0, ww1); + dd[15] = _mm256_unpackhi_epi64(ww0, ww1); + + for (int i = 0; i < 8; i++) { + d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1); + d[i + 8] = _mm256_insertf128_si256(dd[i + 8], + _mm256_extracti128_si256(dd[i], 1), 0); + } +} + +void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum_left = _mm256_add_epi16(sum_left, thirtytwo); + sum_left = _mm256_srai_epi16(sum_left, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum_left, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((int8_t)0x80); + row_store_32xh(&row, 32, dst, stride); +} + +void 
aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 32, dst, stride); +} + +// There are 32 rows in total. This function does lines +// 0,1,2,3 and 16,17,18,19. The next call does lines +// 4,5,6,7 and 20,21,22,23, so four calls +// cover all 32 rows. +static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst, + ptrdiff_t stride) { + __m256i t[4]; + __m256i m = _mm256_setzero_si256(); + const __m256i inc = _mm256_set1_epi8(4); + int i; + + for (i = 0; i < 4; i++) { + t[i] = _mm256_shuffle_epi8(*row, m); + __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); + __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); + _mm256_storeu_si256((__m256i *)dst, r0); + _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); + dst += stride; + m = _mm256_add_epi8(m, inc); + } +} + +void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); + + __m256i u = _mm256_unpacklo_epi8(left_col, left_col); + + __m256i v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + u = _mm256_unpackhi_epi8(left_col, left_col); + + v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); +} + +// ----------------------------------------------------------------------------- +// Rectangle +void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i top_sum = dc_sum_32_sse2(above); + __m128i left_sum = dc_sum_16_sse2(left); + left_sum = _mm_add_epi16(top_sum, left_sum); + uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum); + sum += 24; + sum /= 48; + const __m256i row = _mm256_set1_epi8((int8_t)sum); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((int8_t)sum); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 64; + sum /= 128; + const __m256i row = _mm256_set1_epi8((int8_t)sum); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((int8_t)sum); + row_store_64xh(&row, 32, dst, stride); +} + 
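+// A minimal scalar sketch (not the upstream implementation) of the rounding
+// used by the rectangular DC predictors above and below: the sum of the
+// w + h border pixels is rounded to nearest before dividing, which is where
+// the constants 24/48 (32x16), 48/96 (32x64, 64x32), 64/128 (64x64) and
+// 40/80 (64x16) come from. With a hypothetical helper name:
+//
+//   static int dc_average(const uint8_t *above, const uint8_t *left, int w,
+//                         int h) {
+//     int sum = 0;
+//     for (int i = 0; i < w; ++i) sum += above[i];
+//     for (int i = 0; i < h; ++i) sum += left[i];
+//     return (sum + ((w + h) >> 1)) / (w + h);  // round to nearest
+//   }
+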
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 40; + sum /= 80; + const __m256i row = _mm256_set1_epi8((int8_t)sum); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, 
zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((int8_t)0x80); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((int8_t)0x80); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((int8_t)0x80); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((int8_t)0x80); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((int8_t)0x80); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 64, dst, stride); +} + +void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 64, dst, stride); +} + +void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t 
*above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 32, dst, stride); +} + +void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// PAETH_PRED + +// Return 16 16-bit pixels in one row (__m256i) +static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i base = + _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); + + __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); + __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); + __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); + + __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); + mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); + __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); + + pl = _mm256_andnot_si256(mask1, *left); + + ptl = _mm256_and_si256(mask2, *topleft); + pt = _mm256_andnot_si256(mask2, *top); + pt = _mm256_or_si256(pt, ptl); + pt = _mm256_and_si256(mask1, pt); + + return _mm256_or_si256(pt, pl); +} + +// Return 16 8-bit pixels in one row (__m128i) +static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i p0 = paeth_pred(left, top, topleft); + const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i p = _mm256_packus_epi16(p0, p1); + return _mm256_castsi256_si128(p); +} + +static INLINE __m256i get_top_vector(const uint8_t *above) { + const __m128i x = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t0 = _mm_unpacklo_epi8(x, zero); + const __m128i t1 = _mm_unpackhi_epi8(x, zero); + return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); +} + +void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i x = _mm_loadl_epi64((const __m128i *)left); + const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 8; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +static INLINE __m256i get_left_vector(const uint8_t *left) { + const __m128i x = _mm_load_si128((const __m128i *)left); + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); +} + +void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = 
paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + for (int j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (int i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +// Return 32 8-bit pixels in one row (__m256i) +static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, + const __m256i *top1, + const __m256i *topleft) { + __m256i p0 = paeth_pred(left, top0, topleft); + __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x0 = _mm256_packus_epi16(p0, p1); + + p0 = paeth_pred(left, top1, topleft); + p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x1 = _mm256_packus_epi16(p0, p1); + + return _mm256_permute2x128_si256(x0, x1, 0x20); +} + +void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); + + _mm256_storeu_si256((__m256i *)dst, r); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + 
_mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 2; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void 
aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i; + const __m256i l = get_left_vector(left); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6) +#define PERM2x128(c0, c1) c0 + (c1 << 4) + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + __m128i a0_128, a1_128; + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), + _mm256_set1_epi16(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, + base + 5, base + 6, base + 7); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_128); + a1 = _mm256_castsi128_si256(a1_128); + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res1 = _mm256_castsi256_si128(res); + + mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128); + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int 
dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_castsi256_si128(res); + res1 = _mm_packus_epi32(res1, res1); + + mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128); + mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[16]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a0_1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0 = 
_mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a0_1 = _mm256_permutevar8x32_epi32( + a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1)); + + a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1); + a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1); + base_inc256 = + _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256); + mask256 = _mm256_packs_epi32( + mask256, _mm256_castsi128_si256( + _mm256_extracti128_si256(mask256, 1))); // goto 16 bit + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0_x128 = _mm_loadu_si128((__m128i *)(above + base)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8)); + atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp2 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + atmp3 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + mask = + _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i 
*)(HighbdEvenOddMaskx[0] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + + base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6, + base + 8, base + 10, base + 12, base + 14, + 0, 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, 0, + 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_x128); + a1 = _mm256_castsi128_si256(a1_x128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + + int mdif = max_base_x - base; + if (mdif > 8) { + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a1_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9))); + + diff = 
_mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + a0 = _mm256_loadu_si256((__m256i *)(above + base)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256); + x += dx; + } +} + +static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[64]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store 
in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res1 = a_mbase_x; + } else { + a0 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + j))); + a1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 1 + j))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 8 + j))); + a1_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 9 + j))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + } + if (!j) { + dstvec[r] = res1; + } else { + dstvec[r + N] = res1; + } + } + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> 
frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res = a_mbase_x; + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res = _mm256_blendv_epi8(a_mbase_x, res, mask256); + } + if (!j) { + dstvec[r] = res; + } else { + dstvec[r + N] = res; + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[128]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]); + } +} + +static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + __m128i a0_128, a0_1_128, a1_128, a1_1_128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = _mm256_cvtepu16_epi32(a0_128); + a1 = 
_mm256_cvtepu16_epi32(a1_128); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j)); + a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j)); + a0_1 = _mm256_cvtepu16_epi32(a0_1_128); + a1_1 = _mm256_cvtepu16_epi32(a1_1_128); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + _mm256_storeu_si256((__m256i *)(dst + j), res1); + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, 
base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res = _mm256_blendv_epi8(a_mbase_x, res, mask256); + _mm256_storeu_si256((__m256i *)(dst + j), res); // 16 16bit values + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + (void)left; + (void)dy; + + switch (bw) { + case 4: + highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 8: + highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 16: + highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 32: + highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 64: + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, + upsample_above, dx); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above, + upsample_above, dx); + } + break; + default: break; + } + return; +} + +static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc, + uint16_t *dst, ptrdiff_t pitchDst) { + __m256i r[16]; + __m256i d[16]; + for (int j = 0; j < 16; j++) { + r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc)); + } + highbd_transpose16x16_avx2(r, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]); + } +} + +static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc, + uint16_t *dst, ptrdiff_t pitchDst, int width, + int height) { + for (int j = 0; j < height; j += 16) + for (int i = 0; i < width; i += 16) + highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); +} + +static void highbd_dr_prediction_32bit_z2_Nx4_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16; + __m256i diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi32(16); + c3f = _mm_set1_epi32(0x3f); + min_base_y128 = _mm_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = 
_mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128( + _mm_slli_epi32( + _mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int, base_y_c[4]); + r6 = _mm_set1_epi32(r << 6); + dy128 = _mm_set1_epi32(dy); + c1234 = _mm_setr_epi32(1, 2, 3, 4); + y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128)); + base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]]); + a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi32( + _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(res); + resx = _mm_packus_epi32(resx, resx); + + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi32(resy, resy); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx4_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16; + __m256i diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + 
base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0, + 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, 
min_base_y256; + __m256i diff; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi32(16); + c3f = _mm256_set1_epi32(0x3f); + min_base_y256 = _mm256_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, (4 << 6) - y * dx, + (5 << 6) - y * dx, (6 << 6) - y * dx, + (7 << 6) - y * dx), + c3f), + 1); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } + // y calc + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int, base_y_c[8]); + __m256i r6, c256, dy256, y_c256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + dy256 = _mm256_set1_epi32(dy); + c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y 
= _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + if (upsample_left) { + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f), + 1); + } else { + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + } + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = resx; + } + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i c3f, min_base_y128; + __m256i a0_x, a1_x, diff, a32, a16; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else 
{ + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], + left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1; + __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + DECLARE_ALIGNED(32, int, base_y_c[16]); + + a16 = _mm256_set1_epi32(16); + c1 = _mm256_srli_epi32(a16, 4); + c8 = _mm256_srli_epi32(a16, 1); + min_base_y256 = _mm256_set1_epi32(min_base_y); + c3f = _mm256_set1_epi32(0x3f); + dy256 = _mm256_set1_epi32(dy); + c0123 = _mm256_setr_epi32(0, 1, 
2, 3, 4, 5, 6, 7); + c1234 = _mm256_add_epi32(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift, ydx; + __m256i resx[2], resy[2]; + __m256i resxy, j256, r6; + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi32(j); + int y = r + 1; + ydx = _mm256_set1_epi32(y * dx); + + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx[0] = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + } + int base_shift8 = 0; + if ((base_x + 8) < (min_base_x - 1)) { + base_shift8 = (min_base_x - (base_x + 8) - 1); + } + if (base_shift8 > 7) { + resx[1] = _mm256_setzero_si256(); + } else { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + + a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128); + a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128); + + r6 = _mm256_slli_epi32( + _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + resx[1] = _mm256_add_epi32(a32, b); + resx[1] = _mm256_srli_epi32(resx[1], 5); + resx[1] = _mm256_packus_epi32( + resx[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1))); + } + resx[0] = + _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]), + 1); // 16 16bit values + + // y calc + resy[0] = _mm256_setzero_si256(); + if ((base_x < min_base_x)) { + __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + c256 = _mm256_add_epi32(j256, c1234); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + c256 = _mm256_add_epi32(c256, c8); + y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c_1_256, 
frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], + left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], + left[base_y_c[14]], left[base_y_c[15]])); + a1_y = _mm256_cvtepu16_epi32( + _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1], + left[base_y_c[10] + 1], left[base_y_c[11] + 1], + left[base_y_c[12] + 1], left[base_y_c[13] + 1], + left[base_y_c[14] + 1], left[base_y_c[15] + 1])); + shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[1] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + resy[0] = + _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]), + 1); // 16 16bit values + } + + resxy = _mm256_blendv_epi8(resx[0], resy[0], + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +static void highbd_dr_prediction_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16, c3f, c1; + __m256i diff, min_base_y256, dy256, c1234, c0123; + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift; + __m256i resx, resy, ydx; + __m256i resxy, j256, r6; + __m128i a0_x128, a1_x128, a0_1_x128, 
a1_1_x128; + int y = r + 1; + ydx = _mm256_set1_epi16((short)(y * dx)); + + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi16(j); + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x)-1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 8) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } else { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + } + + int base_shift1 = 0; + if (base_shift > 8) { + base_shift1 = base_shift - 8; + } + if (base_shift1 < 8) { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + + a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1); + } + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + resx = _mm256_srli_epi16(res, 5); // 16 16-bit values + + // y calc + resy = _mm256_setzero_si256(); + __m256i a0_y, a1_y, shifty; + if ((base_x < min_base_x)) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = 
_mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + resy = _mm256_srli_epi16(res, 5); + } + + resxy = _mm256_blendv_epi8(resx, resy, + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + if (bd < 12) { + highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + case 8: + if (bd < 12) { + highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + default: + if (bd < 12) { + highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + } +} + +// Directional prediction, zone 3 functions +static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], + &dstvec[3], &d[0], &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); + return; +} + +static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + + highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + } +} + 
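The zone-3 kernels above and below all follow one strategy: run the zone-1 interpolation along the `left` edge (the per-pixel formula repeated in the comments, `(p[base] * 32 + 16 + (p[base+1] - p[base]) * shift) >> 5`), then transpose the intermediate block into `dst` with the `highbd_transpose*` helpers. The following is only a minimal scalar sketch of that idea, not code from the patch: it assumes `upsample_left == 0`, omits the clamping against `max_base_y` that the real kernels perform, and every name in it is illustrative.

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the zone-3 pattern (assumptions: upsample_left == 0 and
 * `left` is long enough that no max_base_y clamping is needed). Each output
 * column c is an interpolation walk along `left`; writing the result at the
 * transposed position dst[r * stride + c] is what the AVX2 kernels achieve
 * with an explicit 8x8 / 16x16 transpose after a zone-1 style pass. */
static void z3_scalar_sketch(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
                             const uint16_t *left, int dy) {
  for (int c = 0; c < bw; ++c) {
    const int y = (c + 1) * dy;        /* 6-bit fixed-point offset on `left` */
    const int shift = (y & 0x3f) >> 1; /* 5-bit interpolation weight */
    int base = y >> 6;                 /* integer sample index */
    for (int r = 0; r < bh; ++r, ++base) {
      /* (left[base] * 32 + 16 + (left[base+1] - left[base]) * shift) >> 5 */
      dst[r * stride + c] =
          (uint16_t)((left[base] * 32 + 16 +
                      (left[base + 1] - left[base]) * shift) >> 5);
    }
  }
}

In the SIMD versions this same arithmetic is done on 4, 8 or 16 lanes at a time into `dstvec[]`, and the transpose step is deferred to `highbd_transpose4x8_8x4_sse2`, `highbd_transpose8x8_sse2`, `highbd_transpose16x16_avx2`, etc., before the stores.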
+static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3]); + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]); +} + +static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x16_16x8_avx2(dstvec, d); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_extracti128_si256(d[i - 8], 1)); + } +} + +static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + } +} + +static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[4], d[4], d1; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x16_avx2(dstvec, d); + for (int i = 0; i < 4; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + d1 = _mm256_bsrli_epi128(d[i], 8); + _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride), + _mm256_castsi256_si128(d1)); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm256_extracti128_si256(d[i], 1)); + _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride), + _mm256_extracti128_si256(d1, 1)); + } +} + +static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + 
highbd_transpose16x4_8x8_sse2(dstvec, d); + + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]); +} + +static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_extracti128_si256(d[i], 1)); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]); + } +} + +static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + + highbd_transpose16x16_avx2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + highbd_transpose16x16_avx2(dstvec, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]); + } + 
highbd_transpose16x16_avx2(dstvec + 16, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 32, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 48, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]); + } +} + +static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 64, 64); +} + +static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + // store + for (int j = 0; j < 32; j += 16) { + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride), + _mm256_castsi256_si128(d[(i + j)])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8), + _mm256_castsi256_si128(d[(i + j) + 8])); + } + for (int i = 8; i < 16; i++) { + _mm256_storeu_si256( + (__m256i *)(dst + (i + j) * stride), + _mm256_inserti128_si256( + d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0)); + } + } +} + +static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + uint16_t dstT[64 * 32]; + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 32, 64); +} + +static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]); + highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd); + highbd_transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + } else { + 
highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 16, 64); +} + +static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 64; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + (void)above; + (void)dx; + + assert(dx == 1); + assert(dy > 0); + if (bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 8: + highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 16: + highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 32: + highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 64: + highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bh) { + case 4: + highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } + } + } + return; +} + +// Low bit depth functions +static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 
0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 
0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, +}; + +/* clang-format on */ +static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2( + int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, + int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((W + H) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < W; r++) { + __m256i b, res, shift; + __m128i res1, a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if (base_max_diff <= 0) { + for (int i = r; i < W; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + if (base_max_diff > H) base_max_diff = H; + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // goto 8 bit + res1 = _mm256_castsi256_si128(res); // 16 8bit values + + dst[r] = + _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]); + x += dx; + } +} + +static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[16]; + + dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, 
dx); + for (int i = 0; i < N; i++) { + *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); + } +} + +static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[32]; + + dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[64]; + + dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res16[2]; + __m128i a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base); + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + } + return; + } + if (base_max_diff > 32) base_max_diff = 32; + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0, jj = 0; j < 32; j += 16, jj++) { + int mdiff = base_max_diff - j; + if (mdiff <= 0) { + res16[jj] = a_mbase_x; + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1)); + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res16[jj] = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + } + } + res16[1] = + _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]), + 1); // 32 8bit values + + dstvec[r] = _mm256_blendv_epi8( + a_mbase_x, res16[1], + *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values + x += dx; + } +} + +static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m256i dstvec[64]; + dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // 
pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + __m128i max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]); + max_base_x128 = _mm_set1_epi8(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + __m128i a0_128, a1_128, res128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm_storeu_si128((__m128i *)(dst + j), + _mm256_castsi256_si128(a_mbase_x)); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + + base_inc128 = + _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1), + (int8_t)(base + j + 2), (int8_t)(base + j + 3), + (int8_t)(base + j + 4), (int8_t)(base + j + 5), + (int8_t)(base + j + 6), (int8_t)(base + j + 7), + (int8_t)(base + j + 8), (int8_t)(base + j + 9), + (int8_t)(base + j + 10), (int8_t)(base + j + 11), + (int8_t)(base + j + 12), (int8_t)(base + j + 13), + (int8_t)(base + j + 14), (int8_t)(base + j + 15)); + + mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128), + _mm_setzero_si128()); + res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), + _mm256_castsi256_si128(res), mask128); + _mm_storeu_si128((__m128i *)(dst + j), res128); + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + (void)left; + (void)dy; + switch (bw) { + case 4: + dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 8: + dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 16: + dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 32: + dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 64: + dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + default: break; + } + return; +} + +static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + 
assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i a0_x, a1_x, a32, a16, diff; + __m128i c3f, min_base_y128, c1234, dy128; + + a16 = _mm_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0); + dy128 = _mm_set1_epi16(dy); + + for (int r = 0; r < N; r++) { + __m128i b, res, shift, r6, ydx; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm_setzero_si128(); + a1_x = _mm_setzero_si128(); + shift = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(c1234, 6); + + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1); + } else { + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 1); + + shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); + } + a0_x = _mm_cvtepu8_epi16(a0_x128); + a1_x = _mm_cvtepu8_epi16(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i y_c128, base_y_c128, mask128, c1234_; + c1234_ = _mm_srli_si128(c1234, 2); + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm_unpacklo_epi64(a0_x, a0_y); + a1_x = _mm_unpacklo_epi64(a1_x, a1_y); + shift = _mm_unpacklo_epi64(shift, shifty); + } + + diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + + resx = _mm_packus_epi16(res, res); + resy = _mm_srli_si128(resx, 4); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + *(int *)(dst) = _mm_cvtsi128_si32(resxy); + dst += stride; + } +} + +static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t 
*above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i diff, a32, a16; + __m256i a0_x, a1_x; + __m128i a0_x128, a1_x128, min_base_y128, c3f; + __m128i c1234, dy128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy, r6, ydx; + + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1)); + } else { + a1_x128 = _mm_srli_si128(a0_x128, 1); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256( + _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1)); + } + a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128)); + a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128)); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m128i y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + base_y_c128 = _mm_add_epi16( + base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + 
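// The above-edge operands stay in the low 128-bit lane and the left-edge
+ // operands are inserted into the high lane, so the single 16-bit
+ // interpolation below produces both candidates for the row in one pass;
+ // BaseMask[base_min_diff] later picks the left-edge result for the leading
+ // pixels where the above edge has run out. +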
shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm_packus_epi16(_mm256_castsi256_si128(res), + _mm256_castsi256_si128(res)); + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi16(resy, resy); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst, + ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123; + __m256i diff, min_base_y256, c3f, shifty, dy256, c1; + __m128i a0_x128, a1_x128; + + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift, j256, r6, ydx; + __m128i resx, resy; + __m128i resxy; + int y = r + 1; + ydx = _mm256_set1_epi16((int16_t)(y * dx)); + + int base_x = (-y * dx) >> frac_bits_x; + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi16(j); + int base_shift = 0; + if ((base_x + j) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j) - 1); + } + int base_min_diff = (min_base_x - base_x - j); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 16) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); + a1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu8_epi16(a0_x128); + a1_x = _mm256_cvtepu8_epi16(a1_x128); + + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resx = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resx = _mm_setzero_si128(); + } + + // y calc + if (base_x < min_base_x) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = 
_mm256_cmpgt_epi16(min_base_y256, base_y_c256); + + base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256); + int16_t min_y = (int16_t)_mm_extract_epi16( + _mm256_extracti128_si256(base_y_c256, 1), 7); + int16_t max_y = + (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0); + int16_t offset_diff = max_y - min_y; + + if (offset_diff < 16) { + __m256i min_y256 = _mm256_set1_epi16(min_y); + + __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256); + __m128i base_y_offset128 = + _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0), + _mm256_extracti128_si256(base_y_offset, 1)); + + __m128i a0_y128 = _mm_maskload_epi32( + (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]); + __m128i a1_y128 = + _mm_maskload_epi32((int *)(left + min_y + 1), + *(__m128i *)LoadMaskz2[offset_diff / 4]); + a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128); + a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128); + a0_y = _mm256_cvtepu8_epi16(a0_y128); + a1_y = _mm256_cvtepu8_epi16(a1_y128); + } else { + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + } + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resy = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = _mm_setzero_si128(); + } + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + case 8: + dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + default: + dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + break; + } + return; +} + +// z3 functions +static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m256i w10, w11, w12, w13, w14, w15; + + w0 = _mm256_unpacklo_epi8(x[0], 
x[1]); + w1 = _mm256_unpacklo_epi8(x[2], x[3]); + w2 = _mm256_unpacklo_epi8(x[4], x[5]); + w3 = _mm256_unpacklo_epi8(x[6], x[7]); + + w8 = _mm256_unpacklo_epi8(x[8], x[9]); + w9 = _mm256_unpacklo_epi8(x[10], x[11]); + w10 = _mm256_unpacklo_epi8(x[12], x[13]); + w11 = _mm256_unpacklo_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm256_unpacklo_epi64(w6, w14); + d[1] = _mm256_unpackhi_epi64(w6, w14); + d[2] = _mm256_unpacklo_epi64(w7, w15); + d[3] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm256_unpacklo_epi64(w6, w14); + d[5] = _mm256_unpackhi_epi64(w6, w14); + d[6] = _mm256_unpacklo_epi64(w7, w15); + d[7] = _mm256_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm256_unpackhi_epi8(x[0], x[1]); + w1 = _mm256_unpackhi_epi8(x[2], x[3]); + w2 = _mm256_unpackhi_epi8(x[4], x[5]); + w3 = _mm256_unpackhi_epi8(x[6], x[7]); + + w8 = _mm256_unpackhi_epi8(x[8], x[9]); + w9 = _mm256_unpackhi_epi8(x[10], x[11]); + w10 = _mm256_unpackhi_epi8(x[12], x[13]); + w11 = _mm256_unpackhi_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm256_unpacklo_epi64(w6, w14); + d[9] = _mm256_unpackhi_epi64(w6, w14); + d[10] = _mm256_unpacklo_epi64(w7, w15); + d[11] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm256_unpacklo_epi64(w6, w14); + d[13] = _mm256_unpackhi_epi64(w6, w14); + d[14] = _mm256_unpacklo_epi64(w7, w15); + d[15] = _mm256_unpackhi_epi64(w7, w15); +} + +static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3]); + + *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); + *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); + *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); + *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); + return; +} + +static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy); + 
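// Zone 3 reads only the left column, so it reuses the zone 1 kernel with the
+ // left column treated as the top row (predicting the transposed block) and
+ // then transposes the result into dst; roughly pred_z3(r, c) == pred_z1(c, r)
+ // evaluated over left instead of above. +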
transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], + &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], + &d[3]); + + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); + _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); + _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); + _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); +} + +static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[8]; + + dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], + &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + for (int i = 0; i < 8; i++) { + *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy); + transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], + &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); +} + +static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy); + transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, + dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, + d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm_srli_si128(d[i], 8)); + } +} + +static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy); + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy); + transpose4x16_sse2(dstvec, d); + for (int i = 0; i < 16; i++) { + *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[8]; + + dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, 
left, upsample_left, dy); + for (int i = 4; i < 8; i++) { + d[i] = _mm_setzero_si128(); + } + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 4; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy); + for (int i = 8; i < 16; i++) { + dstvec[i] = _mm256_setzero_si256(); + } + transpose16x32_avx2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy); + + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + transpose16x8_8x16_sse2( + &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], + &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], + &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], + &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], + &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], + &d[6 + 8], &d[7 + 8]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); + } +} + +static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy); + transpose16x16_sse2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[32], d[32]; + + dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + transpose16x32_avx2(dstvec + 16, d + 16); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + j * stride + 16), + _mm256_castsi256_si128(d[j + 16])); + } + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), + _mm256_extracti128_si256(d[j + 16], 1)); + } +} + +static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); + dr_prediction_z1_64xN_avx2(64, dstT, 64, left, 
upsample_left, dy); + transpose(dstT, 64, dst, stride, 64, 64); +} + +static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + // store + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + } +} + +static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy); + for (int i = 0; i < 32; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 32]; + dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 32, 64); +} + +static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[32 * 64]; + dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy); + transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 16]; + dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 16, 64); +} + +static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[64], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy); + for (int i = 0; i < 64; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + (void)above; + (void)dx; + assert(dx == 1); + assert(dy > 0); + + if (bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 64: + dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bw) { + case 4: + dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy); + 
break; + case 16: + dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bh) { + case 4: + dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } + } +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c new file mode 100644 index 0000000000..61e29731c4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c @@ -0,0 +1,1411 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include "aom_dsp/x86/intrapred_x86.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; i += 2) { + *(uint32_t *)dst = dc; + dst += stride; + *(uint32_t *)dst = dc; + dst += stride; + } +} + +static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_storel_epi64((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + dst += stride; + } +} + +static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + _mm_store_si128((__m128i *)(dst + 32), *row); + _mm_store_si128((__m128i *)(dst + 48), *row); + dst += stride; + } +} + +static INLINE __m128i dc_sum_4(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_unpacklo_epi8(x, zero); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_8(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_64(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + __m128i x2 =
_mm_load_si128((__m128i const *)(ref + 32)); + __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x2 = _mm_sad_epu8(x2, zero); + x3 = _mm_sad_epu8(x3, zero); + x0 = _mm_add_epi16(x0, x1); + x2 = _mm_add_epi16(x2, x3); + x0 = _mm_add_epi16(x0, x2); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier) { + const int interm = num >> shift1; + return interm * multiplier >> DC_SHIFT2; +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 6; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); + + const __m128i row = _mm_set1_epi8((int8_t)sum); + const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16_sse2(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + + const __m128i row = _mm_set1_epi8((int8_t)sum); + const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 6; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); + + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16_sse2(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 12; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32_sse2(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = 
_mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 12; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32_sse2(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 24; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_64(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_8(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_16_sse2(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 24; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 64; + sum /= 128; + const __m128i row = _mm_set1_epi8((int8_t)sum); + 
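// With w == h the mean of the 128 edge pixels is an exact power-of-two
+ // divide, (sum + 64) / 128. The rectangular variants above instead divide by
+ // (w + h) via divide_using_multiply_shift: shift by log2(min(w, h)), then
+ // multiply by roughly 2^16/3 (DC_MULTIPLIER_1X2) or 2^16/5
+ // (DC_MULTIPLIER_1X4) and shift right by 16. +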
dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_32_sse2(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_16_sse2(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((int8_t)sum); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16(2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16(2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16(4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16(4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16(4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, 
ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16(8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16(8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16(8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16(8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sixteen = _mm_set1_epi16(16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sixteen = _mm_set1_epi16(16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sixteen = _mm_set1_epi16(16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = 
_mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16(32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16(32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16(32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16(4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16(8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16(2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16(8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = 
_mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32_sse2(left); + const __m128i sixteen = _mm_set1_epi16(16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16(2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16(4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32_sse2(left); + const __m128i sixteen = _mm_set1_epi16(16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16(32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16(4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16(8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 
0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16(32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16(32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32_sse2(left); + const __m128i sixteen = _mm_set1_epi16(16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16(8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { 
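+ // DC_128 ignores both edges and fills the block with the mid-grey value 128,
+ // i.e. half the 8-bit range; the 4-wide variants above spell the same
+ // constant as the packed word 0x80808080.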
+ (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((int8_t)128); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x4_sse2(uint8_t 
*dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 64, dst, stride); +} + +static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 8); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 16); +} + +void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 64); +} + +static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); + const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + _mm_store_si128((__m128i *)(dst + 32), row2); + _mm_store_si128((__m128i *)(dst + 48), row3); + dst += stride; + } +} + +void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 64); +} + +void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 32); +} + +void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 16); +} + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(int *)dst = _mm_cvtsi128_si32(row0); + dst += 
stride; + *(int *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + left_col = _mm_unpackhi_epi64(left_col, left_col); + row0 = _mm_shufflelo_epi16(left_col, 0); + row1 = _mm_shufflelo_epi16(left_col, 0x55); + row2 = _mm_shufflelo_epi16(left_col, 0xaa); + row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(int *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + *(int *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + *(int *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(int *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(int *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(int *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +static INLINE void 
h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int count) { + (void)above; + for (int i = 0; i < count; ++i) { + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + left += 16; + } +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 1); +} + +void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 2); +} + +static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + dst += stride; + } +} + +static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflelo_epi16(*x, 0); + const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); + + row[0] = _mm_unpacklo_epi64(u0, u0); + row[1] = _mm_unpacklo_epi64(u1, u1); + row[2] = _mm_unpacklo_epi64(u2, u2); + row[3] = _mm_unpacklo_epi64(u3, u3); +} + +static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflehi_epi16(*x, 0); + const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); 
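+  // H prediction replicates each left-column pixel across its entire row,
+  // roughly: for (r = 0; r < h; ++r) memset(dst + r * stride, left[r], bw);
+  // After the epi8 unpack every left pixel occupies a 16-bit lane, so the
+  // shufflelo/shufflehi + unpack pairs broadcast one pixel per row register.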
+ + row[0] = _mm_unpackhi_epi64(u0, u0); + row[1] = _mm_unpackhi_epi64(u1, u1); + row[2] = _mm_unpackhi_epi64(u2, u2); + row[3] = _mm_unpackhi_epi64(u3, u3); +} + +// Process 16x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +// Process 16x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); +} + +void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); +} + +static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int count) { + int i = 0; + do { + const __m128i left_col = _mm_load_si128((const __m128i *)left); + const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_lo, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p_lo, dst, stride); + dst += stride << 2; + + const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_hi, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p_hi, dst, stride); + dst += stride << 2; + + left += 16; + i++; + } while (i < count); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 2); +} + +void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 4); +} + +static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + _mm_store_si128((__m128i *)(dst + 16), row[i]); + dst += stride; + } +} + +// Process 32x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +// Process 32x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = 
_mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + +void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + +static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_32xh(dst, stride, left, 64); +} + +static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + 32), r0); + _mm_store_si128((__m128i *)(dst + 48), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + _mm_store_si128((__m128i *)(dst + stride + 32), r1); + _mm_store_si128((__m128i *)(dst + stride + 48), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 64); +} + +void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t 
*above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 32); +} + +void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 16); +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse4.c b/third_party/aom/aom_dsp/x86/intrapred_sse4.c new file mode 100644 index 0000000000..9de8bf3c0f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_sse4.c @@ -0,0 +1,1307 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 +#include /* SSE4.1 */ + +#include "config/av1_rtcd.h" +#include "aom_dsp/x86/intrapred_x86.h" +#include "aom_dsp/x86/intrapred_utils.h" +#include "aom_dsp/x86/lpf_common_sse2.h" + +// Low bit depth functions +static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = { + { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, + 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, + 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, + 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff } }, + { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, + 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, + 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, + 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff }, + }, +}; + +/* clang-format on */ +static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1( + int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, + int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((W + H) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i a0, a1, a32, a16; + 
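+  // Scalar sketch of the per-pixel math (reference only; upsampling and the
+  // max_base_x clamp handled below are omitted):
+  //   for (int r = 0, x = dx; r < W; ++r, x += dx) {
+  //     const int base = x >> frac_bits;
+  //     const int shift = (x & 0x3f) >> 1;
+  //     for (int c = 0; c < H; ++c)
+  //       row_r[c] = (above[base + c] * 32 + 16 +
+  //                   (above[base + c + 1] - above[base + c]) * shift) >> 5;
+  //   }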
__m128i diff, c3f; + __m128i a_mbase_x; + + a16 = _mm_set1_epi16(16); + a_mbase_x = _mm_set1_epi8((char)above[max_base_x]); + c3f = _mm_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < W; r++) { + __m128i b, res, res1, shift; + __m128i a0_above, a1_above; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if (base_max_diff <= 0) { + for (int i = r; i < W; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + if (base_max_diff > H) base_max_diff = H; + a0_above = _mm_loadu_si128((__m128i *)(above + base)); + a1_above = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]); + a1_above = _mm_srli_si128(a0_above, 8); + + shift = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f), + 1); + } else { + shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); + } + // lower half + a0 = _mm_cvtepu8_epi16(a0_above); + a1 = _mm_cvtepu8_epi16(a1_above); + + diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + + // uppar half + a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); + a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); + + diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res1 = _mm_add_epi16(a32, b); + res1 = _mm_srli_epi16(res1, 5); + + res = _mm_packus_epi16(res, res1); + + dst[r] = + _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]); + x += dx; + } +} + +static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + int upsample_above, int dx) { + __m128i dstvec[16]; + + dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); + } +} + +static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + int upsample_above, int dx) { + __m128i dstvec[32]; + + dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + int upsample_above, int dx) { + __m128i dstvec[64]; + + dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above, + dx); + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1( + int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above, + int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i a0, a1, a32, a16; + __m128i a_mbase_x, diff, c3f; + + a16 = _mm_set1_epi16(16); + a_mbase_x = _mm_set1_epi8((char)above[max_base_x]); + c3f = 
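+  // Each 32-pixel output row is built as two 128-bit halves: dstvec[r] holds
+  // bytes 0..15 and dstvec_h[r] bytes 16..31; the callers below store the
+  // pair side by side (or transpose both halves for the zone-3 paths).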
_mm_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m128i b, res, res1, res16[2]; + __m128i a0_above, a1_above; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base); + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + dstvec_h[i] = a_mbase_x; + } + return; + } + if (base_max_diff > 32) base_max_diff = 32; + __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); + + for (int j = 0, jj = 0; j < 32; j += 16, jj++) { + int mdiff = base_max_diff - j; + if (mdiff <= 0) { + res16[jj] = a_mbase_x; + } else { + a0_above = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1)); + + // lower half + a0 = _mm_cvtepu8_epi16(a0_above); + a1 = _mm_cvtepu8_epi16(a1_above); + + diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm_mullo_epi16(diff, shift); + + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + + // uppar half + a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); + a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); + + diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res1 = _mm_add_epi16(a32, b); + res1 = _mm_srli_epi16(res1, 5); + + res16[jj] = _mm_packus_epi16(res, res1); // 16 8bit values + } + } + + dstvec[r] = + _mm_blendv_epi8(a_mbase_x, res16[0], + *(__m128i *)Mask[0][base_max_diff]); // 16 8bit values + + dstvec_h[r] = + _mm_blendv_epi8(a_mbase_x, res16[1], + *(__m128i *)Mask[1][base_max_diff]); // 16 8bit values + x += dx; + } +} + +static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + int upsample_above, int dx) { + __m128i dstvec[64], dstvec_h[64]; + dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above, + upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]); + } +} + +static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i a0, a1, a32, a16; + __m128i a_mbase_x, diff, c3f; + __m128i max_base, base_inc, mask; + + a16 = _mm_set1_epi16(16); + a_mbase_x = _mm_set1_epi8((char)above[max_base_x]); + max_base = _mm_set1_epi8(max_base_x); + c3f = _mm_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m128i b, res, res1; + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm_storeu_si128((__m128i *)dst, a_mbase_x); // save 32 values + _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x); + _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x); + _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m128i shift = + _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); // 8 element + + __m128i a0_above, a1_above, res_val; + 
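+    // The 64-wide row is produced in four 16-byte chunks; lanes whose source
+    // index reaches max_base_x are replaced with above[max_base_x] through
+    // the per-lane base_inc/max_base compare and blend at the end of a chunk.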
for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x); + } else { + a0_above = + _mm_loadu_si128((__m128i *)(above + base + j)); // load 16 element + a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + + // lower half + a0 = _mm_cvtepu8_epi16(a0_above); + a1 = _mm_cvtepu8_epi16(a1_above); + + diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm_mullo_epi16(diff, shift); + + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + + // uppar half + a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); + a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); + + diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res1 = _mm_add_epi16(a32, b); + res1 = _mm_srli_epi16(res1, 5); + + res = _mm_packus_epi16(res, res1); // 16 8bit values + + base_inc = + _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1), + (int8_t)(base + j + 2), (int8_t)(base + j + 3), + (int8_t)(base + j + 4), (int8_t)(base + j + 5), + (int8_t)(base + j + 6), (int8_t)(base + j + 7), + (int8_t)(base + j + 8), (int8_t)(base + j + 9), + (int8_t)(base + j + 10), (int8_t)(base + j + 11), + (int8_t)(base + j + 12), (int8_t)(base + j + 13), + (int8_t)(base + j + 14), (int8_t)(base + j + 15)); + + mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc), + _mm_setzero_si128()); + res_val = _mm_blendv_epi8(a_mbase_x, res, mask); + _mm_storeu_si128((__m128i *)(dst + j), res_val); + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + (void)left; + (void)dy; + switch (bw) { + case 4: + dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx); + break; + case 8: + dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx); + break; + case 16: + dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx); + break; + case 32: + dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx); + break; + case 64: + dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx); + break; + default: assert(0 && "Invalid block size"); + } + return; +} + +static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i a0_x, a1_x, a32, diff; + + const __m128i c3f = _mm_set1_epi16(0x3f); + const __m128i min_y_base = _mm_set1_epi16(min_base_y); + const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0); + const __m128i dy_reg = _mm_set1_epi16(dy); + const __m128i a16 = _mm_set1_epi16(16); + + for (int r = 0; r < N; r++) { + __m128i b, res, shift, r6, ydx; + __m128i resx, resy, resxy; + __m128i a0_above, a1_above; + int y = 
r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm_setzero_si128(); + a1_x = _mm_setzero_si128(); + shift = _mm_setzero_si128(); + } else { + a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(c1234, 6); + + if (upsample_above) { + a0_above = + _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]); + a1_above = _mm_srli_si128(a0_above, 8); + + shift = _mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1); + } else { + a0_above = + _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]); + a1_above = _mm_srli_si128(a0_above, 1); + + shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); + } + a0_x = _mm_cvtepu8_epi16(a0_above); + a1_x = _mm_cvtepu8_epi16(a1_above); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i y_c, base_y_c_reg, mask, c1234_; + c1234_ = _mm_srli_si128(c1234, 2); + r6 = _mm_set1_epi16(r << 6); + y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg)); + base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y); + mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg); + base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg); + _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1); + } + a0_x = _mm_unpacklo_epi64(a0_x, a0_y); + a1_x = _mm_unpacklo_epi64(a1_x, a1_y); + shift = _mm_unpacklo_epi64(shift, shifty); + } + + diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + + resx = _mm_packus_epi16(res, res); + resy = _mm_srli_si128(resx, 4); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]); + *(int *)(dst) = _mm_cvtsi128_si32(resxy); + dst += stride; + } +} + +static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i diff, a32; + __m128i a0_x, a1_x, a0_y, a1_y; + __m128i a0_above, a1_above; + + const __m128i a16 = _mm_set1_epi16(16); + const __m128i 
c3f = _mm_set1_epi16(0x3f); + const __m128i min_y_base = _mm_set1_epi16(min_base_y); + const __m128i dy_reg = _mm_set1_epi16(dy); + const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + + for (int r = 0; r < N; r++) { + __m128i b, res, res1, shift; + __m128i resx, resy, resxy, r6, ydx; + + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx = _mm_setzero_si128(); + } else { + a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); + if (upsample_above) { + a0_above = + _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]); + a1_above = _mm_srli_si128(a0_above, 8); + + shift = _mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1); + } else { + a1_above = _mm_srli_si128(a0_above, 1); + a0_above = + _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]); + a1_above = + _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]); + + shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); + } + a0_x = _mm_cvtepu8_epi16(a0_above); + a1_x = _mm_cvtepu8_epi16(a1_above); + + diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + resx = _mm_packus_epi16(res, res); + } + + // y calc + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m128i y_c, base_y_c_reg, mask; + r6 = _mm_set1_epi16(r << 6); + y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg)); + base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y); + mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg); + base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg); + _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c_reg); + + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + + if (upsample_left) { + shift = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1); + } else { + shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1); + } + + diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res1 = _mm_add_epi16(a32, b); + res1 = _mm_srli_epi16(res1, 5); + + resy = _mm_packus_epi16(res1, res1); + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]); + _mm_storel_epi64((__m128i *)dst, resxy); + } else { + _mm_storel_epi64((__m128i *)dst, resx); + } + + dst += stride; + } +} + +static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst, + ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int 
upsample_above, + int upsample_left, int dx, int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32; + __m128i diff, shifty, shifty_h; + __m128i a0_above, a1_above; + + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + const __m128i a16 = _mm_set1_epi16(16); + const __m128i c1 = _mm_srli_epi16(a16, 4); + const __m128i min_y_base = _mm_set1_epi16(min_base_y); + const __m128i c3f = _mm_set1_epi16(0x3f); + const __m128i dy256 = _mm_set1_epi16(dy); + const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); + const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15); + const __m128i c1234 = _mm_add_epi16(c0123, c1); + const __m128i c1234_h = _mm_add_epi16(c0123_h, c1); + + for (int r = 0; r < H; r++) { + __m128i b, res, res1, shift, reg_j, r6, ydx; + __m128i resx, resy; + __m128i resxy; + int y = r + 1; + ydx = _mm_set1_epi16((int16_t)(y * dx)); + + int base_x = (-y * dx) >> frac_bits_x; + for (int j = 0; j < W; j += 16) { + reg_j = _mm_set1_epi16(j); + int base_shift = 0; + if ((base_x + j) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j) - 1); + } + int base_min_diff = (min_base_x - base_x - j); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 16) { + a0_above = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); + a1_above = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); + a0_above = + _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]); + a1_above = + _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]); + + a0_x = _mm_cvtepu8_epi16(a0_above); + a1_x = _mm_cvtepu8_epi16(a1_above); + + r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6); + shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); // 16 16-bit values + + a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8)); + a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8)); + + r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6); + shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res1 = _mm_add_epi16(a32, b); + res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values + + resx = _mm_packus_epi16(res, res1); + } else { + resx = _mm_setzero_si128(); + } + + // y calc + if (base_x < min_base_x) { + __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h; + __m128i mask, mask_h, mul16, mul16_h; + r6 = _mm_set1_epi16(r << 6); + c_reg = _mm_add_epi16(reg_j, c1234); + c_reg_h = _mm_add_epi16(reg_j, c1234_h); + mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256), + _mm_srli_epi16(min_y_base, 1)); + mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256), + _mm_srli_epi16(min_y_base, 1)); + y_reg = _mm_sub_epi16(r6, mul16); + y_reg_h = _mm_sub_epi16(r6, mul16_h); + + base_y = _mm_srai_epi16(y_reg, frac_bits_y); + base_y_h = _mm_srai_epi16(y_reg_h, 
frac_bits_y); + mask = _mm_cmpgt_epi16(min_y_base, base_y); + mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h); + + base_y = _mm_blendv_epi8(base_y, min_y_base, mask); + base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h); + int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7); + int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0); + int16_t offset_diff = max_y - min_y; + + if (offset_diff < 16) { + __m128i min_y_reg = _mm_set1_epi16(min_y); + + __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg); + __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg); + __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h); + + __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y)); + __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1)); + __m128i LoadMask = + _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4])); + + a0_mask = _mm_and_si128(a0_mask, LoadMask); + a1_mask = _mm_and_si128(a1_mask, LoadMask); + + a0_mask = _mm_shuffle_epi8(a0_mask, y_offset); + a1_mask = _mm_shuffle_epi8(a1_mask, y_offset); + a0_y = _mm_cvtepu8_epi16(a0_mask); + a1_y = _mm_cvtepu8_epi16(a1_mask); + a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8)); + a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8)); + } else { + base_y = _mm_andnot_si128(mask, base_y); + base_y_h = _mm_andnot_si128(mask_h, base_y_h); + _mm_store_si128((__m128i *)base_y_c, base_y); + _mm_store_si128((__m128i *)&base_y_c[8], base_y_h); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]], + left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], + left[base_y_c[14]], left[base_y_c[15]]); + base_y = _mm_add_epi16(base_y, c1); + base_y_h = _mm_add_epi16(base_y_h, c1); + _mm_store_si128((__m128i *)base_y_c, base_y); + _mm_store_si128((__m128i *)&base_y_c[8], base_y_h); + + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]], + left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], + left[base_y_c[14]], left[base_y_c[15]]); + } + shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1); + shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1); + + diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shifty); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); // 16 16-bit values + + diff = _mm_sub_epi16(a1_y_h, a0_y_h); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_y_h, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shifty_h); + res1 = _mm_add_epi16(a32, b); + res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values + resy = _mm_packus_epi16(res, res1); + } else { + resy = _mm_setzero_si128(); + } + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]); + _mm_storeu_si128((__m128i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) 
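+// Each zone-2 pixel projects either onto the above row (when base_x is still
+// inside the top edge) or onto the left column (when it falls left of
+// min_base_x); the kernels above compute both candidates (resx/resy) and
+// blend them per lane with Mask[0][base_min_diff]. Widths 4 and 8 use the
+// specialized kernels, all larger widths share the 16-wide HxW kernel.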
{ + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + case 8: + dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + default: + dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } + return; +} + +// z3 functions +static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[4]; + + dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3]); + + *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); + *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); + *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); + *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); + return; +} + +static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy); + transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], + &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], + &d[3]); + + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); + _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); + _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); + _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); +} + +static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[8]; + + dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], + &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + for (int i = 0; i < 8; i++) { + *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[4]; + + dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy); + transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], + &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); +} + +static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy); + transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, + dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, + d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + 
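+    // Zone 3 reuses the zone-1 kernel along the left edge and transposes the
+    // result; here each transposed vector d[i] carries two output rows, the
+    // low 8 bytes for row i and the high 8 bytes for row i + 8.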
_mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm_srli_si128(d[i], 8)); + } +} + +static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy); + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[16]; + + dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy); + transpose4x16_sse2(dstvec, d); + for (int i = 0; i < 16; i++) { + *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[8]; + + dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy); + for (int i = 4; i < 8; i++) { + d[i] = _mm_setzero_si128(); + } + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 4; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16], dstvec_h[16], d_h[16]; + + dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left, + upsample_left, dy); + for (int i = 8; i < 16; i++) { + dstvec[i] = _mm_setzero_si128(); + dstvec_h[i] = _mm_setzero_si128(); + } + transpose16x16_sse2(dstvec, d); + transpose16x16_sse2(dstvec_h, d_h); + + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + } + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]); + } +} + +static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy); + + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + transpose16x8_8x16_sse2( + &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], + &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], + &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], + &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], + &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], + &d[6 + 8], &d[7 + 8]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); + } +} + +static void 
dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, + int upsample_left, int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy); + transpose16x16_sse2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, + int upsample_left, int dy) { + __m128i dstvec[32], d[32], dstvec_h[32], d_h[32]; + + dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left, + upsample_left, dy); + transpose16x16_sse2(dstvec, d); + transpose16x16_sse2(dstvec_h, d_h); + transpose16x16_sse2(dstvec + 16, d + 16); + transpose16x16_sse2(dstvec_h + 16, d_h + 16); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]); + _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]); + } + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]); + } +} + +static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, + int upsample_left, int dy) { + uint8_t dstT[64 * 64]; + dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 64, 64); +} + +static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, + int upsample_left, int dy) { + __m128i dstvec[16], d[16], dstvec_h[16], d_h[16]; + + dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left, + upsample_left, dy); + transpose16x16_sse2(dstvec, d); + transpose16x16_sse2(dstvec_h, d_h); + // store + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]); + } +} + +static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, + int upsample_left, int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy); + for (int i = 0; i < 32; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, + int upsample_left, int dy) { + uint8_t dstT[64 * 32]; + dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 32, 64); +} + +static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, + int upsample_left, int dy) { + uint8_t dstT[32 * 64]; + dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy); + transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, + int upsample_left, int dy) { + uint8_t dstT[64 * 16]; + dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 16, 64); +} + +static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, + int upsample_left, int dy) { + __m128i dstvec[64], d[16]; + + dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy); + for (int i = 0; i < 64; i += 16) { + transpose16x16_sse2(dstvec + i, d); + for (int j = 0; j < 16; j++) { + 
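+      // The 64x16 block is transposed in 16x16 tiles; tile i of the
+      // transposed output lands at column offset i of every row.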
_mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + (void)above; + (void)dx; + assert(dx == 1); + assert(dy > 0); + + if (bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 64: + dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy); + break; + default: assert(0 && "Invalid block size"); + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy); + break; + default: assert(0 && "Invalid block size"); + } + } else { + switch (bw) { + case 4: + dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy); + break; + default: assert(0 && "Invalid block size"); + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy); + break; + default: assert(0 && "Invalid block size"); + } + } else { + switch (bh) { + case 4: + dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy); + break; + default: assert(0 && "Invalid block size"); + } + } + } + } +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c new file mode 100644 index 0000000000..fd48260c6f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c @@ -0,0 +1,2997 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/intrapred_common.h" + +// ----------------------------------------------------------------------------- +// PAETH_PRED + +// Return 8 16-bit pixels in one row +static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, + const __m128i *topleft) { + const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); + + __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left)); + __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top)); + __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft)); + + __m128i mask1 = _mm_cmpgt_epi16(pl, pt); + mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl)); + __m128i mask2 = _mm_cmpgt_epi16(pt, ptl); + + pl = _mm_andnot_si128(mask1, *left); + + ptl = _mm_and_si128(mask2, *topleft); + pt = _mm_andnot_si128(mask2, *top); + pt = _mm_or_si128(pt, ptl); + pt = _mm_and_si128(mask1, pt); + + return _mm_or_si128(pl, pt); +} + +void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = 
_mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +// Return 16 8-bit pixels in one row +static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, + const __m128i *top1, + const __m128i *topleft) { + const __m128i p0 = paeth_8x1_pred(left, top0, topleft); + const __m128i p1 = paeth_8x1_pred(left, top1, topleft); + return _mm_packus_epi16(p0, p1); +} + +void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + 
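+    // |rep| holds the byte pair {i, 0x80} in every 16-bit lane, so this
+    // _mm_shuffle_epi8 broadcasts left[i] as a zero-extended 16-bit value to
+    // all lanes (a shuffle control byte with its high bit set yields zero).
+    // Adding |one| below advances the selector to left[i + 1] for the next row.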
const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + l = _mm_load_si128((const __m128i *)(left + 16)); + rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = 
paeth_16x1_pred(&l16, &top0, &top1, &tl16); + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + const __m128i l = _mm_loadl_epi64((const __m128i *)left); + __m128i l16; + + for (int i = 0; i < 8; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + rep = _mm_set1_epi16((short)0x8000); + l = _mm_load_si128((const __m128i *)(left + 16)); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = 
paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, 
zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + const __m128i l = _mm_load_si128((const __m128i *)left); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// ----------------------------------------------------------------------------- +// SMOOTH_PRED + +// pixels[0]: above and below_pred interleave vector +// pixels[1]: left vector +// pixels[2]: right_pred vector +static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]); + if (height == 4) + pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]); + else if (height == 8) + pixels[1] = _mm_loadl_epi64(((const __m128i *)left)); + else + pixels[1] = _mm_loadu_si128(((const __m128i *)left)); + + pixels[2] = _mm_set1_epi16((int16_t)above[3]); + + const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// 
weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// weight_w[0]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_w4(int height, __m128i *weight_h, + __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE)); + const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]); + weight_h[0] = _mm_unpacklo_epi8(t, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + + if (height == 8) { + const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + } else if (height == 16) { + const __m128i weight = + _mm_loadu_si128((const __m128i *)&smooth_weights[12]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } +} + +static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s = _mm_madd_epi16(pixel[0], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixel[1], rep); + b = _mm_unpacklo_epi16(b, pixel[2]); + __m128i sum = _mm_madd_epi16(b, ww[0]); + + sum = _mm_add_epi32(s, sum); + sum = _mm_add_epi32(sum, round); + sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE); + + sum = _mm_shuffle_epi8(sum, gat); + *(int *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(4, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(8, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(16, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +// pixels[2]: left vector +// pixels[3]: right_pred vector +// pixels[4]: above and below_pred interleave vector, first half +// pixels[5]: above and 
below_pred interleave vector, second half +// pixels[6]: left vector + 16 +// pixels[7]: right_pred vector +static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]); + __m128i d = _mm_loadl_epi64((const __m128i *)above); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); + + pixels[3] = _mm_set1_epi16((int16_t)above[7]); + + if (height == 4) { + pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]); + } else if (height == 8) { + pixels[2] = _mm_loadl_epi64((const __m128i *)left); + } else if (height == 16) { + pixels[2] = _mm_load_si128((const __m128i *)left); + } else { + pixels[2] = _mm_load_si128((const __m128i *)left); + pixels[4] = pixels[0]; + pixels[5] = pixels[1]; + pixels[6] = _mm_load_si128((const __m128i *)(left + 16)); + pixels[7] = pixels[3]; + } +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +static INLINE void load_weight_w8(int height, __m128i *weight_h, + __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const int we_offset = height < 8 ? 0 : 4; + __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE)); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + + if (height == 4) { + we = _mm_srli_si128(we, 4); + __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); + } else { + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + } + + if (height == 16) { + we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(we, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } else if (height == 32) { + const __m128i weight_lo = + _mm_loadu_si128((const __m128i *)&smooth_weights[28]); + weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + const __m128i weight_hi = + _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]); + weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + + __m128i rep = 
second_half ? _mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixels[2], rep); + b = _mm_unpacklo_epi16(b, pixels[3]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(4, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(8, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(16, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[8]; + load_pixel_w8(above, left, 32, pixels); + + __m128i wh[8], ww[2]; + load_weight_w8(32, wh, ww); + + smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1); +} + +// TODO(slavarnway): Visual Studio only supports restrict when /std:c11 +// (available in 2019+) or greater is specified; __restrict can be used in that +// case. This should be moved to rtcd and used consistently between the +// function declarations and definitions to avoid warnings in Visual Studio +// when defining LIBAOM_RESTRICT to restrict or __restrict. +#if defined(_MSC_VER) +#define LIBAOM_RESTRICT +#else +#define LIBAOM_RESTRICT restrict +#endif + +static AOM_FORCE_INLINE __m128i Load4(const void *src) { + // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32 + // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a + // movss instruction. + // + // Until compiler support of _mm_loadu_si32 is widespread, use of + // _mm_loadu_si32 is banned. 
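+  // A memcpy into a local int is an alignment- and strict-aliasing-safe way to
+  // read 4 bytes from an arbitrary address; Store4() below mirrors it for
+  // writes.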
+ int val; + memcpy(&val, src, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) { + return _mm_loadl_epi64((const __m128i *)(a)); +} + +static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) { + return _mm_loadu_si128((const __m128i *)(a)); +} + +static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) { + const int val = _mm_cvtsi128_si32(x); + memcpy(dst, &val, sizeof(val)); +} + +static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) { + _mm_storel_epi64((__m128i *)(a), v); +} + +static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) { + _mm_storeu_si128((__m128i *)(a), v); +} + +static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) { + return _mm_unpacklo_epi8((x), _mm_setzero_si128()); +} + +static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) { + const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128()); + return _mm_unpacklo_epi16(tmp, _mm_setzero_si128()); +} + +static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) { + return _mm_unpacklo_epi16((x), _mm_setzero_si128()); +} + +void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column, int width, + int height) { + const uint8_t *const sm_weights_h = smooth_weights + height - 4; + const uint8_t *const sm_weights_w = smooth_weights + width - 4; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]); + const __m128i top_right = _mm_set1_epi16(top_row[width - 1]); + const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE); + for (int y = 0; y < height; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]); + const __m128i left_y = _mm_cvtsi32_si128(left_column[y]); + const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y); + __m128i scaled_bottom_left = + _mm_mullo_epi16(scale_m_weights_y, bottom_left); + const __m128i weight_left_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0); + scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round); + scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0); + for (int x = 0; x < width; x += 8) { + const __m128i top_x = LoadLo8(top_row + x); + const __m128i weights_x = LoadLo8(sm_weights_w + x); + const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x); + const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x); + const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero); + + // Here opposite weights and pixels are multiplied, where the order of + // interleaving is indicated in the names. + __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y); + __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y); + + // |scaled_bottom_left| is always scaled by the same weight each row, so + // we only derive |scaled_top_right| values here. 
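+      // Per pixel this accumulates
+      //   weights_y * top_row[x] + weights_x * left_column[y]   (pred_lo/pred_hi)
+      //   + (scale_value - weights_y) * bottom_left + round     (scaled_bottom_left)
+      //   + (scale_value - weights_x) * top_right               (derived below)
+      // before the final right shift by (1 + SMOOTH_WEIGHT_LOG2_SCALE).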
+ const __m128i inverted_weights_x = + _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x)); + const __m128i scaled_top_right = + _mm_mullo_epi16(inverted_weights_x, top_right); + const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right); + const __m128i scaled_top_right_hi = + _mm_unpackhi_epi16(scaled_top_right, zero); + pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left); + pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left); + pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo); + pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi); + + // The round value for RightShiftWithRounding was added with + // |scaled_bottom_left|. + pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE)); + pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE)); + const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + StoreLo8(dst + x, _mm_packus_epi16(pred, pred)); + } + dst += stride; + } +} + +void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 64); +} + +void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 16); +} + +void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 64); +} + +// ----------------------------------------------------------------------------- +// Smooth horizontal/vertical helper functions. + +// For Horizontal, pixels1 and pixels2 are the same repeated value. For +// Vertical, weights1 and weights2 are the same, and scaled_corner1 and +// scaled_corner2 are the same. 
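+// In scalar terms each helper below computes, per pixel,
+//   pred = (pixel * weight + scaled_corner + 128) >> 8
+// where scaled_corner = (256 - weight) * corner: a weighted blend of a
+// predicted edge pixel with the opposite corner sample (bottom-left for
+// SMOOTH_V, top-right for SMOOTH_H).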
+static AOM_FORCE_INLINE void write_smooth_directional_sum16( + uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2, + const __m128i weights1, const __m128i weights2, + const __m128i scaled_corner1, const __m128i scaled_corner2, + const __m128i round) { + const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1); + const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2); + const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1); + const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2); + // Equivalent to RightShiftWithRounding(pred[x][y], 8). + const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8); + const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8); + StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2)); +} + +static AOM_FORCE_INLINE __m128i smooth_directional_sum8( + const __m128i pixels, const __m128i weights, const __m128i scaled_corner) { + const __m128i weighted_px = _mm_mullo_epi16(pixels, weights); + return _mm_add_epi16(scaled_corner, weighted_px); +} + +static AOM_FORCE_INLINE void write_smooth_directional_sum8( + uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights, + const __m128i *scaled_corner, const __m128i *round) { + const __m128i pred_sum = + smooth_directional_sum8(*pixels, *weights, *scaled_corner); + // Equivalent to RightShiftWithRounding(pred[x][y], 8). + const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8); + StoreLo8(dst, _mm_packus_epi16(pred, pred)); +} + +// ----------------------------------------------------------------------------- +// SMOOTH_V_PRED + +static AOM_FORCE_INLINE void load_smooth_vertical_pixels4( + const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left, + const int height, __m128i *pixels) { + __m128i top = Load4(above); + const __m128i bottom_left = _mm_set1_epi16(left[height - 1]); + top = cvtepu8_epi16(top); + pixels[0] = _mm_unpacklo_epi16(top, bottom_left); +} + +// |weight_array| alternates weight vectors from the table with their inverted +// (256-w) counterparts. This is precomputed by the compiler when the weights +// table is visible to this module. Removing this visibility can cut speed by up +// to half in both 4xH and 8xH transforms. 
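+// For height == 4, for example, the table bytes smooth_weights[0..3] are
+// widened to 16 bits into weights[0] and their (256 - w) complements into
+// weights[1]; height == 16 additionally fills weights[2]/weights[3] from the
+// high half of the 16-byte load.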
+static AOM_FORCE_INLINE void load_smooth_vertical_weights4( + const uint8_t *LIBAOM_RESTRICT weight_array, const int height, + __m128i *weights) { + const __m128i inverter = _mm_set1_epi16(256); + + if (height == 4) { + const __m128i weight = Load4(weight_array); + weights[0] = cvtepu8_epi16(weight); + weights[1] = _mm_sub_epi16(inverter, weights[0]); + } else if (height == 8) { + const __m128i weight = LoadLo8(weight_array + 4); + weights[0] = cvtepu8_epi16(weight); + weights[1] = _mm_sub_epi16(inverter, weights[0]); + } else { + const __m128i weight = LoadUnaligned16(weight_array + 12); + const __m128i zero = _mm_setzero_si128(); + weights[0] = cvtepu8_epi16(weight); + weights[1] = _mm_sub_epi16(inverter, weights[0]); + weights[2] = _mm_unpackhi_epi8(weight, zero); + weights[3] = _mm_sub_epi16(inverter, weights[2]); + } +} + +static AOM_FORCE_INLINE void write_smooth_vertical4xh( + const __m128i *pixel, const __m128i *weight, const int height, + uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32(128); + const __m128i mask_increment = _mm_set1_epi16(0x0202); + const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400); + __m128i y_select = _mm_set1_epi16(0x0100); + + for (int y = 0; y < height; ++y) { + const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select); + const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select); + const __m128i alternate_weights = + _mm_unpacklo_epi16(weight_y, inverted_weight_y); + // Here the pixel vector is top_row[0], corner, top_row[1], corner, ... + // The madd instruction yields four results of the form: + // (top_row[x] * weight[y] + corner * inverted_weight[y]) + __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights); + sum = _mm_add_epi32(sum, pred_round); + sum = _mm_srai_epi32(sum, 8); + sum = _mm_shuffle_epi8(sum, cvtepu8_epi32); + Store4(dst, sum); + dst += stride; + y_select = _mm_add_epi16(y_select, mask_increment); + } +} + +void aom_smooth_v_predictor_4x4_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + __m128i pixels; + load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels); + + __m128i weights[2]; + load_smooth_vertical_weights4(smooth_weights, 4, weights); + + write_smooth_vertical4xh(&pixels, weights, 4, dst, stride); +} + +void aom_smooth_v_predictor_4x8_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + __m128i pixels; + load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels); + + __m128i weights[2]; + load_smooth_vertical_weights4(smooth_weights, 8, weights); + + write_smooth_vertical4xh(&pixels, weights, 8, dst, stride); +} + +void aom_smooth_v_predictor_4x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + __m128i pixels; + load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels); + + __m128i weights[4]; + load_smooth_vertical_weights4(smooth_weights, 16, weights); + + write_smooth_vertical4xh(&pixels, weights, 8, dst, stride); + dst += stride << 3; + write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride); +} + +void aom_smooth_v_predictor_8x4_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = 
_mm_set1_epi16(left_column[3]); + const __m128i weights = cvtepu8_epi16(Load4(smooth_weights)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_bottom_left = + _mm_mullo_epi16(inverted_weights, bottom_left); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i y_select = _mm_set1_epi32(0x01000100); + const __m128i top = cvtepu8_epi16(LoadLo8(top_row)); + __m128i weights_y = _mm_shuffle_epi8(weights, y_select); + __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + y_select = _mm_set1_epi32(0x03020302); + weights_y = _mm_shuffle_epi8(weights, y_select); + scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + y_select = _mm_set1_epi32(0x05040504); + weights_y = _mm_shuffle_epi8(weights, y_select); + scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + y_select = _mm_set1_epi32(0x07060706); + weights_y = _mm_shuffle_epi8(weights, y_select); + scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); +} + +void aom_smooth_v_predictor_8x8_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = _mm_set1_epi16(left_column[7]); + const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_bottom_left = + _mm_mullo_epi16(inverted_weights, bottom_left); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + const __m128i top = cvtepu8_epi16(LoadLo8(top_row)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + } +} + +void aom_smooth_v_predictor_8x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = _mm_set1_epi16(left_column[15]); + const __m128i weights = LoadUnaligned16(smooth_weights + 12); + + const __m128i weights1 = cvtepu8_epi16(weights); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_bottom_left1 = + _mm_mullo_epi16(inverted_weights1, bottom_left); + const __m128i scaled_bottom_left2 = + _mm_mullo_epi16(inverted_weights2, bottom_left); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + const __m128i top = 
cvtepu8_epi16(LoadLo8(top_row)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left1, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left2, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + } +} + +void aom_smooth_v_predictor_8x32_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bottom_left = _mm_set1_epi16(left_column[31]); + const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); + const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); + const __m128i weights1 = cvtepu8_epi16(weights_lo); + const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); + const __m128i weights3 = cvtepu8_epi16(weights_hi); + const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_bottom_left1 = + _mm_mullo_epi16(inverted_weights1, bottom_left); + const __m128i scaled_bottom_left2 = + _mm_mullo_epi16(inverted_weights2, bottom_left); + const __m128i scaled_bottom_left3 = + _mm_mullo_epi16(inverted_weights3, bottom_left); + const __m128i scaled_bottom_left4 = + _mm_mullo_epi16(inverted_weights4, bottom_left); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + const __m128i top = cvtepu8_epi16(LoadLo8(top_row)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left1, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left2, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left3, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; 
y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left4, y_select); + write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y, + &round); + dst += stride; + } +} + +void aom_smooth_v_predictor_16x4_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = _mm_set1_epi16(left_column[3]); + const __m128i weights = cvtepu8_epi16(Load4(smooth_weights)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_bottom_left = + _mm_mullo_epi16(inverted_weights, bottom_left); + const __m128i round = _mm_set1_epi16(128); + const __m128i top = LoadUnaligned16(top_row); + const __m128i top_lo = cvtepu8_epi16(top); + const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8)); + + __m128i y_select = _mm_set1_epi32(0x01000100); + __m128i weights_y = _mm_shuffle_epi8(weights, y_select); + __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + y_select = _mm_set1_epi32(0x03020302); + weights_y = _mm_shuffle_epi8(weights, y_select); + scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + y_select = _mm_set1_epi32(0x05040504); + weights_y = _mm_shuffle_epi8(weights, y_select); + scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + y_select = _mm_set1_epi32(0x07060706); + weights_y = _mm_shuffle_epi8(weights, y_select); + scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); +} + +void aom_smooth_v_predictor_16x8_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = _mm_set1_epi16(left_column[7]); + const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_bottom_left = + _mm_mullo_epi16(inverted_weights, bottom_left); + const __m128i round = _mm_set1_epi16(128); + const __m128i top = LoadUnaligned16(top_row); + const __m128i top_lo = cvtepu8_epi16(top); + const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } +} + +void 
aom_smooth_v_predictor_16x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = _mm_set1_epi16(left_column[15]); + const __m128i zero = _mm_setzero_si128(); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights = LoadUnaligned16(smooth_weights + 12); + const __m128i weights_lo = cvtepu8_epi16(weights); + const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero); + const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); + const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); + const __m128i scaled_bottom_left_lo = + _mm_mullo_epi16(inverted_weights_lo, bottom_left); + const __m128i scaled_bottom_left_hi = + _mm_mullo_epi16(inverted_weights_hi, bottom_left); + const __m128i round = _mm_set1_epi16(128); + + const __m128i top = LoadUnaligned16(top_row); + const __m128i top_lo = cvtepu8_epi16(top); + const __m128i top_hi = _mm_unpackhi_epi8(top, zero); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } +} + +void aom_smooth_v_predictor_16x32_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = _mm_set1_epi16(left_column[31]); + const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); + const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i zero = _mm_setzero_si128(); + const __m128i weights1 = cvtepu8_epi16(weights_lo); + const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); + const __m128i weights3 = cvtepu8_epi16(weights_hi); + const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_bottom_left1 = + _mm_mullo_epi16(inverted_weights1, bottom_left); + const __m128i scaled_bottom_left2 = + _mm_mullo_epi16(inverted_weights2, bottom_left); + const __m128i scaled_bottom_left3 = + _mm_mullo_epi16(inverted_weights3, bottom_left); + const __m128i scaled_bottom_left4 = + _mm_mullo_epi16(inverted_weights4, bottom_left); + const __m128i round = _mm_set1_epi16(128); + + const __m128i top = LoadUnaligned16(top_row); + const __m128i top_lo = cvtepu8_epi16(top); + const __m128i top_hi = _mm_unpackhi_epi8(top, zero); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; 
y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left1, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left2, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left3, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left4, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } +} + +void aom_smooth_v_predictor_16x64_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = _mm_set1_epi16(left_column[63]); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i round = _mm_set1_epi16(128); + const __m128i zero = _mm_setzero_si128(); + const __m128i top = LoadUnaligned16(top_row); + const __m128i top_lo = cvtepu8_epi16(top); + const __m128i top_hi = _mm_unpackhi_epi8(top, zero); + const uint8_t *weights_base_ptr = smooth_weights + 60; + for (int left_offset = 0; left_offset < 64; left_offset += 16) { + const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset); + const __m128i weights_lo = cvtepu8_epi16(weights); + const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero); + const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); + const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); + const __m128i scaled_bottom_left_lo = + _mm_mullo_epi16(inverted_weights_lo, bottom_left); + const __m128i scaled_bottom_left_hi = + _mm_mullo_epi16(inverted_weights_hi, bottom_left); + + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); + const __m128i 
scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); + write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + } +} + +void aom_smooth_v_predictor_32x8_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bottom_left = _mm_set1_epi16(left_column[7]); + const __m128i top_lo = LoadUnaligned16(top_row); + const __m128i top_hi = LoadUnaligned16(top_row + 16); + const __m128i top1 = cvtepu8_epi16(top_lo); + const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); + const __m128i top3 = cvtepu8_epi16(top_hi); + const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); + __m128i scale = _mm_set1_epi16(256); + const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_bottom_left = + _mm_mullo_epi16(inverted_weights, bottom_left); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } +} + +void aom_smooth_v_predictor_32x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bottom_left = _mm_set1_epi16(left_column[15]); + const __m128i top_lo = LoadUnaligned16(top_row); + const __m128i top_hi = LoadUnaligned16(top_row + 16); + const __m128i top1 = cvtepu8_epi16(top_lo); + const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); + const __m128i top3 = cvtepu8_epi16(top_hi); + const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); + const __m128i weights = LoadUnaligned16(smooth_weights + 12); + const __m128i weights1 = cvtepu8_epi16(weights); + const __m128i weights2 = _mm_unpackhi_epi8(weights, zero); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_bottom_left1 = + _mm_mullo_epi16(inverted_weights1, bottom_left); + const __m128i scaled_bottom_left2 = + _mm_mullo_epi16(inverted_weights2, bottom_left); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left1, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask 
= 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left2, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } +} + +void aom_smooth_v_predictor_32x32_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = _mm_set1_epi16(left_column[31]); + const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); + const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); + const __m128i zero = _mm_setzero_si128(); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i top_lo = LoadUnaligned16(top_row); + const __m128i top_hi = LoadUnaligned16(top_row + 16); + const __m128i top1 = cvtepu8_epi16(top_lo); + const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); + const __m128i top3 = cvtepu8_epi16(top_hi); + const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); + const __m128i weights1 = cvtepu8_epi16(weights_lo); + const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); + const __m128i weights3 = cvtepu8_epi16(weights_hi); + const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_bottom_left1 = + _mm_mullo_epi16(inverted_weights1, bottom_left); + const __m128i scaled_bottom_left2 = + _mm_mullo_epi16(inverted_weights2, bottom_left); + const __m128i scaled_bottom_left3 = + _mm_mullo_epi16(inverted_weights3, bottom_left); + const __m128i scaled_bottom_left4 = + _mm_mullo_epi16(inverted_weights4, bottom_left); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left1, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left2, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + 
const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left3, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left4, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } +} + +void aom_smooth_v_predictor_32x64_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bottom_left = _mm_set1_epi16(left_column[63]); + const __m128i top_lo = LoadUnaligned16(top_row); + const __m128i top_hi = LoadUnaligned16(top_row + 16); + const __m128i top1 = cvtepu8_epi16(top_lo); + const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero); + const __m128i top3 = cvtepu8_epi16(top_hi); + const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + const uint8_t *weights_base_ptr = smooth_weights + 60; + for (int left_offset = 0; left_offset < 64; left_offset += 16) { + const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset); + const __m128i weights_lo = cvtepu8_epi16(weights); + const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero); + const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); + const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); + const __m128i scaled_bottom_left_lo = + _mm_mullo_epi16(inverted_weights_lo, bottom_left); + const __m128i scaled_bottom_left_hi = + _mm_mullo_epi16(inverted_weights_hi, bottom_left); + + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + } +} + 
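For reference, the SMOOTH_V kernels above and below all evaluate the same per-pixel blend of the top row with the bottom-left sample; only the block size and register tiling change. A minimal scalar sketch of that blend, assuming SMOOTH_WEIGHT_LOG2_SCALE == 8 (consistent with the 256 scale and 128 rounding constants used in these kernels) and a smooth_weights offset of block_height - 4 as implied by the loads; smooth_v_ref is a hypothetical helper, not part of this patch:

static void smooth_v_ref(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                         const uint8_t *top_row,
                         const uint8_t *left_column) {
  // Per-row weights for this block height, from the same smooth_weights
  // table the SSSE3 kernels load (offset bh - 4).
  const uint8_t *weights = smooth_weights + bh - 4;
  const int bottom_left = left_column[bh - 1];
  for (int y = 0; y < bh; ++y) {
    const int w = weights[y];
    for (int x = 0; x < bw; ++x) {
      // pred = RightShiftWithRounding(w * top + (256 - w) * bottom_left, 8)
      dst[x] =
          (uint8_t)((w * top_row[x] + (256 - w) * bottom_left + 128) >> 8);
    }
    dst += stride;
  }
}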
+void aom_smooth_v_predictor_64x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i bottom_left = _mm_set1_epi16(left_column[15]); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i zero = _mm_setzero_si128(); + const __m128i top_lolo = LoadUnaligned16(top_row); + const __m128i top_lohi = LoadUnaligned16(top_row + 16); + const __m128i top1 = cvtepu8_epi16(top_lolo); + const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero); + const __m128i top3 = cvtepu8_epi16(top_lohi); + const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero); + + const __m128i weights = LoadUnaligned16(smooth_weights + 12); + const __m128i weights1 = cvtepu8_epi16(weights); + const __m128i weights2 = _mm_unpackhi_epi8(weights, zero); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i top_hilo = LoadUnaligned16(top_row + 32); + const __m128i top_hihi = LoadUnaligned16(top_row + 48); + const __m128i top5 = cvtepu8_epi16(top_hilo); + const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero); + const __m128i top7 = cvtepu8_epi16(top_hihi); + const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero); + const __m128i scaled_bottom_left1 = + _mm_mullo_epi16(inverted_weights1, bottom_left); + const __m128i scaled_bottom_left2 = + _mm_mullo_epi16(inverted_weights2, bottom_left); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left1, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left2, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } +} + +void aom_smooth_v_predictor_64x32_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bottom_left = _mm_set1_epi16(left_column[31]); + const __m128i 
top_lolo = LoadUnaligned16(top_row); + const __m128i top_lohi = LoadUnaligned16(top_row + 16); + const __m128i top1 = cvtepu8_epi16(top_lolo); + const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero); + const __m128i top3 = cvtepu8_epi16(top_lohi); + const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero); + const __m128i top_hilo = LoadUnaligned16(top_row + 32); + const __m128i top_hihi = LoadUnaligned16(top_row + 48); + const __m128i top5 = cvtepu8_epi16(top_hilo); + const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero); + const __m128i top7 = cvtepu8_epi16(top_hihi); + const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero); + const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); + const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); + const __m128i weights1 = cvtepu8_epi16(weights_lo); + const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero); + const __m128i weights3 = cvtepu8_epi16(weights_hi); + const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_bottom_left1 = + _mm_mullo_epi16(inverted_weights1, bottom_left); + const __m128i scaled_bottom_left2 = + _mm_mullo_epi16(inverted_weights2, bottom_left); + const __m128i scaled_bottom_left3 = + _mm_mullo_epi16(inverted_weights3, bottom_left); + const __m128i scaled_bottom_left4 = + _mm_mullo_epi16(inverted_weights4, bottom_left); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left1, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left2, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = 
_mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left3, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left4, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } +} + +void aom_smooth_v_predictor_64x64_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bottom_left = _mm_set1_epi16(left_column[63]); + const __m128i top_lolo = LoadUnaligned16(top_row); + const __m128i top_lohi = LoadUnaligned16(top_row + 16); + const __m128i top1 = cvtepu8_epi16(top_lolo); + const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero); + const __m128i top3 = cvtepu8_epi16(top_lohi); + const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero); + const __m128i top_hilo = LoadUnaligned16(top_row + 32); + const __m128i top_hihi = LoadUnaligned16(top_row + 48); + const __m128i top5 = cvtepu8_epi16(top_hilo); + const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero); + const __m128i top7 = cvtepu8_epi16(top_hihi); + const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i round = _mm_set1_epi16(128); + const uint8_t *weights_base_ptr = smooth_weights + 60; + for (int left_offset = 0; left_offset < 64; left_offset += 16) { + const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset); + const __m128i weights_lo = cvtepu8_epi16(weights); + const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero); + const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo); + const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi); + const __m128i scaled_bottom_left_lo = + _mm_mullo_epi16(inverted_weights_lo, bottom_left); + const __m128i scaled_bottom_left_hi = + _mm_mullo_epi16(inverted_weights_hi, bottom_left); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left_lo, y_select); 
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select); + const __m128i scaled_bottom_left_y = + _mm_shuffle_epi8(scaled_bottom_left_hi, y_select); + write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y, + scaled_bottom_left_y, scaled_bottom_left_y, + round); + dst += stride; + } + } +} + +// ----------------------------------------------------------------------------- +// SMOOTH_H_PRED +static AOM_FORCE_INLINE void write_smooth_horizontal_sum4( + uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights, + const __m128i *scaled_top_right, const __m128i *round) { + const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights); + const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y); + // Equivalent to RightShiftWithRounding(pred[x][y], 8). 
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8); + const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400); + Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8)); +} + +void aom_smooth_h_predictor_4x4_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi32(top_row[3]); + const __m128i left = cvtepu8_epi32(Load4(left_column)); + const __m128i weights = cvtepu8_epi32(Load4(smooth_weights)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi32(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i left_y = _mm_shuffle_epi32(left, 0); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0x55); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xaa); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xff); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); +} + +void aom_smooth_h_predictor_4x8_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi32(top_row[3]); + const __m128i weights = cvtepu8_epi32(Load4(smooth_weights)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi32(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i left = cvtepu8_epi32(Load4(left_column)); + __m128i left_y = _mm_shuffle_epi32(left, 0); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0x55); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xaa); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xff); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + + left = cvtepu8_epi32(Load4(left_column + 4)); + left_y = _mm_shuffle_epi32(left, 0); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0x55); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xaa); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xff); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); +} + +void aom_smooth_h_predictor_4x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi32(top_row[3]); + const __m128i weights = 
cvtepu8_epi32(Load4(smooth_weights)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi32(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i left = cvtepu8_epi32(Load4(left_column)); + __m128i left_y = _mm_shuffle_epi32(left, 0); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0x55); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xaa); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xff); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + + left = cvtepu8_epi32(Load4(left_column + 4)); + left_y = _mm_shuffle_epi32(left, 0); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0x55); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xaa); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xff); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + + left = cvtepu8_epi32(Load4(left_column + 8)); + left_y = _mm_shuffle_epi32(left, 0); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0x55); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xaa); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xff); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + + left = cvtepu8_epi32(Load4(left_column + 12)); + left_y = _mm_shuffle_epi32(left, 0); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0x55); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xaa); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + left_y = _mm_shuffle_epi32(left, 0xff); + write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right, + &round); +} + +// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V, +// |pixels| is a segment of the top row or the whole top row, and |weights| is +// repeated. 
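Concretely, SMOOTH_H blends each row's left sample with the top-right sample using a per-column weight, which is what the kernels below vectorize one or more rows at a time. A scalar sketch under the same assumptions as the SMOOTH_V sketch above (SMOOTH_WEIGHT_LOG2_SCALE == 8, weights taken at smooth_weights + block_width - 4); smooth_h_ref is a hypothetical name, not part of this patch:

static void smooth_h_ref(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                         const uint8_t *top_row,
                         const uint8_t *left_column) {
  // Per-column weights for this block width; the top-right sample is the
  // constant term, mirroring scaled_top_right in the kernels below.
  const uint8_t *weights = smooth_weights + bw - 4;
  const int top_right = top_row[bw - 1];
  for (int y = 0; y < bh; ++y) {
    const int left = left_column[y];
    for (int x = 0; x < bw; ++x) {
      dst[x] = (uint8_t)(
          (weights[x] * left + (256 - weights[x]) * top_right + 128) >> 8);
    }
    dst += stride;
  }
}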
+void aom_smooth_h_predictor_8x4_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[7]); + const __m128i left = cvtepu8_epi16(Load4(left_column)); + const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i y_select = _mm_set1_epi32(0x01000100); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + y_select = _mm_set1_epi32(0x03020302); + left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + y_select = _mm_set1_epi32(0x05040504); + left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + y_select = _mm_set1_epi32(0x07060706); + left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); +} + +void aom_smooth_h_predictor_8x8_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[7]); + const __m128i left = cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + } +} + +void aom_smooth_h_predictor_8x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[7]); + const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i left = cvtepu8_epi16(LoadLo8(left_column)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, 
y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + } +} + +void aom_smooth_h_predictor_8x32_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[7]); + const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4)); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i inverted_weights = _mm_sub_epi16(scale, weights); + const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i left = cvtepu8_epi16(LoadLo8(left_column)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 16)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 24)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right, + &round); + dst += stride; + } +} + +void aom_smooth_h_predictor_16x4_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[15]); + const __m128i left = cvtepu8_epi16(Load4(left_column)); + const __m128i weights = LoadUnaligned16(smooth_weights + 12); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i y_mask = _mm_set1_epi32(0x01000100); + __m128i left_y = _mm_shuffle_epi8(left, y_mask); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + y_mask = _mm_set1_epi32(0x03020302); + left_y = _mm_shuffle_epi8(left, y_mask); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + y_mask = _mm_set1_epi32(0x05040504); + left_y = 
_mm_shuffle_epi8(left, y_mask); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + y_mask = _mm_set1_epi32(0x07060706); + left_y = _mm_shuffle_epi8(left, y_mask); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); +} + +void aom_smooth_h_predictor_16x8_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[15]); + const __m128i left = cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights = LoadUnaligned16(smooth_weights + 12); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + } +} + +void aom_smooth_h_predictor_16x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[15]); + const __m128i weights = LoadUnaligned16(smooth_weights + 12); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i left = cvtepu8_epi16(LoadLo8(left_column)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + } +} + +void aom_smooth_h_predictor_16x32_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[15]); + const __m128i 
weights = LoadUnaligned16(smooth_weights + 12); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i left = cvtepu8_epi16(LoadLo8(left_column)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 16)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 24)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + dst += stride; + } +} + +void aom_smooth_h_predictor_16x64_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[15]); + const __m128i weights = LoadUnaligned16(smooth_weights + 12); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int left_offset = 0; left_offset < 64; left_offset += 8) { + const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, + round); + dst += stride; + } + } +} + +void aom_smooth_h_predictor_32x8_ssse3( 
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[31]); + const __m128i left = cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); + const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights_lo); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); + const __m128i weights3 = cvtepu8_epi16(weights_hi); + const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + dst += stride; + } +} + +void aom_smooth_h_predictor_32x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[31]); + const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); + const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights_lo); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); + const __m128i weights3 = cvtepu8_epi16(weights_hi); + const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left1, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + 
scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + dst += stride; + } + const __m128i left2 = + cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left2, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + dst += stride; + } +} + +void aom_smooth_h_predictor_32x32_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[31]); + const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); + const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights_lo); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); + const __m128i weights3 = cvtepu8_epi16(weights_hi); + const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + __m128i left = cvtepu8_epi16(LoadLo8(left_column)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 16)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + 
scaled_top_right3, scaled_top_right4, round); + dst += stride; + } + left = cvtepu8_epi16(LoadLo8(left_column + 24)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + dst += stride; + } +} + +void aom_smooth_h_predictor_32x64_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[31]); + const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28); + const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights_lo); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8)); + const __m128i weights3 = cvtepu8_epi16(weights_hi); + const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int left_offset = 0; left_offset < 64; left_offset += 8) { + const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, + round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, + weights4, scaled_top_right3, + scaled_top_right4, round); + dst += stride; + } + } +} + +void aom_smooth_h_predictor_64x16_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[63]); + const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60); + const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights_lolo); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8)); + const __m128i weights3 = cvtepu8_epi16(weights_lohi); + const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const 
__m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92); + const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108); + const __m128i weights5 = cvtepu8_epi16(weights_hilo); + const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8)); + const __m128i weights7 = cvtepu8_epi16(weights_hihi); + const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8)); + const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5); + const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6); + const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7); + const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8); + const __m128i scaled_top_right5 = + _mm_mullo_epi16(inverted_weights5, top_right); + const __m128i scaled_top_right6 = + _mm_mullo_epi16(inverted_weights6, top_right); + const __m128i scaled_top_right7 = + _mm_mullo_epi16(inverted_weights7, top_right); + const __m128i scaled_top_right8 = + _mm_mullo_epi16(inverted_weights8, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left1, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, + scaled_top_right5, scaled_top_right6, round); + write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, + scaled_top_right7, scaled_top_right8, round); + dst += stride; + } + const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + __m128i y_select = _mm_set1_epi32(y_mask); + __m128i left_y = _mm_shuffle_epi8(left2, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, + scaled_top_right5, scaled_top_right6, round); + write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, + scaled_top_right7, scaled_top_right8, round); + dst += stride; + } +} + +void aom_smooth_h_predictor_64x32_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[63]); + const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column)); + const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60); + const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights_lolo); + const __m128i weights2 = 
cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8)); + const __m128i weights3 = cvtepu8_epi16(weights_lohi); + const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92); + const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108); + const __m128i weights5 = cvtepu8_epi16(weights_hilo); + const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8)); + const __m128i weights7 = cvtepu8_epi16(weights_hihi); + const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8)); + const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5); + const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6); + const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7); + const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8); + const __m128i scaled_top_right5 = + _mm_mullo_epi16(inverted_weights5, top_right); + const __m128i scaled_top_right6 = + _mm_mullo_epi16(inverted_weights6, top_right); + const __m128i scaled_top_right7 = + _mm_mullo_epi16(inverted_weights7, top_right); + const __m128i scaled_top_right8 = + _mm_mullo_epi16(inverted_weights8, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left1, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, + scaled_top_right5, scaled_top_right6, round); + write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, + scaled_top_right7, scaled_top_right8, round); + dst += stride; + } + const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left2, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, + scaled_top_right5, scaled_top_right6, round); + write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, + scaled_top_right7, scaled_top_right8, round); + dst += stride; + } + const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = 
_mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left3, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, + scaled_top_right5, scaled_top_right6, round); + write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, + scaled_top_right7, scaled_top_right8, round); + dst += stride; + } + const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left4, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4, + scaled_top_right3, scaled_top_right4, round); + write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6, + scaled_top_right5, scaled_top_right6, round); + write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8, + scaled_top_right7, scaled_top_right8, round); + dst += stride; + } +} + +void aom_smooth_h_predictor_64x64_ssse3( + uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column) { + const __m128i top_right = _mm_set1_epi16(top_row[63]); + const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60); + const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76); + const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE); + const __m128i weights1 = cvtepu8_epi16(weights_lolo); + const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8)); + const __m128i weights3 = cvtepu8_epi16(weights_lohi); + const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8)); + const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1); + const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2); + const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3); + const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4); + const __m128i scaled_top_right1 = + _mm_mullo_epi16(inverted_weights1, top_right); + const __m128i scaled_top_right2 = + _mm_mullo_epi16(inverted_weights2, top_right); + const __m128i scaled_top_right3 = + _mm_mullo_epi16(inverted_weights3, top_right); + const __m128i scaled_top_right4 = + _mm_mullo_epi16(inverted_weights4, top_right); + const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92); + const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108); + const __m128i weights5 = cvtepu8_epi16(weights_hilo); + const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8)); + const __m128i weights7 = cvtepu8_epi16(weights_hihi); + const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8)); + const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5); + const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6); + const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7); + const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8); + const __m128i scaled_top_right5 = + _mm_mullo_epi16(inverted_weights5, top_right); + const __m128i scaled_top_right6 = + _mm_mullo_epi16(inverted_weights6, 
top_right); + const __m128i scaled_top_right7 = + _mm_mullo_epi16(inverted_weights7, top_right); + const __m128i scaled_top_right8 = + _mm_mullo_epi16(inverted_weights8, top_right); + const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1)); + for (int left_offset = 0; left_offset < 64; left_offset += 8) { + const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset)); + for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) { + const __m128i y_select = _mm_set1_epi32(y_mask); + const __m128i left_y = _mm_shuffle_epi8(left, y_select); + write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2, + scaled_top_right1, scaled_top_right2, + round); + write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, + weights4, scaled_top_right3, + scaled_top_right4, round); + write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, + weights6, scaled_top_right5, + scaled_top_right6, round); + write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, + weights8, scaled_top_right7, + scaled_top_right8, round); + dst += stride; + } + } +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_utils.h b/third_party/aom/aom_dsp/x86/intrapred_utils.h new file mode 100644 index 0000000000..502574673e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_utils.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_ +#define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_ + +#include // SSE2 +#include "aom/aom_integer.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = { + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, + { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 }, + { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 }, + { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 }, + { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 }, + { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 }, + { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 } +}; + +static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, + { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, +}; + +static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = { + { -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0 }, + { -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0 }, + { -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0 }, + { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 }, +}; + +static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3; + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[0], x[1]); + w3 = _mm_unpackhi_epi8(x[2], x[3]); + + ww0 = _mm_unpacklo_epi16(w0, w1); + ww1 = _mm_unpacklo_epi16(w2, w3); + ww2 = _mm_unpackhi_epi16(w0, w1); + ww3 = _mm_unpackhi_epi16(w2, w3); + + w0 = _mm_unpacklo_epi32(ww0, ww1); + w2 = _mm_unpacklo_epi32(ww2, ww3); + w1 = _mm_unpackhi_epi32(ww0, ww1); + w3 = _mm_unpackhi_epi32(ww2, ww3); + + d[0] = _mm_unpacklo_epi64(w0, w2); + d[1] = _mm_unpackhi_epi64(w0, w2); + d[2] = _mm_unpacklo_epi64(w1, w3); + d[3] = _mm_unpackhi_epi64(w1, w3); + + d[4] = _mm_srli_si128(d[0], 8); + d[5] = _mm_srli_si128(d[1], 8); + d[6] = _mm_srli_si128(d[2], 8); + d[7] = _mm_srli_si128(d[3], 8); + + d[8] = _mm_srli_si128(d[0], 4); + d[9] = _mm_srli_si128(d[1], 4); + d[10] = _mm_srli_si128(d[2], 4); + d[11] = _mm_srli_si128(d[3], 4); + + d[12] = _mm_srli_si128(d[0], 12); + d[13] = _mm_srli_si128(d[1], 12); + d[14] = _mm_srli_si128(d[2], 12); + d[15] = _mm_srli_si128(d[3], 12); +} + +static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpacklo_epi8(x[4], x[5]); + w3 = _mm_unpacklo_epi8(x[6], x[7]); + + w8 = _mm_unpacklo_epi8(x[8], x[9]); + w9 = _mm_unpacklo_epi8(x[10], x[11]); + w10 = 
_mm_unpacklo_epi8(x[12], x[13]); + w11 = _mm_unpacklo_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm_unpacklo_epi64(w6, w14); + d[1] = _mm_unpackhi_epi64(w6, w14); + d[2] = _mm_unpacklo_epi64(w7, w15); + d[3] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm_unpacklo_epi64(w6, w14); + d[5] = _mm_unpackhi_epi64(w6, w14); + d[6] = _mm_unpacklo_epi64(w7, w15); + d[7] = _mm_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm_unpackhi_epi8(x[0], x[1]); + w1 = _mm_unpackhi_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[4], x[5]); + w3 = _mm_unpackhi_epi8(x[6], x[7]); + + w8 = _mm_unpackhi_epi8(x[8], x[9]); + w9 = _mm_unpackhi_epi8(x[10], x[11]); + w10 = _mm_unpackhi_epi8(x[12], x[13]); + w11 = _mm_unpackhi_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm_unpacklo_epi64(w6, w14); + d[9] = _mm_unpackhi_epi64(w6, w14); + d[10] = _mm_unpacklo_epi64(w7, w15); + d[11] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm_unpacklo_epi64(w6, w14); + d[13] = _mm_unpackhi_epi64(w6, w14); + d[14] = _mm_unpacklo_epi64(w7, w15); + d[15] = _mm_unpackhi_epi64(w7, w15); +} + +static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc, + uint8_t *dst, ptrdiff_t pitchDst) { + __m128i r[16]; + __m128i d[16]; + for (int j = 0; j < 16; j++) { + r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc)); + } + transpose16x16_sse2(r, d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]); + } +} + +static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, + ptrdiff_t pitchDst, int width, int height) { + for (int j = 0; j < height; j += 16) + for (int i = 0; i < width; i += 16) + transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); +} + +#endif // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_ diff --git a/third_party/aom/aom_dsp/x86/intrapred_x86.h b/third_party/aom/aom_dsp/x86/intrapred_x86.h new file mode 100644 index 0000000000..b13f575a76 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_x86.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_ +#define AOM_AOM_DSP_X86_INTRAPRED_X86_H_ + +#include // SSE2 +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#endif // AOM_AOM_DSP_X86_INTRAPRED_X86_H_ diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm new file mode 100644 index 0000000000..0bc841a7a4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm @@ -0,0 +1,107 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro REORDER_INPUTS 0 + ; a c d b to a b c d + SWAP 1, 3, 2 +%endmacro + +%macro TRANSFORM_COLS 0 + ; input: + ; m0 a + ; m1 b + ; m2 c + ; m3 d + paddw m0, m2 + psubw m3, m1 + + ; wide subtract + punpcklwd m4, m0 + punpcklwd m5, m3 + psrad m4, 16 + psrad m5, 16 + psubd m4, m5 + psrad m4, 1 + packssdw m4, m4 ; e + + psubw m5, m4, m1 ; b + psubw m4, m2 ; c + psubw m0, m5 + paddw m3, m4 + ; m0 a + SWAP 1, 5 ; m1 b + SWAP 2, 4 ; m2 c + ; m3 d +%endmacro + +%macro TRANSPOSE_4X4 0 + punpcklwd m0, m2 + punpcklwd m1, m3 + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 +%macro TRANSPOSE_4X4_WIDE 0 + mova m3, m0 + punpcklwd m0, m1 + punpckhwd m3, m1 + mova m2, m0 + punpcklwd m0, m3 + punpckhwd m2, m3 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero + movd m%3, [outputq] + movd m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%1, m%3 + paddw m%2, m%4 + packuswb m%1, m%5 + packuswb m%2, m%5 + movd [outputq], m%1 + movd [outputq + strideq], m%2 +%endmacro + +INIT_XMM sse2 +cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] + psraw m0, 2 + psraw m1, 2 + + TRANSPOSE_4X4_WIDE + REORDER_INPUTS + TRANSFORM_COLS + TRANSPOSE_4X4 + REORDER_INPUTS + TRANSFORM_COLS + + pxor m4, m4 + ADD_STORE_4P_2X 0, 1, 5, 6, 4 + lea outputq, [outputq + 2 * strideq] + ADD_STORE_4P_2X 2, 3, 5, 6, 4 + + RET diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c b/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c new file mode 100644 index 0000000000..16d2f4be7f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +static unsigned int sad4xh_sse2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int width, + int height) { + int i; + assert(width == 4); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 4) { + __m128i x0 = xx_loadl_32(a + 0 * a_stride); + __m128i x1 = xx_loadl_32(a + 1 * a_stride); + __m128i x2 = xx_loadl_32(a + 2 * a_stride); + __m128i x3 = xx_loadl_32(a + 3 * a_stride); + __m128i x_lo = _mm_unpacklo_epi32(x0, x1); + __m128i x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i x = _mm_unpacklo_epi64(x_lo, x_hi); + + x0 = xx_loadl_32(b + 0 * b_stride); + x1 = xx_loadl_32(b + 1 * b_stride); + x2 = xx_loadl_32(b + 2 * b_stride); + x3 = xx_loadl_32(b + 3 * b_stride); + x_lo = _mm_unpacklo_epi32(x0, x1); + x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i y = _mm_unpacklo_epi64(x_lo, x_hi); + + __m128i sad4x4 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad4x4); + + a += 4 * a_stride; + b += 4 * b_stride; + } + + // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95]. + const unsigned int res = + (unsigned int)(_mm_cvtsi128_si32(sad) + + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8))); + + return res; +} + +static unsigned int sad8xh_sse2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int width, + int height) { + int i; + assert(width == 8); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 2) { + __m128i x0 = xx_loadl_64(a + 0 * a_stride); + __m128i x1 = xx_loadl_64(a + 1 * a_stride); + + __m128i x = _mm_unpacklo_epi64(x0, x1); + + x0 = xx_loadl_64(b + 0 * b_stride); + x1 = xx_loadl_64(b + 1 * b_stride); + + __m128i y = _mm_unpacklo_epi64(x0, x1); + + __m128i sad8x2 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad8x2); + + a += 2 * a_stride; + b += 2 * b_stride; + } + + const unsigned int res = + (unsigned int)(_mm_cvtsi128_si32(sad) + + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8))); + + return res; +} + +static unsigned int sad16xh_sse2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int width, + int height) { + int i; + assert(width == 16); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + __m128i x = xx_loadu_128(a); + __m128i y = xx_loadu_128(b); + + __m128i sad16x1 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad16x1); + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + (unsigned int)(_mm_cvtsi128_si32(sad) + + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8))); + + return res; +} + +static unsigned int sad32xh_sse2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int width, + int height) { + int i, j; + assert(width == 32); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 2; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad32_half = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad32_half); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + (unsigned int)(_mm_cvtsi128_si32(sad) + + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8))); + + return res; +} + +static unsigned int sad64xh_sse2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int width, + int height) { + int i, j; + assert(width == 64); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 4; ++j) { 
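+ // Each of the four iterations handles one 16-byte chunk of the 64-pixel-wide
+ // row: _mm_sad_epu8 returns the sum of absolute byte differences for each
+ // 64-bit half of the registers, and the per-chunk results are accumulated in
+ // `sad` before the final low/high-lane reduction below the loops.
+ // A rough scalar sketch of one chunk's contribution (illustrative only, not
+ // part of the upstream code):
+ //   for (int k = 0; k < 16; ++k)
+ //     sad += abs(a[j * 16 + k] - b[j * 16 + k]);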
+ __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + (unsigned int)(_mm_cvtsi128_si32(sad) + + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8))); + + return res; +} + +static unsigned int sad128xh_sse2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int width, + int height) { + int i, j; + assert(width == 128); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 8; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + (unsigned int)(_mm_cvtsi128_si32(sad) + + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8))); + + return res; +} + +#define DIST_WTD_SADMXN_SSE2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ + return sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ + } + +DIST_WTD_SADMXN_SSE2(128, 128) +DIST_WTD_SADMXN_SSE2(128, 64) +DIST_WTD_SADMXN_SSE2(64, 128) +DIST_WTD_SADMXN_SSE2(64, 64) +DIST_WTD_SADMXN_SSE2(64, 32) +DIST_WTD_SADMXN_SSE2(32, 64) +DIST_WTD_SADMXN_SSE2(32, 32) +DIST_WTD_SADMXN_SSE2(32, 16) +DIST_WTD_SADMXN_SSE2(16, 32) +DIST_WTD_SADMXN_SSE2(16, 16) +DIST_WTD_SADMXN_SSE2(16, 8) +DIST_WTD_SADMXN_SSE2(8, 16) +DIST_WTD_SADMXN_SSE2(8, 8) +DIST_WTD_SADMXN_SSE2(8, 4) +DIST_WTD_SADMXN_SSE2(4, 8) +DIST_WTD_SADMXN_SSE2(4, 4) +#if !CONFIG_REALTIME_ONLY +DIST_WTD_SADMXN_SSE2(4, 16) +DIST_WTD_SADMXN_SSE2(16, 4) +DIST_WTD_SADMXN_SSE2(8, 32) +DIST_WTD_SADMXN_SSE2(32, 8) +DIST_WTD_SADMXN_SSE2(16, 64) +DIST_WTD_SADMXN_SSE2(64, 16) +#endif diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c new file mode 100644 index 0000000000..dd798ca54a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include // SSE2 +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, + const __m128i *w, const __m128i *r, + void *const result) { + __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); + __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w); + __m128i round_lo = _mm_add_epi16(mult_lo, *r); + __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS); + + __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1); + __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w); + __m128i round_hi = _mm_add_epi16(mult_hi, *r); + __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS); + + xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); +} + +void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i; + const int8_t w0 = (int8_t)jcp_param->fwd_offset; + const int8_t w1 = (int8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = _mm_set1_epi16(round); + + if (width >= 16) { + // Read 16 pixels one row at a time + assert(!(width & 15)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 16) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 16; + } + ref += ref_stride - width; + } + } else if (width >= 8) { + // Read 8 pixels two row at a time + assert(!(width & 7)); + assert(!(width & 1)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 2 * ref_stride; + } + } else { + // Read 4 pixels four row at a time + assert(!(width & 3)); + assert(!(height & 3)); + for (i = 0; i < height; i += 4) { + const int8_t *row0 = (const int8_t *)ref + 0 * ref_stride; + const int8_t *row1 = (const int8_t *)ref + 1 * ref_stride; + const int8_t *row2 = (const int8_t *)ref + 2 * ref_stride; + const int8_t *row3 = (const int8_t *)ref + 3 * ref_stride; + + __m128i p0 = + _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1], + row1[2], row1[3], row2[0], row2[1], row2[2], row2[3], + row3[0], row3[1], row3[2], row3[3]); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 4 * ref_stride; + } + } +} + +#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const 
DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ + jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ + } + +DIST_WTD_SUBPIX_AVG_VAR(128, 128) +DIST_WTD_SUBPIX_AVG_VAR(128, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 128) +DIST_WTD_SUBPIX_AVG_VAR(64, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 64) +DIST_WTD_SUBPIX_AVG_VAR(32, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 32) +DIST_WTD_SUBPIX_AVG_VAR(16, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 16) +DIST_WTD_SUBPIX_AVG_VAR(8, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 8) +DIST_WTD_SUBPIX_AVG_VAR(4, 4) + +#if !CONFIG_REALTIME_ONLY +DIST_WTD_SUBPIX_AVG_VAR(4, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 4) +DIST_WTD_SUBPIX_AVG_VAR(8, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 8) +DIST_WTD_SUBPIX_AVG_VAR(16, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 16) +#endif diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c new file mode 100644 index 0000000000..6e77742e3c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c @@ -0,0 +1,1016 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include /* AVX2 */ + +#include "config/aom_dsp_rtcd.h" + +DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { + 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, + 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 +}; + +void aom_lpf_horizontal_6_quad_avx2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0) { + __m256i p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; + __m128i p2, p1, p0, q0, q1, q2; + __m128i mask, flat; + + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0])); + const __m128i limit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0])); + const __m128i zero = _mm_setzero_si128(); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + + p256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); + p256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); + p256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); + q256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); + q256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); + q256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); + + p2 = _mm256_castsi256_si128(p256_2); + p1 = _mm256_castsi256_si128(p256_1); + p0 = _mm256_castsi256_si128(p256_0); + q0 = _mm256_castsi256_si128(q256_0); + q1 = _mm256_castsi256_si128(q256_1); + q2 = _mm256_castsi256_si128(q256_2); + + { + __m128i work; + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; + + // loop filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i one = _mm_set1_epi8(1); + __m128i hev; + + hev = _mm_subs_epu8(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; 
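+ // The intrinsics that follow implement the usual narrow (4-tap) loop-filter
+ // update in the signed domain: the ps*/qs* values are the pixels XORed with
+ // 0x80, and each result is XORed back to the unsigned domain as it is
+ // produced. A rough scalar sketch of the same arithmetic, where clamp8()
+ // stands for saturation to [-128, 127] and the shifts are arithmetic
+ // (illustrative only, not part of the upstream code):
+ //   filt    = hev ? clamp8(ps1 - qs1) : 0;
+ //   filt    = clamp8(filt + 3 * (qs0 - ps0)) & mask;
+ //   filter1 = clamp8(filt + 4) >> 3;   qs0 -= filter1;
+ //   filter2 = clamp8(filt + 3) >> 3;   ps0 += filter2;
+ //   filt    = hev ? 0 : (filter1 + 1) >> 1;
+ //   ps1 += filt;                       qs1 -= filt;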
+ __m128i filter1, filter2; + __m128i flat_p1, flat_p0, flat_q0, flat_q1; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m256i four = _mm256_set1_epi16(4); + __m256i pixetFilter, add, res; + + const __m256i filter = + _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); + + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = _mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + + pixetFilter = _mm256_slli_epi16( + _mm256_add_epi16(p256_2, _mm256_add_epi16(p256_1, p256_0)), 1); + pixetFilter = + _mm256_add_epi16(pixetFilter, _mm256_add_epi16(p256_2, q256_0)); + pixetFilter = _mm256_add_epi16(four, pixetFilter); + res = _mm256_srli_epi16(pixetFilter, 3); + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168)); + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + add = _mm256_add_epi16(_mm256_sub_epi16(q256_1, p256_2), + _mm256_sub_epi16(q256_0, p256_2)); + pixetFilter = _mm256_add_epi16(pixetFilter, add); + res = _mm256_srli_epi16(pixetFilter, 3); + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168)); + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_2), + _mm256_sub_epi16(q256_1, p256_1)); + pixetFilter = _mm256_add_epi16(pixetFilter, add); + res = _mm256_srli_epi16(pixetFilter, 3); + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168)); + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_1), + _mm256_sub_epi16(q256_2, p256_0)); + pixetFilter = 
_mm256_add_epi16(pixetFilter, add); + res = _mm256_srli_epi16(pixetFilter, 3); + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168)); + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } else { + _mm_storeu_si128((__m128i *)(s - 2 * p), ps1); + _mm_storeu_si128((__m128i *)(s - 1 * p), ps0); + _mm_storeu_si128((__m128i *)(s - 0 * p), qs0); + _mm_storeu_si128((__m128i *)(s + 1 * p), qs1); + } + } +} + +void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0) { + __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i mask, flat; + + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0])); + const __m128i limit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0])); + const __m128i zero = _mm_setzero_si128(); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + + p256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); + p256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); + p256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); + p256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); + q256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); + q256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); + q256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); + q256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); + + p3 = _mm256_castsi256_si128(p256_3); + p2 = _mm256_castsi256_si128(p256_2); + p1 = _mm256_castsi256_si128(p256_1); + p0 = _mm256_castsi256_si128(p256_0); + q0 = _mm256_castsi256_si128(q256_0); + q1 = _mm256_castsi256_si128(q256_1); + q2 = _mm256_castsi256_si128(q256_2); + q3 = _mm256_castsi256_si128(q256_3); + + { + __m128i work; + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + 
_mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; + + // loop filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i one = _mm_set1_epi8(1); + __m128i hev; + + hev = _mm_subs_epu8(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m256i four = _mm256_set1_epi16(4); + __m256i pixetFilter_p2p1p0, p2p1p0, q2q1q0, pixetFilter_q2q1q0, sum_p, + sum_q, res_p, res_q; + + const __m256i filter = + _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); + + p256_3 = _mm256_shuffle_epi8(p256_3, filter); + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = _mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + q256_3 = _mm256_shuffle_epi8(q256_3, filter); + + p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); + q2q1q0 = 
_mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); + + pixetFilter_p2p1p0 = + _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0)); + pixetFilter_q2q1q0 = pixetFilter_p2p1p0; + + pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3); + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3); + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + sum_p = _mm256_sub_epi16(p256_3, q256_2); + pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3); + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + sum_q = _mm256_sub_epi16(q256_3, p256_2); + pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3); + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + sum_p = _mm256_sub_epi16(p256_3, q256_1); + pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3); + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + sum_q = _mm256_sub_epi16(q256_3, p256_1); + pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3); + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } else { + _mm_storeu_si128((__m128i *)(s - 2 * p), ps1); + _mm_storeu_si128((__m128i *)(s - 1 * p), ps0); + _mm_storeu_si128((__m128i *)(s - 0 * p), qs0); + _mm_storeu_si128((__m128i *)(s + 1 * p), qs1); + } + } +} + +static INLINE void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p, + unsigned char *out, int out_p, + int is_store_avx2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)in0); + const __m128i x1 = _mm_loadu_si128((__m128i *)(in0 + in_p * 1)); + const __m128i x2 = _mm_loadu_si128((__m128i *)(in0 + in_p * 2)); + const __m128i x3 = _mm_loadu_si128((__m128i *)(in0 + in_p * 3)); + const __m128i x4 = _mm_loadu_si128((__m128i *)(in0 + in_p * 4)); + const __m128i x5 = 
_mm_loadu_si128((__m128i *)(in0 + in_p * 5)); + const __m128i x6 = _mm_loadu_si128((__m128i *)(in0 + in_p * 6)); + const __m128i x7 = _mm_loadu_si128((__m128i *)(in0 + in_p * 7)); + + const __m256i y0 = _mm256_insertf128_si256( + _mm256_castsi128_si256(x0), _mm_loadu_si128((__m128i *)(in0 + in_p * 8)), + 0x1); + const __m256i y1 = _mm256_insertf128_si256( + _mm256_castsi128_si256(x1), _mm_loadu_si128((__m128i *)(in0 + in_p * 9)), + 0x1); + const __m256i y2 = _mm256_insertf128_si256( + _mm256_castsi128_si256(x2), _mm_loadu_si128((__m128i *)(in0 + in_p * 10)), + 0x1); + const __m256i y3 = _mm256_insertf128_si256( + _mm256_castsi128_si256(x3), _mm_loadu_si128((__m128i *)(in0 + in_p * 11)), + 0x1); + const __m256i y4 = _mm256_insertf128_si256( + _mm256_castsi128_si256(x4), _mm_loadu_si128((__m128i *)(in0 + in_p * 12)), + 0x1); + const __m256i y5 = _mm256_insertf128_si256( + _mm256_castsi128_si256(x5), _mm_loadu_si128((__m128i *)(in0 + in_p * 13)), + 0x1); + const __m256i y6 = _mm256_insertf128_si256( + _mm256_castsi128_si256(x6), _mm_loadu_si128((__m128i *)(in0 + in_p * 14)), + 0x1); + const __m256i y7 = _mm256_insertf128_si256( + _mm256_castsi128_si256(x7), _mm_loadu_si128((__m128i *)(in0 + in_p * 15)), + 0x1); + + const __m256i y_s00 = _mm256_unpacklo_epi8(y0, y1); + const __m256i y_s01 = _mm256_unpackhi_epi8(y0, y1); + const __m256i y_s02 = _mm256_unpacklo_epi8(y2, y3); + const __m256i y_s03 = _mm256_unpackhi_epi8(y2, y3); + const __m256i y_s04 = _mm256_unpacklo_epi8(y4, y5); + const __m256i y_s05 = _mm256_unpackhi_epi8(y4, y5); + const __m256i y_s06 = _mm256_unpacklo_epi8(y6, y7); + const __m256i y_s07 = _mm256_unpackhi_epi8(y6, y7); + + const __m256i y_s10 = _mm256_unpacklo_epi16(y_s00, y_s02); + const __m256i y_s11 = _mm256_unpackhi_epi16(y_s00, y_s02); + const __m256i y_s12 = _mm256_unpacklo_epi16(y_s01, y_s03); + const __m256i y_s13 = _mm256_unpackhi_epi16(y_s01, y_s03); + const __m256i y_s14 = _mm256_unpacklo_epi16(y_s04, y_s06); + const __m256i y_s15 = _mm256_unpackhi_epi16(y_s04, y_s06); + const __m256i y_s16 = _mm256_unpacklo_epi16(y_s05, y_s07); + const __m256i y_s17 = _mm256_unpackhi_epi16(y_s05, y_s07); + + const __m256i y_s20 = _mm256_unpacklo_epi32(y_s10, y_s14); + const __m256i y_s21 = _mm256_unpackhi_epi32(y_s10, y_s14); + const __m256i y_s22 = _mm256_unpacklo_epi32(y_s11, y_s15); + const __m256i y_s23 = _mm256_unpackhi_epi32(y_s11, y_s15); + const __m256i y_s24 = _mm256_unpacklo_epi32(y_s12, y_s16); + const __m256i y_s25 = _mm256_unpackhi_epi32(y_s12, y_s16); + const __m256i y_s26 = _mm256_unpacklo_epi32(y_s13, y_s17); + const __m256i y_s27 = _mm256_unpackhi_epi32(y_s13, y_s17); + + const __m256i row_s01 = _mm256_permute4x64_epi64(y_s20, 0xd8); + const __m256i row_s23 = _mm256_permute4x64_epi64(y_s21, 0xd8); + const __m256i row_s45 = _mm256_permute4x64_epi64(y_s22, 0xd8); + const __m256i row_s67 = _mm256_permute4x64_epi64(y_s23, 0xd8); + const __m256i row_s89 = _mm256_permute4x64_epi64(y_s24, 0xd8); + const __m256i row_s1011 = _mm256_permute4x64_epi64(y_s25, 0xd8); + const __m256i row_s1213 = _mm256_permute4x64_epi64(y_s26, 0xd8); + const __m256i row_s1415 = _mm256_permute4x64_epi64(y_s27, 0xd8); + + if (is_store_avx2) { + _mm256_storeu_si256((__m256i *)(out), row_s01); + _mm256_storeu_si256((__m256i *)(out + (2 * out_p)), row_s23); + _mm256_storeu_si256((__m256i *)(out + (4 * out_p)), row_s45); + _mm256_storeu_si256((__m256i *)(out + (6 * out_p)), row_s67); + _mm256_storeu_si256((__m256i *)(out + (8 * out_p)), row_s89); + _mm256_storeu_si256((__m256i *)(out + (10 * 
out_p)), row_s1011); + _mm256_storeu_si256((__m256i *)(out + (12 * out_p)), row_s1213); + _mm256_storeu_si256((__m256i *)(out + (14 * out_p)), row_s1415); + } else { + _mm_storeu_si128((__m128i *)(out), _mm256_castsi256_si128(row_s01)); + _mm_storeu_si128((__m128i *)(out + (2 * out_p)), + _mm256_castsi256_si128(row_s23)); + _mm_storeu_si128((__m128i *)(out + (4 * out_p)), + _mm256_castsi256_si128(row_s45)); + _mm_storeu_si128((__m128i *)(out + (6 * out_p)), + _mm256_castsi256_si128(row_s67)); + _mm_storeu_si128((__m128i *)(out + (8 * out_p)), + _mm256_castsi256_si128(row_s89)); + _mm_storeu_si128((__m128i *)(out + (10 * out_p)), + _mm256_castsi256_si128(row_s1011)); + _mm_storeu_si128((__m128i *)(out + (12 * out_p)), + _mm256_castsi256_si128(row_s1213)); + _mm_storeu_si128((__m128i *)(out + (14 * out_p)), + _mm256_castsi256_si128(row_s1415)); + _mm_storeu_si128((__m128i *)(out + (1 * out_p)), + _mm256_extracti128_si256(row_s01, 1)); + _mm_storeu_si128((__m128i *)(out + (3 * out_p)), + _mm256_extracti128_si256(row_s23, 1)); + _mm_storeu_si128((__m128i *)(out + (5 * out_p)), + _mm256_extracti128_si256(row_s45, 1)); + _mm_storeu_si128((__m128i *)(out + (7 * out_p)), + _mm256_extracti128_si256(row_s67, 1)); + _mm_storeu_si128((__m128i *)(out + (9 * out_p)), + _mm256_extracti128_si256(row_s89, 1)); + _mm_storeu_si128((__m128i *)(out + (11 * out_p)), + _mm256_extracti128_si256(row_s1011, 1)); + _mm_storeu_si128((__m128i *)(out + (13 * out_p)), + _mm256_extracti128_si256(row_s1213, 1)); + _mm_storeu_si128((__m128i *)(out + (15 * out_p)), + _mm256_extracti128_si256(row_s1415, 1)); + } +} + +void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0) { + __m128i mask, flat; + const __m128i zero = _mm_setzero_si128(); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + + __m256i p256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); + __m256i p256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); + __m256i p256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); + __m256i p256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); + __m256i q256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); + __m256i q256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); + __m256i q256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); + __m256i q256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); + + __m128i p3 = _mm256_castsi256_si128(p256_3); + __m128i p2 = _mm256_castsi256_si128(p256_2); + __m128i p1 = _mm256_castsi256_si128(p256_1); + __m128i p0 = _mm256_castsi256_si128(p256_0); + __m128i q0 = _mm256_castsi256_si128(q256_0); + __m128i q1 = _mm256_castsi256_si128(q256_1); + __m128i q2 = _mm256_castsi256_si128(q256_2); + __m128i q3 = _mm256_castsi256_si128(q256_3); + + { + const __m128i limit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0])); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, 
q1), _mm_subs_epu8(q1, p1)); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + __m128i work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; + + // loop filter + { + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0])); + const __m128i one = _mm_set1_epi8(1); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t4 = _mm_add_epi8(one, t3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t7f = _mm_sub_epi8(t80, one); + + __m128i hev = _mm_subs_epu8(flat, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + + __m128i filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + __m128i work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_and_si128(filt, mask); + + __m128i filter1 = _mm_adds_epi8(filt, t4); + __m128i filter2 = _mm_adds_epi8(filt, t3); + + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + filt = _mm_adds_epi8(filter1, one); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + + // Derive flat + __m256i p0q0256 = _mm256_blend_epi32(p256_0, q256_0, 0xf0); + __m256i p2q2256 = _mm256_blend_epi32(p256_2, q256_2, 0xf0); + __m256i p3q3256 = _mm256_blend_epi32(p256_3, q256_3, 0xf0); + const __m256i ps0qs0256 = + _mm256_insertf128_si256(_mm256_castsi128_si256(ps0), qs0, 0x1); + const __m256i ps1qs1256 = + _mm256_insertf128_si256(_mm256_castsi128_si256(ps1), qs1, 0x1); + const __m256i work01 = _mm256_or_si256(_mm256_subs_epu8(p2q2256, p0q0256), + _mm256_subs_epu8(p0q0256, p2q2256)); + const __m256i work02 = _mm256_or_si256(_mm256_subs_epu8(p3q3256, p0q0256), + 
_mm256_subs_epu8(p0q0256, p3q3256)); + const __m256i max0_256 = _mm256_max_epu8(work01, work02); + const __m128i max1_256 = + _mm_max_epu8(_mm256_castsi256_si128(max0_256), + _mm256_extractf128_si256(max0_256, 1)); + flat = _mm_max_epu8(max1_256, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m256i flat256 = + _mm256_insertf128_si256(_mm256_castsi128_si256(flat), flat, 0x1); + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + + __m256i p256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 5 * p))); + __m256i q256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 4 * p))); + __m256i p256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); + __m256i q256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); + __m256i p256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); + __m256i q256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); + + // Derive flat2 + __m256i p4q4256 = _mm256_blend_epi32(p256_4, q256_4, 0xf0); + __m256i p5q5256 = _mm256_blend_epi32(p256_5, q256_5, 0xf0); + const __m256i p6q6256 = _mm256_blend_epi32(p256_6, q256_6, 0xf0); + const __m256i work1 = _mm256_or_si256(_mm256_subs_epu8(p4q4256, p0q0256), + _mm256_subs_epu8(p0q0256, p4q4256)); + const __m256i work2 = _mm256_or_si256(_mm256_subs_epu8(p5q5256, p0q0256), + _mm256_subs_epu8(p0q0256, p5q5256)); + const __m256i work3 = _mm256_or_si256(_mm256_subs_epu8(p6q6256, p0q0256), + _mm256_subs_epu8(p0q0256, p6q6256)); + __m256i flat2_256 = _mm256_max_epu8(work1, work2); + flat2_256 = _mm256_max_epu8(flat2_256, work3); + __m128i flat2 = _mm_max_epu8(_mm256_castsi256_si128(flat2_256), + _mm256_extractf128_si256(flat2_256, 1)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + + const __m256i filter = + _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); + + p256_3 = _mm256_shuffle_epi8(p256_3, filter); + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = _mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + q256_3 = _mm256_shuffle_epi8(q256_3, filter); + + const __m256i p2p1p0 = + _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); + const __m256i q2q1q0 = + _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); + + __m256i pixetFilter_p2p1p0 = + _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0)); + __m256i pixetFilter_q2q1q0 = pixetFilter_p2p1p0; + + // Derive p0 and q0 + pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3); + __m256i res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3); + pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3); + __m256i res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3); + __m256i flat_p0q0 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); + p0q0256 = _mm256_andnot_si256(flat256, ps0qs0256); + flat_p0q0 = _mm256_and_si256(flat256, flat_p0q0); + p0q0256 = _mm256_or_si256(flat_p0q0, p0q0256); + 
p0 = _mm256_castsi256_si128(p0q0256); + q0 = _mm256_extractf128_si256(p0q0256, 1); + + // Derive p1 and q1 + __m256i sum_p = _mm256_sub_epi16(p256_3, q256_2); + pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3); + __m256i sum_q = _mm256_sub_epi16(q256_3, p256_2); + pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3); + __m256i flat_p1q1 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); + __m256i p1q1256 = _mm256_andnot_si256(flat256, ps1qs1256); + flat_p1q1 = _mm256_and_si256(flat256, flat_p1q1); + p1q1256 = _mm256_or_si256(flat_p1q1, p1q1256); + p1 = _mm256_castsi256_si128(p1q1256); + q1 = _mm256_extractf128_si256(p1q1256, 1); + + // Derive p2 and q2 + sum_p = _mm256_sub_epi16(p256_3, q256_1); + pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p); + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3); + sum_q = _mm256_sub_epi16(q256_3, p256_1); + pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q); + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3); + __m256i flat_p2q2 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); + p2q2256 = _mm256_andnot_si256(flat256, p2q2256); + flat_p2q2 = _mm256_and_si256(flat256, flat_p2q2); + p2q2256 = _mm256_or_si256(flat_p2q2, p2q2256); + p2 = _mm256_castsi256_si128(p2q2256); + q2 = _mm256_extractf128_si256(p2q2256, 1); + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + flat2_256 = + _mm256_insertf128_si256(_mm256_castsi128_si256(flat2), flat2, 0x1); + p256_6 = _mm256_shuffle_epi8(p256_6, filter); + p256_5 = _mm256_shuffle_epi8(p256_5, filter); + p256_4 = _mm256_shuffle_epi8(p256_4, filter); + q256_4 = _mm256_shuffle_epi8(q256_4, filter); + q256_5 = _mm256_shuffle_epi8(q256_5, filter); + q256_6 = _mm256_shuffle_epi8(q256_6, filter); + + __m256i pixelFilter_p = + _mm256_add_epi16(p256_5, _mm256_add_epi16(p256_4, p256_3)); + __m256i pixelFilter_q = + _mm256_add_epi16(q256_5, _mm256_add_epi16(q256_4, q256_3)); + + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p2p1p0); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q2q1q0); + + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p256_0); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q256_0); + pixelFilter_p = _mm256_add_epi16( + eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); + pixelFilter_q = pixelFilter_p; + + // Derive p0 and q0 + pixelFilter_p = + _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_1), pixelFilter_p); + res_p = _mm256_srli_epi16(pixelFilter_p, 4); + pixelFilter_q = + _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_1), pixelFilter_q); + res_q = _mm256_srli_epi16(pixelFilter_q, 4); + __m256i flat2_p0q0 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); + p0q0256 = _mm256_andnot_si256(flat2_256, p0q0256); + flat2_p0q0 = _mm256_and_si256(flat2_256, flat2_p0q0); + p0q0256 = _mm256_or_si256(flat2_p0q0, p0q0256); + + p0 = _mm256_castsi256_si128(p0q0256); + q0 = _mm256_extractf128_si256(p0q0256, 1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + + // Derive p1 and q1 + sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_5), + _mm256_sub_epi16(p256_2, q256_0)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); + res_p = _mm256_srli_epi16(pixelFilter_p, 4); + sum_q = 
_mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_5), + _mm256_sub_epi16(q256_2, p256_0)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q); + res_q = _mm256_srli_epi16(pixelFilter_q, 4); + __m256i flat2_p1q1 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); + p1q1256 = _mm256_andnot_si256(flat2_256, p1q1256); + flat2_p1q1 = _mm256_and_si256(flat2_256, flat2_p1q1); + p1q1256 = _mm256_or_si256(flat2_p1q1, p1q1256); + p1 = _mm256_castsi256_si128(p1q1256); + q1 = _mm256_extractf128_si256(p1q1256, 1); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + + // Derive p2 and q2 + sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_4), + _mm256_sub_epi16(p256_3, p256_0)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); + res_p = _mm256_srli_epi16(pixelFilter_p, 4); + sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_4), + _mm256_sub_epi16(q256_3, q256_0)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q); + res_q = _mm256_srli_epi16(pixelFilter_q, 4); + __m256i flat2_p2q2 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); + p2q2256 = _mm256_andnot_si256(flat2_256, p2q2256); + flat2_p2q2 = _mm256_and_si256(flat2_256, flat2_p2q2); + p2q2256 = _mm256_or_si256(flat2_p2q2, p2q2256); + p2 = _mm256_castsi256_si128(p2q2256); + q2 = _mm256_extractf128_si256(p2q2256, 1); + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + + // Derive p3 and q3 + sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_3), + _mm256_sub_epi16(p256_4, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); + res_p = _mm256_srli_epi16(pixelFilter_p, 4); + sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_3), + _mm256_sub_epi16(q256_4, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q); + res_q = _mm256_srli_epi16(pixelFilter_q, 4); + __m256i flat2_p3q3 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); + p3q3256 = _mm256_andnot_si256(flat2_256, p3q3256); + flat2_p3q3 = _mm256_and_si256(flat2_256, flat2_p3q3); + p3q3256 = _mm256_or_si256(flat2_p3q3, p3q3256); + p3 = _mm256_castsi256_si128(p3q3256); + q3 = _mm256_extractf128_si256(p3q3256, 1); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + + // Derive p4 and q4 + sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_2), + _mm256_sub_epi16(p256_5, p256_2)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); + res_p = _mm256_srli_epi16(pixelFilter_p, 4); + sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_2), + _mm256_sub_epi16(q256_5, q256_2)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q); + res_q = _mm256_srli_epi16(pixelFilter_q, 4); + __m256i flat2_p4q4 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); + p4q4256 = _mm256_andnot_si256(flat2_256, p4q4256); + flat2_p4q4 = _mm256_and_si256(flat2_256, flat2_p4q4); + p4q4256 = _mm256_or_si256(flat2_p4q4, p4q4256); + _mm_storeu_si128((__m128i *)(s - 5 * p), + _mm256_castsi256_si128(p4q4256)); + _mm_storeu_si128((__m128i *)(s + 4 * p), + _mm256_extractf128_si256(p4q4256, 1)); + + // Derive p5 and q5 + sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_1), + _mm256_sub_epi16(p256_6, p256_3)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p); + res_p = _mm256_srli_epi16(pixelFilter_p, 4); + sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_1), + _mm256_sub_epi16(q256_6, q256_3)); + pixelFilter_q = 
_mm256_add_epi16(pixelFilter_q, sum_q); + res_q = _mm256_srli_epi16(pixelFilter_q, 4); + __m256i flat2_p5q5 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8); + p5q5256 = _mm256_andnot_si256(flat2_256, p5q5256); + flat2_p5q5 = _mm256_and_si256(flat2_256, flat2_p5q5); + p5q5256 = _mm256_or_si256(flat2_p5q5, p5q5256); + _mm_storeu_si128((__m128i *)(s - 6 * p), + _mm256_castsi256_si128(p5q5256)); + _mm_storeu_si128((__m128i *)(s + 5 * p), + _mm256_extracti128_si256(p5q5256, 1)); + } else { + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } + } else { + _mm_storeu_si128((__m128i *)(s - 2 * p), ps1); + _mm_storeu_si128((__m128i *)(s - 1 * p), ps0); + _mm_storeu_si128((__m128i *)(s - 0 * p), qs0); + _mm_storeu_si128((__m128i *)(s + 1 * p), qs1); + } + } +} + +void aom_lpf_vertical_14_quad_avx2(unsigned char *s, int pitch, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0) { + DECLARE_ALIGNED(16, unsigned char, t_dst[256]); + + // Transpose 16x16 + trans_store_16x16_lpf_vert14(s - 8, pitch, t_dst, 16, 1); + + // Loop filtering + aom_lpf_horizontal_14_quad_avx2(t_dst + 8 * 16, 16, _blimit0, _limit0, + _thresh0); + + // Transpose back + trans_store_16x16_lpf_vert14(t_dst, 16, s - 8, pitch, 0); +} diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c new file mode 100644 index 0000000000..cdf24c332a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -0,0 +1,2973 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" +#include "aom_dsp/x86/lpf_common_sse2.h" + +static INLINE __m128i abs_diff(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); +} + +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontally.
+// Used for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 + + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx + *q4p4 = _mm_unpacklo_epi32( + _mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx +} + +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them independently while flipping the second matrix horizontaly Used for 14 +// taps filter pq pairs inverse +static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *pq0, __m128i *pq1, + __m128i *pq2, __m128i *pq3) { + __m128i w10, w11, w12, w13; + __m128i w0, w1, w2, w3, w4, w5; + __m128i d0, d1, d2, d3; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w10 = _mm_unpacklo_epi8( + *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13 + w11 = _mm_unpacklo_epi8( + *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33 + w12 = _mm_unpacklo_epi8( + *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 
43 53 + w13 = _mm_unpacklo_epi8( + *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73 + + w4 = _mm_unpackhi_epi16( + w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpackhi_epi16( + w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + *pq0 = _mm_unpacklo_epi64(d0, d1); // pq + *pq1 = _mm_unpackhi_epi64(d0, d1); // pq + *pq2 = _mm_unpacklo_epi64(d2, d3); // pq + *pq3 = _mm_unpackhi_epi64(d2, d3); // pq +} + +static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, __m128i *ps1ps0) { + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i t3t4 = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8((char)0x80); + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi32(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter2filter1 = + _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + filter = _mm_unpacklo_epi32(filter, filter); + + filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter); + hev1 = _mm_srli_si128(filter2filter1, 8); + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0) { + const __m128i t3t4 = + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8((char)0x80); + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = 
_mm_and_si128(_mm_srli_si128(work, 8), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi64(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + + hev1 = _mm_unpackhi_epi64(filter2filter1, filter); + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); + + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, flat, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + /* (abs(q1 - q0), abs(p1 - p0) */ + flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */ + abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi32(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4)); + + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 
= _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + /* (abs(q1 - q0), abs(p1 - p0) */ + __m128i flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + + /* const int8_t mask = filter_mask2(*limit, *blimit, */ + /* p1, p0, q0, q1); */ + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi64(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); + + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + + __m128i qs1qs0, ps1ps0; + __m128i p1, p0, q0, q1; + + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); + + xx_storel_32(s - 1 * p, ps1ps0); + xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4)); + xx_storel_32(s + 0 * p, qs1qs0); + xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4)); +} + +void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + __m128i p1p0, q1q0; + __m128i p1, p0, q0, q1; + + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + + __m128i x0, x1, x2, x3; + __m128i d0, d1, d2, d3; + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + + transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); + + // Transpose 8x4 to 4x8 + p1 = _mm_srli_si128(p1p0, 4); + q1 = _mm_srli_si128(q1q0, 4); + + transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} + +static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { + xx_storel_32(s - (num + 1) * p, x); + xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); +} + +static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, 
__m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; + + p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1); + q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + __m128i fe, ff, work; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8((char)0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); + qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); + // loopfilter done + + __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + __m128i work; + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p6, sum_q6; + __m128i sum_p3, sum_q3, res_p, res_q; + + p6_16 = _mm_unpacklo_epi8(*q6p6, zero); + p5_16 = _mm_unpacklo_epi8(*q5p5, zero); + p4_16 = _mm_unpacklo_epi8(*q4p4, zero); + p3_16 = _mm_unpacklo_epi8(*q3p3, zero); + p2_16 = _mm_unpacklo_epi8(*q2p2, zero); + p1_16 = _mm_unpacklo_epi8(*q1p1, zero); + p0_16 = _mm_unpacklo_epi8(*q0p0, zero); + q0_16 = _mm_unpackhi_epi8(*q0p0, zero); + q1_16 = _mm_unpackhi_epi8(*q1p1, zero); + q2_16 = _mm_unpackhi_epi8(*q2p2, zero); + q3_16 = _mm_unpackhi_epi8(*q3p3, zero); + q4_16 = _mm_unpackhi_epi8(*q4p4, zero); + q5_16 = _mm_unpackhi_epi8(*q5p5, zero); + q6_16 = _mm_unpackhi_epi8(*q6p6, zero); + pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, 
p3_16)); + pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(p6_16, p0_16), + _mm_add_epi16(p1_16, q0_16))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(q6_16, q0_16), + _mm_add_epi16(p0_16, q1_16))), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(p6_16, p6_16); + sum_q6 = _mm_add_epi16(q6_16, q6_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + // work with flat2 + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat = _mm_unpacklo_epi64(flat, flat); + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, 
q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; + } +} + +static AOM_FORCE_INLINE void lpf_internal_14_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i flat2_pq[6], flat_pq[3]; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; + + p1p0 = 
_mm_unpacklo_epi32(*q0p0, *q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + __m128i fe, ff, work; + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + fe = _mm_set1_epi8((char)0xfe); + ff = _mm_cmpeq_epi8(fe, fe); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0); + qs1ps1 = _mm_srli_si128(qs0ps0, 8); + // loopfilter done + + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pq_16[7]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p6; + __m128i sum_p3; + + pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero); + pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero); + pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero); + pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero); + pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero); + pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero); + pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero); + q0_16 = _mm_srli_si128(pq_16[0], 8); + q1_16 = _mm_srli_si128(pq_16[1], 8); + q2_16 = _mm_srli_si128(pq_16[2], 8); + q3_16 = _mm_srli_si128(pq_16[3], 8); + q4_16 = _mm_srli_si128(pq_16[4], 8); + q5_16 = _mm_srli_si128(pq_16[5], 8); + + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3])); + __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0])); + flat_q[0] = _mm_add_epi16(sum_lp, 
_mm_add_epi16(q3_16, q0_16)); + + sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]); + sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]); + sum_p = _mm_sub_epi16(sum_p_0, q5_16); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]); + work0_1 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]); + sum_lp = _mm_sub_epi16(sum_lp, q2_16); + + work0 = _mm_add_epi16(sum_p3, pq_16[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]); + flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]); + + sum_lp = _mm_sub_epi16(sum_lp, q1_16); + sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]); + work0 = _mm_add_epi16(sum_p3, pq_16[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + flat2 = _mm_unpacklo_epi32(flat2, flat2); + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_pq[0] = _mm_and_si128(flat, flat_pq[0]); + *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_pq[1] = _mm_and_si128(flat, flat_pq[1]); + *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]); + + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_pq[2] = _mm_and_si128(flat, flat_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16)); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]); + flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]); + + sum_p = _mm_sub_epi16(sum_p, q4_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q3_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[3]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[3], 
_mm_add_epi16(pq_16[4], pq_16[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q2_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[2]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q1_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[1]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]); + *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]); + *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]); + *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]); + *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]); + *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; + } +} + +void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p)); + q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p)); + q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p)); + q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p)); + + q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p)); + + q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p)); + + q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p)); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + store_buffer_horz_8(q0p0, p, 0, s); + store_buffer_horz_8(q1p1, p, 1, s); + store_buffer_horz_8(q2p2, p, 2, s); + store_buffer_horz_8(q3p3, p, 3, s); + store_buffer_horz_8(q4p4, p, 4, s); + store_buffer_horz_8(q5p5, p, 5, s); +} + +static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( + __m128i *p2, __m128i *q2, 
__m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16), + _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + p2_16); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1 + workp_a = 
_mm_add_epi16(workp_a, + workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_shft1 = _mm_srli_epi16(workp_a, 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16), + p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_add_epi16(q1_16, q2_16); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16), + p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_6_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi32(*p0, *p1); + *q1q0 = _mm_unpacklo_epi32(*q0, *q1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + pq2_16 = _mm_unpacklo_epi8(q2p2, zero); + pq1_16 = _mm_unpacklo_epi8(q1p1, zero); + pq0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_srli_si128(pq0_16, 8); + q2_16 = _mm_srli_si128(pq2_16, 8); + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + workp_b = _mm_srli_epi16(workp_b, 3); + + flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16), + pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16), + pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + workp_a = _mm_srli_epi16(workp_a, 3); + + flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + 
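// Merge: keep the filter4 result where flat == 0, take the 5-tap result where flat == 1. +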
*p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } +} + +void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); + + lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); +} + +void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +static AOM_FORCE_INLINE void lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi32(*p3, *q3); + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0 + q1q0 = _mm_srli_si128(p1p0, 8); + + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_add_epi16(workp_a, workp_b); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_c = _mm_add_epi16(workp_a, workp_b); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_d, workp_c); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + // workp_shft0 = 
_mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_c, workp_d); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); + workp_c = _mm_srli_epi16(workp_c, 3); + + opq2 = _mm_packus_epi16(workp_c, workp_c); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 4); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi64(*p3, *q3); + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + { + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + } + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + + __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), 
q1_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 8); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + p3 = xx_loadl_32(s - 4 * p); + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); + q3 = xx_loadl_32(s + 3 * p); + + lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); + xx_storel_32(s - 3 * p, p2); + xx_storel_32(s + 2 * p, q2); +} + +void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)), + _mm_loadl_epi64((__m128i *)(s + 4 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + + q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)), + _mm_loadl_epi64((__m128i *)(s + 5 * p))); + + q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), + _mm_loadl_epi64((__m128i *)(s + 6 * p))); + + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); + 
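// Store the remaining filtered row pairs (q1p1 .. q5p5) back to the rows around the edge. +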
_mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8)); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8)); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8)); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8)); +} + +void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +} + +void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i p1, p0, q0, q1; + __m128i qs1qs0, ps1ps0; + + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + + __m128i l = _mm_unpacklo_epi64(blimit, limit); + + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); + + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); + + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); + + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8)); +} + +void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, 
const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i p0, q0, q1, p1; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i qs1qs0, ps1ps0; + + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + + __m128i l = _mm_unpacklo_epi64(blimit, limit); + + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); + + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); + + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); + + x0 = _mm_loadl_epi64((__m128i *)((s - 2))); + x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p)); + + transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, + &q1); + + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + + p1 = _mm_srli_si128(ps1ps0, 8); + q1 = _mm_srli_si128(qs1qs0, 8); + + transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4, + &d5, &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} + +void aom_lpf_vertical_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x2, x1, x0, x3; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + + lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); + + transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} + +void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = 
_mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} + +void aom_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + __m128i p0, q0; + __m128i x2, x1, x0, x3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + // Loop filtering + lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); + + transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, + &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3); +} + +void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d1, d3, d5, d7; + __m128i q1q0, p1p0; + __m128i p1, q1; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p)); + x2 = 
_mm_loadl_epi64((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, + &q1q0, &p1p0, &blimit, &limit, &thresh); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1, + &d2d3, &d4d5, &d6d7); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5); + _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7); + _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8)); +} + +void aom_lpf_vertical_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x6, x5, x4, x3; + __m128i pq0, pq1, pq2, pq3; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); + + transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4, + &q5p5, &q6p6, &q7p7); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &pq0, &pq1, &pq2, &pq3); + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3); +} + +void aom_lpf_vertical_14_dual_sse2( + unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x7, x6, x5, x4, x3, x2, x1, x0; + __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15; + __m128i q0, q1, q2, q3, q7; + __m128i p0p1, p2p3, p4p5, p6p7; + + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * 
p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p)); + + transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3, + &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15); + + q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8)); + q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8)); + q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8)); + q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8)); + q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8)); + q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8)); + q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); + q7 = _mm_srli_si128(d14d15, 8); + + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); + + x0 = _mm_srli_si128(q0p0, 8); + x1 = _mm_srli_si128(q1p1, 8); + x2 = _mm_srli_si128(q2p2, 8); + x3 = _mm_srli_si128(q3p3, 8); + x4 = _mm_srli_si128(q4p4, 8); + x5 = _mm_srli_si128(q5p5, 8); + x6 = _mm_srli_si128(q6p6, 8); + + transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1, + &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3); +} + +static INLINE __m128i filter_add2_sub2(const __m128i *const total, + const __m128i *const a1, + const __m128i *const a2, + const __m128i *const s1, + const __m128i *const s2) { + __m128i x = _mm_add_epi16(*a1, *total); + x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); + return x; +} + +static INLINE __m128i filter8_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f8_lo, + const __m128i *const f8_hi) { + const __m128i f8 = + _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3)); + const __m128i result = _mm_and_si128(*flat, f8); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +static INLINE __m128i filter16_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f_lo, + const __m128i *const f_hi) { + const __m128i f = + _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4)); + const __m128i result = _mm_and_si128(*flat, f); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +void aom_lpf_horizontal_14_quad_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0); + const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0); + const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0); + __m128i mask, hev, flat, flat2; + __m128i p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q6, q5; + + __m128i op2, op1, op0, oq0, oq1, oq2; + + __m128i max_abs_p1p0q1q0; + + p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); + p5 = _mm_loadu_si128((__m128i 
*)(s - 6 * p)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); + + { + const __m128i abs_p1p0 = abs_diff(p1, p0); + const __m128i abs_q1q0 = abs_diff(q1, q0); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i abs_p0q0 = abs_diff(p0, q0); + __m128i abs_p1q1 = abs_diff(p1, q1); + __m128i work; + max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; + + { + __m128i work; + work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); + flat = _mm_max_epu8(work, max_abs_p1p0q1q0); + work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i ff = _mm_cmpeq_epi8(t4, t4); + + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + op1 = _mm_xor_si128(p1, t80); + op0 = _mm_xor_si128(p0, t80); + oq0 = _mm_xor_si128(q0, t80); + oq1 = _mm_xor_si128(q1, t80); + + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); + + work_a = _mm_subs_epi8(oq0, op0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_and_si128(filt, mask); + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = 
_mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); + + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); + + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); + oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter8 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + __m128i f8_lo, f8_hi; + + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), + _mm_add_epi16(p3_lo, p2_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); + + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), + _mm_add_epi16(p3_hi, p2_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); + + op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); + op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); + op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); + oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); + oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); + oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // wide flat calculations + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + const __m128i eight = _mm_set1_epi16(8); + const 
__m128i p6_lo = _mm_unpacklo_epi8(p6, zero); + const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); + const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); + const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); + const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); + const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); + + const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); + const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); + const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); + const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); + const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); + const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); + + __m128i f_lo; + __m128i f_hi; + + f_lo = _mm_sub_epi16(_mm_slli_epi16(p6_lo, 3), p6_lo); + f_lo = _mm_add_epi16(_mm_slli_epi16(p5_lo, 1), f_lo); + f_lo = _mm_add_epi16(_mm_slli_epi16(p4_lo, 1), f_lo); + f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); + f_lo = _mm_add_epi16(f_lo, eight); + + f_hi = _mm_sub_epi16(_mm_slli_epi16(p6_hi, 3), p6_hi); + f_hi = _mm_add_epi16(_mm_slli_epi16(p5_hi, 1), f_hi); + f_hi = _mm_add_epi16(_mm_slli_epi16(p4_hi, 1), f_hi); + f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); + f_hi = _mm_add_epi16(f_hi, eight); + + p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + + f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p3_lo, &p6_lo, &p6_lo); + f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p3_hi, &p6_hi, &p6_hi); + p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + + f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p2_lo, &p6_lo, &p5_lo); + f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p2_hi, &p6_hi, &p5_hi); + p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + + f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p1_lo, &p6_lo, &p4_lo); + f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p1_hi, &p6_hi, &p4_hi); + op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + + f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p0_lo, &p6_lo, &p3_lo); + f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p0_hi, &p6_hi, &p3_hi); + op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + + f_lo = filter_add2_sub2(&f_lo, &q5_lo, &q0_lo, &p6_lo, &p2_lo); + f_hi = filter_add2_sub2(&f_hi, &q5_hi, &q0_hi, &p6_hi, &p2_hi); + op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q1_lo, &p6_lo, &p1_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q1_hi, &p6_hi, &p1_hi); + oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q2_lo, &p5_lo, &p0_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q2_hi, &p5_hi, &p0_hi); + oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q3_lo, &p4_lo, &q0_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q3_hi, &p4_hi, &q0_hi); + oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q4_lo, &p3_lo, &q1_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q4_hi, &p3_hi, &q1_hi); + q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 3 * p), 
q3); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q5_lo, &p2_lo, &q2_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q5_hi, &p2_hi, &q2_hi); + q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q6_lo, &p1_lo, &q3_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q6_hi, &p1_hi, &q3_hi); + q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + } else { + _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + } + } else { + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + } + } +} + +void aom_lpf_horizontal_8_quad_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0); + const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0); + const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0); + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + + __m128i op2, op1, op0, oq0, oq1, oq2; + + __m128i max_abs_p1p0q1q0; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + + { + const __m128i abs_p1p0 = abs_diff(p1, p0); + const __m128i abs_q1q0 = abs_diff(q1, q0); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i abs_p0q0 = abs_diff(p0, q0); + __m128i abs_p1q1 = abs_diff(p1, q1); + __m128i work; + max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; + + { + __m128i work; + work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); + flat = _mm_max_epu8(work, max_abs_p1p0q1q0); + work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const 
__m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i ff = _mm_cmpeq_epi8(t4, t4); + + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + op1 = _mm_xor_si128(p1, t80); + op0 = _mm_xor_si128(p0, t80); + oq0 = _mm_xor_si128(q0, t80); + oq1 = _mm_xor_si128(q1, t80); + + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); + + work_a = _mm_subs_epi8(oq0, op0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_and_si128(filt, mask); + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); + + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); + + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); + oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter8 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + __m128i f8_lo, f8_hi; + + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), + _mm_add_epi16(p3_lo, p2_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); + + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), + _mm_add_epi16(p3_hi, p2_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); + + op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + + f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, 
&p1_hi, &p2_hi, &p3_hi); + op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + + f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); + op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); + oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); + oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); + oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + } else { + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + } + } +} + +void aom_lpf_horizontal_6_quad_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0); + const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0); + const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0); + __m128i mask, hev, flat; + __m128i p2, p1, p0, q0, q1, q2; + + __m128i op1, op0, oq0, oq1; + + __m128i max_abs_p1p0q1q0; + + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + + { + const __m128i abs_p1p0 = abs_diff(p1, p0); + const __m128i abs_q1q0 = abs_diff(q1, q0); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i abs_p0q0 = abs_diff(p0, q0); + __m128i abs_p1q1 = abs_diff(p1, q1); + __m128i work; + max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(q2, q1)); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; + + { + __m128i work; + work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); + flat = _mm_max_epu8(work, max_abs_p1p0q1q0); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const 
__m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i ff = _mm_cmpeq_epi8(t4, t4); + + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + op1 = _mm_xor_si128(p1, t80); + op0 = _mm_xor_si128(p0, t80); + oq0 = _mm_xor_si128(q0, t80); + oq1 = _mm_xor_si128(q1, t80); + + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); + + work_a = _mm_subs_epi8(oq0, op0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_and_si128(filt, mask); + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); + + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); + + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); + oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter6 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + __m128i f8_lo, f8_hi; + + f8_lo = _mm_add_epi16(_mm_add_epi16(p2_lo, four), + _mm_add_epi16(p2_lo, p2_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p1_lo, f8_lo), + _mm_add_epi16(p1_lo, p0_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); + + f8_hi = _mm_add_epi16(_mm_add_epi16(p2_hi, four), + _mm_add_epi16(p2_hi, p2_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p1_hi, f8_hi), + _mm_add_epi16(p1_hi, p0_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); + + op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + + f8_lo = filter_add2_sub2(&f8_lo, &q0_lo, &q1_lo, &p2_lo, &p2_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q0_hi, &q1_hi, &p2_hi, &p2_hi); + op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + + f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &q2_lo, &p1_lo, &p2_lo); + f8_hi = 
filter_add2_sub2(&f8_hi, &q1_hi, &q2_hi, &p1_hi, &p2_hi); + oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + + f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &q2_lo, &p0_lo, &p1_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &q2_hi, &p0_hi, &p1_hi); + oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + } else { + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + } + } +} + +void aom_lpf_horizontal_4_quad_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0) { + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0); + const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0); + const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0); + __m128i mask, hev; + __m128i p1, p0, q0, q1; + + __m128i op1, op0, oq0, oq1; + + __m128i max_abs_p1p0q1q0; + + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + + { + const __m128i abs_p1p0 = abs_diff(p1, p0); + const __m128i abs_q1q0 = abs_diff(q1, q0); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i abs_p0q0 = abs_diff(p0, q0); + __m128i abs_p1q1 = abs_diff(p1, q1); + max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + mask = _mm_subs_epu8(mask, limit_v); + mask = _mm_cmpeq_epi8(mask, zero); + } + + if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return; + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i ff = _mm_cmpeq_epi8(t4, t4); + + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + op1 = _mm_xor_si128(p1, t80); + op0 = _mm_xor_si128(p0, t80); + oq0 = _mm_xor_si128(q0, t80); + oq1 = _mm_xor_si128(q1, t80); + + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); + + work_a = _mm_subs_epi8(oq0, op0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_and_si128(filt, mask); + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); 
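+ // The signed >> 3 applied to filter1 above and filter2 below has to be + // emulated because SSE2 has no 8-bit arithmetic shift: _mm_srli_epi16 + // shifts 16-bit lanes logically, the 0x1f mask drops bits shifted in from + // the neighbouring byte, and the 0xe0 mask restores the top three (sign) + // bits of bytes that were negative.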
+ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); + + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); + oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + } +} + +void aom_lpf_vertical_14_quad_sse2(unsigned char *s, int pitch, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0) { + DECLARE_ALIGNED(16, unsigned char, t_dst[256]); + + // Transpose 16x16 + transpose_16x8(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + transpose_16x8(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); + + // Loop filtering + aom_lpf_horizontal_14_quad(t_dst + 8 * 16, 16, _blimit0, _limit0, _thresh0); + + // Transpose back + transpose_16x8(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + transpose_16x8(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch); +} + +void aom_lpf_vertical_8_quad_sse2(uint8_t *s, int pitch, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + + // Transpose 16x8 + transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); + + // Loop filtering + aom_lpf_horizontal_8_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0); + + // Transpose back + transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch); +} + +void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + + // Transpose 16x8:: (wxh) 8x16 to 16x8 + transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); + + // Loop filtering + aom_lpf_horizontal_6_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0); + + // Transpose back:: (wxh) 16x8 to 8x16 + transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch); +} + +void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + + // Transpose 16x8 + transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); + + // Loop filtering + aom_lpf_horizontal_4_quad_sse2(t_dst + 4 * 16, 16, _blimit0, _limit0, + _thresh0); + + // Transpose back + transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch); +} diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h new file mode 100644 index 0000000000..45464e80b1 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" + +#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) +#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) + +static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5) { + __m128i w0, w1, w2, w3, w4, w5, ww0; + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51 + *d1 = _mm_unpackhi_epi64(ww0, + _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + *d2 = _mm_unpacklo_epi64(ww0, + _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx + + w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx + w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx + w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx + + *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53 + + ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35 + *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55 + *d5 = _mm_unpackhi_epi64(ww0, + _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx +} + +static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i zero = _mm_setzero_si128(); + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + + *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx + *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx + *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx + *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx +} + +static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, ww2, ww3; + __m128i zero = _mm_setzero_si128(); + + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + + ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + + *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx + *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx + *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx + *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx +} + +// here in and out pointers (x and d) should be different!
we don't store their +// values inside +static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // output + // 00 10 20 30 xx xx xx xx + // 01 11 21 31 xx xx xx xx + // 02 12 22 32 xx xx xx xx + // 03 13 23 33 xx xx xx xx + // 04 14 24 34 xx xx xx xx + // 05 15 25 35 xx xx xx xx + // 06 16 26 36 xx xx xx xx + // 07 17 27 37 xx xx xx xx + highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3); + highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7); +} + +static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57 + w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +// here in and out pointers (x and d) should be different! 
we don't store their +// values inside +static INLINE void highbd_transpose8x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3); + highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7); +} + +// here in and out pointers (x and d arrays) should be different! we don't store +// their values inside +static INLINE void highbd_transpose8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4, + d5, d6, d7); + highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1, + x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1, + d4 + 1, d5 + 1, d6 + 1, d7 + 1); +} + +// Low bit depth functions +static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + *d0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + + *d1 = _mm_srli_si128(*d0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, + __m128i *d5, __m128i *d6, + __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 
16 26 36 07 17 27 37 + + *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, + 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, + 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, + 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0, + __m128i *d1, __m128i *d2, + __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + __m128i w0, w1, w2, w3, w4, w5; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d1 = _mm_srli_si128(*d0, 8); + *d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d3 = _mm_srli_si128(*d2, 8); +} + +static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0d1, + __m128i *d2d3, __m128i *d4d5, + __m128i *d6d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d2d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w6 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + w7 = 
_mm_unpackhi_epi16( + w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + + *d4d5 = _mm_unpacklo_epi32( + w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + *d6d7 = _mm_unpackhi_epi32( + w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose16x8_8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, + __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, + __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpacklo_epi8(*x8, *x9); + w9 = _mm_unpacklo_epi8(*x10, *x11); + w10 = _mm_unpacklo_epi8(*x12, *x13); + w11 = _mm_unpacklo_epi8(*x14, *x15); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0 = _mm_unpacklo_epi64(w6, w14); + *d1 = _mm_unpackhi_epi64(w6, w14); + *d2 = _mm_unpacklo_epi64(w7, w15); + *d3 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d4 = _mm_unpacklo_epi64(w6, w14); + *d5 = _mm_unpackhi_epi64(w6, w14); + *d6 = _mm_unpacklo_epi64(w7, w15); + *d7 = _mm_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose8x16_16x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, + __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, + __m128i *d12d13, __m128i *d14d15) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpackhi_epi8(*x0, *x1); + w9 = _mm_unpackhi_epi8(*x2, *x3); + w10 = _mm_unpackhi_epi8(*x4, *x5); + w11 = _mm_unpackhi_epi8(*x6, *x7); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0d1 = _mm_unpacklo_epi64(w6, w14); + *d2d3 = _mm_unpackhi_epi64(w6, w14); + *d4d5 = _mm_unpacklo_epi64(w7, w15); + *d6d7 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d8d9 = _mm_unpacklo_epi64(w6, w14); + *d10d11 = _mm_unpackhi_epi64(w6, w14); 
+ *d12d13 = _mm_unpacklo_epi64(w7, w15); + *d14d15 = _mm_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose_16x8(unsigned char *in0, unsigned char *in1, + int in_p, unsigned char *out, int out_p) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i x8, x9, x10, x11, x12, x13, x14, x15; + + x0 = _mm_loadl_epi64((__m128i *)in0); + x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); + x0 = _mm_unpacklo_epi8(x0, x1); + + x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); + x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); + x1 = _mm_unpacklo_epi8(x2, x3); + + x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); + x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); + x2 = _mm_unpacklo_epi8(x4, x5); + + x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); + x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); + x3 = _mm_unpacklo_epi8(x6, x7); + x4 = _mm_unpacklo_epi16(x0, x1); + + x8 = _mm_loadl_epi64((__m128i *)in1); + x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); + x8 = _mm_unpacklo_epi8(x8, x9); + x5 = _mm_unpacklo_epi16(x2, x3); + + x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); + x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); + x9 = _mm_unpacklo_epi8(x10, x11); + + x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); + x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); + x10 = _mm_unpacklo_epi8(x12, x13); + x12 = _mm_unpacklo_epi16(x8, x9); + + x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); + x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); + x11 = _mm_unpacklo_epi8(x14, x15); + x13 = _mm_unpacklo_epi16(x10, x11); + + x6 = _mm_unpacklo_epi32(x4, x5); + x7 = _mm_unpackhi_epi32(x4, x5); + x14 = _mm_unpacklo_epi32(x12, x13); + x15 = _mm_unpackhi_epi32(x12, x13); + + // Store first 4-line result + _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); + + x4 = _mm_unpackhi_epi16(x0, x1); + x5 = _mm_unpackhi_epi16(x2, x3); + x12 = _mm_unpackhi_epi16(x8, x9); + x13 = _mm_unpackhi_epi16(x10, x11); + + x6 = _mm_unpacklo_epi32(x4, x5); + x7 = _mm_unpackhi_epi32(x4, x5); + x14 = _mm_unpacklo_epi32(x12, x13); + x15 = _mm_unpackhi_epi32(x12, x13); + + // Store second 4-line result + _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); +} + +static INLINE void transpose_16x8_to_8x16(unsigned char *src, int in_p, + unsigned char *dst, int out_p) { + // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0 + // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1 + // a2 b2 c2 d2 e2 f2 g2 h2 A2 B2 C2 D2 E2 F2 G2 H2 + // a3 b3 c3 d3 e3 f3 g3 h3 A3 B3 C3 D3 E3 F3 G3 H3 + // a4 b4 c4 d4 e4 f4 g4 h4 A4 B4 C4 D4 E4 F4 G4 H4 + // a5 b5 c5 d5 e5 f5 g5 h5 A5 B5 C5 D5 E5 F5 G5 H5 + // a6 b6 c6 d6 e6 f6 g6 h6 A6 B6 C6 D6 E6 F6 G6 H6 + // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7 + const __m128i x0 = _mm_loadu_si128((__m128i *)(src)); + const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p))); + const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p))); + const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p))); + const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p))); + 
const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p))); + const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p))); + const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p))); + + // a0 a1 b0 b1 c0 c1 d0 d1 A0 A1 B0 B1 C0 C1 D0 D1 + // e0 e1 f0 f1 g0 g1 h0 h1 E0 E1 F0 F1 G0 G1 H0 H1 + // a2 a3 b2 b3 c2 c3 d2 d3 A2 A3 B2 B3 C2 C3 D2 D3 + // e2 e3 f2 f3 g2 g3 h2 h3 E2 E3 F2 F3 G2 G3 H2 H3 + // a4 a5 b4 b5 c4 c5 d4 d5 A4 A5 B4 B5 C4 C5 D4 D5 + // e4 e5 f4 f5 g4 g5 h4 h5 E4 E5 F4 F5 G4 G5 H4 H5 + // a6 a7 b6 b7 c6 c7 d6 d7 A6 A7 B6 B7 C6 C7 D6 D7 + // e6 e7 f6 f7 g6 g7 h6 h7 E6 E7 F6 F7 G6 G7 H6 H7 + const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1); + const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1); + const __m128i x_s12 = _mm_unpacklo_epi8(x2, x3); + const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3); + const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5); + const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5); + const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7); + const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7); + + // a0 a1 a2 a3 b0 b1 b2 b3 | A0 A1 A2 A3 B0 B1 B2 B3 + // c0 c1 c2 c3 d0 d1 d2 d3 | C0 C1 C2 C3 D0 D1 D2 D3 + // e0 e1 e2 e3 f0 f1 f2 f3 | E0 E1 E2 E3 F0 F1 F2 F3 + // g0 g1 g2 g3 h0 h1 h2 h3 | G0 G1 G2 G3 H0 H1 H2 H3 + // a4 a5 a6 a7 b4 b5 b6 b7 | A4 A5 A6 A7 B4 B5 B6 B7 + // c4 c5 c6 c7 d4 d5 d6 d7 | C4 C5 C6 C7 D4 D5 D6 D7 + // e4 e5 e6 e7 f4 f5 f6 f7 | E4 E5 E6 E7 F4 F5 F6 F7 + // g4 g5 g6 g7 h4 h5 h6 h7 | G4 G5 G6 G7 H4 H5 H6 H7 + const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12); + const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12); + const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13); + const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13); + const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16); + const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16); + const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17); + const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17); + + // a0 a1 a2 a3 a4 a5 a6 a7 | A0 A1 A2 A3 A4 A5 A6 A7 + // b0 b1 b2 b3 b4 b5 b6 b7 | B0 B1 B2 B3 B4 B5 B6 B7 + // c0 c1 c2 c3 c4 c5 c6 c7 | C0 C1 C2 C3 C4 C5 C6 C7 + // d0 d1 d2 d3 d4 d5 d6 d7 | D0 D1 D2 D3 D4 D5 D6 D7 + // e0 e1 e2 e3 e4 e5 e6 e7 | E0 E1 E2 E3 E4 E5 E6 E7 + // f0 f1 f2 f3 f4 f5 f6 f7 | F0 F1 F2 F3 F4 F5 F6 F7 + // g0 g1 g2 g3 g4 g5 g6 g7 | G0 G1 G2 G3 G4 G5 G6 G7 + // h0 h1 h2 h3 h4 h5 h6 h7 | H0 H1 H2 H3 H4 H5 H6 H7 + const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24); + const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24); + const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25); + const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25); + const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26); + const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26); + const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27); + const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27); + + mm_storelu(dst, x_s30); + mm_storehu(dst + (1 * out_p), x_s30); + mm_storelu(dst + (2 * out_p), x_s31); + mm_storehu(dst + (3 * out_p), x_s31); + mm_storelu(dst + (4 * out_p), x_s32); + mm_storehu(dst + (5 * out_p), x_s32); + mm_storelu(dst + (6 * out_p), x_s33); + mm_storehu(dst + (7 * out_p), x_s33); + mm_storelu(dst + (8 * out_p), x_s34); + mm_storehu(dst + (9 * out_p), x_s34); + mm_storelu(dst + (10 * out_p), x_s35); + mm_storehu(dst + (11 * out_p), x_s35); + mm_storelu(dst + (12 * out_p), x_s36); + mm_storehu(dst + (13 * out_p), x_s36); + mm_storelu(dst + (14 * out_p), x_s37); + mm_storehu(dst + (15 * out_p), x_s37); +} + +static INLINE void transpose_8xn(unsigned char *src[], int 
in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + do { + unsigned char *in = src[idx8x8]; + unsigned char *out = dst[idx8x8]; + + x0 = + _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + x1 = + _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + x0 = _mm_unpacklo_epi8(x0, x1); + + x2 = + _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + x3 = + _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(x2, x3); + + x4 = + _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + x5 = + _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(x4, x5); + + x6 = + _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + x7 = + _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(x6, x7); + + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + x4 = _mm_unpacklo_epi16(x0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x5 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + x6 = _mm_unpacklo_epi32(x4, x5); + mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70 + mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi32(x4, x5); + mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72 + mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73 + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi16(x0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi16(x2, x3); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + x6 = _mm_unpacklo_epi32(x4, x5); + mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74 + mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi32(x4, x5); + + mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76 + mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c new file mode 100644 index 0000000000..799ce9ef44 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdio.h> +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" + +#define MASK_SAD16XH_ONE_REF(idx) \ + a = _mm_loadu_si128((const __m128i *)&ref##idx[x]); \ + data_l = _mm_unpacklo_epi8(a, b); \ + mask_l = _mm_unpacklo_epi8(m, m_inv); \ + pred_l = _mm_maddubs_epi16(data_l, mask_l); \ + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ + \ + data_r = _mm_unpackhi_epi8(a, b); \ + mask_r = _mm_unpackhi_epi8(m, m_inv); \ + pred_r = _mm_maddubs_epi16(data_r, mask_r); \ + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred_l, pred_r); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, int inv_mask, + unsigned sad_array[4]) { + int x, y; + __m128i a; + __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred; + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + const uint8_t *ref0 = a_ptr[0]; + const uint8_t *ref1 = a_ptr[1]; + const uint8_t *ref2 = a_ptr[2]; + const uint8_t *ref3 = a_ptr[3]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ?
m_copy : m_inv; + + MASK_SAD16XH_ONE_REF(0) + MASK_SAD16XH_ONE_REF(1) + MASK_SAD16XH_ONE_REF(2) + MASK_SAD16XH_ONE_REF(3) + } + + src_ptr += src_stride; + ref0 += a_stride; + ref1 += a_stride; + ref2 += a_stride; + ref3 += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD8XH_ONE_REF(idx) \ + const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx); \ + const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \ + data_l = _mm_unpacklo_epi8(a##idx##0, b0); \ + mask_l = _mm_unpacklo_epi8(m, m_inv); \ + pred_l = _mm_maddubs_epi16(data_l, mask_l); \ + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ + \ + data_r = _mm_unpacklo_epi8(a##idx##1, b1); \ + mask_r = _mm_unpackhi_epi8(m, m_inv); \ + pred_r = _mm_maddubs_epi16(data_r, mask_r); \ + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred_l, pred_r); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[4]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride))); + const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride)); + const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr); + const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride)); + __m128i m_copy = _mm_unpacklo_epi64(m0, m1); + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD8XH_ONE_REF(0) + MASK_SAD8XH_ONE_REF(1) + MASK_SAD8XH_ONE_REF(2) + MASK_SAD8XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD4XH_ONE_REF(idx) \ + a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)ref##idx), \ + _mm_cvtsi32_si128(*(int *)&ref##idx[a_stride])); \ + data = _mm_unpacklo_epi8(a, b); \ + mask = _mm_unpacklo_epi8(m, m_inv); \ + pred = _mm_maddubs_epi16(data, mask); \ + pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[4]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data, pred, mask; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + __m128i a; + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr), + _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr), + _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride])); + const __m128i m_copy = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr), + _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride])); + + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD4XH_ONE_REF(0) + MASK_SAD4XH_ONE_REF(1) + MASK_SAD4XH_ONE_REF(2) + MASK_SAD4XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_unpacklo_epi32(res0, res1); + res2 = _mm_unpacklo_epi32(res2, res3); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASKSADMXN_SSSE3(m, n) \ + void aom_masked_sad##m##x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \ + msk_stride, m, n, inv_mask, sad_array); \ + } + +#define MASKSAD8XN_SSSE3(n) \ + void aom_masked_sad8x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 8, msk, msk_stride, n, inv_mask, sad_array); \ + } + +#define MASKSAD4XN_SSSE3(n) \ + void aom_masked_sad4x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 4, msk, msk_stride, n, inv_mask, sad_array); \ + } + +MASKSADMXN_SSSE3(128, 128) +MASKSADMXN_SSSE3(128, 64) +MASKSADMXN_SSSE3(64, 128) +MASKSADMXN_SSSE3(64, 64) +MASKSADMXN_SSSE3(64, 32) +MASKSADMXN_SSSE3(32, 64) +MASKSADMXN_SSSE3(32, 32) +MASKSADMXN_SSSE3(32, 16) +MASKSADMXN_SSSE3(16, 32) +MASKSADMXN_SSSE3(16, 16) +MASKSADMXN_SSSE3(16, 8) +MASKSAD8XN_SSSE3(16) +MASKSAD8XN_SSSE3(8) +MASKSAD8XN_SSSE3(4) +MASKSAD4XN_SSSE3(8) +MASKSAD4XN_SSSE3(4) +MASKSAD4XN_SSSE3(16) +MASKSADMXN_SSSE3(16, 4) +MASKSAD8XN_SSSE3(32) +MASKSADMXN_SSSE3(32, 8) +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c new file mode 100644 index 0000000000..2c022555b5 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" + +static INLINE unsigned int masked_sad32xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 32) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]); + const __m256i m_inv = _mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return sad; +} + +static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { + __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo)); + __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi)); + __m256i a = _mm256_castsi128_si256(a0); + return _mm256_inserti128_si256(a, a1, 1); +} + +static INLINE unsigned int masked_sad16xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr); + const __m256i m_inv = _mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants.
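+ // Per pixel this matches the scalar blend from aom_dsp/blend.h: + // pred = (m * a + (64 - m) * b + 32) >> AOM_BLEND_A64_ROUND_BITS. + // _mm256_maddubs_epi16 forms m * a + (64 - m) * b in 16-bit lanes and + // _mm256_mulhrs_epi16 with round_scale = 1 << (15 - 6) applies the + // rounding shift.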
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return sad; +} + +static INLINE unsigned int aom_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define MASKSADMXN_AVX2(m, n) \ + unsigned int aom_masked_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \ + msk, msk_stride, invert_mask, m, n); \ + } + +MASKSADMXN_AVX2(4, 4) +MASKSADMXN_AVX2(4, 8) +MASKSADMXN_AVX2(8, 4) +MASKSADMXN_AVX2(8, 8) +MASKSADMXN_AVX2(8, 16) +MASKSADMXN_AVX2(16, 8) +MASKSADMXN_AVX2(16, 16) +MASKSADMXN_AVX2(16, 32) +MASKSADMXN_AVX2(32, 16) +MASKSADMXN_AVX2(32, 32) +MASKSADMXN_AVX2(32, 64) +MASKSADMXN_AVX2(64, 32) +MASKSADMXN_AVX2(64, 64) +MASKSADMXN_AVX2(64, 128) +MASKSADMXN_AVX2(128, 64) +MASKSADMXN_AVX2(128, 128) +MASKSADMXN_AVX2(4, 16) +MASKSADMXN_AVX2(16, 4) +MASKSADMXN_AVX2(8, 32) +MASKSADMXN_AVX2(32, 8) +MASKSADMXN_AVX2(16, 64) +MASKSADMXN_AVX2(64, 16) + +static INLINE unsigned int highbd_masked_sad8xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const 
uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + // Zero-extend mask to 16 bits + const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(m_ptr)), + _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
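+ // Each _mm256_hadd_epi32 below adds horizontally within the two 128-bit
+ // lanes independently, so after two passes element 0 holds the total of the
+ // low lane and element 4 the total of the high lane; the two extracts then
+ // combine them into the final SAD.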
+ res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return sad; +} + +static INLINE unsigned int highbd_masked_sad16xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m256i m = + _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x])); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
+ res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return sad; +} + +static INLINE unsigned int aom_highbd_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define HIGHBD_MASKSADMXN_AVX2(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \ + second_pred8, msk, msk_stride, \ + invert_mask, m, n); \ + } + +HIGHBD_MASKSADMXN_AVX2(4, 4) +HIGHBD_MASKSADMXN_AVX2(4, 8) +HIGHBD_MASKSADMXN_AVX2(8, 4) +HIGHBD_MASKSADMXN_AVX2(8, 8) +HIGHBD_MASKSADMXN_AVX2(8, 16) +HIGHBD_MASKSADMXN_AVX2(16, 8) +HIGHBD_MASKSADMXN_AVX2(16, 16) +HIGHBD_MASKSADMXN_AVX2(16, 32) +HIGHBD_MASKSADMXN_AVX2(32, 16) +HIGHBD_MASKSADMXN_AVX2(32, 32) +HIGHBD_MASKSADMXN_AVX2(32, 64) +HIGHBD_MASKSADMXN_AVX2(64, 32) +HIGHBD_MASKSADMXN_AVX2(64, 64) +HIGHBD_MASKSADMXN_AVX2(64, 128) +HIGHBD_MASKSADMXN_AVX2(128, 64) +HIGHBD_MASKSADMXN_AVX2(128, 128) +HIGHBD_MASKSADMXN_AVX2(4, 16) +HIGHBD_MASKSADMXN_AVX2(16, 4) +HIGHBD_MASKSADMXN_AVX2(8, 32) +HIGHBD_MASKSADMXN_AVX2(32, 8) +HIGHBD_MASKSADMXN_AVX2(16, 64) +HIGHBD_MASKSADMXN_AVX2(64, 16) diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c new file mode 100644 index 0000000000..df3a8764e3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" + +// For width a multiple of 16 +static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, + int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height); + +#define MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + m, msk, msk_stride, m, n); \ + else \ + return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \ + ref_stride, msk, msk_stride, m, n); \ + } + +#define MASKSAD8XN_SSSE3(n) \ + unsigned int aom_masked_sad8x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 8, msk, msk_stride, n); \ + else \ + return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ + ref_stride, msk, msk_stride, n); \ + } + +#define MASKSAD4XN_SSSE3(n) \ + unsigned int aom_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 4, msk, msk_stride, n); \ + else \ + return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ + ref_stride, msk, msk_stride, n); \ + } + +MASKSADMXN_SSSE3(128, 128) +MASKSADMXN_SSSE3(128, 64) +MASKSADMXN_SSSE3(64, 128) +MASKSADMXN_SSSE3(64, 64) +MASKSADMXN_SSSE3(64, 32) +MASKSADMXN_SSSE3(32, 64) +MASKSADMXN_SSSE3(32, 32) +MASKSADMXN_SSSE3(32, 16) +MASKSADMXN_SSSE3(16, 32) +MASKSADMXN_SSSE3(16, 16) +MASKSADMXN_SSSE3(16, 8) +MASKSAD8XN_SSSE3(16) +MASKSAD8XN_SSSE3(8) +MASKSAD8XN_SSSE3(4) +MASKSAD4XN_SSSE3(8) +MASKSAD4XN_SSSE3(4) +MASKSAD4XN_SSSE3(16) +MASKSADMXN_SSSE3(16, 4) +MASKSAD8XN_SSSE3(32) +MASKSADMXN_SSSE3(32, 8) +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) + +static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, + int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height) { + int x, y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. 
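+ // _mm_maddubs_epi16 forms the weighted sum (m * a + m_inv * b) for each
+ // pixel from the interleaved bytes, and xx_roundn_epu16 applies the rounding
+ // right shift by AOM_BLEND_A64_ROUND_BITS to complete the A64 blend.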
+ const __m128i data_l = _mm_unpacklo_epi8(a, b); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(a, b); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) + + _mm_cvtsi128_si32(_mm_srli_si128(res, 8))); + return sad; +} + +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr); + const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]); + const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi8(a0, b0); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpacklo_epi8(a1, b1); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) + + _mm_cvtsi128_si32(_mm_srli_si128(res, 8))); + return sad; +} + +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y += 2) { + // Load two rows at a time, this seems to be a bit faster + // than four rows at a time in this case. 
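+ // Each row is only 4 bytes, so two rows are packed into the low 8 bytes of
+ // an XMM register. The high 8 bytes of both 'src' and 'pred' stay zero, so
+ // _mm_sad_epu8 accumulates the SAD for both rows entirely in lane 0.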
+ const __m128i src = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr), + _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)a_ptr), + _mm_cvtsi32_si128(*(int *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr), + _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride])); + const __m128i m = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr), + _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data = _mm_unpacklo_epi8(a, b); + const __m128i mask = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_16bit = _mm_maddubs_epi16(data, mask); + pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128()); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + // At this point, the SAD is stored in lane 0 of 'res' + return (unsigned int)_mm_cvtsi128_si32(res); +} + +// For width a multiple of 8 +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height); + +#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ + } + +#define HIGHBD_MASKSAD4XN_SSSE3(n) \ + unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \ + ref_stride, second_pred8, 4, msk, \ + msk_stride, n); \ + else \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ + ref8, ref_stride, msk, msk_stride, \ + n); \ + } + +HIGHBD_MASKSADMXN_SSSE3(128, 128) +HIGHBD_MASKSADMXN_SSSE3(128, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 128) +HIGHBD_MASKSADMXN_SSSE3(64, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 64) +HIGHBD_MASKSADMXN_SSSE3(32, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 32) +HIGHBD_MASKSADMXN_SSSE3(16, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 16) +HIGHBD_MASKSADMXN_SSSE3(8, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 4) +HIGHBD_MASKSAD4XN_SSSE3(8) +HIGHBD_MASKSAD4XN_SSSE3(4) +HIGHBD_MASKSAD4XN_SSSE3(16) +HIGHBD_MASKSADMXN_SSSE3(16, 4) +HIGHBD_MASKSADMXN_SSSE3(8, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 8) +HIGHBD_MASKSADMXN_SSSE3(16, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 16) + +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + 
int x, y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m128i pred = _mm_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
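+ // The two horizontal adds below fold those four partial sums into element 0,
+ // which then holds the full SAD for the block.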
+ res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return sad; +} + +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr), + _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr), + _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride])); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr), + _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])), + _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packs_epi32(pred_l, pred_r); + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return sad; +} diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h new file mode 100644 index 0000000000..cffbd9672c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ + +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c new file mode 100644 index 0000000000..0bf383fffd --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -0,0 +1,1067 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" + +// For width a multiple of 16 +static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int w, int h); + +static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h); + +static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h); + +// For width a multiple of 16 +static void masked_variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, + int height, unsigned int *sse, int *sum_); + +static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_); + +static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_); + +#define MASK_SUBPIX_VAR_SSSE3(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * W]; \ + \ + bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, sse, 
&sum); \ + else \ + masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define MASK_SUBPIX_VAR8XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 8]; \ + \ + bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \ + } + +#define MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 4]; \ + \ + bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } + +MASK_SUBPIX_VAR_SSSE3(128, 128) +MASK_SUBPIX_VAR_SSSE3(128, 64) +MASK_SUBPIX_VAR_SSSE3(64, 128) +MASK_SUBPIX_VAR_SSSE3(64, 64) +MASK_SUBPIX_VAR_SSSE3(64, 32) +MASK_SUBPIX_VAR_SSSE3(32, 64) +MASK_SUBPIX_VAR_SSSE3(32, 32) +MASK_SUBPIX_VAR_SSSE3(32, 16) +MASK_SUBPIX_VAR_SSSE3(16, 32) +MASK_SUBPIX_VAR_SSSE3(16, 16) +MASK_SUBPIX_VAR_SSSE3(16, 8) +MASK_SUBPIX_VAR8XH_SSSE3(16) +MASK_SUBPIX_VAR8XH_SSSE3(8) +MASK_SUBPIX_VAR8XH_SSSE3(4) +MASK_SUBPIX_VAR4XH_SSSE3(8) +MASK_SUBPIX_VAR4XH_SSSE3(4) +MASK_SUBPIX_VAR4XH_SSSE3(16) +MASK_SUBPIX_VAR_SSSE3(16, 4) +MASK_SUBPIX_VAR8XH_SSSE3(32) +MASK_SUBPIX_VAR_SSSE3(32, 8) +MASK_SUBPIX_VAR_SSSE3(64, 16) +MASK_SUBPIX_VAR_SSSE3(16, 64) + +static INLINE __m128i filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi8(a, b); + v0 = _mm_maddubs_epi16(v0, filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi8(a, b); + v1 = _mm_maddubs_epi16(v1, filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); +} + +static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int w, int h) { + int i, j; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + __m128i z = _mm_alignr_epi8(y, x, 1); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z)); + } + src += src_stride; + b += w; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i 
hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + const __m128i z = _mm_alignr_epi8(y, x, 1); + const __m128i res = filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + } + + src += src_stride; + b += w; + } + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y)); + } + dst += w; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; + } + } +} + +static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0, + const __m128i *a1, const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi8(*a0, *b0); + v0 = _mm_maddubs_epi16(v0, *filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + __m128i v1 = _mm_unpacklo_epi8(*a1, *b1); + v1 = _mm_maddubs_epi16(v1, *filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); +} + +static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 8; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 1); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 8; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 2; + b += 16; + } + // Handle i = h separately + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + + __m128i v0 = _mm_unpacklo_epi8(x0, z0); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y)); + dst += 8; + } + } else { + const uint8_t *vfilter = 
bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]); + const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 16; + } + } +} + +static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = xx_loadl_32((__m128i *)src); + xx_storel_32(b, x); + src += src_stride; + b += 4; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + __m128i z = _mm_srli_si128(x, 1); + xx_storel_32(b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 4; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i x0 = _mm_loadl_epi64((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]); + const __m128i z2 = _mm_srli_si128(x2, 1); + const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]); + const __m128i z3 = _mm_srli_si128(x3, 1); + + const __m128i a0 = _mm_unpacklo_epi32(x0, x1); + const __m128i b0 = _mm_unpacklo_epi32(z0, z1); + const __m128i a1 = _mm_unpacklo_epi32(x2, x3); + const __m128i b1 = _mm_unpacklo_epi32(z2, z3); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 4; + b += 16; + } + // Handle i = h separately + const __m128i x = _mm_loadl_epi64((__m128i *)src); + const __m128i z = _mm_srli_si128(x, 1); + + __m128i v0 = _mm_unpacklo_epi8(x, z); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + xx_storel_32(b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = xx_loadl_32((__m128i *)dst); + __m128i y = xx_loadl_32((__m128i *)&dst[4]); + xx_storel_32(dst, _mm_avg_epu8(x, y)); + dst += 4; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i a = xx_loadl_32((__m128i *)dst); + const __m128i b = xx_loadl_32((__m128i *)&dst[4]); + const __m128i c = xx_loadl_32((__m128i *)&dst[8]); + const __m128i d = xx_loadl_32((__m128i *)&dst[12]); + const __m128i e = xx_loadl_32((__m128i *)&dst[16]); + + const __m128i a0 = _mm_unpacklo_epi32(a, b); + const __m128i b0 = _mm_unpacklo_epi32(b, c); + const __m128i a1 = _mm_unpacklo_epi32(c, d); + const __m128i b1 = _mm_unpacklo_epi32(d, e); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 16; + } + } +} + +static INLINE void accumulate_block(const __m128i *src, const __m128i *a, + const __m128i *b, const __m128i *m, + __m128i *sum, __m128i *sum_sq) { + const __m128i zero = 
_mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i m_inv = _mm_sub_epi8(mask_max, *m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m128i data_l = _mm_unpacklo_epi8(*a, *b); + const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(*a, *b); + const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi8(*src, zero); + const __m128i src_r = _mm_unpackhi_epi8(*src, zero); + const __m128i diff_l = _mm_sub_epi16(pred_l, src_l); + const __m128i diff_r = _mm_sub_epi16(pred_r, src_r); + + // Update partial sums and partial sums of squares + *sum = + _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one)); + *sum_sq = + _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l), + _mm_madd_epi16(diff_r, diff_r))); +} + +static void masked_variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, + int height, unsigned int *sse, int *sum_) { + int x, y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); + + src_ptr += src_stride * 2; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t 
*m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 4) { + // Load four rows at a time + __m128i src = _mm_setr_epi32(*(int *)src_ptr, *(int *)&src_ptr[src_stride], + *(int *)&src_ptr[src_stride * 2], + *(int *)&src_ptr[src_stride * 3]); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_setr_epi32(*(int *)m_ptr, *(int *)&m_ptr[m_stride], + *(int *)&m_ptr[m_stride * 2], + *(int *)&m_ptr[m_stride * 3]); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); + + src_ptr += src_stride * 4; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 4; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +#if CONFIG_AV1_HIGHBITDEPTH +// For width a multiple of 8 +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h); + +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h); + +// For width a multiple of 8 +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_); + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_); + +#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)sse64; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, 
&sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)sse_; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) + +static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi16(a, b); + v0 = _mm_madd_epi16(v0, filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi16(a, b); + v1 = _mm_madd_epi16(v1, filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); +} + +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h) { + int i, j; + // Horizontal filter + if (xoffset == 0) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; + } + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + __m128i z = _mm_alignr_epi8(y, x, 2); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z)); + } + src += src_stride; + b += w; + } + } else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + const __m128i z = _mm_alignr_epi8(y, x, 2); + const __m128i res = highbd_filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + 
} + + src += src_stride; + b += w; + } + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y)); + } + dst += w; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = highbd_filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; + } + } +} + +static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0, + const __m128i *b0, + const __m128i *a1, + const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi16(*a0, *b0); + v0 = _mm_madd_epi16(v0, *filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpacklo_epi16(*a1, *b1); + v1 = _mm_madd_epi16(v1, *filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); +} + +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 4; + } + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z)); + src += src_stride; + b += 4; + } + } else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 2); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 2); + const __m128i res = + highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 2; + b += 8; + } + // Process i = h separately + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + + __m128i v0 = _mm_unpacklo_epi16(x, z); + v0 = _mm_madd_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y)); + dst += 4; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i res = + highbd_filter_block_2rows(&x, &y, &y, &z, 
&vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 8; + } + } +} + +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_) { + int x, y; + // Note on bit widths: + // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26, + // so this can be kept as four 32-bit values. + // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38, + // so this must be stored as two 64-bit values. + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + // Calculate 8 predicted pixels. + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit + // field, but the range of values is only [-(2^12 - 1), 2^12 - 1]. + // So we can re-pack into 16-bit fields and use _mm_madd_epi16 + // to calculate the squares and partially sum them. + const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + // Then we want to sign-extend to 64 bits and accumulate + const __m128i sign = _mm_srai_epi32(prod, 31); + const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign); + const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign); + sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, zero); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = _mm_cvtsi128_si32(sum); + sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8)); + _mm_storel_epi64((__m128i *)sse, sum_sq); +} + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_) { + int y; + // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions). 
+ // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18 + // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30. + // So we can safely pack sum_sq into 32-bit fields, which is slightly more + // convenient. + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr), + _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])), + zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + sum_sq = _mm_add_epi32(sum_sq, prod); + + src_ptr += src_stride * 2; + a_ptr += 8; + b_ptr += 8; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + assert(height % 2 == 0); + int i = 0; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); + comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1, + mask + mask_stride, comp_pred + width); + comp_pred += (width << 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); + } else { + do { + for (int x = 0; x < width; x += 32) { + comp_mask_pred_16_ssse3(src0 + x, src1 + x, mask + x, comp_pred); + comp_mask_pred_16_ssse3(src0 + x + 16, src1 + x + 16, mask + x + 16, + comp_pred + 16); + comp_pred += 32; + } + src0 += (stride0); + src1 += (stride1); + mask += (mask_stride); + i += 1; + } while (i < height); + } +} diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h new file mode 100644 index 0000000000..4faa098ace --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" + +static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0, + const uint8_t *src1, + const uint8_t *mask, uint8_t *dst) { + const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i round_offset = + _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0)); + const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1)); + const __m128i aA = _mm_load_si128((const __m128i *)(mask)); + + const __m128i maA = _mm_sub_epi8(alpha_max, aA); + + const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1); + const __m128i aaAL = _mm_unpacklo_epi8(aA, maA); + const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1); + const __m128i aaAH = _mm_unpackhi_epi8(aA, maA); + + const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL); + const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH); + + const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset); + const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset); + _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH)); +} + +static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + const uint8_t *mask, + int mask_stride) { + int i = 0; + const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i round_offset = + _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + // odd line A + const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0)); + const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1)); + const __m128i aA = _mm_loadl_epi64((const __m128i 
*)(mask)); + // even line B + const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0)); + const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1)); + const __m128i a = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride))); + + const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1); + const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1); + + const __m128i ma = _mm_sub_epi8(alpha_max, a); + const __m128i aaA = _mm_unpacklo_epi8(a, ma); + const __m128i aaB = _mm_unpackhi_epi8(a, ma); + + const __m128i blendA = _mm_maddubs_epi16(ssA, aaA); + const __m128i blendB = _mm_maddubs_epi16(ssB, aaB); + const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset); + const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset); + const __m128i round = _mm_packus_epi16(roundA, roundB); + // comp_pred's stride == width == 8 + _mm_store_si128((__m128i *)(comp_pred), round); + comp_pred += (8 << 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); +} + +#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h new file mode 100644 index 0000000000..085a572cb1 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/mem_sse2.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_ +#define AOM_AOM_DSP_X86_MEM_SSE2_H_ + +#include // SSE2 +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +static INLINE int16_t loadu_int16(const void *src) { + int16_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + +static INLINE int32_t loadu_int32(const void *src) { + int32_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + +static INLINE int64_t loadu_int64(const void *src) { + int64_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + +static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) { + _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); +} + +static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + +static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, + const int byte_stride) { + return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride), + loadu_int32((int8_t *)src + 1 * byte_stride), + loadu_int32((int8_t *)src + 2 * byte_stride), + loadu_int32((int8_t *)src + 3 * byte_stride)); +} + +static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, + const int byte_stride) { + __m128i dst; + dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); + dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); + return dst; +} + +static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s, + uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); + _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); +} + +static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); + *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); + *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); + *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); +} + +static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, + const ptrdiff_t stride) { + __m128i ss[4]; + + ss[0] = s; + ss[1] = _mm_srli_si128(s, 4); + ss[2] = _mm_srli_si128(s, 8); + ss[3] = _mm_srli_si128(s, 12); + store_8bit_4x4(ss, d, stride); +} + +static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); + d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); + d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); + d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); +} + +static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_4x4(s + 0 * stride, stride, &d[0]); + load_8bit_4x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); +} + +static INLINE void loadu_8bit_16x4(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); + d[3] = 
_mm_loadu_si128((const __m128i *)(s + 3 * stride)); +} + +static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_8x4(s + 0 * stride, stride, &d[0]); + load_8bit_8x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); + d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); + d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); + d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); + d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); +} + +static INLINE void loadu_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); + loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); + _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); + _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); + _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); + _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); + _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); +} + +static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); + _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); + _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); + _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); +} + +#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h new file mode 100644 index 0000000000..210f466b6f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ + +#include + +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" + +static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int h) { + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n)); + const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n)); + const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n)); + + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h new file mode 100644 index 0000000000..27398ffd62 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ + +#include + +#include "config/aom_config.h" + +static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { + v_d = _mm_hadd_epi32(v_d, v_d); + v_d = _mm_hadd_epi32(v_d, v_d); + return _mm_cvtsi128_si32(v_d); +} + +static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { + v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); +#if AOM_ARCH_X86_64 + return _mm_cvtsi128_si64(v_q); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_q); + return tmp; + } +#endif +} + +static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); + const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); + return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); +} + +// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); + const __m128i v_tmp_d = + _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c new file mode 100644 index 0000000000..9d1b7d4968 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + do { + const __m128i v_p_b_0 = xx_loadl_32(pre); + const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride); + const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
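// Illustrative sketch, not part of the upstream patch: a scalar model of the
// OBMC SAD that the kernels in this file compute. The helper name
// obmc_sad_scalar_sketch is hypothetical; it assumes <stdint.h> and that
// 'wsrc' and 'mask' hold pre-weighted values fitting in 15 bits, so the
// products below fit comfortably in 32 bits.
static unsigned int obmc_sad_scalar_sketch(const uint8_t *pre, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int width,
                                           int height) {
  unsigned int sad = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int diff =
          wsrc[r * width + c] - pre[r * pre_stride + c] * mask[r * width + c];
      const int abs_diff = diff < 0 ? -diff : diff;
      // Rounded absolute difference: add the (1 << 12) >> 1 bias and shift
      // right by 12, matching the vector code above.
      sad += (unsigned int)((abs_diff + (1 << 11)) >> 12);
    }
  }
  return sad;
}
// The pmaddwd comment above relies on the fact that when every 32-bit lane
// holds a non-negative value of at most 15 bits, the upper 16-bit half of
// each lane is zero, so _mm_madd_epi16 / _mm256_madd_epi16 multiply only the
// low halves and add a zero cross term -- the same result as a full 32-bit
// multiply (pmulld), but with lower latency.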
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int obmc_sad_w8n_avx2( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_b = xx_loadl_64(pre + n); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if ((n & (width - 1)) == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + do { + const __m128i v_p_w_0 = 
xx_loadl_64(pre); + const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride); + const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int hbd_obmc_sad_w8n_avx2( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n)); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c new file mode 100644 index 0000000000..542572c761 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + do { + const __m128i v_p_b = xx_loadl_32(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static AOM_FORCE_INLINE unsigned int obmc_sad_w8n( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_b = xx_loadl_32(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_b = xx_loadl_32(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); + const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + do { + const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = 
_mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c new file mode 100644 index 0000000000..c23d8c4eb0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018, 
Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m128i v_d; + const uint8_t *pre_temp; + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp); + const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
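// Illustrative sketch, not part of the upstream patch: a scalar model of the
// OBMC variance computed by the kernels in this file. The name
// obmc_variance_scalar_sketch is hypothetical; the same 15-bit assumptions on
// 'wsrc' and 'mask' apply as in the SAD sketch earlier.
static unsigned int obmc_variance_scalar_sketch(
    const uint8_t *pre, int pre_stride, const int32_t *wsrc,
    const int32_t *mask, int width, int height, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sum_sq = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int diff =
          wsrc[r * width + c] - pre[r * pre_stride + c] * mask[r * width + c];
      // Signed rounding shift by 12 (ROUND_POWER_OF_TWO_SIGNED), matching the
      // add-bias / add-sign / arithmetic-shift sequence in the vector code.
      const int rdiff = diff < 0 ? -((-diff + (1 << 11)) >> 12)
                                 : (diff + (1 << 11)) >> 12;
      sum += rdiff;
      sum_sq += (int64_t)rdiff * rdiff;
    }
  }
  *sse = (unsigned int)sum_sq;
  // Final variance, as in the OBMCVARWXH macros: sse - sum^2 / (w * h).
  return *sse - (unsigned int)((sum * sum) / (width * height));
}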
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d); + const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d); + + const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_tmp_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d); + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12); + const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d); + const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1); + + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 8; + n += 8; + width -= 8; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + v_d = _mm_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm_hadd_epi32(v_d, v_d); + *sum = _mm_cvtsi128_si32(v_d); + *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); +} + +static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m256i v_d; + __m128i res0; + const uint8_t *pre_temp; + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + + assert(w >= 16); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp); + const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_m1_d = + _mm256_loadu_si256((__m256i const *)(mask + n + 8)); + const __m256i v_w1_d = + _mm256_loadu_si256((__m256i const *)(wsrc + n + 8)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8)); + + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d); + + const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31); + + const __m256i v_tmp0_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d); + const __m256i v_tmp1_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d); + + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12); + const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12); + + const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 16; + n += 16; + width -= 16; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + + v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm256_hadd_epi32(v_d, v_d); + res0 = _mm256_castsi256_si128(v_d); + res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1)); + *sum = _mm_cvtsi128_si32(res0); + *sse = (unsigned 
int)_mm_cvtsi128_si32(_mm_srli_si128(res0, 4)); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else if (W == 8) { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } else { \ + obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c new file mode 100644 index 0000000000..89b050eb20 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + const int pre_step = pre_stride - w; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p1_b = xx_loadl_32(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_b = xx_loadl_32(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); + const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) + +#include "config/aom_dsp_rtcd.h" + +#define OBMC_SUBPIX_VAR(W, H) \ + uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \ + } + +OBMC_SUBPIX_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 64) +OBMC_SUBPIX_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 32) +OBMC_SUBPIX_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 16) +OBMC_SUBPIX_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 8) +OBMC_SUBPIX_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 4) +OBMC_SUBPIX_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 16) +OBMC_SUBPIX_VAR(16, 4) +OBMC_SUBPIX_VAR(8, 32) +OBMC_SUBPIX_VAR(32, 8) +OBMC_SUBPIX_VAR(16, 64) +OBMC_SUBPIX_VAR(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void hbd_obmc_variance_w4( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_w = xx_loadl_64(pre + n); + const 
__m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +static INLINE void hbd_obmc_variance_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, + const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - w; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum += xx_hsum_epi32_si64(v_sum_d); + *sse += xx_hsum_epi32_si64(v_sse_d); +} + +static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else if (w < 128 || h < 128) { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + assert(w == 128 && h == 128); + + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + 64); + pre8 += 64 * pre_stride; + wsrc += 64 * w; + mask += 64 * w; + h -= 64; + } while (h > 0); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + int max_pel_allowed_per_ovf = 512; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else if (w * h <= max_pel_allowed_per_ovf) { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + int h_per_ovf = max_pel_allowed_per_ovf / w; + + assert(max_pel_allowed_per_ovf % w == 0); + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + h_per_ovf); + pre8 += h_per_ovf * pre_stride; + wsrc += h_per_ovf * w; + mask += h_per_ovf * w; + h -= h_per_ovf; + } while (h > 0); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HBD_OBMCVARWXH(W, H) \ + unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = 
(int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +HBD_OBMCVARWXH(128, 128) +HBD_OBMCVARWXH(128, 64) +HBD_OBMCVARWXH(64, 128) +HBD_OBMCVARWXH(64, 64) +HBD_OBMCVARWXH(64, 32) +HBD_OBMCVARWXH(32, 64) +HBD_OBMCVARWXH(32, 32) +HBD_OBMCVARWXH(32, 16) +HBD_OBMCVARWXH(16, 32) +HBD_OBMCVARWXH(16, 16) +HBD_OBMCVARWXH(16, 8) +HBD_OBMCVARWXH(8, 16) +HBD_OBMCVARWXH(8, 8) +HBD_OBMCVARWXH(8, 4) +HBD_OBMCVARWXH(4, 8) +HBD_OBMCVARWXH(4, 4) +HBD_OBMCVARWXH(4, 16) +HBD_OBMCVARWXH(16, 4) +HBD_OBMCVARWXH(8, 32) +HBD_OBMCVARWXH(32, 8) +HBD_OBMCVARWXH(16, 64) +HBD_OBMCVARWXH(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/quantize_avx2.c b/third_party/aom/aom_dsp/x86/quantize_avx2.c new file mode 100644 index 0000000000..b808d46778 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_avx2.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, + const int16_t *round_ptr, __m256i *round, + const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, + __m256i *dequant, + const int16_t *shift_ptr, __m256i *shift, + int log_scale) { + *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *zbin = _mm256_add_epi16(*zbin, rnd); + *zbin = _mm256_srai_epi16(*zbin, log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. 
(See quantize_b_logscale{0,1,2}_16) + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *round = _mm256_add_epi16(*round, rnd); + *round = _mm256_srai_epi16(*round, log_scale); + } + + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { + const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr); + const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +} + +static INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +} + +static AOM_FORCE_INLINE __m256i quantize_b_logscale0_16( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, + __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); + return _mm256_setzero_si256(); + } + + // tmp = v_zbin_mask ? 
(int64_t)abs_coeff + log_scaled_round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + // quant_shift_ptr[rc != 0]) >> + // (16 - log_scale + AOM_QM_BITS)); + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift); + const __m256i v_nz_mask = _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); + const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant); + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); + store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); + return v_nz_mask; +} + +static INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, + __m256i v_mask) { + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); + const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static INLINE int16_t accumulate_eob256(__m256i eob256) { + const __m128i eob_lo = _mm256_castsi256_si128(eob256); + const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); + __m128i eob = _mm_max_epi16(eob_lo, eob_hi); + __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +void aom_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + (void)scan; + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; + __m256i v_eobmax = _mm256_setzero_si256(); + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, 0); + + // Do DC and first 15 AC. 
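+  // (Editorial sketch, not part of the upstream change: per coefficient rc,
+  // and ignoring 16-bit saturation/clamping details, each 16-lane call below
+  // is assumed to follow the scalar quantize_b logic
+  //   const int abs_c = abs(coeff_ptr[rc]);
+  //   if (abs_c < zbin_ptr[rc != 0]) {
+  //     qcoeff_ptr[rc] = dqcoeff_ptr[rc] = 0;
+  //   } else {
+  //     int tmp = abs_c + round_ptr[rc != 0];
+  //     tmp = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+  //                 quant_shift_ptr[rc != 0]) >> 16);
+  //     qcoeff_ptr[rc] = coeff_ptr[rc] < 0 ? -tmp : tmp;
+  //     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+  //   }
+  // The DC coefficient uses the [0] entries of zbin/round/quant/shift/dequant
+  // and every later coefficient uses the [1] entries, which is why the
+  // vectors are switched to their high halves after this first block.)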
+ __m256i v_nz_mask = + quantize_b_logscale0_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (intptr_t count = n_coeffs - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_nz_mask = + quantize_b_logscale0_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} + +static AOM_FORCE_INLINE __m256i quantize_b_logscale_16( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, + __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift, int log_scale) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); + return _mm256_setzero_si256(); + } + + // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + // quant_shift_ptr[rc != 0]) >> + // (16 - log_scale + AOM_QM_BITS)); + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32_hi = _mm256_slli_epi16( + _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), log_scale); + const __m256i v_tmp32_lo = _mm256_srli_epi16( + _mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 16 - log_scale); + const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo); + const __m256i v_dqcoeff_hi = _mm256_slli_epi16( + _mm256_mulhi_epi16(v_tmp32, *v_dequant), 16 - log_scale); + const __m256i v_dqcoeff_lo = + _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32, *v_dequant), log_scale); + const __m256i v_dqcoeff = + _mm256_sign_epi16(_mm256_or_si256(v_dqcoeff_hi, v_dqcoeff_lo), v_coeff); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); + const __m256i v_nz_mask = _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); + store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); + return v_nz_mask; +} + +static AOM_FORCE_INLINE void quantize_b_no_qmatrix_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *iscan, int log_scale) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; + __m256i v_eobmax = _mm256_setzero_si256(); + + load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, 
&v_round, quant_ptr, + &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, + &v_quant_shift, log_scale); + + // Do DC and first 15 AC. + __m256i v_nz_mask = quantize_b_logscale_16( + coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, + &v_zbin, &v_quant_shift, log_scale); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (intptr_t count = n_coeffs - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_nz_mask = quantize_b_logscale_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, log_scale); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} + +void aom_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 1); +} + +void aom_quantize_b_64x64_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 2); +} diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c new file mode 100644 index 0000000000..ebef1fbac2 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + (void)scan_ptr; + + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. 
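+  // (Editorial sketch, not part of the upstream change: scan_for_eob() above
+  // and inside the loop below keeps, per lane, iscan[i] + 1 for non-zero
+  // quantized coefficients and 0 otherwise, and accumulate_eob() reduces the
+  // running maximum, so the end result is assumed to be equivalent to
+  //   int eob = 0;
+  //   for (int i = 0; i < n_coeffs; ++i)
+  //     if (qcoeff_ptr[i]) eob = AOMMAX(eob, iscan_ptr[i] + 1);
+  //   *eob_ptr = eob;
+  // The zbin/round/quant/shift/dequant vectors were switched to their AC
+  // halves above, so the loop reuses the same registers for every remaining
+  // coefficient.)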
+ while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3.c b/third_party/aom/aom_dsp/x86/quantize_ssse3.c new file mode 100644 index 0000000000..25980a055a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round, + const __m128i quant, + const __m128i *shift) { + __m128i tmp, qcoeff, tmp1; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + tmp = _mm_mullo_epi16(qcoeff, *shift); + tmp = _mm_srli_epi16(tmp, 14); + tmp1 = _mm_mulhi_epi16(qcoeff, *shift); + tmp1 = _mm_slli_epi16(tmp1, 2); + *coeff = _mm_or_si128(tmp, tmp1); +} + +static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 4. 
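+  // (Editorial note, not part of the upstream change: shifting the unsigned
+  // product of |qcoeff| and dequant right by 2 and reapplying the sign below
+  // is assumed to match the C reference behaviour
+  //   dqcoeff = (qcoeff * dequant) / 4;
+  // i.e. truncation toward zero, whereas an arithmetic right shift of a
+  // negative product would round toward minus infinity.)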
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + +void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i two = _mm_set1_epi16(2); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, all_zero; + __m128i eob = zero, eob0; + + (void)scan; + (void)n_coeffs; + + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, two); + round = _mm_add_epi16(round, two); + zbin = _mm_srli_epi16(zbin, 2); + round = _mm_srli_epi16(round, 2); + zbin = _mm_sub_epi16(zbin, one); + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. 
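+  // (Editorial note, not part of the upstream change: the fixed bound of 1024
+  // below reflects the assumption that a 64x64 transform only keeps its
+  // low-frequency 32x32 corner, which is why n_coeffs is deliberately unused.
+  // With log_scale = 2, calculate_qcoeff_64x64() is assumed to implement, per
+  // coefficient and ignoring 16-bit saturation,
+  //   int tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+  //   tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+  //          quant_shift_ptr[rc != 0]) >> 14;  // 16 - log_scale
+  // with the zbin threshold pre-rounded the same way at function entry.)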
+ for (index = 16; index < 1024; index += 16) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + continue; + } + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm new file mode 100644 index 0000000000..fa616a6f1a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -0,0 +1,302 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + + ; actual quantize loop - setup pointers, rounders, etc. 
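+  ; (Editorial note, not part of the upstream change: in scalar terms, and
+  ;  ignoring 16-bit saturation/truncation, each coefficient is assumed to be
+  ;  quantized below as
+  ;    tmp     = abs(coeff) + round
+  ;    qcoeff  = sign(coeff) * (((((tmp * quant) >> 16) + tmp) * shift) >> (16 - s))
+  ;    dqcoeff = sign(coeff) * ((abs(qcoeff) * dequant) >> s)
+  ;  with s = 0 for quantize_b and s = 1 for quantize_b_32x32, where the
+  ;  b_32x32 variant also pre-rounds zbin and round by s. Coefficients whose
+  ;  magnitude does not reach zbin are forced to zero.)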
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + movifnidn dequantq, dequantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, b_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m0, m5 + paddw m1, m5 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [dequantq] ; m3 = dequant + mov r2, shiftmp + psubw m0, [GLOBAL(pw_1)] + mova m4, [r2] ; m4 = shift + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + lea iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh + %endif + pmulhw m8, m4 ; m8 = m8*qsh>>16 + %ifidn %1, b_32x32 + psllw m8, 1 + psrlw m5, 15 + por m8, m5 + %endif + punpckhqdq m4, m4 + %ifidn %1, b_32x32 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif + pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz 
.accumulate_eob + +.ac_only_loop: + ; pack coeff from 32bit to 16bit array + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin +%ifidn %1, b_32x32 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + or r6, r2 + jz .skip_iter +%endif + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh + %endif + pmulhw m14, m4 ; m14 = m14*qsh>>16 + %ifidn %1, b_32x32 + psllw m14, 1 + psrlw m5, 15 + por m14, m5 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif + pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pxor m11, m11 + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 + + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + +%ifidn %1, b_32x32 + jmp .accumulate_eob +.skip_iter: + mova [qcoeffq+ncoeffq*4+ 0], m5 + mova [qcoeffq+ncoeffq*4+16], m5 + mova [qcoeffq+ncoeffq*4+32], m5 + mova [qcoeffq+ncoeffq*4+48], m5 + mova [dqcoeffq+ncoeffq*4+ 0], m5 + mova [dqcoeffq+ncoeffq*4+16], m5 + mova [dqcoeffq+ncoeffq*4+32], m5 + mova [dqcoeffq+ncoeffq*4+48], m5 + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h new file mode 100644 index 0000000000..5b040a278a --- 
/dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_x86.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" + +static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, + const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + const int16_t *shift_ptr, __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)zbin_ptr); + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)shift_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi32(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE void calculate_qcoeff_log_scale(__m128i *coeff, + const __m128i round, + const __m128i quant, + const __m128i *shift, + const int *log_scale) { + __m128i tmp, tmp1, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + tmp = _mm_mullo_epi16(qcoeff, *shift); + tmp = _mm_srli_epi16(tmp, (16 - *log_scale)); + tmp1 = _mm_mulhi_epi16(qcoeff, *shift); + tmp1 = _mm_slli_epi16(tmp1, *log_scale); + *coeff = _mm_or_si128(tmp, tmp1); +} + +static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { + return _mm_mullo_epi16(qcoeff, dequant); +} + +static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff, + __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff, + const int *log_scale) { + // calculate abs + __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15); + __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign); + + const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero); + const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale); + + dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0); + dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + +// Scan 16 values 
for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. +static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr)); + const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + return _mm_packs_epi32(coeff1, coeff2); +} + +static INLINE void store_coefficients(__m128i coeff_vals, + tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); +} + +static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i all_zero; + __m128i temp_mask = _mm_setzero_si128(); + all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1); + if (_mm_movemask_epi8(all_zero)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8)); + __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1); + temp_mask = _mm_max_epi16(mask0, mask1); + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, const int16_t *iscan_ptr, + int *is_found, __m128i *mask) { + __m128i zero = _mm_setzero_si128(); + __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3; + + coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero); + coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero); + coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero); + coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero); + + coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS); + cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]); + coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS); + 
cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]); + + cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); + cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask); +} + +static INLINE int calculate_non_zero_count(__m128i mask) { + __m128i mask0, mask1; + int non_zero_count = 0; + mask0 = _mm_unpackhi_epi64(mask, mask); + mask1 = _mm_max_epi16(mask0, mask); + mask0 = _mm_shuffle_epi32(mask1, 1); + mask0 = _mm_max_epi16(mask0, mask1); + mask1 = _mm_srli_epi32(mask0, 16); + mask0 = _mm_max_epi16(mask0, mask1); + non_zero_count = _mm_extract_epi16(mask0, 0) + 1; + + return non_zero_count; +} diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c new file mode 100644 index 0000000000..0fea6ddfd3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include // AVX2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4], + const __m256i *sum_ref0, + const __m256i *sum_ref1, + const __m256i *sum_ref2, + const __m256i *sum_ref3) { + // In sum_ref-i the result is saved in the first 4 bytes and the other 4 + // bytes are zeroed. 
+ // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + // 0, 0, 1, 1 + __m256i sum_ref01 = _mm256_castps_si256(_mm256_shuffle_ps( + _mm256_castsi256_ps(*sum_ref0), _mm256_castsi256_ps(*sum_ref1), + _MM_SHUFFLE(2, 0, 2, 0))); + // 2, 2, 3, 3 + __m256i sum_ref23 = _mm256_castps_si256(_mm256_shuffle_ps( + _mm256_castsi256_ps(*sum_ref2), _mm256_castsi256_ps(*sum_ref3), + _MM_SHUFFLE(2, 0, 2, 0))); + + // sum adjacent 32 bit integers + __m256i sum_ref0123 = _mm256_hadd_epi32(sum_ref01, sum_ref23); + + // add the low 128 bit to the high 128 bit + __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0123), + _mm256_extractf128_si256(sum_ref0123, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); +} + +static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2( + int M, int N, const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + int i, j; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_setzero_si256(); + sum_ref2 = _mm256_setzero_si256(); + sum_ref1 = _mm256_setzero_si256(); + sum_ref3 = _mm256_setzero_si256(); + + for (i = 0; i < N; i++) { + for (j = 0; j < M; j += 32) { + // load src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)(src + j)); + ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j)); + ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j)); + ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j)); + ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j)); + + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + } + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3); +} + +static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2( + int M, int N, const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg; + __m256i sum_ref0, sum_ref1, sum_ref2; + int i, j; + const uint8_t *ref0, *ref1, *ref2; + const __m256i zero = _mm256_setzero_si256(); + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + sum_ref0 = _mm256_setzero_si256(); + sum_ref2 = _mm256_setzero_si256(); + sum_ref1 = _mm256_setzero_si256(); + + for (i = 0; i < N; i++) { + for (j = 0; j < M; j += 32) { + // load src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)(src + j)); + ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j)); + ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j)); + ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j)); + + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = 
_mm256_add_epi32(sum_ref2, ref2_reg); + } + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + } + aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero); +} + +#define SADMXN_AVX2(m, n) \ + void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res); \ + } \ + void aom_sad##m##x##n##x3d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res); \ + } + +SADMXN_AVX2(32, 8) +SADMXN_AVX2(32, 16) +SADMXN_AVX2(32, 32) +SADMXN_AVX2(32, 64) + +SADMXN_AVX2(64, 16) +SADMXN_AVX2(64, 32) +SADMXN_AVX2(64, 64) +SADMXN_AVX2(64, 128) + +SADMXN_AVX2(128, 64) +SADMXN_AVX2(128, 128) + +#define SAD_SKIP_MXN_AVX2(m, n) \ + void aom_sad_skip_##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + aom_sadMxNx4d_avx2(m, ((n) >> 1), src, 2 * src_stride, ref, \ + 2 * ref_stride, res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SAD_SKIP_MXN_AVX2(32, 8) +SAD_SKIP_MXN_AVX2(32, 16) +SAD_SKIP_MXN_AVX2(32, 32) +SAD_SKIP_MXN_AVX2(32, 64) + +SAD_SKIP_MXN_AVX2(64, 16) +SAD_SKIP_MXN_AVX2(64, 32) +SAD_SKIP_MXN_AVX2(64, 64) +SAD_SKIP_MXN_AVX2(64, 128) + +SAD_SKIP_MXN_AVX2(128, 64) +SAD_SKIP_MXN_AVX2(128, 128) + +static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src, + int src_stride, + const uint8_t *const ref[4], + int ref_stride, + uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg; + __m256i sum_ref0, sum_ref1, sum_ref2; + const uint8_t *ref0, *ref1, *ref2; + const __m256i zero = _mm256_setzero_si256(); + assert(N % 2 == 0); + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + sum_ref0 = _mm256_setzero_si256(); + sum_ref2 = _mm256_setzero_si256(); + sum_ref1 = _mm256_setzero_si256(); + + for (int i = 0; i < N; i += 2) { + // load src and all refs + src_reg = yy_loadu2_128(src + src_stride, src); + ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0); + ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1); + ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2); + + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + + src += 2 * src_stride; + ref0 += 2 * ref_stride; + ref1 += 2 * ref_stride; + ref2 += 2 * ref_stride; + } + + aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero); +} + +static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src, + int src_stride, + const uint8_t *const ref[4], + int ref_stride, + uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + const uint8_t *ref0, *ref1, *ref2, *ref3; + assert(N % 2 == 0); + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + sum_ref0 = _mm256_setzero_si256(); + sum_ref2 = _mm256_setzero_si256(); + sum_ref1 = _mm256_setzero_si256(); + sum_ref3 = _mm256_setzero_si256(); + + for (int i = 0; i < N; i += 2) { + // load src and all refs + src_reg = 
yy_loadu2_128(src + src_stride, src); + ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0); + ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1); + ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2); + ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3); + + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + + src += 2 * src_stride; + ref0 += 2 * ref_stride; + ref1 += 2 * ref_stride; + ref2 += 2 * ref_stride; + ref3 += 2 * ref_stride; + } + + aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3); +} + +#define SAD16XNX3_AVX2(n) \ + void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res); \ + } +#define SAD16XNX4_AVX2(n) \ + void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res); \ + } + +SAD16XNX4_AVX2(32) +SAD16XNX4_AVX2(16) +SAD16XNX4_AVX2(8) + +SAD16XNX3_AVX2(32) +SAD16XNX3_AVX2(16) +SAD16XNX3_AVX2(8) + +#if !CONFIG_REALTIME_ONLY +SAD16XNX3_AVX2(64) +SAD16XNX3_AVX2(4) + +SAD16XNX4_AVX2(64) +SAD16XNX4_AVX2(4) + +#endif // !CONFIG_REALTIME_ONLY + +#define SAD_SKIP_16XN_AVX2(n) \ + void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \ + res); \ + res[0] <<= 1; \ + res[1] <<= 1; \ + res[2] <<= 1; \ + res[3] <<= 1; \ + } + +SAD_SKIP_16XN_AVX2(32) +SAD_SKIP_16XN_AVX2(16) +SAD_SKIP_16XN_AVX2(8) + +#if !CONFIG_REALTIME_ONLY +SAD_SKIP_16XN_AVX2(64) +SAD_SKIP_16XN_AVX2(4) +#endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm new file mode 100644 index 0000000000..6edad99516 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm @@ -0,0 +1,437 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; 'spill_src_stride' affect a lot how the code works. +; +; When 'spill_src_stride' is false, the 'src_strideq' resides in +; register, [srcq + src_strideq + offset] is allowed, so we can simply +; use such form to access src memory and don't bother to update 'srcq' +; at each line. We only update 'srcq' each two-lines using a compact +; LEA instruction like [srcq+src_strideq*2]. +; +; When 'spill_src_stride' is true, the 'src_strideq' resides in memory. 
+; we cannot use above form to access memory, we have to update +; 'srcq' at each line break. As we process two parts (first,second) +; together in each macro function, the second part may also sit +; in the next line, which means we also need to possibly add +; one 'src_strideq' to 'srcq' before processing second part. + +%macro HANDLE_SECOND_OFFSET 0 + %if spill_src_stride + %define second_offset 0 + add srcq, src_strideq + %else + %define second_offset (src_strideq) + %endif +%endmacro + +; This is specically designed to handle when src_strideq is a +; memory position, under such case, we can not accomplish +; complex address calculation using LEA, and fall back to +; using simple ADD instruction at each line ending. +%macro ADVANCE_END_OF_TWO_LINES 0 + %if spill_src_stride + add srcq, src_strideq + %else + lea srcq, [srcq+src_strideq*2] + %endif + +; note: ref_stride is never spilled when processing two lines + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endmacro + +; PROCESS_4x2x4 first +%macro PROCESS_4x2x4 1 + movd m0, [srcq] + HANDLE_SECOND_OFFSET +%if %1 == 1 + movd m6, [ref1q] + movd m4, [ref2q] + movd m7, [ref3q] + movd m5, [ref4q] + + movd m1, [srcq + second_offset] + movd m2, [ref1q+ref_strideq] + punpckldq m0, m1 + punpckldq m6, m2 + movd m1, [ref2q+ref_strideq] + movd m2, [ref3q+ref_strideq] + movd m3, [ref4q+ref_strideq] + punpckldq m4, m1 + punpckldq m7, m2 + punpckldq m5, m3 + movlhps m0, m0 + movlhps m6, m4 + movlhps m7, m5 + psadbw m6, m0 + psadbw m7, m0 +%else + movd m1, [ref1q] + movd m5, [ref1q+ref_strideq] + movd m2, [ref2q] + movd m4, [ref2q+ref_strideq] + punpckldq m1, m5 + punpckldq m2, m4 + movd m3, [ref3q] + movd m5, [ref3q+ref_strideq] + punpckldq m3, m5 + movd m4, [ref4q] + movd m5, [ref4q+ref_strideq] + punpckldq m4, m5 + movd m5, [srcq + second_offset] + punpckldq m0, m5 + movlhps m0, m0 + movlhps m1, m2 + movlhps m3, m4 + psadbw m1, m0 + psadbw m3, m0 + paddd m6, m1 + paddd m7, m3 +%endif +%endmacro + +; PROCESS_8x2x4 first +%macro PROCESS_8x2x4 1 + movh m0, [srcq] + HANDLE_SECOND_OFFSET +%if %1 == 1 + movh m4, [ref1q] + movh m5, [ref2q] + movh m6, [ref3q] + movh m7, [ref4q] + movhps m0, [srcq + second_offset] + movhps m4, [ref1q+ref_strideq] + movhps m5, [ref2q+ref_strideq] + movhps m6, [ref3q+ref_strideq] + movhps m7, [ref4q+ref_strideq] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movh m1, [ref1q] + movh m2, [ref2q] + movhps m0, [srcq + second_offset] + movhps m1, [ref1q+ref_strideq] + movhps m2, [ref2q+ref_strideq] + psadbw m1, m0 + psadbw m2, m0 + paddd m4, m1 + paddd m5, m2 + + movh m1, [ref3q] + movhps m1, [ref3q+ref_strideq] + movh m2, [ref4q] + movhps m2, [ref4q+ref_strideq] + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endif +%endmacro + +; PROCESS_FIRST_MMSIZE +%macro PROCESS_FIRST_MMSIZE 0 + mova m0, [srcq] + movu m4, [ref1q] + movu m5, [ref2q] + movu m6, [ref3q] + movu m7, [ref4q] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%endmacro + +; PROCESS_16x1x4 offset +%macro PROCESS_16x1x4 1 + mova m0, [srcq + %1] + movu m1, [ref1q + ref_offsetq + %1] + movu m2, [ref2q + ref_offsetq + %1] + psadbw m1, m0 + psadbw m2, m0 + paddd m4, m1 + paddd m5, m2 + + movu m1, [ref3q + ref_offsetq + %1] + movu m2, [ref4q + ref_offsetq + %1] + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endmacro + +; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, 
+; uint32_t res[4]); +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, else skip rows +%macro SADNXN4D 2-3 0 + +%define spill_src_stride 0 +%define spill_ref_stride 0 +%define spill_cnt 0 + +; Whether a shared offset should be used instead of adding strides to +; each reference array. With this option, only one line will be processed +; per loop iteration. +%define use_ref_offset (%1 >= mmsize) + +; Remove loops in the 4x4 and 8x4 case +%define use_loop (use_ref_offset || %2 > 4) + +%if %3 == 1 ; skip rows +%if AOM_ARCH_X86_64 +%if use_ref_offset +cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \ + ref2, ref3, ref4, cnt, ref_offset +%elif use_loop +cglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \ + ref2, ref3, ref4, cnt +%else +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \ + ref2, ref3, ref4 +%endif +%else +%if use_ref_offset +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \ + ref4 +%define spill_src_stride 1 +%define spill_ref_stride 1 +%elif use_loop +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \ + ref3, ref4 +%define spill_src_stride 1 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \ + ref3, ref4 +%endif +%endif +%else ; normal sad +%if AOM_ARCH_X86_64 +%if use_ref_offset +cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \ + ref3, ref4, cnt, ref_offset +%elif use_loop +cglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \ + ref3, ref4, cnt +%else +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \ + ref3, ref4 +%endif +%else +%if use_ref_offset +cglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4 + %define spill_src_stride 1 + %define spill_ref_stride 1 +%elif use_loop +cglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4 + %define spill_src_stride 1 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \ + ref4 +%endif +%endif +%endif + +%if spill_src_stride + %define src_strideq r1mp + %define src_strided r1mp +%endif +%if spill_ref_stride + %define ref_strideq r3mp + %define ref_strided r3mp +%endif + +%if spill_cnt + SUB rsp, 4 + %define cntd word [rsp] +%endif + +%if %3 == 1 + sal src_strided, 1 + sal ref_strided, 1 +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + +; Is the loop for this wxh in another function? 
+; If so, we jump into that function for the loop and returning +%define external_loop (use_ref_offset && %1 > mmsize && %1 != %2) + +%if use_ref_offset + PROCESS_FIRST_MMSIZE +%if %1 > mmsize + mov ref_offsetq, 0 + mov cntd, %2 >> %3 +; Jump part way into the loop for the square version of this width +%if %3 == 1 + jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop +%else + jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop +%endif +%else + mov ref_offsetq, ref_strideq + add srcq, src_strideq + mov cntd, (%2 >> %3) - 1 +%endif +%if external_loop == 0 +.loop: +; Unrolled horizontal loop +%assign h_offset 0 +%rep %1/mmsize + PROCESS_16x1x4 h_offset +%if h_offset == 0 +; The first row of the first column is done outside the loop and jumps here +.midloop: +%endif +%assign h_offset h_offset+mmsize +%endrep + + add srcq, src_strideq + add ref_offsetq, ref_strideq + sub cntd, 1 + jnz .loop +%endif +%else + PROCESS_%1x2x4 1 + ADVANCE_END_OF_TWO_LINES +%if use_loop + mov cntd, (%2/2 >> %3) - 1 +.loop: +%endif + PROCESS_%1x2x4 0 +%if use_loop + ADVANCE_END_OF_TWO_LINES + sub cntd, 1 + jnz .loop +%endif +%endif + +%if spill_cnt +; Undo stack allocation for cnt + ADD rsp, 4 +%endif + +%if external_loop == 0 +%if %3 == 0 + %define resultq r4 + %define resultmp r4mp +%endif + +; Undo modifications on parameters on the stack +%if %3 == 1 +%if spill_src_stride + shr src_strided, 1 +%endif +%if spill_ref_stride + shr ref_strided, 1 +%endif +%endif + +%if %1 > 4 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif + movifnidn resultq, resultmp + movu [resultq], m4 + RET +%else + pshufd m6, m6, 0x08 + pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + pslld m7, 1 +%endif + movifnidn resultq, resultmp + movq [resultq+0], m6 + movq [resultq+8], m7 + RET +%endif +%endif ; external_loop == 0 +%endmacro + +INIT_XMM sse2 +SADNXN4D 128, 128 +SADNXN4D 128, 64 +SADNXN4D 64, 128 +SADNXN4D 64, 64 +SADNXN4D 64, 32 +SADNXN4D 32, 64 +SADNXN4D 32, 32 +SADNXN4D 32, 16 +SADNXN4D 16, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 +SADNXN4D 8, 4 +SADNXN4D 4, 8 +SADNXN4D 4, 4 +%if CONFIG_REALTIME_ONLY==0 +SADNXN4D 4, 16 +SADNXN4D 16, 4 +SADNXN4D 8, 32 +SADNXN4D 32, 8 +SADNXN4D 16, 64 +SADNXN4D 64, 16 +%endif +SADNXN4D 128, 128, 1 +SADNXN4D 128, 64, 1 +SADNXN4D 64, 128, 1 +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 +%if CONFIG_REALTIME_ONLY==0 +SADNXN4D 4, 16, 1 +SADNXN4D 8, 32, 1 +SADNXN4D 32, 8, 1 +SADNXN4D 16, 64, 1 +SADNXN4D 64, 16, 1 +%endif + +; Different assembly is needed when the height gets subsampled to 2 +; SADNXN4D 16, 4, 1 +; SADNXN4D 8, 4, 1 +; SADNXN4D 4, 4, 1 diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c new file mode 100644 index 0000000000..24cea76b37 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_avx2.c @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  for (i = 0; i < h; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref_stride;
+    src_ptr += src_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
+  _mm256_zeroupper();
+  return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+                                        const uint8_t *ref_ptr, int ref_stride,
+                                        int h) {
+  int i;
+  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+  __m256i sum_sad = _mm256_setzero_si256();
+  __m256i sum_sad_h;
+  __m128i sum_sad128;
+  int ref2_stride = ref_stride << 1;
+  int src2_stride = src_stride << 1;
+  int max = h >> 1;
+  for (i = 0; i < max; i++) {
+    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+    sad1_reg =
+        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+    sad2_reg = _mm256_sad_epu8(
+        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+    ref_ptr += ref2_stride;
+    src_ptr += src2_stride;
+  }
+  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+  unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
+  _mm256_zeroupper();
+  return res;
+}
+
+#define FSAD64_H(h)                                                           \
+  unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+    return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
+  }
+
+#define FSADS64_H(h)                                                          \
+  unsigned int aom_sad_skip_64x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
+  }
+
+#define FSAD32_H(h)                                                           \
+  unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+    return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h);         \
+  }
+
+#define FSADS32_H(h)                                                          \
+  unsigned int aom_sad_skip_32x##h##_avx2(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride) {                                                       \
+    return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+                            h / 2);                                           \
+  }
+
+#define FSAD64  \
+  FSAD64_H(64)  \
+  FSAD64_H(32)  \
+  FSADS64_H(64) \
+  
FSADS64_H(32) + +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) + +/* clang-format off */ +FSAD64 +FSAD32 +/* clang-format on */ + +#undef FSAD64 +#undef FSAD32 +#undef FSAD64_H +#undef FSAD32_H + +#define FSADAVG64_H(h) \ + unsigned int aom_sad64x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG32_H(h) \ + unsigned int aom_sad32x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG64 \ + FSADAVG64_H(64) \ + FSADAVG64_H(32) + +#define FSADAVG32 \ + FSADAVG32_H(64) \ + FSADAVG32_H(32) \ + FSADAVG32_H(16) + +/* clang-format off */ +FSADAVG64 +FSADAVG32 +/* clang-format on */ + +#undef FSADAVG64 +#undef FSADAVG32 +#undef FSADAVG64_H +#undef FSADAVG32_H diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c 
b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
new file mode 100644
index 0000000000..c5da6e9ab3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  __m256i s1, s2, r1, r2;
+  __m256i sum = _mm256_setzero_si256();
+  __m128i sum_i128;
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+    s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
+    s2 = _mm256_sad_epu8(
+        r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+    sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
+    ref_ptr += ref_stride << 1;
+    src_ptr += src_stride << 1;
+  }
+
+  sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
+  sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
+                           _mm256_castsi256_si128(sum));
+  return (unsigned int)_mm_cvtsi128_si32(sum_i128);
+}
+
+static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  unsigned int half_width = 32;
+  uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += half_width;
+  ref_ptr += half_width;
+  sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += src_stride << 5;
+  ref_ptr += ref_stride << 5;
+  sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride) {
+  unsigned int half_width = 64;
+  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += half_width;
+  ref_ptr += half_width;
+  sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride) {
+  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += src_stride << 6;
+  ref_ptr += ref_stride << 6;
+  sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                 const uint8_t *ref_ptr, int ref_stride) {
+  uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += src_stride << 6;
+  ref_ptr += ref_stride << 6;
+  sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+unsigned int aom_sad_skip_128x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                      const uint8_t *ref_ptr, int ref_stride) {
+  const uint32_t half_width = 64;
+  uint32_t sum = sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+  
src_ptr += half_width; + ref_ptr += half_width; + sum += sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2); + return 2 * sum; +} + +unsigned int aom_sad_skip_64x128_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint32_t sum = + sad64x64(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride); + return 2 * sum; +} + +unsigned int aom_sad_skip_128x128_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint32_t sum = + aom_sad128x64_avx2(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride); + return 2 * sum; +} + +static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int h, const uint8_t *second_pred, + const int second_pred_stride) { + int i; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + ref1_reg = _mm256_avg_epu8( + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); + ref2_reg = _mm256_avg_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + second_pred += second_pred_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); +} + +unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 64 << 6; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + return sum; +} + +unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + unsigned int half_width = 64; + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + src_ptr += half_width; + ref_ptr += half_width; + second_pred += half_width; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + return sum; +} + +unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, + ref_stride, second_pred); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 128 << 6; + sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, + second_pred); + return sum; +} diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm new file mode 100644 index 0000000000..dbe8ca3161 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm @@ -0,0 +1,432 @@ +; +; Copyright (c) 2016, 
Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows +%macro SAD_FN 4 +%if %4 == 0 ; normal sad +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if AOM_ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD128XN 1-2 0 + SAD_FN 128, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else + mov n_rowsd, %1 +%endif + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*4] + pavgb m2, [second_predq+mmsize*5] + pavgb m3, [second_predq+mmsize*6] + pavgb m4, [second_predq+mmsize*7] + lea second_predq, [second_predq+mmsize*8] +%endif + psadbw m1, [srcq+64] + psadbw m2, [srcq+80] + psadbw m3, [srcq+96] + psadbw m4, [srcq+112] + + add refq, ref_strideq + add srcq, src_strideq + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + sub n_rowsd, 1 + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD128XN 128 ; sad128x128_sse2 +SAD128XN 128, 1 ; sad128x128_avg_sse2 +SAD128XN 128, 2 ; sad128x128_skip_sse2 +SAD128XN 64 ; sad128x64_sse2 +SAD128XN 64, 1 ; sad128x64_avg_sse2 
+SAD128XN 64, 2 ; sad128x64_skip_sse2 + + +; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else + mov n_rowsd, %1 +%endif + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD64XN 128 ; sad64x128_sse2 +SAD64XN 64 ; sad64x64_sse2 +SAD64XN 32 ; sad64x32_sse2 +SAD64XN 16 ; sad64x16_sse2 +SAD64XN 128, 1 ; sad64x128_avg_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 16, 1 ; sad64x16_avg_sse2 +SAD64XN 128, 2 ; sad64x128_skip_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 +SAD64XN 16, 2 ; sad64x16_skip_sse2 + +; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else + mov n_rowsd, %1/2 +%endif + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD32XN 64 ; sad32x64_sse2 +SAD32XN 32 ; sad32x32_sse2 +SAD32XN 16 ; sad32x16_sse2 +SAD32XN 8 ; sad_32x8_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 8, 1 ; sad_32x8_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 +SAD32XN 8, 2 ; sad_32x8_skip_sse2 + +; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else + mov n_rowsd, %1/4 +%endif + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg 
.loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 64 ; sad_16x64_sse2 +SAD16XN 32 ; sad16x32_sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 +SAD16XN 4 ; sad_16x4_sse2 +SAD16XN 64, 1 ; sad_16x64_avg_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 4, 1 ; sad_16x4_avg_sse2 +SAD16XN 64, 2 ; sad_16x64_skip_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 + +; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else + mov n_rowsd, %1/4 +%endif + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 32 ; sad_8x32_sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 +SAD8XN 32, 1 ; sad_8x32_avg_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 32, 2 ; sad_8x32_skip_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 + +; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else + mov n_rowsd, %1/4 +%endif + pxor m0, m0 + +.loop: + movd m1, [refq] + movd m2, [refq+ref_strideq] + movd m3, [refq+ref_strideq*2] + movd m4, [refq+ref_stride3q] + punpckldq m1, m2 + punpckldq m3, m4 + movlhps m1, m3 +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + lea second_predq, [second_predq+mmsize*1] +%endif + movd m2, [srcq] + movd m5, [srcq+src_strideq] + movd m4, [srcq+src_strideq*2] + movd m3, [srcq+src_stride3q] + punpckldq m2, m5 + punpckldq m4, m3 + movlhps m2, m4 + psadbw m1, m2 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD4XN 16 ; sad_4x16_sse2 +SAD4XN 8 ; sad4x8_sse +SAD4XN 4 ; sad4x4_sse +SAD4XN 16, 1 ; sad_4x16_avg_sse2 +SAD4XN 8, 1 ; sad4x8_avg_sse +SAD4XN 4, 1 ; sad4x4_avg_sse +SAD4XN 16, 2 ; sad_4x16_skip_sse2 +SAD4XN 8, 2 ; sad4x8_skip_sse diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c new file mode 100644 index 0000000000..c5a5f5c234 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sse_avx2.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+                                const uint8_t *b) {
+  const __m256i v_a0 = yy_loadu_256(a);
+  const __m256i v_b0 = yy_loadu_256(b);
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+  const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+  const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+  const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
+  const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+  const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+  int64_t sum;
+  __m256i zero = _mm256_setzero_si256();
+  const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+  const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
+  const __m256i sum0_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
+  const __m256i sum1_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  *sum = _mm256_add_epi64(*sum, sum_4x64);
+}
+
+static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
+  int64_t sum;
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+#endif
+
+static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m256i *sum) {
+  const __m128i v_a0 = xx_loadl_32(a);
+  const __m128i v_a1 = xx_loadl_32(a + a_stride);
+  const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
+  const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
+  const __m128i v_b0 = xx_loadl_32(b);
+  const __m128i v_b1 = xx_loadl_32(b + b_stride);
+  const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
+  const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
+  const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
+                                             _mm_unpacklo_epi32(v_a2, v_a3));
+  const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
+                                             _mm_unpacklo_epi32(v_b2, v_b3));
+  const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+  const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void 
sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_a1 = xx_loadu_128(a + a_stride); + const __m128i v_b0 = xx_loadu_128(b); + const __m128i v_b1 = xx_loadu_128(b + b_stride); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + sse_w32_avx2(&sum, a + 64, b + 64); + sse_w32_avx2(&sum, a + 96, b + 96); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + const uint8_t *a2 = a + i + (a_stride << 1); + const uint8_t *b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void 
highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = yy_loadu_256(a); + const __m256i v_b_w = yy_loadu_256(b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); + const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), + _mm_unpacklo_epi64(v_a2, v_a3)); + const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), + _mm_unpacklo_epi64(v_b2, v_b3)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); + const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 128: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum32, a 
+ 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 16; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + const uint16_t *a2 = a + i + (a_stride << 1); + const uint16_t *b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c new file mode 100644 index 0000000000..7e74554d75 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sse_sse4.c @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_b0 = xx_loadu_128(b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + sse8_sse4_1(a, b, &sum); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + 
sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); + sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); + sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); + sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = xx_loadu_128(a); + const __m128i v_b_w = xx_loadu_128(b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < 
height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 128: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm new file mode 100644 index 0000000000..49bc655336 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro + +SECTION .text + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +globalsym(aom_ssim_parms_16x16_sse2) +sym(aom_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
+globalsym(aom_ssim_parms_8x8_sse2) +sym(aom_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm new file mode 100644 index 0000000000..d1d8373456 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm @@ -0,0 +1,1470 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +bilin_filter_m_ssse3: times 8 db 16, 0 + times 8 db 14, 2 + times 8 db 12, 4 + times 8 db 10, 6 + times 16 db 8 + times 8 db 6, 10 + times 8 db 4, 12 + times 8 db 2, 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 1 +%if %1 > 4 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. 
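+  ; Worked bound for the statement above: with W=16 each of the 8 words in m6
+  ; accumulates 2 pixel differences per row, each difference in [-255, 255]
+  ; (9 signed bits), so over H=64 rows the magnitude is at most
+  ; 2 * 64 * 255 = 32640, which just fits the signed word range
+  ; [-32768, 32767]. The value can still be negative, hence the sign mask
+  ; (pcmpgtw) and word->dword widening below before the dword adds.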
+ pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%else ; 4xh + pshuflw m4, m6, 0xe + pshuflw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshuflw m4, m6, 0xe + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 + +%if AOM_ARCH_X86_64 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + %define bilin_filter bilin_filter_m + %endif +%endif + +%if %1 == 4 + %define movx movd +%else + %define movx movh +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar block_height, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? 
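+  ; Overview of the dispatch below (illustrative): x_offset and y_offset are
+  ; eighth-pel phases in 0..7. Phase 0 needs no filtering, phase 4 is an
+  ; exact half-pel average (pavgb), and any other phase k uses the
+  ; (16 - 2k, 2k) weight pair from bilin_filter_m, i.e. roughly
+  ;   out = (a * (16 - 2k) + b * 2k + 8) >> 4
+  ; with pw_8 providing the rounding term. The branches below enumerate the
+  ; nine combinations of {zero, half, other} for the two axes.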
+ test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] +%endif + + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + +%if %2 == 1 ; avg +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET %1 + +.x_zero_y_nonzero: + cmp y_offsetd, 4 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%endif + movx m1, [dstq] +%if %1 > 4 + movlhps m0, m2 +%else ; 4xh + punpckldq m0, m2 +%endif + movx m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m4, [secq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq*2] + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET %1 + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a 
[y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonzero: + cmp x_offsetd, 4 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m4, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg 
+ movx m2, [srcq+src_strideq] + movx m1, [dstq] + pavgb m0, m4 + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET %1 + +.x_half_y_nonzero: + cmp y_offsetd, 4 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movx m2, [srcq] + movx m3, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + movx m1, [srcq+src_strideq] + punpckldq m2, m1 + movx m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%endif + pavgb m2, m3 +%if %1 > 4 + movlhps m0, m2 + movhlps m4, m2 +%else ; 4xh + punpckldq m0, m2 + pshuflw m4, m2, 0xe +%endif + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET %1 + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ;x86_32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. 
We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movx m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. 
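; The half-pel paths earlier in this macro (x_offset or y_offset == 4) need no
; multiply at all: pavgb/pavgw compute the rounded average, which is exactly
; the o == 4 bilinear tap pair (8, 8). Scalar sketch:
;
;   out = (a + b + 1) >> 1;   // == (a * 8 + b * 8 + 8) >> 4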
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movx m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 4 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. 
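; One output sample of the x-bilinear / y-half-pel loop above, roughly, as a
; scalar sketch (h_prev is the horizontally filtered sample carried over from
; the previous row and fx = 2 * x_offset; the SIMD code packs intermediates to
; bytes, so rounding order differs slightly):
;
;   h_cur = (s[x] * (16 - fx) + s[x + 1] * fx + 8) >> 4;   // horizontal bilin
;   out   = (h_prev + h_cur + 1) >> 1;                      // vertical half-pel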
+ packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movx m1, [dstq] + paddw m4, m3 + movx m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonhalf: +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. 
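; General case handled by the .x_other_y_other loops that follow (both offsets
; fractional): filter two adjacent rows horizontally, then filter those results
; vertically. Scalar sketch for one output sample, with fx = 2 * x_offset and
; fy = 2 * y_offset (names illustrative):
;
;   h0  = (row0[x] * (16 - fx) + row0[x + 1] * fx + 8) >> 4;
;   h1  = (row1[x] * (16 - fx) + row1[x + 1] * fx + 8) >> 4;
;   out = (h0 * (16 - fy) + h1 * fy + 8) >> 4;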
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + + INC_SRC_BY_SRC_STRIDE + + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + + INC_SRC_BY_SRC_STRIDE + movx m4, [srcq] + movx m3, [srcq+1] + +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m3, [dstq+dst_strideq] + movx m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw 
m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movx m3, [dstq+dst_strideq] + paddw m2, m1 + movx m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd +%undef movx + STORE_AND_RET %1 +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_XMM sse2 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c new file mode 100644 index 0000000000..b4c5cc7c7b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+                                   const uint8_t *pred_ptr) {
+  __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
+  __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
+  __m256i set_one_minusone = _mm256_set1_epi32((int)0xff01ff01);
+  __m256i diff0 = _mm256_unpacklo_epi8(s, p);
+  __m256i diff1 = _mm256_unpackhi_epi8(s, p);
+  diff0 = _mm256_maddubs_epi16(diff0, set_one_minusone);
+  diff1 = _mm256_maddubs_epi16(diff1, set_one_minusone);
+  _mm256_store_si256((__m256i *)(diff_ptr),
+                     _mm256_permute2x128_si256(diff0, diff1, 0x20));
+  _mm256_store_si256((__m256i *)(diff_ptr + 16),
+                     _mm256_permute2x128_si256(diff0, diff1, 0x31));
+}
+
+static INLINE void subtract_block_16xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
+    __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
+    __m256i s_0 = _mm256_cvtepu8_epi16(s);
+    __m256i p_0 = _mm256_cvtepu8_epi16(p);
+    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+    _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static INLINE void subtract_block_32xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static INLINE void subtract_block_64xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static INLINE void subtract_block_128xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
+    subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                             ptrdiff_t pred_stride) {
+  switch (cols) {
+    case 16:
+      subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    case 32:
+      subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    case 64:
+      subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+                               pred_ptr, pred_stride);
+      break;
+    case 128:
+      subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                src_stride, pred_ptr, pred_stride);
+      break;
+    default:
+      aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+                              src_stride, pred_ptr, pred_stride);
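/* All of the AVX2 kernels above compute the plain residual sketched below.
 * subtract32_avx2 obtains src - pred per 16-bit lane by interleaving src
 * (treated as unsigned) with pred and multiplying each byte pair by (+1, -1)
 * via _mm256_maddubs_epi16 (0xff01ff01 is the (+1, -1) byte pattern). The
 * scalar reference below is a restatement for clarity, not part of the source:
 *
 *   for (int r = 0; r < rows; ++r)
 *     for (int c = 0; c < cols; ++c)
 *       diff_ptr[r * diff_stride + c] =
 *           (int16_t)src_ptr[r * src_stride + c] -
 *           (int16_t)pred_ptr[r * pred_stride + c];
 */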
+ break; + } +} diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm new file mode 100644 index 0000000000..fd508c0916 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm @@ -0,0 +1,147 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void aom_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) + +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 + cmp colsd, 64 + je .case_64 + +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + movu m1, [predq+%3] + movu m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro + + mov pred_str, pred_stridemp +.loop_128: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize + loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + sub rowsd, 1 + jnz .loop_128 + RET + +.case_64: + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov pred_str, 
pred_stridemp
+.loop_4:
+  loop_h
+  lea diffq, [diffq+diff_strideq*4]
+  lea srcq, [srcq+src_strideq*2]
+  lea predq, [predq+pred_str*2]
+  sub rowsd, 2
+  jg .loop_4
+  emms
+  RET
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 0000000000..89b9b824bf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+                                                int width, int height) {
+  uint64_t result;
+  __m256i v_acc_q = _mm256_setzero_si256();
+  const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0);
+  for (int col = 0; col < height; col += 4) {
+    __m256i v_acc_d = _mm256_setzero_si256();
+    for (int row = 0; row < width; row += 16) {
+      const int16_t *tempsrc = src + row;
+      const __m256i v_val_0_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+      const __m256i v_val_1_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+      const __m256i v_val_2_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+      const __m256i v_val_3_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+
+      const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
+
+      v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
+    }
+    v_acc_q =
+        _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
+    v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
+    src += 4 * stride;
+  }
+  __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
+  __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
+  __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
+
+  result_64_2_int = _mm_add_epi64(
+      result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
+
+  xx_storel_64(&result, result_64_2_int);
+
+  return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+                                     int height) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+    return
aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +static uint64_t aom_sum_sse_2d_i16_nxn_avx2(const int16_t *src, int stride, + int width, int height, int *sum) { + uint64_t result; + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i one_reg = _mm256_set1_epi16(1); + + __m256i v_sse_total = zero_reg; + __m256i v_sum_total = zero_reg; + + for (int col = 0; col < height; col += 4) { + __m256i v_sse_row = zero_reg; + for (int row = 0; row < width; row += 16) { + const int16_t *tempsrc = src + row; + const __m256i v_val_0_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride)); + const __m256i v_val_1_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride)); + const __m256i v_val_2_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride)); + const __m256i v_val_3_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride)); + + const __m256i v_sum_01 = _mm256_add_epi16(v_val_0_w, v_val_1_w); + const __m256i v_sum_23 = _mm256_add_epi16(v_val_2_w, v_val_3_w); + __m256i v_sum_0123 = _mm256_add_epi16(v_sum_01, v_sum_23); + v_sum_0123 = _mm256_madd_epi16(v_sum_0123, one_reg); + v_sum_total = _mm256_add_epi32(v_sum_total, v_sum_0123); + + const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w); + const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w); + const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w); + const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w); + const __m256i v_sq_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d); + const __m256i v_sq_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d); + const __m256i v_sq_0123_d = _mm256_add_epi32(v_sq_01_d, v_sq_23_d); + v_sse_row = _mm256_add_epi32(v_sse_row, v_sq_0123_d); + } + const __m256i v_sse_row_low = _mm256_unpacklo_epi32(v_sse_row, zero_reg); + const __m256i v_sse_row_hi = _mm256_unpackhi_epi32(v_sse_row, zero_reg); + v_sse_row = _mm256_add_epi64(v_sse_row_low, v_sse_row_hi); + v_sse_total = _mm256_add_epi64(v_sse_total, v_sse_row); + src += 4 * stride; + } + + const __m128i v_sum_total_low = _mm256_castsi256_si128(v_sum_total); + const __m128i v_sum_total_hi = _mm256_extracti128_si256(v_sum_total, 1); + __m128i sum_128bit = _mm_add_epi32(v_sum_total_hi, v_sum_total_low); + sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 8)); + sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 4)); + *sum += _mm_cvtsi128_si32(sum_128bit); + + __m128i v_sse_total_lo = _mm256_castsi256_si128(v_sse_total); + __m128i v_sse_total_hi = _mm256_extracti128_si256(v_sse_total, 1); + __m128i sse_128bit = _mm_add_epi64(v_sse_total_lo, v_sse_total_hi); + + sse_128bit = + _mm_add_epi64(sse_128bit, _mm_unpackhi_epi64(sse_128bit, sse_128bit)); + + xx_storel_64(&result, sse_128bit); + + return result; +} + +uint64_t aom_sum_sse_2d_i16_avx2(const int16_t *src, int src_stride, int width, + int height, int *sum) { + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum); + } else if (LIKELY(width == 8 && (height & 3) == 0)) { + return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum); + } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) { + return aom_sum_sse_2d_i16_nxn_avx2(src, src_stride, width, height, sum); + } else { + return aom_sum_sse_2d_i16_c(src, src_stride, width, 
height, sum); + } +} + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 2); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + return _mm_extract_epi16(vtmp1, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + return _mm_cvtsi128_si32(vtmp1); +} + +uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i v_acc_sqs = vzero; + int i, j; + + // Process 32 elements in a row + for (i = 0; i < width - 31; i += 32) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = srcp1 
+ i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c new file mode 100644 index 0000000000..cf3ed98974 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+  const __m128d ad = _mm_castsi128_pd(a);
+  return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
+}
+
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if AOM_ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+  {
+    uint64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, a);
+    return tmp;
+  }
+#endif
+}
+
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+
+  return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
+  const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
+  __m128i v_sum_d =
+      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
+  v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
+  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum) {
+  const __m128i one_reg = _mm_set1_epi16(1);
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+
+  __m128i v_sum_0123_d = _mm_add_epi16(v_val_01_w, v_val_23_w);
+  v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+  v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 8));
+  v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 4));
+  *sum = _mm_cvtsi128_si32(v_sum_0123_d);
+
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+  __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+  v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 8));
+  v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 4));
+  return (uint64_t)_mm_cvtsi128_si32(v_sq_0123_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                         int height) {
+  int r = 0;
+  __m128i v_acc_q = _mm_setzero_si128();
+  do {
+    const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
+    v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
+  __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
+                                   _mm_and_si128(v_acc_q, v_zext_mask_q));
+  v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
+  return xx_cvtsi128_si64(v_acc_64);
+}
+
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+                                     int *sum) {
+  int r = 0;
+  uint64_t sse = 0;
+  do {
+    int curr_sum = 0;
+    sse += aom_sum_sse_2d_i16_4x4_sse2(src, stride, &curr_sum);
+    *sum += curr_sum;
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+  return sse;
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_squares_2d_i16_sse2, which in turn saves some
stack +// maintenance instructions in the common case of 4x4. +__attribute__((noinline)) +#endif +uint64_t +aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, + int height) { + int r = 0; + + const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0); + __m128i v_acc_q = _mm_setzero_si128(); + + do { + __m128i v_acc_d = _mm_setzero_si128(); + int c = 0; + do { + const int16_t *b = src + c; + + const __m128i v_val_0_w = xx_load_128(b + 0 * stride); + const __m128i v_val_1_w = xx_load_128(b + 1 * stride); + const __m128i v_val_2_w = xx_load_128(b + 2 * stride); + const __m128i v_val_3_w = xx_load_128(b + 3 * stride); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + c += 8; + } while (c < width); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 4 * stride; + r += 4; + } while (r < height); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + return xx_cvtsi128_si64(v_acc_q); +} + +#ifdef __GNUC__ +// This prevents GCC/Clang from inlining this function into +// aom_sum_sse_2d_i16_nxn_sse2, which in turn saves some stack +// maintenance instructions in the common case of 4x4. +__attribute__((noinline)) +#endif +uint64_t +aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, + int height, int *sum) { + int r = 0; + uint64_t result; + const __m128i zero_reg = _mm_setzero_si128(); + const __m128i one_reg = _mm_set1_epi16(1); + + __m128i v_sse_total = zero_reg; + __m128i v_sum_total = zero_reg; + + do { + int c = 0; + __m128i v_sse_row = zero_reg; + do { + const int16_t *b = src + c; + + __m128i v_val_0_w = xx_load_128(b + 0 * stride); + __m128i v_val_1_w = xx_load_128(b + 1 * stride); + __m128i v_val_2_w = xx_load_128(b + 2 * stride); + __m128i v_val_3_w = xx_load_128(b + 3 * stride); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sq_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d); + v_sse_row = _mm_add_epi32(v_sse_row, v_sq_0123_d); + + const __m128i v_sum_01 = _mm_add_epi16(v_val_0_w, v_val_1_w); + const __m128i v_sum_23 = _mm_add_epi16(v_val_2_w, v_val_3_w); + __m128i v_sum_0123_d = _mm_add_epi16(v_sum_01, v_sum_23); + v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg); + v_sum_total = _mm_add_epi32(v_sum_total, v_sum_0123_d); + + c += 8; + } while (c < width); + + const __m128i v_sse_row_low = _mm_unpacklo_epi32(v_sse_row, zero_reg); + const __m128i v_sse_row_hi = _mm_unpackhi_epi32(v_sse_row, zero_reg); + v_sse_row = _mm_add_epi64(v_sse_row_low, v_sse_row_hi); + v_sse_total = _mm_add_epi64(v_sse_total, v_sse_row); + src += 4 * stride; + r += 4; + } while (r < height); + + v_sum_total = _mm_add_epi32(v_sum_total, 
_mm_srli_si128(v_sum_total, 8)); + v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 4)); + *sum += _mm_cvtsi128_si32(v_sum_total); + + v_sse_total = _mm_add_epi64(v_sse_total, _mm_srli_si128(v_sse_total, 8)); + xx_storel_64(&result, v_sse_total); + return result; +} + +uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, + int height) { + // 4 elements per row only requires half an XMM register, so this + // must be a special case, but also note that over 75% of all calls + // are with size == 4, so it is also the common case. + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { + // Generic case + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width, + int height, int *sum) { + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum); + } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { + // Generic case + return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum); + } else { + return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum); + } +} + +////////////////////////////////////////////////////////////////////////////// +// 1D version +////////////////////////////////////////////////////////////////////////////// + +static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { + const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0); + __m128i v_acc0_q = _mm_setzero_si128(); + __m128i v_acc1_q = _mm_setzero_si128(); + + const int16_t *const end = src + n; + + assert(n % 64 == 0); + + while (src < end) { + const __m128i v_val_0_w = xx_load_128(src); + const __m128i v_val_1_w = xx_load_128(src + 8); + const __m128i v_val_2_w = xx_load_128(src + 16); + const __m128i v_val_3_w = xx_load_128(src + 24); + const __m128i v_val_4_w = xx_load_128(src + 32); + const __m128i v_val_5_w = xx_load_128(src + 40); + const __m128i v_val_6_w = xx_load_128(src + 48); + const __m128i v_val_7_w = xx_load_128(src + 56); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d); + + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, 
v_zext_mask_q)); + v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32)); + + src += 64; + } + + v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); + return xx_cvtsi128_si64(v_acc0_q); +} + +uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { + if (n % 64 == 0) { + return aom_sum_squares_i16_64n_sse2(src, n); + } else if (n > 64) { + const uint32_t k = n & ~63u; + return aom_sum_squares_i16_64n_sse2(src, k) + + aom_sum_squares_i16_c(src + k, n - k); + } else { + return aom_sum_squares_i16_c(src, n); + } +} + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 2); + vec_a = _mm_add_epi16(vec_a, vtmp); + return _mm_extract_epi16(vec_a, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi32(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi32(vec_a, vtmp); + return _mm_cvtsi128_si32(vec_a); +} + +uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + 
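// As in the 8-bit path above, s accumulates the sum of the samples and ss the +  // sum of their squares; the value returned below, ss - s*s/(width*height), is +  // (up to integer truncation) width*height times the variance. +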
uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 8 elements in a row + for (i = 0; i < width - 8; i += 8) { + srcp = srcp1 + i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h new file mode 100644 index 0000000000..5ed3f2c7bf --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_ +#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_ + +uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, + int width, int height); + +uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height); +uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride); + +uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum); +uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height, + int *sum); +uint64_t aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, + int height, int *sum); + +#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h new file mode 100644 index 0000000000..6744ec51d0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_H_ + +#include +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +/** + * Various reusable shorthands for x86 SIMD intrinsics. + * + * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. + * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. + */ + +// Loads and stores to do away with the tedium of casting the address +// to the right type. +static INLINE __m128i xx_loadl_32(const void *a) { + int val; + memcpy(&val, a, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +static INLINE __m128i xx_loadl_64(const void *a) { + return _mm_loadl_epi64((const __m128i *)a); +} + +static INLINE __m128i xx_load_128(const void *a) { + return _mm_load_si128((const __m128i *)a); +} + +static INLINE __m128i xx_loadu_128(const void *a) { + return _mm_loadu_si128((const __m128i *)a); +} + +static INLINE void xx_storel_32(void *const a, const __m128i v) { + const int val = _mm_cvtsi128_si32(v); + memcpy(a, &val, sizeof(val)); +} + +static INLINE void xx_storel_64(void *const a, const __m128i v) { + _mm_storel_epi64((__m128i *)a, v); +} + +static INLINE void xx_store_128(void *const a, const __m128i v) { + _mm_store_si128((__m128i *)a, v); +} + +static INLINE void xx_storeu_128(void *const a, const __m128i v) { + _mm_storeu_si128((__m128i *)a, v); +} + +// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm_set_epi64x() +// acting on 32-bit integers. +static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) { +#if defined(_MSC_VER) && _MSC_VER < 1900 + return _mm_set_epi32(0, e1, 0, e0); +#else + return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0); +#endif +} + +// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm_set1_epi64x() +// acting on a 32-bit integer. 
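+// Casting the argument to uint32_t zero-extends it into each 64-bit lane, so +// xx_set1_64_from_32i(~0) yields 0x00000000ffffffff per lane, which callers use +// as a mask that keeps only the low 32 bits of each 64-bit element.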
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) { +#if defined(_MSC_VER) && _MSC_VER < 1900 + return _mm_set_epi32(0, a, 0, a); +#else + return _mm_set1_epi64x((uint32_t)a); +#endif +} + +// Fill an SSE register using an interleaved pair of values, ie. set the +// 8 channels to {a, b, a, b, a, b, a, b}, using the same channel ordering +// as when a register is stored to / loaded from memory. +// +// This is useful for rearranging filter kernels for use with the _mm_madd_epi16 +// instruction +static INLINE __m128i xx_set2_epi16(int16_t a, int16_t b) { + return _mm_setr_epi16(a, b, a, b, a, b, a, b); +} + +static INLINE __m128i xx_round_epu16(__m128i v_val_w) { + return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); +} + +static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { + const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1); + return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); +} + +static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srli_epi32(v_tmp_d, bits); +} + +static INLINE __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d); + return _mm_srai_epi16(v_tmp_d, bits); +} + +// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); + const __m128i v_tmp_d = + _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi16(v_tmp_d, bits); +} + +#endif // AOM_AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h new file mode 100644 index 0000000000..b729e5f410 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ + +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +/** + * Various reusable shorthands for x86 SIMD intrinsics. + * + * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. + * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. + */ + +// Loads and stores to do away with the tedium of casting the address +// to the right type. 
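+// The yy_load_256/yy_store_256 forms require 32-byte-aligned addresses, while +// the yy_loadu_256/yy_storeu_256 forms accept unaligned pointers, mirroring the +// 128-bit xx_ helpers.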
+static INLINE __m256i yy_load_256(const void *a) { + return _mm256_load_si256((const __m256i *)a); +} + +static INLINE __m256i yy_loadu_256(const void *a) { + return _mm256_loadu_si256((const __m256i *)a); +} + +static INLINE void yy_store_256(void *const a, const __m256i v) { + _mm256_store_si256((__m256i *)a, v); +} + +static INLINE void yy_storeu_256(void *const a, const __m256i v) { + _mm256_storeu_si256((__m256i *)a, v); +} + +// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm256_set1_epi64x() +// acting on a 32-bit integer. +static INLINE __m256i yy_set1_64_from_32i(int32_t a) { +#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a); +#else + return _mm256_set1_epi64x((uint32_t)a); +#endif +} + +// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We +// therefore define an equivalent function using a different intrinsic. +// ([ hi ], [ lo ]) -> [ hi ][ lo ] +static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); +} + +static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { + __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); + __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); + return yy_set_m128i(mhi, mlo); +} + +static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) { + _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); + _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a)); +} + +static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { + const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); + return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); +} +#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h new file mode 100644 index 0000000000..9dab750f44 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h @@ -0,0 +1,424 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ + +#include // SSE2 + +#include "config/aom_config.h" + +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 16 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 01 11 21 31 __ __ __ __ + // out[2]: 02 12 22 32 03 13 23 33 + // out[3]: 03 13 23 33 __ __ __ __ + // + // Note: The high 64 bits of the output registers are shown for informational + // purposes only. Callers should only use the low 64 bits of the output + // registers. "__" indicates zeros. + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_srli_si128(out[0], 8); + out[2] = _mm_unpackhi_epi32(a0, a1); + out[3] = _mm_srli_si128(out[2], 8); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b2: 04 14 24 34 05 15 25 35 + // b4: 02 12 22 32 03 13 23 33 + // b6: 06 16 26 36 07 17 27 37 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 XX XX XX XX + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 XX XX XX XX + // out[3]: 03 13 23 33 XX XX XX XX + // out[4]: 04 14 24 34 XX XX XX XX + // out[5]: 05 15 25 35 XX XX XX XX + // out[6]: 06 16 26 36 XX XX XX XX + // out[7]: 07 17 27 37 XX XX XX XX + const __m128i zeros = _mm_setzero_si128(); + out[0] = _mm_unpacklo_epi64(b0, zeros); + out[1] = _mm_unpackhi_epi64(b0, zeros); + out[2] = _mm_unpacklo_epi64(b4, zeros); + out[3] = _mm_unpackhi_epi64(b4, zeros); + out[4] = _mm_unpacklo_epi64(b2, zeros); + out[5] = _mm_unpackhi_epi64(b2, zeros); + out[6] = _mm_unpacklo_epi64(b6, zeros); + out[7] = _mm_unpackhi_epi64(b6, zeros); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h new file mode 100644 index 0000000000..4105250bc0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ + +#include +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1, + __m256i *in0, __m256i *in1, const __m256i _r, + const int32_t cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, w0); + __m256i u1 = _mm256_madd_epi16(t1, w0); + __m256i v0 = _mm256_madd_epi16(t0, w1); + __m256i v1 = _mm256_madd_epi16(t1, w1); + + __m256i a0 = _mm256_add_epi32(u0, _r); + __m256i a1 = _mm256_add_epi32(u1, _r); + __m256i b0 = _mm256_add_epi32(v0, _r); + __m256i b1 = _mm256_add_epi32(v1, _r); + + __m256i c0 = _mm256_srai_epi32(a0, cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, cos_bit); + + *in0 = _mm256_packs_epi32(c0, c1); + *in1 = _mm256_packs_epi32(d0, d1); +} + +static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 = *in1; + *in0 = _mm256_adds_epi16(_in0, _in1); + *in1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 = *in1; + *in0 = _mm256_add_epi32(_in0, _in1); + *in1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_adds_epi16(_in0, _in1); + *out1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_add_epi32(_in0, _in1); + *out1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) { + return _mm256_load_si256((const __m256i *)a); +} + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, + int stride, + __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { + const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); + const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); + return _mm256_permute4x64_epi64(b, 0xD8); +} + +static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride); + } +} + +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (int i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 
1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (int i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (int i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in, + __m256i *const out) { + const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]); + const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]); + const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]); + const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]); + const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]); + const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]); + const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]); + const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]); + + const __m256i b0 = _mm256_unpacklo_epi32(a0, a1); + const __m256i b1 = _mm256_unpacklo_epi32(a2, a3); + const __m256i b2 = _mm256_unpacklo_epi32(a4, a5); + const __m256i b3 = _mm256_unpacklo_epi32(a6, a7); + const __m256i b4 = _mm256_unpackhi_epi32(a0, a1); + const __m256i b5 = _mm256_unpackhi_epi32(a2, a3); + const __m256i b6 = _mm256_unpackhi_epi32(a4, a5); + const __m256i b7 = _mm256_unpackhi_epi32(a6, a7); + + out[0] = _mm256_unpacklo_epi64(b0, b1); + out[1] = _mm256_unpackhi_epi64(b0, b1); + out[2] = _mm256_unpacklo_epi64(b4, b5); + out[3] = _mm256_unpackhi_epi64(b4, b5); + out[4] = _mm256_unpacklo_epi64(b2, b3); + out[5] = _mm256_unpackhi_epi64(b2, b3); + out[6] = _mm256_unpacklo_epi64(b6, b7); + out[7] = _mm256_unpackhi_epi64(b6, b7); +} + +static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm256_adds_epi16(in[i], round); + in[i] = _mm256_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for 
(int i = 0; i < size; ++i) { + in[i] = _mm256_slli_epi16(in[i], bit); + } + } +} + +static INLINE __m256i round_shift_32_avx2(__m256i vec, int bit) { + __m256i tmp, round; + round = _mm256_set1_epi32(1 << (bit - 1)); + tmp = _mm256_add_epi32(vec, round); + return _mm256_srai_epi32(tmp, bit); +} + +static INLINE void round_shift_array_32_avx2(__m256i *input, __m256i *output, + const int size, const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = round_shift_32_avx2(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm256_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void round_shift_rect_array_32_avx2(__m256i *input, + __m256i *output, + const int size, const int bit, + const int val) { + const __m256i sqrt2 = _mm256_set1_epi32(val); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = round_shift_32_avx2(input[i], bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = round_shift_32_avx2(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = _mm256_slli_epi32(input[i], -bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = round_shift_32_avx2(r1, NewSqrt2Bits); + } + } +} + +static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { + const __m256i scale_rounding = + pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m256i b = _mm256_madd_epi16(a, scale_rounding); + return _mm256_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31); + _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo)); + _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi)); + _mm256_store_si256((__m256i *)(b + 64), temp); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride); + } +} + +static INLINE void pack_reg(const __m128i *in1, const __m128i *in2, + __m256i *out) { + out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1); + out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1); + out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1); + out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1); + out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1); + out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1); + out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1); + out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1); +} + +static INLINE void extract_reg(const __m256i *in, __m128i *out1) { + out1[0] = _mm256_castsi256_si128(in[0]); + out1[1] = _mm256_castsi256_si128(in[1]); + out1[2] = _mm256_castsi256_si128(in[2]); + out1[3] = _mm256_castsi256_si128(in[3]); + out1[4] = _mm256_castsi256_si128(in[4]); + out1[5] = _mm256_castsi256_si128(in[5]); + out1[6] = _mm256_castsi256_si128(in[6]); + out1[7] = 
_mm256_castsi256_si128(in[7]); + + out1[8] = _mm256_extracti128_si256(in[0], 0x01); + out1[9] = _mm256_extracti128_si256(in[1], 0x01); + out1[10] = _mm256_extracti128_si256(in[2], 0x01); + out1[11] = _mm256_extracti128_si256(in[3], 0x01); + out1[12] = _mm256_extracti128_si256(in[4], 0x01); + out1[13] = _mm256_extracti128_si256(in[5], 0x01); + out1[14] = _mm256_extracti128_si256(in[6], 0x01); + out1[15] = _mm256_extracti128_si256(in[7], 0x01); +} + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h new file mode 100644 index 0000000000..9c99eb93bd --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ + +#include +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#define pair_set_epi16(a, b) \ + _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) + +// Reverse the 8 16 bit words in __m128i +static INLINE __m128i mm_reverse_epi16(const __m128i x) { + const __m128i a = _mm_shufflelo_epi16(x, 0x1b); + const __m128i b = _mm_shufflehi_epi16(a, 0x1b); + return _mm_shuffle_epi32(b, 0x4e); +} + +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ + _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ + (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) + +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c new file mode 100644 index 0000000000..046d6f10f8 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -0,0 +1,961 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" +#include "aom_dsp/x86/synonyms.h" + +static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) { + return _mm_add_epi16(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); +} + +static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) { + return _mm_add_epi32(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); +} + +static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1) + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); +} + +static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + return _mm_extract_epi32(res, 1); +} + +// handle pixels (<= 512) +static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64); + return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse); +} + +// handle 1024 pixels (32x32, 16x64, 64x16) +static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = + _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +// handle 2048 pixels (32x64, 64x32) +static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + vsum = sum_to_32bit_avx2(vsum); + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); + return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const 
uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance128_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum); + variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512) +AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512) +AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512) + +AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512) +AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024) +AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048) + +AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048) + +#if !CONFIG_REALTIME_ONLY +AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024) +AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512) +AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024) +AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512) 
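+// For each (bw, bh) pair above, the third macro argument is log2(bw * bh) and +// the fourth selects the final reduction routine sized to the pixel count +// (<= 512, 1024 or 2048 pixels). For example, AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512) +// defines aom_variance16x16_avx2(), which reduces with variance_final_512_avx2() +// and returns *sse - (uint32_t)(((int64_t)sum * sum) >> 8), i.e. SSE - sum^2/256.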
+#endif + +#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum = _mm256_setzero_si256(); \ + for (int i = 0; i < (bh / uh); i++) { \ + __m256i vsum16; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \ + src += uh * src_stride; \ + ref += uh * ref_stride; \ + } \ + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \ + const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_LOOP_AVX2(64, 64, 12, 32) // 64x32 * ( 64/32) +AOM_VAR_LOOP_AVX2(64, 128, 13, 32) // 64x32 * (128/32) +AOM_VAR_LOOP_AVX2(128, 64, 13, 16) // 128x16 * ( 64/16) +AOM_VAR_LOOP_AVX2(128, 128, 14, 16) // 128x16 * (128/16) + +unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + +static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + +static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, + const __m256i a, + uint8_t *comp_pred) { + const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; + const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); + + const __m256i ma = _mm256_sub_epi8(alpha_max, a); + + const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); + const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); + const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); + const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); + + const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); + const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); + const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); + const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); + + const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); + _mm256_storeu_si256((__m256i *)(comp_pred), roundA); +} + +void aom_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int row = 0; + if (width == 8) { + do { + const __m256i pred_0123 = _mm256_loadu_si256((const __m256i *)(pred)); + const __m128i ref_0 = _mm_loadl_epi64((const __m128i *)(ref)); + const __m128i ref_1 = + _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); + const __m128i ref_2 = + _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride)); + const __m128i ref_3 = + _mm_loadl_epi64((const __m128i *)(ref + 3 * ref_stride)); + const __m128i ref_01 = _mm_unpacklo_epi64(ref_0, ref_1); + const __m128i ref_23 = _mm_unpacklo_epi64(ref_2, ref_3); + + const __m256i ref_0123 = + _mm256_inserti128_si256(_mm256_castsi128_si256(ref_01), ref_23, 1); + const __m256i average = _mm256_avg_epu8(pred_0123, ref_0123); + _mm256_storeu_si256((__m256i 
*)(comp_pred), average); + + row += 4; + pred += 32; + comp_pred += 32; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 16) { + do { + const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred)); + const __m256i pred_1 = _mm256_loadu_si256((const __m256i *)(pred + 32)); + const __m256i tmp0 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref))); + const __m256i ref_0 = _mm256_inserti128_si256( + tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1); + const __m256i tmp1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride))); + const __m256i ref_1 = _mm256_inserti128_si256( + tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_storeu_si256((__m256i *)(comp_pred), average_0); + _mm256_storeu_si256((__m256i *)(comp_pred + 32), average_1); + + row += 4; + pred += 64; + comp_pred += 64; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 32) { + do { + const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred)); + const __m256i pred_1 = _mm256_loadu_si256((const __m256i *)(pred + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref)); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_storeu_si256((__m256i *)(comp_pred), average_0); + _mm256_storeu_si256((__m256i *)(comp_pred + 32), average_1); + + row += 2; + pred += 64; + comp_pred += 64; + ref += 2 * ref_stride; + } while (row < height); + } else if (width % 64 == 0) { + do { + for (int x = 0; x < width; x += 64) { + const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred + x)); + const __m256i pred_1 = + _mm256_loadu_si256((const __m256i *)(pred + x + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x)); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + x + 32)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_storeu_si256((__m256i *)(comp_pred + x), average_0); + _mm256_storeu_si256((__m256i *)(comp_pred + x + 32), average_1); + } + row++; + pred += width; + comp_pred += width; + ref += ref_stride; + } while (row < height); + } else { + aom_comp_avg_pred_c(comp_pred, pred, width, height, ref, ref_stride); + } +} + +void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i = 0; + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + const __m256i sA0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sA1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aA = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + const __m256i sB0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sB1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aB = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + // comp_pred's stride == width == 16 + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (16 << 2); + i += 4; + } while (i < height); + } else { + do { + for (int x = 0; x < width; x += 32) { + const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0 + x)); + const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1 + x)); + const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask + x)); + + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_pred += 32; + } + src0 += stride0; + src1 += stride1; + mask += mask_stride; + i++; + } while (i < height); + } +} + +static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, + const __m256i s1, + const __m256i a) { + const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i a_inv = _mm256_sub_epi16(alpha_max, a); + + const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv); + const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo); + const __m256i pred_l = _mm256_srai_epi32( + _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1); + const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv); + const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi); + const __m256i pred_h = _mm256_srai_epi32( + _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i comp = _mm256_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + const __m256i zero = _mm256_setzero_si256(); + + if (width == 8) { + do { + const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); + const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); + + const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); + + __m256i m = _mm256_castsi128_si256(m_l); + m = _mm256_insertf128_si256(m, m_h, 1); + const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); + + _mm_storeu_si128((__m128i *)(comp_pred + width), + _mm256_extractf128_si256(comp, 1)); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + comp_pred += (width << 1); + i += 2; + } while (i < height); + } else if (width == 16) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); + const __m256i m_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else { + do { + for (int x = 0; x < width; x += 32) { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0 + x)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + x + 16)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1 + x)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + x + 16)); + + const __m256i m01_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + x))); + const __m256i m23_16 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((const __m128i *)(mask + x + 16))); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); + const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); + + comp_pred += 32; + } + src0 += stride0; + src1 += stride1; + mask += mask_stride; + i += 1; + } while (i < height); + } +} + +uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { + uint64_t sum = 0; + __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8; + __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16; + __m256i src0_8x16, src1_8x16, dst_16x16, src_16x16; + __m256i res0_4x64, res1_4x64; + __m256i sub_result; + const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + for (int i = 0; i < h; i += 4) { + dst0_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride])); + dst1_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride])); + dst2_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 2) * dstride])); + dst3_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 3) * dstride])); + dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8), + _mm_unpacklo_epi32(dst2_4x8, dst3_4x8)); + dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8); + + src0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride])); + src1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride])); + src2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride])); + src3_4x16 = _mm_loadl_epi64((__m128i 
const *)(&src[(i + 3) * sstride])); + src0_8x16 = + _mm256_castsi128_si256(_mm_unpacklo_epi64(src0_4x16, src1_4x16)); + src1_8x16 = + _mm256_castsi128_si256(_mm_unpacklo_epi64(src2_4x16, src3_4x16)); + src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20); + + // r15 r14 r13------------r1 r0 - 16 bit + sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16)); + + // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit + src_16x16 = _mm256_madd_epi16(sub_result, sub_result); + + // accumulation of result + square_result = _mm256_add_epi32(square_result, src_16x16); + } + + // s5 s4 s1 s0 - 64bit + res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros); + // s7 s6 s3 s2 - 64bit + res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros); + // r3 r2 r1 r0 - 64bit + res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64); + // r1+r3 r2+r0 - 64bit + const __m128i sum_1x64 = + _mm_add_epi64(_mm256_castsi256_si128(res0_4x64), + _mm256_extracti128_si256(res0_4x64, 1)); + xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8))); + return sum; +} + +// Compute mse of four consecutive 4x4 blocks. +// In src buffer, each 4x4 block in a 32x32 filter block is stored sequentially. +// Hence src_blk_stride is same as block width. Whereas dst buffer is a frame +// buffer, thus dstride is a frame level stride. +uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int src_blk_stride, int h) { + uint64_t sum = 0; + __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8; + __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16; + __m256i res0_4x64, res1_4x64; + __m256i sub_result_0, sub_result_1, sub_result_2, sub_result_3; + const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + __m256i square_result = zeros; + uint16_t *src_temp = src; + + for (int i = 0; i < h; i += 4) { + dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride])); + dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride])); + dst2_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 2) * dstride])); + dst3_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 3) * dstride])); + + // row0 of 1st,2nd, 3rd and 4th 4x4 blocks- d00 d10 d20 d30 + dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8); + // row1 of 1st,2nd, 3rd and 4th 4x4 blocks - d01 d11 d21 d31 + dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8); + // row2 of 1st,2nd, 3rd and 4th 4x4 blocks - d02 d12 d22 d32 + dst2_16x16 = _mm256_cvtepu8_epi16(dst2_16x8); + // row3 of 1st,2nd, 3rd and 4th 4x4 blocks - d03 d13 d23 d33 + dst3_16x16 = _mm256_cvtepu8_epi16(dst3_16x8); + + // All rows of 1st 4x4 block - r00 r01 r02 r03 + __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0])); + // All rows of 2nd 4x4 block - r10 r11 r12 r13 + __m256i src1_16x16 = + _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride])); + // All rows of 3rd 4x4 block - r20 r21 r22 r23 + __m256i src2_16x16 = + _mm256_loadu_si256((__m256i const *)(&src_temp[2 * src_blk_stride])); + // All rows of 4th 4x4 block - r30 r31 r32 r33 + __m256i src3_16x16 = + _mm256_loadu_si256((__m256i const *)(&src_temp[3 * src_blk_stride])); + + // r00 r10 r02 r12 + __m256i tmp0_16x16 = _mm256_unpacklo_epi64(src0_16x16, src1_16x16); + // r01 r11 r03 r13 + __m256i tmp1_16x16 = _mm256_unpackhi_epi64(src0_16x16, src1_16x16); + // r20 r30 r22 r32 + __m256i tmp2_16x16 = _mm256_unpacklo_epi64(src2_16x16, src3_16x16); + // r21 r31 r23 r33 + __m256i tmp3_16x16 = _mm256_unpackhi_epi64(src2_16x16, src3_16x16); + + // r00 r10 r20 r30 + src0_16x16 = 
_mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x20); + // r01 r11 r21 r31 + src1_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 0x20); + // r02 r12 r22 r32 + src2_16x16 = _mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x31); + // r03 r13 r23 r33 + src3_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 0x31); + + // r15 r14 r13------------r1 r0 - 16 bit + sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(src0_16x16, dst0_16x16)); + sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(src1_16x16, dst1_16x16)); + sub_result_2 = _mm256_abs_epi16(_mm256_sub_epi16(src2_16x16, dst2_16x16)); + sub_result_3 = _mm256_abs_epi16(_mm256_sub_epi16(src3_16x16, dst3_16x16)); + + // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit + src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0); + src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1); + src2_16x16 = _mm256_madd_epi16(sub_result_2, sub_result_2); + src3_16x16 = _mm256_madd_epi16(sub_result_3, sub_result_3); + + // accumulation of result + src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16); + src2_16x16 = _mm256_add_epi32(src2_16x16, src3_16x16); + const __m256i square_result_0 = _mm256_add_epi32(src0_16x16, src2_16x16); + square_result = _mm256_add_epi32(square_result, square_result_0); + src_temp += 16; + } + + // s5 s4 s1 s0 - 64bit + res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros); + // s7 s6 s3 s2 - 64bit + res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros); + // r3 r2 r1 r0 - 64bit + res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64); + // r1+r3 r2+r0 - 64bit + const __m128i sum_1x64 = + _mm_add_epi64(_mm256_castsi256_si128(res0_4x64), + _mm256_extracti128_si256(res0_4x64, 1)); + xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8))); + return sum; +} + +uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { + uint64_t sum = 0; + __m128i dst0_8x8, dst1_8x8, dst3_16x8; + __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16; + __m256i res0_4x64, res1_4x64; + __m256i sub_result; + const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + + for (int i = 0; i < h; i += 2) { + dst0_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride])); + dst1_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride])); + dst3_16x8 = _mm_unpacklo_epi64(dst0_8x8, dst1_8x8); + dst_16x16 = _mm256_cvtepu8_epi16(dst3_16x8); + + src0_8x16 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride])); + src1_8x16 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride])); + src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20); + + // r15 r14 r13 - - - r1 r0 - 16 bit + sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16)); + + // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit + src_16x16 = _mm256_madd_epi16(sub_result, sub_result); + + // accumulation of result + square_result = _mm256_add_epi32(square_result, src_16x16); + } + + // s5 s4 s1 s0 - 64bit + res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros); + // s7 s6 s3 s2 - 64bit + res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros); + // r3 r2 r1 r0 - 64bit + res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64); + // r1+r3 r2+r0 - 64bit + const __m128i sum_1x64 = + _mm_add_epi64(_mm256_castsi256_si128(res0_4x64), + _mm256_extracti128_si256(res0_4x64, 1)); + xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8))); + return sum; +} + +// Compute mse of two 
consecutive 8x8 blocks. +// In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially. +// Hence src_blk_stride is same as block width. Whereas dst buffer is a frame +// buffer, thus dstride is a frame level stride. +uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int src_blk_stride, int h) { + uint64_t sum = 0; + __m128i dst0_16x8, dst1_16x8; + __m256i dst0_16x16, dst1_16x16; + __m256i res0_4x64, res1_4x64; + __m256i sub_result_0, sub_result_1; + const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128()); + __m256i square_result = zeros; + uint16_t *src_temp = src; + + for (int i = 0; i < h; i += 2) { + dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride])); + dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride])); + + // row0 of 1st and 2nd 8x8 block - d00 d10 + dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8); + // row1 of 1st and 2nd 8x8 block - d01 d11 + dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8); + + // 2 rows of 1st 8x8 block - r00 r01 + __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0])); + // 2 rows of 2nd 8x8 block - r10 r11 + __m256i src1_16x16 = + _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride])); + // r00 r10 - 128bit + __m256i tmp0_16x16 = + _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x20); + // r01 r11 - 128bit + __m256i tmp1_16x16 = + _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x31); + + // r15 r14 r13------------r1 r0 - 16 bit + sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(tmp0_16x16, dst0_16x16)); + sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(tmp1_16x16, dst1_16x16)); + + // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit each + src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0); + src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1); + + // accumulation of result + src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16); + square_result = _mm256_add_epi32(square_result, src0_16x16); + src_temp += 16; + } + + // s5 s4 s1 s0 - 64bit + res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros); + // s7 s6 s3 s2 - 64bit + res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros); + // r3 r2 r1 r0 - 64bit + res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64); + // r1+r3 r2+r0 - 64bit + const __m128i sum_1x64 = + _mm_add_epi64(_mm256_castsi256_si128(res0_4x64), + _mm256_extracti128_si256(res0_4x64, 1)); + xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8))); + return sum; +} + +uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int w, int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4) && + "w=8/4 and h=8/4 must be satisfied"); + switch (w) { + case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); + case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); + default: assert(0 && "unsupported width"); return -1; + } +} + +// Computes mse of two 8x8 or four 4x4 consecutive blocks. Luma plane uses 8x8 +// block and Chroma uses 4x4 block. In src buffer, each block in a filter block +// is stored sequentially. Hence src_blk_stride is same as block width. Whereas +// dst buffer is a frame buffer, thus dstride is a frame level stride. 
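+// The wrapper below dispatches on the block width: w == 4 selects the
+// four-block 4x4 kernel and w == 8 the two-block 8x8 kernel, in both cases
+// passing src_blk_stride = w * h, the element count of one block, since the
+// blocks are stored back to back in the src buffer.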
+uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int w, int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4) && + "w=8/4 and h=8/4 must be satisfied"); + switch (w) { + case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); + case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); + default: assert(0 && "unsupported width"); return -1; + } +} + +static INLINE void calc_sum_sse_wd32_avx2(const uint8_t *src, + const uint8_t *ref, + __m256i set_one_minusone, + __m256i sse_8x16[2], + __m256i sum_8x16[2]) { + const __m256i s00_256 = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r00_256 = _mm256_loadu_si256((__m256i const *)(ref)); + + const __m256i u_low_256 = _mm256_unpacklo_epi8(s00_256, r00_256); + const __m256i u_high_256 = _mm256_unpackhi_epi8(s00_256, r00_256); + + const __m256i diff0 = _mm256_maddubs_epi16(u_low_256, set_one_minusone); + const __m256i diff1 = _mm256_maddubs_epi16(u_high_256, set_one_minusone); + + sse_8x16[0] = _mm256_add_epi32(sse_8x16[0], _mm256_madd_epi16(diff0, diff0)); + sse_8x16[1] = _mm256_add_epi32(sse_8x16[1], _mm256_madd_epi16(diff1, diff1)); + sum_8x16[0] = _mm256_add_epi16(sum_8x16[0], diff0); + sum_8x16[1] = _mm256_add_epi16(sum_8x16[1], diff1); +} + +static INLINE __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16, + unsigned int *tot_sse, int *tot_sum) { + // s00 s01 s10 s11 s20 s21 s30 s31 + const __m256i sse_results = _mm256_hadd_epi32(sse_hx16[0], sse_hx16[1]); + // d00 d01 d02 d03 | d10 d11 d12 d13 | d20 d21 d22 d23 | d30 d31 d32 d33 + const __m256i sum_result_r0 = _mm256_hadd_epi16(sum_hx16[0], sum_hx16[1]); + // d00 d01 d10 d11 | d00 d02 d10 d11 | d20 d21 d30 d31 | d20 d21 d30 d31 + const __m256i sum_result_1 = _mm256_hadd_epi16(sum_result_r0, sum_result_r0); + // d00 d01 d10 d11 d20 d21 d30 d31 | X + const __m256i sum_result_3 = _mm256_permute4x64_epi64(sum_result_1, 0x08); + // d00 d01 d10 d11 d20 d21 d30 d31 + const __m256i sum_results = + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum_result_3)); + + // Add sum & sse registers appropriately to get total sum & sse separately. + // s0 s1 d0 d1 s2 s3 d2 d3 + const __m256i sum_sse_add = _mm256_hadd_epi32(sse_results, sum_results); + // s0 s1 s2 s3 d0 d1 d2 d3 + const __m256i sum_sse_order_add = _mm256_permute4x64_epi64(sum_sse_add, 0xd8); + // s0+s1 s2+s3 s0+s1 s2+s3 d0+d1 d2+d3 d0+d1 d2+d3 + const __m256i sum_sse_order_add_1 = + _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add); + // s0 x x x | d0 x x x + const __m256i sum_sse_order_add_final = + _mm256_hadd_epi32(sum_sse_order_add_1, sum_sse_order_add_1); + // s0 + const uint32_t first_value = + (uint32_t)_mm256_extract_epi32(sum_sse_order_add_final, 0); + *tot_sse += first_value; + // d0 + const int second_value = _mm256_extract_epi32(sum_sse_order_add_final, 4); + *tot_sum += second_value; + return sum_sse_order_add; +} + +static INLINE void get_var_sse_sum_8x8_quad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, + const int ref_stride, const int h, uint32_t *sse8x8, int *sum8x8, + unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) { + assert(h <= 128); // May overflow for larger height. + __m256i sse_8x16[2], sum_8x16[2]; + sum_8x16[0] = _mm256_setzero_si256(); + sse_8x16[0] = _mm256_setzero_si256(); + sum_8x16[1] = sum_8x16[0]; + sse_8x16[1] = sse_8x16[0]; + const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01); + + for (int i = 0; i < h; i++) { + // Process 8x32 block of one row. 
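+    // (calc_sum_sse_wd32_avx2 computes src - ref for 32 pixels, i.e. one row
+    // across the four adjacent 8-wide blocks, accumulating the signed
+    // differences in sum_8x16 and their squares in sse_8x16.)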
+ calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_8x16, sum_8x16); + src += src_stride; + ref += ref_stride; + } + + const __m256i sum_sse_order_add = + calc_sum_sse_order(sse_8x16, sum_8x16, tot_sse, tot_sum); + + // s0 s1 s2 s3 + _mm_storeu_si128((__m128i *)sse8x8, + _mm256_castsi256_si128(sum_sse_order_add)); + // d0 d1 d2 d3 + const __m128i sum_temp8x8 = _mm256_extractf128_si256(sum_sse_order_add, 1); + _mm_storeu_si128((__m128i *)sum8x8, sum_temp8x8); + + // (d0xd0 >> 6)=f0 (d1xd1 >> 6)=f1 (d2xd2 >> 6)=f2 (d3xd3 >> 6)=f3 + const __m128i mull_results = + _mm_srli_epi32(_mm_mullo_epi32(sum_temp8x8, sum_temp8x8), 6); + // s0-f0=v0 s1-f1=v1 s2-f2=v2 s3-f3=v3 + const __m128i variance_8x8 = + _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add), mull_results); + // v0 v1 v2 v3 + _mm_storeu_si128((__m128i *)var8x8, variance_8x8); +} + +static INLINE void get_var_sse_sum_16x16_dual_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, + const int ref_stride, const int h, uint32_t *sse16x16, + unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) { + assert(h <= 128); // May overflow for larger height. + __m256i sse_16x16[2], sum_16x16[2]; + sum_16x16[0] = _mm256_setzero_si256(); + sse_16x16[0] = _mm256_setzero_si256(); + sum_16x16[1] = sum_16x16[0]; + sse_16x16[1] = sse_16x16[0]; + const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01); + + for (int i = 0; i < h; i++) { + // Process 16x32 block of one row. + calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_16x16, sum_16x16); + src += src_stride; + ref += ref_stride; + } + + const __m256i sum_sse_order_add = + calc_sum_sse_order(sse_16x16, sum_16x16, tot_sse, tot_sum); + + const __m256i sum_sse_order_add_1 = + _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add); + + // s0+s1 s2+s3 x x + _mm_storel_epi64((__m128i *)sse16x16, + _mm256_castsi256_si128(sum_sse_order_add_1)); + + // d0+d1 d2+d3 x x + const __m128i sum_temp16x16 = + _mm256_extractf128_si256(sum_sse_order_add_1, 1); + + // (d0xd0 >> 6)=f0 (d1xd1 >> 6)=f1 (d2xd2 >> 6)=f2 (d3xd3 >> 6)=f3 + const __m128i mull_results = + _mm_srli_epi32(_mm_mullo_epi32(sum_temp16x16, sum_temp16x16), 8); + + // s0-f0=v0 s1-f1=v1 s2-f2=v2 s3-f3=v3 + const __m128i variance_16x16 = + _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add_1), mull_results); + + // v0 v1 v2 v3 + _mm_storel_epi64((__m128i *)var16x16, variance_16x16); +} + +void aom_get_var_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse8x8, int *sum8x8, + unsigned int *tot_sse, int *tot_sum, + uint32_t *var8x8) { + get_var_sse_sum_8x8_quad_avx2(src_ptr, source_stride, ref_ptr, ref_stride, 8, + sse8x8, sum8x8, tot_sse, tot_sum, var8x8); +} + +void aom_get_var_sse_sum_16x16_dual_avx2(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse16x16, + unsigned int *tot_sse, int *tot_sum, + uint32_t *var16x16) { + get_var_sse_sum_16x16_dual_avx2(src_ptr, source_stride, ref_ptr, ref_stride, + 16, sse16x16, tot_sse, tot_sum, var16x16); +} diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c new file mode 100644 index 0000000000..9e9e70ea01 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c @@ -0,0 +1,924 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // AVX2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" + +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; +/* clang-format on */ + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define MERGE_WITH_SRC(src_reg, reg) \ + exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ + exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); + +#define LOAD_SRC_DST \ + /* load source and destination */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); + +#define AVG_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* caculate sum */ \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, 
res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + +// Functions related to sub pixel variance width 16 +#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ + /* load source and destination of 2 rows and insert*/ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define LOAD_SRC_NEXT_BYTE_INSERT \ + /* load source and another source from next row */ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + /* load source and next row source from 1 byte onwards */ \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \ + _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1); + +#define LOAD_DST_INSERT \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define LOAD_SRC_MERGE_128BIT(filter) \ + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ + __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \ + __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \ + __m128i filter_128bit = _mm256_castsi256_si128(filter); \ + __m128i pw8_128bit = _mm256_castsi256_si128(pw8); + +#define FILTER_SRC_128BIT(filter) \ + /* filter the source */ \ + src_lo = _mm_maddubs_epi16(src_lo, filter); \ + src_hi = _mm_maddubs_epi16(src_hi, filter); \ + \ + /* add 8 to source */ \ + src_lo = _mm_add_epi16(src_lo, pw8_128bit); \ + src_hi = _mm_add_epi16(src_hi, pw8_128bit); \ + \ + /* divide source by 16 */ \ + src_lo = _mm_srai_epi16(src_lo, 4); \ + src_hi = _mm_srai_epi16(src_hi, 4); + +// TODO(chiyotsai@google.com): These variance functions are macro-fied so we +// don't have to manually optimize the individual for-loops. 
We could save some +// binary size by optimizing the loops more carefully without duplicating the +// codes with a macro. +#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height) \ + static AOM_INLINE int aom_sub_pixel_variance32x##height##_imp_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse) { \ + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \ + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \ + __m256i zero_reg; \ + int i, sum; \ + sum_reg = _mm256_setzero_si256(); \ + sse_reg = _mm256_setzero_si256(); \ + zero_reg = _mm256_setzero_si256(); \ + \ + /* x_offset = 0 and y_offset = 0 */ \ + if (x_offset == 0) { \ + if (y_offset == 0) { \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + /* expend each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + /* x_offset = 0 and y_offset = 4 */ \ + } else if (y_offset == 4) { \ + __m256i src_next_reg; \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + AVG_NEXT_SRC(src_reg, src_stride) \ + /* expend each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + /* x_offset = 0 and y_offset = bilin interpolation */ \ + } else { \ + __m256i filter, pw8, src_next_reg; \ + \ + y_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + MERGE_NEXT_SRC(src_reg, src_stride) \ + FILTER_SRC(filter) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + } \ + /* x_offset = 4 and y_offset = 0 */ \ + } else if (x_offset == 4) { \ + if (y_offset == 0) { \ + __m256i src_next_reg; \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + AVG_NEXT_SRC(src_reg, 1) \ + /* expand each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + /* x_offset = 4 and y_offset = 4 */ \ + } else if (y_offset == 4) { \ + __m256i src_next_reg, src_avg; \ + /* load source and another source starting from the next */ \ + /* following byte */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + AVG_NEXT_SRC(src_reg, 1) \ + for (i = 0; i < height; i++) { \ + src_avg = src_reg; \ + src += src_stride; \ + LOAD_SRC_DST \ + AVG_NEXT_SRC(src_reg, 1) \ + /* average between previous average to current average */ \ + src_avg = _mm256_avg_epu8(src_avg, src_reg); \ + /* expand each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_avg, zero_reg) \ + /* save current source average */ \ + CALC_SUM_SSE_INSIDE_LOOP \ + dst += dst_stride; \ + } \ + /* x_offset = 4 and y_offset = bilin interpolation */ \ + } else { \ + __m256i filter, pw8, src_next_reg, src_avg; \ + y_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + /* load source and another source starting from the next */ \ + /* following byte */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + AVG_NEXT_SRC(src_reg, 1) \ + for (i = 0; i < height; i++) { \ + /* save current source average */ \ + src_avg = src_reg; \ + src += src_stride; \ + LOAD_SRC_DST \ + AVG_NEXT_SRC(src_reg, 1) \ + MERGE_WITH_SRC(src_avg, src_reg) \ + FILTER_SRC(filter) \ + CALC_SUM_SSE_INSIDE_LOOP \ + dst += 
dst_stride; \ + } \ + } \ + /* x_offset = bilin interpolation and y_offset = 0 */ \ + } else { \ + if (y_offset == 0) { \ + __m256i filter, pw8, src_next_reg; \ + x_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + MERGE_NEXT_SRC(src_reg, 1) \ + FILTER_SRC(filter) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + /* x_offset = bilin interpolation and y_offset = 4 */ \ + } else if (y_offset == 4) { \ + __m256i filter, pw8, src_next_reg, src_pack; \ + x_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + MERGE_NEXT_SRC(src_reg, 1) \ + FILTER_SRC(filter) \ + /* convert each 16 bit to 8 bit to each low and high lane source */ \ + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + for (i = 0; i < height; i++) { \ + src += src_stride; \ + LOAD_SRC_DST \ + MERGE_NEXT_SRC(src_reg, 1) \ + FILTER_SRC(filter) \ + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + /* average between previous pack to the current */ \ + src_pack = _mm256_avg_epu8(src_pack, src_reg); \ + MERGE_WITH_SRC(src_pack, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src_pack = src_reg; \ + dst += dst_stride; \ + } \ + /* x_offset = bilin interpolation and y_offset = bilin interpolation \ + */ \ + } else { \ + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \ + x_offset <<= 5; \ + xfilter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ + y_offset <<= 5; \ + yfilter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + /* load source and another source starting from the next */ \ + /* following byte */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + MERGE_NEXT_SRC(src_reg, 1) \ + \ + FILTER_SRC(xfilter) \ + /* convert each 16 bit to 8 bit to each low and high lane source */ \ + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + for (i = 0; i < height; i++) { \ + src += src_stride; \ + LOAD_SRC_DST \ + MERGE_NEXT_SRC(src_reg, 1) \ + FILTER_SRC(xfilter) \ + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + /* merge previous pack to current pack source */ \ + MERGE_WITH_SRC(src_pack, src_reg) \ + /* filter the source */ \ + FILTER_SRC(yfilter) \ + src_pack = src_reg; \ + CALC_SUM_SSE_INSIDE_LOOP \ + dst += dst_stride; \ + } \ + } \ + } \ + CALC_SUM_AND_SSE \ + _mm256_zeroupper(); \ + return sum; \ + } \ + unsigned int aom_sub_pixel_variance32x##height##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse) { \ + const int sum = aom_sub_pixel_variance32x##height##_imp_avx2( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \ + } + +MAKE_SUB_PIXEL_VAR_32XH(64, 6) +MAKE_SUB_PIXEL_VAR_32XH(32, 5) +MAKE_SUB_PIXEL_VAR_32XH(16, 4) + +#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t 
*dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##x##hf##_imp_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +// Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height. +AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 64, 7, 7) +AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 64, 7, 6) +AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7) +AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6) +AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5) + +#define MAKE_SUB_PIXEL_VAR_16XH(height, log2height) \ + unsigned int aom_sub_pixel_variance16x##height##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse) { \ + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \ + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \ + __m256i zero_reg; \ + int i, sum; \ + sum_reg = _mm256_setzero_si256(); \ + sse_reg = _mm256_setzero_si256(); \ + zero_reg = _mm256_setzero_si256(); \ + \ + /* x_offset = 0 and y_offset = 0 */ \ + if (x_offset == 0) { \ + if (y_offset == 0) { \ + for (i = 0; i < height; i += 2) { \ + LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ + /* expend each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += (src_stride << 1); \ + dst += (dst_stride << 1); \ + } \ + /* x_offset = 0 and y_offset = 4 */ \ + } else if (y_offset == 4) { \ + __m256i src_next_reg; \ + for (i = 0; i < height; i += 2) { \ + LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ + AVG_NEXT_SRC_INSERT(src_reg, src_stride) \ + /* expend each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += (src_stride << 1); \ + dst += (dst_stride << 1); \ + } \ + /* x_offset = 0 and y_offset = bilin interpolation */ \ + } else { \ + __m256i filter, pw8, src_next_reg; \ + y_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + for (i = 0; i < height; i += 2) { \ + LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ + MERGE_NEXT_SRC_INSERT(src_reg, src_stride) \ + FILTER_SRC(filter) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += (src_stride << 1); \ + dst += (dst_stride << 1); \ + } \ + } \ + /* x_offset = 4 and y_offset = 0 */ \ + } else if (x_offset == 4) { \ + if (y_offset == 0) { \ + __m256i src_next_reg; \ + for (i = 0; i < height; i += 2) { \ + LOAD_SRC_NEXT_BYTE_INSERT \ + LOAD_DST_INSERT \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); \ + /* expand each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += (src_stride << 1); \ + dst += (dst_stride << 1); \ + } \ + /* x_offset = 4 and y_offset = 4 */ \ + } else if (y_offset == 4) { \ + __m256i src_next_reg, src_avg, src_temp; \ + /* load and insert source and next row source */ \ + LOAD_SRC_NEXT_BYTE_INSERT \ + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \ + src += src_stride << 1; \ + for (i = 0; i < height - 2; i += 2) { \ + LOAD_SRC_NEXT_BYTE_INSERT \ + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \ + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 
0x21); \ + src_temp = _mm256_avg_epu8(src_avg, src_temp); \ + LOAD_DST_INSERT \ + /* expand each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_temp, zero_reg) \ + /* save current source average */ \ + src_avg = src_next_reg; \ + CALC_SUM_SSE_INSIDE_LOOP \ + dst += dst_stride << 1; \ + src += src_stride << 1; \ + } \ + /* last 2 rows processing happens here */ \ + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \ + src_next_reg = _mm256_permute2x128_si256( \ + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \ + LOAD_DST_INSERT \ + src_avg = _mm256_avg_epu8(src_avg, src_next_reg); \ + MERGE_WITH_SRC(src_avg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + } else { \ + /* x_offset = 4 and y_offset = bilin interpolation */ \ + __m256i filter, pw8, src_next_reg, src_avg, src_temp; \ + y_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + /* load and insert source and next row source */ \ + LOAD_SRC_NEXT_BYTE_INSERT \ + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \ + src += src_stride << 1; \ + for (i = 0; i < height - 2; i += 2) { \ + LOAD_SRC_NEXT_BYTE_INSERT \ + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \ + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \ + LOAD_DST_INSERT \ + MERGE_WITH_SRC(src_avg, src_temp) \ + /* save current source average */ \ + src_avg = src_next_reg; \ + FILTER_SRC(filter) \ + CALC_SUM_SSE_INSIDE_LOOP \ + dst += dst_stride << 1; \ + src += src_stride << 1; \ + } \ + /* last 2 rows processing happens here */ \ + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \ + src_next_reg = _mm256_permute2x128_si256( \ + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \ + LOAD_DST_INSERT \ + MERGE_WITH_SRC(src_avg, src_next_reg) \ + FILTER_SRC(filter) \ + CALC_SUM_SSE_INSIDE_LOOP \ + } \ + /* x_offset = bilin interpolation and y_offset = 0 */ \ + } else { \ + if (y_offset == 0) { \ + __m256i filter, pw8, src_next_reg; \ + x_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + for (i = 0; i < height; i += 2) { \ + LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ + MERGE_NEXT_SRC_INSERT(src_reg, 1) \ + FILTER_SRC(filter) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += (src_stride << 1); \ + dst += (dst_stride << 1); \ + } \ + /* x_offset = bilin interpolation and y_offset = 4 */ \ + } else if (y_offset == 4) { \ + __m256i filter, pw8, src_next_reg, src_pack; \ + x_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + /* load and insert source and next row source */ \ + LOAD_SRC_NEXT_BYTE_INSERT \ + MERGE_WITH_SRC(src_reg, src_next_reg) \ + FILTER_SRC(filter) \ + /* convert each 16 bit to 8 bit to each low and high lane source */ \ + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + src += src_stride << 1; \ + for (i = 0; i < height - 2; i += 2) { \ + LOAD_SRC_NEXT_BYTE_INSERT \ + LOAD_DST_INSERT \ + MERGE_WITH_SRC(src_reg, src_next_reg) \ + FILTER_SRC(filter) \ + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \ + /* average between previous pack to the current */ \ + src_pack 
= _mm256_avg_epu8(src_pack, src_next_reg); \ + MERGE_WITH_SRC(src_pack, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src_pack = src_reg; \ + src += src_stride << 1; \ + dst += dst_stride << 1; \ + } \ + /* last 2 rows processing happens here */ \ + LOAD_SRC_MERGE_128BIT(filter) \ + LOAD_DST_INSERT \ + FILTER_SRC_128BIT(filter_128bit) \ + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \ + src_next_reg = _mm256_permute2x128_si256( \ + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \ + /* average between previous pack to the current */ \ + src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \ + MERGE_WITH_SRC(src_pack, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + } else { \ + /* x_offset = bilin interpolation and y_offset = bilin interpolation \ + */ \ + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \ + x_offset <<= 5; \ + xfilter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ + y_offset <<= 5; \ + yfilter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + /* load and insert source and next row source */ \ + LOAD_SRC_NEXT_BYTE_INSERT \ + MERGE_WITH_SRC(src_reg, src_next_reg) \ + FILTER_SRC(xfilter) \ + /* convert each 16 bit to 8 bit to each low and high lane source */ \ + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + src += src_stride << 1; \ + for (i = 0; i < height - 2; i += 2) { \ + LOAD_SRC_NEXT_BYTE_INSERT \ + LOAD_DST_INSERT \ + MERGE_WITH_SRC(src_reg, src_next_reg) \ + FILTER_SRC(xfilter) \ + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \ + /* average between previous pack to the current */ \ + MERGE_WITH_SRC(src_pack, src_next_reg) \ + /* filter the source */ \ + FILTER_SRC(yfilter) \ + src_pack = src_reg; \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride << 1; \ + dst += dst_stride << 1; \ + } \ + /* last 2 rows processing happens here */ \ + LOAD_SRC_MERGE_128BIT(xfilter) \ + LOAD_DST_INSERT \ + FILTER_SRC_128BIT(filter_128bit) \ + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \ + src_next_reg = _mm256_permute2x128_si256( \ + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \ + MERGE_WITH_SRC(src_pack, src_next_reg) \ + FILTER_SRC(yfilter) \ + CALC_SUM_SSE_INSIDE_LOOP \ + } \ + } \ + CALC_SUM_AND_SSE \ + _mm256_zeroupper(); \ + return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height)); \ + } + +MAKE_SUB_PIXEL_VAR_16XH(32, 5) +MAKE_SUB_PIXEL_VAR_16XH(16, 4) +MAKE_SUB_PIXEL_VAR_16XH(8, 3) +#if !CONFIG_REALTIME_ONLY +MAKE_SUB_PIXEL_VAR_16XH(64, 6) +MAKE_SUB_PIXEL_VAR_16XH(4, 2) +#endif + +#define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \ + int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \ + unsigned int *sse) { \ + __m256i sec_reg; \ + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \ + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \ + __m256i zero_reg; \ + int i, sum; \ + sum_reg = _mm256_setzero_si256(); \ + sse_reg = _mm256_setzero_si256(); \ + zero_reg = _mm256_setzero_si256(); \ + \ + /* x_offset = 0 and y_offset = 0 */ \ + if (x_offset == 0) { \ + if (y_offset == 0) { \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ + src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ + sec += sec_stride; \ + /* 
expend each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + } else if (y_offset == 4) { \ + __m256i src_next_reg; \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + AVG_NEXT_SRC(src_reg, src_stride) \ + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ + src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ + sec += sec_stride; \ + /* expend each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + /* x_offset = 0 and y_offset = bilin interpolation */ \ + } else { \ + __m256i filter, pw8, src_next_reg; \ + \ + y_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + MERGE_NEXT_SRC(src_reg, src_stride) \ + FILTER_SRC(filter) \ + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ + src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ + sec += sec_stride; \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + } \ + /* x_offset = 4 and y_offset = 0 */ \ + } else if (x_offset == 4) { \ + if (y_offset == 0) { \ + __m256i src_next_reg; \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + AVG_NEXT_SRC(src_reg, 1) \ + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ + src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ + sec += sec_stride; \ + /* expand each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + /* x_offset = 4 and y_offset = 4 */ \ + } else if (y_offset == 4) { \ + __m256i src_next_reg, src_avg; \ + /* load source and another source starting from the next */ \ + /* following byte */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + AVG_NEXT_SRC(src_reg, 1) \ + for (i = 0; i < height; i++) { \ + /* save current source average */ \ + src_avg = src_reg; \ + src += src_stride; \ + LOAD_SRC_DST \ + AVG_NEXT_SRC(src_reg, 1) \ + /* average between previous average to current average */ \ + src_avg = _mm256_avg_epu8(src_avg, src_reg); \ + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ + src_avg = _mm256_avg_epu8(src_avg, sec_reg); \ + sec += sec_stride; \ + /* expand each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_avg, zero_reg) \ + CALC_SUM_SSE_INSIDE_LOOP \ + dst += dst_stride; \ + } \ + /* x_offset = 4 and y_offset = bilin interpolation */ \ + } else { \ + __m256i filter, pw8, src_next_reg, src_avg; \ + y_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + /* load source and another source starting from the next */ \ + /* following byte */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + AVG_NEXT_SRC(src_reg, 1) \ + for (i = 0; i < height; i++) { \ + /* save current source average */ \ + src_avg = src_reg; \ + src += src_stride; \ + LOAD_SRC_DST \ + AVG_NEXT_SRC(src_reg, 1) \ + MERGE_WITH_SRC(src_avg, src_reg) \ + FILTER_SRC(filter) \ + src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ + src_avg = _mm256_avg_epu8(src_avg, sec_reg); \ + /* expand each byte to 2 bytes */ \ + MERGE_WITH_SRC(src_avg, zero_reg) \ + sec += sec_stride; \ + CALC_SUM_SSE_INSIDE_LOOP \ + dst += 
dst_stride; \ + } \ + } \ + /* x_offset = bilin interpolation and y_offset = 0 */ \ + } else { \ + if (y_offset == 0) { \ + __m256i filter, pw8, src_next_reg; \ + x_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + for (i = 0; i < height; i++) { \ + LOAD_SRC_DST \ + MERGE_NEXT_SRC(src_reg, 1) \ + FILTER_SRC(filter) \ + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ + src_reg = _mm256_avg_epu8(src_reg, sec_reg); \ + MERGE_WITH_SRC(src_reg, zero_reg) \ + sec += sec_stride; \ + CALC_SUM_SSE_INSIDE_LOOP \ + src += src_stride; \ + dst += dst_stride; \ + } \ + /* x_offset = bilin interpolation and y_offset = 4 */ \ + } else if (y_offset == 4) { \ + __m256i filter, pw8, src_next_reg, src_pack; \ + x_offset <<= 5; \ + filter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + MERGE_NEXT_SRC(src_reg, 1) \ + FILTER_SRC(filter) \ + /* convert each 16 bit to 8 bit to each low and high lane source */ \ + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + for (i = 0; i < height; i++) { \ + src += src_stride; \ + LOAD_SRC_DST \ + MERGE_NEXT_SRC(src_reg, 1) \ + FILTER_SRC(filter) \ + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + /* average between previous pack to the current */ \ + src_pack = _mm256_avg_epu8(src_pack, src_reg); \ + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ + src_pack = _mm256_avg_epu8(src_pack, sec_reg); \ + sec += sec_stride; \ + MERGE_WITH_SRC(src_pack, zero_reg) \ + src_pack = src_reg; \ + CALC_SUM_SSE_INSIDE_LOOP \ + dst += dst_stride; \ + } \ + /* x_offset = bilin interpolation and y_offset = bilin interpolation \ + */ \ + } else { \ + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \ + x_offset <<= 5; \ + xfilter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + x_offset)); \ + y_offset <<= 5; \ + yfilter = _mm256_load_si256( \ + (__m256i const *)(bilinear_filters_avx2 + y_offset)); \ + pw8 = _mm256_set1_epi16(8); \ + /* load source and another source starting from the next */ \ + /* following byte */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + MERGE_NEXT_SRC(src_reg, 1) \ + \ + FILTER_SRC(xfilter) \ + /* convert each 16 bit to 8 bit to each low and high lane source */ \ + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + for (i = 0; i < height; i++) { \ + src += src_stride; \ + LOAD_SRC_DST \ + MERGE_NEXT_SRC(src_reg, 1) \ + FILTER_SRC(xfilter) \ + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + /* merge previous pack to current pack source */ \ + MERGE_WITH_SRC(src_pack, src_reg) \ + /* filter the source */ \ + FILTER_SRC(yfilter) \ + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \ + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \ + src_pack = _mm256_avg_epu8(src_pack, sec_reg); \ + MERGE_WITH_SRC(src_pack, zero_reg) \ + src_pack = src_reg; \ + sec += sec_stride; \ + CALC_SUM_SSE_INSIDE_LOOP \ + dst += dst_stride; \ + } \ + } \ + } \ + CALC_SUM_AND_SSE \ + _mm256_zeroupper(); \ + return sum; \ + } \ + unsigned int aom_sub_pixel_avg_variance32x##height##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse, \ + const uint8_t *sec_ptr) { \ + const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + src, 
src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \ + sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \ + } + +MAKE_SUB_PIXEL_AVG_VAR_32XH(64, 6) +MAKE_SUB_PIXEL_AVG_VAR_32XH(32, 5) +MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4) + +#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +// Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height. +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 64, 7, 7) +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 64, 7, 6) +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 64, 6, 7) +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 64, 6, 6) +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 32, 6, 5) diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c new file mode 100644 index 0000000000..699002195b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow + // in computation using _mm_maddubs_epi16. + // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. 
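+  // Illustrative worked case: with taps {128, 0} the halved taps are {64, 0},
+  // so b[i] = (a[i] * 64 + 32) >> (FILTER_BITS - 1) = a[i] (FILTER_BITS is 7
+  // for these bilinear filters), i.e. the copy-only case stays exact.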
+ const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; + const __m128i r = _mm_set1_epi16(round); + const int8_t f0 = (int8_t)(filter[0] >> 1); + const int8_t f1 = (int8_t)(filter[1] >> 1); + const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, + f0, f1, f0, f1, f0, f1); + unsigned int i, j; + (void)pixel_step; + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + // load source + __m128i source_low = xx_loadl_64(a); + __m128i source_hi = xx_loadl_64(a + 1); + + // unpack to: + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source = _mm_unpacklo_epi8(source_low, source_hi); + + // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] + __m128i res = _mm_maddubs_epi16(source, filters); + + // round + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storeu_128(b, res); + + a += 8; + b += 8; + } + + a += src_pixels_per_line - output_width; + } + } else { + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + for (i = 0; i < output_height; ++i) { + // load source, only first 5 values are meaningful: + // { a[0], a[1], a[2], a[3], a[4], xxxx } + __m128i source = xx_loadl_64(a); + + // shuffle, up to the first 8 are useful + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + __m128i res = _mm_maddubs_epi16(source_shuffle, filters); + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storel_64(b, res); + + a += src_pixels_per_line; + b += output_width; + } + } +} + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + const int16_t round = (1 << FILTER_BITS) >> 1; + const __m128i r = _mm_set1_epi32(round); + const __m128i filters = + _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], + filter[1], filter[0], filter[1]); + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + const __m128i mask = + _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + // load source as: + // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } + __m128i source1 = xx_loadl_64(a); + __m128i source2 = xx_loadl_64(a + pixel_step); + __m128i source = _mm_unpacklo_epi64(source1, source2); + + // shuffle source to: + // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + // b[i] = a[i] * filter[0] + a[w + i] * filter[1] + __m128i res = _mm_madd_epi16(source_shuffle, filters); + + // round + res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); + + // shuffle to get each lower 8 bit of every 32 bit + res = _mm_shuffle_epi8(res, mask); + + xx_storel_32(b, res); + + a += 4; + b += 4; + } + + a += src_pixels_per_line - output_width; + } +} diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c new file mode 100644 index 0000000000..faec9cf73d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" + +unsigned int aom_get_mb_ss_sse2(const int16_t *src) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = xx_loadu_128(src); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src += 8; + } + + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return (unsigned int)_mm_cvtsi128_si32(vsum); +} + +static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { + const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride)); + return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128()); +} + +static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)p); + return _mm_unpacklo_epi8(p0, _mm_setzero_si128()); +} + +static INLINE void load16_8to16_sse2(const uint8_t *const p, __m128i *out) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)p); + out[0] = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); // lower 8 values + out[1] = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); // upper 8 values +} + +// Accumulate 4 32bit numbers in val to 1 32bit number +static INLINE unsigned int add32x4_sse2(__m128i val) { + val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); + val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); + return (unsigned int)_mm_cvtsi128_si32(val); +} + +// Accumulate 8 16bit in sum to 4 32bit number +static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { + const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); + const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); + return _mm_add_epi32(sum_lo, sum_hi); +} + +static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref, + __m128i *const sse, + __m128i *const sum) { + const __m128i diff = _mm_sub_epi16(src, ref); + *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); + *sum = _mm_add_epi16(*sum, diff); +} + +// Can handle 128 pixels' diff sum (such as 8x16 or 16x8) +// Slightly faster than variance_final_256_pel_sse2() +// diff sum of 128 pixels can still fit in 16bit integer +static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); +} + +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = 
_mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); +} + +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + *sum = (int)add32x4_sse2(vsum); +} + +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = sum_to_32bit_sse2(vsum); + *sum = (int)add32x4_sse2(vsum); +} + +static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 256); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src, src_stride); + const __m128i r = load4x2_sse2(ref, ref_stride); + + variance_kernel_sse2(s, r, sse, sum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 128); // May overflow for larger height. + *sum = _mm_setzero_si128(); + *sse = _mm_setzero_si128(); + for (int i = 0; i < h; i++) { + const __m128i s = load8_8to16_sse2(src); + const __m128i r = load8_8to16_sse2(ref); + + variance_kernel_sse2(s, r, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance16_kernel_sse2(const uint8_t *const src, + const uint8_t *const ref, + __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); +} + +static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 64); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src, ref, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. 
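The per-width height limits asserted in these kernels all cap the block at 1024 pixels, which matches the safe range of the 16-bit signed sum accumulator: each of the eight int16 lanes of vsum collects w * h / 8 diffs of magnitude at most 255, and 255 * 1024 / 8 = 32640, just under INT16_MAX (32767). An illustrative sanity-check sketch of that bound (not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Illustrative: the worst-case per-lane sum of pixel diffs must fit in
 * int16_t, which holds whenever w * h <= 1024. */
static void check_sum16_bound(int w, int h) {
  assert((int64_t)255 * w * h / 8 <= INT16_MAX);
}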
+ *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 16); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + variance16_kernel_sse2(src + 32, ref + 32, sse, sum); + variance16_kernel_sse2(src + 48, ref + 48, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 8); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < 4; ++j) { + const int offset0 = j << 5; + const int offset1 = offset0 + 16; + variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum); + variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum); + } + src += src_stride; + ref += ref_stride; + } +} + +void aom_get_var_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse8x8, int *sum8x8, + unsigned int *tot_sse, int *tot_sum, + uint32_t *var8x8) { + // Loop over 4 8x8 blocks. Process one 8x32 block. + for (int k = 0; k < 4; k++) { + const uint8_t *src = src_ptr; + const uint8_t *ref = ref_ptr; + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + for (int i = 0; i < 8; i++) { + const __m128i s = load8_8to16_sse2(src + (k * 8)); + const __m128i r = load8_8to16_sse2(ref + (k * 8)); + const __m128i diff = _mm_sub_epi16(s, r); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff, diff)); + vsum = _mm_add_epi16(vsum, diff); + + src += src_stride; + ref += ref_stride; + } + variance_final_128_pel_sse2(vsse, vsum, &sse8x8[k], &sum8x8[k]); + } + + // Calculate variance at 8x8 level and total sse, sum of 8x32 block. + *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3]; + *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3]; + for (int i = 0; i < 4; i++) + var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6); +} + +void aom_get_var_sse_sum_16x16_dual_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse16x16, + unsigned int *tot_sse, int *tot_sum, + uint32_t *var16x16) { + int sum16x16[2] = { 0 }; + // Loop over 2 16x16 blocks. Process one 16x32 block. 
+ for (int k = 0; k < 2; k++) { + const uint8_t *src = src_ptr; + const uint8_t *ref = ref_ptr; + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + for (int i = 0; i < 16; i++) { + __m128i s[2]; + __m128i r[2]; + load16_8to16_sse2(src + (k * 16), s); + load16_8to16_sse2(ref + (k * 16), r); + const __m128i diff0 = _mm_sub_epi16(s[0], r[0]); + const __m128i diff1 = _mm_sub_epi16(s[1], r[1]); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + vsum = _mm_add_epi16(vsum, _mm_add_epi16(diff0, diff1)); + src += src_stride; + ref += ref_stride; + } + variance_final_256_pel_sse2(vsse, vsum, &sse16x16[k], &sum16x16[k]); + } + + // Calculate variance at 16x16 level and total sse, sum of 16x32 block. + *tot_sse += sse16x16[0] + sse16x16[1]; + *tot_sum += sum16x16[0] + sum16x16[1]; + for (int i = 0; i < 2; i++) + var16x16[i] = + sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8); +} + +#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum; \ + int sum = 0; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128) +AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128) +AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128) + +AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128) +AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128) +AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128) + +AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128) +AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256) +AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512) + +AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256) +AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512) +AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024) + +#if !CONFIG_REALTIME_ONLY +AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128) +AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256) +AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024) +#endif + +#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum = _mm_setzero_si128(); \ + for (int i = 0; i < (bh / uh); ++i) { \ + __m128i vsum16; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \ + src += (src_stride * uh); \ + ref += (ref_stride * uh); \ + } \ + *sse = add32x4_sse2(vsse); \ + int sum = (int)add32x4_sse2(vsum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_LOOP_SSE2(32, 64, 11, 32) // 32x32 * ( 64/32 ) + +AOM_VAR_LOOP_SSE2(64, 32, 11, 16) // 64x16 * ( 32/16 ) +AOM_VAR_LOOP_SSE2(64, 64, 12, 16) // 64x16 * ( 64/16 ) +AOM_VAR_LOOP_SSE2(64, 128, 13, 16) // 64x16 * ( 128/16 ) + +AOM_VAR_LOOP_SSE2(128, 64, 13, 8) // 128x8 * ( 64/8 ) +AOM_VAR_LOOP_SSE2(128, 128, 14, 8) // 128x8 * ( 128/8 ) + +#if !CONFIG_REALTIME_ONLY +AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024) +#endif + +unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + 
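The aom_mseWxH_sse2 wrappers in this stretch return the raw SSE produced by the shared variance kernels (no normalization), whereas the aom_varianceWxH_sse2 functions above subtract the squared mean from the same quantities. A compact sketch of the relationship, assuming the pixel count N is a power of two so the division becomes a shift; the helper name is illustrative:

/* Illustrative relation between the two return values:
 *   aom_mseWxH    returns SSE itself
 *   aom_varianceWxH returns SSE - sum * sum / N, with N = w * h
 * e.g. for an 8x8 block N = 64, so log2_n = 6. */
static uint32_t variance_from_sse_sum(uint32_t sse, int sum, int log2_n) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> log2_n);
}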
+unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in subpel_variance.asm +#define DECL(w, opt) \ + int aom_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ + void *unused0, void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); +DECLS(ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#if !CONFIG_REALTIME_ONLY +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) +#else +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, 
(int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) +#endif + +FNS(sse2) +FNS(ssse3) + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int aom_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); +DECLS(ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#if !CONFIG_REALTIME_ONLY +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) +#else +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \ + FN(64, 128, 16, 6, 7, opt, 
(int64_t), (int64_t)) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) +#endif + +FNS(sse2) +FNS(ssse3) + +#undef FNS +#undef FN + +static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, + const __m128i s1, + const __m128i a) { + const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i a_inv = _mm_sub_epi16(alpha_max, a); + + const __m128i s_lo = _mm_unpacklo_epi16(s0, s1); + const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv); + const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo); + const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i s_hi = _mm_unpackhi_epi16(s0, s1); + const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv); + const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi); + const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i comp = _mm_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + const __m128i zero = _mm_setzero_si128(); + + if (width == 8) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 16) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8)); + + const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else { + do { + for (int x = 0; x < width; x += 32) { + for (int j = 0; j < 2; j++) { + const __m128i s0 = + _mm_loadu_si128((const __m128i *)(src0 + x + j * 16)); + const __m128i s2 = + _mm_loadu_si128((const __m128i *)(src0 + x + 8 + j * 16)); + const __m128i s1 = + _mm_loadu_si128((const __m128i *)(src1 + x + j * 16)); + const __m128i s3 = + _mm_loadu_si128((const __m128i *)(src1 + x + 8 + j * 16)); + + const __m128i m_8 = + _mm_loadu_si128((const __m128i *)(mask + x + j * 16)); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1); + } + comp_pred += 32; + } + src0 += stride0; + src1 += stride1; + mask += mask_stride; + i += 1; + } while (i < height); + } +} + +uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { + uint64_t sum = 0; + __m128i dst0_8x8, dst1_8x8, dst_16x8; + __m128i src0_16x4, src1_16x4, src_16x8; + __m128i res0_32x4, res0_64x2, res1_64x2; + __m128i sub_result_16x8; + const __m128i zeros = _mm_setzero_si128(); + __m128i square_result = _mm_setzero_si128(); + for (int i = 0; i < h; i += 2) { + dst0_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride])); + dst1_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride])); + dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros); + + src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride])); + src1_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride])); + src_16x8 = _mm_unpacklo_epi64(src0_16x4, src1_16x4); + + sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8); + + res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8); + + res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros); + res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros); + + square_result = + _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, 
res1_64x2)); + } + const __m128i sum_64x1 = + _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8)); + xx_storel_64(&sum, sum_64x1); + return sum; +} + +uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { + uint64_t sum = 0; + __m128i dst_8x8, dst_16x8; + __m128i src_16x8; + __m128i res0_32x4, res0_64x2, res1_64x2; + __m128i sub_result_16x8; + const __m128i zeros = _mm_setzero_si128(); + __m128i square_result = _mm_setzero_si128(); + + for (int i = 0; i < h; i++) { + dst_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride])); + dst_16x8 = _mm_unpacklo_epi8(dst_8x8, zeros); + + src_16x8 = _mm_loadu_si128((__m128i *)&src[i * sstride]); + + sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8); + + res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8); + + res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros); + res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros); + + square_result = + _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2)); + } + const __m128i sum_64x1 = + _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8)); + xx_storel_64(&sum, sum_64x1); + return sum; +} + +uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int w, int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4) && + "w=8/4 and h=8/4 must satisfy"); + switch (w) { + case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); + case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); + default: assert(0 && "unsupported width"); return -1; + } +} + +uint64_t aom_mse_16xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int w, int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4) && + "w=8/4 and h=8/4 must be satisfied"); + const int num_blks = 16 / w; + uint64_t sum = 0; + for (int i = 0; i < num_blks; i++) { + sum += aom_mse_wxh_16bit_sse2(dst, dstride, src, w, w, h); + dst += w; + src += (w * h); + } + return sum; +} diff --git a/third_party/aom/aom_mem/aom_mem.c b/third_party/aom/aom_mem/aom_mem.c new file mode 100644 index 0000000000..807ddcf05e --- /dev/null +++ b/third_party/aom/aom_mem/aom_mem.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_mem.h" +#include +#include +#include +#include "include/aom_mem_intrnl.h" +#include "aom/aom_integer.h" + +static size_t GetAllocationPaddingSize(size_t align) { + assert(align > 0); + assert(align < SIZE_MAX - ADDRESS_STORAGE_SIZE); + return align - 1 + ADDRESS_STORAGE_SIZE; +} + +// Returns 0 in case of overflow of nmemb * size. 
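The comment above describes the overflow guard defined next: an allocation request is rejected when nmemb * size plus the alignment padding could not be represented within the configured limit. A minimal standalone sketch of that check, assuming limit stands for AOM_MAX_ALLOCABLE_MEMORY (or SIZE_MAX when no cap is configured); the names are illustrative, not part of the patch:

#include <stdbool.h>
#include <stddef.h>

/* Illustrative: true when nmemb * size + padding would exceed limit. */
static bool alloc_would_overflow(size_t nmemb, size_t size, size_t padding,
                                 size_t limit) {
  if (nmemb == 0) return false; /* a zero-element request cannot overflow */
  return size > (limit - padding) / nmemb;
}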
+static int check_size_argument_overflow(size_t nmemb, size_t size, + size_t align) { + if (nmemb == 0) return 1; + const size_t alloc_padding = GetAllocationPaddingSize(align); +#if defined(AOM_MAX_ALLOCABLE_MEMORY) + assert(AOM_MAX_ALLOCABLE_MEMORY >= alloc_padding); + assert(AOM_MAX_ALLOCABLE_MEMORY <= SIZE_MAX); + if (size > (AOM_MAX_ALLOCABLE_MEMORY - alloc_padding) / nmemb) return 0; +#else + if (size > (SIZE_MAX - alloc_padding) / nmemb) return 0; +#endif + return 1; +} + +static size_t *GetMallocAddressLocation(void *const mem) { + return ((size_t *)mem) - 1; +} + +static void SetActualMallocAddress(void *const mem, + const void *const malloc_addr) { + size_t *const malloc_addr_location = GetMallocAddressLocation(mem); + *malloc_addr_location = (size_t)malloc_addr; +} + +static void *GetActualMallocAddress(void *const mem) { + const size_t *const malloc_addr_location = GetMallocAddressLocation(mem); + return (void *)(*malloc_addr_location); +} + +void *aom_memalign(size_t align, size_t size) { + void *x = NULL; + if (!check_size_argument_overflow(1, size, align)) return NULL; + const size_t aligned_size = size + GetAllocationPaddingSize(align); + void *const addr = malloc(aligned_size); + if (addr) { + x = aom_align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align); + SetActualMallocAddress(x, addr); + } + return x; +} + +void *aom_malloc(size_t size) { return aom_memalign(DEFAULT_ALIGNMENT, size); } + +void *aom_calloc(size_t num, size_t size) { + if (!check_size_argument_overflow(num, size, DEFAULT_ALIGNMENT)) return NULL; + const size_t total_size = num * size; + void *const x = aom_malloc(total_size); + if (x) memset(x, 0, total_size); + return x; +} + +void aom_free(void *memblk) { + if (memblk) { + void *addr = GetActualMallocAddress(memblk); + free(addr); + } +} diff --git a/third_party/aom/aom_mem/aom_mem.cmake b/third_party/aom/aom_mem/aom_mem.cmake new file mode 100644 index 0000000000..346588d2db --- /dev/null +++ b/third_party/aom/aom_mem/aom_mem.cmake @@ -0,0 +1,29 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_AOM_MEM_AOM_MEM_CMAKE_) + return() +endif() # AOM_AOM_MEM_AOM_MEM_CMAKE_ +set(AOM_AOM_MEM_AOM_MEM_CMAKE_ 1) + +list(APPEND AOM_MEM_SOURCES "${AOM_ROOT}/aom_mem/aom_mem.c" + "${AOM_ROOT}/aom_mem/aom_mem.h" + "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h") + +# Creates the aom_mem build target and makes libaom depend on it. The libaom +# target must exist before this function is called. +function(setup_aom_mem_targets) + add_library(aom_mem OBJECT ${AOM_MEM_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_mem PARENT_SCOPE) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() +endfunction() diff --git a/third_party/aom/aom_mem/aom_mem.h b/third_party/aom/aom_mem/aom_mem.h new file mode 100644 index 0000000000..ca4af7fc61 --- /dev/null +++ b/third_party/aom/aom_mem/aom_mem.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_MEM_AOM_MEM_H_ +#define AOM_AOM_MEM_AOM_MEM_H_ + +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +#if defined(__uClinux__) +#include +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef AOM_MAX_ALLOCABLE_MEMORY +#if SIZE_MAX > (1ULL << 32) +#define AOM_MAX_ALLOCABLE_MEMORY 8589934592 // 8 GB +#else +// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings. +#define AOM_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16)) +#endif +#endif + +void *aom_memalign(size_t align, size_t size); +void *aom_malloc(size_t size); +void *aom_calloc(size_t num, size_t size); +void aom_free(void *memblk); + +static INLINE void *aom_memset16(void *dest, int val, size_t length) { + size_t i; + uint16_t *dest16 = (uint16_t *)dest; + for (i = 0; i < length; i++) *dest16++ = val; + return dest; +} + +/*returns an addr aligned to the byte boundary specified by align*/ +#define aom_align_addr(addr, align) \ + (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1)) + +#include + +#ifdef AOM_MEM_PLTFRM +#include AOM_MEM_PLTFRM +#endif + +#if CONFIG_DEBUG +#define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \ + do { \ + lval = (expr); \ + if (!lval) \ + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval " at %s:%d", __FILE__, \ + __LINE__); \ + } while (0) +#else +#define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \ + do { \ + lval = (expr); \ + if (!lval) \ + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval); \ + } while (0) +#endif + +#if defined(__cplusplus) +} +#endif + +#endif // AOM_AOM_MEM_AOM_MEM_H_ diff --git a/third_party/aom/aom_mem/include/aom_mem_intrnl.h b/third_party/aom/aom_mem/include/aom_mem_intrnl.h new file mode 100644 index 0000000000..2c9819de92 --- /dev/null +++ b/third_party/aom/aom_mem/include/aom_mem_intrnl.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_ +#define AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_ + +#include "config/aom_config.h" + +#define ADDRESS_STORAGE_SIZE sizeof(size_t) + +#ifndef DEFAULT_ALIGNMENT +#if defined(VXWORKS) +/*default addr alignment to use in calls to aom_* functions other than + aom_memalign*/ +#define DEFAULT_ALIGNMENT 32 +#else +#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */ +#endif +#endif + +#endif // AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_ diff --git a/third_party/aom/aom_ports/aarch32_cpudetect.c b/third_party/aom/aom_ports/aarch32_cpudetect.c new file mode 100644 index 0000000000..753f957112 --- /dev/null +++ b/third_party/aom/aom_ports/aarch32_cpudetect.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +// Feature detection code for Armv7-A / AArch32. + +#include "arm_cpudetect.h" + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + // MSVC has no inline __asm support for Arm, but it does let you __emit + // instructions via their assembled hex code. + // All of these instructions should be essentially nops. + __try { + // VORR q0,q0,q0 + __emit(0xF2200150); + flags |= HAS_NEON; + } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { + // Ignore exception. + } +#endif // HAVE_NEON + return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + uint64_t features = android_getCpuFeatures(); + if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(__linux__) // end defined(AOM_USE_ANDROID_CPU_FEATURES) + +#include + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define AOM_AARCH32_HWCAP_NEON (1 << 12) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON + if (hwcap & AOM_AARCH32_HWCAP_NEON) flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} +#else // end __linux__ +#error \ + "Runtime CPU detection selected, but no CPU detection method " \ +"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0." 
+#endif + +int aom_arm_cpu_caps(void) { + int flags = 0; + if (arm_cpu_env_flags(&flags)) { + return flags; + } + return arm_get_cpu_caps() & arm_cpu_env_mask(); +} diff --git a/third_party/aom/aom_ports/aarch64_cpudetect.c b/third_party/aom/aom_ports/aarch64_cpudetect.c new file mode 100644 index 0000000000..43d5a149c8 --- /dev/null +++ b/third_party/aom/aom_ports/aarch64_cpudetect.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "arm_cpudetect.h" + +#if defined(__APPLE__) +#include +#endif + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT + +// sysctlbyname() parameter documentation for instruction set characteristics: +// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics +static INLINE bool have_feature(const char *feature) { + int64_t feature_present = 0; + size_t size = sizeof(feature_present); + if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { + return false; + } + return feature_present; +} + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON +#if HAVE_ARM_CRC32 + if (have_feature("hw.optional.armv8_crc32")) flags |= HAS_ARM_CRC32; +#endif // HAVE_ARM_CRC32 +#if HAVE_NEON_DOTPROD + if (have_feature("hw.optional.arm.FEAT_DotProd")) flags |= HAS_NEON_DOTPROD; +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (have_feature("hw.optional.arm.FEAT_I8MM")) flags |= HAS_NEON_I8MM; +#endif // HAVE_NEON_I8MM + return flags; +} + +#elif defined(_WIN32) // end __APPLE__ + +static int arm_get_cpu_caps(void) { + int flags = 0; +// IsProcessorFeaturePresent() parameter documentation: +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_ARM_CRC32 + if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_ARM_CRC32; + } +#endif // HAVE_ARM_CRC32 +#if HAVE_NEON_DOTPROD +// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK +// 20348, supported by Windows 11 and Windows Server 2022. +#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_NEON_DOTPROD; + } +#endif // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_NEON_DOTPROD + // No I8MM or SVE feature detection available on Windows at time of writing. 
+ return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + return flags; +} + +#elif defined(__linux__) // end defined(AOM_USE_ANDROID_CPU_FEATURES) + +#include + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define AOM_AARCH64_HWCAP_CRC32 (1 << 7) +#define AOM_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define AOM_AARCH64_HWCAP_SVE (1 << 22) +#define AOM_AARCH64_HWCAP2_I8MM (1 << 13) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_ARM_CRC32 + if (hwcap & AOM_AARCH64_HWCAP_CRC32) flags |= HAS_ARM_CRC32; +#endif // HAVE_ARM_CRC32 +#if HAVE_NEON_DOTPROD + if (hwcap & AOM_AARCH64_HWCAP_ASIMDDP) flags |= HAS_NEON_DOTPROD; +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (hwcap2 & AOM_AARCH64_HWCAP2_I8MM) flags |= HAS_NEON_I8MM; +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (hwcap & AOM_AARCH64_HWCAP_SVE) flags |= HAS_SVE; +#endif // HAVE_SVE + return flags; +} + +#elif defined(__Fuchsia__) // end __linux__ + +#include +#include + +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282. +#ifndef ZX_ARM64_FEATURE_ISA_I8MM +#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19)) +#endif +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083. +#ifndef ZX_ARM64_FEATURE_ISA_SVE +#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20)) +#endif + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + uint32_t features; + zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status != ZX_OK) return flags; +#if HAVE_ARM_CRC32 + if (features & ZX_ARM64_FEATURE_ISA_CRC32) flags |= HAS_ARM_CRC32; +#endif // HAVE_ARM_CRC32 +#if HAVE_NEON_DOTPROD + if (features & ZX_ARM64_FEATURE_ISA_DP) flags |= HAS_NEON_DOTPROD; +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (features & ZX_ARM64_FEATURE_ISA_I8MM) flags |= HAS_NEON_I8MM; +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (features & ZX_ARM64_FEATURE_ISA_SVE) flags |= HAS_SVE; +#endif // HAVE_SVE + return flags; +} + +#else // end __Fuchsia__ +#error \ + "Runtime CPU detection selected, but no CPU detection method " \ +"available for your platform. Rerun cmake with -DCONFIG_RUNTIME_CPU_DETECT=0." +#endif + +int aom_arm_cpu_caps(void) { + int flags = 0; + if (!arm_cpu_env_flags(&flags)) { + flags = arm_get_cpu_caps() & arm_cpu_env_mask(); + } + + // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available. + if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_NEON_I8MM; + + // Restrict flags: SVE assumes that FEAT_{DotProd,I8MM} are available. + if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_SVE; + if (!(flags & HAS_NEON_I8MM)) flags &= ~HAS_SVE; + + return flags; +} diff --git a/third_party/aom/aom_ports/aom_once.h b/third_party/aom/aom_ports/aom_once.h new file mode 100644 index 0000000000..680120feea --- /dev/null +++ b/third_party/aom/aom_ports/aom_once.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_AOM_ONCE_H_ +#define AOM_AOM_PORTS_AOM_ONCE_H_ + +#include "config/aom_config.h" + +/* Implement a function wrapper to guarantee initialization + * thread-safety for library singletons. + * + * NOTE: This function uses static locks, and can only be + * used with one common argument per compilation unit. So + * + * file1.c: + * aom_once(foo); + * ... + * aom_once(foo); + * + * file2.c: + * aom_once(bar); + * + * will ensure foo() and bar() are each called only once, but in + * + * file1.c: + * aom_once(foo); + * aom_once(bar): + * + * bar() will never be called because the lock is used up + * by the call to foo(). + */ + +#if CONFIG_MULTITHREAD && defined(_WIN32) +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#include +/* Declare a per-compilation-unit state variable to track the progress + * of calling func() only once. This must be at global scope because + * local initializers are not thread-safe in MSVC prior to Visual + * Studio 2015. + */ +static INIT_ONCE aom_init_once = INIT_ONCE_STATIC_INIT; + +static void aom_once(void (*func)(void)) { + BOOL pending; + InitOnceBeginInitialize(&aom_init_once, 0, &pending, NULL); + if (!pending) { + // Initialization has already completed. + return; + } + func(); + InitOnceComplete(&aom_init_once, 0, NULL); +} + +#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H +#include +static void aom_once(void (*func)(void)) { + static pthread_once_t lock = PTHREAD_ONCE_INIT; + pthread_once(&lock, func); +} + +#else +/* Default version that performs no synchronization. */ + +static void aom_once(void (*func)(void)) { + static volatile int done; + + if (!done) { + func(); + done = 1; + } +} +#endif + +#endif // AOM_AOM_PORTS_AOM_ONCE_H_ diff --git a/third_party/aom/aom_ports/aom_ports.cmake b/third_party/aom/aom_ports/aom_ports.cmake new file mode 100644 index 0000000000..8fd2ffd078 --- /dev/null +++ b/third_party/aom/aom_ports/aom_ports.cmake @@ -0,0 +1,96 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
+# +if(AOM_AOM_PORTS_AOM_PORTS_CMAKE_) + return() +endif() # AOM_AOM_PORTS_AOM_PORTS_CMAKE_ +set(AOM_AOM_PORTS_AOM_PORTS_CMAKE_ 1) + +list(APPEND AOM_PORTS_INCLUDES "${AOM_ROOT}/aom_ports/aom_once.h" + "${AOM_ROOT}/aom_ports/aom_timer.h" "${AOM_ROOT}/aom_ports/bitops.h" + "${AOM_ROOT}/aom_ports/emmintrin_compat.h" + "${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h" + "${AOM_ROOT}/aom_ports/mem_ops_aligned.h" + "${AOM_ROOT}/aom_ports/msvc.h" "${AOM_ROOT}/aom_ports/sanitizer.h") + +list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm") + +list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm") + +list(APPEND AOM_PORTS_SOURCES_AARCH32 + "${AOM_ROOT}/aom_ports/aarch32_cpudetect.c") +list(APPEND AOM_PORTS_SOURCES_AARCH64 + "${AOM_ROOT}/aom_ports/aarch64_cpudetect.c") + +if(CONFIG_RUNTIME_CPU_DETECT AND ANDROID_NDK) + include_directories(${ANDROID_NDK}/sources/android/cpufeatures) + list(APPEND AOM_PORTS_SOURCES_ARM + "${ANDROID_NDK}/sources/android/cpufeatures/cpu-features.c") +endif() + +list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h" + "${AOM_ROOT}/aom_ports/ppc_cpudetect.c") + +# For arm and x86 targets: +# +# * Creates the aom_ports build target, adds the includes in aom_ports to the +# target, and makes libaom depend on it. +# +# Otherwise: +# +# * Adds the includes in aom_ports to the libaom target. +# +# For all target platforms: +# +# * The libaom target must exist before this function is called. +function(setup_aom_ports_targets) + if(XCODE AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") + add_asm_library("aom_ports" "AOM_PORTS_ASM_X86") + # Xcode is the only one + set(aom_ports_is_embedded 1) + set(aom_ports_has_symbols 1) + elseif(WIN32 AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") + add_asm_library("aom_ports" "AOM_PORTS_ASM_X86") + set(aom_ports_has_symbols 1) + elseif("${AOM_TARGET_CPU}" STREQUAL "arm64") + add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH64}) + set(aom_ports_has_symbols 1) + elseif("${AOM_TARGET_CPU}" MATCHES "arm") + add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_AARCH32}) + set(aom_ports_has_symbols 1) + elseif("${AOM_TARGET_CPU}" MATCHES "ppc") + add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC}) + set(aom_ports_has_symbols 1) + endif() + + if("${AOM_TARGET_CPU}" MATCHES "arm|ppc") + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + endif() + + # Note AOM_PORTS_INCLUDES_X86 are not added to the aom_ports, aom or + # aom_static targets to avoid compilation issues in projects that enable ASM + # language support in project(). These sources were never included in + # libaom_srcs.*; if it becomes necessary for a particular generator another + # method should be used. + if(aom_ports_has_symbols) + if(NOT aom_ports_is_embedded) + target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES}) + endif() + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) + else() + target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES}) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES}) + endif() + endif() +endfunction() diff --git a/third_party/aom/aom_ports/aom_timer.h b/third_party/aom/aom_ports/aom_timer.h new file mode 100644 index 0000000000..642c5a08ba --- /dev/null +++ b/third_party/aom/aom_ports/aom_timer.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_AOM_TIMER_H_ +#define AOM_AOM_PORTS_AOM_TIMER_H_ + +#include "config/aom_config.h" + +#if CONFIG_OS_SUPPORT + +#include +#include + +#if defined(_WIN32) +/* + * Win32 specific includes + */ +#undef NOMINMAX +#define NOMINMAX +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#include +#else +/* + * POSIX specific includes + */ +#include + +/* timersub is not provided by msys at this time. */ +#ifndef timersub +#define timersub(a, b, result) \ + do { \ + (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ + (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ + if ((result)->tv_usec < 0) { \ + --(result)->tv_sec; \ + (result)->tv_usec += 1000000; \ + } \ + } while (0) +#endif +#endif + +struct aom_usec_timer { +#if defined(_WIN32) + LARGE_INTEGER begin, end; +#else + struct timeval begin, end; +#endif +}; + +static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) { +#if defined(_WIN32) + QueryPerformanceCounter(&t->begin); +#else + gettimeofday(&t->begin, NULL); +#endif +} + +static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) { +#if defined(_WIN32) + QueryPerformanceCounter(&t->end); +#else + gettimeofday(&t->end, NULL); +#endif +} + +static INLINE int64_t aom_usec_timer_elapsed(struct aom_usec_timer *t) { +#if defined(_WIN32) + LARGE_INTEGER freq, diff; + + diff.QuadPart = t->end.QuadPart - t->begin.QuadPart; + + QueryPerformanceFrequency(&freq); + return diff.QuadPart * 1000000 / freq.QuadPart; +#else + struct timeval diff; + + timersub(&t->end, &t->begin, &diff); + return ((int64_t)diff.tv_sec) * 1000000 + diff.tv_usec; +#endif +} + +#else /* CONFIG_OS_SUPPORT = 0*/ + +/* Empty timer functions if CONFIG_OS_SUPPORT = 0 */ +#ifndef timersub +#define timersub(a, b, result) +#endif + +struct aom_usec_timer { + void *dummy; +}; + +static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) { (void)t; } + +static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) { (void)t; } + +static INLINE int aom_usec_timer_elapsed(struct aom_usec_timer *t) { + (void)t; + return 0; +} + +#endif /* CONFIG_OS_SUPPORT */ + +#endif // AOM_AOM_PORTS_AOM_TIMER_H_ diff --git a/third_party/aom/aom_ports/arm.h b/third_party/aom/aom_ports/arm.h new file mode 100644 index 0000000000..853741d19a --- /dev/null +++ b/third_party/aom/aom_ports/arm.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_ARM_H_ +#define AOM_AOM_PORTS_ARM_H_ +#include + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Armv7-A optional Neon instructions, mandatory from Armv8.0-A. +#define HAS_NEON (1 << 0) +// Armv8.0-A optional CRC32 instructions, mandatory from Armv8.1-A. 
+#define HAS_ARM_CRC32 (1 << 1)
+// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A.
+#define HAS_NEON_DOTPROD (1 << 2)
+// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A.
+#define HAS_NEON_I8MM (1 << 3)
+// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A.
+#define HAS_SVE (1 << 4)
+
+int aom_arm_cpu_caps(void);
+
+// Earlier gcc compilers have issues with some neon intrinsics
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 4 && \
+    __GNUC_MINOR__ <= 6
+#define AOM_INCOMPATIBLE_GCC
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_PORTS_ARM_H_
diff --git a/third_party/aom/aom_ports/arm_cpudetect.h b/third_party/aom/aom_ports/arm_cpudetect.h
new file mode 100644
index 0000000000..33c2d1bb6a
--- /dev/null
+++ b/third_party/aom/aom_ports/arm_cpudetect.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/arm.h"
+#include "config/aom_config.h"
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(_WIN32)
+#undef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#undef WIN32_EXTRA_LEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+#endif
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#if defined(__ANDROID__) && (__ANDROID_API__ < 18)
+#define ANDROID_USE_CPU_FEATURES_LIB 1
+// Use getauxval() when targeting (64-bit) Android with API level >= 18.
+// getauxval() is supported since Android API level 18 (Android 4.3.)
+// First Android version with 64-bit support was Android 5.x (API level 21).
+#include <cpu-features.h>
+#endif
+
+static bool arm_cpu_env_flags(int *flags) {
+  const char *env = getenv("AOM_SIMD_CAPS");
+  if (env && *env) {
+    *flags = (int)strtol(env, NULL, 0);
+    return true;
+  }
+  return false;
+}
+
+static int arm_cpu_env_mask(void) {
+  const char *env = getenv("AOM_SIMD_CAPS_MASK");
+  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
diff --git a/third_party/aom/aom_ports/bitops.h b/third_party/aom/aom_ports/bitops.h
new file mode 100644
index 0000000000..0795855083
--- /dev/null
+++ b/third_party/aom/aom_ports/bitops.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_BITOPS_H_
+#define AOM_AOM_PORTS_BITOPS_H_
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_ports/msvc.h"
+#include "config/aom_config.h"
+
+#ifdef _MSC_VER
+#if defined(_M_X64) || defined(_M_IX86) || defined(_M_ARM64) || defined(_M_ARM)
+#include <intrin.h>
+#define USE_MSC_INTRINSICS
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// get_msb:
+// Returns (int)floor(log2(n)). n must be > 0.
+// These versions of get_msb() are only valid when n != 0 because all
+// of the optimized versions are undefined when n == 0:
+
+// GCC compiler: https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+// MSVC: https://learn.microsoft.com/en-us/cpp/intrinsics/compiler-intrinsics
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_msb(unsigned int n) {
+  assert(n != 0);
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanReverse)
+
+static INLINE int get_msb(unsigned int n) {
+  unsigned long first_set_bit;
+  assert(n != 0);
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#else
+static INLINE int get_msb(unsigned int n) {
+  int log = 0;
+  unsigned int value = n;
+
+  assert(n != 0);
+
+  for (int shift = 16; shift != 0; shift >>= 1) {
+    const unsigned int x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int aom_clzll(uint64_t n) { return __builtin_clzll(n); }
+#elif defined(USE_MSC_INTRINSICS)
+#if defined(_M_X64) || defined(_M_ARM64)
+#pragma intrinsic(_BitScanReverse64)
+#endif
+
+static INLINE int aom_clzll(uint64_t n) {
+  assert(n != 0);
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+#if defined(_M_X64) || defined(_M_ARM64)
+  const unsigned char bit_set =
+      _BitScanReverse64(&first_set_bit, (unsigned __int64)n);
+#else  // !(defined(_M_X64) || defined(_M_ARM64))
+  const unsigned long n_hi = (unsigned long)(n >> 32);  // NOLINT(runtime/int)
+  if (n_hi != 0) {
+    const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi);
+    assert(bit_set != 0);
+    (void)bit_set;
+    return 31 ^ (int)first_set_bit;
+  }
+  const unsigned char bit_set =
+      _BitScanReverse(&first_set_bit, (unsigned long)n);  // NOLINT(runtime/int)
+#endif
+  assert(bit_set != 0);
+  (void)bit_set;
+  return 63 ^ (int)first_set_bit;
+}
+#undef USE_MSC_INTRINSICS
+#else
+static INLINE int aom_clzll(uint64_t n) {
+  assert(n != 0);
+
+  int res = 0;
+  uint64_t high_bit = 1ULL << 63;
+  while (!(n & high_bit)) {
+    res++;
+    n <<= 1;
+  }
+  return res;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_PORTS_BITOPS_H_
diff --git a/third_party/aom/aom_ports/emmintrin_compat.h b/third_party/aom/aom_ports/emmintrin_compat.h
new file mode 100644
index 0000000000..85d218a3d2
--- /dev/null
+++ b/third_party/aom/aom_ports/emmintrin_compat.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ +#define AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ + +#if defined(__GNUC__) && __GNUC__ < 4 +/* From emmintrin.h (gcc 4.5.3) */ +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. */ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_ps(__m128d __A) { + return (__m128)__A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_si128(__m128d __A) { + return (__m128i)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_pd(__m128 __A) { + return (__m128d)__A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_si128(__m128 __A) { + return (__m128i)__A; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_ps(__m128i __A) { + return (__m128)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_pd(__m128i __A) { + return (__m128d)__A; +} +#endif + +#endif // AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ diff --git a/third_party/aom/aom_ports/float.asm b/third_party/aom/aom_ports/float.asm new file mode 100644 index 0000000000..abff60a7a4 --- /dev/null +++ b/third_party/aom/aom_ports/float.asm @@ -0,0 +1,33 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + +%include "aom_ports/x86_abi_support.asm" + +section .text +%if LIBAOM_YASM_WIN64 +globalsym(aom_winx64_fldcw) +sym(aom_winx64_fldcw): + sub rsp, 8 + mov [rsp], rcx ; win x64 specific + fldcw [rsp] + add rsp, 8 + ret + + +globalsym(aom_winx64_fstcw) +sym(aom_winx64_fstcw): + sub rsp, 8 + fstcw [rsp] + mov rax, [rsp] + add rsp, 8 + ret +%endif diff --git a/third_party/aom/aom_ports/mem.h b/third_party/aom/aom_ports/mem.h new file mode 100644 index 0000000000..a70ce825b1 --- /dev/null +++ b/third_party/aom/aom_ports/mem.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_MEM_H_ +#define AOM_AOM_PORTS_MEM_H_ + +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C) +#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n))) +#elif defined(_MSC_VER) +#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val +#else +#warning No alignment directives known for this compiler. 
+#define DECLARE_ALIGNED(n, typ, val) typ val +#endif + +#if HAVE_NEON && defined(_MSC_VER) +#define __builtin_prefetch(x) +#endif + +/* Shift down with rounding for use when n >= 0. Usually value >= 0, but the + * macro can be used with a negative value if the direction of rounding is + * acceptable. + */ +#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n)) + +/* Shift down with rounding for signed integers, for use when n >= 0 */ +#define ROUND_POWER_OF_TWO_SIGNED(value, n) \ + (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \ + : ROUND_POWER_OF_TWO((value), (n))) + +/* Shift down with rounding for use when n >= 0 (64-bit value). Usually + * value >= 0, but the macro can be used with a negative value if the direction + * of rounding is acceptable. + */ +#define ROUND_POWER_OF_TWO_64(value, n) \ + (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n)) +/* Shift down with rounding for signed integers, for use when n >= 0 (64-bit + * value) + */ +#define ROUND_POWER_OF_TWO_SIGNED_64(value, n) \ + (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \ + : ROUND_POWER_OF_TWO_64((value), (n))) + +/* Shift down with ceil() for use when n >= 0 and value >= 0.*/ +#define CEIL_POWER_OF_TWO(value, n) (((value) + (1 << (n)) - 1) >> (n)) + +/* shift right or left depending on sign of n */ +#define RIGHT_SIGNED_SHIFT(value, n) \ + ((n) < 0 ? ((value) << (-(n))) : ((value) >> (n))) + +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) +#define ALIGN_POWER_OF_TWO_UNSIGNED(value, n) \ + (((value) + ((1u << (n)) - 1)) & ~((1u << (n)) - 1)) + +#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y)) + +#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) +#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) + +/*!\brief force enum to be unsigned 1 byte*/ +#define UENUM1BYTE(enumvar) \ + ; \ + typedef uint8_t enumvar + +/*!\brief force enum to be signed 1 byte*/ +#define SENUM1BYTE(enumvar) \ + ; \ + typedef int8_t enumvar + +/*!\brief force enum to be unsigned 2 byte*/ +#define UENUM2BYTE(enumvar) \ + ; \ + typedef uint16_t enumvar + +/*!\brief force enum to be signed 2 byte*/ +#define SENUM2BYTE(enumvar) \ + ; \ + typedef int16_t enumvar + +/*!\brief force enum to be unsigned 4 byte*/ +#define UENUM4BYTE(enumvar) \ + ; \ + typedef uint32_t enumvar + +/*!\brief force enum to be unsigned 4 byte*/ +#define SENUM4BYTE(enumvar) \ + ; \ + typedef int32_t enumvar + +#endif // AOM_AOM_PORTS_MEM_H_ diff --git a/third_party/aom/aom_ports/mem_ops.h b/third_party/aom/aom_ports/mem_ops.h new file mode 100644 index 0000000000..2b5bc0f0fb --- /dev/null +++ b/third_party/aom/aom_ports/mem_ops.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_MEM_OPS_H_ +#define AOM_AOM_PORTS_MEM_OPS_H_ + +/* \file + * \brief Provides portable memory access primitives + * + * This function provides portable primitives for getting and setting of + * signed and unsigned integers in 16, 24, and 32 bit sizes. 
The operations + * can be performed on unaligned data regardless of hardware support for + * unaligned accesses. + * + * The type used to pass the integral values may be changed by defining + * MEM_VALUE_T with the appropriate type. The type given must be an integral + * numeric type. + * + * The actual functions instantiated have the MEM_VALUE_T type name pasted + * on to the symbol name. This allows the developer to instantiate these + * operations for multiple types within the same translation unit. This is + * of somewhat questionable utility, but the capability exists nonetheless. + * Users not making use of this functionality should call the functions + * without the type name appended, and the preprocessor will take care of + * it. + * + * NOTE: This code is not supported on platforms where char > 1 octet ATM. + */ + +#ifndef MAU_T +/* Minimum Access Unit for this target */ +#define MAU_T unsigned char +#endif + +#ifndef MEM_VALUE_T +#define MEM_VALUE_T int +#endif + +#undef MEM_VALUE_T_SZ_BITS +#define MEM_VALUE_T_SZ_BITS (sizeof(MEM_VALUE_T) << 3) + +#undef mem_ops_wrap_symbol +#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T) +#undef mem_ops_wrap_symbol2 +#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ) +#undef mem_ops_wrap_symbol3 +#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ + +/* + * Include aligned access routines + */ +#define INCLUDED_BY_MEM_OPS_H +#include "mem_ops_aligned.h" +#undef INCLUDED_BY_MEM_OPS_H + +#undef mem_get_be16 +#define mem_get_be16 mem_ops_wrap_symbol(mem_get_be16) +static unsigned MEM_VALUE_T mem_get_be16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 8; + val |= mem[1]; + return val; +} + +#undef mem_get_be24 +#define mem_get_be24 mem_ops_wrap_symbol(mem_get_be24) +static unsigned MEM_VALUE_T mem_get_be24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 16; + val |= mem[1] << 8; + val |= mem[2]; + return val; +} + +#undef mem_get_be32 +#define mem_get_be32 mem_ops_wrap_symbol(mem_get_be32) +static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[0]) << 24; + val |= mem[1] << 16; + val |= mem[2] << 8; + val |= mem[3]; + return val; +} + +#undef mem_get_le16 +#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16) +static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le24 +#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24) +static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le32 +#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32) +static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[3]) << 24; + val |= mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#define mem_get_s_generic(end, sz) \ + static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) { \ + const MAU_T *mem = (const MAU_T *)vmem; \ + signed MEM_VALUE_T val = mem_get_##end##sz(mem); \ + return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> 
(MEM_VALUE_T_SZ_BITS - sz); \ + } + +/* clang-format off */ +#undef mem_get_sbe16 +#define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16) +mem_get_s_generic(be, 16) + +#undef mem_get_sbe24 +#define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24) +mem_get_s_generic(be, 24) + +#undef mem_get_sbe32 +#define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32) +mem_get_s_generic(be, 32) + +#undef mem_get_sle16 +#define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16) +mem_get_s_generic(le, 16) + +#undef mem_get_sle24 +#define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24) +mem_get_s_generic(le, 24) + +#undef mem_get_sle32 +#define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32) +mem_get_s_generic(le, 32) + +#undef mem_put_be16 +#define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) +static AOM_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 8) & 0xff); + mem[1] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be24 +#define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) +static AOM_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 16) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be32 +#define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) +static AOM_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 24) & 0xff); + mem[1] = (MAU_T)((val >> 16) & 0xff); + mem[2] = (MAU_T)((val >> 8) & 0xff); + mem[3] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_le16 +#define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) +static AOM_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); +} + +#undef mem_put_le24 +#define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) +static AOM_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); +} + +#undef mem_put_le32 +#define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) +static AOM_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); + mem[3] = (MAU_T)((val >> 24) & 0xff); +} +/* clang-format on */ +#endif // AOM_AOM_PORTS_MEM_OPS_H_ diff --git a/third_party/aom/aom_ports/mem_ops_aligned.h b/third_party/aom/aom_ports/mem_ops_aligned.h new file mode 100644 index 0000000000..37c3675318 --- /dev/null +++ b/third_party/aom/aom_ports/mem_ops_aligned.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ +#define AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ + +#include "aom/aom_integer.h" + +/* \file + * \brief Provides portable memory access primitives for operating on aligned + * data + * + * This file is split from mem_ops.h for easier maintenance. See mem_ops.h + * for a more detailed description of these primitives. + */ +#ifndef INCLUDED_BY_MEM_OPS_H +#error Include mem_ops.h, not mem_ops_aligned.h directly. +#endif + +/* Architectures that provide instructions for doing this byte swapping + * could redefine these macros. + */ +#define swap_endian_16(val, raw) \ + do { \ + val = (uint16_t)(((raw >> 8) & 0x00ff) | ((raw << 8) & 0xff00)); \ + } while (0) +#define swap_endian_32(val, raw) \ + do { \ + val = ((raw >> 24) & 0x000000ff) | ((raw >> 8) & 0x0000ff00) | \ + ((raw << 8) & 0x00ff0000) | ((raw << 24) & 0xff000000); \ + } while (0) +#define swap_endian_16_se(val, raw) \ + do { \ + swap_endian_16(val, raw); \ + val = ((val << 16) >> 16); \ + } while (0) +#define swap_endian_32_se(val, raw) swap_endian_32(val, raw) + +#define mem_get_ne_aligned_generic(end, sz) \ + static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + return *mem; \ + } + +#define mem_get_sne_aligned_generic(end, sz) \ + static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + return *mem; \ + } + +#define mem_get_se_aligned_generic(end, sz) \ + static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz(val, raw); \ + return val; \ + } + +#define mem_get_sse_aligned_generic(end, sz) \ + static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz##_se(val, raw); \ + return val; \ + } + +#define mem_put_ne_aligned_generic(end, sz) \ + static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem; \ + *mem = (uint##sz##_t)val; \ + } + +#define mem_put_se_aligned_generic(end, sz) \ + static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem, raw; \ + swap_endian_##sz(raw, val); \ + *mem = (uint##sz##_t)raw; \ + } + +#include "config/aom_config.h" + +#if CONFIG_BIG_ENDIAN +#define mem_get_be_aligned_generic(sz) mem_get_ne_aligned_generic(be, sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be, sz) +#define mem_get_le_aligned_generic(sz) mem_get_se_aligned_generic(le, sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sse_aligned_generic(le, sz) +#define mem_put_be_aligned_generic(sz) mem_put_ne_aligned_generic(be, sz) +#define mem_put_le_aligned_generic(sz) mem_put_se_aligned_generic(le, sz) +#else +#define mem_get_be_aligned_generic(sz) mem_get_se_aligned_generic(be, sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sse_aligned_generic(be, sz) +#define mem_get_le_aligned_generic(sz) mem_get_ne_aligned_generic(le, sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sne_aligned_generic(le, sz) +#define mem_put_be_aligned_generic(sz) mem_put_se_aligned_generic(be, sz) +#define mem_put_le_aligned_generic(sz) 
mem_put_ne_aligned_generic(le, sz) +#endif + +/* clang-format off */ +#undef mem_get_be16_aligned +#define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned) +mem_get_be_aligned_generic(16) + +#undef mem_get_be32_aligned +#define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned) +mem_get_be_aligned_generic(32) + +#undef mem_get_le16_aligned +#define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned) +mem_get_le_aligned_generic(16) + +#undef mem_get_le32_aligned +#define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned) +mem_get_le_aligned_generic(32) + +#undef mem_get_sbe16_aligned +#define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned) +mem_get_sbe_aligned_generic(16) + +#undef mem_get_sbe32_aligned +#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned) +mem_get_sbe_aligned_generic(32) + +#undef mem_get_sle16_aligned +#define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned) +mem_get_sle_aligned_generic(16) + +#undef mem_get_sle32_aligned +#define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned) +mem_get_sle_aligned_generic(32) + +#undef mem_put_be16_aligned +#define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned) +mem_put_be_aligned_generic(16) + +#undef mem_put_be32_aligned +#define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned) +mem_put_be_aligned_generic(32) + +#undef mem_put_le16_aligned +#define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned) +mem_put_le_aligned_generic(16) + +#undef mem_put_le32_aligned +#define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned) +mem_put_le_aligned_generic(32) + +#undef mem_get_ne_aligned_generic +#undef mem_get_se_aligned_generic +#undef mem_get_sne_aligned_generic +#undef mem_get_sse_aligned_generic +#undef mem_put_ne_aligned_generic +#undef mem_put_se_aligned_generic +#undef swap_endian_16 +#undef swap_endian_32 +#undef swap_endian_16_se +#undef swap_endian_32_se +/* clang-format on */ + +#endif // AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ diff --git a/third_party/aom/aom_ports/msvc.h b/third_party/aom/aom_ports/msvc.h new file mode 100644 index 0000000000..e78e605f2f --- /dev/null +++ b/third_party/aom/aom_ports/msvc.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_PORTS_MSVC_H_ +#define AOM_AOM_PORTS_MSVC_H_ +#ifdef _MSC_VER + +#include "config/aom_config.h" + +#if _MSC_VER < 1900 // VS2015 provides snprintf +#define snprintf _snprintf +#endif // _MSC_VER < 1900 + +#if _MSC_VER < 1800 // VS2013 provides round +#include +static INLINE double round(double x) { + if (x < 0) + return ceil(x - 0.5); + else + return floor(x + 0.5); +} + +static INLINE float roundf(float x) { + if (x < 0) + return (float)ceil(x - 0.5f); + else + return (float)floor(x + 0.5f); +} + +static INLINE long lroundf(float x) { + if (x < 0) + return (long)(x - 0.5f); + else + return (long)(x + 0.5f); +} +#endif // _MSC_VER < 1800 + +#if HAVE_AVX +#include +// Note: +// _mm256_insert_epi16 intrinsics is available from vs2017. +// We define this macro for vs2015 and earlier. The +// intrinsics used here are in vs2015 document: +// https://msdn.microsoft.com/en-us/library/hh977022.aspx +// Input parameters: +// a: __m256i, +// d: int16_t, +// indx: imm8 (0 - 15) +#if _MSC_VER <= 1900 +#define _mm256_insert_epi16(a, d, indx) \ + _mm256_insertf128_si256( \ + a, \ + _mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \ + indx >> 3) + +static INLINE int _mm256_extract_epi32(__m256i a, const int i) { + return a.m256i_i32[i & 7]; +} +static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) { + __m256i c = a; + c.m256i_i32[i & 7] = b; + return c; +} +#endif // _MSC_VER <= 1900 +#endif // HAVE_AVX +#endif // _MSC_VER +#endif // AOM_AOM_PORTS_MSVC_H_ diff --git a/third_party/aom/aom_ports/ppc.h b/third_party/aom/aom_ports/ppc.h new file mode 100644 index 0000000000..3159bda682 --- /dev/null +++ b/third_party/aom/aom_ports/ppc.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_PPC_H_ +#define AOM_AOM_PORTS_PPC_H_ +#include + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_VSX 0x01 + +int ppc_simd_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_PPC_H_ diff --git a/third_party/aom/aom_ports/ppc_cpudetect.c b/third_party/aom/aom_ports/ppc_cpudetect.c new file mode 100644 index 0000000000..ce4d5ae231 --- /dev/null +++ b/third_party/aom/aom_ports/ppc_cpudetect.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/ppc.h" + +#if CONFIG_RUNTIME_CPU_DETECT +static int cpu_env_flags(int *flags) { + char *env; + env = getenv("AOM_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 0; + } + *flags = 0; + return -1; +} + +static int cpu_env_mask(void) { + char *env; + env = getenv("AOM_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} + +int ppc_simd_caps(void) { + int flags; + int mask; + int fd; + ssize_t count; + unsigned int i; + uint64_t buf[64]; + + // If AOM_SIMD_CAPS_MASK is set then allow only those capabilities. + if (!cpu_env_flags(&flags)) { + return flags; + } + + mask = cpu_env_mask(); + + fd = open("/proc/self/auxv", O_RDONLY); + if (fd < 0) { + return 0; + } + + while ((count = read(fd, buf, sizeof(buf))) > 0) { + for (i = 0; i < (count / sizeof(*buf)); i += 2) { + if (buf[i] == AT_HWCAP) { +#if HAVE_VSX + if (buf[i + 1] & PPC_FEATURE_HAS_VSX) { + flags |= HAS_VSX; + } +#endif // HAVE_VSX + goto out_close; + } else if (buf[i] == AT_NULL) { + goto out_close; + } + } + } +out_close: + close(fd); + return flags & mask; +} +#else +// If there is no RTCD the function pointers are not used and can not be +// changed. +int ppc_simd_caps(void) { return 0; } +#endif // CONFIG_RUNTIME_CPU_DETECT diff --git a/third_party/aom/aom_ports/sanitizer.h b/third_party/aom/aom_ports/sanitizer.h new file mode 100644 index 0000000000..1dd8eb4cf4 --- /dev/null +++ b/third_party/aom/aom_ports/sanitizer.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_SANITIZER_H_ +#define AOM_AOM_PORTS_SANITIZER_H_ + +// AddressSanitizer support. + +// Define AOM_ADDRESS_SANITIZER if AddressSanitizer is used. +// Clang. +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define AOM_ADDRESS_SANITIZER 1 +#endif +#endif // defined(__has_feature) +// GCC. +#if defined(__SANITIZE_ADDRESS__) +#define AOM_ADDRESS_SANITIZER 1 +#endif // defined(__SANITIZE_ADDRESS__) + +// Define the macros for AddressSanitizer manual memory poisoning. See +// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning. +#if defined(AOM_ADDRESS_SANITIZER) +#include +#else +#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) +#endif + +#endif // AOM_AOM_PORTS_SANITIZER_H_ diff --git a/third_party/aom/aom_ports/x86.h b/third_party/aom/aom_ports/x86.h new file mode 100644 index 0000000000..c089984085 --- /dev/null +++ b/third_party/aom/aom_ports/x86.h @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_X86_H_ +#define AOM_AOM_PORTS_X86_H_ +#include + +#if defined(_MSC_VER) +#include /* For __cpuidex, __rdtsc */ +#endif + +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + AOM_CPU_UNKNOWN = -1, + AOM_CPU_AMD, + AOM_CPU_AMD_OLD, + AOM_CPU_CENTAUR, + AOM_CPU_CYRIX, + AOM_CPU_INTEL, + AOM_CPU_NEXGEN, + AOM_CPU_NSC, + AOM_CPU_RISE, + AOM_CPU_SIS, + AOM_CPU_TRANSMETA, + AOM_CPU_TRANSMETA_OLD, + AOM_CPU_UMC, + AOM_CPU_VIA, + + AOM_CPU_LAST +} aom_cpu_t; + +#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__) +#if AOM_ARCH_X86_64 +#define cpuid(func, func2, ax, bx, cx, dx) \ + __asm__ __volatile__("cpuid \n\t" \ + : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \ + : "a"(func), "c"(func2)) +#else +#define cpuid(func, func2, ax, bx, cx, dx) \ + __asm__ __volatile__( \ + "mov %%ebx, %%edi \n\t" \ + "cpuid \n\t" \ + "xchg %%edi, %%ebx \n\t" \ + : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ + : "a"(func), "c"(func2)) +#endif +#elif defined(__SUNPRO_C) || \ + defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/ +#if AOM_ARCH_X86_64 +#define cpuid(func, func2, ax, bx, cx, dx) \ + asm volatile( \ + "xchg %rsi, %rbx \n\t" \ + "cpuid \n\t" \ + "movl %ebx, %edi \n\t" \ + "xchg %rsi, %rbx \n\t" \ + : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ + : "a"(func), "c"(func2)) +#else +#define cpuid(func, func2, ax, bx, cx, dx) \ + asm volatile( \ + "pushl %ebx \n\t" \ + "cpuid \n\t" \ + "movl %ebx, %edi \n\t" \ + "popl %ebx \n\t" \ + : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ + : "a"(func), "c"(func2)) +#endif +#else /* end __SUNPRO__ */ +#if AOM_ARCH_X86_64 +#if defined(_MSC_VER) && _MSC_VER > 1500 +#define cpuid(func, func2, a, b, c, d) \ + do { \ + int regs[4]; \ + __cpuidex(regs, func, func2); \ + a = regs[0]; \ + b = regs[1]; \ + c = regs[2]; \ + d = regs[3]; \ + } while (0) +#else +#define cpuid(func, func2, a, b, c, d) \ + do { \ + int regs[4]; \ + __cpuid(regs, func); \ + a = regs[0]; \ + b = regs[1]; \ + c = regs[2]; \ + d = regs[3]; \ + } while (0) +#endif +#else +/* clang-format off */ +#define cpuid(func, func2, a, b, c, d) \ + __asm mov eax, func \ + __asm mov ecx, func2 \ + __asm cpuid \ + __asm mov a, eax \ + __asm mov b, ebx \ + __asm mov c, ecx \ + __asm mov d, edx +#endif +/* clang-format on */ +#endif /* end others */ + +// NaCl has no support for xgetbv or the raw opcode. +#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__)) +static INLINE uint64_t xgetbv(void) { + const uint32_t ecx = 0; + uint32_t eax, edx; + // Use the raw opcode for xgetbv for compatibility with older toolchains. + __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n" + : "=a"(eax), "=d"(edx) + : "c"(ecx)); + return ((uint64_t)edx << 32) | eax; +} +#elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \ + _MSC_FULL_VER >= 160040219 // >= VS2010 SP1 +#include +#define xgetbv() _xgetbv(0) +#elif defined(_MSC_VER) && defined(_M_IX86) +static INLINE uint64_t xgetbv(void) { + uint32_t eax_, edx_; + __asm { + xor ecx, ecx // ecx = 0 + // Use the raw opcode for xgetbv for compatibility with older toolchains. + __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0 + mov eax_, eax + mov edx_, edx + } + return ((uint64_t)edx_ << 32) | eax_; +} +#else +#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains. 
+#endif + +#if defined(_MSC_VER) && _MSC_VER >= 1700 +#undef NOMINMAX +#define NOMINMAX +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#include +#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP) +#define getenv(x) NULL +#endif +#endif + +#define HAS_MMX 0x01 +#define HAS_SSE 0x02 +#define HAS_SSE2 0x04 +#define HAS_SSE3 0x08 +#define HAS_SSSE3 0x10 +#define HAS_SSE4_1 0x20 +#define HAS_AVX 0x40 +#define HAS_AVX2 0x80 +#define HAS_SSE4_2 0x100 +#ifndef BIT +#define BIT(n) (1u << (n)) +#endif + +static INLINE int x86_simd_caps(void) { + unsigned int flags = 0; + unsigned int mask = ~0u; + unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; + char *env; + + /* See if the CPU capabilities are being overridden by the environment */ + env = getenv("AOM_SIMD_CAPS"); + + if (env && *env) return (int)strtol(env, NULL, 0); + + env = getenv("AOM_SIMD_CAPS_MASK"); + + if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); + + /* Ensure that the CPUID instruction supports extended features */ + cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); + + if (max_cpuid_val < 1) return 0; + + /* Get the standard feature flags */ + cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + + if (reg_edx & BIT(23)) flags |= HAS_MMX; + + if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */ + + if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */ + + if (reg_ecx & BIT(0)) flags |= HAS_SSE3; + + if (reg_ecx & BIT(9)) flags |= HAS_SSSE3; + + if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1; + + if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2; + + // bits 27 (OSXSAVE) & 28 (256-bit AVX) + if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) { + // Check for OS-support of YMM state. Necessary for AVX and AVX2. + if ((xgetbv() & 0x6) == 0x6) { + flags |= HAS_AVX; + + if (max_cpuid_val >= 7) { + /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ + cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + + if (reg_ebx & BIT(5)) flags |= HAS_AVX2; + } + } + } + + (void)reg_eax; // Avoid compiler warning on unused-but-set variable. + + return flags & mask; +} + +// Fine-Grain Measurement Functions +// +// If you are timing a small region of code, access the timestamp counter +// (TSC) via: +// +// unsigned int start = x86_tsc_start(); +// ... +// unsigned int end = x86_tsc_end(); +// unsigned int diff = end - start; +// +// The start/end functions introduce a few more instructions than using +// x86_readtsc directly, but prevent the CPU's out-of-order execution from +// affecting the measurement (by having earlier/later instructions be evaluated +// in the time interval). See the white paper, "How to Benchmark Code +// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by +// Gabriele Paoloni for more information. +// +// If you are timing a large function (CPU time > a couple of seconds), use +// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The +// out-of-order leakage that can occur is minimal compared to total runtime. 
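The comment above describes how the start/end helpers are meant to bracket a measured region. A minimal usage sketch of that pattern (illustrative only, not part of the upstream header; do_work() is a placeholder for the code being measured):

  #include <stdio.h>

  #include "aom_ports/x86.h"

  void do_work(void);  // placeholder: the routine under measurement

  void report_cycles(void) {
    // x86_tsc_start() issues cpuid as a serializing barrier, then reads the
    // TSC; x86_tsc_end() reads the TSC via rdtscp, then issues cpuid, so
    // out-of-order execution does not leak into or out of the interval.
    const unsigned int start = x86_tsc_start();
    do_work();
    const unsigned int end = x86_tsc_end();
    printf("elapsed cycles: %u\n", end - start);
  }

For regions that run longer than a second or two, the 64-bit x86_readtsc64() counter mentioned above is the better fit, since a 32-bit cycle count wraps quickly at GHz clock rates.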
+static INLINE unsigned int x86_readtsc(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tsc; + __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :); + return tsc; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tsc; + asm volatile("rdtsc\n\t" : "=a"(tsc) :); + return tsc; +#else +#if AOM_ARCH_X86_64 + return (unsigned int)__rdtsc(); +#else + __asm rdtsc; +#endif +#endif +} +// 64-bit CPU cycle counter +static INLINE uint64_t x86_readtsc64(void) { +#if defined(__GNUC__) && __GNUC__ + uint32_t hi, lo; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + uint_t hi, lo; + asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#else +#if AOM_ARCH_X86_64 + return (uint64_t)__rdtsc(); +#else + __asm rdtsc; +#endif +#endif +} + +// 32-bit CPU cycle counter with a partial fence against out-of-order execution. +static INLINE unsigned int x86_readtscp(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tscp; + __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tscp; + asm volatile("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(_MSC_VER) + unsigned int ui; + return (unsigned int)__rdtscp(&ui); +#else +#if AOM_ARCH_X86_64 + return (unsigned int)__rdtscp(); +#else + __asm rdtscp; +#endif +#endif +} + +static INLINE unsigned int x86_tsc_start(void) { + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. + (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; + return x86_readtsc(); +} + +static INLINE unsigned int x86_tsc_end(void) { + uint32_t v = x86_readtscp(); + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. 
+ (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; + return v; +} + +#if defined(__GNUC__) && __GNUC__ +#define x86_pause_hint() __asm__ __volatile__("pause \n\t") +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#define x86_pause_hint() asm volatile("pause \n\t") +#else +#if AOM_ARCH_X86_64 +#define x86_pause_hint() _mm_pause(); +#else +#define x86_pause_hint() __asm pause +#endif +#endif + +#if defined(__GNUC__) && __GNUC__ +static void x87_set_control_word(unsigned short mode) { + __asm__ __volatile__("fldcw %0" : : "m"(*&mode)); +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :); + return mode; +} +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +static void x87_set_control_word(unsigned short mode) { + asm volatile("fldcw %0" : : "m"(*&mode)); +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + asm volatile("fstcw %0\n\t" : "=m"(*&mode) :); + return mode; +} +#elif AOM_ARCH_X86_64 +/* No fldcw intrinsics on Windows x64, punt to external asm */ +extern void aom_winx64_fldcw(unsigned short mode); +extern unsigned short aom_winx64_fstcw(void); +#define x87_set_control_word aom_winx64_fldcw +#define x87_get_control_word aom_winx64_fstcw +#else +static void x87_set_control_word(unsigned short mode) { + __asm { fldcw mode } +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + __asm { fstcw mode } + return mode; +} +#endif + +static INLINE unsigned int x87_set_double_precision(void) { + unsigned int mode = x87_get_control_word(); + // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1 + // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf + // 8.1.5.2 Precision Control Field + // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control") + // determine the number of bits used in floating point calculations. To match + // later SSE instructions restrict x87 operations to Double Precision (0x200). + // Precision PC Field + // Single Precision (24-Bits) 00B + // Reserved 01B + // Double Precision (53-Bits) 10B + // Extended Precision (64-Bits) 11B + x87_set_control_word((mode & ~0x300u) | 0x200u); + return mode; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_X86_H_ diff --git a/third_party/aom/aom_ports/x86_abi_support.asm b/third_party/aom/aom_ports/x86_abi_support.asm new file mode 100644 index 0000000000..f1a65f53e5 --- /dev/null +++ b/third_party/aom/aom_ports/x86_abi_support.asm @@ -0,0 +1,416 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "config/aom_config.asm" + +; 32/64 bit compatibility macros +; +; In general, we make the source use 64 bit syntax, then twiddle with it using +; the preprocessor to get the 32 bit syntax on 32 bit platforms. 
+; +%ifidn __OUTPUT_FORMAT__,elf32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,macho32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,win32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,aout +%define ABI_IS_32BIT 1 +%else +%define ABI_IS_32BIT 0 +%endif + +%if ABI_IS_32BIT +%define rax eax +%define rbx ebx +%define rcx ecx +%define rdx edx +%define rsi esi +%define rdi edi +%define rsp esp +%define rbp ebp +%define movsxd mov +%macro movq 2 + %ifidn %1,eax + movd %1,%2 + %elifidn %2,eax + movd %1,%2 + %elifidn %1,ebx + movd %1,%2 + %elifidn %2,ebx + movd %1,%2 + %elifidn %1,ecx + movd %1,%2 + %elifidn %2,ecx + movd %1,%2 + %elifidn %1,edx + movd %1,%2 + %elifidn %2,edx + movd %1,%2 + %elifidn %1,esi + movd %1,%2 + %elifidn %2,esi + movd %1,%2 + %elifidn %1,edi + movd %1,%2 + %elifidn %2,edi + movd %1,%2 + %elifidn %1,esp + movd %1,%2 + %elifidn %2,esp + movd %1,%2 + %elifidn %1,ebp + movd %1,%2 + %elifidn %2,ebp + movd %1,%2 + %else + movq %1,%2 + %endif +%endmacro +%endif + + +; LIBAOM_YASM_WIN64 +; Set LIBAOM_YASM_WIN64 if output is Windows 64bit so the code will work if x64 +; or win64 is defined on the Yasm command line. +%ifidn __OUTPUT_FORMAT__,win64 +%define LIBAOM_YASM_WIN64 1 +%elifidn __OUTPUT_FORMAT__,x64 +%define LIBAOM_YASM_WIN64 1 +%else +%define LIBAOM_YASM_WIN64 0 +%endif + +; Declare groups of platforms +%ifidn __OUTPUT_FORMAT__,elf32 + %define LIBAOM_ELF 1 +%elifidn __OUTPUT_FORMAT__,elfx32 + %define LIBAOM_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define LIBAOM_ELF 1 +%else + %define LIBAOM_ELF 0 +%endif + +%ifidn __OUTPUT_FORMAT__,macho32 + %define LIBAOM_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho64 + %define LIBAOM_MACHO 1 +%else + %define LIBAOM_MACHO 0 +%endif + +; sym() +; Return the proper symbol name for the target ABI. +; +; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols +; with C linkage be prefixed with an underscore. +; +%if LIBAOM_ELF || LIBAOM_YASM_WIN64 + %define sym(x) x +%else + ; Mach-O / COFF + %define sym(x) _ %+ x +%endif + +; globalsym() +; Return a global declaration with the proper decoration for the target ABI. +; +; When CHROMIUM is defined, include attributes to hide the symbol from the +; global namespace. +; +; Chromium doesn't like exported global symbols due to symbol clashing with +; plugins among other things. +; +; Requires Chromium's patched copy of yasm: +; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761 +; http://www.tortall.net/projects/yasm/ticket/236 +; or nasm > 2.14. +; +%ifdef CHROMIUM + %ifdef __NASM_VER__ + %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 + ; nasm < 2.14 does not support :private_extern directive + %fatal Must use nasm 2.14 or newer + %endif + %endif + + %if LIBAOM_ELF + %define globalsym(x) global sym(x) %+ :function hidden + %elif LIBAOM_MACHO + %define globalsym(x) global sym(x) %+ :private_extern + %else + ; COFF / PE32+ + %define globalsym(x) global sym(x) + %endif +%else + %define globalsym(x) global sym(x) +%endif + +; arg() +; Return the address specification of the given argument +; +%if ABI_IS_32BIT + %define arg(x) [ebp+8+4*x] +%else + ; 64 bit ABI passes arguments in registers. This is a workaround to get up + ; and running quickly. 
Relies on SHADOW_ARGS_TO_STACK + %if LIBAOM_YASM_WIN64 + %define arg(x) [rbp+16+8*x] + %else + %define arg(x) [rbp-8-8*x] + %endif +%endif + +; REG_SZ_BYTES, REG_SZ_BITS +; Size of a register +%if ABI_IS_32BIT +%define REG_SZ_BYTES 4 +%define REG_SZ_BITS 32 +%else +%define REG_SZ_BYTES 8 +%define REG_SZ_BITS 64 +%endif + + +; ALIGN_STACK +; This macro aligns the stack to the given alignment (in bytes). The stack +; is left such that the previous value of the stack pointer is the first +; argument on the stack (ie, the inverse of this macro is 'pop rsp.') +; This macro uses one temporary register, which is not preserved, and thus +; must be specified as an argument. +%macro ALIGN_STACK 2 + mov %2, rsp + and rsp, -%1 + lea rsp, [rsp - (%1 - REG_SZ_BYTES)] + push %2 +%endmacro + + +; +; The Microsoft assembler tries to impose a certain amount of type safety in +; its register usage. YASM doesn't recognize these directives, so we just +; %define them away to maintain as much compatibility as possible with the +; original inline assembler we're porting from. +; +%idefine PTR +%idefine XMMWORD +%idefine MMWORD + +; PIC macros +; +%if ABI_IS_32BIT + %if CONFIG_PIC=1 + %ifidn __OUTPUT_FORMAT__,elf32 + %define WRT_PLT wrt ..plt + %macro GET_GOT 1 + extern _GLOBAL_OFFSET_TABLE_ + push %1 + call %%get_got + %%sub_offset: + jmp %%exitGG + %%get_got: + mov %1, [esp] + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc + ret + %%exitGG: + %undef GLOBAL + %define GLOBAL(x) x + %1 wrt ..gotoff + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %elifidn __OUTPUT_FORMAT__,macho32 + %macro GET_GOT 1 + push %1 + call %%get_got + %%get_got: + pop %1 + %undef GLOBAL + %define GLOBAL(x) x + %1 - %%get_got + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %endif + %endif + + %ifdef CHROMIUM + %ifidn __OUTPUT_FORMAT__,macho32 + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%else + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) rel x + %ifidn __OUTPUT_FORMAT__,elf64 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,elfx32 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,macho64 + %ifdef CHROMIUM + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%endif +%ifnmacro GET_GOT + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) x +%endif +%ifndef RESTORE_GOT +%define RESTORE_GOT +%endif +%ifndef WRT_PLT +%define WRT_PLT +%endif + +%if ABI_IS_32BIT + %macro SHADOW_ARGS_TO_STACK 1 + %endm + %define UNSHADOW_ARGS +%else +%if LIBAOM_YASM_WIN64 + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + mov arg(0),rcx + %endif + %if %1 > 1 + mov arg(1),rdx + %endif + %if %1 > 2 + mov arg(2),r8 + %endif + %if %1 > 3 + mov arg(3),r9 + %endif + %endm +%else + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + push rdi + %endif + %if %1 > 1 + push rsi + %endif + %if %1 > 2 + push rdx + %endif + %if %1 > 3 + push rcx + %endif + %if %1 > 4 + push r8 + %endif + %if %1 > 5 + push r9 + %endif + %if %1 > 6 + %assign i %1-6 + %assign off 16 + %rep i + mov rax,[rbp+off] + push rax + %assign off off+8 + %endrep + %endif + %endm +%endif + %define UNSHADOW_ARGS mov rsp, rbp +%endif + +; Win64 ABI requires that XMM6:XMM15 are callee saved +; SAVE_XMM n, [u] +; store registers 6-n on the stack +; if u is specified, use unaligned movs. 
+; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return +; value. Typically we follow this up with 'push rbp' - re-aligning the stack - +; but in some cases this is not done and unaligned movs must be used. +%if LIBAOM_YASM_WIN64 +%macro SAVE_XMM 1-2 a + %if %1 < 6 + %error Only xmm registers 6-15 must be preserved + %else + %assign last_xmm %1 + %define movxmm movdq %+ %2 + %assign xmm_stack_space ((last_xmm - 5) * 16) + sub rsp, xmm_stack_space + %assign i 6 + %rep (last_xmm - 5) + movxmm [rsp + ((i - 6) * 16)], xmm %+ i + %assign i i+1 + %endrep + %endif +%endmacro +%macro RESTORE_XMM 0 + %ifndef last_xmm + %error RESTORE_XMM must be paired with SAVE_XMM n + %else + %assign i last_xmm + %rep (last_xmm - 5) + movxmm xmm %+ i, [rsp +((i - 6) * 16)] + %assign i i-1 + %endrep + add rsp, xmm_stack_space + ; there are a couple functions which return from multiple places. + ; otherwise, we could uncomment these: + ; %undef last_xmm + ; %undef xmm_stack_space + ; %undef movxmm + %endif +%endmacro +%else +%macro SAVE_XMM 1-2 +%endmacro +%macro RESTORE_XMM 0 +%endmacro +%endif + +; Name of the rodata section +; +; .rodata seems to be an elf-ism, as it doesn't work on OSX. +; +%ifidn __OUTPUT_FORMAT__,macho64 +%define SECTION_RODATA section .text +%elifidn __OUTPUT_FORMAT__,macho32 +%macro SECTION_RODATA 0 +section .text +%endmacro +%elifidn __OUTPUT_FORMAT__,aout +%define SECTION_RODATA section .data +%else +%define SECTION_RODATA section .rodata +%endif + + +; Tell GNU ld that we don't require an executable stack. +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%elifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%elifidn __OUTPUT_FORMAT__,elfx32 +section .note.GNU-stack noalloc noexec nowrite progbits +section .text +%endif diff --git a/third_party/aom/aom_scale/aom_scale.cmake b/third_party/aom/aom_scale/aom_scale.cmake new file mode 100644 index 0000000000..ea94dbc063 --- /dev/null +++ b/third_party/aom/aom_scale/aom_scale.cmake @@ -0,0 +1,37 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_AOM_SCALE_AOM_SCALE_CMAKE_) + return() +endif() # AOM_AOM_SCALE_AOM_SCALE_CMAKE_ +set(AOM_AOM_SCALE_AOM_SCALE_CMAKE_ 1) + +list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/aom_scale.h" + "${AOM_ROOT}/aom_scale/generic/aom_scale.c" + "${AOM_ROOT}/aom_scale/generic/gen_scalers.c" + "${AOM_ROOT}/aom_scale/generic/yv12config.c" + "${AOM_ROOT}/aom_scale/generic/yv12extend.c" + "${AOM_ROOT}/aom_scale/yv12config.h") + +# Creates the aom_scale build target and makes libaom depend on it. The libaom +# target must exist before this function is called. +function(setup_aom_scale_targets) + add_library(aom_scale OBJECT ${AOM_SCALE_SOURCES}) + target_sources(aom PRIVATE $) + + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + + # Pass the new lib targets up to the parent scope instance of + # $AOM_LIB_TARGETS. 
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE) +endfunction() diff --git a/third_party/aom/aom_scale/aom_scale.h b/third_party/aom/aom_scale/aom_scale.h new file mode 100644 index 0000000000..11812a1453 --- /dev/null +++ b/third_party/aom/aom_scale/aom_scale.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_SCALE_AOM_SCALE_H_ +#define AOM_AOM_SCALE_AOM_SCALE_H_ + +#include "aom_scale/yv12config.h" + +extern void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, unsigned char temp_height, + unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, + unsigned int interlaced, const int num_planes); + +#endif // AOM_AOM_SCALE_AOM_SCALE_H_ diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.c b/third_party/aom/aom_scale/aom_scale_rtcd.c new file mode 100644 index 0000000000..93def357d8 --- /dev/null +++ b/third_party/aom/aom_scale/aom_scale_rtcd.c @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "config/aom_config.h" + +#define RTCD_C +#include "config/aom_scale_rtcd.h" + +#include "aom_ports/aom_once.h" + +void aom_scale_rtcd(void) { aom_once(setup_rtcd_internal); } diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.pl b/third_party/aom/aom_scale/aom_scale_rtcd.pl new file mode 100644 index 0000000000..ae0a85687f --- /dev/null +++ b/third_party/aom/aom_scale/aom_scale_rtcd.pl @@ -0,0 +1,55 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +sub aom_scale_forward_decls() { +print <> 4); + source += source_step; + dest += dest_step; + } +} + +/**************************************************************************** + * + * ROUTINE : scale1d_2t1_ps + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on + * in source. + * unsigned int source_scale : Scale for source (UNUSED). + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. 
+ * int dest_step : Number of pixels to step on + * in destination. + * unsigned int dest_scale : Scale for destination + * (UNUSED). + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-to-1 point subsampled scaling. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static void scale1d_2t1_ps(const unsigned char *source, int source_step, + unsigned int source_scale, + unsigned int source_length, unsigned char *dest, + int dest_step, unsigned int dest_scale, + unsigned int dest_length) { + const unsigned char *const dest_end = dest + dest_length * dest_step; + (void)source_length; + (void)source_scale; + (void)dest_scale; + + source_step *= 2; // Every other row. + + while (dest < dest_end) { + *dest = *source; + source += source_step; + dest += dest_step; + } +} +/**************************************************************************** + * + * ROUTINE : scale1d_c + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on + * in source. + * unsigned int source_scale : Scale for source. + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on + * in destination. + * unsigned int dest_scale : Scale for destination. + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs linear interpolation in one dimension. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static void scale1d_c(const unsigned char *source, int source_step, + unsigned int source_scale, unsigned int source_length, + unsigned char *dest, int dest_step, + unsigned int dest_scale, unsigned int dest_length) { + const unsigned char *const dest_end = dest + dest_length * dest_step; + const unsigned int round_value = dest_scale / 2; + unsigned int left_modifier = dest_scale; + unsigned int right_modifier = 0; + unsigned char left_pixel = source[0]; + unsigned char right_pixel = source[source_step]; + + (void)source_length; + + /* These asserts are needed if there are boundary issues... */ + /* assert ( dest_scale > source_scale );*/ + /* assert ( (source_length - 1) * dest_scale >= (dest_length - 1) * + * source_scale);*/ + + while (dest < dest_end) { + *dest = (unsigned char)((left_modifier * left_pixel + + right_modifier * right_pixel + round_value) / + dest_scale); + + right_modifier += source_scale; + + while (right_modifier > dest_scale) { + right_modifier -= dest_scale; + source += source_step; + left_pixel = source[0]; + right_pixel = source[source_step]; + } + + left_modifier = dest_scale - right_modifier; + } +} + +/**************************************************************************** + * + * ROUTINE : Scale2D + * + * INPUTS : const unsigned char *source : Pointer to data to be + * scaled. + * int source_pitch : Stride of source image. + * unsigned int source_width : Width of input image. + * unsigned int source_height : Height of input image. + * unsigned char *dest : Pointer to output data + * array. + * int dest_pitch : Stride of destination + * image. + * unsigned int dest_width : Width of destination image. + * unsigned int dest_height : Height of destination + * image. + * unsigned char *temp_area : Pointer to temp work area. 
+ * unsigned char temp_area_height : Height of temp work area. + * unsigned int hscale : Horizontal scale factor + * numerator. + * unsigned int hratio : Horizontal scale factor + * denominator. + * unsigned int vscale : Vertical scale factor + * numerator. + * unsigned int vratio : Vertical scale factor + * denominator. + * unsigned int interlaced : Interlace flag. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-tap linear interpolation in two dimensions. + * + * SPECIAL NOTES : Expansion is performed one band at a time to help with + * caching. + * + ****************************************************************************/ +static void Scale2D( + /*const*/ + unsigned char *source, int source_pitch, unsigned int source_width, + unsigned int source_height, unsigned char *dest, int dest_pitch, + unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area, + unsigned char temp_area_height, unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, unsigned int interlaced) { + unsigned int i, j, k; + unsigned int bands; + unsigned int dest_band_height; + unsigned int source_band_height; + + typedef void (*Scale1D)(const unsigned char *source, int source_step, + unsigned int source_scale, unsigned int source_length, + unsigned char *dest, int dest_step, + unsigned int dest_scale, unsigned int dest_length); + + Scale1D Scale1Dv = scale1d_c; + Scale1D Scale1Dh = scale1d_c; + + void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, + unsigned int) = NULL; + void (*vert_band_scale)(unsigned char *, int, unsigned char *, int, + unsigned int) = NULL; + + int ratio_scalable = 1; + int interpolation = 0; + + unsigned char *source_base; + unsigned char *line_src; + + source_base = (unsigned char *)source; + + if (source_pitch < 0) { + int offset; + + offset = (source_height - 1); + offset *= source_pitch; + + source_base += offset; + } + + /* find out the ratio for each direction */ + switch (hratio * 10 / hscale) { + case 8: + /* 4-5 Scale in Width direction */ + horiz_line_scale = aom_horizontal_line_5_4_scale; + break; + case 6: + /* 3-5 Scale in Width direction */ + horiz_line_scale = aom_horizontal_line_5_3_scale; + break; + case 5: + /* 1-2 Scale in Width direction */ + horiz_line_scale = aom_horizontal_line_2_1_scale; + break; + default: + /* The ratio is not acceptable now */ + /* throw("The ratio is not acceptable for now!"); */ + ratio_scalable = 0; + break; + } + + switch (vratio * 10 / vscale) { + case 8: + /* 4-5 Scale in vertical direction */ + vert_band_scale = aom_vertical_band_5_4_scale; + source_band_height = 5; + dest_band_height = 4; + break; + case 6: + /* 3-5 Scale in vertical direction */ + vert_band_scale = aom_vertical_band_5_3_scale; + source_band_height = 5; + dest_band_height = 3; + break; + case 5: + /* 1-2 Scale in vertical direction */ + + if (interlaced) { + /* if the content is interlaced, point sampling is used */ + vert_band_scale = aom_vertical_band_2_1_scale; + } else { + interpolation = 1; + /* if the content is progressive, interplo */ + vert_band_scale = aom_vertical_band_2_1_scale_i; + } + + source_band_height = 2; + dest_band_height = 1; + break; + default: + /* The ratio is not acceptable now */ + /* throw("The ratio is not acceptable for now!"); */ + ratio_scalable = 0; + break; + } + + if (ratio_scalable) { + if (source_height == dest_height) { + /* for each band of the image */ + for (k = 0; k < dest_height; ++k) { + horiz_line_scale(source, source_width, 
dest, dest_width); + source += source_pitch; + dest += dest_pitch; + } + + return; + } + + if (interpolation) { + if (source < source_base) source = source_base; + + horiz_line_scale(source, source_width, temp_area, dest_width); + } + + for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height; + ++k) { + /* scale one band horizontally */ + for (i = 0; i < source_band_height; ++i) { + /* Trap case where we could read off the base of the source buffer */ + + line_src = source + i * source_pitch; + + if (line_src < source_base) line_src = source_base; + + horiz_line_scale(line_src, source_width, + temp_area + (i + 1) * dest_pitch, dest_width); + } + + /* Vertical scaling is in place */ + vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, + dest_width); + + if (interpolation) + memcpy(temp_area, temp_area + source_band_height * dest_pitch, + dest_width); + + /* Next band... */ + source += (unsigned long)source_band_height * source_pitch; + dest += (unsigned long)dest_band_height * dest_pitch; + } + + return; + } + + if (hscale == 2 && hratio == 1) Scale1Dh = scale1d_2t1_ps; + + if (vscale == 2 && vratio == 1) { + if (interlaced) + Scale1Dv = scale1d_2t1_ps; + else + Scale1Dv = scale1d_2t1_i; + } + + if (source_height == dest_height) { + /* for each band of the image */ + for (k = 0; k < dest_height; ++k) { + Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, + dest_width); + source += source_pitch; + dest += dest_pitch; + } + + return; + } + + if (dest_height > source_height) { + dest_band_height = temp_area_height - 1; + source_band_height = dest_band_height * source_height / dest_height; + } else { + source_band_height = temp_area_height - 1; + dest_band_height = source_band_height * vratio / vscale; + } + + /* first row needs to be done so that we can stay one row ahead for vertical + * zoom */ + Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, + dest_width); + + /* for each band of the image */ + bands = (dest_height + dest_band_height - 1) / dest_band_height; + + for (k = 0; k < bands; ++k) { + /* scale one band horizontally */ + for (i = 1; i < source_band_height + 1; ++i) { + if (k * source_band_height + i < source_height) { + Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1, + temp_area + i * dest_pitch, 1, hratio, dest_width); + } else { /* Duplicate the last row */ + /* copy temp_area row 0 over from last row in the past */ + memcpy(temp_area + i * dest_pitch, temp_area + (i - 1) * dest_pitch, + dest_pitch); + } + } + + /* scale one band vertically */ + for (j = 0; j < dest_width; ++j) { + Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1, + &dest[j], dest_pitch, vratio, dest_band_height); + } + + /* copy temp_area row 0 over from last row in the past */ + memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch); + + /* move to the next band */ + source += source_band_height * source_pitch; + dest += dest_band_height * dest_pitch; + } +} + +/**************************************************************************** + * + * ROUTINE : aom_scale_frame + * + * INPUTS : YV12_BUFFER_CONFIG *src : Pointer to frame to be + * scaled. + * YV12_BUFFER_CONFIG *dst : Pointer to buffer to hold + * scaled frame. + * unsigned char *temp_area : Pointer to temp work area. + * unsigned char temp_area_height : Height of temp work area. + * unsigned int hscale : Horizontal scale factor + * numerator. + * unsigned int hratio : Horizontal scale factor + * denominator. 
+ * unsigned int vscale : Vertical scale factor + * numerator. + * unsigned int vratio : Vertical scale factor + * denominator. + * unsigned int interlaced : Interlace flag. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-tap linear interpolation in two dimensions. + * + * SPECIAL NOTES : Expansion is performed one band at a time to help with + * caching. + * + ****************************************************************************/ +void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, unsigned char temp_height, + unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, + unsigned int interlaced, const int num_planes) { + const int dw = (hscale - 1 + src->y_width * hratio) / hscale; + const int dh = (vscale - 1 + src->y_height * vratio) / vscale; + + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int plane_dw = dw >> is_uv; + const int plane_dh = dh >> is_uv; + + Scale2D((unsigned char *)src->buffers[plane], src->strides[is_uv], + src->widths[is_uv], src->heights[is_uv], + (unsigned char *)dst->buffers[plane], dst->strides[is_uv], plane_dw, + plane_dh, temp_area, temp_height, hscale, hratio, vscale, vratio, + interlaced); + + if (plane_dw < dst->widths[is_uv]) + for (int i = 0; i < plane_dh; ++i) + memset(dst->buffers[plane] + i * dst->strides[is_uv] + plane_dw - 1, + dst->buffers[plane][i * dst->strides[is_uv] + plane_dw - 2], + dst->widths[is_uv] - plane_dw + 1); + + if (plane_dh < dst->heights[is_uv]) + for (int i = plane_dh - 1; i < dst->heights[is_uv]; ++i) + memcpy(dst->buffers[plane] + i * dst->strides[is_uv], + dst->buffers[plane] + (plane_dh - 2) * dst->strides[is_uv], + dst->widths[is_uv] + 1); + } +} diff --git a/third_party/aom/aom_scale/generic/gen_scalers.c b/third_party/aom/aom_scale/generic/gen_scalers.c new file mode 100644 index 0000000000..549e2aa690 --- /dev/null +++ b/third_party/aom/aom_scale/generic/gen_scalers.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_scale_rtcd.h" + +#include "aom_scale/aom_scale.h" +#include "aom_mem/aom_mem.h" +/**************************************************************************** + * Imports + ****************************************************************************/ + +/**************************************************************************** + * + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination + * (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 4 to 5. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +void aom_horizontal_line_5_4_scale_c(const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width) { + const unsigned char *const source_end = source + source_width; + (void)dest_width; + + while (source < source_end) { + const unsigned int a = source[0]; + const unsigned int b = source[1]; + const unsigned int c = source[2]; + const unsigned int d = source[3]; + const unsigned int e = source[4]; + + dest[0] = (unsigned char)a; + dest[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8); + dest[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8); + dest[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8); + + source += 5; + dest += 4; + } +} + +void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, + unsigned char *dest, int dest_pitch, + unsigned int dest_width) { + const unsigned char *const dest_end = dest + dest_width; + while (dest < dest_end) { + const unsigned int a = source[0 * src_pitch]; + const unsigned int b = source[1 * src_pitch]; + const unsigned int c = source[2 * src_pitch]; + const unsigned int d = source[3 * src_pitch]; + const unsigned int e = source[4 * src_pitch]; + + dest[0 * dest_pitch] = (unsigned char)a; + dest[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8); + dest[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8); + dest[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8); + + ++source; + ++dest; + } +} + +/*7*************************************************************************** + * + * ROUTINE : aom_horizontal_line_3_5_scale_c + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination + * (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 3 to 5. + * + * SPECIAL NOTES : None. 
+ * + * + ****************************************************************************/ +void aom_horizontal_line_5_3_scale_c(const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width) { + const unsigned char *const source_end = source + source_width; + (void)dest_width; + while (source < source_end) { + const unsigned int a = source[0]; + const unsigned int b = source[1]; + const unsigned int c = source[2]; + const unsigned int d = source[3]; + const unsigned int e = source[4]; + + dest[0] = (unsigned char)a; + dest[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8); + dest[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8); + + source += 5; + dest += 3; + } +} + +void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, + unsigned char *dest, int dest_pitch, + unsigned int dest_width) { + const unsigned char *const dest_end = dest + dest_width; + while (dest < dest_end) { + const unsigned int a = source[0 * src_pitch]; + const unsigned int b = source[1 * src_pitch]; + const unsigned int c = source[2 * src_pitch]; + const unsigned int d = source[3 * src_pitch]; + const unsigned int e = source[4 * src_pitch]; + + dest[0 * dest_pitch] = (unsigned char)a; + dest[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8); + dest[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8); + + ++source; + ++dest; + } +} + +/**************************************************************************** + * + * ROUTINE : aom_horizontal_line_1_2_scale_c + * + * INPUTS : const unsigned char *source : Pointer to source data. + * unsigned int source_width : Stride of source. + * unsigned char *dest : Pointer to destination data. + * unsigned int dest_width : Stride of destination + * (NOT USED). + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Copies horizontal line of pixels from source to + * destination scaling up by 1 to 2. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +void aom_horizontal_line_2_1_scale_c(const unsigned char *source, + unsigned int source_width, + unsigned char *dest, + unsigned int dest_width) { + const unsigned char *const source_end = source + source_width; + (void)dest_width; + while (source < source_end) { + dest[0] = source[0]; + source += 2; + ++dest; + } +} + +void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, + unsigned char *dest, int dest_pitch, + unsigned int dest_width) { + (void)dest_pitch; + (void)src_pitch; + memcpy(dest, source, dest_width); +} + +void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, + unsigned char *dest, int dest_pitch, + unsigned int dest_width) { + const unsigned char *const dest_end = dest + dest_width; + (void)dest_pitch; + while (dest < dest_end) { + const unsigned int a = source[-src_pitch] * 3; + const unsigned int b = source[0] * 10; + const unsigned int c = source[src_pitch] * 3; + dest[0] = (unsigned char)((8 + a + b + c) >> 4); + ++source; + ++dest; + } +} diff --git a/third_party/aom/aom_scale/generic/yv12config.c b/third_party/aom/aom_scale/generic/yv12config.c new file mode 100644 index 0000000000..94b400b9e0 --- /dev/null +++ b/third_party/aom/aom_scale/generic/yv12config.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/internal/aom_image_internal.h" +#include "aom_dsp/pyramid.h" +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" +#include "av1/common/enums.h" + +/**************************************************************************** + * Exports + ****************************************************************************/ + +/**************************************************************************** + * + ****************************************************************************/ + +// TODO(jkoleszar): Maybe replace this with struct aom_image +int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) { + if (ybf) { + if (ybf->buffer_alloc_sz > 0) { + aom_free(ybf->buffer_alloc); + } +#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + if (ybf->y_pyramid) { + aom_free_pyramid(ybf->y_pyramid); + } + if (ybf->corners) { + av1_free_corner_list(ybf->corners); + } +#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + aom_remove_metadata_from_frame_buffer(ybf); + /* buffer_alloc isn't accessed by most functions. Rather y_buffer, + u_buffer and v_buffer point to buffer_alloc and are used. Clear out + all of this so that a freed pointer isn't inadvertently used */ + memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG)); + return 0; + } + + return AOM_CODEC_MEM_ERROR; +} + +static int realloc_frame_buffer_aligned( + YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, + int use_highbitdepth, int border, int byte_alignment, + aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, + void *cb_priv, const int y_stride, const uint64_t yplane_size, + const uint64_t uvplane_size, const int aligned_width, + const int aligned_height, const int uv_width, const int uv_height, + const int uv_stride, const int uv_border_w, const int uv_border_h, + int num_pyramid_levels, int alloc_y_plane_only) { + if (ybf) { + const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; + const uint64_t frame_size = + (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size); + + uint8_t *buf = NULL; + +#if CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER + // We should only need an 8-bit version of the source frame if we are + // encoding in non-realtime mode + (void)num_pyramid_levels; + assert(num_pyramid_levels == 0); +#endif // CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER + +#if defined AOM_MAX_ALLOCABLE_MEMORY + // The size of ybf->buffer_alloc. + uint64_t alloc_size = frame_size; +#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + // The size of ybf->y_pyramid + if (num_pyramid_levels > 0) { + alloc_size += aom_get_pyramid_alloc_size( + width, height, num_pyramid_levels, use_highbitdepth); + alloc_size += av1_get_corner_list_size(); + } +#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + // The decoder may allocate REF_FRAMES frame buffers in the frame buffer + // pool. Bound the total amount of allocated memory as if these REF_FRAMES + // frame buffers were allocated in a single allocation. 
+ if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES) + return AOM_CODEC_MEM_ERROR; +#endif + + if (cb != NULL) { + const int align_addr_extra_size = 31; + const uint64_t external_frame_size = frame_size + align_addr_extra_size; + + assert(fb != NULL); + + if (external_frame_size != (size_t)external_frame_size) + return AOM_CODEC_MEM_ERROR; + + // Allocation to hold larger frame, or first allocation. + if (cb(cb_priv, (size_t)external_frame_size, fb) < 0) + return AOM_CODEC_MEM_ERROR; + + if (fb->data == NULL || fb->size < external_frame_size) + return AOM_CODEC_MEM_ERROR; + + ybf->buffer_alloc = (uint8_t *)aom_align_addr(fb->data, 32); + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) + // This memset is needed for fixing the issue of using uninitialized + // value in msan test. It will cause a perf loss, so only do this for + // msan test. + memset(ybf->buffer_alloc, 0, (size_t)frame_size); +#endif +#endif + } else if (frame_size > ybf->buffer_alloc_sz) { + // Allocation to hold larger frame, or first allocation. + aom_free(ybf->buffer_alloc); + ybf->buffer_alloc = NULL; + ybf->buffer_alloc_sz = 0; + + if (frame_size != (size_t)frame_size) return AOM_CODEC_MEM_ERROR; + + ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size); + if (!ybf->buffer_alloc) return AOM_CODEC_MEM_ERROR; + + ybf->buffer_alloc_sz = (size_t)frame_size; + + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); + } + + ybf->y_crop_width = width; + ybf->y_crop_height = height; + ybf->y_width = aligned_width; + ybf->y_height = aligned_height; + ybf->y_stride = y_stride; + + ybf->uv_crop_width = (width + ss_x) >> ss_x; + ybf->uv_crop_height = (height + ss_y) >> ss_y; + ybf->uv_width = uv_width; + ybf->uv_height = uv_height; + ybf->uv_stride = uv_stride; + + ybf->border = border; + ybf->frame_size = (size_t)frame_size; + ybf->subsampling_x = ss_x; + ybf->subsampling_y = ss_y; + + buf = ybf->buffer_alloc; + if (use_highbitdepth) { + // Store uint16 addresses when using 16bit framebuffers + buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc); + ybf->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + ybf->flags = 0; + } + + ybf->y_buffer = (uint8_t *)aom_align_addr( + buf + (border * y_stride) + border, aom_byte_align); + if (!alloc_y_plane_only) { + ybf->u_buffer = (uint8_t *)aom_align_addr( + buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w, + aom_byte_align); + ybf->v_buffer = + (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size + + (uv_border_h * uv_stride) + uv_border_w, + aom_byte_align); + } else { + ybf->u_buffer = NULL; + ybf->v_buffer = NULL; + } + + ybf->use_external_reference_buffers = 0; + +#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + if (ybf->y_pyramid) { + aom_free_pyramid(ybf->y_pyramid); + ybf->y_pyramid = NULL; + } + if (ybf->corners) { + av1_free_corner_list(ybf->corners); + ybf->corners = NULL; + } + if (num_pyramid_levels > 0) { + ybf->y_pyramid = aom_alloc_pyramid(width, height, num_pyramid_levels, + use_highbitdepth); + if (!ybf->y_pyramid) return AOM_CODEC_MEM_ERROR; + ybf->corners = av1_alloc_corner_list(); + if (!ybf->corners) return AOM_CODEC_MEM_ERROR; + } +#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + + ybf->corrupted = 0; /* assume not corrupted by errors */ + return 0; + } + return AOM_CODEC_MEM_ERROR; +} + +static int calc_stride_and_planesize( + const int 
ss_x, const int ss_y, const int aligned_width, + const int aligned_height, const int border, const int byte_alignment, + int alloc_y_plane_only, int *y_stride, int *uv_stride, + uint64_t *yplane_size, uint64_t *uvplane_size, const int uv_height) { + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * aom_img_set_rect(). */ + if (border & 0x1f) return AOM_CODEC_MEM_ERROR; + *y_stride = aom_calc_y_stride(aligned_width, border); + *yplane_size = + (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment; + + if (!alloc_y_plane_only) { + *uv_stride = *y_stride >> ss_x; + *uvplane_size = + (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) + + byte_alignment; + } else { + *uv_stride = 0; + *uvplane_size = 0; + } + return 0; +} + +int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, + int border, int byte_alignment, + aom_codec_frame_buffer_t *fb, + aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, + int num_pyramid_levels, int alloc_y_plane_only) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + return AOM_CODEC_MEM_ERROR; +#endif + + if (ybf) { + int y_stride = 0; + int uv_stride = 0; + uint64_t yplane_size = 0; + uint64_t uvplane_size = 0; + const int aligned_width = (width + 7) & ~7; + const int aligned_height = (height + 7) & ~7; + const int uv_width = aligned_width >> ss_x; + const int uv_height = aligned_height >> ss_y; + const int uv_border_w = border >> ss_x; + const int uv_border_h = border >> ss_y; + + int error = calc_stride_and_planesize( + ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment, + alloc_y_plane_only, &y_stride, &uv_stride, &yplane_size, &uvplane_size, + uv_height); + if (error) return error; + return realloc_frame_buffer_aligned( + ybf, width, height, ss_x, ss_y, use_highbitdepth, border, + byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size, + aligned_width, aligned_height, uv_width, uv_height, uv_stride, + uv_border_w, uv_border_h, num_pyramid_levels, alloc_y_plane_only); + } + return AOM_CODEC_MEM_ERROR; +} + +int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, int border, + int byte_alignment, int num_pyramid_levels, + int alloc_y_plane_only) { + if (ybf) { + aom_free_frame_buffer(ybf); + return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, + use_highbitdepth, border, byte_alignment, + NULL, NULL, NULL, num_pyramid_levels, + alloc_y_plane_only); + } + return AOM_CODEC_MEM_ERROR; +} + +void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf) { + if (ybf && ybf->metadata) { + aom_img_metadata_array_free(ybf->metadata); + ybf->metadata = NULL; + } +} + +int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf, + const aom_metadata_array_t *arr) { + if (!ybf || !arr || !arr->metadata_array) return -1; + if (ybf->metadata == arr) return 0; + aom_remove_metadata_from_frame_buffer(ybf); + ybf->metadata = aom_img_metadata_array_alloc(arr->sz); + if (!ybf->metadata) return -1; + for (size_t i = 0; i < ybf->metadata->sz; i++) { + ybf->metadata->metadata_array[i] = aom_img_metadata_alloc( + arr->metadata_array[i]->type, arr->metadata_array[i]->payload, + arr->metadata_array[i]->sz, 
arr->metadata_array[i]->insert_flag); + if (ybf->metadata->metadata_array[i] == NULL) { + aom_img_metadata_array_free(ybf->metadata); + ybf->metadata = NULL; + return -1; + } + } + ybf->metadata->sz = arr->sz; + return 0; +} diff --git a/third_party/aom/aom_scale/generic/yv12extend.c b/third_party/aom/aom_scale/generic/yv12extend.c new file mode 100644 index 0000000000..5546112d40 --- /dev/null +++ b/third_party/aom/aom_scale/generic/yv12extend.c @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" + +static void extend_plane(uint8_t *const src, int src_stride, int width, + int height, int extend_top, int extend_left, + int extend_bottom, int extend_right, int v_start, + int v_end) { + assert(src != NULL); + int i; + const int linesize = extend_left + extend_right + width; + assert(linesize <= src_stride); + + /* copy the left and right most columns out */ + uint8_t *src_ptr1 = src + v_start * src_stride; + uint8_t *src_ptr2 = src + v_start * src_stride + width - 1; + uint8_t *dst_ptr1 = src + v_start * src_stride - extend_left; + uint8_t *dst_ptr2 = src_ptr2 + 1; + + for (i = v_start; i < v_end; ++i) { + memset(dst_ptr1, src_ptr1[0], extend_left); + memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_stride; + src_ptr2 += src_stride; + dst_ptr1 += src_stride; + dst_ptr2 += src_stride; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = src - extend_left; + dst_ptr1 = src_ptr1 + src_stride * -extend_top; + + for (i = 0; i < extend_top; ++i) { + memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += src_stride; + } + + src_ptr2 = src_ptr1 + src_stride * (height - 1); + dst_ptr2 = src_ptr2; + + for (i = 0; i < extend_bottom; ++i) { + dst_ptr2 += src_stride; + memcpy(dst_ptr2, src_ptr2, linesize); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void extend_plane_high(uint8_t *const src8, int src_stride, int width, + int height, int extend_top, int extend_left, + int extend_bottom, int extend_right, int v_start, + int v_end) { + int i; + const int linesize = extend_left + extend_right + width; + assert(linesize <= src_stride); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + + /* copy the left and right most columns out */ + uint16_t *src_ptr1 = src + v_start * src_stride; + uint16_t *src_ptr2 = src + v_start * src_stride + width - 1; + uint16_t *dst_ptr1 = src + v_start * src_stride - extend_left; + uint16_t *dst_ptr2 = src_ptr2 + 1; + + for (i = v_start; i < v_end; ++i) { + aom_memset16(dst_ptr1, src_ptr1[0], extend_left); + aom_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_stride; + src_ptr2 += src_stride; + dst_ptr1 += src_stride; + dst_ptr2 += src_stride; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = src - extend_left; + dst_ptr1 = src_ptr1 + src_stride 
* -extend_top; + + for (i = 0; i < extend_top; ++i) { + memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t)); + dst_ptr1 += src_stride; + } + + src_ptr2 = src_ptr1 + src_stride * (height - 1); + dst_ptr2 = src_ptr2; + + for (i = 0; i < extend_bottom; ++i) { + dst_ptr2 += src_stride; + memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t)); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_extend_frame_borders_plane_row_c(const YV12_BUFFER_CONFIG *ybf, + int plane, int v_start, int v_end) { + const int ext_size = ybf->border; + const int ss_x = ybf->subsampling_x; + const int ss_y = ybf->subsampling_y; + + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + + const int is_uv = plane > 0; + const int top = ext_size >> (is_uv ? ss_y : 0); + const int left = ext_size >> (is_uv ? ss_x : 0); + const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; + const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; + const int extend_top_border = (v_start == 0); + const int extend_bottom_border = (v_end == ybf->crop_heights[is_uv]); + +#if CONFIG_AV1_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], + extend_top_border ? top : 0, left, + extend_bottom_border ? bottom : 0, right, v_start, v_end); + return; + } +#endif + + extend_plane(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], + extend_top_border ? top : 0, left, + extend_bottom_border ? bottom : 0, right, v_start, v_end); +} + +void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, + const int num_planes) { + assert(ybf->border % 2 == 0); + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + +#if CONFIG_AV1_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int plane_border = ybf->border >> is_uv; + extend_plane_high( + ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv], + ybf->crop_heights[is_uv], plane_border, plane_border, + plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv], + plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0, + ybf->crop_heights[is_uv]); + } + return; + } +#endif + + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int plane_border = ybf->border >> is_uv; + extend_plane(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], + plane_border, plane_border, + plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv], + plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv], 0, + ybf->crop_heights[is_uv]); + } +} + +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size, + const int num_planes) { + const int ss_x = ybf->subsampling_x; + const int ss_y = ybf->subsampling_y; + + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + +#if CONFIG_AV1_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (int plane = 0; plane < num_planes; 
++plane) { + const int is_uv = plane > 0; + const int top = ext_size >> (is_uv ? ss_y : 0); + const int left = ext_size >> (is_uv ? ss_x : 0); + const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; + const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; + extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, + left, bottom, right, 0, ybf->crop_heights[is_uv]); + } + return; + } +#endif + + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int top = ext_size >> (is_uv ? ss_y : 0); + const int left = ext_size >> (is_uv ? ss_x : 0); + const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; + const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; + extend_plane(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left, + bottom, right, 0, ybf->crop_heights[is_uv]); + } +} + +void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) { + extend_frame(ybf, ybf->border, num_planes); +} + +void aom_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf, + const int num_planes) { + const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS) + ? AOMINNERBORDERINPIXELS + : ybf->border; + extend_frame(ybf, inner_bw, num_planes); +} + +void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) { + int ext_size = ybf->border; + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); +#if CONFIG_AV1_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high( + ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width, 0, ybf->y_crop_height); + return; + } +#endif + extend_plane( + ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width, 0, ybf->y_crop_height); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + memcpy(dst, src, num * sizeof(uint16_t)); +} +#endif + +// Copies the source image into the destination image and updates the +// destination's UMV borders. +// Note: The frames are assumed to be identical in size. 
+void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, const int num_planes) { + assert(src_bc->y_width == dst_bc->y_width); + assert(src_bc->y_height == dst_bc->y_height); + +#if CONFIG_AV1_HIGHBITDEPTH + assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) == + (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH)); + + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + for (int plane = 0; plane < num_planes; ++plane) { + const uint8_t *plane_src = src_bc->buffers[plane]; + uint8_t *plane_dst = dst_bc->buffers[plane]; + const int is_uv = plane > 0; + + for (int row = 0; row < src_bc->heights[is_uv]; ++row) { + memcpy_short_addr(plane_dst, plane_src, src_bc->widths[is_uv]); + plane_src += src_bc->strides[is_uv]; + plane_dst += dst_bc->strides[is_uv]; + } + } + aom_yv12_extend_frame_borders_c(dst_bc, num_planes); + return; + } +#endif + for (int plane = 0; plane < num_planes; ++plane) { + const uint8_t *plane_src = src_bc->buffers[plane]; + uint8_t *plane_dst = dst_bc->buffers[plane]; + const int is_uv = plane > 0; + + for (int row = 0; row < src_bc->heights[is_uv]; ++row) { + memcpy(plane_dst, plane_src, src_bc->widths[is_uv]); + plane_src += src_bc->strides[is_uv]; + plane_dst += dst_bc->strides[is_uv]; + } + } + aom_yv12_extend_frame_borders_c(dst_bc, num_planes); +} + +void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if CONFIG_AV1_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } +} + +void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc) { + int row; + const uint8_t *src = src_bc->u_buffer; + uint8_t *dst = dst_bc->u_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst, src, src_bc->uv_width); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc) { + int row; + const uint8_t *src = src_bc->v_buffer; + uint8_t *dst = dst_bc->v_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst, src, src_bc->uv_width); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_partial_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, int hstart1, + int hend1, int vstart1, int vend1, + 
YV12_BUFFER_CONFIG *dst_ybc, int hstart2, + int vstart2) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src + vstart1 * src_ybc->y_stride + hstart1); + uint16_t *dst16 = + CONVERT_TO_SHORTPTR(dst + vstart2 * dst_ybc->y_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + src = (src + vstart1 * src_ybc->y_stride + hstart1); + dst = (dst + vstart2 * dst_ybc->y_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst, src, (hend1 - hstart1)); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } +} + +void aom_yv12_partial_coloc_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, + int hend, int vstart, int vend) { + aom_yv12_partial_copy_y_c(src_ybc, hstart, hend, vstart, vend, dst_ybc, + hstart, vstart); +} + +void aom_yv12_partial_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst_bc, int hstart2, + int vstart2) { + int row; + const uint8_t *src = src_bc->u_buffer; + uint8_t *dst = dst_bc->u_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1); + uint16_t *dst16 = + CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2); + for (row = vstart1; row < vend1; ++row) { + memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + src = (src + vstart1 * src_bc->uv_stride + hstart1); + dst = (dst + vstart2 * dst_bc->uv_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst, src, (hend1 - hstart1)); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_partial_coloc_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, int hstart, + int hend, int vstart, int vend) { + aom_yv12_partial_copy_u_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart, + vstart); +} + +void aom_yv12_partial_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst_bc, int hstart2, + int vstart2) { + int row; + const uint8_t *src = src_bc->v_buffer; + uint8_t *dst = dst_bc->v_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1); + uint16_t *dst16 = + CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2); + for (row = vstart1; row < vend1; ++row) { + memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + src = (src + vstart1 * src_bc->uv_stride + hstart1); + dst = (dst + vstart2 * dst_bc->uv_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst, src, (hend1 - hstart1)); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_partial_coloc_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, int hstart, + int hend, int vstart, int vend) { + aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart, + vstart); +} + 
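Editorial sketch (not part of the upstream patch): the partial-copy helpers above operate on sub-rectangles of frames that were set up with aom_alloc_frame_buffer(), and the *_coloc_* variants simply forward to the general versions with identical source and destination offsets. A minimal, hypothetical caller that refreshes only the top-left 64x64 luma region, plus the colocated chroma region under assumed 4:2:0 subsampling, might look like the following; the function name and block sizes are assumptions chosen purely for illustration.

static void example_refresh_top_left(const YV12_BUFFER_CONFIG *src,
                                     YV12_BUFFER_CONFIG *dst) {
  /* Colocated 64x64 luma block: columns [0, 64) and rows [0, 64). */
  aom_yv12_partial_coloc_copy_y_c(src, dst, /*hstart=*/0, /*hend=*/64,
                                  /*vstart=*/0, /*vend=*/64);
  /* With 4:2:0 subsampling (assumed here) the colocated chroma block is
   * 32x32 in each of the U and V planes. */
  aom_yv12_partial_coloc_copy_u_c(src, dst, 0, 32, 0, 32);
  aom_yv12_partial_coloc_copy_v_c(src, dst, 0, 32, 0, 32);
}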
+int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border, + int byte_alignment, + int num_pyramid_levels, int num_planes) { + if (ybf) { + if (new_border == ybf->border) return 0; + YV12_BUFFER_CONFIG new_buf; + memset(&new_buf, 0, sizeof(new_buf)); + const int error = aom_alloc_frame_buffer( + &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x, + ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border, + byte_alignment, num_pyramid_levels, 0); + if (error) return error; + // Copy image buffer + aom_yv12_copy_frame(ybf, &new_buf, num_planes); + + // Extend up to new border + aom_extend_frame_borders(&new_buf, num_planes); + + // Now free the old buffer and replace with the new + aom_free_frame_buffer(ybf); + memcpy(ybf, &new_buf, sizeof(new_buf)); + return 0; + } + return -2; +} diff --git a/third_party/aom/aom_scale/yv12config.h b/third_party/aom/aom_scale/yv12config.h new file mode 100644 index 0000000000..f192a3032e --- /dev/null +++ b/third_party/aom/aom_scale/yv12config.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_SCALE_YV12CONFIG_H_ +#define AOM_AOM_SCALE_YV12CONFIG_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom/aom_frame_buffer.h" +#include "aom/aom_integer.h" +#include "aom/internal/aom_image_internal.h" + +/*!\cond */ + +#define AOMINNERBORDERINPIXELS 160 +#define AOM_INTERP_EXTEND 4 +#define AOM_BORDER_IN_PIXELS 288 +#define AOM_ENC_NO_SCALE_BORDER 160 +#define AOM_ENC_ALLINTRA_BORDER 64 +#define AOM_DEC_BORDER_IN_PIXELS 64 + +#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY +struct image_pyramid; +struct corner_list; +#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + +/*!\endcond */ +/*! + * \brief YV12 frame buffer data structure + */ +typedef struct yv12_buffer_config { + /*!\cond */ + union { + struct { + int y_width; + int uv_width; + }; + int widths[2]; + }; + union { + struct { + int y_height; + int uv_height; + }; + int heights[2]; + }; + union { + struct { + int y_crop_width; + int uv_crop_width; + }; + int crop_widths[2]; + }; + union { + struct { + int y_crop_height; + int uv_crop_height; + }; + int crop_heights[2]; + }; + union { + struct { + int y_stride; + int uv_stride; + }; + int strides[2]; + }; + union { + struct { + uint8_t *y_buffer; + uint8_t *u_buffer; + uint8_t *v_buffer; + }; + uint8_t *buffers[3]; + }; + + // Indicate whether y_buffer, u_buffer, and v_buffer points to the internally + // allocated memory or external buffers. + int use_external_reference_buffers; + // This is needed to store y_buffer, u_buffer, and v_buffer when set reference + // uses an external refernece, and restore those buffer pointers after the + // external reference frame is no longer used. 
+ uint8_t *store_buf_adr[3]; + + // Global motion search data +#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + // 8-bit downsampling pyramid for the Y plane + struct image_pyramid *y_pyramid; + struct corner_list *corners; +#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + + uint8_t *buffer_alloc; + size_t buffer_alloc_sz; + int border; + size_t frame_size; + int subsampling_x; + int subsampling_y; + unsigned int bit_depth; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + uint8_t monochrome; + aom_chroma_sample_position_t chroma_sample_position; + aom_color_range_t color_range; + int render_width; + int render_height; + + int corrupted; + int flags; + aom_metadata_array_t *metadata; + /*!\endcond */ +} YV12_BUFFER_CONFIG; + +/*!\cond */ + +#define YV12_FLAG_HIGHBITDEPTH 8 + +// Allocate a frame buffer +// +// If ybf currently contains an image, all associated memory will be freed and +// then reallocated. In contrast, aom_realloc_frame_buffer() will reuse any +// existing allocations where possible. So, if ybf is likely to already be +// set up, please consider aom_realloc_frame_buffer() instead. +// +// See aom_realloc_frame_buffer() for the meanings of the arguments, and +// available return values. +int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, int border, + int byte_alignment, int num_pyramid_levels, + int alloc_y_plane_only); + +// Updates the yv12 buffer config with the frame buffer. |byte_alignment| must +// be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not +// NULL, then libaom is using the frame buffer callbacks to handle memory. +// If cb is not NULL, libaom will call cb with minimum size in bytes needed +// to decode the current frame. If cb is NULL, libaom will allocate memory +// internally to decode the current frame. +// +// If num_pyramid_levels > 0, then an image pyramid will be allocated with +// the specified number of levels. +// +// Any buffer which may become a source or ref frame buffer in the encoder +// must have num_pyramid_levels = cpi->image_pyramid_levels. This will cause +// an image pyramid to be allocated if one is needed. +// +// Any other buffers (in particular, any buffers inside the decoder) +// must have cpi->image_pyramid_levels = 0, as a pyramid is unneeded there. +// +// Returns 0 on success. Returns < 0 on failure. +int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, + int border, int byte_alignment, + aom_codec_frame_buffer_t *fb, + aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, + int num_pyramid_levels, int alloc_y_plane_only); + +int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf); + +/*!\endcond */ +/*!\brief Removes metadata from YUV_BUFFER_CONFIG struct. + * + * Frees metadata in frame buffer. + * Frame buffer metadata pointer will be set to NULL. + * + * \param[in] ybf Frame buffer struct pointer + */ +void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf); + +/*!\brief Copy metadata to YUV_BUFFER_CONFIG struct. + * + * Copies metadata to frame buffer. + * Frame buffer will clear any previous metadata and will reallocate the + * metadata array to the new metadata size. Then, it will copy the new metadata + * array into it. + * If arr metadata pointer points to the same address as current metadata in the + * frame buffer, function will do nothing and return 0. 
+ * Returns 0 on success or -1 on failure. + * + * \param[in] ybf Frame buffer struct pointer + * \param[in] arr Metadata array struct pointer + */ +int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf, + const aom_metadata_array_t *arr); + +/*!\brief Calculate the stride required for the image. + * + * Calculates the stride value for an image from aligned width and border. + * Returns the y stride value. + * + * \param[in] aligned_width Aligned width of the image + * \param[in] border Border in pixels + */ +static AOM_INLINE int aom_calc_y_stride(int aligned_width, int border) { + return ((aligned_width + 2 * border) + 31) & ~31; +} + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AOM_SCALE_YV12CONFIG_H_ diff --git a/third_party/aom/aom_util/aom_thread.c b/third_party/aom/aom_util/aom_thread.c new file mode 100644 index 0000000000..fa3b0a25e4 --- /dev/null +++ b/third_party/aom/aom_util/aom_thread.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +// +// Multi-threaded worker +// +// Original source: +// https://chromium.googlesource.com/webm/libwebp + +// Enable GNU extensions in glibc so that we can call pthread_setname_np(). +// This must be before any #include statements. +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include // for memset() + +#include "aom_mem/aom_mem.h" +#include "aom_ports/sanitizer.h" +#include "aom_util/aom_thread.h" + +#if CONFIG_MULTITHREAD + +struct AVxWorkerImpl { + pthread_mutex_t mutex_; + pthread_cond_t condition_; + pthread_t thread_; +}; + +//------------------------------------------------------------------------------ + +static void execute(AVxWorker *const worker); // Forward declaration. + +static THREADFN thread_loop(void *ptr) { + AVxWorker *const worker = (AVxWorker *)ptr; +#ifdef __APPLE__ + if (worker->thread_name != NULL) { + // Apple's version of pthread_setname_np takes one argument and operates on + // the current thread only. The maximum size of the thread_name buffer was + // noted in the Chromium source code and was confirmed by experiments. If + // thread_name is too long, pthread_setname_np returns -1 with errno + // ENAMETOOLONG (63). + char thread_name[64]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(thread_name); + } +#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__) + if (worker->thread_name != NULL) { + // Linux and Android require names (with nul) fit in 16 chars, otherwise + // pthread_setname_np() returns ERANGE (34). 
+ char thread_name[16]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(pthread_self(), thread_name); + } +#endif + pthread_mutex_lock(&worker->impl_->mutex_); + for (;;) { + while (worker->status_ == OK) { // wait in idling mode + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + if (worker->status_ == WORK) { + // When worker->status_ is WORK, the main thread doesn't change + // worker->status_ and will wait until the worker changes worker->status_ + // to OK. See change_state(). So the worker can safely call execute() + // without holding worker->impl_->mutex_. When the worker reacquires + // worker->impl_->mutex_, worker->status_ must still be WORK. + pthread_mutex_unlock(&worker->impl_->mutex_); + execute(worker); + pthread_mutex_lock(&worker->impl_->mutex_); + assert(worker->status_ == WORK); + worker->status_ = OK; + // signal to the main thread that we're done (for sync()) + pthread_cond_signal(&worker->impl_->condition_); + } else { + assert(worker->status_ == NOT_OK); // finish the worker + break; + } + } + pthread_mutex_unlock(&worker->impl_->mutex_); + return THREAD_RETURN(NULL); // Thread is finished +} + +// main thread state control +static void change_state(AVxWorker *const worker, AVxWorkerStatus new_status) { + // No-op when attempting to change state on a thread that didn't come up. + // Checking status_ without acquiring the lock first would result in a data + // race. + if (worker->impl_ == NULL) return; + + pthread_mutex_lock(&worker->impl_->mutex_); + if (worker->status_ >= OK) { + // wait for the worker to finish + while (worker->status_ != OK) { + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + // assign new status and release the working thread if needed + if (new_status != OK) { + worker->status_ = new_status; + pthread_cond_signal(&worker->impl_->condition_); + } + } + pthread_mutex_unlock(&worker->impl_->mutex_); +} + +#endif // CONFIG_MULTITHREAD + +//------------------------------------------------------------------------------ + +static void init(AVxWorker *const worker) { + memset(worker, 0, sizeof(*worker)); + worker->status_ = NOT_OK; +} + +static int sync(AVxWorker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, OK); +#endif + assert(worker->status_ <= OK); + return !worker->had_error; +} + +static int reset(AVxWorker *const worker) { + int ok = 1; + worker->had_error = 0; + if (worker->status_ < OK) { +#if CONFIG_MULTITHREAD + worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_)); + if (worker->impl_ == NULL) { + return 0; + } + if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) { + goto Error; + } + if (pthread_cond_init(&worker->impl_->condition_, NULL)) { + pthread_mutex_destroy(&worker->impl_->mutex_); + goto Error; + } + pthread_attr_t attr; + if (pthread_attr_init(&attr)) goto Error2; + // Debug ASan builds require at least ~1MiB of stack; prevents + // failures on macOS arm64 where the default is 512KiB. 
+ // See: https://crbug.com/aomedia/3379 +#if defined(AOM_ADDRESS_SANITIZER) && defined(__APPLE__) && AOM_ARCH_ARM && \ + !defined(NDEBUG) + size_t stacksize; + if (!pthread_attr_getstacksize(&attr, &stacksize)) { + const size_t kMinStackSize = 1 << 20; // 1 MiB + if (stacksize < kMinStackSize && + pthread_attr_setstacksize(&attr, kMinStackSize)) { + pthread_attr_destroy(&attr); + goto Error2; + } + } +#endif + pthread_mutex_lock(&worker->impl_->mutex_); + ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker); + if (ok) worker->status_ = OK; + pthread_mutex_unlock(&worker->impl_->mutex_); + pthread_attr_destroy(&attr); + if (!ok) { + Error2: + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + Error: + aom_free(worker->impl_); + worker->impl_ = NULL; + return 0; + } +#else + worker->status_ = OK; +#endif + } else if (worker->status_ > OK) { + ok = sync(worker); + } + assert(!ok || (worker->status_ == OK)); + return ok; +} + +static void execute(AVxWorker *const worker) { + if (worker->hook != NULL) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); + } +} + +static void launch(AVxWorker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, WORK); +#else + execute(worker); +#endif +} + +static void end(AVxWorker *const worker) { +#if CONFIG_MULTITHREAD + if (worker->impl_ != NULL) { + change_state(worker, NOT_OK); + pthread_join(worker->impl_->thread_, NULL); + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + aom_free(worker->impl_); + worker->impl_ = NULL; + } +#else + worker->status_ = NOT_OK; + assert(worker->impl_ == NULL); +#endif + assert(worker->status_ == NOT_OK); +} + +//------------------------------------------------------------------------------ + +static AVxWorkerInterface g_worker_interface = { init, reset, sync, + launch, execute, end }; + +int aom_set_worker_interface(const AVxWorkerInterface *const winterface) { + if (winterface == NULL || winterface->init == NULL || + winterface->reset == NULL || winterface->sync == NULL || + winterface->launch == NULL || winterface->execute == NULL || + winterface->end == NULL) { + return 0; + } + g_worker_interface = *winterface; + return 1; +} + +const AVxWorkerInterface *aom_get_worker_interface(void) { + return &g_worker_interface; +} + +//------------------------------------------------------------------------------ diff --git a/third_party/aom/aom_util/aom_thread.h b/third_party/aom/aom_util/aom_thread.h new file mode 100644 index 0000000000..ec2ea43491 --- /dev/null +++ b/third_party/aom/aom_util/aom_thread.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +// +// Multi-threaded worker +// +// Original source: +// https://chromium.googlesource.com/webm/libwebp + +#ifndef AOM_AOM_UTIL_AOM_THREAD_H_ +#define AOM_AOM_UTIL_AOM_THREAD_H_ + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_NUM_THREADS 64 + +#if CONFIG_MULTITHREAD + +#if defined(_WIN32) && !HAVE_PTHREAD_H +// Prevent leaking max/min macros. +#undef NOMINMAX +#define NOMINMAX +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#include // NOLINT +#include // NOLINT +#include // NOLINT +typedef HANDLE pthread_t; +typedef int pthread_attr_t; +typedef CRITICAL_SECTION pthread_mutex_t; + +#if _WIN32_WINNT < 0x0600 +#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. +#endif +typedef CONDITION_VARIABLE pthread_cond_t; + +#ifndef WINAPI_FAMILY_PARTITION +#define WINAPI_PARTITION_DESKTOP 1 +#define WINAPI_FAMILY_PARTITION(x) x +#endif + +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define USE_CREATE_THREAD +#endif + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +// _beginthreadex requires __stdcall +#define THREADFN unsigned int __stdcall +#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val) + +static INLINE int pthread_attr_init(pthread_attr_t *attr) { + (void)attr; + return 0; +} + +static INLINE int pthread_attr_destroy(pthread_attr_t *attr) { + (void)attr; + return 0; +} + +static INLINE int pthread_create(pthread_t *const thread, + const pthread_attr_t *attr, + unsigned int(__stdcall *start)(void *), + void *arg) { + (void)attr; +#ifdef USE_CREATE_THREAD + *thread = CreateThread(NULL, /* lpThreadAttributes */ + 0, /* dwStackSize */ + start, arg, 0, /* dwStackSize */ + NULL); /* lpThreadId */ +#else + *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ + 0, /* unsigned stack_size */ + start, arg, 0, /* unsigned initflag */ + NULL); /* unsigned *thrdaddr */ +#endif + if (*thread == NULL) return 1; + SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); + return 0; +} + +static INLINE int pthread_join(pthread_t thread, void **value_ptr) { + (void)value_ptr; + return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) != + WAIT_OBJECT_0 || + CloseHandle(thread) == 0); +} + +// Mutex +static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, + void *mutexattr) { + (void)mutexattr; + InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); + return 0; +} + +static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { + return TryEnterCriticalSection(mutex) ? 
0 : EBUSY; +} + +static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +// Condition +static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { + (void)condition; + return 0; +} + +static INLINE int pthread_cond_init(pthread_cond_t *const condition, + void *cond_attr) { + (void)cond_attr; + InitializeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { + WakeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + WakeAllConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_wait(pthread_cond_t *const condition, + pthread_mutex_t *const mutex) { + int ok; + ok = SleepConditionVariableCS(condition, mutex, INFINITE); + return !ok; +} +#else // _WIN32 +#include // NOLINT +#define THREADFN void * +#define THREAD_RETURN(val) val +#endif + +#endif // CONFIG_MULTITHREAD + +// State of the worker thread object +typedef enum { + NOT_OK = 0, // object is unusable + OK, // ready to work + WORK // busy finishing the current task +} AVxWorkerStatus; + +// Function to be called by the worker thread. Takes two opaque pointers as +// arguments (data1 and data2). Should return true on success and return false +// in case of error. +typedef int (*AVxWorkerHook)(void *, void *); + +// Platform-dependent implementation details for the worker. +typedef struct AVxWorkerImpl AVxWorkerImpl; + +// Synchronization object used to launch job in the worker thread +typedef struct { + AVxWorkerImpl *impl_; + AVxWorkerStatus status_; + // Thread name for the debugger. If not NULL, must point to a string that + // outlives the worker thread. For portability, use a name <= 15 characters + // long (not including the terminating NUL character). + const char *thread_name; + AVxWorkerHook hook; // hook to call + void *data1; // first argument passed to 'hook' + void *data2; // second argument passed to 'hook' + int had_error; // true if a call to 'hook' returned false +} AVxWorker; + +// The interface for all thread-worker related functions. All these functions +// must be implemented. +typedef struct { + // Must be called first, before any other method. + void (*init)(AVxWorker *const worker); + // Must be called to initialize the object and spawn the thread. Re-entrant. + // Will potentially launch the thread. Returns false in case of error. + int (*reset)(AVxWorker *const worker); + // Makes sure the previous work is finished. Returns true if worker->had_error + // was not set and no error condition was triggered by the working thread. + int (*sync)(AVxWorker *const worker); + // Triggers the thread to call hook() with data1 and data2 arguments. These + // hook/data1/data2 values can be changed at any time before calling this + // function, but not be changed afterward until the next call to Sync(). + void (*launch)(AVxWorker *const worker); + // This function is similar to launch() except that it calls the + // hook directly instead of using a thread. Convenient to bypass the thread + // mechanism while still using the AVxWorker structs. sync() must + // still be called afterward (for error reporting). 
+  void (*execute)(AVxWorker *const worker);
+  // Kill the thread and terminate the object. To use the object again, one
+  // must call reset() again.
+  void (*end)(AVxWorker *const worker);
+} AVxWorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, so it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Returns false in case of invalid pointer or methods.
+int aom_set_worker_interface(const AVxWorkerInterface *const winterface);
+
+// Retrieve the currently set thread worker interface.
+const AVxWorkerInterface *aom_get_worker_interface(void);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_UTIL_AOM_THREAD_H_
diff --git a/third_party/aom/aom_util/aom_util.cmake b/third_party/aom/aom_util/aom_util.cmake
new file mode 100644
index 0000000000..6bf4fafc4c
--- /dev/null
+++ b/third_party/aom/aom_util/aom_util.cmake
@@ -0,0 +1,34 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_UTIL_AOM_UTIL_CMAKE_)
+  return()
+endif() # AOM_AOM_UTIL_AOM_UTIL_CMAKE_
+set(AOM_AOM_UTIL_AOM_UTIL_CMAKE_ 1)
+
+list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_thread.c"
+            "${AOM_ROOT}/aom_util/aom_thread.h"
+            "${AOM_ROOT}/aom_util/endian_inl.h")
+
+if(CONFIG_BITSTREAM_DEBUG)
+  list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/debug_util.c"
+              "${AOM_ROOT}/aom_util/debug_util.h")
+endif()
+
+# Creates the aom_util build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_util_targets)
+  add_library(aom_util OBJECT ${AOM_UTIL_SOURCES})
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_util PARENT_SCOPE)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_util>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_util>)
+  endif()
+endfunction()
diff --git a/third_party/aom/aom_util/debug_util.c b/third_party/aom/aom_util/debug_util.c
new file mode 100644
index 0000000000..d0792e34a4
--- /dev/null
+++ b/third_party/aom/aom_util/debug_util.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include "aom_util/debug_util.h"
+
+static int frame_idx_w = 0;
+
+static int frame_idx_r = 0;
+
+void aom_bitstream_queue_set_frame_write(int frame_idx) {
+  frame_idx_w = frame_idx;
+}
+
+int aom_bitstream_queue_get_frame_write(void) { return frame_idx_w; }
+
+void aom_bitstream_queue_set_frame_read(int frame_idx) {
+  frame_idx_r = frame_idx;
+}
+
+int aom_bitstream_queue_get_frame_read(void) { return frame_idx_r; }
+
+#if CONFIG_BITSTREAM_DEBUG
+#define QUEUE_MAX_SIZE 4000000
+static int result_queue[QUEUE_MAX_SIZE];
+static int nsymbs_queue[QUEUE_MAX_SIZE];
+static aom_cdf_prob cdf_queue[QUEUE_MAX_SIZE][16];
+
+static int queue_r = 0;
+static int queue_w = 0;
+static int queue_prev_w = -1;
+static int skip_r = 0;
+static int skip_w = 0;
+
+void bitstream_queue_set_skip_write(int skip) { skip_w = skip; }
+
+void bitstream_queue_set_skip_read(int skip) { skip_r = skip; }
+
+void bitstream_queue_record_write(void) { queue_prev_w = queue_w; }
+
+void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; }
+
+int bitstream_queue_get_write(void) { return queue_w; }
+
+int bitstream_queue_get_read(void) { return queue_r; }
+
+void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs) {
+  if (!skip_r) {
+    if (queue_w == queue_r) {
+      printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r);
+      assert(0);
+    }
+    *result = result_queue[queue_r];
+    *nsymbs = nsymbs_queue[queue_r];
+    memcpy(cdf, cdf_queue[queue_r], *nsymbs * sizeof(*cdf));
+    queue_r = (queue_r + 1) % QUEUE_MAX_SIZE;
+  }
+}
+
+void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs) {
+  // If you observe a CDF error:
+  // - Set 'debug_cdf_mismatch' to true
+  // - Set target_frame_idx_r and target_queue_r to where the CDF error was
+  //   reported
+  // - Set a breakpoint in the debugger at the 'fprintf' below.
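+  // Each queued entry stores the coded symbol, its CDF and the symbol count,
+  // so that the decoder-side bitstream_queue_pop() can verify its own reads
+  // against what the encoder wrote.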
+ const bool debug_cdf_mismatch = false; + if (debug_cdf_mismatch) { + int target_frame_idx_r = 1; + int target_queue_r = 18005; + if (frame_idx_w == target_frame_idx_r && queue_w == target_queue_r) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + frame_idx_w, queue_w); + } + } + if (!skip_w) { + result_queue[queue_w] = result; + nsymbs_queue[queue_w] = nsymbs; + memcpy(cdf_queue[queue_w], cdf, nsymbs * sizeof(*cdf)); + queue_w = (queue_w + 1) % QUEUE_MAX_SIZE; + if (queue_w == queue_r) { + printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r); + assert(0); + } + } +} +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +static int frame_buf_idx_r = 0; +static int frame_buf_idx_w = 0; +static int max_frame_buf_num = 5; +#define MAX_FRAME_STRIDE 1280 +#define MAX_FRAME_HEIGHT 720 +static uint16_t + frame_pre[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only +static uint16_t + frame_tx[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm +static int frame_stride = MAX_FRAME_STRIDE; +static int frame_height = MAX_FRAME_HEIGHT; +static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; +void mismatch_move_frame_idx_w(void) { + frame_buf_idx_w = (frame_buf_idx_w + 1) % max_frame_buf_num; + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf overflow\n"); + assert(0); + } +} + +void mismatch_reset_frame(int num_planes) { + for (int plane = 0; plane < num_planes; ++plane) { + memset(frame_pre[frame_buf_idx_w][plane], 0, + sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size); + memset(frame_tx[frame_buf_idx_w][plane], 0, + sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size); + } +} + +void mismatch_move_frame_idx_r(void) { + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf underflow\n"); + assert(0); + } + frame_buf_idx_r = (frame_buf_idx_r + 1) % max_frame_buf_num; +} + +void mismatch_record_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + frame_pre[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + int ref_frame_idx = 3; + int ref_frame_offset = 4; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + frame_offset == ref_frame_offset && ref_pixel_c >= pixel_c && + ref_pixel_c < pixel_c + blk_w && ref_pixel_r >= pixel_r && + ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_pre frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + } +#endif +} +void mismatch_record_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? 
CONVERT_TO_SHORTPTR(src) : NULL; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + frame_tx[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + int ref_frame_idx = 3; + int ref_frame_offset = 4; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && frame_offset == ref_frame_offset && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_tx frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + } +#endif +} +void mismatch_check_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + if (frame_pre[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + printf( + "\ncheck_block_pre failed frame_idx %d frame_offset %d plane %d " + "pixel_c %d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_pre[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +void mismatch_check_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + if (frame_tx[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + printf( + "\ncheck_block_tx failed frame_idx %d frame_offset %d plane %d pixel_c " + "%d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_tx[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? 
src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+      }
+      printf("\n");
+    }
+    assert(0);
+  }
+}
+#endif  // CONFIG_MISMATCH_DEBUG
diff --git a/third_party/aom/aom_util/debug_util.h b/third_party/aom/aom_util/debug_util.h
new file mode 100644
index 0000000000..23cad2a5b9
--- /dev/null
+++ b/third_party/aom/aom_util/debug_util.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_UTIL_DEBUG_UTIL_H_
+#define AOM_AOM_UTIL_DEBUG_UTIL_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_bitstream_queue_set_frame_write(int frame_idx);
+int aom_bitstream_queue_get_frame_write(void);
+void aom_bitstream_queue_set_frame_read(int frame_idx);
+int aom_bitstream_queue_get_frame_read(void);
+
+#if CONFIG_BITSTREAM_DEBUG
+/* This is a debug tool used to detect bitstream errors. On the encoder side,
+ * it pushes each bit and probability into a queue before the bit is written
+ * into the arithmetic coder. On the decoder side, whenever a bit is read out
+ * from the arithmetic coder, it pops the reference bit and probability from
+ * the queue as well. If the two results do not match, this debug tool reports
+ * an error. This tool can be used to pin down a bitstream error precisely.
+ * Combined with gdb's backtrace, it can identify which module caused the
+ * bitstream error. */
+int bitstream_queue_get_write(void);
+int bitstream_queue_get_read(void);
+void bitstream_queue_record_write(void);
+void bitstream_queue_reset_write(void);
+void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs);
+void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs);
+void bitstream_queue_set_skip_write(int skip);
+void bitstream_queue_set_skip_read(int skip);
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+void mismatch_move_frame_idx_w(void);
+void mismatch_move_frame_idx_r(void);
+void mismatch_reset_frame(int num_planes);
+void mismatch_record_block_pre(const uint8_t *src, int src_stride,
+                               int frame_offset, int plane, int pixel_c,
+                               int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_record_block_tx(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_check_block_pre(const uint8_t *src, int src_stride,
+                              int frame_offset, int plane, int pixel_c,
+                              int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_check_block_tx(const uint8_t *src, int src_stride,
+                             int frame_offset, int plane, int pixel_c,
+                             int pixel_r, int blk_w, int blk_h, int highbd);
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_UTIL_DEBUG_UTIL_H_
diff --git a/third_party/aom/aom_util/endian_inl.h b/third_party/aom/aom_util/endian_inl.h
new file mode 100644
index 0000000000..b69102a7f5
--- /dev/null
+++ b/third_party/aom/aom_util/endian_inl.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +// +// Endian related functions. + +#ifndef AOM_AOM_UTIL_ENDIAN_INL_H_ +#define AOM_AOM_UTIL_ENDIAN_INL_H_ + +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#if defined(__GNUC__) +#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__) +#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min))) +#else +#define LOCAL_GCC_VERSION 0 +#define LOCAL_GCC_PREREQ(maj, min) 0 +#endif + +// handle clang compatibility +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) +#if !defined(WORDS_BIGENDIAN) && \ + (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ + (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) +#define WORDS_BIGENDIAN +#endif + +#if defined(WORDS_BIGENDIAN) +#define HToLE32 BSwap32 +#define HToLE16 BSwap16 +#define HToBE64(x) (x) +#define HToBE32(x) (x) +#else +#define HToLE32(x) (x) +#define HToLE16(x) (x) +#define HToBE64(X) BSwap64(X) +#define HToBE32(X) BSwap32(X) +#endif + +#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) +#define HAVE_BUILTIN_BSWAP16 +#endif + +#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) +#define HAVE_BUILTIN_BSWAP32 +#endif + +#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) +#define HAVE_BUILTIN_BSWAP64 +#endif + +static INLINE uint16_t BSwap16(uint16_t x) { +#if defined(HAVE_BUILTIN_BSWAP16) + return __builtin_bswap16(x); +#elif defined(_MSC_VER) + return _byteswap_ushort(x); +#else + // gcc will recognize a 'rorw $8, ...' 
here: + return (x >> 8) | ((x & 0xff) << 8); +#endif // HAVE_BUILTIN_BSWAP16 +} + +static INLINE uint32_t BSwap32(uint32_t x) { +#if defined(HAVE_BUILTIN_BSWAP32) + return __builtin_bswap32(x); +#elif defined(__i386__) || defined(__x86_64__) + uint32_t swapped_bytes; + __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x)); + return swapped_bytes; +#elif defined(_MSC_VER) + return (uint32_t)_byteswap_ulong(x); +#else + return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24); +#endif // HAVE_BUILTIN_BSWAP32 +} + +static INLINE uint64_t BSwap64(uint64_t x) { +#if defined(HAVE_BUILTIN_BSWAP64) + return __builtin_bswap64(x); +#elif defined(__x86_64__) + uint64_t swapped_bytes; + __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x)); + return swapped_bytes; +#elif defined(_MSC_VER) + return (uint64_t)_byteswap_uint64(x); +#else // generic code for swapping 64-bit values (suggested by bdb@) + x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32); + x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16); + x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8); + return x; +#endif // HAVE_BUILTIN_BSWAP64 +} + +#endif // AOM_AOM_UTIL_ENDIAN_INL_H_ diff --git a/third_party/aom/aomedia_logo_200.png b/third_party/aom/aomedia_logo_200.png new file mode 100644 index 0000000000..4a3b9fcc0c Binary files /dev/null and b/third_party/aom/aomedia_logo_200.png differ diff --git a/third_party/aom/apps/aomdec.c b/third_party/aom/apps/aomdec.c new file mode 100644 index 0000000000..15734cb6a9 --- /dev/null +++ b/third_party/aom/apps/aomdec.c @@ -0,0 +1,1088 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include +#include + +#include "config/aom_config.h" + +#if CONFIG_OS_SUPPORT +#if HAVE_UNISTD_H +#include // NOLINT +#elif !defined(STDOUT_FILENO) +#define STDOUT_FILENO 1 +#endif +#endif + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem_ops.h" +#include "common/args.h" +#include "common/ivfdec.h" +#include "common/md5_utils.h" +#include "common/obudec.h" +#include "common/tools_common.h" + +#if CONFIG_WEBM_IO +#include "common/webmdec.h" +#endif + +#include "common/rawenc.h" +#include "common/y4menc.h" + +#if CONFIG_LIBYUV +#include "third_party/libyuv/include/libyuv/scale.h" +#endif + +static const char *exec_name; + +struct AvxDecInputContext { + struct AvxInputContext *aom_input_ctx; + struct ObuDecInputContext *obu_ctx; + struct WebmInputContext *webm_ctx; +}; + +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); +static const arg_def_t looparg = + ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); +static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); +static const arg_def_t use_yv12 = + ARG_DEF(NULL, "yv12", 0, "Output raw YV12 frames"); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Output raw I420 frames"); +static const arg_def_t flipuvarg = + ARG_DEF(NULL, "flipuv", 0, "Flip the chroma planes in the output"); +static const arg_def_t rawvideo = + ARG_DEF(NULL, "rawvideo", 0, "Output raw YUV frames"); +static const arg_def_t noblitarg = + ARG_DEF(NULL, "noblit", 0, "Don't process the decoded frames"); +static const arg_def_t progressarg = + ARG_DEF(NULL, "progress", 0, "Show progress after each frame decodes"); +static const arg_def_t limitarg = + ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames"); +static const arg_def_t skiparg = + ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); +static const arg_def_t summaryarg = + ARG_DEF(NULL, "summary", 0, "Show timing summary"); +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output file name pattern (see below)"); +static const arg_def_t threadsarg = + ARG_DEF("t", "threads", 1, "Max threads to use"); +static const arg_def_t rowmtarg = + ARG_DEF(NULL, "row-mt", 1, "Enable row based multi-threading, default: 0"); +static const arg_def_t verbosearg = + ARG_DEF("v", "verbose", 0, "Show version string"); +static const arg_def_t scalearg = + ARG_DEF("S", "scale", 0, "Scale output frames uniformly"); +static const arg_def_t continuearg = + ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error"); +static const arg_def_t fb_arg = + ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use"); +static const arg_def_t md5arg = + ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame"); +static const arg_def_t framestatsarg = + ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); +static const arg_def_t outbitdeptharg = + ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); +static const arg_def_t isannexb = + ARG_DEF(NULL, "annexb", 0, "Bitstream is in Annex-B format"); +static const arg_def_t oppointarg = ARG_DEF( + NULL, "oppoint", 1, "Select an operating point of a scalable bitstream"); +static const arg_def_t outallarg = ARG_DEF( + NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream"); +static const arg_def_t skipfilmgrain = + ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application"); + +static const arg_def_t 
*all_args[] = {
+  &help,           &codecarg,      &use_yv12,      &use_i420,
+  &flipuvarg,      &rawvideo,      &noblitarg,     &progressarg,
+  &limitarg,       &skiparg,       &summaryarg,    &outputfile,
+  &threadsarg,     &rowmtarg,      &verbosearg,    &scalearg,
+  &fb_arg,         &md5arg,        &framestatsarg, &continuearg,
+  &outbitdeptharg, &isannexb,      &oppointarg,    &outallarg,
+  &skipfilmgrain,  NULL
+};
+
+#if CONFIG_LIBYUV
+// Returns 0 on success and returns -1 on failure.
+static INLINE int libyuv_scale(const aom_image_t *src, aom_image_t *dst,
+                               FilterModeEnum mode) {
+  if (src->fmt != dst->fmt) {
+    fprintf(stderr,
+            "%s failed to scale output frame because format changed from %s to "
+            "%s\n",
+            exec_name, image_format_to_string(dst->fmt),
+            image_format_to_string(src->fmt));
+    return -1;
+  }
+  if (src->fmt == AOM_IMG_FMT_I42016) {
+    return I420Scale_16(
+        (uint16_t *)src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y] / 2,
+        (uint16_t *)src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U] / 2,
+        (uint16_t *)src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V] / 2,
+        src->d_w, src->d_h, (uint16_t *)dst->planes[AOM_PLANE_Y],
+        dst->stride[AOM_PLANE_Y] / 2, (uint16_t *)dst->planes[AOM_PLANE_U],
+        dst->stride[AOM_PLANE_U] / 2, (uint16_t *)dst->planes[AOM_PLANE_V],
+        dst->stride[AOM_PLANE_V] / 2, dst->d_w, dst->d_h, mode);
+  }
+  if (src->fmt == AOM_IMG_FMT_I420) {
+    return I420Scale(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
+                     src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
+                     src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V],
+                     src->d_w, src->d_h, dst->planes[AOM_PLANE_Y],
+                     dst->stride[AOM_PLANE_Y], dst->planes[AOM_PLANE_U],
+                     dst->stride[AOM_PLANE_U], dst->planes[AOM_PLANE_V],
+                     dst->stride[AOM_PLANE_V], dst->d_w, dst->d_h, mode);
+  }
+  fprintf(stderr, "%s cannot scale output frame of format %s\n", exec_name,
+          image_format_to_string(src->fmt));
+  return -1;
+}
+#endif
+
+static void show_help(FILE *fout, int shorthelp) {
+  fprintf(fout, "Usage: %s <options> filename\n\n", exec_name);
+
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  fprintf(fout, "Options:\n");
+  arg_show_usage(fout, all_args);
+  fprintf(fout,
+          "\nOutput File Patterns:\n\n"
+          "  The -o argument specifies the name of the file(s) to "
+          "write to. If the\n  argument does not include any escape "
+          "characters, the output will be\n  written to a single file. "
+          "Otherwise, the filename will be calculated by\n  expanding "
+          "the following escape characters:\n");
+  fprintf(fout,
+          "\n\t%%w   - Frame width"
+          "\n\t%%h   - Frame height"
+          "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
+          "\n\n  Pattern arguments are only supported in conjunction "
+          "with the --yv12 and\n  --i420 options. 
If the -o option is " + "not specified, the output will be\n directed to stdout.\n"); + fprintf(fout, "\nIncluded decoders:\n\n"); + + for (int i = 0; i < get_aom_decoder_count(); ++i) { + aom_codec_iface_t *decoder = get_aom_decoder_by_index(i); + fprintf(fout, " %-6s - %s\n", get_short_name_by_aom_decoder(decoder), + aom_codec_iface_name(decoder)); + } +} + +void usage_exit(void) { + show_help(stderr, 1); + exit(EXIT_FAILURE); +} + +static int raw_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer, + size_t *bytes_read, size_t *buffer_size) { + unsigned char raw_hdr[RAW_FRAME_HDR_SZ]; + size_t frame_size = 0; + + if (read_from_input(input_ctx, RAW_FRAME_HDR_SZ, raw_hdr) != + RAW_FRAME_HDR_SZ) { + if (!input_eof(input_ctx)) + aom_tools_warn("Failed to read RAW frame size\n"); + } else { + const size_t kCorruptFrameThreshold = 256 * 1024 * 1024; + const size_t kFrameTooSmallThreshold = 256 * 1024; + frame_size = mem_get_le32(raw_hdr); + + if (frame_size > kCorruptFrameThreshold) { + aom_tools_warn("Read invalid frame size (%u)\n", + (unsigned int)frame_size); + frame_size = 0; + } + + if (frame_size < kFrameTooSmallThreshold) { + aom_tools_warn( + "Warning: Read invalid frame size (%u) - not a raw file?\n", + (unsigned int)frame_size); + } + + if (frame_size > *buffer_size) { + uint8_t *new_buf = realloc(*buffer, 2 * frame_size); + if (new_buf) { + *buffer = new_buf; + *buffer_size = 2 * frame_size; + } else { + aom_tools_warn("Failed to allocate compressed data buffer\n"); + frame_size = 0; + } + } + } + + if (!input_eof(input_ctx)) { + if (read_from_input(input_ctx, frame_size, *buffer) != frame_size) { + aom_tools_warn("Failed to read full frame\n"); + return 1; + } + *bytes_read = frame_size; + } + + return 0; +} + +static int read_frame(struct AvxDecInputContext *input, uint8_t **buf, + size_t *bytes_in_buffer, size_t *buffer_size) { + switch (input->aom_input_ctx->file_type) { +#if CONFIG_WEBM_IO + case FILE_TYPE_WEBM: + return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer, + buffer_size); +#endif + case FILE_TYPE_RAW: + return raw_read_frame(input->aom_input_ctx, buf, bytes_in_buffer, + buffer_size); + case FILE_TYPE_IVF: + return ivf_read_frame(input->aom_input_ctx, buf, bytes_in_buffer, + buffer_size, NULL); + case FILE_TYPE_OBU: + return obudec_read_temporal_unit(input->obu_ctx, buf, bytes_in_buffer, + buffer_size); + default: return 1; + } +} + +static int file_is_raw(struct AvxInputContext *input) { + uint8_t buf[32]; + int is_raw = 0; + aom_codec_stream_info_t si; + memset(&si, 0, sizeof(si)); + + if (buffer_input(input, 32, buf, /*buffered=*/true) == 32) { + int i; + + if (mem_get_le32(buf) < 256 * 1024 * 1024) { + for (i = 0; i < get_aom_decoder_count(); ++i) { + aom_codec_iface_t *decoder = get_aom_decoder_by_index(i); + if (!aom_codec_peek_stream_info(decoder, buf + 4, 32 - 4, &si)) { + is_raw = 1; + input->fourcc = get_fourcc_by_aom_decoder(decoder); + input->width = si.w; + input->height = si.h; + input->framerate.numerator = 30; + input->framerate.denominator = 1; + break; + } + } + } + } + + rewind_detect(input); + return is_raw; +} + +static void show_progress(int frame_in, int frame_out, uint64_t dx_time) { + fprintf(stderr, + "%d decoded frames/%d showed frames in %" PRId64 " us (%.2f fps)\r", + frame_in, frame_out, dx_time, + (double)frame_out * 1000000.0 / (double)dx_time); +} + +struct ExternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +}; + +struct ExternalFrameBufferList { + int num_external_frame_buffers; + struct 
ExternalFrameBuffer *ext_fb; +}; + +// Callback used by libaom to request an external frame buffer. |cb_priv| +// Application private data passed into the set function. |min_size| is the +// minimum size in bytes needed to decode the next frame. |fb| pointer to the +// frame buffer. +static int get_av1_frame_buffer(void *cb_priv, size_t min_size, + aom_codec_frame_buffer_t *fb) { + int i; + struct ExternalFrameBufferList *const ext_fb_list = + (struct ExternalFrameBufferList *)cb_priv; + if (ext_fb_list == NULL) return -1; + + // Find a free frame buffer. + for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) { + if (!ext_fb_list->ext_fb[i].in_use) break; + } + + if (i == ext_fb_list->num_external_frame_buffers) return -1; + + if (ext_fb_list->ext_fb[i].size < min_size) { + free(ext_fb_list->ext_fb[i].data); + ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t)); + if (!ext_fb_list->ext_fb[i].data) return -1; + + ext_fb_list->ext_fb[i].size = min_size; + } + + fb->data = ext_fb_list->ext_fb[i].data; + fb->size = ext_fb_list->ext_fb[i].size; + ext_fb_list->ext_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the external frame buffer. + fb->priv = &ext_fb_list->ext_fb[i]; + return 0; +} + +// Callback used by libaom when there are no references to the frame buffer. +// |cb_priv| user private data passed into the set function. |fb| pointer +// to the frame buffer. +static int release_av1_frame_buffer(void *cb_priv, + aom_codec_frame_buffer_t *fb) { + struct ExternalFrameBuffer *const ext_fb = + (struct ExternalFrameBuffer *)fb->priv; + (void)cb_priv; + ext_fb->in_use = 0; + return 0; +} + +static void generate_filename(const char *pattern, char *out, size_t q_len, + unsigned int d_w, unsigned int d_h, + unsigned int frame_in) { + const char *p = pattern; + char *q = out; + + do { + char *next_pat = strchr(p, '%'); + + if (p == next_pat) { + size_t pat_len; + + /* parse the pattern */ + q[q_len - 1] = '\0'; + switch (p[1]) { + case 'w': snprintf(q, q_len - 1, "%d", d_w); break; + case 'h': snprintf(q, q_len - 1, "%d", d_h); break; + case '1': snprintf(q, q_len - 1, "%d", frame_in); break; + case '2': snprintf(q, q_len - 1, "%02d", frame_in); break; + case '3': snprintf(q, q_len - 1, "%03d", frame_in); break; + case '4': snprintf(q, q_len - 1, "%04d", frame_in); break; + case '5': snprintf(q, q_len - 1, "%05d", frame_in); break; + case '6': snprintf(q, q_len - 1, "%06d", frame_in); break; + case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; + case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; + case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; + default: die("Unrecognized pattern %%%c\n", p[1]); + } + + pat_len = strlen(q); + if (pat_len >= q_len - 1) die("Output filename too long.\n"); + q += pat_len; + p += 2; + q_len -= pat_len; + } else { + size_t copy_len; + + /* copy the next segment */ + if (!next_pat) + copy_len = strlen(p); + else + copy_len = next_pat - p; + + if (copy_len >= q_len - 1) die("Output filename too long.\n"); + + memcpy(q, p, copy_len); + q[copy_len] = '\0'; + q += copy_len; + p += copy_len; + q_len -= copy_len; + } + } while (*p); +} + +static int is_single_file(const char *outfile_pattern) { + const char *p = outfile_pattern; + + do { + p = strchr(p, '%'); + if (p && p[1] >= '1' && p[1] <= '9') + return 0; // pattern contains sequence number, so it's not unique + if (p) p++; + } while (p); + + return 1; +} + +static void print_md5(unsigned char digest[16], const char *filename) { + int i; + 
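+  // Emit the 16-byte digest as 32 lowercase hex characters followed by the
+  // name of the file it was computed for.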
+ for (i = 0; i < 16; ++i) printf("%02x", digest[i]); + printf(" %s\n", filename); +} + +static FILE *open_outfile(const char *name) { + if (strcmp("-", name) == 0) { + set_binary_mode(stdout); + return stdout; + } else { + FILE *file = fopen(name, "wb"); + if (!file) fatal("Failed to open output file '%s'", name); + return file; + } +} + +static int main_loop(int argc, const char **argv_) { + aom_codec_ctx_t decoder; + char *fn = NULL; + int i; + int ret = EXIT_FAILURE; + uint8_t *buf = NULL; + size_t bytes_in_buffer = 0, buffer_size = 0; + FILE *infile; + int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; + int do_md5 = 0, progress = 0; + int stop_after = 0, summary = 0, quiet = 1; + int arg_skip = 0; + int keep_going = 0; + uint64_t dx_time = 0; + struct arg arg; + char **argv, **argi, **argj; + + int single_file; + int use_y4m = 1; + int opt_yv12 = 0; + int opt_i420 = 0; + int opt_raw = 0; + aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; + unsigned int fixed_output_bit_depth = 0; + unsigned int is_annexb = 0; + int frames_corrupted = 0; + int dec_flags = 0; + int do_scale = 0; + int operating_point = 0; + int output_all_layers = 0; + int skip_film_grain = 0; + int enable_row_mt = 0; + aom_image_t *scaled_img = NULL; + aom_image_t *img_shifted = NULL; + int frame_avail, got_data, flush_decoder = 0; + int num_external_frame_buffers = 0; + struct ExternalFrameBufferList ext_fb_list = { 0, NULL }; + + const char *outfile_pattern = NULL; + char outfile_name[PATH_MAX] = { 0 }; + FILE *outfile = NULL; + + FILE *framestats_file = NULL; + + MD5Context md5_ctx; + unsigned char md5_digest[16]; + + struct AvxDecInputContext input = { NULL, NULL, NULL }; + struct AvxInputContext aom_input_ctx; + memset(&aom_input_ctx, 0, sizeof(aom_input_ctx)); +#if CONFIG_WEBM_IO + struct WebmInputContext webm_ctx; + memset(&webm_ctx, 0, sizeof(webm_ctx)); + input.webm_ctx = &webm_ctx; +#endif + struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 }; + int is_ivf = 0; + + obu_ctx.avx_ctx = &aom_input_ctx; + input.obu_ctx = &obu_ctx; + input.aom_input_ctx = &aom_input_ctx; + + /* Parse command line */ + exec_name = argv_[0]; + argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } + + aom_codec_iface_t *interface = NULL; + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + memset(&arg, 0, sizeof(arg)); + arg.argv_step = 1; + + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { + interface = get_aom_decoder_by_short_name(arg.val); + if (!interface) + die("Error: Unrecognized argument (%s) to --codec\n", arg.val); + } else if (arg_match(&arg, &looparg, argi)) { + // no-op + } else if (arg_match(&arg, &outputfile, argi)) { + outfile_pattern = arg.val; + } else if (arg_match(&arg, &use_yv12, argi)) { + use_y4m = 0; + flipuv = 1; + opt_yv12 = 1; + opt_i420 = 0; + opt_raw = 0; + } else if (arg_match(&arg, &use_i420, argi)) { + use_y4m = 0; + flipuv = 0; + opt_yv12 = 0; + opt_i420 = 1; + opt_raw = 0; + } else if (arg_match(&arg, &rawvideo, argi)) { + use_y4m = 0; + opt_yv12 = 0; + opt_i420 = 0; + opt_raw = 1; + } else if (arg_match(&arg, &flipuvarg, argi)) { + flipuv = 1; + } else if (arg_match(&arg, &noblitarg, argi)) { + noblit = 1; + } else if (arg_match(&arg, &progressarg, argi)) { + progress = 1; + } else if (arg_match(&arg, &limitarg, argi)) { + stop_after = arg_parse_uint(&arg); + } else if (arg_match(&arg, 
&skiparg, argi)) { + arg_skip = arg_parse_uint(&arg); + } else if (arg_match(&arg, &md5arg, argi)) { + do_md5 = 1; + } else if (arg_match(&arg, &framestatsarg, argi)) { + framestats_file = fopen(arg.val, "w"); + if (!framestats_file) { + die("Error: Could not open --framestats file (%s) for writing.\n", + arg.val); + } + } else if (arg_match(&arg, &summaryarg, argi)) { + summary = 1; + } else if (arg_match(&arg, &threadsarg, argi)) { + cfg.threads = arg_parse_uint(&arg); +#if !CONFIG_MULTITHREAD + if (cfg.threads > 1) { + die("Error: --threads=%d is not supported when CONFIG_MULTITHREAD = " + "0.\n", + cfg.threads); + } +#endif + } else if (arg_match(&arg, &rowmtarg, argi)) { + enable_row_mt = arg_parse_uint(&arg); + } else if (arg_match(&arg, &verbosearg, argi)) { + quiet = 0; + } else if (arg_match(&arg, &scalearg, argi)) { + do_scale = 1; + } else if (arg_match(&arg, &fb_arg, argi)) { + num_external_frame_buffers = arg_parse_uint(&arg); + } else if (arg_match(&arg, &continuearg, argi)) { + keep_going = 1; + } else if (arg_match(&arg, &outbitdeptharg, argi)) { + fixed_output_bit_depth = arg_parse_uint(&arg); + } else if (arg_match(&arg, &isannexb, argi)) { + is_annexb = 1; + input.obu_ctx->is_annexb = 1; + } else if (arg_match(&arg, &oppointarg, argi)) { + operating_point = arg_parse_int(&arg); + } else if (arg_match(&arg, &outallarg, argi)) { + output_all_layers = 1; + } else if (arg_match(&arg, &skipfilmgrain, argi)) { + skip_film_grain = 1; + } else { + argj++; + } + } + + /* Check for unrecognized options */ + for (argi = argv; *argi; argi++) + if (argi[0][0] == '-' && strlen(argi[0]) > 1) + die("Error: Unrecognized option %s\n", *argi); + + /* Handle non-option arguments */ + fn = argv[0]; + + if (!fn) { + free(argv); + fprintf(stderr, "No input file specified!\n"); + usage_exit(); + } + + const bool using_file = strcmp(fn, "-") != 0; + /* Open file */ + infile = using_file ? fopen(fn, "rb") : set_binary_mode(stdin); + + if (!infile) { + fatal("Failed to open input file '%s'", using_file ? fn : "stdin"); + } +#if CONFIG_OS_SUPPORT + /* Make sure we don't dump to the terminal, unless forced to with -o - */ + if (!outfile_pattern && isatty(STDOUT_FILENO) && !do_md5 && !noblit) { + fprintf(stderr, + "Not dumping raw video to your terminal. Use '-o -' to " + "override.\n"); + free(argv); + return EXIT_FAILURE; + } +#endif + input.aom_input_ctx->filename = fn; + input.aom_input_ctx->file = infile; + + // TODO(https://crbug.com/aomedia/1706): webm type does not support reading + // from stdin yet, and file_is_webm is not using the detect buffer when + // determining the type. Therefore it should only be checked when using a file + // and needs to be checked prior to other types. + if (false) { +#if CONFIG_WEBM_IO + } else if (using_file && file_is_webm(input.webm_ctx, input.aom_input_ctx)) { + input.aom_input_ctx->file_type = FILE_TYPE_WEBM; +#endif + } else if (file_is_ivf(input.aom_input_ctx)) { + input.aom_input_ctx->file_type = FILE_TYPE_IVF; + is_ivf = 1; + } else if (file_is_obu(&obu_ctx)) { + input.aom_input_ctx->file_type = FILE_TYPE_OBU; + } else if (file_is_raw(input.aom_input_ctx)) { + input.aom_input_ctx->file_type = FILE_TYPE_RAW; + } else { + fprintf(stderr, "Unrecognized input file type.\n"); +#if CONFIG_WEBM_IO + if (!using_file) { + fprintf(stderr, "aomdec does not support piped WebM input.\n"); + } +#else + fprintf(stderr, "aomdec was built without WebM container support.\n"); +#endif + free(argv); + return EXIT_FAILURE; + } + + outfile_pattern = outfile_pattern ? 
outfile_pattern : "-"; + single_file = is_single_file(outfile_pattern); + + if (!noblit && single_file) { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, + aom_input_ctx.width, aom_input_ctx.height, 0); + if (do_md5) + MD5Init(&md5_ctx); + else + outfile = open_outfile(outfile_name); + } + + if (use_y4m && !noblit) { + if (!single_file) { + fprintf(stderr, + "YUV4MPEG2 not supported with output patterns," + " try --i420 or --yv12 or --rawvideo.\n"); + return EXIT_FAILURE; + } + +#if CONFIG_WEBM_IO + if (aom_input_ctx.file_type == FILE_TYPE_WEBM) { + if (webm_guess_framerate(input.webm_ctx, input.aom_input_ctx)) { + fprintf(stderr, + "Failed to guess framerate -- error parsing " + "webm file?\n"); + return EXIT_FAILURE; + } + } +#endif + } + + aom_codec_iface_t *fourcc_interface = + get_aom_decoder_by_fourcc(aom_input_ctx.fourcc); + + if (is_ivf && !fourcc_interface) + fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc); + + if (interface && fourcc_interface && interface != fourcc_interface) + aom_tools_warn("Header indicates codec: %s\n", + aom_codec_iface_name(fourcc_interface)); + else + interface = fourcc_interface; + + if (!interface) interface = get_aom_decoder_by_index(0); + + dec_flags = 0; + if (aom_codec_dec_init(&decoder, interface, &cfg, dec_flags)) { + fprintf(stderr, "Failed to initialize decoder: %s\n", + aom_codec_error(&decoder)); + goto fail2; + } + + if (!quiet) fprintf(stderr, "%s\n", decoder.name); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) { + fprintf(stderr, "Failed to set is_annexb: %s\n", aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OPERATING_POINT, + operating_point)) { + fprintf(stderr, "Failed to set operating_point: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OUTPUT_ALL_LAYERS, + output_all_layers)) { + fprintf(stderr, "Failed to set output_all_layers: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_SKIP_FILM_GRAIN, + skip_film_grain)) { + fprintf(stderr, "Failed to set skip_film_grain: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_ROW_MT, enable_row_mt)) { + fprintf(stderr, "Failed to set row multithreading mode: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); + while (arg_skip) { + if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; + arg_skip--; + } + + if (num_external_frame_buffers > 0) { + ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; + ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( + num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); + if (!ext_fb_list.ext_fb) { + fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n"); + goto fail; + } + if (aom_codec_set_frame_buffer_functions(&decoder, get_av1_frame_buffer, + release_av1_frame_buffer, + &ext_fb_list)) { + fprintf(stderr, "Failed to configure external frame buffers: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + } + + frame_avail = 1; + got_data = 0; + + if (framestats_file) fprintf(framestats_file, "bytes,qp\r\n"); + + /* Decode file */ + while (frame_avail || got_data) { + aom_codec_iter_t iter = NULL; + aom_image_t *img; + struct aom_usec_timer timer; + int corrupted = 0; + + frame_avail = 0; + if (!stop_after || frame_in < stop_after) { + if 
(!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { + frame_avail = 1; + frame_in++; + + aom_usec_timer_start(&timer); + + if (aom_codec_decode(&decoder, buf, bytes_in_buffer, NULL)) { + const char *detail = aom_codec_error_detail(&decoder); + aom_tools_warn("Failed to decode frame %d: %s", frame_in, + aom_codec_error(&decoder)); + + if (detail) aom_tools_warn("Additional information: %s", detail); + if (!keep_going) goto fail; + } + + if (framestats_file) { + int qp; + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_LAST_QUANTIZER, + &qp)) { + aom_tools_warn("Failed AOMD_GET_LAST_QUANTIZER: %s", + aom_codec_error(&decoder)); + if (!keep_going) goto fail; + } + fprintf(framestats_file, "%d,%d\r\n", (int)bytes_in_buffer, qp); + } + + aom_usec_timer_mark(&timer); + dx_time += aom_usec_timer_elapsed(&timer); + } else { + flush_decoder = 1; + } + } else { + flush_decoder = 1; + } + + aom_usec_timer_start(&timer); + + if (flush_decoder) { + // Flush the decoder. + if (aom_codec_decode(&decoder, NULL, 0, NULL)) { + aom_tools_warn("Failed to flush decoder: %s", + aom_codec_error(&decoder)); + } + } + + aom_usec_timer_mark(&timer); + dx_time += aom_usec_timer_elapsed(&timer); + + got_data = 0; + // TODO(aomedia:3519): Change the prototype of aom_codec_get_frame_fn_t to + // facilitate error handling. + while ((img = aom_codec_get_frame(&decoder, &iter))) { + ++frame_out; + got_data = 1; + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_FRAME_CORRUPTED, + &corrupted)) { + aom_tools_warn("Failed AOM_GET_FRAME_CORRUPTED: %s", + aom_codec_error(&decoder)); + if (!keep_going) goto fail; + } + frames_corrupted += corrupted; + + if (progress) show_progress(frame_in, frame_out, dx_time); + + if (!noblit) { + const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V }; + const int PLANES_YVU[] = { AOM_PLANE_Y, AOM_PLANE_V, AOM_PLANE_U }; + const int *planes = flipuv ? PLANES_YVU : PLANES_YUV; + + if (do_scale) { + if (frame_out == 1) { + // If the output frames are to be scaled to a fixed display size + // then use the width and height specified in the container. If + // either of these is set to 0, use the display size set in the + // first frame header. If that is unavailable, use the raw decoded + // size of the first decoded frame. + int render_width = aom_input_ctx.width; + int render_height = aom_input_ctx.height; + if (!render_width || !render_height) { + int render_size[2]; + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_GET_DISPLAY_SIZE, + render_size)) { + // As last resort use size of first frame as display size. 
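+              // The AV1D_GET_DISPLAY_SIZE control failed, so fall back to the
+              // decoded dimensions of the first frame.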
+ render_width = img->d_w; + render_height = img->d_h; + } else { + render_width = render_size[0]; + render_height = render_size[1]; + } + } + scaled_img = + aom_img_alloc(NULL, img->fmt, render_width, render_height, 16); + if (!scaled_img) { + fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n", + render_width, render_height); + goto fail; + } + scaled_img->bit_depth = img->bit_depth; + scaled_img->monochrome = img->monochrome; + scaled_img->csp = img->csp; + } + + if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) { +#if CONFIG_LIBYUV + if (libyuv_scale(img, scaled_img, kFilterBox) != 0) goto fail; + img = scaled_img; +#else + fprintf( + stderr, + "Failed to scale output frame: %s.\n" + "libyuv is required for scaling but is currently disabled.\n" + "Be sure to specify -DCONFIG_LIBYUV=1 when running cmake.\n", + aom_codec_error(&decoder)); + goto fail; +#endif + } + } + // Default to codec bit depth if output bit depth not set + unsigned int output_bit_depth; + if (!fixed_output_bit_depth && single_file) { + output_bit_depth = img->bit_depth; + } else { + output_bit_depth = fixed_output_bit_depth; + } + // Shift up or down if necessary + if (output_bit_depth != 0) { + if (!aom_shift_img(output_bit_depth, &img, &img_shifted)) { + fprintf(stderr, "Error allocating image\n"); + goto fail; + } + } + + aom_input_ctx.width = img->d_w; + aom_input_ctx.height = img->d_h; + + int num_planes = (opt_raw && img->monochrome) ? 1 : 3; + if (single_file) { + if (use_y4m) { + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; + size_t len = 0; + if (frame_out == 1) { + // Y4M file header + len = y4m_write_file_header( + y4m_buf, sizeof(y4m_buf), aom_input_ctx.width, + aom_input_ctx.height, &aom_input_ctx.framerate, + img->monochrome, img->csp, img->fmt, img->bit_depth, + img->range); + if (img->csp == AOM_CSP_COLOCATED) { + fprintf(stderr, + "Warning: Y4M lacks a colorspace for colocated " + "chroma. 
Using a placeholder.\n"); + } + if (do_md5) { + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); + } else { + fputs(y4m_buf, outfile); + } + } + + // Y4M frame header + len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); + if (do_md5) { + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); + y4m_update_image_md5(img, planes, &md5_ctx); + } else { + fputs(y4m_buf, outfile); + y4m_write_image_file(img, planes, outfile); + } + } else { + if (frame_out == 1) { + // Check if --yv12 or --i420 options are consistent with the + // bit-stream decoded + if (opt_i420) { + if (img->fmt != AOM_IMG_FMT_I420 && + img->fmt != AOM_IMG_FMT_I42016) { + fprintf(stderr, + "Cannot produce i420 output for bit-stream.\n"); + goto fail; + } + } + if (opt_yv12) { + if ((img->fmt != AOM_IMG_FMT_I420 && + img->fmt != AOM_IMG_FMT_YV12) || + img->bit_depth != 8) { + fprintf(stderr, + "Cannot produce yv12 output for bit-stream.\n"); + goto fail; + } + } + } + if (do_md5) { + raw_update_image_md5(img, planes, num_planes, &md5_ctx); + } else { + raw_write_image_file(img, planes, num_planes, outfile); + } + } + } else { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, + img->d_h, frame_in); + if (do_md5) { + MD5Init(&md5_ctx); + if (use_y4m) { + y4m_update_image_md5(img, planes, &md5_ctx); + } else { + raw_update_image_md5(img, planes, num_planes, &md5_ctx); + } + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + outfile = open_outfile(outfile_name); + if (use_y4m) { + y4m_write_image_file(img, planes, outfile); + } else { + raw_write_image_file(img, planes, num_planes, outfile); + } + fclose(outfile); + } + } + } + } + } + + if (summary || progress) { + show_progress(frame_in, frame_out, dx_time); + fprintf(stderr, "\n"); + } + + if (frames_corrupted) { + fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted); + } else { + ret = EXIT_SUCCESS; + } + +fail: + + if (aom_codec_destroy(&decoder)) { + fprintf(stderr, "Failed to destroy decoder: %s\n", + aom_codec_error(&decoder)); + } + +fail2: + + if (!noblit && single_file) { + if (do_md5) { + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + fclose(outfile); + } + } + +#if CONFIG_WEBM_IO + if (input.aom_input_ctx->file_type == FILE_TYPE_WEBM) + webm_free(input.webm_ctx); +#endif + if (input.aom_input_ctx->file_type == FILE_TYPE_OBU) + obudec_free(input.obu_ctx); + + if (input.aom_input_ctx->file_type != FILE_TYPE_WEBM) free(buf); + + if (scaled_img) aom_img_free(scaled_img); + if (img_shifted) aom_img_free(img_shifted); + + for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) { + free(ext_fb_list.ext_fb[i].data); + } + free(ext_fb_list.ext_fb); + + fclose(infile); + if (framestats_file) fclose(framestats_file); + + free(argv); + + return ret; +} + +int main(int argc, const char **argv_) { + unsigned int loops = 1, i; + char **argv, **argi, **argj; + struct arg arg; + int error = 0; + + argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + memset(&arg, 0, sizeof(arg)); + arg.argv_step = 1; + + if (arg_match(&arg, &looparg, argi)) { + loops = arg_parse_uint(&arg); + break; + } + } + free(argv); + for (i = 0; !error && i < loops; i++) error = main_loop(argc, argv_); + return error; +} diff --git a/third_party/aom/apps/aomenc.c b/third_party/aom/apps/aomenc.c new file mode 100644 index 
0000000000..3c9c136eed --- /dev/null +++ b/third_party/aom/apps/aomenc.c @@ -0,0 +1,2688 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "apps/aomenc.h" + +#include "config/aom_config.h" + +#include <assert.h> +#include <limits.h> +#include <math.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if CONFIG_AV1_DECODER +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#endif + +#include "aom/aom_encoder.h" +#include "aom/aom_integer.h" +#include "aom/aomcx.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem_ops.h" +#include "common/args.h" +#include "common/ivfenc.h" +#include "common/tools_common.h" +#include "common/warnings.h" + +#if CONFIG_WEBM_IO +#include "common/webmenc.h" +#endif + +#include "common/y4minput.h" +#include "examples/encoder_util.h" +#include "stats/aomstats.h" +#include "stats/rate_hist.h" + +#if CONFIG_LIBYUV +#include "third_party/libyuv/include/libyuv/scale.h" +#endif + +/* Swallow warnings about unused results of fread/fwrite */ +static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { + return fread(ptr, size, nmemb, stream); +} +#define fread wrap_fread + +static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, + FILE *stream) { + return fwrite(ptr, size, nmemb, stream); +} +#define fwrite wrap_fwrite + +static const char *exec_name; + +static AOM_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv( + aom_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) { + if (ctx->err) { + const char *detail = aom_codec_error_detail(ctx); + + vfprintf(stderr, s, ap); + fprintf(stderr, ": %s\n", aom_codec_error(ctx)); + + if (detail) fprintf(stderr, " %s\n", detail); + + if (fatal) { + aom_codec_destroy(ctx); + exit(EXIT_FAILURE); + } + } +} + +static AOM_TOOLS_FORMAT_PRINTF(2, + 3) void ctx_exit_on_error(aom_codec_ctx_t *ctx, + const char *s, ...) { + va_list ap; + + va_start(ap, s); + warn_or_exit_on_errorv(ctx, 1, s, ap); + va_end(ap); +} + +static AOM_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error( + aom_codec_ctx_t *ctx, int fatal, const char *s, ...)
{ + va_list ap; + + va_start(ap, s); + warn_or_exit_on_errorv(ctx, fatal, s, ap); + va_end(ap); +} + +static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) { + FILE *f = input_ctx->file; + y4m_input *y4m = &input_ctx->y4m; + int shortread = 0; + + if (input_ctx->file_type == FILE_TYPE_Y4M) { + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; + } else { + shortread = read_yuv_frame(input_ctx, img); + } + + return !shortread; +} + +static int file_is_y4m(const char detect[4]) { + if (memcmp(detect, "YUV4", 4) == 0) { + return 1; + } + return 0; +} + +static int fourcc_is_ivf(const char detect[4]) { + if (memcmp(detect, "DKIF", 4) == 0) { + return 1; + } + return 0; +} + +static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, + AOME_SET_ENABLEAUTOALTREF, + AOME_SET_SHARPNESS, + AOME_SET_STATIC_THRESHOLD, + AV1E_SET_ROW_MT, + AV1E_SET_FP_MT, + AV1E_SET_TILE_COLUMNS, + AV1E_SET_TILE_ROWS, + AV1E_SET_ENABLE_TPL_MODEL, + AV1E_SET_ENABLE_KEYFRAME_FILTERING, + AOME_SET_ARNR_MAXFRAMES, + AOME_SET_ARNR_STRENGTH, + AOME_SET_TUNING, + AOME_SET_CQ_LEVEL, + AOME_SET_MAX_INTRA_BITRATE_PCT, + AV1E_SET_MAX_INTER_BITRATE_PCT, + AV1E_SET_GF_CBR_BOOST_PCT, + AV1E_SET_LOSSLESS, + AV1E_SET_ENABLE_CDEF, + AV1E_SET_ENABLE_RESTORATION, + AV1E_SET_ENABLE_RECT_PARTITIONS, + AV1E_SET_ENABLE_AB_PARTITIONS, + AV1E_SET_ENABLE_1TO4_PARTITIONS, + AV1E_SET_MIN_PARTITION_SIZE, + AV1E_SET_MAX_PARTITION_SIZE, + AV1E_SET_ENABLE_DUAL_FILTER, + AV1E_SET_ENABLE_CHROMA_DELTAQ, + AV1E_SET_ENABLE_INTRA_EDGE_FILTER, + AV1E_SET_ENABLE_ORDER_HINT, + AV1E_SET_ENABLE_TX64, + AV1E_SET_ENABLE_FLIP_IDTX, + AV1E_SET_ENABLE_RECT_TX, + AV1E_SET_ENABLE_DIST_WTD_COMP, + AV1E_SET_ENABLE_MASKED_COMP, + AV1E_SET_ENABLE_ONESIDED_COMP, + AV1E_SET_ENABLE_INTERINTRA_COMP, + AV1E_SET_ENABLE_SMOOTH_INTERINTRA, + AV1E_SET_ENABLE_DIFF_WTD_COMP, + AV1E_SET_ENABLE_INTERINTER_WEDGE, + AV1E_SET_ENABLE_INTERINTRA_WEDGE, + AV1E_SET_ENABLE_GLOBAL_MOTION, + AV1E_SET_ENABLE_WARPED_MOTION, + AV1E_SET_ENABLE_FILTER_INTRA, + AV1E_SET_ENABLE_SMOOTH_INTRA, + AV1E_SET_ENABLE_PAETH_INTRA, + AV1E_SET_ENABLE_CFL_INTRA, + AV1E_SET_ENABLE_DIAGONAL_INTRA, + AV1E_SET_FORCE_VIDEO_MODE, + AV1E_SET_ENABLE_OBMC, + AV1E_SET_ENABLE_OVERLAY, + AV1E_SET_ENABLE_PALETTE, + AV1E_SET_ENABLE_INTRABC, + AV1E_SET_ENABLE_ANGLE_DELTA, + AV1E_SET_DISABLE_TRELLIS_QUANT, + AV1E_SET_ENABLE_QM, + AV1E_SET_QM_MIN, + AV1E_SET_QM_MAX, + AV1E_SET_REDUCED_TX_TYPE_SET, + AV1E_SET_INTRA_DCT_ONLY, + AV1E_SET_INTER_DCT_ONLY, + AV1E_SET_INTRA_DEFAULT_TX_ONLY, + AV1E_SET_QUANT_B_ADAPT, + AV1E_SET_COEFF_COST_UPD_FREQ, + AV1E_SET_MODE_COST_UPD_FREQ, + AV1E_SET_MV_COST_UPD_FREQ, + AV1E_SET_FRAME_PARALLEL_DECODING, + AV1E_SET_ERROR_RESILIENT_MODE, + AV1E_SET_AQ_MODE, + AV1E_SET_DELTAQ_MODE, + AV1E_SET_DELTAQ_STRENGTH, + AV1E_SET_DELTALF_MODE, + AV1E_SET_FRAME_PERIODIC_BOOST, + AV1E_SET_NOISE_SENSITIVITY, + AV1E_SET_TUNE_CONTENT, + AV1E_SET_CDF_UPDATE_MODE, + AV1E_SET_COLOR_PRIMARIES, + AV1E_SET_TRANSFER_CHARACTERISTICS, + AV1E_SET_MATRIX_COEFFICIENTS, + AV1E_SET_CHROMA_SAMPLE_POSITION, + AV1E_SET_MIN_GF_INTERVAL, + AV1E_SET_MAX_GF_INTERVAL, + AV1E_SET_GF_MIN_PYRAMID_HEIGHT, + AV1E_SET_GF_MAX_PYRAMID_HEIGHT, + AV1E_SET_SUPERBLOCK_SIZE, + AV1E_SET_NUM_TG, + AV1E_SET_MTU, + AV1E_SET_TIMING_INFO_TYPE, + AV1E_SET_FILM_GRAIN_TEST_VECTOR, + AV1E_SET_FILM_GRAIN_TABLE, +#if CONFIG_DENOISE + AV1E_SET_DENOISE_NOISE_LEVEL, + AV1E_SET_DENOISE_BLOCK_SIZE, + AV1E_SET_ENABLE_DNL_DENOISING, +#endif // CONFIG_DENOISE + AV1E_SET_MAX_REFERENCE_FRAMES, + AV1E_SET_REDUCED_REFERENCE_SET, + 
AV1E_SET_ENABLE_REF_FRAME_MVS, + AV1E_SET_TARGET_SEQ_LEVEL_IDX, + AV1E_SET_TIER_MASK, + AV1E_SET_MIN_CR, + AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, + AV1E_SET_CHROMA_SUBSAMPLING_X, + AV1E_SET_CHROMA_SUBSAMPLING_Y, +#if CONFIG_TUNE_VMAF + AV1E_SET_VMAF_MODEL_PATH, +#endif + AV1E_SET_DV_COST_UPD_FREQ, + AV1E_SET_PARTITION_INFO_PATH, + AV1E_SET_ENABLE_DIRECTIONAL_INTRA, + AV1E_SET_ENABLE_TX_SIZE_SEARCH, + AV1E_SET_LOOPFILTER_CONTROL, + AV1E_SET_AUTO_INTRA_TOOLS_OFF, + AV1E_ENABLE_RATE_GUIDE_DELTAQ, + AV1E_SET_RATE_DISTRIBUTION_INFO, + 0 }; + +const arg_def_t *main_args[] = { &g_av1_codec_arg_defs.help, + &g_av1_codec_arg_defs.use_cfg, + &g_av1_codec_arg_defs.debugmode, + &g_av1_codec_arg_defs.outputfile, + &g_av1_codec_arg_defs.codecarg, + &g_av1_codec_arg_defs.passes, + &g_av1_codec_arg_defs.pass_arg, + &g_av1_codec_arg_defs.fpf_name, + &g_av1_codec_arg_defs.limit, + &g_av1_codec_arg_defs.skip, + &g_av1_codec_arg_defs.good_dl, + &g_av1_codec_arg_defs.rt_dl, + &g_av1_codec_arg_defs.ai_dl, + &g_av1_codec_arg_defs.quietarg, + &g_av1_codec_arg_defs.verbosearg, + &g_av1_codec_arg_defs.psnrarg, + &g_av1_codec_arg_defs.use_webm, + &g_av1_codec_arg_defs.use_ivf, + &g_av1_codec_arg_defs.use_obu, + &g_av1_codec_arg_defs.q_hist_n, + &g_av1_codec_arg_defs.rate_hist_n, + &g_av1_codec_arg_defs.disable_warnings, + &g_av1_codec_arg_defs.disable_warning_prompt, + &g_av1_codec_arg_defs.recontest, + NULL }; + +const arg_def_t *global_args[] = { + &g_av1_codec_arg_defs.use_nv12, + &g_av1_codec_arg_defs.use_yv12, + &g_av1_codec_arg_defs.use_i420, + &g_av1_codec_arg_defs.use_i422, + &g_av1_codec_arg_defs.use_i444, + &g_av1_codec_arg_defs.usage, + &g_av1_codec_arg_defs.threads, + &g_av1_codec_arg_defs.profile, + &g_av1_codec_arg_defs.width, + &g_av1_codec_arg_defs.height, + &g_av1_codec_arg_defs.forced_max_frame_width, + &g_av1_codec_arg_defs.forced_max_frame_height, +#if CONFIG_WEBM_IO + &g_av1_codec_arg_defs.stereo_mode, +#endif + &g_av1_codec_arg_defs.timebase, + &g_av1_codec_arg_defs.framerate, + &g_av1_codec_arg_defs.global_error_resilient, + &g_av1_codec_arg_defs.bitdeptharg, + &g_av1_codec_arg_defs.inbitdeptharg, + &g_av1_codec_arg_defs.lag_in_frames, + &g_av1_codec_arg_defs.large_scale_tile, + &g_av1_codec_arg_defs.monochrome, + &g_av1_codec_arg_defs.full_still_picture_hdr, + &g_av1_codec_arg_defs.use_16bit_internal, + &g_av1_codec_arg_defs.save_as_annexb, + NULL +}; + +const arg_def_t *rc_args[] = { &g_av1_codec_arg_defs.dropframe_thresh, + &g_av1_codec_arg_defs.resize_mode, + &g_av1_codec_arg_defs.resize_denominator, + &g_av1_codec_arg_defs.resize_kf_denominator, + &g_av1_codec_arg_defs.superres_mode, + &g_av1_codec_arg_defs.superres_denominator, + &g_av1_codec_arg_defs.superres_kf_denominator, + &g_av1_codec_arg_defs.superres_qthresh, + &g_av1_codec_arg_defs.superres_kf_qthresh, + &g_av1_codec_arg_defs.end_usage, + &g_av1_codec_arg_defs.target_bitrate, + &g_av1_codec_arg_defs.min_quantizer, + &g_av1_codec_arg_defs.max_quantizer, + &g_av1_codec_arg_defs.undershoot_pct, + &g_av1_codec_arg_defs.overshoot_pct, + &g_av1_codec_arg_defs.buf_sz, + &g_av1_codec_arg_defs.buf_initial_sz, + &g_av1_codec_arg_defs.buf_optimal_sz, + &g_av1_codec_arg_defs.bias_pct, + &g_av1_codec_arg_defs.minsection_pct, + &g_av1_codec_arg_defs.maxsection_pct, + NULL }; + +const arg_def_t *kf_args[] = { &g_av1_codec_arg_defs.fwd_kf_enabled, + &g_av1_codec_arg_defs.kf_min_dist, + &g_av1_codec_arg_defs.kf_max_dist, + &g_av1_codec_arg_defs.kf_disabled, + &g_av1_codec_arg_defs.sframe_dist, + &g_av1_codec_arg_defs.sframe_mode, + NULL }; + +// 
TODO(bohanli): Currently all options are supported by the key & value API. +// Consider removing the control ID usages? +const arg_def_t *av1_ctrl_args[] = { + &g_av1_codec_arg_defs.cpu_used_av1, + &g_av1_codec_arg_defs.auto_altref, + &g_av1_codec_arg_defs.sharpness, + &g_av1_codec_arg_defs.static_thresh, + &g_av1_codec_arg_defs.rowmtarg, + &g_av1_codec_arg_defs.fpmtarg, + &g_av1_codec_arg_defs.tile_cols, + &g_av1_codec_arg_defs.tile_rows, + &g_av1_codec_arg_defs.enable_tpl_model, + &g_av1_codec_arg_defs.enable_keyframe_filtering, + &g_av1_codec_arg_defs.arnr_maxframes, + &g_av1_codec_arg_defs.arnr_strength, + &g_av1_codec_arg_defs.tune_metric, + &g_av1_codec_arg_defs.cq_level, + &g_av1_codec_arg_defs.max_intra_rate_pct, + &g_av1_codec_arg_defs.max_inter_rate_pct, + &g_av1_codec_arg_defs.gf_cbr_boost_pct, + &g_av1_codec_arg_defs.lossless, + &g_av1_codec_arg_defs.enable_cdef, + &g_av1_codec_arg_defs.enable_restoration, + &g_av1_codec_arg_defs.enable_rect_partitions, + &g_av1_codec_arg_defs.enable_ab_partitions, + &g_av1_codec_arg_defs.enable_1to4_partitions, + &g_av1_codec_arg_defs.min_partition_size, + &g_av1_codec_arg_defs.max_partition_size, + &g_av1_codec_arg_defs.enable_dual_filter, + &g_av1_codec_arg_defs.enable_chroma_deltaq, + &g_av1_codec_arg_defs.enable_intra_edge_filter, + &g_av1_codec_arg_defs.enable_order_hint, + &g_av1_codec_arg_defs.enable_tx64, + &g_av1_codec_arg_defs.enable_flip_idtx, + &g_av1_codec_arg_defs.enable_rect_tx, + &g_av1_codec_arg_defs.enable_dist_wtd_comp, + &g_av1_codec_arg_defs.enable_masked_comp, + &g_av1_codec_arg_defs.enable_onesided_comp, + &g_av1_codec_arg_defs.enable_interintra_comp, + &g_av1_codec_arg_defs.enable_smooth_interintra, + &g_av1_codec_arg_defs.enable_diff_wtd_comp, + &g_av1_codec_arg_defs.enable_interinter_wedge, + &g_av1_codec_arg_defs.enable_interintra_wedge, + &g_av1_codec_arg_defs.enable_global_motion, + &g_av1_codec_arg_defs.enable_warped_motion, + &g_av1_codec_arg_defs.enable_filter_intra, + &g_av1_codec_arg_defs.enable_smooth_intra, + &g_av1_codec_arg_defs.enable_paeth_intra, + &g_av1_codec_arg_defs.enable_cfl_intra, + &g_av1_codec_arg_defs.enable_diagonal_intra, + &g_av1_codec_arg_defs.force_video_mode, + &g_av1_codec_arg_defs.enable_obmc, + &g_av1_codec_arg_defs.enable_overlay, + &g_av1_codec_arg_defs.enable_palette, + &g_av1_codec_arg_defs.enable_intrabc, + &g_av1_codec_arg_defs.enable_angle_delta, + &g_av1_codec_arg_defs.disable_trellis_quant, + &g_av1_codec_arg_defs.enable_qm, + &g_av1_codec_arg_defs.qm_min, + &g_av1_codec_arg_defs.qm_max, + &g_av1_codec_arg_defs.reduced_tx_type_set, + &g_av1_codec_arg_defs.use_intra_dct_only, + &g_av1_codec_arg_defs.use_inter_dct_only, + &g_av1_codec_arg_defs.use_intra_default_tx_only, + &g_av1_codec_arg_defs.quant_b_adapt, + &g_av1_codec_arg_defs.coeff_cost_upd_freq, + &g_av1_codec_arg_defs.mode_cost_upd_freq, + &g_av1_codec_arg_defs.mv_cost_upd_freq, + &g_av1_codec_arg_defs.frame_parallel_decoding, + &g_av1_codec_arg_defs.error_resilient_mode, + &g_av1_codec_arg_defs.aq_mode, + &g_av1_codec_arg_defs.deltaq_mode, + &g_av1_codec_arg_defs.deltaq_strength, + &g_av1_codec_arg_defs.deltalf_mode, + &g_av1_codec_arg_defs.frame_periodic_boost, + &g_av1_codec_arg_defs.noise_sens, + &g_av1_codec_arg_defs.tune_content, + &g_av1_codec_arg_defs.cdf_update_mode, + &g_av1_codec_arg_defs.input_color_primaries, + &g_av1_codec_arg_defs.input_transfer_characteristics, + &g_av1_codec_arg_defs.input_matrix_coefficients, + &g_av1_codec_arg_defs.input_chroma_sample_position, + &g_av1_codec_arg_defs.min_gf_interval, 
+ &g_av1_codec_arg_defs.max_gf_interval, + &g_av1_codec_arg_defs.gf_min_pyr_height, + &g_av1_codec_arg_defs.gf_max_pyr_height, + &g_av1_codec_arg_defs.superblock_size, + &g_av1_codec_arg_defs.num_tg, + &g_av1_codec_arg_defs.mtu_size, + &g_av1_codec_arg_defs.timing_info, + &g_av1_codec_arg_defs.film_grain_test, + &g_av1_codec_arg_defs.film_grain_table, +#if CONFIG_DENOISE + &g_av1_codec_arg_defs.denoise_noise_level, + &g_av1_codec_arg_defs.denoise_block_size, + &g_av1_codec_arg_defs.enable_dnl_denoising, +#endif // CONFIG_DENOISE + &g_av1_codec_arg_defs.max_reference_frames, + &g_av1_codec_arg_defs.reduced_reference_set, + &g_av1_codec_arg_defs.enable_ref_frame_mvs, + &g_av1_codec_arg_defs.target_seq_level_idx, + &g_av1_codec_arg_defs.set_tier_mask, + &g_av1_codec_arg_defs.set_min_cr, + &g_av1_codec_arg_defs.vbr_corpus_complexity_lap, + &g_av1_codec_arg_defs.input_chroma_subsampling_x, + &g_av1_codec_arg_defs.input_chroma_subsampling_y, +#if CONFIG_TUNE_VMAF + &g_av1_codec_arg_defs.vmaf_model_path, +#endif + &g_av1_codec_arg_defs.dv_cost_upd_freq, + &g_av1_codec_arg_defs.partition_info_path, + &g_av1_codec_arg_defs.enable_rate_guide_deltaq, + &g_av1_codec_arg_defs.rate_distribution_info, + &g_av1_codec_arg_defs.enable_directional_intra, + &g_av1_codec_arg_defs.enable_tx_size_search, + &g_av1_codec_arg_defs.loopfilter_control, + &g_av1_codec_arg_defs.auto_intra_tools_off, + NULL, +}; + +const arg_def_t *av1_key_val_args[] = { + &g_av1_codec_arg_defs.passes, + &g_av1_codec_arg_defs.two_pass_output, + &g_av1_codec_arg_defs.second_pass_log, + &g_av1_codec_arg_defs.fwd_kf_dist, + &g_av1_codec_arg_defs.strict_level_conformance, + &g_av1_codec_arg_defs.sb_qp_sweep, + &g_av1_codec_arg_defs.dist_metric, + &g_av1_codec_arg_defs.kf_max_pyr_height, + NULL, +}; + +static const arg_def_t *no_args[] = { NULL }; + +static void show_help(FILE *fout, int shorthelp) { + fprintf(fout, "Usage: %s -o dst_filename src_filename\n", + exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "\nOptions:\n"); + arg_show_usage(fout, main_args); + fprintf(fout, "\nEncoder Global Options:\n"); + arg_show_usage(fout, global_args); + fprintf(fout, "\nRate Control Options:\n"); + arg_show_usage(fout, rc_args); + fprintf(fout, "\nKeyframe Placement Options:\n"); + arg_show_usage(fout, kf_args); +#if CONFIG_AV1_ENCODER + fprintf(fout, "\nAV1 Specific Options:\n"); + arg_show_usage(fout, av1_ctrl_args); + arg_show_usage(fout, av1_key_val_args); +#endif + fprintf(fout, + "\nStream timebase (--timebase):\n" + " The desired precision of timestamps in the output, expressed\n" + " in fractional seconds. Default is 1/1000.\n"); + fprintf(fout, "\nIncluded encoders:\n\n"); + + const int num_encoder = get_aom_encoder_count(); + for (int i = 0; i < num_encoder; ++i) { + aom_codec_iface_t *encoder = get_aom_encoder_by_index(i); + const char *defstr = (i == (num_encoder - 1)) ? 
"(default)" : ""; + fprintf(fout, " %-6s - %s %s\n", get_short_name_by_aom_encoder(encoder), + aom_codec_iface_name(encoder), defstr); + } + fprintf(fout, "\n "); + fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n"); +} + +void usage_exit(void) { + show_help(stderr, 1); + exit(EXIT_FAILURE); +} + +#if CONFIG_AV1_ENCODER +#define ARG_CTRL_CNT_MAX NELEMENTS(av1_arg_ctrl_map) +#define ARG_KEY_VAL_CNT_MAX NELEMENTS(av1_key_val_args) +#endif + +#if !CONFIG_WEBM_IO +typedef int stereo_format_t; +struct WebmOutputContext { + int debug; +}; +#endif + +/* Per-stream configuration */ +struct stream_config { + struct aom_codec_enc_cfg cfg; + const char *out_fn; + const char *stats_fn; + stereo_format_t stereo_fmt; + int arg_ctrls[ARG_CTRL_CNT_MAX][2]; + int arg_ctrl_cnt; + const char *arg_key_vals[ARG_KEY_VAL_CNT_MAX][2]; + int arg_key_val_cnt; + int write_webm; + const char *film_grain_filename; + int write_ivf; + // whether to use 16bit internal buffers + int use_16bit_internal; +#if CONFIG_TUNE_VMAF + const char *vmaf_model_path; +#endif + const char *partition_info_path; + unsigned int enable_rate_guide_deltaq; + const char *rate_distribution_info; + aom_color_range_t color_range; + const char *two_pass_input; + const char *two_pass_output; + int two_pass_width; + int two_pass_height; +}; + +struct stream_state { + int index; + struct stream_state *next; + struct stream_config config; + FILE *file; + struct rate_hist *rate_hist; + struct WebmOutputContext webm_ctx; + uint64_t psnr_sse_total[2]; + uint64_t psnr_samples_total[2]; + double psnr_totals[2][4]; + int psnr_count[2]; + int counts[64]; + aom_codec_ctx_t encoder; + unsigned int frames_out; + uint64_t cx_time; + size_t nbytes; + stats_io_t stats; + struct aom_image *img; + aom_codec_ctx_t decoder; + int mismatch_seen; + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; + const char *orig_out_fn; + unsigned int orig_width; + unsigned int orig_height; + int orig_write_webm; + int orig_write_ivf; + char tmp_out_fn[1000]; +}; + +static void validate_positive_rational(const char *msg, + struct aom_rational *rat) { + if (rat->den < 0) { + rat->num *= -1; + rat->den *= -1; + } + + if (rat->num < 0) die("Error: %s must be positive\n", msg); + + if (!rat->den) die("Error: %s has zero denominator\n", msg); +} + +static void init_config(cfg_options_t *config) { + memset(config, 0, sizeof(cfg_options_t)); + config->super_block_size = 0; // Dynamic + config->max_partition_size = 128; + config->min_partition_size = 4; + config->disable_trellis_quant = 3; +} + +/* Parses global config arguments into the AvxEncoderConfig. Note that + * argv is modified and overwrites all parsed arguments. 
+ */ +static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) { + char **argi, **argj; + struct arg arg; + const int num_encoder = get_aom_encoder_count(); + char **argv_local = (char **)*argv; + if (num_encoder < 1) die("Error: no valid encoder available\n"); + + /* Initialize default parameters */ + memset(global, 0, sizeof(*global)); + global->codec = get_aom_encoder_by_index(num_encoder - 1); + global->passes = 0; + global->color_type = I420; + global->csp = AOM_CSP_UNKNOWN; + global->show_psnr = 0; + + int cfg_included = 0; + init_config(&global->encoder_config); + + for (argi = argj = argv_local; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + if (arg_match(&arg, &g_av1_codec_arg_defs.use_cfg, argi)) { + if (!cfg_included) { + parse_cfg(arg.val, &global->encoder_config); + cfg_included = 1; + } + } else if (arg_match(&arg, &g_av1_codec_arg_defs.help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.codecarg, argi)) { + global->codec = get_aom_encoder_by_short_name(arg.val); + if (!global->codec) + die("Error: Unrecognized argument (%s) to --codec\n", arg.val); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.passes, argi)) { + global->passes = arg_parse_uint(&arg); + + if (global->passes < 1 || global->passes > 3) + die("Error: Invalid number of passes (%d)\n", global->passes); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.pass_arg, argi)) { + global->pass = arg_parse_uint(&arg); + + if (global->pass < 1 || global->pass > 3) + die("Error: Invalid pass selected (%d)\n", global->pass); + } else if (arg_match(&arg, + &g_av1_codec_arg_defs.input_chroma_sample_position, + argi)) { + global->csp = arg_parse_enum(&arg); + /* Flag is used by later code as well, preserve it. 
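+       * (Bumping argj here keeps the flag in the compacted argv, so the
+       * per-stream option parser can match it again later.)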
*/ + argj++; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.usage, argi)) { + global->usage = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.good_dl, argi)) { + global->usage = AOM_USAGE_GOOD_QUALITY; // Good quality usage + } else if (arg_match(&arg, &g_av1_codec_arg_defs.rt_dl, argi)) { + global->usage = AOM_USAGE_REALTIME; // Real-time usage + } else if (arg_match(&arg, &g_av1_codec_arg_defs.ai_dl, argi)) { + global->usage = AOM_USAGE_ALL_INTRA; // All intra usage + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_nv12, argi)) { + global->color_type = NV12; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_yv12, argi)) { + global->color_type = YV12; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i420, argi)) { + global->color_type = I420; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i422, argi)) { + global->color_type = I422; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_i444, argi)) { + global->color_type = I444; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.quietarg, argi)) { + global->quiet = 1; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.verbosearg, argi)) { + global->verbose = 1; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.limit, argi)) { + global->limit = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.skip, argi)) { + global->skip_frames = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.psnrarg, argi)) { + if (arg.val) + global->show_psnr = arg_parse_int(&arg); + else + global->show_psnr = 1; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.recontest, argi)) { + global->test_decode = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.framerate, argi)) { + global->framerate = arg_parse_rational(&arg); + validate_positive_rational(arg.name, &global->framerate); + global->have_framerate = 1; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.debugmode, argi)) { + global->debug = 1; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.q_hist_n, argi)) { + global->show_q_hist_buckets = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.rate_hist_n, argi)) { + global->show_rate_hist_buckets = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.disable_warnings, argi)) { + global->disable_warnings = 1; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.disable_warning_prompt, + argi)) { + global->disable_warning_prompt = 1; + } else { + argj++; + } + } + + if (global->pass) { + /* DWIM: Assume the user meant passes=2 if pass=2 is specified */ + if (global->pass > global->passes) { + aom_tools_warn("Assuming --pass=%d implies --passes=%d\n", global->pass, + global->pass); + global->passes = global->pass; + } + } + /* Validate global config */ + if (global->passes == 0) { +#if CONFIG_AV1_ENCODER + // Make default AV1 passes = 2 until there is a better quality 1-pass + // encoder + if (global->codec != NULL) + global->passes = + (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0 && + global->usage != AOM_USAGE_REALTIME) + ? 
2 + : 1; +#else + global->passes = 1; +#endif + } + + if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) { + aom_tools_warn("Enforcing one-pass encoding in realtime mode\n"); + if (global->pass > 1) + die("Error: Invalid --pass=%d for one-pass encoding\n", global->pass); + global->passes = 1; + } + + if (global->usage == AOM_USAGE_ALL_INTRA && global->passes > 1) { + aom_tools_warn("Enforcing one-pass encoding in all intra mode\n"); + global->passes = 1; + } +} + +static void open_input_file(struct AvxInputContext *input, + aom_chroma_sample_position_t csp) { + /* Parse certain options from the input file, if possible */ + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); + + if (!input->file) fatal("Failed to open input file"); + + if (!fseeko(input->file, 0, SEEK_END)) { + /* Input file is seekable. Figure out how long it is, so we can get + * progress info. + */ + input->length = ftello(input->file); + rewind(input->file); + } + + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + + /* For RAW input sources, these bytes will applied on the first frame + * in read_frame(). + */ + input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); + input->detect.position = 0; + + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { + if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp, + input->only_i420) >= 0) { + input->file_type = FILE_TYPE_Y4M; + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.aom_fmt; + input->bit_depth = input->y4m.bit_depth; + input->color_range = input->y4m.color_range; + } else + fatal("Unsupported Y4M stream."); + } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { + fatal("IVF is not supported as input."); + } else { + input->file_type = FILE_TYPE_RAW; + } +} + +static void close_input_file(struct AvxInputContext *input) { + fclose(input->file); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); +} + +static struct stream_state *new_stream(struct AvxEncoderConfig *global, + struct stream_state *prev) { + struct stream_state *stream; + + stream = calloc(1, sizeof(*stream)); + if (stream == NULL) { + fatal("Failed to allocate new stream."); + } + + if (prev) { + memcpy(stream, prev, sizeof(*stream)); + stream->index++; + prev->next = stream; + } else { + aom_codec_err_t res; + + /* Populate encoder configuration */ + res = aom_codec_enc_config_default(global->codec, &stream->config.cfg, + global->usage); + if (res) fatal("Failed to get config: %s\n", aom_codec_err_to_string(res)); + + /* Change the default timebase to a high enough value so that the + * encoder will always create strictly increasing timestamps. + */ + stream->config.cfg.g_timebase.den = 1000; + + /* Never use the library's default resolution, require it be parsed + * from the file or set on the command line. 
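+     * (Both g_w and g_h are zeroed just below; set_stream_dimensions() fills
+     * them in from the input file, and validate_stream_config() aborts if they
+     * are still unset.)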
+ */ + stream->config.cfg.g_w = 0; + stream->config.cfg.g_h = 0; + + /* Initialize remaining stream parameters */ + stream->config.write_webm = 1; + stream->config.write_ivf = 0; + +#if CONFIG_WEBM_IO + stream->config.stereo_fmt = STEREO_FORMAT_MONO; + stream->webm_ctx.last_pts_ns = -1; + stream->webm_ctx.writer = NULL; + stream->webm_ctx.segment = NULL; +#endif + + /* Allows removal of the application version from the EBML tags */ + stream->webm_ctx.debug = global->debug; + memcpy(&stream->config.cfg.encoder_cfg, &global->encoder_config, + sizeof(stream->config.cfg.encoder_cfg)); + } + + /* Output files must be specified for each stream */ + stream->config.out_fn = NULL; + stream->config.two_pass_input = NULL; + stream->config.two_pass_output = NULL; + stream->config.two_pass_width = 0; + stream->config.two_pass_height = 0; + + stream->next = NULL; + return stream; +} + +static void set_config_arg_ctrls(struct stream_config *config, int key, + const struct arg *arg) { + int j; + if (key == AV1E_SET_FILM_GRAIN_TABLE) { + config->film_grain_filename = arg->val; + return; + } + + // For target level, the settings should accumulate rather than overwrite, + // so we simply append it. + if (key == AV1E_SET_TARGET_SEQ_LEVEL_IDX) { + j = config->arg_ctrl_cnt; + assert(j < ARG_CTRL_CNT_MAX); + config->arg_ctrls[j][0] = key; + config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg); + ++config->arg_ctrl_cnt; + return; + } + + /* Point either to the next free element or the first instance of this + * control. + */ + for (j = 0; j < config->arg_ctrl_cnt; j++) + if (config->arg_ctrls[j][0] == key) break; + + /* Update/insert */ + assert(j < ARG_CTRL_CNT_MAX); + config->arg_ctrls[j][0] = key; + config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg); + + if (key == AOME_SET_ENABLEAUTOALTREF && config->arg_ctrls[j][1] > 1) { + aom_tools_warn( + "auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n"); + config->arg_ctrls[j][1] = 1; + } + + if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++; +} + +static void set_config_arg_key_vals(struct stream_config *config, + const char *name, const struct arg *arg) { + int j; + const char *val = arg->val; + // For target level, the settings should accumulate rather than overwrite, + // so we simply append it. + if (strcmp(name, "target-seq-level-idx") == 0) { + j = config->arg_key_val_cnt; + assert(j < ARG_KEY_VAL_CNT_MAX); + config->arg_key_vals[j][0] = name; + config->arg_key_vals[j][1] = val; + ++config->arg_key_val_cnt; + return; + } + + /* Point either to the next free element or the first instance of this + * option. + */ + for (j = 0; j < config->arg_key_val_cnt; j++) + if (strcmp(name, config->arg_key_vals[j][0]) == 0) break; + + /* Update/insert */ + assert(j < ARG_KEY_VAL_CNT_MAX); + config->arg_key_vals[j][0] = name; + config->arg_key_vals[j][1] = val; + + if (strcmp(name, g_av1_codec_arg_defs.auto_altref.long_name) == 0) { + int auto_altref = arg_parse_int(arg); + if (auto_altref > 1) { + aom_tools_warn( + "auto-alt-ref > 1 is deprecated... 
setting auto-alt-ref=1\n"); + config->arg_key_vals[j][1] = "1"; + } + } + + if (j == config->arg_key_val_cnt) config->arg_key_val_cnt++; +} + +static int parse_stream_params(struct AvxEncoderConfig *global, + struct stream_state *stream, char **argv) { + char **argi, **argj; + struct arg arg; + static const arg_def_t **ctrl_args = no_args; + static const arg_def_t **key_val_args = no_args; + static const int *ctrl_args_map = NULL; + struct stream_config *config = &stream->config; + int eos_mark_found = 0; + int webm_forced = 0; + + // Handle codec specific options + if (0) { +#if CONFIG_AV1_ENCODER + } else if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) { + // TODO(jingning): Reuse AV1 specific encoder configuration parameters. + // Consider to expand this set for AV1 encoder control. +#if __STDC_VERSION__ >= 201112L + _Static_assert(NELEMENTS(av1_ctrl_args) == NELEMENTS(av1_arg_ctrl_map), + "The av1_ctrl_args and av1_arg_ctrl_map arrays must be of " + "the same size."); +#else + assert(NELEMENTS(av1_ctrl_args) == NELEMENTS(av1_arg_ctrl_map)); +#endif + ctrl_args = av1_ctrl_args; + ctrl_args_map = av1_arg_ctrl_map; + key_val_args = av1_key_val_args; +#endif + } + + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + /* Once we've found an end-of-stream marker (--) we want to continue + * shifting arguments but not consuming them. + */ + if (eos_mark_found) { + argj++; + continue; + } else if (!strcmp(*argj, "--")) { + eos_mark_found = 1; + continue; + } + + if (arg_match(&arg, &g_av1_codec_arg_defs.outputfile, argi)) { + config->out_fn = arg.val; + if (!webm_forced) { + const size_t out_fn_len = strlen(config->out_fn); + if (out_fn_len >= 4 && + !strcmp(config->out_fn + out_fn_len - 4, ".ivf")) { + config->write_webm = 0; + config->write_ivf = 1; + } else if (out_fn_len >= 4 && + !strcmp(config->out_fn + out_fn_len - 4, ".obu")) { + config->write_webm = 0; + config->write_ivf = 0; + } + } + } else if (arg_match(&arg, &g_av1_codec_arg_defs.fpf_name, argi)) { + config->stats_fn = arg.val; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_webm, argi)) { +#if CONFIG_WEBM_IO + config->write_webm = 1; + webm_forced = 1; +#else + die("Error: --webm specified but webm is disabled."); +#endif + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_ivf, argi)) { + config->write_webm = 0; + config->write_ivf = 1; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_obu, argi)) { + config->write_webm = 0; + config->write_ivf = 0; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.threads, argi)) { + config->cfg.g_threads = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.profile, argi)) { + config->cfg.g_profile = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.width, argi)) { + config->cfg.g_w = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.height, argi)) { + config->cfg.g_h = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.forced_max_frame_width, + argi)) { + config->cfg.g_forced_max_frame_width = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.forced_max_frame_height, + argi)) { + config->cfg.g_forced_max_frame_height = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.bitdeptharg, argi)) { + config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.inbitdeptharg, argi)) { + config->cfg.g_input_bit_depth = arg_parse_uint(&arg); + } else 
if (arg_match(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_x, + argi)) { + stream->chroma_subsampling_x = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.input_chroma_subsampling_y, + argi)) { + stream->chroma_subsampling_y = arg_parse_uint(&arg); +#if CONFIG_WEBM_IO + } else if (arg_match(&arg, &g_av1_codec_arg_defs.stereo_mode, argi)) { + config->stereo_fmt = arg_parse_enum_or_int(&arg); +#endif + } else if (arg_match(&arg, &g_av1_codec_arg_defs.timebase, argi)) { + config->cfg.g_timebase = arg_parse_rational(&arg); + validate_positive_rational(arg.name, &config->cfg.g_timebase); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.global_error_resilient, + argi)) { + config->cfg.g_error_resilient = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.lag_in_frames, argi)) { + config->cfg.g_lag_in_frames = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.large_scale_tile, argi)) { + config->cfg.large_scale_tile = arg_parse_uint(&arg); + if (config->cfg.large_scale_tile) { + global->codec = get_aom_encoder_by_short_name("av1"); + } + } else if (arg_match(&arg, &g_av1_codec_arg_defs.monochrome, argi)) { + config->cfg.monochrome = 1; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.full_still_picture_hdr, + argi)) { + config->cfg.full_still_picture_hdr = 1; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_16bit_internal, + argi)) { + config->use_16bit_internal = CONFIG_AV1_HIGHBITDEPTH; + if (!config->use_16bit_internal) { + aom_tools_warn("%s option ignored with CONFIG_AV1_HIGHBITDEPTH=0.\n", + arg.name); + } + } else if (arg_match(&arg, &g_av1_codec_arg_defs.dropframe_thresh, argi)) { + config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_mode, argi)) { + config->cfg.rc_resize_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_denominator, + argi)) { + config->cfg.rc_resize_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.resize_kf_denominator, + argi)) { + config->cfg.rc_resize_kf_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_mode, argi)) { + config->cfg.rc_superres_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_denominator, + argi)) { + config->cfg.rc_superres_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_kf_denominator, + argi)) { + config->cfg.rc_superres_kf_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_qthresh, argi)) { + config->cfg.rc_superres_qthresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.superres_kf_qthresh, + argi)) { + config->cfg.rc_superres_kf_qthresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.end_usage, argi)) { + config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.target_bitrate, argi)) { + config->cfg.rc_target_bitrate = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.min_quantizer, argi)) { + config->cfg.rc_min_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.max_quantizer, argi)) { + config->cfg.rc_max_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.undershoot_pct, argi)) { + config->cfg.rc_undershoot_pct = arg_parse_uint(&arg); + } else if 
(arg_match(&arg, &g_av1_codec_arg_defs.overshoot_pct, argi)) { + config->cfg.rc_overshoot_pct = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_sz, argi)) { + config->cfg.rc_buf_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_initial_sz, argi)) { + config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.buf_optimal_sz, argi)) { + config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.bias_pct, argi)) { + config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); + if (global->passes < 2) + aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.minsection_pct, argi)) { + config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.maxsection_pct, argi)) { + config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.fwd_kf_enabled, argi)) { + config->cfg.fwd_kf_enabled = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_min_dist, argi)) { + config->cfg.kf_min_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_max_dist, argi)) { + config->cfg.kf_max_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_disabled, argi)) { + config->cfg.kf_mode = AOM_KF_DISABLED; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.sframe_dist, argi)) { + config->cfg.sframe_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.sframe_mode, argi)) { + config->cfg.sframe_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.save_as_annexb, argi)) { + config->cfg.save_as_annexb = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.tile_width, argi)) { + config->cfg.tile_width_count = + arg_parse_list(&arg, config->cfg.tile_widths, MAX_TILE_WIDTHS); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.tile_height, argi)) { + config->cfg.tile_height_count = + arg_parse_list(&arg, config->cfg.tile_heights, MAX_TILE_HEIGHTS); +#if CONFIG_TUNE_VMAF + } else if (arg_match(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argi)) { + config->vmaf_model_path = arg.val; +#endif + } else if (arg_match(&arg, &g_av1_codec_arg_defs.partition_info_path, + argi)) { + config->partition_info_path = arg.val; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.enable_rate_guide_deltaq, + argi)) { + config->enable_rate_guide_deltaq = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.rate_distribution_info, + argi)) { + config->rate_distribution_info = arg.val; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_fixed_qp_offsets, + argi)) { + config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.fixed_qp_offsets, argi)) { + config->cfg.use_fixed_qp_offsets = 1; + } else if (global->usage == AOM_USAGE_REALTIME && + arg_match(&arg, &g_av1_codec_arg_defs.enable_restoration, + argi)) { + if (arg_parse_uint(&arg) == 1) { + aom_tools_warn("non-zero %s option ignored in realtime mode.\n", + arg.name); + } + } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_input, argi)) { + 
config->two_pass_input = arg.val; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_output, argi)) { + config->two_pass_output = arg.val; + } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_width, argi)) { + config->two_pass_width = arg_parse_int(&arg); + } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_height, argi)) { + config->two_pass_height = arg_parse_int(&arg); + } else { + int i, match = 0; + // check if the control ID API supports this arg + if (ctrl_args_map) { + for (i = 0; ctrl_args[i]; i++) { + if (arg_match(&arg, ctrl_args[i], argi)) { + match = 1; + set_config_arg_ctrls(config, ctrl_args_map[i], &arg); + break; + } + } + } + if (!match) { + // check if the key & value API supports this arg + for (i = 0; key_val_args[i]; i++) { + if (arg_match(&arg, key_val_args[i], argi)) { + match = 1; + set_config_arg_key_vals(config, key_val_args[i]->long_name, &arg); + break; + } + } + } + if (!match) argj++; + } + } + config->use_16bit_internal |= config->cfg.g_bit_depth > AOM_BITS_8; + + if (global->usage == AOM_USAGE_REALTIME && config->cfg.g_lag_in_frames != 0) { + aom_tools_warn("non-zero lag-in-frames option ignored in realtime mode.\n"); + config->cfg.g_lag_in_frames = 0; + } + + if (global->usage == AOM_USAGE_ALL_INTRA) { + if (config->cfg.g_lag_in_frames != 0) { + aom_tools_warn( + "non-zero lag-in-frames option ignored in all intra mode.\n"); + config->cfg.g_lag_in_frames = 0; + } + if (config->cfg.kf_max_dist != 0) { + aom_tools_warn( + "non-zero max key frame distance option ignored in all intra " + "mode.\n"); + config->cfg.kf_max_dist = 0; + } + } + + // set the passes field using key & val API + if (config->arg_key_val_cnt >= ARG_KEY_VAL_CNT_MAX) { + die("Not enough buffer for the key & value API."); + } + config->arg_key_vals[config->arg_key_val_cnt][0] = "passes"; + switch (global->passes) { + case 0: config->arg_key_vals[config->arg_key_val_cnt][1] = "0"; break; + case 1: config->arg_key_vals[config->arg_key_val_cnt][1] = "1"; break; + case 2: config->arg_key_vals[config->arg_key_val_cnt][1] = "2"; break; + case 3: config->arg_key_vals[config->arg_key_val_cnt][1] = "3"; break; + default: die("Invalid value of --passes."); + } + config->arg_key_val_cnt++; + + // set the two_pass_output field + if (!config->two_pass_output && global->passes == 3) { + // If not specified, set the name of two_pass_output file here. + snprintf(stream->tmp_out_fn, sizeof(stream->tmp_out_fn), + "%.980s_pass2_%d.ivf", stream->config.out_fn, stream->index); + stream->config.two_pass_output = stream->tmp_out_fn; + } + if (config->two_pass_output) { + config->arg_key_vals[config->arg_key_val_cnt][0] = "two-pass-output"; + config->arg_key_vals[config->arg_key_val_cnt][1] = config->two_pass_output; + config->arg_key_val_cnt++; + } + + return eos_mark_found; +} + +#define FOREACH_STREAM(iterator, list) \ + for (struct stream_state *iterator = list; iterator; \ + iterator = iterator->next) + +static void validate_stream_config(const struct stream_state *stream, + const struct AvxEncoderConfig *global) { + const struct stream_state *streami; + (void)global; + + if (!stream->config.cfg.g_w || !stream->config.cfg.g_h) + fatal( + "Stream %d: Specify stream dimensions with --width (-w) " + " and --height (-h)", + stream->index); + + /* Even if bit depth is set on the command line flag to be lower, + * it is upgraded to at least match the input bit depth. 
+ */ + assert(stream->config.cfg.g_input_bit_depth <= + (unsigned int)stream->config.cfg.g_bit_depth); + + for (streami = stream; streami; streami = streami->next) { + /* All streams require output files */ + if (!streami->config.out_fn) + fatal("Stream %d: Output file is required (specify with -o)", + streami->index); + + /* Check for two streams outputting to the same file */ + if (streami != stream) { + const char *a = stream->config.out_fn; + const char *b = streami->config.out_fn; + if (!strcmp(a, b) && strcmp(a, "/dev/null") && strcmp(a, ":nul")) + fatal("Stream %d: duplicate output file (from stream %d)", + streami->index, stream->index); + } + + /* Check for two streams sharing a stats file. */ + if (streami != stream) { + const char *a = stream->config.stats_fn; + const char *b = streami->config.stats_fn; + if (a && b && !strcmp(a, b)) + fatal("Stream %d: duplicate stats file (from stream %d)", + streami->index, stream->index); + } + } +} + +static void set_stream_dimensions(struct stream_state *stream, unsigned int w, + unsigned int h) { + if (!stream->config.cfg.g_w) { + if (!stream->config.cfg.g_h) + stream->config.cfg.g_w = w; + else + stream->config.cfg.g_w = w * stream->config.cfg.g_h / h; + } + if (!stream->config.cfg.g_h) { + stream->config.cfg.g_h = h * stream->config.cfg.g_w / w; + } +} + +static const char *file_type_to_string(enum VideoFileType t) { + switch (t) { + case FILE_TYPE_RAW: return "RAW"; + case FILE_TYPE_Y4M: return "Y4M"; + default: return "Other"; + } +} + +static void show_stream_config(struct stream_state *stream, + struct AvxEncoderConfig *global, + struct AvxInputContext *input) { +#define SHOW(field) \ + fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field) + + if (stream->index == 0) { + fprintf(stderr, "Codec: %s\n", aom_codec_iface_name(global->codec)); + fprintf(stderr, "Source file: %s File Type: %s Format: %s\n", + input->filename, file_type_to_string(input->file_type), + image_format_to_string(input->fmt)); + } + if (stream->next || stream->index) + fprintf(stderr, "\nStream Index: %d\n", stream->index); + fprintf(stderr, "Destination file: %s\n", stream->config.out_fn); + fprintf(stderr, "Coding path: %s\n", + stream->config.use_16bit_internal ? 
"HBD" : "LBD"); + fprintf(stderr, "Encoder parameters:\n"); + + SHOW(g_usage); + SHOW(g_threads); + SHOW(g_profile); + SHOW(g_w); + SHOW(g_h); + SHOW(g_bit_depth); + SHOW(g_input_bit_depth); + SHOW(g_timebase.num); + SHOW(g_timebase.den); + SHOW(g_error_resilient); + SHOW(g_pass); + SHOW(g_lag_in_frames); + SHOW(large_scale_tile); + SHOW(rc_dropframe_thresh); + SHOW(rc_resize_mode); + SHOW(rc_resize_denominator); + SHOW(rc_resize_kf_denominator); + SHOW(rc_superres_mode); + SHOW(rc_superres_denominator); + SHOW(rc_superres_kf_denominator); + SHOW(rc_superres_qthresh); + SHOW(rc_superres_kf_qthresh); + SHOW(rc_end_usage); + SHOW(rc_target_bitrate); + SHOW(rc_min_quantizer); + SHOW(rc_max_quantizer); + SHOW(rc_undershoot_pct); + SHOW(rc_overshoot_pct); + SHOW(rc_buf_sz); + SHOW(rc_buf_initial_sz); + SHOW(rc_buf_optimal_sz); + SHOW(rc_2pass_vbr_bias_pct); + SHOW(rc_2pass_vbr_minsection_pct); + SHOW(rc_2pass_vbr_maxsection_pct); + SHOW(fwd_kf_enabled); + SHOW(kf_mode); + SHOW(kf_min_dist); + SHOW(kf_max_dist); + +#define SHOW_PARAMS(field) \ + fprintf(stderr, " %-28s = %d\n", #field, \ + stream->config.cfg.encoder_cfg.field) + if (global->encoder_config.init_by_cfg_file) { + SHOW_PARAMS(super_block_size); + SHOW_PARAMS(max_partition_size); + SHOW_PARAMS(min_partition_size); + SHOW_PARAMS(disable_ab_partition_type); + SHOW_PARAMS(disable_rect_partition_type); + SHOW_PARAMS(disable_1to4_partition_type); + SHOW_PARAMS(disable_flip_idtx); + SHOW_PARAMS(disable_cdef); + SHOW_PARAMS(disable_lr); + SHOW_PARAMS(disable_obmc); + SHOW_PARAMS(disable_warp_motion); + SHOW_PARAMS(disable_global_motion); + SHOW_PARAMS(disable_dist_wtd_comp); + SHOW_PARAMS(disable_diff_wtd_comp); + SHOW_PARAMS(disable_inter_intra_comp); + SHOW_PARAMS(disable_masked_comp); + SHOW_PARAMS(disable_one_sided_comp); + SHOW_PARAMS(disable_palette); + SHOW_PARAMS(disable_intrabc); + SHOW_PARAMS(disable_cfl); + SHOW_PARAMS(disable_smooth_intra); + SHOW_PARAMS(disable_filter_intra); + SHOW_PARAMS(disable_dual_filter); + SHOW_PARAMS(disable_intra_angle_delta); + SHOW_PARAMS(disable_intra_edge_filter); + SHOW_PARAMS(disable_tx_64x64); + SHOW_PARAMS(disable_smooth_inter_intra); + SHOW_PARAMS(disable_inter_inter_wedge); + SHOW_PARAMS(disable_inter_intra_wedge); + SHOW_PARAMS(disable_paeth_intra); + SHOW_PARAMS(disable_trellis_quant); + SHOW_PARAMS(disable_ref_frame_mv); + SHOW_PARAMS(reduced_reference_set); + SHOW_PARAMS(reduced_tx_type_set); + } +} + +static void open_output_file(struct stream_state *stream, + struct AvxEncoderConfig *global, + const struct AvxRational *pixel_aspect_ratio, + const char *encoder_settings) { + const char *fn = stream->config.out_fn; + const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg; + + if (cfg->g_pass == AOM_RC_FIRST_PASS) return; + + stream->file = strcmp(fn, "-") ? 
fopen(fn, "wb") : set_binary_mode(stdout); + + if (!stream->file) fatal("Failed to open output file"); + + if (stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR)) + fatal("WebM output to pipes not supported."); + +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + stream->webm_ctx.stream = stream->file; + if (write_webm_file_header(&stream->webm_ctx, &stream->encoder, cfg, + stream->config.stereo_fmt, + get_fourcc_by_aom_encoder(global->codec), + pixel_aspect_ratio, encoder_settings) != 0) { + fatal("WebM writer initialization failed."); + } + } +#else + (void)pixel_aspect_ratio; + (void)encoder_settings; +#endif + + if (!stream->config.write_webm && stream->config.write_ivf) { + ivf_write_file_header(stream->file, cfg, + get_fourcc_by_aom_encoder(global->codec), 0); + } +} + +static void close_output_file(struct stream_state *stream, + unsigned int fourcc) { + const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg; + + if (cfg->g_pass == AOM_RC_FIRST_PASS) return; + +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + if (write_webm_file_footer(&stream->webm_ctx) != 0) { + fatal("WebM writer finalization failed."); + } + } +#endif + + if (!stream->config.write_webm && stream->config.write_ivf) { + if (!fseek(stream->file, 0, SEEK_SET)) + ivf_write_file_header(stream->file, &stream->config.cfg, fourcc, + stream->frames_out); + } + + fclose(stream->file); +} + +static void setup_pass(struct stream_state *stream, + struct AvxEncoderConfig *global, int pass) { + if (stream->config.stats_fn) { + if (!stats_open_file(&stream->stats, stream->config.stats_fn, pass)) + fatal("Failed to open statistics store"); + } else { + if (!stats_open_mem(&stream->stats, pass)) + fatal("Failed to open statistics store"); + } + + if (global->passes == 1) { + stream->config.cfg.g_pass = AOM_RC_ONE_PASS; + } else { + switch (pass) { + case 0: stream->config.cfg.g_pass = AOM_RC_FIRST_PASS; break; + case 1: stream->config.cfg.g_pass = AOM_RC_SECOND_PASS; break; + case 2: stream->config.cfg.g_pass = AOM_RC_THIRD_PASS; break; + default: fatal("Failed to set pass"); + } + } + + if (pass) { + stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); + } + + stream->cx_time = 0; + stream->nbytes = 0; + stream->frames_out = 0; +} + +static void initialize_encoder(struct stream_state *stream, + struct AvxEncoderConfig *global) { + int i; + int flags = 0; + + flags |= (global->show_psnr >= 1) ? AOM_CODEC_USE_PSNR : 0; + flags |= stream->config.use_16bit_internal ? 
AOM_CODEC_USE_HIGHBITDEPTH : 0; + + /* Construct Encoder Context */ + aom_codec_enc_init(&stream->encoder, global->codec, &stream->config.cfg, + flags); + ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder"); + + for (i = 0; i < stream->config.arg_ctrl_cnt; i++) { + int ctrl = stream->config.arg_ctrls[i][0]; + int value = stream->config.arg_ctrls[i][1]; + if (aom_codec_control(&stream->encoder, ctrl, value)) + fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value); + + ctx_exit_on_error(&stream->encoder, "Failed to control codec"); + } + + for (i = 0; i < stream->config.arg_key_val_cnt; i++) { + const char *name = stream->config.arg_key_vals[i][0]; + const char *val = stream->config.arg_key_vals[i][1]; + if (aom_codec_set_option(&stream->encoder, name, val)) + fprintf(stderr, "Error: Tried to set option %s = %s\n", name, val); + + ctx_exit_on_error(&stream->encoder, "Failed to set codec option"); + } + +#if CONFIG_TUNE_VMAF + if (stream->config.vmaf_model_path) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_VMAF_MODEL_PATH, + stream->config.vmaf_model_path); + ctx_exit_on_error(&stream->encoder, "Failed to set vmaf model path"); + } +#endif + if (stream->config.partition_info_path) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, + AV1E_SET_PARTITION_INFO_PATH, + stream->config.partition_info_path); + ctx_exit_on_error(&stream->encoder, "Failed to set partition info path"); + } + if (stream->config.enable_rate_guide_deltaq) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, + AV1E_ENABLE_RATE_GUIDE_DELTAQ, + stream->config.enable_rate_guide_deltaq); + ctx_exit_on_error(&stream->encoder, "Failed to enable rate guide deltaq"); + } + if (stream->config.rate_distribution_info) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, + AV1E_SET_RATE_DISTRIBUTION_INFO, + stream->config.rate_distribution_info); + ctx_exit_on_error(&stream->encoder, "Failed to set rate distribution info"); + } + + if (stream->config.film_grain_filename) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE, + stream->config.film_grain_filename); + ctx_exit_on_error(&stream->encoder, "Failed to set film grain table"); + } + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_COLOR_RANGE, + stream->config.color_range); + ctx_exit_on_error(&stream->encoder, "Failed to set color range"); + +#if CONFIG_AV1_DECODER + if (global->test_decode != TEST_DECODE_OFF) { + aom_codec_iface_t *decoder = get_aom_decoder_by_short_name( + get_short_name_by_aom_encoder(global->codec)); + aom_codec_dec_cfg_t cfg = { 0, 0, 0, !stream->config.use_16bit_internal }; + aom_codec_dec_init(&stream->decoder, decoder, &cfg, 0); + + if (strcmp(get_short_name_by_aom_encoder(global->codec), "av1") == 0) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_TILE_MODE, + stream->config.cfg.large_scale_tile); + ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode"); + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1D_SET_IS_ANNEXB, + stream->config.cfg.save_as_annexb); + ctx_exit_on_error(&stream->decoder, "Failed to set is_annexb"); + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_ROW, + -1); + ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row"); + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_COL, + -1); + ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col"); + } + } +#endif +} + +// Convert the input image 'img' to a monochrome image. 
The Y plane of the +// output image is a shallow copy of the Y plane of the input image, therefore +// the input image must remain valid for the lifetime of the output image. The U +// and V planes of the output image are set to null pointers. The output image +// format is AOM_IMG_FMT_I420 because libaom does not have AOM_IMG_FMT_I400. +static void convert_image_to_monochrome(const struct aom_image *img, + struct aom_image *monochrome_img) { + *monochrome_img = *img; + monochrome_img->fmt = AOM_IMG_FMT_I420; + if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + monochrome_img->fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + } + monochrome_img->monochrome = 1; + monochrome_img->csp = AOM_CSP_UNKNOWN; + monochrome_img->x_chroma_shift = 1; + monochrome_img->y_chroma_shift = 1; + monochrome_img->planes[AOM_PLANE_U] = NULL; + monochrome_img->planes[AOM_PLANE_V] = NULL; + monochrome_img->stride[AOM_PLANE_U] = 0; + monochrome_img->stride[AOM_PLANE_V] = 0; + monochrome_img->sz = 0; + monochrome_img->bps = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; + monochrome_img->img_data = NULL; + monochrome_img->img_data_owner = 0; + monochrome_img->self_allocd = 0; +} + +static void encode_frame(struct stream_state *stream, + struct AvxEncoderConfig *global, struct aom_image *img, + unsigned int frames_in) { + aom_codec_pts_t frame_start, next_frame_start; + struct aom_codec_enc_cfg *cfg = &stream->config.cfg; + struct aom_usec_timer timer; + + frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in - 1) * global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + next_frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + + /* Scale if necessary */ + if (img) { + if ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) && + (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { + if (img->fmt != AOM_IMG_FMT_I42016) { + fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name); + exit(EXIT_FAILURE); + } +#if CONFIG_LIBYUV + if (!stream->img) { + stream->img = + aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); + } + I420Scale_16( + (uint16_t *)img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y] / 2, + (uint16_t *)img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U] / 2, + (uint16_t *)img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V] / 2, + img->d_w, img->d_h, (uint16_t *)stream->img->planes[AOM_PLANE_Y], + stream->img->stride[AOM_PLANE_Y] / 2, + (uint16_t *)stream->img->planes[AOM_PLANE_U], + stream->img->stride[AOM_PLANE_U] / 2, + (uint16_t *)stream->img->planes[AOM_PLANE_V], + stream->img->stride[AOM_PLANE_V] / 2, stream->img->d_w, + stream->img->d_h, kFilterBox); + img = stream->img; +#else + stream->encoder.err = 1; + ctx_exit_on_error(&stream->encoder, + "Stream %d: Failed to encode frame.\n" + "libyuv is required for scaling but is currently " + "disabled.\n" + "Be sure to specify -DCONFIG_LIBYUV=1 when running " + "cmake.\n", + stream->index); +#endif + } + } + if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { + if (img->fmt != AOM_IMG_FMT_I420 && img->fmt != AOM_IMG_FMT_YV12) { + fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name); + exit(EXIT_FAILURE); + } +#if CONFIG_LIBYUV + if (!stream->img) + stream->img = + aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16); + I420Scale( + img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y], + img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U], + img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V], img->d_w, img->d_h, + 
stream->img->planes[AOM_PLANE_Y], stream->img->stride[AOM_PLANE_Y], + stream->img->planes[AOM_PLANE_U], stream->img->stride[AOM_PLANE_U], + stream->img->planes[AOM_PLANE_V], stream->img->stride[AOM_PLANE_V], + stream->img->d_w, stream->img->d_h, kFilterBox); + img = stream->img; +#else + stream->encoder.err = 1; + ctx_exit_on_error(&stream->encoder, + "Stream %d: Failed to encode frame.\n" + "Scaling disabled in this configuration. \n" + "To enable, configure with --enable-libyuv\n", + stream->index); +#endif + } + + struct aom_image monochrome_img; + if (img && cfg->monochrome) { + convert_image_to_monochrome(img, &monochrome_img); + img = &monochrome_img; + } + + aom_usec_timer_start(&timer); + aom_codec_encode(&stream->encoder, img, frame_start, + (uint32_t)(next_frame_start - frame_start), 0); + aom_usec_timer_mark(&timer); + stream->cx_time += aom_usec_timer_elapsed(&timer); + ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame", + stream->index); +} + +static void update_quantizer_histogram(struct stream_state *stream) { + if (stream->config.cfg.g_pass != AOM_RC_FIRST_PASS) { + int q; + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AOME_GET_LAST_QUANTIZER_64, + &q); + ctx_exit_on_error(&stream->encoder, "Failed to read quantizer"); + stream->counts[q]++; + } +} + +static void get_cx_data(struct stream_state *stream, + struct AvxEncoderConfig *global, int *got_data) { + const aom_codec_cx_pkt_t *pkt; + const struct aom_codec_enc_cfg *cfg = &stream->config.cfg; + aom_codec_iter_t iter = NULL; + + *got_data = 0; + while ((pkt = aom_codec_get_cx_data(&stream->encoder, &iter))) { + static size_t fsize = 0; + static FileOffset ivf_header_pos = 0; + + switch (pkt->kind) { + case AOM_CODEC_CX_FRAME_PKT: + ++stream->frames_out; + if (!global->quiet) + fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz); + + update_rate_histogram(stream->rate_hist, cfg, pkt); +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + if (write_webm_block(&stream->webm_ctx, cfg, pkt) != 0) { + fatal("WebM writer failed."); + } + } +#endif + if (!stream->config.write_webm) { + if (stream->config.write_ivf) { + if (pkt->data.frame.partition_id <= 0) { + ivf_header_pos = ftello(stream->file); + fsize = pkt->data.frame.sz; + + ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize); + } else { + fsize += pkt->data.frame.sz; + + const FileOffset currpos = ftello(stream->file); + fseeko(stream->file, ivf_header_pos, SEEK_SET); + ivf_write_frame_size(stream->file, fsize); + fseeko(stream->file, currpos, SEEK_SET); + } + } + + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, + stream->file); + } + stream->nbytes += pkt->data.raw.sz; + + *got_data = 1; +#if CONFIG_AV1_DECODER + if (global->test_decode != TEST_DECODE_OFF && !stream->mismatch_seen) { + aom_codec_decode(&stream->decoder, pkt->data.frame.buf, + pkt->data.frame.sz, NULL); + if (stream->decoder.err) { + warn_or_exit_on_error(&stream->decoder, + global->test_decode == TEST_DECODE_FATAL, + "Failed to decode frame %d in stream %d", + stream->frames_out + 1, stream->index); + stream->mismatch_seen = stream->frames_out + 1; + } + } +#endif + break; + case AOM_CODEC_STATS_PKT: + stream->frames_out++; + stats_write(&stream->stats, pkt->data.twopass_stats.buf, + pkt->data.twopass_stats.sz); + stream->nbytes += pkt->data.raw.sz; + break; + case AOM_CODEC_PSNR_PKT: + + if (global->show_psnr >= 1) { + int i; + + stream->psnr_sse_total[0] += pkt->data.psnr.sse[0]; + stream->psnr_samples_total[0] += pkt->data.psnr.samples[0]; 
+ for (i = 0; i < 4; i++) { + if (!global->quiet) + fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]); + stream->psnr_totals[0][i] += pkt->data.psnr.psnr[i]; + } + stream->psnr_count[0]++; + +#if CONFIG_AV1_HIGHBITDEPTH + if (stream->config.cfg.g_input_bit_depth < + (unsigned int)stream->config.cfg.g_bit_depth) { + stream->psnr_sse_total[1] += pkt->data.psnr.sse_hbd[0]; + stream->psnr_samples_total[1] += pkt->data.psnr.samples_hbd[0]; + for (i = 0; i < 4; i++) { + if (!global->quiet) + fprintf(stderr, "%.3f ", pkt->data.psnr.psnr_hbd[i]); + stream->psnr_totals[1][i] += pkt->data.psnr.psnr_hbd[i]; + } + stream->psnr_count[1]++; + } +#endif + } + + break; + default: break; + } + } +} + +static void show_psnr(struct stream_state *stream, double peak, int64_t bps) { + int i; + double ovpsnr; + + if (!stream->psnr_count[0]) return; + + fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index); + ovpsnr = sse_to_psnr((double)stream->psnr_samples_total[0], peak, + (double)stream->psnr_sse_total[0]); + fprintf(stderr, " %.3f", ovpsnr); + + for (i = 0; i < 4; i++) { + fprintf(stderr, " %.3f", stream->psnr_totals[0][i] / stream->psnr_count[0]); + } + if (bps > 0) { + fprintf(stderr, " %7" PRId64 " bps", bps); + } + fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000); + fprintf(stderr, "\n"); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void show_psnr_hbd(struct stream_state *stream, double peak, + int64_t bps) { + int i; + double ovpsnr; + // Compute PSNR based on stream bit depth + if (!stream->psnr_count[1]) return; + + fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index); + ovpsnr = sse_to_psnr((double)stream->psnr_samples_total[1], peak, + (double)stream->psnr_sse_total[1]); + fprintf(stderr, " %.3f", ovpsnr); + + for (i = 0; i < 4; i++) { + fprintf(stderr, " %.3f", stream->psnr_totals[1][i] / stream->psnr_count[1]); + } + if (bps > 0) { + fprintf(stderr, " %7" PRId64 " bps", bps); + } + fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000); + fprintf(stderr, "\n"); +} +#endif + +static float usec_to_fps(uint64_t usec, unsigned int frames) { + return (float)(usec > 0 ? 
frames * 1000000.0 / (float)usec : 0); +} + +static void test_decode(struct stream_state *stream, + enum TestDecodeFatality fatal) { + aom_image_t enc_img, dec_img; + + if (stream->mismatch_seen) return; + + /* Get the internal reference frame */ + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE, + &enc_img); + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE, + &dec_img); + + if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t enc_hbd_img; + aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); + enc_img = enc_hbd_img; + } + if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t dec_hbd_img; + aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); + dec_img = dec_hbd_img; + } + } + + ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame"); + ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame"); + + if (!aom_compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + aom_find_mismatch(&enc_img, &dec_img, y, u, v); + } + stream->decoder.err = 1; + warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL, + "Stream %d: Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + stream->index, stream->frames_out, y[0], y[1], y[2], + y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]); + stream->mismatch_seen = stream->frames_out; + } + + aom_img_free(&enc_img); + aom_img_free(&dec_img); +} + +static void print_time(const char *label, int64_t etl) { + int64_t hours; + int64_t mins; + int64_t secs; + + if (etl >= 0) { + hours = etl / 3600; + etl -= hours * 3600; + mins = etl / 60; + etl -= mins * 60; + secs = etl; + + fprintf(stderr, "[%3s %2" PRId64 ":%02" PRId64 ":%02" PRId64 "] ", label, + hours, mins, secs); + } else { + fprintf(stderr, "[%3s unknown] ", label); + } +} + +static void clear_stream_count_state(struct stream_state *stream) { + // PSNR counters + for (int k = 0; k < 2; k++) { + stream->psnr_sse_total[k] = 0; + stream->psnr_samples_total[k] = 0; + for (int i = 0; i < 4; i++) { + stream->psnr_totals[k][i] = 0; + } + stream->psnr_count[k] = 0; + } + // q hist + memset(stream->counts, 0, sizeof(stream->counts)); +} + +// aomenc will downscale the second pass if: +// 1. the specific pass is not given by commandline (aomenc will perform all +// passes) +// 2. there are more than 2 passes in total +// 3. 
current pass is the second pass (the parameter pass starts with 0 so +// pass == 1) +static int pass_need_downscale(int global_pass, int global_passes, int pass) { + return !global_pass && global_passes > 2 && pass == 1; +} + +int main(int argc, const char **argv_) { + int pass; + aom_image_t raw; + aom_image_t raw_shift; + int allocated_raw_shift = 0; + int do_16bit_internal = 0; + int input_shift = 0; + int frame_avail, got_data; + + struct AvxInputContext input; + struct AvxEncoderConfig global; + struct stream_state *streams = NULL; + char **argv, **argi; + uint64_t cx_time = 0; + int stream_cnt = 0; + int res = 0; + int profile_updated = 0; + + memset(&input, 0, sizeof(input)); + memset(&raw, 0, sizeof(raw)); + exec_name = argv_[0]; + + /* Setup default input stream settings */ + input.framerate.numerator = 30; + input.framerate.denominator = 1; + input.only_i420 = 1; + input.bit_depth = 0; + + /* First parse the global configuration values, because we want to apply + * other parameters on top of the default configuration provided by the + * codec. + */ + argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } + parse_global_config(&global, &argv); + + if (argc < 2) usage_exit(); + + switch (global.color_type) { + case I420: input.fmt = AOM_IMG_FMT_I420; break; + case I422: input.fmt = AOM_IMG_FMT_I422; break; + case I444: input.fmt = AOM_IMG_FMT_I444; break; + case YV12: input.fmt = AOM_IMG_FMT_YV12; break; + case NV12: input.fmt = AOM_IMG_FMT_NV12; break; + } + + { + /* Now parse each stream's parameters. Using a local scope here + * due to the use of 'stream' as loop variable in FOREACH_STREAM + * loops + */ + struct stream_state *stream = NULL; + + do { + stream = new_stream(&global, stream); + stream_cnt++; + if (!streams) streams = stream; + } while (parse_stream_params(&global, stream, argv)); + } + + /* Check for unrecognized options */ + for (argi = argv; *argi; argi++) + if (argi[0][0] == '-' && argi[0][1]) + die("Error: Unrecognized option %s\n", *argi); + + FOREACH_STREAM(stream, streams) { + check_encoder_config(global.disable_warning_prompt, &global, + &stream->config.cfg); + + // If large_scale_tile = 1, only support to output to ivf format. + if (stream->config.cfg.large_scale_tile && !stream->config.write_ivf) + die("only support ivf output format while large-scale-tile=1\n"); + } + + /* Handle non-option arguments */ + input.filename = argv[0]; + const char *orig_input_filename = input.filename; + FOREACH_STREAM(stream, streams) { + stream->orig_out_fn = stream->config.out_fn; + stream->orig_width = stream->config.cfg.g_w; + stream->orig_height = stream->config.cfg.g_h; + stream->orig_write_ivf = stream->config.write_ivf; + stream->orig_write_webm = stream->config.write_webm; + } + + if (!input.filename) { + fprintf(stderr, "No input file specified!\n"); + usage_exit(); + } + + /* Decide if other chroma subsamplings than 4:2:0 are supported */ + if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC) + input.only_i420 = 0; + + for (pass = global.pass ? 
global.pass - 1 : 0; pass < global.passes; pass++) { + if (pass > 1) { + FOREACH_STREAM(stream, streams) { clear_stream_count_state(stream); } + } + + int frames_in = 0, seen_frames = 0; + int64_t estimated_time_left = -1; + int64_t average_rate = -1; + int64_t lagged_count = 0; + const int need_downscale = + pass_need_downscale(global.pass, global.passes, pass); + + // Set the output to the specified two-pass output file, and + // restore the width and height to the original values. + FOREACH_STREAM(stream, streams) { + if (need_downscale) { + stream->config.out_fn = stream->config.two_pass_output; + // Libaom currently only supports the ivf format for the third pass. + stream->config.write_ivf = 1; + stream->config.write_webm = 0; + } else { + stream->config.out_fn = stream->orig_out_fn; + stream->config.write_ivf = stream->orig_write_ivf; + stream->config.write_webm = stream->orig_write_webm; + } + stream->config.cfg.g_w = stream->orig_width; + stream->config.cfg.g_h = stream->orig_height; + } + + // For second pass in three-pass encoding, set the input to + // the given two-pass-input file if available. If the scaled input is not + // given, we will attempt to re-scale the original input. + input.filename = orig_input_filename; + const char *two_pass_input = NULL; + if (need_downscale) { + FOREACH_STREAM(stream, streams) { + if (stream->config.two_pass_input) { + two_pass_input = stream->config.two_pass_input; + input.filename = two_pass_input; + break; + } + } + } + + open_input_file(&input, global.csp); + + /* If the input file doesn't specify its w/h (raw files), try to get + * the data from the first stream's configuration. + */ + if (!input.width || !input.height) { + if (two_pass_input) { + FOREACH_STREAM(stream, streams) { + if (stream->config.two_pass_width && stream->config.two_pass_height) { + input.width = stream->config.two_pass_width; + input.height = stream->config.two_pass_height; + break; + } + } + } else { + FOREACH_STREAM(stream, streams) { + if (stream->config.cfg.g_w && stream->config.cfg.g_h) { + input.width = stream->config.cfg.g_w; + input.height = stream->config.cfg.g_h; + break; + } + } + } + } + + /* Update stream configurations from the input file's parameters */ + if (!input.width || !input.height) { + if (two_pass_input) { + fatal( + "Specify downscaled stream dimensions with --two-pass-width " + " and --two-pass-height"); + } else { + fatal( + "Specify stream dimensions with --width (-w) " + " and --height (-h)"); + } + } + + if (need_downscale) { + FOREACH_STREAM(stream, streams) { + if (stream->config.two_pass_width && stream->config.two_pass_height) { + stream->config.cfg.g_w = stream->config.two_pass_width; + stream->config.cfg.g_h = stream->config.two_pass_height; + } else if (two_pass_input) { + stream->config.cfg.g_w = input.width; + stream->config.cfg.g_h = input.height; + } else if (stream->orig_width && stream->orig_height) { +#if CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL + stream->config.cfg.g_w = stream->orig_width; + stream->config.cfg.g_h = stream->orig_height; +#else // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL + stream->config.cfg.g_w = (stream->orig_width + 1) / 2; + stream->config.cfg.g_h = (stream->orig_height + 1) / 2; +#endif // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL + } else { +#if CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL + stream->config.cfg.g_w = input.width; + stream->config.cfg.g_h = input.height; +#else // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL + 
stream->config.cfg.g_w = (input.width + 1) / 2; + stream->config.cfg.g_h = (input.height + 1) / 2; +#endif // CONFIG_BITRATE_ACCURACY || CONFIG_BITRATE_ACCURACY_BL + } + } + } + + /* If input file does not specify bit-depth but input-bit-depth parameter + * exists, assume that to be the input bit-depth. However, if the + * input-bit-depth paramter does not exist, assume the input bit-depth + * to be the same as the codec bit-depth. + */ + if (!input.bit_depth) { + FOREACH_STREAM(stream, streams) { + if (stream->config.cfg.g_input_bit_depth) + input.bit_depth = stream->config.cfg.g_input_bit_depth; + else + input.bit_depth = stream->config.cfg.g_input_bit_depth = + (int)stream->config.cfg.g_bit_depth; + } + if (input.bit_depth > 8) input.fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + } else { + FOREACH_STREAM(stream, streams) { + stream->config.cfg.g_input_bit_depth = input.bit_depth; + } + } + + FOREACH_STREAM(stream, streams) { + if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016 && + input.fmt != AOM_IMG_FMT_NV12) { + /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile + was selected. */ + switch (stream->config.cfg.g_profile) { + case 0: + if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 || + input.fmt == AOM_IMG_FMT_I44416)) { + if (!stream->config.cfg.monochrome) { + stream->config.cfg.g_profile = 1; + profile_updated = 1; + } + } else if (input.bit_depth == 12 || + ((input.fmt == AOM_IMG_FMT_I422 || + input.fmt == AOM_IMG_FMT_I42216) && + !stream->config.cfg.monochrome)) { + stream->config.cfg.g_profile = 2; + profile_updated = 1; + } + break; + case 1: + if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 || + input.fmt == AOM_IMG_FMT_I42216) { + stream->config.cfg.g_profile = 2; + profile_updated = 1; + } else if (input.bit_depth < 12 && + (input.fmt == AOM_IMG_FMT_I420 || + input.fmt == AOM_IMG_FMT_I42016)) { + stream->config.cfg.g_profile = 0; + profile_updated = 1; + } + break; + case 2: + if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 || + input.fmt == AOM_IMG_FMT_I44416)) { + stream->config.cfg.g_profile = 1; + profile_updated = 1; + } else if (input.bit_depth < 12 && + (input.fmt == AOM_IMG_FMT_I420 || + input.fmt == AOM_IMG_FMT_I42016)) { + stream->config.cfg.g_profile = 0; + profile_updated = 1; + } else if (input.bit_depth == 12 && + input.file_type == FILE_TYPE_Y4M) { + // Note that here the input file values for chroma subsampling + // are used instead of those from the command line. + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, + AV1E_SET_CHROMA_SUBSAMPLING_X, + input.y4m.dst_c_dec_h >> 1); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling x"); + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, + AV1E_SET_CHROMA_SUBSAMPLING_Y, + input.y4m.dst_c_dec_v >> 1); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling y"); + } else if (input.bit_depth == 12 && + input.file_type == FILE_TYPE_RAW) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, + AV1E_SET_CHROMA_SUBSAMPLING_X, + stream->chroma_subsampling_x); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling x"); + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, + AV1E_SET_CHROMA_SUBSAMPLING_Y, + stream->chroma_subsampling_y); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling y"); + } + break; + default: break; + } + } + /* Automatically set the codec bit depth to match the input bit depth. + * Upgrade the profile if required. 
*/ + if (stream->config.cfg.g_input_bit_depth > + (unsigned int)stream->config.cfg.g_bit_depth) { + stream->config.cfg.g_bit_depth = stream->config.cfg.g_input_bit_depth; + if (!global.quiet) { + fprintf(stderr, + "Warning: automatically updating bit depth to %d to " + "match input format.\n", + stream->config.cfg.g_input_bit_depth); + } + } +#if !CONFIG_AV1_HIGHBITDEPTH + if (stream->config.cfg.g_bit_depth > 8) { + fatal("Unsupported bit-depth with CONFIG_AV1_HIGHBITDEPTH=0\n"); + } +#endif // CONFIG_AV1_HIGHBITDEPTH + if (stream->config.cfg.g_bit_depth > 10) { + switch (stream->config.cfg.g_profile) { + case 0: + case 1: + stream->config.cfg.g_profile = 2; + profile_updated = 1; + break; + default: break; + } + } + if (stream->config.cfg.g_bit_depth > 8) { + stream->config.use_16bit_internal = 1; + } + if (profile_updated && !global.quiet) { + fprintf(stderr, + "Warning: automatically updating to profile %d to " + "match input format.\n", + stream->config.cfg.g_profile); + } + if ((global.show_psnr == 2) && (stream->config.cfg.g_input_bit_depth == + stream->config.cfg.g_bit_depth)) { + fprintf(stderr, + "Warning: --psnr==2 and --psnr==1 will provide same " + "results when input bit-depth == stream bit-depth, " + "falling back to default psnr value\n"); + global.show_psnr = 1; + } + if (global.show_psnr < 0 || global.show_psnr > 2) { + fprintf(stderr, + "Warning: --psnr can take only 0,1,2 as values," + "falling back to default psnr value\n"); + global.show_psnr = 1; + } + /* Set limit */ + stream->config.cfg.g_limit = global.limit; + } + + FOREACH_STREAM(stream, streams) { + set_stream_dimensions(stream, input.width, input.height); + stream->config.color_range = input.color_range; + } + FOREACH_STREAM(stream, streams) { validate_stream_config(stream, &global); } + + /* Ensure that --passes and --pass are consistent. If --pass is set and + * --passes >= 2, ensure --fpf was set. + */ + if (global.pass > 0 && global.pass <= 3 && global.passes >= 2) { + FOREACH_STREAM(stream, streams) { + if (!stream->config.stats_fn) + die("Stream %d: Must specify --fpf when --pass=%d" + " and --passes=%d\n", + stream->index, global.pass, global.passes); + } + } + +#if !CONFIG_WEBM_IO + FOREACH_STREAM(stream, streams) { + if (stream->config.write_webm) { + stream->config.write_webm = 0; + stream->config.write_ivf = 0; + aom_tools_warn("aomenc compiled w/o WebM support. Writing OBU stream."); + } + } +#endif + + /* Use the frame rate from the file only if none was specified + * on the command-line. + */ + if (!global.have_framerate) { + global.framerate.num = input.framerate.numerator; + global.framerate.den = input.framerate.denominator; + } + FOREACH_STREAM(stream, streams) { + stream->config.cfg.g_timebase.den = global.framerate.num; + stream->config.cfg.g_timebase.num = global.framerate.den; + } + /* Show configuration */ + if (global.verbose && pass == 0) { + FOREACH_STREAM(stream, streams) { + show_stream_config(stream, &global, &input); + } + } + + if (pass == (global.pass ? global.pass - 1 : 0)) { + // The Y4M reader does its own allocation. 
+ if (input.file_type != FILE_TYPE_Y4M) { + aom_img_alloc(&raw, input.fmt, input.width, input.height, 32); + } + FOREACH_STREAM(stream, streams) { + stream->rate_hist = + init_rate_histogram(&stream->config.cfg, &global.framerate); + } + } + + FOREACH_STREAM(stream, streams) { setup_pass(stream, &global, pass); } + FOREACH_STREAM(stream, streams) { initialize_encoder(stream, &global); } + FOREACH_STREAM(stream, streams) { + char *encoder_settings = NULL; +#if CONFIG_WEBM_IO + // Test frameworks may compare outputs from different versions, but only + // wish to check for bitstream changes. The encoder-settings tag, however, + // can vary if the version is updated, even if no encoder algorithm + // changes were made. To work around this issue, do not output + // the encoder-settings tag when --debug is enabled (which is the flag + // that test frameworks should use, when they want deterministic output + // from the container format). + if (stream->config.write_webm && !stream->webm_ctx.debug) { + encoder_settings = extract_encoder_settings( + aom_codec_version_str(), argv_, argc, input.filename); + if (encoder_settings == NULL) { + fprintf( + stderr, + "Warning: unable to extract encoder settings. Continuing...\n"); + } + } +#endif + open_output_file(stream, &global, &input.pixel_aspect_ratio, + encoder_settings); + free(encoder_settings); + } + + if (strcmp(get_short_name_by_aom_encoder(global.codec), "av1") == 0) { + // Check to see if at least one stream uses 16 bit internal. + // Currently assume that the bit_depths for all streams using + // highbitdepth are the same. + FOREACH_STREAM(stream, streams) { + if (stream->config.use_16bit_internal) { + do_16bit_internal = 1; + } + input_shift = (int)stream->config.cfg.g_bit_depth - + stream->config.cfg.g_input_bit_depth; + } + } + + frame_avail = 1; + got_data = 0; + + while (frame_avail || got_data) { + struct aom_usec_timer timer; + + if (!global.limit || frames_in < global.limit) { + frame_avail = read_frame(&input, &raw); + + if (frame_avail) frames_in++; + seen_frames = + frames_in > global.skip_frames ? frames_in - global.skip_frames : 0; + + if (!global.quiet) { + float fps = usec_to_fps(cx_time, seen_frames); + fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes); + + if (stream_cnt == 1) + fprintf(stderr, "frame %4d/%-4d %7" PRId64 "B ", frames_in, + streams->frames_out, (int64_t)streams->nbytes); + else + fprintf(stderr, "frame %4d ", frames_in); + + fprintf(stderr, "%7" PRId64 " %s %.2f %s ", + cx_time > 9999999 ? cx_time / 1000 : cx_time, + cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60, + fps >= 1.0 ? "fps" : "fpm"); + print_time("ETA", estimated_time_left); + // mingw-w64 gcc does not match msvc for stderr buffering behavior + // and uses line buffering, thus the progress output is not + // real-time. The fflush() is here to make sure the progress output + // is sent out while the clip is being processed. 
+ fflush(stderr); + } + + } else { + frame_avail = 0; + } + + if (frames_in > global.skip_frames) { + aom_image_t *frame_to_encode; + if (input_shift || (do_16bit_internal && input.bit_depth == 8)) { + assert(do_16bit_internal); + // Input bit depth and stream bit depth do not match, so up + // shift frame to stream bit depth + if (!allocated_raw_shift) { + aom_img_alloc(&raw_shift, raw.fmt | AOM_IMG_FMT_HIGHBITDEPTH, + input.width, input.height, 32); + allocated_raw_shift = 1; + } + aom_img_upshift(&raw_shift, &raw, input_shift); + frame_to_encode = &raw_shift; + } else { + frame_to_encode = &raw; + } + aom_usec_timer_start(&timer); + if (do_16bit_internal) { + assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH); + FOREACH_STREAM(stream, streams) { + if (stream->config.use_16bit_internal) + encode_frame(stream, &global, + frame_avail ? frame_to_encode : NULL, frames_in); + else + assert(0); + } + } else { + assert((frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH) == 0); + FOREACH_STREAM(stream, streams) { + encode_frame(stream, &global, frame_avail ? frame_to_encode : NULL, + frames_in); + } + } + aom_usec_timer_mark(&timer); + cx_time += aom_usec_timer_elapsed(&timer); + + FOREACH_STREAM(stream, streams) { update_quantizer_histogram(stream); } + + got_data = 0; + FOREACH_STREAM(stream, streams) { + get_cx_data(stream, &global, &got_data); + } + + if (!got_data && input.length && streams != NULL && + !streams->frames_out) { + lagged_count = global.limit ? seen_frames : ftello(input.file); + } else if (input.length) { + int64_t remaining; + int64_t rate; + + if (global.limit) { + const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000; + + rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0; + remaining = 1000 * (global.limit - global.skip_frames - + seen_frames + lagged_count); + } else { + const int64_t input_pos = ftello(input.file); + const int64_t input_pos_lagged = input_pos - lagged_count; + const int64_t input_limit = input.length; + + rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; + remaining = input_limit - input_pos + lagged_count; + } + + average_rate = + (average_rate <= 0) ? rate : (average_rate * 7 + rate) / 8; + estimated_time_left = average_rate ? remaining / average_rate : -1; + } + + if (got_data && global.test_decode != TEST_DECODE_OFF) { + FOREACH_STREAM(stream, streams) { + test_decode(stream, global.test_decode); + } + } + } + + fflush(stdout); + if (!global.quiet) fprintf(stderr, "\033[K"); + } + + if (stream_cnt > 1) fprintf(stderr, "\n"); + + if (!global.quiet) { + FOREACH_STREAM(stream, streams) { + const int64_t bpf = + seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0; + const int64_t bps = bpf * global.framerate.num / global.framerate.den; + fprintf(stderr, + "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 + "b/f %7" PRId64 + "b/s" + " %7" PRId64 " %s (%.2f fps)\033[K\n", + pass + 1, global.passes, frames_in, stream->frames_out, + (int64_t)stream->nbytes, bpf, bps, + stream->cx_time > 9999999 ? stream->cx_time / 1000 + : stream->cx_time, + stream->cx_time > 9999999 ? "ms" : "us", + usec_to_fps(stream->cx_time, seen_frames)); + // This instance of cr does not need fflush as it is followed by a + // newline in the same string. 
+ } + } + + if (global.show_psnr >= 1) { + if (get_fourcc_by_aom_encoder(global.codec) == AV1_FOURCC) { + FOREACH_STREAM(stream, streams) { + int64_t bps = 0; + if (global.show_psnr == 1) { + if (stream->psnr_count[0] && seen_frames && global.framerate.den) { + bps = (int64_t)stream->nbytes * 8 * + (int64_t)global.framerate.num / global.framerate.den / + seen_frames; + } + show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1, + bps); + } + if (global.show_psnr == 2) { +#if CONFIG_AV1_HIGHBITDEPTH + if (stream->config.cfg.g_input_bit_depth < + (unsigned int)stream->config.cfg.g_bit_depth) + show_psnr_hbd(stream, (1 << stream->config.cfg.g_bit_depth) - 1, + bps); +#endif + } + } + } else { + FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0, 0); } + } + } + + if (pass == global.passes - 1) { + FOREACH_STREAM(stream, streams) { + int num_operating_points; + int levels[32]; + int target_levels[32]; + aom_codec_control(&stream->encoder, AV1E_GET_NUM_OPERATING_POINTS, + &num_operating_points); + aom_codec_control(&stream->encoder, AV1E_GET_SEQ_LEVEL_IDX, levels); + aom_codec_control(&stream->encoder, AV1E_GET_TARGET_SEQ_LEVEL_IDX, + target_levels); + + for (int i = 0; i < num_operating_points; i++) { + if (levels[i] > target_levels[i]) { + if (levels[i] == 31) { + aom_tools_warn( + "Failed to encode to target level %d.%d for operating point " + "%d. The output level is SEQ_LEVEL_MAX", + 2 + (target_levels[i] >> 2), target_levels[i] & 3, i); + } else { + aom_tools_warn( + "Failed to encode to target level %d.%d for operating point " + "%d. The output level is %d.%d", + 2 + (target_levels[i] >> 2), target_levels[i] & 3, i, + 2 + (levels[i] >> 2), levels[i] & 3); + } + } + } + } + } + + FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->encoder); } + + if (global.test_decode != TEST_DECODE_OFF) { + FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->decoder); } + } + + close_input_file(&input); + + if (global.test_decode == TEST_DECODE_FATAL) { + FOREACH_STREAM(stream, streams) { res |= stream->mismatch_seen; } + } + FOREACH_STREAM(stream, streams) { + close_output_file(stream, get_fourcc_by_aom_encoder(global.codec)); + } + + FOREACH_STREAM(stream, streams) { + stats_close(&stream->stats, global.passes - 1); + } + + if (global.pass) break; + } + + if (global.show_q_hist_buckets) { + FOREACH_STREAM(stream, streams) { + show_q_histogram(stream->counts, global.show_q_hist_buckets); + } + } + + if (global.show_rate_hist_buckets) { + FOREACH_STREAM(stream, streams) { + show_rate_histogram(stream->rate_hist, &stream->config.cfg, + global.show_rate_hist_buckets); + } + } + FOREACH_STREAM(stream, streams) { destroy_rate_histogram(stream->rate_hist); } + +#if CONFIG_INTERNAL_STATS + /* TODO(jkoleszar): This doesn't belong in this executable. Do it for now, + * to match some existing utilities. + */ + if (!(global.pass == 1 && global.passes == 2)) { + FOREACH_STREAM(stream, streams) { + FILE *f = fopen("opsnr.stt", "a"); + if (stream->mismatch_seen) { + fprintf(f, "First mismatch occurred in frame %d\n", + stream->mismatch_seen); + } else { + fprintf(f, "No mismatch detected in recon buffers\n"); + } + fclose(f); + } + } +#endif + + if (allocated_raw_shift) aom_img_free(&raw_shift); + aom_img_free(&raw); + free(argv); + free(streams); + return res ? 
EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/third_party/aom/apps/aomenc.h b/third_party/aom/apps/aomenc.h new file mode 100644 index 0000000000..935d5fcd16 --- /dev/null +++ b/third_party/aom/apps/aomenc.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_APPS_AOMENC_H_ +#define AOM_APPS_AOMENC_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_encoder.h" +#include "av1/arg_defs.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + I420, // 4:2:0 8+ bit-depth + I422, // 4:2:2 8+ bit-depth + I444, // 4:4:4 8+ bit-depth + YV12, // 4:2:0 with uv flipped, only 8-bit depth + NV12, // 4:2:0 with uv interleaved, only 8-bit depth +} ColorInputType; + +/* Configuration elements common to all streams. */ +struct AvxEncoderConfig { + aom_codec_iface_t *codec; + int passes; + int pass; + unsigned int usage; + ColorInputType color_type; + int quiet; + int verbose; + int limit; + int skip_frames; + int show_psnr; + enum TestDecodeFatality test_decode; + int have_framerate; + struct aom_rational framerate; + int debug; + int show_q_hist_buckets; + int show_rate_hist_buckets; + int disable_warnings; + int disable_warning_prompt; + int experimental_bitstream; + aom_chroma_sample_position_t csp; + cfg_options_t encoder_config; +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_APPS_AOMENC_H_ diff --git a/third_party/aom/av1/arg_defs.c b/third_party/aom/av1/arg_defs.c new file mode 100644 index 0000000000..057565411a --- /dev/null +++ b/third_party/aom/av1/arg_defs.c @@ -0,0 +1,702 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/arg_defs.h" + +static const struct arg_enum_list test_decode_enum[] = { + { "off", TEST_DECODE_OFF }, + { "fatal", TEST_DECODE_FATAL }, + { "warn", TEST_DECODE_WARN }, + { NULL, 0 } +}; + +static const struct arg_enum_list bitdepth_enum[] = { + { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 } +}; + +#if CONFIG_WEBM_IO +static const struct arg_enum_list stereo_mode_enum[] = { + { "mono", STEREO_FORMAT_MONO }, + { "left-right", STEREO_FORMAT_LEFT_RIGHT }, + { "bottom-top", STEREO_FORMAT_BOTTOM_TOP }, + { "top-bottom", STEREO_FORMAT_TOP_BOTTOM }, + { "right-left", STEREO_FORMAT_RIGHT_LEFT }, + { NULL, 0 } +}; +#endif + +static const struct arg_enum_list end_usage_enum[] = { { "vbr", AOM_VBR }, + { "cbr", AOM_CBR }, + { "cq", AOM_CQ }, + { "q", AOM_Q }, + { NULL, 0 } }; + +static const struct arg_enum_list tuning_enum[] = { + { "psnr", AOM_TUNE_PSNR }, + { "ssim", AOM_TUNE_SSIM }, + { "vmaf_with_preprocessing", AOM_TUNE_VMAF_WITH_PREPROCESSING }, + { "vmaf_without_preprocessing", AOM_TUNE_VMAF_WITHOUT_PREPROCESSING }, + { "vmaf", AOM_TUNE_VMAF_MAX_GAIN }, + { "vmaf_neg", AOM_TUNE_VMAF_NEG_MAX_GAIN }, + { "butteraugli", AOM_TUNE_BUTTERAUGLI }, + { "vmaf_saliency_map", AOM_TUNE_VMAF_SALIENCY_MAP }, + { NULL, 0 } +}; + +static const struct arg_enum_list dist_metric_enum[] = { + { "psnr", AOM_DIST_METRIC_PSNR }, + { "qm-psnr", AOM_DIST_METRIC_QM_PSNR }, + { NULL, 0 } +}; + +#if CONFIG_AV1_ENCODER +static const struct arg_enum_list timing_info_enum[] = { + { "unspecified", AOM_TIMING_UNSPECIFIED }, + { "constant", AOM_TIMING_EQUAL }, + { "model", AOM_TIMING_DEC_MODEL }, + { NULL, 0 } +}; + +static const struct arg_enum_list superblock_size_enum[] = { + { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC }, + { "64", AOM_SUPERBLOCK_SIZE_64X64 }, + { "128", AOM_SUPERBLOCK_SIZE_128X128 }, + { NULL, 0 } +}; + +static const struct arg_enum_list matrix_coefficients_enum[] = { + { "identity", AOM_CICP_MC_IDENTITY }, + { "bt709", AOM_CICP_MC_BT_709 }, + { "unspecified", AOM_CICP_MC_UNSPECIFIED }, + { "fcc73", AOM_CICP_MC_FCC }, + { "bt470bg", AOM_CICP_MC_BT_470_B_G }, + { "bt601", AOM_CICP_MC_BT_601 }, + { "smpte240", AOM_CICP_CP_SMPTE_240 }, + { "ycgco", AOM_CICP_MC_SMPTE_YCGCO }, + { "bt2020ncl", AOM_CICP_MC_BT_2020_NCL }, + { "bt2020cl", AOM_CICP_MC_BT_2020_CL }, + { "smpte2085", AOM_CICP_MC_SMPTE_2085 }, + { "chromncl", AOM_CICP_MC_CHROMAT_NCL }, + { "chromcl", AOM_CICP_MC_CHROMAT_CL }, + { "ictcp", AOM_CICP_MC_ICTCP }, + { NULL, 0 } +}; + +static const struct arg_enum_list chroma_sample_position_enum[] = { + { "unknown", AOM_CSP_UNKNOWN }, + { "vertical", AOM_CSP_VERTICAL }, + { "colocated", AOM_CSP_COLOCATED }, + { NULL, 0 } +}; + +static const struct arg_enum_list tune_content_enum[] = { + { "default", AOM_CONTENT_DEFAULT }, + { "screen", AOM_CONTENT_SCREEN }, + { "film", AOM_CONTENT_FILM }, + { NULL, 0 } +}; + +static const struct arg_enum_list transfer_characteristics_enum[] = { + { "unspecified", AOM_CICP_CP_UNSPECIFIED }, + { "bt709", AOM_CICP_TC_BT_709 }, + { "bt470m", AOM_CICP_TC_BT_470_M }, + { "bt470bg", AOM_CICP_TC_BT_470_B_G }, + { "bt601", AOM_CICP_TC_BT_601 }, + { "smpte240", AOM_CICP_TC_SMPTE_240 }, + { "lin", AOM_CICP_TC_LINEAR }, + { "log100", AOM_CICP_TC_LOG_100 }, + { "log100sq10", AOM_CICP_TC_LOG_100_SQRT10 }, + { "iec61966", AOM_CICP_TC_IEC_61966 }, + { "bt1361", AOM_CICP_TC_BT_1361 }, + { "srgb", AOM_CICP_TC_SRGB }, + { "bt2020-10bit", AOM_CICP_TC_BT_2020_10_BIT }, + { "bt2020-12bit", AOM_CICP_TC_BT_2020_12_BIT }, + { "smpte2084", 
AOM_CICP_TC_SMPTE_2084 }, + { "hlg", AOM_CICP_TC_HLG }, + { "smpte428", AOM_CICP_TC_SMPTE_428 }, + { NULL, 0 } +}; + +static const struct arg_enum_list color_primaries_enum[] = { + { "bt709", AOM_CICP_CP_BT_709 }, + { "unspecified", AOM_CICP_CP_UNSPECIFIED }, + { "bt601", AOM_CICP_CP_BT_601 }, + { "bt470m", AOM_CICP_CP_BT_470_M }, + { "bt470bg", AOM_CICP_CP_BT_470_B_G }, + { "smpte240", AOM_CICP_CP_SMPTE_240 }, + { "film", AOM_CICP_CP_GENERIC_FILM }, + { "bt2020", AOM_CICP_CP_BT_2020 }, + { "xyz", AOM_CICP_CP_XYZ }, + { "smpte431", AOM_CICP_CP_SMPTE_431 }, + { "smpte432", AOM_CICP_CP_SMPTE_432 }, + { "ebu3213", AOM_CICP_CP_EBU_3213 }, + { NULL, 0 } +}; +#endif // CONFIG_AV1_ENCODER + +const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { + .help = ARG_DEF(NULL, "help", 0, "Show usage options and exit"), + .debugmode = + ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"), + .outputfile = ARG_DEF("o", "output", 1, "Output filename"), + .use_nv12 = ARG_DEF(NULL, "nv12", 0, "Input file is NV12"), + .use_yv12 = ARG_DEF(NULL, "yv12", 0, "Input file is YV12"), + .use_i420 = ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)"), + .use_i422 = ARG_DEF(NULL, "i422", 0, "Input file is I422"), + .use_i444 = ARG_DEF(NULL, "i444", 0, "Input file is I444"), + .codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"), + .passes = ARG_DEF("p", "passes", 1, "Number of passes (1/2/3)"), + .pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2/3)"), + .fpf_name = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"), + .limit = ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"), + .skip = ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"), + .good_dl = ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"), + .rt_dl = ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"), + .ai_dl = ARG_DEF(NULL, "allintra", 0, "Use all intra mode"), + .quietarg = ARG_DEF("q", "quiet", 0, "Do not print encode progress"), + .verbosearg = ARG_DEF("v", "verbose", 0, "Show encoder parameters"), + .psnrarg = ARG_DEF( + NULL, "psnr", -1, + "Show PSNR in status line " + "(0: Disable PSNR status line display, 1: PSNR calculated using input " + "bit-depth (default), 2: PSNR calculated using stream bit-depth); " + "takes default option when arguments are not specified"), + .use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use"), + .recontest = ARG_DEF_ENUM(NULL, "test-decode", 1, + "Test encode/decode mismatch", test_decode_enum), + .framerate = ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)"), + .use_webm = + ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"), + .use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF"), + .use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU"), + .q_hist_n = + ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"), + .rate_hist_n = + ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"), + .disable_warnings = + ARG_DEF(NULL, "disable-warnings", 0, + "Disable warnings about potentially incorrect encode settings"), + .disable_warning_prompt = + ARG_DEF("y", "disable-warning-prompt", 0, + "Display warnings, but do not prompt user to continue"), + .bitdeptharg = + ARG_DEF_ENUM("b", "bit-depth", 1, "Bit depth for codec", bitdepth_enum), + .inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"), + + .input_chroma_subsampling_x = ARG_DEF(NULL, "input-chroma-subsampling-x", 1, + "Chroma subsampling x value"), + .input_chroma_subsampling_y = ARG_DEF(NULL, "input-chroma-subsampling-y", 1, + "Chroma 
subsampling y value"), + + .usage = ARG_DEF("u", "usage", 1, + "Usage profile number to use (0: good, 1: rt, 2: allintra)"), + .threads = ARG_DEF("t", "threads", 1, "Max number of threads to use"), + .profile = ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use"), + .width = ARG_DEF("w", "width", 1, "Frame width"), + .height = ARG_DEF("h", "height", 1, "Frame height"), + .forced_max_frame_width = ARG_DEF(NULL, "forced_max_frame_width", 1, + "Maximum frame width value to force"), + .forced_max_frame_height = ARG_DEF(NULL, "forced_max_frame_height", 1, + "Maximum frame height value to force"), +#if CONFIG_WEBM_IO + .stereo_mode = ARG_DEF_ENUM(NULL, "stereo-mode", 1, "Stereo 3D video format", + stereo_mode_enum), +#endif + .timebase = ARG_DEF(NULL, "timebase", 1, + "Output timestamp precision (fractional seconds)"), + .global_error_resilient = ARG_DEF(NULL, "global-error-resilient", 1, + "Enable global error resiliency features"), + .lag_in_frames = + ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"), + .large_scale_tile = ARG_DEF( + NULL, "large-scale-tile", 1, + "Large scale tile coding (0: off (default), 1: on (ivf output only))"), + .monochrome = + ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)"), + .full_still_picture_hdr = ARG_DEF(NULL, "full-still-picture-hdr", 0, + "Use full header for still picture"), + .use_16bit_internal = + ARG_DEF(NULL, "use-16bit-internal", 0, "Force use of 16-bit pipeline"), + .dropframe_thresh = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"), + .resize_mode = ARG_DEF( + NULL, "resize-mode", 1, + "Frame resize mode (0: off (default), 1: fixed, 2: random, 3: dynamic)"), + .resize_denominator = + ARG_DEF(NULL, "resize-denominator", 1, "Frame resize denominator"), + .resize_kf_denominator = ARG_DEF(NULL, "resize-kf-denominator", 1, + "Frame resize keyframe denominator"), + .superres_mode = + ARG_DEF(NULL, "superres-mode", 1, + "Frame super-resolution mode (0: disabled (default), 1: fixed, " + "2: random, 3: qthresh, 4: auto)"), + .superres_denominator = ARG_DEF(NULL, "superres-denominator", 1, + "Frame super-resolution denominator"), + .superres_kf_denominator = + ARG_DEF(NULL, "superres-kf-denominator", 1, + "Frame super-resolution keyframe denominator"), + .superres_qthresh = ARG_DEF(NULL, "superres-qthresh", 1, + "Frame super-resolution qindex threshold"), + .superres_kf_qthresh = + ARG_DEF(NULL, "superres-kf-qthresh", 1, + "Frame super-resolution keyframe qindex threshold"), + .end_usage = + ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum), + .target_bitrate = ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)"), + .min_quantizer = ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer"), + .max_quantizer = ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer"), + .undershoot_pct = ARG_DEF(NULL, "undershoot-pct", 1, + "Datarate undershoot (min) target (%)"), + .overshoot_pct = + ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"), + .buf_sz = ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)"), + .buf_initial_sz = + ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"), + .buf_optimal_sz = + ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"), + .bias_pct = ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"), + .minsection_pct = + ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"), + .maxsection_pct = + ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"), + 
.fwd_kf_enabled = + ARG_DEF(NULL, "enable-fwd-kf", 1, "Enable forward reference keyframes"), + .kf_min_dist = + ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"), + .kf_max_dist = + ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"), + .kf_disabled = ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement"), + .sframe_dist = ARG_DEF(NULL, "sframe-dist", 1, "S-Frame interval (frames)"), + .sframe_mode = + ARG_DEF(NULL, "sframe-mode", 1, "S-Frame insertion mode (1..2)"), + .save_as_annexb = ARG_DEF(NULL, "annexb", 1, "Save as Annex-B"), + .noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1, + "Noise sensitivity (frames to blur)"), + .sharpness = ARG_DEF(NULL, "sharpness", 1, + "Bias towards block sharpness in rate-distortion " + "optimization of transform coefficients " + "(0..7), default is 0"), + .static_thresh = + ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"), + .auto_altref = + ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"), + .arnr_maxframes = + ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"), + .arnr_strength = + ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"), + .tune_metric = ARG_DEF_ENUM(NULL, "tune", 1, "Distortion metric tuned with", + tuning_enum), + .dist_metric = ARG_DEF_ENUM( + NULL, "dist-metric", 1, + "Distortion metric to use for in-block optimization", dist_metric_enum), + .cq_level = + ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"), + .max_intra_rate_pct = + ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"), +#if CONFIG_AV1_ENCODER + .cpu_used_av1 = ARG_DEF(NULL, "cpu-used", 1, + "Speed setting (0..6 in good mode, 5..11 in realtime " + "mode, 0..9 in all intra mode)"), + .rowmtarg = + ARG_DEF(NULL, "row-mt", 1, + "Enable row based multi-threading (0: off, 1: on (default))"), + .fpmtarg = ARG_DEF( + NULL, "fp-mt", 1, + "Enable frame parallel multi-threading (0: off (default), 1: on)"), + .tile_cols = + ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"), + .tile_rows = + ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2"), + .enable_tpl_model = ARG_DEF(NULL, "enable-tpl-model", 1, + "RDO based on frame temporal dependency " + "(0: off, 1: backward source based); " + "required for deltaq mode"), + .enable_keyframe_filtering = ARG_DEF( + NULL, "enable-keyframe-filtering", 1, + "Apply temporal filtering on key frame " + "(0: no filter, 1: filter without overlay (default), " + "2: filter with overlay - experimental, may break random access in " + "players)"), + .tile_width = ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)"), + .tile_height = + ARG_DEF(NULL, "tile-height", 1, "Tile heights (command separated)"), + .lossless = ARG_DEF(NULL, "lossless", 1, + "Lossless mode (0: false (default), 1: true)"), + .enable_cdef = ARG_DEF( + NULL, "enable-cdef", 1, + "Enable the constrained directional enhancement filter (0: false, " + "1: true (default), 2: disable for non-reference frames)"), + .enable_restoration = ARG_DEF(NULL, "enable-restoration", 1, + "Enable the loop restoration filter (0: false " + "(default in realtime mode), " + "1: true (default in non-realtime mode))"), + .enable_rect_partitions = ARG_DEF(NULL, "enable-rect-partitions", 1, + "Enable rectangular partitions " + "(0: false, 1: true (default))"), + .enable_ab_partitions = + ARG_DEF(NULL, "enable-ab-partitions", 1, + "Enable ab partitions (0: false, 1: true (default))"), + .enable_1to4_partitions = ARG_DEF(NULL, 
"enable-1to4-partitions", 1, + "Enable 1:4 and 4:1 partitions " + "(0: false, 1: true (default))"), + .min_partition_size = + ARG_DEF(NULL, "min-partition-size", 1, + "Set min partition size " + "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128); " + "with 4k+ resolutions or higher speed settings, min " + "partition size will have a minimum of 8"), + .max_partition_size = + ARG_DEF(NULL, "max-partition-size", 1, + "Set max partition size " + "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)"), + .enable_dual_filter = ARG_DEF(NULL, "enable-dual-filter", 1, + "Enable dual filter " + "(0: false, 1: true (default))"), + .enable_chroma_deltaq = ARG_DEF(NULL, "enable-chroma-deltaq", 1, + "Enable chroma delta quant " + "(0: false (default), 1: true)"), + .enable_intra_edge_filter = ARG_DEF(NULL, "enable-intra-edge-filter", 1, + "Enable intra edge filtering " + "(0: false, 1: true (default))"), + .enable_order_hint = ARG_DEF(NULL, "enable-order-hint", 1, + "Enable order hint " + "(0: false, 1: true (default))"), + .enable_tx64 = + ARG_DEF(NULL, "enable-tx64", 1, + "Enable 64-pt transform (0: false, 1: true (default))"), + .enable_flip_idtx = + ARG_DEF(NULL, "enable-flip-idtx", 1, + "Enable extended transform type (0: false, 1: true (default)) " + "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, " + "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, " + "H_ADST, V_FLIPADST, H_FLIPADST"), + .enable_rect_tx = + ARG_DEF(NULL, "enable-rect-tx", 1, + "Enable rectangular transform (0: false, 1: true (default))"), + .enable_dist_wtd_comp = ARG_DEF(NULL, "enable-dist-wtd-comp", 1, + "Enable distance-weighted compound " + "(0: false, 1: true (default))"), + .enable_masked_comp = ARG_DEF(NULL, "enable-masked-comp", 1, + "Enable masked (wedge/diff-wtd) compound " + "(0: false, 1: true (default))"), + .enable_onesided_comp = ARG_DEF(NULL, "enable-onesided-comp", 1, + "Enable one sided compound " + "(0: false, 1: true (default))"), + .enable_interintra_comp = ARG_DEF(NULL, "enable-interintra-comp", 1, + "Enable interintra compound " + "(0: false, 1: true (default))"), + .enable_smooth_interintra = ARG_DEF(NULL, "enable-smooth-interintra", 1, + "Enable smooth interintra mode " + "(0: false, 1: true (default))"), + .enable_diff_wtd_comp = ARG_DEF(NULL, "enable-diff-wtd-comp", 1, + "Enable difference-weighted compound " + "(0: false, 1: true (default))"), + .enable_interinter_wedge = ARG_DEF(NULL, "enable-interinter-wedge", 1, + "Enable interinter wedge compound " + "(0: false, 1: true (default))"), + .enable_interintra_wedge = ARG_DEF(NULL, "enable-interintra-wedge", 1, + "Enable interintra wedge compound " + "(0: false, 1: true (default))"), + .enable_global_motion = ARG_DEF(NULL, "enable-global-motion", 1, + "Enable global motion " + "(0: false, 1: true (default))"), + .enable_warped_motion = ARG_DEF(NULL, "enable-warped-motion", 1, + "Enable local warped motion " + "(0: false, 1: true (default))"), + .enable_filter_intra = ARG_DEF(NULL, "enable-filter-intra", 1, + "Enable filter intra prediction mode " + "(0: false, 1: true (default))"), + .enable_smooth_intra = ARG_DEF(NULL, "enable-smooth-intra", 1, + "Enable smooth intra prediction modes " + "(0: false, 1: true (default))"), + .enable_paeth_intra = ARG_DEF( + NULL, "enable-paeth-intra", 1, + "Enable Paeth intra prediction mode (0: false, 1: true (default))"), + .enable_cfl_intra = ARG_DEF(NULL, "enable-cfl-intra", 1, + "Enable chroma from luma intra prediction mode " + "(0: false, 1: true (default))"), + 
.enable_directional_intra = + ARG_DEF(NULL, "enable-directional-intra", 1, + "Enable directional intra prediction modes " + "(0: false, 1: true (default))"), + .enable_diagonal_intra = + ARG_DEF(NULL, "enable-diagonal-intra", 1, + "Enable diagonal (D45 to D203) intra prediction modes, which are " + "a subset of directional modes; has no effect if " + "enable-directional-intra is 0 (0: false, 1: true (default))"), + .force_video_mode = ARG_DEF( + NULL, "force-video-mode", 1, + "Force video mode even for a single frame (0: false (default), 1: true)"), + .enable_obmc = ARG_DEF(NULL, "enable-obmc", 1, + "Enable OBMC (0: false, 1: true (default))"), + .enable_overlay = + ARG_DEF(NULL, "enable-overlay", 1, + "Enable coding overlay frames (0: false, 1: true (default))"), + .enable_palette = + ARG_DEF(NULL, "enable-palette", 1, + "Enable palette prediction mode (0: false, 1: true (default))"), + .enable_intrabc = ARG_DEF(NULL, "enable-intrabc", 1, + "Enable intra block copy prediction mode " + "(0: false, 1: true (default))"), + .enable_angle_delta = + ARG_DEF(NULL, "enable-angle-delta", 1, + "Enable intra angle delta (0: false, 1: true (default))"), + .disable_trellis_quant = ARG_DEF( + NULL, "disable-trellis-quant", 1, + "Disable trellis optimization of quantized coefficients (0: false " + "1: true 2: true for rd search 3: true for estimate yrd search " + "(default))"), + .enable_qm = + ARG_DEF(NULL, "enable-qm", 1, + "Enable quantisation matrices (0: false (default), 1: true)"), + .qm_min = ARG_DEF(NULL, "qm-min", 1, + "Min quant matrix flatness (0..15), default is 8"), + .qm_max = ARG_DEF(NULL, "qm-max", 1, + "Max quant matrix flatness (0..15), default is 15"), + .reduced_tx_type_set = ARG_DEF(NULL, "reduced-tx-type-set", 1, + "Use reduced set of transform types"), + .use_intra_dct_only = + ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes"), + .use_inter_dct_only = + ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes"), + .use_intra_default_tx_only = + ARG_DEF(NULL, "use-intra-default-tx-only", 1, + "Use Default-transform only for INTRA modes"), + .quant_b_adapt = ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b"), + .coeff_cost_upd_freq = ARG_DEF(NULL, "coeff-cost-upd-freq", 1, + "Update freq for coeff costs. " + "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), + .mode_cost_upd_freq = ARG_DEF(NULL, "mode-cost-upd-freq", 1, + "Update freq for mode costs. " + "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), + .mv_cost_upd_freq = ARG_DEF(NULL, "mv-cost-upd-freq", 1, + "Update freq for mv costs. " + "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), + .dv_cost_upd_freq = ARG_DEF(NULL, "dv-cost-upd-freq", 1, + "Update freq for dv costs. 
" + "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), + .num_tg = ARG_DEF(NULL, "num-tile-groups", 1, + "Maximum number of tile groups, default is 1"), + .mtu_size = + ARG_DEF(NULL, "mtu-size", 1, + "MTU size for a tile group, default is 0 (no MTU targeting), " + "overrides maximum number of tile groups"), + .timing_info = ARG_DEF_ENUM( + NULL, "timing-info", 1, + "Signal timing info in the bitstream (model only works for no " + "hidden frames, no super-res yet):", + timing_info_enum), +#if CONFIG_TUNE_VMAF + .vmaf_model_path = + ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file"), +#endif + .partition_info_path = ARG_DEF(NULL, "partition-info-path", 1, + "Partition information read and write path"), + .enable_rate_guide_deltaq = + ARG_DEF(NULL, "enable-rate-guide-deltaq", 1, + "Enable rate guide deltaq (1), by default off (0). " + "It requires --deltaq-mode=3. " + "If turned on, it requires an input file specified " + "by --rate-distribution-info."), + .rate_distribution_info = + ARG_DEF(NULL, "rate-distribution-info", 1, + "Rate distribution information input." + "It requires --enable-rate-guide-deltaq=1."), + .film_grain_test = ARG_DEF( + NULL, "film-grain-test", 1, + "Film grain test vectors (0: none (default), 1: test-1 2: test-2, " + "... 16: test-16)"), + .film_grain_table = ARG_DEF(NULL, "film-grain-table", 1, + "Path to file containing film grain parameters"), +#if CONFIG_DENOISE + .denoise_noise_level = + ARG_DEF(NULL, "denoise-noise-level", 1, + "Amount of noise (from 0 = don't denoise, to 50)"), + .denoise_block_size = ARG_DEF(NULL, "denoise-block-size", 1, + "Denoise block size (default = 32)"), + .enable_dnl_denoising = ARG_DEF(NULL, "enable-dnl-denoising", 1, + "Apply denoising to the frame " + "being encoded when denoise-noise-level is " + "enabled (0: false, 1: true (default))"), +#endif + .enable_ref_frame_mvs = + ARG_DEF(NULL, "enable-ref-frame-mvs", 1, + "Enable temporal mv prediction (default is 1)"), + .frame_parallel_decoding = + ARG_DEF(NULL, "frame-parallel", 1, + "Enable frame parallel decodability features " + "(0: false (default), 1: true)"), + .error_resilient_mode = ARG_DEF(NULL, "error-resilient", 1, + "Enable error resilient features " + "(0: false (default), 1: true)"), + .aq_mode = ARG_DEF(NULL, "aq-mode", 1, + "Adaptive quantization mode (0: off (default), 1: " + "variance 2: complexity, " + "3: cyclic refresh)"), + .deltaq_mode = + ARG_DEF(NULL, "deltaq-mode", 1, + "Delta qindex mode (0: off, 1: deltaq objective (default), " + "2: deltaq placeholder, 3: key frame visual quality, 4: user " + "rating based visual quality optimization); " + "requires --enable-tpl-model=1"), + .deltaq_strength = ARG_DEF(NULL, "deltaq-strength", 1, + "Deltaq strength for" + " --deltaq-mode=4 (%)"), + .deltalf_mode = ARG_DEF(NULL, "delta-lf-mode", 1, + "Enable delta-lf-mode (0: off (default), 1: on)"), + .frame_periodic_boost = + ARG_DEF(NULL, "frame-boost", 1, + "Enable frame periodic boost (0: off (default), 1: on)"), + .gf_cbr_boost_pct = ARG_DEF(NULL, "gf-cbr-boost", 1, + "Boost for Golden Frame in CBR mode (pct)"), + .max_inter_rate_pct = + ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"), + .min_gf_interval = ARG_DEF( + NULL, "min-gf-interval", 1, + "Min gf/arf frame interval (default 0, indicating in-built behavior)"), + .max_gf_interval = ARG_DEF( + NULL, "max-gf-interval", 1, + "Max gf/arf frame interval (default 0, indicating in-built behavior)"), + .gf_min_pyr_height = + ARG_DEF(NULL, "gf-min-pyr-height", 1, + "Min height for GF group 
pyramid structure (0 (default) to 5)"), + .gf_max_pyr_height = ARG_DEF( + NULL, "gf-max-pyr-height", 1, + "Maximum height for GF group pyramid structure (0 to 5 (default))"), + .max_reference_frames = ARG_DEF(NULL, "max-reference-frames", 1, + "Maximum number of reference frames allowed " + "per frame (3 to 7 (default))"), + .reduced_reference_set = + ARG_DEF(NULL, "reduced-reference-set", 1, + "Use reduced set of single and compound references (0: off " + "(default), 1: on)"), + .target_seq_level_idx = + ARG_DEF(NULL, "target-seq-level-idx", 1, + "Target sequence level index. " + "Possible values are in the form of \"ABxy\". " + "AB: Operating point (OP) index, " + "xy: Target level index for the OP. " + "E.g. \"0\" means target level index 0 (2.0) for the 0th OP, " + "\"1019\" means target level index 19 (6.3) for the 10th OP."), + .set_min_cr = ARG_DEF( + NULL, "min-cr", 1, + "Set minimum compression ratio. Take integer values. Default is 0. " + "If non-zero, encoder will try to keep the compression ratio of " + "each frame to be higher than the given value divided by 100."), + + .input_color_primaries = ARG_DEF_ENUM( + NULL, "color-primaries", 1, + "Color primaries (CICP) of input content:", color_primaries_enum), + + .input_transfer_characteristics = + ARG_DEF_ENUM(NULL, "transfer-characteristics", 1, + "Transfer characteristics (CICP) of input content:", + transfer_characteristics_enum), + + .input_matrix_coefficients = ARG_DEF_ENUM( + NULL, "matrix-coefficients", 1, + "Matrix coefficients (CICP) of input content:", matrix_coefficients_enum), + + .input_chroma_sample_position = + ARG_DEF_ENUM(NULL, "chroma-sample-position", 1, + "The chroma sample position when chroma 4:2:0 is signaled:", + chroma_sample_position_enum), + + .tune_content = ARG_DEF_ENUM(NULL, "tune-content", 1, "Tune content type", + tune_content_enum), + + .cdf_update_mode = + ARG_DEF(NULL, "cdf-update-mode", 1, + "CDF update mode for entropy coding " + "(0: no CDF update, 1: update CDF on all frames (default), " + "2: selectively update CDF on some frames)"), + + .superblock_size = ARG_DEF_ENUM(NULL, "sb-size", 1, "Superblock size to use", + superblock_size_enum), + + .set_tier_mask = + ARG_DEF(NULL, "set-tier-mask", 1, + "Set bit mask to specify which tier each of the 32 possible " + "operating points conforms to. " + "Bit value 0 (default): Main Tier, 1: High Tier."), + + .use_fixed_qp_offsets = + ARG_DEF(NULL, "use-fixed-qp-offsets", 1, + "Enable fixed QP offsets for frames at different levels of the " + "pyramid. Selected automatically from --cq-level if " + "--fixed-qp-offsets is not provided. If this option is not " + "specified (default), offsets are adaptively chosen by the " + "encoder."), + + .fixed_qp_offsets = ARG_DEF( + NULL, "fixed-qp-offsets", 1, + "Set fixed QP offsets for frames at different levels of the " + "pyramid. Comma-separated list of 5 offsets for keyframe, ALTREF, " + "and 3 levels of internal alt-refs. If this option is not " + "specified (default), offsets are adaptively chosen by the " + "encoder."), + + .vbr_corpus_complexity_lap = ARG_DEF( + NULL, "vbr-corpus-complexity-lap", 1, + "Set average corpus complexity per mb for single pass VBR using lap. " + "(0..10000), default is 0"), + + .fwd_kf_dist = ARG_DEF(NULL, "fwd-kf-dist", -1, + "Set distance between forward keyframes. A value of " + "-1 (default) means no repetitive forward keyframes."), + + .enable_tx_size_search = ARG_DEF( + NULL, "enable-tx-size-search", 1, + "Enable transform size search to find the best size for each block. 
" + "If false, transforms always have the largest possible size " + "(0: false, 1: true (default)). Ignored in non rd pick mode in " + "real-time coding."), + + .loopfilter_control = ARG_DEF( + NULL, "loopfilter-control", 1, + "Control loop filtering " + "(0: Loopfilter disabled for all frames, 1: Enable loopfilter for all " + "frames (default), 2: Disable loopfilter for non-reference frames, 3: " + "Disable loopfilter for frames with low motion)"), + + .auto_intra_tools_off = ARG_DEF( + NULL, "auto-intra-tools-off", 1, + "Automatically turn off several intra coding tools for allintra mode; " + "only in effect if --deltaq-mode=3"), + + .two_pass_input = + ARG_DEF(NULL, "two-pass-input", 1, + "The input file for the second pass for three-pass encoding"), + .two_pass_output = ARG_DEF( + NULL, "two-pass-output", 1, + "The output file for the first two passes for three-pass encoding"), + .two_pass_width = + ARG_DEF(NULL, "two-pass-width", 1, "The width of two-pass-input"), + .two_pass_height = + ARG_DEF(NULL, "two-pass-height", 1, "The height of two-pass-input"), + .second_pass_log = + ARG_DEF("spf", "second-pass-log", 1, "Log file from second pass"), + .strict_level_conformance = + ARG_DEF(NULL, "strict-level-conformance", 1, + "When set to 1, exit the encoder when it fails to encode " + "to a given target level"), + .kf_max_pyr_height = ARG_DEF( + NULL, "kf-max-pyr-height", 1, + "Maximum height of pyramid structure used for the GOP starting with a " + "key frame (-1 to 5). When set to -1 (default), it does not have any " + "effect. The actual maximum pyramid height will be the minimum of this " + "value and the value of gf_max_pyr_height."), + .sb_qp_sweep = + ARG_DEF(NULL, "sb-qp-sweep", 1, + "When set to 1, enable the superblock level qp sweep for a " + "given lambda to minimize the rdcost."), +#endif // CONFIG_AV1_ENCODER +}; diff --git a/third_party/aom/av1/arg_defs.h b/third_party/aom/av1/arg_defs.h new file mode 100644 index 0000000000..73c78caec8 --- /dev/null +++ b/third_party/aom/av1/arg_defs.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_ARG_DEFS_H_ +#define AOM_AV1_ARG_DEFS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "config/aom_config.h" +#include "common/args_helper.h" +#if CONFIG_WEBM_IO +#include "common/webmenc.h" +#endif +#include "aom/aomcx.h" + +enum TestDecodeFatality { + TEST_DECODE_OFF, + TEST_DECODE_FATAL, + TEST_DECODE_WARN, +}; + +typedef struct av1_codec_arg_definitions { + arg_def_t help; + arg_def_t debugmode; + arg_def_t outputfile; + arg_def_t use_nv12; + arg_def_t use_yv12; + arg_def_t use_i420; + arg_def_t use_i422; + arg_def_t use_i444; + arg_def_t codecarg; + arg_def_t passes; + arg_def_t pass_arg; + arg_def_t fpf_name; + arg_def_t limit; + arg_def_t skip; + arg_def_t good_dl; + arg_def_t rt_dl; + arg_def_t ai_dl; + arg_def_t quietarg; + arg_def_t verbosearg; + arg_def_t psnrarg; + arg_def_t use_cfg; + arg_def_t recontest; + arg_def_t framerate; + arg_def_t use_webm; + arg_def_t use_ivf; + arg_def_t use_obu; + arg_def_t q_hist_n; + arg_def_t rate_hist_n; + arg_def_t disable_warnings; + arg_def_t disable_warning_prompt; + arg_def_t bitdeptharg; + arg_def_t inbitdeptharg; + arg_def_t input_chroma_subsampling_x; + arg_def_t input_chroma_subsampling_y; + arg_def_t usage; + arg_def_t threads; + arg_def_t profile; + arg_def_t width; + arg_def_t height; + arg_def_t forced_max_frame_width; + arg_def_t forced_max_frame_height; +#if CONFIG_WEBM_IO + arg_def_t stereo_mode; +#endif + arg_def_t timebase; + arg_def_t global_error_resilient; + arg_def_t lag_in_frames; + arg_def_t large_scale_tile; + arg_def_t monochrome; + arg_def_t full_still_picture_hdr; + arg_def_t use_16bit_internal; + arg_def_t dropframe_thresh; + arg_def_t resize_mode; + arg_def_t resize_denominator; + arg_def_t resize_kf_denominator; + arg_def_t superres_mode; + arg_def_t superres_denominator; + arg_def_t superres_kf_denominator; + arg_def_t superres_qthresh; + arg_def_t superres_kf_qthresh; + arg_def_t end_usage; + arg_def_t target_bitrate; + arg_def_t min_quantizer; + arg_def_t max_quantizer; + arg_def_t undershoot_pct; + arg_def_t overshoot_pct; + arg_def_t buf_sz; + arg_def_t buf_initial_sz; + arg_def_t buf_optimal_sz; + arg_def_t bias_pct; + arg_def_t minsection_pct; + arg_def_t maxsection_pct; + arg_def_t fwd_kf_enabled; + arg_def_t kf_min_dist; + arg_def_t kf_max_dist; + arg_def_t kf_disabled; + arg_def_t sframe_dist; + arg_def_t sframe_mode; + arg_def_t save_as_annexb; + arg_def_t noise_sens; + arg_def_t sharpness; + arg_def_t static_thresh; + arg_def_t auto_altref; + arg_def_t arnr_maxframes; + arg_def_t arnr_strength; + arg_def_t tune_metric; + arg_def_t dist_metric; + arg_def_t cq_level; + arg_def_t max_intra_rate_pct; +#if CONFIG_AV1_ENCODER + arg_def_t cpu_used_av1; + arg_def_t rowmtarg; + arg_def_t fpmtarg; + arg_def_t tile_cols; + arg_def_t tile_rows; + arg_def_t enable_tpl_model; + arg_def_t enable_keyframe_filtering; + arg_def_t tile_width; + arg_def_t tile_height; + arg_def_t lossless; + arg_def_t enable_cdef; + arg_def_t enable_restoration; + arg_def_t enable_rect_partitions; + arg_def_t enable_ab_partitions; + arg_def_t enable_1to4_partitions; + arg_def_t min_partition_size; + arg_def_t max_partition_size; + arg_def_t enable_dual_filter; + arg_def_t enable_chroma_deltaq; + arg_def_t enable_intra_edge_filter; + arg_def_t enable_order_hint; + arg_def_t enable_tx64; + arg_def_t enable_flip_idtx; + arg_def_t enable_rect_tx; + arg_def_t enable_dist_wtd_comp; + arg_def_t enable_masked_comp; + arg_def_t enable_onesided_comp; + arg_def_t enable_interintra_comp; + arg_def_t 
enable_smooth_interintra; + arg_def_t enable_diff_wtd_comp; + arg_def_t enable_interinter_wedge; + arg_def_t enable_interintra_wedge; + arg_def_t enable_global_motion; + arg_def_t enable_warped_motion; + arg_def_t enable_filter_intra; + arg_def_t enable_smooth_intra; + arg_def_t enable_paeth_intra; + arg_def_t enable_cfl_intra; + arg_def_t enable_directional_intra; + arg_def_t enable_diagonal_intra; + arg_def_t force_video_mode; + arg_def_t enable_obmc; + arg_def_t enable_overlay; + arg_def_t enable_palette; + arg_def_t enable_intrabc; + arg_def_t enable_angle_delta; + arg_def_t disable_trellis_quant; + arg_def_t enable_qm; + arg_def_t qm_min; + arg_def_t qm_max; + arg_def_t reduced_tx_type_set; + arg_def_t use_intra_dct_only; + arg_def_t use_inter_dct_only; + arg_def_t use_intra_default_tx_only; + arg_def_t quant_b_adapt; + arg_def_t coeff_cost_upd_freq; + arg_def_t mode_cost_upd_freq; + arg_def_t mv_cost_upd_freq; + arg_def_t dv_cost_upd_freq; + arg_def_t num_tg; + arg_def_t mtu_size; + arg_def_t timing_info; +#if CONFIG_TUNE_VMAF + arg_def_t vmaf_model_path; +#endif + arg_def_t partition_info_path; + arg_def_t enable_rate_guide_deltaq; + arg_def_t rate_distribution_info; + arg_def_t film_grain_test; + arg_def_t film_grain_table; +#if CONFIG_DENOISE + arg_def_t denoise_noise_level; + arg_def_t denoise_block_size; + arg_def_t enable_dnl_denoising; +#endif + arg_def_t enable_ref_frame_mvs; + arg_def_t frame_parallel_decoding; + arg_def_t error_resilient_mode; + arg_def_t aq_mode; + arg_def_t deltaq_mode; + arg_def_t deltaq_strength; + arg_def_t deltalf_mode; + arg_def_t frame_periodic_boost; + arg_def_t gf_cbr_boost_pct; + arg_def_t max_inter_rate_pct; + arg_def_t min_gf_interval; + arg_def_t max_gf_interval; + arg_def_t gf_min_pyr_height; + arg_def_t gf_max_pyr_height; + arg_def_t max_reference_frames; + arg_def_t reduced_reference_set; + arg_def_t target_seq_level_idx; + arg_def_t set_min_cr; + arg_def_t input_color_primaries; + arg_def_t input_transfer_characteristics; + arg_def_t input_matrix_coefficients; + arg_def_t input_chroma_sample_position; + arg_def_t tune_content; + arg_def_t cdf_update_mode; + arg_def_t superblock_size; + arg_def_t set_tier_mask; + arg_def_t use_fixed_qp_offsets; + arg_def_t fixed_qp_offsets; + arg_def_t vbr_corpus_complexity_lap; + arg_def_t fwd_kf_dist; + arg_def_t enable_tx_size_search; + arg_def_t loopfilter_control; + arg_def_t two_pass_input; + arg_def_t two_pass_output; + arg_def_t two_pass_width; + arg_def_t two_pass_height; + arg_def_t second_pass_log; + arg_def_t auto_intra_tools_off; + arg_def_t strict_level_conformance; + arg_def_t kf_max_pyr_height; + arg_def_t sb_qp_sweep; +#endif // CONFIG_AV1_ENCODER +} av1_codec_arg_definitions_t; + +extern const av1_codec_arg_definitions_t g_av1_codec_arg_defs; + +#ifdef __cplusplus +} +#endif +#endif // AOM_AV1_ARG_DEFS_H_ diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake new file mode 100644 index 0000000000..15577d0c0e --- /dev/null +++ b/third_party/aom/av1/av1.cmake @@ -0,0 +1,715 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. 
If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_AV1_AV1_CMAKE_) + return() +endif() # AOM_AV1_AV1_CMAKE_ +set(AOM_AV1_AV1_CMAKE_ 1) + +list(APPEND AOM_AV1_COMMON_SOURCES + "${AOM_ROOT}/common/args_helper.h" + "${AOM_ROOT}/common/args_helper.c" + "${AOM_ROOT}/av1/arg_defs.h" + "${AOM_ROOT}/av1/arg_defs.c" + "${AOM_ROOT}/av1/av1_iface_common.h" + "${AOM_ROOT}/av1/common/alloccommon.c" + "${AOM_ROOT}/av1/common/alloccommon.h" + "${AOM_ROOT}/av1/common/av1_common_int.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c" + "${AOM_ROOT}/av1/common/av1_loopfilter.c" + "${AOM_ROOT}/av1/common/av1_loopfilter.h" + "${AOM_ROOT}/av1/common/av1_txfm.c" + "${AOM_ROOT}/av1/common/av1_txfm.h" + "${AOM_ROOT}/av1/common/blockd.c" + "${AOM_ROOT}/av1/common/blockd.h" + "${AOM_ROOT}/av1/common/cdef.c" + "${AOM_ROOT}/av1/common/cdef.h" + "${AOM_ROOT}/av1/common/cdef_block.c" + "${AOM_ROOT}/av1/common/cdef_block.h" + "${AOM_ROOT}/av1/common/cfl.c" + "${AOM_ROOT}/av1/common/cfl.h" + "${AOM_ROOT}/av1/common/common.h" + "${AOM_ROOT}/av1/common/common_data.c" + "${AOM_ROOT}/av1/common/common_data.h" + "${AOM_ROOT}/av1/common/convolve.c" + "${AOM_ROOT}/av1/common/convolve.h" + "${AOM_ROOT}/av1/common/debugmodes.c" + "${AOM_ROOT}/av1/common/entropy.c" + "${AOM_ROOT}/av1/common/entropy.h" + "${AOM_ROOT}/av1/common/entropymode.c" + "${AOM_ROOT}/av1/common/entropymode.h" + "${AOM_ROOT}/av1/common/entropymv.c" + "${AOM_ROOT}/av1/common/entropymv.h" + "${AOM_ROOT}/av1/common/enums.h" + "${AOM_ROOT}/av1/common/filter.h" + "${AOM_ROOT}/av1/common/frame_buffers.c" + "${AOM_ROOT}/av1/common/frame_buffers.h" + "${AOM_ROOT}/av1/common/idct.c" + "${AOM_ROOT}/av1/common/idct.h" + "${AOM_ROOT}/av1/common/mv.h" + "${AOM_ROOT}/av1/common/mvref_common.c" + "${AOM_ROOT}/av1/common/mvref_common.h" + "${AOM_ROOT}/av1/common/obu_util.c" + "${AOM_ROOT}/av1/common/obu_util.h" + "${AOM_ROOT}/av1/common/pred_common.c" + "${AOM_ROOT}/av1/common/pred_common.h" + "${AOM_ROOT}/av1/common/quant_common.c" + "${AOM_ROOT}/av1/common/quant_common.h" + "${AOM_ROOT}/av1/common/reconinter.c" + "${AOM_ROOT}/av1/common/reconinter.h" + "${AOM_ROOT}/av1/common/reconinter_template.inc" + "${AOM_ROOT}/av1/common/reconintra.c" + "${AOM_ROOT}/av1/common/reconintra.h" + "${AOM_ROOT}/av1/common/resize.c" + "${AOM_ROOT}/av1/common/resize.h" + "${AOM_ROOT}/av1/common/restoration.c" + "${AOM_ROOT}/av1/common/restoration.h" + "${AOM_ROOT}/av1/common/scale.c" + "${AOM_ROOT}/av1/common/scale.h" + "${AOM_ROOT}/av1/common/scan.c" + "${AOM_ROOT}/av1/common/scan.h" + "${AOM_ROOT}/av1/common/seg_common.c" + "${AOM_ROOT}/av1/common/seg_common.h" + "${AOM_ROOT}/av1/common/thread_common.c" + "${AOM_ROOT}/av1/common/thread_common.h" + "${AOM_ROOT}/av1/common/tile_common.c" + "${AOM_ROOT}/av1/common/tile_common.h" + "${AOM_ROOT}/av1/common/timing.c" + "${AOM_ROOT}/av1/common/timing.h" + "${AOM_ROOT}/av1/common/token_cdfs.h" + "${AOM_ROOT}/av1/common/txb_common.c" + "${AOM_ROOT}/av1/common/txb_common.h" + "${AOM_ROOT}/av1/common/warped_motion.c" + "${AOM_ROOT}/av1/common/warped_motion.h") + +list(APPEND AOM_AV1_DECODER_SOURCES + "${AOM_ROOT}/av1/av1_dx_iface.c" + "${AOM_ROOT}/av1/decoder/decodeframe.c" + "${AOM_ROOT}/av1/decoder/decodeframe.h" + "${AOM_ROOT}/av1/decoder/decodemv.c" + "${AOM_ROOT}/av1/decoder/decodemv.h" + 
"${AOM_ROOT}/av1/decoder/decoder.c" + "${AOM_ROOT}/av1/decoder/decoder.h" + "${AOM_ROOT}/av1/decoder/decodetxb.c" + "${AOM_ROOT}/av1/decoder/decodetxb.h" + "${AOM_ROOT}/av1/decoder/detokenize.c" + "${AOM_ROOT}/av1/decoder/detokenize.h" + "${AOM_ROOT}/av1/decoder/dthread.h" + "${AOM_ROOT}/av1/decoder/grain_synthesis.c" + "${AOM_ROOT}/av1/decoder/grain_synthesis.h" + "${AOM_ROOT}/av1/decoder/obu.h" + "${AOM_ROOT}/av1/decoder/obu.c") + +list(APPEND AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/av1_cx_iface.c" + "${AOM_ROOT}/av1/av1_cx_iface.h" + "${AOM_ROOT}/av1/encoder/aq_complexity.c" + "${AOM_ROOT}/av1/encoder/aq_complexity.h" + "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c" + "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h" + "${AOM_ROOT}/av1/encoder/aq_variance.c" + "${AOM_ROOT}/av1/encoder/aq_variance.h" + "${AOM_ROOT}/av1/encoder/allintra_vis.c" + "${AOM_ROOT}/av1/encoder/allintra_vis.h" + "${AOM_ROOT}/av1/encoder/enc_enums.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c" + "${AOM_ROOT}/av1/encoder/av1_quantize.c" + "${AOM_ROOT}/av1/encoder/av1_quantize.h" + "${AOM_ROOT}/av1/encoder/bitstream.c" + "${AOM_ROOT}/av1/encoder/bitstream.h" + "${AOM_ROOT}/av1/encoder/block.h" + "${AOM_ROOT}/av1/encoder/cnn.c" + "${AOM_ROOT}/av1/encoder/cnn.h" + "${AOM_ROOT}/av1/encoder/compound_type.c" + "${AOM_ROOT}/av1/encoder/compound_type.h" + "${AOM_ROOT}/av1/encoder/context_tree.c" + "${AOM_ROOT}/av1/encoder/context_tree.h" + "${AOM_ROOT}/av1/encoder/cost.c" + "${AOM_ROOT}/av1/encoder/cost.h" + "${AOM_ROOT}/av1/encoder/encodeframe.c" + "${AOM_ROOT}/av1/encoder/encodeframe.h" + "${AOM_ROOT}/av1/encoder/encodeframe_utils.c" + "${AOM_ROOT}/av1/encoder/encodeframe_utils.h" + "${AOM_ROOT}/av1/encoder/encodemb.c" + "${AOM_ROOT}/av1/encoder/encodemb.h" + "${AOM_ROOT}/av1/encoder/encodemv.c" + "${AOM_ROOT}/av1/encoder/encodemv.h" + "${AOM_ROOT}/av1/encoder/encode_strategy.c" + "${AOM_ROOT}/av1/encoder/encode_strategy.h" + "${AOM_ROOT}/av1/encoder/encoder.c" + "${AOM_ROOT}/av1/encoder/encoder.h" + "${AOM_ROOT}/av1/encoder/encoder_alloc.h" + "${AOM_ROOT}/av1/encoder/encoder_utils.c" + "${AOM_ROOT}/av1/encoder/encoder_utils.h" + "${AOM_ROOT}/av1/encoder/encodetxb.c" + "${AOM_ROOT}/av1/encoder/encodetxb.h" + "${AOM_ROOT}/av1/encoder/ethread.c" + "${AOM_ROOT}/av1/encoder/ethread.h" + "${AOM_ROOT}/av1/encoder/extend.c" + "${AOM_ROOT}/av1/encoder/extend.h" + "${AOM_ROOT}/av1/encoder/external_partition.c" + "${AOM_ROOT}/av1/encoder/external_partition.h" + "${AOM_ROOT}/av1/encoder/firstpass.c" + "${AOM_ROOT}/av1/encoder/firstpass.h" + "${AOM_ROOT}/av1/encoder/global_motion.c" + "${AOM_ROOT}/av1/encoder/global_motion.h" + "${AOM_ROOT}/av1/encoder/global_motion_facade.c" + "${AOM_ROOT}/av1/encoder/global_motion_facade.h" + "${AOM_ROOT}/av1/encoder/gop_structure.c" + "${AOM_ROOT}/av1/encoder/gop_structure.h" + "${AOM_ROOT}/av1/encoder/grain_test_vectors.h" + "${AOM_ROOT}/av1/encoder/hash.c" + "${AOM_ROOT}/av1/encoder/hash.h" + "${AOM_ROOT}/av1/encoder/hash_motion.c" + "${AOM_ROOT}/av1/encoder/hash_motion.h" + "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c" + "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h" + "${AOM_ROOT}/av1/encoder/interp_search.c" + "${AOM_ROOT}/av1/encoder/interp_search.h" + "${AOM_ROOT}/av1/encoder/level.c" + "${AOM_ROOT}/av1/encoder/level.h" + "${AOM_ROOT}/av1/encoder/lookahead.c" + "${AOM_ROOT}/av1/encoder/lookahead.h" + "${AOM_ROOT}/av1/encoder/mcomp.c" + 
"${AOM_ROOT}/av1/encoder/mcomp.h" + "${AOM_ROOT}/av1/encoder/mcomp_structs.h" + "${AOM_ROOT}/av1/encoder/ml.c" + "${AOM_ROOT}/av1/encoder/ml.h" + "${AOM_ROOT}/av1/encoder/model_rd.h" + "${AOM_ROOT}/av1/encoder/motion_search_facade.c" + "${AOM_ROOT}/av1/encoder/motion_search_facade.h" + "${AOM_ROOT}/av1/encoder/mv_prec.c" + "${AOM_ROOT}/av1/encoder/mv_prec.h" + "${AOM_ROOT}/av1/encoder/palette.c" + "${AOM_ROOT}/av1/encoder/palette.h" + "${AOM_ROOT}/av1/encoder/partition_search.h" + "${AOM_ROOT}/av1/encoder/partition_search.c" + "${AOM_ROOT}/av1/encoder/partition_strategy.h" + "${AOM_ROOT}/av1/encoder/partition_strategy.c" + "${AOM_ROOT}/av1/encoder/pass2_strategy.h" + "${AOM_ROOT}/av1/encoder/pass2_strategy.c" + "${AOM_ROOT}/av1/encoder/pickcdef.c" + "${AOM_ROOT}/av1/encoder/pickcdef.h" + "${AOM_ROOT}/av1/encoder/picklpf.c" + "${AOM_ROOT}/av1/encoder/picklpf.h" + "${AOM_ROOT}/av1/encoder/pickrst.c" + "${AOM_ROOT}/av1/encoder/pickrst.h" + "${AOM_ROOT}/av1/encoder/ratectrl.c" + "${AOM_ROOT}/av1/encoder/ratectrl.h" + "${AOM_ROOT}/av1/encoder/rc_utils.h" + "${AOM_ROOT}/av1/encoder/rd.c" + "${AOM_ROOT}/av1/encoder/rd.h" + "${AOM_ROOT}/av1/encoder/rdopt.c" + "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c" + "${AOM_ROOT}/av1/encoder/nonrd_opt.c" + "${AOM_ROOT}/av1/encoder/nonrd_opt.h" + "${AOM_ROOT}/av1/encoder/rdopt.h" + "${AOM_ROOT}/av1/encoder/rdopt_data_defs.h" + "${AOM_ROOT}/av1/encoder/rdopt_utils.h" + "${AOM_ROOT}/av1/encoder/reconinter_enc.c" + "${AOM_ROOT}/av1/encoder/reconinter_enc.h" + "${AOM_ROOT}/av1/encoder/segmentation.c" + "${AOM_ROOT}/av1/encoder/segmentation.h" + "${AOM_ROOT}/av1/encoder/sorting_network.h" + "${AOM_ROOT}/av1/encoder/speed_features.c" + "${AOM_ROOT}/av1/encoder/speed_features.h" + "${AOM_ROOT}/av1/encoder/superres_scale.c" + "${AOM_ROOT}/av1/encoder/superres_scale.h" + "${AOM_ROOT}/av1/encoder/svc_layercontext.c" + "${AOM_ROOT}/av1/encoder/svc_layercontext.h" + "${AOM_ROOT}/av1/encoder/temporal_filter.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.h" + "${AOM_ROOT}/av1/encoder/thirdpass.c" + "${AOM_ROOT}/av1/encoder/thirdpass.h" + "${AOM_ROOT}/av1/encoder/tokenize.c" + "${AOM_ROOT}/av1/encoder/tokenize.h" + "${AOM_ROOT}/av1/encoder/tpl_model.c" + "${AOM_ROOT}/av1/encoder/tpl_model.h" + "${AOM_ROOT}/av1/encoder/tx_search.c" + "${AOM_ROOT}/av1/encoder/tx_search.h" + "${AOM_ROOT}/av1/encoder/txb_rdopt.c" + "${AOM_ROOT}/av1/encoder/txb_rdopt.h" + "${AOM_ROOT}/av1/encoder/txb_rdopt_utils.h" + "${AOM_ROOT}/av1/encoder/intra_mode_search.c" + "${AOM_ROOT}/av1/encoder/intra_mode_search.h" + "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h" + "${AOM_ROOT}/av1/encoder/wedge_utils.c" + "${AOM_ROOT}/av1/encoder/var_based_part.c" + "${AOM_ROOT}/av1/encoder/var_based_part.h" + "${AOM_ROOT}/av1/encoder/av1_noise_estimate.c" + "${AOM_ROOT}/av1/encoder/av1_noise_estimate.h" + "${AOM_ROOT}/third_party/fastfeat/fast.c" + "${AOM_ROOT}/third_party/fastfeat/fast.h" + "${AOM_ROOT}/third_party/fastfeat/fast_9.c" + "${AOM_ROOT}/third_party/fastfeat/nonmax.c" + "${AOM_ROOT}/third_party/vector/vector.c" + "${AOM_ROOT}/third_party/vector/vector.h" + "${AOM_ROOT}/av1/encoder/dwt.c" + "${AOM_ROOT}/av1/encoder/dwt.h") + +list(APPEND AOM_AV1_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h" + "${AOM_ROOT}/av1/common/x86/cdef_block_sse2.c" + "${AOM_ROOT}/av1/common/x86/cfl_sse2.c" + "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c" + "${AOM_ROOT}/av1/common/x86/convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c") + 
+list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h" + "${AOM_ROOT}/av1/common/x86/cdef_block_ssse3.c" + "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c" + "${AOM_ROOT}/av1/common/x86/resize_ssse3.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h" + "${AOM_ROOT}/av1/common/x86/cdef_block_sse4.c" + "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c" + "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c" + "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c" + "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c" + "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h" + "${AOM_ROOT}/av1/common/x86/cdef_block_avx2.c" + "${AOM_ROOT}/av1/common/x86/cfl_avx2.c" + "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c" + "${AOM_ROOT}/av1/common/x86/convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c" + "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" + "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c" + "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") + +list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm" + "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h" + "${AOM_ROOT}/av1/encoder/x86/av1_k_means_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/error_intrin_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c" + "${AOM_ROOT}/av1/encoder/x86/ml_sse3.h") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSSE3 + "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_ssse3.c") + +list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_k_means_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c" + 
"${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/ml_avx2.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.h" + "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD + "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SVE + "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 + "${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_NEON + "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" + "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h" + "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c" + "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c" + "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c" + "${AOM_ROOT}/av1/common/arm/cdef_block_neon.c" + "${AOM_ROOT}/av1/common/arm/cfl_neon.c" + "${AOM_ROOT}/av1/common/arm/compound_convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon.h" + "${AOM_ROOT}/av1/common/arm/highbd_inv_txfm_neon.c" + "${AOM_ROOT}/av1/common/arm/reconinter_neon.c" + "${AOM_ROOT}/av1/common/arm/reconintra_neon.c" + "${AOM_ROOT}/av1/common/arm/resize_neon.c" + "${AOM_ROOT}/av1/common/arm/selfguided_neon.c" + "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c" + "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_NEON_DOTPROD + "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_dotprod.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon_dotprod.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_NEON_I8MM + "${AOM_ROOT}/av1/common/arm/compound_convolve_neon_i8mm.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon_i8mm.c" + "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_SVE + "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 + "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c") + +if(CONFIG_TUNE_VMAF) + list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/tune_vmaf.c" + "${AOM_ROOT}/av1/encoder/tune_vmaf.h") +endif() + +if(CONFIG_TUNE_BUTTERAUGLI) + list(APPEND AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/encoder/tune_butteraugli.c" + "${AOM_ROOT}/av1/encoder/tune_butteraugli.h") +endif() + +if(CONFIG_SALIENCY_MAP) + list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/saliency_map.c" + "${AOM_ROOT}/av1/encoder/saliency_map.h") +endif() + +if(CONFIG_OPTICAL_FLOW_API) + list(APPEND AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/encoder/sparse_linear_solver.c" + "${AOM_ROOT}/av1/encoder/sparse_linear_solver.h" + 
"${AOM_ROOT}/av1/encoder/optical_flow.c" + "${AOM_ROOT}/av1/encoder/optical_flow.h") +endif() + +if(CONFIG_AV1_TEMPORAL_DENOISING) + list(APPEND AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/encoder/av1_temporal_denoiser.c" + "${AOM_ROOT}/av1/encoder/av1_temporal_denoiser.h") + + list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/av1/encoder/x86/av1_temporal_denoiser_sse2.c") + + list(APPEND AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c") +endif() + +if(CONFIG_AV1_HIGHBITDEPTH) + list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c" + "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c" + "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c") + + list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c") + + list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_warp_affine_avx2.c") + + list(APPEND AOM_AV1_COMMON_INTRIN_NEON + "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/highbd_convolve_horiz_rs_neon.c" + "${AOM_ROOT}/av1/common/arm/highbd_convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/highbd_convolve_scale_neon.c" + "${AOM_ROOT}/av1/common/arm/highbd_reconinter_neon.c" + "${AOM_ROOT}/av1/common/arm/highbd_reconintra_neon.c" + "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c" + "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c") + + list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c") + + list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c") + + list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_avx2.c") + + list(APPEND AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/neon/highbd_pickrst_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/highbd_rdopt_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/highbd_temporal_filter_neon.c") +endif() + +if(CONFIG_ACCOUNTING) + list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c" + "${AOM_ROOT}/av1/decoder/accounting.h") +endif() + +if(CONFIG_INSPECTION) + list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/inspection.c" + "${AOM_ROOT}/av1/decoder/inspection.h") +endif() + +if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c") +endif() + +if(CONFIG_REALTIME_ONLY) + list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c") + + list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c") + + list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c") + + list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/encoder/cnn.c" + "${AOM_ROOT}/av1/encoder/cnn.h" + "${AOM_ROOT}/av1/encoder/firstpass.c" + "${AOM_ROOT}/av1/encoder/firstpass.h" + "${AOM_ROOT}/av1/encoder/global_motion.c" + 
"${AOM_ROOT}/av1/encoder/global_motion.h" + "${AOM_ROOT}/av1/encoder/global_motion_facade.c" + "${AOM_ROOT}/av1/encoder/global_motion_facade.h" + "${AOM_ROOT}/av1/encoder/gop_structure.c" + "${AOM_ROOT}/av1/encoder/gop_structure.h" + "${AOM_ROOT}/av1/encoder/misc_model_weights.h" + "${AOM_ROOT}/av1/encoder/partition_cnn_weights.h" + "${AOM_ROOT}/av1/encoder/partition_model_weights.h" + "${AOM_ROOT}/av1/encoder/pass2_strategy.c" + "${AOM_ROOT}/av1/encoder/picklpf.h" + "${AOM_ROOT}/av1/encoder/pickrst.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.h" + "${AOM_ROOT}/av1/encoder/tpl_model.c" + "${AOM_ROOT}/av1/encoder/tpl_model.h") +endif() + +# Setup AV1 common/decoder/encoder targets. The libaom target must exist before +# this function is called. +function(setup_av1_targets) + add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_av1_common) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + + if(CONFIG_AV1_DECODER) + add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + endif() + + if(CONFIG_AV1_ENCODER) + add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + endif() + + if(HAVE_SSE2) + require_compiler_flag_nomsvc("-msse2" NO) + add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_SSE2") + if(CONFIG_AV1_DECODER) + if(AOM_AV1_DECODER_ASM_SSE2) + add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2") + endif() + + if(AOM_AV1_DECODER_INTRIN_SSE2) + add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder" + "AOM_AV1_DECODER_INTRIN_SSE2") + endif() + endif() + + if(CONFIG_AV1_ENCODER) + add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2") + add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SSE2") + endif() + endif() + + if(HAVE_SSE3) + require_compiler_flag_nomsvc("-msse3" NO) + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-msse3" "sse3" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SSE3") + endif() + endif() + + if(HAVE_SSSE3) + require_compiler_flag_nomsvc("-mssse3" NO) + add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_SSSE3") + + if(CONFIG_AV1_DECODER) + if(AOM_AV1_DECODER_INTRIN_SSSE3) + add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder" + "AOM_AV1_DECODER_INTRIN_SSSE3") + endif() + endif() + if(CONFIG_AV1_ENCODER) + if(AOM_AV1_ENCODER_INTRIN_SSSE3) + add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SSSE3") + endif() + endif() + endif() + + if(HAVE_SSE4_1) + require_compiler_flag_nomsvc("-msse4.1" NO) + add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_SSE4_1") + + if(CONFIG_AV1_ENCODER) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + add_asm_library("aom_av1_encoder_ssse3" + "AOM_AV1_ENCODER_ASM_SSSE3_X86_64") + endif() + + if(AOM_AV1_ENCODER_INTRIN_SSE4_1) + add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SSE4_1") + endif() + endif() + endif() + + if(HAVE_SSE4_2) + require_compiler_flag_nomsvc("-msse4.2" NO) + 
if(CONFIG_AV1_ENCODER) + if(AOM_AV1_ENCODER_INTRIN_SSE4_2) + add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SSE4_2") + endif() + endif() + endif() + + if(HAVE_AVX2) + require_compiler_flag_nomsvc("-mavx2" NO) + add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_AVX2") + + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_AVX2") + endif() + endif() + + if(HAVE_NEON) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_av1_common" "AOM_AV1_COMMON_INTRIN_NEON") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_NEON") + endif() + endif() + + if(HAVE_ARM_CRC32) + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_ARM_CRC32_FLAG}" "arm_crc32" + "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_ARM_CRC32") + endif() + endif() + + if(HAVE_NEON_DOTPROD) + add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod" + "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_NEON_DOTPROD") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod" + "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD") + endif() + endif() + + if(HAVE_NEON_I8MM) + add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm" + "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_NEON_I8MM") + endif() + + if(HAVE_SVE) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_SVE") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SVE") + endif() + endif() + + if(HAVE_VSX) + if(AOM_AV1_COMMON_INTRIN_VSX) + add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_VSX") + endif() + endif() + + # Pass the new lib targets up to the parent scope instance of + # $AOM_LIB_TARGETS. + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c new file mode 100644 index 0000000000..9214feb4e6 --- /dev/null +++ b/third_party/aom/av1/av1_cx_iface.c @@ -0,0 +1,4712 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include +#include +#include + +#include "aom_mem/aom_mem.h" +#include "config/aom_config.h" +#include "config/aom_version.h" + +#include "aom_ports/mem_ops.h" + +#include "aom/aom_encoder.h" +#include "aom/internal/aom_codec_internal.h" + +#include "aom_dsp/flow_estimation/flow_estimation.h" + +#include "av1/av1_cx_iface.h" +#include "av1/av1_iface_common.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/encoder_utils.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/external_partition.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/rc_utils.h" +#include "av1/arg_defs.h" + +#include "common/args_helper.h" + +struct av1_extracfg { + int cpu_used; + unsigned int enable_auto_alt_ref; + unsigned int enable_auto_bwd_ref; + unsigned int noise_sensitivity; + unsigned int sharpness; + unsigned int static_thresh; + unsigned int row_mt; + unsigned int fp_mt; + unsigned int tile_columns; // log2 number of tile columns + unsigned int tile_rows; // log2 number of tile rows + unsigned int enable_tpl_model; + unsigned int enable_keyframe_filtering; + unsigned int arnr_max_frames; + unsigned int arnr_strength; + unsigned int min_gf_interval; + unsigned int max_gf_interval; + unsigned int gf_min_pyr_height; + unsigned int gf_max_pyr_height; + aom_tune_metric tuning; + const char *vmaf_model_path; + const char *partition_info_path; + unsigned int enable_rate_guide_deltaq; + const char *rate_distribution_info; + aom_dist_metric dist_metric; + unsigned int cq_level; // constrained quality level + unsigned int rc_max_intra_bitrate_pct; + unsigned int rc_max_inter_bitrate_pct; + unsigned int gf_cbr_boost_pct; + unsigned int lossless; + unsigned int enable_cdef; + unsigned int enable_restoration; + unsigned int force_video_mode; + unsigned int enable_obmc; + unsigned int disable_trellis_quant; + unsigned int enable_qm; + unsigned int qm_y; + unsigned int qm_u; + unsigned int qm_v; + unsigned int qm_min; + unsigned int qm_max; + unsigned int num_tg; + unsigned int mtu_size; + + aom_timing_info_type_t timing_info_type; + unsigned int frame_parallel_decoding_mode; + int enable_dual_filter; + unsigned int enable_chroma_deltaq; + AQ_MODE aq_mode; + DELTAQ_MODE deltaq_mode; + int deltaq_strength; + int deltalf_mode; + unsigned int frame_periodic_boost; + aom_bit_depth_t bit_depth; + aom_tune_content content; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + aom_chroma_sample_position_t chroma_sample_position; + int color_range; + int render_width; + int render_height; + aom_superblock_size_t superblock_size; + unsigned int single_tile_decoding; + int error_resilient_mode; + int s_frame_mode; + + int film_grain_test_vector; + const char *film_grain_table_filename; + unsigned int motion_vector_unit_test; +#if CONFIG_FPMT_TEST + unsigned int fpmt_unit_test; +#endif + unsigned int cdf_update_mode; + int enable_rect_partitions; // enable rectangular partitions for sequence + int enable_ab_partitions; // enable AB partitions for sequence + int enable_1to4_partitions; // enable 1:4 and 4:1 partitions for sequence + int min_partition_size; // min partition size [4,8,16,32,64,128] + int max_partition_size; // max partition size [4,8,16,32,64,128] + int enable_intra_edge_filter; // enable intra-edge filter for sequence + int enable_order_hint; // enable order hint for sequence + int enable_tx64; // enable 64-pt 
transform usage for sequence + int enable_flip_idtx; // enable flip and identity transform types + int enable_rect_tx; // enable rectangular transform usage for sequence + int enable_dist_wtd_comp; // enable dist wtd compound for sequence + int max_reference_frames; // maximum number of references per frame + int enable_reduced_reference_set; // enable reduced set of references + int enable_ref_frame_mvs; // sequence level + int allow_ref_frame_mvs; // frame level + int enable_masked_comp; // enable masked compound for sequence + int enable_onesided_comp; // enable one sided compound for sequence + int enable_interintra_comp; // enable interintra compound for sequence + int enable_smooth_interintra; // enable smooth interintra mode usage + int enable_diff_wtd_comp; // enable diff-wtd compound usage + int enable_interinter_wedge; // enable interinter-wedge compound usage + int enable_interintra_wedge; // enable interintra-wedge compound usage + int enable_global_motion; // enable global motion usage for sequence + int enable_warped_motion; // sequence level + int allow_warped_motion; // frame level + int enable_filter_intra; // enable filter intra for sequence + int enable_smooth_intra; // enable smooth intra modes for sequence + int enable_paeth_intra; // enable Paeth intra mode for sequence + int enable_cfl_intra; // enable CFL uv intra mode for sequence + int enable_directional_intra; // enable directional modes for sequence + int enable_diagonal_intra; // enable D45 to D203 intra modes for sequence + int enable_superres; + int enable_overlay; // enable overlay for filtered arf frames + int enable_palette; + int enable_intrabc; + int enable_angle_delta; +#if CONFIG_DENOISE + float noise_level; + int noise_block_size; + int enable_dnl_denoising; +#endif + + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; + int reduced_tx_type_set; + int use_intra_dct_only; + int use_inter_dct_only; + int use_intra_default_tx_only; + int enable_tx_size_search; + int quant_b_adapt; + unsigned int vbr_corpus_complexity_lap; + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + // Bit mask to specify which tier each of the 32 possible operating points + // conforms to. + unsigned int tier_mask; + // min_cr / 100 is the target minimum compression ratio for each frame. + unsigned int min_cr; + COST_UPDATE_TYPE coeff_cost_upd_freq; + COST_UPDATE_TYPE mode_cost_upd_freq; + COST_UPDATE_TYPE mv_cost_upd_freq; + COST_UPDATE_TYPE dv_cost_upd_freq; + unsigned int ext_tile_debug; + unsigned int sb_multipass_unit_test; + // Total number of passes. If this number is -1, then we assume passes = 1 or + // 2 (passes = 1 if pass == AOM_RC_ONE_PASS and passes = 2 otherwise). + int passes; + int fwd_kf_dist; + + LOOPFILTER_CONTROL loopfilter_control; + // Indicates if the application of post-processing filters should be skipped + // on reconstructed frame. + unsigned int skip_postproc_filtering; + // the name of the second pass output file when passes > 2 + const char *two_pass_output; + const char *second_pass_log; + // Automatically determine whether to disable several intra tools + // when "--deltaq-mode=3" is true. + // Default as 0. + // When set to 1, the encoder will analyze the reconstruction quality + // as compared to the source image in the preprocessing pass. 
+ // If the recontruction quality is considered high enough, we disable + // the following intra coding tools, for better encoding speed: + // "--enable_smooth_intra", + // "--enable_paeth_intra", + // "--enable_cfl_intra", + // "--enable_diagonal_intra". + int auto_intra_tools_off; + int strict_level_conformance; + int kf_max_pyr_height; + int sb_qp_sweep; +}; + +#if CONFIG_REALTIME_ONLY +// Settings changed for realtime only build: +// cpu_used: 7 +// enable_tpl_model: 0 +// enable_restoration: 0 +// enable_obmc: 0 +// deltaq_mode: NO_DELTA_Q +// enable_global_motion usage: 0 +// enable_warped_motion at sequence level: 0 +// allow_warped_motion at frame level: 0 +// coeff_cost_upd_freq: COST_UPD_OFF +// mode_cost_upd_freq: COST_UPD_OFF +// mv_cost_upd_freq: COST_UPD_OFF +// dv_cost_upd_freq: COST_UPD_OFF +static const struct av1_extracfg default_extra_cfg = { + 7, // cpu_used + 1, // enable_auto_alt_ref + 0, // enable_auto_bwd_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 1, // row_mt + 0, // fp_mt + 0, // tile_columns + 0, // tile_rows + 0, // enable_tpl_model + 1, // enable_keyframe_filtering + 7, // arnr_max_frames + 5, // arnr_strength + 0, // min_gf_interval; 0 -> default decision + 0, // max_gf_interval; 0 -> default decision + 0, // gf_min_pyr_height + 5, // gf_max_pyr_height + AOM_TUNE_PSNR, // tuning + "/usr/local/share/model/vmaf_v0.6.1.json", // VMAF model path + ".", // partition info path + 0, // enable rate guide deltaq + "./rate_map.txt", // rate distribution input + AOM_DIST_METRIC_PSNR, // dist_metric + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // lossless + 1, // enable_cdef + 0, // enable_restoration + 0, // force_video_mode + 0, // enable_obmc + 3, // disable_trellis_quant + 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v + DEFAULT_QM_FIRST, // qm_min + DEFAULT_QM_LAST, // qm_max + 1, // max number of tile groups + 0, // mtu_size + AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream + 0, // frame_parallel_decoding_mode + 1, // enable dual filter + 0, // enable delta quant in chroma planes + NO_AQ, // aq_mode + NO_DELTA_Q, // deltaq_mode + 100, // deltaq_strength + 0, // delta lf mode + 0, // frame_periodic_boost + AOM_BITS_8, // Bit depth + AOM_CONTENT_DEFAULT, // content + AOM_CICP_CP_UNSPECIFIED, // CICP color primaries + AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics + AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients + AOM_CSP_UNKNOWN, // chroma sample position + 0, // color range + 0, // render width + 0, // render height + AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size + 1, // this depends on large_scale_tile. + 0, // error_resilient_mode off by default. + 0, // s_frame_mode off by default. 
+ 0, // film_grain_test_vector + NULL, // film_grain_table_filename + 0, // motion_vector_unit_test +#if CONFIG_FPMT_TEST + 0, // fpmt_unit_test +#endif + 1, // CDF update mode + 1, // enable rectangular partitions + 1, // enable ab shape partitions + 1, // enable 1:4 and 4:1 partitions + 4, // min_partition_size + 128, // max_partition_size + 1, // enable intra edge filter + 1, // frame order hint + 1, // enable 64-pt transform usage + 1, // enable flip and identity transform + 1, // enable rectangular transform usage + 1, // dist-wtd compound + 7, // max_reference_frames + 0, // enable_reduced_reference_set + 1, // enable_ref_frame_mvs sequence level + 1, // allow ref_frame_mvs frame level + 1, // enable masked compound at sequence level + 1, // enable one sided compound at sequence level + 1, // enable interintra compound at sequence level + 1, // enable smooth interintra mode + 1, // enable difference-weighted compound + 1, // enable interinter wedge compound + 1, // enable interintra wedge compound + 0, // enable_global_motion usage + 0, // enable_warped_motion at sequence level + 0, // allow_warped_motion at frame level + 1, // enable filter intra at sequence level + 1, // enable smooth intra modes usage for sequence + 1, // enable Paeth intra mode usage for sequence + 1, // enable CFL uv intra mode usage for sequence + 1, // enable directional intra mode usage for sequence + 1, // enable D45 to D203 intra mode usage for sequence + 1, // superres + 1, // enable overlay + 1, // enable palette + 1, // enable intrabc + 1, // enable angle delta +#if CONFIG_DENOISE + 0, // noise_level + 32, // noise_block_size + 1, // enable_dnl_denoising +#endif + 0, // chroma_subsampling_x + 0, // chroma_subsampling_y + 0, // reduced_tx_type_set + 0, // use_intra_dct_only + 0, // use_inter_dct_only + 0, // use_intra_default_tx_only + 1, // enable_tx_size_search + 0, // quant_b_adapt + 0, // vbr_corpus_complexity_lap + { + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + }, // target_seq_level_idx + 0, // tier_mask + 0, // min_cr + COST_UPD_OFF, // coeff_cost_upd_freq + COST_UPD_OFF, // mode_cost_upd_freq + COST_UPD_OFF, // mv_cost_upd_freq + COST_UPD_OFF, // dv_cost_upd_freq + 0, // ext_tile_debug + 0, // sb_multipass_unit_test + -1, // passes + -1, // fwd_kf_dist + LOOPFILTER_ALL, // loopfilter_control + 0, // skip_postproc_filtering + NULL, // two_pass_output + NULL, // second_pass_log + 0, // auto_intra_tools_off + 0, // strict_level_conformance + -1, // kf_max_pyr_height + 0, // sb_qp_sweep +}; +#else +static const struct av1_extracfg default_extra_cfg = { + 0, // cpu_used + 1, // enable_auto_alt_ref + 0, // enable_auto_bwd_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 1, // row_mt + 0, // fp_mt + 0, // tile_columns + 0, // tile_rows + 1, // enable_tpl_model + 1, // enable_keyframe_filtering + 7, // arnr_max_frames + 5, // arnr_strength + 0, // min_gf_interval; 0 -> default decision + 0, // max_gf_interval; 0 -> default decision + 0, // gf_min_pyr_height + 5, // gf_max_pyr_height + AOM_TUNE_PSNR, // tuning + 
"/usr/local/share/model/vmaf_v0.6.1.json", // VMAF model path + ".", // partition info path + 0, // enable rate guide deltaq + "./rate_map.txt", // rate distribution input + AOM_DIST_METRIC_PSNR, // dist_metric + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // lossless + 1, // enable_cdef + 1, // enable_restoration + 0, // force_video_mode + 1, // enable_obmc + 3, // disable_trellis_quant + 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v + DEFAULT_QM_FIRST, // qm_min + DEFAULT_QM_LAST, // qm_max + 1, // max number of tile groups + 0, // mtu_size + AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream + 0, // frame_parallel_decoding_mode + 1, // enable dual filter + 0, // enable delta quant in chroma planes + NO_AQ, // aq_mode + DELTA_Q_OBJECTIVE, // deltaq_mode + 100, // deltaq_strength + 0, // delta lf mode + 0, // frame_periodic_boost + AOM_BITS_8, // Bit depth + AOM_CONTENT_DEFAULT, // content + AOM_CICP_CP_UNSPECIFIED, // CICP color primaries + AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics + AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients + AOM_CSP_UNKNOWN, // chroma sample position + 0, // color range + 0, // render width + 0, // render height + AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size + 1, // this depends on large_scale_tile. + 0, // error_resilient_mode off by default. + 0, // s_frame_mode off by default. + 0, // film_grain_test_vector + NULL, // film_grain_table_filename + 0, // motion_vector_unit_test +#if CONFIG_FPMT_TEST + 0, // fpmt_unit_test +#endif + 1, // CDF update mode + 1, // enable rectangular partitions + 1, // enable ab shape partitions + 1, // enable 1:4 and 4:1 partitions + 4, // min_partition_size + 128, // max_partition_size + 1, // enable intra edge filter + 1, // frame order hint + 1, // enable 64-pt transform usage + 1, // enable flip and identity transform + 1, // enable rectangular transform usage + 1, // dist-wtd compound + 7, // max_reference_frames + 0, // enable_reduced_reference_set + 1, // enable_ref_frame_mvs sequence level + 1, // allow ref_frame_mvs frame level + 1, // enable masked compound at sequence level + 1, // enable one sided compound at sequence level + 1, // enable interintra compound at sequence level + 1, // enable smooth interintra mode + 1, // enable difference-weighted compound + 1, // enable interinter wedge compound + 1, // enable interintra wedge compound + 1, // enable_global_motion usage + 1, // enable_warped_motion at sequence level + 1, // allow_warped_motion at frame level + 1, // enable filter intra at sequence level + 1, // enable smooth intra modes usage for sequence + 1, // enable Paeth intra mode usage for sequence + 1, // enable CFL uv intra mode usage for sequence + 1, // enable directional intra mode usage for sequence + 1, // enable D45 to D203 intra mode usage for sequence + 1, // superres + 1, // enable overlay + 1, // enable palette + 1, // enable intrabc + 1, // enable angle delta +#if CONFIG_DENOISE + 0, // noise_level + 32, // noise_block_size + 1, // enable_dnl_denoising +#endif + 0, // chroma_subsampling_x + 0, // chroma_subsampling_y + 0, // reduced_tx_type_set + 0, // use_intra_dct_only + 0, // use_inter_dct_only + 0, // use_intra_default_tx_only + 1, // enable_tx_size_search + 0, // quant_b_adapt + 0, // vbr_corpus_complexity_lap + { + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, 
SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+    SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+    SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+    SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+    SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+    SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+  },               // target_seq_level_idx
+  0,               // tier_mask
+  0,               // min_cr
+  COST_UPD_SB,     // coeff_cost_upd_freq
+  COST_UPD_SB,     // mode_cost_upd_freq
+  COST_UPD_SB,     // mv_cost_upd_freq
+  COST_UPD_SB,     // dv_cost_upd_freq
+  0,               // ext_tile_debug
+  0,               // sb_multipass_unit_test
+  -1,              // passes
+  -1,              // fwd_kf_dist
+  LOOPFILTER_ALL,  // loopfilter_control
+  0,               // skip_postproc_filtering
+  NULL,            // two_pass_output
+  NULL,            // second_pass_log
+  0,               // auto_intra_tools_off
+  0,               // strict_level_conformance
+  -1,              // kf_max_pyr_height
+  0,               // sb_qp_sweep
+};
+#endif
+
+struct aom_codec_alg_priv {
+  aom_codec_priv_t base;
+  aom_codec_enc_cfg_t cfg;
+  struct av1_extracfg extra_cfg;
+  aom_rational64_t timestamp_ratio;
+  aom_codec_pts_t pts_offset;
+  unsigned char pts_offset_initialized;
+  AV1EncoderConfig oxcf;
+  AV1_PRIMARY *ppi;
+  unsigned char *cx_data;
+  size_t cx_data_sz;
+  size_t pending_cx_data_sz;
+  aom_image_t preview_img;
+  aom_enc_frame_flags_t next_frame_flags;
+  aom_codec_pkt_list_decl(256) pkt_list;
+  unsigned int fixed_kf_cntr;
+  // BufferPool that holds all reference frames.
+  BufferPool *buffer_pool;
+
+  // lookahead instance variables
+  BufferPool *buffer_pool_lap;
+  FIRSTPASS_STATS *frame_stats_buffer;
+  // Number of stats buffers required for look ahead
+  int num_lap_buffers;
+  STATS_BUFFER_CTX stats_buf_context;
+  bool monochrome_on_init;
+};
+
+static INLINE int gcd(int64_t a, int b) {
+  int remainder;
+  while (b > 0) {
+    remainder = (int)(a % b);
+    a = b;
+    b = remainder;
+  }
+
+  return (int)a;
+}
+
+static void reduce_ratio(aom_rational64_t *ratio) {
+  const int denom = gcd(ratio->num, ratio->den);
+  ratio->num /= denom;
+  ratio->den /= denom;
+}
+
+// Called by encoder_encode() only. Must not be called by encoder_init()
+// because the `error` parameter will be destroyed by aom_codec_enc_init_ver()
+// after encoder_init() returns an error. See the "IMPORTANT" comment in
+// aom_codec_enc_init_ver().
+static aom_codec_err_t update_error_state(
+    aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
+  const aom_codec_err_t res = error->error_code;
+
+  if (res != AOM_CODEC_OK)
+    ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+
+  return res;
+}
+
+// This function deep copies a string src to *dst. For default string we will
+// use a string literal, and otherwise we will allocate memory for the string.
+static aom_codec_err_t allocate_and_set_string(const char *src,
+                                               const char *default_src,
+                                               const char **dst,
+                                               char *err_detail) {
+  if (!src) {
+    snprintf(err_detail, ARG_ERR_MSG_MAX_LEN,
+             "Null pointer given to a string parameter.");
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  if (*dst && strcmp(src, *dst) == 0) return AOM_CODEC_OK;
+  // If the input is exactly the same as default, we will use the string
+  // literal, so do not free here.
+ if (*dst != default_src) { + aom_free((void *)*dst); + } + + if (default_src && strcmp(src, default_src) == 0) { + // default_src should be a string literal + *dst = default_src; + } else { + size_t len = strlen(src) + 1; + char *tmp = aom_malloc(len * sizeof(*tmp)); + if (!tmp) { + snprintf(err_detail, ARG_ERR_MSG_MAX_LEN, + "Failed to allocate memory for copying parameters."); + return AOM_CODEC_MEM_ERROR; + } + memcpy(tmp, src, len); + *dst = tmp; + } + return 0; +} + +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return AOM_CODEC_INVALID_PARAM; \ + } while (0) + +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_HI(p, memb, hi) \ + do { \ + if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_BOOL(p, memb) \ + do { \ + if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ + } while (0) + +static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, + const aom_codec_enc_cfg_t *cfg, + const struct av1_extracfg *extra_cfg) { + RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available + RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available + RANGE_CHECK_HI(cfg, g_forced_max_frame_width, 65536); // 16 bits available + RANGE_CHECK_HI(cfg, g_forced_max_frame_height, 65536); // 16 bits available + if (cfg->g_forced_max_frame_width) { + RANGE_CHECK_HI(cfg, g_w, cfg->g_forced_max_frame_width); + } + if (cfg->g_forced_max_frame_height) { + RANGE_CHECK_HI(cfg, g_h, cfg->g_forced_max_frame_height); + } + // To avoid integer overflows when multiplying width by height (or values + // derived from width and height) using the int type, impose a maximum frame + // area (width * height) of 2^30. + const unsigned int max_frame_width = + cfg->g_forced_max_frame_width ? cfg->g_forced_max_frame_width : cfg->g_w; + const unsigned int max_frame_height = cfg->g_forced_max_frame_height + ? 
cfg->g_forced_max_frame_height + : cfg->g_h; + const int64_t max_frame_area = (int64_t)max_frame_width * max_frame_height; + if (max_frame_area > (1 << 30)) { + ERROR("max_frame_area out of range [..2^30]"); + } + RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); + RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); + RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1); + + RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); + RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); + RANGE_CHECK_BOOL(extra_cfg, lossless); + RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1); + RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1); + RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1); + RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_usage, AOM_USAGE_REALTIME, AOM_USAGE_REALTIME); +#else + RANGE_CHECK_HI(cfg, g_usage, AOM_USAGE_ALL_INTRA); +#endif + RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); + RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q); + RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO); + RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); + RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_THIRD_PASS); + RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); + if (cfg->g_usage == AOM_USAGE_ALL_INTRA) { + RANGE_CHECK_HI(cfg, g_lag_in_frames, 0); + RANGE_CHECK_HI(cfg, kf_max_dist, 0); + } + RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1); + RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1); + if (extra_cfg->max_gf_interval > 0) { + RANGE_CHECK(extra_cfg, max_gf_interval, + AOMMAX(2, extra_cfg->min_gf_interval), (MAX_LAG_BUFFERS - 1)); + } + RANGE_CHECK_HI(extra_cfg, gf_min_pyr_height, 5); + RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 5); + if (extra_cfg->gf_min_pyr_height > extra_cfg->gf_max_pyr_height) { + ERROR( + "gf_min_pyr_height must be less than or equal to " + "gf_max_pyramid_height"); + } + + RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1); + RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK_HI(cfg, rc_superres_mode, AOM_SUPERRES_AUTO); + RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK(cfg, rc_superres_kf_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK(cfg, rc_superres_qthresh, 1, 63); + RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63); + RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2); + + RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2); +#if CONFIG_FPMT_TEST + RANGE_CHECK_HI(extra_cfg, fpmt_unit_test, 1); +#endif + RANGE_CHECK_HI(extra_cfg, sb_multipass_unit_test, 1); + RANGE_CHECK_HI(extra_cfg, ext_tile_debug, 1); + RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 1); + RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2); + RANGE_CHECK(extra_cfg, cpu_used, 0, + (cfg->g_usage == AOM_USAGE_REALTIME) ? 
11 : 9); + RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); + RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64, + AOM_SUPERBLOCK_SIZE_DYNAMIC); + RANGE_CHECK_HI(cfg, large_scale_tile, 1); + RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1); + RANGE_CHECK_HI(extra_cfg, enable_rate_guide_deltaq, 1); + + RANGE_CHECK_HI(extra_cfg, row_mt, 1); + RANGE_CHECK_HI(extra_cfg, fp_mt, 1); + + RANGE_CHECK_HI(extra_cfg, tile_columns, 6); + RANGE_CHECK_HI(extra_cfg, tile_rows, 6); + + RANGE_CHECK_HI(cfg, monochrome, 1); + + if (cfg->large_scale_tile && extra_cfg->aq_mode) + ERROR( + "Adaptive quantization are not supported in large scale tile " + "coding."); + + RANGE_CHECK_HI(extra_cfg, sharpness, 7); + RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15); + RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); + RANGE_CHECK_HI(extra_cfg, cq_level, 63); + RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12); + RANGE_CHECK(cfg, g_input_bit_depth, 8, 12); + RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1); + + if (cfg->g_pass >= AOM_RC_SECOND_PASS) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + const FIRSTPASS_STATS *stats; + + if (cfg->rc_twopass_stats_in.buf == NULL) + ERROR("rc_twopass_stats_in.buf not set."); + + if (cfg->rc_twopass_stats_in.sz % packet_sz) + ERROR("rc_twopass_stats_in.sz indicates truncated packet."); + + if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz) + ERROR("rc_twopass_stats_in requires at least two packets."); + + stats = + (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1; + + if ((int)(stats->count + 0.5) != n_packets - 1) + ERROR("rc_twopass_stats_in missing EOS stats packet"); + } + + if (extra_cfg->passes != -1 && cfg->g_pass == AOM_RC_ONE_PASS && + extra_cfg->passes != 1) { + ERROR("One pass encoding but passes != 1."); + } + + if (extra_cfg->passes != -1 && (int)cfg->g_pass > extra_cfg->passes) { + ERROR("Current pass is larger than total number of passes."); + } + + if (cfg->g_profile == (unsigned int)PROFILE_1 && cfg->monochrome) { + ERROR("Monochrome is not supported in profile 1"); + } + + if (cfg->g_profile <= (unsigned int)PROFILE_1 && + cfg->g_bit_depth > AOM_BITS_10) { + ERROR("Codec bit-depth 12 not supported in profile < 2"); + } + if (cfg->g_profile <= (unsigned int)PROFILE_1 && + cfg->g_input_bit_depth > 10) { + ERROR("Source bit-depth 12 not supported in profile < 2"); + } + + if (cfg->rc_end_usage == AOM_Q) { + RANGE_CHECK_HI(cfg, use_fixed_qp_offsets, 1); + } else { + if (cfg->use_fixed_qp_offsets > 0) { + ERROR("--use_fixed_qp_offsets can only be used with --end-usage=q"); + } + } + + RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709, + AOM_CICP_CP_EBU_3213); // Need to check range more precisely to + // check for reserved values? 
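+  /* Editorial note (illustrative sketch, not part of upstream libaom): the
+   * CICP color metadata validated by these checks is normally supplied by
+   * the application through the public control API before encoding, e.g.,
+   * assuming an initialized encoder context `codec`:
+   *
+   *   aom_codec_control(&codec, AV1E_SET_COLOR_PRIMARIES, AOM_CICP_CP_BT_709);
+   *   aom_codec_control(&codec, AV1E_SET_TRANSFER_CHARACTERISTICS,
+   *                     AOM_CICP_TC_SRGB);
+   *   aom_codec_control(&codec, AV1E_SET_MATRIX_COEFFICIENTS,
+   *                     AOM_CICP_MC_BT_709);
+   *   aom_codec_control(&codec, AV1E_SET_COLOR_RANGE, 1);  // full range
+   */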
+ RANGE_CHECK(extra_cfg, transfer_characteristics, AOM_CICP_TC_BT_709, + AOM_CICP_TC_HLG); + RANGE_CHECK(extra_cfg, matrix_coefficients, AOM_CICP_MC_IDENTITY, + AOM_CICP_MC_ICTCP); + RANGE_CHECK(extra_cfg, color_range, 0, 1); + + /* Average corpus complexity is supported only in the case of single pass + * VBR*/ + if (cfg->g_pass == AOM_RC_ONE_PASS && cfg->rc_end_usage == AOM_VBR) + RANGE_CHECK_HI(extra_cfg, vbr_corpus_complexity_lap, + MAX_VBR_CORPUS_COMPLEXITY); + else if (extra_cfg->vbr_corpus_complexity_lap != 0) + ERROR( + "VBR corpus complexity is supported only in the case of single pass " + "VBR mode."); + +#if !CONFIG_TUNE_BUTTERAUGLI + if (extra_cfg->tuning == AOM_TUNE_BUTTERAUGLI) { + ERROR( + "This error may be related to the wrong configuration options: try to " + "set -DCONFIG_TUNE_BUTTERAUGLI=1 at the time CMake is run."); + } +#endif + +#if !CONFIG_TUNE_VMAF + if (extra_cfg->tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + extra_cfg->tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { + ERROR( + "This error may be related to the wrong configuration options: try to " + "set -DCONFIG_TUNE_VMAF=1 at the time CMake is run."); + } +#endif + + RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_VMAF_SALIENCY_MAP); + + RANGE_CHECK(extra_cfg, dist_metric, AOM_DIST_METRIC_PSNR, + AOM_DIST_METRIC_QM_PSNR); + + RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED, + AOM_TIMING_DEC_MODEL); + + RANGE_CHECK(extra_cfg, film_grain_test_vector, 0, 16); + + if (extra_cfg->lossless) { + if (extra_cfg->aq_mode != 0) + ERROR("Only --aq_mode=0 can be used with --lossless=1."); + if (extra_cfg->enable_chroma_deltaq) + ERROR("Only --enable_chroma_deltaq=0 can be used with --lossless=1."); + } + + RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7); + RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1); + RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1); + RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1); + + RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3); + RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 3); + RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 3); + RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3); + RANGE_CHECK(extra_cfg, dv_cost_upd_freq, 0, 3); + + RANGE_CHECK(extra_cfg, min_partition_size, 4, 128); + RANGE_CHECK(extra_cfg, max_partition_size, 4, 128); + RANGE_CHECK_HI(extra_cfg, min_partition_size, extra_cfg->max_partition_size); + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + const int level_idx = extra_cfg->target_seq_level_idx[i]; + if (!is_valid_seq_level_idx(level_idx) && + level_idx != SEQ_LEVEL_KEEP_STATS) { + ERROR("Target sequence level index is invalid"); + } + } + + RANGE_CHECK(extra_cfg, deltaq_strength, 0, 1000); + RANGE_CHECK_HI(extra_cfg, loopfilter_control, 3); + RANGE_CHECK_BOOL(extra_cfg, skip_postproc_filtering); + RANGE_CHECK_HI(extra_cfg, enable_cdef, 2); + RANGE_CHECK_BOOL(extra_cfg, auto_intra_tools_off); + RANGE_CHECK_BOOL(extra_cfg, strict_level_conformance); + RANGE_CHECK_BOOL(extra_cfg, sb_qp_sweep); + + RANGE_CHECK(extra_cfg, kf_max_pyr_height, -1, 5); + if (extra_cfg->kf_max_pyr_height != -1 && + extra_cfg->kf_max_pyr_height < (int)extra_cfg->gf_min_pyr_height) { + ERROR( + "The value of kf-max-pyr-height should not be smaller than " + "gf-min-pyr-height"); + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx, + const aom_image_t *img) { + switch (img->fmt) { + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_NV12: + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_YV1216: + case AOM_IMG_FMT_I42016: 
break; + case AOM_IMG_FMT_I444: + case AOM_IMG_FMT_I44416: + if (ctx->cfg.g_profile == (unsigned int)PROFILE_0 && + !ctx->cfg.monochrome) { + ERROR("Invalid image format. I444 images not supported in profile."); + } + break; + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I42216: + if (ctx->cfg.g_profile != (unsigned int)PROFILE_2) { + ERROR("Invalid image format. I422 images not supported in profile."); + } + break; + default: + ERROR( + "Invalid image format. Only YV12, NV12, I420, I422, I444 images are " + "supported."); + break; + } + + if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h) + ERROR("Image size must match encoder init configuration size"); + +#if CONFIG_TUNE_BUTTERAUGLI + if (ctx->extra_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { + if (img->bit_depth > 8) { + ERROR("Only 8 bit depth images supported in tune=butteraugli mode."); + } + if (img->mc != 0 && img->mc != AOM_CICP_MC_BT_709 && + img->mc != AOM_CICP_MC_BT_601 && img->mc != AOM_CICP_MC_BT_470_B_G) { + ERROR( + "Only BT.709 and BT.601 matrix coefficients supported in " + "tune=butteraugli mode. Identity matrix is treated as BT.601."); + } + } +#endif + + return AOM_CODEC_OK; +} + +static int get_image_bps(const aom_image_t *img) { + switch (img->fmt) { + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_NV12: + case AOM_IMG_FMT_I420: return 12; + case AOM_IMG_FMT_I422: return 16; + case AOM_IMG_FMT_I444: return 24; + case AOM_IMG_FMT_YV1216: + case AOM_IMG_FMT_I42016: return 24; + case AOM_IMG_FMT_I42216: return 32; + case AOM_IMG_FMT_I44416: return 48; + default: assert(0 && "Invalid image format"); break; + } + return 0; +} + +// Set appropriate options to disable frame super-resolution. +static void disable_superres(SuperResCfg *const superres_cfg) { + superres_cfg->superres_mode = AOM_SUPERRES_NONE; + superres_cfg->superres_scale_denominator = SCALE_NUMERATOR; + superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR; + superres_cfg->superres_qthresh = 255; + superres_cfg->superres_kf_qthresh = 255; +} + +static void update_default_encoder_config(const cfg_options_t *cfg, + struct av1_extracfg *extra_cfg) { + extra_cfg->enable_cdef = (cfg->disable_cdef == 0) ? 1 : 0; + extra_cfg->enable_restoration = (cfg->disable_lr == 0); + extra_cfg->superblock_size = + (cfg->super_block_size == 64) ? AOM_SUPERBLOCK_SIZE_64X64 + : (cfg->super_block_size == 128) ? 
AOM_SUPERBLOCK_SIZE_128X128 + : AOM_SUPERBLOCK_SIZE_DYNAMIC; + extra_cfg->enable_warped_motion = (cfg->disable_warp_motion == 0); + extra_cfg->enable_dist_wtd_comp = (cfg->disable_dist_wtd_comp == 0); + extra_cfg->enable_diff_wtd_comp = (cfg->disable_diff_wtd_comp == 0); + extra_cfg->enable_dual_filter = (cfg->disable_dual_filter == 0); + extra_cfg->enable_angle_delta = (cfg->disable_intra_angle_delta == 0); + extra_cfg->enable_rect_partitions = (cfg->disable_rect_partition_type == 0); + extra_cfg->enable_ab_partitions = (cfg->disable_ab_partition_type == 0); + extra_cfg->enable_1to4_partitions = (cfg->disable_1to4_partition_type == 0); + extra_cfg->max_partition_size = cfg->max_partition_size; + extra_cfg->min_partition_size = cfg->min_partition_size; + extra_cfg->enable_intra_edge_filter = (cfg->disable_intra_edge_filter == 0); + extra_cfg->enable_tx64 = (cfg->disable_tx_64x64 == 0); + extra_cfg->enable_flip_idtx = (cfg->disable_flip_idtx == 0); + extra_cfg->enable_masked_comp = (cfg->disable_masked_comp == 0); + extra_cfg->enable_interintra_comp = (cfg->disable_inter_intra_comp == 0); + extra_cfg->enable_smooth_interintra = (cfg->disable_smooth_inter_intra == 0); + extra_cfg->enable_interinter_wedge = (cfg->disable_inter_inter_wedge == 0); + extra_cfg->enable_interintra_wedge = (cfg->disable_inter_intra_wedge == 0); + extra_cfg->enable_global_motion = (cfg->disable_global_motion == 0); + extra_cfg->enable_filter_intra = (cfg->disable_filter_intra == 0); + extra_cfg->enable_smooth_intra = (cfg->disable_smooth_intra == 0); + extra_cfg->enable_paeth_intra = (cfg->disable_paeth_intra == 0); + extra_cfg->enable_cfl_intra = (cfg->disable_cfl == 0); + extra_cfg->enable_obmc = (cfg->disable_obmc == 0); + extra_cfg->enable_palette = (cfg->disable_palette == 0); + extra_cfg->enable_intrabc = (cfg->disable_intrabc == 0); + extra_cfg->disable_trellis_quant = cfg->disable_trellis_quant; + extra_cfg->allow_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0); + extra_cfg->enable_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0); + extra_cfg->enable_onesided_comp = (cfg->disable_one_sided_comp == 0); + extra_cfg->enable_reduced_reference_set = cfg->reduced_reference_set; + extra_cfg->reduced_tx_type_set = cfg->reduced_tx_type_set; +} + +static void set_encoder_config(AV1EncoderConfig *oxcf, + const aom_codec_enc_cfg_t *cfg, + struct av1_extracfg *extra_cfg) { + if (cfg->encoder_cfg.init_by_cfg_file) { + update_default_encoder_config(&cfg->encoder_cfg, extra_cfg); + } + + TuneCfg *const tune_cfg = &oxcf->tune_cfg; + + FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; + + TileConfig *const tile_cfg = &oxcf->tile_cfg; + + ResizeCfg *const resize_cfg = &oxcf->resize_cfg; + + GFConfig *const gf_cfg = &oxcf->gf_cfg; + + PartitionCfg *const part_cfg = &oxcf->part_cfg; + + IntraModeCfg *const intra_mode_cfg = &oxcf->intra_mode_cfg; + + TxfmSizeTypeCfg *const txfm_cfg = &oxcf->txfm_cfg; + + CompoundTypeCfg *const comp_type_cfg = &oxcf->comp_type_cfg; + + SuperResCfg *const superres_cfg = &oxcf->superres_cfg; + + KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; + + DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; + + RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + QuantizationCfg *const q_cfg = &oxcf->q_cfg; + + ColorCfg *const color_cfg = &oxcf->color_cfg; + + InputCfg *const input_cfg = &oxcf->input_cfg; + + AlgoCfg *const algo_cfg = &oxcf->algo_cfg; + + ToolCfg *const tool_cfg = &oxcf->tool_cfg; + + const int is_vbr = cfg->rc_end_usage == AOM_VBR; + oxcf->profile = cfg->g_profile; + 
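+  /* Editorial note (illustrative sketch, not part of upstream libaom): this
+   * function maps the public aom_codec_enc_cfg_t plus the accumulated codec
+   * controls (av1_extracfg) onto the internal AV1EncoderConfig. A caller
+   * typically reaches it indirectly through the public encoder API, e.g.:
+   *
+   *   aom_codec_enc_cfg_t cfg;
+   *   aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
+   *                                AOM_USAGE_GOOD_QUALITY);
+   *   cfg.g_w = 1280;
+   *   cfg.g_h = 720;
+   *   cfg.rc_target_bitrate = 1000;  // kbit/s
+   *   aom_codec_ctx_t codec;
+   *   aom_codec_enc_init(&codec, aom_codec_av1_cx(), &cfg, 0);
+   */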
oxcf->max_threads = (int)cfg->g_threads; + + switch (cfg->g_usage) { + case AOM_USAGE_REALTIME: oxcf->mode = REALTIME; break; + case AOM_USAGE_ALL_INTRA: oxcf->mode = ALLINTRA; break; + default: oxcf->mode = GOOD; break; + } + + // Set frame-dimension related configuration. + frm_dim_cfg->width = cfg->g_w; + frm_dim_cfg->height = cfg->g_h; + frm_dim_cfg->forced_max_frame_width = cfg->g_forced_max_frame_width; + frm_dim_cfg->forced_max_frame_height = cfg->g_forced_max_frame_height; + frm_dim_cfg->render_width = extra_cfg->render_width; + frm_dim_cfg->render_height = extra_cfg->render_height; + + // Set input video related configuration. + input_cfg->input_bit_depth = cfg->g_input_bit_depth; + // guess a frame rate if out of whack, use 30 + input_cfg->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; + if (cfg->g_pass >= AOM_RC_SECOND_PASS) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + input_cfg->limit = n_packets - 1; + } else { + input_cfg->limit = cfg->g_limit; + } + input_cfg->chroma_subsampling_x = extra_cfg->chroma_subsampling_x; + input_cfg->chroma_subsampling_y = extra_cfg->chroma_subsampling_y; + if (input_cfg->init_framerate > 180) { + input_cfg->init_framerate = 30; + dec_model_cfg->timing_info_present = 0; + } + + // Set Decoder model configuration. + if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL || + extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { + dec_model_cfg->timing_info_present = 1; + dec_model_cfg->timing_info.num_units_in_display_tick = cfg->g_timebase.num; + dec_model_cfg->timing_info.time_scale = cfg->g_timebase.den; + dec_model_cfg->timing_info.num_ticks_per_picture = 1; + } else { + dec_model_cfg->timing_info_present = 0; + } + if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) { + dec_model_cfg->timing_info.equal_picture_interval = 1; + dec_model_cfg->decoder_model_info_present_flag = 0; + dec_model_cfg->display_model_info_present_flag = 1; + } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { + dec_model_cfg->num_units_in_decoding_tick = cfg->g_timebase.num; + dec_model_cfg->timing_info.equal_picture_interval = 0; + dec_model_cfg->decoder_model_info_present_flag = 1; + dec_model_cfg->display_model_info_present_flag = 1; + } + + oxcf->pass = cfg->g_pass; + // For backward compatibility, assume that if extra_cfg->passes==-1, then + // passes = 1 or 2. + if (extra_cfg->passes == -1) { + if (cfg->g_pass == AOM_RC_ONE_PASS) { + oxcf->passes = 1; + } else { + oxcf->passes = 2; + } + } else { + oxcf->passes = extra_cfg->passes; + } + + // Set Rate Control configuration. + rc_cfg->max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; + rc_cfg->max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct; + rc_cfg->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct; + rc_cfg->mode = cfg->rc_end_usage; + rc_cfg->min_cr = extra_cfg->min_cr; + rc_cfg->best_allowed_q = + extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer); + rc_cfg->worst_allowed_q = + extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer); + rc_cfg->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level); + rc_cfg->under_shoot_pct = cfg->rc_undershoot_pct; + rc_cfg->over_shoot_pct = cfg->rc_overshoot_pct; + rc_cfg->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; + rc_cfg->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; + rc_cfg->optimal_buffer_level_ms = is_vbr ? 
60000 : cfg->rc_buf_optimal_sz; + // Convert target bandwidth from Kbit/s to Bit/s + rc_cfg->target_bandwidth = 1000 * cfg->rc_target_bitrate; + rc_cfg->drop_frames_water_mark = cfg->rc_dropframe_thresh; + rc_cfg->vbr_corpus_complexity_lap = extra_cfg->vbr_corpus_complexity_lap; + rc_cfg->vbrbias = cfg->rc_2pass_vbr_bias_pct; + rc_cfg->vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; + rc_cfg->vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; + + // Set Toolset related configuration. + tool_cfg->bit_depth = cfg->g_bit_depth; + tool_cfg->cdef_control = (CDEF_CONTROL)extra_cfg->enable_cdef; + tool_cfg->enable_restoration = + (cfg->g_usage == AOM_USAGE_REALTIME) ? 0 : extra_cfg->enable_restoration; + tool_cfg->force_video_mode = extra_cfg->force_video_mode; + tool_cfg->enable_palette = extra_cfg->enable_palette; + // FIXME(debargha): Should this be: + // tool_cfg->enable_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs & + // extra_cfg->enable_order_hint ? + // Disallow using temporal MVs while large_scale_tile = 1. + tool_cfg->enable_ref_frame_mvs = + extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile; + tool_cfg->superblock_size = extra_cfg->superblock_size; + tool_cfg->enable_monochrome = cfg->monochrome; + tool_cfg->full_still_picture_hdr = cfg->full_still_picture_hdr != 0; + tool_cfg->enable_dual_filter = extra_cfg->enable_dual_filter; + tool_cfg->enable_order_hint = extra_cfg->enable_order_hint; + tool_cfg->enable_interintra_comp = extra_cfg->enable_interintra_comp; + tool_cfg->ref_frame_mvs_present = + extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint; + + // Explicitly disable global motion in a few cases: + // * For realtime mode, we never search global motion, and disabling + // it here prevents later code from allocating buffers we don't need + // * For large scale tile mode, some of the intended use cases expect + // all frame headers to be identical. This breaks if global motion is + // used, since global motion data is stored in the frame header. + // eg, see test/lightfield_test.sh, which checks that all frame headers + // are the same. + tool_cfg->enable_global_motion = extra_cfg->enable_global_motion && + cfg->g_usage != AOM_USAGE_REALTIME && + !cfg->large_scale_tile; + + tool_cfg->error_resilient_mode = + cfg->g_error_resilient | extra_cfg->error_resilient_mode; + tool_cfg->frame_parallel_decoding_mode = + extra_cfg->frame_parallel_decoding_mode; + + // Set Quantization related configuration. + q_cfg->using_qm = extra_cfg->enable_qm; + q_cfg->qm_minlevel = extra_cfg->qm_min; + q_cfg->qm_maxlevel = extra_cfg->qm_max; + q_cfg->quant_b_adapt = extra_cfg->quant_b_adapt; + q_cfg->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq; + q_cfg->aq_mode = extra_cfg->aq_mode; + q_cfg->deltaq_mode = extra_cfg->deltaq_mode; + q_cfg->deltaq_strength = extra_cfg->deltaq_strength; + q_cfg->use_fixed_qp_offsets = + cfg->use_fixed_qp_offsets && (rc_cfg->mode == AOM_Q); + q_cfg->enable_hdr_deltaq = + (q_cfg->deltaq_mode == DELTA_Q_HDR) && + (cfg->g_bit_depth == AOM_BITS_10) && + (extra_cfg->color_primaries == AOM_CICP_CP_BT_2020); + + tool_cfg->enable_deltalf_mode = + (q_cfg->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode; + + // Set cost update frequency configuration. + oxcf->cost_upd_freq.coeff = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq; + oxcf->cost_upd_freq.mode = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq; + // Avoid MV cost update for allintra encoding mode. + oxcf->cost_upd_freq.mv = (cfg->kf_max_dist != 0) + ? 
(COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq + : COST_UPD_OFF; + oxcf->cost_upd_freq.dv = (COST_UPDATE_TYPE)extra_cfg->dv_cost_upd_freq; + + // Set frame resize mode configuration. + resize_cfg->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode; + resize_cfg->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator; + resize_cfg->resize_kf_scale_denominator = + (uint8_t)cfg->rc_resize_kf_denominator; + if (resize_cfg->resize_mode == RESIZE_FIXED && + resize_cfg->resize_scale_denominator == SCALE_NUMERATOR && + resize_cfg->resize_kf_scale_denominator == SCALE_NUMERATOR) + resize_cfg->resize_mode = RESIZE_NONE; + + // Set encoder algorithm related configuration. + algo_cfg->enable_overlay = extra_cfg->enable_overlay; + algo_cfg->disable_trellis_quant = extra_cfg->disable_trellis_quant; + algo_cfg->sharpness = extra_cfg->sharpness; + algo_cfg->arnr_max_frames = extra_cfg->arnr_max_frames; + algo_cfg->arnr_strength = extra_cfg->arnr_strength; + algo_cfg->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode; + // TODO(any): Fix and Enable TPL for resize-mode > 0 + algo_cfg->enable_tpl_model = + resize_cfg->resize_mode ? 0 : extra_cfg->enable_tpl_model; + algo_cfg->loopfilter_control = extra_cfg->loopfilter_control; + algo_cfg->skip_postproc_filtering = extra_cfg->skip_postproc_filtering; + + // Set two-pass stats configuration. + oxcf->twopass_stats_in = cfg->rc_twopass_stats_in; + + if (extra_cfg->two_pass_output) + oxcf->two_pass_output = extra_cfg->two_pass_output; + + oxcf->second_pass_log = extra_cfg->second_pass_log; + + // Set Key frame configuration. + kf_cfg->fwd_kf_enabled = cfg->fwd_kf_enabled; + kf_cfg->auto_key = + cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; + kf_cfg->key_freq_min = cfg->kf_min_dist; + kf_cfg->key_freq_max = cfg->kf_max_dist; + kf_cfg->sframe_dist = cfg->sframe_dist; + kf_cfg->sframe_mode = cfg->sframe_mode; + kf_cfg->enable_sframe = extra_cfg->s_frame_mode; + kf_cfg->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering; + kf_cfg->fwd_kf_dist = extra_cfg->fwd_kf_dist; + // Disable key frame filtering in all intra mode. + if (cfg->kf_max_dist == 0) { + kf_cfg->enable_keyframe_filtering = 0; + } + kf_cfg->enable_intrabc = extra_cfg->enable_intrabc; + + oxcf->speed = extra_cfg->cpu_used; + // TODO(yunqingwang, any) In REALTIME mode, 1080p performance at speed 5 & 6 + // is quite bad. Force to use speed 7 for now. Will investigate it when we + // work on rd path optimization later. + if (oxcf->mode == REALTIME && AOMMIN(cfg->g_w, cfg->g_h) >= 1080 && + oxcf->speed < 7) + oxcf->speed = 7; + + // Set Color related configuration. + color_cfg->color_primaries = extra_cfg->color_primaries; + color_cfg->transfer_characteristics = extra_cfg->transfer_characteristics; + color_cfg->matrix_coefficients = extra_cfg->matrix_coefficients; + color_cfg->color_range = extra_cfg->color_range; + color_cfg->chroma_sample_position = extra_cfg->chroma_sample_position; + + // Set Group of frames configuration. + // Force lag_in_frames to 0 for REALTIME mode + gf_cfg->lag_in_frames = (oxcf->mode == REALTIME) + ? 0 + : clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS); + gf_cfg->enable_auto_arf = extra_cfg->enable_auto_alt_ref; + gf_cfg->enable_auto_brf = extra_cfg->enable_auto_bwd_ref; + gf_cfg->min_gf_interval = extra_cfg->min_gf_interval; + gf_cfg->max_gf_interval = extra_cfg->max_gf_interval; + gf_cfg->gf_min_pyr_height = extra_cfg->gf_min_pyr_height; + gf_cfg->gf_max_pyr_height = extra_cfg->gf_max_pyr_height; + + // Set tune related configuration. 
+ tune_cfg->tuning = extra_cfg->tuning; + tune_cfg->vmaf_model_path = extra_cfg->vmaf_model_path; + tune_cfg->content = extra_cfg->content; + if (cfg->large_scale_tile) { + tune_cfg->film_grain_test_vector = 0; + tune_cfg->film_grain_table_filename = NULL; + } else { + tune_cfg->film_grain_test_vector = extra_cfg->film_grain_test_vector; + tune_cfg->film_grain_table_filename = extra_cfg->film_grain_table_filename; + } + tune_cfg->dist_metric = extra_cfg->dist_metric; +#if CONFIG_DENOISE + oxcf->noise_level = extra_cfg->noise_level; + oxcf->noise_block_size = extra_cfg->noise_block_size; + oxcf->enable_dnl_denoising = extra_cfg->enable_dnl_denoising; +#endif + +#if CONFIG_AV1_TEMPORAL_DENOISING + // Temporal denoiser is for nonrd pickmode so disable it for speed < 7. + // Also disable it for speed 7 for now since it needs to be modified for + // the check_partition_merge_mode feature. + if (cfg->g_bit_depth == AOM_BITS_8 && oxcf->speed > 7) { + oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; + } else { + oxcf->noise_sensitivity = 0; + } +#endif + // Set Tile related configuration. + tile_cfg->num_tile_groups = extra_cfg->num_tg; + // In large-scale tile encoding mode, num_tile_groups is always 1. + if (cfg->large_scale_tile) tile_cfg->num_tile_groups = 1; + tile_cfg->mtu = extra_cfg->mtu_size; + tile_cfg->enable_large_scale_tile = cfg->large_scale_tile; + tile_cfg->enable_single_tile_decoding = + (tile_cfg->enable_large_scale_tile) ? extra_cfg->single_tile_decoding : 0; + tile_cfg->tile_columns = extra_cfg->tile_columns; + tile_cfg->tile_rows = extra_cfg->tile_rows; + tile_cfg->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS); + tile_cfg->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS); + for (int i = 0; i < tile_cfg->tile_width_count; i++) { + tile_cfg->tile_widths[i] = cfg->tile_widths[i]; + } + for (int i = 0; i < tile_cfg->tile_height_count; i++) { + tile_cfg->tile_heights[i] = cfg->tile_heights[i]; + } + tile_cfg->enable_ext_tile_debug = extra_cfg->ext_tile_debug; + + if (tile_cfg->enable_large_scale_tile) { + // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or + // AOM_SUPERBLOCK_SIZE_128X128 while tile_cfg->enable_large_scale_tile = 1. + // If superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to + // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile). + if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 && + extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128) + tool_cfg->superblock_size = AOM_SUPERBLOCK_SIZE_64X64; + } + + // Set reference frame related configuration. + oxcf->ref_frm_cfg.max_reference_frames = extra_cfg->max_reference_frames; + oxcf->ref_frm_cfg.enable_reduced_reference_set = + extra_cfg->enable_reduced_reference_set; + oxcf->ref_frm_cfg.enable_onesided_comp = extra_cfg->enable_onesided_comp; + + oxcf->row_mt = extra_cfg->row_mt; + oxcf->fp_mt = extra_cfg->fp_mt; + + // Set motion mode related configuration. + oxcf->motion_mode_cfg.enable_obmc = extra_cfg->enable_obmc; + oxcf->motion_mode_cfg.enable_warped_motion = extra_cfg->enable_warped_motion; +#if !CONFIG_REALTIME_ONLY + if (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7 && + oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) { + // TODO(marpan): warped motion is causing a crash for RT mode with screen + // in nonrd (speed >= 7), for non-realtime build. + // Re-enable/allow when the issue is fixed. 
+ oxcf->motion_mode_cfg.enable_warped_motion = 0; + oxcf->motion_mode_cfg.allow_warped_motion = 0; + } else { + oxcf->motion_mode_cfg.allow_warped_motion = + (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion); + } +#else + oxcf->motion_mode_cfg.allow_warped_motion = + (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7) + ? false + : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion); +#endif + + // Set partition related configuration. + part_cfg->enable_rect_partitions = extra_cfg->enable_rect_partitions; + part_cfg->enable_ab_partitions = extra_cfg->enable_ab_partitions; + part_cfg->enable_1to4_partitions = extra_cfg->enable_1to4_partitions; + part_cfg->min_partition_size = extra_cfg->min_partition_size; + part_cfg->max_partition_size = extra_cfg->max_partition_size; + + // Set intra mode configuration. + intra_mode_cfg->enable_angle_delta = extra_cfg->enable_angle_delta; + intra_mode_cfg->enable_intra_edge_filter = + extra_cfg->enable_intra_edge_filter; + intra_mode_cfg->enable_filter_intra = extra_cfg->enable_filter_intra; + intra_mode_cfg->enable_smooth_intra = extra_cfg->enable_smooth_intra; + intra_mode_cfg->enable_paeth_intra = extra_cfg->enable_paeth_intra; + intra_mode_cfg->enable_cfl_intra = extra_cfg->enable_cfl_intra; + intra_mode_cfg->enable_directional_intra = + extra_cfg->enable_directional_intra; + intra_mode_cfg->enable_diagonal_intra = extra_cfg->enable_diagonal_intra; + intra_mode_cfg->auto_intra_tools_off = extra_cfg->auto_intra_tools_off; + + // Set transform size/type configuration. + txfm_cfg->enable_tx64 = extra_cfg->enable_tx64; + txfm_cfg->enable_flip_idtx = extra_cfg->enable_flip_idtx; + txfm_cfg->enable_rect_tx = extra_cfg->enable_rect_tx; + txfm_cfg->reduced_tx_type_set = extra_cfg->reduced_tx_type_set; + txfm_cfg->use_intra_dct_only = extra_cfg->use_intra_dct_only; + txfm_cfg->use_inter_dct_only = extra_cfg->use_inter_dct_only; + txfm_cfg->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only; + txfm_cfg->enable_tx_size_search = extra_cfg->enable_tx_size_search; + + // Set compound type configuration. + comp_type_cfg->enable_dist_wtd_comp = + extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint; + comp_type_cfg->enable_masked_comp = extra_cfg->enable_masked_comp; + comp_type_cfg->enable_diff_wtd_comp = + extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp; + comp_type_cfg->enable_interinter_wedge = + extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge; + comp_type_cfg->enable_smooth_interintra = + extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra; + comp_type_cfg->enable_interintra_wedge = + extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge; + + // Set Super-resolution mode configuration. 
+ if (extra_cfg->lossless || cfg->large_scale_tile) { + disable_superres(superres_cfg); + } else { + superres_cfg->superres_mode = cfg->rc_superres_mode; + superres_cfg->superres_scale_denominator = + (uint8_t)cfg->rc_superres_denominator; + superres_cfg->superres_kf_scale_denominator = + (uint8_t)cfg->rc_superres_kf_denominator; + superres_cfg->superres_qthresh = + av1_quantizer_to_qindex(cfg->rc_superres_qthresh); + superres_cfg->superres_kf_qthresh = + av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh); + if (superres_cfg->superres_mode == AOM_SUPERRES_FIXED && + superres_cfg->superres_scale_denominator == SCALE_NUMERATOR && + superres_cfg->superres_kf_scale_denominator == SCALE_NUMERATOR) { + disable_superres(superres_cfg); + } + if (superres_cfg->superres_mode == AOM_SUPERRES_QTHRESH && + superres_cfg->superres_qthresh == 255 && + superres_cfg->superres_kf_qthresh == 255) { + disable_superres(superres_cfg); + } + } + + superres_cfg->enable_superres = + (superres_cfg->superres_mode != AOM_SUPERRES_NONE) && + extra_cfg->enable_superres; + if (!superres_cfg->enable_superres) { + disable_superres(superres_cfg); + } + + if (input_cfg->limit == 1) { + // still picture mode, display model and timing is meaningless + dec_model_cfg->display_model_info_present_flag = 0; + dec_model_cfg->timing_info_present = 0; + } + + oxcf->save_as_annexb = cfg->save_as_annexb; + + // Set unit test related configuration. + oxcf->unit_test_cfg.motion_vector_unit_test = + extra_cfg->motion_vector_unit_test; + oxcf->unit_test_cfg.sb_multipass_unit_test = + extra_cfg->sb_multipass_unit_test; + + oxcf->border_in_pixels = + av1_get_enc_border_size(av1_is_resize_needed(oxcf), + (oxcf->kf_cfg.key_freq_max == 0), BLOCK_128X128); + memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx, + sizeof(oxcf->target_seq_level_idx)); + oxcf->tier_mask = extra_cfg->tier_mask; + + oxcf->partition_info_path = extra_cfg->partition_info_path; + + oxcf->enable_rate_guide_deltaq = extra_cfg->enable_rate_guide_deltaq; + oxcf->rate_distribution_info = extra_cfg->rate_distribution_info; + + oxcf->strict_level_conformance = extra_cfg->strict_level_conformance; + + oxcf->kf_max_pyr_height = extra_cfg->kf_max_pyr_height; + + oxcf->sb_qp_sweep = extra_cfg->sb_qp_sweep; +} + +AV1EncoderConfig av1_get_encoder_config(const aom_codec_enc_cfg_t *cfg) { + AV1EncoderConfig oxcf; + struct av1_extracfg extra_cfg = default_extra_cfg; + set_encoder_config(&oxcf, cfg, &extra_cfg); + return oxcf; +} + +static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, + const aom_codec_enc_cfg_t *cfg) { + aom_codec_err_t res; + int force_key = 0; + + if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { + if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS) + ERROR("Cannot change width or height after initialization"); + // Note: function encoder_set_config() is allowed to be called multiple + // times. However, when the original frame width or height is less than two + // times of the new frame width or height, a forced key frame should be + // used. To make sure the correct detection of a forced key frame, we need + // to update the frame width and height only when the actual encoding is + // performed. cpi->last_coded_width and cpi->last_coded_height are used to + // track the actual coded frame size. 
+ if (ctx->ppi->cpi->last_coded_width && ctx->ppi->cpi->last_coded_height && + (!valid_ref_frame_size(ctx->ppi->cpi->last_coded_width, + ctx->ppi->cpi->last_coded_height, cfg->g_w, + cfg->g_h) || + ((int)cfg->g_w > ctx->ppi->cpi->last_coded_width) || + ((int)cfg->g_h > ctx->ppi->cpi->last_coded_height))) { + force_key = 1; + } + } + + if (ctx->monochrome_on_init && cfg->monochrome == 0) { + // TODO(aomedia:3465): Allow this case to work without requiring re-init + // of encoder. + ERROR("Cannot change to monochrome = 0 after init with monochrome"); + } + + // Prevent increasing lag_in_frames. This check is stricter than it needs + // to be -- the limit is not increasing past the first lag_in_frames + // value, but we don't track the initial config, only the last successful + // config. + if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames) + ERROR("Cannot increase lag_in_frames"); + // Prevent changing lag_in_frames if Lookahead Processing is enabled + if (cfg->g_lag_in_frames != ctx->cfg.g_lag_in_frames && + ctx->num_lap_buffers > 0) + ERROR("Cannot change lag_in_frames if LAP is enabled"); + + res = validate_config(ctx, cfg, &ctx->extra_cfg); + + if (res == AOM_CODEC_OK) { + ctx->cfg = *cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + // On profile change, request a key frame + force_key |= ctx->ppi->seq_params.profile != ctx->oxcf.profile; + bool is_sb_size_changed = false; + av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); + for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { + av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, + is_sb_size_changed); + } + if (ctx->ppi->cpi_lap != NULL) { + av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); + } + } + + if (force_key) ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF; + + return res; +} + +static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) { + return av1_get_global_headers(ctx->ppi); +} + +static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = av1_get_quantizer(ctx->ppi->cpi); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_quantizer64(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = av1_qindex_to_quantizer(av1_get_quantizer(ctx->ppi->cpi)); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_loopfilter_level(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = ctx->ppi->cpi->common.lf.filter_level[0]; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = ctx->ppi->p_rc.baseline_gf_interval; + return AOM_CODEC_OK; +} + +static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx, + const struct av1_extracfg *extra_cfg) { + const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); + if (res == AOM_CODEC_OK) { + ctx->extra_cfg = *extra_cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + av1_check_fpmt_config(ctx->ppi, &ctx->oxcf); + bool is_sb_size_changed = false; + av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); + for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { + AV1_COMP *const cpi = 
ctx->ppi->parallel_cpi[i]; + struct aom_internal_error_info *const error = cpi->common.error; + if (setjmp(error->jmp)) { + error->setjmp = 0; + return error->error_code; + } + error->setjmp = 1; + av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed); + error->setjmp = 0; + } + if (ctx->ppi->cpi_lap != NULL) { + AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap; + struct aom_internal_error_info *const error = cpi_lap->common.error; + if (setjmp(error->jmp)) { + error->setjmp = 0; + return error->error_code; + } + error->setjmp = 1; + av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed); + error->setjmp = 0; + } + } + return res; +} + +static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cpu_used = CAST(AOME_SET_CPUUSED, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_auto_alt_ref = CAST(AOME_SET_ENABLEAUTOALTREF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_sensitivity = CAST(AV1E_SET_NOISE_SENSITIVITY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_sharpness(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.sharpness = CAST(AOME_SET_SHARPNESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.static_thresh = CAST(AOME_SET_STATIC_THRESHOLD, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int row_mt = CAST(AV1E_SET_ROW_MT, args); + if (row_mt == ctx->extra_cfg.row_mt) return AOM_CODEC_OK; + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.row_mt = row_mt; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int tile_columns = CAST(AV1E_SET_TILE_COLUMNS, args); + if (tile_columns == ctx->extra_cfg.tile_columns) return AOM_CODEC_OK; + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_columns = tile_columns; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int tile_rows = CAST(AV1E_SET_TILE_ROWS, args); + if (tile_rows == ctx->extra_cfg.tile_rows) return AOM_CODEC_OK; + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_rows = tile_rows; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_tpl_model(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const unsigned int tpl_model_arg = CAST(AV1E_SET_ENABLE_TPL_MODEL, args); +#if CONFIG_REALTIME_ONLY + if (tpl_model_arg) { + ERROR("TPL model can't be turned on in realtime 
only build."); + } +#endif + extra_cfg.enable_tpl_model = tpl_model_arg; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_keyframe_filtering( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_keyframe_filtering = + CAST(AV1E_SET_ENABLE_KEYFRAME_FILTERING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_max_frames = CAST(AOME_SET_ARNR_MAXFRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_arnr_strength(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_strength = CAST(AOME_SET_ARNR_STRENGTH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tuning(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tuning = CAST(AOME_SET_TUNING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_cq_level(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cq_level = CAST(AOME_SET_CQ_LEVEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rc_max_intra_bitrate_pct( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_intra_bitrate_pct = + CAST(AOME_SET_MAX_INTRA_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rc_max_inter_bitrate_pct( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_inter_bitrate_pct = + CAST(AOME_SET_MAX_INTER_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.gf_cbr_boost_pct = CAST(AV1E_SET_GF_CBR_BOOST_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_lossless(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.lossless = CAST(AV1E_SET_LOSSLESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_cdef = CAST(AV1E_SET_ENABLE_CDEF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const unsigned int restoration_arg = CAST(AV1E_SET_ENABLE_RESTORATION, args); +#if CONFIG_REALTIME_ONLY + if (restoration_arg) { + ERROR("Restoration can't be turned on in realtime only build."); + } +#endif + extra_cfg.enable_restoration = restoration_arg; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_force_video_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.force_video_mode = CAST(AV1E_SET_FORCE_VIDEO_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx, + va_list args) { + 
struct av1_extracfg extra_cfg = ctx->extra_cfg; + const unsigned int obmc_arg = CAST(AV1E_SET_ENABLE_OBMC, args); +#if CONFIG_REALTIME_ONLY + if (obmc_arg) { + ERROR("OBMC can't be enabled in realtime only build."); + } +#endif + extra_cfg.enable_obmc = obmc_arg; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.disable_trellis_quant = CAST(AV1E_SET_DISABLE_TRELLIS_QUANT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_qm(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_y = CAST(AV1E_SET_QM_Y, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_u(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_u = CAST(AV1E_SET_QM_U, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_v(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_v = CAST(AV1E_SET_QM_V, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_min(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_min = CAST(AV1E_SET_QM_MIN, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_qm_max(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_num_tg(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.num_tg = CAST(AV1E_SET_NUM_TG, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mtu(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mtu_size = CAST(AV1E_SET_MTU, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.timing_info_type = CAST(AV1E_SET_TIMING_INFO_TYPE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_dual_filter(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_dual_filter = CAST(AV1E_SET_ENABLE_DUAL_FILTER, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_chroma_deltaq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_chroma_deltaq = CAST(AV1E_SET_ENABLE_CHROMA_DELTAQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_rect_partitions( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_rect_partitions = + CAST(AV1E_SET_ENABLE_RECT_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t 
ctrl_set_enable_ab_partitions(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_ab_partitions = CAST(AV1E_SET_ENABLE_AB_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_1to4_partitions( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_1to4_partitions = + CAST(AV1E_SET_ENABLE_1TO4_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_min_partition_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.min_partition_size = CAST(AV1E_SET_MIN_PARTITION_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_max_partition_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_partition_size = CAST(AV1E_SET_MAX_PARTITION_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_intra_edge_filter( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_intra_edge_filter = + CAST(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_order_hint = CAST(AV1E_SET_ENABLE_ORDER_HINT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_tx64(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tx64 = CAST(AV1E_SET_ENABLE_TX64, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_flip_idtx(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_flip_idtx = CAST(AV1E_SET_ENABLE_FLIP_IDTX, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_rect_tx(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_rect_tx = CAST(AV1E_SET_ENABLE_RECT_TX, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_dist_wtd_comp = CAST(AV1E_SET_ENABLE_DIST_WTD_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_max_reference_frames(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_reference_frames = CAST(AV1E_SET_MAX_REFERENCE_FRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_reduced_reference_set( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_reduced_reference_set = + CAST(AV1E_SET_REDUCED_REFERENCE_SET, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_ref_frame_mvs = CAST(AV1E_SET_ENABLE_REF_FRAME_MVS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t 
ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.allow_ref_frame_mvs = CAST(AV1E_SET_ALLOW_REF_FRAME_MVS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_masked_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_masked_comp = CAST(AV1E_SET_ENABLE_MASKED_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_onesided_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_onesided_comp = CAST(AV1E_SET_ENABLE_ONESIDED_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interintra_comp( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interintra_comp = + CAST(AV1E_SET_ENABLE_INTERINTRA_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_smooth_interintra( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_smooth_interintra = + CAST(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_diff_wtd_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_diff_wtd_comp = CAST(AV1E_SET_ENABLE_DIFF_WTD_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interinter_wedge( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interinter_wedge = + CAST(AV1E_SET_ENABLE_INTERINTER_WEDGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interintra_wedge( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interintra_wedge = + CAST(AV1E_SET_ENABLE_INTERINTRA_WEDGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const int global_motion_arg = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args); +#if CONFIG_REALTIME_ONLY + if (global_motion_arg) { + ERROR("Global motion can't be enabled in realtime only build."); + } +#endif + extra_cfg.enable_global_motion = global_motion_arg; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const int warped_motion_arg = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args); +#if CONFIG_REALTIME_ONLY + if (warped_motion_arg) { + ERROR("Warped motion can't be enabled in realtime only build."); + } +#endif + extra_cfg.enable_warped_motion = warped_motion_arg; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.allow_warped_motion = CAST(AV1E_SET_ALLOW_WARPED_MOTION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_filter_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = 
ctx->extra_cfg; + extra_cfg.enable_filter_intra = CAST(AV1E_SET_ENABLE_FILTER_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_smooth_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_smooth_intra = CAST(AV1E_SET_ENABLE_SMOOTH_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_directional_intra( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_directional_intra = + CAST(AV1E_SET_ENABLE_DIRECTIONAL_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_diagonal_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_diagonal_intra = CAST(AV1E_SET_ENABLE_DIAGONAL_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_paeth_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_paeth_intra = CAST(AV1E_SET_ENABLE_PAETH_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_cfl_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_cfl_intra = CAST(AV1E_SET_ENABLE_CFL_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_superres = CAST(AV1E_SET_ENABLE_SUPERRES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_overlay(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_overlay = CAST(AV1E_SET_ENABLE_OVERLAY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_palette(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_palette = CAST(AV1E_SET_ENABLE_PALETTE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_intrabc(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_intrabc = CAST(AV1E_SET_ENABLE_INTRABC, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_angle_delta(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_angle_delta = CAST(AV1E_SET_ENABLE_ANGLE_DELTA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.error_resilient_mode = CAST(AV1E_SET_ERROR_RESILIENT_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_s_frame_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.s_frame_mode = CAST(AV1E_SET_S_FRAME_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_parallel_decoding_mode = + 
CAST(AV1E_SET_FRAME_PARALLEL_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_single_tile_decoding(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.single_tile_decoding = CAST(AV1E_SET_SINGLE_TILE_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.aq_mode = CAST(AV1E_SET_AQ_MODE, args); + + // Skip AQ mode if using fixed QP for current frame. + if (ctx->ppi->cpi->rc.use_external_qp_one_pass) extra_cfg.aq_mode = 0; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_reduced_tx_type_set(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.reduced_tx_type_set = CAST(AV1E_SET_REDUCED_TX_TYPE_SET, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_intra_dct_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_intra_dct_only = CAST(AV1E_SET_INTRA_DCT_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_inter_dct_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_inter_dct_only = CAST(AV1E_SET_INTER_DCT_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_intra_default_tx_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_intra_default_tx_only = + CAST(AV1E_SET_INTRA_DEFAULT_TX_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_tx_size_search(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tx_size_search = CAST(AV1E_SET_ENABLE_TX_SIZE_SEARCH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_REALTIME_ONLY + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.quant_b_adapt = CAST(AV1E_SET_QUANT_B_ADAPT, args); + return update_extra_cfg(ctx, &extra_cfg); +#endif +} + +static aom_codec_err_t ctrl_set_vbr_corpus_complexity_lap( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.vbr_corpus_complexity_lap = + CAST(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.coeff_cost_upd_freq = CAST(AV1E_SET_COEFF_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mode_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mode_cost_upd_freq = CAST(AV1E_SET_MODE_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mv_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mv_cost_upd_freq = CAST(AV1E_SET_MV_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + 
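+// Same pattern as the MV-cost control above, but for the update frequency of
+// intra block copy (IntraBC) displacement vector (DV) costs.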
+static aom_codec_err_t ctrl_set_dv_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.dv_cost_upd_freq = CAST(AV1E_SET_DV_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const char *str = CAST(AV1E_SET_VMAF_MODEL_PATH, args); + const aom_codec_err_t ret = allocate_and_set_string( + str, default_extra_cfg.vmaf_model_path, &extra_cfg.vmaf_model_path, + ctx->ppi->error.detail); + if (ret != AOM_CODEC_OK) return ret; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_partition_info_path(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const char *str = CAST(AV1E_SET_PARTITION_INFO_PATH, args); + const aom_codec_err_t ret = allocate_and_set_string( + str, default_extra_cfg.partition_info_path, + &extra_cfg.partition_info_path, ctx->ppi->error.detail); + if (ret != AOM_CODEC_OK) return ret; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_enable_rate_guide_deltaq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_rate_guide_deltaq = + CAST(AV1E_ENABLE_RATE_GUIDE_DELTAQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rate_distribution_info( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const char *str = CAST(AV1E_SET_RATE_DISTRIBUTION_INFO, args); + const aom_codec_err_t ret = allocate_and_set_string( + str, default_extra_cfg.rate_distribution_info, + &extra_cfg.rate_distribution_info, ctx->ppi->error.detail); + if (ret != AOM_CODEC_OK) return ret; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_film_grain_test_vector( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.film_grain_test_vector = + CAST(AV1E_SET_FILM_GRAIN_TEST_VECTOR, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const char *str = CAST(AV1E_SET_FILM_GRAIN_TABLE, args); + if (str == NULL) { + // this parameter allows NULL as its value + extra_cfg.film_grain_table_filename = str; + } else { + const aom_codec_err_t ret = allocate_and_set_string( + str, default_extra_cfg.film_grain_table_filename, + &extra_cfg.film_grain_table_filename, ctx->ppi->error.detail); + if (ret != AOM_CODEC_OK) return ret; + } + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_DENOISE + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_level = + ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f; + return update_extra_cfg(ctx, &extra_cfg); +#endif +} + +static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_DENOISE + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +#endif +} + +static aom_codec_err_t 
ctrl_set_enable_dnl_denoising(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_DENOISE + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_dnl_denoising = CAST(AV1E_SET_ENABLE_DNL_DENOISING, args); + return update_extra_cfg(ctx, &extra_cfg); +#endif +} + +static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const DELTAQ_MODE deltaq_arg = CAST(AV1E_SET_DELTAQ_MODE, args); +#if CONFIG_REALTIME_ONLY + if (deltaq_arg > NO_DELTA_Q) { + ERROR("Delta Q mode can't be enabled in realtime only build."); + } +#endif + extra_cfg.deltaq_mode = deltaq_arg; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_deltaq_strength(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.deltaq_strength = CAST(AV1E_SET_DELTAQ_STRENGTH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_deltalf_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.deltalf_mode = CAST(AV1E_SET_DELTALF_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_min_gf_interval(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.min_gf_interval = CAST(AV1E_SET_MIN_GF_INTERVAL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_max_gf_interval(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_gf_interval = CAST(AV1E_SET_MAX_GF_INTERVAL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_gf_min_pyr_height(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.gf_min_pyr_height = CAST(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_gf_max_pyr_height(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.gf_max_pyr_height = CAST(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_frame_periodic_boost(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_periodic_boost = CAST(AV1E_SET_FRAME_PERIODIC_BOOST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_enable_motion_vector_unit_test( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.motion_vector_unit_test = + CAST(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_enable_fpmt_unit_test(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_FPMT_TEST + (void)args; + (void)ctx; + return AOM_CODEC_INCAPABLE; +#else + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.fpmt_unit_test = CAST(AV1E_SET_FP_MT_UNIT_TEST, args); + ctx->ppi->fpmt_unit_test_cfg = (extra_cfg.fpmt_unit_test == 1) + ? 
PARALLEL_ENCODE + : PARALLEL_SIMULATION_ENCODE; + return update_extra_cfg(ctx, &extra_cfg); +#endif +} + +static aom_codec_err_t ctrl_enable_ext_tile_debug(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.ext_tile_debug = CAST(AV1E_ENABLE_EXT_TILE_DEBUG, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_target_seq_level_idx(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const int val = CAST(AV1E_SET_TARGET_SEQ_LEVEL_IDX, args); + const int level = val % 100; + const int operating_point_idx = val / 100; + if (operating_point_idx < 0 || + operating_point_idx >= MAX_NUM_OPERATING_POINTS) { + char *const err_string = ctx->ppi->error.detail; + snprintf(err_string, ARG_ERR_MSG_MAX_LEN, + "Invalid operating point index: %d", operating_point_idx); + ctx->base.err_detail = err_string; + return AOM_CODEC_INVALID_PARAM; + } + extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tier_mask(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tier_mask = CAST(AV1E_SET_TIER_MASK, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_min_cr(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.min_cr = CAST(AV1E_SET_MIN_CR, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_enable_sb_multipass_unit_test( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.sb_multipass_unit_test = + CAST(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_enable_sb_qp_sweep(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.sb_qp_sweep = CAST(AV1E_ENABLE_SB_QP_SWEEP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_external_partition(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_COMP *const cpi = ctx->ppi->cpi; + aom_ext_part_funcs_t funcs = *CAST(AV1E_SET_EXTERNAL_PARTITION, args); + aom_ext_part_config_t config; + // TODO(chengchen): verify the sb_size has been set at this point. + config.superblock_size = cpi->common.seq_params->sb_size; + const aom_codec_err_t status = + av1_ext_part_create(funcs, config, &cpi->ext_part_controller); + return status; +} + +static aom_codec_err_t ctrl_set_loopfilter_control(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.loopfilter_control = CAST(AV1E_SET_LOOPFILTER_CONTROL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_skip_postproc_filtering( + aom_codec_alg_priv_t *ctx, va_list args) { + // Skipping the application of post-processing filters is allowed only + // for ALLINTRA mode. 
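+  // Other usage modes reject the request with AOM_CODEC_INCAPABLE.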
+ if (ctx->cfg.g_usage != AOM_USAGE_ALL_INTRA) return AOM_CODEC_INCAPABLE; + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.skip_postproc_filtering = + CAST(AV1E_SET_SKIP_POSTPROC_FILTERING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rtc_external_rc(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->ppi->cpi->rc.rtc_external_ratectrl = + CAST(AV1E_SET_RTC_EXTERNAL_RC, args); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_quantizer_one_pass(aom_codec_alg_priv_t *ctx, + va_list args) { + const int qp = CAST(AV1E_SET_QUANTIZER_ONE_PASS, args); + + if (qp < 0 || qp > 63) return AOM_CODEC_INVALID_PARAM; + + aom_codec_enc_cfg_t *cfg = &ctx->cfg; + struct av1_extracfg extra_cfg = ctx->extra_cfg; + cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp; + extra_cfg.aq_mode = 0; + ctx->ppi->cpi->rc.use_external_qp_one_pass = 1; + + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_bitrate_one_pass_cbr(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; + AV1EncoderConfig *oxcf = &cpi->oxcf; + if (!is_one_pass_rt_params(cpi) || oxcf->rc_cfg.mode != AOM_CBR || + cpi->ppi->use_svc || ppi->num_fp_contexts != 1 || ppi->cpi_lap != NULL) { + return AOM_CODEC_INVALID_PARAM; + } + const int new_bitrate = CAST(AV1E_SET_BITRATE_ONE_PASS_CBR, args); + ctx->cfg.rc_target_bitrate = new_bitrate; + oxcf->rc_cfg.target_bandwidth = new_bitrate * 1000; + set_primary_rc_buffer_sizes(oxcf, ppi); + av1_new_framerate(cpi, cpi->framerate); + check_reset_rc_flag(cpi); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_max_consec_frame_drop_cbr( + aom_codec_alg_priv_t *ctx, va_list args) { + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; + const int max_consec_drop = CAST(AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, args); + if (max_consec_drop < 0) return AOM_CODEC_INVALID_PARAM; + cpi->rc.max_consec_drop = max_consec_drop; + cpi->rc.drop_count_consec = 0; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_svc_frame_drop_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; + cpi->svc.framedrop_mode = CAST(AV1E_SET_SVC_FRAME_DROP_MODE, args); + if (cpi->svc.framedrop_mode != AOM_LAYER_DROP && + cpi->svc.framedrop_mode != AOM_FULL_SUPERFRAME_DROP) + return AOM_CODEC_INVALID_PARAM; + else + return AOM_CODEC_OK; +} + +#if !CONFIG_REALTIME_ONLY +static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, + STATS_BUFFER_CTX *stats_buf_context, + int num_lap_buffers) { + aom_codec_err_t res = AOM_CODEC_OK; + + int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS); + *frame_stats_buffer = + (FIRSTPASS_STATS *)aom_calloc(size, sizeof(FIRSTPASS_STATS)); + if (*frame_stats_buffer == NULL) return AOM_CODEC_MEM_ERROR; + + stats_buf_context->stats_in_start = *frame_stats_buffer; + stats_buf_context->stats_in_end = stats_buf_context->stats_in_start; + stats_buf_context->stats_in_buf_end = + stats_buf_context->stats_in_start + size; + + stats_buf_context->total_left_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS)); + if (stats_buf_context->total_left_stats == NULL) return AOM_CODEC_MEM_ERROR; + av1_twopass_zero_stats(stats_buf_context->total_left_stats); + stats_buf_context->total_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS)); + if (stats_buf_context->total_stats == NULL) return AOM_CODEC_MEM_ERROR; + 
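+  // Zero-initialize the aggregate first-pass stats accumulator before use.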
av1_twopass_zero_stats(stats_buf_context->total_stats); + return res; +} +#endif + +aom_codec_err_t av1_create_context_and_bufferpool(AV1_PRIMARY *ppi, + AV1_COMP **p_cpi, + BufferPool **p_buffer_pool, + const AV1EncoderConfig *oxcf, + COMPRESSOR_STAGE stage, + int lap_lag_in_frames) { + aom_codec_err_t res = AOM_CODEC_OK; + BufferPool *buffer_pool = *p_buffer_pool; + + if (buffer_pool == NULL) { + buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); + if (buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; + buffer_pool->num_frame_bufs = + (oxcf->mode == ALLINTRA) ? FRAME_BUFFERS_ALLINTRA : FRAME_BUFFERS; + buffer_pool->frame_bufs = (RefCntBuffer *)aom_calloc( + buffer_pool->num_frame_bufs, sizeof(*buffer_pool->frame_bufs)); + if (buffer_pool->frame_bufs == NULL) { + buffer_pool->num_frame_bufs = 0; + aom_free(buffer_pool); + return AOM_CODEC_MEM_ERROR; + } +#if CONFIG_MULTITHREAD + if (pthread_mutex_init(&buffer_pool->pool_mutex, NULL)) { + aom_free(buffer_pool->frame_bufs); + buffer_pool->frame_bufs = NULL; + buffer_pool->num_frame_bufs = 0; + aom_free(buffer_pool); + return AOM_CODEC_MEM_ERROR; + } +#endif + *p_buffer_pool = buffer_pool; + } + *p_cpi = + av1_create_compressor(ppi, oxcf, buffer_pool, stage, lap_lag_in_frames); + if (*p_cpi == NULL) res = AOM_CODEC_MEM_ERROR; + + return res; +} + +static aom_codec_err_t ctrl_set_fp_mt(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.fp_mt = CAST(AV1E_SET_FP_MT, args); + const aom_codec_err_t result = update_extra_cfg(ctx, &extra_cfg); + int num_fp_contexts = 1; + if (ctx->ppi->num_fp_contexts == 1) { + num_fp_contexts = + av1_compute_num_fp_contexts(ctx->ppi, &ctx->ppi->parallel_cpi[0]->oxcf); + if (num_fp_contexts > 1) { + int i; + for (i = 1; i < num_fp_contexts; i++) { + int res = av1_create_context_and_bufferpool( + ctx->ppi, &ctx->ppi->parallel_cpi[i], &ctx->buffer_pool, &ctx->oxcf, + ENCODE_STAGE, -1); + if (res != AOM_CODEC_OK) { + return res; + } +#if !CONFIG_REALTIME_ONLY + ctx->ppi->parallel_cpi[i]->twopass_frame.stats_in = + ctx->ppi->twopass.stats_buf_ctx->stats_in_start; +#endif + } + } + } + ctx->ppi->num_fp_contexts = num_fp_contexts; + return result; +} + +static aom_codec_err_t ctrl_set_auto_intra_tools_off(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.auto_intra_tools_off = CAST(AV1E_SET_AUTO_INTRA_TOOLS_OFF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) { + aom_codec_err_t res = AOM_CODEC_OK; + + if (ctx->priv == NULL) { + aom_codec_alg_priv_t *const priv = aom_calloc(1, sizeof(*priv)); + if (priv == NULL) return AOM_CODEC_MEM_ERROR; + + ctx->priv = (aom_codec_priv_t *)priv; + ctx->priv->init_flags = ctx->init_flags; + + // Update the reference to the config structure to an internal copy. + assert(ctx->config.enc); + priv->cfg = *ctx->config.enc; + ctx->config.enc = &priv->cfg; + + priv->extra_cfg = default_extra_cfg; + // Special handling: + // By default, if omitted, --enable-cdef = 1. + // Here we set its default value to 0 when --allintra is turned on. + // However, if users set --enable-cdef = 1 from command line, + // The encoder still respects it. 
+ if (priv->cfg.g_usage == ALLINTRA) { + priv->extra_cfg.enable_cdef = 0; + } + av1_initialize_enc(priv->cfg.g_usage, priv->cfg.rc_end_usage); + + res = validate_config(priv, &priv->cfg, &priv->extra_cfg); + + if (res == AOM_CODEC_OK) { + int *num_lap_buffers = &priv->num_lap_buffers; + int lap_lag_in_frames = 0; + *num_lap_buffers = 0; + priv->timestamp_ratio.den = priv->cfg.g_timebase.den; + priv->timestamp_ratio.num = + (int64_t)priv->cfg.g_timebase.num * TICKS_PER_SEC; + reduce_ratio(&priv->timestamp_ratio); + + set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); + if (priv->oxcf.rc_cfg.mode != AOM_CBR && + priv->oxcf.pass == AOM_RC_ONE_PASS && priv->oxcf.mode == GOOD) { + // Enable look ahead - enabled for AOM_Q, AOM_CQ, AOM_VBR + *num_lap_buffers = + AOMMIN((int)priv->cfg.g_lag_in_frames, + AOMMIN(MAX_LAP_BUFFERS, priv->oxcf.kf_cfg.key_freq_max + + SCENE_CUT_KEY_TEST_INTERVAL)); + if ((int)priv->cfg.g_lag_in_frames - (*num_lap_buffers) >= + LAP_LAG_IN_FRAMES) { + lap_lag_in_frames = LAP_LAG_IN_FRAMES; + } + } + priv->oxcf.use_highbitdepth = + (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0; + + priv->monochrome_on_init = priv->cfg.monochrome; + + priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head, + *num_lap_buffers, &priv->oxcf); + if (!priv->ppi) return AOM_CODEC_MEM_ERROR; + +#if !CONFIG_REALTIME_ONLY + res = create_stats_buffer(&priv->frame_stats_buffer, + &priv->stats_buf_context, *num_lap_buffers); + if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR; + + assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS); + int size = get_stats_buf_size(*num_lap_buffers, MAX_LAG_BUFFERS); + for (int i = 0; i < size; i++) + priv->ppi->twopass.frame_stats_arr[i] = &priv->frame_stats_buffer[i]; + + priv->ppi->twopass.stats_buf_ctx = &priv->stats_buf_context; +#endif + + assert(priv->ppi->num_fp_contexts >= 1); + res = av1_create_context_and_bufferpool( + priv->ppi, &priv->ppi->parallel_cpi[0], &priv->buffer_pool, + &priv->oxcf, ENCODE_STAGE, -1); + if (res != AOM_CODEC_OK) { + return res; + } +#if !CONFIG_REALTIME_ONLY + priv->ppi->parallel_cpi[0]->twopass_frame.stats_in = + priv->ppi->twopass.stats_buf_ctx->stats_in_start; +#endif + priv->ppi->cpi = priv->ppi->parallel_cpi[0]; + + // Create another compressor if look ahead is enabled + if (res == AOM_CODEC_OK && *num_lap_buffers) { + res = av1_create_context_and_bufferpool( + priv->ppi, &priv->ppi->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf, + LAP_STAGE, clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS)); + } + } + } + + return res; +} + +void av1_destroy_context_and_bufferpool(AV1_COMP *cpi, + BufferPool **p_buffer_pool) { + av1_remove_compressor(cpi); + if (*p_buffer_pool) { + av1_free_ref_frame_buffers(*p_buffer_pool); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&(*p_buffer_pool)->pool_mutex); +#endif + aom_free(*p_buffer_pool); + *p_buffer_pool = NULL; + } +} + +static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, + FIRSTPASS_STATS *frame_stats_buffer) { + aom_free(stats_buf_context->total_left_stats); + aom_free(stats_buf_context->total_stats); + aom_free(frame_stats_buffer); +} + +static void check_and_free_string(const char *default_str, const char **ptr) { + if (*ptr == default_str) { + // Default should be a literal. Do not free. 
+ return; + } + aom_free((void *)*ptr); + *ptr = NULL; +} + +static void destroy_extra_config(struct av1_extracfg *extra_cfg) { +#if CONFIG_TUNE_VMAF + check_and_free_string(default_extra_cfg.vmaf_model_path, + &extra_cfg->vmaf_model_path); +#endif + check_and_free_string(default_extra_cfg.two_pass_output, + &extra_cfg->two_pass_output); + check_and_free_string(default_extra_cfg.two_pass_output, + &extra_cfg->second_pass_log); + check_and_free_string(default_extra_cfg.partition_info_path, + &extra_cfg->partition_info_path); + check_and_free_string(default_extra_cfg.rate_distribution_info, + &extra_cfg->rate_distribution_info); + check_and_free_string(default_extra_cfg.film_grain_table_filename, + &extra_cfg->film_grain_table_filename); +} + +static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) { + free(ctx->cx_data); + destroy_extra_config(&ctx->extra_cfg); + + if (ctx->ppi) { + AV1_PRIMARY *ppi = ctx->ppi; + for (int i = 0; i < MAX_PARALLEL_FRAMES - 1; i++) { + if (ppi->parallel_frames_data[i].cx_data) { + free(ppi->parallel_frames_data[i].cx_data); + } + } +#if CONFIG_ENTROPY_STATS + print_entropy_stats(ppi); +#endif +#if CONFIG_INTERNAL_STATS + print_internal_stats(ppi); +#endif + + for (int i = 0; i < MAX_PARALLEL_FRAMES; i++) { + av1_destroy_context_and_bufferpool(ppi->parallel_cpi[i], + &ctx->buffer_pool); + } + ppi->cpi = NULL; + + if (ppi->cpi_lap) { + av1_destroy_context_and_bufferpool(ppi->cpi_lap, &ctx->buffer_pool_lap); + } + av1_remove_primary_compressor(ppi); + } + destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer); + aom_free(ctx); + return AOM_CODEC_OK; +} + +static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi, + unsigned int lib_flags) { + aom_codec_frame_flags_t flags = lib_flags << 16; + if (lib_flags & FRAMEFLAGS_KEY) flags |= AOM_FRAME_IS_KEY; + if (lib_flags & FRAMEFLAGS_INTRAONLY) flags |= AOM_FRAME_IS_INTRAONLY; + if (lib_flags & FRAMEFLAGS_SWITCH) flags |= AOM_FRAME_IS_SWITCH; + if (lib_flags & FRAMEFLAGS_ERROR_RESILIENT) + flags |= AOM_FRAME_IS_ERROR_RESILIENT; + if (cpi->droppable) flags |= AOM_FRAME_IS_DROPPABLE; + + return flags; +} + +static INLINE int get_src_border_in_pixels(AV1_COMP *cpi, BLOCK_SIZE sb_size) { + if (cpi->oxcf.mode != REALTIME || av1_is_resize_needed(&cpi->oxcf)) + return cpi->oxcf.border_in_pixels; + + const int sb_size_in_pixels_log2 = mi_size_wide_log2[sb_size] + MI_SIZE_LOG2; + const int sb_aligned_width = + ALIGN_POWER_OF_TWO(cpi->oxcf.frm_dim_cfg.width, sb_size_in_pixels_log2); + const int sb_aligned_height = + ALIGN_POWER_OF_TWO(cpi->oxcf.frm_dim_cfg.height, sb_size_in_pixels_log2); + // Align the border pixels to a multiple of 32. + const int border_pixels_width = + ALIGN_POWER_OF_TWO(sb_aligned_width - cpi->oxcf.frm_dim_cfg.width, 5); + const int border_pixels_height = + ALIGN_POWER_OF_TWO(sb_aligned_height - cpi->oxcf.frm_dim_cfg.height, 5); + const int border_in_pixels = + AOMMAX(AOMMAX(border_pixels_width, border_pixels_height), 32); + return border_in_pixels; +} + +// TODO(Mufaddal): Check feasibility of abstracting functions related to LAP +// into a separate function. 
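+// Top-level per-frame encode call: validates the input image, (re)allocates the
+// compressed-data buffers, drives the optional LAP pre-pass, and packs one or
+// more coded frames (including pending invisible frames) into a single packet.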
+static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, + const aom_image_t *img, + aom_codec_pts_t pts, + unsigned long duration, + aom_enc_frame_flags_t enc_flags) { + const size_t kMinCompressedSize = 8192; + volatile aom_codec_err_t res = AOM_CODEC_OK; + AV1_PRIMARY *const ppi = ctx->ppi; + volatile aom_codec_pts_t ptsvol = pts; + AV1_COMP_DATA cpi_data = { 0 }; + + cpi_data.timestamp_ratio = &ctx->timestamp_ratio; + cpi_data.flush = !img; + // LAP context + AV1_COMP *cpi_lap = ppi->cpi_lap; + if (ppi->cpi == NULL) return AOM_CODEC_INVALID_PARAM; + + ppi->cpi->last_coded_width = ppi->cpi->oxcf.frm_dim_cfg.width; + ppi->cpi->last_coded_height = ppi->cpi->oxcf.frm_dim_cfg.height; + + if (ppi->lap_enabled && cpi_lap == NULL && + ppi->cpi->oxcf.pass == AOM_RC_ONE_PASS) + return AOM_CODEC_INVALID_PARAM; + + if (img != NULL) { + res = validate_img(ctx, img); + if (res == AOM_CODEC_OK) { + const size_t uncompressed_frame_sz = + ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) * + ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * get_image_bps(img) / 8; + + // Due to the presence of no-show frames, the ctx->cx_data buffer holds + // compressed data corresponding to multiple frames. As no-show frames are + // not possible for all intra frame encoding with no forward key frames, + // the buffer is allocated with a smaller size in this case. + // + // For pseudo random input, the compressed frame size is seen to exceed + // the uncompressed frame size, but is less than 2 times the uncompressed + // frame size. Hence the size of the buffer is chosen as 2 times the + // uncompressed frame size. + int multiplier = 8; + if (ppi->cpi->oxcf.kf_cfg.key_freq_max == 0 && + !ppi->cpi->oxcf.kf_cfg.fwd_kf_enabled) + multiplier = 2; + size_t data_sz = uncompressed_frame_sz * multiplier; + if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize; + if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) { + ctx->cx_data_sz = data_sz; + free(ctx->cx_data); + ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz); + if (ctx->cx_data == NULL) { + ctx->cx_data_sz = 0; + return AOM_CODEC_MEM_ERROR; + } + } + for (int i = 0; i < ppi->num_fp_contexts - 1; i++) { + if (ppi->parallel_frames_data[i].cx_data == NULL) { + ppi->parallel_frames_data[i].cx_data_sz = uncompressed_frame_sz; + ppi->parallel_frames_data[i].frame_display_order_hint = -1; + ppi->parallel_frames_data[i].frame_size = 0; + ppi->parallel_frames_data[i].cx_data = + (unsigned char *)malloc(ppi->parallel_frames_data[i].cx_data_sz); + if (ppi->parallel_frames_data[i].cx_data == NULL) { + ppi->parallel_frames_data[i].cx_data_sz = 0; + return AOM_CODEC_MEM_ERROR; + } + } + } + } + } + + aom_codec_pkt_list_init(&ctx->pkt_list); + + volatile aom_enc_frame_flags_t flags = enc_flags; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(ppi->error.jmp)) { + ppi->error.setjmp = 0; + res = update_error_state(ctx, &ppi->error); + return res; + } + ppi->error.setjmp = 1; + + if (ppi->use_svc && ppi->cpi->svc.use_flexible_mode == 0 && flags == 0) + av1_set_svc_fixed_mode(ppi->cpi); + + // Note(yunqing): While applying encoding flags, always start from enabling + // all, and then modifying according to the flags. Previous frame's flags are + // overwritten. 
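+  // The same flags are mirrored to the lookahead (LAP) encoder context below
+  // when one is present.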
+ av1_apply_encoding_flags(ppi->cpi, flags); + if (cpi_lap != NULL) { + av1_apply_encoding_flags(cpi_lap, flags); + } + +#if CONFIG_TUNE_VMAF + if (ctx->extra_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + ctx->extra_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { + aom_init_vmaf_model(&ppi->cpi->vmaf_info.vmaf_model, + ppi->cpi->oxcf.tune_cfg.vmaf_model_path); + } +#endif + + // Handle fixed keyframe intervals + if (is_stat_generation_stage(ppi->cpi) || is_one_pass_rt_params(ppi->cpi)) { + if (ctx->cfg.kf_mode == AOM_KF_AUTO && + ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) { + if (ppi->cpi->common.spatial_layer_id == 0 && + ++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) { + flags |= AOM_EFLAG_FORCE_KF; + ctx->fixed_kf_cntr = 1; + } + } + } + + if (res == AOM_CODEC_OK) { + AV1_COMP *cpi = ppi->cpi; + + // Set up internal flags + if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) ppi->b_calculate_psnr = 1; + + if (img != NULL) { + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = ptsvol; + ctx->pts_offset_initialized = 1; + } + ptsvol -= ctx->pts_offset; + int64_t src_time_stamp = + timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol); + int64_t src_end_time_stamp = + timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + duration); + + YV12_BUFFER_CONFIG sd; + res = image2yuvconfig(img, &sd); + // When generating a monochrome stream, make |sd| a monochrome image. + if (ctx->cfg.monochrome) { + sd.u_buffer = sd.v_buffer = NULL; + sd.uv_stride = 0; + sd.monochrome = 1; + } + int use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0; + int subsampling_x = sd.subsampling_x; + int subsampling_y = sd.subsampling_y; + + if (!ppi->lookahead) { + int lag_in_frames = cpi_lap != NULL ? cpi_lap->oxcf.gf_cfg.lag_in_frames + : cpi->oxcf.gf_cfg.lag_in_frames; + AV1EncoderConfig *oxcf = &cpi->oxcf; + const BLOCK_SIZE sb_size = av1_select_sb_size( + oxcf, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, + ppi->number_spatial_layers); + oxcf->border_in_pixels = + av1_get_enc_border_size(av1_is_resize_needed(oxcf), + oxcf->kf_cfg.key_freq_max == 0, sb_size); + for (int i = 0; i < ppi->num_fp_contexts; i++) { + ppi->parallel_cpi[i]->oxcf.border_in_pixels = oxcf->border_in_pixels; + } + + const int src_border_in_pixels = get_src_border_in_pixels(cpi, sb_size); + ppi->lookahead = av1_lookahead_init( + cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, + subsampling_x, subsampling_y, use_highbitdepth, lag_in_frames, + src_border_in_pixels, cpi->common.features.byte_alignment, + ctx->num_lap_buffers, (cpi->oxcf.kf_cfg.key_freq_max == 0), + cpi->image_pyramid_levels); + } + if (!ppi->lookahead) + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate lag buffers"); + for (int i = 0; i < ppi->num_fp_contexts; i++) { + aom_codec_err_t err = + av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth, + subsampling_x, subsampling_y); + if (err != AOM_CODEC_OK) { + aom_internal_error(&ppi->error, err, + "av1_check_initial_width() failed"); + } + } + if (cpi_lap != NULL) { + aom_codec_err_t err = av1_check_initial_width( + cpi_lap, use_highbitdepth, subsampling_x, subsampling_y); + if (err != AOM_CODEC_OK) { + aom_internal_error(&ppi->error, err, + "av1_check_initial_width() failed"); + } + } + + // Store the original flags in to the frame buffer. Will extract the + // key frame flag when we actually encode this frame. 
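+      // av1_receive_raw_frame() pushes |sd| into the lookahead queue; any
+      // failure is reflected in the codec's error state.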
+ if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, + src_time_stamp, src_end_time_stamp)) { + res = update_error_state(ctx, cpi->common.error); + } + ctx->next_frame_flags = 0; + } + + cpi_data.cx_data = ctx->cx_data; + cpi_data.cx_data_sz = ctx->cx_data_sz; + + /* Any pending invisible frames? */ + if (ctx->pending_cx_data_sz) { + cpi_data.cx_data += ctx->pending_cx_data_sz; + cpi_data.cx_data_sz -= ctx->pending_cx_data_sz; + + /* TODO: this is a minimal check, the underlying codec doesn't respect + * the buffer size anyway. + */ + if (cpi_data.cx_data_sz < ctx->cx_data_sz / 2) { + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, + "Compressed data buffer too small"); + } + } + + int is_frame_visible = 0; + int has_no_show_keyframe = 0; + int num_workers = 0; + + if (cpi->oxcf.pass == AOM_RC_FIRST_PASS) { +#if !CONFIG_REALTIME_ONLY + num_workers = ppi->p_mt_info.num_mod_workers[MOD_FP] = + av1_fp_compute_num_enc_workers(cpi); +#endif + } else { + av1_compute_num_workers_for_mt(cpi); + num_workers = av1_get_max_num_workers(cpi); + } + if (num_workers > 1 && ppi->p_mt_info.num_workers < num_workers) { + // Obtain the maximum no. of frames that can be supported in a parallel + // encode set. + if (is_stat_consumption_stage(cpi)) { + ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf); + } + if (ppi->p_mt_info.num_workers > 0) { + av1_terminate_workers(ppi); + free_thread_data(ppi); + aom_free(ppi->p_mt_info.tile_thr_data); + ppi->p_mt_info.tile_thr_data = NULL; + aom_free(ppi->p_mt_info.workers); + ppi->p_mt_info.workers = NULL; + ppi->p_mt_info.num_workers = 0; + for (int j = 0; j < ppi->num_fp_contexts; j++) { + aom_free(ppi->parallel_cpi[j]->td.tctx); + ppi->parallel_cpi[j]->td.tctx = NULL; + } + } + av1_create_workers(ppi, num_workers); + av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS); + } + + // Re-allocate thread data if workers for encoder multi-threading stage + // exceeds prev_num_enc_workers. + const int num_enc_workers = + av1_get_num_mod_workers_for_alloc(&ppi->p_mt_info, MOD_ENC); + if (ppi->p_mt_info.prev_num_enc_workers < num_enc_workers && + num_enc_workers <= ppi->p_mt_info.num_workers) { + free_thread_data(ppi); + for (int j = 0; j < ppi->num_fp_contexts; j++) { + aom_free(ppi->parallel_cpi[j]->td.tctx); + ppi->parallel_cpi[j]->td.tctx = NULL; + } + av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS); + } + + for (int i = 0; i < ppi->num_fp_contexts; i++) { + av1_init_frame_mt(ppi, ppi->parallel_cpi[i]); + } + if (cpi_lap != NULL) { + av1_init_frame_mt(ppi, cpi_lap); + } +#if CONFIG_MULTITHREAD + if (ppi->p_mt_info.num_workers > 1) { + for (int i = 0; i < ppi->num_fp_contexts; i++) { + av1_init_mt_sync(ppi->parallel_cpi[i], + ppi->parallel_cpi[i]->oxcf.pass == AOM_RC_FIRST_PASS); + } + if (cpi_lap != NULL) { + av1_init_mt_sync(cpi_lap, 1); + } + } +#endif // CONFIG_MULTITHREAD + + // Call for LAP stage + if (cpi_lap != NULL) { + AV1_COMP_DATA cpi_lap_data = { 0 }; + cpi_lap_data.flush = !img; + cpi_lap_data.timestamp_ratio = &ctx->timestamp_ratio; + const int status = av1_get_compressed_data(cpi_lap, &cpi_lap_data); + if (status > AOM_CODEC_OK) { + aom_internal_error_copy(&ppi->error, cpi_lap->common.error); + } + av1_post_encode_updates(cpi_lap, &cpi_lap_data); + } + + // Recalculate the maximum number of frames that can be encoded in + // parallel at the beginning of sub gop. 
+ if (is_stat_consumption_stage(cpi) && ppi->gf_group.size > 0 && + cpi->gf_frame_index == ppi->gf_group.size) { + ppi->num_fp_contexts = av1_compute_num_fp_contexts(ppi, &cpi->oxcf); + } + + // Get the next visible frame. Invisible frames get packed with the next + // visible frame. + while (cpi_data.cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) { + int simulate_parallel_frame = 0; + int status = -1; + cpi->do_frame_data_update = true; + cpi->ref_idx_to_skip = INVALID_IDX; + cpi->ref_refresh_index = INVALID_IDX; + cpi->refresh_idx_available = false; + +#if CONFIG_FPMT_TEST + simulate_parallel_frame = + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0; + if (simulate_parallel_frame) { + if (ppi->num_fp_contexts > 1 && ppi->gf_group.size > 1) { + if (cpi->gf_frame_index < ppi->gf_group.size) { + calc_frame_data_update_flag(&ppi->gf_group, cpi->gf_frame_index, + &cpi->do_frame_data_update); + } + } + status = av1_get_compressed_data(cpi, &cpi_data); + } + +#endif // CONFIG_FPMT_TEST + if (!simulate_parallel_frame) { + if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + status = av1_get_compressed_data(cpi, &cpi_data); + } else if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == + 1) { + // In case of an error, longjmp() would be invoked and hence "status" + // is set to AOM_CODEC_OK here. + av1_compress_parallel_frames(ppi, &cpi_data); + status = AOM_CODEC_OK; + } else { + // No possibility of failures from this function and hence "status" is + // set to AOM_CODEC_OK here. + cpi = av1_get_parallel_frame_enc_data(ppi, &cpi_data); + status = AOM_CODEC_OK; + } + } + if (status == -1) break; + if (status != AOM_CODEC_OK) { + aom_internal_error_copy(&ppi->error, cpi->common.error); + } + if (ppi->num_fp_contexts > 0 && frame_is_intra_only(&cpi->common)) { + av1_init_sc_decisions(ppi); + } + + ppi->seq_params_locked = 1; + av1_post_encode_updates(cpi, &cpi_data); + +#if CONFIG_ENTROPY_STATS + if (ppi->cpi->oxcf.pass != 1 && !cpi->common.show_existing_frame) + av1_accumulate_frame_counts(&ppi->aggregate_fc, &cpi->counts); +#endif +#if CONFIG_INTERNAL_STATS + if (ppi->cpi->oxcf.pass != 1) { + ppi->total_time_compress_data += cpi->time_compress_data; + ppi->total_recode_hits += cpi->frame_recode_hits; + ppi->total_bytes += cpi->bytes; + for (int i = 0; i < MAX_MODES; i++) { + ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i]; + } + } +#endif // CONFIG_INTERNAL_STATS + + if (!cpi_data.frame_size) continue; + assert(cpi_data.cx_data != NULL && cpi_data.cx_data_sz != 0); + const int write_temporal_delimiter = + !cpi->common.spatial_layer_id && !ctx->pending_cx_data_sz; + + if (write_temporal_delimiter) { + uint32_t obu_header_size = 1; + const uint32_t obu_payload_size = 0; + const size_t length_field_size = + aom_uleb_size_in_bytes(obu_payload_size); + + const size_t move_offset = obu_header_size + length_field_size; + memmove(ctx->cx_data + move_offset, ctx->cx_data, cpi_data.frame_size); + obu_header_size = + av1_write_obu_header(&ppi->level_params, &cpi->frame_header_count, + OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data); + + // OBUs are preceded/succeeded by an unsigned leb128 coded integer. 
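+        // Write the (zero) payload-size field for the temporal delimiter OBU
+        // right after its header.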
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, + ctx->cx_data) != AOM_CODEC_OK) { + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); + } + + cpi_data.frame_size += + obu_header_size + obu_payload_size + length_field_size; + } + + if (ctx->oxcf.save_as_annexb) { + size_t curr_frame_size = cpi_data.frame_size; + if (av1_convert_sect5obus_to_annexb(cpi_data.cx_data, + &curr_frame_size) != AOM_CODEC_OK) { + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); + } + cpi_data.frame_size = curr_frame_size; + + // B_PRIME (add frame size) + const size_t length_field_size = + aom_uleb_size_in_bytes(cpi_data.frame_size); + memmove(cpi_data.cx_data + length_field_size, cpi_data.cx_data, + cpi_data.frame_size); + if (av1_write_uleb_obu_size(0, (uint32_t)cpi_data.frame_size, + cpi_data.cx_data) != AOM_CODEC_OK) { + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); + } + cpi_data.frame_size += length_field_size; + } + + ctx->pending_cx_data_sz += cpi_data.frame_size; + + cpi_data.cx_data += cpi_data.frame_size; + cpi_data.cx_data_sz -= cpi_data.frame_size; + + is_frame_visible = cpi->common.show_frame; + + has_no_show_keyframe |= + (!is_frame_visible && + cpi->common.current_frame.frame_type == KEY_FRAME); + } + if (is_frame_visible) { + // Add the frame packet to the list of returned packets. + aom_codec_cx_pkt_t pkt; + + // decrement frames_left counter + ppi->frames_left = AOMMAX(0, ppi->frames_left - 1); + if (ctx->oxcf.save_as_annexb) { + // B_PRIME (add TU size) + size_t tu_size = ctx->pending_cx_data_sz; + const size_t length_field_size = aom_uleb_size_in_bytes(tu_size); + memmove(ctx->cx_data + length_field_size, ctx->cx_data, tu_size); + if (av1_write_uleb_obu_size(0, (uint32_t)tu_size, ctx->cx_data) != + AOM_CODEC_OK) { + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); + } + ctx->pending_cx_data_sz += length_field_size; + } + + pkt.kind = AOM_CODEC_CX_FRAME_PKT; + + pkt.data.frame.buf = ctx->cx_data; + pkt.data.frame.sz = ctx->pending_cx_data_sz; + pkt.data.frame.partition_id = -1; + pkt.data.frame.vis_frame_size = cpi_data.frame_size; + + pkt.data.frame.pts = ticks_to_timebase_units(cpi_data.timestamp_ratio, + cpi_data.ts_frame_start) + + ctx->pts_offset; + pkt.data.frame.flags = get_frame_pkt_flags(cpi, cpi_data.lib_flags); + if (has_no_show_keyframe) { + // If one of the invisible frames in the packet is a keyframe, set + // the delayed random access point flag. 
+ pkt.data.frame.flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT; + } + pkt.data.frame.duration = (uint32_t)ticks_to_timebase_units( + cpi_data.timestamp_ratio, + cpi_data.ts_frame_end - cpi_data.ts_frame_start); + + aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); + + ctx->pending_cx_data_sz = 0; + } + } + + ppi->error.setjmp = 0; + return res; +} + +static const aom_codec_cx_pkt_t *encoder_get_cxdata(aom_codec_alg_priv_t *ctx, + aom_codec_iter_t *iter) { + return aom_codec_pkt_list_get(&ctx->pkt_list.head, iter); +} + +static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); + + if (frame != NULL) { + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + av1_set_reference_enc(ctx->ppi->cpi, frame->idx, &sd); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + if (ctx->ppi->cpi->oxcf.algo_cfg.skip_postproc_filtering) + return AOM_CODEC_INCAPABLE; + av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); + + if (frame != NULL) { + YV12_BUFFER_CONFIG sd; + + image2yuvconfig(&frame->img, &sd); + av1_copy_reference_enc(ctx->ppi->cpi, frame->idx, &sd); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + if (ctx->ppi->cpi->oxcf.algo_cfg.skip_postproc_filtering) + return AOM_CODEC_INCAPABLE; + av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); + + if (frame != NULL) { + YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->ppi->cpi->common, frame->idx); + if (fb == NULL) return AOM_CODEC_ERROR; + + yuvconfig2image(&frame->img, fb, NULL); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *const new_img = va_arg(args, aom_image_t *); + + if (new_img != NULL) { + YV12_BUFFER_CONFIG new_frame; + + if (av1_get_last_show_frame(ctx->ppi->cpi, &new_frame) == 0) { + yuvconfig2image(new_img, &new_frame, NULL); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *const new_img = va_arg(args, aom_image_t *); + + if (new_img != NULL) { + YV12_BUFFER_CONFIG new_frame; + + if (av1_get_last_show_frame(ctx->ppi->cpi, &new_frame) == 0) { + YV12_BUFFER_CONFIG sd; + image2yuvconfig(new_img, &sd); + return av1_copy_new_frame_enc(&ctx->ppi->cpi->common, &new_frame, &sd); + } else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) { + YV12_BUFFER_CONFIG sd; + + if (av1_get_preview_raw_frame(ctx->ppi->cpi, &sd) == 0) { + yuvconfig2image(&ctx->preview_img, &sd, NULL); + return &ctx->preview_img; + } else { + return NULL; + } +} + +static aom_codec_err_t ctrl_use_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + const int reference_flag = va_arg(args, int); + + av1_use_as_reference(&ctx->ppi->cpi->ext_flags.ref_frame_flags, + reference_flag); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_roi_map(aom_codec_alg_priv_t *ctx, + va_list args) { + (void)ctx; + (void)args; + + // TODO(yaowu): Need to re-implement and test for AV1. 
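+  // ROI maps are not yet supported for AV1; any request is rejected.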
+ return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_set_active_map(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_active_map_t *const map = va_arg(args, aom_active_map_t *); + + if (map) { + if (!av1_set_active_map(ctx->ppi->cpi, map->active_map, (int)map->rows, + (int)map->cols)) + return AOM_CODEC_OK; + else + return AOM_CODEC_INVALID_PARAM; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_active_map(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_active_map_t *const map = va_arg(args, aom_active_map_t *); + + if (map) { + if (!av1_get_active_map(ctx->ppi->cpi, map->active_map, (int)map->rows, + (int)map->cols)) + return AOM_CODEC_OK; + else + return AOM_CODEC_INVALID_PARAM; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *); + + if (mode) { + const int res = av1_set_internal_size( + &ctx->ppi->cpi->oxcf, &ctx->ppi->cpi->resize_pending_params, + mode->h_scaling_mode, mode->v_scaling_mode); + av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); + return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx, + va_list args) { + const int spatial_layer_id = va_arg(args, int); + if (spatial_layer_id >= MAX_NUM_SPATIAL_LAYERS) + return AOM_CODEC_INVALID_PARAM; + ctx->ppi->cpi->common.spatial_layer_id = spatial_layer_id; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx, + va_list args) { + const int number_spatial_layers = va_arg(args, int); + if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS) + return AOM_CODEC_INVALID_PARAM; + ctx->ppi->number_spatial_layers = number_spatial_layers; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_layer_id(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_svc_layer_id_t *const data = va_arg(args, aom_svc_layer_id_t *); + ctx->ppi->cpi->common.spatial_layer_id = data->spatial_layer_id; + ctx->ppi->cpi->common.temporal_layer_id = data->temporal_layer_id; + ctx->ppi->cpi->svc.spatial_layer_id = data->spatial_layer_id; + ctx->ppi->cpi->svc.temporal_layer_id = data->temporal_layer_id; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; + AV1_COMMON *const cm = &cpi->common; + AV1EncoderConfig *oxcf = &cpi->oxcf; + aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *); + int64_t target_bandwidth = 0; + ppi->number_spatial_layers = params->number_spatial_layers; + ppi->number_temporal_layers = params->number_temporal_layers; + cpi->svc.number_spatial_layers = params->number_spatial_layers; + cpi->svc.number_temporal_layers = params->number_temporal_layers; + if (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) { + unsigned int sl, tl; + ctx->ppi->use_svc = 1; + const int num_layers = + ppi->number_spatial_layers * ppi->number_temporal_layers; + for (int layer = 0; layer < num_layers; ++layer) { + if (params->max_quantizers[layer] > 63 || + params->min_quantizers[layer] < 0 || + params->min_quantizers[layer] > params->max_quantizers[layer]) { + return AOM_CODEC_INVALID_PARAM; + } + } + if (!av1_alloc_layer_context(cpi, num_layers)) return AOM_CODEC_MEM_ERROR; + + for (sl = 0; sl < 
ppi->number_spatial_layers; ++sl) { + for (tl = 0; tl < ppi->number_temporal_layers; ++tl) { + const int layer = LAYER_IDS_TO_IDX(sl, tl, ppi->number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + lc->max_q = params->max_quantizers[layer]; + lc->min_q = params->min_quantizers[layer]; + lc->scaling_factor_num = AOMMAX(1, params->scaling_factor_num[sl]); + lc->scaling_factor_den = AOMMAX(1, params->scaling_factor_den[sl]); + const int layer_target_bitrate = params->layer_target_bitrate[layer]; + if (layer_target_bitrate > INT_MAX / 1000) { + lc->layer_target_bitrate = INT_MAX; + } else { + lc->layer_target_bitrate = 1000 * layer_target_bitrate; + } + lc->framerate_factor = params->framerate_factor[tl]; + if (tl == ppi->number_temporal_layers - 1) + target_bandwidth += lc->layer_target_bitrate; + } + } + if (cm->current_frame.frame_number == 0) { + if (!cpi->ppi->seq_params_locked) { + SequenceHeader *const seq_params = &ppi->seq_params; + seq_params->operating_points_cnt_minus_1 = + ppi->number_spatial_layers * ppi->number_temporal_layers - 1; + av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1); + } + av1_init_layer_context(cpi); + } + oxcf->rc_cfg.target_bandwidth = target_bandwidth; + set_primary_rc_buffer_sizes(oxcf, cpi->ppi); + av1_update_layer_context_change_config(cpi, target_bandwidth); + check_reset_rc_flag(cpi); + } + av1_check_fpmt_config(ctx->ppi, &ctx->ppi->cpi->oxcf); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_COMP *const cpi = ctx->ppi->cpi; + aom_svc_ref_frame_config_t *const data = + va_arg(args, aom_svc_ref_frame_config_t *); + cpi->ppi->rtc_ref.set_ref_frame_config = 1; + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + cpi->ppi->rtc_ref.reference[i] = data->reference[i]; + cpi->ppi->rtc_ref.ref_idx[i] = data->ref_idx[i]; + } + for (unsigned int i = 0; i < REF_FRAMES; ++i) + cpi->ppi->rtc_ref.refresh[i] = data->refresh[i]; + cpi->svc.use_flexible_mode = 1; + cpi->svc.ksvc_fixed_mode = 0; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_svc_ref_frame_comp_pred( + aom_codec_alg_priv_t *ctx, va_list args) { + AV1_COMP *const cpi = ctx->ppi->cpi; + aom_svc_ref_frame_comp_pred_t *const data = + va_arg(args, aom_svc_ref_frame_comp_pred_t *); + cpi->ppi->rtc_ref.ref_frame_comp[0] = data->use_comp_pred[0]; + cpi->ppi->rtc_ref.ref_frame_comp[1] = data->use_comp_pred[1]; + cpi->ppi->rtc_ref.ref_frame_comp[2] = data->use_comp_pred[2]; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.content = CAST(AV1E_SET_TUNE_CONTENT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_cdf_update_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cdf_update_mode = CAST(AV1E_SET_CDF_UPDATE_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_color_primaries(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.color_primaries = CAST(AV1E_SET_COLOR_PRIMARIES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_transfer_characteristics( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.transfer_characteristics = + CAST(AV1E_SET_TRANSFER_CHARACTERISTICS, 
args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_matrix_coefficients(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.matrix_coefficients = CAST(AV1E_SET_MATRIX_COEFFICIENTS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_chroma_sample_position( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.chroma_sample_position = + CAST(AV1E_SET_CHROMA_SAMPLE_POSITION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_color_range(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.color_range = CAST(AV1E_SET_COLOR_RANGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_render_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + int *const render_size = va_arg(args, int *); + extra_cfg.render_width = render_size[0]; + extra_cfg.render_height = render_size[1]; + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.superblock_size = CAST(AV1E_SET_SUPERBLOCK_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, + const char *name, const char *value) { + if (ctx == NULL || name == NULL || value == NULL) + return AOM_CODEC_INVALID_PARAM; + struct av1_extracfg extra_cfg = ctx->extra_cfg; + // Used to mock the argv with just one string "--{name}={value}" + char *argv[2] = { NULL, "" }; + size_t len = strlen(name) + strlen(value) + 4; + char *const err_string = ctx->ppi->error.detail; + +#if __STDC_VERSION__ >= 201112L + // We use the keyword _Static_assert because clang-cl does not allow the + // convenience macro static_assert to be used in function scope. See + // https://bugs.llvm.org/show_bug.cgi?id=48904. 
+ _Static_assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN, + "The size of the err_msg buffer for arg_match_helper must be " + "at least ARG_ERR_MSG_MAX_LEN"); +#else + assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN); +#endif + + argv[0] = aom_malloc(len * sizeof(argv[1][0])); + if (!argv[0]) return AOM_CODEC_MEM_ERROR; + snprintf(argv[0], len, "--%s=%s", name, value); + struct arg arg; + aom_codec_err_t err = AOM_CODEC_OK; + + int match = 1; + if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_keyframe_filtering, + argv, err_string)) { + extra_cfg.enable_keyframe_filtering = + arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.min_gf_interval, argv, + err_string)) { + extra_cfg.min_gf_interval = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_gf_interval, argv, + err_string)) { + extra_cfg.max_gf_interval = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_min_pyr_height, + argv, err_string)) { + extra_cfg.gf_min_pyr_height = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_max_pyr_height, + argv, err_string)) { + extra_cfg.gf_max_pyr_height = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cpu_used_av1, argv, + err_string)) { + extra_cfg.cpu_used = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_altref, argv, + err_string)) { + extra_cfg.enable_auto_alt_ref = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.noise_sens, argv, + err_string)) { + extra_cfg.noise_sensitivity = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sharpness, argv, + err_string)) { + extra_cfg.sharpness = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.static_thresh, argv, + err_string)) { + extra_cfg.static_thresh = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.rowmtarg, argv, + err_string)) { + extra_cfg.row_mt = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.fpmtarg, argv, + err_string)) { + extra_cfg.fp_mt = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_cols, argv, + err_string)) { + extra_cfg.tile_columns = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_rows, argv, + err_string)) { + extra_cfg.tile_rows = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_tpl_model, + argv, err_string)) { + extra_cfg.enable_tpl_model = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.arnr_maxframes, argv, + err_string)) { + extra_cfg.arnr_max_frames = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.arnr_strength, argv, + err_string)) { + extra_cfg.arnr_strength = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tune_metric, argv, + err_string)) { + extra_cfg.tuning = arg_parse_enum_helper(&arg, err_string); + } +#if CONFIG_TUNE_VMAF + else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argv, + 
err_string)) { + err = allocate_and_set_string(value, default_extra_cfg.vmaf_model_path, + &extra_cfg.vmaf_model_path, err_string); + } +#endif + else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.partition_info_path, + argv, err_string)) { + err = allocate_and_set_string(value, default_extra_cfg.partition_info_path, + &extra_cfg.partition_info_path, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.enable_rate_guide_deltaq, + argv, err_string)) { + extra_cfg.enable_rate_guide_deltaq = + arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.rate_distribution_info, + argv, err_string)) { + err = + allocate_and_set_string(value, default_extra_cfg.rate_distribution_info, + &extra_cfg.rate_distribution_info, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dist_metric, argv, + err_string)) { + extra_cfg.dist_metric = arg_parse_enum_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv, + err_string)) { + extra_cfg.cq_level = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_intra_rate_pct, + argv, err_string)) { + extra_cfg.rc_max_intra_bitrate_pct = + arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_inter_rate_pct, + argv, err_string)) { + extra_cfg.rc_max_inter_bitrate_pct = + arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.gf_cbr_boost_pct, + argv, err_string)) { + extra_cfg.gf_cbr_boost_pct = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.lossless, argv, + err_string)) { + extra_cfg.lossless = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_cdef, argv, + err_string)) { + extra_cfg.enable_cdef = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_restoration, + argv, err_string)) { + extra_cfg.enable_restoration = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.force_video_mode, + argv, err_string)) { + extra_cfg.force_video_mode = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_obmc, argv, + err_string)) { + extra_cfg.enable_obmc = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.disable_trellis_quant, + argv, err_string)) { + extra_cfg.disable_trellis_quant = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_qm, argv, + err_string)) { + extra_cfg.enable_qm = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.qm_max, argv, + err_string)) { + extra_cfg.qm_max = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.qm_min, argv, + err_string)) { + extra_cfg.qm_min = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.num_tg, argv, + err_string)) { + extra_cfg.num_tg = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mtu_size, argv, + err_string)) { + extra_cfg.mtu_size = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.timing_info, argv, + err_string)) { + extra_cfg.timing_info_type = 
arg_parse_enum_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.frame_parallel_decoding, + argv, err_string)) { + extra_cfg.frame_parallel_decoding_mode = + arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dual_filter, + argv, err_string)) { + extra_cfg.enable_dual_filter = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_chroma_deltaq, + argv, err_string)) { + extra_cfg.enable_chroma_deltaq = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.aq_mode, argv, + err_string)) { + extra_cfg.aq_mode = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltaq_mode, argv, + err_string)) { + extra_cfg.deltaq_mode = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltaq_strength, argv, + err_string)) { + extra_cfg.deltaq_strength = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.deltalf_mode, argv, + err_string)) { + extra_cfg.deltalf_mode = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.frame_periodic_boost, + argv, err_string)) { + extra_cfg.frame_periodic_boost = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tune_content, argv, + err_string)) { + extra_cfg.content = arg_parse_enum_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.input_color_primaries, + argv, err_string)) { + extra_cfg.color_primaries = arg_parse_enum_helper(&arg, err_string); + } else if (arg_match_helper( + &arg, &g_av1_codec_arg_defs.input_transfer_characteristics, + argv, err_string)) { + extra_cfg.transfer_characteristics = + arg_parse_enum_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.input_matrix_coefficients, + argv, err_string)) { + extra_cfg.matrix_coefficients = arg_parse_enum_helper(&arg, err_string); + } else if (arg_match_helper( + &arg, &g_av1_codec_arg_defs.input_chroma_sample_position, argv, + err_string)) { + extra_cfg.chroma_sample_position = arg_parse_enum_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.superblock_size, argv, + err_string)) { + extra_cfg.superblock_size = arg_parse_enum_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.error_resilient_mode, + argv, err_string)) { + extra_cfg.error_resilient_mode = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sframe_mode, argv, + err_string)) { + extra_cfg.s_frame_mode = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.film_grain_test, argv, + err_string)) { + extra_cfg.film_grain_test_vector = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.film_grain_table, + argv, err_string)) { + if (value == NULL) { + // this parameter allows NULL as its value + extra_cfg.film_grain_table_filename = value; + } else { + err = allocate_and_set_string( + value, default_extra_cfg.film_grain_table_filename, + &extra_cfg.film_grain_table_filename, err_string); + } + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cdf_update_mode, argv, + err_string)) { + extra_cfg.cdf_update_mode = arg_parse_int_helper(&arg, err_string); + } else if 
(arg_match_helper(&arg, + &g_av1_codec_arg_defs.enable_rect_partitions, + argv, err_string)) { + extra_cfg.enable_rect_partitions = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_ab_partitions, + argv, err_string)) { + extra_cfg.enable_ab_partitions = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.enable_1to4_partitions, + argv, err_string)) { + extra_cfg.enable_1to4_partitions = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.min_partition_size, + argv, err_string)) { + extra_cfg.min_partition_size = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_partition_size, + argv, err_string)) { + extra_cfg.max_partition_size = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.enable_intra_edge_filter, + argv, err_string)) { + extra_cfg.enable_intra_edge_filter = + arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_order_hint, + argv, err_string)) { + extra_cfg.enable_order_hint = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_tx64, argv, + err_string)) { + extra_cfg.enable_tx64 = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_flip_idtx, + argv, err_string)) { + extra_cfg.enable_flip_idtx = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_rect_tx, argv, + err_string)) { + extra_cfg.enable_rect_tx = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dist_wtd_comp, + argv, err_string)) { + extra_cfg.enable_dist_wtd_comp = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_reference_frames, + argv, err_string)) { + extra_cfg.max_reference_frames = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.reduced_reference_set, + argv, err_string)) { + extra_cfg.enable_reduced_reference_set = + arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_ref_frame_mvs, + argv, err_string)) { + extra_cfg.enable_ref_frame_mvs = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_masked_comp, + argv, err_string)) { + extra_cfg.enable_masked_comp = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_onesided_comp, + argv, err_string)) { + extra_cfg.enable_onesided_comp = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.enable_interintra_comp, + argv, err_string)) { + extra_cfg.enable_interintra_comp = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.enable_smooth_interintra, + argv, err_string)) { + extra_cfg.enable_smooth_interintra = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_diff_wtd_comp, + argv, err_string)) { + extra_cfg.enable_diff_wtd_comp = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.enable_interinter_wedge, + argv, err_string)) { + extra_cfg.enable_interinter_wedge = arg_parse_int_helper(&arg, err_string); + 
} else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.enable_interintra_wedge, + argv, err_string)) { + extra_cfg.enable_interintra_wedge = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_global_motion, + argv, err_string)) { + extra_cfg.enable_global_motion = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_warped_motion, + argv, err_string)) { + extra_cfg.enable_warped_motion = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_filter_intra, + argv, err_string)) { + extra_cfg.enable_filter_intra = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_smooth_intra, + argv, err_string)) { + extra_cfg.enable_smooth_intra = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_paeth_intra, + argv, err_string)) { + extra_cfg.enable_paeth_intra = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_cfl_intra, + argv, err_string)) { + extra_cfg.enable_cfl_intra = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.enable_directional_intra, + argv, err_string)) { + extra_cfg.enable_directional_intra = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_diagonal_intra, + argv, err_string)) { + extra_cfg.enable_diagonal_intra = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_overlay, argv, + err_string)) { + extra_cfg.enable_overlay = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_palette, argv, + err_string)) { + extra_cfg.enable_palette = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_intrabc, argv, + err_string)) { + extra_cfg.enable_intrabc = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_angle_delta, + argv, err_string)) { + extra_cfg.enable_angle_delta = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.reduced_tx_type_set, + argv, err_string)) { + extra_cfg.reduced_tx_type_set = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.use_intra_dct_only, + argv, err_string)) { + extra_cfg.use_intra_dct_only = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.use_inter_dct_only, + argv, err_string)) { + extra_cfg.use_inter_dct_only = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.use_intra_default_tx_only, + argv, err_string)) { + extra_cfg.use_intra_default_tx_only = + arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.quant_b_adapt, argv, + err_string)) { + extra_cfg.quant_b_adapt = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.vbr_corpus_complexity_lap, + argv, err_string)) { + extra_cfg.vbr_corpus_complexity_lap = + arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.set_tier_mask, argv, + err_string)) { + extra_cfg.tier_mask = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, 
&g_av1_codec_arg_defs.set_min_cr, argv, + err_string)) { + extra_cfg.min_cr = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.coeff_cost_upd_freq, + argv, err_string)) { + extra_cfg.coeff_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mode_cost_upd_freq, + argv, err_string)) { + extra_cfg.mode_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mv_cost_upd_freq, + argv, err_string)) { + extra_cfg.mv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dv_cost_upd_freq, + argv, err_string)) { + extra_cfg.dv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); + } +#if CONFIG_DENOISE + else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_noise_level, + argv, err_string)) { + extra_cfg.noise_level = + (float)arg_parse_int_helper(&arg, err_string) / 10.0f; + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_block_size, + argv, err_string)) { + extra_cfg.noise_block_size = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_dnl_denoising, + argv, err_string)) { + extra_cfg.enable_dnl_denoising = arg_parse_uint_helper(&arg, err_string); + } +#endif + else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.target_seq_level_idx, + argv, err_string)) { + const int val = arg_parse_int_helper(&arg, err_string); + const int level = val % 100; + const int operating_point_idx = val / 100; + if (operating_point_idx < 0 || + operating_point_idx >= MAX_NUM_OPERATING_POINTS) { + snprintf(err_string, ARG_ERR_MSG_MAX_LEN, + "Invalid operating point index: %d", operating_point_idx); + err = AOM_CODEC_INVALID_PARAM; + } else { + extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level; + } + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.input_chroma_subsampling_x, + argv, err_string)) { + extra_cfg.chroma_subsampling_x = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.input_chroma_subsampling_y, + argv, err_string)) { + extra_cfg.chroma_subsampling_y = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.passes, argv, + err_string)) { + extra_cfg.passes = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.fwd_kf_dist, argv, + err_string)) { + extra_cfg.fwd_kf_dist = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.two_pass_output, argv, + err_string)) { + err = allocate_and_set_string(value, default_extra_cfg.two_pass_output, + &extra_cfg.two_pass_output, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.second_pass_log, argv, + err_string)) { + err = allocate_and_set_string(value, default_extra_cfg.second_pass_log, + &extra_cfg.second_pass_log, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.loopfilter_control, + argv, err_string)) { + extra_cfg.loopfilter_control = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.auto_intra_tools_off, + argv, err_string)) { + extra_cfg.auto_intra_tools_off = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, + &g_av1_codec_arg_defs.strict_level_conformance, + argv, err_string)) { + extra_cfg.strict_level_conformance = 
arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.sb_qp_sweep, argv, + err_string)) { + extra_cfg.sb_qp_sweep = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.kf_max_pyr_height, + argv, err_string)) { + extra_cfg.kf_max_pyr_height = arg_parse_int_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_width, argv, + err_string)) { + ctx->cfg.tile_width_count = arg_parse_list_helper( + &arg, ctx->cfg.tile_widths, MAX_TILE_WIDTHS, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.tile_height, argv, + err_string)) { + ctx->cfg.tile_height_count = arg_parse_list_helper( + &arg, ctx->cfg.tile_heights, MAX_TILE_HEIGHTS, err_string); + } else { + match = 0; + snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Cannot find aom option %s", + name); + } + aom_free(argv[0]); + + if (err != AOM_CODEC_OK) { + ctx->base.err_detail = err_string; + return err; + } + + if (strlen(err_string) != 0) { + ctx->base.err_detail = err_string; + return AOM_CODEC_INVALID_PARAM; + } + + ctx->base.err_detail = NULL; + + if (!match) { + return AOM_CODEC_INVALID_PARAM; + } + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + return av1_get_seq_level_idx(&ctx->ppi->seq_params, &ctx->ppi->level_params, + arg); +} + +static aom_codec_err_t ctrl_get_target_seq_level_idx(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + return av1_get_target_seq_level_idx(&ctx->ppi->seq_params, + &ctx->ppi->level_params, arg); +} + +static aom_codec_err_t ctrl_get_num_operating_points(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = ctx->ppi->seq_params.operating_points_cnt_minus_1 + 1; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_luma_cdef_strength(aom_codec_alg_priv_t *ctx, + va_list args) { + int *arg = va_arg(args, int *); + AV1_COMMON const *cm = &ctx->ppi->cpi->common; + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + memcpy(arg, cm->cdef_info.cdef_strengths, CDEF_MAX_STRENGTHS * sizeof(*arg)); + + return AOM_CODEC_OK; +} + +static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { + { AV1_COPY_REFERENCE, ctrl_copy_reference }, + { AOME_USE_REFERENCE, ctrl_use_reference }, + + // Setters + { AV1_SET_REFERENCE, ctrl_set_reference }, + { AOME_SET_ROI_MAP, ctrl_set_roi_map }, + { AOME_SET_ACTIVEMAP, ctrl_set_active_map }, + { AOME_SET_SCALEMODE, ctrl_set_scale_mode }, + { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id }, + { AOME_SET_CPUUSED, ctrl_set_cpuused }, + { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref }, + { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref }, + { AOME_SET_SHARPNESS, ctrl_set_sharpness }, + { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh }, + { AV1E_SET_ROW_MT, ctrl_set_row_mt }, + { AV1E_SET_FP_MT, ctrl_set_fp_mt }, + { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns }, + { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows }, + { AV1E_SET_ENABLE_TPL_MODEL, ctrl_set_enable_tpl_model }, + { AV1E_SET_ENABLE_KEYFRAME_FILTERING, ctrl_set_enable_keyframe_filtering }, + { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames }, + { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength }, + { 
AOME_SET_TUNING, ctrl_set_tuning }, + { AOME_SET_CQ_LEVEL, ctrl_set_cq_level }, + { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct }, + { AOME_SET_NUMBER_SPATIAL_LAYERS, ctrl_set_number_spatial_layers }, + { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct }, + { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, + { AV1E_SET_LOSSLESS, ctrl_set_lossless }, + { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef }, + { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration }, + { AV1E_SET_FORCE_VIDEO_MODE, ctrl_set_force_video_mode }, + { AV1E_SET_ENABLE_OBMC, ctrl_set_enable_obmc }, + { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant }, + { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm }, + { AV1E_SET_QM_Y, ctrl_set_qm_y }, + { AV1E_SET_QM_U, ctrl_set_qm_u }, + { AV1E_SET_QM_V, ctrl_set_qm_v }, + { AV1E_SET_QM_MIN, ctrl_set_qm_min }, + { AV1E_SET_QM_MAX, ctrl_set_qm_max }, + { AV1E_SET_NUM_TG, ctrl_set_num_tg }, + { AV1E_SET_MTU, ctrl_set_mtu }, + { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type }, + { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode }, + { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode }, + { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode }, + { AV1E_SET_ENABLE_RECT_PARTITIONS, ctrl_set_enable_rect_partitions }, + { AV1E_SET_ENABLE_AB_PARTITIONS, ctrl_set_enable_ab_partitions }, + { AV1E_SET_ENABLE_1TO4_PARTITIONS, ctrl_set_enable_1to4_partitions }, + { AV1E_SET_MIN_PARTITION_SIZE, ctrl_set_min_partition_size }, + { AV1E_SET_MAX_PARTITION_SIZE, ctrl_set_max_partition_size }, + { AV1E_SET_ENABLE_DUAL_FILTER, ctrl_set_enable_dual_filter }, + { AV1E_SET_ENABLE_CHROMA_DELTAQ, ctrl_set_enable_chroma_deltaq }, + { AV1E_SET_ENABLE_INTRA_EDGE_FILTER, ctrl_set_enable_intra_edge_filter }, + { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint }, + { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 }, + { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx }, + { AV1E_SET_ENABLE_RECT_TX, ctrl_set_enable_rect_tx }, + { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp }, + { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames }, + { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set }, + { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs }, + { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs }, + { AV1E_SET_ENABLE_MASKED_COMP, ctrl_set_enable_masked_comp }, + { AV1E_SET_ENABLE_ONESIDED_COMP, ctrl_set_enable_onesided_comp }, + { AV1E_SET_ENABLE_INTERINTRA_COMP, ctrl_set_enable_interintra_comp }, + { AV1E_SET_ENABLE_SMOOTH_INTERINTRA, ctrl_set_enable_smooth_interintra }, + { AV1E_SET_ENABLE_DIFF_WTD_COMP, ctrl_set_enable_diff_wtd_comp }, + { AV1E_SET_ENABLE_INTERINTER_WEDGE, ctrl_set_enable_interinter_wedge }, + { AV1E_SET_ENABLE_INTERINTRA_WEDGE, ctrl_set_enable_interintra_wedge }, + { AV1E_SET_ENABLE_GLOBAL_MOTION, ctrl_set_enable_global_motion }, + { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion }, + { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion }, + { AV1E_SET_ENABLE_FILTER_INTRA, ctrl_set_enable_filter_intra }, + { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra }, + { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra }, + { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra }, + { AV1E_SET_ENABLE_DIRECTIONAL_INTRA, ctrl_set_enable_directional_intra }, + { AV1E_SET_ENABLE_DIAGONAL_INTRA, ctrl_set_enable_diagonal_intra }, + { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres }, + { 
AV1E_SET_ENABLE_OVERLAY, ctrl_set_enable_overlay }, + { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette }, + { AV1E_SET_ENABLE_INTRABC, ctrl_set_enable_intrabc }, + { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta }, + { AV1E_SET_AQ_MODE, ctrl_set_aq_mode }, + { AV1E_SET_REDUCED_TX_TYPE_SET, ctrl_set_reduced_tx_type_set }, + { AV1E_SET_INTRA_DCT_ONLY, ctrl_set_intra_dct_only }, + { AV1E_SET_INTER_DCT_ONLY, ctrl_set_inter_dct_only }, + { AV1E_SET_INTRA_DEFAULT_TX_ONLY, ctrl_set_intra_default_tx_only }, + { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt }, + { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq }, + { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq }, + { AV1E_SET_MV_COST_UPD_FREQ, ctrl_set_mv_cost_upd_freq }, + { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode }, + { AV1E_SET_DELTAQ_STRENGTH, ctrl_set_deltaq_strength }, + { AV1E_SET_DELTALF_MODE, ctrl_set_deltalf_mode }, + { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost }, + { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content }, + { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode }, + { AV1E_SET_COLOR_PRIMARIES, ctrl_set_color_primaries }, + { AV1E_SET_TRANSFER_CHARACTERISTICS, ctrl_set_transfer_characteristics }, + { AV1E_SET_MATRIX_COEFFICIENTS, ctrl_set_matrix_coefficients }, + { AV1E_SET_CHROMA_SAMPLE_POSITION, ctrl_set_chroma_sample_position }, + { AV1E_SET_COLOR_RANGE, ctrl_set_color_range }, + { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity }, + { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval }, + { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval }, + { AV1E_SET_GF_MIN_PYRAMID_HEIGHT, ctrl_set_gf_min_pyr_height }, + { AV1E_SET_GF_MAX_PYRAMID_HEIGHT, ctrl_set_gf_max_pyr_height }, + { AV1E_SET_RENDER_SIZE, ctrl_set_render_size }, + { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size }, + { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding }, + { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path }, + { AV1E_SET_PARTITION_INFO_PATH, ctrl_set_partition_info_path }, + { AV1E_ENABLE_RATE_GUIDE_DELTAQ, ctrl_enable_rate_guide_deltaq }, + { AV1E_SET_RATE_DISTRIBUTION_INFO, ctrl_set_rate_distribution_info }, + { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector }, + { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table }, + { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level }, + { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size }, + { AV1E_SET_ENABLE_DNL_DENOISING, ctrl_set_enable_dnl_denoising }, + { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, + { AV1E_SET_FP_MT_UNIT_TEST, ctrl_enable_fpmt_unit_test }, + { AV1E_ENABLE_EXT_TILE_DEBUG, ctrl_enable_ext_tile_debug }, + { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx }, + { AV1E_SET_TIER_MASK, ctrl_set_tier_mask }, + { AV1E_SET_MIN_CR, ctrl_set_min_cr }, + { AV1E_SET_SVC_LAYER_ID, ctrl_set_layer_id }, + { AV1E_SET_SVC_PARAMS, ctrl_set_svc_params }, + { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config }, + { AV1E_SET_SVC_REF_FRAME_COMP_PRED, ctrl_set_svc_ref_frame_comp_pred }, + { AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, ctrl_set_vbr_corpus_complexity_lap }, + { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test }, + { AV1E_ENABLE_SB_QP_SWEEP, ctrl_enable_sb_qp_sweep }, + { AV1E_SET_DV_COST_UPD_FREQ, ctrl_set_dv_cost_upd_freq }, + { AV1E_SET_EXTERNAL_PARTITION, ctrl_set_external_partition }, + { AV1E_SET_ENABLE_TX_SIZE_SEARCH, ctrl_set_enable_tx_size_search }, + { AV1E_SET_LOOPFILTER_CONTROL, 
ctrl_set_loopfilter_control }, + { AV1E_SET_SKIP_POSTPROC_FILTERING, ctrl_set_skip_postproc_filtering }, + { AV1E_SET_AUTO_INTRA_TOOLS_OFF, ctrl_set_auto_intra_tools_off }, + { AV1E_SET_RTC_EXTERNAL_RC, ctrl_set_rtc_external_rc }, + { AV1E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass }, + { AV1E_SET_BITRATE_ONE_PASS_CBR, ctrl_set_bitrate_one_pass_cbr }, + { AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, ctrl_set_max_consec_frame_drop_cbr }, + { AV1E_SET_SVC_FRAME_DROP_MODE, ctrl_set_svc_frame_drop_mode }, + + // Getters + { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, + { AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 }, + { AOME_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level }, + { AV1_GET_REFERENCE, ctrl_get_reference }, + { AV1E_GET_ACTIVEMAP, ctrl_get_active_map }, + { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image }, + { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, + { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x }, + { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y }, + { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx }, + { AV1E_GET_BASELINE_GF_INTERVAL, ctrl_get_baseline_gf_interval }, + { AV1E_GET_TARGET_SEQ_LEVEL_IDX, ctrl_get_target_seq_level_idx }, + { AV1E_GET_NUM_OPERATING_POINTS, ctrl_get_num_operating_points }, + { AV1E_GET_LUMA_CDEF_STRENGTH, ctrl_get_luma_cdef_strength }, + + CTRL_MAP_END, +}; + +static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { +#if !CONFIG_REALTIME_ONLY + { + // NOLINT + AOM_USAGE_GOOD_QUALITY, // g_usage - non-realtime usage + 0, // g_threads + 0, // g_profile + + 320, // g_w + 240, // g_h + 0, // g_limit + 0, // g_forced_max_frame_width + 0, // g_forced_max_frame_height + AOM_BITS_8, // g_bit_depth + 8, // g_input_bit_depth + + { 1, 30 }, // g_timebase + + 0, // g_error_resilient + + AOM_RC_ONE_PASS, // g_pass + + 35, // g_lag_in_frames + + 0, // rc_dropframe_thresh + RESIZE_NONE, // rc_resize_mode + SCALE_NUMERATOR, // rc_resize_denominator + SCALE_NUMERATOR, // rc_resize_kf_denominator + + AOM_SUPERRES_NONE, // rc_superres_mode + SCALE_NUMERATOR, // rc_superres_denominator + SCALE_NUMERATOR, // rc_superres_kf_denominator + 63, // rc_superres_qthresh + 32, // rc_superres_kf_qthresh + + AOM_VBR, // rc_end_usage + { NULL, 0 }, // rc_twopass_stats_in + { NULL, 0 }, // rc_firstpass_mb_stats_in + 256, // rc_target_bitrate + 0, // rc_min_quantizer + 63, // rc_max_quantizer + 25, // rc_undershoot_pct + 25, // rc_overshoot_pct + + 6000, // rc_max_buffer_size + 4000, // rc_buffer_initial_size + 5000, // rc_buffer_optimal_size + + 50, // rc_two_pass_vbrbias + 0, // rc_two_pass_vbrmin_section + 2000, // rc_two_pass_vbrmax_section + + // keyframing settings (kf) + 0, // fwd_kf_enabled + AOM_KF_AUTO, // kf_mode + 0, // kf_min_dist + 9999, // kf_max_dist + 0, // sframe_dist + 1, // sframe_mode + 0, // large_scale_tile + 0, // monochrome + 0, // full_still_picture_hdr + 0, // save_as_annexb + 0, // tile_width_count + 0, // tile_height_count + { 0 }, // tile_widths + { 0 }, // tile_heights + 0, // use_fixed_qp_offsets + { -1, -1, -1, -1, -1 }, // fixed_qp_offsets + { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + }, +#endif // !CONFIG_REALTIME_ONLY + { + // NOLINT + AOM_USAGE_REALTIME, // g_usage - real-time usage + 0, // g_threads + 0, // g_profile + + 320, // g_w + 240, // g_h + 0, // g_limit + 0, // g_forced_max_frame_width + 0, // g_forced_max_frame_height + AOM_BITS_8, // g_bit_depth + 8, // g_input_bit_depth + + { 1, 30 }, // g_timebase 
+ + 0, // g_error_resilient + + AOM_RC_ONE_PASS, // g_pass + + 0, // g_lag_in_frames + + 0, // rc_dropframe_thresh + RESIZE_NONE, // rc_resize_mode + SCALE_NUMERATOR, // rc_resize_denominator + SCALE_NUMERATOR, // rc_resize_kf_denominator + + AOM_SUPERRES_NONE, // rc_superres_mode + SCALE_NUMERATOR, // rc_superres_denominator + SCALE_NUMERATOR, // rc_superres_kf_denominator + 63, // rc_superres_qthresh + 32, // rc_superres_kf_qthresh + + AOM_CBR, // rc_end_usage + { NULL, 0 }, // rc_twopass_stats_in + { NULL, 0 }, // rc_firstpass_mb_stats_in + 256, // rc_target_bitrate + 0, // rc_min_quantizer + 63, // rc_max_quantizer + 25, // rc_undershoot_pct + 25, // rc_overshoot_pct + + 6000, // rc_max_buffer_size + 4000, // rc_buffer_initial_size + 5000, // rc_buffer_optimal_size + + 50, // rc_two_pass_vbrbias + 0, // rc_two_pass_vbrmin_section + 2000, // rc_two_pass_vbrmax_section + + // keyframing settings (kf) + 0, // fwd_kf_enabled + AOM_KF_AUTO, // kf_mode + 0, // kf_min_dist + 9999, // kf_max_dist + 0, // sframe_dist + 1, // sframe_mode + 0, // large_scale_tile + 0, // monochrome + 0, // full_still_picture_hdr + 0, // save_as_annexb + 0, // tile_width_count + 0, // tile_height_count + { 0 }, // tile_widths + { 0 }, // tile_heights + 0, // use_fixed_qp_offsets + { -1, -1, -1, -1, -1 }, // fixed_qp_offsets + { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + }, +#if !CONFIG_REALTIME_ONLY + { + // NOLINT + AOM_USAGE_ALL_INTRA, // g_usage - all intra usage + 0, // g_threads + 0, // g_profile + + 320, // g_w + 240, // g_h + 0, // g_limit + 0, // g_forced_max_frame_width + 0, // g_forced_max_frame_height + AOM_BITS_8, // g_bit_depth + 8, // g_input_bit_depth + + { 1, 30 }, // g_timebase + + 0, // g_error_resilient + + AOM_RC_ONE_PASS, // g_pass + + 0, // g_lag_in_frames + + 0, // rc_dropframe_thresh + RESIZE_NONE, // rc_resize_mode + SCALE_NUMERATOR, // rc_resize_denominator + SCALE_NUMERATOR, // rc_resize_kf_denominator + + AOM_SUPERRES_NONE, // rc_superres_mode + SCALE_NUMERATOR, // rc_superres_denominator + SCALE_NUMERATOR, // rc_superres_kf_denominator + 63, // rc_superres_qthresh + 32, // rc_superres_kf_qthresh + + AOM_Q, // rc_end_usage + { NULL, 0 }, // rc_twopass_stats_in + { NULL, 0 }, // rc_firstpass_mb_stats_in + 256, // rc_target_bitrate + 0, // rc_min_quantizer + 63, // rc_max_quantizer + 25, // rc_undershoot_pct + 25, // rc_overshoot_pct + + 6000, // rc_max_buffer_size + 4000, // rc_buffer_initial_size + 5000, // rc_buffer_optimal_size + + 50, // rc_two_pass_vbrbias + 0, // rc_two_pass_vbrmin_section + 2000, // rc_two_pass_vbrmax_section + + // keyframing settings (kf) + 0, // fwd_kf_enabled + AOM_KF_DISABLED, // kf_mode + 0, // kf_min_dist + 0, // kf_max_dist + 0, // sframe_dist + 1, // sframe_mode + 0, // large_scale_tile + 0, // monochrome + 0, // full_still_picture_hdr + 0, // save_as_annexb + 0, // tile_width_count + 0, // tile_height_count + { 0 }, // tile_widths + { 0 }, // tile_heights + 0, // use_fixed_qp_offsets + { -1, -1, -1, -1, -1 }, // fixed_qp_offsets + { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + }, +#endif // !CONFIG_REALTIME_ONLY +}; + +// This data structure and function are exported in aom/aomcx.h +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +aom_codec_iface_t aom_codec_av1_cx_algo = { + "AOMedia Project AV1 Encoder" VERSION_STRING, + AOM_CODEC_INTERNAL_ABI_VERSION, + (CONFIG_AV1_HIGHBITDEPTH ? 
AOM_CODEC_CAP_HIGHBITDEPTH : 0) | + AOM_CODEC_CAP_ENCODER | AOM_CODEC_CAP_PSNR, // aom_codec_caps_t + encoder_init, // aom_codec_init_fn_t + encoder_destroy, // aom_codec_destroy_fn_t + encoder_ctrl_maps, // aom_codec_ctrl_fn_map_t + { + // NOLINT + NULL, // aom_codec_peek_si_fn_t + NULL, // aom_codec_get_si_fn_t + NULL, // aom_codec_decode_fn_t + NULL, // aom_codec_get_frame_fn_t + NULL // aom_codec_set_fb_fn_t + }, + { + // NOLINT + NELEMENTS(encoder_usage_cfg), // cfg_count + encoder_usage_cfg, // aom_codec_enc_cfg_t + encoder_encode, // aom_codec_encode_fn_t + encoder_get_cxdata, // aom_codec_get_cx_data_fn_t + encoder_set_config, // aom_codec_enc_config_set_fn_t + encoder_get_global_headers, // aom_codec_get_global_headers_fn_t + encoder_get_preview // aom_codec_get_preview_frame_fn_t + }, + encoder_set_option // aom_codec_set_option_fn_t +}; + +aom_codec_iface_t *aom_codec_av1_cx(void) { return &aom_codec_av1_cx_algo; } diff --git a/third_party/aom/av1/av1_cx_iface.h b/third_party/aom/av1/av1_cx_iface.h new file mode 100644 index 0000000000..b2a7005ea5 --- /dev/null +++ b/third_party/aom/av1/av1_cx_iface.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_AV1_CX_IFACE_H_ +#define AOM_AV1_AV1_CX_IFACE_H_ +#include "av1/encoder/encoder.h" +#include "aom/aom_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +AV1EncoderConfig av1_get_encoder_config(const aom_codec_enc_cfg_t *cfg); + +aom_codec_err_t av1_create_context_and_bufferpool(AV1_PRIMARY *ppi, + AV1_COMP **p_cpi, + BufferPool **p_buffer_pool, + const AV1EncoderConfig *oxcf, + COMPRESSOR_STAGE stage, + int lap_lag_in_frames); + +void av1_destroy_context_and_bufferpool(AV1_COMP *cpi, + BufferPool **p_buffer_pool); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_AV1_CX_IFACE_H_ diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c new file mode 100644 index 0000000000..3d7e132ab8 --- /dev/null +++ b/third_party/aom/av1/av1_dx_iface.c @@ -0,0 +1,1777 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdlib.h> +#include <string.h> + +#include "config/aom_config.h" +#include "config/aom_version.h" + +#include "aom/internal/aom_codec_internal.h" +#include "aom/internal/aom_image_internal.h" +#include "aom/aomdx.h" +#include "aom/aom_decoder.h" +#include "aom_dsp/bitreader_buffer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem_ops.h" +#include "aom_util/aom_thread.h" + +#include "av1/common/alloccommon.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/enums.h" +#include "av1/common/obu_util.h" + +#include "av1/decoder/decoder.h" +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/grain_synthesis.h" +#include "av1/decoder/obu.h" + +#include "av1/av1_iface_common.h" + +struct aom_codec_alg_priv { + aom_codec_priv_t base; + aom_codec_dec_cfg_t cfg; + aom_codec_stream_info_t si; + aom_image_t img; + int img_avail; + int flushed; + int invert_tile_order; + RefCntBuffer *last_show_frame; // Last output frame buffer + int byte_alignment; + int skip_loop_filter; + int skip_film_grain; + int decode_tile_row; + int decode_tile_col; + unsigned int tile_mode; + unsigned int ext_tile_debug; + unsigned int row_mt; + EXTERNAL_REFERENCES ext_refs; + unsigned int is_annexb; + int operating_point; + int output_all_layers; + + AVxWorker *frame_worker; + + aom_image_t image_with_grain; + aom_codec_frame_buffer_t grain_image_frame_buffers[MAX_NUM_SPATIAL_LAYERS]; + size_t num_grain_image_frame_buffers; + int need_resync; // wait for key/intra-only frame + // BufferPool that holds all reference frames. Shared by all the FrameWorkers. + BufferPool *buffer_pool; + + // External frame buffer info to save for AV1 common. + void *ext_priv; // Private data associated with the external frame buffers. + aom_get_frame_buffer_cb_fn_t get_ext_fb_cb; + aom_release_frame_buffer_cb_fn_t release_ext_fb_cb; + +#if CONFIG_INSPECTION + aom_inspect_cb inspect_cb; + void *inspect_ctx; +#endif +}; + +static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx) { + // This function only allocates space for the aom_codec_alg_priv_t + // structure. More memory may be required at the time the stream + // information becomes known. + if (!ctx->priv) { + aom_codec_alg_priv_t *const priv = + (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv)); + if (priv == NULL) return AOM_CODEC_MEM_ERROR; + + ctx->priv = (aom_codec_priv_t *)priv; + ctx->priv->init_flags = ctx->init_flags; + priv->flushed = 0; + + // TODO(tdaede): this should not be exposed to the API + priv->cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING; + if (ctx->config.dec) { + priv->cfg = *ctx->config.dec; + ctx->config.dec = &priv->cfg; + } + priv->num_grain_image_frame_buffers = 0; + // Turn row_mt on by default. + priv->row_mt = 1; + + // Turn on normal tile coding mode by default. + // 0 is for normal tile coding mode, and 1 is for large scale tile coding + // mode (refer to lightfield example). 
+ priv->tile_mode = 0; + priv->decode_tile_row = -1; + priv->decode_tile_col = -1; + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) { + if (ctx->frame_worker != NULL) { + AVxWorker *const worker = ctx->frame_worker; + aom_get_worker_interface()->end(worker); + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + if (frame_worker_data != NULL && frame_worker_data->pbi != NULL) { + AV1Decoder *const pbi = frame_worker_data->pbi; + aom_free(pbi->common.tpl_mvs); + pbi->common.tpl_mvs = NULL; + av1_remove_common(&pbi->common); + av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync); + av1_free_cdef_sync(&pbi->cdef_sync); + av1_free_restoration_buffers(&pbi->common); + av1_decoder_remove(pbi); + } + aom_free(frame_worker_data); + } + + if (ctx->buffer_pool) { + for (size_t i = 0; i < ctx->num_grain_image_frame_buffers; i++) { + ctx->buffer_pool->release_fb_cb(ctx->buffer_pool->cb_priv, + &ctx->grain_image_frame_buffers[i]); + } + av1_free_ref_frame_buffers(ctx->buffer_pool); + av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); +#endif + } + + aom_free(ctx->frame_worker); + aom_free(ctx->buffer_pool); + assert(!ctx->img.self_allocd); + aom_img_free(&ctx->img); + aom_free(ctx); + return AOM_CODEC_OK; +} + +static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) { + const uint32_t num_units_in_display_tick = + aom_rb_read_unsigned_literal(rb, 32); + const uint32_t time_scale = aom_rb_read_unsigned_literal(rb, 32); + if (num_units_in_display_tick == 0 || time_scale == 0) + return AOM_CODEC_UNSUP_BITSTREAM; + const uint8_t equal_picture_interval = aom_rb_read_bit(rb); + if (equal_picture_interval) { + const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb); + if (num_ticks_per_picture_minus_1 == UINT32_MAX) { + // num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1. + return AOM_CODEC_UNSUP_BITSTREAM; + } + } + return AOM_CODEC_OK; +} + +static aom_codec_err_t parse_decoder_model_info( + struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) { + *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5); + const uint32_t num_units_in_decoding_tick = + aom_rb_read_unsigned_literal(rb, 32); + const uint8_t buffer_removal_time_length_minus_1 = aom_rb_read_literal(rb, 5); + const uint8_t frame_presentation_time_length_minus_1 = + aom_rb_read_literal(rb, 5); + (void)num_units_in_decoding_tick; + (void)buffer_removal_time_length_minus_1; + (void)frame_presentation_time_length_minus_1; + return AOM_CODEC_OK; +} + +static aom_codec_err_t parse_op_parameters_info( + struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) { + const int n = buffer_delay_length_minus_1 + 1; + const uint32_t decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n); + const uint32_t encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n); + const uint8_t low_delay_mode_flag = aom_rb_read_bit(rb); + (void)decoder_buffer_delay; + (void)encoder_buffer_delay; + (void)low_delay_mode_flag; + return AOM_CODEC_OK; +} + +// Parses the operating points (including operating_point_idc, seq_level_idx, +// and seq_tier) and then sets si->number_spatial_layers and +// si->number_temporal_layers based on operating_point_idc[0]. 
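+// As a rough illustration of that derivation: the AV1 spec lays out
+// operating_point_idc with the low 8 bits flagging which temporal layers are
+// present and the next 4 bits flagging the spatial layers, so an idc of 0x103
+// (temporal bits 0 and 1, spatial bit 0) makes
+// aom_get_num_layers_from_operating_point_idc() report 2 temporal layers and
+// 1 spatial layer, while an idc of 0 is treated as a single-layer stream.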
+static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb, + int is_reduced_header, + aom_codec_stream_info_t *si) { + int operating_point_idc0 = 0; + if (is_reduced_header) { + aom_rb_read_literal(rb, LEVEL_BITS); // level + } else { + uint8_t decoder_model_info_present_flag = 0; + int buffer_delay_length_minus_1 = 0; + aom_codec_err_t status; + const uint8_t timing_info_present_flag = aom_rb_read_bit(rb); + if (timing_info_present_flag) { + if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status; + decoder_model_info_present_flag = aom_rb_read_bit(rb); + if (decoder_model_info_present_flag) { + if ((status = parse_decoder_model_info( + rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK) + return status; + } + } + const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb); + const uint8_t operating_points_cnt_minus_1 = + aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); + for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) { + int operating_point_idc; + operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); + if (i == 0) operating_point_idc0 = operating_point_idc; + int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); // level + if (seq_level_idx > 7) aom_rb_read_bit(rb); // tier + if (decoder_model_info_present_flag) { + const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb); + if (decoder_model_present_for_this_op) { + if ((status = parse_op_parameters_info( + rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK) + return status; + } + } + if (initial_display_delay_present_flag) { + const uint8_t initial_display_delay_present_for_this_op = + aom_rb_read_bit(rb); + if (initial_display_delay_present_for_this_op) + aom_rb_read_literal(rb, 4); // initial_display_delay_minus_1 + } + } + } + + if (aom_get_num_layers_from_operating_point_idc( + operating_point_idc0, &si->number_spatial_layers, + &si->number_temporal_layers) != AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data, + size_t data_sz, + aom_codec_stream_info_t *si, + int *is_intra_only) { + int intra_only_flag = 0; + int got_sequence_header = 0; + int found_keyframe = 0; + + if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM; + + si->w = 0; + si->h = 0; + si->is_kf = 0; // is_kf indicates whether the current packet contains a RAP + + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + size_t payload_size = 0; + size_t bytes_read = 0; + uint8_t reduced_still_picture_hdr = 0; + aom_codec_err_t status = aom_read_obu_header_and_size( + data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); + if (status != AOM_CODEC_OK) return status; + + // If the first OBU is a temporal delimiter, skip over it and look at the next + // OBU in the bitstream + if (obu_header.type == OBU_TEMPORAL_DELIMITER) { + // Skip any associated payload (there shouldn't be one, but just in case) + if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME; + data += bytes_read + payload_size; + data_sz -= bytes_read + payload_size; + + status = aom_read_obu_header_and_size( + data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); + if (status != AOM_CODEC_OK) return status; + } + while (1) { + data += bytes_read; + data_sz -= bytes_read; + if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME; + // Check that the selected OBU is a sequence header + if (obu_header.type == OBU_SEQUENCE_HEADER) { + // 
Sanity check on sequence header size + if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME; + // Read a few values from the sequence header payload + struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; + + av1_read_profile(&rb); // profile + const uint8_t still_picture = aom_rb_read_bit(&rb); + reduced_still_picture_hdr = aom_rb_read_bit(&rb); + + if (!still_picture && reduced_still_picture_hdr) { + return AOM_CODEC_UNSUP_BITSTREAM; + } + + if (parse_operating_points(&rb, reduced_still_picture_hdr, si) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + int num_bits_width = aom_rb_read_literal(&rb, 4) + 1; + int num_bits_height = aom_rb_read_literal(&rb, 4) + 1; + int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1; + int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1; + si->w = max_frame_width; + si->h = max_frame_height; + got_sequence_header = 1; + } else if (obu_header.type == OBU_FRAME_HEADER || + obu_header.type == OBU_FRAME) { + if (got_sequence_header && reduced_still_picture_hdr) { + found_keyframe = 1; + break; + } else { + // make sure we have enough bits to get the frame type out + if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME; + struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; + const int show_existing_frame = aom_rb_read_bit(&rb); + if (!show_existing_frame) { + const FRAME_TYPE frame_type = (FRAME_TYPE)aom_rb_read_literal(&rb, 2); + if (frame_type == KEY_FRAME) { + found_keyframe = 1; + break; // Stop here as no further OBUs will change the outcome. + } else if (frame_type == INTRA_ONLY_FRAME) { + intra_only_flag = 1; + } + } + } + } + // skip past any unread OBU header data + data += payload_size; + data_sz -= payload_size; + if (data_sz == 0) break; // exit if we're out of OBUs + status = aom_read_obu_header_and_size( + data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); + if (status != AOM_CODEC_OK) return status; + } + if (got_sequence_header && found_keyframe) si->is_kf = 1; + if (is_intra_only != NULL) *is_intra_only = intra_only_flag; + return AOM_CODEC_OK; +} + +static aom_codec_err_t decoder_peek_si(const uint8_t *data, size_t data_sz, + aom_codec_stream_info_t *si) { + return decoder_peek_si_internal(data, data_sz, si, NULL); +} + +static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx, + aom_codec_stream_info_t *si) { + memcpy(si, &ctx->si, sizeof(*si)); + + return AOM_CODEC_OK; +} + +static void set_error_detail(aom_codec_alg_priv_t *ctx, + const char *const error) { + ctx->base.err_detail = error; +} + +static aom_codec_err_t update_error_state( + aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) { + if (error->error_code) + set_error_detail(ctx, error->has_detail ? 
error->detail : NULL); + + return error->error_code; +} + +static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + cm->cur_frame = NULL; + cm->features.byte_alignment = ctx->byte_alignment; + pbi->skip_loop_filter = ctx->skip_loop_filter; + pbi->skip_film_grain = ctx->skip_film_grain; + + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + pool->get_fb_cb = ctx->get_ext_fb_cb; + pool->release_fb_cb = ctx->release_ext_fb_cb; + pool->cb_priv = ctx->ext_priv; + } else { + pool->get_fb_cb = av1_get_frame_buffer; + pool->release_fb_cb = av1_release_frame_buffer; + + if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers)) + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); + + pool->cb_priv = &pool->int_frame_buffers; + } +} + +static int frame_worker_hook(void *arg1, void *arg2) { + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1; + const uint8_t *data = frame_worker_data->data; + (void)arg2; + + int result = av1_receive_compressed_data(frame_worker_data->pbi, + frame_worker_data->data_size, &data); + frame_worker_data->data_end = data; + + if (result != 0) { + // Check decode result in serial decode. + frame_worker_data->pbi->need_resync = 1; + } + return !result; +} + +static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + ctx->last_show_frame = NULL; + ctx->need_resync = 1; + ctx->flushed = 0; + + ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); + if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; + ctx->buffer_pool->num_frame_bufs = FRAME_BUFFERS; + ctx->buffer_pool->frame_bufs = (RefCntBuffer *)aom_calloc( + ctx->buffer_pool->num_frame_bufs, sizeof(*ctx->buffer_pool->frame_bufs)); + if (ctx->buffer_pool->frame_bufs == NULL) { + ctx->buffer_pool->num_frame_bufs = 0; + aom_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + return AOM_CODEC_MEM_ERROR; + } + +#if CONFIG_MULTITHREAD + if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { + aom_free(ctx->buffer_pool->frame_bufs); + ctx->buffer_pool->frame_bufs = NULL; + ctx->buffer_pool->num_frame_bufs = 0; + aom_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + return AOM_CODEC_MEM_ERROR; + } +#endif + + ctx->frame_worker = (AVxWorker *)aom_malloc(sizeof(*ctx->frame_worker)); + if (ctx->frame_worker == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker"); + return AOM_CODEC_MEM_ERROR; + } + + AVxWorker *const worker = ctx->frame_worker; + winterface->init(worker); + worker->thread_name = "aom frameworker"; + worker->data1 = aom_memalign(32, sizeof(FrameWorkerData)); + if (worker->data1 == NULL) { + winterface->end(worker); + aom_free(worker); + ctx->frame_worker = NULL; + set_error_detail(ctx, "Failed to allocate frame_worker_data"); + return AOM_CODEC_MEM_ERROR; + } + FrameWorkerData *frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool); + if (frame_worker_data->pbi == NULL) { + winterface->end(worker); + aom_free(frame_worker_data); + aom_free(worker); + ctx->frame_worker = NULL; + set_error_detail(ctx, "Failed to 
allocate frame_worker_data->pbi"); + return AOM_CODEC_MEM_ERROR; + } + frame_worker_data->frame_context_ready = 0; + frame_worker_data->received_frame = 0; + frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth; + + // If decoding in serial mode, FrameWorker thread could create tile worker + // thread or loopfilter thread. + frame_worker_data->pbi->max_threads = ctx->cfg.threads; + frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; + frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode; + frame_worker_data->pbi->is_annexb = ctx->is_annexb; + frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; + frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; + frame_worker_data->pbi->operating_point = ctx->operating_point; + frame_worker_data->pbi->output_all_layers = ctx->output_all_layers; + frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + frame_worker_data->pbi->row_mt = ctx->row_mt; + frame_worker_data->pbi->is_fwd_kf_present = 0; + frame_worker_data->pbi->is_arf_frame_present = 0; + worker->hook = frame_worker_hook; + + init_buffer_callbacks(ctx); + + return AOM_CODEC_OK; +} + +static INLINE void check_resync(aom_codec_alg_priv_t *const ctx, + const AV1Decoder *const pbi) { + // Clear resync flag if worker got a key frame or intra only frame. + if (ctx->need_resync == 1 && pbi->need_resync == 0 && + frame_is_intra_only(&pbi->common)) + ctx->need_resync = 0; +} + +static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx, + const uint8_t **data, size_t data_sz, + void *user_priv) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + // Determine the stream parameters. Note that we rely on peek_si to + // validate that we have a buffer that does not wrap around the top + // of the heap. + if (!ctx->si.h) { + int is_intra_only = 0; + ctx->si.is_annexb = ctx->is_annexb; + const aom_codec_err_t res = + decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only); + if (res != AOM_CODEC_OK) return res; + + if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR; + } + + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->data = *data; + frame_worker_data->data_size = data_sz; + frame_worker_data->user_priv = user_priv; + frame_worker_data->received_frame = 1; + + frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode; + frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; + frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; + frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + frame_worker_data->pbi->row_mt = ctx->row_mt; + frame_worker_data->pbi->ext_refs = ctx->ext_refs; + + frame_worker_data->pbi->is_annexb = ctx->is_annexb; + + worker->had_error = 0; + winterface->execute(worker); + + // Update data pointer after decode. + *data = frame_worker_data->data_end; + + if (worker->had_error) + return update_error_state(ctx, &frame_worker_data->pbi->error); + + check_resync(ctx, frame_worker_data->pbi); + + return AOM_CODEC_OK; +} + +static void release_pending_output_frames(aom_codec_alg_priv_t *ctx) { + // Release any pending output frames from the previous decoder_decode or + // decoder_inspect call. We need to do this even if the decoder is being + // flushed or the input arguments are invalid. 
+ if (ctx->frame_worker) { + BufferPool *const pool = ctx->buffer_pool; + lock_buffer_pool(pool); + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + struct AV1Decoder *pbi = frame_worker_data->pbi; + for (size_t j = 0; j < pbi->num_output_frames; j++) { + decrease_ref_count(pbi->output_frames[j], pool); + } + pbi->num_output_frames = 0; + unlock_buffer_pool(pool); + for (size_t j = 0; j < ctx->num_grain_image_frame_buffers; j++) { + pool->release_fb_cb(pool->cb_priv, &ctx->grain_image_frame_buffers[j]); + ctx->grain_image_frame_buffers[j].data = NULL; + ctx->grain_image_frame_buffers[j].size = 0; + ctx->grain_image_frame_buffers[j].priv = NULL; + } + ctx->num_grain_image_frame_buffers = 0; + } +} + +// This function enables the inspector to inspect non visible frames. +static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx, + const uint8_t *data, size_t data_sz, + void *user_priv) { + aom_codec_err_t res = AOM_CODEC_OK; + + release_pending_output_frames(ctx); + + /* Sanity checks */ + /* NULL data ptr allowed if data_sz is 0 too */ + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return AOM_CODEC_OK; + } + if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM; + + // Reset flushed when receiving a valid frame. + ctx->flushed = 0; + + const uint8_t *data_start = data; + const uint8_t *data_end = data + data_sz; + + uint64_t frame_size; + if (ctx->is_annexb) { + // read the size of this temporal unit + size_t length_of_size; + uint64_t temporal_unit_size; + if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size, + &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + data_start += length_of_size; + if (temporal_unit_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; + data_end = data_start + temporal_unit_size; + + // read the size of this frame unit + if (aom_uleb_decode(data_start, (size_t)(data_end - data_start), + &frame_size, &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + data_start += length_of_size; + if (frame_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; + } else { + frame_size = (uint64_t)(data_end - data_start); + } + + if (ctx->frame_worker == NULL) { + res = init_decoder(ctx); + if (res != AOM_CODEC_OK) return res; + } + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)ctx->frame_worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + AV1_COMMON *const cm = &pbi->common; +#if CONFIG_INSPECTION + frame_worker_data->pbi->inspect_cb = ctx->inspect_cb; + frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx; +#endif + res = av1_receive_compressed_data(frame_worker_data->pbi, (size_t)frame_size, + &data_start); + check_resync(ctx, frame_worker_data->pbi); + + if (ctx->frame_worker->had_error) + return update_error_state(ctx, &frame_worker_data->pbi->error); + + // Allow extra zero bytes after the frame end + while (data_start < data_end) { + const uint8_t marker = data_start[0]; + if (marker) break; + ++data_start; + } + + Av1DecodeReturn *data2 = (Av1DecodeReturn *)user_priv; + data2->idx = -1; + if (cm->cur_frame) { + for (int i = 0; i < REF_FRAMES; ++i) + if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i; + } + data2->buf = data_start; + data2->show_existing = cm->show_existing_frame; + return res; +} + +static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, + const uint8_t *data, size_t data_sz, + void *user_priv) { + aom_codec_err_t res = 
AOM_CODEC_OK; + +#if CONFIG_INSPECTION + if (user_priv != 0) { + return decoder_inspect(ctx, data, data_sz, user_priv); + } +#endif + + release_pending_output_frames(ctx); + + /* Sanity checks */ + /* NULL data ptr allowed if data_sz is 0 too */ + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return AOM_CODEC_OK; + } + if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM; + + // Reset flushed when receiving a valid frame. + ctx->flushed = 0; + + // Initialize the decoder worker on the first frame. + if (ctx->frame_worker == NULL) { + res = init_decoder(ctx); + if (res != AOM_CODEC_OK) return res; + } + + const uint8_t *data_start = data; + const uint8_t *data_end = data + data_sz; + + if (ctx->is_annexb) { + // read the size of this temporal unit + size_t length_of_size; + uint64_t temporal_unit_size; + if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size, + &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + data_start += length_of_size; + if (temporal_unit_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; + data_end = data_start + temporal_unit_size; + } + + // Decode in serial mode. + while (data_start < data_end) { + uint64_t frame_size; + if (ctx->is_annexb) { + // read the size of this frame unit + size_t length_of_size; + if (aom_uleb_decode(data_start, (size_t)(data_end - data_start), + &frame_size, &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + data_start += length_of_size; + if (frame_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; + } else { + frame_size = (uint64_t)(data_end - data_start); + } + + res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv); + if (res != AOM_CODEC_OK) return res; + + // Allow extra zero bytes after the frame end + while (data_start < data_end) { + const uint8_t marker = data_start[0]; + if (marker) break; + ++data_start; + } + } + + return res; +} + +typedef struct { + BufferPool *pool; + aom_codec_frame_buffer_t *fb; +} AllocCbParam; + +static void *AllocWithGetFrameBufferCb(void *priv, size_t size) { + AllocCbParam *param = (AllocCbParam *)priv; + if (param->pool->get_fb_cb(param->pool->cb_priv, size, param->fb) < 0) + return NULL; + if (param->fb->data == NULL || param->fb->size < size) return NULL; + return param->fb->data; +} + +// If grain_params->apply_grain is false, returns img. Otherwise, adds film +// grain to img, saves the result in grain_img, and returns grain_img. +static aom_image_t *add_grain_if_needed(aom_codec_alg_priv_t *ctx, + aom_image_t *img, + aom_image_t *grain_img, + aom_film_grain_t *grain_params) { + if (!grain_params->apply_grain) return img; + + const int w_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_w, 1); + const int h_even = ALIGN_POWER_OF_TWO_UNSIGNED(img->d_h, 1); + + BufferPool *const pool = ctx->buffer_pool; + aom_codec_frame_buffer_t *fb = + &ctx->grain_image_frame_buffers[ctx->num_grain_image_frame_buffers]; + AllocCbParam param; + param.pool = pool; + param.fb = fb; + if (!aom_img_alloc_with_cb(grain_img, img->fmt, w_even, h_even, 16, + AllocWithGetFrameBufferCb, &param)) { + return NULL; + } + + grain_img->user_priv = img->user_priv; + grain_img->fb_priv = fb->priv; + if (av1_add_film_grain(grain_params, img, grain_img)) { + pool->release_fb_cb(pool->cb_priv, fb); + return NULL; + } + + ctx->num_grain_image_frame_buffers++; + return grain_img; +} + +// Copies and clears the metadata from AV1Decoder.
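For orientation, a minimal sketch (not part of the upstream patch; error handling is trimmed and tu_data/tu_size are placeholder names) of how the entry points above are reached through the public libaom API: aom_codec_decode() lands in decoder_decode() above, and aom_codec_get_frame() lands in decoder_get_frame(), defined just below.

#include "aom/aom_decoder.h"
#include "aom/aomdx.h"

static int decode_temporal_unit_sketch(const uint8_t *tu_data, size_t tu_size) {
  aom_codec_ctx_t codec;
  if (aom_codec_dec_init(&codec, aom_codec_av1_dx(), NULL, 0)) return -1;  // -> decoder_init
  if (aom_codec_decode(&codec, tu_data, tu_size, NULL)) {                  // -> decoder_decode
    aom_codec_destroy(&codec);
    return -1;
  }
  aom_codec_iter_t iter = NULL;
  const aom_image_t *img;
  while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {  // -> decoder_get_frame
    // Consume img->planes[] / img->stride[] here; the iterator yields every
    // output frame produced by this temporal unit.
    (void)img;
  }
  aom_codec_destroy(&codec);  // -> decoder_destroy
  return 0;
}

The helper below then carries the decoder-side metadata over to the returned image, as the preceding comment notes.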
+static void move_decoder_metadata_to_img(AV1Decoder *pbi, aom_image_t *img) { + if (pbi->metadata && img) { + assert(!img->metadata); + img->metadata = pbi->metadata; + pbi->metadata = NULL; + } +} + +static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, + aom_codec_iter_t *iter) { + aom_image_t *img = NULL; + + if (!iter) { + return NULL; + } + + // To avoid having to allocate any extra storage, treat 'iter' as + // simply a pointer to an integer index + uintptr_t *index = (uintptr_t *)iter; + + if (ctx->frame_worker == NULL) { + return NULL; + } + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + pbi->error.error_code = AOM_CODEC_OK; + pbi->error.has_detail = 0; + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + // Wait for the frame from worker thread. + if (!winterface->sync(worker)) { + // Decoding failed. Release the worker thread. + frame_worker_data->received_frame = 0; + ctx->need_resync = 1; + // TODO(aomedia:3519): Set an error code. Check if a different error code + // should be used if ctx->flushed != 1. + return NULL; + } + // Check if worker has received any frames. + if (frame_worker_data->received_frame == 1) { + frame_worker_data->received_frame = 0; + check_resync(ctx, frame_worker_data->pbi); + } + YV12_BUFFER_CONFIG *sd; + aom_film_grain_t *grain_params; + if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, &grain_params) != + 0) { + return NULL; + } + RefCntBuffer *const output_frame_buf = pbi->output_frames[*index]; + ctx->last_show_frame = output_frame_buf; + if (ctx->need_resync) return NULL; + aom_img_remove_metadata(&ctx->img); + yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv); + move_decoder_metadata_to_img(pbi, &ctx->img); + + if (!pbi->ext_tile_debug && tiles->large_scale) { + *index += 1; // Advance the iterator to point to the next image + aom_img_remove_metadata(&ctx->img); + yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL); + move_decoder_metadata_to_img(pbi, &ctx->img); + img = &ctx->img; + return img; + } + + const int num_planes = av1_num_planes(cm); + if (pbi->ext_tile_debug && tiles->single_tile_decoding && + pbi->dec_tile_row >= 0) { + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1); + const int mi_row = tile_row * tile_height; + const int ssy = ctx->img.y_chroma_shift; + int plane; + ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0]; + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + ctx->img.planes[plane] += + mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane]; + } + } + ctx->img.d_h = + AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE; + } + + if (pbi->ext_tile_debug && tiles->single_tile_decoding && + pbi->dec_tile_col >= 0) { + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1); + const int mi_col = tile_col * tile_width; + const int ssx = ctx->img.x_chroma_shift; + const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
1 : 0; + int plane; + ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd); + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx) * (1 + is_hbd); + } + } + ctx->img.d_w = AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE; + } + + ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv; + img = &ctx->img; + img->temporal_id = output_frame_buf->temporal_id; + img->spatial_id = output_frame_buf->spatial_id; + if (pbi->skip_film_grain) grain_params->apply_grain = 0; + aom_image_t *res = + add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params); + if (!res) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.has_detail = 1; + snprintf(pbi->error.detail, sizeof(pbi->error.detail), + "Grain synthesis failed\n"); + return res; + } + *index += 1; // Advance the iterator to point to the next image + return res; +} + +static aom_codec_err_t decoder_set_fb_fn( + aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, + aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + if (cb_get == NULL || cb_release == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + if (ctx->frame_worker != NULL) { + // If the decoder has already been initialized, do not accept changes to + // the frame buffer functions. + return AOM_CODEC_ERROR; + } + + ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ref_frame_t *const data = va_arg(args, av1_ref_frame_t *); + + if (data) { + av1_ref_frame_t *const frame = data; + YV12_BUFFER_CONFIG sd; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + image2yuvconfig(&frame->img, &sd); + return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx, + frame->use_external_ref, &sd); + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); + if (frame) { + YV12_BUFFER_CONFIG sd; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + image2yuvconfig(&frame->img, &sd); + return av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd); + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *); + if (data) { + YV12_BUFFER_CONFIG *fb; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); + if (fb == NULL) return AOM_CODEC_ERROR; + yuvconfig2image(&data->img, fb, NULL); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *new_img = va_arg(args, aom_image_t *); + if (new_img) { + YV12_BUFFER_CONFIG new_frame; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + + if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) { + yuvconfig2image(new_img, &new_frame, NULL); + return AOM_CODEC_OK; + } 
else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *img = va_arg(args, aom_image_t *); + if (img) { + YV12_BUFFER_CONFIG new_frame; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + + if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) { + YV12_BUFFER_CONFIG sd; + image2yuvconfig(img, &sd); + return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame, + &sd); + } else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const update_info = va_arg(args, int *); + + if (update_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + *update_info = + frame_worker_data->pbi->common.current_frame.refresh_frame_flags; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_last_quantizer(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; + *arg = ((FrameWorkerData *)ctx->frame_worker->data1) + ->pbi->common.quant_params.base_qindex; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_fwd_kf_value(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; + *arg = ((FrameWorkerData *)ctx->frame_worker->data1)->pbi->is_fwd_kf_present; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_altref_present(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; + *arg = + ((FrameWorkerData *)ctx->frame_worker->data1)->pbi->is_arf_frame_present; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_frame_flags(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; + AV1Decoder *pbi = ((FrameWorkerData *)ctx->frame_worker->data1)->pbi; + *arg = 0; + switch (pbi->common.current_frame.frame_type) { + case KEY_FRAME: + *arg |= AOM_FRAME_IS_KEY; + *arg |= AOM_FRAME_IS_INTRAONLY; + if (!pbi->common.show_frame) { + *arg |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT; + } + break; + case INTRA_ONLY_FRAME: *arg |= AOM_FRAME_IS_INTRAONLY; break; + case S_FRAME: *arg |= AOM_FRAME_IS_SWITCH; break; + } + if (pbi->common.features.error_resilient_mode) { + *arg |= AOM_FRAME_IS_ERROR_RESILIENT; + } + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_tile_info(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_tile_info *const tile_info = va_arg(args, aom_tile_info *); + + if (tile_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + const CommonTileParams *tiles = &pbi->common.tiles; + + int tile_rows = 
tiles->rows; + int tile_cols = tiles->cols; + + if (tiles->uniform_spacing) { + tile_info->tile_rows = 1 << tiles->log2_rows; + tile_info->tile_columns = 1 << tiles->log2_cols; + } else { + tile_info->tile_rows = tile_rows; + tile_info->tile_columns = tile_cols; + } + + for (int tile_col = 1; tile_col <= tile_cols; tile_col++) { + tile_info->tile_widths[tile_col - 1] = + tiles->col_start_sb[tile_col] - tiles->col_start_sb[tile_col - 1]; + } + + for (int tile_row = 1; tile_row <= tile_rows; tile_row++) { + tile_info->tile_heights[tile_row - 1] = + tiles->row_start_sb[tile_row] - tiles->row_start_sb[tile_row - 1]; + } + tile_info->num_tile_groups = pbi->num_tile_groups; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_screen_content_tools_info( + aom_codec_alg_priv_t *ctx, va_list args) { + aom_screen_content_tools_info *const sc_info = + va_arg(args, aom_screen_content_tools_info *); + if (sc_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + sc_info->allow_screen_content_tools = + pbi->common.features.allow_screen_content_tools; + sc_info->allow_intrabc = pbi->common.features.allow_intrabc; + sc_info->force_integer_mv = + (int)pbi->common.features.cur_frame_force_integer_mv; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_still_picture(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_still_picture_info *const still_picture_info = + va_arg(args, aom_still_picture_info *); + if (still_picture_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + still_picture_info->is_still_picture = (int)pbi->seq_params.still_picture; + still_picture_info->is_reduced_still_picture_hdr = + (int)(pbi->seq_params.reduced_still_picture_hdr); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_sb_size(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_superblock_size_t *const sb_size = va_arg(args, aom_superblock_size_t *); + if (sb_size) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + if (pbi->seq_params.sb_size == BLOCK_128X128) { + *sb_size = AOM_SUPERBLOCK_SIZE_128X128; + } else { + *sb_size = AOM_SUPERBLOCK_SIZE_64X64; + } + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_show_existing_frame_flag( + aom_codec_alg_priv_t *ctx, va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; + *arg = ((FrameWorkerData *)ctx->frame_worker->data1) + ->pbi->common.show_existing_frame; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_s_frame_info(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_s_frame_info *const s_frame_info = va_arg(args, aom_s_frame_info *); + if (s_frame_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + 
FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + s_frame_info->is_s_frame = pbi->sframe_info.is_s_frame; + s_frame_info->is_s_frame_at_altref = + pbi->sframe_info.is_s_frame_at_altref; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx, + va_list args) { + int *corrupted = va_arg(args, int *); + + if (corrupted) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + if (pbi->seen_frame_header && pbi->num_output_frames == 0) + return AOM_CODEC_ERROR; + if (ctx->last_show_frame != NULL) + *corrupted = ctx->last_show_frame->buf.corrupted; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const frame_size = va_arg(args, int *); + + if (frame_size) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + frame_size[0] = cm->width; + frame_size[1] = cm->height; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *); + + if (frame_header_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size; + frame_header_info->coded_tile_data = pbi->obu_size_hdr.data; + frame_header_info->extra_size = pbi->frame_header_size; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_data(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_tile_data *const tile_data = va_arg(args, aom_tile_data *); + + if (tile_data) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + tile_data->coded_tile_data_size = + pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size; + tile_data->coded_tile_data = + pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_set_ext_ref_ptr(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ext_ref_frame_t *const data = va_arg(args, av1_ext_ref_frame_t *); + + if (data) { + av1_ext_ref_frame_t *const ext_frames = data; + ctx->ext_refs.num = ext_frames->num; + for (int i = 0; i < ctx->ext_refs.num; i++) { + image2yuvconfig(ext_frames->img++, &ctx->ext_refs.refs[i]); + } + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const render_size = va_arg(args, int *); + + if 
(render_size) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + render_size[0] = cm->render_width; + render_size[1] = cm->render_height; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const bit_depth = va_arg(args, unsigned int *); + AVxWorker *const worker = ctx->frame_worker; + + if (bit_depth) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + *bit_depth = cm->seq_params->bit_depth; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y, + int use_highbitdepth) { + aom_img_fmt_t fmt = 0; + + if (subsampling_x == 0 && subsampling_y == 0) + fmt = AOM_IMG_FMT_I444; + else if (subsampling_x == 1 && subsampling_y == 0) + fmt = AOM_IMG_FMT_I422; + else if (subsampling_x == 1 && subsampling_y == 1) + fmt = AOM_IMG_FMT_I420; + + if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + return fmt; +} + +static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *); + AVxWorker *const worker = ctx->frame_worker; + + if (img_fmt) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + + *img_fmt = get_img_format(cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const tile_size = va_arg(args, unsigned int *); + AVxWorker *const worker = ctx->frame_worker; + + if (tile_size) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + *tile_size = ((tile_width * MI_SIZE) << 16) + tile_height * MI_SIZE; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_count(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const tile_count = va_arg(args, unsigned int *); + + if (tile_count) { + AVxWorker *const worker = ctx->frame_worker; + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + *tile_count = frame_worker_data->pbi->tile_count_minus_1 + 1; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_base_q_idx(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)ctx->frame_worker->data1; + *arg = frame_worker_data->pbi->common.quant_params.base_qindex; + return 
AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_show_frame_flag(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)ctx->frame_worker->data1; + *arg = frame_worker_data->pbi->common.show_frame; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_order_hint(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const arg = va_arg(args, unsigned int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)ctx->frame_worker->data1; + *arg = frame_worker_data->pbi->common.current_frame.order_hint; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_mi_info(aom_codec_alg_priv_t *ctx, + va_list args) { + int mi_row = va_arg(args, int); + int mi_col = va_arg(args, int); + MB_MODE_INFO *mi = va_arg(args, MB_MODE_INFO *); + if (mi == NULL) return AOM_CODEC_INVALID_PARAM; + if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)ctx->frame_worker->data1; + if (frame_worker_data == NULL) return AOM_CODEC_ERROR; + + AV1_COMMON *cm = &frame_worker_data->pbi->common; + const int mi_rows = cm->mi_params.mi_rows; + const int mi_cols = cm->mi_params.mi_cols; + const int mi_stride = cm->mi_params.mi_stride; + const int offset = mi_row * mi_stride + mi_col; + + if (mi_row < 0 || mi_row >= mi_rows || mi_col < 0 || mi_col >= mi_cols) { + return AOM_CODEC_INVALID_PARAM; + } + + memcpy(mi, cm->mi_params.mi_grid_base[offset], sizeof(*mi)); + + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->invert_tile_order = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx, + va_list args) { + const int legacy_byte_alignment = 0; + const int min_byte_alignment = 32; + const int max_byte_alignment = 1024; + const int byte_alignment = va_arg(args, int); + + if (byte_alignment != legacy_byte_alignment && + (byte_alignment < min_byte_alignment || + byte_alignment > max_byte_alignment || + (byte_alignment & (byte_alignment - 1)) != 0)) + return AOM_CODEC_INVALID_PARAM; + + ctx->byte_alignment = byte_alignment; + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->common.features.byte_alignment = byte_alignment; + } + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_loop_filter = va_arg(args, int); + + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->skip_loop_filter = ctx->skip_loop_filter; + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_film_grain = va_arg(args, int); + + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->skip_film_grain = ctx->skip_film_grain; + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t 
ctrl_get_accounting(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_ACCOUNTING + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + Accounting **acct = va_arg(args, Accounting **); + + if (acct) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + AV1Decoder *pbi = frame_worker_data->pbi; + *acct = &pbi->accounting; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +#endif +} + +static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->decode_tile_row = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->decode_tile_col = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_tile_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->tile_mode = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_is_annexb(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->is_annexb = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_operating_point(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->operating_point = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_output_all_layers(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->output_all_layers = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_INSPECTION + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + aom_inspect_init *init = va_arg(args, aom_inspect_init *); + ctx->inspect_cb = init->inspect_cb; + ctx->inspect_ctx = init->inspect_ctx; + return AOM_CODEC_OK; +#endif +} + +static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->ext_tile_debug = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->row_mt = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { + { AV1_COPY_REFERENCE, ctrl_copy_reference }, + + // Setters + { AV1_SET_REFERENCE, ctrl_set_reference }, + { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order }, + { AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, + { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, + { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row }, + { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col }, + { AV1_SET_TILE_MODE, ctrl_set_tile_mode }, + { AV1D_SET_IS_ANNEXB, ctrl_set_is_annexb }, + { AV1D_SET_OPERATING_POINT, ctrl_set_operating_point }, + { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers }, + { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback }, + { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug }, + { AV1D_SET_ROW_MT, ctrl_set_row_mt }, + { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr }, + { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain }, + + // Getters + { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, + { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer }, + { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, + { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth }, + { AV1D_GET_IMG_FORMAT, ctrl_get_img_format }, + { AV1D_GET_TILE_SIZE, ctrl_get_tile_size }, + { AV1D_GET_TILE_COUNT, ctrl_get_tile_count }, + { 
AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size }, + { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size }, + { AV1_GET_ACCOUNTING, ctrl_get_accounting }, + { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image }, + { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, + { AV1_GET_REFERENCE, ctrl_get_reference }, + { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info }, + { AV1D_GET_TILE_DATA, ctrl_get_tile_data }, + { AOMD_GET_FWD_KF_PRESENT, ctrl_get_fwd_kf_value }, + { AOMD_GET_ALTREF_PRESENT, ctrl_get_altref_present }, + { AOMD_GET_FRAME_FLAGS, ctrl_get_frame_flags }, + { AOMD_GET_TILE_INFO, ctrl_get_tile_info }, + { AOMD_GET_SCREEN_CONTENT_TOOLS_INFO, ctrl_get_screen_content_tools_info }, + { AOMD_GET_STILL_PICTURE, ctrl_get_still_picture }, + { AOMD_GET_SB_SIZE, ctrl_get_sb_size }, + { AOMD_GET_SHOW_EXISTING_FRAME_FLAG, ctrl_get_show_existing_frame_flag }, + { AOMD_GET_S_FRAME_INFO, ctrl_get_s_frame_info }, + { AOMD_GET_SHOW_FRAME_FLAG, ctrl_get_show_frame_flag }, + { AOMD_GET_BASE_Q_IDX, ctrl_get_base_q_idx }, + { AOMD_GET_ORDER_HINT, ctrl_get_order_hint }, + { AV1D_GET_MI_INFO, ctrl_get_mi_info }, + CTRL_MAP_END, +}; + +// This data structure and function are exported in aom/aomdx.h +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +aom_codec_iface_t aom_codec_av1_dx_algo = { + "AOMedia Project AV1 Decoder" VERSION_STRING, + AOM_CODEC_INTERNAL_ABI_VERSION, + AOM_CODEC_CAP_DECODER | + AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t + decoder_init, // aom_codec_init_fn_t + decoder_destroy, // aom_codec_destroy_fn_t + decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t + { + // NOLINT + decoder_peek_si, // aom_codec_peek_si_fn_t + decoder_get_si, // aom_codec_get_si_fn_t + decoder_decode, // aom_codec_decode_fn_t + decoder_get_frame, // aom_codec_get_frame_fn_t + decoder_set_fb_fn, // aom_codec_set_fb_fn_t + }, + { + // NOLINT + 0, + NULL, // aom_codec_enc_cfg_t + NULL, // aom_codec_encode_fn_t + NULL, // aom_codec_get_cx_data_fn_t + NULL, // aom_codec_enc_config_set_fn_t + NULL, // aom_codec_get_global_headers_fn_t + NULL // aom_codec_get_preview_frame_fn_t + }, + NULL // aom_codec_set_option_fn_t +}; + +// Decoder interface for inspecting frame data. It uses decoder_inspect instead +// of decoder_decode so it only decodes one frame at a time, whether the frame +// is shown or not. 
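A minimal sketch (not part of the upstream patch) of how a client is assumed to drive the inspection interface defined next: decoder_inspect() above treats the user_priv argument of aom_codec_decode() as an Av1DecodeReturn out-parameter (field names as used above; the type and the interface symbol are expected to come from aom/aomdx.h and av1_iface_common.h respectively), so each call consumes exactly one frame, shown or not.

#include "aom/aom_decoder.h"
#include "aom/aomdx.h"

extern aom_codec_iface_t aom_codec_av1_inspect_algo;

static void inspect_one_frame_sketch(const uint8_t *data, size_t data_sz) {
  aom_codec_ctx_t codec;
  if (aom_codec_dec_init(&codec, &aom_codec_av1_inspect_algo, NULL, 0)) return;
  Av1DecodeReturn ret;  // filled in by decoder_inspect() via user_priv
  if (aom_codec_decode(&codec, data, data_sz, &ret) == AOM_CODEC_OK) {
    // ret.buf points just past the decoded frame, ret.idx is its slot in the
    // reference frame map (or -1), and ret.show_existing mirrors
    // show_existing_frame.
    (void)ret;
  }
  aom_codec_destroy(&codec);
}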
+aom_codec_iface_t aom_codec_av1_inspect_algo = { + "AOMedia Project AV1 Decoder Inspector" VERSION_STRING, + AOM_CODEC_INTERNAL_ABI_VERSION, + AOM_CODEC_CAP_DECODER | + AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t + decoder_init, // aom_codec_init_fn_t + decoder_destroy, // aom_codec_destroy_fn_t + decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t + { + // NOLINT + decoder_peek_si, // aom_codec_peek_si_fn_t + decoder_get_si, // aom_codec_get_si_fn_t + decoder_inspect, // aom_codec_decode_fn_t + decoder_get_frame, // aom_codec_get_frame_fn_t + decoder_set_fb_fn, // aom_codec_set_fb_fn_t + }, + { + // NOLINT + 0, + NULL, // aom_codec_enc_cfg_t + NULL, // aom_codec_encode_fn_t + NULL, // aom_codec_get_cx_data_fn_t + NULL, // aom_codec_enc_config_set_fn_t + NULL, // aom_codec_get_global_headers_fn_t + NULL // aom_codec_get_preview_frame_fn_t + }, + NULL // aom_codec_set_option_fn_t +}; + +aom_codec_iface_t *aom_codec_av1_dx(void) { return &aom_codec_av1_dx_algo; } diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h new file mode 100644 index 0000000000..b923c3dcff --- /dev/null +++ b/third_party/aom/av1/av1_iface_common.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_AV1_IFACE_COMMON_H_ +#define AOM_AV1_AV1_IFACE_COMMON_H_ + +#include <assert.h> + +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" + +extern aom_codec_iface_t aom_codec_av1_inspect_algo; + +static AOM_INLINE void yuvconfig2image(aom_image_t *img, + const YV12_BUFFER_CONFIG *yv12, + void *user_priv) { + /* aom_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields. + */ + int bps; + if (!yv12->subsampling_y) { + if (!yv12->subsampling_x) { + img->fmt = AOM_IMG_FMT_I444; + bps = 24; + } else { + img->fmt = AOM_IMG_FMT_I422; + bps = 16; + } + } else { + img->fmt = AOM_IMG_FMT_I420; + bps = 12; + } + img->cp = yv12->color_primaries; + img->tc = yv12->transfer_characteristics; + img->mc = yv12->matrix_coefficients; + img->monochrome = yv12->monochrome; + img->csp = yv12->chroma_sample_position; + img->range = yv12->color_range; + img->bit_depth = 8; + img->w = yv12->y_width; + img->h = yv12->y_height; + img->d_w = yv12->y_crop_width; + img->d_h = yv12->y_crop_height; + img->r_w = yv12->render_width; + img->r_h = yv12->render_height; + img->x_chroma_shift = yv12->subsampling_x; + img->y_chroma_shift = yv12->subsampling_y; + img->planes[AOM_PLANE_Y] = yv12->y_buffer; + img->planes[AOM_PLANE_U] = yv12->u_buffer; + img->planes[AOM_PLANE_V] = yv12->v_buffer; + img->stride[AOM_PLANE_Y] = yv12->y_stride; + img->stride[AOM_PLANE_U] = yv12->uv_stride; + img->stride[AOM_PLANE_V] = yv12->uv_stride; + if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) { + bps *= 2; + // aom_image_t uses byte strides and a pointer to the first byte + // of the image.
+ img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH); + img->bit_depth = yv12->bit_depth; + img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer); + img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer); + img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer); + img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride; + img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride; + img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride; + } + img->bps = bps; + img->user_priv = user_priv; + img->img_data = yv12->buffer_alloc; + img->img_data_owner = 0; + img->self_allocd = 0; + img->sz = yv12->frame_size; + assert(!yv12->metadata); + img->metadata = NULL; +} + +static AOM_INLINE aom_codec_err_t image2yuvconfig(const aom_image_t *img, + YV12_BUFFER_CONFIG *yv12) { + yv12->y_buffer = img->planes[AOM_PLANE_Y]; + yv12->u_buffer = img->planes[AOM_PLANE_U]; + yv12->v_buffer = img->planes[AOM_PLANE_V]; + + yv12->y_crop_width = img->d_w; + yv12->y_crop_height = img->d_h; + yv12->render_width = img->r_w; + yv12->render_height = img->r_h; + yv12->y_width = img->w; + yv12->y_height = img->h; + + yv12->uv_width = (yv12->y_width + img->x_chroma_shift) >> img->x_chroma_shift; + yv12->uv_height = + (yv12->y_height + img->y_chroma_shift) >> img->y_chroma_shift; + yv12->uv_crop_width = + (yv12->y_crop_width + img->x_chroma_shift) >> img->x_chroma_shift; + yv12->uv_crop_height = + (yv12->y_crop_height + img->y_chroma_shift) >> img->y_chroma_shift; + + yv12->y_stride = img->stride[AOM_PLANE_Y]; + yv12->uv_stride = img->stride[AOM_PLANE_U]; + yv12->color_primaries = img->cp; + yv12->transfer_characteristics = img->tc; + yv12->matrix_coefficients = img->mc; + yv12->monochrome = img->monochrome; + yv12->chroma_sample_position = img->csp; + yv12->color_range = img->range; + + if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + // In aom_image_t + // planes point to uint8 address of start of data + // stride counts uint8s to reach next row + // In YV12_BUFFER_CONFIG + // y_buffer, u_buffer, v_buffer point to uint16 address of data + // stride and border counts in uint16s + // This means that all the address calculations in the main body of code + // should work correctly. + // However, before we do any pixel operations we need to cast the address + // to a uint16 ponter and double its value. + yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer); + yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer); + yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer); + yv12->y_stride >>= 1; + yv12->uv_stride >>= 1; + yv12->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + yv12->flags = 0; + } + + // Note(yunqing): if img is allocated the same as the frame buffer, y_stride + // is 32-byte aligned. Also, handle the cases while allocating img without a + // border or stride_align is less than 32. + int border = (yv12->y_stride - (int)((img->w + 31) & ~31u)) / 2; + yv12->border = (border < 0) ? 0 : border; + yv12->subsampling_x = img->x_chroma_shift; + yv12->subsampling_y = img->y_chroma_shift; + yv12->metadata = img->metadata; + return AOM_CODEC_OK; +} + +#endif // AOM_AV1_AV1_IFACE_COMMON_H_ diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c new file mode 100644 index 0000000000..2a9a8beb40 --- /dev/null +++ b/third_party/aom/av1/common/alloccommon.c @@ -0,0 +1,506 @@ +/* + * + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "aom_mem/aom_mem.h" + +#include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/cdef_block.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/thread_common.h" + +int av1_get_MBs(int width, int height) { + const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); + const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); + const int mi_cols = aligned_width >> MI_SIZE_LOG2; + const int mi_rows = aligned_height >> MI_SIZE_LOG2; + + const int mb_cols = ROUND_POWER_OF_TWO(mi_cols, 2); + const int mb_rows = ROUND_POWER_OF_TWO(mi_rows, 2); + return mb_rows * mb_cols; +} + +void av1_free_ref_frame_buffers(BufferPool *pool) { + int i; + + for (i = 0; i < pool->num_frame_bufs; ++i) { + if (pool->frame_bufs[i].ref_count > 0 && + pool->frame_bufs[i].raw_frame_buffer.data != NULL) { + pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); + pool->frame_bufs[i].raw_frame_buffer.data = NULL; + pool->frame_bufs[i].raw_frame_buffer.size = 0; + pool->frame_bufs[i].raw_frame_buffer.priv = NULL; + pool->frame_bufs[i].ref_count = 0; + } + aom_free(pool->frame_bufs[i].mvs); + pool->frame_bufs[i].mvs = NULL; + aom_free(pool->frame_bufs[i].seg_map); + pool->frame_bufs[i].seg_map = NULL; + aom_free_frame_buffer(&pool->frame_bufs[i].buf); + } + aom_free(pool->frame_bufs); + pool->frame_bufs = NULL; + pool->num_frame_bufs = 0; +} + +static INLINE void free_cdef_linebuf_conditional( + AV1_COMMON *const cm, const size_t *new_linebuf_size) { + CdefInfo *cdef_info = &cm->cdef_info; + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + if (new_linebuf_size[plane] != cdef_info->allocated_linebuf_size[plane]) { + aom_free(cdef_info->linebuf[plane]); + cdef_info->linebuf[plane] = NULL; + } + } +} + +static INLINE void free_cdef_bufs_conditional(AV1_COMMON *const cm, + uint16_t **colbuf, + uint16_t **srcbuf, + const size_t *new_colbuf_size, + const size_t new_srcbuf_size) { + CdefInfo *cdef_info = &cm->cdef_info; + if (new_srcbuf_size != cdef_info->allocated_srcbuf_size) { + aom_free(*srcbuf); + *srcbuf = NULL; + } + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + if (new_colbuf_size[plane] != cdef_info->allocated_colbuf_size[plane]) { + aom_free(colbuf[plane]); + colbuf[plane] = NULL; + } + } +} + +static INLINE void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) { + aom_free(*srcbuf); + *srcbuf = NULL; + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + aom_free(colbuf[plane]); + colbuf[plane] = NULL; + } +} + +static INLINE void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt, + const int num_mi_rows) { + if (*cdef_row_mt == NULL) return; +#if CONFIG_MULTITHREAD + for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { + if ((*cdef_row_mt)[row_idx].row_mutex_ != NULL) { + pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_); + aom_free((*cdef_row_mt)[row_idx].row_mutex_); + } + if ((*cdef_row_mt)[row_idx].row_cond_ != NULL) { + 
pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_); + aom_free((*cdef_row_mt)[row_idx].row_cond_); + } + } +#else + (void)num_mi_rows; +#endif // CONFIG_MULTITHREAD + aom_free(*cdef_row_mt); + *cdef_row_mt = NULL; +} + +void av1_free_cdef_buffers(AV1_COMMON *const cm, + AV1CdefWorkerData **cdef_worker, + AV1CdefSync *cdef_sync) { + CdefInfo *cdef_info = &cm->cdef_info; + const int num_mi_rows = cdef_info->allocated_mi_rows; + + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + aom_free(cdef_info->linebuf[plane]); + cdef_info->linebuf[plane] = NULL; + } + // De-allocation of column buffer & source buffer (worker_0). + free_cdef_bufs(cdef_info->colbuf, &cdef_info->srcbuf); + + free_cdef_row_sync(&cdef_sync->cdef_row_mt, num_mi_rows); + + if (cdef_info->allocated_num_workers < 2) return; + if (*cdef_worker != NULL) { + for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) { + // De-allocation of column buffer & source buffer for remaining workers. + free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); + } + aom_free(*cdef_worker); + *cdef_worker = NULL; + } +} + +static INLINE void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf, + const int num_planes) { + CdefInfo *cdef_info = &cm->cdef_info; + for (int plane = 0; plane < num_planes; plane++) { + if (linebuf[plane] == NULL) + CHECK_MEM_ERROR(cm, linebuf[plane], + aom_malloc(cdef_info->allocated_linebuf_size[plane])); + } +} + +static INLINE void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf, + uint16_t **srcbuf, const int num_planes) { + CdefInfo *cdef_info = &cm->cdef_info; + if (*srcbuf == NULL) + CHECK_MEM_ERROR(cm, *srcbuf, + aom_memalign(16, cdef_info->allocated_srcbuf_size)); + + for (int plane = 0; plane < num_planes; plane++) { + if (colbuf[plane] == NULL) + CHECK_MEM_ERROR(cm, colbuf[plane], + aom_malloc(cdef_info->allocated_colbuf_size[plane])); + } +} + +static INLINE void alloc_cdef_row_sync(AV1_COMMON *const cm, + AV1CdefRowSync **cdef_row_mt, + const int num_mi_rows) { + if (*cdef_row_mt != NULL) return; + + CHECK_MEM_ERROR(cm, *cdef_row_mt, + aom_calloc(num_mi_rows, sizeof(**cdef_row_mt))); +#if CONFIG_MULTITHREAD + for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { + CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_, + aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_))); + pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL); + + CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_, + aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_))); + pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL); + } +#endif // CONFIG_MULTITHREAD +} + +void av1_alloc_cdef_buffers(AV1_COMMON *const cm, + AV1CdefWorkerData **cdef_worker, + AV1CdefSync *cdef_sync, int num_workers, + int init_worker) { + const int num_planes = av1_num_planes(cm); + size_t new_linebuf_size[MAX_MB_PLANE] = { 0 }; + size_t new_colbuf_size[MAX_MB_PLANE] = { 0 }; + size_t new_srcbuf_size = 0; + CdefInfo *const cdef_info = &cm->cdef_info; + // Check for configuration change + const int num_mi_rows = + (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int is_num_workers_changed = + cdef_info->allocated_num_workers != num_workers; + const int is_cdef_enabled = + cm->seq_params->enable_cdef && !cm->tiles.large_scale; + + // num-bufs=3 represents ping-pong buffers for top linebuf, + // followed by bottom linebuf. + // ping-pong is to avoid top linebuf over-write by consecutive row. 
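+  // For example (illustrative numbers only): a single-threaded 1080p frame
+  // keeps num_bufs at 3, giving a luma line buffer of 3 * (2 * CDEF_VBORDER)
+  // rows of ALIGN_POWER_OF_TWO(1920, 4) = 1920 samples; with multiple workers
+  // one top/bottom pair is kept per 64x64 row instead (17 such rows at 1080p).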
+ int num_bufs = 3; + if (num_workers > 1) + num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + + if (is_cdef_enabled) { + // Calculate src buffer size + new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE; + for (int plane = 0; plane < num_planes; plane++) { + const int shift = + plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x; + // Calculate top and bottom line buffer size + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs * + (CDEF_VBORDER << 1) * (luma_stride >> shift); + // Calculate column buffer size + const int block_height = + (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER; + new_colbuf_size[plane] = + sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER; + } + } + + // Free src, line and column buffers for worker 0 in case of reallocation + free_cdef_linebuf_conditional(cm, new_linebuf_size); + free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf, + new_colbuf_size, new_srcbuf_size); + + // The flag init_worker indicates if cdef_worker has to be allocated for the + // frame. This is passed as 1 always from decoder. At encoder side, it is 0 + // when called for parallel frames during FPMT (where cdef_worker is shared + // across parallel frames) and 1 otherwise. + if (*cdef_worker != NULL && init_worker) { + if (is_num_workers_changed) { + // Free src and column buffers for remaining workers in case of change in + // num_workers + for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) + free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); + + aom_free(*cdef_worker); + *cdef_worker = NULL; + } else if (num_workers > 1) { + // Free src and column buffers for remaining workers in case of + // reallocation + for (int idx = num_workers - 1; idx >= 1; idx--) + free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf, + &(*cdef_worker)[idx].srcbuf, new_colbuf_size, + new_srcbuf_size); + } + } + + if (cdef_info->allocated_mi_rows != num_mi_rows) + free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows); + + // Store allocated sizes for reallocation + cdef_info->allocated_srcbuf_size = new_srcbuf_size; + av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size); + av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size); + // Store configuration to check change in configuration + cdef_info->allocated_mi_rows = num_mi_rows; + cdef_info->allocated_num_workers = num_workers; + + if (!is_cdef_enabled) return; + + // Memory allocation of column buffer & source buffer (worker_0). + alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes); + alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes); + + if (num_workers < 2) return; + + if (init_worker) { + if (*cdef_worker == NULL) + CHECK_MEM_ERROR(cm, *cdef_worker, + aom_calloc(num_workers, sizeof(**cdef_worker))); + + // Memory allocation of column buffer & source buffer for remaining workers. 
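+    // Worker 0 keeps using cdef_info->colbuf / cdef_info->srcbuf allocated
+    // above, so only workers 1..num_workers-1 receive private buffers here.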
+ for (int idx = num_workers - 1; idx >= 1; idx--) + alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf, + &(*cdef_worker)[idx].srcbuf, num_planes); + } + + alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt, + cdef_info->allocated_mi_rows); +} + +// Allocate buffers which are independent of restoration_unit_size +void av1_alloc_restoration_buffers(AV1_COMMON *cm, bool is_sgr_enabled) { + const int num_planes = av1_num_planes(cm); + + if (cm->rst_tmpbuf == NULL && is_sgr_enabled) { + CHECK_MEM_ERROR(cm, cm->rst_tmpbuf, + (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); + } + + if (cm->rlbs == NULL) { + CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers))); + } + + // For striped loop restoration, we divide each plane into "stripes", + // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET + // luma pixels to match the output from CDEF. We will need to store 2 * + // RESTORATION_CTX_VERT lines of data for each stripe. + int mi_h = cm->mi_params.mi_rows; + const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2); + const int num_stripes = (ext_h + 63) / 64; + + // Now we need to allocate enough space to store the line buffers for the + // stripes + const int frame_w = cm->superres_upscaled_width; + const int use_highbd = cm->seq_params->use_highbitdepth; + + for (int p = 0; p < num_planes; ++p) { + const int is_uv = p > 0; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ; + const int stride = ALIGN_POWER_OF_TWO(plane_w, 5); + const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT + << use_highbd; + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + + if (buf_size != boundaries->stripe_boundary_size || + boundaries->stripe_boundary_above == NULL || + boundaries->stripe_boundary_below == NULL) { + aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + + CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above, + (uint8_t *)aom_memalign(32, buf_size)); + CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below, + (uint8_t *)aom_memalign(32, buf_size)); + + boundaries->stripe_boundary_size = buf_size; + } + boundaries->stripe_boundary_stride = stride; + } +} + +void av1_free_restoration_buffers(AV1_COMMON *cm) { + int p; + for (p = 0; p < MAX_MB_PLANE; ++p) + av1_free_restoration_struct(&cm->rst_info[p]); + aom_free(cm->rst_tmpbuf); + cm->rst_tmpbuf = NULL; + aom_free(cm->rlbs); + cm->rlbs = NULL; + for (p = 0; p < MAX_MB_PLANE; ++p) { + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + boundaries->stripe_boundary_above = NULL; + boundaries->stripe_boundary_below = NULL; + } + + aom_free_frame_buffer(&cm->rst_frame); +} + +void av1_free_above_context_buffers(CommonContexts *above_contexts) { + int i; + const int num_planes = above_contexts->num_planes; + + for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) { + for (i = 0; i < num_planes; i++) { + if (above_contexts->entropy[i] == NULL) break; + aom_free(above_contexts->entropy[i][tile_row]); + above_contexts->entropy[i][tile_row] = NULL; + } + if (above_contexts->partition != NULL) { + aom_free(above_contexts->partition[tile_row]); + above_contexts->partition[tile_row] = NULL; + } + + if (above_contexts->txfm != NULL) { + aom_free(above_contexts->txfm[tile_row]); + above_contexts->txfm[tile_row] = NULL; + } 
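+    // All per-tile-row allocations for this tile row are now released; the
+    // top-level entropy/partition/txfm pointer arrays are freed after the
+    // loop.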
+ } + for (i = 0; i < num_planes; i++) { + aom_free(above_contexts->entropy[i]); + above_contexts->entropy[i] = NULL; + } + aom_free(above_contexts->partition); + above_contexts->partition = NULL; + + aom_free(above_contexts->txfm); + above_contexts->txfm = NULL; + + above_contexts->num_tile_rows = 0; + above_contexts->num_mi_cols = 0; + above_contexts->num_planes = 0; +} + +void av1_free_context_buffers(AV1_COMMON *cm) { + if (cm->mi_params.free_mi != NULL) cm->mi_params.free_mi(&cm->mi_params); + + av1_free_above_context_buffers(&cm->above_contexts); +} + +int av1_alloc_above_context_buffers(CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes) { + const int aligned_mi_cols = + ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2); + + // Allocate above context buffers + above_contexts->num_tile_rows = num_tile_rows; + above_contexts->num_mi_cols = aligned_mi_cols; + above_contexts->num_planes = num_planes; + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->entropy[0])); + if (!above_contexts->entropy[plane_idx]) return 1; + } + + above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->partition)); + if (!above_contexts->partition) return 1; + + above_contexts->txfm = + (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm)); + if (!above_contexts->txfm) return 1; + + for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) { + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx][tile_row] = + (ENTROPY_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row])); + if (!above_contexts->entropy[plane_idx][tile_row]) return 1; + } + + above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->partition[tile_row])); + if (!above_contexts->partition[tile_row]) return 1; + + above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row])); + if (!above_contexts->txfm[tile_row]) return 1; + } + + return 0; +} + +// Allocate the dynamically allocated arrays in 'mi_params' assuming +// 'mi_params->set_mb_mi()' was already called earlier to initialize the rest of +// the struct members. 
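+// Reallocation only happens when the grid needs to grow: a hypothetical
+// 1920x1080 frame with BLOCK_4X4 allocation units needs
+// mi_alloc_stride * aligned_mi_rows mode-info entries, and a later, smaller
+// frame reuses the same arrays because mi_alloc_size / mi_grid_size are kept.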
+static int alloc_mi(CommonModeInfoParams *mi_params) { + const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows); + const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows; + const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int alloc_mi_size = + mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d); + + if (mi_params->mi_alloc_size < alloc_mi_size || + mi_params->mi_grid_size < mi_grid_size) { + mi_params->free_mi(mi_params); + + mi_params->mi_alloc = + aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc)); + if (!mi_params->mi_alloc) return 1; + mi_params->mi_alloc_size = alloc_mi_size; + + mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc( + mi_grid_size, sizeof(*mi_params->mi_grid_base)); + if (!mi_params->mi_grid_base) return 1; + + mi_params->tx_type_map = + aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map)); + if (!mi_params->tx_type_map) return 1; + mi_params->mi_grid_size = mi_grid_size; + } + + return 0; +} + +int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height, + BLOCK_SIZE min_partition_size) { + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->set_mb_mi(mi_params, width, height, min_partition_size); + if (alloc_mi(mi_params)) goto fail; + return 0; + +fail: + // clear the mi_* values to force a realloc on resync + mi_params->set_mb_mi(mi_params, 0, 0, BLOCK_4X4); + av1_free_context_buffers(cm); + return 1; +} + +void av1_remove_common(AV1_COMMON *cm) { + av1_free_context_buffers(cm); + + aom_free(cm->fc); + cm->fc = NULL; + aom_free(cm->default_frame_context); + cm->default_frame_context = NULL; +} + +void av1_init_mi_buffers(CommonModeInfoParams *mi_params) { + mi_params->setup_mi(mi_params); +} diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h new file mode 100644 index 0000000000..d31b4c56b6 --- /dev/null +++ b/third_party/aom/av1/common/alloccommon.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_ +#define AOM_AV1_COMMON_ALLOCCOMMON_H_ + +#define INVALID_IDX -1 // Invalid buffer index. 
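+
+// Illustrative call order (a sketch only; real callers add error handling and
+// the CDEF / loop-restoration setup declared below):
+//   av1_alloc_context_buffers(cm, width, height, BLOCK_4X4);
+//   av1_alloc_above_context_buffers(&cm->above_contexts, tile_rows,
+//                                   cm->mi_params.mi_cols, num_planes);
+//   ...
+//   av1_remove_common(cm);  // also releases the context buffers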
+
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct BufferPool;
+struct CommonContexts;
+struct CommonModeInfoParams;
+struct AV1CdefWorker;
+struct AV1CdefSyncData;
+
+void av1_remove_common(struct AV1Common *cm);
+
+int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts,
+                                    int num_tile_rows, int num_mi_cols,
+                                    int num_planes);
+void av1_free_above_context_buffers(struct CommonContexts *above_contexts);
+int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height,
+                              BLOCK_SIZE min_partition_size);
+void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params);
+void av1_free_context_buffers(struct AV1Common *cm);
+
+void av1_free_ref_frame_buffers(struct BufferPool *pool);
+void av1_alloc_cdef_buffers(struct AV1Common *const cm,
+                            struct AV1CdefWorker **cdef_worker,
+                            struct AV1CdefSyncData *cdef_sync, int num_workers,
+                            int init_worker);
+void av1_free_cdef_buffers(struct AV1Common *const cm,
+                           struct AV1CdefWorker **cdef_worker,
+                           struct AV1CdefSyncData *cdef_sync);
+void av1_alloc_restoration_buffers(struct AV1Common *cm, bool is_sgr_enabled);
+void av1_free_restoration_buffers(struct AV1Common *cm);
+
+int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
+void av1_free_state_buffers(struct AV1Common *cm);
+
+int av1_get_MBs(int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_ALLOCCOMMON_H_
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
new file mode 100644
index 0000000000..09e5166b14
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -0,0 +1,4217 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/transpose_neon.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+  { av1_idct4, av1_iadst4, av1_iidentity4_c },
+  { av1_idct8, av1_iadst8, av1_iidentity8_c },
+  { av1_idct16, av1_iadst16, av1_iidentity16_c },
+  { av1_idct32, NULL, NULL },
+  { av1_idct64, NULL, NULL },
+};
+
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+                                                  uint8_t *output, int stride,
+                                                  int flipud,
+                                                  const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  int16x8_t temp_output;
+  for (int i = 0; i < height; ++i, j += step) {
+    temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
+    temp_output = vaddq_s16(temp_output, in[j]);
+    vst1_u8(output, vqmovun_s16(temp_output));
+    output += stride;
+  }
+}
+
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+                                                    int16x8_t res0,
+                                                    int16x8_t res1) {
+  int16x8_t temp_output[2];
+  uint8x16_t temp_output_8q;
+  temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+  temp_output[0] = vaddq_s16(temp_output[0], res0);
+  temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+  temp_output[1] = vaddq_s16(temp_output[1], res1);
+  temp_output_8q =
+      vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1]));
+  return temp_output_8q;
+}
+
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+                                                   uint8_t *output, int stride,
+                                                   int flipud, int height) {
+  uint8x16_t temp_output_8q;
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ?
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp_output_8q = vld1q_u8(output + i * stride); + temp_output_8q = + lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]); + vst1q_u8((output + i * stride), temp_output_8q); + } +} + +static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, + int value) { + for (int i = 0; i < size; i++) { + a[i] = vdupq_n_s16((int16_t)value); + } +} + +static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1, + int16_t coef2, int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0_l, s0_h, s1_l, s1_h; + int16x4_t v0[2], v1[2]; + + s0_l = vmull_n_s16(vget_low_s16(in0), coef1); + s0_h = vmull_n_s16(vget_high_s16(in0), coef1); + s1_l = vmull_n_s16(vget_low_s16(in0), coef2); + s1_h = vmull_n_s16(vget_high_s16(in0), coef2); + + v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0_h, 
INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { + int32x4_t t0[2], t1[2]; + int16x4_t v0[2], v1[2]; + + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + + v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT); + + x[0] = vcombine_s16(v0[0], v0[1]); + x[1] = vcombine_s16(v1[0], v1[1]); +} + +static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + int16x4_t val = vdup_n_s16(c0); + val = vset_lane_s16(c1, val, 1); + val = vset_lane_s16(c2, val, 2); + val = vset_lane_s16(c3, val, 3); + return val; +} + +static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[20], (int16_t)cospi[44]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[8]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + // Stage 1 + x[0] = in[7]; + x[1] = in[0]; + x[2] = in[5]; + x[3] = in[2]; + x[4] = in[3]; + x[5] = in[4]; + x[6] = in[1]; + x[7] = in[6]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + + // Stage 3 + x[0] = vqaddq_s16(s0, s4); + x[1] = vqaddq_s16(s1, s5); + x[2] = vqaddq_s16(s2, s6); + x[3] = vqaddq_s16(s3, s7); + x[4] = vqsubq_s16(s0, s4); + x[5] = vqsubq_s16(s1, s5); + x[6] = vqsubq_s16(s2, s6); + x[7] = vqsubq_s16(s3, s7); + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + s2 = x[2]; + s3 = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6); + 
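+  // Each btf_16_lane_* call above applies a fixed-point butterfly rotation:
+  // using two lanes (ca, cb) of the cospi vector it computes, per element,
+  //   t0 = ROUND_POWER_OF_TWO(in0 * ca + in1 * cb, INV_COS_BIT)
+  //   t1 = ROUND_POWER_OF_TWO(in0 * cb - in1 * ca, INV_COS_BIT)
+  // so s4..s7 now hold the rotated halves of x[4]..x[7] consumed by stage 5.
+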
+ // Stage 5 + x[0] = vqaddq_s16(s0, s2); + x[1] = vqaddq_s16(s1, s3); + x[2] = vqsubq_s16(s0, s2); + x[3] = vqsubq_s16(s1, s3); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vqnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vqnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vqnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vqnegq_s16(x[1]); +} + +static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[8]; + int16x8_t s0, s1, s4, s5; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + + btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[4] = s0; + x[5] = s1; + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + + // Stage 5 + x[0] = s0; + x[1] = s1; + x[2] = s0; + x[3] = s1; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vqnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vqnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vqnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vqnegq_s16(x[1]); +} + +static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[8], step2[8]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + // stage 2 + btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); + + // stage 3 + btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]); + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + + // stage 4 + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]); + + // stage 5 + out[0] = vqaddq_s16(step1[0], step2[7]); + out[1] = vqaddq_s16(step1[1], step1[6]); + out[2] = vqaddq_s16(step1[2], step1[5]); + out[3] = vqaddq_s16(step1[3], step2[4]); + out[4] = vqsubq_s16(step1[3], step2[4]); + out[5] = vqsubq_s16(step1[2], step1[5]); + out[6] = vqsubq_s16(step1[1], step1[6]); + out[7] = vqsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 4 + // stage 5 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] 
= step1; + out[6] = step1; + out[7] = step1; +} + +void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { + assert(!(size % 4)); + if (!bit) return; + const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); + for (int i = 0; i < size; i++) { + arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8); + } +} + +static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) { + int16x8_t temp[8]; + for (int i = 0; i < size; ++i) { + temp[i] = input[size - 1 - i]; + } + for (int i = 0; i < size; ++i) { + input[i] = temp[i]; + } +} + +static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, + int stride, + int16x8_t *const a, + int out_size) { + for (int i = 0; i < out_size; ++i) { + a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)), + vmovn_s32(vld1q_s32(input + 4))); + input += stride; + } +} + +static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, + int txw_idx, int8_t size, int bit) { + const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); + int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]); + int16x4_t low_i16, high_i16; + int32x4_t low_i32, high_i32; + for (int i = 0; i < size; i++) { + int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale); + int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale); + low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4); + high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4); + low_i16 = vqmovn_s32(low_i32); + high_i16 = vqmovn_s32(high_i32); + output[i] = vcombine_s16(low_i16, high_i16); + } +} + +static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, + int size) { + int32x4_t out_low, out_high; + int16x4_t low, high; + + for (int z = 0; z < size; ++z) { + out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2); + out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2); + + low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); + high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); + + output[z] = vcombine_s16(low, high); + } +} + +static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 4 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; +} + +static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t 
c4 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); + btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]); + btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]); + + step2[0] = in[0]; + step2[1] = in[8]; + step2[2] = in[4]; + step2[3] = in[12]; + step2[4] = in[2]; + step2[5] = in[10]; + step2[6] = in[6]; + step2[7] = in[14]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = 
vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c1 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[4]; + step2[4] = in[2]; + step2[6] = in[6]; + + btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]); + + // stage 3 + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]); + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // 
stage 7 + + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[10], (int16_t)cospi[54]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[26], (int16_t)cospi[38]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11); + btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13); + btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] 
= vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c5); + btf_16_half_neon(x + 6, c5); + btf_16_half_neon(x + 10, c5); + btf_16_half_neon(x + 14, c5); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} + +static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[10]; + int16x8_t s0, s1, s4, s5; + int16x8_t s8, s9, s12, s13; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[8] = s0; + x[9] = s1; + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + + // Stage 5 + x[0] = t[0]; + x[1] = t[1]; + x[4] = t[0]; + x[5] = t[1]; + x[8] = s8; + x[9] = s9; + x[12] = s8; + x[13] = s9; + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + t[8] = x[8]; + t[9] = x[9]; + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + + // Stage 7 + x[0] = t[0]; + x[1] = t[1]; + x[2] = t[0]; + x[3] = t[1]; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + x[8] = t[8]; + x[9] = t[9]; + x[10] = t[8]; + x[11] = t[9]; + x[12] = s12; + x[13] = s13; + x[14] = s12; + x[15] = s13; + + // Stage 8 + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} + +static INLINE void iadst16_low8_neon(int16x8_t *const 
in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[1] = in[0]; + x[3] = in[2]; + x[5] = in[4]; + x[7] = in[6]; + x[8] = in[7]; + x[10] = in[5]; + x[12] = in[3]; + x[14] = in[1]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3); + btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5); + btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7); + + btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9); + btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11); + btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13); + btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = 
vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} + +static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[34], (int16_t)cospi[30]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[50], (int16_t)cospi[14]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c8 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c9 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]); + btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]); + btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]); + btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]); + btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]); + btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]); + + step2[0] = in[0]; + step2[1] = in[16]; + step2[2] = in[8]; + step2[3] = in[24]; + step2[4] = in[4]; + step2[5] = in[20]; + step2[6] = in[12]; + step2[7] = in[28]; + step2[8] = in[2]; + step2[9] = in[18]; + step2[10] = in[10]; + step2[11] = in[26]; + step2[12] = in[6]; + step2[13] = in[22]; + step2[14] = in[14]; + step2[15] = in[30]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]); + btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]); + btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]); + btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[4] = step2[4]; + step1[5] = step2[5]; + step1[6] = step2[6]; + step1[7] = step2[7]; + + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = 
vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]); + btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); + btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); + btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); + btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[1], step1[2]); + step2[2] = vqsubq_s16(step1[1], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + 
step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = 
vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; +} + +static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[4]; + step2[8] = in[2]; + step2[12] = in[6]; + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + // stage 3 + step1[0] = step2[0]; + step1[4] = step2[4]; + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[16] = step2[16]; + step1[17] = step2[16]; + step1[18] = step2[19]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[21] = step2[20]; + step1[22] =
step2[23]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[24]; + step1[26] = step2[27]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[29] = step2[28]; + step1[30] = step2[31]; + step1[31] = step2[31]; + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[8]; + step2[10] = step1[11]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[13] = step1[12]; + step2[14] = step1[15]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); + + step1[4] = step2[4]; + step1[5] = step2[4]; + step1[6] = step2[7]; + step1[7] = step2[7]; + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); + + step2[0] = step1[0]; + step2[1] = step1[0]; + step2[2] = step1[0]; + step2[3] = step1[0]; + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, 
&step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = 
vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]); + btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]); + btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + step2[0] = in[0]; + step2[2] = in[8]; + step2[4] = in[4]; + step2[6] = in[12]; + step2[8] = in[2]; + step2[10] = in[10]; + step2[12] = in[6]; + step2[14] = in[14]; + + // stage 3 + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]); + btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); + + 
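+ // Note: this low16 path only reads in[0]..in[15], i.e. it is the reduced path used when the higher-frequency input rows are known to be zero. Compared with the full idct32_neon above, lane butterflies whose partner input would be zero are replaced by the scalar-coefficient btf_16_neon form, and the remaining stage 4 terms below need no multiplies: they are plain pass-throughs and saturating add/sub pairs (vqaddq_s16/vqsubq_s16).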
step2[0] = step1[0]; + step2[2] = step1[2]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[0], step1[2]); + step2[2] = vqsubq_s16(step1[0], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = 
vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], 
step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} +static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]); + btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]); + btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]); + btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]); + + step1[0] = vqaddq_s16(step2[0], step2[15]); + step1[1] = vqaddq_s16(step2[1], step2[14]); + step1[2] = vqaddq_s16(step2[2], step2[13]); + step1[3] = vqaddq_s16(step2[3], step2[12]); + step1[4] = vqaddq_s16(step2[4], step2[11]); + step1[5] = vqaddq_s16(step2[5], step2[10]); + step1[6] = vqaddq_s16(step2[6], step2[9]); + step1[7] = vqaddq_s16(step2[7], step2[8]); + step1[8] = vqsubq_s16(step2[7], step2[8]); + step1[9] = vqsubq_s16(step2[6], step2[9]); + step1[10] = vqsubq_s16(step2[5], step2[10]); + step1[11] = vqsubq_s16(step2[4], step2[11]); + step1[12] = vqsubq_s16(step2[3], step2[12]); + step1[13] = vqsubq_s16(step2[2], step2[13]); + step1[14] = vqsubq_s16(step2[1], step2[14]); + step1[15] = vqsubq_s16(step2[0], step2[15]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[47]); + step1[33] = vqaddq_s16(step2[33], step2[46]); + step1[34] = vqaddq_s16(step2[34], step2[45]); + step1[35] = vqaddq_s16(step2[35], step2[44]); + step1[36] = vqaddq_s16(step2[36], step2[43]); + step1[37] = vqaddq_s16(step2[37], step2[42]); + step1[38] = vqaddq_s16(step2[38], step2[41]); + step1[39] = vqaddq_s16(step2[39], step2[40]); + step1[40] = vqsubq_s16(step2[39], step2[40]); + step1[41] = vqsubq_s16(step2[38], step2[41]); + step1[42] = vqsubq_s16(step2[37], step2[42]); + step1[43] = vqsubq_s16(step2[36], step2[43]); + step1[44] = vqsubq_s16(step2[35], step2[44]); + step1[45] = vqsubq_s16(step2[34], step2[45]); + step1[46] = vqsubq_s16(step2[33], step2[46]); + step1[47] = vqsubq_s16(step2[32], step2[47]); + step1[48] = vqsubq_s16(step2[63], step2[48]); + step1[49] = vqsubq_s16(step2[62], step2[49]); + step1[50] = vqsubq_s16(step2[61], step2[50]); + step1[51] = vqsubq_s16(step2[60], step2[51]); + step1[52] = vqsubq_s16(step2[59], step2[52]); + step1[53] = vqsubq_s16(step2[58], step2[53]); + step1[54] = vqsubq_s16(step2[57], step2[54]); + step1[55] = vqsubq_s16(step2[56], step2[55]); + step1[56] = vqaddq_s16(step2[56], step2[55]); + step1[57] = vqaddq_s16(step2[57], step2[54]); + step1[58] = vqaddq_s16(step2[58], step2[53]); + step1[59] = vqaddq_s16(step2[59], step2[52]); + step1[60] = vqaddq_s16(step2[60], step2[51]); + step1[61] = vqaddq_s16(step2[61], step2[50]); + step1[62] = vqaddq_s16(step2[62], step2[49]); + step1[63] = vqaddq_s16(step2[63], step2[48]); +} + +static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = 
set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]); + btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]); + btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]); + btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]); + btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]); + btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]); + btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]); + btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]); + + step2[0] = vqaddq_s16(step1[0], step1[31]); + step2[1] = vqaddq_s16(step1[1], step1[30]); + step2[2] = vqaddq_s16(step1[2], step1[29]); + step2[3] = vqaddq_s16(step1[3], step1[28]); + step2[4] = vqaddq_s16(step1[4], step1[27]); + step2[5] = vqaddq_s16(step1[5], step1[26]); + step2[6] = vqaddq_s16(step1[6], step1[25]); + step2[7] = vqaddq_s16(step1[7], step1[24]); + step2[8] = vqaddq_s16(step1[8], step1[23]); + step2[9] = vqaddq_s16(step1[9], step1[22]); + step2[10] = vqaddq_s16(step1[10], step1[21]); + step2[11] = vqaddq_s16(step1[11], step1[20]); + step2[12] = vqaddq_s16(step1[12], step1[19]); + step2[13] = vqaddq_s16(step1[13], step1[18]); + step2[14] = vqaddq_s16(step1[14], step1[17]); + step2[15] = vqaddq_s16(step1[15], step1[16]); + step2[16] = vqsubq_s16(step1[15], step1[16]); + step2[17] = vqsubq_s16(step1[14], step1[17]); + step2[18] = vqsubq_s16(step1[13], step1[18]); + step2[19] = vqsubq_s16(step1[12], step1[19]); + step2[20] = vqsubq_s16(step1[11], step1[20]); + step2[21] = vqsubq_s16(step1[10], step1[21]); + step2[22] = vqsubq_s16(step1[9], step1[22]); + step2[23] = vqsubq_s16(step1[8], step1[23]); + step2[24] = vqsubq_s16(step1[7], step1[24]); + step2[25] = vqsubq_s16(step1[6], step1[25]); + step2[26] = vqsubq_s16(step1[5], step1[26]); + step2[27] = vqsubq_s16(step1[4], step1[27]); + step2[28] = vqsubq_s16(step1[3], step1[28]); + step2[29] = vqsubq_s16(step1[2], step1[29]); + step2[30] = vqsubq_s16(step1[1], step1[30]); + step2[31] = vqsubq_s16(step1[0], step1[31]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[37] = step1[37]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[58] = step1[58]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; +} + +static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + 
set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[16]; + step2[4] = in[8]; + step2[6] = in[24]; + step2[8] = in[4]; + step2[10] = in[20]; + step2[12] = in[12]; + step2[14] = in[28]; + step2[16] = in[2]; + step2[18] = in[18]; + step2[20] = in[10]; + step2[22] = in[26]; + step2[24] = in[6]; + step2[26] = in[22]; + step2[28] = in[14]; + step2[30] = in[30]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]); + btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]); + btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]); + btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]); + btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[8] = step2[8]; + step1[10] = step2[10]; + step1[12] = step2[12]; + step1[14] = step2[14]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]); + btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]); + btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]); + btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = vqaddq_s16(step2[32], step2[33]); + step1[33] = vqsubq_s16(step2[32], step2[33]); + step1[34] = vqsubq_s16(step2[35], step2[34]); + step1[35] = vqaddq_s16(step2[35], step2[34]); + step1[36] = vqaddq_s16(step2[36], step2[37]); + step1[37] = vqsubq_s16(step2[36], step2[37]); + step1[38] = vqsubq_s16(step2[39], step2[38]); + step1[39] = vqaddq_s16(step2[39], step2[38]); + step1[40] = vqaddq_s16(step2[40], step2[41]); + step1[41] = vqsubq_s16(step2[40], step2[41]); + step1[42] = vqsubq_s16(step2[43], step2[42]); + step1[43] = vqaddq_s16(step2[43], step2[42]); + step1[44] = vqaddq_s16(step2[44], step2[45]); + step1[45] = vqsubq_s16(step2[44], step2[45]); + step1[46] = vqsubq_s16(step2[47], step2[46]); + step1[47] = vqaddq_s16(step2[47], step2[46]); + step1[48] = vqaddq_s16(step2[48], step2[49]); + step1[49] = vqsubq_s16(step2[48], step2[49]); + step1[50] = vqsubq_s16(step2[51], step2[50]); + step1[51] = vqaddq_s16(step2[51], step2[50]); + step1[52] = vqaddq_s16(step2[52], step2[53]); + step1[53] = vqsubq_s16(step2[52], 
step2[53]); + step1[54] = vqsubq_s16(step2[55], step2[54]); + step1[55] = vqaddq_s16(step2[55], step2[54]); + step1[56] = vqaddq_s16(step2[56], step2[57]); + step1[57] = vqsubq_s16(step2[56], step2[57]); + step1[58] = vqsubq_s16(step2[59], step2[58]); + step1[59] = vqaddq_s16(step2[59], step2[58]); + step1[60] = vqaddq_s16(step2[60], step2[61]); + step1[61] = vqsubq_s16(step2[60], step2[61]); + step1[62] = vqsubq_s16(step2[63], step2[62]); + step1[63] = vqaddq_s16(step2[63], step2[62]); + + // stage 4 + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = vqaddq_s16(step1[16], step1[17]); + step2[17] = vqsubq_s16(step1[16], step1[17]); + step2[18] = vqsubq_s16(step1[19], step1[18]); + step2[19] = vqaddq_s16(step1[19], step1[18]); + step2[20] = vqaddq_s16(step1[20], step1[21]); + step2[21] = vqsubq_s16(step1[20], step1[21]); + step2[22] = vqsubq_s16(step1[23], step1[22]); + step2[23] = vqaddq_s16(step1[23], step1[22]); + step2[24] = vqaddq_s16(step1[24], step1[25]); + step2[25] = vqsubq_s16(step1[24], step1[25]); + step2[26] = vqsubq_s16(step1[27], step1[26]); + step2[27] = vqaddq_s16(step1[27], step1[26]); + step2[28] = vqaddq_s16(step1[28], step1[29]); + step2[29] = vqsubq_s16(step1[28], step1[29]); + step2[30] = vqsubq_s16(step1[31], step1[30]); + step2[31] = vqaddq_s16(step1[31], step1[30]); + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + step1[2] = step2[2]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + step1[16] = step2[16]; 
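+ // step1[17], step1[18], step1[21], step1[22], step1[25], step1[26], step1[29] and step1[30] were already produced by the four stage 5 lane butterflies above, so only the remaining pass-through assignments and the add/sub pairs for step1[32]..step1[63] are left below.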
+ step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], 
step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + 
btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = 
vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + + t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; + out[32] = step1; + out[33] = step1; + out[34] = step1; + out[35] = step1; + out[36] = step1; + out[37] = step1; + out[38] = step1; + out[39] = step1; + out[40] = step1; + out[41] = step1; + out[42] = step1; + out[43] = step1; + out[44] = step1; + out[45] = step1; + out[46] = step1; + out[47] = step1; + out[48] = step1; + out[49] = step1; + out[50] = step1; + out[51] = step1; + out[52] = step1; + out[53] = step1; + out[54] = step1; + out[55] = step1; + out[56] = step1; + out[57] = step1; + out[58] = step1; + out[59] = step1; + out[60] = step1; + out[61] = step1; + out[62] = step1; + out[63] = step1; +} + +static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = 
set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[8] = in[4]; + step2[16] = in[2]; + step2[24] = in[6]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[8] = step2[8]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[30] = step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + step1[16] = step2[16]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[31] = step2[31]; + step1[32] = step2[32]; + step1[33] = step2[33]; + step1[34] = step2[33]; + step1[35] = step2[32]; + step1[36] = step2[39]; + step1[37] = step2[38]; + step1[38] = step2[38]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[41]; + step1[42] = step2[41]; + step1[43] = step2[40]; + step1[44] = step2[47]; + step1[45] = step2[46]; + step1[46] = step2[46]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[49]; + step1[50] = step2[49]; + step1[51] = 
step2[48]; + step1[52] = step2[55]; + step1[53] = step2[54]; + step1[54] = step2[54]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[57]; + step1[58] = step2[57]; + step1[59] = step2[56]; + step1[60] = step2[63]; + step1[61] = step2[62]; + step1[62] = step2[62]; + step1[63] = step2[63]; + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[17]; + step2[19] = step1[16]; + step2[20] = step1[23]; + step2[21] = step1[22]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[26] = step1[25]; + step2[27] = step1[24]; + step2[28] = step1[31]; + step2[29] = step1[30]; + step2[30] = step1[30]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[10] = step2[9]; + step1[11] = step2[8]; + step1[12] = step2[15]; + step1[13] = step2[14]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = 
vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[3]; + step2[5] = step1[2]; + step2[6] = step1[1]; + step2[7] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = 
vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), 
(int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[8]; + step2[8] = in[4]; + step2[12] = in[12]; + step2[16] = in[2]; + step2[20] = in[10]; + step2[24] = in[6]; + step2[28] = in[14]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[4] = step2[4]; + step1[8] = step2[8]; + step1[12] = step2[12]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[34] = step2[35]; + step1[35] = step2[35]; + step1[36] = step2[36]; + step1[37] = step2[36]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[42] = step2[43]; + step1[43] = step2[43]; + step1[44] = step2[44]; + step1[45] = step2[44]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[50] = step2[51]; + step1[51] = step2[51]; + step1[52] = step2[52]; + step1[53] = step2[52]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[58] = step2[59]; + step1[59] = step2[59]; + step1[60] = step2[60]; + step1[61] = step2[60]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + step2[4] = step1[4]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + 
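// The step1[]/step2[] arrays are used as ping-pong buffers between stages:
// lanes that take no part in a butterfly at a given stage are forwarded by the
// plain assignments in this stage, and a lane copied into two destinations
// indicates a butterfly whose partner input is presumably zero in this
// reduced-coefficient path.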
step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = 
vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + 
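Each vqaddq_s16/vqsubq_s16 pair above is one saturating butterfly: mirrored lanes i and j are replaced by sat(a[i] + a[j]) and sat(a[i] - a[j]), with the saturating NEON instructions standing in for the clamp to the 16-bit intermediate range. A minimal scalar sketch of the same per-lane operation (helper names here are illustrative, not from the file):

#include <stdint.h>

static int16_t sat16(int32_t v) {
  return (int16_t)(v > INT16_MAX ? INT16_MAX : (v < INT16_MIN ? INT16_MIN : v));
}

// Replace lanes a and b with their saturating sum and difference.
static void butterfly_s16(int16_t *a, int16_t *b) {
  const int32_t sum = (int32_t)*a + (int32_t)*b;
  const int32_t diff = (int32_t)*a - (int32_t)*b;
  *a = sat16(sum);
  *b = sat16(diff);
}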
step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], 
step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +// Functions for blocks with eob at DC and within +// topleft 8x8, 16x16, 32x32 corner +static const transform_neon + lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct8_low1_neon, idct8_neon, NULL, NULL }, + { iadst8_low1_neon, iadst8_neon, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL }, + { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon, + idct64_low32_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)tx_type; + int16x8_t a[32 * 4]; + int16x8_t b[32 * 4]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + lowbd_inv_txfm2d_memset_neon(&b[0], 
(txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int input_stride = txfm_size_row; + int temp_b = 0; + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + int16x8_t *cur_a = &a[i * txfm_size_col]; + load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, + buf_size_nonzero_w); + input += 8; + if (abs(rect_type) == 1) { + round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); + } + identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w, + -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int input_stride = txfm_size_row; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + int16x8_t *cur_a = &a[i * txfm_size_col]; + load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, + buf_size_nonzero_w); + input += 8; + if (abs(rect_type) == 1) { + round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); + } + row_txfm(cur_a, cur_a, INV_COS_BIT); + av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + flip_buf_ud_neon(&cur_a[j * 8], 8); + transpose_arrays_s16_8x8( + &cur_a[j * 8], + &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + 
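// Write-back loop: each iteration adds one 16-pixel-wide column strip of the
// transposed result held in b[] back into the destination block; blocks
// narrower than 16 fall through to the 8-wide path below.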
lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int input_stride = txfm_size_row; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + int temp_b = 0; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + int16x8_t *cur_a = &a[i * txfm_size_col]; + load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, + buf_size_nonzero_w); + input += 8; + if (abs(rect_type) == 1) { + round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); + } + identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w, + -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X4; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; ++c) 
+ temp_in[c] = input[c * txfm_size_row]; + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); + + input++; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X8; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; c++) + temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2, + NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); + input++; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_8X4; + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, 
txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; c++) + temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2, + NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); + input++; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X16; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; c++) + temp_in[c] = input[c * txfm_size_row]; + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input++; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = 
clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_16X4; + DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; + int r; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int c = 0; c < txfm_size_col; c++) + temp_in[c] = input[c * txfm_size_row]; + row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input++; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, 16); + col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + clip_pixel(output[r * stride + c] + temp_out[r]); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = clip_pixel(output[r * stride + c] + + temp_out[txfm_size_row - r - 1]); + } + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[64 * 8]; + int16x8_t b[64 * 8]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + int temp_b = 0; + + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + int16x8_t 
*cur_a = &a[i * txfm_size_col]; + load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, + buf_size_nonzero_w); + input += 8; + if (abs(rect_type) == 1) { + round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); + } + row_txfm(cur_a, cur_a, INV_COS_BIT); + av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + flip_buf_ud_neon(&cur_a[j * 8], 8); + transpose_arrays_s16_8x8( + &cur_a[j * 8], + &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_universe_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case IDTX: + lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size, + eob); + break; + + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + default: + lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob); + break; + + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob); + break; + + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob); + break; + + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob); + break; + + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob); + break; + + default: + lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} +void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h new file mode 100644 index 0000000000..97099c2042 --- /dev/null +++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" + +typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output, + const int8_t cos_bit, + const int8_t *stage_ptr); +typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output, + int8_t cos_bit); + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x32_default[32]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + 
av1_eob_to_eobxy_8x32_default,
+  av1_eob_to_eobxy_32x8_default,
+  av1_eob_to_eobxy_16x32_default,
+  av1_eob_to_eobxy_32x16_default,
+};
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (size of 64 map to 32)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+static int eob_fill[32] = {
+  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
+  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size, int eob) {
+  if (eob == 1) {
+    *eobx = 0;
+    *eoby = 0;
+    return;
+  }
+
+  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+  const int eob_row = (eob - 1) >> tx_w_log2;
+  const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+  *eobx = eobxy & 0xFF;
+  *eoby = eobxy >> 8;
+}
+
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  eob -= 1;
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
+  *eobx = eob / (eoby_max + 1);
+  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
+}
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  eob -= 1;
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
+  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
+  const int temp_eoby = eob / (eobx_max + 1);
+  assert(temp_eoby < 32);
+  *eoby = eob_fill[temp_eoby];
+}
+
+#endif  // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/av1_txfm_neon.c b/third_party/aom/av1/common/arm/av1_txfm_neon.c
new file mode 100644
index 0000000000..f955a379f7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_txfm_neon.c
@@ -0,0 +1,30 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit) {
+  assert(!(size % 4));
+  if (!bit) return;
+  const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit));
+  for (int i = 0; i < size; i += 4) {
+    int32x4_t tmp_q_s32 = vld1q_s32(arr);
+    tmp_q_s32 = vrshlq_s32(tmp_q_s32, dup_bits_n_32x4);
+    vst1q_s32(arr, tmp_q_s32);
+    arr += 4;
+  }
+}
diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
new file mode 100644
index 0000000000..7afb1a909d
--- /dev/null
+++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
@@ -0,0 +1,102 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
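In av1_round_shift_array_neon above, vrshlq_s32 with a negated shift count performs a rounding arithmetic shift right: each 32-bit lane is divided by 2^bit after adding half of the divisor. A scalar model of one lane, assuming bit > 0 (sketch only; the function name is illustrative):

#include <stdint.h>

// Rounding right shift of one lane, matching vrshlq_s32(x, vdupq_n_s32(-bit)).
static int32_t round_shift_s32(int32_t x, int bit) {
  return (int32_t)(((int64_t)x + ((int64_t)1 << (bit - 1))) >> bit);
}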
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/blend_neon.h" +#include "aom_dsp/arm/mem_neon.h" + +void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 2); + assert(w >= 2); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (w > 8) { + do { + int i = 0; + do { + uint8x16_t m0 = vld1q_u8(mask + i); + uint8x16_t s0 = vld1q_u8(src0 + i); + uint8x16_t s1 = vld1q_u8(src1 + i); + + uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1); + + vst1q_u8(dst + i, blend); + + i += 16; + } while (i < w); + + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 8) { + const uint8x8_t m0 = vld1_u8(mask); + do { + uint8x8_t s0 = vld1_u8(src0); + uint8x8_t s1 = vld1_u8(src1); + + uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); + + vst1_u8(dst, blend); + + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } while (--h != 0); + } else if (w == 4) { + const uint8x8_t m0 = load_unaligned_dup_u8_4x2(mask); + do { + uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride); + + uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); + + store_u8x4_strided_x2(dst, dst_stride, blend); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 2 && h >= 16) { + const uint8x8_t m0 = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask)); + do { + uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride); + uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride); + + uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); + + store_u8x2_strided_x2(dst, dst_stride, blend); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + aom_blend_a64_hmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, w, h); + } +} diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c new file mode 100644 index 0000000000..9aea29992a --- /dev/null +++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c @@ -0,0 +1,112 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
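Both aom_blend_a64_hmask_neon above and aom_blend_a64_vmask_neon below compute the usual AV1 64-level alpha blend, which per output pixel is dst = (m * src0 + (64 - m) * src1 + 32) >> 6 with m in [0, 64]; the hmask variant reads one mask value per column and reuses it for every row, while the vmask variant broadcasts one mask value per row. A scalar reference of the per-pixel formula (sketch; the function name is illustrative):

#include <stdint.h>

// 64-level alpha blend of two 8-bit samples; m must lie in [0, 64].
static uint8_t blend_a64(int m, uint8_t v0, uint8_t v1) {
  return (uint8_t)((m * v0 + (64 - m) * v1 + 32) >> 6);
}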
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
+                              const uint8_t *src0, uint32_t src0_stride,
+                              const uint8_t *src1, uint32_t src1_stride,
+                              const uint8_t *mask, int w, int h) {
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 2);
+  assert(w >= 2);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (w > 8) {
+    do {
+      uint8x16_t m0 = vdupq_n_u8(mask[0]);
+      int i = 0;
+      do {
+        uint8x16_t s0 = vld1q_u8(src0 + i);
+        uint8x16_t s1 = vld1q_u8(src1 + i);
+
+        uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+        vst1q_u8(dst + i, blend);
+
+        i += 16;
+      } while (i < w);
+
+      mask += 1;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 8) {
+    do {
+      uint8x8_t m0 = vdup_n_u8(mask[0]);
+      uint8x8_t s0 = vld1_u8(src0);
+      uint8x8_t s1 = vld1_u8(src1);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+      vst1_u8(dst, blend);
+
+      mask += 1;
+      src0 += src0_stride;
+      src1 += src1_stride;
+      dst += dst_stride;
+    } while (--h != 0);
+  } else if (w == 4) {
+    do {
+      const uint16x4_t m0 = vdup_n_u16((uint16_t)mask[0]);
+      const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[1]);
+      const uint8x8_t m = vmovn_u16(vcombine_u16(m0, m1));
+      uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
+
+      store_u8x4_strided_x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else if (w == 2 && h >= 16) {
+    do {
+      uint16x4_t m0 = vdup_n_u16(0);
+      m0 = vld1_lane_u16((uint16_t *)mask, m0, 0);
+      uint8x8_t m =
+          vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+      uint8x8_t s0 = load_unaligned_u8_2x2(src0, src0_stride);
+      uint8x8_t s1 = load_unaligned_u8_2x2(src1, src1_stride);
+
+      uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1);
+
+      store_u8x2_strided_x2(dst, dst_stride, blend);
+
+      mask += 2;
+      src0 += 2 * src0_stride;
+      src1 += 2 * src1_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h != 0);
+  } else {
+    aom_blend_a64_vmask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                          mask, w, h);
+  }
+}
diff --git a/third_party/aom/av1/common/arm/cdef_block_neon.c b/third_party/aom/av1/common/arm/cdef_block_neon.c
new file mode 100644
index 0000000000..53d3a9f1e0
--- /dev/null
+++ b/third_party/aom/av1/common/arm/cdef_block_neon.c
@@ -0,0 +1,1355 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/cdef_block.h" + +void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + do { + const uint8_t *src_ptr = src; + uint16_t *dst_ptr = dst; + + int w = 0; + while (width - w >= 16) { + uint8x16_t row = vld1q_u8(src_ptr + w); + uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } }; + vst2q_u8((uint8_t *)(dst_ptr + w), row_u16); + + w += 16; + } + if (width - w >= 8) { + uint8x8_t row = vld1_u8(src_ptr + w); + vst1q_u16(dst_ptr + w, vmovl_u8(row)); + w += 8; + } + if (width - w == 4) { + for (int i = w; i < w + 4; i++) { + dst_ptr[i] = src_ptr[i]; + } + } + + src += sstride; + dst += dstride; + } while (--height != 0); +} + +void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + do { + const uint16_t *src_ptr = src; + uint16_t *dst_ptr = dst; + + int w = 0; + while (width - w >= 8) { + uint16x8_t row = vld1q_u16(src_ptr + w); + vst1q_u16(dst_ptr + w, row); + + w += 8; + } + if (width - w == 4) { + uint16x4_t row = vld1_u16(src_ptr + w); + vst1_u16(dst_ptr + w, row); + } + + src += sstride; + dst += dstride; + } while (--height != 0); +} + +// partial A is a 16-bit vector of the form: +// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: +// [0 y1 y2 y3 y4 y5 y6 y7]. +// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... +// (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 +// and const2. +static INLINE uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala, + int16x8_t partialb, + uint32x4_t const1, + uint32x4_t const2) { + // Reverse partial B. + // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }. + uint8x16_t pattern = vreinterpretq_u8_u64( + vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c), + vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504))); + +#if AOM_ARCH_AARCH64 + partialb = + vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern)); +#else + int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)), + vget_high_s8(vreinterpretq_s8_s16(partialb)) } }; + int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern))); + int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern))); + partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); +#endif + + // Square and add the corresponding x and y values. + int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala)); + cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb)); + int32x4_t cost_hi = + vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala)); + cost_hi = + vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb)); + + // Multiply by constant. + uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1); + cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2); + return cost; +} + +// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal +// down-right, 6 is vertical). +// +// For each direction the lines are shifted so that we can perform a +// basic sum on each vector element. 
For example, direction 5 is "south by +// southeast", so we need to add the pixels along each line i below: +// +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// For this to fit nicely in vectors, the lines need to be shifted like so: +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// In this configuration we can now perform SIMD additions to get the cost +// along direction 5. Since this won't fit into a single 128-bit vector, we use +// two of them to compute each half of the new configuration, and pad the empty +// spaces with zeros. Similar shifting is done for other directions, except +// direction 6 which is straightforward as it's the vertical direction. +static INLINE uint32x4_t compute_vert_directions_neon(int16x8_t lines[8], + uint32_t cost[4]) { + const int16x8_t zero = vdupq_n_s16(0); + + // Partial sums for lines 0 and 1. + int16x8_t partial4a = vextq_s16(zero, lines[0], 1); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2)); + int16x8_t partial4b = vextq_s16(lines[0], zero, 1); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2)); + int16x8_t tmp = vaddq_s16(lines[0], lines[1]); + int16x8_t partial5a = vextq_s16(zero, tmp, 3); + int16x8_t partial5b = vextq_s16(tmp, zero, 3); + int16x8_t partial7a = vextq_s16(zero, tmp, 6); + int16x8_t partial7b = vextq_s16(tmp, zero, 6); + int16x8_t partial6 = tmp; + + // Partial sums for lines 2 and 3. + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4)); + tmp = vaddq_s16(lines[2], lines[3]); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5)); + partial6 = vaddq_s16(partial6, tmp); + + // Partial sums for lines 4 and 5. + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6)); + tmp = vaddq_s16(lines[4], lines[5]); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4)); + partial6 = vaddq_s16(partial6, tmp); + + // Partial sums for lines 6 and 7. 
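+ // Line 7 needs no shift for direction 4 (a shift of 8 would span the whole
+ // vector), so it is added to partial4a in full and contributes nothing to
+ // partial4b.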
+ partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7)); + partial4a = vaddq_s16(partial4a, lines[7]); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7)); + tmp = vaddq_s16(lines[6], lines[7]); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3)); + partial6 = vaddq_s16(partial6, tmp); + + uint32x4_t const0 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840), + vcreate_u64((uint64_t)210 << 32 | 280))); + uint32x4_t const1 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168), + vcreate_u64((uint64_t)105 << 32 | 120))); + uint32x4_t const2 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420))); + uint32x4_t const3 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140), + vcreate_u64((uint64_t)105 << 32 | 105))); + + // Compute costs in terms of partial sums. + int32x4_t partial6_s32 = + vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6)); + partial6_s32 = + vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6)); + + uint32x4_t costs[4]; + costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1); + costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3); + costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105); + costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3); + + costs[0] = horizontal_add_4d_u32x4(costs); + vst1q_u32(cost, costs[0]); + return costs[0]; +} + +static INLINE uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala, + int16x8_t partialb, + int16x8_t partialc, + uint32x4_t const0) { + // Reverse partial c. + // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }. + uint8x16_t pattern = vreinterpretq_u8_u64( + vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a), + vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302))); + +#if AOM_ARCH_AARCH64 + partialc = + vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern)); +#else + int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)), + vget_high_s8(vreinterpretq_s8_s16(partialc)) } }; + int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern))); + int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern))); + partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); +#endif + + int32x4_t partiala_s32 = vpaddlq_s16(partiala); + int32x4_t partialb_s32 = vpaddlq_s16(partialb); + int32x4_t partialc_s32 = vpaddlq_s16(partialc); + + partiala_s32 = vmulq_s32(partiala_s32, partiala_s32); + partialb_s32 = vmulq_s32(partialb_s32, partialb_s32); + partialc_s32 = vmulq_s32(partialc_s32, partialc_s32); + + partiala_s32 = vaddq_s32(partiala_s32, partialc_s32); + + uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105); + cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0); + return cost; +} + +// This function computes the cost along directions 0, 1, 2, 3. (0 means +// 45-degree up-right, 2 is horizontal). +// +// For direction 1 and 3 ("east northeast" and "east southeast") the shifted +// lines need three vectors instead of two. 
For direction 1 for example, we need +// to compute the sums along the line i below: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Which means we need the following configuration: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Three vectors are needed to compute this, as well as some extra pairwise +// additions. +static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8], + uint32_t cost[4]) { + const int16x8_t zero = vdupq_n_s16(0); + + // Compute diagonal directions (1, 2, 3). + // Partial sums for lines 0 and 1. + int16x8_t partial0a = lines[0]; + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7)); + int16x8_t partial0b = vextq_s16(lines[1], zero, 7); + int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6)); + int16x8_t partial1b = vextq_s16(lines[1], zero, 6); + int16x8_t partial3a = vextq_s16(lines[0], zero, 2); + partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4)); + int16x8_t partial3b = vextq_s16(zero, lines[0], 2); + partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4)); + + // Partial sums for lines 2 and 3. + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5)); + partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4)); + partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2)); + partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4)); + partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2)); + partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6)); + partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6)); + partial3b = vaddq_s16(partial3b, lines[3]); + + // Partial sums for lines 4 and 5. + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3)); + partial1b = vaddq_s16(partial1b, lines[4]); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6)); + int16x8_t partial1c = vextq_s16(lines[5], zero, 6); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2)); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4)); + int16x8_t partial3c = vextq_s16(zero, lines[4], 2); + partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4)); + + // Partial sums for lines 6 and 7. 
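+ // These rows also complete the third accumulators (partial1c and partial3c)
+ // used for directions 1 and 3.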
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1)); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4)); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2)); + partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4)); + partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2)); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6)); + partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6)); + partial3c = vaddq_s16(partial3c, lines[7]); + + // Special case for direction 2 as it's just a sum along each line. + int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] }; + int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] }; + int32x4_t partial2a = horizontal_add_4d_s16x8(lines03); + int32x4_t partial2b = horizontal_add_4d_s16x8(lines47); + + uint32x4_t partial2a_u32 = + vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a)); + uint32x4_t partial2b_u32 = + vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b)); + + uint32x4_t const0 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840), + vcreate_u64((uint64_t)210 << 32 | 280))); + uint32x4_t const1 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168), + vcreate_u64((uint64_t)105 << 32 | 120))); + uint32x4_t const2 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420), + vcreate_u64((uint64_t)105 << 32 | 140))); + + uint32x4_t costs[4]; + costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1); + costs[1] = + fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2); + costs[2] = vaddq_u32(partial2a_u32, partial2b_u32); + costs[2] = vmulq_n_u32(costs[2], 105); + costs[3] = + fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2); + + costs[0] = horizontal_add_4d_u32x4(costs); + vst1q_u32(cost, costs[0]); + return costs[0]; +} + +int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + uint32_t cost[8]; + uint32_t best_cost = 0; + int best_dir = 0; + int16x8_t lines[8]; + for (int i = 0; i < 8; i++) { + uint16x8_t s = vld1q_u16(&img[i * stride]); + lines[i] = vreinterpretq_s16_u16( + vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128))); + } + + // Compute "mostly vertical" directions. + uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4); + + // Compute "mostly horizontal" directions. + uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost); + + // Find max cost as well as its index to get best_dir. + // The max cost needs to be propagated in the whole vector to find its + // position in the original cost vectors cost03 and cost47. + uint32x4_t cost07 = vmaxq_u32(cost03, cost47); +#if AOM_ARCH_AARCH64 + best_cost = vmaxvq_u32(cost07); + uint32x4_t max_cost = vdupq_n_u32(best_cost); + uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)), + vreinterpretq_u8_u32( + vceqq_u32(max_cost, cost47)) } }; + // idx = { 28, 24, 20, 16, 12, 8, 4, 0 }; + uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL)); + // Get the lowest 8 bit of each 32-bit elements and reverse them. 
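+ // After the lookup, the most significant byte of the packed 64-bit value
+ // corresponds to direction 0 and the least significant byte to direction 7,
+ // so counting leading zero bits and dividing by 8 yields the lowest
+ // direction index whose cost equals the maximum.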
+ uint8x8_t tbl = vqtbl2_u8(costs, idx); + uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0); + best_dir = aom_clzll(a) >> 3; +#else + uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07)); + cost64 = vpmax_u32(cost64, cost64); + uint32x4_t max_cost = vcombine_u32(cost64, cost64); + best_cost = vget_lane_u32(cost64, 0); + uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)), + vmovn_u32(vceqq_u32(max_cost, cost47))); + uint8x8_t idx = + vand_u8(vmovn_u16(costs), + vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL))); + int sum = horizontal_add_u8x8(idx); + best_dir = get_msb(sum ^ (sum - 1)); +#endif + + // Difference between the optimal variance and the variance along the + // orthogonal direction. Again, the sum(x^2) terms cancel out. + *var = best_cost - cost[(best_dir + 4) & 7]; + // We'd normally divide by 840, but dividing by 1024 is close enough + // for what we're going to do with this. + *var >>= 10; + return best_dir; +} + +void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + // Process first 8x8. + *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); + + // Process second 8x8. + *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); +} + +// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) +static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b, + unsigned int threshold, int adjdamp) { + uint16x8_t diff = vabdq_u16(a, b); + const uint16x8_t a_gt_b = vcgtq_u16(a, b); + const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold), + vshlq_u16(diff, vdupq_n_s16(-adjdamp))); + const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s)); + return vbslq_s16(a_gt_b, clip, vnegq_s16(clip)); +} + +static INLINE void primary_filter(uint16x8_t s, uint16x8_t tap[4], + const int *pri_taps, int pri_strength, + int pri_damping, int16x8_t *sum) { + // Near taps + int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping); + int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping); + // sum += pri_taps[0] * (n0 + n1) + n0 = vaddq_s16(n0, n1); + *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]); + + // Far taps + int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping); + int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping); + // sum += pri_taps[1] * (f0 + f1) + f0 = vaddq_s16(f0, f1); + *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]); +} + +static INLINE void secondary_filter(uint16x8_t s, uint16x8_t tap[8], + const int *sec_taps, int sec_strength, + int sec_damping, int16x8_t *sum) { + // Near taps + int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping); + int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping); + int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping); + int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + s0 = vaddq_s16(s0, s1); + s2 = vaddq_s16(s2, s3); + s0 = vaddq_s16(s0, s2); + *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]); + + // Far taps + s0 = constrain16(tap[4], s, sec_strength, sec_damping); + s1 = constrain16(tap[5], s, sec_strength, sec_damping); + s2 = constrain16(tap[6], s, sec_strength, sec_damping); + s3 = constrain16(tap[7], s, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + s0 = vaddq_s16(s0, s1); + s2 = vaddq_s16(s2, s3); + s0 = vaddq_s16(s0, s2); + *sum = 
vmlaq_n_s16(*sum, s0, sec_taps[1]); +} + +void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + uint16x8_t max, min; + const uint16x8_t cdef_large_value_mask = + vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = vld1q_u16(in + po1); + pri_src[1] = vld1q_u16(in - po1); + + // Primary far taps + pri_src[2] = vld1q_u16(in + po2); + pri_src[3] = vld1q_u16(in - po2); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. + uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), + vreinterpretq_u8_u16(pri_src[1])); + uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), + vreinterpretq_u8_u16(pri_src[3])); + pri_max0 = vmaxq_u8(pri_max0, pri_max1); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), + cdef_large_value_mask)); + + uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); + pri_min0 = vminq_u16(pri_min0, pri_min1); + min = vminq_u16(min, pri_min0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
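+ // Clearing the flag ensures that padded CDEF_VERY_LARGE samples cannot win
+ // the max, so the clamping range is set by real neighboring pixels only.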
+ uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), + vreinterpretq_u8_u16(sec_src[1])); + uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), + vreinterpretq_u8_u16(sec_src[3])); + uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), + vreinterpretq_u8_u16(sec_src[5])); + uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), + vreinterpretq_u8_u16(sec_src[7])); + sec_max0 = vmaxq_u8(sec_max0, sec_max1); + sec_max2 = vmaxq_u8(sec_max2, sec_max3); + sec_max0 = vmaxq_u8(sec_max0, sec_max2); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), + cdef_large_value_mask)); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
+ uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), + vreinterpretq_u8_u16(pri_src[1])); + uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), + vreinterpretq_u8_u16(pri_src[3])); + pri_max0 = vmaxq_u8(pri_max0, pri_max1); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), + cdef_large_value_mask)); + + uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]); + pri_min1 = vminq_u16(pri_min1, pri_min2); + min = vminq_u16(min, pri_min1); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. + uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), + vreinterpretq_u8_u16(sec_src[1])); + uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), + vreinterpretq_u8_u16(sec_src[3])); + uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), + vreinterpretq_u8_u16(sec_src[5])); + uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), + vreinterpretq_u8_u16(sec_src[7])); + sec_max0 = vmaxq_u8(sec_max0, sec_max1); + sec_max2 = vmaxq_u8(sec_max2, sec_max3); + sec_max0 = vmaxq_u8(sec_max0, sec_max2); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), + cdef_large_value_mask)); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t 
*)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t tap[4]; + + // Primary near taps + tap[0] = vld1q_u16(in + po1); + tap[1] = vld1q_u16(in - po1); + + // Primary far taps + tap[2] = vld1q_u16(in + po2); + tap[3] = vld1q_u16(in - po2); + + primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *sec_taps = cdef_sec_taps; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = 
load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + (void)block_width; + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = vld1q_u16(in); + const uint8x8_t res = vqmovn_u16(s); + vst1_u8(dst8, res); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + const uint8x8_t res = vqmovn_u16(s); + store_u8x4_strided_x2(dst8, dstride, res); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + uint16x8_t max, min; + const uint16x8_t cdef_large_value_mask = + vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = vld1q_u16(in + po1); + pri_src[1] = vld1q_u16(in - po1); + + // Primary far taps + pri_src[2] = vld1q_u16(in + po2); + pri_src[3] = vld1q_u16(in - po2); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); + pri_min0 = vminq_u16(pri_min0, pri_min1); + min = vminq_u16(min, pri_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ + pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); + pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); + pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); + pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); + + uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); + pri_max0 = vmaxq_u16(pri_max0, pri_max1); + max = vmaxq_u16(max, pri_max0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); + sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); + sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); + sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); + sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); + sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); + sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); + sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); + + uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); + sec_max0 = vmaxq_u16(sec_max0, sec_max1); + sec_max2 = vmaxq_u16(sec_max2, sec_max3); + sec_max0 = vmaxq_u16(sec_max0, sec_max2); + max = vmaxq_u16(max, sec_max0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]); + pri_min1 = vminq_u16(pri_min1, pri_min2); + min = vminq_u16(min, pri_min1); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ + pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); + pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); + pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); + pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); + uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); + pri_max0 = vmaxq_u16(pri_max0, pri_max1); + max = vmaxq_u16(max, pri_max0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); + sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); + sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); + sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); + sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); + sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); + sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); + sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); + + uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); + sec_max0 = vmaxq_u16(sec_max0, sec_max1); + sec_max2 = vmaxq_u16(sec_max2, sec_max3); + sec_max0 = vmaxq_u16(sec_max0, sec_max2); + max = vmaxq_u16(max, sec_max0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = 
vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t tap[4]; + + // Primary near taps + tap[0] = vld1q_u16(in + po1); + tap[1] = vld1q_u16(in - po1); + + // Primary far taps + tap[2] = vld1q_u16(in + po2); + tap[3] = vld1q_u16(in - po2); + + primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *sec_taps = cdef_sec_taps; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = 
load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + (void)block_width; + if (block_width == 8) { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = vld1q_u16(in); + vst1q_u16(dst16, s); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); + } else { + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + store_u16x4_strided_x2(dst16, dstride, s); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c new file mode 100644 index 0000000000..0871b4fe06 --- /dev/null +++ b/third_party/aom/av1/common/arm/cfl_neon.c @@ -0,0 +1,589 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset, + int16x8_t sub) { + vst1q_s16(dst + offset, + vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub)); +} + +static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) { + return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset)); +} + +// Load half of a vector and duplicated in other half +static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) { + return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr)); +} + +// Store half of a vector. +static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) { + vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u16(val), 0); +} + +// Store half of a vector. 
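+// (As above, "half" is the low 32 bits of the vector, written with a single
+// lane store.)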
+static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) { + vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u8(val), 0); +} + +static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride)); + vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride)); + vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1)); + } else { + const uint8x8x4_t top = vld4_u8(input); + const uint8x8x4_t bot = vld4_u8(input + input_stride); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t top_0 = vaddl_u8(top.val[0], top.val[1]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + vsth_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2)); + } else { + const uint8x8x4_t top = vld4_u8(input); + uint16x8x2_t sum; + // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves) + sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], top.val[1]), 2); + sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2); + vst2q_u16(pred_buf_q3, sum); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3); + vst1_u16(pred_buf_q3, vget_low_u16(top)); + } else if (width == 8) { + const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3); + vst1q_u16(pred_buf_q3, top); + } else { + const uint8x16_t top = vld1q_u8(input); + vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3)); + vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3)); + if (width == 32) { + const uint8x16_t next_top = vld1q_u8(input + 16); + vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 
3)); + vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3)); + } + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +#if CONFIG_AV1_HIGHBITDEPTH +#if !AOM_ARCH_AARCH64 +uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { + return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), + vpadd_u16(vget_low_u16(b), vget_high_u16(b))); +} +#endif + +static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + const uint16x4_t bot = vld1_u16(input + input_stride); + const uint16x4_t sum = vadd_u16(top, bot); + const uint16x4_t hsum = vpadd_u16(sum, sum); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else if (width < 32) { + const uint16x8_t top = vld1q_u16(input); + const uint16x8_t bot = vld1q_u16(input + input_stride); + const uint16x8_t sum = vaddq_u16(top, bot); + if (width == 8) { + const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum)); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else { + const uint16x8_t top_1 = vld1q_u16(input + 8); + const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride); + const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1); + const uint16x8_t hsum = vpaddq_u16(sum, sum_1); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1)); + } + } else { + const uint16x8x4_t top = vld4q_u16(input); + const uint16x8x4_t bot = vld4q_u16(input + input_stride); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + const uint16x4_t hsum = vpadd_u16(top, top); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 8) { + const uint16x4x2_t top = vld2_u16(input); + // equivalent to a vpadd_u16 (because vld2 interleaves) + const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 16) { + const uint16x8x2_t top = vld2q_u16(input); + // equivalent to a vpaddq_u16 (because vld2q interleaves) + const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2)); + } else { + const uint16x8x4_t top = vld4q_u16(input); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]); + uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2), + 
vshlq_n_u16(hsum_1, 2) } }; + vst2q_u16(pred_buf_q3, result); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 3)); + } else if (width == 8) { + const uint16x8_t top = vld1q_u16(input); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 3)); + } else if (width == 16) { + uint16x8x2_t top = vld2q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + vst2q_u16(pred_buf_q3, top); + } else { + uint16x8x4_t top = vld4q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + top.val[2] = vshlq_n_u16(top.val[2], 3); + top.val[3] = vshlq_n_u16(top.val[3], 3); + vst4q_u16(pred_buf_q3, top); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +CFL_GET_SUBSAMPLE_FUNCTION(neon) + +static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst, + int width, int height, + int round_offset, + const int num_pel_log2) { + const uint16_t *const end = src + height * CFL_BUF_LINE; + + // Round offset is not needed, because NEON will handle the rounding. + (void)round_offset; + + // To optimize the use of the CPU pipeline, we process 4 rows per iteration + const int step = 4 * CFL_BUF_LINE; + + // At this stage, the prediction buffer contains scaled reconstructed luma + // pixels, which are positive integer and only require 15 bits. By using + // unsigned integer for the sum, we can do one addition operation inside 16 + // bits (8 lanes) before having to convert to 32 bits (4 lanes). + const uint16_t *sum_buf = src; + uint32x4_t sum_32x4 = vdupq_n_u32(0); + do { + // For all widths, we load, add and combine the data so it fits in 4 lanes. 
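+ // In the wider cases, vldaddq_u16 (defined above) does the first addition of
+ // two rows, or of two eight-lane chunks of one row, in 16 bits before the
+ // running sum is widened with vpadalq_u16.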
+ if (width == 4) { + const uint16x4_t a0 = + vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE)); + const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE), + vld1_u16(sum_buf + 3 * CFL_BUF_LINE)); + sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1)); + } else if (width == 8) { + const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE); + const uint16x8_t a1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE); + sum_32x4 = vpadalq_u16(sum_32x4, a0); + sum_32x4 = vpadalq_u16(sum_32x4, a1); + } else { + const uint16x8_t row0 = vldaddq_u16(sum_buf, 8); + const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8); + const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8); + const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8); + sum_32x4 = vpadalq_u16(sum_32x4, row0); + sum_32x4 = vpadalq_u16(sum_32x4, row1); + sum_32x4 = vpadalq_u16(sum_32x4, row2); + sum_32x4 = vpadalq_u16(sum_32x4, row3); + + if (width == 32) { + const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8); + const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8); + const uint16x8_t row2_1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8); + const uint16x8_t row3_1 = + vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8); + + sum_32x4 = vpadalq_u16(sum_32x4, row0_1); + sum_32x4 = vpadalq_u16(sum_32x4, row1_1); + sum_32x4 = vpadalq_u16(sum_32x4, row2_1); + sum_32x4 = vpadalq_u16(sum_32x4, row3_1); + } + } + sum_buf += step; + } while (sum_buf < end); + + // Permute and add in such a way that each lane contains the block sum. + // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A] +#if AOM_ARCH_AARCH64 + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); +#else + uint32x4_t flip = + vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4)); + sum_32x4 = vaddq_u32(sum_32x4, flip); + sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4)); +#endif + + // Computing the average could be done using scalars, but getting off the NEON + // engine introduces latency, so we use vqrshrn. + int16x4_t avg_16x4; + // Constant propagation makes for some ugly code. 
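+ // vqrshrn_n_u32 narrows with a rounding shift by an immediate, so the shift
+ // amount cannot be a runtime variable; each num_pel_log2 value (log2 of the
+ // number of pixels in the block, e.g. 8 for a 16x16 block) needs its own
+ // case below.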
+ switch (num_pel_log2) { + case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break; + case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break; + case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break; + case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break; + case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break; + case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break; + case 10: + avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10)); + break; + default: assert(0); + } + + if (width == 4) { + do { + vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4)); + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; + } while (src < end); + } else { + const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4); + do { + vldsubstq_s16(dst, src, 0, avg_16x8); + vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8); + + if (width > 8) { + vldsubstq_s16(dst, src, 8, avg_16x8); + vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8); + } + if (width == 32) { + vldsubstq_s16(dst, src, 16, avg_16x8); + vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 16 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24, avg_16x8); + vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8); + } + src += step; + dst += step; + } while (src < end); + } +} + +CFL_SUB_AVG_FN(neon) + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsign is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. +static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) { + const int16x4_t mask = vshr_n_s16(b, 15); + return veor_s16(vadd_s16(a, mask), mask); +} + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsignq is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. 
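+// * The negation uses the usual two's-complement identity: mask = b >> 15 is
+//   either 0 or -1, and (a + mask) ^ mask leaves a unchanged when mask is 0
+//   and gives ~(a - 1) == -a when mask is -1.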
+static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vshrq_n_s16(b, 15); + return veorq_s16(vaddq_s16(a, mask), mask); +} + +static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3, + int16x4_t alpha_sign, int abs_alpha_q12, + int16x4_t dc) { + const int16x4_t ac_q3 = vld1_s16(pred_buf_q3); + const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3); + int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12); + return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc); +} + +static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3); + const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3); + int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12); + return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc); +} + +static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + // vld2q_s16 interleaves, which is not useful for prediction. vld1q_s16_x2 + // does not interleave, but is not currently available in the compiler used + // by the AOM build system. + const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3); + const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); + const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); + const int16x8_t scaled_luma_0 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); + const int16x8_t scaled_luma_1 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); + int16x8x2_t result; + result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); + result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); + return result; +} + +static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + // vld4q_s16 interleaves, which is not useful for prediction. vld1q_s16_x4 + // does not interleave, but is not currently available in the compiler used + // by the AOM build system.
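+ // The de-interleave performed by vld4q_s16 is harmless in practice: the
+ // prediction is applied per lane and the callers re-interleave the four
+ // result vectors when storing with vst4q_u16 / vst4_u8.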
+ const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3); + const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); + const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); + const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]); + const int16x8_t ac_sign_3 = veorq_s16(alpha_sign, ac_q3.val[3]); + const int16x8_t scaled_luma_0 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); + const int16x8_t scaled_luma_1 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); + const int16x8_t scaled_luma_2 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[2]), abs_alpha_q12); + const int16x8_t scaled_luma_3 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[3]), abs_alpha_q12); + int16x8x4_t result; + result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); + result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); + result.val[2] = vaddq_s16(vsignq_s16(scaled_luma_2, ac_sign_2), dc); + result.val[3] = vaddq_s16(vsignq_s16(scaled_luma_3, ac_sign_3), dc); + return result; +} + +static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; + const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE; + if (width == 4) { + const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); + const int16x4_t dc = vdup_n_s16(*dst); + do { + const int16x4_t pred = + predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred))); + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } else { + const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); + const int16x8_t dc = vdupq_n_s16(*dst); + do { + if (width == 8) { + vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign, + abs_alpha_q12, dc))); + } else if (width == 16) { + const int16x8x2_t pred = + predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]), + vqmovun_s16(pred.val[1]) } }; + vst2_u8(dst, predun); + } else { + const int16x8x4_t pred = + predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + const uint8x8x4_t predun = { + { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]), + vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) } + }; + vst4_u8(dst, predun); + } + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } +} + +CFL_PREDICT_FN(neon, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) { + return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0))); +} + +static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) { + return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0))); +} + +static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) { + uint16x8x2_t result; + result.val[0] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); + result.val[1] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); + return result; +} + +static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) { + uint16x8x4_t result; + result.val[0] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); + result.val[1] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); + result.val[2] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[2], max), vdupq_n_s16(0))); + result.val[3] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[3], max), 
vdupq_n_s16(0))); + return result; +} + +static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + const int max = (1 << bd) - 1; + const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; + const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE; + if (width == 4) { + const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); + const int16x4_t dc = vdup_n_s16(*dst); + const int16x4_t max_16x4 = vdup_n_s16(max); + do { + const int16x4_t scaled_luma = + predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst1_u16(dst, clamp_s16(scaled_luma, max_16x4)); + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } else { + const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); + const int16x8_t dc = vdupq_n_s16(*dst); + const int16x8_t max_16x8 = vdupq_n_s16(max); + do { + if (width == 8) { + const int16x8_t pred = + predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst1q_u16(dst, clampq_s16(pred, max_16x8)); + } else if (width == 16) { + const int16x8x2_t pred = + predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst2q_u16(dst, clamp2q_s16(pred, max_16x8)); + } else { + const int16x8x4_t pred = + predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst4q_u16(dst, clamp4q_s16(pred, max_16x8)); + } + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } +} + +CFL_PREDICT_FN(neon, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon.c b/third_party/aom/av1/common/arm/compound_convolve_neon.c new file mode 100644 index 0000000000..6a596234dc --- /dev/null +++ b/third_party/aom/av1/common/arm/compound_convolve_neon.c @@ -0,0 +1,2719 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/arm/compound_convolve_neon.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t x_filter, + const int16x4_t horiz_const) { + int16x4_t sum = horiz_const; + sum = vmla_lane_s16(sum, s0, x_filter, 0); + sum = vmla_lane_s16(sum, s1, x_filter, 1); + sum = vmla_lane_s16(sum, s2, x_filter, 2); + sum = vmla_lane_s16(sum, s3, x_filter, 3); + + // We halved the convolution filter values so -1 from the right shift.
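+ // (The filter taps are all even, so halving them is exact; it scales the
+ // result by 1/2, which shifting right by one bit less undoes.)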
+ return vshr_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t x_filter, + const int16x8_t horiz_const) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, x_filter_0_3, 0); + sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0); + sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1); + sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2); + sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3); + + // We halved the convolution filter values so -1 from the right shift. + return vshrq_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE void dist_wtd_convolve_2d_horiz_neon( + const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, + const int16_t *x_filter_ptr, const int im_h, int w) { + const int bd = 8; + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w == 4) { + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + __builtin_prefetch(dst_ptr); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const); + + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // Filter values are even, so halve to reduce intermediate precision reqs. 
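+ // For bd == 8, FILTER_BITS == 7 and ROUND0_BITS == 3, horiz_const above is
+ // (1 << 13) + (1 << 1): the first term keeps the halved intermediate
+ // non-negative, the second turns the later non-rounding >> 2 into a
+ // round-to-nearest shift.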
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + do { + const uint8_t *s; + int16_t *d = dst_ptr; + int width = w; + + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src_ptr + 7; + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, + x_filter, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, + x_filter, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, + x_filter, horiz_const); + int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, horiz_const); + int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, horiz_const); + int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, horiz_const); + int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; + } while (height > 8); +#endif // AOM_ARCH_AARCH64 + + do { + const uint8_t *s; + int16_t *d = dst_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(src_ptr); + int16x8_t s0 = + vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + s = src_ptr + 8; + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); 
// a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const); + vst1q_s16(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, + x_filter_ptr, im_h, w); + + if (clamped_y_taps == 6) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } else { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } +} + +static INLINE void dist_wtd_convolve_2d_copy_dist_wtd_avg_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset); + const uint8x8_t shift_by_bits = 
vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS)); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits)); + uint16x4_t d1 = + vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits)); + uint16x4_t d2 = + vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits)); + uint16x4_t d3 = + vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits)); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_dist_wtd_avg_4x4( + dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, + vreinterpretq_s16_u16(round_offset_vec), &d01, &d23); + + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + do { + const uint8_t *s = src; + CONV_BUF_TYPE *d = dst; + uint8_t *d_u8 = dst8; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits); + uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits); + uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits); + uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, + vreinterpretq_s16_u16(round_offset_vec), + &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_2d_copy_avg_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset); + const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS)); + + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits)); + uint16x4_t d1 = + vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits)); + uint16x4_t d2 = + vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits)); + uint16x4_t d3 = + vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits)); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + 
vreinterpretq_s16_u16(round_offset_vec), &d01, + &d23); + + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + do { + const uint8_t *s = src; + CONV_BUF_TYPE *d = dst; + uint8_t *d_u8 = dst8; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits); + uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits); + uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits); + uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + vreinterpretq_s16_u16(round_offset_vec), &d0_u8, + &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + dst8 += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_2d_copy_neon(const uint8_t *src, + int src_stride, int w, int h, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const uint16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const uint16x8_t round_offset_vec = vdupq_n_u16(round_offset); + const uint8x8_t shift_by_bits = vdup_n_u8(1 << (FILTER_BITS - ROUND0_BITS)); + + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(src, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + vget_low_u16(vmlal_u8(round_offset_vec, s0, shift_by_bits)); + uint16x4_t d1 = + vget_low_u16(vmlal_u8(round_offset_vec, s1, shift_by_bits)); + uint16x4_t d2 = + vget_low_u16(vmlal_u8(round_offset_vec, s2, shift_by_bits)); + uint16x4_t d3 = + vget_low_u16(vmlal_u8(round_offset_vec, s3, shift_by_bits)); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + do { + const uint8_t *s = src; + CONV_BUF_TYPE *d = dst; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3; + load_u8_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = vmlal_u8(round_offset_vec, s0, shift_by_bits); + uint16x8_t d1 = vmlal_u8(round_offset_vec, s1, shift_by_bits); + uint16x8_t d2 = vmlal_u8(round_offset_vec, s2, shift_by_bits); + uint16x8_t d3 = vmlal_u8(round_offset_vec, s3, shift_by_bits); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, + int h, ConvolveParams *conv_params) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_copy_dist_wtd_avg_neon( + src, src_stride, dst8, dst8_stride, w, h, conv_params); + } else { + 
dist_wtd_convolve_2d_copy_avg_neon(src, src_stride, dst8, dst8_stride, w, + h, conv_params); + } + } else { + dist_wtd_convolve_2d_copy_neon(src, src_stride, w, h, conv_params); + } +} + +static INLINE uint16x4_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t x_filter, + const int16x4_t round_offset) { + int16x4_t sum = vmul_lane_s16(s0, x_filter, 0); + sum = vmla_lane_s16(sum, s1, x_filter, 1); + sum = vmla_lane_s16(sum, s2, x_filter, 2); + sum = vmla_lane_s16(sum, s3, x_filter, 3); + + // We halved the convolution filter values so -1 from the right shift. + int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpret_u16_s16(res); +} + +static INLINE uint16x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t x_filter, + const int16x8_t round_offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int16x8_t sum = vmulq_lane_s16(s0, x_filter_0_3, 0); + sum = vmlaq_lane_s16(sum, s1, x_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s2, x_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s3, x_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s4, x_filter_4_7, 0); + sum = vmlaq_lane_s16(sum, s5, x_filter_4_7, 1); + sum = vmlaq_lane_s16(sum, s6, x_filter_4_7, 2); + sum = vmlaq_lane_s16(sum, s7, x_filter_4_7, 3); + + // We halved the convolution filter values so -1 from the right shift. + int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + __builtin_prefetch(dst_ptr); + __builtin_prefetch(dst8_ptr); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(dst8_ptr, d01); + + src_ptr += src_stride; + dst_ptr += dst_stride; + dst8_ptr += dst8_stride; + } while (--height != 0); + } else { + // Filter values are even, so halve to reduce intermediate precision reqs. + const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + while (height >= 8) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); + + s += 7; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, round_offset_vec); + + 
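+ // The source block was transposed before filtering, so transpose the eight
+ // results back to row order before averaging them with the destination
+ // buffer.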
transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset, + bck_offset, round_offset_vec, &d4_u8, &d5_u8, + &d6_u8, &d7_u8); + + store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8, + d7_u8); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + dst8_ptr += 8 * dst8_stride; + height -= 8; + } +#endif // AOM_ARCH_AARCH64 + + while (height > 0) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + __builtin_prefetch(d); + + s += 8; + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + + s0 = s8; + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + dst8_ptr += dst8_stride; + height--; + } + } +} + +static INLINE void dist_wtd_convolve_x_avg_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + __builtin_prefetch(dst_ptr); + __builtin_prefetch(dst8_ptr); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(dst8_ptr, d01); + + src_ptr += src_stride; + dst_ptr += dst_stride; + dst8_ptr += dst8_stride; + } while (--height != 0); + } else { + // Filter values are even, so halve to reduce intermediate precision reqs. + const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + while (height >= 8) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); + + s += 7; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, round_offset_vec); + + transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, 
&d6, &d7); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, + round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); + + store_u8_8x4(d_u8 + 4 * dst8_stride, dst8_stride, d4_u8, d5_u8, d6_u8, + d7_u8); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + dst8_ptr += 8 * dst8_stride; + height -= 8; + } +#endif // AOM_ARCH_AARCH64 + + while (height > 0) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + __builtin_prefetch(d); + + s += 8; + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + + s0 = s8; + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + dst8_ptr += dst8_stride; + height--; + } + } +} + +static INLINE void dist_wtd_convolve_x_neon( + const uint8_t *src, int src_stride, int w, int h, + const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + __builtin_prefetch(dst_ptr); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + uint16x4_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, + vget_low_s16(round_offset_vec)); + + vst1_u16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + // Filter values are even, so halve to reduce intermediate precision reqs. + const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + while (height >= 8) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); + + s += 7; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, round_offset_vec); + + transpose_elems_inplace_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + 
src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; + } +#endif // AOM_ARCH_AARCH64 + + while (height > 0) { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + __builtin_prefetch(d); + + s = src_ptr + 8; + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + uint16x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + round_offset_vec); + + vst1q_u16(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } + } +} + +void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_x_dist_wtd_avg_neon(src, src_stride, dst8, dst8_stride, + w, h, filter_params_x, subpel_x_qn, + conv_params); + } else { + dist_wtd_convolve_x_avg_neon(src, src_stride, dst8, dst8_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); + } + } else { + dist_wtd_convolve_x_neon(src, src_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } +} + +static INLINE uint16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter, + const int16x4_t round_offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + // Filter values at indices 0 and 7 are 0. + int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1); + sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2); + + // We halved the convolution filter values so -1 from the right shift. + int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpret_u16_s16(res); +} + +static INLINE uint16x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, + const int16x8_t round_offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + // Filter values at indices 0 and 7 are 0. + int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 2); + + // We halved the convolution filter values so -1 from the right shift. 
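+ // vrsraq_n_s16 performs the rounding shift and the accumulation onto the
+ // compound offset in a single instruction.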
+ int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_y_6tap_dist_wtd_avg_neon( + const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, + const int dst8_stride, int w, int h, const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01, &d23); + + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + d_u8 += 4 * dst8_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0 = vld1_u16(d); + + uint8x8_t d01; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(d_u8, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d 
+= dst_stride; + d_u8 += dst8_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + dst8_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr + (5 * src_stride); + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4; + load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + uint16x8_t d1 = + convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec); + uint16x8_t d2 = + convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); + uint16x8_t d3 = + convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); + uint16x8_t d4 = + convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); + uint16x8_t d5 = + convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); + uint16x8_t d6 = + convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); + uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter, + round_offset_vec); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset, + bck_offset, round_offset_vec, &d4_u8, &d5_u8, + &d6_u8, &d7_u8); + + store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); + d_u8 += 4 * dst8_stride; + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void dist_wtd_convolve_y_6tap_avg_neon( + const uint8_t *src_ptr, int 
src_stride, uint8_t *dst8_ptr, + const int dst8_stride, int w, int h, const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01, &d23); + + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + d_u8 += 4 * dst8_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + + uint16x4_t dd0 = vld1_u16(d); + + uint8x8_t d01; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(d_u8, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + d_u8 += dst8_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + dst8_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr + (5 * src_stride); + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4; + 
load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + uint16x8_t d1 = + convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, round_offset_vec); + uint16x8_t d2 = + convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); + uint16x8_t d3 = + convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); + uint16x8_t d4 = + convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); + uint16x8_t d5 = + convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); + uint16x8_t d6 = + convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); + uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter, + round_offset_vec); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, + round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); + + store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); + d_u8 += 4 * dst8_stride; + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void dist_wtd_convolve_y_6tap_neon(const uint8_t *src_ptr, + int src_stride, int w, int h, + const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int 
width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter, + vget_low_s16(round_offset_vec)); + + vst1_u16(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr + (5 * src_stride); + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4; + load_u8_8x5(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + uint16x8_t d1 = + convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter, 
round_offset_vec); + uint16x8_t d2 = + convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter, round_offset_vec); + uint16x8_t d3 = + convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter, round_offset_vec); + uint16x8_t d4 = + convolve6_8_y(s4, s5, s6, s7, s8, s9, y_filter, round_offset_vec); + uint16x8_t d5 = + convolve6_8_y(s5, s6, s7, s8, s9, s10, y_filter, round_offset_vec); + uint16x8_t d6 = + convolve6_8_y(s6, s7, s8, s9, s10, s11, y_filter, round_offset_vec); + uint16x8_t d7 = convolve6_8_y(s7, s8, s9, s10, s11, s12, y_filter, + round_offset_vec); + + store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint16x8_t d0 = + convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter, round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + + vst1q_u16(d, d0); + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE uint16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t y_filter, + const int16x4_t round_offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 0); + sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmla_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmla_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3); + + // We halved the convolution filter values so -1 from the right shift. + int16x4_t res = vrsra_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpret_u16_s16(res); +} + +static INLINE uint16x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t y_filter, + const int16x8_t round_offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int16x8_t sum = vmulq_lane_s16(s0, y_filter_0_3, 0); + sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlaq_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlaq_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3); + + // We halved the convolution filter values so -1 from the right shift. 
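+  // (Editorial note, based on the load in av1_dist_wtd_convolve_y_neon below:
+  // the taps in y_filter were pre-shifted right by one when loaded, since all
+  // tap values are even. Rounding-shifting the sum by ROUND0_BITS - 1 instead
+  // of ROUND0_BITS therefore gives the same overall scaling while keeping the
+  // accumulation within int16_t range.)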
+ int16x8_t res = vrsraq_n_s16(round_offset, sum, ROUND0_BITS - 1); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_y_8tap_dist_wtd_avg_neon( + const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, + const int dst8_stride, int w, int h, const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + vget_low_s16(round_offset_vec)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 0 * dst8_stride); + __builtin_prefetch(d_u8 + 1 * dst8_stride); + __builtin_prefetch(d_u8 + 2 * dst8_stride); + __builtin_prefetch(d_u8 + 3 * dst8_stride); + + uint16x4_t dd0, dd1, 
dd2, dd3; + load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01, &d23); + + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + d_u8 += 4 * dst8_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + + __builtin_prefetch(d); + + uint16x4_t dd0 = vld1_u16(d); + + uint8x8_t d01; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(d_u8, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + d_u8 += dst8_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + dst8_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + __builtin_prefetch(s + 4 * src_stride); + __builtin_prefetch(s + 5 * src_stride); + __builtin_prefetch(s + 6 * src_stride); + __builtin_prefetch(s + 7 * src_stride); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, + y_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, + y_filter, 
round_offset_vec); + uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, + y_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, + y_filter, round_offset_vec); + + __builtin_prefetch(d + 0 * dst8_stride); + __builtin_prefetch(d + 1 * dst8_stride); + __builtin_prefetch(d + 2 * dst8_stride); + __builtin_prefetch(d + 3 * dst8_stride); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_dist_wtd_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, fwd_offset, + bck_offset, round_offset_vec, &d4_u8, &d5_u8, + &d6_u8, &d7_u8); + + store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); + d_u8 += 4 * dst8_stride; + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + __builtin_prefetch(d); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void dist_wtd_convolve_y_8tap_avg_neon( + const uint8_t *src_ptr, int src_stride, uint8_t *dst8_ptr, + const int dst8_stride, int w, int h, const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + 
int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + vget_low_s16(round_offset_vec)); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 0 * dst8_stride); + __builtin_prefetch(d_u8 + 1 * dst8_stride); + __builtin_prefetch(d_u8 + 2 * dst8_stride); + __builtin_prefetch(d_u8 + 3 * dst8_stride); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01, d23; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01, &d23); + + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + d_u8 += 4 * dst8_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + + __builtin_prefetch(d); + + uint16x4_t dd0 = vld1_u16(d); + + uint8x8_t d01; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); + + store_u8_4x1(d_u8, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + d_u8 += dst8_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 4; + dst_ptr += 4; + dst8_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + __builtin_prefetch(s + 4 * src_stride); + __builtin_prefetch(s + 5 * src_stride); + __builtin_prefetch(s + 6 * src_stride); + __builtin_prefetch(s + 7 * src_stride); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + + 
int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, + y_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, + y_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, + y_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, + y_filter, round_offset_vec); + + __builtin_prefetch(d + 0 * dst8_stride); + __builtin_prefetch(d + 1 * dst8_stride); + __builtin_prefetch(d + 2 * dst8_stride); + __builtin_prefetch(d + 3 * dst8_stride); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + uint16x8_t dd4, dd5, dd6, dd7; + load_u16_8x4(d + 4 * dst_stride, dst_stride, &dd4, &dd5, &dd6, &dd7); + + uint8x8_t d4_u8, d5_u8, d6_u8, d7_u8; + compute_basic_avg_8x4(dd4, dd5, dd6, dd7, d4, d5, d6, d7, + round_offset_vec, &d4_u8, &d5_u8, &d6_u8, &d7_u8); + + store_u8_8x4(d_u8, dst8_stride, d4_u8, d5_u8, d6_u8, d7_u8); + d_u8 += 4 * dst8_stride; + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + __builtin_prefetch(d); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + 
dst_ptr += 8; + dst8_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE void dist_wtd_convolve_y_8tap_neon(const uint8_t *src_ptr, + int src_stride, int w, int h, + const int16x8_t y_filter, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + int width = w; + + if (w == 4 || h == 4) { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + uint8x8_t t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(s + 4 * src_stride); + uint8x8_t t5 = load_unaligned_u8_4x1(s + 5 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(s + 6 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s + 0 * src_stride); + t1 = load_unaligned_u8_4x1(s + 1 * src_stride); + t2 = load_unaligned_u8_4x1(s + 2 * src_stride); + t3 = load_unaligned_u8_4x1(s + 3 * src_stride); + + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + vget_low_s16(round_offset_vec)); + uint16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + vget_low_s16(round_offset_vec)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + t0 = load_unaligned_u8_4x1(s); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + + uint16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + vget_low_s16(round_offset_vec)); + + vst1_u16(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + 
} while (height != 0); + src_ptr += 4; + dst_ptr += 4; + width -= 4; + } while (width != 0); + } else { + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + __builtin_prefetch(s + 4 * src_stride); + __builtin_prefetch(s + 5 * src_stride); + __builtin_prefetch(s + 6 * src_stride); + __builtin_prefetch(s + 7 * src_stride); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + uint16x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_offset_vec); + uint16x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_offset_vec); + uint16x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_offset_vec); + uint16x8_t d4 = convolve8_8_y(s4, s5, s6, s7, s8, s9, s10, s11, + y_filter, round_offset_vec); + uint16x8_t d5 = convolve8_8_y(s5, s6, s7, s8, s9, s10, s11, s12, + y_filter, round_offset_vec); + uint16x8_t d6 = convolve8_8_y(s6, s7, s8, s9, s10, s11, s12, s13, + y_filter, round_offset_vec); + uint16x8_t d7 = convolve8_8_y(s7, s8, s9, s10, s11, s12, s13, s14, + y_filter, round_offset_vec); + + store_u16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8 * src_stride; + d += 8 * dst_stride; + height -= 8; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + uint16x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_offset_vec); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + vst1q_u16(d, d0); + + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + width -= 8; + } while (width != 0); + } +} + +void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + // 
Vertical filter. + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + // Filter values are even, so downshift by 1 to reduce intermediate + // precision requirements. + const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1); + + const int vert_offset = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - (vert_offset * src_stride); + + if (get_filter_tap(filter_params_y, subpel_y_qn) <= 6) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_y_6tap_dist_wtd_avg_neon( + src_ptr + src_stride, src_stride, dst8, dst8_stride, w, h, y_filter, + conv_params); + } else { + dist_wtd_convolve_y_6tap_avg_neon(src_ptr + src_stride, src_stride, + dst8, dst8_stride, w, h, y_filter, + conv_params); + } + } else { + dist_wtd_convolve_y_6tap_neon(src_ptr + src_stride, src_stride, w, h, + y_filter, conv_params); + } + } else { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_y_8tap_dist_wtd_avg_neon(src_ptr, src_stride, dst8, + dst8_stride, w, h, y_filter, + conv_params); + } else { + dist_wtd_convolve_y_8tap_avg_neon(src_ptr, src_stride, dst8, + dst8_stride, w, h, y_filter, + conv_params); + } + } else { + dist_wtd_convolve_y_8tap_neon(src_ptr, src_stride, w, h, y_filter, + conv_params); + } + } +} diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon.h b/third_party/aom/av1/common/arm/compound_convolve_neon.h new file mode 100644 index 0000000000..d719680a32 --- /dev/null +++ b/third_party/aom/av1/common/arm/compound_convolve_neon.h @@ -0,0 +1,1164 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "av1/common/convolve.h"
+#include "av1/common/enums.h"
+#include "av1/common/filter.h"
+
+static INLINE void compute_dist_wtd_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
+                                            const uint16_t fwd_offset,
+                                            const uint16_t bck_offset,
+                                            const int16x4_t round_offset,
+                                            uint8x8_t *d0_u8) {
+  uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
+  blend0 = vmlal_n_u16(blend0, d0, bck_offset);
+
+  uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
+
+  int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
+
+  int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
+
+  *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_4x1(uint16x4_t dd0, uint16x4_t d0,
+                                         const int16x4_t round_offset,
+                                         uint8x8_t *d0_u8) {
+  uint16x4_t avg0 = vhadd_u16(dd0, d0);
+
+  int16x4_t dst0 = vsub_s16(vreinterpret_s16_u16(avg0), round_offset);
+
+  int16x8_t dst0q = vcombine_s16(dst0, vdup_n_s16(0));
+
+  *d0_u8 = vqrshrun_n_s16(dst0q, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_dist_wtd_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
+                                            const uint16_t fwd_offset,
+                                            const uint16_t bck_offset,
+                                            const int16x8_t round_offset,
+                                            uint8x8_t *d0_u8) {
+  uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset);
+  blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset);
+  uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset);
+  blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset);
+
+  uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS),
+                                 vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS));
+
+  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+
+  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_8x1(uint16x8_t dd0, uint16x8_t d0,
+                                         const int16x8_t round_offset,
+                                         uint8x8_t *d0_u8) {
+  uint16x8_t avg0 = vhaddq_u16(dd0, d0);
+
+  int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset);
+
+  *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_dist_wtd_avg_4x4(
+    uint16x4_t dd0, uint16x4_t dd1, uint16x4_t dd2, uint16x4_t dd3,
+    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x8_t round_offset, uint8x8_t *d01_u8, uint8x8_t *d23_u8) {
+  uint32x4_t blend0 = vmull_n_u16(dd0, fwd_offset);
+  blend0 = vmlal_n_u16(blend0, d0, bck_offset);
+  uint32x4_t blend1 = vmull_n_u16(dd1, fwd_offset);
+  blend1 = vmlal_n_u16(blend1, d1, bck_offset);
+  uint32x4_t blend2 = vmull_n_u16(dd2, fwd_offset);
+  blend2 = vmlal_n_u16(blend2, d2, bck_offset);
+  uint32x4_t blend3 = vmull_n_u16(dd3, fwd_offset);
+  blend3 = vmlal_n_u16(blend3, d3, bck_offset);
+
+  uint16x4_t avg0 = vshrn_n_u32(blend0, DIST_PRECISION_BITS);
+  uint16x4_t avg1 = vshrn_n_u32(blend1, DIST_PRECISION_BITS);
+  uint16x4_t avg2 = vshrn_n_u32(blend2, DIST_PRECISION_BITS);
+  uint16x4_t avg3 = vshrn_n_u32(blend3, DIST_PRECISION_BITS);
+
+  int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1));
+  int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3));
+
+  dst_01 = vsubq_s16(dst_01, round_offset);
+  dst_23 = vsubq_s16(dst_23, round_offset);
+
+  *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS);
+  *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS);
+}
+
+static INLINE void compute_basic_avg_4x4(uint16x4_t
dd0, uint16x4_t dd1, + uint16x4_t dd2, uint16x4_t dd3, + uint16x4_t d0, uint16x4_t d1, + uint16x4_t d2, uint16x4_t d3, + const int16x8_t round_offset, + uint8x8_t *d01_u8, uint8x8_t *d23_u8) { + uint16x4_t avg0 = vhadd_u16(dd0, d0); + uint16x4_t avg1 = vhadd_u16(dd1, d1); + uint16x4_t avg2 = vhadd_u16(dd2, d2); + uint16x4_t avg3 = vhadd_u16(dd3, d3); + + int16x8_t dst_01 = vreinterpretq_s16_u16(vcombine_u16(avg0, avg1)); + int16x8_t dst_23 = vreinterpretq_s16_u16(vcombine_u16(avg2, avg3)); + + dst_01 = vsubq_s16(dst_01, round_offset); + dst_23 = vsubq_s16(dst_23, round_offset); + + *d01_u8 = vqrshrun_n_s16(dst_01, FILTER_BITS - ROUND0_BITS); + *d23_u8 = vqrshrun_n_s16(dst_23, FILTER_BITS - ROUND0_BITS); +} + +static INLINE void compute_dist_wtd_avg_8x4( + uint16x8_t dd0, uint16x8_t dd1, uint16x8_t dd2, uint16x8_t dd3, + uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, + const uint16_t fwd_offset, const uint16_t bck_offset, + const int16x8_t round_offset, uint8x8_t *d0_u8, uint8x8_t *d1_u8, + uint8x8_t *d2_u8, uint8x8_t *d3_u8) { + uint32x4_t blend0_lo = vmull_n_u16(vget_low_u16(dd0), fwd_offset); + blend0_lo = vmlal_n_u16(blend0_lo, vget_low_u16(d0), bck_offset); + uint32x4_t blend0_hi = vmull_n_u16(vget_high_u16(dd0), fwd_offset); + blend0_hi = vmlal_n_u16(blend0_hi, vget_high_u16(d0), bck_offset); + + uint32x4_t blend1_lo = vmull_n_u16(vget_low_u16(dd1), fwd_offset); + blend1_lo = vmlal_n_u16(blend1_lo, vget_low_u16(d1), bck_offset); + uint32x4_t blend1_hi = vmull_n_u16(vget_high_u16(dd1), fwd_offset); + blend1_hi = vmlal_n_u16(blend1_hi, vget_high_u16(d1), bck_offset); + + uint32x4_t blend2_lo = vmull_n_u16(vget_low_u16(dd2), fwd_offset); + blend2_lo = vmlal_n_u16(blend2_lo, vget_low_u16(d2), bck_offset); + uint32x4_t blend2_hi = vmull_n_u16(vget_high_u16(dd2), fwd_offset); + blend2_hi = vmlal_n_u16(blend2_hi, vget_high_u16(d2), bck_offset); + + uint32x4_t blend3_lo = vmull_n_u16(vget_low_u16(dd3), fwd_offset); + blend3_lo = vmlal_n_u16(blend3_lo, vget_low_u16(d3), bck_offset); + uint32x4_t blend3_hi = vmull_n_u16(vget_high_u16(dd3), fwd_offset); + blend3_hi = vmlal_n_u16(blend3_hi, vget_high_u16(d3), bck_offset); + + uint16x8_t avg0 = vcombine_u16(vshrn_n_u32(blend0_lo, DIST_PRECISION_BITS), + vshrn_n_u32(blend0_hi, DIST_PRECISION_BITS)); + uint16x8_t avg1 = vcombine_u16(vshrn_n_u32(blend1_lo, DIST_PRECISION_BITS), + vshrn_n_u32(blend1_hi, DIST_PRECISION_BITS)); + uint16x8_t avg2 = vcombine_u16(vshrn_n_u32(blend2_lo, DIST_PRECISION_BITS), + vshrn_n_u32(blend2_hi, DIST_PRECISION_BITS)); + uint16x8_t avg3 = vcombine_u16(vshrn_n_u32(blend3_lo, DIST_PRECISION_BITS), + vshrn_n_u32(blend3_hi, DIST_PRECISION_BITS)); + + int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); + int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset); + int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset); + int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset); + + *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); + *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS); + *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS); + *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS); +} + +static INLINE void compute_basic_avg_8x4(uint16x8_t dd0, uint16x8_t dd1, + uint16x8_t dd2, uint16x8_t dd3, + uint16x8_t d0, uint16x8_t d1, + uint16x8_t d2, uint16x8_t d3, + const int16x8_t round_offset, + uint8x8_t *d0_u8, uint8x8_t *d1_u8, + uint8x8_t *d2_u8, uint8x8_t *d3_u8) { + uint16x8_t avg0 = vhaddq_u16(dd0, d0); + uint16x8_t avg1 
= vhaddq_u16(dd1, d1); + uint16x8_t avg2 = vhaddq_u16(dd2, d2); + uint16x8_t avg3 = vhaddq_u16(dd3, d3); + + int16x8_t dst0 = vsubq_s16(vreinterpretq_s16_u16(avg0), round_offset); + int16x8_t dst1 = vsubq_s16(vreinterpretq_s16_u16(avg1), round_offset); + int16x8_t dst2 = vsubq_s16(vreinterpretq_s16_u16(avg2), round_offset); + int16x8_t dst3 = vsubq_s16(vreinterpretq_s16_u16(avg3), round_offset); + + *d0_u8 = vqrshrun_n_s16(dst0, FILTER_BITS - ROUND0_BITS); + *d1_u8 = vqrshrun_n_s16(dst1, FILTER_BITS - ROUND0_BITS); + *d2_u8 = vqrshrun_n_s16(dst2, FILTER_BITS - ROUND0_BITS); + *d3_u8 = vqrshrun_n_s16(dst3, FILTER_BITS - ROUND0_BITS); +} + +static INLINE uint16x4_t +convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter, const int32x4_t offset_const) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = offset_const; + // Filter values at indices 0 and 7 are 0. + sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t +convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, const int32x4_t offset_const) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = offset_const; + // Filter values at indices 0 and 7 are 0. 
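+  // (Editorial note: the accumulators are seeded with offset_const
+  // (1 << offset_bits) so the filtered value stays non-negative, which makes
+  // the unsigned saturating narrow in vqrshrun_n_s32 below safe. The offset is
+  // removed again when the two predictions are combined; see the round_offset
+  // subtraction in the averaging helpers above.)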
+ sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( + int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, + ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); + src_ptr += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s5, s6, s7, s8; + load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x4_t d1 = + convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x4_t d2 = + convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x4_t d3 = + convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + dst8_ptr += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s5 = vld1_s16(src_ptr); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01_u8; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01_u8); + + store_u8_4x1(dst8_ptr, d01_u8); + dst8_ptr += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + 
int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x8_t d1 = + convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x8_t d2 = + convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x8_t d3 = + convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vld1q_s16(s); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void dist_wtd_convolve_2d_vert_6tap_avg_neon( + int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, + ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); + src_ptr += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s5, s6, s7, s8; + load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x4_t d1 = + convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x4_t d2 = + convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x4_t d3 = + convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + dst8_ptr += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s5 = vld1_s16(src_ptr); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, 
y_filter, offset_const); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01_u8; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); + + store_u8_4x1(dst8_ptr, d01_u8); + dst8_ptr += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x8_t d1 = + convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x8_t d2 = + convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x8_t d3 = + convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vld1q_s16(s); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void dist_wtd_convolve_2d_vert_6tap_neon( + int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params, + const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); + src_ptr += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s5, s6, s7, s8; + load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x4_t d1 = + convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x4_t d2 = + convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x4_t d3 = + convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s5 = vld1_s16(src_ptr); + + uint16x4_t d0 = + convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + vst1_u16(dst_ptr, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + 
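// Note on the loop structure: when AOM_ARCH_AARCH64 is defined, the branches
// above load four new rows per iteration and produce four rows of output
// before sliding the source window, presumably because the wider unrolling
// benefits from AArch64's 32 vector registers (AArch32 NEON has only 16).
// This fallback path keeps the same sliding-window recurrence but advances a
// single row at a time, which is why h (or height) is decremented by 1 here
// and by 4 in the AArch64 branch.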
dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + uint16x8_t d1 = + convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_const); + uint16x8_t d2 = + convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_const); + uint16x8_t d3 = + convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_const); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vld1q_s16(s); + + uint16x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_const); + + vst1q_u16(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t +convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t y_filter, const int32x4_t offset_const) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = offset_const; + sum = vmlal_lane_s16(sum, s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t +convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t y_filter, const int32x4_t offset_const) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = offset_const; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + + int32x4_t sum1 = offset_const; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), 
y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( + int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, + ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s7, s8, s9, s10; + load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + offset_const); + uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + offset_const); + uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + dst8_ptr += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s7 = vld1_s16(src_ptr); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01_u8; + compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, + vget_low_s16(round_offset_vec), &d01_u8); + + store_u8_4x1(dst8_ptr, d01_u8); + dst8_ptr += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_const); + uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_const); + uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, 
s9, s10, + y_filter, offset_const); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vld1q_s16(s); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_dist_wtd_avg_8x1(dd0, d0, fwd_offset, bck_offset, + round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void dist_wtd_convolve_2d_vert_8tap_avg_neon( + int16_t *src_ptr, const int src_stride, uint8_t *dst8_ptr, int dst8_stride, + ConvolveParams *conv_params, const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s7, s8, s9, s10; + load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + offset_const); + uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + offset_const); + uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + dst8_ptr += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s7 = vld1_s16(src_ptr); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + + uint16x4_t dd0 = vld1_u16(dst_ptr); + + uint8x8_t d01_u8; + compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); + + store_u8_4x1(dst8_ptr, d01_u8); + dst8_ptr += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while 
(h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_const); + uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_const); + uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + d_u8 += 4 * dst8_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vld1q_s16(s); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + + uint16x8_t dd0 = vld1q_u16(d); + + uint8x8_t d0_u8; + compute_basic_avg_8x1(dd0, d0, round_offset_vec, &d0_u8); + + vst1_u8(d_u8, d0_u8); + d_u8 += dst8_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + dst8_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void dist_wtd_convolve_2d_vert_8tap_neon( + int16_t *src_ptr, const int src_stride, ConvolveParams *conv_params, + const int16x8_t y_filter, int h, int w) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s7, s8, s9, s10; + load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + uint16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + offset_const); + uint16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + offset_const); + uint16x4_t d3 = convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s7 = vld1_s16(src_ptr); + + uint16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + offset_const); + + vst1_u16(dst_ptr, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + int16_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int height = h; 
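The compute_basic_avg_* and compute_dist_wtd_avg_* helpers used by the *_avg functions above are declared elsewhere and are not shown in this excerpt. As a rough guide to what they compute per pixel, here is a hedged scalar sketch modelled on the AV1 reference convolve code; DIST_PRECISION_BITS == 4 and the other macro values are assumptions, and the NEON helpers use saturating, rounding narrowing instructions rather than this exact scalar arithmetic.

  #include <stdint.h>

  /* Assumed AV1 constants; not defined in this excerpt. */
  #define FILTER_BITS 7
  #define ROUND0_BITS 3
  #define COMPOUND_ROUND1_BITS 7
  #define DIST_PRECISION_BITS 4

  static uint8_t clip_pixel(int32_t v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  /* Combine the stored CONV_BUF value 'dd' (first prediction) with the
   * freshly convolved value 'd' (second prediction) and reduce the result
   * back to an 8-bit pixel. */
  static uint8_t compound_average_sketch(uint16_t dd, uint16_t d,
                                         int use_dist_wtd_comp_avg,
                                         uint16_t fwd_offset,
                                         uint16_t bck_offset) {
    const int bd = 8;
    const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
    const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                             (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
    const int round_bits = 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS;

    int32_t avg;
    if (use_dist_wtd_comp_avg) {
      /* Distance-weighted blend; fwd_offset + bck_offset == 1 << DIST_PRECISION_BITS. */
      avg = ((int32_t)dd * fwd_offset + (int32_t)d * bck_offset) >>
            DIST_PRECISION_BITS;
    } else {
      /* Plain average of the two predictions. */
      avg = ((int32_t)dd + (int32_t)d) >> 1;
    }
    /* Remove the compound offset, then round down to pixel precision. */
    avg -= round_offset;
    return clip_pixel((avg + (1 << (round_bits - 1))) >> round_bits);
  }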
+ + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + uint16x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_const); + uint16x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_const); + uint16x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_const); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vld1q_s16(s); + + uint16x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_const); + + vst1q_u16(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +#endif // AOM_AV1_COMMON_ARM_COMPOUND_CONVOLVE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c new file mode 100644 index 0000000000..3aeffbb0e6 --- /dev/null +++ b/third_party/aom/av1/common/arm/compound_convolve_neon_dotprod.c @@ -0,0 +1,675 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/arm/compound_convolve_neon.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, + const int8x8_t x_filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + // Accumulate dot product into 'correction' to account for range clamp. + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0); + + // We halved the convolution filter values so -1 from the right shift. 
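// Why the clamp and 'correction' work: the sdot instruction only accepts
// signed 8-bit inputs, so 128 was subtracted from every unsigned sample
// above. Part of the 'correction' accumulator supplied by the caller
// (correction_s32) pre-adds back exactly what that subtraction removed:
// with halved taps f/2,
//   sum(f/2 * (s - 128)) = sum(f/2 * s) - 128 * sum(f/2),
// and 128 * sum(f/2) == sum(f << (FILTER_BITS - 1)), which is what the
// caller computes via vaddlvq_s16(vshlq_n_s16(filter, FILTER_BITS - 1)).
// The caller also folds a bias of 1 << ((ROUND0_BITS - 1) - 1) into
// 'correction', so the plain truncating shift below rounds to nearest,
// since (x + (1 << (n - 1))) >> n is a round-to-nearest shift by n.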
+ return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t x_filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. */ + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0); + sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); + // Second 4 output values. + sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0); + sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); +} + +static INLINE void dist_wtd_convolve_2d_horiz_neon_dotprod( + const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, + const int16_t *x_filter_ptr, const int im_h, int w) { + const int bd = 8; + const int32_t horiz_const = (1 << (bd + FILTER_BITS - 2)); + // Dot product constants and other shims. + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // Fold horiz_const into the dot-product filter correction constant. The + // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- + // rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. (The extra -1 is needed because we halved the filter values.) + const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const + + (1 << ((ROUND0_BITS - 1) - 1))); + const uint8x16_t range_limit = vdupq_n_u8(128); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + + int16x4_t d0 = + convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit, + permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_dist_wtd_convolve_2d_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 
6 : y_filter_taps; + + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + dist_wtd_convolve_2d_horiz_neon_dotprod(src_ptr, src_stride, im_block, + im_stride, x_filter_ptr, im_h, w); + + if (clamped_y_taps == 6) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } else { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } +} + +static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples, + const int8x8_t x_filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + // Accumulate dot product into 'correction' to account for range clamp. + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, x_filter, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); +} + +static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples, + const int8x8_t x_filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. */ + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. 
+ sum[0] = vdotq_lane_s32(correction, permuted_samples[0], x_filter, 0); + sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); + // Second 4 output values. + sum[1] = vdotq_lane_s32(correction, permuted_samples[1], x_filter, 0); + sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + + // Dot-product constants and other shims. + const uint8x16_t range_limit = vdupq_n_u8(128); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // Fold round_offset into the dot-product filter correction constant. The + // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- + // rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. (The extra -1 is needed because we halved the filter values.) + int32x4_t correction = + vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
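// For blocks of width <= 4 the selected sub-pixel kernels are 4-tap,
// stored in the usual 8-tap array with zero outer taps. Only taps [2..5]
// are therefore loaded (padded with zeros to fill the vector) and the
// source pointer is advanced by 2 so the 4-tap window stays centred on the
// same pixels the full 8-tap filter would have used.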
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_x_avg_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + + // Dot-product constants and other shims. + const uint8x16_t range_limit = vdupq_n_u8(128); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // Fold round_offset into the dot-product filter correction constant. 
The + // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- + // rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. (The extra -1 is needed because we halved the filter values.) + int32x4_t correction = + vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_x_neon_dotprod( + const uint8_t *src, int src_stride, int w, int h, + const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + + // Dot-product constants and other shims. + const uint8x16_t range_limit = vdupq_n_u8(128); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // Fold round_offset into the dot-product filter correction constant. The + // additional shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non- + // rounding shifts - which are generally faster than rounding shifts on + // modern CPUs. (The extra -1 is needed because we halved the filter values.) + int32x4_t correction = + vdupq_n_s32(correction_s32 + (round_offset << (ROUND0_BITS - 1)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); + + store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_dist_wtd_convolve_x_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_x_dist_wtd_avg_neon_dotprod( + src, src_stride, dst8, dst8_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } else { + dist_wtd_convolve_x_avg_neon_dotprod(src, src_stride, dst8, dst8_stride, + w, h, filter_params_x, subpel_x_qn, + conv_params); + } + } else { + dist_wtd_convolve_x_neon_dotprod(src, src_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } +} diff --git a/third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c b/third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c new file mode 100644 index 0000000000..a72af9e36a --- /dev/null +++ b/third_party/aom/av1/common/arm/compound_convolve_neon_i8mm.c @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/arm/compound_convolve_neon.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, + const int8x8_t x_filter, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // First 4 output values. + int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, x_filter, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t x_filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], x_filter, 0); + sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); + // Second 4 output values. + sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], x_filter, 0); + sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); +} + +static INLINE void dist_wtd_convolve_2d_horiz_neon_i8mm( + const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, + const int16_t *x_filter_ptr, const int im_h, int w) { + const int bd = 8; + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
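// Unlike the *_neon_dotprod version of this file, the i8mm USDOT instruction
// (vusdotq_lane_s32) multiplies unsigned samples by signed filter taps
// directly, so no range clamp or correction constant is required; only the
// horizontal offset and the non-rounding-shift bias are folded into
// 'horiz_const' above.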
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + + int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_dist_wtd_convolve_2d_neon_i8mm( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 
6 : y_filter_taps; + + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + dist_wtd_convolve_2d_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride, + x_filter_ptr, im_h, w); + + if (clamped_y_taps == 6) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_6tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_6tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } else { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_2d_vert_8tap_dist_wtd_avg_neon( + im_block, im_stride, dst8, dst8_stride, conv_params, y_filter, h, + w); + } else { + dist_wtd_convolve_2d_vert_8tap_avg_neon(im_block, im_stride, dst8, + dst8_stride, conv_params, + y_filter, h, w); + } + } else { + dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, conv_params, + y_filter, h, w); + } + } +} + +static INLINE uint16x4_t convolve4_4_x(uint8x16_t samples, + const int8x8_t x_filter, + const uint8x16_t permute_tbl, + const int32x4_t round_offset) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // First 4 output values. + int32x4_t sum = vusdotq_lane_s32(round_offset, permuted_samples, x_filter, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vreinterpret_u16_s16(vshrn_n_s32(sum, ROUND0_BITS - 1)); +} + +static INLINE uint16x8_t convolve8_8_x(uint8x16_t samples, + const int8x8_t x_filter, + const uint8x16x3_t permute_tbl, + const int32x4_t round_offset) { + uint8x16_t permuted_samples[3]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_lane_s32(round_offset, permuted_samples[0], x_filter, 0); + sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], x_filter, 1); + // Second 4 output values. + sum[1] = vusdotq_lane_s32(round_offset, permuted_samples[1], x_filter, 0); + sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], x_filter, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. 
+ int16x8_t res = vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); + return vreinterpretq_u16_s16(res); +} + +static INLINE void dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t round_offset_shim = vdupq_n_s32( + (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); + + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_dist_wtd_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, + bck_offset, round_offset_vec, &d0_u8, &d1_u8, + &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_x_avg_neon_i8mm( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + const int16x8_t round_offset_vec = vdupq_n_s16(round_offset); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t round_offset_shim = vdupq_n_s32( + (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); + + // Horizontal filter. + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + uint8_t *dst8_ptr = dst8; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x4_t dd0, dd1, dd2, dd3; + load_u16_4x4(dst_ptr, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d01_u8, d23_u8; + compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d01_u8, &d23_u8); + + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + uint8_t *d_u8 = dst8_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + + uint16x8_t dd0, dd1, dd2, dd3; + load_u16_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + uint8x8_t d0_u8, d1_u8, d2_u8, d3_u8; + compute_basic_avg_8x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, + round_offset_vec, &d0_u8, &d1_u8, &d2_u8, &d3_u8); + + store_u8_8x4(d_u8, dst8_stride, d0_u8, d1_u8, d2_u8, d3_u8); + + s += 8; + d += 8; + d_u8 += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + dst8_ptr += 4 * dst8_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void dist_wtd_convolve_x_neon_i8mm( + const uint8_t *src, int src_stride, int w, int h, + const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS; + const int16_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int32x4_t round_offset_shim = vdupq_n_s32( + (round_offset << (ROUND0_BITS - 1)) + (1 << ((ROUND0_BITS - 1) - 1))); + + // Horizontal filter. 
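+  // This variant performs only the horizontal pass and writes the intermediate
+  // compound result to conv_params->dst; no averaging with a second prediction
+  // happens here.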
+ const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - horiz_offset; + CONV_BUF_TYPE *dst_ptr = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int height = h; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = + convolve4_4_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d1 = + convolve4_4_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d2 = + convolve4_4_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x4_t d3 = + convolve4_4_x(s3, x_filter, permute_tbl, round_offset_shim); + + store_u16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + CONV_BUF_TYPE *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x8_t d0 = + convolve8_8_x(s0, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d1 = + convolve8_8_x(s1, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d2 = + convolve8_8_x(s2, x_filter, permute_tbl, round_offset_shim); + uint16x8_t d3 = + convolve8_8_x(s3, x_filter, permute_tbl, round_offset_shim); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_dist_wtd_convolve_x_neon_i8mm( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + if (conv_params->do_average) { + if (UNLIKELY(conv_params->use_dist_wtd_comp_avg)) { + dist_wtd_convolve_x_dist_wtd_avg_neon_i8mm( + src, src_stride, dst8, dst8_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } else { + dist_wtd_convolve_x_avg_neon_i8mm(src, src_stride, dst8, dst8_stride, w, + h, filter_params_x, subpel_x_qn, + conv_params); + } + } else { + dist_wtd_convolve_x_neon_i8mm(src, src_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } +} diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c new file mode 100644 index 0000000000..10442f9bf9 --- /dev/null +++ b/third_party/aom/av1/common/arm/convolve_neon.c @@ -0,0 +1,1659 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/convolve_neon.h"
+
+static INLINE int16x4_t convolve12_4_x(const int16x4_t s0, const int16x4_t s1,
+                                       const int16x4_t s2, const int16x4_t s3,
+                                       const int16x4_t s4, const int16x4_t s5,
+                                       const int16x4_t s6, const int16x4_t s7,
+                                       const int16x4_t s8, const int16x4_t s9,
+                                       const int16x4_t s10, const int16x4_t s11,
+                                       const int16x8_t x_filter_0_7,
+                                       const int16x4_t x_filter_8_11,
+                                       const int32x4_t horiz_const) {
+  const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7);
+  const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7);
+
+  int32x4_t sum = horiz_const;
+  sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0);
+  sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1);
+  sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2);
+  sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3);
+  sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0);
+  sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1);
+  sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2);
+  sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3);
+  sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0);
+  sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1);
+  sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2);
+  sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3);
+
+  return vqrshrn_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE void convolve_x_sr_12tap_neon(const uint8_t *src_ptr,
+                                            int src_stride, uint8_t *dst_ptr,
+                                            const int dst_stride, int w, int h,
+                                            const int16_t *x_filter_ptr) {
+  const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
+  const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
+
+  // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right
+  // shift by FILTER_BITS - instead of a first rounding right shift by
+  // ROUND0_BITS, followed by a second rounding right shift by FILTER_BITS -
+  // ROUND0_BITS.
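+  // For example, with ROUND0_BITS = 3 and FILTER_BITS = 7 the shim adds 4 up
+  // front, so the single vqrshrn_n_s32(sum, FILTER_BITS) in convolve12_4_x
+  // gives the same result as rounding-shifting by 3 and then by 4 would.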
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); + +#if AOM_ARCH_AARCH64 + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + s += 11; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = + convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d1 = + convolve12_4_x(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d2 = + convolve12_4_x(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d3 = + convolve12_4_x(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_0_7, x_filter_8_11, horiz_const); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(d, dst_stride, d01); + store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4; + d += 4; + width -= 4; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + +#else // !AOM_ARCH_AARCH64 + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t t0 = vld1q_u8(s); + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + int16x8_t tt8 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + + int16x4_t s0 = vget_low_s16(tt0); + int16x4_t s4 = vget_high_s16(tt0); + int16x4_t s8 = vget_low_s16(tt8); + int16x4_t s12 = vget_high_s16(tt8); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8 + int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9 + int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10 + int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12 + int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13 + int16x4_t s11 = vext_s16(s8, 
s12, 3); // a11 a12 a13 a14 + + int16x4_t d0 = + convolve12_4_x(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + x_filter_0_7, x_filter_8_11, horiz_const); + + uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0))); + + store_u8_4x1(d, dd0); + + s += 4; + d += 4; + width -= 4; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); +#endif // AOM_ARCH_AARCH64 +} + +static INLINE uint8x8_t convolve4_4_x(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filter, + const int16x4_t horiz_const) { + int16x4_t sum = horiz_const; + sum = vmla_lane_s16(sum, s0, filter, 0); + sum = vmla_lane_s16(sum, s1, filter, 1); + sum = vmla_lane_s16(sum, s2, filter, 2); + sum = vmla_lane_s16(sum, s3, filter, 3); + + // We halved the convolution filter values so - 1 from the right shift. + return vqrshrun_n_s16(vcombine_s16(sum, vdup_n_s16(0)), FILTER_BITS - 1); +} + +static INLINE uint8x8_t convolve8_8_x(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, + const int16x8_t horiz_const) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + + // We halved the convolution filter values so - 1 from the right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + return; + } + + const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; + src -= horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single + // rounding right shift by FILTER_BITS - instead of a first rounding right + // shift by ROUND0_BITS, followed by second rounding right shift by + // FILTER_BITS - ROUND0_BITS. + // The outermost -1 is needed because we will halve the filter values. + const int16x8_t horiz_const = vdupq_n_s16(1 << ((ROUND0_BITS - 1) - 1)); + + if (w <= 4) { + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
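+    // The 4-tap kernel occupies positions 2..5 of the stored 8-tap filter
+    // array, hence the +2 on the filter pointer here and on src below.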
+ const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src += 2; + + do { + uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + uint8x8_t d0 = + convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const)); + + store_u8_4x1(dst, d0); + + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + // Filter values are even so halve to reduce precision requirements. + const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + while (h >= 8) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + int width = w; + const uint8_t *s = src + 7; + uint8_t *d = dst; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(d + 4 * dst_stride); + __builtin_prefetch(d + 5 * dst_stride); + __builtin_prefetch(d + 6 * dst_stride); + __builtin_prefetch(d + 7 * dst_stride); + + do { + uint8x8_t t8, t9, t10, t11, t12, t13, t14; + load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + + transpose_elems_inplace_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, + &t14); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); + + uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const); + uint8x8_t d1 = convolve8_8_x(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + horiz_const); + uint8x8_t d2 = convolve8_8_x(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + horiz_const); + uint8x8_t d3 = convolve8_8_x(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + horiz_const); + uint8x8_t d4 = convolve8_8_x(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, + horiz_const); + uint8x8_t d5 = convolve8_8_x(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, horiz_const); + uint8x8_t d6 = convolve8_8_x(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, horiz_const); + uint8x8_t d7 = convolve8_8_x(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); + + transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } +#endif // 
AOM_ARCH_AARCH64 + + while (h-- != 0) { + uint8x8_t t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + int width = w; + const uint8_t *s = src + 8; + uint8_t *d = dst; + + __builtin_prefetch(d); + + do { + uint8x8_t t8 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + uint8x8_t d0 = convolve8_8_x(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const); + + vst1_u8(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } + } +} + +static INLINE int16x4_t convolve6_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter_0_7) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + // Filter values at indices 0 and 7 are 0. + int16x4_t sum = vmul_lane_s16(s0, y_filter_0_3, 1); + sum = vmla_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmla_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmla_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmla_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmla_lane_s16(sum, s5, y_filter_4_7, 2); + + return sum; +} + +static INLINE uint8x8_t convolve6_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filters) { + const int16x4_t y_filter_lo = vget_low_s16(y_filters); + const int16x4_t y_filter_hi = vget_high_s16(y_filters); + + // Filter values at indices 0 and 7 are 0. + int16x8_t sum = vmulq_lane_s16(s0, y_filter_lo, 1); + sum = vmlaq_lane_s16(sum, s1, y_filter_lo, 2); + sum = vmlaq_lane_s16(sum, s2, y_filter_lo, 3); + sum = vmlaq_lane_s16(sum, s3, y_filter_hi, 0); + sum = vmlaq_lane_s16(sum, s4, y_filter_hi, 1); + sum = vmlaq_lane_s16(sum, s5, y_filter_hi, 2); + // We halved the convolution filter values so -1 from the right shift. 
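+  // vqrshrun_n_s16 rounds, narrows and saturates to uint8 in a single
+  // instruction; shifting by FILTER_BITS - 1 restores the scale lost when the
+  // taps were halved.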
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_y_sr_6tap_neon(const uint8_t *src_ptr, + int src_stride, uint8_t *dst_ptr, + const int dst_stride, int w, int h, + const int16x8_t y_filter) { + if (w <= 4) { + uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + + src_ptr += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); + uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); + uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); + + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + + int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter); + int16x4_t d1 = convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter); + int16x4_t d2 = convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter); + int16x4_t d3 = convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter); + + // We halved the convolution filter values so -1 from the right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + + int16x4_t d0 = convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter); + // We halved the convolution filter values so -1 from the right shift. 
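+      // Only the low four lanes of d01 are meaningful; the upper half is
+      // zero-padded so the 8-lane narrowing shift can be reused before the
+      // 4-byte store.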
+ uint8x8_t d01 = + vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8_4x1(dst_ptr, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4; + load_u8_8x5(s, src_stride, &t0, &t1, &t2, &t3, &t4); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t5, t6, t7, t8; + load_u8_8x4(s, src_stride, &t5, &t6, &t7, &t8); + + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + + uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter); + uint8x8_t d1 = convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter); + uint8x8_t d2 = convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter); + uint8x8_t d3 = convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint8x8_t d0 = convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter); + + vst1_u8(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE int16x4_t convolve8_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); + sum = vmla_lane_s16(sum, s1, filter_lo, 1); + sum = vmla_lane_s16(sum, s2, filter_lo, 2); + sum = vmla_lane_s16(sum, s3, filter_lo, 3); + sum = vmla_lane_s16(sum, s4, filter_hi, 0); + sum = vmla_lane_s16(sum, s5, filter_hi, 1); + sum = vmla_lane_s16(sum, s6, filter_hi, 2); + sum = vmla_lane_s16(sum, s7, filter_hi, 3); + + return sum; +} + +static INLINE uint8x8_t convolve8_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + + // We halved the convolution filter values so -1 from the right shift. 
+ return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_y_sr_8tap_neon(const uint8_t *src_ptr, + int src_stride, uint8_t *dst_ptr, + const int dst_stride, int w, int h, + const int16x8_t y_filter) { + if (w <= 4) { + uint8x8_t t0 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); + uint8x8_t t1 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); + uint8x8_t t2 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); + uint8x8_t t3 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); + uint8x8_t t4 = load_unaligned_u8_4x1(src_ptr + 4 * src_stride); + uint8x8_t t5 = load_unaligned_u8_4x1(src_ptr + 5 * src_stride); + uint8x8_t t6 = load_unaligned_u8_4x1(src_ptr + 6 * src_stride); + + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr + 0 * src_stride); + uint8x8_t t8 = load_unaligned_u8_4x1(src_ptr + 1 * src_stride); + uint8x8_t t9 = load_unaligned_u8_4x1(src_ptr + 2 * src_stride); + uint8x8_t t10 = load_unaligned_u8_4x1(src_ptr + 3 * src_stride); + + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + int16x4_t d1 = convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + int16x4_t d2 = convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + int16x4_t d3 = convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + // We halved the convolution filter values so -1 from the right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + uint8x8_t t7 = load_unaligned_u8_4x1(src_ptr); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + + int16x4_t d0 = convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + // We halved the convolution filter values so -1 from the right shift. 
+ uint8x8_t d01 = + vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8_4x1(dst_ptr, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + + uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + uint8x8_t d1 = convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + uint8x8_t d2 = convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + uint8x8_t d3 = convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + uint8x8_t d0 = convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + vst1_u8(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE int16x4_t convolve12_4_y(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x4_t s8, const int16x4_t s9, + const int16x4_t s10, const int16x4_t s11, + const int16x8_t y_filter_0_7, + const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + int16x4_t sum; + + sum = vmul_lane_s16(s0, y_filter_0_3, 0); + sum = vmla_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmla_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmla_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmla_lane_s16(sum, s4, y_filter_4_7, 0); + + sum = vmla_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmla_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmla_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmla_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmla_lane_s16(sum, s11, y_filter_8_11, 3); + + // Saturating addition is required for the largest filter taps to avoid + // overflow (while staying in 16-bit elements.) 
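+  // The products for the two centre taps (s5 and s6) are therefore added last
+  // with saturating adds, so any excursion beyond the int16 range saturates
+  // rather than wrapping.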
+ sum = vqadd_s16(sum, vmul_lane_s16(s5, y_filter_4_7, 1)); + sum = vqadd_s16(sum, vmul_lane_s16(s6, y_filter_4_7, 2)); + + return sum; +} + +static INLINE uint8x8_t convolve12_8_y(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t s8, const int16x8_t s9, + const int16x8_t s10, const int16x8_t s11, + const int16x8_t y_filter_0_7, + const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + int16x8_t sum; + + sum = vmulq_lane_s16(s0, y_filter_0_3, 0); + sum = vmlaq_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s4, y_filter_4_7, 0); + + sum = vmlaq_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmlaq_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmlaq_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmlaq_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmlaq_lane_s16(sum, s11, y_filter_8_11, 3); + + // Saturating addition is required for the largest filter taps to avoid + // overflow (while staying in 16-bit elements.) + sum = vqaddq_s16(sum, vmulq_lane_s16(s5, y_filter_4_7, 1)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s6, y_filter_4_7, 2)); + + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE void convolve_y_sr_12tap_neon(const uint8_t *src_ptr, + int src_stride, uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *y_filter_ptr) { + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + if (w <= 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10; + load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, + &t8, &t9, &t10); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10))); + + src_ptr += 11 * src_stride; + + do { + uint8x8_t t11, t12, t13, t14; + load_u8_8x4(src_ptr, src_stride, &t11, &t12, &t13, &t14); + + int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t11))); + int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t12))); + int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t13))); + int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t14))); + + int16x4_t d0 = convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, y_filter_0_7, y_filter_8_11); + int16x4_t d1 = convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, s12, y_filter_0_7, y_filter_8_11); + int16x4_t d2 = convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + s12, s13, y_filter_0_7, y_filter_8_11); + int16x4_t d3 = convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, s14, y_filter_0_7, y_filter_8_11); + + uint8x8_t d01 = 
vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + uint8_t *d = dst_ptr; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10; + load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, + &t9, &t10); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + + s += 11 * src_stride; + + do { + uint8x8_t t11, t12, t13, t14; + load_u8_8x4(s, src_stride, &t11, &t12, &t13, &t14); + + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14)); + + uint8x8_t d0 = convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, + s10, s11, y_filter_0_7, y_filter_8_11); + uint8x8_t d1 = convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, s12, y_filter_0_7, y_filter_8_11); + uint8x8_t d2 = convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + s12, s13, y_filter_0_7, y_filter_8_11); + uint8x8_t d3 = convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, s14, y_filter_0_7, y_filter_8_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + if (w == 2 || h == 2) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y, + subpel_y_qn); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int vert_offset = clamped_y_taps / 2 - 1; + + src -= vert_offset * src_stride; + + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (y_filter_taps > 8) { + convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr); + return; + } + + // Filter values are even so halve to reduce precision requirements. 
+ const int16x8_t y_filter = vshrq_n_s16(vld1q_s16(y_filter_ptr), 1); + + if (y_filter_taps < 8) { + convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); + } else { + convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, y_filter); + } +} + +static INLINE int16x4_t +convolve12_4_2d_h(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t x_filter_0_7, const int16x4_t x_filter_8_11, + const int32x4_t horiz_const) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum = horiz_const; + sum = vmlal_lane_s16(sum, s0, x_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, x_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, x_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, x_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, x_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, x_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, x_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, x_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s8, x_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s9, x_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s10, x_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s11, x_filter_8_11, 3); + + return vshrn_n_s32(sum, ROUND0_BITS); +} + +static INLINE void convolve_2d_sr_horiz_12tap_neon( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11) { + const int bd = 8; + // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts - + // which are generally faster than rounding shifts on modern CPUs. 
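+  // The 1 << (bd + FILTER_BITS - 1) term keeps the horizontal intermediates
+  // non-negative despite the filter's negative taps; the vertical pass removes
+  // this offset again (see the sub_const argument of the 2d_v kernels).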
+ const int32x4_t horiz_const = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + +#if AOM_ARCH_AARCH64 + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + s += 11; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s12 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s13 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s14 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t d0 = + convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d1 = + convolve12_4_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d2 = + convolve12_4_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_0_7, x_filter_8_11, horiz_const); + int16x4_t d3 = + convolve12_4_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_0_7, x_filter_8_11, horiz_const); + + transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3); + store_s16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4; + d += 4; + width -= 4; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); +#endif // AOM_ARCH_AARCH64 + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t t0 = vld1q_u8(s); + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + + int16x4_t s0 = vget_low_s16(tt0); + int16x4_t s4 = vget_high_s16(tt0); + int16x4_t s8 = vget_low_s16(tt1); + int16x4_t s12 = vget_high_s16(tt1); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + int16x4_t s5 = vext_s16(s4, s8, 1); // a5 a6 a7 a8 + int16x4_t s6 = vext_s16(s4, s8, 2); // a6 a7 a8 a9 + int16x4_t s7 = vext_s16(s4, s8, 3); // a7 a8 a9 a10 + int16x4_t s9 = vext_s16(s8, s12, 1); // a9 a10 a11 a12 + int16x4_t s10 = vext_s16(s8, s12, 2); // a10 a11 a12 a13 + int16x4_t s11 = vext_s16(s8, s12, 3); // a11 a12 a13 a14 + + int16x4_t d0 = + convolve12_4_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + 
x_filter_0_7, x_filter_8_11, horiz_const); + vst1_s16(d, d0); + + s += 4; + d += 4; + width -= 4; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); +} + +static INLINE int16x4_t convolve4_4_2d_h(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t filter, + const int16x4_t horiz_const) { + int16x4_t sum = horiz_const; + sum = vmla_lane_s16(sum, s0, filter, 0); + sum = vmla_lane_s16(sum, s1, filter, 1); + sum = vmla_lane_s16(sum, s2, filter, 2); + sum = vmla_lane_s16(sum, s3, filter, 3); + + // We halved the convolution filter values so -1 from the right shift. + return vshr_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, + const int16x8_t horiz_const) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = horiz_const; + sum = vmlaq_lane_s16(sum, s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s3, filter_lo, 3); + sum = vmlaq_lane_s16(sum, s4, filter_hi, 0); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + + // We halved the convolution filter values so -1 from the right shift. + return vshrq_n_s16(sum, ROUND0_BITS - 1); +} + +static INLINE void convolve_2d_sr_horiz_neon(const uint8_t *src, int src_stride, + int16_t *im_block, int im_stride, + int w, int im_h, + const int16_t *x_filter_ptr) { + const int bd = 8; + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w <= 4) { + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int16x4_t x_filter = vshr_n_s16(vld1_s16(x_filter_ptr + 2), 1); + + src_ptr += 2; + + do { + uint8x8_t t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + int16x4_t s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + int16x4_t s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + int16x4_t s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + + int16x4_t d0 = convolve4_4_2d_h(s0, s1, s2, s3, x_filter, horiz_const); + + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + // A shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // (The extra -1 is needed because we halved the filter values.) + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + // Filter values are even, so halve to reduce intermediate precision reqs. 
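+    // With halved taps the whole 8-tap accumulation, including the constant
+    // offset above, stays within signed 16-bit lanes, so no widening to 32
+    // bits is needed in convolve8_8_2d_h.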
+ const int16x8_t x_filter = vshrq_n_s16(vld1q_s16(x_filter_ptr), 1); + +#if AOM_ARCH_AARCH64 + while (height > 8) { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s += 7; + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, s2, s3, s4, s5, s6, s7, s8, + x_filter, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, s3, s4, s5, s6, s7, s8, s9, + x_filter, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, s4, s5, s6, s7, s8, s9, s10, + x_filter, horiz_const); + int16x8_t d4 = convolve8_8_2d_h(s4, s5, s6, s7, s8, s9, s10, s11, + x_filter, horiz_const); + int16x8_t d5 = convolve8_8_2d_h(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter, horiz_const); + int16x8_t d6 = convolve8_8_2d_h(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter, horiz_const); + int16x8_t d7 = convolve8_8_2d_h(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, horiz_const); + + transpose_elems_inplace_s16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_s16_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; + } +#endif // AOM_ARCH_AARCH64 + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + do { + uint8x8_t t1 = vld1_u8(s + 8); // a8 a9 a10 a11 a12 a13 a14 a15 + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + + int16x8_t s1 = vextq_s16(s0, s8, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + int16x8_t s2 = vextq_s16(s0, s8, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + int16x8_t s3 = vextq_s16(s0, s8, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + int16x8_t s4 = vextq_s16(s0, s8, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + int16x8_t s5 = vextq_s16(s0, s8, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + int16x8_t s6 = vextq_s16(s0, s8, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + int16x8_t s7 = vextq_s16(s0, s8, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + int16x8_t d0 = convolve8_8_2d_h(s0, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const); + + vst1q_s16(d, d0); + + s0 = s8; + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while 
(--height != 0); + } +} + +void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, im_stride, w, + im_h, x_filter_0_7, x_filter_8_11); + + convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_0_7, y_filter_8_11); + } else { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, w, im_h, + x_filter_ptr); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + if (clamped_y_taps <= 6) { + convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } else { + convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } + } +} + +void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(subpel_x_qn == 8); + assert(filter_params_x->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + if (w <= 4) { + do { + uint8x8_t s0_0 = vld1_u8(src); + uint8x8_t s0_1 = vld1_u8(src + 1); + uint8x8_t s1_0 = vld1_u8(src + src_stride); + uint8x8_t s1_1 = vld1_u8(src + src_stride + 1); + + uint8x8_t d0 = vrhadd_u8(s0_0, s0_1); + uint8x8_t d1 = vrhadd_u8(s1_0, s1_1); + + if (w == 2) { + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); + } else { + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); + } + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 8) { + do { + uint8x8_t s0_0 = vld1_u8(src); + uint8x8_t s0_1 = vld1_u8(src + 1); + uint8x8_t s1_0 = vld1_u8(src + src_stride); + uint8x8_t s1_1 = vld1_u8(src + src_stride + 1); + + uint8x8_t d0 = vrhadd_u8(s0_0, s0_1); + uint8x8_t d1 = vrhadd_u8(s1_0, s1_1); + + vst1_u8(dst, d0); + vst1_u8(dst + dst_stride, d1); + + src += 2 
* src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + do { + const uint8_t *src_ptr = src; + uint8_t *dst_ptr = dst; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t s1 = vld1q_u8(src_ptr + 1); + + uint8x16_t d0 = vrhaddq_u8(s0, s1); + + vst1q_u8(dst_ptr, d0); + + src_ptr += 16; + dst_ptr += 16; + width -= 16; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } +} + +void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + assert(subpel_y_qn == 8); + assert(filter_params_y->taps == 2); + (void)filter_params_y; + (void)subpel_y_qn; + + if (w <= 4) { + do { + uint8x8_t s0 = load_unaligned_u8_4x1(src); + uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride); + uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride); + + uint8x8_t d0 = vrhadd_u8(s0, s1); + uint8x8_t d1 = vrhadd_u8(s1, s2); + + if (w == 2) { + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); + } else { + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); + } + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else if (w == 8) { + do { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + src_stride); + uint8x8_t s2 = vld1_u8(src + 2 * src_stride); + + uint8x8_t d0 = vrhadd_u8(s0, s1); + uint8x8_t d1 = vrhadd_u8(s1, s2); + + vst1_u8(dst, d0); + vst1_u8(dst + dst_stride, d1); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + do { + const uint8_t *src_ptr = src; + uint8_t *dst_ptr = dst; + int height = h; + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t s1 = vld1q_u8(src_ptr + src_stride); + + uint8x16_t d0 = vrhaddq_u8(s0, s1); + + vst1q_u8(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + src += 16; + dst += 16; + w -= 16; + } while (w != 0); + } +} + +void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(subpel_x_qn == 8); + assert(subpel_y_qn == 8); + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)filter_params_y; + (void)subpel_y_qn; + (void)conv_params; + + uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + + uint16_t *im = im_block; + + // Horizontal filter. + if (w <= 4) { + do { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + 1); + + uint16x4_t sum = vget_low_u16(vaddl_u8(s0, s1)); + + // Safe to store the whole vector, the im buffer is big enough. 
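+      // Only w (at most 4) of these lanes are needed; the extra lanes spill
+      // into space that is either rewritten on the next row or lies within the
+      // oversized im_block allocation.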
+ vst1_u16(im, sum); + + src += src_stride; + im += im_stride; + } while (--im_h != 0); + } else { + do { + const uint8_t *src_ptr = src; + uint16_t *im_ptr = im; + int width = w; + + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + 1); + + uint16x8_t sum = vaddl_u8(s0, s1); + + vst1q_u16(im_ptr, sum); + + src_ptr += 8; + im_ptr += 8; + width -= 8; + } while (width != 0); + src += src_stride; + im += im_stride; + } while (--im_h != 0); + } + + im = im_block; + + // Vertical filter. + if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(im); + uint16x4_t s1 = vld1_u16(im + im_stride); + uint16x4_t s2 = vld1_u16(im + 2 * im_stride); + + uint16x4_t sum0 = vadd_u16(s0, s1); + uint16x4_t sum1 = vadd_u16(s1, s2); + + uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2); + uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2); + + if (w == 2) { + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); + } else { + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); + } + + im += 2 * im_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); + } else { + do { + uint16_t *im_ptr = im; + uint8_t *dst_ptr = dst; + int height = h; + + do { + uint16x8_t s0 = vld1q_u16(im_ptr); + uint16x8_t s1 = vld1q_u16(im_ptr + im_stride); + + uint16x8_t sum = vaddq_u16(s0, s1); + uint8x8_t d0 = vqrshrn_n_u16(sum, 2); + + vst1_u8(dst_ptr, d0); + + im_ptr += im_stride; + dst_ptr += dst_stride; + } while (--height != 0); + im += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h new file mode 100644 index 0000000000..9fbf8aa12f --- /dev/null +++ b/third_party/aom/av1/common/arm/convolve_neon.h @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ + +#include <arm_neon.h> + +#include "config/aom_config.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +static INLINE int32x4_t +convolve12_4_2d_v(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); + + return sum; +} + +static INLINE uint8x8_t +convolve12_8_2d_v(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, + const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, + const int16x8_t sub_const) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3); + + int16x8_t res = + vcombine_s16(vqrshrn_n_s32(sum0, 2 *
FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); + res = vsubq_s16(res, sub_const); + + return vqmovun_s16(res); +} + +static INLINE void convolve_2d_sr_vert_12tap_neon( + int16_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride, int w, + int h, const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int bd = 8; + const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); + + if (w <= 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_4x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, + &s8, &s9, &s10); + src_ptr += 11 * src_stride; + + do { + int16x4_t s11, s12, s13, s14; + load_s16_4x4(src_ptr, src_stride, &s11, &s12, &s13, &s14); + + int32x4_t d0 = convolve12_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, + s10, s11, y_filter_0_7, y_filter_8_11); + int32x4_t d1 = convolve12_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, s12, y_filter_0_7, y_filter_8_11); + int32x4_t d2 = convolve12_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + s12, s13, y_filter_0_7, y_filter_8_11); + int32x4_t d3 = + convolve12_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + y_filter_0_7, y_filter_8_11); + + int16x8_t dd01 = + vcombine_s16(vqrshrn_n_s32(d0, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(d1, 2 * FILTER_BITS - ROUND0_BITS)); + int16x8_t dd23 = + vcombine_s16(vqrshrn_n_s32(d2, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(d3, 2 * FILTER_BITS - ROUND0_BITS)); + + dd01 = vsubq_s16(dd01, sub_const); + dd23 = vsubq_s16(dd23, sub_const); + + uint8x8_t d01 = vqmovun_s16(dd01); + uint8x8_t d23 = vqmovun_s16(dd23); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h != 0); + + } else { + do { + int height = h; + int16_t *s = src_ptr; + uint8_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x8_t s11, s12, s13, s14; + load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint8x8_t d0 = + convolve12_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + y_filter_0_7, y_filter_8_11, sub_const); + uint8x8_t d1 = + convolve12_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + y_filter_0_7, y_filter_8_11, sub_const); + uint8x8_t d2 = + convolve12_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, y_filter_0_7, y_filter_8_11, sub_const); + uint8x8_t d3 = + convolve12_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, y_filter_0_7, y_filter_8_11, sub_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE int16x4_t convolve8_4_2d_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t y_filter) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = 
vget_high_s16(y_filter); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); + + return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); +} + +static INLINE uint8x8_t convolve8_8_2d_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t y_filter, + const int16x8_t sub_const) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3); + + int16x8_t res = + vcombine_s16(vqrshrn_n_s32(sum0, 2 * FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); + res = vsubq_s16(res, sub_const); + + return vqmovun_s16(res); +} + +static INLINE void convolve_2d_sr_vert_8tap_neon(int16_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16x8_t y_filter) { + const int bd = 8; + const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); + + if (w <= 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src_ptr += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s7, s8, s9, s10; + load_s16_4x4(src_ptr, src_stride, &s7, &s8, &s9, &s10); + + int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + int16x4_t d1 = convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + int16x4_t d2 = convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + int16x4_t d3 = + convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s7 = vld1_s16(src_ptr); + int16x4_t d0 = convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, 
y_filter); + uint8x8_t d01 = + vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); + + store_u8_4x1(dst_ptr, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + // Width is a multiple of 8 and height is a multiple of 4. + do { + int height = h; + int16_t *s = src_ptr; + uint8_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, sub_const); + uint8x8_t d1 = convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, sub_const); + uint8x8_t d2 = convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, sub_const); + uint8x8_t d3 = convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, sub_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s7 = vld1q_s16(s); + uint8x8_t d0 = convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, sub_const); + vst1_u8(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE int16x4_t convolve6_4_2d_v(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_lo, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_lo, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_hi, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_hi, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_hi, 2); + + return vqrshrn_n_s32(sum, 2 * FILTER_BITS - ROUND0_BITS); +} + +static INLINE uint8x8_t convolve6_8_2d_v(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, + const int16x8_t sub_const) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 2); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 2); + + int16x8_t res = + vcombine_s16(vqrshrn_n_s32(sum0, 2 * 
FILTER_BITS - ROUND0_BITS), + vqrshrn_n_s32(sum1, 2 * FILTER_BITS - ROUND0_BITS)); + res = vsubq_s16(res, sub_const); + + return vqmovun_s16(res); +} + +static INLINE void convolve_2d_sr_vert_6tap_neon(int16_t *src_ptr, + int src_stride, + uint8_t *dst_ptr, + int dst_stride, int w, int h, + const int16x8_t y_filter) { + const int bd = 8; + const int16x8_t sub_const = vdupq_n_s16(1 << (bd - 1)); + + if (w <= 4) { + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4); + src_ptr += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x4_t s5, s6, s7, s8; + load_s16_4x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + + int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter); + int16x4_t d1 = convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter); + int16x4_t d2 = convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter); + int16x4_t d3 = convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter); + + uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); + uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); + + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; +#else // !AOM_ARCH_AARCH64 + int16x4_t s5 = vld1_s16(src_ptr); + int16x4_t d0 = convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter); + uint8x8_t d01 = + vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); + + store_u8_4x1(dst_ptr, d01); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + src_ptr += src_stride; + dst_ptr += dst_stride; + h--; +#endif // AOM_ARCH_AARCH64 + } while (h != 0); + } else { + // Width is a multiple of 8 and height is a multiple of 4. + do { + int height = h; + int16_t *s = src_ptr; + uint8_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { +#if AOM_ARCH_AARCH64 + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint8x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const); + uint8x8_t d1 = + convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, y_filter, sub_const); + uint8x8_t d2 = + convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, y_filter, sub_const); + uint8x8_t d3 = + convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, y_filter, sub_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; +#else // !AOM_ARCH_AARCH64 + int16x8_t s5 = vld1q_s16(s); + uint8x8_t d0 = + convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, y_filter, sub_const); + vst1_u8(d, d0); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s += src_stride; + d += dst_stride; + height--; +#endif // AOM_ARCH_AARCH64 + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/convolve_neon_dotprod.c b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c new file mode 100644 index 0000000000..c29229eb09 --- /dev/null +++ b/third_party/aom/av1/common/arm/convolve_neon_dotprod.c @@ -0,0 +1,793 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/convolve_neon.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, + const int8x16_t filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0); + sum = vdotq_laneq_s32(sum, permuted_samples[1], filter, 1); + sum = vdotq_laneq_s32(sum, permuted_samples[2], filter, 2); + + return vqrshrn_n_s32(sum, FILTER_BITS); +} + +static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], + const int8x16_t filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples[2], permuted_samples[4]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit)); + clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]); + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filter, 0); + sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1); + sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2); + // Second 4 output values. 
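(The second four outputs start four samples further along, so they reuse permuted_samples[1] and permuted_samples[2] and only need permuted_samples[3], built from the second, 4-byte-offset load, to cover the last four taps of the shifted 12-tap window.)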
+ sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filter, 0); + sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1); + sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2); + + // Narrow and re-pack. + int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS), + vqrshrn_n_s32(sum[1], FILTER_BITS)); + return vqmovun_s16(sum_s16); +} + +static INLINE void convolve_x_sr_12tap_neon_dotprod( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const int16_t *x_filter_ptr) { + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); + const int8x16_t filter = + vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); + + const int32_t correction_s32 = + vaddvq_s32(vaddq_s32(vpaddlq_s16(vshlq_n_s16(filter_0_7, FILTER_BITS)), + vpaddlq_s16(vshlq_n_s16(filter_8_15, FILTER_BITS)))); + // A shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding right + // shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. + int32x4_t correction = vdupq_n_s32(correction_s32 + (1 << (ROUND0_BITS - 1))); + const uint8x16_t range_limit = vdupq_n_u8(128); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + // Special case the following no-op filter as 128 won't fit into the + // 8-bit signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (vgetq_lane_s16(filter_0_7, 5) == 128) { + // Undo the horizontal offset in the calling function. + src += 5; + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x8_t d0 = vld1_u8(s); + if (w == 4) { + store_u8_4x1(d, d0); + } else { + vst1_u8(d, d0); + } + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve12_4_x(s0, filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve12_4_x(s1, filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve12_4_x(s2, filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve12_4_x(s3, filter, correction, range_limit, permute_tbl); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = + convolve12_8_x(s0, filter, correction, range_limit, permute_tbl); + uint8x8_t d1 = + convolve12_8_x(s1, filter, correction, range_limit, permute_tbl); + uint8x8_t d2 = + convolve12_8_x(s2, filter, correction, range_limit, permute_tbl); + uint8x8_t d3 = + convolve12_8_x(s3, filter, correction, range_limit, permute_tbl); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + 
dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } + } +} + +static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + // Accumulate dot product into 'correction' to account for range clamp. + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filter, 0); + + // Packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. */ + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filter, 0); + sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filter, 1); + // Second 4 output values. + sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filter, 0); + sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filter, 1); + + // Narrow and re-pack. + int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1])); + // We halved the convolution filter values so - 1 from the right shift. + return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); +} + +void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + return; + } + + const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; + src -= horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + // Dot product constants. + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + // This shim of (1 << ((ROUND0_BITS - 1) - 1) enables us to use a single + // rounding right shift by FILTER_BITS - instead of a first rounding right + // shift by ROUND0_BITS, followed by second rounding right shift by + // FILTER_BITS - ROUND0_BITS. + // The outermost -1 is needed because we will halve the filter values. 
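For reference, with FILTER_BITS == 7 and ROUND0_BITS == 3 the two formulations are bit-identical (given arithmetic right shifts); a minimal scalar sketch, using a hypothetical round_shift helper rather than anything from the library:

  static int round_shift(int v, int bits) { return (v + (1 << (bits - 1))) >> bits; }
  // Generic two-stage rounding:
  //   round_shift(round_shift(sum, ROUND0_BITS), FILTER_BITS - ROUND0_BITS)
  // Single-stage rounding with the shim folded into the accumulator:
  //   round_shift(sum + (1 << (ROUND0_BITS - 1)), FILTER_BITS)
  // Both evaluate to (sum + (1 << 2) + (1 << 6)) >> 7 for any integer sum.
  // Halving the filter taps halves sum as well, so this path shifts by
  // FILTER_BITS - 1 == 6 instead and pre-adds 1 << ((ROUND0_BITS - 1) - 1) == 2.

The correction_s32 term above is the other half of the accumulator seed: the samples are biased by -128 to fit the signed dot product, and 128 * (sum of the halved taps), which is what vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)) evaluates to, adds that bias back.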
+ const int32x4_t correction = + vdupq_n_s32(correction_s32 + (1 << ((ROUND0_BITS - 1) - 1))); + const uint8x16_t range_limit = vdupq_n_u8(128); + + if (w <= 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve4_4_x(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve4_4_x(s1, x_filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve4_4_x(s2, x_filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve4_4_x(s3, x_filter, correction, range_limit, permute_tbl); + + // We halved the convolution filter values so - 1 from the right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + int width = w; + const uint8_t *s = src; + uint8_t *d = dst; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = + convolve8_8_x(s0, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d1 = + convolve8_8_x(s1, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d2 = + convolve8_8_x(s2, x_filter, correction, range_limit, permute_tbl); + uint8x8_t d3 = + convolve8_8_x(s3, x_filter, correction, range_limit, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, + const int8x16_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0); + sum = vdotq_laneq_s32(sum, permuted_samples[1], filters, 1); + sum = vdotq_laneq_s32(sum, permuted_samples[2], filters, 2); + + // Narrow and re-pack. 
+ return vshrn_n_s32(sum, ROUND0_BITS); +} + +static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], + const int8x16_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples[2], permuted_samples[4]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples[0] = vreinterpretq_s8_u8(vsubq_u8(samples[0], range_limit)); + clamped_samples[1] = vreinterpretq_s8_u8(vsubq_u8(samples[1], range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples[0], permute_tbl.val[2]); + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + permuted_samples[3] = vqtbl1q_s8(clamped_samples[1], permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_laneq_s32(correction, permuted_samples[0], filters, 0); + sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); + sum[0] = vdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); + // Second 4 output values. + sum[1] = vdotq_laneq_s32(correction, permuted_samples[1], filters, 0); + sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); + sum[1] = vdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); + + // Narrow and re-pack. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), + vshrn_n_s32(sum[1], ROUND0_BITS)); +} + +static INLINE void convolve_2d_sr_horiz_12tap_neon_dotprod( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11) { + const int bd = 8; + + // Special case the following no-op filter as 128 won't fit into the 8-bit + // signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { + const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); + // Undo the horizontal offset in the calling function. + src_ptr += 5; + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t s0 = vld1_u8(s); + uint16x8_t d0 = vaddw_u8(horiz_const, s0); + d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); + // Store 8 elements to avoid additional branches. This is safe if the + // actual block width is < 8 because the intermediate buffer is large + // enough to accommodate 128x128 blocks. + vst1q_s16(d, vreinterpretq_s16_u16(d0)); + + d += 8; + s += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + // Narrow filter values to 8-bit. + const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + // Dot product constants. 
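Two things are folded into the seed here. The 1 << (bd + FILTER_BITS - 1) term keeps the int16 intermediate data positive: after the horizontal shift by ROUND0_BITS it sits at 1 << 11 in im_block, and the vertical filter gain of 1 << FILTER_BITS followed by the shift by 2 * FILTER_BITS - ROUND0_BITS reduces it to exactly 1 << (bd - 1) == 128, the sub_const that convolve_2d_sr_vert_12tap_neon subtracts again. The correction defined next adds back 128 * (sum of the taps), the amount removed by biasing the samples from [0, 255] down to [-128, 127] for the signed dot product.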
+ const int32x4_t correct_tmp = + vaddq_s32(vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[0], 7)), + vpaddlq_s16(vshlq_n_s16(x_filter_s16.val[1], 7))); + const int32x4_t correction = + vdupq_n_s32(vaddvq_s32(correct_tmp) + horiz_const); + const uint8x16_t range_limit = vdupq_n_u8(128); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, range_limit, + permute_tbl); + int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, range_limit, + permute_tbl); + int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, range_limit, + permute_tbl); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, + range_limit, permute_tbl); + int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, + range_limit, permute_tbl); + int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, + range_limit, permute_tbl); + int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, + range_limit, permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, + range_limit, permute_tbl); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } + } +} + +static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16_t permute_tbl) { + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t clamped_samples = + vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t permuted_samples = vqtbl1q_s8(clamped_samples, permute_tbl); + + // Accumulate dot product into 'correction' to account for range clamp. + int32x4_t sum = vdotq_lane_s32(correction, permuted_samples, filters, 0); + + // We halved the convolution filter values so -1 from the right shift. 
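A plain, non-rounding vshrn is enough here because the caller folds the matching rounding offset, 1 << ((ROUND0_BITS - 1) - 1) == 2 at the halved-filter scale, into 'correction' along with the range-clamp compensation; with ROUND0_BITS == 3 the narrowing shift below is by 2.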
+ return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t filters, + const int32x4_t correction, + const uint8x16_t range_limit, + const uint8x16x3_t permute_tbl) { + int8x16_t clamped_samples, permuted_samples[3]; + int32x4_t sum[2]; + + // Clamp sample range to [-128, 127] for 8-bit signed dot product. + clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit)); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]); + + // Accumulate dot product into 'correction' to account for range clamp. + // First 4 output values. + sum[0] = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); + sum[0] = vdotq_lane_s32(sum[0], permuted_samples[1], filters, 1); + // Second 4 output values. + sum[1] = vdotq_lane_s32(correction, permuted_samples[1], filters, 0); + sum[1] = vdotq_lane_s32(sum[1], permuted_samples[2], filters, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); +} + +static INLINE void convolve_2d_sr_horiz_neon_dotprod( + const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, + int im_h, const int16_t *x_filter_ptr) { + const int bd = 8; + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The outermost -1 is needed because we halved the filter values. + const int32_t horiz_const = + ((1 << (bd + FILTER_BITS - 2)) + (1 << ((ROUND0_BITS - 1) - 1))); + // Dot product constants. + const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr); + const int32_t correction_s32 = + vaddlvq_s16(vshlq_n_s16(x_filter_s16, FILTER_BITS - 1)); + const int32x4_t correction = vdupq_n_s32(correction_s32 + horiz_const); + const uint8x16_t range_limit = vdupq_n_u8(128); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w <= 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. 
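The kernels used for these block sizes only populate the middle four of the eight coefficient slots, which is why just x_filter_ptr + 2 is loaded below; and since every coefficient is even, shifting right by one keeps the values exact in int8, the lost factor of two being recovered by shortening the later right shifts by one.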
+ const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + int16x4_t d1 = + convolve4_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl); + int16x4_t d2 = + convolve4_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl); + int16x4_t d3 = + convolve4_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = + convolve4_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, range_limit, + permute_tbl); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, range_limit, + permute_tbl); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, range_limit, + permute_tbl); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 
6 : y_filter_taps; + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + convolve_2d_sr_horiz_12tap_neon_dotprod(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_0_7, + x_filter_8_11); + + convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_0_7, y_filter_8_11); + } else { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + convolve_2d_sr_horiz_neon_dotprod(src_ptr, src_stride, im_block, im_stride, + w, im_h, x_filter_ptr); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + if (clamped_y_taps <= 6) { + convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } else { + convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } + } +} diff --git a/third_party/aom/av1/common/arm/convolve_neon_i8mm.c b/third_party/aom/av1/common/arm/convolve_neon_i8mm.c new file mode 100644 index 0000000000..bbcd6f201a --- /dev/null +++ b/third_party/aom/av1/common/arm/convolve_neon_i8mm.c @@ -0,0 +1,702 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/convolve_neon.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x4_t convolve12_4_x(uint8x16_t samples, + const int8x16_t filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum; + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0); + sum = vusdotq_laneq_s32(sum, permuted_samples[1], filter, 1); + sum = vusdotq_laneq_s32(sum, permuted_samples[2], filter, 2); + + return vqrshrn_n_s32(sum, FILTER_BITS); +} + +static INLINE uint8x8_t convolve12_8_x(uint8x16_t samples[2], + const int8x16_t filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[4]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filter, 0); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filter, 1); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filter, 2); + // Second 4 output values. + sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filter, 0); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filter, 1); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filter, 2); + + // Narrow and re-pack. + int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum[0], FILTER_BITS), + vqrshrn_n_s32(sum[1], FILTER_BITS)); + return vqmovun_s16(sum_s16); +} + +static INLINE void convolve_x_sr_12tap_neon_i8mm(const uint8_t *src, + int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filter_ptr) { + const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0)); + const int8x16_t filter = + vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15)); + + // Special case the following no-op filter as 128 won't fit into the + // 8-bit signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (vgetq_lane_s16(filter_0_7, 5) == 128) { + // Undo the horizontal offset in the calling function. + src += 5; + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x8_t d0 = vld1_u8(s); + if (w == 4) { + store_u8_4x1(d, d0); + } else { + vst1_u8(d, d0); + } + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding + // right shift by FILTER_BITS - instead of a first rounding right shift by + // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS - + // ROUND0_BITS. 
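Unlike the sdot version, the usdot instruction multiplies unsigned samples by signed coefficients directly, so no -128 bias or compensating constant is needed; horiz_const only carries the rounding shim, 1 << 2 == 4 with ROUND0_BITS == 3. The 12-tap coefficients are also kept at full scale here, which is why convolve12_4_x and convolve12_8_x shift by the full FILTER_BITS.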
+ const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1)); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const); + + uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); + uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + dst += 4 * dst_stride; + src += 4 * src_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const); + + store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } + } +} + +static INLINE int16x4_t convolve4_4_x(uint8x16_t samples, const int8x8_t filter, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // First 4 output values. + int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filter, 0); + + // Packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filter, 0); + sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filter, 1); + // Second 4 output values. + sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filter, 0); + sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filter, 1); + + int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum[0]), vmovn_s32(sum[1])); + // We halved the convolution filter values so - 1 from the right shift. 
+ return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1); +} + +void av1_convolve_x_sr_neon_i8mm(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + return; + } + + const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; + src -= horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + convolve_x_sr_12tap_neon_i8mm(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr); + return; + } + + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use a single + // rounding right shift by FILTER_BITS - instead of a first rounding right + // shift by ROUND0_BITS, followed by second rounding right shift by + // FILTER_BITS - ROUND0_BITS. + // The outermost -1 is needed because we will halve the filter values. + const int32x4_t horiz_const = vdupq_n_s32(1 << ((ROUND0_BITS - 1) - 1)); + + if (w <= 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_x(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve4_4_x(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve4_4_x(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve4_4_x(s3, x_filter, permute_tbl, horiz_const); + + // We halved the convolution filter values so - 1 from the right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl, horiz_const); + uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl, horiz_const); + uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl, horiz_const); + uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl, horiz_const); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE int16x4_t convolve12_4_2d_h(uint8x16_t samples, + const int8x16_t filters, + const uint8x16x3_t permute_tbl, + int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum; + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); + sum = vusdotq_laneq_s32(sum, permuted_samples[1], filters, 1); + sum = vusdotq_laneq_s32(sum, permuted_samples[2], filters, 2); + + // Narrow and re-pack. + return vshrn_n_s32(sum, ROUND0_BITS); +} + +static INLINE int16x8_t convolve12_8_2d_h(uint8x16_t samples[2], + const int8x16_t filters, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[4]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples[0], permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples[0], permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples[0], permute_tbl.val[2]); + // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + permuted_samples[3] = vqtbl1q_u8(samples[1], permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_laneq_s32(horiz_const, permuted_samples[0], filters, 0); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[1], filters, 1); + sum[0] = vusdotq_laneq_s32(sum[0], permuted_samples[2], filters, 2); + // Second 4 output values. + sum[1] = vusdotq_laneq_s32(horiz_const, permuted_samples[1], filters, 0); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[2], filters, 1); + sum[1] = vusdotq_laneq_s32(sum[1], permuted_samples[3], filters, 2); + + // Narrow and re-pack. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS), + vshrn_n_s32(sum[1], ROUND0_BITS)); +} + +static INLINE void convolve_2d_sr_horiz_12tap_neon_i8mm( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int w, int h, const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11) { + const int bd = 8; + + // Special case the following no-op filter as 128 won't fit into the + // 8-bit signed dot-product instruction: + // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 } + if (vgetq_lane_s16(x_filter_0_7, 5) == 128) { + const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1))); + // Undo the horizontal offset in the calling function. + src_ptr += 5; + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t s0 = vld1_u8(s); + uint16x8_t d0 = vaddw_u8(horiz_const, s0); + d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS); + // Store 8 elements to avoid additional branches. This is safe if the + // actual block width is < 8 because the intermediate buffer is large + // enough to accommodate 128x128 blocks. + vst1q_s16(d, vreinterpretq_s16_u16(d0)); + + d += 8; + s += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + // Narrow filter values to 8-bit. 
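/* Narrowing is safe in this branch: apart from the no-op kernel
 * special-cased above (whose only non-zero tap is 128), the 12-tap kernel
 * taps fit in int8, so the vmovn_s16 below loses nothing. A rough scalar
 * sketch of one intermediate value produced here, with bd == 8 as set above
 * (x_filter, src and im are illustrative names):
 *   int32_t sum = (1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1));
 *   for (int k = 0; k < 12; ++k) sum += x_filter[k] * src[k];
 *   im[x] = (int16_t)(sum >> ROUND0_BITS);  // truncating shift, pre-biased
 */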
+ const int16x8x2_t x_filter_s16 = { + { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) } + }; + const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]), + vmovn_s16(x_filter_s16.val[1])); + // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts + // - which are generally faster than rounding shifts on modern CPUs. + const int32x4_t horiz_const = + vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1))); + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + if (w <= 4) { + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = + convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = + convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = + convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = + convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = + convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + + } else { + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = + convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = + convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = + convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0[2]; + s0[0] = vld1q_u8(s); + s0[1] = vld1q_u8(s + 4); + int16x8_t d0 = + convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } + } +} + +static INLINE int16x4_t convolve4_4_2d_h(uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl, + const int32x4_t horiz_const) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + // First 4 output values. + int32x4_t sum = vusdotq_lane_s32(horiz_const, permuted_samples, filters, 0); + + // We halved the convolution filter values so -1 from the right shift. + return vshrn_n_s32(sum, ROUND0_BITS - 1); +} + +static INLINE int16x8_t convolve8_8_2d_h(uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl, + const int32x4_t horiz_const) { + uint8x16_t permuted_samples[3]; + int32x4_t sum[2]; + + // Permute samples ready for dot product. 
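/* vqtbl1q_u8 gathers four overlapping 4-byte windows of the source into
 * each 128-bit register, and every vusdotq_lane_s32 call multiplies those
 * unsigned bytes by one 4-tap lane of the signed filter and accumulates per
 * 4-byte group, so lane j of the result roughly gains
 * sum(sample[j + k] * filter[k], k = 0..3). Two lane selections therefore
 * cover an 8-tap kernel and three cover the 12-tap helpers above. */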
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]); + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]); + + // First 4 output values. + sum[0] = vusdotq_lane_s32(horiz_const, permuted_samples[0], filters, 0); + sum[0] = vusdotq_lane_s32(sum[0], permuted_samples[1], filters, 1); + // Second 4 output values. + sum[1] = vusdotq_lane_s32(horiz_const, permuted_samples[1], filters, 0); + sum[1] = vusdotq_lane_s32(sum[1], permuted_samples[2], filters, 1); + + // Narrow and re-pack. + // We halved the convolution filter values so -1 from the right shift. + return vcombine_s16(vshrn_n_s32(sum[0], ROUND0_BITS - 1), + vshrn_n_s32(sum[1], ROUND0_BITS - 1)); +} + +static INLINE void convolve_2d_sr_horiz_neon_i8mm( + const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w, + int im_h, const int16_t *x_filter_ptr) { + const int bd = 8; + // This shim of 1 << ((ROUND0_BITS - 1) - 1) enables us to use non-rounding + // shifts - which are generally faster than rounding shifts on modern CPUs. + // The outermost -1 is needed because we halved the filter values. + const int32x4_t horiz_const = vdupq_n_s32((1 << (bd + FILTER_BITS - 2)) + + (1 << ((ROUND0_BITS - 1) - 1))); + + const uint8_t *src_ptr = src; + int16_t *dst_ptr = im_block; + int dst_stride = im_stride; + int height = im_h; + + if (w <= 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + // 4-tap filters are used for blocks having width <= 4. + // Filter values are even, so halve to reduce intermediate precision reqs. + const int8x8_t x_filter = + vshrn_n_s16(vcombine_s16(vld1_s16(x_filter_ptr + 2), vdup_n_s16(0)), 1); + + src_ptr += 2; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x4_t d1 = convolve4_4_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x4_t d2 = convolve4_4_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x4_t d3 = convolve4_4_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + int16x4_t d0 = convolve4_4_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1_s16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + // Filter values are even, so halve to reduce intermediate precision reqs. 
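/* Halving is exact because every tap of these kernels is even (they sum to
 * 1 << FILTER_BITS == 128), and it keeps the horizontal-stage results small
 * enough to be stored as int16_t for the vertical pass. The
 * 1 << ((ROUND0_BITS - 1) - 1) term already folded into horiz_const is the
 * rounding bias: with ROUND0_BITS == 3 and halved taps, adding 1 << 1 before
 * the truncating vshrn_n_s32(sum, ROUND0_BITS - 1) reproduces
 * ROUND_POWER_OF_TWO(sum, ROUND0_BITS - 1). */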
+ const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, permute_tbl, horiz_const); + int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, permute_tbl, horiz_const); + int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, permute_tbl, horiz_const); + + store_s16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + const uint8_t *s = src_ptr; + int16_t *d = dst_ptr; + int width = w; + + do { + uint8x16_t s0 = vld1q_u8(s); + int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, permute_tbl, horiz_const); + vst1q_s16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_convolve_2d_sr_neon_i8mm(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + if (w == 2 || h == 2) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (filter_params_x->taps > 8) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + convolve_2d_sr_horiz_12tap_neon_i8mm(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_0_7, + x_filter_8_11); + + convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_0_7, y_filter_8_11); + } else { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + + convolve_2d_sr_horiz_neon_i8mm(src_ptr, src_stride, im_block, im_stride, w, + im_h, x_filter_ptr); + + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + if (clamped_y_taps <= 6) { + convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } else { + convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter); + } + } +} diff --git a/third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c new file mode 100644 index 0000000000..fc03a2ee04 --- /dev/null +++ 
b/third_party/aom/av1/common/arm/highbd_compound_convolve_neon.c @@ -0,0 +1,2031 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/arm/highbd_convolve_neon.h" + +#define ROUND_SHIFT 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + +static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr, + int src_stride, uint16_t *dst_ptr, + int dst_stride, int w, int h, + ConvolveParams *conv_params, + const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint16x4_t offset_vec = vdup_n_u16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t d0_lo = + vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + + uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2), + vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, + uint16_t *dst_ptr, int dst_stride, + int w, int h, + ConvolveParams *conv_params, + const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint16x4_t offset_vec = vdup_n_u16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const 
uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t d0_lo = + vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + + uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT), + vqrshrun_n_s32(d0_hi, ROUND_SHIFT)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_12_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + + uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2), + vqrshrun_n_s32(d1, ROUND_SHIFT - 2)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w == 4) { + do { + const uint16x4_t src = 
vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + + uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT), + vqrshrun_n_s32(d1, ROUND_SHIFT)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +} + +static INLINE uint16x4_t highbd_12_convolve6_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); + + return vqshrun_n_s32(sum, ROUND0_BITS + 2); +} + +static INLINE uint16x4_t +highbd_convolve6_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 2); + + return vqshrun_n_s32(sum, ROUND0_BITS); +} + +static INLINE uint16x8_t highbd_12_convolve6_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. 
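/* The 6-tap kernels sit in the middle of the 8-tap filter array with taps 0
 * and 7 equal to zero, which is why the multiply-accumulate chain below
 * starts at lane 1 and why the callers advance the source pointer by one
 * sample (or one row for the vertical paths). A rough scalar sketch of one
 * 12-bit compound output (saturate_u16 is an illustrative name for the
 * saturating narrow performed by vqshrun_n_s32):
 *   int32_t sum = offset;
 *   for (int k = 1; k <= 6; ++k) sum += filter[k] * s[k - 1];
 *   out = saturate_u16(sum >> (ROUND0_BITS + 2));
 */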
+ const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), + vqshrun_n_s32(sum1, ROUND0_BITS + 2)); +} + +static INLINE uint16x8_t +highbd_convolve6_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 2); + + return vcombine_u16(vqshrun_n_s32(sum0, 3), vqshrun_n_s32(sum1, ROUND0_BITS)); +} + +static INLINE void highbd_12_dist_wtd_convolve_x_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + const int32x4_t offset_vec = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], x_filter, offset_vec); + uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], x_filter, offset_vec); + uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], x_filter, offset_vec); + 
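/* The values stored here remain in the intermediate compound
 * (CONV_BUF_TYPE) domain: offset and shifted only by the first-stage
 * round_0. Converting back to pixels, including the optional
 * distance-weighted blend, is left to the highbd_*comp_avg_neon helpers or
 * to the later compound prediction stage. */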
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +static INLINE void highbd_dist_wtd_convolve_x_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + const int32x4_t offset_vec = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], x_filter, offset_vec); + uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], x_filter, offset_vec); + uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +static INLINE uint16x4_t highbd_12_convolve8_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filter, + const int32x4_t offset) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + return vqshrun_n_s32(sum, ROUND0_BITS + 2); +} + +static INLINE uint16x4_t +highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter, const int32x4_t offset) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, filter_4_7, 3); + + return vqshrun_n_s32(sum, ROUND0_BITS); +} + +static INLINE uint16x8_t highbd_12_convolve8_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t 
filter, + const int32x4_t offset) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS + 2), + vqshrun_n_s32(sum1, ROUND0_BITS + 2)); +} + +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter, const int32x4_t offset) { + const int16x4_t filter_0_3 = vget_low_s16(filter); + const int16x4_t filter_4_7 = vget_high_s16(filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filter_4_7, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filter_4_7, 3); + + return vcombine_u16(vqshrun_n_s32(sum0, ROUND0_BITS), + vqshrun_n_s32(sum1, ROUND0_BITS)); +} + +static INLINE uint16x4_t highbd_12_convolve4_4_x(const int16x4_t s[4], + const int16x4_t x_filter, + const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter, 3); + + return vqshrun_n_s32(sum, 5); +} + +static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4], + const int16x4_t x_filter, + const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter, 3); + + return 
vqshrun_n_s32(sum, ROUND0_BITS); +} + +static INLINE void highbd_12_dist_wtd_convolve_x_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4. + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 2); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec); + uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], + s0[6], s0[7], x_filter, offset_vec); + uint16x8_t d1 = + highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], + s1[6], s1[7], x_filter, offset_vec); + uint16x8_t d2 = + highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], + s2[6], s2[7], x_filter, offset_vec); + uint16x8_t d3 = + highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], + s3[6], s3[7], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE void highbd_dist_wtd_convolve_x_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4. 
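/* The 4-tap sub-kernel occupies taps 2..5 of the 8-tap filter array, so both
 * the filter pointer and the source pointer are advanced by 2 below; this
 * keeps the 4-tap output centred on the same pixels the full 8-tap kernel
 * would have produced. */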
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 2); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); + uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec); + uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec); + uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], + s0[7], x_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], + s1[7], x_filter, offset_vec); + uint16x8_t d2 = + highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], + s2[7], x_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], + s3[7], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_highbd_dist_wtd_convolve_x_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + int dst16_stride = conv_params->dst_stride; + const int im_stride = MAX_SB_SIZE; + const int horiz_offset = filter_params_x->taps / 2 - 1; + assert(FILTER_BITS == COMPOUND_ROUND1_BITS); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int offset_avg = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int offset_convolve = (1 << (conv_params->round_0 - 1)) + + (1 << (bd + FILTER_BITS)) + + (1 << (bd + FILTER_BITS - 1)); + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + src -= horiz_offset; + + // horizontal filter + if (bd == 12) { + if (conv_params->do_average) { + if (x_filter_taps <= 6 && w != 4) { + highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block, + im_stride, w, h, x_filter_ptr, + offset_convolve); + } 
else { + highbd_12_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, + w, h, x_filter_ptr, offset_convolve); + } + if (conv_params->use_dist_wtd_comp_avg) { + highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params, offset_avg, bd); + } else { + highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, offset_avg, bd); + } + } else { + if (x_filter_taps <= 6 && w != 4) { + highbd_12_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16, + dst16_stride, w, h, + x_filter_ptr, offset_convolve); + } else { + highbd_12_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, + w, h, x_filter_ptr, offset_convolve); + } + } + } else { + if (conv_params->do_average) { + if (x_filter_taps <= 6 && w != 4) { + highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, im_block, + im_stride, w, h, x_filter_ptr, + offset_convolve); + } else { + highbd_dist_wtd_convolve_x_neon(src, src_stride, im_block, im_stride, w, + h, x_filter_ptr, offset_convolve); + } + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, offset_avg, bd); + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, offset_avg, bd); + } + } else { + if (x_filter_taps <= 6 && w != 4) { + highbd_dist_wtd_convolve_x_6tap_neon(src + 1, src_stride, dst16, + dst16_stride, w, h, x_filter_ptr, + offset_convolve); + } else { + highbd_dist_wtd_convolve_x_neon(src, src_stride, dst16, dst16_stride, w, + h, x_filter_ptr, offset_convolve); + } + } + } +} + +static INLINE void highbd_12_dist_wtd_convolve_y_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + highbd_12_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d1 = + highbd_12_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x4_t d2 = + highbd_12_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x4_t d3 = + highbd_12_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + highbd_12_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d1 = + highbd_12_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x8_t d2 = + highbd_12_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x8_t d3 = + highbd_12_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * 
src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_dist_wtd_convolve_y_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + highbd_convolve6_4(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d1 = + highbd_convolve6_4(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x4_t d2 = + highbd_convolve6_4(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x4_t d3 = + highbd_convolve6_4(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + highbd_convolve6_8(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve6_8(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x8_t d2 = + highbd_convolve6_8(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve6_8(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_12_dist_wtd_convolve_y_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = highbd_12_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x4_t d1 = highbd_12_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x4_t d2 = highbd_12_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x4_t d3 = highbd_12_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, 
s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = highbd_12_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x8_t d1 = highbd_12_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x8_t d2 = highbd_12_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x8_t d3 = highbd_12_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} +static INLINE void highbd_dist_wtd_convolve_y_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x4_t d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x4_t d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x4_t d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x8_t d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x8_t d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x8_t d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +void av1_highbd_dist_wtd_convolve_y_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + int dst16_stride = conv_params->dst_stride; + 
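/* A rough worked example of the constants computed below, assuming the
 * assert()ed FILTER_BITS == COMPOUND_ROUND1_BITS and an 8-bit input with
 * round_0 == 3, round_1 == 7: offset_bits = 8 + 14 - 3 = 19, so
 * round_offset_avg = (1 << 12) + (1 << 11). round_offset_conv carries the
 * same offset scaled up by 1 << round_0 plus the 1 << (round_0 - 1) rounding
 * bias, so after the convolution's >> round_0 the stored CONV_BUF_TYPE
 * values hold exactly the round_offset_avg bias that the comp_avg helpers
 * subtract again. */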
const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_y->taps / 2 - 1; + assert(FILTER_BITS == COMPOUND_ROUND1_BITS); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset_avg = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_offset_conv = (1 << (conv_params->round_0 - 1)) + + (1 << (bd + FILTER_BITS)) + + (1 << (bd + FILTER_BITS - 1)); + + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + src -= vert_offset * src_stride; + + if (bd == 12) { + if (conv_params->do_average) { + if (y_filter_taps <= 6) { + highbd_12_dist_wtd_convolve_y_6tap_neon( + src + src_stride, src_stride, im_block, im_stride, w, h, + y_filter_ptr, round_offset_conv); + } else { + highbd_12_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block, + im_stride, w, h, y_filter_ptr, + round_offset_conv); + } + if (conv_params->use_dist_wtd_comp_avg) { + highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params, round_offset_avg, + bd); + } else { + highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, round_offset_avg, bd); + } + } else { + if (y_filter_taps <= 6) { + highbd_12_dist_wtd_convolve_y_6tap_neon( + src + src_stride, src_stride, dst16, dst16_stride, w, h, + y_filter_ptr, round_offset_conv); + } else { + highbd_12_dist_wtd_convolve_y_8tap_neon( + src, src_stride, dst16, dst16_stride, w, h, y_filter_ptr, + round_offset_conv); + } + } + } else { + if (conv_params->do_average) { + if (y_filter_taps <= 6) { + highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, + im_block, im_stride, w, h, + y_filter_ptr, round_offset_conv); + } else { + highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, im_block, + im_stride, w, h, y_filter_ptr, + round_offset_conv); + } + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, round_offset_avg, bd); + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, round_offset_avg, bd); + } + } else { + if (y_filter_taps <= 6) { + highbd_dist_wtd_convolve_y_6tap_neon(src + src_stride, src_stride, + dst16, dst16_stride, w, h, + y_filter_ptr, round_offset_conv); + } else { + highbd_dist_wtd_convolve_y_8tap_neon(src, src_stride, dst16, + dst16_stride, w, h, y_filter_ptr, + round_offset_conv); + } + } + } +} + +static INLINE void highbd_2d_copy_neon(const uint16_t *src_ptr, int src_stride, + uint16_t *dst_ptr, int dst_stride, int w, + int h, const int round_bits, + const int offset) { + if (w <= 4) { + const int16x4_t round_shift_s16 = vdup_n_s16(round_bits); + const uint16x4_t offset_u16 = vdup_n_u16(offset); + + for (int y = 0; y < h; ++y) { + const uint16x4_t s = vld1_u16(src_ptr + y * src_stride); + uint16x4_t d = vshl_u16(s, round_shift_s16); + d = vadd_u16(d, offset_u16); + if (w == 2) { + store_u16_2x1(dst_ptr + y * dst_stride, d); + } else { + vst1_u16(dst_ptr + y * dst_stride, d); + } + } + } else { + const int16x8_t round_shift_s16 = vdupq_n_s16(round_bits); + const uint16x8_t offset_u16 = vdupq_n_u16(offset); + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; x += 8) { + const uint16x8_t s = vld1q_u16(src_ptr + y * src_stride + x); + uint16x8_t d = vshlq_u16(s, round_shift_s16); + d = vaddq_u16(d, offset_u16); + vst1q_u16(dst_ptr + y * dst_stride + x, d); + } + } + } +} + +void 
av1_highbd_dist_wtd_convolve_2d_copy_neon(const uint16_t *src, + int src_stride, uint16_t *dst, + int dst_stride, int w, int h, + ConvolveParams *conv_params, + int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + + const int im_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + + if (conv_params->do_average) { + highbd_2d_copy_neon(src, src_stride, im_block, im_stride, w, h, round_bits, + round_offset); + } else { + highbd_2d_copy_neon(src, src_stride, dst16, dst16_stride, w, h, round_bits, + round_offset); + } + + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + if (bd == 12) { + highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params, round_offset, bd); + } else { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, round_offset, bd); + } + } else { + if (bd == 12) { + highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, round_offset, bd); + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, round_offset, bd); + } + } + } +} + +static INLINE uint16x4_t highbd_convolve6_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve6_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. 
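/* Unlike the 1D compound paths above, which apply the first-stage
 * >> ROUND0_BITS, these 2D vertical helpers perform the second-stage
 * rounding shift by COMPOUND_ROUND1_BITS; the horizontal pass of the 2D
 * filter is expected to have applied the ROUND0_BITS shift already when it
 * filled the intermediate block. */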
+ const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_2d_vert_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, offset_vec); + uint16x4_t d1 = + highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, offset_vec); + uint16x4_t d2 = + highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, offset_vec); + uint16x4_t d3 = + highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = highbd_convolve6_8_2d_v(s0, s1, s2, s3, s4, s5, + y_filter, offset_vec); + uint16x8_t d1 = highbd_convolve6_8_2d_v(s1, s2, s3, s4, s5, s6, + y_filter, offset_vec); + uint16x8_t d2 = highbd_convolve6_8_2d_v(s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x8_t d3 = highbd_convolve6_8_2d_v(s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve8_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = 
vmlal_lane_s16(offset, s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve8_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + const int32x4_t offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_dist_wtd_convolve_2d_vert_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, int offset) { + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w <= 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x4_t d1 = highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x4_t d2 = highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x4_t d3 = highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + 
load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, offset_vec); + uint16x8_t d1 = highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, offset_vec); + uint16x8_t d2 = highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, offset_vec); + uint16x8_t d3 = highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + // The smallest block height is 4, and the horizontal convolution needs to + // process an extra (filter_taps/2 - 1) lines for the vertical convolution. + assert(h >= 5); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + uint16x8_t d1 = highbd_12_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], x_filter, offset_vec); + uint16x8_t d2 = highbd_12_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], x_filter, offset_vec); + uint16x8_t d3 = highbd_12_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6]; + load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); + + uint16x8_t d0 = highbd_12_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + +static INLINE void highbd_dist_wtd_convolve_2d_horiz_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + // The smallest block height is 4, and the horizontal convolution needs to + // process an extra (filter_taps/2 - 1) lines for the vertical convolution. 
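+  // In this file the caller (av1_highbd_dist_wtd_convolve_2d_neon below) passes
+  // the intermediate height im_h = h + clamped_y_taps - 1 as 'h', so e.g. a
+  // 4-row block with a 6-tap vertical filter arrives here with h = 9.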
+ assert(h >= 5); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + uint16x8_t d1 = highbd_convolve6_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], x_filter, offset_vec); + uint16x8_t d2 = highbd_convolve6_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], x_filter, offset_vec); + uint16x8_t d3 = highbd_convolve6_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6]; + load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); + + uint16x8_t d0 = highbd_convolve6_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], x_filter, offset_vec); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + +static INLINE void highbd_12_dist_wtd_convolve_2d_horiz_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + // The smallest block height is 4, and the horizontal convolution needs to + // process an extra (filter_taps/2 - 1) lines for the vertical convolution. + assert(h >= 5); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4. 
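+    // Only the middle four taps (x_filter_ptr[2..5]) of the 8-tap kernel are
+    // non-zero for these filters, so the kernel is loaded from x_filter_ptr + 2.
+    // The caller backed src_ptr off by clamped_x_taps / 2 - 1 = 2 columns, while
+    // a 4-tap kernel only needs 1, hence the src_ptr + 1 below.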
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 1); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); + uint16x4_t d1 = highbd_12_convolve4_4_x(s1, x_filter, offset_vec); + uint16x4_t d2 = highbd_12_convolve4_4_x(s2, x_filter, offset_vec); + uint16x4_t d3 = highbd_12_convolve4_4_x(s3, x_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + int16x4_t s0[4]; + load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + + uint16x4_t d0 = highbd_12_convolve4_4_x(s0, x_filter, offset_vec); + vst1_u16(d, d0); + + s += src_stride; + d += dst_stride; + } while (--h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], + s0[6], s0[7], x_filter, offset_vec); + uint16x8_t d1 = + highbd_12_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], + s1[6], s1[7], x_filter, offset_vec); + uint16x8_t d2 = + highbd_12_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], + s2[6], s2[7], x_filter, offset_vec); + uint16x8_t d3 = + highbd_12_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], + s3[6], s3[7], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + + uint16x8_t d0 = + highbd_12_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], + s0[6], s0[7], x_filter, offset_vec); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +static INLINE void highbd_dist_wtd_convolve_2d_horiz_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, const int offset) { + // The smallest block height is 4, and the horizontal convolution needs to + // process an extra (filter_taps/2 - 1) lines for the vertical convolution. + assert(h >= 5); + const int32x4_t offset_vec = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4. 
+ const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 1); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); + uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset_vec); + uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset_vec); + uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset_vec); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + int16x4_t s0[4]; + load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset_vec); + vst1_u16(d, d0); + + s += src_stride; + d += dst_stride; + } while (--h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], + s0[7], x_filter, offset_vec); + uint16x8_t d1 = + highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], + s1[7], x_filter, offset_vec); + uint16x8_t d2 = + highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], s2[5], s2[6], + s2[7], x_filter, offset_vec); + uint16x8_t d3 = + highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], s3[5], s3[6], + s3[7], x_filter, offset_vec); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + + uint16x8_t d0 = + highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], + s0[7], x_filter, offset_vec); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_highbd_dist_wtd_convolve_2d_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, + im_block2[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int x_filter_taps 
= get_filter_tap(filter_params_x, subpel_x_qn); + const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps; + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = clamped_x_taps / 2 - 1; + // The extra shim of (1 << (conv_params->round_0 - 1)) allows us to use a + // faster non-rounding non-saturating left shift. + const int round_offset_conv_x = + (1 << (bd + FILTER_BITS - 1)) + (1 << (conv_params->round_0 - 1)); + const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset_conv_y = (1 << y_offset_bits); + const int round_offset_avg = + ((1 << (y_offset_bits - conv_params->round_1)) + + (1 << (y_offset_bits - conv_params->round_1 - 1))); + + const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + // horizontal filter + if (bd == 12) { + if (x_filter_taps <= 6 && w != 4) { + highbd_12_dist_wtd_convolve_2d_horiz_6tap_neon( + src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, + round_offset_conv_x); + } else { + highbd_12_dist_wtd_convolve_2d_horiz_neon( + src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, + round_offset_conv_x); + } + } else { + if (x_filter_taps <= 6 && w != 4) { + highbd_dist_wtd_convolve_2d_horiz_6tap_neon( + src_ptr, src_stride, im_block, im_stride, w, im_h, x_filter_ptr, + round_offset_conv_x); + } else { + highbd_dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + round_offset_conv_x); + } + } + + // vertical filter + if (y_filter_taps <= 6) { + if (conv_params->do_average) { + highbd_dist_wtd_convolve_2d_vert_6tap_neon(im_block, im_stride, im_block2, + im_stride, w, h, y_filter_ptr, + round_offset_conv_y); + } else { + highbd_dist_wtd_convolve_2d_vert_6tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr, + round_offset_conv_y); + } + } else { + if (conv_params->do_average) { + highbd_dist_wtd_convolve_2d_vert_8tap_neon(im_block, im_stride, im_block2, + im_stride, w, h, y_filter_ptr, + round_offset_conv_y); + } else { + highbd_dist_wtd_convolve_2d_vert_8tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, y_filter_ptr, + round_offset_conv_y); + } + } + + // Do the compound averaging outside the loop, avoids branching within the + // main loop + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + if (bd == 12) { + highbd_12_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, + w, h, conv_params, round_offset_avg, + bd); + } else { + highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, + h, conv_params, round_offset_avg, bd); + } + } else { + if (bd == 12) { + highbd_12_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, + conv_params, round_offset_avg, bd); + } else { + highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, + conv_params, round_offset_avg, bd); + } + } + } +} diff --git a/third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c b/third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c new file mode 100644 index 
0000000000..4f1c25d122
--- /dev/null
+++ b/third_party/aom/av1/common/arm/highbd_convolve_horiz_rs_neon.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/highbd_convolve_neon.h"
+
+#define UPSCALE_NORMATIVE_TAPS 8
+
+void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride,
+                                       uint16_t *dst, int dst_stride, int w,
+                                       int h, const int16_t *x_filters,
+                                       int x0_qn, int x_step_qn, int bd) {
+  const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+  static const int32_t kIdx[4] = { 0, 1, 2, 3 };
+  const int32x4_t idx = vld1q_s32(kIdx);
+  const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK);
+  const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS);
+  const int32x4_t offset_s32 = vdupq_n_s32(0);
+  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);
+
+  const uint16_t *src_ptr = src - horiz_offset;
+  uint16_t *dst_ptr = dst;
+
+  if (w <= 4) {
+    int height = h;
+    uint16_t *d = dst_ptr;
+
+    do {
+      int x_qn = x0_qn;
+
+      // Load 4 src vectors at a time; they might be the same, but we have to
+      // calculate the indices anyway. Doing it in SIMD and then storing the
+      // indices is faster than having to calculate the expression
+      // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times.
+      // Ideally this should be a gather using the indices, but NEON does not
+      // have that, so we have to emulate it.
+      const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
+      // We have to multiply by 2 to get the actual pointer, as
+      // sizeof(uint16_t) == 2.
+      const int32x4_t src_idx =
+          vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
+      // Similarly for the filter vector indices, we calculate the filter
+      // indices for 4 columns. First we calculate the indices:
+      // (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
+      // Then we calculate the actual pointers, multiplying by
+      // UPSCALE_NORMATIVE_TAPS and again shifting left by 1.
+      const int32x4_t x_filter4_idx = vshlq_n_s32(
+          vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1);
+      // Even though pointers are unsigned 32/64-bit ints, we do signed
+      // addition. The reason for this is that x_qn can be negative, leading to
+      // negative offsets. Argon test
+      // profile0_core/streams/test10573_11003.obu was failing because of
+      // this.
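+      // After the shifts above, lane i of src_idx holds the byte offset
+      // 2 * ((x_qn + i * x_step_qn) >> RS_SCALE_SUBPEL_BITS); the signed adds
+      // below turn these into the four per-column source pointers, and the
+      // filter pointers are formed the same way via a multiply-accumulate with
+      // UPSCALE_NORMATIVE_TAPS.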
+#if AOM_ARCH_AARCH64 + uint64x2_t tmp4[2]; + tmp4[0] = vreinterpretq_u64_s64(vaddw_s32( + vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx))); + tmp4[1] = vreinterpretq_u64_s64(vaddw_s32( + vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx))); + int16_t *src4_ptr[4]; + uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; + vst1q_u64(tmp_ptr, tmp4[0]); + vst1q_u64(tmp_ptr + 2, tmp4[1]); + + // filter vectors + tmp4[0] = vreinterpretq_u64_s64(vmlal_s32( + vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx), + vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); + tmp4[1] = vreinterpretq_u64_s64(vmlal_s32( + vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx), + vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); + + const int16_t *x_filter4_ptr[4]; + tmp_ptr = (uint64_t *)&x_filter4_ptr; + vst1q_u64(tmp_ptr, tmp4[0]); + vst1q_u64(tmp_ptr + 2, tmp4[1]); +#else + uint32x4_t tmp4; + tmp4 = vreinterpretq_u32_s32( + vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx)); + int16_t *src4_ptr[4]; + uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; + vst1q_u32(tmp_ptr, tmp4); + + // filter vectors + tmp4 = vreinterpretq_u32_s32( + vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx, + vdupq_n_s32(UPSCALE_NORMATIVE_TAPS))); + + const int16_t *x_filter4_ptr[4]; + tmp_ptr = (uint32_t *)&x_filter4_ptr; + vst1q_u32(tmp_ptr, tmp4); +#endif // AOM_ARCH_AARCH64 + // Load source + int16x8_t s0 = vld1q_s16(src4_ptr[0]); + int16x8_t s1 = vld1q_s16(src4_ptr[1]); + int16x8_t s2 = vld1q_s16(src4_ptr[2]); + int16x8_t s3 = vld1q_s16(src4_ptr[3]); + + // Actually load the filters + const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); + const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); + const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); + const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); + + // Group low and high parts and transpose + int16x4_t filters_lo[] = { vget_low_s16(x_filter0), + vget_low_s16(x_filter1), + vget_low_s16(x_filter2), + vget_low_s16(x_filter3) }; + int16x4_t filters_hi[] = { vget_high_s16(x_filter0), + vget_high_s16(x_filter1), + vget_high_s16(x_filter2), + vget_high_s16(x_filter3) }; + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); + + // Run the 2D Scale convolution + uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( + s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); + + d0 = vmin_u16(d0, max); + + if (w == 2) { + store_u16_2x1(d, d0); + } else { + vst1_u16(d, d0); + } + + src_ptr += src_stride; + d += dst_stride; + height--; + } while (height > 0); + } else { + int height = h; + + do { + int width = w; + int x_qn = x0_qn; + uint16_t *d = dst_ptr; + const uint16_t *s = src_ptr; + + do { + // Load 4 src vectors at a time, they might be the same, but we have to + // calculate the indices anyway. Doing it in SIMD and then storing the + // indices is faster than having to calculate the expression + // &src_ptr[((x_qn + 0*x_step_qn) >> RS_SCALE_SUBPEL_BITS)] 4 times + // Ideally this should be a gather using the indices, but NEON does not + // have that, so have to emulate + const int32x4_t xqn_idx = + vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn); + // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) + // = 2 + const int32x4_t src_idx = + vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1); + + // Similarly for the filter vector indices, we calculate the filter + // indices for 4 columns. 
First we calculate the indices: + // x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS + // Then we calculate the actual pointers, multiplying with + // UPSCALE_UPSCALE_NORMATIVE_TAPS + // again shift left by 1 + const int32x4_t x_filter4_idx = vshlq_n_s32( + vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), + 1); + // Even though pointers are unsigned 32/64-bit ints we do signed + // addition The reason for this is that x_qn can be negative, leading to + // negative offsets. Argon test + // profile0_core/streams/test10573_11003.obu was failing because of + // this. +#if AOM_ARCH_AARCH64 + uint64x2_t tmp4[2]; + tmp4[0] = vreinterpretq_u64_s64( + vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx))); + tmp4[1] = vreinterpretq_u64_s64( + vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx))); + int16_t *src4_ptr[4]; + uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; + vst1q_u64(tmp_ptr, tmp4[0]); + vst1q_u64(tmp_ptr + 2, tmp4[1]); + + // filter vectors + tmp4[0] = vreinterpretq_u64_s64(vmlal_s32( + vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx), + vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); + tmp4[1] = vreinterpretq_u64_s64(vmlal_s32( + vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx), + vdup_n_s32(UPSCALE_NORMATIVE_TAPS))); + + const int16_t *x_filter4_ptr[4]; + tmp_ptr = (uint64_t *)&x_filter4_ptr; + vst1q_u64(tmp_ptr, tmp4[0]); + vst1q_u64(tmp_ptr + 2, tmp4[1]); +#else + uint32x4_t tmp4; + tmp4 = vreinterpretq_u32_s32( + vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx)); + int16_t *src4_ptr[4]; + uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; + vst1q_u32(tmp_ptr, tmp4); + + // filter vectors + tmp4 = vreinterpretq_u32_s32( + vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx, + vdupq_n_s32(UPSCALE_NORMATIVE_TAPS))); + + const int16_t *x_filter4_ptr[4]; + tmp_ptr = (uint32_t *)&x_filter4_ptr; + vst1q_u32(tmp_ptr, tmp4); +#endif // AOM_ARCH_AARCH64 + + // Load source + int16x8_t s0 = vld1q_s16(src4_ptr[0]); + int16x8_t s1 = vld1q_s16(src4_ptr[1]); + int16x8_t s2 = vld1q_s16(src4_ptr[2]); + int16x8_t s3 = vld1q_s16(src4_ptr[3]); + + // Actually load the filters + const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); + const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); + const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); + const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); + + // Group low and high parts and transpose + int16x4_t filters_lo[] = { vget_low_s16(x_filter0), + vget_low_s16(x_filter1), + vget_low_s16(x_filter2), + vget_low_s16(x_filter3) }; + int16x4_t filters_hi[] = { vget_high_s16(x_filter0), + vget_high_s16(x_filter1), + vget_high_s16(x_filter2), + vget_high_s16(x_filter3) }; + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); + + // Run the 2D Scale X convolution + uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( + s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); + + d0 = vmin_u16(d0, max); + vst1_u16(d, d0); + + x_qn += 4 * x_step_qn; + d += 4; + width -= 4; + } while (width > 0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } while (height > 0); + } +} diff --git a/third_party/aom/av1/common/arm/highbd_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_convolve_neon.c new file mode 100644 index 0000000000..3a3e33fcba --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_convolve_neon.c @@ -0,0 +1,2120 @@ +/* + * Copyright (c) 2023, Alliance for Open 
Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +static INLINE uint16x4_t +highbd_convolve6_4_y(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t +highbd_convolve6_8_y(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_convolve_y_sr_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, const int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + + if (w == 4) { + const int16_t *s = (const int16_t *)(src_ptr + src_stride); + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x4_t d0 = + highbd_convolve6_4_y(s0, s1, s2, s3, s4, s5, y_filter_0_7); + uint16x4_t d1 = + highbd_convolve6_4_y(s1, s2, s3, s4, s5, s6, y_filter_0_7); + 
uint16x4_t d2 = + highbd_convolve6_4_y(s2, s3, s4, s5, s6, s7, y_filter_0_7); + uint16x4_t d3 = + highbd_convolve6_4_y(s3, s4, s5, s6, s7, s8, y_filter_0_7); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + // Width is a multiple of 8 and height is a multiple of 4. + do { + int height = h; + const int16_t *s = (const int16_t *)(src_ptr + src_stride); + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = + highbd_convolve6_8_y(s0, s1, s2, s3, s4, s5, y_filter_0_7); + uint16x8_t d1 = + highbd_convolve6_8_y(s1, s2, s3, s4, s5, s6, y_filter_0_7); + uint16x8_t d2 = + highbd_convolve6_8_y(s2, s3, s4, s5, s6, s7, y_filter_0_7); + uint16x8_t d3 = + highbd_convolve6_8_y(s3, s4, s5, s6, s7, s8, y_filter_0_7); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve8_4_y( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve8_8_y( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, 
vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_convolve_y_sr_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + uint16x4_t d1 = + highbd_convolve8_4_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + uint16x4_t d2 = + highbd_convolve8_4_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + uint16x4_t d3 = + highbd_convolve8_4_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = + highbd_convolve8_8_y(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + uint16x8_t d1 = + highbd_convolve8_8_y(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + uint16x8_t d2 = + highbd_convolve8_8_y(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + uint16x8_t d3 = + highbd_convolve8_8_y(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve12_4_y( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum = vmull_lane_s16(s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = 
vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); + + return vqrshrun_n_s32(sum, COMPOUND_ROUND1_BITS); +} + +static INLINE uint16x8_t highbd_convolve12_8_y( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, + const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, COMPOUND_ROUND1_BITS), + vqrshrun_n_s32(sum1, COMPOUND_ROUND1_BITS)); +} + +static INLINE void highbd_convolve_y_sr_12tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x4_t s11, s12, s13, s14; + load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint16x4_t d0 = + highbd_convolve12_4_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, y_filter_0_7, y_filter_8_11); + uint16x4_t d1 = + highbd_convolve12_4_y(s1, s2, s3, s4, s5, s6, s7, s8, 
s9, s10, s11, + s12, y_filter_0_7, y_filter_8_11); + uint16x4_t d2 = + highbd_convolve12_4_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, y_filter_0_7, y_filter_8_11); + uint16x4_t d3 = + highbd_convolve12_4_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, + s14, y_filter_0_7, y_filter_8_11); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x8_t s11, s12, s13, s14; + load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint16x8_t d0 = + highbd_convolve12_8_y(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, + s11, y_filter_0_7, y_filter_8_11); + uint16x8_t d1 = + highbd_convolve12_8_y(s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, + s12, y_filter_0_7, y_filter_8_11); + uint16x8_t d2 = + highbd_convolve12_8_y(s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, y_filter_0_7, y_filter_8_11); + uint16x8_t d3 = + highbd_convolve12_8_y(s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, + s13, s14, y_filter_0_7, y_filter_8_11); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +void av1_highbd_convolve_y_sr_neon(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + return; + } + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int vert_offset = filter_params_y->taps / 2 - 1; + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + src -= vert_offset * src_stride; + + if (y_filter_taps > 8) { + highbd_convolve_y_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); + return; + } + if (y_filter_taps < 8) { + highbd_convolve_y_sr_6tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); + return; + } + + highbd_convolve_y_sr_8tap_neon(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); +} + +static INLINE uint16x8_t highbd_convolve6_8_x(const int16x8_t s[6], + const int16x8_t x_filter, + const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. 
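+  // As in the vertical helpers, taps 0 and 7 of the 8-lane kernel (here the
+  // horizontal x_filter) are zero, so only lanes 1..6 are accumulated; 'offset'
+  // is the caller's pre-added rounding shim of 1 << (conv_params->round_0 - 1).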
+ const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int32x4_t sum0 = offset; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2); + + int32x4_t sum1 = offset; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2); + + return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); +} + +static INLINE void highbd_convolve_x_sr_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + int bd) { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + // This shim allows to do only one rounding shift instead of two. + const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); + + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = highbd_convolve6_8_x(s0, x_filter, offset); + uint16x8_t d1 = highbd_convolve6_8_x(s1, x_filter, offset); + uint16x8_t d2 = highbd_convolve6_8_x(s2, x_filter, offset); + uint16x8_t d3 = highbd_convolve6_8_x(s3, x_filter, offset); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +static INLINE uint16x4_t highbd_convolve4_4_x(const int16x4_t s[4], + const int16x4_t x_filter, + const int32x4_t offset) { + int32x4_t sum = offset; + sum = vmlal_lane_s16(sum, s[0], x_filter, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter, 3); + + return vqrshrun_n_s32(sum, FILTER_BITS); +} + +static INLINE uint16x8_t highbd_convolve8_8_x(const int16x8_t s[8], + const int16x8_t x_filter, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int32x4_t sum0 = offset; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, 
vget_low_s16(s[3]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); + + int32x4_t sum1 = offset; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); +} + +static INLINE void highbd_convolve_x_sr_neon(const uint16_t *src_ptr, + int src_stride, uint16_t *dst_ptr, + int dst_stride, int w, int h, + const int16_t *x_filter_ptr, + ConvolveParams *conv_params, + int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + // This shim allows to do only one rounding shift instead of two. + const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); + + if (w == 4) { + // 4-tap filters are used for blocks having width == 4. + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 2); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve4_4_x(s0, x_filter, offset); + uint16x4_t d1 = highbd_convolve4_4_x(s1, x_filter, offset); + uint16x4_t d2 = highbd_convolve4_4_x(s2, x_filter, offset); + uint16x4_t d3 = highbd_convolve4_4_x(s3, x_filter, offset); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8_x(s0, x_filter, offset); + uint16x8_t d1 = highbd_convolve8_8_x(s1, x_filter, offset); + uint16x8_t d2 = highbd_convolve8_8_x(s2, x_filter, offset); + uint16x8_t d3 = highbd_convolve8_8_x(s3, x_filter, offset); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 
8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE uint16x4_t highbd_convolve12_4_x(const int16x4_t s[12], + const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum = offset; + sum = vmlal_lane_s16(sum, s[0], x_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3); + + return vqrshrun_n_s32(sum, FILTER_BITS); +} + +static INLINE uint16x8_t highbd_convolve12_8_x(const int16x8_t s[12], + const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum0 = offset; + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[0]), x_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3); + + int32x4_t sum1 = offset; + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[0]), x_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3); + + return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); +} + +static INLINE void highbd_convolve_x_sr_12tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + // This shim allows to do 
only one rounding shift instead of two. + const int32x4_t offset = vdupq_n_s32(1 << (conv_params->round_0 - 1)); + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[12], s1[12], s2[12], s3[12]; + load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], + &s1[11]); + load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], + &s2[11]); + load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], + &s3[11]); + + uint16x4_t d0 = + highbd_convolve12_4_x(s0, x_filter_0_7, x_filter_8_11, offset); + uint16x4_t d1 = + highbd_convolve12_4_x(s1, x_filter_0_7, x_filter_8_11, offset); + uint16x4_t d2 = + highbd_convolve12_4_x(s2, x_filter_0_7, x_filter_8_11, offset); + uint16x4_t d3 = + highbd_convolve12_4_x(s3, x_filter_0_7, x_filter_8_11, offset); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[12], s1[12], s2[12], s3[12]; + load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], + &s1[11]); + load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], + &s2[11]); + load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], + &s3[11]); + + uint16x8_t d0 = + highbd_convolve12_8_x(s0, x_filter_0_7, x_filter_8_11, offset); + uint16x8_t d1 = + highbd_convolve12_8_x(s1, x_filter_0_7, x_filter_8_11, offset); + uint16x8_t d2 = + highbd_convolve12_8_x(s2, x_filter_0_7, x_filter_8_11, offset); + uint16x8_t d3 = + highbd_convolve12_8_x(s3, x_filter_0_7, x_filter_8_11, offset); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_highbd_convolve_x_sr_neon(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, bd); + return; + } + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int16_t *x_filter_ptr = 
av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + src -= horiz_offset; + + if (x_filter_taps > 8) { + highbd_convolve_x_sr_12tap_neon(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); + return; + } + if (x_filter_taps <= 6 && w != 4) { + highbd_convolve_x_sr_6tap_neon(src + 1, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); + return; + } + + highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); +} + +static INLINE uint16x4_t highbd_convolve6_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x8_t y_filter, const int32x4_t round_shift, + const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s3, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 2); + + sum = vshlq_s32(sum, round_shift); + return vqmovun_s32(sum); +} + +static INLINE uint16x8_t highbd_convolve6_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t y_filter, const int32x4_t round_shift, + const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 2); + + sum0 = vshlq_s32(sum0, round_shift); + sum1 = vshlq_s32(sum1, round_shift); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_vert_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, + int bd, const int offset) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + const int round1_shift = conv_params->round_1; + const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + int16x4_t s0, s1, s2, s3, s4; + load_s16_4x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x4_t s5, s6, s7, s8; + load_s16_4x4(s, src_stride, &s5, &s6, &s7, &s8); + + 
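// Each pass of this loop computes four output rows from the nine input rows
// s0..s8: output row n is the 6-tap combination of rows s[n]..s[n+5]. After
// the stores below, the window slides down by four rows by rotating s4..s8
// into s0..s4.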
uint16x4_t d0 = highbd_convolve6_4_2d_v(s0, s1, s2, s3, s4, s5, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d1 = highbd_convolve6_4_2d_v(s1, s2, s3, s4, s5, s6, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d2 = highbd_convolve6_4_2d_v(s2, s3, s4, s5, s6, s7, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d3 = highbd_convolve6_4_2d_v(s3, s4, s5, s6, s7, s8, y_filter, + round1_shift_s32, offset_s32); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + s += 5 * src_stride; + + do { + int16x8_t s5, s6, s7, s8; + load_s16_8x4(s, src_stride, &s5, &s6, &s7, &s8); + + uint16x8_t d0 = highbd_convolve6_8_2d_v( + s0, s1, s2, s3, s4, s5, y_filter, round1_shift_s32, offset_s32); + uint16x8_t d1 = highbd_convolve6_8_2d_v( + s1, s2, s3, s4, s5, s6, y_filter, round1_shift_s32, offset_s32); + uint16x8_t d2 = highbd_convolve6_8_2d_v( + s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, offset_s32); + uint16x8_t d3 = highbd_convolve6_8_2d_v( + s3, s4, s5, s6, s7, s8, y_filter, round1_shift_s32, offset_s32); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve8_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t round_shift, const int32x4_t offset) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); + + sum = vshlq_s32(sum, round_shift); + return vqmovun_s32(sum); +} + +static INLINE uint16x8_t highbd_convolve8_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + const int32x4_t round_shift, const int32x4_t offset) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_hi, 0); + sum0 = 
vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_hi, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_lo, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_hi, 3); + + sum0 = vshlq_s32(sum0, round_shift); + sum1 = vshlq_s32(sum1, round_shift); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_vert_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, + int bd, const int offset) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + const int round1_shift = conv_params->round_1; + const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d1 = + highbd_convolve8_4_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d2 = + highbd_convolve8_4_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round1_shift_s32, offset_s32); + uint16x4_t d3 = + highbd_convolve8_4_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round1_shift_s32, offset_s32); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = + highbd_convolve8_8_2d_v(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round1_shift_s32, offset_s32); + uint16x8_t d1 = + highbd_convolve8_8_2d_v(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round1_shift_s32, offset_s32); + uint16x8_t d2 = + highbd_convolve8_8_2d_v(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round1_shift_s32, offset_s32); + uint16x8_t d3 = + highbd_convolve8_8_2d_v(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round1_shift_s32, offset_s32); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 
= s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x4_t highbd_convolve12_4_2d_v( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x4_t s8, + const int16x4_t s9, const int16x4_t s10, const int16x4_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, + const int32x4_t round_shift, const int32x4_t offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s8, y_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s9, y_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s10, y_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s11, y_filter_8_11, 3); + + sum = vshlq_s32(sum, round_shift); + return vqmovun_s32(sum); +} + +static INLINE uint16x8_t highbd_convolve12_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, + const int16x8_t s9, const int16x8_t s10, const int16x8_t s11, + const int16x8_t y_filter_0_7, const int16x4_t y_filter_8_11, + const int32x4_t round_shift, const int32x4_t offset) { + const int16x4_t y_filter_0_3 = vget_low_s16(y_filter_0_7); + const int16x4_t y_filter_4_7 = vget_high_s16(y_filter_0_7); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), y_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), y_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), y_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), y_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), y_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), y_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), y_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s8), y_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s9), y_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s10), y_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s11), y_filter_8_11, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), y_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), y_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), y_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), y_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), y_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), y_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), y_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s8), y_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s9), y_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s10), y_filter_8_11, 2); + sum1 = 
vmlal_lane_s16(sum1, vget_high_s16(s11), y_filter_8_11, 3); + + sum0 = vshlq_s32(sum0, round_shift); + sum1 = vshlq_s32(sum1, round_shift); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_vert_12tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *y_filter_ptr, ConvolveParams *conv_params, + const int bd, const int offset) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + const int round1_shift = conv_params->round_1; + const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_shift); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x4_t s11, s12, s13, s14; + load_s16_4x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint16x4_t d0 = highbd_convolve12_4_2d_v( + s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x4_t d1 = highbd_convolve12_4_2d_v( + s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x4_t d2 = highbd_convolve12_4_2d_v( + s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x4_t d3 = highbd_convolve12_4_2d_v( + s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + + d0 = vmin_u16(d0, vget_low_u16(max)); + d1 = vmin_u16(d1, vget_low_u16(max)); + d2 = vmin_u16(d2, vget_low_u16(max)); + d3 = vmin_u16(d3, vget_low_u16(max)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + int height = h; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &s10); + s += 11 * src_stride; + + do { + int16x8_t s11, s12, s13, s14; + load_s16_8x4(s, src_stride, &s11, &s12, &s13, &s14); + + uint16x8_t d0 = highbd_convolve12_8_2d_v( + s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x8_t d1 = highbd_convolve12_8_2d_v( + s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x8_t d2 = highbd_convolve12_8_2d_v( + s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + uint16x8_t d3 = highbd_convolve12_8_2d_v( + s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, y_filter_0_7, + y_filter_8_11, round1_shift_s32, offset_s32); + + d0 = vminq_u16(d0, max); + d1 = vminq_u16(d1, max); + d2 = vminq_u16(d2, max); + d3 = vminq_u16(d3, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s7 = s11; + s8 = s12; + s9 = s13; + s10 = s14; + s += 4 * src_stride; + d += 4 * 
dst_stride; + height -= 4; + } while (height != 0); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE uint16x8_t highbd_convolve6_8_2d_h(const int16x8_t s[6], + const int16x8_t x_filter, + const int32x4_t shift_s32, + const int32x4_t offset) { + // Values at indices 0 and 7 of y_filter are zero. + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 2); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 2); + + sum0 = vqrshlq_s32(sum0, shift_s32); + sum1 = vqrshlq_s32(sum1, shift_s32); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_horiz_6tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + const int offset) { + // The smallest block height processed by the SIMD functions is 4, and the + // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines + // for the vertical convolution. 
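// (The caller passes h = output height + clamped vertical taps - 1 here; for
// example, a 4-row block filtered vertically with a 6-tap kernel needs
// 4 + 6 - 1 = 9 rows of horizontal output.)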
+ assert(h >= 5); + const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6], s1[6], s2[6], s3[6]; + load_s16_8x6(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5]); + load_s16_8x6(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5]); + load_s16_8x6(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5]); + load_s16_8x6(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5]); + + uint16x8_t d0 = + highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32); + uint16x8_t d1 = + highbd_convolve6_8_2d_h(s1, x_filter, shift_s32, offset_s32); + uint16x8_t d2 = + highbd_convolve6_8_2d_h(s2, x_filter, shift_s32, offset_s32); + uint16x8_t d3 = + highbd_convolve6_8_2d_h(s3, x_filter, shift_s32, offset_s32); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[6]; + load_s16_8x6(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5]); + + uint16x8_t d0 = + highbd_convolve6_8_2d_h(s0, x_filter, shift_s32, offset_s32); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); +} + +static INLINE uint16x4_t highbd_convolve4_4_2d_h(const int16x4_t s[4], + const int16x4_t x_filter, + const int32x4_t shift_s32, + const int32x4_t offset) { + int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter, 3); + + sum = vqrshlq_s32(sum, shift_s32); + return vqmovun_s32(sum); +} + +static INLINE uint16x8_t highbd_convolve8_8_2d_h(const int16x8_t s[8], + const int16x8_t x_filter, + const int32x4_t shift_s32, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); + + sum0 = vqrshlq_s32(sum0, 
shift_s32); + sum1 = vqrshlq_s32(sum1, shift_s32); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_horiz_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + const int offset) { + // The smallest block height processed by the SIMD functions is 4, and the + // horizontal convolution needs to process an extra (filter_taps/2 - 1) lines + // for the vertical convolution. + assert(h >= 5); + const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + + if (w == 4) { + // 4-tap filters are used for blocks having width <= 4. + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16_t *s = (const int16_t *)(src_ptr + 1); + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = + highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32); + uint16x4_t d1 = + highbd_convolve4_4_2d_h(s1, x_filter, shift_s32, offset_s32); + uint16x4_t d2 = + highbd_convolve4_4_2d_h(s2, x_filter, shift_s32, offset_s32); + uint16x4_t d3 = + highbd_convolve4_4_2d_h(s3, x_filter, shift_s32, offset_s32); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + int16x4_t s0[4]; + load_s16_4x4(s, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + + uint16x4_t d0 = + highbd_convolve4_4_2d_h(s0, x_filter, shift_s32, offset_s32); + + vst1_u16(d, d0); + + s += src_stride; + d += dst_stride; + } while (--h != 0); + } else { + const int16x8_t x_filter = vld1q_s16(x_filter_ptr); + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = + highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32); + uint16x8_t d1 = + highbd_convolve8_8_2d_h(s1, x_filter, shift_s32, offset_s32); + uint16x8_t d2 = + highbd_convolve8_8_2d_h(s2, x_filter, shift_s32, offset_s32); + uint16x8_t d3 = + highbd_convolve8_8_2d_h(s3, x_filter, shift_s32, offset_s32); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + + uint16x8_t d0 = + highbd_convolve8_8_2d_h(s0, x_filter, shift_s32, offset_s32); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += 
dst_stride; + } while (--height != 0); + } +} + +static INLINE uint16x4_t highbd_convolve12_4_2d_h(const int16x4_t s[12], + const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11, + const int32x4_t shift_s32, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum = vmlal_lane_s16(offset, s[0], x_filter_0_3, 0); + sum = vmlal_lane_s16(sum, s[1], x_filter_0_3, 1); + sum = vmlal_lane_s16(sum, s[2], x_filter_0_3, 2); + sum = vmlal_lane_s16(sum, s[3], x_filter_0_3, 3); + sum = vmlal_lane_s16(sum, s[4], x_filter_4_7, 0); + sum = vmlal_lane_s16(sum, s[5], x_filter_4_7, 1); + sum = vmlal_lane_s16(sum, s[6], x_filter_4_7, 2); + sum = vmlal_lane_s16(sum, s[7], x_filter_4_7, 3); + sum = vmlal_lane_s16(sum, s[8], x_filter_8_11, 0); + sum = vmlal_lane_s16(sum, s[9], x_filter_8_11, 1); + sum = vmlal_lane_s16(sum, s[10], x_filter_8_11, 2); + sum = vmlal_lane_s16(sum, s[11], x_filter_8_11, 3); + + sum = vqrshlq_s32(sum, shift_s32); + return vqmovun_s32(sum); +} + +static INLINE uint16x8_t highbd_convolve12_8_2d_h(const int16x8_t s[12], + const int16x8_t x_filter_0_7, + const int16x4_t x_filter_8_11, + const int32x4_t shift_s32, + const int32x4_t offset) { + const int16x4_t x_filter_0_3 = vget_low_s16(x_filter_0_7); + const int16x4_t x_filter_4_7 = vget_high_s16(x_filter_0_7); + + int32x4_t sum0 = vmlal_lane_s16(offset, vget_low_s16(s[0]), x_filter_0_3, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[1]), x_filter_0_3, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[2]), x_filter_0_3, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[3]), x_filter_0_3, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[4]), x_filter_4_7, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[5]), x_filter_4_7, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[6]), x_filter_4_7, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[7]), x_filter_4_7, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[8]), x_filter_8_11, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[9]), x_filter_8_11, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[10]), x_filter_8_11, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s[11]), x_filter_8_11, 3); + + int32x4_t sum1 = vmlal_lane_s16(offset, vget_high_s16(s[0]), x_filter_0_3, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[1]), x_filter_0_3, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[2]), x_filter_0_3, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[3]), x_filter_0_3, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[4]), x_filter_4_7, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[5]), x_filter_4_7, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[6]), x_filter_4_7, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[7]), x_filter_4_7, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[8]), x_filter_8_11, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[9]), x_filter_8_11, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[10]), x_filter_8_11, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s[11]), x_filter_8_11, 3); + + sum0 = vqrshlq_s32(sum0, shift_s32); + sum1 = vqrshlq_s32(sum1, shift_s32); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE void highbd_convolve_2d_sr_horiz_12tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int16_t *x_filter_ptr, ConvolveParams *conv_params, + const int offset) { + // The smallest block height processed by the SIMD functions is 4, and the + // 
horizontal convolution needs to process an extra (filter_taps/2 - 1) lines + // for the vertical convolution. + assert(h >= 5); + const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); + const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr); + const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + + if (w == 4) { + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x4_t s0[12], s1[12], s2[12], s3[12]; + load_s16_4x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + load_s16_4x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], + &s1[11]); + load_s16_4x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], + &s2[11]); + load_s16_4x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], + &s3[11]); + + uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + uint16x4_t d1 = highbd_convolve12_4_2d_h(s1, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + uint16x4_t d2 = highbd_convolve12_4_2d_h(s2, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + uint16x4_t d3 = highbd_convolve12_4_2d_h(s3, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h > 4); + + do { + int16x4_t s0[12]; + load_s16_4x12(s, 1, &s0[0], &s0[1], &s0[2], &s0[3], &s0[4], &s0[5], + &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], &s0[11]); + + uint16x4_t d0 = highbd_convolve12_4_2d_h(s0, x_filter_0_7, x_filter_8_11, + shift_s32, offset_s32); + + vst1_u16(d, d0); + + s += src_stride; + d += dst_stride; + } while (--h != 0); + } else { + int height = h; + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[12], s1[12], s2[12], s3[12]; + load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], + &s1[11]); + load_s16_8x12(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7], &s2[8], &s2[9], &s2[10], + &s2[11]); + load_s16_8x12(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7], &s3[8], &s3[9], &s3[10], + &s3[11]); + + uint16x8_t d0 = highbd_convolve12_8_2d_h( + s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + uint16x8_t d1 = highbd_convolve12_8_2d_h( + s1, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + uint16x8_t d2 = highbd_convolve12_8_2d_h( + s2, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + uint16x8_t d3 = highbd_convolve12_8_2d_h( + s3, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; + } while (height > 4); + + do { + int width = w; + const int16_t *s = (const int16_t *)src_ptr; + uint16_t *d = dst_ptr; + + do { + int16x8_t s0[12]; + load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], 
&s0[8], &s0[9], &s0[10], + &s0[11]); + + uint16x8_t d0 = highbd_convolve12_8_2d_h( + s0, x_filter_0_7, x_filter_8_11, shift_s32, offset_s32); + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + } +} + +void av1_highbd_convolve_2d_sr_neon(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params, bd); + return; + } + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int clamped_x_taps = x_filter_taps < 6 ? 6 : x_filter_taps; + + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + const int clamped_y_taps = y_filter_taps < 6 ? 6 : y_filter_taps; + const int im_h = h + clamped_y_taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = clamped_x_taps / 2 - 1; + const int x_offset_initial = (1 << (bd + FILTER_BITS - 1)); + const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a + // simple shift left instead of a rounding saturating shift left. + const int y_offset = + (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1)); + + const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (x_filter_taps > 8) { + highbd_convolve_2d_sr_horiz_12tap_neon(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset_initial); + + highbd_convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + return; + } + if (x_filter_taps <= 6 && w != 4) { + highbd_convolve_2d_sr_horiz_6tap_neon(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset_initial); + } else { + highbd_convolve_2d_sr_horiz_neon(src_ptr, src_stride, im_block, im_stride, + w, im_h, x_filter_ptr, conv_params, + x_offset_initial); + } + + if (y_filter_taps <= 6) { + highbd_convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + } else { + highbd_convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + } +} + +// Filter used is [64, 64]. 
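// Since the two taps sum to 1 << FILTER_BITS (64 + 64 = 128), each output
// sample is just the rounded average of two adjacent input samples,
// d = (s0 + s1 + 1) >> 1, which vrhadd_u16/vrhaddq_u16 compute directly.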
+void av1_highbd_convolve_x_sr_intrabc_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + assert(subpel_x_qn == 8); + assert(filter_params_x->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + (void)bd; + + if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(src); + uint16x4_t s1 = vld1_u16(src + 1); + + uint16x4_t d0 = vrhadd_u16(s0, s1); + + if (w == 2) { + store_u16_2x1(dst, d0); + } else { + vst1_u16(dst, d0); + } + + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + const uint16_t *src_ptr = src; + uint16_t *dst_ptr = dst; + int width = w; + + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 1); + + uint16x8_t d0 = vrhaddq_u16(s0, s1); + + vst1q_u16(dst_ptr, d0); + + src_ptr += 8; + dst_ptr += 8; + width -= 8; + } while (width != 0); + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } +} + +// Filter used is [64, 64]. +void av1_highbd_convolve_y_sr_intrabc_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + int bd) { + assert(subpel_y_qn == 8); + assert(filter_params_y->taps == 2); + (void)filter_params_y; + (void)subpel_y_qn; + (void)bd; + + if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(src); + uint16x4_t s1 = vld1_u16(src + src_stride); + + uint16x4_t d0 = vrhadd_u16(s0, s1); + + if (w == 2) { + store_u16_2x1(dst, d0); + } else { + vst1_u16(dst, d0); + } + + src += src_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + const uint16_t *src_ptr = src; + uint16_t *dst_ptr = dst; + int height = h; + + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + src_stride); + + uint16x8_t d0 = vrhaddq_u16(s0, s1); + + vst1q_u16(dst_ptr, d0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +// Both horizontal and vertical passes use the same 2-tap filter: [64, 64]. +void av1_highbd_convolve_2d_sr_intrabc_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + assert(subpel_x_qn == 8); + assert(subpel_y_qn == 8); + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + (void)filter_params_x; + (void)subpel_x_qn; + (void)filter_params_y; + (void)subpel_y_qn; + (void)conv_params; + (void)bd; + + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + 1; + int im_stride = MAX_SB_SIZE; + + uint16x8_t vert_offset = vdupq_n_u16(1); + + uint16_t *im = im_block; + + // Horizontal filter. + if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(src); + uint16x4_t s1 = vld1_u16(src + 1); + + uint16x4_t d0 = vadd_u16(s0, s1); + + // Safe to store the whole vector, the im buffer is big enough. 
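// (im_stride is MAX_SB_SIZE, so even for w == 2 the 4-lane store stays well
// within the current row of the intermediate buffer.)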
+ vst1_u16(im, d0); + + src += src_stride; + im += im_stride; + } while (--im_h != 0); + } else { + do { + const uint16_t *src_ptr = src; + uint16_t *im_ptr = im; + int width = w; + + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 1); + + uint16x8_t d0 = vaddq_u16(s0, s1); + + vst1q_u16(im_ptr, d0); + + src_ptr += 8; + im_ptr += 8; + width -= 8; + } while (width != 0); + src += src_stride; + im += im_stride; + } while (--im_h != 0); + } + + im = im_block; + + // Vertical filter. + if (w <= 4) { + do { + uint16x4_t s0 = vld1_u16(im); + uint16x4_t s1 = vld1_u16(im + im_stride); + + uint16x4_t d0 = vhadd_u16(s0, s1); + d0 = vhadd_u16(d0, vget_low_u16(vert_offset)); + + if (w == 2) { + store_u16_2x1(dst, d0); + } else { + vst1_u16(dst, d0); + } + + im += im_stride; + dst += dst_stride; + } while (--h != 0); + } else { + do { + uint16_t *im_ptr = im; + uint16_t *dst_ptr = dst; + int height = h; + + do { + uint16x8_t s0 = vld1q_u16(im_ptr); + uint16x8_t s1 = vld1q_u16(im_ptr + im_stride); + + uint16x8_t d0 = vhaddq_u16(s0, s1); + d0 = vhaddq_u16(d0, vert_offset); + + vst1q_u16(dst_ptr, d0); + + im_ptr += im_stride; + dst_ptr += dst_stride; + } while (--height != 0); + im += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/third_party/aom/av1/common/arm/highbd_convolve_neon.h b/third_party/aom/av1/common/arm/highbd_convolve_neon.h new file mode 100644 index 0000000000..08b2bda4e5 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_convolve_neon.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_ +#define AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_ + +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/convolve.h" + +static INLINE int32x4_t highbd_convolve8_4_s32( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t offset) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + int32x4_t sum = vmlal_lane_s16(offset, s0, y_filter_lo, 0); + sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1); + sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2); + sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3); + sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0); + sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1); + sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2); + sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3); + + return sum; +} + +static INLINE uint16x4_t highbd_convolve8_4_sr_s32_s16( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t shift_s32, const int32x4_t offset) { + int32x4_t sum = + highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset); + + sum = vqrshlq_s32(sum, shift_s32); + return vqmovun_s32(sum); +} + +// Like above but also perform round shifting and subtract correction term +static INLINE uint16x4_t highbd_convolve8_4_srsub_s32_s16( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter, + const int32x4_t round_shift, const int32x4_t offset, + const int32x4_t correction) { + int32x4_t sum = + highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset); + + sum = vsubq_s32(vqrshlq_s32(sum, round_shift), correction); + return vqmovun_s32(sum); +} + +static INLINE void highbd_convolve8_8_s32( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + const int32x4_t offset, int32x4_t *sum0, int32x4_t *sum1) { + const int16x4_t y_filter_lo = vget_low_s16(y_filter); + const int16x4_t y_filter_hi = vget_high_s16(y_filter); + + *sum0 = vmlal_lane_s16(offset, vget_low_s16(s0), y_filter_lo, 0); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2); + *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3); + + *sum1 = vmlal_lane_s16(offset, vget_high_s16(s0), y_filter_lo, 0); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1); + *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2); + 
*sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3); +} + +// Like above but also perform round shifting and subtract correction term +static INLINE uint16x8_t highbd_convolve8_8_srsub_s32_s16( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter, + const int32x4_t round_shift, const int32x4_t offset, + const int32x4_t correction) { + int32x4_t sum0; + int32x4_t sum1; + highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, offset, + &sum0, &sum1); + + sum0 = vsubq_s32(vqrshlq_s32(sum0, round_shift), correction); + sum1 = vsubq_s32(vqrshlq_s32(sum1, round_shift), correction); + + return vcombine_u16(vqmovun_s32(sum0), vqmovun_s32(sum1)); +} + +static INLINE int32x4_t highbd_convolve8_2d_scale_horiz4x8_s32( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t *filters_lo, + const int16x4_t *filters_hi, const int32x4_t offset) { + int16x4_t s_lo[] = { vget_low_s16(s0), vget_low_s16(s1), vget_low_s16(s2), + vget_low_s16(s3) }; + int16x4_t s_hi[] = { vget_high_s16(s0), vget_high_s16(s1), vget_high_s16(s2), + vget_high_s16(s3) }; + + transpose_array_inplace_u16_4x4((uint16x4_t *)s_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)s_hi); + + int32x4_t sum = vmlal_s16(offset, s_lo[0], filters_lo[0]); + sum = vmlal_s16(sum, s_lo[1], filters_lo[1]); + sum = vmlal_s16(sum, s_lo[2], filters_lo[2]); + sum = vmlal_s16(sum, s_lo[3], filters_lo[3]); + sum = vmlal_s16(sum, s_hi[0], filters_hi[0]); + sum = vmlal_s16(sum, s_hi[1], filters_hi[1]); + sum = vmlal_s16(sum, s_hi[2], filters_hi[2]); + sum = vmlal_s16(sum, s_hi[3], filters_hi[3]); + + return sum; +} + +static INLINE uint16x4_t highbd_convolve8_2d_scale_horiz4x8_s32_s16( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t *filters_lo, + const int16x4_t *filters_hi, const int32x4_t shift_s32, + const int32x4_t offset) { + int32x4_t sum = highbd_convolve8_2d_scale_horiz4x8_s32( + s0, s1, s2, s3, filters_lo, filters_hi, offset); + + sum = vqrshlq_s32(sum, shift_s32); + return vqmovun_s32(sum); +} + +#endif // AOM_AV1_COMMON_ARM_HIGHBD_CONVOLVE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c b/third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c new file mode 100644 index 0000000000..702c651536 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_convolve_scale_neon.c @@ -0,0 +1,552 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/arm/highbd_convolve_neon.h" + +static INLINE void highbd_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params, const int round_bits, + const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const int32x4_t round_shift = vdupq_n_s32(-round_bits); + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w <= 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + d0 = vqrshlq_s32(d0, round_shift); + + uint16x4_t d0_u16 = vqmovun_s32(d0); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + if (w == 2) { + store_u16_2x1(dst_ptr, d0_u16); + } else { + vst1_u16(dst_ptr, d0_u16); + } + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); + d0 = vqrshlq_s32(d0, round_shift); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + d1 = vqrshlq_s32(d1, round_shift); + + uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, + uint16_t *dst_ptr, int dst_stride, + int w, int h, + ConvolveParams *conv_params, + const int round_bits, const int offset, + const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const int32x4_t round_shift = vdupq_n_s32(-round_bits); + const uint16x4_t offset_vec = vdup_n_u16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w <= 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + d0 = vqrshlq_s32(d0, round_shift); + + uint16x4_t d0_u16 = vqmovun_s32(d0); + 
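// Clamp to the valid pixel range [0, (1 << bd) - 1] for the current bit depth.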
d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + if (w == 2) { + store_u16_2x1(dst_ptr, d0_u16); + } else { + vst1_u16(dst_ptr, d0_u16); + } + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t d0_lo = + vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + d0_lo = vqrshlq_s32(d0_lo, round_shift); + d0_hi = vqrshlq_s32(d0_hi, round_shift); + + uint16x8_t d0 = vcombine_u16(vqmovun_s32(d0_lo), vqmovun_s32(d0_hi)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_convolve_2d_x_scale_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int subpel_x_qn, const int x_step_qn, + const InterpFilterParams *filter_params, ConvolveParams *conv_params, + const int offset) { + static const uint32_t kIdx[4] = { 0, 1, 2, 3 }; + const uint32x4_t idx = vld1q_u32(kIdx); + const uint32x4_t subpel_mask = vdupq_n_u32(SCALE_SUBPEL_MASK); + const int32x4_t shift_s32 = vdupq_n_s32(-conv_params->round_0); + const int32x4_t offset_s32 = vdupq_n_s32(offset); + + if (w <= 4) { + int height = h; + uint16_t *d = dst_ptr; + + do { + int x_qn = subpel_x_qn; + + // Load 4 src vectors at a time, they might be the same, but we have to + // calculate the indices anyway. Doing it in SIMD and then storing the + // indices is faster than having to calculate the expression + // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times + // Ideally this should be a gather using the indices, but NEON does not + // have that, so have to emulate + const uint32x4_t xqn_idx = vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn); + // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) = + // 2 + const uint32x4_t src_idx_u32 = + vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1); +#if AOM_ARCH_AARCH64 + uint64x2_t src4[2]; + src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr), + vget_low_u32(src_idx_u32)); + src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)src_ptr), + vget_high_u32(src_idx_u32)); + int16_t *src4_ptr[4]; + uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; + vst1q_u64(tmp_ptr, src4[0]); + vst1q_u64(tmp_ptr + 2, src4[1]); +#else + uint32x4_t src4; + src4 = vaddq_u32(vdupq_n_u32((const uint32_t)src_ptr), src_idx_u32); + int16_t *src4_ptr[4]; + uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; + vst1q_u32(tmp_ptr, src4); +#endif // AOM_ARCH_AARCH64 + // Same for the filter vectors + const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32( + vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS)); + int32_t x_filter4_idx[4]; + vst1q_s32(x_filter4_idx, filter_idx_s32); + const int16_t *x_filter4_ptr[4]; + + // Load source + int16x8_t s0 = vld1q_s16(src4_ptr[0]); + int16x8_t s1 = vld1q_s16(src4_ptr[1]); + int16x8_t s2 = vld1q_s16(src4_ptr[2]); + int16x8_t s3 = vld1q_s16(src4_ptr[3]); + + // We could easily do this using SIMD as well instead of calling the + // inline function 4 times. 
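+      // Fetch the 8-tap kernel selected by each of the four sub-pixel phases
+      // computed above (some of the four may be duplicates).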
+ x_filter4_ptr[0] = + av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[0]); + x_filter4_ptr[1] = + av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[1]); + x_filter4_ptr[2] = + av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[2]); + x_filter4_ptr[3] = + av1_get_interp_filter_subpel_kernel(filter_params, x_filter4_idx[3]); + + // Actually load the filters + const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); + const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); + const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); + const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); + + // Group low and high parts and transpose + int16x4_t filters_lo[] = { vget_low_s16(x_filter0), + vget_low_s16(x_filter1), + vget_low_s16(x_filter2), + vget_low_s16(x_filter3) }; + int16x4_t filters_hi[] = { vget_high_s16(x_filter0), + vget_high_s16(x_filter1), + vget_high_s16(x_filter2), + vget_high_s16(x_filter3) }; + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); + + // Run the 2D Scale convolution + uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( + s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); + + if (w == 2) { + store_u16_2x1(d, d0); + } else { + vst1_u16(d, d0); + } + + src_ptr += src_stride; + d += dst_stride; + height--; + } while (height > 0); + } else { + int height = h; + + do { + int width = w; + int x_qn = subpel_x_qn; + uint16_t *d = dst_ptr; + const uint16_t *s = src_ptr; + + do { + // Load 4 src vectors at a time, they might be the same, but we have to + // calculate the indices anyway. Doing it in SIMD and then storing the + // indices is faster than having to calculate the expression + // &src_ptr[((x_qn + 0*x_step_qn) >> SCALE_SUBPEL_BITS)] 4 times + // Ideally this should be a gather using the indices, but NEON does not + // have that, so have to emulate + const uint32x4_t xqn_idx = + vmlaq_n_u32(vdupq_n_u32(x_qn), idx, x_step_qn); + // We have to multiply x2 to get the actual pointer as sizeof(uint16_t) + // = 2 + const uint32x4_t src_idx_u32 = + vshlq_n_u32(vshrq_n_u32(xqn_idx, SCALE_SUBPEL_BITS), 1); +#if AOM_ARCH_AARCH64 + uint64x2_t src4[2]; + src4[0] = vaddw_u32(vdupq_n_u64((const uint64_t)s), + vget_low_u32(src_idx_u32)); + src4[1] = vaddw_u32(vdupq_n_u64((const uint64_t)s), + vget_high_u32(src_idx_u32)); + int16_t *src4_ptr[4]; + uint64_t *tmp_ptr = (uint64_t *)&src4_ptr; + vst1q_u64(tmp_ptr, src4[0]); + vst1q_u64(tmp_ptr + 2, src4[1]); +#else + uint32x4_t src4; + src4 = vaddq_u32(vdupq_n_u32((const uint32_t)s), src_idx_u32); + int16_t *src4_ptr[4]; + uint32_t *tmp_ptr = (uint32_t *)&src4_ptr; + vst1q_u32(tmp_ptr, src4); +#endif // AOM_ARCH_AARCH64 + // Same for the filter vectors + const int32x4_t filter_idx_s32 = vreinterpretq_s32_u32( + vshrq_n_u32(vandq_u32(xqn_idx, subpel_mask), SCALE_EXTRA_BITS)); + int32_t x_filter4_idx[4]; + vst1q_s32(x_filter4_idx, filter_idx_s32); + const int16_t *x_filter4_ptr[4]; + + // Load source + int16x8_t s0 = vld1q_s16(src4_ptr[0]); + int16x8_t s1 = vld1q_s16(src4_ptr[1]); + int16x8_t s2 = vld1q_s16(src4_ptr[2]); + int16x8_t s3 = vld1q_s16(src4_ptr[3]); + + // We could easily do this using SIMD as well instead of calling the + // inline function 4 times. 
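+        // Fetch the 8-tap kernel selected by each of the four sub-pixel
+        // phases computed above (some of the four may be duplicates).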
+ x_filter4_ptr[0] = av1_get_interp_filter_subpel_kernel( + filter_params, x_filter4_idx[0]); + x_filter4_ptr[1] = av1_get_interp_filter_subpel_kernel( + filter_params, x_filter4_idx[1]); + x_filter4_ptr[2] = av1_get_interp_filter_subpel_kernel( + filter_params, x_filter4_idx[2]); + x_filter4_ptr[3] = av1_get_interp_filter_subpel_kernel( + filter_params, x_filter4_idx[3]); + + // Actually load the filters + const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]); + const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]); + const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]); + const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]); + + // Group low and high parts and transpose + int16x4_t filters_lo[] = { vget_low_s16(x_filter0), + vget_low_s16(x_filter1), + vget_low_s16(x_filter2), + vget_low_s16(x_filter3) }; + int16x4_t filters_hi[] = { vget_high_s16(x_filter0), + vget_high_s16(x_filter1), + vget_high_s16(x_filter2), + vget_high_s16(x_filter3) }; + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo); + transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi); + + // Run the 2D Scale X convolution + uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16( + s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); + + vst1_u16(d, d0); + + x_qn += 4 * x_step_qn; + d += 4; + width -= 4; + } while (width > 0); + + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } while (height > 0); + } +} + +static INLINE void highbd_convolve_2d_y_scale_8tap_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int subpel_y_qn, const int y_step_qn, + const InterpFilterParams *filter_params, const int round1_bits, + const int offset) { + const int32x4_t offset_s32 = vdupq_n_s32(1 << offset); + + const int32x4_t round1_shift_s32 = vdupq_n_s32(-round1_bits); + if (w <= 4) { + int height = h; + uint16_t *d = dst_ptr; + int y_qn = subpel_y_qn; + + do { + const int16_t *s = + (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_4x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + const int16_t *y_filter_ptr = + av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + uint16x4_t d0 = highbd_convolve8_4_srsub_s32_s16( + s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, + offset_s32, vdupq_n_s32(0)); + + if (w == 2) { + store_u16_2x1(d, d0); + } else { + vst1_u16(d, d0); + } + + y_qn += y_step_qn; + d += dst_stride; + height--; + } while (height > 0); + } else { + int width = w; + + do { + int height = h; + int y_qn = subpel_y_qn; + + uint16_t *d = dst_ptr; + + do { + const int16_t *s = + (const int16_t *)&src_ptr[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + const int16_t *y_filter_ptr = + av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx); + const int16x8_t y_filter = vld1q_s16(y_filter_ptr); + + uint16x8_t d0 = highbd_convolve8_8_srsub_s32_s16( + s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round1_shift_s32, + offset_s32, vdupq_n_s32(0)); + vst1q_u16(d, d0); + + y_qn += y_step_qn; + d += dst_stride; + height--; + } while (height > 0); + src_ptr += 8; + dst_ptr += 8; + width -= 8; + } while (width 
> 0); + } +} + +static INLINE void highbd_convolve_correct_offset_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, const int round_bits, const int offset, const int bd) { + const int32x4_t round_shift_s32 = vdupq_n_s32(-round_bits); + const int16x4_t offset_s16 = vdup_n_s16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w <= 4) { + for (int y = 0; y < h; ++y) { + const int16x4_t s = vld1_s16((const int16_t *)src_ptr + y * src_stride); + const int32x4_t d0 = + vqrshlq_s32(vsubl_s16(s, offset_s16), round_shift_s32); + uint16x4_t d = vqmovun_s32(d0); + d = vmin_u16(d, vget_low_u16(max)); + if (w == 2) { + store_u16_2x1(dst_ptr + y * dst_stride, d); + } else { + vst1_u16(dst_ptr + y * dst_stride, d); + } + } + } else { + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; x += 8) { + // Subtract round offset and convolve round + const int16x8_t s = + vld1q_s16((const int16_t *)src_ptr + y * src_stride + x); + const int32x4_t d0 = vqrshlq_s32(vsubl_s16(vget_low_s16(s), offset_s16), + round_shift_s32); + const int32x4_t d1 = vqrshlq_s32( + vsubl_s16(vget_high_s16(s), offset_s16), round_shift_s32); + uint16x8_t d01 = vcombine_u16(vqmovun_s32(d0), vqmovun_s32(d1)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst_ptr + y * dst_stride + x, d01); + } + } + } +} + +void av1_highbd_convolve_2d_scale_neon( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd) { + uint16_t *im_block = (uint16_t *)aom_memalign( + 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP)); + if (!im_block) return; + uint16_t *im_block2 = (uint16_t *)aom_memalign( + 16, 2 * sizeof(uint16_t) * MAX_SB_SIZE * (MAX_SB_SIZE + MAX_FILTER_TAP)); + if (!im_block2) { + aom_free(im_block); // free the first block and return. 
+ return; + } + + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + const int im_stride = MAX_SB_SIZE; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + + const int vert_offset = filter_params_y->taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int x_offset_bits = (1 << (bd + FILTER_BITS - 1)); + const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int y_offset_correction = + ((1 << (y_offset_bits - conv_params->round_1)) + + (1 << (y_offset_bits - conv_params->round_1 - 1))); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + + const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + highbd_convolve_2d_x_scale_8tap_neon( + src_ptr, src_stride, im_block, im_stride, w, im_h, subpel_x_qn, x_step_qn, + filter_params_x, conv_params, x_offset_bits); + if (conv_params->is_compound && !conv_params->do_average) { + highbd_convolve_2d_y_scale_8tap_neon( + im_block, im_stride, dst16, dst16_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params->round_1, y_offset_bits); + } else { + highbd_convolve_2d_y_scale_8tap_neon( + im_block, im_stride, im_block2, im_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params->round_1, y_offset_bits); + } + + // Do the compound averaging outside the loop, avoids branching within the + // main loop + if (conv_params->is_compound) { + if (conv_params->do_average) { + if (conv_params->use_dist_wtd_comp_avg) { + highbd_dist_wtd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, + h, conv_params, bits, y_offset_correction, + bd); + } else { + highbd_comp_avg_neon(im_block2, im_stride, dst, dst_stride, w, h, + conv_params, bits, y_offset_correction, bd); + } + } + } else { + highbd_convolve_correct_offset_neon(im_block2, im_stride, dst, dst_stride, + w, h, bits, y_offset_correction, bd); + } + aom_free(im_block); + aom_free(im_block2); +} diff --git a/third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c b/third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c new file mode 100644 index 0000000000..84bc8fd963 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_inv_txfm_neon.c @@ -0,0 +1,5994 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you canzip + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#if AOM_ARCH_AARCH64 +#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ + do { \ + int32x4x2_t swap_low = vtrnq_s32(x0, x1); \ + int32x4x2_t swap_high = vtrnq_s32(x2, x3); \ + y0 = vreinterpretq_s32_s64( \ + vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \ + vreinterpretq_s64_s32(swap_high.val[0]))); \ + y1 = vreinterpretq_s32_s64( \ + vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \ + vreinterpretq_s64_s32(swap_high.val[1]))); \ + y2 = vreinterpretq_s32_s64( \ + vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \ + vreinterpretq_s64_s32(swap_high.val[0]))); \ + y3 = vreinterpretq_s32_s64( \ + vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \ + vreinterpretq_s64_s32(swap_high.val[1]))); \ + } while (0) +#else +#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ + do { \ + int32x4x2_t swap_low = vtrnq_s32(x0, x1); \ + int32x4x2_t swap_high = vtrnq_s32(x2, x3); \ + y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2), \ + swap_high.val[0], 2); \ + y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2), \ + swap_high.val[1], 2); \ + y2 = vextq_s32(swap_low.val[0], \ + vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \ + y3 = vextq_s32(swap_low.val[1], \ + vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \ + } while (0) +#endif // AOM_ARCH_AARCH64 + +static INLINE void transpose_4x4(const int32x4_t *in, int32x4_t *out) { + TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]); +} + +static INLINE void transpose_8x8(const int32x4_t *in, int32x4_t *out) { + TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]); + TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]); + TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]); + TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13], + out[15]); +} + +static INLINE void round_shift_array_32_neon(int32x4_t *input, + int32x4_t *output, const int size, + const int bit) { + const int32x4_t v_bit = vdupq_n_s32(-bit); + for (int i = 0; i < size; i++) { + output[i] = vrshlq_s32(input[i], v_bit); + } +} + +static INLINE void round_shift_rect_array_32_neon(int32x4_t *input, + int32x4_t *output, + const int size) { + for (int i = 0; i < size; i++) { + const int32x4_t r0 = vmulq_n_s32(input[i], NewInvSqrt2); + output[i] = vrshrq_n_s32(r0, NewSqrt2Bits); + } +} + +static INLINE int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0, + const int32_t *n1, const int32x4_t *w1, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, *n0); + x = vmlaq_n_s32(x, *w1, *n1); + x = vshlq_s32(x, *v_bit); + return x; +} + +static INLINE int32x4_t half_btf_neon_mode11_r( + const int32_t *n0, const int32x4_t *w0, const int32_t *n1, + const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, -*n0); + x = vmlaq_n_s32(x, *w1, -*n1); + x = vshlq_s32(x, *v_bit); + return x; +} + +static INLINE int32x4_t half_btf_neon_mode01_r( + const int32_t *n0, const int32x4_t *w0, const int32_t *n1, + const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, *n0); + x = vmlsq_n_s32(x, *w1, *n1); + x = vshlq_s32(x, *v_bit); + return x; +} + +static INLINE int32x4_t half_btf_neon_mode10_r( + const int32_t 
*n0, const int32x4_t *w0, const int32_t *n1, + const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w1, *n1); + x = vmlsq_n_s32(x, *w0, *n0); + x = vshlq_s32(x, *v_bit); + return x; +} + +static INLINE int32x4_t half_btf_0_neon_r(const int32_t *n0, + const int32x4_t *w0, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, *n0); + x = vshlq_s32(x, *v_bit); + return x; +} + +static INLINE int32x4_t half_btf_0_m_neon_r(const int32_t *n0, + const int32x4_t *w0, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t x; + x = vmlaq_n_s32(*rnding, *w0, -*n0); + x = vshlq_s32(x, *v_bit); + return x; +} + +static INLINE void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit, + const int num_cols); + +typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit, + int32_t do_cols, int32_t bd, + int32_t out_shift); + +static INLINE uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min, + const uint16x8_t *max) { + int16x8_t clamped; + clamped = vminq_s16(vreinterpretq_s16_u16(*u), vreinterpretq_s16_u16(*max)); + clamped = vmaxq_s16(clamped, vreinterpretq_s16_u16(*min)); + return vreinterpretq_u16_s16(clamped); +} + +static INLINE void round_shift_4x4(int32x4_t *in, int shift) { + if (shift != 0) { + const int32x4_t v_shift = vdupq_n_s32(-shift); + in[0] = vrshlq_s32(in[0], v_shift); + in[1] = vrshlq_s32(in[1], v_shift); + in[2] = vrshlq_s32(in[2], v_shift); + in[3] = vrshlq_s32(in[3], v_shift); + } +} + +static void round_shift_8x8(int32x4_t *in, int shift) { + assert(shift != 0); + const int32x4_t v_shift = vdupq_n_s32(-shift); + in[0] = vrshlq_s32(in[0], v_shift); + in[1] = vrshlq_s32(in[1], v_shift); + in[2] = vrshlq_s32(in[2], v_shift); + in[3] = vrshlq_s32(in[3], v_shift); + in[4] = vrshlq_s32(in[4], v_shift); + in[5] = vrshlq_s32(in[5], v_shift); + in[6] = vrshlq_s32(in[6], v_shift); + in[7] = vrshlq_s32(in[7], v_shift); + in[8] = vrshlq_s32(in[8], v_shift); + in[9] = vrshlq_s32(in[9], v_shift); + in[10] = vrshlq_s32(in[10], v_shift); + in[11] = vrshlq_s32(in[11], v_shift); + in[12] = vrshlq_s32(in[12], v_shift); + in[13] = vrshlq_s32(in[13], v_shift); + in[14] = vrshlq_s32(in[14], v_shift); + in[15] = vrshlq_s32(in[15], v_shift); +} + +static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, int size) { + int32x4_t a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = vmaxq_s32(in[i], *clamp_lo); + out[i] = vminq_s32(a0, *clamp_hi); + + a1 = vmaxq_s32(in[i + 1], *clamp_lo); + out[i + 1] = vminq_s32(a1, *clamp_hi); + + a0 = vmaxq_s32(in[i + 2], *clamp_lo); + out[i + 2] = vminq_s32(a0, *clamp_hi); + + a1 = vmaxq_s32(in[i + 3], *clamp_lo); + out[i + 3] = vminq_s32(a1, *clamp_hi); + } +} + +static INLINE uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred, + int32x4_t res0, + int32x4_t res1, + const int bd) { + const uint16x8_t v_zero = vdupq_n_u16(0); + int32x4_t min_clip_val = vreinterpretq_s32_u16(v_zero); + int32x4_t max_clip_val = vdupq_n_s32((1 << bd) - 1); + uint16x8x2_t x; + x.val[0] = vreinterpretq_u16_s32( + vaddw_s16(res0, vreinterpret_s16_u16(vget_low_u16(pred)))); + x.val[1] = vreinterpretq_u16_s32( + vaddw_s16(res1, vreinterpret_s16_u16(vget_high_u16(pred)))); + x.val[0] = vreinterpretq_u16_s32( + 
vmaxq_s32(vreinterpretq_s32_u16(x.val[0]), min_clip_val)); + x.val[0] = vreinterpretq_u16_s32( + vminq_s32(vreinterpretq_s32_u16(x.val[0]), max_clip_val)); + x.val[1] = vreinterpretq_u16_s32( + vmaxq_s32(vreinterpretq_s32_u16(x.val[1]), min_clip_val)); + x.val[1] = vreinterpretq_u16_s32( + vminq_s32(vreinterpretq_s32_u16(x.val[1]), max_clip_val)); + uint16x8_t res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])), + vqmovn_u32(vreinterpretq_u32_u16(x.val[1]))); + return res; +} + +static INLINE uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred, + int32x4_t res0, + const int bd) { + uint16x4_t x0_ = vreinterpret_u16_s16( + vmovn_s32(vaddw_s16(res0, vreinterpret_s16_u16(pred)))); + uint16x8_t x0 = vcombine_u16(x0_, x0_); + const uint16x8_t vmin = vdupq_n_u16(0); + const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1); + x0 = highbd_clamp_u16(&x0, &vmin, &vmax); + return vget_low_u16(x0); +} + +static INLINE void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + uint16x4_t v = vld1_u16(output + i * stride); + uint16x4_t u = highbd_get_recon_4xn_neon(v, in[j], bd); + + vst1_u16(output + i * stride, u); + } +} + +static INLINE void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + uint16x8_t v = vld1q_u16(output + i * stride); + uint16x8_t u = highbd_get_recon_8x8_neon(v, in[j], in[j + height], bd); + + vst1q_u16(output + i * stride, u); + } +} + +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + int32x4_t *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = vld1q_s32(in + i * stride); + } +} + +static INLINE void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) { + in[0] = vld1q_s32(coeff + 0); + in[1] = vld1q_s32(coeff + 4); + in[2] = vld1q_s32(coeff + 8); + in[3] = vld1q_s32(coeff + 12); +} + +static void addsub_neon(const int32x4_t in0, const int32x4_t in1, + int32x4_t *out0, int32x4_t *out1, + const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) { + int32x4_t a0 = vaddq_s32(in0, in1); + int32x4_t a1 = vsubq_s32(in0, in1); + + a0 = vmaxq_s32(a0, *clamp_lo); + a0 = vminq_s32(a0, *clamp_hi); + a1 = vmaxq_s32(a1, *clamp_lo); + a1 = vminq_s32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_shift) { + int32x4_t in0_w_offset = vrshlq_s32(*in0, *v_shift); + int32x4_t in1_w_offset = vrshlq_s32(*in1, *v_shift); + + in0_w_offset = vmaxq_s32(in0_w_offset, *clamp_lo); + in0_w_offset = vminq_s32(in0_w_offset, *clamp_hi); + in1_w_offset = vmaxq_s32(in1_w_offset, *clamp_lo); + in1_w_offset = vminq_s32(in1_w_offset, *clamp_hi); + + *in0 = in0_w_offset; + *in1 = in1_w_offset; +} + +static INLINE void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2; + temp1 = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30], + v_bit, rnding); + bf1[30] = + half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], v_bit, rnding); + bf1[17] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29], + v_bit, rnding); + 
bf1[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29], + v_bit, rnding); + bf1[18] = temp2; + + temp1 = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26], + v_bit, rnding); + bf1[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], v_bit, + rnding); + bf1[21] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25], + v_bit, rnding); + bf1[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25], + v_bit, rnding); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2; + temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[9], &cospi[48], &bf1[14], + v_bit, rnding); + bf1[14] = + half_btf_neon_r(&cospi[48], &bf1[9], &cospi[16], &bf1[14], v_bit, rnding); + bf1[9] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[10], &cospi[16], &bf1[13], + v_bit, rnding); + bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf1[10], &cospi[48], &bf1[13], + v_bit, rnding); + bf1[10] = temp2; + + addsub_neon(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_neon(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_neon(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_neon(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_neon(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_neon(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_neon(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_neon(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2; + temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], + v_bit, rnding); + bf1[6] = + half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding); + bf1[5] = temp1; + + addsub_neon(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_neon(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_neon(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_neon(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29], + v_bit, rnding); + bf1[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], v_bit, + rnding); + bf1[18] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28], + v_bit, rnding); + bf1[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], v_bit, + rnding); + bf1[19] = temp2; + temp1 = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27], + v_bit, rnding); + bf1[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27], + v_bit, rnding); + bf1[20] = temp1; + temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26], + v_bit, rnding); + bf1[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26], + v_bit, rnding); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2; + addsub_neon(bf1[0], bf1[7], bf1 + 0, 
bf1 + 7, clamp_lo, clamp_hi); + addsub_neon(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_neon(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_neon(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], + v_bit, rnding); + bf1[13] = half_btf_neon_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], v_bit, + rnding); + bf1[10] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], + v_bit, rnding); + bf1[12] = half_btf_neon_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], v_bit, + rnding); + bf1[11] = temp2; + + addsub_neon(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_neon(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_neon(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_neon(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_neon(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_neon(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_neon(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_neon(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2; + addsub_neon(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_neon(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_neon(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_neon(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_neon(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_neon(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_neon(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_neon(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], + v_bit, rnding); + bf1[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], v_bit, + rnding); + bf1[20] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], + v_bit, rnding); + bf1[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], v_bit, + rnding); + bf1[21] = temp2; + temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], + v_bit, rnding); + bf1[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], v_bit, + rnding); + bf1[22] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], + v_bit, rnding); + bf1[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], v_bit, + rnding); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out, + const int do_cols, const int bd, + const int out_shift, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi) { + addsub_neon(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_neon(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_neon(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_neon(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_neon(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_neon(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_neon(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_neon(bf1[7], bf1[24], 
out + 7, out + 24, clamp_lo, clamp_hi); + addsub_neon(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_neon(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_neon(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_neon(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_neon(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_neon(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_neon(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_neon(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 32; i += 8) { + round_shift_4x4(out + i, out_shift); + round_shift_4x4(out + i + 4, out_shift); + } + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +static void neg_shift_neon(const int32x4_t *in0, const int32x4_t *in1, + int32x4_t *out0, int32x4_t *out1, + const int32x4_t *clamp_lo, const int32x4_t *clamp_hi, + const int32x4_t *v_shift, int32x4_t *offset) { + int32x4_t a0 = vaddq_s32(*offset, *in0); + int32x4_t a1 = vsubq_s32(*offset, *in1); + + a0 = vshlq_s32(a0, *v_shift); + a1 = vshlq_s32(a1, *v_shift); + + a0 = vmaxq_s32(a0, *clamp_lo); + a0 = vminq_s32(a0, *clamp_hi); + a1 = vmaxq_s32(a1, *clamp_lo); + a1 = vminq_s32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void idct4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + int32x4_t u0, u1, u2, u3; + int32x4_t v0, v1, v2, v3, x, y; + + // Stage 0-1-2 + + u0 = in[0]; + u1 = in[1]; + u2 = in[2]; + u3 = in[3]; + + const int32x4_t v_bit = vdupq_n_s32(-bit); + + x = vmlaq_n_s32(rnding, u0, cospi[32]); + y = vmulq_n_s32(u2, cospi[32]); + v0 = vaddq_s32(x, y); + v0 = vshlq_s32(v0, v_bit); + + v1 = vsubq_s32(x, y); + v1 = vshlq_s32(v1, v_bit); + + x = vmlaq_n_s32(rnding, u1, cospi[48]); + v2 = vmlsq_n_s32(x, u3, cospi[16]); + v2 = vshlq_s32(v2, v_bit); + + x = vmlaq_n_s32(rnding, u1, cospi[16]); + v3 = vmlaq_n_s32(x, u3, cospi[48]); + v3 = vshlq_s32(v3, v_bit); + // Stage 3 + addsub_neon(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); + addsub_neon(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); + + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + shift_and_clamp_neon(out + 0, out + 3, &clamp_lo, &clamp_hi, &v_shift); + shift_and_clamp_neon(out + 1, out + 2, &clamp_lo, &clamp_hi, &v_shift); + } +} + +static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *sinpi = sinpi_arr(bit); + const int32x4_t zero = vdupq_n_s32(0); + int64x2_t rnding = vdupq_n_s64(1ll << (bit + 4 - 1)); + const int32x2_t mul = vdup_n_s32(1 << 4); + int32x4_t t; + int32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + int32x4_t x0, x1, x2, x3; + int32x4_t u0, u1, u2, u3; + + x0 = in[0]; + x1 = in[1]; + x2 = in[2]; + x3 = in[3]; + + s0 = vmulq_n_s32(x0, sinpi[1]); + 
s1 = vmulq_n_s32(x0, sinpi[2]); + s2 = vmulq_n_s32(x1, sinpi[3]); + s3 = vmulq_n_s32(x2, sinpi[4]); + s4 = vmulq_n_s32(x2, sinpi[1]); + s5 = vmulq_n_s32(x3, sinpi[2]); + s6 = vmulq_n_s32(x3, sinpi[4]); + t = vsubq_s32(x0, x2); + s7 = vaddq_s32(t, x3); + + t = vaddq_s32(s0, s3); + s0 = vaddq_s32(t, s5); + t = vsubq_s32(s1, s4); + s1 = vsubq_s32(t, s6); + s3 = s2; + s2 = vmulq_n_s32(s7, sinpi[3]); + + u0 = vaddq_s32(s0, s3); + u1 = vaddq_s32(s1, s3); + u2 = s2; + t = vaddq_s32(s0, s1); + u3 = vsubq_s32(t, s3); + + // u0 + int32x4x2_t u0x; + u0x.val[0] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul)); + u0x.val[0] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u0x.val[0]), rnding)); + + u0 = vextq_s32(u0, zero, 1); + u0x.val[1] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul)); + u0x.val[1] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u0x.val[1]), rnding)); + + u0x.val[0] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u0x.val[0]), vreinterpretq_s16_s32(zero), 1)); + u0x.val[1] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u0x.val[1]), vreinterpretq_s16_s32(zero), 1)); + + u0x = vzipq_s32(u0x.val[0], u0x.val[1]); +#if AOM_ARCH_AARCH64 + u0 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]), + vreinterpretq_s64_s32(u0x.val[1]))); +#else + u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1])); +#endif // AOM_ARCH_AARCH64 + // u1 + int32x4x2_t u1x; + u1x.val[0] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul)); + u1x.val[0] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u1x.val[0]), rnding)); + + u1 = vextq_s32(u1, zero, 1); + u1x.val[1] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul)); + u1x.val[1] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u1x.val[1]), rnding)); + + u1x.val[0] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u1x.val[0]), vreinterpretq_s16_s32(zero), 1)); + u1x.val[1] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u1x.val[1]), vreinterpretq_s16_s32(zero), 1)); + + u1x = vzipq_s32(u1x.val[0], u1x.val[1]); +#if AOM_ARCH_AARCH64 + u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]), + vreinterpretq_s64_s32(u1x.val[1]))); +#else + u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1])); +#endif // AOM_ARCH_AARCH64 + + // u2 + int32x4x2_t u2x; + u2x.val[0] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul)); + u2x.val[0] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u2x.val[0]), rnding)); + + u2 = vextq_s32(u2, zero, 1); + u2x.val[1] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul)); + u2x.val[1] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u2x.val[1]), rnding)); + + u2x.val[0] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u2x.val[0]), vreinterpretq_s16_s32(zero), 1)); + u2x.val[1] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u2x.val[1]), vreinterpretq_s16_s32(zero), 1)); + + u2x = vzipq_s32(u2x.val[0], u2x.val[1]); +#if AOM_ARCH_AARCH64 + u2 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]), + vreinterpretq_s64_s32(u2x.val[1]))); +#else + u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1])); +#endif // AOM_ARCH_AARCH64 + + // u3 + int32x4x2_t u3x; + u3x.val[0] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul)); + 
u3x.val[0] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u3x.val[0]), rnding)); + + u3 = vextq_s32(u3, zero, 1); + u3x.val[1] = vreinterpretq_s32_s64( + vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul)); + u3x.val[1] = vreinterpretq_s32_s64( + vaddq_s64(vreinterpretq_s64_s32(u3x.val[1]), rnding)); + + u3x.val[0] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u3x.val[0]), vreinterpretq_s16_s32(zero), 1)); + u3x.val[1] = vreinterpretq_s32_s16(vextq_s16( + vreinterpretq_s16_s32(u3x.val[1]), vreinterpretq_s16_s32(zero), 1)); + + u3x = vzipq_s32(u3x.val[0], u3x.val[1]); +#if AOM_ARCH_AARCH64 + u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]), + vreinterpretq_s64_s32(u3x.val[1]))); +#else + u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1])); +#endif // AOM_ARCH_AARCH64 + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4); + } +} + +static void write_buffer_4x4(int32x4_t *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + uint32x4_t u0, u1, u2, u3; + uint16x4_t v0, v1, v2, v3; + round_shift_4x4(in, shift); + + v0 = vld1_u16(output + 0 * stride); + v1 = vld1_u16(output + 1 * stride); + v2 = vld1_u16(output + 2 * stride); + v3 = vld1_u16(output + 3 * stride); + + if (fliplr) { + u0 = vrev64q_u32(vreinterpretq_u32_s32(in[0])); + in[0] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); + u0 = vrev64q_u32(vreinterpretq_u32_s32(in[1])); + in[1] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); + u0 = vrev64q_u32(vreinterpretq_u32_s32(in[2])); + in[2] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); + u0 = vrev64q_u32(vreinterpretq_u32_s32(in[3])); + in[3] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2)); + } + + if (flipud) { + u0 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v0); + u1 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v1); + u2 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v2); + u3 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v3); + } else { + u0 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v0); + u1 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v1); + u2 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v2); + u3 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v3); + } + + uint16x8_t u4 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1)); + uint16x8_t u5 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3)); + const uint16x8_t vmin = vdupq_n_u16(0); + const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1); + u4 = highbd_clamp_u16(&u4, &vmin, &vmax); + u5 = highbd_clamp_u16(&u5, &vmin, &vmax); + + vst1_u16(output + 0 * stride, vget_low_u16(u4)); + vst1_u16(output + 1 * stride, vget_high_u16(u4)); + vst1_u16(output + 2 * stride, vget_low_u16(u5)); + vst1_u16(output + 3 * stride, vget_high_u16(u5)); +} + +static void iidentity4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + int32x4_t zero = vdupq_n_s32(0); + int32x2_t fact = vdup_n_s32(NewSqrt2); + int32x4x2_t a0; + const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1)); + + for (int i = 0; i < 4; i++) { + a0.val[0] = vreinterpretq_s32_s64( + vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact)); + a0.val[0] = vreinterpretq_s32_s64( + vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits)); + a0.val[1] = vextq_s32(in[i], zero, 
1); + a0.val[1] = vreinterpretq_s32_s64( + vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact)); + a0.val[1] = vreinterpretq_s32_s64( + vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits)); + + a0 = vzipq_s32(a0.val[0], a0.val[1]); +#if AOM_ARCH_AARCH64 + out[i] = vreinterpretq_s32_s64(vzip1q_s64( + vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1]))); +#else + out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2); +#endif + } + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4); + } +} + +void av1_inv_txfm2d_add_4x4_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + int32x4_t in[4]; + + const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, in); + idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_4x4(input, in); + idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in); + idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case IDTX: + load_buffer_4x4(input, in); + iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_DCT: + load_buffer_4x4(input, in); + iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + idct4x4_neon(in, in, 
INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_DCT: + load_buffer_4x4(input, in); + idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_ADST: + load_buffer_4x4(input, in); + iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_ADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_FLIPADST: + load_buffer_4x4(input, in); + iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case H_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0); + transpose_4x4(in, in); + iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + default: assert(0); + } +} + +// 8x8 +static void load_buffer_8x8(const int32_t *coeff, int32x4_t *in) { + in[0] = vld1q_s32(coeff + 0); + in[1] = vld1q_s32(coeff + 4); + in[2] = vld1q_s32(coeff + 8); + in[3] = vld1q_s32(coeff + 12); + in[4] = vld1q_s32(coeff + 16); + in[5] = vld1q_s32(coeff + 20); + in[6] = vld1q_s32(coeff + 24); + in[7] = vld1q_s32(coeff + 28); + in[8] = vld1q_s32(coeff + 32); + in[9] = vld1q_s32(coeff + 36); + in[10] = vld1q_s32(coeff + 40); + in[11] = vld1q_s32(coeff + 44); + in[12] = vld1q_s32(coeff + 48); + in[13] = vld1q_s32(coeff + 52); + in[14] = vld1q_s32(coeff + 56); + in[15] = vld1q_s32(coeff + 60); +} + +static void idct8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; + int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; + int32x4_t x, y; + int col; + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + const int32x4_t v_bit = vdupq_n_s32(-bit); + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
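+  // Each int32x4_t holds 4 coefficients, so the 8x8 block is transformed in
+  // two passes of the loop below: col = 0 processes the even-indexed vectors
+  // (left 4 lanes of every row), col = 1 the odd-indexed ones.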
+ for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + // stage 2 + u0 = in[0 * 2 + col]; + u1 = in[4 * 2 + col]; + u2 = in[2 * 2 + col]; + u3 = in[6 * 2 + col]; + + x = vmulq_n_s32(in[1 * 2 + col], cospi[56]); + u4 = vmlaq_n_s32(x, in[7 * 2 + col], -cospi[8]); + u4 = vaddq_s32(u4, rnding); + u4 = vshlq_s32(u4, v_bit); + + x = vmulq_n_s32(in[1 * 2 + col], cospi[8]); + u7 = vmlaq_n_s32(x, in[7 * 2 + col], cospi[56]); + u7 = vaddq_s32(u7, rnding); + u7 = vshlq_s32(u7, v_bit); + + x = vmulq_n_s32(in[5 * 2 + col], cospi[24]); + u5 = vmlaq_n_s32(x, in[3 * 2 + col], -cospi[40]); + u5 = vaddq_s32(u5, rnding); + u5 = vshlq_s32(u5, v_bit); + + x = vmulq_n_s32(in[5 * 2 + col], cospi[40]); + u6 = vmlaq_n_s32(x, in[3 * 2 + col], cospi[24]); + u6 = vaddq_s32(u6, rnding); + u6 = vshlq_s32(u6, v_bit); + + // stage 3 + x = vmulq_n_s32(u0, cospi[32]); + y = vmulq_n_s32(u1, cospi[32]); + v0 = vaddq_s32(x, y); + v0 = vaddq_s32(v0, rnding); + v0 = vshlq_s32(v0, v_bit); + + v1 = vsubq_s32(x, y); + v1 = vaddq_s32(v1, rnding); + v1 = vshlq_s32(v1, v_bit); + + x = vmulq_n_s32(u2, cospi[48]); + v2 = vmlaq_n_s32(x, u3, -cospi[16]); + v2 = vaddq_s32(v2, rnding); + v2 = vshlq_s32(v2, v_bit); + + x = vmulq_n_s32(u2, cospi[16]); + v3 = vmlaq_n_s32(x, u3, cospi[48]); + v3 = vaddq_s32(v3, rnding); + v3 = vshlq_s32(v3, v_bit); + + addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = vmulq_n_s32(v5, cospi[32]); + y = vmulq_n_s32(v6, cospi[32]); + u6 = vaddq_s32(y, x); + u6 = vaddq_s32(u6, rnding); + u6 = vshlq_s32(u6, v_bit); + + u5 = vsubq_s32(y, x); + u5 = vaddq_s32(u5, rnding); + u5 = vshlq_s32(u5, v_bit); + + // stage 5 + addsub_neon(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_neon(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_neon(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_neon(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int32x4_t kZero = vdupq_n_s32(0); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u[8], v[8], x; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-1-2 + // (1) + u[0] = vmlaq_n_s32(rnding, in[14], cospi[4]); + u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]); + u[0] = vshlq_s32(u[0], v_bit); + + u[1] = vmlaq_n_s32(rnding, in[14], cospi[60]); + u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]); + u[1] = vshlq_s32(u[1], v_bit); + + // (2) + u[2] = vmlaq_n_s32(rnding, in[10], cospi[20]); + u[2] = vmlaq_n_s32(u[2], in[4], cospi[44]); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vmlaq_n_s32(rnding, in[10], cospi[44]); + u[3] = vmlsq_n_s32(u[3], in[4], cospi[20]); + u[3] = vshlq_s32(u[3], v_bit); + + // (3) + u[4] = vmlaq_n_s32(rnding, in[6], cospi[36]); + u[4] = vmlaq_n_s32(u[4], in[8], cospi[28]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, in[6], cospi[28]); + u[5] = vmlsq_n_s32(u[5], in[8], cospi[36]); + u[5] = vshlq_s32(u[5], v_bit); + + // (4) + u[6] = vmlaq_n_s32(rnding, in[2], cospi[52]); + u[6] = vmlaq_n_s32(u[6], in[12], cospi[12]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, in[2], cospi[12]); + u[7] = vmlsq_n_s32(u[7], in[12], cospi[52]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 3 + addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]); + u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]); + u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]); + u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, v[7], cospi[48]); + u[7] = vmlaq_n_s32(u[7], v[6], cospi[16]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 5 + addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]); + x = vmulq_n_s32(v[3], cospi[32]); + u[2] = vaddq_s32(v[0], x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vsubq_s32(v[0], x); + u[3] = vshlq_s32(u[3], v_bit); + + v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]); + x = vmulq_n_s32(v[7], cospi[32]); + u[6] = vaddq_s32(v[0], x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(v[0], x); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[2] = vsubq_s32(kZero, u[4]); + out[4] = u[6]; + out[6] = vsubq_s32(kZero, u[2]); + out[8] = u[3]; + out[10] = vsubq_s32(kZero, u[7]); + out[12] = u[5]; + out[14] = vsubq_s32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + 
neg_shift_neon(&u[0], &u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[6], &u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[3], &u[7], out + 8, out + 10, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[5], &u[1], out + 12, out + 14, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } + + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = vmlaq_n_s32(rnding, in[15], cospi[4]); + u[0] = vmlaq_n_s32(u[0], in[1], cospi[60]); + u[0] = vshlq_s32(u[0], v_bit); + + u[1] = vmlaq_n_s32(rnding, in[15], cospi[60]); + u[1] = vmlsq_n_s32(u[1], in[1], cospi[4]); + u[1] = vshlq_s32(u[1], v_bit); + + // (2) + u[2] = vmlaq_n_s32(rnding, in[11], cospi[20]); + u[2] = vmlaq_n_s32(u[2], in[5], cospi[44]); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vmlaq_n_s32(rnding, in[11], cospi[44]); + u[3] = vmlsq_n_s32(u[3], in[5], cospi[20]); + u[3] = vshlq_s32(u[3], v_bit); + + // (3) + u[4] = vmlaq_n_s32(rnding, in[7], cospi[36]); + u[4] = vmlaq_n_s32(u[4], in[9], cospi[28]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, in[7], cospi[28]); + u[5] = vmlsq_n_s32(u[5], in[9], cospi[36]); + u[5] = vshlq_s32(u[5], v_bit); + + // (4) + u[6] = vmlaq_n_s32(rnding, in[3], cospi[52]); + u[6] = vmlaq_n_s32(u[6], in[13], cospi[12]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, in[3], cospi[12]); + u[7] = vmlsq_n_s32(u[7], in[13], cospi[52]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 3 + addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]); + u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]); + u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]); + u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]); + u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 5 + addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]); + x = vmulq_n_s32(v[3], cospi[32]); + u[2] = vaddq_s32(v[0], x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vsubq_s32(v[0], x); + u[3] = vshlq_s32(u[3], v_bit); + + v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]); + x = vmulq_n_s32(v[7], cospi[32]); + u[6] = vaddq_s32(v[0], x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(v[0], x); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 7 + if (do_cols) { + out[1] = u[0]; + out[3] = vsubq_s32(kZero, u[4]); + out[5] = u[6]; + out[7] = vsubq_s32(kZero, u[2]); + out[9] = u[3]; + out[11] = vsubq_s32(kZero, u[7]); + out[13] = u[5]; + out[15] = vsubq_s32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const 
int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&u[0], &u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[6], &u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[3], &u[7], out + 9, out + 11, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[5], &u[1], out + 13, out + 15, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } +} + +static void iidentity8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + out[0] = vaddq_s32(in[0], in[0]); + out[1] = vaddq_s32(in[1], in[1]); + out[2] = vaddq_s32(in[2], in[2]); + out[3] = vaddq_s32(in[3], in[3]); + out[4] = vaddq_s32(in[4], in[4]); + out[5] = vaddq_s32(in[5], in[5]); + out[6] = vaddq_s32(in[6], in[6]); + out[7] = vaddq_s32(in[7], in[7]); + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 8); + } +} + +static uint16x8_t get_recon_8x8(const uint16x8_t pred, int32x4_t res_lo, + int32x4_t res_hi, int fliplr, int bd) { + uint16x8x2_t x; + + if (fliplr) { + res_lo = vrev64q_s32(res_lo); + res_lo = vextq_s32(res_lo, res_lo, 2); + res_hi = vrev64q_s32(res_hi); + res_hi = vextq_s32(res_hi, res_hi, 2); + x.val[0] = vreinterpretq_u16_s32( + vaddw_s16(res_hi, vreinterpret_s16_u16(vget_low_u16(pred)))); + x.val[1] = vreinterpretq_u16_s32( + vaddw_s16(res_lo, vreinterpret_s16_u16(vget_high_u16(pred)))); + + } else { + x.val[0] = vreinterpretq_u16_s32( + vaddw_s16(res_lo, vreinterpret_s16_u16(vget_low_u16(pred)))); + x.val[1] = vreinterpretq_u16_s32( + vaddw_s16(res_hi, vreinterpret_s16_u16(vget_high_u16(pred)))); + } + + uint16x8_t x2 = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])), + vqmovn_u32(vreinterpretq_u32_u16(x.val[1]))); + const uint16x8_t vmin = vdupq_n_u16(0); + const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1); + return highbd_clamp_u16(&x2, &vmin, &vmax); +} + +static void write_buffer_8x8(int32x4_t *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + uint16x8_t u0, u1, u2, u3, u4, u5, u6, u7; + uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7; + round_shift_8x8(in, shift); + + v0 = vld1q_u16(output + 0 * stride); + v1 = vld1q_u16(output + 1 * stride); + v2 = vld1q_u16(output + 2 * stride); + v3 = vld1q_u16(output + 3 * stride); + v4 = vld1q_u16(output + 4 * stride); + v5 = vld1q_u16(output + 5 * stride); + v6 = vld1q_u16(output + 6 * stride); + v7 = vld1q_u16(output + 7 * stride); + + if (flipud) { + u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); + u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); + u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); + u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); + u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); + u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); + u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); + u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd); + } else { + u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); + u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); + u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); + u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); 
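// Illustrative note (not part of the upstream change): each get_recon_8x8()
// call here reconstructs one row of 8 pixels: the two int32x4_t residual
// halves are added to the widened prediction (the row is reversed first when
// fliplr is set) and the sums are clamped to the valid pixel range. Rough
// per-pixel scalar sketch:
//   uint16_t rec = (uint16_t)AOMMIN(AOMMAX(pred + res, 0), (1 << bd) - 1);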
+ u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); + u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); + u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); + u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); + } + + vst1q_u16(output + 0 * stride, u0); + vst1q_u16(output + 1 * stride, u1); + vst1q_u16(output + 2 * stride, u2); + vst1q_u16(output + 3 * stride, u3); + vst1q_u16(output + 4 * stride, u4); + vst1q_u16(output + 5 * stride, u5); + vst1q_u16(output + 6 * stride, u6); + vst1q_u16(output + 7 * stride, u7); +} + +void av1_inv_txfm2d_add_8x8_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + int32x4_t in[16], out[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in); + idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_8x8(input, in); + idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in); + idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in); + iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); + break; + default: assert(0); + } +} + +static void idct8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t x; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-1-2-3 + x = vmulq_n_s32(in[0], cospi[32]); + x = vaddq_s32(vshlq_s32(x, v_bit), rnding); + + // stage 4-5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1))); + clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + x = vaddq_s32(x, offset); + x = vshlq_s32(x, vdupq_n_s32(-out_shift)); + } + + x = vmaxq_s32(x, clamp_lo); + x = vminq_s32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} + +static void idct8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; + int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; + int32x4_t x, y; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = vmlaq_n_s32(rnding, in[1], cospi[56]); + u4 = vmlaq_n_s32(x, in[7], -cospi[8]); + u4 = vshlq_s32(u4, v_bit); + + x = vmlaq_n_s32(rnding, in[1], cospi[8]); + u7 = vmlaq_n_s32(x, in[7], cospi[56]); + u7 = vshlq_s32(u7, v_bit); + + x = vmlaq_n_s32(rnding, in[5], cospi[24]); + u5 = vmlaq_n_s32(x, in[3], -cospi[40]); + u5 = vshlq_s32(u5, v_bit); + + x = vmlaq_n_s32(rnding, in[5], cospi[40]); + u6 = vmlaq_n_s32(x, in[3], cospi[24]); + u6 = vshlq_s32(u6, v_bit); + + // stage 3 + x = vmlaq_n_s32(rnding, u0, cospi[32]); + y = vmulq_n_s32(u1, cospi[32]); + v0 = vaddq_s32(x, y); + v0 = vshlq_s32(v0, v_bit); + + v1 = vsubq_s32(x, y); + v1 = vshlq_s32(v1, v_bit); + + x = vmlaq_n_s32(rnding, u2, cospi[48]); + v2 = vmlaq_n_s32(x, u3, -cospi[16]); + v2 = vshlq_s32(v2, v_bit); + + x = vmlaq_n_s32(rnding, u2, cospi[16]); + v3 = vmlaq_n_s32(x, u3, cospi[48]); + v3 = vshlq_s32(v3, v_bit); + + addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = vmulq_n_s32(v5, cospi[32]); + y = vmlaq_n_s32(rnding, v6, cospi[32]); + u6 = vaddq_s32(y, x); + u6 = vshlq_s32(u6, v_bit); + + u5 = vsubq_s32(y, x); + u5 = vshlq_s32(u5, v_bit); + + // stage 5 + addsub_neon(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_neon(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_neon(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_neon(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} + +static void iadst8x8_low1_neon(int32x4_t *in, int32x4_t *out, 
int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + int32x4_t u[8], x; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-2 + + u[0] = vmlaq_n_s32(rnding, in[0], cospi[60]); + u[0] = vshlq_s32(u[0], v_bit); + + u[1] = vmlaq_n_s32(rnding, in[0], cospi[4]); + u[1] = vshlq_s32(vnegq_s32(u[1]), v_bit); + + // stage 3-4 + int32x4_t temp1, temp2; + temp1 = vmlaq_n_s32(rnding, u[0], cospi[16]); + temp1 = vmlaq_n_s32(temp1, u[1], cospi[48]); + temp1 = vshlq_s32(temp1, v_bit); + u[4] = temp1; + + temp2 = vmlaq_n_s32(rnding, u[0], cospi[48]); + u[5] = vmlsq_n_s32(temp2, u[1], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + // stage 5-6 + temp1 = vmlaq_n_s32(rnding, u[0], cospi[32]); + x = vmulq_n_s32(u[1], cospi[32]); + u[2] = vaddq_s32(temp1, x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vsubq_s32(temp1, x); + u[3] = vshlq_s32(u[3], v_bit); + + temp1 = vmlaq_n_s32(rnding, u[4], cospi[32]); + x = vmulq_n_s32(u[5], cospi[32]); + u[6] = vaddq_s32(temp1, x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(temp1, x); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = vnegq_s32(u[4]); + out[2] = u[6]; + out[3] = vnegq_s32(u[2]); + out[4] = u[3]; + out[5] = vnegq_s32(u[7]); + out[6] = u[5]; + out[7] = vnegq_s32(u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + } +} + +static void iadst8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + // const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u[8], v[8], x; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-2 + + u[0] = vmlaq_n_s32(rnding, in[7], cospi[4]); + u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]); + u[0] = vshlq_s32(u[0], v_bit); + + u[1] = vmlaq_n_s32(rnding, in[7], cospi[60]); + u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]); + u[1] = vshlq_s32(u[1], v_bit); + + // (2) + u[2] = vmlaq_n_s32(rnding, in[5], cospi[20]); + u[2] = vmlaq_n_s32(u[2], in[2], cospi[44]); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vmlaq_n_s32(rnding, in[5], cospi[44]); + u[3] = vmlsq_n_s32(u[3], in[2], cospi[20]); + u[3] = vshlq_s32(u[3], v_bit); + + // (3) + u[4] = vmlaq_n_s32(rnding, in[3], cospi[36]); + u[4] = vmlaq_n_s32(u[4], in[4], cospi[28]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, in[3], cospi[28]); + u[5] = vmlsq_n_s32(u[5], in[4], cospi[36]); + u[5] = vshlq_s32(u[5], v_bit); + + // (4) + u[6] = vmulq_n_s32(in[1], cospi[52]); + u[6] = vmlaq_n_s32(u[6], in[6], cospi[12]); + u[6] = vaddq_s32(u[6], rnding); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmulq_n_s32(in[1], cospi[12]); + u[7] = vmlsq_n_s32(u[7], in[6], cospi[52]); + u[7] = vaddq_s32(u[7], rnding); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 3 + addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]); + u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]); + u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vmlsq_n_s32(rnding, v[6], cospi[48]); + u[6] = vmlaq_n_s32(u[6], v[7], cospi[16]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]); + u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 5 + addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]); + x = vmulq_n_s32(v[3], cospi[32]); + u[2] = vaddq_s32(v[0], x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vsubq_s32(v[0], x); + u[3] = vshlq_s32(u[3], v_bit); + + v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]); + x = vmulq_n_s32(v[7], cospi[32]); + u[6] = vaddq_s32(v[0], x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(v[0], x); + u[7] = vshlq_s32(u[7], v_bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = vnegq_s32(u[4]); + out[2] = u[6]; + out[3] = vnegq_s32(u[2]); + out[4] = u[3]; + out[5] = vnegq_s32(u[7]); + out[6] = u[5]; + out[7] = vnegq_s32(u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + 
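// Illustrative note (not part of the upstream change): in the !do_cols case
// out_shift is applied here. offset holds half of (1 << out_shift), so adding
// it before the arithmetic shift by -out_shift (via v_shift) rounds to
// nearest, and neg_shift_neon() clamps the results to the signed range of
// log_range_out = AOMMAX(16, bd + 6) bits.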
neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + } +} + +static void idct16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-4 + in[0] = vmlaq_n_s32(rnding, in[0], cospi[32]); + in[0] = vshlq_s32(in[0], v_bit); + + // stage 5-7 + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + if (out_shift != 0) { + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + in[0] = vaddq_s32(in[0], offset); + in[0] = vshlq_s32(in[0], vdupq_n_s32(-out_shift)); + } + } + + in[0] = vmaxq_s32(in[0], clamp_lo); + in[0] = vminq_s32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; +} + +static void idct16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + int32x4_t u[16], x, y; + // stage 0-1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); + u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); + + u[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding); + u[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding); + + u[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding); + u[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding); + + u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding); + u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding); + + // stage 3 + u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding); + u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding); + u[5] = half_btf_0_m_neon_r(&cospi[40], &u[6], &v_bit, &rnding); + u[6] = half_btf_0_neon_r(&cospi[24], &u[6], &v_bit, &rnding); + + addsub_neon(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_neon(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = vmlaq_n_s32(rnding, u[0], cospi[32]); + u[0] = vshlq_s32(x, v_bit); + u[1] = u[0]; + + u[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding); + u[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding); + + addsub_neon(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, + &rnding); + u[14] = + half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); + u[9] = x; + y = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit, + &rnding); + u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit, + &rnding); + u[10] = y; + + // stage 5 + addsub_neon(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = vmulq_n_s32(u[5], cospi[32]); + y = vmlaq_n_s32(rnding, u[6], cospi[32]); + u[5] = vsubq_s32(y, x); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vaddq_s32(y, x); + u[6] = vshlq_s32(u[6], v_bit); + + addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_neon(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = vmulq_n_s32(u[10], cospi[32]); + y = vmlaq_n_s32(rnding, u[13], cospi[32]); + u[10] = vsubq_s32(y, x); + u[10] = vshlq_s32(u[10], v_bit); + + u[13] = vaddq_s32(x, y); + u[13] = vshlq_s32(u[13], v_bit); + + x = vmulq_n_s32(u[11], cospi[32]); + y = vmlaq_n_s32(rnding, u[12], cospi[32]); + u[11] = vsubq_s32(y, x); + u[11] = vshlq_s32(u[11], v_bit); + + u[12] = vaddq_s32(x, y); + u[12] = vshlq_s32(u[12], v_bit); + // stage 
7 + addsub_neon(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_neon(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + int32x4_t v[16], x, y, temp1, temp2; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0 + // stage 1 + // stage 2 + v[0] = vmlaq_n_s32(rnding, in[0], cospi[62]); + v[0] = vshlq_s32(v[0], v_bit); + + v[1] = vmlsq_n_s32(rnding, in[0], cospi[2]); + v[1] = vshlq_s32(v[1], v_bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = vmlaq_n_s32(rnding, v[8], cospi[8]); + temp1 = vmlaq_n_s32(temp1, v[9], cospi[56]); + temp1 = vshlq_s32(temp1, v_bit); + + temp2 = vmlaq_n_s32(rnding, v[8], cospi[56]); + temp2 = vmlsq_n_s32(temp2, v[9], cospi[8]); + temp2 = vshlq_s32(temp2, v_bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = vmlaq_n_s32(rnding, v[4], cospi[16]); + temp1 = vmlaq_n_s32(temp1, v[5], cospi[48]); + temp1 = vshlq_s32(temp1, v_bit); + + temp2 = vmlaq_n_s32(rnding, v[4], cospi[48]); + temp2 = vmlsq_n_s32(temp2, v[5], cospi[16]); + temp2 = vshlq_s32(temp2, v_bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = vmlaq_n_s32(rnding, v[12], cospi[16]); + temp1 = vmlaq_n_s32(temp1, v[13], cospi[48]); + temp1 = vshlq_s32(temp1, v_bit); + + temp2 = vmlaq_n_s32(rnding, v[12], cospi[48]); + temp2 = vmlsq_n_s32(temp2, v[13], cospi[16]); + temp2 = vshlq_s32(temp2, v_bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + y = vmlaq_n_s32(rnding, v[2], cospi[32]); + x = vmulq_n_s32(v[3], cospi[32]); + v[2] = vaddq_s32(y, x); + v[2] = vshlq_s32(v[2], v_bit); + + v[3] = vsubq_s32(y, x); + v[3] = vshlq_s32(v[3], v_bit); + + y = vmlaq_n_s32(rnding, v[6], cospi[32]); + x = vmulq_n_s32(v[7], cospi[32]); + v[6] = vaddq_s32(y, x); + v[6] = vshlq_s32(v[6], v_bit); + + v[7] = vsubq_s32(y, x); + v[7] = vshlq_s32(v[7], v_bit); + + y = vmlaq_n_s32(rnding, v[10], cospi[32]); + x = vmulq_n_s32(v[11], cospi[32]); + v[10] = vaddq_s32(y, x); + v[10] = vshlq_s32(v[10], v_bit); + + v[11] = vsubq_s32(y, x); + v[11] = vshlq_s32(v[11], v_bit); + + y = vmlaq_n_s32(rnding, v[14], cospi[32]); + x = vmulq_n_s32(v[15], cospi[32]); + v[14] = vaddq_s32(y, x); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vsubq_s32(y, x); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = vnegq_s32(v[8]); + out[2] = v[12]; + out[3] = vnegq_s32(v[4]); + out[4] = v[6]; + out[5] = 
vnegq_s32(v[14]); + out[6] = v[10]; + out[7] = vnegq_s32(v[2]); + out[8] = v[3]; + out[9] = vnegq_s32(v[11]); + out[10] = v[15]; + out[11] = vnegq_s32(v[7]); + out[12] = v[5]; + out[13] = vnegq_s32(v[13]); + out[14] = v[9]; + out[15] = vnegq_s32(v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } +} + +static void iadst16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t zero = vdupq_n_s32(0); + int32x4_t u[16], x, y; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-2 + u[0] = vmlaq_n_s32(rnding, in[0], cospi[62]); + u[0] = vshlq_s32(u[0], v_bit); + + u[1] = vmlsq_n_s32(rnding, in[0], cospi[2]); + u[1] = vshlq_s32(u[1], v_bit); + + u[2] = vmlaq_n_s32(rnding, in[2], cospi[54]); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = vmlsq_n_s32(rnding, in[2], cospi[10]); + u[3] = vshlq_s32(u[3], v_bit); + + u[4] = vmlaq_n_s32(rnding, in[4], cospi[46]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlsq_n_s32(rnding, in[4], cospi[18]); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vmlaq_n_s32(rnding, in[6], cospi[38]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlsq_n_s32(rnding, in[6], cospi[26]); + u[7] = vshlq_s32(u[7], v_bit); + + u[8] = vmlaq_n_s32(rnding, in[7], cospi[34]); + u[8] = vshlq_s32(u[8], v_bit); + + u[9] = vmlaq_n_s32(rnding, in[7], cospi[30]); + u[9] = vshlq_s32(u[9], v_bit); + + u[10] = vmlaq_n_s32(rnding, in[5], cospi[42]); + u[10] = vshlq_s32(u[10], v_bit); + + u[11] = vmlaq_n_s32(rnding, in[5], cospi[22]); + u[11] = vshlq_s32(u[11], v_bit); + + u[12] = vmlaq_n_s32(rnding, in[3], cospi[50]); + u[12] = vshlq_s32(u[12], v_bit); + + u[13] = vmlaq_n_s32(rnding, in[3], cospi[14]); + u[13] = vshlq_s32(u[13], v_bit); + + u[14] = vmlaq_n_s32(rnding, in[1], cospi[58]); + u[14] = vshlq_s32(u[14], v_bit); + + u[15] = vmlaq_n_s32(rnding, in[1], cospi[6]); + u[15] = vshlq_s32(u[15], v_bit); + + // stage 3 + addsub_neon(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + 
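// Illustrative note (not part of the upstream change): addsub_neon(), defined
// earlier in this file, is the clamped butterfly used throughout these
// transforms: it stores the clamped sum and the clamped difference of its two
// inputs. Rough scalar sketch for one lane, with a hypothetical clamp32():
//   int32_t s = clamp32(a + b, lo, hi);
//   int32_t d = clamp32(a - b, lo, hi);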
addsub_neon(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = vmlaq_n_s32(rnding, u[8], cospi[56]); + u[8] = vmlaq_n_s32(rnding, u[8], cospi[8]); + u[8] = vmlaq_n_s32(u[8], u[9], cospi[56]); + u[8] = vshlq_s32(u[8], v_bit); + + u[9] = vmlsq_n_s32(y, u[9], cospi[8]); + u[9] = vshlq_s32(u[9], v_bit); + + y = vmlaq_n_s32(rnding, u[10], cospi[24]); + u[10] = vmlaq_n_s32(rnding, u[10], cospi[40]); + u[10] = vmlaq_n_s32(u[10], u[11], cospi[24]); + u[10] = vshlq_s32(u[10], v_bit); + + u[11] = vmlsq_n_s32(y, u[11], cospi[40]); + u[11] = vshlq_s32(u[11], v_bit); + + y = vmlaq_n_s32(rnding, u[12], cospi[8]); + u[12] = vmlsq_n_s32(rnding, u[12], cospi[56]); + u[12] = vmlaq_n_s32(u[12], u[13], cospi[8]); + u[12] = vshlq_s32(u[12], v_bit); + + u[13] = vmlaq_n_s32(y, u[13], cospi[56]); + u[13] = vshlq_s32(u[13], v_bit); + + y = vmlaq_n_s32(rnding, u[14], cospi[40]); + u[14] = vmlsq_n_s32(rnding, u[14], cospi[24]); + u[14] = vmlaq_n_s32(u[14], u[15], cospi[40]); + u[14] = vshlq_s32(u[14], v_bit); + + u[15] = vmlaq_n_s32(y, u[15], cospi[24]); + u[15] = vshlq_s32(u[15], v_bit); + + // stage 5 + addsub_neon(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + y = vmlaq_n_s32(rnding, u[4], cospi[48]); + u[4] = vmlaq_n_s32(rnding, u[4], cospi[16]); + u[4] = vmlaq_n_s32(u[4], u[5], cospi[48]); + u[4] = vshlq_s32(u[4], v_bit); + + u[5] = vmlsq_n_s32(y, u[5], cospi[16]); + u[5] = vshlq_s32(u[5], v_bit); + + y = vmlaq_n_s32(rnding, u[6], cospi[16]); + u[6] = vmlsq_n_s32(rnding, u[6], cospi[48]); + u[6] = vmlaq_n_s32(u[6], u[7], cospi[16]); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vmlaq_n_s32(y, u[7], cospi[48]); + u[7] = vshlq_s32(u[7], v_bit); + + y = vmlaq_n_s32(rnding, u[12], cospi[48]); + u[12] = vmulq_n_s32(u[12], cospi[16]); + u[12] = vmlaq_n_s32(u[12], u[13], cospi[48]); + u[12] = vshlq_s32(u[12], v_bit); + + u[13] = vmlsq_n_s32(y, u[13], cospi[16]); + u[13] = vshlq_s32(u[13], v_bit); + + y = vmlaq_n_s32(rnding, u[14], cospi[16]); + u[14] = vmlsq_n_s32(rnding, u[14], cospi[48]); + u[14] = vmlaq_n_s32(u[14], u[15], cospi[16]); + u[14] = vshlq_s32(u[14], v_bit); + + u[15] = vmlaq_n_s32(y, u[15], cospi[48]); + u[15] = vshlq_s32(u[15], v_bit); + + // stage 7 + addsub_neon(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = vmlaq_n_s32(rnding, u[2], cospi[32]); + x = vmulq_n_s32(u[3], cospi[32]); + u[2] = vaddq_s32(y, x); + u[2] = vshlq_s32(u[2], v_bit); + + u[3] = 
vsubq_s32(y, x); + u[3] = vshlq_s32(u[3], v_bit); + y = vmlaq_n_s32(rnding, u[6], cospi[32]); + x = vmulq_n_s32(u[7], cospi[32]); + u[6] = vaddq_s32(y, x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = vsubq_s32(y, x); + u[7] = vshlq_s32(u[7], v_bit); + + y = vmlaq_n_s32(rnding, u[10], cospi[32]); + x = vmulq_n_s32(u[11], cospi[32]); + u[10] = vaddq_s32(y, x); + u[10] = vshlq_s32(u[10], v_bit); + + u[11] = vsubq_s32(y, x); + u[11] = vshlq_s32(u[11], v_bit); + + y = vmlaq_n_s32(rnding, u[14], cospi[32]); + x = vmulq_n_s32(u[15], cospi[32]); + u[14] = vaddq_s32(y, x); + u[14] = vshlq_s32(u[14], v_bit); + + u[15] = vsubq_s32(y, x); + u[15] = vshlq_s32(u[15], v_bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = vsubq_s32(zero, u[8]); + out[2] = u[12]; + out[3] = vsubq_s32(zero, u[4]); + out[4] = u[6]; + out[5] = vsubq_s32(zero, u[14]); + out[6] = u[10]; + out[7] = vsubq_s32(zero, u[2]); + out[8] = u[3]; + out[9] = vsubq_s32(zero, u[11]); + out[10] = u[15]; + out[11] = vsubq_s32(zero, u[7]); + out[12] = u[5]; + out[13] = vsubq_s32(zero, u[13]); + out[14] = u[9]; + out[15] = vsubq_s32(zero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&u[0], &u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&u[12], &u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[6], &u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[10], &u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[3], &u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[15], &u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[5], &u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&u[9], &u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } +} + +static void idct16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t u[16], v[16], x, y; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + { + // stage 0-1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_neon_mode01_r(&cospi[60], &u[8], &cospi[4], &u[15], &v_bit, + &rnding); + v[9] = half_btf_neon_mode01_r(&cospi[28], &u[9], &cospi[36], &u[14], &v_bit, + &rnding); + v[10] = half_btf_neon_mode01_r(&cospi[44], &u[10], &cospi[20], &u[13], + &v_bit, &rnding); + v[11] = half_btf_neon_mode01_r(&cospi[12], &u[11], &cospi[52], &u[12], + &v_bit, &rnding); + v[12] = half_btf_neon_r(&cospi[52], &u[11], &cospi[12], &u[12], &v_bit, + &rnding); + v[13] = half_btf_neon_r(&cospi[20], &u[10], &cospi[44], &u[13], &v_bit, + &rnding); + v[14] = + half_btf_neon_r(&cospi[36], &u[9], &cospi[28], &u[14], &v_bit, &rnding); + v[15] = + half_btf_neon_r(&cospi[4], &u[8], &cospi[60], &u[15], &v_bit, &rnding); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_neon_mode01_r(&cospi[56], &v[4], &cospi[8], &v[7], &v_bit, + &rnding); + u[5] = half_btf_neon_mode01_r(&cospi[24], &v[5], &cospi[40], &v[6], &v_bit, + &rnding); + u[6] = + half_btf_neon_r(&cospi[40], &v[5], &cospi[24], &v[6], &v_bit, &rnding); + u[7] = + half_btf_neon_r(&cospi[8], &v[4], &cospi[56], &v[7], &v_bit, &rnding); + addsub_neon(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_neon(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = vmlaq_n_s32(rnding, u[0], cospi[32]); + y = vmulq_n_s32(u[1], cospi[32]); + v[0] = vaddq_s32(x, y); + v[0] = vshlq_s32(v[0], v_bit); + + v[1] = vsubq_s32(x, y); + v[1] = vshlq_s32(v[1], v_bit); + + v[2] = half_btf_neon_mode01_r(&cospi[48], &u[2], &cospi[16], &u[3], &v_bit, + &rnding); + v[3] = + half_btf_neon_r(&cospi[16], &u[2], &cospi[48], &u[3], &v_bit, &rnding); + addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, + &rnding); + v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], + &v_bit, &rnding); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], + &v_bit, &rnding); + v[14] = + half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); + v[15] = u[15]; + + // stage 5 + addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = vmulq_n_s32(v[5], cospi[32]); + y = vmlaq_n_s32(rnding, v[6], cospi[32]); + u[5] = vsubq_s32(y, x); + u[5] = vshlq_s32(u[5], v_bit); + + u[6] = vaddq_s32(y, x); + u[6] = vshlq_s32(u[6], v_bit); + + u[7] = v[7]; + addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + 
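// Illustrative note (not part of the upstream change): the half_btf_neon_*_r
// helpers used above compute one half of a butterfly rotation, roughly
//   (w0 * a + w1 * b + (1 << (bit - 1))) >> bit,
// with the modeXY suffix apparently selecting which of the two products is
// negated (mode01: the second, mode10: the first, mode11: both). The helpers
// themselves are defined elsewhere in this file.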
addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_neon(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_neon(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_neon(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = vmulq_n_s32(u[10], cospi[32]); + y = vmlaq_n_s32(rnding, u[13], cospi[32]); + v[10] = vsubq_s32(y, x); + v[10] = vshlq_s32(v[10], v_bit); + + v[13] = vaddq_s32(x, y); + v[13] = vshlq_s32(v[13], v_bit); + + x = vmulq_n_s32(u[11], cospi[32]); + y = vmlaq_n_s32(rnding, u[12], cospi[32]); + v[11] = vsubq_s32(y, x); + v[11] = vshlq_s32(v[11], v_bit); + + v[12] = vaddq_s32(x, y); + v[12] = vshlq_s32(v[12], v_bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_neon(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_neon(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_neon(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_neon(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_neon(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_neon(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_neon(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = + vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void iadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + int32x4_t u[16], v[16], x, y; + // Calculate the column 0, 1, 2, 3 + // stage 0 + // stage 1 + // stage 2 + v[0] = vmlaq_n_s32(rnding, in[15], cospi[2]); + v[0] = vmlaq_n_s32(v[0], in[0], cospi[62]); + v[0] = vshlq_s32(v[0], v_bit); + + v[1] = vmlaq_n_s32(rnding, in[15], cospi[62]); + v[1] = vmlsq_n_s32(v[1], in[0], cospi[2]); + v[1] = vshlq_s32(v[1], v_bit); + + v[2] = vmlaq_n_s32(rnding, in[13], cospi[10]); + v[2] = vmlaq_n_s32(v[2], in[2], cospi[54]); + v[2] = vshlq_s32(v[2], v_bit); + + v[3] = vmlaq_n_s32(rnding, in[13], cospi[54]); + v[3] = vmlsq_n_s32(v[3], in[2], cospi[10]); + v[3] = vshlq_s32(v[3], v_bit); + + v[4] = vmlaq_n_s32(rnding, in[11], cospi[18]); + v[4] = vmlaq_n_s32(v[4], in[4], cospi[46]); + v[4] = vshlq_s32(v[4], v_bit); + + v[5] = vmlaq_n_s32(rnding, in[11], cospi[46]); + v[5] = vmlsq_n_s32(v[5], in[4], cospi[18]); + v[5] = vshlq_s32(v[5], v_bit); + + v[6] = vmlaq_n_s32(rnding, in[9], cospi[26]); + v[6] = vmlaq_n_s32(v[6], in[6], cospi[38]); + v[6] = vshlq_s32(v[6], v_bit); + + v[7] = vmlaq_n_s32(rnding, in[9], cospi[38]); + v[7] = vmlsq_n_s32(v[7], in[6], cospi[26]); + v[7] = vshlq_s32(v[7], v_bit); + + v[8] = vmlaq_n_s32(rnding, in[7], cospi[34]); + v[8] = vmlaq_n_s32(v[8], in[8], cospi[30]); + v[8] = vshlq_s32(v[8], v_bit); + + v[9] = vmlaq_n_s32(rnding, in[7], cospi[30]); + v[9] = vmlsq_n_s32(v[9], in[8], cospi[34]); + v[9] = vshlq_s32(v[9], v_bit); + + v[10] = vmlaq_n_s32(rnding, in[5], cospi[42]); + v[10] = vmlaq_n_s32(v[10], in[10], cospi[22]); + v[10] = vshlq_s32(v[10], v_bit); + + v[11] = vmlaq_n_s32(rnding, in[5], cospi[22]); + v[11] = vmlsq_n_s32(v[11], in[10], cospi[42]); + v[11] = vshlq_s32(v[11], v_bit); + + v[12] = vmlaq_n_s32(rnding, in[3], cospi[50]); + v[12] = vmlaq_n_s32(v[12], in[12], cospi[14]); + v[12] = vshlq_s32(v[12], v_bit); + + v[13] = vmlaq_n_s32(rnding, in[3], cospi[14]); + v[13] = vmlsq_n_s32(v[13], in[12], cospi[50]); + v[13] = vshlq_s32(v[13], v_bit); + + v[14] = vmlaq_n_s32(rnding, in[1], cospi[58]); + v[14] = vmlaq_n_s32(v[14], in[14], cospi[6]); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vmlaq_n_s32(rnding, in[1], cospi[6]); + v[15] = vmlsq_n_s32(v[15], in[14], cospi[58]); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 3 + addsub_neon(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_neon(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = vmlaq_n_s32(rnding, u[8], cospi[8]); + v[8] = vmlaq_n_s32(v[8], u[9], cospi[56]); + v[8] = vshlq_s32(v[8], v_bit); + + v[9] = vmlaq_n_s32(rnding, u[8], cospi[56]); + v[9] = vmlsq_n_s32(v[9], u[9], cospi[8]); + v[9] = vshlq_s32(v[9], v_bit); + + v[10] = vmlaq_n_s32(rnding, u[10], cospi[40]); + v[10] = vmlaq_n_s32(v[10], u[11], cospi[24]); + v[10] = vshlq_s32(v[10], v_bit); + + 
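// Illustrative note (not part of the upstream change): each
// vmlaq_n_s32/vmlsq_n_s32 pair above starts from the pre-loaded rounding
// vector and implements one rotation of the pair (a, b):
//   out0 = (a * c0 + b * c1 + (1 << (bit - 1))) >> bit
//   out1 = (a * c1 - b * c0 + (1 << (bit - 1))) >> bit
// where the shift is performed by vshlq_s32 with the negative count v_bit.
// Widened scalar sketch of one lane of out0:
//   int64_t t = (int64_t)a * c0 + (int64_t)b * c1 + (1 << (bit - 1));
//   int32_t r = (int32_t)(t >> bit);
// (The NEON code keeps the products in 32-bit lanes, presumably relying on
// the clamped intermediate ranges.)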
v[11] = vmlaq_n_s32(rnding, u[10], cospi[24]); + v[11] = vmlsq_n_s32(v[11], u[11], cospi[40]); + v[11] = vshlq_s32(v[11], v_bit); + + v[12] = vmlaq_n_s32(rnding, u[12], -cospi[56]); + v[12] = vmlaq_n_s32(v[12], u[13], cospi[8]); + v[12] = vshlq_s32(v[12], v_bit); + + v[13] = vmlaq_n_s32(rnding, u[12], cospi[8]); + v[13] = vmlsq_n_s32(v[13], u[13], -cospi[56]); + v[13] = vshlq_s32(v[13], v_bit); + + v[14] = vmlaq_n_s32(rnding, u[14], -cospi[24]); + v[14] = vmlaq_n_s32(v[14], u[15], cospi[40]); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vmlaq_n_s32(rnding, u[14], cospi[40]); + v[15] = vmlsq_n_s32(v[15], u[15], -cospi[24]); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 5 + addsub_neon(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_neon(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_neon(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = vmlaq_n_s32(rnding, u[4], cospi[16]); + v[4] = vmlaq_n_s32(v[4], u[5], cospi[48]); + v[4] = vshlq_s32(v[4], v_bit); + + v[5] = vmlaq_n_s32(rnding, u[4], cospi[48]); + v[5] = vmlsq_n_s32(v[5], u[5], cospi[16]); + v[5] = vshlq_s32(v[5], v_bit); + + v[6] = vmlaq_n_s32(rnding, u[6], -cospi[48]); + v[6] = vmlaq_n_s32(v[6], u[7], cospi[16]); + v[6] = vshlq_s32(v[6], v_bit); + + v[7] = vmlaq_n_s32(rnding, u[6], cospi[16]); + v[7] = vmlsq_n_s32(v[7], u[7], -cospi[48]); + v[7] = vshlq_s32(v[7], v_bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = vmlaq_n_s32(rnding, u[12], cospi[16]); + v[12] = vmlaq_n_s32(v[12], u[13], cospi[48]); + v[12] = vshlq_s32(v[12], v_bit); + + v[13] = vmlaq_n_s32(rnding, u[12], cospi[48]); + v[13] = vmlsq_n_s32(v[13], u[13], cospi[16]); + v[13] = vshlq_s32(v[13], v_bit); + + v[14] = vmlaq_n_s32(rnding, u[14], -cospi[48]); + v[14] = vmlaq_n_s32(v[14], u[15], cospi[16]); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vmlaq_n_s32(rnding, u[14], cospi[16]); + v[15] = vmlsq_n_s32(v[15], u[15], -cospi[48]); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 7 + addsub_neon(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_neon(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_neon(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_neon(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = vmlaq_n_s32(rnding, u[2], cospi[32]); + x = vmulq_n_s32(u[3], cospi[32]); + v[2] = vaddq_s32(y, x); + v[2] = vshlq_s32(v[2], v_bit); + + v[3] = vsubq_s32(y, x); + v[3] = vshlq_s32(v[3], v_bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = vmlaq_n_s32(rnding, u[6], cospi[32]); + x = vmulq_n_s32(u[7], cospi[32]); + v[6] = vaddq_s32(y, x); + v[6] = vshlq_s32(v[6], v_bit); + + v[7] = vsubq_s32(y, x); + v[7] = vshlq_s32(v[7], v_bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = vmlaq_n_s32(rnding, u[10], cospi[32]); + x = vmulq_n_s32(u[11], cospi[32]); + v[10] = vaddq_s32(y, x); + v[10] = 
vshlq_s32(v[10], v_bit); + + v[11] = vsubq_s32(y, x); + v[11] = vshlq_s32(v[11], v_bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = vmlaq_n_s32(rnding, u[14], cospi[32]); + x = vmulq_n_s32(u[15], cospi[32]); + v[14] = vaddq_s32(y, x); + v[14] = vshlq_s32(v[14], v_bit); + + v[15] = vsubq_s32(y, x); + v[15] = vshlq_s32(v[15], v_bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = vsubq_s32(zero, v[8]); + out[2] = v[12]; + out[3] = vsubq_s32(zero, v[4]); + out[4] = v[6]; + out[5] = vsubq_s32(zero, v[14]); + out[6] = v[10]; + out[7] = vsubq_s32(zero, v[2]); + out[8] = v[3]; + out[9] = vsubq_s32(zero, v[11]); + out[10] = v[15]; + out[11] = vsubq_s32(zero, v[7]); + out[12] = v[5]; + out[13] = vsubq_s32(zero, v[13]); + out[14] = v[9]; + out[15] = vsubq_s32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + const int32x4_t v_shift = vdupq_n_s32(-out_shift); + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + &v_shift, &offset); + neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, &v_shift, &offset); + } +} + +static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + (void)bit; + int32x2_t fact = vdup_n_s32(2 * NewSqrt2); + int32x4x2_t a0; + int32x4_t zero = vdupq_n_s32(0); + const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1)); + for (int i = 0; i < 16; i++) { + a0.val[0] = vreinterpretq_s32_s64( + vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact)); + a0.val[0] = vreinterpretq_s32_s64( + vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits)); + a0.val[1] = vextq_s32(in[i], zero, 1); + a0.val[1] = vreinterpretq_s32_s64( + vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact)); + a0.val[1] = vreinterpretq_s32_s64( + vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits)); + a0 = vzipq_s32(a0.val[0], a0.val[1]); +#if AOM_ARCH_AARCH64 + out[i] = vreinterpretq_s32_s64(vzip1q_s64( + vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1]))); +#else + out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2); +#endif + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 16); + } +} + +static INLINE void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int i; + int32x4_t temp1, temp2, temp3, temp4; + temp1 = 
half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, + rnding); + u[13] = + half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, rnding); + u[10] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, + rnding); + u[12] = + half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, rnding); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_neon(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_neon(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); + } + + temp1 = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], v_bit, + rnding); + temp2 = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], v_bit, + rnding); + temp3 = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], v_bit, + rnding); + temp4 = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], v_bit, + rnding); + u[56] = + half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], v_bit, rnding); + u[57] = + half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], v_bit, rnding); + u[58] = + half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], v_bit, rnding); + u[59] = + half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], v_bit, rnding); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], v_bit, + rnding); + temp2 = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], v_bit, + rnding); + temp3 = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], v_bit, + rnding); + temp4 = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], v_bit, + rnding); + u[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], v_bit, + rnding); + u[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], v_bit, + rnding); + u[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], v_bit, + rnding); + u[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], v_bit, + rnding); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int i; + int32x4_t temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_neon(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + temp1 = half_btf_neon_mode10_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, + rnding); + temp2 = half_btf_neon_mode10_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, + rnding); + temp3 = half_btf_neon_mode10_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, + rnding); + temp4 = half_btf_neon_mode10_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, + rnding); + u[24] = + half_btf_neon_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, rnding); + u[25] = + half_btf_neon_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, rnding); + u[26] = + half_btf_neon_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, rnding); + u[27] = + half_btf_neon_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, rnding); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_neon(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_neon(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi, + const int32x4_t 
*clamp_lo, + const int32x4_t *clamp_hi, + const int32x4_t *v_bit, + const int32x4_t *rnding) { + int32x4_t temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_neon(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + temp1 = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, + rnding); + temp2 = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, + rnding); + temp3 = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, + rnding); + temp4 = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, + rnding); + u[52] = + half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, rnding); + u[53] = + half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, rnding); + u[54] = + half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, rnding); + u[55] = + half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, rnding); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + + temp1 = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, + rnding); + temp2 = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, + rnding); + temp3 = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, + rnding); + temp4 = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, + rnding); + u[48] = + half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, rnding); + u[49] = + half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, rnding); + u[50] = + half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, rnding); + u[51] = + half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, rnding); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_neon(int32x4_t *u, int32x4_t *out, + int do_cols, int bd, int out_shift, + const int32x4_t *clamp_lo, + const int32x4_t *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_neon(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4); + } + } +} + +static void idct64x64_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + { + int32x4_t x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_neon_r(&cospi[32], &in[0], &v_bit, &rnding); + + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1))); + clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1); + x = vaddq_s32(x, offset); + x = vshlq_s32(x, vdupq_n_s32(-out_shift)); + } + } + x = vmaxq_s32(x, clamp_lo); + x = vminq_s32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; + out[32] = x; + out[33] = x; + out[34] = x; + out[35] = x; + out[36] = x; + out[37] = x; + out[38] = x; + out[39] = x; + out[40] = x; + out[41] = x; + out[42] = x; + out[43] = x; + out[44] = x; + out[45] = x; + out[46] = x; + out[47] = x; + out[48] = x; + out[49] = x; + out[50] = x; + out[51] = x; + out[52] = x; + out[53] = x; + out[54] = x; + out[55] = x; + out[56] = x; + out[57] = x; + out[58] = x; + out[59] = x; + out[60] = x; + out[61] = x; + out[62] = x; + out[63] = x; + } +} + +static void idct64x64_low8_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + { + int32x4_t u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding); + u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding); + u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding); + u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding); + u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding); + u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding); + u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding); + u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding); + + // stage 3 + u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding); + u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding); + u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding); + u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + int32x4_t temp1, temp2; + u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); + u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], + &v_bit, &rnding); + u[62] = + half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding); + u[33] = temp1; + + temp2 = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57], + &v_bit, &rnding); + u[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57], + &v_bit, &rnding); + u[57] = temp2; + + temp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54], + &v_bit, &rnding); + u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit, + &rnding); + u[41] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49], + &v_bit, &rnding); + u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49], + &v_bit, &rnding); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], + &v_bit, &rnding); + u[30] = + half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding); + u[17] = temp1; + + temp2 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25], + &v_bit, &rnding); + u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25], + &v_bit, &rnding); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + u[0] = temp1; + + temp2 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], + &v_bit, &rnding); + u[14] = + half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] 
= u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], + &v_bit, &rnding); + u[61] = + half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding); + u[34] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], + &v_bit, &rnding); + u[60] = + half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding); + u[35] = temp2; + temp1 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], + &v_bit, &rnding); + u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59], + &v_bit, &rnding); + u[36] = temp1; + temp2 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], + &v_bit, &rnding); + u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58], + &v_bit, &rnding); + u[37] = temp2; + temp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53], + &v_bit, &rnding); + u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit, + &rnding); + u[42] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52], + &v_bit, &rnding); + u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit, + &rnding); + u[43] = temp2; + temp1 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51], + &v_bit, &rnding); + u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51], + &v_bit, &rnding); + u[44] = temp1; + temp2 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50], + &v_bit, &rnding); + u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50], + &v_bit, &rnding); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29], + &v_bit, &rnding); + u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit, + &rnding); + u[18] = temp1; + temp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28], + &v_bit, &rnding); + u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit, + &rnding); + u[19] = temp2; + temp1 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27], + &v_bit, &rnding); + u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27], + &v_bit, &rnding); + u[20] = temp1; + temp2 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26], + &v_bit, &rnding); + u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26], + &v_bit, &rnding); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + u[9] = u[9]; + + idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 9 + idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 10 + idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 11 + idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct64x64_low16_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + { + int32x4_t u[64]; + int32x4_t tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding); + u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding); + u[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding); + u[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding); + u[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding); + u[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding); + u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding); + u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding); + u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding); + u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding); + u[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding); + u[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding); + u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding); + u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding); + u[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding); + u[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding); + + // stage 3 + u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding); + u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding); + u[19] = half_btf_0_m_neon_r(&cospi[50], &u[28], &v_bit, &rnding); + u[28] = half_btf_0_neon_r(&cospi[14], &u[28], &v_bit, &rnding); + u[27] = half_btf_0_neon_r(&cospi[10], &u[20], &v_bit, &rnding); + u[20] = half_btf_0_neon_r(&cospi[54], &u[20], &v_bit, &rnding); + u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding); + u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); + u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); + u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding); + u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit, + &rnding); + tmp2 = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit, + &rnding); + tmp3 = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58], + &v_bit, &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57], + &v_bit, &rnding); + u[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57], + &v_bit, &rnding); + u[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit, + &rnding); + u[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61], + &v_bit, &rnding); + u[62] = + 
half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54], + &v_bit, &rnding); + tmp2 = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53], + &v_bit, &rnding); + tmp3 = half_btf_neon_r(&cospi[52], &u[45], &cospi[12], &u[50], &v_bit, + &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49], + &v_bit, &rnding); + u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49], + &v_bit, &rnding); + u[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit, + &rnding); + u[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53], + &v_bit, &rnding); + u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit, + &rnding); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding); + u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit, + &rnding); + tmp2 = half_btf_neon_mode11_r(&cospi[56], &u[18], &cospi[8], &u[29], &v_bit, + &rnding); + tmp3 = half_btf_neon_mode10_r(&cospi[40], &u[21], &cospi[24], &u[26], + &v_bit, &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25], + &v_bit, &rnding); + u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25], + &v_bit, &rnding); + u[26] = half_btf_neon_r(&cospi[24], &u[21], &cospi[40], &u[26], &v_bit, + &rnding); + u[29] = half_btf_neon_mode10_r(&cospi[8], &u[18], &cospi[56], &u[29], + &v_bit, &rnding); + u[30] = + half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, + &rnding); + u[14] = + half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); + u[9] = tmp1; + tmp2 = half_btf_neon_mode01_r(&cospi[48], &u[10], &cospi[16], &u[13], + &v_bit, &rnding); + u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], + &v_bit, &rnding); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit, + &rnding); + tmp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit, + &rnding); + tmp3 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit, + &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[56], 
&u[37], &cospi[8], &u[58], &v_bit, + &rnding); + u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58], + &v_bit, &rnding); + u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59], + &v_bit, &rnding); + u[60] = + half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding); + u[61] = + half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53], + &v_bit, &rnding); + tmp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52], + &v_bit, &rnding); + tmp3 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51], + &v_bit, &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50], + &v_bit, &rnding); + u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50], + &v_bit, &rnding); + u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51], + &v_bit, &rnding); + u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit, + &rnding); + u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit, + &rnding); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_neon_mode10_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, + &rnding); + u[6] = + half_btf_neon_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding); + u[5] = tmp1; + addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29], + &v_bit, &rnding); + tmp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28], + &v_bit, &rnding); + tmp3 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27], + &v_bit, &rnding); + tmp4 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26], + &v_bit, &rnding); + u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26], + &v_bit, &rnding); + u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27], + &v_bit, &rnding); + u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit, + &rnding); + u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit, + &rnding); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_neon(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 9 + idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 10 + idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 11 + idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct64x64_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + const int log_range = AOMMAX(16, bd + 
(do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + + { + int32x4_t u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding); + v[33] = half_btf_0_m_neon_r(&cospi[33], &u[62], &v_bit, &rnding); + v[34] = half_btf_0_neon_r(&cospi[47], &u[34], &v_bit, &rnding); + v[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding); + v[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding); + v[37] = half_btf_0_m_neon_r(&cospi[41], &u[58], &v_bit, &rnding); + v[38] = half_btf_0_neon_r(&cospi[39], &u[38], &v_bit, &rnding); + v[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding); + v[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding); + v[41] = half_btf_0_m_neon_r(&cospi[37], &u[54], &v_bit, &rnding); + v[42] = half_btf_0_neon_r(&cospi[43], &u[42], &v_bit, &rnding); + v[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding); + v[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding); + v[45] = half_btf_0_m_neon_r(&cospi[45], &u[50], &v_bit, &rnding); + v[46] = half_btf_0_neon_r(&cospi[35], &u[46], &v_bit, &rnding); + v[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding); + v[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding); + v[49] = half_btf_0_neon_r(&cospi[29], &u[46], &v_bit, &rnding); + v[50] = half_btf_0_neon_r(&cospi[19], &u[50], &v_bit, &rnding); + v[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding); + v[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding); + v[53] = half_btf_0_neon_r(&cospi[21], &u[42], &v_bit, &rnding); + v[54] = half_btf_0_neon_r(&cospi[27], &u[54], &v_bit, &rnding); + v[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding); + v[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding); + v[57] = half_btf_0_neon_r(&cospi[25], &u[38], &v_bit, &rnding); + v[58] = half_btf_0_neon_r(&cospi[23], &u[58], &v_bit, &rnding); + v[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding); + v[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding); + v[61] = half_btf_0_neon_r(&cospi[17], &u[34], &v_bit, &rnding); + v[62] = half_btf_0_neon_r(&cospi[31], &u[62], &v_bit, &rnding); + v[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding); + + // stage 3 + u[16] = half_btf_0_neon_r(&cospi[62], &v[16], &v_bit, &rnding); + u[17] = half_btf_0_m_neon_r(&cospi[34], &v[30], &v_bit, &rnding); + u[18] = half_btf_0_neon_r(&cospi[46], &v[18], &v_bit, &rnding); + u[19] = half_btf_0_m_neon_r(&cospi[50], &v[28], &v_bit, &rnding); + u[20] = half_btf_0_neon_r(&cospi[54], &v[20], &v_bit, &rnding); + u[21] = half_btf_0_m_neon_r(&cospi[42], &v[26], &v_bit, &rnding); + u[22] = half_btf_0_neon_r(&cospi[38], &v[22], &v_bit, &rnding); + u[23] = half_btf_0_m_neon_r(&cospi[58], &v[24], &v_bit, &rnding); + u[24] = half_btf_0_neon_r(&cospi[6], &v[24], &v_bit, &rnding); + u[25] = 
half_btf_0_neon_r(&cospi[26], &v[22], &v_bit, &rnding); + u[26] = half_btf_0_neon_r(&cospi[22], &v[26], &v_bit, &rnding); + u[27] = half_btf_0_neon_r(&cospi[10], &v[20], &v_bit, &rnding); + u[28] = half_btf_0_neon_r(&cospi[14], &v[28], &v_bit, &rnding); + u[29] = half_btf_0_neon_r(&cospi[18], &v[18], &v_bit, &rnding); + u[30] = half_btf_0_neon_r(&cospi[30], &v[30], &v_bit, &rnding); + u[31] = half_btf_0_neon_r(&cospi[2], &v[16], &v_bit, &rnding); + + for (i = 32; i < 64; i += 4) { + addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding); + v[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding); + v[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding); + v[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding); + v[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding); + v[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding); + v[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding); + v[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding); + + for (i = 16; i < 32; i += 4) { + addsub_neon(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], + &v_bit, &rnding); + v[34] = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], + &v_bit, &rnding); + v[37] = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58], + &v_bit, &rnding); + v[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57], + &v_bit, &rnding); + v[41] = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54], + &v_bit, &rnding); + v[42] = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53], + &v_bit, &rnding); + v[45] = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50], + &v_bit, &rnding); + v[46] = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49], + &v_bit, &rnding); + v[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49], + &v_bit, &rnding); + v[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit, + &rnding); + v[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53], + &v_bit, &rnding); + v[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit, + &rnding); + v[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57], + &v_bit, &rnding); + v[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit, + &rnding); + v[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61], + &v_bit, &rnding); + v[62] = + half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding); + + // stage 5 + u[4] = half_btf_0_neon_r(&cospi[56], &v[4], &v_bit, &rnding); + u[5] = half_btf_0_m_neon_r(&cospi[40], &v[6], &v_bit, &rnding); + u[6] = half_btf_0_neon_r(&cospi[24], &v[6], &v_bit, &rnding); + u[7] = half_btf_0_neon_r(&cospi[8], &v[4], &v_bit, &rnding); + + for (i = 8; i < 16; i += 4) { + addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_neon_mode10_r(&cospi[8], &v[17], 
&cospi[56], &v[30], + &v_bit, &rnding); + u[18] = half_btf_neon_mode11_r(&cospi[56], &v[18], &cospi[8], &v[29], + &v_bit, &rnding); + u[21] = half_btf_neon_mode10_r(&cospi[40], &v[21], &cospi[24], &v[26], + &v_bit, &rnding); + u[22] = half_btf_neon_mode11_r(&cospi[24], &v[22], &cospi[40], &v[25], + &v_bit, &rnding); + u[25] = half_btf_neon_mode10_r(&cospi[40], &v[22], &cospi[24], &v[25], + &v_bit, &rnding); + u[26] = half_btf_neon_r(&cospi[24], &v[21], &cospi[40], &v[26], &v_bit, + &rnding); + u[29] = half_btf_neon_mode10_r(&cospi[8], &v[18], &cospi[56], &v[29], + &v_bit, &rnding); + u[30] = + half_btf_neon_r(&cospi[56], &v[17], &cospi[8], &v[30], &v_bit, &rnding); + + for (i = 32; i < 64; i += 8) { + addsub_neon(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_neon(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_neon(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_neon(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + v[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding); + v[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding); + v[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding); + + addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit, + &rnding); + v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], + &v_bit, &rnding); + v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], + &v_bit, &rnding); + v[14] = + half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding); + + for (i = 16; i < 32; i += 8) { + addsub_neon(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_neon(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_neon(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], + &v_bit, &rnding); + v[35] = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], + &v_bit, &rnding); + v[36] = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], + &v_bit, &rnding); + v[37] = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], + &v_bit, &rnding); + v[42] = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53], + &v_bit, &rnding); + v[43] = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52], + &v_bit, &rnding); + v[44] = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51], + &v_bit, &rnding); + v[45] = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50], + &v_bit, &rnding); + v[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50], + &v_bit, &rnding); + v[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51], + &v_bit, &rnding); + v[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit, + &rnding); + v[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit, + &rnding); + v[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58], 
+ &v_bit, &rnding); + v[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59], + &v_bit, &rnding); + v[60] = + half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding); + v[61] = + half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding); + + // stage 7 + addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_neon_mode10_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, + &rnding); + u[6] = + half_btf_neon_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, &rnding); + + addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_neon_mode10_r(&cospi[16], &v[18], &cospi[48], &v[29], + &v_bit, &rnding); + u[19] = half_btf_neon_mode10_r(&cospi[16], &v[19], &cospi[48], &v[28], + &v_bit, &rnding); + u[20] = half_btf_neon_mode11_r(&cospi[48], &v[20], &cospi[16], &v[27], + &v_bit, &rnding); + u[21] = half_btf_neon_mode11_r(&cospi[48], &v[21], &cospi[16], &v[26], + &v_bit, &rnding); + u[26] = half_btf_neon_mode10_r(&cospi[16], &v[21], &cospi[48], &v[26], + &v_bit, &rnding); + u[27] = half_btf_neon_mode10_r(&cospi[16], &v[20], &cospi[48], &v[27], + &v_bit, &rnding); + u[28] = half_btf_neon_r(&cospi[48], &v[19], &cospi[16], &v[28], &v_bit, + &rnding); + u[29] = half_btf_neon_r(&cospi[48], &v[18], &cospi[16], &v[29], &v_bit, + &rnding); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_neon(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_neon(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_neon(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], + &v_bit, &rnding); + v[11] = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], + &v_bit, &rnding); + v[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit, + &rnding); + v[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit, + &rnding); + + for (i = 16; i < 20; ++i) { + addsub_neon(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_neon(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], + &v_bit, &rnding); + v[37] = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], + &v_bit, &rnding); + v[38] = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], + &v_bit, &rnding); + v[39] = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], + &v_bit, &rnding); + v[40] = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], + &v_bit, &rnding); + v[41] = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], + &v_bit, &rnding); + v[42] = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], + &v_bit, &rnding); + v[43] = half_btf_neon_mode11_r(&cospi[48], &u[43], 
&cospi[16], &u[52], + &v_bit, &rnding); + v[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], + &v_bit, &rnding); + v[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], + &v_bit, &rnding); + v[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], + &v_bit, &rnding); + v[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], + &v_bit, &rnding); + v[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], &v_bit, + &rnding); + v[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], &v_bit, + &rnding); + v[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], &v_bit, + &rnding); + v[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], &v_bit, + &rnding); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_neon(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_neon_mode10_r(&cospi[32], &v[20], &cospi[32], &v[27], + &v_bit, &rnding); + u[21] = half_btf_neon_mode10_r(&cospi[32], &v[21], &cospi[32], &v[26], + &v_bit, &rnding); + u[22] = half_btf_neon_mode10_r(&cospi[32], &v[22], &cospi[32], &v[25], + &v_bit, &rnding); + u[23] = half_btf_neon_mode10_r(&cospi[32], &v[23], &cospi[32], &v[24], + &v_bit, &rnding); + u[24] = half_btf_neon_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit, + &rnding); + u[25] = half_btf_neon_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit, + &rnding); + u[26] = half_btf_neon_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit, + &rnding); + u[27] = half_btf_neon_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit, + &rnding); + + for (i = 32; i < 40; i++) { + addsub_neon(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_neon(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_neon(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], + &v_bit, &rnding); + v[41] = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], + &v_bit, &rnding); + v[42] = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], + &v_bit, &rnding); + v[43] = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], + &v_bit, &rnding); + v[44] = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], + &v_bit, &rnding); + v[45] = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], + &v_bit, &rnding); + v[46] = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], + &v_bit, &rnding); + v[47] = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], + &v_bit, &rnding); + v[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit, + &rnding); + v[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit, + &rnding); + v[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit, + &rnding); + v[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit, + &rnding); + v[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit, + &rnding); + v[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit, + &rnding); + v[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit, + &rnding); + v[55] = half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit, + &rnding); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i 
= 0; i < 32; i++) { + addsub_neon(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = + vdupq_n_s32((1 << (log_range_out - 1)) - 1); + for (i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, + 4); + } + } + } +} + +static void idct32x32_low1_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t bf1; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0-1 + bf1 = in[0]; + + // stage 2-5 + bf1 = half_btf_0_neon_r(&cospi[32], &bf1, &v_bit, &rnding); + + // stage 6-9 + if (do_cols) { + bf1 = vmaxq_s32(bf1, clamp_lo); + bf1 = vminq_s32(bf1, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1))); + clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + bf1 = vrshlq_s32(bf1, vdupq_n_s32(-out_shift)); + } + } + + bf1 = vmaxq_s32(bf1, clamp_lo); + bf1 = vminq_s32(bf1, clamp_hi); + + for (int i = 0; i < 32; i++) out[i] = bf1; +} + +static void idct32x32_low8_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t bf1[32]; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + // stage 0-1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding); + bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding); + bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding); + bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding); + bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding); + bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding); + bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding); + bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding); + + // stage 3 + bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding); + bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding); + + bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding); + bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 : + bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding); + bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_neon(bf1, cospi, &v_bit, &rnding); + + // stage 5 + bf1[0] = 
half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 7 + idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 8 + idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 9 + idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_low16_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t bf1[32]; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + + // stage 0-1 + + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding); + bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding); + bf1[17] = half_btf_0_m_neon_r(&cospi[34], &bf1[30], &v_bit, &rnding); + bf1[30] = half_btf_0_neon_r(&cospi[30], &bf1[30], &v_bit, &rnding); + bf1[29] = half_btf_0_neon_r(&cospi[18], &bf1[18], &v_bit, &rnding); + bf1[18] = half_btf_0_neon_r(&cospi[46], &bf1[18], &v_bit, &rnding); + bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding); + bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding); + bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding); + bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding); + bf1[21] = half_btf_0_m_neon_r(&cospi[42], &bf1[26], &v_bit, &rnding); + bf1[26] = half_btf_0_neon_r(&cospi[22], &bf1[26], &v_bit, &rnding); + bf1[25] = half_btf_0_neon_r(&cospi[26], &bf1[22], &v_bit, &rnding); + bf1[22] = half_btf_0_neon_r(&cospi[38], &bf1[22], &v_bit, &rnding); + bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding); + bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding); + + // stage 3 + bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding); + bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding); + bf1[9] = half_btf_0_m_neon_r(&cospi[36], &bf1[14], &v_bit, &rnding); + bf1[14] = half_btf_0_neon_r(&cospi[28], &bf1[14], &v_bit, &rnding); + bf1[13] = half_btf_0_neon_r(&cospi[20], &bf1[10], &v_bit, &rnding); + bf1[10] = half_btf_0_neon_r(&cospi[44], &bf1[10], &v_bit, &rnding); + bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding); + bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding); + + addsub_neon(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_neon(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_neon(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_neon(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_neon(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_neon(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, 
&clamp_hi); + addsub_neon(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_neon(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + // stage 4 + bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding); + bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding); + bf1[5] = half_btf_0_m_neon_r(&cospi[40], &bf1[6], &v_bit, &rnding); + bf1[6] = half_btf_0_neon_r(&cospi[24], &bf1[6], &v_bit, &rnding); + + addsub_neon(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_neon(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_neon(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_neon(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_neon(bf1, cospi, &v_bit, &rnding); + + // stage 5 + bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_neon_r(&cospi[16], &bf1[2], &v_bit, &rnding); + bf1[2] = half_btf_0_neon_r(&cospi[48], &bf1[2], &v_bit, &rnding); + + addsub_neon(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_neon(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 6 + addsub_neon(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_neon(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 7 + idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + + // stage 8 + idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding); + // stage 9 + idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1))); + const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1); + int32x4_t bf1[32], bf0[32]; + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1)); + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + for (int i = 0; i < 16; i++) bf0[i] = bf1[i]; + + bf0[16] = half_btf_neon_mode01_r(&cospi[62], &bf1[16], &cospi[2], &bf1[31], + &v_bit, &rnding); + bf0[17] = half_btf_neon_mode01_r(&cospi[30], &bf1[17], &cospi[34], &bf1[30], + &v_bit, &rnding); + bf0[18] = half_btf_neon_mode01_r(&cospi[46], &bf1[18], &cospi[18], &bf1[29], + &v_bit, &rnding); + bf0[19] = half_btf_neon_mode01_r(&cospi[14], &bf1[19], &cospi[50], &bf1[28], + &v_bit, &rnding); + bf0[20] = half_btf_neon_mode01_r(&cospi[54], &bf1[20], &cospi[10], &bf1[27], + &v_bit, &rnding); + bf0[21] = half_btf_neon_mode01_r(&cospi[22], &bf1[21], &cospi[42], &bf1[26], + &v_bit, &rnding); + bf0[22] = half_btf_neon_mode01_r(&cospi[38], &bf1[22], &cospi[26], &bf1[25], + &v_bit, &rnding); + bf0[23] = half_btf_neon_mode01_r(&cospi[6], &bf1[23], &cospi[58], &bf1[24], + &v_bit, &rnding); + bf0[24] = half_btf_neon_r(&cospi[58], &bf1[23], &cospi[6], &bf1[24], &v_bit, + &rnding); + bf0[25] = half_btf_neon_r(&cospi[26], &bf1[22], &cospi[38], &bf1[25], &v_bit, + &rnding); + bf0[26] = half_btf_neon_r(&cospi[42], &bf1[21], &cospi[22], &bf1[26], &v_bit, + &rnding); + bf0[27] = half_btf_neon_r(&cospi[10], &bf1[20], &cospi[54], &bf1[27], &v_bit, + &rnding); + bf0[28] = half_btf_neon_r(&cospi[50], &bf1[19], &cospi[14], &bf1[28], &v_bit, + &rnding); + bf0[29] = half_btf_neon_r(&cospi[18], &bf1[18], &cospi[46], &bf1[29], &v_bit, + &rnding); + bf0[30] = half_btf_neon_r(&cospi[34], &bf1[17], &cospi[30], &bf1[30], &v_bit, + &rnding); + bf0[31] = half_btf_neon_r(&cospi[2], &bf1[16], &cospi[62], &bf1[31], &v_bit, + &rnding); + + // stage 3 + for (int i = 0; i < 8; i++) bf1[i] = bf0[i]; + + bf1[8] = half_btf_neon_mode01_r(&cospi[60], &bf0[8], &cospi[4], &bf0[15], + &v_bit, &rnding); + bf1[9] = half_btf_neon_mode01_r(&cospi[28], &bf0[9], &cospi[36], &bf0[14], + &v_bit, &rnding); + bf1[10] = half_btf_neon_mode01_r(&cospi[44], &bf0[10], &cospi[20], &bf0[13], + &v_bit, &rnding); + bf1[11] = half_btf_neon_mode01_r(&cospi[12], &bf0[11], &cospi[52], &bf0[12], + &v_bit, &rnding); + bf1[12] = half_btf_neon_r(&cospi[52], &bf0[11], &cospi[12], &bf0[12], &v_bit, + &rnding); + bf1[13] = half_btf_neon_r(&cospi[20], &bf0[10], &cospi[44], &bf0[13], &v_bit, + &rnding); + bf1[14] = half_btf_neon_r(&cospi[36], &bf0[9], &cospi[28], &bf0[14], &v_bit, + &rnding); + bf1[15] = half_btf_neon_r(&cospi[4], &bf0[8], &cospi[60], &bf0[15], &v_bit, + &rnding); + + addsub_neon(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_neon(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_neon(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, 
&clamp_hi); + addsub_neon(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_neon(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_neon(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_neon(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_neon(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = half_btf_neon_mode01_r(&cospi[56], &bf1[4], &cospi[8], &bf1[7], + &v_bit, &rnding); + bf0[5] = half_btf_neon_mode01_r(&cospi[24], &bf1[5], &cospi[40], &bf1[6], + &v_bit, &rnding); + bf0[6] = half_btf_neon_r(&cospi[40], &bf1[5], &cospi[24], &bf1[6], &v_bit, + &rnding); + bf0[7] = + half_btf_neon_r(&cospi[8], &bf1[4], &cospi[56], &bf1[7], &v_bit, &rnding); + + addsub_neon(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_neon(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_neon(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_neon(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30], + &v_bit, &rnding); + bf0[18] = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29], + &v_bit, &rnding); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26], + &v_bit, &rnding); + bf0[22] = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25], + &v_bit, &rnding); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25], + &v_bit, &rnding); + bf0[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], &v_bit, + &rnding); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29], + &v_bit, &rnding); + bf0[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], &v_bit, + &rnding); + bf0[31] = bf1[31]; + + // stage 5 + bf1[0] = half_btf_neon_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit, + &rnding); + bf1[1] = half_btf_neon_mode01_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], + &v_bit, &rnding); + bf1[2] = half_btf_neon_mode01_r(&cospi[48], &bf0[2], &cospi[16], &bf0[3], + &v_bit, &rnding); + bf1[3] = half_btf_neon_r(&cospi[16], &bf0[2], &cospi[48], &bf0[3], &v_bit, + &rnding); + addsub_neon(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_neon(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = half_btf_neon_mode10_r(&cospi[16], &bf0[9], &cospi[48], &bf0[14], + &v_bit, &rnding); + bf1[10] = half_btf_neon_mode11_r(&cospi[48], &bf0[10], &cospi[16], &bf0[13], + &v_bit, &rnding); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf0[10], &cospi[48], &bf0[13], + &v_bit, &rnding); + bf1[14] = half_btf_neon_r(&cospi[48], &bf0[9], &cospi[16], &bf0[14], &v_bit, + &rnding); + bf1[15] = bf0[15]; + addsub_neon(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_neon(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_neon(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_neon(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_neon(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_neon(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + 
addsub_neon(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_neon(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); + + // stage 6 + addsub_neon(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_neon(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], + &v_bit, &rnding); + bf0[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit, + &rnding); + bf0[7] = bf1[7]; + addsub_neon(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_neon(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_neon(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_neon(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29], + &v_bit, &rnding); + bf0[19] = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28], + &v_bit, &rnding); + bf0[20] = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27], + &v_bit, &rnding); + bf0[21] = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26], + &v_bit, &rnding); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26], + &v_bit, &rnding); + bf0[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27], + &v_bit, &rnding); + bf0[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], &v_bit, + &rnding); + bf0[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], &v_bit, + &rnding); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 7 + addsub_neon(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_neon(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_neon(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_neon(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf_neon_mode10_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], + &v_bit, &rnding); + bf1[11] = half_btf_neon_mode10_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], + &v_bit, &rnding); + bf1[12] = half_btf_neon_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit, + &rnding); + bf1[13] = half_btf_neon_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit, + &rnding); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_neon(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_neon(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_neon(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_neon(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_neon(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_neon(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_neon(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_neon(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_neon(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_neon(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_neon(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_neon(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_neon(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_neon(bf1[5], bf1[10], bf0 + 5, bf0 
+ 10, &clamp_lo, &clamp_hi); + addsub_neon(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_neon(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], + &v_bit, &rnding); + bf0[21] = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], + &v_bit, &rnding); + bf0[22] = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], + &v_bit, &rnding); + bf0[23] = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], + &v_bit, &rnding); + bf0[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit, + &rnding); + bf0[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit, + &rnding); + bf0[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit, + &rnding); + bf0[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit, + &rnding); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + addsub_neon(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_neon(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_neon(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_neon(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_neon(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_neon(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_neon(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_neon(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_neon(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_neon(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_neon(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_neon(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_neon(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_neon(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); + addsub_neon(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_neon(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +static void iidentity32_neon(int32x4_t *in, int32x4_t *out, int bit, + int do_cols, int bd, int out_shift) { + (void)bit; + for (int i = 0; i < 32; i += 16) { + out[i] = vshlq_n_s32(in[i], 2); + out[i + 1] = vshlq_n_s32(in[i + 1], 2); + out[i + 2] = vshlq_n_s32(in[i + 2], 2); + out[i + 3] = vshlq_n_s32(in[i + 3], 2); + out[i + 4] = vshlq_n_s32(in[i + 4], 2); + out[i + 5] = vshlq_n_s32(in[i + 5], 2); + out[i + 6] = vshlq_n_s32(in[i + 6], 2); + out[i + 7] = vshlq_n_s32(in[i + 7], 2); + out[i + 8] = vshlq_n_s32(in[i + 8], 2); + out[i + 9] = vshlq_n_s32(in[i + 9], 2); + out[i + 10] = vshlq_n_s32(in[i + 10], 2); + out[i + 11] = vshlq_n_s32(in[i + 11], 2); + out[i + 12] = vshlq_n_s32(in[i + 12], 2); + out[i + 13] = vshlq_n_s32(in[i + 13], 2); + out[i + 14] = vshlq_n_s32(in[i + 14], 2); + out[i + 15] = vshlq_n_s32(in[i + 15], 2); + } + + if (!do_cols) { + const int 
log_range_out = AOMMAX(16, bd + 6); + const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1))); + const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +// 1D itx types +typedef enum ATTRIBUTE_PACKED { + IDCT_1D, + IADST_1D, + IFLIPADST_1D = IADST_1D, + IIDENTITY_1D, + ITX_TYPES_1D, +} ITX_TYPE_1D; + +static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { + IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, + IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, + IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, + IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, +}; +static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { + IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, + IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, + IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, + IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, +}; + +static const transform_1d_neon + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { idct4x4_neon, NULL, NULL, NULL }, + { iadst4x4_neon, NULL, NULL, NULL }, + { iidentity4_neon, iidentity4_neon, iidentity4_neon, NULL }, + }, + { { idct8x8_low1_neon, idct8x8_new_neon, NULL, NULL }, + { iadst8x8_low1_neon, iadst8x8_new_neon, NULL, NULL }, + { iidentity8_neon, iidentity8_neon, NULL, NULL } }, + { + { idct16x16_low1_neon, idct16x16_low8_neon, idct16x16_neon, NULL }, + { iadst16x16_low1_neon, iadst16x16_low8_neon, iadst16x16_neon, NULL }, + { iidentity16_neon, NULL, iidentity16_neon, NULL }, + }, + { { idct32x32_low1_neon, idct32x32_low8_neon, idct32x32_low16_neon, + idct32x32_neon }, + { NULL, NULL, NULL, NULL }, + { iidentity32_neon, NULL, NULL, NULL } }, + { { idct64x64_low1_neon, idct64x64_low8_neon, idct64x64_low16_neon, + idct64x64_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +void av1_inv_txfm2d_add_4x8_neon(const tran_low_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, const int bd) { + TX_SIZE tx_size = TX_4X8; + int32x4_t buf1[32] = { vdupq_n_s32(0) }; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; + const int input_stride = AOMMIN(32, txfm_size_row); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[8]; + load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col); + load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col); + round_shift_rect_array_32_neon(buf0, buf0, txfm_size_row); + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]); + + if (lr_flip) { + TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], + buf1[7]); + } else { + TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); 
+ } + + // 2nd stage: column transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); + + round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +void av1_inv_txfm2d_add_8x4_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, const int bd) { + TX_SIZE tx_size = TX_8X4; + int32x4_t buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[8]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + round_shift_rect_array_32_neon(buf0, buf0, txfm_size_col); + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *buf1_ptr; + if (lr_flip) { + flip_buf_neon(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < 2; i++) { + int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row; + transpose_4x4(buf1_cur, buf1_cur); + col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); + } + round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + // write to buffer + highbd_write_buffer_8xn_neon(buf1_ptr, output, stride, ud_flip, txfm_size_row, + bd); +} + +void av1_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, const int bd) { + TX_SIZE tx_size = TX_4X16; + int32x4_t buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_row); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[16]; + for (int i = 0; i < (txfm_size_row >> 2); i++) { + const int32_t *input_row = input + i * 4; + int32x4_t *buf0_cur = buf0 + i * 4; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col); + row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, 
bd, 0); + + round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, const int bd) { + TX_SIZE tx_size = TX_16X4; + int32x4_t buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *buf1_ptr; + if (lr_flip) { + flip_buf_neon(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row; + transpose_4x4(buf1_cur, buf1_cur); + col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); + } + round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, txfm_size_row, + bd); + } +} + +static void highbd_inv_txfm2d_add_4x16_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, int eob, + const int bd) { + (void)eob; + TX_SIZE tx_size = TX_4X16; + int32x4_t buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[16]; + const int32_t *input_row = input; + int32x4_t *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + for (int i = 0; i < (txfm_size_row >> 2); i++) { + row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column 
transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); + + round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_16x4_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, int eob, + const int bd) { + (void)eob; + TX_SIZE tx_size = TX_16X4; + int32x4_t buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + int32x4_t buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + for (int j = 0; j < buf_size_w_div8; j++) { + TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j], + buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); + } + row_txfm(buf1, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *buf1_ptr; + if (lr_flip) { + flip_buf_neon(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + INV_COS_BIT, 1, bd, 0); + } + round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, txfm_size_row, + bd); + } +} + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width in log2 for eob (size of 64 map to 32) +static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + +DECLARE_ALIGNED(16, 
static const int16_t, + av1_eob_to_eobxy_16x32_default[32]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static INLINE void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size) { + if (tx_size == 2) { + *eoby = 15, *eobx = 15; + } else if (tx_size == 3) { + *eoby = 31, *eobx = 31; + } else if (tx_size == 4) { + *eoby = 31, *eobx = 31; + } else if (tx_size == 7) { + *eoby = 15, *eobx = 7; + } else if (tx_size == 8) { + *eoby = 7, *eobx = 15; + } else if (tx_size == 9) { + *eoby = 31, *eobx = 15; + } else if (tx_size == 10) { + *eoby = 15, *eobx = 31; + } else if (tx_size == 11) { + *eoby = 31, *eobx = 31; + } else if (tx_size == 12) { + *eoby = 31, *eobx = 31; + } else if (tx_size == 15) { + *eoby = 31, *eobx = 7; + } else if (tx_size == 16) { + *eoby = 7, *eobx = 31; + } else if (tx_size == 17) { + *eoby = 31, *eobx = 15; + } else if (tx_size == 18) { + *eoby = 15, *eobx = 31; + } else { + *eoby = 0, *eobx = 0; + } +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size) { + const int txfm_size_row = tx_size_high[tx_size]; + *eoby = AOMMIN(32, txfm_size_row) - 1; + *eobx = 0; +} + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size) { + const int txfm_size_col = tx_size_wide[tx_size]; + *eobx = AOMMIN(32, txfm_size_col) - 1; + *eoby = 0; +} + +static void inv_txfm2d_add_h_identity_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + const int bd) { + int32x4_t buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size); + 
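+  // eobx is forced to 0 by the scan above; eoby alone controls how much of the
+  // block the first pass touches and which column-kernel variant (fun_idx) is
+  // selected below.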
const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = buf_size_w >> 2; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + assert(row_txfm != NULL); + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + assert(col_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { + int32x4_t buf0[16]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_w); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = buf1 + i * 4; + + for (int j = 0; j < buf_size_w_div4; ++j) { + int32x4_t *buf0_cur = buf0 + j * 4; + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + _buf1[j * txfm_size_row + 0] = buf0_cur[0]; + _buf1[j * txfm_size_row + 1] = buf0_cur[1]; + _buf1[j * txfm_size_row + 2] = buf0_cur[2]; + _buf1[j * txfm_size_row + 3] = buf0_cur[3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } +} + +static void inv_txfm2d_add_v_identity_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + const int bd) { + int32x4_t buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + assert(row_txfm != NULL); + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + assert(col_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (row_max >> 2); ++i) { + int32x4_t buf0[16]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, + buf_size_nonzero_w); + if 
(rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } + } +} + +static void inv_txfm2d_add_idtx_neon(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, const int bd) { + int32x4_t buf1[64 * 4]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int buf_size_w = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = buf_size_w >> 2; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + assert(row_txfm != NULL); + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + assert(col_txfm != NULL); + for (int i = 0; i < (row_max >> 2); ++i) { + int32x4_t buf0[32]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_w); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = buf1 + i * 4; + for (int j = 0; j < buf_size_w_div4; ++j) { + int32x4_t *buf0_cur = buf0 + j * 4; + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + _buf1[j * txfm_size_row + 0] = buf0_cur[0]; + _buf1[j * txfm_size_row + 1] = buf0_cur[1]; + _buf1[j * txfm_size_row + 2] = buf0_cur[2]; + _buf1[j * txfm_size_row + 3] = buf0_cur[3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, 0, txfm_size_row, bd); + } + } +} + +static void inv_txfm2d_add_no_identity_neon(const int32_t *input, + uint16_t *output, int stride, + 
TX_TYPE tx_type, TX_SIZE tx_size, + const int bd) { + int32x4_t buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div4 = txfm_size_col >> 2; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + int32x4_t buf0[64]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = &buf1[i * 4]; + + if (lr_flip) { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } + } +} + +static void highbd_inv_txfm2d_add_no_identity_neon(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + int32x4_t buf1[64 * 16]; + int eobx, eoby; + highbd_get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int 
fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_neon row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_neon col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + int32x4_t buf0[64]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) { + int32x4_t *buf0_cur = &buf0[j * 4]; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w_div8 << 3); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + int32x4_t *_buf1 = &buf1[i * 4]; + + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_neon(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } + } +} + +static void highbd_inv_txfm2d_add_universe_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + case IDTX: + inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + default: assert(0); break; + } +} + +static void inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case 
FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, bd); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + case IDTX: + inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride, + tx_type, tx_size, bd); + break; + default: assert(0); break; + } +} + +void av1_highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case IDTX: + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type, + txfm_param->tx_size, txfm_param->eob, + bd); + break; + default: + av1_inv_txfm2d_add_8x8_neon(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} + +void av1_highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + av1_inv_txfm2d_add_4x4_neon(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +void av1_highbd_inv_txfm_add_4x8_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x4_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X16, + bd); +} + +void av1_highbd_inv_txfm_add_4x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, eob, bd); +} + +void av1_highbd_inv_txfm_add_16x4_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, eob, bd); +} + +void av1_highbd_inv_txfm_add_8x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_8X16, txfm_param->eob, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x8_neon(const tran_low_t *input, uint8_t *dest, + 
int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_16X8, txfm_param->eob, txfm_param->bd); +} + +void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X8, + bd); +} + +void av1_highbd_inv_txfm_add_16x32_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_16X32, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_16X32, bd); +} + +void av1_highbd_inv_txfm_add_32x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_32X16, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_32X16, bd); +} + +void av1_highbd_inv_txfm_add_32x32_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_32X32, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_32X32, bd); +} + +void av1_highbd_inv_txfm_add_64x64_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_64X64, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_64x64_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_64X64, bd); +} + +void av1_highbd_inv_txfm_add_32x64_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_32X64, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_32X64, bd); +} + +void av1_highbd_inv_txfm_add_64x32_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_64X32, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_64X32, bd); +} + +void av1_highbd_inv_txfm_add_64x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_64X16, txfm_param->eob, + txfm_param->bd); +} + +void 
av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_64X16, bd); +} + +void av1_highbd_inv_txfm_add_16x64_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_16X64, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_16X64, bd); +} + +void av1_highbd_inv_txfm_add_16x16_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_16X16, txfm_param->eob, + txfm_param->bd); +} + +void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, + TX_16X16, bd); +} + +void av1_highbd_inv_txfm_add_32x8_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_32X8, txfm_param->eob, txfm_param->bd); +} + +void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X8, + bd); +} + +void av1_highbd_inv_txfm_add_8x32_neon(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + highbd_inv_txfm2d_add_universe_neon(input, dest, stride, txfm_param->tx_type, + TX_8X32, txfm_param->eob, txfm_param->bd); +} + +void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest, + int stride, TX_TYPE tx_type, const int bd) { + inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X32, + bd); +} + +void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const TX_SIZE tx_size = txfm_param->tx_size; + + TX_TYPE tx_type = txfm_param->tx_type; + int bd = txfm_param->bd; + switch (tx_size) { + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + case TX_8X4: + av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + case TX_4X16: + av1_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); + break; + case TX_8X16: + av1_inv_txfm2d_add_8x16_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_16X8: + av1_inv_txfm2d_add_16x8_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_16X32: + av1_inv_txfm2d_add_16x32_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_32X16: + av1_inv_txfm2d_add_32x16_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_16X16: + 
av1_inv_txfm2d_add_16x16_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_32X32: + av1_inv_txfm2d_add_32x32_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_64X64: + av1_inv_txfm2d_add_64x64_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_32X64: + av1_inv_txfm2d_add_32x64_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_64X32: + av1_inv_txfm2d_add_64x32_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_16X64: + av1_inv_txfm2d_add_16x64_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_64X16: + av1_inv_txfm2d_add_64x16_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_32X8: + av1_inv_txfm2d_add_32x8_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + case TX_8X32: + av1_inv_txfm2d_add_8x32_neon(input, (uint16_t *)dest, stride, tx_type, + bd); + break; + } +} diff --git a/third_party/aom/av1/common/arm/highbd_reconinter_neon.c b/third_party/aom/av1/common/arm/highbd_reconinter_neon.c new file mode 100644 index 0000000000..da7f6c57d0 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_reconinter_neon.c @@ -0,0 +1,327 @@ +/* + * + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "config/av1_rtcd.h" + +static INLINE void diffwtd_mask_highbd_neon(uint8_t *mask, bool inverse, + const uint16_t *src0, + int src0_stride, + const uint16_t *src1, + int src1_stride, int h, int w, + const unsigned int bd) { + assert(DIFF_FACTOR > 0); + uint8x16_t max_alpha = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA); + uint8x16_t mask_base = vdupq_n_u8(38); + uint8x16_t mask_diff = vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA - 38); + + if (bd == 8) { + if (w >= 16) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0_lo = vld1q_u16(src0_ptr); + uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8); + uint16x8_t s1_lo = vld1q_u16(src1_ptr); + uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8); + + uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo); + uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi); + uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2); + uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2); + uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); + + uint8x16_t m; + if (inverse) { + m = vqsubq_u8(mask_diff, diff); + } else { + m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha); + } + + vst1q_u8(mask_ptr, m); + + src0_ptr += 16; + src1_ptr += 16; + mask_ptr += 16; + width -= 16; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0 = vld1q_u16(src0_ptr); + uint16x8_t s1 = vld1q_u16(src1_ptr); + + uint16x8_t diff_u16 = 
vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + vst1_u8(mask_ptr, m); + + src0_ptr += 8; + src1_ptr += 8; + mask_ptr += 8; + width -= 8; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 4) { + do { + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + store_u8x4_strided_x2(mask, w, m); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + mask += 2 * w; + h -= 2; + } while (h != 0); + } + } else if (bd == 10) { + if (w >= 16) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0_lo = vld1q_u16(src0_ptr); + uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8); + uint16x8_t s1_lo = vld1q_u16(src1_ptr); + uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8); + + uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo); + uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi); + uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 2 + DIFF_FACTOR_LOG2); + uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 2 + DIFF_FACTOR_LOG2); + uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); + + uint8x16_t m; + if (inverse) { + m = vqsubq_u8(mask_diff, diff); + } else { + m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha); + } + + vst1q_u8(mask_ptr, m); + + src0_ptr += 16; + src1_ptr += 16; + mask_ptr += 16; + width -= 16; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0 = vld1q_u16(src0_ptr); + uint16x8_t s1 = vld1q_u16(src1_ptr); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + vst1_u8(mask_ptr, m); + + src0_ptr += 8; + src1_ptr += 8; + mask_ptr += 8; + width -= 8; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 4) { + do { + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 2 + DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + store_u8x4_strided_x2(mask, w, m); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + mask += 2 * w; + h -= 2; + } while (h != 0); + } + } else { + assert(bd == 12); + if (w >= 16) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0_lo = 
vld1q_u16(src0_ptr); + uint16x8_t s0_hi = vld1q_u16(src0_ptr + 8); + uint16x8_t s1_lo = vld1q_u16(src1_ptr); + uint16x8_t s1_hi = vld1q_u16(src1_ptr + 8); + + uint16x8_t diff_lo_u16 = vabdq_u16(s0_lo, s1_lo); + uint16x8_t diff_hi_u16 = vabdq_u16(s0_hi, s1_hi); + uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, 4 + DIFF_FACTOR_LOG2); + uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, 4 + DIFF_FACTOR_LOG2); + uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); + + uint8x16_t m; + if (inverse) { + m = vqsubq_u8(mask_diff, diff); + } else { + m = vminq_u8(vaddq_u8(diff, mask_base), max_alpha); + } + + vst1q_u8(mask_ptr, m); + + src0_ptr += 16; + src1_ptr += 16; + mask_ptr += 16; + width -= 16; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 8) { + do { + uint8_t *mask_ptr = mask; + const uint16_t *src0_ptr = src0; + const uint16_t *src1_ptr = src1; + int width = w; + do { + uint16x8_t s0 = vld1q_u16(src0_ptr); + uint16x8_t s1 = vld1q_u16(src1_ptr); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + vst1_u8(mask_ptr, m); + + src0_ptr += 8; + src1_ptr += 8; + mask_ptr += 8; + width -= 8; + } while (width != 0); + mask += w; + src0 += src0_stride; + src1 += src1_stride; + } while (--h != 0); + } else if (w == 4) { + do { + uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); + uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); + + uint16x8_t diff_u16 = vabdq_u16(s0, s1); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, 4 + DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vget_low_u8(mask_diff), diff_u8); + } else { + m = vmin_u8(vadd_u8(diff_u8, vget_low_u8(mask_base)), + vget_low_u8(max_alpha)); + } + + store_u8x4_strided_x2(mask, w, m); + + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + mask += 2 * w; + h -= 2; + } while (h != 0); + } + } +} + +void av1_build_compound_diffwtd_mask_highbd_neon( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + assert(h % 4 == 0); + assert(w % 4 == 0); + assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38); + + if (mask_type == DIFFWTD_38) { + diffwtd_mask_highbd_neon(mask, /*inverse=*/false, CONVERT_TO_SHORTPTR(src0), + src0_stride, CONVERT_TO_SHORTPTR(src1), + src1_stride, h, w, bd); + } else { // mask_type == DIFFWTD_38_INV + diffwtd_mask_highbd_neon(mask, /*inverse=*/true, CONVERT_TO_SHORTPTR(src0), + src0_stride, CONVERT_TO_SHORTPTR(src1), + src1_stride, h, w, bd); + } +} diff --git a/third_party/aom/av1/common/arm/highbd_reconintra_neon.c b/third_party/aom/av1/common/arm/highbd_reconintra_neon.c new file mode 100644 index 0000000000..170491b504 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_reconintra_neon.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom_dsp/arm/sum_neon.h" + +#define MAX_UPSAMPLE_SZ 16 + +void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength) { + if (!strength) return; + assert(sz >= 0 && sz <= 129); + + DECLARE_ALIGNED(16, static const uint16_t, + idx[8]) = { 0, 1, 2, 3, 4, 5, 6, 7 }; + const uint16x8_t index = vld1q_u16(idx); + + uint16_t edge[160]; // Max value of sz + enough padding for vector accesses. + memcpy(edge + 1, p, sz * sizeof(*p)); + + // Populate extra space appropriately. + edge[0] = edge[1]; + edge[sz + 1] = edge[sz]; + edge[sz + 2] = edge[sz]; + + // Don't overwrite first pixel. + uint16_t *dst = p + 1; + sz--; + + if (strength == 1) { // Filter: {4, 8, 4}. + const uint16_t *src = edge + 1; + + while (sz >= 8) { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + + // Make use of the identity: + // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2 + uint16x8_t t0 = vaddq_u16(s0, s2); + uint16x8_t t1 = vaddq_u16(s1, s1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint16x8_t res = vrshrq_n_u16(sum, 2); + + vst1q_u16(dst, res); + + src += 8; + dst += 8; + sz -= 8; + } + + if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + + // Make use of the identity: + // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2 + uint16x8_t t0 = vaddq_u16(s0, s2); + uint16x8_t t1 = vaddq_u16(s1, s1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint16x8_t res = vrshrq_n_u16(sum, 2); + + // Mask off out-of-bounds indices. + uint16x8_t current_dst = vld1q_u16(dst); + uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index); + res = vbslq_u16(mask, res, current_dst); + + vst1q_u16(dst, res); + } + } else if (strength == 2) { // Filter: {5, 6, 5}. + const uint16_t *src = edge + 1; + + const uint16x8x3_t filter = { { vdupq_n_u16(5), vdupq_n_u16(6), + vdupq_n_u16(5) } }; + while (sz >= 8) { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + + uint16x8_t accum = vmulq_u16(s0, filter.val[0]); + accum = vmlaq_u16(accum, s1, filter.val[1]); + accum = vmlaq_u16(accum, s2, filter.val[2]); + uint16x8_t res = vrshrq_n_u16(accum, 4); + + vst1q_u16(dst, res); + + src += 8; + dst += 8; + sz -= 8; + } + + if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + + uint16x8_t accum = vmulq_u16(s0, filter.val[0]); + accum = vmlaq_u16(accum, s1, filter.val[1]); + accum = vmlaq_u16(accum, s2, filter.val[2]); + uint16x8_t res = vrshrq_n_u16(accum, 4); + + // Mask off out-of-bounds indices. + uint16x8_t current_dst = vld1q_u16(dst); + uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index); + res = vbslq_u16(mask, res, current_dst); + + vst1q_u16(dst, res); + } + } else { // Filter {2, 4, 4, 4, 2}. 
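+    // Strengths greater than 2 use the 5-tap {2, 4, 4, 4, 2} kernel. Reads start
+    // at edge[0], so the duplicated left pixel supplies the extra tap of left-side
+    // padding.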
+ const uint16_t *src = edge; + + while (sz >= 8) { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + uint16x8_t s3 = vld1q_u16(src + 3); + uint16x8_t s4 = vld1q_u16(src + 4); + + // Make use of the identity: + // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3 + uint16x8_t t0 = vaddq_u16(s0, s4); + uint16x8_t t1 = vaddq_u16(s1, s2); + t1 = vaddq_u16(t1, s3); + t1 = vaddq_u16(t1, t1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint16x8_t res = vrshrq_n_u16(sum, 3); + + vst1q_u16(dst, res); + + src += 8; + dst += 8; + sz -= 8; + } + + if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + uint16x8_t s3 = vld1q_u16(src + 3); + uint16x8_t s4 = vld1q_u16(src + 4); + + // Make use of the identity: + // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3 + uint16x8_t t0 = vaddq_u16(s0, s4); + uint16x8_t t1 = vaddq_u16(s1, s2); + t1 = vaddq_u16(t1, s3); + t1 = vaddq_u16(t1, t1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint16x8_t res = vrshrq_n_u16(sum, 3); + + // Mask off out-of-bounds indices. + uint16x8_t current_dst = vld1q_u16(dst); + uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index); + res = vbslq_u16(mask, res, current_dst); + + vst1q_u16(dst, res); + } + } +} + +void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd) { + if (!sz) return; + + assert(sz <= MAX_UPSAMPLE_SZ); + + uint16_t edge[MAX_UPSAMPLE_SZ + 3]; + const uint16_t *src = edge; + + // Copy p[-1..(sz-1)] and pad out both ends. + edge[0] = p[-1]; + edge[1] = p[-1]; + memcpy(edge + 2, p, sz * 2); + edge[sz + 2] = p[sz - 1]; + p[-2] = p[-1]; + + uint16x8_t pixel_val_max = vdupq_n_u16((1 << bd) - 1); + + uint16_t *dst = p - 1; + + if (bd == 12) { + do { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + uint16x8_t s3 = vld1q_u16(src + 3); + + uint16x8_t t0 = vaddq_u16(s1, s2); + uint16x8_t t1 = vaddq_u16(s0, s3); + uint32x4_t acc0 = vmull_n_u16(vget_low_u16(t0), 9); + acc0 = vqsubq_u32(acc0, vmovl_u16(vget_low_u16(t1))); + uint32x4_t acc1 = vmull_n_u16(vget_high_u16(t0), 9); + acc1 = vqsubq_u32(acc1, vmovl_u16(vget_high_u16(t1))); + + uint16x8x2_t res; + res.val[0] = vcombine_u16(vrshrn_n_u32(acc0, 4), vrshrn_n_u32(acc1, 4)); + // Clamp pixel values at bitdepth maximum. + res.val[0] = vminq_u16(res.val[0], pixel_val_max); + res.val[1] = s2; + + vst2q_u16(dst, res); + + src += 8; + dst += 16; + sz -= 8; + } while (sz > 0); + } else { // Bit depth is 8 or 10. + do { + uint16x8_t s0 = vld1q_u16(src); + uint16x8_t s1 = vld1q_u16(src + 1); + uint16x8_t s2 = vld1q_u16(src + 2); + uint16x8_t s3 = vld1q_u16(src + 3); + + uint16x8_t t0 = vaddq_u16(s0, s3); + uint16x8_t t1 = vaddq_u16(s1, s2); + t1 = vmulq_n_u16(t1, 9); + t1 = vqsubq_u16(t1, t0); + + uint16x8x2_t res; + res.val[0] = vrshrq_n_u16(t1, 4); + // Clamp pixel values at bitdepth maximum. + res.val[0] = vminq_u16(res.val[0], pixel_val_max); + res.val[1] = s2; + + vst2q_u16(dst, res); + + src += 8; + dst += 16; + sz -= 8; + } while (sz > 0); + } +} diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.c b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.c new file mode 100644 index 0000000000..c6f1e3ad92 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/scale.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" +#include "highbd_warp_plane_neon.h" + +static INLINE int16x8_t highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, + int sx, int alpha) { + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + + int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = horizontal_add_4d_s32x4(m0123); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, + int sx, int alpha) { + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 4); + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 5); + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 6); + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 7); + + int32x4_t m0 = vmull_s16(vget_low_s16(f[0]), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f[0]), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f[1]), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f[1]), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f[2]), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f[2]), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f[3]), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f[3]), vget_high_s16(rv3)); + int32x4_t m4 = vmull_s16(vget_low_s16(f[4]), vget_low_s16(rv4)); + m4 = vmlal_s16(m4, vget_high_s16(f[4]), vget_high_s16(rv4)); + int32x4_t m5 = vmull_s16(vget_low_s16(f[5]), vget_low_s16(rv5)); + m5 = vmlal_s16(m5, vget_high_s16(f[5]), vget_high_s16(rv5)); + int32x4_t m6 = vmull_s16(vget_low_s16(f[6]), vget_low_s16(rv6)); + m6 = vmlal_s16(m6, vget_high_s16(f[6]), vget_high_s16(rv6)); + int32x4_t m7 = vmull_s16(vget_low_s16(f[7]), vget_low_s16(rv7)); + m7 = vmlal_s16(m7, vget_high_s16(f[7]), vget_high_s16(rv7)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + int32x4_t m4567[] = { m4, m5, m6, m7 }; + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = horizontal_add_4d_s32x4(m0123); + int32x4_t res1 = horizontal_add_4d_s32x4(m4567); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + +static INLINE int16x8_t highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, + int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + + int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = horizontal_add_4d_s32x4(m0123); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, + int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 4); + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 5); + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 6); + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 7); + + int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); + int32x4_t m4 = vmull_s16(vget_low_s16(f), vget_low_s16(rv4)); + m4 = vmlal_s16(m4, vget_high_s16(f), vget_high_s16(rv4)); + int32x4_t m5 = vmull_s16(vget_low_s16(f), vget_low_s16(rv5)); + m5 = vmlal_s16(m5, 
vget_high_s16(f), vget_high_s16(rv5)); + int32x4_t m6 = vmull_s16(vget_low_s16(f), vget_low_s16(rv6)); + m6 = vmlal_s16(m6, vget_high_s16(f), vget_high_s16(rv6)); + int32x4_t m7 = vmull_s16(vget_low_s16(f), vget_low_s16(rv7)); + m7 = vmlal_s16(m7, vget_high_s16(f), vget_high_s16(rv7)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + int32x4_t m4567[] = { m4, m5, m6, m7 }; + + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = horizontal_add_4d_s32x4(m0123); + int32x4_t res1 = horizontal_add_4d_s32x4(m4567); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + +static INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + return m0123; +} + +static INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3); + return (int32x4x2_t){ { m0123, m4567 } }; +} + +static INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy, + int gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]), + vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]), + vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), 
vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + return horizontal_add_4d_s32x4(m0123); +} + +static INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy, + int gamma) { + int16x8_t s0 = tmp[0]; + int16x8_t s1 = tmp[1]; + int16x8_t s2 = tmp[2]; + int16x8_t s3 = tmp[3]; + int16x8_t s4 = tmp[4]; + int16x8_t s5 = tmp[5]; + int16x8_t s6 = tmp[6]; + int16x8_t s7 = tmp[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4])); + m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4])); + int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5])); + m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5])); + int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6])); + m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6])); + int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7])); + m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7])); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + int32x4_t m4567[] = { m4, m5, m6, m7 }; + + int32x4x2_t ret; + ret.val[0] = horizontal_add_4d_s32x4(m0123); + ret.val[1] = horizontal_add_4d_s32x4(m4567); + return ret; +} + +void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); +} diff --git a/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h new file mode 100644 index 0000000000..3b8982898e --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_warp_plane_neon.h @@ -0,0 +1,424 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ +#define AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ + +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/scale.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" + +static INLINE int16x8_t highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, + int sx, int alpha); + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, + int sx, int alpha); + +static INLINE int16x8_t highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, + int sx); + +static INLINE int16x8_t highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, + int sx); + +static INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, int sy); + +static INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, int sy); + +static INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, int sy, + int gamma); + +static INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, int sy, + int gamma); + +static INLINE int16x8_t load_filters_1(int ofs) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + return vld1q_s16(base + ofs0 * 8); +} + +static INLINE void load_filters_4(int16x8_t out[], int ofs, int stride) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); + const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); + const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); + const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + out[0] = vld1q_s16(base + ofs0 * 8); + out[1] = vld1q_s16(base + ofs1 * 8); + out[2] = vld1q_s16(base + ofs2 * 8); + out[3] = vld1q_s16(base + ofs3 * 8); +} + +static INLINE void load_filters_8(int16x8_t out[], int ofs, int stride) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); + const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); + const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); + const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); + const int ofs4 = ROUND_POWER_OF_TWO(ofs + stride * 4, WARPEDDIFF_PREC_BITS); + const int ofs5 = ROUND_POWER_OF_TWO(ofs + stride * 5, WARPEDDIFF_PREC_BITS); + const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS); + const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + out[0] = vld1q_s16(base + ofs0 * 8); + out[1] = vld1q_s16(base + ofs1 * 8); + out[2] = vld1q_s16(base + ofs2 * 8); + out[3] = vld1q_s16(base + ofs3 * 8); + out[4] = vld1q_s16(base + ofs4 * 8); + out[5] = vld1q_s16(base + ofs5 * 8); + out[6] = vld1q_s16(base + ofs6 * 8); + out[7] = vld1q_s16(base + ofs7 * 8); +} + +static INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, int bd) { + const int limit = (1 << bd) - 1; + return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit))); +} + +static INLINE void warp_affine_horizontal(const uint16_t *ref, int width, + int height, int stride, int p_width, + int16_t alpha, int16_t beta, int iy4, + int sx4, int ix4, int16x8_t tmp[], + int bd) { + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + + if (ix4 <= -7) { + for (int k = 0; k < 15; ++k) { + int iy = clamp(iy4 + k - 7, 0, height - 1); + int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - round0)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } else if (ix4 >= width + 6) { + for (int k = 0; k < 15; ++k) { + int iy = clamp(iy4 + k - 7, 0, height - 1); + int32_t dup_val = + (1 << (bd + FILTER_BITS - round0 - 1)) + + ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } + + static const uint16_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + const uint16x8_t indx0 = vld1q_u16(kIotaArr); + const uint16x8_t indx1 = vld1q_u16(kIotaArr + 8); + + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + +#define APPLY_HORIZONTAL_SHIFT(fn, ...) \ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + \ + if (out_of_boundary_left >= 0) { \ + uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); \ + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); \ + uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); \ + uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); \ + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \ + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \ + } \ + if (out_of_boundary_right >= 0) { \ + uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); \ + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); \ + uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); \ + uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); \ + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \ + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \ + } \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } \ + } while (0) + + if (p_width == 4) { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd, + (sx4 + beta * (k - 3)), alpha); + } + } + } else { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd, + (sx4 + beta * (k - 3)), alpha); + } + } + } +} + +static INLINE void highbd_vertical_filter_4x1_f4( + uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, + bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, + int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { + int32x4_t sum0 = gamma == 0 ? vertical_filter_4x1_f1(tmp, sy) + : vertical_filter_4x1_f4(tmp, sy, gamma); + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; + + sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); + + uint16_t *dst16 = &pred[i * p_stride + j]; + + if (!is_compound) { + const int reduce_bits_vert = 2 * FILTER_BITS - round0; + sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); + + const int res_sub_const = (1 << (bd - 1)) + (1 << bd); + sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); + uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); + vst1_u16(dst16, res0); + return; + } + + sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); + + uint16_t *p = &dst[i * dst_stride + j]; + + if (!do_average) { + vst1_u16(p, vqmovun_s32(sum0)); + return; + } + + uint16x4_t p0 = vld1_u16(p); + int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0)); + if (use_dist_wtd_comp_avg) { + p_vec0 = vmulq_n_s32(p_vec0, fwd); + p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); + p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); + } else { + p_vec0 = vhaddq_s32(p_vec0, sum0); + } + + const int offset_bits = bd + 2 * FILTER_BITS - round0; + const int round1 = COMPOUND_ROUND1_BITS; + const int res_sub_const = + (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); + const int round_bits = 2 * FILTER_BITS - round0 - round1; + + p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); + p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); + uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); + vst1_u16(dst16, res0); +} + +static INLINE void highbd_vertical_filter_8x1_f8( + uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, + bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, + int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { + int32x4x2_t sums = gamma == 0 ? vertical_filter_8x1_f1(tmp, sy) + : vertical_filter_8x1_f8(tmp, sy, gamma); + int32x4_t sum0 = sums.val[0]; + int32x4_t sum1 = sums.val[1]; + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; + + sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); + sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert)); + + uint16_t *dst16 = &pred[i * p_stride + j]; + + if (!is_compound) { + const int reduce_bits_vert = 2 * FILTER_BITS - round0; + sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); + sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert)); + + const int res_sub_const = (1 << (bd - 1)) + (1 << bd); + sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); + sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const)); + uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); + uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd); + vst1_u16(dst16, res0); + vst1_u16(dst16 + 4, res1); + return; + } + + sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); + sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS); + + uint16_t *p = &dst[i * dst_stride + j]; + + if (!do_average) { + vst1_u16(p, vqmovun_s32(sum0)); + vst1_u16(p + 4, vqmovun_s32(sum1)); + return; + } + + uint16x8_t p0 = vld1q_u16(p); + int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0))); + int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0))); + if (use_dist_wtd_comp_avg) { + p_vec0 = vmulq_n_s32(p_vec0, fwd); + p_vec1 = vmulq_n_s32(p_vec1, fwd); + p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); + p_vec1 = vmlaq_n_s32(p_vec1, sum1, bwd); + p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); + p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS); + } else { + p_vec0 = vhaddq_s32(p_vec0, sum0); + p_vec1 = vhaddq_s32(p_vec1, sum1); + } + + const int offset_bits = bd + 2 * FILTER_BITS - round0; + const int round1 = COMPOUND_ROUND1_BITS; + const int res_sub_const = + (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); + const int round_bits = 2 * FILTER_BITS - round0 - round1; + + p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); + p_vec1 = vsubq_s32(p_vec1, vdupq_n_s32(res_sub_const)); + + p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); + p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits)); + uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); + uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd); + vst1_u16(dst16, res0); + vst1_u16(dst16 + 4, res1); +} + +static INLINE void warp_affine_vertical( + uint16_t *pred, int p_width, int p_height, int p_stride, int bd, + uint16_t *dst, int dst_stride, bool is_compound, bool do_average, + bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta, + const int16x8_t *tmp, int i, int sy4, int j) { + int limit_height = p_height > 4 ? 
8 : 4; + + if (p_width > 4) { + // p_width == 8 + for (int k = 0; k < limit_height; ++k) { + int sy = sy4 + delta * k; + highbd_vertical_filter_8x1_f8( + pred, p_stride, bd, dst, dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); + } + } else { + // p_width == 4 + for (int k = 0; k < limit_height; ++k) { + int sy = sy4 + delta * k; + highbd_vertical_filter_4x1_f4( + pred, p_stride, bd, dst, dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); + } + } +} + +static INLINE void highbd_warp_affine_common( + const int32_t *mat, const uint16_t *ref, int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + uint16_t *const dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const bool is_compound = conv_params->is_compound; + const bool do_average = conv_params->do_average; + const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int fwd = conv_params->fwd_offset; + const int bwd = conv_params->bck_offset; + + assert(IMPLIES(is_compound, dst != NULL)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4 + p_col) << subsampling_x; + const int32_t src_y = (i + 4 + p_row) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Each horizontal filter result is formed by the sum of up to eight + // multiplications by filter values and then a shift. Although both the + // inputs and filters are loaded as int16, the input data is at most bd + // bits and the filters are at most 8 bits each. Additionally since we + // know all possible filter values we know that the sum of absolute + // filter values will fit in at most 9 bits. With this in mind we can + // conclude that the sum of each filter application will fit in bd + 9 + // bits. The shift following the summation is ROUND0_BITS (which is 3), + // +2 for 12-bit, which gives us a final storage of: + // bd == 8: ( 8 + 9) - 3 => 14 bits + // bd == 10: (10 + 9) - 3 => 16 bits + // bd == 12: (12 + 9) - 5 => 16 bits + // So it is safe to use int16x8_t as the intermediate storage type here. 
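+      // The vertical pass applies an 8-tap filter to up to 8 output rows, so
+      // 8 + 8 - 1 = 15 rows of horizontally filtered samples are buffered.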
+ int16x8_t tmp[15]; + + warp_affine_horizontal(ref, width, height, stride, p_width, alpha, beta, + iy4, sx4, ix4, tmp, bd); + warp_affine_vertical(pred, p_width, p_height, p_stride, bd, dst, + dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, delta, tmp, + i, sy4, j); + } + } +} + +#endif // AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c b/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c new file mode 100644 index 0000000000..a6bd6d38e4 --- /dev/null +++ b/third_party/aom/av1/common/arm/highbd_wiener_convolve_neon.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/convolve.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#define HBD_WIENER_5TAP_HORIZ(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve5_8_2d_h( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + const int16x8_t s3, const int16x8_t s4, const int16x4_t x_filter, \ + const int32x4_t round_vec, const uint16x8_t im_max_val) { \ + /* Wiener filter is symmetric so add mirrored source elements. */ \ + int16x8_t s04 = vaddq_s16(s0, s4); \ + int16x8_t s13 = vaddq_s16(s1, s3); \ + \ + /* x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) 
*/ \ + int32x4_t sum_lo = \ + vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); \ + \ + int32x4_t sum_hi = \ + vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \ + } \ + \ + static INLINE void name##_convolve_add_src_5tap_horiz( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \ + const int32x4_t round_vec, const uint16x8_t im_max_val) { \ + do { \ + const int16_t *s = (int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int width = w; \ + \ + do { \ + int16x8_t s0, s1, s2, s3, s4; \ + load_s16_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); \ + \ + uint16x8_t d0 = name##_wiener_convolve5_8_2d_h( \ + s0, s1, s2, s3, s4, x_filter, round_vec, im_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += 8; \ + d += 8; \ + width -= 8; \ + } while (width != 0); \ + src_ptr += src_stride; \ + dst_ptr += dst_stride; \ + } while (--h != 0); \ + } + +HBD_WIENER_5TAP_HORIZ(highbd, WIENER_ROUND0_BITS) +HBD_WIENER_5TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) + +#undef HBD_WIENER_5TAP_HORIZ + +#define HBD_WIENER_7TAP_HORIZ(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve7_8_2d_h( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ + const int16x8_t s6, const int16x4_t x_filter, const int32x4_t round_vec, \ + const uint16x8_t im_max_val) { \ + /* Wiener filter is symmetric so add mirrored source elements. 
*/ \ + int16x8_t s06 = vaddq_s16(s0, s6); \ + int16x8_t s15 = vaddq_s16(s1, s5); \ + int16x8_t s24 = vaddq_s16(s2, s4); \ + \ + int32x4_t sum_lo = \ + vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); \ + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); \ + \ + int32x4_t sum_hi = \ + vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); \ + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), im_max_val); \ + } \ + \ + static INLINE void name##_convolve_add_src_7tap_horiz( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, \ + const int32x4_t round_vec, const uint16x8_t im_max_val) { \ + do { \ + const int16_t *s = (int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int width = w; \ + \ + do { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6; \ + load_s16_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \ + \ + uint16x8_t d0 = name##_wiener_convolve7_8_2d_h( \ + s0, s1, s2, s3, s4, s5, s6, x_filter, round_vec, im_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += 8; \ + d += 8; \ + width -= 8; \ + } while (width != 0); \ + src_ptr += src_stride; \ + dst_ptr += dst_stride; \ + } while (--h != 0); \ + } + +HBD_WIENER_7TAP_HORIZ(highbd, WIENER_ROUND0_BITS) +HBD_WIENER_7TAP_HORIZ(highbd_12, WIENER_ROUND0_BITS + 2) + +#undef HBD_WIENER_7TAP_HORIZ + +#define HBD_WIENER_5TAP_VERT(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve5_8_2d_v( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, \ + const int32x4_t round_vec, const uint16x8_t res_max_val) { \ + const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \ + const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \ + /* Wiener filter is symmetric so add mirrored source elements. */ \ + int32x4_t s04_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s4)); \ + int32x4_t s13_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s3)); \ + \ + /* y_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) 
*/ \ + int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s04_lo, y_filter_lo, 1); \ + sum_lo = vmlaq_lane_s32(sum_lo, s13_lo, y_filter_hi, 0); \ + sum_lo = \ + vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s2)), y_filter_hi, 1); \ + \ + int32x4_t s04_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s4)); \ + int32x4_t s13_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s3)); \ + \ + int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s04_hi, y_filter_lo, 1); \ + sum_hi = vmlaq_lane_s32(sum_hi, s13_hi, y_filter_hi, 0); \ + sum_hi = \ + vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s2)), y_filter_hi, 1); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \ + } \ + \ + static INLINE void name##_convolve_add_src_5tap_vert( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \ + const int32x4_t round_vec, const uint16x8_t res_max_val) { \ + do { \ + const int16_t *s = (int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int height = h; \ + \ + while (height > 3) { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; \ + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); \ + \ + uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \ + s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \ + uint16x8_t d1 = name##_wiener_convolve5_8_2d_v( \ + s1, s2, s3, s4, s5, y_filter, round_vec, res_max_val); \ + uint16x8_t d2 = name##_wiener_convolve5_8_2d_v( \ + s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ + uint16x8_t d3 = name##_wiener_convolve5_8_2d_v( \ + s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \ + \ + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \ + \ + s += 4 * src_stride; \ + d += 4 * dst_stride; \ + height -= 4; \ + } \ + \ + while (height-- != 0) { \ + int16x8_t s0, s1, s2, s3, s4; \ + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); \ + \ + uint16x8_t d0 = name##_wiener_convolve5_8_2d_v( \ + s0, s1, s2, s3, s4, y_filter, round_vec, res_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += src_stride; \ + d += dst_stride; \ + } \ + \ + src_ptr += 8; \ + dst_ptr += 8; \ + w -= 8; \ + } while (w != 0); \ + } + +HBD_WIENER_5TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS) +HBD_WIENER_5TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) + +#undef HBD_WIENER_5TAP_VERT + +#define HBD_WIENER_7TAP_VERT(name, shift) \ + static INLINE uint16x8_t name##_wiener_convolve7_8_2d_v( \ + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, \ + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, \ + const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec, \ + const uint16x8_t res_max_val) { \ + const int32x2_t y_filter_lo = vget_low_s32(vmovl_s16(y_filter)); \ + const int32x2_t y_filter_hi = vget_high_s32(vmovl_s16(y_filter)); \ + /* Wiener filter is symmetric so add mirrored source elements. 
*/ \ + int32x4_t s06_lo = vaddl_s16(vget_low_s16(s0), vget_low_s16(s6)); \ + int32x4_t s15_lo = vaddl_s16(vget_low_s16(s1), vget_low_s16(s5)); \ + int32x4_t s24_lo = vaddl_s16(vget_low_s16(s2), vget_low_s16(s4)); \ + \ + int32x4_t sum_lo = vmlaq_lane_s32(round_vec, s06_lo, y_filter_lo, 0); \ + sum_lo = vmlaq_lane_s32(sum_lo, s15_lo, y_filter_lo, 1); \ + sum_lo = vmlaq_lane_s32(sum_lo, s24_lo, y_filter_hi, 0); \ + sum_lo = \ + vmlaq_lane_s32(sum_lo, vmovl_s16(vget_low_s16(s3)), y_filter_hi, 1); \ + \ + int32x4_t s06_hi = vaddl_s16(vget_high_s16(s0), vget_high_s16(s6)); \ + int32x4_t s15_hi = vaddl_s16(vget_high_s16(s1), vget_high_s16(s5)); \ + int32x4_t s24_hi = vaddl_s16(vget_high_s16(s2), vget_high_s16(s4)); \ + \ + int32x4_t sum_hi = vmlaq_lane_s32(round_vec, s06_hi, y_filter_lo, 0); \ + sum_hi = vmlaq_lane_s32(sum_hi, s15_hi, y_filter_lo, 1); \ + sum_hi = vmlaq_lane_s32(sum_hi, s24_hi, y_filter_hi, 0); \ + sum_hi = \ + vmlaq_lane_s32(sum_hi, vmovl_s16(vget_high_s16(s3)), y_filter_hi, 1); \ + \ + uint16x4_t res_lo = vqrshrun_n_s32(sum_lo, shift); \ + uint16x4_t res_hi = vqrshrun_n_s32(sum_hi, shift); \ + \ + return vminq_u16(vcombine_u16(res_lo, res_hi), res_max_val); \ + } \ + \ + static INLINE void name##_convolve_add_src_7tap_vert( \ + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, \ + ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, \ + const int32x4_t round_vec, const uint16x8_t res_max_val) { \ + do { \ + const int16_t *s = (int16_t *)src_ptr; \ + uint16_t *d = dst_ptr; \ + int height = h; \ + \ + while (height > 3) { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; \ + load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, \ + &s8, &s9); \ + \ + uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \ + s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ + uint16x8_t d1 = name##_wiener_convolve7_8_2d_v( \ + s1, s2, s3, s4, s5, s6, s7, y_filter, round_vec, res_max_val); \ + uint16x8_t d2 = name##_wiener_convolve7_8_2d_v( \ + s2, s3, s4, s5, s6, s7, s8, y_filter, round_vec, res_max_val); \ + uint16x8_t d3 = name##_wiener_convolve7_8_2d_v( \ + s3, s4, s5, s6, s7, s8, s9, y_filter, round_vec, res_max_val); \ + \ + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); \ + \ + s += 4 * src_stride; \ + d += 4 * dst_stride; \ + height -= 4; \ + } \ + \ + while (height-- != 0) { \ + int16x8_t s0, s1, s2, s3, s4, s5, s6; \ + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); \ + \ + uint16x8_t d0 = name##_wiener_convolve7_8_2d_v( \ + s0, s1, s2, s3, s4, s5, s6, y_filter, round_vec, res_max_val); \ + \ + vst1q_u16(d, d0); \ + \ + s += src_stride; \ + d += dst_stride; \ + } \ + \ + src_ptr += 8; \ + dst_ptr += 8; \ + w -= 8; \ + } while (w != 0); \ + } + +HBD_WIENER_7TAP_VERT(highbd, 2 * FILTER_BITS - WIENER_ROUND0_BITS) +HBD_WIENER_7TAP_VERT(highbd_12, 2 * FILTER_BITS - WIENER_ROUND0_BITS - 2) + +#undef HBD_WIENER_7TAP_VERT + +static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) { + assert(filter[7] == 0); + if (filter[0] == 0 && filter[6] == 0) { + return WIENER_WIN_REDUCED; + } + return WIENER_WIN; +} + +void av1_highbd_wiener_convolve_add_src_neon( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *x_filter, int x_step_q4, + const int16_t *y_filter, int y_step_q4, int w, int h, + const WienerConvolveParams *conv_params, int bd) { + (void)x_step_q4; + (void)y_step_q4; + + assert(w % 8 == 0); + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + assert(x_step_q4 == 16 && 
y_step_q4 == 16); + assert(x_filter[7] == 0 && y_filter[7] == 0); + + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]); + + const int x_filter_taps = get_wiener_filter_taps(x_filter); + const int y_filter_taps = get_wiener_filter_taps(y_filter); + int16x4_t x_filter_s16 = vld1_s16(x_filter); + int16x4_t y_filter_s16 = vld1_s16(y_filter); + // Add 128 to tap 3. (Needed for rounding.) + x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48)); + y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48)); + + const int im_stride = MAX_SB_SIZE; + const int im_h = h + y_filter_taps - 1; + const int horiz_offset = x_filter_taps / 2; + const int vert_offset = (y_filter_taps / 2) * (int)src_stride; + + const int extraprec_clamp_limit = + WIENER_CLAMP_LIMIT(conv_params->round_0, bd); + const uint16x8_t im_max_val = vdupq_n_u16(extraprec_clamp_limit - 1); + const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1)); + + const uint16x8_t res_max_val = vdupq_n_u16((1 << bd) - 1); + const int32x4_t vert_round_vec = + vdupq_n_s32(-(1 << (bd + conv_params->round_1 - 1))); + + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + if (bd == 12) { + if (x_filter_taps == WIENER_WIN_REDUCED) { + highbd_12_convolve_add_src_5tap_horiz( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } else { + highbd_12_convolve_add_src_7tap_horiz( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } + + if (y_filter_taps == WIENER_WIN_REDUCED) { + highbd_12_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, + w, h, y_filter_s16, vert_round_vec, + res_max_val); + } else { + highbd_12_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, + w, h, y_filter_s16, vert_round_vec, + res_max_val); + } + + } else { + if (x_filter_taps == WIENER_WIN_REDUCED) { + highbd_convolve_add_src_5tap_horiz( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } else { + highbd_convolve_add_src_7tap_horiz( + src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w, + im_h, x_filter_s16, horiz_round_vec, im_max_val); + } + + if (y_filter_taps == WIENER_WIN_REDUCED) { + highbd_convolve_add_src_5tap_vert(im_block, im_stride, dst, dst_stride, w, + h, y_filter_s16, vert_round_vec, + res_max_val); + } else { + highbd_convolve_add_src_7tap_vert(im_block, im_stride, dst, dst_stride, w, + h, y_filter_s16, vert_round_vec, + res_max_val); + } + } +} diff --git a/third_party/aom/av1/common/arm/reconinter_neon.c b/third_party/aom/av1/common/arm/reconinter_neon.c new file mode 100644 index 0000000000..2b0274cc64 --- /dev/null +++ b/third_party/aom/av1/common/arm/reconinter_neon.c @@ -0,0 +1,217 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "config/av1_rtcd.h" + +static AOM_INLINE void diffwtd_mask_d16_neon( + uint8_t *mask, const bool inverse, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + const int16x8_t round_vec = vdupq_n_s16((int16_t)(-round)); + + if (w >= 16) { + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0_lo = vld1q_u16(src0 + j); + uint16x8_t s1_lo = vld1q_u16(src1 + j); + uint16x8_t s0_hi = vld1q_u16(src0 + j + 8); + uint16x8_t s1_hi = vld1q_u16(src1 + j + 8); + + uint16x8_t diff_lo_u16 = vrshlq_u16(vabdq_u16(s0_lo, s1_lo), round_vec); + uint16x8_t diff_hi_u16 = vrshlq_u16(vabdq_u16(s0_hi, s1_hi), round_vec); + uint8x8_t diff_lo_u8 = vshrn_n_u16(diff_lo_u16, DIFF_FACTOR_LOG2); + uint8x8_t diff_hi_u8 = vshrn_n_u16(diff_hi_u16, DIFF_FACTOR_LOG2); + uint8x16_t diff = vcombine_u8(diff_lo_u8, diff_hi_u8); + + uint8x16_t m; + if (inverse) { + m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0 + } else { + m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64)); + } + + vst1q_u8(mask, m); + + mask += 16; + j += 16; + } while (j < w); + src0 += src0_stride; + src1 += src1_stride; + } while (++i < h); + } else if (w == 8) { + int i = 0; + do { + uint16x8_t s0 = vld1q_u16(src0); + uint16x8_t s1 = vld1q_u16(src1); + + uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8); // Saturating to 0 + } else { + m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64)); + } + + vst1_u8(mask, m); + + mask += 8; + src0 += src0_stride; + src1 += src1_stride; + } while (++i < h); + } else if (w == 4) { + int i = 0; + do { + uint16x8_t s0 = + vcombine_u16(vld1_u16(src0), vld1_u16(src0 + src0_stride)); + uint16x8_t s1 = + vcombine_u16(vld1_u16(src1), vld1_u16(src1 + src1_stride)); + + uint16x8_t diff_u16 = vrshlq_u16(vabdq_u16(s0, s1), round_vec); + uint8x8_t diff_u8 = vshrn_n_u16(diff_u16, DIFF_FACTOR_LOG2); + uint8x8_t m; + if (inverse) { + m = vqsub_u8(vdup_n_u8(64 - 38), diff_u8); // Saturating to 0 + } else { + m = vmin_u8(vadd_u8(diff_u8, vdup_n_u8(38)), vdup_n_u8(64)); + } + + vst1_u8(mask, m); + + mask += 8; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + i += 2; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_neon( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + assert(h >= 4); + assert(w >= 4); + assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38)); + + if (mask_type == DIFFWTD_38) { + diffwtd_mask_d16_neon(mask, /*inverse=*/false, src0, src0_stride, src1, + src1_stride, h, w, conv_params, bd); + } else { // mask_type == DIFFWTD_38_INV + diffwtd_mask_d16_neon(mask, /*inverse=*/true, src0, src0_stride, src1, + src1_stride, h, w, conv_params, bd); + } +} + +static AOM_INLINE void diffwtd_mask_neon(uint8_t *mask, const bool inverse, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + if (w >= 16) { + int i = 0; + do { + int j = 0; + do { + 
uint8x16_t s0 = vld1q_u8(src0 + j); + uint8x16_t s1 = vld1q_u8(src1 + j); + + uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2); + uint8x16_t m; + if (inverse) { + m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0 + } else { + m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64)); + } + + vst1q_u8(mask, m); + + mask += 16; + j += 16; + } while (j < w); + src0 += src0_stride; + src1 += src1_stride; + } while (++i < h); + } else if (w == 8) { + int i = 0; + do { + uint8x16_t s0 = vcombine_u8(vld1_u8(src0), vld1_u8(src0 + src0_stride)); + uint8x16_t s1 = vcombine_u8(vld1_u8(src1), vld1_u8(src1 + src0_stride)); + + uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2); + uint8x16_t m; + if (inverse) { + m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0 + } else { + m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64)); + } + + vst1q_u8(mask, m); + + mask += 16; + src0 += 2 * src0_stride; + src1 += 2 * src1_stride; + i += 2; + } while (i < h); + } else if (w == 4) { + int i = 0; + do { + uint8x16_t s0 = load_unaligned_u8q(src0, src0_stride); + uint8x16_t s1 = load_unaligned_u8q(src1, src1_stride); + + uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2); + uint8x16_t m; + if (inverse) { + m = vqsubq_u8(vdupq_n_u8(64 - 38), diff); // Saturating to 0 + } else { + m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64)); + } + + vst1q_u8(mask, m); + + mask += 16; + src0 += 4 * src0_stride; + src1 += 4 * src1_stride; + i += 4; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_neon(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + assert(h % 4 == 0); + assert(w % 4 == 0); + assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38); + + if (mask_type == DIFFWTD_38) { + diffwtd_mask_neon(mask, /*inverse=*/false, src0, src0_stride, src1, + src1_stride, h, w); + } else { // mask_type == DIFFWTD_38_INV + diffwtd_mask_neon(mask, /*inverse=*/true, src0, src0_stride, src1, + src1_stride, h, w); + } +} diff --git a/third_party/aom/av1/common/arm/reconintra_neon.c b/third_party/aom/av1/common/arm/reconintra_neon.c new file mode 100644 index 0000000000..3db39987a6 --- /dev/null +++ b/third_party/aom/av1/common/arm/reconintra_neon.c @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +#define MAX_UPSAMPLE_SZ 16 + +// These kernels are a transposed version of those defined in reconintra.c, +// with the absolute value of the negatives taken in the top row. 
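+// Row k of each mode's table holds the tap applied to reference sample p[k]
+// for all eight output pixels of a 4x2 block (columns 0-3 form output row 0,
+// columns 4-7 form output row 1). Each predicted pixel is, roughly,
+//   clip_pixel((-p0 * t0 + p1 * t1 + ... + p6 * t6 + 8) >> 4)
+// where the row-0 taps are subtracted below (vmlsl_u8) since their original
+// values are negative.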
+DECLARE_ALIGNED(16, const uint8_t, + av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = { + // clang-format off + { + { 6, 5, 3, 3, 4, 3, 3, 3 }, + { 10, 2, 1, 1, 6, 2, 2, 1 }, + { 0, 10, 1, 1, 0, 6, 2, 2 }, + { 0, 0, 10, 2, 0, 0, 6, 2 }, + { 0, 0, 0, 10, 0, 0, 0, 6 }, + { 12, 9, 7, 5, 2, 2, 2, 3 }, + { 0, 0, 0, 0, 12, 9, 7, 5 } + }, + { + { 10, 6, 4, 2, 10, 6, 4, 2 }, + { 16, 0, 0, 0, 16, 0, 0, 0 }, + { 0, 16, 0, 0, 0, 16, 0, 0 }, + { 0, 0, 16, 0, 0, 0, 16, 0 }, + { 0, 0, 0, 16, 0, 0, 0, 16 }, + { 10, 6, 4, 2, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 10, 6, 4, 2 } + }, + { + { 8, 8, 8, 8, 4, 4, 4, 4 }, + { 8, 0, 0, 0, 4, 0, 0, 0 }, + { 0, 8, 0, 0, 0, 4, 0, 0 }, + { 0, 0, 8, 0, 0, 0, 4, 0 }, + { 0, 0, 0, 8, 0, 0, 0, 4 }, + { 16, 16, 16, 16, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16, 16, 16, 16 } + }, + { + { 2, 1, 1, 0, 1, 1, 1, 1 }, + { 8, 3, 2, 1, 4, 3, 2, 2 }, + { 0, 8, 3, 2, 0, 4, 3, 2 }, + { 0, 0, 8, 3, 0, 0, 4, 3 }, + { 0, 0, 0, 8, 0, 0, 0, 4 }, + { 10, 6, 4, 2, 3, 4, 4, 3 }, + { 0, 0, 0, 0, 10, 6, 4, 3 } + }, + { + { 12, 10, 9, 8, 10, 9, 8, 7 }, + { 14, 0, 0, 0, 12, 1, 0, 0 }, + { 0, 14, 0, 0, 0, 12, 0, 0 }, + { 0, 0, 14, 0, 0, 0, 12, 1 }, + { 0, 0, 0, 14, 0, 0, 0, 12 }, + { 14, 12, 11, 10, 0, 0, 1, 1 }, + { 0, 0, 0, 0, 14, 12, 11, 9 } + } + // clang-format on +}; + +#define FILTER_INTRA_SCALE_BITS 4 + +void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + assert(width <= 32 && height <= 32); + + const uint8x8_t f0 = vld1_u8(av1_filter_intra_taps_neon[mode][0]); + const uint8x8_t f1 = vld1_u8(av1_filter_intra_taps_neon[mode][1]); + const uint8x8_t f2 = vld1_u8(av1_filter_intra_taps_neon[mode][2]); + const uint8x8_t f3 = vld1_u8(av1_filter_intra_taps_neon[mode][3]); + const uint8x8_t f4 = vld1_u8(av1_filter_intra_taps_neon[mode][4]); + const uint8x8_t f5 = vld1_u8(av1_filter_intra_taps_neon[mode][5]); + const uint8x8_t f6 = vld1_u8(av1_filter_intra_taps_neon[mode][6]); + + uint8_t buffer[33][33]; + // Populate the top row in the scratch buffer with data from above. + memcpy(buffer[0], &above[-1], (width + 1) * sizeof(uint8_t)); + // Populate the first column in the scratch buffer with data from the left. + int r = 0; + do { + buffer[r + 1][0] = left[r]; + } while (++r < height); + + // Computing 4 cols per iteration (instead of 8) for 8x blocks is faster. + if (width <= 8) { + r = 1; + do { + int c = 1; + uint8x8_t s0 = vld1_dup_u8(&buffer[r - 1][c - 1]); + uint8x8_t s5 = vld1_dup_u8(&buffer[r + 0][c - 1]); + uint8x8_t s6 = vld1_dup_u8(&buffer[r + 1][c - 1]); + + do { + uint8x8_t s1234 = load_u8_4x1(&buffer[r - 1][c - 1] + 1); + uint8x8_t s1 = vdup_lane_u8(s1234, 0); + uint8x8_t s2 = vdup_lane_u8(s1234, 1); + uint8x8_t s3 = vdup_lane_u8(s1234, 2); + uint8x8_t s4 = vdup_lane_u8(s1234, 3); + + uint16x8_t sum = vmull_u8(s1, f1); + // First row of each filter has all negative values so subtract. + sum = vmlsl_u8(sum, s0, f0); + sum = vmlal_u8(sum, s2, f2); + sum = vmlal_u8(sum, s3, f3); + sum = vmlal_u8(sum, s4, f4); + sum = vmlal_u8(sum, s5, f5); + sum = vmlal_u8(sum, s6, f6); + + uint8x8_t res = + vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_INTRA_SCALE_BITS); + + // Store buffer[r + 0][c] and buffer[r + 1][c]. 
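+        // Lanes 0-3 of res hold the four pixels of row r and lanes 4-7 those
+        // of row r + 1, so each 4-byte half is stored to its own row (the
+        // scratch buffer has a row pitch of 33 bytes).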
+ store_u8x4_strided_x2(&buffer[r][c], 33, res); + + store_u8x4_strided_x2(dst + (r - 1) * stride + c - 1, stride, res); + + s0 = s4; + s5 = vdup_lane_u8(res, 3); + s6 = vdup_lane_u8(res, 7); + c += 4; + } while (c < width + 1); + + r += 2; + } while (r < height + 1); + } else { + r = 1; + do { + int c = 1; + uint8x8_t s0_lo = vld1_dup_u8(&buffer[r - 1][c - 1]); + uint8x8_t s5_lo = vld1_dup_u8(&buffer[r + 0][c - 1]); + uint8x8_t s6_lo = vld1_dup_u8(&buffer[r + 1][c - 1]); + + do { + uint8x8_t s1234 = vld1_u8(&buffer[r - 1][c - 1] + 1); + uint8x8_t s1_lo = vdup_lane_u8(s1234, 0); + uint8x8_t s2_lo = vdup_lane_u8(s1234, 1); + uint8x8_t s3_lo = vdup_lane_u8(s1234, 2); + uint8x8_t s4_lo = vdup_lane_u8(s1234, 3); + + uint16x8_t sum_lo = vmull_u8(s1_lo, f1); + // First row of each filter has all negative values so subtract. + sum_lo = vmlsl_u8(sum_lo, s0_lo, f0); + sum_lo = vmlal_u8(sum_lo, s2_lo, f2); + sum_lo = vmlal_u8(sum_lo, s3_lo, f3); + sum_lo = vmlal_u8(sum_lo, s4_lo, f4); + sum_lo = vmlal_u8(sum_lo, s5_lo, f5); + sum_lo = vmlal_u8(sum_lo, s6_lo, f6); + + uint8x8_t res_lo = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_lo), + FILTER_INTRA_SCALE_BITS); + + uint8x8_t s0_hi = s4_lo; + uint8x8_t s1_hi = vdup_lane_u8(s1234, 4); + uint8x8_t s2_hi = vdup_lane_u8(s1234, 5); + uint8x8_t s3_hi = vdup_lane_u8(s1234, 6); + uint8x8_t s4_hi = vdup_lane_u8(s1234, 7); + uint8x8_t s5_hi = vdup_lane_u8(res_lo, 3); + uint8x8_t s6_hi = vdup_lane_u8(res_lo, 7); + + uint16x8_t sum_hi = vmull_u8(s1_hi, f1); + // First row of each filter has all negative values so subtract. + sum_hi = vmlsl_u8(sum_hi, s0_hi, f0); + sum_hi = vmlal_u8(sum_hi, s2_hi, f2); + sum_hi = vmlal_u8(sum_hi, s3_hi, f3); + sum_hi = vmlal_u8(sum_hi, s4_hi, f4); + sum_hi = vmlal_u8(sum_hi, s5_hi, f5); + sum_hi = vmlal_u8(sum_hi, s6_hi, f6); + + uint8x8_t res_hi = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_hi), + FILTER_INTRA_SCALE_BITS); + + uint32x2x2_t res = + vzip_u32(vreinterpret_u32_u8(res_lo), vreinterpret_u32_u8(res_hi)); + + vst1_u8(&buffer[r + 0][c], vreinterpret_u8_u32(res.val[0])); + vst1_u8(&buffer[r + 1][c], vreinterpret_u8_u32(res.val[1])); + + vst1_u8(dst + (r - 1) * stride + c - 1, + vreinterpret_u8_u32(res.val[0])); + vst1_u8(dst + (r + 0) * stride + c - 1, + vreinterpret_u8_u32(res.val[1])); + + s0_lo = s4_hi; + s5_lo = vdup_lane_u8(res_hi, 3); + s6_lo = vdup_lane_u8(res_hi, 7); + c += 8; + } while (c < width + 1); + + r += 2; + } while (r < height + 1); + } +} + +void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) { + if (!strength) return; + assert(sz >= 0 && sz <= 129); + + uint8_t edge[160]; // Max value of sz + enough padding for vector accesses. + memcpy(edge + 1, p, sz * sizeof(*p)); + + // Populate extra space appropriately. + edge[0] = edge[1]; + edge[sz + 1] = edge[sz]; + edge[sz + 2] = edge[sz]; + + // Don't overwrite first pixel. + uint8_t *dst = p + 1; + sz--; + + if (strength == 1) { // Filter: {4, 8, 4}. + const uint8_t *src = edge + 1; + + while (sz >= 8) { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + 1); + uint8x8_t s2 = vld1_u8(src + 2); + + // Make use of the identity: + // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2 + uint16x8_t t0 = vaddl_u8(s0, s2); + uint16x8_t t1 = vaddl_u8(s1, s1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint8x8_t res = vrshrn_n_u16(sum, 2); + + vst1_u8(dst, res); + + src += 8; + dst += 8; + sz -= 8; + } + + if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. 
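+      // The tail is still processed 8 pixels at a time; the filtered result
+      // is then blended with the current dst contents using a lane mask
+      // (lane index < sz), so pixels past the end keep their original values.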
+ uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + 1); + uint8x8_t s2 = vld1_u8(src + 2); + + uint16x8_t t0 = vaddl_u8(s0, s2); + uint16x8_t t1 = vaddl_u8(s1, s1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint8x8_t res = vrshrn_n_u16(sum, 2); + + // Mask off out-of-bounds indices. + uint8x8_t current_dst = vld1_u8(dst); + uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100)); + res = vbsl_u8(mask, res, current_dst); + + vst1_u8(dst, res); + } + } else if (strength == 2) { // Filter: {5, 6, 5}. + const uint8_t *src = edge + 1; + + const uint8x8x3_t filter = { { vdup_n_u8(5), vdup_n_u8(6), vdup_n_u8(5) } }; + + while (sz >= 8) { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + 1); + uint8x8_t s2 = vld1_u8(src + 2); + + uint16x8_t accum = vmull_u8(s0, filter.val[0]); + accum = vmlal_u8(accum, s1, filter.val[1]); + accum = vmlal_u8(accum, s2, filter.val[2]); + uint8x8_t res = vrshrn_n_u16(accum, 4); + + vst1_u8(dst, res); + + src += 8; + dst += 8; + sz -= 8; + } + + if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + 1); + uint8x8_t s2 = vld1_u8(src + 2); + + uint16x8_t accum = vmull_u8(s0, filter.val[0]); + accum = vmlal_u8(accum, s1, filter.val[1]); + accum = vmlal_u8(accum, s2, filter.val[2]); + uint8x8_t res = vrshrn_n_u16(accum, 4); + + // Mask off out-of-bounds indices. + uint8x8_t current_dst = vld1_u8(dst); + uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100)); + res = vbsl_u8(mask, res, current_dst); + + vst1_u8(dst, res); + } + } else { // Filter {2, 4, 4, 4, 2}. + const uint8_t *src = edge; + + while (sz >= 8) { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + 1); + uint8x8_t s2 = vld1_u8(src + 2); + uint8x8_t s3 = vld1_u8(src + 3); + uint8x8_t s4 = vld1_u8(src + 4); + + // Make use of the identity: + // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3 + uint16x8_t t0 = vaddl_u8(s0, s4); + uint16x8_t t1 = vaddl_u8(s1, s2); + t1 = vaddw_u8(t1, s3); + t1 = vaddq_u16(t1, t1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint8x8_t res = vrshrn_n_u16(sum, 3); + + vst1_u8(dst, res); + + src += 8; + dst += 8; + sz -= 8; + } + + if (sz > 0) { // Handle sz < 8 to avoid modifying out-of-bounds values. + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + 1); + uint8x8_t s2 = vld1_u8(src + 2); + uint8x8_t s3 = vld1_u8(src + 3); + uint8x8_t s4 = vld1_u8(src + 4); + + uint16x8_t t0 = vaddl_u8(s0, s4); + uint16x8_t t1 = vaddl_u8(s1, s2); + t1 = vaddw_u8(t1, s3); + t1 = vaddq_u16(t1, t1); + uint16x8_t sum = vaddq_u16(t0, t1); + uint8x8_t res = vrshrn_n_u16(sum, 3); + + // Mask off out-of-bounds indices. + uint8x8_t current_dst = vld1_u8(dst); + uint8x8_t mask = vcgt_u8(vdup_n_u8(sz), vcreate_u8(0x0706050403020100)); + res = vbsl_u8(mask, res, current_dst); + + vst1_u8(dst, res); + } + } +} + +void av1_upsample_intra_edge_neon(uint8_t *p, int sz) { + if (!sz) return; + + assert(sz <= MAX_UPSAMPLE_SZ); + + uint8_t edge[MAX_UPSAMPLE_SZ + 3]; + const uint8_t *src = edge; + + // Copy p[-1..(sz-1)] and pad out both ends. 
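+  // The loop below computes each interpolated sample as
+  //   (9 * (s1 + s2) - s0 - s3 + 8) >> 4
+  // saturated to [0, 255], and vst2_u8 interleaves it with the original
+  // sample (s2) to produce the 2x upsampled edge.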
+ edge[0] = p[-1]; + edge[1] = p[-1]; + memcpy(edge + 2, p, sz); + edge[sz + 2] = p[sz - 1]; + p[-2] = p[-1]; + + uint8_t *dst = p - 1; + + do { + uint8x8_t s0 = vld1_u8(src); + uint8x8_t s1 = vld1_u8(src + 1); + uint8x8_t s2 = vld1_u8(src + 2); + uint8x8_t s3 = vld1_u8(src + 3); + + int16x8_t t0 = vreinterpretq_s16_u16(vaddl_u8(s0, s3)); + int16x8_t t1 = vreinterpretq_s16_u16(vaddl_u8(s1, s2)); + t1 = vmulq_n_s16(t1, 9); + t1 = vsubq_s16(t1, t0); + + uint8x8x2_t res = { { vqrshrun_n_s16(t1, 4), s2 } }; + + vst2_u8(dst, res); + + src += 8; + dst += 16; + sz -= 8; + } while (sz > 0); +} diff --git a/third_party/aom/av1/common/arm/resize_neon.c b/third_party/aom/av1/common/arm/resize_neon.c new file mode 100644 index 0000000000..b00ebd1fc2 --- /dev/null +++ b/third_party/aom/av1/common/arm/resize_neon.c @@ -0,0 +1,1178 @@ +/* + * + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/resize.h" +#include "config/av1_rtcd.h" +#include "config/aom_scale_rtcd.h" + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x4_t sum = vmul_lane_s16(s0, filter_lo, 0); + sum = vmla_lane_s16(sum, s1, filter_lo, 1); + sum = vmla_lane_s16(sum, s2, filter_lo, 2); + sum = vmla_lane_s16(sum, s5, filter_hi, 1); + sum = vmla_lane_s16(sum, s6, filter_hi, 2); + sum = vmla_lane_s16(sum, s7, filter_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filter) { + const int16x4_t filter_lo = vget_low_s16(filter); + const int16x4_t filter_hi = vget_high_s16(filter); + + int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filter_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filter_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filter_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filter_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filter_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0)); + return vqrshrun_n_s16(sum, 7); +} + +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filter) { + int16x8_t ss0 = vreinterpretq_s16_u16(vmovl_u8(s[0])); + int16x8_t ss1 = vreinterpretq_s16_u16(vmovl_u8(s[1])); + int16x8_t ss2 = vreinterpretq_s16_u16(vmovl_u8(s[2])); + int16x8_t ss3 = vreinterpretq_s16_u16(vmovl_u8(s[3])); + int16x8_t ss4 = vreinterpretq_s16_u16(vmovl_u8(s[4])); + int16x8_t ss5 = vreinterpretq_s16_u16(vmovl_u8(s[5])); + int16x8_t 
ss6 = vreinterpretq_s16_u16(vmovl_u8(s[6])); + int16x8_t ss7 = vreinterpretq_s16_u16(vmovl_u8(s[7])); + + return convolve8_8(ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, filter); +} + +static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x2_t s = vld2q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x4_t s = vld4q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_bilinear_kernel( + const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2, + const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1, + uint8_t *const dst) { + const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0); + const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0); + const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0); + const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0); + const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1); + const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1); + const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1); + const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1); + + const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07 + const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F + const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17 + const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F + const uint16x8_t v0 = vmull_u8(hor0, coef0); + const uint16x8_t v1 = vmull_u8(hor1, coef0); + const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1); + const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1); + // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F + const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7)); + vst1q_u8(dst, d); +} + +static INLINE void scale_plane_2_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E + // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F + const uint8x16x2_t s0 = vld2q_u8(src0); + // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E + // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F + const uint8x16x2_t s1 = vld2q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 32; + src1 += 32; + dst += 16; + x -= 16; + } while (x); + src0 += 2 * (src_stride - max_width); + src1 += 2 * 
(src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // (*) -- useless + // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C + // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D + // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*) + // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*) + const uint8x16x4_t s0 = vld4q_u8(src0); + // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C + // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D + // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*) + // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*) + const uint8x16x4_t s1 = vld4q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 64; + src1 += 64; + dst += 16; + x -= 16; + } while (x); + src0 += 4 * (src_stride - max_width); + src1 += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[14], d[4]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + // Note: processing 4x8 is about 20% faster than processing row by row using + // vld4_u8(). 
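+  // After the 8x8 transpose each s[i] holds one source column, so combining
+  // s[i]..s[i + 7] in scale_filter_8() applies a horizontal 8-tap filter to
+  // 8 rows at once; starting the window at every second column (s[0], s[2],
+  // s[4], s[6]) gives the 2:1 horizontal decimation.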
+ do { + load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + transpose_elems_inplace_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71 + d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72 + d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73 + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + transpose_elems_inplace_u8_8x4(&d[0], &d[1], &d[2], &d[3]); + vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]), + 0); + vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]), + 0); + vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]), + 0); + vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]), + 0); + vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]), + 1); + vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]), + 1); + vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]), + 1); + vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]), + 1); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + t += 4; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 6 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17 + d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27 + d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[12], d[2]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + // Note: processing 2x8 is about 20% faster than processing row by row using + // vld4_u8(). 
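+  // Same scheme as the 2:1 path, but only every fourth column position
+  // (s[0] and s[4]) is filtered, giving the 4:1 horizontal decimation.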
+ do { + load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_elems_u8_4x8(s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], + &s[0], &s[1], &s[2], &s[3]); + x = width_hor; + + do { + uint8x8x2_t dd; + src += 8; + load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + transpose_elems_inplace_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71 + // dd.val[0]: 00 01 20 21 40 41 60 61 + // dd.val[1]: 10 11 30 31 50 51 70 71 + dd = vtrn_u8(d[0], d[1]); + vst1_lane_u16((uint16_t *)(t + 0 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 0); + vst1_lane_u16((uint16_t *)(t + 1 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 0); + vst1_lane_u16((uint16_t *)(t + 2 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 1); + vst1_lane_u16((uint16_t *)(t + 3 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 1); + vst1_lane_u16((uint16_t *)(t + 4 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 2); + vst1_lane_u16((uint16_t *)(t + 5 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 2); + vst1_lane_u16((uint16_t *)(t + 6 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 3); + vst1_lane_u16((uint16_t *)(t + 7 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 3); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + t += 2; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]); + t += 4 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, + const uint8x8_t *const coef) { + const uint16x8_t h0 = vmull_u8(s[0], coef[0]); + const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); + + return vrshrn_n_u16(h1, 7); +} + +// Notes for 4 to 3 scaling: +// +// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be +// multiple of 6, and no less than w. +// +// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be +// multiple of 8, and no less than w. +// +// 3. 8 columns are calculated in each horizontal inner loop for further +// vertical scaling, so height_hor must be multiple of 8, and no less than +// 4 * h / 3. +// +// 4. 6 columns are calculated in each vertical inner loop, so height_ver must +// be multiple of 6, and no less than h. +// +// 5. The physical location of the last row of the 4 to 3 scaled frame is +// decided by phase_scaler, and are always less than 1 pixel below the last row +// of the original image. 
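+//
+// step_q4 is 16 * 4 / 3 == 21, i.e. consecutive output pixels advance by
+// 21/16 of an input pixel. Each horizontal inner-loop iteration therefore
+// produces six outputs from eight new input columns, using three distinct
+// filter phases ((phase_scaler + k * step_q4) & SUBPEL_MASK, k = 0, 1, 2),
+// with the second triple of outputs offset by four input columns.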
+static void scale_plane_4_to_3_bilinear(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, const int w, + const int h, const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We only need 1 extra row below because there are only 2 bilinear + // coefficients. + const int height_hor = (4 * h / 3 + 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[9], d[8], c[6]; + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[BILINEAR].filter_ptr; + assert(w && h); + + c[0] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][3]); + c[1] = vdup_n_u8((uint8_t)interp_kernel[phase_scaler][4]); + c[2] = vdup_n_u8( + (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][3]); + c[3] = vdup_n_u8( + (uint8_t)interp_kernel[(phase_scaler + 1 * step_q4) & SUBPEL_MASK][4]); + c[4] = vdup_n_u8( + (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][3]); + c[5] = vdup_n_u8( + (uint8_t)interp_kernel[(phase_scaler + 2 * step_q4) & SUBPEL_MASK][4]); + + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + src += 1; + transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + x = width_hor; + + do { + load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + src += 8; + transpose_elems_inplace_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], + &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3 - 1; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + t += 8 * stride_hor; + + d[0] = scale_filter_bilinear(&s[0], &c[0]); 
+ d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 1); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + const int16x8_t filters0 = vld1q_s16( + (const int16_t *)&coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters1 = vld1q_s16( + (const int16_t *)&coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters2 = vld1q_s16( + (const int16_t *)&coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[15], d[8]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + transpose_elems_inplace_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13], &s[14]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], + &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + 
vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 7 * stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + t += 8 * stride_hor; + + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 7); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +// There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling in NEON. +static INLINE bool has_normative_scaler_neon(const int src_width, + const int src_height, + const int dst_width, + const int dst_height) { + const bool has_normative_scaler = + (2 * dst_width == src_width && 2 * dst_height == src_height) || + (4 * dst_width == src_width && 4 * dst_height == src_height) || + (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height); + + return has_normative_scaler; +} + +void av1_resize_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + const InterpFilter filter, + const int phase, const int num_planes) { + bool has_normative_scaler = + has_normative_scaler_neon(src->y_crop_width, src->y_crop_height, + dst->y_crop_width, dst->y_crop_height); + + if (num_planes > 1) { + has_normative_scaler = + has_normative_scaler && + has_normative_scaler_neon(src->uv_crop_width, src->uv_crop_height, + dst->uv_crop_width, dst->uv_crop_height); + } + + if (!has_normative_scaler) { + av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); + return; + } + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
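+  // If any of the temporary allocations below fails, fall back to the C
+  // implementation for the whole frame rather than leaving it partially
+  // scaled.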
+ int malloc_failed = 0; + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; + const int src_w = src->crop_widths[is_uv]; + const int src_h = src->crop_heights[is_uv]; + const int dst_w = dst->crop_widths[is_uv]; + const int dst_h = dst->crop_heights[is_uv]; + const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; + const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + if (phase == 0) { + scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h); + } else if (filter == BILINEAR) { + const int16_t c0 = av1_bilinear_filters[phase][3]; + const int16_t c1 = av1_bilinear_filters[phase][4]; + scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, c0, c1); + } else { + const int buffer_stride = (dst_y_w + 3) & ~3; + const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel[phase], temp_buffer); + free(temp_buffer); + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + if (phase == 0) { + scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h); + } else if (filter == BILINEAR) { + const int16_t c0 = av1_bilinear_filters[phase][3]; + const int16_t c1 = av1_bilinear_filters[phase][4]; + scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, c0, c1); + } else { + const int buffer_stride = (dst_y_w + 1) & ~1; + const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel[phase], temp_buffer); + free(temp_buffer); + } + } else { + assert(4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h); + // 4 to 3 + const int buffer_stride = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2; + const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + if (filter == BILINEAR) { + scale_plane_4_to_3_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, phase, temp_buffer); + } else { + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel, phase, temp_buffer); + } + free(temp_buffer); + } + } + + if (malloc_failed) { + av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); + } else { + aom_extend_frame_borders(dst, num_planes); + } +} + +static INLINE void scaledconvolve_horiz_w4( + const 
uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + + src -= SUBPEL_TAPS / 2 - 1; + + y = h; + do { + int x_q4 = x0_q4; + x = 0; + do { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + int16x8_t ss[4]; + int16x4_t t[8], tt; + + load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); + transpose_elems_inplace_u8_8x4(&s[0], &s[1], &s[2], &s[3]); + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + t[0] = vget_low_s16(ss[0]); + t[1] = vget_low_s16(ss[1]); + t[2] = vget_low_s16(ss[2]); + t[3] = vget_low_s16(ss[3]); + t[4] = vget_high_s16(ss[0]); + t[5] = vget_high_s16(ss[1]); + t[6] = vget_high_s16(ss[2]); + t[7] = vget_high_s16(ss[3]); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], + filters); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + store_u8_4x1(&temp[4 * z], d); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + { + const uint8x8x4_t d4 = vld4_u8(temp); + store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0]); + store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1]); + store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2]); + store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3]); + } + x += 4; + } while (x < w); + + src += src_stride * 4; + dst += dst_stride * 4; + y -= 4; + } while (y > 0); +} + +static INLINE void scaledconvolve_horiz_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. 
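+  // The caller's intermediate buffer reserves 8 extra rows precisely to
+  // absorb this rounding (see the temp[] sizing in aom_scaled_2d_neon()).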
+ y = (h + 7) & ~7; + + do { + int x_q4 = x0_q4; + x = 0; + do { + uint8x8_t d[8]; + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8]; + load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + transpose_elems_inplace_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + d[0] = scale_filter_8(s, filters); + vst1_u8(&temp[8 * z], d[0]); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + transpose_elems_inplace_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], + &d[6], &d[7]); + store_u8_8x8(dst + x, dst_stride, d[0], d[1], d[2], d[3], d[4], d[5], + d[6], d[7]); + x += 8; + } while (x < w); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static INLINE void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + int16x4_t t[8], tt; + + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); + t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); + t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); + t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); + t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); + t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); + t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); + t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + store_u8_4x1(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + d = scale_filter_8(s, filters); + vst1_u8(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const 
int h) { + int x, y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + x = 0; + do { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x16_t ss[8]; + uint8x8_t s[8], d[2]; + load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], + &ss[5], &ss[6], &ss[7]); + s[0] = vget_low_u8(ss[0]); + s[1] = vget_low_u8(ss[1]); + s[2] = vget_low_u8(ss[2]); + s[3] = vget_low_u8(ss[3]); + s[4] = vget_low_u8(ss[4]); + s[5] = vget_low_u8(ss[5]); + s[6] = vget_low_u8(ss[6]); + s[7] = vget_low_u8(ss[7]); + d[0] = scale_filter_8(s, filters); + + s[0] = vget_high_u8(ss[0]); + s[1] = vget_high_u8(ss[1]); + s[2] = vget_high_u8(ss[2]); + s[3] = vget_high_u8(ss[3]); + s[4] = vget_high_u8(ss[4]); + s[5] = vget_high_u8(ss[5]); + s[6] = vget_high_u8(ss[6]); + s[7] = vget_high_u8(ss[7]); + d[1] = scale_filter_8(s, filters); + vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); + src_y += 16; + x += 16; + } while (x < w); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
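+  // For that case the bound is (((16 - 1) * 64 + 15) >> 4) + 8 = 68 rows,
+  // comfortably within the 135 + 8 rows allocated below.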
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c new file mode 100644 index 0000000000..1d3a3cc038 --- /dev/null +++ b/third_party/aom/av1/common/arm/selfguided_neon.c @@ -0,0 +1,1595 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/common.h" +#include "av1/common/resize.h" +#include "av1/common/restoration.h" + +// Constants used for right shift in final_filter calculation. 
+#define NB_EVEN 5 +#define NB_ODD 4 + +static INLINE void calc_ab_fast_internal_common( + uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, + uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5, + int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec, + uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec, + uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2, + const int buf_stride) { + uint32x4_t q0, q1, q2, q3; + uint32x4_t p0, p1, p2, p3; + uint16x4_t d0, d1, d2, d3; + + s0 = vmulq_u32(s0, const_n_val); + s1 = vmulq_u32(s1, const_n_val); + s2 = vmulq_u32(s2, const_n_val); + s3 = vmulq_u32(s3, const_n_val); + + q0 = vmulq_u32(s4, s4); + q1 = vmulq_u32(s5, s5); + q2 = vmulq_u32(s6, s6); + q3 = vmulq_u32(s7, s7); + + p0 = vcleq_u32(q0, s0); + p1 = vcleq_u32(q1, s1); + p2 = vcleq_u32(q2, s2); + p3 = vcleq_u32(q3, s3); + + q0 = vsubq_u32(s0, q0); + q1 = vsubq_u32(s1, q1); + q2 = vsubq_u32(s2, q2); + q3 = vsubq_u32(s3, q3); + + p0 = vandq_u32(p0, q0); + p1 = vandq_u32(p1, q1); + p2 = vandq_u32(p2, q2); + p3 = vandq_u32(p3, q3); + + p0 = vmulq_u32(p0, s_vec); + p1 = vmulq_u32(p1, s_vec); + p2 = vmulq_u32(p2, s_vec); + p3 = vmulq_u32(p3, s_vec); + + p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); + p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); + p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); + p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); + + p0 = vminq_u32(p0, const_val); + p1 = vminq_u32(p1, const_val); + p2 = vminq_u32(p2, const_val); + p3 = vminq_u32(p3, const_val); + + { + store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); + + for (int x = 0; x < 4; x++) { + for (int y = 0; y < 4; y++) { + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; + } + } + load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3); + } + p0 = vsubl_u16(sgrproj_sgr, d0); + p1 = vsubl_u16(sgrproj_sgr, d1); + p2 = vsubl_u16(sgrproj_sgr, d2); + p3 = vsubl_u16(sgrproj_sgr, d3); + + s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec); + s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec); + s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec); + s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec); + + s4 = vmulq_u32(s4, p0); + s5 = vmulq_u32(s5, p1); + s6 = vmulq_u32(s6, p2); + s7 = vmulq_u32(s7, p3); + + p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); + p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); + p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); + p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); + + store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0), + vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), + vreinterpretq_s32_u32(p3)); +} +static INLINE void calc_ab_internal_common( + uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4, + uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0, + uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4, + uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7, + uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val, + uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1, + uint16_t *dst_A16, int32_t *dst2, const int buf_stride) { + uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7; + uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7; + uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7; + + s0 = vmulq_u32(s0, const_n_val); + s1 = vmulq_u32(s1, const_n_val); + s2 = vmulq_u32(s2, const_n_val); + s3 = vmulq_u32(s3, const_n_val); + s4 = vmulq_u32(s4, const_n_val); + s5 = vmulq_u32(s5, 
const_n_val); + s6 = vmulq_u32(s6, const_n_val); + s7 = vmulq_u32(s7, const_n_val); + + d0 = vget_low_u16(s16_4); + d1 = vget_low_u16(s16_5); + d2 = vget_low_u16(s16_6); + d3 = vget_low_u16(s16_7); + d4 = vget_high_u16(s16_4); + d5 = vget_high_u16(s16_5); + d6 = vget_high_u16(s16_6); + d7 = vget_high_u16(s16_7); + + q0 = vmull_u16(d0, d0); + q1 = vmull_u16(d1, d1); + q2 = vmull_u16(d2, d2); + q3 = vmull_u16(d3, d3); + q4 = vmull_u16(d4, d4); + q5 = vmull_u16(d5, d5); + q6 = vmull_u16(d6, d6); + q7 = vmull_u16(d7, d7); + + p0 = vcleq_u32(q0, s0); + p1 = vcleq_u32(q1, s1); + p2 = vcleq_u32(q2, s2); + p3 = vcleq_u32(q3, s3); + p4 = vcleq_u32(q4, s4); + p5 = vcleq_u32(q5, s5); + p6 = vcleq_u32(q6, s6); + p7 = vcleq_u32(q7, s7); + + q0 = vsubq_u32(s0, q0); + q1 = vsubq_u32(s1, q1); + q2 = vsubq_u32(s2, q2); + q3 = vsubq_u32(s3, q3); + q4 = vsubq_u32(s4, q4); + q5 = vsubq_u32(s5, q5); + q6 = vsubq_u32(s6, q6); + q7 = vsubq_u32(s7, q7); + + p0 = vandq_u32(p0, q0); + p1 = vandq_u32(p1, q1); + p2 = vandq_u32(p2, q2); + p3 = vandq_u32(p3, q3); + p4 = vandq_u32(p4, q4); + p5 = vandq_u32(p5, q5); + p6 = vandq_u32(p6, q6); + p7 = vandq_u32(p7, q7); + + p0 = vmulq_u32(p0, s_vec); + p1 = vmulq_u32(p1, s_vec); + p2 = vmulq_u32(p2, s_vec); + p3 = vmulq_u32(p3, s_vec); + p4 = vmulq_u32(p4, s_vec); + p5 = vmulq_u32(p5, s_vec); + p6 = vmulq_u32(p6, s_vec); + p7 = vmulq_u32(p7, s_vec); + + p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); + p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); + p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); + p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); + p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS); + p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS); + p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS); + p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS); + + p0 = vminq_u32(p0, const_val); + p1 = vminq_u32(p1, const_val); + p2 = vminq_u32(p2, const_val); + p3 = vminq_u32(p3, const_val); + p4 = vminq_u32(p4, const_val); + p5 = vminq_u32(p5, const_val); + p6 = vminq_u32(p6, const_val); + p7 = vminq_u32(p7, const_val); + + { + store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); + store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7); + + for (int x = 0; x < 4; x++) { + for (int y = 0; y < 8; y++) { + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; + } + } + load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7); + } + + s16_4 = vsubq_u16(sgrproj_sgr, s16_4); + s16_5 = vsubq_u16(sgrproj_sgr, s16_5); + s16_6 = vsubq_u16(sgrproj_sgr, s16_6); + s16_7 = vsubq_u16(sgrproj_sgr, s16_7); + + s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec); + s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec); + s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec); + s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec); + s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec); + s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec); + s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec); + s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec); + + s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4))); + s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5))); + s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6))); + s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7))); + s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4))); + s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5))); + s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6))); + s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7))); + + p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS); + p1 = vrshrq_n_u32(s1, 
SGRPROJ_RECIP_BITS); + p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS); + p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS); + p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); + p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); + p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); + p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); + + store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0), + vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), + vreinterpretq_s32_u32(p3)); + store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4), + vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6), + vreinterpretq_s32_u32(p7)); +} + +static INLINE void boxsum2_square_sum_calc( + int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5, + int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10, + int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) { + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; + int32x4_t r12, r34, r67, r89, r1011; + int32x4_t r345, r6789, r789; + + d1 = vmull_s16(t1, t1); + d2 = vmull_s16(t2, t2); + d3 = vmull_s16(t3, t3); + d4 = vmull_s16(t4, t4); + d5 = vmull_s16(t5, t5); + d6 = vmull_s16(t6, t6); + d7 = vmull_s16(t7, t7); + d8 = vmull_s16(t8, t8); + d9 = vmull_s16(t9, t9); + d10 = vmull_s16(t10, t10); + d11 = vmull_s16(t11, t11); + + r12 = vaddq_s32(d1, d2); + r34 = vaddq_s32(d3, d4); + r67 = vaddq_s32(d6, d7); + r89 = vaddq_s32(d8, d9); + r1011 = vaddq_s32(d10, d11); + r345 = vaddq_s32(r34, d5); + r6789 = vaddq_s32(r67, r89); + r789 = vsubq_s32(r6789, d6); + *r0 = vaddq_s32(r12, r345); + *r1 = vaddq_s32(r67, r345); + *r2 = vaddq_s32(d5, r6789); + *r3 = vaddq_s32(r789, r1011); +} + +static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, + int32_t *dst32, int32_t *dst2, const int dst_stride, + const int width, const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *dst1_16_ptr, *src_ptr; + int32_t *dst2_ptr; + int h, w, count = 0; + const int dst_stride_2 = (dst_stride << 1); + const int dst_stride_8 = (dst_stride << 3); + + dst1_16_ptr = dst16; + dst2_ptr = dst2; + src_ptr = src; + w = width; + { + int16x8_t t1, t2, t3, t4, t5, t6, t7; + int16x8_t t8, t9, t10, t11, t12; + + int16x8_t q12345, q56789, q34567, q7891011; + int16x8_t q12, q34, q67, q89, q1011; + int16x8_t q345, q6789, q789; + + int32x4_t r12345, r56789, r34567, r7891011; + + do { + h = height; + dst1_16_ptr = dst16 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + + dst1_16_ptr += dst_stride_2; + dst2_ptr += dst_stride_2; + do { + load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12); + + q12 = vaddq_s16(t1, t2); + q34 = vaddq_s16(t3, t4); + q67 = vaddq_s16(t6, t7); + q89 = vaddq_s16(t8, t9); + q1011 = vaddq_s16(t10, t11); + q345 = vaddq_s16(q34, t5); + q6789 = vaddq_s16(q67, q89); + q789 = vaddq_s16(q89, t7); + q12345 = vaddq_s16(q12, q345); + q34567 = vaddq_s16(q67, q345); + q56789 = vaddq_s16(t5, q6789); + q7891011 = vaddq_s16(q789, q1011); + + store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789, + q7891011); + dst1_16_ptr += dst_stride_8; + + boxsum2_square_sum_calc( + vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3), + vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6), + vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9), + vget_low_s16(t10), vget_low_s16(t11), &r12345, 
&r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011); + + boxsum2_square_sum_calc( + vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3), + vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6), + vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9), + vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789, + r7891011); + dst2_ptr += (dst_stride_8); + h -= 8; + } while (h > 0); + w -= 8; + count++; + } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst16, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst16, dst2 buffer which is not filled in first stage. + for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } + } + + { + int16x4_t s1, s2, s3, s4, s5, s6, s7, s8; + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int32x4_t q12345, q34567, q23456, q45678; + int32x4_t q23, q45, q67; + int32x4_t q2345, q4567; + + int32x4_t r12345, r34567, r23456, r45678; + int32x4_t r23, r45, r67; + int32x4_t r2345, r4567; + + int32_t *src2_ptr, *dst1_32_ptr; + int16_t *src1_ptr; + count = 0; + h = height; + do { + dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2); + dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2); + src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + w = width; + + dst1_32_ptr += 2; + dst2_ptr += 2; + load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4); + transpose_elems_inplace_s16_4x4(&s1, &s2, &s3, &s4); + load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4); + transpose_elems_inplace_s32_4x4(&d1, &d2, &d3, &d4); + do { + src1_ptr += 4; + src2_ptr += 4; + load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8); + transpose_elems_inplace_s16_4x4(&s5, &s6, &s7, &s8); + load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8); + transpose_elems_inplace_s32_4x4(&d5, &d6, &d7, &d8); + q23 = vaddl_s16(s2, s3); + q45 = vaddl_s16(s4, s5); + q67 = vaddl_s16(s6, s7); + q2345 = vaddq_s32(q23, q45); + q4567 = vaddq_s32(q45, q67); + q12345 = vaddq_s32(vmovl_s16(s1), q2345); + q23456 = vaddq_s32(q2345, vmovl_s16(s6)); + q34567 = vaddq_s32(q4567, vmovl_s16(s3)); + q45678 = vaddq_s32(q4567, vmovl_s16(s8)); + + transpose_elems_inplace_s32_4x4(&q12345, &q23456, &q34567, &q45678); + store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567, + q45678); + dst1_32_ptr += 4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + + r23 = vaddq_s32(d2, d3); + r45 = vaddq_s32(d4, d5); + r67 = vaddq_s32(d6, d7); + r2345 = vaddq_s32(r23, r45); + r4567 = vaddq_s32(r45, r67); + r12345 = vaddq_s32(d1, r2345); + r23456 = vaddq_s32(r2345, d6); + r34567 = vaddq_s32(r4567, d3); + r45678 = vaddq_s32(r4567, d8); + + transpose_elems_inplace_s32_4x4(&r12345, &r23456, &r34567, &r45678); + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678); + dst2_ptr += 4; + d1 = d5; + d2 = d6; + d3 = d7; + d4 = d8; + w -= 4; + } while (w > 0); + h -= 8; + count++; + } while (h > 0); + } +} + +static INLINE void 
calc_ab_internal_lbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int r, + const int s, const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7; + + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + dst_A16 = A16 + (count << 2) * buf_stride; + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + w = width; + do { + load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3); + load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s16_4 = s16_0; + s16_5 = s16_1; + s16_6 = s16_2; + s16_7 = s16_3; + + calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int bit_depth, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint16x8_t s16_0, s16_1, s16_2, s16_3; + uint16x8_t s16_4, s16_5, s16_6, s16_7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec); + s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec); + s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec); + + s16_4 = vrshlq_u16(s16_0, bd_min_2_vec); + s16_5 = vrshlq_u16(s16_1, bd_min_2_vec); + s16_6 = vrshlq_u16(s16_2, bd_min_2_vec); + s16_7 = 
vrshlq_u16(s16_3, bd_min_2_vec); + + calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vreinterpretq_u32_s32(sr0); + s1 = vreinterpretq_u32_s32(sr1); + s2 = vreinterpretq_u32_s32(sr2); + s3 = vreinterpretq_u32_s32(sr3); + s4 = vreinterpretq_u32_s32(sr4); + s5 = vreinterpretq_u32_s32(sr5); + s6 = vreinterpretq_u32_s32(sr6); + s7 = vreinterpretq_u32_s32(sr7); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int bit_depth, const int r, + const int s, const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec); + s6 = 
vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec); + s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, + int32_t *dst2, const int dst_stride, const int width, + const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *src_ptr; + int32_t *dst2_ptr; + uint16_t *dst1_ptr; + int h, w, count = 0; + + w = width; + { + int16x8_t s1, s2, s3, s4, s5, s6, s7, s8; + int16x8_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r345, r456, r567, r78, r678; + int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high; + int32x4_t r2, r3, r5, r6, r7, r8; + int16x8_t q678, q78; + + do { + dst1_ptr = dst1 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + h = height; + + load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + src_ptr += 4 * src_stride; + + q23 = vaddq_s16(s2, s3); + q234 = vaddq_s16(q23, s4); + q34 = vaddq_s16(s3, s4); + dst1_ptr += (dst_stride << 1); + + r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2)); + r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3)); + r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_low = vaddq_s32(r23, r4_low); + r34_low = vaddq_s32(r3, r4_low); + + r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2)); + r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3)); + r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_high = vaddq_s32(r23, r4_high); + r34_high = vaddq_s32(r3, r4_high); + + dst2_ptr += (dst_stride << 1); + + do { + load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + src_ptr += 4 * src_stride; + + q345 = vaddq_s16(s5, q34); + q56 = vaddq_s16(s5, s6); + q456 = vaddq_s16(s4, q56); + q567 = vaddq_s16(s7, q56); + q78 = vaddq_s16(s7, s8); + q678 = vaddq_s16(s6, q78); + + store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += (dst_stride << 2); + + s4 = s8; + q34 = q78; + q234 = q678; + + r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5)); + r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6)); + r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7)); + r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8)); + + r345 = vaddq_s32(r5, r34_low); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_low, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567); + + r4_low = r8; + r34_low = r78; + r234_low = r678; + + r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5)); + r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6)); + r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7)); + r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8)); + + r345 = vaddq_s32(r5, r34_high); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_high, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567); + dst2_ptr += (dst_stride << 2); + + r4_high = r8; + r34_high = r78; + r234_high = r678; + + h -= 4; + } while (h > 0); + w -= 8; + 
count++; + } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst1, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst1, dst2 buffer which is not filled in first stage. + for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } + } + + { + int16x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int16x4_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678; + int32x4_t r1, r2, r3, r4, r5, r6, r7, r8; + int16x4_t q678, q78; + + int32_t *src2_ptr; + uint16_t *src1_ptr; + count = 0; + h = height; + w = width; + do { + dst1_ptr = dst1 + (count << 2) * dst_stride; + dst2_ptr = dst2 + (count << 2) * dst_stride; + src1_ptr = dst1 + (count << 2) * dst_stride; + src2_ptr = dst2 + (count << 2) * dst_stride; + w = width; + + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4); + transpose_elems_inplace_s16_4x4(&d1, &d2, &d3, &d4); + load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4); + transpose_elems_inplace_s32_4x4(&r1, &r2, &r3, &r4); + src1_ptr += 4; + src2_ptr += 4; + + q23 = vadd_s16(d2, d3); + q234 = vadd_s16(q23, d4); + q34 = vadd_s16(d3, d4); + dst1_ptr += 2; + r23 = vaddq_s32(r2, r3); + r234 = vaddq_s32(r23, r4); + r34 = vaddq_s32(r3, r4); + dst2_ptr += 2; + + do { + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8); + transpose_elems_inplace_s16_4x4(&d5, &d6, &d7, &d8); + load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8); + transpose_elems_inplace_s32_4x4(&r5, &r6, &r7, &r8); + src1_ptr += 4; + src2_ptr += 4; + + q345 = vadd_s16(d5, q34); + q56 = vadd_s16(d5, d6); + q456 = vadd_s16(d4, q56); + q567 = vadd_s16(d7, q56); + q78 = vadd_s16(d7, d8); + q678 = vadd_s16(d6, q78); + transpose_elems_inplace_s16_4x4(&q234, &q345, &q456, &q567); + store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += 4; + + d4 = d8; + q34 = q78; + q234 = q678; + + r345 = vaddq_s32(r5, r34); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + transpose_elems_inplace_s32_4x4(&r234, &r345, &r456, &r567); + store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567); + dst2_ptr += 4; + + r4 = r8; + r34 = r78; + r234 = r678; + w -= 4; + } while (w > 0); + h -= 4; + count++; + } while (h > 0); + } +} + +static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + int32x4_t fours, threes, res; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xl = vld1q_s32(buf - 1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fours = vaddq_s32(xl, vaddq_s32(xt, vaddq_s32(xr, vaddq_s32(xb, x)))); + threes = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + res = vsubq_s32(vshlq_n_s32(vaddq_s32(fours, threes), 2), threes); + return res; +} + +static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t 
*a1) { + uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + uint16x8_t r0, r1; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xb = vaddq_u16(xb, x); + xt = vaddq_u16(xt, xr); + xl = vaddq_u16(xl, xb); + xl = vaddq_u16(xl, xt); + + r0 = vshlq_n_u16(xl, 2); + + xbl = vaddq_u16(xbl, xbr); + xtl = vaddq_u16(xtl, xtr); + xtl = vaddq_u16(xtl, xbl); + + r1 = vshlq_n_u16(xtl, 2); + r1 = vsubq_u16(r1, xtl); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(r0)), vmovl_u16(vget_low_u16(r1)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1)))); +} + +static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xbr, xb, xbl; + int32x4_t fives, sixes, fives_plus_sixes; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fives = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + sixes = vaddq_s32(xt, xb); + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t *a1) { + uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xbr = vaddq_u16(xbr, xbl); + xtr = vaddq_u16(xtr, xtl); + xbr = vaddq_u16(xbr, xtr); + xtl = vshlq_n_u16(xbr, 2); + xbr = vaddq_u16(xtl, xbr); + + xb = vaddq_u16(xb, xt); + xb0 = vshlq_n_u16(xb, 1); + xb = vshlq_n_u16(xb, 2); + xb = vaddq_u16(xb, xb0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xbr)), vmovl_u16(vget_low_u16(xb)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb)))); +} + +static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) { + int32x4_t xl, x, xr; + int32x4_t fives, sixes, fives_plus_sixes; + + xl = vld1q_s32(buf - 1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + fives = vaddq_s32(xl, xr); + sixes = x; + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, + int32x4_t *a1) { + uint16x8_t xl, x, xr; + uint16x8_t x0; + + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xl = vaddq_u16(xl, xr); + x0 = vshlq_n_u16(xl, 2); + xl = vaddq_u16(xl, x0); + + x0 = vshlq_n_u16(x, 1); + x = vshlq_n_u16(x, 2); + x = vaddq_u16(x, x0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xl)), vmovl_u16(vget_low_u16(x)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x)))); +} + +static void final_filter_fast_internal(uint16_t *A, int32_t *B, + const int buf_stride, int16_t *src, + const int src_stride, int32_t *dst, + const int dst_stride, 
const int width, + const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + + A_tmp = A; + B_tmp = B; + src_ptr = src; + dst_ptr = dst; + h = height; + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + if (!(count & 1)) { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_even_row_inp16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_even_row(B_tmp, buf_stride); + b_res1 = cross_sum_fast_even_row(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } else { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_odd_row_inp16(A_tmp, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_odd_row(B_tmp); + b_res1 = cross_sum_fast_odd_row(B_tmp + 4); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } + count++; + h -= 1; + } while (h > 0); +} + +void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, + int16_t *src, const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + h = height; + + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + do { + s0 = vld1q_s16(src_ptr); + cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_inp_s32(B_tmp, buf_stride); + b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + count++; + h -= 1; + } while (h > 0); +} + +static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, + int height, int dgd_stride, + int32_t *dst, int dst_stride, + int bit_depth, int sgr_params_idx, + int 
radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + const int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + int32_t *sum_buf = B_; + uint16_t *tmp16_buf = A16_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 0); + assert(r == 2); + + // input(dgd16) is 16bit. + // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is + // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit + // buffer(square_sum_buf). + boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride, + width_ext, height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. + +#if CONFIG_AV1_HIGHBITDEPTH + if (bit_depth > 8) { + calc_ab_fast_internal_hbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, + bit_depth, r, params->s[radius_idx], 2); + } else { + calc_ab_fast_internal_lbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, + params->s[radius_idx], 2); + } +#else + (void)bit_depth; + calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1), + (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, + width + 2, height + 2, r, params->s[radius_idx], 2); +#endif + final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, + dgd_stride, dst, dst_stride, width, height); +} + +static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + uint16_t B16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + uint16_t *sum_buf = B16_; + uint16_t *A16 = A16_; + int32_t *B = B_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 1); + assert(r == 1); + + // input(dgd16) is 16bit. + // sum of pixels output will be in 16bit(sum_buf). + // sum of squares output is kept in 32bit buffer(square_sum_buf). 
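+  // Note (descriptive, derived from the code below): boxsum1() computes the
+  // r == 1 (3x3) box sums in two separable passes -- a vertical pass that
+  // accumulates 3 consecutive rows of pixels and of squared pixels, then a
+  // horizontal pass over the transposed intermediate that accumulates 3
+  // consecutive columns -- so each output holds the sum (and sum of squares)
+  // of a 3x3 neighbourhood of the SGRPROJ_BORDER-padded input.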
+ boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext, + height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + +#if CONFIG_AV1_HIGHBITDEPTH + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. + if (bit_depth > 8) { + calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, bit_depth, r, params->s[radius_idx], 1); + } else { + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, r, params->s[radius_idx], 1); + } +#else + (void)bit_depth; + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, height + 2, + r, params->s[radius_idx], 1); +#endif + final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, + dst_stride, width, height); +} + +static INLINE void src_convert_u8_to_u16(const uint8_t *src, + const int src_stride, uint16_t *dst, + const int dst_stride, const int width, + const int height) { + const uint8_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + + uint8x8_t t1, t2, t3, t4; + uint16x8_t s1, s2, s3, s4; + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + if (w >= 7) { + do { + load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + s1 = vmovl_u8(t1); + s2 = vmovl_u8(t2); + s3 = vmovl_u8(t3); + s4 = vmovl_u8(t4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + } + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + for (int x = 0; x < h; x++) { + for (int y = 0; y < width; y++) { + dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride]; + } + } + + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. 
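+  // Five rows starting at 'height' are cleared: the first-stage boxsum loops
+  // step through the source several rows at a time and may read slightly past
+  // the converted region, and zeroed rows keep any such over-read well
+  // defined.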
+ for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, + uint16_t *dst, const int dst_stride, + int width, int height) { + const uint16_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + uint16x8_t s1, s2, s3, s4; + + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + do { + load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + + for (int x = 0; x < h; x++) { + memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride), + sizeof(uint16_t) * width); + } + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, + int stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + assert(!(params->r[0] == 0 && params->r[1] == 0)); + + uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif + + if (params->r[0] > 0) + restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, + flt_stride, bit_depth, sgr_params_idx, 0); + if (params->r[1] > 0) + restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride, + bit_depth, sgr_params_idx, 1); + return 0; +} + +int av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + uint16_t 
dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + + assert(!(params->r[0] == 0 && params->r[1] == 0)); + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif + if (params->r[0] > 0) + restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width, + bit_depth, eps, 0); + if (params->r[1] > 0) + restoration_internal(dgd16, width, height, dgd16_stride, flt1, width, + bit_depth, eps, 1); + + av1_decode_xq(xqd, xq, params); + + { + int16_t *src_ptr; + uint8_t *dst_ptr; +#if CONFIG_AV1_HIGHBITDEPTH + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8); + uint16_t *dst16_ptr; +#endif + int16x4_t d0, d4; + int16x8_t r0, s0; + uint16x8_t r4; + int32x4_t u0, u4, v0, v4, f00, f10; + uint8x8_t t0; + int count = 0, w = width, h = height, rc = 0; + + const int32x4_t xq0_vec = vdupq_n_s32(xq[0]); + const int32x4_t xq1_vec = vdupq_n_s32(xq[1]); + const int16x8_t zero = vdupq_n_s16(0); + const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1); + src_ptr = (int16_t *)dgd16; + do { + w = width; + count = 0; + dst_ptr = dst8 + rc * dst_stride; +#if CONFIG_AV1_HIGHBITDEPTH + dst16_ptr = dst16 + rc * dst_stride; +#endif + do { + s0 = vld1q_s16(src_ptr + count); + + u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS); + u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS); + + v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS); + v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + f00 = vld1q_s32(flt0 + count); + f10 = vld1q_s32(flt0 + count + 4); + + f00 = vsubq_s32(f00, u0); + f10 = vsubq_s32(f10, u4); + + v0 = vmlaq_s32(v0, xq0_vec, f00); + v4 = vmlaq_s32(v4, xq0_vec, f10); + } + + if (params->r[1] > 0) { + f00 = vld1q_s32(flt1 + count); + f10 = vld1q_s32(flt1 + count + 4); + + f00 = vsubq_s32(f00, u0); + f10 = vsubq_s32(f10, u4); + + v0 = vmlaq_s32(v0, xq1_vec, f00); + v4 = vmlaq_s32(v4, xq1_vec, f10); + } + + d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + r0 = vcombine_s16(d0, d4); + + r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero)); + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + r4 = vminq_u16(r4, max); + vst1q_u16(dst16_ptr, r4); + dst16_ptr += 8; + } else { + t0 = vqmovn_u16(r4); + vst1_u8(dst_ptr, t0); + dst_ptr += 8; + } +#else + (void)max; + t0 = vqmovn_u16(r4); + vst1_u8(dst_ptr, t0); + dst_ptr += 8; +#endif + w -= 8; + count += 8; + } while (w > 0); + + src_ptr += dgd16_stride; + flt1 += width; + flt0 += width; + rc++; + h--; + } 
while (h > 0); + } + return 0; +} diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c new file mode 100644 index 0000000000..4723154398 --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_neon.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "warp_plane_neon.h" + +static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); + int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); + + int16x8_t m0 = vmulq_s16(f[0], in16_lo); + int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1)); + int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2)); + int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3)); + + int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), + vpaddlq_s16(m3) }; + + int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); + int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); + + int16x8_t m0 = vmulq_s16(f[0], in16_lo); + int16x8_t m1 = vmulq_s16(f[1], vextq_s16(in16_lo, in16_hi, 1)); + int16x8_t m2 = vmulq_s16(f[2], vextq_s16(in16_lo, in16_hi, 2)); + int16x8_t m3 = vmulq_s16(f[3], vextq_s16(in16_lo, in16_hi, 3)); + int16x8_t m4 = vmulq_s16(f[4], vextq_s16(in16_lo, in16_hi, 4)); + int16x8_t m5 = vmulq_s16(f[5], vextq_s16(in16_lo, in16_hi, 5)); + int16x8_t m6 = vmulq_s16(f[6], vextq_s16(in16_lo, in16_hi, 6)); + int16x8_t m7 = vmulq_s16(f[7], vextq_s16(in16_lo, in16_hi, 7)); + + int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), + vpaddlq_s16(m3) }; + int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6), + vpaddlq_s16(m7) }; + + int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); + int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + 
vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); + int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); + + int16x8_t m0 = vmulq_s16(f_s16, in16_lo); + int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1)); + int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2)); + int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3)); + + int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), + vpaddlq_s16(m3) }; + + int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int16x8_t in16_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(in))); + int16x8_t in16_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(in))); + + int16x8_t m0 = vmulq_s16(f_s16, in16_lo); + int16x8_t m1 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 1)); + int16x8_t m2 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 2)); + int16x8_t m3 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 3)); + int16x8_t m4 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 4)); + int16x8_t m5 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 5)); + int16x8_t m6 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 6)); + int16x8_t m7 = vmulq_s16(f_s16, vextq_s16(in16_lo, in16_hi, 7)); + + int32x4_t m0123_pairs[] = { vpaddlq_s16(m0), vpaddlq_s16(m1), vpaddlq_s16(m2), + vpaddlq_s16(m3) }; + int32x4_t m4567_pairs[] = { vpaddlq_s16(m4), vpaddlq_s16(m5), vpaddlq_s16(m6), + vpaddlq_s16(m7) }; + + int32x4_t tmp_res_low = horizontal_add_4d_s32x4(m0123_pairs); + int32x4_t tmp_res_high = horizontal_add_4d_s32x4(m4567_pairs); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, + int sy) { + int16x4_t s0 = vget_low_s16(src[0]); + int16x4_t s1 = vget_low_s16(src[1]); + int16x4_t s2 = vget_low_s16(src[2]); + int16x4_t s3 = vget_low_s16(src[3]); + int16x4_t s4 = vget_low_s16(src[4]); + int16x4_t s5 = vget_low_s16(src[5]); + int16x4_t s6 = vget_low_s16(src[6]); + int16x4_t s7 = vget_low_s16(src[7]); + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); + + *res = m0123; +} + +static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, + int sy, int gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(src[0]), vget_low_s16(src[1]), 
vget_low_s16(src[2]), + vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]), + vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + + int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; + + *res = horizontal_add_4d_s32x4(m0123_pairs); +} + +static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); + + *res_low = m0123; + *res_high = m4567; +} + +static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4])); + m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4])); + 
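+  // m0..m3 above and m4..m7 below each hold the lane-wise products of one
+  // transposed input column with that column's 8-tap filter;
+  // horizontal_add_4d_s32x4() then reduces each group to a single 32-bit dot
+  // product per output pixel.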
int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5])); + m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5])); + int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6])); + m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6])); + int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7])); + m7 = vmlal_s16(m7, vget_high_s16(s7), vget_high_s16(f[7])); + + int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; + int32x4_t m4567_pairs[] = { m4, m5, m6, m7 }; + + *res_low = horizontal_add_4d_s32x4(m0123_pairs); + *res_high = horizontal_add_4d_s32x4(m4567_pairs); +} + +void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, conv_params, alpha, beta, gamma, delta); +} diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.h b/third_party/aom/av1/common/arm/warp_plane_neon.h new file mode 100644 index 0000000000..5afd72f4ab --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_neon.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_ +#define AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "config/av1_rtcd.h" +#include "av1/common/warped_motion.h" +#include "av1/common/scale.h" + +static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, + int alpha); + +static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, + int alpha); + +static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx); + +static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx); + +static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, + int sy); + +static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, + int sy, int gamma); + +static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy); + +static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma); + +static INLINE void load_filters_4(int16x8_t out[], int offset, int stride) { + out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >> + WARPEDDIFF_PREC_BITS))); +} + +static INLINE void load_filters_8(int16x8_t out[], int offset, int stride) { + out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[2] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 2 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[3] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 3 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[4] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 4 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[5] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 5 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[6] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 6 * stride) >> + WARPEDDIFF_PREC_BITS))); + out[7] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 7 * stride) >> + WARPEDDIFF_PREC_BITS))); +} + +static INLINE int clamp_iy(int iy, int height) { + return clamp(iy, 0, height - 1); +} + +static INLINE void warp_affine_horizontal(const uint8_t *ref, int width, + int height, int stride, int p_width, + int p_height, int16_t alpha, + int16_t beta, const int64_t x4, + const int64_t y4, const int i, + int16x8_t tmp[]) { + const int bd = 8; + const int reduce_bits_horiz = ROUND0_BITS; + const int height_limit = AOMMIN(8, p_height - i) + 7; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + if (ix4 <= -7) { + for (int k = 0; k < height_limit; ++k) { + int iy = clamp_iy(iy4 + k - 7, height); + int16_t dup_val = + (1 
<< (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } else if (ix4 >= width + 6) { + for (int k = 0; k < height_limit; ++k) { + int iy = clamp_iy(iy4 + k - 7, height); + int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } + + static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + const uint8x16_t indx = vld1q_u8(kIotaArr); + + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + +#define APPLY_HORIZONTAL_SHIFT(fn, ...) \ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < height_limit; ++k) { \ + const int iy = clamp_iy(iy4 + k - 7, height); \ + const uint8_t *src = ref + iy * stride + ix4 - 7; \ + uint8x16_t src_1 = vld1q_u8(src); \ + \ + if (out_of_boundary_left >= 0) { \ + int limit = out_of_boundary_left + 1; \ + uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left); \ + uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ + uint8x16_t mask_val = vcleq_u8(indx, cmp_vec); \ + src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ + } \ + if (out_of_boundary_right >= 0) { \ + int limit = 15 - (out_of_boundary_right + 1); \ + uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); \ + uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ + uint8x16_t mask_val = vcgeq_u8(indx, cmp_vec); \ + src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ + } \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < height_limit; ++k) { \ + const int iy = clamp_iy(iy4 + k - 7, height); \ + const uint8_t *src = ref + iy * stride + ix4 - 7; \ + uint8x16_t src_1 = vld1q_u8(src); \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } \ + } while (0) + + if (p_width == 4) { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)), + alpha); + } + } + } else { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)), + alpha); + } + } + } +} + +static INLINE void warp_affine_vertical( + uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound, + uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg, + int16_t gamma, int16_t delta, const int64_t y4, const int i, const int j, + int16x8_t tmp[], const int fwd, const int bwd) { + const int bd = 8; + const int reduce_bits_horiz = ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + int add_const_vert; + if (is_compound) { + add_const_vert = + (1 << offset_bits_vert) + (1 << (COMPOUND_ROUND1_BITS - 1)); + } else { + add_const_vert = + (1 << offset_bits_vert) + (1 << (2 * FILTER_BITS - ROUND0_BITS - 1)); + } + const int sub_constant = (1 << (bd - 1)) + (1 << bd); + + const int offset_bits = bd + 
2 * FILTER_BITS - ROUND0_BITS; + const int res_sub_const = + (1 << (2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - 1)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS)) - + (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); + + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + if (p_width > 4) { + for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + const int16x8_t *v_src = tmp + (k + 4); + + int32x4_t res_lo, res_hi; + if (gamma == 0) { + vertical_filter_8x1_f1(v_src, &res_lo, &res_hi, sy); + } else { + vertical_filter_8x1_f8(v_src, &res_lo, &res_hi, sy, gamma); + } + + res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert)); + res_hi = vaddq_s32(res_hi, vdupq_n_s32(add_const_vert)); + + if (is_compound) { + uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j]; + int16x8_t res_s16 = + vcombine_s16(vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS), + vshrn_n_s32(res_hi, COMPOUND_ROUND1_BITS)); + if (do_average) { + int16x8_t tmp16 = vreinterpretq_s16_u16(vld1q_u16(p)); + if (use_dist_wtd_comp_avg) { + int32x4_t tmp32_lo = vmull_n_s16(vget_low_s16(tmp16), fwd); + int32x4_t tmp32_hi = vmull_n_s16(vget_high_s16(tmp16), fwd); + tmp32_lo = vmlal_n_s16(tmp32_lo, vget_low_s16(res_s16), bwd); + tmp32_hi = vmlal_n_s16(tmp32_hi, vget_high_s16(res_s16), bwd); + tmp16 = vcombine_s16(vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS), + vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS)); + } else { + tmp16 = vhaddq_s16(tmp16, res_s16); + } + int16x8_t res = vaddq_s16(tmp16, vdupq_n_s16(res_sub_const)); + uint8x8_t res8 = vqshrun_n_s16( + res, 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); + vst1_u8(&pred[(i + k + 4) * p_stride + j], res8); + } else { + vst1q_u16(p, vreinterpretq_u16_s16(res_s16)); + } + } else { + int16x8_t res16 = + vcombine_s16(vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS), + vshrn_n_s32(res_hi, 2 * FILTER_BITS - ROUND0_BITS)); + res16 = vsubq_s16(res16, vdupq_n_s16(sub_constant)); + + uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j]; + vst1_u8(p, vqmovun_s16(res16)); + } + } + } else { + // p_width == 4 + for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + const int16x8_t *v_src = tmp + (k + 4); + + int32x4_t res_lo; + if (gamma == 0) { + vertical_filter_4x1_f1(v_src, &res_lo, sy); + } else { + vertical_filter_4x1_f4(v_src, &res_lo, sy, gamma); + } + + res_lo = vaddq_s32(res_lo, vdupq_n_s32(add_const_vert)); + + if (is_compound) { + uint16_t *const p = (uint16_t *)&dst[(i + k + 4) * dst_stride + j]; + + int16x4_t res_lo_s16 = vshrn_n_s32(res_lo, COMPOUND_ROUND1_BITS); + if (do_average) { + uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j]; + int16x4_t tmp16_lo = vreinterpret_s16_u16(vld1_u16(p)); + if (use_dist_wtd_comp_avg) { + int32x4_t tmp32_lo = vmull_n_s16(tmp16_lo, fwd); + tmp32_lo = vmlal_n_s16(tmp32_lo, res_lo_s16, bwd); + tmp16_lo = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS); + } else { + tmp16_lo = vhadd_s16(tmp16_lo, res_lo_s16); + } + int16x4_t res = vadd_s16(tmp16_lo, vdup_n_s16(res_sub_const)); + uint8x8_t res8 = vqshrun_n_s16( + vcombine_s16(res, vdup_n_s16(0)), + 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS); + vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res8), 0); + } else { + uint16x4_t res_u16_low = vreinterpret_u16_s16(res_lo_s16); + vst1_u16(p, res_u16_low); 
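[Editorial note -- not part of the upstream patch] The compound branches above keep 16-bit intermediate values in the CONV_BUF (dst); when do_average is set, the stored prediction is blended with the new warped result either by a plain halving average (vhadd) or, when use_dist_wtd_comp_avg is set, by the fwd/bwd distance weights. A scalar sketch of that blend, assuming (as for libaom's distance-weighted compound weights) that fwd + bwd == 1 << DIST_PRECISION_BITS; dst_val/res_val/blended are illustrative names only:

    // dst_val: previously stored prediction, res_val: current warped result.
    int blended = use_dist_wtd_comp_avg
                      ? (dst_val * fwd + res_val * bwd) >> DIST_PRECISION_BITS
                      : (dst_val + res_val) >> 1;  // matches vhadd above
    // res_sub_const then cancels the compound offset folded in through
    // add_const_vert before the final saturating narrow to 8 bits.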
+ } + } else { + int16x4_t res16 = vshrn_n_s32(res_lo, 2 * FILTER_BITS - ROUND0_BITS); + res16 = vsub_s16(res16, vdup_n_s16(sub_constant)); + + uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j]; + uint8x8_t val = vqmovun_s16(vcombine_s16(res16, vdup_n_s16(0))); + vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0); + } + } + } +} + +static INLINE void av1_warp_affine_common( + const int32_t *mat, const uint8_t *ref, int width, int height, int stride, + uint8_t *pred, int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const int is_compound = conv_params->is_compound; + uint16_t *const dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + assert(IMPLIES(is_compound, dst != NULL)); + assert(IMPLIES(do_average, is_compound)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int16x8_t tmp[15]; + warp_affine_horizontal(ref, width, height, stride, p_width, p_height, + alpha, beta, x4, y4, i, tmp); + warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst, + dst_stride, do_average, use_dist_wtd_comp_avg, gamma, + delta, y4, i, j, tmp, w0, w1); + } + } +} + +#endif // AOM_AV1_COMMON_ARM_WARP_PLANE_NEON_H_ diff --git a/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c new file mode 100644 index 0000000000..39e3ad99f4 --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_neon_i8mm.c @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "warp_plane_neon.h" + +DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); + int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); + + uint8x8_t in0 = vget_low_u8(in); + uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); + uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); + uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); + + int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8); + int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8); + + int32x4_t tmp_res_low = vpaddq_s32(m01, m23); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); + int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); + int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5])); + int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7])); + + uint8x8_t in0 = vget_low_u8(in); + uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); + uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); + uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); + uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4)); + uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5)); + uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6)); + uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7)); + + int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8); + int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8); + int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8); + int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8); + + int32x4_t tmp_res_low = vpaddq_s32(m01, m23); + int32x4_t tmp_res_high = vpaddq_s32(m45, m67); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); + + uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); + uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); + uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); + + int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0); + m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); + + int32x4_t tmp_res_low = m0123; + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); + + uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); + uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); + uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); + uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); + uint8x16_t in_89ab = vqtbl1q_u8(in, perm2); + + int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0); + m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); + + int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0); + m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1); + + int32x4_t tmp_res_low = m0123; + int32x4_t tmp_res_high = m4567; + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, + int sy) { + int16x4_t s0 = vget_low_s16(src[0]); + int16x4_t s1 = vget_low_s16(src[1]); + int16x4_t s2 = vget_low_s16(src[2]); + int16x4_t s3 = vget_low_s16(src[3]); + int16x4_t s4 = vget_low_s16(src[4]); + int16x4_t s5 = vget_low_s16(src[5]); + int16x4_t s6 = vget_low_s16(src[6]); + int16x4_t s7 = vget_low_s16(src[7]); + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); + + *res = m0123; +} + +static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, + int sy, int gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), + vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]), + vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = 
vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + + int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; + + *res = horizontal_add_4d_s32x4(m0123_pairs); +} + +static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); + + *res_low = m0123; + *res_high = m4567; +} + +static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int32x4_t m0 = vmull_s16(vget_low_s16(s0), vget_low_s16(f[0])); + m0 = vmlal_s16(m0, vget_high_s16(s0), vget_high_s16(f[0])); + int32x4_t m1 = vmull_s16(vget_low_s16(s1), vget_low_s16(f[1])); + m1 = vmlal_s16(m1, vget_high_s16(s1), vget_high_s16(f[1])); + int32x4_t m2 = vmull_s16(vget_low_s16(s2), vget_low_s16(f[2])); + m2 = vmlal_s16(m2, vget_high_s16(s2), vget_high_s16(f[2])); + int32x4_t m3 = vmull_s16(vget_low_s16(s3), vget_low_s16(f[3])); + m3 = vmlal_s16(m3, vget_high_s16(s3), vget_high_s16(f[3])); + int32x4_t m4 = vmull_s16(vget_low_s16(s4), vget_low_s16(f[4])); + m4 = vmlal_s16(m4, vget_high_s16(s4), vget_high_s16(f[4])); + int32x4_t m5 = vmull_s16(vget_low_s16(s5), vget_low_s16(f[5])); + m5 = vmlal_s16(m5, vget_high_s16(s5), vget_high_s16(f[5])); + int32x4_t m6 = vmull_s16(vget_low_s16(s6), vget_low_s16(f[6])); + m6 = vmlal_s16(m6, vget_high_s16(s6), vget_high_s16(f[6])); + int32x4_t m7 = vmull_s16(vget_low_s16(s7), vget_low_s16(f[7])); + m7 = vmlal_s16(m7, 
vget_high_s16(s7), vget_high_s16(f[7])); + + int32x4_t m0123_pairs[] = { m0, m1, m2, m3 }; + int32x4_t m4567_pairs[] = { m4, m5, m6, m7 }; + + *res_low = horizontal_add_4d_s32x4(m0123_pairs); + *res_high = horizontal_add_4d_s32x4(m4567_pairs); +} + +void av1_warp_affine_neon_i8mm(const int32_t *mat, const uint8_t *ref, + int width, int height, int stride, uint8_t *pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params, + int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, conv_params, alpha, beta, gamma, delta); +} diff --git a/third_party/aom/av1/common/arm/warp_plane_sve.c b/third_party/aom/av1/common/arm/warp_plane_sve.c new file mode 100644 index 0000000000..8a4bf5747b --- /dev/null +++ b/third_party/aom/av1/common/arm/warp_plane_sve.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/arm/dot_sve.h" +#include "warp_plane_neon.h" + +DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); + int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); + + uint8x8_t in0 = vget_low_u8(in); + uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); + uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); + uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); + + int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8); + int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8); + + int32x4_t tmp_res_low = vpaddq_s32(m01, m23); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, + int alpha) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + // Loading the 8 filter taps + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int8x16_t f01_u8 = vcombine_s8(vmovn_s16(f[0]), vmovn_s16(f[1])); + int8x16_t f23_u8 = vcombine_s8(vmovn_s16(f[2]), vmovn_s16(f[3])); + int8x16_t f45_u8 = vcombine_s8(vmovn_s16(f[4]), vmovn_s16(f[5])); + int8x16_t f67_u8 = vcombine_s8(vmovn_s16(f[6]), vmovn_s16(f[7])); + + uint8x8_t in0 = vget_low_u8(in); + uint8x8_t in1 = vget_low_u8(vextq_u8(in, in, 1)); + uint8x8_t in2 = vget_low_u8(vextq_u8(in, in, 2)); + uint8x8_t in3 = vget_low_u8(vextq_u8(in, in, 3)); 
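[Editorial note -- not part of the upstream patch] in0..in7 are the eight overlapping 8-pixel windows of this source row; the dot product of window n with the per-column kernel f[n] gives horizontal output n. Each vusdotq_s32 covers two output columns (four 4-element partial sums), and the vpaddq_s32 calls below complete each 8-tap sum. A scalar sketch for one output column c, ignoring the unsigned saturation of vqrshrun_n_s32 (src_row and filter_c are illustrative names, not from this file):

    int32_t sum = 1 << (8 + FILTER_BITS - 1);             // add_const above
    for (int t = 0; t < 8; ++t) sum += src_row[c + t] * filter_c[t];
    int16_t out_c = (int16_t)ROUND_POWER_OF_TWO(sum, ROUND0_BITS);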
+ uint8x8_t in4 = vget_low_u8(vextq_u8(in, in, 4)); + uint8x8_t in5 = vget_low_u8(vextq_u8(in, in, 5)); + uint8x8_t in6 = vget_low_u8(vextq_u8(in, in, 6)); + uint8x8_t in7 = vget_low_u8(vextq_u8(in, in, 7)); + + int32x4_t m01 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in0, in1), f01_u8); + int32x4_t m23 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in2, in3), f23_u8); + int32x4_t m45 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in4, in5), f45_u8); + int32x4_t m67 = vusdotq_s32(vdupq_n_s32(0), vcombine_u8(in6, in7), f67_u8); + + int32x4_t tmp_res_low = vpaddq_s32(m01, m23); + int32x4_t tmp_res_high = vpaddq_s32(m45, m67); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); + + uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); + uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); + uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); + + int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0); + m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); + + int32x4_t tmp_res_low = m0123; + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + + uint16x8_t res = + vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), vdup_n_u16(0)); + return vreinterpretq_s16_u16(res); +} + +static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { + const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); + + int16x8_t f_s16 = + vld1q_s16((int16_t *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + int8x16_t f_s8 = vcombine_s8(vmovn_s16(f_s16), vmovn_s16(f_s16)); + + uint8x16_t perm0 = vld1q_u8(&usdot_permute_idx[0]); + uint8x16_t perm1 = vld1q_u8(&usdot_permute_idx[16]); + uint8x16_t perm2 = vld1q_u8(&usdot_permute_idx[32]); + + // Permute samples ready for dot product. 
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t in_0123 = vqtbl1q_u8(in, perm0); + uint8x16_t in_4567 = vqtbl1q_u8(in, perm1); + uint8x16_t in_89ab = vqtbl1q_u8(in, perm2); + + int32x4_t m0123 = vusdotq_laneq_s32(vdupq_n_s32(0), in_0123, f_s8, 0); + m0123 = vusdotq_laneq_s32(m0123, in_4567, f_s8, 1); + + int32x4_t m4567 = vusdotq_laneq_s32(vdupq_n_s32(0), in_4567, f_s8, 0); + m4567 = vusdotq_laneq_s32(m4567, in_89ab, f_s8, 1); + + int32x4_t tmp_res_low = m0123; + int32x4_t tmp_res_high = m4567; + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(tmp_res_low, ROUND0_BITS), + vqrshrun_n_s32(tmp_res_high, ROUND0_BITS)); + return vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, + int sy) { + int16x4_t s0 = vget_low_s16(src[0]); + int16x4_t s1 = vget_low_s16(src[1]); + int16x4_t s2 = vget_low_s16(src[2]); + int16x4_t s3 = vget_low_s16(src[3]); + int16x4_t s4 = vget_low_s16(src[4]); + int16x4_t s5 = vget_low_s16(src[5]); + int16x4_t s6 = vget_low_s16(src[6]); + int16x4_t s7 = vget_low_s16(src[7]); + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(s0, vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s1, vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s2, vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s3, vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, s4, vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, s5, vget_high_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, s6, vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, s7, vget_high_s16(f), 3); + + *res = m0123; +} + +static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, + int sy, int gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), + vget_low_s16(src[3]), vget_low_s16(src[4]), vget_low_s16(src[5]), + vget_low_s16(src[6]), vget_low_s16(src[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + + *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); +} + +static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + + int16x8_t f = + vld1q_s16((int16_t *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(s0), vget_low_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s1), vget_low_s16(f), 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s2), vget_low_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s3), vget_low_s16(f), 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s4), vget_high_s16(f), 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s5), vget_high_s16(f), 1); + m0123 = 
vmlal_lane_s16(m0123, vget_low_s16(s6), vget_high_s16(f), 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(s7), vget_high_s16(f), 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(s0), vget_low_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s1), vget_low_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s2), vget_low_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s3), vget_low_s16(f), 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s4), vget_high_s16(f), 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s5), vget_high_s16(f), 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s6), vget_high_s16(f), 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(s7), vget_high_s16(f), 3); + + *res_low = m0123; + *res_high = m4567; +} + +static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { + int16x8_t s0 = src[0]; + int16x8_t s1 = src[1]; + int16x8_t s2 = src[2]; + int16x8_t s3 = src[3]; + int16x8_t s4 = src[4]; + int16x8_t s5 = src[5]; + int16x8_t s6 = src[6]; + int16x8_t s7 = src[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + *res_low = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + *res_high = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); +} + +void av1_warp_affine_sve(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + av1_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, conv_params, alpha, beta, gamma, delta); +} diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c new file mode 100644 index 0000000000..6440c16adb --- /dev/null +++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" +#include "av1/common/common.h" +#include "av1/common/restoration.h" + +static INLINE uint16x8_t wiener_convolve5_8_2d_h( + const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2, + const uint8x8_t t3, const uint8x8_t t4, const int16x4_t x_filter, + const int32x4_t round_vec, const uint16x8_t im_max_val) { + // Since the Wiener filter is symmetric about the middle tap (tap 2) add + // mirrored source elements before multiplying filter coefficients. + int16x8_t s04 = vreinterpretq_s16_u16(vaddl_u8(t0, t4)); + int16x8_t s13 = vreinterpretq_s16_u16(vaddl_u8(t1, t3)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + + // x_filter[0] = 0. (5-tap filters are 0-padded to 7 taps.) + int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), x_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), x_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), x_filter, 3); + + int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), x_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), x_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), x_filter, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS), + vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS)); + + return vminq_u16(res, im_max_val); +} + +static INLINE void convolve_add_src_horiz_5tap_neon( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, + const int32x4_t round_vec, const uint16x8_t im_max_val) { + do { + const uint8_t *s = src_ptr; + uint16_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3, s4; + load_u8_8x5(s, 1, &s0, &s1, &s2, &s3, &s4); + + uint16x8_t d0 = wiener_convolve5_8_2d_h(s0, s1, s2, s3, s4, x_filter, + round_vec, im_max_val); + + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); +} + +static INLINE uint16x8_t wiener_convolve7_8_2d_h( + const uint8x8_t t0, const uint8x8_t t1, const uint8x8_t t2, + const uint8x8_t t3, const uint8x8_t t4, const uint8x8_t t5, + const uint8x8_t t6, const int16x4_t x_filter, const int32x4_t round_vec, + const uint16x8_t im_max_val) { + // Since the Wiener filter is symmetric about the middle tap (tap 3) add + // mirrored source elements before multiplying by filter coefficients. 
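[Editorial note -- not part of the upstream patch] Because the taps are symmetric, only x_filter lanes 0..3 are needed and the seven multiplies per output collapse to four. Treating t0..t6 as single pixels, a scalar sketch of one intermediate sample (im_max stands in for im_max_val, and the bias matches horiz_round_vec set up in av1_wiener_convolve_add_src_neon below):

    int32_t sum = 1 << (bd + FILTER_BITS - 1);             // horiz_round_vec
    sum += x_filter[0] * (t0 + t6) + x_filter[1] * (t1 + t5);
    sum += x_filter[2] * (t2 + t4) + x_filter[3] * t3;     // centre tap
    int32_t im = ROUND_POWER_OF_TWO(sum, WIENER_ROUND0_BITS);
    uint16_t clamped = (uint16_t)AOMMIN(AOMMAX(im, 0), im_max);  // vminq_u16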
+ int16x8_t s06 = vreinterpretq_s16_u16(vaddl_u8(t0, t6)); + int16x8_t s15 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + int16x8_t s24 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), x_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), x_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), x_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), x_filter, 3); + + int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), x_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), x_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), x_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), x_filter, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum_lo, WIENER_ROUND0_BITS), + vqrshrun_n_s32(sum_hi, WIENER_ROUND0_BITS)); + + return vminq_u16(res, im_max_val); +} + +static INLINE void convolve_add_src_horiz_7tap_neon( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, + const int32x4_t round_vec, const uint16x8_t im_max_val) { + do { + const uint8_t *s = src_ptr; + uint16_t *d = dst_ptr; + int width = w; + + do { + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(s, 1, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = wiener_convolve7_8_2d_h(s0, s1, s2, s3, s4, s5, s6, + x_filter, round_vec, im_max_val); + + vst1q_u16(d, d0); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + } while (--h != 0); +} + +static INLINE uint8x8_t wiener_convolve5_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x4_t y_filter, + const int32x4_t round_vec) { + // Since the Wiener filter is symmetric about the middle tap (tap 2) add + // mirrored source elements before multiplying by filter coefficients. 
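[Editorial note -- not part of the upstream patch] The round_vec used by the vertical stages is more than a rounding offset. As set up in av1_wiener_convolve_add_src_neon below, it equals

    (1 << (2 * FILTER_BITS - WIENER_ROUND0_BITS - 1))
        - (1 << (bd + (2 * FILTER_BITS - WIENER_ROUND0_BITS) - 1))

The first term rounds the final >> (2 * FILTER_BITS - WIENER_ROUND0_BITS); the second removes the DC bias 1 << (bd + FILTER_BITS - 1) added in the horizontal pass, after that bias has gone through the intermediate >> WIENER_ROUND0_BITS and the vertical kernel's DC gain of 1 << FILTER_BITS:

    ((1 << (bd + FILTER_BITS - 1)) >> WIENER_ROUND0_BITS) * (1 << FILTER_BITS)
        == 1 << (bd + (2 * FILTER_BITS - WIENER_ROUND0_BITS) - 1)

This reading assumes the kernel has unit DC gain in Q(FILTER_BITS), i.e. its taps sum to 1 << FILTER_BITS.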
+ int16x8_t s04 = vaddq_s16(s0, s4); + int16x8_t s13 = vaddq_s16(s1, s3); + + int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s04), y_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s13), y_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), y_filter, 3); + + int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s04), y_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s13), y_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), y_filter, 3); + + int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS); + int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res_lo, res_hi)); +} + +static INLINE void convolve_add_src_vert_5tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, + const int32x4_t round_vec) { + do { + const int16_t *s = (int16_t *)src; + uint8_t *d = dst; + int height = h; + + while (height > 3) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + uint8x8_t d0 = + wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec); + uint8x8_t d1 = + wiener_convolve5_8_2d_v(s1, s2, s3, s4, s5, y_filter, round_vec); + uint8x8_t d2 = + wiener_convolve5_8_2d_v(s2, s3, s4, s5, s6, y_filter, round_vec); + uint8x8_t d3 = + wiener_convolve5_8_2d_v(s3, s4, s5, s6, s7, y_filter, round_vec); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } + + while (height-- != 0) { + int16x8_t s0, s1, s2, s3, s4; + load_s16_8x5(s, src_stride, &s0, &s1, &s2, &s3, &s4); + + uint8x8_t d0 = + wiener_convolve5_8_2d_v(s0, s1, s2, s3, s4, y_filter, round_vec); + + vst1_u8(d, d0); + + d += dst_stride; + s += src_stride; + } + + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static INLINE uint8x8_t wiener_convolve7_8_2d_v( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x4_t y_filter, const int32x4_t round_vec) { + // Since the Wiener filter is symmetric about the middle tap (tap 3) add + // mirrored source elements before multiplying by filter coefficients. 
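[Editorial note -- not part of the upstream patch] This is the vertical counterpart of the horizontal sketch earlier and produces the final 8-bit pixel; the shift uses the truncating vshrn because the rounding offset is already folded into round_vec. Scalar form, treating s0..s6 as single intermediate samples (vert_round_const stands in for the scalar value of vert_round_vec, and clip_pixel for the saturation done by vqmovun_s16):

    int32_t sum = vert_round_const;
    sum += y_filter[0] * (s0 + s6) + y_filter[1] * (s1 + s5);
    sum += y_filter[2] * (s2 + s4) + y_filter[3] * s3;
    uint8_t out = clip_pixel(sum >> (2 * FILTER_BITS - WIENER_ROUND0_BITS));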
+ int16x8_t s06 = vaddq_s16(s0, s6); + int16x8_t s15 = vaddq_s16(s1, s5); + int16x8_t s24 = vaddq_s16(s2, s4); + + int32x4_t sum_lo = vmlal_lane_s16(round_vec, vget_low_s16(s06), y_filter, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s15), y_filter, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s24), y_filter, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), y_filter, 3); + + int32x4_t sum_hi = vmlal_lane_s16(round_vec, vget_high_s16(s06), y_filter, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s15), y_filter, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s24), y_filter, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), y_filter, 3); + + int16x4_t res_lo = vshrn_n_s32(sum_lo, 2 * FILTER_BITS - WIENER_ROUND0_BITS); + int16x4_t res_hi = vshrn_n_s32(sum_hi, 2 * FILTER_BITS - WIENER_ROUND0_BITS); + + return vqmovun_s16(vcombine_s16(res_lo, res_hi)); +} + +static INLINE void convolve_add_src_vert_7tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t y_filter, + const int32x4_t round_vec) { + do { + const int16_t *s = (int16_t *)src; + uint8_t *d = dst; + int height = h; + + while (height > 3) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; + load_s16_8x10(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9); + + uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6, + y_filter, round_vec); + uint8x8_t d1 = wiener_convolve7_8_2d_v(s1, s2, s3, s4, s5, s6, s7, + y_filter, round_vec); + uint8x8_t d2 = wiener_convolve7_8_2d_v(s2, s3, s4, s5, s6, s7, s8, + y_filter, round_vec); + uint8x8_t d3 = wiener_convolve7_8_2d_v(s3, s4, s5, s6, s7, s8, s9, + y_filter, round_vec); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } + + while (height-- != 0) { + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + uint8x8_t d0 = wiener_convolve7_8_2d_v(s0, s1, s2, s3, s4, s5, s6, + y_filter, round_vec); + + vst1_u8(d, d0); + + d += dst_stride; + s += src_stride; + } + + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static AOM_INLINE int get_wiener_filter_taps(const int16_t *filter) { + assert(filter[7] == 0); + if (filter[0] == 0 && filter[6] == 0) { + return WIENER_WIN_REDUCED; + } + return WIENER_WIN; +} + +// Wiener filter 2D +// Apply horizontal filter and store in a temporary buffer. When applying +// vertical filter, overwrite the original pixel values. +void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *x_filter, int x_step_q4, + const int16_t *y_filter, int y_step_q4, + int w, int h, + const WienerConvolveParams *conv_params) { + (void)x_step_q4; + (void)y_step_q4; + (void)conv_params; + + assert(w % 8 == 0); + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(x_filter[7] == 0 && y_filter[7] == 0); + // For bd == 8, assert horizontal filtering output will not exceed 15-bit: + assert(8 + 1 + FILTER_BITS - conv_params->round_0 <= 15); + + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + WIENER_WIN - 1) * MAX_SB_SIZE]); + + const int x_filter_taps = get_wiener_filter_taps(x_filter); + const int y_filter_taps = get_wiener_filter_taps(y_filter); + int16x4_t x_filter_s16 = vld1_s16(x_filter); + int16x4_t y_filter_s16 = vld1_s16(y_filter); + // Add 128 to tap 3. (Needed for rounding.) 
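[Editorial note -- not part of the upstream patch] vcreate_s16(128ULL << 48) builds the vector { 0, 0, 0, 128 }: bits 48..63 of the 64-bit literal land in lane 3, so the vadd_s16 below adjusts only the centre tap of the four symmetric taps that were loaded. One editorial reading, beyond what the comment above says, is that the coded Wiener taps form a zero-DC correction filter, so folding in 1 << FILTER_BITS (128, with FILTER_BITS == 7) gives the kernel unit DC gain and realizes the "add source" part of the function's name within the convolution itself:

    x_filter[3] += 1 << FILTER_BITS;   // scalar equivalent of the vadd below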
+ x_filter_s16 = vadd_s16(x_filter_s16, vcreate_s16(128ULL << 48)); + y_filter_s16 = vadd_s16(y_filter_s16, vcreate_s16(128ULL << 48)); + + const int im_stride = MAX_SB_SIZE; + const int im_h = h + y_filter_taps - 1; + const int horiz_offset = x_filter_taps / 2; + const int vert_offset = (y_filter_taps / 2) * (int)src_stride; + + const int bd = 8; + const uint16x8_t im_max_val = + vdupq_n_u16((1 << (bd + 1 + FILTER_BITS - WIENER_ROUND0_BITS)) - 1); + const int32x4_t horiz_round_vec = vdupq_n_s32(1 << (bd + FILTER_BITS - 1)); + + const int32x4_t vert_round_vec = + vdupq_n_s32((1 << (2 * FILTER_BITS - WIENER_ROUND0_BITS - 1)) - + (1 << (bd + (2 * FILTER_BITS - WIENER_ROUND0_BITS) - 1))); + + if (x_filter_taps == WIENER_WIN_REDUCED) { + convolve_add_src_horiz_5tap_neon(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, im_h, + x_filter_s16, horiz_round_vec, im_max_val); + } else { + convolve_add_src_horiz_7tap_neon(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, im_h, + x_filter_s16, horiz_round_vec, im_max_val); + } + + if (y_filter_taps == WIENER_WIN_REDUCED) { + convolve_add_src_vert_5tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_s16, vert_round_vec); + } else { + convolve_add_src_vert_7tap_neon(im_block, im_stride, dst, dst_stride, w, h, + y_filter_s16, vert_round_vec); + } +} diff --git a/third_party/aom/av1/common/av1_common_int.h b/third_party/aom/av1/common/av1_common_int.h new file mode 100644 index 0000000000..4c0cb99d2b --- /dev/null +++ b/third_party/aom/av1/common/av1_common_int.h @@ -0,0 +1,1882 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_ +#define AOM_AV1_COMMON_AV1_COMMON_INT_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/internal/aom_codec_internal.h" +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_util/aom_thread.h" +#include "av1/common/alloccommon.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/enums.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/mv.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" +#include "av1/common/tile_common.h" +#include "av1/common/timing.h" +#include "aom_dsp/grain_params.h" +#include "aom_dsp/grain_table.h" +#include "aom_dsp/odintrin.h" +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) && defined(__has_warning) +#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT +#endif +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT +#endif + +#ifndef AOM_FALLTHROUGH_INTENDED +#define AOM_FALLTHROUGH_INTENDED \ + do { \ + } while (0) +#endif + +#define CDEF_MAX_STRENGTHS 16 + +/* Constant values while waiting for the sequence header */ +#define FRAME_ID_LENGTH 15 +#define DELTA_FRAME_ID_LENGTH 14 + +#define FRAME_CONTEXTS (FRAME_BUFFERS + 1) +// Extra frame context which is always kept at default values +#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) +#define PRIMARY_REF_BITS 3 +#define PRIMARY_REF_NONE 7 + +#define NUM_PING_PONG_BUFFERS 2 + +#define MAX_NUM_TEMPORAL_LAYERS 8 +#define MAX_NUM_SPATIAL_LAYERS 4 +/* clang-format off */ +// clang-format seems to think this is a pointer dereference and not a +// multiplication. +#define MAX_NUM_OPERATING_POINTS \ + (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS) +/* clang-format on */ + +// TODO(jingning): Turning this on to set up transform coefficient +// processing timer. +#define TXCOEFF_TIMER 0 +#define TXCOEFF_COST_TIMER 0 + +/*!\cond */ + +enum { + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} UENUM1BYTE(REFERENCE_MODE); + +enum { + /** + * Frame context updates are disabled + */ + REFRESH_FRAME_CONTEXT_DISABLED, + /** + * Update frame context to values resulting from backward probability + * updates based on entropy/counts in the decoded frame + */ + REFRESH_FRAME_CONTEXT_BACKWARD, +} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE); + +#define MFMV_STACK_SIZE 3 +typedef struct { + int_mv mfmv0; + uint8_t ref_frame_offset; +} TPL_MV_REF; + +typedef struct { + int_mv mv; + MV_REFERENCE_FRAME ref_frame; +} MV_REF; + +typedef struct RefCntBuffer { + // For a RefCntBuffer, the following are reference-holding variables: + // - cm->ref_frame_map[] + // - cm->cur_frame + // - cm->scaled_ref_buf[] (encoder only) + // - pbi->output_frame_index[] (decoder only) + // With that definition, 'ref_count' is the number of reference-holding + // variables that are currently referencing this buffer. + // For example: + // - suppose this buffer is at index 'k' in the buffer pool, and + // - Total 'n' of the variables / array elements above have value 'k' (that + // is, they are pointing to buffer at index 'k'). + // Then, pool->frame_bufs[k].ref_count = n. 
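[Editorial note -- not part of the upstream patch] A sketch of how the invariant described above is maintained; take_ref() is a hypothetical helper for illustration, not the library API:

    // *slot is one of the reference-holding variables listed above.
    static void take_ref(RefCntBuffer **slot, RefCntBuffer *buf) {
      if (*slot) --(*slot)->ref_count;   // release the old reference
      *slot = buf;
      if (buf) ++buf->ref_count;         // hold the new one
    }

A buffer whose ref_count reaches 0 is free to be reused by the BufferPool.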
+ int ref_count; + + unsigned int order_hint; + unsigned int ref_order_hints[INTER_REFS_PER_FRAME]; + + // These variables are used only in encoder and compare the absolute + // display order hint to compute the relative distance and overcome + // the limitation of get_relative_dist() which returns incorrect + // distance when a very old frame is used as a reference. + unsigned int display_order_hint; + unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME]; + // Frame's level within the hierarchical structure. + unsigned int pyramid_level; + MV_REF *mvs; + uint8_t *seg_map; + struct segmentation seg; + int mi_rows; + int mi_cols; + // Width and height give the size of the buffer (before any upscaling, unlike + // the sizes that can be derived from the buf structure) + int width; + int height; + WarpedMotionParams global_motion[REF_FRAMES]; + int showable_frame; // frame can be used as show existing frame in future + uint8_t film_grain_params_present; + aom_film_grain_t film_grain_params; + aom_codec_frame_buffer_t raw_frame_buffer; + YV12_BUFFER_CONFIG buf; + int temporal_id; // Temporal layer ID of the frame + int spatial_id; // Spatial layer ID of the frame + FRAME_TYPE frame_type; + + // This is only used in the encoder but needs to be indexed per ref frame + // so it's extremely convenient to keep it here. + int interp_filter_selected[SWITCHABLE]; + + // Inter frame reference frame delta for loop filter + int8_t ref_deltas[REF_FRAMES]; + + // 0 = ZERO_MV, MV + int8_t mode_deltas[MAX_MODE_LF_DELTAS]; + + FRAME_CONTEXT frame_context; +} RefCntBuffer; + +typedef struct BufferPool { +// Protect BufferPool from being accessed by several FrameWorkers at +// the same time during frame parallel decode. +// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. +// TODO(wtc): Remove this. See +// https://chromium-review.googlesource.com/c/webm/libvpx/+/560630. +#if CONFIG_MULTITHREAD + pthread_mutex_t pool_mutex; +#endif + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + + aom_get_frame_buffer_cb_fn_t get_fb_cb; + aom_release_frame_buffer_cb_fn_t release_fb_cb; + + RefCntBuffer *frame_bufs; + uint8_t num_frame_bufs; + + // Frame buffers allocated internally by the codec. + InternalFrameBufferList int_frame_buffers; +} BufferPool; + +/*!\endcond */ + +/*!\brief Parameters related to CDEF */ +typedef struct { + //! CDEF column line buffer + uint16_t *colbuf[MAX_MB_PLANE]; + //! CDEF top & bottom line buffer + uint16_t *linebuf[MAX_MB_PLANE]; + //! CDEF intermediate buffer + uint16_t *srcbuf; + //! CDEF column line buffer sizes + size_t allocated_colbuf_size[MAX_MB_PLANE]; + //! CDEF top and bottom line buffer sizes + size_t allocated_linebuf_size[MAX_MB_PLANE]; + //! CDEF intermediate buffer size + size_t allocated_srcbuf_size; + //! CDEF damping factor + int cdef_damping; + //! Number of CDEF strength values + int nb_cdef_strengths; + //! CDEF strength values for luma + int cdef_strengths[CDEF_MAX_STRENGTHS]; + //! CDEF strength values for chroma + int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; + //! Number of CDEF strength values in bits + int cdef_bits; + //! Number of rows in the frame in 4 pixel + int allocated_mi_rows; + //! 
Number of CDEF workers + int allocated_num_workers; +} CdefInfo; + +/*!\cond */ + +typedef struct { + int delta_q_present_flag; + // Resolution of delta quant + int delta_q_res; + int delta_lf_present_flag; + // Resolution of delta lf level + int delta_lf_res; + // This is a flag for number of deltas of loop filter level + // 0: use 1 delta, for y_vertical, y_horizontal, u, and v + // 1: use separate deltas for each filter level + int delta_lf_multi; +} DeltaQInfo; + +typedef struct { + int enable_order_hint; // 0 - disable order hint, and related tools + int order_hint_bits_minus_1; // dist_wtd_comp, ref_frame_mvs, + // frame_sign_bias + // if 0, enable_dist_wtd_comp and + // enable_ref_frame_mvs must be set as 0. + int enable_dist_wtd_comp; // 0 - disable dist-wtd compound modes + // 1 - enable it + int enable_ref_frame_mvs; // 0 - disable ref frame mvs + // 1 - enable it +} OrderHintInfo; + +// Sequence header structure. +// Note: All syntax elements of sequence_header_obu that need to be +// bit-identical across multiple sequence headers must be part of this struct, +// so that consistency is checked by are_seq_headers_consistent() function. +// One exception is the last member 'op_params' that is ignored by +// are_seq_headers_consistent() function. +typedef struct SequenceHeader { + int num_bits_width; + int num_bits_height; + int max_frame_width; + int max_frame_height; + // Whether current and reference frame IDs are signaled in the bitstream. + // Frame id numbers are additional information that do not affect the + // decoding process, but provide decoders with a way of detecting missing + // reference frames so that appropriate action can be taken. + uint8_t frame_id_numbers_present_flag; + int frame_id_length; + int delta_frame_id_length; + BLOCK_SIZE sb_size; // Size of the superblock used for this frame + int mib_size; // Size of the superblock in units of MI blocks + int mib_size_log2; // Log 2 of above. + + OrderHintInfo order_hint_info; + + uint8_t force_screen_content_tools; // 0 - force off + // 1 - force on + // 2 - adaptive + uint8_t still_picture; // Video is a single frame still picture + uint8_t reduced_still_picture_hdr; // Use reduced header for still picture + uint8_t force_integer_mv; // 0 - Don't force. MV can use subpel + // 1 - force to integer + // 2 - adaptive + uint8_t enable_filter_intra; // enables/disables filterintra + uint8_t enable_intra_edge_filter; // enables/disables edge upsampling + uint8_t enable_interintra_compound; // enables/disables interintra_compound + uint8_t enable_masked_compound; // enables/disables masked compound + uint8_t enable_dual_filter; // 0 - disable dual interpolation filter + // 1 - enable vert/horz filter selection + uint8_t enable_warped_motion; // 0 - disable warp for the sequence + // 1 - enable warp for the sequence + uint8_t enable_superres; // 0 - Disable superres for the sequence + // and no frame level superres flag + // 1 - Enable superres for the sequence + // enable per-frame superres flag + uint8_t enable_cdef; // To turn on/off CDEF + uint8_t enable_restoration; // To turn on/off loop restoration + BITSTREAM_PROFILE profile; + + // Color config. + aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1, + // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. + uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers. 
+ uint8_t monochrome; // Monochrome video + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + int color_range; + int subsampling_x; // Chroma subsampling for x + int subsampling_y; // Chroma subsampling for y + aom_chroma_sample_position_t chroma_sample_position; + uint8_t separate_uv_delta_q; + uint8_t film_grain_params_present; + + // Operating point info. + int operating_points_cnt_minus_1; + int operating_point_idc[MAX_NUM_OPERATING_POINTS]; + int timing_info_present; + aom_timing_info_t timing_info; + uint8_t decoder_model_info_present_flag; + aom_dec_model_info_t decoder_model_info; + uint8_t display_model_info_present_flag; + AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS]; + uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in spec. One bit: 0 or 1. + + // IMPORTANT: the op_params member must be at the end of the struct so that + // are_seq_headers_consistent() can be implemented with a memcmp() call. + // TODO(urvang): We probably don't need the +1 here. + aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; +} SequenceHeader; + +typedef struct { + int skip_mode_allowed; + int skip_mode_flag; + int ref_frame_idx_0; + int ref_frame_idx_1; +} SkipModeInfo; + +typedef struct { + FRAME_TYPE frame_type; + REFERENCE_MODE reference_mode; + + unsigned int order_hint; + unsigned int display_order_hint; + // Frame's level within the hierarchical structure. + unsigned int pyramid_level; + unsigned int frame_number; + SkipModeInfo skip_mode_info; + int refresh_frame_flags; // Which ref frames are overwritten by this frame + int frame_refs_short_signaling; +} CurrentFrame; + +/*!\endcond */ + +/*! + * \brief Frame level features. + */ +typedef struct { + /*! + * If true, CDF update in the symbol encoding/decoding process is disabled. + */ + bool disable_cdf_update; + /*! + * If true, motion vectors are specified to eighth pel precision; and + * if false, motion vectors are specified to quarter pel precision. + */ + bool allow_high_precision_mv; + /*! + * If true, force integer motion vectors; if false, use the default. + */ + bool cur_frame_force_integer_mv; + /*! + * If true, palette tool and/or intra block copy tools may be used. + */ + bool allow_screen_content_tools; + bool allow_intrabc; /*!< If true, intra block copy tool may be used. */ + bool allow_warped_motion; /*!< If true, frame may use warped motion mode. */ + /*! + * If true, using previous frames' motion vectors for prediction is allowed. + */ + bool allow_ref_frame_mvs; + /*! + * If true, frame is fully lossless at coded resolution. + * */ + bool coded_lossless; + /*! + * If true, frame is fully lossless at upscaled resolution. + */ + bool all_lossless; + /*! + * If true, the frame is restricted to a reduced subset of the full set of + * transform types. + */ + bool reduced_tx_set_used; + /*! + * If true, error resilient mode is enabled. + * Note: Error resilient mode allows the syntax of a frame to be parsed + * independently of previously decoded frames. + */ + bool error_resilient_mode; + /*! + * If false, only MOTION_MODE that may be used is SIMPLE_TRANSLATION; + * if true, all MOTION_MODES may be used. + */ + bool switchable_motion_mode; + TX_MODE tx_mode; /*!< Transform mode at frame level. */ + InterpFilter interp_filter; /*!< Interpolation filter at frame level. */ + /*! + * The reference frame that contains the CDF values and other state that + * should be loaded at the start of the frame. 
+ */ + int primary_ref_frame; + /*! + * Byte alignment of the planes in the reference buffers. + */ + int byte_alignment; + /*! + * Flag signaling how frame contexts should be updated at the end of + * a frame decode. + */ + REFRESH_FRAME_CONTEXT_MODE refresh_frame_context; +} FeatureFlags; + +/*! + * \brief Params related to tiles. + */ +typedef struct CommonTileParams { + int cols; /*!< number of tile columns that frame is divided into */ + int rows; /*!< number of tile rows that frame is divided into */ + int max_width_sb; /*!< maximum tile width in superblock units. */ + int max_height_sb; /*!< maximum tile height in superblock units. */ + + /*! + * Min width of non-rightmost tile in MI units. Only valid if cols > 1. + */ + int min_inner_width; + + /*! + * If true, tiles are uniformly spaced with power-of-two number of rows and + * columns. + * If false, tiles have explicitly configured widths and heights. + */ + int uniform_spacing; + + /** + * \name Members only valid when uniform_spacing == 1 + */ + /**@{*/ + int log2_cols; /*!< log2 of 'cols'. */ + int log2_rows; /*!< log2 of 'rows'. */ + int width; /*!< tile width in MI units */ + int height; /*!< tile height in MI units */ + /**@}*/ + + /*! + * Min num of tile columns possible based on 'max_width_sb' and frame width. + */ + int min_log2_cols; + /*! + * Min num of tile rows possible based on 'max_height_sb' and frame height. + */ + int min_log2_rows; + /*! + * Max num of tile columns possible based on frame width. + */ + int max_log2_cols; + /*! + * Max num of tile rows possible based on frame height. + */ + int max_log2_rows; + /*! + * log2 of min number of tiles (same as min_log2_cols + min_log2_rows). + */ + int min_log2; + /*! + * col_start_sb[i] is the start position of tile column i in superblock units. + * valid for 0 <= i <= cols + */ + int col_start_sb[MAX_TILE_COLS + 1]; + /*! + * row_start_sb[i] is the start position of tile row i in superblock units. + * valid for 0 <= i <= rows + */ + int row_start_sb[MAX_TILE_ROWS + 1]; + /*! + * If true, we are using large scale tile mode. + */ + unsigned int large_scale; + /*! + * Only relevant when large_scale == 1. + * If true, the independent decoding of a single tile or a section of a frame + * is allowed. + */ + unsigned int single_tile_decoding; +} CommonTileParams; + +typedef struct CommonModeInfoParams CommonModeInfoParams; +/*! + * \brief Params related to MB_MODE_INFO arrays and related info. + */ +struct CommonModeInfoParams { + /*! + * Number of rows in the frame in 16 pixel units. + * This is computed from frame height aligned to a multiple of 8. + */ + int mb_rows; + /*! + * Number of cols in the frame in 16 pixel units. + * This is computed from frame width aligned to a multiple of 8. + */ + int mb_cols; + + /*! + * Total MBs = mb_rows * mb_cols. + */ + int MBs; + + /*! + * Number of rows in the frame in 4 pixel (MB_MODE_INFO) units. + * This is computed from frame height aligned to a multiple of 8. + */ + int mi_rows; + /*! + * Number of cols in the frame in 4 pixel (MB_MODE_INFO) units. + * This is computed from frame width aligned to a multiple of 8. + */ + int mi_cols; + + /*! + * An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block + * in the frame. + * Note: This array should be treated like a scratch memory, and should NOT be + * accessed directly, in most cases. Please use 'mi_grid_base' array instead. + */ + MB_MODE_INFO *mi_alloc; + /*! + * Number of allocated elements in 'mi_alloc'. + */ + int mi_alloc_size; + /*! 
+ * Stride for 'mi_alloc' array. + */ + int mi_alloc_stride; + /*! + * The minimum block size that each element in 'mi_alloc' can correspond to. + * For decoder, this is always BLOCK_4X4. + * For encoder, this is BLOCK_8X8 for resolution >= 4k case or REALTIME mode + * case. Otherwise, this is BLOCK_4X4. + */ + BLOCK_SIZE mi_alloc_bsize; + + /*! + * Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'. + * It's possible that: + * - Multiple pointers in the grid point to the same element in 'mi_alloc' + * (for example, for all 4x4 blocks that belong to the same partition block). + * - Some pointers can be NULL (for example, for blocks outside visible area). + */ + MB_MODE_INFO **mi_grid_base; + /*! + * Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also). + */ + int mi_grid_size; + /*! + * Stride for 'mi_grid_base' (and 'tx_type_map' also). + */ + int mi_stride; + + /*! + * An array of tx types for each 4x4 block in the frame. + * Number of allocated elements is same as 'mi_grid_size', and stride is + * same as 'mi_grid_size'. So, indexing into 'tx_type_map' is same as that of + * 'mi_grid_base'. + */ + TX_TYPE *tx_type_map; + + /** + * \name Function pointers to allow separate logic for encoder and decoder. + */ + /**@{*/ + /*! + * Free the memory allocated to arrays in 'mi_params'. + * \param[in,out] mi_params object containing common mode info parameters + */ + void (*free_mi)(struct CommonModeInfoParams *mi_params); + /*! + * Initialize / reset appropriate arrays in 'mi_params'. + * \param[in,out] mi_params object containing common mode info parameters + */ + void (*setup_mi)(struct CommonModeInfoParams *mi_params); + /*! + * Allocate required memory for arrays in 'mi_params'. + * \param[in,out] mi_params object containing common mode info + * parameters + * \param width frame width + * \param height frame height + * \param min_partition_size minimum partition size allowed while + * encoding + */ + void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width, + int height, BLOCK_SIZE min_partition_size); + /**@}*/ +}; + +typedef struct CommonQuantParams CommonQuantParams; +/*! + * \brief Parameters related to quantization at the frame level. + */ +struct CommonQuantParams { + /*! + * Base qindex of the frame in the range 0 to 255. + */ + int base_qindex; + + /*! + * Delta of qindex (from base_qindex) for Y plane DC coefficient. + * Note: y_ac_delta_q is implicitly 0. + */ + int y_dc_delta_q; + + /*! + * Delta of qindex (from base_qindex) for U plane DC coefficients. + */ + int u_dc_delta_q; + /*! + * Delta of qindex (from base_qindex) for U plane AC coefficients. + */ + int v_dc_delta_q; + + /*! + * Delta of qindex (from base_qindex) for V plane DC coefficients. + * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. + */ + int u_ac_delta_q; + /*! + * Delta of qindex (from base_qindex) for V plane AC coefficients. + * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. + */ + int v_ac_delta_q; + + /* + * Note: The qindex per superblock may have a delta from the qindex obtained + * at frame level from parameters above, based on 'cm->delta_q_info'. + */ + + /** + * \name True dequantizers. + * The dequantizers below are true dequantizers used only in the + * dequantization process. They have the same coefficient + * shift/scale as TX. 
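+   * For each segment, index 0 holds the DC dequantizer and index 1 the AC
+   * dequantizer.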
+ */ + /**@{*/ + int16_t y_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for Y plane */ + int16_t u_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for U plane */ + int16_t v_dequant_QTX[MAX_SEGMENTS][2]; /*!< Dequant for V plane */ + /**@}*/ + + /** + * \name Global quantization matrix tables. + */ + /**@{*/ + /*! + * Global dequantization matrix table. + */ + const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; + /*! + * Global quantization matrix table. + */ + const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL]; + /**@}*/ + + /** + * \name Local dequantization matrix tables for each frame. + */ + /**@{*/ + /*! + * Local dequant matrix for Y plane. + */ + const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + /*! + * Local dequant matrix for U plane. + */ + const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + /*! + * Local dequant matrix for V plane. + */ + const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + /**@}*/ + + /*! + * Flag indicating whether quantization matrices are being used: + * - If true, qm_level_y, qm_level_u and qm_level_v indicate the level + * indices to be used to access appropriate global quant matrix tables. + * - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'. + */ + bool using_qmatrix; + /** + * \name Valid only when using_qmatrix == true + * Indicate the level indices to be used to access appropriate global quant + * matrix tables. + */ + /**@{*/ + int qmatrix_level_y; /*!< Level index for Y plane */ + int qmatrix_level_u; /*!< Level index for U plane */ + int qmatrix_level_v; /*!< Level index for V plane */ + /**@}*/ +}; + +typedef struct CommonContexts CommonContexts; +/*! + * \brief Contexts used for transmitting various symbols in the bitstream. + */ +struct CommonContexts { + /*! + * Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type. + * partition[i][j] is the context for ith tile row, jth mi_col. + */ + PARTITION_CONTEXT **partition; + + /*! + * Context used to derive context for multiple symbols: + * - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit + * to transmit skip_txfm flag. + * - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit + * sign. + * entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col. + */ + ENTROPY_CONTEXT **entropy[MAX_MB_PLANE]; + + /*! + * Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to + * transmit 'is_split' flag to indicate if this transform block should be + * split into smaller sub-blocks. + * txfm[i][j] is the context for ith tile row, jth mi_col. + */ + TXFM_CONTEXT **txfm; + + /*! + * Dimensions that were used to allocate the arrays above. + * If these dimensions change, the arrays may have to be re-allocated. + */ + int num_planes; /*!< Corresponds to av1_num_planes(cm) */ + int num_tile_rows; /*!< Corresponds to cm->tiles.row */ + int num_mi_cols; /*!< Corresponds to cm->mi_params.mi_cols */ +}; + +/*! + * \brief Top level common structure used by both encoder and decoder. + */ +typedef struct AV1Common { + /*! + * Information about the current frame that is being coded. + */ + CurrentFrame current_frame; + /*! + * Code and details about current error status. + */ + struct aom_internal_error_info *error; + + /*! + * AV1 allows two types of frame scaling operations: + * 1. Frame super-resolution: that allows coding a frame at lower resolution + * and after decoding the frame, normatively scales and restores the frame -- + * inside the coding loop. + * 2. 
Frame resize: that allows coding frame at lower/higher resolution, and + * then non-normatively upscale the frame at the time of rendering -- outside + * the coding loop. + * Hence, the need for 3 types of dimensions. + */ + + /** + * \name Coded frame dimensions. + */ + /**@{*/ + int width; /*!< Coded frame width */ + int height; /*!< Coded frame height */ + /**@}*/ + + /** + * \name Rendered frame dimensions. + * Dimensions after applying both super-resolution and resize to the coded + * frame. Different from coded dimensions if super-resolution and/or resize + * are being used for this frame. + */ + /**@{*/ + int render_width; /*!< Rendered frame width */ + int render_height; /*!< Rendered frame height */ + /**@}*/ + + /** + * \name Super-resolved frame dimensions. + * Frame dimensions after applying super-resolution to the coded frame (if + * present), but before applying resize. + * Larger than the coded dimensions if super-resolution is being used for + * this frame. + * Different from rendered dimensions if resize is being used for this frame. + */ + /**@{*/ + int superres_upscaled_width; /*!< Super-resolved frame width */ + int superres_upscaled_height; /*!< Super-resolved frame height */ + /**@}*/ + + /*! + * The denominator of the superres scale used by this frame. + * Note: The numerator is fixed to be SCALE_NUMERATOR. + */ + uint8_t superres_scale_denominator; + + /*! + * buffer_removal_times[op_num] specifies the frame removal time in units of + * DecCT clock ticks counted from the removal time of the last random access + * point for operating point op_num. + * TODO(urvang): We probably don't need the +1 here. + */ + uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1]; + /*! + * Presentation time of the frame in clock ticks DispCT counted from the + * removal time of the last random access point for the operating point that + * is being decoded. + */ + uint32_t frame_presentation_time; + + /*! + * Buffer where previous frame is stored. + */ + RefCntBuffer *prev_frame; + + /*! + * Buffer into which the current frame will be stored and other related info. + * TODO(hkuang): Combine this with cur_buf in macroblockd. + */ + RefCntBuffer *cur_frame; + + /*! + * For encoder, we have a two-level mapping from reference frame type to the + * corresponding buffer in the buffer pool: + * * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ... + * EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1) + * * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to + * the reference counted buffer structure RefCntBuffer, taken from the buffer + * pool cm->buffer_pool->frame_bufs. + * + * LAST_FRAME, ..., EXTREF_FRAME + * | | + * v v + * remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] + * | | + * v v + * ref_frame_map[], ..., ref_frame_map[] + * + * Note: INTRA_FRAME always refers to the current frame, so there's no need to + * have a remapped index for the same. + */ + int remapped_ref_idx[REF_FRAMES]; + + /*! + * Scale of the current frame with respect to itself. + * This is currently used for intra block copy, which behaves like an inter + * prediction mode, where the reference frame is the current frame itself. + */ + struct scale_factors sf_identity; + + /*! + * Scale factors of the reference frame with respect to the current frame. 
+ * This is required for generating inter prediction and will be non-identity + * for a reference frame, if it has different dimensions than the coded + * dimensions of the current frame. + */ + struct scale_factors ref_scale_factors[REF_FRAMES]; + + /*! + * For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to + * the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + * For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps + * remapped reference index 'j' (that is, original reference type 'i') to + * a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + */ + RefCntBuffer *ref_frame_map[REF_FRAMES]; + + /*! + * If true, this frame is actually shown after decoding. + * If false, this frame is coded in the bitstream, but not shown. It is only + * used as a reference for other frames coded later. + */ + int show_frame; + + /*! + * If true, this frame can be used as a show-existing frame for other frames + * coded later. + * When 'show_frame' is true, this is always true for all non-keyframes. + * When 'show_frame' is false, this value is transmitted in the bitstream. + */ + int showable_frame; + + /*! + * If true, show an existing frame coded before, instead of actually coding a + * frame. The existing frame comes from one of the existing reference buffers, + * as signaled in the bitstream. + */ + int show_existing_frame; + + /*! + * Whether some features are allowed or not. + */ + FeatureFlags features; + + /*! + * Params related to MB_MODE_INFO arrays and related info. + */ + CommonModeInfoParams mi_params; + +#if CONFIG_ENTROPY_STATS + /*! + * Context type used by token CDFs, in the range 0 .. (TOKEN_CDF_Q_CTXS - 1). + */ + int coef_cdf_category; +#endif // CONFIG_ENTROPY_STATS + + /*! + * Quantization params. + */ + CommonQuantParams quant_params; + + /*! + * Segmentation info for current frame. + */ + struct segmentation seg; + + /*! + * Segmentation map for previous frame. + */ + uint8_t *last_frame_seg_map; + + /** + * \name Deblocking filter parameters. + */ + /**@{*/ + loop_filter_info_n lf_info; /*!< Loop filter info */ + struct loopfilter lf; /*!< Loop filter parameters */ + /**@}*/ + + /** + * \name Loop Restoration filter parameters. + */ + /**@{*/ + RestorationInfo rst_info[MAX_MB_PLANE]; /*!< Loop Restoration filter info */ + int32_t *rst_tmpbuf; /*!< Scratch buffer for self-guided restoration */ + RestorationLineBuffers *rlbs; /*!< Line buffers needed by loop restoration */ + YV12_BUFFER_CONFIG rst_frame; /*!< Stores the output of loop restoration */ + /**@}*/ + + /*! + * CDEF (Constrained Directional Enhancement Filter) parameters. + */ + CdefInfo cdef_info; + + /*! + * Parameters for film grain synthesis. + */ + aom_film_grain_t film_grain_params; + + /*! + * Parameters for delta quantization and delta loop filter level. + */ + DeltaQInfo delta_q_info; + + /*! + * Global motion parameters for each reference frame. + */ + WarpedMotionParams global_motion[REF_FRAMES]; + + /*! + * Elements part of the sequence header, that are applicable for all the + * frames in the video. + */ + SequenceHeader *seq_params; + + /*! + * Current CDFs of all the symbols for the current frame. + */ + FRAME_CONTEXT *fc; + /*! + * Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE + * (e.g. for a keyframe). These default CDFs are defined by the bitstream and + * copied from default CDF tables for each symbol. + */ + FRAME_CONTEXT *default_frame_context; + + /*! + * Parameters related to tiling. 
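+   * See 'CommonTileParams' above for the full set of tile-related fields.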
+ */ + CommonTileParams tiles; + + /*! + * External BufferPool passed from outside. + */ + BufferPool *buffer_pool; + + /*! + * Above context buffers and their sizes. + * Note: above contexts are allocated in this struct, as their size is + * dependent on frame width, while left contexts are declared and allocated in + * MACROBLOCKD struct, as they have a fixed size. + */ + CommonContexts above_contexts; + + /** + * \name Signaled when cm->seq_params->frame_id_numbers_present_flag == 1 + */ + /**@{*/ + int current_frame_id; /*!< frame ID for the current frame. */ + int ref_frame_id[REF_FRAMES]; /*!< frame IDs for the reference frames. */ + /**@}*/ + + /*! + * Motion vectors provided by motion field estimation. + * tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where: + * mi_row = 2 * row, + * mi_col = 2 * col, and + * stride = cm->mi_params.mi_stride / 2 + */ + TPL_MV_REF *tpl_mvs; + /*! + * Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function. + */ + int tpl_mvs_mem_size; + /*! + * ref_frame_sign_bias[k] is 1 if relative distance between reference 'k' and + * current frame is positive; and 0 otherwise. + */ + int ref_frame_sign_bias[REF_FRAMES]; + /*! + * ref_frame_side[k] is 1 if relative distance between reference 'k' and + * current frame is positive, -1 if relative distance is 0; and 0 otherwise. + * TODO(jingning): This can be combined with sign_bias later. + */ + int8_t ref_frame_side[REF_FRAMES]; + + /*! + * Temporal layer ID of this frame + * (in the range 0 ... (number_temporal_layers - 1)). + */ + int temporal_layer_id; + + /*! + * Spatial layer ID of this frame + * (in the range 0 ... (number_spatial_layers - 1)). + */ + int spatial_layer_id; + +#if TXCOEFF_TIMER + int64_t cum_txcoeff_timer; + int64_t txcoeff_timer; + int txb_count; +#endif // TXCOEFF_TIMER + +#if TXCOEFF_COST_TIMER + int64_t cum_txcoeff_cost_timer; + int64_t txcoeff_cost_timer; + int64_t txcoeff_cost_count; +#endif // TXCOEFF_COST_TIMER +} AV1_COMMON; + +/*!\cond */ + +// TODO(hkuang): Don't need to lock the whole pool after implementing atomic +// frame reference count. +static void lock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +static void unlock_buffer_pool(BufferPool *const pool) { +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&pool->pool_mutex); +#else + (void)pool; +#endif +} + +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) return NULL; + if (cm->ref_frame_map[index] == NULL) return NULL; + return &cm->ref_frame_map[index]->buf; +} + +static INLINE int get_free_fb(AV1_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + lock_buffer_pool(cm->buffer_pool); + const int num_frame_bufs = cm->buffer_pool->num_frame_bufs; + for (i = 0; i < num_frame_bufs; ++i) + if (frame_bufs[i].ref_count == 0) break; + + if (i != num_frame_bufs) { + if (frame_bufs[i].buf.use_external_reference_buffers) { + // If this frame buffer's y_buffer, u_buffer, and v_buffer point to the + // external reference buffers. Restore the buffer pointers to point to the + // internally allocated memory. 
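+      // 'store_buf_adr' holds the saved addresses of the internally allocated
+      // planes.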
+ YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf; + ybf->y_buffer = ybf->store_buf_adr[0]; + ybf->u_buffer = ybf->store_buf_adr[1]; + ybf->v_buffer = ybf->store_buf_adr[2]; + ybf->use_external_reference_buffers = 0; + } + + frame_bufs[i].ref_count = 1; + } else { + // We should never run out of free buffers. If this assertion fails, there + // is a reference leak. + assert(0 && "Ran out of free frame buffers. Likely a reference leak."); + // Reset i to be INVALID_IDX to indicate no free buffer found. + i = INVALID_IDX; + } + + unlock_buffer_pool(cm->buffer_pool); + return i; +} + +static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) { + // Release the previously-used frame-buffer + if (cm->cur_frame != NULL) { + --cm->cur_frame->ref_count; + cm->cur_frame = NULL; + } + + // Assign a new framebuffer + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) return NULL; + + cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx]; +#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + aom_invalidate_pyramid(cm->cur_frame->buf.y_pyramid); + av1_invalidate_corner_list(cm->cur_frame->buf.corners); +#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY + av1_zero(cm->cur_frame->interp_filter_selected); + return cm->cur_frame; +} + +// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref +// counts accordingly. +static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr, + RefCntBuffer *rhs_ptr) { + RefCntBuffer *const old_ptr = *lhs_ptr; + if (old_ptr != NULL) { + assert(old_ptr->ref_count > 0); + // One less reference to the buffer at 'old_ptr', so decrease ref count. + --old_ptr->ref_count; + } + + *lhs_ptr = rhs_ptr; + // One more reference to the buffer at 'rhs_ptr', so increase ref count. + ++rhs_ptr->ref_count; +} + +static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { + return cm->current_frame.frame_type == KEY_FRAME || + cm->current_frame.frame_type == INTRA_ONLY_FRAME; +} + +static INLINE int frame_is_sframe(const AV1_COMMON *cm) { + return cm->current_frame.frame_type == S_FRAME; +} + +// These functions take a reference frame label between LAST_FRAME and +// EXTREF_FRAME inclusive. Note that this is different to the indexing +// previously used by the frame_refs[] array. +static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME ref_frame) { + return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME) + ? cm->remapped_ref_idx[ref_frame - LAST_FRAME] + : INVALID_IDX; +} + +static INLINE RefCntBuffer *get_ref_frame_buf( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Both const and non-const versions of this function are provided so that it +// can be used with a const AV1_COMMON if needed. +static INLINE const struct scale_factors *get_ref_scale_factors_const( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE struct scale_factors *get_ref_scale_factors( + AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? 
&cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE RefCntBuffer *get_primary_ref_frame_buf( + const AV1_COMMON *const cm) { + const int primary_ref_frame = cm->features.primary_ref_frame; + if (primary_ref_frame == PRIMARY_REF_NONE) return NULL; + const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Returns 1 if this frame might allow mvs from some reference frame. +static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && + cm->seq_params->order_hint_info.enable_ref_frame_mvs && + cm->seq_params->order_hint_info.enable_order_hint && + !frame_is_intra_only(cm); +} + +// Returns 1 if this frame might use warped_motion +static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && + cm->seq_params->enable_warped_motion; +} + +static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { + const int buf_rows = buf->mi_rows; + const int buf_cols = buf->mi_cols; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (buf->mvs == NULL || buf_rows != mi_params->mi_rows || + buf_cols != mi_params->mi_cols) { + aom_free(buf->mvs); + buf->mi_rows = mi_params->mi_rows; + buf->mi_cols = mi_params->mi_cols; + CHECK_MEM_ERROR(cm, buf->mvs, + (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) * + ((mi_params->mi_cols + 1) >> 1), + sizeof(*buf->mvs))); + aom_free(buf->seg_map); + CHECK_MEM_ERROR( + cm, buf->seg_map, + (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols, + sizeof(*buf->seg_map))); + } + + const int mem_size = + ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1); + + if (cm->tpl_mvs == NULL || cm->tpl_mvs_mem_size < mem_size) { + aom_free(cm->tpl_mvs); + CHECK_MEM_ERROR(cm, cm->tpl_mvs, + (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); + cm->tpl_mvs_mem_size = mem_size; + } +} + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); + +static INLINE int av1_num_planes(const AV1_COMMON *cm) { + return cm->seq_params->monochrome ? 
1 : MAX_MB_PLANE; +} + +static INLINE void av1_init_above_context(CommonContexts *above_contexts, + int num_planes, int tile_row, + MACROBLOCKD *xd) { + for (int i = 0; i < num_planes; ++i) { + xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row]; + } + xd->above_partition_context = above_contexts->partition[tile_row]; + xd->above_txfm_context = above_contexts->txfm[tile_row]; +} + +static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) { + const int num_planes = av1_num_planes(cm); + const CommonQuantParams *const quant_params = &cm->quant_params; + + for (int i = 0; i < num_planes; ++i) { + if (xd->plane[i].plane_type == PLANE_TYPE_Y) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX, + sizeof(quant_params->y_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix, + sizeof(quant_params->y_iqmatrix)); + + } else { + if (i == AOM_PLANE_U) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX, + sizeof(quant_params->u_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix, + sizeof(quant_params->u_iqmatrix)); + } else { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX, + sizeof(quant_params->v_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix, + sizeof(quant_params->v_iqmatrix)); + } + } + } + xd->mi_stride = cm->mi_params.mi_stride; + xd->error_info = cm->error; + cfl_init(&xd->cfl, cm->seq_params); +} + +static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, + const int num_planes) { + int i; + int row_offset = mi_row; + int col_offset = mi_col; + for (i = 0; i < num_planes; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + // Offset the buffer pointer + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + row_offset = mi_row - 1; + if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + col_offset = mi_col - 1; + int above_idx = col_offset; + int left_idx = row_offset & MAX_MIB_MASK; + pd->above_entropy_context = + &xd->above_entropy_context[i][above_idx >> pd->subsampling_x]; + pd->left_entropy_context = + &xd->left_entropy_context[i][left_idx >> pd->subsampling_y]; + } +} + +static INLINE int calc_mi_size(int len) { + // len is in mi units. Align to a multiple of SBs. + return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); +} + +static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, + const int num_planes) { + int i; + for (i = 0; i < num_planes; i++) { + xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; + xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; + + xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); + xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); + } +} + +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, int mi_col, int bw, + int mi_rows, int mi_cols) { + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE); + xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE)); + xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE); + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + + // Are edges available for intra prediction? 
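+  // Availability is determined against the current tile's boundaries rather
+  // than the frame boundaries; context outside the current tile is not used.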
+ xd->up_available = (mi_row > tile->mi_row_start); + + const int ss_x = xd->plane[1].subsampling_x; + const int ss_y = xd->plane[1].subsampling_y; + + xd->left_available = (mi_col > tile->mi_col_start); + xd->chroma_up_available = xd->up_available; + xd->chroma_left_available = xd->left_available; + if (ss_x && bw < mi_size_wide[BLOCK_8X8]) + xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start; + if (ss_y && bh < mi_size_high[BLOCK_8X8]) + xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start; + if (xd->up_available) { + xd->above_mbmi = xd->mi[-xd->mi_stride]; + } else { + xd->above_mbmi = NULL; + } + + if (xd->left_available) { + xd->left_mbmi = xd->mi[-1]; + } else { + xd->left_mbmi = NULL; + } + + const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) && + ((mi_col & 0x01) || !(bw & 0x01) || !ss_x); + xd->is_chroma_ref = chroma_ref; + if (chroma_ref) { + // To help calculate the "above" and "left" chroma blocks, note that the + // current block may cover multiple luma blocks (e.g., if partitioned into + // 4x4 luma blocks). + // First, find the top-left-most luma block covered by this chroma block + MB_MODE_INFO **base_mi = + &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)]; + + // Then, we consider the luma region covered by the left or above 4x4 chroma + // prediction. We want to point to the chroma reference block in that + // region, which is the bottom-right-most mi unit. + // This leads to the following offsets: + MB_MODE_INFO *chroma_above_mi = + xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL; + xd->chroma_above_mbmi = chroma_above_mi; + + MB_MODE_INFO *chroma_left_mi = + xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL; + xd->chroma_left_mbmi = chroma_left_mi; + } + + xd->height = bh; + xd->width = bw; + + xd->is_last_vertical_rect = 0; + if (xd->width < xd->height) { + if (!((mi_col + xd->width) & (xd->height - 1))) { + xd->is_last_vertical_rect = 1; + } + } + + xd->is_first_horizontal_rect = 0; + if (xd->width > xd->height) + if (!(mi_row & (xd->width - 1))) xd->is_first_horizontal_rect = 1; +} + +static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi) { + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; + return tile_ctx->kf_y_cdf[above_ctx][left_ctx]; +} + +static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + BLOCK_SIZE bsize) { + PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col; + PARTITION_CONTEXT *const left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + memset(above_ctx, partition_context_lookup[subsize].above, bw); + memset(left_ctx, partition_context_lookup[subsize].left, bh); +} + +static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize, + int subsampling_x, int subsampling_y) { + assert(bsize < BLOCK_SIZES_ALL); + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) && + ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x); + return ref_pos; +} + +static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf, + size_t element) { + assert(cdf != NULL); + return 
(element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element]; +} + +static INLINE void partition_gather_horz_alike(aom_cdf_prob *out, + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; + out[0] = CDF_PROB_TOP; + out[0] -= cdf_element_prob(in, PARTITION_HORZ); + out[0] -= cdf_element_prob(in, PARTITION_SPLIT); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_B); + out[0] -= cdf_element_prob(in, PARTITION_VERT_A); + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4); + out[0] = AOM_ICDF(out[0]); + out[1] = AOM_ICDF(CDF_PROB_TOP); +} + +static INLINE void partition_gather_vert_alike(aom_cdf_prob *out, + const aom_cdf_prob *const in, + BLOCK_SIZE bsize) { + (void)bsize; + out[0] = CDF_PROB_TOP; + out[0] -= cdf_element_prob(in, PARTITION_VERT); + out[0] -= cdf_element_prob(in, PARTITION_SPLIT); + out[0] -= cdf_element_prob(in, PARTITION_HORZ_A); + out[0] -= cdf_element_prob(in, PARTITION_VERT_A); + out[0] -= cdf_element_prob(in, PARTITION_VERT_B); + if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4); + out[0] = AOM_ICDF(out[0]); + out[1] = AOM_ICDF(CDF_PROB_TOP); +} + +static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE subsize, + BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + if (bsize >= BLOCK_8X8) { + const int hbs = mi_size_wide[bsize] / 2; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + switch (partition) { + case PARTITION_SPLIT: + if (bsize != BLOCK_8X8) break; + AOM_FALLTHROUGH_INTENDED; + case PARTITION_NONE: + case PARTITION_HORZ: + case PARTITION_VERT: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + update_partition_context(xd, mi_row, mi_col, subsize, bsize); + break; + case PARTITION_HORZ_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize); + break; + case PARTITION_HORZ_B: + update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize); + break; + case PARTITION_VERT_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize); + break; + case PARTITION_VERT_B: + update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); + break; + default: assert(0 && "Invalid partition type"); + } + } +} + +static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col; + const PARTITION_CONTEXT *left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + // Minimum partition point is 8x8. Offset the bsl accordingly. + const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; + int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; + + assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); + assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +// Return the number of elements in the partition CDF when +// partitioning the (square) block with luma block size of bsize. 
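+// Blocks of size 8x8 and below use only the basic PARTITION_TYPES symbols;
+// BLOCK_128X128 excludes PARTITION_HORZ_4 and PARTITION_VERT_4 from the
+// extended set, hence EXT_PARTITION_TYPES - 2.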
+static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { + if (bsize <= BLOCK_8X8) + return PARTITION_TYPES; + else if (bsize == BLOCK_128X128) + return EXT_PARTITION_TYPES - 2; + else + return EXT_PARTITION_TYPES; +} + +static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + assert(bsize < BLOCK_SIZES_ALL); + int max_blocks_wide = block_size_wide[bsize]; + + if (xd->mb_to_right_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); + } + + // Scale the width in the transform block unit. + return max_blocks_wide >> MI_SIZE_LOG2; +} + +static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + int max_blocks_high = block_size_high[bsize]; + + if (xd->mb_to_bottom_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); + } + + // Scale the height in the transform block unit. + return max_blocks_high >> MI_SIZE_LOG2; +} + +static INLINE void av1_zero_above_context(AV1_COMMON *const cm, + const MACROBLOCKD *xd, + int mi_col_start, int mi_col_end, + const int tile_row) { + const SequenceHeader *const seq_params = cm->seq_params; + const int num_planes = av1_num_planes(cm); + const int width = mi_col_end - mi_col_start; + const int aligned_width = + ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2); + const int offset_y = mi_col_start; + const int width_y = aligned_width; + const int offset_uv = offset_y >> seq_params->subsampling_x; + const int width_uv = width_y >> seq_params->subsampling_x; + CommonContexts *const above_contexts = &cm->above_contexts; + + av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y); + if (num_planes > 1) { + if (above_contexts->entropy[1][tile_row] && + above_contexts->entropy[2][tile_row]) { + av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv, + width_uv); + av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv, + width_uv); + } else { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid value of planes"); + } + } + + av1_zero_array(above_contexts->partition[tile_row] + mi_col_start, + aligned_width); + + memset(above_contexts->txfm[tile_row] + mi_col_start, + tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT)); +} + +static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) { + av1_zero(xd->left_entropy_context); + av1_zero(xd->left_partition_context); + + memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST], + sizeof(xd->left_txfm_context_buffer)); +} + +static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) { + int i; + for (i = 0; i < len; ++i) txfm_ctx[i] = txs; +} + +static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip, + const MACROBLOCKD *xd) { + uint8_t bw = tx_size_wide[tx_size]; + uint8_t bh = tx_size_high[tx_size]; + + if (skip) { + bw = n4_w * MI_SIZE; + bh = n4_h * MI_SIZE; + } + + set_txfm_ctx(xd->above_txfm_context, bw, n4_w); + set_txfm_ctx(xd->left_txfm_context, bh, n4_h); +} + +static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + return mi_row * mi_params->mi_stride + mi_col; +} + +static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int mi_alloc_row = mi_row / 
mi_alloc_size_1d; + const int mi_alloc_col = mi_col / mi_alloc_size_1d; + + return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col; +} + +// For this partition block, set pointers in mi_params->mi_grid_base and xd->mi. +static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params, + MACROBLOCKD *const xd, int mi_row, + int mi_col) { + // 'mi_grid_base' should point to appropriate memory in 'mi'. + const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col); + mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx]; + // 'xd->mi' should point to an offset in 'mi_grid_base'; + xd->mi = mi_params->mi_grid_base + mi_grid_idx; + // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'. + xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx; + xd->tx_type_map_stride = mi_params->mi_stride; +} + +static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx, + TXFM_CONTEXT *left_ctx, + TX_SIZE tx_size, TX_SIZE txb_size) { + BLOCK_SIZE bsize = txsize_to_bsize[txb_size]; + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; + uint8_t txw = tx_size_wide[tx_size]; + uint8_t txh = tx_size_high[tx_size]; + int i; + for (i = 0; i < bh; ++i) left_ctx[i] = txh; + for (i = 0; i < bw; ++i) above_ctx[i] = txw; +} + +static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) { + switch (tx_dim) { + case 128: + case 64: return TX_64X64; break; + case 32: return TX_32X32; break; + case 16: return TX_16X16; break; + case 8: return TX_8X8; break; + default: return TX_4X4; + } +} + +static INLINE TX_SIZE get_tx_size(int width, int height) { + if (width == height) { + return get_sqr_tx_size(width); + } + if (width < height) { + if (width + width == height) { + switch (width) { + case 4: return TX_4X8; break; + case 8: return TX_8X16; break; + case 16: return TX_16X32; break; + case 32: return TX_32X64; break; + } + } else { + switch (width) { + case 4: return TX_4X16; break; + case 8: return TX_8X32; break; + case 16: return TX_16X64; break; + } + } + } else { + if (height + height == width) { + switch (height) { + case 4: return TX_8X4; break; + case 8: return TX_16X8; break; + case 16: return TX_32X16; break; + case 32: return TX_64X32; break; + } + } else { + switch (height) { + case 4: return TX_16X4; break; + case 8: return TX_32X8; break; + case 16: return TX_64X16; break; + } + } + } + assert(0); + return TX_4X4; +} + +static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx, + const TXFM_CONTEXT *const left_ctx, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + const uint8_t txw = tx_size_wide[tx_size]; + const uint8_t txh = tx_size_high[tx_size]; + const int above = *above_ctx < txw; + const int left = *left_ctx < txh; + int category = TXFM_PARTITION_CONTEXTS; + + // dummy return, not used by others. + if (tx_size <= TX_4X4) return 0; + + TX_SIZE max_tx_size = + get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize])); + + if (max_tx_size >= TX_8X8) { + category = + (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) + + (TX_SIZES - 1 - max_tx_size) * 2; + } + assert(category != TXFM_PARTITION_CONTEXTS); + return category * 3 + above + left; +} + +// Compute the next partition in the direction of the sb_type stored in the mi +// array, starting with bsize. 
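+// Returns PARTITION_INVALID if the given mi_row/mi_col position lies outside
+// the frame.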
+static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) + return PARTITION_INVALID; + + const int offset = mi_row * mi_params->mi_stride + mi_col; + MB_MODE_INFO **mi = mi_params->mi_grid_base + offset; + const BLOCK_SIZE subsize = mi[0]->bsize; + + assert(bsize < BLOCK_SIZES_ALL); + + if (subsize == bsize) return PARTITION_NONE; + + const int bhigh = mi_size_high[bsize]; + const int bwide = mi_size_wide[bsize]; + const int sshigh = mi_size_high[subsize]; + const int sswide = mi_size_wide[subsize]; + + if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < mi_params->mi_rows && + mi_col + bhigh / 2 < mi_params->mi_cols) { + // In this case, the block might be using an extended partition + // type. + const MB_MODE_INFO *const mbmi_right = mi[bwide / 2]; + const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride]; + + if (sswide == bwide) { + // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or + // PARTITION_HORZ_B. To distinguish the latter two, check if the lower + // half was split. + if (sshigh * 4 == bhigh) return PARTITION_HORZ_4; + assert(sshigh * 2 == bhigh); + + if (mbmi_below->bsize == subsize) + return PARTITION_HORZ; + else + return PARTITION_HORZ_B; + } else if (sshigh == bhigh) { + // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or + // PARTITION_VERT_B. To distinguish the latter two, check if the right + // half was split. + if (sswide * 4 == bwide) return PARTITION_VERT_4; + assert(sswide * 2 == bhigh); + + if (mbmi_right->bsize == subsize) + return PARTITION_VERT; + else + return PARTITION_VERT_B; + } else { + // Smaller width and smaller height. Might be PARTITION_SPLIT or could be + // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both + // dimensions, we immediately know this is a split (which will recurse to + // get to subsize). Otherwise look down and to the right. With + // PARTITION_VERT_A, the right block will have height bhigh; with + // PARTITION_HORZ_A, the lower block with have width bwide. Otherwise + // it's PARTITION_SPLIT. + if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT; + + if (mi_size_wide[mbmi_below->bsize] == bwide) return PARTITION_HORZ_A; + if (mi_size_high[mbmi_right->bsize] == bhigh) return PARTITION_VERT_A; + + return PARTITION_SPLIT; + } + } + const int vert_split = sswide < bwide; + const int horz_split = sshigh < bhigh; + const int split_idx = (vert_split << 1) | horz_split; + assert(split_idx != 0); + + static const PARTITION_TYPE base_partitions[4] = { + PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT + }; + + return base_partitions[split_idx]; +} + +static INLINE void set_sb_size(SequenceHeader *const seq_params, + BLOCK_SIZE sb_size) { + seq_params->sb_size = sb_size; + seq_params->mib_size = mi_size_wide[seq_params->sb_size]; + seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size]; +} + +// Returns true if the frame is fully lossless at the coded resolution. +// Note: If super-resolution is used, such a frame will still NOT be lossless at +// the upscaled resolution. 
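+// When segmentation is enabled, every segment must be lossless for the frame
+// to be considered coded lossless.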
+static INLINE int is_coded_lossless(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + int coded_lossless = 1; + if (cm->seg.enabled) { + for (int i = 0; i < MAX_SEGMENTS; ++i) { + if (!xd->lossless[i]) { + coded_lossless = 0; + break; + } + } + } else { + coded_lossless = xd->lossless[0]; + } + return coded_lossless; +} + +static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) { + return seq_level_idx == SEQ_LEVEL_MAX || + (seq_level_idx < SEQ_LEVELS && + // The following levels are currently undefined. + seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 && + seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 && + seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 +#if !CONFIG_CWG_C013 + && seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 && + seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3 && + seq_level_idx != SEQ_LEVEL_8_0 && seq_level_idx != SEQ_LEVEL_8_1 && + seq_level_idx != SEQ_LEVEL_8_2 && seq_level_idx != SEQ_LEVEL_8_3 +#endif + ); +} + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_AV1_COMMON_INT_H_ diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c new file mode 100644 index 0000000000..8d69efcd2d --- /dev/null +++ b/third_party/aom/av1/common/av1_inv_txfm1d.c @@ -0,0 +1,1841 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_txfm.h" + +void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 4; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[4]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[2]; + bf1[2] = input[1]; + bf1[3] = input[3]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); +} + +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 8; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[4]; + bf1[2] = input[2]; + bf1[3] = input[6]; + bf1[4] = input[1]; + bf1[5] = input[5]; + bf1[6] = input[3]; + bf1[7] = input[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = 
clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); +} + +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 16; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[8]; + bf1[2] = input[4]; + bf1[3] = input[12]; + bf1[4] = input[2]; + bf1[5] = input[10]; + bf1[6] = input[6]; + bf1[7] = input[14]; + bf1[8] = input[1]; + bf1[9] = input[9]; + bf1[10] = input[5]; + bf1[11] = input[13]; + bf1[12] = input[3]; + bf1[13] = input[11]; + bf1[14] = input[7]; + bf1[15] = input[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], 
stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = 
clamp_value(bf0[0] - bf0[15], stage_range[stage]); +} + +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 32; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[16]; + bf1[2] = input[8]; + bf1[3] = input[24]; + bf1[4] = input[4]; + bf1[5] = input[20]; + bf1[6] = input[12]; + bf1[7] = input[28]; + bf1[8] = input[2]; + bf1[9] = input[18]; + bf1[10] = input[10]; + bf1[11] = input[26]; + bf1[12] = input[6]; + bf1[13] = input[22]; + bf1[14] = input[14]; + bf1[15] = input[30]; + bf1[16] = input[1]; + bf1[17] = input[17]; + bf1[18] = input[9]; + bf1[19] = input[25]; + bf1[20] = input[5]; + bf1[21] = input[21]; + bf1[22] = input[13]; + bf1[23] = input[29]; + bf1[24] = input[3]; + bf1[25] = input[19]; + bf1[26] = input[11]; + bf1[27] = input[27]; + bf1[28] = input[7]; + bf1[29] = input[23]; + bf1[30] = input[15]; + bf1[31] = input[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + 
bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); + bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = 
bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] 
= clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + 
bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); + bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); + bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); +} + +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t s0, s1, s2, s3, s4, s5, s6, s7; + + int32_t x0 = input[0]; + int32_t x1 = input[1]; + int32_t x2 = input[2]; + int32_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + assert(sinpi[1] + sinpi[2] == sinpi[4]); + + // stage 1 + s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit); + s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit); + s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit); + s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit); + s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit); + s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit); + s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit); + + // stage 2 + // NOTICE: (x0 - x2) here may use one extra bit compared to the + // opt_range_row/col specified in av1_gen_inv_stage_range() + s7 = range_check_value((x0 - x2) + x3, stage_range[2]); + + // stage 3 + s0 = range_check_value(s0 + s3, stage_range[3] + bit); + s1 = range_check_value(s1 - s4, stage_range[3] + bit); + s3 = range_check_value(s2, stage_range[3] + bit); + s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit); + + // stage 4 + s0 = range_check_value(s0 + s5, stage_range[4] + bit); + s1 = range_check_value(s1 - s6, stage_range[4] + bit); + + // stage 5 + x0 = range_check_value(s0 + s3, stage_range[5] + bit); + x1 = range_check_value(s1 + s3, stage_range[5] + bit); + x2 = range_check_value(s2, stage_range[5] + 
bit); + x3 = range_check_value(s0 + s1, stage_range[5] + bit); + + // stage 6 + x3 = range_check_value(x3 - s3, stage_range[6] + bit); + + output[0] = round_shift(x0, bit); + output[1] = round_shift(x1, bit); + output[2] = round_shift(x2, bit); + output[3] = round_shift(x3, bit); +} + +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 8; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[7]; + bf1[1] = input[0]; + bf1[2] = input[5]; + bf1[3] = input[2]; + bf1[4] = input[3]; + bf1[5] = input[4]; + bf1[6] = input[1]; + bf1[7] = input[6]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]); + bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); + bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); + bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + 
bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = -bf0[4]; + bf1[2] = bf0[6]; + bf1[3] = -bf0[2]; + bf1[4] = bf0[3]; + bf1[5] = -bf0[7]; + bf1[6] = bf0[5]; + bf1[7] = -bf0[1]; +} + +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 16; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[15]; + bf1[1] = input[0]; + bf1[2] = input[13]; + bf1[3] = input[2]; + bf1[4] = input[11]; + bf1[5] = input[4]; + bf1[6] = input[9]; + bf1[7] = input[6]; + bf1[8] = input[7]; + bf1[9] = input[8]; + bf1[10] = input[5]; + bf1[11] = input[10]; + bf1[12] = input[3]; + bf1[13] = input[12]; + bf1[14] = input[1]; + bf1[15] = input[14]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]); + bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + 
bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]); + bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]); + bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); + bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); + bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]); + bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]); + bf1[13] = 
clamp_value(bf0[13] + bf0[15], stage_range[stage]); + bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = -bf0[8]; + bf1[2] = bf0[12]; + bf1[3] = -bf0[4]; + bf1[4] = bf0[6]; + bf1[5] = -bf0[14]; + bf1[6] = bf0[10]; + bf1[7] = -bf0[2]; + bf1[8] = bf0[3]; + bf1[9] = -bf0[11]; + bf1[10] = bf0[15]; + bf1[11] = -bf0[7]; + bf1[12] = bf0[5]; + bf1[13] = -bf0[13]; + bf1[14] = bf0[9]; + bf1[15] = -bf0[1]; +} + +void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 4; ++i) { + output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits); + } + assert(stage_range[0] + NewSqrt2Bits <= 32); +} + +void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2); +} + +void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 16; ++i) + output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); +} + +void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4); +} + +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 64; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[32]; + bf1[2] = input[16]; + bf1[3] = input[48]; + bf1[4] = input[8]; + bf1[5] = input[40]; + bf1[6] = input[24]; + bf1[7] = input[56]; + bf1[8] = input[4]; + bf1[9] = input[36]; + bf1[10] = input[20]; + bf1[11] = input[52]; + bf1[12] = input[12]; + bf1[13] = input[44]; + bf1[14] = input[28]; + bf1[15] = input[60]; + bf1[16] = input[2]; + bf1[17] = input[34]; + bf1[18] = input[18]; + bf1[19] = input[50]; + bf1[20] = input[10]; + bf1[21] = input[42]; + bf1[22] = input[26]; + bf1[23] = input[58]; + bf1[24] = input[6]; + bf1[25] = input[38]; + bf1[26] = input[22]; + bf1[27] = input[54]; + bf1[28] = input[14]; + bf1[29] = input[46]; + bf1[30] = 
input[30]; + bf1[31] = input[62]; + bf1[32] = input[1]; + bf1[33] = input[33]; + bf1[34] = input[17]; + bf1[35] = input[49]; + bf1[36] = input[9]; + bf1[37] = input[41]; + bf1[38] = input[25]; + bf1[39] = input[57]; + bf1[40] = input[5]; + bf1[41] = input[37]; + bf1[42] = input[21]; + bf1[43] = input[53]; + bf1[44] = input[13]; + bf1[45] = input[45]; + bf1[46] = input[29]; + bf1[47] = input[61]; + bf1[48] = input[3]; + bf1[49] = input[35]; + bf1[50] = input[19]; + bf1[51] = input[51]; + bf1[52] = input[11]; + bf1[53] = input[43]; + bf1[54] = input[27]; + bf1[55] = input[59]; + bf1[56] = input[7]; + bf1[57] = input[39]; + bf1[58] = input[23]; + bf1[59] = input[55]; + bf1[60] = input[15]; + bf1[61] = input[47]; + bf1[62] = input[31]; + bf1[63] = input[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit); + bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit); + bf1[60] = 
half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit); + bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]); + bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]); + bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]); + bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]); + bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]); + bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]); + bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]); + bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]); + bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]); + bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]); + bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]); + bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]); + bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]); + bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]); + bf1[58] = clamp_value(-bf0[58] 
+ bf0[59], stage_range[stage]); + bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]); + bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]); + bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); + bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = 
half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit); + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]); + bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]); + bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]); + bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]); + bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]); + bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]); + bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]); + bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]); + bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]); + bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]); + bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]); + bf1[58] = 
clamp_value(bf0[57] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]); + bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]); + bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit); + bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + 
bf1[57] = bf0[57]; + bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit); + bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit); + bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]); + bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]); + bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]); + bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]); + bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]); + bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]); + bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]); + bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]); + 
bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]); + bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]); + bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]); + bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]); + bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit); + 
bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit); + bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit); + bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]); + bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]); + bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]); + bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]); + bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]); + bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]); + bf1[49] = 
clamp_value(-bf0[49] + bf0[62], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]); + bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]); + bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]); + bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); + bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); + bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); 
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]); + bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]); + bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]); + bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]); + bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]); + bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]); + bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]); + bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]); + bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]); + bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]); + bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]); + bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[26] - bf0[37], 
stage_range[stage]); + bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]); + bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]); + bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]); +} diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h new file mode 100644 index 0000000000..e1d5d98d10 --- /dev/null +++ b/third_party/aom/av1/common/av1_inv_txfm1d.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE int32_t clamp_value(int32_t value, int8_t bit) { + if (bit <= 0) return value; // Do nothing for invalid clamp bit. 
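+  // Saturate to the signed range that fits in |bit| bits, i.e.
+  // [-(1 << (bit - 1)), (1 << (bit - 1)) - 1]; for bit == 16 this is
+  // [-32768, 32767].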
+ const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + return (int32_t)clamp64(value, min_value, max_value); +} + +static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { + for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit); +} + +void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h new file mode 100644 index 0000000000..b4f7801295 --- /dev/null +++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#include "av1/common/av1_inv_txfm1d.h" + +// sum of fwd_shift_## +static const int8_t inv_start_range[TX_SIZES_ALL] = { + 5, // 4x4 transform + 6, // 8x8 transform + 7, // 16x16 transform + 7, // 32x32 transform + 7, // 64x64 transform + 5, // 4x8 transform + 5, // 8x4 transform + 6, // 8x16 transform + 6, // 16x8 transform + 6, // 16x32 transform + 6, // 32x16 transform + 6, // 32x64 transform + 6, // 64x32 transform + 6, // 4x16 transform + 6, // 16x4 transform + 7, // 8x32 transform + 7, // 32x8 transform + 7, // 16x64 transform + 7, // 64x16 transform +}; + +extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL]; + +// Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12 +// for each valid row and col combination +#define INV_COS_BIT 12 + +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ diff --git a/third_party/aom/av1/common/av1_inv_txfm2d.c b/third_party/aom/av1/common/av1_inv_txfm2d.c new file mode 100644 index 0000000000..ee67dffe23 --- /dev/null +++ b/third_party/aom/av1/common/av1_inv_txfm2d.c @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" + +void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. 
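+     The input is pre-scaled down by UNIT_QUANT_SHIFT, transformed along
+     columns and then along rows, and the result is added to the
+     destination with clipping to the valid range for the given bit depth.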
*/ + int i; + tran_low_t output[16]; + tran_low_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0] >> UNIT_QUANT_SHIFT; + c1 = ip[4 * 1] >> UNIT_QUANT_SHIFT; + d1 = ip[4 * 2] >> UNIT_QUANT_SHIFT; + b1 = ip[4 * 3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + op[4 * 0] = a1; + op[4 * 1] = b1; + op[4 * 2] = c1; + op[4 * 3] = d1; + ip++; + op++; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[0]; + c1 = ip[1]; + d1 = ip[2]; + b1 = ip[3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + range_check_value(a1, bd + 1); + range_check_value(b1, bd + 1); + range_check_value(c1, bd + 1); + range_check_value(d1, bd + 1); + + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); + + ip += 4; + dest++; + } +} + +void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_low_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void)bd; + + a1 = ip[0 * 4] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = a1; + op[1] = op[2] = op[3] = e1; + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = + highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = + highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = + highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = + highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + +static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT4: return av1_idct4; + case TXFM_TYPE_DCT8: return av1_idct8; + case TXFM_TYPE_DCT16: return av1_idct16; + case TXFM_TYPE_DCT32: return av1_idct32; + case TXFM_TYPE_DCT64: return av1_idct64; + case TXFM_TYPE_ADST4: return av1_iadst4; + case TXFM_TYPE_ADST8: return av1_iadst8; + case TXFM_TYPE_ADST16: return av1_iadst16; + case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c; + case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c; + case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c; + case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c; + default: assert(0); return NULL; + } +} + +static const int8_t inv_shift_4x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x16[2] = { -2, -4 }; +static const int8_t inv_shift_32x32[2] = { -2, -4 }; +static const int8_t inv_shift_64x64[2] = { -2, -4 }; +static const int8_t inv_shift_4x8[2] = { 0, -4 }; +static const int8_t inv_shift_8x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x32[2] = { -1, -4 }; +static const int8_t inv_shift_32x16[2] = { -1, -4 }; +static const int8_t inv_shift_32x64[2] = { -1, -4 }; +static const int8_t inv_shift_64x32[2] = { -1, -4 }; +static const int8_t inv_shift_4x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x4[2] = { -1, -4 }; +static const int8_t inv_shift_8x32[2] = { -2, 
-4 }; +static const int8_t inv_shift_32x8[2] = { -2, -4 }; +static const int8_t inv_shift_16x64[2] = { -2, -4 }; +static const int8_t inv_shift_64x16[2] = { -2, -4 }; + +const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = { + inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32, + inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, + inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64, + inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32, + inv_shift_32x8, inv_shift_16x64, inv_shift_64x16, +}; + +static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; + +void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + cfg->shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + cfg->cos_bit_col = INV_COS_BIT; + cfg->cos_bit_row = INV_COS_BIT; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range)); + } + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range)); + } + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; +} + +void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, + int bd) { + const int fwd_shift = inv_start_range[tx_size]; + const int8_t *shift = cfg->shift; + int8_t opt_range_row, opt_range_col; + if (bd == 8) { + opt_range_row = 16; + opt_range_col = 16; + } else if (bd == 10) { + opt_range_row = 18; + opt_range_col = 16; + } else { + assert(bd == 12); + opt_range_row = 20; + opt_range_col = 18; + } + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1; + (void)real_range_row; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 + // so opt_range_row >= real_range_row will not hold + stage_range_row[i] = opt_range_row; + } else { + assert(opt_range_row >= real_range_row); + stage_range_row[i] = opt_range_row; + } + } + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_col = + cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; + (void)real_range_col; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_col at stage 1 + // so opt_range_col >= real_range_col will not hold + stage_range_col[i] = opt_range_col; + } else { + assert(opt_range_col >= real_range_col); + stage_range_col[i] = opt_range_col; + } + } +} + +static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, + int stride, TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf, TX_SIZE tx_size, + int bd) { + // Note when assigning txfm_size_col, we use the txfm_size from the + // row configuration and vice versa. 
This is intentionally done to + // accurately perform rectangular transforms. When the transform is + // rectangular, the number of columns will be the same as the + // txfm_size stored in the row cfg struct. It will make no difference + // for square transforms. + const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row); + + // txfm_buf's length is txfm_size_row * txfm_size_col + 2 * + // AOMMAX(txfm_size_row, txfm_size_col) + // it is used for intermediate data buffering + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_in = txfm_buf; + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + int c, r; + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + if (abs(rect_type) == 1) { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = round_shift( + (int64_t)input[c * txfm_size_row + r] * NewInvSqrt2, NewSqrt2Bits); + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); + } else { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = input[c * txfm_size_row + r]; + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); + } + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + buf_ptr += txfm_size_col; + } + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16)); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output, + int stride, int32_t *txfm_buf, + TX_TYPE tx_type, TX_SIZE tx_size, + int bd) { + TXFM_2D_FLIP_CFG cfg; + av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg); + // Forward shift sum uses larger square size, to be consistent with what + // av1_gen_inv_stage_range() does for inverse shifts. 
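+  // The facade itself only derives the 2D configuration (1D kernel types,
+  // flip flags, shifts and stage ranges) from (tx_type, tx_size); the row
+  // and column passes run in inv_txfm2d_add_c() on the caller-provided
+  // txfm_buf scratch buffer.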
+ inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd); +} + +void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd); +} + +void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd); +} + +void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd); +} + +void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd); +} + +void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd); +} + +void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd); +} + +void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd); +} + +void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd); +} + +void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd); +} + +void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd); +} + +void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // TODO(urvang): Can the same array be reused, instead of using a new array? + // Remap 32x32 input into a modified 64x64 by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. 
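+  // Only the low-frequency 32x32 quadrant of a 64x64 transform can carry
+  // nonzero coefficients, so each of the first 32 lines of mod_input keeps
+  // its 32 input values followed by 32 zeros, and the second half of
+  // mod_input is cleared in one memset.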
+ int32_t mod_input[64 * 64]; + for (int col = 0; col < 32; ++col) { + memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); + memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64, + bd); +} + +void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x32 input into a modified 64x32 by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[32 * 64]; + memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input)); + memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32, + bd); +} + +void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x32 input into a modified 32x64 input by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[64 * 32]; + for (int col = 0; col < 32; ++col) { + memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); + memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64, + bd); +} + +void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 16x32 input into a modified 16x64 input by: + // - Copying over these values in top-left 16x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[64 * 16]; + for (int col = 0; col < 16; ++col) { + memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); + memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64, + bd); +} + +void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x16 input into a modified 64x16 by: + // - Copying over these values in top-left 32x16 locations. + // - Setting the rest of the locations to 0. 
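+  // As in the 64x32 case above, the retained 32x16 coefficients occupy the
+  // leading half of mod_input contiguously, so a single memcpy plus one
+  // memset is enough; no per-line interleaving of values and zero padding
+  // is needed here.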
+ int32_t mod_input[16 * 64]; + memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input)); + memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16, + bd); +} + +void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd); +} + +void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd); +} + +void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd); +} + +void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd); +} diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c new file mode 100644 index 0000000000..5af025c654 --- /dev/null +++ b/third_party/aom/av1/common/av1_loopfilter.c @@ -0,0 +1,2099 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/reconinter.h" +#include "av1/common/seg_common.h" + +enum { + USE_SINGLE, + USE_DUAL, + USE_QUAD, +} UENUM1BYTE(USE_FILTER_TYPE); + +static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = { + { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H }, + { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U }, + { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V } +}; + +static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 }, + { 2, 2 }, + { 3, 3 } }; + +static const int mode_lf_lut[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1, // INTER_MODES (GLOBALMV == 0) + 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0) +}; + +static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { + int lvl; + + // For each possible value for the loop filter fill out limits + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { + // Set loop filter parameters that control sharpness.
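+    // block_inside_limit shrinks as sharpness_lvl grows: the level is
+    // shifted right once when sharpness_lvl is in 1..4 and twice when it is
+    // larger, then capped at (9 - sharpness_lvl) and floored at 1. The edge
+    // limit mblim is derived from it as 2 * (lvl + 2) + block_inside_limit.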
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); + + if (sharpness_lvl > 0) { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) block_inside_limit = 1; + + memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); + memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +uint8_t av1_get_filter_level(const AV1_COMMON *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi) { + const int segment_id = mbmi->segment_id; + if (cm->delta_q_info.delta_lf_present_flag) { + int8_t delta_lf; + if (cm->delta_q_info.delta_lf_multi) { + const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx]; + delta_lf = mbmi->delta_lf[delta_lf_idx]; + } else { + delta_lf = mbmi->delta_lf_from_base; + } + int base_level; + if (plane == 0) + base_level = cm->lf.filter_level[dir_idx]; + else if (plane == 1) + base_level = cm->lf.filter_level_u; + else + base_level = cm->lf.filter_level_v; + int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER); + assert(plane >= 0 && plane <= 2); + const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx]; + if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) { + const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); + } + + if (cm->lf.mode_ref_delta_enabled) { + const int scale = 1 << (lvl_seg >> 5); + lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale; + if (mbmi->ref_frame[0] > INTRA_FRAME) + lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale; + lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER); + } + return lvl_seg; + } else { + return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]] + [mode_lf_lut[mbmi->mode]]; + } +} + +void av1_loop_filter_init(AV1_COMMON *cm) { + assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut)); + loop_filter_info_n *lfi = &cm->lf_info; + struct loopfilter *lf = &cm->lf; + int lvl; + + // init limits for given sharpness + update_sharpness(lfi, lf->sharpness_level); + + // init hev threshold const vectors + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) + memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); +} + +// Update the loop filter for the current frame. +// This should be called before loop_filter_rows(), +// av1_loop_filter_frame() calls this function directly. 
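+// It refreshes the sharpness-dependent limits and fills
+// lf_info.lvl[plane][segment][dir][ref][mode] with per-segment filter
+// levels, folding in segment feature data and, when mode_ref_delta is
+// enabled, the scaled reference/mode deltas, each clamped to
+// [0, MAX_LOOP_FILTER].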
+void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, + int plane_end) { + int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE]; + int plane; + int seg_id; + // n_shift is the multiplier for lf_deltas + // the multiplier is 1 for when filter_lvl is between 0 and 31; + // 2 when filter_lvl is between 32 and 63 + loop_filter_info_n *const lfi = &cm->lf_info; + struct loopfilter *const lf = &cm->lf; + const struct segmentation *const seg = &cm->seg; + + // update sharpness limits + update_sharpness(lfi, lf->sharpness_level); + + filt_lvl[0] = cm->lf.filter_level[0]; + filt_lvl[1] = cm->lf.filter_level_u; + filt_lvl[2] = cm->lf.filter_level_v; + + filt_lvl_r[0] = cm->lf.filter_level[1]; + filt_lvl_r[1] = cm->lf.filter_level_u; + filt_lvl_r[2] = cm->lf.filter_level_v; + + assert(plane_start >= AOM_PLANE_Y); + assert(plane_end <= MAX_MB_PLANE); + + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) + break; + else if (plane == 1 && !filt_lvl[1]) + continue; + else if (plane == 2 && !filt_lvl[2]) + continue; + + for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { + for (int dir = 0; dir < 2; ++dir) { + int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane]; + const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; + if (segfeature_active(seg, seg_id, seg_lf_feature_id)) { + const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); + } + + if (!lf->mode_ref_delta_enabled) { + // we could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas + memset(lfi->lvl[plane][seg_id][dir], lvl_seg, + sizeof(lfi->lvl[plane][seg_id][dir])); + } else { + int ref, mode; + const int scale = 1 << (lvl_seg >> 5); + const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; + lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] = + clamp(intra_lvl, 0, MAX_LOOP_FILTER); + + for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) { + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + + lf->mode_deltas[mode] * scale; + lfi->lvl[plane][seg_id][dir][ref][mode] = + clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } + } + } + } + } + } +} + +static AOM_FORCE_INLINE TX_SIZE +get_transform_size(const MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi, + const int mi_row, const int mi_col, const int plane, + const int ss_x, const int ss_y) { + assert(mbmi != NULL); + if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4; + + TX_SIZE tx_size = (plane == AOM_PLANE_Y) + ? 
mbmi->tx_size + : av1_get_max_uv_txsize(mbmi->bsize, ss_x, ss_y); + assert(tx_size < TX_SIZES_ALL); + if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip_txfm) { + const BLOCK_SIZE sb_type = mbmi->bsize; + const int blk_row = mi_row & (mi_size_high[sb_type] - 1); + const int blk_col = mi_col & (mi_size_wide[sb_type] - 1); + const TX_SIZE mb_tx_size = + mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)]; + assert(mb_tx_size < TX_SIZES_ALL); + tx_size = mb_tx_size; + } + + return tx_size; +} + +static const int tx_dim_to_filter_length[TX_SIZES] = { 4, 8, 14, 14, 14 }; + +// Return TX_SIZE from get_transform_size(), so it is plane and direction +// aware +static TX_SIZE set_lpf_parameters( + AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y, + const int plane, const struct macroblockd_plane *const plane_ptr) { + // reset to initial values + params->filter_length = 0; + + // no deblocking is required + const uint32_t width = plane_ptr->dst.width; + const uint32_t height = plane_ptr->dst.height; + if ((width <= x) || (height <= y)) { + // just return the smallest transform unit size + return TX_4X4; + } + + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + // for sub8x8 block, chroma prediction mode is obtained from the bottom/right + // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row + // and mi_col should map to the bottom/right mi structure, i.e, both mi_row + // and mi_col should be odd number for chroma plane. + const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2); + const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2); + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; + const MB_MODE_INFO *mbmi = mi[0]; + // If current mbmi is not correctly setup, return an invalid value to stop + // filtering. One example is that if this tile is not coded, then its mbmi + // it not set up. + if (mbmi == NULL) return TX_INVALID; + + const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane, + scale_horz, scale_vert); + + { + const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y); + const uint32_t transform_masks = + edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; + const int32_t tu_edge = (coord & transform_masks) ? (0) : (1); + + if (!tu_edge) return ts; + + // prepare outer edge parameters. deblock the edge if it's an edge of a TU + { + const uint32_t curr_level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + const int curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); + uint32_t level = curr_level; + if (coord) { + { + const MB_MODE_INFO *const mi_prev = *(mi - mode_step); + if (mi_prev == NULL) return TX_INVALID; + const int pv_row = + (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert)); + const int pv_col = + (VERT_EDGE == edge_dir) ? 
(mi_col - (1 << scale_horz)) : (mi_col); + const TX_SIZE pv_ts = get_transform_size( + xd, mi_prev, pv_row, pv_col, plane, scale_horz, scale_vert); + + const uint32_t pv_lvl = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + + const int pv_skip_txfm = + mi_prev->skip_txfm && is_inter_block(mi_prev); + const BLOCK_SIZE bsize = get_plane_block_size( + mbmi->bsize, plane_ptr->subsampling_x, plane_ptr->subsampling_y); + assert(bsize < BLOCK_SIZES_ALL); + const int prediction_masks = edge_dir == VERT_EDGE + ? block_size_wide[bsize] - 1 + : block_size_high[bsize] - 1; + const int32_t pu_edge = !(coord & prediction_masks); + // if the current and the previous blocks are skipped, + // deblock the edge if the edge belongs to a PU's edge only. + if ((curr_level || pv_lvl) && + (!pv_skip_txfm || !curr_skipped || pu_edge)) { + const int dim = (VERT_EDGE == edge_dir) + ? AOMMIN(tx_size_wide_unit_log2[ts], + tx_size_wide_unit_log2[pv_ts]) + : AOMMIN(tx_size_high_unit_log2[ts], + tx_size_high_unit_log2[pv_ts]); + if (plane) { + params->filter_length = (dim == 0) ? 4 : 6; + } else { + assert(dim < TX_SIZES); + assert(dim >= 0); + params->filter_length = tx_dim_to_filter_length[dim]; + } + + // update the level if the current block is skipped, + // but the previous one is not + level = (curr_level) ? (curr_level) : (pv_lvl); + } + } + } + // prepare common parameters + if (params->filter_length) { + const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; + params->lfthr = limits; + } + } + } + + return ts; +} + +static const uint32_t vert_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = { + // TX_4X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X8 + { + 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, + }, + // TX_16X16 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_32X32 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_64X64 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_4X8 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X4 + { + 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, + }, + // TX_8X16 + { + 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, + }, + // TX_16X8 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_16X32 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_32X16 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_32X64 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_64X32 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_4X16 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_16X4 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_8X32 + { + 4, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, + }, + // TX_32X8 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_16X64 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, + // TX_64X16 + { + 4, 8, 14, 14, 14, 4, 8, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, 14, + }, +}; + +static const uint32_t horz_filter_length_luma[TX_SIZES_ALL][TX_SIZES_ALL] = { + // TX_4X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X8 + { + 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 
8, 8, + }, + // TX_16X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_32X32 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_64X64 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_4X8 + { + 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, + }, + // TX_8X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_16X8 + { + 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, + }, + // TX_16X32 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_32X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_32X64 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_64X32 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_4X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_16X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X32 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_32X8 + { + 4, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, + }, + // TX_16X64 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, + // TX_64X16 + { + 4, 8, 14, 14, 14, 8, 4, 14, 8, 14, 14, 14, 14, 14, 4, 14, 8, 14, 14, + }, +}; + +static const uint32_t vert_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = { + // TX_4X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X8 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_16X16 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_32X32 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_64X64 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_4X8 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X4 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_8X16 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_16X8 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_16X32 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_32X16 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_32X64 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_64X32 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_4X16 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_16X4 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_8X32 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_32X8 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_16X64 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, + // TX_64X16 + { + 4, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, + }, +}; + +static const uint32_t horz_filter_length_chroma[TX_SIZES_ALL][TX_SIZES_ALL] = { + // TX_4X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X8 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_32X32 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 
6, 6, 4, 6, 6, 6, 6, + }, + // TX_64X64 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_4X8 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_8X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X8 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X32 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_32X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_32X64 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_64X32 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_4X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X4 + { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + }, + // TX_8X32 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_32X8 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_16X64 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, + // TX_64X16 + { + 4, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, + }, +}; + +static AOM_FORCE_INLINE void set_one_param_for_line_luma( + AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, + const struct macroblockd_plane *const plane_ptr, int coord, + bool is_first_block, TX_SIZE prev_tx_size, const ptrdiff_t mode_step, + int *min_dim) { + (void)plane_ptr; + assert(mi_col << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.width && + mi_row << MI_SIZE_LOG2 < (uint32_t)plane_ptr->dst.height); + const int is_vert = edge_dir == VERT_EDGE; + // reset to initial values + params->filter_length = 0; + + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; + const MB_MODE_INFO *mbmi = mi[0]; + assert(mbmi); + + const TX_SIZE ts = + get_transform_size(xd, mi[0], mi_row, mi_col, AOM_PLANE_Y, 0, 0); + +#ifndef NDEBUG + const uint32_t transform_masks = + is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; + const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1); + assert(tu_edge); +#endif // NDEBUG + // If we are not the first block, then coord is always true, so + // !is_first_block is technically redundant. But we are keeping it here so the + // compiler can compile away this conditional if we pass in is_first_block := + // false + bool curr_skipped = false; + if (!is_first_block || coord) { + const MB_MODE_INFO *const mi_prev = *(mi - mode_step); + const int pv_row = is_vert ? mi_row : (mi_row - 1); + const int pv_col = is_vert ? (mi_col - 1) : mi_col; + const TX_SIZE pv_ts = + is_first_block + ? get_transform_size(xd, mi_prev, pv_row, pv_col, AOM_PLANE_Y, 0, 0) + : prev_tx_size; + if (is_first_block) { + *min_dim = is_vert ? block_size_high[mi_prev->bsize] + : block_size_wide[mi_prev->bsize]; + } + assert(mi_prev); + uint8_t level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, mbmi); + if (!level) { + level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_Y, + mi_prev); + } + + const int32_t pu_edge = mi_prev != mbmi; + + // The quad loop filter assumes that all the transform blocks within a + // 8x16/16x8/16x16 prediction block are of the same size. 
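+    // In other words, on an interior (non-prediction-unit) edge of such a
+    // block the neighboring transform size pv_ts must match ts, which the
+    // assert below checks.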
+ assert(IMPLIES( + !pu_edge && (mbmi->bsize >= BLOCK_8X16 && mbmi->bsize <= BLOCK_16X16), + pv_ts == ts)); + + if (!pu_edge) { + curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); + } + if ((pu_edge || !curr_skipped) && level) { + params->filter_length = is_vert ? vert_filter_length_luma[ts][pv_ts] + : horz_filter_length_luma[ts][pv_ts]; + + // prepare common parameters + const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; + params->lfthr = limits; + } + } + const int block_dim = + is_vert ? block_size_high[mbmi->bsize] : block_size_wide[mbmi->bsize]; + *min_dim = AOMMIN(*min_dim, block_dim); + + *tx_size = ts; +} + +// Similar to set_lpf_parameters, but does so one row/col at a time to reduce +// calls to \ref get_transform_size and \ref av1_get_filter_level +static AOM_FORCE_INLINE void set_lpf_parameters_for_line_luma( + AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, + const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range, + const ptrdiff_t mode_step, int *min_dim) { + const int is_vert = edge_dir == VERT_EDGE; + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + uint32_t *counter_ptr = is_vert ? &mi_col : &mi_row; + TX_SIZE prev_tx_size = TX_INVALID; + + // Unroll the first iteration of the loop + set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col, mi_row, + plane_ptr, *counter_ptr, true, prev_tx_size, + mode_step, min_dim); + + // Advance + int advance_units = + is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; + prev_tx_size = *tx_size; + *counter_ptr += advance_units; + params += advance_units; + tx_size += advance_units; + + while (*counter_ptr < mi_range) { + set_one_param_for_line_luma(params, tx_size, cm, xd, edge_dir, mi_col, + mi_row, plane_ptr, *counter_ptr, false, + prev_tx_size, mode_step, min_dim); + + // Advance + advance_units = + is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; + prev_tx_size = *tx_size; + *counter_ptr += advance_units; + params += advance_units; + tx_size += advance_units; + } +} + +static AOM_FORCE_INLINE void set_one_param_for_line_chroma( + AV1_DEBLOCKING_PARAMETERS *const params, TX_SIZE *tx_size, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, int coord, + bool is_first_block, TX_SIZE prev_tx_size, + const struct macroblockd_plane *const plane_ptr, const ptrdiff_t mode_step, + const int scale_horz, const int scale_vert, int *min_dim, int plane, + int joint_filter_chroma) { + const int is_vert = edge_dir == VERT_EDGE; + (void)plane_ptr; + assert((mi_col << MI_SIZE_LOG2) < + (uint32_t)(plane_ptr->dst.width << scale_horz) && + (mi_row << MI_SIZE_LOG2) < + (uint32_t)(plane_ptr->dst.height << scale_vert)); + // reset to initial values + params->filter_length = 0; + + // for sub8x8 block, chroma prediction mode is obtained from the + // bottom/right mi structure of the co-located 8x8 luma block. so for chroma + // plane, mi_row and mi_col should map to the bottom/right mi structure, + // i.e, both mi_row and mi_col should be odd number for chroma plane. 
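+  // OR-ing in scale_vert / scale_horz sets the low bit, mapping an even
+  // mi_row / mi_col to the bottom/right (odd) mi unit when the plane is
+  // subsampled in that direction, and leaving the coordinate unchanged
+  // when it is not.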
+ mi_row |= scale_vert; + mi_col |= scale_horz; + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; + const MB_MODE_INFO *mbmi = mi[0]; + assert(mbmi); + + const TX_SIZE ts = get_transform_size(xd, mi[0], mi_row, mi_col, plane, + scale_horz, scale_vert); + *tx_size = ts; + +#ifndef NDEBUG + const uint32_t transform_masks = + is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; + const int32_t tu_edge = ((coord * MI_SIZE) & transform_masks) ? (0) : (1); + assert(tu_edge); +#endif // NDEBUG + + // If we are not the first block, then coord is always true, so + // !is_first_block is technically redundant. But we are keeping it here so the + // compiler can compile away this conditional if we pass in is_first_block := + // false + bool curr_skipped = false; + if (!is_first_block || coord) { + const MB_MODE_INFO *const mi_prev = *(mi - mode_step); + assert(mi_prev); + const int pv_row = is_vert ? (mi_row) : (mi_row - (1 << scale_vert)); + const int pv_col = is_vert ? (mi_col - (1 << scale_horz)) : (mi_col); + const TX_SIZE pv_ts = + is_first_block ? get_transform_size(xd, mi_prev, pv_row, pv_col, plane, + scale_horz, scale_vert) + : prev_tx_size; + if (is_first_block) { + *min_dim = is_vert ? tx_size_high[pv_ts] : tx_size_wide[pv_ts]; + } + + uint8_t level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + if (!level) { + level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + } +#ifndef NDEBUG + if (joint_filter_chroma) { + uint8_t v_level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, mbmi); + if (!v_level) { + v_level = av1_get_filter_level(cm, &cm->lf_info, edge_dir, AOM_PLANE_V, + mi_prev); + } + assert(level == v_level); + } +#else + (void)joint_filter_chroma; +#endif // NDEBUG + const int32_t pu_edge = mi_prev != mbmi; + + if (!pu_edge) { + curr_skipped = mbmi->skip_txfm && is_inter_block(mbmi); + } + // For realtime mode, u and v have the same level + if ((!curr_skipped || pu_edge) && level) { + params->filter_length = is_vert ? vert_filter_length_chroma[ts][pv_ts] + : horz_filter_length_chroma[ts][pv_ts]; + + const loop_filter_thresh *const limits = cm->lf_info.lfthr; + params->lfthr = limits + level; + } + } + const int tx_dim = is_vert ? tx_size_high[ts] : tx_size_wide[ts]; + *min_dim = AOMMIN(*min_dim, tx_dim); +} + +static AOM_FORCE_INLINE void set_lpf_parameters_for_line_chroma( + AV1_DEBLOCKING_PARAMETERS *const params_buf, TX_SIZE *tx_buf, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, uint32_t mi_col, uint32_t mi_row, + const struct macroblockd_plane *const plane_ptr, const uint32_t mi_range, + const ptrdiff_t mode_step, const int scale_horz, const int scale_vert, + int *min_dim, int plane, int joint_filter_chroma) { + const int is_vert = edge_dir == VERT_EDGE; + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + uint32_t *counter_ptr = is_vert ? &mi_col : &mi_row; + const uint32_t scale = is_vert ? scale_horz : scale_vert; + TX_SIZE prev_tx_size = TX_INVALID; + + // Unroll the first iteration of the loop + set_one_param_for_line_chroma(params, tx_size, cm, xd, edge_dir, mi_col, + mi_row, *counter_ptr, true, prev_tx_size, + plane_ptr, mode_step, scale_horz, scale_vert, + min_dim, plane, joint_filter_chroma); + + // Advance + int advance_units = + is_vert ? 
tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; + prev_tx_size = *tx_size; + *counter_ptr += advance_units << scale; + params += advance_units; + tx_size += advance_units; + + while (*counter_ptr < mi_range) { + set_one_param_for_line_chroma(params, tx_size, cm, xd, edge_dir, mi_col, + mi_row, *counter_ptr, false, prev_tx_size, + plane_ptr, mode_step, scale_horz, scale_vert, + min_dim, plane, joint_filter_chroma); + + // Advance + advance_units = + is_vert ? tx_size_wide_unit[*tx_size] : tx_size_high_unit[*tx_size]; + prev_tx_size = *tx_size; + *counter_ptr += advance_units << scale; + params += advance_units; + tx_size += advance_units; + } +} + +static AOM_INLINE void filter_vert(uint8_t *dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, + const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { + const loop_filter_thresh *limits = params->lfthr; +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = seq_params->bit_depth; + if (use_highbitdepth) { + uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst); + if (use_filter_type == USE_QUAD) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_4_dual( + dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + limits->mblim, limits->lim, limits->hev_thr, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_6_dual( + dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + limits->mblim, limits->lim, limits->hev_thr, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_vertical_8_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_8_dual( + dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + limits->mblim, limits->lim, limits->hev_thr, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_vertical_14_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_14_dual( + dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + limits->mblim, limits->lim, limits->hev_thr, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_vertical_8_dual( + dst_shortptr, dst_stride, limits->mblim, 
limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_vertical_14_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_vertical_8(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_vertical_14(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // no filtering + default: break; + } + } + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + if (use_filter_type == USE_QUAD) { + // Only one set of loop filter parameters (mblim, lim and hev_thr) is + // passed as argument to quad loop filter because quad loop filter is + // called for those cases where all the 4 set of loop filter parameters + // are equal. + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14(dst, 
dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } +#if !CONFIG_AV1_HIGHBITDEPTH + (void)seq_params; +#endif // !CONFIG_AV1_HIGHBITDEPTH +} + +static AOM_INLINE void filter_vert_chroma( + uint8_t *u_dst, uint8_t *v_dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { + const loop_filter_thresh *u_limits = params->lfthr; + const loop_filter_thresh *v_limits = params->lfthr; +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = seq_params->bit_depth; + if (use_highbitdepth) { + uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst); + uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst); + if (use_filter_type == USE_QUAD) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_4_dual( + u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + u_limits->mblim, u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_4_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_4_dual( + v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + v_limits->mblim, v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_6_dual( + u_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + u_limits->mblim, u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_6_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_6_dual( + v_dst_shortptr + (2 * MI_SIZE * dst_stride), dst_stride, + v_limits->mblim, v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, v_limits->hev_thr, bit_depth); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_4_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_vertical_6_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + 
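+        // AV1 restricts chroma deblocking to 4-tap and 6-tap filters, so a
+        // filter_length of 8 or 14 can never be produced for the U/V planes;
+        // those values are trapped by the assert in the cases below.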
case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_vertical_4(u_dst_shortptr, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_4(v_dst_shortptr, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_vertical_6(u_dst_shortptr, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + bit_depth); + aom_highbd_lpf_vertical_6(v_dst_shortptr, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + bit_depth); + break; + case 8: + case 14: assert(0); break; + // no filtering + default: break; + } + } + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + if (use_filter_type == USE_QUAD) { + // Only one set of loop filter parameters (mblim, lim and hev_thr) is + // passed as argument to quad loop filter because quad loop filter is + // called for those cases where all the 4 set of loop filter parameters + // are equal. + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_quad(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr); + aom_lpf_vertical_4_quad(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6_quad(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr); + aom_lpf_vertical_6_quad(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_dual(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_vertical_4_dual(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6_dual(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_vertical_6_dual(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_vertical_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim, + u_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_vertical_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_vertical_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 8: + case 14: assert(0); break; + // no filtering + default: break; + } + } +#if !CONFIG_AV1_HIGHBITDEPTH + (void)seq_params; +#endif // !CONFIG_AV1_HIGHBITDEPTH +} + +void av1_filter_block_plane_vert(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + 
const uint32_t mi_row, const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + + for (int y = 0; y < y_range; y++) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. + // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(¶ms, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + filter_vert(p, dst_stride, ¶ms, cm->seq_params, USE_SINGLE); + + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + +void av1_filter_block_plane_vert_opt( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) { + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned + // to MI_SIZE. + const int plane_mi_cols = + CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2); + const int plane_mi_rows = + CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2); + // Whenever 'pipeline_lpf_mt_with_enc' is enabled, height of the unit to + // filter (i.e., y_range) is calculated based on the size of the superblock + // used. + const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), + (1 << num_mis_in_lpf_unit_height_log2)); + // Width of the unit to filter (i.e., x_range) should always be calculated + // based on maximum superblock size as this function is called for mi_col = 0, + // MAX_MIB_SIZE, 2 * MAX_MIB_SIZE etc. 
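+  // MAX_MIB_SIZE is the width of the largest (128x128) superblock in 4x4 MI
+  // units, i.e. 32, so a single call to this function deblocks at most one
+  // superblock-wide strip of vertical edges.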
+ const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE); + const ptrdiff_t mode_step = 1; + for (int y = 0; y < y_range; y++) { + const uint32_t curr_y = mi_row + y; + const uint32_t x_start = mi_col; + const uint32_t x_end = mi_col + x_range; + int min_block_height = block_size_high[BLOCK_128X128]; + set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, VERT_EDGE, + x_start, curr_y, plane_ptr, x_end, + mode_step, &min_block_height); + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + USE_FILTER_TYPE use_filter_type = USE_SINGLE; + + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + + if ((y & 3) == 0 && (y + 3) < y_range && min_block_height >= 16) { + // If we are on a row which is a multiple of 4, and the minimum height is + // 16 pixels, then the current and right 3 cols must contain the same + // prediction block. This is because dim 16 can only happen every unit of + // 4 mi's. + use_filter_type = USE_QUAD; + y += 3; + } else if ((y + 1) < y_range && min_block_height >= 8) { + use_filter_type = USE_DUAL; + y += 1; + } + + for (int x = 0; x < x_range;) { + if (*tx_size == TX_INVALID) { + params->filter_length = 0; + *tx_size = TX_4X4; + } + + filter_vert(p, dst_stride, params, cm->seq_params, use_filter_type); + + // advance the destination pointer + const uint32_t advance_units = tx_size_wide_unit[*tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + params += advance_units; + tx_size += advance_units; + } + } +} + +void av1_filter_block_plane_vert_opt_chroma( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, + int num_mis_in_lpf_unit_height_log2) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + const int dst_stride = plane_ptr->dst.stride; + // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned + // to MI_SIZE. + const int mi_cols = + ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2; + const int mi_rows = + ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2; + const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert); + const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz); + const int y_range = + AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + const ptrdiff_t mode_step = (ptrdiff_t)1 << scale_horz; + + for (int y = 0; y < y_range; y++) { + const uint32_t curr_y = mi_row + (y << scale_vert); + const uint32_t x_start = mi_col + (0 << scale_horz); + const uint32_t x_end = mi_col + (x_range << scale_horz); + int min_height = tx_size_high[TX_64X64]; + set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, VERT_EDGE, + x_start, curr_y, plane_ptr, x_end, + mode_step, scale_horz, scale_vert, + &min_height, plane, joint_filter_chroma); + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + int use_filter_type = USE_SINGLE; + int y_inc = 0; + + if ((y & 3) == 0 && (y + 3) < y_range && min_height >= 16) { + // If we are on a row which is a multiple of 4, and the minimum height is + // 16 pixels, then the current and below 3 rows must contain the same tx + // block. 
This is because dim 16 can only happen every unit of 4 mi's. + use_filter_type = USE_QUAD; + y_inc = 3; + } else if (y % 2 == 0 && (y + 1) < y_range && min_height >= 8) { + // If we are on an even row, and the minimum height is 8 pixels, then the + // current and below rows must contain the same tx block. This is because + // dim 4 can only happen every unit of 2**0, and 8 every unit of 2**1, + // etc. + use_filter_type = USE_DUAL; + y_inc = 1; + } + + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. + // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + if (*tx_size == TX_INVALID) { + params->filter_length = 0; + *tx_size = TX_4X4; + } + + const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE; + if (joint_filter_chroma) { + uint8_t *u_dst = plane_ptr[0].dst.buf + offset; + uint8_t *v_dst = plane_ptr[1].dst.buf + offset; + filter_vert_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params, + use_filter_type); + } else { + uint8_t *dst_ptr = plane_ptr->dst.buf + offset; + filter_vert(dst_ptr, dst_stride, params, cm->seq_params, + use_filter_type); + } + + // advance the destination pointer + const uint32_t advance_units = tx_size_wide_unit[*tx_size]; + x += advance_units; + params += advance_units; + tx_size += advance_units; + } + y += y_inc; + } +} + +static AOM_INLINE void filter_horz(uint8_t *dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, + const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { + const loop_filter_thresh *limits = params->lfthr; +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = seq_params->bit_depth; + if (use_highbitdepth) { + uint16_t *dst_shortptr = CONVERT_TO_SHORTPTR(dst); + if (use_filter_type == USE_QUAD) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_horizontal_4_dual( + dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, + limits->lim, limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_horizontal_6_dual( + dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, + limits->lim, limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr, bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_horizontal_8_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_horizontal_8_dual( + dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, + limits->lim, limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr, bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_horizontal_14_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + aom_highbd_lpf_horizontal_14_dual( + dst_shortptr + (2 * MI_SIZE), dst_stride, limits->mblim, + limits->lim, limits->hev_thr, 
limits->mblim, limits->lim, + limits->hev_thr, bit_depth); + break; + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_horizontal_8_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_horizontal_14_dual( + dst_shortptr, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, limits->hev_thr, + bit_depth); + break; + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 8-tap filtering + case 8: + aom_highbd_lpf_horizontal_8(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // apply 14-tap filtering + case 14: + aom_highbd_lpf_horizontal_14(dst_shortptr, dst_stride, limits->mblim, + limits->lim, limits->hev_thr, bit_depth); + break; + // no filtering + default: break; + } + } + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + if (use_filter_type == USE_QUAD) { + // Only one set of loop filter parameters (mblim, lim and hev_thr) is + // passed as argument to quad loop filter because quad loop filter is + // called for those cases where all the 4 set of loop filter parameters + // are equal. 
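+    // Each *_quad call filters a 16-pixel-wide run of the horizontal edge
+    // (four adjacent 4-pixel segments). The callers only select USE_QUAD when
+    // the spanning block is at least 16 pixels wide, so all four segments are
+    // known to share the same filter length and thresholds.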
+ switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14_quad(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14_dual(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14(dst, dst_stride, limits->mblim, limits->lim, + limits->hev_thr); + break; + // no filtering + default: break; + } + } +#if !CONFIG_AV1_HIGHBITDEPTH + (void)seq_params; +#endif // !CONFIG_AV1_HIGHBITDEPTH +} + +static AOM_INLINE void filter_horz_chroma( + uint8_t *u_dst, uint8_t *v_dst, int dst_stride, + const AV1_DEBLOCKING_PARAMETERS *params, const SequenceHeader *seq_params, + USE_FILTER_TYPE use_filter_type) { + const loop_filter_thresh *u_limits = params->lfthr; + const loop_filter_thresh *v_limits = params->lfthr; +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = seq_params->bit_depth; + if (use_highbitdepth) { + uint16_t *u_dst_shortptr = CONVERT_TO_SHORTPTR(u_dst); + uint16_t *v_dst_shortptr = CONVERT_TO_SHORTPTR(v_dst); + if (use_filter_type == USE_QUAD) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_4_dual( + u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_4_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, 
v_limits->lim, + v_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_4_dual( + v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6_dual( + u_dst_shortptr + (2 * MI_SIZE), dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6_dual( + v_dst_shortptr + (2 * MI_SIZE), dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_4_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6_dual( + u_dst_shortptr, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6_dual( + v_dst_shortptr, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_highbd_lpf_horizontal_4(u_dst_shortptr, dst_stride, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_4(v_dst_shortptr, dst_stride, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_highbd_lpf_horizontal_6(u_dst_shortptr, dst_stride, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr, bit_depth); + aom_highbd_lpf_horizontal_6(v_dst_shortptr, dst_stride, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr, bit_depth); + break; + case 8: + case 14: assert(0); break; + // no filtering + default: break; + } + } + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + if (use_filter_type == USE_QUAD) { + // Only one set of loop filter parameters (mblim, lim and hev_thr) is + // passed as argument to quad loop filter because quad loop filter is + // called for those cases where all the 4 set of loop filter parameters + // are equal. 
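+    // u_limits and v_limits alias the same thresholds (params->lfthr): the
+    // chroma parameter computation asserts that the U and V filter levels
+    // match before a joint filter call is made, so one set of limits is
+    // valid for both planes here.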
+ switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_quad(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr); + aom_lpf_horizontal_4_quad(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6_quad(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr); + aom_lpf_horizontal_6_quad(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else if (use_filter_type == USE_DUAL) { + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_dual(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_horizontal_4_dual(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6_dual(u_dst, dst_stride, u_limits->mblim, + u_limits->lim, u_limits->hev_thr, + u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_horizontal_6_dual(v_dst, dst_stride, v_limits->mblim, + v_limits->lim, v_limits->hev_thr, + v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 8: + case 14: assert(0); + // no filtering + default: break; + } + } else { + assert(use_filter_type == USE_SINGLE); + switch (params->filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4(u_dst, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_horizontal_4(v_dst, dst_stride, v_limits->mblim, v_limits->lim, + u_limits->hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + aom_lpf_horizontal_6(u_dst, dst_stride, u_limits->mblim, u_limits->lim, + u_limits->hev_thr); + aom_lpf_horizontal_6(v_dst, dst_stride, v_limits->mblim, v_limits->lim, + v_limits->hev_thr); + break; + case 8: + case 14: assert(0); break; + // no filtering + default: break; + } + } +#if !CONFIG_AV1_HIGHBITDEPTH + (void)seq_params; +#endif // !CONFIG_AV1_HIGHBITDEPTH +} + +void av1_filter_block_plane_horz(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + for (int x = 0; x < x_range; x++) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block. 
If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + ¶ms, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + filter_horz(p, dst_stride, ¶ms, cm->seq_params, USE_SINGLE); + + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + +void av1_filter_block_plane_horz_opt( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2) { + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned + // to MI_SIZE. + const int plane_mi_cols = + CEIL_POWER_OF_TWO(plane_ptr->dst.width, MI_SIZE_LOG2); + const int plane_mi_rows = + CEIL_POWER_OF_TWO(plane_ptr->dst.height, MI_SIZE_LOG2); + const int y_range = AOMMIN((int)(plane_mi_rows - mi_row), + (1 << num_mis_in_lpf_unit_height_log2)); + const int x_range = AOMMIN((int)(plane_mi_cols - mi_col), MAX_MIB_SIZE); + + const ptrdiff_t mode_step = cm->mi_params.mi_stride; + for (int x = 0; x < x_range; x++) { + const uint32_t curr_x = mi_col + x; + const uint32_t y_start = mi_row; + const uint32_t y_end = mi_row + y_range; + int min_block_width = block_size_high[BLOCK_128X128]; + set_lpf_parameters_for_line_luma(params_buf, tx_buf, cm, xd, HORZ_EDGE, + curr_x, y_start, plane_ptr, y_end, + mode_step, &min_block_width); + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + USE_FILTER_TYPE filter_type = USE_SINGLE; + + uint8_t *p = dst_ptr + x * MI_SIZE; + + if ((x & 3) == 0 && (x + 3) < x_range && min_block_width >= 16) { + // If we are on a col which is a multiple of 4, and the minimum width is + // 16 pixels, then the current and right 3 cols must contain the same + // prediction block. This is because dim 16 can only happen every unit of + // 4 mi's. 
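+      // p was computed from the pre-increment x, so the quad filter below
+      // covers this column and the next three; x += 3 together with the
+      // outer loop's x++ then skips past the columns already filtered.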
+ filter_type = USE_QUAD; + x += 3; + } else if ((x + 1) < x_range && min_block_width >= 8) { + filter_type = USE_DUAL; + x += 1; + } + + for (int y = 0; y < y_range;) { + if (*tx_size == TX_INVALID) { + params->filter_length = 0; + *tx_size = TX_4X4; + } + + filter_horz(p, dst_stride, params, cm->seq_params, filter_type); + + // advance the destination pointer + const uint32_t advance_units = tx_size_high_unit[*tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + params += advance_units; + tx_size += advance_units; + } + } +} + +void av1_filter_block_plane_horz_opt_chroma( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, + int num_mis_in_lpf_unit_height_log2) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + const int dst_stride = plane_ptr->dst.stride; + // Ensure that mi_cols/mi_rows are calculated based on frame dimension aligned + // to MI_SIZE. + const int mi_cols = + ((plane_ptr->dst.width << scale_horz) + MI_SIZE - 1) >> MI_SIZE_LOG2; + const int mi_rows = + ((plane_ptr->dst.height << scale_vert) + MI_SIZE - 1) >> MI_SIZE_LOG2; + const int plane_mi_rows = ROUND_POWER_OF_TWO(mi_rows, scale_vert); + const int plane_mi_cols = ROUND_POWER_OF_TWO(mi_cols, scale_horz); + const int y_range = + AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + ((1 << num_mis_in_lpf_unit_height_log2) >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + const ptrdiff_t mode_step = cm->mi_params.mi_stride << scale_vert; + for (int x = 0; x < x_range; x++) { + const uint32_t y_start = mi_row + (0 << scale_vert); + const uint32_t curr_x = mi_col + (x << scale_horz); + const uint32_t y_end = mi_row + (y_range << scale_vert); + int min_width = tx_size_wide[TX_64X64]; + set_lpf_parameters_for_line_chroma(params_buf, tx_buf, cm, xd, HORZ_EDGE, + curr_x, y_start, plane_ptr, y_end, + mode_step, scale_horz, scale_vert, + &min_width, plane, joint_filter_chroma); + + AV1_DEBLOCKING_PARAMETERS *params = params_buf; + TX_SIZE *tx_size = tx_buf; + USE_FILTER_TYPE use_filter_type = USE_SINGLE; + int x_inc = 0; + + if ((x & 3) == 0 && (x + 3) < x_range && min_width >= 16) { + // If we are on a col which is a multiple of 4, and the minimum width is + // 16 pixels, then the current and right 3 cols must contain the same tx + // block. This is because dim 16 can only happen every unit of 4 mi's. + use_filter_type = USE_QUAD; + x_inc = 3; + } else if (x % 2 == 0 && (x + 1) < x_range && min_width >= 8) { + // If we are on an even col, and the minimum width is 8 pixels, then the + // current and left cols must contain the same tx block. This is because + // dim 4 can only happen every unit of 2**0, and 8 every unit of 2**1, + // etc. + use_filter_type = USE_DUAL; + x_inc = 1; + } + + for (int y = 0; y < y_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block. 
If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + if (*tx_size == TX_INVALID) { + params->filter_length = 0; + *tx_size = TX_4X4; + } + + const int offset = y * MI_SIZE * dst_stride + x * MI_SIZE; + if (joint_filter_chroma) { + uint8_t *u_dst = plane_ptr[0].dst.buf + offset; + uint8_t *v_dst = plane_ptr[1].dst.buf + offset; + filter_horz_chroma(u_dst, v_dst, dst_stride, params, cm->seq_params, + use_filter_type); + } else { + uint8_t *dst_ptr = plane_ptr->dst.buf + offset; + filter_horz(dst_ptr, dst_stride, params, cm->seq_params, + use_filter_type); + } + + // advance the destination pointer + const int advance_units = tx_size_high_unit[*tx_size]; + y += advance_units; + params += advance_units; + tx_size += advance_units; + } + x += x_inc; + } +} diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h new file mode 100644 index 0000000000..c9880cf5da --- /dev/null +++ b/third_party/aom/av1/common/av1_loopfilter.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_ +#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_ + +#include "config/aom_config.h" + +#include "aom/internal/aom_codec_internal.h" + +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/common/seg_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LOOP_FILTER 63 +#define MAX_SHARPNESS 7 + +#define SIMD_WIDTH 16 + +enum lf_path { + LF_PATH_420, + LF_PATH_444, + LF_PATH_SLOW, +}; + +/*!\cond */ +enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR); +typedef struct { + uint64_t bits[4]; +} FilterMask; + +struct loopfilter { + int filter_level[2]; + int filter_level_u; + int filter_level_v; + + int sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, Last2+Last3, + // GF, BRF, ARF2, ARF + int8_t ref_deltas[REF_FRAMES]; + + // 0 = ZERO_MV, MV + int8_t mode_deltas[MAX_MODE_LF_DELTAS]; +}; + +// Need to align this structure so when it is declared and +// passed it can be loaded into vector registers. +typedef struct { + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); +} loop_filter_thresh; + +typedef struct { + loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; + uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS]; +} loop_filter_info_n; + +typedef struct AV1_DEBLOCKING_PARAMETERS { + // length of the filter applied to the outer edge + uint8_t filter_length; + // deblocking limits + const loop_filter_thresh *lfthr; +} AV1_DEBLOCKING_PARAMETERS; + +typedef struct LoopFilterWorkerData { + YV12_BUFFER_CONFIG *frame_buffer; + struct AV1Common *cm; + struct macroblockd_plane planes[MAX_MB_PLANE]; + // TODO(Ranjit): When the filter functions are modified to use xd->lossless + // add lossless as a member here. 
+ MACROBLOCKD *xd; + + AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE]; + TX_SIZE tx_buf[MAX_MIB_SIZE]; + struct aom_internal_error_info error_info; +} LFWorkerData; +/*!\endcond */ + +/* assorted loopfilter functions which get used elsewhere */ +struct AV1Common; +struct macroblockd; +struct AV1LfSyncData; + +void av1_loop_filter_init(struct AV1Common *cm); + +void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, + int plane_end); + +void av1_filter_block_plane_vert(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); + +void av1_filter_block_plane_horz(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); + +void av1_filter_block_plane_vert_opt( + const struct AV1Common *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2); + +void av1_filter_block_plane_vert_opt_chroma( + const struct AV1Common *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, + int num_mis_in_lpf_unit_height_log2); + +void av1_filter_block_plane_horz_opt( + const struct AV1Common *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int num_mis_in_lpf_unit_height_log2); + +void av1_filter_block_plane_horz_opt_chroma( + const struct AV1Common *const cm, const MACROBLOCKD *const xd, + const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, + const uint32_t mi_col, AV1_DEBLOCKING_PARAMETERS *params_buf, + TX_SIZE *tx_buf, int plane, bool joint_filter_chroma, + int num_mis_in_lpf_unit_height_log2); + +uint8_t av1_get_filter_level(const struct AV1Common *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_ diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c new file mode 100644 index 0000000000..8a35dca369 --- /dev/null +++ b/third_party/aom/av1/common/av1_rtcd.c @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include "config/aom_config.h" + +#define RTCD_C +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_once.h" + +void av1_rtcd(void) { aom_once(setup_rtcd_internal); } diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl new file mode 100644 index 0000000000..c5fe389ba1 --- /dev/null +++ b/third_party/aom/av1/common/av1_rtcd_defs.pl @@ -0,0 +1,655 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +sub av1_common_forward_decls() { +print < 0) { + for (i = 0; i < size; i++) { + arr[i] = round_shift(arr[i], bit); + } + } else { + for (i = 0; i < size; i++) { + arr[i] = (int32_t)clamp64(((int64_t)1 << (-bit)) * arr[i], INT32_MIN, + INT32_MAX); + } + } + } +} + +const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = { + { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 }, + { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 }, + { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 }, + { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, + TXFM_TYPE_IDENTITY32 }, + { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID } +}; + +const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = { + 4, // TXFM_TYPE_DCT4 + 6, // TXFM_TYPE_DCT8 + 8, // TXFM_TYPE_DCT16 + 10, // TXFM_TYPE_DCT32 + 12, // TXFM_TYPE_DCT64 + 7, // TXFM_TYPE_ADST4 + 8, // TXFM_TYPE_ADST8 + 10, // TXFM_TYPE_ADST16 + 1, // TXFM_TYPE_IDENTITY4 + 1, // TXFM_TYPE_IDENTITY8 + 1, // TXFM_TYPE_IDENTITY16 + 1, // TXFM_TYPE_IDENTITY32 +}; + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + + int in_range = 1; + + for (int i = 0; i < size; ++i) { + if (buf[i] < min_value || buf[i] > max_value) { + in_range = 0; + } + } + + if (!in_range) { + fprintf(stderr, "Error: coeffs contain out-of-range values\n"); + fprintf(stderr, "size: %d\n", size); + fprintf(stderr, "stage: %d\n", stage); + fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, + max_value); + + fprintf(stderr, "coeffs: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", input[j]); + } + fprintf(stderr, "]\n"); + + fprintf(stderr, " buf: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", buf[j]); + } + fprintf(stderr, "]\n\n"); + } + + assert(in_range); +#else + (void)stage; + (void)input; + (void)buf; + (void)size; + (void)bit; +#endif +} diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h new file mode 100644 index 0000000000..7ad70af86a --- /dev/null +++ b/third_party/aom/av1/common/av1_txfm.h @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_TXFM_H_ +#define AOM_AV1_COMMON_AV1_TXFM_H_ + +#include +#include +#include + +#include "config/aom_config.h" + +#include "av1/common/enums.h" +#include "av1/common/blockd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if !defined(DO_RANGE_CHECK_CLAMP) +#define DO_RANGE_CHECK_CLAMP 0 +#endif + +extern const int32_t av1_cospi_arr_data[4][64]; +extern const int32_t av1_sinpi_arr_data[4][5]; + +#define MAX_TXFM_STAGE_NUM 12 + +static const int cos_bit_min = 10; + +#define NewSqrt2Bits ((int32_t)12) +// 2^12 * sqrt(2) +static const int32_t NewSqrt2 = 5793; +// 2^12 / sqrt(2) +static const int32_t NewInvSqrt2 = 2896; + +static INLINE const int32_t *cospi_arr(int n) { + return av1_cospi_arr_data[n - cos_bit_min]; +} + +static INLINE const int32_t *sinpi_arr(int n) { + return av1_sinpi_arr_data[n - cos_bit_min]; +} + +// The reduced bit-width and permuted arrays are only used in the Arm Neon +// implementations in av1_fwd_txfm2d_neon.c and highbd_fwd_txfm_neon.c for now. +#if HAVE_NEON +// Store cospi/sinpi costants in Q2.13 format. +// See: https://en.wikipedia.org/wiki/Q_(number_format) +extern const int16_t av1_cospi_arr_q13_data[4][128]; +extern const int16_t av1_sinpi_arr_q13_data[4][4]; + +extern const int32_t av1_cospi_arr_s32_data[4][66]; + +static INLINE const int16_t *cospi_arr_q13(int n) { + return av1_cospi_arr_q13_data[n - cos_bit_min]; +} + +static INLINE const int16_t *sinpi_arr_q13(int n) { + return av1_sinpi_arr_q13_data[n - cos_bit_min]; +} + +static INLINE const int32_t *cospi_arr_s32(int n) { + return av1_cospi_arr_s32_data[n - cos_bit_min]; +} +#endif // HAVE_NEON + +static INLINE int32_t range_check_value(int32_t value, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + if (value < min_value || value > max_value) { + fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit); +#if !CONFIG_AV1_ENCODER + assert(0); +#endif + } +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING +#if DO_RANGE_CHECK_CLAMP + bit = AOMMIN(bit, 31); + return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1); +#endif // DO_RANGE_CHECK_CLAMP + (void)bit; + return value; +} + +static INLINE int32_t round_shift(int64_t value, int bit) { + assert(bit >= 1); + return (int32_t)((value + (1ll << (bit - 1))) >> bit); +} + +static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, + int bit) { + int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1); + int64_t intermediate = result_64 + (1LL << (bit - 1)); + // NOTE(rachelbarker): The value 'result_64' may not necessarily fit + // into 32 bits. However, the result of this function is nominally + // ROUND_POWER_OF_TWO_64(result_64, bit) + // and that is required to fit into stage_range[stage] many bits + // (checked by range_check_buf()). 
+ // + // Here we've unpacked that rounding operation, and it can be shown + // that the value of 'intermediate' here *does* fit into 32 bits + // for any conformant bitstream. + // The upshot is that, if you do all this calculation using + // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic, + // then you'll still get the correct result. + // To provide a check on this logic, we assert that 'intermediate' + // would fit into an int32 if range checking is enabled. +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX); +#endif + return (int32_t)(intermediate >> bit); +} + +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + return clip_pixel_highbd(dest + (int)trans, bd); +} + +typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); + +typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd); + +enum { + TXFM_TYPE_DCT4, + TXFM_TYPE_DCT8, + TXFM_TYPE_DCT16, + TXFM_TYPE_DCT32, + TXFM_TYPE_DCT64, + TXFM_TYPE_ADST4, + TXFM_TYPE_ADST8, + TXFM_TYPE_ADST16, + TXFM_TYPE_IDENTITY4, + TXFM_TYPE_IDENTITY8, + TXFM_TYPE_IDENTITY16, + TXFM_TYPE_IDENTITY32, + TXFM_TYPES, + TXFM_TYPE_INVALID, +} UENUM1BYTE(TXFM_TYPE); + +typedef struct TXFM_2D_FLIP_CFG { + TX_SIZE tx_size; + int ud_flip; // flip upside down + int lr_flip; // flip left to right + const int8_t *shift; + int8_t cos_bit_col; + int8_t cos_bit_row; + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + TXFM_TYPE txfm_type_col; + TXFM_TYPE txfm_type_row; + int stage_num_col; + int stage_num_row; +} TXFM_2D_FLIP_CFG; + +static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + *ud_flip = 0; + *lr_flip = 0; + break; + case IDTX: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + *ud_flip = 0; + *lr_flip = 0; + break; + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: + *ud_flip = 1; + *lr_flip = 0; + break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + *ud_flip = 0; + *lr_flip = 1; + break; + case FLIPADST_FLIPADST: + *ud_flip = 1; + *lr_flip = 1; + break; + default: + *ud_flip = 0; + *lr_flip = 0; + assert(0); + } +} + +static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { + get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip); +} + +// Utility function that returns the log of the ratio of the col and row +// sizes. 
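+// For example, a 16x8 transform (col = 16, row = 8) returns 1, an 8x16
+// transform returns -1, a 32x8 transform returns 2, and square sizes return 0.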
+static INLINE int get_rect_tx_log_ratio(int col, int row) { + if (col == row) return 0; + if (col > row) { + if (col == row * 2) return 1; + if (col == row * 4) return 2; + assert(0 && "Unsupported transform size"); + } else { + if (row == col * 2) return -1; + if (row == col * 4) return -2; + assert(0 && "Unsupported transform size"); + } + return 0; // Invalid +} + +void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, int bd); + +void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, + int bd); + +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg); +void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg); +extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D]; +extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES]; +static INLINE int get_txw_idx(TX_SIZE tx_size) { + return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]; +} +static INLINE int get_txh_idx(TX_SIZE tx_size) { + return tx_size_high_log2[tx_size] - tx_size_high_log2[0]; +} + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit); +#define MAX_TXWH_IDX 5 +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // AOM_AV1_COMMON_AV1_TXFM_H_ diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c new file mode 100644 index 0000000000..1d597502ce --- /dev/null +++ b/third_party/aom/av1/common/blockd.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <math.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
+  if (!left_mi) return DC_PRED;
+  assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi));
+  return left_mi->mode;
+}
+
+PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
+  if (!above_mi) return DC_PRED;
+  assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi));
+  return above_mi->mode;
+}
+
+void av1_set_entropy_contexts(const MACROBLOCKD *xd,
+                              struct macroblockd_plane *pd, int plane,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              int has_eob, int aoff, int loff) {
+  ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff;
+  ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff;
+  const int txs_wide = tx_size_wide_unit[tx_size];
+  const int txs_high = tx_size_high_unit[tx_size];
+
+  // above
+  if (has_eob && xd->mb_to_right_edge < 0) {
+    const int blocks_wide = max_block_wide(xd, plane_bsize, plane);
+    const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff);
+    memset(a, has_eob, sizeof(*a) * above_contexts);
+    memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts));
+  } else {
+    memset(a, has_eob, sizeof(*a) * txs_wide);
+  }
+
+  // left
+  if (has_eob && xd->mb_to_bottom_edge < 0) {
+    const int blocks_high = max_block_high(xd, plane_bsize, plane);
+    const int left_contexts = AOMMIN(txs_high, blocks_high - loff);
+    memset(l, has_eob, sizeof(*l) * left_contexts);
+    memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts));
+  } else {
+    memset(l, has_eob, sizeof(*l) * txs_high);
+  }
+}
+void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                               const int num_planes) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref;
+  for (int i = 0; i < nplanes; i++) {
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int txs_wide = mi_size_wide[plane_bsize];
+    const int txs_high = mi_size_high[plane_bsize];
+    memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
+    memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high);
+  }
+}
+
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) {
+  xd->delta_lf_from_base = 0;
+  const int frame_lf_count =
+      num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+  for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0;
+}
+
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) {
+  for (int p = 0; p < num_planes; ++p) {
+    set_default_wiener(xd->wiener_info + p);
+    set_default_sgrproj(xd->sgrproj_info + p);
+  }
+}
+
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
+                            const int num_planes) {
+  int i;
+
+  for (i = 0; i < num_planes; i++) {
+    xd->plane[i].plane_type = get_plane_type(i);
+    xd->plane[i].subsampling_x = i ? ss_x : 0;
+    xd->plane[i].subsampling_y = i ? ss_y : 0;
+  }
+  for (i = num_planes; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].subsampling_x = 1;
+    xd->plane[i].subsampling_y = 1;
+  }
+}
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
new file mode 100644
index 0000000000..0cfd1f3954
--- /dev/null
+++ b/third_party/aom/av1/common/blockd.h
@@ -0,0 +1,1612 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_BLOCKD_H_ +#define AOM_AV1_COMMON_BLOCKD_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" + +#include "av1/common/common_data.h" +#include "av1/common/quant_common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/mv.h" +#include "av1/common/scale.h" +#include "av1/common/seg_common.h" +#include "av1/common/tile_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define USE_B_QUANT_NO_TRELLIS 1 + +#define MAX_MB_PLANE 3 + +#define MAX_DIFFWTD_MASK_BITS 1 + +#define INTERINTRA_WEDGE_SIGN 0 + +#define DEFAULT_INTER_TX_TYPE DCT_DCT + +#define MAX_PALETTE_BLOCK_WIDTH 64 + +#define MAX_PALETTE_BLOCK_HEIGHT 64 + +/*!\cond */ + +// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS +enum { + DIFFWTD_38 = 0, + DIFFWTD_38_INV, + DIFFWTD_MASK_TYPES, +} UENUM1BYTE(DIFFWTD_MASK_TYPE); + +enum { + KEY_FRAME = 0, + INTER_FRAME = 1, + INTRA_ONLY_FRAME = 2, // replaces intra-only + S_FRAME = 3, + FRAME_TYPES, +} UENUM1BYTE(FRAME_TYPE); + +static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { + return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; +} + +static INLINE int is_inter_mode(PREDICTION_MODE mode) { + return mode >= INTER_MODE_START && mode < INTER_MODE_END; +} + +typedef struct { + uint8_t *plane[MAX_MB_PLANE]; + int stride[MAX_MB_PLANE]; +} BUFFER_SET; + +static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) { + return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END; +} +static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { + return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END; +} + +static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { + static const PREDICTION_MODE lut[] = { + DC_PRED, // DC_PRED + V_PRED, // V_PRED + H_PRED, // H_PRED + D45_PRED, // D45_PRED + D135_PRED, // D135_PRED + D113_PRED, // D113_PRED + D157_PRED, // D157_PRED + D203_PRED, // D203_PRED + D67_PRED, // D67_PRED + SMOOTH_PRED, // SMOOTH_PRED + SMOOTH_V_PRED, // SMOOTH_V_PRED + SMOOTH_H_PRED, // SMOOTH_H_PRED + PAETH_PRED, // PAETH_PRED + NEARESTMV, // NEARESTMV + NEARMV, // NEARMV + GLOBALMV, // GLOBALMV + NEWMV, // NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEARESTMV, // NEAREST_NEWMV + NEWMV, // NEW_NEARESTMV + NEARMV, // NEAR_NEWMV + NEWMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV + }; + assert(NELEMENTS(lut) == MB_MODE_COUNT); + assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode)); + return lut[mode]; +} + +static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { + static const PREDICTION_MODE lut[] = { + MB_MODE_COUNT, // DC_PRED + MB_MODE_COUNT, // V_PRED + MB_MODE_COUNT, // H_PRED + MB_MODE_COUNT, // D45_PRED + MB_MODE_COUNT, // D135_PRED + MB_MODE_COUNT, // D113_PRED + MB_MODE_COUNT, // D157_PRED + MB_MODE_COUNT, // D203_PRED + MB_MODE_COUNT, // D67_PRED + MB_MODE_COUNT, // SMOOTH_PRED + 
MB_MODE_COUNT, // SMOOTH_V_PRED + MB_MODE_COUNT, // SMOOTH_H_PRED + MB_MODE_COUNT, // PAETH_PRED + MB_MODE_COUNT, // NEARESTMV + MB_MODE_COUNT, // NEARMV + MB_MODE_COUNT, // GLOBALMV + MB_MODE_COUNT, // NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEWMV, // NEAREST_NEWMV + NEARESTMV, // NEW_NEARESTMV + NEWMV, // NEAR_NEWMV + NEARMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV + }; + assert(NELEMENTS(lut) == MB_MODE_COUNT); + assert(is_inter_compound_mode(mode)); + return lut[mode]; +} + +static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) { + return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV || + mode == NEW_NEARMV); +} + +static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) { + return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV || + mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); +} + +static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { + return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD); +} + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. */ + +typedef struct { + // Value of base colors for Y, U, and V + uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; + // Number of base colors for Y (0) and UV (1) + uint8_t palette_size[2]; +} PALETTE_MODE_INFO; + +typedef struct { + FILTER_INTRA_MODE filter_intra_mode; + uint8_t use_filter_intra; +} FILTER_INTRA_MODE_INFO; + +static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { + DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED +}; + +#if CONFIG_RD_DEBUG +#define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE) +#endif + +typedef struct RD_STATS { + int rate; + int zero_rate; + int64_t dist; + // Please be careful of using rdcost, it's not guaranteed to be set all the + // time. + // TODO(angiebird): Create a set of functions to manipulate the RD_STATS. In + // these functions, make sure rdcost is always up-to-date according to + // rate/dist. + int64_t rdcost; + int64_t sse; + uint8_t skip_txfm; // sse should equal to dist when skip_txfm == 1 +#if CONFIG_RD_DEBUG + int txb_coeff_cost[MAX_MB_PLANE]; +#endif // CONFIG_RD_DEBUG +} RD_STATS; + +// This struct is used to group function args that are commonly +// sent together in functions related to interinter compound modes +typedef struct { + uint8_t *seg_mask; + int8_t wedge_index; + int8_t wedge_sign; + DIFFWTD_MASK_TYPE mask_type; + COMPOUND_TYPE type; +} INTERINTER_COMPOUND_DATA; + +#define INTER_TX_SIZE_BUF_LEN 16 +#define TXK_TYPE_BUF_LEN 64 +/*!\endcond */ + +/*! \brief Stores the prediction/txfm mode of the current coding block + */ +typedef struct MB_MODE_INFO { + /***************************************************************************** + * \name General Info of the Coding Block + ****************************************************************************/ + /**@{*/ + /*! \brief The block size of the current coding block */ + BLOCK_SIZE bsize; + /*! \brief The partition type of the current coding block. */ + PARTITION_TYPE partition; + /*! \brief The prediction mode used */ + PREDICTION_MODE mode; + /*! \brief The UV mode when intra is used */ + UV_PREDICTION_MODE uv_mode; + /*! \brief The q index for the current coding block. 
*/
+  int current_qindex;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Inter Mode Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief The motion vectors used by the current inter mode */
+  int_mv mv[2];
+  /*! \brief The reference frames for the MV */
+  MV_REFERENCE_FRAME ref_frame[2];
+  /*! \brief Filter used in subpel interpolation. */
+  int_interpfilters interp_filters;
+  /*! \brief The motion mode used by the inter prediction. */
+  MOTION_MODE motion_mode;
+  /*! \brief Number of samples used by warp causal */
+  uint8_t num_proj_ref;
+  /*! \brief The number of overlapped neighbors above/left for obmc/warp motion
+   * mode. */
+  uint8_t overlappable_neighbors;
+  /*! \brief The parameters used in warp motion mode. */
+  WarpedMotionParams wm_params;
+  /*! \brief The type of intra mode used by inter-intra */
+  INTERINTRA_MODE interintra_mode;
+  /*! \brief The type of wedge used in interintra mode. */
+  int8_t interintra_wedge_index;
+  /*! \brief Struct that stores the data used in interinter compound mode. */
+  INTERINTER_COMPOUND_DATA interinter_comp;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Intra Mode Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Directional mode delta: the angle is base angle + (angle_delta *
+   * step). */
+  int8_t angle_delta[PLANE_TYPES];
+  /*! \brief The type of filter intra mode used (if applicable). */
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  /*! \brief Chroma from Luma: Joint sign of alpha Cb and alpha Cr */
+  int8_t cfl_alpha_signs;
+  /*! \brief Chroma from Luma: Index of the alpha Cb and alpha Cr combination */
+  uint8_t cfl_alpha_idx;
+  /*! \brief Stores the size and colors of palette mode */
+  PALETTE_MODE_INFO palette_mode_info;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Transform Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Whether to skip transforming and sending. */
+  uint8_t skip_txfm;
+  /*! \brief Transform size when fixed size txfm is used (e.g. intra modes). */
+  TX_SIZE tx_size;
+  /*! \brief Transform size when recursive txfm tree is on. */
+  TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Loop Filter Info
+   ****************************************************************************/
+  /**@{*/
+  /*! \copydoc MACROBLOCKD::delta_lf_from_base */
+  int8_t delta_lf_from_base;
+  /*! \copydoc MACROBLOCKD::delta_lf */
+  int8_t delta_lf[FRAME_LF_COUNT];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Bitfield for Memory Reduction
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief The segment id */
+  uint8_t segment_id : 3;
+  /*! \brief Only valid when temporal update is off. */
+  uint8_t seg_id_predicted : 1;
+  /*! \brief Which ref_mv to use */
+  uint8_t ref_mv_idx : 2;
+  /*! \brief Inter skip mode */
+  uint8_t skip_mode : 1;
+  /*! \brief Whether intrabc is used. */
+  uint8_t use_intrabc : 1;
+  /*! \brief Indicates if masked compound is used (1) or not (0). */
+  uint8_t comp_group_idx : 1;
+  /*! \brief Indicates whether dist_wtd_comp (0) or uniform averaging (1) is
+   * used. */
+  uint8_t compound_idx : 1;
+  /*!
\brief Whether to use interintra wedge */ + uint8_t use_wedge_interintra : 1; + /*! \brief CDEF strength per BLOCK_64X64 */ + int8_t cdef_strength : 4; + /**@}*/ + +#if CONFIG_RD_DEBUG + /*! \brief RD info used for debugging */ + RD_STATS rd_stats; + /*! \brief The current row in unit of 4x4 blocks for debugging */ + int mi_row; + /*! \brief The current col in unit of 4x4 blocks for debugging */ + int mi_col; +#endif +#if CONFIG_INSPECTION + /*! \brief Whether we are skipping the current rows or columns. */ + int16_t tx_skip[TXK_TYPE_BUF_LEN]; +#endif +} MB_MODE_INFO; + +/*!\cond */ + +static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) { + return mbmi->use_intrabc; +} + +static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) { + assert(mode < UV_INTRA_MODES); + static const PREDICTION_MODE uv2y[] = { + DC_PRED, // UV_DC_PRED + V_PRED, // UV_V_PRED + H_PRED, // UV_H_PRED + D45_PRED, // UV_D45_PRED + D135_PRED, // UV_D135_PRED + D113_PRED, // UV_D113_PRED + D157_PRED, // UV_D157_PRED + D203_PRED, // UV_D203_PRED + D67_PRED, // UV_D67_PRED + SMOOTH_PRED, // UV_SMOOTH_PRED + SMOOTH_V_PRED, // UV_SMOOTH_V_PRED + SMOOTH_H_PRED, // UV_SMOOTH_H_PRED + PAETH_PRED, // UV_PAETH_PRED + DC_PRED, // UV_CFL_PRED + INTRA_INVALID, // UV_INTRA_MODES + INTRA_INVALID, // UV_MODE_INVALID + }; + return uv2y[mode]; +} + +static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { + return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME; +} + +static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { + return mbmi->ref_frame[1] > INTRA_FRAME; +} + +static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) { + return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^ + (mbmi->ref_frame[1] >= BWDREF_FRAME))); +} + +static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) { + static const MV_REFERENCE_FRAME lut[] = { + LAST_FRAME, // LAST_LAST2_FRAMES, + LAST_FRAME, // LAST_LAST3_FRAMES, + LAST_FRAME, // LAST_GOLDEN_FRAMES, + BWDREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST2_FRAME, // LAST2_LAST3_FRAMES + LAST2_FRAME, // LAST2_GOLDEN_FRAMES, + LAST3_FRAME, // LAST3_GOLDEN_FRAMES, + BWDREF_FRAME, // BWDREF_ALTREF2_FRAMES, + ALTREF2_FRAME, // ALTREF2_ALTREF_FRAMES, + }; + assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); + return lut[ref_idx]; +} + +static INLINE MV_REFERENCE_FRAME comp_ref1(int ref_idx) { + static const MV_REFERENCE_FRAME lut[] = { + LAST2_FRAME, // LAST_LAST2_FRAMES, + LAST3_FRAME, // LAST_LAST3_FRAMES, + GOLDEN_FRAME, // LAST_GOLDEN_FRAMES, + ALTREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST3_FRAME, // LAST2_LAST3_FRAMES + GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES, + GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES, + ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES, + ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES, + }; + assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); + return lut[ref_idx]; +} + +PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi); + +PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi); + +static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi, + TransformationType type) { + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->bsize; + const int block_size_allowed = + AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; + return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION && + block_size_allowed; +} + +#if CONFIG_MISMATCH_DEBUG +static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, + int mi_row, int tx_blk_col, int tx_blk_row, + int subsampling_x, int 
subsampling_y) { + *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) + + (tx_blk_col << MI_SIZE_LOG2); + *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) + + (tx_blk_row << MI_SIZE_LOG2); +} +#endif + +enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision); + +struct buf_2d { + uint8_t *buf; + uint8_t *buf0; + int width; + int height; + int stride; +}; + +typedef struct eob_info { + uint16_t eob; + uint16_t max_scan_line; +} eob_info; + +typedef struct { + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]); + eob_info eob_data[MAX_MB_PLANE] + [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); +} CB_BUFFER; + +typedef struct macroblockd_plane { + PLANE_TYPE plane_type; + int subsampling_x; + int subsampling_y; + struct buf_2d dst; + struct buf_2d pre[2]; + ENTROPY_CONTEXT *above_entropy_context; + ENTROPY_CONTEXT *left_entropy_context; + + // The dequantizers below are true dequantizers used only in the + // dequantization process. They have the same coefficient + // shift/scale as TX. + int16_t seg_dequant_QTX[MAX_SEGMENTS][2]; + // Pointer to color index map of: + // - Current coding block, on encoder side. + // - Current superblock, on decoder side. + uint8_t *color_index_map; + + // block size in pixels + uint8_t width, height; + + qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; + qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL]; +} MACROBLOCKD_PLANE; + +#define BLOCK_OFFSET(i) ((i) << 4) + +/*!\endcond */ + +/*!\brief Parameters related to Wiener Filter */ +typedef struct { + /*! + * Vertical filter kernel. + */ + DECLARE_ALIGNED(16, InterpKernel, vfilter); + + /*! + * Horizontal filter kernel. + */ + DECLARE_ALIGNED(16, InterpKernel, hfilter); +} WienerInfo; + +/*!\brief Parameters related to Sgrproj Filter */ +typedef struct { + /*! + * Parameter index. + */ + int ep; + + /*! + * Weights for linear combination of filtered versions + */ + int xqd[2]; +} SgrprojInfo; + +/*!\cond */ + +#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32) +#define CFL_BUF_LINE (32) +#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3) +#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4) +#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE) +typedef struct cfl_ctx { + // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to avoid + // shifts) + uint16_t recon_buf_q3[CFL_BUF_SQUARE]; + // Q3 AC contributions (reconstructed luma pixels - tx block avg) + int16_t ac_buf_q3[CFL_BUF_SQUARE]; + + // Cache the DC_PRED when performing RDO, so it does not have to be recomputed + // for every scaling parameter + bool dc_pred_is_cached[CFL_PRED_PLANES]; + // Whether the DC_PRED cache is enabled. The DC_PRED cache is disabled when + // decoding. + bool use_dc_pred_cache; + // Only cache the first row of the DC_PRED + int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE]; + + // Height and width currently used in the CfL prediction buffer. + int buf_height, buf_width; + + int are_parameters_computed; + + // Chroma subsampling + int subsampling_x, subsampling_y; + + // Whether the reconstructed luma pixels need to be stored + int store_y; +} CFL_CTX; + +typedef struct dist_wtd_comp_params { + int use_dist_wtd_comp_avg; + int fwd_offset; + int bck_offset; +} DIST_WTD_COMP_PARAMS; + +struct scale_factors; + +/*!\endcond */ + +/*! \brief Variables related to current coding block. + * + * This is a common set of variables used by both encoder and decoder. 
+ * Most/all of the pointers are mere pointers to actual arrays that are
+ * allocated elsewhere. This is mostly for coding convenience.
+ */
+typedef struct macroblockd {
+  /**
+   * \name Position of current macroblock in mi units
+   */
+  /**@{*/
+  int mi_row; /*!< Row position in mi units. */
+  int mi_col; /*!< Column position in mi units. */
+  /**@}*/
+
+  /*!
+   * Same as cm->mi_params.mi_stride, copied here for convenience.
+   */
+  int mi_stride;
+
+  /*!
+   * True if current block transmits chroma information.
+   * More detail:
+   * Smallest supported block size for both luma and chroma plane is 4x4. Hence,
+   * in case of subsampled chroma plane (YUV 4:2:0 or YUV 4:2:2), multiple luma
+   * blocks smaller than 8x8 may be combined into one chroma block.
+   * For example, for YUV 4:2:0, let's say an 8x8 area is split into four 4x4
+   * luma blocks. Then, a single chroma block of size 4x4 will cover the area of
+   * these four luma blocks. This is implemented in the bitstream as follows:
+   * - There are four MB_MODE_INFO structs for the four luma blocks.
+   * - The first 3 MB_MODE_INFO structs have is_chroma_ref = false, and so do
+   * not transmit any information for chroma planes.
+   * - The last block will have is_chroma_ref = true and transmits chroma
+   * information for the 4x4 chroma block that covers the whole 8x8 area covered
+   * by the four luma blocks.
+   * Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+   */
+  bool is_chroma_ref;
+
+  /*!
+   * Info specific to each plane.
+   */
+  struct macroblockd_plane plane[MAX_MB_PLANE];
+
+  /*!
+   * Tile related info.
+   */
+  TileInfo tile;
+
+  /*!
+   * Appropriate offset inside cm->mi_params.mi_grid_base based on current
+   * mi_row and mi_col.
+   */
+  MB_MODE_INFO **mi;
+
+  /*!
+   * True if 4x4 block above the current block is available.
+   */
+  bool up_available;
+  /*!
+   * True if 4x4 block to the left of the current block is available.
+   */
+  bool left_available;
+  /*!
+   * True if the above chroma reference block is available.
+   */
+  bool chroma_up_available;
+  /*!
+   * True if the left chroma reference block is available.
+   */
+  bool chroma_left_available;
+
+  /*!
+   * MB_MODE_INFO for 4x4 block to the left of the current block, if
+   * left_available == true; otherwise NULL.
+   */
+  MB_MODE_INFO *left_mbmi;
+  /*!
+   * MB_MODE_INFO for 4x4 block above the current block, if
+   * up_available == true; otherwise NULL.
+   */
+  MB_MODE_INFO *above_mbmi;
+  /*!
+   * Left chroma reference block if is_chroma_ref == true for the current block
+   * and chroma_left_available == true; otherwise NULL.
+   * See also: the special case logic when current chroma block covers more than
+   * one luma block in set_mi_row_col().
+   */
+  MB_MODE_INFO *chroma_left_mbmi;
+  /*!
+   * Above chroma reference block if is_chroma_ref == true for the current block
+   * and chroma_up_available == true; otherwise NULL.
+   * See also: the special case logic when current chroma block covers more than
+   * one luma block in set_mi_row_col().
+   */
+  MB_MODE_INFO *chroma_above_mbmi;
+
+  /*!
+   * Appropriate offset based on current 'mi_row' and 'mi_col', inside
+   * 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
+   * 'MACROBLOCK' structs.
+   */
+  uint8_t *tx_type_map;
+  /*!
+   * Stride for 'tx_type_map'. Note that this may / may not be same as
+   * 'mi_stride', depending on which actual array 'tx_type_map' points to.
+   */
+  int tx_type_map_stride;
+
+  /**
+   * \name Distance of this macroblock from frame edges in 1/8th pixel units.
+ */ + /**@{*/ + int mb_to_left_edge; /*!< Distance from left edge */ + int mb_to_right_edge; /*!< Distance from right edge */ + int mb_to_top_edge; /*!< Distance from top edge */ + int mb_to_bottom_edge; /*!< Distance from bottom edge */ + /**@}*/ + + /*! + * Scale factors for reference frames of the current block. + * These are pointers into 'cm->ref_scale_factors'. + */ + const struct scale_factors *block_ref_scale_factors[2]; + + /*! + * - On encoder side: points to cpi->source, which is the buffer containing + * the current *source* frame (maybe filtered). + * - On decoder side: points to cm->cur_frame->buf, which is the buffer into + * which current frame is being *decoded*. + */ + const YV12_BUFFER_CONFIG *cur_buf; + + /*! + * Entropy contexts for the above blocks. + * above_entropy_context[i][j] corresponds to above entropy context for ith + * plane and jth mi column of this *frame*, wrt current 'mi_row'. + * These are pointers into 'cm->above_contexts.entropy'. + */ + ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE]; + /*! + * Entropy contexts for the left blocks. + * left_entropy_context[i][j] corresponds to left entropy context for ith + * plane and jth mi row of this *superblock*, wrt current 'mi_col'. + * Note: These contain actual data, NOT pointers. + */ + ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE]; + + /*! + * Partition contexts for the above blocks. + * above_partition_context[i] corresponds to above partition context for ith + * mi column of this *frame*, wrt current 'mi_row'. + * This is a pointer into 'cm->above_contexts.partition'. + */ + PARTITION_CONTEXT *above_partition_context; + /*! + * Partition contexts for the left blocks. + * left_partition_context[i] corresponds to left partition context for ith + * mi row of this *superblock*, wrt current 'mi_col'. + * Note: These contain actual data, NOT pointers. + */ + PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE]; + + /*! + * Transform contexts for the above blocks. + * above_txfm_context[i] corresponds to above transform context for ith mi col + * from the current position (mi row and mi column) for this *frame*. + * This is a pointer into 'cm->above_contexts.txfm'. + */ + TXFM_CONTEXT *above_txfm_context; + /*! + * Transform contexts for the left blocks. + * left_txfm_context[i] corresponds to left transform context for ith mi row + * from the current position (mi_row and mi_col) for this *superblock*. + * This is a pointer into 'left_txfm_context_buffer'. + */ + TXFM_CONTEXT *left_txfm_context; + /*! + * left_txfm_context_buffer[i] is the left transform context for ith mi_row + * in this *superblock*. + * Behaves like an internal actual buffer which 'left_txt_context' points to, + * and never accessed directly except to fill in initial default values. + */ + TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE]; + + /** + * \name Default values for the two restoration filters for each plane. + * Default values for the two restoration filters for each plane. + * These values are used as reference values when writing the bitstream. That + * is, we transmit the delta between the actual values in + * cm->rst_info[plane].unit_info[unit_idx] and these reference values. + */ + /**@{*/ + WienerInfo wiener_info[MAX_MB_PLANE]; /*!< Defaults for Wiener filter*/ + SgrprojInfo sgrproj_info[MAX_MB_PLANE]; /*!< Defaults for SGR filter */ + /**@}*/ + + /** + * \name Block dimensions in MB_MODE_INFO units. 
+   */
+  /**@{*/
+  uint8_t width;  /*!< Block width in MB_MODE_INFO units */
+  uint8_t height; /*!< Block height in MB_MODE_INFO units */
+  /**@}*/
+
+  /*!
+   * Contains the motion vector candidates found during motion vector prediction
+   * process. ref_mv_stack[i] contains the candidates for ith type of
+   * reference frame (single/compound). The actual number of candidates found in
+   * ref_mv_stack[i] is stored in either dcb->ref_mv_count[i] (decoder side)
+   * or mbmi_ext->ref_mv_count[i] (encoder side).
+   */
+  CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  /*!
+   * weight[i][j] is the weight for ref_mv_stack[i][j] and used to compute the
+   * DRL (dynamic reference list) mode contexts.
+   */
+  uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+
+  /*!
+   * True if this is the last vertical rectangular block in a VERTICAL or
+   * VERTICAL_4 partition.
+   */
+  bool is_last_vertical_rect;
+  /*!
+   * True if this is the 1st horizontal rectangular block in a HORIZONTAL or
+   * HORIZONTAL_4 partition.
+   */
+  bool is_first_horizontal_rect;
+
+  /*!
+   * Counts of each reference frame in the above and left neighboring blocks.
+   * NOTE: Take into account both single and comp references.
+   */
+  uint8_t neighbors_ref_counts[REF_FRAMES];
+
+  /*!
+   * Current CDFs of all the symbols for the current tile.
+   */
+  FRAME_CONTEXT *tile_ctx;
+
+  /*!
+   * Bit depth: copied from cm->seq_params->bit_depth for convenience.
+   */
+  int bd;
+
+  /*!
+   * Quantizer index for each segment (base qindex + delta for each segment).
+   */
+  int qindex[MAX_SEGMENTS];
+  /*!
+   * lossless[s] is true if segment 's' is coded losslessly.
+   */
+  int lossless[MAX_SEGMENTS];
+  /*!
+   * The Q index for the coding blocks in this superblock is stored in
+   * mbmi->current_qindex. When cm->delta_q_info.delta_q_present_flag is
+   * true, mbmi->current_qindex is computed by taking 'current_base_qindex' as
+   * the base, and adding any transmitted delta qindex on top of it.
+   * Precisely, this is the latest qindex used by the first coding block of a
+   * non-skip superblock in the current tile; OR
+   * same as cm->quant_params.base_qindex (if not explicitly set yet).
+   * Note: This is 'CurrentQIndex' in the AV1 spec.
+   */
+  int current_base_qindex;
+
+  /*!
+   * Same as cm->features.cur_frame_force_integer_mv.
+   */
+  int cur_frame_force_integer_mv;
+
+  /*!
+   * Pointer to cm->error.
+   */
+  struct aom_internal_error_info *error_info;
+
+  /*!
+   * Same as cm->global_motion.
+   */
+  const WarpedMotionParams *global_motion;
+
+  /*!
+   * Since the actual frame-level loop filter level is not available at the
+   * beginning of the tile on the encoder side (it is only available during
+   * actual filtering), we record the delta_lf (against the frame-level loop
+   * filter level) and code the delta between the previous superblock's delta
+   * lf and the current delta lf. It is equivalent to the delta between the
+   * previous superblock's actual lf and the current lf.
+   */
+  int8_t delta_lf_from_base;
+  /*!
+   * We have four frame filter levels for different plane and direction. So, to
+   * support the per superblock update, we need to add a few more params:
+   * 0. delta loop filter level for y plane vertical
+   * 1. delta loop filter level for y plane horizontal
+   * 2. delta loop filter level for u plane
+   * 3. delta loop filter level for v plane
+   * To make it consistent with the reference to each filter level in segment,
+   * we need to subtract 1, since
+   * - SEG_LVL_ALT_LF_Y_V = 1;
+   * - SEG_LVL_ALT_LF_Y_H = 2;
+   * - SEG_LVL_ALT_LF_U = 3;
+   * - SEG_LVL_ALT_LF_V = 4;
+   */
+  int8_t delta_lf[FRAME_LF_COUNT];
+  /*!
+   * cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
+   * current superblock has already been read from (decoder) / written to
+   * (encoder) the bitstream; and false otherwise.
+   * More detail:
+   * 1. CDEF strength is transmitted only once per CDEF unit, in the 1st
+   * non-skip coding block. So, we need this array to keep track of whether CDEF
+   * strengths for the given CDEF units have been transmitted yet or not.
+   * 2. Superblock size can be either 128x128 or 64x64, but CDEF unit size is
+   * fixed to be 64x64. So, there may be 4 CDEF units within a superblock (if
+   * superblock size is 128x128). Hence the array size is 4.
+   * 3. In the current implementation, CDEF strength for this CDEF unit is
+   * stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
+   * cm->mi_params.mi_grid_base).
+   */
+  bool cdef_transmitted[4];
+
+  /*!
+   * Mask for this block used for compound prediction.
+   */
+  uint8_t *seg_mask;
+
+  /*!
+   * CFL (chroma from luma) related parameters.
+   */
+  CFL_CTX cfl;
+
+  /*!
+   * Offset to plane[p].color_index_map.
+   * Currently:
+   * - On encoder side, this is always 0 as 'color_index_map' is allocated per
+   * *coding block* there.
+   * - On decoder side, this may be non-zero, as 'color_index_map' is a (static)
+   * memory pointing to the base of a *superblock* there, and we need an offset
+   * to it to get the color index map for current coding block.
+   */
+  uint16_t color_index_map_offset[2];
+
+  /*!
+   * Temporary buffer used for convolution in case of compound reference only
+   * for (weighted or uniform) averaging operation.
+   * There are pointers to actual buffers allocated elsewhere: e.g.
+   * - In decoder, 'pbi->td.tmp_conv_dst' or
+   * 'pbi->thread_data[t].td->xd.tmp_conv_dst' and
+   * - In encoder, 'x->tmp_conv_dst' or
+   * 'cpi->tile_thr_data[t].td->mb.tmp_conv_dst'.
+   */
+  CONV_BUF_TYPE *tmp_conv_dst;
+  /*!
+   * Temporary buffers used to build OBMC prediction by above (index 0) and left
+   * (index 1) predictors respectively.
+   * tmp_obmc_bufs[i][p * MAX_SB_SQUARE] is the buffer used for plane 'p'.
+   * There are pointers to actual buffers allocated elsewhere: e.g.
+   * - In decoder, 'pbi->td.tmp_obmc_bufs' or
+   * 'pbi->thread_data[t].td->xd.tmp_obmc_bufs' and
+   * - In encoder, 'x->tmp_pred_bufs' or
+   * 'cpi->tile_thr_data[t].td->mb.tmp_pred_bufs'.
+   */
+  uint8_t *tmp_obmc_bufs[2];
+} MACROBLOCKD;
+
+/*!\cond */
+
+static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
+#else
+  (void)xd;
+  return 0;
+#endif
+}
+
+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+             ? CONVERT_TO_BYTEPTR(buf16)
+             : buf16;
+#else
+  (void)xd;
+  return buf16;
+#endif
+}
+
+typedef struct BitDepthInfo {
+  int bit_depth;
+  /*! Is the image buffer high bit depth?
+   * Low bit depth buffer uses uint8_t.
+   * High bit depth buffer uses uint16_t.
+ * Equivalent to cm->seq_params->use_highbitdepth + */ + int use_highbitdepth_buf; +} BitDepthInfo; + +static INLINE BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) { + BitDepthInfo bit_depth_info; + bit_depth_info.bit_depth = xd->bd; + bit_depth_info.use_highbitdepth_buf = is_cur_buf_hbd(xd); + assert(IMPLIES(!bit_depth_info.use_highbitdepth_buf, + bit_depth_info.bit_depth == 8)); + return bit_depth_info; +} + +static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_4X4: return 0; + case BLOCK_8X8: return 1; + case BLOCK_16X16: return 2; + case BLOCK_32X32: return 3; + case BLOCK_64X64: return 4; + case BLOCK_128X128: return 5; + default: return SQR_BLOCK_SIZES; + } +} + +// For a square block size 'bsize', returns the size of the sub-blocks used by +// the given partition type. If the partition produces sub-blocks of different +// sizes, then the function returns the largest sub-block size. +// Implements the Partition_Subsize lookup table in the spec (Section 9.3. +// Conversion tables). +// Note: the input block size should be square. +// Otherwise it's considered invalid. +static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + if (partition == PARTITION_INVALID) { + return BLOCK_INVALID; + } else { + const int sqr_bsize_idx = get_sqr_bsize_idx(bsize); + return sqr_bsize_idx >= SQR_BLOCK_SIZES + ? BLOCK_INVALID + : subsize_lookup[partition][sqr_bsize_idx]; + } +} + +static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi, + PLANE_TYPE plane_type) { + static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = { + DCT_DCT, // DC_PRED + ADST_DCT, // V_PRED + DCT_ADST, // H_PRED + DCT_DCT, // D45_PRED + ADST_ADST, // D135_PRED + ADST_DCT, // D113_PRED + DCT_ADST, // D157_PRED + DCT_ADST, // D203_PRED + ADST_DCT, // D67_PRED + ADST_ADST, // SMOOTH_PRED + ADST_DCT, // SMOOTH_V_PRED + DCT_ADST, // SMOOTH_H_PRED + ADST_ADST, // PAETH_PRED + }; + const PREDICTION_MODE mode = + (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); + assert(mode < INTRA_MODES); + return _intra_mode_to_tx_type[mode]; +} + +static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; } + +static INLINE int block_signals_txsize(BLOCK_SIZE bsize) { + return bsize > BLOCK_4X4; +} + +// Number of transform types in each set type +static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = { + 1, 2, 5, 7, 12, 16, +}; + +static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +}; + +// The bitmask corresponds to the transform types as defined in +// enums.h TX_TYPE enumeration type. Setting the bit 0 means to disable +// the use of the corresponding transform type in that table. +// The av1_derived_intra_tx_used_flag table is used when +// use_reduced_intra_txset is set to 2, where one only searches +// the transform types derived from residual statistics. 
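// (Editorial note, not part of the upstream sources: each flag word below is
// a 16-bit mask indexed by TX_TYPE, so a given transform type is enabled iff
// its bit is set. A minimal check, assuming the TX_TYPE ordering from
// enums.h:
//
//   static INLINE int tx_type_enabled(uint16_t flag, TX_TYPE tx_type) {
//     return (flag >> tx_type) & 1;
//   }
//
// For example, 0x0209 has bits 0, 3 and 9 set, i.e. DCT_DCT, ADST_ADST and
// IDTX are the only types searched for DC_PRED.)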
+static const uint16_t av1_derived_intra_tx_used_flag[INTRA_MODES] = {
+  0x0209,  // DC_PRED:       0000 0010 0000 1001
+  0x0403,  // V_PRED:        0000 0100 0000 0011
+  0x0805,  // H_PRED:        0000 1000 0000 0101
+  0x020F,  // D45_PRED:      0000 0010 0000 1111
+  0x0009,  // D135_PRED:     0000 0000 0000 1001
+  0x0009,  // D113_PRED:     0000 0000 0000 1001
+  0x0009,  // D157_PRED:     0000 0000 0000 1001
+  0x0805,  // D203_PRED:     0000 1000 0000 0101
+  0x0403,  // D67_PRED:      0000 0100 0000 0011
+  0x0205,  // SMOOTH_PRED:   0000 0010 0000 0101
+  0x0403,  // SMOOTH_V_PRED: 0000 0100 0000 0011
+  0x0805,  // SMOOTH_H_PRED: 0000 1000 0000 0101
+  0x0209,  // PAETH_PRED:    0000 0010 0000 1001
+};
+
+static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = {
+  0x080F,  // DC_PRED:       0000 1000 0000 1111
+  0x040F,  // V_PRED:        0000 0100 0000 1111
+  0x080F,  // H_PRED:        0000 1000 0000 1111
+  0x020F,  // D45_PRED:      0000 0010 0000 1111
+  0x080F,  // D135_PRED:     0000 1000 0000 1111
+  0x040F,  // D113_PRED:     0000 0100 0000 1111
+  0x080F,  // D157_PRED:     0000 1000 0000 1111
+  0x080F,  // D203_PRED:     0000 1000 0000 1111
+  0x040F,  // D67_PRED:      0000 0100 0000 1111
+  0x080F,  // SMOOTH_PRED:   0000 1000 0000 1111
+  0x040F,  // SMOOTH_V_PRED: 0000 0100 0000 1111
+  0x080F,  // SMOOTH_H_PRED: 0000 1000 0000 1111
+  0x0C0E,  // PAETH_PRED:    0000 1100 0000 1110
+};
+
+static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
+  0x0001,  // 0000 0000 0000 0001
+  0x0201,  // 0000 0010 0000 0001
+  0x020F,  // 0000 0010 0000 1111
+  0x0E0F,  // 0000 1110 0000 1111
+  0x0FFF,  // 0000 1111 1111 1111
+  0xFFFF,  // 1111 1111 1111 1111
+};
+
+static const TxSetType av1_ext_tx_set_lookup[2][2] = {
+  { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX },
+  { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT },
+};
+
+static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
+                                                int use_reduced_set) {
+  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
+  if (tx_size_sqr_up == TX_32X32)
+    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
+  if (use_reduced_set)
+    return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
+  const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
+  return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16];
+}
+
+// Maps tx set types to the indices.
+static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = {
+  { // Intra
+    0, -1, 2, 1, -1, -1 },
+  { // Inter
+    0, 3, -1, -1, 2, 1 },
+};
+
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter,
+                                 int use_reduced_set) {
+  const TxSetType set_type =
+      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
+  return ext_tx_set_index[is_inter][set_type];
+}
+
+static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter,
+                                   int use_reduced_set) {
+  const int set_type =
+      av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
+  return av1_num_ext_tx_set[set_type];
+}
+
+#define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2))
+#define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ?
(t1) : (t2)) + +static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { + const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize]; + if (bsize == BLOCK_4X4) + return AOMMIN(max_txsize_lookup[bsize], largest_tx_size); + if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size) + return max_rect_tx_size; + else + return largest_tx_size; +} + +static const uint8_t mode_to_angle_map[INTRA_MODES] = { + 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, +}; + +// Converts block_index for given transform size to index of the block in raster +// order. +static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size, + int block_idx) { + // For transform size 4x8, the possible block_idx values are 0 & 2, because + // block_idx values are incremented in steps of size 'tx_width_unit x + // tx_height_unit'. But, for this transform size, block_idx = 2 corresponds to + // block number 1 in raster order, inside an 8x8 MI block. + // For any other transform size, the two indices are equivalent. + return (tx_size == TX_4X8 && block_idx == 2) ? 1 : block_idx; +} + +// Inverse of above function. +// Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now. +static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size, + int raster_order) { + assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4); + // We ensure that block indices are 0 & 2 if tx size is 4x8 or 8x4. + return (tx_size == TX_4X4) ? raster_order : (raster_order > 0) ? 2 : 0; +} + +static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, + const MACROBLOCKD *xd, + TX_SIZE tx_size, + int use_screen_content_tools) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || + xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 || + use_screen_content_tools) + return DEFAULT_INTER_TX_TYPE; + + return intra_mode_to_tx_type(mbmi, plane_type); +} + +// Implements the get_plane_residual_size() function in the spec (Section +// 5.11.38. Get plane residual size function). 
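// (Editorial note, not part of the upstream sources: for example, with 4:2:0
// subsampling (subsampling_x = 1, subsampling_y = 1) a BLOCK_16X16 luma block
// maps to a BLOCK_8X8 chroma residual block, and with 4:2:2
// (subsampling_x = 1, subsampling_y = 0) it maps to BLOCK_8X16.)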
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + int subsampling_x, + int subsampling_y) { + assert(bsize < BLOCK_SIZES_ALL); + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + return av1_ss_size_lookup[bsize][subsampling_x][subsampling_y]; +} + +/* + * Logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ +static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1, + }; + const int index = + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); + assert(index < INTER_TX_SIZE_BUF_LEN); + return index; +} + +#if CONFIG_INSPECTION +/* + * Here is the logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ +static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2, + }; + const int index = + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); + assert(index < TXK_TYPE_BUF_LEN); + return index; +} +#endif // CONFIG_INSPECTION + +static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row, + int blk_col, TX_SIZE tx_size, + TX_TYPE tx_type) { + const int stride = xd->tx_type_map_stride; + xd->tx_type_map[blk_row * stride + blk_col] = tx_type; + + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + // The 16x16 unit is due to the constraint from tx_64x64 which sets the + // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block + // size, the constraint takes effect in 32x16 / 16x32 size too. To solve + // the intricacy, cover all the 16x16 units inside a 64 level transform. 
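// (Editorial note, not part of the upstream sources: concretely, a TX_64X64
// transform spans 16x16 mi units and tx_size_wide_unit[TX_16X16] is 4, so the
// loop below re-stamps tx_type at every 4-mi step in both directions, i.e. 16
// writes, so that any 16x16 sub-unit consulted later holds the correct type.)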
+ if (txw == tx_size_wide_unit[TX_64X64] || + txh == tx_size_high_unit[TX_64X64]) { + const int tx_unit = tx_size_wide_unit[TX_16X16]; + for (int idy = 0; idy < txh; idy += tx_unit) { + for (int idx = 0; idx < txw; idx += tx_unit) { + xd->tx_type_map[(blk_row + idy) * stride + blk_col + idx] = tx_type; + } + } + } +} + +static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd, + PLANE_TYPE plane_type, int blk_row, + int blk_col, TX_SIZE tx_size, + int reduced_tx_set) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { + return DCT_DCT; + } + + TX_TYPE tx_type; + if (plane_type == PLANE_TYPE_Y) { + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + } else { + if (is_inter_block(mbmi)) { + // scale back to y plane's coordinate + const struct macroblockd_plane *const pd = &xd->plane[plane_type]; + blk_row <<= pd->subsampling_y; + blk_col <<= pd->subsampling_x; + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + } else { + // In intra mode, uv planes don't share the same prediction mode as y + // plane, so the tx_type should not be shared + tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV); + } + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); + if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT; + } + assert(tx_type < TX_TYPES); + assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), + reduced_tx_set)][tx_type]); + return tx_type; +} + +void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, + const int num_planes); + +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * int depth = 0; + * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + */ +static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) { + static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + return bsize_to_max_depth_table[bsize]; +} + +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * assert(tx_size != TX_4X4); + * int depth = 0; + * while (tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + * assert(depth < 10); + */ +static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4, + }; + const int depth = bsize_to_tx_size_depth_table[bsize]; + assert(depth <= MAX_TX_CATS); + return depth - 1; +} + +static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) { + TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + TX_SIZE tx_size = max_tx_size; + for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size]; + return tx_size; +} + +static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) { + switch (tx_size) { + case TX_64X64: + case TX_64X32: + case TX_32X64: return TX_32X32; + case TX_64X16: return TX_32X16; + case TX_16X64: return TX_16X32; + default: return tx_size; + } +} + +static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const TX_SIZE uv_tx 
= max_txsize_rect_lookup[plane_bsize]; + return av1_get_adjusted_tx_size(uv_tx); +} + +static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + if (xd->lossless[mbmi->segment_id]) return TX_4X4; + if (plane == 0) return mbmi->tx_size; + const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; + return av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, + pd->subsampling_y); +} + +void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, + const int num_planes); + +void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes); + +void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes); + +typedef void (*foreach_transformed_block_visitor)(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); + +void av1_set_entropy_contexts(const MACROBLOCKD *xd, + struct macroblockd_plane *pd, int plane, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff); + +#define MAX_INTERINTRA_SB_SQUARE 32 * 32 +static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) { + return (mbmi->ref_frame[0] > INTRA_FRAME && + mbmi->ref_frame[1] == INTRA_FRAME); +} + +static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) { + return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32); +} + +static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) { + return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END); +} + +static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) { + return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME); +} + +static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) { + return is_interintra_allowed_bsize(mbmi->bsize) && + is_interintra_allowed_mode(mbmi->mode) && + is_interintra_allowed_ref(mbmi->ref_frame); +} + +static INLINE int is_interintra_allowed_bsize_group(int group) { + int i; + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + if (size_group_lookup[i] == group && + is_interintra_allowed_bsize((BLOCK_SIZE)i)) { + return 1; + } + } + return 0; +} + +static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) { + return mbmi->ref_frame[0] > INTRA_FRAME && + mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi); +} + +static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4; + const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize]; + if (plane == 0) return max_txsize; // luma + return av1_get_adjusted_tx_size(max_txsize); // chroma +} + +static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; +} + +static INLINE int is_motion_variation_allowed_compound( + const MB_MODE_INFO *mbmi) { + return !has_second_ref(mbmi); +} + +// input: log2 of length, 0(4), 1(8), ... 
+static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 }; + +static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) { + return mbmi->overlappable_neighbors != 0; +} + +static INLINE MOTION_MODE +motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, int allow_warped_motion) { + if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; + if (xd->cur_frame_force_integer_mv == 0) { + const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype; + if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION; + } + if (is_motion_variation_allowed_bsize(mbmi->bsize) && + is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME && + is_motion_variation_allowed_compound(mbmi)) { + assert(!has_second_ref(mbmi)); + if (mbmi->num_proj_ref >= 1 && allow_warped_motion && + !xd->cur_frame_force_integer_mv && + !av1_is_scaled(xd->block_ref_scale_factors[0])) { + return WARPED_CAUSAL; + } + return OBMC_CAUSAL; + } + return SIMPLE_TRANSLATION; +} + +static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { + return (is_inter_block(mbmi)); +} + +static INLINE int av1_allow_palette(int allow_screen_content_tools, + BLOCK_SIZE sb_type) { + assert(sb_type < BLOCK_SIZES_ALL); + return allow_screen_content_tools && + block_size_wide[sb_type] <= MAX_PALETTE_BLOCK_WIDTH && + block_size_high[sb_type] <= MAX_PALETTE_BLOCK_HEIGHT && + sb_type >= BLOCK_8X8; +} + +// Returns sub-sampled dimensions of the given block. +// The output values for 'rows_within_bounds' and 'cols_within_bounds' will +// differ from 'height' and 'width' when part of the block is outside the +// right +// and/or bottom image boundary. +static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane, + const MACROBLOCKD *xd, int *width, + int *height, + int *rows_within_bounds, + int *cols_within_bounds) { + const int block_height = block_size_high[bsize]; + const int block_width = block_size_wide[bsize]; + const int block_rows = (xd->mb_to_bottom_edge >= 0) + ? block_height + : (xd->mb_to_bottom_edge >> 3) + block_height; + const int block_cols = (xd->mb_to_right_edge >= 0) + ? block_width + : (xd->mb_to_right_edge >> 3) + block_width; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0)); + assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0)); + assert(block_width >= block_cols); + assert(block_height >= block_rows); + const int plane_block_width = block_width >> pd->subsampling_x; + const int plane_block_height = block_height >> pd->subsampling_y; + // Special handling for chroma sub8x8. + const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4; + const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4; + if (width) { + *width = plane_block_width + 2 * is_chroma_sub8_x; + assert(*width >= 0); + } + if (height) { + *height = plane_block_height + 2 * is_chroma_sub8_y; + assert(*height >= 0); + } + if (rows_within_bounds) { + *rows_within_bounds = + (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y; + assert(*rows_within_bounds >= 0); + } + if (cols_within_bounds) { + *cols_within_bounds = + (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x; + assert(*cols_within_bounds >= 0); + } +} + +/* clang-format off */ +// Pointer to a three-dimensional array whose first dimension is PALETTE_SIZES. 
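// (Editorial note, not part of the upstream sources: the typedef below is a
// pointer to a two-dimensional array, so a MapCdf value is indexed as
// map_cdf[palette_size_idx][color_index_ctx][color_symbol], with the
// outermost PALETTE_SIZES dimension being the one mentioned above.)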
+typedef aom_cdf_prob (*MapCdf)[PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; +// Pointer to a const three-dimensional array whose first dimension is +// PALETTE_SIZES. +typedef const int (*ColorCost)[PALETTE_COLOR_INDEX_CONTEXTS][PALETTE_COLORS]; +/* clang-format on */ + +typedef struct { + int rows; + int cols; + int n_colors; + int plane_width; + int plane_height; + uint8_t *color_map; + MapCdf map_cdf; + ColorCost color_cost; +} Av1ColorMapParam; + +static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int ref; + + // First check if all modes are GLOBALMV + if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0; + + if (AOMMIN(mi_size_wide[mbmi->bsize], mi_size_high[mbmi->bsize]) < 2) + return 0; + + // Now check if all global motion is non translational + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0; + } + return 1; +} + +static INLINE PLANE_TYPE get_plane_type(int plane) { + return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV; +} + +static INLINE int av1_get_max_eob(TX_SIZE tx_size) { + if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) { + return 1024; + } + if (tx_size == TX_16X64 || tx_size == TX_64X16) { + return 512; + } + return tx_size_2d[tx_size]; +} + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_BLOCKD_H_ diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c new file mode 100644 index 0000000000..12e9545441 --- /dev/null +++ b/third_party/aom/av1/common/cdef.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cdef.h" +#include "av1/common/cdef_block.h" +#include "av1/common/reconinter.h" +#include "av1/common/thread_common.h" + +static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, + int mi_stride) { + MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col; + for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) { + for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) { + if (!mbmi[c]->skip_txfm) return 0; + } + } + + return 1; +} + +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bs) { + MB_MODE_INFO **grid = mi_params->mi_grid_base; + int maxc = mi_params->mi_cols - mi_col; + int maxr = mi_params->mi_rows - mi_row; + + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) + maxc = AOMMIN(maxc, MI_SIZE_128X128); + else + maxc = AOMMIN(maxc, MI_SIZE_64X64); + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) + maxr = AOMMIN(maxr, MI_SIZE_128X128); + else + maxr = AOMMIN(maxr, MI_SIZE_64X64); + + const int r_step = 2; // mi_size_high[BLOCK_8X8] + const int c_step = 2; // mi_size_wide[BLOCK_8X8] + const int r_shift = 1; + const int c_shift = 1; + int count = 0; + for (int r = 0; r < maxr; r += r_step) { + for (int c = 0; c < maxc; c += c_step) { + if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, + mi_params->mi_stride)) { + dlist[count].by = r >> r_shift; + dlist[count].bx = c >> c_shift; + count++; + } + } + } + return count; +} + +void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, int width, + int height) { + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride, + const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, + int hsize) { + const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; + cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); +} + +void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride, + const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, + int hsize) { + const uint16_t *base = + &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; + cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); +} + +void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, + int dstride, const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, int hsize) { + if (cm->seq_params->use_highbitdepth) { + av1_cdef_copy_sb8_16_highbd(dst, dstride, src, src_voffset, src_hoffset, + sstride, vsize, hsize); + } else { + av1_cdef_copy_sb8_16_lowbd(dst, dstride, src, src_voffset, src_hoffset, + sstride, vsize, hsize); + } +} + +static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, + int sstride, int v, int h) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +// Prepares intermediate input buffer for CDEF. 
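// The working buffer 'src' holds 16-bit pixels in rows of CDEF_BSTRIDE
// entries: the current (up to 64x64) filter block plus CDEF_HBORDER columns
// and CDEF_VBORDER rows of context on each side. Context that falls outside
// the frame, or across an unavailable neighbour, is filled with
// CDEF_VERY_LARGE so those taps are effectively ignored by the filter.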
+// Inputs: +// cm: Pointer to common structure. +// fb_info: Pointer to the CDEF block-level parameter structure. +// colbuf: Left column buffer for CDEF. +// cdef_left: Left block is filtered or not. +// fbc, fbr: col and row index of a block. +// plane: plane index Y/CB/CR. +// Returns: +// Nothing will be returned. +static void cdef_prepare_fb(const AV1_COMMON *const cm, CdefBlockInfo *fb_info, + uint16_t **const colbuf, const int cdef_left, + int fbc, int fbr, int plane) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + uint16_t *src = fb_info->src; + const int luma_stride = + ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4); + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + int cstart = 0; + if (!cdef_left) cstart = -CDEF_HBORDER; + int rend, cend; + const int nhb = + AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + const int nvb = + AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + const int hsize = nhb << fb_info->mi_wide_l2; + const int vsize = nvb << fb_info->mi_high_l2; + const uint16_t *top_linebuf = fb_info->top_linebuf[plane]; + const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane]; + const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE; + const int stride = + luma_stride >> (plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x); + + if (fbc == nhfb - 1) + cend = hsize; + else + cend = hsize + CDEF_HBORDER; + + if (fbr == nvfb - 1) + rend = vsize; + else + rend = vsize + CDEF_VBORDER; + + /* Copy in the pixels we need from the current superblock for + deringing.*/ + av1_cdef_copy_sb8_16( + cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart], + CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart, + fb_info->dst_stride, vsize, cend - cstart); + + /* Copy in the pixels we need for the current superblock from bottom buffer.*/ + if (fbr < nvfb - 1) { + copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize); + } else { + fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, + hsize, CDEF_VERY_LARGE); + } + if (fbr < nvfb - 1 && fbc > 0) { + copy_rect(&src[bot_offset], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride, + CDEF_VBORDER, CDEF_HBORDER); + } else { + fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (fbr < nvfb - 1 && fbc < nhfb - 1) { + copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER, + CDEF_HBORDER); + } else { + fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, + CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + + /* Copy in the pixels we need from the current superblock from top buffer.*/ + if (fbr > 0) { + copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset], + stride, CDEF_VBORDER, hsize); + } else { + fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize, + CDEF_VERY_LARGE); + } + if (fbr > 0 && fbc > 0) { + copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER], + stride, CDEF_VBORDER, CDEF_HBORDER); + } else { + fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (fbr > 0 && fbc < nhfb - 1) { + copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, + &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER, + CDEF_HBORDER); + 
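    /* Top-right corner of the context ring, copied from the previous
       filter-block row's saved top line buffer. */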
} else { + fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, + CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (cdef_left) { + /* If we deringed the superblock on the left then we need to copy in + saved pixels. */ + copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER, + rend + CDEF_VBORDER, CDEF_HBORDER); + } + /* Saving pixels in case we need to dering the superblock on the + right. */ + copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, + rend + CDEF_VBORDER, CDEF_HBORDER); + + if (fb_info->frame_boundary[LEFT]) { + fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (fb_info->frame_boundary[RIGHT]) { + fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, + vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } +} + +static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, + uint8_t use_highbitdepth) { + int offset = fb_info->dst_stride * fb_info->roffset + fb_info->coffset; + if (use_highbitdepth) { + av1_cdef_filter_fb( + NULL, CONVERT_TO_SHORTPTR(fb_info->dst + offset), fb_info->dst_stride, + &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], + fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane, + fb_info->dlist, fb_info->cdef_count, fb_info->level, + fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift); + } else { + av1_cdef_filter_fb( + fb_info->dst + offset, NULL, fb_info->dst_stride, + &fb_info->src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], + fb_info->xdec, fb_info->ydec, fb_info->dir, NULL, fb_info->var, plane, + fb_info->dlist, fb_info->cdef_count, fb_info->level, + fb_info->sec_strength, fb_info->damping, fb_info->coeff_shift); + } +} + +// Initializes block-level parameters for CDEF. +static INLINE void cdef_init_fb_col(const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, int *level, + int *sec_strength, int fbc, int fbr, + int plane) { + const PLANE_TYPE plane_type = get_plane_type(plane); + fb_info->level = level[plane_type]; + fb_info->sec_strength = sec_strength[plane_type]; + fb_info->dst = xd->plane[plane].dst.buf; + fb_info->dst_stride = xd->plane[plane].dst.stride; + + fb_info->xdec = xd->plane[plane].subsampling_x; + fb_info->ydec = xd->plane[plane].subsampling_y; + fb_info->mi_wide_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_x; + fb_info->mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + fb_info->roffset = MI_SIZE_64X64 * fbr << fb_info->mi_high_l2; + fb_info->coffset = MI_SIZE_64X64 * fbc << fb_info->mi_wide_l2; +} + +static void cdef_fb_col(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, uint16_t **const colbuf, + int *cdef_left, int fbc, int fbr) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mbmi_cdef_strength = + mi_params + ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] + ->cdef_strength; + const int num_planes = av1_num_planes(cm); + int is_zero_level[PLANE_TYPES] = { 1, 1 }; + int level[PLANE_TYPES] = { 0 }; + int sec_strength[PLANE_TYPES] = { 0 }; + const CdefInfo *const cdef_info = &cm->cdef_info; + + if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] == NULL || + mbmi_cdef_strength == -1) { + av1_zero_array(cdef_left, num_planes); + return; + } + + // Compute level and secondary strength for planes + level[PLANE_TYPE_Y] = + cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + sec_strength[PLANE_TYPE_Y] = + 
cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + sec_strength[PLANE_TYPE_Y] += sec_strength[PLANE_TYPE_Y] == 3; + is_zero_level[PLANE_TYPE_Y] = + (level[PLANE_TYPE_Y] == 0) && (sec_strength[PLANE_TYPE_Y] == 0); + + if (num_planes > 1) { + level[PLANE_TYPE_UV] = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + sec_strength[PLANE_TYPE_UV] = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + sec_strength[PLANE_TYPE_UV] += sec_strength[PLANE_TYPE_UV] == 3; + is_zero_level[PLANE_TYPE_UV] = + (level[PLANE_TYPE_UV] == 0) && (sec_strength[PLANE_TYPE_UV] == 0); + } + + if (is_zero_level[PLANE_TYPE_Y] && is_zero_level[PLANE_TYPE_UV]) { + av1_zero_array(cdef_left, num_planes); + return; + } + + fb_info->cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64, + fbc * MI_SIZE_64X64, + fb_info->dlist, BLOCK_64X64); + if (!fb_info->cdef_count) { + av1_zero_array(cdef_left, num_planes); + return; + } + + for (int plane = 0; plane < num_planes; plane++) { + // Do not skip cdef filtering for luma plane as filter direction is + // computed based on luma. + if (plane && is_zero_level[get_plane_type(plane)]) { + cdef_left[plane] = 0; + continue; + } + cdef_init_fb_col(xd, fb_info, level, sec_strength, fbc, fbr, plane); + cdef_prepare_fb(cm, fb_info, colbuf, cdef_left[plane], fbc, fbr, plane); + cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth); + cdef_left[plane] = 1; + } +} + +// Initializes row-level parameters for CDEF frame. +void av1_cdef_init_fb_row(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr) { + (void)cdef_sync; + const int num_planes = av1_num_planes(cm); + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + const bool ping_pong = fbr & 1; + // for the current filter block, it's top left corner mi structure (mi_tl) + // is first accessed to check whether the top and left boundaries are + // frame boundaries. Then bottom-left and top-right mi structures are + // accessed to check whether the bottom and right boundaries + // (respectively) are frame boundaries. + // + // Note that we can't just check the bottom-right mi structure - eg. if + // we're at the right-hand edge of the frame but not the bottom, then + // the bottom-right mi is NULL but the bottom-left is not. + fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0; + if (fbr != nvfb - 1) + fb_info->frame_boundary[BOTTOM] = + (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0; + else + fb_info->frame_boundary[BOTTOM] = 1; + + fb_info->src = src; + fb_info->damping = cm->cdef_info.cdef_damping; + fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); + av1_zero(fb_info->dir); + av1_zero(fb_info->var); + + for (int plane = 0; plane < num_planes; plane++) { + const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + const int stride = luma_stride >> xd->plane[plane].subsampling_x; + // here ping-pong buffers are maintained for top linebuf + // to avoid linebuf over-write by consecutive row. 
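    // (fbr & 1) selects which half of the double buffer receives this row's
    // saved lines while the half written by the previous row is still being
    // read: the lines saved while processing row 'fbr' become the top
    // context of row 'fbr + 1', so alternating halves keeps them from being
    // overwritten before they are consumed.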
+ uint16_t *const top_linebuf = + &linebuf[plane][ping_pong * CDEF_VBORDER * stride]; + fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride]; + + if (fbr != nvfb - 1) // top line buffer copy + av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf, + offset - CDEF_VBORDER, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + fb_info->top_linebuf[plane] = + &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride]; + + if (fbr != nvfb - 1) // bottom line buffer copy + av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride, + xd->plane[plane].dst.buf, offset, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + } +} + +void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, + uint16_t **const linebuf, uint16_t **const colbuf, + uint16_t *const src, int fbr, + cdef_init_fb_row_t cdef_init_fb_row_fn, + struct AV1CdefSyncData *const cdef_sync, + struct aom_internal_error_info *error_info) { + // TODO(aomedia:3276): Pass error_info to the low-level functions as required + // in future to handle error propagation. + (void)error_info; + CdefBlockInfo fb_info; + int cdef_left[MAX_MB_PLANE] = { 1, 1, 1 }; + const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + + cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr); +#if CONFIG_MULTITHREAD + if (cdef_sync && cm->cdef_info.allocated_num_workers > 1) { + pthread_mutex_lock(cdef_sync->mutex_); + const bool cdef_mt_exit = cdef_sync->cdef_mt_exit; + pthread_mutex_unlock(cdef_sync->mutex_); + // Exit in case any worker has encountered an error. + if (cdef_mt_exit) return; + } +#endif + for (int fbc = 0; fbc < nhfb; fbc++) { + fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0; + if (fbc != nhfb - 1) + fb_info.frame_boundary[RIGHT] = + (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 1 : 0; + else + fb_info.frame_boundary[RIGHT] = 1; + cdef_fb_col(cm, xd, &fb_info, colbuf, &cdef_left[0], fbc, fbr); + } +} + +// Perform CDEF on input frame. +// Inputs: +// frame: Pointer to input frame buffer. +// cm: Pointer to common structure. +// xd: Pointer to common current coding block structure. +// Returns: +// Nothing will be returned. +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, + MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) { + const int num_planes = av1_num_planes(cm); + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, + num_planes); + + for (int fbr = 0; fbr < nvfb; fbr++) + av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf, + cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL, + xd->error_info); +} diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h new file mode 100644 index 0000000000..a56cd9db4a --- /dev/null +++ b/third_party/aom/av1/common/cdef.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_CDEF_H_ +#define AOM_AV1_COMMON_CDEF_H_ + +#define CDEF_STRENGTH_BITS 6 + +#define CDEF_PRI_STRENGTHS 16 +#define CDEF_SEC_STRENGTHS 4 + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cdef_block.h" + +enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY); + +struct AV1CdefSyncData; + +/*!\brief Parameters related to CDEF Block */ +typedef struct { + uint16_t *src; /*!< CDEF intermediate buffer */ + uint16_t *top_linebuf[MAX_MB_PLANE]; /*!< CDEF top line buffer */ + uint16_t *bot_linebuf[MAX_MB_PLANE]; /*!< CDEF bottom line buffer */ + uint8_t *dst; /*!< CDEF destination buffer */ + cdef_list + dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; /*!< CDEF 8x8 block positions */ + + int xdec; /*!< Sub-sampling X */ + int ydec; /*!< Sub-sampling X */ + int mi_wide_l2; /*!< Pixels per mi unit in width */ + int mi_high_l2; /*!< Pixels per mi unit in height */ + int frame_boundary[BOUNDARIES]; /*!< frame boundaries */ + + int damping; /*!< CDEF damping factor */ + int coeff_shift; /*!< Bit-depth based shift for calculating filter strength */ + int level; /*!< CDEF filtering level */ + int sec_strength; /*!< CDEF secondary strength */ + int cdef_count; /*!< Number of CDEF sub-blocks in superblock */ + int dir[CDEF_NBLOCKS] + [CDEF_NBLOCKS]; /*!< CDEF filter direction for all 8x8 sub-blocks*/ + int var[CDEF_NBLOCKS][CDEF_NBLOCKS]; /*!< variance for all 8x8 sub-blocks */ + + int dst_stride; /*!< CDEF destination buffer stride */ + int coffset; /*!< current superblock offset in a row */ + int roffset; /*!< current row offset */ +} CdefBlockInfo; + +static INLINE int sign(int i) { return i < 0 ? -1 : 1; } + +static INLINE int constrain(int diff, int threshold, int damping) { + if (!threshold) return 0; + + const int shift = AOMMAX(0, damping - get_msb(threshold)); + return sign(diff) * + AOMMIN(abs(diff), AOMMAX(0, threshold - (abs(diff) >> shift))); +} + +#ifdef __cplusplus +extern "C" { +#endif + +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bsize); + +typedef void (*cdef_init_fb_row_t)( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); + +/*!\brief Function for applying CDEF to a frame + * + * \ingroup in_loop_cdef + * This function applies CDEF to a frame. + * + * \param[in, out] frame Compressed frame buffer + * \param[in, out] cm Pointer to top level common structure + * \param[in] xd Pointer to common current coding block structure + * \param[in] cdef_init_fb_row_fn Function Pointer + * + * \remark Nothing is returned. Instead, the filtered frame is output in + * \c frame. 
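 *
 * Internally the frame is filtered one 64x64 filter-block row at a time by
 * \ref av1_cdef_fb_row, using the line, column and source buffers held in
 * \c cm->cdef_info.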
+ */ +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, + MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn); +void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, + uint16_t **const linebuf, uint16_t **const colbuf, + uint16_t *const src, int fbr, + cdef_init_fb_row_t cdef_init_fb_row_fn, + struct AV1CdefSyncData *const cdef_sync, + struct aom_internal_error_info *error_info); +void av1_cdef_init_fb_row(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_COMMON_CDEF_H_ diff --git a/third_party/aom/av1/common/cdef_block.c b/third_party/aom/av1/common/cdef_block.c new file mode 100644 index 0000000000..ce7039f374 --- /dev/null +++ b/third_party/aom/av1/common/cdef_block.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/cdef.h" +/* +This is Cdef_Directions (section 7.15.3) with 2 padding entries at the +beginning and end of the table. The cdef direction range is [0, 7] and the +first index is offset +/-2. This removes the need to constrain the first +index to the same range using e.g., & 7. +*/ +DECLARE_ALIGNED(16, const int, cdef_directions_padded[12][2]) = { + /* Padding: cdef_directions[6] */ + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 }, + /* Padding: cdef_directions[7] */ + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }, + + /* Begin cdef_directions */ + { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 }, + { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 }, + { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 }, + { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 }, + { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 }, + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 }, + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 }, + { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }, + /* End cdef_directions */ + + /* Padding: cdef_directions[0] */ + { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 }, + /* Padding: cdef_directions[1] */ + { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 }, +}; + +const int (*const cdef_directions)[2] = cdef_directions_padded + 2; + +/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on. + The search minimizes the weighted variance along all the lines in a + particular direction, i.e. the squared error between the input and a + "predicted" block where each pixel is replaced by the average along a line + in a particular direction. Since each direction have the same sum(x^2) term, + that term is never computed. 
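   What remains to be maximized is, for each direction, the sum over its
   lines of (line sum)^2 / (line length). To stay in integer arithmetic, the
   code multiplies by div_table[n] = 840 / n instead of dividing (840 is
   divisible by every line length from 1 to 8); e.g. the horizontal and
   vertical directions, whose lines all span 8 pixels, use div_table[8] = 105.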
See Section 2, step 2, of: + http://jmvalin.ca/notes/intra_paint.pdf */ +int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + int i; + int32_t cost[8] = { 0 }; + int partial[8][15] = { { 0 } }; + int32_t best_cost = 0; + int best_dir = 0; + /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n. + The output is then 840 times larger, but we don't care for finding + the max. */ + static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 }; + for (i = 0; i < 8; i++) { + int j; + for (j = 0; j < 8; j++) { + int x; + /* We subtract 128 here to reduce the maximum range of the squared + partial sums. */ + x = (img[i * stride + j] >> coeff_shift) - 128; + partial[0][i + j] += x; + partial[1][i + j / 2] += x; + partial[2][i] += x; + partial[3][3 + i - j / 2] += x; + partial[4][7 + i - j] += x; + partial[5][3 - i / 2 + j] += x; + partial[6][j] += x; + partial[7][i / 2 + j] += x; + } + } + for (i = 0; i < 8; i++) { + cost[2] += partial[2][i] * partial[2][i]; + cost[6] += partial[6][i] * partial[6][i]; + } + cost[2] *= div_table[8]; + cost[6] *= div_table[8]; + for (i = 0; i < 7; i++) { + cost[0] += (partial[0][i] * partial[0][i] + + partial[0][14 - i] * partial[0][14 - i]) * + div_table[i + 1]; + cost[4] += (partial[4][i] * partial[4][i] + + partial[4][14 - i] * partial[4][14 - i]) * + div_table[i + 1]; + } + cost[0] += partial[0][7] * partial[0][7] * div_table[8]; + cost[4] += partial[4][7] * partial[4][7] * div_table[8]; + for (i = 1; i < 8; i += 2) { + int j; + for (j = 0; j < 4 + 1; j++) { + cost[i] += partial[i][3 + j] * partial[i][3 + j]; + } + cost[i] *= div_table[8]; + for (j = 0; j < 4 - 1; j++) { + cost[i] += (partial[i][j] * partial[i][j] + + partial[i][10 - j] * partial[i][10 - j]) * + div_table[2 * j + 2]; + } + } + for (i = 0; i < 8; i++) { + if (cost[i] > best_cost) { + best_cost = cost[i]; + best_dir = i; + } + } + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var = best_cost - cost[(best_dir + 4) & 7]; + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. */ + *var >>= 10; + return best_dir; +} + +void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var1, int32_t *var2, + int coeff_shift, int *out1, int *out2) { + *out1 = cdef_find_dir_c(img1, stride, var1, coeff_shift); + *out2 = cdef_find_dir_c(img2, stride, var2, coeff_shift); +} + +const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; +const int cdef_sec_taps[2] = { 2, 1 }; + +/* Smooth in the direction detected. 
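   Each pixel x becomes x + ((8 + sum - (sum < 0)) >> 4), where sum is a
   weighted sum of constrain()-ed differences between x and its taps: two
   taps on each side of x along the detected direction (weights
   cdef_pri_taps), and two taps on each side along each of the two directions
   at +/-45 degrees from it (weights cdef_sec_taps). When both primary and
   secondary filtering are enabled, the result is additionally clamped to the
   range spanned by x and its in-frame taps.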
*/ +static void cdef_filter_block_internal( + uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, int pri_damping, + int sec_damping, int coeff_shift, int block_width, int block_height, + int enable_primary, int enable_secondary) { + const int clipping_required = (enable_primary && enable_secondary); + int i, j, k; + const int s = CDEF_BSTRIDE; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + int16_t sum = 0; + int16_t y; + int16_t x = in[i * s + j]; + int max = x; + int min = x; + for (k = 0; k < 2; k++) { + if (enable_primary) { + int16_t p0 = in[i * s + j + cdef_directions[dir][k]]; + int16_t p1 = in[i * s + j - cdef_directions[dir][k]]; + sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping); + sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping); + if (clipping_required) { + if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max); + if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max); + min = AOMMIN(p0, min); + min = AOMMIN(p1, min); + } + } + if (enable_secondary) { + int16_t s0 = in[i * s + j + cdef_directions[dir + 2][k]]; + int16_t s1 = in[i * s + j - cdef_directions[dir + 2][k]]; + int16_t s2 = in[i * s + j + cdef_directions[dir - 2][k]]; + int16_t s3 = in[i * s + j - cdef_directions[dir - 2][k]]; + if (clipping_required) { + if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max); + if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max); + if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max); + if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max); + min = AOMMIN(s0, min); + min = AOMMIN(s1, min); + min = AOMMIN(s2, min); + min = AOMMIN(s3, min); + } + sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping); + } + } + y = ((int16_t)x + ((8 + sum - (sum < 0)) >> 4)); + if (clipping_required) { + y = clamp(y, min, max); + } + + if (dst8) + dst8[i * dstride + j] = (uint8_t)y; + else + dst16[i * dstride + j] = (uint16_t)y; + } + } +} + +void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/1, /*enable_secondary=*/1); +} + +void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/1, /*enable_secondary=*/0); +} + +void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/0, 
/*enable_secondary=*/1); +} + +void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/0, /*enable_secondary=*/0); +} + +void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/1, /*enable_secondary=*/1); +} + +void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/1, /*enable_secondary=*/0); +} + +void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/0, /*enable_secondary=*/1); +} + +void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, + coeff_shift, block_width, block_height, + /*enable_primary=*/0, /*enable_secondary=*/0); +} + +/* Compute the primary filter strength for an 8x8 block based on the + directional variance difference. A high variance difference means + that we have a highly directional pattern (e.g. a high contrast + edge), so we can apply more deringing. A low variance means that we + either have a low contrast edge, or a non-directional texture, so + we want to be careful not to blur. */ +static INLINE int adjust_strength(int strength, int32_t var) { + const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0; + /* We use the variance of 8x8 blocks to adjust the strength. */ + return var ? (strength * (4 + i) + 8) >> 4 : 0; +} + +static AOM_INLINE void aom_cdef_find_dir(const uint16_t *in, cdef_list *dlist, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], + int cdef_count, int coeff_shift, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) { + int bi; + + // Find direction of two 8x8 blocks together. + for (bi = 0; bi < cdef_count - 1; bi += 2) { + const int by = dlist[bi].by; + const int bx = dlist[bi].bx; + const int by2 = dlist[bi + 1].by; + const int bx2 = dlist[bi + 1].bx; + const int pos1 = 8 * by * CDEF_BSTRIDE + 8 * bx; + const int pos2 = 8 * by2 * CDEF_BSTRIDE + 8 * bx2; + cdef_find_dir_dual(&in[pos1], &in[pos2], CDEF_BSTRIDE, &var[by][bx], + &var[by2][bx2], coeff_shift, &dir[by][bx], + &dir[by2][bx2]); + } + + // Process remaining 8x8 blocks here. 
One 8x8 at a time. + if (cdef_count % 2) { + const int by = dlist[bi].by; + const int bx = dlist[bi].bx; + dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx], + CDEF_BSTRIDE, &var[by][bx], coeff_shift); + } +} + +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + const uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift) { + int bi; + int bx; + int by; + const int pri_strength = level << coeff_shift; + sec_strength <<= coeff_shift; + damping += coeff_shift - (pli != AOM_PLANE_Y); + const int bw_log2 = 3 - xdec; + const int bh_log2 = 3 - ydec; + if (dirinit && pri_strength == 0 && sec_strength == 0) { + // If we're here, both primary and secondary strengths are 0, and + // we still haven't written anything to y[] yet, so we just copy + // the input to y[]. This is necessary only for av1_cdef_search() + // and only av1_cdef_search() sets dirinit. + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + // TODO(stemidts/jmvalin): SIMD optimisations + for (int iy = 0; iy < 1 << bh_log2; iy++) { + memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)], + &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)], + ((size_t)1 << bw_log2) * sizeof(*dst16)); + } + } + return; + } + + if (pli == 0) { + if (!dirinit || !*dirinit) { + aom_cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir); + if (dirinit) *dirinit = 1; + } + } + if (pli == 1 && xdec != ydec) { + for (bi = 0; bi < cdef_count; bi++) { + static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 }; + static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 }; + by = dlist[bi].by; + bx = dlist[bi].bx; + dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]]; + } + } + + if (dst8) { + const int block_width = 8 >> xdec; + const int block_height = 8 >> ydec; + /* + * strength_index == 0 : enable_primary = 1, enable_secondary = 1 + * strength_index == 1 : enable_primary = 1, enable_secondary = 0 + * strength_index == 2 : enable_primary = 0, enable_secondary = 1 + * strength_index == 3 : enable_primary = 0, enable_secondary = 0 + */ + const cdef_filter_block_func cdef_filter_fn[4] = { + cdef_filter_8_0, cdef_filter_8_1, cdef_filter_8_2, cdef_filter_8_3 + }; + + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + const int t = + (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx])); + const int strength_index = (sec_strength == 0) | ((t == 0) << 1); + + cdef_filter_fn[strength_index]( + &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t, + sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping, + coeff_shift, block_width, block_height); + } + } else { + const int block_width = 8 >> xdec; + const int block_height = 8 >> ydec; + /* + * strength_index == 0 : enable_primary = 1, enable_secondary = 1 + * strength_index == 1 : enable_primary = 1, enable_secondary = 0 + * strength_index == 2 : enable_primary = 0, enable_secondary = 1 + * strength_index == 3 : enable_primary = 0, enable_secondary = 0 + */ + const cdef_filter_block_func cdef_filter_fn[4] = { + cdef_filter_16_0, cdef_filter_16_1, cdef_filter_16_2, cdef_filter_16_3 + }; + + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + const int t = + (pli ? 
pri_strength : adjust_strength(pri_strength, var[by][bx])); + const int strength_index = (sec_strength == 0) | ((t == 0) << 1); + + cdef_filter_fn[strength_index]( + &dst16[dirinit ? bi << (bw_log2 + bh_log2) + : (by << bh_log2) * dstride + (bx << bw_log2)], + dirinit ? 1 << bw_log2 : dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t, + sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping, + coeff_shift, block_width, block_height); + } + } +} diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h new file mode 100644 index 0000000000..b5e4f124ae --- /dev/null +++ b/third_party/aom/av1/common/cdef_block.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_H_ + +#include "aom_dsp/odintrin.h" + +#define CDEF_BLOCKSIZE 64 +#define CDEF_BLOCKSIZE_LOG2 6 +#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8) +#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2) + +/* We need to buffer two vertical lines. */ +#define CDEF_VBORDER (2) +/* We only need to buffer three horizontal pixels too, but let's align to + 16 bytes (8 x 16 bits) to make vectorization easier. */ +#define CDEF_HBORDER (8) +#define CDEF_BSTRIDE \ + ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3) + +#define CDEF_VERY_LARGE (0x4000) +#define CDEF_INBUF_SIZE \ + (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER)) + +extern const int cdef_pri_taps[2][2]; +extern const int cdef_sec_taps[2]; +extern const int (*const cdef_directions)[2]; + +typedef struct { + uint8_t by; + uint8_t bx; +} cdef_list; + +typedef void (*cdef_filter_block_func)(void *dest, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height); + +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + const uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift); + +static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h, + uint16_t x) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = x; + } + } +} +#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_ diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h new file mode 100644 index 0000000000..5c62201f1e --- /dev/null +++ b/third_party/aom/av1/common/cdef_block_simd.h @@ -0,0 +1,844 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ + +#include "config/av1_rtcd.h" + +#include "av1/common/cdef_block.h" + +/* partial A is a 16-bit vector of the form: + [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: + [0 y1 y2 y3 y4 y5 y6 y7]. + This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... + (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 + and const2. */ +static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1, + v128 const2) { + v128 tmp; + /* Reverse partial B. */ + partialb = v128_shuffle_8( + partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c)); + /* Interleave the x and y values of identical indices and pair x8 with 0. */ + tmp = partiala; + partiala = v128_ziplo_16(partialb, partiala); + partialb = v128_ziphi_16(partialb, tmp); + /* Square and add the corresponding x and y values. */ + partiala = v128_madd_s16(partiala, partiala); + partialb = v128_madd_s16(partialb, partialb); + /* Multiply by constant. */ + partiala = v128_mullo_s32(partiala, const1); + partialb = v128_mullo_s32(partialb, const2); + /* Sum all results. */ + partiala = v128_add_32(partiala, partialb); + return partiala; +} + +static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { + v128 t0, t1, t2, t3; + t0 = v128_ziplo_32(x1, x0); + t1 = v128_ziplo_32(x3, x2); + t2 = v128_ziphi_32(x1, x0); + t3 = v128_ziphi_32(x3, x2); + x0 = v128_ziplo_64(t1, t0); + x1 = v128_ziphi_64(t1, t0); + x2 = v128_ziplo_64(t3, t2); + x3 = v128_ziphi_64(t3, t2); + return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3)); +} + +/* Computes cost for directions 0, 5, 6 and 7. We can call this function again + to compute the remaining directions. */ +static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) { + v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; + v128 partial6; + v128 tmp; + /* Partial sums for lines 0 and 1. */ + partial4a = v128_shl_n_byte(lines[0], 14); + partial4b = v128_shr_n_byte(lines[0], 2); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4)); + tmp = v128_add_16(lines[0], lines[1]); + partial5a = v128_shl_n_byte(tmp, 10); + partial5b = v128_shr_n_byte(tmp, 6); + partial7a = v128_shl_n_byte(tmp, 4); + partial7b = v128_shr_n_byte(tmp, 12); + partial6 = tmp; + + /* Partial sums for lines 2 and 3. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6)); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8)); + tmp = v128_add_16(lines[2], lines[3]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10)); + partial6 = v128_add_16(partial6, tmp); + + /* Partial sums for lines 4 and 5. 
*/ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10)); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12)); + tmp = v128_add_16(lines[4], lines[5]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8)); + partial6 = v128_add_16(partial6, tmp); + + /* Partial sums for lines 6 and 7. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14)); + partial4a = v128_add_16(partial4a, lines[7]); + tmp = v128_add_16(lines[6], lines[7]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6)); + partial6 = v128_add_16(partial6, tmp); + + /* Compute costs in terms of partial sums. */ + partial4a = + fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840), + v128_from_32(105, 120, 140, 168)); + partial7a = + fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0), + v128_from_32(105, 105, 105, 140)); + partial5a = + fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0), + v128_from_32(105, 105, 105, 140)); + partial6 = v128_madd_s16(partial6, partial6); + partial6 = v128_mullo_s32(partial6, v128_dup_32(105)); + + partial4a = hsum4(partial4a, partial5a, partial6, partial7a); + v128_store_unaligned(tmp_cost1, partial4a); + return partial4a; +} + +/* transpose and reverse the order of the lines -- equivalent to a 90-degree + counter-clockwise rotation of the pixels. 
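   This lets SIMD_FUNC(cdef_find_dir) below call compute_directions() twice:
   once on the original lines for the "mostly vertical" costs and once on the
   rotated lines for the "mostly horizontal" costs.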
*/ +static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) { + const v128 tr0_0 = v128_ziplo_16(in[1], in[0]); + const v128 tr0_1 = v128_ziplo_16(in[3], in[2]); + const v128 tr0_2 = v128_ziphi_16(in[1], in[0]); + const v128 tr0_3 = v128_ziphi_16(in[3], in[2]); + const v128 tr0_4 = v128_ziplo_16(in[5], in[4]); + const v128 tr0_5 = v128_ziplo_16(in[7], in[6]); + const v128 tr0_6 = v128_ziphi_16(in[5], in[4]); + const v128 tr0_7 = v128_ziphi_16(in[7], in[6]); + + const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0); + const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4); + const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0); + const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4); + const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2); + const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6); + const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2); + const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6); + + res[7] = v128_ziplo_64(tr1_1, tr1_0); + res[6] = v128_ziphi_64(tr1_1, tr1_0); + res[5] = v128_ziplo_64(tr1_3, tr1_2); + res[4] = v128_ziphi_64(tr1_3, tr1_2); + res[3] = v128_ziplo_64(tr1_5, tr1_4); + res[2] = v128_ziphi_64(tr1_5, tr1_4); + res[1] = v128_ziplo_64(tr1_7, tr1_6); + res[0] = v128_ziphi_64(tr1_7, tr1_6); +} + +int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + int i; + int32_t cost[8]; + int32_t best_cost = 0; + int best_dir = 0; + v128 lines[8]; + for (i = 0; i < 8; i++) { + lines[i] = v128_load_unaligned(&img[i * stride]); + lines[i] = + v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128)); + } + + /* Compute "mostly vertical" directions. */ + v128 dir47 = compute_directions(lines, cost + 4); + + array_reverse_transpose_8x8(lines, lines); + + /* Compute "mostly horizontal" directions. */ + v128 dir03 = compute_directions(lines, cost); + + v128 max = v128_max_s32(dir03, dir47); + max = v128_max_s32(max, v128_align(max, max, 8)); + max = v128_max_s32(max, v128_align(max, max, 4)); + best_cost = v128_low_u32(max); + v128 t = + v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03)); + best_dir = v128_movemask_8(v128_pack_s16_s8(t, t)); + best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros + + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var = best_cost - cost[(best_dir + 4) & 7]; + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. */ + *var >>= 10; + return best_dir; +} + +// Work around compiler out of memory issues with Win32 builds. This issue has +// been observed with Visual Studio 2017, 2019, and 2022 (version 17.4). 
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1940 +#define CDEF_INLINE static INLINE +#else +#define CDEF_INLINE SIMD_INLINE +#endif + +// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) +CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, + unsigned int adjdamp) { + v256 diff = v256_sub_16(a, b); + const v256 sign = v256_shr_n_s16(diff, 15); + diff = v256_abs_s16(diff); + const v256 s = + v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp)); + return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign); +} + +SIMD_INLINE v256 get_max_primary(const int is_lowbd, v256 *tap, v256 max, + v256 cdef_large_value_mask) { + if (is_lowbd) { + v256 max_u8; + max_u8 = tap[0]; + max_u8 = v256_max_u8(max_u8, tap[1]); + max_u8 = v256_max_u8(max_u8, tap[2]); + max_u8 = v256_max_u8(max_u8, tap[3]); + /* The source is 16 bits, however, we only really care about the lower + 8 bits. The upper 8 bits contain the "large" flag. After the final + primary max has been calculated, zero out the upper 8 bits. Use this + to find the "16 bit" max. */ + max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); + } else { + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); + } + return max; +} + +SIMD_INLINE v256 get_max_secondary(const int is_lowbd, v256 *tap, v256 max, + v256 cdef_large_value_mask) { + if (is_lowbd) { + v256 max_u8; + max_u8 = tap[0]; + max_u8 = v256_max_u8(max_u8, tap[1]); + max_u8 = v256_max_u8(max_u8, tap[2]); + max_u8 = v256_max_u8(max_u8, tap[3]); + max_u8 = v256_max_u8(max_u8, tap[4]); + max_u8 = v256_max_u8(max_u8, tap[5]); + max_u8 = v256_max_u8(max_u8, tap[6]); + max_u8 = v256_max_u8(max_u8, tap[7]); + /* The source is 16 bits, however, we only really care about the lower + 8 bits. The upper 8 bits contain the "large" flag. After the final + primary max has been calculated, zero out the upper 8 bits. Use this + to find the "16 bit" max. */ + max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); + } else { + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask)); + max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask)); + } + return max; +} + +// MSVC takes far too much time optimizing these. 
+// https://bugs.chromium.org/p/aomedia/issues/detail?id=3395 +#if defined(_MSC_VER) && !defined(__clang__) +#pragma optimize("", off) +#endif + +CDEF_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, int pri_damping, + int sec_damping, int coeff_shift, int height, + int enable_primary, int enable_secondary) { + uint8_t *dst8 = (uint8_t *)dest; + uint16_t *dst16 = (uint16_t *)dest; + const int clipping_required = enable_primary && enable_secondary; + v256 p0, p1, p2, p3; + v256 sum, row, res; + v256 max, min; + const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + int i; + + if (enable_primary && pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (enable_secondary && sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + + for (i = 0; i < height; i += 4) { + sum = v256_zero(); + row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); + max = min = row; + + if (enable_primary) { + v256 tap[4]; + // Primary near taps + tap[0] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1])); + p0 = constrain16(tap[0], row, pri_strength, pri_damping); + tap[1] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1])); + p1 = constrain16(tap[1], row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); + + // Primary far taps + tap[2] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2])); + p0 = constrain16(tap[2], row, pri_strength, pri_damping); + tap[3] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2])); + p1 = constrain16(tap[3], row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); + if (clipping_required) { + max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); + + min = v256_min_s16(min, tap[0]); + min = v256_min_s16(min, tap[1]); + min = v256_min_s16(min, tap[2]); + min = v256_min_s16(min, tap[3]); + } + } + + if (enable_secondary) { + v256 tap[8]; + // Secondary near taps + tap[0] = + 
v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1])); + p0 = constrain16(tap[0], row, sec_strength, sec_damping); + tap[1] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1])); + p1 = constrain16(tap[1], row, sec_strength, sec_damping); + tap[2] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1])); + p2 = constrain16(tap[2], row, sec_strength, sec_damping); + tap[3] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1])); + p3 = constrain16(tap[3], row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // Secondary far taps + tap[4] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2])); + p0 = constrain16(tap[4], row, sec_strength, sec_damping); + tap[5] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2])); + p1 = constrain16(tap[5], row, sec_strength, sec_damping); + tap[6] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2])); + p2 = constrain16(tap[6], row, sec_strength, sec_damping); + tap[7] = + v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2])); + p3 = constrain16(tap[7], row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + if (clipping_required) { + max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); + + min = v256_min_s16(min, tap[0]); + min = v256_min_s16(min, tap[1]); + min = v256_min_s16(min, tap[2]); + min = v256_min_s16(min, tap[3]); + min = v256_min_s16(min, tap[4]); + min = v256_min_s16(min, tap[5]); + min = v256_min_s16(min, tap[6]); + min = v256_min_s16(min, tap[7]); + } + } + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + if (clipping_required) { + res = 
v256_min_s16(v256_max_s16(res, min), max); + } + + if (is_lowbd) { + const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); + u32_store_aligned(&dst8[(i + 0) * dstride], + v64_high_u32(v128_high_v64(res_128))); + u32_store_aligned(&dst8[(i + 1) * dstride], + v64_low_u32(v128_high_v64(res_128))); + u32_store_aligned(&dst8[(i + 2) * dstride], + v64_high_u32(v128_low_v64(res_128))); + u32_store_aligned(&dst8[(i + 3) * dstride], + v64_low_u32(v128_low_v64(res_128))); + } else { + v64_store_aligned(&dst16[(i + 0) * dstride], + v128_high_v64(v256_high_v128(res))); + v64_store_aligned(&dst16[(i + 1) * dstride], + v128_low_v64(v256_high_v128(res))); + v64_store_aligned(&dst16[(i + 2) * dstride], + v128_high_v64(v256_low_v128(res))); + v64_store_aligned(&dst16[(i + 3) * dstride], + v128_low_v64(v256_low_v128(res))); + } + } +} + +CDEF_INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, int pri_damping, + int sec_damping, int coeff_shift, int height, + int enable_primary, int enable_secondary) { + uint8_t *dst8 = (uint8_t *)dest; + uint16_t *dst16 = (uint16_t *)dest; + const int clipping_required = enable_primary && enable_secondary; + int i; + v256 sum, p0, p1, p2, p3, row, res; + const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); + v256 max, min; + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (enable_primary && pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (enable_secondary && sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + + for (i = 0; i < height; i += 2) { + v256 tap[8]; + sum = v256_zero(); + row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), + v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); + + min = max = row; + if (enable_primary) { + // Primary near taps + tap[0] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); + tap[1] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); + p0 = constrain16(tap[0], row, pri_strength, pri_damping); + p1 = constrain16(tap[1], row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); + + // Primary far taps + tap[2] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); + tap[3] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); + p0 = constrain16(tap[2], row, pri_strength, pri_damping); + p1 = constrain16(tap[3], row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); + + if (clipping_required) { + max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); + + min = v256_min_s16(min, tap[0]); + min = v256_min_s16(min, tap[1]); + min = v256_min_s16(min, tap[2]); + min = v256_min_s16(min, 
tap[3]); + } + // End primary + } + + if (enable_secondary) { + // Secondary near taps + tap[0] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); + tap[1] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); + tap[2] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); + tap[3] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); + p0 = constrain16(tap[0], row, sec_strength, sec_damping); + p1 = constrain16(tap[1], row, sec_strength, sec_damping); + p2 = constrain16(tap[2], row, sec_strength, sec_damping); + p3 = constrain16(tap[3], row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // Secondary far taps + tap[4] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); + tap[5] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); + tap[6] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); + tap[7] = v256_from_v128( + v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); + p0 = constrain16(tap[4], row, sec_strength, sec_damping); + p1 = constrain16(tap[5], row, sec_strength, sec_damping); + p2 = constrain16(tap[6], row, sec_strength, sec_damping); + p3 = constrain16(tap[7], row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + if (clipping_required) { + max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); + + min = v256_min_s16(min, tap[0]); + min = v256_min_s16(min, tap[1]); + min = v256_min_s16(min, tap[2]); + min = v256_min_s16(min, tap[3]); + min = v256_min_s16(min, tap[4]); + min = v256_min_s16(min, tap[5]); + min = v256_min_s16(min, tap[6]); + min = v256_min_s16(min, tap[7]); + } + // End secondary + } + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + if (clipping_required) { + res = v256_min_s16(v256_max_s16(res, min), max); + } + + if (is_lowbd) { + const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); + v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128)); + v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128)); + } else { + v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res)); + v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res)); + } + } +} + +#if defined(_MSC_VER) && !defined(__clang__) +#pragma optimize("", on) +#endif + +SIMD_INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride, + const uint16_t *in, int height) { + uint8_t *dst8 = (uint8_t *)dest; + uint16_t *dst16 = (uint16_t *)dest; + int i; + for (i = 0; i < height; i += 4) { + const v128 row0 = + v128_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), + 
v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); + const v128 row1 = + v128_from_v64(v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); + if (is_lowbd) { + /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */ + const v128 res_128 = v128_pack_s16_u8(row1, row0); + u32_store_aligned(&dst8[(i + 0) * dstride], + v64_high_u32(v128_low_v64(res_128))); + u32_store_aligned(&dst8[(i + 1) * dstride], + v64_low_u32(v128_low_v64(res_128))); + u32_store_aligned(&dst8[(i + 2) * dstride], + v64_high_u32(v128_high_v64(res_128))); + u32_store_aligned(&dst8[(i + 3) * dstride], + v64_low_u32(v128_high_v64(res_128))); + } else { + v64_store_aligned(&dst16[(i + 0) * dstride], v128_high_v64(row0)); + v64_store_aligned(&dst16[(i + 1) * dstride], v128_low_v64(row0)); + v64_store_aligned(&dst16[(i + 2) * dstride], v128_high_v64(row1)); + v64_store_aligned(&dst16[(i + 3) * dstride], v128_low_v64(row1)); + } + } +} + +SIMD_INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride, + const uint16_t *in, int height) { + uint8_t *dst8 = (uint8_t *)dest; + uint16_t *dst16 = (uint16_t *)dest; + int i; + for (i = 0; i < height; i += 2) { + const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]); + const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]); + if (is_lowbd) { + /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */ + const v128 res_128 = v128_pack_s16_u8(row1, row0); + v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128)); + v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128)); + } else { + v128_store_unaligned(&dst16[i * dstride], row0); + v128_store_unaligned(&dst16[(i + 1) * dstride], row1); + } + } +} + +void SIMD_FUNC(cdef_filter_8_0)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/1); + } else { + filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/1); + } +} + +void SIMD_FUNC(cdef_filter_8_1)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/0); + } else { + filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/0); + } +} +void SIMD_FUNC(cdef_filter_8_2)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/0, + /*enable_secondary=*/1); + } else { + filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, 
pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/0, + /*enable_secondary=*/1); + } +} + +void SIMD_FUNC(cdef_filter_8_3)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + (void)block_width; + + if (block_width == 8) { + copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height); + } else { + copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height); + } +} + +void SIMD_FUNC(cdef_filter_16_0)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/1); + } else { + filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/1); + } +} + +void SIMD_FUNC(cdef_filter_16_1)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/0); + } else { + filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/1, + /*enable_secondary=*/0); + } +} +void SIMD_FUNC(cdef_filter_16_2)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + if (block_width == 8) { + filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/0, + /*enable_secondary=*/1); + } else { + filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift, + block_height, /*enable_primary=*/0, + /*enable_secondary=*/1); + } +} + +void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift, int block_width, + int block_height) { + (void)pri_strength; + (void)sec_strength; + (void)dir; + (void)pri_damping; + (void)sec_damping; + (void)coeff_shift; + (void)block_width; + if (block_width == 8) { + copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height); + } else { + copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); + } +} + +void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + int i, j; + for (i = 0; i < height; i++) { + for (j = 0; j < (width & ~0x7); j += 8) { + v128 row = v128_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], row); + } + for (; j < width; j++) { + 
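// The vector loop above copies eight 16-bit samples per iteration; this
+      // scalar tail copies any remaining samples when width is not a multiple
+      // of 8.
+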
dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +#undef CDEF_INLINE + +#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c new file mode 100644 index 0000000000..0e37d45980 --- /dev/null +++ b/third_party/aom/av1/common/cfl.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/common_data.h" + +#include "config/av1_rtcd.h" + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) { + assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + + memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3)); + memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3)); + cfl->subsampling_x = seq_params->subsampling_x; + cfl->subsampling_y = seq_params->subsampling_y; + cfl->are_parameters_computed = 0; + cfl->store_y = 0; + // The DC_PRED cache is disabled by default and is only enabled in + // cfl_rd_pick_alpha + clear_cfl_dc_pred_cache_flags(cfl); +} + +void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, + CFL_PRED_TYPE pred_plane, int width) { + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + + if (is_cur_buf_hbd(xd)) { + uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input); + memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1); + return; + } + + memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width); +} + +static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst, + int dst_stride, int width, int height) { + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, width); + dst += dst_stride; + } +} + +static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst, + int dst_stride, int width, int height) { + const size_t num_bytes = width << 1; + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, num_bytes); + dst += dst_stride; + } +} +void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) { + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + assert(height <= CFL_BUF_LINE); + if (is_cur_buf_hbd(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride, + width, height); + return; + } + cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride, + width, height); +} + +// Due to frame boundary issues, it is possible that the total area covered by +// chroma exceeds that of luma. When this happens, we fill the missing pixels by +// repeating the last columns and/or rows. 
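+// For illustration (sizes assumed): if a transform block needs a 16x16 area of
+// recon_buf_q3 but, because the block straddles the right frame edge, only 12
+// columns were written by cfl_store(), cfl_pad() copies column 11 into columns
+// 12..15 of every valid row; missing rows are likewise filled by repeating the
+// last valid row, so the averaging and prediction steps never read
+// uninitialized samples.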
+static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) { + const int diff_width = width - cfl->buf_width; + const int diff_height = height - cfl->buf_height; + + if (diff_width > 0) { + const int min_height = height - diff_height; + uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width); + for (int j = 0; j < min_height; j++) { + const uint16_t last_pixel = recon_buf_q3[-1]; + assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); + for (int i = 0; i < diff_width; i++) { + recon_buf_q3[i] = last_pixel; + } + recon_buf_q3 += CFL_BUF_LINE; + } + cfl->buf_width = width; + } + if (diff_height > 0) { + uint16_t *recon_buf_q3 = + cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE); + for (int j = 0; j < diff_height; j++) { + const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE; + assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); + for (int i = 0; i < width; i++) { + recon_buf_q3[i] = last_row_q3[i]; + } + recon_buf_q3 += CFL_BUF_LINE; + } + cfl->buf_height = height; + } +} + +static void subtract_average_c(const uint16_t *src, int16_t *dst, int width, + int height, int round_offset, int num_pel_log2) { + int sum = round_offset; + const uint16_t *recon = src; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + sum += recon[i]; + } + recon += CFL_BUF_LINE; + } + const int avg = sum >> num_pel_log2; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = src[i] - avg; + } + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; + } +} + +CFL_SUB_AVG_FN(c) + +static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, + CFL_PRED_TYPE pred_type) { + const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign) + : CFL_SIGN_V(joint_sign); + if (alpha_sign == CFL_SIGN_ZERO) return 0; + const int abs_alpha_q3 = + (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx); + return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1; +} + +static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3, int width, + int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]); + } + dst += dst_stride; + ac_buf_q3 += CFL_BUF_LINE; + } +} + +CFL_PREDICT_FN(c, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, + int alpha_q3, int bit_depth, int width, int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = clip_pixel_highbd( + get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i], bit_depth); + } + dst += dst_stride; + ac_buf_q3 += CFL_BUF_LINE; + } +} + +CFL_PREDICT_FN(c, hbd) +#endif + +static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { + CFL_CTX *const cfl = &xd->cfl; + // Do not call cfl_compute_parameters multiple time on the same values. 
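+  // The two steps below (1) pad recon_buf_q3 so the whole tx_size area is
+  // valid and (2) subtract the rounded block average, leaving the mean-removed
+  // luma contribution in ac_buf_q3 at Q3 precision, ready to be scaled by
+  // alpha_q3 in the predict functions.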
+ assert(cfl->are_parameters_computed == 0); + + cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]); + cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); + cfl->are_parameters_computed = 1; +} + +void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, int plane) { + CFL_CTX *const cfl = &xd->cfl; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(is_cfl_allowed(xd)); + + if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size); + + const int alpha_q3 = + cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1); + assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <= + CFL_BUF_SQUARE); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, + alpha_q3, xd->bd); + return; + } +#endif + cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); +} + +static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; + } + input += input_stride << 1; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + output_q3[i] = input[i] << 3; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; + } + input += input_stride << 1; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + output_q3[i] = input[i] << 3; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} +#endif + +CFL_GET_SUBSAMPLE_FUNCTION(c) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, + int 
sub_x, int sub_y) { + if (sub_x == 1) { + if (sub_y == 1) { + return cfl_get_luma_subsampling_420_hbd(tx_size); + } + return cfl_get_luma_subsampling_422_hbd(tx_size); + } + return cfl_get_luma_subsampling_444_hbd(tx_size); +} +#endif + +static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size, + int sub_x, int sub_y) { + if (sub_x == 1) { + if (sub_y == 1) { + return cfl_get_luma_subsampling_420_lbd(tx_size); + } + return cfl_get_luma_subsampling_422_lbd(tx_size); + } + return cfl_get_luma_subsampling_444_lbd(tx_size); +} + +static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride, + int row, int col, TX_SIZE tx_size, int use_hbd) { + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const int tx_off_log2 = MI_SIZE_LOG2; + const int sub_x = cfl->subsampling_x; + const int sub_y = cfl->subsampling_y; + const int store_row = row << (tx_off_log2 - sub_y); + const int store_col = col << (tx_off_log2 - sub_x); + const int store_height = height >> sub_y; + const int store_width = width >> sub_x; + + // Invalidate current parameters + cfl->are_parameters_computed = 0; + + // Store the surface of the pixel buffer that was written to, this way we + // can manage chroma overrun (e.g. when the chroma surfaces goes beyond the + // frame boundary) + if (col == 0 && row == 0) { + cfl->buf_width = store_width; + cfl->buf_height = store_height; + } else { + cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width); + cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height); + } + + // Check that we will remain inside the pixel buffer. + assert(store_row + store_height <= CFL_BUF_LINE); + assert(store_col + store_width <= CFL_BUF_LINE); + + // Store the input into the CfL pixel buffer + uint16_t *recon_buf_q3 = + cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col); +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input), + input_stride, recon_buf_q3); + } else { + cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, + recon_buf_q3); + } +#else + (void)use_hbd; + cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3); +#endif +} + +// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced +// and non-chroma-referenced blocks are stored together in the CfL buffer. +static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row, + int mi_col, int *row_out, + int *col_out) { + // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s. + if ((mi_row & 0x01) && cfl->subsampling_y) { + assert(*row_out == 0); + (*row_out)++; + } + + // Increment col index for right: 4x8, 4x16 or both right 4x4s. + if ((mi_col & 0x01) && cfl->subsampling_x) { + assert(*col_out == 0); + (*col_out)++; + } +} + +void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, + BLOCK_SIZE bsize) { + CFL_CTX *const cfl = &xd->cfl; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; + + if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { + // Only dimensions of size 4 can have an odd offset. 
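+  // Illustration (positions assumed): under 4:2:0, the four 4x4 luma blocks of
+  // an 8x8 area share a single 4x4 chroma block, so each must land at its own
+  // offset in the CfL buffer; e.g. the bottom-right 4x4 (odd mi_row and
+  // mi_col) has row and col bumped from 0 to 1 by sub8x8_adjust_offset().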
+ assert(!((col & 1) && tx_size_wide[tx_size] != 4)); + assert(!((row & 1) && tx_size_high[tx_size] != 4)); + sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col); + } + cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd)); +} + +static INLINE int max_intra_block_width(const MACROBLOCKD *xd, + BLOCK_SIZE plane_bsize, int plane, + TX_SIZE tx_size) { + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane) + << MI_SIZE_LOG2; + return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]); +} + +static INLINE int max_intra_block_height(const MACROBLOCKD *xd, + BLOCK_SIZE plane_bsize, int plane, + TX_SIZE tx_size) { + const int max_blocks_high = max_block_high(xd, plane_bsize, plane) + << MI_SIZE_LOG2; + return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]); +} + +void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { + CFL_CTX *const cfl = &xd->cfl; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + int row = 0; + int col = 0; + + if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) { + sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col); + } + const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size); + const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size); + tx_size = get_tx_size(width, height); + cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size, + is_cur_buf_hbd(xd)); +} diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h new file mode 100644 index 0000000000..dcaa87bd48 --- /dev/null +++ b/third_party/aom/av1/common/cfl.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CFL_H_ +#define AOM_AV1_COMMON_CFL_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +// Can we use CfL for the current block? +static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + assert(bsize < BLOCK_SIZES_ALL); + if (xd->lossless[mbmi->segment_id]) { + // In lossless, CfL is available when the partition size is equal to the + // transform size. + const int ssx = xd->plane[AOM_PLANE_U].subsampling_x; + const int ssy = xd->plane[AOM_PLANE_U].subsampling_y; + const int plane_bsize = get_plane_block_size(bsize, ssx, ssy); + return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4); + } + // Spec: CfL is available to luma partitions lesser than or equal to 32x32 + return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 && + block_size_high[bsize] <= 32); +} + +// Do we need to save the luma pixels from the current block, +// for a possible future CfL prediction? 
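+// Illustration: under 4:2:0 the left 4x8 block of a vertically split 8x8 area
+// is not a chroma reference; its chroma is coded together with the right 4x8
+// block, which may still choose CfL, so the left block's luma pixels have to
+// be stored just in case.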
+static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + + if (cm->seq_params->monochrome) return CFL_DISALLOWED; + + if (!xd->is_chroma_ref) { + // For non-chroma-reference blocks, we should always store the luma pixels, + // in case the corresponding chroma-reference block uses CfL. + // Note that this can only happen for block sizes which are <8 on + // their shortest side, as otherwise they would be chroma reference + // blocks. + return CFL_ALLOWED; + } + + // If this block has chroma information, we know whether we're + // actually going to perform a CfL prediction + return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) && + mbmi->uv_mode == UV_CFL_PRED); +} + +static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) { + int scaled_luma_q6 = alpha_q3 * pred_buf_q3; + return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6); +} + +static INLINE CFL_PRED_TYPE get_cfl_pred_type(int plane) { + assert(plane > 0); + return (CFL_PRED_TYPE)(plane - 1); +} + +static INLINE void clear_cfl_dc_pred_cache_flags(CFL_CTX *cfl) { + cfl->use_dc_pred_cache = false; + cfl->dc_pred_is_cached[CFL_PRED_U] = false; + cfl->dc_pred_is_cached[CFL_PRED_V] = false; +} + +void av1_cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, int plane); + +void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size); + +void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, + BLOCK_SIZE bsize); + +void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, + CFL_PRED_TYPE pred_plane, int width); + +void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, CFL_PRED_TYPE pred_plane); + +// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth. +#define CFL_lbd_TYPE uint8_t *cfl_type +#define CFL_hbd_TYPE uint16_t *cfl_type + +// Declare a size-specific wrapper for the size-generic function. The compiler +// will inline the size generic function in here, the advantage is that the size +// will be constant allowing for loop unrolling and other constant propagated +// goodness. +#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \ + void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ + const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \ + cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \ + output_q3, width, height); \ + } + +// Declare size-specific wrappers for all valid CfL sizes. +#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \ + CFL_SUBSAMPLE(arch, sub, bd, 4, 4) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 8) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(arch, sub, bd, 4, 8) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 4) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 8) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 32) \ + CFL_SUBSAMPLE(arch, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 4, 16) \ + CFL_SUBSAMPLE(arch, sub, bd, 16, 4) \ + CFL_SUBSAMPLE(arch, sub, bd, 8, 32) \ + CFL_SUBSAMPLE(arch, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \ + TX_SIZE tx_size) { \ + CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ + return subfn_##sub[tx_size]; \ + } + +// Declare an architecture-specific array of function pointers for size-specific +// wrappers. 
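+// (Each non-NULL entry below is one of the CFL_SUBSAMPLE wrappers declared
+// above; for example, CFL_SUBSAMPLE(c, 420, lbd, 4, 4) expands, roughly, to
+//   void cfl_subsample_lbd_420_4x4_c(const uint8_t *cfl_type, int input_stride,
+//                                    uint16_t *output_q3) {
+//     cfl_luma_subsampling_420_lbd_c(cfl_type, input_stride, output_q3, 4, 4);
+//   }
+// The 64-wide and 64-high entries stay NULL because CfL is not allowed for
+// those sizes.)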
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; + +// The RTCD script does not support passing in an array, so we wrap it in this +// function. +#if CONFIG_AV1_HIGHBITDEPTH +#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd) +#else +#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \ + CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) +#endif + +// Declare a size-specific wrapper for the size-generic function. The compiler +// will inline the size generic function in here, the advantage is that the size +// will be constant allowing for loop unrolling and other constant propagated +// goodness. +#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ + void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ + int16_t *dst) { \ + subtract_average_##arch(src, dst, width, height, round_offset, \ + num_pel_log2); \ + } + +// Declare size-specific wrappers for all valid CfL sizes. 
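+// In the list below, round_offset is half the pixel count and num_pel_log2 is
+// log2 of the pixel count, so e.g. the 8x8 entry (32, 6) makes
+// subtract_average compute avg = (sum + 32) >> 6, a rounded mean over 64
+// pixels.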
+#define CFL_SUB_AVG_FN(arch) \ + CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \ + CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \ + CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \ + CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \ + CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \ + CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \ + CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \ + CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \ + CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \ + CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \ + CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \ + cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch( \ + TX_SIZE tx_size) { \ + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \ + cfl_subtract_average_4x4_##arch, /* 4x4 */ \ + cfl_subtract_average_8x8_##arch, /* 8x8 */ \ + cfl_subtract_average_16x16_##arch, /* 16x16 */ \ + cfl_subtract_average_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subtract_average_4x8_##arch, /* 4x8 */ \ + cfl_subtract_average_8x4_##arch, /* 8x4 */ \ + cfl_subtract_average_8x16_##arch, /* 8x16 */ \ + cfl_subtract_average_16x8_##arch, /* 16x8 */ \ + cfl_subtract_average_16x32_##arch, /* 16x32 */ \ + cfl_subtract_average_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \ + cfl_subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \ + cfl_subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \ + cfl_subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. */ \ + return sub_avg[tx_size % TX_SIZES_ALL]; \ + } + +// For VSX SIMD optimization, the C versions of width == 4 subtract are +// faster than the VSX. As such, the VSX code calls the C versions. +void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); + +#define CFL_PREDICT_lbd(arch, width, height) \ + void cfl_predict_lbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \ + int alpha_q3) { \ + cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ + height); \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +#define CFL_PREDICT_hbd(arch, width, height) \ + void cfl_predict_hbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \ + int bd) { \ + cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ + height); \ + } +#endif + +// This wrapper exists because clang format does not like calling macros with +// lowercase letters. 
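+// Worked example of what the generated predict functions compute (values
+// chosen for illustration): with alpha_q3 = -2 (alpha = -0.25) and an AC luma
+// sample of 40 in Q3 (i.e. 5.0), get_scaled_luma_q0() returns
+// ROUND_POWER_OF_TWO_SIGNED(-2 * 40, 6) = -1, so the DC prediction at that
+// position is lowered by 1 before clipping.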
+#define CFL_PREDICT_X(arch, width, height, bd) \ + CFL_PREDICT_##bd(arch, width, height) + +#define CFL_PREDICT_FN(arch, bd) \ + CFL_PREDICT_X(arch, 4, 4, bd) \ + CFL_PREDICT_X(arch, 4, 8, bd) \ + CFL_PREDICT_X(arch, 4, 16, bd) \ + CFL_PREDICT_X(arch, 8, 4, bd) \ + CFL_PREDICT_X(arch, 8, 8, bd) \ + CFL_PREDICT_X(arch, 8, 16, bd) \ + CFL_PREDICT_X(arch, 8, 32, bd) \ + CFL_PREDICT_X(arch, 16, 4, bd) \ + CFL_PREDICT_X(arch, 16, 8, bd) \ + CFL_PREDICT_X(arch, 16, 16, bd) \ + CFL_PREDICT_X(arch, 16, 32, bd) \ + CFL_PREDICT_X(arch, 32, 8, bd) \ + CFL_PREDICT_X(arch, 32, 16, bd) \ + CFL_PREDICT_X(arch, 32, 32, bd) \ + cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \ + static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \ + cfl_predict_##bd##_4x4_##arch, /* 4x4 */ \ + cfl_predict_##bd##_8x8_##arch, /* 8x8 */ \ + cfl_predict_##bd##_16x16_##arch, /* 16x16 */ \ + cfl_predict_##bd##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_predict_##bd##_4x8_##arch, /* 4x8 */ \ + cfl_predict_##bd##_8x4_##arch, /* 8x4 */ \ + cfl_predict_##bd##_8x16_##arch, /* 8x16 */ \ + cfl_predict_##bd##_16x8_##arch, /* 16x8 */ \ + cfl_predict_##bd##_16x32_##arch, /* 16x32 */ \ + cfl_predict_##bd##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_predict_##bd##_4x16_##arch, /* 4x16 */ \ + cfl_predict_##bd##_16x4_##arch, /* 16x4 */ \ + cfl_predict_##bd##_8x32_##arch, /* 8x32 */ \ + cfl_predict_##bd##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ + /* index the function pointer array out of bounds. */ \ + return pred[tx_size % TX_SIZES_ALL]; \ + } + +#endif // AOM_AV1_COMMON_CFL_H_ diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h new file mode 100644 index 0000000000..ccb45b68ce --- /dev/null +++ b/third_party/aom/av1/common/common.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_COMMON_H_ +#define AOM_AV1_COMMON_COMMON_H_ + +/* Interface header for common constant data structures and lookup tables */ + +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom/aom_integer.h" +#include "aom_ports/bitops.h" +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Only need this for fixed-size arrays, for structs just assign. +#define av1_copy(dest, src) \ + do { \ + assert(sizeof(dest) == sizeof(src)); \ + memcpy(dest, src, sizeof(src)); \ + } while (0) + +// Use this for variably-sized arrays. 
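+// (Unlike av1_copy above, only the element sizes can be checked here; the
+// caller must make sure n does not exceed the destination's capacity, e.g.
+// av1_copy_array(dst_buf, src_buf, n), with dst_buf/src_buf as illustrative
+// names.)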
+#define av1_copy_array(dest, src, n) \ + do { \ + assert(sizeof(*(dest)) == sizeof(*(src))); \ + memcpy(dest, src, n * sizeof(*(src))); \ + } while (0) + +#define av1_zero(dest) memset(&(dest), 0, sizeof(dest)) +#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest))) + +static INLINE int get_unsigned_bits(unsigned int num_values) { + return num_values > 0 ? get_msb(num_values) + 1 : 0; +} + +#define CHECK_MEM_ERROR(cm, lval, expr) \ + AOM_CHECK_MEM_ERROR((cm)->error, lval, expr) + +#define AOM_FRAME_MARKER 0x2 + +#define AV1_MIN_TILE_SIZE_BYTES 1 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_COMMON_H_ diff --git a/third_party/aom/av1/common/common_data.c b/third_party/aom/av1/common/common_data.c new file mode 100644 index 0000000000..482aecfcc0 --- /dev/null +++ b/third_party/aom/av1/common/common_data.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common_data.h" + +// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual +// size function). +/* clang-format off */ +const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2] = { + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } }, + { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, + { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } }, + { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, + { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } }, + { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } }, + { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, + { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } }, + { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } }, + { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, + { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, + { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, + { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, + { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } }, + { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, + { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, + { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, + { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } +}; +/* clang-format on */ diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h new file mode 100644 index 0000000000..dfe927c6ef --- /dev/null +++ b/third_party/aom/av1/common/common_data.h @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_COMMON_DATA_H_ +#define AOM_AV1_COMMON_COMMON_DATA_H_ + +#include "av1/common/enums.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Log 2 conversion lookup tables in units of mode info (4x4). +// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4 +}; +// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = { + 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2 +}; + +// Width/height lookup tables in units of mode info (4x4). +// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = { + 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16 +}; + +// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = { + 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4 +}; + +// Width/height lookup tables in units of samples. +// The Block_Width table in the spec (Section 9.3. Conversion tables). +static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = { + 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, + 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64 +}; + +// The Block_Height table in the spec (Section 9.3. Conversion tables). +static const uint8_t block_size_high[BLOCK_SIZES_ALL] = { + 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, + 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16 +}; + +// Maps a block size to a context. +// The Size_Group table in the spec (Section 9.3. Conversion tables). +// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize))) +static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2 +}; + +static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = { + 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10 +}; + +// A compressed version of the Partition_Subsize table in the spec (9.3. +// Conversion tables), for square block sizes only. 
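+// (Example read: subsize_lookup[PARTITION_HORZ][index of BLOCK_32X32 among the
+// square sizes] is BLOCK_32X16, i.e. a horizontally split 32x32 yields two
+// 32x16 children. Only square blocks are partitioned further, hence the
+// SQR_BLOCK_SIZES second dimension.)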
+/* clang-format off */ +static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { + { // PARTITION_NONE + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, + BLOCK_32X32, BLOCK_64X64, BLOCK_128X128 + }, { // PARTITION_HORZ + BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_VERT + BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_SPLIT + BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8, + BLOCK_16X16, BLOCK_32X32, BLOCK_64X64 + }, { // PARTITION_HORZ_A + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_HORZ_B + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_VERT_A + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_VERT_B + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_HORZ_4 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4, + BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID + }, { // PARTITION_VERT_4 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, + BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID + } +}; + +static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = { + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X4, TX_4X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X8, TX_8X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X16, TX_16X16, TX_32X32, + // 32X64, 64X32, + TX_32X32, TX_32X32, + // 64X64 + TX_64X64, + // 64x128, 128x64, 128x128 + TX_64X64, TX_64X64, TX_64X64, + // 4x16, 16x4, 8x32 + TX_4X4, TX_4X4, TX_8X8, + // 32x8, 16x64 64x16 + TX_8X8, TX_16X16, TX_16X16 +}; + +static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = { + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X8, TX_8X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X16, TX_16X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X32, TX_32X16, TX_32X32, + // 32X64, 64X32, + TX_32X64, TX_64X32, + // 64X64 + TX_64X64, + // 64x128, 128x64, 128x128 + TX_64X64, TX_64X64, TX_64X64, + // 4x16, 16x4, + TX_4X16, TX_16X4, + // 8x32, 32x8 + TX_8X32, TX_32X8, + // 16x64, 64x16 + TX_16X64, TX_64X16 +}; + +static const TX_TYPE_1D vtx_tab[TX_TYPES] = { + DCT_1D, ADST_1D, DCT_1D, ADST_1D, + FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D, + DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D, +}; + +static const TX_TYPE_1D htx_tab[TX_TYPES] = { + DCT_1D, DCT_1D, ADST_1D, ADST_1D, + DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D, + IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, +}; + +#define TXSIZE_CAT_INVALID (-1) + +/* clang-format on */ + +static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_4X4, // TX_8X8 + TX_8X8, // TX_16X16 + TX_16X16, // TX_32X32 + TX_32X32, // TX_64X64 + TX_4X4, // TX_4X8 + TX_4X4, // TX_8X4 + TX_8X8, // TX_8X16 + TX_8X8, // TX_16X8 + TX_16X16, // TX_16X32 + TX_16X16, // TX_32X16 + TX_32X32, // TX_32X64 + TX_32X32, // TX_64X32 + TX_4X8, // TX_4X16 + TX_8X4, // TX_16X4 + TX_8X16, // TX_8X32 + TX_16X8, // TX_32X8 + TX_16X32, // TX_16X64 + TX_32X16, // TX_64X16 +}; + +static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_4X4, // TX_4X8 + TX_8X8, // TX_8X4 + TX_8X8, // TX_8X16 + TX_16X16, // TX_16X8 + TX_16X16, // TX_16X32 + TX_32X32, // TX_32X16 + TX_32X32, // TX_32X64 + TX_64X64, // TX_64X32 + TX_4X4, // TX_4X16 + TX_16X16, // TX_16X4 + TX_8X8, // TX_8X32 + TX_32X32, // 
TX_32X8 + TX_16X16, // TX_16X64 + TX_64X64, // TX_64X16 +}; + +static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_8X8, // TX_4X8 + TX_4X4, // TX_8X4 + TX_16X16, // TX_8X16 + TX_8X8, // TX_16X8 + TX_32X32, // TX_16X32 + TX_16X16, // TX_32X16 + TX_64X64, // TX_32X64 + TX_32X32, // TX_64X32 + TX_16X16, // TX_4X16 + TX_4X4, // TX_16X4 + TX_32X32, // TX_8X32 + TX_8X8, // TX_32X8 + TX_64X64, // TX_16X64 + TX_16X16, // TX_64X16 +}; + +#define TX_SIZE_W_MIN 4 + +// Transform block width in pixels +static const int tx_size_wide[TX_SIZES_ALL] = { + 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64, +}; + +#define TX_SIZE_H_MIN 4 + +// Transform block height in pixels +static const int tx_size_high[TX_SIZES_ALL] = { + 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16, +}; + +// Transform block width in unit +static const int tx_size_wide_unit[TX_SIZES_ALL] = { + 1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16, +}; + +// Transform block height in unit +static const int tx_size_high_unit[TX_SIZES_ALL] = { + 1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4, +}; + +// Transform block width in log2 +static const int tx_size_wide_log2[TX_SIZES_ALL] = { + 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6, +}; + +// Transform block width in log2 unit +static const int tx_size_wide_unit_log2[TX_SIZES_ALL] = { + 0, 1, 2, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 2, 1, 3, 2, 4, +}; + +// Transform block height in log2 +static const int tx_size_high_log2[TX_SIZES_ALL] = { + 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4, +}; + +// Transform block height in log2 unit +static const int tx_size_high_unit_log2[TX_SIZES_ALL] = { + 0, 1, 2, 3, 4, 1, 0, 2, 1, 3, 2, 4, 3, 2, 0, 3, 1, 4, 2, +}; + +static const int tx_size_2d[TX_SIZES_ALL + 1] = { + 16, 64, 256, 1024, 4096, 32, 32, 128, 128, 512, + 512, 2048, 2048, 64, 64, 256, 256, 1024, 1024, +}; + +static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = { + BLOCK_4X4, // TX_4X4 + BLOCK_8X8, // TX_8X8 + BLOCK_16X16, // TX_16X16 + BLOCK_32X32, // TX_32X32 + BLOCK_64X64, // TX_64X64 + BLOCK_4X8, // TX_4X8 + BLOCK_8X4, // TX_8X4 + BLOCK_8X16, // TX_8X16 + BLOCK_16X8, // TX_16X8 + BLOCK_16X32, // TX_16X32 + BLOCK_32X16, // TX_32X16 + BLOCK_32X64, // TX_32X64 + BLOCK_64X32, // TX_64X32 + BLOCK_4X16, // TX_4X16 + BLOCK_16X4, // TX_16X4 + BLOCK_8X32, // TX_8X32 + BLOCK_32X8, // TX_32X8 + BLOCK_16X64, // TX_16X64 + BLOCK_64X16, // TX_64X16 +}; + +static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_4X4, // TX_4X8 + TX_4X4, // TX_8X4 + TX_8X8, // TX_8X16 + TX_8X8, // TX_16X8 + TX_16X16, // TX_16X32 + TX_16X16, // TX_32X16 + TX_32X32, // TX_32X64 + TX_32X32, // TX_64X32 + TX_4X4, // TX_4X16 + TX_4X4, // TX_16X4 + TX_8X8, // TX_8X32 + TX_8X8, // TX_32X8 + TX_16X16, // TX_16X64 + TX_16X16, // TX_64X16 +}; + +static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_8X8, // TX_4X8 + TX_8X8, // TX_8X4 + TX_16X16, // TX_8X16 + TX_16X16, // TX_16X8 + TX_32X32, // TX_16X32 + TX_32X32, // TX_32X16 + TX_64X64, // TX_32X64 + TX_64X64, // TX_64X32 + TX_16X16, // TX_4X16 + TX_16X16, // TX_16X4 + TX_32X32, // TX_8X32 + TX_32X32, // TX_32X8 + TX_64X64, // TX_16X64 + TX_64X64, // 
TX_64X16 +}; + +static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = { + 0, // TX_4X4 + 2, // TX_8X8 + 4, // TX_16X16 + 6, // TX_32X32 + 6, // TX_64X64 + 1, // TX_4X8 + 1, // TX_8X4 + 3, // TX_8X16 + 3, // TX_16X8 + 5, // TX_16X32 + 5, // TX_32X16 + 6, // TX_32X64 + 6, // TX_64X32 + 2, // TX_4X16 + 2, // TX_16X4 + 4, // TX_8X32 + 4, // TX_32X8 + 5, // TX_16X64 + 5, // TX_64X16 +}; + +static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { + TX_4X4, // ONLY_4X4 + TX_64X64, // TX_MODE_LARGEST + TX_64X64, // TX_MODE_SELECT +}; + +// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual +// size function). +extern const BLOCK_SIZE av1_ss_size_lookup[BLOCK_SIZES_ALL][2][2]; + +// Generates 5 bit field in which each bit set to 1 represents +// a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16 +// and 8x8. 10000 means we just split the 128x128 to 64x64 +/* clang-format off */ +static const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES_ALL] = { + { 31, 31 }, // 4X4 - {0b11111, 0b11111} + { 31, 30 }, // 4X8 - {0b11111, 0b11110} + { 30, 31 }, // 8X4 - {0b11110, 0b11111} + { 30, 30 }, // 8X8 - {0b11110, 0b11110} + { 30, 28 }, // 8X16 - {0b11110, 0b11100} + { 28, 30 }, // 16X8 - {0b11100, 0b11110} + { 28, 28 }, // 16X16 - {0b11100, 0b11100} + { 28, 24 }, // 16X32 - {0b11100, 0b11000} + { 24, 28 }, // 32X16 - {0b11000, 0b11100} + { 24, 24 }, // 32X32 - {0b11000, 0b11000} + { 24, 16 }, // 32X64 - {0b11000, 0b10000} + { 16, 24 }, // 64X32 - {0b10000, 0b11000} + { 16, 16 }, // 64X64 - {0b10000, 0b10000} + { 16, 0 }, // 64X128- {0b10000, 0b00000} + { 0, 16 }, // 128X64- {0b00000, 0b10000} + { 0, 0 }, // 128X128-{0b00000, 0b00000} + { 31, 28 }, // 4X16 - {0b11111, 0b11100} + { 28, 31 }, // 16X4 - {0b11100, 0b11111} + { 30, 24 }, // 8X32 - {0b11110, 0b11000} + { 24, 30 }, // 32X8 - {0b11000, 0b11110} + { 28, 16 }, // 16X64 - {0b11100, 0b10000} + { 16, 28 }, // 64X16 - {0b10000, 0b11100} +}; +/* clang-format on */ + +static const int intra_mode_context[INTRA_MODES] = { + 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0, +}; + +// Note: this is also used in unit tests. So whenever one changes the table, +// the unit tests need to be changed accordingly. +static const int quant_dist_weight[4][2] = { + { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE } +}; + +static const int quant_dist_lookup_table[4][2] = { + { 9, 7 }, + { 11, 5 }, + { 12, 4 }, + { 13, 3 }, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_COMMON_DATA_H_ diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c new file mode 100644 index 0000000000..bb72e0cbd2 --- /dev/null +++ b/third_party/aom/av1/common/convolve.c @@ -0,0 +1,1508 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/resize.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn) { + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_qn = x0_qn; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; + const int x_filter_idx = + (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + assert(x_filter_idx <= RS_SUBPEL_MASK); + const int16_t *const x_filter = + &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; + int sum = 0; + for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_qn += x_step_qn; + } + src += src_stride; + dst += dst_stride; + } +} + +void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn, int bd) { + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_qn = x0_qn; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS]; + const int x_filter_idx = + (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + assert(x_filter_idx <= RS_SUBPEL_MASK); + const int16_t *const x_filter = + &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS]; + int sum = 0; + for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k) + sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_qn += x_step_qn; + } + src += src_stride; + dst += dst_stride; + } +} + +void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + + // TODO(aomedia:3393): for 12-tap filter, in extreme cases, the result can + // be beyond the following range. For better prediction, a clamping can be + // added for 12 tap filter to ensure the horizontal filtering result is + // within 16 bit. The same applies to the vertical filtering. 
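The precision bookkeeping in av1_convolve_2d_sr_c is easier to follow with concrete numbers. Below is a small editorial sketch, not part of the patch, that works through the default 8-bit single-reference case; FILTER_BITS (7) and ROUND0_BITS (3) are restated as plain constants so it stands alone.

```c
#include <assert.h>

// Editorial sketch, not part of the patch: the default rounding split for an
// 8-bit, single-reference block.
static void check_2d_sr_rounding_8bit(void) {
  const int bd = 8, filter_bits = 7, round_0 = 3;
  const int round_1 = 2 * filter_bits - round_0;           // 11 for non-compound
  const int bits = 2 * filter_bits - round_0 - round_1;    // 0: no final shift
  const int offset_bits = bd + 2 * filter_bits - round_0;  // 19
  // Horizontal stage: the 1 << (bd + FILTER_BITS - 1) offset keeps the 8-tap
  // sum in [0, 1 << 16), so im_block fits int16_t after >> round_0.
  assert((1 << (bd + filter_bits - 1)) == (1 << 14));
  // Vertical stage: the two offset terms removed before clip_pixel() equal
  // the bias accumulated by both passes, recentring the result around zero.
  assert((1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1)) ==
         (1 << 8) + (1 << 7));
  assert(bits == 0);
}
```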
+ assert(filter_params_x->taps > 8 || + (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(filter_params_y->taps > 8 || + (0 <= sum && sum < (1 << (offset_bits + 2)))); + int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } + } +} + +void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + const int fo_vert = filter_params_y->taps / 2 - 1; + + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS)); + } + } +} + +void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } + } +} + +// This function is exactly the same as av1_convolve_2d_sr_c, and is an +// optimized version for intrabc. 
Use the following 2-tap filter: +// DECLARE_ALIGNED(256, static const int16_t, +// av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { +// 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// }; +void av1_convolve_2d_sr_intrabc_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(subpel_x_qn == 8); + assert(subpel_y_qn == 8); + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)filter_params_y; + (void)subpel_y_qn; + (void)conv_params; + + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + const int bd = 8; + + // horizontal filter + // explicitly operate for subpel_x_qn = 8. + int16_t *im = im_block; + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + const int32_t sum = (1 << bd) + src[x] + src[x + 1]; + assert(0 <= sum && sum < (1 << (bd + 2))); + im[x] = sum; + } + src += src_stride; + im += im_stride; + } + + // vertical filter + // explicitly operate for subpel_y_qn = 8. + int16_t *src_vert = im_block; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + const int32_t sum = + (1 << (bd + 2)) + src_vert[x] + src_vert[im_stride + x]; + assert(0 <= sum && sum < (1 << (bd + 4))); + const int16_t res = + ROUND_POWER_OF_TWO(sum, 2) - ((1 << bd) + (1 << (bd - 1))); + dst[x] = clip_pixel(res); + } + src_vert += im_stride; + dst += dst_stride; + } +} + +// This function is exactly the same as av1_convolve_y_sr_c, and is an +// optimized version for intrabc. +void av1_convolve_y_sr_intrabc_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + assert(subpel_y_qn == 8); + assert(filter_params_y->taps == 2); + (void)filter_params_y; + (void)subpel_y_qn; + + // vertical filter + // explicitly operate for subpel_y_qn = 8. + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + const int32_t res = src[x] + src[src_stride + x]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1)); + } + src += src_stride; + dst += dst_stride; + } +} + +// This function is exactly the same as av1_convolve_x_sr_c, and is an +// optimized version for intrabc. +void av1_convolve_x_sr_intrabc_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + assert(subpel_x_qn == 8); + assert(filter_params_x->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + // horizontal filter + // explicitly operate for subpel_x_qn = 8. 
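For the IntraBC paths, subpel_x_qn == subpel_y_qn == 8 selects the {64, 64} phase of the bilinear kernel above, so each pass collapses to an unweighted average with round-to-nearest. A stand-alone editorial model of the horizontal case handled by the loop that follows (the function name is mine, not libaom's):

```c
#include <stdint.h>

// Editorial model, not part of the patch: with the {64, 64} kernel and
// subpel_x_qn == 8, av1_convolve_x_sr_intrabc_c reduces to averaging each
// pixel with its right neighbour, rounding to nearest.  No clipping is needed
// because the mean of two 8-bit values is already 8-bit.
static void intrabc_halfpel_x(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      dst[x] = (uint8_t)((src[x] + src[x + 1] + 1) >> 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
```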
+ for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + const int32_t res = src[x] + src[x + 1]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(res, 1)); + } + src += src_stride; + dst += dst_stride; + } +} + +void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(filter_params_x->taps > 8 || + (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(filter_params_y->taps > 8 || + (0 <= sum && sum < (1 << (offset_bits + 2)))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, 
subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, 
uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + for (int y = 0; y < im_h; ++y) { + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; + const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(x_filter_idx < SUBPEL_SHIFTS); + const int16_t *x_filter = + av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_x[k - fo_horiz]; + } + assert(filter_params_x->taps > 8 || + (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + src_horiz += src_stride; + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int x = 0; x < w; ++x) { + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(y_filter_idx < SUBPEL_SHIFTS); + const int16_t *y_filter = + av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; + } + assert(filter_params_y->taps > 8 || + (0 <= sum && sum < (1 << (offset_bits + 2)))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } + } + src_vert++; + } +} + +static void convolve_2d_scale_wrapper( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams 
*filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + if (conv_params->is_compound) { + assert(conv_params->dst != NULL); + } + av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn, + y_step_qn, conv_params); +} + +static void convolve_2d_facade_compound( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + const bool need_x = subpel_x_qn != 0; + const bool need_y = subpel_y_qn != 0; + if (!need_x && !need_y) { + av1_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h, + conv_params); + } else if (need_x && !need_y) { + av1_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); + } else if (!need_x && need_y) { + av1_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, conv_params); + } else { + assert(need_y && need_x); + av1_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + } +} + +static void convolve_2d_facade_single( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + const bool need_x = subpel_x_qn != 0; + const bool need_y = subpel_y_qn != 0; + if (!need_x && !need_y) { + aom_convolve_copy(src, src_stride, dst, dst_stride, w, h); + } else if (need_x && !need_y) { + av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x, + subpel_x_qn, conv_params); + } else if (!need_x && need_y) { + av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y, + subpel_y_qn); + } else { + assert(need_x && need_y); + av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); + } +} + +void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, + ConvolveParams *conv_params) { + (void)x_step_q4; + (void)y_step_q4; + (void)dst; + (void)dst_stride; + + const InterpFilterParams *filter_params_x = interp_filters[0]; + const InterpFilterParams *filter_params_y = interp_filters[1]; + + // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case. + // 2-tap filter indicates that it is for IntraBC. 
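Before the 2-tap special case below, it may help to see how a caller normally reaches this facade. A hedged usage sketch for a single-reference, 8-bit, unscaled half-pel block; get_conv_params() is declared in convolve.h later in this patch, and the InterpFilterParams pointers are assumed to come from the caller's filter selection rather than being built here.

```c
#include <stdint.h>

#include "av1/common/convolve.h"

// Editorial usage sketch, not part of the patch: single-reference, 8-bit,
// unscaled half-pel prediction through av1_convolve_2d_facade.
static void predict_halfpel_8bit(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filters_x,
                                 const InterpFilterParams *filters_y) {
  const InterpFilterParams *interp_filters[2] = { filters_x, filters_y };
  ConvolveParams conv_params =
      get_conv_params(/*do_average=*/0, /*plane=*/0, /*bd=*/8);
  av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
                         interp_filters, /*subpel_x_qn=*/8, /*x_step_q4=*/16,
                         /*subpel_y_qn=*/8, /*y_step_q4=*/16, /*scaled=*/0,
                         &conv_params);
}
```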
+ if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert(!scaled); + if (subpel_x_qn && subpel_y_qn) { + av1_convolve_2d_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params); + return; + } else if (subpel_x_qn) { + av1_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); + return; + } else if (subpel_y_qn) { + av1_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); + return; + } + } + + if (scaled) { + convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_q4, subpel_y_qn, y_step_q4, conv_params); + } else if (conv_params->is_compound) { + convolve_2d_facade_compound(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + } else { + convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); + } + } +} + +void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, int bd) { + const int fo_vert = filter_params_y->taps / 2 - 1; + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd); + } + } +} + +void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - 
conv_params->round_1; + assert(bits >= 0); + + // horizontal filter + const uint16_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(filter_params_x->taps > 8 || + (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); + im_block[y * im_stride + x] = + ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(filter_params_y->taps > 8 || + (0 <= sum && sum < (1 << (offset_bits + 2)))); + int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); + } + } +} + +// This function is exactly the same as av1_highbd_convolve_2d_sr_c, and is an +// optimized version for intrabc. Use the following 2-tap filter: +// DECLARE_ALIGNED(256, static const int16_t, +// av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { +// 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +// }; +void av1_highbd_convolve_2d_sr_intrabc_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + assert(subpel_x_qn == 8); + assert(subpel_y_qn == 8); + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + (void)filter_params_y; + (void)subpel_y_qn; + (void)conv_params; + + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + + // horizontal filter + // explicitly operate for subpel_x_qn = 8. + int16_t *im = im_block; + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)) + 64 * (src[x] + src[x + 1]); + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + sum = ROUND_POWER_OF_TWO(sum, conv_params->round_0); + im[x] = sum; + } + src += src_stride; + im += im_stride; + } + + // vertical filter + // explicitly operate for subpel_y_qn = 8. 
+ int16_t *src_vert = im_block; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + const int32_t sum = + (1 << offset_bits) + 64 * (src_vert[x] + src_vert[im_stride + x]); + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + const int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); + } + src_vert += im_stride; + dst += dst_stride; + } +} + +// This function is exactly the same as av1_highbd_convolve_y_sr_c, and is an +// optimized version for intrabc. +void av1_highbd_convolve_y_sr_intrabc_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + int bd) { + assert(subpel_y_qn == 8); + assert(filter_params_y->taps == 2); + (void)filter_params_y; + (void)subpel_y_qn; + + // vertical filter + // explicitly operate for subpel_y_qn = 8. + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + const int32_t res = src[x] + src[src_stride + x]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, 1), bd); + } + src += src_stride; + dst += dst_stride; + } +} + +// This function is exactly the same as av1_highbd_convolve_x_sr_c, and is an +// optimized version for intrabc. +void av1_highbd_convolve_x_sr_intrabc_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + const int bits = FILTER_BITS - conv_params->round_0; + assert(bits >= 0); + assert(subpel_x_qn == 8); + assert(filter_params_x->taps == 2); + assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS); + (void)filter_params_x; + (void)subpel_x_qn; + + // horizontal filter + // explicitly operate for subpel_x_qn = 8. 
+ for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 64 * (src[x] + src[x + 1]); + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); + } + src += src_stride; + dst += dst_stride; + } +} + +void av1_highbd_dist_wtd_convolve_2d_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + int x, y, k; + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + + // horizontal filter + const uint16_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (y = 0; y < im_h; ++y) { + for (x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(filter_params_x->taps > 8 || + (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); + (void)bd; + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(filter_params_y->taps > 8 || + (0 <= sum && sum < (1 << (offset_bits + 2)))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 
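All of the dist_wtd kernels, 8-bit and high bit depth alike, share one blending step: the first prediction is stored raw in the CONV_BUF_TYPE buffer, and the second either averages it equally or applies the distance weights (the quant_dist_lookup_table rows above, which sum to 16). A minimal editorial model, assuming DIST_PRECISION_BITS is 4 to match that sum:

```c
#include <stdint.h>

// Editorial model, not part of the patch: the blending step shared by the
// av1_*dist_wtd_convolve_* kernels.  fwd_offset + bck_offset is assumed to be
// 16 (one row of quant_dist_lookup_table) and DIST_PRECISION_BITS to be 4.
static int32_t dist_wtd_blend(int32_t first_pred, int32_t second_pred,
                              int fwd_offset, int bck_offset,
                              int use_dist_wtd_comp_avg) {
  if (use_dist_wtd_comp_avg) {
    return (first_pred * fwd_offset + second_pred * bck_offset) >> 4;
  }
  return (first_pred + second_pred) >> 1;
}
```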
+ assert(round_bits >= 0); + assert(bits >= 0); + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + assert(bits >= 0); + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, + int w, int h, + ConvolveParams *conv_params, + int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + assert(bits >= 0); + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = 
tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd) { + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + // horizontal filter + const uint16_t *src_horiz = src - fo_vert * src_stride; + for (int y = 0; y < im_h; ++y) { + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; + const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(x_filter_idx < SUBPEL_SHIFTS); + const int16_t *x_filter = + av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_x[k - fo_horiz]; + } + assert(filter_params_x->taps > 8 || + (0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + src_horiz += src_stride; + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int x = 0; x < w; ++x) { + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(y_filter_idx < SUBPEL_SHIFTS); + const int16_t *y_filter = + av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; + } + assert(filter_params_y->taps > 8 || + (0 <= sum && sum < (1 << (offset_bits + 2)))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + 
clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } + } + src_vert++; + } +} + +static void highbd_convolve_2d_facade_compound( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + const int w, const int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + const bool need_x = subpel_x_qn != 0; + const bool need_y = subpel_y_qn != 0; + if (!need_x && !need_y) { + av1_highbd_dist_wtd_convolve_2d_copy(src, src_stride, dst, dst_stride, w, h, + conv_params, bd); + } else if (need_x && !need_y) { + av1_highbd_dist_wtd_convolve_x(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, + bd); + } else if (!need_x && need_y) { + av1_highbd_dist_wtd_convolve_y(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, conv_params, + bd); + } else { + assert(need_x && need_y); + av1_highbd_dist_wtd_convolve_2d(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params, bd); + } +} + +static void highbd_convolve_2d_facade_single( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + const int w, const int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + const bool need_x = subpel_x_qn != 0; + const bool need_y = subpel_y_qn != 0; + + if (!need_x && !need_y) { + aom_highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h); + } else if (need_x && !need_y) { + av1_highbd_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, bd); + } else if (!need_x && need_y) { + av1_highbd_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + } else { + assert(need_x && need_y); + av1_highbd_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params, bd); + } +} + +void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, + uint8_t *dst8, int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, + int scaled, ConvolveParams *conv_params, + int bd) { + (void)x_step_q4; + (void)y_step_q4; + (void)dst_stride; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + + const InterpFilterParams *filter_params_x = interp_filters[0]; + const InterpFilterParams *filter_params_y = interp_filters[1]; + + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + // 2-tap filter indicates that it is for IntraBC. 
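A hedged usage sketch for the high-bit-depth facade: it receives uint8_t pointers that wrap uint16_t sample buffers, assuming the CONVERT_TO_BYTEPTR / CONVERT_TO_SHORTPTR macros from aom_ports/mem.h, and the filter parameters are again assumed to be supplied by the caller.

```c
#include <stdint.h>

#include "aom_ports/mem.h"       // CONVERT_TO_BYTEPTR (assumed location)
#include "av1/common/convolve.h"

// Editorial usage sketch, not part of the patch: 10-bit, single-reference,
// unscaled half-pel prediction.  The facade converts the wrapped pointers
// back with CONVERT_TO_SHORTPTR before filtering.
static void predict_halfpel_10bit(uint16_t *src16, int src_stride,
                                  uint16_t *dst16, int dst_stride, int w,
                                  int h, const InterpFilterParams *filters_x,
                                  const InterpFilterParams *filters_y) {
  const InterpFilterParams *interp_filters[2] = { filters_x, filters_y };
  ConvolveParams conv_params =
      get_conv_params(/*do_average=*/0, /*plane=*/0, /*bd=*/10);
  av1_highbd_convolve_2d_facade(CONVERT_TO_BYTEPTR(src16), src_stride,
                                CONVERT_TO_BYTEPTR(dst16), dst_stride, w, h,
                                interp_filters, /*subpel_x_qn=*/8,
                                /*x_step_q4=*/16, /*subpel_y_qn=*/8,
                                /*y_step_q4=*/16, /*scaled=*/0, &conv_params,
                                /*bd=*/10);
}
```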
+ if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert(!scaled); + if (subpel_x_qn && subpel_y_qn) { + av1_highbd_convolve_2d_sr_intrabc_c( + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); + return; + } else if (subpel_x_qn) { + av1_highbd_convolve_x_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, + conv_params, bd); + return; + } else if (subpel_y_qn) { + av1_highbd_convolve_y_sr_intrabc_c(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + return; + } + } + + if (scaled) { + if (conv_params->is_compound) { + assert(conv_params->dst != NULL); + } + av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_q4, subpel_y_qn, y_step_q4, conv_params, + bd); + } else if (conv_params->is_compound) { + highbd_convolve_2d_facade_compound( + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd); + } else { + highbd_convolve_2d_facade_single(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Note: Fixed size intermediate buffers, place limits on parameters +// of some functions. 2d filtering proceeds in 2 steps: +// (1) Interpolate horizontally into an intermediate buffer, temp. +// (2) Interpolate temp vertically to derive the sub-pixel result. +// Deriving the maximum number of rows in the temp buffer (135): +// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). +// --Largest block size is 128x128 pixels. +// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the +// original frame (in 1/16th pixel units). +// --Must round-up because block may be located at sub-pixel position. +// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. +// --((128 - 1) * 32 + 15) >> 4 + 8 = 263. +#define WIENER_MAX_EXT_SIZE 263 + +static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int highbd_horz_scalar_product(const uint16_t *a, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; +} +#endif + +static INLINE int highbd_vert_scalar_product(const uint16_t *a, + ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + return sum; +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. 
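The masking in the return statement that follows relies on the filter tables being 256-byte aligned: one kernel is 8 * sizeof(int16_t) = 16 bytes and a table holds 16 sub-pel phases, so a table is exactly 256 bytes. A small editorial illustration with a dummy table (not libaom's real filter data):

```c
#include <assert.h>
#include <stdint.h>

#include "aom_ports/mem.h"  // DECLARE_ALIGNED

// Editorial illustration, not part of the patch: clearing the low 8 address
// bits of a kernel pointer recovers the 256-byte-aligned table base (as in
// get_filter_base), and the remaining offset, measured in kernels, is the
// sub-pel phase (as in get_filter_offset).  kDummyFilters has arbitrary
// contents and only stands in for a real, aligned kernel table.
DECLARE_ALIGNED(256, static const int16_t, kDummyFilters[16][8]) = { { 128 } };

static int subpel_phase(const int16_t *filter) {
  const int16_t(*base)[8] =
      (const int16_t(*)[8])((intptr_t)filter & ~(intptr_t)0xFF);
  return (int)((const int16_t(*)[8])filter - base);
}

// For example, subpel_phase(kDummyFilters[5]) evaluates to 5.
```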
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, + int round0_bits) { + const int bd = 8; + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, + int round1_bits) { + const int bd = 8; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const WienerConvolveParams *conv_params) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, + x_step_q4, w, intermediate_height, + conv_params->round_0); + convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, conv_params->round_1); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_convolve_add_src_horiz_hip( + const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int round0_bits, int bd) { + const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = 
x0_q4; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + extraprec_clamp_limit - 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_add_src_vert_hip( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int round1_bits, int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void av1_highbd_wiener_convolve_add_src_c( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const WienerConvolveParams *conv_params, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + + highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, + x0_q4, x_step_q4, w, intermediate_height, + conv_params->round_0, bd); + highbd_convolve_add_src_vert_hip( + temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, + filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h new file mode 100644 index 0000000000..d6dd8763c3 --- /dev/null +++ b/third_party/aom/av1/common/convolve.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
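A hedged usage sketch of the Wiener path above: the coefficients are plain int16_t pointers that must point into 256-byte-aligned kernel tables (see get_filter_base), rounding comes from get_conv_params_wiener(), declared in convolve.h below, and av1_rtcd.h is assumed to be the generated header that declares the _c kernels, as in convolve.c.

```c
#include <stddef.h>
#include <stdint.h>

#include "config/av1_rtcd.h"      // prototype for av1_wiener_convolve_add_src_c
#include "av1/common/convolve.h"  // WienerConvolveParams, get_conv_params_wiener

// Editorial usage sketch, not part of the patch: an 8-bit Wiener restoration
// call.  Step values of 16 (1.0 in Q4) mean no scaling between the passes.
static void wiener_restore_8bit(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride, int w,
                                int h, const int16_t *filter_x,
                                const int16_t *filter_y) {
  const WienerConvolveParams wiener_params = get_conv_params_wiener(/*bd=*/8);
  av1_wiener_convolve_add_src_c(src, src_stride, dst, dst_stride, filter_x,
                                /*x_step_q4=*/16, filter_y, /*y_step_q4=*/16,
                                w, h, &wiener_params);
}
```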
+ */ + +#ifndef AOM_AV1_COMMON_CONVOLVE_H_ +#define AOM_AV1_COMMON_CONVOLVE_H_ +#include "av1/common/filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint16_t CONV_BUF_TYPE; +typedef struct ConvolveParams { + int do_average; + CONV_BUF_TYPE *dst; + int dst_stride; + int round_0; + int round_1; + int plane; + int is_compound; + int use_dist_wtd_comp_avg; + int fwd_offset; + int bck_offset; +} ConvolveParams; + +typedef struct WienerConvolveParams { + int round_0; + int round_1; +} WienerConvolveParams; + +#define ROUND0_BITS 3 +#define COMPOUND_ROUND1_BITS 7 +#define WIENER_ROUND0_BITS 3 + +#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0)) + +typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params); + +typedef void (*aom_highbd_convolve_fn_t)( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd); + +struct AV1Common; +struct scale_factors; + +void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, + ConvolveParams *conv_params); + +static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane, + CONV_BUF_TYPE *dst, + int dst_stride, + int is_compound, int bd) { + ConvolveParams conv_params; + assert(IMPLIES(cmp_index, is_compound)); + + conv_params.is_compound = is_compound; + conv_params.use_dist_wtd_comp_avg = 0; + conv_params.round_0 = ROUND0_BITS; + conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS + : 2 * FILTER_BITS - conv_params.round_0; +#if CONFIG_AV1_HIGHBITDEPTH + const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; + assert(IMPLIES(bd < 12, intbufrange <= 16)); + if (intbufrange > 16) { + conv_params.round_0 += intbufrange - 16; + if (!is_compound) conv_params.round_1 -= intbufrange - 16; + } +#else + (void)bd; +#endif // CONFIG_AV1_HIGHBITDEPTH + // TODO(yunqing): The following dst should only be valid while + // is_compound = 1; + conv_params.dst = dst; + conv_params.dst_stride = dst_stride; + conv_params.plane = plane; + + // By default, set do average to 1 if this is the second single prediction + // in a compound mode. 
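The cmp_index convention set by the assignment that follows is easiest to see from the caller's side. A hedged sketch of how the two single predictions of a compound block would set up their ConvolveParams; the function name and the 8-bit depth are illustrative only.

```c
#include "av1/common/convolve.h"

// Editorial usage sketch, not part of the patch: in a compound prediction the
// first reference (cmp_index 0) only fills the CONV_BUF_TYPE buffer, and the
// second (cmp_index 1) gets do_average = 1 so it blends with that buffer into
// the final pixels.
static void setup_compound_params(CONV_BUF_TYPE *tmp_dst, int tmp_stride,
                                  ConvolveParams *p0, ConvolveParams *p1) {
  *p0 = get_conv_params_no_round(/*cmp_index=*/0, /*plane=*/0, tmp_dst,
                                 tmp_stride, /*is_compound=*/1, /*bd=*/8);
  *p1 = get_conv_params_no_round(/*cmp_index=*/1, /*plane=*/0, tmp_dst,
                                 tmp_stride, /*is_compound=*/1, /*bd=*/8);
  // p0->do_average == 0, p1->do_average == 1.
}
```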
+ conv_params.do_average = cmp_index; + return conv_params; +} + +static INLINE ConvolveParams get_conv_params(int do_average, int plane, + int bd) { + return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd); +} + +static INLINE WienerConvolveParams get_conv_params_wiener(int bd) { + WienerConvolveParams conv_params; + conv_params.round_0 = WIENER_ROUND0_BITS; + conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0; + const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; + assert(IMPLIES(bd < 12, intbufrange <= 16)); + if (intbufrange > 16) { + conv_params.round_0 += intbufrange - 16; + conv_params.round_1 -= intbufrange - 16; + } + return conv_params; +} + +void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, + int scaled, ConvolveParams *conv_params, + int bd); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_CONVOLVE_H_ diff --git a/third_party/aom/av1/common/debugmodes.c b/third_party/aom/av1/common/debugmodes.c new file mode 100644 index 0000000000..7e6160f9a5 --- /dev/null +++ b/third_party/aom/av1/common/debugmodes.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdio.h> + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) { + fprintf(f, "%s", str); + fprintf(f, "(Frame %u, Show:%d, Q:%d): \n", cm->current_frame.frame_number, + cm->show_frame, cm->quant_params.base_qindex); +} +/* This function dereferences a pointer to the mbmi structure + * and uses the passed in member offset to print out the value of an integer + * for each mbmi member value in the mi structure.
+ */ +static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor, + size_t member_offset) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MB_MODE_INFO **mi = mi_params->mi_grid_base; + int rows = mi_params->mi_rows; + int cols = mi_params->mi_cols; + char prefix = descriptor[0]; + + log_frame_info(cm, descriptor, file); + for (int mi_row = 0; mi_row < rows; mi_row++) { + fprintf(file, "%c ", prefix); + for (int mi_col = 0; mi_col < cols; mi_col++) { + fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); + mi++; + } + fprintf(file, "\n"); + mi += mi_params->mi_stride - cols; + } + fprintf(file, "\n"); +} + +void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { + CommonModeInfoParams *mi_params = &cm->mi_params; + FILE *mvs = fopen(file, "a"); + MB_MODE_INFO **mi = mi_params->mi_grid_base; + const int rows = mi_params->mi_rows; + const int cols = mi_params->mi_cols; + + print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, bsize)); + print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); + print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); + print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); + print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); + + // output skip infomation. + log_frame_info(cm, "Skips:", mvs); + for (int mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs, "S "); + for (int mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%2d ", mi[0]->skip_txfm); + mi++; + } + fprintf(mvs, "\n"); + mi += mi_params->mi_stride - cols; + } + fprintf(mvs, "\n"); + + // output motion vectors. + log_frame_info(cm, "Vectors ", mvs); + mi = mi_params->mi_grid_base; + for (int mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs, "V "); + for (int mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col); + mi++; + } + fprintf(mvs, "\n"); + mi += mi_params->mi_stride - cols; + } + fprintf(mvs, "\n"); + + fclose(mvs); +} + +void av1_print_uncompressed_frame_header(const uint8_t *data, int size, + const char *filename) { + FILE *hdrFile = fopen(filename, "w"); + fwrite(data, size, sizeof(uint8_t), hdrFile); + + // Reset order hints(7bit + a previous bit) to 0, so that all camera frame + // headers are identical in large scale coding. + uint8_t zero = 0; + fseek(hdrFile, 1, SEEK_SET); + // Reset second byte. + fwrite(&zero, 1, sizeof(uint8_t), hdrFile); + fclose(hdrFile); +} + +void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) { + FILE *fcFile = fopen(filename, "w"); + const uint16_t *fcp = (uint16_t *)fc; + const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t); + unsigned int i; + + for (i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++); + fclose(fcFile); +} diff --git a/third_party/aom/av1/common/entropy.c b/third_party/aom/av1/common/entropy.c new file mode 100644 index 0000000000..97d95ea394 --- /dev/null +++ b/third_party/aom/av1/common/entropy.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/scan.h" +#include "av1/common/token_cdfs.h" +#include "av1/common/txb_common.h" + +static int get_q_ctx(int q) { + if (q <= 20) return 0; + if (q <= 60) return 1; + if (q <= 120) return 2; + return 3; +} + +void av1_default_coef_probs(AV1_COMMON *cm) { + const int index = get_q_ctx(cm->quant_params.base_qindex); +#if CONFIG_ENTROPY_STATS + cm->coef_cdf_category = index; +#endif + + av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]); + av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]); + av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]); + av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_eob_cdf, + av1_default_coeff_base_eob_multi_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]); +} + +static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, + int num_cdfs, int cdf_stride, + int nsymbs) { + for (int i = 0; i < num_cdfs; i++) { + cdf_ptr[i * cdf_stride + nsymbs] = 0; + } +} + +#define RESET_CDF_COUNTER(cname, nsymbs) \ + RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs)) + +#define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname; \ + int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \ + } while (0) + +static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) { + RESET_CDF_COUNTER(nmv->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES); + RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2); + } +} + +void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) { + RESET_CDF_COUNTER(fc->txb_skip_cdf, 2); + RESET_CDF_COUNTER(fc->eob_extra_cdf, 2); + RESET_CDF_COUNTER(fc->dc_sign_cdf, 2); + RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5); + RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6); + RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7); + RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8); + RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9); + RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10); + RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11); + RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3); + 
RESET_CDF_COUNTER(fc->coeff_base_cdf, 4); + RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE); + RESET_CDF_COUNTER(fc->newmv_cdf, 2); + RESET_CDF_COUNTER(fc->zeromv_cdf, 2); + RESET_CDF_COUNTER(fc->refmv_cdf, 2); + RESET_CDF_COUNTER(fc->drl_cdf, 2); + RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES); + RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16); + RESET_CDF_COUNTER(fc->interintra_cdf, 2); + RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2); + RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES); + RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES); + RESET_CDF_COUNTER(fc->obmc_cdf, 2); + RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES); + RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + } + RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2); + RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2); + RESET_CDF_COUNTER(fc->comp_inter_cdf, 2); + RESET_CDF_COUNTER(fc->single_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2); + RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2); + RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2); + RESET_CDF_COUNTER(fc->compound_index_cdf, 2); + RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2); + RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2); + RESET_CDF_COUNTER(fc->skip_txfm_cdfs, 2); + RESET_CDF_COUNTER(fc->intra_inter_cdf, 2); + reset_nmv_counter(&fc->nmvc); + reset_nmv_counter(&fc->ndvc); + RESET_CDF_COUNTER(fc->intrabc_cdf, 2); + RESET_CDF_COUNTER(fc->seg.pred_cdf, 2); + RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2); + RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES); + RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES); + RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2); + RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2); + RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES); + RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1, + CDF_SIZE(UV_INTRA_MODES)); + RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i < 4) { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10)); + } else if (i < 16) { + RESET_CDF_COUNTER(fc->partition_cdf[i], 10); + } else { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10)); + } + } + RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS); + RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES); + RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1); + RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1); + RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1); + } + RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES)); 
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS); + RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE); +} diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h new file mode 100644 index 0000000000..53ef3b1c89 --- /dev/null +++ b/third_party/aom/av1/common/entropy.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENTROPY_H_ +#define AOM_AV1_COMMON_ENTROPY_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" + +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define TOKEN_CDF_Q_CTXS 4 + +#define TXB_SKIP_CONTEXTS 13 + +#define EOB_COEF_CONTEXTS 9 + +#define SIG_COEF_CONTEXTS_2D 26 +#define SIG_COEF_CONTEXTS_1D 16 +#define SIG_COEF_CONTEXTS_EOB 4 +#define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D) + +#define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS) +#define DC_SIGN_CONTEXTS 3 + +#define BR_TMP_OFFSET 12 +#define BR_REF_CAT 4 +#define LEVEL_CONTEXTS 21 + +#define NUM_BASE_LEVELS 2 + +#define BR_CDF_SIZE (4) +#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1)) + +#define COEFF_CONTEXT_BITS 3 +#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1) +#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1) + +#define BASE_CONTEXT_POSITION_NUM 12 + +enum { + TX_CLASS_2D = 0, + TX_CLASS_HORIZ = 1, + TX_CLASS_VERT = 2, + TX_CLASSES = 3, +} UENUM1BYTE(TX_CLASS); + +#define DCT_MAX_VALUE 16384 +#define DCT_MAX_VALUE_HIGH10 65536 +#define DCT_MAX_VALUE_HIGH12 262144 + +/* Coefficients are predicted via a 3-dimensional probability table indexed on + * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. 
*/ +#define REF_TYPES 2 // intra=0, inter=1 + +struct AV1Common; +struct frame_contexts; +void av1_reset_cdf_symbol_counters(struct frame_contexts *fc); +void av1_default_coef_probs(struct AV1Common *cm); +void av1_init_mode_probs(struct frame_contexts *fc); + +struct frame_contexts; + +typedef char ENTROPY_CONTEXT; + +static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, + ENTROPY_CONTEXT b) { + return (a != 0) + (b != 0); +} + +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { + ENTROPY_CONTEXT above_ec = 0, left_ec = 0; + + switch (tx_size) { + case TX_4X4: + above_ec = a[0] != 0; + left_ec = l[0] != 0; + break; + case TX_4X8: + above_ec = a[0] != 0; + left_ec = !!*(const uint16_t *)l; + break; + case TX_8X4: + above_ec = !!*(const uint16_t *)a; + left_ec = l[0] != 0; + break; + case TX_8X16: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_16X8: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X32: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_32X16: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_8X8: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X16: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_32X32: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_64X64: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_32X64: + above_ec = !!*(const uint64_t *)a; + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_64X32: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!*(const uint64_t *)l; + break; + case TX_4X16: + above_ec = a[0] != 0; + left_ec = !!*(const uint32_t *)l; + break; + case TX_16X4: + above_ec = !!*(const uint32_t *)a; + left_ec = l[0] != 0; + break; + case TX_8X32: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_32X8: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X64: + above_ec = !!*(const uint32_t *)a; + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_64X16: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!*(const uint32_t *)l; + break; + default: assert(0 && "Invalid transform size."); break; + } + return combine_entropy_contexts(above_ec, left_ec); +} + +static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { + return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >> + 1); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPY_H_ diff --git a/third_party/aom/av1/common/entropymode.c b/third_party/aom/av1/common/entropymode.c new file mode 100644 index 0000000000..8381c1fdd0 --- /dev/null +++ b/third_party/aom/av1/common/entropymode.c @@ -0,0 +1,1094 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_mem/aom_mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/reconinter.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" +#include "av1/common/txb_common.h" + +static const aom_cdf_prob + default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE( + INTRA_MODES)] = { + { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244, + 24189, 28165, 29093, 30466) }, + { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032, + 24434, 28658, 30172, 31409) }, + { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620, + 26160, 29336, 29929, 31567) }, + { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096, + 24746, 29585, 30958, 32462) }, + { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583, + 26437, 30261, 31073, 32475) } }, + { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023, + 25381, 29014, 30482, 31436) }, + { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423, + 27610, 29905, 31276, 31794) }, + { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405, + 24469, 27915, 29090, 30492) }, + { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825, + 24649, 29153, 31096, 32210) }, + { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516, + 26001, 29675, 30981, 31994) } }, + { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055, + 25729, 29538, 30305, 32077) }, + { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062, + 23219, 27743, 29211, 30907) }, + { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555, + 30467, 30794, 32086) }, + { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523, + 23878, 28975, 30287, 32252) }, + { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561, + 30072, 30737, 32463) } }, + { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419, + 25060, 29696, 30917, 32409) }, + { AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468, + 25225, 29485, 31158, 32342) }, + { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605, + 29118, 30078, 32018) }, + { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743, + 30389, 31536, 32528) }, + { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718, + 25769, 29953, 30983, 32485) } }, + { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449, + 26219, 30214, 31150, 32477) }, + { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236, + 25380, 29653, 31143, 32277) }, + { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466, + 29900, 30523, 32261) }, + { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753, + 24615, 29489, 30883, 32482) }, + { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180, + 31355, 31802, 32593) } } + }; + +static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE( + 2 * MAX_ANGLE_DELTA + 1)] = { + { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) }, + { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) }, + { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) }, + { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) }, + { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) }, + { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) }, + { 
AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) }, + { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) } +}; + +static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123, + 26606, 27418, 27945, 29228, 29685, 30349) }, + { AOM_CDF13(18673, 19845, 22631, 23318, 23950, 24649, + 25527, 27364, 28152, 29701, 29984, 30852) }, + { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654, + 25136, 27073, 27830, 29360, 29730, 30659) }, + { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533, + 23703, 24804, 25352, 26575, 27016, 28049) } }; + +static const aom_cdf_prob + default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE( + UV_INTRA_MODES)] = { + { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, + 28244, 30059, 30941, 31961) }, + { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824, + 28359, 29505, 29800, 31796) }, + { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854, + 30764, 31777, 32029) }, + { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148, + 28577, 30612, 31355, 32493) }, + { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243, + 31101, 31744, 32363) }, + { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458, + 29711, 31161, 31441, 32550) }, + { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200, + 30245, 31837, 32342, 32667) }, + { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128, + 29267, 30643, 31961, 32461) }, + { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273, + 28443, 30388, 30767, 32416) }, + { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719, + 23174, 28861, 30379, 32175) }, + { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119, + 23527, 27053, 31397, 32148) }, + { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907, + 22482, 25896, 26541, 31819) }, + { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166, + 15255, 15753, 16039, 16606) } }, + { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656, + 15986, 20086, 20995, 22455, 24212) }, + { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451, + 22099, 24228, 24693, 27032, 29472) }, + { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774, + 23138, 24256, 24703, 26679) }, + { AOM_CDF14(6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371, + 21520, 22206, 23389, 24182) }, + { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411, + 24911, 25380, 26027, 26376) }, + { AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981, + 24780, 25386, 26517, 27176) }, + { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803, + 23188, 23763, 24455, 24940) }, + { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059, + 22336, 23204, 23964, 24793) }, + { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898, + 22494, 23139, 24764, 25989) }, + { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004, + 15534, 20714, 21789, 23443, 24861) }, + { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235, + 15902, 20102, 22696, 23774, 25838) }, + { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163, + 15636, 19676, 20474, 23519, 25208) }, + { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248, + 9875, 10521, 29048) } } + }; + +static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE( + 
EXT_PARTITION_TYPES)] = { + { AOM_CDF4(19132, 25510, 30392) }, + { AOM_CDF4(13928, 19855, 28540) }, + { AOM_CDF4(12522, 23679, 28629) }, + { AOM_CDF4(9896, 18783, 25853) }, + { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) }, + { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) }, + { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) }, + { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171) }, + { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) }, + { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) }, + { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) }, + { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) }, + { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) }, + { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) }, + { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) }, + { AOM_CDF10(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332) }, + { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, + { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) }, + { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) }, + { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) }, +}; + +static const aom_cdf_prob default_intra_ext_tx_cdf + [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = { + { + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + }, + { + { + { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) }, + { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) }, + { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) }, + { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) }, + { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) }, + { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) }, + { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) }, + { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) }, + { AOM_CDF7(1045, 4312, 8647, 10159, 18644, 29335) }, + { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) }, + { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) }, + { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) }, + { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) }, + }, + { + { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) }, + { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) }, + { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) }, + { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) }, + { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) }, + { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) }, + { AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) }, + { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) }, + { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) }, + { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) }, + { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) }, + { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) }, + { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) 
}, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + }, + }, + { + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(1127, 12814, 22772, 27483) }, + { AOM_CDF5(145, 6761, 11980, 26667) }, + { AOM_CDF5(362, 5887, 11678, 16725) }, + { AOM_CDF5(385, 15213, 18587, 30693) }, + { AOM_CDF5(25, 2914, 23134, 27903) }, + { AOM_CDF5(60, 4470, 11749, 23991) }, + { AOM_CDF5(37, 3332, 14511, 21448) }, + { AOM_CDF5(157, 6320, 13036, 17439) }, + { AOM_CDF5(119, 6719, 12906, 29396) }, + { AOM_CDF5(47, 5537, 12576, 21499) }, + { AOM_CDF5(269, 6076, 11258, 23115) }, + { AOM_CDF5(83, 5615, 12001, 17228) }, + { AOM_CDF5(1968, 5556, 12023, 18547) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 
19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + }, + }; + +static const aom_cdf_prob + default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE( + TX_TYPES)] = { + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, + 22848, 23934, 25474, 27727, 28915, 30631) }, + { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674, + 20408, 22517, 25010, 27116, 28856, 30749) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + }, + { + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595, + 28526, 30529) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(4167) }, + { AOM_CDF2(1998) }, + { AOM_CDF2(748) }, + }, + }; + +static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = { + AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294) +}; + +static const aom_cdf_prob + default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = { + { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700, + 32704, 32708, 32712, 32716, 32720, 32724) }, + { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620, + 32647, 32668, 32672, 32676, 32680, 32684) }, + { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673, + 32677, 32681, 32685, 32689, 32693, 32697) }, + { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708, + 32712, 32716, 32720, 32724, 32728, 32732) }, + { AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394, + 32464, 32516, 32560, 32576, 32593, 32622) }, + { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144, + 32413, 32520, 32594, 32622, 32656, 32660) } + }; + +static const aom_cdf_prob + default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE( + SWITCHABLE_FILTERS)] = { + { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) }, + { AOM_CDF3(422, 2938) }, { AOM_CDF3(28244, 32608) }, + { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) }, + { AOM_CDF3(770, 1152) }, { AOM_CDF3(20889, 25637) }, + { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) }, + { AOM_CDF3(305, 2247) }, { AOM_CDF3(27403, 32636) }, + { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) }, + { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) } + }; + +static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, + { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } }; + +static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } }; + +static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, + { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } }; + +static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) } +}; + +static const aom_cdf_prob + 
default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE( + INTER_COMPOUND_MODES)] = { + { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) }, + { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, + { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, + { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, + { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, + { AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, + { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, + { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) } + }; + +static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + 2)] = { { AOM_CDF2(16384) }, + { AOM_CDF2(26887) }, + { AOM_CDF2(27597) }, + { AOM_CDF2(30237) } }; + +static const aom_cdf_prob + default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(1875, 11082, 27332) }, + { AOM_CDF4(2473, 9996, 26388) }, + { AOM_CDF4(4238, 11537, 25926) } }; + +static const aom_cdf_prob + default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) }, + { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) }, + { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } + }; + +static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MASKED_COMPOUND_TYPES)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, + { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, + { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } +}; + +static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + 16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, + 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, + { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, + 17367, 18452, 19422, 22839, 26127, 29629) }, + { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, + 21332, 24520, 27470, 29456, 30529, 31656) }, + { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, + 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, + { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, + 16730, 18114, 19313, 22521, 26012, 29550) }, + { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, + 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, + { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, + 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 
16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, + 24284, 24985, 25684, 27259, 28883, 30911) }, + { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, + 25057, 27251, 29173, 30089, 30960, 31933) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) } }; + +static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) }, + { AOM_CDF3(4738, 24765) }, { AOM_CDF3(5391, 25528) }, + { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) }, + { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) }, + { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) }, + { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) }, + { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) }, + { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } }; + +static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(10437) }, { AOM_CDF2(9371) }, { AOM_CDF2(9301) }, + { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) }, + { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) }, + { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) }, + { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) }, + { AOM_CDF2(26879) } +}; + +static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(806) }, + { AOM_CDF2(16662) }, + { AOM_CDF2(20186) }, + { AOM_CDF2(26538) } + }; + +static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(26828) }, + { AOM_CDF2(24035) }, + { AOM_CDF2(12031) }, + { AOM_CDF2(10640) }, + { AOM_CDF2(2901) } }; + +static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(1198) }, + { AOM_CDF2(2070) }, + { AOM_CDF2(9166) }, + { AOM_CDF2(7499) }, + { AOM_CDF2(22475) } + }; + +static const aom_cdf_prob + default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - + 1][CDF_SIZE(2)] = { + { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } }, + { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } }, + { { 
AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } } + }; + +static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1] + [CDF_SIZE(2)] = { + { { AOM_CDF2(4897) }, + { AOM_CDF2(1555) }, + { AOM_CDF2(4236) }, + { AOM_CDF2(8650) }, + { AOM_CDF2(904) }, + { AOM_CDF2(1444) } }, + { { AOM_CDF2(16973) }, + { AOM_CDF2(16751) }, + { AOM_CDF2(19647) }, + { AOM_CDF2(24773) }, + { AOM_CDF2(11014) }, + { AOM_CDF2(15087) } }, + { { AOM_CDF2(29744) }, + { AOM_CDF2(30279) }, + { AOM_CDF2(31194) }, + { AOM_CDF2(31895) }, + { AOM_CDF2(26875) }, + { AOM_CDF2(30304) } } + }; + +static const aom_cdf_prob + default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(4946) }, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } }, + { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } }, + { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } } + }; + +static const aom_cdf_prob + default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } }, + { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } }, + { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } } + }; + +static const aom_cdf_prob + default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) }, + { AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) }, + { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) }, + { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) }, + { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) }, + { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) }, + { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) } + }; + +static const aom_cdf_prob + default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) }, + { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) }, + { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) }, + { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) }, + { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) }, + { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) }, + { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) } + }; + +static const aom_cdf_prob default_palette_y_mode_cdf + [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = { + { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } }, + { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } }, + { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } }, + { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } }, + { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } }, + { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } }, + { { AOM_CDF2(32450) }, { AOM_CDF2(7946) }, { AOM_CDF2(129) } } + }; + +static const aom_cdf_prob + default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(32461) }, { AOM_CDF2(21488) } + }; + +static const aom_cdf_prob default_palette_y_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { + { + { AOM_CDF2(28710) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10553) }, + { AOM_CDF2(27036) }, + { AOM_CDF2(31603) }, + }, + { + { AOM_CDF3(27877, 30490) }, + { AOM_CDF3(11532, 25697) }, + { AOM_CDF3(6544, 30234) }, + { AOM_CDF3(23018, 28072) }, + { AOM_CDF3(31915, 32385) }, + }, + { + { AOM_CDF4(25572, 28046, 30045) }, + { AOM_CDF4(9478, 21590, 27256) }, + { AOM_CDF4(7248, 26837, 29824) }, + { AOM_CDF4(19167, 24486, 28349) }, + { AOM_CDF4(31400, 31825, 32250) }, + }, + { + { AOM_CDF5(24779, 26955, 
28576, 30282) }, + { AOM_CDF5(8669, 20364, 24073, 28093) }, + { AOM_CDF5(4255, 27565, 29377, 31067) }, + { AOM_CDF5(19864, 23674, 26716, 29530) }, + { AOM_CDF5(31646, 31893, 32147, 32426) }, + }, + { + { AOM_CDF6(23132, 25407, 26970, 28435, 30073) }, + { AOM_CDF6(7443, 17242, 20717, 24762, 27982) }, + { AOM_CDF6(6300, 24862, 26944, 28784, 30671) }, + { AOM_CDF6(18916, 22895, 25267, 27435, 29652) }, + { AOM_CDF6(31270, 31550, 31808, 32059, 32353) }, + }, + { + { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) }, + { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) }, + { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) }, + { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) }, + { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) }, + }, + { + { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, + { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) }, + { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) }, + { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, + { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, + }, + }; + +static const aom_cdf_prob default_palette_uv_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { + { + { AOM_CDF2(29089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(8713) }, + { AOM_CDF2(29257) }, + { AOM_CDF2(31610) }, + }, + { + { AOM_CDF3(25257, 29145) }, + { AOM_CDF3(12287, 27293) }, + { AOM_CDF3(7033, 27960) }, + { AOM_CDF3(20145, 25405) }, + { AOM_CDF3(30608, 31639) }, + }, + { + { AOM_CDF4(24210, 27175, 29903) }, + { AOM_CDF4(9888, 22386, 27214) }, + { AOM_CDF4(5901, 26053, 29293) }, + { AOM_CDF4(18318, 22152, 28333) }, + { AOM_CDF4(30459, 31136, 31926) }, + }, + { + { AOM_CDF5(22980, 25479, 27781, 29986) }, + { AOM_CDF5(8413, 21408, 24859, 28874) }, + { AOM_CDF5(2257, 29449, 30594, 31598) }, + { AOM_CDF5(19189, 21202, 25915, 28620) }, + { AOM_CDF5(31844, 32044, 32281, 32518) }, + }, + { + { AOM_CDF6(22217, 24567, 26637, 28683, 30548) }, + { AOM_CDF6(7307, 16406, 19636, 24632, 28424) }, + { AOM_CDF6(4441, 25064, 26879, 28942, 30919) }, + { AOM_CDF6(17210, 20528, 23319, 26750, 29582) }, + { AOM_CDF6(30674, 30953, 31396, 31735, 32207) }, + }, + { + { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) }, + { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) }, + { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) }, + { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) }, + { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) }, + }, + { + { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, + { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) }, + { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) }, + { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, + { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, + }, + }; + +static const aom_cdf_prob + default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) }, + { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) }, + { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) }, + { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) }, + { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) }, + { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) }, + { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) } + }; + +static const aom_cdf_prob default_skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) } +}; + +static 
const aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } }; + +static const aom_cdf_prob + default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) }, + { AOM_CDF2(13259) }, { AOM_CDF2(9334) }, { AOM_CDF2(4644) } + }; + +static const aom_cdf_prob + default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) }, + { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) } + }; + +static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 30531) }; + +static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE( + FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) }; + +static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE( + 2)] = { { AOM_CDF2(4621) }, { AOM_CDF2(6743) }, { AOM_CDF2(5893) }, + { AOM_CDF2(7866) }, { AOM_CDF2(12551) }, { AOM_CDF2(9394) }, + { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) }, + { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) }, + { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }; + +static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE( + RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) }; + +static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 11570) }; + +static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 16855) }; + +static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = { + AOM_CDF4(28160, 32120, 32677) +}; + +static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE( + DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) } }; +static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = { + AOM_CDF4(28160, 32120, 32677) +}; + +static const aom_cdf_prob + default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = { + { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) } + }; + +static const aom_cdf_prob + default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE( + MAX_SEGMENTS)] = { + { + AOM_CDF8(5622, 7893, 16093, 18233, 27809, 28373, 32533), + }, + { + AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344), + }, + { + AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679), + }, + }; + +static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] + [CDF_SIZE(MAX_TX_DEPTH + 1)] = { + { { AOM_CDF2(19968) }, + { AOM_CDF2(19968) }, + { AOM_CDF2(24320) } }, + { { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(18677, 30848) } }, + { { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(24302, 25602) } }, + { { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(16803, 22759) } }, + }; + +// Negative values are invalid +const int av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1] = { + -1, -1, 0, -1, -1, 4, 3, 2, 1 +}; + +int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, + int r, int c, int palette_size, + uint8_t *color_order, int *color_idx) { + assert(palette_size <= PALETTE_MAX_SIZE); + 
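// Descriptive note (derived from the code below): the context is computed + // from the color indices of the left, top-left and top neighbors, weighted + // 2, 1 and 2 respectively. The sorted top scores are hashed with multipliers + // 1, 2 and 2 and mapped to a context via + // av1_palette_color_index_context_lookup. For example, neighbor scores + // (2, 2, 1) hash to 2*1 + 2*2 + 1*2 = 8, which the table maps to context 1. +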
assert(r > 0 || c > 0); + + // Get color indices of neighbors. + int color_neighbors[NUM_PALETTE_NEIGHBORS]; + color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1; + color_neighbors[1] = + (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1; + color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1; + + // The +10 below should not be needed. But we get a warning "array subscript + // is above array bounds [-Werror=array-bounds]" without it, possibly due to + // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 + int scores[PALETTE_MAX_SIZE + 10] = { 0 }; + int i; + static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + if (color_neighbors[i] >= 0) { + scores[color_neighbors[i]] += weights[i]; + } + } + + int inverse_color_order[PALETTE_MAX_SIZE]; + for (i = 0; i < PALETTE_MAX_SIZE; ++i) { + color_order[i] = i; + inverse_color_order[i] = i; + } + + // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small). + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + int max = scores[i]; + int max_idx = i; + for (int j = i + 1; j < palette_size; ++j) { + if (scores[j] > max) { + max = scores[j]; + max_idx = j; + } + } + if (max_idx != i) { + // Move the score at index 'max_idx' to index 'i', and shift the scores + // from 'i' to 'max_idx - 1' by 1. + const int max_score = scores[max_idx]; + const uint8_t max_color_order = color_order[max_idx]; + for (int k = max_idx; k > i; --k) { + scores[k] = scores[k - 1]; + color_order[k] = color_order[k - 1]; + inverse_color_order[color_order[k]] = k; + } + scores[i] = max_score; + color_order[i] = max_color_order; + inverse_color_order[color_order[i]] = i; + } + } + + if (color_idx != NULL) + *color_idx = inverse_color_order[color_map[r * stride + c]]; + + // Get hash value of context. + int color_index_ctx_hash = 0; + static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + color_index_ctx_hash += scores[i] * hash_multipliers[i]; + } + assert(color_index_ctx_hash > 0); + assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); + + // Lookup context from hash. 
+ const int color_index_ctx = + av1_palette_color_index_context_lookup[color_index_ctx_hash]; + assert(color_index_ctx >= 0); + assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); + return color_index_ctx; +} + +void av1_init_mode_probs(FRAME_CONTEXT *fc) { + av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf); + av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf); + av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf); + av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf); + av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf); + av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf); + av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf); + av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf); + av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf); + av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf); + av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf); + av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf); + av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf); + av1_copy(fc->single_ref_cdf, default_single_ref_cdf); + av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf); + av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs); + av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs); + av1_copy(fc->newmv_cdf, default_newmv_cdf); + av1_copy(fc->zeromv_cdf, default_zeromv_cdf); + av1_copy(fc->refmv_cdf, default_refmv_cdf); + av1_copy(fc->drl_cdf, default_drl_cdf); + av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf); + av1_copy(fc->obmc_cdf, default_obmc_cdf); + av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf); + av1_copy(fc->compound_type_cdf, default_compound_type_cdf); + av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf); + av1_copy(fc->interintra_cdf, default_interintra_cdf); + av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf); + av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf); + av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf); + av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs); + av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf); + av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf); + av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf); + av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf); + av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf); + av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf); + av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf); + av1_copy(fc->partition_cdf, default_partition_cdf); + av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf); + av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf); + av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs); + av1_copy(fc->skip_txfm_cdfs, default_skip_txfm_cdfs); + av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf); + for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++) + av1_copy(fc->seg.spatial_pred_seg_cdf[i], + default_spatial_pred_seg_tree_cdf[i]); + av1_copy(fc->tx_size_cdf, default_tx_size_cdf); + av1_copy(fc->delta_q_cdf, default_delta_q_cdf); + av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf); + av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf); + av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf); + av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf); + av1_copy(fc->intrabc_cdf, default_intrabc_cdf); +} + +void av1_set_default_ref_deltas(int8_t *ref_deltas) { + assert(ref_deltas != NULL); + + ref_deltas[INTRA_FRAME] = 1; + ref_deltas[LAST_FRAME] = 0; 
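+ // LAST2_FRAME, LAST3_FRAME and BWDREF_FRAME inherit the LAST_FRAME delta + // below; the golden and ALTREF references default to -1.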
+ ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[GOLDEN_FRAME] = -1; + ref_deltas[ALTREF2_FRAME] = -1; + ref_deltas[ALTREF_FRAME] = -1; +} + +void av1_set_default_mode_deltas(int8_t *mode_deltas) { + assert(mode_deltas != NULL); + + mode_deltas[0] = 0; + mode_deltas[1] = 0; +} + +static void set_default_lf_deltas(struct loopfilter *lf) { + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; + + av1_set_default_ref_deltas(lf->ref_deltas); + av1_set_default_mode_deltas(lf->mode_deltas); +} + +void av1_setup_frame_contexts(AV1_COMMON *cm) { + // Store the frame context into a special slot (not associated with any + // reference buffer), so that we can set up cm->pre_fc correctly later + // This function must ONLY be called when cm->fc has been initialized with + // default probs, either by av1_setup_past_independence or after manually + // initializing them + *cm->default_frame_context = *cm->fc; + // TODO(jack.haughton@argondesign.com): don't think this should be necessary, + // but could do with fuller testing + if (cm->tiles.large_scale) { + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + RefCntBuffer *const buf = get_ref_frame_buf(cm, i); + if (buf != NULL) buf->frame_context = *cm->fc; + } + for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) + cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc; + } +} + +void av1_setup_past_independence(AV1_COMMON *cm) { + // Reset the segment feature data to the default stats: + // Features disabled, 0, with delta coding (Default state). + av1_clearall_segfeatures(&cm->seg); + + if (cm->cur_frame->seg_map) { + memset(cm->cur_frame->seg_map, 0, + (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols)); + } + + // reset mode ref deltas + av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); + av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); + set_default_lf_deltas(&cm->lf); + + av1_default_coef_probs(cm); + av1_init_mode_probs(cm->fc); + av1_init_mv_probs(cm); + cm->fc->initialized = 1; + av1_setup_frame_contexts(cm); +} diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h new file mode 100644 index 0000000000..09cd6bd1e9 --- /dev/null +++ b/third_party/aom/av1/common/entropymode.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_ +#define AOM_AV1_COMMON_ENTROPYMODE_H_ + +#include "av1/common/entropy.h" +#include "av1/common/entropymv.h" +#include "av1/common/filter.h" +#include "av1/common/seg_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_SIZE_GROUPS 4 + +#define TX_SIZE_CONTEXTS 3 + +#define INTER_OFFSET(mode) ((mode)-NEARESTMV) +#define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV) + +// Number of possible contexts for a color index. +// As can be seen from av1_get_palette_color_index_context(), the possible +// contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). 
These are mapped to +// a value from 0 to 4 using 'av1_palette_color_index_context_lookup' table. +#define PALETTE_COLOR_INDEX_CONTEXTS 5 + +// Palette Y mode context for a block is determined by number of neighboring +// blocks (top and/or left) using a palette for Y plane. So, possible Y mode' +// context values are: +// 0 if neither left nor top block uses palette for Y plane, +// 1 if exactly one of left or top block uses palette for Y plane, and +// 2 if both left and top blocks use palette for Y plane. +#define PALETTE_Y_MODE_CONTEXTS 3 + +// Palette UV mode context for a block is determined by whether this block uses +// palette for the Y plane. So, possible values are: +// 0 if this block doesn't use palette for Y plane. +// 1 if this block uses palette for Y plane (i.e. Y palette size > 0). +#define PALETTE_UV_MODE_CONTEXTS 2 + +// Map the number of pixels in a block size to a context +// 64(BLOCK_8X8, BLOCK_4x16, BLOCK_16X4) -> 0 +// 128(BLOCK_8X16, BLOCK_16x8) -> 1 +// ... +// 4096(BLOCK_64X64) -> 6 +#define PALATTE_BSIZE_CTXS 7 + +#define MAX_COLOR_CONTEXT_HASH 8 + +#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top. + +#define KF_MODE_CONTEXTS 5 + +struct AV1Common; + +typedef struct { + const int16_t *scan; + const int16_t *iscan; +} SCAN_ORDER; + +typedef struct frame_contexts { + aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] + [CDF_SIZE(2)]; + aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)]; + aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)]; + aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)]; + aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)]; + aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)]; + aom_cdf_prob eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)]; + aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)]; + aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB] + [CDF_SIZE(3)]; + aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(4)]; + aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)]; + + aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]; + + aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS] + [CDF_SIZE(INTER_COMPOUND_MODES)]; + aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL] + [CDF_SIZE(MASKED_COMPOUND_TYPES)]; + aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]; + aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]; + aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS] + [CDF_SIZE(INTERINTRA_MODES)]; + aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]; + aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; + aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; + aom_cdf_prob 
palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS] + [CDF_SIZE(2)]; + aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob skip_txfm_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]; + nmv_context nmvc; + nmv_context ndvc; + aom_cdf_prob intrabc_cdf[CDF_SIZE(2)]; + struct segmentation_probs seg; + aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]; + aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]; + aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)]; + aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)]; + aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]; + aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES] + [CDF_SIZE(UV_INTRA_MODES)]; + aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)]; + aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS] + [CDF_SIZE(SWITCHABLE_FILTERS)]; + /* kf_y_cdf is discarded after use, so does not require persistent storage. + However, we keep it with the other CDFs in this struct since it needs to + be copied to each tile to support parallelism just like the others. 
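+     It is indexed by contexts derived from the intra modes of the above and
+     left neighboring blocks, and is only consulted when coding luma modes on
+     intra-only (e.g. key) frames.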
+ */ + aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS] + [CDF_SIZE(INTRA_MODES)]; + + aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES] + [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]; + + aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] + [CDF_SIZE(MAX_TX_DEPTH + 1)]; + aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)]; + aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)]; + aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)]; + aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [CDF_SIZE(TX_TYPES)]; + aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES] + [CDF_SIZE(TX_TYPES)]; + aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]; + aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)]; + int initialized; +} FRAME_CONTEXT; + +static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 }, + { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 }, + { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 }, +}; + +static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 }, + { 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8 }, +}; + +void av1_set_default_ref_deltas(int8_t *ref_deltas); +void av1_set_default_mode_deltas(int8_t *mode_deltas); +void av1_setup_frame_contexts(struct AV1Common *cm); +void av1_setup_past_independence(struct AV1Common *cm); + +// Returns (int)ceil(log2(n)). +static INLINE int av1_ceil_log2(int n) { + if (n < 2) return 0; + int i = 1; + unsigned int p = 2; + while (p < (unsigned int)n) { + i++; + p = p << 1; + } + return i; +} + +// Returns the context for palette color index at row 'r' and column 'c', +// along with the 'color_order' of neighbors and the 'color_idx'. +// The 'color_map' is a 2D array with the given 'stride'. +int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, + int r, int c, int palette_size, + uint8_t *color_order, int *color_idx); + +extern const int + av1_palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + 1]; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPYMODE_H_ diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c new file mode 100644 index 0000000000..e1e42f2f18 --- /dev/null +++ b/third_party/aom/av1/common/entropymv.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/entropymv.h" + +static const nmv_context default_nmv_context = { + { AOM_CDF4(4096, 11264, 19328) }, // joints_cdf + { { + // Vertical component + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // class_cdf // fp + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // hp_cdf + { AOM_CDF2(216 * 128) }, // class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf + }, + { + // Horizontal component + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // class_cdf // fp + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // hp_cdf + { AOM_CDF2(216 * 128) }, // class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf + } }, +}; + +void av1_init_mv_probs(AV1_COMMON *cm) { + // NB: this sets CDFs too + cm->fc->nmvc = default_nmv_context; + cm->fc->ndvc = default_nmv_context; +} diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h new file mode 100644 index 0000000000..cddc80768c --- /dev/null +++ b/third_party/aom/av1/common/entropymv.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_ENTROPYMV_H_ +#define AOM_AV1_COMMON_ENTROPYMV_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/prob.h" + +#include "av1/common/mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; + +void av1_init_mv_probs(struct AV1Common *cm); + +#define MV_UPDATE_PROB 252 + +/* Symbols for coding which components are zero jointly */ +#define MV_JOINTS 4 +enum { + MV_JOINT_ZERO = 0, /* Zero vector */ + MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ + MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ + MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ +} UENUM1BYTE(MV_JOINT_TYPE); + +static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { + return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; +} + +static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { + return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; +} + +/* Symbols for coding magnitude class of nonzero components */ +#define MV_CLASSES 11 +enum { + MV_CLASS_0 = 0, /* (0, 2] integer pel */ + MV_CLASS_1 = 1, /* (2, 4] integer pel */ + MV_CLASS_2 = 2, /* (4, 8] integer pel */ + MV_CLASS_3 = 3, /* (8, 16] integer pel */ + MV_CLASS_4 = 4, /* (16, 32] integer pel */ + MV_CLASS_5 = 5, /* (32, 64] integer pel */ + MV_CLASS_6 = 6, /* (64, 128] integer pel */ + MV_CLASS_7 = 7, /* (128, 256] integer pel */ + MV_CLASS_8 = 8, /* (256, 512] integer pel */ + MV_CLASS_9 = 9, /* (512, 1024] integer pel */ + MV_CLASS_10 = 10, /* (1024,2048] integer pel */ +} UENUM1BYTE(MV_CLASS_TYPE); + +#define CLASS0_BITS 1 /* bits at integer precision for class 0 */ +#define CLASS0_SIZE (1 << CLASS0_BITS) +#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) +#define MV_BITS_CONTEXTS 6 +#define MV_FP_SIZE 4 + +#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) +#define MV_MAX ((1 << MV_MAX_BITS) - 1) +#define MV_VALS ((MV_MAX << 1) + 1) + +#define MV_IN_USE_BITS 14 +#define MV_UPP (1 << MV_IN_USE_BITS) +#define MV_LOW (-(1 << MV_IN_USE_BITS)) + +typedef struct { + aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)]; + aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)]; + aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)]; + aom_cdf_prob sign_cdf[CDF_SIZE(2)]; + aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)]; + aom_cdf_prob hp_cdf[CDF_SIZE(2)]; + aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)]; + aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)]; +} nmv_component; + +typedef struct { + aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)]; + nmv_component comps[2]; +} nmv_context; + +enum { + MV_SUBPEL_NONE = -1, + MV_SUBPEL_LOW_PRECISION = 0, + MV_SUBPEL_HIGH_PRECISION, +} SENUM1BYTE(MvSubpelPrecision); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPYMV_H_ diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h new file mode 100644 index 0000000000..b99a138675 --- /dev/null +++ b/third_party/aom/av1/common/enums.h @@ -0,0 +1,651 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_ENUMS_H_ +#define AOM_AV1_COMMON_ENUMS_H_ + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! @file */ + +/*!\cond */ + +// Max superblock size +#define MAX_SB_SIZE_LOG2 7 +#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2) +#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE) + +// Min superblock size +#define MIN_SB_SIZE_LOG2 6 + +// Pixels per Mode Info (MI) unit +#define MI_SIZE_LOG2 2 +#define MI_SIZE (1 << MI_SIZE_LOG2) + +// MI-units per max superblock (MI Block - MIB) +#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2) +#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2) + +// MI-units per min superblock +#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2) + +// Mask to extract MI offset within max MIB +#define MAX_MIB_MASK (MAX_MIB_SIZE - 1) + +// Maximum number of tile rows and tile columns +#define MAX_TILE_ROWS 64 +#define MAX_TILE_COLS 64 + +#define MAX_VARTX_DEPTH 2 + +#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2) +#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2) + +#define MAX_PALETTE_SQUARE (64 * 64) +// Maximum number of colors in a palette. +#define PALETTE_MAX_SIZE 8 +// Minimum number of colors in a palette. +#define PALETTE_MIN_SIZE 2 + +#define FRAME_OFFSET_BITS 5 +#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1) + +// 4 frame filter levels: y plane vertical, y plane horizontal, +// u plane, and v plane +#define FRAME_LF_COUNT 4 +#define DEFAULT_DELTA_LF_MULTI 0 +#define MAX_MODE_LF_DELTAS 2 + +#define DIST_PRECISION_BITS 4 +#define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16 + +#define PROFILE_BITS 3 +// The following three profiles are currently defined. +// Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only. +// Profile 1. 8-bit and 10-bit 4:4:4 +// Profile 2. 8-bit and 10-bit 4:2:2 +// 12-bit 4:0:0, 4:2:2 and 4:4:4 +// Since we have three bits for the profiles, it can be extended later. +enum { + PROFILE_0, + PROFILE_1, + PROFILE_2, + MAX_PROFILES, +} SENUM1BYTE(BITSTREAM_PROFILE); + +#define OP_POINTS_CNT_MINUS_1_BITS 5 +#define OP_POINTS_IDC_BITS 12 + +// Note: Some enums use the attribute 'packed' to use smallest possible integer +// type, so that we can save memory when they are used in structs/arrays. + +typedef enum ATTRIBUTE_PACKED { + BLOCK_4X4, + BLOCK_4X8, + BLOCK_8X4, + BLOCK_8X8, + BLOCK_8X16, + BLOCK_16X8, + BLOCK_16X16, + BLOCK_16X32, + BLOCK_32X16, + BLOCK_32X32, + BLOCK_32X64, + BLOCK_64X32, + BLOCK_64X64, + BLOCK_64X128, + BLOCK_128X64, + BLOCK_128X128, + BLOCK_4X16, + BLOCK_16X4, + BLOCK_8X32, + BLOCK_32X8, + BLOCK_16X64, + BLOCK_64X16, + BLOCK_SIZES_ALL, + BLOCK_SIZES = BLOCK_4X16, + BLOCK_INVALID = 255, + BLOCK_LARGEST = (BLOCK_SIZES - 1) +} BLOCK_SIZE; + +// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 +#define SQR_BLOCK_SIZES 6 + +// Partition types. 
R: Recursive +// +// NONE HORZ VERT SPLIT +// +-------+ +-------+ +---+---+ +---+---+ +// | | | | | | | | R | R | +// | | +-------+ | | | +---+---+ +// | | | | | | | | R | R | +// +-------+ +-------+ +---+---+ +---+---+ +// +// HORZ_A HORZ_B VERT_A VERT_B +// +---+---+ +-------+ +---+---+ +---+---+ +// | | | | | | | | | | | +// +---+---+ +---+---+ +---+ | | +---+ +// | | | | | | | | | | | +// +-------+ +---+---+ +---+---+ +---+---+ +// +// HORZ_4 VERT_4 +// +-----+ +-+-+-+ +// +-----+ | | | | +// +-----+ | | | | +// +-----+ +-+-+-+ +enum { + PARTITION_NONE, + PARTITION_HORZ, + PARTITION_VERT, + PARTITION_SPLIT, + PARTITION_HORZ_A, // HORZ split and the top partition is split again + PARTITION_HORZ_B, // HORZ split and the bottom partition is split again + PARTITION_VERT_A, // VERT split and the left partition is split again + PARTITION_VERT_B, // VERT split and the right partition is split again + PARTITION_HORZ_4, // 4:1 horizontal partition + PARTITION_VERT_4, // 4:1 vertical partition + EXT_PARTITION_TYPES, + PARTITION_TYPES = PARTITION_SPLIT + 1, + PARTITION_INVALID = 255 +} UENUM1BYTE(PARTITION_TYPE); + +typedef char PARTITION_CONTEXT; +#define PARTITION_PLOFFSET 4 // number of probability models per block size +#define PARTITION_BLOCK_SIZES 5 +#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) + +#define TX_SIZE_LUMA_MIN (TX_4X4) +/* We don't need to code a transform size unless the allowed size is at least + one more than the minimum. */ +#define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1) + +// Maximum tx_size categories +#define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN) +#define MAX_TX_DEPTH 2 + +#define MAX_TX_SIZE_LOG2 (6) +#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2) +#define MIN_TX_SIZE_LOG2 2 +#define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2) +#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE) + +// Pad 4 extra columns to remove horizontal availability check. +#define TX_PAD_HOR_LOG2 2 +#define TX_PAD_HOR 4 +// Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability +// check. +#define TX_PAD_TOP 0 +#define TX_PAD_BOTTOM 4 +#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM) +// Pad 16 extra bytes to avoid reading overflow in SIMD optimization. 
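+// Taken together, the padding for a 32x32 base buffer works out to
+// (32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END = 36 * 36 + 16 = 1312
+// entries (TX_PAD_2D below).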
+#define TX_PAD_END 16 +#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END) + +// Number of maximum size transform blocks in the maximum size superblock +#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2) +#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2) + +// frame transform mode +enum { + ONLY_4X4, // use only 4x4 transform + TX_MODE_LARGEST, // transform size is the largest possible for pu size + TX_MODE_SELECT, // transform specified for each block + TX_MODES, +} UENUM1BYTE(TX_MODE); + +// 1D tx types +enum { + DCT_1D, + ADST_1D, + FLIPADST_1D, + IDTX_1D, + TX_TYPES_1D, +} UENUM1BYTE(TX_TYPE_1D); + +enum { + REG_REG, + REG_SMOOTH, + REG_SHARP, + SMOOTH_REG, + SMOOTH_SMOOTH, + SMOOTH_SHARP, + SHARP_REG, + SHARP_SMOOTH, + SHARP_SHARP, +} UENUM1BYTE(DUAL_FILTER_TYPE); + +#define EXT_TX_SIZES 4 // number of sizes that use extended transforms +#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER +#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA + +enum { + AOM_LAST_FLAG = 1 << 0, + AOM_LAST2_FLAG = 1 << 1, + AOM_LAST3_FLAG = 1 << 2, + AOM_GOLD_FLAG = 1 << 3, + AOM_BWD_FLAG = 1 << 4, + AOM_ALT2_FLAG = 1 << 5, + AOM_ALT_FLAG = 1 << 6, + AOM_REFFRAME_ALL = (1 << 7) - 1 +} UENUM1BYTE(AOM_REFFRAME); + +enum { + UNIDIR_COMP_REFERENCE, + BIDIR_COMP_REFERENCE, + COMP_REFERENCE_TYPES, +} UENUM1BYTE(COMP_REFERENCE_TYPE); + +enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE); + +#define CFL_ALPHABET_SIZE_LOG2 4 +#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) +#define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1) +#define CFL_INDEX_ZERO CFL_ALPHABET_SIZE +#define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) +#define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) + +enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE); + +enum { + CFL_SIGN_ZERO, + CFL_SIGN_NEG, + CFL_SIGN_POS, + CFL_SIGNS +} UENUM1BYTE(CFL_SIGN_TYPE); + +enum { + CFL_DISALLOWED, + CFL_ALLOWED, + CFL_ALLOWED_TYPES +} UENUM1BYTE(CFL_ALLOWED_TYPE); + +// CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid +#define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1) +// CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8 +#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5) +// CFL_SIGN_V is equivalent to (js + 1) % 3 for js in 0 to 8 +#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js)) + +// There is no context when the alpha for a given plane is zero. +// So there are 2 fewer contexts than joint signs. +#define CFL_ALPHA_CONTEXTS (CFL_JOINT_SIGNS + 1 - CFL_SIGNS) +#define CFL_CONTEXT_U(js) (js + 1 - CFL_SIGNS) +// Also, the contexts are symmetric under swapping the planes. +#define CFL_CONTEXT_V(js) \ + (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS) + +enum { + PALETTE_MAP, + COLOR_MAP_TYPES, +} UENUM1BYTE(COLOR_MAP_TYPE); + +enum { + TWO_COLORS, + THREE_COLORS, + FOUR_COLORS, + FIVE_COLORS, + SIX_COLORS, + SEVEN_COLORS, + EIGHT_COLORS, + PALETTE_SIZES +} UENUM1BYTE(PALETTE_SIZE); + +enum { + PALETTE_COLOR_ONE, + PALETTE_COLOR_TWO, + PALETTE_COLOR_THREE, + PALETTE_COLOR_FOUR, + PALETTE_COLOR_FIVE, + PALETTE_COLOR_SIX, + PALETTE_COLOR_SEVEN, + PALETTE_COLOR_EIGHT, + PALETTE_COLORS +} UENUM1BYTE(PALETTE_COLOR); + +// Note: All directional predictors must be between V_PRED and D67_PRED (both +// inclusive). 
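+// The nominal prediction angles are 90 (V_PRED), 180 (H_PRED) and 45, 135,
+// 113, 157, 203 and 67 degrees for the D*_PRED modes; finer angles are
+// signaled per block as a delta of up to MAX_ANGLE_DELTA steps of ANGLE_STEP
+// degrees around these nominal values.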
+enum { + DC_PRED, // Average of above and left pixels + V_PRED, // Vertical + H_PRED, // Horizontal + D45_PRED, // Directional 45 degree + D135_PRED, // Directional 135 degree + D113_PRED, // Directional 113 degree + D157_PRED, // Directional 157 degree + D203_PRED, // Directional 203 degree + D67_PRED, // Directional 67 degree + SMOOTH_PRED, // Combination of horizontal and vertical interpolation + SMOOTH_V_PRED, // Vertical interpolation + SMOOTH_H_PRED, // Horizontal interpolation + PAETH_PRED, // Predict from the direction of smallest gradient + NEARESTMV, + NEARMV, + GLOBALMV, + NEWMV, + // Compound ref compound modes + NEAREST_NEARESTMV, + NEAR_NEARMV, + NEAREST_NEWMV, + NEW_NEARESTMV, + NEAR_NEWMV, + NEW_NEARMV, + GLOBAL_GLOBALMV, + NEW_NEWMV, + MB_MODE_COUNT, + PRED_MODE_INVALID = MB_MODE_COUNT, + INTRA_MODE_START = DC_PRED, + INTRA_MODE_END = NEARESTMV, + DIR_MODE_START = V_PRED, + DIR_MODE_END = D67_PRED + 1, + INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START, + SINGLE_INTER_MODE_START = NEARESTMV, + SINGLE_INTER_MODE_END = NEAREST_NEARESTMV, + SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START, + COMP_INTER_MODE_START = NEAREST_NEARESTMV, + COMP_INTER_MODE_END = MB_MODE_COUNT, + COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START, + INTER_MODE_START = NEARESTMV, + INTER_MODE_END = MB_MODE_COUNT, + INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. + INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks +} UENUM1BYTE(PREDICTION_MODE); + +// TODO(ltrudeau) Do we really want to pack this? +// TODO(ltrudeau) Do we match with PREDICTION_MODE? +enum { + UV_DC_PRED, // Average of above and left pixels + UV_V_PRED, // Vertical + UV_H_PRED, // Horizontal + UV_D45_PRED, // Directional 45 degree + UV_D135_PRED, // Directional 135 degree + UV_D113_PRED, // Directional 113 degree + UV_D157_PRED, // Directional 157 degree + UV_D203_PRED, // Directional 203 degree + UV_D67_PRED, // Directional 67 degree + UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation + UV_SMOOTH_V_PRED, // Vertical interpolation + UV_SMOOTH_H_PRED, // Horizontal interpolation + UV_PAETH_PRED, // Predict from the direction of smallest gradient + UV_CFL_PRED, // Chroma-from-Luma + UV_INTRA_MODES, + UV_MODE_INVALID, // For uv_mode in inter blocks +} UENUM1BYTE(UV_PREDICTION_MODE); + +// Number of top model rd to store for pruning y modes in intra mode decision +#define TOP_INTRA_MODEL_COUNT 4 +// Total number of luma intra prediction modes (include both directional and +// non-directional modes) +// Because there are 8 directional modes, each has additional 6 delta angles. 
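+// That is, 13 base intra modes (DC_PRED through PAETH_PRED) plus 8 * 6 = 48
+// delta-angle variants, giving 61 luma modes in total.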
+#define LUMA_MODE_COUNT (PAETH_PRED - DC_PRED + 1 + 6 * 8) + +enum { + SIMPLE_TRANSLATION, + OBMC_CAUSAL, // 2-sided OBMC + WARPED_CAUSAL, // 2-sided WARPED + MOTION_MODES +} UENUM1BYTE(MOTION_MODE); + +enum { + II_DC_PRED, + II_V_PRED, + II_H_PRED, + II_SMOOTH_PRED, + INTERINTRA_MODES +} UENUM1BYTE(INTERINTRA_MODE); + +enum { + COMPOUND_AVERAGE, + COMPOUND_DISTWTD, + COMPOUND_WEDGE, + COMPOUND_DIFFWTD, + COMPOUND_TYPES, + MASKED_COMPOUND_TYPES = 2, +} UENUM1BYTE(COMPOUND_TYPE); + +enum { + FILTER_DC_PRED, + FILTER_V_PRED, + FILTER_H_PRED, + FILTER_D157_PRED, + FILTER_PAETH_PRED, + FILTER_INTRA_MODES, +} UENUM1BYTE(FILTER_INTRA_MODE); + +enum { + SEQ_LEVEL_2_0, + SEQ_LEVEL_2_1, + SEQ_LEVEL_2_2, + SEQ_LEVEL_2_3, + SEQ_LEVEL_3_0, + SEQ_LEVEL_3_1, + SEQ_LEVEL_3_2, + SEQ_LEVEL_3_3, + SEQ_LEVEL_4_0, + SEQ_LEVEL_4_1, + SEQ_LEVEL_4_2, + SEQ_LEVEL_4_3, + SEQ_LEVEL_5_0, + SEQ_LEVEL_5_1, + SEQ_LEVEL_5_2, + SEQ_LEVEL_5_3, + SEQ_LEVEL_6_0, + SEQ_LEVEL_6_1, + SEQ_LEVEL_6_2, + SEQ_LEVEL_6_3, + SEQ_LEVEL_7_0, + SEQ_LEVEL_7_1, + SEQ_LEVEL_7_2, + SEQ_LEVEL_7_3, + SEQ_LEVEL_8_0, + SEQ_LEVEL_8_1, + SEQ_LEVEL_8_2, + SEQ_LEVEL_8_3, + SEQ_LEVELS, + SEQ_LEVEL_MAX = 31, + SEQ_LEVEL_KEEP_STATS = 32, +} UENUM1BYTE(AV1_LEVEL); + +#define LEVEL_BITS 5 + +#define DIRECTIONAL_MODES 8 +#define MAX_ANGLE_DELTA 3 +#define ANGLE_STEP 3 + +#define INTER_MODES (1 + NEWMV - NEARESTMV) + +#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV) + +#define SKIP_CONTEXTS 3 +#define SKIP_MODE_CONTEXTS 3 + +#define COMP_INDEX_CONTEXTS 6 +#define COMP_GROUP_IDX_CONTEXTS 6 + +#define NMV_CONTEXTS 3 + +#define NEWMV_MODE_CONTEXTS 6 +#define GLOBALMV_MODE_CONTEXTS 2 +#define REFMV_MODE_CONTEXTS 6 +#define DRL_MODE_CONTEXTS 3 + +#define GLOBALMV_OFFSET 3 +#define REFMV_OFFSET 4 + +#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1) +#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1) +#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1) + +#define COMP_NEWMV_CTXS 5 +#define INTER_MODE_CONTEXTS 8 + +#define DELTA_Q_SMALL 3 +#define DELTA_Q_PROBS (DELTA_Q_SMALL) +#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4 +#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4 +#define DEFAULT_DELTA_Q_RES_DUCKY_ENCODE 4 + +#define DELTA_LF_SMALL 3 +#define DELTA_LF_PROBS (DELTA_LF_SMALL) +#define DEFAULT_DELTA_LF_RES 2 + +/* Segment Feature Masks */ +#define MAX_MV_REF_CANDIDATES 2 + +#define MAX_REF_MV_STACK_SIZE 8 +#define USABLE_REF_MV_STACK_SIZE 4 +#define REF_CAT_LEVEL 640 + +#define INTRA_INTER_CONTEXTS 4 +#define COMP_INTER_CONTEXTS 5 +#define REF_CONTEXTS 3 + +#define COMP_REF_TYPE_CONTEXTS 5 +#define UNI_COMP_REF_CONTEXTS 3 + +#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3) +typedef uint8_t TXFM_CONTEXT; + +// An enum for single reference types (and some derived values). +enum { + NONE_FRAME = -1, + INTRA_FRAME, + LAST_FRAME, + LAST2_FRAME, + LAST3_FRAME, + GOLDEN_FRAME, + BWDREF_FRAME, + ALTREF2_FRAME, + ALTREF_FRAME, + REF_FRAMES, + + // Extra/scratch reference frame. It may be: + // - used to update the ALTREF2_FRAME ref (see lshift_bwd_ref_frames()), or + // - updated from ALTREF2_FRAME ref (see rshift_bwd_ref_frames()). + EXTREF_FRAME = REF_FRAMES, + + // Number of inter (non-intra) reference types. + INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME + 1, + + // Number of forward (aka past) reference types. + FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1, + + // Number of backward (aka future) reference types. 
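+  // (BWDREF_FRAME, ALTREF2_FRAME and ALTREF_FRAME, i.e. 3; together with the
+  // 4 forward references this yields SINGLE_REFS = 7 below.)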
+ BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1, + + SINGLE_REFS = FWD_REFS + BWD_REFS, +}; + +#define REF_FRAMES_LOG2 3 + +// REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new +// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the +// encoder in the cpi->scaled_ref_buf array. +// The encoder uses FRAME_BUFFERS only in GOOD and REALTIME encoding modes. +// The decoder also uses FRAME_BUFFERS. +#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME) + +// During allintra encoding, one reference frame buffer is free to be used again +// only after another frame buffer is stored as the reference frame. Hence, it +// is necessary and sufficient to maintain only two reference frame buffers in +// this case. +#define FRAME_BUFFERS_ALLINTRA 2 + +#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME) +#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME) + +// Select all the decoded frame buffer slots +#define SELECT_ALL_BUF_SLOTS 0xFF + +enum { + LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME } + LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME } + LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME } + BWDREF_ALTREF_FRAMES, // { BWDREF_FRAME, ALTREF_FRAME } + LAST2_LAST3_FRAMES, // { LAST2_FRAME, LAST3_FRAME } + LAST2_GOLDEN_FRAMES, // { LAST2_FRAME, GOLDEN_FRAME } + LAST3_GOLDEN_FRAMES, // { LAST3_FRAME, GOLDEN_FRAME } + BWDREF_ALTREF2_FRAMES, // { BWDREF_FRAME, ALTREF2_FRAME } + ALTREF2_ALTREF_FRAMES, // { ALTREF2_FRAME, ALTREF_FRAME } + TOTAL_UNIDIR_COMP_REFS, + // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs + // that are explicitly signaled. + UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1, +} UENUM1BYTE(UNIDIR_COMP_REF); + +#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS) + +#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS) + +// NOTE: A limited number of unidirectional reference pairs can be signalled for +// compound prediction. The use of skip mode, on the other hand, makes it +// possible to have a reference pair not listed for explicit signaling. +#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS) + +// Note: It includes single and compound references. So, it can take values from +// NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum. +typedef int8_t MV_REFERENCE_FRAME; + +/*!\endcond */ + +/*!\enum RestorationType + * \brief This enumeration defines various restoration types supported + */ +typedef enum { + RESTORE_NONE, /**< No restoration */ + RESTORE_WIENER, /**< Separable Wiener restoration */ + RESTORE_SGRPROJ, /**< Selfguided restoration */ + RESTORE_SWITCHABLE, /**< Switchable restoration */ + RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, /**< Num Switchable types */ + RESTORE_TYPES = 4, /**< Num Restore types */ +} RestorationType; + +/*!\cond */ +// Picture prediction structures (0-13 are predefined) in scalability metadata. +enum { + SCALABILITY_L1T2 = 0, + SCALABILITY_L1T3 = 1, + SCALABILITY_L2T1 = 2, + SCALABILITY_L2T2 = 3, + SCALABILITY_L2T3 = 4, + SCALABILITY_S2T1 = 5, + SCALABILITY_S2T2 = 6, + SCALABILITY_S2T3 = 7, + SCALABILITY_L2T1h = 8, + SCALABILITY_L2T2h = 9, + SCALABILITY_L2T3h = 10, + SCALABILITY_S2T1h = 11, + SCALABILITY_S2T2h = 12, + SCALABILITY_S2T3h = 13, + SCALABILITY_SS = 14 +} UENUM1BYTE(SCALABILITY_STRUCTURES); + +#define SUPERRES_SCALE_BITS 3 +#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1) + +// In large_scale_tile coding, external references are used. 
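+// (This is the light-field / camera-array use case: individual tiles are
+// decoded against reference frames supplied by the application rather than
+// reconstructed from the bitstream.)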
+#define MAX_EXTERNAL_REFERENCES 128 +#define MAX_TILES 512 + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENUMS_H_ diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h new file mode 100644 index 0000000000..4344aea916 --- /dev/null +++ b/third_party/aom/av1/common/filter.h @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_FILTER_H_ +#define AOM_AV1_COMMON_FILTER_H_ + +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_FILTER_TAP 12 + +typedef enum ATTRIBUTE_PACKED { + EIGHTTAP_REGULAR, + EIGHTTAP_SMOOTH, + MULTITAP_SHARP, + BILINEAR, + // Encoder side only filters + MULTITAP_SHARP2, + + INTERP_FILTERS_ALL, + SWITCHABLE_FILTERS = BILINEAR, + SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */ + EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS, + INTERP_INVALID = 0xff, +} InterpFilter; + +enum { + USE_2_TAPS_ORIG = 0, // This is used in temporal filtering. + USE_2_TAPS, + USE_4_TAPS, + USE_8_TAPS, +} UENUM1BYTE(SUBPEL_SEARCH_TYPE); + +enum { + INTERP_EVAL_LUMA_EVAL_CHROMA = 0, + INTERP_SKIP_LUMA_EVAL_CHROMA, + INTERP_EVAL_INVALID, + INTERP_SKIP_LUMA_SKIP_CHROMA, +} UENUM1BYTE(INTERP_EVAL_PLANE); + +enum { + INTERP_HORZ_NEQ_VERT_NEQ = 0, + INTERP_HORZ_EQ_VERT_NEQ, + INTERP_HORZ_NEQ_VERT_EQ, + INTERP_HORZ_EQ_VERT_EQ, + INTERP_PRED_TYPE_ALL, +} UENUM1BYTE(INTERP_PRED_TYPE); +// Pack two InterpFilter's into a uint32_t: since there are at most 10 filters, +// we can use 16 bits for each and have more than enough space. This reduces +// argument passing and unifies the operation of setting a (pair of) filters. +typedef struct InterpFilters { + uint16_t y_filter; + uint16_t x_filter; +} InterpFilters; + +typedef union int_interpfilters { + uint32_t as_int; + InterpFilters as_filters; +} int_interpfilters; + +static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters, + int dir) { + return (InterpFilter)((dir) ? filters.as_filters.x_filter + : filters.as_filters.y_filter); +} + +static INLINE int_interpfilters +av1_broadcast_interp_filter(InterpFilter filter) { + int_interpfilters filters; + filters.as_filters.x_filter = filter; + filters.as_filters.y_filter = filter; + return filters; +} + +static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) { + return filter == SWITCHABLE ? 
EIGHTTAP_REGULAR : filter; +} + +/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ +#define LOG_SWITCHABLE_FILTERS 2 + +#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4) +#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1) +#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2) +#define ALLOW_ALL_INTERP_FILT_MASK (0x01ff) + +typedef struct InterpFilterParams { + const int16_t *filter_ptr; + uint16_t taps; + InterpFilter interp_filter; +} InterpFilterParams; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_bilinear_filters[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 }, + { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 }, + { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 }, + { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 }, + { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 }, + { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 }, + { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 }, + { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 }, + { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 }, + { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 }, + { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 }, + { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 }, + { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 }, + { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 }, + { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 }, + { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } +}; + +DECLARE_ALIGNED(256, static const int16_t, + av1_sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = { + { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }, + { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 }, + { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1 }, + { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1 }, + { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1 }, + { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2 }, + { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2 }, + { -2, 4, -7, 
13, -26, 91, 71, -24, 13, -7, 4, -2 }, + { -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2 }, + { -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2 }, + { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2 }, + { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2 }, + { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1 }, + { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1 }, + { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 }, + { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 } +}; + +static const InterpFilterParams + av1_interp_filter_params_list[INTERP_FILTERS_ALL] = { + { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, EIGHTTAP_REGULAR }, + { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS, + EIGHTTAP_SMOOTH }, + { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, + MULTITAP_SHARP }, + { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR }, + + // The following filters are for encoder only, and now they are used in + // temporal filtering. The predictor block size >= 16 in temporal filter. + { (const int16_t *)av1_sub_pel_filters_12sharp, 12, MULTITAP_SHARP2 }, + }; + +// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel +// MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV. +DECLARE_ALIGNED(256, static const int16_t, + av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { + 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const InterpFilterParams av1_intrabc_filter_params = { + av1_intrabc_bilinear_filter, 2, BILINEAR +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 }, + { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 }, + { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 }, + { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 }, + { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 }, + { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } +}; + +static const uint16_t + av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = { + { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG), + (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH), + (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) }, + { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP), + (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << SMOOTH_SHARP), + (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) } + }; + +// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR +static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = { + { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR }, + { 
(const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, + EIGHTTAP_SMOOTH }, + { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, EIGHTTAP_REGULAR }, + { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, BILINEAR }, +}; + +static INLINE const InterpFilterParams * +av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, + const int w) { + if (w <= 4 && interp_filter != MULTITAP_SHARP2) + return &av1_interp_4tap[interp_filter]; + return &av1_interp_filter_params_list[interp_filter]; +} + +static INLINE const int16_t *av1_get_interp_filter_kernel( + const InterpFilter interp_filter, int subpel_search) { + assert(subpel_search >= USE_2_TAPS); + return (subpel_search == USE_2_TAPS) + ? av1_interp_4tap[BILINEAR].filter_ptr + : ((subpel_search == USE_4_TAPS) + ? av1_interp_4tap[interp_filter].filter_ptr + : av1_interp_filter_params_list[interp_filter].filter_ptr); +} + +static INLINE const int16_t *av1_get_interp_filter_subpel_kernel( + const InterpFilterParams *const filter_params, const int subpel) { + return filter_params->filter_ptr + filter_params->taps * subpel; +} + +static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) { + assert(subpel_search >= USE_2_TAPS); + + switch (subpel_search) { + case USE_2_TAPS: return &av1_interp_4tap[BILINEAR]; + case USE_4_TAPS: return &av1_interp_4tap[EIGHTTAP_REGULAR]; + case USE_8_TAPS: return &av1_interp_filter_params_list[EIGHTTAP_REGULAR]; + default: assert(0); return NULL; + } +} + +static INLINE void reset_interp_filter_allowed_mask( + uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) { + uint16_t tmp = (~(1 << filt_type)) & 0xffff; + *allow_interp_mask &= (tmp & ALLOW_ALL_INTERP_FILT_MASK); +} + +static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask, + DUAL_FILTER_TYPE filt_type) { + *allow_interp_mask |= (1 << filt_type); +} + +static INLINE uint8_t get_interp_filter_allowed_mask( + uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) { + return (allow_interp_mask >> filt_type) & 1; +} + +static AOM_INLINE int get_filter_tap( + const InterpFilterParams *const filter_params, int subpel_qn) { + const int16_t *const filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_qn & SUBPEL_MASK); + if (filter_params->taps == 12) { + return 12; + } + if (filter[0] | filter[7]) { + return 8; + } + if (filter[1] | filter[6]) { + return 6; + } + if (filter[2] | filter[5]) { + return 4; + } + return 2; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_FILTER_H_ diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c new file mode 100644 index 0000000000..f10ccd5942 --- /dev/null +++ b/third_party/aom/av1/common/frame_buffers.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "av1/common/frame_buffers.h" +#include "aom_mem/aom_mem.h" + +int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + assert(list != NULL); + av1_free_internal_frame_buffers(list); + + list->num_internal_frame_buffers = + AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + list->int_fb = (InternalFrameBuffer *)aom_calloc( + list->num_internal_frame_buffers, sizeof(*list->int_fb)); + if (list->int_fb == NULL) { + list->num_internal_frame_buffers = 0; + return 1; + } + return 0; +} + +void av1_free_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + aom_free(list->int_fb[i].data); + list->int_fb[i].data = NULL; + } + aom_free(list->int_fb); + list->int_fb = NULL; + list->num_internal_frame_buffers = 0; +} + +void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + if (list->int_fb[i].data && !list->int_fb[i].in_use) + memset(list->int_fb[i].data, 0, list->int_fb[i].size); + } +} + +int av1_get_frame_buffer(void *cb_priv, size_t min_size, + aom_codec_frame_buffer_t *fb) { + int i; + InternalFrameBufferList *const int_fb_list = + (InternalFrameBufferList *)cb_priv; + if (int_fb_list == NULL) return -1; + + // Find a free frame buffer. + for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { + if (!int_fb_list->int_fb[i].in_use) break; + } + + if (i == int_fb_list->num_internal_frame_buffers) return -1; + + if (int_fb_list->int_fb[i].size < min_size) { + aom_free(int_fb_list->int_fb[i].data); + // The data must be zeroed to fix a valgrind error from the C loop filter + // due to access uninitialized memory in frame border. It could be + // skipped if border were totally removed. + int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size); + if (!int_fb_list->int_fb[i].data) { + int_fb_list->int_fb[i].size = 0; + return -1; + } + int_fb_list->int_fb[i].size = min_size; + } + + fb->data = int_fb_list->int_fb[i].data; + fb->size = int_fb_list->int_fb[i].size; + int_fb_list->int_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the internal frame buffer. + fb->priv = &int_fb_list->int_fb[i]; + return 0; +} + +int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) { + InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; + (void)cb_priv; + if (int_fb) int_fb->in_use = 0; + return 0; +} diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h new file mode 100644 index 0000000000..16188e51c7 --- /dev/null +++ b/third_party/aom/av1/common/frame_buffers.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_ +#define AOM_AV1_COMMON_FRAME_BUFFERS_H_ + +#include "aom/aom_frame_buffer.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void av1_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Zeros all unused internal frame buffers. In particular, this zeros the +// frame borders. Call this function after a sequence header change to +// re-initialize the frame borders for the different width, height, or bit +// depth. +void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libaom to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. +// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. +int av1_get_frame_buffer(void *cb_priv, size_t min_size, + aom_codec_frame_buffer_t *fb); + +// Callback used by libaom when there are no references to the frame buffer. +// |cb_priv| is not used. |fb| pointer to the frame buffer. +int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_ diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c new file mode 100644 index 0000000000..bff438f3c6 --- /dev/null +++ b/third_party/aom/av1/common/idct.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/mem.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" +#include "av1/common/idct.h" + +int av1_get_tx_scale(const TX_SIZE tx_size) { + const int pels = tx_size_2d[tx_size]; + // Largest possible pels is 4096 (64x64). + return (pels > 256) + (pels > 1024); +} + +// NOTE: The implementation of all inverses need to be aware of the fact +// that input and output could be the same buffer. 
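+// For reference, av1_get_tx_scale() above returns 0 for transform blocks of up
+// to 256 pels (e.g. 16x16), 1 for up to 1024 pels (e.g. 32x32) and 2 for the
+// larger sizes (e.g. 64x64), i.e. the extra downshift used for coefficients of
+// the larger transforms.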
+ +// idct +void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd) { + if (eob > 1) + av1_highbd_iwht4x4_16_add(input, dest, stride, bd); + else + av1_highbd_iwht4x4_1_add(input, dest, stride, bd); +} + +void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + + av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); +} + +void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_64x32_c(const 
tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); +} + +void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + assert(tx_type == DCT_DCT); + av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type, int eob, int reduced_tx_set, + TxfmParam *txfm_param) { + (void)plane; + txfm_param->tx_type = tx_type; + txfm_param->tx_size = tx_size; + txfm_param->eob = eob; + txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id]; + txfm_param->bd = xd->bd; + txfm_param->is_hbd = is_cur_buf_hbd(xd); + txfm_param->tx_set_type = av1_get_ext_tx_set_type( + txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); +} + +void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_32X32: + 
av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param); + break; + case TX_16X16: + av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param); + break; + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param); + break; + case TX_8X16: + av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); + break; + case TX_16X8: + av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); + break; + case TX_16X32: + av1_highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param); + break; + case TX_32X16: + av1_highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param); + break; + case TX_64X64: + av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); + break; + case TX_32X64: + av1_highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param); + break; + case TX_64X32: + av1_highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param); + break; + case TX_16X64: + av1_highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param); + break; + case TX_64X16: + av1_highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param); + break; + case TX_4X4: + // this is like av1_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. + av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param); + break; + case TX_8X32: + av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param); + break; + case TX_32X8: + av1_highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param); + break; + default: assert(0 && "Invalid transform size"); break; + } +} + +void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_SIZE tx_size = txfm_param->tx_size; + DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]); + int tmp_stride = MAX_TX_SIZE; + int w = tx_size_wide[tx_size]; + int h = tx_size_high[tx_size]; + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + tmp[r * tmp_stride + c] = dst[r * stride + c]; + } + } + + av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride, + txfm_param); + + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c]; + } + } +} + +void av1_inverse_transform_block(const MACROBLOCKD *xd, + const tran_low_t *dqcoeff, int plane, + TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, + int stride, int eob, int reduced_tx_set) { + if (!eob) return; + + assert(eob <= av1_get_max_eob(tx_size)); + + TxfmParam txfm_param; + init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set, + &txfm_param); + assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]); + + if (txfm_param.is_hbd) { + av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); + } else { + av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param); + } +} diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h new file mode 100644 index 0000000000..004d25d49a --- /dev/null +++ b/third_party/aom/av1/common/idct.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_IDCT_H_ +#define AOM_AV1_COMMON_IDCT_H_ + +#include "config/aom_config.h" + +#include "av1/common/blockd.h" +#include "av1/common/common.h" +#include "av1/common/enums.h" +#include "aom_dsp/txfm_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*transform_1d)(const tran_low_t *, tran_low_t *); + +typedef struct { + transform_1d cols, rows; // vertical and horizontal +} transform_2d; + +#define MAX_TX_SCALE 1 +int av1_get_tx_scale(const TX_SIZE tx_size); + +void av1_inverse_transform_block(const MACROBLOCKD *xd, + const tran_low_t *dqcoeff, int plane, + TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst, + int stride, int eob, int reduced_tx_set); +void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, + int eob, int bd); + +static INLINE const int32_t *cast_to_int32(const tran_low_t *input) { + assert(sizeof(int32_t) == sizeof(tran_low_t)); + return (const int32_t *)input; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_IDCT_H_ diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h new file mode 100644 index 0000000000..6828834e05 --- /dev/null +++ b/third_party/aom/av1/common/mv.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_MV_H_ +#define AOM_AV1_COMMON_MV_H_ + +#include + +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/flow_estimation/flow_estimation.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define INVALID_MV 0x80008000 +#define INVALID_MV_ROW_COL -32768 +#define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3) +#define GET_MV_SUBPEL(x) ((x)*8) + +#define MARK_MV_INVALID(mv) \ + do { \ + ((int_mv *)(mv))->as_int = INVALID_MV; \ + } while (0) +#define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && ((x).col == (y).col)) + +// The motion vector in units of full pixel +typedef struct fullpel_mv { + int16_t row; + int16_t col; +} FULLPEL_MV; + +// The motion vector in units of 1/8-pel +typedef struct mv { + int16_t row; + int16_t col; +} MV; + +static const MV kZeroMv = { 0, 0 }; +static const FULLPEL_MV kZeroFullMv = { 0, 0 }; + +typedef union int_mv { + uint32_t as_int; + MV as_mv; + FULLPEL_MV as_fullmv; +} int_mv; /* facilitates faster equality tests and copies */ + +typedef struct mv32 { + int32_t row; + int32_t col; +} MV32; + +// The mv limit for fullpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} FullMvLimits; + +// The mv limit for subpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} SubpelMvLimits; + +static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) { + const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row), + (int16_t)GET_MV_RAWPEL(subpel_mv->col) }; + return full_mv; +} + +static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) { + const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row), + (int16_t)GET_MV_SUBPEL(full_mv->col) }; + return subpel_mv; +} + +static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) { + mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv); +} + +// Bits of precision used for the model +#define WARPEDMODEL_PREC_BITS 16 + +#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS) +#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3)) + +// Bits of subpel precision for warped interpolation +#define WARPEDPIXEL_PREC_BITS 6 +#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS) + +#define WARP_PARAM_REDUCE_BITS 6 + +#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) + +typedef struct { + int global_warp_allowed; + int local_warp_allowed; +} WarpTypesAllowed; + +// The order of values in the wmmat matrix below is best described +// by the affine transformation: +// [x' (m2 m3 m0 [x +// z . y' = m4 m5 m1 * y +// 1] 0 0 1) 1] +typedef struct { + int32_t wmmat[MAX_PARAMDIM]; + int16_t alpha, beta, gamma, delta; + TransformationType wmtype; + int8_t invalid; +} WarpedMotionParams; + +/* clang-format off */ +static const WarpedMotionParams default_warp_params = { + { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS) }, + 0, 0, 0, 0, + IDENTITY, + 0, +}; +/* clang-format on */ + +// The following constants describe the various precisions +// of different parameters in the global motion experiment. +// +// Given the general homography: +// [x' (a b c [x +// z . y' = d e f * y +// 1] g h i) 1] +// +// Constants using the name ALPHA here are related to parameters +// a, b, d, e. Constants using the name TRANS are related +// to parameters c and f. +// +// Anything ending in PREC_BITS is the number of bits of precision +// to maintain when converting from double to integer. 
+// +// The ABS parameters are used to create an upper and lower bound +// for each parameter. In other words, after a parameter is integerized +// it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS). +// +// XXX_PREC_DIFF and XXX_DECODE_FACTOR +// are computed once here to prevent repetitive +// computation on the decoder side. These are +// to allow the global motion parameters to be encoded in a lower +// precision than the warped model precision. This means that they +// need to be changed to warped precision when they are decoded. +// +// XX_MIN, XX_MAX are also computed to avoid repeated computation + +#define SUBEXPFIN_K 3 +#define GM_TRANS_PREC_BITS 6 +#define GM_ABS_TRANS_BITS 12 +#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3) +#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS) +#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3) +#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF) +#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF) + +#define GM_ALPHA_PREC_BITS 15 +#define GM_ABS_ALPHA_BITS 12 +#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS) +#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF) + +#define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS) +#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS) + +#define GM_TRANS_MIN -GM_TRANS_MAX +#define GM_ALPHA_MIN -GM_ALPHA_MAX + +static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) { + const int bw = block_size_wide[bs]; + return mi_col * MI_SIZE + bw / 2 - 1; +} + +static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) { + const int bh = block_size_high[bs]; + return mi_row * MI_SIZE + bh / 2 - 1; +} + +static INLINE int convert_to_trans_prec(int allow_hp, int coor) { + if (allow_hp) + return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3); + else + return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2; +} +static INLINE void integer_mv_precision(MV *mv) { + int mod = (mv->row % 8); + if (mod != 0) { + mv->row -= mod; + if (abs(mod) > 4) { + if (mod > 0) { + mv->row += 8; + } else { + mv->row -= 8; + } + } + } + + mod = (mv->col % 8); + if (mod != 0) { + mv->col -= mod; + if (abs(mod) > 4) { + if (mod > 0) { + mv->col += 8; + } else { + mv->col -= 8; + } + } + } +} +// Convert a global motion vector into a motion vector at the centre of the +// given block. +// +// The resulting motion vector will have three fractional bits of precision. If +// allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and +// is_integer is true, the bottom three bits will be zero (so the motion vector +// represents an integer) +static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, + int allow_hp, BLOCK_SIZE bsize, + int mi_col, int mi_row, + int is_integer) { + int_mv res; + + if (gm->wmtype == IDENTITY) { + res.as_int = 0; + return res; + } + + const int32_t *mat = gm->wmmat; + int x, y, tx, ty; + + if (gm->wmtype == TRANSLATION) { + // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16) + // bits of fractional precision. The offset for a translation is stored in + // entries 0 and 1. For translations, all but the top three (two if + // cm->features.allow_high_precision_mv is false) fractional bits are always + // zero. + // + // After the right shifts, there are 3 fractional bits of precision. 
If + // allow_hp is false, the bottom bit is always zero (so we don't need a + // call to convert_to_trans_prec here) + // + // Note: There is an AV1 specification bug here: + // + // gm->wmmat[0] is supposed to be the horizontal translation, and so should + // go into res.as_mv.col, and gm->wmmat[1] is supposed to be the vertical + // translation and so should go into res.as_mv.row + // + // However, in the spec, these assignments are accidentally reversed, and so + // we must keep this incorrect logic to match the spec. + // + // See also: https://crbug.com/aomedia/3328 + res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF; + res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF; + assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp)); + if (is_integer) { + integer_mv_precision(&res.as_mv); + } + return res; + } + + x = block_center_x(mi_col, bsize); + y = block_center_y(mi_row, bsize); + + if (gm->wmtype == ROTZOOM) { + assert(gm->wmmat[5] == gm->wmmat[2]); + assert(gm->wmmat[4] == -gm->wmmat[3]); + } + + const int xc = + (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0]; + const int yc = + mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1]; + tx = convert_to_trans_prec(allow_hp, xc); + ty = convert_to_trans_prec(allow_hp, yc); + + res.as_mv.row = ty; + res.as_mv.col = tx; + + if (is_integer) { + integer_mv_precision(&res.as_mv); + } + return res; +} + +static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) { + if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && + gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { + return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION); + } + if (gm->wmmat[2] == gm->wmmat[5] && gm->wmmat[3] == -gm->wmmat[4]) + return ROTZOOM; + else + return AFFINE; +} + +typedef struct candidate_mv { + int_mv this_mv; + int_mv comp_mv; +} CANDIDATE_MV; + +static INLINE int is_zero_mv(const MV *mv) { + return *((const uint32_t *)mv) == 0; +} + +static INLINE int is_equal_mv(const MV *a, const MV *b) { + return *((const uint32_t *)a) == *((const uint32_t *)b); +} + +static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_MV_H_ diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c new file mode 100644 index 0000000000..d8889f3eb3 --- /dev/null +++ b/third_party/aom/av1/common/mvref_common.c @@ -0,0 +1,1501 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "av1/common/mvref_common.h" +#include "av1/common/warped_motion.h" + +// Although we assign 32 bit integers, all the values are strictly under 14 +// bits. +static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, + 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092, + 1024, 963, 910, 862, 819, 780, 744, 712, + 682, 655, 630, 606, 585, 564, 546, 528 }; + +// TODO(jingning): Consider the use of lookup table for (num / den) +// altogether. +static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) { + den = AOMMIN(den, MAX_FRAME_DISTANCE); + num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE) + : AOMMAX(num, -MAX_FRAME_DISTANCE); + const int mv_row = + ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); + const int mv_col = + ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); + const int clamp_max = MV_UPP - 1; + const int clamp_min = MV_LOW + 1; + output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max); + output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max); +} + +void av1_copy_frame_mvs(const AV1_COMMON *const cm, + const MB_MODE_INFO *const mi, int mi_row, int mi_col, + int x_mis, int y_mis) { + const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); + MV_REF *frame_mvs = + cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); + x_mis = ROUND_POWER_OF_TWO(x_mis, 1); + y_mis = ROUND_POWER_OF_TWO(y_mis, 1); + int w, h; + + for (h = 0; h < y_mis; h++) { + MV_REF *mv = frame_mvs; + for (w = 0; w < x_mis; w++) { + mv->ref_frame = NONE_FRAME; + mv->mv.as_int = 0; + + for (int idx = 0; idx < 2; ++idx) { + MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx]; + if (ref_frame > INTRA_FRAME) { + int8_t ref_idx = cm->ref_frame_side[ref_frame]; + if (ref_idx) continue; + if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) || + (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT)) + continue; + mv->ref_frame = ref_frame; + mv->mv.as_int = mi->mv[idx].as_int; + } + } + mv++; + } + frame_mvs += frame_mvs_stride; + } +} + +static AOM_INLINE void add_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], + uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, + CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params, + uint16_t weight) { + if (!is_inter_block(candidate)) return; + assert(weight % 2 == 0); + int index, ref; + + if (rf[1] == NONE_FRAME) { + // single reference frame + for (ref = 0; ref < 2; ++ref) { + if (candidate->ref_frame[ref] == rf[0]) { + const int is_gm_block = + is_global_mv_block(candidate, gm_params[rf[0]].wmtype); + const int_mv this_refmv = + is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref); + for (index = 0; index < *refmv_count; ++index) { + if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) { + ref_mv_weight[index] += weight; + break; + } + } + + // Add a new item to the list. 
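        // (If the search loop above found this MV already on the stack, its
        // weight was simply increased and index stops short of *refmv_count,
        // so the append below is skipped; otherwise the MV is pushed as a new
        // entry, space permitting.)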
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[index].this_mv = this_refmv; + ref_mv_weight[index] = weight; + ++(*refmv_count); + } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; + } + } + } else { + // compound reference frame + if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) { + int_mv this_refmv[2]; + + for (ref = 0; ref < 2; ++ref) { + if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype)) + this_refmv[ref] = gm_mv_candidates[ref]; + else + this_refmv[ref] = get_block_mv(candidate, ref); + } + + for (index = 0; index < *refmv_count; ++index) { + if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) && + (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { + ref_mv_weight[index] += weight; + break; + } + } + + // Add a new item to the list. + if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[index].this_mv = this_refmv[0]; + ref_mv_stack[index].comp_mv = this_refmv[1]; + ref_mv_weight[index] = weight; + ++(*refmv_count); + } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; + } + } +} + +static AOM_INLINE void scan_row_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col, + const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, + int *processed_rows) { + int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); + end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); + const int width_8x8 = mi_size_wide[BLOCK_8X8]; + const int width_16x16 = mi_size_wide[BLOCK_16X16]; + int col_offset = 0; + // TODO(jingning): Revisit this part after cb4x4 is stable. + if (abs(row_offset) > 1) { + col_offset = 1; + if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset; + } + const int use_step_16 = (xd->width >= 16); + MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; + + for (int i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; + const int candidate_bsize = candidate->bsize; + const int n4_w = mi_size_wide[candidate_bsize]; + int len = AOMMIN(xd->width, n4_w); + if (use_step_16) + len = AOMMAX(width_16x16, len); + else if (abs(row_offset) > 1) + len = AOMMAX(len, width_8x8); + + uint16_t weight = 2; + if (xd->width >= width_8x8 && xd->width <= n4_w) { + uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1, + mi_size_high[candidate_bsize]); + // Obtain range used in weight calculation. + weight = AOMMAX(weight, inc); + // Update processed rows. 
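      // (setup_ref_mv_list() uses *processed_rows to skip later, more distant
      // row scans that would only revisit rows already covered here.)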
+ *processed_rows = inc - row_offset - 1; + } + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); + + i += len; + } +} + +static AOM_INLINE void scan_col_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, + const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, + int *processed_cols) { + int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); + end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); + const int n8_h_8 = mi_size_high[BLOCK_8X8]; + const int n8_h_16 = mi_size_high[BLOCK_16X16]; + int i; + int row_offset = 0; + if (abs(col_offset) > 1) { + row_offset = 1; + if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset; + } + const int use_step_16 = (xd->height >= 16); + + for (i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = + xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; + const int candidate_bsize = candidate->bsize; + const int n4_h = mi_size_high[candidate_bsize]; + int len = AOMMIN(xd->height, n4_h); + if (use_step_16) + len = AOMMAX(n8_h_16, len); + else if (abs(col_offset) > 1) + len = AOMMAX(len, n8_h_8); + + int weight = 2; + if (xd->height >= n8_h_8 && xd->height <= n4_h) { + int inc = AOMMIN(-max_col_offset + col_offset + 1, + mi_size_wide[candidate_bsize]); + // Obtain range used in weight calculation. + weight = AOMMAX(weight, inc); + // Update processed cols. + *processed_cols = inc - col_offset - 1; + } + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); + + i += len; + } +} + +static AOM_INLINE void scan_blk_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row, + const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, + int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, + uint8_t *refmv_count) { + const TileInfo *const tile = &xd->tile; + POSITION mi_pos; + + mi_pos.row = row_offset; + mi_pos.col = col_offset; + + if (is_inside(tile, mi_col, mi_row, &mi_pos)) { + const MB_MODE_INFO *const candidate = + xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col]; + const int len = mi_size_wide[BLOCK_8X8]; + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, 2 * len); + } // Analyze a single 8x8 block motion information. 
+} + +static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int mi_row, int mi_col, int bs) { + const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; + const int mask_row = mi_row & (sb_mi_size - 1); + const int mask_col = mi_col & (sb_mi_size - 1); + + if (bs > mi_size_wide[BLOCK_64X64]) return 0; + + // In a split partition all apart from the bottom right has a top right + int has_tr = !((mask_row & bs) && (mask_col & bs)); + + // bs > 0 and bs is a power of 2 + assert(bs > 0 && !(bs & (bs - 1))); + + // For each 4x4 group of blocks, when the bottom right is decoded the blocks + // to the right have not been decoded therefore the bottom right does + // not have a top right + while (bs < sb_mi_size) { + if (mask_col & bs) { + if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) { + has_tr = 0; + break; + } + } else { + break; + } + bs <<= 1; + } + + // In a VERTICAL or VERTICAL_4 partition, all partition before the last one + // always have a top right (as the block above will have been decoded). + if (xd->width < xd->height) { + if (!xd->is_last_vertical_rect) has_tr = 1; + } + + // In a HORIZONTAL or HORIZONTAL_4 partition, partitions after the first one + // never have a top right (as the block to the right won't have been decoded). + if (xd->width > xd->height) { + if (!xd->is_first_horizontal_rect) has_tr = 0; + } + + // The bottom left square of a Vertical A (in the old format) does + // not have a top right as it is decoded before the right hand + // rectangle of the partition + if (xd->mi[0]->partition == PARTITION_VERT_A) { + if (xd->width == xd->height) + if (mask_row & bs) has_tr = 0; + } + + return has_tr; +} + +static int check_sb_border(const int mi_row, const int mi_col, + const int row_offset, const int col_offset) { + const int sb_mi_size = mi_size_wide[BLOCK_64X64]; + const int row = mi_row & (sb_mi_size - 1); + const int col = mi_col & (sb_mi_size - 1); + + if (row + row_offset < 0 || row + row_offset >= sb_mi_size || + col + col_offset < 0 || col + col_offset >= sb_mi_size) + return 0; + + return 1; +} + +static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame, + int blk_row, int blk_col, int_mv *gm_mv_candidates, + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], + int16_t *mode_context) { + POSITION mi_pos; + mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1; + mi_pos.col = (mi_col & 0x01) ? 
blk_col : blk_col + 1; + + if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0; + + const TPL_MV_REF *prev_frame_mvs = + cm->tpl_mvs + + ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) + + ((mi_col + mi_pos.col) >> 1); + if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0; + + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + + const uint16_t weight_unit = 1; // mi_size_wide[BLOCK_8X8]; + const int cur_frame_index = cm->cur_frame->order_hint; + const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); + const int frame0_index = buf_0->order_hint; + const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info, + cur_frame_index, frame0_index); + int idx; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + + int_mv this_refmv; + get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_0, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (rf[1] == NONE_FRAME) { + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); + } + } else { + // Process compound inter mode + const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); + const int frame1_index = buf_1->order_hint; + const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info, + cur_frame_index, frame1_index); + int_mv comp_refmv; + get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_1, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || + abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 || + abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) { + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && + comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) + break; + } + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); + } + } + + return 1; +} + +static AOM_INLINE void process_compound_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], + int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; + + for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { + if (can_rf 
== rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { + ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; + ++ref_id_count[cmp_idx]; + } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[can_rf] != + cm->ref_frame_sign_bias[rf[cmp_idx]]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; + ++ref_diff_count[cmp_idx]; + } + } + } +} + +static AOM_INLINE void process_single_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != + cm->ref_frame_sign_bias[ref_frame]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + int stack_idx; + for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) { + const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv; + if (this_mv.as_int == stack_mv.as_int) break; + } + + if (stack_idx == *refmv_count) { + ref_mv_stack[stack_idx].this_mv = this_mv; + + // TODO(jingning): Set an arbitrary small number here. The weight + // doesn't matter as long as it is properly initialized. + ref_mv_weight[stack_idx] = 2; + ++(*refmv_count); + } + } + } +} + +static AOM_INLINE void setup_ref_mv_list( + const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, + int mi_row, int mi_col, int16_t *mode_context) { + const int bs = AOMMAX(xd->width, xd->height); + const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); + MV_REFERENCE_FRAME rf[2]; + + const TileInfo *const tile = &xd->tile; + int max_row_offset = 0, max_col_offset = 0; + const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); + const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); + int processed_rows = 0; + int processed_cols = 0; + + av1_set_ref_frame(rf, ref_frame); + mode_context[ref_frame] = 0; + *refmv_count = 0; + + // Find valid maximum row/col offset. + if (xd->up_available) { + max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; + + if (xd->height < mi_size_high[BLOCK_8X8]) + max_row_offset = -(2 << 1) + row_adj; + + max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset); + } + + if (xd->left_available) { + max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; + + if (xd->width < mi_size_wide[BLOCK_8X8]) + max_col_offset = -(2 << 1) + col_adj; + + max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); + } + + uint8_t col_match_count = 0; + uint8_t row_match_count = 0; + uint8_t newmv_count = 0; + + // Scan the first above row mode info. row_offset = -1; + if (abs(max_row_offset) >= 1) + scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &newmv_count, gm_mv_candidates, + max_row_offset, &processed_rows); + // Scan the first left column mode info. 
col_offset = -1; + if (abs(max_col_offset) >= 1) + scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &newmv_count, gm_mv_candidates, + max_col_offset, &processed_cols); + // Check top-right boundary + if (has_tr) + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack, + ref_mv_weight, &row_match_count, &newmv_count, + gm_mv_candidates, refmv_count); + + const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); + const uint8_t nearest_refmv_count = *refmv_count; + + // TODO(yunqing): for comp_search, do it for all 3 cases. + for (int idx = 0; idx < nearest_refmv_count; ++idx) + ref_mv_weight[idx] += REF_CAT_LEVEL; + + if (cm->features.allow_ref_frame_mvs) { + int is_available = 0; + const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height); + const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width); + const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]); + const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]); + + const int tpl_sample_pos[3][2] = { + { voffset, -2 }, + { voffset, hoffset }, + { voffset - 2, hoffset }, + }; + const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) && + (xd->height < mi_size_high[BLOCK_64X64]) && + (xd->width >= mi_size_wide[BLOCK_8X8]) && + (xd->width < mi_size_wide[BLOCK_64X64]); + + const int step_h = (xd->height >= mi_size_high[BLOCK_64X64]) + ? mi_size_high[BLOCK_16X16] + : mi_size_high[BLOCK_8X8]; + const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64]) + ? mi_size_wide[BLOCK_16X16] + : mi_size_wide[BLOCK_8X8]; + + for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) { + for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { + int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, + blk_col, gm_mv_candidates, refmv_count, + ref_mv_stack, ref_mv_weight, mode_context); + if (blk_row == 0 && blk_col == 0) is_available = ret; + } + } + + if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + + for (int i = 0; i < 3 && allow_extension; ++i) { + const int blk_row = tpl_sample_pos[i][0]; + const int blk_col = tpl_sample_pos[i][1]; + + if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue; + add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, + gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight, + mode_context); + } + } + + uint8_t dummy_newmv_count = 0; + + // Scan the second outer area. 
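  // (This pass starts with the block diagonally above-left of the current
  // block, then walks additional rows above and columns to the left at
  // offsets -3, -5, ... up to MVREF_ROW_COLS steps, skipping offsets already
  // covered by the nearest-row/column scans or beyond the valid maximum
  // offsets found earlier.)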
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight, + &row_match_count, &dummy_newmv_count, gm_mv_candidates, + refmv_count); + + for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) { + const int row_offset = -(idx << 1) + 1 + row_adj; + const int col_offset = -(idx << 1) + 1 + col_adj; + + if (abs(row_offset) <= abs(max_row_offset) && + abs(row_offset) > processed_rows) + scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &dummy_newmv_count, + gm_mv_candidates, max_row_offset, &processed_rows); + + if (abs(col_offset) <= abs(max_col_offset) && + abs(col_offset) > processed_cols) + scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &dummy_newmv_count, + gm_mv_candidates, max_col_offset, &processed_cols); + } + + const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); + + switch (nearest_match) { + case 0: + if (ref_match_count >= 1) mode_context[ref_frame] |= 1; + if (ref_match_count == 1) + mode_context[ref_frame] |= (1 << REFMV_OFFSET); + else if (ref_match_count >= 2) + mode_context[ref_frame] |= (2 << REFMV_OFFSET); + break; + case 1: + mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3; + if (ref_match_count == 1) + mode_context[ref_frame] |= (3 << REFMV_OFFSET); + else if (ref_match_count >= 2) + mode_context[ref_frame] |= (4 << REFMV_OFFSET); + break; + case 2: + default: + if (newmv_count >= 1) + mode_context[ref_frame] |= 4; + else + mode_context[ref_frame] |= 5; + + mode_context[ref_frame] |= (5 << REFMV_OFFSET); + break; + } + + // Rank the likelihood and assign nearest and near mvs. + int len = nearest_refmv_count; + while (len > 0) { + int nr_len = 0; + for (int idx = 1; idx < len; ++idx) { + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; + nr_len = idx; + } + } + len = nr_len; + } + + len = *refmv_count; + while (len > nearest_refmv_count) { + int nr_len = nearest_refmv_count; + for (int idx = nearest_refmv_count + 1; idx < len; ++idx) { + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; + nr_len = idx; + } + } + len = nr_len; + } + + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width); + mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height); + mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row); + const int mi_size = AOMMIN(mi_width, mi_height); + if (rf[1] > NONE_FRAME) { + // TODO(jingning, yunqing): Refactor and consolidate the compound and + // single reference frame modes. Reduce unnecessary redundancy. 
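    // (If the scans above produced fewer than MAX_MV_REF_CANDIDATES compound
    // candidates, the block below derives extra ones from the immediate above
    // row and left column: exact reference-pair matches first (ref_id), then
    // sign-adjusted MVs from other inter references (ref_diff), padded with
    // the global-motion MVs if still short.)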
+ if (*refmv_count < MAX_MV_REF_CANDIDATES) { + int_mv ref_id[2][2], ref_diff[2][2]; + int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; + + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_wide[candidate->bsize]; + } + + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_high[candidate->bsize]; + } + + // Build up the compound mv predictor + int_mv comp_list[MAX_MV_REF_CANDIDATES][2]; + + for (int idx = 0; idx < 2; ++idx) { + int comp_idx = 0; + for (int list_idx = 0; + list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_id[idx][list_idx]; + for (int list_idx = 0; + list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_diff[idx][list_idx]; + for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx) + comp_list[comp_idx][idx] = gm_mv_candidates[idx]; + } + + if (*refmv_count) { + assert(*refmv_count == 1); + if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int && + comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) { + ref_mv_stack[*refmv_count].this_mv = comp_list[1][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1]; + } else { + ref_mv_stack[*refmv_count].this_mv = comp_list[0][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1]; + } + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; + } else { + for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { + ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1]; + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; + } + } + } + + assert(*refmv_count >= 2); + + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + } + } else { + // Handle single reference frame extension + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && + *refmv_count < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack, ref_mv_weight); + idx += mi_size_wide[candidate->bsize]; + } + + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && + *refmv_count < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack, ref_mv_weight); + idx += mi_size_high[candidate->bsize]; + } + + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + } + + if (mv_ref_list != NULL) { + for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx) + mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int; + + for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); + ++idx) { + mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int; + } + } + 
} +} + +void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, + MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], + int_mv *global_mvs, int16_t *mode_context) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int_mv gm_mv[2]; + + if (ref_frame == INTRA_FRAME) { + gm_mv[0].as_int = gm_mv[1].as_int = 0; + if (global_mvs != NULL) { + global_mvs[ref_frame].as_int = INVALID_MV; + } + } else { + const BLOCK_SIZE bsize = mi->bsize; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + if (ref_frame < REF_FRAMES) { + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1].as_int = 0; + if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0]; + } else { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + } + } + + setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame], + ref_mv_stack[ref_frame], ref_mv_weight[ref_frame], + mv_ref_list ? mv_ref_list[ref_frame] : NULL, gm_mv, mi_row, + mi_col, mode_context); +} + +void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, + int_mv *near_mv, int is_integer) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer); + } + *nearest_mv = mvlist[0]; + *near_mv = mvlist[1]; +} + +void av1_setup_frame_buf_refs(AV1_COMMON *cm) { + cm->cur_frame->order_hint = cm->current_frame.order_hint; + cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint; + cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level; + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint; + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] = + buf->display_order_hint; + } + } +} + +void av1_setup_frame_sign_bias(AV1_COMMON *cm) { + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) { + const int ref_order_hint = buf->order_hint; + cm->ref_frame_sign_bias[ref_frame] = + (get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint, + (int)cm->current_frame.order_hint) <= 0) + ? 0 + : 1; + } else { + cm->ref_frame_sign_bias[ref_frame] = 0; + } + } +} + +#define MAX_OFFSET_WIDTH 64 +#define MAX_OFFSET_HEIGHT 0 + +static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, + int blk_col, MV mv, int sign_bias) { + const int base_blk_row = (blk_row >> 3) << 3; + const int base_blk_col = (blk_col >> 3) << 3; + + const int row_offset = (mv.row >= 0) ? 
(mv.row >> (4 + MI_SIZE_LOG2)) + : -((-mv.row) >> (4 + MI_SIZE_LOG2)); + + const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2)) + : -((-mv.col) >> (4 + MI_SIZE_LOG2)); + + const int row = + (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset; + const int col = + (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset; + + if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 || + col >= (cm->mi_params.mi_cols >> 1)) + return 0; + + if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) || + row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) || + col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) || + col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3)) + return 0; + + *mi_r = row; + *mi_c = col; + + return 1; +} + +// Note: motion_filed_projection finds motion vectors of current frame's +// reference frame, and projects them to current frame. To make it clear, +// let's call current frame's reference frame as start frame. +// Call Start frame's reference frames as reference frames. +// Call ref_offset as frame distances between start frame and its reference +// frames. +static int motion_field_projection(AV1_COMMON *cm, + MV_REFERENCE_FRAME start_frame, int dir) { + TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; + int ref_offset[REF_FRAMES] = { 0 }; + + const RefCntBuffer *const start_frame_buf = + get_ref_frame_buf(cm, start_frame); + if (start_frame_buf == NULL) return 0; + + if (start_frame_buf->frame_type == KEY_FRAME || + start_frame_buf->frame_type == INTRA_ONLY_FRAME) + return 0; + + if (start_frame_buf->mi_rows != cm->mi_params.mi_rows || + start_frame_buf->mi_cols != cm->mi_params.mi_cols) + return 0; + + const int start_frame_order_hint = start_frame_buf->order_hint; + const unsigned int *const ref_order_hints = + &start_frame_buf->ref_order_hints[0]; + const int cur_order_hint = cm->cur_frame->order_hint; + int start_to_current_frame_offset = get_relative_dist( + &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint); + + for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { + ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info, + start_frame_order_hint, + ref_order_hints[rf - LAST_FRAME]); + } + + if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset; + + MV_REF *mv_ref_base = start_frame_buf->mvs; + const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1; + const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1; + + for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) { + for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) { + MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col]; + MV fwd_mv = mv_ref->mv.as_mv; + + if (mv_ref->ref_frame > INTRA_FRAME) { + int_mv this_mv; + int mi_r, mi_c; + const int ref_frame_offset = ref_offset[mv_ref->ref_frame]; + + int pos_valid = + abs(ref_frame_offset) <= MAX_FRAME_DISTANCE && + ref_frame_offset > 0 && + abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE; + + if (pos_valid) { + get_mv_projection(&this_mv.as_mv, fwd_mv, + start_to_current_frame_offset, ref_frame_offset); + pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col, + this_mv.as_mv, dir >> 1); + } + + if (pos_valid) { + const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c; + + tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row; + tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col; + tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset; + } + } + } + } + + return 1; +} + +// cm->ref_frame_side is calculated here, and will be 
used in +// av1_copy_frame_mvs() to affect how mvs are copied. +void av1_calculate_ref_frame_side(AV1_COMMON *cm) { + const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; + + memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side)); + if (!order_hint_info->enable_order_hint) return; + + const int cur_order_hint = cm->cur_frame->order_hint; + + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + int order_hint = 0; + + if (buf != NULL) order_hint = buf->order_hint; + + if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0) + cm->ref_frame_side[ref_frame] = 1; + else if (order_hint == cur_order_hint) + cm->ref_frame_side[ref_frame] = -1; + } +} + +void av1_setup_motion_field(AV1_COMMON *cm) { + const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; + + if (!order_hint_info->enable_order_hint) return; + + TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; + int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) * + (cm->mi_params.mi_stride >> 1); + for (int idx = 0; idx < size; ++idx) { + tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV; + tpl_mvs_base[idx].ref_frame_offset = 0; + } + + const int cur_order_hint = cm->cur_frame->order_hint; + const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME]; + int ref_order_hint[INTER_REFS_PER_FRAME]; + + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + const int ref_idx = ref_frame - LAST_FRAME; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + int order_hint = 0; + + if (buf != NULL) order_hint = buf->order_hint; + + ref_buf[ref_idx] = buf; + ref_order_hint[ref_idx] = order_hint; + } + + int ref_stamp = MFMV_STACK_SIZE - 1; + + if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) { + const int alt_of_lst_order_hint = + ref_buf[LAST_FRAME - LAST_FRAME] + ->ref_order_hints[ALTREF_FRAME - LAST_FRAME]; + + const int is_lst_overlay = + (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]); + if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2); + --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[BWDREF_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF2_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF_FRAME - LAST_FRAME], + cur_order_hint) > 0 && + ref_stamp >= 0) + if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp; + + if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2); +} + +static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts, + int *pts_inref, int row_offset, int sign_r, + int col_offset, int sign_c) { + const int bw = block_size_wide[mbmi->bsize]; + const int bh = block_size_high[mbmi->bsize]; + const int x = col_offset * MI_SIZE + sign_c * bw / 2 - 1; + const int y = row_offset * MI_SIZE + sign_r * bh / 2 - 1; + + pts[0] = GET_MV_SUBPEL(x); + pts[1] = GET_MV_SUBPEL(y); + pts_inref[0] = pts[0] + mbmi->mv[0].as_mv.col; + pts_inref[1] = pts[1] + mbmi->mv[0].as_mv.row; +} + +// Select samples according to the motion vector difference. 
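// (Only samples whose implied motion differs from the block's own MV by at
// most thresh = clamp(max(bw, bh), 16, 112) are kept, where the difference is
// the sum of absolute row and column deltas in 1/8-pel units; at least one
// sample is always retained. For example, a 32x32 block gives
// thresh = clamp(32, 16, 112) = 32, i.e. 4 full pixels.)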
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len, + BLOCK_SIZE bsize) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int thresh = clamp(AOMMAX(bw, bh), 16, 112); + uint8_t ret = 0; + assert(len <= LEAST_SQUARES_SAMPLES_MAX); + + // Only keep the samples with MV differences within threshold. + for (int i = 0; i < len; ++i) { + const int diff = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) + + abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row); + if (diff > thresh) continue; + if (ret != i) { + memcpy(pts + 2 * ret, pts + 2 * i, 2 * sizeof(pts[0])); + memcpy(pts_inref + 2 * ret, pts_inref + 2 * i, 2 * sizeof(pts_inref[0])); + } + ++ret; + } + // Keep at least 1 sample. + return AOMMAX(ret, 1); +} + +// Note: Samples returned are at 1/8-pel precision +// Sample are the neighbor block center point's coordinates relative to the +// left-top pixel of current block. +uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, + int *pts_inref) { + const MB_MODE_INFO *const mbmi0 = xd->mi[0]; + const int ref_frame = mbmi0->ref_frame[0]; + const int up_available = xd->up_available; + const int left_available = xd->left_available; + uint8_t np = 0; + int do_tl = 1; + int do_tr = 1; + const int mi_stride = xd->mi_stride; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // scan the nearest above rows + if (up_available) { + const int mi_row_offset = -1; + const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride]; + uint8_t superblock_width = mi_size_wide[mbmi->bsize]; + + if (xd->width <= superblock_width) { + // Handle "current block width <= above block width" case. + const int col_offset = -mi_col % superblock_width; + + if (col_offset < 0) do_tl = 0; + if (col_offset + superblock_width > xd->width) do_tr = 0; + + if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1); + pts += 2; + pts_inref += 2; + if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } else { + // Handle "current block width > above block width" case. + for (int i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); + i += superblock_width) { + mbmi = xd->mi[i + mi_row_offset * mi_stride]; + superblock_width = mi_size_wide[mbmi->bsize]; + + if (mbmi->ref_frame[0] == ref_frame && + mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, 0, -1, i, 1); + pts += 2; + pts_inref += 2; + if (++np >= LEAST_SQUARES_SAMPLES_MAX) + return LEAST_SQUARES_SAMPLES_MAX; + } + } + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + // scan the nearest left columns + if (left_available) { + const int mi_col_offset = -1; + const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset]; + uint8_t superblock_height = mi_size_high[mbmi->bsize]; + + if (xd->height <= superblock_height) { + // Handle "current block height <= above block height" case. + const int row_offset = -mi_row % superblock_height; + + if (row_offset < 0) do_tl = 0; + + if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1); + pts += 2; + pts_inref += 2; + np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } else { + // Handle "current block height > above block height" case. 
+ for (int i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); + i += superblock_height) { + mbmi = xd->mi[mi_col_offset + i * mi_stride]; + superblock_height = mi_size_high[mbmi->bsize]; + + if (mbmi->ref_frame[0] == ref_frame && + mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, i, 1, 0, -1); + pts += 2; + pts_inref += 2; + if (++np >= LEAST_SQUARES_SAMPLES_MAX) + return LEAST_SQUARES_SAMPLES_MAX; + } + } + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + // Top-left block + if (do_tl && left_available && up_available) { + const int mi_row_offset = -1; + const int mi_col_offset = -1; + MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride]; + + if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1); + pts += 2; + pts_inref += 2; + if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + // Top-right block + if (do_tr && + has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) { + const POSITION trb_pos = { -1, xd->width }; + const TileInfo *const tile = &xd->tile; + if (is_inside(tile, mi_col, mi_row, &trb_pos)) { + const int mi_row_offset = -1; + const int mi_col_offset = xd->width; + const MB_MODE_INFO *mbmi = + xd->mi[mi_col_offset + mi_row_offset * mi_stride]; + + if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1); + if (++np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + return np; +} + +void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { + const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; + SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + + skip_mode_info->skip_mode_allowed = 0; + skip_mode_info->ref_frame_idx_0 = INVALID_IDX; + skip_mode_info->ref_frame_idx_1 = INVALID_IDX; + + if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) || + cm->current_frame.reference_mode == SINGLE_REFERENCE) + return; + + const int cur_order_hint = cm->current_frame.order_hint; + int ref_order_hints[2] = { -1, INT_MAX }; + int ref_idx[2] = { INVALID_IDX, INVALID_IDX }; + + // Identify the nearest forward and backward references. 
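  // A hedged worked example (order hints invented): if the current frame has
  // order hint 10 and references sit at {6, 8, 12, 14}, the nearest forward
  // reference is 8 and the nearest backward reference is 12, so that pair is
  // what skip mode signals. If only forward references exist (say {6, 8}),
  // the second pass below instead pairs the two closest forward references,
  // 8 and 6.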
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; + + const int ref_order_hint = buf->order_hint; + if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) < + 0) { + // Forward reference + if (ref_order_hints[0] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) > 0) { + ref_order_hints[0] = ref_order_hint; + ref_idx[0] = i; + } + } else if (get_relative_dist(order_hint_info, ref_order_hint, + cur_order_hint) > 0) { + // Backward reference + if (ref_order_hints[1] == INT_MAX || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) < 0) { + ref_order_hints[1] = ref_order_hint; + ref_idx[1] = i; + } + } + } + + if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) { + // == Bi-directional prediction == + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) { + // == Forward prediction only == + // Identify the second nearest forward reference. + ref_order_hints[1] = -1; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; + + const int ref_order_hint = buf->order_hint; + if ((ref_order_hints[0] != -1 && + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) < 0) && + (ref_order_hints[1] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) > 0)) { + // Second closest forward reference + ref_order_hints[1] = ref_order_hint; + ref_idx[1] = i; + } + } + if (ref_order_hints[1] != -1) { + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + } + } +} + +typedef struct { + int map_idx; // frame map index + RefCntBuffer *buf; // frame buffer + int sort_idx; // index based on the offset to be used for sorting +} REF_FRAME_INFO; + +// Compares the sort_idx fields. If they are equal, then compares the map_idx +// fields to break the tie. This ensures a stable sort. 
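Because qsort() makes no stability guarantee, the map_idx fallback is what makes the final ordering deterministic. A small example with invented values:

  // Illustrative only: two entries that tie on sort_idx.
  REF_FRAME_INFO a = { /*map_idx=*/2, /*buf=*/NULL, /*sort_idx=*/7 };
  REF_FRAME_INFO b = { /*map_idx=*/5, /*buf=*/NULL, /*sort_idx=*/7 };
  // sort_idx_diff is 0, so the comparator below returns 2 - 5 = -3 and `a`
  // always sorts ahead of `b`, regardless of the qsort() implementation.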
+static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { + const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a; + const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b; + + const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx; + if (sort_idx_diff != 0) return sort_idx_diff; + return info_a->map_idx - info_b->map_idx; +} + +static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx, + REF_FRAME_INFO *ref_info) { + assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); + + remapped_ref_idx[frame_idx] = ref_info->map_idx; +} + +void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, + int lst_map_idx, int gld_map_idx) { + int lst_frame_sort_idx = -1; + int gld_frame_sort_idx = -1; + + assert(cm->seq_params->order_hint_info.enable_order_hint); + assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0); + const int cur_order_hint = (int)cm->current_frame.order_hint; + const int cur_frame_sort_idx = + 1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1; + + REF_FRAME_INFO ref_frame_info[REF_FRAMES]; + int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 }; + + for (int i = 0; i < REF_FRAMES; ++i) { + const int map_idx = i; + + ref_frame_info[i].map_idx = map_idx; + ref_frame_info[i].sort_idx = -1; + + RefCntBuffer *const buf = cm->ref_frame_map[map_idx]; + ref_frame_info[i].buf = buf; + + if (buf == NULL) continue; + // If this assertion fails, there is a reference leak. + assert(buf->ref_count > 0); + + const int offset = (int)buf->order_hint; + ref_frame_info[i].sort_idx = + (offset == -1) ? -1 + : cur_frame_sort_idx + + get_relative_dist(&cm->seq_params->order_hint_info, + offset, cur_order_hint); + assert(ref_frame_info[i].sort_idx >= -1); + + if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx; + if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx; + } + + // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference + // frames. + if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) { + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests a look-ahead frame as LAST"); + } + if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) { + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests a look-ahead frame as GOLDEN"); + } + + // Sort ref frames based on their frame_offset values. + qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO), + compare_ref_frame_info); + + // Identify forward and backward reference frames. 
+ // Forward reference: offset < order_hint + // Backward reference: offset >= order_hint + int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1; + + for (int i = 0; i < REF_FRAMES; i++) { + if (ref_frame_info[i].sort_idx == -1) { + fwd_start_idx++; + continue; + } + + if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) { + fwd_end_idx = i - 1; + break; + } + } + + int bwd_start_idx = fwd_end_idx + 1; + int bwd_end_idx = REF_FRAMES - 1; + + // === Backward Reference Frames === + + // == ALTREF_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME, + &ref_frame_info[bwd_end_idx]); + ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1; + bwd_end_idx--; + } + + // == BWDREF_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME, + &ref_frame_info[bwd_start_idx]); + ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1; + bwd_start_idx++; + } + + // == ALTREF2_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME, + &ref_frame_info[bwd_start_idx]); + ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1; + } + + // === Forward Reference Frames === + + for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) { + // == LAST_FRAME == + if (ref_frame_info[i].map_idx == lst_map_idx) { + set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME, + &ref_frame_info[i]); + ref_flag_list[LAST_FRAME - LAST_FRAME] = 1; + } + + // == GOLDEN_FRAME == + if (ref_frame_info[i].map_idx == gld_map_idx) { + set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME, + &ref_frame_info[i]); + ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1; + } + } + + assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 && + ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1); + + // == LAST2_FRAME == + // == LAST3_FRAME == + // == BWDREF_FRAME == + // == ALTREF2_FRAME == + // == ALTREF_FRAME == + + // Set up the reference frames in the anti-chronological order. + static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = { + LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME + }; + + int ref_idx; + for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { + const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; + + if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; + + while (fwd_start_idx <= fwd_end_idx && + (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx || + ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) { + fwd_end_idx--; + } + if (fwd_start_idx > fwd_end_idx) break; + + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, + &ref_frame_info[fwd_end_idx]); + ref_flag_list[ref_frame - LAST_FRAME] = 1; + + fwd_end_idx--; + } + + // Assign all the remaining frame(s), if any, to the earliest reference + // frame. + for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { + const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; + if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, + &ref_frame_info[fwd_start_idx]); + ref_flag_list[ref_frame - LAST_FRAME] = 1; + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + assert(ref_flag_list[i] == 1); + } +} diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h new file mode 100644 index 0000000000..3ab784c1ed --- /dev/null +++ b/third_party/aom/av1/common/mvref_common.h @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_ +#define AOM_AV1_COMMON_MVREF_COMMON_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MVREF_ROW_COLS 3 + +// Set the upper limit of the motion vector component magnitude. +// This would make a motion vector fit in 26 bits. Plus 3 bits for the +// reference frame index. A tuple of motion vector can hence be stored within +// 32 bit range for efficient load/store operations. +#define REFMVS_LIMIT ((1 << 12) - 1) + +typedef struct position { + int row; + int col; +} POSITION; + +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units + +static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) { + if (!oh->enable_order_hint) return 0; + + const int bits = oh->order_hint_bits_minus_1 + 1; + + assert(bits >= 1); + assert(a >= 0 && a < (1 << bits)); + assert(b >= 0 && b < (1 << bits)); + + int diff = a - b; + const int m = 1 << (bits - 1); + diff = (diff & (m - 1)) - (diff & m); + return diff; +} + +static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER, + xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER, + xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER, + xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER + }; + clamp_mv(mv, &mv_limits); +} + +static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) { + return candidate->mv[which_mv]; +} + +// Checks that the given mi_row, mi_col and search point +// are inside the borders of the tile. +static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < tile->mi_row_start || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= tile->mi_row_end || + mi_col + mi_pos->col >= tile->mi_col_end); +} + +static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row, + int row_offset) { + return clamp(row_offset, tile->mi_row_start - mi_row, + tile->mi_row_end - mi_row - 1); +} + +static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col, + int col_offset) { + return clamp(col_offset, tile->mi_col_start - mi_col, + tile->mi_col_end - mi_col - 1); +} + +static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) { + if (is_integer) { + integer_mv_precision(mv); + } else { + if (!allow_hp) { + if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) mv->col += (mv->col > 0 ? 
-1 : 1); + } + } +} + +static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) { + // Single ref pred + if (rf[1] <= INTRA_FRAME) return -1; + + // Bi-directional comp ref pred + if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1; + + for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) { + if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx)) + return ref_idx; + } + return -1; +} + +static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) { + if (rf[1] > INTRA_FRAME) { + const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf); + if (uni_comp_ref_idx >= 0) { + assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) < + MODE_CTX_REF_FRAMES); + return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx; + } else { + return REF_FRAMES + FWD_RF_OFFSET(rf[0]) + + BWD_RF_OFFSET(rf[1]) * FWD_REFS; + } + } + + return rf[0]; +} + +// clang-format off +static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = { + { LAST_FRAME, BWDREF_FRAME }, { LAST2_FRAME, BWDREF_FRAME }, + { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME }, + + { LAST_FRAME, ALTREF2_FRAME }, { LAST2_FRAME, ALTREF2_FRAME }, + { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME }, + + { LAST_FRAME, ALTREF_FRAME }, { LAST2_FRAME, ALTREF_FRAME }, + { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, + + { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME }, + { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, + + // NOTE: Following reference frame pairs are not supported to be explicitly + // signalled, but they are possibly chosen by the use of skip_mode, + // which may use the most recent one-sided reference frame pair. + { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME }, + { LAST3_FRAME, GOLDEN_FRAME }, {BWDREF_FRAME, ALTREF2_FRAME}, + { ALTREF2_FRAME, ALTREF_FRAME } +}; +// clang-format on + +static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf, + MV_REFERENCE_FRAME ref_frame_type) { + if (ref_frame_type >= REF_FRAMES) { + rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0]; + rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1]; + } else { + assert(ref_frame_type > NONE_FRAME); + rf[0] = ref_frame_type; + rf[1] = NONE_FRAME; + } +} + +static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = { + { 0, 1, 1, 1, 1 }, + { 1, 2, 3, 4, 4 }, + { 4, 4, 5, 6, 7 }, +}; + +static INLINE int16_t av1_mode_context_analyzer( + const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) { + const int8_t ref_frame = av1_ref_frame_type(rf); + + if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame]; + + const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK; + const int16_t refmv_ctx = + (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK; + + const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( + newmv_ctx, COMP_NEWMV_CTXS - 1)]; + return comp_ctx; +} + +static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) { + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL) + return 0; + + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) + return 1; + + if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) + return 2; + + return 0; +} + +void av1_setup_frame_buf_refs(AV1_COMMON *cm); +void av1_setup_frame_sign_bias(AV1_COMMON *cm); +void av1_setup_skip_mode_allowed(AV1_COMMON *cm); +void 
av1_calculate_ref_frame_side(AV1_COMMON *cm);
+void av1_setup_motion_field(AV1_COMMON *cm);
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+                        int lst_map_idx, int gld_map_idx);
+
+static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
+  av1_zero(xd->neighbors_ref_counts);
+
+  uint8_t *const ref_counts = xd->neighbors_ref_counts;
+
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Above neighbor
+  if (above_in_image && is_inter_block(above_mbmi)) {
+    ref_counts[above_mbmi->ref_frame[0]]++;
+    if (has_second_ref(above_mbmi)) {
+      ref_counts[above_mbmi->ref_frame[1]]++;
+    }
+  }
+
+  // Left neighbor
+  if (left_in_image && is_inter_block(left_mbmi)) {
+    ref_counts[left_mbmi->ref_frame[0]]++;
+    if (has_second_ref(left_mbmi)) {
+      ref_counts[left_mbmi->ref_frame[1]]++;
+    }
+  }
+}
+
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+                        const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+                        int x_mis, int y_mis);
+
+// The global_mvs output parameter points to an array of REF_FRAMES elements.
+// The caller may pass a null global_mvs if it does not need the global_mvs
+// output.
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                      MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+                      CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+                      uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
+                      int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+                      int_mv *global_mvs, int16_t *mode_context);
+
+// Check a list of motion vectors by SAD score, using a number of rows of
+// pixels above and a number of columns of pixels to the left of the block,
+// to select the one with the best score to use as the reference motion
+// vector.
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+                           int_mv *near_mv, int is_integer);
+
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+                          BLOCK_SIZE bsize);
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+                        int *pts_inref);
+
+#define INTRABC_DELAY_PIXELS 256  // Delay of 256 pixels
+#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
+
+static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
+                                   int mib_size, int mi_row) {
+  if (mi_row - mib_size < tile->mi_row_start) {
+    ref_dv->as_fullmv.row = 0;
+    ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
+  } else {
+    ref_dv->as_fullmv.row = -MI_SIZE * mib_size;
+    ref_dv->as_fullmv.col = 0;
+  }
+  convert_fullmv_to_mv(ref_dv);
+}
+
+static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
+                                  const MACROBLOCKD *xd, int mi_row, int mi_col,
+                                  BLOCK_SIZE bsize, int mib_size_log2) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int SCALE_PX_TO_MV = 8;
+  // Disallow subpixel for now
+  // SUBPEL_MASK is not the correct scale
+  if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
+    return 0;
+
+  const TileInfo *const tile = &xd->tile;
+  // Is the source top-left inside the current tile? 
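  // (All edge positions in this function are compared in 1/8-pel units:
  // mi_row/mi_col are scaled by MI_SIZE (4 pixels per mi unit) and then by
  // SCALE_PX_TO_MV (8 subpel units per pixel); the ">> 3" further down
  // converts back to whole pixels for the superblock-delay checks.)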
+ const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row; + const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV; + if (src_top_edge < tile_top_edge) return 0; + const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col; + const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV; + if (src_left_edge < tile_left_edge) return 0; + // Is the bottom right inside the current tile? + const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row; + const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV; + if (src_bottom_edge > tile_bottom_edge) return 0; + const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col; + const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV; + if (src_right_edge > tile_right_edge) return 0; + + // Special case for sub 8x8 chroma cases, to prevent referring to chroma + // pixels outside current tile. + if (xd->is_chroma_ref && av1_num_planes(cm) > 1) { + const struct macroblockd_plane *const pd = &xd->plane[1]; + if (bw < 8 && pd->subsampling_x) + if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0; + if (bh < 8 && pd->subsampling_y) + if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0; + } + + // Is the bottom right within an already coded SB? Also consider additional + // constraints to facilitate HW decoder. + const int max_mib_size = 1 << mib_size_log2; + const int active_sb_row = mi_row >> mib_size_log2; + const int active_sb64_col = (mi_col * MI_SIZE) >> 6; + const int sb_size = max_mib_size * MI_SIZE; + const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size; + const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6; + const int total_sb64_per_row = + ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1; + const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col; + const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col; + if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0; + + // Wavefront constraint: use only top left area of frame for reference. + const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64); + const int wf_offset = gradient * (active_sb_row - src_sb_row); + if (src_sb_row > active_sb_row || + src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset) + return 0; + + return 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_MVREF_COMMON_H_ diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h new file mode 100644 index 0000000000..b84034541e --- /dev/null +++ b/third_party/aom/av1/common/obmc.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_OBMC_H_ +#define AOM_AV1_COMMON_OBMC_H_ + +typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, + void *fun_ctxt, const int num_planes); + +static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, + MACROBLOCKD *xd, int nb_max, + overlappable_nb_visitor_t fun, + void *fun_ctxt) { + if (!xd->up_available) return; + + const int num_planes = av1_num_planes(cm); + int nb_count = 0; + const int mi_col = xd->mi_col; + // prev_row_mi points into the mi array, starting at the beginning of the + // previous row. + MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; + const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols); + uint8_t mi_step; + for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max; + above_mi_col += mi_step) { + MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; + mi_step = + AOMMIN(mi_size_wide[above_mi[0]->bsize], mi_size_wide[BLOCK_64X64]); + // If we're considering a block with width 4, it should be treated as + // half of a pair of blocks with chroma information in the second. Move + // above_mi_col back to the start of the pair if needed, set above_mbmi + // to point at the block with chroma information, and set mi_step to 2 to + // step over the entire pair at the end of the iteration. + if (mi_step == 1) { + above_mi_col &= ~1; + above_mi = prev_row_mi + above_mi_col + 1; + mi_step = 2; + } + if (is_neighbor_overlappable(*above_mi)) { + ++nb_count; + fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0, + *above_mi, fun_ctxt, num_planes); + } + } +} + +static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, + MACROBLOCKD *xd, int nb_max, + overlappable_nb_visitor_t fun, + void *fun_ctxt) { + if (!xd->left_available) return; + + const int num_planes = av1_num_planes(cm); + int nb_count = 0; + // prev_col_mi points into the mi array, starting at the top of the + // previous column + const int mi_row = xd->mi_row; + MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; + const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows); + uint8_t mi_step; + for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max; + left_mi_row += mi_step) { + MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; + mi_step = + AOMMIN(mi_size_high[left_mi[0]->bsize], mi_size_high[BLOCK_64X64]); + if (mi_step == 1) { + left_mi_row &= ~1; + left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride; + mi_step = 2; + } + if (is_neighbor_overlappable(*left_mi)) { + ++nb_count; + fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi, + fun_ctxt, num_planes); + } + } +} + +#endif // AOM_AV1_COMMON_OBMC_H_ diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c new file mode 100644 index 0000000000..cfca03bb4d --- /dev/null +++ b/third_party/aom/av1/common/obu_util.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include + +#include "av1/common/obu_util.h" + +#include "aom_dsp/bitreader_buffer.h" + +static aom_codec_err_t read_obu_size(const uint8_t *data, + size_t bytes_available, + size_t *const obu_size, + size_t *const length_field_size) { + uint64_t u_obu_size = 0; + if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) != + 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + + if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME; + *obu_size = (size_t)u_obu_size; + return AOM_CODEC_OK; +} + +// Parses OBU header and stores values in 'header'. +static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb, + int is_annexb, ObuHeader *header) { + if (!rb || !header) return AOM_CODEC_INVALID_PARAM; + + const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer; + if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME; + + header->size = 1; + + if (aom_rb_read_bit(rb) != 0) { + // Forbidden bit. Must not be set. + return AOM_CODEC_CORRUPT_FRAME; + } + + header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4); + header->has_extension = aom_rb_read_bit(rb); + header->has_size_field = aom_rb_read_bit(rb); + + if (!header->has_size_field && !is_annexb) { + // section 5 obu streams must have obu_size field set. + return AOM_CODEC_UNSUP_BITSTREAM; + } + + // obu_reserved_1bit must be set to 0. The value is ignored by a decoder. + aom_rb_read_bit(rb); + + if (header->has_extension) { + if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME; + + header->size += 1; + header->temporal_layer_id = aom_rb_read_literal(rb, 3); + header->spatial_layer_id = aom_rb_read_literal(rb, 2); + // extension_header_reserved_3bits must be set to 0. The value is ignored by + // a decoder. + aom_rb_read_literal(rb, 3); + } else { + header->temporal_layer_id = 0; + header->spatial_layer_id = 0; + } + + return AOM_CODEC_OK; +} + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb) { + if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM; + + // TODO(tomfinegan): Set the error handler here and throughout this file, and + // confirm parsing work done via aom_read_bit_buffer is successful. 
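  // For reference, the header parsed by read_obu_header() below is laid out
  // MSB first as:
  //   obu_forbidden_bit (1) | obu_type (4) | obu_extension_flag (1) |
  //   obu_has_size_field (1) | obu_reserved_1bit (1)
  // optionally followed by one extension byte:
  //   temporal_id (3) | spatial_id (2) | extension_header_reserved_3bits (3)
  // For example, a temporal delimiter OBU typically starts with 0x12:
  // type 2, no extension, has_size_field set.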
+ struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL, + NULL }; + aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header); + if (parse_result == AOM_CODEC_OK) *consumed = header->size; + return parse_result; +} + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read) { + size_t length_field_size_obu = 0; + size_t length_field_size_payload = 0; + size_t obu_size = 0; + aom_codec_err_t status; + + if (is_annexb) { + // Size field comes before the OBU header, and includes the OBU header + status = + read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu); + + if (status != AOM_CODEC_OK) return status; + } + + struct aom_read_bit_buffer rb = { data + length_field_size_obu, + data + bytes_available, 0, NULL, NULL }; + + status = read_obu_header(&rb, is_annexb, obu_header); + if (status != AOM_CODEC_OK) return status; + + if (!obu_header->has_size_field) { + assert(is_annexb); + // Derive the payload size from the data we've already read + if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; + + *payload_size = obu_size - obu_header->size; + } else { + // Size field comes after the OBU header, and is just the payload size + status = read_obu_size( + data + length_field_size_obu + obu_header->size, + bytes_available - length_field_size_obu - obu_header->size, + payload_size, &length_field_size_payload); + if (status != AOM_CODEC_OK) return status; + } + + *bytes_read = + length_field_size_obu + obu_header->size + length_field_size_payload; + return AOM_CODEC_OK; +} diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h new file mode 100644 index 0000000000..adf3568e15 --- /dev/null +++ b/third_party/aom/av1/common/obu_util.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_OBU_UTIL_H_ +#define AOM_AV1_COMMON_OBU_UTIL_H_ + +#include "aom/aom_codec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + size_t size; // Size (1 or 2 bytes) of the OBU header (including the + // optional OBU extension header) in the bitstream. + OBU_TYPE type; + int has_size_field; + int has_extension; // Whether the optional OBU extension header is present. + // The following fields come from the OBU extension header. They are set to 0 + // if has_extension is false. 
+ int temporal_layer_id; + int spatial_layer_id; +} ObuHeader; + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb); + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_OBU_UTIL_H_ diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c new file mode 100644 index 0000000000..6f88768f2f --- /dev/null +++ b/third_party/aom/av1/common/ppc/cfl_ppc.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#define OFF_0 0 +#define OFF_1 16 +#define OFF_2 32 +#define OFF_3 48 +#define CFL_BUF_LINE_BYTES 64 +#define CFL_LINE_1 64 +#define CFL_LINE_2 128 +#define CFL_LINE_3 192 + +typedef vector signed char int8x16_t; // NOLINT(runtime/int) +typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int) +typedef vector signed short int16x8_t; // NOLINT(runtime/int) +typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int) +typedef vector signed int int32x4_t; // NOLINT(runtime/int) +typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int) +typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int) + +static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst, + int width, int height, int round_offset, + int num_pel_log2) { + // int16_t *dst = dst_ptr; + const int16_t *dst_end = dst + height * CFL_BUF_LINE; + const int16_t *sum_buf = (const int16_t *)src_ptr; + const int16_t *end = sum_buf + height * CFL_BUF_LINE; + const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2); + const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; + const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B }; + + int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset }; + int32x4_t sum_32x4_1 = { 0, 0, 0, 0 }; + do { + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0); + sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1); + if (width >= 16) { + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0); + sum_32x4_1 = + vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1); + } + if (width == 32) { + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0); + sum_32x4_1 = + vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1); + sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0); + sum_32x4_1 = + vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1); + } + } while ((sum_buf += (CFL_BUF_LINE * 2)) < end); + int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1); + + const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64); + sum_32x4 = 
vec_add(sum_32x4, perm_64); + const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32); + sum_32x4 = vec_add(sum_32x4, perm_32); + const int32x4_t avg = vec_sr(sum_32x4, div_shift); + const int16x8_t vec_avg = vec_pack(avg, avg); + do { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg), + OFF_0 + CFL_BUF_LINE_BYTES, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg), + OFF_0 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg), + OFF_0 + CFL_LINE_3, dst); + if (width >= 16) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg), + OFF_1 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg), + OFF_1 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg), + OFF_1 + CFL_LINE_3, dst); + } + if (width == 32) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg), + OFF_2 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg), + OFF_2 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg), + OFF_2 + CFL_LINE_3, dst); + + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg), + OFF_3 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg), + OFF_3 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg), + OFF_3 + CFL_LINE_3, dst); + } + } while ((dst += CFL_BUF_LINE * 4) < dst_end); +} + +// Declare wrappers for VSX sizes +CFL_SUB_AVG_X(vsx, 8, 4, 16, 5) +CFL_SUB_AVG_X(vsx, 8, 8, 32, 6) +CFL_SUB_AVG_X(vsx, 8, 16, 64, 7) +CFL_SUB_AVG_X(vsx, 8, 32, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 4, 32, 6) +CFL_SUB_AVG_X(vsx, 16, 8, 64, 7) +CFL_SUB_AVG_X(vsx, 16, 16, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 32, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 8, 128, 8) +CFL_SUB_AVG_X(vsx, 32, 16, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 32, 512, 10) + +// Based on observation, for small blocks VSX does not outperform C (no 64bit +// load and store intrinsics). So we call the C code for block widths 4. +cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { + cfl_subtract_average_4x4_c, /* 4x4 */ + cfl_subtract_average_8x8_vsx, /* 8x8 */ + cfl_subtract_average_16x16_vsx, /* 16x16 */ + cfl_subtract_average_32x32_vsx, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_c, /* 4x8 */ + cfl_subtract_average_8x4_vsx, /* 8x4 */ + cfl_subtract_average_8x16_vsx, /* 8x16 */ + cfl_subtract_average_16x8_vsx, /* 16x8 */ + cfl_subtract_average_16x32_vsx, /* 16x32 */ + cfl_subtract_average_32x16_vsx, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_c, /* 4x16 */ + cfl_subtract_average_16x4_vsx, /* 16x4 */ + cfl_subtract_average_8x32_vsx, /* 8x32 */ + cfl_subtract_average_32x8_vsx, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to + // index the function pointer array out of bounds. 
+ return sub_avg[tx_size % TX_SIZES_ALL]; +} diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c new file mode 100644 index 0000000000..5952441d1f --- /dev/null +++ b/third_party/aom/av1/common/pred_common.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" + +// Returns a context number for the given MB prediction signal +static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi, + const MACROBLOCKD *xd, int dir, + MV_REFERENCE_FRAME ref_frame) { + (void)xd; + + return ((ref_mbmi->ref_frame[0] == ref_frame || + ref_mbmi->ref_frame[1] == ref_frame) + ? av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01) + : SWITCHABLE_FILTERS); +} + +int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx_offset = + (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET; + assert(dir == 0 || dir == 1); + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET; + int left_type = SWITCHABLE_FILTERS; + int above_type = SWITCHABLE_FILTERS; + + if (xd->left_available) + left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame); + + if (xd->up_available) + above_type = + get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame); + + if (left_type == above_type) { + filter_type_ctx += left_type; + } else if (left_type == SWITCHABLE_FILTERS) { + assert(above_type != SWITCHABLE_FILTERS); + filter_type_ctx += above_type; + } else if (above_type == SWITCHABLE_FILTERS) { + assert(left_type != SWITCHABLE_FILTERS); + filter_type_ctx += left_type; + } else { + filter_type_ctx += SWITCHABLE_FILTERS; + } + + return filter_type_ctx; +} + +static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) { + // Do not add an already existing value + if (*n > 0 && val == cache[*n - 1]) return; + + cache[(*n)++] = val; +} + +int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, + uint16_t *cache) { + const int row = -xd->mb_to_top_edge >> 3; + // Do not refer to above SB row when on SB boundary. + const MB_MODE_INFO *const above_mi = + (row % (1 << MIN_SB_SIZE_LOG2)) ? 
xd->above_mbmi : NULL; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int above_n = 0, left_n = 0; + if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0]; + if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0]; + if (above_n == 0 && left_n == 0) return 0; + int above_idx = plane * PALETTE_MAX_SIZE; + int left_idx = plane * PALETTE_MAX_SIZE; + int n = 0; + const uint16_t *above_colors = + above_mi ? above_mi->palette_mode_info.palette_colors : NULL; + const uint16_t *left_colors = + left_mi ? left_mi->palette_mode_info.palette_colors : NULL; + // Merge the sorted lists of base colors from above and left to get + // combined sorted color cache. + while (above_n > 0 && left_n > 0) { + uint16_t v_above = above_colors[above_idx]; + uint16_t v_left = left_colors[left_idx]; + if (v_left < v_above) { + palette_add_to_cache(cache, &n, v_left); + ++left_idx, --left_n; + } else { + palette_add_to_cache(cache, &n, v_above); + ++above_idx, --above_n; + if (v_left == v_above) ++left_idx, --left_n; + } + } + while (above_n-- > 0) { + uint16_t val = above_colors[above_idx++]; + palette_add_to_cache(cache, &n, val); + } + while (left_n-- > 0) { + uint16_t val = left_colors[left_idx++]; + palette_add_to_cache(cache, &n, val); + } + assert(n <= 2 * PALETTE_MAX_SIZE); + return n; +} + +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real macroblocks. +// The prediction flags in these dummy entries are initialized to 0. +// 0 - inter/inter, inter/--, --/inter, --/-- +// 1 - intra/inter, inter/intra +// 2 - intra/--, --/intra +// 3 - intra/intra +int av1_get_intra_inter_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + return left_intra && above_intra ? 3 : left_intra || above_intra; + } else if (has_above || has_left) { // one edge available + return 2 * !is_inter_block(has_above ? above_mbmi : left_mbmi); + } else { + return 0; + } +} + +#define CHECK_BACKWARD_REFS(ref_frame) \ + (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME)) +#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame) + +int av1_get_reference_mode_context(const MACROBLOCKD *xd) { + int ctx; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. 
+ if (has_above && has_left) { // both edges available + if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) + // neither edge uses comp pred (0/1) + ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^ + IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]); + else if (!has_second_ref(above_mbmi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) || + !is_inter_block(above_mbmi)); + else if (!has_second_ref(left_mbmi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) || + !is_inter_block(left_mbmi)); + else // both edges use comp pred (4) + ctx = 4; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; + + if (!has_second_ref(edge_mbmi)) + // edge does not use comp pred (0/1) + ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]); + else + // edge uses comp pred (3) + ctx = 3; + } else { // no edges available (1) + ctx = 1; + } + assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); + return ctx; +} + +int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) { + int pred_context; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int above_in_image = xd->up_available; + const int left_in_image = xd->left_available; + + if (above_in_image && left_in_image) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + + if (above_intra && left_intra) { // intra/intra + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter + const MB_MODE_INFO *inter_mbmi = above_intra ? left_mbmi : above_mbmi; + + if (!has_second_ref(inter_mbmi)) // single pred + pred_context = 2; + else // comp pred + pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi); + } else { // inter/inter + const int a_sg = !has_second_ref(above_mbmi); + const int l_sg = !has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0]; + + if (a_sg && l_sg) { // single/single + pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^ + IS_BACKWARD_REF_FRAME(frfl))); + } else if (l_sg || a_sg) { // single/comp + const int uni_rfc = + a_sg ? has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi); + + if (!uni_rfc) // comp bidir + pred_context = 1; + else // comp unidir + pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^ + IS_BACKWARD_REF_FRAME(frfl))); + } else { // comp/comp + const int a_uni_rfc = has_uni_comp_refs(above_mbmi); + const int l_uni_rfc = has_uni_comp_refs(left_mbmi); + + if (!a_uni_rfc && !l_uni_rfc) // bidir/bidir + pred_context = 0; + else if (!a_uni_rfc || !l_uni_rfc) // unidir/bidir + pred_context = 2; + else // unidir/unidir + pred_context = + 3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME))); + } + } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; + + if (!is_inter_block(edge_mbmi)) { // intra + pred_context = 2; + } else { // inter + if (!has_second_ref(edge_mbmi)) // single pred + pred_context = 2; + else // comp pred + pred_context = 4 * has_uni_comp_refs(edge_mbmi); + } + } else { // no edges available + pred_context = 2; + } + + assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as either +// (BWDREF, ALTREF), or (LAST, LAST2) / (LAST, LAST3) / (LAST, GOLDEN), +// conditioning on the pair is known as uni-directional. +// +// 3 contexts: Voting is used to compare the count of forward references with +// that of backward references from the spatial neighbors. +int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of forward references (L, L2, L3, or G) + const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + // Count of backward references (B or A) + const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; + + const int pred_context = + (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as +// either (LAST, LAST2), or (LAST, LAST3) / (LAST, GOLDEN), +// conditioning on the pair is known as one of the above three. +// +// 3 contexts: Voting is used to compare the count of LAST2_FRAME with the +// total count of LAST3/GOLDEN from the spatial neighbors. +int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST2 + const int last2_count = ref_counts[LAST2_FRAME]; + // Count of LAST3 or GOLDEN + const int last3_or_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + + const int pred_context = (last2_count == last3_or_gld_count) + ? 1 + : ((last2_count < last3_or_gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as +// either (LAST, LAST3) or (LAST, GOLDEN), +// conditioning on the pair is known as one of the above two. +// +// 3 contexts: Voting is used to compare the count of LAST3_FRAME with the +// total count of GOLDEN_FRAME from the spatial neighbors. +int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST3 + const int last3_count = ref_counts[LAST3_FRAME]; + // Count of GOLDEN + const int gld_count = ref_counts[GOLDEN_FRAME]; + + const int pred_context = + (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// == Common context functions for both comp and single ref == +// +// Obtain contexts to signal a reference frame to be either LAST/LAST2 or +// LAST3/GOLDEN. 
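As a worked example of the voting scheme (neighbor choices invented): if the above block references LAST3_FRAME and the left block is a GOLDEN/ALTREF compound, the neighbor counts and resulting context for the helper below are:

  //   ref_counts[LAST_FRAME]  = 0   ref_counts[LAST2_FRAME]  = 0
  //   ref_counts[LAST3_FRAME] = 1   ref_counts[GOLDEN_FRAME] = 1
  // so last_last2_count = 0 and last3_gld_count = 2, giving context 0
  // (0 = neighbors favor LAST3/GOLDEN, 1 = tie or no information,
  //  2 = neighbors favor LAST/LAST2).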
+static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST + LAST2 + const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME]; + // Count of LAST3 + GOLDEN + const int last3_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + + const int pred_context = (last_last2_count == last3_gld_count) + ? 1 + : ((last_last2_count < last3_gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame to be either LAST or LAST2. +static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST + const int last_count = ref_counts[LAST_FRAME]; + // Count of LAST2 + const int last2_count = ref_counts[LAST2_FRAME]; + + const int pred_context = + (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame to be either LAST3 or GOLDEN. +static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST3 + const int last3_count = ref_counts[LAST3_FRAME]; + // Count of GOLDEN + const int gld_count = ref_counts[GOLDEN_FRAME]; + + const int pred_context = + (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame be either BWDREF/ALTREF2, or +// ALTREF. +static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A) + const int brfarf2_count = + ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME]; + const int arf_count = ref_counts[ALTREF_FRAME]; + + const int pred_context = + (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame be either BWDREF or ALTREF2. +static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of BWDREF frames (B) + const int brf_count = ref_counts[BWDREF_FRAME]; + // Count of ALTREF2 frames (A2) + const int arf2_count = ref_counts[ALTREF2_FRAME]; + + const int pred_context = + (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// == Context functions for comp ref == +// +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be either +// GOLDEN/LAST3, or LAST/LAST2. +int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) { + return get_pred_context_ll2_or_l3gld(xd); +} + +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be LAST, +// conditioning on that it is known either LAST/LAST2. 
+int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) { + return get_pred_context_last_or_last2(xd); +} + +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be GOLDEN, +// conditioning on that it is known either GOLDEN or LAST3. +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) { + return get_pred_context_last3_or_gld(xd); +} + +// Signal the 2nd reference frame for a compound mode be either +// ALTREF, or ALTREF2/BWDREF. +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) { + return get_pred_context_brfarf2_or_arf(xd); +} + +// Signal the 2nd reference frame for a compound mode be either +// ALTREF2 or BWDREF. +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) { + return get_pred_context_brf_or_arf2(xd); +} + +// == Context functions for single ref == +// +// For the bit to signal whether the single reference is a forward reference +// frame or a backward reference frame. +int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of forward reference frames + const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + // Count of backward reference frames + const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; + + const int pred_context = + (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// For the bit to signal whether the single reference is ALTREF_FRAME or +// non-ALTREF backward reference frame, knowing that it shall be either of +// these 2 choices. +int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { + return get_pred_context_brfarf2_or_arf(xd); +} + +// For the bit to signal whether the single reference is LAST3/GOLDEN or +// LAST2/LAST, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) { + return get_pred_context_ll2_or_l3gld(xd); +} + +// For the bit to signal whether the single reference is LAST2_FRAME or +// LAST_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) { + return get_pred_context_last_or_last2(xd); +} + +// For the bit to signal whether the single reference is GOLDEN_FRAME or +// LAST3_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) { + return get_pred_context_last3_or_gld(xd); +} + +// For the bit to signal whether the single reference is ALTREF2_FRAME or +// BWDREF_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) { + return get_pred_context_brf_or_arf2(xd); +} diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h new file mode 100644 index 0000000000..361a4078d4 --- /dev/null +++ b/third_party/aom/av1/common/pred_common.h @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_PRED_COMMON_H_ +#define AOM_AV1_COMMON_PRED_COMMON_H_ + +#include + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE uint8_t get_segment_id( + const CommonModeInfoParams *const mi_params, const uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); + const int seg_stride = mi_params->mi_cols; + uint8_t segment_id = MAX_SEGMENTS; + + for (int y = 0; y < ymis; ++y) { + for (int x = 0; x < xmis; ++x) { + segment_id = + AOMMIN(segment_id, segment_ids[mi_offset + y * seg_stride + x]); + } + } + + assert(segment_id < MAX_SEGMENTS); + return segment_id; +} + +static INLINE uint8_t av1_get_spatial_seg_pred(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + int *cdf_index, + int skip_over4x4) { + const int step_size = skip_over4x4 ? 2 : 1; + uint8_t prev_ul = UINT8_MAX; // top left segment_id + uint8_t prev_l = UINT8_MAX; // left segment_id + uint8_t prev_u = UINT8_MAX; // top segment_id + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const uint8_t *seg_map = cm->cur_frame->seg_map; + if ((xd->up_available) && (xd->left_available)) { + prev_ul = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size, + mi_col - step_size); + } + if (xd->up_available) { + prev_u = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - step_size, + mi_col - 0); + } + if (xd->left_available) { + prev_l = get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0, + mi_col - step_size); + } + assert(IMPLIES(prev_ul != UINT8_MAX, + prev_u != UINT8_MAX && prev_l != UINT8_MAX)); + + // Pick CDF index based on number of matching/out-of-bounds segment IDs. + if (prev_ul == UINT8_MAX) /* Edge cases */ + *cdf_index = 0; + else if ((prev_ul == prev_u) && (prev_ul == prev_l)) + *cdf_index = 2; + else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l)) + *cdf_index = 1; + else + *cdf_index = 0; + + // If 2 or more are identical returns that as predictor, otherwise prev_l. + if (prev_u == UINT8_MAX) // edge case + return prev_l == UINT8_MAX ? 0 : prev_l; + if (prev_l == UINT8_MAX) // edge case + return prev_u; + return (prev_ul == prev_u) ? prev_u : prev_l; +} + +static INLINE uint8_t av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0; + const int left_sip = (left_mi != NULL) ? 
left_mi->seg_id_predicted : 0; + + return above_sip + left_sip; +} + +static INLINE int get_comp_index_context(const AV1_COMMON *cm, + const MACROBLOCKD *xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); + int bck_frame_index = 0, fwd_frame_index = 0; + int cur_frame_index = cm->cur_frame->order_hint; + + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; + + int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info, + fwd_frame_index, cur_frame_index)); + int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info, + cur_frame_index, bck_frame_index)); + + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + + int above_ctx = 0, left_ctx = 0; + const int offset = (fwd == bck); + + if (above_mi != NULL) { + if (has_second_ref(above_mi)) + above_ctx = above_mi->compound_idx; + else if (above_mi->ref_frame[0] == ALTREF_FRAME) + above_ctx = 1; + } + + if (left_mi != NULL) { + if (has_second_ref(left_mi)) + left_ctx = left_mi->compound_idx; + else if (left_mi->ref_frame[0] == ALTREF_FRAME) + left_ctx = 1; + } + + return above_ctx + left_ctx + 3 * offset; +} + +static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int above_ctx = 0, left_ctx = 0; + + if (above_mi) { + if (has_second_ref(above_mi)) + above_ctx = above_mi->comp_group_idx; + else if (above_mi->ref_frame[0] == ALTREF_FRAME) + above_ctx = 3; + } + if (left_mi) { + if (has_second_ref(left_mi)) + left_ctx = left_mi->comp_group_idx; + else if (left_mi->ref_frame[0] == ALTREF_FRAME) + left_ctx = 3; + } + + return AOMMIN(5, above_ctx + left_ctx); +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id( + struct segmentation_probs *segp, const MACROBLOCKD *xd) { + return segp->pred_cdf[av1_get_pred_context_seg_id(xd)]; +} + +static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const int above_skip_mode = above_mi ? above_mi->skip_mode : 0; + const int left_skip_mode = left_mi ? left_mi->skip_mode : 0; + return above_skip_mode + left_skip_mode; +} + +static INLINE int av1_get_skip_txfm_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + const int above_skip_txfm = above_mi ? above_mi->skip_txfm : 0; + const int left_skip_txfm = left_mi ? left_mi->skip_txfm : 0; + return above_skip_txfm + left_skip_txfm; +} + +int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir); + +// Get a list of palette base colors that are used in the above and left blocks, +// referred to as "color cache". The return value is the number of colors in the +// cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored in "cache" +// in ascending order. 
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, + uint16_t *cache); + +static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8]; +} + +static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int ctx = 0; + if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0); + if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0); + return ctx; +} + +int av1_get_intra_inter_context(const MACROBLOCKD *xd); + +int av1_get_reference_mode_context(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)]; +} + +static INLINE aom_cdf_prob *av1_get_skip_txfm_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->skip_txfm_cdfs[av1_get_skip_txfm_context(xd)]; +} + +int av1_get_comp_reference_type_context(const MACROBLOCKD *xd); + +// == Uni-directional contexts == + +int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_comp_reference_type_context(xd); + return xd->tile_ctx->comp_ref_type_cdf[pred_context]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][1]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2]; +} + +// == Bi-directional contexts == + +int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p1(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][1]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p2(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][2]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_bwdref_p(xd); + return xd->tile_ctx->comp_bwdref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1( + const MACROBLOCKD *xd) { + const int 
pred_context = av1_get_pred_context_comp_bwdref_p1(xd); + return xd->tile_ctx->comp_bwdref_cdf[pred_context][1]; +} + +// == Single contexts == + +int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); + +int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd); + +int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd); + +int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd); + +int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4]; +} +static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6( + const MACROBLOCKD *xd) { + return xd->tile_ctx + ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5]; +} + +// Returns a context number for the given MB prediction signal +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real blocks. +// The prediction flags in these dummy entries are initialized to 0. +static INLINE int get_tx_size_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *mbmi = xd->mi[0]; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->bsize]; + const int max_tx_wide = tx_size_wide[max_tx_size]; + const int max_tx_high = tx_size_high[max_tx_size]; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + int above = xd->above_txfm_context[0] >= max_tx_wide; + int left = xd->left_txfm_context[0] >= max_tx_high; + + if (has_above) + if (is_inter_block(above_mbmi)) + above = block_size_wide[above_mbmi->bsize] >= max_tx_wide; + + if (has_left) + if (is_inter_block(left_mbmi)) + left = block_size_high[left_mbmi->bsize] >= max_tx_high; + + if (has_above && has_left) + return (above + left); + else if (has_above) + return above; + else if (has_left) + return left; + else + return 0; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_PRED_COMMON_H_ diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c new file mode 100644 index 0000000000..b0976287ef --- /dev/null +++ b/third_party/aom/av1/common/quant_common.c @@ -0,0 +1,12876 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = { + 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, + 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, + 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, + 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, + 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, + 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, + 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, + 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, + 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, + 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364, + 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, + 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, + 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, + 1184, 1232, 1282, 1336, +}; + +static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = { + 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, + 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, + 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, + 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182, + 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, + 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276, + 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321, + 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, + 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466, + 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, + 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, + 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, + 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, + 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, + 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436, + 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, + 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, + 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, + 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823, + 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, +}; + +static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = { + 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, + 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, + 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, + 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, + 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, + 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919, + 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 
1065, 1080, + 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, + 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, + 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957, + 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, + 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, + 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226, + 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, + 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153, + 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, + 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966, + 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214, + 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, + 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118, + 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, + 19718, 20521, 21387, +}; + +static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = { + 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, + 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, + 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, + 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, + 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, + 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353, + 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, + 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571, + 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, + 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, + 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, + 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, +}; + +static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = { + 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, + 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, + 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, + 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208, + 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, + 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324, + 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379, + 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, + 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571, + 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, + 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, + 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118, + 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411, + 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, + 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, + 
2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, + 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731, + 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, + 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, + 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, +}; + +static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = { + 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, + 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, + 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, + 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660, + 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, + 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067, + 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, + 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693, + 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, + 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411, + 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943, + 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, + 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310, + 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, + 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867, + 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, + 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, + 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637, + 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062, + 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, + 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, + 28143, 28687, 29247, +}; + +// Coefficient scaling and quantization with AV1 TX are tailored to +// the AV1 TX transforms. Regardless of the bit-depth of the input, +// the transform stages scale the coefficient values up by a factor of +// 8 (3 bits) over the scale of the pixel values. Thus, for 8-bit +// input, the coefficients have effectively 11 bits of scale depth +// (8+3), 10-bit input pixels result in 13-bit coefficient depth +// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth. +// All quantizers are built using this invariant of x8, 3-bit scaling, +// thus the Q3 suffix. + +// A partial exception to this rule is large transforms; to avoid +// overflow, TX blocks with > 256 pels (>16x16) are scaled only +// 4-times unity (2 bits) over the pixel depth, and TX blocks with +// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit). +// This descaling is found via av1_tx_get_scale(). Thus, 16x32, 32x16 +// and 32x32 transforms actually return Q2 coefficients, and 32x64, +// 64x32 and 64x64 transforms return Q1 coefficients. However, the +// quantizers are de-scaled down on-the-fly by the same amount +// (av1_tx_get_scale()) during quantization, and as such the +// dequantized/decoded coefficients, even for large TX blocks, are always +// effectively Q3. Meanwhile, quantized/coded coefficients are Q0 +// because Qn quantizers are applied to Qn tx coefficients. 
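+
+// Illustrative note (editorial addition, not upstream source): a minimal
+// sketch of the descaling rule described above, assuming av1_tx_get_scale()
+// simply maps the pel-count thresholds from the preceding comment to a
+// bit-shift. It is only meant to make the Q3/Q2/Q1 relationship concrete.
+//
+//   static int example_tx_get_scale(int pels) {
+//     if (pels > 1024) return 2;  // e.g. 32x64, 64x32, 64x64: x2 scale, Q1
+//     if (pels > 256) return 1;   // e.g. 16x32, 32x16, 32x32: x4 scale, Q2
+//     return 0;                   // up to 16x16 (256 pels): x8 scale, Q3
+//   }
+//
+// Under the arithmetic in the comment above, 8-bit input therefore yields
+// 11-bit (8+3, Q3) coefficients for a 16x16 block but only 9-bit (8+1, Q1)
+// coefficients for a 64x64 block, and the quantizer divides the same
+// factor back out during quantization.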
+ +// Note that encoder decision making (which uses the quantizer to +// generate several bespoke lamdas for RDO and other heuristics) +// expects quantizers to be larger for higher-bitdepth input. In +// addition, the minimum allowable quantizer is 4; smaller values will +// underflow to 0 in the actual quantization routines. + +int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + const int q_clamped = clamp(qindex + delta, 0, MAXQ); + switch (bit_depth) { + case AOM_BITS_8: return dc_qlookup_QTX[q_clamped]; + case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped]; + case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped]; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} + +int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) { + const int q_clamped = clamp(qindex + delta, 0, MAXQ); + switch (bit_depth) { + case AOM_BITS_8: return ac_qlookup_QTX[q_clamped]; + case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped]; + case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped]; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} + +int av1_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { + if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { + const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); + const int seg_qindex = base_qindex + data; + return clamp(seg_qindex, 0, MAXQ); + } else { + return base_qindex; + } +} + +bool av1_use_qmatrix(const CommonQuantParams *quant_params, + const struct macroblockd *xd, int segment_id) { + // True if explicit Q matrix levels and this is not a lossless segment. + return quant_params->using_qmatrix && !xd->lossless[segment_id]; +} + +const qm_val_t *av1_iqmatrix(const CommonQuantParams *quant_params, int qmlevel, + int plane, TX_SIZE tx_size) { + assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL || + qmlevel == NUM_QM_LEVELS - 1); + return quant_params->giqmatrix[qmlevel][plane][tx_size]; +} +const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel, + int plane, TX_SIZE tx_size) { + assert(quant_params->gqmatrix[qmlevel][plane][tx_size] != NULL || + qmlevel == NUM_QM_LEVELS - 1); + return quant_params->gqmatrix[qmlevel][plane][tx_size]; +} + +// Returns true if the tx_type corresponds to non-identity transform in both +// horizontal and vertical directions. +static INLINE bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); } + +const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? pd->seg_iqmatrix[seg_id][qm_tx_size] + : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; +} + +const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, + TX_SIZE tx_size, TX_TYPE tx_type) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int seg_id = mbmi->segment_id; + const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size); + // Use a flat matrix (i.e. 
no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? pd->seg_qmatrix[seg_id][qm_tx_size] + : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; +} + +#define QM_TOTAL_SIZE 3344 +// We only use wt_matrix_ref[q] and iwt_matrix_ref[q] +// for q = 0, ..., NUM_QM_LEVELS - 2. +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; + +void av1_qm_init(CommonQuantParams *quant_params, int num_planes) { + for (int q = 0; q < NUM_QM_LEVELS; ++q) { + for (int c = 0; c < num_planes; ++c) { + int current = 0; + for (int t = 0; t < TX_SIZES_ALL; ++t) { + const int size = tx_size_2d[t]; + const int qm_tx_size = av1_get_adjusted_tx_size(t); + if (q == NUM_QM_LEVELS - 1) { + quant_params->gqmatrix[q][c][t] = NULL; + quant_params->giqmatrix[q][c][t] = NULL; + } else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size' + assert(t > qm_tx_size); + quant_params->gqmatrix[q][c][t] = + quant_params->gqmatrix[q][c][qm_tx_size]; + quant_params->giqmatrix[q][c][t] = + quant_params->giqmatrix[q][c][qm_tx_size]; + } else { + assert(current + size <= QM_TOTAL_SIZE); + quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; + quant_params->giqmatrix[q][c][t] = + &iwt_matrix_ref[q][c >= 1][current]; + current += size; + } + } + } + } +} + +/* Provide 15 sets of quantization matrices for chroma and luma + and each TX size. Matrices for different TX sizes are in fact + sub-sampled from the 32x32 and 16x16 sizes, but explicitly + defined here for convenience. Intra and inter matrix sets are the + same but changing DEFAULT_QM_INTER_OFFSET from zero allows + for different matrices for inter and intra blocks in the same + frame. + Matrices for different QM levels have been rescaled in the + frequency domain according to different nominal viewing + distances. Matrices for QM level 15 are omitted because they are + not used. 
+ */ +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { + { + { /* Luma */ + /* Size 4x4 */ + 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200, + /* Size 8x8 */ + 32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102, 38, + 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121, 68, 63, + 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159, 168, 95, 89, + 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142, 168, 199, 220, + /* Size 16x16 */ + 32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119, 31, + 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112, 31, 32, + 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107, 34, 33, 35, + 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105, 36, 34, 36, 42, 48, + 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105, 44, 41, 42, 47, 54, 63, + 67, 75, 79, 90, 92, 95, 100, 102, 109, 112, 48, 44, 45, 51, 57, 67, 71, + 80, 85, 96, 99, 107, 108, 111, 117, 120, 59, 54, 54, 58, 64, 75, 80, 92, + 98, 110, 113, 115, 116, 122, 125, 130, 65, 59, 59, 63, 68, 79, 85, 98, + 105, 118, 121, 127, 130, 134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110, + 118, 134, 137, 140, 143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113, + 121, 137, 140, 151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107, + 115, 127, 140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100, + 108, 116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96, + 102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104, 100, + 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210, 220, 119, + 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179, 193, 210, 220, + 231, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65, 71, + 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 115, 119, 123, 31, 32, + 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62, 68, 76, 77, + 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116, 31, 32, 32, 32, 32, + 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59, 65, 72, 73, 75, 80, 83, + 86, 90, 93, 97, 101, 104, 108, 112, 116, 31, 32, 32, 32, 33, 33, 34, 35, + 35, 38, 41, 43, 45, 49, 54, 56, 59, 64, 72, 73, 74, 79, 82, 85, 88, 91, + 94, 97, 101, 104, 107, 111, 31, 32, 32, 33, 33, 34, 35, 36, 36, 39, 42, + 44, 45, 50, 54, 56, 59, 64, 71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100, + 104, 107, 111, 32, 32, 32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49, + 53, 55, 58, 63, 69, 70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106, + 109, 34, 34, 33, 34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63, + 68, 74, 75, 76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34, + 34, 35, 36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77, + 79, 82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36, + 38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85, 88, + 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40, 45, 47, + 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92, 92, 95, 98, + 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42, 47, 50, 54, 58, + 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95, 97, 100, 101, 102, + 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67, + 69, 74, 78, 80, 82, 87, 93, 94, 95, 98, 100, 103, 102, 105, 108, 110, + 111, 113, 117, 121, 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71, + 76, 80, 83, 85, 90, 96, 97, 99, 103, 107, 105, 108, 111, 111, 113, 117, + 119, 120, 122, 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 
82, + 87, 89, 92, 97, 104, 105, 106, 111, 110, 111, 114, 113, 116, 120, 120, + 121, 125, 130, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, + 92, 95, 98, 103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125, + 129, 130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89, + 95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129, 131, + 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, + 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130, 134, 133, + 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, + 97, 103, 107, 111, 117, 125, 126, 128, 134, 132, 136, 133, 138, 137, + 140, 143, 142, 145, 150, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93, + 96, 104, 110, 114, 118, 125, 134, 135, 137, 139, 140, 139, 143, 142, + 144, 146, 146, 151, 152, 151, 81, 77, 73, 73, 72, 70, 75, 77, 80, 85, + 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, 138, 144, 147, 146, 148, + 149, 151, 150, 156, 155, 157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81, + 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, 147, 151, 156, + 152, 157, 155, 161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82, + 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154, + 158, 163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81, + 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, + 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86, 85, 84, + 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146, + 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183, 190, 97, 92, 90, + 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, + 148, 152, 163, 166, 168, 174, 176, 182, 187, 189, 188, 193, 191, 101, + 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130, + 138, 142, 149, 157, 159, 171, 174, 176, 183, 184, 191, 195, 199, 197, + 204, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122, + 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, 193, 200, + 204, 210, 206, 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, + 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, + 193, 200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102, + 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, + 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108, 104, 104, + 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151, + 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, 222, 230, 119, + 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, + 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, + 222, 231, 232, 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, + 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191, + 204, 206, 222, 224, 230, 232, 242, + /* Size 4x8 */ + 32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112, 75, + 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152, 178, 190, + /* Size 8x4 */ + 32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110, 65, + 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112, 146, 190, + /* Size 8x16 */ + 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, 32, + 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, 36, 34, + 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, 53, 49, 50, + 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, 65, 59, 59, + 63, 68, 79, 85, 98, 105, 118, 121, 130, 
128, 131, 138, 136, 87, 78, 77, + 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, 93, 86, + 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, 99, + 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203, + /* Size 16x8 */ + 32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32, 34, + 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, + 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, 85, 102, + 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58, 68, 92, 105, 124, + 122, 124, 79, 70, 79, 104, 118, 141, 135, 135, 82, 72, 81, 106, 121, + 144, 149, 146, 91, 80, 88, 106, 130, 148, 162, 159, 97, 86, 94, 107, + 128, 157, 167, 171, 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, + 117, 138, 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, + 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 31, 32, + 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74, + 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32, 32, 33, 33, 34, + 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80, + 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34, 33, 34, 35, 37, 39, 41, + 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87, + 90, 93, 97, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, + 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, + 101, 103, 105, 107, 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, + 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108, + 112, 115, 119, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, + 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, + 117, 118, 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, + 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123, + 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, + 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, + 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96, + 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, + 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, + 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, + 150, 155, 161, 159, 157, 156, 90, 85, 82, 81, 80, 78, 78, 83, 87, 89, + 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, + 161, 164, 170, 168, 165, 167, 172, 178, 93, 88, 86, 84, 82, 82, 80, 84, + 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, + 165, 167, 173, 174, 177, 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83, + 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, + 160, 167, 173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89, + 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, + 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97, 93, + 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142, + 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217, + /* Size 32x16 */ + 32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102, 31, 32, + 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31, 32, 33, 33, + 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, + 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, 50, 54, + 59, 71, 77, 80, 82, 86, 89, 93, 32, 
33, 35, 37, 38, 42, 49, 53, 58, 69, + 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, + 80, 83, 88, 92, 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84, + 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, 86, 89, 90, 90, + 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, 89, 89, 91, 91, 93, 96, 44, 41, + 43, 48, 53, 63, 71, 75, 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, + 55, 65, 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56, + 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60, + 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63, + 75, 87, 92, 98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66, + 77, 89, 95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68, + 79, 92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, + 84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, + 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75, + 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, 72, + 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, 88, 80, + 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, 152, 154, 91, + 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, 162, 160, 159, 155, + 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, 153, 160, 165, 167, 166, + 168, 97, 88, 86, 85, 94, 100, 107, 123, 128, 140, 157, 161, 167, 173, + 171, 169, 100, 91, 89, 87, 97, 100, 111, 121, 127, 145, 152, 164, 173, + 178, 182, 181, 103, 94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170, + 174, 180, 186, 183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155, + 168, 177, 187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138, + 141, 161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117, + 127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103, 105, + 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122, 111, 111, + 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203, 204, 217, + /* Size 4x16 */ + 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, 44, + 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115, 79, 72, + 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, 96, 90, + 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197, + /* Size 16x4 */ + 31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34, 54, + 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60, 79, 118, + 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, + 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197, + /* Size 8x32 */ + 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, + 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 32, 32, + 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, + 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36, 35, 34, 35, 36, + 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, + 91, 94, 97, 98, 100, 101, 103, 105, 107, 53, 51, 49, 49, 50, 49, 54, 57, + 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, + 107, 111, 114, 117, 117, 117, 118, 119, 65, 62, 59, 59, 59, 58, 63, 65, + 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, + 128, 127, 131, 136, 138, 137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82, + 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, + 153, 157, 152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80, + 84, 86, 91, 94, 98, 105, 107, 
112, 119, 122, 130, 135, 140, 149, 153, + 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90, 89, + 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, + 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, + /* Size 32x8 */ + 32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31, 33, + 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34, 36, 50, + 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, + 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, 86, 90, + 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79, 95, 94, 97, 46, 44, + 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85, 102, 105, 105, 53, 50, 60, + 82, 92, 109, 107, 107, 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, + 89, 101, 120, 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, + 97, 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, 80, + 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, 88, 77, + 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94, + 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171, + 100, 89, 97, 111, 127, 152, 173, 182, 103, 93, 98, 114, 131, 150, 174, + 186, 107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138, + 161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107, 105, + 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179, 204 }, + { /* Chroma */ + /* Size 4x4 */ + 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109, + /* Size 8x8 */ + 31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47, 46, + 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52, 61, 72, + 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67, 75, 86, 95, + 104, 107, 71, 67, 68, 75, 84, 95, 107, 113, + /* Size 16x16 */ + 32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30, 32, + 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35, 39, 45, + 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45, 48, 50, 49, + 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50, 53, 53, 54, 55, + 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53, 58, 60, 62, 63, 67, + 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60, 61, 65, 67, 71, 71, 74, + 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65, 71, 73, 78, 79, 78, 77, 78, + 78, 78, 57, 52, 51, 53, 56, 63, 67, 73, 76, 82, 83, 84, 84, 84, 82, 83, + 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 90, 89, 88, 87, 88, 65, 58, + 57, 58, 61, 68, 71, 79, 83, 90, 91, 94, 93, 93, 92, 93, 68, 62, 60, 59, + 64, 67, 74, 78, 84, 90, 94, 98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69, + 73, 77, 84, 89, 93, 99, 102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73, + 78, 84, 88, 93, 98, 103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78, + 82, 87, 92, 98, 104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83, + 88, 93, 98, 104, 109, 112, 116, + /* Size 32x32 */ + 32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57, 60, + 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31, 31, 33, + 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60, 61, 61, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33, 35, 40, 42, 44, + 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58, 61, 62, 63, 64, 66, + 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41, 43, 45, 47, 46, 45, 46, + 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 33, 34, 35, 37, 39, 43, 45, 46, 47, 46, 45, 46, 46, 47, 49, 50, + 51, 53, 56, 57, 57, 59, 60, 61, 62, 63, 
64, 65, 66, 68, 69, 70, 36, 38, + 40, 41, 43, 47, 47, 47, 48, 46, 45, 46, 46, 47, 48, 49, 50, 52, 54, 55, + 55, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47, + 48, 49, 50, 49, 49, 49, 50, 50, 52, 52, 53, 55, 57, 58, 58, 60, 59, 59, + 60, 61, 61, 63, 64, 66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51, + 51, 51, 52, 52, 53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63, + 63, 64, 65, 66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54, + 55, 56, 56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66, + 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, + 63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45, 45, + 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67, 68, 69, + 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46, 46, 49, 51, + 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70, 71, 71, 70, 70, + 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60, + 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72, 73, 74, 73, 73, 74, 74, + 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, 68, 69, + 70, 72, 75, 75, 75, 77, 75, 75, 76, 75, 75, 76, 75, 75, 76, 77, 54, 52, + 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, + 79, 79, 78, 79, 77, 78, 78, 77, 78, 79, 78, 78, 56, 53, 51, 50, 50, 49, + 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80, + 81, 81, 79, 81, 80, 79, 81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, + 63, 65, 67, 70, 73, 75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82, + 82, 84, 83, 82, 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, + 75, 77, 79, 82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, + 89, 89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57, + 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91, 92, + 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55, 58, 60, + 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94, 96, 93, 94, + 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70, + 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 96, 97, 98, 95, 97, 95, 96, 95, + 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82, + 84, 86, 90, 93, 94, 96, 98, 98, 99, 100, 98, 99, 98, 98, 98, 97, 69, 65, + 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92, + 96, 97, 98, 100, 100, 101, 102, 101, 101, 101, 100, 102, 70, 66, 64, 63, + 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, + 99, 100, 102, 102, 103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62, + 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, + 101, 102, 104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61, + 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, + 103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63, 63, + 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, + 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66, 66, 64, 63, 66, + 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, + 108, 108, 111, 111, 112, 113, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, + 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, + 110, 111, 113, 113, 115, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72, + 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111, + 112, 113, 116, 115, 78, 74, 74, 70, 70, 69, 69, 66, 
66, 70, 70, 74, 74, + 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, + 115, 115, 118, + /* Size 4x8 */ + 31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60, 54, + 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105, + /* Size 8x4 */ + 31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54, 65, + 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105, + /* Size 8x16 */ + 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37, 40, + 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46, 47, 50, + 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47, 50, 54, 61, + 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53, 57, 64, 67, 73, + 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62, 69, 73, 80, 84, 92, + 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98, + 99, 101, 103, 101, 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98, + 104, 106, 109, + /* Size 16x8 */ + 32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33, 43, + 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48, 53, 54, + 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54, 64, 67, 73, + 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70, 76, 84, 80, 79, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, + 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, + 83, 92, 101, 104, 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, + 91, 101, 109, + /* Size 16x32 */ + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, + 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31, 32, 34, + 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61, + 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41, 43, 47, 47, 47, + 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, + 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49, + 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65, + 66, 67, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, + 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 49, 47, + 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, + 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47, + 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, + 73, 74, 75, 75, 74, 74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, + 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, + 77, 78, 80, 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, + 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85, + 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60, 59, + 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, + 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60, 58, 58, 61, + 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98, + 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, + 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, + 103, 101, 99, 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73, + 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105, + 107, 108, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, + 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, + 108, 72, 68, 68, 65, 65, 
63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, + 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113, + /* Size 32x16 */ + 32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31, 31, + 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32, 40, 42, + 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41, 44, 46, 45, + 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45, 47, 46, 47, 49, + 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, + 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, + 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, + 61, 61, 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62, + 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, 49, 45, + 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, 49, 46, 46, 49, + 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59, + 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68, + 71, 75, 77, 74, 73, 73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, + 80, 78, 76, 74, 75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, + 79, 78, 76, 77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, + 79, 77, 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81, + 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58, + 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56, 58, + 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60, 63, 69, + 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64, 71, 74, 78, + 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, + 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, + 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, + 102, 102, 101, 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, + 103, 104, 102, 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, + 106, 107, 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, + 107, 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109, + 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77, + 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113, + /* Size 4x16 */ + 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49, 45, + 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57, 56, 57, + 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62, 60, 63, 66, + 70, 74, 80, 85, 91, 96, 101, 103, 105, 107, + /* Size 16x4 */ + 31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46, 53, + 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64, 82, 80, + 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89, 101, 65, 68, + 89, 103, 67, 70, 86, 105, 69, 72, 88, 107, + /* Size 8x32 */ + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, + 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38, 40, 41, + 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, + 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46, 47, 47, 50, 51, + 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, + 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, + 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, + 73, 73, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, + 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 66, 63, + 60, 59, 59, 57, 60, 61, 
62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, + 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60, + 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, + 99, 102, 101, 102, 103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61, + 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, + 104, 106, 106, 109, 109, 108, + /* Size 32x8 */ + 32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30, 40, + 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43, 47, 47, + 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50, 50, 53, 60, + 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54, 57, 62, 62, 62, + 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, + 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, + 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, + 79, 76, 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, 64, 56, + 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91, 68, 59, 64, 74, + 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96, + 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101, + 104, 73, 65, 66, 75, 84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106, + 75, 68, 66, 74, 83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77, + 70, 67, 73, 81, 90, 99, 108 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140, 184, + /* Size 8x8 */ + 32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96, 37, 39, + 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113, 62, 58, 71, + 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148, 155, 90, 84, 93, + 106, 129, 148, 169, 183, 102, 96, 100, 113, 132, 155, 183, 201, + /* Size 16x16 */ + 32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111, 31, + 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105, 31, 32, 33, + 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100, 32, 33, 34, 36, 40, + 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36, 34, 36, 40, 48, 50, 56, + 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37, 39, 42, 50, 54, 60, 65, 70, + 78, 84, 89, 95, 96, 102, 105, 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, + 95, 100, 102, 104, 109, 112, 54, 50, 50, 51, 60, 65, 75, 82, 89, 97, + 104, 109, 110, 114, 117, 121, 61, 56, 56, 57, 65, 70, 81, 89, 97, 106, + 113, 119, 122, 126, 125, 130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117, + 125, 131, 134, 134, 136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125, + 134, 140, 142, 145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 109, 119, + 131, 140, 147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110, + 122, 134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104, + 114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95, 102, + 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105, 100, 98, 98, + 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61, 65, + 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114, 31, 32, 32, + 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58, 62, 68, 72, 76, + 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31, 32, 32, 32, 32, 32, + 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60, 65, 70, 73, 76, 79, 82, + 85, 88, 91, 95, 98, 101, 105, 109, 31, 32, 32, 32, 32, 33, 33, 34, 35, + 36, 38, 41, 44, 45, 49, 54, 56, 59, 
65, 69, 72, 75, 78, 81, 84, 86, 89, + 92, 95, 98, 101, 104, 31, 32, 32, 32, 33, 34, 34, 35, 36, 38, 39, 42, + 45, 46, 50, 54, 56, 59, 64, 68, 71, 74, 77, 79, 82, 85, 88, 91, 94, 97, + 100, 104, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, + 53, 55, 58, 63, 66, 69, 72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32, + 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65, + 68, 71, 73, 76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35, + 37, 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79, + 82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42, 48, + 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86, 88, 90, + 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, + 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93, 94, 95, 96, 100, + 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, + 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96, 99, 102, 104, 105, 106, + 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 77, 79, + 84, 88, 90, 92, 95, 97, 97, 99, 102, 103, 103, 106, 109, 113, 47, 45, + 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, + 95, 97, 100, 100, 102, 105, 104, 106, 109, 111, 112, 113, 49, 47, 46, + 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98, + 100, 103, 105, 107, 106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49, + 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, + 106, 109, 112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54, + 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110, + 113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56, 56, + 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106, 110, 113, + 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130, 65, 62, 60, 59, + 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, 111, 115, 118, + 121, 124, 128, 125, 129, 128, 131, 133, 132, 135, 139, 71, 68, 65, 65, + 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, 122, + 125, 128, 131, 131, 134, 132, 134, 136, 136, 140, 141, 140, 76, 72, 70, + 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, + 127, 130, 133, 136, 136, 138, 139, 141, 140, 145, 143, 146, 151, 80, 76, + 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, + 130, 134, 137, 140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78, + 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, + 128, 133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162, + 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119, + 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160, 162, 165, + 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, + 120, 128, 131, 136, 146, 147, 150, 155, 156, 161, 166, 165, 167, 169, + 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110, + 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, 163, 168, 173, 174, + 174, 178, 176, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106, + 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, 170, 176, + 180, 183, 181, 187, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, + 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, + 176, 178, 184, 188, 193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, + 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, + 173, 176, 178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90, + 
95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, + 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101, 98, 97, 96, + 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, + 155, 162, 169, 174, 183, 188, 192, 194, 201, 202, 210, 111, 105, 105, + 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135, + 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, 211, + 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120, + 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, + 204, 210, 211, 219, + /* Size 4x8 */ + 32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105, 69, 64, + 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167, 177, + /* Size 8x4 */ + 32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105, 60, 79, + 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136, 177, + /* Size 8x16 */ + 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, 32, + 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, 36, 35, 36, + 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47, 44, 45, 47, 56, + 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, 65, 60, 59, 60, 68, 73, + 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, 79, 72, 71, 71, 78, 84, + 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, 90, 84, 80, 78, 83, 91, + 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, 96, 90, 87, 85, 87, + 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188, + /* Size 16x8 */ + 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32, 34, + 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, + 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, 84, 95, + 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56, 65, 81, 100, 113, + 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79, 70, 79, 95, 118, 133, + 142, 138, 86, 76, 84, 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, + 148, 157, 161, 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, + 129, 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, + 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 31, 32, 32, + 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72, + 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32, 32, 32, 33, 34, 35, + 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 104, 32, 33, 33, 33, 34, 36, 36, 38, 40, + 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85, + 88, 91, 94, 97, 100, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, + 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, + 100, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77, + 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45, + 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, + 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49, + 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, + 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62, 60, 59, + 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, + 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 73, 69, 67, 66, + 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123, + 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, 79, 75, 72, + 71, 71, 69, 71, 73, 78, 81, 
84, 90, 95, 97, 103, 110, 113, 118, 125, + 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 87, 83, + 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126, + 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166, + 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, + 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, + 166, 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, + 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, + 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, + 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, + 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106, + 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, + 186, 192, 193, 201, + /* Size 32x16 */ + 32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31, 32, + 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32, 32, 33, + 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, + 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, 45, 50, + 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38, 42, 45, 49, 58, 64, + 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, + 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81, + 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, 83, 86, 87, 87, + 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, 81, 86, 88, 88, 90, 93, 39, 37, + 40, 42, 50, 58, 60, 65, 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, + 53, 63, 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66, + 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71, + 77, 86, 93, 97, 103, 103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82, + 92, 99, 103, 111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, + 98, 105, 110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, + 100, 107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92, + 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89, 97, + 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88, 92, 101, + 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71, 79, 90, 95, 104, + 118, 127, 133, 143, 142, 141, 138, 136, 82, 75, 73, 74, 81, 92, 97, 106, + 121, 130, 136, 146, 145, 144, 144, 145, 86, 78, 76, 77, 84, 95, 100, + 109, 124, 133, 140, 147, 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, + 112, 124, 130, 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, + 101, 116, 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, + 95, 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85, + 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91, + 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95, + 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107, + 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 188, 192, + 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169, 183, 188, + 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145, 145, 166, 166, + 189, 190, 201, + /* Size 4x16 */ + 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101, 44, 41, + 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, 73, 67, 65, + 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, 93, 87, 83, + 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183, + /* Size 16x4 */ + 31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 
33, 44, 66, 81, 34, 54, + 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56, 77, 107, + 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, + 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183, + /* Size 8x32 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, + 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 32, 32, 32, + 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, + 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36, 35, 35, 35, 36, 38, + 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, + 89, 92, 93, 94, 95, 96, 98, 100, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, + 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, + 110, 110, 110, 111, 111, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, + 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, + 128, 129, 128, 127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, + 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, + 146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, + 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, + 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, + 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, + 171, 174, 179, 181, 188, 188, 190, + /* Size 32x8 */ + 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31, 32, + 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34, 36, 45, + 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, + 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, 83, 87, + 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73, 84, 91, 94, 44, 42, + 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95, 101, 101, 49, 47, 57, + 71, 86, 97, 103, 102, 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, + 98, 110, 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84, + 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, 76, 92, + 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, 82, 73, 81, 97, + 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99, + 124, 145, 156, 156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92, + 105, 120, 143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91, + 94, 110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107, + 97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, + 114, 104, 100, 111, 127, 145, 166, 190 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105, + /* Size 8x8 */ + 31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47, 47, + 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50, 58, 67, + 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65, 73, 84, 92, + 101, 103, 69, 65, 66, 73, 82, 92, 103, 109, + /* Size 16x16 */ + 32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30, 31, + 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35, 39, 44, + 47, 46, 46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44, 47, 49, 48, + 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49, 53, 53, 54, 54, + 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 53, 54, 56, 57, 59, 61, + 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, + 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63, 66, 69, 72, 75, 76, 75, 76, + 76, 76, 55, 51, 50, 50, 56, 59, 65, 69, 73, 77, 79, 81, 81, 81, 
80, 80, + 60, 55, 53, 53, 58, 61, 68, 72, 77, 82, 85, 87, 87, 85, 84, 85, 63, 58, + 56, 55, 60, 63, 70, 75, 79, 85, 89, 91, 91, 90, 89, 90, 66, 60, 58, 58, + 62, 65, 72, 76, 81, 87, 91, 94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67, + 71, 75, 81, 87, 91, 96, 99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71, + 76, 81, 85, 90, 95, 100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76, + 80, 84, 89, 95, 100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80, + 85, 90, 95, 100, 105, 108, 111, + /* Size 32x32 */ + 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 31, 32, + 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57, 59, 60, 61, + 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32, 35, 39, 41, 42, + 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58, 59, 60, 62, 63, 64, + 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40, 41, 43, 46, 46, 45, 45, + 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59, 61, 62, 63, 63, 64, 65, 66, + 67, 68, 33, 34, 35, 36, 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49, + 50, 51, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 36, 38, + 39, 40, 43, 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53, + 54, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47, + 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58, + 58, 59, 60, 61, 62, 64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, + 49, 49, 50, 50, 50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61, + 62, 63, 63, 64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, + 54, 55, 56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58, + 60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46, 45, + 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 63, 64, + 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45, 45, 47, 49, + 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68, 69, 69, 68, 68, + 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 71, 71, 72, 72, + 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 64, 66, + 66, 67, 69, 70, 71, 72, 73, 73, 74, 73, 73, 74, 73, 73, 74, 75, 52, 50, + 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, + 75, 75, 76, 77, 75, 76, 76, 75, 76, 77, 76, 75, 54, 52, 50, 49, 49, 48, + 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78, + 79, 78, 77, 78, 78, 77, 78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, + 59, 63, 65, 66, 69, 72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80, + 80, 81, 80, 79, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, + 70, 73, 74, 76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83, + 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, + 82, 84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56, + 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87, 88, + 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54, 55, 57, + 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 91, 91, + 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68, + 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95, 92, 94, 92, 93, 92, + 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, + 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95, 95, 95, 95, 95, 93, 67, 64, + 62, 61, 59, 
58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, + 93, 94, 95, 97, 97, 98, 99, 97, 97, 97, 96, 98, 68, 65, 63, 62, 60, 59, + 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, + 99, 98, 100, 100, 100, 99, 100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62, + 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101, + 100, 102, 102, 103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, + 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103, + 102, 104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, + 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, 104, + 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, + 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, 106, + 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, + 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, 108, + 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80, + 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, 110, 75, + 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84, + 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113, + /* Size 4x8 */ + 31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57, 52, + 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102, + /* Size 8x4 */ + 31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52, 64, + 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102, + /* Size 8x16 */ + 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35, 38, + 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46, 47, 48, + 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46, 47, 54, 56, + 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51, 57, 60, 66, 71, + 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60, 64, 70, 75, 79, 85, + 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95, + 97, 98, 100, 98, 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96, + 101, 103, 105, + /* Size 16x8 */ + 32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33, 41, + 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, 49, 48, 53, 54, + 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54, 61, 66, 70, + 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65, 74, 79, 79, 78, + 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, + 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, + 81, 89, 98, 101, 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, + 89, 98, 105, + /* Size 16x32 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 32, 33, + 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38, 41, 45, 46, 46, + 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, + 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, + 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, + 64, 65, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, + 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 49, 47, + 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, + 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46, + 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, + 71, 72, 73, 73, 72, 72, 
71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, + 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, + 75, 76, 77, 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, + 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, + 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79, + 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58, 57, + 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, + 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59, 57, 57, 60, + 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96, + 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, + 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, + 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73, + 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103, + 105, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, + 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, 71, + 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, + 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109, + /* Size 32x16 */ + 32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31, 31, + 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32, 38, 40, + 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38, 41, 46, 45, + 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44, 47, 46, 46, 47, + 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, + 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, + 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, + 60, 60, 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61, + 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, 48, 46, + 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, 49, 45, 45, 47, + 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59, + 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64, + 68, 70, 71, 73, 72, 72, 70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, + 75, 77, 75, 73, 74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, + 78, 76, 74, 75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, + 78, 75, 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79, + 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56, + 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55, 56, + 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57, 61, 68, + 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62, 69, 72, 76, + 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, + 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, + 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, + 100, 98, 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, + 99, 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104, + 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, 73, + 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, 74, 67, + 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68, + 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109, + /* Size 4x16 */ + 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 49, 45, + 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55, 54, 54, + 59, 62, 68, 73, 77, 82, 86, 
88, 86, 87, 83, 86, 69, 64, 61, 59, 62, 65, + 68, 73, 78, 84, 89, 93, 98, 100, 102, 103, + /* Size 16x4 */ + 31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46, 53, + 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63, 77, 78, + 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, 62, 67, 86, 98, 64, 66, + 87, 100, 65, 68, 83, 102, 67, 70, 86, 103, + /* Size 8x32 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37, 38, 38, + 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, + 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46, 47, 47, 48, 50, + 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, + 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, + 71, 71, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, + 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 63, 60, + 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, + 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59, + 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, + 97, 99, 98, 99, 100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61, + 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, + 101, 103, 103, 105, 105, 105, + /* Size 32x8 */ + 32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30, 38, + 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41, 47, 46, + 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48, 47, 51, 55, + 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54, 57, 60, 61, 61, + 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, + 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, + 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, + 78, 74, 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79, + 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, 63, 55, + 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, 66, 58, 62, 72, + 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94, + 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98, + 101, 71, 64, 65, 73, 82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103, + 73, 66, 65, 72, 81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75, + 68, 65, 71, 78, 87, 96, 105 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169, + /* Size 8x8 */ + 32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35, 37, + 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57, 54, 64, + 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144, 85, 79, 87, + 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144, 168, 184, + /* Size 16x16 */ + 32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104, 31, 32, + 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 92, 98, 31, 32, 33, 34, + 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32, 34, 35, 37, 40, + 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36, 37, 42, 47, 50, 53, + 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, + 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50, 58, 63, 68, 74, 79, 84, 91, + 96, 98, 102, 104, 49, 46, 46, 46, 53, 62, 68, 73, 81, 87, 92, 99, 103, + 107, 109, 112, 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 
114, + 118, 117, 121, 65, 60, 59, 58, 65, 73, 79, 87, 97, 105, 111, 120, 125, + 125, 126, 130, 71, 66, 64, 63, 70, 78, 84, 92, 102, 111, 117, 127, 133, + 134, 136, 141, 81, 75, 73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143, + 145, 148, 152, 87, 81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150, + 156, 160, 163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156, + 163, 169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148, + 160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130, 141, + 152, 163, 177, 184, 191, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58, 59, + 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31, 32, 32, + 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57, 62, 66, 68, + 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60, 64, 66, 73, 75, 76, 81, + 83, 86, 89, 92, 95, 98, 101, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 37, + 38, 41, 44, 45, 49, 53, 54, 59, 63, 65, 72, 74, 75, 79, 81, 84, 86, 89, + 91, 94, 97, 31, 32, 32, 32, 33, 33, 34, 35, 36, 36, 39, 40, 42, 45, 46, + 50, 53, 54, 59, 63, 64, 71, 73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32, + 32, 32, 32, 33, 34, 34, 36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58, + 62, 63, 70, 72, 73, 77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34, + 34, 35, 37, 37, 38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71, + 72, 75, 78, 80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41, + 42, 45, 46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82, + 83, 85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50, + 53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90, 92, + 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57, 60, 63, + 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99, 39, 38, 38, + 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, + 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39, 39, 38, 40, 40, 41, + 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75, 78, 80, 86, 87, 88, 91, + 93, 96, 97, 97, 99, 102, 105, 44, 42, 41, 41, 42, 42, 42, 47, 50, 54, + 58, 59, 63, 66, 68, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 98, 98, 99, + 102, 104, 104, 105, 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66, + 70, 71, 76, 79, 80, 85, 88, 90, 96, 97, 98, 101, 100, 102, 105, 105, + 105, 109, 112, 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, + 73, 77, 81, 82, 87, 90, 92, 98, 99, 100, 103, 106, 107, 106, 109, 112, + 112, 112, 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, + 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 110, 109, 113, 114, 113, + 116, 120, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, + 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121, + 121, 120, 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, + 87, 91, 93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122, + 125, 129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87, + 92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126, 130, + 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90, + 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131, 130, 134, + 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, + 92, 97, 102, 104, 111, 115, 117, 125, 127, 128, 133, 136, 134, 139, 136, + 139, 141, 140, 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96, + 98, 104, 109, 111, 118, 123, 125, 134, 136, 137, 
142, 138, 143, 140, + 144, 144, 144, 149, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, + 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 139, 143, 148, 145, 148, + 148, 150, 152, 149, 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, + 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153, + 153, 154, 155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, + 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151, + 156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, + 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, + 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83, 82, 80, 82, 85, + 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, + 156, 157, 163, 164, 169, 172, 177, 172, 95, 90, 89, 86, 85, 85, 83, 83, + 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, + 153, 159, 162, 164, 169, 170, 176, 179, 185, 98, 93, 92, 89, 88, 87, 86, + 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, + 148, 154, 160, 166, 169, 170, 176, 177, 184, 186, 101, 96, 95, 91, 91, + 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, + 144, 150, 155, 160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94, + 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135, + 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107, 101, + 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, + 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199, + /* Size 4x8 */ + 32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62, 58, + 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165, + /* Size 8x4 */ + 32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101, 54, 69, + 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127, 165, + /* Size 8x16 */ + 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, 32, 32, + 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36, 35, 36, 38, + 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41, 42, 42, 50, 58, + 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54, 53, 52, 59, 68, 74, + 81, 90, 97, 102, 110, 114, 117, 121, 119, 79, 73, 71, 69, 75, 84, 90, + 97, 108, 118, 125, 135, 140, 133, 141, 137, 88, 81, 78, 76, 81, 88, 97, + 104, 111, 123, 135, 145, 148, 153, 160, 158, 93, 88, 84, 82, 84, 90, 97, + 105, 113, 122, 131, 141, 151, 163, 169, 175, + /* Size 16x8 */ + 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32, 33, + 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, + 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, 74, 90, + 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63, 74, 90, 108, 111, + 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64, 73, 84, 102, 125, 135, + 131, 81, 72, 80, 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, + 151, 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, 141, + 160, 169, 103, 94, 92, 103, 119, 137, 158, 175, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, + 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65, + 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32, 32, 33, 34, 34, + 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, + 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40, + 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 
76, 78, 81, 83, 85, + 88, 90, 93, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, + 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 39, + 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73, + 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, 44, 42, 41, 41, + 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, + 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49, + 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, + 106, 110, 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52, + 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, + 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59, 58, 58, 63, + 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121, + 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73, 72, 71, 70, 69, 73, + 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, + 140, 135, 133, 137, 141, 139, 137, 135, 81, 77, 75, 74, 72, 71, 70, 75, + 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139, + 143, 146, 150, 148, 144, 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79, + 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, + 147, 148, 153, 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78, + 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, + 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84, + 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, + 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91, 91, 87, + 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128, + 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187, + /* Size 32x16 */ + 32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31, 32, + 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32, 32, 32, + 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, + 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, 42, 50, + 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37, 40, 42, 49, 53, 58, + 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, + 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79, + 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, 81, 83, 84, 84, + 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, 78, 80, 85, 85, 86, 89, 39, 37, + 39, 40, 50, 54, 58, 65, 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, + 51, 55, 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58, + 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75, + 79, 85, 95, 97, 99, 101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81, + 86, 97, 99, 104, 102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, + 103, 105, 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97, + 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98, + 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97, 105, + 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96, 100, 109, + 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84, 97, 102, 111, + 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, + 133, 135, 137, 136, 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, + 135, 137, 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121, + 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, 114, 125, + 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, 99, 108, 
113, 129, + 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117, + 128, 133, 150, 153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105, + 120, 125, 137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103, + 108, 121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98, + 103, 111, 120, 127, 139, 146, 161, 165, 175, 179, 103, 94, 94, 90, 92, + 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, + 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187, + /* Size 4x16 */ + 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39, 38, + 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65, 60, 59, 58, + 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, 90, 84, 81, 78, + 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170, + /* Size 16x4 */ + 31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 58, 78, 34, 47, + 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53, 68, 97, + 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, + 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170, + /* Size 8x32 */ + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, + 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32, 32, 32, + 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, + 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35, 34, 36, 37, 38, + 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, + 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, + 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, + 103, 103, 103, 104, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, + 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, + 121, 120, 119, 118, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, + 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, + 141, 139, 137, 135, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, + 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, + 155, 160, 161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, + 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, + 168, 169, 175, 175, 176, + /* Size 32x8 */ + 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31, 32, + 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33, 36, 42, + 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, + 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, 81, 84, + 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68, 84, 88, 90, 40, 40, + 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, + 79, 95, 99, 98, 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, + 103, 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, 91, + 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62, 71, 83, 100, + 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, 79, 71, 79, 90, 109, + 133, 137, 136, 81, 72, 80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111, + 136, 147, 147, 87, 77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113, + 135, 153, 160, 92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, + 120, 137, 155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, + 103, 120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, + 93, 104, 118, 135, 155, 176 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101, + /* Size 8x8 */ + 31, 36, 47, 48, 52, 60, 64, 67, 
36, 43, 47, 46, 49, 55, 59, 63, 47, 47, + 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49, 55, 65, + 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63, 71, 81, 89, + 97, 99, 67, 63, 64, 71, 79, 89, 99, 104, + /* Size 16x16 */ + 32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30, 31, + 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35, 39, 43, + 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 36, 39, 43, 47, 47, 46, + 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47, 50, 51, 51, 51, + 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51, 54, 55, 56, 58, 60, + 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55, 58, 60, 62, 63, 65, 68, + 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60, 62, 65, 67, 69, 72, 73, 74, + 73, 73, 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 77, 77, + 57, 53, 51, 50, 54, 60, 63, 67, 73, 76, 79, 82, 84, 83, 82, 82, 60, 55, + 53, 52, 56, 61, 65, 69, 75, 79, 82, 86, 88, 87, 86, 87, 64, 59, 57, 55, + 59, 64, 68, 72, 78, 82, 86, 90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65, + 69, 73, 79, 84, 88, 93, 95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74, + 79, 83, 87, 92, 96, 99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77, + 82, 86, 91, 96, 100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82, + 87, 92, 96, 101, 104, 106, + /* Size 32x32 */ + 32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54, 55, + 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31, 31, 31, + 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54, 56, 57, 60, + 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31, 35, 37, 39, 42, + 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55, 58, 59, 60, 61, 63, + 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37, 40, 42, 44, 46, 45, 45, + 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58, 58, 60, 61, 62, 63, 63, 64, + 65, 66, 33, 34, 35, 35, 39, 41, 43, 45, 46, 47, 46, 46, 45, 46, 47, 47, + 49, 49, 51, 53, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 35, 36, + 37, 37, 41, 43, 45, 46, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, + 53, 55, 56, 56, 58, 59, 60, 61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45, + 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55, + 57, 58, 58, 59, 61, 62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50, + 49, 49, 49, 50, 50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59, + 60, 61, 61, 62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51, + 51, 52, 53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62, + 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, + 56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46, 45, + 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 63, + 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46, 46, 46, 49, + 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65, 65, 65, 66, 67, + 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, + 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68, 69, 70, 69, 69, 69, 70, + 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, 62, 63, + 64, 65, 66, 68, 68, 70, 71, 71, 72, 71, 71, 72, 71, 71, 71, 72, 51, 48, + 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, 64, 65, 66, 67, 69, + 69, 71, 72, 72, 73, 74, 74, 72, 73, 74, 73, 73, 52, 50, 48, 47, 47, 47, + 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, + 77, 76, 75, 76, 76, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, + 58, 59, 62, 64, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78, + 
77, 78, 77, 77, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, + 66, 68, 70, 71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, + 76, 78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53, + 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81, 84, + 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53, 52, 55, + 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86, 86, 88, 88, + 87, 88, 86, 87, 87, 85, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65, + 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92, 89, 91, 89, 90, 89, + 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, + 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92, 92, 91, 91, 92, 90, 65, 61, + 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, + 86, 90, 91, 91, 93, 94, 95, 94, 94, 94, 93, 94, 67, 63, 61, 60, 59, 58, + 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, + 95, 95, 96, 97, 96, 95, 96, 94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, + 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98, + 99, 99, 97, 99, 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, + 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101, + 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, + 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67, + 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, + 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67, 67, 64, + 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89, + 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68, 68, 65, 65, 64, + 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93, + 96, 97, 101, 102, 104, 104, 106, 106, 73, 69, 69, 66, 66, 64, 64, 62, + 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, + 99, 104, 104, 106, 106, 108, + /* Size 4x8 */ + 31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54, 50, + 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99, + /* Size 8x4 */ + 31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50, 59, + 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99, + /* Size 8x16 */ + 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34, 36, + 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46, 47, 47, + 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46, 45, 51, 56, + 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48, 53, 58, 62, 65, + 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59, 64, 67, 71, 77, 82, + 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93, + 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98, + 100, 102, + /* Size 16x8 */ + 32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33, 40, + 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46, 51, 51, + 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53, 58, 62, 67, + 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62, 70, 77, 77, 76, + 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, + 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, + 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, + 95, 102, + /* Size 16x32 */ + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, + 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 
71, 72, 73, 31, 31, 32, 32, + 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57, + 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36, 40, 42, 44, 45, + 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, + 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46, + 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61, + 62, 63, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, + 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 48, 47, + 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, + 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45, + 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, + 69, 70, 71, 71, 70, 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, + 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, + 73, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, + 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74, + 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58, 57, + 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, + 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57, 56, 55, 58, + 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93, + 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, + 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, + 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75, + 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, 69, + 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, + 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, 69, 66, 66, 63, + 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, + 87, 91, 91, 95, 96, 101, 101, 103, 103, 105, + /* Size 32x16 */ + 32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31, 31, + 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, 31, 32, 36, 39, + 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, 30, 32, 36, 40, 46, 45, + 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43, 47, 46, 46, 47, + 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, + 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, + 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, + 59, 59, 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60, + 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, 48, 46, + 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, 48, 45, 46, 46, + 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56, + 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63, + 65, 66, 70, 71, 70, 71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, + 71, 72, 73, 71, 72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, + 76, 75, 73, 73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, + 76, 74, 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78, + 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54, + 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53, 52, + 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55, 60, 64, + 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61, 64, 68, 75, + 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65, 
68, 75, 78, 83, + 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, + 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, + 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96, + 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, 70, + 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, 71, 65, + 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, 72, 65, 65, + 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, 73, 66, 66, 63, + 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105, + /* Size 4x16 */ + 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, 48, 46, + 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53, 51, 50, + 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60, 58, 61, 64, + 67, 71, 77, 82, 87, 91, 95, 97, 99, 100, + /* Size 16x4 */ + 31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44, 51, + 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58, 73, 77, + 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84, 95, 62, 64, + 84, 97, 64, 66, 81, 99, 65, 68, 83, 100, + /* Size 8x32 */ + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, + 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35, 36, 36, + 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, + 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46, 47, 47, 47, 50, + 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, + 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, + 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, + 69, 69, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, + 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 63, 60, + 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, + 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58, + 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, + 94, 96, 96, 96, 97, 97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, + 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, + 100, 102, 102, 101, + /* Size 32x8 */ + 32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31, 36, + 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40, 47, 46, + 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47, 45, 48, 54, + 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51, 53, 59, 60, 61, + 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, + 53, 56, 59, 65, 66, 65, 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, + 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, + 76, 73, 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77, + 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, 60, 53, + 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, 64, 57, 61, 68, + 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91, + 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98, + 69, 62, 63, 71, 80, 88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71, + 64, 63, 70, 78, 88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, + 63, 69, 76, 84, 93, 101 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156, + /* Size 8x8 */ + 32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35, 36, + 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 
92, 98, 53, 51, 60, 73, + 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78, 74, 80, 92, + 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168, + /* Size 16x16 */ + 32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31, 32, + 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32, 33, 33, + 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33, 35, 37, 38, + 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37, 39, 42, 46, 50, + 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42, 48, 52, 56, 60, 64, + 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52, 57, 62, 67, 71, 75, 83, + 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62, 69, 75, 79, 84, 91, 97, 100, + 102, 104, 54, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 106, 110, + 109, 112, 59, 55, 54, 53, 58, 64, 71, 79, 87, 92, 98, 106, 112, 117, + 117, 121, 65, 61, 59, 58, 63, 68, 75, 84, 92, 98, 105, 114, 120, 125, + 126, 130, 74, 69, 67, 65, 70, 75, 83, 91, 100, 106, 114, 123, 131, 135, + 137, 140, 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, + 148, 150, 87, 81, 78, 75, 80, 85, 92, 100, 110, 117, 125, 135, 144, 150, + 155, 162, 92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155, + 162, 168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162, + 168, 174, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54, 56, + 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53, 56, 61, 62, 68, + 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60, 61, 67, 69, 74, 76, 77, + 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, + 38, 41, 44, 44, 49, 51, 54, 58, 59, 65, 68, 72, 74, 75, 79, 81, 84, 86, + 88, 90, 31, 32, 32, 32, 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45, + 49, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 80, 83, 85, 88, 90, 31, 32, + 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58, + 59, 64, 67, 71, 73, 74, 78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34, + 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69, + 71, 72, 75, 78, 80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, + 40, 42, 43, 44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78, + 80, 82, 84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, + 50, 51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86, + 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59, 60, + 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35, 35, 34, + 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64, 67, 68, 73, + 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37, 39, 39, 40, 42, + 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73, 78, 80, 84, 86, 86, + 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, + 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83, 86, 88, 89, 92, 93, 95, 97, + 97, 98, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 66, 67, + 71, 73, 75, 79, 79, 84, 86, 90, 92, 92, 96, 98, 98, 98, 101, 104, 47, + 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, + 83, 84, 89, 91, 95, 97, 97, 100, 99, 102, 105, 104, 104, 48, 46, 45, 44, + 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90, + 93, 96, 98, 99, 102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50, + 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, + 104, 106, 106, 110, 108, 
109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51, + 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106, + 108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53, 55, + 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106, 110, 112, + 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58, 58, 57, 59, 62, + 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103, 109, 112, 116, 118, + 119, 122, 121, 125, 123, 125, 128, 65, 62, 61, 59, 59, 59, 58, 60, 63, + 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, 111, 114, 118, 120, + 121, 125, 129, 126, 129, 130, 129, 71, 68, 67, 65, 64, 64, 63, 65, 68, + 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, 120, 125, 127, + 128, 133, 130, 134, 133, 133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70, + 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131, + 131, 135, 137, 137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74, + 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 136, + 137, 142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76, + 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, + 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72, 73, 76, + 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, + 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79, 78, 78, 75, 77, 80, + 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, + 144, 145, 150, 151, 155, 158, 162, 158, 90, 85, 84, 81, 80, 80, 78, 78, + 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, + 147, 150, 151, 156, 156, 161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80, + 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, + 142, 148, 152, 155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83, + 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, + 143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88, 86, 86, + 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, + 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100, 95, 95, 90, 90, + 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, + 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181, + /* Size 4x8 */ + 32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59, 57, + 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154, + /* Size 8x4 */ + 32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51, 61, + 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154, + /* Size 8x16 */ + 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31, 32, + 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34, 35, 37, + 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42, 42, 48, 54, + 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49, 54, 60, 67, 75, + 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59, 58, 63, 68, 76, 84, 92, + 98, 105, 113, 120, 125, 132, 128, 82, 76, 73, 71, 76, 80, 88, 97, 106, + 112, 120, 131, 139, 144, 150, 147, 90, 85, 81, 79, 81, 87, 93, 101, 108, + 116, 124, 134, 142, 153, 157, 163, + /* Size 16x8 */ + 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31, 33, + 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35, 41, 48, + 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49, 60, 67, 76, + 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, + 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, + 124, 74, 67, 73, 86, 100, 113, 131, 134, 82, 
73, 79, 92, 105, 120, 139, + 142, 87, 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, + 157, 97, 88, 86, 97, 111, 128, 147, 163, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, + 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65, + 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32, 33, 33, 34, 34, + 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, + 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, + 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82, + 85, 87, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, + 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 36, 35, + 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, + 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42, + 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, + 92, 92, 96, 97, 97, 97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, + 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, + 101, 104, 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, + 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, + 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78, + 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, + 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, + 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, + 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90, + 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, + 136, 140, 144, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, + 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, + 151, 147, 144, 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, + 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, + 153, 154, 159, 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, + 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, + 157, 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91, + 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, + 161, 162, 166, 167, 173, + /* Size 32x16 */ + 32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31, 32, + 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32, 32, 32, + 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, + 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, 42, 44, + 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36, 36, 42, 45, 50, 57, + 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, + 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76, + 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, 76, 81, 81, 80, + 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, 67, 77, 79, 82, 83, 86, 36, 34, + 36, 38, 46, 48, 54, 56, 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, + 48, 50, 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51, + 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66, + 71, 78, 79, 90, 92, 97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, + 84, 95, 97, 98, 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96, + 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106, + 107, 108, 105, 55, 51, 
51, 51, 59, 61, 72, 77, 84, 92, 94, 106, 108, + 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98, 110, 112, + 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100, 103, 116, 118, + 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, + 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, + 129, 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, 131, + 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115, 118, 133, 136, + 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, 105, 117, 120, 136, 139, + 145, 142, 140, 82, 75, 74, 72, 79, 81, 92, 97, 106, 117, 121, 136, 139, + 148, 150, 149, 87, 79, 78, 76, 83, 85, 96, 100, 110, 120, 125, 141, 144, + 148, 153, 150, 89, 82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, + 153, 157, 161, 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, + 153, 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151, + 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147, + 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144, 144, + 163, 163, 173, + /* Size 4x16 */ + 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36, 35, + 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58, 57, 56, + 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88, 82, 79, 76, + 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159, + /* Size 16x4 */ + 31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34, 42, + 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, + 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145, + 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, + 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, + 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, + 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34, 35, 36, 37, 39, + 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, + 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, + 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, + 97, 97, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, + 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, + 110, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, + 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, + 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, + 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, + 144, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, + 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, + 163, 163, + /* Size 32x8 */ + 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31, 32, + 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33, 35, 42, + 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, + 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, 76, 81, + 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60, 68, 80, 87, 39, 39, + 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, + 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85, + 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, 84, 94, 108, + 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, 65, 78, 91, 103, 118, + 119, 65, 59, 66, 79, 92, 105, 120, 124, 71, 64, 
71, 84, 97, 111, 127, + 129, 74, 67, 73, 86, 100, 113, 131, 134, 79, 71, 77, 90, 104, 118, 136, + 139, 82, 73, 79, 92, 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, + 150, 87, 78, 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, + 157, 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151, + 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144, + 163 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97, + /* Size 8x8 */ + 31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46, 47, + 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48, 54, 62, + 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61, 68, 77, 86, + 91, 95, 65, 61, 62, 68, 76, 86, 95, 100, + /* Size 16x16 */ + 32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31, 31, + 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34, 37, 42, + 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42, 47, 47, 48, + 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47, 48, 50, 49, 50, + 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50, 53, 53, 54, 54, 55, + 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53, 55, 57, 59, 60, 61, 64, + 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57, 61, 63, 64, 66, 69, 70, 72, + 71, 71, 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 75, 75, + 54, 51, 49, 48, 52, 55, 60, 64, 68, 71, 73, 76, 79, 80, 79, 79, 57, 53, + 51, 50, 53, 56, 61, 66, 70, 73, 76, 80, 82, 84, 83, 84, 61, 57, 55, 53, + 56, 59, 64, 69, 73, 76, 80, 84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61, + 66, 70, 75, 79, 82, 87, 91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72, + 77, 80, 84, 89, 93, 95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79, + 83, 88, 93, 96, 99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88, + 93, 97, 100, 102, + /* Size 32x32 */ + 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31, 31, 31, + 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52, 54, 54, 57, + 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31, 34, 35, 39, 40, + 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 56, 57, 59, 60, 60, + 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35, 40, 41, 42, 45, 46, 45, + 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55, 57, 58, 58, 60, 61, 62, 62, + 63, 64, 33, 34, 34, 34, 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46, + 47, 48, 49, 51, 51, 53, 55, 56, 57, 57, 59, 60, 61, 62, 63, 64, 33, 34, + 35, 35, 38, 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, + 51, 53, 54, 56, 57, 57, 59, 60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43, + 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54, + 55, 55, 57, 58, 59, 60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, + 49, 48, 47, 47, 47, 47, 48, 49, 49, 51, 51, 53, 54, 55, 56, 56, 58, 58, + 58, 59, 60, 60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, + 50, 50, 50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54, + 55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47, 46, + 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, + 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46, 46, 46, 48, + 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64, + 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, + 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 
65, 66, 66, 67, 67, 67, 68, + 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 59, 60, + 61, 62, 62, 63, 63, 65, 66, 67, 68, 68, 69, 70, 69, 68, 69, 70, 50, 48, + 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 64, 66, + 66, 68, 69, 70, 70, 71, 72, 70, 71, 72, 71, 70, 50, 48, 47, 46, 46, 46, + 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71, + 71, 71, 73, 74, 73, 72, 73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, + 54, 57, 59, 61, 63, 63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75, + 75, 76, 75, 74, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, + 64, 64, 67, 68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78, + 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, + 71, 73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51, + 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76, 78, + 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51, + 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80, 82, 82, 83, + 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61, + 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86, 86, 88, 86, 87, 86, + 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, + 73, 74, 76, 79, 80, 83, 84, 86, 87, 88, 89, 89, 88, 88, 88, 86, 63, 60, + 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, + 82, 85, 86, 89, 90, 90, 92, 91, 91, 90, 89, 91, 64, 61, 60, 58, 57, 57, + 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, + 91, 91, 93, 93, 93, 92, 93, 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, + 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94, + 95, 95, 93, 95, 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, + 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95, + 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, + 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62, + 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87, + 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62, 61, 60, + 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, + 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, + 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, + 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, + 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101, + 101, 104, + /* Size 4x8 */ + 31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53, 50, + 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95, + /* Size 8x4 */ + 31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49, 55, + 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95, + /* Size 8x16 */ + 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33, 34, + 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45, 46, 47, + 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45, 45, 49, 53, + 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47, 50, 54, 59, 63, + 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53, 57, 61, 66, 71, 73, + 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87, + 91, 93, 94, 92, 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95, + 97, 98, + /* Size 16x8 */ + 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33, 37, + 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44, 
49, 49, + 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 57, 59, 61, + 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, + 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, + 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, + 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, + 92, 98, + /* Size 16x32 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31, 32, 32, + 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54, + 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35, 37, 38, 43, 43, + 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, + 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59, + 60, 61, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, + 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 48, 47, + 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, + 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46, + 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, + 68, 68, 69, 69, 68, 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, + 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70, + 71, 72, 73, 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, + 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, + 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71, + 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53, 52, + 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, + 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56, 56, 54, 55, + 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, + 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, + 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, + 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73, + 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, 68, 64, + 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, + 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, 68, 65, 65, 62, 62, 60, + 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, + 89, 93, 93, 97, 98, 99, 99, 102, + /* Size 32x16 */ + 32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, 31, + 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, 34, 39, + 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, 40, 44, 46, + 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, 46, 47, 45, 46, + 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, 47, 46, 46, 47, 50, + 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, 45, 46, 47, 49, 50, 54, + 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, 47, 48, 50, 51, 55, 56, 57, + 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, 50, 53, 53, 57, 58, 60, 60, 59, + 47, 46, 46, 48, 51, 52, 53, 53, 53, 55, 56, 60, 61, 61, 61, 62, 49, 46, + 47, 48, 52, 53, 53, 54, 54, 56, 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, + 51, 53, 56, 56, 57, 59, 60, 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, + 57, 57, 59, 61, 61, 65, 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, + 61, 63, 64, 67, 68, 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, + 66, 70, 71, 70, 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 
64, 66, 67, 71, + 71, 73, 71, 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, + 74, 72, 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, + 54, 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51, + 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, 50, + 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52, 57, 58, + 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, 59, 66, 69, + 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, 67, 70, 75, 80, + 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, 71, 75, 81, 83, 90, + 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, 75, 81, 83, 90, 91, 94, + 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, 82, 84, 92, 93, 94, 95, 93, + 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, 85, 90, 93, 96, 97, 97, 68, 62, + 61, 59, 61, 64, 68, 71, 77, 79, 86, 88, 94, 96, 97, 98, 69, 63, 62, 59, + 61, 65, 68, 72, 76, 80, 85, 88, 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, + 67, 73, 75, 81, 83, 89, 92, 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, + 74, 82, 82, 90, 90, 98, 98, 102, + /* Size 4x16 */ + 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48, 46, + 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52, 50, 49, + 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60, 57, 60, 63, + 66, 70, 75, 80, 85, 89, 93, 94, 96, 97, + /* Size 16x4 */ + 31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43, 50, + 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54, 70, 75, + 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81, 93, 60, 63, + 82, 94, 62, 64, 79, 96, 63, 66, 81, 97, + /* Size 8x32 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34, 34, 35, + 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44, 46, 46, 47, 48, + 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, + 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, + 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, + 67, 67, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, + 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 57, 54, + 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, + 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57, + 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, + 91, 91, 93, 93, 94, 94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, + 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, + 97, 99, 98, 98, + /* Size 32x8 */ + 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31, 34, + 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37, 46, 45, + 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47, 45, 47, 50, + 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49, 50, 53, 58, 60, + 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, + 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, + 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, + 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75, + 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, 57, 51, + 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, 61, 55, 58, 66, + 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83, 
+ 91, 91, 64, 58, 60, 68, 75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95, + 67, 60, 61, 69, 78, 85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, + 61, 68, 76, 85, 94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, + 74, 82, 90, 98 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140, + /* Size 8x8 */ + 32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34, 35, + 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48, 53, 65, + 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76, 71, 74, 86, + 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153, + /* Size 16x16 */ + 32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31, 32, + 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32, 32, 33, + 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33, 34, 36, 37, + 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36, 38, 42, 44, 46, + 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42, 48, 50, 54, 57, 60, + 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50, 54, 58, 61, 65, 69, 74, + 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58, 63, 67, 71, 75, 80, 86, 91, + 95, 97, 48, 46, 45, 45, 50, 57, 61, 67, 71, 76, 80, 86, 93, 98, 101, + 104, 54, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 112, + 59, 56, 54, 53, 57, 64, 69, 75, 80, 87, 92, 99, 106, 112, 116, 120, 66, + 63, 60, 59, 63, 69, 74, 80, 86, 93, 99, 107, 115, 121, 125, 129, 74, 70, + 67, 66, 69, 75, 80, 86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77, + 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81, + 77, 75, 78, 84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86, + 82, 80, 80, 85, 91, 97, 104, 112, 120, 129, 138, 148, 153, 159, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48, 52, + 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51, 56, 56, 62, + 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56, 61, 63, 67, 70, 75, + 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60, 65, 68, 72, 74, 75, 78, 80, + 82, 84, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, + 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 77, 80, 82, 84, 31, 32, + 32, 32, 33, 33, 33, 34, 35, 35, 36, 36, 39, 40, 42, 44, 45, 48, 50, 53, + 54, 59, 60, 64, 67, 71, 73, 74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33, + 34, 35, 36, 36, 37, 38, 40, 40, 42, 44, 45, 48, 49, 53, 53, 58, 59, 63, + 66, 70, 71, 72, 75, 78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37, + 38, 38, 40, 41, 42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72, + 74, 77, 79, 80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, + 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81, + 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53, + 54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35, 34, + 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63, 64, 68, + 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35, 36, 38, 38, + 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68, 69, 73, 75, 79, + 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51, + 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78, 80, 84, 85, 86, 89, 91, + 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, 59, 
61, + 63, 65, 67, 70, 70, 75, 76, 80, 82, 86, 87, 88, 91, 92, 94, 96, 44, 42, + 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63, 65, 67, 70, 71, 75, + 75, 79, 80, 84, 86, 90, 91, 92, 95, 97, 97, 97, 46, 44, 44, 42, 43, 44, + 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87, + 89, 93, 94, 95, 98, 98, 100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50, + 51, 57, 57, 61, 63, 67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98, + 99, 101, 104, 104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, + 64, 65, 70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106, + 106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, + 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, + 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75, + 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111, 112, 115, 114, 115, + 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80, + 85, 87, 91, 92, 98, 99, 103, 106, 110, 112, 113, 116, 119, 120, 119, 65, + 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, + 97, 98, 105, 106, 111, 114, 118, 120, 121, 124, 123, 123, 126, 66, 63, + 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, + 99, 106, 107, 112, 115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67, + 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, + 103, 111, 112, 117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70, + 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, + 106, 114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75, + 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110, + 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144, 81, 77, 77, + 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, + 112, 120, 121, 127, 130, 136, 137, 139, 142, 145, 148, 144, 83, 78, 78, + 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112, + 113, 121, 122, 128, 131, 137, 139, 140, 144, 148, 150, 155, 86, 82, 81, + 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115, + 116, 124, 125, 131, 135, 140, 142, 144, 147, 149, 153, 155, 89, 84, 84, + 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, + 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86, + 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, + 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88, + 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, + 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164, + /* Size 4x8 */ + 32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51, 50, + 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144, + /* Size 8x4 */ + 32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48, 59, + 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144, + /* Size 8x16 */ + 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31, 32, + 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, 33, 33, 34, 36, + 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38, 40, 44, 51, + 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48, 52, 58, 63, 69, + 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58, 62, 68, 73, 79, 85, + 92, 98, 106, 113, 120, 124, 119, 79, 74, 71, 69, 72, 78, 84, 90, 96, + 103, 110, 119, 128, 135, 140, 137, 87, 82, 79, 77, 78, 84, 89, 96, 103, + 111, 118, 126, 134, 143, 147, 151, + /* Size 16x8 */ + 32, 31, 33, 
40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, + 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34, 38, 44, + 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44, 54, 63, 73, + 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, + 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, + 60, 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81, + 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, 91, + 82, 80, 90, 103, 119, 137, 151, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, + 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60, + 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, + 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, + 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77, + 79, 81, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, + 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 36, 35, + 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, + 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39, + 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, + 81, 85, 86, 87, 90, 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, + 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, + 95, 98, 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, + 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, + 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, + 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114, + 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, + 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, + 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, + 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, 79, + 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, + 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, + 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, + 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, + 152, 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100, + 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, + 155, 160, + /* Size 32x16 */ + 32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31, 32, + 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32, 32, 32, + 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, + 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, 38, 41, + 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35, 36, 39, 42, 48, 50, + 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, + 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70, + 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, 72, 74, 78, 78, + 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, 63, 64, 73, 75, 80, 83, 36, 34, + 35, 38, 42, 48, 51, 54, 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, + 42, 48, 51, 54, 59, 60, 
68, 69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50, + 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59, + 65, 67, 75, 76, 85, 87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, + 79, 80, 90, 91, 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83, + 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98, + 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105, + 107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, + 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, + 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, 65, 60, + 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, 122, 123, 66, 61, + 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119, 121, 126, 123, 71, 65, + 65, 63, 67, 73, 79, 84, 94, 97, 111, 112, 125, 127, 131, 132, 74, 68, + 67, 66, 69, 75, 81, 86, 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, + 72, 70, 73, 79, 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, + 73, 71, 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75, + 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, 86, 78, + 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81, + 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83, + 82, 79, 80, 88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85, + 85, 81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160, + /* Size 4x16 */ + 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36, 35, + 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51, 49, 49, + 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76, 73, 71, 74, + 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, + /* Size 16x4 */ + 31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33, 41, + 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, + 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, 74, + 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, + 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, + 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33, 34, 35, 36, 36, + 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, + 75, 75, 78, 80, 80, 81, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, + 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, + 90, 90, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, + 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, + 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, + 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 79, + 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, + 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, + 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, + 152, + /* Size 32x8 */ + 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31, 32, + 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32, 34, 38, + 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, + 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, 72, 78, + 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58, 68, 78, 84, 36, 35, + 42, 51, 59, 
68, 79, 85, 39, 38, 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, + 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82, + 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, 78, 90, 101, + 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, 57, 69, 83, 97, 109, 113, + 58, 54, 57, 70, 84, 98, 110, 118, 65, 59, 62, 74, 89, 105, 118, 122, 66, + 60, 63, 75, 90, 106, 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, + 67, 69, 81, 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, + 73, 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86, + 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91, + 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91, + /* Size 8x8 */ + 31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42, 45, + 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47, 50, 58, + 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57, 65, 73, 82, + 89, 92, 64, 59, 60, 66, 74, 83, 92, 96, + /* Size 16x16 */ + 32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31, 31, + 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32, 34, 39, + 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39, 44, 46, 47, + 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46, 48, 50, 49, 48, + 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50, 53, 53, 53, 54, 54, + 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, + 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55, 58, 60, 61, 62, 64, 66, 68, + 69, 69, 50, 48, 46, 46, 49, 54, 56, 60, 61, 63, 65, 67, 69, 71, 72, 72, + 52, 49, 47, 47, 49, 54, 57, 61, 63, 66, 68, 71, 73, 75, 76, 77, 54, 51, + 49, 48, 51, 55, 58, 62, 65, 68, 71, 74, 76, 78, 80, 81, 57, 54, 52, 51, + 53, 57, 60, 64, 67, 71, 74, 77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59, + 62, 66, 69, 73, 76, 80, 84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68, + 71, 75, 78, 83, 87, 90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76, + 80, 84, 89, 92, 94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85, + 89, 94, 96, 98, + /* Size 32x32 */ + 32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50, 52, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31, 31, 31, + 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50, 52, 52, 54, + 55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31, 32, 35, 37, 39, + 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51, 54, 54, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35, 38, 40, 42, 42, 46, 46, + 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52, 54, 55, 57, 58, 58, 60, 61, + 61, 62, 31, 32, 32, 33, 34, 37, 39, 41, 43, 43, 46, 46, 46, 45, 45, 46, + 46, 47, 47, 49, 49, 51, 52, 54, 55, 57, 57, 58, 59, 60, 61, 62, 33, 34, + 35, 35, 37, 39, 41, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 47, 49, + 49, 51, 51, 53, 54, 56, 57, 57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41, + 44, 46, 46, 46, 47, 47, 46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52, + 53, 55, 56, 56, 57, 58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47, + 48, 47, 46, 46, 45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55, + 56, 57, 58, 58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49, + 48, 49, 49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59, + 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, + 50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47, 46, + 46, 47, 47, 48, 50, 50, 
53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, + 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46, 47, 47, 47, + 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57, 57, 59, 59, 61, + 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, + 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 62, 63, 64, 64, 65, 66, + 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, + 57, 58, 58, 59, 60, 61, 61, 63, 63, 65, 65, 65, 66, 66, 67, 68, 49, 47, + 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, 59, 60, 61, 61, 62, + 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 69, 68, 50, 47, 47, 45, 46, 46, + 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67, + 68, 69, 69, 70, 70, 70, 71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, + 54, 54, 56, 57, 60, 60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71, + 72, 73, 72, 71, 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, + 61, 62, 63, 65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75, + 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, + 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49, + 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70, 73, + 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49, 48, 48, + 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78, + 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57, + 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 83, + 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, + 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83, 83, 84, 85, 85, 83, 60, 57, + 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75, + 75, 79, 79, 82, 83, 85, 86, 86, 87, 87, 86, 87, 61, 58, 57, 55, 55, 54, + 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, + 84, 86, 87, 88, 89, 89, 89, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, + 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90, + 91, 92, 90, 91, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, + 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91, + 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, + 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60, + 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84, + 84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58, 57, + 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92, + 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64, + 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96, + 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71, + 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99, + /* Size 4x8 */ + 31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50, 47, + 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93, + /* Size 8x4 */ + 31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47, 54, + 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93, + /* Size 8x16 */ + 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32, 33, + 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41, 43, 46, + 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46, 46, 49, 53, + 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47, 49, 54, 57, 61, + 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52, 57, 60, 64, 67, 71, + 73, 77, 80, 
82, 84, 81, 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82, + 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92, + 94, 95, + /* Size 16x8 */ + 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31, 35, + 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43, 48, 49, + 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49, 54, 57, 60, + 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57, 63, 67, 71, 73, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, + 53, 61, 69, 77, 82, 85, 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, + 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, + 89, 95, + /* Size 16x32 */ + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31, 31, 32, + 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52, + 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33, 35, 37, 39, 41, + 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, + 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48, + 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58, + 58, 59, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, + 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 48, 47, + 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55, + 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46, + 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, + 63, 64, 65, 65, 66, 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, + 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, + 69, 70, 71, 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, + 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, + 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66, + 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54, 52, + 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, + 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52, 52, 51, 50, + 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82, + 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, + 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, + 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, + 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, 67, 63, + 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, + 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, 67, 64, 64, 61, 61, 60, + 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, + 86, 90, 90, 95, 95, 96, 96, 98, + /* Size 32x16 */ + 32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, 31, + 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, 33, 38, + 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, 40, 42, 46, + 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, 43, 46, 46, 45, + 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, 47, 46, 46, 47, 47, + 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, 46, 45, 47, 47, 50, 51, + 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46, 45, 46, 47, 50, 50, 54, 55, + 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, 49, 50, 52, 53, 57, 57, 59, 58, + 42, 43, 43, 47, 48, 50, 49, 49, 50, 50, 53, 54, 57, 58, 60, 61, 49, 46, + 47, 48, 50, 53, 53, 53, 54, 54, 57, 57, 60, 61, 62, 
61, 49, 46, 47, 48, + 50, 53, 53, 54, 54, 55, 57, 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, + 54, 56, 57, 57, 60, 60, 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, + 58, 58, 61, 61, 65, 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, + 64, 64, 67, 68, 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, + 69, 69, 70, 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, + 73, 71, 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, + 52, 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50, + 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, 49, + 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, 52, 56, + 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, 57, 61, 64, + 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, 62, 65, 71, 72, + 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, 66, 72, 73, 80, 81, + 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, 73, 75, 82, 82, 89, 90, + 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, 75, 82, 83, 89, 90, 92, 90, + 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, 83, 83, 90, 91, 94, 95, 66, 60, + 59, 57, 59, 62, 66, 69, 75, 76, 84, 85, 91, 92, 94, 95, 67, 61, 60, 58, + 59, 63, 66, 70, 74, 77, 82, 85, 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, + 65, 71, 72, 78, 81, 86, 89, 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, + 71, 79, 79, 87, 87, 95, 95, 98, + /* Size 4x16 */ + 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48, 47, + 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49, 48, 47, + 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57, 56, 57, 61, + 64, 68, 71, 75, 78, 83, 87, 90, 92, 94, + /* Size 16x4 */ + 31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, 42, 49, + 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54, 64, 71, + 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73, 87, 58, 61, + 75, 90, 60, 62, 76, 92, 62, 64, 78, 94, + /* Size 8x32 */ + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33, 33, 33, + 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, + 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42, 43, 44, 46, 47, + 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, + 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, + 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, + 65, 65, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, + 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 57, 54, + 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, + 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56, + 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, + 86, 89, 89, 90, 91, 91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, + 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, + 94, 96, 95, 95, + /* Size 32x8 */ + 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31, 33, + 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35, 43, 46, + 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46, 46, 47, 50, + 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49, 49, 52, 57, 59, + 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, + 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, + 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 
65, + 69, 70, 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, 54, 50, + 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83, 57, 52, 53, 61, + 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80, + 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92, + 64, 58, 58, 65, 74, 83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, + 59, 66, 74, 82, 91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, + 71, 79, 87, 95 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134, + /* Size 8x8 */ + 32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34, 35, + 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45, 51, 61, + 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64, 68, 78, 90, + 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31, 32, + 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32, 32, 32, + 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32, 33, 34, 35, + 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34, 35, 37, 38, 40, + 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37, 39, 42, 45, 47, 51, + 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42, 48, 50, 54, 57, 60, 64, + 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50, 54, 58, 61, 65, 69, 73, 78, + 84, 86, 44, 42, 41, 42, 42, 47, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, + 48, 46, 44, 45, 46, 51, 57, 61, 67, 71, 76, 80, 85, 90, 96, 99, 54, 51, + 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 59, 56, 54, + 54, 53, 58, 64, 69, 75, 80, 87, 92, 98, 103, 110, 113, 65, 62, 59, 59, + 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63, + 68, 73, 78, 84, 90, 97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74, + 79, 84, 90, 96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76, + 81, 86, 92, 99, 106, 113, 121, 128, 137, 140, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, + 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, 56, 62, 62, 68, + 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, + 75, 79, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, + 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, + 50, 54, 54, 59, 59, 64, 64, 71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33, + 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, + 59, 64, 64, 71, 71, 74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, + 37, 38, 38, 40, 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, + 69, 72, 72, 75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, + 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, + 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34, 33, + 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54, 54, 58, + 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34, 36, 36, 38, + 38, 42, 42, 
48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, + 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, 73, 79, 79, 81, + 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, + 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 39, 38, + 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65, + 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 44, 42, 42, 41, 41, 42, + 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, + 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, + 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, + 90, 92, 92, 96, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, + 61, 67, 67, 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, + 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, + 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51, 51, 49, + 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, + 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51, 51, 49, 49, 50, 50, + 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, + 97, 97, 104, 104, 106, 106, 109, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, + 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, + 110, 110, 113, 113, 116, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, + 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, 110, 110, + 113, 113, 116, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, + 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, + 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, + 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, + 124, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, + 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, + 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, + 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, 80, + 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, + 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 80, 76, + 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, + 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 83, 78, 78, + 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, + 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78, + 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, + 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 83, + 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, + 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, + /* Size 4x8 */ + 32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51, 50, + 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136, + /* Size 8x4 */ + 32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45, 56, + 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136, + /* Size 8x16 */ + 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31, 32, + 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32, 33, 34, + 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34, 36, 38, 42, + 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42, 42, 48, 54, 58, + 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, + 82, 
87, 92, 97, 104, 106, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, + 98, 105, 111, 118, 121, 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, + 110, 118, 125, 133, 136, + /* Size 16x8 */ + 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, + 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, + 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, + 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, + 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, + 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, + 92, 106, 121, 136, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, + 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, + 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, + 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, + 72, 76, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, + 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35, + 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, + 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, + 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, + 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, + 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, + 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, + 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, + 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53, 51, 51, + 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, + 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59, + 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, + 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, + 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, + 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69, + 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, + 125, 125, 133, 133, 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69, + 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, + 125, 125, 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75, + 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124, + 132, 132, 141, 141, 144, 144, 149, + /* Size 32x16 */ + 32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31, 32, + 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 32, + 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, + 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, 34, 41, + 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, + 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, + 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, + 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, 69, 75, + 34, 34, 34, 37, 37, 42, 
42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 34, 34, + 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, + 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48, + 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58, + 58, 65, 65, 73, 73, 84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, + 65, 73, 73, 84, 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, + 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, + 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, + 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, + 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, 49, + 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 58, 54, 54, 54, + 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 58, 54, 54, 54, 54, + 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, + 68, 79, 79, 92, 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, + 79, 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, 73, 84, + 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84, + 97, 97, 111, 111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90, + 104, 104, 118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90, + 104, 104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, + 106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, + 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, 96, + 109, 109, 124, 124, 141, 141, 149, + /* Size 4x16 */ + 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36, 35, + 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51, 49, 50, + 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75, 72, 71, 69, + 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, + /* Size 16x4 */ + 31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33, 38, + 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53, 71, 90, + 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68, 92, 118, 65, + 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, + 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, + 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, + 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, + 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51, + 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, + 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, + 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, + 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, + 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, + 118, 125, 125, 133, 133, 136, 136, 141, + /* Size 32x8 */ + 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, + 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32, 33, 34, + 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34, 36, 42, 50, + 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 32, 33, 35, 38, 42, 49, 58, 69, + 
34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, + 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, + 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, + 79, 90, 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 53, 49, + 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 58, 54, 54, + 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68, + 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84, + 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90, + 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92, + 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89, + /* Size 8x8 */ + 31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42, 45, + 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46, 50, 56, + 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55, 61, 68, 75, + 82, 86, 61, 57, 58, 64, 71, 79, 86, 91, + /* Size 16x16 */ + 32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31, 31, + 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31, 32, 35, + 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35, 39, 43, 45, + 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43, 47, 47, 48, 46, + 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47, 48, 50, 49, 49, 50, + 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 55, + 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, 61, + 63, 64, 49, 47, 45, 45, 45, 49, 53, 55, 58, 60, 61, 62, 63, 65, 67, 68, + 50, 48, 46, 46, 46, 50, 54, 56, 60, 61, 63, 65, 67, 68, 71, 71, 52, 50, + 47, 47, 47, 50, 54, 57, 61, 63, 66, 68, 70, 72, 75, 75, 54, 52, 49, 49, + 48, 52, 55, 58, 62, 65, 68, 71, 73, 75, 78, 79, 57, 54, 52, 51, 50, 53, + 56, 60, 63, 67, 70, 73, 76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61, + 65, 68, 72, 75, 79, 82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71, + 75, 78, 82, 85, 89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79, + 83, 86, 90, 91, + /* Size 32x32 */ + 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31, 31, 31, + 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, + 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31, 31, 34, 34, 38, + 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 57, + 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, + 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, + 58, 60, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, + 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, 58, 60, 33, 34, + 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, + 47, 49, 49, 51, 51, 53, 53, 56, 56, 57, 57, 59, 33, 34, 34, 35, 35, 39, + 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, + 51, 53, 53, 56, 56, 57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, + 47, 48, 48, 46, 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, + 54, 55, 55, 57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, + 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57, + 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, + 50, 50, 50, 52, 52, 53, 53, 55, 55, 
57, 57, 58, 58, 60, 41, 42, 42, 42, + 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 52, + 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46, 47, 47, 48, + 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, + 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, + 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, + 61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, + 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 48, 47, + 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57, + 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 49, 47, 47, 45, 45, 45, + 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, + 63, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, + 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, + 67, 68, 68, 69, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, + 56, 60, 60, 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, + 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, + 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47, + 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, + 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47, 47, 47, + 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68, 70, 70, 72, + 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, + 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, + 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, + 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, 79, 80, 57, 54, + 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, + 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, + 76, 79, 79, 82, 82, 83, 83, 84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, + 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, + 85, 86, 86, 88, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, + 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, + 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, + 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, + 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, + 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57, 55, + 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, + 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, + 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, + 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, + 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, + /* Size 4x8 */ + 31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50, 47, + 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90, + /* Size 8x4 */ + 31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 46, 54, + 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90, + /* Size 8x16 */ + 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31, 31, + 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38, 40, 43, + 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46, 47, 47, 50, + 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46, 45, 49, 53, 56, + 58, 59, 61, 62, 64, 65, 
67, 68, 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, + 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73, + 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, + 89, 90, + /* Size 16x8 */ + 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, + 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, + 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, + 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67, + 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, + 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, + 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, + 83, 90, + /* Size 16x32 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, + 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32, 32, 36, 36, 40, + 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, + 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, + 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, + 56, 57, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, + 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47, + 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, + 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47, + 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, + 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, + 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, + 67, 68, 68, 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, + 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, + 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, + 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50, 48, + 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, + 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50, + 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, + 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, + 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, + 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, + 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, + 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, + 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 66, 63, 63, 60, 60, 59, + 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, + 84, 88, 88, 92, 92, 93, 93, 95, + /* Size 32x16 */ + 32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, 31, + 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, 31, 38, + 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, 40, 40, 46, + 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, 40, 46, 46, 45, + 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, + 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, 47, 51, + 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, + 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, 54, 57, + 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 
42, 43, + 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, + 48, 53, 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, + 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, + 56, 57, 57, 60, 60, 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, + 57, 60, 60, 64, 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, + 64, 67, 67, 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, + 67, 69, 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, + 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48, + 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, 47, + 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, 49, 55, + 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, 55, 55, 62, + 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, + 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, 70, 76, + 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, + 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, 85, 88, + 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 63, 57, + 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 64, 59, 59, 56, + 56, 61, 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, + 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, + 69, 77, 77, 84, 84, 92, 92, 95, + /* Size 4x16 */ + 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48, 47, + 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50, 48, 47, + 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57, 56, 54, 57, + 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, + /* Size 16x4 */ + 31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40, 47, + 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53, 61, 67, + 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70, 82, 54, 58, + 72, 85, 57, 60, 75, 89, 59, 61, 75, 90, + /* Size 8x32 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, + 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, + 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, + 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, + 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, + 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, + 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50, + 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, + 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, + 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, + 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, + 89, 90, 90, 92, + /* Size 32x8 */ + 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31, 31, + 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32, 40, 46, + 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 33, 36, 43, 47, 46, 47, + 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 37, 40, 47, 47, 45, 47, 50, 54, + 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, + 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, + 56, 
57, 60, 64, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, + 64, 67, 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, + 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 52, 48, + 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 54, 50, 49, 55, + 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70, + 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85, + 63, 57, 55, 60, 67, 75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, + 56, 61, 68, 75, 83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, + 69, 77, 84, 92 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108, + /* Size 8x8 */ + 31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32, 33, + 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41, 43, 54, + 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56, 66, 77, 89, + 98, 108, 69, 65, 64, 73, 85, 97, 108, 119, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31, 32, + 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32, 32, 32, + 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32, 33, 34, 35, + 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34, 35, 37, 38, 39, + 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, + 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42, 46, 48, 50, 53, 55, 59, + 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48, 51, 54, 57, 59, 63, 67, 71, + 76, 82, 41, 40, 38, 40, 41, 46, 50, 54, 57, 60, 63, 67, 71, 75, 80, 86, + 45, 43, 41, 42, 43, 48, 53, 57, 60, 65, 68, 72, 76, 81, 85, 91, 48, 46, + 44, 45, 46, 51, 55, 59, 63, 68, 71, 76, 80, 85, 90, 96, 54, 51, 49, 50, + 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58, + 62, 67, 71, 76, 80, 87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67, + 71, 75, 81, 85, 92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76, + 80, 85, 90, 97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86, + 91, 96, 104, 110, 118, 125, 134, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, + 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, 51, 55, 56, 59, + 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, + 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 36, 37, + 38, 41, 41, 44, 44, 49, 49, 52, 54, 56, 59, 61, 65, 65, 72, 72, 31, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 37, 38, 39, 41, 42, 45, + 45, 49, 49, 52, 54, 56, 59, 61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52, + 54, 56, 59, 60, 64, 65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 35, 35, 36, 37, 38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60, + 64, 64, 70, 70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38, + 39, 40, 41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69, + 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43, + 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33, + 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, + 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 
34, 34, 34, 33, 33, 34, 35, 35, + 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, 54, 57, 58, 60, + 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42, + 46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61, 62, 64, 67, 68, 72, 72, + 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, 50, 50, + 52, 54, 54, 57, 57, 59, 60, 62, 64, 66, 68, 69, 73, 73, 79, 79, 38, 37, + 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, 52, 54, 56, 57, 59, + 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 39, 38, 38, 38, 37, 38, + 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67, + 69, 71, 73, 74, 78, 78, 84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, + 46, 46, 50, 52, 54, 55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77, + 80, 81, 86, 86, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, + 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, + 65, 68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45, + 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, + 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45, 45, 45, + 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76, 79, 80, 83, + 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54, + 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86, 89, 91, 93, 97, 97, + 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, + 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, + 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, + 75, 79, 79, 85, 85, 89, 90, 93, 96, 98, 102, 102, 108, 108, 59, 56, 56, + 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, + 86, 87, 90, 92, 95, 98, 99, 103, 104, 110, 110, 62, 59, 59, 57, 56, 56, + 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, + 95, 98, 101, 103, 107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59, + 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, + 105, 106, 111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, + 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, + 108, 113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, + 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, + 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, + 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118, + 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, + 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, + 134, 134, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, + 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, + 134, 134, + /* Size 4x8 */ + 32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43, 42, + 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111, + /* Size 8x4 */ + 32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41, 48, + 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111, + /* Size 8x16 */ + 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31, 32, + 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32, 33, 34, + 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34, 36, 38, 42, + 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42, 42, 48, 52, 56, + 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49, 54, 59, 63, 67, 72, + 76, 82, 87, 92, 97, 104, 
62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90, + 95, 102, 108, 115, 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105, + 112, 119, 127, + /* Size 16x8 */ + 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, + 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33, 35, 38, + 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, + 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51, 60, 67, 74, 81, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, + 50, 60, 71, 82, 90, 99, 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, + 79, 92, 102, 112, 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, + 104, 115, 127, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49, + 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, + 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36, + 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64, + 71, 71, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, + 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, + 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, + 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35, + 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, + 63, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, + 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, + 76, 76, 82, 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, + 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64, + 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51, 50, + 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, + 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50, 49, 49, 50, + 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, + 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, + 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, + 103, 108, 108, 115, 115, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, + 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, + 111, 118, 118, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, + 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, + 127, 127, 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, + 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125, + 133, 133, + /* Size 32x16 */ + 32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31, 32, + 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32, 32, 32, + 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32, 32, 32, 33, + 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32, 33, 33, 34, 36, + 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, + 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, + 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, + 65, 70, 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69, + 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, 34, 34, + 34, 36, 
37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 34, 34, 34, 36, + 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42, + 47, 48, 52, 53, 59, 59, 65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49, + 54, 54, 60, 60, 66, 68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, + 63, 63, 69, 71, 77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, + 71, 73, 79, 84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, + 81, 86, 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90, + 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45, + 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45, 46, + 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49, 49, 54, + 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50, 50, 54, 60, 63, + 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, + 85, 85, 94, 96, 103, 108, 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, + 87, 95, 98, 105, 110, 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, + 98, 101, 108, 114, 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, + 105, 112, 118, 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, + 114, 120, 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, + 125, 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34, 34, + 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43, 41, 43, + 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59, 59, 58, 63, + 67, 71, 76, 81, 85, 92, 98, 105, 111, 118, + /* Size 16x4 */ + 31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32, 37, + 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46, 60, 76, + 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76, 98, 60, 63, + 80, 105, 66, 68, 85, 111, 73, 74, 91, 118, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, + 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 33, 33, 34, 34, + 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, + 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42, + 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, + 79, 79, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, + 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 53, 51, + 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, + 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, + 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, + 94, 95, 98, 102, 103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65, + 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, + 105, 108, 112, 114, 119, 119, 127, 127, + /* Size 32x8 */ + 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31, 32, + 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32, 33, 34, + 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, + 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38, 42, 49, 56, 64, + 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, + 37, 42, 48, 54, 61, 69, 35, 34, 38, 
47, 52, 59, 65, 73, 36, 34, 38, 48, + 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, + 71, 79, 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 48, 45, + 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, 53, 49, 50, 60, + 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87, + 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102, + 112, 67, 61, 60, 69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119, + 72, 66, 64, 73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79, + 72, 70, 79, 90, 104, 115, 127 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78, + /* Size 8x8 */ + 31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38, 42, + 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45, 45, 53, + 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49, 56, 63, 69, + 73, 77, 57, 54, 52, 58, 65, 72, 77, 82, + /* Size 16x16 */ + 32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31, 31, + 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31, 32, 35, + 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35, 39, 43, 45, + 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43, 47, 47, 47, 47, + 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47, 48, 50, 50, 49, 49, + 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50, 52, 52, 52, 52, 53, 53, + 55, 56, 58, 60, 49, 47, 46, 46, 47, 50, 52, 53, 54, 55, 55, 56, 57, 58, + 60, 62, 49, 47, 45, 46, 46, 49, 52, 54, 55, 57, 58, 59, 60, 61, 63, 65, + 49, 47, 45, 45, 45, 49, 52, 55, 57, 59, 60, 61, 63, 64, 66, 68, 50, 48, + 46, 46, 46, 50, 53, 55, 58, 60, 61, 63, 65, 67, 68, 71, 52, 50, 47, 47, + 47, 50, 53, 56, 59, 61, 63, 66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52, + 55, 57, 60, 63, 65, 68, 71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58, + 61, 64, 67, 70, 73, 76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66, + 68, 72, 75, 79, 82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75, + 78, 82, 85, 89, + /* Size 32x32 */ + 32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49, 49, + 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31, 31, 31, + 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47, 48, 48, 50, + 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31, 31, 33, 34, 35, + 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 51, 52, 53, + 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33, 35, 36, 39, 40, 42, 42, + 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, + 58, 58, 30, 31, 31, 31, 32, 33, 35, 36, 40, 40, 42, 42, 45, 46, 46, 45, + 45, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 52, 54, 54, 57, 57, 32, 32, + 33, 33, 33, 35, 37, 38, 41, 42, 43, 43, 46, 47, 46, 46, 45, 45, 45, 46, + 46, 47, 47, 49, 49, 50, 51, 52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37, + 39, 40, 43, 43, 45, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, + 49, 50, 51, 52, 53, 54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44, + 45, 45, 47, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51, + 53, 53, 55, 55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48, + 47, 46, 46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54, + 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45, + 46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42, 42, + 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, + 50, 51, 
52, 52, 53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42, 43, 45, 45, + 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, + 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50, + 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54, 55, 55, 56, 56, 58, 58, + 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, 53, 53, + 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 58, 60, 60, 49, 47, + 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, + 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 48, 47, 47, 46, 45, 46, + 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, + 58, 59, 60, 60, 61, 62, 63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, + 49, 49, 52, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, + 63, 63, 65, 65, 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, + 55, 55, 57, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67, + 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, + 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47, + 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, + 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46, 46, 46, + 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63, 65, 65, 66, + 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50, + 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68, 69, 70, 71, 72, 72, + 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, + 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 54, 51, + 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65, + 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 77, 54, 52, 52, 50, 49, 49, + 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, + 71, 72, 73, 74, 75, 76, 78, 78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, + 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 75, 75, + 77, 77, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, + 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82, + 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, + 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55, + 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72, + 72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54, 53, + 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77, + 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, + 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, + 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, + 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, + /* Size 4x8 */ + 31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47, 45, + 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79, + /* Size 8x4 */ + 31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45, 49, + 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79, + /* Size 8x16 */ + 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31, 31, + 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38, 40, 43, + 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46, 47, 47, 50, + 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46, 45, 49, 53, 55, + 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47, 50, 53, 56, 59, 62, + 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70, 
+ 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79, + 82, 86, + /* Size 16x8 */ + 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, + 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40, 47, 47, + 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, + 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53, 57, 59, 61, 63, + 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, + 47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, + 64, 70, 75, 79, 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, + 80, 86, + /* Size 16x32 */ + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, + 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, + 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, + 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32, 32, 34, 36, 37, + 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, + 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46, + 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, + 56, 56, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, + 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 42, 42, + 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, + 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58, 48, 47, 47, 46, 46, 46, + 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, + 55, 56, 56, 57, 58, 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, + 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, + 60, 60, 62, 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, + 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59, + 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50, 48, + 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, + 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48, 48, 47, 47, + 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, + 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, + 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, + 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60, + 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, 61, 58, + 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, + 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, 63, 60, 60, 58, 57, 57, + 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, + 78, 80, 82, 83, 85, 85, 89, 89, + /* Size 32x16 */ + 32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, 31, + 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, 31, 36, + 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, 37, 39, 42, + 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, 40, 42, 46, 46, + 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, 44, 46, 46, 45, 45, + 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, 47, 46, 46, 46, 47, 47, + 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, 47, 45, 46, 47, 47, 50, 51, + 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, 45, 46, 47, 47, 49, 50, 52, 54, + 37, 40, 40, 45, 47, 47, 48, 47, 46, 46, 47, 47, 49, 50, 53, 55, 42, 43, + 43, 46, 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, + 47, 48, 50, 50, 49, 49, 50, 50, 53, 
53, 56, 57, 47, 46, 46, 47, 48, 50, + 52, 52, 53, 53, 53, 53, 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, + 53, 54, 54, 54, 56, 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, + 56, 56, 58, 58, 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, + 59, 60, 62, 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, + 63, 65, 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, + 49, 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47, + 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, 46, + 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47, 47, 50, + 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, 50, 54, 56, + 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, 55, 57, 62, 62, + 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, 57, 62, 63, 68, 68, + 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, 63, 63, 69, 69, 74, 75, + 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, 64, 70, 70, 75, 76, 79, 82, + 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, 71, 71, 76, 77, 80, 83, 60, 55, + 54, 53, 52, 55, 58, 60, 65, 66, 72, 72, 77, 79, 82, 85, 60, 55, 55, 53, + 53, 55, 59, 60, 65, 66, 73, 73, 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, + 60, 62, 67, 68, 75, 75, 80, 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, + 67, 68, 75, 75, 80, 82, 86, 89, + /* Size 4x16 */ + 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42, 42, + 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47, 45, 46, + 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52, 51, 50, 53, + 56, 58, 61, 64, 67, 71, 73, 76, 79, 82, + /* Size 16x4 */ + 31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40, 47, + 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49, 57, 61, + 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63, 73, 52, 53, + 64, 76, 55, 55, 66, 79, 58, 58, 68, 82, + /* Size 8x32 */ + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, + 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, + 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, + 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39, 40, 41, 43, 44, + 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, + 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, + 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, + 60, 60, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56, + 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 52, 50, + 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, + 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, + 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, + 72, 74, 75, 76, 77, 78, 80, 80, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, + 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, + 82, 83, 86, 86, + /* Size 32x8 */ + 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31, 31, + 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32, 40, 46, + 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43, 47, 46, 47, + 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47, 45, 47, 49, 52, + 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, + 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, + 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, + 59, 62, 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 
53, 58, 61, 63, 66, + 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 50, 46, + 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, 52, 48, 47, 54, + 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68, + 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79, + 58, 53, 51, 57, 64, 71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, + 53, 59, 65, 73, 78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, + 67, 75, 80, 86 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92, + /* Size 8x8 */ + 31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32, 32, + 35, 37, 40, 45, 49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37, 40, 47, + 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49, 56, 65, 75, + 82, 92, 63, 59, 58, 65, 73, 84, 92, 105, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31, 32, + 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32, 32, 32, + 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32, 33, 33, 33, + 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33, 34, 34, 36, 37, + 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34, 35, 37, 38, 40, 41, + 42, 46, 49, 52, 55, 58, 34, 34, 33, 35, 36, 37, 39, 42, 44, 46, 47, 51, + 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42, 48, 50, 52, 54, 57, 60, 63, + 65, 68, 38, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, + 41, 40, 38, 40, 40, 41, 46, 52, 54, 57, 60, 63, 67, 70, 73, 75, 44, 42, + 41, 42, 42, 42, 47, 54, 57, 60, 63, 67, 71, 74, 77, 79, 48, 46, 44, 45, + 45, 46, 51, 57, 60, 63, 67, 71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49, + 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63, + 67, 70, 74, 79, 86, 90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73, + 77, 82, 89, 93, 97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85, + 92, 97, 101, 105, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 46, + 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 45, 46, 48, 51, 51, + 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45, 47, 50, 50, 54, 55, 57, 61, + 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41, + 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, + 49, 49, 53, 54, 56, 59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 34, 35, 35, 36, 36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54, + 56, 59, 59, 63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, + 37, 37, 39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, + 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42, 42, 45, + 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33, 33, 34, 35, + 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49, 50, 51, 53, 53, + 56, 57, 59, 62, 62, 66, 34, 
34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39, + 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52, 54, 54, 57, 58, 60, 63, + 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 45, 45, + 46, 47, 48, 50, 50, 52, 53, 54, 56, 56, 59, 60, 62, 65, 65, 69, 36, 35, + 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, + 54, 56, 57, 58, 60, 60, 63, 64, 65, 68, 68, 72, 36, 35, 35, 35, 34, 34, + 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, + 60, 60, 63, 64, 65, 68, 68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, + 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, + 69, 72, 72, 76, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, + 50, 50, 53, 54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77, + 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, + 57, 60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42, + 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, + 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41, 42, 42, + 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, + 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49, + 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75, 75, 78, 79, 81, 84, + 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, + 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79, 80, 82, 85, 85, 89, 50, 49, + 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, + 68, 72, 73, 75, 78, 78, 82, 83, 85, 88, 88, 92, 54, 52, 51, 50, 49, 49, + 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, + 82, 82, 86, 87, 89, 92, 92, 96, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, + 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, + 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, + 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, + 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, + 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58, 57, + 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81, + 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61, 59, 59, 59, + 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, + 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, + 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, + 101, 105, 105, 109, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, + 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109, + 109, 114, + /* Size 4x8 */ + 32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42, 41, + 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97, + /* Size 8x4 */ + 32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38, 40, + 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97, + /* Size 8x16 */ + 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, 31, 32, + 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32, 33, 34, + 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34, 35, 36, 37, + 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38, 39, 40, 44, 50, + 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, + 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, + 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97, + 100, 105, + /* Size 16x8 */ + 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 
32, 35, 38, 42, 51, 62, 31, 32, + 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, + 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37, 41, 44, 48, + 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, + 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, + 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, + 92, 105, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44, + 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, + 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, + 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59, + 59, 63, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, + 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 32, 32, + 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, + 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, + 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, + 57, 57, 60, 61, 63, 66, 66, 70, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, + 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, + 65, 68, 68, 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, + 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, + 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 41, + 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, + 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47, 47, 48, 48, + 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79, + 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, + 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, + 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, + 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 65, 63, + 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, + 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, 65, 63, 62, 61, 59, + 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, + 88, 92, 92, 97, 98, 100, 105, 105, 109, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31, 32, + 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32, 32, 32, + 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32, 32, 32, 32, + 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32, 33, 33, 34, 34, + 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, + 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, + 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, + 59, 59, 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58, + 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 32, 32, + 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 33, 33, 33, 35, + 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37, + 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44, + 46, 50, 50, 55, 56, 59, 
65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, + 54, 58, 60, 63, 68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, + 60, 63, 68, 68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, + 72, 72, 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73, + 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41, + 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41, 42, + 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44, 45, 45, + 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46, 46, 54, 56, + 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, + 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, + 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, + 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97, + 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, 61, 57, + 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, 65, 61, 60, + 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 65, 61, 60, 59, + 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 70, 65, 64, 63, 62, + 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32, 32, + 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42, 41, 42, + 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53, 53, 53, 52, + 57, 63, 67, 70, 74, 79, 86, 90, 93, 97, + /* Size 16x4 */ + 31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32, 34, + 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40, 57, 67, + 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71, 86, 54, 53, + 74, 90, 57, 56, 77, 93, 61, 58, 79, 97, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, + 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32, 33, 33, 34, 34, + 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, + 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, + 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, + 66, 70, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, + 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42, + 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, + 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49, + 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, + 82, 82, 86, 87, 89, 92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, + 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, + 100, 105, 105, 109, + /* Size 32x8 */ + 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31, 32, + 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32, 33, 34, + 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, + 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36, 39, 42, 49, 58, + 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, + 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, + 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, + 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 44, 41, + 43, 51, 57, 63, 71, 79, 47, 
44, 45, 53, 59, 66, 75, 84, 48, 45, 46, 54, + 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71, + 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, + 58, 54, 54, 61, 68, 75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, + 58, 66, 72, 79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, + 70, 76, 83, 96, 109 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71, + /* Size 8x8 */ + 31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38, 40, + 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45, 46, 51, + 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47, 52, 57, 63, + 66, 70, 55, 52, 50, 54, 60, 66, 70, 76, + /* Size 16x16 */ + 32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31, 31, + 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31, 32, 34, + 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34, 37, 40, 42, + 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40, 42, 45, 46, 47, + 46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45, 47, 47, 48, 47, 46, + 45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47, 48, 50, 50, 49, 49, 50, + 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 55, + 56, 56, 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, + 49, 47, 45, 46, 46, 46, 49, 53, 54, 55, 57, 58, 59, 60, 60, 61, 49, 47, + 45, 45, 45, 45, 49, 53, 55, 57, 58, 60, 61, 62, 63, 63, 50, 48, 46, 46, + 46, 46, 50, 54, 56, 58, 60, 61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47, + 50, 54, 57, 59, 61, 63, 66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55, + 58, 60, 62, 65, 68, 70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60, + 63, 66, 69, 71, 73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67, + 70, 73, 74, 76, + /* Size 32x32 */ + 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48, 48, + 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, + 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47, 47, 47, 48, + 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31, 31, 31, 34, 34, + 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 50, + 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31, 34, 35, 36, 39, 39, 41, + 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, + 53, 55, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, + 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 30, 31, + 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45, + 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34, + 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, + 47, 47, 49, 49, 50, 51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, + 43, 44, 45, 46, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, + 50, 51, 51, 53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, + 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, + 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38, 39, + 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 46, + 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42, 42, 44, 44, + 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49, 49, 49, 49, 49, + 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, + 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 53, + 53, 55, 44, 
44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 54, 54, 54, 56, 49, 48, + 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, + 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 49, 48, 47, 47, 46, 46, + 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, + 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, + 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, + 58, 59, 59, 60, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, + 53, 53, 54, 54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, + 55, 57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46, + 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, + 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45, 45, 45, + 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, + 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, + 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63, 63, 64, 64, 65, 66, + 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, + 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65, 65, 66, 67, 67, 68, 51, 49, + 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60, + 60, 62, 62, 63, 65, 65, 66, 66, 67, 68, 68, 70, 52, 50, 50, 49, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, + 66, 66, 68, 68, 69, 70, 70, 72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, + 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, + 69, 70, 70, 72, 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, + 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, + 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, + 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52, + 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65, + 66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, + 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, + 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, + 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, + 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58, + 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80, + /* Size 4x8 */ + 31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47, 45, + 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73, + /* Size 8x4 */ + 31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46, 47, + 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73, + /* Size 8x16 */ + 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31, 31, + 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38, 40, 42, + 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 44, 46, 46, 47, + 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46, 46, 47, 50, 53, + 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, + 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64, + 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73, + 74, 76, + /* Size 16x8 */ + 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30, 32, + 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37, 44, 46, + 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 
42, 43, 47, 49, 50, 49, + 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, + 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, + 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, + 70, 76, + /* Size 16x32 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, + 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, + 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, + 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32, 32, 32, 35, 36, + 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, + 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, + 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, + 51, 53, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, + 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 37, 38, + 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, + 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, + 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, + 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, + 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, + 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, + 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, + 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, + 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47, 46, + 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, + 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47, 47, 47, 47, + 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65, + 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, + 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, + 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, + 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 57, 55, + 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, + 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, + 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, + 71, 71, 73, 73, 74, 76, 76, 78, + /* Size 32x16 */ + 32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, 31, + 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, 31, 34, + 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, 34, 39, 39, + 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, 40, 40, 44, 46, + 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, 40, 44, 46, 45, 45, + 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42, 46, 47, 46, 45, 45, 47, + 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, 47, 46, 46, 46, 47, 47, 49, + 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, 46, 45, 45, 47, 47, 48, 51, 51, + 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 37, 39, + 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, + 47, 47, 49, 49, 49, 48, 48, 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, + 49, 50, 50, 49, 49, 50, 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, + 51, 51, 51, 52, 52, 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, + 53, 54, 54, 55, 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, + 54, 55, 57, 57, 48, 46, 46, 46, 47, 47, 
51, 53, 54, 55, 55, 56, 57, 58, + 59, 59, 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, + 48, 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46, + 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, 45, + 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, 46, 46, + 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, 46, 52, 54, + 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, 52, 54, 56, 60, + 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, + 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, 66, 68, + 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, + 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, 55, 51, + 51, 50, 49, 49, 54, 56, 58, 63, 63, 68, 69, 71, 74, 74, 57, 53, 52, 51, + 50, 50, 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, + 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, + 61, 65, 65, 70, 72, 74, 78, 78, + /* Size 4x16 */ + 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37, 38, + 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47, 45, 45, + 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49, 49, 48, 48, + 51, 55, 58, 60, 62, 65, 68, 70, 71, 73, + /* Size 16x4 */ + 31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37, 44, + 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, 46, 47, 55, 58, + 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61, 68, 50, 48, + 62, 70, 51, 49, 63, 71, 53, 50, 64, 73, + /* Size 8x32 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, + 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 32, + 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, + 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39, 40, 40, 42, 43, + 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, + 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, + 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, + 55, 57, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, + 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47, + 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, + 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 52, 50, 50, 49, 48, 48, + 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, + 66, 66, 68, 68, 69, 70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, + 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, + 74, 76, 76, 78, + /* Size 32x8 */ + 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31, 31, + 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32, 40, 44, + 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, + 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46, 46, 45, 47, 51, + 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, + 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, + 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, + 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 49, 45, + 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, 50, 46, 46, 52, + 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61, + 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 
58, 62, 68, 73, + 54, 50, 49, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, + 50, 55, 59, 64, 70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, + 61, 65, 72, 78 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81, + /* Size 8x8 */ + 31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32, 32, + 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34, 37, 42, + 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45, 51, 57, 65, + 71, 77, 53, 50, 51, 55, 61, 70, 77, 85, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31, 32, + 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32, 32, 32, + 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32, 32, 32, 33, + 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 31, 32, 32, 32, 33, 34, 34, 35, + 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, + 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38, 39, 42, 44, 45, 47, 50, 51, + 54, 58, 36, 35, 34, 35, 36, 38, 40, 42, 48, 50, 50, 54, 56, 57, 60, 64, + 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, 60, 63, 67, 39, 38, + 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, 69, 44, 42, 41, 41, + 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45, + 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51, + 57, 60, 62, 68, 71, 73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, + 65, 71, 75, 77, 82, 87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, + 79, 81, 87, 92, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 39, 39, 41, + 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 43, 45, 46, + 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 42, 45, 46, 47, 51, 51, 52, + 56, 56, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 50, 51, 54, 54, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37, + 37, 40, 41, 41, 44, 44, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41, + 44, 45, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 34, 35, 35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49, + 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 36, 36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 50, 51, 54, 54, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, + 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, 42, 43, 45, 46, + 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45, 47, 47, 48, 51, 51, 52, + 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, + 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 34, 34, + 34, 34, 33, 33, 34, 35, 35, 35, 37, 
37, 38, 39, 39, 41, 42, 42, 44, 45, + 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 35, 34, 34, 34, 34, 34, + 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51, + 53, 53, 54, 57, 57, 58, 61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, + 38, 38, 40, 42, 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, + 60, 61, 64, 64, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, + 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, + 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38, + 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, + 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37, 38, 39, + 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58, 58, 60, 61, + 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42, + 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64, 65, 66, 69, 69, 70, + 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, + 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68, 71, 71, 72, 75, 75, 44, 43, + 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58, + 58, 62, 64, 64, 66, 67, 68, 71, 72, 73, 76, 76, 47, 46, 45, 45, 44, 44, + 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, + 69, 70, 71, 74, 75, 76, 79, 79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, + 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75, + 76, 77, 80, 80, 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, + 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81, + 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, + 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51, + 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, + 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50, 51, + 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77, + 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, + 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, + 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, + 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, + /* Size 4x8 */ + 32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37, 36, + 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83, + /* Size 8x4 */ + 32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35, 38, + 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83, + /* Size 8x16 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31, 32, + 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32, 32, 33, + 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33, 33, 34, 36, + 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35, 36, 38, 40, 42, + 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, + 58, 63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, + 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, + 82, 87, + /* Size 16x8 */ + 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, + 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 34, 34, + 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, + 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 
+ 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, + 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, + 79, 87, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, + 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, + 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51, + 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, + 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40, + 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, + 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, + 47, 48, 48, 51, 51, 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, + 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, + 59, 60, 62, 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, + 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49, + 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42, 42, + 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, + 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41, 41, 41, 42, + 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, + 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, + 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, + 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, + 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 53, 52, + 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, + 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 59, 57, 56, 56, 54, 54, + 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, + 79, 80, 81, 86, 87, 88, 92, 92, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, 32, + 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, 32, 32, + 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, + 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, 33, 34, + 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, + 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, 33, 35, 35, 36, 41, 41, + 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 45, 49, + 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, 36, 38, 42, 42, 45, 50, 50, 54, + 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 42, 42, 45, 49, 49, 54, 32, 32, + 33, 33, 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, + 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, + 36, 39, 40, 41, 44, 44, 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, + 42, 44, 48, 48, 50, 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, + 48, 48, 50, 54, 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, + 53, 57, 57, 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, + 60, 64, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, + 38, 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 
39, 38, + 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, 38, + 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, 42, 42, + 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, 43, 45, 52, + 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, 45, 52, 54, 56, + 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, 54, 56, 58, 66, 66, + 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, 56, 59, 67, 67, 70, 76, + 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, 60, 67, 67, 71, 77, 77, 81, + 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, 71, 71, 74, 81, 81, 86, 53, 51, + 49, 49, 50, 50, 51, 59, 60, 63, 71, 71, 75, 82, 82, 87, 55, 52, 51, 51, + 51, 51, 53, 60, 61, 64, 72, 72, 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, + 55, 62, 63, 67, 75, 75, 79, 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, + 63, 67, 75, 75, 79, 87, 87, 92, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32, 32, + 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37, 36, 36, + 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49, 49, 50, 49, + 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, + /* Size 16x4 */ + 31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32, 34, + 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38, 49, 60, + 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58, 75, 47, 47, + 60, 77, 51, 50, 63, 82, 55, 54, 67, 87, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, + 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, + 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, + 55, 55, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, + 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 44, 43, + 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, + 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, + 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, + 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, + 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, + 82, 83, 87, 87, + /* Size 32x8 */ + 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31, 32, + 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, + 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33, 33, 35, 41, + 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34, 36, 42, 45, 50, + 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, + 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, + 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, + 53, 57, 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 39, 37, + 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, 44, 41, 42, 45, + 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66, + 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77, + 53, 49, 49, 51, 59, 71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, + 51, 53, 61, 72, 76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, + 63, 
75, 79, 87 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66, + /* Size 8x8 */ + 31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35, 36, + 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46, 47, 50, + 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46, 50, 54, 59, + 61, 64, 51, 48, 48, 51, 54, 60, 64, 68, + /* Size 16x16 */ + 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31, 31, + 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31, 31, 32, + 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 31, 32, 32, 33, 36, 40, + 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36, 39, 43, 44, 45, + 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47, + 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47, + 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50, + 50, 52, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 48, 47, + 46, 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 49, 47, 45, 45, + 45, 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, + 47, 50, 54, 55, 56, 59, 61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50, + 54, 55, 56, 60, 61, 62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, + 57, 61, 63, 64, 66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, + 64, 66, 68, 71, + /* Size 32x32 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49, 49, + 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47, 47, 47, 47, + 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31, 31, 31, 32, 34, + 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31, 32, 34, 34, 36, 38, 38, + 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46, 47, 48, 48, 48, 49, 49, 50, + 52, 52, 30, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 41, 42, 42, 44, + 46, 46, 46, 46, 46, 45, 45, 45, 46, 47, 47, 48, 48, 48, 50, 50, 30, 31, + 31, 31, 31, 32, 32, 35, 35, 36, 40, 40, 41, 42, 42, 44, 46, 46, 46, 45, + 45, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32, + 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45, + 46, 46, 46, 47, 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40, + 43, 43, 43, 44, 44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, + 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45, + 45, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, + 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47, + 47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38, 38, + 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, + 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39, 40, 40, 43, + 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 45, 46, 46, + 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47, + 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 48, 48, + 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, + 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 41, 42, + 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49, + 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 45, 45, 45, 45, 44, 44, + 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 
51, + 52, 52, 52, 52, 52, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, + 48, 48, 49, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, + 54, 54, 55, 55, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, + 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47, + 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, + 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45, 45, 46, + 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, + 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, + 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 60, + 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, + 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 61, 62, 62, 49, 47, + 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, + 56, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, + 61, 61, 61, 63, 63, 63, 64, 64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, + 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63, + 63, 64, 65, 65, 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, + 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66, + 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, + 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49, + 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, + 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48, 48, + 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64, + 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, + 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, + 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, + 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71, + /* Size 4x8 */ + 31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47, 46, + 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67, + /* Size 8x4 */ + 31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47, 48, + 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67, + /* Size 8x16 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31, 31, + 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37, 38, 38, + 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40, 41, 44, 47, + 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46, 47, 47, 48, 50, + 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, + 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, + 66, 68, + /* Size 16x8 */ + 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, + 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36, 41, 44, + 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, + 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, + 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, + 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, + 65, 68, + /* Size 16x32 */ + 
32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, + 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46, + 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31, 32, 32, 33, 35, + 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, + 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41, + 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, + 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, + 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 37, 38, + 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, + 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, + 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, + 48, 48, 48, 48, 48, 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, + 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, + 53, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, + 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47, 47, + 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, + 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 49, 48, 47, 47, 45, 45, 45, 45, + 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, + 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, + 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, + 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, + 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 52, 50, + 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, + 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 54, 53, 52, 52, 50, 50, + 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, + 65, 65, 66, 68, 68, 69, 71, 71, + /* Size 32x16 */ + 32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, 31, + 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31, 31, 32, + 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, 32, 37, 38, + 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, 38, 39, 40, 45, + 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, 40, 41, 45, 46, 46, + 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, 41, 45, 46, 46, 45, 45, + 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, 46, 47, 46, 45, 45, 46, 47, + 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, 47, 46, 46, 46, 46, 47, 47, 49, + 34, 36, 37, 37, 42, 44, 45, 47, 47, 47, 45, 45, 46, 47, 47, 49, 37, 39, + 40, 41, 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, + 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, + 47, 48, 48, 48, 47, 47, 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, + 50, 50, 49, 49, 50, 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, + 49, 49, 50, 50, 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, + 52, 52, 52, 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, + 54, 55, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, + 48, 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46, + 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, 46, + 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45, 46, 46, + 47, 52, 53, 54, 57, 57, 
58, 60, 60, 61, 49, 46, 45, 45, 45, 46, 47, 52, + 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, 47, 52, 53, 55, + 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, + 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, 61, 64, + 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, 55, 60, 60, 61, 64, 64, 66, + 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 52, 49, + 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 53, 50, 48, 48, + 48, 48, 49, 54, 54, 56, 61, 61, 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, + 50, 55, 55, 57, 62, 62, 65, 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, + 55, 57, 62, 62, 65, 68, 68, 71, + /* Size 4x16 */ + 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37, 38, + 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47, 46, 46, + 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48, 48, 47, 47, + 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, + /* Size 16x4 */ + 31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35, 43, + 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48, 53, 54, + 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55, 63, 48, 47, + 55, 64, 49, 47, 56, 66, 51, 49, 57, 68, + /* Size 8x32 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, + 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, + 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, + 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, + 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, + 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, + 50, 50, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, + 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 48, + 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, + 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, + 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, + 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, + 66, 67, 68, 68, + /* Size 32x8 */ + 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31, 31, + 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, 38, 40, + 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, + 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44, 47, 46, 46, 47, + 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, + 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, + 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, + 52, 52, 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 48, 46, + 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, 49, 45, 45, 47, + 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59, + 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64, + 52, 48, 47, 48, 54, 61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, + 48, 49, 54, 61, 63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, + 55, 62, 65, 68 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65, + /* Size 8x8 */ + 31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 
32, 34, 36, 41, 44, 32, 32, + 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34, 35, 37, + 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42, 42, 49, 56, + 63, 67, 47, 44, 45, 46, 52, 59, 67, 71, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, + 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32, 33, 33, 33, 34, + 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, + 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38, 40, + 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43, 44, + 47, 47, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50, 51, + 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 36, 35, + 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 39, 38, 38, 37, + 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40, + 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44, + 47, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, + 56, 60, 62, 66, 69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, + 63, 67, 70, 71, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, + 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, + 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, + 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 34, 35, 35, 36, 38, 38, 39, 42, 42, 42, 45, 45, 45, 48, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, + 34, 36, 37, 37, 38, 41, 41, 41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, + 38, 41, 41, 41, 44, 44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42, + 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, + 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 38, + 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39, 40, 40, 41, 42, + 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, + 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, + 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, 46, 48, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, + 40, 41, 42, 42, 43, 44, 44, 45, 47, 47, 47, 50, 34, 34, 34, 34, 34, 33, + 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, + 46, 47, 47, 48, 50, 51, 51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, + 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, + 50, 51, 51, 53, 34, 
34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 38, 40, 40, 41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54, + 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, + 46, 47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35, + 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, + 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34, 34, 35, + 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50, 50, 52, 54, + 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39, + 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54, 56, 56, 57, 58, 59, + 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, + 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 39, 39, + 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, + 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 41, 40, 40, 40, 39, 38, + 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, + 57, 60, 60, 60, 62, 63, 63, 66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, + 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, + 66, 67, 67, 69, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, + 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, + 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, + 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45, + 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, + 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44, 45, + 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, + 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, + 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, + 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53, + 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77, + /* Size 4x8 */ + 31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35, 34, + 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67, + /* Size 8x4 */ + 31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34, 37, + 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32, 32, 32, + 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32, 33, 34, 34, + 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 35, 35, 34, 34, 35, 36, 37, 39, + 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, + 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, + 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, + 69, 70, + /* Size 16x8 */ + 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, + 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 33, 34, + 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34, 35, 37, 38, + 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, + 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, + 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, + 67, 70, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, + 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, + 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, + 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, + 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, + 41, 43, 43, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, + 47, 48, 48, 50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, + 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, + 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, + 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35, 35, + 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, + 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39, 38, 38, 38, + 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59, + 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, + 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, + 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, + 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, + 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, + 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 53, 52, 51, 51, 50, 49, + 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, + 67, 71, 71, 72, 75, 76, 76, 79, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, 31, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, + 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 38, + 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 35, 35, 38, 41, 41, + 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 39, 42, 42, 44, 49, + 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, + 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, + 33, 35, 35, 35, 37, 37, 37, 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, + 35, 36, 37, 38, 38, 41, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, + 37, 38, 38, 41, 42, 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, + 40, 42, 44, 44, 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, + 48, 48, 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, + 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, + 35, 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35, + 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34, 34, + 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, 37, 40, + 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, 40, 40, 42, + 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, 40, 42, 48, 50, + 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, 
43, 49, 51, 51, 56, + 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, + 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, 66, 71, + 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, 54, 59, 64, 64, 67, 72, 47, 45, + 44, 44, 44, 45, 45, 47, 53, 56, 56, 61, 66, 66, 69, 75, 48, 46, 45, 45, + 45, 46, 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, + 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, + 56, 58, 58, 64, 69, 69, 73, 79, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32, 32, + 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35, 35, 34, + 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, + 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, + /* Size 16x4 */ + 31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32, 34, + 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37, 42, 48, + 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51, 60, 42, 43, + 53, 63, 45, 45, 56, 66, 46, 46, 56, 67, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, + 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, + 46, 48, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, + 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36, 35, + 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, + 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41, + 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, + 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, + 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, + 69, 70, 70, 73, + /* Size 32x8 */ + 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31, 32, + 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, + 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 34, + 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34, 35, 36, 42, 44, + 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, + 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, + 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, + 48, 50, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 36, 34, + 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, 39, 37, 39, 40, + 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, + 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66, + 44, 42, 42, 43, 51, 54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, + 45, 46, 54, 56, 67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, + 56, 58, 69, 73 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59, + /* Size 8x8 */ + 31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34, 35, + 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43, 45, 47, + 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45, 45, 50, 55, + 58, 
60, 49, 46, 46, 46, 50, 55, 60, 61, + /* Size 16x16 */ + 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31, 31, + 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 31, 31, 31, 31, + 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31, 32, 34, 35, + 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34, 37, 38, 42, 43, + 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47, + 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46, + 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47, + 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 49, 47, + 47, 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 48, 47, 46, 45, + 46, 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46, + 46, 47, 49, 52, 53, 55, 55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47, + 49, 52, 53, 55, 57, 58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, + 54, 56, 57, 59, 61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, + 58, 60, 61, 61, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41, 43, + 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47, 48, 48, 48, + 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37, + 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, + 42, 43, 46, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 30, 31, + 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, + 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32, + 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, + 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, + 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, + 46, 46, 46, 47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, + 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, + 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34, 34, + 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47, 47, 46, + 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37, 38, 38, 38, + 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 46, 46, 46, 45, + 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, + 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, + 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, + 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 46, 38, 39, + 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, + 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, + 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, + 49, 49, 49, 49, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, + 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, + 50, 50, 50, 50, 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, + 48, 49, 49, 49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, + 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 
48, 50, 50, 50, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47, + 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46, 46, 46, + 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, + 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, + 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, + 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 48, 47, + 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, + 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 49, 47, 47, 47, 46, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, + 55, 57, 57, 57, 57, 58, 58, 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, + 59, 60, 60, 60, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, + 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, + 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48, + 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, + 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, + 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, + 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50, + 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64, + /* Size 4x8 */ + 31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47, 46, + 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59, + /* Size 8x4 */ + 31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43, 47, + 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59, + /* Size 8x16 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31, 31, + 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34, 34, 35, + 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39, 40, 42, 43, + 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44, 46, 46, 47, 48, + 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, + 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, + 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, + 61, 61, + /* Size 16x8 */ + 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, + 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35, 37, 42, + 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43, 47, 47, 47, + 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, + 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, + 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, + 59, 61, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, + 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47, + 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31, 32, 32, 32, 33, + 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 
45, 45, + 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, + 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, + 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, + 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38, + 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 46, 46, 46, 46, 46, 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, + 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, + 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, + 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, + 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, + 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47, 47, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45, + 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56, + 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, + 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, + 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, + 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, + 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, + 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 52, 51, 50, 50, 49, 48, + 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, + 59, 61, 61, 62, 63, 64, 64, 65, + /* Size 32x16 */ + 32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, 31, + 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, 31, 31, + 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, 31, 34, 38, + 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, 34, 39, 39, 40, + 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, + 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, 46, 45, + 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, 41, 45, 46, 46, 45, 45, 45, + 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, 46, 47, 47, 46, 45, 45, 46, 47, + 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 33, 35, + 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, + 41, 45, 45, 46, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, + 47, 47, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 47, 47, 47, 46, 45, 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, + 49, 49, 50, 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, + 50, 50, 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, + 47, 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47, + 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46, 46, + 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, 46, 47, + 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, 47, 47, 48, + 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, 47, 48, 51, 53, + 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, 47, 51, 53, 53, 55, + 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, + 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, 59, 61, + 49, 47, 45, 
45, 45, 46, 46, 47, 52, 53, 53, 56, 58, 58, 60, 62, 50, 48, + 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 63, 50, 48, 46, 46, + 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, + 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, + 52, 54, 54, 58, 60, 60, 62, 65, + /* Size 4x16 */ + 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37, 38, + 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47, 46, 46, + 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, + 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, + /* Size 16x4 */ + 31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34, 42, + 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47, 50, 49, + 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53, 57, 46, 46, + 53, 58, 48, 46, 54, 59, 48, 46, 54, 59, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, + 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, + 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, + 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, + 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, + 46, 47, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, + 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47, + 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, + 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, + 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, + 61, 61, 61, 62, + /* Size 32x8 */ + 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31, 31, + 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, 34, 39, + 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35, 40, 44, 46, + 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42, 46, 47, 45, 46, + 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, + 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, + 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, + 49, 50, 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 49, 46, + 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, 48, 46, 46, 47, + 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, + 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59, + 49, 45, 45, 46, 52, 53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, + 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, + 52, 54, 60, 62 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54, + /* Size 8x8 */ + 31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 34, 36, 39, 32, 32, + 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33, 34, 35, + 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37, 38, 41, 46, + 51, 54, 41, 39, 40, 41, 44, 49, 54, 58, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 
38, 38, 42, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, + 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, + 40, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 34, 34, + 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 36, 35, 35, 34, + 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40, + 40, 45, 45, 50, 50, 54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, + 45, 50, 50, 54, 54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, + 54, 58, 58, 63, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, + 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, + 34, 34, 35, 35, 35, 36, 38, 38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, + 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, + 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, + 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, + 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 35, 36, 36, 36, 36, 37, 37, 37, 38, 40, 40, 40, 41, 42, 42, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, + 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, + 40, 41, 42, 42, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, + 36, 36, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 44, 45, 45, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, + 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, + 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, + 42, 42, 
42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34, 33, 33, + 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44, + 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, + 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45, 46, 47, 47, 47, 49, + 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, + 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, + 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, + 48, 49, 50, 50, 50, 52, 54, 54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, + 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52, + 52, 54, 56, 56, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, + 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, + 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, + 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38, + 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, + 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39, 39, + 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54, + 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, + 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, + 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, + 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, + /* Size 4x8 */ + 31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34, 34, + 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56, + /* Size 8x4 */ + 31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33, 34, + 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32, 33, 33, 34, + 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, + 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, + 58, 63, + /* Size 16x8 */ + 32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, + 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, + 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, + 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, + 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, + 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, + 53, 63, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, + 41, 41, 31, 31, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, + 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, + 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, + 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35, 35, + 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, + 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, + 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, + 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, + 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, + 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, + 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, 44, 43, + 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, + 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41, + 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, + 54, 56, 58, 58, 58, 60, 63, 63, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, + 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, + 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, + 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 38, 41, 41, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, + 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, + 34, 36, 37, 37, 37, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, + 38, 38, 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, + 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, + 34, 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, + 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, 34, + 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, 34, 36, + 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, 36, 38, 38, + 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, + 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48, + 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, 39, 44, 49, 49, 49, 52, + 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, + 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 39, 38, + 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 41, 40, 39, 39, + 39, 40, 42, 42, 42, 46, 52, 52, 52, 56, 
60, 60, 44, 42, 41, 41, 41, 42, + 43, 43, 43, 48, 53, 53, 53, 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, + 43, 48, 53, 53, 53, 58, 63, 63, + /* Size 4x16 */ + 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34, 34, 33, + 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38, 37, 37, 39, + 39, 40, 40, 45, 45, 51, 51, 54, 54, 58, + /* Size 16x4 */ + 31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32, 32, + 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34, 37, 40, + 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43, 51, 38, 39, + 45, 54, 38, 39, 45, 54, 42, 42, 48, 58, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, + 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, + 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, + 48, 49, 50, 50, 50, 52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, + 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, + 58, 60, 63, 63, + /* Size 32x8 */ + 32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31, 32, + 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, + 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32, 33, 33, 34, + 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, + 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, + 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, + 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, + 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, + 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, 36, 34, 34, 38, + 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, + 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58, + 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, + 39, 42, 42, 52, 52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, + 43, 53, 53, 63 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54, + /* Size 8x8 */ + 31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32, 33, + 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41, 42, 45, + 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46, 47, 48, 51, + 53, 54, 48, 46, 45, 46, 47, 51, 54, 56, + /* Size 16x16 */ + 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31, 31, + 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31, 31, 31, + 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31, 32, 32, 35, + 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32, 32, 35, 35, 40, + 40, 
42, 42, 46, 46, 45, 45, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, + 45, 47, 47, 46, 46, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, + 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, + 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45, + 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 41, 42, + 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 47, 47, 46, + 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47, + 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, + 46, 49, 49, 53, 53, 54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, + 49, 53, 53, 54, 54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, + 53, 55, 55, 58, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36, 39, + 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, + 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, + 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 35, 35, 35, 37, 39, 39, 39, 41, 42, 42, + 42, 44, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, + 46, 46, 45, 45, 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, + 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, + 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37, + 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42, + 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34, + 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, + 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, + 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, + 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, + 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, + 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, + 45, 45, 46, 46, 46, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 36, 37, + 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, + 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, + 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, + 46, 46, 45, 45, 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, + 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, + 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, + 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42, 42, 42, + 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, + 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 
44, 44, 44, 44, 45, 46, 46, + 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, + 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, + 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, + 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, + 53, 54, 54, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, + 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, + 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, + 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47, + 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, + 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54, + 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, + 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58, + /* Size 4x8 */ + 31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42, 42, + 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55, + /* Size 8x4 */ + 31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40, 44, + 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55, + /* Size 8x16 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31, 31, + 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38, 40, 40, 43, + 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40, 40, 43, 43, 47, + 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, + 50, 53, 53, 53, 53, 53, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, + 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, + 56, 58, + /* Size 16x8 */ + 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, + 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, + 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, + 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, + 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, + 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, + 53, 58, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, + 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, + 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, + 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, + 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, + 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, + 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 
43, 43, 44, 45, 45, + 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39, + 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, + 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, + 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48, + 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47, 47, + 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, + 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, + 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, + 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, 49, 48, + 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, + 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, 49, 48, 47, 47, 47, 46, + 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, + 53, 54, 56, 56, 56, 57, 58, 58, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, 31, + 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, 31, 31, + 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, + 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 38, 38, + 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, 35, 39, 39, 39, 42, + 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, + 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, + 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, + 32, 33, 34, 34, 34, 37, 41, 41, 41, 44, 46, 46, 46, 46, 45, 45, 33, 34, + 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, + 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, + 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, + 45, 46, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, + 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, + 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, + 45, 45, 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, + 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, + 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, 43, + 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, 44, 46, + 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, 47, 48, 48, + 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, + 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53, + 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, 47, 50, 53, 53, 53, 54, + 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, + 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, + 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 45, 45, + 45, 46, 46, 46, 46, 49, 53, 53, 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, + 46, 46, 46, 49, 53, 53, 53, 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, + 46, 49, 53, 53, 53, 56, 58, 58, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 34, 
34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33, 34, + 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42, 42, 42, + 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47, 45, 45, 46, + 46, 46, 46, 50, 50, 53, 53, 54, 54, 56, + /* Size 16x4 */ + 31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, 31, 35, + 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43, 47, 46, + 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50, 53, 47, 46, + 50, 54, 47, 46, 50, 54, 47, 45, 49, 56, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, + 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, + 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, + 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, + 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, + 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 48, 47, + 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, + 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, + 56, 57, 58, 58, + /* Size 32x8 */ + 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31, 31, + 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, + 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32, 40, 40, 46, + 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, + 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, + 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, + 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, + 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, + 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, 49, 46, 46, 48, + 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, + 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56, + 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, + 45, 46, 46, 53, 53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, + 46, 53, 53, 58 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46, + /* Size 8x8 */ + 31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, + 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 34, + 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33, 35, 36, 38, + 39, 42, 35, 35, 34, 36, 38, 40, 42, 48, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 35, 35, 36, 36, 31, 32, 32, 32, 32, 
32, 33, 33, 33, 34, 34, 35, 35, 36, + 36, 36, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 33, 33, 33, 33, + 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33, + 35, 35, 36, 37, 37, 39, 39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, + 36, 37, 37, 40, 41, 42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, + 38, 42, 42, 45, 48, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, + 42, 45, 48, 48, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, + 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, + 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 36, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, + 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, + 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, + 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 40, 40, + 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, + 
35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 34, 34, + 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, + 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, + 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, + 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, + 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, + 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, + 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35, + 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, + 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, + 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, + 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, + 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, + 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37, + 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50, + /* Size 4x8 */ + 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32, 32, + 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48, + /* Size 8x4 */ + 31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32, 33, + 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 34, 34, + 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, + 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, + 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, + 48, 48, + /* Size 16x8 */ + 32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, + 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, + 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 34, 34, + 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, + 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, + 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, + 46, 48, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 35, 
35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, + 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, + 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, + 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, + 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, + 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, + 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, + 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, + 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, + 42, 42, 44, 47, 48, 48, 48, 49, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, + 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, + 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, + 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, + 37, 37, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, + 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, + 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33, + 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, 33, 33, + 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33, 33, 35, 36, + 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, + 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, + 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42, + 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, 37, 37, 40, 43, 44, 44, 44, + 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, 38, 41, 45, 47, 47, 47, 36, 35, + 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, + 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, + 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, + 39, 39, 39, 42, 46, 49, 49, 49, + /* Size 4x16 */ + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 
35, 35, 35, 34, 34, + 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, + /* Size 16x4 */ + 31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32, 32, + 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32, 34, 37, + 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37, 42, 34, 34, + 37, 44, 35, 34, 38, 48, 35, 34, 38, 48, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35, + 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, + 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, + 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, + 48, 48, 48, 49, + /* Size 32x8 */ + 32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31, 32, + 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, + 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, + 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, + 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, + 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, + 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, + 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40, 33, 33, 33, 35, + 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, + 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, + 35, 34, 34, 36, 38, 38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, + 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, + 39, 39, 46, 49 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52, + /* Size 8x8 */ + 31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30, 31, + 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37, 39, 42, + 46, 47, 47, 47, 39, 40, 41, 44, 47, 47, 48, 49, 42, 42, 42, 45, 47, 48, + 48, 50, 48, 47, 46, 47, 47, 49, 50, 53, + /* Size 16x16 */ + 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31, 31, + 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, 31, 31, + 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31, 31, 31, 31, + 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31, 32, 32, 34, 35, + 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, + 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44, + 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46, + 47, 47, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 36, 
38, + 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 40, 41, 41, 41, + 42, 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42, + 44, 45, 46, 47, 47, 48, 48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46, + 46, 47, 47, 49, 49, 50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, + 48, 50, 50, 51, 53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, + 50, 51, 53, 53, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34, 36, + 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37, 37, 37, 39, + 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, + 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, + 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, + 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, + 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42, + 42, 42, 44, 46, 47, 47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46, + 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, + 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, + 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41, + 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42, 43, 43, 43, 43, + 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, + 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, + 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, + 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, + 45, 45, 46, 47, 47, 47, 47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, + 37, 38, 40, 40, 40, 40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, + 47, 47, 47, 47, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, + 42, 42, 44, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38, 39, 39, + 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, + 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 49, 49, + 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, + 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 41, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, + 47, 48, 48, 48, 48, 48, 49, 50, 
50, 50, 50, 50, 41, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, + 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, + 50, 50, 50, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, + 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51, + 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, + 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47, + 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, + 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, + 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, + 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, + /* Size 4x8 */ + 31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37, 38, + 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53, + /* Size 8x4 */ + 31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37, 39, + 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53, + /* Size 8x16 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31, 31, + 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31, 31, 32, + 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34, 34, 35, 35, + 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39, 40, 40, 42, 43, + 44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, + 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, + 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, + 53, 53, + /* Size 16x8 */ + 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, + 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32, 32, 35, + 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35, 37, 42, 42, + 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, + 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, + 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, + 52, 53, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39, + 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, + 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, + 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, + 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, + 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, + 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, + 44, 44, 45, 46, 47, 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, + 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, + 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 
39, 39, 40, 40, 40, 41, 42, 43, + 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, + 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41, 41, 41, 42, + 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, + 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, + 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, + 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, + 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, + 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, + 50, 50, 51, 52, 53, 53, 53, 53, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, 31, + 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31, 31, 31, + 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, + 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, + 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38, + 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, 34, 37, 39, 39, 39, 41, + 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, 38, 39, 39, 39, 42, 44, 46, + 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, + 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, + 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, + 33, 33, 36, 39, 41, 41, 41, 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, + 37, 40, 42, 42, 42, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, + 43, 43, 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, + 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, + 46, 47, 47, 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, + 47, 47, 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, + 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, + 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, + 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, + 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, 42, 44, 46, + 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, + 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, + 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50, + 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, 47, 47, 49, 50, 51, 51, 51, + 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, 48, 49, 51, 52, 52, 52, 49, 48, + 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, + 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, + 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, + 47, 47, 47, 49, 52, 53, 53, 53, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, + 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 37, 38, 38, 39, + 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 46, 46, 46, + 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, + /* Size 16x4 */ + 31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31, 32, + 40, 46, 31, 32, 40, 46, 
34, 35, 42, 47, 34, 36, 43, 47, 36, 37, 44, 47, + 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47, 50, 44, 44, + 47, 51, 48, 46, 48, 53, 48, 46, 48, 53, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, + 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, + 47, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, + 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45, + 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, + 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, + 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, + 53, 53, 53, 53, + /* Size 32x8 */ + 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31, 31, + 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, + 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, + 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, + 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, + 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, + 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, + 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, 41, 42, 42, 44, + 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, + 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, + 47, 46, 46, 46, 48, 48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, + 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, + 47, 47, 52, 53 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35, + /* Size 8x8 */ + 31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, + 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 34, 34, + 35, 36, 33, 33, 33, 33, 35, 35, 36, 38, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 
35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 35, 36, 36, 36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, + 37, 37, 38, 39, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, + 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, + 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, + 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, + /* Size 4x8 */ + 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36, + /* Size 8x4 */ + 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, + 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, + 36, 38, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, + 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, + 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, + 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, + 37, 38, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 
33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, + 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, + 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, + 37, 37, 37, 37, 38, 38, 39, 39, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, + 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, + 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, + 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, + 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 36, 38, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 35, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, + 35, 36, 37, 37, 37, 37, 38, 39, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, + /* Size 16x4 */ + 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32, 33, 34, + 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34, 35, 32, 33, + 34, 35, 33, 33, 35, 36, 34, 34, 36, 37, + /* Size 
8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, + 36, 37, 38, 38, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, + 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, + 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, + 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, + 33, 34, 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, + 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, + 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35, + 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, + 33, 33, 34, 36, 36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, + 35, 37, 37, 38 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47, + /* Size 8x8 */ + 31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, + 31, 31, 35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34, 35, 35, + 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39, 40, 43, 44, + 47, 47, 40, 41, 41, 42, 44, 45, 47, 48, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, + 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, + 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, + 41, 43, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 33, 34, + 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 35, 36, 37, 37, + 37, 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40, + 42, 43, 43, 46, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, + 44, 46, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 
43, 44, 45, 45, 46, + 47, 47, 48, 48, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34, 34, 34, 35, + 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37, + 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36, 38, 38, 38, 38, 38, 40, 41, + 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, + 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, + 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, + 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39, + 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, + 35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, + 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, + 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, + 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, + 36, 37, 38, 40, 40, 40, 40, 41, 41, 42, 43, 43, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41, + 41, 41, 41, 42, 42, 43, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43, + 43, 44, 44, 44, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, + 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, + 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, + 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, + 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, + 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44, 44, 44, 44, 45, 45, + 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, + 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43, + 43, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, + 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, + 40, 41, 42, 43, 
43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, + 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, + 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, + 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41, 41, + 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47, + 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, + 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, + 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, + /* Size 4x8 */ + 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35, 36, + 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47, + /* Size 8x4 */ + 31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34, 36, + 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 31, 31, 31, 31, 32, 32, + 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 35, 35, 35, + 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, + 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, + 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, + 47, 48, + /* Size 16x8 */ + 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, + 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, + 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, + 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35, 37, 42, 42, 43, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, + 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, + 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, + 47, 48, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, + 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, + 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42, + 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, + 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, + 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, + 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, + 42, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, + 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40, + 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38, 38, + 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, + 45, 47, 47, 47, 47, 47, 
47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, + 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, + 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, + 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, + 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, + 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, + 47, 47, 47, 47, 48, 48, 48, 48, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, + 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, + 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, + 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 39, 39, 39, 39, 40, 42, + 30, 31, 31, 32, 32, 32, 32, 32, 34, 37, 39, 39, 39, 39, 40, 42, 30, 31, + 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, + 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, + 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, + 35, 37, 40, 40, 40, 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, + 40, 40, 40, 40, 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, + 41, 41, 42, 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, + 43, 44, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, + 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, + 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35, + 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, 37, 37, + 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38, 38, 38, 39, + 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, 39, 40, 42, 44, + 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, + 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, + 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, + 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 39, 39, + 40, 41, 41, 41, 41, 42, 43, 45, 47, 47, 47, 47, 47, 48, 40, 41, 41, 42, + 42, 42, 42, 42, 44, 45, 47, 47, 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, + 43, 43, 44, 46, 47, 47, 47, 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, + 44, 46, 47, 47, 47, 47, 48, 48, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31, 31, + 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35, 36, 36, + 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, + /* Size 16x4 */ + 31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31, 32, + 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35, 40, 42, + 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45, 47, 38, 40, + 45, 47, 39, 41, 45, 47, 42, 43, 46, 47, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, + 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, + 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, + 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, + 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 37, 37, + 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, + 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, + 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, + 47, 47, 48, 48, + /* Size 32x8 */ + 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31, 31, + 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, + 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, + 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, 34, 39, 39, 40, + 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, + 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, + 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, + 41, 42, 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, + 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, 35, 37, 38, 38, + 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47, + 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, + 42, 42, 44, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, + 44, 47, 47, 48 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + /* Size 4x8 */ + 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + /* Size 8x4 */ + 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, + 33, 34, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 34, 34, 34, 34, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + /* Size 16x4 */ + 31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, + 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, + 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, + 32, 32, 33, 34 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39, + /* Size 8x8 */ + 31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31, 31, + 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31, 31, 31, + 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34, 35, 35, 36, + 39, 39, 33, 34, 34, 35, 35, 36, 39, 39, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, + 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, + 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 30, 30, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, + 38, 39, 39, 39, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, + 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 
31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 32, 32, 33, 34, + 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, + 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, + 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, + 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 35, + 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, + 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, + 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, + 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 36, 36, 36, 36, 36, + 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, + 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, + 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 
34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, + 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, + 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, + /* Size 4x8 */ + 31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31, 31, + 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40, + /* Size 8x4 */ + 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, + 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, + 41, 41, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, + 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, + 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, + 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, + 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, + 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, + 38, 41, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, + 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, + 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, + 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37, + 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, + 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, + 36, 36, 36, 
36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 35, 35, + 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, + 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, 37, 37, 37, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, + 42, 43, 43, 43, 43, 43, 43, 44, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, + 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, + 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 33, 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 33, 34, 36, 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, + 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, + 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, 32, 33, + 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, 33, 33, 33, + 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, + 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, + 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, + 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, + 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, + 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, + 36, 36, 36, 38, 39, 40, 42, 44, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35, 35, 35, 35, + 36, 36, 36, 36, 36, 37, 39, 40, 40, 40, + /* Size 16x4 */ + 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 31, + 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, + 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35, 39, 34, 36, + 36, 40, 34, 36, 36, 40, 34, 36, 36, 40, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, + 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, + 36, 36, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, + 41, 41, 41, 42, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31, 31, + 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, + 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, + 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, + 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, + 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, + 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, + 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, 31, 32, 33, 33, + 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35, + 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, + 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, + 36, 36, 39, 42 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x8 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 
31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + /* 
Size 4x8 */ + 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, + 32, 32, 31, 31, 32, 32, 30, 31, 32, 32, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 
32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, + 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, + 32, 32, 32, 32 }, + }, +}; + +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { + { + { /* Luma */ + /* Size 4x4 */ + 32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5, + /* Size 8x8 */ + 32, 32, 27, 20, 15, 12, 11, 9, 32, 29, 26, 21, 16, 13, 12, 10, 27, 26, + 19, 16, 13, 11, 10, 10, 20, 21, 16, 12, 11, 9, 9, 8, 15, 16, 13, 11, 9, + 8, 7, 7, 12, 13, 11, 9, 8, 7, 6, 6, 11, 12, 10, 9, 7, 6, 6, 5, 9, 10, + 10, 8, 7, 6, 5, 5, + /* Size 16x16 */ + 32, 33, 33, 30, 28, 23, 21, 17, 16, 13, 12, 11, 11, 10, 9, 9, 33, 32, + 32, 31, 30, 25, 23, 19, 17, 14, 14, 12, 11, 11, 10, 9, 33, 32, 31, 29, + 28, 24, 23, 19, 17, 14, 14, 13, 12, 11, 10, 10, 30, 31, 29, 26, 24, 22, + 20, 18, 16, 14, 13, 13, 12, 11, 11, 10, 28, 30, 28, 24, 21, 19, 18, 16, + 15, 13, 13, 12, 11, 11, 10, 10, 23, 25, 24, 22, 19, 16, 15, 14, 13, 11, + 11, 11, 10, 10, 9, 9, 21, 23, 23, 20, 18, 15, 14, 13, 12, 11, 10, 10, 9, + 9, 9, 9, 17, 19, 19, 18, 16, 14, 13, 11, 10, 9, 9, 9, 9, 8, 8, 8, 16, + 17, 17, 16, 15, 13, 12, 10, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, + 11, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 7, + 7, 7, 7, 7, 6, 6, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 7, 6, 6, 6, 6, 6, + 11, 11, 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 10, 11, 11, 11, + 11, 10, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 8, 7, + 6, 6, 5, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 5, 5, 4, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 30, 29, 28, 26, 23, 22, 21, 19, 17, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, + 32, 30, 30, 29, 27, 24, 23, 22, 20, 18, 17, 17, 15, 13, 13, 13, 12, 12, + 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, 32, 31, 30, 30, 28, + 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 11, 11, 11, 10, + 10, 9, 9, 9, 33, 32, 32, 32, 31, 31, 30, 29, 29, 27, 25, 24, 23, 21, 19, + 18, 17, 16, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, + 32, 32, 31, 31, 30, 29, 28, 28, 26, 24, 23, 23, 20, 19, 18, 17, 16, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 32, 32, 32, 31, 30, + 29, 28, 28, 27, 26, 24, 23, 22, 21, 19, 19, 18, 16, 15, 15, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 
12, 11, + 11, 11, 10, 10, 9, 29, 30, 30, 29, 28, 28, 25, 24, 23, 22, 20, 20, 19, + 18, 17, 16, 16, 15, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 26, 27, 28, + 27, 26, 26, 23, 22, 20, 19, 18, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12, + 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 23, 24, 25, 25, 24, 24, 22, + 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 22, 23, 24, 24, 23, 23, 21, 20, 19, 17, 16, 15, + 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 8, 21, 22, 23, 23, 23, 22, 20, 19, 18, 17, 15, 15, 14, 13, 13, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 19, 20, 21, 21, 20, + 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, + 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, + 13, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 17, + 17, 18, 18, 18, 19, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 9, + 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 16, 17, 17, 17, 17, 18, 16, 16, + 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 7, 7, 14, 15, 16, 16, 16, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, + 9, 9, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, + 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 13, 14, 14, + 14, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 6, 6, 6, 6, 6, 12, 12, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, + 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, + 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, + 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 11, 11, 10, 10, 10, 9, + 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 10, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, + 5, 5, 5, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, + 10, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, + 5, 5, 5, 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4, 9, 9, 9, 10, 10, 10, 10, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, + 4, 4, 8, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, + /* Size 4x8 */ + 32, 31, 28, 21, 16, 13, 11, 10, 24, 24, 18, 14, 12, 11, 10, 9, 14, 15, + 12, 10, 8, 7, 7, 7, 11, 12, 11, 9, 8, 7, 6, 5, + /* Size 8x4 */ + 32, 24, 14, 11, 31, 24, 15, 12, 28, 18, 12, 11, 21, 14, 10, 9, 16, 12, + 8, 8, 13, 11, 7, 7, 11, 10, 7, 6, 10, 9, 7, 5, + /* Size 8x16 */ + 32, 33, 32, 30, 28, 23, 21, 18, 16, 13, 12, 11, 11, 10, 9, 9, 32, 31, + 30, 28, 27, 24, 22, 19, 18, 15, 14, 13, 12, 11, 10, 10, 28, 30, 28, 24, + 21, 19, 18, 16, 15, 13, 13, 12, 11, 10, 10, 10, 19, 21, 20, 19, 17, 14, + 13, 12, 11, 10, 10, 10, 10, 9, 9, 9, 16, 17, 17, 16, 15, 13, 12, 10, 10, + 9, 8, 8, 8, 8, 7, 8, 12, 13, 13, 13, 12, 
11, 10, 9, 8, 7, 7, 7, 7, 7, 6, + 7, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 12, + 12, 11, 11, 10, 9, 8, 8, 7, 6, 6, 6, 5, 5, + /* Size 16x8 */ + 32, 32, 28, 19, 16, 12, 11, 10, 33, 31, 30, 21, 17, 13, 12, 11, 32, 30, + 28, 20, 17, 13, 12, 12, 30, 28, 24, 19, 16, 13, 13, 12, 28, 27, 21, 17, + 15, 12, 12, 11, 23, 24, 19, 14, 13, 11, 11, 11, 21, 22, 18, 13, 12, 10, + 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 16, 18, 15, 11, 10, 8, 8, 8, 13, + 15, 13, 10, 9, 7, 8, 8, 12, 14, 13, 10, 8, 7, 7, 7, 11, 13, 12, 10, 8, + 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 6, 9, 10, + 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, + 31, 30, 30, 30, 28, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, + 12, 12, 11, 11, 11, 10, 10, 9, 9, 32, 32, 31, 31, 30, 29, 28, 28, 27, + 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, 23, 21, 21, 20, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 23, 24, 25, + 25, 24, 24, 21, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 19, 18, + 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, + 9, 9, 9, 9, 9, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, 13, 12, + 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 16, 17, 17, + 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, + 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 13, 14, 14, 14, 14, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 12, 12, 13, 13, 13, 14, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 11, 12, 12, 13, 13, 13, 13, 12, 12, + 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, + 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, + 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, + 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, + 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, + 5, 5, 5, 5, + /* Size 32x16 */ + 32, 33, 32, 30, 28, 23, 19, 17, 16, 13, 12, 11, 11, 11, 10, 10, 33, 32, + 32, 30, 29, 24, 20, 18, 17, 14, 12, 12, 12, 11, 11, 11, 33, 32, 31, 31, + 30, 25, 21, 19, 17, 14, 13, 12, 12, 11, 11, 11, 33, 32, 31, 30, 29, 25, + 21, 19, 17, 14, 13, 13, 12, 12, 11, 11, 32, 32, 30, 29, 28, 24, 20, 19, + 17, 14, 13, 13, 12, 12, 12, 11, 32, 31, 29, 28, 27, 24, 21, 19, 18, 15, + 14, 13, 12, 12, 12, 11, 30, 30, 28, 26, 24, 21, 19, 18, 16, 14, 13, 13, + 13, 12, 12, 11, 29, 30, 28, 25, 23, 20, 18, 17, 16, 13, 12, 12, 12, 12, + 12, 11, 28, 30, 27, 24, 21, 19, 17, 16, 15, 13, 12, 12, 12, 12, 11, 11, + 26, 28, 26, 23, 20, 18, 16, 15, 14, 12, 12, 12, 11, 11, 11, 11, 23, 25, + 24, 21, 19, 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 11, 22, 24, 23, 21, + 19, 16, 14, 13, 12, 11, 10, 10, 10, 10, 10, 10, 21, 23, 22, 20, 18, 15, + 13, 13, 12, 11, 10, 10, 10, 10, 10, 10, 19, 21, 20, 19, 17, 14, 12, 12, + 11, 10, 9, 10, 10, 9, 10, 9, 18, 19, 19, 18, 
16, 14, 12, 11, 10, 9, 9, + 9, 9, 9, 9, 9, 17, 18, 18, 17, 16, 13, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9, + 16, 17, 18, 16, 15, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 15, + 14, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 10, 9, + 9, 8, 7, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 11, 10, 9, 9, 8, 7, 7, 7, 7, + 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, + 13, 12, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 10, 10, 9, + 8, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 6, 6, 6, + 6, 6, 11, 12, 12, 12, 11, 10, 10, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 12, + 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, + 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, + 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 10, 10, 10, + 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 8, 7, 7, + 6, 6, 5, 5, 5, 8, 9, 9, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, + /* Size 4x16 */ + 33, 32, 32, 30, 30, 25, 23, 19, 17, 14, 14, 12, 12, 11, 10, 9, 23, 25, + 24, 21, 19, 16, 15, 14, 13, 11, 11, 10, 10, 10, 9, 9, 13, 14, 14, 14, + 13, 11, 11, 9, 9, 8, 8, 7, 7, 7, 7, 7, 11, 11, 12, 12, 12, 11, 10, 9, 8, + 8, 7, 6, 6, 6, 5, 5, + /* Size 16x4 */ + 33, 23, 13, 11, 32, 25, 14, 11, 32, 24, 14, 12, 30, 21, 14, 12, 30, 19, + 13, 12, 25, 16, 11, 11, 23, 15, 11, 10, 19, 14, 9, 9, 17, 13, 9, 8, 14, + 11, 8, 8, 14, 11, 8, 7, 12, 10, 7, 6, 12, 10, 7, 6, 11, 10, 7, 6, 10, 9, + 7, 5, 9, 9, 7, 5, + /* Size 8x32 */ + 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 32, 32, 31, 31, 30, + 29, 28, 28, 27, 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 28, 29, 30, 29, 28, 27, 24, 23, 21, + 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 16, + 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 12, 12, 13, 13, 13, 14, 13, 12, 12, + 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, + 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, + 5, 5, + /* Size 32x8 */ + 32, 32, 28, 19, 16, 12, 11, 10, 33, 32, 29, 20, 17, 12, 12, 11, 33, 31, + 30, 21, 17, 13, 12, 11, 33, 31, 29, 21, 17, 13, 12, 11, 32, 30, 28, 20, + 17, 13, 12, 12, 32, 29, 27, 21, 18, 14, 12, 12, 30, 28, 24, 19, 16, 13, + 13, 12, 29, 28, 23, 18, 16, 12, 12, 12, 28, 27, 21, 17, 15, 12, 12, 11, + 26, 26, 20, 16, 14, 12, 11, 11, 23, 24, 19, 14, 13, 11, 11, 11, 22, 23, + 19, 14, 12, 10, 10, 10, 21, 22, 18, 13, 12, 10, 10, 10, 19, 20, 17, 12, + 11, 9, 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 17, 18, 16, 12, 10, 9, 9, 9, + 16, 18, 15, 11, 10, 8, 8, 8, 14, 16, 14, 11, 9, 8, 8, 8, 13, 15, 13, 10, + 9, 7, 8, 8, 13, 14, 13, 10, 9, 7, 7, 7, 12, 14, 13, 10, 8, 7, 7, 7, 12, + 13, 12, 9, 8, 7, 7, 7, 11, 13, 12, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7, + 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 10, + 9, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, 7, 6, 6, 5, 9, + 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 8, 9, 10, 9, 8, 7, 6, + 5 }, + { /* Chroma */ + /* Size 4x4 */ + 29, 22, 18, 16, 22, 17, 15, 14, 
18, 15, 11, 11, 16, 14, 11, 9, + /* Size 8x8 */ + 33, 27, 22, 20, 18, 16, 15, 14, 27, 22, 22, 22, 20, 18, 17, 15, 22, 22, + 19, 18, 17, 16, 15, 15, 20, 22, 18, 16, 14, 13, 14, 14, 18, 20, 17, 14, + 12, 12, 12, 12, 16, 18, 16, 13, 12, 11, 11, 11, 15, 17, 15, 14, 12, 11, + 10, 10, 14, 15, 15, 14, 12, 11, 10, 9, + /* Size 16x16 */ + 32, 34, 31, 25, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 34, 32, + 29, 24, 22, 23, 22, 21, 20, 18, 18, 17, 16, 15, 15, 14, 31, 29, 26, 23, + 22, 23, 22, 21, 20, 18, 18, 17, 17, 16, 16, 15, 25, 24, 23, 21, 20, 21, + 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 21, 22, 22, 20, 19, 19, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 23, 21, 19, 18, 17, 17, 16, 15, + 15, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, 17, 16, 15, 14, 14, 14, + 14, 14, 14, 14, 19, 21, 21, 20, 19, 17, 16, 14, 14, 13, 13, 13, 13, 13, + 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, + 16, 18, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 18, + 18, 18, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, + 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 15, 16, 17, 17, 16, 15, + 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, + 12, 12, 11, 10, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 9, 9, 9, 13, 14, 15, 15, 16, 15, 14, 13, 12, 12, 11, 10, 10, + 9, 9, 9, + /* Size 32x32 */ + 32, 33, 34, 32, 31, 28, 25, 23, 21, 21, 21, 20, 20, 20, 19, 18, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 33, 31, + 30, 27, 24, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 34, 33, 32, 31, 29, 26, 24, 23, + 22, 23, 23, 23, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 15, 14, 14, 14, 32, 31, 31, 29, 28, 25, 24, 23, 22, 22, 23, 22, + 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, + 15, 15, 31, 30, 29, 28, 26, 24, 23, 22, 22, 22, 23, 22, 22, 22, 21, 20, + 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 28, 27, + 26, 25, 24, 22, 22, 22, 21, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 25, 24, 24, 24, 23, 22, + 21, 21, 20, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 17, 16, 16, 16, 15, 15, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, + 16, 16, 16, 16, 21, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 21, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 21, 22, 23, 23, + 23, 23, 21, 20, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 22, 23, 22, 22, 22, 21, 20, + 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, + 14, 14, 14, 14, 14, 14, 20, 21, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 20, 20, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, + 15, 14, 14, 14, 14, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 13, 19, 20, + 21, 21, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 21, + 20, 19, 18, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, + 13, 13, 13, 13, 13, 13, 13, 12, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, + 16, 16, 15, 
15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 17, 18, 19, 19, 19, 20, 19, 18, 18, 17, 16, 15, 15, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 18, 18, 18, 19, 18, 17, + 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 15, 16, + 16, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 16, 16, 15, + 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 16, 16, 15, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 10, 14, + 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, + 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 14, 14, 14, 15, 15, 15, 16, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 9, 9, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 13, 14, 14, + 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, + 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + /* Size 4x8 */ + 33, 26, 22, 21, 19, 17, 16, 15, 22, 23, 18, 17, 16, 15, 15, 14, 17, 19, + 16, 14, 12, 11, 11, 12, 16, 17, 16, 14, 12, 11, 10, 10, + /* Size 8x4 */ + 33, 22, 17, 16, 26, 23, 19, 17, 22, 18, 16, 16, 21, 17, 14, 14, 19, 16, + 12, 12, 17, 15, 11, 11, 16, 15, 11, 10, 15, 14, 12, 10, + /* Size 8x16 */ + 32, 34, 31, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 28, 26, + 24, 22, 21, 22, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 21, 22, 22, 20, + 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 20, 21, 22, 20, 19, 17, + 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 18, 20, 20, 19, 18, 16, 15, 14, + 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 17, 17, 15, 14, 13, 12, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 14, 13, 13, 12, 11, 10, + 10, 10, 10, 10, 14, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, + 10, 9, + /* Size 16x8 */ + 32, 28, 21, 20, 18, 16, 15, 14, 34, 26, 22, 21, 20, 17, 16, 16, 31, 24, + 22, 22, 20, 17, 17, 16, 24, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, + 18, 17, 17, 17, 21, 22, 19, 17, 16, 15, 16, 16, 20, 22, 19, 16, 15, 14, + 14, 15, 19, 21, 19, 15, 14, 13, 13, 14, 18, 20, 18, 15, 13, 12, 13, 13, + 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, 17, 14, 12, 11, 11, 12, 15, 17, + 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, + 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 13, 15, 15, 14, 12, 11, + 10, 9, + /* Size 16x32 */ + 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 
13, 33, 33, 32, 30, + 28, 26, 24, 23, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 28, 27, 26, 25, 24, 22, 22, 22, + 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 15, 15, 15, 15, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, + 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, + 16, 15, 21, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 16, 16, 16, 16, 15, 15, 21, 22, + 23, 23, 22, 23, 21, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, + 15, 15, 14, 15, 15, 15, 15, 15, 15, 14, 14, 14, 20, 20, 21, 21, 22, 22, + 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 19, 20, 20, 21, 21, 21, 20, 19, 19, 17, + 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, + 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, + 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 12, 12, 11, 12, 12, 12, 12, 12, 11, 16, 16, 17, 17, + 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 11, 10, + 10, 10, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, + 16, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, + 10, 10, 10, 10, 9, 9, 9, + /* Size 32x16 */ + 32, 33, 28, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 14, 14, 33, 33, + 27, 24, 22, 22, 20, 20, 19, 17, 16, 16, 16, 16, 15, 15, 34, 32, 26, 24, + 22, 23, 21, 20, 20, 18, 17, 17, 16, 16, 16, 15, 32, 30, 25, 23, 22, 23, + 21, 21, 20, 18, 17, 17, 17, 16, 16, 16, 31, 28, 24, 23, 22, 22, 22, 21, + 20, 18, 17, 17, 17, 17, 16, 16, 28, 26, 22, 22, 22, 23, 22, 21, 20, 19, + 18, 18, 17, 17, 17, 16, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 17, 18, + 17, 17, 17, 16, 23, 23, 22, 21, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, + 17, 17, 21, 22, 21, 20, 19, 19, 19, 19, 18, 17, 17, 16, 17, 16, 17, 17, + 21, 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 16, 21, 23, + 22, 21, 19, 18, 17, 17, 16, 15, 15, 15, 16, 16, 16, 16, 21, 22, 22, 21, + 19, 17, 17, 16, 16, 15, 14, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, + 16, 16, 15, 14, 14, 14, 14, 15, 15, 15, 20, 21, 22, 20, 19, 17, 16, 15, + 14, 14, 13, 14, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 15, 14, 14, 13, + 13, 13, 13, 14, 14, 14, 19, 20, 21, 20, 18, 16, 15, 14, 14, 13, 12, 13, + 13, 13, 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 13, 13, + 13, 13, 17, 19, 20, 19, 18, 16, 14, 14, 13, 12, 12, 12, 12, 12, 13, 13, + 16, 18, 19, 18, 17, 15, 14, 13, 12, 12, 11, 12, 12, 12, 12, 13, 16, 18, + 19, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 17, 18, 18, + 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 12, 12, 15, 17, 18, 17, 16, 15, + 13, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 14, 14, 13, + 12, 11, 11, 11, 10, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 12, 12, + 11, 10, 
10, 10, 11, 11, 15, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, + 10, 10, 10, 11, 14, 16, 16, 17, 15, 15, 14, 13, 12, 11, 11, 10, 10, 10, + 10, 10, 14, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, + 14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, + 15, 16, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, + 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 16, 15, 14, + 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 15, 15, 14, 14, 13, + 13, 11, 11, 10, 10, 9, 9, 9, + /* Size 4x16 */ + 33, 32, 28, 24, 22, 23, 22, 20, 20, 18, 17, 17, 16, 16, 15, 15, 21, 23, + 22, 21, 19, 18, 17, 17, 16, 15, 15, 14, 15, 15, 15, 14, 16, 18, 18, 18, + 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 15, 16, 17, 17, 16, 16, + 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, + /* Size 16x4 */ + 33, 21, 16, 15, 32, 23, 18, 16, 28, 22, 18, 17, 24, 21, 18, 17, 22, 19, + 17, 16, 23, 18, 15, 16, 22, 17, 14, 15, 20, 17, 13, 14, 20, 16, 12, 13, + 18, 15, 12, 12, 17, 15, 11, 11, 17, 14, 11, 11, 16, 15, 12, 10, 16, 15, + 12, 10, 15, 15, 12, 10, 15, 14, 12, 10, + /* Size 8x32 */ + 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 28, 27, 26, 25, + 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 22, 22, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, + 16, 16, 16, 16, 15, 15, 20, 20, 21, 21, 22, 22, 20, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 16, 16, + 17, 17, 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, + 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, + 10, 9, 9, 9, + /* Size 32x8 */ + 32, 28, 21, 20, 18, 16, 15, 14, 33, 27, 22, 20, 19, 16, 16, 15, 34, 26, + 22, 21, 20, 17, 16, 16, 32, 25, 22, 21, 20, 17, 17, 16, 31, 24, 22, 22, + 20, 17, 17, 16, 28, 22, 22, 22, 20, 18, 17, 17, 24, 22, 20, 20, 19, 17, + 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, + 21, 22, 19, 18, 17, 16, 16, 16, 21, 22, 19, 17, 16, 15, 16, 16, 21, 22, + 19, 17, 16, 14, 15, 15, 20, 22, 19, 16, 15, 14, 14, 15, 20, 22, 19, 16, + 14, 13, 14, 14, 19, 21, 19, 15, 14, 13, 13, 14, 19, 21, 18, 15, 14, 12, + 13, 13, 18, 20, 18, 15, 13, 12, 13, 13, 17, 20, 18, 14, 13, 12, 12, 13, + 16, 19, 17, 14, 12, 11, 12, 12, 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, + 17, 14, 12, 11, 11, 12, 15, 18, 16, 13, 12, 11, 11, 11, 15, 17, 16, 14, + 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, + 10, 10, 14, 16, 15, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, + 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 14, 15, + 16, 14, 12, 11, 10, 9, 13, 15, 15, 14, 12, 11, 10, 9, 13, 15, 15, 14, + 13, 11, 10, 9 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6, + /* Size 8x8 */ + 32, 32, 28, 22, 17, 13, 11, 10, 32, 29, 26, 22, 18, 14, 12, 11, 28, 26, + 20, 17, 14, 12, 11, 10, 22, 22, 17, 14, 12, 10, 10, 9, 17, 18, 14, 12, + 10, 8, 8, 8, 13, 14, 12, 10, 8, 7, 7, 7, 11, 
12, 11, 10, 8, 7, 6, 6, 10, + 11, 10, 9, 8, 7, 6, 5, + /* Size 16x16 */ + 32, 33, 33, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 33, 32, + 32, 31, 30, 28, 23, 20, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, 31, 30, + 28, 26, 23, 20, 18, 16, 14, 13, 12, 12, 11, 10, 32, 31, 30, 28, 26, 24, + 22, 20, 18, 16, 14, 13, 13, 12, 11, 10, 28, 30, 28, 26, 21, 20, 18, 17, + 16, 14, 13, 12, 12, 11, 11, 10, 26, 28, 26, 24, 20, 19, 17, 16, 15, 13, + 12, 12, 11, 11, 10, 10, 22, 23, 23, 22, 18, 17, 15, 14, 13, 12, 11, 10, + 10, 10, 9, 9, 19, 20, 20, 20, 17, 16, 14, 12, 12, 11, 10, 9, 9, 9, 9, 8, + 17, 18, 18, 18, 16, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, + 16, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, + 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 10, 9, 9, 8, 7, 7, + 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, + 11, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, 10, 10, 11, 11, 11, + 10, 9, 9, 8, 8, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 10, 9, 8, 8, 7, 7, + 6, 6, 5, 5, 5, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, + 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 18, 17, 15, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 32, 31, 31, 30, + 28, 28, 25, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 31, 31, 30, 29, 28, 27, 25, 23, + 23, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 10, 33, 32, 32, 32, 31, 30, 30, 29, 28, 27, 26, 24, 23, 22, 20, 19, 18, + 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 32, 32, 32, + 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 21, 19, 19, 18, 16, 16, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, + 22, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, + 11, 11, 10, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, + 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 27, + 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 15, 14, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 26, 27, 28, 27, 26, + 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 10, 10, 10, 10, 10, 23, 24, 25, 25, 24, 24, 23, 22, 19, + 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, + 21, 22, 22, 23, 22, 22, 21, 20, 18, 17, 17, 15, 14, 14, 13, 13, 12, 12, + 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 20, 21, 20, + 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, + 9, 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, + 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 17, 18, 18, 18, 18, 19, 18, 17, 16, 15, 15, 13, 13, 12, 12, 11, 11, 10, + 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, 16, 16, 15, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 13, 14, 15, 15, + 15, 16, 15, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 
8, 8, 8, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 14, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, + 13, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 13, 13, 13, 14, 13, 13, 12, 12, + 12, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 7, 6, 6, 6, 6, 6, + 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 13, 13, 12, + 12, 11, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, + 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 10, 11, 11, 12, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + 5, 5, 5, 10, 11, 11, 11, 11, 11, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, + 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, + 5, 5, 5, 9, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, 10, 10, + 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, + 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, + /* Size 4x8 */ + 32, 31, 28, 22, 17, 14, 12, 10, 24, 24, 18, 15, 13, 11, 11, 10, 15, 16, + 13, 11, 9, 8, 8, 8, 12, 12, 12, 10, 8, 7, 6, 6, + /* Size 8x4 */ + 32, 24, 15, 12, 31, 24, 16, 12, 28, 18, 13, 12, 22, 15, 11, 10, 17, 13, + 9, 8, 14, 11, 8, 7, 12, 11, 8, 6, 10, 10, 8, 6, + /* Size 8x16 */ + 32, 33, 32, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 32, 32, + 30, 29, 28, 26, 23, 20, 18, 16, 15, 13, 12, 12, 11, 10, 28, 29, 28, 26, + 21, 20, 18, 17, 16, 14, 13, 12, 12, 11, 11, 10, 22, 23, 23, 22, 18, 17, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 17, 15, 14, 12, 11, 10, + 9, 9, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, + 7, 7, 11, 12, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 6, 11, 11, 12, + 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, + /* Size 16x8 */ + 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 32, 30, + 28, 23, 17, 14, 13, 12, 32, 29, 26, 22, 17, 14, 13, 12, 28, 28, 21, 18, + 15, 13, 12, 12, 26, 26, 20, 17, 14, 12, 11, 11, 22, 23, 18, 15, 12, 11, + 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 17, 18, 16, 13, 10, 9, 9, 9, 14, + 16, 14, 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, + 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, + 11, 9, 8, 7, 6, 6, 9, 10, 10, 9, 8, 7, 6, 5, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 33, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 18, 17, 16, 15, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 32, 32, 32, 31, 30, 29, 29, 28, + 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, + 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, + 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16, + 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 23, 24, + 25, 25, 24, 24, 23, 21, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, + 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, + 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 
13, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, + 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, + 8, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 7, 7, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 11, 11, + 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, + 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, + 7, 7, 7, 6, 6, 7, 7, 6, 6, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, + 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, 12, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, + 5, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, + 8, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, + /* Size 32x16 */ + 32, 33, 32, 32, 28, 23, 22, 19, 16, 14, 13, 12, 11, 11, 11, 10, 33, 32, + 32, 31, 29, 24, 23, 20, 17, 15, 14, 12, 12, 12, 11, 11, 33, 32, 32, 31, + 29, 25, 23, 21, 17, 15, 14, 13, 12, 12, 11, 11, 33, 32, 31, 31, 29, 25, + 23, 21, 17, 16, 14, 13, 12, 12, 12, 11, 32, 32, 30, 30, 28, 24, 23, 20, + 17, 16, 14, 13, 13, 12, 12, 11, 32, 31, 29, 28, 27, 24, 23, 21, 18, 16, + 15, 13, 13, 12, 12, 12, 32, 31, 29, 28, 26, 23, 22, 20, 17, 16, 14, 13, + 13, 13, 12, 12, 30, 30, 28, 27, 24, 21, 20, 19, 16, 15, 14, 13, 12, 13, + 12, 12, 28, 30, 28, 26, 21, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, + 27, 28, 26, 25, 21, 18, 18, 16, 14, 13, 13, 12, 12, 12, 11, 11, 26, 28, + 26, 24, 20, 18, 17, 16, 14, 13, 12, 11, 11, 11, 11, 11, 23, 25, 24, 23, + 19, 16, 16, 14, 13, 12, 11, 11, 11, 11, 11, 10, 22, 23, 23, 22, 18, 16, + 15, 14, 12, 11, 11, 10, 10, 10, 10, 10, 21, 22, 22, 21, 18, 15, 14, 13, + 12, 11, 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 14, 14, 12, 11, 10, + 10, 9, 9, 10, 9, 10, 18, 19, 19, 19, 16, 14, 13, 12, 10, 10, 9, 9, 9, 9, + 9, 9, 17, 18, 18, 18, 16, 13, 13, 12, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, + 17, 17, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 12, + 12, 11, 9, 9, 8, 8, 8, 8, 8, 8, 13, 15, 15, 15, 13, 12, 11, 10, 9, 8, 8, + 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 8, + 12, 14, 14, 14, 13, 11, 11, 10, 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, + 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 10, 9, 8, + 8, 7, 7, 7, 7, 7, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, + 6, 11, 12, 12, 12, 11, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 12, 12, + 12, 11, 11, 9, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 9, 9, + 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, + 6, 6, 10, 10, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 9, 10, 10, + 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 9, 9, 8, 8, + 7, 7, 6, 6, 5, 5, 5, + /* Size 4x16 */ + 33, 32, 32, 31, 30, 28, 23, 21, 18, 16, 14, 13, 12, 12, 11, 10, 23, 25, + 24, 23, 19, 18, 16, 14, 13, 12, 11, 11, 11, 11, 10, 9, 14, 15, 16, 16, + 14, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 7, 11, 12, 12, 13, 12, 11, 10, 10, + 9, 8, 7, 7, 6, 6, 6, 6, + /* Size 16x4 */ + 33, 23, 14, 11, 32, 25, 15, 12, 32, 24, 16, 12, 31, 23, 16, 13, 30, 19, + 14, 12, 28, 18, 13, 11, 23, 16, 11, 10, 21, 14, 10, 10, 18, 13, 10, 9, + 16, 12, 9, 8, 14, 11, 8, 7, 13, 11, 8, 7, 12, 11, 8, 6, 
12, 11, 8, 6, + 11, 10, 8, 6, 10, 9, 7, 6, + /* Size 8x32 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 32, 32, 32, 31, + 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 28, 29, 29, 29, 28, 27, 26, 24, + 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 11, 10, 10, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 9, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, + 14, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, + 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, + 6, 6, 6, 6, 5, 5, 5, + /* Size 32x8 */ + 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 33, 32, + 29, 23, 17, 14, 12, 11, 33, 31, 29, 23, 17, 14, 12, 12, 32, 30, 28, 23, + 17, 14, 13, 12, 32, 29, 27, 23, 18, 15, 13, 12, 32, 29, 26, 22, 17, 14, + 13, 12, 30, 28, 24, 20, 16, 14, 12, 12, 28, 28, 21, 18, 15, 13, 12, 12, + 27, 26, 21, 18, 14, 13, 12, 11, 26, 26, 20, 17, 14, 12, 11, 11, 23, 24, + 19, 16, 13, 11, 11, 11, 22, 23, 18, 15, 12, 11, 10, 10, 21, 22, 18, 14, + 12, 11, 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 18, 19, 16, 13, 10, 9, 9, + 9, 17, 18, 16, 13, 10, 9, 9, 9, 16, 17, 15, 12, 10, 9, 8, 8, 14, 16, 14, + 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, + 12, 14, 13, 11, 8, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 12, 13, 12, 10, + 8, 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 11, 12, 11, 10, 9, 7, 6, 6, 10, + 12, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, + 6, 6, 10, 11, 11, 9, 8, 7, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, + 8, 7, 6, 5 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 23, 18, 16, 23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10, + /* Size 8x8 */ + 33, 27, 22, 21, 19, 17, 16, 15, 27, 22, 22, 22, 20, 19, 17, 16, 22, 22, + 19, 19, 18, 16, 16, 16, 21, 22, 19, 17, 15, 14, 14, 14, 19, 20, 18, 15, + 13, 12, 12, 12, 17, 19, 16, 14, 12, 11, 11, 11, 16, 17, 16, 14, 12, 11, + 10, 10, 15, 16, 16, 14, 12, 11, 10, 9, + /* Size 16x16 */ + 32, 34, 31, 27, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 34, 33, + 29, 25, 22, 22, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 31, 29, 26, 23, + 22, 22, 22, 22, 20, 19, 18, 18, 17, 17, 16, 15, 27, 25, 23, 22, 21, 21, + 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 21, 22, 22, 21, 19, 19, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 21, 22, 22, 21, 19, 19, 18, 18, 17, 17, + 16, 16, 15, 16, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 14, 20, 21, 22, 21, 19, 18, 16, 16, 15, 14, 14, 13, 14, 13, + 13, 13, 19, 20, 20, 20, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, + 17, 19, 19, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, + 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 12, 11, 16, 17, 18, 18, + 17, 16, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 15, + 14, 14, 13, 12, 11, 11, 10, 10, 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, + 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, 13, 12, + 12, 11, 10, 10, 10, 9, 14, 15, 15, 16, 16, 15, 14, 13, 13, 12, 11, 11, + 10, 10, 9, 9, + /* Size 32x32 */ + 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 
20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 33, 32, + 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 17, 17, 17, + 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 34, 33, 33, 32, 29, 26, 25, 24, + 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 14, 33, 32, 32, 31, 28, 26, 25, 24, 22, 22, 23, 23, + 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, + 15, 15, 31, 30, 29, 28, 26, 24, 23, 23, 22, 22, 22, 23, 22, 22, 22, 21, + 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 28, 27, + 26, 26, 24, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 27, 26, 25, 25, 23, 22, + 22, 21, 21, 21, 21, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, + 21, 21, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, + 17, 16, 16, 16, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 17, 17, 16, 16, 16, 16, 16, 16, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, + 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 23, + 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 16, 16, 15, 15, 15, 15, 15, 21, 22, 23, 23, 23, 23, 22, 21, + 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, + 21, 22, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, + 14, 14, 13, 13, 14, 13, 13, 14, 13, 13, 13, 14, 19, 20, 20, 21, 21, 21, + 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 19, 19, 20, 20, 20, 21, 20, 20, 18, 18, + 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 17, 18, 19, 19, 19, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 17, 18, 18, + 19, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, + 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, + 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 12, 11, 11, 12, 16, 17, 17, 18, 18, 19, 18, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 16, 16, 17, 17, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, + 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, + 17, 17, 17, 18, 18, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 11, 10, 10, 11, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, + 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, + 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 14, 15, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 
14, 14, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, 15, 16, + 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 14, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 9, 9, 14, 15, 15, 15, 15, 16, 16, 16, 16, 15, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 14, 14, 14, 15, 15, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, + 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + /* Size 4x8 */ + 33, 26, 22, 22, 20, 17, 16, 16, 22, 23, 19, 17, 16, 15, 16, 15, 18, 20, + 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 11, 10, 10, + /* Size 8x4 */ + 33, 22, 18, 16, 26, 23, 20, 17, 22, 19, 17, 16, 22, 17, 15, 14, 20, 16, + 13, 13, 17, 15, 12, 11, 16, 16, 12, 10, 16, 15, 12, 10, + /* Size 8x16 */ + 32, 34, 31, 26, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 29, 27, + 25, 22, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 16, 15, 21, 22, 22, 21, + 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 19, 18, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 18, 20, 20, 20, 18, 17, 16, 14, + 14, 13, 12, 12, 13, 13, 12, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, + 12, 11, 11, 12, 11, 12, 15, 16, 17, 18, 17, 16, 14, 14, 13, 12, 11, 11, + 11, 10, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, + 10, 10, + /* Size 16x8 */ + 32, 29, 21, 20, 18, 16, 15, 15, 34, 27, 22, 22, 20, 18, 16, 16, 31, 25, + 22, 22, 20, 18, 17, 16, 26, 22, 21, 22, 20, 19, 18, 17, 21, 21, 19, 19, + 18, 17, 17, 17, 21, 22, 19, 18, 17, 16, 16, 16, 20, 22, 19, 17, 16, 15, + 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 18, 16, 14, 13, 13, 13, + 17, 19, 18, 15, 13, 12, 12, 12, 16, 19, 17, 15, 12, 12, 11, 12, 16, 18, + 17, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, 11, 11, 15, 17, 16, 14, + 13, 12, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 13, 12, + 10, 10, + /* Size 16x32 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 32, 31, + 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 29, 28, 27, 27, 25, 23, 22, 22, + 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 15, 15, 27, 26, 26, 25, 23, 22, 22, 21, 21, 21, 21, 22, + 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, + 16, 16, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, + 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 15, 16, 16, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, + 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, + 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 14, + 14, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, + 17, 18, 19, 19, 19, 20, 19, 18, 17, 17, 17, 16, 15, 15, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 17, 17, 17, 18, 18, 17, + 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 
11, + 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, + 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 17, + 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 10, 10, 10, 10, 10, 10, 9, + /* Size 32x16 */ + 32, 33, 29, 27, 21, 21, 20, 20, 18, 17, 16, 15, 15, 15, 15, 14, 33, 33, + 28, 26, 22, 22, 21, 20, 19, 18, 17, 16, 16, 16, 16, 15, 34, 32, 27, 26, + 22, 23, 22, 21, 20, 19, 18, 17, 16, 16, 16, 15, 33, 31, 27, 25, 22, 23, + 22, 21, 20, 19, 18, 17, 17, 17, 16, 16, 31, 28, 25, 23, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 17, 16, 16, 28, 26, 23, 22, 22, 23, 22, 22, 20, 20, + 19, 18, 17, 17, 17, 17, 26, 25, 22, 22, 21, 22, 22, 21, 20, 19, 19, 18, + 18, 17, 17, 17, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 18, 17, 17, 17, + 17, 17, 21, 22, 21, 21, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 17, 17, + 21, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 22, + 22, 21, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, + 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 16, 15, 20, 22, 22, 21, 19, 17, + 17, 16, 16, 15, 15, 14, 14, 15, 15, 15, 20, 22, 22, 21, 19, 17, 17, 16, + 15, 15, 14, 14, 14, 14, 15, 14, 20, 21, 22, 21, 19, 17, 16, 16, 14, 14, + 14, 13, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 16, 15, 14, 13, 13, 13, + 13, 13, 14, 14, 19, 20, 21, 20, 18, 16, 16, 15, 14, 13, 13, 13, 13, 13, + 13, 14, 18, 20, 20, 20, 18, 16, 16, 15, 13, 13, 12, 12, 12, 13, 13, 13, + 17, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 13, 17, 18, + 19, 19, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 19, 18, + 17, 15, 15, 14, 12, 12, 12, 11, 11, 12, 12, 12, 16, 17, 18, 18, 17, 15, + 14, 14, 12, 12, 11, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 15, 14, 13, + 12, 12, 11, 11, 11, 11, 11, 12, 15, 17, 17, 18, 16, 15, 14, 13, 12, 12, + 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 13, 12, 11, 11, + 11, 10, 11, 11, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 11, 11, 10, 10, + 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, + 14, 16, 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, + 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, 16, 16, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 15, + 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 14, 14, 13, + 13, 12, 12, 11, 11, 10, 10, 9, + /* Size 4x16 */ + 33, 32, 28, 25, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 15, 21, 23, + 22, 22, 19, 18, 17, 17, 16, 16, 15, 15, 15, 16, 15, 15, 17, 19, 19, 19, + 17, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 16, + 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, + /* Size 16x4 */ + 33, 21, 17, 15, 32, 23, 19, 16, 28, 22, 19, 17, 25, 22, 19, 17, 22, 19, + 17, 17, 22, 18, 17, 16, 22, 17, 15, 15, 21, 17, 14, 14, 20, 16, 13, 13, + 19, 16, 12, 12, 18, 15, 12, 12, 17, 15, 12, 11, 17, 15, 12, 10, 16, 16, + 12, 10, 16, 15, 12, 10, 15, 15, 12, 10, + /* Size 8x32 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 29, 28, 27, 27, + 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 21, 22, 22, 22, 22, 22, 21, 
20, + 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, + 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, 16, 17, + 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, + 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, + 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 10, 10, 10, + /* Size 32x8 */ + 32, 29, 21, 20, 18, 16, 15, 15, 33, 28, 22, 21, 19, 17, 16, 16, 34, 27, + 22, 22, 20, 18, 16, 16, 33, 27, 22, 22, 20, 18, 17, 16, 31, 25, 22, 22, + 20, 18, 17, 16, 28, 23, 22, 22, 20, 19, 17, 17, 26, 22, 21, 22, 20, 19, + 18, 17, 24, 22, 20, 20, 19, 18, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, + 21, 22, 19, 19, 18, 17, 16, 16, 21, 22, 19, 18, 17, 16, 16, 16, 21, 23, + 19, 17, 16, 15, 15, 16, 20, 22, 19, 17, 16, 15, 14, 15, 20, 22, 19, 17, + 15, 14, 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 19, 16, 14, 13, + 13, 14, 19, 21, 18, 16, 14, 13, 13, 13, 18, 20, 18, 16, 13, 12, 12, 13, + 17, 19, 18, 15, 13, 12, 12, 12, 17, 19, 17, 15, 13, 12, 12, 12, 16, 19, + 17, 15, 12, 12, 11, 12, 16, 18, 17, 14, 12, 11, 11, 12, 16, 18, 17, 14, + 12, 11, 11, 11, 15, 17, 16, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, + 11, 11, 15, 17, 16, 14, 13, 11, 10, 10, 15, 17, 16, 14, 13, 12, 10, 10, + 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, + 16, 14, 13, 11, 10, 10, 14, 15, 16, 14, 13, 12, 10, 10, 14, 15, 16, 14, + 13, 12, 11, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 27, 16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6, + /* Size 8x8 */ + 32, 32, 29, 22, 18, 13, 12, 11, 32, 30, 28, 23, 19, 15, 13, 11, 29, 28, + 21, 18, 16, 13, 12, 11, 22, 23, 18, 15, 13, 11, 10, 10, 18, 19, 16, 13, + 11, 9, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 6, 11, + 11, 11, 10, 8, 7, 6, 6, + /* Size 16x16 */ + 32, 33, 33, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, + 32, 32, 30, 27, 25, 22, 19, 17, 16, 14, 13, 12, 11, 10, 33, 32, 31, 30, + 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 32, 32, 30, 29, 28, 26, + 24, 22, 20, 18, 16, 14, 14, 13, 12, 11, 30, 30, 28, 28, 24, 22, 20, 19, + 17, 16, 15, 13, 12, 12, 12, 11, 26, 27, 26, 26, 22, 19, 18, 17, 15, 14, + 13, 12, 11, 11, 11, 10, 23, 25, 24, 24, 20, 18, 16, 15, 14, 13, 12, 11, + 11, 10, 10, 10, 21, 22, 22, 22, 19, 17, 15, 14, 13, 12, 11, 10, 10, 10, + 9, 9, 18, 19, 19, 20, 17, 15, 14, 13, 11, 11, 10, 9, 9, 9, 9, 8, 16, 17, + 17, 18, 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, 16, 15, + 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, + 9, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 12, 11, 11, 10, 9, 8, 8, 7, 7, 7, + 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 10, 11, + 12, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 6, 10, 10, 11, 11, 11, 10, + 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 32, 30, 30, 28, 26, 25, 23, 21, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, + 32, 32, 32, 30, 30, 29, 27, 26, 24, 22, 22, 20, 19, 18, 17, 16, 15, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 27, 26, 25, 23, 
22, 20, 19, 19, 17, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 28, 27, + 25, 23, 23, 21, 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 33, 32, 32, 32, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, + 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, + 32, 32, 31, 30, 30, 28, 28, 28, 26, 26, 24, 23, 22, 21, 19, 19, 18, 17, + 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 30, 30, + 29, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, + 14, 13, 13, 12, 12, 12, 11, 11, 30, 30, 31, 31, 29, 28, 28, 26, 25, 24, + 23, 22, 22, 20, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 12, 11, 11, 30, 30, 30, 30, 28, 28, 28, 25, 24, 23, 22, 21, 20, 19, + 19, 18, 17, 17, 16, 15, 15, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, + 28, 29, 30, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 26, 27, 27, 28, + 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, + 12, 12, 11, 12, 11, 11, 11, 11, 10, 10, 25, 26, 26, 27, 26, 26, 25, 22, + 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, 24, 22, 20, 19, 18, 17, + 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 21, 22, 23, 23, 23, 23, 23, 20, 19, 18, 17, 17, 16, 15, 14, 13, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 21, 22, + 22, 23, 22, 22, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 20, 21, 20, 21, 21, + 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, + 9, 9, 9, 9, 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, + 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 9, 17, 18, + 19, 19, 19, 19, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 8, 9, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 18, 16, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 15, 16, 16, 16, 16, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, + 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, + 16, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 8, 8, 7, 8, 7, 7, 7, 13, 13, 14, 14, 14, 15, 15, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, + 14, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 13, 13, 13, 12, + 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, + 12, 13, 13, 13, 13, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, + 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, + 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 6, 6, 10, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, + 11, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 6, 6, 6, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 10, 10, 10, 11, + 11, 11, 11, 11, 11, 10, 10, 10, 
10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 6, 6, 5, 5, 5, + /* Size 4x8 */ + 32, 32, 30, 23, 19, 14, 12, 11, 27, 26, 20, 17, 15, 12, 12, 10, 17, 18, + 15, 12, 10, 9, 8, 8, 12, 13, 12, 10, 9, 8, 7, 6, + /* Size 8x4 */ + 32, 27, 17, 12, 32, 26, 18, 13, 30, 20, 15, 12, 23, 17, 12, 10, 19, 15, + 10, 9, 14, 12, 9, 8, 12, 12, 8, 7, 11, 10, 8, 6, + /* Size 8x16 */ + 32, 33, 32, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 32, 32, + 31, 30, 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 28, 29, 28, 27, + 23, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 11, 23, 25, 24, 24, 20, 18, + 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 18, 19, 19, 20, 17, 15, 14, 13, + 11, 11, 10, 9, 9, 9, 8, 9, 13, 14, 14, 15, 14, 12, 11, 11, 9, 9, 8, 8, + 7, 8, 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 11, + 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 16x8 */ + 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 25, 19, 14, 13, 12, 32, 31, + 28, 24, 19, 14, 13, 12, 32, 30, 27, 24, 20, 15, 13, 12, 30, 28, 23, 20, + 17, 14, 13, 12, 26, 26, 20, 18, 15, 12, 12, 11, 23, 24, 19, 16, 14, 11, + 11, 11, 21, 22, 18, 15, 13, 11, 10, 10, 18, 19, 16, 14, 11, 9, 9, 9, 16, + 17, 15, 13, 11, 9, 8, 8, 14, 16, 14, 12, 10, 8, 8, 8, 13, 14, 13, 11, 9, + 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 12, 12, 10, 9, 8, 7, 6, 10, 12, + 12, 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, + 32, 32, 31, 30, 30, 30, 28, 27, 25, 23, 22, 21, 19, 19, 17, 16, 16, 14, + 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 31, 30, 30, 29, + 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 26, 25, + 24, 22, 22, 20, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, + 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 26, 27, + 27, 28, 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, + 13, 12, 12, 12, 11, 12, 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, + 24, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 10, 19, 20, 20, 21, 20, 21, 21, 19, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 10, 10, 9, + 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 16, 17, 17, 17, 17, + 18, 18, 16, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 12, 12, + 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 7, 7, 8, 13, 13, + 14, 14, 14, 14, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, + 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, + 11, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, + 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, + /* Size 32x16 */ + 32, 33, 32, 32, 28, 26, 23, 19, 18, 16, 13, 13, 12, 11, 11, 11, 33, 32, + 32, 32, 29, 27, 24, 20, 19, 17, 14, 13, 12, 12, 12, 11, 33, 32, 32, 32, + 29, 27, 25, 20, 
19, 17, 14, 14, 13, 12, 12, 11, 33, 32, 32, 31, 30, 28, + 25, 21, 19, 17, 14, 14, 13, 12, 12, 12, 32, 32, 31, 30, 28, 26, 24, 20, + 19, 17, 14, 14, 13, 13, 12, 12, 32, 32, 30, 30, 28, 26, 24, 21, 19, 18, + 15, 14, 13, 13, 12, 12, 32, 31, 30, 29, 27, 26, 24, 21, 20, 18, 15, 15, + 13, 13, 12, 12, 30, 30, 29, 28, 24, 23, 21, 19, 18, 16, 14, 14, 13, 13, + 13, 12, 30, 30, 28, 28, 23, 22, 20, 18, 17, 16, 14, 13, 13, 12, 12, 12, + 28, 30, 28, 27, 21, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 12, 26, 28, + 26, 26, 20, 19, 18, 16, 15, 14, 12, 12, 12, 12, 11, 12, 26, 27, 26, 25, + 20, 19, 17, 15, 15, 14, 12, 12, 11, 11, 11, 11, 23, 25, 24, 24, 19, 18, + 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 22, 23, 23, 22, 18, 17, 16, 14, + 13, 12, 11, 11, 10, 10, 10, 10, 21, 22, 22, 22, 18, 17, 15, 13, 13, 12, + 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 16, 14, 12, 12, 11, 10, 10, + 9, 9, 10, 9, 18, 19, 19, 19, 16, 15, 14, 12, 11, 11, 9, 9, 9, 9, 9, 9, + 17, 19, 19, 19, 16, 15, 14, 12, 11, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, + 18, 15, 14, 13, 11, 11, 10, 9, 9, 8, 8, 8, 9, 15, 16, 17, 17, 14, 13, + 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 13, 12, 11, 10, 9, + 8, 8, 8, 8, 8, 8, 13, 14, 14, 15, 13, 12, 11, 10, 9, 9, 8, 8, 7, 8, 8, + 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 14, 14, + 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 11, + 9, 9, 8, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 12, 10, 9, 9, 8, 8, 7, 7, + 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 11, 12, + 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 7, 6, 6, 6, 10, 12, 12, 12, 12, 11, + 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 10, 9, 9, 8, 7, + 7, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, + 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, + /* Size 4x16 */ + 33, 32, 32, 31, 30, 28, 25, 22, 19, 17, 16, 14, 13, 12, 12, 11, 26, 27, + 26, 26, 22, 19, 18, 17, 15, 14, 13, 12, 11, 11, 11, 10, 16, 17, 17, 18, + 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 11, 12, 13, 13, 12, 12, 11, + 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 16x4 */ + 33, 26, 16, 11, 32, 27, 17, 12, 32, 26, 17, 13, 31, 26, 18, 13, 30, 22, + 16, 12, 28, 19, 14, 12, 25, 18, 13, 11, 22, 17, 12, 10, 19, 15, 11, 9, + 17, 14, 10, 8, 16, 13, 9, 8, 14, 12, 9, 7, 13, 11, 8, 7, 12, 11, 8, 6, + 12, 11, 8, 6, 11, 10, 8, 6, + /* Size 8x32 */ + 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 32, 32, 32, 32, + 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, + 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, 24, 21, 20, 19, 18, 17, + 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 13, 14, 14, 14, 14, + 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, + 8, 8, 7, 7, 7, 7, 8, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, + 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, + 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, + 7, 7, 6, 6, 6, 6, 6, 6, 6, + /* Size 32x8 */ + 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 24, 19, 14, 12, 12, 33, 32, + 29, 25, 19, 14, 13, 12, 33, 32, 30, 25, 19, 14, 13, 12, 32, 31, 28, 24, + 19, 14, 13, 12, 32, 30, 28, 24, 
19, 15, 13, 12, 32, 30, 27, 24, 20, 15, + 13, 12, 30, 29, 24, 21, 18, 14, 13, 13, 30, 28, 23, 20, 17, 14, 13, 12, + 28, 28, 21, 19, 16, 13, 12, 12, 26, 26, 20, 18, 15, 12, 12, 11, 26, 26, + 20, 17, 15, 12, 11, 11, 23, 24, 19, 16, 14, 11, 11, 11, 22, 23, 18, 16, + 13, 11, 10, 10, 21, 22, 18, 15, 13, 11, 10, 10, 19, 20, 17, 14, 12, 10, + 9, 10, 18, 19, 16, 14, 11, 9, 9, 9, 17, 19, 16, 14, 11, 9, 9, 9, 16, 17, + 15, 13, 11, 9, 8, 8, 15, 17, 14, 12, 10, 8, 8, 8, 14, 16, 14, 12, 10, 8, + 8, 8, 13, 14, 13, 11, 9, 8, 7, 8, 13, 14, 13, 11, 9, 8, 7, 7, 12, 14, + 13, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 13, 12, 10, 9, 8, 7, + 6, 11, 12, 12, 10, 9, 8, 7, 6, 11, 12, 12, 10, 9, 7, 7, 6, 10, 12, 12, + 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, + 10, 11, 11, 10, 9, 8, 7, 6 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10, + /* Size 8x8 */ + 33, 28, 22, 21, 20, 17, 16, 15, 28, 24, 22, 22, 21, 19, 17, 16, 22, 22, + 19, 19, 19, 17, 16, 16, 21, 22, 19, 17, 16, 15, 14, 14, 20, 21, 19, 16, + 14, 13, 13, 13, 17, 19, 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 12, + 11, 10, 15, 16, 16, 14, 13, 12, 10, 10, + /* Size 16x16 */ + 32, 34, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 15, 15, 15, 14, 34, 33, + 29, 26, 23, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 15, 31, 29, 26, 24, + 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 28, 26, 24, 22, 22, 22, + 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 23, 23, 22, 22, 20, 20, 20, 20, + 19, 19, 18, 17, 17, 17, 16, 17, 21, 22, 22, 22, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 16, 16, 16, 21, 22, 23, 23, 20, 19, 18, 17, 17, 16, 16, 15, + 15, 15, 15, 15, 20, 22, 22, 22, 20, 18, 17, 17, 16, 15, 15, 14, 14, 14, + 14, 14, 19, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, + 18, 19, 20, 20, 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 19, + 19, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 16, 17, 18, 19, + 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, 16, 17, 16, 16, 15, 14, 13, 12, + 12, 11, 11, 10, 10, 10, 14, 15, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, + 11, 10, 10, 10, + /* Size 32x32 */ + 32, 33, 34, 34, 31, 29, 28, 25, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 33, 33, 33, 33, + 30, 28, 27, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 34, 33, 33, 33, 29, 28, 26, 24, + 23, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 34, 33, 33, 32, 29, 28, 26, 24, 23, 22, 23, 23, + 23, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 31, 30, 29, 29, 26, 25, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, + 21, 21, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 29, 28, + 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 28, 27, 26, 26, 24, 23, + 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 25, 24, 24, 24, 23, 22, 22, 21, 21, 20, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 17, 17, 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, 17, + 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 
18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, + 22, 22, 22, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, + 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 21, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 20, 21, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 21, + 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, + 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, + 22, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, + 13, 13, 14, 13, 13, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 20, 19, 19, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 17, 18, 19, 19, + 19, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 19, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 11, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, + 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, + 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, + 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, + 10, 10, 11, 10, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, + 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 15, 15, 16, 16, + 16, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, + 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, + /* Size 4x8 */ + 33, 27, 22, 22, 20, 18, 17, 16, 22, 22, 19, 18, 17, 16, 16, 15, 19, 20, + 18, 16, 14, 12, 12, 12, 16, 17, 17, 14, 13, 12, 11, 10, + /* Size 8x4 */ + 33, 22, 19, 16, 27, 22, 20, 17, 22, 19, 18, 17, 22, 18, 16, 14, 20, 17, + 14, 13, 18, 16, 12, 12, 17, 16, 12, 11, 16, 15, 12, 10, + /* Size 8x16 */ + 32, 33, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 30, 28, + 26, 23, 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, 22, 22, + 20, 19, 19, 19, 19, 18, 18, 17, 16, 16, 16, 16, 21, 22, 22, 23, 20, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 15, 15, 19, 20, 21, 21, 19, 18, 17, 16, + 
15, 14, 14, 13, 13, 13, 13, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, + 12, 12, 11, 12, 12, 12, 15, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 10, + 10, 10, + /* Size 16x8 */ + 32, 30, 21, 21, 19, 16, 15, 15, 33, 28, 22, 22, 20, 18, 17, 16, 31, 26, + 22, 22, 21, 18, 17, 17, 28, 23, 22, 23, 21, 19, 18, 17, 23, 22, 20, 20, + 19, 17, 17, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 23, 19, 18, 17, 15, + 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 13, 13, 13, + 18, 20, 18, 16, 14, 12, 12, 13, 17, 19, 18, 16, 14, 12, 12, 12, 16, 18, + 17, 15, 13, 12, 11, 12, 16, 17, 16, 15, 13, 11, 11, 11, 15, 17, 16, 14, + 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, + 11, 10, + /* Size 16x32 */ + 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 33, 33, 32, 32, + 28, 27, 26, 24, 23, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 30, 29, 28, 28, 26, 24, 23, 23, + 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, + 17, 17, 16, 16, 16, 16, 28, 27, 26, 26, 24, 23, 22, 22, 22, 21, 22, 22, + 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 17, 16, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, + 22, 23, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 21, 22, 22, 23, 22, 23, + 23, 21, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, + 15, 15, 14, 14, 15, 15, 15, 15, 20, 20, 21, 21, 22, 22, 22, 20, 20, 19, + 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, 14, 14, + 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, + 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, + 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 15, 16, + 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 10, 10, 10, + /* Size 32x16 */ + 32, 33, 30, 28, 21, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 15, 33, 33, + 29, 27, 22, 22, 22, 20, 20, 19, 17, 17, 16, 16, 16, 16, 33, 32, 28, 26, + 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 16, 34, 32, 28, 26, 22, 23, + 23, 21, 21, 20, 18, 18, 17, 17, 17, 16, 31, 28, 26, 24, 22, 22, 22, 22, + 21, 20, 18, 18, 17, 17, 17, 16, 29, 27, 24, 23, 22, 22, 23, 22, 21, 20, + 19, 18, 18, 17, 17, 17, 28, 26, 23, 22, 22, 22, 23, 22, 21, 20, 19, 19, + 18, 18, 17, 17, 24, 24, 23, 22, 20, 20, 21, 20, 20, 19, 18, 18, 17, 18, + 17, 17, 23, 23, 22, 22, 20, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, 17, + 21, 22, 22, 21, 19, 19, 19, 19, 19, 18, 
17, 17, 16, 17, 17, 16, 21, 22, + 22, 22, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 16, 16, 21, 23, 22, 22, + 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, 19, 18, + 18, 17, 17, 16, 15, 15, 15, 15, 15, 16, 20, 22, 22, 22, 19, 18, 17, 16, + 16, 16, 15, 14, 15, 14, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, + 14, 14, 14, 14, 14, 15, 20, 21, 22, 22, 19, 18, 17, 16, 15, 14, 14, 14, + 13, 14, 14, 14, 19, 21, 21, 21, 19, 18, 17, 15, 15, 14, 13, 13, 13, 13, + 13, 14, 19, 20, 21, 21, 19, 17, 17, 15, 15, 14, 13, 13, 13, 13, 13, 13, + 18, 20, 20, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 13, 13, 17, 19, + 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 17, 19, 19, 20, + 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 16, 15, 14, 13, 12, + 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 16, 16, 15, 13, 13, 12, 11, 11, + 11, 11, 11, 11, 15, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, + 11, 11, 15, 17, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, 10, 11, + 15, 16, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, + 16, 17, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, + 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, 16, 15, + 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 16, 16, 15, 15, 13, + 13, 12, 12, 11, 11, 10, 10, 10, + /* Size 4x16 */ + 33, 32, 28, 26, 23, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, + 22, 22, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 18, 19, 20, 20, + 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 13, 12, 15, 16, 17, 18, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 11, 10, 10, + /* Size 16x4 */ + 33, 21, 18, 15, 32, 22, 19, 16, 28, 22, 20, 17, 26, 22, 20, 18, 23, 20, + 19, 17, 22, 19, 17, 16, 23, 18, 16, 15, 22, 18, 15, 14, 21, 18, 14, 13, + 20, 17, 13, 12, 19, 17, 13, 12, 18, 16, 12, 11, 17, 16, 12, 11, 17, 16, + 12, 11, 16, 16, 13, 10, 16, 15, 12, 10, + /* Size 8x32 */ + 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 30, 29, 28, 28, + 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, + 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 23, 23, 21, 20, 19, 18, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, + 15, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, + 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 16, 17, + 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 18, + 18, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 10, 10, + /* Size 32x8 */ + 32, 30, 21, 21, 19, 16, 15, 15, 33, 29, 22, 22, 20, 17, 16, 16, 33, 28, + 22, 22, 20, 18, 17, 16, 34, 28, 22, 23, 21, 18, 17, 17, 31, 26, 22, 22, + 21, 18, 17, 17, 29, 24, 22, 23, 21, 19, 18, 17, 28, 23, 22, 23, 21, 19, + 18, 17, 24, 23, 20, 21, 20, 18, 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, + 21, 22, 19, 19, 19, 17, 16, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 22, + 19, 18, 17, 16, 16, 16, 21, 23, 19, 18, 17, 15, 15, 15, 
20, 22, 19, 17, + 16, 15, 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 20, 22, 19, 17, 15, 14, + 13, 14, 19, 21, 19, 17, 15, 13, 13, 13, 19, 21, 19, 17, 15, 13, 13, 13, + 18, 20, 18, 16, 14, 12, 12, 13, 17, 20, 18, 16, 14, 12, 12, 12, 17, 19, + 18, 16, 14, 12, 12, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 18, 17, 15, + 13, 12, 11, 12, 16, 18, 17, 15, 13, 11, 11, 11, 16, 17, 16, 15, 13, 11, + 11, 11, 15, 17, 16, 15, 13, 12, 11, 11, 15, 17, 16, 14, 13, 12, 11, 10, + 15, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, + 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, + 13, 12, 11, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7, + /* Size 8x8 */ + 32, 32, 29, 24, 19, 15, 13, 11, 32, 31, 28, 24, 20, 16, 14, 12, 29, 28, + 22, 20, 17, 14, 13, 12, 24, 24, 20, 16, 14, 12, 11, 10, 19, 20, 17, 14, + 12, 10, 9, 9, 15, 16, 14, 12, 10, 9, 8, 8, 13, 14, 13, 11, 9, 8, 7, 7, + 11, 12, 12, 10, 9, 8, 7, 6, + /* Size 16x16 */ + 32, 33, 33, 32, 30, 28, 25, 22, 19, 17, 16, 14, 12, 12, 11, 11, 33, 32, + 32, 32, 30, 29, 26, 23, 20, 19, 17, 15, 13, 13, 12, 11, 33, 32, 31, 31, + 29, 28, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 32, 32, 31, 29, 28, 27, + 25, 23, 21, 19, 18, 16, 14, 14, 13, 12, 30, 30, 29, 28, 26, 24, 22, 20, + 19, 18, 16, 15, 13, 13, 12, 12, 28, 29, 28, 27, 24, 21, 20, 18, 17, 16, + 15, 14, 13, 12, 11, 11, 25, 26, 26, 25, 22, 20, 18, 17, 15, 14, 14, 12, + 12, 11, 11, 11, 22, 23, 23, 23, 20, 18, 17, 15, 14, 13, 12, 11, 11, 10, + 10, 10, 19, 20, 21, 21, 19, 17, 15, 14, 12, 12, 11, 10, 10, 9, 9, 9, 17, + 19, 19, 19, 18, 16, 14, 13, 12, 11, 10, 10, 9, 9, 9, 8, 16, 17, 17, 18, + 16, 15, 14, 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 12, + 11, 10, 10, 9, 8, 8, 8, 7, 7, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, + 8, 7, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, + 11, 12, 12, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 11, 11, 12, 12, + 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 18, + 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 19, 18, 17, 17, 15, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 29, 27, 26, 24, 23, 23, 20, 20, 19, 17, 17, 15, 15, 14, 13, 13, + 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, + 27, 25, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, + 12, 11, 33, 32, 32, 32, 31, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 23, + 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 33, 32, + 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 32, 32, 32, 32, 31, 30, + 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, + 26, 24, 24, 23, 22, 22, 20, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, + 13, 12, 12, 12, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, + 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 12, + 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 17, 17, + 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 28, 29, 29, 30, + 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 26, 27, 27, 28, 26, 26, 26, 24, + 23, 21, 20, 19, 19, 18, 17, 17, 
16, 16, 15, 14, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 11, 11, 25, 26, 26, 27, 26, 26, 25, 24, 22, 20, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 23, 24, 24, 25, 24, 24, 24, 23, 22, 20, 19, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 22, 23, + 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 21, 22, 23, 23, 23, 23, + 22, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, + 10, 10, 10, 10, 10, 10, 9, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, + 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, + 9, 9, 9, 18, 19, 20, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 17, 18, 19, 19, + 19, 19, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 16, 17, 17, 18, 18, 18, 18, 17, 17, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, + 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, + 16, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 8, 8, 8, 8, 7, 14, 14, 15, 15, 15, 15, 16, 15, 15, 14, 14, 13, + 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 13, + 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, + 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, + 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, + 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, + 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, + 7, 7, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, + 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, + 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 7, 7, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, + 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + /* Size 4x8 */ + 32, 32, 30, 25, 20, 16, 14, 12, 29, 28, 22, 19, 17, 14, 13, 11, 17, 18, + 16, 13, 11, 9, 9, 9, 12, 13, 12, 11, 9, 8, 7, 7, + /* Size 8x4 */ + 32, 29, 17, 12, 32, 28, 18, 13, 30, 22, 16, 12, 25, 19, 13, 11, 20, 17, + 11, 9, 16, 14, 9, 8, 14, 13, 9, 7, 12, 11, 9, 7, + /* Size 8x16 */ + 32, 33, 33, 32, 30, 28, 25, 22, 19, 18, 16, 14, 12, 12, 11, 11, 33, 32, + 31, 30, 29, 28, 26, 23, 20, 19, 17, 15, 14, 13, 12, 12, 29, 30, 29, 28, + 25, 22, 21, 19, 18, 17, 16, 14, 13, 12, 12, 12, 23, 25, 24, 24, 21, 19, + 17, 16, 14, 14, 13, 12, 11, 11, 11, 11, 19, 20, 21, 21, 19, 17, 15, 14, + 12, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 18, 16, 15, 13, 12, 11, 10, 10, + 9, 9, 8, 8, 8, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, + 11, 12, 13, 13, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, + /* Size 16x8 */ + 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 30, 25, 20, 17, 13, 12, 33, 31, + 29, 24, 21, 17, 14, 13, 32, 30, 28, 24, 21, 18, 14, 13, 30, 29, 25, 21, + 19, 16, 13, 13, 28, 28, 22, 19, 17, 15, 13, 12, 25, 26, 21, 17, 15, 13, + 12, 11, 22, 23, 19, 16, 14, 12, 
11, 10, 19, 20, 18, 14, 12, 11, 10, 9, + 18, 19, 17, 14, 12, 10, 9, 9, 16, 17, 16, 13, 11, 10, 9, 8, 14, 15, 14, + 12, 10, 9, 8, 8, 12, 14, 13, 11, 10, 9, 7, 7, 12, 13, 12, 11, 9, 8, 7, + 7, 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19, + 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 21, 20, 19, 18, 17, 16, + 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 30, 30, + 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, + 25, 24, 23, 22, 20, 20, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 28, 29, + 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, + 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, + 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 17, + 17, 18, 18, 18, 18, 18, 18, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, + 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 8, 8, 8, 8, 13, 14, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 8, 8, 7, 7, 12, 13, 13, 14, + 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 11, 12, + 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, + 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, + /* Size 32x16 */ + 32, 33, 33, 32, 29, 28, 23, 22, 19, 17, 16, 13, 12, 12, 11, 11, 33, 32, + 32, 32, 29, 29, 24, 23, 20, 17, 17, 14, 13, 12, 12, 12, 33, 32, 32, 32, + 30, 29, 25, 23, 20, 18, 17, 14, 13, 12, 12, 12, 33, 32, 32, 31, 30, 30, + 25, 23, 21, 18, 17, 14, 14, 13, 12, 12, 33, 32, 31, 30, 29, 28, 24, 23, + 21, 18, 17, 14, 14, 13, 13, 12, 32, 32, 31, 30, 28, 28, 24, 23, 20, 18, + 17, 14, 14, 13, 13, 12, 32, 31, 30, 29, 28, 27, 24, 23, 21, 18, 18, 15, + 14, 13, 13, 12, 32, 31, 30, 28, 26, 26, 23, 22, 20, 18, 17, 14, 14, 13, + 13, 13, 30, 30, 29, 28, 25, 24, 21, 20, 19, 17, 16, 14, 13, 13, 13, 13, + 29, 30, 28, 27, 23, 22, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 28, 30, + 28, 27, 22, 21, 19, 18, 17, 16, 15, 13, 13, 12, 12, 12, 26, 28, 26, 26, + 21, 20, 18, 17, 16, 14, 14, 12, 12, 12, 12, 11, 25, 26, 26, 25, 21, 20, + 17, 17, 15, 14, 13, 12, 12, 11, 11, 11, 23, 25, 24, 24, 20, 19, 16, 16, + 14, 13, 13, 11, 11, 11, 11, 11, 22, 23, 23, 23, 19, 18, 16, 15, 14, 12, + 12, 11, 11, 10, 10, 10, 21, 23, 23, 22, 19, 18, 15, 15, 13, 12, 12, 11, + 10, 10, 10, 10, 19, 21, 20, 20, 18, 17, 14, 14, 12, 11, 11, 10, 10, 10, + 9, 10, 19, 20, 20, 20, 17, 17, 14, 13, 12, 11, 11, 10, 9, 9, 9, 9, 18, + 19, 19, 19, 17, 16, 14, 13, 12, 11, 10, 9, 9, 9, 9, 9, 16, 18, 18, 18, + 16, 15, 13, 
12, 11, 10, 10, 9, 9, 9, 9, 8, 16, 17, 17, 18, 16, 15, 13, + 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 16, 16, 16, 14, 14, 12, 12, 11, 9, 9, + 8, 8, 8, 8, 8, 14, 15, 15, 16, 14, 14, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, + 13, 14, 14, 15, 13, 13, 11, 11, 10, 9, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, + 13, 13, 11, 11, 10, 9, 9, 8, 7, 7, 7, 7, 12, 14, 14, 14, 13, 13, 11, 11, + 10, 9, 8, 8, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, + 7, 7, 7, 12, 12, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 6, 11, 12, + 12, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, 11, 12, 12, 12, 12, 11, + 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, + 7, 7, 6, 6, 6, 10, 11, 11, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 4x16 */ + 33, 32, 32, 31, 30, 30, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 28, 29, + 28, 27, 24, 21, 20, 18, 17, 16, 15, 14, 13, 12, 12, 11, 17, 18, 18, 18, + 17, 16, 14, 12, 11, 11, 10, 9, 9, 9, 9, 8, 12, 12, 13, 13, 13, 12, 11, + 10, 10, 9, 8, 8, 7, 7, 7, 6, + /* Size 16x4 */ + 33, 28, 17, 12, 32, 29, 18, 12, 32, 28, 18, 13, 31, 27, 18, 13, 30, 24, + 17, 13, 30, 21, 16, 12, 26, 20, 14, 11, 23, 18, 12, 10, 21, 17, 11, 10, + 19, 16, 11, 9, 17, 15, 10, 8, 15, 14, 9, 8, 14, 13, 9, 7, 13, 12, 9, 7, + 12, 12, 9, 7, 12, 11, 8, 6, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19, + 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, + 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 29, 29, 30, 30, 29, 28, 28, 26, + 25, 23, 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, + 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, + 17, 17, 17, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, + 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, + 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, + 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, + /* Size 32x8 */ + 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 29, 24, 20, 17, 13, 12, 33, 32, + 30, 25, 20, 17, 13, 12, 33, 32, 30, 25, 21, 17, 14, 12, 33, 31, 29, 24, + 21, 17, 14, 13, 32, 31, 28, 24, 20, 17, 14, 13, 32, 30, 28, 24, 21, 18, + 14, 13, 32, 30, 26, 23, 20, 17, 14, 13, 30, 29, 25, 21, 19, 16, 13, 13, + 29, 28, 23, 20, 17, 15, 13, 12, 28, 28, 22, 19, 17, 15, 13, 12, 26, 26, + 21, 18, 16, 14, 12, 12, 25, 26, 21, 17, 15, 13, 12, 11, 23, 24, 20, 16, + 14, 13, 11, 11, 22, 23, 19, 16, 14, 12, 11, 10, 21, 23, 19, 15, 13, 12, + 10, 10, 19, 20, 18, 14, 12, 11, 10, 9, 19, 20, 17, 14, 12, 11, 9, 9, 18, + 19, 17, 14, 12, 10, 9, 9, 16, 18, 16, 13, 11, 10, 9, 9, 16, 17, 16, 13, + 11, 10, 9, 8, 14, 16, 14, 12, 11, 9, 8, 8, 14, 15, 14, 12, 10, 9, 8, 8, + 13, 14, 13, 11, 10, 9, 8, 7, 12, 14, 13, 11, 10, 9, 7, 7, 12, 14, 13, + 11, 10, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, + 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, 11, 12, 12, 11, + 9, 8, 7, 6, 10, 11, 12, 11, 9, 8, 7, 6 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11, + /* Size 8x8 */ + 33, 30, 22, 22, 20, 18, 17, 16, 30, 26, 22, 23, 21, 19, 18, 17, 22, 22, + 20, 20, 19, 18, 17, 17, 22, 23, 20, 18, 17, 16, 
15, 15, 20, 21, 19, 17, + 15, 14, 13, 13, 18, 19, 18, 16, 14, 12, 12, 12, 17, 18, 17, 15, 13, 12, + 11, 11, 16, 17, 17, 15, 13, 12, 11, 10, + /* Size 16x16 */ + 32, 33, 31, 28, 25, 21, 21, 20, 20, 19, 18, 17, 16, 15, 15, 15, 33, 33, + 30, 26, 24, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 31, 30, 28, 24, + 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 28, 26, 24, 22, 22, 21, + 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 25, 24, 23, 22, 21, 20, 21, 20, + 20, 20, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19, + 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, + 16, 15, 15, 15, 20, 22, 22, 22, 20, 19, 18, 17, 16, 16, 16, 15, 15, 14, + 14, 14, 20, 21, 22, 22, 20, 19, 17, 16, 16, 15, 15, 14, 14, 13, 14, 14, + 19, 20, 21, 21, 20, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 19, + 20, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, + 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, + 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 14, + 13, 13, 12, 12, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, + 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 14, 13, 12, 12, + 11, 11, 10, 10, + /* Size 32x32 */ + 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 33, 33, 33, 33, + 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 33, 33, 33, 33, 30, 29, 26, 26, + 24, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 34, 33, 33, 32, 30, 29, 26, 25, 24, 23, 22, 23, + 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 16, 16, 31, 30, 30, 30, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 31, 30, + 29, 29, 27, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 28, 27, 26, 26, 24, 24, + 22, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, + 19, 19, 18, 18, 17, 17, 17, 17, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, + 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18, + 18, 17, 17, 17, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, + 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, + 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, + 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 22, 22, 22, 22, 21, 20, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, + 15, 15, 21, 22, 22, 23, 23, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, + 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 15, 14, 14, 14, 15, 20, 21, 22, 22, 22, 22, + 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 21, 20, 19, + 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, + 14, 13, 14, 14, 19, 20, 20, 21, 21, 21, 22, 21, 20, 19, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 
14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 19, 20, 20, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, + 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, + 13, 13, 12, 12, 12, 13, 12, 13, 13, 12, 18, 19, 19, 20, 20, 20, 20, 20, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 20, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, + 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 12, 11, 16, 17, 17, 18, 18, 18, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, + 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, + 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 10, 14, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, + 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + /* Size 4x8 */ + 33, 28, 22, 23, 21, 19, 17, 16, 22, 22, 20, 19, 19, 18, 17, 16, 19, 20, + 19, 16, 14, 13, 13, 13, 16, 17, 17, 15, 13, 12, 11, 11, + /* Size 8x4 */ + 33, 22, 19, 16, 28, 22, 20, 17, 22, 20, 19, 17, 23, 19, 16, 15, 21, 19, + 14, 13, 19, 18, 13, 12, 17, 17, 13, 11, 16, 16, 13, 11, + /* Size 8x16 */ + 32, 33, 31, 28, 24, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 15, 31, 30, + 28, 24, 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 23, 23, 22, 22, + 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 21, 22, 23, 23, 21, 19, + 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 20, 21, 22, 22, 20, 19, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 14, 18, 19, 20, 20, 19, 18, 17, 16, 14, 14, + 13, 13, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, 16, 14, 14, 13, 12, 12, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, + 11, 10, + /* Size 16x8 */ + 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 21, 19, 17, 16, 31, 28, + 22, 23, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, 19, 17, 24, 23, 21, 21, + 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 20, 18, 17, 17, + 16, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, + 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 12, 17, 19, + 18, 16, 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, + 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 16, 17, 15, 14, 12, + 11, 10, + /* Size 16x32 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 33, 33, 32, 32, + 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19, + 18, 18, 18, 17, 17, 17, 17, 16, 
16, 16, 31, 30, 30, 29, 28, 27, 24, 24, + 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, + 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 23, 22, + 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 15, + 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, + 18, 19, 20, 20, 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, + 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 12, 18, 19, 19, 20, + 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, + 11, 11, 12, 12, 12, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16, + 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 15, 16, + 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 17, + 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, + /* Size 32x16 */ + 32, 33, 31, 28, 23, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 15, 33, 33, + 30, 27, 23, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 16, 33, 32, 30, 26, + 23, 22, 22, 22, 21, 20, 19, 17, 17, 17, 16, 16, 34, 32, 29, 26, 23, 22, + 23, 22, 21, 20, 20, 18, 18, 17, 17, 17, 31, 29, 28, 24, 22, 22, 23, 22, + 22, 20, 20, 18, 18, 17, 17, 17, 31, 28, 27, 24, 22, 22, 22, 22, 22, 20, + 20, 18, 18, 17, 17, 17, 28, 26, 24, 22, 22, 22, 23, 22, 22, 21, 20, 19, + 19, 18, 17, 17, 26, 25, 24, 22, 21, 21, 22, 22, 21, 20, 20, 19, 18, 18, + 18, 17, 24, 24, 23, 22, 21, 20, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 18, 17, 17, 17, 17, 17, 21, 22, + 22, 21, 20, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 17, 21, 22, 22, 22, + 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 22, 22, 20, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 16, 21, 23, 23, 22, 20, 19, 18, 17, + 17, 16, 16, 15, 15, 15, 15, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, + 16, 15, 14, 15, 14, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, 15, 14, + 14, 14, 14, 14, 20, 21, 22, 22, 19, 19, 17, 16, 16, 15, 14, 14, 14, 14, + 14, 14, 19, 21, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 14, 13, + 19, 20, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 20, + 20, 20, 19, 18, 16, 16, 15, 14, 13, 13, 12, 13, 13, 13, 18, 20, 20, 20, + 19, 18, 16, 16, 15, 14, 13, 12, 12, 12, 12, 13, 17, 19, 19, 20, 18, 18, + 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 15, + 14, 13, 13, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 17, 15, 15, 14, 13, + 12, 12, 11, 11, 12, 12, 16, 18, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, + 11, 11, 11, 12, 16, 17, 18, 18, 17, 17, 15, 14, 14, 13, 
12, 11, 11, 11, + 11, 11, 16, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, + 15, 17, 17, 18, 17, 16, 15, 15, 13, 13, 12, 11, 11, 11, 11, 11, 15, 17, + 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 15, 16, 17, 17, + 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 15, 15, 14, + 14, 12, 12, 11, 11, 10, 10, 10, + /* Size 4x16 */ + 33, 32, 29, 26, 24, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 16, 21, 22, + 22, 22, 20, 19, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 18, 20, 20, 21, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 13, 13, 15, 17, 17, 18, 17, 16, + 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, + /* Size 16x4 */ + 33, 21, 18, 15, 32, 22, 20, 17, 29, 22, 20, 17, 26, 22, 21, 18, 24, 20, + 19, 17, 22, 19, 18, 16, 23, 19, 17, 16, 22, 19, 16, 15, 21, 19, 15, 14, + 20, 19, 14, 13, 20, 18, 14, 12, 18, 17, 13, 12, 18, 17, 13, 11, 17, 16, + 12, 11, 17, 16, 13, 11, 16, 16, 13, 11, + /* Size 8x32 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 31, 30, 30, 29, + 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 23, 23, 23, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, 18, 19, + 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, + 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 10, 10, + /* Size 32x8 */ + 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 20, 19, 17, 16, 33, 30, + 23, 22, 21, 19, 17, 16, 34, 29, 23, 23, 21, 20, 18, 17, 31, 28, 22, 23, + 22, 20, 18, 17, 31, 27, 22, 22, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, + 19, 17, 26, 24, 21, 22, 21, 20, 18, 18, 24, 23, 21, 21, 20, 19, 18, 17, + 22, 22, 20, 19, 19, 18, 17, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, + 20, 18, 18, 17, 16, 16, 21, 22, 20, 18, 17, 17, 16, 15, 21, 23, 20, 18, + 17, 16, 15, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 20, 17, 16, 15, + 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 14, 13, 14, + 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 13, 18, 20, + 19, 16, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 17, 19, 18, 16, + 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 12, 16, 18, 17, 15, 14, 12, + 11, 11, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 11, + 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, + 17, 15, 13, 12, 11, 10, 15, 16, 17, 15, 14, 12, 11, 10, 15, 16, 17, 15, + 14, 12, 11, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7, + /* Size 8x8 */ + 32, 32, 30, 26, 20, 17, 13, 12, 32, 31, 29, 26, 21, 17, 14, 13, 30, 29, + 26, 22, 19, 16, 14, 13, 26, 26, 22, 18, 16, 14, 12, 11, 20, 21, 19, 16, + 13, 11, 10, 10, 17, 17, 16, 14, 11, 10, 9, 8, 13, 14, 14, 12, 10, 9, 8, + 7, 12, 13, 13, 11, 10, 8, 7, 7, + /* Size 16x16 */ + 32, 33, 33, 32, 31, 28, 
26, 23, 21, 19, 17, 16, 14, 13, 12, 11, 33, 32, + 32, 32, 31, 29, 27, 24, 22, 20, 18, 16, 15, 13, 13, 12, 33, 32, 32, 31, + 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 32, 32, 31, 30, 28, 28, + 26, 24, 23, 21, 19, 17, 16, 14, 14, 13, 31, 31, 30, 28, 27, 24, 23, 22, + 20, 19, 18, 16, 15, 14, 13, 13, 28, 29, 29, 28, 24, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 12, 26, 27, 27, 26, 23, 20, 19, 18, 17, 16, 15, 14, + 13, 12, 12, 11, 23, 24, 25, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, + 11, 11, 21, 22, 23, 23, 20, 18, 17, 15, 14, 13, 13, 12, 11, 10, 10, 10, + 19, 20, 21, 21, 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 17, 18, + 19, 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 16, 17, 17, 16, + 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 13, 12, + 11, 10, 10, 9, 8, 8, 8, 7, 13, 13, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8, + 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 11, + 12, 12, 13, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 31, 30, 28, 28, 26, 25, 23, 22, 21, 20, + 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 20, 18, 18, 17, + 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 29, 29, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 14, + 13, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, + 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, + 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, + 32, 32, 31, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 23, 21, 20, 19, + 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 13, 12, 32, 32, 32, 32, 31, 31, + 30, 29, 28, 28, 28, 27, 26, 26, 24, 23, 23, 21, 21, 19, 19, 18, 17, 16, + 16, 15, 14, 14, 14, 13, 13, 12, 32, 32, 32, 32, 31, 30, 29, 29, 28, 28, + 27, 27, 26, 25, 24, 23, 22, 21, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14, + 14, 13, 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, + 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, + 30, 30, 30, 31, 30, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 21, 20, 19, + 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 28, 29, 29, 30, + 29, 28, 28, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 28, 29, 29, 30, 29, 28, 27, 27, + 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 26, 27, 27, 28, 27, 26, 26, 26, 23, 23, 20, 20, + 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 25, 26, 26, 27, 26, 26, 26, 25, 23, 22, 20, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 23, 24, + 24, 25, 25, 24, 24, 24, 22, 22, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 24, 24, 23, + 23, 23, 21, 21, 19, 19, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, 21, 22, 22, 23, 23, 23, 23, 22, 20, 20, + 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 10, 10, 20, 20, 21, 21, 21, 21, 21, 21, 20, 19, 17, 17, 16, 16, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, + 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 18, 18, 19, 19, 19, + 19, 19, 19, 18, 18, 16, 
16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 9, 9, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, + 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 16, 16, 16, 17, + 17, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 8, 8, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 12, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 8, 13, 13, 14, 14, 14, + 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 7, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 13, 13, + 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 12, + 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, + 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 11, 12, 12, 12, 12, + 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 7, 7, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, + 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, + /* Size 4x8 */ + 32, 32, 30, 27, 21, 17, 14, 13, 29, 28, 24, 20, 17, 15, 13, 12, 20, 20, + 19, 15, 13, 11, 10, 9, 13, 14, 14, 12, 10, 9, 8, 7, + /* Size 8x4 */ + 32, 29, 20, 13, 32, 28, 20, 14, 30, 24, 19, 14, 27, 20, 15, 12, 21, 17, + 13, 10, 17, 15, 11, 9, 14, 13, 10, 8, 13, 12, 9, 7, + /* Size 8x16 */ + 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 11, 33, 32, + 32, 31, 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 31, 31, 30, 28, + 27, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 13, 26, 26, 27, 26, 23, 20, + 19, 17, 17, 16, 15, 14, 13, 12, 11, 11, 20, 21, 22, 21, 20, 18, 16, 15, + 14, 13, 12, 11, 11, 10, 10, 10, 16, 17, 17, 18, 17, 15, 14, 13, 12, 11, + 10, 10, 9, 9, 8, 9, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, + 7, 7, 12, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, + /* Size 16x8 */ + 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, + 30, 27, 22, 17, 14, 13, 32, 31, 28, 26, 21, 18, 15, 13, 31, 30, 27, 23, + 20, 17, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 26, 27, 23, 19, 16, 14, + 12, 12, 23, 25, 22, 17, 15, 13, 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, + 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 10, 9, 9, 16, 17, 16, + 14, 11, 10, 9, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, 14, 12, 10, 9, 8, + 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, 13, 11, 10, 9, 7, 7, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, + 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, + 14, 14, 13, 13, 12, 12, 32, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27, + 26, 25, 24, 23, 22, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, + 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21, + 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 28, 29, + 29, 30, 29, 
28, 28, 27, 25, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 26, 26, 26, 27, 27, 26, + 26, 25, 23, 23, 20, 20, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 22, 21, + 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, + 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, + 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 16, 17, 17, 17, 17, + 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 9, 9, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, + 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, + 8, 13, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 8, 13, 13, 13, 14, 14, 14, + 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, + 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, + 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, + /* Size 32x16 */ + 32, 33, 33, 32, 31, 28, 26, 23, 20, 19, 16, 16, 13, 13, 12, 11, 33, 32, + 32, 32, 31, 29, 26, 24, 21, 20, 17, 16, 14, 13, 12, 12, 33, 32, 32, 32, + 31, 29, 26, 24, 21, 20, 17, 17, 14, 13, 12, 12, 33, 32, 32, 31, 31, 30, + 27, 25, 22, 21, 17, 17, 14, 14, 13, 13, 33, 32, 32, 31, 30, 29, 27, 25, + 22, 21, 17, 17, 14, 14, 13, 13, 32, 32, 31, 30, 29, 28, 26, 24, 21, 20, + 17, 17, 14, 14, 13, 13, 32, 32, 31, 29, 28, 28, 26, 24, 21, 21, 18, 17, + 15, 14, 13, 13, 32, 31, 31, 29, 28, 27, 25, 24, 21, 21, 18, 17, 15, 15, + 14, 13, 31, 31, 30, 28, 27, 25, 23, 22, 20, 19, 17, 16, 14, 14, 13, 13, + 30, 30, 30, 28, 26, 24, 23, 21, 19, 19, 16, 16, 14, 14, 13, 12, 28, 30, + 29, 27, 24, 21, 20, 19, 18, 17, 15, 15, 13, 13, 12, 12, 28, 29, 29, 27, + 24, 21, 20, 19, 17, 17, 15, 15, 13, 13, 12, 12, 26, 28, 27, 26, 23, 20, + 19, 18, 16, 16, 14, 14, 12, 12, 12, 12, 26, 27, 26, 25, 23, 20, 18, 17, + 16, 15, 14, 13, 12, 12, 11, 11, 23, 25, 25, 24, 22, 19, 17, 16, 15, 14, + 13, 13, 11, 11, 11, 11, 22, 24, 24, 23, 21, 19, 17, 16, 14, 14, 12, 12, + 11, 11, 11, 10, 21, 23, 23, 22, 20, 18, 17, 15, 14, 13, 12, 12, 11, 10, + 10, 10, 20, 21, 21, 21, 20, 17, 16, 15, 13, 13, 11, 11, 10, 10, 10, 10, + 19, 21, 21, 20, 19, 17, 16, 14, 13, 12, 11, 11, 10, 10, 9, 10, 18, 19, + 19, 19, 18, 16, 15, 14, 12, 12, 11, 10, 9, 9, 9, 9, 18, 19, 19, 19, 18, + 16, 15, 14, 12, 12, 10, 10, 9, 9, 9, 9, 16, 17, 17, 18, 17, 15, 14, 13, + 12, 11, 10, 10, 9, 9, 8, 8, 16, 17, 17, 17, 16, 15, 14, 13, 11, 11, 10, + 10, 9, 8, 8, 8, 14, 16, 16, 16, 15, 14, 13, 12, 11, 11, 9, 9, 8, 8, 8, + 8, 14, 15, 15, 16, 15, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 13, 14, 14, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 13, 14, 14, 14, 14, 13, + 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, 14, 13, 12, 11, 10, + 10, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 7, + 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 7, 11, 12, + 12, 13, 13, 12, 11, 10, 10, 9, 9, 8, 7, 7, 7, 7, 11, 12, 12, 13, 13, 11, + 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, + /* Size 4x16 */ + 33, 32, 32, 32, 31, 30, 28, 25, 23, 21, 19, 17, 15, 14, 13, 12, 28, 29, + 29, 28, 25, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 19, 20, 
21, 21, + 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 13, 13, 14, 14, 14, 13, + 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, + /* Size 16x4 */ + 33, 28, 19, 13, 32, 29, 20, 13, 32, 29, 21, 14, 32, 28, 21, 14, 31, 25, + 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, 23, 18, 13, 10, + 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 8, 15, 14, 10, 8, 14, 13, 10, + 7, 13, 12, 9, 7, 12, 12, 9, 7, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, + 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 31, 31, 31, 31, 30, 29, 28, 28, + 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, + 14, 14, 13, 13, 13, 13, 26, 26, 26, 27, 27, 26, 26, 25, 23, 23, 20, 20, + 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14, + 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 16, 17, + 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, + 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 9, 9, 13, 14, 14, 14, 14, 14, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, + /* Size 32x8 */ + 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, + 31, 26, 21, 17, 14, 12, 33, 32, 31, 27, 22, 17, 14, 13, 33, 32, 30, 27, + 22, 17, 14, 13, 32, 31, 29, 26, 21, 17, 14, 13, 32, 31, 28, 26, 21, 18, + 15, 13, 32, 31, 28, 25, 21, 18, 15, 14, 31, 30, 27, 23, 20, 17, 14, 13, + 30, 30, 26, 23, 19, 16, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 28, 29, + 24, 20, 17, 15, 13, 12, 26, 27, 23, 19, 16, 14, 12, 12, 26, 26, 23, 18, + 16, 14, 12, 11, 23, 25, 22, 17, 15, 13, 11, 11, 22, 24, 21, 17, 14, 12, + 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, 20, 21, 20, 16, 13, 11, 10, 10, + 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 11, 9, 9, 18, 19, 18, + 15, 12, 10, 9, 9, 16, 17, 17, 14, 12, 10, 9, 8, 16, 17, 16, 14, 11, 10, + 9, 8, 14, 16, 15, 13, 11, 9, 8, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, + 14, 12, 10, 9, 8, 7, 13, 14, 14, 12, 10, 9, 8, 7, 12, 14, 14, 12, 10, 8, + 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, + 13, 11, 10, 9, 7, 7, 11, 12, 13, 11, 10, 9, 8, 7 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11, + /* Size 8x8 */ + 33, 30, 24, 22, 21, 19, 17, 16, 30, 26, 23, 22, 22, 20, 18, 17, 24, 23, + 21, 21, 20, 19, 18, 17, 22, 22, 21, 19, 18, 17, 16, 16, 21, 22, 20, 18, + 16, 15, 14, 14, 19, 20, 19, 17, 15, 13, 12, 12, 17, 18, 18, 16, 14, 12, + 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, + /* Size 16x16 */ + 32, 33, 33, 29, 26, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 33, 33, + 32, 28, 25, 22, 22, 22, 21, 21, 20, 19, 18, 17, 17, 16, 33, 32, 30, 26, + 24, 22, 22, 23, 22, 22, 21, 20, 19, 18, 17, 17, 29, 28, 26, 23, 22, 22, + 22, 23, 22, 22, 21, 20, 19, 18, 18, 17, 26, 25, 24, 22, 21, 20, 21, 21, + 21, 21, 20, 19, 19, 18, 17, 17, 21, 22, 22, 22, 20, 19, 19, 19, 19, 19, + 19, 18, 17, 17, 17, 17, 21, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, + 17, 16, 16, 16, 21, 22, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, + 15, 15, 20, 21, 22, 22, 21, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 20, 21, 22, 22, 21, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 19, 20, + 21, 21, 20, 19, 18, 17, 
16, 15, 14, 14, 13, 13, 13, 13, 18, 19, 20, 20, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 17, 18, 19, 19, 19, 17, + 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 16, 17, 18, 18, 18, 17, 16, 15, + 14, 14, 13, 12, 12, 11, 11, 11, 16, 17, 17, 18, 17, 17, 16, 15, 14, 13, + 13, 12, 12, 11, 11, 11, 15, 16, 17, 17, 17, 17, 16, 15, 14, 13, 13, 12, + 12, 11, 11, 10, + /* Size 32x32 */ + 32, 33, 33, 34, 33, 31, 29, 28, 26, 25, 21, 21, 21, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 33, + 32, 30, 28, 27, 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 33, 33, 33, 33, 32, 29, 28, 26, + 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 34, 33, 33, 32, 31, 29, 27, 26, 24, 24, 22, 22, + 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 33, 32, 32, 31, 30, 28, 26, 25, 24, 24, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 31, 30, + 29, 29, 28, 26, 25, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 29, 28, 28, 27, 26, 25, + 23, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 19, 19, 18, 18, 18, 18, 17, 17, 28, 27, 26, 26, 25, 24, 22, 22, 22, 22, + 21, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, + 18, 18, 18, 18, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 21, 21, 21, + 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 25, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, + 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, + 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 16, 16, 21, 22, 22, 23, 23, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, + 22, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 20, 22, 22, 23, 22, 22, + 22, 22, 21, 21, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, + 14, 14, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 20, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 14, 19, 20, 20, 21, + 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, + 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, + 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, + 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, + 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, + 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 17, 18, + 18, 19, 19, 19, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 
19, 19, + 19, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, + 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, + 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 16, 16, 17, 17, + 17, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 10, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, + /* Size 4x8 */ + 33, 28, 24, 23, 22, 20, 18, 17, 22, 22, 20, 19, 19, 18, 17, 16, 20, 22, + 20, 18, 16, 15, 14, 13, 17, 18, 18, 16, 14, 12, 11, 11, + /* Size 8x4 */ + 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 23, 19, 18, 16, 22, 19, + 16, 14, 20, 18, 15, 12, 18, 17, 14, 11, 17, 16, 13, 11, + /* Size 8x16 */ + 32, 33, 33, 29, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 32, 31, + 29, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 17, 26, 25, 24, 22, + 21, 20, 21, 21, 21, 20, 20, 19, 18, 18, 17, 17, 21, 22, 22, 22, 21, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 20, 21, 22, 22, 21, 19, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 14, 18, 19, 20, 20, 20, 18, 17, 16, 15, 14, + 14, 13, 13, 12, 12, 13, 16, 17, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, + 12, 12, 11, 12, 15, 16, 17, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, + 11, 11, + /* Size 16x8 */ + 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 29, + 24, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, + 21, 20, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, 21, 19, 18, 17, + 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, + 20, 21, 20, 18, 16, 14, 14, 13, 19, 20, 20, 17, 15, 14, 13, 13, 18, 20, + 19, 17, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 18, 16, + 14, 12, 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, + 12, 11, + /* Size 16x32 */ + 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 32, + 31, 28, 27, 26, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 21, 20, 20, 20, + 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 32, 31, 31, 31, 29, 28, 26, 25, + 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 28, 27, 27, 26, 25, 24, 23, 22, 22, 22, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 23, 22, 22, + 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, + 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 23, 23, 21, 21, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 20, 20, 21, 21, 21, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 18, 19, 19, 20, + 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, + 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, + 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 11, 10, + /* Size 32x16 */ + 32, 33, 32, 28, 26, 21, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 33, 33, + 31, 27, 25, 22, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 33, 33, 31, 27, + 25, 22, 22, 22, 21, 21, 19, 19, 17, 17, 16, 16, 34, 32, 31, 26, 24, 22, + 23, 23, 22, 21, 20, 20, 18, 18, 17, 17, 33, 31, 29, 25, 24, 22, 22, 23, + 22, 21, 20, 20, 18, 18, 17, 17, 31, 28, 28, 24, 23, 22, 22, 22, 22, 22, + 20, 20, 18, 18, 17, 17, 29, 27, 26, 23, 22, 22, 22, 23, 22, 22, 20, 20, + 19, 18, 18, 17, 28, 26, 25, 22, 22, 22, 22, 23, 22, 22, 20, 20, 19, 19, + 18, 18, 25, 24, 24, 22, 21, 21, 21, 21, 21, 20, 20, 19, 18, 18, 17, 18, + 24, 24, 24, 22, 21, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 21, 22, + 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 22, 21, + 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 21, 22, 22, 22, 21, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 21, 23, 22, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 16, 16, 16, 15, 21, 23, 23, 22, 21, 19, 18, 18, 17, 17, + 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 21, 19, 18, 17, 17, 17, 16, 16, + 15, 15, 15, 15, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, + 14, 14, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, + 20, 21, 21, 22, 20, 19, 18, 17, 16, 16, 14, 14, 14, 14, 13, 14, 19, 20, + 21, 21, 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 19, 20, 20, 21, + 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 18, 20, 20, 20, 20, 18, + 17, 16, 15, 15, 13, 13, 12, 12, 12, 12, 18, 20, 20, 20, 19, 18, 17, 16, + 15, 14, 13, 13, 12, 12, 12, 12, 17, 19, 19, 20, 19, 18, 17, 16, 14, 14, + 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 16, 14, 14, 13, 13, + 12, 12, 12, 12, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, + 11, 11, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, 11, 11, + 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, 11, 16, 17, + 17, 18, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, + 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, + 16, 14, 14, 13, 13, 12, 12, 11, 11, 11, 15, 17, 17, 17, 17, 16, 16, 14, + 14, 13, 13, 12, 12, 11, 11, 10, + /* Size 4x16 */ + 33, 33, 31, 27, 24, 22, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 21, 22, + 22, 22, 21, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 16, 20, 21, 21, 22, + 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 16, 17, 18, 18, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 11, 11, 11, + /* Size 16x4 */ + 33, 21, 20, 16, 33, 22, 21, 17, 31, 22, 21, 18, 27, 22, 22, 18, 24, 21, + 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, 
22, 19, 16, 14, + 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 14, 12, 18, 17, 14, 12, 18, 17, + 14, 11, 17, 17, 13, 11, 17, 16, 13, 11, + /* Size 8x32 */ + 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 32, 31, 31, 31, + 29, 28, 26, 25, 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 26, 25, 25, 24, 24, 23, 22, 22, + 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, + 18, 18, 17, 17, 17, 17, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, + 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, + 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19, + 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 16, 17, 17, 18, 18, 18, + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 12, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, + /* Size 32x8 */ + 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 31, + 25, 22, 21, 19, 17, 16, 34, 31, 24, 23, 22, 20, 18, 17, 33, 29, 24, 22, + 22, 20, 18, 17, 31, 28, 23, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, + 19, 18, 28, 25, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, 21, 20, 18, 17, + 24, 24, 21, 21, 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, + 20, 19, 19, 18, 17, 16, 21, 22, 21, 19, 18, 17, 16, 16, 21, 22, 21, 19, + 18, 17, 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 21, 22, 21, 18, 17, 16, + 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, 20, 22, 21, 18, 16, 15, 14, 14, + 20, 21, 20, 18, 16, 14, 14, 13, 19, 21, 20, 17, 15, 14, 13, 13, 19, 20, + 20, 17, 15, 14, 13, 13, 18, 20, 20, 17, 15, 13, 12, 12, 18, 20, 19, 17, + 15, 13, 12, 12, 17, 19, 19, 17, 14, 13, 12, 12, 17, 19, 18, 16, 14, 13, + 12, 12, 16, 18, 18, 16, 14, 12, 12, 11, 16, 18, 18, 16, 14, 12, 12, 11, + 16, 18, 18, 16, 14, 12, 11, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, + 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, 12, 11, 15, 17, 17, 16, + 14, 13, 12, 11 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8, + /* Size 8x8 */ + 32, 32, 30, 27, 22, 18, 15, 13, 32, 31, 29, 26, 23, 19, 16, 14, 30, 29, + 26, 23, 20, 18, 15, 13, 27, 26, 23, 19, 17, 15, 13, 12, 22, 23, 20, 17, + 14, 13, 11, 10, 18, 19, 18, 15, 13, 11, 10, 9, 15, 16, 15, 13, 11, 10, + 9, 8, 13, 14, 13, 12, 10, 9, 8, 7, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 30, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 33, 32, + 32, 32, 32, 30, 29, 27, 24, 22, 20, 18, 17, 15, 13, 13, 33, 32, 32, 32, + 32, 31, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 33, 32, 32, 31, 30, 29, + 28, 26, 24, 23, 20, 19, 17, 16, 14, 14, 32, 32, 32, 30, 29, 28, 27, 26, + 24, 22, 21, 19, 18, 16, 15, 14, 30, 30, 31, 29, 28, 26, 24, 23, 22, 20, + 19, 18, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 19, 18, 17, 16, 15, 14, 13, + 12, 12, 23, 24, 25, 24, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11, + 21, 22, 23, 23, 22, 20, 18, 17, 15, 14, 13, 13, 12, 11, 11, 10, 19, 20, + 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 17, 18, 19, 19, + 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 16, 17, 17, 17, 18, 16, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 14, 15, 16, 16, 16, 15, 
14, 13, 12, + 11, 11, 10, 9, 9, 8, 8, 13, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, + 9, 8, 8, 7, 12, 13, 14, 14, 14, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, + 18, 17, 17, 15, 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, 18, 17, 17, 15, + 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, + 14, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, + 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, + 20, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 31, + 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, 20, 19, 19, 17, + 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, + 28, 27, 27, 26, 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, + 15, 14, 14, 14, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, + 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, + 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, + 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 30, 30, 30, 31, + 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, 20, 19, 19, 18, + 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 28, 29, 29, 30, 30, 28, 28, 27, + 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 13, 12, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 26, 27, + 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 23, 24, 24, 25, 25, 24, + 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 12, 12, 11, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, + 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, + 17, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, + 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 14, + 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 19, 20, 20, 21, + 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 20, 21, + 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, + 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 16, 17, 17, 17, + 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, + 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 
8, 8, 8, 14, 15, 15, 16, + 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, + 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, + 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, + 7, 7, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 14, + 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 12, + 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + /* Size 4x8 */ + 32, 32, 30, 28, 23, 19, 16, 14, 29, 28, 24, 20, 18, 16, 14, 13, 20, 20, + 19, 16, 13, 12, 11, 10, 14, 14, 14, 12, 11, 9, 8, 8, + /* Size 8x4 */ + 32, 29, 20, 14, 32, 28, 20, 14, 30, 24, 19, 14, 28, 20, 16, 12, 23, 18, + 13, 11, 19, 16, 12, 9, 16, 14, 11, 8, 14, 13, 10, 8, + /* Size 8x16 */ + 32, 33, 33, 32, 32, 30, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 33, 32, + 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 32, 32, 31, 30, + 29, 28, 27, 26, 24, 22, 20, 19, 18, 16, 15, 14, 28, 29, 30, 28, 27, 24, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 23, 24, 25, 24, 24, 21, 19, 18, + 16, 15, 14, 14, 13, 12, 11, 11, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13, + 12, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 16, 15, 14, 13, 12, 11, 10, + 10, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, + /* Size 16x8 */ + 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, + 31, 30, 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 31, 29, 27, + 24, 21, 18, 15, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, 27, 21, 19, 17, + 15, 13, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, 13, 11, + 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19, + 19, 16, 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14, + 12, 11, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, 11, 10, 8, + 8, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, + 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, + 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, + 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, + 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, + 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 28, 29, + 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, + 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, + 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, + 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, + 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, + 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 10, 10, 10, 
10, 9, 16, 17, 17, 17, 17, 17, 17, 18, + 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, + 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 13, + 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 14, 14, 14, 15, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 8, 8, 8, 7, 12, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 12, 12, + 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + /* Size 32x16 */ + 32, 33, 33, 32, 32, 28, 28, 23, 23, 19, 19, 16, 16, 13, 13, 12, 33, 32, + 32, 32, 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 32, + 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 31, 31, 30, + 30, 25, 25, 21, 21, 17, 17, 14, 14, 13, 33, 32, 32, 31, 31, 30, 30, 25, + 25, 21, 21, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, + 20, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, 20, 17, + 17, 14, 14, 13, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, + 15, 14, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, 15, 14, + 30, 30, 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 30, 30, + 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 28, 30, 30, 27, + 27, 21, 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 28, 30, 30, 27, 27, 21, + 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 26, 28, 28, 26, 26, 20, 20, 18, + 18, 16, 16, 14, 14, 12, 12, 12, 26, 28, 28, 26, 26, 20, 20, 18, 18, 16, + 16, 14, 14, 12, 12, 12, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, + 13, 11, 11, 11, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, 13, 11, + 11, 11, 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, + 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, 19, 21, + 21, 20, 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 19, 21, 21, 20, + 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 18, 19, 19, 19, 19, 16, + 16, 14, 14, 12, 12, 10, 10, 9, 9, 9, 18, 19, 19, 19, 19, 16, 16, 14, 14, + 12, 12, 10, 10, 9, 9, 9, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, + 10, 9, 9, 8, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9, + 8, 14, 16, 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 14, 16, + 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 13, 14, 14, 15, 15, + 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 13, 14, 14, 15, 15, 13, 13, 11, + 11, 10, 10, 9, 9, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, + 8, 8, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, 8, 8, 8, 8, + 7, 12, 13, 13, 13, 13, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 7, + /* Size 4x16 */ + 33, 32, 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 28, 29, + 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 19, 20, 21, 20, + 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 13, 14, 14, 14, 15, 14, + 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, + /* Size 16x4 */ + 33, 28, 19, 13, 32, 29, 20, 14, 32, 30, 21, 14, 32, 28, 20, 14, 31, 27, + 21, 15, 30, 24, 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, + 23, 18, 13, 11, 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 9, 16, 14, + 11, 8, 14, 13, 10, 8, 14, 13, 10, 8, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, + 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 
+ 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 19, 20, + 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, + 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, + 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, + /* Size 32x8 */ + 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, + 32, 29, 24, 20, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 33, 32, 31, 30, + 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 32, 30, 28, 24, 20, + 17, 14, 32, 31, 29, 27, 24, 21, 18, 15, 32, 31, 29, 27, 24, 21, 18, 15, + 30, 30, 28, 24, 21, 19, 16, 14, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, + 27, 21, 19, 17, 15, 13, 28, 30, 27, 21, 19, 17, 15, 13, 26, 28, 26, 20, + 18, 16, 14, 12, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, + 13, 11, 23, 25, 24, 19, 16, 14, 13, 11, 21, 23, 22, 18, 15, 13, 12, 11, + 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 19, 21, + 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 10, 9, 18, 19, 19, 16, + 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 16, 17, 18, 15, 13, 11, + 10, 9, 14, 16, 16, 14, 12, 11, 9, 8, 14, 16, 16, 14, 12, 11, 9, 8, 13, + 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, + 11, 10, 8, 8, 12, 14, 14, 13, 11, 10, 8, 8, 12, 13, 13, 12, 11, 9, 8, + 7 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 22, 22, 18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12, + /* Size 8x8 */ + 33, 30, 24, 22, 21, 20, 18, 17, 30, 26, 23, 22, 22, 21, 19, 18, 24, 23, + 21, 21, 20, 20, 19, 18, 22, 22, 21, 19, 18, 18, 17, 16, 21, 22, 20, 18, + 17, 16, 15, 14, 20, 21, 20, 18, 16, 14, 14, 13, 18, 19, 19, 17, 15, 14, + 12, 12, 17, 18, 18, 16, 14, 13, 12, 11, + /* Size 16x16 */ + 32, 33, 34, 31, 28, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33, + 33, 30, 27, 24, 22, 22, 22, 21, 20, 20, 19, 18, 17, 17, 34, 33, 32, 29, + 26, 24, 22, 23, 23, 22, 22, 21, 20, 19, 18, 18, 31, 30, 29, 26, 24, 23, + 22, 22, 23, 22, 22, 21, 20, 19, 18, 18, 28, 27, 26, 24, 22, 22, 21, 22, + 23, 22, 22, 21, 20, 20, 19, 19, 25, 24, 24, 23, 22, 21, 20, 21, 21, 20, + 20, 20, 19, 19, 18, 18, 21, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, + 18, 18, 17, 17, 21, 22, 23, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, 17, + 16, 16, 21, 22, 23, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, + 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 20, + 22, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 19, 20, 21, 21, + 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 18, 19, 20, 20, 20, 19, + 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 17, 18, 19, 19, 20, 19, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, + 14, 13, 12, 12, 12, 11, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, + 12, 12, 11, 11, + /* Size 32x32 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 33, 33, 33, 33, + 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 33, 33, 33, 33, 33, 30, 
30, 27, + 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, + 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, + 18, 17, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, + 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, + 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, 30, 29, 29, 26, + 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 18, 18, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, + 22, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 19, 19, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, + 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 18, + 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, + 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 25, 24, 24, 24, + 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, + 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 17, 17, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, + 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, + 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, + 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, + 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 22, + 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 22, 22, 22, 22, 22, + 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, + 14, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, + 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 18, 19, + 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, + 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, + 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 
18, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + /* Size 4x8 */ + 33, 28, 24, 22, 22, 20, 19, 17, 22, 22, 20, 19, 19, 19, 18, 17, 20, 22, + 20, 18, 16, 15, 14, 14, 17, 18, 18, 16, 14, 13, 12, 11, + /* Size 8x4 */ + 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 22, 19, 18, 16, 22, 19, + 16, 14, 20, 19, 15, 13, 19, 18, 14, 12, 17, 17, 14, 11, + /* Size 8x16 */ + 32, 33, 34, 31, 28, 24, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33, + 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 28, 27, 26, 24, + 22, 22, 21, 22, 22, 22, 22, 21, 20, 20, 19, 18, 21, 22, 22, 22, 22, 20, + 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 23, 22, 23, 21, 19, 18, + 18, 17, 17, 17, 16, 16, 15, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, 20, 19, 18, 17, 16, 15, 14, 14, + 13, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, + 12, 11, + /* Size 16x8 */ + 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 34, 32, + 26, 22, 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 28, 26, 22, 22, + 23, 22, 20, 19, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, 21, 19, 19, 19, + 18, 17, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, 16, 15, + 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 19, 20, + 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 17, 19, 20, 18, + 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, 18, 17, 15, 14, + 12, 11, + /* Size 16x32 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 33, 33, 33, 32, 32, 28, 28, 26, + 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, + 18, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, + 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, + 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 21, + 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 
+ 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, + 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 16, 17, 17, 17, + 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 11, + /* Size 32x16 */ + 32, 33, 33, 28, 28, 21, 21, 21, 21, 20, 20, 18, 18, 16, 16, 16, 33, 33, + 33, 27, 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 33, 33, 33, 27, + 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 34, 32, 32, 26, 26, 22, + 22, 23, 23, 21, 21, 20, 20, 18, 18, 17, 34, 32, 32, 26, 26, 22, 22, 23, + 23, 21, 21, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, + 22, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, 22, 20, + 20, 18, 18, 17, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, + 19, 18, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, 19, 18, + 24, 24, 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 24, 24, + 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 21, 22, 22, 21, + 21, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 21, 19, + 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 19, 19, 18, + 18, 18, 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 19, 19, 18, 18, 18, + 18, 17, 17, 16, 16, 16, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, + 16, 15, 15, 15, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 15, 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 21, + 21, 22, 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 20, 21, 21, 22, + 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 19, + 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 19, 20, 20, 21, 21, 19, 19, 17, + 17, 15, 15, 14, 14, 13, 13, 13, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, + 15, 13, 13, 12, 12, 12, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, 15, 13, + 13, 12, 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 16, 18, 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 18, + 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 17, 17, 18, + 18, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 17, + 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 16, 16, 15, + 15, 13, 13, 12, 12, 11, 11, 11, + /* Size 4x16 */ + 33, 33, 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 21, 22, + 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 20, 20, 21, 22, + 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 16, 17, 18, 18, 19, 18, + 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, + /* Size 16x4 */ + 33, 21, 20, 16, 33, 22, 20, 17, 32, 22, 21, 18, 28, 22, 22, 18, 26, 22, + 22, 19, 24, 20, 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, + 22, 19, 16, 14, 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 15, 12, 19, 18, + 14, 12, 18, 17, 14, 12, 17, 17, 14, 11, + /* Size 8x32 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 
+ 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20, + 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, + 12, 11, 11, 11, + /* Size 32x8 */ + 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 33, 33, + 27, 22, 22, 20, 19, 17, 34, 32, 26, 22, 23, 21, 20, 18, 34, 32, 26, 22, + 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 31, 28, 24, 22, 22, 22, + 20, 18, 28, 26, 22, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 20, 19, + 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, + 21, 19, 19, 19, 18, 17, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19, + 18, 18, 17, 16, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, + 16, 15, 21, 23, 22, 19, 18, 17, 16, 15, 20, 22, 22, 19, 17, 16, 15, 14, + 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 20, 21, + 22, 19, 17, 16, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 19, 20, 21, 19, + 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 18, 20, 20, 18, 16, 15, + 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, + 16, 18, 19, 17, 15, 14, 12, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, + 18, 17, 15, 14, 12, 11, 16, 17, 18, 17, 15, 14, 12, 11, 16, 17, 18, 16, + 15, 13, 12, 11 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9, + /* Size 8x8 */ + 33, 32, 32, 29, 24, 20, 17, 15, 32, 32, 31, 29, 25, 21, 18, 16, 32, 31, + 29, 27, 24, 21, 18, 16, 29, 29, 27, 21, 19, 17, 16, 14, 24, 25, 24, 19, + 16, 14, 13, 12, 20, 21, 21, 17, 14, 13, 12, 11, 17, 18, 18, 16, 13, 12, + 10, 9, 15, 16, 16, 14, 12, 11, 9, 9, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 30, 29, 27, 25, 23, 21, 19, 17, 16, 14, 13, 33, 32, + 32, 32, 32, 30, 29, 28, 26, 24, 22, 20, 18, 17, 15, 13, 33, 32, 32, 32, + 32, 31, 30, 28, 27, 25, 23, 21, 19, 17, 16, 14, 33, 32, 32, 31, 30, 29, + 28, 27, 26, 24, 23, 20, 19, 17, 16, 14, 32, 32, 32, 30, 29, 28, 27, 26, + 25, 24, 22, 21, 19, 18, 16, 15, 30, 30, 31, 29, 28, 26, 24, 23, 22, 21, + 20, 19, 18, 16, 15, 14, 29, 29, 30, 28, 27, 24, 22, 21, 20, 19, 19, 17, + 17, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14, + 13, 12, 25, 26, 27, 26, 25, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 12, + 23, 24, 25, 24, 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 21, 22, + 23, 23, 22, 20, 19, 17, 16, 15, 14, 13, 13, 12, 11, 11, 19, 20, 21, 20, + 21, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 17, 18, 19, 19, 19, 18, + 17, 15, 14, 13, 13, 12, 11, 10, 10, 9, 16, 17, 17, 17, 18, 16, 15, 14, + 14, 13, 12, 11, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, + 11, 11, 10, 9, 9, 8, 13, 13, 14, 14, 15, 14, 13, 12, 12, 11, 11, 10, 9, + 9, 8, 8, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 17, 17, 16, 15, 14, 14, 13, 13, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, + 20, 19, 18, 17, 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, 20, 19, 18, 17, + 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 28, 27, 26, 25, 24, 23, 
23, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 28, 28, + 27, 25, 25, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 23, + 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 20, 20, + 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27, + 26, 26, 25, 24, 24, 22, 22, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 15, + 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 25, 25, 24, + 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, 30, 31, + 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, + 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 30, 30, 30, 31, 31, 30, 29, 29, + 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 29, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, + 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 15, 15, 14, 14, + 13, 13, 28, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, 22, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, + 28, 28, 28, 28, 27, 27, 26, 26, 23, 23, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 26, 27, 27, 27, 28, 27, + 26, 26, 26, 25, 23, 23, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 25, 26, 26, 26, 27, 26, 26, 26, 25, 25, + 22, 22, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, + 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 22, 22, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, + 23, 24, 24, 24, 25, 24, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 21, 22, 22, 23, + 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 23, + 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 19, 20, 20, 21, 21, 21, 21, 21, 21, 21, 19, 19, + 18, 17, 17, 16, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 18, 19, + 19, 19, 20, 20, 20, 20, 20, 20, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 17, 18, 18, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 9, 9, 17, 17, 17, 18, 18, 18, 18, 18, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 15, 16, + 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16, + 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, + 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 
8, 8, 8, 13, 13, 13, 14, 14, 14, + 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, + 9, 9, 9, 9, 8, 8, 8, 8, + /* Size 4x8 */ + 32, 32, 31, 29, 25, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 24, 24, + 23, 19, 16, 14, 13, 12, 17, 17, 18, 15, 13, 11, 10, 9, + /* Size 8x4 */ + 32, 30, 24, 17, 32, 30, 24, 17, 31, 28, 23, 18, 29, 24, 19, 15, 25, 21, + 16, 13, 21, 19, 14, 11, 18, 17, 13, 10, 16, 15, 12, 9, + /* Size 8x16 */ + 32, 33, 33, 32, 32, 30, 29, 27, 25, 23, 21, 19, 18, 16, 14, 13, 33, 32, + 32, 32, 31, 30, 30, 28, 26, 24, 23, 21, 19, 17, 16, 14, 32, 32, 31, 30, + 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 16, 15, 28, 29, 30, 28, 27, 24, + 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 23, 24, 25, 24, 24, 21, 20, 18, + 17, 16, 15, 14, 14, 13, 12, 11, 19, 20, 21, 20, 21, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 17, 17, 18, 18, 18, 17, 16, 15, 14, 13, 12, 11, + 11, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, + 8, + /* Size 16x8 */ + 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, + 31, 30, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, 18, 16, 32, 31, 29, 27, + 24, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, + 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 25, 26, 25, 20, 17, 15, 14, 13, + 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, + 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 11, 10, 16, 17, 18, 15, + 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 13, 14, 15, 13, 11, 10, 9, + 8, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 21, + 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18, + 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, + 28, 28, 27, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 17, 17, 16, 16, + 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, + 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, + 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 28, 29, 29, 29, 30, 29, + 28, 28, 27, 26, 24, 24, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, 28, 28, 28, 28, 27, 27, 26, 26, + 23, 23, 21, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, + 23, 24, 24, 24, 25, 24, 24, 24, 24, 23, 21, 21, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 19, 20, 20, 20, + 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, + 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 17, 17, + 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, + 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, + 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, + 14, 14, 13, 13, 13, 12, 
12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, + 8, 8, + /* Size 32x16 */ + 32, 33, 33, 32, 32, 30, 28, 27, 23, 23, 19, 19, 17, 16, 14, 13, 33, 32, + 32, 32, 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, + 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, 32, 31, + 29, 28, 25, 24, 20, 20, 18, 17, 15, 14, 33, 32, 32, 32, 31, 31, 30, 28, + 25, 25, 21, 21, 18, 17, 16, 14, 33, 32, 32, 31, 31, 30, 29, 28, 25, 24, + 21, 21, 18, 17, 16, 14, 32, 32, 32, 31, 30, 29, 28, 27, 24, 24, 20, 20, + 18, 17, 16, 14, 32, 32, 32, 30, 30, 29, 28, 27, 24, 24, 21, 21, 18, 17, + 16, 15, 32, 32, 31, 30, 29, 28, 27, 26, 24, 24, 21, 21, 18, 18, 16, 15, + 32, 31, 31, 30, 29, 28, 26, 26, 24, 23, 20, 20, 18, 18, 16, 15, 30, 30, + 30, 28, 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 30, 30, 30, 28, + 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 29, 30, 30, 28, 27, 24, + 22, 21, 20, 19, 17, 17, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 21, + 19, 19, 17, 17, 16, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 18, 18, + 16, 16, 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 20, 18, 18, 16, 16, + 14, 14, 13, 12, 25, 26, 26, 25, 25, 22, 20, 19, 17, 17, 15, 15, 14, 13, + 13, 12, 23, 25, 25, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, + 23, 24, 24, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, 21, 23, + 23, 22, 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 21, 23, 23, 22, + 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 19, 21, 21, 21, 21, 19, + 17, 17, 14, 14, 13, 13, 12, 11, 10, 10, 19, 20, 21, 20, 20, 19, 17, 16, + 14, 14, 12, 12, 11, 11, 10, 10, 18, 19, 20, 20, 20, 18, 17, 16, 14, 14, + 12, 12, 11, 11, 10, 9, 18, 19, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 9, 17, 18, 18, 18, 18, 17, 16, 15, 13, 13, 12, 12, 10, 10, + 9, 9, 16, 17, 17, 17, 18, 16, 15, 14, 13, 13, 11, 11, 10, 10, 9, 9, 15, + 17, 17, 17, 17, 16, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 14, 16, 16, + 16, 16, 15, 14, 13, 12, 12, 11, 11, 9, 9, 9, 8, 14, 16, 16, 16, 16, 15, + 14, 13, 12, 12, 10, 10, 9, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, + 11, 10, 10, 9, 9, 8, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 10, + 9, 9, 8, 8, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 30, 30, 28, 26, 24, 23, 20, 19, 17, 16, 14, 30, 30, + 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 23, 24, 25, 24, + 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 16, 17, 17, 17, 18, 16, + 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, + /* Size 16x4 */ + 33, 30, 23, 16, 32, 30, 24, 17, 32, 31, 25, 17, 32, 29, 24, 17, 32, 28, + 24, 18, 30, 26, 21, 16, 30, 24, 19, 15, 28, 23, 18, 14, 26, 22, 17, 13, + 24, 21, 16, 13, 23, 20, 15, 12, 20, 19, 14, 11, 19, 18, 13, 10, 17, 16, + 13, 10, 16, 15, 12, 9, 14, 14, 11, 9, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, + 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, + 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, + 18, 17, 16, 16, 15, 15, 28, 29, 29, 29, 30, 29, 28, 28, 27, 26, 24, 24, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, 18, 18, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 19, 20, + 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, + 
18, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 8, 8, + /* Size 32x8 */ + 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, + 32, 29, 24, 20, 17, 15, 33, 32, 32, 29, 25, 20, 18, 15, 33, 32, 31, 30, + 25, 21, 18, 16, 33, 32, 31, 29, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, + 18, 16, 32, 32, 30, 28, 24, 21, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16, + 32, 31, 29, 26, 24, 20, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 30, 30, + 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, 16, 14, 28, 30, 27, 21, + 19, 17, 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 26, 28, 26, 20, 18, 16, + 14, 13, 25, 26, 25, 20, 17, 15, 14, 13, 23, 25, 24, 19, 16, 14, 13, 12, + 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 21, 23, + 22, 18, 15, 13, 12, 11, 19, 21, 21, 17, 14, 13, 12, 10, 19, 21, 20, 17, + 14, 12, 11, 10, 18, 20, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, + 11, 10, 17, 18, 18, 16, 13, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, + 15, 17, 17, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 14, 16, 16, + 14, 12, 10, 9, 9, 13, 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, + 9, 8 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 13, + /* Size 8x8 */ + 33, 32, 27, 21, 22, 20, 19, 18, 32, 29, 24, 22, 23, 22, 20, 19, 27, 24, + 22, 21, 23, 22, 21, 20, 21, 22, 21, 19, 19, 19, 18, 18, 22, 23, 23, 19, + 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 15, 14, 19, 20, 21, 18, 16, 15, + 14, 13, 18, 19, 20, 18, 16, 14, 13, 12, + /* Size 16x16 */ + 32, 33, 34, 31, 28, 25, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, + 33, 30, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 18, 17, 34, 33, 32, 29, + 26, 24, 23, 22, 23, 23, 22, 22, 21, 20, 19, 18, 31, 30, 29, 26, 24, 23, + 22, 22, 22, 23, 22, 22, 21, 20, 19, 18, 28, 27, 26, 24, 22, 22, 22, 22, + 22, 23, 22, 22, 21, 20, 20, 19, 25, 24, 24, 23, 22, 21, 20, 20, 21, 21, + 20, 20, 20, 19, 19, 18, 22, 22, 23, 22, 22, 20, 20, 20, 20, 20, 19, 19, + 19, 18, 18, 17, 21, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, + 21, 22, 23, 23, 23, 21, 20, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 21, + 22, 22, 22, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 20, 20, 22, 22, + 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 19, 20, 21, 21, 21, 20, + 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 18, 19, 20, 20, 20, 19, 18, 18, + 17, 16, 15, 15, 14, 13, 13, 12, 17, 18, 19, 19, 20, 19, 18, 17, 16, 16, + 15, 14, 14, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 17, 16, 15, 14, 14, + 13, 12, 12, 12, + /* Size 32x32 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 25, 25, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 33, 33, 33, 33, + 33, 32, 30, 29, 27, 27, 24, 24, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 33, 33, 33, 33, 33, 31, 30, 29, + 27, 26, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 34, 33, 33, 33, 33, 31, 29, 28, 26, 26, 24, 24, + 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, + 18, 18, 34, 33, 33, 33, 32, 31, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, + 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 32, 32, + 31, 31, 31, 29, 28, 27, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 
31, 30, 30, 29, 29, 28, + 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 19, 19, 18, 18, 30, 29, 29, 28, 28, 27, 26, 25, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 19, 19, 19, 19, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 21, + 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, + 28, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, + 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 25, 24, 24, 24, + 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 25, 24, 24, 24, 24, 24, 23, 23, + 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 18, 18, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, + 17, 17, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 22, + 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, + 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, + 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 21, 22, 22, 22, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 15, 15, 15, 15, 14, 14, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, + 14, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 19, 20, + 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 19, 20, 20, 20, 21, 21, + 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, + 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, + 20, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, + 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, + 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, + 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, 18, 19, + 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, + 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, + 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, + /* Size 4x8 */ + 33, 31, 26, 22, 23, 21, 20, 19, 24, 23, 22, 20, 21, 20, 20, 19, 22, 23, + 22, 19, 17, 17, 16, 16, 19, 20, 20, 18, 
16, 15, 14, 13, + /* Size 8x4 */ + 33, 24, 22, 19, 31, 23, 23, 20, 26, 22, 22, 20, 22, 20, 19, 18, 23, 21, + 17, 16, 21, 20, 17, 15, 20, 20, 16, 14, 19, 19, 16, 13, + /* Size 8x16 */ + 32, 33, 34, 31, 28, 24, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, + 32, 28, 26, 24, 22, 22, 23, 23, 22, 21, 20, 20, 19, 18, 28, 27, 26, 24, + 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 21, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 23, 22, 23, 21, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 18, 19, 20, 20, 21, 19, 19, 18, 17, 16, 16, 15, + 14, 14, 13, 13, 17, 18, 19, 19, 20, 18, 18, 17, 16, 16, 15, 14, 13, 13, + 12, 12, + /* Size 16x8 */ + 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 34, 32, + 26, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, 20, 19, 28, 26, 22, 22, + 23, 22, 21, 20, 24, 24, 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, + 19, 18, 21, 22, 22, 19, 19, 18, 18, 17, 21, 23, 22, 19, 18, 17, 17, 16, + 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 21, + 22, 19, 17, 16, 15, 14, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, + 16, 15, 14, 13, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, + 13, 12, + /* Size 16x32 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, + 32, 31, 29, 28, 26, 26, 24, 24, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 33, 33, 33, 32, 32, 30, 28, 28, + 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, + 20, 19, 19, 19, 18, 18, 29, 28, 28, 28, 28, 26, 26, 24, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, + 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 24, 24, + 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 21, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, + 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, + 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 20, 20, 21, + 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 21, 21, 21, 22, 22, + 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 19, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, + 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, + 18, 18, 19, 19, 19, 19, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, + 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 13, 12, 12, 12, 12, 12, 12, + /* Size 32x16 */ + 32, 33, 33, 29, 28, 24, 21, 21, 21, 21, 20, 20, 18, 18, 17, 16, 33, 33, + 33, 28, 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 33, 33, 33, 28, + 27, 
24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 34, 32, 32, 28, 26, 24, + 22, 22, 22, 22, 21, 21, 20, 20, 18, 18, 34, 32, 32, 28, 26, 24, 22, 22, + 23, 23, 21, 21, 20, 20, 19, 18, 32, 31, 30, 26, 25, 23, 22, 22, 23, 23, + 21, 21, 20, 20, 19, 18, 31, 29, 28, 26, 24, 23, 22, 22, 22, 22, 22, 22, + 20, 20, 19, 18, 30, 28, 28, 24, 23, 23, 22, 22, 23, 22, 22, 22, 20, 20, + 19, 19, 28, 26, 26, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19, + 28, 26, 26, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 24, 24, + 24, 22, 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 24, 24, 24, 22, + 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 22, 22, 21, 20, 19, 19, + 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19, + 18, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, + 17, 17, 17, 16, 21, 22, 23, 22, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, + 16, 16, 21, 23, 23, 23, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, + 21, 22, 23, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 22, + 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 22, 22, 22, + 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 21, 21, 22, 22, 20, + 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 21, 21, 22, 22, 20, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 17, + 15, 15, 14, 14, 14, 13, 19, 20, 20, 21, 21, 20, 19, 18, 17, 16, 15, 15, + 14, 14, 13, 13, 19, 20, 20, 20, 21, 20, 18, 18, 16, 16, 15, 15, 14, 14, + 13, 13, 18, 20, 20, 20, 20, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, + 18, 19, 19, 20, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 13, 12, 17, 19, + 19, 19, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 17, 19, 19, 19, + 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 16, 18, 18, 18, 19, 18, + 17, 17, 15, 15, 14, 14, 13, 12, 12, 12, 16, 18, 18, 18, 19, 18, 17, 17, + 15, 15, 14, 14, 13, 12, 12, 12, + /* Size 4x16 */ + 33, 33, 32, 29, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 18, 24, 24, + 24, 23, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 18, 21, 22, 23, 22, + 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 18, 19, 20, 20, 20, 19, + 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, + /* Size 16x4 */ + 33, 24, 21, 18, 33, 24, 22, 19, 32, 24, 23, 20, 29, 23, 22, 20, 26, 22, + 22, 20, 24, 21, 21, 19, 22, 20, 19, 18, 22, 20, 19, 18, 22, 21, 18, 17, + 22, 21, 17, 16, 22, 20, 17, 15, 21, 20, 17, 14, 20, 20, 16, 14, 20, 19, + 16, 13, 19, 19, 16, 13, 18, 18, 15, 12, + /* Size 8x32 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, + 32, 30, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, + 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, + 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, + 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 20, 20, + 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, + 20, 20, 21, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 13, 13, 13, 17, 18, 18, 18, 19, 19, 19, 19, 20, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 
13, 13, 13, 13, + 12, 12, 12, 12, + /* Size 32x8 */ + 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 33, 33, + 27, 22, 22, 20, 19, 18, 34, 32, 26, 22, 22, 21, 20, 18, 34, 32, 26, 22, + 23, 21, 20, 19, 32, 30, 25, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, + 20, 19, 30, 28, 23, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 21, 20, + 28, 26, 22, 21, 22, 22, 21, 19, 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, + 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 19, + 19, 19, 18, 17, 21, 22, 22, 19, 19, 18, 18, 17, 21, 22, 22, 19, 18, 18, + 17, 17, 21, 23, 22, 19, 18, 17, 17, 16, 21, 23, 22, 19, 18, 17, 16, 16, + 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 22, + 22, 19, 17, 16, 16, 15, 20, 21, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, + 17, 16, 15, 14, 19, 21, 21, 19, 17, 15, 14, 14, 19, 20, 21, 19, 17, 15, + 14, 13, 19, 20, 21, 18, 16, 15, 14, 13, 18, 20, 20, 18, 16, 15, 14, 13, + 18, 19, 20, 18, 16, 14, 13, 13, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, + 19, 17, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12, 16, 18, 19, 17, + 15, 14, 13, 12 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11, + /* Size 8x8 */ + 33, 32, 32, 30, 27, 22, 20, 16, 32, 32, 32, 30, 28, 23, 21, 17, 32, 32, + 29, 28, 26, 23, 21, 18, 30, 30, 28, 24, 22, 20, 18, 16, 27, 28, 26, 22, + 19, 17, 16, 14, 22, 23, 23, 20, 17, 15, 14, 12, 20, 21, 21, 18, 16, 14, + 12, 11, 16, 17, 18, 16, 14, 12, 11, 10, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32, + 32, 32, 32, 32, 30, 29, 27, 26, 24, 22, 20, 19, 18, 17, 33, 32, 32, 32, + 32, 32, 31, 30, 28, 27, 25, 23, 21, 19, 18, 17, 33, 32, 32, 31, 31, 31, + 29, 28, 27, 26, 24, 23, 21, 19, 18, 17, 32, 32, 32, 31, 30, 30, 28, 28, + 26, 26, 24, 23, 21, 19, 19, 17, 32, 32, 32, 31, 30, 29, 28, 27, 26, 25, + 24, 22, 21, 20, 19, 18, 30, 30, 31, 29, 28, 28, 26, 24, 23, 22, 22, 20, + 19, 18, 17, 16, 28, 29, 30, 28, 28, 27, 24, 21, 20, 20, 19, 18, 17, 16, + 16, 15, 27, 27, 28, 27, 26, 26, 23, 20, 20, 19, 18, 17, 16, 15, 15, 14, + 25, 26, 27, 26, 26, 25, 22, 20, 19, 18, 17, 16, 15, 15, 14, 14, 23, 24, + 25, 24, 24, 24, 22, 19, 18, 17, 16, 15, 14, 14, 13, 13, 21, 22, 23, 23, + 23, 22, 20, 18, 17, 16, 15, 14, 13, 13, 12, 12, 19, 20, 21, 21, 21, 21, + 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 18, 19, 19, 19, 19, 20, 18, 16, + 15, 15, 14, 13, 12, 11, 11, 11, 17, 18, 18, 18, 19, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 16, 17, 17, 17, 17, 18, 16, 15, 14, 14, 13, 12, + 11, 11, 10, 10, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 22, + 22, 21, 20, 20, 18, 18, 17, 16, 16, 15, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 23, 22, 21, 20, 20, + 19, 18, 18, 17, 17, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, + 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 28, 28, 27, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 27, 25, + 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, + 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, + 
30, 29, 29, 28, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, + 18, 17, 17, 16, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, + 28, 28, 26, 26, 26, 24, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 16, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, + 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 32, 32, 32, 32, + 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, + 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 31, 31, 31, 31, 31, 31, 30, 29, + 29, 28, 28, 27, 26, 26, 24, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, + 18, 18, 17, 17, 17, 16, 30, 30, 30, 30, 31, 31, 29, 29, 28, 28, 28, 26, + 26, 25, 24, 24, 23, 23, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 23, 23, + 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 28, 29, + 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 28, 29, 29, 29, 30, 30, + 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 14, 27, 27, 27, 28, 28, 28, 27, 27, 26, 26, + 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, + 15, 14, 14, 13, 26, 27, 27, 27, 28, 28, 26, 26, 26, 26, 26, 23, 23, 22, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 25, 26, 26, 26, 27, 27, 26, 26, 26, 25, 25, 23, 22, 21, 20, 20, 19, 19, + 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 23, 24, 24, 24, + 25, 25, 24, 24, 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 24, 25, 25, 24, 24, + 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, + 14, 14, 13, 13, 13, 12, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 21, + 20, 20, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 21, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 18, + 17, 17, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 20, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, + 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 11, 11, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, + 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, + 12, 11, 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, + 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, + 17, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 18, 18, + 18, 18, 18, 18, 19, 19, 19, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 16, 16, 17, 17, 17, 17, 17, 17, + 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 9, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 16, 15, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, + /* Size 4x8 */ + 32, 32, 32, 30, 27, 23, 20, 17, 32, 31, 29, 28, 26, 23, 20, 18, 24, 25, + 24, 20, 18, 16, 14, 13, 18, 19, 20, 17, 15, 13, 12, 11, + /* Size 8x4 */ + 32, 32, 24, 18, 32, 31, 25, 19, 32, 29, 24, 20, 30, 28, 20, 17, 27, 26, + 18, 15, 23, 23, 16, 13, 20, 20, 14, 12, 17, 18, 13, 11, + /* Size 8x16 
*/ + 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32, + 32, 32, 32, 31, 30, 30, 28, 26, 25, 23, 21, 19, 18, 17, 32, 32, 31, 30, + 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 29, 29, 30, 29, 28, 28, + 25, 22, 22, 21, 20, 19, 18, 17, 16, 16, 26, 27, 28, 27, 26, 26, 23, 20, + 20, 19, 18, 17, 16, 15, 15, 14, 23, 24, 25, 24, 24, 24, 21, 19, 18, 17, + 16, 15, 14, 14, 13, 13, 19, 20, 21, 21, 21, 21, 19, 17, 16, 15, 14, 13, + 12, 12, 12, 11, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, + 10, 10, + /* Size 16x8 */ + 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 17, 33, 32, + 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, 21, 17, 32, 32, 30, 28, + 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 30, 30, 28, 25, 23, 21, + 19, 16, 28, 30, 27, 22, 20, 19, 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, + 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 21, 23, + 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, + 15, 14, 12, 11, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, 18, 16, 14, 13, + 11, 10, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 25, 23, + 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, 23, 22, 21, 21, + 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, + 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, + 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, + 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 32, 32, + 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, + 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, + 29, 28, 28, 28, 28, 26, 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, + 18, 18, 17, 17, 16, 16, 16, 15, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, + 27, 25, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 14, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, + 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 25, + 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 20, 21, 21, 21, 22, 22, 21, 21, + 21, 21, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19, + 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, + 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16, + 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 16, 16, + 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, + 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 9, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 29, 28, 26, 23, 23, 20, 19, 18, 16, 16, 33, 32, + 32, 32, 32, 32, 29, 29, 27, 24, 24, 21, 20, 18, 16, 16, 33, 32, 32, 32, + 32, 32, 29, 29, 27, 24, 24, 21, 20, 19, 17, 17, 33, 32, 32, 32, 32, 32, + 30, 29, 28, 25, 25, 21, 20, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, + 28, 25, 25, 22, 21, 19, 17, 17, 33, 
32, 32, 32, 31, 31, 30, 30, 28, 25, + 25, 22, 21, 19, 17, 17, 33, 32, 32, 31, 30, 30, 29, 28, 27, 24, 24, 21, + 21, 19, 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 21, 20, 19, + 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 26, 24, 24, 21, 21, 19, 18, 18, + 32, 32, 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 32, 32, + 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 31, 31, 31, 29, + 28, 28, 26, 25, 24, 22, 22, 20, 19, 18, 17, 17, 30, 30, 30, 29, 28, 28, + 25, 24, 23, 21, 21, 19, 19, 18, 16, 16, 30, 30, 30, 29, 28, 28, 24, 23, + 22, 20, 20, 19, 18, 17, 16, 16, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, + 19, 18, 17, 16, 15, 15, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, 19, 18, + 17, 16, 15, 15, 27, 28, 28, 27, 26, 26, 22, 20, 20, 18, 18, 17, 16, 15, + 14, 14, 26, 27, 28, 26, 26, 26, 21, 20, 19, 18, 18, 16, 16, 15, 14, 14, + 25, 26, 26, 26, 25, 25, 21, 20, 19, 17, 17, 16, 15, 15, 13, 13, 23, 25, + 25, 24, 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 23, 25, 25, 24, + 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 22, 23, 23, 23, 23, 23, + 19, 18, 17, 16, 16, 14, 14, 13, 12, 12, 21, 23, 23, 23, 22, 22, 19, 18, + 17, 15, 15, 14, 13, 13, 12, 12, 20, 22, 22, 22, 22, 22, 19, 18, 17, 15, + 15, 13, 13, 12, 12, 12, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, + 12, 12, 11, 11, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, + 11, 11, 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, + 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 10, 10, 17, 18, + 18, 18, 18, 18, 16, 16, 15, 13, 13, 12, 12, 11, 10, 10, 16, 17, 17, 17, + 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 18, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 15, 16, 16, 16, 17, 17, 15, 14, + 13, 12, 12, 11, 11, 10, 9, 9, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 32, 30, 29, 28, 26, 25, 23, 20, 19, 18, 17, 32, 32, + 31, 30, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 23, 24, 25, 24, + 24, 24, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 18, 19, 19, 19, 19, 20, + 18, 16, 15, 15, 14, 13, 12, 11, 11, 11, + /* Size 16x4 */ + 33, 32, 23, 18, 32, 32, 24, 19, 32, 31, 25, 19, 32, 30, 24, 19, 32, 30, + 24, 19, 32, 29, 24, 20, 30, 28, 21, 18, 29, 27, 19, 16, 28, 26, 18, 15, + 26, 25, 17, 15, 25, 24, 16, 14, 23, 22, 15, 13, 20, 20, 14, 12, 19, 19, + 14, 11, 18, 18, 13, 11, 17, 18, 13, 11, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, + 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, + 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, + 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 23, 24, + 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 19, 20, 20, 20, 21, 21, + 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, + 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 9, + /* Size 32x8 */ + 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 16, 33, 32, + 32, 29, 27, 24, 20, 17, 33, 32, 32, 30, 28, 25, 20, 17, 
33, 32, 31, 30, + 28, 25, 21, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, + 21, 17, 32, 32, 30, 28, 27, 24, 20, 17, 32, 32, 30, 28, 26, 24, 21, 18, + 32, 31, 29, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 31, 31, + 28, 26, 24, 22, 19, 17, 30, 30, 28, 25, 23, 21, 19, 16, 30, 30, 28, 24, + 22, 20, 18, 16, 28, 30, 27, 22, 20, 19, 17, 15, 28, 30, 27, 22, 20, 19, + 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, 26, 28, 26, 21, 19, 18, 16, 14, + 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 23, 25, + 24, 20, 18, 16, 14, 13, 22, 23, 23, 19, 17, 16, 14, 12, 21, 23, 22, 19, + 17, 15, 13, 12, 20, 22, 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, + 12, 11, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 11, + 18, 19, 19, 17, 15, 14, 12, 10, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, + 18, 16, 14, 13, 11, 10, 16, 17, 18, 16, 14, 13, 11, 10, 15, 16, 17, 15, + 13, 12, 11, 9 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14, + /* Size 8x8 */ + 33, 33, 27, 23, 22, 21, 20, 19, 33, 32, 26, 23, 23, 22, 22, 20, 27, 26, + 22, 22, 22, 22, 22, 20, 23, 23, 22, 20, 20, 20, 20, 19, 22, 23, 22, 20, + 19, 18, 18, 17, 21, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, 18, 16, + 16, 15, 19, 20, 20, 19, 17, 16, 15, 13, + /* Size 16x16 */ + 32, 33, 34, 31, 30, 28, 25, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, + 33, 30, 28, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 19, 34, 33, 32, 30, + 28, 26, 24, 22, 23, 23, 23, 22, 22, 21, 20, 20, 31, 30, 30, 28, 26, 24, + 23, 22, 22, 22, 23, 22, 22, 21, 20, 20, 30, 28, 28, 26, 24, 23, 22, 22, + 22, 22, 23, 22, 22, 21, 21, 20, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22, + 23, 22, 22, 21, 21, 20, 25, 24, 24, 23, 22, 22, 21, 20, 20, 21, 21, 20, + 20, 20, 20, 19, 21, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 21, 22, 23, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, + 21, 22, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, + 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, + 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 20, 20, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 19, 20, 21, 21, 21, 21, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 14, 19, 19, 20, 20, 21, 21, 20, 18, 18, 17, + 16, 16, 15, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, + 15, 14, 14, 13, + /* Size 32x32 */ + 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, + 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 22, 22, 22, 22, 22, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 30, + 28, 27, 27, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, + 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 29, 28, 26, 26, 25, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 19, 19, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, + 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 34, 33, + 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 31, 31, 30, 30, 30, 30, + 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, + 22, 22, 21, 21, 20, 20, 20, 19, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, + 20, 20, 20, 19, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 
22, 21, 21, 21, 20, 20, 20, + 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26, + 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 26, 25, 25, 25, 24, 24, 23, 23, + 23, 22, 22, 21, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 20, 19, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 21, + 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19, + 19, 19, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, + 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, + 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 21, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 17, 16, 16, 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, + 16, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 20, 20, 20, 21, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 15, 15, 15, 15, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 15, 15, 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, + 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 19, 19, 19, 20, + 20, 20, 20, 20, 21, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 18, 19, 19, 19, 20, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, + 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, + 13, 13, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, + /* Size 4x8 */ + 33, 32, 26, 23, 22, 22, 21, 19, 27, 26, 22, 22, 22, 22, 22, 20, 22, 23, + 23, 20, 18, 17, 17, 16, 20, 21, 21, 19, 18, 16, 15, 14, + /* Size 8x4 */ + 33, 27, 22, 20, 32, 26, 23, 21, 26, 22, 23, 21, 23, 22, 20, 19, 22, 22, + 18, 18, 22, 22, 17, 16, 21, 22, 17, 15, 19, 20, 16, 14, + /* Size 8x16 */ + 32, 33, 34, 31, 29, 28, 24, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, + 32, 29, 28, 26, 24, 22, 22, 23, 23, 22, 21, 21, 20, 20, 28, 27, 26, 24, + 23, 22, 22, 21, 22, 22, 22, 22, 
22, 21, 21, 20, 23, 23, 23, 22, 22, 22, + 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, 23, 22, 22, 22, 20, 19, + 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18, + 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, + 16, 15, 15, 15, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 14, + 14, 13, + /* Size 16x8 */ + 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 34, 32, + 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, 22, 20, 29, 28, 23, 22, + 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 24, 24, 22, 21, 20, 21, + 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, + 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 20, 22, + 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, + 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, 20, 19, 17, 16, + 15, 13, + /* Size 16x32 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, + 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 33, 33, 33, 32, 32, 32, 29, 28, + 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, + 21, 20, 20, 20, 20, 19, 31, 30, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, + 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, + 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, + 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, + 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, + 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 20, + 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, + 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 18, 19, + 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, + 14, 14, 14, 14, 14, 13, 13, 13, + /* Size 32x16 */ + 32, 33, 33, 31, 28, 28, 23, 21, 21, 21, 21, 20, 20, 19, 18, 18, 33, 33, + 33, 30, 27, 27, 23, 22, 22, 22, 22, 20, 20, 20, 19, 19, 33, 33, 33, 30, + 27, 27, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 33, 33, 32, 30, 26, 26, + 23, 22, 22, 22, 22, 21, 21, 20, 19, 19, 34, 32, 32, 29, 26, 26, 23, 22, + 23, 23, 23, 22, 21, 21, 20, 20, 34, 32, 32, 29, 26, 26, 23, 22, 23, 23, + 23, 22, 21, 21, 20, 20, 31, 30, 29, 28, 24, 24, 22, 22, 22, 23, 23, 22, + 22, 21, 20, 20, 31, 29, 28, 27, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 
+ 20, 20, 29, 28, 28, 26, 23, 23, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, + 28, 26, 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 28, 26, + 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 25, 24, 24, 23, + 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 24, 24, 24, 23, 22, 22, + 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 23, 23, 23, 23, 22, 22, 20, 20, + 20, 20, 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, 18, 17, 17, + 21, 22, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 21, 22, + 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 21, 22, 23, 23, + 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 22, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 22, 22, 20, 19, + 18, 17, 17, 16, 16, 16, 15, 15, 20, 21, 22, 22, 22, 22, 20, 19, 18, 17, + 17, 16, 16, 16, 15, 15, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, + 16, 15, 14, 14, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, 16, 15, + 14, 14, 19, 20, 21, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, + 19, 20, 20, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, 19, 20, + 20, 20, 21, 21, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, + 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 18, 19, 20, 20, 20, 20, + 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 17, 19, 19, 19, 20, 20, 18, 18, + 17, 16, 16, 15, 14, 14, 13, 13, + /* Size 4x16 */ + 33, 33, 32, 30, 28, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 28, 27, + 26, 24, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 21, 22, 23, 23, + 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 19, 20, 21, 21, 21, 21, + 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, + /* Size 16x4 */ + 33, 28, 21, 19, 33, 27, 22, 20, 32, 26, 23, 21, 30, 24, 23, 21, 28, 23, + 23, 21, 26, 22, 23, 21, 24, 22, 21, 20, 22, 21, 19, 19, 22, 22, 19, 18, + 22, 22, 18, 17, 22, 22, 18, 17, 22, 22, 17, 16, 21, 22, 17, 15, 20, 21, + 17, 15, 20, 21, 16, 14, 19, 20, 16, 14, + /* Size 8x32 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 32, + 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, + 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22, + 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 20, 21, 21, 21, + 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 15, 15, 15, 15, 14, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, + 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, + 14, 13, 13, 13, + /* Size 32x8 */ + 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 33, 33, + 27, 23, 22, 22, 20, 19, 33, 32, 26, 23, 22, 22, 21, 19, 34, 32, 26, 23, + 23, 23, 21, 20, 34, 32, 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, + 22, 20, 31, 28, 24, 22, 22, 22, 22, 20, 29, 28, 23, 22, 22, 23, 22, 20, + 28, 26, 22, 
22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 25, 24, + 22, 21, 21, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 19, 23, 23, 22, 20, + 20, 20, 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 20, 19, 19, + 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, 21, 22, 22, 20, 19, 18, 18, 17, + 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 21, 23, + 22, 20, 19, 18, 17, 16, 20, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, + 18, 17, 16, 15, 20, 22, 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, + 16, 14, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, 18, 17, 15, 14, + 19, 20, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, + 20, 19, 17, 16, 15, 13, 18, 20, 20, 19, 17, 16, 15, 13, 17, 19, 20, 18, + 17, 16, 14, 13 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13, + /* Size 8x8 */ + 33, 32, 32, 30, 29, 25, 22, 19, 32, 32, 32, 31, 30, 26, 23, 20, 32, 32, + 30, 29, 28, 25, 23, 20, 30, 31, 29, 26, 24, 22, 20, 19, 29, 30, 28, 24, + 21, 19, 18, 17, 25, 26, 25, 22, 19, 17, 16, 15, 22, 23, 23, 20, 18, 16, + 14, 13, 19, 20, 20, 19, 17, 15, 13, 12, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 33, 32, + 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 33, 32, 32, 32, + 32, 32, 31, 31, 30, 28, 28, 25, 23, 22, 20, 19, 33, 32, 32, 32, 32, 31, + 31, 30, 29, 28, 27, 25, 23, 23, 21, 19, 33, 32, 32, 32, 31, 30, 30, 29, + 28, 27, 26, 24, 23, 22, 20, 19, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26, + 26, 24, 23, 22, 21, 19, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, + 22, 21, 20, 19, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, 22, 20, 20, + 19, 18, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16, + 27, 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 26, 27, + 28, 27, 26, 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 23, 24, 25, 25, + 24, 24, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, + 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 21, 22, 22, 23, 22, 22, 21, 20, + 18, 17, 17, 15, 14, 14, 13, 13, 19, 20, 20, 21, 20, 21, 20, 19, 17, 16, + 16, 14, 14, 13, 12, 12, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, 14, + 13, 13, 12, 11, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 26, 26, 25, + 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, + 22, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, 22, 20, 20, 20, + 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, + 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 20, 20, 19, 19, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 28, 28, + 28, 26, 25, 25, 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, + 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, + 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 28, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 20, 20, 19, 19, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 
32, + 32, 32, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, + 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22, + 22, 21, 21, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, + 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, + 19, 19, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, + 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 30, 30, + 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, + 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 29, 30, 30, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 18, 18, 18, 17, 17, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, + 27, 27, 26, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 16, 16, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24, + 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 20, 20, + 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 26, 26, 27, 27, + 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 26, 26, 27, 27, 28, 28, 27, 26, + 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 24, 25, 25, 25, 26, 26, 26, 25, 25, 25, 24, 24, + 23, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, + 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 22, 22, 20, + 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, + 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, + 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 14, 14, 14, + 13, 13, 13, 13, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, + 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, + 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, + 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 19, 20, 20, 20, + 20, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 19, 19, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 11, 11, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, + /* Size 4x8 */ + 32, 32, 32, 30, 29, 26, 22, 20, 32, 31, 30, 28, 27, 24, 22, 20, 28, 28, + 27, 23, 21, 19, 17, 16, 20, 21, 21, 19, 17, 15, 13, 12, + /* Size 8x4 */ + 32, 32, 28, 20, 32, 31, 28, 21, 32, 30, 27, 21, 30, 28, 23, 19, 29, 27, + 21, 17, 26, 24, 19, 15, 22, 22, 17, 13, 20, 20, 16, 12, + /* Size 8x16 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 33, 32, + 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 32, 32, 32, 31, + 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 32, 31, 31, 31, 30, 28, + 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 28, 29, 29, 29, 28, 27, 26, 24, + 21, 21, 20, 19, 18, 18, 17, 16, 23, 24, 25, 25, 24, 24, 
23, 21, 19, 18, + 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, + 15, 14, 14, 13, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13, + 12, 12, + /* Size 16x8 */ + 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, + 32, 31, 29, 25, 23, 21, 33, 32, 31, 31, 29, 25, 23, 21, 32, 32, 30, 30, + 28, 24, 23, 20, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, + 22, 20, 30, 30, 28, 27, 24, 21, 20, 19, 28, 30, 28, 26, 21, 19, 18, 17, + 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 23, 25, + 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, 15, 14, 21, 22, 22, 21, + 18, 15, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 18, 19, 19, 19, 16, 14, + 13, 12, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, + 24, 24, 23, 22, 22, 20, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, + 22, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 23, 23, 22, 21, 21, 20, + 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, + 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, + 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, + 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, + 22, 21, 21, 20, 20, 19, 19, 19, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, + 27, 27, 26, 24, 24, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 17, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, + 24, 23, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 21, 21, + 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 23, 24, 24, 24, + 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, + 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, + 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, + 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, + 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 20, + 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 11, 11, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 32, 29, 28, 27, 23, 23, 22, 19, 19, 17, 33, 32, + 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 22, 20, 20, 18, 33, 32, 32, 32, + 32, 32, 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, + 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 30, + 29, 28, 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28, + 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 31, 31, 31, 29, 29, 28, 25, 25, + 23, 21, 21, 19, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 23, 21, + 21, 19, 32, 32, 32, 31, 30, 30, 30, 28, 28, 27, 24, 24, 23, 20, 20, 19, + 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 24, 24, 23, 21, 21, 19, 32, 32, + 31, 31, 29, 29, 28, 
27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 32, 31, 31, + 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 31, 31, 31, 29, 28, + 28, 26, 26, 25, 23, 23, 22, 20, 20, 19, 30, 30, 30, 30, 28, 28, 27, 24, + 24, 23, 21, 21, 20, 19, 19, 18, 30, 30, 30, 30, 28, 28, 27, 24, 24, 23, + 21, 21, 20, 19, 19, 18, 29, 30, 30, 30, 28, 28, 26, 23, 23, 22, 20, 20, + 19, 18, 18, 17, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, + 17, 16, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, 17, 16, + 27, 28, 28, 28, 26, 26, 25, 21, 21, 20, 18, 18, 18, 16, 16, 15, 26, 27, + 28, 27, 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 26, 27, 28, 27, + 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 24, 26, 26, 26, 24, 24, + 23, 20, 20, 19, 17, 17, 16, 15, 15, 14, 23, 24, 25, 25, 24, 24, 23, 20, + 19, 18, 16, 16, 16, 14, 14, 14, 23, 24, 25, 25, 24, 24, 23, 20, 19, 18, + 16, 16, 16, 14, 14, 13, 22, 23, 23, 23, 23, 23, 22, 19, 18, 18, 16, 16, + 15, 14, 14, 13, 21, 22, 23, 23, 22, 22, 21, 19, 18, 17, 15, 15, 15, 13, + 13, 13, 21, 22, 22, 22, 22, 22, 21, 18, 18, 17, 15, 15, 14, 13, 13, 13, + 19, 20, 21, 21, 21, 21, 20, 18, 17, 17, 14, 14, 14, 13, 13, 12, 19, 20, + 21, 21, 20, 20, 20, 17, 17, 16, 14, 14, 14, 12, 12, 12, 19, 20, 20, 20, + 20, 20, 19, 17, 17, 16, 14, 14, 13, 12, 12, 12, 18, 19, 19, 19, 19, 19, + 19, 17, 16, 15, 14, 14, 13, 12, 12, 11, 18, 19, 19, 19, 19, 19, 19, 17, + 16, 15, 14, 14, 13, 12, 12, 11, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 19, 32, 32, + 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 20, 19, 27, 28, 28, 28, + 27, 26, 25, 23, 21, 20, 20, 18, 18, 17, 16, 15, 19, 20, 21, 21, 20, 21, + 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, + /* Size 16x4 */ + 33, 32, 27, 19, 32, 32, 28, 20, 32, 32, 28, 21, 32, 31, 28, 21, 32, 30, + 27, 20, 32, 29, 26, 21, 31, 28, 25, 20, 30, 28, 23, 19, 29, 27, 21, 17, + 28, 26, 20, 16, 27, 26, 20, 16, 24, 24, 18, 14, 23, 23, 18, 14, 22, 22, + 17, 13, 20, 20, 16, 12, 19, 19, 15, 12, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, + 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, + 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, + 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 21, 21, 20, 20, 19, + 19, 19, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23, + 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 23, 24, + 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, + 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, + 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, + 12, 12, 12, 12, + /* Size 32x8 */ + 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 22, 20, 33, 32, + 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, + 29, 25, 23, 21, 33, 32, 32, 31, 30, 25, 23, 21, 33, 32, 31, 31, 29, 25, + 23, 21, 32, 32, 31, 30, 28, 24, 23, 21, 32, 32, 30, 30, 28, 24, 23, 20, + 32, 32, 30, 29, 28, 24, 23, 21, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, + 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, 22, 20, 30, 30, 28, 27, + 24, 21, 20, 19, 30, 30, 28, 27, 24, 
21, 20, 19, 29, 30, 28, 26, 23, 20, + 19, 18, 28, 30, 28, 26, 21, 19, 18, 17, 28, 30, 28, 26, 21, 19, 18, 17, + 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 26, 28, + 26, 24, 20, 18, 17, 16, 24, 26, 24, 23, 20, 17, 16, 15, 23, 25, 24, 23, + 19, 16, 16, 14, 23, 25, 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, + 15, 14, 21, 23, 22, 21, 18, 15, 15, 13, 21, 22, 22, 21, 18, 15, 14, 13, + 19, 21, 21, 20, 17, 14, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 19, 20, + 20, 19, 17, 14, 13, 12, 18, 19, 19, 19, 16, 14, 13, 12, 18, 19, 19, 19, + 16, 14, 13, 12 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16, + /* Size 8x8 */ + 33, 33, 29, 24, 21, 22, 21, 20, 33, 32, 28, 24, 22, 23, 22, 21, 29, 28, + 25, 23, 22, 23, 22, 21, 24, 24, 23, 21, 20, 21, 20, 20, 21, 22, 22, 20, + 19, 19, 19, 19, 22, 23, 23, 21, 19, 18, 17, 17, 21, 22, 22, 20, 19, 17, + 17, 16, 20, 21, 21, 20, 19, 17, 16, 15, + /* Size 16x16 */ + 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 33, 32, 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 34, 33, 33, 32, + 29, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 33, 32, 32, 31, 28, 26, + 25, 24, 22, 22, 23, 23, 22, 22, 22, 21, 31, 30, 29, 28, 26, 24, 23, 23, + 22, 22, 22, 23, 22, 22, 22, 21, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22, + 22, 23, 22, 22, 22, 21, 27, 26, 25, 25, 23, 22, 22, 21, 21, 21, 21, 22, + 22, 22, 21, 21, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20, + 20, 20, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22, + 22, 23, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, + 23, 23, 22, 21, 19, 19, 19, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, + 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, + 19, 19, 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 21, 20, 19, 18, + 18, 17, 16, 16, 16, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 18, 18, 17, + 16, 16, 15, 14, + /* Size 32x32 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 27, 25, 25, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 33, 33, 33, 30, 30, 29, 27, 27, 26, 24, 24, 23, 21, 21, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 33, 33, 33, 33, 33, 33, 32, 30, + 30, 29, 27, 27, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 32, 30, 30, 28, 27, 27, + 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, + 20, 20, 34, 33, 33, 33, 33, 33, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, + 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 34, 33, + 33, 33, 33, 32, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, + 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 33, 33, 32, 32, 32, 32, + 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 29, 27, 27, 26, + 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, + 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 28, 27, 26, 26, 24, 24, 23, 23, + 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, + 30, 29, 29, 28, 28, 28, 28, 26, 26, 25, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, 27, 27, + 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24, + 24, 23, 22, 22, 22, 22, 
22, 22, 21, 21, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 21, 21, 27, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, + 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 25, 24, + 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, + 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 17, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, + 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 20, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, 20, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, 21, 21, + 21, 21, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 15, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, + 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 14, 14, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, + /* Size 4x8 */ + 33, 33, 28, 24, 22, 22, 21, 20, 27, 26, 23, 22, 21, 22, 22, 21, 22, 22, + 22, 20, 19, 19, 19, 18, 20, 21, 22, 20, 19, 17, 16, 15, + /* Size 8x4 */ + 33, 27, 22, 20, 33, 26, 22, 21, 28, 23, 22, 22, 24, 22, 20, 20, 22, 21, + 19, 19, 22, 22, 19, 17, 21, 22, 19, 16, 20, 21, 18, 15, + /* Size 8x16 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 32, 31, 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 29, 28, 27, 27, + 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 27, 26, 26, 25, 23, 22, + 22, 21, 21, 21, 21, 22, 21, 21, 21, 20, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 23, 23, 22, 23, 22, 21, 19, 19, + 18, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, + 16, 15, + /* 
Size 16x8 */ + 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, + 27, 26, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, 22, 21, 31, 28, 25, 23, + 22, 22, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, + 22, 21, 24, 24, 22, 21, 20, 21, 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, + 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 23, + 23, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, + 19, 17, 17, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 20, 21, 20, 19, 17, + 16, 15, + /* Size 16x32 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 33, 33, 32, 29, 29, 28, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 33, 33, 33, 33, 32, 32, 31, 29, + 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 21, 21, 21, 20, 20, 33, 32, 32, 32, 32, 31, 31, 28, 28, 28, 25, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, + 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, 25, 24, 23, 23, 22, 22, 22, 22, + 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, + 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, + 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, + 21, 21, 21, 21, 21, 21, 20, 20, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 21, 22, 22, + 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 21, 22, 22, 23, 23, 23, 23, + 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, + 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, + 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, + 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 19, 20, 20, 20, 20, + 20, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 14, 14, + /* Size 32x16 */ + 32, 33, 33, 33, 29, 28, 27, 22, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 33, 32, 28, 27, 26, 22, 22, 22, 21, 21, 21, 20, 20, 19, 33, 33, 33, 32, + 28, 27, 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 33, 33, 33, 32, 28, 27, + 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 34, 33, 32, 32, 27, 26, 26, 23, + 22, 22, 23, 23, 22, 21, 21, 20, 34, 33, 32, 31, 27, 26, 25, 23, 22, 22, + 23, 23, 22, 21, 21, 20, 33, 32, 31, 31, 27, 26, 25, 23, 22, 22, 23, 23, + 22, 21, 21, 20, 31, 29, 29, 28, 25, 24, 24, 22, 22, 22, 23, 23, 22, 22, + 22, 21, 31, 29, 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 30, 28, 28, 28, 24, 23, 23, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, + 26, 25, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, 26, 25, + 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 26, 26, 25, 24, 22, 22, + 22, 21, 21, 21, 22, 22, 22, 21, 21, 20, 24, 24, 24, 
24, 22, 22, 21, 20, + 20, 20, 21, 21, 20, 20, 20, 20, 24, 24, 24, 24, 22, 22, 21, 20, 20, 20, + 21, 21, 20, 20, 20, 20, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, 20, 20, + 20, 20, 20, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 21, 22, + 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 22, 22, + 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 23, 23, 22, 22, + 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 21, 22, 23, 23, 23, 22, 22, 20, + 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 22, 22, 22, 20, 19, 19, + 18, 18, 17, 17, 17, 16, 20, 22, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, + 17, 16, 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, + 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16, + 20, 21, 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 20, 21, + 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 19, 20, 21, 21, + 21, 21, 21, 19, 19, 18, 17, 17, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, + 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 19, 20, 20, 20, 21, 21, 20, 19, + 19, 18, 17, 17, 16, 15, 15, 14, + /* Size 4x16 */ + 33, 33, 33, 32, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 28, 27, + 26, 26, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, + 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 20, 20, 21, 21, 22, 22, + 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, + /* Size 16x4 */ + 33, 28, 21, 20, 33, 27, 22, 20, 33, 26, 22, 21, 32, 26, 22, 21, 29, 24, + 22, 22, 26, 22, 22, 22, 26, 22, 21, 21, 24, 22, 20, 20, 22, 21, 19, 19, + 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, 19, 17, 22, 22, 19, 16, 21, 22, + 19, 16, 21, 22, 18, 16, 20, 21, 18, 15, + /* Size 8x32 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, + 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, + 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, + 22, 22, 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 16, 15, 15, 15, + /* Size 32x8 */ + 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 21, 21, 20, 33, 33, + 28, 26, 22, 22, 21, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, 27, 26, + 22, 23, 22, 21, 34, 32, 27, 25, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, + 22, 21, 31, 29, 25, 24, 22, 23, 22, 22, 31, 28, 25, 23, 22, 22, 22, 22, + 30, 28, 24, 23, 22, 23, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 28, 26, + 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, 22, 21, 24, 24, 22, 21, + 20, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 20, 23, 23, 22, 21, 20, 20, + 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 21, 21, 19, 19, 19, 19, + 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 
22, + 22, 21, 19, 18, 18, 18, 21, 23, 22, 22, 19, 18, 18, 17, 21, 23, 23, 22, + 19, 18, 17, 17, 21, 23, 22, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, + 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, + 20, 21, 22, 21, 19, 17, 16, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 21, + 21, 21, 19, 17, 16, 15, 19, 20, 21, 20, 19, 17, 16, 15, 19, 20, 21, 20, + 19, 17, 16, 15 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16, + /* Size 8x8 */ + 33, 33, 32, 32, 30, 28, 24, 22, 33, 32, 32, 32, 30, 28, 25, 23, 32, 32, + 31, 30, 29, 27, 24, 23, 32, 32, 30, 29, 28, 26, 24, 22, 30, 30, 29, 28, + 25, 23, 21, 20, 28, 28, 27, 26, 23, 20, 18, 17, 24, 25, 24, 24, 21, 18, + 16, 15, 22, 23, 23, 22, 20, 17, 15, 14, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, + 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 29, 27, 26, 24, 23, 23, 33, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 27, 25, 23, 23, 33, 32, 32, 32, 31, 31, 31, 30, + 29, 28, 28, 26, 26, 24, 23, 23, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28, + 28, 26, 26, 24, 23, 23, 32, 32, 32, 32, 31, 30, 29, 28, 28, 27, 27, 26, + 25, 24, 23, 22, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, 26, 24, 24, 23, + 22, 22, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, 20, 20, + 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 28, 29, + 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 26, 27, 27, 28, + 26, 26, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 25, 26, 26, 27, 26, 26, + 25, 24, 22, 20, 20, 19, 18, 17, 17, 16, 23, 24, 24, 25, 24, 24, 24, 23, + 22, 20, 19, 18, 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, + 18, 17, 17, 16, 15, 15, 21, 22, 23, 23, 23, 23, 22, 22, 20, 19, 18, 17, + 16, 15, 15, 14, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, + 26, 26, 26, 24, 24, 23, 22, 22, 22, 20, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, + 24, 24, 23, 22, 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22, + 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 30, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 23, 23, 21, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, + 30, 28, 28, 28, 27, 25, 25, 25, 23, 23, 23, 22, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, + 27, 25, 25, 25, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 24, + 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, + 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 26, 26, 26, 25, 24, + 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 
25, 24, 24, 24, 23, 22, + 22, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, + 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, + 26, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 20, 30, 30, 30, 30, 30, 31, + 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, + 22, 22, 22, 21, 20, 20, 20, 19, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, + 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, + 20, 20, 20, 19, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 27, 26, 26, 25, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 29, 29, 29, 29, 30, 30, 30, 30, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, + 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 28, 29, 29, 29, + 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, 29, 30, 30, 29, + 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 19, 18, 18, 18, 18, 27, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 26, + 26, 26, 25, 23, 23, 23, 21, 20, 20, 20, 20, 20, 19, 18, 18, 18, 18, 17, + 17, 17, 26, 26, 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, + 23, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 26, 26, + 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, 23, 22, 21, 20, + 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 25, 26, 26, 26, 26, 27, + 27, 26, 26, 26, 26, 25, 25, 25, 24, 22, 22, 22, 20, 20, 20, 19, 19, 19, + 18, 17, 17, 17, 17, 16, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, + 24, 24, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, + 16, 15, 15, 15, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 24, + 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, + 23, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, + 19, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 15, 22, 22, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, + 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 21, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, + 15, 15, 15, 14, 14, 14, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 15, 14, + 14, 14, 20, 20, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 19, + 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, + /* Size 4x8 */ + 33, 32, 32, 32, 30, 28, 24, 22, 32, 31, 30, 29, 28, 26, 24, 22, 29, 30, + 28, 27, 24, 21, 19, 18, 24, 25, 24, 24, 21, 18, 16, 15, + /* Size 8x4 */ + 33, 32, 29, 24, 32, 31, 30, 25, 32, 30, 28, 24, 32, 29, 27, 24, 30, 28, + 24, 21, 28, 26, 21, 18, 24, 24, 19, 16, 22, 22, 18, 15, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 33, 32, 32, 32, + 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 32, 32, 32, 31, 30, 30, + 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 29, 29, 30, 30, 29, 28, 28, 26, + 25, 23, 22, 21, 21, 20, 19, 19, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22, + 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, + 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, + 15, 15, + /* Size 16x8 */ + 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, + 32, 32, 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 31, 30, + 29, 28, 
24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, + 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, 21, 20, + 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 26, 28, + 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, 17, 17, 23, 25, 24, 24, + 20, 19, 16, 16, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, 23, 22, 19, 18, + 15, 15, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, + 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, + 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23, + 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 22, 21, 21, 20, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, + 26, 25, 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, + 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, + 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, + 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 26, 26, 26, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 20, 20, 20, 19, 19, 19, 18, 17, + 17, 17, 17, 17, 17, 16, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, + 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, + 15, 15, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, + 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 19, 20, 20, 20, 20, 21, + 21, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 19, 17, 17, 17, 16, 16, 16, + 15, 14, 14, 14, 14, 13, 13, 13, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 23, 23, 22, 19, 33, 33, + 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 22, 20, 33, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, + 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 26, 25, 25, 23, 20, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 30, 27, 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 27, + 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 29, 29, 27, 25, 25, + 23, 21, 33, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 26, 24, 24, 23, 21, + 32, 32, 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, + 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, 32, 32, + 31, 29, 29, 29, 28, 28, 28, 26, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, + 29, 28, 28, 27, 27, 25, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, 29, 28, + 28, 27, 27, 25, 24, 24, 23, 21, 32, 31, 31, 31, 30, 28, 28, 28, 26, 26, + 26, 24, 23, 23, 22, 20, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, + 
21, 21, 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, 21, 21, + 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 24, 24, 24, 22, 21, 21, 20, 19, + 29, 29, 30, 30, 28, 27, 27, 26, 23, 22, 22, 20, 20, 20, 19, 17, 28, 29, + 30, 30, 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 28, 29, 30, 30, + 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 27, 28, 28, 28, 28, 26, + 26, 25, 22, 21, 21, 19, 18, 18, 18, 16, 26, 27, 28, 28, 26, 26, 26, 24, + 21, 20, 20, 19, 18, 18, 17, 16, 26, 27, 28, 28, 26, 26, 26, 24, 21, 20, + 20, 19, 18, 18, 17, 16, 25, 26, 26, 26, 26, 25, 25, 24, 21, 20, 20, 18, + 17, 17, 17, 15, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, + 16, 14, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 16, 14, + 23, 24, 24, 24, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 15, 14, 22, 23, + 23, 23, 23, 23, 23, 22, 19, 18, 18, 17, 16, 16, 15, 14, 21, 22, 23, 23, + 23, 22, 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 21, 22, 23, 23, 23, 22, + 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 20, 21, 22, 22, 21, 21, 21, 20, + 18, 18, 18, 16, 15, 15, 14, 13, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 32, 32, + 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 28, 29, 29, 30, + 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, + 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, + /* Size 16x4 */ + 33, 32, 28, 23, 32, 32, 29, 24, 32, 32, 29, 25, 32, 31, 30, 25, 32, 30, + 28, 24, 32, 30, 28, 24, 32, 29, 27, 24, 31, 28, 26, 23, 30, 28, 24, 21, + 29, 27, 22, 20, 29, 27, 21, 19, 27, 26, 20, 18, 26, 25, 20, 17, 24, 24, + 19, 16, 23, 23, 18, 16, 22, 22, 18, 15, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, + 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, + 22, 21, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 26, 25, + 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 28, 29, + 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, + 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 23, 24, 24, 24, 25, 25, + 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, + 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, + 15, 15, 15, 14, + /* Size 32x8 */ + 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 22, 33, 32, + 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, + 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 32, 31, 30, 30, + 25, 23, 33, 32, 32, 31, 30, 29, 25, 23, 33, 32, 31, 30, 29, 28, 24, 23, + 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, + 31, 29, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 29, + 28, 27, 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, + 21, 20, 30, 30, 29, 28, 25, 24, 21, 20, 30, 30, 29, 28, 24, 24, 21, 20, + 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 28, 30, + 28, 27, 22, 21, 19, 18, 27, 28, 28, 26, 22, 21, 18, 18, 26, 28, 26, 26, + 21, 20, 18, 17, 26, 28, 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, + 17, 17, 23, 25, 
24, 24, 20, 19, 16, 16, 23, 25, 24, 24, 20, 19, 16, 16, + 23, 24, 24, 24, 20, 19, 16, 15, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, + 23, 22, 19, 18, 15, 15, 21, 23, 23, 22, 19, 18, 15, 15, 20, 22, 21, 21, + 18, 18, 15, 14 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17, + /* Size 8x8 */ + 33, 33, 30, 28, 24, 21, 22, 21, 33, 32, 29, 26, 24, 22, 23, 22, 30, 29, + 26, 24, 23, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 24, 24, 23, 22, + 21, 20, 20, 20, 21, 22, 22, 22, 20, 19, 19, 19, 22, 23, 23, 23, 20, 19, + 18, 17, 21, 22, 22, 22, 20, 19, 17, 17, + /* Size 16x16 */ + 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 33, 33, + 33, 33, 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 33, 33, 33, 33, + 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 32, 30, 29, + 26, 25, 24, 23, 22, 23, 23, 23, 22, 22, 31, 30, 30, 30, 28, 27, 24, 24, + 23, 22, 22, 22, 22, 23, 22, 22, 31, 30, 29, 29, 27, 26, 24, 23, 23, 22, + 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 22, + 22, 23, 22, 22, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, 21, 21, 22, 22, + 22, 22, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20, + 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 21, 22, + 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, + 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 23, 22, 22, + 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 21, 22, 22, 23, 23, 23, 23, 22, + 21, 20, 19, 19, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, + 19, 18, 18, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, + 18, 17, 17, 17, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 25, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 30, 30, 28, 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28, + 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, + 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 34, 33, + 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 34, 33, 33, 33, 33, 32, + 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 23, 23, + 23, 23, 23, 23, 22, 22, 22, 22, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, + 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, + 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, + 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 31, 30, 30, 30, + 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 29, 28, 28, 28, 28, 27, 27, 27, + 25, 25, 25, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 27, 26, + 26, 
26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, + 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 21, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, + 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 16, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + /* Size 4x8 */ + 33, 33, 29, 26, 24, 22, 22, 21, 27, 26, 24, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 20, 19, 19, 19, 21, 23, 22, 23, 20, 19, 18, 17, + /* Size 8x4 */ + 33, 27, 22, 21, 33, 26, 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 24, 22, + 20, 20, 22, 22, 19, 19, 22, 22, 19, 18, 21, 22, 19, 17, + /* Size 8x16 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 33, 33, + 32, 32, 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 31, 30, 30, 29, + 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, + 17, 17, + /* Size 16x8 */ + 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, + 30, 26, 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, + 22, 22, 23, 22, 31, 28, 27, 24, 22, 22, 22, 22, 28, 26, 24, 22, 22, 22, + 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, 21, 20, + 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 
22, 21, 20, 19, 19, 19, 21, 22, + 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, 18, 18, 21, 23, 23, 22, + 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, + 17, 17, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 32, 32, 32, 31, + 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, + 23, 23, 22, 22, 22, 22, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, + 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, + 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 27, + 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, + 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, + 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, + 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, + 17, 17, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 21, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 16, 16, + /* Size 32x16 */ + 32, 33, 33, 33, 31, 28, 28, 27, 23, 21, 21, 21, 21, 21, 20, 20, 33, 33, + 33, 33, 31, 27, 27, 26, 23, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, + 30, 27, 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 33, 33, 30, 27, + 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 32, 32, 30, 26, 26, 26, + 23, 22, 22, 22, 22, 22, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, + 22, 23, 23, 23, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, 22, 23, + 23, 23, 22, 21, 33, 32, 31, 31, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23, + 22, 21, 31, 30, 29, 29, 28, 24, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, + 31, 29, 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 31, 29, + 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, + 25, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, + 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 21, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, + 21, 21, 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, + 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 21, 21, 21, 
20, 20, 20, 20, 19, 19, 19, 19, 21, 22, + 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, + 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 21, + 20, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19, + 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, 22, 22, 22, 22, 20, 19, 19, 19, + 18, 18, 18, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, + 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 20, 21, + 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, + 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, + 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21, + 20, 19, 19, 18, 17, 17, 17, 16, + /* Size 4x16 */ + 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 28, 27, + 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 21, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, + 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, + /* Size 16x4 */ + 33, 28, 21, 21, 33, 27, 22, 22, 33, 26, 22, 22, 33, 26, 22, 23, 30, 24, + 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 26, 22, 21, 22, 24, 22, 20, 21, + 22, 21, 20, 19, 22, 21, 19, 19, 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, + 19, 18, 21, 22, 19, 17, 21, 22, 19, 17, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, + 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, + 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 17, 17, + /* Size 32x8 */ + 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 31, 27, 23, 22, 21, 21, 33, 33, + 30, 27, 23, 22, 22, 21, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, 30, 26, + 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 34, 32, 29, 26, 23, 22, + 23, 22, 33, 31, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, 22, 22, 23, 22, + 31, 28, 27, 24, 22, 22, 22, 22, 31, 28, 27, 24, 22, 22, 22, 22, 29, 27, + 25, 23, 22, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 28, 26, 24, 22, + 22, 22, 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, + 21, 20, 24, 24, 23, 22, 21, 20, 21, 20, 24, 24, 23, 22, 20, 20, 20, 20, + 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, + 22, 21, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 19, 19, 21, 22, 22, 22, + 20, 19, 18, 18, 21, 22, 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, + 18, 18, 21, 23, 23, 22, 20, 19, 18, 17, 21, 23, 23, 22, 20, 19, 18, 17, + 21, 23, 23, 22, 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, + 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 
17, 17, 20, 22, 22, 22, + 20, 19, 17, 17 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19, + /* Size 8x8 */ + 33, 33, 32, 32, 31, 30, 28, 25, 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, + 32, 31, 30, 29, 28, 26, 32, 32, 31, 30, 29, 28, 27, 25, 31, 31, 30, 29, + 28, 26, 25, 23, 30, 30, 29, 28, 26, 24, 22, 21, 28, 28, 28, 27, 25, 22, + 20, 19, 25, 26, 26, 25, 23, 21, 19, 18, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 26, 26, 24, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, + 28, 26, 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, + 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, + 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 30, 30, + 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 28, 29, 29, 30, + 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, + 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 26, 27, 27, 28, 28, 26, 26, 26, + 26, 23, 23, 20, 20, 19, 19, 18, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, + 23, 20, 20, 19, 19, 18, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19, + 19, 18, 18, 16, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, + 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, + 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, + 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, + 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, + 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 30, 
30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 26, 25, 24, 24, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, + 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, + 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, 30, 30, 31, 31, + 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, + 23, 23, 23, 22, 22, 22, 29, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, + 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, 23, 22, 22, 22, 22, 21, + 20, 20, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, + 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, + 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, + 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, + 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 27, 27, 27, 27, 26, 26, 26, 25, 23, 23, 23, 22, 21, 21, 21, 20, 20, 20, + 20, 19, 18, 18, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, + 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, + 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 26, 26, 27, 27, + 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22, + 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 25, 25, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 25, 25, 25, 25, 23, 22, 22, 22, 21, 20, 20, 20, 19, + 18, 18, 18, 18, 17, 17, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, + 24, 24, 24, 24, 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, + 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, + 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, + /* Size 4x8 */ + 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, 31, 31, 30, 28, 27, 26, 30, 30, + 30, 28, 27, 25, 23, 22, 26, 27, 27, 26, 24, 22, 20, 18, + /* Size 8x4 */ + 33, 32, 30, 26, 32, 32, 30, 27, 32, 31, 30, 27, 32, 31, 28, 26, 31, 30, + 27, 24, 30, 28, 25, 22, 28, 27, 23, 20, 26, 26, 22, 18, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 32, 32, 32, 31, 31, 30, + 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 27, 27, 26, 26, 24, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, + 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, + 18, 16, + /* Size 16x8 */ + 32, 33, 33, 32, 32, 28, 28, 23, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, + 32, 32, 32, 29, 29, 24, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, + 31, 30, 30, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, + 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, + 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 28, 30, + 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 26, 28, 28, 26, + 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 23, 25, 25, 24, 
24, 19, + 19, 16, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, + 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, + 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 21, 21, 28, 29, 29, 29, + 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, + 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, + 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, + 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, + 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, + 19, 19, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, + 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 23, 24, + 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, + 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, 23, 24, 24, 24, 24, 25, + 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, + 19, 18, 18, 18, 18, 17, 16, 16, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 26, 23, 23, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 30, 29, 29, 29, 26, 24, 24, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, + 29, 29, 29, 27, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 28, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, + 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25, + 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 27, 25, 25, 32, 32, + 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, + 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30, + 30, 28, 28, 28, 28, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, + 27, 27, 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, + 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26, + 24, 24, 31, 31, 31, 31, 31, 30, 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, + 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, + 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, 30, 30, + 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 29, 30, 
30, 30, 30, 28, + 28, 28, 28, 25, 23, 23, 23, 22, 20, 20, 28, 29, 30, 30, 30, 28, 27, 27, + 27, 24, 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, + 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21, + 21, 20, 19, 19, 28, 28, 28, 28, 28, 27, 26, 26, 26, 23, 21, 21, 21, 20, + 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, + 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 26, 27, + 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 25, 26, 26, 26, + 26, 26, 24, 24, 24, 22, 20, 20, 20, 18, 17, 17, 23, 24, 25, 25, 25, 24, + 24, 24, 24, 21, 19, 19, 19, 18, 16, 16, 23, 24, 25, 25, 25, 24, 24, 24, + 24, 21, 19, 19, 19, 18, 16, 16, + /* Size 4x16 */ + 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 30, 30, 30, 31, + 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 21, 26, 27, 27, 28, 28, 26, + 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, + /* Size 16x4 */ + 33, 32, 30, 26, 32, 32, 30, 27, 32, 32, 30, 27, 32, 32, 31, 28, 32, 32, + 31, 28, 32, 31, 29, 26, 32, 31, 29, 26, 32, 30, 28, 26, 32, 30, 28, 26, + 30, 29, 26, 23, 30, 29, 26, 23, 29, 28, 24, 20, 29, 28, 24, 20, 27, 26, + 23, 19, 27, 26, 23, 19, 24, 24, 21, 18, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 28, 26, 25, 25, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, + 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 28, 29, + 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, + 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, + 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, + 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 16, 16, + /* Size 32x8 */ + 32, 33, 33, 32, 32, 28, 28, 23, 33, 33, 33, 32, 32, 29, 29, 24, 33, 32, + 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, + 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 25, 33, 32, 32, 31, 31, 30, + 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, + 33, 32, 32, 31, 31, 29, 29, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, + 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, + 30, 28, 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, + 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 31, 31, 31, 28, 28, 26, 26, 23, + 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, + 30, 28, 28, 24, 24, 21, 29, 30, 30, 28, 28, 23, 23, 20, 28, 30, 30, 27, + 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, + 21, 19, 28, 28, 28, 26, 26, 21, 21, 18, 26, 28, 28, 26, 26, 20, 20, 18, + 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 25, 26, + 26, 24, 24, 20, 20, 17, 23, 25, 25, 24, 24, 19, 19, 16, 23, 25, 25, 24, + 24, 19, 19, 16 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19, + /* Size 8x8 */ + 33, 33, 
32, 29, 26, 23, 21, 21, 33, 33, 31, 28, 25, 23, 22, 22, 32, 31, + 29, 26, 24, 23, 22, 23, 29, 28, 26, 24, 23, 22, 22, 22, 26, 25, 24, 23, + 22, 21, 21, 22, 23, 23, 23, 22, 21, 20, 20, 20, 21, 22, 22, 22, 21, 20, + 19, 19, 21, 22, 23, 22, 22, 20, 19, 18, + /* Size 16x16 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 33, 33, + 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 33, 33, 33, 33, + 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 34, 33, 33, 32, 32, 29, + 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 34, 33, 33, 32, 32, 29, 29, 26, + 26, 24, 24, 22, 22, 23, 23, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, + 23, 22, 22, 22, 22, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, + 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, + 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23, + 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 25, 24, + 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, + 22, 21, 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, + 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19, + 19, 19, 19, 18, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 25, 25, 25, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, + 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, + 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, + 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, + 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 29, 28, 28, 28, 26, 25, 25, 25, 24, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, + 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 29, 29, 28, 28, 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, + 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 
22, 22, 22, + 22, 22, 23, 23, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, 23, 23, 23, 23, + 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, + 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, + 21, 21, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, + 19, 19, 19, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + /* Size 4x8 */ + 33, 33, 31, 28, 26, 23, 22, 22, 30, 29, 28, 25, 23, 22, 22, 22, 24, 24, + 23, 22, 21, 21, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, + /* Size 8x4 */ + 33, 30, 24, 21, 33, 29, 24, 22, 31, 28, 23, 22, 28, 25, 22, 22, 26, 23, + 21, 21, 23, 22, 21, 20, 22, 22, 20, 19, 22, 22, 21, 19, + /* Size 8x16 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 33, 33, + 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, + 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 28, 27, 27, 26, 26, 24, 24, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, + 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, + 18, 18, + /* Size 16x8 */ + 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, + 33, 27, 27, 22, 22, 22, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, + 26, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, + 22, 22, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, + 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 21, 22, + 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 22, + 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, + 19, 18, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 
33, + 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, + 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, + 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, + 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 21, + 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, + 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, + 19, 19, 18, 18, 18, 18, 18, 18, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 31, 28, 28, 28, 24, 21, 21, 21, 21, 21, 21, 33, 33, + 33, 33, 33, 30, 28, 28, 28, 24, 22, 22, 22, 21, 21, 21, 33, 33, 33, 33, + 33, 30, 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, + 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 27, 27, + 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 32, 32, 32, 29, 26, 26, 26, 24, + 22, 22, 22, 22, 22, 22, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, + 22, 23, 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, + 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23, + 32, 31, 30, 30, 30, 28, 25, 25, 25, 23, 22, 22, 22, 22, 23, 23, 31, 30, + 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, + 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, 28, 26, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, 27, 25, 23, 23, + 23, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, + 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 26, 26, 25, 25, 25, 23, 22, 22, 22, 21, 21, 21, 21, 21, 22, 22, + 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, + 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, 24, 24, + 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 21, 21, + 21, 20, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, + 19, 19, 19, 19, 
19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, + 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, + 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, + 23, 22, 22, 22, 22, 21, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, 23, 23, + 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 21, 22, 23, 23, 23, 23, 22, 22, + 22, 21, 19, 19, 19, 18, 18, 18, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 21, 21, 22, 22, 22, 31, 30, + 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 22, 22, 23, 23, 22, + 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, + /* Size 16x4 */ + 33, 31, 24, 21, 33, 30, 24, 22, 33, 30, 24, 22, 33, 29, 24, 23, 33, 29, + 24, 23, 30, 26, 23, 22, 30, 26, 23, 22, 27, 24, 22, 22, 27, 24, 22, 22, + 24, 23, 21, 20, 24, 23, 21, 20, 21, 22, 20, 19, 21, 22, 20, 19, 22, 22, + 20, 19, 22, 22, 20, 19, 22, 23, 21, 18, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 18, + /* Size 32x8 */ + 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 28, 28, 22, 22, 21, 33, 33, + 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, + 27, 22, 22, 22, 33, 32, 32, 26, 26, 22, 22, 22, 34, 32, 32, 26, 26, 22, + 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, + 32, 30, 30, 25, 25, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, + 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 29, 27, 27, 23, + 23, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, + 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 26, 25, 25, 22, 22, 21, 21, 22, + 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, + 24, 22, 22, 20, 20, 21, 23, 23, 23, 22, 22, 20, 20, 20, 21, 22, 22, 21, + 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, + 19, 19, 21, 22, 22, 22, 22, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 18, + 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, + 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, + 22, 19, 19, 18 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22, + /* Size 8x8 */ + 33, 33, 33, 32, 32, 32, 30, 29, 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, + 32, 32, 32, 31, 31, 30, 32, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 30, + 29, 29, 28, 27, 32, 31, 31, 30, 29, 
28, 27, 26, 30, 30, 31, 29, 28, 27, + 26, 24, 29, 29, 30, 28, 27, 26, 24, 21, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, + 29, 29, 28, 28, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, 28, 28, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 31, 31, 31, 31, + 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 30, 30, 30, 30, 31, 31, + 29, 29, 28, 28, 28, 26, 26, 25, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, + 28, 28, 28, 26, 25, 24, 23, 23, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, + 27, 24, 24, 23, 21, 21, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, + 24, 23, 21, 21, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, + 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 28, 27, 27, 27, 27, 26, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 26, + 26, 25, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, + 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 30, 30, + 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, + 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, + 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, + 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 23, 23, 23, 23, 23, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, + 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 21, 28, 29, 29, 29, + 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, + 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, + 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, + 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21, + 21, 21, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 21, 21, 21, 21, 20, + /* Size 4x8 */ + 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 32, 32, + 31, 30, 29, 28, 28, 27, 29, 29, 30, 28, 27, 26, 24, 21, + /* Size 8x4 */ + 33, 33, 32, 29, 32, 32, 32, 29, 32, 32, 31, 30, 32, 32, 30, 28, 32, 31, + 29, 27, 31, 31, 28, 26, 30, 30, 28, 24, 29, 30, 27, 21, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 30, 30, + 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, + 29, 28, 28, 28, 27, 27, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 22, 22, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23, + 21, 21, + /* Size 16x8 */ + 32, 33, 33, 33, 32, 32, 29, 28, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, + 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, + 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 31, 30, 30, + 29, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, + 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 31, 31, + 31, 29, 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, + 28, 28, 24, 23, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, + 22, 21, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, + 30, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, + 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 24, 23, 22, 22, + 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, + 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, + 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, + 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, + 24, 24, 23, 22, 21, 21, 21, 21, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, + 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, + 28, 28, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27, + 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, + 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31, + 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 31, 31, 31, 31, 31, + 30, 29, 28, 28, 28, 28, 26, 26, 26, 26, 31, 31, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 27, 26, 25, 25, 25, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, + 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, + 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 
28, 28, 26, 24, 23, 23, 23, + 29, 29, 30, 30, 30, 30, 28, 28, 27, 27, 27, 25, 23, 22, 22, 22, 28, 29, + 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, + 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, 30, 30, + 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 27, + 26, 26, 26, 24, 22, 21, 21, 21, + /* Size 4x16 */ + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 28, 29, 29, 29, 30, 30, + 28, 28, 28, 27, 27, 25, 24, 23, 21, 21, + /* Size 16x4 */ + 33, 33, 32, 28, 33, 32, 32, 29, 32, 32, 32, 29, 32, 32, 32, 29, 32, 32, + 31, 30, 32, 32, 31, 30, 32, 32, 30, 28, 32, 32, 30, 28, 32, 32, 30, 28, + 32, 31, 29, 27, 32, 31, 29, 27, 31, 31, 28, 25, 30, 30, 28, 24, 30, 30, + 28, 23, 29, 30, 27, 21, 29, 30, 27, 21, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 29, 29, 29, 29, 29, 29, + 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, + 25, 25, 24, 23, 22, 22, 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, + 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, + 21, 21, 21, 21, + /* Size 32x8 */ + 32, 33, 33, 33, 32, 32, 29, 28, 33, 33, 33, 32, 32, 32, 29, 29, 33, 32, + 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, + 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, + 30, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, + 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, + 32, 31, 31, 31, 29, 29, 33, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 31, + 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, + 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 31, 31, 29, 29, 28, 27, + 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, + 31, 30, 29, 29, 28, 27, 32, 31, 31, 30, 28, 28, 26, 26, 31, 31, 31, 29, + 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, + 25, 24, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 24, 23, + 29, 30, 30, 28, 27, 27, 23, 22, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, + 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, 28, 28, 28, 28, + 26, 26, 22, 21 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20, + /* Size 8x8 */ + 33, 33, 34, 30, 28, 26, 24, 21, 33, 33, 33, 30, 28, 26, 24, 22, 34, 33, + 32, 29, 26, 25, 24, 22, 30, 30, 29, 26, 24, 23, 23, 22, 28, 28, 26, 24, + 22, 22, 22, 22, 26, 26, 25, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 21, + 21, 20, 21, 22, 22, 22, 22, 21, 20, 19, + /* Size 16x16 */ + 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 33, 33, + 33, 33, 
33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, 33, 33, + 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 22, 22, 33, 33, 33, 33, 33, 33, + 30, 29, 28, 26, 26, 25, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, + 28, 26, 26, 24, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, + 26, 24, 24, 23, 22, 22, 31, 31, 30, 30, 30, 30, 28, 27, 26, 24, 24, 23, + 23, 23, 22, 22, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, 24, 23, 23, 22, + 22, 22, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, 22, 22, + 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, + 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 26, 25, 25, 25, + 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 25, 24, 24, 24, 24, 24, + 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 19, 19, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 30, 28, + 28, 28, 28, 27, 26, 25, 25, 25, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 31, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, + 23, 22, 21, 21, 21, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, + 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, + 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, + 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, + 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 34, 34, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25, + 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 33, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 30, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 27, 27, 27, 26, 25, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, + 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, + 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 27, 26, 26, 26, 26, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 
23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 22, 27, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, + 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 26, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 21, 25, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, + 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, + 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19, + 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, + /* Size 4x8 */ + 33, 33, 33, 30, 28, 26, 24, 21, 33, 33, 32, 28, 26, 25, 24, 22, 28, 27, + 26, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 21, 20, 19, + /* Size 8x4 */ + 33, 33, 28, 21, 33, 33, 27, 22, 33, 32, 26, 22, 30, 28, 24, 22, 28, 26, + 22, 22, 26, 25, 22, 21, 24, 24, 22, 20, 21, 22, 21, 19, + /* Size 8x16 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 33, 33, + 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 33, 33, 33, 32, + 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 31, 30, 30, 30, 29, 29, + 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, + 22, 22, 22, 22, 21, 21, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, + /* Size 16x8 */ + 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, + 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, + 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 31, 30, 29, 28, 24, 24, + 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, + 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 25, 24, + 24, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, + 22, 22, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, + 20, 19, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, + 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, + 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 
23, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, + 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, + 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, + 23, 23, 23, 22, 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, + 27, 26, 26, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, + 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 26, 25, 25, 25, 25, 25, 25, 24, + 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 21, 21, 20, 20, 20, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, + 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 20, 20, 19, 19, 19, 19, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 26, 23, 21, 21, 21, 33, 33, + 33, 33, 33, 33, 31, 28, 28, 28, 28, 25, 23, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, + 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, + 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27, + 27, 25, 23, 22, 22, 22, 33, 33, 33, 32, 32, 32, 30, 28, 26, 26, 26, 25, + 23, 22, 22, 22, 34, 33, 33, 32, 32, 32, 30, 27, 26, 26, 26, 24, 23, 22, + 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, + 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33, + 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 33, 32, 31, 31, + 31, 31, 28, 26, 25, 25, 25, 24, 23, 22, 22, 22, 31, 30, 30, 29, 29, 29, + 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 26, 24, 23, 23, 23, 23, 22, 22, + 22, 22, 28, 28, 27, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 27, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, + 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, + 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 26, 26, 26, 25, 25, 25, + 24, 22, 22, 22, 22, 21, 21, 21, 21, 21, 25, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 21, 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, + 22, 
22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 19, 19, 19, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, + 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 28, 27, 27, 26, + 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + /* Size 16x4 */ + 33, 33, 28, 21, 33, 33, 27, 22, 33, 33, 27, 22, 33, 32, 26, 22, 33, 32, + 26, 22, 33, 32, 26, 22, 30, 29, 24, 22, 30, 28, 24, 22, 28, 28, 23, 22, + 27, 26, 22, 22, 27, 26, 22, 22, 25, 24, 22, 21, 24, 24, 22, 20, 23, 23, + 22, 20, 21, 22, 21, 19, 21, 22, 21, 19, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, + 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, + 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, + 19, 19, 19, 19, + /* Size 32x8 */ + 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 31, 28, 28, 23, 21, 33, 33, + 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, + 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, + 23, 22, 34, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, + 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 33, 31, + 31, 28, 25, 25, 23, 22, 31, 30, 29, 28, 24, 24, 22, 22, 31, 29, 28, 27, + 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, + 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, 28, 27, 26, 24, 22, 22, 22, 22, + 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, + 26, 24, 22, 22, 22, 22, 26, 26, 25, 24, 22, 22, 21, 21, 25, 24, 24, 23, + 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 24, 24, 24, 23, 22, 22, + 21, 20, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, 22, 22, 20, 20, + 22, 22, 22, 22, 21, 21, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, + 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, + 22, 22, 20, 19 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29, + /* Size 8x8 */ + 33, 33, 33, 33, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, + 31, 31, 30, 29, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, 30, 30, + 29, 28, 31, 31, 31, 31, 29, 29, 28, 27, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 
30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 29, 28, 28, 28, 27, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, + 28, 28, 27, 26, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, + 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 
30, 30, 30, 29, 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, + 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 28, 27, 27, 27, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, + 26, 26, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, + /* Size 4x8 */ + 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 31, 30, 30, 29, 32, 32, 32, 31, 30, 30, 29, 28, + /* Size 8x4 */ + 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, + 31, 30, 32, 32, 30, 30, 32, 31, 30, 29, 31, 31, 29, 28, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, + 28, 27, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, + 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 30, 30, 30, + 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, + 32, 32, 31, 29, 29, 29, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, + 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, + 28, 27, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, + 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 28, 28, 28, 27, 27, 26, 26, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, + 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 29, 29, 29, 29, 29, 28, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, + 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, + 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, + 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, + 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 27, 30, 30, 30, 30, 30, 30, + 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, 30, 30, 30, 30, 30, 30, 30, 30, + 29, 28, 28, 28, 28, 28, 27, 26, 
+ /* Size 4x16 */ + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 28, 32, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, + /* Size 16x4 */ + 33, 33, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 31, 31, 32, 32, 31, 30, + 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 30, 29, 32, 31, 30, 29, 32, 31, + 30, 29, 31, 31, 29, 28, 30, 30, 28, 28, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 27, 27, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 31, 33, 33, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, + 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, + 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, + 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, + 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 31, + 31, 30, 33, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, + 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, + 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, + 31, 29, 29, 29, 32, 32, 31, 31, 31, 29, 29, 28, 32, 32, 31, 31, 30, 29, + 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, + 32, 32, 31, 31, 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 31, 31, + 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, 28, 27, 30, 30, 30, 30, + 29, 28, 28, 27 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22, + /* Size 8x8 */ + 33, 33, 33, 34, 30, 29, 28, 26, 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, + 33, 33, 29, 28, 26, 25, 34, 33, 33, 32, 29, 28, 26, 24, 30, 30, 29, 29, + 26, 26, 24, 23, 29, 29, 28, 28, 26, 25, 23, 23, 28, 27, 26, 26, 24, 23, + 22, 22, 26, 25, 25, 24, 23, 23, 22, 21, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 29, 29, 28, 26, 26, 26, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, + 29, 27, 26, 26, 25, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 
+ 26, 26, 25, 24, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, + 25, 24, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, 24, 23, + 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 31, 30, + 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 29, 28, 28, 28, + 28, 27, 27, 27, 25, 25, 25, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, + 24, 24, 24, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, + 23, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, + 22, 22, 21, 21, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, 25, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, + 28, 27, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 27, 26, 25, + 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, + 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, + 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27, + 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 27, 26, + 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, + 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, + 29, 29, 29, 28, 28, 26, 26, 26, 26, 26, 26, 25, 24, 24, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, + 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, + 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, + 28, 28, 27, 26, 26, 26, 26, 25, 25, 24, 24, 24, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, + 25, 25, 25, 24, 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, + 24, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, + 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, + 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, + 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, + 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, + 24, 24, 23, 23, 23, 23, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 29, 28, 28, 28, 28, 28, 28, 
28, 28, 28, 27, 27, 27, 27, 27, 26, + 25, 25, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, + 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 28, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 26, 26, 26, 25, 25, 25, 25, 25, + 25, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, + /* Size 4x8 */ + 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, 32, 32, 28, 28, 26, 24, 29, 28, + 28, 28, 26, 24, 23, 23, 28, 27, 26, 26, 24, 23, 22, 22, + /* Size 8x4 */ + 33, 33, 29, 28, 33, 33, 28, 27, 33, 32, 28, 26, 33, 32, 28, 26, 30, 28, + 26, 24, 29, 28, 24, 23, 27, 26, 23, 22, 25, 24, 23, 22, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 33, 33, 33, 33, 32, 32, + 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 27, 27, 25, 24, 24, 24, 23, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, + 24, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, + 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, + 22, 21, + /* Size 16x8 */ + 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 27, 27, 26, 33, 33, + 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, + 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, + 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 31, 30, 29, 29, 28, 24, 24, 24, + 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 29, 28, + 27, 27, 25, 23, 23, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, + 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 24, 24, 24, 24, 23, 22, + 22, 21, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, + 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, + 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, + 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 
28, 28, 28, 28, 28, 27, 26, + 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 25, + 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 26, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, 28, 28, 27, 27, + 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, + 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 21, 21, 21, 21, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 27, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 26, 24, 33, 33, 33, 33, + 33, 33, 33, 32, 31, 29, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, + 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, + 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27, + 26, 24, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, + 34, 33, 33, 32, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, 34, 33, + 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, + 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, + 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, 32, 31, + 29, 28, 26, 26, 26, 26, 25, 24, 33, 33, 32, 32, 31, 31, 31, 31, 29, 27, + 26, 26, 26, 26, 25, 24, 32, 32, 31, 31, 30, 30, 30, 30, 28, 26, 25, 25, + 25, 25, 24, 23, 31, 31, 30, 29, 29, 29, 29, 29, 28, 26, 24, 24, 24, 24, + 24, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, + 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, + 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29, + 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 30, 29, 28, 28, 28, 28, + 28, 28, 26, 24, 23, 23, 23, 23, 23, 23, 29, 28, 28, 27, 27, 27, 27, 26, + 25, 24, 23, 23, 23, 23, 22, 22, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, + 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, + 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, + 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, + 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 26, 26, + 26, 25, 25, 25, 25, 24, 24, 23, 22, 22, 22, 22, 22, 21, 26, 25, 25, 24, + 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 21, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 27, 27, 26, 24, 33, 33, + 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 29, 29, 28, 28, + 28, 28, 28, 27, 
26, 26, 26, 24, 23, 23, 23, 22, 28, 27, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, + /* Size 16x4 */ + 33, 33, 29, 28, 33, 33, 29, 27, 33, 33, 28, 27, 33, 33, 28, 27, 33, 32, + 28, 26, 33, 32, 28, 26, 33, 32, 28, 26, 33, 31, 27, 26, 31, 29, 26, 24, + 30, 28, 26, 24, 30, 28, 26, 24, 28, 27, 24, 23, 27, 26, 23, 22, 27, 26, + 23, 22, 26, 25, 23, 22, 24, 24, 22, 22, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, + 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, + 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 28, 28, + 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, + /* Size 32x8 */ + 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 28, 28, 26, 33, 33, + 33, 33, 31, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, + 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, + 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, 30, 26, 26, 26, + 34, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, + 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, + 29, 26, 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 32, 31, 30, 30, 28, 25, + 25, 24, 31, 30, 29, 29, 28, 24, 24, 24, 31, 29, 28, 28, 27, 24, 24, 23, + 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, + 28, 28, 27, 24, 24, 23, 30, 28, 28, 28, 26, 23, 23, 23, 29, 28, 27, 27, + 25, 23, 23, 22, 28, 27, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, + 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, + 28, 26, 26, 26, 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 26, 25, + 24, 24, 23, 22, 22, 22, 24, 24, 24, 24, 23, 22, 22, 21, 24, 24, 24, 24, + 23, 22, 22, 21 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + /* Size 8x4 */ + 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, + 31, 30, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 
32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 30, 30, 30, 30, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, + /* Size 16x4 */ + 33, 33, 33, 32, 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 
32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, + 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 30, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, + 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, + 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, + 32, 32, 31, 30 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26, + /* Size 8x8 */ + 33, 33, 33, 33, 34, 33, 31, 31, 33, 33, 33, 33, 33, 32, 30, 30, 33, 33, + 33, 33, 33, 32, 30, 30, 33, 33, 33, 33, 33, 32, 29, 29, 34, 33, 33, 33, + 32, 32, 29, 29, 33, 32, 32, 32, 32, 31, 28, 28, 31, 30, 30, 29, 29, 28, + 26, 26, 31, 30, 30, 29, 29, 28, 26, 26, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, + 30, 29, 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, + 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, + 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 34, 34, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 30, 28, 28, 28, 28, 31, 31, 31, 30, 
30, 30, + 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, + 27, 26, 26, 26, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33, + 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 32, 32, 31, 30, + 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, + 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, 30, 29, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, + 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, + 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 29, + 29, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, + 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, + 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, + 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, + 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 28, 28, 28, + 28, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 31, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 
26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, + 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, + 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, + 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, + 26, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 30, 30, 33, 33, 33, 32, 32, 31, 28, 28, 33, 33, + 33, 32, 32, 31, 28, 28, 30, 29, 29, 28, 28, 28, 26, 26, + /* Size 8x4 */ + 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, + 32, 28, 33, 31, 31, 28, 30, 28, 28, 26, 30, 28, 28, 26, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 31, 29, 28, 28, 28, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 27, 27, 27, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 25, + 25, 25, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, + 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, + 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, + 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, 29, 27, + 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 33, 32, + 31, 31, 31, 31, 28, 26, 31, 30, 30, 29, 29, 29, 28, 26, 31, 30, 29, 28, + 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, + 27, 25, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, + 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, + 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, + 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, + 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, 29, 29, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 24, 28, 28, 28, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, + 24, 24, 24, 24, 24, 24, 24, 23, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 28, 28, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 31, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, + 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, + 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 31, 30, 28, 28, 26, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 28, 28, 26, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 31, 30, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, + 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, + 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 33, 33, 33, 32, 32, 31, + 31, 31, 31, 31, 31, 30, 29, 28, 27, 26, 33, 32, 32, 31, 31, 31, 31, 31, + 31, 31, 31, 29, 28, 28, 26, 25, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 29, 28, 27, 26, 25, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, + 28, 26, 26, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, + 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, + 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, + 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, + 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28, + 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 30, 30, 29, 29, 28, 28, 28, 28, + 28, 28, 28, 27, 26, 26, 24, 23, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 30, 30, 29, 29, 29, 29, + 28, 28, 28, 28, 28, 28, 26, 26, 26, 26, + /* Size 16x4 */ + 33, 33, 33, 30, 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 33, + 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28, + 33, 32, 32, 28, 33, 32, 32, 28, 32, 31, 31, 28, 31, 29, 29, 26, 30, 28, + 28, 26, 30, 28, 28, 
26, 30, 28, 28, 26, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, + 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, + 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 25, 25, 25, + 25, 25, 25, 24, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 29, 33, 33, + 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, + 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, + 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, + 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, + 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 33, 33, 33, 32, + 32, 32, 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, + 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, + 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, + 32, 32, 32, 32, 29, 27, 33, 33, 32, 31, 31, 31, 29, 27, 33, 32, 31, 31, + 31, 31, 28, 26, 32, 31, 30, 30, 30, 30, 28, 26, 31, 30, 30, 29, 29, 29, + 28, 26, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, + 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, + 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 30, 29, 28, 28, + 28, 28, 26, 24 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 
32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 
32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 
33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 34, 33, 32, 32, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, + 32, 32, 33, 33, 32, 32, 34, 33, 32, 32, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, + 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, + 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, + 32, 32, 32, 32 }, + }, +}; \ No newline at end of file diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h new file mode 100644 index 0000000000..8f36eb105b --- /dev/null +++ b/third_party/aom/av1/common/quant_common.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
+
+#include <stdbool.h>
+#include "aom/aom_codec.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+// Total number of QM sets stored
+#define QM_LEVEL_BITS 4
+#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
+/* Range of QMs is between first and last value, with offset applied to inter
+ * blocks */
+#define DEFAULT_QM_Y 10
+#define DEFAULT_QM_U 11
+#define DEFAULT_QM_V 12
+#define DEFAULT_QM_FIRST 5
+#define DEFAULT_QM_LAST 9
+#define LOSSLESS_Q_STEP 4  // this should equal dc/ac_qlookup_QTX[0]
+
+struct AV1Common;
+struct CommonQuantParams;
+struct macroblockd;
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex);
+
+// Returns true if we are using quantization matrix.
+bool av1_use_qmatrix(const struct CommonQuantParams *quant_params,
+                     const struct macroblockd *xd, int segment_id);
+
+// Reduce the large number of quantizers to a smaller number of levels for which
+// different matrices may be defined.
+static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
+  return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
+}
+
+// Initialize all global quant/dequant matrices.
+void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes);
+
+// Get global dequant matrix.
+const qm_val_t *av1_iqmatrix(const struct CommonQuantParams *quant_params,
+                             int qmlevel, int plane, TX_SIZE tx_size);
+// Get global quant matrix.
+const qm_val_t *av1_qmatrix(const struct CommonQuantParams *quant_params,
+                            int qmlevel, int plane, TX_SIZE tx_size);
+
+// Get either local / global dequant matrix as appropriate.
+const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params,
+                                 const struct macroblockd *xd, int plane,
+                                 TX_SIZE tx_size, TX_TYPE tx_type);
+// Get either local / global quant matrix as appropriate.
+const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params,
+                                const struct macroblockd *xd, int plane,
+                                TX_SIZE tx_size, TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_QUANT_COMMON_H_
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
new file mode 100644
index 0000000000..602fab7237
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter.c
@@ -0,0 +1,1169 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+// This function will determine whether or not to create a warped
+// prediction.
+static int allow_warp(const MB_MODE_INFO *const mbmi,
+                      const WarpTypesAllowed *const warp_types,
+                      const WarpedMotionParams *const gm_params,
+                      int build_for_obmc, const struct scale_factors *const sf,
+                      WarpedMotionParams *final_warp_params) {
+  // Note: As per the spec, we must test the fixed point scales here, which are
+  // at a higher precision (1 << 14) than the xs and ys in subpel_params (that
+  // have 1 << 10 precision).
+  if (av1_is_scaled(sf)) return 0;
+
+  if (final_warp_params != NULL) *final_warp_params = default_warp_params;
+
+  if (build_for_obmc) return 0;
+
+  if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
+    if (final_warp_params != NULL)
+      memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params));
+    return 1;
+  } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
+    if (final_warp_params != NULL)
+      memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
+    return 1;
+  }
+
+  return 0;
+}
+
+void av1_init_warp_params(InterPredParams *inter_pred_params,
+                          const WarpTypesAllowed *warp_types, int ref,
+                          const MACROBLOCKD *xd, const MB_MODE_INFO *mi) {
+  if (inter_pred_params->block_height < 8 || inter_pred_params->block_width < 8)
+    return;
+
+  if (xd->cur_frame_force_integer_mv) return;
+
+  if (allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
+                 inter_pred_params->scale_factors,
+                 &inter_pred_params->warp_params)) {
+    inter_pred_params->mode = WARP_PRED;
+  }
+}
+
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride,
+                              InterPredParams *inter_pred_params,
+                              const SubpelParams *subpel_params) {
+  assert(IMPLIES(inter_pred_params->conv_params.is_compound,
+                 inter_pred_params->conv_params.dst != NULL));
+
+  if (inter_pred_params->mode == TRANSLATION_PRED) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (inter_pred_params->use_hbd_buf) {
+      highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                             inter_pred_params->block_width,
+                             inter_pred_params->block_height,
+                             &inter_pred_params->conv_params,
+                             inter_pred_params->interp_filter_params,
+                             inter_pred_params->bit_depth);
+    } else {
+      inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                      inter_pred_params->block_width,
+                      inter_pred_params->block_height,
+                      &inter_pred_params->conv_params,
+                      inter_pred_params->interp_filter_params);
+    }
+#else
+    inter_predictor(src, src_stride, dst, dst_stride, subpel_params,
+                    inter_pred_params->block_width,
+                    inter_pred_params->block_height,
+                    &inter_pred_params->conv_params,
+                    inter_pred_params->interp_filter_params);
+#endif
+  }
+  // TODO(jingning): av1_warp_plane() can be further cleaned up.
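+  // The WARP_PRED branch below applies the affine model selected in
+  // av1_init_warp_params() (the block's local warp parameters or the frame's
+  // global motion) to the reference plane via av1_warp_plane(), instead of
+  // the translational subpel filtering used above.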
+ else if (inter_pred_params->mode == WARP_PRED) { + av1_warp_plane( + &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf, + inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0, + inter_pred_params->ref_frame_buf.width, + inter_pred_params->ref_frame_buf.height, + inter_pred_params->ref_frame_buf.stride, dst, + inter_pred_params->pix_col, inter_pred_params->pix_row, + inter_pred_params->block_width, inter_pred_params->block_height, + dst_stride, inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y, &inter_pred_params->conv_params); + } else { + assert(0 && "Unsupported inter_pred_params->mode"); + } +} + +static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18, + 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; +static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27, + 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; +static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21, + 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; + +static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift, + int width) { + if (shift >= 0) { + memcpy(dst + shift, src, width - shift); + memset(dst, src[0], shift); + } else { + shift = -shift; + memcpy(dst, src + shift, width - shift); + memset(dst + width - shift, src[width - 1], shift); + } +} + +/* clang-format off */ +DECLARE_ALIGNED(16, static uint8_t, + wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, }, + { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used +}; +/* clang-format on */ + +// [negative][direction] +DECLARE_ALIGNED( + 16, static uint8_t, + wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * 
MASK_MASTER_SIZE]); + +// 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound +// on the sum of all mask sizes up to an including MAX_WEDGE_SQUARE. +DECLARE_ALIGNED(16, static uint8_t, + wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]); + +DECLARE_ALIGNED(16, static uint8_t, + smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL] + [MAX_WEDGE_SQUARE]); + +static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2]; + +static const wedge_code_type wedge_codebook_16_hgtw[16] = { + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, + { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 }, + { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +static const wedge_code_type wedge_codebook_16_hltw[16] = { + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, + { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 }, + { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +static const wedge_code_type wedge_codebook_16_heqw[16] = { + { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 }, + { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 }, + { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 }, + { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 }, + { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 }, + { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 }, + { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 }, + { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = { + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], + wedge_masks[BLOCK_8X8] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], + wedge_masks[BLOCK_8X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], + wedge_masks[BLOCK_16X8] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], + wedge_masks[BLOCK_16X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], + wedge_masks[BLOCK_16X32] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], + wedge_masks[BLOCK_32X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], + wedge_masks[BLOCK_32X32] }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], + wedge_masks[BLOCK_8X32] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], + wedge_masks[BLOCK_32X8] }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, +}; + +static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, + BLOCK_SIZE sb_type) { + const uint8_t *master; 
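+  // The mask is read in place from the oversized master mask rather than
+  // stored per block size: x_offset/y_offset from the codebook are in eighths
+  // of the block width/height, and the returned pointer is positioned so the
+  // bw x bh window sits on the master's centre shifted by those offsets.
+  // neg ^ signflip selects either the master mask or its complement.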
+ const int bh = block_size_high[sb_type]; + const int bw = block_size_wide[sb_type]; + const wedge_code_type *a = + av1_wedge_params_lookup[sb_type].codebook + wedge_index; + int woff, hoff; + const uint8_t wsignflip = + av1_wedge_params_lookup[sb_type].signflip[wedge_index]; + + assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type)); + woff = (a->x_offset * bw) >> 3; + hoff = (a->y_offset * bh) >> 3; + master = wedge_mask_obl[neg ^ wsignflip][a->direction] + + MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) + + MASK_MASTER_SIZE / 2 - woff; + return master; +} + +const uint8_t *av1_get_compound_type_mask( + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) { + (void)sb_type; + switch (comp_data->type) { + case COMPOUND_WEDGE: + return av1_get_contiguous_soft_mask(comp_data->wedge_index, + comp_data->wedge_sign, sb_type); + default: return comp_data->seg_mask; + } +} + +static AOM_INLINE void diffwtd_mask_d16( + uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + int i, j, m, diff; + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]); + diff = ROUND_POWER_OF_TWO(diff, round); + m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); + mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m; + } + } +} + +void av1_build_compound_diffwtd_mask_d16_c( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); + break; + case DIFFWTD_38_INV: + diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); + break; + default: assert(0); + } +} + +static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse, + int mask_base, const uint8_t *src0, + int src0_stride, const uint8_t *src1, + int src1_stride, int h, int w) { + int i, j, m, diff; + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + diff = + abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]); + m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); + mask[i * w + j] = which_inverse ? 
AOM_BLEND_A64_MAX_ALPHA - m : m; + } + } +} + +void av1_build_compound_diffwtd_mask_c(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w); + break; + case DIFFWTD_38_INV: + diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w); + break; + default: assert(0); + } +} + +static AOM_FORCE_INLINE void diffwtd_mask_highbd( + uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0, + int src0_stride, const uint16_t *src1, int src1_stride, int h, int w, + const unsigned int bd) { + assert(bd >= 8); + if (bd == 8) { + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } + } else { + const unsigned int bd_shift = bd - 8; + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } + } +} + +void av1_build_compound_diffwtd_mask_highbd_c( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, + CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); + break; + case DIFFWTD_38_INV: + diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, + CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); + break; + default: assert(0); + } +} + +static AOM_INLINE void init_wedge_master_masks(void) { + int i, j; + const int w = MASK_MASTER_SIZE; + const int h = MASK_MASTER_SIZE; + const int stride = MASK_MASTER_STRIDE; + // Note: index [0] stores the masters, and [1] its complement. 
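+  // The 63-degree master below is built from the 1-D profiles above: the
+  // even/odd profiles are offset by roughly half a sample and the copy shift
+  // drops by one every two rows, so the mask edge advances at a 2:1 slope.
+  // The second loop then derives the remaining directions (and the
+  // complements in index [1]) from this single master by transposition and
+  // reflection.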
+ // Generate prototype by shifting the masters + int shift = h / 4; + for (i = 0; i < h; i += 2) { + shift_copy(wedge_master_oblique_even, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift, + MASK_MASTER_SIZE); + shift--; + shift_copy(wedge_master_oblique_odd, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift, + MASK_MASTER_SIZE); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + } + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j]; + wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk; + wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] = + wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk; + const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j]; + wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx; + wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] = + wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - mskx; + } + } +} + +static AOM_INLINE void init_wedge_masks(void) { + uint8_t *dst = wedge_mask_buf; + BLOCK_SIZE bsize; + memset(wedge_masks, 0, sizeof(wedge_masks)); + for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) { + const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize]; + const int wtypes = wedge_params->wedge_types; + if (wtypes == 0) continue; + const uint8_t *mask; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int w; + for (w = 0; w < wtypes; ++w) { + mask = get_wedge_mask_inplace(w, 0, bsize); + aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw, + bh); + wedge_params->masks[0][w] = dst; + dst += bw * bh; + + mask = get_wedge_mask_inplace(w, 1, bsize); + aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw /* dst_stride */, bw, + bh); + wedge_params->masks[1][w] = dst; + dst += bw * bh; + } + assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf)); + } +} + +/* clang-format off */ +static const uint8_t ii_weights1d[MAX_SB_SIZE] = { + 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, + 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; +static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = { + 32, 16, 16, 16, 8, 8, 8, 4, + 4, 4, 2, 2, 2, 1, 1, 1, + 8, 8, 4, 4, 2, 2 +}; +/* clang-format on */ + +static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride, + BLOCK_SIZE plane_bsize, + INTERINTRA_MODE mode) { + int i, j; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int size_scale = ii_size_scales[plane_bsize]; + + switch (mode) { + case II_V_PRED: + for (i = 0; i < 
bh; ++i) { + memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); + mask += stride; + } + break; + + case II_H_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale]; + mask += stride; + } + break; + + case II_SMOOTH_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) + mask[j] = ii_weights1d[(i < j ? i : j) * size_scale]; + mask += stride; + } + break; + + case II_DC_PRED: + default: + for (i = 0; i < bh; ++i) { + memset(mask, 32, bw * sizeof(mask[0])); + mask += stride; + } + break; + } +} + +static AOM_INLINE void init_smooth_interintra_masks(void) { + for (int m = 0; m < INTERINTRA_MODES; ++m) { + for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) { + const int bw = block_size_wide[bs]; + const int bh = block_size_high[bs]; + if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue; + build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs, + m); + } + } +} + +// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0 +static void init_all_wedge_masks(void) { + init_wedge_master_masks(); + init_wedge_masks(); + init_smooth_interintra_masks(); +} + +void av1_init_wedge_masks(void) { aom_once(init_all_wedge_masks); } + +static AOM_INLINE void build_masked_compound_no_round( + uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, InterPredParams *inter_pred_params) { + const int ssy = inter_pred_params->subsampling_y; + const int ssx = inter_pred_params->subsampling_x; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + const int mask_stride = block_size_wide[sb_type]; +#if CONFIG_AV1_HIGHBITDEPTH + if (inter_pred_params->use_hbd_buf) { + aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, + ssy, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } else { + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); + } +#else + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); +#endif +} + +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp; + BLOCK_SIZE sb_type = inter_pred_params->sb_type; + + // We're going to call av1_make_inter_predictor to generate a prediction into + // a temporary buffer, then will blend that temporary buffer with that from + // the other reference. + DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]); + uint8_t *tmp_dst = + inter_pred_params->use_hbd_buf ? 
CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf; + + const int tmp_buf_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst; + int org_dst_stride = inter_pred_params->conv_params.dst_stride; + CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf; + inter_pred_params->conv_params.dst = tmp_buf16; + inter_pred_params->conv_params.dst_stride = tmp_buf_stride; + assert(inter_pred_params->conv_params.do_average == 0); + + // This will generate a prediction in tmp_buf for the second reference + av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, + inter_pred_params, subpel_params); + + if (!inter_pred_params->conv_params.plane && + comp_data->type == COMPOUND_DIFFWTD) { + av1_build_compound_diffwtd_mask_d16( + comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride, + tmp_buf16, tmp_buf_stride, inter_pred_params->block_height, + inter_pred_params->block_width, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } + build_masked_compound_no_round( + dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride, + comp_data, sb_type, inter_pred_params->block_height, + inter_pred_params->block_width, inter_pred_params); +} + +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int *fwd_offset, + int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound) { + assert(fwd_offset != NULL && bck_offset != NULL); + if (!is_compound || mbmi->compound_idx) { + *fwd_offset = 8; + *bck_offset = 8; + *use_dist_wtd_comp_avg = 0; + return; + } + + *use_dist_wtd_comp_avg = 1; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); + const int cur_frame_index = cm->cur_frame->order_hint; + int bck_frame_index = 0, fwd_frame_index = 0; + + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; + + int d0 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, + fwd_frame_index, cur_frame_index)), + 0, MAX_FRAME_DISTANCE); + int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, + cur_frame_index, bck_frame_index)), + 0, MAX_FRAME_DISTANCE); + + const int order = d0 <= d1; + + if (d0 == 0 || d1 == 0) { + *fwd_offset = quant_dist_lookup_table[3][order]; + *bck_offset = quant_dist_lookup_table[3][1 - order]; + return; + } + + int i; + for (i = 0; i < 3; ++i) { + int c0 = quant_dist_weight[i][order]; + int c1 = quant_dist_weight[i][!order]; + int d0_c0 = d0 * c0; + int d1_c1 = d1 * c1; + if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break; + } + + *fwd_offset = quant_dist_lookup_table[i][order]; + *bck_offset = quant_dist_lookup_table[i][1 - order]; +} + +void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end) { + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
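+  // Here the upper bound is plane_end, clamped the same way. Plane 0 is Y and
+  // planes 1/2 are U/V, which share crop dimensions and stride, hence the
+  // is_uv index below.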
+ for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) { + struct macroblockd_plane *const pd = &planes[i]; + const int is_uv = i > 0; + setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv], + src->crop_heights[is_uv], src->strides[is_uv], mi_row, + mi_col, NULL, pd->subsampling_x, pd->subsampling_y); + } +} + +void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf, + const int num_planes) { + if (src != NULL) { + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const int is_uv = i > 0; + setup_pred_plane(&pd->pre[idx], xd->mi[0]->bsize, src->buffers[i], + src->crop_widths[is_uv], src->crop_heights[is_uv], + src->strides[is_uv], mi_row, mi_col, sf, + pd->subsampling_x, pd->subsampling_y); + } + } +} + +// obmc_mask_N[overlap_position] +static const uint8_t obmc_mask_1[1] = { 64 }; +DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 }; + +DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 }; + +static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 }; + +static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54, + 56, 58, 60, 61, 64, 64, 64, 64 }; + +static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44, + 45, 47, 48, 50, 51, 52, 53, 55, + 56, 57, 58, 59, 60, 60, 61, 62, + 64, 64, 64, 64, 64, 64, 64, 64 }; + +static const uint8_t obmc_mask_64[64] = { + 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44, + 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56, + 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62, + 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; + +const uint8_t *av1_get_obmc_mask(int length) { + switch (length) { + case 1: return obmc_mask_1; + case 2: return obmc_mask_2; + case 4: return obmc_mask_4; + case 8: return obmc_mask_8; + case 16: return obmc_mask_16; + case 32: return obmc_mask_32; + case 64: return obmc_mask_64; + default: assert(0); return NULL; + } +} + +static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *mi, void *fun_ctxt, + const int num_planes) { + (void)xd; + (void)rel_mi_row; + (void)rel_mi_col; + (void)op_mi_size; + (void)dir; + (void)mi; + ++*(uint8_t *)fun_ctxt; + (void)num_planes; +} + +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + + mbmi->overlappable_neighbors = 0; + + if (!is_motion_variation_allowed_bsize(mbmi->bsize)) return; + + foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr, + &mbmi->overlappable_neighbors); + if (mbmi->overlappable_neighbors) return; + foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr, + &mbmi->overlappable_neighbors); +} + +// HW does not support < 4x4 prediction. To limit the bandwidth requirement, if +// block-size of current plane is smaller than 8x8, always only blend with the +// left neighbor(s) (skip blending with the above side). 
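+// In av1_skip_u4x4_pred_in_obmc() below, dir == 0 is the above-overlap pass
+// and dir == 1 the left-overlap pass, so with DISABLE_CHROMA_U8X8_OBMC at 0
+// the sub-8x8 chroma plane sizes only skip the dir == 0 (above) blend.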
+#define DISABLE_CHROMA_U8X8_OBMC 0 // 0: one-sided obmc; 1: disable + +int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir) { + assert(is_motion_variation_allowed_bsize(bsize)); + + const BLOCK_SIZE bsize_plane = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + switch (bsize_plane) { +#if DISABLE_CHROMA_U8X8_OBMC + case BLOCK_4X4: + case BLOCK_8X4: + case BLOCK_4X8: return 1; +#else + case BLOCK_4X4: + case BLOCK_8X4: + case BLOCK_4X8: return dir == 0; +#endif + default: return 0; + } +} + +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) { + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; +} + +struct obmc_inter_pred_ctxt { + uint8_t **adjacent; + int *adjacent_stride; +}; + +static INLINE void build_obmc_inter_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) { + (void)above_mi; + (void)rel_mi_row; + (void)dir; + struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + const int overlap = + AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; + + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + const int bh = overlap >> pd->subsampling_y; + const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + + const int dst_stride = pd->dst.stride; + uint8_t *const dst = &pd->dst.buf[plane_col]; + const int tmp_stride = ctxt->adjacent_stride[plane]; + const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col]; + const uint8_t *const mask = av1_get_obmc_mask(bh); +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) + aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, + tmp_stride, mask, bw, bh, xd->bd); + else + aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, + mask, bw, bh); +#else + aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif + } +} + +static INLINE void build_obmc_inter_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) { + (void)left_mi; + (void)rel_mi_col; + (void)dir; + struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + const int overlap = + AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; + + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = overlap >> pd->subsampling_x; + const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y; + const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + + const int dst_stride = pd->dst.stride; + uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride]; + const int tmp_stride = ctxt->adjacent_stride[plane]; + const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride]; + const uint8_t *const mask = av1_get_obmc_mask(bw); + +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) + aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, + tmp_stride, mask, bw, bh, 
xd->bd); + else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, + mask, bw, bh); +#else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif + } +} + +// This function combines motion compensated predictions that are generated by +// top/left neighboring blocks' inter predictors with the regular inter +// prediction. We assume the original prediction (bmc) is stored in +// xd->plane[].dst.buf +void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *above[MAX_MB_PLANE], + int above_stride[MAX_MB_PLANE], + uint8_t *left[MAX_MB_PLANE], + int left_stride[MAX_MB_PLANE]) { + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + + // handle above row + struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride }; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_obmc_inter_pred_above, &ctxt_above); + + // handle left column + struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride }; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_obmc_inter_pred_left, &ctxt_left); +} + +void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, + uint8_t **dst_buf2) { + if (is_cur_buf_hbd(xd)) { + int len = sizeof(uint16_t); + dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); + dst_buf1[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); + dst_buf1[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); + dst_buf2[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); + dst_buf2[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); + } else { + dst_buf1[0] = xd->tmp_obmc_bufs[0]; + dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; + dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; + dst_buf2[0] = xd->tmp_obmc_bufs[1]; + dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; + dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; + } +} + +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->bsize); + const int above_mi_col = xd->mi_col + rel_mi_col; + + av1_modify_neighbor_predictor_for_obmc(above_mbmi); + + for (int j = 0; j < num_planes; ++j) { + struct macroblockd_plane *const pd = &xd->plane[j]; + setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], + ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const int num_refs = 1 + has_second_ref(above_mbmi); + + for (int ref = 0; ref < num_refs; ++ref) { + const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + xd->block_ref_scale_factors[ref] = sf; + if ((!av1_is_valid_scale(sf))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf, + num_planes); + } + + xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); + xd->mb_to_right_edge = + ctxt->mb_to_far_edge + + (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8; +} + +void 
av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->bsize); + const int left_mi_row = xd->mi_row + rel_mi_row; + + av1_modify_neighbor_predictor_for_obmc(left_mbmi); + + for (int j = 0; j < num_planes; ++j) { + struct macroblockd_plane *const pd = &xd->plane[j]; + setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], + ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const int num_refs = 1 + has_second_ref(left_mbmi); + + for (int ref = 0; ref < num_refs; ++ref) { + const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const ref_scale_factors = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[ref] = ref_scale_factors; + if ((!av1_is_valid_scale(ref_scale_factors))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col, + ref_scale_factors, num_planes); + } + + xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row)); + xd->mb_to_bottom_edge = + ctxt->mb_to_far_edge + + GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE); +} + +static AOM_INLINE void combine_interintra( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred, int compstride, const uint8_t *interpred, + int interstride, const uint8_t *intrapred, int intrastride) { + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + if (use_wedge_interintra) { + if (av1_is_wedge_used(bsize)) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + const int subw = 2 * mi_size_wide[bsize] == bw; + const int subh = 2 * mi_size_high[bsize] == bh; + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, + interpred, interstride, mask, block_size_wide[bsize], + bw, bh, subw, subh); + } + return; + } + + const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize]; + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred, + interstride, mask, bw, bw, bh, 0, 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void combine_interintra_highbd( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred8, int compstride, const uint8_t *interpred8, + int interstride, const uint8_t *intrapred8, int intrastride, int bd) { + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + if (use_wedge_interintra) { + if (av1_is_wedge_used(bsize)) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + const int subh = 2 * mi_size_high[bsize] == bh; + const int subw = 2 * mi_size_wide[bsize] == bw; + aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride, + interpred8, interstride, mask, + block_size_wide[bsize], bw, bh, subw, subh, bd); + } + return; + } + + uint8_t mask[MAX_SB_SQUARE]; + build_smooth_interintra_mask(mask, bw, plane_bsize, mode); + aom_highbd_blend_a64_mask(comppred8, compstride, 
intrapred8, intrastride, + interpred8, interstride, mask, bw, bw, bh, 0, 0, + bd); +} +#endif + +void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); + PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode]; + assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0); + assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0); + assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0); + assert(xd->mi[0]->use_intrabc == 0); + const SequenceHeader *seq_params = cm->seq_params; + + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, pd->width, + pd->height, max_txsize_rect_lookup[plane_bsize], mode, + 0, 0, FILTER_INTRA_MODES, ctx->plane[plane], + ctx->stride[plane], dst, dst_stride, 0, 0, plane); +} + +void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, + const uint8_t *inter_pred, int inter_stride, + const uint8_t *intra_pred, int intra_stride) { + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + combine_interintra_highbd( + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride, xd->bd); + return; + } +#endif + combine_interintra( + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride); +} + +// build interintra_predictors for one plane +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); + av1_build_intra_predictors_for_interintra( + cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), + MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, + CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); + } else { + DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]); + av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx, + intrapredictor, MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor, + MAX_SB_SIZE); + } +} diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h new file mode 100644 index 0000000000..c31f4531e2 --- /dev/null +++ b/third_party/aom/av1/common/reconinter.h @@ -0,0 +1,489 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_RECONINTER_H_ +#define AOM_AV1_COMMON_RECONINTER_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/warped_motion.h" +#include "aom/aom_integer.h" + +// Work out how many pixels off the edge of a reference frame we're allowed +// to go when forming an inter prediction. +// The outermost row/col of each referernce frame is extended by +// (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep +// at least AOM_INTERP_EXTEND pixels within that to account for filtering. +// +// We have to break this up into two macros to keep both clang-format and +// tools/lint-hunks.py happy. +#define AOM_LEFT_TOP_MARGIN_PX(subsampling) \ + ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND) +#define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \ + (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS) + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_WEDGE_TYPES 16 + +#define MAX_WEDGE_SIZE_LOG2 5 // 32x32 +#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2) +#define MAX_WEDGE_SQUARE (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE) + +#define WEDGE_WEIGHT_BITS 6 + +#define WEDGE_NONE -1 + +// Angles are with respect to horizontal anti-clockwise +enum { + WEDGE_HORIZONTAL = 0, + WEDGE_VERTICAL = 1, + WEDGE_OBLIQUE27 = 2, + WEDGE_OBLIQUE63 = 3, + WEDGE_OBLIQUE117 = 4, + WEDGE_OBLIQUE153 = 5, + WEDGE_DIRECTIONS +} UENUM1BYTE(WedgeDirectionType); + +// 3-tuple: {direction, x_offset, y_offset} +typedef struct { + WedgeDirectionType direction; + int x_offset; + int y_offset; +} wedge_code_type; + +typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES]; + +typedef struct { + int wedge_types; + const wedge_code_type *codebook; + uint8_t *signflip; + wedge_masks_type *masks; +} wedge_params_type; + +extern const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL]; + +typedef struct SubpelParams { + int xs; + int ys; + int subpel_x; + int subpel_y; + int pos_x; + int pos_y; +} SubpelParams; + +struct build_prediction_ctxt { + const AV1_COMMON *cm; + uint8_t **tmp_buf; + int *tmp_width; + int *tmp_height; + int *tmp_stride; + int mb_to_far_edge; + void *dcb; // Decoder-only coding block. +}; + +typedef enum InterPredMode { + TRANSLATION_PRED, + WARP_PRED, +} InterPredMode; + +typedef enum InterCompMode { + UNIFORM_SINGLE, + UNIFORM_COMP, + MASK_COMP, +} InterCompMode; + +typedef struct InterPredParams { + InterPredMode mode; + InterCompMode comp_mode; + WarpedMotionParams warp_params; + ConvolveParams conv_params; + const InterpFilterParams *interp_filter_params[2]; + int block_width; + int block_height; + int pix_row; + int pix_col; + struct buf_2d ref_frame_buf; + int subsampling_x; + int subsampling_y; + const struct scale_factors *scale_factors; + int bit_depth; + int use_hbd_buf; + INTERINTER_COMPOUND_DATA mask_comp; + BLOCK_SIZE sb_type; + int is_intrabc; + int top; + int left; +} InterPredParams; + +// Initialize sub-pel params required for inter prediction. 
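+// The MV arrives at 1/8-pel precision; multiplying by (1 << (1 - ss))
+// re-expresses it in 1/16th units of the (possibly subsampled) plane, and the
+// pixel position is shifted up by SUBPEL_BITS to match. After the optional
+// scaling step, the low SCALE_SUBPEL_BITS bits of pos_x/pos_y are the sub-pel
+// phase (subpel_x/subpel_y) and the remaining bits the integer pixel offset,
+// while xs/ys hold the per-pixel step (SCALE_SUBPEL_SHIFTS when the reference
+// is unscaled); revert_scale_extra_bits() later drops the extra
+// SCALE_EXTRA_BITS of precision for the unscaled convolve path.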
+static AOM_INLINE void init_subpel_params( + const MV *const src_mv, InterPredParams *const inter_pred_params, + SubpelParams *subpel_params, int width, int height) { + const struct scale_factors *sf = inter_pred_params->scale_factors; + int ssx = inter_pred_params->subsampling_x; + int ssy = inter_pred_params->subsampling_y; + int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS; + orig_pos_y += src_mv->row * (1 << (1 - ssy)); + int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS; + orig_pos_x += src_mv->col * (1 << (1 - ssx)); + const int is_scaled = av1_is_scaled(sf); + int pos_x, pos_y; + if (LIKELY(!is_scaled)) { + pos_y = av1_unscaled_value(orig_pos_y, sf); + pos_x = av1_unscaled_value(orig_pos_x, sf); + } else { + pos_y = av1_scaled_y(orig_pos_y, sf); + pos_x = av1_scaled_x(orig_pos_x, sf); + } + + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int bottom = (height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; + const int right = (width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, inter_pred_params->top, bottom); + pos_x = clamp(pos_x, inter_pred_params->left, right); + + subpel_params->pos_x = pos_x; + subpel_params->pos_y = pos_y; + subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; + subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; + subpel_params->xs = sf->x_step_q4; + subpel_params->ys = sf->y_step_q4; +} + +// Initialize interp filter required for inter prediction. +static AOM_INLINE void init_interp_filter_params( + const InterpFilterParams *interp_filter_params[2], + const InterpFilters *filter, int block_width, int block_height, + int is_intrabc) { + if (UNLIKELY(is_intrabc)) { + interp_filter_params[0] = &av1_intrabc_filter_params; + interp_filter_params[1] = &av1_intrabc_filter_params; + } else { + interp_filter_params[0] = av1_get_interp_filter_params_with_block_size( + (InterpFilter)filter->x_filter, block_width); + interp_filter_params[1] = av1_get_interp_filter_params_with_block_size( + (InterpFilter)filter->y_filter, block_height); + } +} + +// Initialize parameters required for inter prediction at mode level. +static AOM_INLINE void init_inter_mode_params( + const MV *const src_mv, InterPredParams *const inter_pred_params, + SubpelParams *subpel_params, const struct scale_factors *sf, int width, + int height) { + inter_pred_params->scale_factors = sf; + init_subpel_params(src_mv, inter_pred_params, subpel_params, width, height); +} + +// Initialize parameters required for inter prediction at block level. +static AOM_INLINE void init_inter_block_params( + InterPredParams *inter_pred_params, int block_width, int block_height, + int pix_row, int pix_col, int subsampling_x, int subsampling_y, + int bit_depth, int use_hbd_buf, int is_intrabc) { + inter_pred_params->block_width = block_width; + inter_pred_params->block_height = block_height; + inter_pred_params->pix_row = pix_row; + inter_pred_params->pix_col = pix_col; + inter_pred_params->subsampling_x = subsampling_x; + inter_pred_params->subsampling_y = subsampling_y; + inter_pred_params->bit_depth = bit_depth; + inter_pred_params->use_hbd_buf = use_hbd_buf; + inter_pred_params->is_intrabc = is_intrabc; + inter_pred_params->mode = TRANSLATION_PRED; + inter_pred_params->comp_mode = UNIFORM_SINGLE; + inter_pred_params->top = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_y); + inter_pred_params->left = -AOM_LEFT_TOP_MARGIN_SCALED(subsampling_x); +} + +// Initialize params required for inter prediction. 
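+// Rough usage sketch: callers fill an InterPredParams with
+// av1_init_inter_params(), optionally switch it to compound or warped
+// prediction via av1_init_comp_mode() / av1_init_warp_params(), derive the
+// sub-pel position with init_inter_mode_params() / init_subpel_params(), and
+// then hand everything to av1_make_inter_predictor() or
+// av1_make_masked_inter_predictor(); reconinter_template.inc below follows
+// this pattern for both the encoder and decoder drivers.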
+static AOM_INLINE void av1_init_inter_params( + InterPredParams *inter_pred_params, int block_width, int block_height, + int pix_row, int pix_col, int subsampling_x, int subsampling_y, + int bit_depth, int use_hbd_buf, int is_intrabc, + const struct scale_factors *sf, const struct buf_2d *ref_buf, + int_interpfilters interp_filters) { + init_inter_block_params(inter_pred_params, block_width, block_height, pix_row, + pix_col, subsampling_x, subsampling_y, bit_depth, + use_hbd_buf, is_intrabc); + init_interp_filter_params(inter_pred_params->interp_filter_params, + &interp_filters.as_filters, block_width, + block_height, is_intrabc); + inter_pred_params->scale_factors = sf; + inter_pred_params->ref_frame_buf = *ref_buf; +} + +static AOM_INLINE void av1_init_comp_mode(InterPredParams *inter_pred_params) { + inter_pred_params->comp_mode = UNIFORM_COMP; +} + +void av1_init_warp_params(InterPredParams *inter_pred_params, + const WarpTypesAllowed *warp_types, int ref, + const MACROBLOCKD *xd, const MB_MODE_INFO *mi); + +static INLINE int has_scale(int xs, int ys) { + return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; +} + +static INLINE void revert_scale_extra_bits(SubpelParams *sp) { + sp->subpel_x >>= SCALE_EXTRA_BITS; + sp->subpel_y >>= SCALE_EXTRA_BITS; + sp->xs >>= SCALE_EXTRA_BITS; + sp->ys >>= SCALE_EXTRA_BITS; + assert(sp->subpel_x < SUBPEL_SHIFTS); + assert(sp->subpel_y < SUBPEL_SHIFTS); + assert(sp->xs <= SUBPEL_SHIFTS); + assert(sp->ys <= SUBPEL_SHIFTS); +} + +static INLINE void inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, int w, int h, + ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2]) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + if (is_scaled) { + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, + sp.ys, 0, conv_params); + } +} + +static INLINE void highbd_inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, int w, int h, + ConvolveParams *conv_params, const InterpFilterParams *interp_filters[2], + int bd) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + if (is_scaled) { + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, bd); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, + sp.subpel_y, sp.ys, 0, conv_params, bd); + } +} + +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi); +int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir); + +static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, + BLOCK_SIZE sb_type) { + const int comp_allowed = is_comp_ref_allowed(sb_type); + switch (type) { + case COMPOUND_AVERAGE: + case 
COMPOUND_DISTWTD: + case COMPOUND_DIFFWTD: return comp_allowed; + case COMPOUND_WEDGE: + return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0; + default: assert(0); return 0; + } +} + +static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) { + COMPOUND_TYPE comp_type; + int i; + if (!is_comp_ref_allowed(sb_type)) return 0; + for (i = 0; i < COMPOUND_TYPES; i++) { + comp_type = (COMPOUND_TYPE)i; + if (is_masked_compound_type(comp_type) && + is_interinter_compound_used(comp_type, sb_type)) + return 1; + } + return 0; +} + +static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types; +} + +static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types > 0; +} + +void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +// TODO(jkoleszar): yet another mv clamping function :-( +static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, + const MV *src_mv, int bw, int bh, + int ss_x, int ss_y) { + // If the MV points so far into the UMV border that no visible pixels + // are used for reconstruction, the subpel part of the MV can be + // discarded and the MV limited to 16 pixels with equivalent results. + const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS; + const int spel_right = spel_left - SUBPEL_SHIFTS; + const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS; + const int spel_bottom = spel_top - SUBPEL_SHIFTS; + MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))), + (int16_t)(src_mv->col * (1 << (1 - ss_x))) }; + assert(ss_x <= 1); + assert(ss_y <= 1); + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, + xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, + xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, + xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom + }; + + clamp_mv(&clamped_mv, &mv_limits); + + return clamped_mv; +} + +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { + int x, y; + if (!sf) { + x = x_offset; + y = y_offset; + } else if (av1_is_scaled(sf)) { + x = av1_scaled_x(x_offset, sf) >> SCALE_EXTRA_BITS; + y = av1_scaled_y(y_offset, sf) >> SCALE_EXTRA_BITS; + } else { + x = av1_unscaled_value(x_offset, sf) >> SCALE_EXTRA_BITS; + y = av1_unscaled_value(y_offset, sf) >> SCALE_EXTRA_BITS; + } + return (int64_t)y * stride + x; +} + +static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, + uint8_t *src, int width, int height, + int stride, int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { + // Offset the buffer pointer + if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + mi_row -= 1; + if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + mi_col -= 1; + + const int x = (MI_SIZE * mi_col) >> subsampling_x; + const int y = (MI_SIZE * mi_row) >> subsampling_y; + dst->buf = src + scaled_buffer_offset(x, y, stride, scale); + dst->buf0 = src; + dst->width = width; + dst->height = height; + dst->stride = stride; +} + +void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const 
YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end); + +void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf, const int num_planes); + +static INLINE void set_default_interp_filters( + MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) { + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter)); +} + +static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + if (mbmi->skip_mode) return 0; + if (mbmi->motion_mode == WARPED_CAUSAL) return 0; + if (is_nontrans_global_motion(xd, xd->mi[0])) return 0; + return 1; +} + +// Sets up buffers 'dst_buf1' and 'dst_buf2' from relevant buffers in 'xd' for +// subsequent use in OBMC prediction. +void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, + uint8_t **dst_buf2); + +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes); +void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes); +void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *above[MAX_MB_PLANE], + int above_stride[MAX_MB_PLANE], + uint8_t *left[MAX_MB_PLANE], + int left_stride[MAX_MB_PLANE]); + +const uint8_t *av1_get_obmc_mask(int length); +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd); + +#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) +#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) + +void av1_init_wedge_masks(void); + +static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, + int8_t wedge_sign, + BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; +} + +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int *fwd_offset, + int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound); + +const uint8_t *av1_get_compound_type_mask( + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type); + +// build interintra_predictors for one plane +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize); + +void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride); + +void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, + const uint8_t *inter_pred, int inter_stride, + const uint8_t *intra_pred, int intra_stride); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_RECONINTER_H_ diff --git a/third_party/aom/av1/common/reconinter_template.inc b/third_party/aom/av1/common/reconinter_template.inc new file mode 100644 index 0000000000..863c13c112 --- /dev/null +++ b/third_party/aom/av1/common/reconinter_template.inc @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef IS_DEC +#error "IS_DEC must be defined for reconinter_template.inc." +#endif + +#if IS_DEC +static AOM_INLINE void build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *src_mv, + InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, uint8_t **mc_buf) { +#else +static AOM_INLINE void build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *src_mv, + InterPredParams *inter_pred_params) { +#endif // IS_DEC + SubpelParams subpel_params; + uint8_t *src; + int src_stride; +#if IS_DEC + dec_calc_subpel_params_and_extend(src_mv, inter_pred_params, xd, mi_x, mi_y, + ref, mc_buf, &src, &subpel_params, + &src_stride); +#else + enc_calc_subpel_params(src_mv, inter_pred_params, &src, &subpel_params, + &src_stride); +#endif // IS_DEC + if (inter_pred_params->comp_mode == UNIFORM_SINGLE || + inter_pred_params->comp_mode == UNIFORM_COMP) { + av1_make_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); + } else { + av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); + } +} + +// True if the following hold: +// 1. Not intrabc and not build_for_obmc +// 2. At least one dimension is size 4 with subsampling +// 3. If sub-sampled, none of the previous blocks around the sub-sample +// are intrabc or inter-blocks +static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize, + int is_intrabc, int build_for_obmc) { + if (is_intrabc || build_for_obmc) { + return false; + } + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int is_sub4_x = (block_size_wide[bsize] == 4) && ss_x; + const int is_sub4_y = (block_size_high[bsize] == 4) && ss_y; + if (!is_sub4_x && !is_sub4_y) { + return false; + } + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = is_sub4_y ? -1 : 0; + const int col_start = is_sub4_x ? 
-1 : 0; + + for (int row = row_start; row <= 0; ++row) { + for (int col = col_start; col <= 0; ++col) { + const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + if (!is_inter_block(this_mbmi)) return false; + if (is_intrabc_block(this_mbmi)) return false; + } + } + return true; +} + +#if IS_DEC +static AOM_INLINE void build_inter_predictors_sub8x8(const AV1_COMMON *cm, + MACROBLOCKD *xd, int plane, + const MB_MODE_INFO *mi, + int mi_x, int mi_y, + uint8_t **mc_buf) { +#else +static AOM_INLINE void build_inter_predictors_sub8x8(const AV1_COMMON *cm, + MACROBLOCKD *xd, int plane, + const MB_MODE_INFO *mi, + int mi_x, int mi_y) { +#endif // IS_DEC + const BLOCK_SIZE bsize = mi->bsize; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const bool ss_x = pd->subsampling_x; + const bool ss_y = pd->subsampling_y; + const int b4_w = block_size_wide[bsize] >> ss_x; + const int b4_h = block_size_high[bsize] >> ss_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + const int b8_w = block_size_wide[plane_bsize]; + const int b8_h = block_size_high[plane_bsize]; + const int is_compound = has_second_ref(mi); + assert(!is_compound); + assert(!is_intrabc_block(mi)); + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; + const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + int row = row_start; + for (int y = 0; y < b8_h; y += b4_h) { + int col = col_start; + for (int x = 0; x < b8_w; x += b4_w) { + MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; + int ref = 0; + const RefCntBuffer *ref_buf = + get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *const sf = ref_scale_factors; + const struct buf_2d pre_buf = { + NULL, + (plane == 1) ? 
ref_buf->buf.u_buffer : ref_buf->buf.v_buffer, + ref_buf->buf.uv_crop_width, + ref_buf->buf.uv_crop_height, + ref_buf->buf.uv_stride, + }; + + const MV mv = this_mbmi->mv[ref].as_mv; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y, + pre_x + x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf, + &pre_buf, this_mbmi->interp_filters); + inter_pred_params.conv_params = + get_conv_params_no_round(ref, plane, NULL, 0, is_compound, xd->bd); + +#if IS_DEC + build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, + xd, mi_x + x, mi_y + y, ref, mc_buf); +#else + build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params); +#endif // IS_DEC + + ++col; + } + ++row; + } +} + +#if IS_DEC +static AOM_INLINE void build_inter_predictors_8x8_and_bigger( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) { +#else +static AOM_INLINE void build_inter_predictors_8x8_and_bigger( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, int mi_y) { +#endif // IS_DEC + const int is_compound = has_second_ref(mi); + const int is_intrabc = is_intrabc_block(mi); + assert(IMPLIES(is_intrabc, !is_compound)); + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf; + + int is_global[2] = { 0, 0 }; + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(mi, wm->wmtype); + } + + const BLOCK_SIZE bsize = mi->bsize; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int row_start = + (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; + const int col_start = + (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref]; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + const MV mv = mi->mv[ref].as_mv; + const WarpTypesAllowed warp_types = { is_global[ref], + mi->motion_mode == WARPED_CAUSAL }; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf, + mi->interp_filters); + if (is_compound) av1_init_comp_mode(&inter_pred_params); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); + + av1_dist_wtd_comp_weight_assign( + cm, mi, &inter_pred_params.conv_params.fwd_offset, + &inter_pred_params.conv_params.bck_offset, + &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound); + + if (!build_for_obmc) + av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi); + + if (is_masked_compound_type(mi->interinter_comp.type)) { + inter_pred_params.sb_type = mi->bsize; + inter_pred_params.mask_comp = mi->interinter_comp; + if (ref == 1) { + inter_pred_params.conv_params.do_average = 0; + inter_pred_params.comp_mode = MASK_COMP; + } + // Assign physical buffer. 
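+      // xd->seg_mask is the scratch buffer in MACROBLOCKD that
+      // av1_make_masked_inter_predictor() fills via
+      // av1_build_compound_diffwtd_mask_d16() when the compound type is
+      // COMPOUND_DIFFWTD, before the two single-reference predictions are
+      // blended.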
+ inter_pred_params.mask_comp.seg_mask = xd->seg_mask; + } + +#if IS_DEC + build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, xd, + mi_x, mi_y, ref, mc_buf); +#else + build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params); +#endif // IS_DEC + } +} + +#if IS_DEC +static AOM_INLINE void build_inter_predictors( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, int mi_y, uint8_t **mc_buf) { + if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi), + build_for_obmc)) { + assert(bw < 8 || bh < 8); + build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y, mc_buf); + } else { + build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw, + bh, mi_x, mi_y, mc_buf); + } +} +#else +static AOM_INLINE void build_inter_predictors(const AV1_COMMON *cm, + MACROBLOCKD *xd, int plane, + const MB_MODE_INFO *mi, + int build_for_obmc, int bw, + int bh, int mi_x, int mi_y) { + if (is_sub8x8_inter(xd, plane, mi->bsize, is_intrabc_block(mi), + build_for_obmc)) { + assert(bw < 8 || bh < 8); + build_inter_predictors_sub8x8(cm, xd, plane, mi, mi_x, mi_y); + } else { + build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw, + bh, mi_x, mi_y); + } +} +#endif // IS_DEC diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c new file mode 100644 index 0000000000..20a1e12476 --- /dev/null +++ b/third_party/aom/av1/common/reconintra.c @@ -0,0 +1,1798 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_once.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/reconintra.h" + +enum { + NEED_LEFT = 1 << 1, + NEED_ABOVE = 1 << 2, + NEED_ABOVERIGHT = 1 << 3, + NEED_ABOVELEFT = 1 << 4, + NEED_BOTTOMLEFT = 1 << 5, +}; + +#define INTRA_EDGE_FILT 3 +#define INTRA_EDGE_TAPS 5 +#define MAX_UPSAMPLE_SZ 16 +#define NUM_INTRA_NEIGHBOUR_PIXELS (MAX_TX_SIZE * 2 + 32) + +static const uint8_t extend_modes[INTRA_MODES] = { + NEED_ABOVE | NEED_LEFT, // DC + NEED_ABOVE, // V + NEED_LEFT, // H + NEED_ABOVE | NEED_ABOVERIGHT, // D45 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D113 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D157 + NEED_LEFT | NEED_BOTTOMLEFT, // D203 + NEED_ABOVE | NEED_ABOVERIGHT, // D67 + NEED_LEFT | NEED_ABOVE, // SMOOTH + NEED_LEFT | NEED_ABOVE, // SMOOTH_V + NEED_LEFT | NEED_ABOVE, // SMOOTH_H + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // PAETH +}; + +// Tables to store if the top-right reference pixels are available. The flags +// are represented with bits, packed into 8-bit integers. 
E.g., for the 32x32 +// blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster +// order), so its flag is stored at the 3rd bit of the 2nd entry in the table, +// i.e. (table[10 / 8] >> (10 % 8)) & 1. +// . . . . +// . . . . +// . . o . +// . . . . +static uint8_t has_tr_4x4[128] = { + 255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, +}; +static uint8_t has_tr_4x8[64] = { + 255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119, + 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127, + 127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119, + 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127, + 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, +}; +static uint8_t has_tr_8x4[64] = { + 255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, +}; +static uint8_t has_tr_8x8[32] = { + 255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, + 255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, +}; +static uint8_t has_tr_8x16[16] = { + 255, 255, 119, 119, 127, 127, 119, 119, + 255, 127, 119, 119, 127, 127, 119, 119, +}; +static uint8_t has_tr_16x8[16] = { + 255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0, +}; +static uint8_t has_tr_16x16[8] = { + 255, 85, 119, 85, 127, 85, 119, 85, +}; +static uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 }; +static uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 }; +static uint8_t has_tr_32x32[2] = { 95, 87 }; +static uint8_t has_tr_32x64[1] = { 127 }; +static uint8_t has_tr_64x32[1] = { 19 }; +static uint8_t has_tr_64x64[1] = { 7 }; +static uint8_t has_tr_64x128[1] = { 3 }; +static uint8_t has_tr_128x64[1] = { 1 }; +static uint8_t has_tr_128x128[1] = { 1 }; +static uint8_t has_tr_4x16[32] = { + 255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255, + 127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127, + 127, 127, 255, 127, 255, 127, 127, 127, 127, 127, +}; +static uint8_t has_tr_16x4[32] = { + 255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, + 127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, +}; +static uint8_t has_tr_8x32[8] = { + 255, 255, 127, 127, 255, 127, 127, 127, +}; +static uint8_t has_tr_32x8[8] = { + 15, 0, 5, 0, 7, 0, 5, 0, +}; +static uint8_t has_tr_16x64[2] = { 255, 127 }; +static uint8_t has_tr_64x16[2] = { 3, 1 }; + +static const uint8_t *const has_tr_tables[BLOCK_SIZES_ALL] = { + // 4X4 + has_tr_4x4, + // 4X8, 8X4, 8X8 + has_tr_4x8, has_tr_8x4, has_tr_8x8, + // 8X16, 16X8, 16X16 + has_tr_8x16, has_tr_16x8, has_tr_16x16, + // 16X32, 32X16, 32X32 + has_tr_16x32, has_tr_32x16, has_tr_32x32, + // 32X64, 64X32, 64X64 + has_tr_32x64, has_tr_64x32, has_tr_64x64, + // 64x128, 128x64, 128x128 + has_tr_64x128, has_tr_128x64, has_tr_128x128, + // 4x16, 16x4, 8x32 + has_tr_4x16, has_tr_16x4, has_tr_8x32, + // 32x8, 
16x64, 64x16 + has_tr_32x8, has_tr_16x64, has_tr_64x16 +}; + +static uint8_t has_tr_vert_8x8[32] = { + 255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, + 255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, +}; +static uint8_t has_tr_vert_16x16[8] = { + 255, 0, 119, 0, 127, 0, 119, 0, +}; +static uint8_t has_tr_vert_32x32[2] = { 15, 7 }; +static uint8_t has_tr_vert_64x64[1] = { 3 }; + +// The _vert_* tables are like the ordinary tables above, but describe the +// order we visit square blocks when doing a PARTITION_VERT_A or +// PARTITION_VERT_B. This is the same order as normal except for on the last +// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_tr_4x8, NULL, has_tr_vert_8x8, + // 8X16, 16X8, 16X16 + has_tr_8x16, NULL, has_tr_vert_16x16, + // 16X32, 32X16, 32X32 + has_tr_16x32, NULL, has_tr_vert_32x32, + // 32X64, 64X32, 64X64 + has_tr_32x64, NULL, has_tr_vert_64x64, + // 64x128, 128x64, 128x128 + has_tr_64x128, NULL, has_tr_128x128 +}; + +static const uint8_t *get_has_tr_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. + if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_tr_vert_tables[bsize]; + } else { + ret = has_tr_tables[bsize]; + } + assert(ret); + return ret; +} + +static int has_top_right(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, + int mi_col, int top_available, int right_available, + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { + if (!top_available || !right_available) return 0; + + const int bw_unit = mi_size_wide[bsize]; + const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1); + const int top_right_count_unit = tx_size_wide_unit[txsz]; + + if (row_off > 0) { // Just need to check if enough pixels on the right. + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) { + // Special case: For 128x128 blocks, the transform unit whose + // top-right corner is at the center of the block does in fact have + // pixels available at its top-right corner. + if (row_off == mi_size_high[BLOCK_64X64] >> ss_y && + col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) { + return 1; + } + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + return col_off_64 + top_right_count_unit < plane_bw_unit_64; + } + return col_off + top_right_count_unit < plane_bw_unit; + } else { + // All top-right pixels are in the block above, which is already available. + if (col_off + top_right_count_unit < plane_bw_unit) return 1; + + const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; + const int bh_in_mi_log2 = mi_size_high_log2[bsize]; + const int sb_mi_size = mi_size_high[sb_size]; + const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; + const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; + + // Top row of superblock: so top-right pixels are in the top and/or + // top-right superblocks, both of which are already available. 
+ if (blk_row_in_sb == 0) return 1; + + // Rightmost column of superblock (and not the top row): so top-right pixels + // fall in the right superblock, which is not available yet. + if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) { + return 0; + } + + // General case (neither top row nor rightmost column): check if the + // top-right block is coded before the current block. + const int this_blk_index = + ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + + blk_col_in_sb + 0; + const int idx1 = this_blk_index / 8; + const int idx2 = this_blk_index % 8; + const uint8_t *has_tr_table = get_has_tr_table(partition, bsize); + return (has_tr_table[idx1] >> idx2) & 1; + } +} + +// Similar to the has_tr_* tables, but store if the bottom-left reference +// pixels are available. +static uint8_t has_bl_4x4[128] = { + 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, + 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17, + 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, + 85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, + 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, + 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, + 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0, +}; +static uint8_t has_bl_4x8[64] = { + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, +}; +static uint8_t has_bl_8x4[64] = { + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, +}; +static uint8_t has_bl_8x8[32] = { + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, +}; +static uint8_t has_bl_8x16[16] = { + 16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0, +}; +static uint8_t has_bl_16x8[16] = { + 254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0, +}; +static uint8_t has_bl_16x16[8] = { + 84, 16, 84, 0, 84, 16, 84, 0, +}; +static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 }; +static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 }; +static uint8_t has_bl_32x32[2] = { 4, 4 }; +static uint8_t has_bl_32x64[1] = { 0 }; +static uint8_t has_bl_64x32[1] = { 34 }; +static uint8_t has_bl_64x64[1] = { 0 }; +static uint8_t has_bl_64x128[1] = { 0 }; +static uint8_t has_bl_128x64[1] = { 0 }; +static uint8_t has_bl_128x128[1] = { 0 }; +static uint8_t has_bl_4x16[32] = { + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, +}; +static uint8_t has_bl_16x4[32] = { + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, +}; +static uint8_t has_bl_8x32[8] = { + 0, 1, 0, 0, 0, 1, 0, 0, +}; +static uint8_t has_bl_32x8[8] = { + 238, 78, 238, 14, 238, 78, 238, 14, +}; +static uint8_t has_bl_16x64[2] = { 0, 0 }; +static uint8_t has_bl_64x16[2] = { 42, 42 }; + +static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = { + // 4X4 + has_bl_4x4, + // 4X8, 8X4, 8X8 + has_bl_4x8, has_bl_8x4, has_bl_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, has_bl_16x8, 
has_bl_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, has_bl_32x16, has_bl_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, has_bl_64x32, has_bl_64x64, + // 64x128, 128x64, 128x128 + has_bl_64x128, has_bl_128x64, has_bl_128x128, + // 4x16, 16x4, 8x32 + has_bl_4x16, has_bl_16x4, has_bl_8x32, + // 32x8, 16x64, 64x16 + has_bl_32x8, has_bl_16x64, has_bl_64x16 +}; + +static uint8_t has_bl_vert_8x8[32] = { + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, +}; +static uint8_t has_bl_vert_16x16[8] = { + 254, 16, 254, 0, 254, 16, 254, 0, +}; +static uint8_t has_bl_vert_32x32[2] = { 14, 14 }; +static uint8_t has_bl_vert_64x64[1] = { 2 }; + +// The _vert_* tables are like the ordinary tables above, but describe the +// order we visit square blocks when doing a PARTITION_VERT_A or +// PARTITION_VERT_B. This is the same order as normal except for on the last +// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_bl_4x8, NULL, has_bl_vert_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, NULL, has_bl_vert_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, NULL, has_bl_vert_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, NULL, has_bl_vert_64x64, + // 64x128, 128x64, 128x128 + has_bl_64x128, NULL, has_bl_128x128 +}; + +static const uint8_t *get_has_bl_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. + if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_bl_vert_tables[bsize]; + } else { + ret = has_bl_tables[bsize]; + } + assert(ret); + return ret; +} + +static int has_bottom_left(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, + int mi_col, int bottom_available, int left_available, + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { + if (!bottom_available || !left_available) return 0; + + // Special case for 128x* blocks, when col_off is half the block width. + // This is needed because 128x* superblocks are divided into 64x* blocks in + // raster order + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) { + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + if (col_off_64 == 0) { + // We are at the left edge of top-right or bottom-right 64x* block. + const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y; + const int row_off_64 = row_off % plane_bh_unit_64; + const int plane_bh_unit = + AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64); + // Check if all bottom-left pixels are in the left 64x* block (which is + // already coded). + return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit; + } + } + + if (col_off > 0) { + // Bottom-left pixels are in the bottom-left block, which is not available. 
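+ // (Transform blocks inside a block are visited in raster order, so the
+ // pixels below-left of this one lie on a later row and have not been
+ // reconstructed yet.)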
+ return 0; + } else { + const int bh_unit = mi_size_high[bsize]; + const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1); + const int bottom_left_count_unit = tx_size_high_unit[txsz]; + + // All bottom-left pixels are in the left block, which is already available. + if (row_off + bottom_left_count_unit < plane_bh_unit) return 1; + + const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; + const int bh_in_mi_log2 = mi_size_high_log2[bsize]; + const int sb_mi_size = mi_size_high[sb_size]; + const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; + const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; + + // Leftmost column of superblock: so bottom-left pixels maybe in the left + // and/or bottom-left superblocks. But only the left superblock is + // available, so check if all required pixels fall in that superblock. + if (blk_col_in_sb == 0) { + const int blk_start_row_off = + blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >> + ss_y; + const int row_off_in_sb = blk_start_row_off + row_off; + const int sb_height_unit = sb_mi_size >> ss_y; + return row_off_in_sb + bottom_left_count_unit < sb_height_unit; + } + + // Bottom row of superblock (and not the leftmost column): so bottom-left + // pixels fall in the bottom superblock, which is not available yet. + if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0; + + // General case (neither leftmost column nor bottom row): check if the + // bottom-left block is coded before the current block. + const int this_blk_index = + ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + + blk_col_in_sb + 0; + const int idx1 = this_blk_index / 8; + const int idx2 = this_blk_index % 8; + const uint8_t *has_bl_table = get_has_bl_table(partition, bsize); + return (has_bl_table[idx1] >> idx2) & 1; + } +} + +typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); + +static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL]; +static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL]; + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd); +static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL]; +static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL]; +#endif + +static void init_intra_predictors_internal(void) { + assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES); + +#define INIT_RECTANGULAR(p, type) \ + p[TX_4X8] = aom_##type##_predictor_4x8; \ + p[TX_8X4] = aom_##type##_predictor_8x4; \ + p[TX_8X16] = aom_##type##_predictor_8x16; \ + p[TX_16X8] = aom_##type##_predictor_16x8; \ + p[TX_16X32] = aom_##type##_predictor_16x32; \ + p[TX_32X16] = aom_##type##_predictor_32x16; \ + p[TX_32X64] = aom_##type##_predictor_32x64; \ + p[TX_64X32] = aom_##type##_predictor_64x32; \ + p[TX_4X16] = aom_##type##_predictor_4x16; \ + p[TX_16X4] = aom_##type##_predictor_16x4; \ + p[TX_8X32] = aom_##type##_predictor_8x32; \ + p[TX_32X8] = aom_##type##_predictor_32x8; \ + p[TX_16X64] = aom_##type##_predictor_16x64; \ + p[TX_64X16] = aom_##type##_predictor_64x16; + +#define INIT_NO_4X4(p, type) \ + p[TX_8X8] = aom_##type##_predictor_8x8; \ + p[TX_16X16] = aom_##type##_predictor_16x16; \ + p[TX_32X32] = aom_##type##_predictor_32x32; \ + p[TX_64X64] = aom_##type##_predictor_64x64; \ + INIT_RECTANGULAR(p, type) + +#define INIT_ALL_SIZES(p, type) \ + p[TX_4X4] = aom_##type##_predictor_4x4; \ + INIT_NO_4X4(p, type) + + INIT_ALL_SIZES(pred[V_PRED], v) + 
INIT_ALL_SIZES(pred[H_PRED], h) + INIT_ALL_SIZES(pred[PAETH_PRED], paeth) + INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth) + INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v) + INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h) + INIT_ALL_SIZES(dc_pred[0][0], dc_128) + INIT_ALL_SIZES(dc_pred[0][1], dc_top) + INIT_ALL_SIZES(dc_pred[1][0], dc_left) + INIT_ALL_SIZES(dc_pred[1][1], dc) +#if CONFIG_AV1_HIGHBITDEPTH + INIT_ALL_SIZES(pred_high[V_PRED], highbd_v) + INIT_ALL_SIZES(pred_high[H_PRED], highbd_h) + INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth) + INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth) + INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v) + INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h) + INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128) + INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top) + INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left) + INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc) +#endif +#undef intra_pred_allsizes +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + int r, c, x, base, shift, val; + + (void)left; + (void)dy; + assert(dy == 1); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << upsample_above; + const int frac_bits = 6 - upsample_above; + const int base_inc = 1 << upsample_above; + x = dx; + for (r = 0; r < bh; ++r, dst += stride, x += dx) { + base = x >> frac_bits; + shift = ((x << upsample_above) & 0x3F) >> 1; + + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + memset(dst, above[max_base_x], bw * sizeof(dst[0])); + dst += stride; + } + return; + } + + for (c = 0; c < bw; ++c, base += base_inc) { + if (base < max_base_x) { + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); + } else { + dst[c] = above[max_base_x]; + } + } + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + (void)min_base_y; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } else { + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } + dst[c] = val; + } + dst += stride; + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + int r, c, y, base, shift, val; + + (void)above; + (void)dx; + + assert(dx == 1); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << upsample_left; + const int frac_bits = 6 - upsample_left; + const int 
base_inc = 1 << upsample_left; + y = dy; + for (c = 0; c < bw; ++c, y += dy) { + base = y >> frac_bits; + shift = ((y << upsample_left) & 0x3F) >> 1; + + for (r = 0; r < bh; ++r, base += base_inc) { + if (base < max_base_y) { + val = left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5); + } else { + for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; + break; + } + } + } +} + +static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int angle) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + assert(angle > 0 && angle < 270); + + if (angle > 0 && angle < 90) { + av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx, + dy); + } else if (angle > 90 && angle < 180) { + av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above, + upsample_left, dx, dy); + } else if (angle > 180 && angle < 270) { + av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx, + dy); + } else if (angle == 90) { + pred[V_PRED][tx_size](dst, stride, above, left); + } else if (angle == 180) { + pred[H_PRED][tx_size](dst, stride, above, left); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + int r, c, x, base, shift, val; + + (void)left; + (void)dy; + (void)bd; + assert(dy == 1); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << upsample_above; + const int frac_bits = 6 - upsample_above; + const int base_inc = 1 << upsample_above; + x = dx; + for (r = 0; r < bh; ++r, dst += stride, x += dx) { + base = x >> frac_bits; + shift = ((x << upsample_above) & 0x3F) >> 1; + + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + aom_memset16(dst, above[max_base_x], bw); + dst += stride; + } + return; + } + + for (c = 0; c < bw; ++c, base += base_inc) { + if (base < max_base_x) { + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); + } else { + dst[c] = above[max_base_x]; + } + } + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + (void)min_base_y; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } else { + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } + dst[c] = 
val; + } + dst += stride; + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + int r, c, y, base, shift, val; + + (void)above; + (void)dx; + (void)bd; + assert(dx == 1); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << upsample_left; + const int frac_bits = 6 - upsample_left; + const int base_inc = 1 << upsample_left; + y = dy; + for (c = 0; c < bw; ++c, y += dy) { + base = y >> frac_bits; + shift = ((y << upsample_left) & 0x3F) >> 1; + + for (r = 0; r < bh; ++r, base += base_inc) { + if (base < max_base_y) { + val = left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5); + } else { + for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; + break; + } + } + } +} + +static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int angle, int bd) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + assert(angle > 0 && angle < 270); + + if (angle > 0 && angle < 90) { + av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left, + upsample_above, dx, dy, bd); + } else if (angle > 90 && angle < 180) { + av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left, + upsample_above, upsample_left, dx, dy, bd); + } else if (angle > 180 && angle < 270) { + av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, + dx, dy, bd); + } else if (angle == 90) { + pred_high[V_PRED][tx_size](dst, stride, above, left, bd); + } else if (angle == 180) { + pred_high[H_PRED][tx_size](dst, stride, above, left, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +DECLARE_ALIGNED(16, const int8_t, + av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = { + { + { -6, 10, 0, 0, 0, 12, 0, 0 }, + { -5, 2, 10, 0, 0, 9, 0, 0 }, + { -3, 1, 1, 10, 0, 7, 0, 0 }, + { -3, 1, 1, 2, 10, 5, 0, 0 }, + { -4, 6, 0, 0, 0, 2, 12, 0 }, + { -3, 2, 6, 0, 0, 2, 9, 0 }, + { -3, 2, 2, 6, 0, 2, 7, 0 }, + { -3, 1, 2, 2, 6, 3, 5, 0 }, + }, + { + { -10, 16, 0, 0, 0, 10, 0, 0 }, + { -6, 0, 16, 0, 0, 6, 0, 0 }, + { -4, 0, 0, 16, 0, 4, 0, 0 }, + { -2, 0, 0, 0, 16, 2, 0, 0 }, + { -10, 16, 0, 0, 0, 0, 10, 0 }, + { -6, 0, 16, 0, 0, 0, 6, 0 }, + { -4, 0, 0, 16, 0, 0, 4, 0 }, + { -2, 0, 0, 0, 16, 0, 2, 0 }, + }, + { + { -8, 8, 0, 0, 0, 16, 0, 0 }, + { -8, 0, 8, 0, 0, 16, 0, 0 }, + { -8, 0, 0, 8, 0, 16, 0, 0 }, + { -8, 0, 0, 0, 8, 16, 0, 0 }, + { -4, 4, 0, 0, 0, 0, 16, 0 }, + { -4, 0, 4, 0, 0, 0, 16, 0 }, + { -4, 0, 0, 4, 0, 0, 16, 0 }, + { -4, 0, 0, 0, 4, 0, 16, 0 }, + }, + { + { -2, 8, 0, 0, 0, 10, 0, 0 }, + { -1, 3, 8, 0, 0, 6, 0, 0 }, + { -1, 2, 3, 8, 0, 4, 0, 0 }, + { 0, 1, 2, 3, 8, 2, 0, 0 }, + { -1, 4, 0, 0, 0, 3, 10, 0 }, + { -1, 3, 4, 0, 0, 4, 6, 0 }, + { -1, 2, 3, 4, 0, 4, 4, 0 }, + { -1, 2, 2, 3, 4, 3, 3, 0 }, + }, + { + { -12, 14, 0, 0, 0, 14, 0, 0 }, + { -10, 0, 14, 0, 0, 12, 0, 0 }, + { -9, 0, 0, 14, 0, 11, 0, 0 }, + { -8, 0, 0, 0, 14, 10, 0, 0 }, + { -10, 12, 0, 0, 0, 0, 14, 0 }, + { -9, 1, 12, 0, 0, 0, 12, 0 }, + { -8, 0, 0, 12, 0, 1, 11, 0 }, + { -7, 0, 0, 1, 12, 1, 9, 0 }, + }, +}; + +void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + int r, c; + uint8_t 
buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint8_t p0 = buffer[r - 1][c - 1]; + const uint8_t p1 = buffer[r - 1][c]; + const uint8_t p2 = buffer[r - 1][c + 1]; + const uint8_t p3 = buffer[r - 1][c + 2]; + const uint8_t p4 = buffer[r - 1][c + 3]; + const uint8_t p5 = buffer[r][c - 1]; + const uint8_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + int pr = av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6; + // Section 7.11.2.3 specifies the right-hand side of the assignment as + // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ). + // Since Clip1() clips a negative value to 0, it is safe to replace + // Round2Signed() with Round2(). + buffer[r + r_offset][c + c_offset] = + clip_pixel(ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS)); + } + } + + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); + dst += stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, + const uint16_t *above, + const uint16_t *left, int mode, + int bd) { + int r, c; + uint16_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0])); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint16_t p0 = buffer[r - 1][c - 1]; + const uint16_t p1 = buffer[r - 1][c]; + const uint16_t p2 = buffer[r - 1][c + 1]; + const uint16_t p3 = buffer[r - 1][c + 2]; + const uint16_t p4 = buffer[r - 1][c + 3]; + const uint16_t p5 = buffer[r][c - 1]; + const uint16_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + int pr = av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6; + // Section 7.11.2.3 specifies the right-hand side of the assignment as + // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ). + // Since Clip1() clips a negative value to 0, it is safe to replace + // Round2Signed() with Round2(). + buffer[r + r_offset][c + c_offset] = clip_pixel_highbd( + ROUND_POWER_OF_TWO(pr, FILTER_INTRA_SCALE_BITS), bd); + } + } + + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0])); + dst += stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { + if (plane == 0) { + const PREDICTION_MODE mode = mbmi->mode; + return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED); + } else { + // uv_mode is not set for inter blocks, so need to explicitly + // detect that case. 
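+ // (The mbmi here may be an above/left neighbour passed in from
+ // get_intra_edge_filter_type(), and that neighbour can be inter coded.)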
+ if (is_inter_block(mbmi)) return 0; + + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED || + uv_mode == UV_SMOOTH_H_PRED); + } +} + +static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) { + const MB_MODE_INFO *above; + const MB_MODE_INFO *left; + + if (plane == 0) { + above = xd->above_mbmi; + left = xd->left_mbmi; + } else { + above = xd->chroma_above_mbmi; + left = xd->chroma_left_mbmi; + } + + return (above && is_smooth(above, plane)) || (left && is_smooth(left, plane)); +} + +static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { + const int d = abs(delta); + int strength = 0; + + const int blk_wh = bs0 + bs1; + if (type == 0) { + if (blk_wh <= 8) { + if (d >= 56) strength = 1; + } else if (blk_wh <= 12) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 16) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 24) { + if (d >= 8) strength = 1; + if (d >= 16) strength = 2; + if (d >= 32) strength = 3; + } else if (blk_wh <= 32) { + if (d >= 1) strength = 1; + if (d >= 4) strength = 2; + if (d >= 32) strength = 3; + } else { + if (d >= 1) strength = 3; + } + } else { + if (blk_wh <= 8) { + if (d >= 40) strength = 1; + if (d >= 64) strength = 2; + } else if (blk_wh <= 16) { + if (d >= 20) strength = 1; + if (d >= 48) strength = 2; + } else if (blk_wh <= 24) { + if (d >= 4) strength = 3; + } else { + if (d >= 1) strength = 3; + } + } + return strength; +} + +void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) { + if (!strength) return; + + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; + const int filt = strength - 1; + uint8_t edge[129]; + + memcpy(edge, p, sz * sizeof(*p)); + for (int i = 1; i < sz; i++) { + int s = 0; + for (int j = 0; j < INTRA_EDGE_TAPS; j++) { + int k = i - 2 + j; + k = (k < 0) ? 0 : k; + k = (k > sz - 1) ? 
sz - 1 : k; + s += edge[k] * kernel[filt][j]; + } + s = (s + 8) >> 4; + p[i] = s; + } +} + +static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} + +void av1_upsample_intra_edge_c(uint8_t *p, int sz) { + // interpolate half-sample positions + assert(sz <= MAX_UPSAMPLE_SZ); + + uint8_t in[MAX_UPSAMPLE_SZ + 3]; + // copy p[-1..(sz-1)] and extend first and last samples + in[0] = p[-1]; + in[1] = p[-1]; + for (int i = 0; i < sz; i++) { + in[i + 2] = p[i]; + } + in[sz + 2] = p[sz - 1]; + + // interpolate half-sample edge positions + p[-2] = in[0]; + for (int i = 0; i < sz; i++) { + int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; + s = clip_pixel((s + 8) >> 4); + p[2 * i - 1] = s; + p[2 * i] = in[i + 2]; + } +} + +static void build_directional_and_filter_intra_predictors( + const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) { + int i; + const uint8_t *above_ref = ref - ref_stride; + const uint8_t *left_ref = ref - 1; + DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint8_t *const above_row = above_data + 16; + uint8_t *const left_col = left_data + 16; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + assert(use_filter_intra || is_dr_mode); + // The left_data, above_data buffers must be zeroed to fix some intermittent + // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 + // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to + // be the potential reason for this issue. + memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS); + memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS); + + // The default values if ref pixels are not available: + // 128 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // .. + + if (is_dr_mode) { + if (p_angle <= 90) + need_above = 1, need_left = 0, need_above_left = 1; + else if (p_angle < 180) + need_above = 1, need_left = 1, need_above_left = 1; + else + need_above = 0, need_left = 1, need_above_left = 1; + } + if (use_filter_intra) need_left = need_above = need_above_left = 1; + + assert(n_top_px >= 0); + assert(n_topright_px >= -1); + assert(n_left_px >= 0); + assert(n_bottomleft_px >= -1); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : 129; + } else { + val = (n_left_px > 0) ? left_ref[0] : 127; + } + for (i = 0; i < txhpx; ++i) { + memset(dst, val, txwpx); + dst += dst_stride; + } + return; + } + + // NEED_LEFT + if (need_left) { + const int num_left_pixels_needed = + txhpx + (n_bottomleft_px >= 0 ? 
txwpx : 0); + i = 0; + if (n_left_px > 0) { + for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (n_bottomleft_px > 0) { + assert(i == txhpx); + for (; i < txhpx + n_bottomleft_px; i++) + left_col[i] = left_ref[i * ref_stride]; + } + if (i < num_left_pixels_needed) + memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); + } else if (n_top_px > 0) { + memset(left_col, above_ref[0], num_left_pixels_needed); + } + } + + // NEED_ABOVE + if (need_above) { + const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px); + i = n_top_px; + if (n_topright_px > 0) { + assert(n_top_px == txwpx); + memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px); + i += n_topright_px; + } + if (i < num_top_pixels_needed) + memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i); + } else if (n_left_px > 0) { + memset(above_row, left_ref[0], num_top_pixels_needed); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = 128; + } + left_col[-1] = above_row[-1]; + } + + if (use_filter_intra) { + av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, + filter_intra_mode); + return; + } + + assert(is_dr_mode); + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + if (p_angle != 90 && p_angle != 180) { + const int ab_le = need_above_left ? 1 : 0; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + filter_intra_edge_corner(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_filter_intra_edge(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_filter_intra_edge(left_col - ab_le, n_px, strength); + } + } + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_upsample_intra_edge(above_row, n_px); + } + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_upsample_intra_edge(left_col, n_px); + } + } + dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, + upsample_left, p_angle); +} + +// This function generates the pred data of a given block for non-directional +// intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, SMOOTH_V and PAETH). 
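+// Unlike build_directional_and_filter_intra_predictors(), these modes never
+// use the top-right or bottom-left extensions, so at most txwpx above pixels
+// and txhpx left pixels are prepared here.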
+static void build_non_directional_intra_predictors( + const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px) { + const uint8_t *above_ref = ref - ref_stride; + const uint8_t *left_ref = ref - 1; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + const int need_left = extend_modes[mode] & NEED_LEFT; + const int need_above = extend_modes[mode] & NEED_ABOVE; + const int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + int i = 0; + assert(n_top_px >= 0); + assert(n_left_px >= 0); + assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED || mode == PAETH_PRED); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val = 0; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : 129; + } else { + val = (n_left_px > 0) ? left_ref[0] : 127; + } + for (i = 0; i < txhpx; ++i) { + memset(dst, val, txwpx); + dst += dst_stride; + } + return; + } + + DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint8_t *const above_row = above_data + 16; + uint8_t *const left_col = left_data + 16; + + if (need_left) { + memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_left_px > 0) { + for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (i < txhpx) memset(&left_col[i], left_col[i - 1], txhpx - i); + } else if (n_top_px > 0) { + memset(left_col, above_ref[0], txhpx); + } + } + + if (need_above) { + memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px); + i = n_top_px; + if (i < txwpx) memset(&above_row[i], above_row[i - 1], txwpx - i); + } else if (n_left_px > 0) { + memset(above_row, left_ref[0], txwpx); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = 128; + } + left_col[-1] = above_row[-1]; + } + + if (mode == DC_PRED) { + dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row, + left_col); + } else { + pred[mode][tx_size](dst, dst_stride, above_row, left_col); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_filter_intra_edge_c(uint16_t *p, int sz, int strength) { + if (!strength) return; + + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; + const int filt = strength - 1; + uint16_t edge[129]; + + memcpy(edge, p, sz * sizeof(*p)); + for (int i = 1; i < sz; i++) { + int s = 0; + for (int j = 0; j < INTRA_EDGE_TAPS; j++) { + int k = i - 2 + j; + k = (k < 0) ? 0 : k; + k = (k > sz - 1) ? 
sz - 1 : k; + s += edge[k] * kernel[filt][j]; + } + s = (s + 8) >> 4; + p[i] = s; + } +} + +static void highbd_filter_intra_edge_corner(uint16_t *p_above, + uint16_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} + +void av1_highbd_upsample_intra_edge_c(uint16_t *p, int sz, int bd) { + // interpolate half-sample positions + assert(sz <= MAX_UPSAMPLE_SZ); + + uint16_t in[MAX_UPSAMPLE_SZ + 3]; + // copy p[-1..(sz-1)] and extend first and last samples + in[0] = p[-1]; + in[1] = p[-1]; + for (int i = 0; i < sz; i++) { + in[i + 2] = p[i]; + } + in[sz + 2] = p[sz - 1]; + + // interpolate half-sample edge positions + p[-2] = in[0]; + for (int i = 0; i < sz; i++) { + int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; + s = (s + 8) >> 4; + s = clip_pixel_highbd(s, bd); + p[2 * i - 1] = s; + p[2 * i] = in[i + 2]; + } +} + +static void highbd_build_intra_predictors( + const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, + PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, int intra_edge_filter_type, + int bit_depth) { + int i; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint16_t *const above_row = above_data + 16; + uint16_t *const left_col = left_data + 16; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + const uint16_t *above_ref = ref - ref_stride; + const uint16_t *left_ref = ref - 1; + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + int base = 128 << (bit_depth - 8); + // The left_data, above_data buffers must be zeroed to fix some intermittent + // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 + // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are + // seen to be the potential reason for this issue. + aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS); + aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS); + + // The default values if ref pixels are not available: + // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 + // base+1 A B .. Y Z + // base+1 C D .. W X + // base+1 E F .. U V + // base+1 G H .. S T T T T T + + if (is_dr_mode) { + if (p_angle <= 90) + need_above = 1, need_left = 0, need_above_left = 1; + else if (p_angle < 180) + need_above = 1, need_left = 1, need_above_left = 1; + else + need_above = 0, need_left = 1, need_above_left = 1; + } + if (use_filter_intra) need_left = need_above = need_above_left = 1; + + assert(n_top_px >= 0); + assert(n_topright_px >= -1); + assert(n_left_px >= 0); + assert(n_bottomleft_px >= -1); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : base + 1; + } else { + val = (n_left_px > 0) ? 
left_ref[0] : base - 1; + } + for (i = 0; i < txhpx; ++i) { + aom_memset16(dst, val, txwpx); + dst += dst_stride; + } + return; + } + + // NEED_LEFT + if (need_left) { + const int num_left_pixels_needed = + txhpx + (n_bottomleft_px >= 0 ? txwpx : 0); + i = 0; + if (n_left_px > 0) { + for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (n_bottomleft_px > 0) { + assert(i == txhpx); + for (; i < txhpx + n_bottomleft_px; i++) + left_col[i] = left_ref[i * ref_stride]; + } + if (i < num_left_pixels_needed) + aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); + } else if (n_top_px > 0) { + aom_memset16(left_col, above_ref[0], num_left_pixels_needed); + } + } + + // NEED_ABOVE + if (need_above) { + const int num_top_pixels_needed = txwpx + (n_topright_px >= 0 ? txhpx : 0); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); + i = n_top_px; + if (n_topright_px > 0) { + assert(n_top_px == txwpx); + memcpy(above_row + txwpx, above_ref + txwpx, + n_topright_px * sizeof(above_ref[0])); + i += n_topright_px; + } + if (i < num_top_pixels_needed) + aom_memset16(&above_row[i], above_row[i - 1], + num_top_pixels_needed - i); + } else if (n_left_px > 0) { + aom_memset16(above_row, left_ref[0], num_top_pixels_needed); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = base; + } + left_col[-1] = above_row[-1]; + } + + if (use_filter_intra) { + highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, + filter_intra_mode, bit_depth); + return; + } + + if (is_dr_mode) { + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + if (p_angle != 90 && p_angle != 180) { + const int ab_le = need_above_left ? 1 : 0; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + highbd_filter_intra_edge_corner(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength); + } + } + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth); + } + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? 
txwpx : 0); + av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth); + } + } + highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, + upsample_above, upsample_left, p_angle, bit_depth); + return; + } + + // predict + if (mode == DC_PRED) { + dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( + dst, dst_stride, above_row, left_col, bit_depth); + } else { + pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + BLOCK_SIZE bs = bsize; + switch (bsize) { + case BLOCK_4X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_4X8: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X8; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_8X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_8X8; + break; + case BLOCK_4X16: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X16; + else if (subsampling_x == 1) + bs = BLOCK_8X16; + else if (subsampling_y == 1) + bs = BLOCK_4X16; + break; + case BLOCK_16X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_16X8; + else if (subsampling_x == 1) + bs = BLOCK_16X4; + else if (subsampling_y == 1) + bs = BLOCK_16X8; + break; + default: break; + } + return bs; +} + +void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, + int enable_intra_edge_filter, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, + int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, + const uint8_t *ref, int ref_stride, uint8_t *dst, + int dst_stride, int col_off, int row_off, + int plane) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + const int x = col_off << MI_SIZE_LOG2; + const int y = row_off << MI_SIZE_LOG2; + const int is_hbd = is_cur_buf_hbd(xd); + + assert(mode < INTRA_MODES); + + if (use_palette) { + int r, c; + const uint8_t *const map = xd->plane[plane != 0].color_index_map + + xd->color_index_map_offset[plane != 0]; + const uint16_t *const palette = + mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; + if (is_hbd) { + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]]; + } + } + } else { + for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst[r * dst_stride + c] = + (uint8_t)palette[map[(r + y) * wpx + c + x]]; + } + } + } + return; + } + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int have_top = + row_off || (ss_y ? xd->chroma_up_available : xd->up_available); + const int have_left = + col_off || (ss_x ? 
xd->chroma_left_available : xd->left_available); + + // Distance between the right edge of this prediction block to + // the frame right edge + const int xr = (xd->mb_to_right_edge >> (3 + ss_x)) + wpx - x - txwpx; + // Distance between the bottom edge of this prediction block to + // the frame bottom edge + const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx; + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + const int is_dr_mode = av1_is_directional_mode(mode); + + // The computations in this function, as well as in build_intra_predictors(), + // are generalized for all intra modes. Some of these operations are not + // required since non-directional intra modes (i.e., DC, SMOOTH, SMOOTH_H, + // SMOOTH_V, and PAETH) specifically require left and top neighbors. Hence, a + // separate function build_non_directional_intra_predictors() is introduced + // for these modes to avoid redundant computations while generating pred data. + + // TODO(aomedia:3532): Enable this refactoring for high bd path as well. + if (!is_hbd && !use_filter_intra && !is_dr_mode) { + build_non_directional_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, tx_size, + have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0); + return; + } + + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + const int right_available = + mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end; + const int bottom_available = + (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end); + + const PARTITION_TYPE partition = mbmi->partition; + + BLOCK_SIZE bsize = mbmi->bsize; + // force 4x4 chroma component block size. + if (ss_x || ss_y) { + bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + } + + int p_angle = 0; + int need_top_right = extend_modes[mode] & NEED_ABOVERIGHT; + int need_bottom_left = extend_modes[mode] & NEED_BOTTOMLEFT; + + if (use_filter_intra) { + need_top_right = 0; + need_bottom_left = 0; + } + if (is_dr_mode) { + p_angle = mode_to_angle_map[mode] + angle_delta; + need_top_right = p_angle < 90; + need_bottom_left = p_angle > 180; + } + + // Possible states for have_top_right(TR) and have_bottom_left(BL) + // -1 : TR and BL are not needed + // 0 : TR and BL are needed but not available + // > 0 : TR and BL are needed and pixels are available + const int have_top_right = + need_top_right ? has_top_right(sb_size, bsize, mi_row, mi_col, have_top, + right_available, partition, tx_size, + row_off, col_off, ss_x, ss_y) + : -1; + const int have_bottom_left = + need_bottom_left ? has_bottom_left(sb_size, bsize, mi_row, mi_col, + bottom_available, have_left, partition, + tx_size, row_off, col_off, ss_x, ss_y) + : -1; + + const int disable_edge_filter = !enable_intra_edge_filter; + const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + highbd_build_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, + tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left > 0 ? 
AOMMIN(txhpx, yd) : have_bottom_left, + intra_edge_filter_type, xd->bd); + return; + } +#endif + build_directional_and_filter_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, + tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left, + intra_edge_filter_type); +} + +void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, int blk_col, int blk_row, + TX_SIZE tx_size) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + const PREDICTION_MODE mode = + (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); + const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0; + const FILTER_INTRA_MODE filter_intra_mode = + (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra) + ? mbmi->filter_intra_mode_info.filter_intra_mode + : FILTER_INTRA_MODES; + const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP; + const SequenceHeader *seq_params = cm->seq_params; + + if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) { +#if CONFIG_DEBUG + assert(is_cfl_allowed(xd)); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + (void)plane_bsize; + assert(plane_bsize < BLOCK_SIZES_ALL); + if (!xd->lossless[mbmi->segment_id]) { + assert(blk_col == 0); + assert(blk_row == 0); + assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); + assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); + } +#endif + CFL_CTX *const cfl = &xd->cfl; + CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane); + if (!cfl->dc_pred_is_cached[pred_plane]) { + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mode, angle_delta, + use_palette, filter_intra_mode, dst, dst_stride, + dst, dst_stride, blk_col, blk_row, plane); + if (cfl->use_dc_pred_cache) { + cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]); + cfl->dc_pred_is_cached[pred_plane] = true; + } + } else { + cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane); + } + av1_cfl_predict_block(xd, dst, dst_stride, tx_size, plane); + return; + } + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode, + dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane); +} + +void av1_init_intra_predictors(void) { + aom_once(init_intra_predictors_internal); +} diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h new file mode 100644 index 0000000000..fa66ccd541 --- /dev/null +++ b/third_party/aom/av1/common/reconintra.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_RECONINTRA_H_ +#define AOM_AV1_COMMON_RECONINTRA_H_ + +#include + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_init_intra_predictors(void); +void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, int blk_col, int blk_row, + TX_SIZE tx_size); +void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, + int enable_intra_edge_filter, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, + int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, + const uint8_t *ref, int ref_stride, uint8_t *dst, + int dst_stride, int col_off, int row_off, + int plane); + +// Mapping of interintra to intra mode for use in the intra component +static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = { + DC_PRED, V_PRED, H_PRED, SMOOTH_PRED +}; + +// Mapping of intra mode to the interintra mode +static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = { + II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_V_PRED, + II_H_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED +}; + +#define FILTER_INTRA_SCALE_BITS 4 + +static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) { + return mode >= V_PRED && mode <= D67_PRED; +} + +static INLINE int av1_is_diagonal_mode(PREDICTION_MODE mode) { + return mode >= D45_PRED && mode <= D67_PRED; +} + +static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) { + return bsize >= BLOCK_8X8; +} + +static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) { + return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools && + cm->features.allow_intrabc; +} + +static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, + BLOCK_SIZE bs) { + if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0; + + return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32; +} + +static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm, + const MB_MODE_INFO *mbmi) { + return mbmi->mode == DC_PRED && + mbmi->palette_mode_info.palette_size[0] == 0 && + av1_filter_intra_allowed_bsize(cm, mbmi->bsize); +} + +extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]; + +static const int16_t dr_intra_derivative[90] = { + // More evenly spread out angles and limited to 10-bit + // Values that are 0 will never be used + // Approx angle + 0, 0, 0, // + 1023, 0, 0, // 3, ... + 547, 0, 0, // 6, ... + 372, 0, 0, 0, 0, // 9, ... + 273, 0, 0, // 14, ... + 215, 0, 0, // 17, ... + 178, 0, 0, // 20, ... + 151, 0, 0, // 23, ... (113 & 203 are base angles) + 132, 0, 0, // 26, ... + 116, 0, 0, // 29, ... + 102, 0, 0, 0, // 32, ... + 90, 0, 0, // 36, ... + 80, 0, 0, // 39, ... + 71, 0, 0, // 42, ... + 64, 0, 0, // 45, ... (45 & 135 are base angles) + 57, 0, 0, // 48, ... + 51, 0, 0, // 51, ... + 45, 0, 0, 0, // 54, ... + 40, 0, 0, // 58, ... + 35, 0, 0, // 61, ... + 31, 0, 0, // 64, ... + 27, 0, 0, // 67, ... (67 & 157 are base angles) + 23, 0, 0, // 70, ... + 19, 0, 0, // 73, ... + 15, 0, 0, 0, 0, // 76, ... + 11, 0, 0, // 81, ... + 7, 0, 0, // 84, ... + 3, 0, 0, // 87, ... +}; + +// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y. 
+// If angle > 0 && angle < 90, dx = -((int)(256 / t)); +// If angle > 90 && angle < 180, dx = (int)(256 / t); +// If angle > 180 && angle < 270, dx = 1; +static INLINE int av1_get_dx(int angle) { + if (angle > 0 && angle < 90) { + return dr_intra_derivative[angle]; + } else if (angle > 90 && angle < 180) { + return dr_intra_derivative[180 - angle]; + } else { + // In this case, we are not really going to use dx. We may return any value. + return 1; + } +} + +// Get the shift (up-scaled by 256) in Y w.r.t a unit change in X. +// If angle > 0 && angle < 90, dy = 1; +// If angle > 90 && angle < 180, dy = (int)(256 * t); +// If angle > 180 && angle < 270, dy = -((int)(256 * t)); +static INLINE int av1_get_dy(int angle) { + if (angle > 90 && angle < 180) { + return dr_intra_derivative[angle - 90]; + } else if (angle > 180 && angle < 270) { + return dr_intra_derivative[270 - angle]; + } else { + // In this case, we are not really going to use dy. We may return any value. + return 1; + } +} + +static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta, + int type) { + const int d = abs(delta); + const int blk_wh = bs0 + bs1; + if (d == 0 || d >= 40) return 0; + return type ? (blk_wh <= 8) : (blk_wh <= 16); +} +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_COMMON_RECONINTRA_H_ diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c new file mode 100644 index 0000000000..1b348836a5 --- /dev/null +++ b/third_party/aom/av1/common/resize.c @@ -0,0 +1,1452 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_ports/mem.h" +#include "aom_scale/aom_scale.h" +#include "av1/common/common.h" +#include "av1/common/resize.h" + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +// Filters for interpolation (0.5-band) - note this also filters integer pels. 
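+// Each entry below is an 8-tap kernel for one of the 64 subpel phases; the
+// taps of each kernel sum to 128, i.e. unit DC gain at 7-bit filter precision.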
+static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = { + { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, 0, 34, 64, 36, 0, -3, 0 }, + { -3, -1, 34, 64, 36, 1, -3, 0 }, { -3, -1, 33, 64, 37, 1, -3, 0 }, + { -3, -1, 32, 64, 38, 1, -3, 0 }, { -3, -1, 31, 64, 39, 1, -3, 0 }, + { -3, -1, 31, 63, 39, 2, -3, 0 }, { -2, -2, 30, 63, 40, 2, -3, 0 }, + { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 29, 63, 41, 3, -4, 0 }, + { -2, -2, 28, 63, 42, 3, -4, 0 }, { -2, -2, 27, 63, 43, 3, -4, 0 }, + { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 26, 62, 44, 5, -4, 0 }, + { -2, -3, 25, 62, 45, 5, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 }, + { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 }, + { -2, -3, 23, 61, 47, 6, -4, 0 }, { -2, -3, 22, 61, 48, 7, -4, -1 }, + { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 49, 8, -4, 0 }, + { -1, -4, 20, 60, 50, 8, -4, -1 }, { -1, -4, 19, 59, 51, 9, -4, -1 }, + { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 18, 58, 52, 10, -4, -1 }, + { -1, -4, 17, 58, 52, 11, -4, -1 }, { -1, -4, 16, 58, 53, 11, -4, -1 }, + { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 57, 54, 12, -4, -1 }, + { -1, -4, 15, 56, 54, 13, -4, -1 }, { -1, -4, 14, 56, 55, 13, -4, -1 }, + { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 55, 56, 14, -4, -1 }, + { -1, -4, 13, 54, 56, 15, -4, -1 }, { -1, -4, 12, 54, 57, 15, -4, -1 }, + { -1, -4, 12, 53, 57, 16, -4, -1 }, { -1, -4, 11, 53, 58, 16, -4, -1 }, + { -1, -4, 11, 52, 58, 17, -4, -1 }, { -1, -4, 10, 52, 58, 18, -4, -1 }, + { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 9, 51, 59, 19, -4, -1 }, + { -1, -4, 8, 50, 60, 20, -4, -1 }, { 0, -4, 8, 49, 60, 20, -4, -1 }, + { 0, -4, 7, 49, 60, 21, -3, -2 }, { -1, -4, 7, 48, 61, 22, -3, -2 }, + { 0, -4, 6, 47, 61, 23, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 }, + { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 }, + { 0, -4, 5, 45, 62, 25, -3, -2 }, { 0, -4, 5, 44, 62, 26, -3, -2 }, + { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 43, 63, 27, -2, -2 }, + { 0, -4, 3, 42, 63, 28, -2, -2 }, { 0, -4, 3, 41, 63, 29, -2, -2 }, + { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 40, 63, 30, -2, -2 }, + { 0, -3, 2, 39, 63, 31, -1, -3 }, { 0, -3, 1, 39, 64, 31, -1, -3 }, + { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 37, 64, 33, -1, -3 }, + { 0, -3, 1, 36, 64, 34, -1, -3 }, { 0, -3, 0, 36, 64, 34, 0, -3 }, +}; + +// Filters for interpolation (0.625-band) - note this also filters integer pels. 
+static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = { + { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 }, + { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 }, + { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 }, + { 0, -8, 26, 79, 39, -7, -2, 1 }, { 0, -8, 25, 79, 40, -7, -2, 1 }, + { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 23, 78, 42, -6, -2, 1 }, + { 0, -8, 22, 78, 43, -6, -2, 1 }, { 0, -8, 21, 78, 44, -6, -2, 1 }, + { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 19, 77, 47, -5, -3, 1 }, + { 0, -8, 18, 77, 48, -5, -3, 1 }, { 0, -8, 17, 77, 49, -5, -3, 1 }, + { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 76, 51, -4, -3, 1 }, + { 0, -8, 15, 75, 52, -3, -4, 1 }, { 0, -7, 14, 74, 53, -3, -4, 1 }, + { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 12, 73, 55, -2, -4, 1 }, + { 0, -7, 11, 73, 56, -2, -4, 1 }, { 0, -7, 10, 72, 57, -1, -4, 1 }, + { 1, -7, 10, 71, 58, -1, -5, 1 }, { 0, -7, 9, 71, 59, 0, -5, 1 }, + { 1, -7, 8, 70, 60, 0, -5, 1 }, { 1, -7, 7, 69, 61, 1, -5, 1 }, + { 1, -6, 6, 68, 62, 1, -5, 1 }, { 0, -6, 6, 68, 62, 2, -5, 1 }, + { 1, -6, 5, 67, 63, 2, -5, 1 }, { 1, -6, 5, 66, 64, 3, -6, 1 }, + { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -6, 3, 64, 66, 5, -6, 1 }, + { 1, -5, 2, 63, 67, 5, -6, 1 }, { 1, -5, 2, 62, 68, 6, -6, 0 }, + { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 1, 61, 69, 7, -7, 1 }, + { 1, -5, 0, 60, 70, 8, -7, 1 }, { 1, -5, 0, 59, 71, 9, -7, 0 }, + { 1, -5, -1, 58, 71, 10, -7, 1 }, { 1, -4, -1, 57, 72, 10, -7, 0 }, + { 1, -4, -2, 56, 73, 11, -7, 0 }, { 1, -4, -2, 55, 73, 12, -7, 0 }, + { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 53, 74, 14, -7, 0 }, + { 1, -4, -3, 52, 75, 15, -8, 0 }, { 1, -3, -4, 51, 76, 15, -8, 0 }, + { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 49, 77, 17, -8, 0 }, + { 1, -3, -5, 48, 77, 18, -8, 0 }, { 1, -3, -5, 47, 77, 19, -8, 0 }, + { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 44, 78, 21, -8, 0 }, + { 1, -2, -6, 43, 78, 22, -8, 0 }, { 1, -2, -6, 42, 78, 23, -8, 0 }, + { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 40, 79, 25, -8, 0 }, + { 1, -2, -7, 39, 79, 26, -8, 0 }, { 1, -2, -7, 38, 80, 27, -8, -1 }, + { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -2, -7, 36, 80, 29, -8, -1 }, + { 1, -1, -8, 35, 80, 30, -8, -1 }, { 1, -1, -8, 34, 80, 31, -8, -1 }, +}; + +// Filters for interpolation (0.75-band) - note this also filters integer pels. 
+static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = { + { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 }, + { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 }, + { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 }, + { 2, -10, 17, 95, 34, -12, 2, 0 }, { 2, -9, 15, 95, 35, -12, 2, 0 }, + { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -9, 13, 94, 38, -12, 2, 0 }, + { 2, -8, 12, 93, 40, -12, 1, 0 }, { 2, -8, 11, 93, 41, -12, 1, 0 }, + { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -8, 8, 92, 44, -12, 1, 1 }, + { 2, -7, 7, 91, 46, -12, 1, 0 }, { 2, -7, 6, 90, 47, -12, 1, 1 }, + { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 4, 89, 50, -12, 1, 0 }, + { 2, -6, 3, 88, 52, -12, 0, 1 }, { 2, -6, 2, 87, 54, -12, 0, 1 }, + { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, 0, 85, 57, -12, 0, 1 }, + { 2, -5, -1, 84, 58, -11, 0, 1 }, { 2, -5, -2, 83, 60, -11, 0, 1 }, + { 2, -4, -2, 82, 61, -11, -1, 1 }, { 1, -4, -3, 81, 63, -10, -1, 1 }, + { 2, -4, -4, 80, 64, -10, -1, 1 }, { 1, -4, -4, 79, 66, -10, -1, 1 }, + { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 76, 69, -9, -1, 1 }, + { 1, -3, -6, 75, 70, -8, -2, 1 }, { 1, -2, -7, 74, 71, -8, -2, 1 }, + { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 71, 74, -7, -2, 1 }, + { 1, -2, -8, 70, 75, -6, -3, 1 }, { 1, -1, -9, 69, 76, -6, -3, 1 }, + { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 66, 79, -4, -4, 1 }, + { 1, -1, -10, 64, 80, -4, -4, 2 }, { 1, -1, -10, 63, 81, -3, -4, 1 }, + { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 60, 83, -2, -5, 2 }, + { 1, 0, -11, 58, 84, -1, -5, 2 }, { 1, 0, -12, 57, 85, 0, -5, 2 }, + { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 54, 87, 2, -6, 2 }, + { 1, 0, -12, 52, 88, 3, -6, 2 }, { 0, 1, -12, 50, 89, 4, -6, 2 }, + { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 47, 90, 6, -7, 2 }, + { 0, 1, -12, 46, 91, 7, -7, 2 }, { 1, 1, -12, 44, 92, 8, -8, 2 }, + { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 41, 93, 11, -8, 2 }, + { 0, 1, -12, 40, 93, 12, -8, 2 }, { 0, 2, -12, 38, 94, 13, -9, 2 }, + { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 35, 95, 15, -9, 2 }, + { 0, 2, -12, 34, 95, 17, -10, 2 }, { 0, 2, -11, 32, 95, 18, -10, 2 }, + { 0, 2, -12, 31, 96, 19, -10, 2 }, { 0, 2, -12, 29, 96, 21, -10, 2 }, + { 0, 2, -11, 28, 96, 22, -11, 2 }, { 0, 2, -11, 26, 96, 24, -11, 2 }, +}; + +// Filters for interpolation (0.875-band) - note this also filters integer pels. 
+static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = { + { 3, -8, 13, 112, 13, -8, 3, 0 }, { 2, -7, 12, 112, 15, -8, 3, -1 }, + { 3, -7, 10, 112, 17, -9, 3, -1 }, { 2, -6, 8, 112, 19, -9, 3, -1 }, + { 2, -6, 7, 112, 21, -10, 3, -1 }, { 2, -5, 6, 111, 22, -10, 3, -1 }, + { 2, -5, 4, 111, 24, -10, 3, -1 }, { 2, -4, 3, 110, 26, -11, 3, -1 }, + { 2, -4, 1, 110, 28, -11, 3, -1 }, { 2, -4, 0, 109, 30, -12, 4, -1 }, + { 1, -3, -1, 108, 32, -12, 4, -1 }, { 1, -3, -2, 108, 34, -13, 4, -1 }, + { 1, -2, -4, 107, 36, -13, 4, -1 }, { 1, -2, -5, 106, 38, -13, 4, -1 }, + { 1, -1, -6, 105, 40, -14, 4, -1 }, { 1, -1, -7, 104, 42, -14, 4, -1 }, + { 1, -1, -7, 103, 44, -15, 4, -1 }, { 1, 0, -8, 101, 46, -15, 4, -1 }, + { 1, 0, -9, 100, 48, -15, 4, -1 }, { 1, 0, -10, 99, 50, -15, 4, -1 }, + { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -11, 96, 55, -16, 4, -1 }, + { 0, 1, -12, 95, 57, -16, 4, -1 }, { 0, 2, -13, 93, 59, -16, 4, -1 }, + { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 90, 63, -16, 4, -1 }, + { 0, 2, -14, 88, 65, -16, 4, -1 }, { 0, 2, -15, 86, 67, -16, 4, 0 }, + { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 83, 71, -17, 4, 0 }, + { 0, 3, -16, 81, 73, -16, 3, 0 }, { 0, 3, -16, 79, 75, -16, 3, 0 }, + { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 75, 79, -16, 3, 0 }, + { 0, 3, -16, 73, 81, -16, 3, 0 }, { 0, 4, -17, 71, 83, -16, 3, 0 }, + { 0, 4, -17, 69, 84, -15, 3, 0 }, { 0, 4, -16, 67, 86, -15, 2, 0 }, + { -1, 4, -16, 65, 88, -14, 2, 0 }, { -1, 4, -16, 63, 90, -14, 2, 0 }, + { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 59, 93, -13, 2, 0 }, + { -1, 4, -16, 57, 95, -12, 1, 0 }, { -1, 4, -16, 55, 96, -11, 1, 0 }, + { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 50, 99, -10, 0, 1 }, + { -1, 4, -15, 48, 100, -9, 0, 1 }, { -1, 4, -15, 46, 101, -8, 0, 1 }, + { -1, 4, -15, 44, 103, -7, -1, 1 }, { -1, 4, -14, 42, 104, -7, -1, 1 }, + { -1, 4, -14, 40, 105, -6, -1, 1 }, { -1, 4, -13, 38, 106, -5, -2, 1 }, + { -1, 4, -13, 36, 107, -4, -2, 1 }, { -1, 4, -13, 34, 108, -2, -3, 1 }, + { -1, 4, -12, 32, 108, -1, -3, 1 }, { -1, 4, -12, 30, 109, 0, -4, 2 }, + { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -11, 26, 110, 3, -4, 2 }, + { -1, 3, -10, 24, 111, 4, -5, 2 }, { -1, 3, -10, 22, 111, 6, -5, 2 }, + { -1, 3, -10, 21, 112, 7, -6, 2 }, { -1, 3, -9, 19, 112, 8, -6, 2 }, + { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 }, +}; + +const int16_t av1_resize_filter_normative[( + 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = { +#if UPSCALE_NORMATIVE_TAPS == 8 + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, + { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, + { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, + { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 }, + { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 }, + { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 }, + { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 }, + { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 }, + { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 }, + { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 }, + { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 }, + { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 }, + { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 }, + { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 }, + { -2, 7, -20, 
88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 }, + { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 }, + { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 }, + { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 }, + { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 }, + { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 }, + { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 }, + { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 }, + { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 }, + { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 }, + { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 }, + { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 }, + { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 }, + { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 }, + { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 }, + { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 }, + { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 }, + { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, +#else +#error "Invalid value of UPSCALE_NORMATIVE_TAPS" +#endif // UPSCALE_NORMATIVE_TAPS == 8 +}; + +// Filters for interpolation (full-band) - no filtering for integer pixels +#define filteredinterp_filters1000 av1_resize_filter_normative + +// Filters for factor of 2 downsampling. +static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + +static const InterpKernel *choose_interp_filter(int in_length, int out_length) { + int out_length16 = out_length * 16; + if (out_length16 >= in_length * 16) + return filteredinterp_filters1000; + else if (out_length16 >= in_length * 13) + return filteredinterp_filters875; + else if (out_length16 >= in_length * 11) + return filteredinterp_filters750; + else if (out_length16 >= in_length * 9) + return filteredinterp_filters625; + else + return filteredinterp_filters500; +} + +static void interpolate_core(const uint8_t *const input, int in_length, + uint8_t *output, int out_length, + const int16_t *interp_filters, int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + uint8_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // End part. + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } +} + +static void interpolate(const uint8_t *const input, int in_length, + uint8_t *output, int out_length) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + interpolate_core(input, in_length, output, out_length, &interp_filters[0][0], + SUBPEL_TAPS); +} + +int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { + return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; +} + +static int32_t get_upscale_convolve_x0(int in_length, int out_length, + int32_t x_step_qn) { + const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS); + const int32_t x0 = + (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + + RS_SCALE_EXTRA_OFF - err / 2; + return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); +} + +static void down2_symeven(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half. 
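// The stored half filter { 56, 12, -3, -1 } mirrors into the symmetric
// 8-tap kernel { -1, -3, 12, 56, 56, 12, -3, -1 }, whose taps sum to
// 128 (1 << FILTER_BITS). With the rounding term added below, a constant
// input v therefore passes through unchanged:
//
//   (128 * v + (1 << (FILTER_BITS - 1))) >> FILTER_BITS == v
//
// Each output sample is centered between input samples i and i + 1, which is
// why the inner loop pairs input[i - j] with input[i + 1 + j].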
+ const int16_t *filter = av1_down2_symeven_half_filter; + const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + 1 + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static void down2_symodd(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half - 1. + const int16_t *filter = av1_down2_symodd_half_filter; + const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + j >= length ? length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static int get_down2_length(int length, int steps) { + for (int s = 0; s < steps; ++s) length = (length + 1) >> 1; + return length; +} + +static int get_down2_steps(int in_length, int out_length) { + int steps = 0; + int proj_in_length; + while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { + ++steps; + in_length = proj_in_length; + if (in_length == 1) { + // Special case: we break because any further calls to get_down2_length() + // with be with length == 1, which return 1, resulting in an infinite + // loop. 
+ break; + } + } + return steps; +} + +static void resize_multistep(const uint8_t *const input, int length, + uint8_t *output, int olength, uint8_t *otmp) { + if (length == olength) { + memcpy(output, input, sizeof(output[0]) * length); + return; + } + const int steps = get_down2_steps(length, olength); + + if (steps > 0) { + uint8_t *out = NULL; + int filteredlength = length; + + assert(otmp != NULL); + uint8_t *otmp2 = otmp + get_down2_length(length, 1); + for (int s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint8_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? otmp2 : otmp); + if (filteredlength & 1) + down2_symodd(in, filteredlength, out); + else + down2_symeven(in, filteredlength, out); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + interpolate(out, filteredlength, output, olength); + } + } else { + interpolate(input, length, output, olength); + } +} + +static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +bool av1_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, int width2, + int out_stride) { + int i; + bool mem_status = true; + uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height); + uint8_t *tmpbuf = + (uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height)); + uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height); + uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) { + mem_status = false; + goto Error; + } + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) + resize_multistep(input + in_stride * i, width, intbuf + width2 * i, width2, + tmpbuf); + for (i = 0; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(tmpbuf); + aom_free(arrbuf); + aom_free(arrbuf2); + return mem_status; +} + +static bool upscale_normative_rect(const uint8_t *const input, int height, + int width, int in_stride, uint8_t *output, + int height2, int width2, int out_stride, + int x_step_qn, int x0_qn, int pad_left, + int pad_right) { + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. 
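// With UPSCALE_NORMATIVE_TAPS == 8 this works out to 8 / 2 + 1 = 5 columns
// on each side: 4 to cover the half-width of the 8-tap kernel, plus 1
// because the convolution is fed input - 1 rather than input.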
+ const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + uint8_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint8_t *tmp_right = NULL; + uint8_t *const in_tl = (uint8_t *)(input - border_cols); // Cast off 'const' + uint8_t *const in_tr = (uint8_t *)(input + width); + if (pad_left) { + tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + if (!tmp_left) return false; + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols); + memset(in_tl + i * in_stride, input[i * in_stride], border_cols); + } + } + if (pad_right) { + tmp_right = + (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + if (!tmp_right) { + aom_free(tmp_left); + return false; + } + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols); + memset(in_tr + i * in_stride, input[i * in_stride + width - 1], + border_cols); + } + } + + av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2, + height2, &av1_resize_filter_normative[0][0], x0_qn, + x_step_qn); + + // Restore the left/right border pixels + if (pad_left) { + for (int i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols); + } + aom_free(tmp_left); + } + if (pad_right) { + for (int i = 0; i < height; i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols); + } + aom_free(tmp_right); + } + return true; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_interpolate_core(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd, + const int16_t *interp_filters, + int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + uint16_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // Middle part. 
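// No clamping is needed in this loop: x1 and x2 were chosen above so that,
// for every x in [x1, x2], the full interp_taps-wide footprint
// int_pel - interp_taps / 2 + 1 + k stays inside [0, in_length - 1].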
+ for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // End part. + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } +} + +static void highbd_interpolate(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + highbd_interpolate_core(input, in_length, output, out_length, bd, + &interp_filters[0][0], SUBPEL_TAPS); +} + +static void highbd_down2_symeven(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half. + static const int16_t *filter = av1_down2_symeven_half_filter; + const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[AOMMAX(0, i - j)] + input[AOMMIN(i + 1 + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[AOMMAX(0, i - j)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_down2_symodd(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half - 1. + static const int16_t *filter = av1_down2_symodd_half_filter; + const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. 
+ for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[AOMMIN(i + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_resize_multistep(const uint16_t *const input, int length, + uint16_t *output, int olength, + uint16_t *otmp, int bd) { + if (length == olength) { + memcpy(output, input, sizeof(output[0]) * length); + return; + } + const int steps = get_down2_steps(length, olength); + + if (steps > 0) { + uint16_t *out = NULL; + int filteredlength = length; + + assert(otmp != NULL); + uint16_t *otmp2 = otmp + get_down2_length(length, 1); + for (int s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint16_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? otmp2 : otmp); + if (filteredlength & 1) + highbd_down2_symodd(in, filteredlength, out, bd); + else + highbd_down2_symeven(in, filteredlength, out, bd); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + highbd_interpolate(out, filteredlength, output, olength, bd); + } + } else { + highbd_interpolate(input, length, output, olength, bd); + } +} + +static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride, int bd) { + int i; + uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height); + uint16_t *tmpbuf = + (uint16_t *)aom_malloc(sizeof(uint16_t) * AOMMAX(width, height)); + uint16_t *arrbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * height); + uint16_t *arrbuf2 = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) + goto Error; + for (i = 0; i < height; ++i) { + highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, + intbuf + width2 * i, width2, tmpbuf, bd); + } + for (i = 0; i < width2; ++i) { + highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf); + highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd); + highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2, + arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(tmpbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + +static bool highbd_upscale_normative_rect(const uint8_t *const input, + int height, int 
width, int in_stride, + uint8_t *output, int height2, + int width2, int out_stride, + int x_step_qn, int x0_qn, + int pad_left, int pad_right, int bd) { + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. + const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + const int border_size = border_cols * sizeof(uint16_t); + uint16_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint16_t *tmp_right = NULL; + uint16_t *const input16 = CONVERT_TO_SHORTPTR(input); + uint16_t *const in_tl = input16 - border_cols; + uint16_t *const in_tr = input16 + width; + if (pad_left) { + tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + if (!tmp_left) return false; + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size); + aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols); + } + } + if (pad_right) { + tmp_right = + (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + if (!tmp_right) { + aom_free(tmp_left); + return false; + } + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size); + aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1], + border_cols); + } + } + + av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride, + CONVERT_TO_SHORTPTR(output), out_stride, width2, + height2, &av1_resize_filter_normative[0][0], + x0_qn, x_step_qn, bd); + + // Restore the left/right border pixels + if (pad_left) { + for (int i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size); + } + aom_free(tmp_left); + } + if (pad_right) { + for (int i = 0; i < height; i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size); + } + aom_free(tmp_right); + } + return true; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride)) + abort(); + if (!av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride)) + abort(); + if (!av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride)) + abort(); +} + +bool av1_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride)) + return false; + if (!av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, + owidth / 2, ouv_stride)) + return false; + if (!av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, + owidth / 2, ouv_stride)) + return false; + return true; +} + +bool 
av1_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride)) + return false; + if (!av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride)) + return false; + if (!av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride)) + return false; + return true; +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride, bd); +} + +void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, + owidth / 2, ouv_stride, bd); +} + +void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride, bd); + av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_resize_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + const InterpFilter filter, + const int phase_scaler, + const int num_planes) { + assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH || + filter == EIGHTTAP_REGULAR); + const InterpKernel *const kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter].filter_ptr; + + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; + const int src_w = src->crop_widths[is_uv]; + const int src_h = src->crop_heights[is_uv]; + const uint8_t *src_buffer = src->buffers[i]; + const int src_stride = src->strides[is_uv]; + const int dst_w = dst->crop_widths[is_uv]; + const int dst_h = dst->crop_heights[is_uv]; + uint8_t *dst_buffer = dst->buffers[i]; + const int dst_stride = dst->strides[is_uv]; + for (int y = 0; y < dst_h; y += 16) { + const int y_q4 = + src_h == dst_h ? 0 : y * 16 * src_h / dst_h + phase_scaler; + for (int x = 0; x < dst_w; x += 16) { + const int x_q4 = + src_w == dst_w ? 
0 : x * 16 * src_w / dst_w + phase_scaler; + const uint8_t *src_ptr = + src_buffer + y * src_h / dst_h * src_stride + x * src_w / dst_w; + uint8_t *dst_ptr = dst_buffer + y * dst_stride + x; + + // Width and height of the actual working area. + const int work_w = AOMMIN(16, dst_w - x); + const int work_h = AOMMIN(16, dst_h - y); + // SIMD versions of aom_scaled_2d() have some trouble handling + // nonstandard sizes, so fall back on the C version to handle borders. + if (work_w != 16 || work_h != 16) { + aom_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, work_w, work_h); + } else { + aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16, 16); + } + } + } + } + aom_extend_frame_borders(dst, num_planes); +} + +bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd, + const int num_planes) { + // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; +#if CONFIG_AV1_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv], bd); + } else if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], + dst->strides[is_uv])) { + return false; + } +#else + (void)bd; + if (!av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv])) + return false; +#endif + } + aom_extend_frame_borders(dst, num_planes); + return true; +} + +void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int plane, int rows) { + const int is_uv = (plane > 0); + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x); + const int upscaled_plane_width = + ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); + const int superres_denom = cm->superres_scale_denominator; + + TileInfo tile_col; + const int32_t x_step_qn = av1_get_upscale_convolve_step( + downscaled_plane_width, upscaled_plane_width); + int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width, + upscaled_plane_width, x_step_qn); + + for (int j = 0; j < cm->tiles.cols; j++) { + av1_tile_set_col(&tile_col, cm, j); + // Determine the limits of this tile column in both the source + // and destination images. + // Note: The actual location which we start sampling from is + // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases + // by exactly dst_width * (x_step_qn/2^14) pixels each iteration. 
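// As an illustration (dimensions chosen arbitrarily, not taken from any
// particular stream): for a 2x superres upscale of a 960-wide luma plane to
// 1920, av1_get_upscale_convolve_step() gives
//
//   x_step_qn = ((960 << RS_SCALE_SUBPEL_BITS) + 1920 / 2) / 1920 = 8192,
//
// i.e. exactly half a source pixel per destination pixel in Q14.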
+ const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x); + const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x); + const int src_width = downscaled_x1 - downscaled_x0; + + const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR; + int upscaled_x1; + if (j == cm->tiles.cols - 1) { + // Note that we can't just use AOMMIN here - due to rounding, + // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than + // upscaled_plane_width. + upscaled_x1 = upscaled_plane_width; + } else { + upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR; + } + + const uint8_t *const src_ptr = src + downscaled_x0; + uint8_t *const dst_ptr = dst + upscaled_x0; + const int dst_width = upscaled_x1 - upscaled_x0; + + const int pad_left = (j == 0); + const int pad_right = (j == cm->tiles.cols - 1); + + bool success; +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params->use_highbitdepth) + success = highbd_upscale_normative_rect( + src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, + dst_stride, x_step_qn, x0_qn, pad_left, pad_right, + cm->seq_params->bit_depth); + else + success = upscale_normative_rect(src_ptr, rows, src_width, src_stride, + dst_ptr, rows, dst_width, dst_stride, + x_step_qn, x0_qn, pad_left, pad_right); +#else + success = upscale_normative_rect(src_ptr, rows, src_width, src_stride, + dst_ptr, rows, dst_width, dst_stride, + x_step_qn, x0_qn, pad_left, pad_right); +#endif + if (!success) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error upscaling frame"); + } + // Update the fractional pixel offset to prepare for the next tile column. + x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS); + } +} + +void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + const int num_planes = av1_num_planes(cm); + for (int i = 0; i < num_planes; ++i) { + const int is_uv = (i > 0); + av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], i, + src->crop_heights[is_uv]); + } + + aom_extend_frame_borders(dst, num_planes); +} + +YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required( + AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + const InterpFilter filter, const int phase, const bool use_optimized_scaler, + const bool for_psnr, const int border_in_pixels, + const int num_pyramid_levels) { + // If scaling is performed for the sole purpose of calculating PSNR, then our + // target dimensions are superres upscaled width/height. Otherwise our target + // dimensions are coded width/height. + const int scaled_width = for_psnr ? cm->superres_upscaled_width : cm->width; + const int scaled_height = + for_psnr ? cm->superres_upscaled_height : cm->height; + const bool scaling_required = (scaled_width != unscaled->y_crop_width) || + (scaled_height != unscaled->y_crop_height); + + if (scaling_required) { + const int num_planes = av1_num_planes(cm); + const SequenceHeader *seq_params = cm->seq_params; + + // Reallocate the frame buffer based on the target dimensions when scaling + // is required. 
+ if (aom_realloc_frame_buffer( + scaled, scaled_width, scaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, + num_pyramid_levels, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate scaled buffer"); + + bool has_optimized_scaler = av1_has_optimized_scaler( + unscaled->y_crop_width, unscaled->y_crop_height, scaled_width, + scaled_height); + if (num_planes > 1) { + has_optimized_scaler = has_optimized_scaler && + av1_has_optimized_scaler(unscaled->uv_crop_width, + unscaled->uv_crop_height, + scaled->uv_crop_width, + scaled->uv_crop_height); + } + +#if CONFIG_AV1_HIGHBITDEPTH + if (use_optimized_scaler && has_optimized_scaler && + cm->seq_params->bit_depth == AOM_BITS_8) { + av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); + } else { + if (!av1_resize_and_extend_frame_nonnormative( + unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffers during resize"); + } +#else + if (use_optimized_scaler && has_optimized_scaler) { + av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); + } else { + if (!av1_resize_and_extend_frame_nonnormative( + unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffers during resize"); + } +#endif + return scaled; + } + return unscaled; +} + +// Calculates the scaled dimension given the original dimension and the scale +// denominator. +static void calculate_scaled_size_helper(int *dim, int denom) { + if (denom != SCALE_NUMERATOR) { + // We need to ensure the constraint in "Appendix A" of the spec: + // * FrameWidth is greater than or equal to 16 + // * FrameHeight is greater than or equal to 16 + // For this, we clamp the downscaled dimension to at least 16. One + // exception: if original dimension itself was < 16, then we keep the + // downscaled dimension to be same as the original, to ensure that resizing + // is valid. + const int min_dim = AOMMIN(16, *dim); + // Use this version if we need *dim to be even + // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom); + // *width <<= 1; + *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom); + *dim = AOMMAX(*dim, min_dim); + } +} + +void av1_calculate_scaled_size(int *width, int *height, int resize_denom) { + calculate_scaled_size_helper(width, resize_denom); + calculate_scaled_size_helper(height, resize_denom); +} + +void av1_calculate_scaled_superres_size(int *width, int *height, + int superres_denom) { + (void)height; + calculate_scaled_size_helper(width, superres_denom); +} + +void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) { + if (denom != SCALE_NUMERATOR) { + // Note: av1_calculate_scaled_superres_size() rounds *up* after division + // when the resulting dimensions are odd. So here, we round *down*. + *width = *width * denom / SCALE_NUMERATOR; + (void)height; + } +} + +// Copy only the config data from 'src' to 'dst'. 
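// "Config data" here means the colorimetry / signalling fields only
// (bit depth, color primaries, transfer characteristics, matrix
// coefficients, monochrome flag, chroma sample position and color range);
// plane pointers, strides and dimensions are left untouched.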
+static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const dst) { + dst->bit_depth = src->bit_depth; + dst->color_primaries = src->color_primaries; + dst->transfer_characteristics = src->transfer_characteristics; + dst->matrix_coefficients = src->matrix_coefficients; + dst->monochrome = src->monochrome; + dst->chroma_sample_position = src->chroma_sample_position; + dst->color_range = src->color_range; +} + +// TODO(afergs): Look for in-place upscaling +// TODO(afergs): aom_ vs av1_ functions? Which can I use? +// Upscale decoded image. +void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, + int num_pyramid_levels) { + const int num_planes = av1_num_planes(cm); + if (!av1_superres_scaled(cm)) return; + const SequenceHeader *const seq_params = cm->seq_params; + const int byte_alignment = cm->features.byte_alignment; + + YV12_BUFFER_CONFIG copy_buffer; + memset(©_buffer, 0, sizeof(copy_buffer)); + + YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf; + + const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3); + if (aom_alloc_frame_buffer( + ©_buffer, aligned_width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, byte_alignment, 0, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate copy buffer for superres upscaling"); + + // Copy function assumes the frames are the same size. + // Note that it does not copy YV12_BUFFER_CONFIG config data. + aom_yv12_copy_frame(frame_to_show, ©_buffer, num_planes); + + assert(copy_buffer.y_crop_width == aligned_width); + assert(copy_buffer.y_crop_height == cm->height); + + // Realloc the current frame buffer at a higher resolution in place. + if (pool != NULL) { + // Use callbacks if on the decoder. + aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer; + aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb; + aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb; + void *cb_priv = pool->cb_priv; + + lock_buffer_pool(pool); + // Realloc with callback does not release the frame buffer - release first. + if (release_fb_cb(cb_priv, fb)) { + unlock_buffer_pool(pool); + aom_internal_error( + cm->error, AOM_CODEC_MEM_ERROR, + "Failed to free current frame buffer before superres upscaling"); + } + // aom_realloc_frame_buffer() leaves config data for frame_to_show intact + if (aom_realloc_frame_buffer( + frame_to_show, cm->superres_upscaled_width, + cm->superres_upscaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv, + num_pyramid_levels, 0)) { + unlock_buffer_pool(pool); + aom_internal_error( + cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate current frame buffer for superres upscaling"); + } + unlock_buffer_pool(pool); + } else { + // Make a copy of the config data for frame_to_show in copy_buffer + copy_buffer_config(frame_to_show, ©_buffer); + + // Don't use callbacks on the encoder. 
+ // aom_alloc_frame_buffer() clears the config data for frame_to_show + if (aom_alloc_frame_buffer( + frame_to_show, cm->superres_upscaled_width, + cm->superres_upscaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, byte_alignment, num_pyramid_levels, 0)) + aom_internal_error( + cm->error, AOM_CODEC_MEM_ERROR, + "Failed to reallocate current frame buffer for superres upscaling"); + + // Restore config data back to frame_to_show + copy_buffer_config(©_buffer, frame_to_show); + } + // TODO(afergs): verify frame_to_show is correct after realloc + // encoder: + // decoder: + + assert(frame_to_show->y_crop_width == cm->superres_upscaled_width); + assert(frame_to_show->y_crop_height == cm->superres_upscaled_height); + + // Scale up and back into frame_to_show. + assert(frame_to_show->y_crop_width != cm->width); + av1_upscale_normative_and_extend_frame(cm, ©_buffer, frame_to_show); + + // Free the copy buffer + aom_free_frame_buffer(©_buffer); +} diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h new file mode 100644 index 0000000000..0ba3108f72 --- /dev/null +++ b/third_party/aom/av1/common/resize.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_RESIZE_H_ +#define AOM_AV1_COMMON_RESIZE_H_ + +#include +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +bool av1_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, int width2, + int out_stride); +// TODO(aomedia:3228): In libaom 4.0.0, remove av1_resize_frame420 from +// av1/exports_com and delete this function. 
+void av1_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth); +bool av1_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth); +bool av1_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth); + +void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride, int bd); +void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd); +void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd); +void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd); + +void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int plane, int rows); +void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); + +YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required( + AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + const InterpFilter filter, const int phase, const bool use_optimized_scaler, + const bool for_psnr, const int border_in_pixels, + const int num_pyramid_levels); + +bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd, + const int num_planes); + +// Calculates the scaled dimensions from the given original dimensions and the +// resize scale denominator. +void av1_calculate_scaled_size(int *width, int *height, int resize_denom); + +// Similar to above, but calculates scaled dimensions after superres from the +// given original dimensions and superres scale denominator. +void av1_calculate_scaled_superres_size(int *width, int *height, + int superres_denom); + +// Inverse of av1_calculate_scaled_superres_size() above: calculates the +// original dimensions from the given scaled dimensions and the scale +// denominator. +void av1_calculate_unscaled_superres_size(int *width, int *height, int denom); + +void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, + int num_pyramid_levels); + +// Returns 1 if a superres upscaled frame is scaled and 0 otherwise. +static INLINE int av1_superres_scaled(const AV1_COMMON *cm) { + // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling + // required even though cm->superres_scale_denominator != SCALE_NUMERATOR. 
+ // So, the following check is more accurate. + return (cm->width != cm->superres_upscaled_width); +} + +// The optimized scaler av1_resize_and_extend_frame() can only handle scaling +// ratios >= 1/4 and <= 16. See comment in aom_convolve8_c() for detail. +// Visual assessment shows that if the scaling ratio or its reciprocal is not a +// multiple of 1/16, there are some artifacts in the output of the optimized +// scaler, especially on lines, due to non-exact ratio representation. SSSE3 +// and NEON have a specialized 3/4 version of av1_resize_and_extend_frame() +// that does not have this issue. +// +// Use the non-normative scaler av1_resize_and_extend_frame_nonnormative() +// for other scaling ratios. +static INLINE bool av1_has_optimized_scaler(const int src_width, + const int src_height, + const int dst_width, + const int dst_height) { + bool has_optimized_scaler = + (dst_width * 4 >= src_width && dst_height * 4 >= src_height) && + (dst_width <= src_width * 16 && dst_height <= src_height * 16) && + (16 * dst_width % src_width == 0) && (16 * src_width % dst_width == 0) && + (16 * dst_height % src_height == 0) && + (16 * src_height % dst_height == 0); +#if HAVE_SSSE3 || HAVE_NEON + has_optimized_scaler = + has_optimized_scaler || + (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height); +#endif + return has_optimized_scaler; +} + +#define UPSCALE_NORMATIVE_TAPS 8 +extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS] + [UPSCALE_NORMATIVE_TAPS]; + +int32_t av1_get_upscale_convolve_step(int in_length, int out_length); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_RESIZE_H_ diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c new file mode 100644 index 0000000000..0be126fa65 --- /dev/null +++ b/third_party/aom/av1/common/restoration.c @@ -0,0 +1,1494 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_mem/aom_mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/resize.h" +#include "av1/common/restoration.h" +#include "av1/common/thread_common.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" + +#include "aom_ports/mem.h" + +// The 's' values are calculated based on original 'r' and 'e' values in the +// spec using GenSgrprojVtable(). +// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid). 
+const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = { + { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } }, + { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } }, + { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } }, + { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } }, + { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } }, + { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } }, + { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } }, + { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } }, +}; + +void av1_get_upsampled_plane_size(const AV1_COMMON *cm, int is_uv, int *plane_w, + int *plane_h) { + int ss_x = is_uv && cm->seq_params->subsampling_x; + int ss_y = is_uv && cm->seq_params->subsampling_y; + *plane_w = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); + *plane_h = ROUND_POWER_OF_TWO(cm->height, ss_y); +} + +// Count horizontal or vertical units in a plane (use a width or height for +// plane_size, respectively). We basically want to divide the plane size by the +// size of a restoration unit. Rather than rounding up unconditionally as you +// might expect, we round to nearest, which models the way a right or bottom +// restoration unit can extend to up to 150% its normal width or height. +// +// The max with 1 is to deal with small frames, which may be smaller than +// half of an LR unit in size. +int av1_lr_count_units(int unit_size, int plane_size) { + return AOMMAX((plane_size + (unit_size >> 1)) / unit_size, 1); +} + +void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi, + int is_uv) { + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + const int unit_size = rsi->restoration_unit_size; + const int horz_units = av1_lr_count_units(unit_size, plane_w); + const int vert_units = av1_lr_count_units(unit_size, plane_h); + + rsi->num_rest_units = horz_units * vert_units; + rsi->horz_units = horz_units; + rsi->vert_units = vert_units; + + aom_free(rsi->unit_info); + CHECK_MEM_ERROR(cm, rsi->unit_info, + (RestorationUnitInfo *)aom_memalign( + 16, sizeof(*rsi->unit_info) * rsi->num_rest_units)); +} + +void av1_free_restoration_struct(RestorationInfo *rst_info) { + aom_free(rst_info->unit_info); + rst_info->unit_info = NULL; +} + +#if 0 +// Pair of values for each sgrproj parameter: +// Index 0 corresponds to r[0], e[0] +// Index 1 corresponds to r[1], e[1] +int sgrproj_mtable[SGRPROJ_PARAMS][2]; + +static void GenSgrprojVtable(void) { + for (int i = 0; i < SGRPROJ_PARAMS; ++i) { + const sgr_params_type *const params = &av1_sgr_params[i]; + for (int j = 0; j < 2; ++j) { + const int e = params->e[j]; + const int r = params->r[j]; + if (r == 0) { // filter is disabled + sgrproj_mtable[i][j] = -1; // mark invalid + } else { // filter is enabled + const int n = (2 * r + 1) * (2 * r + 1); + const int n2e = n * n * e; + assert(n2e != 0); + sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e); + } + } + } +} +#endif + +void av1_loop_restoration_precal(void) { +#if 0 + GenSgrprojVtable(); +#endif +} + +static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert) { + uint8_t *data_p; + int i; + for (i = 0; i < height; ++i) { + data_p = data + i * stride; + memset(data_p - border_horz, data_p[0], border_horz); + memset(data_p + width, data_p[width - 1], border_horz); + } + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, width + 2 * border_horz); + } + for (i = height; i < height + 
border_vert; ++i) { + memcpy(data_p + i * stride, data_p + (height - 1) * stride, + width + 2 * border_horz); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void extend_frame_highbd(uint16_t *data, int width, int height, + int stride, int border_horz, int border_vert) { + uint16_t *data_p; + int i, j; + for (i = 0; i < height; ++i) { + data_p = data + i * stride; + for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0]; + for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1]; + } + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, + (width + 2 * border_horz) * sizeof(uint16_t)); + } + for (i = height; i < height + border_vert; ++i) { + memcpy(data_p + i * stride, data_p + (height - 1) * stride, + (width + 2 * border_horz) * sizeof(uint16_t)); + } +} + +static void copy_rest_unit_highbd(int width, int height, const uint16_t *src, + int src_stride, uint16_t *dst, + int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); +} +#endif + +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, + border_horz, border_vert); + return; + } +#endif + (void)highbd; + extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); +} + +static void copy_rest_unit_lowbd(int width, int height, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width); +} + +static void copy_rest_unit(int width, int height, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + copy_rest_unit_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), dst_stride); + return; + } +#endif + (void)highbd; + copy_rest_unit_lowbd(width, height, src, src_stride, dst, dst_stride); +} + +#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d)) + +// With striped loop restoration, the filtering for each 64-pixel stripe gets +// most of its input from the output of CDEF (stored in data8), but we need to +// fill out a border of 3 pixels above/below the stripe according to the +// following rules: +// +// * At the top and bottom of the frame, we copy the outermost row of CDEF +// pixels three times. This extension is done by a call to av1_extend_frame() +// at the start of the loop restoration process, so the value of +// copy_above/copy_below doesn't strictly matter. +// +// * All other boundaries are stripe boundaries within the frame. In that case, +// we take 2 rows of deblocked pixels and extend them to 3 rows of context. +static void get_stripe_boundary_info(const RestorationTileLimits *limits, + int plane_w, int plane_h, int ss_y, + int *copy_above, int *copy_below) { + (void)plane_w; + + *copy_above = 1; + *copy_below = 1; + + const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; + + const int first_stripe_in_plane = (limits->v_start == 0); + const int this_stripe_height = + full_stripe_height - (first_stripe_in_plane ? 
runit_offset : 0); + const int last_stripe_in_plane = + (limits->v_start + this_stripe_height >= plane_h); + + if (first_stripe_in_plane) *copy_above = 0; + if (last_stripe_in_plane) *copy_below = 0; +} + +// Overwrite the border pixels around a processing stripe so that the conditions +// listed above get_stripe_boundary_info() are preserved. +// We save the pixels which get overwritten into a temporary buffer, so that +// they can be restored by restore_processing_stripe_boundary() after we've +// processed the stripe. +// +// limits gives the rectangular limits of the remaining stripes for the current +// restoration unit. rsb is the stored stripe boundaries (taken from either +// deblock or CDEF output as necessary). +static void setup_processing_stripe_boundary( + const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb, + int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride, + RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) { + // Offsets within the line buffers. The buffer logically starts at column + // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ) + // has column x0 in the buffer. + const int buf_stride = rsb->stripe_boundary_stride; + const int buf_x0_off = limits->h_start; + const int line_width = + (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; + const int line_size = line_width << use_highbd; + + const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; + + // Replace RESTORATION_BORDER pixels above the top of the stripe + // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above + // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by + // duplicating the topmost of the 2 lines (see the AOMMAX call when + // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1). + if (!opt) { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + for (int i = -RESTORATION_BORDER; i < 0; ++i) { + const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *buf = + rsb->stripe_boundary_above + (buf_off << use_highbd); + uint8_t *dst8 = data8_tl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], + REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), buf, line_size); + } + } + + // Replace RESTORATION_BORDER pixels below the bottom of the stripe. + // The second buffer row is repeated, so src_row gets the values 0, 1, 1 + // for i = 0, 1, 2. + if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + for (int i = 0; i < RESTORATION_BORDER; ++i) { + const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *src = + rsb->stripe_boundary_below + (buf_off << use_highbd); + + uint8_t *dst8 = data8_bl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), src, line_size); + } + } + } else { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + // Only save and overwrite i=-RESTORATION_BORDER line. 
+ uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, + data8_tl + (-RESTORATION_BORDER + 1) * data_stride), + line_size); + } + + if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + // Only save and overwrite i=2 line. + uint8_t *dst8 = data8_bl + 2 * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size); + } + } +} + +// Once a processing stripe is finished, this function sets the boundary +// pixels which were overwritten by setup_processing_stripe_boundary() +// back to their original values +static void restore_processing_stripe_boundary( + const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs, + int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above, + int copy_below, int opt) { + const int line_width = + (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; + const int line_size = line_width << use_highbd; + + const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; + + if (!opt) { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + for (int i = -RESTORATION_BORDER; i < 0; ++i) { + uint8_t *dst8 = data8_tl + i * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), + rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size); + } + } + + if (copy_below) { + const int stripe_bottom = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride; + + for (int i = 0; i < RESTORATION_BORDER; ++i) { + if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break; + + uint8_t *dst8 = data8_bl + i * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size); + } + } + } else { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + // Only restore i=-RESTORATION_BORDER line. + uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size); + } + + if (copy_below) { + const int stripe_bottom = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride; + + // Only restore i=2 line. 
+ if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) { + uint8_t *dst8 = data8_bl + 2 * data_stride; + memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size); + } + } + } +} + +static void wiener_filter_stripe(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info) { + (void)tmpbuf; + (void)bit_depth; + (void)error_info; + assert(bit_depth == 8); + const WienerConvolveParams conv_params = get_conv_params_wiener(8); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); + const uint8_t *src_p = src + j; + uint8_t *dst_p = dst + j; + av1_wiener_convolve_add_src( + src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16, + rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params); + } +} + +/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1) + over the input. The window is of size (2r + 1)x(2r + 1), and we + specialize to r = 1, 2, 3. A default function is used for r > 3. + + Each loop follows the same format: We keep a window's worth of input + in individual variables and select data out of that as appropriate. +*/ +static void boxsum1(int32_t *src, int width, int height, int src_stride, + int sqr, int32_t *dst, int dst_stride) { + int i, j, a, b, c; + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + // Vertical sum over 3-pixel regions, from src into dst. + if (!sqr) { + for (j = 0; j < width; ++j) { + a = src[j]; + b = src[src_stride + j]; + c = src[2 * src_stride + j]; + + dst[j] = a + b; + for (i = 1; i < height - 2; ++i) { + // Loop invariant: At the start of each iteration, + // a = src[(i - 1) * src_stride + j] + // b = src[(i ) * src_stride + j] + // c = src[(i + 1) * src_stride + j] + dst[i * dst_stride + j] = a + b + c; + a = b; + b = c; + c = src[(i + 2) * src_stride + j]; + } + dst[i * dst_stride + j] = a + b + c; + dst[(i + 1) * dst_stride + j] = b + c; + } + } else { + for (j = 0; j < width; ++j) { + a = src[j] * src[j]; + b = src[src_stride + j] * src[src_stride + j]; + c = src[2 * src_stride + j] * src[2 * src_stride + j]; + + dst[j] = a + b; + for (i = 1; i < height - 2; ++i) { + dst[i * dst_stride + j] = a + b + c; + a = b; + b = c; + c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j]; + } + dst[i * dst_stride + j] = a + b + c; + dst[(i + 1) * dst_stride + j] = b + c; + } + } + + // Horizontal sum over 3-pixel regions of dst + for (i = 0; i < height; ++i) { + a = dst[i * dst_stride]; + b = dst[i * dst_stride + 1]; + c = dst[i * dst_stride + 2]; + + dst[i * dst_stride] = a + b; + for (j = 1; j < width - 2; ++j) { + // Loop invariant: At the start of each iteration, + // a = src[i * src_stride + (j - 1)] + // b = src[i * src_stride + (j )] + // c = src[i * src_stride + (j + 1)] + dst[i * dst_stride + j] = a + b + c; + a = b; + b = c; + c = dst[i * dst_stride + (j + 2)]; + } + dst[i * dst_stride + j] = a + b + c; + dst[i * dst_stride + (j + 1)] = b + c; + } +} + +static void boxsum2(int32_t *src, int width, int height, int src_stride, + int sqr, int32_t *dst, int dst_stride) { + int i, j, a, b, c, d, e; + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + // Vertical sum over 5-pixel regions, from src into dst. 
+ if (!sqr) { + for (j = 0; j < width; ++j) { + a = src[j]; + b = src[src_stride + j]; + c = src[2 * src_stride + j]; + d = src[3 * src_stride + j]; + e = src[4 * src_stride + j]; + + dst[j] = a + b + c; + dst[dst_stride + j] = a + b + c + d; + for (i = 2; i < height - 3; ++i) { + // Loop invariant: At the start of each iteration, + // a = src[(i - 2) * src_stride + j] + // b = src[(i - 1) * src_stride + j] + // c = src[(i ) * src_stride + j] + // d = src[(i + 1) * src_stride + j] + // e = src[(i + 2) * src_stride + j] + dst[i * dst_stride + j] = a + b + c + d + e; + a = b; + b = c; + c = d; + d = e; + e = src[(i + 3) * src_stride + j]; + } + dst[i * dst_stride + j] = a + b + c + d + e; + dst[(i + 1) * dst_stride + j] = b + c + d + e; + dst[(i + 2) * dst_stride + j] = c + d + e; + } + } else { + for (j = 0; j < width; ++j) { + a = src[j] * src[j]; + b = src[src_stride + j] * src[src_stride + j]; + c = src[2 * src_stride + j] * src[2 * src_stride + j]; + d = src[3 * src_stride + j] * src[3 * src_stride + j]; + e = src[4 * src_stride + j] * src[4 * src_stride + j]; + + dst[j] = a + b + c; + dst[dst_stride + j] = a + b + c + d; + for (i = 2; i < height - 3; ++i) { + dst[i * dst_stride + j] = a + b + c + d + e; + a = b; + b = c; + c = d; + d = e; + e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j]; + } + dst[i * dst_stride + j] = a + b + c + d + e; + dst[(i + 1) * dst_stride + j] = b + c + d + e; + dst[(i + 2) * dst_stride + j] = c + d + e; + } + } + + // Horizontal sum over 5-pixel regions of dst + for (i = 0; i < height; ++i) { + a = dst[i * dst_stride]; + b = dst[i * dst_stride + 1]; + c = dst[i * dst_stride + 2]; + d = dst[i * dst_stride + 3]; + e = dst[i * dst_stride + 4]; + + dst[i * dst_stride] = a + b + c; + dst[i * dst_stride + 1] = a + b + c + d; + for (j = 2; j < width - 3; ++j) { + // Loop invariant: At the start of each iteration, + // a = src[i * src_stride + (j - 2)] + // b = src[i * src_stride + (j - 1)] + // c = src[i * src_stride + (j )] + // d = src[i * src_stride + (j + 1)] + // e = src[i * src_stride + (j + 2)] + dst[i * dst_stride + j] = a + b + c + d + e; + a = b; + b = c; + c = d; + d = e; + e = dst[i * dst_stride + (j + 3)]; + } + dst[i * dst_stride + j] = a + b + c + d + e; + dst[i * dst_stride + (j + 1)] = b + c + d + e; + dst[i * dst_stride + (j + 2)] = c + d + e; + } +} + +static void boxsum(int32_t *src, int width, int height, int src_stride, int r, + int sqr, int32_t *dst, int dst_stride) { + if (r == 1) + boxsum1(src, width, height, src_stride, sqr, dst, dst_stride); + else if (r == 2) + boxsum2(src, width, height, src_stride, sqr, dst, dst_stride); + else + assert(0 && "Invalid value of r in self-guided filter"); +} + +void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) { + if (params->r[0] == 0) { + xq[0] = 0; + xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1]; + } else if (params->r[1] == 0) { + xq[0] = xqd[0]; + xq[1] = 0; + } else { + xq[0] = xqd[0]; + xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1]; + } +} + +const int32_t av1_x_by_xplus1[256] = { + // Special case: Map 0 -> 1 (corresponding to a value of 1/256) + // instead of 0. 
See comments in selfguided_restoration_internal() for why + 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, + 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, + 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, + 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 256, +}; + +const int32_t av1_one_by_x[MAX_NELEM] = { + 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, + 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, +}; + +static void calculate_intermediate_result(int32_t *dgd, int width, int height, + int dgd_stride, int bit_depth, + int sgr_params_idx, int radius_idx, + int pass, int32_t *A, int32_t *B) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. + int buf_stride = ((width_ext + 3) & ~3) + 16; + const int step = pass == 0 ? 1 : 2; + int i, j; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, + // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. + for (i = -1; i < height + 1; i += step) { + for (j = -1; j < width + 1; ++j) { + const int k = i * buf_stride + j; + const int n = (2 * r + 1) * (2 * r + 1); + + // a < 2^16 * n < 2^22 regardless of bit depth + uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); + // b < 2^8 * n < 2^14 regardless of bit depth + uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8); + + // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, + // and p itself satisfies p < 2^14 * n^2 < 2^26. 
+ // This bound on p is due to: + // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances + // + // Note: Sometimes, in high bit depth, we can end up with a*n < b*b. + // This is an artefact of rounding, and can only happen if all pixels + // are (almost) identical, so in this case we saturate to p=0. + uint32_t p = (a * n < b * b) ? 0 : a * n - b * b; + + const uint32_t s = params->s[radius_idx]; + + // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32 + // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12 + // (this holds even after accounting for the rounding in s) + const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS); + + // Note: We have to be quite careful about the value of A[k]. + // This is used as a blend factor between individual pixel values and the + // local mean. So it logically has a range of [0, 256], including both + // endpoints. + // + // This is a pain for hardware, as we'd like something which can be stored + // in exactly 8 bits. + // Further, in the calculation of B[k] below, if z == 0 and r == 2, + // then A[k] "should be" 0. But then we can end up setting B[k] to a value + // slightly above 2^(8 + bit depth), due to rounding in the value of + // av1_one_by_x[25-1]. + // + // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0. + // This fixes the above issues (256 - A[k] fits in a uint8, and we can't + // overflow), without significantly affecting the final result: z == 0 + // implies that the image is essentially "flat", so the local mean and + // individual pixel values are very similar. + // + // Note that saturating on the other side, ie. requring A[k] <= 255, + // would be a bad idea, as that corresponds to the case where the image + // is very variable, when we want to preserve the local pixel value as + // much as possible. + A[k] = av1_x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256] + + // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n, + // av1_one_by_x[n - 1] = round(2^12 / n) + // => the product here is < 2^(20 + bit_depth) <= 2^32, + // and B[k] is set to a value < 2^(8 + bit depth) + // This holds even with the rounding in av1_one_by_x and in the overall + // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8. + B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) * + (uint32_t)B[k] * + (uint32_t)av1_one_by_x[n - 1], + SGRPROJ_RECIP_BITS); + } + } +} + +static void selfguided_restoration_fast_internal( + int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. 
+ int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 1, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Use the A[] and B[] arrays to calculate the filtered image + (void)r; + assert(r == 2); + for (i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 5; + const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } else { // odd row + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 4; + const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5; + const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } + } +} + +static void selfguided_restoration_internal(int32_t *dgd, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, + int radius_idx) { + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. 
+ int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 0, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Use the A[] and B[] arrays to calculate the filtered image + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = + (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * + 4 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 3; + const int32_t b = + (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * + 4 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 3; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } +} + +int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; + const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; + int32_t *dgd32 = + dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + + if (highbd) { + const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8); + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j]; + } + } + } else { + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j]; + } + } + } + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + + if (params->r[0] > 0) + selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride, + flt0, flt_stride, bit_depth, + sgr_params_idx, 0); + if (params->r[1] > 0) + selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1, + flt_stride, bit_depth, sgr_params_idx, 1); + return 0; +} + +int av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + + const int ret = av1_selfguided_restoration_c( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + if (ret != 0) return ret; + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int k = i * width + j; + uint8_t *dst8ij = dst8 + i * dst_stride + j; + const uint8_t *dat8ij = dat8 + i * stride + j; + + const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij; + const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS; + int32_t v = u << SGRPROJ_PRJ_BITS; + // If params->r == 0 then we skipped the filtering in + // av1_selfguided_restoration_c, i.e. flt[k] == u + if (params->r[0] > 0) v += xq[0] * (flt0[k] - u); + if (params->r[1] > 0) v += xq[1] * (flt1[k] - u); + const int16_t w = + (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + const uint16_t out = clip_pixel_highbd(w, bit_depth); + if (highbd) + *CONVERT_TO_SHORTPTR(dst8ij) = out; + else + *dst8ij = (uint8_t)out; + } + } + return 0; +} + +static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info) { + (void)bit_depth; + assert(bit_depth == 8); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, stripe_width - j); + if (av1_apply_selfguided_restoration( + src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, + 0) != 0) { + aom_internal_error( + error_info, AOM_CODEC_MEM_ERROR, + "Error allocating buffer in av1_apply_selfguided_restoration"); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void wiener_filter_stripe_highbd( + const RestorationUnitInfo *rui, int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src8, int src_stride, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info) { + (void)tmpbuf; + (void)error_info; + const WienerConvolveParams conv_params = get_conv_params_wiener(bit_depth); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); + const uint8_t *src8_p = src8 + j; + uint8_t *dst8_p = dst8 + j; + av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride, + rui->wiener_info.hfilter, 16, + rui->wiener_info.vfilter, 16, w, + stripe_height, &conv_params, bit_depth); + } +} + +static void sgrproj_filter_stripe_highbd( + const RestorationUnitInfo *rui, int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src8, int src_stride, 
uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info) { + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, stripe_width - j); + if (av1_apply_selfguided_restoration( + src8 + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, + 1) != 0) { + aom_internal_error( + error_info, AOM_CODEC_MEM_ERROR, + "Error allocating buffer in av1_apply_selfguided_restoration"); + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth, + struct aom_internal_error_info *error_info); + +#if CONFIG_AV1_HIGHBITDEPTH +#define NUM_STRIPE_FILTERS 4 +static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { + wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd, + sgrproj_filter_stripe_highbd +}; +#else +#define NUM_STRIPE_FILTERS 2 +static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = { + wiener_filter_stripe, sgrproj_filter_stripe +}; +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Filter one restoration unit +void av1_loop_restoration_filter_unit( + const RestorationTileLimits *limits, const RestorationUnitInfo *rui, + const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs, + int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth, + uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, + int optimized_lr, struct aom_internal_error_info *error_info) { + RestorationType unit_rtype = rui->restoration_type; + + int unit_h = limits->v_end - limits->v_start; + int unit_w = limits->h_end - limits->h_start; + uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start; + uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start; + + if (unit_rtype == RESTORE_NONE) { + copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, + highbd); + return; + } + + const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ); + assert(filter_idx < NUM_STRIPE_FILTERS); + const stripe_filter_fun stripe_filter = stripe_filters[filter_idx]; + + const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; + + // Filter the whole image one stripe at a time + RestorationTileLimits remaining_stripes = *limits; + int i = 0; + while (i < unit_h) { + int copy_above, copy_below; + remaining_stripes.v_start = limits->v_start + i; + + get_stripe_boundary_info(&remaining_stripes, plane_w, plane_h, ss_y, + &copy_above, &copy_below); + + const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; + + // Work out where this stripe's boundaries are within + // rsb->stripe_boundary_{above,below} + const int frame_stripe = + (remaining_stripes.v_start + runit_offset) / full_stripe_height; + const int rsb_row = RESTORATION_CTX_VERT * frame_stripe; + + // Calculate this stripe's height, based on two rules: + // * The topmost stripe in the frame is 8 luma pixels shorter than usual. + // * We can't extend past the end of the current restoration unit + const int nominal_stripe_height = + full_stripe_height - ((frame_stripe == 0) ?
runit_offset : 0); + const int h = AOMMIN(nominal_stripe_height, + remaining_stripes.v_end - remaining_stripes.v_start); + + setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd, + h, data8, stride, rlbs, copy_above, + copy_below, optimized_lr); + + stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride, + dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth, + error_info); + + restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h, + data8, stride, copy_above, copy_below, + optimized_lr); + + i += h; + } +} + +static void filter_frame_on_unit(const RestorationTileLimits *limits, + int rest_unit_idx, void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info) { + FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; + const RestorationInfo *rsi = ctxt->rsi; + + av1_loop_restoration_filter_unit( + limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, + ctxt->plane_w, ctxt->plane_h, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, + ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8, + ctxt->dst_stride, tmpbuf, rsi->optimized_lr, error_info); +} + +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + int num_planes) { + const SequenceHeader *const seq_params = cm->seq_params; + const int bit_depth = seq_params->bit_depth; + const int highbd = seq_params->use_highbitdepth; + lr_ctxt->dst = &cm->rst_frame; + + const int frame_width = frame->crop_widths[0]; + const int frame_height = frame->crop_heights[0]; + if (aom_realloc_frame_buffer( + lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, + seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, + cm->features.byte_alignment, NULL, NULL, NULL, 0, 0) != AOM_CODEC_OK) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate restoration dst buffer"); + + lr_ctxt->on_rest_unit = filter_frame_on_unit; + lr_ctxt->frame = frame; + for (int plane = 0; plane < num_planes; ++plane) { + RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationType rtype = rsi->frame_restoration_type; + rsi->optimized_lr = optimized_lr; + lr_ctxt->ctxt[plane].rsi = rsi; + + if (rtype == RESTORE_NONE) { + continue; + } + + const int is_uv = plane > 0; + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + assert(plane_w == frame->crop_widths[is_uv]); + assert(plane_h == frame->crop_heights[is_uv]); + + av1_extend_frame(frame->buffers[plane], plane_w, plane_h, + frame->strides[is_uv], RESTORATION_BORDER, + RESTORATION_BORDER, highbd); + + FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane]; + lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x; + lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y; + lr_plane_ctxt->plane_w = plane_w; + lr_plane_ctxt->plane_h = plane_h; + lr_plane_ctxt->highbd = highbd; + lr_plane_ctxt->bit_depth = bit_depth; + lr_plane_ctxt->data8 = frame->buffers[plane]; + lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane]; + lr_plane_ctxt->data_stride = frame->strides[is_uv]; + lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv]; + } +} + +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + AV1_COMMON *cm, int num_planes) { + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + 
aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; + assert(num_planes <= 3); + for (int plane = 0; plane < num_planes; ++plane) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + FilterFrameCtxt *lr_plane_ctxt = &loop_rest_ctxt->ctxt[plane]; + copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, 0, + lr_plane_ctxt->plane_w, 0, lr_plane_ctxt->plane_h); + } +} + +static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm, + int num_planes) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + for (int plane = 0; plane < num_planes; ++plane) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) { + continue; + } + + av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit, + &ctxt[plane], cm->rst_tmpbuf, cm->rlbs); + } +} + +void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + void *lr_ctxt) { + assert(!cm->features.all_lossless); + const int num_planes = av1_num_planes(cm); + + AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; + + av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, + optimized_lr, num_planes); + + foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes); + + av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes); +} + +void av1_foreach_rest_unit_in_row( + RestorationTileLimits *limits, int plane_w, + rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, + int hnum_rest_units, int vnum_rest_units, int plane, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read, + sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync, + struct aom_internal_error_info *error_info) { + const int ext_size = unit_size * 3 / 2; + int x0 = 0, j = 0; + while (x0 < plane_w) { + int remaining_w = plane_w - x0; + int w = (remaining_w < ext_size) ? remaining_w : unit_size; + + limits->h_start = x0; + limits->h_end = x0 + w; + assert(limits->h_end <= plane_w); + + const int unit_idx = row_number * hnum_rest_units + j; + + // No sync for even numbered rows + // For odd numbered rows, Loop Restoration of current block requires the LR + // of top-right and bottom-right blocks to be completed + + // top-right sync + on_sync_read(lr_sync, row_number, j, plane); + if ((row_number + 1) < vnum_rest_units) + // bottom-right sync + on_sync_read(lr_sync, row_number + 2, j, plane); + +#if CONFIG_MULTITHREAD + if (lr_sync && lr_sync->num_workers > 1) { + pthread_mutex_lock(lr_sync->job_mutex); + const bool lr_mt_exit = lr_sync->lr_mt_exit; + pthread_mutex_unlock(lr_sync->job_mutex); + // Exit in case any worker has encountered an error. 
+ if (lr_mt_exit) return; + } +#endif + + on_rest_unit(limits, unit_idx, priv, tmpbuf, rlbs, error_info); + + on_sync_write(lr_sync, row_number, j, hnum_rest_units, plane); + + x0 += w; + ++j; + } +} + +void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) { + (void)lr_sync; + (void)r; + (void)c; + (void)plane; +} + +void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, + const int sb_cols, int plane) { + (void)lr_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +} + +void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, + rest_unit_visitor_t on_rest_unit, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + const int hnum_rest_units = rsi->horz_units; + const int vnum_rest_units = rsi->vert_units; + const int unit_size = rsi->restoration_unit_size; + + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int ext_size = unit_size * 3 / 2; + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + int y0 = 0, i = 0; + while (y0 < plane_h) { + int remaining_h = plane_h - y0; + int h = (remaining_h < ext_size) ? remaining_h : unit_size; + + RestorationTileLimits limits; + limits.v_start = y0; + limits.v_end = y0 + h; + assert(limits.v_end <= plane_h); + // Offset upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(0, limits.v_start - voffset); + if (limits.v_end < plane_h) limits.v_end -= voffset; + + av1_foreach_rest_unit_in_row(&limits, plane_w, on_rest_unit, i, unit_size, + hnum_rest_units, vnum_rest_units, plane, priv, + tmpbuf, rlbs, av1_lr_sync_read_dummy, + av1_lr_sync_write_dummy, NULL, cm->error); + + y0 += h; + ++i; + } +} + +int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rcol0, int *rcol1, int *rrow0, + int *rrow1) { + assert(rcol0 && rcol1 && rrow0 && rrow1); + + if (bsize != cm->seq_params->sb_size) return 0; + + assert(!cm->features.all_lossless); + + const int is_uv = plane > 0; + + // Compute the mi-unit corners of the superblock + const int mi_row0 = mi_row; + const int mi_col0 = mi_col; + const int mi_row1 = mi_row0 + mi_size_high[bsize]; + const int mi_col1 = mi_col0 + mi_size_wide[bsize]; + + const RestorationInfo *rsi = &cm->rst_info[plane]; + const int size = rsi->restoration_unit_size; + const int horz_units = rsi->horz_units; + const int vert_units = rsi->vert_units; + + // The size of an MI-unit on this plane of the image + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int mi_size_x = MI_SIZE >> ss_x; + const int mi_size_y = MI_SIZE >> ss_y; + + // Write m for the relative mi column or row, D for the superres denominator + // and N for the superres numerator. If u is the upscaled pixel offset then + // we can write the downscaled pixel offset in two ways as: + // + // MI_SIZE * m = N / D u + // + // from which we get u = D * MI_SIZE * m / N + const int mi_to_num_x = av1_superres_scaled(cm) + ? mi_size_x * cm->superres_scale_denominator + : mi_size_x; + const int mi_to_num_y = mi_size_y; + const int denom_x = av1_superres_scaled(cm) ? 
size * SCALE_NUMERATOR : size; + const int denom_y = size; + + const int rnd_x = denom_x - 1; + const int rnd_y = denom_y - 1; + + // rcol0/rrow0 should be the first column/row of restoration units that + // doesn't start left/below of mi_col/mi_row. For this calculation, we need + // to round up the division (if the sb starts at runit column 10.1, the first + // matching runit has column index 11) + *rcol0 = (mi_col0 * mi_to_num_x + rnd_x) / denom_x; + *rrow0 = (mi_row0 * mi_to_num_y + rnd_y) / denom_y; + + // rel_col1/rel_row1 is the equivalent calculation, but for the superblock + // below-right. If we're at the bottom or right of the frame, this restoration + // unit might not exist, in which case we'll clamp accordingly. + *rcol1 = AOMMIN((mi_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units); + *rrow1 = AOMMIN((mi_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units); + + return *rcol0 < *rcol1 && *rrow0 < *rrow1; +} + +// Extend to left and right +static void extend_lines(uint8_t *buf, int width, int height, int stride, + int extend, int use_highbitdepth) { + for (int i = 0; i < height; ++i) { + if (use_highbitdepth) { + uint16_t *buf16 = (uint16_t *)buf; + aom_memset16(buf16 - extend, buf16[0], extend); + aom_memset16(buf16 + width, buf16[width - 1], extend); + } else { + memset(buf - extend, buf[0], extend); + memset(buf + width, buf[width - 1], extend); + } + buf += stride; + } +} + +static void save_deblock_boundary_lines( + const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row, + int stripe, int use_highbd, int is_above, + RestorationStripeBoundaries *boundaries) { + const int is_uv = plane > 0; + const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); + const int src_stride = frame->strides[is_uv] << use_highbd; + const uint8_t *src_rows = src_buf + row * src_stride; + + uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above + : boundaries->stripe_boundary_below; + uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); + const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; + uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; + + // There is a rare case in which a processing stripe can end 1px above the + // crop border. In this case, we do want to use deblocked pixels from below + // the stripe (hence why we ended up in this function), but instead of + // fetching 2 "below" rows we need to fetch one and duplicate it. 
+ // This is equivalent to clamping the sample locations against the crop border + const int lines_to_save = + AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row); + assert(lines_to_save == 1 || lines_to_save == 2); + + int upscaled_width; + int line_bytes; + if (av1_superres_scaled(cm)) { + const int ss_x = is_uv && cm->seq_params->subsampling_x; + upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; + line_bytes = upscaled_width << use_highbd; + if (use_highbd) + av1_upscale_normative_rows( + cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv], + CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride, + plane, lines_to_save); + else + av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows, + boundaries->stripe_boundary_stride, plane, + lines_to_save); + } else { + upscaled_width = frame->crop_widths[is_uv]; + line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < lines_to_save; i++) { + memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride, + line_bytes); + } + } + // If we only saved one line, then copy it into the second line buffer + if (lines_to_save == 1) + memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes); + + extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, + RESTORATION_EXTRA_HORZ, use_highbd); +} + +static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, + const AV1_COMMON *cm, int plane, int row, + int stripe, int use_highbd, int is_above, + RestorationStripeBoundaries *boundaries) { + const int is_uv = plane > 0; + const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); + const int src_stride = frame->strides[is_uv] << use_highbd; + const uint8_t *src_rows = src_buf + row * src_stride; + + uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above + : boundaries->stripe_boundary_below; + uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); + const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; + uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; + const int src_width = frame->crop_widths[is_uv]; + + // At the point where this function is called, we've already applied + // superres. So we don't need to extend the lines here, we can just + // pull directly from the topmost row of the upscaled frame. + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int upscaled_width = av1_superres_scaled(cm) + ? 
(cm->superres_upscaled_width + ss_x) >> ss_x + : src_width; + const int line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < RESTORATION_CTX_VERT; i++) { + // Copy the line at 'src_rows' into both context lines + memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes); + } + extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, + RESTORATION_EXTRA_HORZ, use_highbd); +} + +static void save_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd, + int plane, AV1_COMMON *cm, int after_cdef) { + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y; + + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries; + + const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y); + + int stripe_idx; + for (stripe_idx = 0;; ++stripe_idx) { + const int rel_y0 = AOMMAX(0, stripe_idx * stripe_height - stripe_off); + const int y0 = rel_y0; + if (y0 >= plane_h) break; + + const int rel_y1 = (stripe_idx + 1) * stripe_height - stripe_off; + const int y1 = AOMMIN(rel_y1, plane_h); + + // Extend using CDEF pixels at the top and bottom of the frame, + // and deblocked pixels at internal stripe boundaries + const int use_deblock_above = (stripe_idx > 0); + const int use_deblock_below = (y1 < plane_height); + + if (!after_cdef) { + // Save deblocked context at internal stripe boundaries + if (use_deblock_above) { + save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT, + stripe_idx, use_highbd, 1, boundaries); + } + if (use_deblock_below) { + save_deblock_boundary_lines(frame, cm, plane, y1, stripe_idx, + use_highbd, 0, boundaries); + } + } else { + // Save CDEF context at frame boundaries + if (!use_deblock_above) { + save_cdef_boundary_lines(frame, cm, plane, y0, stripe_idx, use_highbd, + 1, boundaries); + } + if (!use_deblock_below) { + save_cdef_boundary_lines(frame, cm, plane, y1 - 1, stripe_idx, + use_highbd, 0, boundaries); + } + } + } +} + +// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan +// lines to be used as boundary in the loop restoration process. The +// lines are saved in rst_internal.stripe_boundary_lines +void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int after_cdef) { + const int num_planes = av1_num_planes(cm); + const int use_highbd = cm->seq_params->use_highbitdepth; + for (int p = 0; p < num_planes; ++p) { + save_boundary_lines(frame, use_highbd, p, cm, after_cdef); + } +} diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h new file mode 100644 index 0000000000..644e06980f --- /dev/null +++ b/third_party/aom/av1/common/restoration.h @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_RESTORATION_H_ +#define AOM_AV1_COMMON_RESTORATION_H_ + +#include "aom_ports/mem.h" +#include "config/aom_config.h" + +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! @file */ + +/*!\cond */ + +// Border for Loop restoration buffer +#define AOM_RESTORATION_FRAME_BORDER 32 +#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x)) +#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5)) + +#define RESTORATION_PROC_UNIT_SIZE 64 + +// Filter stripe grid offset upwards compared to the superblock grid +#define RESTORATION_UNIT_OFFSET 8 + +#define SGRPROJ_BORDER_VERT 3 // Vertical border used for Sgr +#define SGRPROJ_BORDER_HORZ 3 // Horizontal border used for Sgr + +#define WIENER_BORDER_VERT 2 // Vertical border used for Wiener +#define WIENER_HALFWIN 3 +#define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener + +// RESTORATION_BORDER_VERT determines line buffer requirement for LR. +// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT. +// Note the line buffer needed is twice the value of this macro. +#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT +#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT) +#else +#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT) +#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT + +#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ +#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ) +#else +#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ) +#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT + +// How many border pixels do we need for each processing unit? +#define RESTORATION_BORDER 3 + +// How many rows of deblocked pixels do we save above/below each processing +// stripe? +#define RESTORATION_CTX_VERT 2 + +// Additional pixels to the left and right in above/below buffers +// It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment +#define RESTORATION_EXTRA_HORZ 4 + +// Pad up to 20 more (may be much less is needed) +#define RESTORATION_PADDING 20 +#define RESTORATION_PROC_UNIT_PELS \ + ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \ + RESTORATION_PADDING) * \ + (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \ + RESTORATION_PADDING)) + +#define RESTORATION_UNITSIZE_MAX 256 +#define RESTORATION_UNITPELS_HORZ_MAX \ + (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16) +#define RESTORATION_UNITPELS_VERT_MAX \ + ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \ + RESTORATION_UNIT_OFFSET)) +#define RESTORATION_UNITPELS_MAX \ + (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX) + +// Two 32-bit buffers needed for the restored versions from two filters +// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored +// on the decoder side. 
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t)) + +#define SGRPROJ_EXTBUF_SIZE (0) +#define SGRPROJ_PARAMS_BITS 4 +#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS) + +// Precision bits for projection +#define SGRPROJ_PRJ_BITS 7 +// Restoration precision bits generated higher than source before projection +#define SGRPROJ_RST_BITS 4 +// Internal precision bits for core selfguided_restoration +#define SGRPROJ_SGR_BITS 8 +#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS) + +#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4) +#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1) +#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4) +#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1) + +#define SGRPROJ_PRJ_SUBEXP_K 4 + +#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS) + +#define MAX_RADIUS 2 // Only 1, 2, 3 allowed +#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1)) +#define SGRPROJ_MTABLE_BITS 20 +#define SGRPROJ_RECIP_BITS 12 + +#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1) +#define WIENER_WIN (2 * WIENER_HALFWIN + 1) +#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN)) +#define WIENER_TMPBUF_SIZE (0) +#define WIENER_EXTBUF_SIZE (0) + +// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for +// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN. +#define WIENER_WIN_CHROMA (WIENER_WIN - 2) +#define WIENER_WIN_REDUCED (WIENER_WIN - 2) +#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA)) +#define WIENER_STATS_DOWNSAMPLE_FACTOR 4 + +#define WIENER_FILT_PREC_BITS 7 +#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS) + +// Central values for the taps +#define WIENER_FILT_TAP0_MIDV (3) +#define WIENER_FILT_TAP1_MIDV (-7) +#define WIENER_FILT_TAP2_MIDV (15) +#define WIENER_FILT_TAP3_MIDV \ + (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \ + WIENER_FILT_TAP2_MIDV)) + +#define WIENER_FILT_TAP0_BITS 4 +#define WIENER_FILT_TAP1_BITS 5 +#define WIENER_FILT_TAP2_BITS 6 + +#define WIENER_FILT_BITS \ + ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2) + +#define WIENER_FILT_TAP0_MINV \ + (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2) +#define WIENER_FILT_TAP1_MINV \ + (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2) +#define WIENER_FILT_TAP2_MINV \ + (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2) + +#define WIENER_FILT_TAP0_MAXV \ + (WIENER_FILT_TAP0_MIDV - 1 + (1 << WIENER_FILT_TAP0_BITS) / 2) +#define WIENER_FILT_TAP1_MAXV \ + (WIENER_FILT_TAP1_MIDV - 1 + (1 << WIENER_FILT_TAP1_BITS) / 2) +#define WIENER_FILT_TAP2_MAXV \ + (WIENER_FILT_TAP2_MIDV - 1 + (1 << WIENER_FILT_TAP2_BITS) / 2) + +#define WIENER_FILT_TAP0_SUBEXP_K 1 +#define WIENER_FILT_TAP1_SUBEXP_K 2 +#define WIENER_FILT_TAP2_SUBEXP_K 3 + +// Max of SGRPROJ_TMPBUF_SIZE, DOMAINTXFMRF_TMPBUF_SIZE, WIENER_TMPBUF_SIZE +#define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE) + +// Max of SGRPROJ_EXTBUF_SIZE, WIENER_EXTBUF_SIZE +#define RESTORATION_EXTBUF_SIZE (WIENER_EXTBUF_SIZE) + +// Check the assumptions of the existing code +#if SUBPEL_TAPS != WIENER_WIN + 1 +#error "Wiener filter currently only works if SUBPEL_TAPS == WIENER_WIN + 1" +#endif +#if WIENER_FILT_PREC_BITS != 7 +#error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7" +#endif + +typedef struct { + int r[2]; // radii + int s[2]; // sgr parameters for r[0] and r[1], based on GenSgrprojVtable() +} sgr_params_type; 
+/*!\endcond */ + +/*!\brief Parameters related to Restoration Unit Info */ +typedef struct { + /*! + * restoration type + */ + RestorationType restoration_type; + + /*! + * Wiener filter parameters if restoration_type indicates Wiener + */ + WienerInfo wiener_info; + + /*! + * Sgrproj filter parameters if restoration_type indicates Sgrproj + */ + SgrprojInfo sgrproj_info; +} RestorationUnitInfo; + +/*!\cond */ + +// A restoration line buffer needs space for two lines plus a horizontal filter +// margin of RESTORATION_EXTRA_HORZ on each side. +#define RESTORATION_LINEBUFFER_WIDTH \ + (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ) + +typedef struct { + // Temporary buffers to save/restore 3 lines above/below the restoration + // stripe. + uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; + uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; +} RestorationLineBuffers; +/*!\endcond */ + +/*!\brief Parameters related to Restoration Stripe boundaries */ +typedef struct { + /*! + * stripe boundary above + */ + uint8_t *stripe_boundary_above; + + /*! + * stripe boundary below + */ + uint8_t *stripe_boundary_below; + + /*! + * strides for stripe boundaries above and below + */ + int stripe_boundary_stride; + + /*! + * size of stripe boundaries above and below + */ + int stripe_boundary_size; +} RestorationStripeBoundaries; + +/*!\brief Parameters related to Restoration Info */ +typedef struct { + /*! + * Restoration type for frame + */ + RestorationType frame_restoration_type; + + /*! + * Restoration unit size + */ + int restoration_unit_size; + + /** + * \name Fields allocated and initialised by av1_alloc_restoration_struct. + */ + /**@{*/ + /*! + * Total number of restoration units in this plane + */ + int num_rest_units; + + /*! + * Number of vertical restoration units in this plane + */ + int vert_units; + + /*! + * Number of horizontal restoration units in this plane + */ + int horz_units; + /**@}*/ + + /*! + * Parameters for each restoration unit in this plane + */ + RestorationUnitInfo *unit_info; + + /*! + * Restoration Stripe boundary info + */ + RestorationStripeBoundaries boundaries; + + /*! + * Whether optimized lr can be used for speed. + * That includes cases of no cdef and no superres, or if fast trial runs + * are used on the encoder side. 
+ */ + int optimized_lr; +} RestorationInfo; + +/*!\cond */ + +static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) { + sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2; + sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2; +} + +static INLINE void set_default_wiener(WienerInfo *wiener_info) { + wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV; + wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV; + wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV; + wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] = + -2 * + (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV); + wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV; + wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV; + wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV; +} + +typedef struct { + int h_start, h_end, v_start, v_end; +} RestorationTileLimits; + +typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info); + +typedef struct FilterFrameCtxt { + const RestorationInfo *rsi; + int ss_x, ss_y; + int plane_w, plane_h; + int highbd, bit_depth; + uint8_t *data8, *dst8; + int data_stride, dst_stride; +} FilterFrameCtxt; + +typedef struct AV1LrStruct { + rest_unit_visitor_t on_rest_unit; + FilterFrameCtxt ctxt[MAX_MB_PLANE]; + YV12_BUFFER_CONFIG *frame; + YV12_BUFFER_CONFIG *dst; +} AV1LrStruct; + +extern const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS]; +extern int sgrproj_mtable[SGRPROJ_PARAMS][2]; +extern const int32_t av1_x_by_xplus1[256]; +extern const int32_t av1_one_by_x[MAX_NELEM]; + +void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi, + int is_uv); +void av1_free_restoration_struct(RestorationInfo *rst_info); + +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd); +void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params); + +/*!\endcond */ + +/*!\brief Function for applying loop restoration filter to a single unit. + * + * \ingroup in_loop_restoration + * This function applies the loop restoration filter to a single + * loop restoration unit. + * + * \param[in] limits Limits of the unit + * \param[in] rui The parameters to use for this unit and its + * coefficients + * \param[in] rsb Deblocked pixels to use for stripe boundaries + * \param[in] rlbs Space to use as a scratch buffer + * \param[in] ss_x Horizontal subsampling for plane + * \param[in] ss_y Vertical subsampling for plane + * \param[in] plane_w Width of the current plane + * \param[in] plane_h Height of the current plane + * \param[in] highbd Whether high bitdepth pipeline is used + * \param[in] bit_depth Bit-depth of the video + * \param[in] data8 Frame data (pointing at the top-left corner of + * the frame, not the restoration unit). + * \param[in] stride Stride of \c data8 + * \param[out] dst8 Buffer where the results will be written. Like + * \c data8, \c dst8 should point at the top-left + * corner of the frame + * \param[in] dst_stride Stride of \c dst8 + * \param[in] tmpbuf Scratch buffer used by the sgrproj filter + * which should be at least SGRPROJ_TMPBUF_SIZE + * big. 
+ * \param[in] optimized_lr Whether to use fast optimized Loop Restoration + * \param[in,out] error_info Error info for reporting errors + * + * \remark Nothing is returned. Instead, the filtered unit is output in + * \c dst8 at the proper restoration unit offset. + */ +void av1_loop_restoration_filter_unit( + const RestorationTileLimits *limits, const RestorationUnitInfo *rui, + const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs, + int plane_w, int plane_h, int ss_x, int ss_y, int highbd, int bit_depth, + uint8_t *data8, int stride, uint8_t *dst8, int dst_stride, int32_t *tmpbuf, + int optimized_lr, struct aom_internal_error_info *error_info); + +/*!\brief Function for applying loop restoration filter to a frame + * + * \ingroup in_loop_restoration + * This function applies the loop restoration filter to a frame. + * + * \param[in,out] frame Compressed frame buffer + * \param[in,out] cm Pointer to top level common structure + * \param[in] optimized_lr Whether to use fast optimized Loop Restoration + * \param[in] lr_ctxt Loop restoration context + * + * \remark Nothing is returned. Instead, the filtered frame is output in + * \c frame. + */ +void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, int optimized_lr, + void *lr_ctxt); +/*!\cond */ + +void av1_loop_restoration_precal(void); + +struct AV1LrSyncData; + +typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane); + +typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c, + const int sb_cols, int plane); + +// Call on_rest_unit for each loop restoration unit in the plane. +void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, + rest_unit_visitor_t on_rest_unit, + void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs); + +// Return 1 iff the block at mi_row, mi_col with size bsize is a +// top-level superblock containing the top-left corner of at least one +// loop restoration unit. +// +// If the block is a top-level superblock, the function writes to +// *rcol0, *rcol1, *rrow0, *rrow1. This means that the parameters for all +// restoration units in the rectangle [*rcol0, *rcol1) x [*rrow0, *rrow1) +// are signaled in this superblock. 
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rcol0, int *rcol1, int *rrow0, + int *rrow1); + +void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int after_cdef); +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int optimized_lr, int num_planes); +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + struct AV1Common *cm, int num_planes); +void av1_foreach_rest_unit_in_row( + RestorationTileLimits *limits, int plane_w, + rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, + int hnum_rest_units, int vnum_rest_units, int plane, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, sync_read_fn_t on_sync_read, + sync_write_fn_t on_sync_write, struct AV1LrSyncData *const lr_sync, + struct aom_internal_error_info *error_info); + +void av1_get_upsampled_plane_size(const struct AV1Common *cm, int is_uv, + int *plane_w, int *plane_h); +int av1_lr_count_units(int unit_size, int plane_size); +void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane); +void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, + const int sb_cols, int plane); + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_RESTORATION_H_ diff --git a/third_party/aom/av1/common/scale.c b/third_party/aom/av1/common/scale.c new file mode 100644 index 0000000000..d7c6a24378 --- /dev/null +++ b/third_party/aom/av1/common/scale.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/filter.h" +#include "av1/common/scale.h" +#include "aom_dsp/aom_filter.h" + +static int get_fixed_point_scale_factor(int other_size, int this_size) { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. + return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size; +} + +// Given the fixed point scale, calculate coarse point scale. +static int fixed_point_scale_to_coarse_point_scale(int scale_fp) { + return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS); +} + +// Note: x and y are integer precision, mvq4 is q4 precision. 
+MV32 av1_scale_mv(const MV *mvq4, int x, int y, + const struct scale_factors *sf) { + const int x_off_q4 = av1_scaled_x(x << SUBPEL_BITS, sf); + const int y_off_q4 = av1_scaled_y(y << SUBPEL_BITS, sf); + const MV32 res = { + av1_scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4, + av1_scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4 + }; + return res; +} + +void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h) { + if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { + sf->x_scale_fp = REF_INVALID_SCALE; + sf->y_scale_fp = REF_INVALID_SCALE; + return; + } + + sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + + sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp); + sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp); +} diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h new file mode 100644 index 0000000000..d8481bfc2c --- /dev/null +++ b/third_party/aom/av1/common/scale.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_SCALE_H_ +#define AOM_AV1_COMMON_SCALE_H_ + +#include "av1/common/convolve.h" +#include "av1/common/mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SCALE_NUMERATOR 8 + +#define REF_SCALE_SHIFT 14 +#define REF_NO_SCALE (1 << REF_SCALE_SHIFT) +#define REF_INVALID_SCALE -1 + +struct scale_factors { + int x_scale_fp; // horizontal fixed point scale factor + int y_scale_fp; // vertical fixed point scale factor + int x_step_q4; + int y_step_q4; +}; + +// Note: Expect val to be in q4 precision +static INLINE int av1_scaled_x(int val, const struct scale_factors *sf) { + const int off = + (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); + const int64_t tval = (int64_t)val * sf->x_scale_fp + off; + return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, + REF_SCALE_SHIFT - SCALE_EXTRA_BITS); +} + +// Note: Expect val to be in q4 precision +static INLINE int av1_scaled_y(int val, const struct scale_factors *sf) { + const int off = + (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); + const int64_t tval = (int64_t)val * sf->y_scale_fp + off; + return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, + REF_SCALE_SHIFT - SCALE_EXTRA_BITS); +} + +// Note: Expect val to be in q4 precision +static INLINE int av1_unscaled_value(int val, const struct scale_factors *sf) { + (void)sf; + return val * (1 << SCALE_EXTRA_BITS); +} + +MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); + +void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h); + +static INLINE int av1_is_valid_scale(const struct scale_factors *sf) { + assert(sf != NULL); + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; +} + +static INLINE int av1_is_scaled(const struct scale_factors *sf) { + assert(sf != NULL); + 
return av1_is_valid_scale(sf) && + (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); +} + +// See AV1 spec, Section 6.8.6. Frame size with refs semantics. +static INLINE int valid_ref_frame_size(int ref_width, int ref_height, + int this_width, int this_height) { + return 2 * this_width >= ref_width && 2 * this_height >= ref_height && + this_width <= 16 * ref_width && this_height <= 16 * ref_height; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_SCALE_H_ diff --git a/third_party/aom/av1/common/scan.c b/third_party/aom/av1/common/scan.c new file mode 100644 index 0000000000..0943579db1 --- /dev/null +++ b/third_party/aom/av1/common/scan.c @@ -0,0 +1,2038 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/common_data.h" +#include "av1/common/scan.h" + +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = { + 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = { + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = { + 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19, + 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = { + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35, + 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39, + 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, + 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = { + 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, + 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, + 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, + 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63, +}; + 
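Stepping back to the scale.c/scale.h hunks above, before the remaining scan tables: the reference-frame scale factor is a rounded fixed-point ratio with REF_SCALE_SHIFT == 14 fractional bits, and valid_ref_frame_size() encodes the spec limit that a reference may be at most 2x larger and at most 16x smaller than the current frame in each dimension. The standalone sketch below (helper names invented here, not the library API) works those formulas through with concrete frame sizes:

#include <assert.h>
#include <stdio.h>

/* Illustrative re-derivation of the reference scale factors from scale.c
 * above (not part of the patch). An unscaled reference maps to
 * REF_NO_SCALE = 1 << REF_SCALE_SHIFT. */
#define REF_SCALE_SHIFT 14
#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)

static int fixed_point_scale(int other_size, int this_size) {
  /* Same rounded division as get_fixed_point_scale_factor(). */
  return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;
}

/* Mirrors valid_ref_frame_size(): the reference may be at most 2x larger
 * and at most 16x smaller than the current frame in each dimension. */
static int ref_size_ok(int ref_w, int ref_h, int this_w, int this_h) {
  return 2 * this_w >= ref_w && 2 * this_h >= ref_h &&
         this_w <= 16 * ref_w && this_h <= 16 * ref_h;
}

int main(void) {
  /* 1920x1080 reference predicting a 960x540 frame: exactly 2x. */
  assert(ref_size_ok(1920, 1080, 960, 540));
  assert(fixed_point_scale(1920, 960) == 2 * REF_NO_SCALE);

  /* Same-size reference: scale factor is exactly REF_NO_SCALE. */
  assert(fixed_point_scale(1280, 1280) == REF_NO_SCALE);

  /* A reference more than 2x the current width is rejected. */
  assert(!ref_size_ok(2000, 1080, 960, 540));

  printf("2x scale_fp = %d, no-scale = %d\n",
         fixed_point_scale(1920, 960), REF_NO_SCALE);
  return 0;
}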
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = { + 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, + 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, + 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, + 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x4[64]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x16[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, + 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10, + 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, + 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109, + 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142, + 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175, + 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208, + 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, + 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, + 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58, + 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91, + 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, + 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, + 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, + 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125, + 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, + 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143, + 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, + 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209, + 
216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, + 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227, + 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, + 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, + 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, + 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, + 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, + 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, + 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, + 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, + 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239, + 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, + 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, + 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, + 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, + 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, + 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, + 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, + 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x8[256]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, + 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, + 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, + 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, + 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, + 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, + 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, + 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, + 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, + 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, + 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, + 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x32[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 
140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { + 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, + 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, + 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, + 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, + 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, + 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, + 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71, + 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73, + 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75, + 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, + 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = 
{ + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, + 117, 124, 111, 118, 125, 119, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, + 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, + 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, + 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, + 12, 416, 385, 354, 323, 
292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, + 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, + 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, + 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, + 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, + 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, + 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238, + 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270, + 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, + 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, + 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, + 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, + 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, + 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494, + 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, + 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, + 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, + 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408, + 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285, + 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, + 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, + 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = { + 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, + 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, + 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, + 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, + 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, + 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, + 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, + 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, + 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, + 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, + 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, + 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, + 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, + 250, 265, 280, 295, 310, 325, 340, 355, 
370, 385, 400, 191, 206, 221, 236, + 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, + 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, + 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, + 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464, + 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, + 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466, + 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, + 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483, + 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, + 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396, + 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, + 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, + 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, + 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, + 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, + 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, + 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, + 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, + 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, + 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, + 9, 41, 73, 105, 
137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, + 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, + 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, + 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, + 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, + 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, + 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, + 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, + 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, + 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, + 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, + 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, + 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, + 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, + 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, + 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, + 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, + 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, + 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, + 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, + 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, + 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, + 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, + 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, + 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, + 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, + 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, + 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, + 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, + 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, + 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, + 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, + 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, + 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, + 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, + 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, + 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 
378, 394, + 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, + 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, + 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, + 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, + 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, + 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, + 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, + 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, + 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, + 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, + 495, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { + 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, + 80, 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, + 82, 67, 52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, + 144, 129, 114, 99, 84, 69, 54, 39, 24, 9, 10, 25, 40, 55, 70, + 85, 100, 115, 130, 145, 160, 176, 161, 146, 131, 116, 101, 86, 71, 56, + 41, 26, 11, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, + 192, 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, 28, 13, + 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, + 240, 225, 210, 195, 180, 165, 150, 135, 120, 105, 90, 75, 60, 45, 30, + 15, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, + 241, 242, 227, 212, 197, 182, 167, 152, 137, 122, 107, 92, 77, 62, 47, + 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 244, 229, + 214, 199, 184, 169, 154, 139, 124, 109, 94, 79, 95, 110, 125, 140, 155, + 170, 185, 200, 215, 230, 245, 246, 231, 216, 201, 186, 171, 156, 141, 126, + 111, 127, 142, 157, 172, 187, 202, 217, 232, 247, 248, 233, 218, 203, 188, + 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190, + 175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 
1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, + 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, + 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, + 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, + 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, + 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, + 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, + 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, + 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, + 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, + 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, + 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, + 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, + 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, + 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, + 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, + 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, + 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, + 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, + 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, + 468, 469, 470, 471, 472, 473, 474, 475, 476, 
477, 478, 479, 480, + 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, + 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, + 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, + 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, + 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, + 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, + 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, + 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, + 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, + 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, + 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, + 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, + 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, + 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, + 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, + 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, + 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, + 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, + 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, + 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, + 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, + 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, + 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, + 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, + 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, + 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, + 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, + 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, + 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, + 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, + 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, + 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, + 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, + 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, + 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, + 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, + 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, + 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, + 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, + 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, + 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, + 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, + 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737, + 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162, + 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610, + 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35, + 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, + 963, 
995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, + 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, + 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229, + 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677, + 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102, + 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550, + 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, + 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, + 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, + 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744, + 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169, + 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617, + 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42, + 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938, + 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, + 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, + 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236, + 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684, + 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109, + 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557, + 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, + 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, + 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, + 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, + 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176, + 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624, + 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49, + 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945, + 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, + 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, + 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243, + 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691, + 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116, + 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564, + 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, + 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, + 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, + 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, + 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183, + 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631, + 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56, + 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952, + 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, + 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, + 857, 889, 921, 
953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250, + 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698, + 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123, + 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571, + 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, + 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, + 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, + 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, + 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190, + 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638, + 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63, + 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, + 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959, + 991, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { + 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, + 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, + 161, 192, 224, 193, 162, 131, 100, 69, 38, 7, 8, 39, 70, + 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, + 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258, 289, + 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, + 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, + 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, + 13, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, + 386, 417, 448, 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, + 170, 139, 108, 77, 46, 15, 16, 47, 78, 109, 140, 171, 202, + 233, 264, 295, 326, 357, 388, 419, 450, 481, 512, 544, 513, 482, + 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, 79, + 48, 17, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, + 359, 390, 421, 452, 483, 514, 545, 576, 608, 577, 546, 515, 484, + 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, + 50, 19, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330, + 361, 392, 423, 454, 485, 516, 547, 578, 609, 640, 672, 641, 610, + 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, + 176, 145, 114, 83, 52, 21, 22, 53, 84, 115, 146, 177, 208, + 239, 270, 301, 332, 363, 394, 425, 456, 487, 518, 549, 580, 611, + 642, 673, 704, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, + 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, + 23, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 334, 365, + 396, 427, 458, 489, 520, 551, 582, 613, 644, 675, 706, 737, 768, + 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, 459, 428, + 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, + 26, 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398, + 429, 460, 491, 522, 553, 584, 615, 646, 677, 708, 739, 770, 801, + 832, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523, + 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, + 89, 58, 27, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307, + 338, 369, 400, 431, 462, 493, 524, 555, 586, 617, 648, 679, 710, + 741, 772, 803, 834, 865, 896, 928, 897, 866, 835, 804, 773, 742, + 711, 680, 649, 618, 587, 556, 525, 494, 463, 432, 401, 370, 339, + 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 30, 61, 92, + 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464, 495, + 526, 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898, + 929, 960, 992, 961, 930, 899, 868, 837, 
806, 775, 744, 713, 682, + 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279, + 248, 217, 186, 155, 124, 93, 62, 31, 63, 94, 125, 156, 187, + 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 528, 559, 590, + 621, 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993, + 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, 653, 622, + 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, + 188, 157, 126, 95, 127, 158, 189, 220, 251, 282, 313, 344, 375, + 406, 437, 468, 499, 530, 561, 592, 623, 654, 685, 716, 747, 778, + 809, 840, 871, 902, 933, 964, 995, 996, 965, 934, 903, 872, 841, + 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, + 407, 376, 345, 314, 283, 252, 221, 190, 159, 191, 222, 253, 284, + 315, 346, 377, 408, 439, 470, 501, 532, 563, 594, 625, 656, 687, + 718, 749, 780, 811, 842, 873, 904, 935, 966, 997, 998, 967, 936, + 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, 564, 533, + 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 255, 286, 317, + 348, 379, 410, 441, 472, 503, 534, 565, 596, 627, 658, 689, 720, + 751, 782, 813, 844, 875, 906, 937, 968, 999, 1000, 969, 938, 907, + 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, 535, 504, + 473, 442, 411, 380, 349, 318, 287, 319, 350, 381, 412, 443, 474, + 505, 536, 567, 598, 629, 660, 691, 722, 753, 784, 815, 846, 877, + 908, 939, 970, 1001, 1002, 971, 940, 909, 878, 847, 816, 785, 754, + 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, 382, 351, + 383, 414, 445, 476, 507, 538, 569, 600, 631, 662, 693, 724, 755, + 786, 817, 848, 879, 910, 941, 972, 1003, 1004, 973, 942, 911, 880, + 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, + 446, 415, 447, 478, 509, 540, 571, 602, 633, 664, 695, 726, 757, + 788, 819, 850, 881, 912, 943, 974, 1005, 1006, 975, 944, 913, 882, + 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, 510, 479, + 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, 852, 883, + 914, 945, 976, 1007, 1008, 977, 946, 915, 884, 853, 822, 791, 760, + 729, 698, 667, 636, 605, 574, 543, 575, 606, 637, 668, 699, 730, + 761, 792, 823, 854, 885, 916, 947, 978, 1009, 1010, 979, 948, 917, + 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 639, 670, 701, + 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, 1012, 981, 950, + 919, 888, 857, 826, 795, 764, 733, 702, 671, 703, 734, 765, 796, + 827, 858, 889, 920, 951, 982, 1013, 1014, 983, 952, 921, 890, 859, + 828, 797, 766, 735, 767, 798, 829, 860, 891, 922, 953, 984, 1015, + 1016, 985, 954, 923, 892, 861, 830, 799, 831, 862, 893, 924, 955, + 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, + 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = { + 0, 2, 3, 9, 1, 4, 8, 10, 5, 7, 11, 14, 6, 12, 13, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = { + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = { + 0, 2, 5, 9, 13, 17, 21, 25, 1, 4, 8, 12, 16, 20, 24, 28, + 3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const 
int16_t, av1_mrow_iscan_4x8[32]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = { + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = { + 0, 2, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, + 1, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62, + 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = { + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34, + 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50, + 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x4[64]) = { + 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, + 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, + 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, + 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x16[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, + 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, + 219, 227, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, + 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, + 201, 209, 217, 225, 233, 240, 6, 11, 17, 24, 32, 40, 48, 56, 64, + 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 239, 245, 10, 16, 23, 31, 39, 47, 55, + 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, + 183, 191, 199, 207, 
215, 223, 231, 238, 244, 249, 15, 22, 30, 38, 46, + 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, + 174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28, + 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, + 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, + 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, + 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, + 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, + 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, + 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 108, 67, 74, + 81, 88, 95, 102, 109, 116, 75, 82, 89, 96, 103, 110, 117, 124, 83, + 90, 97, 104, 111, 118, 125, 132, 91, 98, 105, 112, 119, 126, 133, 140, + 99, 106, 113, 120, 127, 134, 141, 148, 107, 114, 121, 128, 135, 142, 149, + 156, 115, 122, 129, 136, 143, 150, 157, 164, 123, 130, 137, 144, 151, 158, + 165, 172, 131, 138, 145, 152, 159, 166, 173, 180, 139, 146, 153, 160, 167, + 174, 181, 188, 147, 154, 161, 168, 175, 182, 189, 196, 155, 162, 169, 176, + 183, 190, 197, 204, 163, 170, 177, 184, 191, 198, 205, 212, 171, 178, 185, + 192, 199, 206, 213, 220, 179, 186, 193, 200, 207, 214, 221, 228, 187, 194, + 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203, + 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250, + 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, + 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, + 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, + 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, + 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, + 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, + 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, + 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, + 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, + 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, + 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, + 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x8[256]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, + 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, + 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, + 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, + 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, + 10, 42, 74, 106, 138, 170, 202, 234, 11, 
43, 75, 107, 139, 171, 203, 235, + 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, + 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239, + 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, + 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, + 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, + 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, + 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, + 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, + 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, + 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x32[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 
30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = { + 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, + 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, + 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 112, + 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 111, 117, + 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 110, 116, 121, + 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 109, 115, 120, 124, + 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 108, 114, 119, 123, 126, + 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36, + 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52, + 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68, + 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84, + 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100, + 59, 66, 73, 80, 87, 94, 101, 107, 67, 74, 81, 88, 95, 102, 108, 113, + 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122, + 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 
50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90, 104, 119, + 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, + 375, 391, 1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76, 89, 103, + 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, + 358, 374, 390, 406, 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88, + 102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, + 341, 357, 373, 389, 405, 420, 6, 11, 17, 24, 32, 41, 51, 62, 74, + 87, 101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, + 324, 340, 356, 372, 388, 404, 419, 433, 10, 16, 23, 31, 40, 50, 61, + 73, 86, 100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, + 307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15, 22, 30, 39, 49, + 60, 72, 85, 99, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, + 290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21, 29, 38, + 48, 59, 71, 84, 98, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, + 273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28, + 37, 47, 58, 70, 83, 97, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465, + 475, 36, 46, 57, 69, 82, 96, 111, 127, 143, 159, 175, 191, 207, 223, + 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453, + 464, 474, 483, 45, 56, 68, 81, 95, 110, 126, 142, 158, 174, 190, 206, + 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440, + 452, 463, 473, 482, 490, 55, 67, 80, 94, 109, 125, 141, 157, 173, 189, + 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426, + 439, 451, 462, 472, 481, 489, 496, 66, 79, 93, 108, 124, 140, 156, 172, + 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411, + 425, 438, 450, 461, 471, 480, 488, 495, 501, 78, 92, 107, 123, 139, 155, + 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, + 410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91, 106, 122, 138, + 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, + 394, 409, 423, 436, 448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121, + 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, + 377, 393, 408, 422, 435, 447, 458, 468, 477, 485, 492, 498, 503, 507, 510, + 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, + 360, 376, 392, 407, 421, 
434, 446, 457, 467, 476, 484, 491, 497, 502, 506, + 509, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, + 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106, + 121, 136, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107, + 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108, + 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109, + 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110, + 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111, + 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112, + 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113, + 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114, + 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115, + 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116, + 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117, + 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118, + 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119, + 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344, + 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345, + 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346, + 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347, + 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348, + 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349, + 364, 379, 394, 408, 421, 215, 230, 245, 260, 275, 290, 305, 320, 335, 350, + 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351, + 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352, + 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353, + 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354, + 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355, + 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356, + 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357, + 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358, + 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359, + 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506, + 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507, + 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 
179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 
364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, + 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, + 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, + 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, + 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, + 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, + 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, + 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, + 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, + 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, + 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, + 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, + 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, + 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, + 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, + 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, + 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, + 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, + 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, + 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, + 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, + 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, + 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, + 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, + 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, + 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, + 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, + 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, + 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, + 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, + 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, + 495, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, + 1, 33, 65, 97, 129, 161, 193, 225, 
257, 289, 321, 353, 385, 417, 449, 481, + 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, + 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, + 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, + 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, + 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, + 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, + 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, + 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, + 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, + 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, + 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, + 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, + 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, + 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, + 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, + 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, + 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, + 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, + 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, + 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, + 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, + 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 
230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = { + 0, 2, 3, 9, 10, 20, 21, 35, 36, 54, 55, 77, 78, 104, 105, + 135, 1, 4, 8, 11, 19, 22, 34, 37, 53, 56, 76, 79, 103, 106, + 134, 136, 5, 7, 12, 18, 23, 33, 38, 52, 57, 75, 80, 102, 107, + 133, 137, 164, 6, 13, 17, 24, 32, 39, 51, 58, 74, 81, 101, 108, + 132, 138, 163, 165, 14, 16, 25, 31, 40, 50, 59, 73, 82, 100, 109, + 131, 139, 162, 166, 189, 15, 26, 30, 41, 49, 60, 72, 83, 99, 110, + 130, 140, 161, 167, 188, 190, 27, 29, 42, 48, 61, 71, 84, 98, 111, + 129, 141, 160, 168, 187, 191, 210, 28, 43, 47, 62, 70, 85, 97, 112, + 128, 142, 159, 169, 186, 192, 209, 211, 44, 46, 63, 69, 86, 96, 113, + 127, 143, 158, 170, 185, 193, 208, 212, 227, 45, 64, 68, 87, 95, 114, + 126, 144, 157, 171, 184, 194, 207, 213, 226, 228, 65, 67, 88, 94, 115, + 125, 145, 156, 172, 183, 195, 206, 214, 225, 229, 240, 66, 89, 93, 116, + 124, 146, 155, 173, 182, 196, 205, 215, 224, 230, 239, 241, 90, 92, 117, + 123, 147, 154, 174, 181, 197, 204, 216, 223, 231, 238, 242, 249, 91, 118, + 122, 148, 153, 175, 180, 198, 203, 217, 222, 232, 237, 243, 248, 250, 119, + 121, 149, 152, 176, 179, 199, 202, 218, 221, 233, 236, 244, 247, 251, 254, + 120, 150, 151, 177, 178, 200, 201, 219, 220, 234, 235, 245, 246, 252, 253, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, + 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, + 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 
170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, + 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, + 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, + 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, + 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, + 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, + 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, + 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, + 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, + 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, + 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, + 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, + 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, + 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, + 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, + 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, + 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, + 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, + 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, + 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, + 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, + 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, + 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, + 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, + 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, + 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, + 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, + 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, + 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, + 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, + 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, + 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, + 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, + 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, + 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, + 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, + 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, + 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, + 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, + 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, + 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, + 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, + 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, + 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, + 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, + 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, + 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, + 858, 
859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, + 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, + 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, + 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, + 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, + 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, + 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, + 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, + 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, + 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, + 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, + 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, + 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, + 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, + 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737, + 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162, + 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610, + 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35, + 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, + 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, + 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, + 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229, + 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677, + 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102, + 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550, + 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, + 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, + 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, + 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744, + 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169, + 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617, + 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42, + 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938, + 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, + 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, + 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236, + 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684, + 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109, + 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557, + 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, + 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, + 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, + 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, + 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 
144, 176, + 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624, + 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49, + 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945, + 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, + 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, + 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243, + 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691, + 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116, + 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564, + 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, + 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, + 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, + 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, + 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183, + 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631, + 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56, + 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952, + 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, + 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, + 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250, + 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698, + 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123, + 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571, + 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, + 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, + 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, + 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, + 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190, + 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638, + 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63, + 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, + 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959, + 991, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = { + 0, 2, 3, 9, 10, 20, 21, 35, 36, 54, 55, 77, 78, + 104, 105, 135, 136, 170, 171, 209, 210, 252, 253, 299, 300, 350, + 351, 405, 406, 464, 465, 527, 1, 4, 8, 11, 19, 22, 34, + 37, 53, 56, 76, 79, 103, 106, 134, 137, 169, 172, 208, 211, + 251, 254, 298, 301, 349, 352, 404, 407, 463, 466, 526, 528, 5, + 7, 12, 18, 23, 33, 38, 52, 57, 75, 80, 102, 107, 133, + 138, 168, 173, 207, 212, 250, 255, 297, 302, 348, 353, 403, 408, + 462, 467, 525, 529, 588, 6, 13, 17, 24, 32, 39, 51, 58, + 74, 81, 101, 108, 132, 139, 167, 174, 206, 213, 249, 256, 296, + 303, 347, 354, 402, 409, 461, 468, 524, 530, 587, 589, 14, 16, + 25, 31, 40, 50, 59, 73, 82, 100, 109, 131, 140, 166, 175, + 205, 214, 248, 257, 295, 304, 346, 355, 401, 410, 460, 469, 523, + 531, 586, 590, 645, 15, 26, 30, 41, 49, 60, 72, 83, 99, + 110, 130, 141, 165, 176, 204, 215, 247, 258, 294, 305, 345, 356, + 400, 411, 459, 470, 
522, 532, 585, 591, 644, 646, 27, 29, 42, + 48, 61, 71, 84, 98, 111, 129, 142, 164, 177, 203, 216, 246, + 259, 293, 306, 344, 357, 399, 412, 458, 471, 521, 533, 584, 592, + 643, 647, 698, 28, 43, 47, 62, 70, 85, 97, 112, 128, 143, + 163, 178, 202, 217, 245, 260, 292, 307, 343, 358, 398, 413, 457, + 472, 520, 534, 583, 593, 642, 648, 697, 699, 44, 46, 63, 69, + 86, 96, 113, 127, 144, 162, 179, 201, 218, 244, 261, 291, 308, + 342, 359, 397, 414, 456, 473, 519, 535, 582, 594, 641, 649, 696, + 700, 747, 45, 64, 68, 87, 95, 114, 126, 145, 161, 180, 200, + 219, 243, 262, 290, 309, 341, 360, 396, 415, 455, 474, 518, 536, + 581, 595, 640, 650, 695, 701, 746, 748, 65, 67, 88, 94, 115, + 125, 146, 160, 181, 199, 220, 242, 263, 289, 310, 340, 361, 395, + 416, 454, 475, 517, 537, 580, 596, 639, 651, 694, 702, 745, 749, + 792, 66, 89, 93, 116, 124, 147, 159, 182, 198, 221, 241, 264, + 288, 311, 339, 362, 394, 417, 453, 476, 516, 538, 579, 597, 638, + 652, 693, 703, 744, 750, 791, 793, 90, 92, 117, 123, 148, 158, + 183, 197, 222, 240, 265, 287, 312, 338, 363, 393, 418, 452, 477, + 515, 539, 578, 598, 637, 653, 692, 704, 743, 751, 790, 794, 833, + 91, 118, 122, 149, 157, 184, 196, 223, 239, 266, 286, 313, 337, + 364, 392, 419, 451, 478, 514, 540, 577, 599, 636, 654, 691, 705, + 742, 752, 789, 795, 832, 834, 119, 121, 150, 156, 185, 195, 224, + 238, 267, 285, 314, 336, 365, 391, 420, 450, 479, 513, 541, 576, + 600, 635, 655, 690, 706, 741, 753, 788, 796, 831, 835, 870, 120, + 151, 155, 186, 194, 225, 237, 268, 284, 315, 335, 366, 390, 421, + 449, 480, 512, 542, 575, 601, 634, 656, 689, 707, 740, 754, 787, + 797, 830, 836, 869, 871, 152, 154, 187, 193, 226, 236, 269, 283, + 316, 334, 367, 389, 422, 448, 481, 511, 543, 574, 602, 633, 657, + 688, 708, 739, 755, 786, 798, 829, 837, 868, 872, 903, 153, 188, + 192, 227, 235, 270, 282, 317, 333, 368, 388, 423, 447, 482, 510, + 544, 573, 603, 632, 658, 687, 709, 738, 756, 785, 799, 828, 838, + 867, 873, 902, 904, 189, 191, 228, 234, 271, 281, 318, 332, 369, + 387, 424, 446, 483, 509, 545, 572, 604, 631, 659, 686, 710, 737, + 757, 784, 800, 827, 839, 866, 874, 901, 905, 932, 190, 229, 233, + 272, 280, 319, 331, 370, 386, 425, 445, 484, 508, 546, 571, 605, + 630, 660, 685, 711, 736, 758, 783, 801, 826, 840, 865, 875, 900, + 906, 931, 933, 230, 232, 273, 279, 320, 330, 371, 385, 426, 444, + 485, 507, 547, 570, 606, 629, 661, 684, 712, 735, 759, 782, 802, + 825, 841, 864, 876, 899, 907, 930, 934, 957, 231, 274, 278, 321, + 329, 372, 384, 427, 443, 486, 506, 548, 569, 607, 628, 662, 683, + 713, 734, 760, 781, 803, 824, 842, 863, 877, 898, 908, 929, 935, + 956, 958, 275, 277, 322, 328, 373, 383, 428, 442, 487, 505, 549, + 568, 608, 627, 663, 682, 714, 733, 761, 780, 804, 823, 843, 862, + 878, 897, 909, 928, 936, 955, 959, 978, 276, 323, 327, 374, 382, + 429, 441, 488, 504, 550, 567, 609, 626, 664, 681, 715, 732, 762, + 779, 805, 822, 844, 861, 879, 896, 910, 927, 937, 954, 960, 977, + 979, 324, 326, 375, 381, 430, 440, 489, 503, 551, 566, 610, 625, + 665, 680, 716, 731, 763, 778, 806, 821, 845, 860, 880, 895, 911, + 926, 938, 953, 961, 976, 980, 995, 325, 376, 380, 431, 439, 490, + 502, 552, 565, 611, 624, 666, 679, 717, 730, 764, 777, 807, 820, + 846, 859, 881, 894, 912, 925, 939, 952, 962, 975, 981, 994, 996, + 377, 379, 432, 438, 491, 501, 553, 564, 612, 623, 667, 678, 718, + 729, 765, 776, 808, 819, 847, 858, 882, 893, 913, 924, 940, 951, + 963, 974, 982, 993, 997, 1008, 378, 433, 437, 492, 500, 554, 563, + 613, 622, 668, 677, 719, 728, 766, 775, 809, 818, 848, 
857, 883, + 892, 914, 923, 941, 950, 964, 973, 983, 992, 998, 1007, 1009, 434, + 436, 493, 499, 555, 562, 614, 621, 669, 676, 720, 727, 767, 774, + 810, 817, 849, 856, 884, 891, 915, 922, 942, 949, 965, 972, 984, + 991, 999, 1006, 1010, 1017, 435, 494, 498, 556, 561, 615, 620, 670, + 675, 721, 726, 768, 773, 811, 816, 850, 855, 885, 890, 916, 921, + 943, 948, 966, 971, 985, 990, 1000, 1005, 1011, 1016, 1018, 495, 497, + 557, 560, 616, 619, 671, 674, 722, 725, 769, 772, 812, 815, 851, + 854, 886, 889, 917, 920, 944, 947, 967, 970, 986, 989, 1001, 1004, + 1012, 1015, 1019, 1022, 496, 558, 559, 617, 618, 672, 673, 723, 724, + 770, 771, 813, 814, 852, 853, 887, 888, 918, 919, 945, 946, 968, + 969, 987, 988, 1002, 1003, 1013, 1014, 1020, 1021, 1023, +}; + +const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = { + { + // TX_4X4 + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + }, + { + // TX_8X8 + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + }, + { + // TX_16X16 + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + }, + { + // TX_32X32 + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, 
av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_64X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_4X8 + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + }, + { + // TX_8X4 + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + }, + { + // TX_8X16 + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + }, + { + // TX_16X8 + { default_scan_16x8, 
av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + }, + { + // TX_16X32 + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + }, + { + // TX_32X16 + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + }, + { + // TX_32X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_64X32 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
+ { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_4X16 + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + }, + { + // TX_16X4 + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + }, + { + // TX_8X32 + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + }, + { + // TX_32X8 + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + 
{ default_scan_32x8, av1_default_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + }, + { + // TX_16X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + }, + { + // TX_64X16 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + }, +}; diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h new file mode 100644 index 0000000000..4f369786f2 --- /dev/null +++ b/third_party/aom/av1/common/scan.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_SCAN_H_ +#define AOM_AV1_COMMON_SCAN_H_ + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_NEIGHBORS 2 + +enum { + SCAN_MODE_ZIG_ZAG, + SCAN_MODE_COL_DIAG, + SCAN_MODE_ROW_DIAG, + SCAN_MODE_COL_1D, + SCAN_MODE_ROW_1D, + SCAN_MODES +} UENUM1BYTE(SCAN_MODE); + +extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES]; + +void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd); + +static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size, + TX_TYPE tx_type) { + return &av1_scan_orders[tx_size][tx_type]; +} + +static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) { + return get_default_scan(tx_size, tx_type); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_SCAN_H_ diff --git a/third_party/aom/av1/common/seg_common.c b/third_party/aom/av1/common/seg_common.c new file mode 100644 index 0000000000..60b185161c --- /dev/null +++ b/third_party/aom/av1/common/seg_common.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/av1_loopfilter.h" +#include "av1/common/blockd.h" +#include "av1/common/seg_common.h" +#include "av1/common/quant_common.h" + +static const int seg_feature_data_signed[SEG_LVL_MAX] = { + 1, 1, 1, 1, 1, 0, 0, 0 +}; + +static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, + MAX_LOOP_FILTER, + MAX_LOOP_FILTER, + MAX_LOOP_FILTER, + MAX_LOOP_FILTER, + 7, + 0, + 0 }; + +// These functions provide access to new segment level features. +// Eventually these function may be "optimized out" but for the moment, +// the coding mechanism is still subject to change so these provide a +// convenient single point of change. + +void av1_clearall_segfeatures(struct segmentation *seg) { + av1_zero(seg->feature_data); + av1_zero(seg->feature_mask); +} + +void av1_calculate_segdata(struct segmentation *seg) { + seg->segid_preskip = 0; + seg->last_active_segid = 0; + for (int i = 0; i < MAX_SEGMENTS; i++) { + for (int j = 0; j < SEG_LVL_MAX; j++) { + if (seg->feature_mask[i] & (1 << j)) { + seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME); + seg->last_active_segid = i; + } + } + } +} + +void av1_enable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_mask[segment_id] |= 1 << feature_id; +} + +int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { + return seg_feature_data_max[feature_id]; +} + +int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { + return seg_feature_data_signed[feature_id]; +} + +// The 'seg_data' given for each segment can be either deltas (from the default +// value chosen for the frame) or absolute values. 
+// +// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for +// SEGMENT_ALT_LF) +// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for +// SEGMENT_ALT_LF) +// +// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use +// the absolute values given). + +void av1_set_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id, int seg_data) { + if (seg_data < 0) { + assert(seg_feature_data_signed[feature_id]); + assert(-seg_data <= seg_feature_data_max[feature_id]); + } else { + assert(seg_data <= seg_feature_data_max[feature_id]); + } + + seg->feature_data[segment_id][feature_id] = seg_data; +} + +// TBD? Functions to read and write segment data with range / validity checking diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h new file mode 100644 index 0000000000..44b508b146 --- /dev/null +++ b/third_party/aom/av1/common/seg_common.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_SEG_COMMON_H_ +#define AOM_AV1_COMMON_SEG_COMMON_H_ + +#include "aom_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_SEGMENTS 8 +#define SEG_TREE_PROBS (MAX_SEGMENTS - 1) + +#define SEG_TEMPORAL_PRED_CTXS 3 +#define SPATIAL_PREDICTION_PROBS 3 + +enum { + SEG_LVL_ALT_Q, // Use alternate Quantizer .... + SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical + SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal + SEG_LVL_ALT_LF_U, // Use alternate loop filter value on u plane + SEG_LVL_ALT_LF_V, // Use alternate loop filter value on v plane + SEG_LVL_REF_FRAME, // Optional Segment reference frame + SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode + SEG_LVL_GLOBALMV, + SEG_LVL_MAX +} UENUM1BYTE(SEG_LVL_FEATURES); + +struct segmentation { + uint8_t enabled; + uint8_t update_map; + uint8_t update_data; + uint8_t temporal_update; + + int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX]; + unsigned int feature_mask[MAX_SEGMENTS]; + int last_active_segid; // The highest numbered segment id that has some + // enabled feature. + uint8_t segid_preskip; // Whether the segment id will be read before the + // skip syntax element. + // 1: the segment id will be read first. + // 0: the skip syntax element will be read first. 
+}; + +struct segmentation_probs { + aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)]; + aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS] + [CDF_SIZE(MAX_SEGMENTS)]; +}; + +static INLINE int segfeature_active(const struct segmentation *seg, + uint8_t segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id)); +} + +static INLINE void segfeatures_copy(struct segmentation *dst, + const struct segmentation *src) { + int i, j; + for (i = 0; i < MAX_SEGMENTS; i++) { + dst->feature_mask[i] = src->feature_mask[i]; + for (j = 0; j < SEG_LVL_MAX; j++) { + dst->feature_data[i][j] = src->feature_data[i][j]; + } + } + dst->segid_preskip = src->segid_preskip; + dst->last_active_segid = src->last_active_segid; +} + +void av1_clearall_segfeatures(struct segmentation *seg); + +void av1_enable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); + +void av1_calculate_segdata(struct segmentation *seg); + +int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id); + +int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id); + +void av1_set_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id, int seg_data); + +static INLINE int get_segdata(const struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->feature_data[segment_id][feature_id]; +} + +static AOM_INLINE void set_segment_id(uint8_t *segment_ids, int mi_offset, + int x_mis, int y_mis, int mi_stride, + uint8_t segment_id) { + segment_ids += mi_offset; + for (int y = 0; y < y_mis; ++y) { + memset(&segment_ids[y * mi_stride], segment_id, + x_mis * sizeof(segment_ids[0])); + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_SEG_COMMON_H_ diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c new file mode 100644 index 0000000000..45695147ff --- /dev/null +++ b/third_party/aom/av1/common/thread_common.c @@ -0,0 +1,1250 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom/aom_image.h" +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/entropymode.h" +#include "av1/common/thread_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" + +// Set up nsync by width. +static INLINE int get_sync_range(int width) { + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +} + +static INLINE int get_lr_sync_range(int width) { +#if 0 + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. 
+ if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +#else + (void)width; + return 1; +#endif +} + +// Allocate memory for lf row synchronization +void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, + int width, int num_workers) { + lf_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i, j; + + for (j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->mutex_[j], + aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows)); + if (lf_sync->mutex_[j]) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[j][i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->cond_[j], + aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows)); + if (lf_sync->cond_[j]) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond_[j][i], NULL); + } + } + } + + CHECK_MEM_ERROR(cm, lf_sync->job_mutex, + aom_malloc(sizeof(*(lf_sync->job_mutex)))); + if (lf_sync->job_mutex) { + pthread_mutex_init(lf_sync->job_mutex, NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lf_sync->lfdata, + aom_malloc(num_workers * sizeof(*(lf_sync->lfdata)))); + lf_sync->num_workers = num_workers; + + for (int j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows)); + } + CHECK_MEM_ERROR( + cm, lf_sync->job_queue, + aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2)); + // Set up nsync. + lf_sync->sync_range = get_sync_range(width); +} + +// Deallocate lf synchronization related mutex and data +void av1_loop_filter_dealloc(AV1LfSync *lf_sync) { + if (lf_sync != NULL) { + int j; +#if CONFIG_MULTITHREAD + int i; + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lf_sync->mutex_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex_[j][i]); + } + aom_free(lf_sync->mutex_[j]); + } + if (lf_sync->cond_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->cond_[j][i]); + } + aom_free(lf_sync->cond_[j]); + } + } + if (lf_sync->job_mutex != NULL) { + pthread_mutex_destroy(lf_sync->job_mutex); + aom_free(lf_sync->job_mutex); + } +#endif // CONFIG_MULTITHREAD + aom_free(lf_sync->lfdata); + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lf_sync->cur_sb_col[j]); + } + + aom_free(lf_sync->job_queue); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. 
+ av1_zero(*lf_sync); + } +} + +void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync, + int num_workers) { + if (num_workers < 1) return; +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cdef_sync->mutex_, + aom_malloc(sizeof(*(cdef_sync->mutex_)))); + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + } +#else + (void)cm; + (void)cdef_sync; +#endif // CONFIG_MULTITHREAD +} + +void av1_free_cdef_sync(AV1CdefSync *cdef_sync) { + if (cdef_sync == NULL) return; +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ != NULL) { + pthread_mutex_destroy(cdef_sync->mutex_); + aom_free(cdef_sync->mutex_); + } +#endif // CONFIG_MULTITHREAD +} + +static INLINE void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync, + int row) { + if (!row) return; +#if CONFIG_MULTITHREAD + AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; + pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_); + while (cdef_row_mt[row - 1].is_row_done != 1) + pthread_cond_wait(cdef_row_mt[row - 1].row_cond_, + cdef_row_mt[row - 1].row_mutex_); + cdef_row_mt[row - 1].is_row_done = 0; + pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_); +#else + (void)cdef_sync; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync, + int row) { +#if CONFIG_MULTITHREAD + AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; + pthread_mutex_lock(cdef_row_mt[row].row_mutex_); + pthread_cond_signal(cdef_row_mt[row].row_cond_); + cdef_row_mt[row].is_row_done = 1; + pthread_mutex_unlock(cdef_row_mt[row].row_mutex_); +#else + (void)cdef_sync; + (void)row; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c, + int plane) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); + + while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, + const int sb_cols, int plane) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&lf_sync->mutex_[plane][r]); + + // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // cur_sb_col[plane][r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. + lf_sync->cur_sb_col[plane][r] = AOMMAX(lf_sync->cur_sb_col[plane][r], cur); + + pthread_cond_broadcast(&lf_sync->cond_[plane][r]); + pthread_mutex_unlock(&lf_sync->mutex_[plane][r]); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +// One job of row loopfiltering. 
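+// Filters one superblock row (starting at mi_row) of the given plane.
+// dir == 0 filters vertical edges and, when lf_sync is non-NULL, publishes
+// per-column progress through sync_write(); dir == 1 filters horizontal
+// edges and first waits, via sync_read(), until vertical filtering of the
+// current and the above superblock row has advanced past the current column.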
+void av1_thread_loop_filter_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane, + int dir, int lpf_opt_level, AV1LfSync *const lf_sync, + struct aom_internal_error_info *error_info, + AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, + int num_mis_in_lpf_unit_height_log2) { + // TODO(aomedia:3276): Pass error_info to the low-level functions as required + // in future to handle error propagation. + (void)error_info; + const int sb_cols = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2); + const int r = mi_row >> num_mis_in_lpf_unit_height_log2; + int mi_col, c; + + const bool joint_filter_chroma = (lpf_opt_level == 2) && plane > AOM_PLANE_Y; + const int num_planes = joint_filter_chroma ? 2 : 1; + assert(IMPLIES(joint_filter_chroma, plane == AOM_PLANE_U)); + + if (dir == 0) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + num_planes); + if (lpf_opt_level) { + if (plane == AOM_PLANE_Y) { + av1_filter_block_plane_vert_opt(cm, xd, &planes[plane], mi_row, + mi_col, params_buf, tx_buf, + num_mis_in_lpf_unit_height_log2); + } else { + av1_filter_block_plane_vert_opt_chroma( + cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane, + joint_filter_chroma, num_mis_in_lpf_unit_height_log2); + } + } else { + av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } + if (lf_sync != NULL) { + sync_write(lf_sync, r, c, sb_cols, plane); + } + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + if (lf_sync != NULL) { + // Wait for vertical edge filtering of the top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be completed + sync_read(lf_sync, r + 1, c, plane); + } + +#if CONFIG_MULTITHREAD + if (lf_sync && lf_sync->num_workers > 1) { + pthread_mutex_lock(lf_sync->job_mutex); + const bool lf_mt_exit = lf_sync->lf_mt_exit; + pthread_mutex_unlock(lf_sync->job_mutex); + // Exit in case any worker has encountered an error. + if (lf_mt_exit) return; + } +#endif + + av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + num_planes); + if (lpf_opt_level) { + if (plane == AOM_PLANE_Y) { + av1_filter_block_plane_horz_opt(cm, xd, &planes[plane], mi_row, + mi_col, params_buf, tx_buf, + num_mis_in_lpf_unit_height_log2); + } else { + av1_filter_block_plane_horz_opt_chroma( + cm, xd, &planes[plane], mi_row, mi_col, params_buf, tx_buf, plane, + joint_filter_chroma, num_mis_in_lpf_unit_height_log2); + } + } else { + av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } + } + } +} + +void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync, + int num_mis_in_lpf_unit_height_log2) { + int plane, sb_row; + const int sb_cols = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, num_mis_in_lpf_unit_height_log2); + const int sb_rows = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2); + + // In case of loopfilter row-multithreading, the worker on an SB row waits for + // the vertical edge filtering of the right and top-right SBs. 
Hence, in case + // a thread (main/worker) encounters an error, update that vertical + // loopfiltering of every SB row in the frame is complete in order to avoid + // dependent workers waiting indefinitely. + for (sb_row = 0; sb_row < sb_rows; ++sb_row) + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + sync_write(lf_sync, sb_row, sb_cols - 1, sb_cols, plane); +} + +static AOM_INLINE void sync_lf_workers(AVxWorker *const workers, + AV1_COMMON *const cm, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = workers[0].had_error; + struct aom_internal_error_info error_info; + + // Read the error_info of main thread. + if (had_error) { + AVxWorker *const worker = &workers[0]; + error_info = ((LFWorkerData *)worker->data2)->error_info; + } + + // Wait till all rows are finished. + for (int i = num_workers - 1; i > 0; --i) { + AVxWorker *const worker = &workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error_info = ((LFWorkerData *)worker->data2)->error_info; + } + } + if (had_error) aom_internal_error_copy(cm->error, &error_info); +} + +// Row-based multi-threaded loopfilter hook +static int loop_filter_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + AV1LfMTInfo *cur_job_info; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex_ = lf_sync->job_mutex; +#endif + + struct aom_internal_error_info *const error_info = &lf_data->error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(job_mutex_); + lf_sync->lf_mt_exit = true; + pthread_mutex_unlock(job_mutex_); +#endif + av1_set_vert_loop_filter_done(lf_data->cm, lf_sync, MAX_MIB_SIZE_LOG2); + return 0; + } + error_info->setjmp = 1; + + while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) { + const int lpf_opt_level = cur_job_info->lpf_opt_level; + av1_thread_loop_filter_rows( + lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd, + cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir, + lpf_opt_level, lf_sync, error_info, lf_data->params_buf, + lf_data->tx_buf, MAX_MIB_SIZE_LOG2); + } + error_info->setjmp = 0; + return 1; +} + +static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int start, int stop, + const int planes_to_lf[MAX_MB_PLANE], + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync, int lpf_opt_level) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int i; + loop_filter_frame_mt_init(cm, start, stop, planes_to_lf, num_workers, lf_sync, + lpf_opt_level, MAX_MIB_SIZE_LOG2); + + // Set up loopfilter thread data. 
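+  // Workers are assigned from the highest index downwards; worker 0 runs
+  // synchronously on the calling thread via execute(), the others are
+  // launched on their own threads, and all are joined in sync_lf_workers().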
+ for (i = num_workers - 1; i >= 0; --i) { + AVxWorker *const worker = &workers[i]; + LFWorkerData *const lf_data = &lf_sync->lfdata[i]; + + worker->hook = loop_filter_row_worker; + worker->data1 = lf_sync; + worker->data2 = lf_data; + + // Loopfilter data + loop_filter_data_reset(lf_data, frame, cm, xd); + + // Start loopfiltering + worker->had_error = 0; + if (i == 0) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + sync_lf_workers(workers, cm, num_workers); +} + +static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int start, int stop, + const int planes_to_lf[MAX_MB_PLANE], + int lpf_opt_level) { + // Filter top rows of all planes first, in case the output can be partially + // reconstructed row by row. + int mi_row, plane, dir; + + AV1_DEBLOCKING_PARAMETERS params_buf[MAX_MIB_SIZE]; + TX_SIZE tx_buf[MAX_MIB_SIZE]; + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) { + continue; + } + + for (dir = 0; dir < 2; ++dir) { + av1_thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane, + dir, lpf_opt_level, /*lf_sync=*/NULL, + xd->error_info, params_buf, tx_buf, + MAX_MIB_SIZE_LOG2); + } + } + } +} + +void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int plane_start, int plane_end, + int partial_frame, AVxWorker *workers, + int num_workers, AV1LfSync *lf_sync, + int lpf_opt_level) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + int planes_to_lf[MAX_MB_PLANE]; + + if (!check_planes_to_loop_filter(&cm->lf, planes_to_lf, plane_start, + plane_end)) + return; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + av1_loop_filter_frame_init(cm, plane_start, plane_end); + + if (num_workers > 1) { + // Enqueue and execute loopfiltering jobs. + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf, + workers, num_workers, lf_sync, lpf_opt_level); + } else { + // Directly filter in the main thread. + loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf, + lpf_opt_level); + } +} + +static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); + + while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lr_sync; + (void)r; + (void)c; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void lr_sync_write(void *const lr_sync, int r, int c, + const int sb_cols, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. 
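+  // Progress is published only every nsync columns; for the last column the
+  // counter is advanced to sb_cols + nsync so that any reader still waiting
+  // on this row in lr_sync_read() is released.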
+ int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]); + + // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // cur_sb_col[plane][r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. + loop_res_sync->cur_sb_col[plane][r] = + AOMMAX(loop_res_sync->cur_sb_col[plane][r], cur); + + pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]); + pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]); + } +#else + (void)lr_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +// Allocate memory for loop restoration row synchronization +void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm, + int num_workers, int num_rows_lr, + int num_planes, int width) { + lr_sync->rows = num_rows_lr; + lr_sync->num_planes = num_planes; +#if CONFIG_MULTITHREAD + { + int i, j; + + for (j = 0; j < num_planes; j++) { + CHECK_MEM_ERROR(cm, lr_sync->mutex_[j], + aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr)); + if (lr_sync->mutex_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_mutex_init(&lr_sync->mutex_[j][i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lr_sync->cond_[j], + aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr)); + if (lr_sync->cond_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_cond_init(&lr_sync->cond_[j][i], NULL); + } + } + } + + CHECK_MEM_ERROR(cm, lr_sync->job_mutex, + aom_malloc(sizeof(*(lr_sync->job_mutex)))); + if (lr_sync->job_mutex) { + pthread_mutex_init(lr_sync->job_mutex, NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata, + aom_calloc(num_workers, sizeof(*(lr_sync->lrworkerdata)))); + lr_sync->num_workers = num_workers; + + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + if (worker_idx < num_workers - 1) { + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf, + (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs, + aom_malloc(sizeof(RestorationLineBuffers))); + + } else { + lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf; + lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs; + } + } + + for (int j = 0; j < num_planes; j++) { + CHECK_MEM_ERROR( + cm, lr_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr)); + } + CHECK_MEM_ERROR( + cm, lr_sync->job_queue, + aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes)); + // Set up nsync. 
+ lr_sync->sync_range = get_lr_sync_range(width); +} + +// Deallocate loop restoration synchronization related mutex and data +void av1_loop_restoration_dealloc(AV1LrSync *lr_sync) { + if (lr_sync != NULL) { + int j; +#if CONFIG_MULTITHREAD + int i; + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lr_sync->mutex_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_mutex_destroy(&lr_sync->mutex_[j][i]); + } + aom_free(lr_sync->mutex_[j]); + } + if (lr_sync->cond_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_cond_destroy(&lr_sync->cond_[j][i]); + } + aom_free(lr_sync->cond_[j]); + } + } + if (lr_sync->job_mutex != NULL) { + pthread_mutex_destroy(lr_sync->job_mutex); + aom_free(lr_sync->job_mutex); + } +#endif // CONFIG_MULTITHREAD + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lr_sync->cur_sb_col[j]); + } + + aom_free(lr_sync->job_queue); + + if (lr_sync->lrworkerdata) { + for (int worker_idx = 0; worker_idx < lr_sync->num_workers - 1; + worker_idx++) { + LRWorkerData *const workerdata_data = + lr_sync->lrworkerdata + worker_idx; + + aom_free(workerdata_data->rst_tmpbuf); + aom_free(workerdata_data->rlbs); + } + aom_free(lr_sync->lrworkerdata); + } + + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*lr_sync); + } +} + +static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, + AV1_COMMON *cm) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + AV1LrMTInfo *lr_job_queue = lr_sync->job_queue; + int32_t lr_job_counter[2], num_even_lr_jobs = 0; + lr_sync->jobs_enqueued = 0; + lr_sync->jobs_dequeued = 0; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + num_even_lr_jobs = + num_even_lr_jobs + ((ctxt[plane].rsi->vert_units + 1) >> 1); + } + lr_job_counter[0] = 0; + lr_job_counter[1] = num_even_lr_jobs; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int unit_size = ctxt[plane].rsi->restoration_unit_size; + const int plane_h = ctxt[plane].plane_h; + const int ext_size = unit_size * 3 / 2; + + int y0 = 0, i = 0; + while (y0 < plane_h) { + int remaining_h = plane_h - y0; + int h = (remaining_h < ext_size) ? 
remaining_h : unit_size; + + RestorationTileLimits limits; + limits.v_start = y0; + limits.v_end = y0 + h; + assert(limits.v_end <= plane_h); + // Offset upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(0, limits.v_start - voffset); + if (limits.v_end < plane_h) limits.v_end -= voffset; + + assert(lr_job_counter[0] <= num_even_lr_jobs); + + lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i; + lr_job_queue[lr_job_counter[i & 1]].plane = plane; + lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start; + lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end; + lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1; + if ((i & 1) == 0) { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + limits.v_start + RESTORATION_BORDER; + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + limits.v_end - RESTORATION_BORDER; + if (i == 0) { + assert(limits.v_start == 0); + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = 0; + } + if (i == (ctxt[plane].rsi->vert_units - 1)) { + assert(limits.v_end == plane_h); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = plane_h; + } + } else { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + AOMMAX(limits.v_start - RESTORATION_BORDER, 0); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + AOMMIN(limits.v_end + RESTORATION_BORDER, plane_h); + } + lr_job_counter[i & 1]++; + lr_sync->jobs_enqueued++; + + y0 += h; + ++i; + } + } +} + +static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { + AV1LrMTInfo *cur_job_info = NULL; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lr_sync->job_mutex); + + if (!lr_sync->lr_mt_exit && lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) { + cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued; + lr_sync->jobs_dequeued++; + } + + pthread_mutex_unlock(lr_sync->job_mutex); +#else + (void)lr_sync; +#endif + + return cur_job_info; +} + +static void set_loop_restoration_done(AV1LrSync *const lr_sync, + FilterFrameCtxt *const ctxt) { + for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + if (ctxt[plane].rsi->frame_restoration_type == RESTORE_NONE) continue; + int y0 = 0, row_number = 0; + const int unit_size = ctxt[plane].rsi->restoration_unit_size; + const int plane_h = ctxt[plane].plane_h; + const int ext_size = unit_size * 3 / 2; + const int hnum_rest_units = ctxt[plane].rsi->horz_units; + while (y0 < plane_h) { + const int remaining_h = plane_h - y0; + const int h = (remaining_h < ext_size) ? remaining_h : unit_size; + lr_sync_write(lr_sync, row_number, hnum_rest_units - 1, hnum_rest_units, + plane); + y0 += h; + ++row_number; + } + } +} + +// Implement row loop restoration for each thread. +static int loop_restoration_row_worker(void *arg1, void *arg2) { + AV1LrSync *const lr_sync = (AV1LrSync *)arg1; + LRWorkerData *lrworkerdata = (LRWorkerData *)arg2; + AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt; + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + int lr_unit_row; + int plane; + int plane_w; +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex_ = lr_sync->job_mutex; +#endif + struct aom_internal_error_info *const error_info = &lrworkerdata->error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. 
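+  // A non-zero setjmp() return means a call below reported an error through
+  // error_info: the worker sets lr_mt_exit so that no further jobs are
+  // dequeued, marks every restoration row as done to unblock its peers, and
+  // returns 0 so that sync_lr_workers() propagates the error.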
+ if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(job_mutex_); + lr_sync->lr_mt_exit = true; + pthread_mutex_unlock(job_mutex_); +#endif + // In case of loop restoration multithreading, the worker on an even lr + // block row waits for the completion of the filtering of the top-right and + // bottom-right blocks. Hence, in case a thread (main/worker) encounters an + // error, update that filtering of every row in the frame is complete in + // order to avoid the dependent workers from waiting indefinitely. + set_loop_restoration_done(lr_sync, lr_ctxt->ctxt); + return 0; + } + error_info->setjmp = 1; + + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[MAX_MB_PLANE] = { + aom_yv12_partial_coloc_copy_y, aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v + }; + + while (1) { + AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync); + if (cur_job_info != NULL) { + RestorationTileLimits limits; + sync_read_fn_t on_sync_read; + sync_write_fn_t on_sync_write; + limits.v_start = cur_job_info->v_start; + limits.v_end = cur_job_info->v_end; + lr_unit_row = cur_job_info->lr_unit_row; + plane = cur_job_info->plane; + plane_w = ctxt[plane].plane_w; + + // sync_mode == 1 implies only sync read is required in LR Multi-threading + // sync_mode == 0 implies only sync write is required. + on_sync_read = + cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy; + on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write + : av1_lr_sync_write_dummy; + + av1_foreach_rest_unit_in_row( + &limits, plane_w, lr_ctxt->on_rest_unit, lr_unit_row, + ctxt[plane].rsi->restoration_unit_size, ctxt[plane].rsi->horz_units, + ctxt[plane].rsi->vert_units, plane, &ctxt[plane], + lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read, + on_sync_write, lr_sync, error_info); + + copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, 0, plane_w, + cur_job_info->v_copy_start, cur_job_info->v_copy_end); + + if (lrworkerdata->do_extend_border) { + aom_extend_frame_borders_plane_row(lr_ctxt->frame, plane, + cur_job_info->v_copy_start, + cur_job_info->v_copy_end); + } + } else { + break; + } + } + error_info->setjmp = 0; + return 1; +} + +static AOM_INLINE void sync_lr_workers(AVxWorker *const workers, + AV1_COMMON *const cm, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = workers[0].had_error; + struct aom_internal_error_info error_info; + + // Read the error_info of main thread. + if (had_error) { + AVxWorker *const worker = &workers[0]; + error_info = ((LRWorkerData *)worker->data2)->error_info; + } + + // Wait till all rows are finished. 
+ for (int i = num_workers - 1; i > 0; --i) { + AVxWorker *const worker = &workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error_info = ((LRWorkerData *)worker->data2)->error_info; + } + } + if (had_error) aom_internal_error_copy(cm->error, &error_info); +} + +static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, + AVxWorker *workers, int num_workers, + AV1LrSync *lr_sync, AV1_COMMON *cm, + int do_extend_border) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int num_rows_lr = 0; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + + const int plane_h = ctxt[plane].plane_h; + const int unit_size = cm->rst_info[plane].restoration_unit_size; + + num_rows_lr = AOMMAX(num_rows_lr, av1_lr_count_units(unit_size, plane_h)); + } + + int i; + assert(MAX_MB_PLANE == 3); + + if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows || + num_workers > lr_sync->num_workers || num_planes > lr_sync->num_planes) { + av1_loop_restoration_dealloc(lr_sync); + av1_loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, + num_planes, cm->width); + } + lr_sync->lr_mt_exit = false; + + // Initialize cur_sb_col to -1 for all SB rows. + for (i = 0; i < num_planes; i++) { + memset(lr_sync->cur_sb_col[i], -1, + sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr); + } + + enqueue_lr_jobs(lr_sync, lr_ctxt, cm); + + // Set up looprestoration thread data. + for (i = num_workers - 1; i >= 0; --i) { + AVxWorker *const worker = &workers[i]; + lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt; + lr_sync->lrworkerdata[i].do_extend_border = do_extend_border; + worker->hook = loop_restoration_row_worker; + worker->data1 = lr_sync; + worker->data2 = &lr_sync->lrworkerdata[i]; + + // Start loop restoration + worker->had_error = 0; + if (i == 0) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + sync_lr_workers(workers, cm, num_workers); +} + +void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + AVxWorker *workers, int num_workers, + AV1LrSync *lr_sync, void *lr_ctxt, + int do_extend_border) { + assert(!cm->features.all_lossless); + + const int num_planes = av1_num_planes(cm); + + AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; + + av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, + optimized_lr, num_planes); + + foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync, + cm, do_extend_border); +} + +// Initializes cdef_sync parameters. 
+static AOM_INLINE void reset_cdef_job_info(AV1CdefSync *const cdef_sync) { + cdef_sync->end_of_frame = 0; + cdef_sync->fbr = 0; + cdef_sync->fbc = 0; + cdef_sync->cdef_mt_exit = false; +} + +static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &workers[i]; + worker->had_error = 0; + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +static AOM_INLINE void sync_cdef_workers(AVxWorker *const workers, + AV1_COMMON *const cm, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = workers[0].had_error; + struct aom_internal_error_info error_info; + + // Read the error_info of main thread. + if (had_error) { + AVxWorker *const worker = &workers[0]; + error_info = ((AV1CdefWorkerData *)worker->data2)->error_info; + } + + // Wait till all rows are finished. + for (int i = num_workers - 1; i > 0; --i) { + AVxWorker *const worker = &workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error_info = ((AV1CdefWorkerData *)worker->data2)->error_info; + } + } + if (had_error) aom_internal_error_copy(cm->error, &error_info); +} + +// Updates the row index of the next job to be processed. +// Also updates end_of_frame flag when the processing of all rows is complete. +static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync, + const int nvfb) { + cdef_sync->fbr++; + if (cdef_sync->fbr == nvfb) { + cdef_sync->end_of_frame = 1; + } +} + +// Checks if a job is available. If job is available, +// populates next job information and returns 1, else returns 0. +static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync, + volatile int *cur_fbr, + const int nvfb) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + int do_next_row = 0; + // Populates information needed for current job and update the row + // index of the next row to be processed. + if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) { + do_next_row = 1; + *cur_fbr = cdef_sync->fbr; + update_cdef_row_next_job_info(cdef_sync, nvfb); + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + return do_next_row; +} + +static void set_cdef_init_fb_row_done(AV1CdefSync *const cdef_sync, int nvfb) { + for (int fbr = 0; fbr < nvfb; fbr++) cdef_row_mt_sync_write(cdef_sync, fbr); +} + +// Hook function for each thread in CDEF multi-threading. +static int cdef_sb_row_worker_hook(void *arg1, void *arg2) { + AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1; + AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2; + AV1_COMMON *cm = cdef_worker->cm; + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex_ = cdef_sync->mutex_; +#endif + struct aom_internal_error_info *const error_info = &cdef_worker->error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. 
+ if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(job_mutex_); + cdef_sync->cdef_mt_exit = true; + pthread_mutex_unlock(job_mutex_); +#endif + // In case of cdef row-multithreading, the worker on a filter block row + // (fbr) waits for the line buffers (top and bottom) copy of the above row. + // Hence, in case a thread (main/worker) encounters an error before copying + // of the line buffers, signal that the line buffer copy is complete in order + // to prevent dependent workers from waiting indefinitely. + set_cdef_init_fb_row_done(cdef_sync, nvfb); + return 0; + } + error_info->setjmp = 1; + + volatile int cur_fbr; + const int num_planes = av1_num_planes(cm); + while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) { + MACROBLOCKD *xd = cdef_worker->xd; + av1_cdef_fb_row(cm, xd, cdef_worker->linebuf, cdef_worker->colbuf, + cdef_worker->srcbuf, cur_fbr, + cdef_worker->cdef_init_fb_row_fn, cdef_sync, error_info); + if (cdef_worker->do_extend_border) { + for (int plane = 0; plane < num_planes; ++plane) { + const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf; + const int is_uv = plane > 0; + const int mi_high = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + const int unit_height = MI_SIZE_64X64 << mi_high; + const int v_start = cur_fbr * unit_height; + const int v_end = + AOMMIN(v_start + unit_height, ybf->crop_heights[is_uv]); + aom_extend_frame_borders_plane_row(ybf, plane, v_start, v_end); + } + } + } + error_info->setjmp = 0; + return 1; +} + +// Assigns CDEF hook function and thread data to each worker. +static void prepare_cdef_frame_workers( + AV1_COMMON *const cm, MACROBLOCKD *xd, AV1CdefWorkerData *const cdef_worker, + AVxWorkerHook hook, AVxWorker *const workers, AV1CdefSync *const cdef_sync, + int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn, + int do_extend_border) { + const int num_planes = av1_num_planes(cm); + + cdef_worker[0].srcbuf = cm->cdef_info.srcbuf; + for (int plane = 0; plane < num_planes; plane++) + cdef_worker[0].colbuf[plane] = cm->cdef_info.colbuf[plane]; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &workers[i]; + cdef_worker[i].cm = cm; + cdef_worker[i].xd = xd; + cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn; + cdef_worker[i].do_extend_border = do_extend_border; + for (int plane = 0; plane < num_planes; plane++) + cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane]; + + worker->hook = hook; + worker->data1 = cdef_sync; + worker->data2 = &cdef_worker[i]; + } +} + +// Initializes row-level parameters for CDEF frame. +void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr) { + const int num_planes = av1_num_planes(cm); + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + + // For the current filter block, its top left corner mi structure (mi_tl) + // is first accessed to check whether the top and left boundaries are + // frame boundaries. Then bottom-left and top-right mi structures are + // accessed to check whether the bottom and right boundaries + // (respectively) are frame boundaries. + // + // Note that we can't just check the bottom-right mi structure - e.g.
if + // we're at the right-hand edge of the frame but not the bottom, then + // the bottom-right mi is NULL but the bottom-left is not. + fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0; + if (fbr != nvfb - 1) + fb_info->frame_boundary[BOTTOM] = + (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0; + else + fb_info->frame_boundary[BOTTOM] = 1; + + fb_info->src = src; + fb_info->damping = cm->cdef_info.cdef_damping; + fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); + av1_zero(fb_info->dir); + av1_zero(fb_info->var); + + for (int plane = 0; plane < num_planes; plane++) { + const int stride = luma_stride >> xd->plane[plane].subsampling_x; + uint16_t *top_linebuf = &linebuf[plane][0]; + uint16_t *bot_linebuf = &linebuf[plane][nvfb * CDEF_VBORDER * stride]; + { + const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + const int top_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + const int bot_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + + if (fbr != nvfb - 1) // if (fbr != 0) // top line buffer copy + av1_cdef_copy_sb8_16( + cm, &top_linebuf[(fbr + 1) * CDEF_VBORDER * stride], stride, + xd->plane[plane].dst.buf, top_offset - CDEF_VBORDER, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + if (fbr != nvfb - 1) // bottom line buffer copy + av1_cdef_copy_sb8_16(cm, &bot_linebuf[fbr * CDEF_VBORDER * stride], + stride, xd->plane[plane].dst.buf, bot_offset, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + } + + fb_info->top_linebuf[plane] = &linebuf[plane][fbr * CDEF_VBORDER * stride]; + fb_info->bot_linebuf[plane] = + &linebuf[plane] + [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)]; + } + + cdef_row_mt_sync_write(cdef_sync, fbr); + cdef_row_mt_sync_read(cdef_sync, fbr); +} + +// Implements multi-threading for CDEF. +// Perform CDEF on input frame. +// Inputs: +// frame: Pointer to input frame buffer. +// cm: Pointer to common structure. +// xd: Pointer to common current coding block structure. +// Returns: +// Nothing will be returned. +void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd, + AV1CdefWorkerData *const cdef_worker, + AVxWorker *const workers, AV1CdefSync *const cdef_sync, + int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn, + int do_extend_border) { + YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf; + const int num_planes = av1_num_planes(cm); + + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, + num_planes); + + reset_cdef_job_info(cdef_sync); + prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook, + workers, cdef_sync, num_workers, + cdef_init_fb_row_fn, do_extend_border); + launch_cdef_workers(workers, num_workers); + sync_cdef_workers(workers, cm, num_workers); +} + +int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm) { + // No additional top-right delay when intraBC tool is not enabled. + if (!av1_allow_intrabc(cm)) return 0; + // Due to the hardware constraints on processing the intraBC tool with row + // multithreading, a top-right delay of 3 superblocks of size 128x128 or 5 + // superblocks of size 64x64 is mandated. However, a minimum top-right delay + // of 1 superblock is assured with 'sync_range'. Hence return only the + // additional superblock delay when the intraBC tool is enabled. + return cm->seq_params->sb_size == BLOCK_128X128 ? 
2 : 4; +} diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h new file mode 100644 index 0000000000..675687dc98 --- /dev/null +++ b/third_party/aom/av1/common/thread_common.h @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_ +#define AOM_AV1_COMMON_THREAD_COMMON_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_loopfilter.h" +#include "av1/common/cdef.h" +#include "aom_util/aom_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; + +typedef struct AV1LfMTInfo { + int mi_row; + int plane; + int dir; + int lpf_opt_level; +} AV1LfMTInfo; + +// Loopfilter row synchronization +typedef struct AV1LfSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_[MAX_MB_PLANE]; + pthread_cond_t *cond_[MAX_MB_PLANE]; +#endif + // Allocate memory to store the loop-filtered superblock index in each row. + int *cur_sb_col[MAX_MB_PLANE]; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. + int sync_range; + int rows; + + // Row-based parallel loopfilter data + LFWorkerData *lfdata; + int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + AV1LfMTInfo *job_queue; + int jobs_enqueued; + int jobs_dequeued; + + // Initialized to false, set to true by the worker thread that encounters an + // error in order to abort the processing of other worker threads. + bool lf_mt_exit; +} AV1LfSync; + +typedef struct AV1LrMTInfo { + int v_start; + int v_end; + int lr_unit_row; + int plane; + int sync_mode; + int v_copy_start; + int v_copy_end; +} AV1LrMTInfo; + +typedef struct LoopRestorationWorkerData { + int32_t *rst_tmpbuf; + void *rlbs; + void *lr_ctxt; + int do_extend_border; + struct aom_internal_error_info error_info; +} LRWorkerData; + +// Looprestoration row synchronization +typedef struct AV1LrSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_[MAX_MB_PLANE]; + pthread_cond_t *cond_[MAX_MB_PLANE]; +#endif + // Allocate memory to store the loop-restoration block index in each row. + int *cur_sb_col[MAX_MB_PLANE]; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. + int sync_range; + int rows; + int num_planes; + + int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + // Row-based parallel loopfilter data + LRWorkerData *lrworkerdata; + + AV1LrMTInfo *job_queue; + int jobs_enqueued; + int jobs_dequeued; + // Initialized to false, set to true by the worker thread that encounters + // an error in order to abort the processing of other worker threads. 
+ bool lr_mt_exit; +} AV1LrSync; + +typedef struct AV1CdefWorker { + AV1_COMMON *cm; + MACROBLOCKD *xd; + uint16_t *colbuf[MAX_MB_PLANE]; + uint16_t *srcbuf; + uint16_t *linebuf[MAX_MB_PLANE]; + cdef_init_fb_row_t cdef_init_fb_row_fn; + int do_extend_border; + struct aom_internal_error_info error_info; +} AV1CdefWorkerData; + +typedef struct AV1CdefRowSync { +#if CONFIG_MULTITHREAD + pthread_mutex_t *row_mutex_; + pthread_cond_t *row_cond_; +#endif // CONFIG_MULTITHREAD + int is_row_done; +} AV1CdefRowSync; + +// Data related to CDEF search multi-thread synchronization. +typedef struct AV1CdefSyncData { +#if CONFIG_MULTITHREAD + // Mutex lock used while dispatching jobs. + pthread_mutex_t *mutex_; +#endif // CONFIG_MULTITHREAD + // Data related to CDEF row mt sync information + AV1CdefRowSync *cdef_row_mt; + // Flag to indicate all blocks are processed and end of frame is reached + int end_of_frame; + // Row index in units of 64x64 block + int fbr; + // Column index in units of 64x64 block + int fbc; + // Initialized to false, set to true by the worker thread that encounters + // an error in order to abort the processing of other worker threads. + bool cdef_mt_exit; +} AV1CdefSync; + +void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd, + AV1CdefWorkerData *const cdef_worker, + AVxWorker *const workers, AV1CdefSync *const cdef_sync, + int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn, + int do_extend_border); +void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); +void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, + int dstride, const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, int hsize); +void av1_cdef_copy_sb8_16_lowbd(uint16_t *const dst, int dstride, + const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, + int hsize); +void av1_cdef_copy_sb8_16_highbd(uint16_t *const dst, int dstride, + const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, + int hsize); +void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync, + int num_workers); +void av1_free_cdef_sync(AV1CdefSync *cdef_sync); + +// Deallocate loopfilter synchronization related mutex and data. 
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync); +void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, + int width, int num_workers); + +void av1_set_vert_loop_filter_done(AV1_COMMON *cm, AV1LfSync *lf_sync, + int num_mis_in_lpf_unit_height_log2); + +void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, + struct macroblockd *xd, int plane_start, + int plane_end, int partial_frame, + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync, int lpf_opt_level); + +void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int optimized_lr, AVxWorker *workers, + int num_workers, AV1LrSync *lr_sync, + void *lr_ctxt, int do_extend_border); +void av1_loop_restoration_dealloc(AV1LrSync *lr_sync); +void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm, + int num_workers, int num_rows_lr, + int num_planes, int width); +int av1_get_intrabc_extra_top_right_sb_delay(const AV1_COMMON *cm); + +void av1_thread_loop_filter_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane, + int dir, int lpf_opt_level, AV1LfSync *const lf_sync, + struct aom_internal_error_info *error_info, + AV1_DEBLOCKING_PARAMETERS *params_buf, TX_SIZE *tx_buf, int mib_size_log2); + +static AOM_FORCE_INLINE bool skip_loop_filter_plane( + const int planes_to_lf[MAX_MB_PLANE], int plane, int lpf_opt_level) { + // If LPF_PICK_METHOD is LPF_PICK_FROM_Q, we have the option to filter both + // chroma planes together + if (lpf_opt_level == 2) { + if (plane == AOM_PLANE_Y) { + return !planes_to_lf[plane]; + } + if (plane == AOM_PLANE_U) { + // U and V are handled together + return !planes_to_lf[1] && !planes_to_lf[2]; + } + assert(plane == AOM_PLANE_V); + if (plane == AOM_PLANE_V) { + // V is handled when u is filtered + return true; + } + } + + // Normal operation mode + return !planes_to_lf[plane]; +} + +static AOM_INLINE void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop, + const int planes_to_lf[MAX_MB_PLANE], + int lpf_opt_level, + int num_mis_in_lpf_unit_height) { + int mi_row, plane, dir; + AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; + lf_sync->jobs_enqueued = 0; + lf_sync->jobs_dequeued = 0; + + // Launch all vertical jobs first, as they are blocking the horizontal ones. + // Launch top row jobs for all planes first, in case the output can be + // partially reconstructed row by row. 
+ for (dir = 0; dir < 2; ++dir) { + for (mi_row = start; mi_row < stop; mi_row += num_mis_in_lpf_unit_height) { + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + if (skip_loop_filter_plane(planes_to_lf, plane, lpf_opt_level)) { + continue; + } + if (!planes_to_lf[plane]) continue; + lf_job_queue->mi_row = mi_row; + lf_job_queue->plane = plane; + lf_job_queue->dir = dir; + lf_job_queue->lpf_opt_level = lpf_opt_level; + lf_job_queue++; + lf_sync->jobs_enqueued++; + } + } + } +} + +static AOM_INLINE void loop_filter_frame_mt_init( + AV1_COMMON *cm, int start_mi_row, int end_mi_row, + const int planes_to_lf[MAX_MB_PLANE], int num_workers, AV1LfSync *lf_sync, + int lpf_opt_level, int num_mis_in_lpf_unit_height_log2) { + // Number of superblock rows + const int sb_rows = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, num_mis_in_lpf_unit_height_log2); + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + av1_loop_filter_dealloc(lf_sync); + av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + lf_sync->lf_mt_exit = false; + + // Initialize cur_sb_col to -1 for all SB rows. + for (int i = 0; i < MAX_MB_PLANE; i++) { + memset(lf_sync->cur_sb_col[i], -1, + sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows); + } + + enqueue_lf_jobs(lf_sync, start_mi_row, end_mi_row, planes_to_lf, + lpf_opt_level, (1 << num_mis_in_lpf_unit_height_log2)); +} + +static AOM_INLINE AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) { + AV1LfMTInfo *cur_job_info = NULL; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lf_sync->job_mutex); + + if (!lf_sync->lf_mt_exit && lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) { + cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued; + lf_sync->jobs_dequeued++; + } + + pthread_mutex_unlock(lf_sync->job_mutex); +#else + (void)lf_sync; +#endif + + return cur_job_info; +} + +static AOM_INLINE void loop_filter_data_reset(LFWorkerData *lf_data, + YV12_BUFFER_CONFIG *frame_buffer, + struct AV1Common *cm, + MACROBLOCKD *xd) { + struct macroblockd_plane *pd = xd->plane; + lf_data->frame_buffer = frame_buffer; + lf_data->cm = cm; + lf_data->xd = xd; + for (int i = 0; i < MAX_MB_PLANE; i++) { + memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst)); + lf_data->planes[i].subsampling_x = pd[i].subsampling_x; + lf_data->planes[i].subsampling_y = pd[i].subsampling_y; + } +} + +static AOM_INLINE void set_planes_to_loop_filter(const struct loopfilter *lf, + int planes_to_lf[MAX_MB_PLANE], + int plane_start, + int plane_end) { + // For each luma and chroma plane, whether to filter it or not. + planes_to_lf[0] = (lf->filter_level[0] || lf->filter_level[1]) && + plane_start <= 0 && 0 < plane_end; + planes_to_lf[1] = lf->filter_level_u && plane_start <= 1 && 1 < plane_end; + planes_to_lf[2] = lf->filter_level_v && plane_start <= 2 && 2 < plane_end; +} + +static AOM_INLINE int check_planes_to_loop_filter( + const struct loopfilter *lf, int planes_to_lf[MAX_MB_PLANE], + int plane_start, int plane_end) { + set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); + // If the luma plane is purposely not filtered, neither are the chroma + // planes. + if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return 0; + // Early exit. 
+ if (!planes_to_lf[0] && !planes_to_lf[1] && !planes_to_lf[2]) return 0; + return 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_THREAD_COMMON_H_ diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c new file mode 100644 index 0000000000..b964f259b8 --- /dev/null +++ b/third_party/aom/av1/common/tile_common.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdbool.h> + +#include "av1/common/av1_common_int.h" +#include "av1/common/resize.h" +#include "av1/common/tile_common.h" +#include "aom_dsp/aom_dsp_common.h" + +void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) { + av1_tile_set_row(tile, cm, row); + av1_tile_set_col(tile, cm, col); +} + +// Find smallest k>=0 such that (blk_size << k) >= target +static int tile_log2(int blk_size, int target) { + int k; + for (k = 0; (blk_size << k) < target; k++) { + } + return k; +} + +void av1_get_tile_limits(AV1_COMMON *const cm) { + const SequenceHeader *const seq_params = cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + const int sb_cols = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); + const int sb_rows = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); + + const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2; + tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2; + +#if CONFIG_CWG_C013 + bool use_level_7_above = false; + for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { + if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_7_0 && + seq_params->seq_level_idx[i] <= SEQ_LEVEL_8_3) { + // Currently it is assumed that levels 7.x and 8.x are either used for all + // operating points, or none of them. + if (i != 0 && !use_level_7_above) { + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Either all the operating points are levels 7.x or " + "8.x, or none of them are."); + } + use_level_7_above = true; + } + } + const int max_tile_area_sb = + (use_level_7_above ?
MAX_TILE_AREA_LEVEL_7_AND_ABOVE : MAX_TILE_AREA) >> + (2 * sb_size_log2); +#else + const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); +#endif + + tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols); + tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS)); + tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS)); + tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows); + tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols); +} + +void av1_calculate_tile_cols(const SequenceHeader *const seq_params, + int cm_mi_rows, int cm_mi_cols, + CommonTileParams *const tiles) { + int sb_cols = CEIL_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2); + int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int i; + + // This will be overridden if there is at least two columns of tiles + // (otherwise there is no inner tile width) + tiles->min_inner_width = -1; + + if (tiles->uniform_spacing) { + int start_sb; + int size_sb = CEIL_POWER_OF_TWO(sb_cols, tiles->log2_cols); + assert(size_sb > 0); + for (i = 0, start_sb = 0; start_sb < sb_cols; i++) { + tiles->col_start_sb[i] = start_sb; + start_sb += size_sb; + } + tiles->cols = i; + tiles->col_start_sb[i] = sb_cols; + tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0); + tiles->max_height_sb = sb_rows >> tiles->min_log2_rows; + + tiles->width = size_sb << seq_params->mib_size_log2; + tiles->width = AOMMIN(tiles->width, cm_mi_cols); + if (tiles->cols > 1) { + tiles->min_inner_width = tiles->width; + } + } else { + int max_tile_area_sb = (sb_rows * sb_cols); + int widest_tile_sb = 1; + int narrowest_inner_tile_sb = 65536; + tiles->log2_cols = tile_log2(1, tiles->cols); + for (i = 0; i < tiles->cols; i++) { + int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + widest_tile_sb = AOMMAX(widest_tile_sb, size_sb); + // ignore the rightmost tile in frame for determining the narrowest + if (i < tiles->cols - 1) + narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb); + } + if (tiles->min_log2) { + max_tile_area_sb >>= (tiles->min_log2 + 1); + } + tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); + if (tiles->cols > 1) { + tiles->min_inner_width = narrowest_inner_tile_sb + << seq_params->mib_size_log2; + } + } +} + +void av1_calculate_tile_rows(const SequenceHeader *const seq_params, + int cm_mi_rows, CommonTileParams *const tiles) { + int sb_rows = CEIL_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int start_sb, size_sb, i; + + if (tiles->uniform_spacing) { + size_sb = CEIL_POWER_OF_TWO(sb_rows, tiles->log2_rows); + assert(size_sb > 0); + for (i = 0, start_sb = 0; start_sb < sb_rows; i++) { + tiles->row_start_sb[i] = start_sb; + start_sb += size_sb; + } + tiles->rows = i; + tiles->row_start_sb[i] = sb_rows; + + tiles->height = size_sb << seq_params->mib_size_log2; + tiles->height = AOMMIN(tiles->height, cm_mi_rows); + } else { + tiles->log2_rows = tile_log2(1, tiles->rows); + } +} + +void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { + assert(row < cm->tiles.rows); + int mi_row_start = cm->tiles.row_start_sb[row] + << cm->seq_params->mib_size_log2; + int mi_row_end = cm->tiles.row_start_sb[row + 1] + << cm->seq_params->mib_size_log2; + tile->tile_row = row; + tile->mi_row_start = mi_row_start; + tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows); + assert(tile->mi_row_end > tile->mi_row_start); +} + +void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { 
+ assert(col < cm->tiles.cols); + int mi_col_start = cm->tiles.col_start_sb[col] + << cm->seq_params->mib_size_log2; + int mi_col_end = cm->tiles.col_start_sb[col + 1] + << cm->seq_params->mib_size_log2; + tile->tile_col = col; + tile->mi_col_start = mi_col_start; + tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols); + assert(tile->mi_col_end > tile->mi_col_start); +} + +int av1_get_sb_rows_in_tile(const AV1_COMMON *cm, const TileInfo *tile) { + return CEIL_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start, + cm->seq_params->mib_size_log2); +} + +int av1_get_sb_cols_in_tile(const AV1_COMMON *cm, const TileInfo *tile) { + return CEIL_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start, + cm->seq_params->mib_size_log2); +} + +PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, + int is_uv) { + PixelRect r; + + // Calculate position in the Y plane + r.left = tile_info->mi_col_start * MI_SIZE; + r.right = tile_info->mi_col_end * MI_SIZE; + r.top = tile_info->mi_row_start * MI_SIZE; + r.bottom = tile_info->mi_row_end * MI_SIZE; + + // If upscaling is enabled, the tile limits need scaling to match the + // upscaled frame where the restoration units live. To do this, scale up the + // top-left and bottom-right of the tile. + if (av1_superres_scaled(cm)) { + av1_calculate_unscaled_superres_size(&r.left, &r.top, + cm->superres_scale_denominator); + av1_calculate_unscaled_superres_size(&r.right, &r.bottom, + cm->superres_scale_denominator); + } + + const int frame_w = cm->superres_upscaled_width; + const int frame_h = cm->superres_upscaled_height; + + // Make sure we don't fall off the bottom-right of the frame. + r.right = AOMMIN(r.right, frame_w); + r.bottom = AOMMIN(r.bottom, frame_h); + + // Convert to coordinates in the appropriate plane + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + + r.left = ROUND_POWER_OF_TWO(r.left, ss_x); + r.right = ROUND_POWER_OF_TWO(r.right, ss_x); + r.top = ROUND_POWER_OF_TWO(r.top, ss_y); + r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y); + + return r; +} + +void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { + const CommonTileParams *const tiles = &cm->tiles; + if (tiles->uniform_spacing) { + *w = tiles->width; + *h = tiles->height; + } else { + for (int i = 0; i < tiles->cols; ++i) { + const int tile_width_sb = + tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + const int tile_w = tile_width_sb * cm->seq_params->mib_size; + assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension + *w = tile_w; + } + + for (int i = 0; i < tiles->rows; ++i) { + const int tile_height_sb = + tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; + const int tile_h = tile_height_sb * cm->seq_params->mib_size; + assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension + *h = tile_h; + } + } +} + +int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) { + // Disable check if there is a single tile col in the frame + if (cm->tiles.cols == 1) return 1; + + return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >= + (64 << av1_superres_scaled(cm))); +} diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h new file mode 100644 index 0000000000..5383ae940b --- /dev/null +++ b/third_party/aom/av1/common/tile_common.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_TILE_COMMON_H_ +#define AOM_AV1_COMMON_TILE_COMMON_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "config/aom_config.h" +#include "aom_dsp/rect.h" + +struct AV1Common; +struct SequenceHeader; +struct CommonTileParams; + +#define DEFAULT_MAX_NUM_TG 1 + +typedef struct TileInfo { + int mi_row_start, mi_row_end; + int mi_col_start, mi_col_end; + int tile_row; + int tile_col; +} TileInfo; + +// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on +// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)' +void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row, + int col); + +void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row); +void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col); + +int av1_get_sb_rows_in_tile(const struct AV1Common *cm, const TileInfo *tile); +int av1_get_sb_cols_in_tile(const struct AV1Common *cm, const TileInfo *tile); + +// Return the pixel extents of the given tile +PixelRect av1_get_tile_rect(const TileInfo *tile_info, + const struct AV1Common *cm, int is_uv); + +// Define tile maximum width and area +// There is no maximum height since height is limited by area and width limits +// The minimum tile width or height is fixed at one superblock +#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels +#define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels +#if CONFIG_CWG_C013 +#define MAX_TILE_AREA_LEVEL_7_AND_ABOVE (4096 * 4608) +#endif + +void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); +void av1_get_tile_limits(struct AV1Common *const cm); +void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params, + int cm_mi_rows, int cm_mi_cols, + struct CommonTileParams *const tiles); +void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params, + int cm_mi_rows, + struct CommonTileParams *const tiles); + +// Checks if the minimum tile_width requirement is satisfied +int av1_is_min_tile_width_satisfied(const struct AV1Common *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_TILE_COMMON_H_ diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c new file mode 100644 index 0000000000..a959cdf768 --- /dev/null +++ b/third_party/aom/av1/common/timing.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/timing.h" + +/* Tables for AV1 max bitrates for different levels of main and high tier. 
+ * The tables are in Kbps instead of Mbps in the specification. + * Note that depending on the profile, a multiplier is needed. + */ +#define UNDEFINED_RATE \ + (1 << 21) // Placeholder rate for levels with undefined rate +#define INVALID_RATE \ + (0) // For invalid profile-level configuration, set rate to 0 + +/* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps [31] */ +/* is a dummy value. The decoder model is not applicable for level 31. */ +static int32_t main_kbps[1 << LEVEL_BITS] = { + 1500, 3000, UNDEFINED_RATE, UNDEFINED_RATE, + 6000, 10000, UNDEFINED_RATE, UNDEFINED_RATE, + 12000, 20000, UNDEFINED_RATE, UNDEFINED_RATE, + 30000, 40000, 60000, 60000, + 60000, 100000, 160000, 160000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE +}; + +/* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */ +/* is a dummy value. The decoder model is not applicable for level 31. */ +static int32_t high_kbps[1 << LEVEL_BITS] = { + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + 30000, 50000, UNDEFINED_RATE, UNDEFINED_RATE, + 100000, 160000, 240000, 240000, + 240000, 480000, 800000, 800000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE +}; + +/* BitrateProfileFactor */ +static int bitrate_profile_factor[1 << PROFILE_BITS] = { + 1, 2, 3, 0, 0, 0, 0, 0 +}; + +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier) { + int64_t bitrate; + + if (seq_tier) { + bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } else { + bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } + + return bitrate * 1000; +} + +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { + decoder_model->encoder_decoder_buffer_delay_length = 16; + decoder_model->buffer_removal_time_length = 10; + decoder_model->frame_presentation_time_length = 10; +} + +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 1; + op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s + op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s + op_params->low_delay_mode_flag = 0; + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} + +void av1_set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 0; + op_params->decoder_buffer_delay = + 70000; // Resource availability mode default + op_params->encoder_buffer_delay = + 20000; // Resource availability mode default + op_params->low_delay_mode_flag = 0; // Resource availability mode default + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h new file mode 100644 index 0000000000..9192124f72 --- /dev/null +++ b/third_party/aom/av1/common/timing.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_TIMING_H_ +#define AOM_AV1_COMMON_TIMING_H_ + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" + +#define MAX_NUM_OP_POINTS 32 + +typedef struct aom_timing { + uint32_t num_units_in_display_tick; + uint32_t time_scale; + int equal_picture_interval; + uint32_t num_ticks_per_picture; +} aom_timing_info_t; + +typedef struct aom_dec_model_info { + uint32_t num_units_in_decoding_tick; + int encoder_decoder_buffer_delay_length; + int buffer_removal_time_length; + int frame_presentation_time_length; +} aom_dec_model_info_t; + +typedef struct aom_dec_model_op_parameters { + int decoder_model_param_present_flag; + int64_t bitrate; + int64_t buffer_size; + uint32_t decoder_buffer_delay; + uint32_t encoder_buffer_delay; + int low_delay_mode_flag; + int display_model_param_present_flag; + int initial_display_delay; +} aom_dec_model_op_parameters_t; + +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); + +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params); + +void av1_set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params); + +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier); + +#endif // AOM_AV1_COMMON_TIMING_H_ diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h new file mode 100644 index 0000000000..f1edda58d7 --- /dev/null +++ b/third_party/aom/av1/common/token_cdfs.h @@ -0,0 +1,3555 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_ +#define AOM_AV1_COMMON_TOKEN_CDFS_H_ + +#include "config/aom_config.h" + +#include "av1/common/entropy.h" + +static const aom_cdf_prob + av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS] + [CDF_SIZE(2)] = { + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + }; + +static const aom_cdf_prob + av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS] + [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) }, + { AOM_CDF2(5892) }, + { AOM_CDF2(12112) }, + { AOM_CDF2(21935) }, + { AOM_CDF2(20289) }, + { AOM_CDF2(27473) }, + { AOM_CDF2(32487) }, + { AOM_CDF2(7654) }, + { AOM_CDF2(19473) }, + { AOM_CDF2(29984) }, + { AOM_CDF2(9961) }, + { AOM_CDF2(30242) }, + { AOM_CDF2(32117) } }, + { { AOM_CDF2(31548) }, + { AOM_CDF2(1549) }, + { AOM_CDF2(10130) }, + { AOM_CDF2(16656) }, + { AOM_CDF2(18591) }, + { AOM_CDF2(26308) }, + { AOM_CDF2(32537) }, + { AOM_CDF2(5403) }, + { AOM_CDF2(18096) }, + { AOM_CDF2(30003) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(29957) }, + { AOM_CDF2(5391) }, + { AOM_CDF2(18039) }, + { AOM_CDF2(23566) }, + { AOM_CDF2(22431) }, + { AOM_CDF2(25822) }, + { AOM_CDF2(32197) }, + { AOM_CDF2(3778) }, + { AOM_CDF2(15336) }, + { AOM_CDF2(28981) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(17920) }, + { AOM_CDF2(1818) }, + { AOM_CDF2(7282) }, + { AOM_CDF2(25273) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(31554) }, + { AOM_CDF2(32624) }, + { AOM_CDF2(1366) }, + { AOM_CDF2(15628) }, + { AOM_CDF2(30462) }, + { AOM_CDF2(146) }, + { AOM_CDF2(5132) }, + { AOM_CDF2(31657) } }, + { { AOM_CDF2(6308) }, + { AOM_CDF2(117) }, + { AOM_CDF2(1638) }, + { AOM_CDF2(2161) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(30247) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(30371) }, + { AOM_CDF2(7570) }, + { AOM_CDF2(13155) }, + { AOM_CDF2(20751) }, + { AOM_CDF2(20969) }, + { AOM_CDF2(27067) }, + { AOM_CDF2(32013) }, + { AOM_CDF2(5495) }, + { AOM_CDF2(17942) }, + { AOM_CDF2(28280) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31782) }, + { AOM_CDF2(1836) }, + { AOM_CDF2(10689) }, + { AOM_CDF2(17604) }, + { AOM_CDF2(21622) }, + { AOM_CDF2(27518) }, + { AOM_CDF2(32399) }, + { AOM_CDF2(4419) }, + { AOM_CDF2(16294) }, + { AOM_CDF2(28345) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31901) }, + { AOM_CDF2(10311) }, + { AOM_CDF2(18047) }, + { AOM_CDF2(24806) }, + { AOM_CDF2(23288) }, + { AOM_CDF2(27914) }, + { AOM_CDF2(32296) }, + { AOM_CDF2(4215) }, + { AOM_CDF2(15756) }, + { AOM_CDF2(28341) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { 
AOM_CDF2(26726) }, + { AOM_CDF2(1045) }, + { AOM_CDF2(11703) }, + { AOM_CDF2(20590) }, + { AOM_CDF2(18554) }, + { AOM_CDF2(25970) }, + { AOM_CDF2(31938) }, + { AOM_CDF2(5583) }, + { AOM_CDF2(21313) }, + { AOM_CDF2(29390) }, + { AOM_CDF2(641) }, + { AOM_CDF2(22265) }, + { AOM_CDF2(31452) } }, + { { AOM_CDF2(26584) }, + { AOM_CDF2(188) }, + { AOM_CDF2(8847) }, + { AOM_CDF2(24519) }, + { AOM_CDF2(22938) }, + { AOM_CDF2(30583) }, + { AOM_CDF2(32608) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(29614) }, + { AOM_CDF2(9068) }, + { AOM_CDF2(12924) }, + { AOM_CDF2(19538) }, + { AOM_CDF2(17737) }, + { AOM_CDF2(24619) }, + { AOM_CDF2(30642) }, + { AOM_CDF2(4119) }, + { AOM_CDF2(16026) }, + { AOM_CDF2(25657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31957) }, + { AOM_CDF2(3230) }, + { AOM_CDF2(11153) }, + { AOM_CDF2(18123) }, + { AOM_CDF2(20143) }, + { AOM_CDF2(26536) }, + { AOM_CDF2(31986) }, + { AOM_CDF2(3050) }, + { AOM_CDF2(14603) }, + { AOM_CDF2(25155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32363) }, + { AOM_CDF2(10692) }, + { AOM_CDF2(19090) }, + { AOM_CDF2(24357) }, + { AOM_CDF2(24442) }, + { AOM_CDF2(28312) }, + { AOM_CDF2(32169) }, + { AOM_CDF2(3648) }, + { AOM_CDF2(15690) }, + { AOM_CDF2(26815) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(30669) }, + { AOM_CDF2(3832) }, + { AOM_CDF2(11663) }, + { AOM_CDF2(18889) }, + { AOM_CDF2(19782) }, + { AOM_CDF2(23313) }, + { AOM_CDF2(31330) }, + { AOM_CDF2(5124) }, + { AOM_CDF2(18719) }, + { AOM_CDF2(28468) }, + { AOM_CDF2(3082) }, + { AOM_CDF2(20982) }, + { AOM_CDF2(29443) } }, + { { AOM_CDF2(28573) }, + { AOM_CDF2(3183) }, + { AOM_CDF2(17802) }, + { AOM_CDF2(25977) }, + { AOM_CDF2(26677) }, + { AOM_CDF2(27832) }, + { AOM_CDF2(32387) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(26887) }, + { AOM_CDF2(6729) }, + { AOM_CDF2(10361) }, + { AOM_CDF2(17442) }, + { AOM_CDF2(15045) }, + { AOM_CDF2(22478) }, + { AOM_CDF2(29072) }, + { AOM_CDF2(2713) }, + { AOM_CDF2(11861) }, + { AOM_CDF2(20773) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31903) }, + { AOM_CDF2(2044) }, + { AOM_CDF2(7528) }, + { AOM_CDF2(14618) }, + { AOM_CDF2(16182) }, + { AOM_CDF2(24168) }, + { AOM_CDF2(31037) }, + { AOM_CDF2(2786) }, + { AOM_CDF2(11194) }, + { AOM_CDF2(20155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32510) }, + { AOM_CDF2(8430) }, + { AOM_CDF2(17318) }, + { AOM_CDF2(24154) }, + { AOM_CDF2(23674) }, + { AOM_CDF2(28789) }, + { AOM_CDF2(32139) }, + { AOM_CDF2(3440) }, + { AOM_CDF2(13117) }, + { AOM_CDF2(22702) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31671) }, + { AOM_CDF2(2056) }, + { AOM_CDF2(11746) }, + { AOM_CDF2(16852) }, + { AOM_CDF2(18635) }, + { AOM_CDF2(24715) }, + { AOM_CDF2(31484) }, + { AOM_CDF2(4656) }, + { AOM_CDF2(16074) }, + { AOM_CDF2(24704) }, + { AOM_CDF2(1806) }, + { AOM_CDF2(14645) }, + { AOM_CDF2(25336) } }, + { { AOM_CDF2(31539) }, + { AOM_CDF2(8433) }, + { AOM_CDF2(20576) }, + { AOM_CDF2(27904) }, + { AOM_CDF2(27852) }, + { AOM_CDF2(30026) }, + { AOM_CDF2(32441) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } } }; + +static const aom_cdf_prob + av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = { + { { { + { AOM_CDF2(16961) }, + { AOM_CDF2(17223) }, + { AOM_CDF2(7621) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(19069) }, + { AOM_CDF2(22525) }, + { AOM_CDF2(13377) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20401) }, + { AOM_CDF2(17025) }, + { AOM_CDF2(12845) }, + { AOM_CDF2(12873) }, + { AOM_CDF2(14094) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20681) }, + { AOM_CDF2(20701) }, + { AOM_CDF2(15250) }, + { AOM_CDF2(15017) }, + { AOM_CDF2(14928) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23905) }, + { AOM_CDF2(17194) }, + { AOM_CDF2(16170) }, + { AOM_CDF2(17695) }, + { AOM_CDF2(13826) }, + { AOM_CDF2(15810) }, + { AOM_CDF2(12036) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(23959) }, + { AOM_CDF2(20799) }, + { AOM_CDF2(19021) }, + { AOM_CDF2(16203) }, + { AOM_CDF2(17886) }, + { AOM_CDF2(14144) }, + { AOM_CDF2(12010) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(27399) }, + { AOM_CDF2(16327) }, + { AOM_CDF2(18071) }, + { AOM_CDF2(19584) }, + { AOM_CDF2(20721) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(19560) }, + { AOM_CDF2(10150) }, + { AOM_CDF2(8805) }, + }, + { + { AOM_CDF2(24932) }, + { AOM_CDF2(20833) }, + { AOM_CDF2(12027) }, + { AOM_CDF2(16670) }, + { AOM_CDF2(19914) }, + { AOM_CDF2(15106) }, + { AOM_CDF2(17662) }, + { AOM_CDF2(13783) }, + { AOM_CDF2(28756) }, + } }, + { { + { AOM_CDF2(23406) }, + { AOM_CDF2(21845) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(17096) }, + { AOM_CDF2(12561) }, + { AOM_CDF2(17320) }, + { AOM_CDF2(22395) }, + { AOM_CDF2(21370) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(17471) }, + { AOM_CDF2(20223) }, + { AOM_CDF2(11357) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20335) }, + { AOM_CDF2(21667) }, + { AOM_CDF2(14818) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20430) }, + { AOM_CDF2(20662) }, + { AOM_CDF2(15367) }, + { AOM_CDF2(16970) }, + { AOM_CDF2(14657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22117) }, + { AOM_CDF2(22028) }, + { AOM_CDF2(18650) }, + { AOM_CDF2(16042) }, + { AOM_CDF2(15885) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(22409) }, + { AOM_CDF2(21012) }, + { AOM_CDF2(15650) }, + { AOM_CDF2(17395) }, + { AOM_CDF2(15469) }, + { AOM_CDF2(20205) }, + { AOM_CDF2(19511) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(24220) }, + { AOM_CDF2(22480) }, + { AOM_CDF2(17737) 
}, + { AOM_CDF2(18916) }, + { AOM_CDF2(19268) }, + { AOM_CDF2(18412) }, + { AOM_CDF2(18844) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(25991) }, + { AOM_CDF2(20314) }, + { AOM_CDF2(17731) }, + { AOM_CDF2(19678) }, + { AOM_CDF2(18649) }, + { AOM_CDF2(17307) }, + { AOM_CDF2(21798) }, + { AOM_CDF2(17549) }, + { AOM_CDF2(15630) }, + }, + { + { AOM_CDF2(26585) }, + { AOM_CDF2(21469) }, + { AOM_CDF2(20432) }, + { AOM_CDF2(17735) }, + { AOM_CDF2(19280) }, + { AOM_CDF2(15235) }, + { AOM_CDF2(20297) }, + { AOM_CDF2(22471) }, + { AOM_CDF2(28997) }, + } }, + { { + { AOM_CDF2(26605) }, + { AOM_CDF2(11304) }, + { AOM_CDF2(16726) }, + { AOM_CDF2(16560) }, + { AOM_CDF2(20866) }, + { AOM_CDF2(23524) }, + { AOM_CDF2(19878) }, + { AOM_CDF2(13469) }, + { AOM_CDF2(23084) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(18983) }, + { AOM_CDF2(20512) }, + { AOM_CDF2(14885) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20090) }, + { AOM_CDF2(19444) }, + { AOM_CDF2(17286) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19139) }, + { AOM_CDF2(21487) }, + { AOM_CDF2(18959) }, + { AOM_CDF2(20910) }, + { AOM_CDF2(19089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20536) }, + { AOM_CDF2(20664) }, + { AOM_CDF2(20625) }, + { AOM_CDF2(19123) }, + { AOM_CDF2(14862) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19833) }, + { AOM_CDF2(21502) }, + { AOM_CDF2(17485) }, + { AOM_CDF2(20267) }, + { AOM_CDF2(18353) }, + { AOM_CDF2(23329) }, + { AOM_CDF2(21478) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22041) }, + { AOM_CDF2(23434) }, + { AOM_CDF2(20001) }, + { AOM_CDF2(20554) }, + { AOM_CDF2(20951) }, + { AOM_CDF2(20145) }, + { AOM_CDF2(15562) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23312) }, + { AOM_CDF2(21607) }, + { AOM_CDF2(16526) }, + { AOM_CDF2(18957) }, + { AOM_CDF2(18034) }, + { AOM_CDF2(18934) }, + { AOM_CDF2(24247) }, + { AOM_CDF2(16921) }, + { AOM_CDF2(17080) }, + }, + { + { AOM_CDF2(26579) }, + { AOM_CDF2(24910) }, + { AOM_CDF2(18637) }, + { AOM_CDF2(19800) }, + { AOM_CDF2(20388) }, + { AOM_CDF2(9887) }, + { AOM_CDF2(15642) }, + { AOM_CDF2(30198) }, + { AOM_CDF2(24721) }, + } }, + { { + { AOM_CDF2(26998) }, + { AOM_CDF2(16737) }, + { AOM_CDF2(17838) }, + { AOM_CDF2(18922) }, + { AOM_CDF2(19515) }, + { AOM_CDF2(18636) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(15776) }, + { AOM_CDF2(22658) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(20177) }, + { AOM_CDF2(20789) }, + { AOM_CDF2(20262) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(21416) }, + { AOM_CDF2(20855) }, + { AOM_CDF2(23410) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20238) }, + { AOM_CDF2(21057) }, + { AOM_CDF2(19159) }, + { AOM_CDF2(22337) }, + { AOM_CDF2(20159) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20125) }, + { AOM_CDF2(20559) }, + { AOM_CDF2(21707) }, + { AOM_CDF2(22296) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19941) }, + { AOM_CDF2(20527) }, + { AOM_CDF2(21470) }, + { AOM_CDF2(22487) }, + { AOM_CDF2(19558) }, + { AOM_CDF2(22354) }, + { AOM_CDF2(20331) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22752) }, + { AOM_CDF2(25006) }, + { AOM_CDF2(22075) }, + { AOM_CDF2(21576) }, + { AOM_CDF2(17740) }, + { AOM_CDF2(21690) }, + { AOM_CDF2(19211) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(21442) }, + { AOM_CDF2(22358) }, + { AOM_CDF2(18503) }, + { AOM_CDF2(20291) }, + { AOM_CDF2(19945) }, + { AOM_CDF2(21294) }, + { AOM_CDF2(21178) }, + { AOM_CDF2(19400) }, + { AOM_CDF2(10556) }, + }, + { + { AOM_CDF2(24648) }, + { AOM_CDF2(24949) }, + { AOM_CDF2(20708) }, + { AOM_CDF2(23905) }, + { AOM_CDF2(20501) }, + { AOM_CDF2(9558) }, + { AOM_CDF2(9423) }, + { AOM_CDF2(30365) }, + { AOM_CDF2(19253) }, + } }, + { { + { AOM_CDF2(26064) }, + { AOM_CDF2(22098) }, + { AOM_CDF2(19613) }, + { AOM_CDF2(20525) }, + { AOM_CDF2(17595) }, + { AOM_CDF2(16618) }, + { AOM_CDF2(20497) }, + { AOM_CDF2(18989) }, + { AOM_CDF2(15513) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) }, + { AOM_CDF5(370, 671, 1883, 4471) } }, + { { AOM_CDF5(3247, 4950, 9688, 14563) }, + { AOM_CDF5(1904, 3354, 7763, 14647) } } }, + { { { AOM_CDF5(2125, 2551, 5165, 8946) }, + { AOM_CDF5(513, 765, 1859, 6339) } }, + { { AOM_CDF5(7637, 9498, 14259, 19108) }, + { AOM_CDF5(2497, 4096, 8866, 16993) } } }, + { { { AOM_CDF5(4016, 4897, 8881, 14968) }, + { AOM_CDF5(716, 1105, 2646, 10056) } }, + { { AOM_CDF5(11139, 13270, 18241, 23566) }, + { AOM_CDF5(3192, 5032, 10297, 19755) } } }, + { { { AOM_CDF5(6708, 8958, 14746, 22133) }, + { AOM_CDF5(1222, 2074, 4783, 15410) } }, + { { AOM_CDF5(19575, 21766, 26044, 29709) }, + { AOM_CDF5(7297, 10767, 19273, 28194) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) }, + { AOM_CDF6(210, 405, 1315, 3326, 7537) } }, + { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) }, + { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } }, + { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) }, + { AOM_CDF6(313, 441, 1099, 2917, 8562) } }, + { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) }, + { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } }, + { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) }, + { AOM_CDF6(574, 821, 1836, 5089, 13128) } }, + { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) }, + { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } }, + { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) }, + { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } }, + { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) }, + { AOM_CDF6(7699, 10897, 
20891, 26926, 31628) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) }, + { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } }, + { { AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) }, + { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } }, + { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) }, + { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } }, + { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) }, + { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } }, + { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) }, + { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } }, + { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) }, + { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } }, + { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) }, + { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } }, + { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) }, + { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 8)] = { + { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) }, + { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } }, + { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) }, + { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } }, + { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) }, + { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } }, + { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) }, + { AOM_CDF8(2310, 4160, 7471, 14997, 17931, 20768, 30240) } } }, + { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) }, + { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } }, + { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) }, + { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } }, + { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) }, + { AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } }, + { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) }, + { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 9)] = { + { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) }, + { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } }, + { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) }, + { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } }, + { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) }, + { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } }, + { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) }, + { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } }, + { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) }, + { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } }, + { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) }, + { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842, + 32708) } } }, + { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) }, + { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } }, + { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) }, + { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403, + 32695) } } } + }; + +static const aom_cdf_prob + 
av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788, + 23412, 26061) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919, + 26129, 29140) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590, + 24584, 28749) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478, + 28396, 31811) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267, + 28410, 31078) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812, + 27300, 29219, 32114) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456, + 31142, 32060) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716, + 30073, 30820, 31956) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047, + 22571, 25830) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354, + 27255, 28546, 31784) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851, + 21856, 25692, 28034) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527, + 28027, 28377, 30876) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155, + 26682, 29229, 31045) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601, + 25483, 25843, 32056) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434, + 29326, 31082, 32050) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913, + 29486, 29724, 29807, 32570) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } } }; + +static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)] = { + { { { { AOM_CDF4(14298, 20718, 24174) }, + { AOM_CDF4(12536, 19601, 23789) }, + { AOM_CDF4(8712, 15051, 19503) }, + { AOM_CDF4(6170, 11327, 15434) }, + { AOM_CDF4(4742, 8926, 12538) }, + { AOM_CDF4(3803, 7317, 10546) }, + { AOM_CDF4(1696, 3317, 4871) }, + { AOM_CDF4(14392, 19951, 22756) }, + { AOM_CDF4(15978, 23218, 26818) }, + { AOM_CDF4(12187, 19474, 23889) }, + { AOM_CDF4(9176, 15640, 20259) }, + { AOM_CDF4(7068, 12655, 17028) }, + { AOM_CDF4(5656, 10442, 
14472) }, + { AOM_CDF4(2580, 4992, 7244) }, + { AOM_CDF4(12136, 18049, 21426) }, + { AOM_CDF4(13784, 20721, 24481) }, + { AOM_CDF4(10836, 17621, 21900) }, + { AOM_CDF4(8372, 14444, 18847) }, + { AOM_CDF4(6523, 11779, 16000) }, + { AOM_CDF4(5337, 9898, 13760) }, + { AOM_CDF4(3034, 5860, 8462) } }, + { { AOM_CDF4(15967, 22905, 26286) }, + { AOM_CDF4(13534, 20654, 24579) }, + { AOM_CDF4(9504, 16092, 20535) }, + { AOM_CDF4(6975, 12568, 16903) }, + { AOM_CDF4(5364, 10091, 14020) }, + { AOM_CDF4(4357, 8370, 11857) }, + { AOM_CDF4(2506, 4934, 7218) }, + { AOM_CDF4(23032, 28815, 30936) }, + { AOM_CDF4(19540, 26704, 29719) }, + { AOM_CDF4(15158, 22969, 27097) }, + { AOM_CDF4(11408, 18865, 23650) }, + { AOM_CDF4(8885, 15448, 20250) }, + { AOM_CDF4(7108, 12853, 17416) }, + { AOM_CDF4(4231, 8041, 11480) }, + { AOM_CDF4(19823, 26490, 29156) }, + { AOM_CDF4(18890, 25929, 28932) }, + { AOM_CDF4(15660, 23491, 27433) }, + { AOM_CDF4(12147, 19776, 24488) }, + { AOM_CDF4(9728, 16774, 21649) }, + { AOM_CDF4(7919, 14277, 19066) }, + { AOM_CDF4(5440, 10170, 14185) } } }, + { { { AOM_CDF4(14406, 20862, 24414) }, + { AOM_CDF4(11824, 18907, 23109) }, + { AOM_CDF4(8257, 14393, 18803) }, + { AOM_CDF4(5860, 10747, 14778) }, + { AOM_CDF4(4475, 8486, 11984) }, + { AOM_CDF4(3606, 6954, 10043) }, + { AOM_CDF4(1736, 3410, 5048) }, + { AOM_CDF4(14430, 20046, 22882) }, + { AOM_CDF4(15593, 22899, 26709) }, + { AOM_CDF4(12102, 19368, 23811) }, + { AOM_CDF4(9059, 15584, 20262) }, + { AOM_CDF4(6999, 12603, 17048) }, + { AOM_CDF4(5684, 10497, 14553) }, + { AOM_CDF4(2822, 5438, 7862) }, + { AOM_CDF4(15785, 21585, 24359) }, + { AOM_CDF4(18347, 25229, 28266) }, + { AOM_CDF4(14974, 22487, 26389) }, + { AOM_CDF4(11423, 18681, 23271) }, + { AOM_CDF4(8863, 15350, 20008) }, + { AOM_CDF4(7153, 12852, 17278) }, + { AOM_CDF4(3707, 7036, 9982) } }, + { { AOM_CDF4(15460, 21696, 25469) }, + { AOM_CDF4(12170, 19249, 23191) }, + { AOM_CDF4(8723, 15027, 19332) }, + { AOM_CDF4(6428, 11704, 15874) }, + { AOM_CDF4(4922, 9292, 13052) }, + { AOM_CDF4(4139, 7695, 11010) }, + { AOM_CDF4(2291, 4508, 6598) }, + { AOM_CDF4(19856, 26920, 29828) }, + { AOM_CDF4(17923, 25289, 28792) }, + { AOM_CDF4(14278, 21968, 26297) }, + { AOM_CDF4(10910, 18136, 22950) }, + { AOM_CDF4(8423, 14815, 19627) }, + { AOM_CDF4(6771, 12283, 16774) }, + { AOM_CDF4(4074, 7750, 11081) }, + { AOM_CDF4(19852, 26074, 28672) }, + { AOM_CDF4(19371, 26110, 28989) }, + { AOM_CDF4(16265, 23873, 27663) }, + { AOM_CDF4(12758, 20378, 24952) }, + { AOM_CDF4(10095, 17098, 21961) }, + { AOM_CDF4(8250, 14628, 19451) }, + { AOM_CDF4(5205, 9745, 13622) } } }, + { { { AOM_CDF4(10563, 16233, 19763) }, + { AOM_CDF4(9794, 16022, 19804) }, + { AOM_CDF4(6750, 11945, 15759) }, + { AOM_CDF4(4963, 9186, 12752) }, + { AOM_CDF4(3845, 7435, 10627) }, + { AOM_CDF4(3051, 6085, 8834) }, + { AOM_CDF4(1311, 2596, 3830) }, + { AOM_CDF4(11246, 16404, 19689) }, + { AOM_CDF4(12315, 18911, 22731) }, + { AOM_CDF4(10557, 17095, 21289) }, + { AOM_CDF4(8136, 14006, 18249) }, + { AOM_CDF4(6348, 11474, 15565) }, + { AOM_CDF4(5196, 9655, 13400) }, + { AOM_CDF4(2349, 4526, 6587) }, + { AOM_CDF4(13337, 18730, 21569) }, + { AOM_CDF4(19306, 26071, 28882) }, + { AOM_CDF4(15952, 23540, 27254) }, + { AOM_CDF4(12409, 19934, 24430) }, + { AOM_CDF4(9760, 16706, 21389) }, + { AOM_CDF4(8004, 14220, 18818) }, + { AOM_CDF4(4138, 7794, 10961) } }, + { { AOM_CDF4(10870, 16684, 20949) }, + { AOM_CDF4(9664, 15230, 18680) }, + { AOM_CDF4(6886, 12109, 15408) }, + { AOM_CDF4(4825, 8900, 12305) }, + { AOM_CDF4(3630, 7162, 10314) }, + { 
AOM_CDF4(3036, 6429, 9387) }, + { AOM_CDF4(1671, 3296, 4940) }, + { AOM_CDF4(13819, 19159, 23026) }, + { AOM_CDF4(11984, 19108, 23120) }, + { AOM_CDF4(10690, 17210, 21663) }, + { AOM_CDF4(7984, 14154, 18333) }, + { AOM_CDF4(6868, 12294, 16124) }, + { AOM_CDF4(5274, 8994, 12868) }, + { AOM_CDF4(2988, 5771, 8424) }, + { AOM_CDF4(19736, 26647, 29141) }, + { AOM_CDF4(18933, 26070, 28984) }, + { AOM_CDF4(15779, 23048, 27200) }, + { AOM_CDF4(12638, 20061, 24532) }, + { AOM_CDF4(10692, 17545, 22220) }, + { AOM_CDF4(9217, 15251, 20054) }, + { AOM_CDF4(5078, 9284, 12594) } } }, + { { { AOM_CDF4(2331, 3662, 5244) }, + { AOM_CDF4(2891, 4771, 6145) }, + { AOM_CDF4(4598, 7623, 9729) }, + { AOM_CDF4(3520, 6845, 9199) }, + { AOM_CDF4(3417, 6119, 9324) }, + { AOM_CDF4(2601, 5412, 7385) }, + { AOM_CDF4(600, 1173, 1744) }, + { AOM_CDF4(7672, 13286, 17469) }, + { AOM_CDF4(4232, 7792, 10793) }, + { AOM_CDF4(2915, 5317, 7397) }, + { AOM_CDF4(2318, 4356, 6152) }, + { AOM_CDF4(2127, 4000, 5554) }, + { AOM_CDF4(1850, 3478, 5275) }, + { AOM_CDF4(977, 1933, 2843) }, + { AOM_CDF4(18280, 24387, 27989) }, + { AOM_CDF4(15852, 22671, 26185) }, + { AOM_CDF4(13845, 20951, 24789) }, + { AOM_CDF4(11055, 17966, 22129) }, + { AOM_CDF4(9138, 15422, 19801) }, + { AOM_CDF4(7454, 13145, 17456) }, + { AOM_CDF4(3370, 6393, 9013) } }, + { { AOM_CDF4(5842, 9229, 10838) }, + { AOM_CDF4(2313, 3491, 4276) }, + { AOM_CDF4(2998, 6104, 7496) }, + { AOM_CDF4(2420, 7447, 9868) }, + { AOM_CDF4(3034, 8495, 10923) }, + { AOM_CDF4(4076, 8937, 10975) }, + { AOM_CDF4(1086, 2370, 3299) }, + { AOM_CDF4(9714, 17254, 20444) }, + { AOM_CDF4(8543, 13698, 17123) }, + { AOM_CDF4(4918, 9007, 11910) }, + { AOM_CDF4(4129, 7532, 10553) }, + { AOM_CDF4(2364, 5533, 8058) }, + { AOM_CDF4(1834, 3546, 5563) }, + { AOM_CDF4(1473, 2908, 4133) }, + { AOM_CDF4(15405, 21193, 25619) }, + { AOM_CDF4(15691, 21952, 26561) }, + { AOM_CDF4(12962, 19194, 24165) }, + { AOM_CDF4(10272, 17855, 22129) }, + { AOM_CDF4(8588, 15270, 20718) }, + { AOM_CDF4(8682, 14669, 19500) }, + { AOM_CDF4(4870, 9636, 13205) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { 
AOM_CDF4(14995, 21341, 24749) }, + { AOM_CDF4(13158, 20289, 24601) }, + { AOM_CDF4(8941, 15326, 19876) }, + { AOM_CDF4(6297, 11541, 15807) }, + { AOM_CDF4(4817, 9029, 12776) }, + { AOM_CDF4(3731, 7273, 10627) }, + { AOM_CDF4(1847, 3617, 5354) }, + { AOM_CDF4(14472, 19659, 22343) }, + { AOM_CDF4(16806, 24162, 27533) }, + { AOM_CDF4(12900, 20404, 24713) }, + { AOM_CDF4(9411, 16112, 20797) }, + { AOM_CDF4(7056, 12697, 17148) }, + { AOM_CDF4(5544, 10339, 14460) }, + { AOM_CDF4(2954, 5704, 8319) }, + { AOM_CDF4(12464, 18071, 21354) }, + { AOM_CDF4(15482, 22528, 26034) }, + { AOM_CDF4(12070, 19269, 23624) }, + { AOM_CDF4(8953, 15406, 20106) }, + { AOM_CDF4(7027, 12730, 17220) }, + { AOM_CDF4(5887, 10913, 15140) }, + { AOM_CDF4(3793, 7278, 10447) } }, + { { AOM_CDF4(15571, 22232, 25749) }, + { AOM_CDF4(14506, 21575, 25374) }, + { AOM_CDF4(10189, 17089, 21569) }, + { AOM_CDF4(7316, 13301, 17915) }, + { AOM_CDF4(5783, 10912, 15190) }, + { AOM_CDF4(4760, 9155, 13088) }, + { AOM_CDF4(2993, 5966, 8774) }, + { AOM_CDF4(23424, 28903, 30778) }, + { AOM_CDF4(20775, 27666, 30290) }, + { AOM_CDF4(16474, 24410, 28299) }, + { AOM_CDF4(12471, 20180, 24987) }, + { AOM_CDF4(9410, 16487, 21439) }, + { AOM_CDF4(7536, 13614, 18529) }, + { AOM_CDF4(5048, 9586, 13549) }, + { AOM_CDF4(21090, 27290, 29756) }, + { AOM_CDF4(20796, 27402, 30026) }, + { AOM_CDF4(17819, 25485, 28969) }, + { AOM_CDF4(13860, 21909, 26462) }, + { AOM_CDF4(11002, 18494, 23529) }, + { AOM_CDF4(8953, 15929, 20897) }, + { AOM_CDF4(6448, 11918, 16454) } } }, + { { { AOM_CDF4(15999, 22208, 25449) }, + { AOM_CDF4(13050, 19988, 24122) }, + { AOM_CDF4(8594, 14864, 19378) }, + { AOM_CDF4(6033, 11079, 15238) }, + { AOM_CDF4(4554, 8683, 12347) }, + { AOM_CDF4(3672, 7139, 10337) }, + { AOM_CDF4(1900, 3771, 5576) }, + { AOM_CDF4(15788, 21340, 23949) }, + { AOM_CDF4(16825, 24235, 27758) }, + { AOM_CDF4(12873, 20402, 24810) }, + { AOM_CDF4(9590, 16363, 21094) }, + { AOM_CDF4(7352, 13209, 17733) }, + { AOM_CDF4(5960, 10989, 15184) }, + { AOM_CDF4(3232, 6234, 9007) }, + { AOM_CDF4(15761, 20716, 23224) }, + { AOM_CDF4(19318, 25989, 28759) }, + { AOM_CDF4(15529, 23094, 26929) }, + { AOM_CDF4(11662, 18989, 23641) }, + { AOM_CDF4(8955, 15568, 20366) }, + { AOM_CDF4(7281, 13106, 17708) }, + { AOM_CDF4(4248, 8059, 11440) } }, + { { AOM_CDF4(14899, 21217, 24503) }, + { AOM_CDF4(13519, 20283, 24047) }, + { AOM_CDF4(9429, 15966, 20365) }, + { AOM_CDF4(6700, 12355, 16652) }, + { AOM_CDF4(5088, 9704, 13716) }, + { AOM_CDF4(4243, 8154, 11731) }, + { AOM_CDF4(2702, 5364, 7861) }, + { AOM_CDF4(22745, 28388, 30454) }, + { AOM_CDF4(20235, 27146, 29922) }, + { AOM_CDF4(15896, 23715, 27637) }, + { AOM_CDF4(11840, 19350, 24131) }, + { AOM_CDF4(9122, 15932, 20880) }, + { AOM_CDF4(7488, 13581, 18362) }, + { AOM_CDF4(5114, 9568, 13370) }, + { AOM_CDF4(20845, 26553, 28932) }, + { AOM_CDF4(20981, 27372, 29884) }, + { AOM_CDF4(17781, 25335, 28785) }, + { AOM_CDF4(13760, 21708, 26297) }, + { AOM_CDF4(10975, 18415, 23365) }, + { AOM_CDF4(9045, 15789, 20686) }, + { AOM_CDF4(6130, 11199, 15423) } } }, + { { { AOM_CDF4(13549, 19724, 23158) }, + { AOM_CDF4(11844, 18382, 22246) }, + { AOM_CDF4(7919, 13619, 17773) }, + { AOM_CDF4(5486, 10143, 13946) }, + { AOM_CDF4(4166, 7983, 11324) }, + { AOM_CDF4(3364, 6506, 9427) }, + { AOM_CDF4(1598, 3160, 4674) }, + { AOM_CDF4(15281, 20979, 23781) }, + { AOM_CDF4(14939, 22119, 25952) }, + { AOM_CDF4(11363, 18407, 22812) }, + { AOM_CDF4(8609, 14857, 19370) }, + { AOM_CDF4(6737, 12184, 16480) }, + { AOM_CDF4(5506, 10263, 14262) }, + { AOM_CDF4(2990, 5786, 
8380) }, + { AOM_CDF4(20249, 25253, 27417) }, + { AOM_CDF4(21070, 27518, 30001) }, + { AOM_CDF4(16854, 24469, 28074) }, + { AOM_CDF4(12864, 20486, 25000) }, + { AOM_CDF4(9962, 16978, 21778) }, + { AOM_CDF4(8074, 14338, 19048) }, + { AOM_CDF4(4494, 8479, 11906) } }, + { { AOM_CDF4(13960, 19617, 22829) }, + { AOM_CDF4(11150, 17341, 21228) }, + { AOM_CDF4(7150, 12964, 17190) }, + { AOM_CDF4(5331, 10002, 13867) }, + { AOM_CDF4(4167, 7744, 11057) }, + { AOM_CDF4(3480, 6629, 9646) }, + { AOM_CDF4(1883, 3784, 5686) }, + { AOM_CDF4(18752, 25660, 28912) }, + { AOM_CDF4(16968, 24586, 28030) }, + { AOM_CDF4(13520, 21055, 25313) }, + { AOM_CDF4(10453, 17626, 22280) }, + { AOM_CDF4(8386, 14505, 19116) }, + { AOM_CDF4(6742, 12595, 17008) }, + { AOM_CDF4(4273, 8140, 11499) }, + { AOM_CDF4(22120, 27827, 30233) }, + { AOM_CDF4(20563, 27358, 29895) }, + { AOM_CDF4(17076, 24644, 28153) }, + { AOM_CDF4(13362, 20942, 25309) }, + { AOM_CDF4(10794, 17965, 22695) }, + { AOM_CDF4(9014, 15652, 20319) }, + { AOM_CDF4(5708, 10512, 14497) } } }, + { { { AOM_CDF4(5705, 10930, 15725) }, + { AOM_CDF4(7946, 12765, 16115) }, + { AOM_CDF4(6801, 12123, 16226) }, + { AOM_CDF4(5462, 10135, 14200) }, + { AOM_CDF4(4189, 8011, 11507) }, + { AOM_CDF4(3191, 6229, 9408) }, + { AOM_CDF4(1057, 2137, 3212) }, + { AOM_CDF4(10018, 17067, 21491) }, + { AOM_CDF4(7380, 12582, 16453) }, + { AOM_CDF4(6068, 10845, 14339) }, + { AOM_CDF4(5098, 9198, 12555) }, + { AOM_CDF4(4312, 8010, 11119) }, + { AOM_CDF4(3700, 6966, 9781) }, + { AOM_CDF4(1693, 3326, 4887) }, + { AOM_CDF4(18757, 24930, 27774) }, + { AOM_CDF4(17648, 24596, 27817) }, + { AOM_CDF4(14707, 22052, 26026) }, + { AOM_CDF4(11720, 18852, 23292) }, + { AOM_CDF4(9357, 15952, 20525) }, + { AOM_CDF4(7810, 13753, 18210) }, + { AOM_CDF4(3879, 7333, 10328) } }, + { { AOM_CDF4(8278, 13242, 15922) }, + { AOM_CDF4(10547, 15867, 18919) }, + { AOM_CDF4(9106, 15842, 20609) }, + { AOM_CDF4(6833, 13007, 17218) }, + { AOM_CDF4(4811, 9712, 13923) }, + { AOM_CDF4(3985, 7352, 11128) }, + { AOM_CDF4(1688, 3458, 5262) }, + { AOM_CDF4(12951, 21861, 26510) }, + { AOM_CDF4(9788, 16044, 20276) }, + { AOM_CDF4(6309, 11244, 14870) }, + { AOM_CDF4(5183, 9349, 12566) }, + { AOM_CDF4(4389, 8229, 11492) }, + { AOM_CDF4(3633, 6945, 10620) }, + { AOM_CDF4(3600, 6847, 9907) }, + { AOM_CDF4(21748, 28137, 30255) }, + { AOM_CDF4(19436, 26581, 29560) }, + { AOM_CDF4(16359, 24201, 27953) }, + { AOM_CDF4(13961, 21693, 25871) }, + { AOM_CDF4(11544, 18686, 23322) }, + { AOM_CDF4(9372, 16462, 20952) }, + { AOM_CDF4(6138, 11210, 15390) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(16138, 22223, 25509) }, + { AOM_CDF4(15347, 22430, 26332) }, + { AOM_CDF4(9614, 16736, 21332) }, + { AOM_CDF4(6600, 12275, 16907) }, + { AOM_CDF4(4811, 9424, 13547) }, + { AOM_CDF4(3748, 7809, 11420) }, + { AOM_CDF4(2254, 4587, 6890) }, + { AOM_CDF4(15196, 20284, 23177) }, + { AOM_CDF4(18317, 25469, 28451) }, + { AOM_CDF4(13918, 21651, 25842) }, + { AOM_CDF4(10052, 17150, 21995) }, + { AOM_CDF4(7499, 13630, 18587) }, + { AOM_CDF4(6158, 11417, 16003) }, + { AOM_CDF4(4014, 7785, 11252) }, + { AOM_CDF4(15048, 21067, 24384) }, + { AOM_CDF4(18202, 25346, 28553) }, + { AOM_CDF4(14302, 22019, 26356) }, + { AOM_CDF4(10839, 18139, 23166) }, + { AOM_CDF4(8715, 15744, 20806) }, + { AOM_CDF4(7536, 13576, 18544) }, + { AOM_CDF4(5413, 10335, 14498) } }, + { { AOM_CDF4(17394, 24501, 27895) }, + { AOM_CDF4(15889, 23420, 27185) }, + { AOM_CDF4(11561, 19133, 23870) }, + { AOM_CDF4(8285, 14812, 19844) }, + { AOM_CDF4(6496, 12043, 16550) }, + { AOM_CDF4(4771, 9574, 13677) }, + { AOM_CDF4(3603, 6830, 10144) }, + { AOM_CDF4(21656, 27704, 30200) }, + { AOM_CDF4(21324, 27915, 30511) }, + { AOM_CDF4(17327, 25336, 28997) }, + { AOM_CDF4(13417, 21381, 26033) }, + { AOM_CDF4(10132, 17425, 22338) }, + { AOM_CDF4(8580, 15016, 19633) }, + { AOM_CDF4(5694, 11477, 16411) }, + { AOM_CDF4(24116, 29780, 31450) }, + { AOM_CDF4(23853, 29695, 31591) }, + { AOM_CDF4(20085, 27614, 30428) }, + { AOM_CDF4(15326, 24335, 28575) }, + { AOM_CDF4(11814, 19472, 24810) }, + { AOM_CDF4(10221, 18611, 24767) }, + { AOM_CDF4(7689, 14558, 20321) } } }, + { { { AOM_CDF4(16214, 22380, 25770) }, + { AOM_CDF4(14213, 21304, 25295) }, + { AOM_CDF4(9213, 15823, 20455) }, + { AOM_CDF4(6395, 11758, 16139) }, + { AOM_CDF4(4779, 9187, 13066) }, + { AOM_CDF4(3821, 7501, 10953) }, + { AOM_CDF4(2293, 4567, 6795) }, + { AOM_CDF4(15859, 21283, 23820) }, + { AOM_CDF4(18404, 25602, 28726) }, + { AOM_CDF4(14325, 21980, 26206) }, + { AOM_CDF4(10669, 17937, 22720) }, + { AOM_CDF4(8297, 14642, 19447) }, + { AOM_CDF4(6746, 12389, 16893) }, + { AOM_CDF4(4324, 8251, 11770) }, + { AOM_CDF4(16532, 21631, 24475) }, + { AOM_CDF4(20667, 27150, 29668) }, + { AOM_CDF4(16728, 24510, 28175) }, + { AOM_CDF4(12861, 20645, 25332) }, + { AOM_CDF4(10076, 17361, 22417) }, + { AOM_CDF4(8395, 14940, 19963) }, + { AOM_CDF4(5731, 10683, 14912) } }, + { { AOM_CDF4(14433, 21155, 24938) }, + { AOM_CDF4(14658, 21716, 25545) }, + { AOM_CDF4(9923, 16824, 21557) }, + { AOM_CDF4(6982, 13052, 17721) }, + { AOM_CDF4(5419, 10503, 15050) }, + { AOM_CDF4(4852, 9162, 13014) }, + { AOM_CDF4(3271, 6395, 9630) }, + { AOM_CDF4(22210, 27833, 30109) }, + { AOM_CDF4(20750, 27368, 29821) }, + { AOM_CDF4(16894, 24828, 28573) }, + { AOM_CDF4(13247, 21276, 25757) }, + { AOM_CDF4(10038, 17265, 22563) }, + { AOM_CDF4(8587, 14947, 20327) }, + { AOM_CDF4(5645, 11371, 15252) }, + { AOM_CDF4(22027, 27526, 29714) }, + { AOM_CDF4(23098, 29146, 31221) }, + { AOM_CDF4(19886, 27341, 30272) }, + { AOM_CDF4(15609, 23747, 28046) }, + { AOM_CDF4(11993, 20065, 24939) }, + { AOM_CDF4(9637, 18267, 23671) }, + { AOM_CDF4(7625, 13801, 19144) 
} } }, + { { { AOM_CDF4(14438, 20798, 24089) }, + { AOM_CDF4(12621, 19203, 23097) }, + { AOM_CDF4(8177, 14125, 18402) }, + { AOM_CDF4(5674, 10501, 14456) }, + { AOM_CDF4(4236, 8239, 11733) }, + { AOM_CDF4(3447, 6750, 9806) }, + { AOM_CDF4(1986, 3950, 5864) }, + { AOM_CDF4(16208, 22099, 24930) }, + { AOM_CDF4(16537, 24025, 27585) }, + { AOM_CDF4(12780, 20381, 24867) }, + { AOM_CDF4(9767, 16612, 21416) }, + { AOM_CDF4(7686, 13738, 18398) }, + { AOM_CDF4(6333, 11614, 15964) }, + { AOM_CDF4(3941, 7571, 10836) }, + { AOM_CDF4(22819, 27422, 29202) }, + { AOM_CDF4(22224, 28514, 30721) }, + { AOM_CDF4(17660, 25433, 28913) }, + { AOM_CDF4(13574, 21482, 26002) }, + { AOM_CDF4(10629, 17977, 22938) }, + { AOM_CDF4(8612, 15298, 20265) }, + { AOM_CDF4(5607, 10491, 14596) } }, + { { AOM_CDF4(13569, 19800, 23206) }, + { AOM_CDF4(13128, 19924, 23869) }, + { AOM_CDF4(8329, 14841, 19403) }, + { AOM_CDF4(6130, 10976, 15057) }, + { AOM_CDF4(4682, 8839, 12518) }, + { AOM_CDF4(3656, 7409, 10588) }, + { AOM_CDF4(2577, 5099, 7412) }, + { AOM_CDF4(22427, 28684, 30585) }, + { AOM_CDF4(20913, 27750, 30139) }, + { AOM_CDF4(15840, 24109, 27834) }, + { AOM_CDF4(12308, 20029, 24569) }, + { AOM_CDF4(10216, 16785, 21458) }, + { AOM_CDF4(8309, 14203, 19113) }, + { AOM_CDF4(6043, 11168, 15307) }, + { AOM_CDF4(23166, 28901, 30998) }, + { AOM_CDF4(21899, 28405, 30751) }, + { AOM_CDF4(18413, 26091, 29443) }, + { AOM_CDF4(15233, 23114, 27352) }, + { AOM_CDF4(12683, 20472, 25288) }, + { AOM_CDF4(10702, 18259, 23409) }, + { AOM_CDF4(8125, 14464, 19226) } } }, + { { { AOM_CDF4(9040, 14786, 18360) }, + { AOM_CDF4(9979, 15718, 19415) }, + { AOM_CDF4(7913, 13918, 18311) }, + { AOM_CDF4(5859, 10889, 15184) }, + { AOM_CDF4(4593, 8677, 12510) }, + { AOM_CDF4(3820, 7396, 10791) }, + { AOM_CDF4(1730, 3471, 5192) }, + { AOM_CDF4(11803, 18365, 22709) }, + { AOM_CDF4(11419, 18058, 22225) }, + { AOM_CDF4(9418, 15774, 20243) }, + { AOM_CDF4(7539, 13325, 17657) }, + { AOM_CDF4(6233, 11317, 15384) }, + { AOM_CDF4(5137, 9656, 13545) }, + { AOM_CDF4(2977, 5774, 8349) }, + { AOM_CDF4(21207, 27246, 29640) }, + { AOM_CDF4(19547, 26578, 29497) }, + { AOM_CDF4(16169, 23871, 27690) }, + { AOM_CDF4(12820, 20458, 25018) }, + { AOM_CDF4(10224, 17332, 22214) }, + { AOM_CDF4(8526, 15048, 19884) }, + { AOM_CDF4(5037, 9410, 13118) } }, + { { AOM_CDF4(12339, 17329, 20140) }, + { AOM_CDF4(13505, 19895, 23225) }, + { AOM_CDF4(9847, 16944, 21564) }, + { AOM_CDF4(7280, 13256, 18348) }, + { AOM_CDF4(4712, 10009, 14454) }, + { AOM_CDF4(4361, 7914, 12477) }, + { AOM_CDF4(2870, 5628, 7995) }, + { AOM_CDF4(20061, 25504, 28526) }, + { AOM_CDF4(15235, 22878, 26145) }, + { AOM_CDF4(12985, 19958, 24155) }, + { AOM_CDF4(9782, 16641, 21403) }, + { AOM_CDF4(9456, 16360, 20760) }, + { AOM_CDF4(6855, 12940, 18557) }, + { AOM_CDF4(5661, 10564, 15002) }, + { AOM_CDF4(25656, 30602, 31894) }, + { AOM_CDF4(22570, 29107, 31092) }, + { AOM_CDF4(18917, 26423, 29541) }, + { AOM_CDF4(15940, 23649, 27754) }, + { AOM_CDF4(12803, 20581, 25219) }, + { AOM_CDF4(11082, 18695, 23376) }, + { AOM_CDF4(7939, 14373, 19005) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(18315, 24289, 27551) }, + { AOM_CDF4(16854, 24068, 27835) }, + { AOM_CDF4(10140, 17927, 23173) }, + { AOM_CDF4(6722, 12982, 18267) }, + { AOM_CDF4(4661, 9826, 14706) }, + { AOM_CDF4(3832, 8165, 12294) }, + { AOM_CDF4(2795, 6098, 9245) }, + { AOM_CDF4(17145, 23326, 26672) }, + { AOM_CDF4(20733, 27680, 30308) }, + { AOM_CDF4(16032, 24461, 28546) }, + { AOM_CDF4(11653, 20093, 25081) }, + { AOM_CDF4(9290, 16429, 22086) }, + { AOM_CDF4(7796, 14598, 19982) }, + { AOM_CDF4(6502, 12378, 17441) }, + { AOM_CDF4(21681, 27732, 30320) }, + { AOM_CDF4(22389, 29044, 31261) }, + { AOM_CDF4(19027, 26731, 30087) }, + { AOM_CDF4(14739, 23755, 28624) }, + { AOM_CDF4(11358, 20778, 25511) }, + { AOM_CDF4(10995, 18073, 24190) }, + { AOM_CDF4(9162, 14990, 20617) } }, + { { AOM_CDF4(21425, 27952, 30388) }, + { AOM_CDF4(18062, 25838, 29034) }, + { AOM_CDF4(11956, 19881, 24808) }, + { AOM_CDF4(7718, 15000, 20980) }, + { AOM_CDF4(5702, 11254, 16143) }, + { AOM_CDF4(4898, 9088, 16864) }, + { AOM_CDF4(3679, 6776, 11907) }, + { AOM_CDF4(23294, 30160, 31663) }, + { AOM_CDF4(24397, 29896, 31836) }, + { AOM_CDF4(19245, 27128, 30593) }, + { AOM_CDF4(13202, 19825, 26404) }, + { AOM_CDF4(11578, 19297, 23957) }, + { AOM_CDF4(8073, 13297, 21370) }, + { AOM_CDF4(5461, 10923, 19745) }, + { AOM_CDF4(27367, 30521, 31934) }, + { AOM_CDF4(24904, 30671, 31940) }, + { AOM_CDF4(23075, 28460, 31299) }, + { AOM_CDF4(14400, 23658, 30417) }, + { AOM_CDF4(13885, 23882, 28325) }, + { AOM_CDF4(14746, 22938, 27853) }, + { AOM_CDF4(5461, 16384, 27307) } } }, + { { { AOM_CDF4(18274, 24813, 27890) }, + { AOM_CDF4(15537, 23149, 27003) }, + { AOM_CDF4(9449, 16740, 21827) }, + { AOM_CDF4(6700, 12498, 17261) }, + { AOM_CDF4(4988, 9866, 14198) }, + { AOM_CDF4(4236, 8147, 11902) }, + { AOM_CDF4(2867, 5860, 8654) }, + { AOM_CDF4(17124, 23171, 26101) }, + { AOM_CDF4(20396, 27477, 30148) }, + { AOM_CDF4(16573, 24629, 28492) }, + { AOM_CDF4(12749, 20846, 25674) }, + { AOM_CDF4(10233, 17878, 22818) }, + { AOM_CDF4(8525, 15332, 20363) }, + { AOM_CDF4(6283, 11632, 16255) }, + { AOM_CDF4(20466, 26511, 29286) }, + { AOM_CDF4(23059, 29174, 31191) }, + { AOM_CDF4(19481, 27263, 30241) }, + { AOM_CDF4(15458, 23631, 28137) }, + { AOM_CDF4(12416, 20608, 25693) }, + { AOM_CDF4(10261, 18011, 23261) }, + { AOM_CDF4(8016, 14655, 19666) } }, + { { AOM_CDF4(17616, 24586, 28112) }, + { AOM_CDF4(15809, 23299, 27155) }, + { AOM_CDF4(10767, 18890, 23793) }, + { AOM_CDF4(7727, 14255, 18865) }, + { AOM_CDF4(6129, 11926, 16882) }, + { AOM_CDF4(4482, 9704, 14861) }, + { 
AOM_CDF4(3277, 7452, 11522) }, + { AOM_CDF4(22956, 28551, 30730) }, + { AOM_CDF4(22724, 28937, 30961) }, + { AOM_CDF4(18467, 26324, 29580) }, + { AOM_CDF4(13234, 20713, 25649) }, + { AOM_CDF4(11181, 17592, 22481) }, + { AOM_CDF4(8291, 18358, 24576) }, + { AOM_CDF4(7568, 11881, 14984) }, + { AOM_CDF4(24948, 29001, 31147) }, + { AOM_CDF4(25674, 30619, 32151) }, + { AOM_CDF4(20841, 26793, 29603) }, + { AOM_CDF4(14669, 24356, 28666) }, + { AOM_CDF4(11334, 23593, 28219) }, + { AOM_CDF4(8922, 14762, 22873) }, + { AOM_CDF4(8301, 13544, 20535) } } }, + { { { AOM_CDF4(17113, 23733, 27081) }, + { AOM_CDF4(14139, 21406, 25452) }, + { AOM_CDF4(8552, 15002, 19776) }, + { AOM_CDF4(5871, 11120, 15378) }, + { AOM_CDF4(4455, 8616, 12253) }, + { AOM_CDF4(3469, 6910, 10386) }, + { AOM_CDF4(2255, 4553, 6782) }, + { AOM_CDF4(18224, 24376, 27053) }, + { AOM_CDF4(19290, 26710, 29614) }, + { AOM_CDF4(14936, 22991, 27184) }, + { AOM_CDF4(11238, 18951, 23762) }, + { AOM_CDF4(8786, 15617, 20588) }, + { AOM_CDF4(7317, 13228, 18003) }, + { AOM_CDF4(5101, 9512, 13493) }, + { AOM_CDF4(22639, 28222, 30210) }, + { AOM_CDF4(23216, 29331, 31307) }, + { AOM_CDF4(19075, 26762, 29895) }, + { AOM_CDF4(15014, 23113, 27457) }, + { AOM_CDF4(11938, 19857, 24752) }, + { AOM_CDF4(9942, 17280, 22282) }, + { AOM_CDF4(7167, 13144, 17752) } }, + { { AOM_CDF4(15820, 22738, 26488) }, + { AOM_CDF4(13530, 20885, 25216) }, + { AOM_CDF4(8395, 15530, 20452) }, + { AOM_CDF4(6574, 12321, 16380) }, + { AOM_CDF4(5353, 10419, 14568) }, + { AOM_CDF4(4613, 8446, 12381) }, + { AOM_CDF4(3440, 7158, 9903) }, + { AOM_CDF4(24247, 29051, 31224) }, + { AOM_CDF4(22118, 28058, 30369) }, + { AOM_CDF4(16498, 24768, 28389) }, + { AOM_CDF4(12920, 21175, 26137) }, + { AOM_CDF4(10730, 18619, 25352) }, + { AOM_CDF4(10187, 16279, 22791) }, + { AOM_CDF4(9310, 14631, 22127) }, + { AOM_CDF4(24970, 30558, 32057) }, + { AOM_CDF4(24801, 29942, 31698) }, + { AOM_CDF4(22432, 28453, 30855) }, + { AOM_CDF4(19054, 25680, 29580) }, + { AOM_CDF4(14392, 23036, 28109) }, + { AOM_CDF4(12495, 20947, 26650) }, + { AOM_CDF4(12442, 20326, 26214) } } }, + { { { AOM_CDF4(12162, 18785, 22648) }, + { AOM_CDF4(12749, 19697, 23806) }, + { AOM_CDF4(8580, 15297, 20346) }, + { AOM_CDF4(6169, 11749, 16543) }, + { AOM_CDF4(4836, 9391, 13448) }, + { AOM_CDF4(3821, 7711, 11613) }, + { AOM_CDF4(2228, 4601, 7070) }, + { AOM_CDF4(16319, 24725, 28280) }, + { AOM_CDF4(15698, 23277, 27168) }, + { AOM_CDF4(12726, 20368, 25047) }, + { AOM_CDF4(9912, 17015, 21976) }, + { AOM_CDF4(7888, 14220, 19179) }, + { AOM_CDF4(6777, 12284, 17018) }, + { AOM_CDF4(4492, 8590, 12252) }, + { AOM_CDF4(23249, 28904, 30947) }, + { AOM_CDF4(21050, 27908, 30512) }, + { AOM_CDF4(17440, 25340, 28949) }, + { AOM_CDF4(14059, 22018, 26541) }, + { AOM_CDF4(11288, 18903, 23898) }, + { AOM_CDF4(9411, 16342, 21428) }, + { AOM_CDF4(6278, 11588, 15944) } }, + { { AOM_CDF4(13981, 20067, 23226) }, + { AOM_CDF4(16922, 23580, 26783) }, + { AOM_CDF4(11005, 19039, 24487) }, + { AOM_CDF4(7389, 14218, 19798) }, + { AOM_CDF4(5598, 11505, 17206) }, + { AOM_CDF4(6090, 11213, 15659) }, + { AOM_CDF4(3820, 7371, 10119) }, + { AOM_CDF4(21082, 26925, 29675) }, + { AOM_CDF4(21262, 28627, 31128) }, + { AOM_CDF4(18392, 26454, 30437) }, + { AOM_CDF4(14870, 22910, 27096) }, + { AOM_CDF4(12620, 19484, 24908) }, + { AOM_CDF4(9290, 16553, 22802) }, + { AOM_CDF4(6668, 14288, 20004) }, + { AOM_CDF4(27704, 31055, 31949) }, + { AOM_CDF4(24709, 29978, 31788) }, + { AOM_CDF4(21668, 29264, 31657) }, + { AOM_CDF4(18295, 26968, 30074) }, + { AOM_CDF4(16399, 24422, 29313) 
}, + { AOM_CDF4(14347, 23026, 28104) }, + { AOM_CDF4(12370, 19806, 24477) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } + }; + +static const aom_cdf_prob av1_default_coeff_base_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(NUM_BASE_LEVELS + + 2)] = { { { { { AOM_CDF4(4034, 8930, 12727) }, + { AOM_CDF4(18082, 29741, 31877) }, + { AOM_CDF4(12596, 26124, 30493) }, + { AOM_CDF4(9446, 21118, 27005) }, + { AOM_CDF4(6308, 15141, 21279) }, + { AOM_CDF4(2463, 6357, 9783) }, + { AOM_CDF4(20667, 30546, 31929) }, + { AOM_CDF4(13043, 26123, 30134) }, + { AOM_CDF4(8151, 18757, 24778) }, + { AOM_CDF4(5255, 12839, 18632) }, + { AOM_CDF4(2820, 7206, 11161) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(15736, 27553, 30604) }, + { AOM_CDF4(11210, 23794, 28787) }, + { AOM_CDF4(5947, 13874, 19701) }, + { AOM_CDF4(4215, 9323, 13891) }, + { AOM_CDF4(2833, 6462, 10059) }, + { AOM_CDF4(19605, 30393, 31582) }, + { AOM_CDF4(13523, 26252, 30248) }, + { AOM_CDF4(8446, 18622, 24512) }, + { AOM_CDF4(3818, 10343, 15974) }, + { AOM_CDF4(1481, 4117, 6796) }, + { AOM_CDF4(22649, 31302, 32190) }, + { AOM_CDF4(14829, 27127, 30449) }, + { AOM_CDF4(8313, 17702, 23304) }, + { AOM_CDF4(3022, 8301, 12786) }, + { AOM_CDF4(1536, 4412, 7184) }, + { AOM_CDF4(22354, 29774, 31372) }, + { AOM_CDF4(14723, 25472, 29214) }, + { AOM_CDF4(6673, 13745, 18662) }, + { AOM_CDF4(2068, 5766, 9322) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6302, 16444, 21761) }, + { AOM_CDF4(23040, 31538, 32475) }, + { AOM_CDF4(15196, 28452, 31496) }, + { AOM_CDF4(10020, 22946, 28514) }, + { AOM_CDF4(6533, 16862, 23501) }, + { AOM_CDF4(3538, 9816, 15076) }, + { AOM_CDF4(24444, 31875, 32525) }, + { AOM_CDF4(15881, 28924, 31635) }, + { 
AOM_CDF4(9922, 22873, 28466) }, + { AOM_CDF4(6527, 16966, 23691) }, + { AOM_CDF4(4114, 11303, 17220) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20201, 30770, 32209) }, + { AOM_CDF4(14754, 28071, 31258) }, + { AOM_CDF4(8378, 20186, 26517) }, + { AOM_CDF4(5916, 15299, 21978) }, + { AOM_CDF4(4268, 11583, 17901) }, + { AOM_CDF4(24361, 32025, 32581) }, + { AOM_CDF4(18673, 30105, 31943) }, + { AOM_CDF4(10196, 22244, 27576) }, + { AOM_CDF4(5495, 14349, 20417) }, + { AOM_CDF4(2676, 7415, 11498) }, + { AOM_CDF4(24678, 31958, 32585) }, + { AOM_CDF4(18629, 29906, 31831) }, + { AOM_CDF4(9364, 20724, 26315) }, + { AOM_CDF4(4641, 12318, 18094) }, + { AOM_CDF4(2758, 7387, 11579) }, + { AOM_CDF4(25433, 31842, 32469) }, + { AOM_CDF4(18795, 29289, 31411) }, + { AOM_CDF4(7644, 17584, 23592) }, + { AOM_CDF4(3408, 9014, 15047) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4536, 10072, 14001) }, + { AOM_CDF4(25459, 31416, 32206) }, + { AOM_CDF4(16605, 28048, 30818) }, + { AOM_CDF4(11008, 22857, 27719) }, + { AOM_CDF4(6915, 16268, 22315) }, + { AOM_CDF4(2625, 6812, 10537) }, + { AOM_CDF4(24257, 31788, 32499) }, + { AOM_CDF4(16880, 29454, 31879) }, + { AOM_CDF4(11958, 25054, 29778) }, + { AOM_CDF4(7916, 18718, 25084) }, + { AOM_CDF4(3383, 8777, 13446) }, + { AOM_CDF4(22720, 31603, 32393) }, + { AOM_CDF4(14960, 28125, 31335) }, + { AOM_CDF4(9731, 22210, 27928) }, + { AOM_CDF4(6304, 15832, 22277) }, + { AOM_CDF4(2910, 7818, 12166) }, + { AOM_CDF4(20375, 30627, 32131) }, + { AOM_CDF4(13904, 27284, 30887) }, + { AOM_CDF4(9368, 21558, 27144) }, + { AOM_CDF4(5937, 14966, 21119) }, + { AOM_CDF4(2667, 7225, 11319) }, + { AOM_CDF4(23970, 31470, 32378) }, + { AOM_CDF4(17173, 29734, 32018) }, + { AOM_CDF4(12795, 25441, 29965) }, + { AOM_CDF4(8981, 19680, 25893) }, + { AOM_CDF4(4728, 11372, 16902) }, + { AOM_CDF4(24287, 31797, 32439) }, + { AOM_CDF4(16703, 29145, 31696) }, + { AOM_CDF4(10833, 23554, 28725) }, + { AOM_CDF4(6468, 16566, 23057) }, + { AOM_CDF4(2415, 6562, 10278) }, + { AOM_CDF4(26610, 32395, 32659) }, + { AOM_CDF4(18590, 30498, 32117) }, + { AOM_CDF4(12420, 25756, 29950) }, + { AOM_CDF4(7639, 18746, 24710) }, + { AOM_CDF4(3001, 8086, 12347) }, + { AOM_CDF4(25076, 32064, 32580) }, + { AOM_CDF4(17946, 30128, 32028) }, + { AOM_CDF4(12024, 24985, 29378) }, + { AOM_CDF4(7517, 18390, 24304) }, + { AOM_CDF4(3243, 8781, 13331) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6037, 16771, 21957) }, + { AOM_CDF4(24774, 31704, 32426) }, + { AOM_CDF4(16830, 28589, 31056) }, + { AOM_CDF4(10602, 22828, 27760) }, + { AOM_CDF4(6733, 16829, 23071) }, + { AOM_CDF4(3250, 8914, 13556) }, + { AOM_CDF4(25582, 32220, 32668) }, + { AOM_CDF4(18659, 30342, 32223) }, + { AOM_CDF4(12546, 26149, 30515) }, + { AOM_CDF4(8420, 20451, 26801) }, + { AOM_CDF4(4636, 12420, 18344) }, + { AOM_CDF4(27581, 32362, 32639) }, + { AOM_CDF4(18987, 30083, 31978) }, + { AOM_CDF4(11327, 24248, 29084) }, + { AOM_CDF4(7264, 17719, 24120) }, + { AOM_CDF4(3995, 10768, 16169) }, + { AOM_CDF4(25893, 31831, 32487) }, + { AOM_CDF4(16577, 28587, 31379) }, + { AOM_CDF4(10189, 22748, 28182) }, + { AOM_CDF4(6832, 17094, 23556) }, + { AOM_CDF4(3708, 10110, 15334) }, + { AOM_CDF4(25904, 
32282, 32656) }, + { AOM_CDF4(19721, 30792, 32276) }, + { AOM_CDF4(12819, 26243, 30411) }, + { AOM_CDF4(8572, 20614, 26891) }, + { AOM_CDF4(5364, 14059, 20467) }, + { AOM_CDF4(26580, 32438, 32677) }, + { AOM_CDF4(20852, 31225, 32340) }, + { AOM_CDF4(12435, 25700, 29967) }, + { AOM_CDF4(8691, 20825, 26976) }, + { AOM_CDF4(4446, 12209, 17269) }, + { AOM_CDF4(27350, 32429, 32696) }, + { AOM_CDF4(21372, 30977, 32272) }, + { AOM_CDF4(12673, 25270, 29853) }, + { AOM_CDF4(9208, 20925, 26640) }, + { AOM_CDF4(5018, 13351, 18732) }, + { AOM_CDF4(27351, 32479, 32713) }, + { AOM_CDF4(21398, 31209, 32387) }, + { AOM_CDF4(12162, 25047, 29842) }, + { AOM_CDF4(7896, 18691, 25319) }, + { AOM_CDF4(4670, 12882, 18881) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5487, 10460, 13708) }, + { AOM_CDF4(21597, 28303, 30674) }, + { AOM_CDF4(11037, 21953, 26476) }, + { AOM_CDF4(8147, 17962, 22952) }, + { AOM_CDF4(5242, 13061, 18532) }, + { AOM_CDF4(1889, 5208, 8182) }, + { AOM_CDF4(26774, 32133, 32590) }, + { AOM_CDF4(17844, 29564, 31767) }, + { AOM_CDF4(11690, 24438, 29171) }, + { AOM_CDF4(7542, 18215, 24459) }, + { AOM_CDF4(2993, 8050, 12319) }, + { AOM_CDF4(28023, 32328, 32591) }, + { AOM_CDF4(18651, 30126, 31954) }, + { AOM_CDF4(12164, 25146, 29589) }, + { AOM_CDF4(7762, 18530, 24771) }, + { AOM_CDF4(3492, 9183, 13920) }, + { AOM_CDF4(27591, 32008, 32491) }, + { AOM_CDF4(17149, 28853, 31510) }, + { AOM_CDF4(11485, 24003, 28860) }, + { AOM_CDF4(7697, 18086, 24210) }, + { AOM_CDF4(3075, 7999, 12218) }, + { AOM_CDF4(28268, 32482, 32654) }, + { AOM_CDF4(19631, 31051, 32404) }, + { AOM_CDF4(13860, 27260, 31020) }, + { AOM_CDF4(9605, 21613, 27594) }, + { AOM_CDF4(4876, 12162, 17908) }, + { AOM_CDF4(27248, 32316, 32576) }, + { AOM_CDF4(18955, 30457, 32075) }, + { AOM_CDF4(11824, 23997, 28795) }, + { AOM_CDF4(7346, 18196, 24647) }, + { AOM_CDF4(3403, 9247, 14111) }, + { AOM_CDF4(29711, 32655, 32735) }, + { AOM_CDF4(21169, 31394, 32417) }, + { AOM_CDF4(13487, 27198, 30957) }, + { AOM_CDF4(8828, 21683, 27614) }, + { AOM_CDF4(4270, 11451, 17038) }, + { AOM_CDF4(28708, 32578, 32731) }, + { AOM_CDF4(20120, 31241, 32482) }, + { AOM_CDF4(13692, 27550, 31321) }, + { AOM_CDF4(9418, 22514, 28439) }, + { AOM_CDF4(4999, 13283, 19462) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5673, 14302, 19711) }, + { AOM_CDF4(26251, 30701, 31834) }, + { AOM_CDF4(12782, 23783, 27803) }, + { AOM_CDF4(9127, 20657, 25808) }, + { AOM_CDF4(6368, 16208, 21462) }, + { AOM_CDF4(2465, 7177, 10822) }, + { AOM_CDF4(29961, 32563, 32719) }, + { AOM_CDF4(18318, 29891, 31949) }, + { AOM_CDF4(11361, 24514, 29357) }, + { AOM_CDF4(7900, 19603, 25607) }, + { AOM_CDF4(4002, 10590, 15546) }, + { AOM_CDF4(29637, 32310, 32595) }, + { AOM_CDF4(18296, 29913, 31809) }, + { AOM_CDF4(10144, 21515, 26871) }, + { AOM_CDF4(5358, 14322, 20394) }, + { AOM_CDF4(3067, 8362, 13346) }, + { AOM_CDF4(28652, 32470, 32676) }, + { AOM_CDF4(17538, 30771, 32209) }, + { AOM_CDF4(13924, 26882, 30494) }, + { AOM_CDF4(10496, 22837, 27869) }, + { AOM_CDF4(7236, 16396, 21621) }, + { AOM_CDF4(30743, 32687, 32746) }, + { AOM_CDF4(23006, 31676, 32489) }, + { AOM_CDF4(14494, 27828, 31120) }, + { AOM_CDF4(10174, 22801, 28352) }, + { AOM_CDF4(6242, 15281, 21043) }, + { AOM_CDF4(25817, 32243, 32720) }, + { AOM_CDF4(18618, 31367, 32325) }, + { AOM_CDF4(13997, 28318, 31878) }, + { AOM_CDF4(12255, 26534, 31383) }, + { AOM_CDF4(9561, 21588, 28450) }, + { AOM_CDF4(28188, 32635, 32724) }, + { AOM_CDF4(22060, 32365, 32728) }, + { AOM_CDF4(18102, 30690, 32528) }, + { AOM_CDF4(14196, 
28864, 31999) }, + { AOM_CDF4(12262, 25792, 30865) }, + { AOM_CDF4(24176, 32109, 32628) }, + { AOM_CDF4(18280, 29681, 31963) }, + { AOM_CDF4(10205, 23703, 29664) }, + { AOM_CDF4(7889, 20025, 27676) }, + { AOM_CDF4(6060, 16743, 23970) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5141, 7096, 8260) }, + { AOM_CDF4(27186, 29022, 29789) }, + { AOM_CDF4(6668, 12568, 15682) }, + { AOM_CDF4(2172, 6181, 8638) }, + { AOM_CDF4(1126, 3379, 4531) }, + { AOM_CDF4(443, 1361, 2254) }, + { AOM_CDF4(26083, 31153, 32436) }, + { AOM_CDF4(13486, 24603, 28483) }, + { AOM_CDF4(6508, 14840, 19910) }, + { AOM_CDF4(3386, 8800, 13286) }, + { AOM_CDF4(1530, 4322, 7054) }, + { AOM_CDF4(29639, 32080, 32548) }, + { AOM_CDF4(15897, 27552, 30290) }, + { AOM_CDF4(8588, 20047, 25383) }, + { AOM_CDF4(4889, 13339, 19269) }, + { AOM_CDF4(2240, 6871, 10498) }, + { AOM_CDF4(28165, 32197, 32517) }, + { AOM_CDF4(20735, 30427, 31568) }, + { AOM_CDF4(14325, 24671, 27692) }, + { AOM_CDF4(5119, 12554, 17805) }, + { AOM_CDF4(1810, 5441, 8261) }, + { AOM_CDF4(31212, 32724, 32748) }, + { AOM_CDF4(23352, 31766, 32545) }, + { AOM_CDF4(14669, 27570, 31059) }, + { AOM_CDF4(8492, 20894, 27272) }, + { AOM_CDF4(3644, 10194, 15204) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(2461, 7013, 9371) }, + { AOM_CDF4(24749, 29600, 30986) }, + { AOM_CDF4(9466, 19037, 22417) }, + { AOM_CDF4(3584, 9280, 14400) }, + { AOM_CDF4(1505, 3929, 5433) }, + { AOM_CDF4(677, 1500, 2736) }, + { AOM_CDF4(23987, 30702, 32117) }, + { AOM_CDF4(13554, 24571, 29263) }, + { AOM_CDF4(6211, 14556, 21155) }, + { AOM_CDF4(3135, 10972, 15625) }, + { AOM_CDF4(2435, 7127, 11427) }, + { AOM_CDF4(31300, 32532, 32550) }, + { AOM_CDF4(14757, 30365, 31954) }, + { AOM_CDF4(4405, 11612, 18553) }, + { AOM_CDF4(580, 4132, 7322) }, + { AOM_CDF4(1695, 10169, 14124) }, + { AOM_CDF4(30008, 32282, 32591) }, + { AOM_CDF4(19244, 30108, 31748) }, + { AOM_CDF4(11180, 24158, 29555) }, + { AOM_CDF4(5650, 14972, 19209) }, + { AOM_CDF4(2114, 5109, 8456) }, + { AOM_CDF4(31856, 32716, 32748) }, + { AOM_CDF4(23012, 31664, 32572) }, + { AOM_CDF4(13694, 26656, 30636) }, + { AOM_CDF4(8142, 19508, 26093) }, + { AOM_CDF4(4253, 10955, 16724) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(601, 983, 1311) }, + { AOM_CDF4(18725, 23406, 28087) }, + { AOM_CDF4(5461, 8192, 10923) }, + { AOM_CDF4(3781, 15124, 21425) }, + { AOM_CDF4(2587, 7761, 12072) }, + { AOM_CDF4(106, 458, 810) }, + { AOM_CDF4(22282, 29710, 31894) }, + 
{ AOM_CDF4(8508, 20926, 25984) }, + { AOM_CDF4(3726, 12713, 18083) }, + { AOM_CDF4(1620, 7112, 10893) }, + { AOM_CDF4(729, 2236, 3495) }, + { AOM_CDF4(30163, 32474, 32684) }, + { AOM_CDF4(18304, 30464, 32000) }, + { AOM_CDF4(11443, 26526, 29647) }, + { AOM_CDF4(6007, 15292, 21299) }, + { AOM_CDF4(2234, 6703, 8937) }, + { AOM_CDF4(30954, 32177, 32571) }, + { AOM_CDF4(17363, 29562, 31076) }, + { AOM_CDF4(9686, 22464, 27410) }, + { AOM_CDF4(8192, 16384, 21390) }, + { AOM_CDF4(1755, 8046, 11264) }, + { AOM_CDF4(31168, 32734, 32748) }, + { AOM_CDF4(22486, 31441, 32471) }, + { AOM_CDF4(12833, 25627, 29738) }, + { AOM_CDF4(6980, 17379, 23122) }, + { AOM_CDF4(3111, 8887, 13479) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(6041, 11854, 15927) }, + { AOM_CDF4(20326, 30905, 32251) }, + { AOM_CDF4(14164, 26831, 30725) }, + { AOM_CDF4(9760, 20647, 26585) }, + { AOM_CDF4(6416, 14953, 21219) }, + { AOM_CDF4(2966, 7151, 10891) }, + { AOM_CDF4(23567, 31374, 32254) }, + { AOM_CDF4(14978, 27416, 30946) }, + { AOM_CDF4(9434, 20225, 26254) }, + { AOM_CDF4(6658, 14558, 20535) }, + { AOM_CDF4(3916, 8677, 12989) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(18088, 29545, 31587) }, + { AOM_CDF4(13062, 25843, 30073) }, + { AOM_CDF4(8940, 16827, 22251) }, + { AOM_CDF4(7654, 13220, 17973) }, + { AOM_CDF4(5733, 10316, 14456) }, + { AOM_CDF4(22879, 31388, 32114) }, + { AOM_CDF4(15215, 27993, 30955) }, + { AOM_CDF4(9397, 19445, 24978) }, + { AOM_CDF4(3442, 9813, 15344) }, + { AOM_CDF4(1368, 3936, 6532) }, + { AOM_CDF4(25494, 32033, 32406) }, + { AOM_CDF4(16772, 27963, 30718) }, + { AOM_CDF4(9419, 18165, 23260) }, + { AOM_CDF4(2677, 7501, 11797) }, + { AOM_CDF4(1516, 4344, 7170) }, + { AOM_CDF4(26556, 31454, 32101) }, + { AOM_CDF4(17128, 27035, 30108) }, + { AOM_CDF4(8324, 15344, 20249) }, + { AOM_CDF4(1903, 5696, 9469) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8455, 19003, 24368) }, + { AOM_CDF4(23563, 32021, 32604) }, + { AOM_CDF4(16237, 29446, 31935) }, + { AOM_CDF4(10724, 23999, 29358) }, + { AOM_CDF4(6725, 17528, 24416) }, + { AOM_CDF4(3927, 10927, 16825) }, + { AOM_CDF4(26313, 32288, 32634) }, + { AOM_CDF4(17430, 30095, 32095) }, + { AOM_CDF4(11116, 24606, 29679) }, + { AOM_CDF4(7195, 18384, 25269) }, + { AOM_CDF4(4726, 12852, 19315) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(22822, 31648, 32483) }, + { AOM_CDF4(16724, 29633, 31929) }, + { AOM_CDF4(10261, 23033, 28725) }, + { AOM_CDF4(7029, 17840, 24528) }, + { AOM_CDF4(4867, 13886, 21502) }, + { AOM_CDF4(25298, 31892, 32491) }, + { AOM_CDF4(17809, 29330, 31512) }, + { AOM_CDF4(9668, 21329, 26579) }, + { AOM_CDF4(4774, 12956, 18976) }, + { AOM_CDF4(2322, 7030, 11540) }, + { AOM_CDF4(25472, 31920, 32543) }, + { AOM_CDF4(17957, 29387, 31632) }, + { AOM_CDF4(9196, 20593, 26400) }, + { AOM_CDF4(4680, 12705, 19202) }, + { AOM_CDF4(2917, 8456, 13436) }, + { AOM_CDF4(26471, 32059, 32574) }, + { AOM_CDF4(18458, 29783, 31909) }, + { AOM_CDF4(8400, 19464, 25956) }, + { AOM_CDF4(3812, 10973, 17206) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(6779, 13743, 17678) }, + { AOM_CDF4(24806, 31797, 32457) }, + { AOM_CDF4(17616, 29047, 31372) }, + { AOM_CDF4(11063, 23175, 28003) }, + { AOM_CDF4(6521, 16110, 22324) }, + { AOM_CDF4(2764, 7504, 11654) }, + { AOM_CDF4(25266, 32367, 32637) }, + { AOM_CDF4(19054, 30553, 32175) }, + { AOM_CDF4(12139, 25212, 29807) }, + { AOM_CDF4(7311, 18162, 24704) }, + { AOM_CDF4(3397, 9164, 14074) }, + { AOM_CDF4(25988, 32208, 32522) }, + { AOM_CDF4(16253, 28912, 31526) }, + { AOM_CDF4(9151, 21387, 27372) }, + { AOM_CDF4(5688, 14915, 21496) }, + { AOM_CDF4(2717, 7627, 12004) }, + { AOM_CDF4(23144, 31855, 32443) }, + { AOM_CDF4(16070, 28491, 31325) }, + { AOM_CDF4(8702, 20467, 26517) }, + { AOM_CDF4(5243, 13956, 20367) }, + { AOM_CDF4(2621, 7335, 11567) }, + { AOM_CDF4(26636, 32340, 32630) }, + { AOM_CDF4(19990, 31050, 32341) }, + { AOM_CDF4(13243, 26105, 30315) }, + { AOM_CDF4(8588, 19521, 25918) }, + { AOM_CDF4(4717, 11585, 17304) }, + { AOM_CDF4(25844, 32292, 32582) }, + { AOM_CDF4(19090, 30635, 32097) }, + { AOM_CDF4(11963, 24546, 28939) }, + { AOM_CDF4(6218, 16087, 22354) }, + { AOM_CDF4(2340, 6608, 10426) }, + { AOM_CDF4(28046, 32576, 32694) }, + { AOM_CDF4(21178, 31313, 32296) }, + { AOM_CDF4(13486, 26184, 29870) }, + { AOM_CDF4(7149, 17871, 
23723) }, + { AOM_CDF4(2833, 7958, 12259) }, + { AOM_CDF4(27710, 32528, 32686) }, + { AOM_CDF4(20674, 31076, 32268) }, + { AOM_CDF4(12413, 24955, 29243) }, + { AOM_CDF4(6676, 16927, 23097) }, + { AOM_CDF4(2966, 8333, 12919) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8639, 19339, 24429) }, + { AOM_CDF4(24404, 31837, 32525) }, + { AOM_CDF4(16997, 29425, 31784) }, + { AOM_CDF4(11253, 24234, 29149) }, + { AOM_CDF4(6751, 17394, 24028) }, + { AOM_CDF4(3490, 9830, 15191) }, + { AOM_CDF4(26283, 32471, 32714) }, + { AOM_CDF4(19599, 31168, 32442) }, + { AOM_CDF4(13146, 26954, 30893) }, + { AOM_CDF4(8214, 20588, 26890) }, + { AOM_CDF4(4699, 13081, 19300) }, + { AOM_CDF4(28212, 32458, 32669) }, + { AOM_CDF4(18594, 30316, 32100) }, + { AOM_CDF4(11219, 24408, 29234) }, + { AOM_CDF4(6865, 17656, 24149) }, + { AOM_CDF4(3678, 10362, 16006) }, + { AOM_CDF4(25825, 32136, 32616) }, + { AOM_CDF4(17313, 29853, 32021) }, + { AOM_CDF4(11197, 24471, 29472) }, + { AOM_CDF4(6947, 17781, 24405) }, + { AOM_CDF4(3768, 10660, 16261) }, + { AOM_CDF4(27352, 32500, 32706) }, + { AOM_CDF4(20850, 31468, 32469) }, + { AOM_CDF4(14021, 27707, 31133) }, + { AOM_CDF4(8964, 21748, 27838) }, + { AOM_CDF4(5437, 14665, 21187) }, + { AOM_CDF4(26304, 32492, 32698) }, + { AOM_CDF4(20409, 31380, 32385) }, + { AOM_CDF4(13682, 27222, 30632) }, + { AOM_CDF4(8974, 21236, 26685) }, + { AOM_CDF4(4234, 11665, 16934) }, + { AOM_CDF4(26273, 32357, 32711) }, + { AOM_CDF4(20672, 31242, 32441) }, + { AOM_CDF4(14172, 27254, 30902) }, + { AOM_CDF4(9870, 21898, 27275) }, + { AOM_CDF4(5164, 13506, 19270) }, + { AOM_CDF4(26725, 32459, 32728) }, + { AOM_CDF4(20991, 31442, 32527) }, + { AOM_CDF4(13071, 26434, 30811) }, + { AOM_CDF4(8184, 20090, 26742) }, + { AOM_CDF4(4803, 13255, 19895) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7555, 14942, 18501) }, + { AOM_CDF4(24410, 31178, 32287) }, + { AOM_CDF4(14394, 26738, 30253) }, + { AOM_CDF4(8413, 19554, 25195) }, + { AOM_CDF4(4766, 12924, 18785) }, + { AOM_CDF4(2029, 5806, 9207) }, + { AOM_CDF4(26776, 32364, 32663) }, + { AOM_CDF4(18732, 29967, 31931) }, + { AOM_CDF4(11005, 23786, 28852) }, + { AOM_CDF4(6466, 16909, 23510) }, + { AOM_CDF4(3044, 8638, 13419) }, + { AOM_CDF4(29208, 32582, 32704) }, + { AOM_CDF4(20068, 30857, 32208) }, + { AOM_CDF4(12003, 25085, 29595) }, + { AOM_CDF4(6947, 17750, 24189) }, + { AOM_CDF4(3245, 9103, 14007) }, + { AOM_CDF4(27359, 32465, 32669) }, + { AOM_CDF4(19421, 30614, 32174) }, + { AOM_CDF4(11915, 25010, 29579) }, + { AOM_CDF4(6950, 17676, 24074) }, + { AOM_CDF4(3007, 8473, 13096) }, + { AOM_CDF4(29002, 32676, 32735) }, + { AOM_CDF4(22102, 31849, 32576) }, + { AOM_CDF4(14408, 28009, 31405) }, + { AOM_CDF4(9027, 21679, 27931) }, + { AOM_CDF4(4694, 12678, 18748) }, + { AOM_CDF4(28216, 32528, 32682) }, + { AOM_CDF4(20849, 31264, 32318) }, + { AOM_CDF4(12756, 25815, 29751) }, + { AOM_CDF4(7565, 18801, 24923) }, + { AOM_CDF4(3509, 9533, 14477) }, + { AOM_CDF4(30133, 32687, 32739) }, + { AOM_CDF4(23063, 31910, 32515) }, + { AOM_CDF4(14588, 28051, 31132) }, + { AOM_CDF4(9085, 21649, 27457) }, + { AOM_CDF4(4261, 11654, 17264) }, + { AOM_CDF4(29518, 32691, 32748) }, + { AOM_CDF4(22451, 31959, 32613) }, + { AOM_CDF4(14864, 28722, 31700) }, + { AOM_CDF4(9695, 22964, 28716) }, + { AOM_CDF4(4932, 13358, 19502) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6465, 16958, 21688) }, + { AOM_CDF4(25199, 31514, 32360) }, + { AOM_CDF4(14774, 27149, 30607) }, + { AOM_CDF4(9257, 21438, 26972) }, + { AOM_CDF4(5723, 15183, 21882) }, + { AOM_CDF4(3150, 8879, 
13731) }, + { AOM_CDF4(26989, 32262, 32682) }, + { AOM_CDF4(17396, 29937, 32085) }, + { AOM_CDF4(11387, 24901, 29784) }, + { AOM_CDF4(7289, 18821, 25548) }, + { AOM_CDF4(3734, 10577, 16086) }, + { AOM_CDF4(29728, 32501, 32695) }, + { AOM_CDF4(17431, 29701, 31903) }, + { AOM_CDF4(9921, 22826, 28300) }, + { AOM_CDF4(5896, 15434, 22068) }, + { AOM_CDF4(3430, 9646, 14757) }, + { AOM_CDF4(28614, 32511, 32705) }, + { AOM_CDF4(19364, 30638, 32263) }, + { AOM_CDF4(13129, 26254, 30402) }, + { AOM_CDF4(8754, 20484, 26440) }, + { AOM_CDF4(4378, 11607, 17110) }, + { AOM_CDF4(30292, 32671, 32744) }, + { AOM_CDF4(21780, 31603, 32501) }, + { AOM_CDF4(14314, 27829, 31291) }, + { AOM_CDF4(9611, 22327, 28263) }, + { AOM_CDF4(4890, 13087, 19065) }, + { AOM_CDF4(25862, 32567, 32733) }, + { AOM_CDF4(20794, 32050, 32567) }, + { AOM_CDF4(17243, 30625, 32254) }, + { AOM_CDF4(13283, 27628, 31474) }, + { AOM_CDF4(9669, 22532, 28918) }, + { AOM_CDF4(27435, 32697, 32748) }, + { AOM_CDF4(24922, 32390, 32714) }, + { AOM_CDF4(21449, 31504, 32536) }, + { AOM_CDF4(16392, 29729, 31832) }, + { AOM_CDF4(11692, 24884, 29076) }, + { AOM_CDF4(24193, 32290, 32735) }, + { AOM_CDF4(18909, 31104, 32563) }, + { AOM_CDF4(12236, 26841, 31403) }, + { AOM_CDF4(8171, 21840, 29082) }, + { AOM_CDF4(7224, 17280, 25275) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(3078, 6839, 9890) }, + { AOM_CDF4(13837, 20450, 24479) }, + { AOM_CDF4(5914, 14222, 19328) }, + { AOM_CDF4(3866, 10267, 14762) }, + { AOM_CDF4(2612, 7208, 11042) }, + { AOM_CDF4(1067, 2991, 4776) }, + { AOM_CDF4(25817, 31646, 32529) }, + { AOM_CDF4(13708, 26338, 30385) }, + { AOM_CDF4(7328, 18585, 24870) }, + { AOM_CDF4(4691, 13080, 19276) }, + { AOM_CDF4(1825, 5253, 8352) }, + { AOM_CDF4(29386, 32315, 32624) }, + { AOM_CDF4(17160, 29001, 31360) }, + { AOM_CDF4(9602, 21862, 27396) }, + { AOM_CDF4(5915, 15772, 22148) }, + { AOM_CDF4(2786, 7779, 12047) }, + { AOM_CDF4(29246, 32450, 32663) }, + { AOM_CDF4(18696, 29929, 31818) }, + { AOM_CDF4(10510, 23369, 28560) }, + { AOM_CDF4(6229, 16499, 23125) }, + { AOM_CDF4(2608, 7448, 11705) }, + { AOM_CDF4(30753, 32710, 32748) }, + { AOM_CDF4(21638, 31487, 32503) }, + { AOM_CDF4(12937, 26854, 30870) }, + { AOM_CDF4(8182, 20596, 26970) }, + { AOM_CDF4(3637, 10269, 15497) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5244, 12150, 16906) }, + { AOM_CDF4(20486, 26858, 29701) }, + { AOM_CDF4(7756, 18317, 23735) }, + { AOM_CDF4(3452, 9256, 13146) }, + { AOM_CDF4(2020, 5206, 8229) }, + { AOM_CDF4(1801, 4993, 7903) }, + { AOM_CDF4(27051, 31858, 32531) }, + { AOM_CDF4(15988, 27531, 30619) }, + { AOM_CDF4(9188, 21484, 26719) }, + { AOM_CDF4(6273, 17186, 23800) }, + { AOM_CDF4(3108, 9355, 14764) }, + { AOM_CDF4(31076, 32520, 32680) }, + { AOM_CDF4(18119, 30037, 31850) }, + { AOM_CDF4(10244, 22969, 27472) }, + { AOM_CDF4(4692, 14077, 19273) }, + { AOM_CDF4(3694, 11677, 17556) }, + { AOM_CDF4(30060, 32581, 32720) }, + { AOM_CDF4(21011, 30775, 32120) }, + { AOM_CDF4(11931, 24820, 29289) }, + { 
AOM_CDF4(7119, 17662, 24356) }, + { AOM_CDF4(3833, 10706, 16304) }, + { AOM_CDF4(31954, 32731, 32748) }, + { AOM_CDF4(23913, 31724, 32489) }, + { AOM_CDF4(15520, 28060, 31286) }, + { AOM_CDF4(11517, 23008, 28571) }, + { AOM_CDF4(6193, 14508, 20629) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(1035, 2807, 4156) }, + { AOM_CDF4(13162, 18138, 20939) }, + { AOM_CDF4(2696, 6633, 8755) }, + { AOM_CDF4(1373, 4161, 6853) }, + { AOM_CDF4(1099, 2746, 4716) }, + { AOM_CDF4(340, 1021, 1599) }, + { AOM_CDF4(22826, 30419, 32135) }, + { AOM_CDF4(10395, 21762, 26942) }, + { AOM_CDF4(4726, 12407, 17361) }, + { AOM_CDF4(2447, 7080, 10593) }, + { AOM_CDF4(1227, 3717, 6011) }, + { AOM_CDF4(28156, 31424, 31934) }, + { AOM_CDF4(16915, 27754, 30373) }, + { AOM_CDF4(9148, 20990, 26431) }, + { AOM_CDF4(5950, 15515, 21148) }, + { AOM_CDF4(2492, 7327, 11526) }, + { AOM_CDF4(30602, 32477, 32670) }, + { AOM_CDF4(20026, 29955, 31568) }, + { AOM_CDF4(11220, 23628, 28105) }, + { AOM_CDF4(6652, 17019, 22973) }, + { AOM_CDF4(3064, 8536, 13043) }, + { AOM_CDF4(31769, 32724, 32748) }, + { AOM_CDF4(22230, 30887, 32373) }, + { AOM_CDF4(12234, 25079, 29731) }, + { AOM_CDF4(7326, 18816, 25353) }, + { AOM_CDF4(3933, 10907, 16616) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(8896, 16227, 20630) }, + { AOM_CDF4(23629, 31782, 32527) }, + { AOM_CDF4(15173, 27755, 31321) }, + { AOM_CDF4(10158, 21233, 27382) }, + { AOM_CDF4(6420, 14857, 21558) }, + { AOM_CDF4(3269, 8155, 12646) }, + { AOM_CDF4(24835, 32009, 32496) }, + { AOM_CDF4(16509, 28421, 31579) }, + { AOM_CDF4(10957, 21514, 27418) }, + { AOM_CDF4(7881, 15930, 22096) }, + { AOM_CDF4(5388, 10960, 15918) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20745, 30773, 32093) }, + { AOM_CDF4(15200, 27221, 30861) }, + { AOM_CDF4(13032, 20873, 25667) }, + { AOM_CDF4(12285, 18663, 23494) }, + { AOM_CDF4(11563, 17481, 21489) }, + { AOM_CDF4(26260, 31982, 32320) }, + { AOM_CDF4(15397, 28083, 31100) }, + { AOM_CDF4(9742, 19217, 24824) }, + { AOM_CDF4(3261, 9629, 15362) }, + { AOM_CDF4(1480, 4322, 7499) }, + { AOM_CDF4(27599, 32256, 32460) }, + { AOM_CDF4(16857, 27659, 30774) }, + { AOM_CDF4(9551, 18290, 23748) }, + { AOM_CDF4(3052, 8933, 14103) }, + { AOM_CDF4(2021, 5910, 9787) }, + { AOM_CDF4(29005, 32015, 32392) }, + { AOM_CDF4(17677, 27694, 30863) }, + { AOM_CDF4(9204, 17356, 23219) }, + { AOM_CDF4(2403, 7516, 12814) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10808, 22056, 26896) }, + { AOM_CDF4(25739, 32313, 32676) }, + { AOM_CDF4(17288, 30203, 32221) }, + { AOM_CDF4(11359, 24878, 29896) }, + { AOM_CDF4(6949, 17767, 24893) }, + { AOM_CDF4(4287, 11796, 18071) }, + { AOM_CDF4(27880, 32521, 32705) }, + { AOM_CDF4(19038, 31004, 32414) }, + { AOM_CDF4(12564, 26345, 30768) }, + { AOM_CDF4(8269, 19947, 26779) }, + { AOM_CDF4(5674, 14657, 21674) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(25742, 32319, 32671) }, + { AOM_CDF4(19557, 31164, 32454) }, + { AOM_CDF4(13381, 26381, 30755) }, + { AOM_CDF4(10101, 21466, 26722) }, + { AOM_CDF4(9209, 19650, 26825) }, + { AOM_CDF4(27107, 31917, 32432) }, + { AOM_CDF4(18056, 28893, 31203) }, + { AOM_CDF4(10200, 21434, 26764) }, + { AOM_CDF4(4660, 12913, 19502) }, + { AOM_CDF4(2368, 6930, 12504) }, + { AOM_CDF4(26960, 32158, 32613) }, + { AOM_CDF4(18628, 30005, 32031) }, + { AOM_CDF4(10233, 22442, 28232) }, + { AOM_CDF4(5471, 14630, 21516) }, + { AOM_CDF4(3235, 10767, 17109) }, + { AOM_CDF4(27696, 32440, 32692) }, + { AOM_CDF4(20032, 31167, 32438) }, + { AOM_CDF4(8700, 21341, 28442) }, + { AOM_CDF4(5662, 14831, 21795) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9704, 17294, 21132) }, + { AOM_CDF4(26762, 32278, 32633) }, + { AOM_CDF4(18382, 29620, 31819) }, + { AOM_CDF4(10891, 23475, 28723) }, + { AOM_CDF4(6358, 16583, 
23309) }, + { AOM_CDF4(3248, 9118, 14141) }, + { AOM_CDF4(27204, 32573, 32699) }, + { AOM_CDF4(19818, 30824, 32329) }, + { AOM_CDF4(11772, 25120, 30041) }, + { AOM_CDF4(6995, 18033, 25039) }, + { AOM_CDF4(3752, 10442, 16098) }, + { AOM_CDF4(27222, 32256, 32559) }, + { AOM_CDF4(15356, 28399, 31475) }, + { AOM_CDF4(8821, 20635, 27057) }, + { AOM_CDF4(5511, 14404, 21239) }, + { AOM_CDF4(2935, 8222, 13051) }, + { AOM_CDF4(24875, 32120, 32529) }, + { AOM_CDF4(15233, 28265, 31445) }, + { AOM_CDF4(8605, 20570, 26932) }, + { AOM_CDF4(5431, 14413, 21196) }, + { AOM_CDF4(2994, 8341, 13223) }, + { AOM_CDF4(28201, 32604, 32700) }, + { AOM_CDF4(21041, 31446, 32456) }, + { AOM_CDF4(13221, 26213, 30475) }, + { AOM_CDF4(8255, 19385, 26037) }, + { AOM_CDF4(4930, 12585, 18830) }, + { AOM_CDF4(28768, 32448, 32627) }, + { AOM_CDF4(19705, 30561, 32021) }, + { AOM_CDF4(11572, 23589, 28220) }, + { AOM_CDF4(5532, 15034, 21446) }, + { AOM_CDF4(2460, 7150, 11456) }, + { AOM_CDF4(29874, 32619, 32699) }, + { AOM_CDF4(21621, 31071, 32201) }, + { AOM_CDF4(12511, 24747, 28992) }, + { AOM_CDF4(6281, 16395, 22748) }, + { AOM_CDF4(3246, 9278, 14497) }, + { AOM_CDF4(29715, 32625, 32712) }, + { AOM_CDF4(20958, 31011, 32283) }, + { AOM_CDF4(11233, 23671, 28806) }, + { AOM_CDF4(6012, 16128, 22868) }, + { AOM_CDF4(3427, 9851, 15414) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11016, 22111, 26794) }, + { AOM_CDF4(25946, 32357, 32677) }, + { AOM_CDF4(17890, 30452, 32252) }, + { AOM_CDF4(11678, 25142, 29816) }, + { AOM_CDF4(6720, 17534, 24584) }, + { AOM_CDF4(4230, 11665, 17820) }, + { AOM_CDF4(28400, 32623, 32747) }, + { AOM_CDF4(21164, 31668, 32575) }, + { AOM_CDF4(13572, 27388, 31182) }, + { AOM_CDF4(8234, 20750, 27358) }, + { AOM_CDF4(5065, 14055, 20897) }, + { AOM_CDF4(28981, 32547, 32705) }, + { AOM_CDF4(18681, 30543, 32239) }, + { AOM_CDF4(10919, 24075, 29286) }, + { AOM_CDF4(6431, 17199, 24077) }, + { AOM_CDF4(3819, 10464, 16618) }, + { AOM_CDF4(26870, 32467, 32693) }, + { AOM_CDF4(19041, 30831, 32347) }, + { AOM_CDF4(11794, 25211, 30016) }, + { AOM_CDF4(6888, 18019, 24970) }, + { AOM_CDF4(4370, 12363, 18992) }, + { AOM_CDF4(29578, 32670, 32744) }, + { AOM_CDF4(23159, 32007, 32613) }, + { AOM_CDF4(15315, 28669, 31676) }, + { AOM_CDF4(9298, 22607, 28782) }, + { AOM_CDF4(6144, 15913, 22968) }, + { AOM_CDF4(28110, 32499, 32669) }, + { AOM_CDF4(21574, 30937, 32015) }, + { AOM_CDF4(12759, 24818, 28727) }, + { AOM_CDF4(6545, 16761, 23042) }, + { AOM_CDF4(3649, 10597, 16833) }, + { AOM_CDF4(28163, 32552, 32728) }, + { AOM_CDF4(22101, 31469, 32464) }, + { AOM_CDF4(13160, 25472, 30143) }, + { AOM_CDF4(7303, 18684, 25468) }, + { AOM_CDF4(5241, 13975, 20955) }, + { AOM_CDF4(28400, 32631, 32744) }, + { AOM_CDF4(22104, 31793, 32603) }, + { AOM_CDF4(13557, 26571, 30846) }, + { AOM_CDF4(7749, 19861, 26675) }, + { AOM_CDF4(4873, 14030, 21234) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9800, 17635, 21073) }, + { AOM_CDF4(26153, 31885, 32527) }, + { AOM_CDF4(15038, 27852, 31006) }, + { AOM_CDF4(8718, 20564, 26486) }, + { AOM_CDF4(5128, 14076, 20514) }, + { AOM_CDF4(2636, 7566, 11925) }, + { AOM_CDF4(27551, 32504, 32701) }, + { AOM_CDF4(18310, 30054, 32100) }, + { AOM_CDF4(10211, 23420, 29082) }, + { AOM_CDF4(6222, 16876, 23916) }, + { AOM_CDF4(3462, 9954, 15498) }, + { AOM_CDF4(29991, 32633, 32721) }, + { AOM_CDF4(19883, 30751, 32201) }, + { AOM_CDF4(11141, 24184, 29285) }, + { AOM_CDF4(6420, 16940, 23774) }, + { AOM_CDF4(3392, 9753, 15118) }, + { AOM_CDF4(28465, 32616, 32712) }, + { AOM_CDF4(19850, 30702, 
32244) }, + { AOM_CDF4(10983, 24024, 29223) }, + { AOM_CDF4(6294, 16770, 23582) }, + { AOM_CDF4(3244, 9283, 14509) }, + { AOM_CDF4(30023, 32717, 32748) }, + { AOM_CDF4(22940, 32032, 32626) }, + { AOM_CDF4(14282, 27928, 31473) }, + { AOM_CDF4(8562, 21327, 27914) }, + { AOM_CDF4(4846, 13393, 19919) }, + { AOM_CDF4(29981, 32590, 32695) }, + { AOM_CDF4(20465, 30963, 32166) }, + { AOM_CDF4(11479, 23579, 28195) }, + { AOM_CDF4(5916, 15648, 22073) }, + { AOM_CDF4(3031, 8605, 13398) }, + { AOM_CDF4(31146, 32691, 32739) }, + { AOM_CDF4(23106, 31724, 32444) }, + { AOM_CDF4(13783, 26738, 30439) }, + { AOM_CDF4(7852, 19468, 25807) }, + { AOM_CDF4(3860, 11124, 16853) }, + { AOM_CDF4(31014, 32724, 32748) }, + { AOM_CDF4(23629, 32109, 32628) }, + { AOM_CDF4(14747, 28115, 31403) }, + { AOM_CDF4(8545, 21242, 27478) }, + { AOM_CDF4(4574, 12781, 19067) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9185, 19694, 24688) }, + { AOM_CDF4(26081, 31985, 32621) }, + { AOM_CDF4(16015, 29000, 31787) }, + { AOM_CDF4(10542, 23690, 29206) }, + { AOM_CDF4(6732, 17945, 24677) }, + { AOM_CDF4(3916, 11039, 16722) }, + { AOM_CDF4(28224, 32566, 32744) }, + { AOM_CDF4(19100, 31138, 32485) }, + { AOM_CDF4(12528, 26620, 30879) }, + { AOM_CDF4(7741, 20277, 26885) }, + { AOM_CDF4(4566, 12845, 18990) }, + { AOM_CDF4(29933, 32593, 32718) }, + { AOM_CDF4(17670, 30333, 32155) }, + { AOM_CDF4(10385, 23600, 28909) }, + { AOM_CDF4(6243, 16236, 22407) }, + { AOM_CDF4(3976, 10389, 16017) }, + { AOM_CDF4(28377, 32561, 32738) }, + { AOM_CDF4(19366, 31175, 32482) }, + { AOM_CDF4(13327, 27175, 31094) }, + { AOM_CDF4(8258, 20769, 27143) }, + { AOM_CDF4(4703, 13198, 19527) }, + { AOM_CDF4(31086, 32706, 32748) }, + { AOM_CDF4(22853, 31902, 32583) }, + { AOM_CDF4(14759, 28186, 31419) }, + { AOM_CDF4(9284, 22382, 28348) }, + { AOM_CDF4(5585, 15192, 21868) }, + { AOM_CDF4(28291, 32652, 32746) }, + { AOM_CDF4(19849, 32107, 32571) }, + { AOM_CDF4(14834, 26818, 29214) }, + { AOM_CDF4(10306, 22594, 28672) }, + { AOM_CDF4(6615, 17384, 23384) }, + { AOM_CDF4(28947, 32604, 32745) }, + { AOM_CDF4(25625, 32289, 32646) }, + { AOM_CDF4(18758, 28672, 31403) }, + { AOM_CDF4(10017, 23430, 28523) }, + { AOM_CDF4(6862, 15269, 22131) }, + { AOM_CDF4(23933, 32509, 32739) }, + { AOM_CDF4(19927, 31495, 32631) }, + { AOM_CDF4(11903, 26023, 30621) }, + { AOM_CDF4(7026, 20094, 27252) }, + { AOM_CDF4(5998, 18106, 24437) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4456, 11274, 15533) }, + { AOM_CDF4(21219, 29079, 31616) }, + { AOM_CDF4(11173, 23774, 28567) }, + { AOM_CDF4(7282, 18293, 24263) }, + { AOM_CDF4(4890, 13286, 19115) }, + { AOM_CDF4(1890, 5508, 8659) }, + { AOM_CDF4(26651, 32136, 32647) }, + { AOM_CDF4(14630, 28254, 31455) }, + { AOM_CDF4(8716, 21287, 27395) }, + { AOM_CDF4(5615, 15331, 22008) }, + { AOM_CDF4(2675, 7700, 12150) }, + { AOM_CDF4(29954, 32526, 32690) }, + { AOM_CDF4(16126, 28982, 31633) }, + { AOM_CDF4(9030, 21361, 27352) }, + { AOM_CDF4(5411, 14793, 21271) }, + { AOM_CDF4(2943, 8422, 13163) }, + { AOM_CDF4(29539, 32601, 32730) }, + { AOM_CDF4(18125, 30385, 32201) }, + { AOM_CDF4(10422, 24090, 29468) }, + { AOM_CDF4(6468, 17487, 24438) }, + { AOM_CDF4(2970, 8653, 13531) }, + { AOM_CDF4(30912, 32715, 32748) }, + { AOM_CDF4(20666, 31373, 32497) }, + { AOM_CDF4(12509, 26640, 30917) }, + { AOM_CDF4(8058, 20629, 27290) }, + { AOM_CDF4(4231, 12006, 18052) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) 
}, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10202, 20633, 25484) }, + { AOM_CDF4(27336, 31445, 32352) }, + { AOM_CDF4(12420, 24384, 28552) }, + { AOM_CDF4(7648, 18115, 23856) }, + { AOM_CDF4(5662, 14341, 19902) }, + { AOM_CDF4(3611, 10328, 15390) }, + { AOM_CDF4(30945, 32616, 32736) }, + { AOM_CDF4(18682, 30505, 32253) }, + { AOM_CDF4(11513, 25336, 30203) }, + { AOM_CDF4(7449, 19452, 26148) }, + { AOM_CDF4(4482, 13051, 18886) }, + { AOM_CDF4(32022, 32690, 32747) }, + { AOM_CDF4(18578, 30501, 32146) }, + { AOM_CDF4(11249, 23368, 28631) }, + { AOM_CDF4(5645, 16958, 22158) }, + { AOM_CDF4(5009, 11444, 16637) }, + { AOM_CDF4(31357, 32710, 32748) }, + { AOM_CDF4(21552, 31494, 32504) }, + { AOM_CDF4(13891, 27677, 31340) }, + { AOM_CDF4(9051, 22098, 28172) }, + { AOM_CDF4(5190, 13377, 19486) }, + { AOM_CDF4(32364, 32740, 32748) }, + { AOM_CDF4(24839, 31907, 32551) }, + { AOM_CDF4(17160, 28779, 31696) }, + { AOM_CDF4(12452, 24137, 29602) }, + { AOM_CDF4(6165, 15389, 22477) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(2575, 7281, 11077) }, + { AOM_CDF4(14002, 20866, 25402) }, + { AOM_CDF4(6343, 15056, 19658) }, + { AOM_CDF4(4474, 11858, 17041) }, + { AOM_CDF4(2865, 8299, 12534) }, + { AOM_CDF4(1344, 3949, 6391) }, + { AOM_CDF4(24720, 31239, 32459) }, + { AOM_CDF4(12585, 25356, 29968) }, + { AOM_CDF4(7181, 18246, 24444) }, + { AOM_CDF4(5025, 13667, 19885) }, + { AOM_CDF4(2521, 7304, 11605) }, + { AOM_CDF4(29908, 32252, 32584) }, + { AOM_CDF4(17421, 29156, 31575) }, + { AOM_CDF4(9889, 22188, 27782) }, + { AOM_CDF4(5878, 15647, 22123) }, + { AOM_CDF4(2814, 8665, 13323) }, + { AOM_CDF4(30183, 32568, 32713) }, + { AOM_CDF4(18528, 30195, 32049) }, + { AOM_CDF4(10982, 24606, 29657) }, + { AOM_CDF4(6957, 18165, 25231) }, + { AOM_CDF4(3508, 10118, 15468) }, + { AOM_CDF4(31761, 32736, 32748) }, + { AOM_CDF4(21041, 31328, 32546) }, + { AOM_CDF4(12568, 26732, 31166) }, + { AOM_CDF4(8052, 20720, 27733) }, + { AOM_CDF4(4336, 12192, 18396) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(7062, 16472, 22319) }, + { AOM_CDF4(24538, 32261, 32674) }, + { AOM_CDF4(13675, 28041, 31779) }, + { AOM_CDF4(8590, 20674, 27631) }, + { AOM_CDF4(5685, 14675, 22013) }, + { AOM_CDF4(3655, 9898, 15731) }, + { AOM_CDF4(26493, 32418, 32658) }, + { AOM_CDF4(16376, 29342, 32090) }, + { AOM_CDF4(10594, 22649, 28970) }, + { AOM_CDF4(8176, 17170, 24303) }, + { AOM_CDF4(5605, 12694, 19139) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(23888, 31902, 32542) }, + { AOM_CDF4(18612, 29687, 31987) }, + { AOM_CDF4(16245, 24852, 29249) }, + { AOM_CDF4(15765, 22608, 27559) }, + { AOM_CDF4(19895, 24699, 27510) }, + { AOM_CDF4(28401, 32212, 32457) }, + { AOM_CDF4(15274, 27825, 30980) }, + { AOM_CDF4(9364, 18128, 24332) }, + { AOM_CDF4(2283, 8193, 15082) }, + { AOM_CDF4(1228, 3972, 7881) }, + { AOM_CDF4(29455, 32469, 32620) }, + { AOM_CDF4(17981, 28245, 31388) }, + { AOM_CDF4(10921, 20098, 26240) }, + { AOM_CDF4(3743, 11829, 18657) }, + { AOM_CDF4(2374, 9593, 15715) }, + { AOM_CDF4(31068, 32466, 32635) }, + { AOM_CDF4(20321, 29572, 31971) }, + { AOM_CDF4(10771, 20255, 27119) }, + { AOM_CDF4(2795, 10410, 17361) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9320, 22102, 27840) }, + { AOM_CDF4(27057, 32464, 32724) }, + { AOM_CDF4(16331, 30268, 32309) }, + { AOM_CDF4(10319, 23935, 29720) }, + { AOM_CDF4(6189, 16448, 24106) }, + { AOM_CDF4(3589, 10884, 18808) }, + { AOM_CDF4(29026, 32624, 32748) }, + { AOM_CDF4(19226, 31507, 32587) }, + { AOM_CDF4(12692, 26921, 31203) }, + { AOM_CDF4(7049, 19532, 27635) }, + { AOM_CDF4(7727, 15669, 23252) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(28056, 32625, 32748) }, + { AOM_CDF4(22383, 32075, 32669) }, + { AOM_CDF4(15417, 27098, 31749) }, + { AOM_CDF4(18127, 26493, 27190) }, + { AOM_CDF4(5461, 16384, 21845) }, + { AOM_CDF4(27982, 32091, 32584) }, + { AOM_CDF4(19045, 29868, 31972) }, + { AOM_CDF4(10397, 22266, 27932) }, + { AOM_CDF4(5990, 13697, 21500) }, + { AOM_CDF4(1792, 6912, 15104) }, + { AOM_CDF4(28198, 32501, 32718) }, + { AOM_CDF4(21534, 31521, 32569) }, + { AOM_CDF4(11109, 25217, 30017) }, + { AOM_CDF4(5671, 15124, 26151) }, + { AOM_CDF4(4681, 14043, 18725) }, + { AOM_CDF4(28688, 32580, 32741) }, + { AOM_CDF4(22576, 32079, 32661) }, + { AOM_CDF4(10627, 22141, 28340) }, + { AOM_CDF4(9362, 14043, 28087) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7754, 16948, 22142) }, + { AOM_CDF4(25670, 32330, 32691) }, + { AOM_CDF4(15663, 29225, 31994) }, + { AOM_CDF4(9878, 23288, 29158) }, + { AOM_CDF4(6419, 17088, 24336) }, + { AOM_CDF4(3859, 11003, 17039) }, + { AOM_CDF4(27562, 32595, 32725) }, + { AOM_CDF4(17575, 30588, 32399) }, + { AOM_CDF4(10819, 24838, 30309) }, + { AOM_CDF4(7124, 18686, 25916) }, + { AOM_CDF4(4479, 12688, 19340) }, + { AOM_CDF4(28385, 32476, 32673) }, + { AOM_CDF4(15306, 29005, 31938) }, + { AOM_CDF4(8937, 21615, 28322) }, + { AOM_CDF4(5982, 15603, 22786) }, + { AOM_CDF4(3620, 10267, 16136) }, + { AOM_CDF4(27280, 32464, 32667) }, + { AOM_CDF4(15607, 29160, 32004) }, + { AOM_CDF4(9091, 22135, 28740) }, + { AOM_CDF4(6232, 16632, 24020) }, + { AOM_CDF4(4047, 11377, 17672) }, + { AOM_CDF4(29220, 32630, 32718) }, + { AOM_CDF4(19650, 31220, 32462) }, + { AOM_CDF4(13050, 26312, 30827) }, + { AOM_CDF4(9228, 20870, 27468) }, + { AOM_CDF4(6146, 15149, 21971) }, + { AOM_CDF4(30169, 32481, 32623) }, + { AOM_CDF4(17212, 29311, 31554) }, + { AOM_CDF4(9911, 21311, 26882) }, + { AOM_CDF4(4487, 13314, 20372) }, + { AOM_CDF4(2570, 7772, 12889) }, + { AOM_CDF4(30924, 32613, 32708) }, + { AOM_CDF4(19490, 30206, 32107) }, + { AOM_CDF4(11232, 23998, 29276) }, + { AOM_CDF4(6769, 17955, 25035) }, + { AOM_CDF4(4398, 12623, 19214) }, + { AOM_CDF4(30609, 32627, 32722) }, + { AOM_CDF4(19370, 30582, 32287) }, + { AOM_CDF4(10457, 23619, 29409) }, + { AOM_CDF4(6443, 17637, 24834) }, + { AOM_CDF4(4645, 13236, 20106) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8626, 20271, 26216) }, + { AOM_CDF4(26707, 32406, 32711) }, + { AOM_CDF4(16999, 30329, 32286) }, + { AOM_CDF4(11445, 25123, 30286) }, + { AOM_CDF4(6411, 18828, 25601) }, + { AOM_CDF4(6801, 12458, 20248) }, + { AOM_CDF4(29918, 32682, 32748) }, + { AOM_CDF4(20649, 31739, 32618) }, + { AOM_CDF4(12879, 27773, 31581) }, + { AOM_CDF4(7896, 21751, 28244) }, + { AOM_CDF4(5260, 14870, 23698) }, + { AOM_CDF4(29252, 32593, 32731) }, + { AOM_CDF4(17072, 30460, 32294) }, + { AOM_CDF4(10653, 24143, 29365) }, + { AOM_CDF4(6536, 17490, 23983) }, + { AOM_CDF4(4929, 13170, 20085) }, + { AOM_CDF4(28137, 32518, 32715) }, + { AOM_CDF4(18171, 30784, 32407) }, + { AOM_CDF4(11437, 25436, 30459) }, + { AOM_CDF4(7252, 18534, 26176) }, + { AOM_CDF4(4126, 13353, 20978) }, + { AOM_CDF4(31162, 32726, 32748) }, + { AOM_CDF4(23017, 32222, 32701) }, + { AOM_CDF4(15629, 29233, 32046) }, + { AOM_CDF4(9387, 22621, 29480) }, + { AOM_CDF4(6922, 17616, 25010) }, + { AOM_CDF4(28838, 32265, 32614) }, + { AOM_CDF4(19701, 30206, 31920) }, + { AOM_CDF4(11214, 22410, 27933) }, + { 
AOM_CDF4(5320, 14177, 23034) }, + { AOM_CDF4(5049, 12881, 17827) }, + { AOM_CDF4(27484, 32471, 32734) }, + { AOM_CDF4(21076, 31526, 32561) }, + { AOM_CDF4(12707, 26303, 31211) }, + { AOM_CDF4(8169, 21722, 28219) }, + { AOM_CDF4(6045, 19406, 27042) }, + { AOM_CDF4(27753, 32572, 32745) }, + { AOM_CDF4(20832, 31878, 32653) }, + { AOM_CDF4(13250, 27356, 31674) }, + { AOM_CDF4(7718, 21508, 29858) }, + { AOM_CDF4(7209, 18350, 25559) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7876, 16901, 21741) }, + { AOM_CDF4(24001, 31898, 32625) }, + { AOM_CDF4(14529, 27959, 31451) }, + { AOM_CDF4(8273, 20818, 27258) }, + { AOM_CDF4(5278, 14673, 21510) }, + { AOM_CDF4(2983, 8843, 14039) }, + { AOM_CDF4(28016, 32574, 32732) }, + { AOM_CDF4(17471, 30306, 32301) }, + { AOM_CDF4(10224, 24063, 29728) }, + { AOM_CDF4(6602, 17954, 25052) }, + { AOM_CDF4(4002, 11585, 17759) }, + { AOM_CDF4(30190, 32634, 32739) }, + { AOM_CDF4(17497, 30282, 32270) }, + { AOM_CDF4(10229, 23729, 29538) }, + { AOM_CDF4(6344, 17211, 24440) }, + { AOM_CDF4(3849, 11189, 17108) }, + { AOM_CDF4(28570, 32583, 32726) }, + { AOM_CDF4(17521, 30161, 32238) }, + { AOM_CDF4(10153, 23565, 29378) }, + { AOM_CDF4(6455, 17341, 24443) }, + { AOM_CDF4(3907, 11042, 17024) }, + { AOM_CDF4(30689, 32715, 32748) }, + { AOM_CDF4(21546, 31840, 32610) }, + { AOM_CDF4(13547, 27581, 31459) }, + { AOM_CDF4(8912, 21757, 28309) }, + { AOM_CDF4(5548, 15080, 22046) }, + { AOM_CDF4(30783, 32540, 32685) }, + { AOM_CDF4(17540, 29528, 31668) }, + { AOM_CDF4(10160, 21468, 26783) }, + { AOM_CDF4(4724, 13393, 20054) }, + { AOM_CDF4(2702, 8174, 13102) }, + { AOM_CDF4(31648, 32686, 32742) }, + { AOM_CDF4(20954, 31094, 32337) }, + { AOM_CDF4(12420, 25698, 30179) }, + { AOM_CDF4(7304, 19320, 26248) }, + { AOM_CDF4(4366, 12261, 18864) }, + { AOM_CDF4(31581, 32723, 32748) }, + { AOM_CDF4(21373, 31586, 32525) }, + { AOM_CDF4(12744, 26625, 30885) }, + { AOM_CDF4(7431, 20322, 26950) }, + { AOM_CDF4(4692, 13323, 20111) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(7833, 18369, 24095) }, + { AOM_CDF4(26650, 32273, 32702) }, + { AOM_CDF4(16371, 29961, 32191) }, + { AOM_CDF4(11055, 24082, 29629) }, + { AOM_CDF4(6892, 18644, 25400) }, + { AOM_CDF4(5006, 13057, 19240) }, + { AOM_CDF4(29834, 32666, 32748) }, + { AOM_CDF4(19577, 31335, 32570) }, + { AOM_CDF4(12253, 26509, 31122) }, + { AOM_CDF4(7991, 20772, 27711) }, + { AOM_CDF4(5677, 15910, 23059) }, + { AOM_CDF4(30109, 32532, 32720) }, + { AOM_CDF4(16747, 30166, 32252) }, + { AOM_CDF4(10134, 23542, 29184) }, + { AOM_CDF4(5791, 16176, 23556) }, + { AOM_CDF4(4362, 10414, 17284) }, + { AOM_CDF4(29492, 32626, 32748) }, + { AOM_CDF4(19894, 31402, 32525) }, + { AOM_CDF4(12942, 27071, 30869) }, + { AOM_CDF4(8346, 21216, 27405) }, + { AOM_CDF4(6572, 17087, 23859) }, + { AOM_CDF4(32035, 32735, 32748) }, + { AOM_CDF4(22957, 31838, 32618) }, + { AOM_CDF4(14724, 28572, 31772) }, + { AOM_CDF4(10364, 23999, 29553) }, + { AOM_CDF4(7004, 18433, 25655) }, + { AOM_CDF4(27528, 32277, 32681) }, + { AOM_CDF4(16959, 31171, 32096) }, + { AOM_CDF4(10486, 23593, 27962) }, + { AOM_CDF4(8192, 16384, 23211) }, + { AOM_CDF4(8937, 17873, 20852) }, + { AOM_CDF4(27715, 32002, 32615) }, + { AOM_CDF4(15073, 29491, 31676) }, + { AOM_CDF4(11264, 24576, 28672) }, + { AOM_CDF4(2341, 18725, 23406) }, + { AOM_CDF4(7282, 18204, 25486) }, + { AOM_CDF4(28547, 32213, 32657) }, + { AOM_CDF4(20788, 29773, 32239) }, + { AOM_CDF4(6780, 21469, 30508) }, + { AOM_CDF4(5958, 14895, 23831) }, + { AOM_CDF4(16384, 21845, 27307) }, + { AOM_CDF4(8192, 16384, 24576) } } 
}, + { { { AOM_CDF4(5992, 14304, 19765) }, + { AOM_CDF4(22612, 31238, 32456) }, + { AOM_CDF4(13456, 27162, 31087) }, + { AOM_CDF4(8001, 20062, 26504) }, + { AOM_CDF4(5168, 14105, 20764) }, + { AOM_CDF4(2632, 7771, 12385) }, + { AOM_CDF4(27034, 32344, 32709) }, + { AOM_CDF4(15850, 29415, 31997) }, + { AOM_CDF4(9494, 22776, 28841) }, + { AOM_CDF4(6151, 16830, 23969) }, + { AOM_CDF4(3461, 10039, 15722) }, + { AOM_CDF4(30134, 32569, 32731) }, + { AOM_CDF4(15638, 29422, 31945) }, + { AOM_CDF4(9150, 21865, 28218) }, + { AOM_CDF4(5647, 15719, 22676) }, + { AOM_CDF4(3402, 9772, 15477) }, + { AOM_CDF4(28530, 32586, 32735) }, + { AOM_CDF4(17139, 30298, 32292) }, + { AOM_CDF4(10200, 24039, 29685) }, + { AOM_CDF4(6419, 17674, 24786) }, + { AOM_CDF4(3544, 10225, 15824) }, + { AOM_CDF4(31333, 32726, 32748) }, + { AOM_CDF4(20618, 31487, 32544) }, + { AOM_CDF4(12901, 27217, 31232) }, + { AOM_CDF4(8624, 21734, 28171) }, + { AOM_CDF4(5104, 14191, 20748) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11206, 21090, 26561) }, + { AOM_CDF4(28759, 32279, 32671) }, + { AOM_CDF4(14171, 27952, 31569) }, + { AOM_CDF4(9743, 22907, 29141) }, + { AOM_CDF4(6871, 17886, 24868) }, + { AOM_CDF4(4960, 13152, 19315) }, + { AOM_CDF4(31077, 32661, 32748) }, + { AOM_CDF4(19400, 31195, 32515) }, + { AOM_CDF4(12752, 26858, 31040) }, + { AOM_CDF4(8370, 22098, 28591) }, + { AOM_CDF4(5457, 15373, 22298) }, + { AOM_CDF4(31697, 32706, 32748) }, + { AOM_CDF4(17860, 30657, 32333) }, + { AOM_CDF4(12510, 24812, 29261) }, + { AOM_CDF4(6180, 19124, 24722) }, + { AOM_CDF4(5041, 13548, 17959) }, + { AOM_CDF4(31552, 32716, 32748) }, + { AOM_CDF4(21908, 31769, 32623) }, + { AOM_CDF4(14470, 28201, 31565) }, + { AOM_CDF4(9493, 22982, 28608) }, + { AOM_CDF4(6858, 17240, 24137) }, + { AOM_CDF4(32543, 32752, 32756) }, + { AOM_CDF4(24286, 32097, 32666) }, + { AOM_CDF4(15958, 29217, 32024) }, + { AOM_CDF4(10207, 24234, 29958) }, + { AOM_CDF4(6929, 18305, 25652) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4137, 10847, 15682) }, + { AOM_CDF4(17824, 27001, 30058) }, + { AOM_CDF4(10204, 22796, 28291) }, + { AOM_CDF4(6076, 15935, 22125) }, + { AOM_CDF4(3852, 10937, 16816) }, + { AOM_CDF4(2252, 6324, 10131) }, + { AOM_CDF4(25840, 32016, 32662) }, + { AOM_CDF4(15109, 28268, 31531) }, + { AOM_CDF4(9385, 22231, 28340) }, + { AOM_CDF4(6082, 16672, 23479) }, + { AOM_CDF4(3318, 9427, 14681) }, + { AOM_CDF4(30594, 32574, 32718) }, + { AOM_CDF4(16836, 29552, 31859) }, + { 
AOM_CDF4(9556, 22542, 28356) }, + { AOM_CDF4(6305, 16725, 23540) }, + { AOM_CDF4(3376, 9895, 15184) }, + { AOM_CDF4(29383, 32617, 32745) }, + { AOM_CDF4(18891, 30809, 32401) }, + { AOM_CDF4(11688, 25942, 30687) }, + { AOM_CDF4(7468, 19469, 26651) }, + { AOM_CDF4(3909, 11358, 17012) }, + { AOM_CDF4(31564, 32736, 32748) }, + { AOM_CDF4(20906, 31611, 32600) }, + { AOM_CDF4(13191, 27621, 31537) }, + { AOM_CDF4(8768, 22029, 28676) }, + { AOM_CDF4(5079, 14109, 20906) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } }; + +static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE( + NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) }, + { AOM_CDF3(29600, 31446) }, + { AOM_CDF3(30844, 31878) }, + { AOM_CDF3(24926, 28948) } }, + { { AOM_CDF3(21365, 30026) }, + { AOM_CDF3(30512, 32423) }, + { AOM_CDF3(31658, 32621) }, + { AOM_CDF3(29630, 31881) } } }, + { { { AOM_CDF3(5717, 26477) }, + { AOM_CDF3(30491, 31703) }, + { AOM_CDF3(31550, 32158) }, + { AOM_CDF3(29648, 31491) } }, + { { AOM_CDF3(12608, 27820) }, + { AOM_CDF3(30680, 32225) }, + { AOM_CDF3(30809, 32335) }, + { AOM_CDF3(31299, 32423) } } }, + { { { AOM_CDF3(1786, 12612) }, + { AOM_CDF3(30663, 31625) }, + { AOM_CDF3(32339, 32468) }, + { AOM_CDF3(31148, 31833) } }, + { { AOM_CDF3(18857, 23865) }, + { AOM_CDF3(31428, 32428) }, + { AOM_CDF3(31744, 32373) }, + { AOM_CDF3(31775, 32526) } } }, + { { { AOM_CDF3(1787, 2532) }, + { AOM_CDF3(30832, 31662) 
}, + { AOM_CDF3(31824, 32682) }, + { AOM_CDF3(32133, 32569) } }, + { { AOM_CDF3(13751, 22235) }, + { AOM_CDF3(32089, 32409) }, + { AOM_CDF3(27084, 27920) }, + { AOM_CDF3(29291, 32594) } } }, + { { { AOM_CDF3(1725, 3449) }, + { AOM_CDF3(31102, 31935) }, + { AOM_CDF3(32457, 32613) }, + { AOM_CDF3(32412, 32649) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(17560, 29888) }, + { AOM_CDF3(29671, 31549) }, + { AOM_CDF3(31007, 32056) }, + { AOM_CDF3(27286, 30006) } }, + { { AOM_CDF3(26594, 31212) }, + { AOM_CDF3(31208, 32582) }, + { AOM_CDF3(31835, 32637) }, + { AOM_CDF3(30595, 32206) } } }, + { { { AOM_CDF3(15239, 29932) }, + { AOM_CDF3(31315, 32095) }, + { AOM_CDF3(32130, 32434) }, + { AOM_CDF3(30864, 31996) } }, + { { AOM_CDF3(26279, 30968) }, + { AOM_CDF3(31142, 32495) }, + { AOM_CDF3(31713, 32540) }, + { AOM_CDF3(31929, 32594) } } }, + { { { AOM_CDF3(2644, 25198) }, + { AOM_CDF3(32038, 32451) }, + { AOM_CDF3(32639, 32695) }, + { AOM_CDF3(32166, 32518) } }, + { { AOM_CDF3(17187, 27668) }, + { AOM_CDF3(31714, 32550) }, + { AOM_CDF3(32283, 32678) }, + { AOM_CDF3(31930, 32563) } } }, + { { { AOM_CDF3(1044, 2257) }, + { AOM_CDF3(30755, 31923) }, + { AOM_CDF3(32208, 32693) }, + { AOM_CDF3(32244, 32615) } }, + { { AOM_CDF3(21317, 26207) }, + { AOM_CDF3(29133, 30868) }, + { AOM_CDF3(29311, 31231) }, + { AOM_CDF3(29657, 31087) } } }, + { { { AOM_CDF3(478, 1834) }, + { AOM_CDF3(31005, 31987) }, + { AOM_CDF3(32317, 32724) }, + { AOM_CDF3(30865, 32648) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(20092, 30774) }, + { AOM_CDF3(30695, 32020) }, + { AOM_CDF3(31131, 32103) }, + { AOM_CDF3(28666, 30870) } }, + { { AOM_CDF3(27258, 31095) }, + { AOM_CDF3(31804, 32623) }, + { AOM_CDF3(31763, 32528) }, + { AOM_CDF3(31438, 32506) } } }, + { { { AOM_CDF3(18049, 30489) }, + { AOM_CDF3(31706, 32286) }, + { AOM_CDF3(32163, 32473) }, + { AOM_CDF3(31550, 32184) } }, + { { AOM_CDF3(27116, 30842) }, + { AOM_CDF3(31971, 32598) }, + { AOM_CDF3(32088, 32576) }, + { AOM_CDF3(32067, 32664) } } }, + { { { AOM_CDF3(12854, 29093) }, + { AOM_CDF3(32272, 32558) }, + { AOM_CDF3(32667, 32729) }, + { AOM_CDF3(32306, 32585) } }, + { { AOM_CDF3(25476, 30366) }, + { AOM_CDF3(32169, 32687) }, + { AOM_CDF3(32479, 32689) }, + { AOM_CDF3(31673, 32634) } } }, + { { { AOM_CDF3(2809, 19301) }, + { AOM_CDF3(32205, 32622) }, + { AOM_CDF3(32338, 32730) }, + { AOM_CDF3(31786, 32616) } }, + { { AOM_CDF3(22737, 29105) }, + { AOM_CDF3(30810, 32362) }, + { AOM_CDF3(30014, 32627) }, + { AOM_CDF3(30528, 32574) } } }, + { { { AOM_CDF3(935, 3382) }, + { AOM_CDF3(30789, 31909) }, + { AOM_CDF3(32466, 32756) }, + { AOM_CDF3(30860, 32513) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(22497, 31198) }, + { AOM_CDF3(31715, 32495) }, + { AOM_CDF3(31606, 32337) }, + { AOM_CDF3(30388, 31990) } }, + { { AOM_CDF3(27877, 31584) }, + { AOM_CDF3(32170, 32728) }, + { AOM_CDF3(32155, 32688) }, + { AOM_CDF3(32219, 32702) } } }, + { { { AOM_CDF3(21457, 31043) }, + { AOM_CDF3(31951, 32483) }, + { AOM_CDF3(32153, 32562) }, + { AOM_CDF3(31473, 32215) } }, + { { AOM_CDF3(27558, 31151) }, + { AOM_CDF3(32020, 32640) }, + { AOM_CDF3(32097, 32575) }, + { AOM_CDF3(32242, 32719) } } }, + { { { AOM_CDF3(19980, 30591) }, + { AOM_CDF3(32219, 32597) }, + { AOM_CDF3(32581, 
32706) }, + { AOM_CDF3(31803, 32287) } }, + { { AOM_CDF3(26473, 30507) }, + { AOM_CDF3(32431, 32723) }, + { AOM_CDF3(32196, 32611) }, + { AOM_CDF3(31588, 32528) } } }, + { { { AOM_CDF3(24647, 30463) }, + { AOM_CDF3(32412, 32695) }, + { AOM_CDF3(32468, 32720) }, + { AOM_CDF3(31269, 32523) } }, + { { AOM_CDF3(28482, 31505) }, + { AOM_CDF3(32152, 32701) }, + { AOM_CDF3(31732, 32598) }, + { AOM_CDF3(31767, 32712) } } }, + { { { AOM_CDF3(12358, 24977) }, + { AOM_CDF3(31331, 32385) }, + { AOM_CDF3(32634, 32756) }, + { AOM_CDF3(30411, 32548) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } } }; + +#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_ diff --git a/third_party/aom/av1/common/txb_common.c b/third_party/aom/av1/common/txb_common.c new file mode 100644 index 0000000000..bf2bc36b04 --- /dev/null +++ b/third_party/aom/av1/common/txb_common.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/txb_common.h" + +// The ctx offset table when TX is TX_CLASS_2D. +// TX col and row indices are clamped to 4 + +const int8_t av1_nz_map_ctx_offset_4x4[16] = { + 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x8[64] = { + 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21, + 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x16[256] = { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x32[1024] = { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_4x8[32] = { + 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21, + 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x16[128] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x32[512] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x16[512] = { + 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x64[1024] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, + 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_64x32[1024] = { + 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_4x16[64] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 11, 11, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x4[64] = { + 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x32[256] = { + 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x8[256] = { + 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, + 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t *av1_nz_map_ctx_offset[19] = { + av1_nz_map_ctx_offset_4x4, // TX_4x4 + av1_nz_map_ctx_offset_8x8, // TX_8x8 + av1_nz_map_ctx_offset_16x16, // TX_16x16 + av1_nz_map_ctx_offset_32x32, // TX_32x32 + av1_nz_map_ctx_offset_32x32, // TX_64x64 + av1_nz_map_ctx_offset_4x8, // TX_4x8 + av1_nz_map_ctx_offset_16x4, // TX_8x4 + av1_nz_map_ctx_offset_8x16, // TX_8x16 + av1_nz_map_ctx_offset_32x8, // TX_16x8 + av1_nz_map_ctx_offset_16x32, // TX_16x32 + av1_nz_map_ctx_offset_32x16, // TX_32x16 + av1_nz_map_ctx_offset_32x64, // TX_32x64 + av1_nz_map_ctx_offset_64x32, // TX_64x32 + av1_nz_map_ctx_offset_4x16, // TX_4x16 + av1_nz_map_ctx_offset_16x4, // TX_16x4 + av1_nz_map_ctx_offset_8x32, // TX_8x32 + av1_nz_map_ctx_offset_32x8, // TX_32x8 + av1_nz_map_ctx_offset_32x64, // TX_16x64 + av1_nz_map_ctx_offset_32x16, // TX_64x16 +}; + +const int16_t av1_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, + 17, 33, 65, 129, 257, 513 }; +const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 
5, 6, 7, 8, 9 }; diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h new file mode 100644 index 0000000000..9628090b63 --- /dev/null +++ b/third_party/aom/av1/common/txb_common.h @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_TXB_COMMON_H_ +#define AOM_AV1_COMMON_TXB_COMMON_H_ + +#include "av1/common/av1_common_int.h" + +extern const int16_t av1_eob_group_start[12]; +extern const int16_t av1_eob_offset_bits[12]; + +extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL]; + +typedef struct txb_ctx { + int txb_skip_ctx; + int dc_sign_ctx; +} TXB_CTX; + +static const int base_level_count_to_index[13] = { + 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, +}; + +static const TX_CLASS tx_type_to_class[TX_TYPES] = { + TX_CLASS_2D, // DCT_DCT + TX_CLASS_2D, // ADST_DCT + TX_CLASS_2D, // DCT_ADST + TX_CLASS_2D, // ADST_ADST + TX_CLASS_2D, // FLIPADST_DCT + TX_CLASS_2D, // DCT_FLIPADST + TX_CLASS_2D, // FLIPADST_FLIPADST + TX_CLASS_2D, // ADST_FLIPADST + TX_CLASS_2D, // FLIPADST_ADST + TX_CLASS_2D, // IDTX + TX_CLASS_VERT, // V_DCT + TX_CLASS_HORIZ, // H_DCT + TX_CLASS_VERT, // V_ADST + TX_CLASS_HORIZ, // H_ADST + TX_CLASS_VERT, // V_FLIPADST + TX_CLASS_HORIZ, // H_FLIPADST +}; + +static INLINE int get_txb_bhl(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_high_log2[tx_size]; +} + +static INLINE int get_txb_wide(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_wide[tx_size]; +} + +static INLINE int get_txb_high(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_high[tx_size]; +} + +static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int height) { + return levels_buf + TX_PAD_TOP * (height + TX_PAD_HOR); +} + +static INLINE int get_padded_idx(const int idx, const int bhl) { + return idx + ((idx >> bhl) << TX_PAD_HOR_LOG2); +} + +static INLINE int get_br_ctx_2d(const uint8_t *const levels, + const int c, // raster order + const int bhl) { + assert(c > 0); + const int col = c >> bhl; + const int row = c - (col << bhl); + const int stride = (1 << bhl) + TX_PAD_HOR; + const int pos = col * stride + row; + int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE); + mag = AOMMIN((mag + 1) >> 1, 6); + //((row | col) < 2) is equivalent to ((row < 2) && (col < 2)) + if ((row | col) < 2) return mag + 7; + return mag + 14; +} + +static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order + const int bhl, + const TX_CLASS tx_class) { + const int col = c >> bhl; + const int row = c - (col << bhl); + if (c == 0) return 0; + if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) || + (tx_class == TX_CLASS_HORIZ && col == 0) || + (tx_class == TX_CLASS_VERT && row == 0)) + return 7; + return 14; +} + +static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, + const int c, // raster order + const int bhl, const TX_CLASS 
tx_class) { + const int col = c >> bhl; + const int row = c - (col << bhl); + const int stride = (1 << bhl) + TX_PAD_HOR; + const int pos = col * stride + row; + int mag = levels[pos + 1]; + mag += levels[pos + stride]; + switch (tx_class) { + case TX_CLASS_2D: + mag += levels[pos + stride + 1]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if ((row < 2) && (col < 2)) return mag + 7; + break; + case TX_CLASS_HORIZ: + mag += levels[pos + (stride << 1)]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (col == 0) return mag + 7; + break; + case TX_CLASS_VERT: + mag += levels[pos + 2]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (row == 0) return mag + 7; + break; + default: break; + } + + return mag + 14; +} + +static const uint8_t clip_max3[256] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +}; + +static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels, + const int bhl, const TX_CLASS tx_class) { + int mag; + + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. 
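+  // The 'levels' pointer passed in is already offset to the current
+  // coefficient inside a padded buffer: each line of (1 << bhl) coefficients
+  // is stored contiguously and followed by TX_PAD_HOR padding bytes (see
+  // set_levels() and get_padded_idx() above), so levels[1] is the next
+  // coefficient within the same line and levels[(1 << bhl) + TX_PAD_HOR] is
+  // the corresponding coefficient in the neighbouring line, with the padding
+  // keeping these reads in bounds at the block edges. The { a, b } comments
+  // below give each tap's displacement from the current coefficient.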
+ mag = clip_max3[levels[(1 << bhl) + TX_PAD_HOR]]; // { 0, 1 } + mag += clip_max3[levels[1]]; // { 1, 0 } + + if (tx_class == TX_CLASS_2D) { + mag += clip_max3[levels[(1 << bhl) + TX_PAD_HOR + 1]]; // { 1, 1 } + mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]]; // { 0, 2 } + mag += clip_max3[levels[2]]; // { 2, 0 } + } else if (tx_class == TX_CLASS_VERT) { + mag += clip_max3[levels[2]]; // { 2, 0 } + mag += clip_max3[levels[3]]; // { 3, 0 } + mag += clip_max3[levels[4]]; // { 4, 0 } + } else { + mag += clip_max3[levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]]; // { 0, 2 } + mag += clip_max3[levels[(3 << bhl) + (3 << TX_PAD_HOR_LOG2)]]; // { 0, 3 } + mag += clip_max3[levels[(4 << bhl) + (4 << TX_PAD_HOR_LOG2)]]; // { 0, 4 } + } + + return mag; +} + +#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D +#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5) +#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10) + +static const int nz_map_ctx_offset_1d[32] = { + NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, +}; + +static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats( + const int stats, + const int coeff_idx, // raster order + const int bhl, const TX_SIZE tx_size, const TX_CLASS tx_class) { + // tx_class == 0(TX_CLASS_2D) + if ((tx_class | coeff_idx) == 0) return 0; + int ctx = (stats + 1) >> 1; + ctx = AOMMIN(ctx, 4); + switch (tx_class) { + case TX_CLASS_2D: { + // This is the algorithm to generate av1_nz_map_ctx_offset[][] + // const int width = tx_size_wide[tx_size]; + // const int height = tx_size_high[tx_size]; + // if (width < height) { + // if (row < 2) return 11 + ctx; + // } else if (width > height) { + // if (col < 2) return 16 + ctx; + // } + // if (row + col < 2) return ctx + 1; + // if (row + col < 4) return 5 + ctx + 1; + // return 21 + ctx; + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; + } + case TX_CLASS_HORIZ: { + const int col = coeff_idx >> bhl; + return ctx + nz_map_ctx_offset_1d[col]; + } + case TX_CLASS_VERT: { + const int col = coeff_idx >> bhl; + const int row = coeff_idx - (col << bhl); + return ctx + nz_map_ctx_offset_1d[row]; + } + default: break; + } + return 0; +} + +typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)]; +typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)]; + +static INLINE int get_lower_levels_ctx_eob(int bhl, int width, int scan_idx) { + if (scan_idx == 0) return 0; + if (scan_idx <= (width << bhl) / 8) return 1; + if (scan_idx <= (width << bhl) / 4) return 2; + return 3; +} + +static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx, + int bhl, TX_SIZE tx_size) { + assert(coeff_idx > 0); + int mag; + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. 
+ levels = levels + get_padded_idx(coeff_idx, bhl); + mag = AOMMIN(levels[(1 << bhl) + TX_PAD_HOR], 3); // { 0, 1 } + mag += AOMMIN(levels[1], 3); // { 1, 0 } + mag += AOMMIN(levels[(1 << bhl) + TX_PAD_HOR + 1], 3); // { 1, 1 } + mag += AOMMIN(levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)], 3); // { 0, 2 } + mag += AOMMIN(levels[2], 3); // { 2, 0 } + + const int ctx = AOMMIN((mag + 1) >> 1, 4); + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; +} +static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels, + int coeff_idx, int bhl, + TX_SIZE tx_size, + TX_CLASS tx_class) { + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class); +} + +static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx, + int bhl, int width, + const uint8_t *levels, + int coeff_idx, TX_SIZE tx_size, + TX_CLASS tx_class) { + if (is_last) { + if (scan_idx == 0) return 0; + if (scan_idx <= (width << bhl) >> 3) return 1; + if (scan_idx <= (width << bhl) >> 2) return 2; + return 3; + } + return get_lower_levels_ctx(levels, coeff_idx, bhl, tx_size, tx_class); +} + +static INLINE void set_dc_sign(int *cul_level, int dc_val) { + if (dc_val < 0) + *cul_level |= 1 << COEFF_CONTEXT_BITS; + else if (dc_val > 0) + *cul_level += 2 << COEFF_CONTEXT_BITS; +} + +static void get_txb_ctx_general(const BLOCK_SIZE plane_bsize, + const TX_SIZE tx_size, const int plane, + const ENTROPY_CONTEXT *const a, + const ENTROPY_CONTEXT *const l, + TXB_CTX *const txb_ctx) { +#define MAX_TX_SIZE_UNIT 16 + static const int8_t signs[3] = { 0, -1, 1 }; + static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + }; + const int txb_w_unit = tx_size_wide_unit[tx_size]; + const int txb_h_unit = tx_size_high_unit[tx_size]; + int dc_sign = 0; + int k = 0; + + do { + const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_w_unit); + + k = 0; + do { + const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_h_unit); + + txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; + + if (plane == 0) { + if (plane_bsize == txsize_to_bsize[tx_size]) { + txb_ctx->txb_skip_ctx = 0; + } else { + // This is the algorithm to generate table skip_contexts[top][left]. + // const int max = AOMMIN(top | left, 4); + // const int min = AOMMIN(AOMMIN(top, left), 4); + // if (!max) + // txb_skip_ctx = 1; + // else if (!min) + // txb_skip_ctx = 2 + (max > 3); + // else if (max <= 3) + // txb_skip_ctx = 4; + // else if (min <= 3) + // txb_skip_ctx = 5; + // else + // txb_skip_ctx = 6; + static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 3, 5, 5, 5, 6 } }; + // For top and left, we only care about which of the following three + // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The + // spec calculates top and left with the Max() function. We can calculate + // an approximate max with bitwise OR because the real max and the + // approximate max belong to the same category. 
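+      // For instance, top = 1 and left = 2 give Max() = 2 but (top | left) = 3;
+      // both land in { 1, 2, 3 }, and rows/columns 1-3 of skip_contexts[][] are
+      // identical, so the lookup result is the same. In general the OR is zero
+      // only when both inputs are zero, and is >= 4 exactly when at least one
+      // input is >= 4, so the category never changes.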
+ int top = 0; + int left = 0; + + k = 0; + do { + top |= a[k]; + } while (++k < txb_w_unit); + top &= COEFF_CONTEXT_MASK; + top = AOMMIN(top, 4); + + k = 0; + do { + left |= l[k]; + } while (++k < txb_h_unit); + left &= COEFF_CONTEXT_MASK; + left = AOMMIN(left, 4); + + txb_ctx->txb_skip_ctx = skip_contexts[top][left]; + } + } else { + const int ctx_base = get_entropy_context(tx_size, a, l); + const int ctx_offset = (num_pels_log2_lookup[plane_bsize] > + num_pels_log2_lookup[txsize_to_bsize[tx_size]]) + ? 10 + : 7; + txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; + } +} + +#define SPECIALIZE_GET_TXB_CTX(w, h) \ + static void get_txb_ctx_##w##x##h( \ + const BLOCK_SIZE plane_bsize, const int plane, \ + const ENTROPY_CONTEXT *const a, const ENTROPY_CONTEXT *const l, \ + TXB_CTX *const txb_ctx) { \ + static const int8_t signs[3] = { 0, -1, 1 }; \ + static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, \ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 \ + }; \ + const TX_SIZE tx_size = TX_##w##X##h; \ + const int txb_w_unit = tx_size_wide_unit[tx_size]; \ + const int txb_h_unit = tx_size_high_unit[tx_size]; \ + int dc_sign = 0; \ + int k = 0; \ + \ + do { \ + const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; \ + assert(sign <= 2); \ + dc_sign += signs[sign]; \ + } while (++k < txb_w_unit); \ + \ + k = 0; \ + do { \ + const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; \ + assert(sign <= 2); \ + dc_sign += signs[sign]; \ + } while (++k < txb_h_unit); \ + \ + txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; \ + \ + if (plane == 0) { \ + if (plane_bsize == txsize_to_bsize[tx_size]) { \ + txb_ctx->txb_skip_ctx = 0; \ + } else { \ + static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, \ + { 2, 4, 4, 4, 5 }, \ + { 2, 4, 4, 4, 5 }, \ + { 2, 4, 4, 4, 5 }, \ + { 3, 5, 5, 5, 6 } }; \ + int top = 0; \ + int left = 0; \ + \ + k = 0; \ + do { \ + top |= a[k]; \ + } while (++k < txb_w_unit); \ + top &= COEFF_CONTEXT_MASK; \ + top = AOMMIN(top, 4); \ + \ + k = 0; \ + do { \ + left |= l[k]; \ + } while (++k < txb_h_unit); \ + left &= COEFF_CONTEXT_MASK; \ + left = AOMMIN(left, 4); \ + \ + txb_ctx->txb_skip_ctx = skip_contexts[top][left]; \ + } \ + } else { \ + const int ctx_base = get_entropy_context(tx_size, a, l); \ + const int ctx_offset = (num_pels_log2_lookup[plane_bsize] > \ + num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \ + ? 10 \ + : 7; \ + txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; \ + } \ + } + +SPECIALIZE_GET_TXB_CTX(4, 4) +SPECIALIZE_GET_TXB_CTX(8, 8) +SPECIALIZE_GET_TXB_CTX(16, 16) +SPECIALIZE_GET_TXB_CTX(32, 32) + +// Wrapper for get_txb_ctx that calls the specialized version of get_txb_ctc_* +// so that the compiler can compile away the while loops. 
+static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, + const TX_SIZE tx_size, const int plane, + const ENTROPY_CONTEXT *const a, + const ENTROPY_CONTEXT *const l, + TXB_CTX *const txb_ctx) { + switch (tx_size) { + case TX_4X4: get_txb_ctx_4x4(plane_bsize, plane, a, l, txb_ctx); break; + case TX_8X8: get_txb_ctx_8x8(plane_bsize, plane, a, l, txb_ctx); break; + case TX_16X16: get_txb_ctx_16x16(plane_bsize, plane, a, l, txb_ctx); break; + case TX_32X32: get_txb_ctx_32x32(plane_bsize, plane, a, l, txb_ctx); break; + default: + get_txb_ctx_general(plane_bsize, tx_size, plane, a, l, txb_ctx); + break; + } +} +#undef MAX_TX_SIZE_UNIT + +#endif // AOM_AV1_COMMON_TXB_COMMON_H_ diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c new file mode 100644 index 0000000000..4282b92bfa --- /dev/null +++ b/third_party/aom/av1/common/warped_motion.c @@ -0,0 +1,918 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/warped_motion.h" +#include "av1/common/scale.h" + +// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels +// at a time. The zoom/rotation/shear in the model are applied to the +// "fractional" position of each pixel, which therefore varies within +// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS. +// We need an extra 2 taps to fit this in, for a total of 8 taps. 
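+// Each of the (WARPEDPIXEL_PREC_SHIFTS * 3 + 1) rows below is an 8-tap kernel
+// whose taps sum to 128 (1 << FILTER_BITS). The three 64-row sections cover
+// fractional positions in [-1, 0), [0, 1) and [1, 2) in steps of 1/64 pel,
+// plus one duplicated row at the end of the range. As a sketch only (the real
+// filter below does a horizontal pass into a temporary buffer followed by a
+// vertical pass), applying one kernel at fractional position 'frac' (in units
+// of 1/64 pel, -64 <= frac <= 128) to pixels src[-3..4] around the integer
+// position looks like:
+//   const int16_t *coeffs = av1_warped_filter[frac + WARPEDPIXEL_PREC_SHIFTS];
+//   int32_t sum = 0;
+//   for (int m = 0; m < 8; ++m) sum += src[m - 3] * coeffs[m];
+//   const int out = ROUND_POWER_OF_TWO(sum, FILTER_BITS);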
+/* clang-format off */ +const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { + // [-1, 0) + { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 }, + { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 }, + { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 }, + { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 }, + { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 }, + { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 }, + { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 }, + { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 }, + { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 }, + { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 }, + { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 }, + { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 }, + { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 }, + { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 }, + { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 }, + { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 }, + { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 }, + { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 }, + { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 }, + { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 }, + { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 }, + { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 }, + { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 }, + { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 }, + { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 }, + { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 }, + { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 }, + { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 }, + { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 }, + { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 }, + { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 }, + { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 }, + + // [0, 1) + { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0}, + { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0}, + { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 
77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1}, + { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0}, + { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0}, + + // [1, 2) + { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 }, + { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 }, + { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 }, + { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 }, + { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 }, + { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 }, + { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 }, + { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 }, + { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 }, + { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 }, + { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 }, + { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 }, + { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 }, + { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 }, + { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 }, + { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 }, + { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 }, + { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 }, + { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 }, + { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 }, + { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 }, + { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 }, + { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 }, + { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 }, + { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 }, + { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 }, + { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 }, + { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 }, + { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 }, + { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 }, + { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 }, + { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 }, + // dummy (replicate row index 191) + { 0, 0, 0, 0, 2, 127, - 1, 0 }, +}; + +/* clang-format on */ + +#define DIV_LUT_PREC_BITS 14 +#define DIV_LUT_BITS 8 +#define DIV_LUT_NUM (1 << DIV_LUT_BITS) + 
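+// The table below holds fixed-point reciprocals: div_lut[f] is
+// (1 << (DIV_LUT_PREC_BITS + DIV_LUT_BITS)) / (DIV_LUT_NUM + f) rounded to
+// nearest, so div_lut[0] = 16384 (1/1 at 14-bit precision) and
+// div_lut[DIV_LUT_NUM] = 8192 (1/2). A hypothetical generator, shown only to
+// document the contents (not part of the library):
+//   for (int f = 0; f <= DIV_LUT_NUM; ++f)
+//     lut[f] = (uint16_t)(((1 << (DIV_LUT_PREC_BITS + DIV_LUT_BITS)) +
+//                          ((DIV_LUT_NUM + f) >> 1)) / (DIV_LUT_NUM + f));
+// resolve_divisor_64() and resolve_divisor_32() below index this table with
+// the DIV_LUT_BITS bits that follow the divisor's leading 1, giving
+// 1/D ~= div_lut[f] / (1 << shift).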
+static const uint16_t div_lut[DIV_LUT_NUM + 1] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192, +}; + +// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned +// at precision of DIV_LUT_PREC_BITS along with the shift. +static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) { + int64_t f; + *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 + : get_msb((unsigned int)D)); + // e is obtained from D after resetting the most significant 1 bit. + const int64_t e = D - ((uint64_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) { + int32_t f; + *shift = get_msb(D); + // e is obtained from D after resetting the most significant 1 bit. 
+ const int32_t e = D - ((uint32_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +static int is_affine_valid(const WarpedMotionParams *const wm) { + const int32_t *mat = wm->wmmat; + return (mat[2] > 0); +} + +static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) || + (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS))) + return 0; + else + return 1; +} + +#ifndef NDEBUG +// Check that the given warp model satisfies the relevant constraints for +// its stated model type +static void check_model_consistency(WarpedMotionParams *wm) { + switch (wm->wmtype) { + case IDENTITY: + assert(wm->wmmat[0] == 0); + assert(wm->wmmat[1] == 0); + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS); + assert(wm->wmmat[3] == 0); + AOM_FALLTHROUGH_INTENDED; + case ROTZOOM: + assert(wm->wmmat[4] == -wm->wmmat[3]); + assert(wm->wmmat[5] == wm->wmmat[2]); + AOM_FALLTHROUGH_INTENDED; + case AFFINE: break; + default: assert(0 && "Bad wmtype"); + } +} +#endif // NDEBUG + +// Returns 1 on success or 0 on an invalid affine set +int av1_get_shear_params(WarpedMotionParams *wm) { +#ifndef NDEBUG + // Check that models have been constructed sensibly + // This is a good place to check, because this function does not need to + // be called until after model construction is complete, but must be called + // before the model can be used for prediction. + check_model_consistency(wm); +#endif // NDEBUG + + const int32_t *mat = wm->wmmat; + if (!is_affine_valid(wm)) return 0; + + wm->alpha = + clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX); + wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX); + int16_t shift; + int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? 
-1 : 1); + int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y; + wm->gamma = + clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX); + v = ((int64_t)mat[3] * mat[4]) * y; + wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) - + (1 << WARPEDMODEL_PREC_BITS), + INT16_MIN, INT16_MAX); + + wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + + if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta)) + return 0; + + return 1; +} + +#if CONFIG_AV1_HIGHBITDEPTH +/* Note: For an explanation of the warp algorithm, and some notes on bit widths + for hardware implementations, see the comments above av1_warp_affine_c +*/ +void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, uint16_t *pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, + int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
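+  // With bd = 12 and FILTER_BITS = 7 this amounts to requiring
+  // conv_params->round_0 >= 5; see also the bit-width notes above
+  // av1_warp_affine_c().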
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4) << subsampling_x; + const int32_t src_y = (i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (int k = -7; k < 8; ++k) { + const int iy = clamp(iy4 + k, 0, height - 1); + + int sx = sx4 + beta * (k + 4); + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + const int sample_x = clamp(ix + m, 0, width - 1); + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); + assert(0 <= sum && sum < (1 << max_bits_horiz)); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + for (int m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + + if (conv_params->is_compound) { + CONV_BUF_TYPE *p = + &conv_params + ->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint16_t *dst16 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_dist_wtd_comp_avg) { + tmp32 = tmp32 * conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1)); + *dst16 = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd); + } else { + *p = sum; + } + } else { + uint16_t *p = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + assert(0 <= sum && sum < (1 << (bd + 2))); + *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd); + } + sy += gamma; + } + } + } + } +} + +void highbd_warp_plane(WarpedMotionParams 
*wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params) { + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + + av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +/* The warp filter for ROTZOOM and AFFINE models works as follows: + * Split the input into 8x8 blocks + * For each block, project the point (4, 4) within the block, to get the + overall block position. Split into integer and fractional coordinates, + maintaining full WARPEDMODEL precision + * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a + variable horizontal offset. This means that, while the rows of the + intermediate buffer align with the rows of the *reference* image, the + columns align with the columns of the *destination* image. + * Filter vertically: Generate the output block (up to 8x8 pixels, but if the + destination is too small we crop the output at this stage). Each pixel has + a variable vertical offset, so that the resulting rows are aligned with + the rows of the destination image. + + To accomplish these alignments, we factor the warp matrix as a + product of two shear / asymmetric zoom matrices: + / a b \ = / 1 0 \ * / 1+alpha beta \ + \ c d / \ gamma 1+delta / \ 0 1 / + where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively. + The horizontal shear (with alpha and beta) is applied first, + then the vertical shear (with gamma and delta) is applied second. + + The only limitation is that, to fit this in a fixed 8-tap filter size, + the fractional pixel offsets must be at most +-1. Since the horizontal filter + generates 15 rows of 8 columns, and the initial point we project is at (4, 4) + within the block, the parameters must satisfy + 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 4 * |delta| <= 1 + for this filter to be applicable. + + Note: This function assumes that the caller has done all of the relevant + checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5] + are set appropriately (if using a ROTZOOM model), and that alpha, beta, + gamma, delta are all in range. + + TODO(rachelbarker): Maybe support scaled references? +*/ +/* A note on hardware implementation: + The warp filter is intended to be implementable using the same hardware as + the high-precision convolve filters from the loop-restoration and + convolve-round experiments. + + For a single filter stage, considering all of the coefficient sets for the + warp filter and the regular convolution filter, an input in the range + [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)] + before rounding. + + Allowing for some changes to the filter coefficient sets, call the range + [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k, + we can replace this by the range [0, 256 * 2^k], which can be stored in an + unsigned value with 8 + k bits. 
+ + This allows the derivation of the appropriate bit widths and offsets for + the various intermediate values: If + + F := FILTER_BITS = 7 (or else the above ranges need adjusting) + So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit + intermediate value. + H := ROUND0_BITS + V := VERSHEAR_REDUCE_PREC_BITS + (and note that we must have H + V = 2*F for the output to have the same + scale as the input) + + then we end up with the following offsets and ranges: + Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a + uint{bd + F + 1} + After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}. + Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a + uint{bd + 2*F + 2 - H} + After rounding: The final value, before undoing the offset, fits into a + uint{bd + 2}. + + Then we need to undo the offsets before clamping to a pixel. Note that, + if we do this at the end, the amount to subtract is actually independent + of H and V: + + offset to subtract = (1 << ((bd + F - 1) - H + F - V)) + + (1 << ((bd + 2*F - H) - V)) + == (1 << (bd - 1)) + (1 << bd) + + This allows us to entirely avoid clamping in both the warp filter and + the convolve-round experiment. As of the time of writing, the Wiener filter + from loop-restoration can encode a central coefficient up to 216, which + leads to a maximum value of about 282 * 2^k after applying the offset. + So in that case we still need to clamp. +*/ +void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? 
conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4) << subsampling_x; + const int32_t src_y = (i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (int k = -7; k < 8; ++k) { + // Clamp to top/bottom edge of the frame + const int iy = clamp(iy4 + k, 0, height - 1); + + int sx = sx4 + beta * (k + 4); + + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + // At this point, sx = sx4 + alpha * l + beta * k + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + // Clamp to left/right edge of the frame + const int sample_x = clamp(ix + m, 0, width - 1); + + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); + assert(0 <= sum && sum < (1 << max_bits_horiz)); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + // At this point, sy = sy4 + gamma * l + delta * k + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + for (int m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + + if (conv_params->is_compound) { + CONV_BUF_TYPE *p = + &conv_params + ->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint8_t *dst8 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_dist_wtd_comp_avg) { + tmp32 = tmp32 
* conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1)); + *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits)); + } else { + *p = sum; + } + } else { + uint8_t *p = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + assert(0 <= sum && sum < (1 << (bd + 2))); + *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); + } + sy += gamma; + } + } + } + } +} + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params) { + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params, + alpha, beta, gamma, delta); +} + +void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int width, int height, int stride, + uint8_t *pred, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) + highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride, + CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, bd, + conv_params); + else + warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params); +#else + (void)use_hbd; + (void)bd; + warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params); +#endif +} + +#define LS_MV_MAX 256 // max mv in 1/8-pel +// Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By. +#define LS_STEP 8 + +// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8, +// the precision needed is: +// (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] + +// (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] + +// 1 [for sign] + +// LEAST_SQUARES_SAMPLES_MAX_BITS +// [for adding up to LEAST_SQUARES_SAMPLES_MAX samples] +// The value is 23 +#define LS_MAT_RANGE_BITS \ + ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS) + +// Bit-depth reduction from the full-range +#define LS_MAT_DOWN_BITS 2 + +// bits range of A, Bx and By after downshifting +#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS) +#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1))) +#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1) + +// By setting LS_STEP = 8, the least 2 bits of every elements in A, Bx, By are +// 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here. 
+#define LS_SQUARE(a) \ + (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \ + (2 + LS_MAT_DOWN_BITS)) +#define LS_PRODUCT1(a, b) \ + (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \ + (2 + LS_MAT_DOWN_BITS)) +#define LS_PRODUCT2(a, b) \ + (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \ + (2 + LS_MAT_DOWN_BITS)) + +#define USE_LIMITED_PREC_MULT 0 + +#if USE_LIMITED_PREC_MULT + +#define MUL_PREC_BITS 16 +static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) { + int msb = 0; + uint16_t mult = 0; + *shift = 0; + if (D != 0) { + msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 + : get_msb((unsigned int)D)); + if (msb >= MUL_PREC_BITS) { + mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS); + *shift = msb + 1 - MUL_PREC_BITS; + } else { + mult = (uint16_t)D; + *shift = 0; + } + } + return mult; +} + +static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { + int32_t ret; + int16_t mshift; + uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); + int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1); + shift -= mshift; + if (shift > 0) { + return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } else { + return (int32_t)clamp(v * (1 << (-shift)), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } + return ret; +} + +static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { + int16_t mshift; + uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift); + int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1); + shift -= mshift; + if (shift > 0) { + return (int32_t)clamp( + ROUND_POWER_OF_TWO_SIGNED(v, shift), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } else { + return (int32_t)clamp( + v * (1 << (-shift)), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); + } +} + +#else + +static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) { + int64_t v = Px * (int64_t)iDet; + return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift), + -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); +} + +static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) { + int64_t v = Px * (int64_t)iDet; + return (int32_t)clamp64( + ROUND_POWER_OF_TWO_SIGNED_64(v, shift), + (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, + (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1); +} +#endif // USE_LIMITED_PREC_MULT + +static int find_affine_int(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm, int mi_row, int mi_col) { + int32_t A[2][2] = { { 0, 0 }, { 0, 0 } }; + int32_t Bx[2] = { 0, 0 }; + int32_t By[2] = { 0, 0 }; + + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int rsuy = bh / 2 - 1; + const int rsux = bw / 2 - 1; + const int suy = rsuy * 8; + const int sux = rsux * 8; + const int duy = suy + mvy; + const int dux = sux + mvx; + + // Assume the center pixel of the block has exactly the same motion vector + // as transmitted for the block. 
First shift the origin of the source + // points to the block center, and the origin of the destination points to + // the block center plus the transmitted motion vector. + // Let (xi, yi) denote the source points and (xi', yi') denote the destination + // points after origin shifting, for i = 0, 1, 2, ..., n-1. + // Then if P = [x0, y0, + // x1, y1 + // x2, y2, + // .... + // ] + // q = [x0', x1', x2', ... ]' + // r = [y0', y1', y2', ... ]' + // the least squares problems that need to be solved are: + // [h1, h2]' = inv(P'P)P'q and + // [h3, h4]' = inv(P'P)P'r + // where the affine transformation is given by: + // x' = h1.x + h2.y + // y' = h3.x + h4.y + // + // The loop below computes: A = P'P, Bx = P'q, By = P'r + // We just need to compute inv(A).Bx and inv(A).By for the solutions. + // Contribution from neighbor block + for (int i = 0; i < np; i++) { + const int dx = pts2[i * 2] - dux; + const int dy = pts2[i * 2 + 1] - duy; + const int sx = pts1[i * 2] - sux; + const int sy = pts1[i * 2 + 1] - suy; + // TODO(yunqing): This comparison wouldn't be necessary if the sample + // selection is done in find_samples(). Also, global offset can be removed + // while collecting samples. + if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) { + A[0][0] += LS_SQUARE(sx); + A[0][1] += LS_PRODUCT1(sx, sy); + A[1][1] += LS_SQUARE(sy); + Bx[0] += LS_PRODUCT2(sx, dx); + Bx[1] += LS_PRODUCT1(sy, dx); + By[0] += LS_PRODUCT1(sx, dy); + By[1] += LS_PRODUCT2(sy, dy); + } + } + + // Just for debugging, and can be removed later. + assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX); + assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX); + assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX); + assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX); + assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX); + assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX); + assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX); + + // Compute Determinant of A + const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1]; + if (Det == 0) return 1; + + int16_t shift; + int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1); + shift -= WARPEDMODEL_PREC_BITS; + if (shift < 0) { + iDet <<= (-shift); + shift = 0; + } + + int64_t Px[2], Py[2]; + // These, divided by the Det, are the least squares solutions + Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1]; + Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1]; + Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1]; + Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1]; + + wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift); + wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift); + wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift); + wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift); + + const int isuy = (mi_row * MI_SIZE + rsuy); + const int isux = (mi_col * MI_SIZE + rsux); + // Note: In the vx, vy expressions below, the max value of each of the + // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room + // for the first term so that the overall sum in the worst case fits + // within 32 bits overall.
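+ // The translation terms vx, vy are chosen so that the resulting model maps + // the block center (isux, isuy) to (isux + mvx / 8, isuy + mvy / 8), i.e. the + // center moves by the transmitted motion vector (mvx, mvy are in 1/8-pel + // units).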
+ const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) + + isuy * wm->wmmat[3]); + const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) - + (isux * wm->wmmat[4] + + isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS))); + wm->wmmat[0] = + clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); + wm->wmmat[1] = + clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1); + return 0; +} + +int av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col) { + assert(wm_params->wmtype == AFFINE); + + if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row, + mi_col)) + return 1; + + // check compatibility with the fast warp filter + if (!av1_get_shear_params(wm_params)) return 1; + + return 0; +} diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h new file mode 100644 index 0000000000..d772df8873 --- /dev/null +++ b/third_party/aom/av1/common/warped_motion.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_ +#define AOM_AV1_COMMON_WARPED_MOTION_H_ + +#include +#include +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "av1/common/mv.h" +#include "av1/common/convolve.h" + +#define LEAST_SQUARES_SAMPLES_MAX_BITS 3 +#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS) +#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2) +#define WARPED_MOTION_DEBUG 0 +#define DEFAULT_WMTYPE AFFINE +#define WARP_ERROR_BLOCK_LOG 5 +#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG) + +extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]; + +DECLARE_ALIGNED(8, extern const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]); + +static const uint8_t warp_pad_left[14][16] = { + { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 }, + { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 }, + { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 }, + { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 }, + { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 }, + { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 }, +}; + +static const uint8_t warp_pad_right[14][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 13, 13 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 }, + { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 }, + { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, + { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }, + { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, + { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } +}; + +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params); + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + +void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int width, int height, int stride, + uint8_t *pred, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + +int av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col); + +int av1_get_shear_params(WarpedMotionParams *wm); +#endif // AOM_AV1_COMMON_WARPED_MOTION_H_ diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c new file mode 100644 index 0000000000..8aa14696f6 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "av1/common/resize.h" +#include "aom_dsp/x86/synonyms.h" + +// Note: If the crop width is not a multiple of 4, then, unlike the C version, +// this function will overwrite some of the padding on the right hand side of +// the frame. This padding appears to be trashed anyway, so this should not +// affect the running of the decoder. 
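+// Each pass of the outer loop below produces four horizontally adjacent +// output pixels: the four filter phases are selected once per column group +// and then reused for every row handled by the inner loop.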
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const int16_t *x_filters, int x0_qn, + int x_step_qn) { + assert(UPSCALE_NORMATIVE_TAPS == 8); + + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + + const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + const uint8_t *src_y; + uint8_t *dst_y; + int x_qn = x0_qn; + for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { + const int x_filter_idx0 = + ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx1 = + ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx2 = + ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx3 = + ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + + assert(x_filter_idx0 <= RS_SUBPEL_MASK); + assert(x_filter_idx1 <= RS_SUBPEL_MASK); + assert(x_filter_idx2 <= RS_SUBPEL_MASK); + assert(x_filter_idx3 <= RS_SUBPEL_MASK); + + const int16_t *const x_filter0 = + &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter1 = + &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter2 = + &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter3 = + &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; + + const __m128i fil0_16 = xx_loadu_128(x_filter0); + const __m128i fil1_16 = xx_loadu_128(x_filter1); + const __m128i fil2_16 = xx_loadu_128(x_filter2); + const __m128i fil3_16 = xx_loadu_128(x_filter3); + + src_y = src; + dst_y = dst; + for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { + const uint8_t *const src_x0 = + &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint8_t *const src_x1 = + &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint8_t *const src_x2 = + &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint8_t *const src_x3 = + &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + + // Load up the source data. This is 8-bit input data, so each load + // gets 8 pixels. + const __m128i src0_8 = xx_loadl_64(src_x0); + const __m128i src1_8 = xx_loadl_64(src_x1); + const __m128i src2_8 = xx_loadl_64(src_x2); + const __m128i src3_8 = xx_loadl_64(src_x3); + + // Now zero-extend up to 16-bit precision, i.e. + // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ] + const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8); + const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8); + const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8); + const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8); + + // Multiply by filter coefficients (results in a 32-bit value), + // and add adjacent pairs, i.e. + // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) + // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] + const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); + const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); + const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); + const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); + + // Reduce horizontally and add, i.e. + // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] + const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); + const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); + + const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); + + // Divide down by (1 << FILTER_BITS), rounding to nearest. 
+ const __m128i shifted_32 = + _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); + + // Pack 32-bit values into 16-bit values, i.e. + // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] + const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); + + // Pack 16-bit values into 8-bit values, i.e. + // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ]) + // -> [ 0 0 0 0 0 0 DC BA ] + const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero); + + // Write to the output + xx_storel_32(&dst_y[x], shifted_8); + } + } +} + +// Note: If the crop width is not a multiple of 4, then, unlike the C version, +// this function will overwrite some of the padding on the right hand side of +// the frame. This padding appears to be trashed anyway, so this should not +// affect the running of the decoder. +void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filters, + int x0_qn, int x_step_qn, int bd) { + assert(UPSCALE_NORMATIVE_TAPS == 8); + assert(bd == 8 || bd == 10 || bd == 12); + + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + + const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1); + + const uint16_t *src_y; + uint16_t *dst_y; + int x_qn = x0_qn; + for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { + const int x_filter_idx0 = + ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx1 = + ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx2 = + ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx3 = + ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + + assert(x_filter_idx0 <= RS_SUBPEL_MASK); + assert(x_filter_idx1 <= RS_SUBPEL_MASK); + assert(x_filter_idx2 <= RS_SUBPEL_MASK); + assert(x_filter_idx3 <= RS_SUBPEL_MASK); + + const int16_t *const x_filter0 = + &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter1 = + &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter2 = + &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter3 = + &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; + + const __m128i fil0_16 = xx_loadu_128(x_filter0); + const __m128i fil1_16 = xx_loadu_128(x_filter1); + const __m128i fil2_16 = xx_loadu_128(x_filter2); + const __m128i fil3_16 = xx_loadu_128(x_filter3); + + src_y = src; + dst_y = dst; + for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { + const uint16_t *const src_x0 = + &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x1 = + &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x2 = + &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x3 = + &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + + // Load up the source data. This is 16-bit input data, so each load + // gets 8 pixels. + const __m128i src0_16 = xx_loadu_128(src_x0); + const __m128i src1_16 = xx_loadu_128(src_x1); + const __m128i src2_16 = xx_loadu_128(src_x2); + const __m128i src3_16 = xx_loadu_128(src_x3); + + // Multiply by filter coefficients (results in a 32-bit value), + // and add adjacent pairs, i.e. 
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) + // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] + const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); + const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); + const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); + const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); + + // Reduce horizontally and add, i.e. + // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] + const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); + const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); + + const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); + + // Divide down by (1 << FILTER_BITS), rounding to nearest. + const __m128i shifted_32 = + _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); + + // Pack 32-bit values into 16-bit values, i.e. + // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] + const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); + + // Clip the values at (1 << bd) - 1 + const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum); + + // Write to the output + xx_storel_64(&dst_y[x], clipped_16); + } + } +} diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c new file mode 100644 index 0000000000..8e293b5bb1 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +// A specialised version of hfilter, the horizontal filter for +// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. +static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w, + int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, int round) { + const int bd = 8; + const int ntaps = 8; + + src -= ntaps / 2 - 1; + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = _mm_cvtsi32_si128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + // Load the filter coefficients + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint8_t *const src0 = src_col + y * src_stride; + const uint8_t *const src1 = src0 + 1 * src_stride; + const uint8_t *const src2 = src0 + 2 * src_stride; + const uint8_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. 
This is 8-bit input data; each load is just + // loading the lower half of the register and gets 8 pixels + const __m128i data08 = _mm_loadl_epi64((__m128i *)src0); + const __m128i data18 = _mm_loadl_epi64((__m128i *)src1); + const __m128i data28 = _mm_loadl_epi64((__m128i *)src2); + const __m128i data38 = _mm_loadl_epi64((__m128i *)src3); + + // Now zero-extend up to 16-bit precision by interleaving with + // zeros. Drop the upper half of each register (which just had zeros) + const __m128i data0lo = _mm_unpacklo_epi8(data08, zero); + const __m128i data1lo = _mm_unpacklo_epi8(data18, zero); + const __m128i data2lo = _mm_unpacklo_epi8(data28, zero); + const __m128i data3lo = _mm_unpacklo_epi8(data38, zero); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + + // Divide down by (1 << round), rounding to nearest. + __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + shifted = _mm_packus_epi32(shifted, shifted); + // Write transposed to the output + _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint8_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} + +static __m128i convolve_16_8(const int16_t *src, __m128i coeff) { + __m128i data = _mm_loadu_si128((__m128i *)src); + return _mm_madd_epi16(data, coeff); +} + +// A specialised version of vfilter, the vertical filter for +// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. 
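+// The input here is the intermediate buffer that hfilter8 wrote out +// transposed (src_stride is the intermediate height im_h), so the eight +// vertical taps for each output pixel are contiguous in memory and can be +// fetched with a single unaligned 128-bit load in convolve_16_8.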
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, int subpel_y_qn, + int y_step_qn, const InterpFilterParams *filter_params, + const ConvolveParams *conv_params, int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int ntaps = 8; + + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + const __m128i sub = _mm_set1_epi16(sub32); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i bits_shift = _mm_cvtsi32_si128(bits); + const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1)); + const __m128i round_shift_add = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16((short)w0); + const __m128i wt1 = _mm_set1_epi16((short)w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); + int x; + for (x = 0; x <= w - 4; x += 4) { + const int16_t *const src0 = src_y + x * src_stride; + const int16_t *const src1 = src0 + 1 * src_stride; + const int16_t *const src2 = src0 + 2 * src_stride; + const int16_t *const src3 = src0 + 3 * src_stride; + + // Load the source data for the three rows, adding the three registers of + // convolved products to one as we go (conv0..conv3) to avoid the + // register pressure getting too high. + const __m128i conv0 = convolve_16_8(src0, coeff0716); + const __m128i conv1 = convolve_16_8(src1, coeff0716); + const __m128i conv2 = convolve_16_8(src2, coeff0716); + const __m128i conv3 = convolve_16_8(src3, coeff0716); + + // Now reduce horizontally to get one lane for each result + const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); + const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); + __m128i conv = _mm_hadd_epi32(conv01, conv23); + + conv = _mm_add_epi32(conv, res_add_const); + // Divide down by (1 << round_1), rounding to nearest and subtract sub32. 
+ __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); + + uint8_t *dst_x = dst + y * dst_stride + x; + __m128i result; + __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); + + if (conv_params->is_compound) { + CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; + if (conv_params->do_average) { + const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x); + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + shifted_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1); + } + const __m128i subbed = _mm_sub_epi16(shifted_16, sub); + result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); + const __m128i result_8 = _mm_packus_epi16(result, result); + *(int *)dst_x = _mm_cvtsi128_si32(result_8); + } else { + _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); + } + } else { + const __m128i subbed = _mm_sub_epi16(shifted_16, sub); + result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); + const __m128i result_8 = _mm_packus_epi16(result, result); + *(int *)dst_x = _mm_cvtsi128_si32(result_8); + } + } + for (; x < w; ++x) { + const int16_t *src_x = src_y + x * src_stride; + int32_t sum = 1 << offset_bits; + for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - sub32; + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } + } + } +} +void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + + const int xtaps = filter_params_x->taps; + const int ytaps = filter_params_y->taps; + const int fo_vert = ytaps / 2 - 1; + assert((xtaps == 8) && (ytaps == 8)); + (void)xtaps; + + // horizontal filter + hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, + x_step_qn, filter_params_x, conv_params->round_0); + + // vertical filter (input is transposed) + vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, 8); +} + +// A specialised version of hfilter, the horizontal filter for +// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap +// filters. 
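+// This mirrors hfilter8 above, except that the source samples are already +// 16 bits wide, so each row of eight pixels is fetched with a full 128-bit +// load and fed straight to the multiply-accumulate, with no zero-extension +// step needed.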
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst, + int w, int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, int round, + int bd) { + const int ntaps = 8; + + src -= ntaps / 2 - 1; + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = _mm_cvtsi32_si128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + // Load the filter coefficients + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint16_t *const src0 = src_col + y * src_stride; + const uint16_t *const src1 = src0 + 1 * src_stride; + const uint16_t *const src2 = src0 + 2 * src_stride; + const uint16_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 16-bit input data, so each load gets the 8 + // pixels we need. + const __m128i data0lo = _mm_loadu_si128((__m128i *)src0); + const __m128i data1lo = _mm_loadu_si128((__m128i *)src1); + const __m128i data2lo = _mm_loadu_si128((__m128i *)src2); + const __m128i data3lo = _mm_loadu_si128((__m128i *)src3); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + + // Divide down by (1 << round), rounding to nearest. + __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + shifted = _mm_packus_epi32(shifted, shifted); + // Write transposed to the output + _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint16_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} +// A specialised version of vfilter, the vertical filter for +// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap +// filters. +static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, + int dst_stride, int w, int h, int subpel_y_qn, + int y_step_qn, + const InterpFilterParams *filter_params, + const ConvolveParams *conv_params, int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int ntaps = 8; + + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + const __m128i sub = _mm_set1_epi32(sub32); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const __m128i clip_pixel_ = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i bits_shift = _mm_cvtsi32_si128(bits); + const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1)); + const __m128i round_shift_add = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); + int x; + for (x = 0; x <= w - 4; x += 4) { + const int16_t *const src0 = src_y + x * src_stride; + const int16_t *const src1 = src0 + 1 * src_stride; + const int16_t *const src2 = src0 + 2 * src_stride; + const int16_t *const src3 = src0 + 3 * src_stride; + + // Load the source data for the three rows, adding the three registers of + // convolved products to one as we go (conv0..conv3) to avoid the + // register pressure getting too high. + const __m128i conv0 = convolve_16_8(src0, coeff0716); + const __m128i conv1 = convolve_16_8(src1, coeff0716); + const __m128i conv2 = convolve_16_8(src2, coeff0716); + const __m128i conv3 = convolve_16_8(src3, coeff0716); + + // Now reduce horizontally to get one lane for each result + const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); + const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); + __m128i conv = _mm_hadd_epi32(conv01, conv23); + conv = _mm_add_epi32(conv, res_add_const); + + // Divide down by (1 << round_1), rounding to nearest and subtract sub32. 
+ __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); + + uint16_t *dst_x = dst + y * dst_stride + x; + + __m128i result; + if (conv_params->is_compound) { + CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; + if (conv_params->do_average) { + __m128i p_32 = + _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x)); + + if (conv_params->use_dist_wtd_comp_avg) { + shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), + _mm_mullo_epi32(shifted, wt1)); + shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS); + } else { + shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1); + } + result = _mm_sub_epi32(shifted, sub); + result = _mm_sra_epi32(_mm_add_epi32(result, round_bits_const), + round_bits_shift); + + result = _mm_packus_epi32(result, result); + result = _mm_min_epi16(result, clip_pixel_); + _mm_storel_epi64((__m128i *)dst_x, result); + } else { + __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); + _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); + } + } else { + result = _mm_sub_epi32(shifted, sub); + result = _mm_sra_epi16(_mm_add_epi32(result, bits_const), bits_shift); + result = _mm_packus_epi32(result, result); + result = _mm_min_epi16(result, clip_pixel_); + _mm_storel_epi64((__m128i *)dst_x, result); + } + } + + for (; x < w; ++x) { + const int16_t *src_x = src_y + x * src_stride; + int32_t sum = 1 << offset_bits; + for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } + } + } +} + +void av1_highbd_convolve_2d_scale_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd) { + // TODO(yaowu): Move this out of stack + DECLARE_ALIGNED(16, int16_t, + tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + const int xtaps = filter_params_x->taps; + const int ytaps = filter_params_y->taps; + const int fo_vert = ytaps / 2 - 1; + + memset(tmp, 0, sizeof(tmp)); + assert((xtaps == 8) && (ytaps == 8)); + (void)xtaps; + + // horizontal filter + highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, + subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0, + bd); + + // vertical filter (input is transposed) + highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, bd); +} diff --git 
a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c new file mode 100644 index 0000000000..0afd42b170 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c @@ -0,0 +1,2254 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/av1_inv_txfm_avx2.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" + +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); +} + +static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); +} + +static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { + btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); + btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); + btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); + btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]); + btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]); + btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]); + btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]); + btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); +} + +static void idct16_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], 
-cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = input[8]; + x1[2] = input[4]; + x1[3] = input[12]; + x1[4] = input[2]; + x1[5] = input[10]; + x1[6] = input[6]; + x1[7] = input[14]; + x1[8] = input[1]; + x1[9] = input[9]; + x1[10] = input[5]; + x1[11] = input[13]; + x1[12] = input[3]; + x1[13] = input[11]; + x1[14] = input[7]; + x1[15] = input[15]; + + // stage 2 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, + INV_COS_BIT); + + // stage 3 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, + INV_COS_BIT); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, + INV_COS_BIT); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, + INV_COS_BIT); + + idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT); + idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low8_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[2] = input[4]; + x1[4] = input[2]; + x1[6] = input[6]; + x1[8] = input[1]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]); + + // stage 3 + 
btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, + INV_COS_BIT); + + idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT); + idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low1_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x1[2]; + x1[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + + // stage 5 + // stage 6 + output[0] = x1[0]; + output[1] = x1[1]; + output[2] = x1[1]; + output[3] = x1[0]; + output[4] = x1[0]; + output[5] = x1[1]; + output[6] = x1[1]; + output[7] = x1[0]; + output[8] = x1[0]; + output[9] = x1[1]; + output[10] = x1[1]; + output[11] = x1[0]; + output[12] = x1[0]; + output[13] = x1[1]; + output[14] = x1[1]; + output[15] = x1[0]; +} + +static INLINE void iadst16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[0], &x[8]); + btf_16_adds_subs_avx2(&x[1], &x[9]); + btf_16_adds_subs_avx2(&x[2], &x[10]); + btf_16_adds_subs_avx2(&x[3], &x[11]); + btf_16_adds_subs_avx2(&x[4], &x[12]); + btf_16_adds_subs_avx2(&x[5], &x[13]); + btf_16_adds_subs_avx2(&x[6], &x[14]); + btf_16_adds_subs_avx2(&x[7], &x[15]); +} + +static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); +} + +static INLINE void iadst16_stage5_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[0], &x[4]); + btf_16_adds_subs_avx2(&x[1], &x[5]); + btf_16_adds_subs_avx2(&x[2], &x[6]); + btf_16_adds_subs_avx2(&x[3], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[12]); + btf_16_adds_subs_avx2(&x[9], &x[13]); + btf_16_adds_subs_avx2(&x[10], &x[14]); + btf_16_adds_subs_avx2(&x[11], &x[15]); +} + +static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit); + 
btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); +} + +static INLINE void iadst16_stage7_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[0], &x[2]); + btf_16_adds_subs_avx2(&x[1], &x[3]); + btf_16_adds_subs_avx2(&x[4], &x[6]); + btf_16_adds_subs_avx2(&x[5], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[10]); + btf_16_adds_subs_avx2(&x[9], &x[11]); + btf_16_adds_subs_avx2(&x[12], &x[14]); + btf_16_adds_subs_avx2(&x[13], &x[15]); +} + +static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); +} + +static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { + const __m256i __zero = _mm256_setzero_si256(); + output[0] = x1[0]; + output[1] = _mm256_subs_epi16(__zero, x1[8]); + output[2] = x1[12]; + output[3] = _mm256_subs_epi16(__zero, x1[4]); + output[4] = x1[6]; + output[5] = _mm256_subs_epi16(__zero, x1[14]); + output[6] = x1[10]; + output[7] = _mm256_subs_epi16(__zero, x1[2]); + output[8] = x1[3]; + output[9] = _mm256_subs_epi16(__zero, x1[11]); + output[10] = x1[15]; + output[11] = _mm256_subs_epi16(__zero, x1[7]); + output[12] = x1[5]; + output[13] = _mm256_subs_epi16(__zero, x1[13]); + output[14] = x1[9]; + output[15] = _mm256_subs_epi16(__zero, x1[1]); +} + +static void iadst16_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[15]; + x1[1] = input[0]; + x1[2] = input[13]; + x1[3] = input[2]; + x1[4] = input[11]; + x1[5] = input[4]; + x1[6] = input[9]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[9] = input[8]; + x1[10] = input[5]; + x1[11] = input[10]; + x1[12] = input[3]; + x1[13] = input[12]; + x1[14] = input[1]; + x1[15] = input[14]; + + // stage 2 + 
btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, + INV_COS_BIT); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low8_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + x1[3] = input[2]; + x1[5] = input[4]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[1]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]); + btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]); + btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]); + btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]); + btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]); + btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]); + btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low1_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + + // stage 3 + x1[8] = x1[0]; + x1[9] = x1[1]; + + // stage 4 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, + INV_COS_BIT); + + // stage 5 + x1[4] = x1[0]; + x1[5] = x1[1]; + + x1[12] = x1[8]; + x1[13] = x1[9]; + + // stage 6 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, + INV_COS_BIT); + + // stage 7 + x1[2] = x1[0]; + x1[3] = x1[1]; + x1[6] = x1[4]; + x1[7] = x1[5]; + x1[10] = x1[8]; + x1[11] = x1[9]; + x1[14] = x1[12]; + x1[15] = x1[13]; + + iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT); + iadst16_stage9_avx2(output, x1); +} + +static INLINE void idct32_high16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[16], &x[17]); + 
btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); +} + +static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); +} + +static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); +} + +static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); +} + +static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + 
btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); +} + +static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); +} + +static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]); + btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); + btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); + btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); + btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); + btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); + btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); + btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]); + btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); + btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); + btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); + btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); + btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); + btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); + btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); + btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); +} + +static void idct32_low1_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + output[0] = x[0]; + output[31] = x[0]; + output[1] = x[1]; + output[30] = x[1]; + output[2] = x[1]; + output[29] = x[1]; + output[3] = x[0]; + output[28] = x[0]; + output[4] = x[0]; + output[27] = x[0]; + output[5] = x[1]; + output[26] = x[1]; + output[6] = x[1]; + output[25] = x[1]; + output[7] = x[0]; + output[24] = x[0]; + output[8] = x[0]; + output[23] = x[0]; + output[9] = x[1]; + output[22] = x[1]; + output[10] = x[1]; + output[21] = x[1]; + output[11] = x[0]; + output[20] = x[0]; + output[12] = x[0]; + output[19] = x[0]; + output[13] = x[1]; + output[18] = x[1]; + output[14] = x[1]; + output[17] = x[1]; + output[15] = x[0]; + output[16] = x[0]; +} + +static void idct32_low8_avx2(const __m256i *input, __m256i *output) { + const 
int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[4] = input[4]; + x[8] = input[2]; + x[12] = input[6]; + x[16] = input[1]; + x[20] = input[5]; + x[24] = input[3]; + x[28] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); + // stage 6 + x[3] = x[0]; + x[2] = x[1]; + idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT); + + idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); + idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); + idct32_stage9_avx2(output, x); +} + +static void idct32_low16_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[2] = input[8]; + x[4] = input[4]; + x[6] = input[12]; + x[8] = input[2]; + x[10] = input[10]; + x[12] = input[6]; + x[14] = input[14]; + x[16] = input[1]; + x[18] = input[9]; + x[20] = input[5]; + x[22] = input[13]; + x[24] = input[3]; + x[26] = input[11]; + x[28] = input[7]; + x[30] = input[15]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + idct32_high16_stage3_avx2(x); + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT); + + btf_16_adds_subs_avx2(&x[0], &x[3]); + btf_16_adds_subs_avx2(&x[1], &x[2]); + idct32_high28_stage6_avx2(x, cospi, _r, 
INV_COS_BIT); + + idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT); + idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT); + idct32_stage9_avx2(output, x); +} + +static void idct32_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + + // stage 1 + __m256i x1[32]; + x1[0] = input[0]; + x1[1] = input[16]; + x1[2] = input[8]; + x1[3] = input[24]; + x1[4] = input[4]; + x1[5] = input[20]; + x1[6] = input[12]; + x1[7] = input[28]; + x1[8] = input[2]; + x1[9] = input[18]; + x1[10] = input[10]; + x1[11] = input[26]; + x1[12] = input[6]; + x1[13] = input[22]; + x1[14] = input[14]; + x1[15] = input[30]; + x1[16] = input[1]; + x1[17] = input[17]; + x1[18] = input[9]; + x1[19] = input[25]; + x1[20] = input[5]; + x1[21] = input[21]; + x1[22] = input[13]; + x1[23] = input[29]; + x1[24] = input[3]; + x1[25] = input[19]; + x1[26] = input[11]; + x1[27] = input[27]; + x1[28] = input[7]; + x1[29] = input[23]; + x1[30] = input[15]; + x1[31] = input[31]; + + // stage 2 + btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, 
&x1[19], &x1[28], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, + INV_COS_BIT); + + // stage 3 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, + INV_COS_BIT); + idct32_high16_stage3_avx2(x1); + + // stage 4 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, + INV_COS_BIT); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, + INV_COS_BIT); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT); + + // stage 6 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT); + + idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT); + idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT); + idct32_stage9_avx2(output, x1); +} + +static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); +} + +static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, + const 
__m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[35]); + btf_16_adds_subs_avx2(&x[33], &x[34]); + btf_16_adds_subs_avx2(&x[39], &x[36]); + btf_16_adds_subs_avx2(&x[38], &x[37]); + btf_16_adds_subs_avx2(&x[40], &x[43]); + btf_16_adds_subs_avx2(&x[41], &x[42]); + btf_16_adds_subs_avx2(&x[47], &x[44]); + btf_16_adds_subs_avx2(&x[46], &x[45]); + btf_16_adds_subs_avx2(&x[48], &x[51]); + btf_16_adds_subs_avx2(&x[49], &x[50]); + btf_16_adds_subs_avx2(&x[55], &x[52]); + btf_16_adds_subs_avx2(&x[54], &x[53]); + btf_16_adds_subs_avx2(&x[56], &x[59]); + btf_16_adds_subs_avx2(&x[57], &x[58]); + btf_16_adds_subs_avx2(&x[63], &x[60]); + btf_16_adds_subs_avx2(&x[62], &x[61]); +} + +static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); +} + +static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); +} + +static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + 
btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[39]); + btf_16_adds_subs_avx2(&x[33], &x[38]); + btf_16_adds_subs_avx2(&x[34], &x[37]); + btf_16_adds_subs_avx2(&x[35], &x[36]); + btf_16_adds_subs_avx2(&x[47], &x[40]); + btf_16_adds_subs_avx2(&x[46], &x[41]); + btf_16_adds_subs_avx2(&x[45], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[43]); + btf_16_adds_subs_avx2(&x[48], &x[55]); + btf_16_adds_subs_avx2(&x[49], &x[54]); + btf_16_adds_subs_avx2(&x[50], &x[53]); + btf_16_adds_subs_avx2(&x[51], &x[52]); + btf_16_adds_subs_avx2(&x[63], &x[56]); + btf_16_adds_subs_avx2(&x[62], &x[57]); + btf_16_adds_subs_avx2(&x[61], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[59]); +} + +static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); +} + +static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[47]); + btf_16_adds_subs_avx2(&x[33], &x[46]); + btf_16_adds_subs_avx2(&x[34], &x[45]); + btf_16_adds_subs_avx2(&x[35], &x[44]); + btf_16_adds_subs_avx2(&x[36], &x[43]); + btf_16_adds_subs_avx2(&x[37], &x[42]); + btf_16_adds_subs_avx2(&x[38], &x[41]); + btf_16_adds_subs_avx2(&x[39], &x[40]); + btf_16_adds_subs_avx2(&x[63], 
&x[48]); + btf_16_adds_subs_avx2(&x[62], &x[49]); + btf_16_adds_subs_avx2(&x[61], &x[50]); + btf_16_adds_subs_avx2(&x[60], &x[51]); + btf_16_adds_subs_avx2(&x[59], &x[52]); + btf_16_adds_subs_avx2(&x[58], &x[53]); + btf_16_adds_subs_avx2(&x[57], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[55]); +} + +static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[31]); + btf_16_adds_subs_avx2(&x[1], &x[30]); + btf_16_adds_subs_avx2(&x[2], &x[29]); + btf_16_adds_subs_avx2(&x[3], &x[28]); + btf_16_adds_subs_avx2(&x[4], &x[27]); + btf_16_adds_subs_avx2(&x[5], &x[26]); + btf_16_adds_subs_avx2(&x[6], &x[25]); + btf_16_adds_subs_avx2(&x[7], &x[24]); + btf_16_adds_subs_avx2(&x[8], &x[23]); + btf_16_adds_subs_avx2(&x[9], &x[22]); + btf_16_adds_subs_avx2(&x[10], &x[21]); + btf_16_adds_subs_avx2(&x[11], &x[20]); + btf_16_adds_subs_avx2(&x[12], &x[19]); + btf_16_adds_subs_avx2(&x[13], &x[18]); + btf_16_adds_subs_avx2(&x[14], &x[17]); + btf_16_adds_subs_avx2(&x[15], &x[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); +} + +static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); + btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); + btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); + btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]); + btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]); + btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]); + btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]); + btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]); + btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]); + btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]); + btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]); + btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]); + btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]); + btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]); + btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]); + btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]); + btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]); + btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]); + btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]); + btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]); + btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]); + btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]); + btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]); + btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], 
x[40]); + btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]); + btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]); + btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]); + btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]); + btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]); + btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]); + btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]); + btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); +} + +static void idct64_low1_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + output[0] = x[0]; + output[63] = x[0]; + output[1] = x[1]; + output[62] = x[1]; + output[2] = x[1]; + output[61] = x[1]; + output[3] = x[0]; + output[60] = x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; + output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = 
pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] = x[23]; + x[25] = x[24]; + x[30] = x[31]; + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, + INV_COS_BIT); + + // stage 5 + x[9] = x[8]; + x[14] = x[15]; + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, + INV_COS_BIT); + x[35] = x[32]; + x[34] = x[33]; + x[36] = x[39]; + x[37] = x[38]; + x[43] = x[40]; + x[42] = x[41]; + x[44] = x[47]; + x[45] = x[46]; + x[51] = x[48]; + x[50] = x[49]; + x[52] = x[55]; + x[53] = x[54]; + x[59] = x[56]; + x[58] = x[57]; + x[60] = x[63]; + x[61] = x[62]; + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT); + x[19] = x[16]; + x[18] = x[17]; + x[20] = x[23]; + x[21] = x[22]; + x[27] = x[24]; + x[26] = x[25]; + x[28] = x[31]; + x[29] = x[30]; + idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + x[11] = x[8]; + x[10] = x[9]; + x[12] = x[15]; + x[13] = x[14]; + idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 8 + x[7] = x[0]; + x[6] = x[1]; + x[5] = x[2]; + x[4] = x[3]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, + INV_COS_BIT); + idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT); + + idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); + idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT); + idct64_stage11_avx2(output, x); +} + +static void idct64_low16_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[4] = input[8]; + x[8] = input[4]; + x[12] = input[12]; + x[16] = input[2]; + x[20] = input[10]; + x[24] = input[6]; + x[28] = input[14]; + x[32] = input[1]; + 
x[36] = input[9]; + x[40] = input[5]; + x[44] = input[13]; + x[48] = input[3]; + x[52] = input[11]; + x[56] = input[7]; + x[60] = input[15]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[34] = x[35]; + x[37] = x[36]; + x[38] = x[39]; + x[41] = x[40]; + x[42] = x[43]; + x[45] = x[44]; + x[46] = x[47]; + x[49] = x[48]; + x[50] = x[51]; + x[53] = x[52]; + x[54] = x[55]; + x[57] = x[56]; + x[58] = x[59]; + x[61] = x[60]; + x[62] = x[63]; + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 5 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, + INV_COS_BIT); + idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 8 + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, + INV_COS_BIT); + idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT); + + idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); + idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT); + idct64_stage11_avx2(output, x); +} + +static void idct64_low32_avx2(const __m256i *input, __m256i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = 
input[0]; + x[2] = input[16]; + x[4] = input[8]; + x[6] = input[24]; + x[8] = input[4]; + x[10] = input[20]; + x[12] = input[12]; + x[14] = input[28]; + x[16] = input[2]; + x[18] = input[18]; + x[20] = input[10]; + x[22] = input[26]; + x[24] = input[6]; + x[26] = input[22]; + x[28] = input[14]; + x[30] = input[30]; + x[32] = input[1]; + x[34] = input[17]; + x[36] = input[9]; + x[38] = input[25]; + x[40] = input[5]; + x[42] = input[21]; + x[44] = input[13]; + x[46] = input[29]; + x[48] = input[3]; + x[50] = input[19]; + x[52] = input[11]; + x[54] = input[27]; + x[56] = input[7]; + x[58] = input[23]; + x[60] = input[15]; + x[62] = input[31]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]); + btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]); + btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]); + btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]); + btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]); + btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]); + btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + btf_16_adds_subs_avx2(&x[32], &x[33]); + btf_16_adds_subs_avx2(&x[35], &x[34]); + btf_16_adds_subs_avx2(&x[36], &x[37]); + btf_16_adds_subs_avx2(&x[39], &x[38]); + btf_16_adds_subs_avx2(&x[40], &x[41]); + btf_16_adds_subs_avx2(&x[43], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[45]); + btf_16_adds_subs_avx2(&x[47], &x[46]); + btf_16_adds_subs_avx2(&x[48], &x[49]); + btf_16_adds_subs_avx2(&x[51], &x[50]); + btf_16_adds_subs_avx2(&x[52], &x[53]); + btf_16_adds_subs_avx2(&x[55], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[57]); + btf_16_adds_subs_avx2(&x[59], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[61]); + btf_16_adds_subs_avx2(&x[63], &x[62]); + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_avx2(&x[16], &x[17]); + btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); + idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 5 + 
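+  // Note (editorial): as in the other stages of this function, only the
+  // x[0..15] portion of stage 5 is written out inline; the remaining
+  // x[16..63] butterflies are delegated to idct64_stage5_high48_avx2(),
+  // which is shared with the other idct64 variants and called just below.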
btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, + INV_COS_BIT); + idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 7 + btf_16_adds_subs_avx2(&x[0], &x[3]); + btf_16_adds_subs_avx2(&x[1], &x[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 8 + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, + INV_COS_BIT); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, + INV_COS_BIT); + idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT); + + // stage 9~11 + idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT); + idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT); + idct64_stage11_avx2(output, x); +} + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output); + +// 1D functions process 16 pixels at one time. 
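+// Editorial note: the table below is indexed as
+//   [tx_size index][1-D transform type][eob bucket],
+// where the eob bucket (looked up through lowbd_txfm_all_1d_zeros_idx) selects
+// the cheapest variant -- low1 / low8 / low16 / full -- that still covers all
+// nonzero input coefficients for that row or column pass. The identity rows
+// are left NULL because identity transforms take the separate
+// lowbd_inv_txfm2d_add_*identity_avx2 paths further down.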
+static const transform_1d_avx2 + lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, + idct64_low32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +// only process w >= 16 h >= 16 +static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4; + const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0])); + for (int i = 0; i < buf_size_nonzero_h_div16; i++) { + __m256i buf0[64]; + load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code + } + row_txfm(buf0, buf0); + for (int j = 0; j < txfm_size_col; ++j) { + buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0); + } + + __m256i *buf1_cur = buf1 + (i << 4); + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_avx2(buf0 + 16 * j, temp, 16); + int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); + transpose_16bit_16x16_avx2(temp, buf1_cur + offset); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j); + } + } + } + const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1])); + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i *buf1_cur = buf1 + i * txfm_size_row; + col_txfm(buf1_cur, buf1_cur); + for (int j = 0; j < txfm_size_row; ++j) { + buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1); + } + } + for (int i = 0; i < buf_size_w_div16; i++) { + lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, + stride, ud_flip, txfm_size_row); + } +} + +static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, + int stride, int shift, int height, + 
int txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); + const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } else { + const __m256i rect_scale = + _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + src = _mm256_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, + __m256i *buf, int shift, int height, + int txh_idx) { + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); + const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1)); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r); + for (int h = 0; h < height; ++h) { + __m256i lo = _mm256_unpacklo_epi16(buf[h], one); + __m256i hi = _mm256_unpackhi_epi16(buf[h], one); + lo = _mm256_madd_epi16(lo, scale_coeff); + hi = _mm256_madd_epi16(hi, scale_coeff); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits); + lo = _mm256_add_epi32(lo, shift__r); + hi = _mm256_add_epi32(hi, shift__r); + lo = _mm256_srai_epi32(lo, -shift); + hi = _mm256_srai_epi32(hi, -shift); + const __m256i x = _mm256_packs_epi32(lo, hi); + write_recon_w16_avx2(x, output); + output += stride; + } +} + +static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_SIZE tx_size, + int32_t eob) { + (void)eob; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int col_max = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + __m256i buf[32]; + + for (int i = 0; i < (col_max >> 4); ++i) { + for (int j = 0; j < (row_max >> 4); j++) { + iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride, + row_max, shift[0], 16, txw_idx, rect_type); + transpose_16bit_16x16_avx2(buf, buf); + iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf, + shift[1], 16, txh_idx); + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( + const int32_t *input, uint8_t *output, int stride, 
TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row); + const int input_stride = txfm_size_row_notzero; + const int buf_size_w_div16 = (eobx + 16) >> 4; + const int buf_size_h_div16 = (eoby + 16) >> 4; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i buf0[64]; + for (int j = 0; j < buf_size_h_div16; j++) { + __m256i *buf0_cur = buf0 + j * 16; + const int32_t *input_cur = input + i * 16 * input_stride + j * 16; + iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16, + txw_idx, rect_type); + transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); + } + col_txfm(buf0, buf0); + __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift); + write_recon_w16_avx2(res, output + (i << 4) + j * stride); + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_h_div16 = (eoby + 16) >> 4; + const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + + assert(row_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div16; i++) { + __m256i buf0[64]; + load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code + } + row_txfm(buf0, buf0); + round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); + __m256i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_avx2(buf0 + 16 * j, temp, 16); + transpose_16bit_16x16_avx2(temp, + _buf1 + 16 * (buf_size_w_div16 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j); + } + } + for (int j = 0; j < buf_size_w_div16; ++j) { + iidentity_col_16xn_avx2(output + i * 16 * stride + 
j * 16, stride, + buf1 + j * 16, shift[1], 16, txh_idx); + } + } +} + +static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = { + { av1_idct8_low1_ssse3, av1_idct8_sse2 }, + { av1_iadst8_low1_ssse3, av1_iadst8_sse2 } +}; + +static INLINE void load_buffer_avx2(const int32_t *in, int stride, + __m128i *out) { + const __m256i a = _mm256_load_si256((const __m256i *)in); + const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1)); + const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2)); + const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3)); + const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4)); + const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5)); + const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6)); + const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7)); + + // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 + const __m256i ab_16bit = _mm256_packs_epi32(a, b); + // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7 + const __m256i cd_16bit = _mm256_packs_epi32(c, d); + // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7 + const __m256i ef_16bit = _mm256_packs_epi32(e, f); + // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7 + const __m256i gh_16bit = _mm256_packs_epi32(g, h); + + // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7 + const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8); + // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7 + const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8); + // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7 + const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8); + // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7 + const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8); + + out[0] = _mm256_castsi256_si128(ab); + out[1] = _mm256_extractf128_si256(ab, 1); + out[2] = _mm256_castsi256_si128(cd); + out[3] = _mm256_extractf128_si256(cd, 1); + out[4] = _mm256_castsi256_si128(ef); + out[5] = _mm256_extractf128_si256(ef, 1); + out[6] = _mm256_castsi256_si128(gh); + out[7] = _mm256_extractf128_si256(gh, 1); +} + +static INLINE void round_and_transpose_avx2(const __m128i *const in, + __m128i *const out, int bit, + int *lr_flip) { + __m256i buf_temp[4]; + const __m256i scale = _mm256_set1_epi16(1 << (15 + bit)); + int j = *lr_flip ? 7 : 0; + const int step = *lr_flip ? 
-1 : 1; + + // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37 + buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), + in[j + 4 * step], 1); + j += step; + // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27 + buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), + in[j + 4 * step], 1); + j += step; + // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17 + buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), + in[j + 4 * step], 1); + j += step; + // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07 + buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), + in[j + 4 * step], 1); + + // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37 + buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale); + // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27 + buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale); + // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17 + buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale); + // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07 + buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale); + + // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23 + const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]); + // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27 + const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]); + // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03 + const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]); + // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07 + const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]); + + // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01 + const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1); + // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03 + const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1); + // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05 + const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1); + // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07 + const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1); + + // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01 + const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8); + // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03 + const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8); + // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05 + const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8); + // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07 + const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8); + + // 70 60 50 40 30 20 10 00 + out[0] = _mm256_castsi256_si128(reg_00); + // 71 61 51 41 31 21 11 01 + out[1] = _mm256_extracti128_si256(reg_00, 1); + // 72 62 52 42 32 22 12 02 + out[2] = _mm256_castsi256_si128(reg_01); + // 73 63 53 43 33 23 13 03 + out[3] = _mm256_extracti128_si256(reg_01, 1); + // 74 64 54 44 34 24 14 04 + out[4] = _mm256_castsi256_si128(reg_10); + // 75 65 55 45 35 25 15 05 + out[5] = _mm256_extracti128_si256(reg_10, 1); + // 76 66 56 46 36 26 16 06 + out[6] = _mm256_castsi256_si128(reg_11); + // 77 67 57 47 37 27 17 07 + out[7] = _mm256_extracti128_si256(reg_11, 1); +} + +static INLINE void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit, + uint8_t *output, + int stride, int flipud) { + __m256i in_256[4], v_256[4]; + int j = flipud ? 7 : 0; + const int step = flipud ? 
-1 : 1; + const __m256i scale = _mm256_set1_epi16(1 << (15 + bit)); + const __m256i zero = _mm256_setzero_si256(); + // in[0], in[1] + in_256[0] = + _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); + j += 2 * step; + // in[2], in[3] + in_256[1] = + _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); + j += 2 * step; + // in[4], in[5] + in_256[2] = + _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); + j += 2 * step; + // in[6], in[7] + in_256[3] = + _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1); + + // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17 + in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale); + // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37 + in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale); + // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57 + in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale); + // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77 + in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale); + + const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output)); + const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride)); + const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); + const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); + const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride)); + const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride)); + const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride)); + const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride)); + + v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1); + v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1); + v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1); + v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1); + + const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero); + const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero); + const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero); + const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero); + // 00 01 10 11 + const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0); + // 20 21 30 31 + const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1); + // 40 41 50 51 + const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2); + // 60 61 70 71 + const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3); + + // 00 01 20 21 10 11 30 31 + const __m256i res_0123 = _mm256_packus_epi16(x0, x1); + // 40 41 60 61 50 51 70 71 + const __m256i res_4567 = _mm256_packus_epi16(x2, x3); + + // 00 01 20 21 + const __m128i res_02 = _mm256_castsi256_si128(res_0123); + // 10 11 30 31 + const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1); + // 40 41 60 61 + const __m128i res_46 = _mm256_castsi256_si128(res_4567); + // 50 51 70 71 + const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1); + + // 00 01 + _mm_storel_epi64((__m128i *)(output), res_02); + // 10 11 + _mm_storel_epi64((__m128i *)(output + stride), res_13); + // 20 21 + _mm_storel_epi64((__m128i *)(output + 2 * stride), + _mm_unpackhi_epi64(res_02, res_02)); + // 30 31 + _mm_storel_epi64((__m128i *)(output + 3 * stride), + _mm_unpackhi_epi64(res_13, res_13)); + // 40 41 + _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46); + // 50 51 + _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57); + // 60 61 + 
_mm_storel_epi64((__m128i *)(output + 6 * stride), + _mm_unpackhi_epi64(res_46, res_46)); + // 70 71 + _mm_storel_epi64((__m128i *)(output + 7 * stride), + _mm_unpackhi_epi64(res_57, res_57)); +} + +// AVX2 implementation has the advantage when combined multiple operations +// together. +static INLINE void lowbd_inv_txfm2d_8x8_no_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[8]; + const int input_stride = 8; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + assert(hitx_1d_tab[tx_type] < 2); + assert(vitx_1d_tab[tx_type] < 2); + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + __m128i buf0[8]; + __m128i *buf0_cur = buf0; + load_buffer_avx2(input, input_stride, buf0_cur); + row_txfm(buf0, buf0); + + assert(shift[0] < 0); + __m128i *_buf1 = buf1; + round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip); + assert(shift[1] < 0); + col_txfm(buf1, buf1); + round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip); +} + +// AVX2 implementation of 8x8 inverse transform. Observed that coding AVX2 for +// tx_type with identity in either of the direction has no advantage. +static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case IDTX: + av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); + + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + default: + lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + } +} + +// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)eob; + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: // ADST in vertical, DCT in horizontal + case DCT_ADST: // DCT in vertical, ADST in horizontal + case ADST_ADST: // ADST in both directions + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + default: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + case TX_4X8: + case TX_8X4: + case TX_8X16: + case TX_16X8: + case TX_4X16: + case TX_16X4: + case TX_8X32: 
+ case TX_32X8: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_8X8: + lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X16: + case TX_32X32: + case TX_64X64: + case TX_16X32: + case TX_32X16: + case TX_32X64: + case TX_64X32: + case TX_16X64: + case TX_64X16: + default: + lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h new file mode 100644 index 0000000000..a09dea389f --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// half input is zero +#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \ + do { \ + const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \ + const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \ + const __m256i _in = in; \ + out0 = _mm256_mulhrs_epi16(_in, _w0); \ + out1 = _mm256_mulhrs_epi16(_in, _w1); \ + } while (0) + +static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, + int size) { + const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm256_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) { + __m128i pred = _mm_loadu_si128((__m128i const *)(output)); + __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res); + __m128i y = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168)); + _mm_storeu_si128((__m128i *)(output), y); +} + +static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + write_recon_w16_avx2(in[j], output + i * stride); + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob); +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c new file mode 100644 index 0000000000..79a6064c3e --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c @@ -0,0 +1,2904 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" + +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +// TODO(binpengsmail@gmail.com): replace some for loop with do {} while + +static void idct4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + x[3] = input[3]; + + // stage 2 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +static void idct4_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + x[3] = input[3]; + + // stage 2 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output) { + const int32_t *cospi = 
cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 4 + // stage 5 + output[0] = x[0]; + output[7] = x[0]; + output[1] = x[1]; + output[6] = x[1]; + output[2] = x[1]; + output[5] = x[1]; + output[3] = x[0]; + output[4] = x[0]; +} + +void av1_idct8_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + + // stage 4 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + + // stage 5 + btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); + btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); + btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); + btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); +} + +static void idct8_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], 
x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + + // stage 4 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + + // stage 5 + btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); + btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); + btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); + btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); +} + +static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); +} + +static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); +} + +static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]); + btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]); + btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]); + btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]); + btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]); + btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]); + btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]); + btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]); +} + +static void idct16_low1_ssse3(const __m128i *input, __m128i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 5 + // stage 6 + // stage 7 + output[0] = x[0]; + output[15] = x[0]; + output[1] = x[1]; + output[14] = x[1]; + output[2] = x[1]; + output[13] = x[1]; + output[3] = x[0]; + output[12] = x[0]; + output[4] = x[0]; + output[11] = x[0]; + output[5] = x[1]; + output[10] = x[1]; + output[6] = x[1]; + output[9] = x[1]; + output[7] = x[0]; + output[8] = x[0]; +} + +static void idct16_low8_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[2] = input[4]; + x[4] = input[2]; + x[6] = input[6]; + x[8] = input[1]; + x[10] = input[5]; + x[12] = input[3]; + x[14] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[60], cospi[4], x[8], 
x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + + // stage 3 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + idct16_stage5_sse2(x, cospi, __rounding, cos_bit); + idct16_stage6_sse2(x, cospi, __rounding, cos_bit); + idct16_stage7_sse2(output, x); +} + +static void idct16_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[1] = input[8]; + x[2] = input[4]; + x[3] = input[12]; + x[4] = input[2]; + x[5] = input[10]; + x[6] = input[6]; + x[7] = input[14]; + x[8] = input[1]; + x[9] = input[9]; + x[10] = input[5]; + x[11] = input[13]; + x[12] = input[3]; + x[13] = input[11]; + x[14] = input[7]; + x[15] = input[15]; + + // stage 2 + btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + + // stage 3 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], 
x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + // stage 5~7 + idct16_stage5_sse2(x, cospi, __rounding, cos_bit); + idct16_stage6_sse2(x, cospi, __rounding, cos_bit); + idct16_stage7_sse2(output, x); +} + +static void idct16_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[1] = input[8]; + x[2] = input[4]; + x[3] = input[12]; + x[4] = input[2]; + x[5] = input[10]; + x[6] = input[6]; + x[7] = input[14]; + x[8] = input[1]; + x[9] = input[9]; + x[10] = input[5]; + x[11] = input[13]; + x[12] = input[3]; + x[13] = input[11]; + x[14] = input[7]; + x[15] = input[15]; + + // stage 2 + btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + + // stage 3 + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + 
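// A sketch of what the btf_16_*_sse2 butterflies used throughout these
// kernels are assumed to compute (the macro bodies live in av1_txfm_sse2.h
// and are not part of this hunk). With INV_COS_BIT == 12 and
// cospi[i] ~= cos(i * pi / 128) * 2^12, a call of the form
//   btf_16_sse2(pair_set_epi16(a, b), pair_set_epi16(c, d), in0, in1, out0, out1)
// should yield, per 16-bit lane,
//   out0 = round((a * in0 + b * in1) / 2^12)
//   out1 = round((c * in0 + d * in1) / 2^12)
// so the cospi_m32_p32 / cospi_p32_p32 pair applied to x[5], x[6] just below
// is the 45-degree rotation (x5, x6) -> ((x6 - x5) / sqrt(2), (x6 + x5) / sqrt(2)).
// The *_4p_sse2 variants appear to do the same on the low four lanes only,
// which is why the 4-wide (w4) column transforms use them.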
btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + + // stage 7 + idct16_stage7_sse2(output, x); +} + +static INLINE void idct32_high16_stage3_sse2(__m128i *x) { + btf_16_adds_subs_sse2(x[16], x[17]); + btf_16_subs_adds_sse2(x[19], x[18]); + btf_16_adds_subs_sse2(x[20], x[21]); + btf_16_subs_adds_sse2(x[23], x[22]); + btf_16_adds_subs_sse2(x[24], x[25]); + btf_16_subs_adds_sse2(x[27], x[26]); + btf_16_adds_subs_sse2(x[28], x[29]); + btf_16_subs_adds_sse2(x[31], x[30]); +} + +static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); +} + +static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + btf_16_adds_subs_sse2(x[16], x[19]); + btf_16_adds_subs_sse2(x[17], x[18]); + btf_16_subs_adds_sse2(x[23], x[20]); + btf_16_subs_adds_sse2(x[22], x[21]); + btf_16_adds_subs_sse2(x[24], x[27]); + btf_16_adds_subs_sse2(x[25], x[26]); + btf_16_subs_adds_sse2(x[31], x[28]); + btf_16_subs_adds_sse2(x[30], x[29]); +} + +static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + 
btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); +} + +static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + btf_16_adds_subs_sse2(x[16], x[23]); + btf_16_adds_subs_sse2(x[17], x[22]); + btf_16_adds_subs_sse2(x[18], x[21]); + btf_16_adds_subs_sse2(x[19], x[20]); + btf_16_subs_adds_sse2(x[31], x[24]); + btf_16_subs_adds_sse2(x[30], x[25]); + btf_16_subs_adds_sse2(x[29], x[26]); + btf_16_subs_adds_sse2(x[28], x[27]); +} + +static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[15]); + btf_16_adds_subs_sse2(x[1], x[14]); + btf_16_adds_subs_sse2(x[2], x[13]); + btf_16_adds_subs_sse2(x[3], x[12]); + btf_16_adds_subs_sse2(x[4], x[11]); + btf_16_adds_subs_sse2(x[5], x[10]); + btf_16_adds_subs_sse2(x[6], x[9]); + btf_16_adds_subs_sse2(x[7], x[8]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); +} + +static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]); + btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]); + btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]); + btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]); + btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]); + btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]); + btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]); + btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]); + btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]); + btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]); + btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]); + btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]); + btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]); + btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]); + btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]); + btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]); +} + +static void idct32_low1_ssse3(const __m128i *input, __m128i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + output[0] = x[0]; + output[31] = x[0]; + output[1] = x[1]; + output[30] = x[1]; + output[2] = x[1]; + output[29] = x[1]; + output[3] = x[0]; + output[28] = x[0]; + output[4] = x[0]; + output[27] = x[0]; + output[5] = x[1]; + output[26] = x[1]; + output[6] = x[1]; + output[25] = 
x[1]; + output[7] = x[0]; + output[24] = x[0]; + output[8] = x[0]; + output[23] = x[0]; + output[9] = x[1]; + output[22] = x[1]; + output[10] = x[1]; + output[21] = x[1]; + output[11] = x[0]; + output[20] = x[0]; + output[12] = x[0]; + output[19] = x[0]; + output[13] = x[1]; + output[18] = x[1]; + output[14] = x[1]; + output[17] = x[1]; + output[15] = x[0]; + output[16] = x[0]; +} + +static void idct32_low8_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[4] = input[4]; + x[8] = input[2]; + x[12] = input[6]; + x[16] = input[1]; + x[20] = input[5]; + x[24] = input[3]; + x[28] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + + // stage 4 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + // stage 6 + x[3] = x[0]; + x[2] = x[1]; + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static void idct32_low16_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[2] = input[8]; + x[4] = input[4]; + x[6] = input[12]; + x[8] = input[2]; + x[10] = input[10]; + x[12] = input[6]; + x[14] = input[14]; + x[16] = input[1]; + x[18] = input[9]; + x[20] = input[5]; + x[22] = input[13]; + x[24] = input[3]; + x[26] = input[11]; + x[28] = input[7]; + x[30] = input[15]; + + // stage 2 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + idct32_high16_stage3_sse2(x); + + // stage 4 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], 
x[14]); + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static void idct32_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[1] = input[16]; + x[2] = input[8]; + x[3] = input[24]; + x[4] = input[4]; + x[5] = input[20]; + x[6] = input[12]; + x[7] = input[28]; + x[8] = input[2]; + x[9] = input[18]; + x[10] = input[10]; + x[11] = input[26]; + x[12] = input[6]; + x[13] = input[22]; + x[14] = input[14]; + x[15] = input[30]; + x[16] = input[1]; + x[17] = input[17]; + x[18] = input[9]; + x[19] = input[25]; + x[20] = input[5]; + x[21] = 
input[21]; + x[22] = input[13]; + x[23] = input[29]; + x[24] = input[3]; + x[25] = input[19]; + x[26] = input[11]; + x[27] = input[27]; + x[28] = input[7]; + x[29] = input[23]; + x[30] = input[15]; + x[31] = input[31]; + + // stage 2 + btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]); + btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]); + + // stage 3 + btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + idct32_high16_stage3_sse2(x); + + // stage 4 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_adds_subs_sse2(x[7], x[6]); + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + // stage 7~8 + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]); + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]); + 
btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); +} + +static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + btf_16_adds_subs_sse2(x[32], x[35]); + btf_16_adds_subs_sse2(x[33], x[34]); + btf_16_subs_adds_sse2(x[39], x[36]); + btf_16_subs_adds_sse2(x[38], x[37]); + btf_16_adds_subs_sse2(x[40], x[43]); + btf_16_adds_subs_sse2(x[41], x[42]); + btf_16_subs_adds_sse2(x[47], x[44]); + btf_16_subs_adds_sse2(x[46], x[45]); + btf_16_adds_subs_sse2(x[48], x[51]); + btf_16_adds_subs_sse2(x[49], x[50]); + btf_16_subs_adds_sse2(x[55], x[52]); + btf_16_subs_adds_sse2(x[54], x[53]); + btf_16_adds_subs_sse2(x[56], x[59]); + btf_16_adds_subs_sse2(x[57], x[58]); + btf_16_subs_adds_sse2(x[63], x[60]); + btf_16_subs_adds_sse2(x[62], x[61]); +} + +static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); +} + +static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + btf_16_adds_subs_sse2(x[16], x[19]); + btf_16_adds_subs_sse2(x[17], x[18]); + btf_16_subs_adds_sse2(x[23], x[20]); + btf_16_subs_adds_sse2(x[22], x[21]); + btf_16_adds_subs_sse2(x[24], x[27]); + btf_16_adds_subs_sse2(x[25], x[26]); + btf_16_subs_adds_sse2(x[31], x[28]); + btf_16_subs_adds_sse2(x[30], x[29]); + idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); +} + +static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i 
cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); + btf_16_adds_subs_sse2(x[32], x[39]); + btf_16_adds_subs_sse2(x[33], x[38]); + btf_16_adds_subs_sse2(x[34], x[37]); + btf_16_adds_subs_sse2(x[35], x[36]); + btf_16_subs_adds_sse2(x[47], x[40]); + btf_16_subs_adds_sse2(x[46], x[41]); + btf_16_subs_adds_sse2(x[45], x[42]); + btf_16_subs_adds_sse2(x[44], x[43]); + btf_16_adds_subs_sse2(x[48], x[55]); + btf_16_adds_subs_sse2(x[49], x[54]); + btf_16_adds_subs_sse2(x[50], x[53]); + btf_16_adds_subs_sse2(x[51], x[52]); + btf_16_subs_adds_sse2(x[63], x[56]); + btf_16_subs_adds_sse2(x[62], x[57]); + btf_16_subs_adds_sse2(x[61], x[58]); + btf_16_subs_adds_sse2(x[60], x[59]); +} + +static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_adds_subs_sse2(x[16], x[23]); + btf_16_adds_subs_sse2(x[17], x[22]); + btf_16_adds_subs_sse2(x[18], x[21]); + btf_16_adds_subs_sse2(x[19], x[20]); + btf_16_subs_adds_sse2(x[31], x[24]); + btf_16_subs_adds_sse2(x[30], x[25]); + btf_16_subs_adds_sse2(x[29], x[26]); + btf_16_subs_adds_sse2(x[28], x[27]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); +} + +static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[15]); + btf_16_adds_subs_sse2(x[1], x[14]); + btf_16_adds_subs_sse2(x[2], x[13]); + btf_16_adds_subs_sse2(x[3], x[12]); + btf_16_adds_subs_sse2(x[4], x[11]); + btf_16_adds_subs_sse2(x[5], x[10]); + btf_16_adds_subs_sse2(x[6], x[9]); + btf_16_adds_subs_sse2(x[7], x[8]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); + btf_16_adds_subs_sse2(x[32], x[47]); + btf_16_adds_subs_sse2(x[33], x[46]); + btf_16_adds_subs_sse2(x[34], x[45]); + btf_16_adds_subs_sse2(x[35], x[44]); + btf_16_adds_subs_sse2(x[36], x[43]); + btf_16_adds_subs_sse2(x[37], x[42]); + btf_16_adds_subs_sse2(x[38], x[41]); + btf_16_adds_subs_sse2(x[39], x[40]); + btf_16_subs_adds_sse2(x[63], x[48]); + btf_16_subs_adds_sse2(x[62], x[49]); + btf_16_subs_adds_sse2(x[61], x[50]); + btf_16_subs_adds_sse2(x[60], x[51]); + 
btf_16_subs_adds_sse2(x[59], x[52]); + btf_16_subs_adds_sse2(x[58], x[53]); + btf_16_subs_adds_sse2(x[57], x[54]); + btf_16_subs_adds_sse2(x[56], x[55]); +} + +static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[31]); + btf_16_adds_subs_sse2(x[1], x[30]); + btf_16_adds_subs_sse2(x[2], x[29]); + btf_16_adds_subs_sse2(x[3], x[28]); + btf_16_adds_subs_sse2(x[4], x[27]); + btf_16_adds_subs_sse2(x[5], x[26]); + btf_16_adds_subs_sse2(x[6], x[25]); + btf_16_adds_subs_sse2(x[7], x[24]); + btf_16_adds_subs_sse2(x[8], x[23]); + btf_16_adds_subs_sse2(x[9], x[22]); + btf_16_adds_subs_sse2(x[10], x[21]); + btf_16_adds_subs_sse2(x[11], x[20]); + btf_16_adds_subs_sse2(x[12], x[19]); + btf_16_adds_subs_sse2(x[13], x[18]); + btf_16_adds_subs_sse2(x[14], x[17]); + btf_16_adds_subs_sse2(x[15], x[16]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); +} + +static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]); + btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]); + btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]); + btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]); + btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]); + btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]); + btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]); + btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]); + btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]); + btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]); + btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]); + btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]); + btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]); + btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]); + btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]); + btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]); + btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]); + btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]); + btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]); + btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]); + btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]); + btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]); + btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]); + btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]); + btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]); + btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]); + btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]); + btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]); + 
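// Stage 11 is the closing butterfly of the 64-point IDCT. Assuming
// btf_16_adds_subs_out_sse2(out0, out1, a, b) expands to
//   out0 = _mm_adds_epi16(a, b);  out1 = _mm_subs_epi16(a, b);
// (saturating 16-bit add and subtract), each statement in this stage forms
//   output[i]      = sat16(x[i] + x[63 - i])
//   output[63 - i] = sat16(x[i] - x[63 - i])
// i.e. the 64 outputs are the symmetric sums and differences of the 32 pairs.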
btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]); + btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]); + btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]); + btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]); +} + +static void idct64_low1_ssse3(const __m128i *input, __m128i *output) { + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + output[0] = x[0]; + output[63] = x[0]; + output[1] = x[1]; + output[62] = x[1]; + output[2] = x[1]; + output[61] = x[1]; + output[3] = x[0]; + output[60] = x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; + output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + 
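// Only the eight coefficients gathered in stage 1 are nonzero, so stages 2 + // and 3 reduce to single-operand btf_16_ssse3 half-butterflies and plain + // copies; the later stages reuse the shared idct64_stage*_sse2 helpers. +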
btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] = x[23]; + x[25] = x[24]; + x[30] = x[31]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + + // stage 5 + x[9] = x[8]; + x[14] = x[15]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + x[35] = x[32]; + x[34] = x[33]; + x[36] = x[39]; + x[37] = x[38]; + x[43] = x[40]; + x[42] = x[41]; + x[44] = x[47]; + x[45] = x[46]; + x[51] = x[48]; + x[50] = x[49]; + x[52] = x[55]; + x[53] = x[54]; + x[59] = x[56]; + x[58] = x[57]; + x[60] = x[63]; + x[61] = x[62]; + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + x[19] = x[16]; + x[18] = x[17]; + x[20] = x[23]; + x[21] = x[22]; + x[27] = x[24]; + x[26] = x[25]; + x[28] = x[31]; + x[29] = x[30]; + idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + x[11] = x[8]; + x[10] = x[9]; + x[12] = x[15]; + x[13] = x[14]; + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + x[7] = x[0]; + x[6] = x[1]; + x[5] = x[2]; + x[4] = x[3]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low16_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[4] = input[8]; + x[8] = input[4]; + x[12] = input[12]; + x[16] = input[2]; + x[20] = input[10]; + x[24] = input[6]; + x[28] = input[14]; + x[32] = input[1]; + x[36] = input[9]; + x[40] = input[5]; + x[44] = input[13]; + x[48] = input[3]; + x[52] = input[11]; + x[56] = input[7]; + x[60] = input[15]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], 
cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[34] = x[35]; + x[37] = x[36]; + x[38] = x[39]; + x[41] = x[40]; + x[42] = x[43]; + x[45] = x[44]; + x[46] = x[47]; + x[49] = x[48]; + x[50] = x[51]; + x[53] = x[52]; + x[54] = x[55]; + x[57] = x[56]; + x[58] = x[59]; + x[61] = x[60]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low32_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[2] = input[16]; + x[4] = input[8]; + x[6] = input[24]; + x[8] = input[4]; + x[10] = input[20]; + x[12] = input[12]; + x[14] = input[28]; + x[16] = input[2]; + x[18] = input[18]; + x[20] = input[10]; + x[22] = input[26]; + x[24] = input[6]; + x[26] = input[22]; + x[28] = input[14]; + x[30] = input[30]; + x[32] = input[1]; + x[34] = input[17]; + x[36] = input[9]; + x[38] = input[25]; + x[40] = input[5]; + x[42] = input[21]; + x[44] = input[13]; + x[46] = input[29]; + x[48] = input[3]; + x[50] = input[19]; 
+ x[52] = input[11]; + x[54] = input[27]; + x[56] = input[7]; + x[58] = input[23]; + x[60] = input[15]; + x[62] = input[31]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]); + btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]); + btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]); + btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]); + btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + btf_16_adds_subs_sse2(x[32], x[33]); + btf_16_subs_adds_sse2(x[35], x[34]); + btf_16_adds_subs_sse2(x[36], x[37]); + btf_16_subs_adds_sse2(x[39], x[38]); + btf_16_adds_subs_sse2(x[40], x[41]); + btf_16_subs_adds_sse2(x[43], x[42]); + btf_16_adds_subs_sse2(x[44], x[45]); + btf_16_subs_adds_sse2(x[47], x[46]); + btf_16_adds_subs_sse2(x[48], x[49]); + btf_16_subs_adds_sse2(x[51], x[50]); + btf_16_adds_subs_sse2(x[52], x[53]); + btf_16_subs_adds_sse2(x[55], x[54]); + btf_16_adds_subs_sse2(x[56], x[57]); + btf_16_subs_adds_sse2(x[59], x[58]); + btf_16_adds_subs_sse2(x[60], x[61]); + btf_16_subs_adds_sse2(x[63], x[62]); + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_sse2(x[16], x[17]); + btf_16_subs_adds_sse2(x[19], x[18]); + btf_16_adds_subs_sse2(x[20], x[21]); + btf_16_subs_adds_sse2(x[23], x[22]); + btf_16_adds_subs_sse2(x[24], x[25]); + btf_16_subs_adds_sse2(x[27], x[26]); + btf_16_adds_subs_sse2(x[28], x[29]); + btf_16_subs_adds_sse2(x[31], x[30]); + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, 
__rounding, cos_bit); + + // stage 7 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 9~11 + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void iadst4_sse2(const __m128i *input, __m128i *output) { + const int32_t *sinpi = sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[4]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpackhi_epi16(x0[0], x0[2]); + u[2] = _mm_unpacklo_epi16(x0[1], x0[3]); + u[3] = _mm_unpackhi_epi16(x0[1], x0[3]); + + __m128i x1[16]; + x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04); + x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01); + x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02); + x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04); + x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03); + x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x2*sin3 + x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03); + x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02); + x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01); + + __m128i x2[8]; + x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[5]); + x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4 + x2[3] = _mm_add_epi32(x1[3], x1[7]); + x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3 + x2[5] = _mm_add_epi32(x1[9], x1[11]); + x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin1 +x2*sin4 +x0*sin2 -x2*sin1 + x2[7] = _mm_add_epi32(x1[13], x1[15]); + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[2 * i], rounding); + __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + out1 = 
_mm_srai_epi32(out1, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out1); + } +} + +static void iadst4_w4_sse2(const __m128i *input, __m128i *output) { + const int32_t *sinpi = sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[2]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpacklo_epi16(x0[1], x0[3]); + + __m128i x1[8]; + x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x2*sin3 + x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + + __m128i x2[4]; + x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4 + x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3 + x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1 + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[i], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out0); + } +} + +void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]); + + // stage 3 + x[4] = x[0]; + x[5] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + + // stage 5 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + + // stage 6 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +void av1_iadst8_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero 
= _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst8_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = 
pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static INLINE void iadst16_stage3_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[8]); + btf_16_adds_subs_sse2(x[1], x[9]); + btf_16_adds_subs_sse2(x[2], x[10]); + btf_16_adds_subs_sse2(x[3], x[11]); + btf_16_adds_subs_sse2(x[4], x[12]); + btf_16_adds_subs_sse2(x[5], x[13]); + btf_16_adds_subs_sse2(x[6], x[14]); + btf_16_adds_subs_sse2(x[7], x[15]); +} + +static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage5_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + btf_16_adds_subs_sse2(x[8], x[12]); + btf_16_adds_subs_sse2(x[9], x[13]); + btf_16_adds_subs_sse2(x[10], x[14]); + btf_16_adds_subs_sse2(x[11], x[15]); +} + +static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + 
btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage7_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + btf_16_adds_subs_sse2(x[8], x[10]); + btf_16_adds_subs_sse2(x[9], x[11]); + btf_16_adds_subs_sse2(x[12], x[14]); + btf_16_adds_subs_sse2(x[13], x[15]); +} + +static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { + const __m128i __zero = _mm_setzero_si128(); + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[8]); + output[2] = x[12]; + output[3] = _mm_subs_epi16(__zero, x[4]); + output[4] = x[6]; + output[5] = _mm_subs_epi16(__zero, x[14]); + output[6] = x[10]; + output[7] = _mm_subs_epi16(__zero, x[2]); + output[8] = x[3]; + output[9] = _mm_subs_epi16(__zero, x[11]); + output[10] = x[15]; + output[11] = _mm_subs_epi16(__zero, x[7]); + output[12] = x[5]; + output[13] = _mm_subs_epi16(__zero, x[13]); + output[14] = x[9]; + output[15] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst16_low1_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + + // stage 3 + x[8] = x[0]; + x[9] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + + // stage 5 + x[4] = x[0]; + x[5] = x[1]; + x[12] = x[8]; + x[13] = x[9]; + + // stage 6 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + + // stage 7 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + x[10] = x[8]; + x[11] = x[9]; + x[14] = x[12]; + x[15] = x[13]; + + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +static void iadst16_low8_ssse3(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + x[3] = input[2]; + x[5] = input[4]; + x[7] = input[6]; + x[8] = input[7]; + x[10] = input[5]; + x[12] = input[3]; + x[14] = input[1]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]); + btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]); + btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]); + 
btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]); + btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]); + btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]); + btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} +static void iadst16_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3~9 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +static void iadst16_w4_sse2(const __m128i *input, __m128i *output) { + const int8_t cos_bit = INV_COS_BIT; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const 
__m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + + // stage 4 + btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); + + // stage 5 + iadst16_stage5_ssse3(x); + + // stage 6 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); + + // stage 7 + iadst16_stage7_ssse3(x); + + // stage 8 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + 
btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); + + // stage 9 + iadst16_stage9_ssse3(output, x); +} + +static void iidentity4_ssse3(const __m128i *input, __m128i *output) { + const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); + for (int i = 0; i < 4; ++i) { + __m128i x = _mm_mulhrs_epi16(input[i], scale); + output[i] = _mm_adds_epi16(x, input[i]); + } +} + +static void iidentity8_sse2(const __m128i *input, __m128i *output) { + for (int i = 0; i < 8; ++i) { + output[i] = _mm_adds_epi16(input[i], input[i]); + } +} + +static void iidentity16_ssse3(const __m128i *input, __m128i *output) { + const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); + for (int i = 0; i < 16; ++i) { + __m128i x = _mm_mulhrs_epi16(input[i], scale); + __m128i srcx2 = _mm_adds_epi16(input[i], input[i]); + output[i] = _mm_adds_epi16(x, srcx2); + } +} + +static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred, + __m128i res) { + const __m128i zero = _mm_setzero_si128(); + __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero)); + return _mm_packus_epi16(x0, x0); +} + +static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + const int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i, j += step) { + const __m128i v = _mm_cvtsi32_si128(*((int *)(output + i * stride))); + __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero)); + u = _mm_packus_epi16(u, zero); + *((int *)(output + i * stride)) = _mm_cvtsi128_si32(u); + } +} + +static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + const int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); + const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]); + _mm_storel_epi64((__m128i *)(output + i * stride), u); + } +} + +// 1D functions process process 8 pixels at one time. 
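+// In the tables below, rows are indexed by 1D transform size (4, 8, 16, 32, +// 64) and columns by 1D kernel type: DCT, ADST (FLIPADST reuses the same +// kernel, with flipping applied when the buffer is read or written) and +// identity. lowbd_txfm_all_1d_zeros_w8_arr adds a third index that selects a +// reduced kernel from how far the last nonzero coefficient (eob) reaches; +// NULL marks combinations with no dedicated routine here.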
+static const transform_1d_ssse3 + lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_sse2, iadst4_sse2, iidentity4_ssse3 }, + { av1_idct8_sse2, av1_iadst8_sse2, iidentity8_sse2 }, + { idct16_sse2, iadst16_sse2, iidentity16_ssse3 }, + { idct32_sse2, NULL, NULL }, + { idct64_low32_ssse3, NULL, NULL }, + }; + +// functions for blocks with eob at DC and within +// topleft 8x8, 16x16, 32x32 corner +static const transform_1d_ssse3 + lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { idct4_sse2, idct4_sse2, NULL, NULL }, + { iadst4_sse2, iadst4_sse2, NULL, NULL }, + { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL }, + }, + { { av1_idct8_low1_ssse3, av1_idct8_sse2, NULL, NULL }, + { av1_iadst8_low1_ssse3, av1_iadst8_sse2, NULL, NULL }, + { iidentity8_sse2, iidentity8_sse2, NULL, NULL } }, + { + { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL }, + { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3, + idct32_sse2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3, + idct64_low32_ssse3 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +// 1D functions process process 4 pixels at one time. +// used in 4x4, 4x8, 4x16, 8x4, 16x4 +static const transform_1d_ssse3 + lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 }, + { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 }, + { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 }, + { NULL, NULL, NULL }, + { NULL, NULL, NULL }, + }; + +static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, + int stride, int shift, int height, + int txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]); + const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m128i src = load_32bit_to_16bit(input_row); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm_packs_epi32(lo, hi); + } + } else { + const __m128i rect_scale = + _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m128i src = load_32bit_to_16bit(input_row); + src = _mm_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride, + __m128i *buf, int shift, int height, + int txh_idx) { + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]); + const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1)); + const __m128i one 
= _mm_set1_epi16(1); + const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding); + const __m128i zero = _mm_setzero_si128(); + for (int h = 0; h < height; ++h) { + __m128i lo = _mm_unpacklo_epi16(buf[h], one); + __m128i hi = _mm_unpackhi_epi16(buf[h], one); + lo = _mm_madd_epi16(lo, scale_coeff); + hi = _mm_madd_epi16(hi, scale_coeff); + lo = _mm_srai_epi32(lo, NewSqrt2Bits); + hi = _mm_srai_epi32(hi, NewSqrt2Bits); + lo = _mm_add_epi32(lo, shift_rounding); + hi = _mm_add_epi32(hi, shift_rounding); + lo = _mm_srai_epi32(lo, -shift); + hi = _mm_srai_epi32(hi, -shift); + __m128i x = _mm_packs_epi32(lo, hi); + + const __m128i pred = _mm_loadl_epi64((__m128i const *)(output)); + x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero)); + const __m128i u = _mm_packus_epi16(x, x); + _mm_storel_epi64((__m128i *)(output), u); + output += stride; + } +} + +void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_SIZE tx_size) { + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int col_max = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + for (int i = 0; i < (col_max >> 3); ++i) { + for (int j = 0; j < (row_max >> 3); j++) { + __m128i buf[8]; + iidentity_row_8xn_ssse3(buf, input + j * 8 + i * 8 * input_stride, + row_max, shift[0], 8, txw_idx, rect_type); + transpose_16bit_8x8(buf, buf); + iidentity_col_8xn_ssse3(output + i * 8 + j * 8 * stride, stride, buf, + shift[1], 8, txh_idx); + } + } +} + +static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[4]; + const TX_SIZE tx_size = TX_4X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col); + row_txfm(buf, buf); + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x4(temp, buf); + } else { + transpose_16bit_4x4(buf, buf); + } + col_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, + __m128i res0, __m128i res1) { + const __m128i zero = _mm_setzero_si128(); + __m128i x0 = _mm_unpacklo_epi8(pred, zero); + __m128i x1 = _mm_unpackhi_epi8(pred, zero); + x0 = _mm_adds_epi16(res0, x0); + x1 = _mm_adds_epi16(res1, x1); + return _mm_packus_epi16(x0, x1); +} + +static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]); + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, + int size) { + const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[64 * 8]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m128i buf0[64]; + load_buffer_32bit_to_16bit(input + 8 * i, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code + } + row_txfm(buf0, buf0); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1 + i * 8; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, + _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row); + round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row); + } +} + +void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = (eobx + 8) >> 3; + 
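// (eobx + 8) >> 3 equals ceil((eobx + 1) / 8); together with the matching + // eoby term below it restricts the loops to the 8x8 tiles that can contain + // nonzero coefficients. +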
const int buf_size_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + assert(fun_idx < 5); + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div8; i++) { + __m128i buf0[64]; + for (int j = 0; j < buf_size_h_div8; j++) { + __m128i *buf0_cur = buf0 + j * 8; + const int32_t *input_cur = input + i * 8 * input_stride + j * 8; + iidentity_row_8xn_ssse3(buf0_cur, input_cur, input_stride, shift[0], 8, + txw_idx, rect_type); + transpose_16bit_8x8(buf0_cur, buf0_cur); + } + col_txfm(buf0, buf0); + __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + uint8_t *out = output + 8 * i; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + const __m128i v = _mm_loadl_epi64((__m128i const *)(out)); + __m128i res = _mm_mulhrs_epi16(buf0[k], mshift); + const __m128i u = lowbd_get_recon_8x8_sse2(v, res); + _mm_storel_epi64((__m128i *)(out), u); + out += stride; + } + } +} + +void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div8; i++) { + __m128i buf0[64]; + load_buffer_32bit_to_16bit(input + i * 8, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_ssse3(buf0, buf0, buf_size_nonzero_w); // rect special code + } + row_txfm(buf0, buf0); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j); + } + } + + for (int j = 0; j < buf_size_w_div8; ++j) { + iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride, + buf1 + j * 8, shift[1], 8, txh_idx); + } + } +} + +// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case DCT_DCT: + lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, 
tx_type, + tx_size, eob); + break; + case IDTX: + av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + default: + lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_4X8; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit(input, txfm_size_row, buf, txfm_size_col); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf); + // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0 + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf); + } else { + transpose_16bit_8x4(buf, buf); + } + col_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_8X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf); + // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0 + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x8(temp, buf); + } else { + transpose_16bit_4x8(buf, buf); + } + col_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_4X16; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int 
txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + const int row_one_loop = 8; + for (int i = 0; i < 2; ++i) { + const int32_t *input_cur = input + i * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit(input_cur, txfm_size_row, buf_cur, + txfm_size_col); + if (row_txfm == iidentity4_ssse3) { + const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 4; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf_cur, buf_cur); + round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); + } + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf_cur, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf_cur); + } else { + transpose_16bit_8x4(buf_cur, buf_cur); + } + } + col_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_16X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int row_one_loop = 8; + load_buffer_32bit_to_16bit_w4(input, txfm_size_row, buf, txfm_size_col); + if (row_txfm == iidentity16_ssse3) { + const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 16; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf, buf); + round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); + } + if (lr_flip) { + __m128i temp[16]; + flip_buf_sse2(buf, temp, 16); + transpose_16bit_4x8(temp, buf); + transpose_16bit_4x8(temp + 8, buf + 8); + } else { + transpose_16bit_4x8(buf, buf); + transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop); + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf + i * row_one_loop, buf + i * row_one_loop); + 
round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]); + } + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4); + lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4); +} + +void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + default: + lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + if (!txfm_param->lossless) { + const TX_TYPE tx_type = txfm_param->tx_type; + av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h new file mode 100644 index 0000000000..1873d01bc0 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ + +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> // SSSE3 + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define btf_16_ssse3(w0, w1, in, out0, out1) \ + do { \ + const __m128i _w0 = _mm_set1_epi16(w0 * 8); \ + const __m128i _w1 = _mm_set1_epi16(w1 * 8); \ + const __m128i _in = in; \ + out0 = _mm_mulhrs_epi16(_in, _w0); \ + out1 = _mm_mulhrs_epi16(_in, _w1); \ + } while (0) + +#define btf_16_adds_subs_sse2(in0, in1) \ + do { \ + const __m128i _in0 = in0; \ + const __m128i _in1 = in1; \ + in0 = _mm_adds_epi16(_in0, _in1); \ + in1 = _mm_subs_epi16(_in0, _in1); \ + } while (0) + +#define btf_16_subs_adds_sse2(in0, in1) \ + do { \ + const __m128i _in0 = in0; \ + const __m128i _in1 = in1; \ + in1 = _mm_subs_epi16(_in0, _in1); \ + in0 = _mm_adds_epi16(_in0, _in1); \ + } while (0) + +#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \ + do { \ + const __m128i _in0 = in0; \ + const __m128i _in1 = in1; \ + out0 = _mm_adds_epi16(_in0, _in1); \ + out1 = _mm_subs_epi16(_in0, _in1); \ + } while (0) + +static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { + if (bit < 0) { + const __m128i scale = _mm_set1_epi16(1 << (15 + bit)); + for (int i = 0; i < size; ++i) { + in[i] = _mm_mulhrs_epi16(in[i], scale); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm_slli_epi16(in[i], bit); + } + } +} + +// 1D itx types +enum { + IDCT_1D, + IADST_1D, + IFLIPADST_1D = IADST_1D, + IIDENTITY_1D, + ITX_TYPES_1D, +} UENUM1BYTE(ITX_TYPE_1D); + +static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { + IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, + IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, + IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, + IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, +}; + +static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { + IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, + IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, + IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, + IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x32_default[32]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 
0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width in log2 for eob (size of 64 map to 32) +static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static int eob_fill[32] = { + 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, +}; + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_col = tx_size_wide[tx_size]; + const int eobx_max = AOMMIN(32, txfm_size_col) - 1; + *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; + const int temp_eoby = eob / (eobx_max + 1); + assert(temp_eoby < 32); + *eoby = eob_fill[temp_eoby]; +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_row = tx_size_high[tx_size]; + const int eoby_max = AOMMIN(32, txfm_size_row) - 1; + *eobx = eob_fill[eob / (eoby_max + 1)]; + *eoby = (eob >= eoby_max) ? 
eoby_max : eob_fill[eob]; +} + +typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output); + +void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob); + +void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_SIZE tx_size); + +void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob); +void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob); + +void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output); + +void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h new file mode 100644 index 0000000000..129721cf05 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE void btf_16_w4_sse2( + const __m128i *const w0, const __m128i *const w1, const __m128i __rounding, + const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1, + __m128i *const out0, __m128i *const out1) { + const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i u0 = _mm_madd_epi16(t0, *w0); + const __m128i v0 = _mm_madd_epi16(t0, *w1); + const __m128i a0 = _mm_add_epi32(u0, __rounding); + const __m128i b0 = _mm_add_epi32(v0, __rounding); + const __m128i c0 = _mm_srai_epi32(a0, cos_bit); + const __m128i d0 = _mm_srai_epi32(b0, cos_bit); + + *out0 = _mm_packs_epi32(c0, c0); + *out1 = _mm_packs_epi32(d0, c0); +} + +#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \ + do { \ + __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ + __m128i u0 = _mm_madd_epi16(t0, w0); \ + __m128i v0 = _mm_madd_epi16(t0, w1); \ + \ + __m128i a0 = _mm_add_epi32(u0, __rounding); \ + __m128i b0 = _mm_add_epi32(v0, __rounding); \ + \ + __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ + __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ + \ + out0 = _mm_packs_epi32(c0, c0); \ + out1 = _mm_packs_epi32(d0, d0); \ + } while (0) + +#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \ + do { \ + __m128i t0 = _mm_unpacklo_epi16(in0, in1); \ + __m128i t1 = _mm_unpackhi_epi16(in0, in1); \ + __m128i u0 = _mm_madd_epi16(t0, w0); \ + __m128i u1 = _mm_madd_epi16(t1, w0); \ + __m128i v0 = _mm_madd_epi16(t0, w1); \ + __m128i v1 = _mm_madd_epi16(t1, w1); \ + \ + __m128i a0 = _mm_add_epi32(u0, 
__rounding); \ + __m128i a1 = _mm_add_epi32(u1, __rounding); \ + __m128i b0 = _mm_add_epi32(v0, __rounding); \ + __m128i b1 = _mm_add_epi32(v1, __rounding); \ + \ + __m128i c0 = _mm_srai_epi32(a0, cos_bit); \ + __m128i c1 = _mm_srai_epi32(a1, cos_bit); \ + __m128i d0 = _mm_srai_epi32(b0, cos_bit); \ + __m128i d1 = _mm_srai_epi32(b1, cos_bit); \ + \ + out0 = _mm_packs_epi32(c0, c1); \ + out1 = _mm_packs_epi32(d0, d1); \ + } while (0) + +static INLINE __m128i load_16bit_to_16bit(const int16_t *a) { + return _mm_load_si128((const __m128i *)a); +} + +static INLINE __m128i load_32bit_to_16bit(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +} + +static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, a_low); +} + +// Store 4 16 bit values. Sign extend the values. +static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + _mm_store_si128((__m128i *)b, a_1); +} + +// Store 8 16 bit values. Sign extend the values. +static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_hi = _mm_unpackhi_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + const __m128i a_2 = _mm_srai_epi32(a_hi, 16); + _mm_store_si128((__m128i *)b, a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +} + +static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) { + const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m128i b = _mm_madd_epi16(a, scale_rounding); + return _mm_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); +} + +static INLINE void store_rect_16bit_to_32bit(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i a_hi = _mm_unpackhi_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); + _mm_store_si128((__m128i *)(b + 4), b_hi); +} + +static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void 
load_buffer_32bit_to_16bit(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w4(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in, + uint16_t *out, + const int stride) { + for (int i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)(out + i * stride), in[i]); + } +} + +static INLINE void round_shift_16bit(__m128i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m128i rounding = _mm_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm_adds_epi16(in[i], rounding); + in[i] = _mm_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm_slli_epi16(in[i], bit); + } + } +} + +static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void 
av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output, + int8_t cos_bit); + +void av1_iadst8_sse2(const __m128i *input, __m128i *output); + +void av1_idct8_sse2(const __m128i *input, __m128i *output); + +typedef struct { + transform_1d_sse2 col, row; // vertical and horizontal +} transform_2d_sse2; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c new file mode 100644 index 0000000000..1894efdc10 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse4.h" + +// This function assumes `arr` is 16-byte aligned. +void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) { + __m128i *const vec = (__m128i *)arr; + const int vec_size = size >> 2; + av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit); +} diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h new file mode 100644 index 0000000000..387dfd6bb3 --- /dev/null +++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ + +#include <smmintrin.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) { + __m128i tmp, round; + round = _mm_set1_epi32(1 << (bit - 1)); + tmp = _mm_add_epi32(vec, round); + return _mm_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_sse4_1(const __m128i *input, + __m128i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_sse4_1(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_sse4_1(const __m128i *input, + __m128i *output, + const int size, + const int bit, + const int val) { + const __m128i sqrt2 = _mm_set1_epi32(val); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = _mm_slli_epi32(input[i], -bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/cdef_block_avx2.c b/third_party/aom/av1/common/x86/cdef_block_avx2.c new file mode 100644 index 0000000000..1ec4b6c332 --- /dev/null +++ b/third_party/aom/av1/common/x86/cdef_block_avx2.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_avx2 +#include "av1/common/cdef_block_simd.h" + +// Mask used to shuffle the elements present in 256bit register. +const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504, + 0x0f0e0100, 0x0b0a0d0c, 0x07060908, + 0x03020504, 0x0f0e0100 }; + +/* partial A is a 16-bit vector of the form: +[x8 - - x1 | x16 - - x9] and partial B has the form: +[0 y1 - y7 | 0 y9 - y15]. +This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... +(x7^2+y2^7)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants +are in const1 and const2. */ +static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala, + __m256i *partialb, + const __m256i *const1, + const __m256i *const2) { + __m256i tmp; + /* Reverse partial B. */ + *partialb = _mm256_shuffle_epi8( + *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit)); + + /* Interleave the x and y values of identical indices and pair x8 with 0. */ + tmp = *partiala; + *partiala = _mm256_unpacklo_epi16(*partiala, *partialb); + *partialb = _mm256_unpackhi_epi16(tmp, *partialb); + + /* Square and add the corresponding x and y values. 
*/ + *partiala = _mm256_madd_epi16(*partiala, *partiala); + *partialb = _mm256_madd_epi16(*partialb, *partialb); + /* Multiply by constant. */ + *partiala = _mm256_mullo_epi32(*partiala, *const1); + *partialb = _mm256_mullo_epi32(*partialb, *const2); + /* Sum all results. */ + *partiala = _mm256_add_epi32(*partiala, *partialb); + return *partiala; +} + +static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2, + __m256i *x3) { + const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1); + const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3); + const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1); + const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3); + + *x0 = _mm256_unpacklo_epi64(t0, t1); + *x1 = _mm256_unpackhi_epi64(t0, t1); + *x2 = _mm256_unpacklo_epi64(t2, t3); + *x3 = _mm256_unpackhi_epi64(t2, t3); + return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1), + _mm256_add_epi32(*x2, *x3)); +} + +/* Computes cost for directions 0, 5, 6 and 7. We can call this function again +to compute the remaining directions. */ +static INLINE __m256i compute_directions_avx2(__m256i *lines, + int32_t cost_frist_8x8[4], + int32_t cost_second_8x8[4]) { + __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; + __m256i partial6; + __m256i tmp; + /* Partial sums for lines 0 and 1. */ + partial4a = _mm256_slli_si256(lines[0], 14); + partial4b = _mm256_srli_si256(lines[0], 2); + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4)); + tmp = _mm256_add_epi16(lines[0], lines[1]); + partial5a = _mm256_slli_si256(tmp, 10); + partial5b = _mm256_srli_si256(tmp, 6); + partial7a = _mm256_slli_si256(tmp, 4); + partial7b = _mm256_srli_si256(tmp, 12); + partial6 = tmp; + + /* Partial sums for lines 2 and 3. */ + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6)); + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8)); + tmp = _mm256_add_epi16(lines[2], lines[3]); + partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8)); + partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8)); + partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6)); + partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10)); + partial6 = _mm256_add_epi16(partial6, tmp); + + /* Partial sums for lines 4 and 5. */ + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10)); + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12)); + tmp = _mm256_add_epi16(lines[4], lines[5]); + partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6)); + partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10)); + partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8)); + partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8)); + partial6 = _mm256_add_epi16(partial6, tmp); + + /* Partial sums for lines 6 and 7. 
*/ + partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2)); + partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14)); + partial4a = _mm256_add_epi16(partial4a, lines[7]); + tmp = _mm256_add_epi16(lines[6], lines[7]); + partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4)); + partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12)); + partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10)); + partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6)); + partial6 = _mm256_add_epi16(partial6, tmp); + + const __m256i const_reg_1 = + _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840); + const __m256i const_reg_2 = + _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168); + const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0); + const __m256i const_reg_4 = + _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140); + + /* Compute costs in terms of partial sums. */ + partial4a = + fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2); + partial7a = + fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4); + partial5a = + fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4); + partial6 = _mm256_madd_epi16(partial6, partial6); + partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105)); + + partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a); + _mm_storeu_si128((__m128i *)cost_frist_8x8, + _mm256_castsi256_si128(partial4a)); + _mm_storeu_si128((__m128i *)cost_second_8x8, + _mm256_extractf128_si256(partial4a, 1)); + + return partial4a; +} + +/* transpose and reverse the order of the lines -- equivalent to a 90-degree +counter-clockwise rotation of the pixels. */ +static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) { + const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); + const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]); + const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]); + const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); + const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); + const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]); + const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]); + const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); + + const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); + const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5); + const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); + const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5); + const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3); + const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); + const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3); + const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); + + res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1); + res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1); + res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3); + res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3); + res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5); + res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5); + res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7); + res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7); +} + +void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + int32_t cost_first_8x8[8]; + int32_t cost_second_8x8[8]; + // Used to store the best cost for 2 8x8's. 
+ int32_t best_cost[2] = { 0 }; + // Best direction for 2 8x8's. + int best_dir[2] = { 0 }; + + const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift); + const __m256i const_128_reg = _mm256_set1_epi16(128); + __m256i lines[8]; + for (int i = 0; i < 8; i++) { + const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]); + const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]); + + lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1); + lines[i] = _mm256_sub_epi16( + _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg); + } + + /* Compute "mostly vertical" directions. */ + const __m256i dir47 = + compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4); + + /* Transpose and reverse the order of the lines. */ + array_reverse_transpose_8x8_avx2(lines, lines); + + /* Compute "mostly horizontal" directions. */ + const __m256i dir03 = + compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8); + + __m256i max = _mm256_max_epi32(dir03, dir47); + max = + _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8), + _mm256_slli_si256(max, 16 - (8)))); + max = + _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4), + _mm256_slli_si256(max, 16 - (4)))); + + const __m128i first_8x8_output = _mm256_castsi256_si128(max); + const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1); + const __m128i cmpeg_res_00 = + _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47)); + const __m128i cmpeg_res_01 = + _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03)); + const __m128i cmpeg_res_10 = + _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1)); + const __m128i cmpeg_res_11 = + _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1)); + const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00); + const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10); + + best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max)); + best_cost[1] = _mm_cvtsi128_si32(second_8x8_output); + best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8)); + best_dir[0] = + get_msb(best_dir[0] ^ (best_dir[0] - 1)); // Count trailing zeros + best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8)); + best_dir[1] = + get_msb(best_dir[1] ^ (best_dir[1] - 1)); // Count trailing zeros + + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7]; + *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7]; + + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. */ + *var_out_1st >>= 10; + *var_out_2nd >>= 10; + *out_dir_1st_8x8 = best_dir[0]; + *out_dir_2nd_8x8 = best_dir[1]; +} + +void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + int j = 0; + int remaining_width = width; + assert(height % 2 == 0); + assert(height > 0); + assert(width > 0); + + // Process multiple 32 pixels at a time. 
+ if (remaining_width > 31) { + int i = 0; + do { + j = 0; + do { + __m128i row00 = + _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]); + __m128i row01 = _mm_loadu_si128( + (const __m128i *)&src[(i + 0) * sstride + (j + 16)]); + __m128i row10 = + _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]); + __m128i row11 = _mm_loadu_si128( + (const __m128i *)&src[(i + 1) * sstride + (j + 16)]); + _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)], + _mm256_cvtepu8_epi16(row00)); + _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)], + _mm256_cvtepu8_epi16(row01)); + _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)], + _mm256_cvtepu8_epi16(row10)); + _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)], + _mm256_cvtepu8_epi16(row11)); + j += 32; + } while (j <= width - 32); + i += 2; + } while (i < height); + remaining_width = width & 31; + } + + // Process 16 pixels at a time. + if (remaining_width > 15) { + int i = 0; + do { + __m128i row0 = + _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]); + __m128i row1 = + _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]); + _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j], + _mm256_cvtepu8_epi16(row0)); + _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j], + _mm256_cvtepu8_epi16(row1)); + i += 2; + } while (i < height); + remaining_width = width & 15; + j += 16; + } + + // Process 8 pixels at a time. + if (remaining_width > 7) { + int i = 0; + do { + __m128i row0 = + _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]); + __m128i row1 = + _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]); + _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j], + _mm_unpacklo_epi8(row0, _mm_setzero_si128())); + _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j], + _mm_unpacklo_epi8(row1, _mm_setzero_si128())); + i += 2; + } while (i < height); + remaining_width = width & 7; + j += 8; + } + + // Process 4 pixels at a time. + if (remaining_width > 3) { + int i = 0; + do { + __m128i row0 = + _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j])); + __m128i row1 = + _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j])); + _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j], + _mm_unpacklo_epi8(row0, _mm_setzero_si128())); + _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j], + _mm_unpacklo_epi8(row1, _mm_setzero_si128())); + i += 2; + } while (i < height); + remaining_width = width & 3; + j += 4; + } + + // Process the remaining pixels. + if (remaining_width) { + for (int i = 0; i < height; i++) { + for (int k = j; k < width; k++) { + dst[i * dstride + k] = src[i * sstride + k]; + } + } + } +} diff --git a/third_party/aom/av1/common/x86/cdef_block_sse2.c b/third_party/aom/av1/common/x86/cdef_block_sse2.c new file mode 100644 index 0000000000..5ab7ffa2ff --- /dev/null +++ b/third_party/aom/av1/common/x86/cdef_block_sse2.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse2 +#include "av1/common/cdef_block_simd.h" + +void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + // Process first 8x8. + *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); + + // Process second 8x8. + *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); +} + +void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + int j = 0; + for (int i = 0; i < height; i++) { + for (j = 0; j < (width & ~0x7); j += 8) { + v64 row = v64_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + } + for (; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} diff --git a/third_party/aom/av1/common/x86/cdef_block_sse4.c b/third_party/aom/av1/common/x86/cdef_block_sse4.c new file mode 100644 index 0000000000..344c1e47c9 --- /dev/null +++ b/third_party/aom/av1/common/x86/cdef_block_sse4.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse4_1 +#include "av1/common/cdef_block_simd.h" + +void cdef_find_dir_dual_sse4_1(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + // Process first 8x8. + *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); + + // Process second 8x8. + *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); +} + +void cdef_copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + int j = 0; + for (int i = 0; i < height; i++) { + for (j = 0; j < (width & ~0x7); j += 8) { + v64 row = v64_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + } + for (; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} diff --git a/third_party/aom/av1/common/x86/cdef_block_ssse3.c b/third_party/aom/av1/common/x86/cdef_block_ssse3.c new file mode 100644 index 0000000000..0fb36eb6e0 --- /dev/null +++ b/third_party/aom/av1/common/x86/cdef_block_ssse3.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_ssse3 +#include "av1/common/cdef_block_simd.h" + +void cdef_find_dir_dual_ssse3(const uint16_t *img1, const uint16_t *img2, + int stride, int32_t *var_out_1st, + int32_t *var_out_2nd, int coeff_shift, + int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { + // Process first 8x8. + *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); + + // Process second 8x8. + *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); +} + +void cdef_copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int width, int height) { + int j; + for (int i = 0; i < height; i++) { + for (j = 0; j < (width & ~0x7); j += 8) { + v64 row = v64_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + } + for (; j < width; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c new file mode 100644 index 0000000000..e1e187c4a6 --- /dev/null +++ b/third_party/aom/av1/common/x86/cfl_avx2.c @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#include "av1/common/x86/cfl_simd.h" + +#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ + TX_SIZE tx_size) { \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + return subfn_##sub[tx_size]; \ + } + +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. 
+ * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. + */ +static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos + const int luma_stride = input_stride << 1; + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); + + __m256i top_16x16 = _mm256_maddubs_epi16(top, twos); + __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos); + __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16); + + _mm256_storeu_si256(row, sum_16x16); + + input += luma_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd) + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_16x16 = _mm256_maddubs_epi16(top, fours); + _mm256_storeu_si256(row, top_16x16); + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd) + +/** + * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only + * performed on block of width 32. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + const __m256i zeros = _mm256_setzero_si256(); + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i row_lo = _mm256_unpacklo_epi8(top, zeros); + row_lo = _mm256_slli_epi16(row_lo, 3); + __m256i row_hi = _mm256_unpackhi_epi8(top, zeros); + row_hi = _mm256_slli_epi16(row_hi, 3); + + _mm256_storeu_si256(row, row_lo); + _mm256_storeu_si256(row + 1, row_hi); + + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. 
+ * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. + */ +static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const int luma_stride = input_stride << 1; + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); + __m256i sum = _mm256_add_epi16(top, bot); + + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride)); + __m256i sum_1 = _mm256_add_epi16(top_1, bot_1); + + __m256i hsum = _mm256_hadd_epi16(sum, sum_1); + hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); + hsum = _mm256_add_epi16(hsum, hsum); + + _mm256_storeu_si256(row, hsum); + + input += luma_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd) + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + */ +static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + __m256i hsum = _mm256_hadd_epi16(top, top_1); + hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); + hsum = _mm256_slli_epi16(hsum, 2); + + _mm256_storeu_si256(row, hsum); + + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd) + +static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3)); + _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3)); + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, + __m256i alpha_sign, __m256i dc_q0) { + __m256i ac_q3 = _mm256_loadu_si256(input); + __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3); + __m256i scaled_luma_q0 = + _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign); + return 
_mm256_add_epi16(scaled_luma_q0, dc_q0); +} + +static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + (void)width; + const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); + const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); + const __m256i dc_q0 = _mm256_set1_epi16(*dst); + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + + do { + __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + res = _mm256_packus_epi16(res, next); + res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i *)dst, res); + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_PREDICT_X(avx2, 32, 8, lbd) +CFL_PREDICT_X(avx2, 32, 16, lbd) +CFL_PREDICT_X(avx2, 32, 32, lbd) + +cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { + static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { + cfl_predict_lbd_4x4_ssse3, /* 4x4 */ + cfl_predict_lbd_8x8_ssse3, /* 8x8 */ + cfl_predict_lbd_16x16_ssse3, /* 16x16 */ + cfl_predict_lbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_lbd_4x8_ssse3, /* 4x8 */ + cfl_predict_lbd_8x4_ssse3, /* 8x4 */ + cfl_predict_lbd_8x16_ssse3, /* 8x16 */ + cfl_predict_lbd_16x8_ssse3, /* 16x8 */ + cfl_predict_lbd_16x32_ssse3, /* 16x32 */ + cfl_predict_lbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_lbd_4x16_ssse3, /* 4x16 */ + cfl_predict_lbd_16x4_ssse3, /* 16x4 */ + cfl_predict_lbd_8x32_ssse3, /* 8x32 */ + cfl_predict_lbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the + // function pointer array out of bounds. 
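+ // Block sizes with a 64-sample dimension map to NULL above since they are
+ // invalid CfL sizes; the modulo only bounds the index, it does not validate
+ // tx_size.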
+ return pred[tx_size % TX_SIZES_ALL]; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static __m256i highbd_max_epi16(int bd) { + const __m256i neg_one = _mm256_set1_epi16(-1); + // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) + return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one); +} + +static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) { + return _mm256_max_epi16(_mm256_min_epi16(u, max), zero); +} + +static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + // Use SSSE3 version for smaller widths + assert(width == 16 || width == 32); + const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); + const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); + const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst); + const __m256i max = highbd_max_epi16(bd); + + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + _mm256_storeu_si256((__m256i *)dst, + highbd_clamp_epi16(res, _mm256_setzero_si256(), max)); + if (width == 32) { + const __m256i res_1 = + predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + _mm256_storeu_si256( + (__m256i *)(dst + 16), + highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max)); + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_PREDICT_X(avx2, 16, 4, hbd) +CFL_PREDICT_X(avx2, 16, 8, hbd) +CFL_PREDICT_X(avx2, 16, 16, hbd) +CFL_PREDICT_X(avx2, 16, 32, hbd) +CFL_PREDICT_X(avx2, 32, 8, hbd) +CFL_PREDICT_X(avx2, 32, 16, hbd) +CFL_PREDICT_X(avx2, 32, 32, hbd) + +cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { + static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { + cfl_predict_hbd_4x4_ssse3, /* 4x4 */ + cfl_predict_hbd_8x8_ssse3, /* 8x8 */ + cfl_predict_hbd_16x16_avx2, /* 16x16 */ + cfl_predict_hbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_hbd_4x8_ssse3, /* 4x8 */ + cfl_predict_hbd_8x4_ssse3, /* 8x4 */ + cfl_predict_hbd_8x16_ssse3, /* 8x16 */ + cfl_predict_hbd_16x8_avx2, /* 16x8 */ + cfl_predict_hbd_16x32_avx2, /* 16x32 */ + cfl_predict_hbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_hbd_4x16_ssse3, /* 4x16 */ + cfl_predict_hbd_16x4_avx2, /* 16x4 */ + cfl_predict_hbd_8x32_ssse3, /* 8x32 */ + cfl_predict_hbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the + // function pointer array out of bounds. + return pred[tx_size % TX_SIZES_ALL]; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Returns a vector where all the (32-bits) elements are the sum of all the +// lanes in a. 
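+// For example, fill_sum_epi32([1, 2, 3, 4, 5, 6, 7, 8]) returns a vector with
+// every 32-bit element equal to 36.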
+static INLINE __m256i fill_sum_epi32(__m256i a) { + // Given that a == [A, B, C, D, E, F, G, H] + a = _mm256_hadd_epi32(a, a); + // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H + // a == [A', C', A', C', E', G', E', G'] + a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)); + // a == [A', C', E', G', A', C', E', G'] + a = _mm256_hadd_epi32(a, a); + // Given that A'' == A' + C' and E'' == E' + G' + // a == [A'', E'', A'', E'', A'', E'', A'', E''] + return _mm256_hadd_epi32(a, a); + // Given that A''' == A'' + E'' + // a == [A''', A''', A''', A''', A''', A''', A''', A'''] +} + +static INLINE __m256i _mm256_addl_epi16(__m256i a) { + return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()), + _mm256_unpackhi_epi16(a, _mm256_setzero_si256())); +} + +static INLINE void subtract_average_avx2(const uint16_t *src_ptr, + int16_t *dst_ptr, int width, + int height, int round_offset, + int num_pel_log2) { + // Use SSE2 version for smaller widths + assert(width == 16 || width == 32); + + const __m256i *src = (__m256i *)src_ptr; + const __m256i *const end = src + height * CFL_BUF_LINE_I256; + // To maximize usage of the AVX2 registers, we sum two rows per loop + // iteration + const int step = 2 * CFL_BUF_LINE_I256; + + __m256i sum = _mm256_setzero_si256(); + // For width 32, we use a second sum accumulator to reduce accumulator + // dependencies in the loop. + __m256i sum2; + if (width == 32) sum2 = _mm256_setzero_si256(); + + do { + // Add top row to the bottom row + __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src), + _mm256_loadu_si256(src + CFL_BUF_LINE_I256)); + sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0)); + if (width == 32) { /* Don't worry, this if it gets optimized out. */ + // Add the second part of the top row to the second part of the bottom row + __m256i l1 = + _mm256_add_epi16(_mm256_loadu_si256(src + 1), + _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256)); + sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1)); + } + src += step; + } while (src < end); + // Combine both sum accumulators + if (width == 32) sum = _mm256_add_epi32(sum, sum2); + + __m256i fill = fill_sum_epi32(sum); + + __m256i avg_epi16 = _mm256_srli_epi32( + _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2); + avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16); + + // Store and subtract loop + src = (__m256i *)src_ptr; + __m256i *dst = (__m256i *)dst_ptr; + do { + _mm256_storeu_si256(dst, + _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16)); + if (width == 32) { + _mm256_storeu_si256( + dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16)); + } + src += CFL_BUF_LINE_I256; + dst += CFL_BUF_LINE_I256; + } while (src < end); +} + +// Declare wrappers for AVX2 sizes +CFL_SUB_AVG_X(avx2, 16, 4, 32, 6) +CFL_SUB_AVG_X(avx2, 16, 8, 64, 7) +CFL_SUB_AVG_X(avx2, 16, 16, 128, 8) +CFL_SUB_AVG_X(avx2, 16, 32, 256, 9) +CFL_SUB_AVG_X(avx2, 32, 8, 128, 8) +CFL_SUB_AVG_X(avx2, 32, 16, 256, 9) +CFL_SUB_AVG_X(avx2, 32, 32, 512, 10) + +// Based on the observation that for small blocks AVX2 does not outperform +// SSE2, we call the SSE2 code for block widths 4 and 8. 
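+// The (round_offset, num_pel_log2) arguments of CFL_SUB_AVG_X above implement
+// a rounded division by the pixel count, e.g. the 16x16 wrapper uses (128, 8)
+// so the average is computed as (sum + 128) >> 8 over 256 pixels.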
+cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { + cfl_subtract_average_4x4_sse2, /* 4x4 */ + cfl_subtract_average_8x8_sse2, /* 8x8 */ + cfl_subtract_average_16x16_avx2, /* 16x16 */ + cfl_subtract_average_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_sse2, /* 4x8 */ + cfl_subtract_average_8x4_sse2, /* 8x4 */ + cfl_subtract_average_8x16_sse2, /* 8x16 */ + cfl_subtract_average_16x8_avx2, /* 16x8 */ + cfl_subtract_average_16x32_avx2, /* 16x32 */ + cfl_subtract_average_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_sse2, /* 4x16 */ + cfl_subtract_average_16x4_avx2, /* 16x4 */ + cfl_subtract_average_8x32_sse2, /* 8x32 */ + cfl_subtract_average_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to + // index the function pointer array out of bounds. + return sub_avg[tx_size % TX_SIZES_ALL]; +} diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h new file mode 100644 index 0000000000..03ae02a922 --- /dev/null +++ b/third_party/aom/av1/common/x86/cfl_simd.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_ +#define AOM_AV1_COMMON_X86_CFL_SIMD_H_ + +#include "av1/common/blockd.h" + +// SSSE3 version is optimal for with == 4, we reuse them in AVX2 +void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 16, we reuse it in AVX2 +void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 4, we reuse them in AVX2 +void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 16, we reuse it in AVX2 +void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 4, we reuse them in AVX2 +void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void 
cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for with == 16, we reuse it in AVX2 +void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for with == 16, we reuse it in AVX2 +void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for with == 16, we reuse it in AVX2 +void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t 
*output_q3); +void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for with == 8, we reuse it in AVX2 +void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for with == 16, we reuse it in AVX2 +void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +#endif // CONFIG_AV1_HIGHBITDEPTH + +// SSE2 version is optimal for with == 4, we reuse them in AVX2 +void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); + +// SSE2 version is optimal for with == 8, we reuse them in AVX2 +void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); + +void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int 
dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_ diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c new file mode 100644 index 0000000000..4783fe098c --- /dev/null +++ b/third_party/aom/av1/common/x86/cfl_sse2.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/cfl.h" +#include "config/av1_rtcd.h" + +static INLINE __m128i fill_sum_epi32(__m128i l0) { + l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2))); + return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1))); +} + +static INLINE void subtract_average_sse2(const uint16_t *src_ptr, + int16_t *dst_ptr, int width, + int height, int round_offset, + int num_pel_log2) { + const __m128i zeros = _mm_setzero_si128(); + const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset); + const __m128i *src = (__m128i *)src_ptr; + const __m128i *const end = src + height * CFL_BUF_LINE_I128; + const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4)); + + __m128i sum = zeros; + do { + __m128i l0; + if (width == 4) { + l0 = _mm_add_epi16(_mm_loadl_epi64(src), + _mm_loadl_epi64(src + CFL_BUF_LINE_I128)); + __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128), + _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128)); + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpacklo_epi16(l1, zeros))); + } else { + if (width == 8) { + l0 = _mm_add_epi16(_mm_loadu_si128(src), + _mm_loadu_si128(src + CFL_BUF_LINE_I128)); + } else { + l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1)); + } + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpackhi_epi16(l0, zeros))); + if (width == 32) { + l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3)); + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpackhi_epi16(l0, zeros))); + } + } + src += step; + } while (src < end); + + sum = fill_sum_epi32(sum); + + __m128i avg_epi16 = + _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2); + avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16); + + src = (__m128i *)src_ptr; + __m128i *dst = (__m128i *)dst_ptr; + do { + if (width == 4) { + 
_mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16)); + } else { + _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16)); + if (width > 8) { + _mm_storeu_si128(dst + 1, + _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16)); + if (width == 32) { + _mm_storeu_si128(dst + 2, + _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16)); + _mm_storeu_si128(dst + 3, + _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16)); + } + } + } + src += CFL_BUF_LINE_I128; + dst += CFL_BUF_LINE_I128; + } while (src < end); +} + +CFL_SUB_AVG_FN(sse2) diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c new file mode 100644 index 0000000000..476b6609a9 --- /dev/null +++ b/third_party/aom/av1/common/x86/cfl_ssse3.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#include "av1/common/x86/cfl_simd.h" + +// Load 32-bit integer from memory into the first element of dst. +static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) { + return _mm_cvtsi32_si128(*((int *)mem_addr)); +} + +// Store 32-bit integer from the first element of a into memory. +static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) { + *((int *)mem_addr) = _mm_cvtsi128_si32(a); +} + +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
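+ *
+ * Implementation note: _mm_maddubs_epi16 with a vector of twos yields
+ * 2 * (left + right) for each horizontal pair of 8-bit samples; adding the
+ * top and bottom rows then gives 2 * (sum of the 2x2 block), i.e. the block
+ * average scaled by 8 (Q3).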
+ */ +static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i twos = _mm_set1_epi8(2); + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + __m128i top = _mm_loadh_epi32((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storeh_epi32(pred_buf_m128i, sum); + } else if (width == 8) { + __m128i top = _mm_loadl_epi64((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storel_epi64(pred_buf_m128i, sum); + } else { + __m128i top = _mm_loadu_si128((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storeu_si128(pred_buf_m128i, sum); + if (width == 32) { + __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + __m128i bot_1 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); + top_1 = _mm_maddubs_epi16(top_1, twos); + bot_1 = _mm_maddubs_epi16(bot_1, twos); + __m128i sum_1 = _mm_add_epi16(top_1, bot_1); + _mm_storeu_si128(pred_buf_m128i + 1, sum_1); + } + } + input += luma_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i fours = _mm_set1_epi8(4); + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + __m128i top = _mm_loadh_epi32((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storeh_epi32(pred_buf_m128i, top); + } else if (width == 8) { + __m128i top = _mm_loadl_epi64((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storel_epi64(pred_buf_m128i, top); + } else { + __m128i top = _mm_loadu_si128((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storeu_si128(pred_buf_m128i, top); + if (width == 32) { + __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + top_1 = _mm_maddubs_epi16(top_1, fours); + _mm_storeu_si128(pred_buf_m128i + 1, top_1); + } + } + input += input_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +/** + * Multiplies the pixels by 8 (scaling in Q3). + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
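+ *
+ * Implementation note: with 4:4:4 there is no subsampling; each 8-bit sample
+ * is widened to 16 bits and shifted left by 3 to produce the Q3 value.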
+ */ +static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i zeros = _mm_setzero_si128(); + const int luma_stride = input_stride; + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + __m128i row = _mm_loadh_epi32((__m128i *)input); + row = _mm_unpacklo_epi8(row, zeros); + _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3)); + } else if (width == 8) { + __m128i row = _mm_loadl_epi64((__m128i *)input); + row = _mm_unpacklo_epi8(row, zeros); + _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3)); + } else { + __m128i row = _mm_loadu_si128((__m128i *)input); + const __m128i row_lo = _mm_unpacklo_epi8(row, zeros); + const __m128i row_hi = _mm_unpackhi_epi8(row, zeros); + _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3)); + _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3)); + if (width == 32) { + __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros); + const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros); + _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3)); + _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3)); + } + } + input += luma_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +#if CONFIG_AV1_HIGHBITDEPTH +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
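+ *
+ * Implementation note: the 16-bit samples are first summed vertically, then
+ * horizontal pairs are combined with _mm_hadd_epi16 and the result is doubled,
+ * so each output is 2 * (sum of the 2x2 block).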
+ */ +static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + sum = _mm_hadd_epi16(sum, sum); + *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum)); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + if (width == 8) { + sum = _mm_hadd_epi16(sum, sum); + _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i bot_1 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); + sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1)); + _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i bot_2 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i bot_3 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3); + const __m128i sum_2 = _mm_add_epi16(top_2, bot_2); + const __m128i sum_3 = _mm_add_epi16(top_3, bot_3); + __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, + _mm_add_epi16(next_sum, next_sum)); + } + } + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
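+ *
+ * Implementation note: horizontal pairs are combined with _mm_hadd_epi16 and
+ * shifted left by 2, so each output is 4 * (left + right), i.e. the pair
+ * average scaled by 8 (Q3).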
+ */ +static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storeh_epi32(pred_buf_m128i, sum); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + if (width == 8) { + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storel_epi64(pred_buf_m128i, sum); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2); + _mm_storeu_si128(pred_buf_m128i, sum); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2); + _mm_storeu_si128(pred_buf_m128i + 1, sum_1); + } + } + } + pred_buf_m128i += CFL_BUF_LINE_I128; + input += input_stride; + } while (pred_buf_m128i < end); +} + +static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3); + _mm_storel_epi64((__m128i *)pred_buf_q3, row); + } else { + const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3); + _mm_storeu_si128((__m128i *)pred_buf_q3, row); + if (width >= 16) { + __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); + row_1 = _mm_slli_epi16(row_1, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1); + if (width == 32) { + __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2); + row_2 = _mm_slli_epi16(row_2, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2); + __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3); + row_3 = _mm_slli_epi16(row_3, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3); + } + } + } + input += input_stride; + pred_buf_q3 += CFL_BUF_LINE; + } while (pred_buf_q3 < end); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +CFL_GET_SUBSAMPLE_FUNCTION(ssse3) + +static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + __m128i ac_q3 = _mm_loadu_si128(input); + __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + if (width < 16) { + res = _mm_packus_epi16(res, res); + if (width == 4) + _mm_storeh_epi32((__m128i *)dst, res); + else + _mm_storel_epi64((__m128i *)dst, res); + } else { + __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, 
dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i *)dst, res); + if (width == 32) { + res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i *)(dst + 16), res); + } + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + +CFL_PREDICT_FN(ssse3, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE __m128i highbd_max_epi16(int bd) { + const __m128i neg_one = _mm_set1_epi16(-1); + // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) + return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one); +} + +static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { + return _mm_max_epi16(_mm_min_epi16(u, max), zero); +} + +static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + const __m128i max = highbd_max_epi16(bd); + const __m128i zeros = _mm_setzero_si128(); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + res = highbd_clamp_epi16(res, zeros, max); + if (width == 4) { + _mm_storel_epi64((__m128i *)dst, res); + } else { + _mm_storeu_si128((__m128i *)dst, res); + } + if (width >= 16) { + const __m128i res_1 = + predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128(((__m128i *)dst) + 1, + highbd_clamp_epi16(res_1, zeros, max)); + } + if (width == 32) { + const __m128i res_2 = + predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128((__m128i *)(dst + 16), + highbd_clamp_epi16(res_2, zeros, max)); + const __m128i res_3 = + predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128((__m128i *)(dst + 24), + highbd_clamp_epi16(res_3, zeros, max)); + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + +CFL_PREDICT_FN(ssse3, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c new file mode 100644 index 0000000000..1b39a0a8d5 --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/av1_rtcd.h" + +#include "third_party/SVT-AV1/convolve_2d_avx2.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" + +#include "av1/common/convolve.h" + +void av1_convolve_2d_sr_general_avx2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + if (filter_params_x->taps > 8) { + const int bd = 8; + int im_stride = 8, i; + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + + const __m256i round_const_h12 = _mm256_set1_epi32( + ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i sum_round_v = _mm256_set1_epi32( + (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift_v = _mm_cvtsi32_si128(bits); + + __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 }; + + int horiz_tap = 12; + int vert_tap = 12; + + prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h); + prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v); + + int im_h = h + vert_tap - 1; + const int fo_vert = vert_tap / 2 - 1; + const int fo_horiz = horiz_tap / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + for (int j = 0; j < w; j += 8) { + CONVOLVE_SR_HORIZONTAL_FILTER_12TAP + CONVOLVE_SR_VERTICAL_FILTER_12TAP + } + } else { + const int bd = 8; + int im_stride = 8, i; + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + + const __m256i round_const_h = + _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) + + (1 << (bd + FILTER_BITS - 2))); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + const __m256i sum_round_v = _mm256_set1_epi32( + (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift_v = _mm_cvtsi32_si128(bits); + + __m256i filt[4], coeffs_h[4], coeffs_v[4]; + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); + + int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); + int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); + + if (horiz_tap == 6) + prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + else + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + + if (vert_tap == 6) + prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v); + else + 
prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); + + int im_h = h + vert_tap - 1; + const int fo_vert = vert_tap / 2 - 1; + const int fo_horiz = horiz_tap / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + for (int j = 0; j < w; j += 8) { + if (horiz_tap == 4) { + CONVOLVE_SR_HORIZONTAL_FILTER_4TAP + } else if (horiz_tap == 6) { + CONVOLVE_SR_HORIZONTAL_FILTER_6TAP + } else { + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP + } + + if (vert_tap == 4) { + CONVOLVE_SR_VERTICAL_FILTER_4TAP + } else if (vert_tap == 6) { + CONVOLVE_SR_VERTICAL_FILTER_6TAP + } else { + CONVOLVE_SR_VERTICAL_FILTER_8TAP + } + } + } +} + +void av1_convolve_2d_sr_avx2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int32_t w, int32_t h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int32_t subpel_x_q4, + const int32_t subpel_y_q4, ConvolveParams *conv_params) { + const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_q4); + const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_q4); + + const bool use_general = (tap_x == 12 || tap_y == 12); + if (use_general) { + av1_convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_q4, subpel_y_q4, conv_params); + } else { + av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_q4, subpel_y_q4, conv_params); + } +} diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c new file mode 100644 index 0000000000..1b85f37294 --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "av1/common/convolve.h" + +void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + __m128i coeffs[6]; + + /* Horizontal filter */ + { + prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs); + + const __m128i round_const = _mm_set1_epi32( + (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data_2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 4)]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i src_4 = _mm_unpacklo_epi8(data_2, zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i src_6 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 2), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + const __m128i src_8 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 4), zero); + const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]); + const __m128i src_10 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 6), zero); + const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]); + + const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]); + const __m128i src_5 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 1), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]); + const __m128i src_7 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 3), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]); + const __m128i src_9 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 5), zero); + const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]); + const __m128i src_11 = + _mm_unpacklo_epi8(_mm_srli_si128(data_2, 7), zero); + const __m128i 
res_11 = _mm_madd_epi16(src_11, coeffs[5]); + + const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs); + + const __m128i sum_round = + _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + const __m128i src_8 = + _mm_unpacklo_epi16(*(__m128i *)(data + 8 * im_stride), + *(__m128i *)(data + 9 * im_stride)); + const __m128i src_10 = + _mm_unpacklo_epi16(*(__m128i *)(data + 10 * im_stride), + *(__m128i *)(data + 11 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]); + const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]); + + const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + const __m128i src_9 = + _mm_unpackhi_epi16(*(__m128i *)(data + 8 * im_stride), + *(__m128i *)(data + 9 * im_stride)); + const __m128i src_11 = + _mm_unpackhi_epi16(*(__m128i *)(data + 10 * im_stride), + *(__m128i *)(data + 11 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]); + const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]); + const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]); + const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]); + + const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, 
res_5), + _mm_add_epi32(res_3, res_7)); + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift); + + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + + _mm_storel_epi64(p, res); + } + } + } +} + +void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + if (filter_params_x->taps > 8) { + if (w < 8) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + } else { + av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params); + } + } else { + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = 
_mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = + _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i sum_round = _mm_set1_epi32( + (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const 
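The "Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7" store above is what lets the vertical stage work on whole im_block rows: even and odd output columns are filtered in separate registers, and the 32-bit interleave a few lines further down restores the natural column order. A standalone sketch of that reordering step (illustrative values, not part of the patch):

#include <emmintrin.h>
#include <stdio.h>

// If one register holds results for columns 0,2,4,6 and another for columns
// 1,3,5,7 (32-bit lanes), interleaving them restores the order 0..7.
int main(void) {
  const __m128i even = _mm_setr_epi32(100, 102, 104, 106);  // cols 0,2,4,6
  const __m128i odd = _mm_setr_epi32(101, 103, 105, 107);   // cols 1,3,5,7
  int out[8];
  _mm_storeu_si128((__m128i *)&out[0], _mm_unpacklo_epi32(even, odd));
  _mm_storeu_si128((__m128i *)&out[4], _mm_unpackhi_epi32(even, odd));
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);  // prints 100 .. 107
  printf("\n");
  return 0;
}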
__m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift); + + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + + if (w == 2) { + *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res); + } else if (w == 4) { + *(int *)p = _mm_cvtsi128_si32(res); + } else { + _mm_storel_epi64(p, res); + } + } + } + } + } +} + +void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, + int h, ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const __m128i zero = _mm_setzero_si128(); + const __m128i left_shift = _mm_cvtsi32_si128(bits); + int i, j; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + assert((w % 4) == 0); + + if (!(w % 16)) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]); + + const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero); + const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero); + + const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift); + const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const); + + const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift); + const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const); + + if (do_average) { + const __m128i data_ref_0_lo = 
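The compound (do_average) path below leans on the shared comp_avg and convolve_rounding helpers. As a reading aid only, here is a scalar sketch of what they compute per pixel, assuming they match libaom's usual distance-weighted average with DIST_PRECISION_BITS == 4 (so fwd_offset + bck_offset == 16); names and the exact helper semantics are assumptions, not part of the patch.

#include <stdint.h>

static uint8_t dist_wtd_blend_sketch(int32_t ref, int32_t res_unsigned, int w0,
                                     int w1, int use_dist_wtd_comp_avg,
                                     int offset, int rounding_shift) {
  int32_t avg;
  if (use_dist_wtd_comp_avg) {
    avg = (ref * w0 + res_unsigned * w1) >> 4;  // assumes w0 + w1 == 16
  } else {
    avg = (ref + res_unsigned) >> 1;
  }
  // Undo the unsigned offset added when the first prediction was stored, then
  // round down to the 8-bit output range (mirrors convolve_rounding).
  const int32_t rounded =
      (avg - offset + ((1 << rounding_shift) >> 1)) >> rounding_shift;
  return (uint8_t)(rounded < 0 ? 0 : (rounded > 255 ? 255 : rounded));
}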
_mm_loadu_si128((__m128i *)(&dst[j])); + const __m128i data_ref_0_hi = + _mm_loadu_si128((__m128i *)(&dst[j + 8])); + + const __m128i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + + const __m128i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result_hi = convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = + _mm_packus_epi16(round_result_lo, round_result_hi); + + _mm_store_si128((__m128i *)(&dst0[j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo); + _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi); + } + } + src += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]); + const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); + + const __m128i res = _mm_sll_epi16(d16_0, left_shift); + const __m128i res_unsigned = _mm_add_epi16(res, offset_const); + + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[j]), res_8); + else + *(int *)(&dst0[j]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[j]), res_unsigned); + } + } + src += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } + } +} diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c new file mode 100644 index 0000000000..3862bbeac1 --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_avx2.c @@ -0,0 +1,916 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/av1_rtcd.h" + +#include "third_party/SVT-AV1/convolve_avx2.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/synonyms.h" + +static AOM_INLINE void av1_convolve_y_sr_general_avx2( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { + // right shift is F-1 because we are already dividing + // filter co-efficients by 2 + const int right_shift_bits = (FILTER_BITS - 1); + __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); + __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1); + + __m256i coeffs[6], s[12]; + __m128i d[10]; + + int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); + + if (vert_tap == 6) + prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs); + else if (vert_tap == 12) { + prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs); + } else { + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); + } + + // vert_filt as 4 tap + if (vert_tap == 4) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (int j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + + // Load lines a and b. Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + + s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20); + + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + 
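The "Line a to lower 128, line b to upper 128" loads above are the core of the two-output-rows-per-iteration scheme: each __m256i carries two consecutive source rows, so one set of multiplies produces two destination rows. A minimal standalone illustration of just the combining step (not part of the patch):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t row_a[16], row_b[16];
  for (int i = 0; i < 16; ++i) {
    row_a[i] = (uint8_t)i;
    row_b[i] = (uint8_t)(100 + i);
  }
  const __m128i a = _mm_loadu_si128((const __m128i *)row_a);
  const __m128i b = _mm_loadu_si128((const __m128i *)row_b);
  // Control 0x20 selects the low 128 bits of the first operand for the low
  // lane and the low 128 bits of the second operand for the high lane.
  const __m256i ab = _mm256_permute2x128_si256(_mm256_castsi128_si256(a),
                                               _mm256_castsi128_si256(b), 0x20);
  uint8_t out[32];
  _mm256_storeu_si256((__m256i *)out, ab);
  for (int i = 0; i < 32; ++i) printf("%d ", out[i]);  // 0..15 then 100..115
  printf("\n");
  return 0;
}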
__m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = s[5]; + } + } + } else if (vert_tap == 6) { + const int fo_vert = vert_tap / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + + for (int j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + // Load lines a and b. Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))); + const __m256i src_34a = + _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + + s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_45a = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + const __m256i res_lo = convolve_lowbd_6tap(s, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, 
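The s[0] = s[1]; s[1] = s[2]; ... assignments at the end of each iteration above slide the vertical filter window down so only the newest rows are reloaded. For reference, the same idea in scalar form for a single column of a y-only 8-tap filter; note that the lowbd AVX2 paths above use taps pre-divided by 2 and therefore shift by FILTER_BITS - 1 instead of FILTER_BITS. This helper is an illustrative sketch, not the patch's data layout.

#include <stdint.h>

static void vert_filter_column_sketch(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride, int h,
                                      const int16_t f[8]) {
  int32_t win[8];
  for (int r = 0; r < 7; ++r) win[r] = src[r * src_stride];
  for (int y = 0; y < h; ++y) {
    win[7] = src[(y + 7) * src_stride];  // only one new sample per output row
    int32_t sum = 0;
    for (int k = 0; k < 8; ++k) sum += f[k] * win[k];
    const int32_t px = (sum + 64) >> 7;  // FILTER_BITS == 7 rounding
    dst[y * dst_stride] = (uint8_t)(px < 0 ? 0 : (px > 255 ? 255 : px));
    for (int k = 0; k < 7; ++k) win[k] = win[k + 1];  // slide the window down
  }
}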
right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[3] = s[4]; + s[4] = s[5]; + } + } + } else if (vert_tap == 12) { // vert_tap == 12 + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + const __m256i v_zero = _mm256_setzero_si256(); + right_shift = _mm_cvtsi32_si128(FILTER_BITS); + right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1); + + for (int j = 0; j < w; j += 8) { + const uint8_t *data = &src_ptr[j]; + __m256i src10; + + d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); + d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)); + d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)); + d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)); + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20); + + const __m256i src_67a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20); + + const __m256i src_78a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20); + + const __m256i src_89a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20); + + src10 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(data + 10 * src_stride))); + const __m256i src_910a = + _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20); + + const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero); + const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, v_zero); + const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero); + const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero); + const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero); + const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero); + const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero); + const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero); + const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero); + const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero); + + s[0] = _mm256_unpacklo_epi16(src_01, src_12); + s[1] = _mm256_unpacklo_epi16(src_23, src_34); + s[2] = _mm256_unpacklo_epi16(src_45, src_56); + s[3] = _mm256_unpacklo_epi16(src_67, src_78); + s[4] = _mm256_unpacklo_epi16(src_89, src_910); + + s[6] = _mm256_unpackhi_epi16(src_01, src_12); + s[7] = _mm256_unpackhi_epi16(src_23, src_34); + s[8] = _mm256_unpackhi_epi16(src_45, src_56); + s[9] = _mm256_unpackhi_epi16(src_67, src_78); + s[10] = _mm256_unpackhi_epi16(src_89, src_910); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_1011a = _mm256_permute2x128_si256( + src10, + _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), + 0x20); + + src10 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(data + 12 * src_stride))); + + const __m256i src_1112a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), + src10, 0x20); + + const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero); + const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero); + + s[5] = _mm256_unpacklo_epi16(src_1011, src_1112); + s[11] = _mm256_unpackhi_epi16(src_1011, src_1112); + + const __m256i res_lo = convolve_12taps(s, coeffs); + + const __m256i res_32b_lo = _mm256_sra_epi32( + _mm256_add_epi32(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w 
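Unlike the shorter filters, the 12-tap vertical path above first widens the 8-bit rows to 16 bits (unpacklo_epi8 with zero) and then interleaves consecutive rows (unpacklo_epi16), so each madd applies one pair of vertical taps with 32-bit accumulation. A standalone illustration of that pairing, shown with 128-bit registers for brevity (arbitrary values, not part of the patch):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  const uint8_t row0[4] = { 10, 20, 30, 40 };
  const uint8_t row1[4] = { 50, 60, 70, 80 };
  const int16_t f0 = -3, f1 = 19;  // one vertical tap pair, arbitrary values

  int32_t tmp0, tmp1;
  memcpy(&tmp0, row0, 4);
  memcpy(&tmp1, row1, 4);

  const __m128i zero = _mm_setzero_si128();
  const __m128i r0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(tmp0), zero);
  const __m128i r1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(tmp1), zero);
  const __m128i pairs = _mm_unpacklo_epi16(r0, r1);  // r0[x], r1[x], r0[x+1], ...
  const __m128i taps = _mm_set1_epi32(((int)(uint16_t)f1 << 16) | (uint16_t)f0);
  const __m128i sums = _mm_madd_epi16(pairs, taps);  // f0*r0[x] + f1*r1[x]

  int32_t out[4];
  _mm_storeu_si128((__m128i *)out, sums);
  for (int x = 0; x < 4; ++x) printf("%d ", out[x]);
  printf("\n");
  return 0;
}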
- j > 4) { + const __m256i res_hi = convolve_12taps(s + 6, coeffs); + + const __m256i res_32b_hi = _mm256_sra_epi32( + _mm256_add_epi32(res_hi, right_shift_const), right_shift); + __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_extracti128_si256(res_a, 0); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 2) { + *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); + *(int *)&dst[i * dst_stride + j + dst_stride] = + _mm_cvtsi128_si32(res_1); + } else { + *(uint16_t *)&dst[i * dst_stride + j] = + (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)&dst[i * dst_stride + j + dst_stride] = + (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + + s[6] = s[7]; + s[7] = s[8]; + s[8] = s[9]; + s[9] = s[10]; + s[10] = s[11]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + + for (int j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + const __m256i src_56a = + _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + + s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); + s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_67a = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + const __m256i res_lo = convolve_lowbd(s, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_convolve_y_sr_avx2(const uint8_t 
*src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t w, + int32_t h, + const InterpFilterParams *filter_params_y, + const int32_t subpel_y_q4) { + const int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4); + + if (vert_tap == 12) { + av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_q4); + } else { + av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_q4); + } +} + +static AOM_INLINE void av1_convolve_x_sr_general_avx2( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params) { + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_shift = _mm_cvtsi32_si128(bits); + __m256i round_0_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); + int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + assert(conv_params->round_0 > 0); + + __m256i coeffs[6], filt[4]; + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + if (horiz_tap == 6) + prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs); + else if (horiz_tap == 12) { + prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs); + } else { + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + } + + // horz_filt as 4 tap + if (horiz_tap == 4) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i 
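For the horizontal-only path, the two shifts set up above split the total FILTER_BITS of scaling into round_0 (first stage) and bits = FILTER_BITS - round_0 (second stage); the lowbd AVX2 kernels shift by round_0 - 1 in the first stage because their taps are pre-divided by 2. A scalar sketch of the full-precision version of that rounding, for one pixel (illustrative, not part of the patch):

#include <stdint.h>

static uint8_t convolve_x_one_pixel_sketch(const uint8_t *src,
                                           const int16_t f[8], int round_0) {
  enum { kFilterBits = 7 };
  const int bits = kFilterBits - round_0;
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += f[k] * src[k];
  sum = (sum + ((1 << round_0) >> 1)) >> round_0;  // first rounded shift
  sum = (sum + ((1 << bits) >> 1)) >> bits;        // second rounded shift
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}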
res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } else if (horiz_tap == 6) { + const int fo_horiz = horiz_tap / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } else if (horiz_tap == 12) { // horiz_tap == 12 + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + const __m256i v_zero = _mm256_setzero_si256(); + round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1); + round_const = _mm256_set1_epi32((1 << bits) >> 1); + round_0_shift = 
_mm_cvtsi32_si128(conv_params->round_0); + __m256i s[6]; + + if (w <= 4) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + // row0 0..7 row1 0..7 + const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); + // row0 8..F row1 8..F + const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); + + // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03 + const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); + // row0 04 04 .. 07 07 row1 04 04 .. 07 07 + const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); + + // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B + const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); + // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F + const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); + + // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14 + s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); + // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 + s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); + // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18 + s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); + // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A + s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); + // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C + s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); + // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E + s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); + + const __m256i res_lo = convolve_12taps(s, coeffs); + + __m256i res_32b_lo = _mm256_sra_epi32( + _mm256_add_epi32(res_lo, round_0_const), round_0_shift); + + // 00 01 02 03 10 12 13 14 + res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const), + round_shift); + // 8 bit conversion and saturation to uint8 + // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13 + __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); + // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 + // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 + const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); + // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w > 2) { + // 00 01 02 03 + *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0); + // 10 11 12 13 + *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1); + } else { + // 00 01 + *(uint16_t *)&dst[i * dst_stride] = + (uint16_t)_mm_cvtsi128_si32(res_0); + // 10 11 + *(uint16_t *)&dst[i * dst_stride + dst_stride] = + (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; i++) { + for (int j = 0; j < w; j += 8) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + j + 4]))), + 0x20); + // row0 0..7 4..B + const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); + // row0 8..F C..13 + const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); + + // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07 + const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); + // row0 04 04 .. 07 07 08 08 .. 
0B 0B + const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); + + // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F + const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); + // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 13 13 + const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); + + s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); + s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); + s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); + s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); + s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); + s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); + + const __m256i res_lo = convolve_12taps(s, coeffs); + + __m256i res_32b_lo = _mm256_sra_epi32( + _mm256_add_epi32(res_lo, round_0_const), round_0_shift); + + res_32b_lo = _mm256_sra_epi32( + _mm256_add_epi32(res_32b_lo, round_const), round_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); + *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1); + } + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b 
= _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } +} + +void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t w, + int32_t h, + const InterpFilterParams *filter_params_x, + const int32_t subpel_x_q4, + ConvolveParams *conv_params) { + const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4); + + if (horz_tap == 12) { + av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_q4, conv_params); + } else { + av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_q4, + conv_params); + } +} diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c new file mode 100644 index 0000000000..012e75c1ae --- /dev/null +++ b/third_party/aom/av1/common/x86/convolve_sse2.c @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "av1/common/convolve.h" + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1 + coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]); + const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3)); + return d; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i 
*const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + int subpel_y_qn) { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); + __m128i coeffs[6]; + + prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs); + + int j = 0; + do { + __m128i s[12], src10, res_lo, res_hi; + __m128i res_lo_round, res_hi_round, res16, res; + const uint8_t *data = &src_ptr[j]; + + src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride)); + s[0] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 6 * src_stride))); + s[6] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + s[7] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 8 * src_stride))); + s[8] = + _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 9 * src_stride))); + s[9] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[10] = _mm_unpacklo_epi8( + src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))); + src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride)); + s[11] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10); + + res_lo = convolve_lo_y_12tap(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y_12tap(s, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + res_lo = convolve_lo_y_12tap(s + 1, 
coeffs); // Filter low index pixels + res_hi = convolve_hi_y_12tap(s + 1, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + s[6] = s[8]; + s[7] = s[9]; + s[8] = s[10]; + s[9] = s[11]; + } while (i < h); + j += 8; + } while (j < w); +} + +void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn) { + if (filter_params_y->taps > 8) { + if (w < 8) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); + } else { + av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn); + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); + __m128i coeffs[4]; + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); + + if (w <= 4) { + __m128i s[8], src6, res, res_round, res16; + int res_int; + src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = (uint16_t)res_int; + else + *(int *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + res = convolve_lo_y(s + 1, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = (uint16_t)res_int; + else + *(int *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + h -= 2; + } while (h); + } else { + assert(!(w % 8)); + int j = 0; + do { + __m128i s[8], 
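In the SSE2 vertical loops in this file, each s[k] holds rows k and k+1 interleaved byte-wise, so the call with s filters one output row from row pairs (0,1)(2,3)(4,5)(6,7) while the call with s + 1 filters the next row from pairs (1,2)(3,4)(5,6)(7,8); the window then advances by two rows and only two new rows are loaded per iteration. The scalar view below is a reading aid only (names are illustrative):

#include <stdint.h>

static void two_rows_per_iteration_sketch(const uint8_t *const r[9],
                                          const int16_t f[8], int x,
                                          int32_t *out0, int32_t *out1) {
  int32_t a = 0, b = 0;
  for (int k = 0; k < 8; ++k) {
    a += f[k] * r[k][x];      // output row y
    b += f[k] * r[k + 1][x];  // output row y + 1, same taps shifted one row down
  }
  *out0 = a;
  *out1 = b;
}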
src6, res_lo, res_hi; + __m128i res_lo_round, res_hi_round, res16, res; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } + } +} + +void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + int subpel_x_qn, + ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_0_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + const __m128i zero = _mm_setzero_si128(); + __m128i coeffs[6]; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs); + + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi16(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi16(_mm_srli_si128(data, 2), 
_mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi16(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi16(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + + const __m128i res32 = convolve_lo_x_12tap(s, coeffs, zero); + + __m128i res32_round = + _mm_sra_epi32(_mm_add_epi32(res32, round_0_const), round_0_shift); + res32_round = + _mm_sra_epi32(_mm_add_epi32(res32_round, round_const), round_shift); + + const __m128i res16 = _mm_packs_epi32(res32_round, zero); + const __m128i res = _mm_packus_epi16(res16, zero); + + const int val = _mm_cvtsi128_si32(res); + memcpy((dst + i * dst_stride + j), &val, sizeof(val)); + j += 4; + } while (j < w); + } while (++i < h); +} + +void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + if (filter_params_x->taps > 8) { + if (w < 4) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); + } else { + av1_convolve_x_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params); + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_0_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + __m128i coeffs[4]; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); + + if (w <= 4) { + do { + const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + const __m128i res_lo = convolve_lo_x(s, coeffs); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + int r = _mm_cvtsi128_si32(res); + if (w == 2) + *(uint16_t *)dst = (uint16_t)r; + else + *(int *)dst = r; + + src_ptr += src_stride; + dst += dst_stride; + } while (--h); + } else { + assert(!(w % 8)); + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + // Filter even-index pixels + s[0] = data; + s[1] = _mm_srli_si128(data, 2); + s[2] = _mm_srli_si128(data, 4); + s[3] = _mm_srli_si128(data, 6); + const __m128i res_even = convolve_lo_x(s, coeffs); + + // Filter odd-index pixels + s[0] = _mm_srli_si128(data, 1); + s[1] = _mm_srli_si128(data, 3); + s[2] = _mm_srli_si128(data, 5); + s[3] = _mm_srli_si128(data, 7); + const __m128i res_odd = convolve_lo_x(s, coeffs); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_lo_round = _mm_sra_epi32( + _mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + __m128i res_hi_round = _mm_sra_epi32( + _mm_add_epi32(res_hi, round_0_const), round_0_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + j += 8; + } while (j < w); + } while (++i < h); + } + } +} diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c new file mode 100644 index 0000000000..d05bb0e15f --- /dev/null +++ b/third_party/aom/av1/common/x86/filterintra_sse4.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/enums.h" +#include "av1/common/reconintra.h" + +//------------------------------------------------------------------------------ +// filter_intra_predictor_sse4_1 + +// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which +// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes. +#define DUPLICATE_FIRST_HALF 0x44 + +// Apply all filter taps to the given 7 packed 16-bit values, keeping the 8th +// at zero to preserve the sum. +static INLINE void filter_4x2_sse4_1(uint8_t *dst, const ptrdiff_t stride, + const __m128i *pixels, + const __m128i *taps_0_1, + const __m128i *taps_2_3, + const __m128i *taps_4_5, + const __m128i *taps_6_7) { + const __m128i mul_0_01 = _mm_maddubs_epi16(*pixels, *taps_0_1); + const __m128i mul_0_23 = _mm_maddubs_epi16(*pixels, *taps_2_3); + // |output_half| contains 8 partial sums. + __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23); + __m128i output = _mm_hadd_epi16(output_half, output_half); + const __m128i output_row0 = + _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4), + /* arbitrary pack arg */ output); + xx_storel_32(dst, output_row0); + const __m128i mul_1_01 = _mm_maddubs_epi16(*pixels, *taps_4_5); + const __m128i mul_1_23 = _mm_maddubs_epi16(*pixels, *taps_6_7); + output_half = _mm_hadd_epi16(mul_1_01, mul_1_23); + output = _mm_hadd_epi16(output_half, output_half); + const __m128i output_row1 = + _mm_packus_epi16(xx_roundn_epi16_unsigned(output, 4), + /* arbitrary pack arg */ output); + xx_storel_32(dst + stride, output_row1); +} + +// 4xH transform sizes are given special treatment because xx_loadl_64 goes out +// of bounds and every block involves the left column. 
This implementation +// loads TL from the top row for the first block, so it is not +static INLINE void filter_4xh(uint8_t *dest, ptrdiff_t stride, + const uint8_t *const top_ptr, + const uint8_t *const left_ptr, int mode, + const int height) { + const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]); + const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]); + const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]); + const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]); + __m128i top = xx_loadl_32(top_ptr - 1); + __m128i pixels = _mm_insert_epi8(top, (int8_t)top_ptr[3], 4); + __m128i left = (height == 4 ? xx_loadl_32(left_ptr) : xx_loadl_64(left_ptr)); + left = _mm_slli_si128(left, 5); + + // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1], + // left[2], left[3], left[4], left[5], left[6], left[7] + pixels = _mm_or_si128(left, pixels); + + // Duplicate first 8 bytes. + pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 1. + pixels = xx_loadl_32(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1], + // left[0], left[1], ... + pixels = _mm_or_si128(left, pixels); + + // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last + // byte is an unused value, which shall be multiplied by 0 when we apply the + // filter. + const int64_t kInsertTopLeftFirstMask = 0x0F08070302010006; + + // Insert left[-1] in front as TL and put left[0] and left[1] at the end. + const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 2. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 3. + + // Compute the middle 8 rows before using common code for the final 4 rows. + // Because the common code below this block assumes that + if (height == 16) { + // This shift allows us to use pixel_order2 twice after shifting by 2 later. + left = _mm_slli_si128(left, 1); + pixels = xx_loadl_32(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4], + // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3] + pixels = _mm_or_si128(left, pixels); + + // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The + // last byte is an unused value, as above. The top-left was shifted to + // position nine to keep two empty spaces after the top pixels. + const int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009; + + // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at + // the end. + const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + dest += stride; // Move to y = 4. + + // First 4x2 in the if body. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + + // Clear all but final pixel in the first 8 of left column. + __m128i keep_top_left = _mm_srli_si128(left, 13); + dest += stride; // Move to y = 5. 
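    // At this point the low half of |left| still holds left[0..7] at byte
    // positions 6..13, so the shift by 13 above leaves only left[7] in
    // |keep_top_left|. Once left[8..15] have been loaded further down, that
    // pixel is repositioned (shifted left by 6) so it can act as the relative
    // top-left neighbour for rows 8..11.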
+ pixels = xx_loadl_32(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-6], + // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1] + pixels = _mm_or_si128(left, pixels); + left = xx_loadl_64(left_ptr + 8); + + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + dest += stride; // Move to y = 6. + + // Second 4x2 in the if body. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + + // Position TL value so we can use pixel_order1. + keep_top_left = _mm_slli_si128(keep_top_left, 6); + dest += stride; // Move to y = 7. + pixels = xx_loadl_32(dest); + left = _mm_slli_si128(left, 7); + left = _mm_or_si128(left, keep_top_left); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, + // left[-1], left[0], left[1], left[2], left[3], ... + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 8. + + // Third 4x2 in the if body. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 9. + + // Prepare final inputs. + pixels = xx_loadl_32(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] + // left[-1], left[0], left[1], left[2], left[3], ... + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 10. + + // Fourth 4x2 in the if body. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 11. + } + + // In both the 8 and 16 case, we assume that the left vector has the next TL + // at position 8. + if (height > 4) { + // Erase prior left pixels by shifting TL to position 0. + left = _mm_srli_si128(left, 8); + left = _mm_slli_si128(left, 6); + pixels = xx_loadl_32(dest); + + // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, + // left[-1], left[0], left[1], left[2], left[3], ... + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 12 or 4. + + // First of final two 4x2 blocks. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + dest += stride; // Move to y = 13 or 5. + pixels = xx_loadl_32(dest); + left = _mm_srli_si128(left, 2); + + // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2] + // left[-1], left[0], left[1], left[2], left[3], ... + pixels = _mm_or_si128(left, pixels); + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + dest += stride; // Move to y = 14 or 6. + + // Last of final two 4x2 blocks. + filter_4x2_sse4_1(dest, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + } +} + +static INLINE void filter_intra_predictor_sse4_1(void *const dest, + ptrdiff_t stride, + const void *const top_row, + const void *const left_column, + int mode, const int width, + const int height) { + const uint8_t *const top_ptr = (const uint8_t *)top_row; + const uint8_t *const left_ptr = (const uint8_t *)left_column; + uint8_t *dst = (uint8_t *)dest; + if (width == 4) { + filter_4xh(dst, stride, top_ptr, left_ptr, mode, height); + return; + } + + // There is one set of 7 taps for each of the 4x2 output pixels. 
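  // av1_filter_intra_taps[mode] holds one row of 8 signed 8-bit taps per
  // output pixel of a 4x2 patch; only 7 taps are used (the top-left, the four
  // pixels above, and the two to the left), the 8th is zero padding. Each
  // 128-bit load below therefore covers the taps for two output pixels, and
  // the vector code computes, per output pixel k, roughly
  //   out[k] = clip_pixel((sum_i taps[mode][k][i] * p[i] + 8) >> 4)
  // where p[] = { top_left, top[0..3], left[0], left[1] } and the shift by 4
  // matches xx_roundn_epi16_unsigned(output, 4) in filter_4x2_sse4_1 above.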
+ const __m128i taps_0_1 = xx_load_128(av1_filter_intra_taps[mode][0]); + const __m128i taps_2_3 = xx_load_128(av1_filter_intra_taps[mode][2]); + const __m128i taps_4_5 = xx_load_128(av1_filter_intra_taps[mode][4]); + const __m128i taps_6_7 = xx_load_128(av1_filter_intra_taps[mode][6]); + + // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at + // the end is an unused value, which shall be multiplied by 0 when we apply + // the filter. + const int64_t kCondenseLeftMask = 0x0F09080403020100; + + // Takes the "left section" and puts it right after p0-p4. + const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask); + + // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last + // byte is unused as above. + const int64_t kInsertTopLeftMask = 0x0F0A090302010008; + + // Shuffles the "top left" from the left section, to the front. Used when + // grabbing data from left_column and not top_row. + const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask); + + // This first pass takes care of the cases where the top left pixel comes from + // top_row. + __m128i pixels = xx_loadl_64(top_ptr - 1); + __m128i left = _mm_slli_si128(xx_loadl_32(left_column), 8); + pixels = _mm_or_si128(pixels, left); + + // Two sets of the same pixels to multiply with two sets of taps. + pixels = _mm_shuffle_epi8(pixels, pixel_order1); + filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + left = _mm_srli_si128(left, 1); + + // Load + pixels = xx_loadl_32(dst + stride); + + // Because of the above shift, this OR 'invades' the final of the first 8 + // bytes of |pixels|. This is acceptable because the 8th filter tap is always + // a padded 0. + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + const ptrdiff_t stride2 = stride << 1; + const ptrdiff_t stride4 = stride << 2; + filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, + &taps_4_5, &taps_6_7); + dst += 4; + for (int x = 3; x < width - 4; x += 4) { + pixels = xx_loadl_32(top_ptr + x); + pixels = _mm_insert_epi8(pixels, (int8_t)top_ptr[x + 4], 4); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + pixels = xx_loadl_32(dst + stride - 1); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + stride2 - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, + &taps_4_5, &taps_6_7); + dst += 4; + } + + // Now we handle heights that reference previous blocks rather than top_row. + for (int y = 4; y < height; y += 4) { + // Leftmost 4x4 block for this height. + dst -= width; + dst += stride4; + + // Top Left is not available by offset in these leftmost blocks. 
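    // For the leftmost 4x4 block of each new row group the top-left neighbour
    // is left_ptr[y - 1] rather than a previously written pixel of |dst|, so
    // it is loaded from the left column together with left_ptr[y .. y + 3]
    // and moved into place by |pixel_order2|.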
+ pixels = xx_loadl_32(dst - stride); + left = _mm_slli_si128(xx_loadl_32(left_ptr + y - 1), 8); + left = _mm_insert_epi8(left, (int8_t)left_ptr[y + 3], 12); + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + + // The bytes shifted into positions 6 and 7 will be ignored by the shuffle. + left = _mm_srli_si128(left, 2); + pixels = xx_loadl_32(dst + stride); + pixels = _mm_or_si128(pixels, left); + pixels = _mm_shuffle_epi8(pixels, pixel_order2); + filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, + &taps_4_5, &taps_6_7); + + dst += 4; + + // Remaining 4x4 blocks for this height. + for (int x = 4; x < width; x += 4) { + pixels = xx_loadl_32(dst - stride - 1); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[-stride + 3], 4); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[-1], 5); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dst, stride, &pixels, &taps_0_1, &taps_2_3, &taps_4_5, + &taps_6_7); + pixels = xx_loadl_32(dst + stride - 1); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride + 3], 4); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 - 1], 5); + pixels = _mm_insert_epi8(pixels, (int8_t)dst[stride2 + stride - 1], 6); + + // Duplicate bottom half into upper half. + pixels = _mm_shuffle_epi32(pixels, DUPLICATE_FIRST_HALF); + filter_4x2_sse4_1(dst + stride2, stride, &pixels, &taps_0_1, &taps_2_3, + &taps_4_5, &taps_6_7); + dst += 4; + } + } +} + +void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + filter_intra_predictor_sse4_1(dst, stride, above, left, mode, bw, bh); +} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c new file mode 100644 index 0000000000..d65318ccfa --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +void av1_highbd_convolve_2d_sr_ssse3( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd); + +void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + if (filter_params_x->taps == 12) { + av1_highbd_convolve_2d_sr_ssse3(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params, bd); + return; + } + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[8], coeffs_y[4], coeffs_x[4]; + + const __m256i round_const_x = _mm256_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i round_const_y = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = _mm256_setzero_si256(); + if (i + 1 < im_h) + row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_y), round_shift_y); + + res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_y), round_shift_y); + res_b_round = + _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits), + round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, 
res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32(&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c new file mode 100644 index 0000000000..89d7199f48 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "av1/common/convolve.h" + +void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(const uint16_t *src, + int src_stride, uint16_t *dst0, + int dst_stride0, int w, int h, + ConvolveParams *conv_params, + int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i zero = _mm_setzero_si128(); + int i, j; + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const __m128i offset_const_16b = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits <= 4); + + if (!(w % 8)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 8) { + const __m128i src_16bit = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); + const __m128i res = _mm_sll_epi16(src_16bit, left_shift); + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); + + const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero); + const __m128i res_unsigned_lo = + _mm_add_epi32(res_32b_lo, offset_const); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); + const __m128i res_unsigned_hi = + _mm_add_epi32(res_32b_hi, offset_const); + + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_unsigned_16b = + _mm_adds_epu16(res, offset_const_16b); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 4) { + const __m128i src_row_0 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); + const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1); + + const __m128i res = _mm_sll_epi16(src_10, left_shift); + + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i res_32b = _mm_unpacklo_epi16(res, zero); + const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const); + + const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); + const __m128i res_unsigned_hi = + _mm_add_epi32(res_32b_hi, offset_const); + + const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( + &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_1 = _mm_srli_si128(res_clip, 8); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + _mm_storel_epi64( + (__m128i 
*)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m128i res_unsigned_16b = + _mm_adds_epu16(res, offset_const_16b); + + const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
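  // The horizontal pass below adds an offset of 1 << (bd + FILTER_BITS - 1)
  // before shifting by round_0, so the shifted result needs at most
  // bd + FILTER_BITS + 2 - round_0 bits. For 12-bit input the caller raises
  // round_0 so that this bound is met; e.g. round_0 = 5 gives
  // 12 + 7 + 2 - 5 == 16, which just fits (FILTER_BITS is 7).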
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index 
pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + + const __m128i res_unsigned_lo = + _mm_add_epi32(res_lo_round, offset_const); + + if (w < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0); + + const __m128i comp_avg_res = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result = highbd_convolve_rounding_sse2( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result, round_result); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_16b = + _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } else { + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_unsigned_hi = + _mm_add_epi32(res_hi_round, offset_const); + + if (do_average) { + const __m128i data_lo = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_hi = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4])); + + const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo); + const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i 
comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = + highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi = + highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_16b = + _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c new file mode 100644 index 0000000000..88974ba260 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "av1/common/convolve.h" +#include "aom_dsp/x86/convolve_common_intrin.h" + +void av1_highbd_convolve_2d_sr_ssse3( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + const __m128i round_const_x = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m128i round_const_y = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + if (filter_params_x->taps == 12) { + __m128i coeffs_x[6], coeffs_y[6], s[24]; + prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + const __m128i row02 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + s[4] = _mm_alignr_epi8(row02, row01, 0); + s[5] = _mm_alignr_epi8(row02, row01, 4); + + __m128i res_even = convolve_12tap(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + s[4] = _mm_alignr_epi8(row02, row01, 2); + s[5] = _mm_alignr_epi8(row02, row01, 6); + + __m128i res_odd = convolve_12tap(s, coeffs_x); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + _mm_store_si128((__m128i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); + __m128i s7 = _mm_loadu_si128((__m128i *)(im_block + 7 * im_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(im_block + 8 * im_stride)); + __m128i s9 = _mm_loadu_si128((__m128i *)(im_block + 9 * im_stride)); + __m128i s10 = _mm_loadu_si128((__m128i *)(im_block + 10 * im_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + s[3] = _mm_unpacklo_epi16(s6, s7); + s[4] = _mm_unpacklo_epi16(s8, s9); + + s[6] = _mm_unpackhi_epi16(s0, s1); + s[7] = _mm_unpackhi_epi16(s2, s3); + s[8] = _mm_unpackhi_epi16(s4, s5); + s[9] = _mm_unpackhi_epi16(s6, s7); + s[10] = _mm_unpackhi_epi16(s8, s9); + + s[12] = _mm_unpacklo_epi16(s1, s2); + s[13] = _mm_unpacklo_epi16(s3, s4); + s[14] = _mm_unpacklo_epi16(s5, s6); + s[15] = _mm_unpacklo_epi16(s7, s8); + s[16] = _mm_unpacklo_epi16(s9, s10); + + s[18] = _mm_unpackhi_epi16(s1, s2); + s[19] = _mm_unpackhi_epi16(s3, s4); + s[20] = _mm_unpackhi_epi16(s5, s6); + s[21] = _mm_unpackhi_epi16(s7, s8); + s[22] = _mm_unpackhi_epi16(s9, s10); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * im_stride)); + __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * im_stride)); + + s[5] = _mm_unpacklo_epi16(s10, s11); + s[11] = 
_mm_unpackhi_epi16(s10, s11); + + s[17] = _mm_unpacklo_epi16(s11, s12); + s[23] = _mm_unpackhi_epi16(s11, s12); + + const __m128i res_a0 = convolve_12tap(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_y), round_shift_y); + res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y); + __m128i res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_y), round_shift_y); + res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_y), round_shift_y); + res_b_round0 = + _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits), + round_shift_bits); + + const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_y), round_shift_y); + res_b_round1 = + _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits), + round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((int *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((int *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + + s[6] = s[7]; + s[7] = s[8]; + s[8] = s[9]; + s[9] = s[10]; + s[10] = s[11]; + + s[12] = s[13]; + s[13] = s[14]; + s[14] = s[15]; + s[15] = s[16]; + s[16] = s[17]; + + s[18] = s[19]; + s[19] = s[20]; + s[20] = s[21]; + s[21] = s[22]; + s[22] = s[23]; + + s10 = s12; + } + } + } + } else { + __m128i coeffs_x[4], coeffs_y[4], s[16]; + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = 
_mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + _mm_store_si128((__m128i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_y), round_shift_y); + res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_y), round_shift_y); + res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_y), round_shift_y); + res_b_round0 = + _mm_sra_epi32(_mm_add_epi32(res_b_round0, round_const_bits), + round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_y), round_shift_y); + res_b_round1 = + _mm_sra_epi32(_mm_add_epi32(res_b_round1, round_const_bits), + round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, 
clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((int *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((int *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c new file mode 100644 index 0000000000..cbfe5614c3 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -0,0 +1,4239 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +// Note: +// Total 32x4 registers to represent 32x32 block coefficients. +// For high bit depth, each coefficient is 4-byte. +// Each __m256i register holds 8 coefficients. +// So each "row" we needs 4 register. Totally 32 rows +// Register layout: +// v0, v1, v2, v3, +// v4, v5, v6, v7, +// ... ... 
+// v124, v125, v126, v127 + +static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(u, max); + clamped = _mm256_andnot_si256(mask, u); + mask = _mm256_and_si256(mask, max); + clamped = _mm256_or_si256(mask, clamped); + mask = _mm256_cmpgt_epi16(clamped, zero); + clamped = _mm256_and_si256(clamped, mask); + + return clamped; +} + +static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) { + if (shift != 0) { + __m256i rnding = _mm256_set1_epi32(1 << (shift - 1)); + in[0] = _mm256_add_epi32(in[0], rnding); + in[1] = _mm256_add_epi32(in[1], rnding); + in[2] = _mm256_add_epi32(in[2], rnding); + in[3] = _mm256_add_epi32(in[3], rnding); + + in[0] = _mm256_srai_epi32(in[0], shift); + in[1] = _mm256_srai_epi32(in[1], shift); + in[2] = _mm256_srai_epi32(in[2], shift); + in[3] = _mm256_srai_epi32(in[3], shift); + } +} + +static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) { + round_shift_4x4_avx2(in, shift); + round_shift_4x4_avx2(in + 4, shift); + round_shift_4x4_avx2(in + 8, shift); + round_shift_4x4_avx2(in + 12, shift); +} + +static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out, + const __m256i *clamp_lo, + const __m256i *clamp_hi, int size) { + __m256i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm256_max_epi32(in[i], *clamp_lo); + out[i] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm256_min_epi32(a1, *clamp_hi); + + a0 = _mm256_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm256_min_epi32(a1, *clamp_hi); + } +} + +static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, + __m256i res0, __m256i res1, + const int bd) { + __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred)); + __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1)); + + x0 = _mm256_add_epi32(res0, x0); + x1 = _mm256_add_epi32(res1, x1); + x0 = _mm256_packus_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride)); + __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd); + + _mm256_storeu_si256((__m256i *)(output + i * stride), u); + } +} +static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, + const int bd) { + __m256i x0 = pred; + x0 = _mm256_add_epi32(res, x0); + x0 = _mm256_packus_epi32(x0, x0); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + __m128i temp; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m256i v = _mm256_cvtepi16_epi32(temp); + __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd); + __m128i u1 = _mm256_castsi256_si128(u); + _mm_storeu_si128((__m128i *)(output + i * stride), u1); + } +} +static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi, int shift) { + __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); + __m256i a0 = _mm256_add_epi32(offset, in0); + __m256i a1 = _mm256_sub_epi32(offset, in1); + + a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[0], in[1]); + u1 = _mm256_unpackhi_epi32(in[0], in[1]); + + u2 = _mm256_unpacklo_epi32(in[2], in[3]); + u3 = _mm256_unpackhi_epi32(in[2], in[3]); + + u4 = _mm256_unpacklo_epi32(in[4], in[5]); + u5 = _mm256_unpackhi_epi32(in[4], in[5]); + + u6 = _mm256_unpacklo_epi32(in[6], in[7]); + u7 = _mm256_unpackhi_epi32(in[6], in[7]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + +static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[7], in[6]); + u1 = _mm256_unpackhi_epi32(in[7], in[6]); + + u2 = _mm256_unpacklo_epi32(in[5], in[4]); + u3 = _mm256_unpackhi_epi32(in[5], in[4]); + + u4 = _mm256_unpacklo_epi32(in[3], in[2]); + u5 = _mm256_unpackhi_epi32(in[3], in[2]); + + u6 = _mm256_unpacklo_epi32(in[1], in[0]); + u7 = _mm256_unpackhi_epi32(in[1], in[0]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + __m256i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = 
_mm256_loadu_si256((const __m256i *)(in + i * stride)); + } +} + +static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *rounding, int bit) { + __m256i x; + x = _mm256_mullo_epi32(*w0, *n0); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} + +static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} + +static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi) { + __m256i a0 = _mm256_add_epi32(in0, in1); + __m256i a1 = _mm256_sub_epi32(in0, in1); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static INLINE void idct32_stage4_avx2( + __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, + const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, + const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_avx2( + __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48, + const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, + const __m256i *clamp_hi, const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_avx2( + __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, + const 
__m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 
= half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, + const int do_cols, const int bd, + const int out_shift, + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rounding); + x = _mm256_srai_epi32(x, bit); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + x = _mm256_add_epi32(offset, x); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; +} + +static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = 
_mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + + addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[20], 
bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit); + + addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim18 = 
_mm256_set1_epi32(-cospi[18]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32], bf0[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + bf0[17] = + half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospi6, &bf1[23], &cospim58, 
&bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); + + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + 
half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); + bf0[31] = bf1[31]; + + // stage 5 + bf1[0] = + half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); + bf1[1] = + half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); + addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = + half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = + half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); + bf1[15] = bf0[15]; + addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); + + // stage 6 + addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = + half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[7] = bf1[7]; + addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = + half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = + half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 7 + addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = + 
half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = + half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + 
addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } + } +} +static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + { + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm256_mullo_epi32(in[0], cospi32); + in[0] = _mm256_add_epi32(in[0], rnding); + in[0] = _mm256_srai_epi32(in[0], bit); + + // stage 5 + // stage 6 + // stage 7 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm256_add_epi32(in[0], offset); + in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + } + in[0] = _mm256_max_epi32(in[0], clamp_lo); + in[0] = _mm256_min_epi32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; + } +} + +static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit); + + addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm256_mullo_epi32(u[0], cospi32); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + + addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; + + // stage 5 + addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm256_mullo_epi32(u[5], cospi32); + y = _mm256_mullo_epi32(u[6], cospi32); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[13], cospi32); + u[10] = _mm256_sub_epi32(y, x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[13] = _mm256_add_epi32(x, y); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[11], cospi32); + y = _mm256_mullo_epi32(u[12], cospi32); + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = 
_mm256_srai_epi32(u[11], bit); + + u[12] = _mm256_add_epi32(x, y); + u[12] = _mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + // stage 7 + addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm256_mullo_epi32(u[0], cospi32); + y = _mm256_mullo_epi32(u[1], cospi32); + v[0] = _mm256_add_epi32(x, y); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_sub_epi32(x, y); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm256_mullo_epi32(v[5], cospi32); + y = _mm256_mullo_epi32(v[6], cospi32); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, 
&clamp_hi); + + // stage 6 + addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[13], cospi32); + v[10] = _mm256_sub_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_add_epi32(x, y); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + x = _mm256_mullo_epi32(u[11], cospi32); + y = _mm256_mullo_epi32(u[12], cospi32); + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_add_epi32(x, y); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + __m256i v[16], x, y, temp1, temp2; + + // Calculate the column 0, 1, 2, 3 + { + // stage 0 + // stage 1 + // stage 2 + x = _mm256_mullo_epi32(in[0], cospi62); + v[0] = _mm256_add_epi32(x, rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi2); + v[1] = _mm256_sub_epi32(zero, x); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = _mm256_mullo_epi32(v[8], cospi8); + x = _mm256_mullo_epi32(v[9], cospi56); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[8], cospi56); + x = _mm256_mullo_epi32(v[9], cospi8); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 = _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + 
temp1 = _mm256_mullo_epi32(v[4], cospi16); + x = _mm256_mullo_epi32(v[5], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[4], cospi48); + x = _mm256_mullo_epi32(v[5], cospi16); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 = _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm256_mullo_epi32(v[12], cospi16); + x = _mm256_mullo_epi32(v[13], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + + temp2 = _mm256_mullo_epi32(v[12], cospi48); + x = _mm256_mullo_epi32(v[13], cospi16); + temp2 = _mm256_sub_epi32(temp2, x); + temp2 = _mm256_add_epi32(temp2, rnding); + temp2 = _mm256_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + y = _mm256_mullo_epi32(v[2], cospi32); + x = _mm256_mullo_epi32(v[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + y = _mm256_mullo_epi32(v[6], cospi32); + x = _mm256_mullo_epi32(v[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + y = _mm256_mullo_epi32(v[10], cospi32); + x = _mm256_mullo_epi32(v[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + y = _mm256_mullo_epi32(v[14], cospi32); + x = _mm256_mullo_epi32(v[15], cospi32); + v[14] = _mm256_add_epi32(y, x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(y, x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); + out[2] = v[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); + out[4] = v[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); + out[6] = v[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); + out[8] = v[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); + out[10] = v[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); + out[12] = v[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); + out[14] = v[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + 
&clamp_hi_out, out_shift); + neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], x, y; + + { + // stage 0 + // stage 1 + // stage 2 + __m256i zero = _mm256_setzero_si256(); + x = _mm256_mullo_epi32(in[0], cospi62); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi2); + u[1] = _mm256_sub_epi32(zero, x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + x = _mm256_mullo_epi32(in[2], cospi54); + u[2] = _mm256_add_epi32(x, rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + x = _mm256_mullo_epi32(in[2], cospi10); + u[3] = _mm256_sub_epi32(zero, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + x = _mm256_mullo_epi32(in[4], cospi46); + u[4] = _mm256_add_epi32(x, rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + x = _mm256_mullo_epi32(in[4], cospi18); + u[5] = _mm256_sub_epi32(zero, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + x = _mm256_mullo_epi32(in[6], cospi38); + u[6] = _mm256_add_epi32(x, rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + x = _mm256_mullo_epi32(in[6], cospi26); + u[7] = _mm256_sub_epi32(zero, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + u[8] = _mm256_mullo_epi32(in[7], cospi34); + u[8] = _mm256_add_epi32(u[8], rnding); + u[8] = _mm256_srai_epi32(u[8], bit); + + u[9] = _mm256_mullo_epi32(in[7], cospi30); + u[9] = _mm256_add_epi32(u[9], rnding); + u[9] = _mm256_srai_epi32(u[9], bit); + + u[10] = _mm256_mullo_epi32(in[5], cospi42); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[11] = _mm256_mullo_epi32(in[5], cospi22); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + u[12] = _mm256_mullo_epi32(in[3], cospi50); + u[12] = _mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + + u[13] = _mm256_mullo_epi32(in[3], cospi14); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + u[14] = _mm256_mullo_epi32(in[1], cospi58); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + u[15] = _mm256_mullo_epi32(in[1], cospi6); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 3 + addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = _mm256_mullo_epi32(u[8], cospi56); + x = _mm256_mullo_epi32(u[9], cospi56); + u[8] = _mm256_mullo_epi32(u[8], cospi8); + u[8] = _mm256_add_epi32(u[8], x); + u[8] = _mm256_add_epi32(u[8], rnding); + u[8] = _mm256_srai_epi32(u[8], bit); + + x = _mm256_mullo_epi32(u[9], cospi8); + u[9] = _mm256_sub_epi32(y, x); + u[9] = _mm256_add_epi32(u[9], rnding); + u[9] = _mm256_srai_epi32(u[9], bit); + + x = _mm256_mullo_epi32(u[11], cospi24); + y = _mm256_mullo_epi32(u[10], cospi24); + u[10] = _mm256_mullo_epi32(u[10], cospi40); + u[10] = _mm256_add_epi32(u[10], x); + u[10] = 
_mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + x = _mm256_mullo_epi32(u[11], cospi40); + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + x = _mm256_mullo_epi32(u[13], cospi8); + y = _mm256_mullo_epi32(u[12], cospi8); + u[12] = _mm256_mullo_epi32(u[12], cospim56); + u[12] = _mm256_add_epi32(u[12], x); + u[12] = _mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + + x = _mm256_mullo_epi32(u[13], cospim56); + u[13] = _mm256_sub_epi32(y, x); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[15], cospi40); + y = _mm256_mullo_epi32(u[14], cospi40); + u[14] = _mm256_mullo_epi32(u[14], cospim24); + u[14] = _mm256_add_epi32(u[14], x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + x = _mm256_mullo_epi32(u[15], cospim24); + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 5 + addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = _mm256_mullo_epi32(u[5], cospi48); + y = _mm256_mullo_epi32(u[4], cospi48); + u[4] = _mm256_mullo_epi32(u[4], cospi16); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + x = _mm256_mullo_epi32(u[5], cospi16); + u[5] = _mm256_sub_epi32(y, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + x = _mm256_mullo_epi32(u[7], cospi16); + y = _mm256_mullo_epi32(u[6], cospi16); + u[6] = _mm256_mullo_epi32(u[6], cospim48); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + x = _mm256_mullo_epi32(u[7], cospim48); + u[7] = _mm256_sub_epi32(y, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + x = _mm256_mullo_epi32(u[13], cospi48); + y = _mm256_mullo_epi32(u[12], cospi48); + u[12] = _mm256_mullo_epi32(u[12], cospi16); + u[12] = _mm256_add_epi32(u[12], x); + u[12] = _mm256_add_epi32(u[12], rnding); + u[12] = _mm256_srai_epi32(u[12], bit); + + x = _mm256_mullo_epi32(u[13], cospi16); + u[13] = _mm256_sub_epi32(y, x); + u[13] = _mm256_add_epi32(u[13], rnding); + u[13] = _mm256_srai_epi32(u[13], bit); + + x = _mm256_mullo_epi32(u[15], cospi16); + y = _mm256_mullo_epi32(u[14], cospi16); + u[14] = _mm256_mullo_epi32(u[14], cospim48); + u[14] = _mm256_add_epi32(u[14], x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + x = _mm256_mullo_epi32(u[15], cospim48); + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 7 + addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(u[8], u[10], &u[8], 
&u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + u[2] = _mm256_add_epi32(y, x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(y, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + u[6] = _mm256_add_epi32(y, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(y, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + y = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + u[10] = _mm256_add_epi32(y, x); + u[10] = _mm256_add_epi32(u[10], rnding); + u[10] = _mm256_srai_epi32(u[10], bit); + + u[11] = _mm256_sub_epi32(y, x); + u[11] = _mm256_add_epi32(u[11], rnding); + u[11] = _mm256_srai_epi32(u[11], bit); + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + u[14] = _mm256_add_epi32(y, x); + u[14] = _mm256_add_epi32(u[14], rnding); + u[14] = _mm256_srai_epi32(u[14], bit); + + u[15] = _mm256_sub_epi32(y, x); + u[15] = _mm256_add_epi32(u[15], rnding); + u[15] = _mm256_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]); + out[2] = u[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]); + out[4] = u[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]); + out[6] = u[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]); + out[8] = u[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]); + out[10] = u[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]); + out[12] = u[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]); + out[14] = u[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} + +static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi18 = 
_mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm256_mullo_epi32(in[15], cospi2); + x = _mm256_mullo_epi32(in[0], cospi62); + v[0] = _mm256_add_epi32(v[0], x); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_mullo_epi32(in[15], cospi62); + x = _mm256_mullo_epi32(in[0], cospi2); + v[1] = _mm256_sub_epi32(v[1], x); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = _mm256_mullo_epi32(in[13], cospi10); + x = _mm256_mullo_epi32(in[2], cospi54); + v[2] = _mm256_add_epi32(v[2], x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_mullo_epi32(in[13], cospi54); + x = _mm256_mullo_epi32(in[2], cospi10); + v[3] = _mm256_sub_epi32(v[3], x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = _mm256_mullo_epi32(in[11], cospi18); + x = _mm256_mullo_epi32(in[4], cospi46); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(in[11], cospi46); + x = _mm256_mullo_epi32(in[4], cospi18); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(in[9], cospi26); + x = _mm256_mullo_epi32(in[6], cospi38); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(in[9], cospi38); + x = _mm256_mullo_epi32(in[6], cospi26); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = _mm256_mullo_epi32(in[7], cospi34); + x = _mm256_mullo_epi32(in[8], cospi30); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[9] = _mm256_mullo_epi32(in[7], cospi30); + x = _mm256_mullo_epi32(in[8], cospi34); + v[9] = _mm256_sub_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], 
rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[10] = _mm256_mullo_epi32(in[5], cospi42); + x = _mm256_mullo_epi32(in[10], cospi22); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_mullo_epi32(in[5], cospi22); + x = _mm256_mullo_epi32(in[10], cospi42); + v[11] = _mm256_sub_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(in[3], cospi50); + x = _mm256_mullo_epi32(in[12], cospi14); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(in[3], cospi14); + x = _mm256_mullo_epi32(in[12], cospi50); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(in[1], cospi58); + x = _mm256_mullo_epi32(in[14], cospi6); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(in[1], cospi6); + x = _mm256_mullo_epi32(in[14], cospi58); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 3 + addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm256_mullo_epi32(u[8], cospi8); + x = _mm256_mullo_epi32(u[9], cospi56); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[9] = _mm256_mullo_epi32(u[8], cospi56); + x = _mm256_mullo_epi32(u[9], cospi8); + v[9] = _mm256_sub_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospi40); + x = _mm256_mullo_epi32(u[11], cospi24); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_mullo_epi32(u[10], cospi24); + x = _mm256_mullo_epi32(u[11], cospi40); + v[11] = _mm256_sub_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[12], cospim56); + x = _mm256_mullo_epi32(u[13], cospi8); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi8); + x = _mm256_mullo_epi32(u[13], cospim56); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim24); + x = _mm256_mullo_epi32(u[15], cospi40); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi40); + x = _mm256_mullo_epi32(u[15], cospim24); + v[15] 
= _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 5 + addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm256_mullo_epi32(u[4], cospi16); + x = _mm256_mullo_epi32(u[5], cospi48); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(u[4], cospi48); + x = _mm256_mullo_epi32(u[5], cospi16); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(u[6], cospim48); + x = _mm256_mullo_epi32(u[7], cospi16); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(u[6], cospi16); + x = _mm256_mullo_epi32(u[7], cospim48); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm256_mullo_epi32(u[12], cospi16); + x = _mm256_mullo_epi32(u[13], cospi48); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi48); + x = _mm256_mullo_epi32(u[13], cospi16); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim48); + x = _mm256_mullo_epi32(u[15], cospi16); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi16); + x = _mm256_mullo_epi32(u[15], cospim48); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 7 + addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = 
_mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + v[14] = _mm256_add_epi32(y, x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(y, x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); + out[2] = v[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); + out[4] = v[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); + out[6] = v[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); + out[8] = v[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); + out[10] = v[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); + out[12] = v[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); + out[14] = v[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} +static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rnding); + x = _mm256_srai_epi32(x, bit); + + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} +static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm256_mullo_epi32(in[1], cospi56); + y = _mm256_mullo_epi32(in[7], cospim8); + u4 = _mm256_add_epi32(x, y); + u4 = _mm256_add_epi32(u4, rnding); + u4 = _mm256_srai_epi32(u4, bit); + + x = _mm256_mullo_epi32(in[1], cospi8); + y = _mm256_mullo_epi32(in[7], cospi56); + u7 = _mm256_add_epi32(x, y); + u7 = _mm256_add_epi32(u7, rnding); + u7 = _mm256_srai_epi32(u7, bit); + + x = _mm256_mullo_epi32(in[5], cospi24); + y = _mm256_mullo_epi32(in[3], cospim40); + u5 = _mm256_add_epi32(x, y); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + x = _mm256_mullo_epi32(in[5], cospi40); + y = _mm256_mullo_epi32(in[3], cospi24); + u6 = _mm256_add_epi32(x, y); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + // stage 3 + x = _mm256_mullo_epi32(u0, cospi32); + y = _mm256_mullo_epi32(u1, cospi32); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + v1 = _mm256_sub_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = _mm256_mullo_epi32(u2, cospi48); + y = _mm256_mullo_epi32(u3, cospim16); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi16); + y = _mm256_mullo_epi32(u3, cospi48); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_avx2(u7, u6, 
&v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm256_mullo_epi32(v5, cospi32); + y = _mm256_mullo_epi32(v6, cospi32); + u6 = _mm256_add_epi32(y, x); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + u5 = _mm256_sub_epi32(y, x); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4_avx2(out, out_shift); + round_shift_4x4_avx2(out + 4, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} +static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + __m256i u[8], x; + + // stage 0 + // stage 1 + // stage 2 + + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(kZero, x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + // stage 3 + // stage 4 + __m256i temp1, temp2; + temp1 = _mm256_mullo_epi32(u[0], cospi16); + x = _mm256_mullo_epi32(u[1], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm256_mullo_epi32(u[0], cospi48); + x = _mm256_mullo_epi32(u[1], cospi16); + u[5] = _mm256_sub_epi32(temp2, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + // stage 5 + // stage 6 + temp1 = _mm256_mullo_epi32(u[0], cospi32); + x = _mm256_mullo_epi32(u[1], cospi32); + u[2] = _mm256_add_epi32(temp1, x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(temp1, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + temp1 = _mm256_mullo_epi32(u[4], cospi32); + x = _mm256_mullo_epi32(u[5], cospi32); + u[6] = _mm256_add_epi32(temp1, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(temp1, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << 
(log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[8], v[8], x; + + // stage 0 + // stage 1 + // stage 2 + + u[0] = _mm256_mullo_epi32(in[7], cospi4); + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(u[0], x); + u[0] = _mm256_add_epi32(u[0], rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + u[1] = _mm256_mullo_epi32(in[7], cospi60); + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(u[1], x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + u[2] = _mm256_mullo_epi32(in[5], cospi20); + x = _mm256_mullo_epi32(in[2], cospi44); + u[2] = _mm256_add_epi32(u[2], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_mullo_epi32(in[5], cospi44); + x = _mm256_mullo_epi32(in[2], cospi20); + u[3] = _mm256_sub_epi32(u[3], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + u[4] = _mm256_mullo_epi32(in[3], cospi36); + x = _mm256_mullo_epi32(in[4], cospi28); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(in[3], cospi28); + x = _mm256_mullo_epi32(in[4], cospi36); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(in[1], cospi52); + x = _mm256_mullo_epi32(in[6], cospi12); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(in[1], cospi12); + x = _mm256_mullo_epi32(in[6], cospi52); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 3 + addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[7], &v[3], 
&v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi16); + x = _mm256_mullo_epi32(v[5], cospi48); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(v[4], cospi48); + x = _mm256_mullo_epi32(v[5], cospi16); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[6], cospim48); + x = _mm256_mullo_epi32(v[7], cospi16); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(v[6], cospi16); + x = _mm256_mullo_epi32(v[7], cospim48); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 5 + addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm256_mullo_epi32(v[2], cospi32); + x = _mm256_mullo_epi32(v[3], cospi32); + u[2] = _mm256_add_epi32(v[0], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(v[0], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + v[0] = _mm256_mullo_epi32(v[6], cospi32); + x = _mm256_mullo_epi32(v[7], cospi32); + u[6] = _mm256_add_epi32(v[0], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(v[0], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} +static INLINE void idct64_stage8_avx2( + __m256i *u, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, + const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_avx2(u[i ^ 15], 
u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + __m256i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + 
u[43] = temp4; + + temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols, + int bd, int out_shift, + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } +} + +static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + + { + __m256i x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit); + + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; + out[32] = x; + out[33] = x; + out[34] = x; + out[35] = x; + out[36] = x; + out[37] = x; + out[38] = x; + out[39] = x; + out[40] = x; + out[41] = x; + out[42] = x; + out[43] = x; + out[44] = x; + out[45] = x; + out[46] = x; + out[47] = x; + out[48] = x; + out[49] = x; + out[50] = x; + out[51] = x; + out[52] = x; + out[53] = x; + out[54] = x; + out[55] = x; + out[56] = x; + out[57] = x; + out[58] = x; + out[59] = x; + out[60] = x; + out[61] = x; + out[62] = x; + out[63] = x; + } +} +static void idct64_low8_avx2(__m256i *in, 
__m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + + { + __m256i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + 
u[62] = u[63]; + + // stage 4 + __m256i temp1, temp2; + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], 
&rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[19] = temp2; + temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const 
__m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64]; + __m256i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, 
&u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 
6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols 
? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi17 = _mm256_set1_epi32(cospi[17]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi19 = _mm256_set1_epi32(cospi[19]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi21 = _mm256_set1_epi32(cospi[21]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi23 = _mm256_set1_epi32(cospi[23]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi25 = _mm256_set1_epi32(cospi[25]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi27 = _mm256_set1_epi32(cospi[27]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi29 = _mm256_set1_epi32(cospi[29]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi31 = _mm256_set1_epi32(cospi[31]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi35 = _mm256_set1_epi32(cospi[35]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi39 = _mm256_set1_epi32(cospi[39]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi43 = _mm256_set1_epi32(cospi[43]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi47 = _mm256_set1_epi32(cospi[47]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]); + const __m256i cospim34 = 
_mm256_set1_epi32(-cospi[34]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + + // stage 3 + 
u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit); + u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, 
+ &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit); + + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + 
addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, 
bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } + } +} +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, + int do_cols, int bd, int out_shift); + +static const transform_1d_avx2 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { + { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL }, + { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { + { idct16_low1_avx2, 
idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m256i buf1[64 * 8]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m256i buf0[64]; + load_buffer_32bit_input(input + i * 8, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m256i *_buf1 = buf1 + i * 8; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_flip_avx2( + &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + round_shift_array_32_avx2(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row, bd); + } + } else if (txfm_size_col == 8) { + highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row, + bd); + } +} + +void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + case IDTX: + 
case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type, + tx_size, eob, bd); + break; + default: assert(0); break; + } +} +void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); + break; + default: + av1_highbd_inv_txfm2d_add_universe_avx2( + input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, + txfm_param->eob, txfm_param->bd); + break; + } +} diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c new file mode 100644 index 0000000000..4ff6a90f95 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -0,0 +1,5830 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <assert.h> +#include <smmintrin.h> /* SSE4.1 */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/av1_txfm_sse4.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" + +static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(u, max); + clamped = _mm_andnot_si128(mask, u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + mask = _mm_cmpgt_epi16(clamped, zero); + clamped = _mm_and_si128(clamped, mask); + + return clamped; +} + +static INLINE void round_shift_4x4(__m128i *in, int shift) { + if (shift != 0) { + __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); + in[0] = _mm_add_epi32(in[0], rnding); + in[1] = _mm_add_epi32(in[1], rnding); + in[2] = _mm_add_epi32(in[2], rnding); + in[3] = _mm_add_epi32(in[3], rnding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + } +} + +static void round_shift_8x8(__m128i *in, int shift) { + round_shift_4x4(&in[0], shift); + round_shift_4x4(&in[4], shift); + round_shift_4x4(&in[8], shift); + round_shift_4x4(&in[12], shift); +} + +static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int size) { + __m128i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm_max_epi32(in[i], *clamp_lo); + out[i] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm_min_epi32(a1, *clamp_hi); + } +} + +static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, + __m128i res0, __m128i res1, + const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); + __m128i min_clip_val = _mm_setzero_si128(); + __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1); + x0 = _mm_add_epi32(res0, x0); + x1 = _mm_add_epi32(res1, x1); + x0 = _mm_max_epi32(x0, min_clip_val); + x0 = _mm_min_epi32(x0, max_clip_val); + x1 = _mm_max_epi32(x1, min_clip_val); + x1 = _mm_min_epi32(x1, max_clip_val); + x0 = _mm_packus_epi32(x0, x1); + return x0; +} + +static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, + __m128i res0, const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + + x0 = _mm_add_epi32(res0, x0); + x0 = _mm_packus_epi32(x0, x0); + x0 = highbd_clamp_epi16(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd); + + _mm_storel_epi64((__m128i *)(output + i * stride), u); + } +} + +static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ?
(height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd); + + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); +} + +void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + __m128i op[4]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + load_buffer_4x4(input, op); + + // Shift before-hand. + op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT); + op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT); + op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT); + op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT); + + for (int i = 0; i < 2; ++i) { + __m128i a1 = op[0]; + __m128i c1 = op[1]; + __m128i d1 = op[2]; + __m128i b1 = op[3]; + a1 = _mm_add_epi32(a1, c1); // a1 += c1 + d1 = _mm_sub_epi32(d1, b1); // d1 -= b1 + __m128i e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1 + e1 = _mm_srai_epi32(e1, 1); + b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1 + c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1 + a1 = _mm_sub_epi32(a1, b1); // a1 -= b1 + d1 = _mm_add_epi32(d1, c1); // d1 += c1 + + op[0] = a1; + op[1] = b1; + op[2] = c1; + op[3] = d1; + if (i == 0) { + transpose_32bit_4x4(op, op); + } + } + + // Convert to int16_t. The C code checks that we are in range. + op[0] = _mm_packs_epi32(op[0], op[1]); + op[1] = _mm_packs_epi32(op[2], op[3]); + + // Load uint16_t. + __m128i dst[2]; + __m128i tmp[4]; + tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); + tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride)); + dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]); + tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride)); + tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride)); + dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]); + + // Add to the previous results. + dst[0] = _mm_add_epi16(dst[0], op[0]); + dst[1] = _mm_add_epi16(dst[1], op[1]); + + // Clamp. + dst[0] = highbd_clamp_epi16(dst[0], bd); + dst[1] = highbd_clamp_epi16(dst[1], bd); + + // Store. 
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]); + dst[0] = _mm_srli_si128(dst[0], 8); + _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]); + _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]); + dst[1] = _mm_srli_si128(dst[1], 8); + _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]); +} + +static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, + __m128i *out1, const __m128i *clamp_lo, + const __m128i *clamp_hi) { + __m128i a0 = _mm_add_epi32(in0, in1); + __m128i a1 = _mm_sub_epi32(in0, in1); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i in0_w_offset = _mm_add_epi32(*in0, offset); + __m128i in1_w_offset = _mm_add_epi32(*in1, offset); + + in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift)); + in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift)); + + in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo); + in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi); + in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo); + in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi); + + *in0 = in0_w_offset; + *in1 = in1_w_offset; +} + +static INLINE void idct32_stage4_sse4_1( + __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56, + const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40, + const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = + half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_sse4_1( + __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48, + const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, + const __m128i *clamp_hi, const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = + half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + 
addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_sse4_1( + __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = + half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = + half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = + half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = + half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + 
addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = + half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, + const int do_cols, const int bd, + const int out_shift, + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 32; i += 8) { + round_shift_4x4(out + i, out_shift); + round_shift_4x4(out + i + 4, out_shift); + } + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, + __m128i *out0, __m128i *out1, + const __m128i *clamp_lo, const __m128i *clamp_hi, + int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i a0 = _mm_add_epi32(offset, in0); + __m128i a1 = _mm_sub_epi32(offset, in1); + + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, 
*clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3, x, y; + + // Stage 0 + // Stage 1 + // Stage 2 + u0 = in[0]; + u1 = in[1]; + u2 = in[2]; + u3 = in[3]; + + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u2, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u1, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u1, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + // Stage 3 + addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); + + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift); + shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift); + } +} + +static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *sinpi = sinpi_arr(bit); + const __m128i zero = _mm_setzero_si128(); + __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1)); + rnding = _mm_unpacklo_epi32(rnding, zero); + const __m128i mul = _mm_set1_epi32(1 << 4); + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, x2, x3; + __m128i u0, u1, u2, u3; + __m128i u0_low, u1_low, u2_low, u3_low; + __m128i u0_high, u1_high, u2_high, u3_high; + + x0 = in[0]; + x1 = in[1]; + x2 = in[2]; + x3 = in[3]; + + s0 = _mm_mullo_epi32(x0, sinpi1); + s1 = _mm_mullo_epi32(x0, sinpi2); + s2 = _mm_mullo_epi32(x1, sinpi3); + s3 = _mm_mullo_epi32(x2, sinpi4); + s4 = _mm_mullo_epi32(x2, sinpi1); + s5 = _mm_mullo_epi32(x3, sinpi2); + s6 = _mm_mullo_epi32(x3, sinpi4); + t = _mm_sub_epi32(x0, x2); + s7 = _mm_add_epi32(t, x3); + + t = _mm_add_epi32(s0, s3); + s0 = _mm_add_epi32(t, s5); + t = _mm_sub_epi32(s1, s4); + s1 = _mm_sub_epi32(t, s6); + s3 = s2; + s2 = _mm_mullo_epi32(s7, sinpi3); + + u0 = _mm_add_epi32(s0, s3); + u1 = _mm_add_epi32(s1, s3); + u2 = s2; + t = _mm_add_epi32(s0, s1); + u3 = _mm_sub_epi32(t, s3); + + // u0 + u0_low = _mm_mul_epi32(u0, mul); + u0_low = _mm_add_epi64(u0_low, rnding); + + u0 = _mm_srli_si128(u0, 4); + u0_high = _mm_mul_epi32(u0, mul); + u0_high = _mm_add_epi64(u0_high, rnding); + + u0_low = 
_mm_srli_si128(u0_low, 2); + u0_high = _mm_srli_si128(u0_high, 2); + + u0 = _mm_unpacklo_epi32(u0_low, u0_high); + u0_high = _mm_unpackhi_epi32(u0_low, u0_high); + u0 = _mm_unpacklo_epi64(u0, u0_high); + + // u1 + u1_low = _mm_mul_epi32(u1, mul); + u1_low = _mm_add_epi64(u1_low, rnding); + + u1 = _mm_srli_si128(u1, 4); + u1_high = _mm_mul_epi32(u1, mul); + u1_high = _mm_add_epi64(u1_high, rnding); + + u1_low = _mm_srli_si128(u1_low, 2); + u1_high = _mm_srli_si128(u1_high, 2); + + u1 = _mm_unpacklo_epi32(u1_low, u1_high); + u1_high = _mm_unpackhi_epi32(u1_low, u1_high); + u1 = _mm_unpacklo_epi64(u1, u1_high); + + // u2 + u2_low = _mm_mul_epi32(u2, mul); + u2_low = _mm_add_epi64(u2_low, rnding); + + u2 = _mm_srli_si128(u2, 4); + u2_high = _mm_mul_epi32(u2, mul); + u2_high = _mm_add_epi64(u2_high, rnding); + + u2_low = _mm_srli_si128(u2_low, 2); + u2_high = _mm_srli_si128(u2_high, 2); + + u2 = _mm_unpacklo_epi32(u2_low, u2_high); + u2_high = _mm_unpackhi_epi32(u2_low, u2_high); + u2 = _mm_unpacklo_epi64(u2, u2_high); + + // u3 + u3_low = _mm_mul_epi32(u3, mul); + u3_low = _mm_add_epi64(u3_low, rnding); + + u3 = _mm_srli_si128(u3, 4); + u3_high = _mm_mul_epi32(u3, mul); + u3_high = _mm_add_epi64(u3_high, rnding); + + u3_low = _mm_srli_si128(u3_low, 2); + u3_high = _mm_srli_si128(u3_high, 2); + + u3 = _mm_unpacklo_epi32(u3_low, u3_high); + u3_high = _mm_unpackhi_epi32(u3_low, u3_high); + u3 = _mm_unpacklo_epi64(u3, u3_high); + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } +} + +static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + round_shift_4x4(in, shift); + + v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride)); + v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride)); + v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); + v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); + + v0 = _mm_unpacklo_epi16(v0, zero); + v1 = _mm_unpacklo_epi16(v1, zero); + v2 = _mm_unpacklo_epi16(v2, zero); + v3 = _mm_unpacklo_epi16(v3, zero); + + if (fliplr) { + in[0] = _mm_shuffle_epi32(in[0], 0x1B); + in[1] = _mm_shuffle_epi32(in[1], 0x1B); + in[2] = _mm_shuffle_epi32(in[2], 0x1B); + in[3] = _mm_shuffle_epi32(in[3], 0x1B); + } + + if (flipud) { + u0 = _mm_add_epi32(in[3], v0); + u1 = _mm_add_epi32(in[2], v1); + u2 = _mm_add_epi32(in[1], v2); + u3 = _mm_add_epi32(in[0], v3); + } else { + u0 = _mm_add_epi32(in[0], v0); + u1 = _mm_add_epi32(in[1], v1); + u2 = _mm_add_epi32(in[2], v2); + u3 = _mm_add_epi32(in[3], v3); + } + + v0 = _mm_packus_epi32(u0, u1); + v2 = _mm_packus_epi32(u2, u3); + + u0 = highbd_clamp_epi16(v0, bd); + u2 = highbd_clamp_epi16(v2, bd); + + v0 = _mm_unpacklo_epi64(u0, u0); + v1 = _mm_unpackhi_epi64(u0, u0); + v2 = _mm_unpacklo_epi64(u2, u2); + v3 = _mm_unpackhi_epi64(u2, u2); + + _mm_storel_epi64((__m128i *)(output + 0 * stride), v0); + _mm_storel_epi64((__m128i *)(output + 1 * stride), v1); + _mm_storel_epi64((__m128i *)(output + 2 * stride), v2); + _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); +} + +static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int 
do_cols, + int bd, int out_shift) { + (void)bit; + __m128i zero = _mm_setzero_si128(); + __m128i fact = _mm_set1_epi32(NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a1_low; + __m128i a0_high, a1_high; + + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 4; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } +} +void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[4]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + 
break; + case IDTX: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_DCT: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_ADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_FLIPADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case H_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0); + transpose_32bit_4x4(in, in); + iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + default: assert(0); + } +} + +// 8x8 +static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); + in[4] = _mm_load_si128((const __m128i *)(coeff + 16)); + in[5] = _mm_load_si128((const __m128i *)(coeff + 20)); + in[6] = _mm_load_si128((const __m128i *)(coeff + 24)); + in[7] = _mm_load_si128((const __m128i *)(coeff + 28)); + in[8] = _mm_load_si128((const __m128i *)(coeff + 32)); + in[9] = _mm_load_si128((const __m128i *)(coeff + 36)); + in[10] = _mm_load_si128((const __m128i *)(coeff + 40)); + in[11] = _mm_load_si128((const __m128i *)(coeff + 44)); + in[12] = _mm_load_si128((const __m128i *)(coeff + 48)); + in[13] = _mm_load_si128((const __m128i *)(coeff + 52)); + in[14] = _mm_load_si128((const __m128i *)(coeff + 56)); + in[15] = _mm_load_si128((const __m128i *)(coeff + 60)); +} + +static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols 
? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). + for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + // stage 2 + u0 = in[0 * 2 + col]; + u1 = in[4 * 2 + col]; + u2 = in[2 * 2 + col]; + u3 = in[6 * 2 + col]; + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi56); + y = _mm_mullo_epi32(in[7 * 2 + col], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi8); + y = _mm_mullo_epi32(in[7 * 2 + col], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi24); + y = _mm_mullo_epi32(in[3 * 2 + col], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi40); + y = _mm_mullo_epi32(in[3 * 2 + col], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + // stage 5 + addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const 
__m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[8], v[8], x; + + // Even 8 points: 0, 2, ..., 14 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[14], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[14], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[10], cospi20); + x = _mm_mullo_epi32(in[4], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[10], cospi44); + x = _mm_mullo_epi32(in[4], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[6], cospi36); + x = _mm_mullo_epi32(in[8], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[6], cospi28); + x = _mm_mullo_epi32(in[8], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[2], cospi52); + x = _mm_mullo_epi32(in[12], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[2], cospi12); + x = _mm_mullo_epi32(in[12], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + 
addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[2] = _mm_sub_epi32(kZero, u[4]); + out[4] = u[6]; + out[6] = _mm_sub_epi32(kZero, u[2]); + out[8] = u[3]; + out[10] = _mm_sub_epi32(kZero, u[7]); + out[12] = u[5]; + out[14] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[15], cospi4); + x = _mm_mullo_epi32(in[1], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[15], cospi60); + x = _mm_mullo_epi32(in[1], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[11], cospi20); + x = _mm_mullo_epi32(in[5], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[11], cospi44); + x = _mm_mullo_epi32(in[5], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[7], cospi36); + x = _mm_mullo_epi32(in[9], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[7], cospi28); + x = _mm_mullo_epi32(in[9], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[3], cospi52); + x = _mm_mullo_epi32(in[13], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[3], cospi12); + x = _mm_mullo_epi32(in[13], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, 
&clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[1] = u[0]; + out[3] = _mm_sub_epi32(kZero, u[4]); + out[5] = u[6]; + out[7] = _mm_sub_epi32(kZero, u[2]); + out[9] = u[3]; + out[11] = _mm_sub_epi32(kZero, u[7]); + out[13] = u[5]; + out[15] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + out[0] = _mm_add_epi32(in[0], in[0]); + out[1] = _mm_add_epi32(in[1], in[1]); + out[2] = _mm_add_epi32(in[2], in[2]); + out[3] = _mm_add_epi32(in[3], in[3]); + out[4] = _mm_add_epi32(in[4], in[4]); + out[5] = _mm_add_epi32(in[5], in[5]); + out[6] = _mm_add_epi32(in[6], in[6]); + out[7] = _mm_add_epi32(in[7], in[7]); + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8); 
+ } +} + +static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi, + int fliplr, int bd) { + __m128i x0, x1; + const __m128i zero = _mm_setzero_si128(); + + x0 = _mm_unpacklo_epi16(pred, zero); + x1 = _mm_unpackhi_epi16(pred, zero); + + if (fliplr) { + res_lo = _mm_shuffle_epi32(res_lo, 0x1B); + res_hi = _mm_shuffle_epi32(res_hi, 0x1B); + x0 = _mm_add_epi32(res_hi, x0); + x1 = _mm_add_epi32(res_lo, x1); + + } else { + x0 = _mm_add_epi32(res_lo, x0); + x1 = _mm_add_epi32(res_hi, x1); + } + + x0 = _mm_packus_epi32(x0, x1); + return highbd_clamp_epi16(x0, bd); +} + +static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + + round_shift_8x8(in, shift); + + v0 = _mm_load_si128((__m128i const *)(output + 0 * stride)); + v1 = _mm_load_si128((__m128i const *)(output + 1 * stride)); + v2 = _mm_load_si128((__m128i const *)(output + 2 * stride)); + v3 = _mm_load_si128((__m128i const *)(output + 3 * stride)); + v4 = _mm_load_si128((__m128i const *)(output + 4 * stride)); + v5 = _mm_load_si128((__m128i const *)(output + 5 * stride)); + v6 = _mm_load_si128((__m128i const *)(output + 6 * stride)); + v7 = _mm_load_si128((__m128i const *)(output + 7 * stride)); + + if (flipud) { + u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); + u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); + u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); + u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); + u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); + u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); + u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); + u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd); + } else { + u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); + u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); + u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); + u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); + u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); + u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); + u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); + u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); + } + + _mm_store_si128((__m128i *)(output + 0 * stride), u0); + _mm_store_si128((__m128i *)(output + 1 * stride), u1); + _mm_store_si128((__m128i *)(output + 2 * stride), u2); + _mm_store_si128((__m128i *)(output + 3 * stride), u3); + _mm_store_si128((__m128i *)(output + 4 * stride), u4); + _mm_store_si128((__m128i *)(output + 5 * stride), u5); + _mm_store_si128((__m128i *)(output + 6 * stride), u6); + _mm_store_si128((__m128i *)(output + 7 * stride), u7); +} + +void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16], out[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_8x8(input, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, 
out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]); + transpose_8x8(out, in); + iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0); + write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd); + break; + default: assert(0); + } +} + +static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i x; + + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm_mullo_epi32(in[0], cospi32); + x = _mm_add_epi32(x, rnding); + x = _mm_srai_epi32(x, bit); + + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} + +static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm_mullo_epi32(in[1], cospi56); + y = _mm_mullo_epi32(in[7], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1], cospi8); + y = _mm_mullo_epi32(in[7], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5], cospi24); + y = _mm_mullo_epi32(in[3], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5], cospi40); + y = _mm_mullo_epi32(in[3], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; 
+ u7 = v7; + + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + // stage 5 + addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} + +static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + __m128i u[8], x; + + // stage 0 + // stage 1 + // stage 2 + + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(kZero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // stage 3 + // stage 4 + __m128i temp1, temp2; + temp1 = _mm_mullo_epi32(u[0], cospi16); + x = _mm_mullo_epi32(u[1], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm_mullo_epi32(u[0], cospi48); + x = _mm_mullo_epi32(u[1], cospi16); + u[5] = _mm_sub_epi32(temp2, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // stage 5 + // stage 6 + temp1 = _mm_mullo_epi32(u[0], cospi32); + x = _mm_mullo_epi32(u[1], cospi32); + u[2] = _mm_add_epi32(temp1, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(temp1, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + temp1 = _mm_mullo_epi32(u[4], cospi32); + x = _mm_mullo_epi32(u[5], cospi32); + u[6] = _mm_add_epi32(temp1, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(temp1, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, 
&clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[8], v[8], x; + + // stage 0 + // stage 1 + // stage 2 + + u[0] = _mm_mullo_epi32(in[7], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[7], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[5], cospi20); + x = _mm_mullo_epi32(in[2], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[5], cospi44); + x = _mm_mullo_epi32(in[2], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[3], cospi36); + x = _mm_mullo_epi32(in[4], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[3], cospi28); + x = _mm_mullo_epi32(in[4], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[1], cospi52); + x = _mm_mullo_epi32(in[6], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[1], cospi12); + x = _mm_mullo_epi32(in[6], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = 
_mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm_mullo_epi32(in[0], cospi32); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); + + // stage 5 + // stage 6 + // stage 7 + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm_add_epi32(in[0], offset); + in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + } + } + + in[0] = _mm_max_epi32(in[0], clamp_lo); + in[0] = _mm_min_epi32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; +} + +static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], x, y; + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); + + addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; + + // stage 5 + addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[5], cospi32); + y = _mm_mullo_epi32(u[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + u[10] = _mm_sub_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[13] = _mm_add_epi32(x, y); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + 
u[12] = _mm_add_epi32(x, y); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + // stage 7 + addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i v[16], x, y, temp1, temp2; + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(x, rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(zero, x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = _mm_mullo_epi32(v[8], cospi8); + x = _mm_mullo_epi32(v[9], cospi56); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[8], cospi56); + x = _mm_mullo_epi32(v[9], cospi8); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm_mullo_epi32(v[12], cospi16); + x = _mm_mullo_epi32(v[13], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[12], cospi48); + x = _mm_mullo_epi32(v[13], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + 
y = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + y = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + y = _mm_mullo_epi32(v[10], cospi32); + x = _mm_mullo_epi32(v[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + y = _mm_mullo_epi32(v[14], cospi32); + x = _mm_mullo_epi32(v[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 
= _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i zero = _mm_setzero_si128(); + __m128i u[16], x, y; + + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + u[1] = _mm_sub_epi32(zero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + x = _mm_mullo_epi32(in[2], cospi54); + u[2] = _mm_add_epi32(x, rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + x = _mm_mullo_epi32(in[2], cospi10); + u[3] = _mm_sub_epi32(zero, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + x = _mm_mullo_epi32(in[4], cospi46); + u[4] = _mm_add_epi32(x, rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(in[4], cospi18); + u[5] = _mm_sub_epi32(zero, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(in[6], cospi38); + u[6] = _mm_add_epi32(x, rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(in[6], cospi26); + u[7] = _mm_sub_epi32(zero, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + u[8] = _mm_mullo_epi32(in[7], cospi34); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + u[9] = _mm_mullo_epi32(in[7], cospi30); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + u[10] = _mm_mullo_epi32(in[5], cospi42); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_mullo_epi32(in[5], cospi22); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + u[12] = _mm_mullo_epi32(in[3], cospi50); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + u[13] = _mm_mullo_epi32(in[3], cospi14); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + u[14] = _mm_mullo_epi32(in[1], cospi58); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_mullo_epi32(in[1], cospi6); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 3 + addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = 
_mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi56); + u[8] = _mm_mullo_epi32(u[8], cospi8); + u[8] = _mm_add_epi32(u[8], x); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + x = _mm_mullo_epi32(u[9], cospi8); + u[9] = _mm_sub_epi32(y, x); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + x = _mm_mullo_epi32(u[11], cospi24); + y = _mm_mullo_epi32(u[10], cospi24); + u[10] = _mm_mullo_epi32(u[10], cospi40); + u[10] = _mm_add_epi32(u[10], x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + x = _mm_mullo_epi32(u[11], cospi40); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + x = _mm_mullo_epi32(u[13], cospi8); + y = _mm_mullo_epi32(u[12], cospi8); + u[12] = _mm_mullo_epi32(u[12], cospim56); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospim56); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi40); + y = _mm_mullo_epi32(u[14], cospi40); + u[14] = _mm_mullo_epi32(u[14], cospim24); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim24); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 5 + addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = _mm_mullo_epi32(u[5], cospi48); + y = _mm_mullo_epi32(u[4], cospi48); + u[4] = _mm_mullo_epi32(u[4], cospi16); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(u[5], cospi16); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(u[7], cospi16); + y = _mm_mullo_epi32(u[6], cospi16); + u[6] = _mm_mullo_epi32(u[6], cospim48); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(u[7], cospim48); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + x = _mm_mullo_epi32(u[13], cospi48); + y = _mm_mullo_epi32(u[12], cospi48); + u[12] = _mm_mullo_epi32(u[12], cospi16); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospi16); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi16); + y = _mm_mullo_epi32(u[14], cospi16); + u[14] = _mm_mullo_epi32(u[14], cospim48); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim48); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], 
rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 7 + addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + u[2] = _mm_add_epi32(y, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(y, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + u[10] = _mm_add_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + u[14] = _mm_add_epi32(y, x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(zero, u[8]); + out[2] = u[12]; + out[3] = _mm_sub_epi32(zero, u[4]); + out[4] = u[6]; + out[5] = _mm_sub_epi32(zero, u[14]); + out[6] = u[10]; + out[7] = _mm_sub_epi32(zero, u[2]); + out[8] = u[3]; + out[9] = _mm_sub_epi32(zero, u[11]); + out[10] = u[15]; + out[11] = _mm_sub_epi32(zero, u[7]); + out[12] = u[5]; + out[13] = _mm_sub_epi32(zero, u[13]); + out[14] = u[9]; + out[15] = _mm_sub_epi32(zero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + 
const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + y = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(x, y); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(x, y); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, 
&clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm_mullo_epi32(v[5], cospi32); + y = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_sub_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_add_epi32(x, y); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_add_epi32(x, y); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = 
_mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + const __m128i zero = _mm_setzero_si128(); + __m128i u[16], v[16], x, y; + // Calculate the column 0, 1, 2, 3 + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm_mullo_epi32(in[15], cospi2); + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(v[0], x); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_mullo_epi32(in[15], cospi62); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13], cospi10); + x = _mm_mullo_epi32(in[2], cospi54); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(in[13], cospi54); + x = _mm_mullo_epi32(in[2], cospi10); + v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_mullo_epi32(in[11], cospi18); + x = _mm_mullo_epi32(in[4], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11], cospi46); + x = _mm_mullo_epi32(in[4], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(in[9], cospi26); + x = _mm_mullo_epi32(in[6], cospi38); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(in[9], cospi38); + x = _mm_mullo_epi32(in[6], cospi26); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = _mm_mullo_epi32(in[7], cospi34); + x = _mm_mullo_epi32(in[8], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7], cospi30); + x = _mm_mullo_epi32(in[8], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5], cospi42); + x = _mm_mullo_epi32(in[10], cospi22); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = 
_mm_mullo_epi32(in[5], cospi22); + x = _mm_mullo_epi32(in[10], cospi42); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(in[3], cospi50); + x = _mm_mullo_epi32(in[12], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(in[3], cospi14); + x = _mm_mullo_epi32(in[12], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1], cospi58); + x = _mm_mullo_epi32(in[14], cospi6); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(in[1], cospi6); + x = _mm_mullo_epi32(in[14], cospi58); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 5 + addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, 
&clamp_hi); + addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 7 + addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + 
v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} +static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a0_high, a1_low, a1_high; + __m128i zero = _mm_setzero_si128(); + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 16; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16); + } +} +static INLINE void idct64_stage8_sse4_1( + __m128i *u, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + 
addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, + clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + __m128i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = 
half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + + temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, + int bd, int out_shift, + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + for (int i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out, + 4); + } + } +} + +static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + + { + __m128i x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit); + + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + } + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); + // DC-only path: broadcast the single reconstructed value to all 64 outputs. + for (int i = 0; i < 64; ++i) out[i] = x; + } +} + +static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ?
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + + { + __m128i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + __m128i temp1, temp2; + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] 
= half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); 
+ u[19] = temp2; + temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + + idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i 
cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64]; + __m128i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = 
half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim56, &u[36], 
&cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi17 = _mm_set1_epi32(cospi[17]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi19 = _mm_set1_epi32(cospi[19]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi21 = _mm_set1_epi32(cospi[21]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi23 = _mm_set1_epi32(cospi[23]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi25 = _mm_set1_epi32(cospi[25]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi27 = _mm_set1_epi32(cospi[27]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi29 = _mm_set1_epi32(cospi[29]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi31 = _mm_set1_epi32(cospi[31]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi35 = _mm_set1_epi32(cospi[35]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi39 = _mm_set1_epi32(cospi[39]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi43 = _mm_set1_epi32(cospi[43]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi47 = _mm_set1_epi32(cospi[47]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim33 = _mm_set1_epi32(-cospi[33]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + 
const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); + 
u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit); + u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 
3] = v[i + 3]; + } + + u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + 
addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], 
&rnding, bit); + v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, + &clamp_hi_out, 4); + } + } + } +} + +static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1; + + // stage 0 + // stage 1 + bf1 = in[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (do_cols) { + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + bf1 = _mm_add_epi32(bf1, offset); + bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); + } + } + + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); + out[0] = bf1; + out[1] = bf1; + out[2] = bf1; + out[3] = bf1; + out[4] = bf1; + out[5] = bf1; + out[6] = bf1; + out[7] = bf1; + out[8] = bf1; + out[9] = bf1; + out[10] = bf1; + out[11] = bf1; + out[12] = bf1; + out[13] = bf1; + out[14] = bf1; + out[15] = bf1; + out[16] = bf1; + out[17] = bf1; + out[18] = bf1; + out[19] = bf1; + out[20] = bf1; + out[21] = bf1; + out[22] = bf1; + out[23] = bf1; + out[24] = bf1; + out[25] = bf1; + out[26] = bf1; + out[27] = bf1; + out[28] = bf1; + out[29] = bf1; + out[30] = bf1; + out[31] = bf1; +} + +static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 : + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = 
_mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + + addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 
21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + // stage 4 + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit); + + addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i 
cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32], bf0[32]; + + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + bf0[17] = + half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + 
half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); + + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], 
&rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); + bf0[31] = bf1[31]; + + // stage 5 + bf1[0] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); + bf1[1] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); + addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = + half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = + half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = + half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[7] = bf1[7]; + addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = + half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = + half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 7 + addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = 
bf0[9]; + bf1[10] = + half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = + half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, 
&clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case IDTX: + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + default: + av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} +void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} +static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + for (int i = 0; i < 32; i += 16) { + out[i] = _mm_slli_epi32(in[i], 2); + out[i + 1] = _mm_slli_epi32(in[i + 1], 2); + out[i + 2] = _mm_slli_epi32(in[i + 2], 2); + out[i + 3] = _mm_slli_epi32(in[i + 3], 2); + out[i + 4] = _mm_slli_epi32(in[i + 4], 2); + out[i + 5] = _mm_slli_epi32(in[i + 5], 2); + out[i + 6] = _mm_slli_epi32(in[i + 6], 2); + out[i + 7] = _mm_slli_epi32(in[i + 7], 2); + out[i + 8] = _mm_slli_epi32(in[i + 8], 2); + out[i + 9] = _mm_slli_epi32(in[i + 9], 2); + out[i + 10] = _mm_slli_epi32(in[i + 10], 2); + out[i + 11] = _mm_slli_epi32(in[i + 11], 2); + out[i + 12] = _mm_slli_epi32(in[i + 12], 2); + out[i + 13] = _mm_slli_epi32(in[i + 13], 2); + out[i + 14] = _mm_slli_epi32(in[i + 14], 2); + out[i + 15] = _mm_slli_epi32(in[i + 15], 2); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} +static const transform_1d_sse4_1 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { idct4x4_sse4_1, NULL, NULL, NULL }, + { iadst4x4_sse4_1, NULL, NULL, NULL }, + { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL }, + }, + { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, + { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, + { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } }, + { + { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, + 
NULL }, + { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, + NULL }, + { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL }, + }, + { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, + idct32x32_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { iidentity32_sse4_1, NULL, NULL, NULL } }, + { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, + idct64x64_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; +static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = buf_size_w >> 2; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { + __m128i buf0[16]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + + for (int j = 0; j < buf_size_w_div4; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + _buf1[j * txfm_size_row + 0] = buf0_cur[0]; + _buf1[j * txfm_size_row + 1] = buf0_cur[1]; + _buf1[j * txfm_size_row + 2] = buf0_cur[2]; + _buf1[j * txfm_size_row + 3] = buf0_cur[3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } +} +static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + 
const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[16]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[64 * 4]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int row_max = AOMMIN(32, txfm_size_row); + const int input_stride = row_max; + const int buf_size_w = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = buf_size_w >> 2; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[32]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + for (int j = 0; j < buf_size_w_div4; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + _buf1[j * 
txfm_size_row + 0] = buf0_cur[0]; + _buf1[j * txfm_size_row + 1] = buf0_cur[1]; + _buf1[j * txfm_size_row + 2] = buf0_cur[2]; + _buf1[j * txfm_size_row + 3] = buf0_cur[3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, 0, txfm_size_row, + bd); + } + } +} +static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div4 = txfm_size_col >> 2; + const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + __m128i buf0[64]; + load_buffer_32bit_input(input + i * 4, input_stride, buf0, + buf_size_nonzero_w); + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div4; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1, + bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} + +static void 
highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; + const int input_stride = AOMMIN(32, txfm_size_row); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col); + load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col); + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]); + + if (lr_flip) { + TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], + buf1[7]); + } else { + TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_col, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < 2; i++) { + __m128i *buf1_cur = buf1_ptr + i * txfm_size_row; + transpose_32bit_4x4(buf1_cur, buf1_cur); + col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + // write to buffer + highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip, + txfm_size_row, bd); +} + +static void 
highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_row); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + for (int i = 0; i < (txfm_size_row >> 2); i++) { + const int32_t *input_row = input + i * 4; + __m128i *buf0_cur = buf0 + i * 4; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col); + row_txfm(buf0_cur, buf0_cur, INV_COS_BIT, 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + __m128i *buf1_cur = buf1_ptr + i * txfm_size_row; + transpose_32bit_4x4(buf1_cur, buf1_cur); + col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2, + 
output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } +} + +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_sse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + highbd_inv_txfm2d_add_h_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + highbd_inv_txfm2d_add_v_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case IDTX: + highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + default: assert(0); break; + } +} + +void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1( + input, dest, 
stride, txfm_param->tx_type, tx_size, txfm_param->eob, + txfm_param->bd); + break; + } +} diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c new file mode 100644 index 0000000000..6dcac10e45 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -0,0 +1,849 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +void av1_highbd_dist_wtd_convolve_2d_copy_avx2(const uint16_t *src, + int src_stride, uint16_t *dst0, + int dst_stride0, int w, int h, + ConvolveParams *conv_params, + int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + int i, j; + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const __m256i offset_const_16b = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ?
4095 : 255)); + + assert(bits <= 4); + + if (!(w % 16)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 16) { + const __m256i src_16bit = + _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j])); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + + if (do_average) { + const __m256i data_0 = + _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + + _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 8) { + const __m128i src_row_0 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride])); + // since not all compilers yet support _mm256_set_m128i() + const __m256i src_10 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_0), src_row_1, 1); + + const __m256i res = _mm256_sll_epi16(src_10, left_shift); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i res_32b = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b, offset_const); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, 
offset_const_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
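+ // For example, with 12-bit input and FILTER_BITS == 7, round_0 must have + // been raised to at least 5 so that 12 + 7 + 2 - 5 == 16.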
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[8], coeffs_y[4], coeffs_x[4]; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = _mm256_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i round_const_y = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = _mm256_setzero_si256(); + if (i + 1 < im_h) + row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i 
= 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_y), round_shift_y); + + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * 
dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + + int i, j; + __m256i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sll_epi32(res_even, round_shift_bits); + res_odd = _mm256_sll_epi32(res_odd, round_shift_bits); + + __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd); + + __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = highbd_comp_avg( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd); + __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + 
const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_y_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + + assert(bits >= 0); + int i, j; + __m256i s[8], coeffs_y[4]; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i round_const_y = + _mm256_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits); + res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a_round, round_const_y), round_shift_y); + + __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = 
_mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits); + res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b_round, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c new file mode 100644 index 0000000000..5a7fc536a2 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_sse4_1.h" + +void av1_highbd_dist_wtd_convolve_y_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + + assert(bits >= 0); + int i, j; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i round_const_y = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + __m128i s[16], coeffs_y[4]; + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits); + res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y), + round_shift_y); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits); +
res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y), + round_shift_y); + + __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const); + __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i comp_avg_res_0 = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_1 = + highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b_0 = + _mm_packus_epi32(round_result_0, round_result_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_1, round_result_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + + } else { + __m128i res_16b_0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0); + + __m128i res_16b_1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16b_1); + } + } else { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits); + res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b_round0, round_const_y), round_shift_y); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits); + res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b_round1, round_const_y), round_shift_y); + + __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const); + __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const); + + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero); + const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero); + + const __m128i comp_avg_res_lo_0 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_lo_1 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi_0 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi_1 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, + &wt0, &wt1, use_dist_wtd_comp_avg); + + const 
__m128i round_result_lo_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_lo_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b_0 = + _mm_packus_epi32(round_result_lo_0, round_result_hi_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_lo_1, round_result_hi_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + } else { + __m128i res_16bit0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0); + __m128i res_16bit1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_16bit1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + + int i, j; + __m128i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i zero = _mm_setzero_si128(); + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sll_epi32(res_even, round_shift_bits); + res_odd = _mm_sll_epi32(res_odd, round_shift_bits); + + __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const); + if (w - j < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + + const __m128i comp_avg_res = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i round_result = highbd_convolve_rounding_sse2( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = _mm_packus_epi32(round_result, round_result); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b); + } + } else { + __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const); + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h new file mode 100644 index 0000000000..5734810f52 --- /dev/null +++ 
b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ +#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ + +#include /* SSE4.1 */ + +#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \ + do { \ + __m128i u0, u1, u2, u3; \ + u0 = _mm_unpacklo_epi32(x0, x1); \ + u1 = _mm_unpackhi_epi32(x0, x1); \ + u2 = _mm_unpacklo_epi32(x2, x3); \ + u3 = _mm_unpackhi_epi32(x2, x3); \ + y0 = _mm_unpacklo_epi64(u0, u2); \ + y1 = _mm_unpackhi_epi64(u0, u2); \ + y2 = _mm_unpacklo_epi64(u1, u3); \ + y3 = _mm_unpackhi_epi64(u1, u3); \ + } while (0) + +static INLINE void transpose_8x8(const __m128i *in, __m128i *out) { + TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]); + TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]); + TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]); + TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13], + out[15]); +} + +static INLINE void transpose_16x16(const __m128i *in, __m128i *out) { + // Upper left 8x8 + TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]); + TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24], + out[28]); + TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9], + out[13]); + TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25], + out[29]); + + // Upper right 8x8 + TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40], + out[44]); + TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56], + out[60]); + TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41], + out[45]); + TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57], + out[61]); + + // Lower left 8x8 + TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10], + out[14]); + TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26], + out[30]); + TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11], + out[15]); + TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27], + out[31]); + // Lower right 8x8 + TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42], + out[46]); + TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58], + out[62]); + TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43], + out[47]); + TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59], + out[63]); +} + +static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output, + const int width, const int height) { + const int numcol = height >> 2; + const int numrow = width >> 2; + for (int j = 0; j < numrow; j++) { + for (int i = 0; i < numcol; i++) { + TRANSPOSE_4X4(input[i * width + j + (numrow * 0)], + input[i * width + j + (numrow * 1)], + input[i * width + j + (numrow * 2)], + input[i * width + j + (numrow * 3)], + output[j * height + i + (numcol * 
0)], + output[j * height + i + (numcol * 1)], + output[j * height + i + (numcol * 2)], + output[j * height + i + (numcol * 3)]); + } + } +} + +// Note: +// rounding = 1 << (bit - 1) +static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, + const __m128i *w1, const __m128i *n1, + const __m128i *rounding, int bit) { + __m128i x, y; + + x = _mm_mullo_epi32(*w0, *n0); + y = _mm_mullo_epi32(*w1, *n1); + x = _mm_add_epi32(x, y); + x = _mm_add_epi32(x, *rounding); + x = _mm_srai_epi32(x, bit); + return x; +} + +static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, + const __m128i *rounding, int bit) { + __m128i x; + + x = _mm_mullo_epi32(*w0, *n0); + x = _mm_add_epi32(x, *rounding); + x = _mm_srai_epi32(x, bit); + return x; +} + +typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift); + +typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + const int num_cols); + +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd); + +#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ diff --git a/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c b/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c new file mode 100644 index 0000000000..75108b49da --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_warp_affine_avx2.c @@ -0,0 +1,656 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +void av1_highbd_warp_affine_avx2(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m256i tmp[15]; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); + const __m256i reduce_bits_vert_const = + _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); + const __m256i res_sub_const = + _mm256_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m256i round_bits_const = _mm256_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + + __m256i v_rbhoriz = _mm256_set1_epi32(1 << (reduce_bits_horiz - 1)); + __m256i v_zeros = _mm256_setzero_si256(); + int ohoriz = 1 << offset_bits_horiz; + int mhoriz = 1 << max_bits_horiz; + (void)mhoriz; + int sx; + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + const int16_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int16_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + if (ix4 <= -7) { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm256_cvtepi16_epi32(_mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)))); + } + } else if (ix4 >= width + 6) { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm256_cvtepi16_epi32( + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz)))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + int32_t tmp1[8]; + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + const int iy = clamp(iy4 + k, 0, height - 1); + + sx = sx4 + beta * (k + 4); + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + const int offs = sx >> WARPEDDIFF_PREC_BITS; + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + const int sample_x = clamp(ix + m, 0, width - 1); + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, 
reduce_bits_horiz); + tmp1[(l + 4) / 2 + ((l + 4) % 2) * 4] = sum; + sx += alpha; + } + tmp[k + 7] = _mm256_loadu_si256((__m256i *)tmp1); + } + } else { + if (beta == 0 && alpha == 0) { + sx = sx4; + __m128i v_01 = _mm_loadu_si128( + (__m128i *) + av1_warped_filter[sx >> + WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0 + __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0 + __m256i v_c23 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2 + __m256i v_c45 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4 + __m256i v_c67 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6 + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + iy = iy * stride; + + __m256i v_refl = _mm256_inserti128_si256( + _mm256_setzero_si256(), + _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); + v_refl = _mm256_inserti128_si256( + v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), + 1); // R15 .. R0 + + __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); + + __m256i v_refu = + _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 + v_refl = _mm256_inserti128_si256( + v_refl, _mm256_extracti128_si256(v_refu, 0), 1); + v_refu = _mm256_inserti128_si256( + v_refu, _mm256_extracti128_si256(v_ref, 0), 0); + + __m256i v_sum = _mm256_set1_epi32(ohoriz); + __m256i parsum = _mm256_madd_epi16( + v_c01, _mm256_alignr_epi8(v_refu, v_refl, + 0)); // R8R7R6..R1R7R6R5..R1R0 + __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); + + parsum = _mm256_madd_epi16( + v_c23, + _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2 + __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); + parsum = _mm256_madd_epi16( + v_c45, _mm256_alignr_epi8(v_refu, v_refl, + 8)); // R12R11..R5R11R10..R5R4 + __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); + parsum = _mm256_madd_epi16( + v_c67, _mm256_alignr_epi8(v_refu, v_refl, + 12)); // R14R13..R7R13R12..R7R6 + __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); + + tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), + reduce_bits_horiz); + } + } else if (alpha == 0) { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + iy = iy * stride; + + sx = sx4 + beta * (k + 4); + + __m128i v_01 = _mm_loadu_si128( + (__m128i *)av1_warped_filter + [sx >> WARPEDDIFF_PREC_BITS]); // A7A6A5A4A3A2A1A0 + __m256i v_c01 = _mm256_broadcastd_epi32(v_01); // A1A0A1A0A1A0A1A0 + __m256i v_c23 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 1)); // A3A2A3A2A3A2A3A2 + __m256i v_c45 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 2)); // A5A4A5A4A5A4A5A4 + __m256i v_c67 = _mm256_broadcastd_epi32( + _mm_shuffle_epi32(v_01, 3)); // A7A6A7A6A7A6A7A6 + + __m256i v_refl = _mm256_inserti128_si256( + _mm256_setzero_si256(), + _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); + v_refl = _mm256_inserti128_si256( + v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), + 1); // R15 .. 
R0 + + __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); + + __m256i v_refu = + _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 + + v_refl = _mm256_inserti128_si256( + v_refl, _mm256_extracti128_si256(v_refu, 0), 1); + v_refu = _mm256_inserti128_si256( + v_refu, _mm256_extracti128_si256(v_ref, 0), 0); + + __m256i v_sum = _mm256_set1_epi32(ohoriz); + __m256i parsum = + _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); + __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); + + parsum = + _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); + __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); + parsum = + _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); + __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); + parsum = _mm256_madd_epi16(v_c67, + _mm256_alignr_epi8(v_refu, v_refl, 12)); + __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); + + tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), + reduce_bits_horiz); + } + } else if (beta == 0) { + sx = sx4; + __m256i v_coeff01 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]), + 0); + v_coeff01 = _mm256_inserti128_si256( + v_coeff01, + _mm_loadu_si128( + (__m128i *) + av1_warped_filter[(sx + alpha) >> WARPEDDIFF_PREC_BITS]), + 1); // B7B6..B1B0A7A6..A1A0 + __m256i v_coeff23 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 2 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff23 = _mm256_inserti128_si256( + v_coeff23, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 3 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // D7D6..D1D0C7C6..C1C0 + __m256i v_coeff45 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 4 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff45 = _mm256_inserti128_si256( + v_coeff45, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 5 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // F7F6..F1F0E7E6..E1E0 + __m256i v_coeff67 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 6 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff67 = _mm256_inserti128_si256( + v_coeff67, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 7 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // H7H6..H1H0G7G6..G1G0 + + __m256i v_c0123 = _mm256_unpacklo_epi32( + v_coeff01, + v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 + __m256i v_c0123u = _mm256_unpackhi_epi32( + v_coeff01, + v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 + __m256i v_c4567 = _mm256_unpacklo_epi32( + v_coeff45, + v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 + __m256i v_c4567u = _mm256_unpackhi_epi32( + v_coeff45, + v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 + + __m256i v_c01 = _mm256_unpacklo_epi64( + v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 + __m256i v_c23 = + _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 + __m256i v_c45 = + _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 + __m256i v_c67 = + _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... A7A6 + + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + iy = iy * stride; + + __m256i v_refl = _mm256_inserti128_si256( + _mm256_setzero_si256(), + _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); + v_refl = _mm256_inserti128_si256( + v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), + 1); // R15 .. 
R0 + + __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); + + __m256i v_refu = + _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 + + v_refl = _mm256_inserti128_si256( + v_refl, _mm256_extracti128_si256(v_refu, 0), 1); + v_refu = _mm256_inserti128_si256( + v_refu, _mm256_extracti128_si256(v_ref, 0), 0); + + __m256i v_sum = _mm256_set1_epi32(ohoriz); + __m256i parsum = _mm256_madd_epi16( + v_c01, _mm256_alignr_epi8(v_refu, v_refl, + 0)); // R8R7R6..R1R7R6R5..R1R0 + __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); + + parsum = _mm256_madd_epi16( + v_c23, + _mm256_alignr_epi8(v_refu, v_refl, 4)); // R10R9..R3R9R8..R3R2 + __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); + parsum = _mm256_madd_epi16( + v_c45, _mm256_alignr_epi8(v_refu, v_refl, + 8)); // R12R11..R5R11R10..R5R4 + __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); + parsum = _mm256_madd_epi16( + v_c67, _mm256_alignr_epi8(v_refu, v_refl, + 12)); // R14R13..R7R13R12..R7R6 + __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); + + tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), + reduce_bits_horiz); + } + + } else { + for (int k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + iy = iy * stride; + + sx = sx4 + beta * (k + 4); + + __m256i v_coeff01 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx) >> WARPEDDIFF_PREC_BITS]), + 0); + v_coeff01 = _mm256_inserti128_si256( + v_coeff01, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // B7B6..B1B0A7A6..A1A0 + __m256i v_coeff23 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 2 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff23 = _mm256_inserti128_si256( + v_coeff23, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 3 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // D7D6..D1D0C7C6..C1C0 + __m256i v_coeff45 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 4 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff45 = _mm256_inserti128_si256( + v_coeff45, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 5 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // F7F6..F1F0E7E6..E1E0 + __m256i v_coeff67 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 6 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff67 = _mm256_inserti128_si256( + v_coeff67, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sx + 7 * alpha) >> + WARPEDDIFF_PREC_BITS]), + 1); // H7H6..H1H0G7G6..G1G0 + + __m256i v_c0123 = _mm256_unpacklo_epi32( + v_coeff01, + v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 + __m256i v_c0123u = _mm256_unpackhi_epi32( + v_coeff01, + v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 + __m256i v_c4567 = _mm256_unpacklo_epi32( + v_coeff45, + v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 + __m256i v_c4567u = _mm256_unpackhi_epi32( + v_coeff45, + v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 + + __m256i v_c01 = _mm256_unpacklo_epi64( + v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 + __m256i v_c23 = + _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 + __m256i v_c45 = + _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 + __m256i v_c67 = + _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... 
A7A6 + + __m256i v_refl = _mm256_inserti128_si256( + _mm256_setzero_si256(), + _mm_loadu_si128((__m128i *)&ref[iy + ix4 - 7]), 0); + v_refl = _mm256_inserti128_si256( + v_refl, _mm_loadu_si128((__m128i *)&ref[iy + ix4 + 1]), + 1); // R15 .. R0 + + __m256i v_ref = _mm256_permute4x64_epi64(v_refl, 0xEE); + + __m256i v_refu = + _mm256_alignr_epi8(v_ref, v_refl, 2); // R8R15R14...R2R1 + + v_refl = _mm256_inserti128_si256( + v_refl, _mm256_extracti128_si256(v_refu, 0), 1); + v_refu = _mm256_inserti128_si256( + v_refu, _mm256_extracti128_si256(v_ref, 0), 0); + + __m256i v_sum = _mm256_set1_epi32(ohoriz); + __m256i parsum = + _mm256_madd_epi16(v_c01, _mm256_alignr_epi8(v_refu, v_refl, 0)); + __m256i v_sum1 = _mm256_add_epi32(v_sum, parsum); + + parsum = + _mm256_madd_epi16(v_c23, _mm256_alignr_epi8(v_refu, v_refl, 4)); + __m256i v_sum2 = _mm256_add_epi32(v_sum1, parsum); + parsum = + _mm256_madd_epi16(v_c45, _mm256_alignr_epi8(v_refu, v_refl, 8)); + __m256i v_sum3 = _mm256_add_epi32(v_sum2, parsum); + parsum = _mm256_madd_epi16(v_c67, + _mm256_alignr_epi8(v_refu, v_refl, 12)); + __m256i v_sum4 = _mm256_add_epi32(v_sum3, parsum); + + tmp[k + 7] = _mm256_srai_epi32(_mm256_add_epi32(v_sum4, v_rbhoriz), + reduce_bits_horiz); + } + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + const __m256i *src = tmp + (k + 4); + + __m256i v_coeff01 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128( + (__m128i *)av1_warped_filter[(sy) >> WARPEDDIFF_PREC_BITS]), + 0); + v_coeff01 = _mm256_inserti128_si256( + v_coeff01, + _mm_loadu_si128( + (__m128i *) + av1_warped_filter[(sy + gamma) >> WARPEDDIFF_PREC_BITS]), + 1); + __m256i v_coeff23 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 2 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff23 = _mm256_inserti128_si256( + v_coeff23, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 3 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 1); + __m256i v_coeff45 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 4 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff45 = _mm256_inserti128_si256( + v_coeff45, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 5 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 1); + __m256i v_coeff67 = _mm256_inserti128_si256( + v_zeros, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 6 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 0); + v_coeff67 = _mm256_inserti128_si256( + v_coeff67, + _mm_loadu_si128((__m128i *)av1_warped_filter[(sy + 7 * gamma) >> + WARPEDDIFF_PREC_BITS]), + 1); + + __m256i v_c0123 = _mm256_unpacklo_epi32( + v_coeff01, + v_coeff23); // D3D2B3B2D1D0B1B0C3C2A3A2C1C0A1A0 + __m256i v_c0123u = _mm256_unpackhi_epi32( + v_coeff01, + v_coeff23); // D7D6B7B6D5D4B5B4C7C6A7A6C5C4A5A4 + __m256i v_c4567 = _mm256_unpacklo_epi32( + v_coeff45, + v_coeff67); // H3H2F3F2H1H0F1F0G3G2E3E2G1G0E1E0 + __m256i v_c4567u = _mm256_unpackhi_epi32( + v_coeff45, + v_coeff67); // H7H6F7F6H5H4F5F4G7G6E7E6G5G4E5E4 + + __m256i v_c01 = _mm256_unpacklo_epi64( + v_c0123, v_c4567); // H1H0F1F0D1D0B1B0G1G0E1E0C1C0A1A0 + __m256i v_c23 = + _mm256_unpackhi_epi64(v_c0123, v_c4567); // H3H2 ... A3A2 + __m256i v_c45 = + _mm256_unpacklo_epi64(v_c0123u, v_c4567u); // H5H4 ... A5A4 + __m256i v_c67 = + _mm256_unpackhi_epi64(v_c0123u, v_c4567u); // H7H6 ... 
A7A6 + + __m256i v_src01l = + _mm256_unpacklo_epi32(src[0], src[1]); // T13T03T11T01T12T02T10T00 + __m256i v_src01u = + _mm256_unpackhi_epi32(src[0], src[1]); // T17T07T15T05T16T06T14T04 + __m256i v_sum = + _mm256_madd_epi16(_mm256_packus_epi32(v_src01l, v_src01u), + v_c01); // S7S5S3S1S6S4S2S0 + + __m256i v_src23l = _mm256_unpacklo_epi32(src[2], src[3]); + __m256i v_src23u = _mm256_unpackhi_epi32(src[2], src[3]); + v_sum = _mm256_add_epi32( + v_sum, + _mm256_madd_epi16(_mm256_packus_epi32(v_src23l, v_src23u), v_c23)); + + __m256i v_src45l = _mm256_unpacklo_epi32(src[4], src[5]); + __m256i v_src45u = _mm256_unpackhi_epi32(src[4], src[5]); + v_sum = _mm256_add_epi32( + v_sum, + _mm256_madd_epi16(_mm256_packus_epi32(v_src45l, v_src45u), v_c45)); + + __m256i v_src67l = _mm256_unpacklo_epi32(src[6], src[7]); + __m256i v_src67u = _mm256_unpackhi_epi32(src[6], src[7]); + v_sum = _mm256_add_epi32( + v_sum, + _mm256_madd_epi16(_mm256_packus_epi32(v_src67l, v_src67u), v_c67)); + + // unpack S7S5S3S1S6S4S2S0 to S7S6S5S4S3S2S1S0 + + __m256i v_suml = + _mm256_permute4x64_epi64(v_sum, 0xD8); // S7S5S6S4S3S1S2S0 + __m256i v_sumh = + _mm256_permute4x64_epi64(v_sum, 0x32); // S2S0S7S5S2S0S3S1 + v_sum = _mm256_unpacklo_epi32(v_suml, v_sumh); // S7S6S5S4S3S2S1S0 + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + + v_sum = _mm256_add_epi32(v_sum, res_add_const); + v_sum = + _mm256_sra_epi32(_mm256_add_epi32(v_sum, reduce_bits_vert_const), + reduce_bits_vert_shift); + if (conv_params->do_average) { + __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m256i p_32 = _mm256_cvtepu16_epi32(_mm_loadu_si128(p)); + + if (conv_params->use_dist_wtd_comp_avg) { + v_sum = _mm256_add_epi32(_mm256_mullo_epi32(p_32, wt0), + _mm256_mullo_epi32(v_sum, wt1)); + v_sum = _mm256_srai_epi32(v_sum, DIST_PRECISION_BITS); + } else { + v_sum = _mm256_srai_epi32(_mm256_add_epi32(p_32, v_sum), 1); + } + + __m256i v_sum1 = _mm256_add_epi32(v_sum, res_sub_const); + v_sum1 = _mm256_sra_epi32( + _mm256_add_epi32(v_sum1, round_bits_const), round_bits_shift); + + __m256i v_sum16 = _mm256_packus_epi32(v_sum1, v_sum1); + v_sum16 = _mm256_permute4x64_epi64(v_sum16, 0xD8); + v_sum16 = _mm256_min_epi16(v_sum16, clip_pixel); + _mm_storeu_si128(dst16, _mm256_extracti128_si256(v_sum16, 0)); + } else { + v_sum = _mm256_packus_epi32(v_sum, v_sum); + __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum, 0xD8); + _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0)); + } + } else { + // Round and pack into 8 bits + const __m256i round_const = + _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + + __m256i v_sum1 = _mm256_srai_epi32( + _mm256_add_epi32(v_sum, round_const), reduce_bits_vert); + + v_sum1 = _mm256_packus_epi32(v_sum1, v_sum1); + __m256i v_sum16 = _mm256_permute4x64_epi64(v_sum1, 0xD8); + // Clamp res_16bit to the range [0, 2^bd - 1] + const __m256i max_val = _mm256_set1_epi16((1 << bd) - 1); + const __m256i zero = _mm256_setzero_si256(); + v_sum16 = _mm256_max_epi16(_mm256_min_epi16(v_sum16, max_val), zero); + + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + _mm_storeu_si128(p, _mm256_extracti128_si256(v_sum16, 0)); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c new file mode 100644 index 0000000000..96fb4cf632 --- /dev/null +++ 
b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4, 6, 8, 10, + 12, 14, 1, 3, 5, 7, + 9, 11, 13, 15 }; + +static const uint8_t highbd_shuffle_alpha0_mask0[16] = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; +static const uint8_t highbd_shuffle_alpha0_mask1[16] = { + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; +static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9, + 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11 }; +static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13, + 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15 }; + +static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS))); + + // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2 + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6 + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2 + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6 + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6 + coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); + // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6 + coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10); + // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6 + coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14); + // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6 + coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14); + + // Filter odd-index pixels + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeff[3] = 
_mm_unpackhi_epi64(tmp_9, tmp_11); + coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0( + int sx, __m128i *coeff) { + // Filter coeff + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS))); + + coeff[0] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0)); + coeff[2] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1)); + coeff[4] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2)); + coeff[6] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3)); + + coeff[1] = coeff[0]; + coeff[3] = coeff[2]; + coeff[5] = coeff[4]; + coeff[7] = coeff[6]; +} + +static INLINE void highbd_filter_src_pixels( + const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, const int reduce_bits_horiz, int k) { + const __m128i src_1 = *src; + const __m128i src2_1 = *src2; + + const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]); + + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]); + + __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + // Combine results into one register. + // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 + // as this order helps with the vertical filter. 
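+  // Concretely: _mm_packs_epi32(res_even, res_odd) puts the four even-column
+  // sums in 16-bit lanes 0..3 and the four odd-column sums in lanes 4..7, so
+  // the lane order of tmp[k + 7] is columns 0 2 4 6 1 3 5 7. The vertical
+  // pass can then use _mm_unpacklo_epi16 / _mm_unpackhi_epi16 on consecutive
+  // rows of tmp[] to pair the same column from two rows and feed
+  // _mm_madd_epi16 directly.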
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); +} + +static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2, + __m128i *tmp, int sx, int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff); + highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); +} + +static INLINE void highbd_warp_horizontal_filter_alpha0_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_alpha0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + 
highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void highbd_prepare_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + highbd_warp_horizontal_filter_alpha0_beta0( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha == 0 && beta != 0) + highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha != 0 && beta == 0) + highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else + highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + +void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(!(bd == 12 && reduce_bits_horiz < 5)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const __m128i res_sub_const = + _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. 
+ */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. + if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + const __m128i src_01 = _mm_shuffle_epi8( + src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + const __m128i src2_01 = _mm_shuffle_epi8( + src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + + __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01); + __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left); + src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left); + } + + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right); + src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right); 
+ } + + const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); + const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi); + + highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } else { + highbd_prepare_warp_horizontal_filter( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + } + + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = 
_mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo = _mm_add_epi32(res_lo, res_add_const); + res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), + reduce_bits_vert_shift); + + if (conv_params->do_average) { + __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); + + if (conv_params->use_dist_wtd_comp_avg) { + res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), + _mm_mullo_epi32(res_lo, wt1)); + res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); + } else { + res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1); + } + + __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const); + res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const), + round_bits_shift); + + __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo); + res16_lo = _mm_min_epi16(res16_lo, clip_pixel); + _mm_storel_epi64(dst16, res16_lo); + } else { + res_lo = _mm_packus_epi32(res_lo, res_lo); + _mm_storel_epi64(p, res_lo); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = _mm_add_epi32(res_hi, res_add_const); + res_hi = + _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), + reduce_bits_vert_shift); + if (conv_params->do_average) { + __m128i *const dst16_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); + + if (conv_params->use_dist_wtd_comp_avg) { + res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), + _mm_mullo_epi32(res_hi, wt1)); + res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); + } else { + res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1); + } + + __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const); + res32_hi = _mm_sra_epi32( + _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift); + __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi); + res16_hi = _mm_min_epi16(res16_hi, clip_pixel); + _mm_storel_epi64(dst16_4, res16_hi); + } else { + res_hi = _mm_packus_epi32(res_hi, res_hi); + _mm_storel_epi64(p4, res_hi); + } + } + } else { + // Round and pack into 8 bits + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), reduce_bits_vert); + + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + // Clamp res_16bit to the range [0, 2^bd - 1] + const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); + const __m128i zero = _mm_setzero_si128(); + res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels 
at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + _mm_storel_epi64(p, res_16bit); + } else { + _mm_storeu_si128(p, res_16bit); + } + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c new file mode 100644 index 0000000000..562c623fa9 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. +// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be +// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ]. +void av1_highbd_wiener_convolve_add_src_avx2( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const WienerConvolveParams *conv_params, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + DECLARE_ALIGNED(32, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero_128 = _mm_setzero_si128(); + const __m256i zero_256 = _mm256_setzero_si256(); + + // Add an offset to account for the "add_src" part of the convolve function. 
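+  // Lane 3 is the centre tap of the 8-tap (SUBPEL_TAPS) filter (center_tap
+  // above), so adding 1 << FILTER_BITS there in effect folds an identity
+  // contribution into the coefficients: the convolution output becomes the
+  // source pixel (at FILTER_BITS precision) plus the Wiener correction,
+  // without needing a separate add-source pass.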
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + + const __m256i clamp_low = zero_256; + + /* Horizontal filter */ + { + const __m256i clamp_high_ep = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = _mm256_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (int i = 0; i < intermediate_height; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *src_ij = src_ptr + i * src_stride + j; + + // Load 16-bit src data + const __m256i src_0 = yy_loadu_256(src_ij + 0); + const __m256i src_1 = yy_loadu_256(src_ij + 1); + const __m256i src_2 = yy_loadu_256(src_ij + 2); + const __m256i src_3 = yy_loadu_256(src_ij + 3); + const __m256i src_4 = yy_loadu_256(src_ij + 4); + const __m256i src_5 = yy_loadu_256(src_ij + 5); + const __m256i src_6 = yy_loadu_256(src_ij + 6); + const __m256i src_7 = yy_loadu_256(src_ij + 7); + + // Multiply src data by filter coeffs and sum pairs + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + // Calculate scalar product for even- and odd-indices separately, + // increasing to 32-bit precision + const __m256i res_even_sum = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); + const __m256i res_even = _mm256_srai_epi32( + _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); + + const __m256i res_odd_sum = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); + const __m256i res_odd = _mm256_srai_epi32( + _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); + + // Reduce to 16-bit precision and pack even- and odd-index results + // back into one register. 
The _mm256_packs_epi32 intrinsic returns + // a register with the pixels ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i res = _mm256_packs_epi32(res_even, res_odd); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep); + + // Store in a temporary array + yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); + } + } + } + + /* Vertical filter */ + { + const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j; + + // Load 16-bit data from the output of the horizontal filter in + // which the pixels are ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE); + const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE); + const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE); + const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE); + const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE); + const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE); + const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE); + const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE); + + // Filter the even-indices, increasing to 32-bit precision + const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); + const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); + const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); + const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); + + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + + const __m256i res_even = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); + + // Filter the odd-indices, increasing to 32-bit 
precision + const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); + const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); + const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); + const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); + + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + const __m256i res_odd = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); + + // Pixels are currently in the following order: + // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] + // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] + // + // Rearrange the pixels into the following order: + // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] + // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] + const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); + + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo, round_const), conv_params->round_1); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi, round_const), conv_params->round_1); + + // Reduce to 16-bit precision and pack into the correct order: + // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] + const __m256i res_16bit = + _mm256_packs_epi32(res_lo_round, res_hi_round); + const __m256i res_16bit_clamped = _mm256_min_epi16( + _mm256_max_epi16(res_16bit, clamp_low), clamp_high); + + // Store in the dst array + yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c new file mode 100644 index 0000000000..cab37fa910 --- /dev/null +++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +void av1_highbd_wiener_convolve_add_src_ssse3( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const WienerConvolveParams *conv_params, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function. + const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + conv_params->round_0); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + conv_params->round_0); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + const __m128i maxval = + _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1); + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = 
_mm_min_epi16(_mm_max_epi16(res, zero), maxval); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), conv_params->round_1); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), conv_params->round_1); + + const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, res_16bit); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c new file mode 100644 index 0000000000..3eee46faeb --- /dev/null +++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { + if (!strength) return; + + DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = { + { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4 + { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5 + { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2 + }; + + DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = { + { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + }; + + // Extend the first and last samples to simplify the loop for the 5-tap case + p[-1] = p[0]; + __m128i last = _mm_set1_epi8((char)p[sz - 1]); + _mm_storeu_si128((__m128i *)&p[sz], last); + + // Adjust input pointer for filter support area + uint8_t *in = (strength == 3) ? p - 1 : p; + + // Avoid modifying first sample + uint8_t *out = p + 1; + int len = sz - 1; + + const int use_3tap_filter = (strength < 3); + + if (use_3tap_filter) { + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]); + __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); + __m128i in0 = _mm_lddqu_si128((__m128i *)in); + while (len > 0) { + int n_out = (len < 8) ? 
len : 8; + __m128i d0 = _mm_shuffle_epi8(in0, shuf0); + __m128i d1 = _mm_shuffle_epi8(in0, shuf1); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srai_epi16(d0, 4); + d0 = _mm_packus_epi16(d0, d0); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi8(n_out); + __m128i mask = _mm_cmpgt_epi8(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storel_epi64((__m128i *)out, out0); + __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); + in0 = _mm_alignr_epi8(in1, in0, 8); + in += 8; + out += 8; + len -= n_out; + } + } else { // 5-tap filter + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i two = _mm_set1_epi8(2); + __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]); + __m128i shuf_b = _mm_add_epi8(shuf_a, two); + __m128i shuf_c = _mm_add_epi8(shuf_b, two); + __m128i shuf_d = _mm_add_epi8(shuf_c, two); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); + __m128i in0 = _mm_lddqu_si128((__m128i *)in); + while (len > 0) { + int n_out = (len < 8) ? len : 8; + __m128i d0 = _mm_shuffle_epi8(in0, shuf_a); + __m128i d1 = _mm_shuffle_epi8(in0, shuf_b); + __m128i d2 = _mm_shuffle_epi8(in0, shuf_c); + __m128i d3 = _mm_shuffle_epi8(in0, shuf_d); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d2 = _mm_maddubs_epi16(d2, coef0); + d3 = _mm_maddubs_epi16(d3, coef0); + d0 = _mm_hadd_epi16(d0, d1); + d2 = _mm_hadd_epi16(d2, d3); + d0 = _mm_hadd_epi16(d0, d2); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srai_epi16(d0, 4); + d0 = _mm_packus_epi16(d0, d0); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi8(n_out); + __m128i mask = _mm_cmpgt_epi8(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storel_epi64((__m128i *)out, out0); + __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); + in0 = _mm_alignr_epi8(in1, in0, 8); + in += 8; + out += 8; + len -= n_out; + } + } +} + +void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { + // interpolate half-sample positions + assert(sz <= 24); + + DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = { + { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } + }; + + DECLARE_ALIGNED( + 16, static const int8_t, + v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } }; + + // Extend first/last samples (upper-left p[-1], last p[sz-1]) + // to support 4-tap filter + p[-2] = p[-1]; + p[sz] = p[sz - 1]; + + uint8_t *in = &p[-2]; + uint8_t *out = &p[-2]; + + int n = sz + 1; // Input length including upper-left sample + + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); + + __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); + __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]); + + while (n > 0) { + __m128i in8 = _mm_alignr_epi8(in16, in0, 8); + __m128i d0 = _mm_shuffle_epi8(in0, shuf0); + __m128i d1 = _mm_shuffle_epi8(in0, shuf1); + __m128i d2 = _mm_shuffle_epi8(in8, shuf0); + __m128i d3 = _mm_shuffle_epi8(in8, shuf1); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d2 = _mm_maddubs_epi16(d2, coef0); + d3 = _mm_maddubs_epi16(d3, coef0); + d0 = _mm_hadd_epi16(d0, d1); + d2 = _mm_hadd_epi16(d2, d3); + __m128i eight = _mm_set1_epi16(8); + 
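+    // The (-1, 9, 9, -1) kernel sums to 16, so adding 8 before the
+    // arithmetic shift right by 4 implements rounded division by 16;
+    // _mm_packus_epi16 afterwards saturates the result to [0, 255].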
d0 = _mm_add_epi16(d0, eight); + d2 = _mm_add_epi16(d2, eight); + d0 = _mm_srai_epi16(d0, 4); + d2 = _mm_srai_epi16(d2, 4); + d0 = _mm_packus_epi16(d0, d2); + __m128i in1 = _mm_alignr_epi8(in16, in0, 1); + __m128i out0 = _mm_unpacklo_epi8(in1, d0); + __m128i out1 = _mm_unpackhi_epi8(in1, d0); + _mm_storeu_si128((__m128i *)&out[0], out0); + _mm_storeu_si128((__m128i *)&out[16], out1); + in0 = in16; + in16 = _mm_setzero_si128(); + out += 32; + n -= 16; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH + +void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) { + if (!strength) return; + + DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = { + { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4 + { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5 + { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2 + }; + + DECLARE_ALIGNED(16, static const int16_t, + v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; + + // Extend the first and last samples to simplify the loop for the 5-tap case + p[-1] = p[0]; + __m128i last = _mm_set1_epi16(p[sz - 1]); + _mm_storeu_si128((__m128i *)&p[sz], last); + + // Adjust input pointer for filter support area + uint16_t *in = (strength == 3) ? p - 1 : p; + + // Avoid modifying first sample + uint16_t *out = p + 1; + int len = sz - 1; + + const int use_3tap_filter = (strength < 3); + + if (use_3tap_filter) { + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + while (len > 0) { + int n_out = (len < 8) ? len : 8; + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in02 = _mm_add_epi16(in0, in2); + __m128i d0 = _mm_unpacklo_epi16(in02, in1); + __m128i d1 = _mm_unpackhi_epi16(in02, in1); + d0 = _mm_mullo_epi16(d0, coef0); + d1 = _mm_mullo_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srli_epi16(d0, 4); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi16(n_out); + __m128i mask = _mm_cmpgt_epi16(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storeu_si128((__m128i *)out, out0); + in += 8; + in0 = in8; + in8 = _mm_lddqu_si128((__m128i *)&in[8]); + out += 8; + len -= n_out; + } + } else { // 5-tap filter + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + while (len > 0) { + int n_out = (len < 8) ? 
len : 8; + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in3 = _mm_alignr_epi8(in8, in0, 6); + __m128i in4 = _mm_alignr_epi8(in8, in0, 8); + __m128i in04 = _mm_add_epi16(in0, in4); + __m128i in123 = _mm_add_epi16(in1, in2); + in123 = _mm_add_epi16(in123, in3); + __m128i d0 = _mm_unpacklo_epi16(in04, in123); + __m128i d1 = _mm_unpackhi_epi16(in04, in123); + d0 = _mm_mullo_epi16(d0, coef0); + d1 = _mm_mullo_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srli_epi16(d0, 4); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi16(n_out); + __m128i mask = _mm_cmpgt_epi16(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storeu_si128((__m128i *)out, out0); + in += 8; + in0 = in8; + in8 = _mm_lddqu_si128((__m128i *)&in[8]); + out += 8; + len -= n_out; + } + } +} + +void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) { + // interpolate half-sample positions + assert(sz <= 24); + + DECLARE_ALIGNED(16, static const int16_t, + kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } }; + + // Extend first/last samples (upper-left p[-1], last p[sz-1]) + // to support 4-tap filter + p[-2] = p[-1]; + p[sz] = p[sz - 1]; + + uint16_t *in = &p[-2]; + uint16_t *out = in; + int n = sz + 1; + + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); + __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]); + + while (n > 0) { + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in3 = _mm_alignr_epi8(in8, in0, 6); + __m128i sum0 = _mm_add_epi16(in0, in3); + __m128i sum1 = _mm_add_epi16(in1, in2); + __m128i d0 = _mm_unpacklo_epi16(sum0, sum1); + __m128i d1 = _mm_unpackhi_epi16(sum0, sum1); + __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); + d0 = _mm_madd_epi16(d0, coef0); + d1 = _mm_madd_epi16(d1, coef0); + __m128i eight = _mm_set1_epi32(8); + d0 = _mm_add_epi32(d0, eight); + d1 = _mm_add_epi32(d1, eight); + d0 = _mm_srai_epi32(d0, 4); + d1 = _mm_srai_epi32(d1, 4); + d0 = _mm_packus_epi32(d0, d1); + __m128i max0 = _mm_set1_epi16((1 << bd) - 1); + d0 = _mm_min_epi16(d0, max0); + __m128i out0 = _mm_unpacklo_epi16(in1, d0); + __m128i out1 = _mm_unpackhi_epi16(in1, d0); + _mm_storeu_si128((__m128i *)&out[0], out0); + _mm_storeu_si128((__m128i *)&out[8], out1); + in0 = in8; + in8 = in16; + in16 = in24; + in24 = _mm_setzero_si128(); + out += 16; + n -= 8; + } +} + +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c new file mode 100644 index 0000000000..9f82ed2300 --- /dev/null +++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c @@ -0,0 +1,1124 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +#include "av1/common/convolve.h" + +static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); + const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + return wt; +} + +static INLINE __m256i load_line2_avx2(const void *a, const void *b) { + return _mm256_permute2x128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); +} + +void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j, is_horiz_4tap = 0; + const int bits = FILTER_BITS - conv_params->round_1; + const __m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + + assert(bits >= 0); + assert(conv_params->round_0 > 0); + + const __m256i round_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + __m256i filt[4], coeffs[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; + + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i 
res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x(data, coeffs, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j, is_vert_4tap = 0; + // +1 to compensate for dividing the filter coeffs by 2 + const int left_shift = FILTER_BITS - conv_params->round_0 + 1; + const __m256i round_const = + _mm256_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + const __m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int 
use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int offset_1 = (1 << (bd + FILTER_BITS - 2)); + const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); + const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + const __m256i zero = _mm256_setzero_si256(); + __m256i coeffs[4], s[8]; + + assert((FILTER_BITS - conv_params->round_0) >= 0); + + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src4; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[4]; + __m256i src_a[5]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 4; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src4 = src_a[4]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + + s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + } + + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 5) * src_stride + j]; + const __m256i src5 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); + + src4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); + + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); + + res_lo = _mm256_add_epi16(res_lo, offset_const_1); + + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); + + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); 
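+            // In this path each ymm register carries output row i in its low
+            // 128-bit lane and row i + 1 in its high lane, so the two lanes
+            // are extracted separately and written to consecutive rows of
+            // dst0 below.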
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = 
&src_ptr[j]; + __m256i src6; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[7]; + __m256i src_a[7]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 6; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src6 = src_a[6]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); + s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); + } + + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 7) * src_stride + j]; + const __m256i src7 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + __m256i res_lo = convolve_lowbd(s, coeffs); + + res_lo = _mm256_add_epi16(res_lo, offset_const_1); + + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); + + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i 
res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + + int im_stride = 8; + int i, is_horiz_4tap = 0, is_vert_4tap = 0; + const __m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + + assert(conv_params->round_0 > 0); + + const __m256i round_const_h = 
_mm256_set1_epi16( + ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + __m256i filt[4], coeffs_x[4], coeffs_y[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) + is_vert_4tap = 1; + + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + for (i = 0; i < im_h; i += 2) { + __m256i data = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); + src_h += (src_stride << 1); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), + round_shift_h); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; + } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; + + /* Vertical filter */ + __m256i s[6]; + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + + s[3] = _mm256_unpackhi_epi16(s0, s1); + s[4] = _mm256_unpackhi_epi16(s2, s3); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); + + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); + + const __m256i res_a = convolve_4tap(s, coeffs_y + 1); + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + + if (w - j > 4) { + const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + const __m256i res_16b = 
_mm256_packs_epi32(res_a_round, res_b_round); + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); + + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); + + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); + *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; + + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; + } + } +} + +#define DO_NO_AVG_2D_COPY_4X16(r0, c0, r1, c1, r2, c2, r3, c3) \ + do { \ + src_0 = _mm256_cvtepu8_epi16( \ + _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ + src_1 = _mm256_cvtepu8_epi16( \ + _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ + src_2 = _mm256_cvtepu8_epi16( \ + _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ + src_3 = _mm256_cvtepu8_epi16( \ + _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ + \ + src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ + src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ + src_2 = _mm256_slli_epi16(src_2, LEFT_SHIFT); \ + src_3 = 
_mm256_slli_epi16(src_3, LEFT_SHIFT); \ + \ + src_0 = _mm256_add_epi16(src_0, offset_const); \ + src_1 = _mm256_add_epi16(src_1, offset_const); \ + src_2 = _mm256_add_epi16(src_2, offset_const); \ + src_3 = _mm256_add_epi16(src_3, offset_const); \ + \ + _mm256_store_si256((__m256i *)(&dst[r0 * dst_stride + c0]), src_0); \ + _mm256_store_si256((__m256i *)(&dst[r1 * dst_stride + c1]), src_1); \ + _mm256_store_si256((__m256i *)(&dst[r2 * dst_stride + c2]), src_2); \ + _mm256_store_si256((__m256i *)(&dst[r3 * dst_stride + c3]), src_3); \ + } while (0) + +#define LEFT_SHIFT (2 * FILTER_BITS - 3 - 7) +static AOM_INLINE void av1_dist_wtd_convolve_2d_no_avg_copy_avx2( + const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst, int dst_stride, + int w, int h, const __m256i offset_const) { + int i = h; + if (w >= 16) { + __m256i src_0, src_1, src_2, src_3; + if (w == 128) { + do { + DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); + DO_NO_AVG_2D_COPY_4X16(0, 64, 0, 80, 0, 96, 0, 112); + src += 1 * src_stride; + dst += 1 * dst_stride; + i -= 1; + } while (i); + } else if (w == 64) { + do { + DO_NO_AVG_2D_COPY_4X16(0, 0, 0, 16, 0, 32, 0, 48); + src += 1 * src_stride; + dst += 1 * dst_stride; + i -= 1; + } while (i); + } else if (w == 32) { + do { + DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 0, 16, 1, 16); + src += 2 * src_stride; + dst += 2 * dst_stride; + i -= 2; + } while (i); + } else if (w == 16) { + do { + DO_NO_AVG_2D_COPY_4X16(0, 0, 1, 0, 2, 0, 3, 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + i -= 4; + } while (i); + } + } else { + const __m256i zero = _mm256_setzero_si256(); + do { + const __m128i src_row_0 = + _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); + const __m128i src_row_1 = + _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); + const __m128i src_row_2 = + _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); + const __m128i src_row_3 = + _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); + + __m256i src_10 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_0), src_row_1, 1); + __m256i src_32 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_2), src_row_3, 1); + + src_10 = _mm256_unpacklo_epi8(src_10, zero); + src_32 = _mm256_unpacklo_epi8(src_32, zero); + + src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); + src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); + + src_10 = _mm256_add_epi16(src_10, offset_const); + src_32 = _mm256_add_epi16(src_32, offset_const); + + // Accumulate values into the destination buffer + _mm_store_si128((__m128i *)(&dst[0 * dst_stride]), + _mm256_castsi256_si128(src_10)); + _mm_store_si128((__m128i *)(&dst[1 * dst_stride]), + _mm256_extracti128_si256(src_10, 1)); + _mm_store_si128((__m128i *)(&dst[2 * dst_stride]), + _mm256_castsi256_si128(src_32)); + _mm_store_si128((__m128i *)(&dst[3 * dst_stride]), + _mm256_extracti128_si256(src_32, 1)); + + src += 4 * src_stride; + dst += 4 * dst_stride; + i -= 4; + } while (i); + } +} + +#define DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, r0, c0, r1, c1, r2, c2, r3, c3) \ + do { \ + src_0 = _mm256_cvtepu8_epi16( \ + _mm_loadu_si128((__m128i *)(&src[r0 * src_stride + c0]))); \ + src_1 = _mm256_cvtepu8_epi16( \ + _mm_loadu_si128((__m128i *)(&src[r1 * src_stride + c1]))); \ + src_2 = _mm256_cvtepu8_epi16( \ + _mm_loadu_si128((__m128i *)(&src[r2 * src_stride + c2]))); \ + src_3 = _mm256_cvtepu8_epi16( \ + _mm_loadu_si128((__m128i *)(&src[r3 * src_stride + c3]))); \ + \ + src_0 = _mm256_slli_epi16(src_0, LEFT_SHIFT); \ + src_1 = _mm256_slli_epi16(src_1, LEFT_SHIFT); \ + src_2 = _mm256_slli_epi16(src_2, 
LEFT_SHIFT); \ + src_3 = _mm256_slli_epi16(src_3, LEFT_SHIFT); \ + src_0 = _mm256_add_epi16(src_0, offset_const); \ + src_1 = _mm256_add_epi16(src_1, offset_const); \ + src_2 = _mm256_add_epi16(src_2, offset_const); \ + src_3 = _mm256_add_epi16(src_3, offset_const); \ + \ + ref_0 = _mm256_loadu_si256((__m256i *)(&dst[r0 * dst_stride + c0])); \ + ref_1 = _mm256_loadu_si256((__m256i *)(&dst[r1 * dst_stride + c1])); \ + ref_2 = _mm256_loadu_si256((__m256i *)(&dst[r2 * dst_stride + c2])); \ + ref_3 = _mm256_loadu_si256((__m256i *)(&dst[r3 * dst_stride + c3])); \ + \ + res_0 = comp_avg(&ref_0, &src_0, &wt, USE_DIST_WEIGHTED); \ + res_1 = comp_avg(&ref_1, &src_1, &wt, USE_DIST_WEIGHTED); \ + res_2 = comp_avg(&ref_2, &src_2, &wt, USE_DIST_WEIGHTED); \ + res_3 = comp_avg(&ref_3, &src_3, &wt, USE_DIST_WEIGHTED); \ + \ + res_0 = convolve_rounding(&res_0, &offset_const, &rounding_const, \ + rounding_shift); \ + res_1 = convolve_rounding(&res_1, &offset_const, &rounding_const, \ + rounding_shift); \ + res_2 = convolve_rounding(&res_2, &offset_const, &rounding_const, \ + rounding_shift); \ + res_3 = convolve_rounding(&res_3, &offset_const, &rounding_const, \ + rounding_shift); \ + \ + res_10 = _mm256_packus_epi16(res_0, res_1); \ + res_32 = _mm256_packus_epi16(res_2, res_3); \ + res_10 = _mm256_permute4x64_epi64(res_10, 0xD8); \ + res_32 = _mm256_permute4x64_epi64(res_32, 0xD8); \ + \ + _mm_store_si128((__m128i *)(&dst0[r0 * dst_stride0 + c0]), \ + _mm256_castsi256_si128(res_10)); \ + _mm_store_si128((__m128i *)(&dst0[r1 * dst_stride0 + c1]), \ + _mm256_extracti128_si256(res_10, 1)); \ + _mm_store_si128((__m128i *)(&dst0[r2 * dst_stride0 + c2]), \ + _mm256_castsi256_si128(res_32)); \ + _mm_store_si128((__m128i *)(&dst0[r3 * dst_stride0 + c3]), \ + _mm256_extracti128_si256(res_32, 1)); \ + } while (0) + +#define DO_AVG_2D_COPY(USE_DIST_WEIGHTED) \ + int i = h; \ + if (w >= 16) { \ + __m256i src_0, src_1, src_2, src_3; \ + __m256i ref_0, ref_1, ref_2, ref_3; \ + __m256i res_0, res_1, res_2, res_3; \ + __m256i res_10, res_32; \ + if (w == 128) { \ + do { \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 64, 0, 80, 0, 96, 0, 112); \ + i -= 1; \ + src += 1 * src_stride; \ + dst += 1 * dst_stride; \ + dst0 += 1 * dst_stride0; \ + } while (i); \ + } else if (w == 64) { \ + do { \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 0, 16, 0, 32, 0, 48); \ + \ + i -= 1; \ + src += 1 * src_stride; \ + dst += 1 * dst_stride; \ + dst0 += 1 * dst_stride0; \ + } while (i); \ + } else if (w == 32) { \ + do { \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 0, 16, 1, 16); \ + \ + i -= 2; \ + src += 2 * src_stride; \ + dst += 2 * dst_stride; \ + dst0 += 2 * dst_stride0; \ + } while (i); \ + } else { \ + assert(w == 16); \ + do { \ + DO_AVG_2D_COPY_4X16(USE_DIST_WEIGHTED, 0, 0, 1, 0, 2, 0, 3, 0); \ + \ + i -= 4; \ + src += 4 * src_stride; \ + dst += 4 * dst_stride; \ + dst0 += 4 * dst_stride0; \ + } while (i); \ + } \ + } else if (w == 8) { \ + do { \ + const __m128i src_0 = \ + _mm_loadl_epi64((__m128i *)(&src[0 * src_stride])); \ + const __m128i src_1 = \ + _mm_loadl_epi64((__m128i *)(&src[1 * src_stride])); \ + const __m128i src_2 = \ + _mm_loadl_epi64((__m128i *)(&src[2 * src_stride])); \ + const __m128i src_3 = \ + _mm_loadl_epi64((__m128i *)(&src[3 * src_stride])); \ + __m256i src_10 = \ + _mm256_insertf128_si256(_mm256_castsi128_si256(src_0), src_1, 1); \ + __m256i src_32 = \ + _mm256_insertf128_si256(_mm256_castsi128_si256(src_2), src_3, 1); \ + 
\ + src_10 = _mm256_unpacklo_epi8(src_10, zero); \ + src_32 = _mm256_unpacklo_epi8(src_32, zero); \ + \ + src_10 = _mm256_slli_epi16(src_10, LEFT_SHIFT); \ + src_32 = _mm256_slli_epi16(src_32, LEFT_SHIFT); \ + \ + src_10 = _mm256_add_epi16(src_10, offset_const); \ + src_32 = _mm256_add_epi16(src_32, offset_const); \ + \ + const __m256i ref_10 = \ + load_line2_avx2(&dst[0 * dst_stride], &dst[1 * dst_stride]); \ + const __m256i ref_32 = \ + load_line2_avx2(&dst[2 * dst_stride], &dst[3 * dst_stride]); \ + __m256i res_10 = comp_avg(&ref_10, &src_10, &wt, USE_DIST_WEIGHTED); \ + __m256i res_32 = comp_avg(&ref_32, &src_32, &wt, USE_DIST_WEIGHTED); \ + \ + res_10 = convolve_rounding(&res_10, &offset_const, &rounding_const, \ + rounding_shift); \ + res_32 = convolve_rounding(&res_32, &offset_const, &rounding_const, \ + rounding_shift); \ + \ + __m256i res = _mm256_packus_epi16(res_10, res_32); \ + const __m128i res_20 = _mm256_castsi256_si128(res); \ + const __m128i res_31 = _mm256_extracti128_si256(res, 1); \ + \ + _mm_storel_epi64((__m128i *)(&dst0[0 * dst_stride0]), res_20); \ + _mm_storel_epi64((__m128i *)((&dst0[1 * dst_stride0])), res_31); \ + _mm_storeh_epi64((__m128i *)(&dst0[2 * dst_stride0]), res_20); \ + _mm_storeh_epi64((__m128i *)((&dst0[3 * dst_stride0])), res_31); \ + i -= 4; \ + src += 4 * src_stride; \ + dst += 4 * dst_stride; \ + dst0 += 4 * dst_stride0; \ + } while (i); \ + } else { \ + assert(w == 4); \ + do { \ + __m256i src_3210_8bit = \ + _mm256_setr_epi32(loadu_int32(src + 0 * src_stride), \ + loadu_int32(src + 1 * src_stride), 0, 0, \ + loadu_int32(src + 2 * src_stride), \ + loadu_int32(src + 3 * src_stride), 0, 0); \ + \ + __m256i src_3210 = _mm256_unpacklo_epi8(src_3210_8bit, zero); \ + src_3210 = _mm256_slli_epi16(src_3210, LEFT_SHIFT); \ + src_3210 = _mm256_add_epi16(src_3210, offset_const); \ + \ + __m256i ref_3210 = \ + _mm256_setr_epi64x(*(int64_t *)(dst + 0 * dst_stride), \ + *(int64_t *)(dst + 1 * dst_stride), \ + *(int64_t *)(dst + 2 * dst_stride), \ + *(int64_t *)(dst + 3 * dst_stride)); \ + __m256i res_3210 = \ + comp_avg(&ref_3210, &src_3210, &wt, USE_DIST_WEIGHTED); \ + \ + res_3210 = convolve_rounding(&res_3210, &offset_const, &rounding_const, \ + rounding_shift); \ + \ + res_3210 = _mm256_packus_epi16(res_3210, res_3210); \ + const __m128i res_10 = _mm256_castsi256_si128(res_3210); \ + const __m128i res_32 = _mm256_extracti128_si256(res_3210, 1); \ + \ + *(int *)(&dst0[0 * dst_stride0]) = _mm_cvtsi128_si32(res_10); \ + *(int *)(&dst0[2 * dst_stride0]) = _mm_cvtsi128_si32(res_32); \ + *(int *)(&dst0[1 * dst_stride0]) = _mm_extract_epi32(res_10, 1); \ + *(int *)(&dst0[3 * dst_stride0]) = _mm_extract_epi32(res_32, 1); \ + i -= 4; \ + src += 4 * src_stride; \ + dst += 4 * dst_stride; \ + dst0 += 4 * dst_stride0; \ + } while (i); \ + } + +void av1_dist_wtd_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, + int h, ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + assert(conv_params->round_0 == 3); + assert(conv_params->round_1 == 7); + assert(w % 4 == 0); + assert(h % 4 == 0); + + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const __m256i wt = unpack_weights_avx2(conv_params); + const __m256i zero = _mm256_setzero_si256(); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) 
+ (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + + if (do_average) { + if (use_dist_wtd_comp_avg) { + DO_AVG_2D_COPY(1) + } else { + DO_AVG_2D_COPY(0) + } + } else { + av1_dist_wtd_convolve_2d_no_avg_copy_avx2(src, src_stride, dst, dst_stride, + w, h, offset_const); + } +} +#undef LEFT_SHIFT diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c new file mode 100644 index 0000000000..8c5d9918fb --- /dev/null +++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c @@ -0,0 +1,606 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + __m128i coeffs[4]; + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); + + if (w == 4) { + do { + const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + const __m128i res_lo = convolve_lo_x(s, coeffs); + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); + + const __m128i
res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[0]), res_unsigned); + } + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } while (--h); + } else { + assert(!(w % 8)); + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + // Filter even-index pixels + s[0] = data; + s[1] = _mm_srli_si128(data, 2); + s[2] = _mm_srli_si128(data, 4); + s[3] = _mm_srli_si128(data, 6); + const __m128i res_even = convolve_lo_x(s, coeffs); + + // Filter odd-index pixels + s[0] = _mm_srli_si128(data, 1); + s[1] = _mm_srli_si128(data, 3); + s[2] = _mm_srli_si128(data, 5); + s[3] = _mm_srli_si128(data, 7); + const __m128i res_odd = convolve_lo_x(s, coeffs); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); + const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + j += 8; + } while (j < w); + } while (++i < h); + } +} + +void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset); + const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * 
FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + __m128i coeffs[4]; + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); + + if (w == 4) { + __m128i s[8], src6, res, res_shift; + src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(int *)(src_ptr + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(int *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + __m128i res_16b = _mm_packs_epi32(res_shift, res_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + + res = convolve_lo_y(s + 1, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + res_16b = _mm_packs_epi32(res_shift, res_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(int *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; 
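+ // Shift the sliding window of interleaved source rows down by two, so the next iteration only has to load and unpack two new rows.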
+ + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + h -= 2; + } while (h); + } else { + assert(!(w % 8)); + int j = 0; + do { + __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + + __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, 
round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } +} + +void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i temp_lo, temp_hi; + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 4); + temp_hi = _mm_slli_si128(src_hi, 12); + const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 8); + temp_hi = _mm_slli_si128(src_hi, 8); + const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_4 = _mm_madd_epi16(src_4, 
coeff_45); + temp_lo = _mm_srli_si128(src_lo, 12); + temp_hi = _mm_slli_si128(src_hi, 4); + const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + temp_lo = _mm_srli_si128(src_lo, 2); + temp_hi = _mm_slli_si128(src_hi, 14); + const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 6); + temp_hi = _mm_slli_si128(src_hi, 10); + const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 10); + temp_hi = _mm_slli_si128(src_hi, 6); + const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + temp_lo = _mm_srli_si128(src_lo, 14); + temp_hi = _mm_slli_si128(src_hi, 2); + const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + 
_mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c new file mode 100644 index 0000000000..f6bf67815d --- /dev/null +++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_dist_wtd_convolve_2d_ssse3( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 =
_mm_alignr_epi8(src_hi, src_lo, 2); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + 
_mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c new file mode 100644 index 0000000000..71fab7a577 --- /dev/null +++ b/third_party/aom/av1/common/x86/reconinter_avx2.c @@ -0,0 +1,624 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "av1/common/blockd.h" + +static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, + const __m256i s1) { + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)); + return _mm256_abs_epi16( + _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skipped because diff is always in the range (38, 54) +} +void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ?
AOM_BLEND_A64_MAX_ALPHA : 0; + const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = xx_loadl_32(src0); + const __m128i s0B = xx_loadl_32(src0 + src0_stride); + const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); + const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); + const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); + + const __m128i s1A = xx_loadl_32(src1); + const __m128i s1B = xx_loadl_32(src1 + src1_stride); + const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); + const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); + const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD); + const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + const __m128i x_m8 = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); + xx_storeu_128(mask, x_m8); + src0 += (src0_stride << 2); + src1 += (src1_stride << 2); + mask += 16; + i += 4; + } while (i < h); + } else if (8 == w) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); + const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); + const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); + const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); + const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); + const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); + yy_storeu_256(mask, m8); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (16 == w) { + do { + const __m128i s0A = xx_load_128(src0); + const __m128i s0B = xx_load_128(src0 + src0_stride); + const __m128i s1A = xx_load_128(src1); + const __m128i s1B = xx_load_128(src1 + src1_stride); + const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); + const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); + const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); + const __m256i s1BL = _mm256_cvtepu8_epi16(s1B); + + const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL); + const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL); + + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); + yy_storeu_256(mask, m8); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else { + do { + int j = 0; + do { + const __m256i s0 = yy_loadu_256(src0 + j); + const __m256i s1 = yy_loadu_256(src1 + j); + const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0)); + const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); + const __m256i s0H = + 
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1)); + const __m256i s1H = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); + const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L); + const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H); + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8); + yy_storeu_256(mask + j, m8); + j += 32; + } while (j < w); + src0 += src0_stride; + src1 += src1_stride; + mask += w; + i += 1; + } while (i < h); + } +} + +static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + return diff_clamp; +} + +static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, + int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp); + return diff_const_16; +} + +static INLINE void build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD 
= + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i 
s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +static INLINE void build_compound_diffwtd_mask_d16_inv_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = + calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD = + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + 
const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = 
_mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. + assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + + if (mask_type == DIFFWTD_38) { + build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } else { + build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH + +void av1_build_compound_diffwtd_mask_highbd_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 16) { + av1_build_compound_diffwtd_mask_highbd_ssse3( + mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd); + } else { + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + assert(bd >= 8); + assert((w % 16) == 0); + const __m256i y0 = _mm256_setzero_si256(); + const __m256i yAOM_BLEND_A64_MAX_ALPHA = + _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m256i ymask_base = _mm256_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + 
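+ // Note: _mm256_packus_epi16 packs within each 128-bit lane, so the 16
+ // valid bytes land in 64-bit lanes 0 and 2; the permute below with
+ // _MM_SHUFFLE(0, 0, 2, 0) gathers them into the low 128 bits before the
+ // 16-byte store.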
m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} + +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c new file mode 100644 index 0000000000..eb4a4d1da3 --- /dev/null +++ b/third_party/aom/av1/common/x86/reconinter_sse4.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 +#include /* SSE4.1 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "av1/common/blockd.h" +#include "config/av1_rtcd.h" + +static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0, + const __m128i s1) { + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1)); + return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skiped for diff is always in the range ( 38, 54) +} + +void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ? 
AOM_BLEND_A64_MAX_ALPHA : 0; + const __m128i mask_base = _mm_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = _mm_cvtsi32_si128(*(int *)src0); + const __m128i s0B = _mm_cvtsi32_si128(*(int *)(src0 + stride0)); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0 = _mm_cvtepu8_epi16(s0AB); + + const __m128i s1A = _mm_cvtsi32_si128(*(int *)src1); + const __m128i s1B = _mm_cvtsi32_si128(*(int *)(src1 + stride1)); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1 = _mm_cvtepu8_epi16(s1AB); + + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + + *(int *)mask = _mm_cvtsi128_si32(m8); + *(int *)(mask + w) = _mm_extract_epi32(m8, 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += 8; + i += 2; + } while (i < h); + } else if (8 == w) { + do { + __m128i s0 = _mm_loadl_epi64((__m128i const *)src0); + __m128i s1 = _mm_loadl_epi64((__m128i const *)src1); + s0 = _mm_cvtepu8_epi16(s0); + s1 = _mm_cvtepu8_epi16(s1); + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + _mm_storel_epi64((__m128i *)mask, m8); + src0 += stride0; + src1 += stride1; + mask += 8; + i += 1; + } while (i < h); + } else { + const __m128i zero = _mm_setzero_si128(); + do { + int j = 0; + do { + const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j)); + const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j)); + const __m128i s0L = _mm_cvtepu8_epi16(s0); + const __m128i s1L = _mm_cvtepu8_epi16(s1); + const __m128i s0H = _mm_unpackhi_epi8(s0, zero); + const __m128i s1H = _mm_unpackhi_epi8(s1, zero); + + const __m128i m16L = calc_mask(mask_base, s0L, s1L); + const __m128i m16H = calc_mask(mask_base, s0H, s1H); + + const __m128i m8 = _mm_packus_epi16(m16L, m16H); + _mm_store_si128((__m128i *)(mask + j), m8); + j += 16; + } while (j < w); + src0 += stride0; + src1 += stride1; + mask += w; + i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_sse4_1( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1; + const int mask_base = 38; + int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + const __m128i round_const = _mm_set1_epi16((1 << round) >> 1); + const __m128i mask_base_16 = _mm_set1_epi16(mask_base); + const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i add_const = + _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0)); + const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1)); + + int i, j; + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. 
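+ // The per-pixel mask computed below is
+ //   m = clamp(38 + (((|src0 - src1| + round_const) >> round) >> DIFF_FACTOR_LOG2), 0, 64),
+ // and becomes 64 - m for DIFFWTD_38_INV via add_sign and add_const.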
+ assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data_src0 = + _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]); + const __m128i data_src1 = + _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]); + + const __m128i diffa = _mm_subs_epu16(data_src0, data_src1); + const __m128i diffb = _mm_subs_epu16(data_src1, data_src0); + const __m128i diff = _mm_max_epu16(diffa, diffb); + const __m128i diff_round = + _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round); + const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16); + __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff); + // clamp to 0 can be skipped since we are using add and saturate + // instruction + + const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign); + const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16); + + // Store values into the destination buffer + __m128i *const dst = (__m128i *)&mask[i * w + j]; + + if ((w - j) > 4) { + _mm_storel_epi64(dst, res_8); + } else { // w==4 + *(int *)dst = _mm_cvtsi128_si32(res_8); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c new file mode 100644 index 0000000000..c9a3709a62 --- /dev/null +++ b/third_party/aom/av1/common/x86/reconinter_ssse3.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/av1_rtcd.h" + +#if CONFIG_AV1_HIGHBITDEPTH + +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/blockd.h" + +void av1_build_compound_diffwtd_mask_highbd_ssse3( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 8) { + av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride, + src1, src1_stride, h, w, bd); + } else { + assert(bd >= 8); + assert((w % 8) == 0); + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + const __m128i x0 = _mm_setzero_si128(); + const __m128i xAOM_BLEND_A64_MAX_ALPHA = + _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m128i xmask_base = _mm_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} + +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/common/x86/resize_ssse3.c b/third_party/aom/av1/common/x86/resize_ssse3.c new file 
mode 100644 index 0000000000..a7fdb5a9a4 --- /dev/null +++ b/third_party/aom/av1/common/x86/resize_ssse3.c @@ -0,0 +1,974 @@ +/* + * + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSSE3 +#include "config/av1_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_ssse3.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "av1/common/resize.h" + +static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( + const uint8_t *const src, const __m128i *const mask) { + const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); + const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); + const __m128i a_and = _mm_and_si128(a, *mask); + const __m128i b_and = _mm_and_si128(b, *mask); + return _mm_packus_epi16(a_and, b_and); +} + +static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + // It utilizes the fact that the high byte of filter[3] is always 0 to clean + // half of f[0] and f[4]. + assert(filter[3] >= 0 && filter[3] < 256); + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); + f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); +} + +static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + // compensate the subtracted 64 in f[1]. x4 is always non negative. + const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); + // add and saturate the results together + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x4); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + // compensate the subtracted 64 in f[2]. x5 is always non negative. 
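+ // x5 = s[2] * 64 restores the 64 that the caller subtracted from f[2], so
+ // the accumulated sum matches convolution with the original filter taps.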
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); + __m128i temp; + + // add and saturate the results together + temp = _mm_adds_epi16(x0, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x3); + temp = _mm_adds_epi16(temp, x4); + temp = _mm_adds_epi16(temp, x5); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static void scale_plane_2_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi16(0x00FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask); + _mm_storeu_si128((__m128i *)dst, d); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi32(0x000000FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask); + const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask); + const __m128i d2 = _mm_packus_epi16(d0, d1); + _mm_storeu_si128((__m128i *)dst, d2); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s, + const __m128i c0c1) { + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); + const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); + // round and shift by 7 bit each 16 bit + const __m128i t2 = _mm_adds_epi16(t0, k_64); + const __m128i t3 = _mm_adds_epi16(t1, k_64); + const __m128i t4 = _mm_srai_epi16(t2, 7); + const __m128i t5 = _mm_srai_epi16(t3, 7); + return _mm_packus_epi16(t4, t5); +} + +static void scale_plane_2_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[2], d[2]; + + // Horizontal + // Even rows + s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + // odd rows + s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + d[1] = scale_plane_bilinear_kernel(s, c0c1); + + // Vertical + s[0] = _mm_unpacklo_epi8(d[0], d[1]); + s[1] = _mm_unpackhi_epi8(d[0], d[1]); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + 
__m128i s[8], d[8]; + + // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. + // Here we tried to not use shuffle instructions which would be slow + // on some x86 CPUs. + + // Horizontal + // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx + // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx + // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx + // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx + // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx + // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx + // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx + // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx + s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); + s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); + s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); + s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); + s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); + s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); + + // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx + // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx + // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx + // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx + // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx + // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx + // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx + // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx + d[0] = _mm_unpacklo_epi16(s[0], s[4]); + d[1] = _mm_unpackhi_epi16(s[0], s[4]); + d[2] = _mm_unpacklo_epi16(s[1], s[5]); + d[3] = _mm_unpackhi_epi16(s[1], s[5]); + d[4] = _mm_unpacklo_epi16(s[2], s[6]); + d[5] = _mm_unpackhi_epi16(s[2], s[6]); + d[6] = _mm_unpacklo_epi16(s[3], s[7]); + d[7] = _mm_unpackhi_epi16(s[3], s[7]); + + // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx + // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx + // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx + // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx + // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx + // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx + // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx + // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx + s[0] = _mm_unpacklo_epi32(d[0], d[1]); + s[1] = _mm_unpackhi_epi32(d[0], d[1]); + s[2] = _mm_unpacklo_epi32(d[2], d[3]); + s[3] = _mm_unpackhi_epi32(d[2], d[3]); + s[4] = _mm_unpacklo_epi32(d[4], d[5]); + s[5] = _mm_unpackhi_epi32(d[4], d[5]); + s[6] = _mm_unpacklo_epi32(d[6], d[7]); + s[7] = _mm_unpackhi_epi32(d[6], d[7]); + + // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D + // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D + // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D + // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D + d[0] = _mm_unpacklo_epi32(s[0], s[1]); + d[1] = _mm_unpacklo_epi32(s[2], s[3]); + d[2] = _mm_unpacklo_epi32(s[4], s[5]); + d[3] = _mm_unpacklo_epi32(s[6], s[7]); + + d[0] = scale_plane_bilinear_kernel(&d[0], c0c1); + d[1] = scale_plane_bilinear_kernel(&d[2], c0c1); + + // Vertical + d[0] = scale_plane_bilinear_kernel(d, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); 
+ dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + do { + load_8bit_8x8(src + 4, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped) + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[2]); + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + transpose_16bit_4x8(&s[2], &s[2]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 + + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + d[0] = _mm_packus_epi16(d[0], d[0]); + d[1] = _mm_packus_epi16(d[1], d[1]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + d[0] = _mm_unpacklo_epi16(d[0], d[1]); + store_8bit_4x4_sse2(d[0], t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + + t += 4; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + t += 4 * width_hor; + y = height_ver; + + do { + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[2]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + d[0] = _mm_packus_epi16(d[0], d[1]); + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + + s[0] = s[4]; + s[1] = s[5]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + int x, y = height_hor; + uint8_t *t = 
temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + do { + load_8bit_8x8(src + 2, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[3]); + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + transpose_16bit_4x8(&s[3], &s[3]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71 + d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72 + d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73 + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + d[0] = _mm_packus_epi16(d[0], d[2]); + d[1] = _mm_packus_epi16(d[1], d[3]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + d[2] = _mm_unpacklo_epi16(d[0], d[1]); + d[3] = _mm_unpackhi_epi16(d[0], d[1]); + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + d[0] = _mm_unpacklo_epi32(d[2], d[3]); + d[1] = _mm_unpackhi_epi32(d[2], d[3]); + store_8bit_8x4_from_16x2(d, t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + t += 8; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor)); + t += 6 * width_hor; + y = height_ver; + + do { + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[3]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17 + d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27 + d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[1] = _mm_packus_epi16(d[2], d[3]); + store_8bit_8x4_from_16x2(d, dst, dst_stride); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +typedef void (*shuffle_filter_funcs)(const int16_t *const filter, + __m128i *const f); + +typedef __m128i (*convolve8_funcs)(const __m128i *const s, + const __m128i *const f); + +static void 
scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[12], d[6], dd[4]; + __m128i f0[4], f1[5], f2[5]; + // The offset of the first row is always less than 1 pixel. + const int offset1_q4 = phase + 1 * step_q4; + const int offset2_q4 = phase + 2 * step_q4; + // offset_idxx indicates the pixel offset is even (0) or odd (1). + // It's used to choose the src offset and filter coefficient offset. + const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs shuffle_filter_func_list[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs convolve8_func_list[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase + 0 * step_q4) & SUBPEL_MASK], f0); + shuffle_filter_func_list[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + shuffle_filter_func_list[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Sub 64 to avoid overflow. + // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. + // Coef 128 is in either fx[1] or fx[2] depending on the phase idx. + // When filter phase idx is 1, the two biggest coefficients are shuffled + // together, and the sum of them are always no less than 128. Sub 64 here. + // After the subtraction, when the sum of all positive coefficients are no + // larger than 128, and the sum of all negative coefficients are no + // less than -128, there will be no overflow in the convolve8 functions. 
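+ // The subtracted 64 is added back inside convolve8_8_even_offset_ssse3 and
+ // convolve8_8_odd_offset_ssse3 via their extra _mm_maddubs_epi16(s, 64) term.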
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 + d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 
87 97 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_func_list[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_func_list[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_func_list[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_func_list[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, + const __m128i *const f) { + __m128i ss[4], temp; + + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + temp = convolve8_8_ssse3(ss, f); + return _mm_packus_epi16(temp, temp); +} + +// Only calculate odd columns since even columns are just src pixels' copies. 
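+// Each odd output pixel is an 8-tap filter applied to the 8 nearest source
+// pixels (the half-pel kernel, interp_kernel[8]); the staggered loads at
+// src + 0 .. src + 7 supply the sliding 8-pixel windows for 8 adjacent outputs.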
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, + const int w, const __m128i *const f) { + int x = w; + + do { + __m128i s[8], temp; + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); + s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); + s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); + s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); + s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); + s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); + s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); + temp = scale_1_to_2_phase_0_kernel(s, f); + _mm_storel_epi64((__m128i *)dst, temp); + src += 8; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_1_to_2_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int src_w, const int src_h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + int max_width; + int y; + uint8_t *tmp[9]; + __m128i f[4]; + + max_width = (src_w + 7) & ~7; + tmp[0] = temp_buffer + 0 * max_width; + tmp[1] = temp_buffer + 1 * max_width; + tmp[2] = temp_buffer + 2 * max_width; + tmp[3] = temp_buffer + 3 * max_width; + tmp[4] = temp_buffer + 4 * max_width; + tmp[5] = temp_buffer + 5 * max_width; + tmp[6] = temp_buffer + 6 * max_width; + tmp[7] = temp_buffer + 7 * max_width; + + shuffle_filter_ssse3(coef, f); + + scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); + scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); + scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); + scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); + scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); + scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); + scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); + + y = src_h; + do { + int x; + scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); + for (x = 0; x < max_width; x += 8) { + __m128i s[8], C, D, CD; + + // Even rows + const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + const __m128i ab = _mm_unpacklo_epi8(a, b); + _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); + + // Odd rows + // Even columns + load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); + C = scale_1_to_2_phase_0_kernel(s, f); + + // Odd columns + s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); + s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); + s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); + s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); + s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); + s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); + s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); + D = scale_1_to_2_phase_0_kernel(s, f); + + CD = _mm_unpacklo_epi8(C, D); + _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); + } + + src += src_stride; + dst += 2 * dst_stride; + tmp[8] = tmp[0]; + tmp[0] = tmp[1]; + tmp[1] = tmp[2]; + tmp[2] = tmp[3]; + tmp[3] = tmp[4]; + tmp[4] = tmp[5]; + tmp[5] = tmp[6]; + tmp[6] = tmp[7]; + tmp[7] = tmp[8]; + } while (--y); +} + +// There's SIMD optimizations for 1/4, 1/2 and 3/4 downscaling and 2x upscaling +// in SSSE3. 
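+// The four supported ratios map to the conditions below: dst = src / 2,
+// dst = src / 4, dst = 3 * src / 4 and dst = 2 * src, in both dimensions.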
+static INLINE bool has_normative_scaler_ssse3(const int src_width, + const int src_height, + const int dst_width, + const int dst_height) { + const bool has_normative_scaler = + (2 * dst_width == src_width && 2 * dst_height == src_height) || + (4 * dst_width == src_width && 4 * dst_height == src_height) || + (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) || + (dst_width == src_width * 2 && dst_height == src_height * 2); + + return has_normative_scaler; +} + +void av1_resize_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + const InterpFilter filter, + const int phase, const int num_planes) { + bool has_normative_scaler = + has_normative_scaler_ssse3(src->y_crop_width, src->y_crop_height, + dst->y_crop_width, dst->y_crop_height); + + if (num_planes > 1) { + has_normative_scaler = + has_normative_scaler && + has_normative_scaler_ssse3(src->uv_crop_width, src->uv_crop_height, + dst->uv_crop_width, dst->uv_crop_height); + } + + if (!has_normative_scaler) { + av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); + return; + } + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + int malloc_failed = 0; + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; + const int src_w = src->crop_widths[is_uv]; + const int src_h = src->crop_heights[is_uv]; + const int src_y_w = (src->crop_widths[0] + 1) & ~1; + const int dst_w = dst->crop_widths[is_uv]; + const int dst_h = dst->crop_heights[is_uv]; + const int dst_y_w = (dst->crop_widths[0] + 1) & ~1; + const int dst_y_h = (dst->crop_heights[0] + 1) & ~1; + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + // 2 to 1 + if (phase == 0) { + scale_plane_2_to_1_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h); + } else if (filter == BILINEAR) { + const int16_t c0 = av1_bilinear_filters[phase][3]; + const int16_t c1 = av1_bilinear_filters[phase][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_2_to_1_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, c0c1); + } else { + const int buffer_stride = (dst_y_w + 3) & ~3; + const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_2_to_1_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel[phase], temp_buffer); + free(temp_buffer); + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + if (phase == 0) { + scale_plane_4_to_1_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h); + } else if (filter == BILINEAR) { + const int16_t c0 = av1_bilinear_filters[phase][3]; + const int16_t c1 = av1_bilinear_filters[phase][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_4_to_1_bilinear(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, c0c1); + } else { + const int buffer_stride = (dst_y_w + 1) & ~1; + const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7; + // When dst_w is 1 or 2, we need extra padding to avoid heap read 
+ // overflow + const int extra_padding = 16; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_4_to_1_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel[phase], temp_buffer); + free(temp_buffer); + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride_hor = (dst_y_w + 5) - ((dst_y_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_y_w + 7) & ~7; + const int buffer_height = (4 * dst_y_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read + // overflow. For example, the horizontal filter generates 18 pixels but + // the vertical filter reads 24 pixels in a row. The difference is + // multiplied by 2 since two rows are interlaced together in the + // optimization. + const int extra_padding = + (buffer_stride_ver > buffer_stride_hor) + ? 2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_4_to_3_general(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], dst_w, + dst_h, interp_kernel, phase, temp_buffer); + free(temp_buffer); + } else { + assert(dst_w == src_w * 2 && dst_h == src_h * 2); + // 1 to 2 + uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_y_w + 7) & ~7)); + if (!temp_buffer) { + malloc_failed = 1; + break; + } + const InterpKernel *interp_kernel = + (const InterpKernel *)av1_interp_filter_params_list[filter] + .filter_ptr; + scale_plane_1_to_2_phase_0(src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], src_w, + src_h, interp_kernel[8], temp_buffer); + free(temp_buffer); + } + } + + if (malloc_failed) { + av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes); + } else { + aom_extend_frame_borders(dst, num_planes); + } +} diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c new file mode 100644 index 0000000000..5ab6c46f8a --- /dev/null +++ b/third_party/aom/av1/common/x86/selfguided_avx2.c @@ -0,0 +1,724 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an AVX2 register. 
+static __m256i yy256_load_extend_8_32(const void *p) { + return _mm256_cvtepu8_epi32(xx_loadl_64(p)); +} + +// Load 8 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an AVX2 register. +static __m256i yy256_load_extend_16_32(const void *p) { + return _mm256_cvtepu16_epi32(xx_loadu_128(p)); +} + +// Compute the scan of an AVX2 register holding 8 32-bit integers. If the +// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ..., +// x0+x1+...+x7 +// +// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers +// (assumed small enough to be able to add them without overflow). +// +// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a. +// +// x = [h g f e][d c b a] +// x01 = [g f e 0][c b a 0] +// x02 = [g+h f+g e+f e][c+d b+c a+b a] +// x03 = [e+f e 0 0][a+b a 0 0] +// x04 = [e->h e->g e->f e][a->d a->c a->b a] +// s = a->d +// s01 = [a->d a->d a->d a->d] +// s02 = [a->d a->d a->d a->d][0 0 0 0] +// ret = [a->h a->g a->f a->e][a->d a->c a->b a] +static __m256i scan_32(__m256i x) { + const __m256i x01 = _mm256_slli_si256(x, 4); + const __m256i x02 = _mm256_add_epi32(x, x01); + const __m256i x03 = _mm256_slli_si256(x02, 8); + const __m256i x04 = _mm256_add_epi32(x02, x03); + const int32_t s = _mm256_extract_epi32(x04, 3); + const __m128i s01 = _mm_set1_epi32(s); + const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1); + return _mm256_add_epi32(x04, s02); +} + +// Compute two integral images from src. B sums elements; A sums their +// squares. The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple +// of 8. + +static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) { + unsigned int i = 0; + for (i = 0; i < (count & 0xffffffe0); i += 32) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero); + } + for (; i < (count & 0xfffffff8); i += 8) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + } + for (; i < count; i++) { + dest[i] = 0; + } + return dest; +} + +static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. 
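+ // Each output is above + (running sum of this source row up to that
+ // column); scan_32 supplies the within-chunk running sum and ldiff carries
+ // it across 8-wide chunks as H - D.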
+ __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8. +static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. + __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute 8 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. 
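+// With the offsets below this is the usual four-corner integral-image
+// lookup: boxsum = br - bl - tr + tl, the sum over a (2r + 1) x (2r + 1) box.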
+static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride); + const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride); + const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride); + const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride); + const __m256i u = _mm256_sub_epi32(tr, tl); + const __m256i v = _mm256_sub_epi32(br, bl); + return _mm256_sub_epi32(v, u); +} + +static __m256i round_for_shift(unsigned shift) { + return _mm256_set1_epi32((1 << shift) >> 1); +} + +static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { + __m256i an, bb; + if (bit_depth > 8) { + const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m256i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m256i a = + _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a); + const __m256i b = + _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm256_madd_epi16(b, b); + an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb); + } else { + bb = _mm256_madd_epi16(sum1, sum1); + an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n)); + } + return _mm256_sub_epi32(an, bb); +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). +static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, + int width, int height, int buf_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); + + const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0); + __m256i mask[8]; + for (int idx = 0; idx < 8; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); + mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; ++i) { + for (int j = -1; j < width + 1; j += 8) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. 
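+      // For example, with width == 10 the final iteration has j == 7, so only
+      // idx == 4 lanes are valid and mask[4] keeps lanes 0..3 while zeroing
+      // lanes 4..7.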
+ int idx = AOMMIN(8, width + 1 - j); + assert(idx >= 1); + + if (idx < 8) { + sum1 = _mm256_and_si256(mask[idx], sum1); + sum2 = _mm256_and_si256(mask[idx], sum2); + } + + const __m256i p = compute_p(sum1, sum2, bit_depth, n); + + const __m256i z = _mm256_min_epi32( + _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm256_set1_epi32(255)); + + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); + + yy_storeu_256(A + i * buf_stride + j, a_res); + + const __m256i a_complement = + _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); + const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); + const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), + SGRPROJ_RECIP_BITS); + + yy_storeu_256(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter +// where the outer four corners have weight 3 and all other pixels have weight +// 4. +// +// Pixels are indexed as follows: +// xtl xt xtr +// xl x xr +// xbl xb xbr +// +// buf points to x +// +// fours = xl + xt + xr + xb + x +// threes = xtl + xtr + xbr + xbl +// cross_sum = 4 * fours + 3 * threes +// = 4 * (fours + threes) - threes +// = (fours + threes) << 2 - threes +static INLINE __m256i cross_sum(const int32_t *buf, int stride) { + const __m256i xtl = yy_loadu_256(buf - 1 - stride); + const __m256i xt = yy_loadu_256(buf - stride); + const __m256i xtr = yy_loadu_256(buf + 1 - stride); + const __m256i xl = yy_loadu_256(buf - 1); + const __m256i x = yy_loadu_256(buf); + const __m256i xr = yy_loadu_256(buf + 1); + const __m256i xbl = yy_loadu_256(buf - 1 + stride); + const __m256i xb = yy_loadu_256(buf + stride); + const __m256i xbr = yy_loadu_256(buf + 1 + stride); + + const __m256i fours = _mm256_add_epi32( + xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x)))); + const __m256i threes = + _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); + + return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2), + threes); +} + +// The final filter for self-guided restoration. Computes a weighted average +// across A, B with "cross sums" (see cross_sum implementation above). +static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, const void *dgd8, + int dgd_stride, int width, int height, int highbd) { + const int nb = 5; + const __m256i rounding = + round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride); + const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? 
_mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding), + SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). +static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, + const int32_t *D, int width, int height, + int buf_stride, int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); + + const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0); + __m256i mask[8]; + for (int idx = 0; idx < 8; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); + mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; i += 2) { + for (int j = -1; j < width + 1; j += 8) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(8, width + 1 - j); + assert(idx >= 1); + + if (idx < 8) { + sum1 = _mm256_and_si256(mask[idx], sum1); + sum2 = _mm256_and_si256(mask[idx], sum2); + } + + const __m256i p = compute_p(sum1, sum2, bit_depth, n); + + const __m256i z = _mm256_min_epi32( + _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm256_set1_epi32(255)); + + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); + + yy_storeu_256(A + i * buf_stride + j, a_res); + + const __m256i a_complement = + _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); + const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); + const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), + SGRPROJ_RECIP_BITS); + + yy_storeu_256(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 8 values of the "cross sum" starting at buf. 
+// +// Pixels are indexed like this: +// xtl xt xtr +// - buf - +// xbl xb xbr +// +// Pixels are weighted like this: +// 5 6 5 +// 0 0 0 +// 5 6 5 +// +// fives = xtl + xtr + xbl + xbr +// sixes = xt + xb +// cross_sum = 6 * sixes + 5 * fives +// = 5 * (fives + sixes) - sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { + const __m256i xtl = yy_loadu_256(buf - 1 - stride); + const __m256i xt = yy_loadu_256(buf - stride); + const __m256i xtr = yy_loadu_256(buf + 1 - stride); + const __m256i xbl = yy_loadu_256(buf - 1 + stride); + const __m256i xb = yy_loadu_256(buf + stride); + const __m256i xbr = yy_loadu_256(buf + 1 + stride); + + const __m256i fives = + _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); + const __m256i sixes = _mm256_add_epi32(xt, xb); + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// Calculate 8 values of the "cross sum" starting at buf. +// +// Pixels are indexed like this: +// xl x xr +// +// Pixels are weighted like this: +// 5 6 5 +// +// buf points to x +// +// fives = xl + xr +// sixes = x +// cross_sum = 5 * fives + 6 * sixes +// = 4 * (fives + sixes) + (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) { + const __m256i xl = yy_loadu_256(buf - 1); + const __m256i x = yy_loadu_256(buf); + const __m256i xr = yy_loadu_256(buf + 1); + + const __m256i fives = _mm256_add_epi32(xl, xr); + const __m256i sixes = x; + + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// The final filter for the self-guided restoration. Computes a +// weighted average across A, B with "cross sums" (see cross_sum_... +// implementations above). +static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, + const void *dgd8, int dgd_stride, int width, + int height, int highbd) { + const int nb0 = 5; + const int nb1 = 4; + + const __m256i rounding0 = + round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + const __m256i rounding1 = + round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (int j = 0; j < width; j += 8) { + const __m256i a = + cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); + const __m256i b = + cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? 
_mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = + _mm256_srai_epi32(_mm256_add_epi32(v, rounding0), + SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } else { // odd row + for (int j = 0; j < width; j += 8) { + const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j); + const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = + _mm256_srai_epi32(_mm256_add_epi32(v, rounding1), + SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } + } +} + +int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { + // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl, + // Ctl and Dtl is 32-byte aligned. + const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3); + + int32_t *buf = aom_memalign( + 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)); + if (!buf) return -1; + + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 32 bytes for efficiency. + int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3); + + // The "tl" pointers point at the top-left of the initialised data for the + // array. + int32_t *Atl = buf + 0 * buf_elts + 7; + int32_t *Btl = buf + 1 * buf_elts + 7; + int32_t *Ctl = buf + 2 * buf_elts + 7; + int32_t *Dtl = buf + 3 * buf_elts + 7; + + // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note + // there's a zero row and column in A, B (integral images), so we move down + // and right one for them. + const int buf_diag_border = + SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; + + int32_t *A0 = Atl + 1 + buf_stride; + int32_t *B0 = Btl + 1 + buf_stride; + int32_t *C0 = Ctl + 1 + buf_stride; + int32_t *D0 = Dtl + 1 + buf_stride; + + // Finally, A, B, C, D point at position (0, 0). + int32_t *A = A0 + buf_diag_border; + int32_t *B = B0 + buf_diag_border; + int32_t *C = C0 + buf_diag_border; + int32_t *D = D0 + buf_diag_border; + + const int dgd_diag_border = + SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; + const uint8_t *dgd0 = dgd8 - dgd_diag_border; + + // Generate integral images from the input. C will contain sums of squares; D + // will contain just sums + if (highbd) + integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, + height_ext, Ctl, Dtl, buf_stride); + else + integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, + buf_stride); + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // Write to flt0 and flt1 + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
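+  // The r[0] filter (if enabled) takes the fast path: calc_ab_fast computes A
+  // and B only on every other row, and final_filter_fast covers the remaining
+  // rows using the 5/6-weighted cross sums. The r[1] filter uses the
+  // full-resolution calc_ab/final_filter pair.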
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + + if (params->r[0] > 0) { + calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, + sgr_params_idx, 0); + final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, + width, height, highbd); + } + + if (params->r[1] > 0) { + calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, + 1); + final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, + height, highbd); + } + aom_free(buf); + return 0; +} + +int av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + const int ret = av1_selfguided_restoration_avx2( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + if (ret != 0) return ret; + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + + __m256i xq0 = _mm256_set1_epi32(xq[0]); + __m256i xq1 = _mm256_set1_epi32(xq[1]); + + for (int i = 0; i < height; ++i) { + // Calculate output in batches of 16 pixels + for (int j = 0; j < width; j += 16) { + const int k = i * width + j; + const int m = i * dst_stride + j; + + const uint8_t *dat8ij = dat8 + i * stride + j; + __m256i ep_0, ep_1; + __m128i src_0, src_1; + if (highbd) { + src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); + src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8)); + ep_0 = _mm256_cvtepu16_epi32(src_0); + ep_1 = _mm256_cvtepu16_epi32(src_1); + } else { + src_0 = xx_loadu_128(dat8ij); + ep_0 = _mm256_cvtepu8_epi32(src_0); + ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8)); + } + + const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS); + const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS); + + __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS); + __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0); + v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0)); + + const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1); + v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1)); + } + + if (params->r[1] > 0) { + const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0); + v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0)); + + const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1); + v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1)); + } + + const __m256i rounding = + round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m256i w_0 = _mm256_srai_epi32( + _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m256i w_1 = _mm256_srai_epi32( + _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + if (highbd) { + // Pack into 16 bits and clamp to [0, 2^bit_depth) + // Note that packing into 16 bits messes up the order of the bits, + // so we use a permute function to correct this + const __m256i tmp = _mm256_packus_epi32(w_0, w_1); + const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); + const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1); + const __m256i res = 
_mm256_min_epi16(tmp2, max); + yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res); + } else { + // Pack into 8 bits and clamp to [0, 256) + // Note that each pack messes up the order of the bits, + // so we use a permute function to correct this + const __m256i tmp = _mm256_packs_epi32(w_0, w_1); + const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); + const __m256i res = + _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */); + const __m128i res2 = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8)); + xx_storeu_128(dst8 + m, res2); + } + } + } + return 0; +} diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c new file mode 100644 index 0000000000..ac850f5691 --- /dev/null +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -0,0 +1,662 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" + +// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an SSE register. +static __m128i xx_load_extend_8_32(const void *p) { + return _mm_cvtepu8_epi32(xx_loadl_32(p)); +} + +// Load 4 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an SSE register. +static __m128i xx_load_extend_16_32(const void *p) { + return _mm_cvtepu16_epi32(xx_loadl_64(p)); +} + +// Compute the scan of an SSE register holding 4 32-bit integers. If the +// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2, +// x0+x1+x2+x3 +static __m128i scan_32(__m128i x) { + const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4)); + return _mm_add_epi32(x01, _mm_slli_si128(x01, 8)); +} + +// Compute two integral images from src. B sums elements; A sums their +// squares. The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple +// of 4. +static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); + + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes. 
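+    // At the end of each 4-wide block the new H - D is obtained by
+    // broadcasting lane 3 of (row - above) with _mm_shuffle_epi32(..., 0xff).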
+ __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; + + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); + + const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); + + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); + + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); + + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); + } + } +} + +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4. +static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); + + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes. + __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; + + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); + + const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); + + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); + + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); + + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); + } + } +} + +// Compute 4 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. 
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride); + const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride); + const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride); + const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride); + const __m128i u = _mm_sub_epi32(tr, tl); + const __m128i v = _mm_sub_epi32(br, bl); + return _mm_sub_epi32(v, u); +} + +static __m128i round_for_shift(unsigned shift) { + return _mm_set1_epi32((1 << shift) >> 1); +} + +static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) { + __m128i an, bb; + if (bit_depth > 8) { + const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m128i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a); + const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm_madd_epi16(b, b); + an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb); + } else { + bb = _mm_madd_epi16(sum1, sum1); + an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n)); + } + return _mm_sub_epi32(an, bb); +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). +static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, + int width, int height, int buf_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m128i s = _mm_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); + + const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0); + __m128i mask[4]; + for (int idx = 0; idx < 4; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); + mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; ++i) { + for (int j = -1; j < width + 1; j += 4) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(4, width + 1 - j); + assert(idx >= 1); + + if (idx < 4) { + sum1 = _mm_and_si128(mask[idx], sum1); + sum2 = _mm_and_si128(mask[idx], sum2); + } + + const __m128i p = compute_p(sum1, sum2, bit_depth, n); + + const __m128i z = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm_set1_epi32(255)); + + // 'Gather' type instructions are not available pre-AVX2, so synthesize a + // gather using scalar loads. 
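+      // z was clamped to [0, 255] above, so every extracted lane is a valid
+      // index into av1_x_by_xplus1.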
+ const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); + + xx_storeu_128(A + i * buf_stride + j, a_res); + + const __m128i a_complement = + _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); + const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); + const __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); + + xx_storeu_128(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter +// where the outer four corners have weight 3 and all other pixels have weight +// 4. +// +// Pixels are indexed like this: +// xtl xt xtr +// xl x xr +// xbl xb xbr +// +// buf points to x +// +// fours = xl + xt + xr + xb + x +// threes = xtl + xtr + xbr + xbl +// cross_sum = 4 * fours + 3 * threes +// = 4 * (fours + threes) - threes +// = (fours + threes) << 2 - threes +static INLINE __m128i cross_sum(const int32_t *buf, int stride) { + const __m128i xtl = xx_loadu_128(buf - 1 - stride); + const __m128i xt = xx_loadu_128(buf - stride); + const __m128i xtr = xx_loadu_128(buf + 1 - stride); + const __m128i xl = xx_loadu_128(buf - 1); + const __m128i x = xx_loadu_128(buf); + const __m128i xr = xx_loadu_128(buf + 1); + const __m128i xbl = xx_loadu_128(buf - 1 + stride); + const __m128i xb = xx_loadu_128(buf + stride); + const __m128i xbr = xx_loadu_128(buf + 1 + stride); + + const __m128i fours = _mm_add_epi32( + xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x)))); + const __m128i threes = + _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); + + return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes); +} + +// The final filter for self-guided restoration. Computes a weighted average +// across A, B with "cross sums" (see cross_sum implementation above). +static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, const void *dgd8, + int dgd_stride, int width, int height, int highbd) { + const int nb = 5; + const __m128i rounding = + round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride); + const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding), + SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
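+// Unlike calc_ab, this variant visits only every other row (i += 2);
+// final_filter_fast covers the rows in between using the neighbouring A and
+// B rows.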
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, + const int32_t *D, int width, int height, + int buf_stride, int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m128i s = _mm_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); + + const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, ~0, ~0); + __m128i mask[4]; + for (int idx = 0; idx < 4; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); + mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; i += 2) { + for (int j = -1; j < width + 1; j += 4) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(4, width + 1 - j); + assert(idx >= 1); + + if (idx < 4) { + sum1 = _mm_and_si128(mask[idx], sum1); + sum2 = _mm_and_si128(mask[idx], sum2); + } + + const __m128i p = compute_p(sum1, sum2, bit_depth, n); + + const __m128i z = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm_set1_epi32(255)); + + // 'Gather' type instructions are not available pre-AVX2, so synthesize a + // gather using scalar loads. + const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); + + xx_storeu_128(A + i * buf_stride + j, a_res); + + const __m128i a_complement = + _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); + const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); + const __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); + + xx_storeu_128(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 4 values of the "cross sum" starting at buf. 
+// +// Pixels are indexed like this: +// xtl xt xtr +// - buf - +// xbl xb xbr +// +// Pixels are weighted like this: +// 5 6 5 +// 0 0 0 +// 5 6 5 +// +// fives = xtl + xtr + xbl + xbr +// sixes = xt + xb +// cross_sum = 6 * sixes + 5 * fives +// = 5 * (fives + sixes) - sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) { + const __m128i xtl = xx_loadu_128(buf - 1 - stride); + const __m128i xt = xx_loadu_128(buf - stride); + const __m128i xtr = xx_loadu_128(buf + 1 - stride); + const __m128i xbl = xx_loadu_128(buf - 1 + stride); + const __m128i xb = xx_loadu_128(buf + stride); + const __m128i xbr = xx_loadu_128(buf + 1 + stride); + + const __m128i fives = + _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); + const __m128i sixes = _mm_add_epi32(xt, xb); + const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); + + return _mm_add_epi32( + _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), + sixes); +} + +// Calculate 4 values of the "cross sum" starting at buf. +// +// Pixels are indexed like this: +// xl x xr +// +// Pixels are weighted like this: +// 5 6 5 +// +// buf points to x +// +// fives = xl + xr +// sixes = x +// cross_sum = 5 * fives + 6 * sixes +// = 4 * (fives + sixes) + (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) { + const __m128i xl = xx_loadu_128(buf - 1); + const __m128i x = xx_loadu_128(buf); + const __m128i xr = xx_loadu_128(buf + 1); + + const __m128i fives = _mm_add_epi32(xl, xr); + const __m128i sixes = x; + + const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); + + return _mm_add_epi32( + _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), + sixes); +} + +// The final filter for the self-guided restoration. Computes a +// weighted average across A, B with "cross sums" (see cross_sum_... +// implementations above). +static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, + const void *dgd8, int dgd_stride, int width, + int height, int highbd) { + const int nb0 = 5; + const int nb1 = 4; + + const __m128i rounding0 = + round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + const __m128i rounding1 = + round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (int j = 0; j < width; j += 4) { + const __m128i a = + cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); + const __m128i b = + cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0), + SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } else { // odd row + for (int j = 0; j < width; j += 4) { + const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j); + const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? 
_mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1), + SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } + } +} + +int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, + int height, int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { + int32_t *buf = (int32_t *)aom_memalign( + 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); + if (!buf) return -1; + memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); + + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes for efficiency. + int buf_stride = ((width_ext + 3) & ~3) + 16; + + // The "tl" pointers point at the top-left of the initialised data for the + // array. Adding 3 here ensures that column 1 is 16-byte aligned. + int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3; + + // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note + // there's a zero row and column in A, B (integral images), so we move down + // and right one for them. + const int buf_diag_border = + SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; + + int32_t *A0 = Atl + 1 + buf_stride; + int32_t *B0 = Btl + 1 + buf_stride; + int32_t *C0 = Ctl + 1 + buf_stride; + int32_t *D0 = Dtl + 1 + buf_stride; + + // Finally, A, B, C, D point at position (0, 0). + int32_t *A = A0 + buf_diag_border; + int32_t *B = B0 + buf_diag_border; + int32_t *C = C0 + buf_diag_border; + int32_t *D = D0 + buf_diag_border; + + const int dgd_diag_border = + SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; + const uint8_t *dgd0 = dgd8 - dgd_diag_border; + + // Generate integral images from the input. C will contain sums of squares; D + // will contain just sums + if (highbd) + integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, + height_ext, Ctl, Dtl, buf_stride); + else + integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, + buf_stride); + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // Write to flt0 and flt1 + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
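+  // The box sums read integral-image samples up to r + 1 positions away from
+  // each pixel, so the radii must stay within the SGRPROJ_BORDER_* padding
+  // built into the integral images.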
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + + if (params->r[0] > 0) { + calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, + sgr_params_idx, 0); + final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, + width, height, highbd); + } + + if (params->r[1] > 0) { + calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, + 1); + final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, + height, highbd); + } + aom_free(buf); + return 0; +} + +int av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + const int ret = av1_selfguided_restoration_sse4_1( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + if (ret != 0) return ret; + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + + __m128i xq0 = _mm_set1_epi32(xq[0]); + __m128i xq1 = _mm_set1_epi32(xq[1]); + + for (int i = 0; i < height; ++i) { + // Calculate output in batches of 8 pixels + for (int j = 0; j < width; j += 8) { + const int k = i * width + j; + const int m = i * dst_stride + j; + + const uint8_t *dat8ij = dat8 + i * stride + j; + __m128i src; + if (highbd) { + src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); + } else { + src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij)); + } + + const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS); + const __m128i u_0 = _mm_cvtepu16_epi32(u); + const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8)); + + __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS); + __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0); + v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0)); + + const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1); + v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1)); + } + + if (params->r[1] > 0) { + const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0); + v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0)); + + const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1); + v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1)); + } + + const __m128i rounding = + round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + if (highbd) { + // Pack into 16 bits and clamp to [0, 2^bit_depth) + const __m128i tmp = _mm_packus_epi32(w_0, w_1); + const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1); + const __m128i res = _mm_min_epi16(tmp, max); + xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res); + } else { + // Pack into 8 bits and clamp to [0, 256) + const __m128i tmp = _mm_packs_epi32(w_0, w_1); + const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */); + xx_storel_64(dst8 + m, res); + } + } + } + return 0; +} diff --git a/third_party/aom/av1/common/x86/warp_plane_avx2.c b/third_party/aom/av1/common/x86/warp_plane_avx2.c new file mode 100644 index 
0000000000..663b8cde93 --- /dev/null +++ b/third_party/aom/av1/common/x86/warp_plane_avx2.c @@ -0,0 +1,1210 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "config/av1_rtcd.h" +#include "av1/common/warped_motion.h" +#include "aom_dsp/x86/synonyms.h" + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = { + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = { + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = { + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = { + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = { + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = { + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, + 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6, + 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7, + 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10, + 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4, + 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7, + 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8, + 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11, + 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 }; + +static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, + __m256i *coeff, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift, int row) { + const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]); + const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]); + const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]); + const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]); + + const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]); + const __m256i res_46 = 
_mm256_maddubs_epi16(src_1, coeff[1]); + const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]); + const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]); + + const __m256i res_even = _mm256_add_epi16(res_02, res_46); + const __m256i res_odd = _mm256_add_epi16(res_13, res_57); + const __m256i res = + _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const); + horz_out[row] = _mm256_srl_epi16(res, *shift); +} + +static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta, + int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0); + __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2); + __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1); + __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3); + + __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4); + __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6); + __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5); + __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7); + + __m128i tmp_8 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1); + + __m128i tmp_9 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1); + + __m128i tmp_10 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1); + + __m128i tmp_11 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1); + + tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1); + + tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1); + + tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1); + + tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1); + + const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256); + const __m256i tmp_13 = 
_mm256_unpacklo_epi16(tmp1_256, tmp3_256); + const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256); + const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2); + tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3); + tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6); + tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0); + const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1); + const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4); + const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, + __m256i *coeff) { + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]); + + const __m256i res_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1); + + coeff[0] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2)); + coeff[1] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2)); + coeff[2] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2)); + coeff[3] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2)); +} + +static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, + int sx, int alpha, int beta, int row, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift) { + __m256i coeff[4]; + 
prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff); + filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift, + row); +} +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m256i *coeff) { + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); + const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); + const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); + const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); + const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); + + coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14)); + coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14)); + coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15)); + coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15)); +} + +static INLINE void warp_horizontal_filter_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + int k, iy, sx, row = 0; + __m256i coeff[4]; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src_0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src_1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); + sx = sx4 + beta * (k + 4); + horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src, + round_const, shift); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + sx = sx4 + beta * (k + 4); + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void warp_horizontal_filter_alpha0_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + (void)alpha; + int k, iy, sx, row = 0; + __m256i coeff[4]; + for (k = -7; k <= 
(AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src_0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src_1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); + sx = sx4 + beta * (k + 4); + prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + sx = sx4 + beta * (k + 4); + prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void warp_horizontal_filter_beta0_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + (void)beta; + int k, iy, row = 0; + __m256i coeff[4]; + prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff); + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src_0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src_1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void warp_horizontal_filter_alpha0_beta0_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + (void)alpha; + int k, iy, row = 0; + __m256i coeff[4]; + prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff); + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void unpack_weights_and_set_round_const_avx2( + ConvolveParams *conv_params, const int round_bits, const int offset_bits, + __m256i *res_sub_const, __m256i *round_bits_const, 
__m256i *wt) { + *res_sub_const = + _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16((short)w0); + const __m256i wt1 = _mm256_set1_epi16((short)w1); + *wt = _mm256_unpacklo_epi16(wt0, wt1); +} + +static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta, + int sy, + __m256i *coeffs) { + __m128i filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m128i filt_10 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_11 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_12 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_13 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m256i filt_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); + __m256i filt_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); + __m256i filt_2 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); + __m256i filt_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); + + __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); + + filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + filt_10 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_11 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_12 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_13 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + filt_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); + filt_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); + filt_2 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); + 
filt_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); + + res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); +} + +static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy, + __m256i *coeffs) { + __m128i filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00); + __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01); + __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02); + __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03); + + __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); + + filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + filt_0 = _mm256_broadcastsi128_si256(filt_00); + filt_1 = _mm256_broadcastsi128_si256(filt_01); + filt_2 = _mm256_broadcastsi128_si256(filt_02); + filt_3 = _mm256_broadcastsi128_si256(filt_03); + + res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); +} + +static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy, + __m256i *coeffs) { + const __m128i filt_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + const __m128i filt_1 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS))); + + __m256i res_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1); + + coeffs[0] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2)); + coeffs[1] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2)); + coeffs[2] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2)); + coeffs[3] = _mm256_shuffle_epi8( + res_0, 
_mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2)); + + coeffs[4] = coeffs[0]; + coeffs[5] = coeffs[1]; + coeffs[6] = coeffs[2]; + coeffs[7] = coeffs[3]; +} + +static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out, + __m256i *src, + __m256i *coeffs, + __m256i *res_lo, + __m256i *res_hi, int row) { + const __m256i src_6 = horz_out[row + 3]; + const __m256i src_7 = + _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21); + + src[6] = _mm256_unpacklo_epi16(src_6, src_7); + + const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]); + const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]); + const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]); + const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]); + + const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2), + _mm256_add_epi32(res_4, res_6)); + + src[7] = _mm256_unpackhi_epi16(src_6, src_7); + + const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]); + const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]); + const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]); + const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]); + + const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3), + _mm256_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + *res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm256_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output_avx2( + const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const, + const __m256i *wt, const __m256i *res_sub_const, + const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params, + int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m256i res_lo_1 = *res_lo; + __m256i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p_0 = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&conv_params + ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j]; + + res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + + const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1); + __m256i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const dst8_1 = + (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; + const __m128i p_16_0 = _mm_loadl_epi64(p_0); + const __m128i p_16_1 = _mm_loadl_epi64(p_1); + const __m256i p_16 = + _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1); + if (conv_params->use_dist_wtd_comp_avg) { + const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16); + const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt); + const __m256i shifted_32 = + _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1); + } + res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const); + res_lo_16 = _mm256_srai_epi16( + _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits); + const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16); + const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo); + const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1); + *(int *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0); + *(int *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1); + } else { + 
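+      // do_average is not set: store the rounded 16-bit sums for this row
+      // pair straight into the compound destination buffer
+      // (conv_params->dst).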
const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16); + const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1); + _mm_storel_epi64(p_0, temp_lo_16_0); + _mm_storel_epi64(p_1, temp_lo_16_1); + } + if (p_width > 4) { + __m128i *const p4_0 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + __m128i *const p4_1 = + (__m128i *)&conv_params + ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1); + __m256i res_hi_16; + if (conv_params->do_average) { + __m128i *const dst8_4_0 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + __m128i *const dst8_4_1 = + (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4]; + const __m128i p4_16_0 = _mm_loadl_epi64(p4_0); + const __m128i p4_16_1 = _mm_loadl_epi64(p4_1); + const __m256i p4_16 = _mm256_inserti128_si256( + _mm256_castsi128_si256(p4_16_0), p4_16_1, 1); + if (conv_params->use_dist_wtd_comp_avg) { + const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16); + const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt); + const __m256i shifted_32 = + _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const); + res_hi_16 = _mm256_srai_epi16( + _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits); + __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16); + const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi); + const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1); + *(int *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0); + *(int *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1); + } else { + const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16); + const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1); + _mm_storel_epi64(p4_0, temp_hi_16_0); + _mm_storel_epi64(p4_1, temp_hi_16_1); + } + } + } else { + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round); + const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit); + const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit); + const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; + + if (p_width == 4) { + *(int *)p = _mm_cvtsi128_si32(res_8bit0); + *(int *)p1 = _mm_cvtsi128_si32(res_8bit1); + } else { + _mm_storel_epi64(p, res_8bit0); + _mm_storel_epi64(p1, res_8bit1); + } + } +} + +static INLINE void warp_vertical_filter_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + int k, row = 0; + __m256i src[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + 
_mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + int sy = sy4 + delta * (k + 4); + __m256i coeffs[8]; + prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs); + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + + row += 1; + } +} + +static INLINE void warp_vertical_filter_gamma0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)gamma; + int k, row = 0; + __m256i src[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + int sy = sy4 + delta * (k + 4); + __m256i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs); + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void warp_vertical_filter_delta0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)delta; + int k, row = 0; + __m256i src[8], coeffs[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + 
_mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)gamma; + int k, row = 0; + __m256i src[8], coeffs[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void prepare_warp_vertical_filter_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, + round_bits_const, wt); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, 
res_sub_const, + round_bits_const, wt); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, + round_bits_const, wt); + else + warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta, + p_height, p_stride, p_width, i, j, sy4, + reduce_bits_vert, res_add_const, round_bits, + res_sub_const, round_bits_const, wt); +} + +static INLINE void prepare_warp_horizontal_filter_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4, + alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4, + alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else + warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, round_const, shift, + shuffle_src); +} + +void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m256i horz_out[8]; + int i, j, k; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? 
conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m256i reduce_bits_vert_const = + _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + const __m256i round_const = _mm256_set1_epi16( + (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); + const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz); + + __m256i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, + &wt); + + __m256i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + const int32_t const1 = alpha * (-4) + beta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const2 = gamma * (-4) + delta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1); + const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)); + const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz)); + + __m256i shuffle_src[4]; + shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0); + shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1); + shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2); + shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3); + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += const1; + sy4 += const2; + + sx4 &= ~const3; + sy4 &= ~const3; + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
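+      // The first two branches below cover blocks lying entirely off the
+      // left (ix4 <= -7) or right (ix4 >= width + 6) edge of the frame:
+      // every tap then sees the same clamped pixel, and because the warp
+      // kernels sum to (1 << FILTER_BITS) the filtered value reduces to
+      // const4 + pixel * const5. The third branch handles partial overlap,
+      // replicating out-of-range pixels with the warp_pad_left /
+      // warp_pad_right shuffle tables before filtering as usual.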
+ + if (ix4 <= -7) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + } else if (ix4 >= width + 6) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = + _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5); + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + int iy, sx, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + __m128i src1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_left); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_right); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); + horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, + shuffle_src, &round_const, &shift); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = + _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = _mm256_castsi128_si256(src); + __m256i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, + &round_const, &shift, row); + } else { + prepare_warp_horizontal_filter_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, + i, &round_const, &shift, shuffle_src); + } + + // Vertical filter + prepare_warp_vertical_filter_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, + p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, + 
&res_sub_const, &round_bits_const, &wt); + } + } +} diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c new file mode 100644 index 0000000000..4c05555ff7 --- /dev/null +++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include <smmintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +/* This is a modified version of 'av1_warped_filter' from warped_motion.c: + * Each coefficient is stored in 8 bits instead of 16 bits + * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 + + This is done in order to avoid overflow: Since the tap with the largest + coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation + order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular + convolve functions. + + Instead, we use the summation order + ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). + The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly. +*/ +/* clang-format off */ +DECLARE_ALIGNED(8, const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2,
25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + 
{ 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, +}; +/* clang-format on */ + +// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 +// in an SSE register into two sequences: +// 0, 2, 2, 4, ..., 12, 12, 14, +// 1, 3, 3, 5, ..., 13, 13, 15, +DECLARE_ALIGNED(16, static const uint8_t, + even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9, + 9, 11, 11, 13, 13, 15, 15, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15 }; + +static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, + const int reduce_bits_horiz, int k) { + const __m128i src_even = + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask)); + const __m128i src_odd = + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask)); + // The pixel order we need for 'src' is: + // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 + const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); + const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]); + // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 + const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), + _mm_srli_si128(src_odd, 4)); + const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]); + 
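+  // Each _mm_maddubs_epi16 multiplies unsigned 8-bit pixels by signed 8-bit
+  // coefficients and adds adjacent products, so every 16-bit lane holds a
+  // two-tap partial sum for one of the eight output pixels.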
// 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 + const __m128i src_13 = + _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); + const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]); + // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 + const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), + _mm_srli_si128(src_even, 6)); + const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]); + + const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + // Note: The values res_02 + res_46 and res_13 + res_57 both + // fit into int16s at this point, but their sum may be too wide to fit + // into an int16. However, once we also add round_const, the sum of + // all of these fits into a uint16. + // + // The wrapping behaviour of _mm_add_* is used here to make sure we + // get the correct result despite converting between different + // (implicit) types. + const __m128i res_even = _mm_add_epi16(res_02, res_46); + const __m128i res_odd = _mm_add_epi16(res_13, res_57); + const __m128i res = + _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); + tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); +} + +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 + const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3 + const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6 + const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7 + const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6 + const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6 + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7 + const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7 + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); + + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = 
_mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01)); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23)); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45)); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67)); +} + +static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, + int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); +} + +static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, + int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, + int p_height, int height, int i, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void warp_horizontal_filter_alpha0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_alpha0_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for 
(k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void unpack_weights_and_set_round_const( + ConvolveParams *conv_params, const int round_bits, const int offset_bits, + __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { + *res_sub_const = + _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16((int16_t)w0); + const __m128i wt1 = _mm_set1_epi16((int16_t)w1); + *wt = _mm_unpacklo_epi16(wt0, wt1); +} + +static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, + __m128i *coeffs) { + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // even coeffs + coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); + coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10); + coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + // odd coeffs + coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11); + coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, + __m128i *coeffs) { + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + // even coeffs + coeffs[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0)); + coeffs[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1)); + coeffs[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2)); + coeffs[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3)); + + // odd coeffs + coeffs[4] = 
coeffs[0]; + coeffs[5] = coeffs[1]; + coeffs[6] = coeffs[2]; + coeffs[7] = coeffs[3]; +} + +static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, + __m128i *res_lo, __m128i *res_hi, + int k) { + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + + const __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]); + const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]); + + const __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + *res_lo = _mm_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output( + __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const, + const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const, + uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, + const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m128i res_lo_1 = *res_lo; + __m128i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1); + __m128i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + const __m128i p_16 = _mm_loadl_epi64(p); + + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); + } + + res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const); + + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const), + round_bits); + __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); + *(int *)dst8 = _mm_cvtsi128_si32(res_8_lo); + } else { + _mm_storel_epi64(p, temp_lo_16); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1); + 
__m128i res_hi_16; + + if (conv_params->do_average) { + __m128i *const dst8_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + const __m128i p4_16 = _mm_loadl_epi64(p4); + + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); + const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const); + + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const), + round_bits); + __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); + *(int *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); + + } else { + _mm_storel_epi64(p4, temp_hi_16); + } + } + } else { + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + *(int *)p = _mm_cvtsi128_si32(res_8bit); + } else { + _mm_storel_epi64(p, res_8bit); + } + } +} + +static INLINE void warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + (void)gamma; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + 
conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + (void)gamma; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void prepare_warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, + sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else + warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); +} + +static INLINE void prepare_warp_horizontal_filter( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, 
reduce_bits_horiz); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else + warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + +void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. 
+ */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + __m128i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
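+      // Illustrative aside: the warped filter taps sum to (1 << FILTER_BITS),
+      // so filtering a row of identical pixels p reduces, up to rounding, to
+      //   (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1))
+      //     + p * (1 << (FILTER_BITS - reduce_bits_horiz))
+      // which is exactly the constant broadcast with _mm_set1_epi16 in the
+      // two clamped-column branches below.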
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } + } else { + prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + } + + // Vertical filter + prepare_warp_vertical_filter( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, + j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits); + } + } +} diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c new file mode 100644 index 0000000000..3de630f203 --- /dev/null +++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. +// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be +// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ]. + +// Exploiting the range of wiener filter coefficients, +// horizontal filtering can be done in 16 bit intermediate precision. 
+// The details are as follows : +// Consider the horizontal wiener filter coefficients of the following form : +// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0] +// Subtracting 2^(FILTER_BITS) from the centre tap we get the following : +// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0] +// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3 +// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit +// precision. Finally, after rounding the above result by round_0, we multiply +// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the +// horizontal filter output. + +void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const WienerConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]); + int im_h = h + SUBPEL_TAPS - 2; + int im_stride = 8; + memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE); + int i, j; + const int center_tap = (SUBPEL_TAPS - 1) / 2; + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center; + + assert(conv_params->round_0 > 0); + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2); + + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x); + const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_h[0] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_h[1] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_h[2] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_h[3] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu)); + + const __m256i round_const_h = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1))); + const __m256i round_const_horz = + _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1))); + const __m256i clamp_low = _mm256_setzero_si256(); + const __m256i clamp_high = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0); + + // Add an offset to account for the "add_src" part of the convolve function. 
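+  // Illustrative sketch of the trick: with (1 << FILTER_BITS) added to the
+  // centre tap (lane 3), the vertical convolution computes, per output pixel,
+  //   sum_k f[k] * s[k]  +  (1 << FILTER_BITS) * s_centre
+  // so the centre sample of the 7-tap window is folded back in by the same
+  // multiply-accumulate and no separate "add src" pass is needed afterwards.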
+ const __m128i zero_128 = _mm_setzero_si128(); + const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0); + + const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff); + + const __m256i round_const_v = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + for (j = 0; j < w; j += 8) { + for (i = 0; i < im_h; i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + + res = + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); + + __m256i data_0 = _mm256_shuffle_epi8(data, filt_center); + + // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to + // the result + data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0); + res = _mm256_add_epi16(res, data_0); + res = _mm256_add_epi16(res, round_const_horz); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped); + } + + /* Vertical filter */ + { + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + __m256i s[8]; + s[0] = _mm256_unpacklo_epi16(src_0, src_1); + s[1] = _mm256_unpacklo_epi16(src_2, src_3); + s[2] = _mm256_unpacklo_epi16(src_4, src_5); + + s[4] = _mm256_unpackhi_epi16(src_0, src_1); + s[5] = _mm256_unpackhi_epi16(src_2, src_3); + s[6] = _mm256_unpackhi_epi16(src_4, src_5); + + for (i = 0; i < h - 1; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + __m256i res_a = convolve(s, coeffs_v); + __m256i res_b = convolve(s + 4, coeffs_v); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + // 8 bit conversion and saturation to uint8 + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = 
_mm256_extracti128_si256(res_8b, 1); + + // Store values into the destination buffer + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + + _mm_storel_epi64(p_0, res_0); + _mm_storel_epi64(p_1, res_1); + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + if (h - i) { + s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20); + s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20); + s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20); + + const int16_t *data = &im_block[i * im_stride]; + const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride)); + const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + + __m128i s3 = _mm_unpacklo_epi16(s6_, s7_); + __m128i s7 = _mm_unpackhi_epi16(s6_, s7_); + + s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1); + __m256i convolveres = convolve(s, coeffs_v); + + const __m256i res_round = _mm256_sra_epi32( + _mm256_add_epi32(convolveres, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + __m128i reslo = _mm256_castsi256_si128(res_round); + __m128i reshi = _mm256_extracti128_si256(res_round, 1); + const __m128i res_16bit = _mm_packus_epi32(reslo, reshi); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit); + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p_0, res_8b); + } + } + } +} diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c new file mode 100644 index 0000000000..1c039e80c6 --- /dev/null +++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const WienerConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 2; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function. 
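+  // Illustrative scalar reference for the horizontal pass below (a sketch,
+  // compiled out; the names r, c, k, acc and tap are hypothetical). It only
+  // documents the arithmetic performed by the SIMD code that follows.
+#if 0
+  for (int r = 0; r < intermediate_height; ++r) {
+    for (int c = 0; c < w; ++c) {
+      // The offset keeps the intermediate value non-negative for uint16
+      // storage in temp[].
+      int32_t acc = 1 << (bd + FILTER_BITS - 1);
+      for (int k = 0; k < SUBPEL_TAPS; ++k) {
+        // Centre tap carries the extra (1 << FILTER_BITS) "add_src" weight.
+        const int tap = filter_x[k] + (k == 3 ? (1 << FILTER_BITS) : 0);
+        acc += tap * src_ptr[r * src_stride + c + k];
+      }
+      const int v = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
+      temp[r * MAX_SB_SIZE + c] = (uint16_t)clamp(
+          v, 0, WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+    }
+  }
+#endif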
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + conv_params->round_0); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + conv_params->round_0); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_min_epi16( + _mm_max_epi16(res, zero), + _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1)); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + 
conv_params->round_1 - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), conv_params->round_1); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), conv_params->round_1); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p, res_8bit); + } + } + } +} diff --git a/third_party/aom/av1/decoder/accounting.c b/third_party/aom/av1/decoder/accounting.c new file mode 100644 index 0000000000..1ded380ec3 --- /dev/null +++ b/third_party/aom/av1/decoder/accounting.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include + +#include "aom/aom_integer.h" +#include "av1/decoder/accounting.h" + +static int accounting_hash(const char *str) { + uint32_t val; + const unsigned char *ustr; + val = 0; + ustr = (const unsigned char *)str; + /* This is about the worst hash one can design, but it should be good enough + here. */ + while (*ustr) val += *ustr++; + return val % AOM_ACCOUNTING_HASH_SIZE; +} + +/* Dictionary lookup based on an open-addressing hash table. */ +int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) { + int hash; + size_t len; + AccountingDictionary *dictionary; + dictionary = &accounting->syms.dictionary; + hash = accounting_hash(str); + while (accounting->hash_dictionary[hash] != -1) { + if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) { + return accounting->hash_dictionary[hash]; + } + hash++; + if (hash == AOM_ACCOUNTING_HASH_SIZE) hash = 0; + } + /* No match found. */ + assert(dictionary->num_strs + 1 < MAX_SYMBOL_TYPES); + accounting->hash_dictionary[hash] = dictionary->num_strs; + len = strlen(str); + dictionary->strs[dictionary->num_strs] = malloc(len + 1); + if (!dictionary->strs[dictionary->num_strs]) abort(); + snprintf(dictionary->strs[dictionary->num_strs], len + 1, "%s", str); + dictionary->num_strs++; + return dictionary->num_strs - 1; +} + +void aom_accounting_init(Accounting *accounting) { + int i; + accounting->num_syms_allocated = 1000; + accounting->syms.syms = + malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated); + if (!accounting->syms.syms) abort(); + accounting->syms.dictionary.num_strs = 0; + assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES); + for (i = 0; i < AOM_ACCOUNTING_HASH_SIZE; i++) + accounting->hash_dictionary[i] = -1; + aom_accounting_reset(accounting); +} + +void aom_accounting_reset(Accounting *accounting) { + accounting->syms.num_syms = 0; + accounting->syms.num_binary_syms = 0; + accounting->syms.num_multi_syms = 0; + accounting->context.x = -1; + accounting->context.y = -1; + accounting->last_tell_frac = 0; +} + +void aom_accounting_clear(Accounting *accounting) { + int i; + AccountingDictionary *dictionary; + free(accounting->syms.syms); + dictionary = &accounting->syms.dictionary; + for (i = 0; i < dictionary->num_strs; i++) { + free(dictionary->strs[i]); + } +} + +void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y) { + accounting->context.x = x; + accounting->context.y = y; +} + +void aom_accounting_record(Accounting *accounting, const char *str, + uint32_t bits) { + AccountingSymbol sym; + // Reuse previous symbol if it has the same context and symbol id. 
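+  // The caller passes `bits` in 1/8th-bit units (see AOM_ACCT_BITRES in
+  // accounting.h). Consecutive records with the same (context, id) pair are
+  // coalesced by accumulating their bit and sample counts, which keeps the
+  // symbol list compact when one syntax element is decoded many times in a
+  // row.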
+ if (accounting->syms.num_syms) { + AccountingSymbol *last_sym; + last_sym = &accounting->syms.syms[accounting->syms.num_syms - 1]; + if (memcmp(&last_sym->context, &accounting->context, + sizeof(AccountingSymbolContext)) == 0) { + uint32_t id; + id = aom_accounting_dictionary_lookup(accounting, str); + if (id == last_sym->id) { + last_sym->bits += bits; + last_sym->samples++; + return; + } + } + } + sym.context = accounting->context; + sym.samples = 1; + sym.bits = bits; + sym.id = aom_accounting_dictionary_lookup(accounting, str); + assert(sym.id <= 255); + if (accounting->syms.num_syms == accounting->num_syms_allocated) { + accounting->num_syms_allocated *= 2; + accounting->syms.syms = + realloc(accounting->syms.syms, + sizeof(AccountingSymbol) * accounting->num_syms_allocated); + if (!accounting->syms.syms) abort(); + } + accounting->syms.syms[accounting->syms.num_syms++] = sym; +} + +void aom_accounting_dump(Accounting *accounting) { + int i; + AccountingSymbol *sym; + printf("\n----- Number of recorded syntax elements = %d -----\n", + accounting->syms.num_syms); + printf("----- Total number of symbol calls = %d (%d binary) -----\n", + accounting->syms.num_multi_syms + accounting->syms.num_binary_syms, + accounting->syms.num_binary_syms); + for (i = 0; i < accounting->syms.num_syms; i++) { + sym = &accounting->syms.syms[i]; + printf("%s x: %d, y: %d bits: %f samples: %d\n", + accounting->syms.dictionary.strs[sym->id], sym->context.x, + sym->context.y, (float)sym->bits / 8.0, sym->samples); + } +} diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h new file mode 100644 index 0000000000..ad2e8b6cfe --- /dev/null +++ b/third_party/aom/av1/decoder/accounting.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_DECODER_ACCOUNTING_H_ +#define AOM_AV1_DECODER_ACCOUNTING_H_ +#include +#include "aom/aomdx.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#define AOM_ACCOUNTING_HASH_SIZE (1021) + +/* Max number of entries for symbol types in the dictionary (increase as + necessary). */ +#define MAX_SYMBOL_TYPES (256) + +/*The resolution of fractional-precision bit usage measurements, i.e., + 3 => 1/8th bits.*/ +#define AOM_ACCT_BITRES (3) + +typedef struct { + int16_t x; + int16_t y; +} AccountingSymbolContext; + +typedef struct { + AccountingSymbolContext context; + uint32_t id; + /** Number of bits in units of 1/8 bit. */ + uint32_t bits; + uint32_t samples; +} AccountingSymbol; + +/** Dictionary for translating strings into id. */ +typedef struct { + char *strs[MAX_SYMBOL_TYPES]; + int num_strs; +} AccountingDictionary; + +typedef struct { + /** All recorded symbols decoded. */ + AccountingSymbol *syms; + /** Number of syntax actually recorded. */ + int num_syms; + /** Raw symbol decoding calls for non-binary values. */ + int num_multi_syms; + /** Raw binary symbol decoding calls. */ + int num_binary_syms; + /** Dictionary for translating strings into id. 
*/ + AccountingDictionary dictionary; +} AccountingSymbols; + +struct Accounting { + AccountingSymbols syms; + /** Size allocated for symbols (not all may be used). */ + int num_syms_allocated; + int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE]; + AccountingSymbolContext context; + uint32_t last_tell_frac; +}; + +void aom_accounting_init(Accounting *accounting); +void aom_accounting_reset(Accounting *accounting); +void aom_accounting_clear(Accounting *accounting); +void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y); +int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str); +void aom_accounting_record(Accounting *accounting, const char *str, + uint32_t bits); +void aom_accounting_dump(Accounting *accounting); +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_AV1_DECODER_ACCOUNTING_H_ diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c new file mode 100644 index 0000000000..bb09347e1c --- /dev/null +++ b/third_party/aom/av1/decoder/decodeframe.c @@ -0,0 +1,5369 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_reader.h" +#include "aom_dsp/bitreader.h" +#include "aom_dsp/bitreader_buffer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_ports/mem_ops.h" +#include "aom_scale/aom_scale.h" +#include "aom_util/aom_thread.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "av1/common/alloccommon.h" +#include "av1/common/cdef.h" +#include "av1/common/cfl.h" +#if CONFIG_INSPECTION +#include "av1/decoder/inspection.h" +#endif +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/idct.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/resize.h" +#include "av1/common/seg_common.h" +#include "av1/common/thread_common.h" +#include "av1/common/tile_common.h" +#include "av1/common/warped_motion.h" +#include "av1/common/obmc.h" +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/decodemv.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/decodetxb.h" +#include "av1/decoder/detokenize.h" + +#define ACCT_STR __func__ + +#define AOM_MIN_THREADS_PER_TILE 1 +#define AOM_MAX_THREADS_PER_TILE 2 + +// This is needed by ext_tile related unit tests. 
+#define EXT_TILE_DEBUG 1 +#define MC_TEMP_BUF_PELS \ + (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \ + ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2)) + +// Checks that the remaining bits start with a 1 and ends with 0s. +// It consumes an additional byte, if already byte aligned before the check. +int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { + // bit_offset is set to 0 (mod 8) when the reader is already byte aligned + int bits_before_alignment = 8 - rb->bit_offset % 8; + int trailing = aom_rb_read_literal(rb, bits_before_alignment); + if (trailing != (1 << (bits_before_alignment - 1))) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + return 0; +} + +// Use only_chroma = 1 to only set the chroma planes +static AOM_INLINE void set_planes_to_neutral_grey( + const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf, + int only_chroma) { + if (seq_params->use_highbitdepth) { + const int val = 1 << (seq_params->bit_depth - 1); + for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { + const int is_uv = plane > 0; + uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]); + // Set the first row to neutral grey. Then copy the first row to all + // subsequent rows. + if (buf->crop_heights[is_uv] > 0) { + aom_memset16(base, val, buf->crop_widths[is_uv]); + for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) { + memcpy(&base[row_idx * buf->strides[is_uv]], base, + sizeof(*base) * buf->crop_widths[is_uv]); + } + } + } + } else { + for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { + const int is_uv = plane > 0; + for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) { + memset(&buf->buffers[plane][row_idx * buf->strides[is_uv]], 1 << 7, + buf->crop_widths[is_uv]); + } + } + } +} + +static AOM_INLINE void loop_restoration_read_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, + int runit_idx); + +static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { + return len != 0 && len <= (size_t)(end - start); +} + +static TX_MODE read_tx_mode(struct aom_read_bit_buffer *rb, + int coded_lossless) { + if (coded_lossless) return ONLY_4X4; + return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST; +} + +static REFERENCE_MODE read_frame_reference_mode( + const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + if (frame_is_intra_only(cm)) { + return SINGLE_REFERENCE; + } else { + return aom_rb_read_bit(rb) ? 
REFERENCE_MODE_SELECT : SINGLE_REFERENCE; + } +} + +static AOM_INLINE void inverse_transform_block(DecoderCodingBlock *dcb, + int plane, const TX_TYPE tx_type, + const TX_SIZE tx_size, + uint8_t *dst, int stride, + int reduced_tx_set) { + tran_low_t *const dqcoeff = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane]; + eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; + uint16_t scan_line = eob_data->max_scan_line; + uint16_t eob = eob_data->eob; + av1_inverse_transform_block(&dcb->xd, dqcoeff, plane, tx_type, tx_size, dst, + stride, eob, reduced_tx_set); + memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0])); +} + +static AOM_INLINE void read_coeffs_tx_intra_block( + const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, + const int plane, const int row, const int col, const TX_SIZE tx_size) { + MB_MODE_INFO *mbmi = dcb->xd.mi[0]; + if (!mbmi->skip_txfm) { +#if TXCOEFF_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + av1_read_coeffs_txb_facade(cm, dcb, r, plane, row, col, tx_size); +#if TXCOEFF_TIMER + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + cm->txcoeff_timer += elapsed_time; + ++cm->txb_count; +#endif + } +} + +static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm, + DecoderCodingBlock *dcb, + aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size) { + (void)cm; + (void)dcb; + (void)r; + (void)plane; + (void)row; + (void)col; + (void)tx_size; +} + +static AOM_INLINE void predict_inter_block_void(AV1_COMMON *const cm, + DecoderCodingBlock *dcb, + BLOCK_SIZE bsize) { + (void)cm; + (void)dcb; + (void)bsize; +} + +static AOM_INLINE void cfl_store_inter_block_void(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + (void)cm; + (void)xd; +} + +static AOM_INLINE void predict_and_reconstruct_intra_block( + const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, + const int plane, const int row, const int col, const TX_SIZE tx_size) { + (void)r; + MACROBLOCKD *const xd = &dcb->xd; + MB_MODE_INFO *mbmi = xd->mi[0]; + PLANE_TYPE plane_type = get_plane_type(plane); + + av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + + if (!mbmi->skip_txfm) { + eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; + if (eob_data->eob) { + const bool reduced_tx_set_used = cm->features.reduced_tx_set_used; + // tx_type was read out in av1_read_coeffs_txb. + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size, + reduced_tx_set_used); + struct macroblockd_plane *const pd = &xd->plane[plane]; + uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; + inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride, + reduced_tx_set_used); + } + } + if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) { + cfl_store_tx(xd, row, col, tx_size, mbmi->bsize); + } +} + +static AOM_INLINE void inverse_transform_inter_block( + const AV1_COMMON *const cm, DecoderCodingBlock *dcb, aom_reader *const r, + const int plane, const int blk_row, const int blk_col, + const TX_SIZE tx_size) { + (void)r; + MACROBLOCKD *const xd = &dcb->xd; + PLANE_TYPE plane_type = get_plane_type(plane); + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const bool reduced_tx_set_used = cm->features.reduced_tx_set_used; + // tx_type was read out in av1_read_coeffs_txb. 
+ const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, + tx_size, reduced_tx_set_used); + + uint8_t *dst = + &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + inverse_transform_block(dcb, plane, tx_type, tx_size, dst, pd->dst.stride, + reduced_tx_set_used); +#if CONFIG_MISMATCH_DEBUG + int pixel_c, pixel_r; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + int blk_w = block_size_wide[bsize]; + int blk_h = block_size_high[bsize]; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint, + plane, pixel_c, pixel_r, blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); +#endif +} + +static AOM_INLINE void set_cb_buffer_offsets(DecoderCodingBlock *dcb, + TX_SIZE tx_size, int plane) { + dcb->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size]; + dcb->txb_offset[plane] = + dcb->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); +} + +static AOM_INLINE void decode_reconstruct_tx( + AV1_COMMON *cm, ThreadData *const td, aom_reader *r, + MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row, + int blk_col, int block, TX_SIZE tx_size, int *eob_total) { + DecoderCodingBlock *const dcb = &td->dcb; + MACROBLOCKD *const xd = &dcb->xd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + // Scale to match transform block unit. 
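+  // Note on the recursion below (illustrative example): for a luma block
+  // signalled with transforms smaller than tx_size, the else-branch splits
+  // into sub_tx_size_map[tx_size] children and recurses, e.g. a 32x32 call
+  // with 16x16 signalled transforms visits four 16x16 blocks in raster
+  // order. Chroma always takes the non-split path because plane_tx_size is
+  // already the maximum UV transform size.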
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (tx_size == plane_tx_size || plane) { + td->read_coeffs_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col, + tx_size); + + td->inverse_tx_inter_block_visit(cm, dcb, r, plane, blk_row, blk_col, + tx_size); + eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; + *eob_total += eob_data->eob; + set_cb_buffer_offsets(dcb, tx_size, plane); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); + assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int sub_step = bsw * bsh; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + + assert(bsw > 0 && bsh > 0); + + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { + const int offsetc = blk_col + col; + + decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr, + offsetc, block, sub_txs, eob_total); + block += sub_step; + } + } + } +} + +static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int bw, int bh, int x_mis, int y_mis) { + const int num_planes = av1_num_planes(cm); + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const TileInfo *const tile = &xd->tile; + + set_mi_offsets(mi_params, xd, mi_row, mi_col); + xd->mi[0]->bsize = bsize; +#if CONFIG_RD_DEBUG + xd->mi[0]->mi_row = mi_row; + xd->mi[0]->mi_col = mi_col; +#endif + + assert(x_mis && y_mis); + for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0]; + int idx = mi_params->mi_stride; + for (int y = 1; y < y_mis; ++y) { + memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0])); + idx += mi_params->mi_stride; + } + + set_plane_n4(xd, bw, bh, num_planes); + set_entropy_context(xd, mi_row, mi_col, num_planes); + + // Distance of Mb to the various image edges. 
These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); +} + +static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi, + DecoderCodingBlock *dcb, int mi_row, + int mi_col, aom_reader *r, + PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + const SequenceHeader *const seq_params = cm->seq_params; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col); + const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row); + MACROBLOCKD *const xd = &dcb->xd; + +#if CONFIG_ACCOUNTING + aom_accounting_set_context(&pbi->accounting, mi_col, mi_row); +#endif + set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); + xd->mi[0]->partition = partition; + av1_read_mode_info(pbi, dcb, r, x_mis, y_mis); + if (bsize >= BLOCK_8X8 && + (seq_params->subsampling_x || seq_params->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + av1_ss_size_lookup[bsize][seq_params->subsampling_x] + [seq_params->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } +} + +typedef struct PadBlock { + int x0; + int x1; + int y0; + int y1; +} PadBlock; + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void highbd_build_mc_border(const uint8_t *src8, + int src_stride, uint8_t *dst8, + int dst_stride, int x, int y, + int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) left = b_w; + + if (x + b_w > w) right = x + b_w - w; + + if (right > b_w) right = b_w; + + copy = b_w - left - right; + + if (left) aom_memset16(dst, ref_row[0], left); + + if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); + + if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static AOM_INLINE void build_mc_border(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int x, + int y, int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint8_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? 
-x : 0; + + if (left > b_w) left = b_w; + + if (x + b_w > w) right = x + b_w - w; + + if (right > b_w) right = b_w; + + copy = b_w - left - right; + + if (left) memset(dst, ref_row[0], left); + + if (copy) memcpy(dst + left, ref_row + x + left, copy); + + if (right) memset(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); +} + +static INLINE int update_extend_mc_border_params( + const struct scale_factors *const sf, struct buf_2d *const pre_buf, + MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv, + int do_warp, int is_intrabc, int *x_pad, int *y_pad) { + const int is_scaled = av1_is_scaled(sf); + // Get reference width and height. + int frame_width = pre_buf->width; + int frame_height = pre_buf->height; + + // Do border extension if there is motion or + // width/height is not a multiple of 8 pixels. + if ((!is_intrabc) && (!do_warp) && + (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || + (frame_height & 0x7))) { + if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) { + block->x0 -= AOM_INTERP_EXTEND - 1; + block->x1 += AOM_INTERP_EXTEND; + *x_pad = 1; + } + + if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) { + block->y0 -= AOM_INTERP_EXTEND - 1; + block->y1 += AOM_INTERP_EXTEND; + *y_pad = 1; + } + + // Skip border extension if block is inside the frame. + if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 || + block->y1 > frame_height - 1) { + return 1; + } + } + return 0; +} + +static INLINE void extend_mc_border(const struct scale_factors *const sf, + struct buf_2d *const pre_buf, + MV32 scaled_mv, PadBlock block, + int subpel_x_mv, int subpel_y_mv, + int do_warp, int is_intrabc, int highbd, + uint8_t *mc_buf, uint8_t **pre, + int *src_stride) { + int x_pad = 0, y_pad = 0; + if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block, + subpel_x_mv, subpel_y_mv, do_warp, + is_intrabc, &x_pad, &y_pad)) { + // Get reference block pointer. + const uint8_t *const buf_ptr = + pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0; + int buf_stride = pre_buf->stride; + const int b_w = block.x1 - block.x0; + const int b_h = block.y1 - block.y0; + +#if CONFIG_AV1_HIGHBITDEPTH + // Extend the border. 
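+    // The helpers below copy the b_w x b_h reference region into the scratch
+    // buffer mc_buf, replicating the frame's first/last row and column for any
+    // part of the block that lies outside the reference frame. b_w doubles as
+    // the destination stride, and *pre is then repositioned past the
+    // interpolation padding that was added above.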
+ if (highbd) { + highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, + block.y0, b_w, b_h, pre_buf->width, + pre_buf->height); + } else { + build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, + b_h, pre_buf->width, pre_buf->height); + } +#else + (void)highbd; + build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, + b_h, pre_buf->width, pre_buf->height); +#endif + *src_stride = b_w; + *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w + + x_pad * (AOM_INTERP_EXTEND - 1); + } +} + +static AOM_INLINE void dec_calc_subpel_params( + const MV *const src_mv, InterPredParams *const inter_pred_params, + const MACROBLOCKD *const xd, int mi_x, int mi_y, uint8_t **pre, + SubpelParams *subpel_params, int *src_stride, PadBlock *block, + MV32 *scaled_mv, int *subpel_x_mv, int *subpel_y_mv) { + const struct scale_factors *sf = inter_pred_params->scale_factors; + struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf; + const int bw = inter_pred_params->block_width; + const int bh = inter_pred_params->block_height; + const int is_scaled = av1_is_scaled(sf); + if (is_scaled) { + int ssx = inter_pred_params->subsampling_x; + int ssy = inter_pred_params->subsampling_y; + int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS; + orig_pos_y += src_mv->row * (1 << (1 - ssy)); + int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS; + orig_pos_x += src_mv->col * (1 << (1 - ssx)); + int pos_y = av1_scaled_y(orig_pos_y, sf); + int pos_x = av1_scaled_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; + subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; + subpel_params->xs = sf->x_step_q4; + subpel_params->ys = sf->y_step_q4; + + // Get reference block top left coordinate. + block->x0 = pos_x >> SCALE_SUBPEL_BITS; + block->y0 = pos_y >> SCALE_SUBPEL_BITS; + + // Get reference block bottom right coordinate. + block->x1 = + ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1; + block->y1 = + ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1; + + MV temp_mv; + temp_mv = clamp_mv_to_umv_border_sb(xd, src_mv, bw, bh, + inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y); + *scaled_mv = av1_scale_mv(&temp_mv, mi_x, mi_y, sf); + scaled_mv->row += SCALE_EXTRA_OFF; + scaled_mv->col += SCALE_EXTRA_OFF; + + *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK; + *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK; + } else { + // Get block position in current frame. + int pos_x = inter_pred_params->pix_col << SUBPEL_BITS; + int pos_y = inter_pred_params->pix_row << SUBPEL_BITS; + + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, src_mv, bw, bh, inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y); + subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; + subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + + // Get reference block top left coordinate. 
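+      // mv_q4 and pos_x/pos_y are in 1/16th-pel units of this plane's sample
+      // grid at this point; the SUBPEL_BITS shifts below convert them to whole
+      // sample coordinates, with x1/y1 being exclusive bottom-right bounds.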
+ pos_x += mv_q4.col; + pos_y += mv_q4.row; + block->x0 = pos_x >> SUBPEL_BITS; + block->y0 = pos_y >> SUBPEL_BITS; + + // Get reference block bottom right coordinate. + block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1; + block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1; + + scaled_mv->row = mv_q4.row; + scaled_mv->col = mv_q4.col; + *subpel_x_mv = scaled_mv->col & SUBPEL_MASK; + *subpel_y_mv = scaled_mv->row & SUBPEL_MASK; + } + *pre = pre_buf->buf0 + block->y0 * pre_buf->stride + block->x0; + *src_stride = pre_buf->stride; +} + +static AOM_INLINE void dec_calc_subpel_params_and_extend( + const MV *const src_mv, InterPredParams *const inter_pred_params, + MACROBLOCKD *const xd, int mi_x, int mi_y, int ref, uint8_t **mc_buf, + uint8_t **pre, SubpelParams *subpel_params, int *src_stride) { + PadBlock block; + MV32 scaled_mv; + int subpel_x_mv, subpel_y_mv; + dec_calc_subpel_params(src_mv, inter_pred_params, xd, mi_x, mi_y, pre, + subpel_params, src_stride, &block, &scaled_mv, + &subpel_x_mv, &subpel_y_mv); + extend_mc_border( + inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf, + scaled_mv, block, subpel_x_mv, subpel_y_mv, + inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc, + inter_pred_params->use_hbd_buf, mc_buf[ref], pre, src_stride); +} + +#define IS_DEC 1 +#include "av1/common/reconinter_template.inc" +#undef IS_DEC + +static void dec_build_inter_predictors(const AV1_COMMON *cm, + DecoderCodingBlock *dcb, int plane, + const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, + int mi_x, int mi_y) { + build_inter_predictors(cm, &dcb->xd, plane, mi, build_for_obmc, bw, bh, mi_x, + mi_y, dcb->mc_buf); +} + +static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm, + DecoderCodingBlock *dcb, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &dcb->xd; + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + dec_build_inter_predictors(cm, dcb, plane, xd->mi[0], 0, + xd->plane[plane].width, xd->plane[plane].height, + mi_x, mi_y); + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf, + xd->plane[2].dst.buf }, + { xd->plane[0].dst.stride, xd->plane[1].dst.stride, + xd->plane[2].dst.stride } }; + av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride, &ctx, plane, + bsize); + } + } +} + +static INLINE void dec_build_prediction_by_above_pred( + MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int above_mi_col = xd->mi_col + rel_mi_col; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *above_mbmi; + + (void)rel_mi_row; + (void)dir; + + av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, op_mi_size, + &backup_mbmi, ctxt, num_planes); + mi_x = above_mi_col << MI_SIZE_LOG2; + mi_y = xd->mi_row << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) 
continue; + dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j, + &backup_mbmi, 1, bw, bh, mi_x, mi_y); + } +} + +static AOM_INLINE void dec_build_prediction_by_above_preds( + const AV1_COMMON *cm, DecoderCodingBlock *dcb, + uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) { + MACROBLOCKD *const xd = &dcb->xd; + if (!xd->up_available) return; + + // Adjust mb_to_bottom_edge to have the correct value for the OBMC + // prediction block. This is half the height of the original block, + // except for 128-wide blocks, where we only use a height of 32. + const int this_height = xd->height * MI_SIZE; + const int pred_height = AOMMIN(this_height / 2, 32); + xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height); + struct build_prediction_ctxt ctxt = { + cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, dcb + }; + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + dec_build_prediction_by_above_pred, &ctxt); + + xd->mb_to_left_edge = -GET_MV_SUBPEL(xd->mi_col * MI_SIZE); + xd->mb_to_right_edge = ctxt.mb_to_far_edge; + xd->mb_to_bottom_edge -= GET_MV_SUBPEL(this_height - pred_height); +} + +static INLINE void dec_build_prediction_by_left_pred( + MACROBLOCKD *const xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int left_mi_row = xd->mi_row + rel_mi_row; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *left_mbmi; + + (void)rel_mi_col; + (void)dir; + + av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, op_mi_size, + &backup_mbmi, ctxt, num_planes); + mi_x = xd->mi_col << MI_SIZE_LOG2; + mi_y = left_mi_row << MI_SIZE_LOG2; + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + dec_build_inter_predictors(ctxt->cm, (DecoderCodingBlock *)ctxt->dcb, j, + &backup_mbmi, 1, bw, bh, mi_x, mi_y); + } +} + +static AOM_INLINE void dec_build_prediction_by_left_preds( + const AV1_COMMON *cm, DecoderCodingBlock *dcb, + uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) { + MACROBLOCKD *const xd = &dcb->xd; + if (!xd->left_available) return; + + // Adjust mb_to_right_edge to have the correct value for the OBMC + // prediction block. This is half the width of the original block, + // except for 128-wide blocks, where we only use a width of 32. 
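+  // For example, a 64x64 block gives this_width = 64 and pred_width = 32, so
+  // mb_to_right_edge is temporarily advanced by 32 pixels (in 1/8th-pel units
+  // via GET_MV_SUBPEL) and restored once the left neighbors have been
+  // processed below.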
+ const int this_width = xd->width * MI_SIZE; + const int pred_width = AOMMIN(this_width / 2, 32); + xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width); + + struct build_prediction_ctxt ctxt = { + cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, dcb + }; + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + dec_build_prediction_by_left_pred, &ctxt); + + xd->mb_to_top_edge = -GET_MV_SUBPEL(xd->mi_row * MI_SIZE); + xd->mb_to_right_edge -= GET_MV_SUBPEL(this_width - pred_width); + xd->mb_to_bottom_edge = ctxt.mb_to_far_edge; +} + +static AOM_INLINE void dec_build_obmc_inter_predictors_sb( + const AV1_COMMON *cm, DecoderCodingBlock *dcb) { + const int num_planes = av1_num_planes(cm); + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + + MACROBLOCKD *const xd = &dcb->xd; + av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2); + + dec_build_prediction_by_above_preds(cm, dcb, dst_buf1, dst_width1, + dst_height1, dst_stride1); + dec_build_prediction_by_left_preds(cm, dcb, dst_buf2, dst_width2, dst_height2, + dst_stride2); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row, + mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2, + dst_stride2); +} + +static AOM_INLINE void cfl_store_inter_block(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + if (store_cfl_required(cm, xd)) { + cfl_store_block(xd, mbmi->bsize, mbmi->tx_size); + } +} + +static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm, + DecoderCodingBlock *dcb, + BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &dcb->xd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + if (frame < LAST_FRAME) { + assert(is_intrabc_block(mbmi)); + assert(frame == INTRA_FRAME); + assert(ref == 0); + } else { + const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, frame); + + xd->block_ref_scale_factors[ref] = ref_scale_factors; + av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col, + ref_scale_factors, num_planes); + } + } + + dec_build_inter_predictor(cm, dcb, mi_row, mi_col, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) { + dec_build_obmc_inter_predictors_sb(cm, dcb); + } +#if CONFIG_MISMATCH_DEBUG + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x, + pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, + 
                             cm->current_frame.order_hint, plane, pixel_c,
+                             pixel_r, pd->width, pd->height,
+                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+  }
+#endif
+}
+
+static AOM_INLINE void set_color_index_map_offset(MACROBLOCKD *const xd,
+                                                  int plane, aom_reader *r) {
+  (void)r;
+  Av1ColorMapParam params;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  av1_get_block_dimensions(mbmi->bsize, plane, xd, &params.plane_width,
+                           &params.plane_height, NULL, NULL);
+  xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
+}
+
+static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi,
+                                                ThreadData *const td,
+                                                aom_reader *r,
+                                                BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &pbi->common;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *mbmi = xd->mi[0];
+
+  if (!is_inter_block(mbmi)) {
+    int row, col;
+    assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+                                         xd->plane[0].subsampling_y));
+    const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+    const int max_blocks_high = max_block_high(xd, bsize, 0);
+    const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+    int mu_blocks_high = mi_size_high[max_unit_bsize];
+    mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+    mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+    for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+      for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+        for (int plane = 0; plane < num_planes; ++plane) {
+          if (plane && !xd->is_chroma_ref) break;
+          const struct macroblockd_plane *const pd = &xd->plane[plane];
+          const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+          const int stepr = tx_size_high_unit[tx_size];
+          const int stepc = tx_size_wide_unit[tx_size];
+
+          const int unit_height = ROUND_POWER_OF_TWO(
+              AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
+          const int unit_width = ROUND_POWER_OF_TWO(
+              AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+
+          for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+               blk_row += stepr) {
+            for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+                 blk_col += stepc) {
+              td->read_coeffs_tx_intra_block_visit(cm, dcb, r, plane, blk_row,
+                                                   blk_col, tx_size);
+              td->predict_and_recon_intra_block_visit(
+                  cm, dcb, r, plane, blk_row, blk_col, tx_size);
+              set_cb_buffer_offsets(dcb, tx_size, plane);
+            }
+          }
+        }
+      }
+    }
+  } else {
+    td->predict_inter_block_visit(cm, dcb, bsize);
+    // Reconstruction
+    if (!mbmi->skip_txfm) {
+      int eobtotal = 0;
+
+      const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+      const int max_blocks_high = max_block_high(xd, bsize, 0);
+      int row, col;
+
+      const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+      assert(max_unit_bsize ==
+             get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
+                                  xd->plane[0].subsampling_y));
+      int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+      int mu_blocks_high = mi_size_high[max_unit_bsize];
+
+      mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+      mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+      for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+        for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+          for (int plane = 0; plane < num_planes; ++plane) {
+            if (plane && !xd->is_chroma_ref) break;
+            const struct macroblockd_plane *const pd = &xd->plane[plane];
+            const int ss_x = pd->subsampling_x;
+            const int ss_y = pd->subsampling_y;
+            const BLOCK_SIZE
plane_bsize = + get_plane_block_size(bsize, ss_x, ss_y); + const TX_SIZE max_tx_size = + get_vartx_max_txsize(xd, plane_bsize, plane); + const int bh_var_tx = tx_size_high_unit[max_tx_size]; + const int bw_var_tx = tx_size_wide_unit[max_tx_size]; + int block = 0; + int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + int blk_row, blk_col; + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), ss_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), ss_x); + + for (blk_row = row >> ss_y; blk_row < unit_height; + blk_row += bh_var_tx) { + for (blk_col = col >> ss_x; blk_col < unit_width; + blk_col += bw_var_tx) { + decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, + blk_row, blk_col, block, max_tx_size, + &eobtotal); + block += step; + } + } + } + } + } + } + td->cfl_store_inter_block_visit(cm, xd); + } + + av1_visit_palette(pbi, xd, r, set_color_index_map_offset); +} + +static AOM_INLINE void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2, + int tx_w_log2, int tx_h_log2, + int min_txs, int split_size, int txs, + int blk_row, int blk_col) { + for (int idy = 0; idy < tx_size_high_unit[split_size]; + idy += tx_size_high_unit[min_txs]) { + for (int idx = 0; idx < tx_size_wide_unit[split_size]; + idx += tx_size_wide_unit[min_txs]) { + const int index = (((blk_row + idy) >> tx_h_log2) << stride_log2) + + ((blk_col + idx) >> tx_w_log2); + mbmi->inter_tx_size[index] = txs; + } + } +} + +static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, + int blk_row, int blk_col, + aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + int is_split = 0; + const BLOCK_SIZE bsize = mbmi->bsize; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + assert(tx_size > TX_4X4); + TX_SIZE txs = max_txsize_rect_lookup[bsize]; + for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + txs = sub_tx_size_map[txs]; + const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + const int bw_log2 = mi_size_wide_log2[bsize]; + const int stride_log2 = bw_log2 - tx_w_log2; + + if (depth == MAX_VARTX_DEPTH) { + set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, + tx_size, blk_row, blk_col); + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + return; + } + + const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, + mbmi->bsize, tx_size); + is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR); + + if (is_split) { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + + if (sub_txs == TX_4X4) { + set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, + sub_txs, blk_row, blk_col); + mbmi->tx_size = sub_txs; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, sub_txs, tx_size); + return; + } + + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = blk_row + row; + int offsetc = blk_col + col; + 
read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r); + } + } + } else { + set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, + tx_size, blk_row, blk_col); + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + } +} + +static TX_SIZE read_selected_tx_size(const MACROBLOCKD *const xd, + aom_reader *r) { + // TODO(debargha): Clean up the logic here. This function should only + // be called for intra. + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int ctx = get_tx_size_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx], + max_depths + 1, ACCT_STR); + assert(depth >= 0 && depth <= max_depths); + const TX_SIZE tx_size = depth_to_tx_size(depth, bsize); + return tx_size; +} + +static TX_SIZE read_tx_size(const MACROBLOCKD *const xd, TX_MODE tx_mode, + int is_inter, int allow_select_inter, + aom_reader *r) { + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4; + + if (block_signals_txsize(bsize)) { + if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) { + const TX_SIZE coded_tx_size = read_selected_tx_size(xd, r); + return coded_tx_size; + } else { + return tx_size_from_tx_mode(bsize, tx_mode); + } + } else { + assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4)); + return max_txsize_rect_lookup[bsize]; + } +} + +static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi, + ThreadData *const td, int mi_row, + int mi_col, aom_reader *r, + PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + DecoderCodingBlock *const dcb = &td->dcb; + MACROBLOCKD *const xd = &dcb->xd; + decode_mbmi_block(pbi, dcb, mi_row, mi_col, r, partition, bsize); + + av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens); + + AV1_COMMON *cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; + int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi); + if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + !mbmi->skip_txfm && inter_block_tx && !xd->lossless[mbmi->segment_id]) { + const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int width = mi_size_wide[bsize]; + const int height = mi_size_high[bsize]; + + for (int idy = 0; idy < height; idy += bh) + for (int idx = 0; idx < width; idx += bw) + read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r); + } else { + mbmi->tx_size = read_tx_size(xd, cm->features.tx_mode, inter_block_tx, + !mbmi->skip_txfm, r); + if (inter_block_tx) + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, + mbmi->skip_txfm && is_inter_block(mbmi), xd); + } + + if (cm->delta_q_info.delta_q_present_flag) { + for (int i = 0; i < MAX_SEGMENTS; i++) { + const int current_qindex = + av1_get_qindex(&cm->seg, i, xd->current_base_qindex); + const CommonQuantParams *const quant_params = &cm->quant_params; + for (int j = 0; j < num_planes; ++j) { + const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q + : (j == 1 ? quant_params->u_dc_delta_q + : quant_params->v_dc_delta_q); + const int ac_delta_q = j == 0 ? 0 + : (j == 1 ? 
quant_params->u_ac_delta_q + : quant_params->v_ac_delta_q); + xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX( + current_qindex, dc_delta_q, cm->seq_params->bit_depth); + xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX( + current_qindex, ac_delta_q, cm->seq_params->bit_depth); + } + } + } + if (mbmi->skip_txfm) av1_reset_entropy_context(xd, bsize, num_planes); + + decode_token_recon_block(pbi, td, r, bsize); +} + +static AOM_INLINE void set_offsets_for_pred_and_recon(AV1Decoder *const pbi, + ThreadData *const td, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + DecoderCodingBlock *const dcb = &td->dcb; + MACROBLOCKD *const xd = &dcb->xd; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int num_planes = av1_num_planes(cm); + + const int offset = mi_row * mi_params->mi_stride + mi_col; + const TileInfo *const tile = &xd->tile; + + xd->mi = mi_params->mi_grid_base + offset; + xd->tx_type_map = + &mi_params->tx_type_map[mi_row * mi_params->mi_stride + mi_col]; + xd->tx_type_map_stride = mi_params->mi_stride; + + set_plane_n4(xd, bw, bh, num_planes); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); +} + +static AOM_INLINE void decode_block(AV1Decoder *const pbi, ThreadData *const td, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + (void)partition; + set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize); + decode_token_recon_block(pbi, td, r, bsize); +} + +static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, + aom_reader *r, int has_rows, int has_cols, + BLOCK_SIZE bsize) { + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!has_rows && !has_cols) return PARTITION_SPLIT; + + assert(ctx >= 0); + aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx]; + if (has_rows && has_cols) { + return (PARTITION_TYPE)aom_read_symbol( + r, partition_cdf, partition_cdf_length(bsize), ACCT_STR); + } else if (!has_rows && has_cols) { + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_vert_alike(cdf, partition_cdf, bsize); + assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); + return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ; + } else { + assert(has_rows && !has_cols); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_horz_alike(cdf, partition_cdf, bsize); + assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); + return aom_read_cdf(r, cdf, 2, ACCT_STR) ? 
PARTITION_SPLIT : PARTITION_VERT; + } +} + +// TODO(slavarnway): eliminate bsize and subsize in future commits +static AOM_INLINE void decode_partition(AV1Decoder *const pbi, + ThreadData *const td, int mi_row, + int mi_col, aom_reader *reader, + BLOCK_SIZE bsize, + int parse_decode_flag) { + assert(bsize < BLOCK_SIZES_ALL); + AV1_COMMON *const cm = &pbi->common; + DecoderCodingBlock *const dcb = &td->dcb; + MACROBLOCKD *const xd = &dcb->xd; + const int bw = mi_size_wide[bsize]; + const int hbs = bw >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int quarter_step = bw / 4; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols; + + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + // parse_decode_flag takes the following values : + // 01 - do parse only + // 10 - do decode only + // 11 - do parse and decode + static const block_visitor_fn_t block_visit[4] = { NULL, parse_decode_block, + decode_block, + parse_decode_block }; + + if (parse_decode_flag & 1) { + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1; + + // Skip some unnecessary work if loop restoration is disabled + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1)) { + const int rstride = cm->rst_info[plane].horz_units; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = rcol + rrow * rstride; + loop_restoration_read_sb_coeffs(cm, xd, reader, plane, runit_idx); + } + } + } + } + + partition = (bsize < BLOCK_8X8) ? PARTITION_NONE + : read_partition(xd, mi_row, mi_col, reader, + has_rows, has_cols, bsize); + } else { + partition = get_partition(cm, mi_row, mi_col, bsize); + } + subsize = get_partition_subsize(bsize, partition); + if (subsize == BLOCK_INVALID) { + // When an internal error occurs ensure that xd->mi_row is set appropriately + // w.r.t. current tile, which is used to signal processing of current row is + // done. + xd->mi_row = mi_row; + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Partition is invalid for block size %dx%d", + block_size_wide[bsize], block_size_high[bsize]); + } + // Check the bitstream is conformant: if there is subsampling on the + // chroma planes, subsize must subsample to a valid block size. + const struct macroblockd_plane *const pd_u = &xd->plane[1]; + if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) == + BLOCK_INVALID) { + // When an internal error occurs ensure that xd->mi_row is set appropriately + // w.r.t. current tile, which is used to signal processing of current row is + // done. 
+ xd->mi_row = mi_row; + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Block size %dx%d invalid with this subsampling mode", + block_size_wide[subsize], block_size_high[subsize]); + } + +#define DEC_BLOCK_STX_ARG +#define DEC_BLOCK_EPT_ARG partition, +#define DEC_BLOCK(db_r, db_c, db_subsize) \ + block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \ + reader, DEC_BLOCK_EPT_ARG(db_subsize)) +#define DEC_PARTITION(db_r, db_c, db_subsize) \ + decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \ + (db_subsize), parse_decode_flag) + + switch (partition) { + case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break; + case PARTITION_HORZ: + DEC_BLOCK(mi_row, mi_col, subsize); + if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize); + break; + case PARTITION_VERT: + DEC_BLOCK(mi_row, mi_col, subsize); + if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize); + break; + case PARTITION_SPLIT: + DEC_PARTITION(mi_row, mi_col, subsize); + DEC_PARTITION(mi_row, mi_col + hbs, subsize); + DEC_PARTITION(mi_row + hbs, mi_col, subsize); + DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize); + break; + case PARTITION_HORZ_A: + DEC_BLOCK(mi_row, mi_col, bsize2); + DEC_BLOCK(mi_row, mi_col + hbs, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col, subsize); + break; + case PARTITION_HORZ_B: + DEC_BLOCK(mi_row, mi_col, subsize); + DEC_BLOCK(mi_row + hbs, mi_col, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); + break; + case PARTITION_VERT_A: + DEC_BLOCK(mi_row, mi_col, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col, bsize2); + DEC_BLOCK(mi_row, mi_col + hbs, subsize); + break; + case PARTITION_VERT_B: + DEC_BLOCK(mi_row, mi_col, subsize); + DEC_BLOCK(mi_row, mi_col + hbs, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); + break; + case PARTITION_HORZ_4: + for (int i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break; + DEC_BLOCK(this_mi_row, mi_col, subsize); + } + break; + case PARTITION_VERT_4: + for (int i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break; + DEC_BLOCK(mi_row, this_mi_col, subsize); + } + break; + default: assert(0 && "Invalid partition type"); + } + +#undef DEC_PARTITION +#undef DEC_BLOCK +#undef DEC_BLOCK_EPT_ARG +#undef DEC_BLOCK_STX_ARG + + if (parse_decode_flag & 1) + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +static AOM_INLINE void setup_bool_decoder( + MACROBLOCKD *const xd, const uint8_t *data, const uint8_t *data_end, + const size_t read_size, struct aom_internal_error_info *error_info, + aom_reader *r, uint8_t allow_update_cdf) { + // Validate the calculated partition length. If the buffer + // described by the partition can't be fully read, then restrict + // it to the portion that can be (for EC mode) or throw an error. + if (!read_is_valid(data, read_size, data_end)) { + // When internal error occurs ensure that xd->mi_row is set appropriately + // w.r.t. current tile, which is used to signal processing of current row is + // done in row-mt decoding. + xd->mi_row = xd->tile.mi_row_start; + + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + } + if (aom_reader_init(r, data, read_size)) { + // When internal error occurs ensure that xd->mi_row is set appropriately + // w.r.t. current tile, which is used to signal processing of current row is + // done in row-mt decoding. 
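+    // aom_internal_error() raises the error via longjmp rather than returning,
+    // so the row marker must be set before the call; row-mt worker threads
+    // would otherwise not see that this tile row has been handled.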
+ xd->mi_row = xd->tile.mi_row_start; + + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate bool decoder %d", 1); + } + + r->allow_update_cdf = allow_update_cdf; +} + +static AOM_INLINE void setup_segmentation(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb) { + struct segmentation *const seg = &cm->seg; + + seg->update_map = 0; + seg->update_data = 0; + seg->temporal_update = 0; + + seg->enabled = aom_rb_read_bit(rb); + if (!seg->enabled) { + if (cm->cur_frame->seg_map) { + memset(cm->cur_frame->seg_map, 0, + (cm->cur_frame->mi_rows * cm->cur_frame->mi_cols)); + } + + memset(seg, 0, sizeof(*seg)); + segfeatures_copy(&cm->cur_frame->seg, seg); + return; + } + if (cm->seg.enabled && cm->prev_frame && + (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) && + (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) { + cm->last_frame_seg_map = cm->prev_frame->seg_map; + } else { + cm->last_frame_seg_map = NULL; + } + // Read update flags + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + // These frames can't use previous frames, so must signal map + features + seg->update_map = 1; + seg->temporal_update = 0; + seg->update_data = 1; + } else { + seg->update_map = aom_rb_read_bit(rb); + if (seg->update_map) { + seg->temporal_update = aom_rb_read_bit(rb); + } else { + seg->temporal_update = 0; + } + seg->update_data = aom_rb_read_bit(rb); + } + + // Segmentation data update + if (seg->update_data) { + av1_clearall_segfeatures(seg); + + for (int i = 0; i < MAX_SEGMENTS; i++) { + for (int j = 0; j < SEG_LVL_MAX; j++) { + int data = 0; + const int feature_enabled = aom_rb_read_bit(rb); + if (feature_enabled) { + av1_enable_segfeature(seg, i, j); + + const int data_max = av1_seg_feature_data_max(j); + const int data_min = -data_max; + const int ubits = get_unsigned_bits(data_max); + + if (av1_is_segfeature_signed(j)) { + data = aom_rb_read_inv_signed_literal(rb, ubits); + } else { + data = aom_rb_read_literal(rb, ubits); + } + + data = clamp(data, data_min, data_max); + } + av1_set_segdata(seg, i, j, data); + } + } + av1_calculate_segdata(seg); + } else if (cm->prev_frame) { + segfeatures_copy(seg, &cm->prev_frame->seg); + } + segfeatures_copy(&cm->cur_frame->seg, seg); +} + +static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + assert(!cm->features.all_lossless); + const int num_planes = av1_num_planes(cm); + if (cm->features.allow_intrabc) return; + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = &cm->rst_info[p]; + if (aom_rb_read_bit(rb)) { + rsi->frame_restoration_type = + aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER; + } else { + rsi->frame_restoration_type = + aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE; + } + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; + } + } + if (!all_none) { + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 
128 : 64; + + for (int p = 0; p < num_planes; ++p) + cm->rst_info[p].restoration_unit_size = sb_size; + + RestorationInfo *rsi = &cm->rst_info[0]; + + if (sb_size == 64) { + rsi->restoration_unit_size <<= aom_rb_read_bit(rb); + } + if (rsi->restoration_unit_size > 64) { + rsi->restoration_unit_size <<= aom_rb_read_bit(rb); + } + } else { + const int size = RESTORATION_UNITSIZE_MAX; + for (int p = 0; p < num_planes; ++p) + cm->rst_info[p].restoration_unit_size = size; + } + + if (num_planes > 1) { + int s = + AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); + if (s && !chroma_none) { + cm->rst_info[1].restoration_unit_size = + cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s); + } else { + cm->rst_info[1].restoration_unit_size = + cm->rst_info[0].restoration_unit_size; + } + cm->rst_info[2].restoration_unit_size = + cm->rst_info[1].restoration_unit_size; + } +} + +static AOM_INLINE void read_wiener_filter(int wiener_win, + WienerInfo *wiener_info, + WienerInfo *ref_wiener_info, + aom_reader *rb) { + memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter)); + memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter)); + + if (wiener_win == WIENER_WIN) + wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) + + WIENER_FILT_TAP0_MINV; + else + wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = 0; + wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) + + WIENER_FILT_TAP1_MINV; + wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) + + WIENER_FILT_TAP2_MINV; + // The central element has an implicit +WIENER_FILT_STEP + wiener_info->vfilter[WIENER_HALFWIN] = + -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] + + wiener_info->vfilter[2]); + + if (wiener_win == WIENER_WIN) + wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) + + WIENER_FILT_TAP0_MINV; + else + wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = 0; + wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) + + WIENER_FILT_TAP1_MINV; + wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) + + WIENER_FILT_TAP2_MINV; + // The central element has an implicit +WIENER_FILT_STEP + wiener_info->hfilter[WIENER_HALFWIN] = + -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] + + wiener_info->hfilter[2]); + memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); +} + +static AOM_INLINE void 
read_sgrproj_filter(SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info, + aom_reader *rb) { + sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR); + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + sgrproj_info->xqd[0] = 0; + sgrproj_info->xqd[1] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + + SGRPROJ_PRJ_MIN1; + } else if (params->r[1] == 0) { + sgrproj_info->xqd[0] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + + SGRPROJ_PRJ_MIN0; + sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0], + SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); + } else { + sgrproj_info->xqd[0] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + + SGRPROJ_PRJ_MIN0; + sgrproj_info->xqd[1] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + + SGRPROJ_PRJ_MIN1; + } + + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); +} + +static AOM_INLINE void loop_restoration_read_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, + int runit_idx) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationUnitInfo *rui = &rsi->unit_info[runit_idx]; + assert(rsi->frame_restoration_type != RESTORE_NONE); + + assert(!cm->features.all_lossless); + + const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN; + WienerInfo *wiener_info = xd->wiener_info + plane; + SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane; + + if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) { + rui->restoration_type = + aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES, ACCT_STR); + switch (rui->restoration_type) { + case RESTORE_WIENER: + read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); + break; + case RESTORE_SGRPROJ: + read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r); + break; + default: assert(rui->restoration_type == RESTORE_NONE); break; + } + } else if (rsi->frame_restoration_type == RESTORE_WIENER) { + if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) { + rui->restoration_type = RESTORE_WIENER; + read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); + } else { + rui->restoration_type = RESTORE_NONE; + } + } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) { + if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) { + rui->restoration_type = RESTORE_SGRPROJ; + read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r); + } else { + rui->restoration_type = RESTORE_NONE; + } + } +} + +static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + const int num_planes = av1_num_planes(cm); + struct loopfilter *lf = &cm->lf; + + if (cm->features.allow_intrabc || cm->features.coded_lossless) { + // write default deltas to frame buffer + av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); + av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); + return; + } + assert(!cm->features.coded_lossless); + if (cm->prev_frame) { + // write deltas to frame buffer + memcpy(lf->ref_deltas, 
cm->prev_frame->ref_deltas, REF_FRAMES); + memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); + } else { + av1_set_default_ref_deltas(lf->ref_deltas); + av1_set_default_mode_deltas(lf->mode_deltas); + } + lf->filter_level[0] = aom_rb_read_literal(rb, 6); + lf->filter_level[1] = aom_rb_read_literal(rb, 6); + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + lf->filter_level_u = aom_rb_read_literal(rb, 6); + lf->filter_level_v = aom_rb_read_literal(rb, 6); + } + } + lf->sharpness_level = aom_rb_read_literal(rb, 3); + + // Read in loop filter deltas applied at the MB level based on mode or ref + // frame. + lf->mode_ref_delta_update = 0; + + lf->mode_ref_delta_enabled = aom_rb_read_bit(rb); + if (lf->mode_ref_delta_enabled) { + lf->mode_ref_delta_update = aom_rb_read_bit(rb); + if (lf->mode_ref_delta_update) { + for (int i = 0; i < REF_FRAMES; i++) + if (aom_rb_read_bit(rb)) + lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); + + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) + if (aom_rb_read_bit(rb)) + lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); + } + } + + // write deltas to frame buffer + memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES); + memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS); +} + +static AOM_INLINE void setup_cdef(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + const int num_planes = av1_num_planes(cm); + CdefInfo *const cdef_info = &cm->cdef_info; + + if (cm->features.allow_intrabc) return; + cdef_info->cdef_damping = aom_rb_read_literal(rb, 2) + 3; + cdef_info->cdef_bits = aom_rb_read_literal(rb, 2); + cdef_info->nb_cdef_strengths = 1 << cdef_info->cdef_bits; + for (int i = 0; i < cdef_info->nb_cdef_strengths; i++) { + cdef_info->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS); + cdef_info->cdef_uv_strengths[i] = + num_planes > 1 ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) : 0; + } +} + +static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) { + return aom_rb_read_bit(rb) ? 
aom_rb_read_inv_signed_literal(rb, 6) : 0; +} + +static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params, + int num_planes, + bool separate_uv_delta_q, + struct aom_read_bit_buffer *rb) { + quant_params->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS); + quant_params->y_dc_delta_q = read_delta_q(rb); + if (num_planes > 1) { + int diff_uv_delta = 0; + if (separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb); + quant_params->u_dc_delta_q = read_delta_q(rb); + quant_params->u_ac_delta_q = read_delta_q(rb); + if (diff_uv_delta) { + quant_params->v_dc_delta_q = read_delta_q(rb); + quant_params->v_ac_delta_q = read_delta_q(rb); + } else { + quant_params->v_dc_delta_q = quant_params->u_dc_delta_q; + quant_params->v_ac_delta_q = quant_params->u_ac_delta_q; + } + } else { + quant_params->u_dc_delta_q = 0; + quant_params->u_ac_delta_q = 0; + quant_params->v_dc_delta_q = 0; + quant_params->v_ac_delta_q = 0; + } + quant_params->using_qmatrix = aom_rb_read_bit(rb); + if (quant_params->using_qmatrix) { + quant_params->qmatrix_level_y = aom_rb_read_literal(rb, QM_LEVEL_BITS); + quant_params->qmatrix_level_u = aom_rb_read_literal(rb, QM_LEVEL_BITS); + if (!separate_uv_delta_q) + quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; + else + quant_params->qmatrix_level_v = aom_rb_read_literal(rb, QM_LEVEL_BITS); + } else { + quant_params->qmatrix_level_y = 0; + quant_params->qmatrix_level_u = 0; + quant_params->qmatrix_level_v = 0; + } +} + +// Build y/uv dequant values based on segmentation. +static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + const int bit_depth = cm->seq_params->bit_depth; + // When segmentation is disabled, only the first value is used. The + // remaining are don't cares. + const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1; + CommonQuantParams *const quant_params = &cm->quant_params; + for (int i = 0; i < max_segments; ++i) { + const int qindex = xd->qindex[i]; + quant_params->y_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, quant_params->y_dc_delta_q, bit_depth); + quant_params->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth); + quant_params->u_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, quant_params->u_dc_delta_q, bit_depth); + quant_params->u_dequant_QTX[i][1] = + av1_ac_quant_QTX(qindex, quant_params->u_ac_delta_q, bit_depth); + quant_params->v_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, quant_params->v_dc_delta_q, bit_depth); + quant_params->v_dequant_QTX[i][1] = + av1_ac_quant_QTX(qindex, quant_params->v_ac_delta_q, bit_depth); + const int use_qmatrix = av1_use_qmatrix(quant_params, xd, i); + // NB: depends on base index so there is only 1 set per frame + // No quant weighting when lossless or signalled not using QM + const int qmlevel_y = + use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + quant_params->y_iqmatrix[i][j] = + av1_iqmatrix(quant_params, qmlevel_y, AOM_PLANE_Y, j); + } + const int qmlevel_u = + use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + quant_params->u_iqmatrix[i][j] = + av1_iqmatrix(quant_params, qmlevel_u, AOM_PLANE_U, j); + } + const int qmlevel_v = + use_qmatrix ? 
quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + quant_params->v_iqmatrix[i][j] = + av1_iqmatrix(quant_params, qmlevel_v, AOM_PLANE_V, j); + } + } +} + +static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) { + return aom_rb_read_bit(rb) ? SWITCHABLE + : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS); +} + +static AOM_INLINE void setup_render_size(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + cm->render_width = cm->superres_upscaled_width; + cm->render_height = cm->superres_upscaled_height; + if (aom_rb_read_bit(rb)) + av1_read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height); +} + +// TODO(afergs): make "struct aom_read_bit_buffer *const rb"? +static AOM_INLINE void setup_superres(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb, + int *width, int *height) { + cm->superres_upscaled_width = *width; + cm->superres_upscaled_height = *height; + + const SequenceHeader *const seq_params = cm->seq_params; + if (!seq_params->enable_superres) return; + + if (aom_rb_read_bit(rb)) { + cm->superres_scale_denominator = + (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS); + cm->superres_scale_denominator += SUPERRES_SCALE_DENOMINATOR_MIN; + // Don't edit cm->width or cm->height directly, or the buffers won't get + // resized correctly + av1_calculate_scaled_superres_size(width, height, + cm->superres_scale_denominator); + } else { + // 1:1 scaling - ie. no scaling, scale not provided + cm->superres_scale_denominator = SCALE_NUMERATOR; + } +} + +static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width, + int height) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, + "Dimensions of %dx%d beyond allowed size of %dx%d.", + width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); +#endif + if (cm->width != width || cm->height != height) { + const int new_mi_rows = CEIL_POWER_OF_TWO(height, MI_SIZE_LOG2); + const int new_mi_cols = CEIL_POWER_OF_TWO(width, MI_SIZE_LOG2); + + // Allocations in av1_alloc_context_buffers() depend on individual + // dimensions as well as the overall size. + if (new_mi_cols > cm->mi_params.mi_cols || + new_mi_rows > cm->mi_params.mi_rows) { + if (av1_alloc_context_buffers(cm, width, height, BLOCK_4X4)) { + // The cm->mi_* values have been cleared and any existing context + // buffers have been freed. Clear cm->width and cm->height to be + // consistent and to force a realloc next time. 
+ cm->width = 0; + cm->height = 0; + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + } else { + cm->mi_params.set_mb_mi(&cm->mi_params, width, height, BLOCK_4X4); + } + av1_init_mi_buffers(&cm->mi_params); + cm->width = width; + cm->height = height; + } + + ensure_mv_buffer(cm->cur_frame, cm); + cm->cur_frame->width = cm->width; + cm->cur_frame->height = cm->height; +} + +static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) { + BufferPool *const pool = cm->buffer_pool; + const SequenceHeader *const seq_params = cm->seq_params; + + lock_buffer_pool(pool); + if (aom_realloc_frame_buffer( + &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment, + &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0, + 0)) { + unlock_buffer_pool(pool); + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + unlock_buffer_pool(pool); + + cm->cur_frame->buf.bit_depth = (unsigned int)seq_params->bit_depth; + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = + seq_params->transfer_characteristics; + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = + seq_params->chroma_sample_position; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; +} + +static AOM_INLINE void setup_frame_size(AV1_COMMON *cm, + int frame_size_override_flag, + struct aom_read_bit_buffer *rb) { + const SequenceHeader *const seq_params = cm->seq_params; + int width, height; + + if (frame_size_override_flag) { + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); + if (width > seq_params->max_frame_width || + height > seq_params->max_frame_height) { + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, + "Frame dimensions are larger than the maximum values"); + } + } else { + width = seq_params->max_frame_width; + height = seq_params->max_frame_height; + } + + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + setup_buffer_pool(cm); +} + +static AOM_INLINE void setup_sb_size(SequenceHeader *seq_params, + struct aom_read_bit_buffer *rb) { + set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64); +} + +static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth, + int ref_xss, int ref_yss, + aom_bit_depth_t this_bit_depth, + int this_xss, int this_yss) { + return ref_bit_depth == this_bit_depth && ref_xss == this_xss && + ref_yss == this_yss; +} + +static AOM_INLINE void setup_frame_size_with_refs( + AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + int width, height; + int found = 0; + int has_valid_ref_frame = 0; + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + if (aom_rb_read_bit(rb)) { + const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); + // This will never be NULL in a normal stream, as streams are required to + // have a shown keyframe before any inter frames, which would refresh all + // the reference buffers. 
However, it might be null if we're starting in + // the middle of a stream, and static analysis will error if we don't do + // a null check here. + if (ref_buf == NULL) { + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid condition: invalid reference buffer"); + } else { + const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf; + width = buf->y_crop_width; + height = buf->y_crop_height; + cm->render_width = buf->render_width; + cm->render_height = buf->render_height; + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + found = 1; + break; + } + } + } + + const SequenceHeader *const seq_params = cm->seq_params; + if (!found) { + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + + av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + } + + if (width <= 0 || height <= 0) + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid frame size"); + + // Check to make sure at least one of frames that this frame references + // has valid dimensions. + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); + has_valid_ref_frame |= + valid_ref_frame_size(ref_frame->buf.y_crop_width, + ref_frame->buf.y_crop_height, width, height); + } + if (!has_valid_ref_frame) + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, + "Referenced frame has invalid size"); + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); + if (!valid_ref_frame_img_fmt( + ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x, + ref_frame->buf.subsampling_y, seq_params->bit_depth, + seq_params->subsampling_x, seq_params->subsampling_y)) + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, + "Referenced frame has incompatible color format"); + } + setup_buffer_pool(cm); +} + +// Same function as av1_read_uniform but reading from uncompresses header wb +static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + const int v = aom_rb_read_literal(rb, l - 1); + assert(l != 0); + if (v < m) + return v; + else + return (v << 1) - m + aom_rb_read_bit(rb); +} + +static AOM_INLINE void read_tile_info_max_tile( + AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { + const SequenceHeader *const seq_params = cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + int width_sb = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); + int height_sb = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); + + av1_get_tile_limits(cm); + tiles->uniform_spacing = aom_rb_read_bit(rb); + + // Read tile columns + if (tiles->uniform_spacing) { + tiles->log2_cols = tiles->min_log2_cols; + while (tiles->log2_cols < tiles->max_log2_cols) { + if (!aom_rb_read_bit(rb)) { + break; + } + tiles->log2_cols++; + } + } else { + int i; + int start_sb; + for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) { + const int size_sb = + 1 + rb_read_uniform(rb, AOMMIN(width_sb, tiles->max_width_sb)); + tiles->col_start_sb[i] = start_sb; + start_sb += size_sb; + width_sb -= size_sb; + } + tiles->cols = i; + tiles->col_start_sb[i] = start_sb + width_sb; + } + av1_calculate_tile_cols(seq_params, cm->mi_params.mi_rows, + cm->mi_params.mi_cols, tiles); 
+ + // Read tile rows + if (tiles->uniform_spacing) { + tiles->log2_rows = tiles->min_log2_rows; + while (tiles->log2_rows < tiles->max_log2_rows) { + if (!aom_rb_read_bit(rb)) { + break; + } + tiles->log2_rows++; + } + } else { + int i; + int start_sb; + for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) { + const int size_sb = + 1 + rb_read_uniform(rb, AOMMIN(height_sb, tiles->max_height_sb)); + tiles->row_start_sb[i] = start_sb; + start_sb += size_sb; + height_sb -= size_sb; + } + tiles->rows = i; + tiles->row_start_sb[i] = start_sb + height_sb; + } + av1_calculate_tile_rows(seq_params, cm->mi_params.mi_rows, tiles); +} + +void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) { + cm->tiles.single_tile_decoding = 0; + if (cm->tiles.large_scale) { + struct loopfilter *lf = &cm->lf; + RestorationInfo *const rst_info = cm->rst_info; + const CdefInfo *const cdef_info = &cm->cdef_info; + + // Figure out single_tile_decoding by loopfilter_level. + const int no_loopfilter = !(lf->filter_level[0] || lf->filter_level[1]); + const int no_cdef = cdef_info->cdef_bits == 0 && + cdef_info->cdef_strengths[0] == 0 && + cdef_info->cdef_uv_strengths[0] == 0; + const int no_restoration = + rst_info[0].frame_restoration_type == RESTORE_NONE && + rst_info[1].frame_restoration_type == RESTORE_NONE && + rst_info[2].frame_restoration_type == RESTORE_NONE; + assert(IMPLIES(cm->features.coded_lossless, no_loopfilter && no_cdef)); + assert(IMPLIES(cm->features.all_lossless, no_restoration)); + cm->tiles.single_tile_decoding = no_loopfilter && no_cdef && no_restoration; + } +} + +static AOM_INLINE void read_tile_info(AV1Decoder *const pbi, + struct aom_read_bit_buffer *const rb) { + AV1_COMMON *const cm = &pbi->common; + + read_tile_info_max_tile(cm, rb); + + pbi->context_update_tile_id = 0; + if (cm->tiles.rows * cm->tiles.cols > 1) { + // tile to use for cdf update + pbi->context_update_tile_id = + aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols); + if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid context_update_tile_id"); + } + // tile size magnitude + pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; + } +} + +#if EXT_TILE_DEBUG +static AOM_INLINE void read_ext_tile_info( + AV1Decoder *const pbi, struct aom_read_bit_buffer *const rb) { + AV1_COMMON *const cm = &pbi->common; + + // This information is stored as a separate byte. + int mod = rb->bit_offset % CHAR_BIT; + if (mod > 0) aom_rb_read_literal(rb, CHAR_BIT - mod); + assert(rb->bit_offset % CHAR_BIT == 0); + + if (cm->tiles.cols * cm->tiles.rows > 1) { + // Read the number of bytes used to store tile size + pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1; + pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; + } +} +#endif // EXT_TILE_DEBUG + +static size_t mem_get_varsize(const uint8_t *src, int sz) { + switch (sz) { + case 1: return src[0]; + case 2: return mem_get_le16(src); + case 3: return mem_get_le24(src); + case 4: return mem_get_le32(src); + default: assert(0 && "Invalid size"); return -1; + } +} + +#if EXT_TILE_DEBUG +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. On return, '*data' is updated to point to the end of the +// raw tile buffer in the bit stream. 
+static AOM_INLINE void get_ls_tile_buffer( + const uint8_t *const data_end, struct aom_internal_error_info *error_info, + const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], + int tile_size_bytes, int col, int row, int tile_copy_mode) { + size_t size; + + size_t copy_size = 0; + const uint8_t *copy_data = NULL; + + if (!read_is_valid(*data, tile_size_bytes, data_end)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + size = mem_get_varsize(*data, tile_size_bytes); + + // If tile_copy_mode = 1, then the top bit of the tile header indicates copy + // mode. + if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) { + // The remaining bits in the top byte signal the row offset + int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f; + + // Currently, only use tiles in same column as reference tiles. + copy_data = tile_buffers[row - offset][col].data; + copy_size = tile_buffers[row - offset][col].size; + size = 0; + } else { + size += AV1_MIN_TILE_SIZE_BYTES; + } + + *data += tile_size_bytes; + + if (size > (size_t)(data_end - *data)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile size"); + + if (size > 0) { + tile_buffers[row][col].data = *data; + tile_buffers[row][col].size = size; + } else { + tile_buffers[row][col].data = copy_data; + tile_buffers[row][col].size = copy_size; + } + + *data += size; +} + +// Returns the end of the last tile buffer +// (tile_buffers[cm->tiles.rows - 1][cm->tiles.cols - 1]). +static const uint8_t *get_ls_tile_buffers( + AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { + AV1_COMMON *const cm = &pbi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const int have_tiles = tile_cols * tile_rows > 1; + const uint8_t *raw_data_end; // The end of the last tile buffer + + if (!have_tiles) { + const size_t tile_size = data_end - data; + tile_buffers[0][0].data = data; + tile_buffers[0][0].size = tile_size; + raw_data_end = NULL; + } else { + // We locate only the tile buffers that are required, which are the ones + // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always + // need the last (bottom right) tile buffer, as we need to know where the + // end of the compressed frame buffer is for proper superframe decoding. + + const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL }; + const uint8_t *const data_start = data; + + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int tile_rows_start = single_row ? dec_tile_row : 0; + const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + const int tile_cols_start = single_col ? dec_tile_col : 0; + const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + + const int tile_col_size_bytes = pbi->tile_col_size_bytes; + const int tile_size_bytes = pbi->tile_size_bytes; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_copy_mode = + ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 
1 : 0; + // Read tile column sizes for all columns (we need the last tile buffer) + for (int c = 0; c < tile_cols; ++c) { + const int is_last = c == tile_cols - 1; + size_t tile_col_size; + + if (!is_last) { + tile_col_size = mem_get_varsize(data, tile_col_size_bytes); + data += tile_col_size_bytes; + tile_col_data_end[c] = data + tile_col_size; + } else { + tile_col_size = data_end - data; + tile_col_data_end[c] = data_end; + } + data += tile_col_size; + } + + data = data_start; + + // Read the required tile sizes. + for (int c = tile_cols_start; c < tile_cols_end; ++c) { + const int is_last = c == tile_cols - 1; + + if (c > 0) data = tile_col_data_end[c - 1]; + + if (!is_last) data += tile_col_size_bytes; + + // Get the whole of the last column, otherwise stop at the required tile. + for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) { + get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data, + tile_buffers, tile_size_bytes, c, r, tile_copy_mode); + } + } + + // If we have not read the last column, then read it to get the last tile. + if (tile_cols_end != tile_cols) { + const int c = tile_cols - 1; + + data = tile_col_data_end[c - 1]; + + for (int r = 0; r < tile_rows; ++r) { + get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data, + tile_buffers, tile_size_bytes, c, r, tile_copy_mode); + } + } + raw_data_end = data; + } + return raw_data_end; +} +#endif // EXT_TILE_DEBUG + +static const uint8_t *get_ls_single_tile_buffer( + AV1Decoder *pbi, const uint8_t *data, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { + assert(pbi->dec_tile_row >= 0 && pbi->dec_tile_col >= 0); + tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data = data; + tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size = + (size_t)pbi->coded_tile_data_size; + return data + pbi->coded_tile_data_size; +} + +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. 
+static AOM_INLINE void get_tile_buffer( + const uint8_t *const data_end, const int tile_size_bytes, int is_last, + struct aom_internal_error_info *error_info, const uint8_t **data, + TileBufferDec *const buf) { + size_t size; + + if (!is_last) { + if (!read_is_valid(*data, tile_size_bytes, data_end)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Not enough data to read tile size"); + + size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES; + *data += tile_size_bytes; + + if (size > (size_t)(data_end - *data)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile size"); + } else { + size = data_end - *data; + } + + buf->data = *data; + buf->size = size; + + *data += size; +} + +static AOM_INLINE void get_tile_buffers( + AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tc = 0; + + for (int r = 0; r < tile_rows; ++r) { + for (int c = 0; c < tile_cols; ++c, ++tc) { + TileBufferDec *const buf = &tile_buffers[r][c]; + + const int is_last = (tc == end_tile); + const size_t hdr_offset = 0; + + if (tc < start_tile || tc > end_tile) continue; + + if (data + hdr_offset >= data_end) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Data ended before all tiles were read."); + data += hdr_offset; + get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &pbi->error, + &data, buf); + } + } +} + +static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb, + CB_BUFFER *cb_buffer_base, + const int num_planes, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &pbi->common; + int mib_size_log2 = cm->seq_params->mib_size_log2; + int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; + int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); + CB_BUFFER *cb_buffer = cb_buffer_base + offset; + + for (int plane = 0; plane < num_planes; ++plane) { + dcb->dqcoeff_block[plane] = cb_buffer->dqcoeff[plane]; + dcb->eob_data[plane] = cb_buffer->eob_data[plane]; + dcb->cb_offset[plane] = 0; + dcb->txb_offset[plane] = 0; + } + MACROBLOCKD *const xd = &dcb->xd; + xd->plane[0].color_index_map = cb_buffer->color_index_map[0]; + xd->plane[1].color_index_map = cb_buffer->color_index_map[1]; + xd->color_index_map_offset[0] = 0; + xd->color_index_map_offset[1] = 0; +} + +static AOM_INLINE void decoder_alloc_tile_data(AV1Decoder *pbi, + const int n_tiles) { + AV1_COMMON *const cm = &pbi->common; + aom_free(pbi->tile_data); + pbi->allocated_tiles = 0; + CHECK_MEM_ERROR(cm, pbi->tile_data, + aom_memalign(32, n_tiles * sizeof(*pbi->tile_data))); + pbi->allocated_tiles = n_tiles; + for (int i = 0; i < n_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_zero(tile_data->dec_row_mt_sync); + } + pbi->allocated_row_mt_sync_rows = 0; +} + +// Set up nsync by width. +static INLINE int get_sync_range(int width) { +// nsync numbers are picked by testing. 
+#if 0 + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +#else + (void)width; +#endif + return 1; +} + +// Allocate memory for decoder row synchronization +static AOM_INLINE void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, + AV1_COMMON *cm, int rows) { + dec_row_mt_sync->allocated_sb_rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_, + aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows)); + if (dec_row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_, + aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows)); + if (dec_row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col, + aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows)); + + // Set up nsync. + dec_row_mt_sync->sync_range = get_sync_range(cm->width); +} + +// Deallocate decoder row synchronization related mutex and data +void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) { + if (dec_row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + if (dec_row_mt_sync->mutex_ != NULL) { + for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { + pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]); + } + aom_free(dec_row_mt_sync->mutex_); + } + if (dec_row_mt_sync->cond_ != NULL) { + for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { + pthread_cond_destroy(&dec_row_mt_sync->cond_[i]); + } + aom_free(dec_row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + aom_free(dec_row_mt_sync->cur_sb_col); + + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. 
+ av1_zero(*dec_row_mt_sync); + } +} + +static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r, + int c) { +#if CONFIG_MULTITHREAD + const int nsync = dec_row_mt_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync - + dec_row_mt_sync->intrabc_extra_top_right_sb_delay) { + pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)dec_row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r, + int c, const int sb_cols) { +#if CONFIG_MULTITHREAD + const int nsync = dec_row_mt_sync->sync_range; + int cur; + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync + dec_row_mt_sync->intrabc_extra_top_right_sb_delay; + } + + if (sig) { + pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]); + + dec_row_mt_sync->cur_sb_col[r] = cur; + + pthread_cond_signal(&dec_row_mt_sync->cond_[r]); + pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]); + } +#else + (void)dec_row_mt_sync; + (void)r; + (void)c; + (void)sb_cols; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void signal_decoding_done_for_erroneous_row( + AV1Decoder *const pbi, const MACROBLOCKD *const xd) { + AV1_COMMON *const cm = &pbi->common; + const TileInfo *const tile = &xd->tile; + const int sb_row_in_tile = + ((xd->mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2); + const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile); + TileDataDec *const tile_data = + pbi->tile_data + tile->tile_row * cm->tiles.cols + tile->tile_col; + AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + sync_write(dec_row_mt_sync, sb_row_in_tile, sb_cols_in_tile - 1, + sb_cols_in_tile); +} + +static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, + const TileInfo *tile_info, + const int mi_row) { + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + TileDataDec *const tile_data = pbi->tile_data + + tile_info->tile_row * cm->tiles.cols + + tile_info->tile_col; + const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); + const int sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; + int sb_col_in_tile = 0; + int row_mt_exit = 0; + + for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; + mi_col += cm->seq_params->mib_size, sb_col_in_tile++) { + set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row, + mi_col); + + sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + row_mt_exit = pbi->frame_row_mt_info.row_mt_exit; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + + if (!row_mt_exit) { + // Decoding of the super-block + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params->sb_size, 0x2); + } + + sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile, + sb_cols_in_tile); + } +} + +static int check_trailing_bits_after_symbol_coder(aom_reader *r) { + if (aom_reader_has_overflowed(r)) return -1; + + uint32_t nb_bits = aom_reader_tell(r); + uint32_t nb_bytes = (nb_bits + 7) >> 3; + const uint8_t *p = aom_reader_find_begin(r) + nb_bytes; + + // aom_reader_tell() returns 1 for a 
newly initialized decoder, and the + // return value only increases as values are decoded. So nb_bits > 0, and + // thus p > p_begin. Therefore accessing p[-1] is safe. + uint8_t last_byte = p[-1]; + uint8_t pattern = 128 >> ((nb_bits - 1) & 7); + if ((last_byte & (2 * pattern - 1)) != pattern) return -1; + + // Make sure that all padding bytes are zero as required by the spec. + const uint8_t *p_end = aom_reader_find_end(r); + while (p < p_end) { + if (*p != 0) return -1; + p++; + } + return 0; +} + +static AOM_INLINE void set_decode_func_pointers(ThreadData *td, + int parse_decode_flag) { + td->read_coeffs_tx_intra_block_visit = decode_block_void; + td->predict_and_recon_intra_block_visit = decode_block_void; + td->read_coeffs_tx_inter_block_visit = decode_block_void; + td->inverse_tx_inter_block_visit = decode_block_void; + td->predict_inter_block_visit = predict_inter_block_void; + td->cfl_store_inter_block_visit = cfl_store_inter_block_void; + + if (parse_decode_flag & 0x1) { + td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block; + td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb_facade; + } + if (parse_decode_flag & 0x2) { + td->predict_and_recon_intra_block_visit = + predict_and_reconstruct_intra_block; + td->inverse_tx_inter_block_visit = inverse_transform_inter_block; + td->predict_inter_block_visit = predict_inter_block; + td->cfl_store_inter_block_visit = cfl_store_inter_block; + } +} + +static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td, + int tile_row, int tile_col) { + TileInfo tile_info; + + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + + av1_tile_set_row(&tile_info, cm, tile_row); + av1_tile_set_col(&tile_info, cm, tile_col); + DecoderCodingBlock *const dcb = &td->dcb; + MACROBLOCKD *const xd = &dcb->xd; + + av1_zero_above_context(cm, xd, tile_info.mi_col_start, tile_info.mi_col_end, + tile_row); + av1_reset_loop_filter_delta(xd, num_planes); + av1_reset_loop_restoration(xd, num_planes); + + for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->seq_params->mib_size) { + av1_zero_left_context(xd); + + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params->mib_size) { + set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0); + + // Bit-stream parsing and decoding of the superblock + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params->sb_size, 0x3); + + if (aom_reader_has_overflowed(td->bit_reader)) { + aom_merge_corrupted_flag(&dcb->corrupted, 1); + return; + } + } + } + + int corrupted = + (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 
1 : 0; + aom_merge_corrupted_flag(&dcb->corrupted, corrupted); +} + +static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + ThreadData *const td = &pbi->td; + CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int inv_col_order; + int inv_row_order; + int tile_row, tile_col; + uint8_t allow_update_cdf; + const uint8_t *raw_data_end = NULL; + + if (tiles->large_scale) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + inv_col_order = pbi->inv_tile_order && !single_col; + inv_row_order = pbi->inv_tile_order && !single_row; + allow_update_cdf = 0; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; + inv_col_order = pbi->inv_tile_order; + inv_row_order = pbi->inv_tile_order; + allow_update_cdf = 1; + } + + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tiles->cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. + (tile_rows_end - 1) * tiles->cols + tile_cols_end - 1 < start_tile) + return data; + + allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + +#if EXT_TILE_DEBUG + if (tiles->large_scale && !pbi->ext_tile_debug) + raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers); + else if (tiles->large_scale && pbi->ext_tile_debug) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + decoder_alloc_tile_data(pbi, n_tiles); + } + if (pbi->dcb.xd.seg_mask == NULL) + CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + aom_accounting_reset(&pbi->accounting); + } +#endif + + set_decode_func_pointers(&pbi->td, 0x3); + + // Load all tile information into thread_data. + td->dcb = pbi->dcb; + + td->dcb.corrupted = 0; + td->dcb.mc_buf[0] = td->mc_buf[0]; + td->dcb.mc_buf[1] = td->mc_buf[1]; + td->dcb.xd.tmp_conv_dst = td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + td->dcb.xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j]; + } + + for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { + const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row; + + for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { + const int col = inv_col_order ? 
tile_cols - 1 - tile_col : tile_col; + TileDataDec *const tile_data = pbi->tile_data + row * tiles->cols + col; + const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col]; + + if (row * tiles->cols + col < start_tile || + row * tiles->cols + col > end_tile) + continue; + + td->bit_reader = &tile_data->bit_reader; + av1_zero(td->cb_buffer_base.dqcoeff); + av1_tile_init(&td->dcb.xd.tile, cm, row, col); + td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex; + setup_bool_decoder(&td->dcb.xd, tile_bs_buf->data, data_end, + tile_bs_buf->size, &pbi->error, td->bit_reader, + allow_update_cdf); +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + td->bit_reader->accounting = &pbi->accounting; + td->bit_reader->accounting->last_tell_frac = + aom_reader_tell_frac(td->bit_reader); + } else { + td->bit_reader->accounting = NULL; + } +#endif + av1_init_macroblockd(cm, &td->dcb.xd); + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row, + &td->dcb.xd); + + // Initialise the tile context from the frame context + tile_data->tctx = *cm->fc; + td->dcb.xd.tile_ctx = &tile_data->tctx; + + // decode tile + decode_tile(pbi, td, row, col); + aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted); + if (pbi->dcb.corrupted) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + } + } + + if (tiles->large_scale) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) { + TileJobsDec *cur_job_info = NULL; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(tile_mt_info->job_mutex); + + if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) { + cur_job_info = tile_mt_info->job_queue + tile_mt_info->jobs_dequeued; + tile_mt_info->jobs_dequeued++; + } + + pthread_mutex_unlock(tile_mt_info->job_mutex); +#else + (void)tile_mt_info; +#endif + return cur_job_info; +} + +static AOM_INLINE void tile_worker_hook_init( + AV1Decoder *const pbi, DecWorkerData *const thread_data, + const TileBufferDec *const tile_buffer, TileDataDec *const tile_data, + uint8_t allow_update_cdf) { + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + int tile_row = tile_data->tile_info.tile_row; + int tile_col = tile_data->tile_info.tile_col; + + td->bit_reader = &tile_data->bit_reader; + av1_zero(td->cb_buffer_base.dqcoeff); + + MACROBLOCKD *const xd = &td->dcb.xd; + av1_tile_init(&xd->tile, cm, tile_row, tile_col); + xd->current_base_qindex = cm->quant_params.base_qindex; + + setup_bool_decoder(xd, tile_buffer->data, thread_data->data_end, + tile_buffer->size, &thread_data->error_info, + td->bit_reader, allow_update_cdf); +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + td->bit_reader->accounting = &pbi->accounting; + td->bit_reader->accounting->last_tell_frac = + aom_reader_tell_frac(td->bit_reader); + } else { + td->bit_reader->accounting = NULL; + } +#endif + av1_init_macroblockd(cm, xd); + xd->error_info = &thread_data->error_info; + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, xd); + + // Initialise the tile context from the frame context + tile_data->tctx = *cm->fc; + xd->tile_ctx = &tile_data->tctx; +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + 
tile_data->bit_reader.accounting->last_tell_frac =
+        aom_reader_tell_frac(&tile_data->bit_reader);
+  }
+#endif
+}
+
+static int tile_worker_hook(void *arg1, void *arg2) {
+  DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+  AV1Decoder *const pbi = (AV1Decoder *)arg2;
+  AV1_COMMON *cm = &pbi->common;
+  ThreadData *const td = thread_data->td;
+  uint8_t allow_update_cdf;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(thread_data->error_info.jmp)) {
+    thread_data->error_info.setjmp = 0;
+    thread_data->td->dcb.corrupted = 1;
+    return 0;
+  }
+  thread_data->error_info.setjmp = 1;
+
+  allow_update_cdf = cm->tiles.large_scale ? 0 : 1;
+  allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update;
+
+  set_decode_func_pointers(td, 0x3);
+
+  assert(cm->tiles.cols > 0);
+  while (!td->dcb.corrupted) {
+    TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
+
+    if (cur_job_info != NULL) {
+      const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
+      TileDataDec *const tile_data = cur_job_info->tile_data;
+      tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+                            allow_update_cdf);
+      // decode tile
+      int tile_row = tile_data->tile_info.tile_row;
+      int tile_col = tile_data->tile_info.tile_col;
+      decode_tile(pbi, td, tile_row, tile_col);
+    } else {
+      break;
+    }
+  }
+  thread_data->error_info.setjmp = 0;
+  return !td->dcb.corrupted;
+}
+
+static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm,
+                                                  const TileInfo *tile) {
+  // NOTE: Currently, the value of max workers is calculated based on the
+  // parse and decode time. As per the theoretical estimate, when the
+  // percentage of parse time equals the percentage of decode time, the
+  // number of workers needed to parse + decode a tile cannot exceed 2.
+  // TODO(any): Modify this value if parsing is optimized in the future.
+  int sb_rows = av1_get_sb_rows_in_tile(cm, tile);
+  int max_workers =
+      sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE;
+  return max_workers;
+}
+
+// The caller must hold pbi->row_mt_mutex_ when calling this function.
+// Returns 1 if either the next job is stored in *next_job_info or 1 is stored
+// in *end_of_frame.
+// NOTE: The caller waits on pbi->row_mt_cond_ if this function returns 0.
+// The return value of this function depends on the following variables:
+// - frame_row_mt_info->mi_rows_parse_done
+// - frame_row_mt_info->mi_rows_decode_started
+// - frame_row_mt_info->row_mt_exit
+// Therefore we may need to signal or broadcast pbi->row_mt_cond_ if any of
+// these variables is modified.
+static int get_next_job_info(AV1Decoder *const pbi, + AV1DecRowMTJobInfo *next_job_info, + int *end_of_frame) { + AV1_COMMON *cm = &pbi->common; + TileDataDec *tile_data; + AV1DecRowMTSync *dec_row_mt_sync; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + const int tile_rows_start = frame_row_mt_info->tile_rows_start; + const int tile_rows_end = frame_row_mt_info->tile_rows_end; + const int tile_cols_start = frame_row_mt_info->tile_cols_start; + const int tile_cols_end = frame_row_mt_info->tile_cols_end; + const int start_tile = frame_row_mt_info->start_tile; + const int end_tile = frame_row_mt_info->end_tile; + const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; + int num_mis_to_decode, num_threads_working; + int num_mis_waiting_for_decode; + int min_threads_working = INT_MAX; + int max_mis_to_decode = 0; + int tile_row_idx, tile_col_idx; + int tile_row = -1; + int tile_col = -1; + + memset(next_job_info, 0, sizeof(*next_job_info)); + + // Frame decode is completed or error is encountered. + *end_of_frame = (frame_row_mt_info->mi_rows_decode_started == + frame_row_mt_info->mi_rows_to_decode) || + (frame_row_mt_info->row_mt_exit == 1); + if (*end_of_frame) { + return 1; + } + + // Decoding cannot start as bit-stream parsing is not complete. + assert(frame_row_mt_info->mi_rows_parse_done >= + frame_row_mt_info->mi_rows_decode_started); + if (frame_row_mt_info->mi_rows_parse_done == + frame_row_mt_info->mi_rows_decode_started) + return 0; + + // Choose the tile to decode. + for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end; + ++tile_row_idx) { + for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end; + ++tile_col_idx) { + if (tile_row_idx * cm->tiles.cols + tile_col_idx < start_tile || + tile_row_idx * cm->tiles.cols + tile_col_idx > end_tile) + continue; + + tile_data = pbi->tile_data + tile_row_idx * cm->tiles.cols + tile_col_idx; + dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + num_threads_working = dec_row_mt_sync->num_threads_working; + num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done - + dec_row_mt_sync->mi_rows_decode_started) * + dec_row_mt_sync->mi_cols; + num_mis_to_decode = + (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) * + dec_row_mt_sync->mi_cols; + + assert(num_mis_to_decode >= num_mis_waiting_for_decode); + + // Pick the tile which has minimum number of threads working on it. 
+ if (num_mis_waiting_for_decode > 0) { + if (num_threads_working < min_threads_working) { + min_threads_working = num_threads_working; + max_mis_to_decode = 0; + } + if (num_threads_working == min_threads_working && + num_mis_to_decode > max_mis_to_decode && + num_threads_working < + get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info)) { + max_mis_to_decode = num_mis_to_decode; + tile_row = tile_row_idx; + tile_col = tile_col_idx; + } + } + } + } + // No job found to process + if (tile_row == -1 || tile_col == -1) return 0; + + tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col; + dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + next_job_info->tile_row = tile_row; + next_job_info->tile_col = tile_col; + next_job_info->mi_row = dec_row_mt_sync->mi_rows_decode_started + + tile_data->tile_info.mi_row_start; + + dec_row_mt_sync->num_threads_working++; + dec_row_mt_sync->mi_rows_decode_started += sb_mi_size; + frame_row_mt_info->mi_rows_decode_started += sb_mi_size; + assert(frame_row_mt_info->mi_rows_parse_done >= + frame_row_mt_info->mi_rows_decode_started); +#if CONFIG_MULTITHREAD + if (frame_row_mt_info->mi_rows_decode_started == + frame_row_mt_info->mi_rows_to_decode) { + pthread_cond_broadcast(pbi->row_mt_cond_); + } +#endif + + return 1; +} + +static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi, + TileDataDec *const tile_data, + const int sb_mi_size) { + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + assert(frame_row_mt_info->mi_rows_parse_done >= + frame_row_mt_info->mi_rows_decode_started); + tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size; + frame_row_mt_info->mi_rows_parse_done += sb_mi_size; +#if CONFIG_MULTITHREAD + // A new decode job is available. Wake up one worker thread to handle the + // new decode job. + // NOTE: This assumes we bump mi_rows_parse_done and mi_rows_decode_started + // by the same increment (sb_mi_size). + pthread_cond_signal(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif +} + +// This function is very similar to decode_tile(). It would be good to figure +// out how to share code. 
+static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td,
+                                         TileDataDec *const tile_data) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
+  const int num_planes = av1_num_planes(cm);
+  const TileInfo *const tile_info = &tile_data->tile_info;
+  int tile_row = tile_info->tile_row;
+  DecoderCodingBlock *const dcb = &td->dcb;
+  MACROBLOCKD *const xd = &dcb->xd;
+
+  av1_zero_above_context(cm, xd, tile_info->mi_col_start, tile_info->mi_col_end,
+                         tile_row);
+  av1_reset_loop_filter_delta(xd, num_planes);
+  av1_reset_loop_restoration(xd, num_planes);
+
+  for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+       mi_row += cm->seq_params->mib_size) {
+    av1_zero_left_context(xd);
+
+    for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+         mi_col += cm->seq_params->mib_size) {
+      set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col);
+
+      // Bit-stream parsing of the superblock
+      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+                       cm->seq_params->sb_size, 0x1);
+
+      if (aom_reader_has_overflowed(td->bit_reader)) {
+        aom_merge_corrupted_flag(&dcb->corrupted, 1);
+        return;
+      }
+    }
+    signal_parse_sb_row_done(pbi, tile_data, sb_mi_size);
+  }
+
+  int corrupted =
+      (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
+  aom_merge_corrupted_flag(&dcb->corrupted, corrupted);
+}
+
+static int row_mt_worker_hook(void *arg1, void *arg2) {
+  DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+  AV1Decoder *const pbi = (AV1Decoder *)arg2;
+  ThreadData *const td = thread_data->td;
+  uint8_t allow_update_cdf;
+  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+  td->dcb.corrupted = 0;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(thread_data->error_info.jmp)) {
+    thread_data->error_info.setjmp = 0;
+    thread_data->td->dcb.corrupted = 1;
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+    frame_row_mt_info->row_mt_exit = 1;
+#if CONFIG_MULTITHREAD
+    pthread_cond_broadcast(pbi->row_mt_cond_);
+    pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+    // If any SB row (the erroneous row) processed by a thread encounters an
+    // internal error, we need to indicate to the other threads that decoding
+    // of the erroneous row is complete. This ensures that other threads that
+    // wait upon the completion of SBs present in the erroneous row are not
+    // waiting indefinitely.
+    signal_decoding_done_for_erroneous_row(pbi, &thread_data->td->dcb.xd);
+    return 0;
+  }
+  thread_data->error_info.setjmp = 1;
+
+  AV1_COMMON *cm = &pbi->common;
+  allow_update_cdf = cm->tiles.large_scale ?
0 : 1; + allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; + + set_decode_func_pointers(td, 0x1); + + assert(cm->tiles.cols > 0); + while (!td->dcb.corrupted) { + TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); + + if (cur_job_info != NULL) { + const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; + TileDataDec *const tile_data = cur_job_info->tile_data; + tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, + allow_update_cdf); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + tile_data->dec_row_mt_sync.num_threads_working++; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + // decode tile + parse_tile_row_mt(pbi, td, tile_data); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + tile_data->dec_row_mt_sync.num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + } else { + break; + } + } + + if (td->dcb.corrupted) { + thread_data->error_info.setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + frame_row_mt_info->row_mt_exit = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + return 0; + } + + set_decode_func_pointers(td, 0x2); + + while (1) { + AV1DecRowMTJobInfo next_job_info; + int end_of_frame = 0; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) { +#if CONFIG_MULTITHREAD + pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_); +#endif + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + + if (end_of_frame) break; + + int tile_row = next_job_info.tile_row; + int tile_col = next_job_info.tile_col; + int mi_row = next_job_info.mi_row; + + TileDataDec *tile_data = + pbi->tile_data + tile_row * cm->tiles.cols + tile_col; + AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + av1_tile_init(&td->dcb.xd.tile, cm, tile_row, tile_col); + av1_init_macroblockd(cm, &td->dcb.xd); + td->dcb.xd.error_info = &thread_data->error_info; + + decode_tile_sb_row(pbi, td, &tile_data->tile_info, mi_row); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + dec_row_mt_sync->num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + } + thread_data->error_info.setjmp = 0; + return !td->dcb.corrupted; +} + +// sorts in descending order +static int compare_tile_buffers(const void *a, const void *b) { + const TileJobsDec *const buf1 = (const TileJobsDec *)a; + const TileJobsDec *const buf2 = (const TileJobsDec *)b; + return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size)); +} + +static AOM_INLINE void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm, + int tile_rows_start, int tile_rows_end, + int tile_cols_start, int tile_cols_end, + int start_tile, int end_tile) { + AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info; + TileJobsDec *tile_job_queue = tile_mt_info->job_queue; + tile_mt_info->jobs_enqueued = 0; + tile_mt_info->jobs_dequeued = 0; + + for (int row = tile_rows_start; row < tile_rows_end; row++) { + for (int col = tile_cols_start; col < tile_cols_end; col++) { + if (row * cm->tiles.cols + col < start_tile || + row * cm->tiles.cols + col > end_tile) + continue; + tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col]; + tile_job_queue->tile_data = pbi->tile_data + row * cm->tiles.cols + 
col;
+      tile_job_queue++;
+      tile_mt_info->jobs_enqueued++;
+    }
+  }
+}
+
+static AOM_INLINE void alloc_dec_jobs(AV1DecTileMT *tile_mt_info,
+                                      AV1_COMMON *cm, int tile_rows,
+                                      int tile_cols) {
+  tile_mt_info->alloc_tile_rows = tile_rows;
+  tile_mt_info->alloc_tile_cols = tile_cols;
+  int num_tiles = tile_rows * tile_cols;
+#if CONFIG_MULTITHREAD
+  {
+    CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex,
+                    aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles));
+
+    for (int i = 0; i < num_tiles; i++) {
+      pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL);
+    }
+  }
+#endif
+  CHECK_MEM_ERROR(cm, tile_mt_info->job_queue,
+                  aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
+}
+
+void av1_free_mc_tmp_buf(ThreadData *thread_data) {
+  int ref;
+  for (ref = 0; ref < 2; ref++) {
+    if (thread_data->mc_buf_use_highbd)
+      aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
+    else
+      aom_free(thread_data->mc_buf[ref]);
+    thread_data->mc_buf[ref] = NULL;
+  }
+  thread_data->mc_buf_size = 0;
+  thread_data->mc_buf_use_highbd = 0;
+
+  aom_free(thread_data->tmp_conv_dst);
+  thread_data->tmp_conv_dst = NULL;
+  aom_free(thread_data->seg_mask);
+  thread_data->seg_mask = NULL;
+  for (int i = 0; i < 2; ++i) {
+    aom_free(thread_data->tmp_obmc_bufs[i]);
+    thread_data->tmp_obmc_bufs[i] = NULL;
+  }
+}
+
+static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm,
+                                           ThreadData *thread_data,
+                                           int buf_size, int use_highbd) {
+  for (int ref = 0; ref < 2; ref++) {
+    // The mc_buf/hbd_mc_buf must be zeroed to fix an intermittent valgrind
+    // error 'Conditional jump or move depends on uninitialised value' from the
+    // loop filter. Uninitialized reads in the convolve function (e.g. the
+    // horiz_4tap path in av1_convolve_2d_sr_avx2()) from mc_buf/hbd_mc_buf are
+    // seen to be the potential reason for this issue.
+ if (use_highbd) { + uint16_t *hbd_mc_buf; + CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size)); + memset(hbd_mc_buf, 0, buf_size); + thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf); + } else { + CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref], + (uint8_t *)aom_memalign(16, buf_size)); + memset(thread_data->mc_buf[ref], 0, buf_size); + } + } + thread_data->mc_buf_size = buf_size; + thread_data->mc_buf_use_highbd = use_highbd; + + CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->tmp_conv_dst))); + CHECK_MEM_ERROR(cm, thread_data->seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*thread_data->seg_mask))); + + for (int i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, thread_data->tmp_obmc_bufs[i], + aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->tmp_obmc_bufs[i]))); + } +} + +static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi, + AVxWorkerHook worker_hook, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + // Reset tile decoding hook + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + thread_data->td->dcb = pbi->dcb; + thread_data->td->dcb.corrupted = 0; + thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0]; + thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1]; + thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst; + if (worker_idx) + thread_data->td->dcb.xd.seg_mask = thread_data->td->seg_mask; + for (int j = 0; j < 2; ++j) { + thread_data->td->dcb.xd.tmp_obmc_bufs[j] = + thread_data->td->tmp_obmc_bufs[j]; + } + winterface->sync(worker); + + worker->hook = worker_hook; + worker->data1 = thread_data; + worker->data2 = pbi; + } +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + aom_accounting_reset(&pbi->accounting); + } +#endif +} + +static AOM_INLINE void launch_dec_workers(AV1Decoder *pbi, + const uint8_t *data_end, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + for (int worker_idx = num_workers - 1; worker_idx >= 0; --worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = (DecWorkerData *)worker->data1; + + thread_data->data_end = data_end; + + worker->had_error = 0; + if (worker_idx == 0) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } +} + +static AOM_INLINE void sync_dec_workers(AV1Decoder *pbi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int corrupted = 0; + + for (int worker_idx = num_workers; worker_idx > 0; --worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1]; + aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker)); + } + + pbi->dcb.corrupted = corrupted; +} + +static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int worker_idx; + + // Create workers and thread_data + if (pbi->num_workers == 0) { + const int num_threads = pbi->max_threads; + CHECK_MEM_ERROR(cm, pbi->tile_workers, + aom_malloc(num_threads * sizeof(*pbi->tile_workers))); + CHECK_MEM_ERROR(cm, pbi->thread_data, + aom_calloc(num_threads, sizeof(*pbi->thread_data))); + + for (worker_idx = 0; worker_idx < 
num_threads; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + + winterface->init(worker); + worker->thread_name = "aom tile worker"; + if (worker_idx != 0 && !winterface->reset(worker)) { + aom_internal_error(&pbi->error, AOM_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + ++pbi->num_workers; + + if (worker_idx != 0) { + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + } else { + // Main thread acts as a worker and uses the thread data in pbi + thread_data->td = &pbi->td; + } + thread_data->error_info.error_code = AOM_CODEC_OK; + thread_data->error_info.setjmp = 0; + } + } + const int use_highbd = cm->seq_params->use_highbitdepth; + const int buf_size = MC_TEMP_BUF_PELS << use_highbd; + for (worker_idx = 1; worker_idx < pbi->max_threads; ++worker_idx) { + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + if (thread_data->td->mc_buf_size != buf_size) { + av1_free_mc_tmp_buf(thread_data->td); + allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd); + } + } +} + +static AOM_INLINE void tile_mt_queue(AV1Decoder *pbi, int tile_cols, + int tile_rows, int tile_rows_start, + int tile_rows_end, int tile_cols_start, + int tile_cols_end, int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + if (pbi->tile_mt_info.alloc_tile_cols != tile_cols || + pbi->tile_mt_info.alloc_tile_rows != tile_rows) { + av1_dealloc_dec_jobs(&pbi->tile_mt_info); + alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols); + } + enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start, + tile_cols_end, start_tile, end_tile); + qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued, + sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers); +} + +static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int tile_count_tg; + int num_workers; + const uint8_t *raw_data_end = NULL; + + if (tiles->large_scale) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; + } + tile_count_tg = end_tile - start_tile + 1; + num_workers = AOMMIN(pbi->max_threads, tile_count_tg); + + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tile_cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. 
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) + return data; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + assert(tile_count_tg > 0); + assert(num_workers > 0); + assert(start_tile <= end_tile); + assert(start_tile >= 0 && end_tile < n_tiles); + + decode_mt_init(pbi); + + // get tile size in tile group +#if EXT_TILE_DEBUG + if (tiles->large_scale) assert(pbi->ext_tile_debug == 1); + if (tiles->large_scale) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + decoder_alloc_tile_data(pbi, n_tiles); + } + if (pbi->dcb.xd.seg_mask == NULL) + CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); + + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col; + av1_tile_init(&tile_data->tile_info, cm, row, col); + } + } + + tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, + tile_cols_start, tile_cols_end, start_tile, end_tile); + + reset_dec_workers(pbi, tile_worker_hook, num_workers); + launch_dec_workers(pbi, data_end, num_workers); + sync_dec_workers(pbi, num_workers); + + if (pbi->dcb.corrupted) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (tiles->large_scale) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) * + ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1); + + if (pbi->cb_buffer_alloc_size < size) { + av1_dec_free_cb_buf(pbi); + CHECK_MEM_ERROR(cm, pbi->cb_buffer_base, + aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size)); + memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size); + pbi->cb_buffer_alloc_size = size; + } +} + +static AOM_INLINE void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start, + int tile_rows_end, int tile_cols_start, + int tile_cols_end, int start_tile, + int end_tile, int max_sb_rows) { + AV1_COMMON *const cm = &pbi->common; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + + frame_row_mt_info->tile_rows_start = tile_rows_start; + frame_row_mt_info->tile_rows_end = tile_rows_end; + frame_row_mt_info->tile_cols_start = tile_cols_start; + frame_row_mt_info->tile_cols_end = tile_cols_end; + frame_row_mt_info->start_tile = start_tile; + frame_row_mt_info->end_tile = end_tile; + frame_row_mt_info->mi_rows_to_decode = 0; + frame_row_mt_info->mi_rows_parse_done = 0; + frame_row_mt_info->mi_rows_decode_started = 0; + frame_row_mt_info->row_mt_exit = 0; + + for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { + for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { + if (tile_row * cm->tiles.cols + tile_col < start_tile || + tile_row * cm->tiles.cols + tile_col > end_tile) + continue; + + TileDataDec *const tile_data = + pbi->tile_data + 
tile_row * cm->tiles.cols + tile_col; + const TileInfo *const tile_info = &tile_data->tile_info; + + tile_data->dec_row_mt_sync.mi_rows_parse_done = 0; + tile_data->dec_row_mt_sync.mi_rows_decode_started = 0; + tile_data->dec_row_mt_sync.num_threads_working = 0; + tile_data->dec_row_mt_sync.mi_rows = + ALIGN_POWER_OF_TWO(tile_info->mi_row_end - tile_info->mi_row_start, + cm->seq_params->mib_size_log2); + tile_data->dec_row_mt_sync.mi_cols = + ALIGN_POWER_OF_TWO(tile_info->mi_col_end - tile_info->mi_col_start, + cm->seq_params->mib_size_log2); + tile_data->dec_row_mt_sync.intrabc_extra_top_right_sb_delay = + av1_get_intrabc_extra_top_right_sb_delay(cm); + + frame_row_mt_info->mi_rows_to_decode += + tile_data->dec_row_mt_sync.mi_rows; + + // Initialize cur_sb_col to -1 for all SB rows. + memset(tile_data->dec_row_mt_sync.cur_sb_col, -1, + sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows); + } + } + +#if CONFIG_MULTITHREAD + if (pbi->row_mt_mutex_ == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_, + aom_malloc(sizeof(*(pbi->row_mt_mutex_)))); + if (pbi->row_mt_mutex_) { + pthread_mutex_init(pbi->row_mt_mutex_, NULL); + } + } + + if (pbi->row_mt_cond_ == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_cond_, + aom_malloc(sizeof(*(pbi->row_mt_cond_)))); + if (pbi->row_mt_cond_) { + pthread_cond_init(pbi->row_mt_cond_, NULL); + } + } +#endif +} + +static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + int start_tile, int end_tile) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int tile_count_tg; + int num_workers = 0; + int max_threads; + const uint8_t *raw_data_end = NULL; + int max_sb_rows = 0; + + if (tiles->large_scale) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; + } + tile_count_tg = end_tile - start_tile + 1; + max_threads = pbi->max_threads; + + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tile_cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. 
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) + return data; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + assert(tile_count_tg > 0); + assert(max_threads > 0); + assert(start_tile <= end_tile); + assert(start_tile >= 0 && end_tile < n_tiles); + + (void)tile_count_tg; + + decode_mt_init(pbi); + + // get tile size in tile group +#if EXT_TILE_DEBUG + if (tiles->large_scale) assert(pbi->ext_tile_debug == 1); + if (tiles->large_scale) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + if (pbi->tile_data != NULL) { + for (int i = 0; i < pbi->allocated_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + } + } + decoder_alloc_tile_data(pbi, n_tiles); + } + if (pbi->dcb.xd.seg_mask == NULL) + CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); + + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col; + av1_tile_init(&tile_data->tile_info, cm, row, col); + + max_sb_rows = AOMMAX(max_sb_rows, + av1_get_sb_rows_in_tile(cm, &tile_data->tile_info)); + num_workers += get_max_row_mt_workers_per_tile(cm, &tile_data->tile_info); + } + } + num_workers = AOMMIN(num_workers, max_threads); + + if (pbi->allocated_row_mt_sync_rows != max_sb_rows) { + for (int i = 0; i < n_tiles; ++i) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows); + } + pbi->allocated_row_mt_sync_rows = max_sb_rows; + } + + tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, + tile_cols_start, tile_cols_end, start_tile, end_tile); + + dec_alloc_cb_buf(pbi); + + row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start, + tile_cols_end, start_tile, end_tile, max_sb_rows); + + reset_dec_workers(pbi, row_mt_worker_hook, num_workers); + launch_dec_workers(pbi, data_end, num_workers); + sync_dec_workers(pbi, num_workers); + + if (pbi->dcb.corrupted) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (tiles->large_scale) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static AOM_INLINE void error_handler(void *data) { + AV1_COMMON *const cm = (AV1_COMMON *)data; + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); +} + +// Reads the high_bitdepth and twelve_bit fields in color_config() and sets +// seq_params->bit_depth based on the values of those fields and +// seq_params->profile. Reports errors by calling rb->error_handler() or +// aom_internal_error(). 
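+// The resulting mapping (see the body below) is: high_bitdepth == 0 selects
+// AOM_BITS_8 for any supported profile; high_bitdepth == 1 selects
+// AOM_BITS_10 for PROFILE_0 and PROFILE_1, while for PROFILE_2 a further
+// twelve_bit flag chooses between AOM_BITS_10 and AOM_BITS_12.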
+static AOM_INLINE void read_bitdepth( + struct aom_read_bit_buffer *rb, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info) { + const int high_bitdepth = aom_rb_read_bit(rb); + if (seq_params->profile == PROFILE_2 && high_bitdepth) { + const int twelve_bit = aom_rb_read_bit(rb); + seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10; + } else if (seq_params->profile <= PROFILE_2) { + seq_params->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8; + } else { + aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Unsupported profile/bit-depth combination"); + } +#if !CONFIG_AV1_HIGHBITDEPTH + if (seq_params->bit_depth > AOM_BITS_8) { + aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Bit-depth %d not supported", seq_params->bit_depth); + } +#endif +} + +void av1_read_film_grain_params(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + aom_film_grain_t *pars = &cm->film_grain_params; + const SequenceHeader *const seq_params = cm->seq_params; + + pars->apply_grain = aom_rb_read_bit(rb); + if (!pars->apply_grain) { + memset(pars, 0, sizeof(*pars)); + return; + } + + pars->random_seed = aom_rb_read_literal(rb, 16); + if (cm->current_frame.frame_type == INTER_FRAME) + pars->update_parameters = aom_rb_read_bit(rb); + else + pars->update_parameters = 1; + + pars->bit_depth = seq_params->bit_depth; + + if (!pars->update_parameters) { + // inherit parameters from a previous reference frame + int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3); + // Section 6.8.20: It is a requirement of bitstream conformance that + // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value + // of j in the range 0 to REFS_PER_FRAME - 1. + int found = 0; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) { + found = 1; + break; + } + } + if (!found) { + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Invalid film grain reference idx %d. 
ref_frame_idx = " + "{%d, %d, %d, %d, %d, %d, %d}", + film_grain_params_ref_idx, cm->remapped_ref_idx[0], + cm->remapped_ref_idx[1], cm->remapped_ref_idx[2], + cm->remapped_ref_idx[3], cm->remapped_ref_idx[4], + cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]); + } + RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx]; + if (buf == NULL) { + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Invalid Film grain reference idx"); + } + if (!buf->film_grain_params_present) { + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Film grain reference parameters not available"); + } + uint16_t random_seed = pars->random_seed; + *pars = buf->film_grain_params; // inherit paramaters + pars->random_seed = random_seed; // with new random seed + return; + } + + // Scaling functions parameters + pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14 + if (pars->num_y_points > 14) + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain luma scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_y_points; i++) { + pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8); + if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0]) + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8); + } + + if (!seq_params->monochrome) + pars->chroma_scaling_from_luma = aom_rb_read_bit(rb); + else + pars->chroma_scaling_from_luma = 0; + + if (seq_params->monochrome || pars->chroma_scaling_from_luma || + ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && + (pars->num_y_points == 0))) { + pars->num_cb_points = 0; + pars->num_cr_points = 0; + } else { + pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10 + if (pars->num_cb_points > 10) + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain cb scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_cb_points; i++) { + pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8); + if (i && + pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0]) + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8); + } + + pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10 + if (pars->num_cr_points > 10) + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain cr scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_cr_points; i++) { + pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8); + if (i && + pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0]) + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8); + } + + if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && + (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) || + ((pars->num_cb_points != 0) && (pars->num_cr_points == 0)))) + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "In YCbCr 4:2:0, film grain shall be applied " + "to both chroma components or neither."); + } + + pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8; // 8 + 
value + + // AR coefficients + // Only sent if the corresponsing scaling function has + // more than 0 points + + pars->ar_coeff_lag = aom_rb_read_literal(rb, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128; + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128; + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128; + + pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6; // 6 + value + + pars->grain_scale_shift = aom_rb_read_literal(rb, 2); + + if (pars->num_cb_points) { + pars->cb_mult = aom_rb_read_literal(rb, 8); + pars->cb_luma_mult = aom_rb_read_literal(rb, 8); + pars->cb_offset = aom_rb_read_literal(rb, 9); + } + + if (pars->num_cr_points) { + pars->cr_mult = aom_rb_read_literal(rb, 8); + pars->cr_luma_mult = aom_rb_read_literal(rb, 8); + pars->cr_offset = aom_rb_read_literal(rb, 9); + } + + pars->overlap_flag = aom_rb_read_bit(rb); + + pars->clip_to_restricted_range = aom_rb_read_bit(rb); +} + +static AOM_INLINE void read_film_grain(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + if (cm->seq_params->film_grain_params_present && + (cm->show_frame || cm->showable_frame)) { + av1_read_film_grain_params(cm, rb); + } else { + memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); + } + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; + memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params, + sizeof(aom_film_grain_t)); +} + +void av1_read_color_config(struct aom_read_bit_buffer *rb, + int allow_lowbitdepth, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info) { + read_bitdepth(rb, seq_params, error_info); + + seq_params->use_highbitdepth = + seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth; + // monochrome bit (not needed for PROFILE_1) + const int is_monochrome = + seq_params->profile != PROFILE_1 ? 
aom_rb_read_bit(rb) : 0; + seq_params->monochrome = is_monochrome; + int color_description_present_flag = aom_rb_read_bit(rb); + if (color_description_present_flag) { + seq_params->color_primaries = aom_rb_read_literal(rb, 8); + seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8); + seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8); + } else { + seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED; + seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; + seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; + } + if (is_monochrome) { + // [16,235] (including xvycc) vs [0,255] range + seq_params->color_range = aom_rb_read_bit(rb); + seq_params->subsampling_y = seq_params->subsampling_x = 1; + seq_params->chroma_sample_position = AOM_CSP_UNKNOWN; + seq_params->separate_uv_delta_q = 0; + return; + } + if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + seq_params->subsampling_y = seq_params->subsampling_x = 0; + seq_params->color_range = 1; // assume full color-range + if (!(seq_params->profile == PROFILE_1 || + (seq_params->profile == PROFILE_2 && + seq_params->bit_depth == AOM_BITS_12))) { + aom_internal_error( + error_info, AOM_CODEC_UNSUP_BITSTREAM, + "sRGB colorspace not compatible with specified profile"); + } + } else { + // [16,235] (including xvycc) vs [0,255] range + seq_params->color_range = aom_rb_read_bit(rb); + if (seq_params->profile == PROFILE_0) { + // 420 only + seq_params->subsampling_x = seq_params->subsampling_y = 1; + } else if (seq_params->profile == PROFILE_1) { + // 444 only + seq_params->subsampling_x = seq_params->subsampling_y = 0; + } else { + assert(seq_params->profile == PROFILE_2); + if (seq_params->bit_depth == AOM_BITS_12) { + seq_params->subsampling_x = aom_rb_read_bit(rb); + if (seq_params->subsampling_x) + seq_params->subsampling_y = aom_rb_read_bit(rb); // 422 or 420 + else + seq_params->subsampling_y = 0; // 444 + } else { + // 422 + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 0; + } + } + if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY && + (seq_params->subsampling_x || seq_params->subsampling_y)) { + aom_internal_error( + error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Identity CICP Matrix incompatible with non 4:4:4 color sampling"); + } + if (seq_params->subsampling_x && seq_params->subsampling_y) { + seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2); + } + } + seq_params->separate_uv_delta_q = aom_rb_read_bit(rb); +} + +void av1_read_timing_info_header(aom_timing_info_t *timing_info, + struct aom_internal_error_info *error, + struct aom_read_bit_buffer *rb) { + timing_info->num_units_in_display_tick = + aom_rb_read_unsigned_literal(rb, + 32); // Number of units in a display tick + timing_info->time_scale = aom_rb_read_unsigned_literal(rb, 32); // Time scale + if (timing_info->num_units_in_display_tick == 0 || + timing_info->time_scale == 0) { + aom_internal_error( + error, AOM_CODEC_UNSUP_BITSTREAM, + "num_units_in_display_tick and time_scale must be greater than 0."); + } + timing_info->equal_picture_interval = + aom_rb_read_bit(rb); // Equal picture interval bit + if (timing_info->equal_picture_interval) { + const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb); + if (num_ticks_per_picture_minus_1 == UINT32_MAX) { + aom_internal_error( + error, AOM_CODEC_UNSUP_BITSTREAM, + "num_ticks_per_picture_minus_1 cannot be (1 << 32) - 
1."); + } + timing_info->num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1; + } +} + +void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info, + struct aom_read_bit_buffer *rb) { + decoder_model_info->encoder_decoder_buffer_delay_length = + aom_rb_read_literal(rb, 5) + 1; + decoder_model_info->num_units_in_decoding_tick = + aom_rb_read_unsigned_literal(rb, + 32); // Number of units in a decoding tick + decoder_model_info->buffer_removal_time_length = + aom_rb_read_literal(rb, 5) + 1; + decoder_model_info->frame_presentation_time_length = + aom_rb_read_literal(rb, 5) + 1; +} + +void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params, + int buffer_delay_length, + struct aom_read_bit_buffer *rb) { + op_params->decoder_buffer_delay = + aom_rb_read_unsigned_literal(rb, buffer_delay_length); + op_params->encoder_buffer_delay = + aom_rb_read_unsigned_literal(rb, buffer_delay_length); + op_params->low_delay_mode_flag = aom_rb_read_bit(rb); +} + +static AOM_INLINE void read_temporal_point_info( + AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { + cm->frame_presentation_time = aom_rb_read_unsigned_literal( + rb, cm->seq_params->decoder_model_info.frame_presentation_time_length); +} + +void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + SequenceHeader *seq_params) { + const int num_bits_width = aom_rb_read_literal(rb, 4) + 1; + const int num_bits_height = aom_rb_read_literal(rb, 4) + 1; + const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1; + const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1; + + seq_params->num_bits_width = num_bits_width; + seq_params->num_bits_height = num_bits_height; + seq_params->max_frame_width = max_frame_width; + seq_params->max_frame_height = max_frame_height; + + if (seq_params->reduced_still_picture_hdr) { + seq_params->frame_id_numbers_present_flag = 0; + } else { + seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb); + } + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction. 
+ seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2; + seq_params->frame_id_length = + aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1; + if (seq_params->frame_id_length > 16) + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid frame_id_length"); + } + + setup_sb_size(seq_params, rb); + + seq_params->enable_filter_intra = aom_rb_read_bit(rb); + seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb); + + if (seq_params->reduced_still_picture_hdr) { + seq_params->enable_interintra_compound = 0; + seq_params->enable_masked_compound = 0; + seq_params->enable_warped_motion = 0; + seq_params->enable_dual_filter = 0; + seq_params->order_hint_info.enable_order_hint = 0; + seq_params->order_hint_info.enable_dist_wtd_comp = 0; + seq_params->order_hint_info.enable_ref_frame_mvs = 0; + seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + seq_params->order_hint_info.order_hint_bits_minus_1 = -1; + } else { + seq_params->enable_interintra_compound = aom_rb_read_bit(rb); + seq_params->enable_masked_compound = aom_rb_read_bit(rb); + seq_params->enable_warped_motion = aom_rb_read_bit(rb); + seq_params->enable_dual_filter = aom_rb_read_bit(rb); + + seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb); + seq_params->order_hint_info.enable_dist_wtd_comp = + seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0; + seq_params->order_hint_info.enable_ref_frame_mvs = + seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0; + + if (aom_rb_read_bit(rb)) { + seq_params->force_screen_content_tools = + 2; // SELECT_SCREEN_CONTENT_TOOLS + } else { + seq_params->force_screen_content_tools = aom_rb_read_bit(rb); + } + + if (seq_params->force_screen_content_tools > 0) { + if (aom_rb_read_bit(rb)) { + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + } else { + seq_params->force_integer_mv = aom_rb_read_bit(rb); + } + } else { + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + } + seq_params->order_hint_info.order_hint_bits_minus_1 = + seq_params->order_hint_info.enable_order_hint + ? aom_rb_read_literal(rb, 3) + : -1; + } + + seq_params->enable_superres = aom_rb_read_bit(rb); + seq_params->enable_cdef = aom_rb_read_bit(rb); + seq_params->enable_restoration = aom_rb_read_bit(rb); +} + +static int read_global_motion_params(WarpedMotionParams *params, + const WarpedMotionParams *ref_params, + struct aom_read_bit_buffer *rb, + int allow_hp) { + TransformationType type = aom_rb_read_bit(rb); + if (type != IDENTITY) { + if (aom_rb_read_bit(rb)) + type = ROTZOOM; + else + type = aom_rb_read_bit(rb) ? 
TRANSLATION : AFFINE; + } + + *params = default_warp_params; + params->wmtype = type; + + if (type >= ROTZOOM) { + params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS)) * + GM_ALPHA_DECODE_FACTOR + + (1 << WARPEDMODEL_PREC_BITS); + params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) * + GM_ALPHA_DECODE_FACTOR; + } + + if (type >= AFFINE) { + params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) * + GM_ALPHA_DECODE_FACTOR; + params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS)) * + GM_ALPHA_DECODE_FACTOR + + (1 << WARPEDMODEL_PREC_BITS); + } else { + params->wmmat[4] = -params->wmmat[3]; + params->wmmat[5] = params->wmmat[2]; + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_dec_factor = + (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp) + : GM_TRANS_DECODE_FACTOR; + const int trans_prec_diff = (type == TRANSLATION) + ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin( + rb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff)) * + trans_dec_factor; + params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin( + rb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff)) * + trans_dec_factor; + } + + int good_shear_params = av1_get_shear_params(params); + if (!good_shear_params) return 0; + + return 1; +} + +static AOM_INLINE void read_global_motion(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + const WarpedMotionParams *ref_params = + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; + int good_params = + read_global_motion_params(&cm->global_motion[frame], ref_params, rb, + cm->features.allow_high_precision_mv); + if (!good_params) { +#if WARPED_MOTION_DEBUG + printf("Warning: unexpected global motion shear params from aomenc\n"); +#endif + cm->global_motion[frame].invalid = 1; + } + + // TODO(sarahparker, debargha): The logic in the commented out code below + // does not work currently and causes mismatches when resize is on. Fix it + // before turning the optimization back on. 
+ /* + YV12_BUFFER_CONFIG *ref_buf = get_ref_frame(cm, frame); + if (cm->width == ref_buf->y_crop_width && + cm->height == ref_buf->y_crop_height) { + read_global_motion_params(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], rb, + cm->features.allow_high_precision_mv); + } else { + cm->global_motion[frame] = default_warp_params; + } + */ + /* + printf("Dec Ref %d [%d/%d]: %d %d %d %d\n", + frame, cm->current_frame.frame_number, cm->show_frame, + cm->global_motion[frame].wmmat[0], + cm->global_motion[frame].wmmat[1], + cm->global_motion[frame].wmmat[2], + cm->global_motion[frame].wmmat[3]); + */ + } + memcpy(cm->cur_frame->global_motion, cm->global_motion, + REF_FRAMES * sizeof(WarpedMotionParams)); +} + +// Release the references to the frame buffers in cm->ref_frame_map and reset +// all elements of cm->ref_frame_map to NULL. +static AOM_INLINE void reset_ref_frame_map(AV1_COMMON *const cm) { + BufferPool *const pool = cm->buffer_pool; + + for (int i = 0; i < REF_FRAMES; i++) { + decrease_ref_count(cm->ref_frame_map[i], pool); + cm->ref_frame_map[i] = NULL; + } +} + +// If the refresh_frame_flags bitmask is set, update reference frame id values +// and mark frames as valid for reference. +static AOM_INLINE void update_ref_frame_id(AV1Decoder *const pbi) { + AV1_COMMON *const cm = &pbi->common; + int refresh_frame_flags = cm->current_frame.refresh_frame_flags; + for (int i = 0; i < REF_FRAMES; i++) { + if ((refresh_frame_flags >> i) & 1) { + cm->ref_frame_id[i] = cm->current_frame_id; + pbi->valid_for_referencing[i] = 1; + } + } +} + +static AOM_INLINE void show_existing_frame_reset(AV1Decoder *const pbi, + int existing_frame_idx) { + AV1_COMMON *const cm = &pbi->common; + + assert(cm->show_existing_frame); + + cm->current_frame.frame_type = KEY_FRAME; + + cm->current_frame.refresh_frame_flags = (1 << REF_FRAMES) - 1; + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + cm->remapped_ref_idx[i] = INVALID_IDX; + } + + if (pbi->need_resync) { + reset_ref_frame_map(cm); + pbi->need_resync = 0; + } + + // Note that the displayed frame must be valid for referencing in order to + // have been selected. + cm->current_frame_id = cm->ref_frame_id[existing_frame_idx]; + update_ref_frame_id(pbi); + + cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; +} + +static INLINE void reset_frame_buffers(AV1_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + lock_buffer_pool(cm->buffer_pool); + reset_ref_frame_map(cm); + assert(cm->cur_frame->ref_count == 1); + for (i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) { + // Reset all unreferenced frame buffers. We can also reset cm->cur_frame + // because we are the sole owner of cm->cur_frame. + if (frame_bufs[i].ref_count > 0 && &frame_bufs[i] != cm->cur_frame) { + continue; + } + frame_bufs[i].order_hint = 0; + av1_zero(frame_bufs[i].ref_order_hints); + } + av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers); + unlock_buffer_pool(cm->buffer_pool); +} + +// On success, returns 0. On failure, calls aom_internal_error and does not +// return. 
+static int read_uncompressed_header(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + const SequenceHeader *const seq_params = cm->seq_params; + CurrentFrame *const current_frame = &cm->current_frame; + FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &pbi->dcb.xd; + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = pool->frame_bufs; + aom_s_frame_info *sframe_info = &pbi->sframe_info; + sframe_info->is_s_frame = 0; + sframe_info->is_s_frame_at_altref = 0; + + if (!pbi->sequence_header_ready) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "No sequence header"); + } + + if (seq_params->reduced_still_picture_hdr) { + cm->show_existing_frame = 0; + cm->show_frame = 1; + current_frame->frame_type = KEY_FRAME; + if (pbi->sequence_header_changed) { + // This is the start of a new coded video sequence. + pbi->sequence_header_changed = 0; + pbi->decoding_first_frame = 1; + reset_frame_buffers(cm); + } + features->error_resilient_mode = 1; + } else { + cm->show_existing_frame = aom_rb_read_bit(rb); + pbi->reset_decoder_state = 0; + + if (cm->show_existing_frame) { + if (pbi->sequence_header_changed) { + aom_internal_error( + &pbi->error, AOM_CODEC_CORRUPT_FRAME, + "New sequence header starts with a show_existing_frame."); + } + // Show an existing frame directly. + const int existing_frame_idx = aom_rb_read_literal(rb, 3); + RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx]; + if (frame_to_show == NULL) { + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer does not contain a decoded frame"); + } + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) { + read_temporal_point_info(cm, rb); + } + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int display_frame_id = aom_rb_read_literal(rb, frame_id_length); + /* Compare display_frame_id with ref_frame_id and check valid for + * referencing */ + if (display_frame_id != cm->ref_frame_id[existing_frame_idx] || + pbi->valid_for_referencing[existing_frame_idx] == 0) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Reference buffer frame ID mismatch"); + } + lock_buffer_pool(pool); + assert(frame_to_show->ref_count > 0); + // cm->cur_frame should be the buffer referenced by the return value + // of the get_free_fb() call in assign_cur_frame_new_fb() (called by + // av1_receive_compressed_data()), so the ref_count should be 1. + assert(cm->cur_frame->ref_count == 1); + // assign_frame_buffer_p() decrements ref_count directly rather than + // call decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has + // already been allocated, it will not be released by + // assign_frame_buffer_p()! + assert(!cm->cur_frame->raw_frame_buffer.data); + assign_frame_buffer_p(&cm->cur_frame, frame_to_show); + pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME; + unlock_buffer_pool(pool); + + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + cm->show_frame = 1; + current_frame->order_hint = frame_to_show->order_hint; + + // Section 6.8.2: It is a requirement of bitstream conformance that when + // show_existing_frame is used to show a previous frame, that the value + // of showable_frame for the previous frame was equal to 1. 
+ if (!frame_to_show->showable_frame) { + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer does not contain a showable frame"); + } + // Section 6.8.2: It is a requirement of bitstream conformance that when + // show_existing_frame is used to show a previous frame with + // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, that the + // frame is output via the show_existing_frame mechanism at most once. + if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0; + + cm->film_grain_params = frame_to_show->film_grain_params; + + if (pbi->reset_decoder_state) { + show_existing_frame_reset(pbi, existing_frame_idx); + } else { + current_frame->refresh_frame_flags = 0; + } + + return 0; + } + + current_frame->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); + if (pbi->sequence_header_changed) { + if (current_frame->frame_type == KEY_FRAME) { + // This is the start of a new coded video sequence. + pbi->sequence_header_changed = 0; + pbi->decoding_first_frame = 1; + reset_frame_buffers(cm); + } else { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Sequence header has changed without a keyframe."); + } + } + + cm->show_frame = aom_rb_read_bit(rb); + if (cm->show_frame == 0) pbi->is_arf_frame_present = 1; + if (cm->show_frame == 0 && cm->current_frame.frame_type == KEY_FRAME) + pbi->is_fwd_kf_present = 1; + if (cm->current_frame.frame_type == S_FRAME) { + sframe_info->is_s_frame = 1; + sframe_info->is_s_frame_at_altref = cm->show_frame ? 0 : 1; + } + if (seq_params->still_picture && + (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Still pictures must be coded as shown keyframes"); + } + cm->showable_frame = current_frame->frame_type != KEY_FRAME; + if (cm->show_frame) { + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) + read_temporal_point_info(cm, rb); + } else { + // See if this frame can be used as show_existing_frame in future + cm->showable_frame = aom_rb_read_bit(rb); + } + cm->cur_frame->showable_frame = cm->showable_frame; + features->error_resilient_mode = + frame_is_sframe(cm) || + (current_frame->frame_type == KEY_FRAME && cm->show_frame) + ? 
1 + : aom_rb_read_bit(rb); + } + + if (current_frame->frame_type == KEY_FRAME && cm->show_frame) { + /* All frames need to be marked as not valid for referencing */ + for (int i = 0; i < REF_FRAMES; i++) { + pbi->valid_for_referencing[i] = 0; + } + } + features->disable_cdf_update = aom_rb_read_bit(rb); + if (seq_params->force_screen_content_tools == 2) { + features->allow_screen_content_tools = aom_rb_read_bit(rb); + } else { + features->allow_screen_content_tools = + seq_params->force_screen_content_tools; + } + + if (features->allow_screen_content_tools) { + if (seq_params->force_integer_mv == 2) { + features->cur_frame_force_integer_mv = aom_rb_read_bit(rb); + } else { + features->cur_frame_force_integer_mv = seq_params->force_integer_mv; + } + } else { + features->cur_frame_force_integer_mv = 0; + } + + int frame_size_override_flag = 0; + features->allow_intrabc = 0; + features->primary_ref_frame = PRIMARY_REF_NONE; + + if (!seq_params->reduced_still_picture_hdr) { + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int prev_frame_id = 0; + int have_prev_frame_id = + !pbi->decoding_first_frame && + !(current_frame->frame_type == KEY_FRAME && cm->show_frame); + if (have_prev_frame_id) { + prev_frame_id = cm->current_frame_id; + } + cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length); + + if (have_prev_frame_id) { + int diff_frame_id; + if (cm->current_frame_id > prev_frame_id) { + diff_frame_id = cm->current_frame_id - prev_frame_id; + } else { + diff_frame_id = + (1 << frame_id_length) + cm->current_frame_id - prev_frame_id; + } + /* Check current_frame_id for conformance */ + if (prev_frame_id == cm->current_frame_id || + diff_frame_id >= (1 << (frame_id_length - 1))) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid value of current_frame_id"); + } + } + /* Check if some frames need to be marked as not valid for referencing */ + for (int i = 0; i < REF_FRAMES; i++) { + if (cm->current_frame_id - (1 << diff_len) > 0) { + if (cm->ref_frame_id[i] > cm->current_frame_id || + cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len)) + pbi->valid_for_referencing[i] = 0; + } else { + if (cm->ref_frame_id[i] > cm->current_frame_id && + cm->ref_frame_id[i] < (1 << frame_id_length) + + cm->current_frame_id - (1 << diff_len)) + pbi->valid_for_referencing[i] = 0; + } + } + } + + frame_size_override_flag = frame_is_sframe(cm) ? 
1 : aom_rb_read_bit(rb); + + current_frame->order_hint = aom_rb_read_literal( + rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + + if (seq_params->order_hint_info.enable_order_hint) + current_frame->frame_number = current_frame->order_hint; + + if (!features->error_resilient_mode && !frame_is_intra_only(cm)) { + features->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS); + } + } + + if (seq_params->decoder_model_info_present_flag) { + pbi->buffer_removal_time_present = aom_rb_read_bit(rb); + if (pbi->buffer_removal_time_present) { + for (int op_num = 0; + op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { + if (seq_params->op_params[op_num].decoder_model_param_present_flag) { + if (seq_params->operating_point_idc[op_num] == 0 || + (((seq_params->operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1) && + ((seq_params->operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1))) { + cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal( + rb, seq_params->decoder_model_info.buffer_removal_time_length); + } else { + cm->buffer_removal_times[op_num] = 0; + } + } else { + cm->buffer_removal_times[op_num] = 0; + } + } + } + } + if (current_frame->frame_type == KEY_FRAME) { + if (!cm->show_frame) { // unshown keyframe (forward keyframe) + current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); + } else { // shown keyframe + current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1; + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + cm->remapped_ref_idx[i] = INVALID_IDX; + } + if (pbi->need_resync) { + reset_ref_frame_map(cm); + pbi->need_resync = 0; + } + } else { + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); + if (current_frame->refresh_frame_flags == 0xFF) { + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, + "Intra only frames cannot have refresh flags 0xFF"); + } + if (pbi->need_resync) { + reset_ref_frame_map(cm); + pbi->need_resync = 0; + } + } else if (pbi->need_resync != 1) { /* Skip if need resync */ + current_frame->refresh_frame_flags = + frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES); + } + } + + if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) { + // Read all ref frame order hints if error_resilient_mode == 1 + if (features->error_resilient_mode && + seq_params->order_hint_info.enable_order_hint) { + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + // Read order hint from bit stream + unsigned int order_hint = aom_rb_read_literal( + rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + // Get buffer + RefCntBuffer *buf = cm->ref_frame_map[ref_idx]; + if (buf == NULL || order_hint != buf->order_hint) { + if (buf != NULL) { + lock_buffer_pool(pool); + decrease_ref_count(buf, pool); + unlock_buffer_pool(pool); + cm->ref_frame_map[ref_idx] = NULL; + } + // If no corresponding buffer exists, allocate a new buffer with all + // pixels set to neutral grey. 
+ int buf_idx = get_free_fb(cm); + if (buf_idx == INVALID_IDX) { + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + } + buf = &frame_bufs[buf_idx]; + lock_buffer_pool(pool); + if (aom_realloc_frame_buffer( + &buf->buf, seq_params->max_frame_width, + seq_params->max_frame_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, features->byte_alignment, + &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0, + 0)) { + decrease_ref_count(buf, pool); + unlock_buffer_pool(pool); + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + unlock_buffer_pool(pool); + // According to the specification, valid bitstreams are required to + // never use missing reference frames so the filling process for + // missing frames is not normatively defined and RefValid for missing + // frames is set to 0. + + // To make libaom more robust when the bitstream has been corrupted + // by the loss of some frames of data, this code adds a neutral grey + // buffer in place of missing frames, i.e. + // + set_planes_to_neutral_grey(seq_params, &buf->buf, 0); + // + // and allows the frames to be used for referencing, i.e. + // + pbi->valid_for_referencing[ref_idx] = 1; + // + // Please note such behavior is not normative and other decoders may + // use a different approach. + cm->ref_frame_map[ref_idx] = buf; + buf->order_hint = order_hint; + } + } + } + } + + if (current_frame->frame_type == KEY_FRAME) { + setup_frame_size(cm, frame_size_override_flag, rb); + + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + features->allow_intrabc = aom_rb_read_bit(rb); + features->allow_ref_frame_mvs = 0; + cm->prev_frame = NULL; + } else { + features->allow_ref_frame_mvs = 0; + + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + cm->cur_frame->film_grain_params_present = + seq_params->film_grain_params_present; + setup_frame_size(cm, frame_size_override_flag, rb); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + features->allow_intrabc = aom_rb_read_bit(rb); + + } else if (pbi->need_resync != 1) { /* Skip if need resync */ + int frame_refs_short_signaling = 0; + // Frame refs short signaling is off when error resilient mode is on. + if (seq_params->order_hint_info.enable_order_hint) + frame_refs_short_signaling = aom_rb_read_bit(rb); + + if (frame_refs_short_signaling) { + // == LAST_FRAME == + const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref]; + + // == GOLDEN_FRAME == + const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref]; + + // Most of the time, streams start with a keyframe. In that case, + // ref_frame_map will have been filled in at that point and will not + // contain any NULLs. However, streams are explicitly allowed to start + // with an intra-only frame, so long as they don't then signal a + // reference to a slot that hasn't been set yet. That's what we are + // checking here. 
+ if (lst_buf == NULL) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + if (gld_buf == NULL) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + + av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref); + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + int ref = 0; + if (!frame_refs_short_signaling) { + ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + + // Most of the time, streams start with a keyframe. In that case, + // ref_frame_map will have been filled in at that point and will not + // contain any NULLs. However, streams are explicitly allowed to start + // with an intra-only frame, so long as they don't then signal a + // reference to a slot that hasn't been set yet. That's what we are + // checking here. + if (cm->ref_frame_map[ref] == NULL) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + cm->remapped_ref_idx[i] = ref; + } else { + ref = cm->remapped_ref_idx[i]; + } + // Check valid for referencing + if (pbi->valid_for_referencing[ref] == 0) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Reference frame not valid for referencing"); + + cm->ref_frame_sign_bias[LAST_FRAME + i] = 0; + + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len); + int ref_frame_id = + ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) + + (1 << frame_id_length)) % + (1 << frame_id_length)); + // Compare values derived from delta_frame_id_minus_1 and + // refresh_frame_flags. + if (ref_frame_id != cm->ref_frame_id[ref]) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Reference buffer frame ID mismatch"); + } + } + + if (!features->error_resilient_mode && frame_size_override_flag) { + setup_frame_size_with_refs(cm, rb); + } else { + setup_frame_size(cm, frame_size_override_flag, rb); + } + + if (features->cur_frame_force_integer_mv) { + features->allow_high_precision_mv = 0; + } else { + features->allow_high_precision_mv = aom_rb_read_bit(rb); + } + features->interp_filter = read_frame_interp_filter(rb); + features->switchable_motion_mode = aom_rb_read_bit(rb); + } + + cm->prev_frame = get_primary_ref_frame_buf(cm); + if (features->primary_ref_frame != PRIMARY_REF_NONE && + get_primary_ref_frame_buf(cm) == NULL) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Reference frame containing this frame's initial " + "frame context is unavailable."); + } + + if (!(current_frame->frame_type == INTRA_ONLY_FRAME) && + pbi->need_resync != 1) { + if (frame_might_allow_ref_frame_mvs(cm)) + features->allow_ref_frame_mvs = aom_rb_read_bit(rb); + else + features->allow_ref_frame_mvs = 0; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); + struct scale_factors *const ref_scale_factors = + get_ref_scale_factors(cm, i); + av1_setup_scale_factors_for_frame( + ref_scale_factors, ref_buf->buf.y_crop_width, + ref_buf->buf.y_crop_height, cm->width, cm->height); + if ((!av1_is_valid_scale(ref_scale_factors))) + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + } + } + } + + av1_setup_frame_buf_refs(cm); + + av1_setup_frame_sign_bias(cm); + + cm->cur_frame->frame_type = current_frame->frame_type; + + 
update_ref_frame_id(pbi); + + const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && + !(features->disable_cdf_update); + if (might_bwd_adapt) { + features->refresh_frame_context = aom_rb_read_bit(rb) + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + } else { + features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + } + + cm->cur_frame->buf.bit_depth = seq_params->bit_depth; + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = + seq_params->transfer_characteristics; + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = + seq_params->chroma_sample_position; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; + + if (pbi->need_resync) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } + + if (features->allow_intrabc) { + // Set parameters corresponding to no filtering. + struct loopfilter *lf = &cm->lf; + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.nb_cdef_strengths = 1; + cm->cdef_info.cdef_uv_strengths[0] = 0; + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + + read_tile_info(pbi, rb); + if (!av1_is_min_tile_width_satisfied(cm)) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Minimum tile width requirement not satisfied"); + } + + CommonQuantParams *const quant_params = &cm->quant_params; + setup_quantization(quant_params, av1_num_planes(cm), + cm->seq_params->separate_uv_delta_q, rb); + xd->bd = (int)seq_params->bit_depth; + + CommonContexts *const above_contexts = &cm->above_contexts; + if (above_contexts->num_planes < av1_num_planes(cm) || + above_contexts->num_mi_cols < cm->mi_params.mi_cols || + above_contexts->num_tile_rows < cm->tiles.rows) { + av1_free_above_context_buffers(above_contexts); + if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, + cm->mi_params.mi_cols, + av1_num_planes(cm))) { + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + } + + if (features->primary_ref_frame == PRIMARY_REF_NONE) { + av1_setup_past_independence(cm); + } + + setup_segmentation(cm, rb); + + cm->delta_q_info.delta_q_res = 1; + cm->delta_q_info.delta_lf_res = 1; + cm->delta_q_info.delta_lf_present_flag = 0; + cm->delta_q_info.delta_lf_multi = 0; + cm->delta_q_info.delta_q_present_flag = + quant_params->base_qindex > 0 ? 
aom_rb_read_bit(rb) : 0; + if (cm->delta_q_info.delta_q_present_flag) { + xd->current_base_qindex = quant_params->base_qindex; + cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2); + if (!features->allow_intrabc) + cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb); + if (cm->delta_q_info.delta_lf_present_flag) { + cm->delta_q_info.delta_lf_res = 1 << aom_rb_read_literal(rb, 2); + cm->delta_q_info.delta_lf_multi = aom_rb_read_bit(rb); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + + xd->cur_frame_force_integer_mv = features->cur_frame_force_integer_mv; + + for (int i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = av1_get_qindex(&cm->seg, i, quant_params->base_qindex); + xd->lossless[i] = + qindex == 0 && quant_params->y_dc_delta_q == 0 && + quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 && + quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0; + xd->qindex[i] = qindex; + } + features->coded_lossless = is_coded_lossless(cm, xd); + features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm); + setup_segmentation_dequant(cm, xd); + if (features->coded_lossless) { + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + } + if (features->coded_lossless || !seq_params->enable_cdef) { + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.cdef_uv_strengths[0] = 0; + } + if (features->all_lossless || !seq_params->enable_restoration) { + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + setup_loopfilter(cm, rb); + + if (!features->coded_lossless && seq_params->enable_cdef) { + setup_cdef(cm, rb); + } + if (!features->all_lossless && seq_params->enable_restoration) { + decode_restoration_mode(cm, rb); + } + + features->tx_mode = read_tx_mode(rb, features->coded_lossless); + current_frame->reference_mode = read_frame_reference_mode(cm, rb); + + av1_setup_skip_mode_allowed(cm); + current_frame->skip_mode_info.skip_mode_flag = + current_frame->skip_mode_info.skip_mode_allowed ? 
aom_rb_read_bit(rb) : 0; + + if (frame_might_allow_warped_motion(cm)) + features->allow_warped_motion = aom_rb_read_bit(rb); + else + features->allow_warped_motion = 0; + + features->reduced_tx_set_used = aom_rb_read_bit(rb); + + if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Frame wrongly requests reference frame MVs"); + } + + if (!frame_is_intra_only(cm)) read_global_motion(cm, rb); + + cm->cur_frame->film_grain_params_present = + seq_params->film_grain_params_present; + read_film_grain(cm, rb); + +#if EXT_TILE_DEBUG + if (pbi->ext_tile_debug && cm->tiles.large_scale) { + read_ext_tile_info(pbi, rb); + av1_set_single_tile_decoding_mode(cm); + } +#endif // EXT_TILE_DEBUG + return 0; +} + +struct aom_read_bit_buffer *av1_init_read_bit_buffer( + AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, + const uint8_t *data_end) { + rb->bit_offset = 0; + rb->error_handler = error_handler; + rb->error_handler_data = &pbi->common; + rb->bit_buffer = data; + rb->bit_buffer_end = data_end; + return rb; +} + +void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, + int num_bits_height, int *width, int *height) { + *width = aom_rb_read_literal(rb, num_bits_width) + 1; + *height = aom_rb_read_literal(rb, num_bits_height) + 1; +} + +BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) { + int profile = aom_rb_read_literal(rb, PROFILE_BITS); + return (BITSTREAM_PROFILE)profile; +} + +static AOM_INLINE void superres_post_decode(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + if (!av1_superres_scaled(cm)) return; + assert(!cm->features.all_lossless); + + av1_superres_upscale(cm, pool, 0); +} + +uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + int trailing_bits_present) { + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &pbi->dcb.xd; + +#if CONFIG_BITSTREAM_DEBUG + if (cm->seq_params->order_hint_info.enable_order_hint) { + aom_bitstream_queue_set_frame_read(cm->current_frame.order_hint * 2 + + cm->show_frame); + } else { + // This is currently used in RTC encoding. cm->show_frame is always 1. + assert(cm->show_frame); + aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number); + } +#endif +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_r(); +#endif + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + cm->global_motion[i] = default_warp_params; + cm->cur_frame->global_motion[i] = default_warp_params; + } + xd->global_motion = cm->global_motion; + + read_uncompressed_header(pbi, rb); + + if (trailing_bits_present) av1_check_trailing_bits(pbi, rb); + + if (!cm->tiles.single_tile_decoding && + (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) { + pbi->dec_tile_row = -1; + pbi->dec_tile_col = -1; + } + + const uint32_t uncomp_hdr_size = + (uint32_t)aom_rb_bytes_read(rb); // Size of the uncompressed header + YV12_BUFFER_CONFIG *new_fb = &cm->cur_frame->buf; + xd->cur_buf = new_fb; + if (av1_allow_intrabc(cm)) { + av1_setup_scale_factors_for_frame( + &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height, + xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height); + } + + // Showing a frame directly. + if (cm->show_existing_frame) { + if (pbi->reset_decoder_state) { + // Use the default frame context values. 
+ *cm->fc = *cm->default_frame_context; + if (!cm->fc->initialized) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + } + return uncomp_hdr_size; + } + + cm->mi_params.setup_mi(&cm->mi_params); + + av1_calculate_ref_frame_side(cm); + if (cm->features.allow_ref_frame_mvs) av1_setup_motion_field(cm); + + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + // use the default frame context values + *cm->fc = *cm->default_frame_context; + } else { + *cm->fc = get_primary_ref_frame_buf(cm)->frame_context; + } + if (!cm->fc->initialized) + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + + pbi->dcb.corrupted = 0; + return uncomp_hdr_size; +} + +// Once-per-frame initialization +static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + av1_alloc_restoration_buffers(cm, /*is_sgr_enabled =*/true); + for (int p = 0; p < av1_num_planes(cm); p++) { + av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0); + } + } + + const int use_highbd = cm->seq_params->use_highbitdepth; + const int buf_size = MC_TEMP_BUF_PELS << use_highbd; + if (pbi->td.mc_buf_size != buf_size) { + av1_free_mc_tmp_buf(&pbi->td); + allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd); + } +} + +void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, int start_tile, + int end_tile, int initialize_flag) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + MACROBLOCKD *const xd = &pbi->dcb.xd; + const int tile_count_tg = end_tile - start_tile + 1; + + xd->error_info = cm->error; + if (initialize_flag) setup_frame_info(pbi); + const int num_planes = av1_num_planes(cm); + + if (pbi->max_threads > 1 && !(tiles->large_scale && !pbi->ext_tile_debug) && + pbi->row_mt) + *p_data_end = + decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile); + else if (pbi->max_threads > 1 && tile_count_tg > 1 && + !(tiles->large_scale && !pbi->ext_tile_debug)) + *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile); + else + *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile); + + // If the bit stream is monochrome, set the U and V buffers to a constant. 
+ if (num_planes < 3) { + set_planes_to_neutral_grey(cm->seq_params, xd->cur_buf, 1); + } + + if (end_tile != tiles->rows * tiles->cols - 1) { + return; + } + + av1_alloc_cdef_buffers(cm, &pbi->cdef_worker, &pbi->cdef_sync, + pbi->num_workers, 1); + av1_alloc_cdef_sync(cm, &pbi->cdef_sync, pbi->num_workers); + + if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) { + if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { + av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &pbi->dcb.xd, 0, + num_planes, 0, pbi->tile_workers, + pbi->num_workers, &pbi->lf_row_sync, 0); + } + + const int do_cdef = + !pbi->skip_loop_filter && !cm->features.coded_lossless && + (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] || + cm->cdef_info.cdef_uv_strengths[0]); + const int do_superres = av1_superres_scaled(cm); + const int optimized_loop_restoration = !do_cdef && !do_superres; + const int do_loop_restoration = + cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE; + // Frame border extension is not required in the decoder + // as it happens in extend_mc_border(). + int do_extend_border_mt = 0; + if (!optimized_loop_restoration) { + if (do_loop_restoration) + av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf, + cm, 0); + + if (do_cdef) { + if (pbi->num_workers > 1) { + av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker, + pbi->tile_workers, &pbi->cdef_sync, + pbi->num_workers, av1_cdef_init_fb_row_mt, + do_extend_border_mt); + } else { + av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd, + av1_cdef_init_fb_row); + } + } + + superres_post_decode(pbi); + + if (do_loop_restoration) { + av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf, + cm, 1); + if (pbi->num_workers > 1) { + av1_loop_restoration_filter_frame_mt( + (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, + pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, + &pbi->lr_ctxt, do_extend_border_mt); + } else { + av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, + cm, optimized_loop_restoration, + &pbi->lr_ctxt); + } + } + } else { + // In no cdef and no superres case. Provide an optimized version of + // loop_restoration_filter. + if (do_loop_restoration) { + if (pbi->num_workers > 1) { + av1_loop_restoration_filter_frame_mt( + (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, + pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, + &pbi->lr_ctxt, do_extend_border_mt); + } else { + av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, + cm, optimized_loop_restoration, + &pbi->lr_ctxt); + } + } + } + } + + if (!pbi->dcb.corrupted) { + if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + assert(pbi->context_update_tile_id < pbi->allocated_tiles); + *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); + } + } else { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + +#if CONFIG_INSPECTION + if (pbi->inspect_cb != NULL) { + (*pbi->inspect_cb)(pbi, pbi->inspect_ctx); + } +#endif + + // Non frame parallel update frame context here. 
+ if (!tiles->large_scale) { + cm->cur_frame->frame_context = *cm->fc; + } + + if (cm->show_frame && !cm->seq_params->order_hint_info.enable_order_hint) { + ++cm->current_frame.frame_number; + } +} diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h new file mode 100644 index 0000000000..46ae475ff5 --- /dev/null +++ b/third_party/aom/av1/decoder/decodeframe.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODEFRAME_H_ +#define AOM_AV1_DECODER_DECODEFRAME_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Decoder; +struct aom_read_bit_buffer; +struct ThreadData; + +// Reads the middle part of the sequence header OBU (from +// frame_width_bits_minus_1 to enable_restoration) into seq_params. +// Reports errors by calling rb->error_handler() or aom_internal_error(). +void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + SequenceHeader *seq_params); + +void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, + int num_bits_height, int *width, int *height); +BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb); + +// Returns 0 on success. Sets pbi->common.error.error_code and returns -1 on +// failure. +int av1_check_trailing_bits(struct AV1Decoder *pbi, + struct aom_read_bit_buffer *rb); + +// On success, returns the frame header size. On failure, calls +// aom_internal_error and does not return. +uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + int trailing_bits_present); + +void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, int start_tile, + int end_tile, int initialize_flag); + +// Implements the color_config() function in the spec. Reports errors by +// calling rb->error_handler() or aom_internal_error(). +void av1_read_color_config(struct aom_read_bit_buffer *rb, + int allow_lowbitdepth, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info); + +// Implements the timing_info() function in the spec. Reports errors by calling +// rb->error_handler() or aom_internal_error(). +void av1_read_timing_info_header(aom_timing_info_t *timing_info, + struct aom_internal_error_info *error, + struct aom_read_bit_buffer *rb); + +// Implements the decoder_model_info() function in the spec. Reports errors by +// calling rb->error_handler(). +void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info, + struct aom_read_bit_buffer *rb); + +// Implements the operating_parameters_info() function in the spec. Reports +// errors by calling rb->error_handler(). 
+void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params,
+                                 int buffer_delay_length,
+                                 struct aom_read_bit_buffer *rb);
+
+struct aom_read_bit_buffer *av1_init_read_bit_buffer(
+    struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+    const uint8_t *data_end);
+
+void av1_free_mc_tmp_buf(struct ThreadData *thread_data);
+
+void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_DECODER_DECODEFRAME_H_
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
new file mode 100644
index 0000000000..bb0ccf5fd8
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -0,0 +1,1586 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define ACCT_STR __func__
+
+#define DEC_MISMATCH_DEBUG 0
+
+static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
+  return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR);
+}
+
+static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) {
+  const int skip_txfm = xd->mi[0]->skip_txfm;
+  if (cm->features.coded_lossless) return;
+  if (cm->features.allow_intrabc) {
+    assert(cm->cdef_info.cdef_bits == 0);
+    return;
+  }
+
+  // At the start of a superblock, mark that we haven't yet read CDEF strengths
+  // for any of the CDEF units contained in this superblock.
+  const int sb_mask = (cm->seq_params->mib_size - 1);
+  const int mi_row_in_sb = (xd->mi_row & sb_mask);
+  const int mi_col_in_sb = (xd->mi_col & sb_mask);
+  if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
+    xd->cdef_transmitted[0] = xd->cdef_transmitted[1] =
+        xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false;
+  }
+
+  // CDEF unit size is 64x64 irrespective of the superblock size.
+  const int cdef_size = 1 << (6 - MI_SIZE_LOG2);
+
+  // Find index of this CDEF unit in this superblock.
+  const int index_mask = cdef_size;
+  const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
+  const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
+  const int index = (cm->seq_params->sb_size == BLOCK_128X128)
+                        ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
+                        : 0;
+
+  // Read CDEF strength from the first non-skip coding block in this CDEF unit.
+  if (!xd->cdef_transmitted[index] && !skip_txfm) {
+    // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO
+    // of the 1st block in this CDEF unit.
+ const int first_block_mask = ~(cdef_size - 1); + CommonModeInfoParams *const mi_params = &cm->mi_params; + const int grid_idx = + get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask, + xd->mi_col & first_block_mask); + MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx]; + mbmi->cdef_strength = + aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR); + xd->cdef_transmitted[index] = true; + } +} + +static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd, + aom_reader *r, MB_MODE_INFO *const mbmi) { + int sign, abs, reduced_delta_qindex = 0; + BLOCK_SIZE bsize = mbmi->bsize; + const int b_col = xd->mi_col & (cm->seq_params->mib_size - 1); + const int b_row = xd->mi_row & (cm->seq_params->mib_size - 1); + const int read_delta_q_flag = (b_col == 0 && b_row == 0); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) && + read_delta_q_flag) { + abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR); + const int smallval = (abs < DELTA_Q_SMALL); + + if (!smallval) { + const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; + const int thr = (1 << rem_bits) + 1; + abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; + } + + if (abs) { + sign = aom_read_bit(r, ACCT_STR); + } else { + sign = 1; + } + + reduced_delta_qindex = sign ? -abs : abs; + } + return reduced_delta_qindex; +} +static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r, + aom_cdf_prob *const cdf, + const MB_MODE_INFO *const mbmi, int mi_col, + int mi_row) { + int reduced_delta_lflevel = 0; + const BLOCK_SIZE bsize = mbmi->bsize; + const int b_col = mi_col & (cm->seq_params->mib_size - 1); + const int b_row = mi_row & (cm->seq_params->mib_size - 1); + const int read_delta_lf_flag = (b_col == 0 && b_row == 0); + + if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) && + read_delta_lf_flag) { + int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR); + const int smallval = (abs < DELTA_LF_SMALL); + if (!smallval) { + const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; + const int thr = (1 << rem_bits) + 1; + abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; + } + const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1; + reduced_delta_lflevel = sign ? 
-abs : abs; + } + return reduced_delta_lflevel; +} + +static UV_PREDICTION_MODE read_intra_mode_uv(FRAME_CONTEXT *ec_ctx, + aom_reader *r, + CFL_ALLOWED_TYPE cfl_allowed, + PREDICTION_MODE y_mode) { + const UV_PREDICTION_MODE uv_mode = + aom_read_symbol(r, ec_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed, ACCT_STR); + return uv_mode; +} + +static uint8_t read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r, + int8_t *signs_out) { + const int8_t joint_sign = + aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs"); + uint8_t idx = 0; + // Magnitudes are only coded for nonzero values + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + idx = (uint8_t)aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u") + << CFL_ALPHABET_SIZE_LOG2; + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + idx += (uint8_t)aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v"); + } + *signs_out = joint_sign; + return idx; +} + +static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, aom_reader *r, + int size_group) { + const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_symbol( + r, xd->tile_ctx->interintra_mode_cdf[size_group], INTERINTRA_MODES, + ACCT_STR); + return ii_mode; +} + +static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r, + int16_t ctx) { + int16_t mode_ctx = ctx & NEWMV_CTX_MASK; + int is_newmv, is_zeromv, is_refmv; + is_newmv = aom_read_symbol(r, ec_ctx->newmv_cdf[mode_ctx], 2, ACCT_STR) == 0; + if (is_newmv) return NEWMV; + + mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + is_zeromv = + aom_read_symbol(r, ec_ctx->zeromv_cdf[mode_ctx], 2, ACCT_STR) == 0; + if (is_zeromv) return GLOBALMV; + + mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + is_refmv = aom_read_symbol(r, ec_ctx->refmv_cdf[mode_ctx], 2, ACCT_STR) == 0; + if (is_refmv) + return NEARESTMV; + else + return NEARMV; +} + +static void read_drl_idx(FRAME_CONTEXT *ec_ctx, DecoderCodingBlock *dcb, + MB_MODE_INFO *mbmi, aom_reader *r) { + MACROBLOCKD *const xd = &dcb->xd; + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + mbmi->ref_mv_idx = 0; + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (dcb->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx); + int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); + mbmi->ref_mv_idx = idx + drl_idx; + if (!drl_idx) return; + } + } + } + if (have_nearmv_in_inter_mode(mbmi->mode)) { + // Offset the NEARESTMV mode. + // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV + // mode is factored in. 
+ for (int idx = 1; idx < 3; ++idx) { + if (dcb->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx); + int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); + mbmi->ref_mv_idx = idx + drl_idx - 1; + if (!drl_idx) return; + } + } + } +} + +static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd, + MB_MODE_INFO *mbmi, aom_reader *r) { + if (cm->features.switchable_motion_mode == 0) return SIMPLE_TRANSLATION; + if (mbmi->skip_mode) return SIMPLE_TRANSLATION; + + const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mbmi, cm->features.allow_warped_motion); + int motion_mode; + + if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION; + + if (last_motion_mode_allowed == OBMC_CAUSAL) { + motion_mode = + aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->bsize], 2, ACCT_STR); + return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); + } else { + motion_mode = aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->bsize], + MOTION_MODES, ACCT_STR); + return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); + } +} + +static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, aom_reader *r, + int16_t ctx) { + const int mode = + aom_read_symbol(r, xd->tile_ctx->inter_compound_mode_cdf[ctx], + INTER_COMPOUND_MODES, ACCT_STR); + assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode)); + return NEAREST_NEARESTMV + mode; +} + +int av1_neg_deinterleave(int diff, int ref, int max) { + if (!ref) return diff; + if (ref >= (max - 1)) return max - diff - 1; + if (2 * ref < max) { + if (diff <= 2 * ref) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); + } + return diff; + } else { + if (diff <= 2 * (max - ref - 1)) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); + } + return max - (diff + 1); + } +} + +static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd, + aom_reader *r, int skip) { + int cdf_num; + const uint8_t pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num, 0); + if (skip) return pred; + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR); + const int segment_id = + av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1); + + if (segment_id < 0 || segment_id > seg->last_active_segid) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Corrupted segment_ids"); + } + return segment_id; +} + +static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids, + int mi_offset, int x_mis, int y_mis) { + int segment_id = INT_MAX; + + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) + segment_id = AOMMIN( + segment_id, segment_ids[mi_offset + y * cm->mi_params.mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + return segment_id; +} + +static int read_intra_segment_id(AV1_COMMON *const cm, + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, + aom_reader *r, int skip) { + struct segmentation *const seg = &cm->seg; + if (!seg->enabled) return 0; // Default for disabled segmentation + assert(seg->update_map && !seg->temporal_update); + + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const 
int mi_stride = cm->mi_params.mi_cols;
+  const int mi_offset = mi_row * mi_stride + mi_col;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+  const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh);
+  const int segment_id = read_segment_id(cm, xd, r, skip);
+  set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride,
+                 segment_id);
+  return segment_id;
+}
+
+static void copy_segment_id(const CommonModeInfoParams *const mi_params,
+                            const uint8_t *last_segment_ids,
+                            uint8_t *current_segment_ids, int mi_offset,
+                            int x_mis, int y_mis) {
+  const int stride = mi_params->mi_cols;
+  if (last_segment_ids) {
+    assert(last_segment_ids != current_segment_ids);
+    for (int y = 0; y < y_mis; y++) {
+      memcpy(&current_segment_ids[mi_offset + y * stride],
+             &last_segment_ids[mi_offset + y * stride],
+             sizeof(current_segment_ids[0]) * x_mis);
+    }
+  } else {
+    for (int y = 0; y < y_mis; y++) {
+      memset(&current_segment_ids[mi_offset + y * stride], 0,
+             sizeof(current_segment_ids[0]) * x_mis);
+    }
+  }
+}
+
+static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset,
+                                    int x_mis, int y_mis) {
+  return cm->last_frame_seg_map ? dec_get_segment_id(cm, cm->last_frame_seg_map,
+                                                     mi_offset, x_mis, y_mis)
+                                : 0;
+}
+
+static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                                 int preskip, aom_reader *r) {
+  struct segmentation *const seg = &cm->seg;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
+  const int bw = mi_size_wide[mbmi->bsize];
+  const int bh = mi_size_high[mbmi->bsize];
+
+  // TODO(slavarnway): move x_mis, y_mis into xd ?????
+ const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh); + + if (!seg->enabled) return 0; // Default for disabled segmentation + + if (!seg->update_map) { + copy_segment_id(mi_params, cm->last_frame_seg_map, cm->cur_frame->seg_map, + mi_offset, x_mis, y_mis); + return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); + } + + uint8_t segment_id; + const int mi_stride = cm->mi_params.mi_cols; + if (preskip) { + if (!seg->segid_preskip) return 0; + } else { + if (mbmi->skip_txfm) { + if (seg->temporal_update) { + mbmi->seg_id_predicted = 0; + } + segment_id = read_segment_id(cm, xd, r, 1); + set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride, + segment_id); + return segment_id; + } + } + + if (seg->temporal_update) { + const uint8_t ctx = av1_get_pred_context_seg_id(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + struct segmentation_probs *const segp = &ec_ctx->seg; + aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx]; + mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR); + if (mbmi->seg_id_predicted) { + segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); + } else { + segment_id = read_segment_id(cm, xd, r, 0); + } + } else { + segment_id = read_segment_id(cm, xd, r, 0); + } + set_segment_id(cm->cur_frame->seg_map, mi_offset, x_mis, y_mis, mi_stride, + segment_id); + return segment_id; +} + +static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + aom_reader *r) { + if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + + if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return 0; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. + return 0; + } + + const int ctx = av1_get_skip_mode_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int skip_mode = + aom_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_STR); + return skip_mode; +} + +static int read_skip_txfm(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + aom_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int ctx = av1_get_skip_txfm_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int skip_txfm = + aom_read_symbol(r, ec_ctx->skip_txfm_cdfs[ctx], 2, ACCT_STR); + return skip_txfm; + } +} + +// Merge the sorted list of cached colors(cached_colors[0...n_cached_colors-1]) +// and the sorted list of transmitted colors(colors[n_cached_colors...n-1]) into +// one single sorted list(colors[...]). 
+static void merge_colors(uint16_t *colors, uint16_t *cached_colors, + int n_colors, int n_cached_colors) { + if (n_cached_colors == 0) return; + int cache_idx = 0, trans_idx = n_cached_colors; + for (int i = 0; i < n_colors; ++i) { + if (cache_idx < n_cached_colors && + (trans_idx >= n_colors || + cached_colors[cache_idx] <= colors[trans_idx])) { + colors[i] = cached_colors[cache_idx++]; + } else { + assert(trans_idx < n_colors); + colors[i] = colors[trans_idx++]; + } + } +} + +static void read_palette_colors_y(MACROBLOCKD *const xd, int bit_depth, + PALETTE_MODE_INFO *const pmi, aom_reader *r) { + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + uint16_t cached_colors[PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + const int n = pmi->palette_size[0]; + int idx = 0; + for (int i = 0; i < n_cache && idx < n; ++i) + if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i]; + if (idx < n) { + const int n_cached_colors = idx; + pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR); + if (idx < n) { + const int min_bits = bit_depth - 3; + int bits = min_bits + aom_read_literal(r, 2, ACCT_STR); + int range = (1 << bit_depth) - pmi->palette_colors[idx - 1] - 1; + for (; idx < n; ++idx) { + assert(range >= 0); + const int delta = aom_read_literal(r, bits, ACCT_STR) + 1; + pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta, + 0, (1 << bit_depth) - 1); + range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]); + bits = AOMMIN(bits, av1_ceil_log2(range)); + } + } + merge_colors(pmi->palette_colors, cached_colors, n, n_cached_colors); + } else { + memcpy(pmi->palette_colors, cached_colors, n * sizeof(cached_colors[0])); + } +} + +static void read_palette_colors_uv(MACROBLOCKD *const xd, int bit_depth, + PALETTE_MODE_INFO *const pmi, + aom_reader *r) { + const int n = pmi->palette_size[1]; + // U channel colors. + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + uint16_t cached_colors[PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + int idx = 0; + for (int i = 0; i < n_cache && idx < n; ++i) + if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i]; + if (idx < n) { + const int n_cached_colors = idx; + idx += PALETTE_MAX_SIZE; + pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR); + if (idx < PALETTE_MAX_SIZE + n) { + const int min_bits = bit_depth - 3; + int bits = min_bits + aom_read_literal(r, 2, ACCT_STR); + int range = (1 << bit_depth) - pmi->palette_colors[idx - 1]; + for (; idx < PALETTE_MAX_SIZE + n; ++idx) { + assert(range >= 0); + const int delta = aom_read_literal(r, bits, ACCT_STR); + pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta, + 0, (1 << bit_depth) - 1); + range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]); + bits = AOMMIN(bits, av1_ceil_log2(range)); + } + } + merge_colors(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n, + n_cached_colors); + } else { + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, + n * sizeof(cached_colors[0])); + } + + // V channel colors. + if (aom_read_bit(r, ACCT_STR)) { // Delta encoding. 
+ const int min_bits_v = bit_depth - 4; + const int max_val = 1 << bit_depth; + int bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR); + pmi->palette_colors[2 * PALETTE_MAX_SIZE] = + aom_read_literal(r, bit_depth, ACCT_STR); + for (int i = 1; i < n; ++i) { + int delta = aom_read_literal(r, bits, ACCT_STR); + if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta; + int val = (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta; + if (val < 0) val += max_val; + if (val >= max_val) val -= max_val; + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val; + } + } else { + for (int i = 0; i < n; ++i) { + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = + aom_read_literal(r, bit_depth, ACCT_STR); + } + } +} + +static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r) { + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + if (mbmi->mode == DC_PRED) { + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); + const int modev = aom_read_symbol( + r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_mode_ctx], 2, + ACCT_STR); + if (modev) { + pmi->palette_size[0] = + aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], + PALETTE_SIZES, ACCT_STR) + + 2; + read_palette_colors_y(xd, cm->seq_params->bit_depth, pmi, r); + } + } + if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref) { + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + const int modev = aom_read_symbol( + r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR); + if (modev) { + pmi->palette_size[1] = + aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], + PALETTE_SIZES, ACCT_STR) + + 2; + read_palette_colors_uv(xd, cm->seq_params->bit_depth, pmi, r); + } + } +} + +static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) { + const int sym = aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR); + return sym - MAX_ANGLE_DELTA; +} + +static void read_filter_intra_mode_info(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *r) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + FILTER_INTRA_MODE_INFO *filter_intra_mode_info = + &mbmi->filter_intra_mode_info; + + if (av1_filter_intra_allowed(cm, mbmi)) { + filter_intra_mode_info->use_filter_intra = aom_read_symbol( + r, xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2, ACCT_STR); + if (filter_intra_mode_info->use_filter_intra) { + filter_intra_mode_info->filter_intra_mode = aom_read_symbol( + r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR); + } + } else { + filter_intra_mode_info->use_filter_intra = 0; + } +} + +void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, + int blk_col, TX_SIZE tx_size, aom_reader *r) { + MB_MODE_INFO *mbmi = xd->mi[0]; + uint8_t *tx_type = + &xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + *tx_type = DCT_DCT; + + // No need to read transform type if block is skipped. + if (mbmi->skip_txfm || + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + return; + + // No need to read transform type for lossless mode(qindex==0). 
+ const int qindex = xd->qindex[mbmi->segment_id]; + if (qindex == 0) return; + + const int inter_block = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, inter_block, cm->features.reduced_tx_set_used) > + 1) { + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, inter_block, cm->features.reduced_tx_set_used); + const int eset = + get_ext_tx_set(tx_size, inter_block, cm->features.reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and + // there is no need to read the tx_type + assert(eset != 0); + + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + if (inter_block) { + *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( + r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; + } else { + const PREDICTION_MODE intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra + ? fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode] + : mbmi->mode; + *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( + r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode], + av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; + } + } +} + +static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, + nmv_context *ctx, MvSubpelPrecision precision); + +static INLINE int is_mv_valid(const MV *mv); + +static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, + const int_mv *ref_mv, int mi_row, int mi_col, + BLOCK_SIZE bsize, aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, MV_SUBPEL_NONE); + // DV should not have sub-pel. + assert((mv->as_mv.col & 7) == 0); + assert((mv->as_mv.row & 7) == 0); + mv->as_mv.col = (mv->as_mv.col >> 3) * 8; + mv->as_mv.row = (mv->as_mv.row >> 3) * 8; + int valid = is_mv_valid(&mv->as_mv) && + av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize, + cm->seq_params->mib_size_log2); + return valid; +} + +static void read_intrabc_info(AV1_COMMON *const cm, DecoderCodingBlock *dcb, + aom_reader *r) { + MACROBLOCKD *const xd = &dcb->xd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR); + if (mbmi->use_intrabc) { + BLOCK_SIZE bsize = mbmi->bsize; + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + mbmi->motion_mode = SIMPLE_TRANSLATION; + + int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; + int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES]; + + av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, dcb->ref_mv_count, + xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, + inter_mode_ctx); + + int_mv nearestmv, nearmv; + + av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0); + int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; + if (dv_ref.as_int == 0) + av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params->mib_size, xd->mi_row); + // Ref DV should not have sub-pel. 
+ int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0; + dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8; + dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8; + valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row, + xd->mi_col, bsize, r); + if (!valid_dv) { + // Intra bc motion vectors are not valid - signal corrupt frame + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid intrabc dv"); + } + } +} + +// If delta q is present, reads delta_q index. +// Also reads delta_q loop filter levels, if present. +static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r) { + DeltaQInfo *const delta_q_info = &cm->delta_q_info; + + if (delta_q_info->delta_q_present_flag) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + xd->current_base_qindex += + read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res; + /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ + xd->current_base_qindex = clamp(xd->current_base_qindex, 1, MAXQ); + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + if (delta_q_info->delta_lf_present_flag) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int tmp_lvl = + xd->delta_lf[lf_id] + + read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi, + mi_col, mi_row) * + delta_q_info->delta_lf_res; + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + } + } else { + const int tmp_lvl = xd->delta_lf_from_base + + read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf, + mbmi, mi_col, mi_row) * + delta_q_info->delta_lf_res; + mbmi->delta_lf_from_base = xd->delta_lf_from_base = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + } + } + } +} + +static void read_intra_frame_mode_info(AV1_COMMON *const cm, + DecoderCodingBlock *dcb, aom_reader *r) { + MACROBLOCKD *const xd = &dcb->xd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const BLOCK_SIZE bsize = mbmi->bsize; + struct segmentation *const seg = &cm->seg; + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (seg->segid_preskip) + mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0); + + mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r); + + if (!seg->segid_preskip) + mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip_txfm); + + read_cdef(cm, r, xd); + + read_delta_q_params(cm, xd, r); + + mbmi->current_qindex = xd->current_base_qindex; + + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + if (av1_allow_intrabc(cm)) { + read_intrabc_info(cm, dcb, r); + if (is_intrabc_block(mbmi)) return; + } + + mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi)); + + const int use_angle_delta = av1_use_angle_delta(bsize); + mbmi->angle_delta[PLANE_TYPE_Y] = + (use_angle_delta && av1_is_directional_mode(mbmi->mode)) + ? 
read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) + : 0; + + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { + mbmi->uv_mode = + read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); + if (mbmi->uv_mode == UV_CFL_PRED) { + mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs); + } + const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode); + mbmi->angle_delta[PLANE_TYPE_UV] = + (use_angle_delta && av1_is_directional_mode(intra_mode)) + ? read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED]) + : 0; + } else { + // Avoid decoding angle_info if there is no chroma prediction + mbmi->uv_mode = UV_DC_PRED; + } + xd->cfl.store_y = store_cfl_required(cm, xd); + + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) + read_palette_mode_info(cm, xd, r); + + read_filter_intra_mode_info(cm, xd, r); +} + +static int read_mv_component(aom_reader *r, nmv_component *mvcomp, + int use_subpel, int usehp) { + int mag, d, fr, hp; + const int sign = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR); + const int mv_class = + aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR); + const int class0 = mv_class == MV_CLASS_0; + + // Integer part + if (class0) { + d = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR); + mag = 0; + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + d = 0; + for (int i = 0; i < n; ++i) + d |= aom_read_symbol(r, mvcomp->bits_cdf[i], 2, ACCT_STR) << i; + mag = CLASS0_SIZE << (mv_class + 2); + } + + if (use_subpel) { + // Fractional part + fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, + MV_FP_SIZE, ACCT_STR); + + // High precision part (if hp is not used, the default value of the hp is 1) + hp = usehp ? aom_read_symbol( + r, class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2, + ACCT_STR) + : 1; + } else { + fr = 3; + hp = 1; + } + + // Result + mag += ((d << 3) | (fr << 1) | hp) + 1; + return sign ? 
-mag : mag; +} + +static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, + nmv_context *ctx, MvSubpelPrecision precision) { + MV diff = kZeroMv; + const MV_JOINT_TYPE joint_type = + (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR); + + if (mv_joint_vertical(joint_type)) + diff.row = read_mv_component(r, &ctx->comps[0], precision > MV_SUBPEL_NONE, + precision > MV_SUBPEL_LOW_PRECISION); + + if (mv_joint_horizontal(joint_type)) + diff.col = read_mv_component(r, &ctx->comps[1], precision > MV_SUBPEL_NONE, + precision > MV_SUBPEL_LOW_PRECISION); + + mv->row = ref->row + diff.row; + mv->col = ref->col + diff.col; +} + +static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm, + const MACROBLOCKD *xd, + aom_reader *r) { + if (!is_comp_ref_allowed(xd->mi[0]->bsize)) return SINGLE_REFERENCE; + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + const int ctx = av1_get_reference_mode_context(xd); + const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol( + r, xd->tile_ctx->comp_inter_cdf[ctx], 2, ACCT_STR); + return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE + } else { + assert(cm->current_frame.reference_mode == SINGLE_REFERENCE); + return cm->current_frame.reference_mode; + } +} + +#define READ_REF_BIT(pname) \ + aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR) + +static COMP_REFERENCE_TYPE read_comp_reference_type(const MACROBLOCKD *xd, + aom_reader *r) { + const int ctx = av1_get_comp_reference_type_context(xd); + const COMP_REFERENCE_TYPE comp_ref_type = + (COMP_REFERENCE_TYPE)aom_read_symbol( + r, xd->tile_ctx->comp_ref_type_cdf[ctx], 2, ACCT_STR); + return comp_ref_type; // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE +} + +static void set_ref_frames_for_skip_mode(AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame[2]) { + ref_frame[0] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_0; + ref_frame[1] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_1; +} + +// Read the referncence frame +static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r, int segment_id, + MV_REFERENCE_FRAME ref_frame[2]) { + if (xd->mi[0]->skip_mode) { + set_ref_frames_for_skip_mode(cm, ref_frame); + return; + } + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); + ref_frame[1] = NONE_FRAME; + } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = NONE_FRAME; + } else { + const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); + + if (mode == COMPOUND_REFERENCE) { + const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r); + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = READ_REF_BIT(uni_comp_ref_p); + if (bit) { + ref_frame[0] = BWDREF_FRAME; + ref_frame[1] = ALTREF_FRAME; + } else { + const int bit1 = READ_REF_BIT(uni_comp_ref_p1); + if (bit1) { + const int bit2 = READ_REF_BIT(uni_comp_ref_p2); + if (bit2) { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = GOLDEN_FRAME; + } else { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = LAST3_FRAME; + } + } else { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = LAST2_FRAME; + } + } + + return; + } + + assert(comp_ref_type == BIDIR_COMP_REFERENCE); + + const int idx = 1; + const int bit = READ_REF_BIT(comp_ref_p); + // Decode forward references. 
+ if (!bit) { + const int bit1 = READ_REF_BIT(comp_ref_p1); + ref_frame[!idx] = bit1 ? LAST2_FRAME : LAST_FRAME; + } else { + const int bit2 = READ_REF_BIT(comp_ref_p2); + ref_frame[!idx] = bit2 ? GOLDEN_FRAME : LAST3_FRAME; + } + + // Decode backward references. + const int bit_bwd = READ_REF_BIT(comp_bwdref_p); + if (!bit_bwd) { + const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1); + ref_frame[idx] = bit1_bwd ? ALTREF2_FRAME : BWDREF_FRAME; + } else { + ref_frame[idx] = ALTREF_FRAME; + } + } else if (mode == SINGLE_REFERENCE) { + const int bit0 = READ_REF_BIT(single_ref_p1); + if (bit0) { + const int bit1 = READ_REF_BIT(single_ref_p2); + if (!bit1) { + const int bit5 = READ_REF_BIT(single_ref_p6); + ref_frame[0] = bit5 ? ALTREF2_FRAME : BWDREF_FRAME; + } else { + ref_frame[0] = ALTREF_FRAME; + } + } else { + const int bit2 = READ_REF_BIT(single_ref_p3); + if (bit2) { + const int bit4 = READ_REF_BIT(single_ref_p5); + ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME; + } else { + const int bit3 = READ_REF_BIT(single_ref_p4); + ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME; + } + } + + ref_frame[1] = NONE_FRAME; + } else { + assert(0 && "Invalid prediction mode."); + } + } +} + +static INLINE void read_mb_interp_filter(const MACROBLOCKD *const xd, + InterpFilter interp_filter, + bool enable_dual_filter, + MB_MODE_INFO *const mbmi, + aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!av1_is_interp_needed(xd)) { + set_default_interp_filters(mbmi, interp_filter); + return; + } + + if (interp_filter != SWITCHABLE) { + mbmi->interp_filters = av1_broadcast_interp_filter(interp_filter); + } else { + InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }; + for (int dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + ref0_filter[dir] = (InterpFilter)aom_read_symbol( + r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR); + if (!enable_dual_filter) { + ref0_filter[1] = ref0_filter[0]; + break; + } + } + // The index system works as: (0, 1) -> (vertical, horizontal) filter types + mbmi->interp_filters.as_filters.x_filter = ref0_filter[1]; + mbmi->interp_filters.as_filters.y_filter = ref0_filter[0]; + } +} + +static void read_intra_block_mode_info(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + MB_MODE_INFO *const mbmi, + aom_reader *r) { + const BLOCK_SIZE bsize = mbmi->bsize; + const int use_angle_delta = av1_use_angle_delta(bsize); + + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]); + + mbmi->angle_delta[PLANE_TYPE_Y] = + use_angle_delta && av1_is_directional_mode(mbmi->mode) + ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) + : 0; + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { + mbmi->uv_mode = + read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); + if (mbmi->uv_mode == UV_CFL_PRED) { + mbmi->cfl_alpha_idx = + read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs); + } + const PREDICTION_MODE intra_mode = get_uv_mode(mbmi->uv_mode); + mbmi->angle_delta[PLANE_TYPE_UV] = + use_angle_delta && av1_is_directional_mode(intra_mode) + ? 
read_angle_delta(r, ec_ctx->angle_delta_cdf[intra_mode - V_PRED]) + : 0; + } else { + // Avoid decoding angle_info if there is no chroma prediction + mbmi->uv_mode = UV_DC_PRED; + } + xd->cfl.store_y = store_cfl_required(cm, xd); + + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) + read_palette_mode_info(cm, xd, r); + + read_filter_intra_mode_info(cm, xd, r); +} + +static INLINE int is_mv_valid(const MV *mv) { + return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW && + mv->col < MV_UPP; +} + +static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd, + PREDICTION_MODE mode, + MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2], + int_mv ref_mv[2], int_mv nearest_mv[2], + int_mv near_mv[2], int is_compound, int allow_hp, + aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + MB_MODE_INFO *mbmi = xd->mi[0]; + BLOCK_SIZE bsize = mbmi->bsize; + FeatureFlags *const features = &cm->features; + if (features->cur_frame_force_integer_mv) { + allow_hp = MV_SUBPEL_NONE; + } + switch (mode) { + case NEWMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); + break; + } + case NEARESTMV: { + mv[0].as_int = nearest_mv[0].as_int; + break; + } + case NEARMV: { + mv[0].as_int = near_mv[0].as_int; + break; + } + case GLOBALMV: { + mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], + features->allow_high_precision_mv, + bsize, xd->mi_col, xd->mi_row, + features->cur_frame_force_integer_mv) + .as_int; + break; + } + case NEW_NEWMV: { + assert(is_compound); + for (int i = 0; i < 2; ++i) { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, allow_hp); + } + break; + } + case NEAREST_NEARESTMV: { + assert(is_compound); + mv[0].as_int = nearest_mv[0].as_int; + mv[1].as_int = nearest_mv[1].as_int; + break; + } + case NEAR_NEARMV: { + assert(is_compound); + mv[0].as_int = near_mv[0].as_int; + mv[1].as_int = near_mv[1].as_int; + break; + } + case NEW_NEARESTMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); + assert(is_compound); + mv[1].as_int = nearest_mv[1].as_int; + break; + } + case NEAREST_NEWMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + mv[0].as_int = nearest_mv[0].as_int; + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); + assert(is_compound); + break; + } + case NEAR_NEWMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + mv[0].as_int = near_mv[0].as_int; + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); + assert(is_compound); + break; + } + case NEW_NEARMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); + assert(is_compound); + mv[1].as_int = near_mv[1].as_int; + break; + } + case GLOBAL_GLOBALMV: { + assert(is_compound); + mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], + features->allow_high_precision_mv, + bsize, xd->mi_col, xd->mi_row, + features->cur_frame_force_integer_mv) + .as_int; + mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]], + features->allow_high_precision_mv, + bsize, xd->mi_col, xd->mi_row, + features->cur_frame_force_integer_mv) + .as_int; + break; + } + default: { + return 0; + } + } + + int ret = is_mv_valid(&mv[0].as_mv); + if (is_compound) { + ret = ret && is_mv_valid(&mv[1].as_mv); + } + return ret; +} + +static int read_is_inter_block(AV1_COMMON *const 
cm, MACROBLOCKD *const xd, + int segment_id, aom_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + const int frame = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (frame < LAST_FRAME) return 0; + return frame != INTRA_FRAME; + } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + return 1; + } + const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int is_inter = + aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR); + return is_inter; +} + +#if DEC_MISMATCH_DEBUG +static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, int16_t mode_ctx) { + int_mv mv[2] = { { 0 } }; + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) + mv[ref].as_mv = mbmi->mv[ref].as_mv; + + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + int16_t zeromv_ctx = -1; + int16_t refmv_ctx = -1; + if (mbmi->mode != NEWMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) + refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + } + +#define FRAME_TO_CHECK 11 + if (cm->current_frame.frame_number == FRAME_TO_CHECK && cm->show_frame == 1) { + printf( + "=== DECODER ===: " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " + "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode, + mbmi->mode, mbmi->sb_type, cm->show_frame, mv[0].as_mv.row, + mv[0].as_mv.col, mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx, + refmv_ctx, mbmi->tx_size); + } +} +#endif // DEC_MISMATCH_DEBUG + +static void read_inter_block_mode_info(AV1Decoder *const pbi, + DecoderCodingBlock *dcb, + MB_MODE_INFO *const mbmi, + aom_reader *r) { + AV1_COMMON *const cm = &pbi->common; + FeatureFlags *const features = &cm->features; + const BLOCK_SIZE bsize = mbmi->bsize; + const int allow_hp = features->allow_high_precision_mv; + int_mv nearestmv[2], nearmv[2]; + int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } }; + int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + MACROBLOCKD *const xd = &dcb->xd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + mbmi->uv_mode = UV_DC_PRED; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + + av1_collect_neighbors_ref_counts(xd); + + read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); + const int is_compound = has_second_ref(mbmi); + + const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame); + av1_find_mv_refs(cm, xd, mbmi, ref_frame, dcb->ref_mv_count, xd->ref_mv_stack, + xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx); + + mbmi->ref_mv_idx = 0; + + if (mbmi->skip_mode) { + assert(is_compound); + mbmi->mode = NEAREST_NEARESTMV; + } else { + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) { + mbmi->mode = GLOBALMV; + } else { + const int mode_ctx = + av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame); + if (is_compound) + mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx); + else + mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx); + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV || + 
have_nearmv_in_inter_mode(mbmi->mode)) + read_drl_idx(ec_ctx, dcb, mbmi, r); + } + } + + if (is_compound != is_inter_compound_mode(mbmi->mode)) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Prediction mode %d invalid with ref frame %d %d", + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } + + if (!is_compound && mbmi->mode != GLOBALMV) { + av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0], + &nearmv[0], features->cur_frame_force_integer_mv); + } + + if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) { + const int ref_mv_idx = mbmi->ref_mv_idx + 1; + nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv; + nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv; + nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; + nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; + lower_mv_precision(&nearestmv[0].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + lower_mv_precision(&nearestmv[1].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + lower_mv_precision(&nearmv[0].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + lower_mv_precision(&nearmv[1].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) { + nearmv[0] = + xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv; + } + + int_mv ref_mv[2] = { nearestmv[0], nearestmv[1] }; + + if (is_compound) { + int ref_mv_idx = mbmi->ref_mv_idx; + // Special case: NEAR_NEWMV and NEW_NEARMV modes use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) + ref_mv_idx = 1 + mbmi->ref_mv_idx; + + // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here? 
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) + ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; + + if (compound_ref1_mode(mbmi->mode) == NEWMV) + ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; + } else { + if (mbmi->mode == NEWMV) { + if (dcb->ref_mv_count[ref_frame] > 1) + ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv; + } + } + + if (mbmi->skip_mode) assert(mbmi->mode == NEAREST_NEARESTMV); + + const int mv_corrupted_flag = + !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv, + nearestmv, nearmv, is_compound, allow_hp, r); + aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag); + + mbmi->use_wedge_interintra = 0; + if (cm->seq_params->enable_interintra_compound && !mbmi->skip_mode && + is_interintra_allowed(mbmi)) { + const int bsize_group = size_group_lookup[bsize]; + const int interintra = + aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR); + assert(mbmi->ref_frame[1] == NONE_FRAME); + if (interintra) { + const INTERINTRA_MODE interintra_mode = + read_interintra_mode(xd, r, bsize_group); + mbmi->ref_frame[1] = INTRA_FRAME; + mbmi->interintra_mode = interintra_mode; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + if (av1_is_wedge_used(bsize)) { + mbmi->use_wedge_interintra = aom_read_symbol( + r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR); + if (mbmi->use_wedge_interintra) { + mbmi->interintra_wedge_index = (int8_t)aom_read_symbol( + r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR); + } + } + } + } + + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame); + } + + mbmi->motion_mode = SIMPLE_TRANSLATION; + if (is_motion_variation_allowed_bsize(mbmi->bsize) && !mbmi->skip_mode && + !has_second_ref(mbmi)) { + mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref); + } + av1_count_overlappable_neighbors(cm, xd); + + if (mbmi->ref_frame[1] != INTRA_FRAME) + mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r); + + // init + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + + if (has_second_ref(mbmi) && !mbmi->skip_mode) { + // Read idx to indicate current compound inter prediction mode group + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params->enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + mbmi->comp_group_idx = (uint8_t)aom_read_symbol( + r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR); + } + + if (mbmi->comp_group_idx == 0) { + if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + mbmi->compound_idx = (uint8_t)aom_read_symbol( + r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR); + mbmi->interinter_comp.type = + mbmi->compound_idx ? 
COMPOUND_AVERAGE : COMPOUND_DISTWTD; + } else { + // Distance-weighted compound is disabled, so always use average + mbmi->compound_idx = 1; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + } + } else { + assert(cm->current_frame.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + + // compound_diffwtd, wedge + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { + mbmi->interinter_comp.type = + COMPOUND_WEDGE + aom_read_symbol(r, + ec_ctx->compound_type_cdf[bsize], + MASKED_COMPOUND_TYPES, ACCT_STR); + } else { + mbmi->interinter_comp.type = COMPOUND_DIFFWTD; + } + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + mbmi->interinter_comp.wedge_index = (int8_t)aom_read_symbol( + r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR); + mbmi->interinter_comp.wedge_sign = (int8_t)aom_read_bit(r, ACCT_STR); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + mbmi->interinter_comp.mask_type = + aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR); + } + } + } + + read_mb_interp_filter(xd, features->interp_filter, + cm->seq_params->enable_dual_filter, mbmi, r); + + if (mbmi->motion_mode == WARPED_CAUSAL) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; + mbmi->wm_params.invalid = 0; + + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); + } + + if (av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params, mi_row, mi_col)) { +#if WARPED_MOTION_DEBUG + printf("Warning: unexpected warped model from aomenc\n"); +#endif + mbmi->wm_params.invalid = 1; + } + } + + xd->cfl.store_y = store_cfl_required(cm, xd); + +#if DEC_MISMATCH_DEBUG + dec_dump_logs(cm, mi, mi_row, mi_col, mode_ctx); +#endif // DEC_MISMATCH_DEBUG +} + +static void read_inter_frame_mode_info(AV1Decoder *const pbi, + DecoderCodingBlock *dcb, aom_reader *r) { + AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &dcb->xd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int inter_block = 1; + + mbmi->mv[0].as_int = 0; + mbmi->mv[1].as_int = 0; + mbmi->segment_id = read_inter_segment_id(cm, xd, 1, r); + + mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r); + + if (mbmi->skip_mode) + mbmi->skip_txfm = 1; + else + mbmi->skip_txfm = read_skip_txfm(cm, xd, mbmi->segment_id, r); + + if (!cm->seg.segid_preskip) + mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r); + + read_cdef(cm, r, xd); + + read_delta_q_params(cm, xd, r); + + if (!mbmi->skip_mode) + inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); + + mbmi->current_qindex = xd->current_base_qindex; + + xd->above_txfm_context = + cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK); + + if (inter_block) + read_inter_block_mode_info(pbi, dcb, mbmi, r); + else + read_intra_block_mode_info(cm, xd, mbmi, r); +} + +static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col, + int x_mis, int y_mis) { + const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); + MV_REF *frame_mvs = + cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); + x_mis = ROUND_POWER_OF_TWO(x_mis, 1); + y_mis = 
ROUND_POWER_OF_TWO(y_mis, 1); + + for (int h = 0; h < y_mis; h++) { + MV_REF *mv = frame_mvs; + for (int w = 0; w < x_mis; w++) { + mv->ref_frame = NONE_FRAME; + mv++; + } + frame_mvs += frame_mvs_stride; + } +} + +void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, + aom_reader *r, int x_mis, int y_mis) { + AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &dcb->xd; + MB_MODE_INFO *const mi = xd->mi[0]; + mi->use_intrabc = 0; + + if (frame_is_intra_only(cm)) { + read_intra_frame_mode_info(cm, dcb, r); + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) + intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis); + } else { + read_inter_frame_mode_info(pbi, dcb, r); + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) + av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis); + } +} diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h new file mode 100644 index 0000000000..3d8629c9a5 --- /dev/null +++ b/third_party/aom/av1/decoder/decodemv.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODEMV_H_ +#define AOM_AV1_DECODER_DECODEMV_H_ + +#include "aom_dsp/bitreader.h" + +#include "av1/decoder/decoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, + aom_reader *r, int x_mis, int y_mis); + +#ifdef __cplusplus +} // extern "C" +#endif + +void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, + int blk_col, TX_SIZE tx_size, aom_reader *r); + +#endif // AOM_AV1_DECODER_DECODEMV_H_ diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c new file mode 100644 index 0000000000..32e94840be --- /dev/null +++ b/third_party/aom/av1/decoder/decoder.c @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_scale/aom_scale.h" +#include "aom_util/aom_thread.h" + +#include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" + +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/detokenize.h" +#include "av1/decoder/obu.h" + +static void initialize_dec(void) { + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_wedge_masks(); +} + +static void dec_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height, BLOCK_SIZE min_partition_size) { + (void)min_partition_size; + // Ensure that the decoded width and height are both multiples of + // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if + // subsampling is used). + // This simplifies the implementation of various experiments, + // eg. cdef, which operates on units of 8x8 luma pixels. + const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); + const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); + + mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2; + mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2; + mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); + + mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2); + mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2); + mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols; + + mi_params->mi_alloc_bsize = BLOCK_4X4; + mi_params->mi_alloc_stride = mi_params->mi_stride; + + assert(mi_size_wide[mi_params->mi_alloc_bsize] == + mi_size_high[mi_params->mi_alloc_bsize]); +} + +static void dec_setup_mi(CommonModeInfoParams *mi_params) { + const int mi_grid_size = + mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); + memset(mi_params->mi_grid_base, 0, + mi_grid_size * sizeof(*mi_params->mi_grid_base)); +} + +static void dec_free_mi(CommonModeInfoParams *mi_params) { + aom_free(mi_params->mi_alloc); + mi_params->mi_alloc = NULL; + mi_params->mi_alloc_size = 0; + aom_free(mi_params->mi_grid_base); + mi_params->mi_grid_base = NULL; + mi_params->mi_grid_size = 0; + aom_free(mi_params->tx_type_map); + mi_params->tx_type_map = NULL; +} + +AV1Decoder *av1_decoder_create(BufferPool *const pool) { + AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi)); + if (!pbi) return NULL; + av1_zero(*pbi); + + AV1_COMMON *volatile const cm = &pbi->common; + cm->seq_params = &pbi->seq_params; + cm->error = &pbi->error; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(pbi->error.jmp)) { + pbi->error.setjmp = 0; + av1_decoder_remove(pbi); + return NULL; + } + + pbi->error.setjmp = 1; + + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); + CHECK_MEM_ERROR( + cm, cm->default_frame_context, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); + memset(cm->fc, 0, sizeof(*cm->fc)); + memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); + + pbi->need_resync = 1; + initialize_dec(); + + // Initialize the references to not point to any frame buffers. 
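+  // They are repointed at decoded frames later, driven by each frame's
+  // refresh_frame_flags (see update_frame_buffers()).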
+ for (int i = 0; i < REF_FRAMES; i++) { + cm->ref_frame_map[i] = NULL; + } + + cm->current_frame.frame_number = 0; + pbi->decoding_first_frame = 1; + pbi->common.buffer_pool = pool; + + cm->seq_params->bit_depth = AOM_BITS_8; + + cm->mi_params.free_mi = dec_free_mi; + cm->mi_params.setup_mi = dec_setup_mi; + cm->mi_params.set_mb_mi = dec_set_mb_mi; + + av1_loop_filter_init(cm); + + av1_qm_init(&cm->quant_params, av1_num_planes(cm)); + av1_loop_restoration_precal(); + +#if CONFIG_ACCOUNTING + pbi->acct_enabled = 1; + aom_accounting_init(&pbi->accounting); +#endif + + pbi->error.setjmp = 0; + + aom_get_worker_interface()->init(&pbi->lf_worker); + pbi->lf_worker.thread_name = "aom lf worker"; + + return pbi; +} + +void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) { + if (tile_mt_info != NULL) { +#if CONFIG_MULTITHREAD + if (tile_mt_info->job_mutex != NULL) { + pthread_mutex_destroy(tile_mt_info->job_mutex); + aom_free(tile_mt_info->job_mutex); + } +#endif + aom_free(tile_mt_info->job_queue); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*tile_mt_info); + } +} + +void av1_dec_free_cb_buf(AV1Decoder *pbi) { + aom_free(pbi->cb_buffer_base); + pbi->cb_buffer_base = NULL; + pbi->cb_buffer_alloc_size = 0; +} + +void av1_decoder_remove(AV1Decoder *pbi) { + int i; + + if (!pbi) return; + + // Free the tile list output buffer. + aom_free_frame_buffer(&pbi->tile_list_outbuf); + + aom_get_worker_interface()->end(&pbi->lf_worker); + aom_free(pbi->lf_worker.data1); + + if (pbi->thread_data) { + for (int worker_idx = 1; worker_idx < pbi->num_workers; worker_idx++) { + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + if (thread_data->td != NULL) { + av1_free_mc_tmp_buf(thread_data->td); + aom_free(thread_data->td); + } + } + aom_free(pbi->thread_data); + } + aom_free(pbi->dcb.xd.seg_mask); + + for (i = 0; i < pbi->num_workers; ++i) { + AVxWorker *const worker = &pbi->tile_workers[i]; + aom_get_worker_interface()->end(worker); + } +#if CONFIG_MULTITHREAD + if (pbi->row_mt_mutex_ != NULL) { + pthread_mutex_destroy(pbi->row_mt_mutex_); + aom_free(pbi->row_mt_mutex_); + } + if (pbi->row_mt_cond_ != NULL) { + pthread_cond_destroy(pbi->row_mt_cond_); + aom_free(pbi->row_mt_cond_); + } +#endif + for (i = 0; i < pbi->allocated_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + } + aom_free(pbi->tile_data); + aom_free(pbi->tile_workers); + + if (pbi->num_workers > 0) { + av1_loop_filter_dealloc(&pbi->lf_row_sync); + av1_loop_restoration_dealloc(&pbi->lr_row_sync); + av1_dealloc_dec_jobs(&pbi->tile_mt_info); + } + + av1_dec_free_cb_buf(pbi); +#if CONFIG_ACCOUNTING + aom_accounting_clear(&pbi->accounting); +#endif + av1_free_mc_tmp_buf(&pbi->td); + aom_img_metadata_array_free(pbi->metadata); + av1_remove_common(&pbi->common); + aom_free(pbi); +} + +void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, + aom_reader *r, palette_visitor_fn_t visit) { + if (!is_inter_block(xd->mi[0])) { + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common)); + ++plane) { + if (plane == 0 || xd->is_chroma_ref) { + if (xd->mi[0]->palette_mode_info.palette_size[plane]) + visit(xd, plane, r); + } else { + assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0); + } + } + } +} + +static int equal_dimensions(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == 
b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width; +} + +aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx, + YV12_BUFFER_CONFIG *sd) { + AV1_COMMON *cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + + const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx); + if (cfg == NULL) { + aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "No reference frame"); + return AOM_CODEC_ERROR; + } + if (!equal_dimensions(cfg, sd)) + aom_internal_error(&pbi->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(cfg, sd, num_planes); + + return pbi->error.error_code; +} + +static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width && + a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && + a->border == b->border && + (a->flags & YV12_FLAG_HIGHBITDEPTH) == + (b->flags & YV12_FLAG_HIGHBITDEPTH); +} + +aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, + int use_external_ref, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + YV12_BUFFER_CONFIG *ref_buf = NULL; + + // Get the destination reference buffer. + ref_buf = get_ref_frame(cm, idx); + + if (ref_buf == NULL) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, "No reference frame"); + return AOM_CODEC_ERROR; + } + + if (!use_external_ref) { + if (!equal_dimensions(ref_buf, sd)) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + // Overwrite the reference frame buffer. + aom_yv12_copy_frame(sd, ref_buf, num_planes); + } + } else { + if (!equal_dimensions_and_border(ref_buf, sd)) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + // Overwrite the reference frame buffer pointers. + // Once we no longer need the external reference buffer, these pointers + // are restored. + ref_buf->store_buf_adr[0] = ref_buf->y_buffer; + ref_buf->store_buf_adr[1] = ref_buf->u_buffer; + ref_buf->store_buf_adr[2] = ref_buf->v_buffer; + ref_buf->y_buffer = sd->y_buffer; + ref_buf->u_buffer = sd->u_buffer; + ref_buf->v_buffer = sd->v_buffer; + ref_buf->use_external_reference_buffers = 1; + } + } + + return cm->error->error_code; +} + +aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error->error_code; +} + +static void release_current_frame(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + cm->cur_frame->buf.corrupted = 1; + lock_buffer_pool(pool); + decrease_ref_count(cm->cur_frame, pool); + unlock_buffer_pool(pool); + cm->cur_frame = NULL; +} + +// If any buffer updating is signaled it should be done here. +// Consumes a reference to cm->cur_frame. +// +// This functions returns void. It reports failure by setting +// pbi->error.error_code. 
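+// Each set bit i in cm->current_frame.refresh_frame_flags makes slot i of
+// cm->ref_frame_map point at cm->cur_frame (releasing the previous entry and
+// bumping cur_frame's ref_count), so one decoded frame can refresh several
+// reference slots at once.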
+static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) { + int ref_index = 0, mask; + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + if (frame_decoded) { + lock_buffer_pool(pool); + + // In ext-tile decoding, the camera frame header is only decoded once. So, + // we don't update the references here. + if (!pbi->camera_frame_header_ready) { + // The following for loop needs to release the reference stored in + // cm->ref_frame_map[ref_index] before storing a reference to + // cm->cur_frame in cm->ref_frame_map[ref_index]. + for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) { + if (mask & 1) { + decrease_ref_count(cm->ref_frame_map[ref_index], pool); + cm->ref_frame_map[ref_index] = cm->cur_frame; + ++cm->cur_frame->ref_count; + } + ++ref_index; + } + } + + if (cm->show_existing_frame || cm->show_frame) { + if (pbi->output_all_layers) { + // Append this frame to the output queue + if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) { + // We can't store the new frame anywhere, so drop it and return an + // error + cm->cur_frame->buf.corrupted = 1; + decrease_ref_count(cm->cur_frame, pool); + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + } else { + pbi->output_frames[pbi->num_output_frames] = cm->cur_frame; + pbi->num_output_frames++; + } + } else { + // Replace any existing output frame + assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1); + if (pbi->num_output_frames > 0) { + decrease_ref_count(pbi->output_frames[0], pool); + } + pbi->output_frames[0] = cm->cur_frame; + pbi->num_output_frames = 1; + } + } else { + decrease_ref_count(cm->cur_frame, pool); + } + + unlock_buffer_pool(pool); + } else { + // Nothing was decoded, so just drop this frame buffer + lock_buffer_pool(pool); + decrease_ref_count(cm->cur_frame, pool); + unlock_buffer_pool(pool); + } + cm->cur_frame = NULL; + + if (!pbi->camera_frame_header_ready) { + // Invalidate these references until the next frame starts. + for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) { + cm->remapped_ref_idx[ref_index] = INVALID_IDX; + } + } +} + +int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, + const uint8_t **psource) { + AV1_COMMON *volatile const cm = &pbi->common; + const uint8_t *source = *psource; + pbi->error.error_code = AOM_CODEC_OK; + pbi->error.has_detail = 0; + + if (size == 0) { + // This is used to signal that we are missing frames. + // We do not know if the missing frame(s) was supposed to update + // any of the reference buffers, but we act conservative and + // mark only the last buffer as corrupted. + // + // TODO(jkoleszar): Error concealment is undefined and non-normative + // at this point, but if it becomes so, [0] may not always be the correct + // thing to do here. + RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME); + if (ref_buf != NULL) ref_buf->buf.corrupted = 1; + } + + if (assign_cur_frame_new_fb(cm) == NULL) { + pbi->error.error_code = AOM_CODEC_MEM_ERROR; + return 1; + } + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(pbi->error.jmp)) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int i; + + pbi->error.setjmp = 0; + + // Synchronize all threads immediately as a subsequent decode call may + // cause a resize invalidating some allocations. 
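+    // Only after the workers have been drained is it safe to drop the
+    // reference to the current frame (release_current_frame() below).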
+ winterface->sync(&pbi->lf_worker); + for (i = 0; i < pbi->num_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } + + release_current_frame(pbi); + return -1; + } + + pbi->error.setjmp = 1; + + int frame_decoded = + aom_decode_frame_from_obus(pbi, source, source + size, psource); + + if (frame_decoded < 0) { + assert(pbi->error.error_code != AOM_CODEC_OK); + release_current_frame(pbi); + pbi->error.setjmp = 0; + return 1; + } + +#if TXCOEFF_TIMER + cm->cum_txcoeff_timer += cm->txcoeff_timer; + fprintf(stderr, + "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n", + cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer); + cm->txcoeff_timer = 0; + cm->txb_count = 0; +#endif + + // Note: At this point, this function holds a reference to cm->cur_frame + // in the buffer pool. This reference is consumed by update_frame_buffers(). + update_frame_buffers(pbi, frame_decoded); + + if (frame_decoded) { + pbi->decoding_first_frame = 0; + } + + if (pbi->error.error_code != AOM_CODEC_OK) { + pbi->error.setjmp = 0; + return 1; + } + + if (!cm->show_existing_frame) { + if (cm->seg.enabled) { + if (cm->prev_frame && + (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) && + (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) { + cm->last_frame_seg_map = cm->prev_frame->seg_map; + } else { + cm->last_frame_seg_map = NULL; + } + } + } + + // Update progress in frame parallel decode. + pbi->error.setjmp = 0; + + return 0; +} + +// Get the frame at a particular index in the output queue +int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, + aom_film_grain_t **grain_params) { + if (index >= pbi->num_output_frames) return -1; + *sd = &pbi->output_frames[index]->buf; + *grain_params = &pbi->output_frames[index]->film_grain_params; + return 0; +} + +// Get the highest-spatial-layer output +// TODO(rachelbarker): What should this do? +int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) { + if (pbi->num_output_frames == 0) return -1; + + *frame = pbi->output_frames[pbi->num_output_frames - 1]->buf; + return 0; +} diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h new file mode 100644 index 0000000000..560b1d9f24 --- /dev/null +++ b/third_party/aom/av1/decoder/decoder.h @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODER_H_ +#define AOM_AV1_DECODER_DECODER_H_ + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/bitreader.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_thread.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/thread_common.h" +#include "av1/decoder/dthread.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif +#if CONFIG_INSPECTION +#include "av1/decoder/inspection.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief Contains coding block data required by the decoder. + * + * This includes: + * - Coding block info that is common between encoder and decoder. 
+ * - Other coding block info only needed by the decoder. + * Contrast this with a similar struct MACROBLOCK on encoder side. + * This data is also common between ThreadData and AV1Decoder structs. + */ +typedef struct DecoderCodingBlock { + /*! + * Coding block info that is common between encoder and decoder. + */ + DECLARE_ALIGNED(32, MACROBLOCKD, xd); + /*! + * True if the at least one of the coding blocks decoded was corrupted. + */ + int corrupted; + /*! + * Pointer to 'mc_buf' inside 'pbi->td' (single-threaded decoding) or + * 'pbi->thread_data[i].td' (multi-threaded decoding). + */ + uint8_t *mc_buf[2]; + /*! + * Pointer to 'dqcoeff' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base' + * with appropriate offset for the current superblock, for each plane. + */ + tran_low_t *dqcoeff_block[MAX_MB_PLANE]; + /*! + * cb_offset[p] is the offset into the dqcoeff_block[p] for the current coding + * block, for each plane 'p'. + */ + uint16_t cb_offset[MAX_MB_PLANE]; + /*! + * Pointer to 'eob_data' inside 'td->cb_buffer_base' or 'pbi->cb_buffer_base' + * with appropriate offset for the current superblock, for each plane. + */ + eob_info *eob_data[MAX_MB_PLANE]; + /*! + * txb_offset[p] is the offset into the eob_data[p] for the current coding + * block, for each plane 'p'. + */ + uint16_t txb_offset[MAX_MB_PLANE]; + /*! + * ref_mv_count[i] specifies the number of number of motion vector candidates + * in xd->ref_mv_stack[i]. + */ + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; +} DecoderCodingBlock; + +/*!\cond */ + +typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm, + DecoderCodingBlock *dcb, + aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size); + +typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm, + DecoderCodingBlock *dcb, + BLOCK_SIZE bsize); + +typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm, + MACROBLOCKD *const xd); + +typedef struct ThreadData { + DecoderCodingBlock dcb; + + // Coding block buffer for the current superblock. + // Used only for single-threaded decoding and multi-threaded decoding with + // row_mt == 1 cases. + // See also: similar buffer in 'AV1Decoder'. + CB_BUFFER cb_buffer_base; + + aom_reader *bit_reader; + + // Motion compensation buffer used to get a prediction buffer with extended + // borders. One buffer for each of the two possible references. + uint8_t *mc_buf[2]; + // Mask for this block used for compound prediction. + uint8_t *seg_mask; + // Allocated size of 'mc_buf'. + int32_t mc_buf_size; + // If true, the pointers in 'mc_buf' were converted from highbd pointers. + int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in + // mc_buf were converted from highbd pointers. 
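+  // The next two scratch buffers serve the prediction stage: 'tmp_conv_dst'
+  // holds the intermediate convolution result when two references are blended
+  // for compound prediction, and 'tmp_obmc_bufs' hold the neighboring
+  // predictors blended in by overlapped block motion compensation (OBMC).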
+ + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; + + decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit; + decode_block_visitor_fn_t predict_and_recon_intra_block_visit; + decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit; + decode_block_visitor_fn_t inverse_tx_inter_block_visit; + predict_inter_block_visitor_fn_t predict_inter_block_visit; + cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit; +} ThreadData; + +typedef struct AV1DecRowMTJobInfo { + int tile_row; + int tile_col; + int mi_row; +} AV1DecRowMTJobInfo; + +typedef struct AV1DecRowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + int allocated_sb_rows; + int *cur_sb_col; + // Denotes the superblock interval at which conditional signalling should + // happen. Also denotes the minimum number of extra superblocks of the top row + // to be complete to start decoding the current superblock. A value of 1 + // indicates top-right dependency. + int sync_range; + // Denotes the additional number of superblocks in the previous row to be + // complete to start decoding the current superblock when intraBC tool is + // enabled. This additional top-right delay is required to satisfy the + // hardware constraints for intraBC tool when row multithreading is enabled. + int intrabc_extra_top_right_sb_delay; + int mi_rows; + int mi_cols; + int mi_rows_parse_done; + int mi_rows_decode_started; + int num_threads_working; +} AV1DecRowMTSync; + +typedef struct AV1DecRowMTInfo { + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int start_tile; + int end_tile; + int mi_rows_to_decode; + + // Invariant: + // mi_rows_parse_done >= mi_rows_decode_started. + // mi_rows_parse_done and mi_rows_decode_started are both initialized to 0. + // mi_rows_parse_done is incremented freely. mi_rows_decode_started may only + // be incremented to catch up with mi_rows_parse_done but is not allowed to + // surpass mi_rows_parse_done. + // + // When mi_rows_decode_started reaches mi_rows_to_decode, there are no more + // decode jobs. + + // Indicates the progress of the bit-stream parsing of superblocks. + // Initialized to 0. Incremented by sb_mi_size when parse sb row is done. + int mi_rows_parse_done; + // Indicates the progress of the decoding of superblocks. + // Initialized to 0. Incremented by sb_mi_size when decode sb row is started. + int mi_rows_decode_started; + // Boolean: Initialized to 0 (false). Set to 1 (true) on error to abort + // decoding. 
+ int row_mt_exit; +} AV1DecRowMTInfo; + +typedef struct TileDataDec { + TileInfo tile_info; + aom_reader bit_reader; + DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + AV1DecRowMTSync dec_row_mt_sync; +} TileDataDec; + +typedef struct TileBufferDec { + const uint8_t *data; + size_t size; +} TileBufferDec; + +typedef struct DataBuffer { + const uint8_t *data; + size_t size; +} DataBuffer; + +typedef struct EXTERNAL_REFERENCES { + YV12_BUFFER_CONFIG refs[MAX_EXTERNAL_REFERENCES]; + int num; +} EXTERNAL_REFERENCES; + +typedef struct TileJobsDec { + TileBufferDec *tile_buffer; + TileDataDec *tile_data; +} TileJobsDec; + +typedef struct AV1DecTileMTData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + TileJobsDec *job_queue; + int jobs_enqueued; + int jobs_dequeued; + int alloc_tile_rows; + int alloc_tile_cols; +} AV1DecTileMT; + +typedef struct AV1Decoder { + DecoderCodingBlock dcb; + + DECLARE_ALIGNED(32, AV1_COMMON, common); + + AVxWorker lf_worker; + AV1LfSync lf_row_sync; + AV1LrSync lr_row_sync; + AV1LrStruct lr_ctxt; + AV1CdefSync cdef_sync; + AV1CdefWorkerData *cdef_worker; + AVxWorker *tile_workers; + int num_workers; + DecWorkerData *thread_data; + ThreadData td; + TileDataDec *tile_data; + int allocated_tiles; + + TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; + AV1DecTileMT tile_mt_info; + + // Each time the decoder is called, we expect to receive a full temporal unit. + // This can contain up to one shown frame per spatial layer in the current + // operating point (note that some layers may be entirely omitted). + // If the 'output_all_layers' option is true, we save all of these shown + // frames so that they can be returned to the application. If the + // 'output_all_layers' option is false, then we only output one image per + // temporal unit. + // + // Note: The saved buffers are released at the start of the next time the + // application calls aom_codec_decode(). + int output_all_layers; + RefCntBuffer *output_frames[MAX_NUM_SPATIAL_LAYERS]; + size_t num_output_frames; // How many frames are queued up so far? + + // In order to properly support random-access decoding, we need + // to behave slightly differently for the very first frame we decode. + // So we track whether this is the first frame or not. + int decoding_first_frame; + + int allow_lowbitdepth; + int max_threads; + int inv_tile_order; + int need_resync; // wait for key/intra-only frame. + int reset_decoder_state; + + int tile_size_bytes; + int tile_col_size_bytes; + int dec_tile_row, dec_tile_col; // always -1 for non-VR tile encoding +#if CONFIG_ACCOUNTING + int acct_enabled; + Accounting accounting; +#endif + int sequence_header_ready; + int sequence_header_changed; +#if CONFIG_INSPECTION + aom_inspect_cb inspect_cb; + void *inspect_ctx; +#endif + int operating_point; + int current_operating_point; + int seen_frame_header; + // The expected start_tile (tg_start syntax element) of the next tile group. + int next_start_tile; + + // State if the camera frame header is already decoded while + // large_scale_tile = 1. + int camera_frame_header_ready; + size_t frame_header_size; + DataBuffer obu_size_hdr; + int output_frame_width_in_tiles_minus_1; + int output_frame_height_in_tiles_minus_1; + int tile_count_minus_1; + uint32_t coded_tile_data_size; + unsigned int ext_tile_debug; // for ext-tile software debug & testing + + // Decoder has 3 modes of operation: + // (1) Single-threaded decoding. + // (2) Multi-threaded decoding with each tile decoded in parallel. 
+ // (3) In addition to (2), each thread decodes 1 superblock row in parallel. + // row_mt = 1 triggers mode (3) above, while row_mt = 0, will trigger mode (1) + // or (2) depending on 'max_threads'. + unsigned int row_mt; + + EXTERNAL_REFERENCES ext_refs; + YV12_BUFFER_CONFIG tile_list_outbuf; + + // Coding block buffer for the current frame. + // Allocated and used only for multi-threaded decoding with 'row_mt == 0'. + // See also: similar buffer in 'ThreadData' struct. + CB_BUFFER *cb_buffer_base; + // Allocated size of 'cb_buffer_base'. Currently same as the number of + // superblocks in the coded frame. + int cb_buffer_alloc_size; + + int allocated_row_mt_sync_rows; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *row_mt_mutex_; + pthread_cond_t *row_mt_cond_; +#endif + + AV1DecRowMTInfo frame_row_mt_info; + aom_metadata_array_t *metadata; + + int context_update_tile_id; + int skip_loop_filter; + int skip_film_grain; + int is_annexb; + int valid_for_referencing[REF_FRAMES]; + int is_fwd_kf_present; + int is_arf_frame_present; + int num_tile_groups; + aom_s_frame_info sframe_info; + + /*! + * Elements part of the sequence header, that are applicable for all the + * frames in the video. + */ + SequenceHeader seq_params; + + /*! + * If true, buffer removal times are present. + */ + bool buffer_removal_time_present; + + /*! + * Code and details about current error status. + */ + struct aom_internal_error_info error; + + /*! + * Number of temporal layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_temporal_layers; + + /*! + * Number of spatial layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_spatial_layers; +} AV1Decoder; + +// Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error +// code and returns a nonzero value on failure. +int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size, + const uint8_t **psource); + +// Get the frame at a particular index in the output queue +int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, + aom_film_grain_t **grain_params); + +int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame); + +aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi, int idx, + YV12_BUFFER_CONFIG *sd); + +aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, + int use_external_ref, + YV12_BUFFER_CONFIG *sd); +aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd); + +struct AV1Decoder *av1_decoder_create(BufferPool *const pool); + +void av1_decoder_remove(struct AV1Decoder *pbi); +void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info); + +void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync); + +void av1_dec_free_cb_buf(AV1Decoder *pbi); + +static INLINE void decrease_ref_count(RefCntBuffer *const buf, + BufferPool *const pool) { + if (buf != NULL) { + --buf->ref_count; + // Reference counts should never become negative. If this assertion fails, + // there is a bug in our reference count management. + assert(buf->ref_count >= 0); + // A worker may only get a free framebuffer index when calling get_free_fb. + // But the raw frame buffer is not set up until we finish decoding header. + // So if any error happens during decoding header, frame_bufs[idx] will not + // have a valid raw frame buffer. 
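+    // That is why the check below also requires raw_frame_buffer.data to be
+    // set before handing the buffer back to the pool's release callback.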
+ if (buf->ref_count == 0 && buf->raw_frame_buffer.data) { + pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer); + buf->raw_frame_buffer.data = NULL; + buf->raw_frame_buffer.size = 0; + buf->raw_frame_buffer.priv = NULL; + } + } +} + +#define ACCT_STR __func__ +static INLINE int av1_read_uniform(aom_reader *r, int n) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + const int v = aom_read_literal(r, l - 1, ACCT_STR); + assert(l != 0); + if (v < m) + return v; + else + return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR); +} + +typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane, + aom_reader *r); + +void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, + aom_reader *r, palette_visitor_fn_t visit); + +typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, BLOCK_SIZE bsize); + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_DECODER_DECODER_H_ diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c new file mode 100644 index 0000000000..dd5aa62001 --- /dev/null +++ b/third_party/aom/av1/decoder/decodetxb.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/decoder/decodetxb.h" + +#include "aom_ports/mem.h" +#include "av1/common/idct.h" +#include "av1/common/scan.h" +#include "av1/common/txb_common.h" +#include "av1/decoder/decodemv.h" + +#define ACCT_STR __func__ + +static int read_golomb(MACROBLOCKD *xd, aom_reader *r) { + int x = 1; + int length = 0; + int i = 0; + + while (!i) { + i = aom_read_bit(r, ACCT_STR); + ++length; + if (length > 20) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid length in read_golomb"); + break; + } + } + + for (i = 0; i < length - 1; ++i) { + x <<= 1; + x += aom_read_bit(r, ACCT_STR); + } + + return x - 1; +} + +static INLINE int rec_eob_pos(const int eob_token, const int extra) { + int eob = av1_eob_group_start[eob_token]; + if (eob > 2) { + eob += extra; + } + return eob; +} + +static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, + const qm_val_t *iqmatrix) { + int dqv = dequant[!!coeff_idx]; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return dqv; +} + +static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size, + int start_si, int end_si, + const int16_t *scan, int bhl, + uint8_t *levels, + base_cdf_arr base_cdf, + br_cdf_arr br_cdf) { + for (int c = end_si; c >= start_si; --c) { + const int pos = scan[c]; + const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bhl, tx_size); + const int nsymbs = 4; + int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); + if (level > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx_2d(levels, pos, bhl); + aom_cdf_prob *cdf = br_cdf[br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; + } + } + levels[get_padded_idx(pos, bhl)] = level; + } +} + +static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size, + TX_CLASS tx_class, int start_si, + int end_si, const int16_t *scan, int bhl, + uint8_t *levels, base_cdf_arr base_cdf, + br_cdf_arr br_cdf) { + for (int c = end_si; c >= start_si; --c) { + const int pos = scan[c]; + const int coeff_ctx = + get_lower_levels_ctx(levels, pos, bhl, tx_size, tx_class); + const int nsymbs = 4; + int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); + if (level > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); + aom_cdf_prob *cdf = br_cdf[br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; + } + } + levels[get_padded_idx(pos, bhl)] = level; + } +} + +uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, DecoderCodingBlock *dcb, + aom_reader *const r, const int blk_row, + const int blk_col, const int plane, + const TXB_CTX *const txb_ctx, + const TX_SIZE tx_size) { + MACROBLOCKD *const xd = &dcb->xd; + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + const int32_t max_value = (1 << (7 + xd->bd)) - 1; + const int32_t min_value = -(1 << (7 + xd->bd)); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id]; + tran_low_t *const tcoeffs = dcb->dqcoeff_block[plane] + dcb->cb_offset[plane]; + const int shift = av1_get_tx_scale(tx_size); + 
const int bhl = get_txb_bhl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + int cul_level = 0; + int dc_val = 0; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + const int all_zero = aom_read_symbol( + r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR); + eob_info *eob_data = dcb->eob_data[plane] + dcb->txb_offset[plane]; + uint16_t *const eob = &(eob_data->eob); + uint16_t *const max_scan_line = &(eob_data->max_scan_line); + *max_scan_line = 0; + *eob = 0; + +#if CONFIG_INSPECTION + if (plane == 0) { + const int txk_type_idx = + av1_get_txk_type_index(mbmi->bsize, blk_row, blk_col); + mbmi->tx_skip[txk_type_idx] = all_zero; + } +#endif + + if (all_zero) { + *max_scan_line = 0; + if (plane == 0) { + xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] = DCT_DCT; + } + return 0; + } + + if (plane == AOM_PLANE_Y) { + // only y plane's tx_type is transmitted + av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r); + } + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const qm_val_t *iqmatrix = + av1_get_iqmatrix(&cm->quant_params, xd, plane, tx_size, tx_type); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + int eob_extra = 0; + int eob_pt = 1; + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + switch (eob_multi_size) { + case 0: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], + 5, ACCT_STR) + + 1; + break; + case 1: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], + 6, ACCT_STR) + + 1; + break; + case 2: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], + 7, ACCT_STR) + + 1; + break; + case 3: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], + 8, ACCT_STR) + + 1; + break; + case 4: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], + 9, ACCT_STR) + + 1; + break; + case 5: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], + 10, ACCT_STR) + + 1; + break; + case 6: + default: + eob_pt = aom_read_symbol( + r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11, + ACCT_STR) + + 1; + break; + } + + const int eob_offset_bits = av1_eob_offset_bits[eob_pt]; + if (eob_offset_bits > 0) { + const int eob_ctx = eob_pt - 3; + int bit = aom_read_symbol( + r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR); + if (bit) { + eob_extra += (1 << (eob_offset_bits - 1)); + } + + for (int i = 1; i < eob_offset_bits; i++) { + bit = aom_read_bit(r, ACCT_STR); + if (bit) { + eob_extra += (1 << (eob_offset_bits - 1 - i)); + } + } + } + *eob = rec_eob_pos(eob_pt, eob_extra); + + if (*eob > 1) { + memset(levels_buf, 0, + sizeof(*levels_buf) * + ((height + TX_PAD_HOR) * (width + TX_PAD_VER) + TX_PAD_END)); + } + + { + // Read the non-zero coefficient with scan index eob-1 + // TODO(angiebird): Put this into a function + const int c = *eob - 1; + const int pos = scan[c]; + const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, c); + const int nsymbs = 3; + aom_cdf_prob *cdf = + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx]; + int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1; + if (level > 
NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx_eob(pos, bhl, tx_class); + cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; + } + } + levels[get_padded_idx(pos, bhl)] = level; + } + if (*eob > 1) { + base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type]; + br_cdf_arr br_cdf = + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type]; + if (tx_class == TX_CLASS_2D) { + read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bhl, levels, + base_cdf, br_cdf); + read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bhl, levels, + base_cdf, br_cdf); + } else { + read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bhl, + levels, base_cdf, br_cdf); + } + } + + for (int c = 0; c < *eob; ++c) { + const int pos = scan[c]; + uint8_t sign; + tran_low_t level = levels[get_padded_idx(pos, bhl)]; + if (level) { + *max_scan_line = AOMMAX(*max_scan_line, pos); + if (c == 0) { + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], + 2, ACCT_STR); + } else { + sign = aom_read_bit(r, ACCT_STR); + } + if (level >= MAX_BASE_BR_RANGE) { + level += read_golomb(xd, r); + } + + if (c == 0) dc_val = sign ? -level : level; + + // Bitmasking to clamp level to valid range: + // The valid range for 8/10/12 bit vdieo is at most 14/16/18 bit + level &= 0xfffff; + cul_level += level; + tran_low_t dq_coeff; + // Bitmasking to clamp dq_coeff to valid range: + // The valid range for 8/10/12 bit video is at most 17/19/21 bit + dq_coeff = (tran_low_t)( + (int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff); + dq_coeff = dq_coeff >> shift; + if (sign) { + dq_coeff = -dq_coeff; + } + tcoeffs[pos] = clamp(dq_coeff, min_value, max_value); + } + } + + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + + // DC value + set_dc_sign(&cul_level, dc_val); + + return cul_level; +} + +void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, + DecoderCodingBlock *dcb, aom_reader *const r, + const int plane, const int row, const int col, + const TX_SIZE tx_size) { +#if TXCOEFF_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + MACROBLOCKD *const xd = &dcb->xd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + + const BLOCK_SIZE bsize = mbmi->bsize; + assert(bsize < BLOCK_SIZES_ALL); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col, + pd->left_entropy_context + row, &txb_ctx); + const uint8_t cul_level = + av1_read_coeffs_txb(cm, dcb, r, row, col, plane, &txb_ctx, tx_size); + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, + row); + + if (is_inter_block(mbmi)) { + const PLANE_TYPE plane_type = get_plane_type(plane); + // tx_type will be read out in av1_read_coeffs_txb_facade + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size, + cm->features.reduced_tx_set_used); + + if (plane == 0) { + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + // The 16x16 unit is due to the constraint from tx_64x64 which sets the + // maximum tx size for chroma as 32x32. 
Coupled with 4x1 transform block + // size, the constraint takes effect in 32x16 / 16x32 size too. To solve + // the intricacy, cover all the 16x16 units inside a 64 level transform. + if (txw == tx_size_wide_unit[TX_64X64] || + txh == tx_size_high_unit[TX_64X64]) { + const int tx_unit = tx_size_wide_unit[TX_16X16]; + const int stride = xd->tx_type_map_stride; + for (int idy = 0; idy < txh; idy += tx_unit) { + for (int idx = 0; idx < txw; idx += tx_unit) { + xd->tx_type_map[(row + idy) * stride + col + idx] = tx_type; + } + } + } + } + } + +#if TXCOEFF_TIMER + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + cm->txcoeff_timer += elapsed_time; + ++cm->txb_count; +#endif +} diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h new file mode 100644 index 0000000000..fd34d40341 --- /dev/null +++ b/third_party/aom/av1/decoder/decodetxb.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODETXB_H_ +#define AOM_AV1_DECODER_DECODETXB_H_ + +#include "av1/common/enums.h" + +struct aom_reader; +struct AV1Common; +struct DecoderCodingBlock; +struct txb_ctx; + +uint8_t av1_read_coeffs_txb(const struct AV1Common *const cm, + struct DecoderCodingBlock *dcb, + struct aom_reader *const r, const int blk_row, + const int blk_col, const int plane, + const struct txb_ctx *const txb_ctx, + const TX_SIZE tx_size); + +void av1_read_coeffs_txb_facade(const struct AV1Common *const cm, + struct DecoderCodingBlock *dcb, + struct aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size); +#endif // AOM_AV1_DECODER_DECODETXB_H_ diff --git a/third_party/aom/av1/decoder/detokenize.c b/third_party/aom/av1/decoder/detokenize.c new file mode 100644 index 0000000000..3c6a006eaf --- /dev/null +++ b/third_party/aom/av1/decoder/detokenize.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_config.h" + +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/decoder/detokenize.h" + +#define ACCT_STR __func__ + +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/idct.h" + +static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) { + uint8_t color_order[PALETTE_MAX_SIZE]; + const int n = param->n_colors; + uint8_t *const color_map = param->color_map; + MapCdf color_map_cdf = param->map_cdf; + int plane_block_width = param->plane_width; + int plane_block_height = param->plane_height; + int rows = param->rows; + int cols = param->cols; + + // The first color index. + color_map[0] = av1_read_uniform(r, n); + assert(color_map[0] < n); + + // Run wavefront on the palette map index decoding. + for (int i = 1; i < rows + cols - 1; ++i) { + for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) { + const int color_ctx = av1_get_palette_color_index_context( + color_map, plane_block_width, (i - j), j, n, color_order, NULL); + const int color_idx = aom_read_symbol( + r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR); + assert(color_idx >= 0 && color_idx < n); + color_map[(i - j) * plane_block_width + j] = color_order[color_idx]; + } + } + // Copy last column to extra columns. + if (cols < plane_block_width) { + for (int i = 0; i < rows; ++i) { + memset(color_map + i * plane_block_width + cols, + color_map[i * plane_block_width + cols - 1], + (plane_block_width - cols)); + } + } + // Copy last row to extra rows. + for (int i = rows; i < plane_block_height; ++i) { + memcpy(color_map + i * plane_block_width, + color_map + (rows - 1) * plane_block_width, plane_block_width); + } +} + +void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, + aom_reader *r) { + assert(plane == 0 || plane == 1); + Av1ColorMapParam params; + params.color_map = + xd->plane[plane].color_index_map + xd->color_index_map_offset[plane]; + params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf + : xd->tile_ctx->palette_y_color_index_cdf; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + params.n_colors = mbmi->palette_mode_info.palette_size[plane]; + av1_get_block_dimensions(mbmi->bsize, plane, xd, ¶ms.plane_width, + ¶ms.plane_height, ¶ms.rows, ¶ms.cols); + decode_color_map_tokens(¶ms, r); +} diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h new file mode 100644 index 0000000000..173b437a94 --- /dev/null +++ b/third_party/aom/av1/decoder/detokenize.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_DECODER_DETOKENIZE_H_ +#define AOM_AV1_DECODER_DETOKENIZE_H_ + +#include "config/aom_config.h" + +#include "av1/common/scan.h" +#include "av1/decoder/decoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_DECODER_DETOKENIZE_H_ diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h new file mode 100644 index 0000000000..f82b9d8ccf --- /dev/null +++ b/third_party/aom/av1/decoder/dthread.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DTHREAD_H_ +#define AOM_AV1_DECODER_DTHREAD_H_ + +#include "config/aom_config.h" + +#include "aom_util/aom_thread.h" +#include "aom/internal/aom_codec_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; +struct AV1Decoder; +struct ThreadData; + +typedef struct DecWorkerData { + struct ThreadData *td; + const uint8_t *data_end; + struct aom_internal_error_info error_info; +} DecWorkerData; + +// WorkerData for the FrameWorker thread. It contains all the information of +// the worker and decode structures for decoding a frame. +typedef struct FrameWorkerData { + struct AV1Decoder *pbi; + const uint8_t *data; + const uint8_t *data_end; + size_t data_size; + void *user_priv; + int received_frame; + int frame_context_ready; // Current frame's context is ready to read. + int frame_decoded; // Finished decoding current frame. +} FrameWorkerData; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_DECODER_DTHREAD_H_ diff --git a/third_party/aom/av1/decoder/grain_synthesis.c b/third_party/aom/av1/decoder/grain_synthesis.c new file mode 100644 index 0000000000..d276f6f90e --- /dev/null +++ b/third_party/aom/av1/decoder/grain_synthesis.c @@ -0,0 +1,1461 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes film grain parameters and film grain synthesis + * + */ + +#include +#include +#include +#include +#include +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "av1/decoder/grain_synthesis.h" + +// Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits) +// with zero mean and standard deviation of about 512. +// should be divided by 4 for 10-bit range and 16 for 8-bit range. 
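+// For example, the first entry (56) becomes 56 / 4 = 14 in the 10-bit case
+// and 56 / 16 = 3 in the 8-bit case (integer arithmetic).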
+static const int gaussian_sequence[2048] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 
528, -24, -184, 116, 448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 
160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, + 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, + 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, + 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, + -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, + -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, + -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, + -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, + 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, + 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, + 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, + 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, + -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, + 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, + -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, + 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, + 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, + -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, + 
-672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, + -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, + -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, + 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, + -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, + -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, + -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, + -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, + 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, + 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, + -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, + -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, + 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, + 428, -484 +}; + +static const int gauss_bits = 11; + +static int luma_subblock_size_y = 32; +static int luma_subblock_size_x = 32; + +static int chroma_subblock_size_y = 16; +static int chroma_subblock_size_x = 16; + +static const int min_luma_legal_range = 16; +static const int max_luma_legal_range = 235; + +static const int min_chroma_legal_range = 16; +static const int max_chroma_legal_range = 240; + +static int scaling_lut_y[256]; +static int scaling_lut_cb[256]; +static int scaling_lut_cr[256]; + +static int grain_min; +static int grain_max; + +static uint16_t random_register = 0; // random number generator register + +static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma, + int ***pred_pos_chroma, int **luma_grain_block, + int **cb_grain_block, int **cr_grain_block, + int **y_line_buf, int **cb_line_buf, + int **cr_line_buf, int **y_col_buf, int **cb_col_buf, + int **cr_col_buf) { + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (params->num_y_points > 0) ++num_pos_chroma; + + if (*pred_pos_luma) { + for (int row = 0; row < num_pos_luma; row++) { + aom_free((*pred_pos_luma)[row]); + } + aom_free(*pred_pos_luma); + *pred_pos_luma = NULL; + } + + if (*pred_pos_chroma) { + for (int row = 0; row < num_pos_chroma; row++) { + aom_free((*pred_pos_chroma)[row]); + } + aom_free(*pred_pos_chroma); + *pred_pos_chroma = NULL; + } + + aom_free(*y_line_buf); + *y_line_buf = NULL; + + aom_free(*cb_line_buf); + *cb_line_buf = NULL; + + aom_free(*cr_line_buf); + *cr_line_buf = NULL; + + aom_free(*y_col_buf); + *y_col_buf = NULL; + + aom_free(*cb_col_buf); + *cb_col_buf = NULL; + + aom_free(*cr_col_buf); + *cr_col_buf = NULL; + + aom_free(*luma_grain_block); + *luma_grain_block = NULL; + + aom_free(*cb_grain_block); + *cb_grain_block = NULL; + + aom_free(*cr_grain_block); + *cr_grain_block = NULL; +} + +static bool init_arrays(const aom_film_grain_t *params, int luma_stride, + int chroma_stride, int ***pred_pos_luma_p, + int ***pred_pos_chroma_p, int **luma_grain_block, + int **cb_grain_block, int **cr_grain_block, + int **y_line_buf, int **cb_line_buf, int **cr_line_buf, + int **y_col_buf, int **cb_col_buf, int **cr_col_buf, + int luma_grain_samples, int chroma_grain_samples, + int chroma_subsamp_y, int chroma_subsamp_x) { + *pred_pos_luma_p = NULL; + *pred_pos_chroma_p = NULL; + *luma_grain_block = NULL; + *cb_grain_block = NULL; + *cr_grain_block = NULL; + *y_line_buf = NULL; + *cb_line_buf = NULL; + *cr_line_buf = NULL; + *y_col_buf = NULL; + *cb_col_buf = NULL; + *cr_col_buf = NULL; + + memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256); + memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256); + memset(scaling_lut_cr, 0, 
sizeof(*scaling_lut_cr) * 256); + + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (params->num_y_points > 0) ++num_pos_chroma; + + int **pred_pos_luma; + int **pred_pos_chroma; + + pred_pos_luma = (int **)aom_calloc(num_pos_luma, sizeof(*pred_pos_luma)); + if (!pred_pos_luma) return false; + + for (int row = 0; row < num_pos_luma; row++) { + pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3); + if (!pred_pos_luma[row]) { + dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, + luma_grain_block, cb_grain_block, cr_grain_block, + y_line_buf, cb_line_buf, cr_line_buf, y_col_buf, + cb_col_buf, cr_col_buf); + return false; + } + } + + pred_pos_chroma = + (int **)aom_calloc(num_pos_chroma, sizeof(*pred_pos_chroma)); + if (!pred_pos_chroma) { + dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block, + cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf, + cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf); + return false; + } + + for (int row = 0; row < num_pos_chroma; row++) { + pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3); + if (!pred_pos_chroma[row]) { + dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, + luma_grain_block, cb_grain_block, cr_grain_block, + y_line_buf, cb_line_buf, cr_line_buf, y_col_buf, + cb_col_buf, cr_col_buf); + return false; + } + } + + int pos_ar_index = 0; + + for (int row = -params->ar_coeff_lag; row < 0; row++) { + for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1; + col++) { + pred_pos_luma[pos_ar_index][0] = row; + pred_pos_luma[pos_ar_index][1] = col; + pred_pos_luma[pos_ar_index][2] = 0; + + pred_pos_chroma[pos_ar_index][0] = row; + pred_pos_chroma[pos_ar_index][1] = col; + pred_pos_chroma[pos_ar_index][2] = 0; + ++pos_ar_index; + } + } + + for (int col = -params->ar_coeff_lag; col < 0; col++) { + pred_pos_luma[pos_ar_index][0] = 0; + pred_pos_luma[pos_ar_index][1] = col; + pred_pos_luma[pos_ar_index][2] = 0; + + pred_pos_chroma[pos_ar_index][0] = 0; + pred_pos_chroma[pos_ar_index][1] = col; + pred_pos_chroma[pos_ar_index][2] = 0; + + ++pos_ar_index; + } + + if (params->num_y_points > 0) { + pred_pos_chroma[pos_ar_index][0] = 0; + pred_pos_chroma[pos_ar_index][1] = 0; + pred_pos_chroma[pos_ar_index][2] = 1; + } + + *pred_pos_luma_p = pred_pos_luma; + *pred_pos_chroma_p = pred_pos_chroma; + + *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2); + *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride * + (2 >> chroma_subsamp_y)); + *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride * + (2 >> chroma_subsamp_y)); + + *y_col_buf = + (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2); + *cb_col_buf = + (int *)aom_malloc(sizeof(**cb_col_buf) * + (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * + (2 >> chroma_subsamp_x)); + *cr_col_buf = + (int *)aom_malloc(sizeof(**cr_col_buf) * + (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * + (2 >> chroma_subsamp_x)); + + *luma_grain_block = + (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples); + *cb_grain_block = + (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples); + *cr_grain_block = + (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples); + if (!(*pred_pos_luma_p && *pred_pos_chroma_p && *y_line_buf && *cb_line_buf && + *cr_line_buf && *y_col_buf && *cb_col_buf && *cr_col_buf && + *luma_grain_block && *cb_grain_block && 
*cr_grain_block)) { + dealloc_arrays(params, pred_pos_luma_p, pred_pos_chroma_p, luma_grain_block, + cb_grain_block, cr_grain_block, y_line_buf, cb_line_buf, + cr_line_buf, y_col_buf, cb_col_buf, cr_col_buf); + return false; + } + return true; +} + +// get a number between 0 and 2^bits - 1 +static INLINE int get_random_number(int bits) { + uint16_t bit; + bit = ((random_register >> 0) ^ (random_register >> 1) ^ + (random_register >> 3) ^ (random_register >> 12)) & + 1; + random_register = (random_register >> 1) | (bit << 15); + return (random_register >> (16 - bits)) & ((1 << bits) - 1); +} + +static void init_random_generator(int luma_line, uint16_t seed) { + // same for the picture + + uint16_t msb = (seed >> 8) & 255; + uint16_t lsb = seed & 255; + + random_register = (msb << 8) + lsb; + + // changes for each row + int luma_num = luma_line >> 5; + + random_register ^= ((luma_num * 37 + 178) & 255) << 8; + random_register ^= ((luma_num * 173 + 105) & 255); +} + +static void generate_luma_grain_block( + const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, + int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, + int left_pad, int top_pad, int right_pad, int bottom_pad) { + if (params->num_y_points == 0) { + memset(luma_grain_block, 0, + sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride); + return; + } + + int bit_depth = params->bit_depth; + int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; + + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int rounding_offset = (1 << (params->ar_coeff_shift - 1)); + + for (int i = 0; i < luma_block_size_y; i++) + for (int j = 0; j < luma_block_size_x; j++) + luma_grain_block[i * luma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + + for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++) + for (int j = left_pad; j < luma_block_size_x - right_pad; j++) { + int wsum = 0; + for (int pos = 0; pos < num_pos_luma; pos++) { + wsum = wsum + params->ar_coeffs_y[pos] * + luma_grain_block[(i + pred_pos_luma[pos][0]) * + luma_grain_stride + + j + pred_pos_luma[pos][1]]; + } + luma_grain_block[i * luma_grain_stride + j] = + clamp(luma_grain_block[i * luma_grain_stride + j] + + ((wsum + rounding_offset) >> params->ar_coeff_shift), + grain_min, grain_max); + } +} + +static bool generate_chroma_grain_blocks( + const aom_film_grain_t *params, int **pred_pos_chroma, + int *luma_grain_block, int *cb_grain_block, int *cr_grain_block, + int luma_grain_stride, int chroma_block_size_y, int chroma_block_size_x, + int chroma_grain_stride, int left_pad, int top_pad, int right_pad, + int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) { + int bit_depth = params->bit_depth; + int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; + + int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + if (params->num_y_points > 0) ++num_pos_chroma; + int rounding_offset = (1 << (params->ar_coeff_shift - 1)); + int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride; + + if (params->num_cb_points || params->chroma_scaling_from_luma) { + init_random_generator(7 << 5, params->random_seed); + + for (int i = 0; i < chroma_block_size_y; i++) + for (int j = 0; j < chroma_block_size_x; j++) + cb_grain_block[i * chroma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + } else { + 
memset(cb_grain_block, 0, + sizeof(*cb_grain_block) * chroma_grain_block_size); + } + + if (params->num_cr_points || params->chroma_scaling_from_luma) { + init_random_generator(11 << 5, params->random_seed); + + for (int i = 0; i < chroma_block_size_y; i++) + for (int j = 0; j < chroma_block_size_x; j++) + cr_grain_block[i * chroma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + } else { + memset(cr_grain_block, 0, + sizeof(*cr_grain_block) * chroma_grain_block_size); + } + + for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++) + for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) { + int wsum_cb = 0; + int wsum_cr = 0; + for (int pos = 0; pos < num_pos_chroma; pos++) { + if (pred_pos_chroma[pos][2] == 0) { + wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * + cb_grain_block[(i + pred_pos_chroma[pos][0]) * + chroma_grain_stride + + j + pred_pos_chroma[pos][1]]; + wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * + cr_grain_block[(i + pred_pos_chroma[pos][0]) * + chroma_grain_stride + + j + pred_pos_chroma[pos][1]]; + } else if (pred_pos_chroma[pos][2] == 1) { + int av_luma = 0; + int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad; + int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad; + + for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1; + k++) + for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1; + l++) + av_luma += luma_grain_block[k * luma_grain_stride + l]; + + av_luma = + (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >> + (chroma_subsamp_y + chroma_subsamp_x); + + wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma; + wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma; + } else { + fprintf( + stderr, + "Grain synthesis: prediction between two chroma components is " + "not supported!"); + return false; + } + } + if (params->num_cb_points || params->chroma_scaling_from_luma) + cb_grain_block[i * chroma_grain_stride + j] = + clamp(cb_grain_block[i * chroma_grain_stride + j] + + ((wsum_cb + rounding_offset) >> params->ar_coeff_shift), + grain_min, grain_max); + if (params->num_cr_points || params->chroma_scaling_from_luma) + cr_grain_block[i * chroma_grain_stride + j] = + clamp(cr_grain_block[i * chroma_grain_stride + j] + + ((wsum_cr + rounding_offset) >> params->ar_coeff_shift), + grain_min, grain_max); + } + return true; +} + +static void init_scaling_function(const int scaling_points[][2], int num_points, + int scaling_lut[]) { + if (num_points == 0) return; + + for (int i = 0; i < scaling_points[0][0]; i++) + scaling_lut[i] = scaling_points[0][1]; + + for (int point = 0; point < num_points - 1; point++) { + int delta_y = scaling_points[point + 1][1] - scaling_points[point][1]; + int delta_x = scaling_points[point + 1][0] - scaling_points[point][0]; + + int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x); + + for (int x = 0; x < delta_x; x++) { + scaling_lut[scaling_points[point][0] + x] = + scaling_points[point][1] + (int)((x * delta + 32768) >> 16); + } + } + + for (int i = scaling_points[num_points - 1][0]; i < 256; i++) + scaling_lut[i] = scaling_points[num_points - 1][1]; +} + +// function that extracts samples from a LUT (and interpolates intemediate +// frames for 10- and 12-bit video) +static int scale_LUT(int *scaling_lut, int index, int bit_depth) { + int x = index >> (bit_depth - 8); + + if (!(bit_depth - 8) || x == 255) + return scaling_lut[x]; + else + return 
scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) * + (index & ((1 << (bit_depth - 8)) - 1)) + + (1 << (bit_depth - 9))) >> + (bit_depth - 8)); +} + +static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int luma_stride, + int chroma_stride, int *luma_grain, + int *cb_grain, int *cr_grain, + int luma_grain_stride, int chroma_grain_stride, + int half_luma_height, int half_luma_width, + int bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity) { + int cb_mult = params->cb_mult - 128; // fixed scale + int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale + int cb_offset = params->cb_offset - 256; + + int cr_mult = params->cr_mult - 128; // fixed scale + int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale + int cr_offset = params->cr_offset - 256; + + int rounding_offset = (1 << (params->scaling_shift - 1)); + + int apply_y = params->num_y_points > 0 ? 1 : 0; + int apply_cb = + (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0; + int apply_cr = + (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0; + + if (params->chroma_scaling_from_luma) { + cb_mult = 0; // fixed scale + cb_luma_mult = 64; // fixed scale + cb_offset = 0; + + cr_mult = 0; // fixed scale + cr_luma_mult = 64; // fixed scale + cr_offset = 0; + } + + int min_luma, max_luma, min_chroma, max_chroma; + + if (params->clip_to_restricted_range) { + min_luma = min_luma_legal_range; + max_luma = max_luma_legal_range; + + if (mc_identity) { + min_chroma = min_luma_legal_range; + max_chroma = max_luma_legal_range; + } else { + min_chroma = min_chroma_legal_range; + max_chroma = max_chroma_legal_range; + } + } else { + min_luma = min_chroma = 0; + max_luma = max_chroma = 255; + } + + for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { + for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { + int average_luma = 0; + if (chroma_subsamp_x) { + average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x)] + + luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x) + 1] + + 1) >> + 1; + } else { + average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; + } + + if (apply_cb) { + cb[i * chroma_stride + j] = clamp( + cb[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cb, + clamp(((average_luma * cb_luma_mult + + cb_mult * cb[i * chroma_stride + j]) >> + 6) + + cb_offset, + 0, (256 << (bit_depth - 8)) - 1), + 8) * + cb_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + + if (apply_cr) { + cr[i * chroma_stride + j] = clamp( + cr[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cr, + clamp(((average_luma * cr_luma_mult + + cr_mult * cr[i * chroma_stride + j]) >> + 6) + + cr_offset, + 0, (256 << (bit_depth - 8)) - 1), + 8) * + cr_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + } + } + + if (apply_y) { + for (int i = 0; i < (half_luma_height << 1); i++) { + for (int j = 0; j < (half_luma_width << 1); j++) { + luma[i * luma_stride + j] = + clamp(luma[i * luma_stride + j] + + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) * + luma_grain[i * luma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_luma, max_luma); + } + } + } +} + +static void add_noise_to_block_hbd( + const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, + int luma_stride, 
int chroma_stride, int *luma_grain, int *cb_grain, + int *cr_grain, int luma_grain_stride, int chroma_grain_stride, + int half_luma_height, int half_luma_width, int bit_depth, + int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) { + int cb_mult = params->cb_mult - 128; // fixed scale + int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale + // offset value depends on the bit depth + int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth); + + int cr_mult = params->cr_mult - 128; // fixed scale + int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale + // offset value depends on the bit depth + int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth); + + int rounding_offset = (1 << (params->scaling_shift - 1)); + + int apply_y = params->num_y_points > 0 ? 1 : 0; + int apply_cb = + (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1 + : 0; + int apply_cr = + (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1 + : 0; + + if (params->chroma_scaling_from_luma) { + cb_mult = 0; // fixed scale + cb_luma_mult = 64; // fixed scale + cb_offset = 0; + + cr_mult = 0; // fixed scale + cr_luma_mult = 64; // fixed scale + cr_offset = 0; + } + + int min_luma, max_luma, min_chroma, max_chroma; + + if (params->clip_to_restricted_range) { + min_luma = min_luma_legal_range << (bit_depth - 8); + max_luma = max_luma_legal_range << (bit_depth - 8); + + if (mc_identity) { + min_chroma = min_luma_legal_range << (bit_depth - 8); + max_chroma = max_luma_legal_range << (bit_depth - 8); + } else { + min_chroma = min_chroma_legal_range << (bit_depth - 8); + max_chroma = max_chroma_legal_range << (bit_depth - 8); + } + } else { + min_luma = min_chroma = 0; + max_luma = max_chroma = (256 << (bit_depth - 8)) - 1; + } + + for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { + for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { + int average_luma = 0; + if (chroma_subsamp_x) { + average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x)] + + luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x) + 1] + + 1) >> + 1; + } else { + average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; + } + + if (apply_cb) { + cb[i * chroma_stride + j] = clamp( + cb[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cb, + clamp(((average_luma * cb_luma_mult + + cb_mult * cb[i * chroma_stride + j]) >> + 6) + + cb_offset, + 0, (256 << (bit_depth - 8)) - 1), + bit_depth) * + cb_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + if (apply_cr) { + cr[i * chroma_stride + j] = clamp( + cr[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cr, + clamp(((average_luma * cr_luma_mult + + cr_mult * cr[i * chroma_stride + j]) >> + 6) + + cr_offset, + 0, (256 << (bit_depth - 8)) - 1), + bit_depth) * + cr_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + } + } + + if (apply_y) { + for (int i = 0; i < (half_luma_height << 1); i++) { + for (int j = 0; j < (half_luma_width << 1); j++) { + luma[i * luma_stride + j] = + clamp(luma[i * luma_stride + j] + + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], + bit_depth) * + luma_grain[i * luma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_luma, max_luma); + } + } + } +} + +static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int 
width, int height, + int use_high_bit_depth) { + int hbd_coeff = use_high_bit_depth ? 2 : 1; + while (height) { + memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff); + src += src_stride; + dst += dst_stride; + --height; + } + return; +} + +static void copy_area(int *src, int src_stride, int *dst, int dst_stride, + int width, int height) { + while (height) { + memcpy(dst, src, width * sizeof(*src)); + src += src_stride; + dst += dst_stride; + --height; + } + return; +} + +static void extend_even(uint8_t *dst, int dst_stride, int width, int height, + int use_high_bit_depth) { + if ((width & 1) == 0 && (height & 1) == 0) return; + if (use_high_bit_depth) { + uint16_t *dst16 = (uint16_t *)dst; + int dst16_stride = dst_stride / 2; + if (width & 1) { + for (int i = 0; i < height; ++i) + dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1]; + } + width = (width + 1) & (~1); + if (height & 1) { + memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride], + sizeof(*dst16) * width); + } + } else { + if (width & 1) { + for (int i = 0; i < height; ++i) + dst[i * dst_stride + width] = dst[i * dst_stride + width - 1]; + } + width = (width + 1) & (~1); + if (height & 1) { + memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride], + sizeof(*dst) * width); + } + } +} + +static void ver_boundary_overlap(int *left_block, int left_stride, + int *right_block, int right_stride, + int *dst_block, int dst_stride, int width, + int height) { + if (width == 1) { + while (height) { + *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5, + grain_min, grain_max); + left_block += left_stride; + right_block += right_stride; + dst_block += dst_stride; + --height; + } + return; + } else if (width == 2) { + while (height) { + dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5, + grain_min, grain_max); + dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5, + grain_min, grain_max); + left_block += left_stride; + right_block += right_stride; + dst_block += dst_stride; + --height; + } + return; + } +} + +static void hor_boundary_overlap(int *top_block, int top_stride, + int *bottom_block, int bottom_stride, + int *dst_block, int dst_stride, int width, + int height) { + if (height == 1) { + while (width) { + *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5, + grain_min, grain_max); + ++top_block; + ++bottom_block; + ++dst_block; + --width; + } + return; + } else if (height == 2) { + while (width) { + dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5, + grain_min, grain_max); + dst_block[dst_stride] = clamp((17 * top_block[top_stride] + + 27 * bottom_block[bottom_stride] + 16) >> + 5, + grain_min, grain_max); + ++top_block; + ++bottom_block; + ++dst_block; + --width; + } + return; + } +} + +int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, + aom_image_t *dst) { + uint8_t *luma, *cb, *cr; + int height, width, luma_stride, chroma_stride; + int use_high_bit_depth = 0; + int chroma_subsamp_x = 0; + int chroma_subsamp_y = 0; + int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 
1 : 0; + + switch (src->fmt) { + case AOM_IMG_FMT_AOMI420: + case AOM_IMG_FMT_I420: + use_high_bit_depth = 0; + chroma_subsamp_x = 1; + chroma_subsamp_y = 1; + break; + case AOM_IMG_FMT_I42016: + use_high_bit_depth = 1; + chroma_subsamp_x = 1; + chroma_subsamp_y = 1; + break; + // case AOM_IMG_FMT_444A: + case AOM_IMG_FMT_I444: + use_high_bit_depth = 0; + chroma_subsamp_x = 0; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I44416: + use_high_bit_depth = 1; + chroma_subsamp_x = 0; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I422: + use_high_bit_depth = 0; + chroma_subsamp_x = 1; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I42216: + use_high_bit_depth = 1; + chroma_subsamp_x = 1; + chroma_subsamp_y = 0; + break; + default: // unknown input format + fprintf(stderr, "Film grain error: input format is not supported!"); + return -1; + } + + assert(params->bit_depth == src->bit_depth); + + dst->fmt = src->fmt; + dst->bit_depth = src->bit_depth; + + dst->r_w = src->r_w; + dst->r_h = src->r_h; + dst->d_w = src->d_w; + dst->d_h = src->d_h; + + dst->cp = src->cp; + dst->tc = src->tc; + dst->mc = src->mc; + + dst->monochrome = src->monochrome; + dst->csp = src->csp; + dst->range = src->range; + + dst->x_chroma_shift = src->x_chroma_shift; + dst->y_chroma_shift = src->y_chroma_shift; + + dst->temporal_id = src->temporal_id; + dst->spatial_id = src->spatial_id; + + width = src->d_w % 2 ? src->d_w + 1 : src->d_w; + height = src->d_h % 2 ? src->d_h + 1 : src->d_h; + + copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y], + dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, + src->d_h, use_high_bit_depth); + // Note that dst is already assumed to be aligned to even. + extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, + src->d_h, use_high_bit_depth); + + if (!src->monochrome) { + copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U], + dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U], + width >> chroma_subsamp_x, height >> chroma_subsamp_y, + use_high_bit_depth); + + copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], + dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], + width >> chroma_subsamp_x, height >> chroma_subsamp_y, + use_high_bit_depth); + } + + luma = dst->planes[AOM_PLANE_Y]; + cb = dst->planes[AOM_PLANE_U]; + cr = dst->planes[AOM_PLANE_V]; + + // luma and chroma strides in samples + luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; + chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; + + return av1_add_film_grain_run( + params, luma, cb, cr, height, width, luma_stride, chroma_stride, + use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); +} + +int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity) { + int **pred_pos_luma; + int **pred_pos_chroma; + int *luma_grain_block; + int *cb_grain_block; + int *cr_grain_block; + + int *y_line_buf; + int *cb_line_buf; + int *cr_line_buf; + + int *y_col_buf; + int *cb_col_buf; + int *cr_col_buf; + + random_register = params->random_seed; + + int left_pad = 3; + int right_pad = 3; // padding to offset for AR coefficients + int top_pad = 3; + int bottom_pad = 0; + + int ar_padding = 3; // maximum lag used for stabilization of AR coefficients + + luma_subblock_size_y = 32; + luma_subblock_size_x = 32; + + chroma_subblock_size_y = 
luma_subblock_size_y >> chroma_subsamp_y; + chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x; + + // Initial padding is only needed for generation of + // film grain templates (to stabilize the AR process) + // Only a 64x64 luma and 32x32 chroma part of a template + // is used later for adding grain, padding can be discarded + + int luma_block_size_y = + top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad; + int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 + + 2 * ar_padding + right_pad; + + int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + + chroma_subblock_size_y * 2 + bottom_pad; + int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + + chroma_subblock_size_x * 2 + + (2 >> chroma_subsamp_x) * ar_padding + right_pad; + + int luma_grain_stride = luma_block_size_x; + int chroma_grain_stride = chroma_block_size_x; + + int overlap = params->overlap_flag; + int bit_depth = params->bit_depth; + + const int grain_center = 128 << (bit_depth - 8); + grain_min = 0 - grain_center; + grain_max = grain_center - 1; + + if (!init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma, + &pred_pos_chroma, &luma_grain_block, &cb_grain_block, + &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf, + &y_col_buf, &cb_col_buf, &cr_col_buf, + luma_block_size_y * luma_block_size_x, + chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y, + chroma_subsamp_x)) + return -1; + + generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, + luma_block_size_y, luma_block_size_x, + luma_grain_stride, left_pad, top_pad, right_pad, + bottom_pad); + + if (!generate_chroma_grain_blocks( + params, pred_pos_chroma, luma_grain_block, cb_grain_block, + cr_grain_block, luma_grain_stride, chroma_block_size_y, + chroma_block_size_x, chroma_grain_stride, left_pad, top_pad, + right_pad, bottom_pad, chroma_subsamp_y, chroma_subsamp_x)) + return -1; + + init_scaling_function(params->scaling_points_y, params->num_y_points, + scaling_lut_y); + + if (params->chroma_scaling_from_luma) { + memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256); + memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256); + } else { + init_scaling_function(params->scaling_points_cb, params->num_cb_points, + scaling_lut_cb); + init_scaling_function(params->scaling_points_cr, params->num_cr_points, + scaling_lut_cr); + } + for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) { + init_random_generator(y * 2, params->random_seed); + + for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) { + int offset_y = get_random_number(8); + int offset_x = (offset_y >> 4) & 15; + offset_y &= 15; + + int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1); + int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1); + + int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + + offset_y * (2 >> chroma_subsamp_y); + int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + + offset_x * (2 >> chroma_subsamp_x); + + if (overlap && x) { + ver_boundary_overlap( + y_col_buf, 2, + luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x, + luma_grain_stride, y_col_buf, 2, 2, + AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); + + ver_boundary_overlap( + cb_col_buf, 2 >> chroma_subsamp_x, + cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x, + chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + 
AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + ver_boundary_overlap( + cr_col_buf, 2 >> chroma_subsamp_x, + cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x, + chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + int i = y ? 1 : 0; + + if (use_high_bit_depth) { + add_noise_to_block_hbd( + params, + (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1), + (uint16_t *)cb + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + (uint16_t *)cr + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, y_col_buf + i * 4, + cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + 2, (2 - chroma_subsamp_x), + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, + bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } else { + add_noise_to_block( + params, luma + ((y + i) << 1) * luma_stride + (x << 1), + cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, y_col_buf + i * 4, + cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + 2, (2 - chroma_subsamp_x), + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, + bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } + } + + if (overlap && y) { + if (x) { + hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2, + y_line_buf + (x << 1), luma_stride, 2, 2); + + hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x, + cb_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_y); + + hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x, + cr_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_y); + } + + hor_boundary_overlap( + y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x + (x ? 2 : 0), + luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1), + width - ((x ? x + 1 : 0) << 1)), + 2); + + hor_boundary_overlap( + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + chroma_grain_stride, + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x - + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x), + 2 >> chroma_subsamp_y); + + hor_boundary_overlap( + cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + chroma_grain_stride, + cr_line_buf + ((x ? 
x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x - + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x), + 2 >> chroma_subsamp_y); + + if (use_high_bit_depth) { + add_noise_to_block_hbd( + params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1), + (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride + + (x << ((1 - chroma_subsamp_x))), + (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride + + (x << ((1 - chroma_subsamp_x))), + luma_stride, chroma_stride, y_line_buf + (x << 1), + cb_line_buf + (x << (1 - chroma_subsamp_x)), + cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride, + chroma_stride, 1, + AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth, + chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } else { + add_noise_to_block( + params, luma + (y << 1) * luma_stride + (x << 1), + cb + (y << (1 - chroma_subsamp_y)) * chroma_stride + + (x << ((1 - chroma_subsamp_x))), + cr + (y << (1 - chroma_subsamp_y)) * chroma_stride + + (x << ((1 - chroma_subsamp_x))), + luma_stride, chroma_stride, y_line_buf + (x << 1), + cb_line_buf + (x << (1 - chroma_subsamp_x)), + cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride, + chroma_stride, 1, + AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth, + chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } + } + + int i = overlap && y ? 1 : 0; + int j = overlap && x ? 1 : 0; + + if (use_high_bit_depth) { + add_noise_to_block_hbd( + params, + (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1), + (uint16_t *)cb + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + ((x + j) << (1 - chroma_subsamp_x)), + (uint16_t *)cr + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + ((x + j) << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, + luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride + + luma_offset_x + (j << 1), + cb_grain_block + + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * + chroma_grain_stride + + chroma_offset_x + (j << (1 - chroma_subsamp_x)), + cr_grain_block + + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * + chroma_grain_stride + + chroma_offset_x + (j << (1 - chroma_subsamp_x)), + luma_grain_stride, chroma_grain_stride, + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, + AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth, + chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } else { + add_noise_to_block( + params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1), + cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + ((x + j) << (1 - chroma_subsamp_x)), + cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + ((x + j) << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, + luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride + + luma_offset_x + (j << 1), + cb_grain_block + + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * + chroma_grain_stride + + chroma_offset_x + (j << (1 - chroma_subsamp_x)), + cr_grain_block + + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * + chroma_grain_stride + + chroma_offset_x + (j << (1 - chroma_subsamp_x)), + luma_grain_stride, chroma_grain_stride, + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, + AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth, + chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } + + if (overlap) { + if (x) { + // Copy overlapped column bufer to line buffer + copy_area(y_col_buf + (luma_subblock_size_y << 1), 2, + 
y_line_buf + (x << 1), luma_stride, 2, 2); + + copy_area( + cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), + 2 >> chroma_subsamp_x, + cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, + 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); + + copy_area( + cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), + 2 >> chroma_subsamp_x, + cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, + 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); + } + + // Copy grain to the line buffer for overlap with a bottom block + copy_area( + luma_grain_block + + (luma_offset_y + luma_subblock_size_y) * luma_grain_stride + + luma_offset_x + ((x ? 2 : 0)), + luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2); + + copy_area(cb_grain_block + + (chroma_offset_y + chroma_subblock_size_y) * + chroma_grain_stride + + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), + chroma_grain_stride, + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x, + ((width - (x << 1)) >> chroma_subsamp_x)) - + (x ? 2 >> chroma_subsamp_x : 0), + 2 >> chroma_subsamp_y); + + copy_area(cr_grain_block + + (chroma_offset_y + chroma_subblock_size_y) * + chroma_grain_stride + + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), + chroma_grain_stride, + cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x, + ((width - (x << 1)) >> chroma_subsamp_x)) - + (x ? 2 >> chroma_subsamp_x : 0), + 2 >> chroma_subsamp_y); + + // Copy grain to the column buffer for overlap with the next block to + // the right + + copy_area(luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x + luma_subblock_size_x, + luma_grain_stride, y_col_buf, 2, 2, + AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); + + copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + chroma_subblock_size_x, + chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + chroma_subblock_size_x, + chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + } + } + } + + dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block, + &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf, + &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf); + return 0; +} diff --git a/third_party/aom/av1/decoder/grain_synthesis.h b/third_party/aom/av1/decoder/grain_synthesis.h new file mode 100644 index 0000000000..9858ce0013 --- /dev/null +++ b/third_party/aom/av1/decoder/grain_synthesis.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +/*!\file + * \brief Describes film grain synthesis + * + */ +#ifndef AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_ +#define AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include "aom_dsp/grain_params.h" +#include "aom/aom_image.h" + +/*!\brief Add film grain + * + * Add film grain to an image + * + * Returns 0 for success, -1 for failure + * + * \param[in] grain_params Grain parameters + * \param[in] luma luma plane + * \param[in] cb cb plane + * \param[in] cr cr plane + * \param[in] height luma plane height + * \param[in] width luma plane width + * \param[in] luma_stride luma plane stride + * \param[in] chroma_stride chroma plane stride + */ +int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity); + +/*!\brief Add film grain + * + * Add film grain to an image + * + * Returns 0 for success, -1 for failure + * + * \param[in] grain_params Grain parameters + * \param[in] src Source image + * \param[out] dst Resulting image with grain + */ +int av1_add_film_grain(const aom_film_grain_t *grain_params, + const aom_image_t *src, aom_image_t *dst); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_DECODER_GRAIN_SYNTHESIS_H_ diff --git a/third_party/aom/av1/decoder/inspection.c b/third_party/aom/av1/decoder/inspection.c new file mode 100644 index 0000000000..288d69a224 --- /dev/null +++ b/third_party/aom/av1/decoder/inspection.c @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "av1/decoder/decoder.h" +#include "av1/decoder/inspection.h" +#include "av1/common/enums.h" +#include "av1/common/cdef.h" + +static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) { + fd->mi_cols = mi_cols; + fd->mi_rows = mi_rows; + fd->mi_grid = (insp_mi_data *)aom_malloc(sizeof(insp_mi_data) * fd->mi_rows * + fd->mi_cols); + if (!fd->mi_grid) { + fprintf(stderr, "Error allocating inspection data\n"); + abort(); + } +} + +void ifd_init(insp_frame_data *fd, int frame_width, int frame_height) { + int mi_cols = ALIGN_POWER_OF_TWO(frame_width, 3) >> MI_SIZE_LOG2; + int mi_rows = ALIGN_POWER_OF_TWO(frame_height, 3) >> MI_SIZE_LOG2; + ifd_init_mi_rc(fd, mi_cols, mi_rows); +} + +void ifd_clear(insp_frame_data *fd) { + aom_free(fd->mi_grid); + fd->mi_grid = NULL; +} + +/* TODO(negge) This function may be called by more than one thread when using + a multi-threaded decoder and this may cause a data race. 
*/ +int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) { + struct AV1Decoder *pbi = (struct AV1Decoder *)decoder; + AV1_COMMON *const cm = &pbi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const CommonQuantParams *quant_params = &cm->quant_params; + + if (fd->mi_rows != mi_params->mi_rows || fd->mi_cols != mi_params->mi_cols) { + ifd_clear(fd); + ifd_init_mi_rc(fd, mi_params->mi_rows, mi_params->mi_cols); + } + fd->show_existing_frame = cm->show_existing_frame; + fd->frame_number = cm->current_frame.frame_number; + fd->show_frame = cm->show_frame; + fd->frame_type = cm->current_frame.frame_type; + fd->base_qindex = quant_params->base_qindex; + // Set width and height of the first tile until generic support can be added + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, 0); + av1_tile_set_col(&tile_info, cm, 0); + fd->tile_mi_cols = tile_info.mi_col_end - tile_info.mi_col_start; + fd->tile_mi_rows = tile_info.mi_row_end - tile_info.mi_row_start; + fd->delta_q_present_flag = cm->delta_q_info.delta_q_present_flag; + fd->delta_q_res = cm->delta_q_info.delta_q_res; +#if CONFIG_ACCOUNTING + fd->accounting = &pbi->accounting; +#endif + // TODO(negge): copy per frame CDEF data + int i, j; + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < 2; j++) { + fd->y_dequant[i][j] = quant_params->y_dequant_QTX[i][j]; + fd->u_dequant[i][j] = quant_params->u_dequant_QTX[i][j]; + fd->v_dequant[i][j] = quant_params->v_dequant_QTX[i][j]; + } + } + for (j = 0; j < mi_params->mi_rows; j++) { + for (i = 0; i < mi_params->mi_cols; i++) { + const MB_MODE_INFO *mbmi = + mi_params->mi_grid_base[j * mi_params->mi_stride + i]; + insp_mi_data *mi = &fd->mi_grid[j * mi_params->mi_cols + i]; + // Segment + mi->segment_id = mbmi->segment_id; + // Motion Vectors + mi->mv[0].row = mbmi->mv[0].as_mv.row; + mi->mv[0].col = mbmi->mv[0].as_mv.col; + mi->mv[1].row = mbmi->mv[1].as_mv.row; + mi->mv[1].col = mbmi->mv[1].as_mv.col; + // Reference Frames + mi->ref_frame[0] = mbmi->ref_frame[0]; + mi->ref_frame[1] = mbmi->ref_frame[1]; + // Prediction Mode + mi->mode = mbmi->mode; + mi->intrabc = (int16_t)mbmi->use_intrabc; + mi->palette = (int16_t)mbmi->palette_mode_info.palette_size[0]; + mi->uv_palette = (int16_t)mbmi->palette_mode_info.palette_size[1]; + // Prediction Mode for Chromatic planes + if (mi->mode < INTRA_MODES) { + mi->uv_mode = mbmi->uv_mode; + } else { + mi->uv_mode = UV_MODE_INVALID; + } + + mi->motion_mode = mbmi->motion_mode; + mi->compound_type = mbmi->interinter_comp.type; + + // Block Size + mi->bsize = mbmi->bsize; + // Skip Flag + mi->skip = mbmi->skip_txfm; + mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0); + mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1); + mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1]; + + // Transform + // TODO(anyone): extract tx type info from mbmi->txk_type[]. 
+ + const BLOCK_SIZE bsize = mbmi->bsize; + const int c = i % mi_size_wide[bsize]; + const int r = j % mi_size_high[bsize]; + if (is_inter_block(mbmi) || is_intrabc_block(mbmi)) + mi->tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(bsize, r, c)]; + else + mi->tx_size = mbmi->tx_size; + + if (skip_not_transform && mi->skip) mi->tx_size = -1; + + if (mi->skip) { + const int tx_type_row = j - j % tx_size_high_unit[mi->tx_size]; + const int tx_type_col = i - i % tx_size_wide_unit[mi->tx_size]; + const int tx_type_map_idx = + tx_type_row * mi_params->mi_stride + tx_type_col; + mi->tx_type = mi_params->tx_type_map[tx_type_map_idx]; + } else { + mi->tx_type = 0; + } + + if (skip_not_transform && + (mi->skip || mbmi->tx_skip[av1_get_txk_type_index(bsize, r, c)])) + mi->tx_type = -1; + + mi->cdef_level = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] / + CDEF_SEC_STRENGTHS; + mi->cdef_strength = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] % + CDEF_SEC_STRENGTHS; + + mi->cdef_strength += mi->cdef_strength == 3; + if (mbmi->uv_mode == UV_CFL_PRED) { + mi->cfl_alpha_idx = mbmi->cfl_alpha_idx; + mi->cfl_alpha_sign = mbmi->cfl_alpha_signs; + } else { + mi->cfl_alpha_idx = 0; + mi->cfl_alpha_sign = 0; + } + // delta_q + mi->current_qindex = mbmi->current_qindex; + } + } + return 1; +} diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h new file mode 100644 index 0000000000..70b1c80fab --- /dev/null +++ b/third_party/aom/av1/decoder/inspection.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_DECODER_INSPECTION_H_ +#define AOM_AV1_DECODER_INSPECTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#include "av1/common/seg_common.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#ifndef AOM_AOM_AOMDX_H_ +typedef void (*aom_inspect_cb)(void *decoder, void *data); +#endif + +typedef struct insp_mv insp_mv; + +struct insp_mv { + int16_t row; + int16_t col; +}; + +typedef struct insp_mi_data insp_mi_data; + +struct insp_mi_data { + insp_mv mv[2]; + int16_t ref_frame[2]; + int16_t mode; + int16_t uv_mode; + int16_t bsize; + int16_t skip; + int16_t segment_id; + int16_t dual_filter_type; + int16_t filter[2]; + int16_t tx_type; + int16_t tx_size; + int16_t cdef_level; + int16_t cdef_strength; + int16_t cfl_alpha_idx; + int16_t cfl_alpha_sign; + int16_t current_qindex; + int16_t compound_type; + int16_t motion_mode; + int16_t intrabc; + int16_t palette; + int16_t uv_palette; +}; + +typedef struct insp_frame_data insp_frame_data; + +struct insp_frame_data { +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif + insp_mi_data *mi_grid; + int16_t frame_number; + int show_frame; + int frame_type; + int base_qindex; + int mi_rows; + int mi_cols; + int tile_mi_rows; + int tile_mi_cols; + int16_t y_dequant[MAX_SEGMENTS][2]; + int16_t u_dequant[MAX_SEGMENTS][2]; + int16_t v_dequant[MAX_SEGMENTS][2]; + // TODO(negge): add per frame CDEF data + int delta_q_present_flag; + int delta_q_res; + int show_existing_frame; +}; + +void ifd_init(insp_frame_data *fd, int frame_width, int frame_height); +void ifd_clear(insp_frame_data *fd); +int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_AV1_DECODER_INSPECTION_H_ diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c new file mode 100644 index 0000000000..0e31ce9404 --- /dev/null +++ b/third_party/aom/av1/decoder/obu.c @@ -0,0 +1,1101 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/bitreader_buffer.h" +#include "aom_ports/mem_ops.h" + +#include "av1/common/common.h" +#include "av1/common/obu_util.h" +#include "av1/common/timing.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/obu.h" + +aom_codec_err_t aom_get_num_layers_from_operating_point_idc( + int operating_point_idc, unsigned int *number_spatial_layers, + unsigned int *number_temporal_layers) { + // derive number of spatial/temporal layers from operating_point_idc + + if (!number_spatial_layers || !number_temporal_layers) + return AOM_CODEC_INVALID_PARAM; + + if (operating_point_idc == 0) { + *number_temporal_layers = 1; + *number_spatial_layers = 1; + } else { + *number_spatial_layers = 0; + *number_temporal_layers = 0; + for (int j = 0; j < MAX_NUM_SPATIAL_LAYERS; j++) { + *number_spatial_layers += + (operating_point_idc >> (j + MAX_NUM_TEMPORAL_LAYERS)) & 0x1; + } + for (int j = 0; j < MAX_NUM_TEMPORAL_LAYERS; j++) { + *number_temporal_layers += (operating_point_idc >> j) & 0x1; + } + } + + return AOM_CODEC_OK; +} + +static int is_obu_in_current_operating_point(AV1Decoder *pbi, + const ObuHeader *obu_header) { + if (!pbi->current_operating_point || !obu_header->has_extension) { + return 1; + } + + if ((pbi->current_operating_point >> obu_header->temporal_layer_id) & 0x1 && + (pbi->current_operating_point >> (obu_header->spatial_layer_id + 8)) & + 0x1) { + return 1; + } + return 0; +} + +static int byte_alignment(AV1_COMMON *const cm, + struct aom_read_bit_buffer *const rb) { + while (rb->bit_offset & 7) { + if (aom_rb_read_bit(rb)) { + cm->error->error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + return 0; +} + +static uint32_t read_temporal_delimiter_obu(void) { return 0; } + +// Returns a boolean that indicates success. +static int read_bitstream_level(AV1_LEVEL *seq_level_idx, + struct aom_read_bit_buffer *rb) { + *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); + if (!is_valid_seq_level_idx(*seq_level_idx)) return 0; + return 1; +} + +// Returns whether two sequence headers are consistent with each other. +// Note that the 'op_params' field is not compared per Section 7.5 in the spec: +// Within a particular coded video sequence, the contents of +// sequence_header_obu must be bit-identical each time the sequence header +// appears except for the contents of operating_parameters_info. +static int are_seq_headers_consistent(const SequenceHeader *seq_params_old, + const SequenceHeader *seq_params_new) { + return !memcmp(seq_params_old, seq_params_new, + offsetof(SequenceHeader, op_params)); +} + +// On success, sets pbi->sequence_header_ready to 1 and returns the number of +// bytes read from 'rb'. +// On failure, sets pbi->common.error.error_code and returns 0. +static uint32_t read_sequence_header_obu(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + const uint32_t saved_bit_offset = rb->bit_offset; + + // Verify rb has been configured to report errors. + assert(rb->error_handler); + + // Use a local variable to store the information as we decode. At the end, + // if no errors have occurred, cm->seq_params is updated. 
+ SequenceHeader sh = *cm->seq_params; + SequenceHeader *const seq_params = &sh; + + seq_params->profile = av1_read_profile(rb); + if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) { + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + + // Still picture or not + seq_params->still_picture = aom_rb_read_bit(rb); + seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb); + // Video must have reduced_still_picture_hdr = 0 + if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) { + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + + if (seq_params->reduced_still_picture_hdr) { + seq_params->timing_info_present = 0; + seq_params->decoder_model_info_present_flag = 0; + seq_params->display_model_info_present_flag = 0; + seq_params->operating_points_cnt_minus_1 = 0; + seq_params->operating_point_idc[0] = 0; + if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) { + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + seq_params->tier[0] = 0; + seq_params->op_params[0].decoder_model_param_present_flag = 0; + seq_params->op_params[0].display_model_param_present_flag = 0; + } else { + seq_params->timing_info_present = aom_rb_read_bit(rb); + if (seq_params->timing_info_present) { + av1_read_timing_info_header(&seq_params->timing_info, &pbi->error, rb); + + seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb); + if (seq_params->decoder_model_info_present_flag) + av1_read_decoder_model_info(&seq_params->decoder_model_info, rb); + } else { + seq_params->decoder_model_info_present_flag = 0; + } + seq_params->display_model_info_present_flag = aom_rb_read_bit(rb); + seq_params->operating_points_cnt_minus_1 = + aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); + for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { + seq_params->operating_point_idc[i] = + aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); + if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) { + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7 + // is equivalent to level 3.3. 
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) + seq_params->tier[i] = aom_rb_read_bit(rb); + else + seq_params->tier[i] = 0; + if (seq_params->decoder_model_info_present_flag) { + seq_params->op_params[i].decoder_model_param_present_flag = + aom_rb_read_bit(rb); + if (seq_params->op_params[i].decoder_model_param_present_flag) + av1_read_op_parameters_info(&seq_params->op_params[i], + seq_params->decoder_model_info + .encoder_decoder_buffer_delay_length, + rb); + } else { + seq_params->op_params[i].decoder_model_param_present_flag = 0; + } + if (seq_params->timing_info_present && + (seq_params->timing_info.equal_picture_interval || + seq_params->op_params[i].decoder_model_param_present_flag)) { + seq_params->op_params[i].bitrate = av1_max_level_bitrate( + seq_params->profile, seq_params->seq_level_idx[i], + seq_params->tier[i]); + // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass + // the check + if (seq_params->op_params[i].bitrate == 0) + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of " + "profile, level, and tier."); + // Buffer size in bits/s is bitrate in bits/s * 1 s + seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; + } + if (seq_params->timing_info_present && + seq_params->timing_info.equal_picture_interval && + !seq_params->op_params[i].decoder_model_param_present_flag) { + // When the decoder_model_parameters are not sent for this op, set + // the default ones that can be used with the resource availability mode + seq_params->op_params[i].decoder_buffer_delay = 70000; + seq_params->op_params[i].encoder_buffer_delay = 20000; + seq_params->op_params[i].low_delay_mode_flag = 0; + } + + if (seq_params->display_model_info_present_flag) { + seq_params->op_params[i].display_model_param_present_flag = + aom_rb_read_bit(rb); + if (seq_params->op_params[i].display_model_param_present_flag) { + seq_params->op_params[i].initial_display_delay = + aom_rb_read_literal(rb, 4) + 1; + if (seq_params->op_params[i].initial_display_delay > 10) + aom_internal_error( + &pbi->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support more than 10 decoded frames delay"); + } else { + seq_params->op_params[i].initial_display_delay = 10; + } + } else { + seq_params->op_params[i].display_model_param_present_flag = 0; + seq_params->op_params[i].initial_display_delay = 10; + } + } + } + // This decoder supports all levels. 
Choose operating point provided by + // external means + int operating_point = pbi->operating_point; + if (operating_point < 0 || + operating_point > seq_params->operating_points_cnt_minus_1) + operating_point = 0; + pbi->current_operating_point = + seq_params->operating_point_idc[operating_point]; + if (aom_get_num_layers_from_operating_point_idc( + pbi->current_operating_point, &pbi->number_spatial_layers, + &pbi->number_temporal_layers) != AOM_CODEC_OK) { + pbi->error.error_code = AOM_CODEC_ERROR; + return 0; + } + + av1_read_sequence_header(cm, rb, seq_params); + + av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &pbi->error); + if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) && + !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) && + !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) { + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, + "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, " + "%d %d subsampling is not supported.\n", + seq_params->subsampling_x, seq_params->subsampling_y); + } + + seq_params->film_grain_params_present = aom_rb_read_bit(rb); + + if (av1_check_trailing_bits(pbi, rb) != 0) { + // pbi->error.error_code is already set. + return 0; + } + + // If a sequence header has been decoded before, we check if the new + // one is consistent with the old one. + if (pbi->sequence_header_ready) { + if (!are_seq_headers_consistent(cm->seq_params, seq_params)) + pbi->sequence_header_changed = 1; + } + + *cm->seq_params = *seq_params; + pbi->sequence_header_ready = 1; + + return ((rb->bit_offset - saved_bit_offset + 7) >> 3); +} + +// On success, returns the frame header size. On failure, calls +// aom_internal_error and does not return. If show existing frame, +// also marks the data processing to end after the frame header. +static uint32_t read_frame_header_obu(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present) { + const uint32_t hdr_size = + av1_decode_frame_headers_and_setup(pbi, rb, trailing_bits_present); + const AV1_COMMON *cm = &pbi->common; + if (cm->show_existing_frame) { + *p_data_end = data + hdr_size; + } + return hdr_size; +} + +// On success, returns the tile group header size. On failure, calls +// aom_internal_error() and returns -1. 
+static int32_t read_tile_group_header(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + int *start_tile, int *end_tile, + int tile_start_implicit) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + uint32_t saved_bit_offset = rb->bit_offset; + int tile_start_and_end_present_flag = 0; + const int num_tiles = tiles->rows * tiles->cols; + + if (!tiles->large_scale && num_tiles > 1) { + tile_start_and_end_present_flag = aom_rb_read_bit(rb); + if (tile_start_implicit && tile_start_and_end_present_flag) { + aom_internal_error( + &pbi->error, AOM_CODEC_UNSUP_BITSTREAM, + "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0"); + return -1; + } + } + if (tiles->large_scale || num_tiles == 1 || + !tile_start_and_end_present_flag) { + *start_tile = 0; + *end_tile = num_tiles - 1; + } else { + int tile_bits = tiles->log2_rows + tiles->log2_cols; + *start_tile = aom_rb_read_literal(rb, tile_bits); + *end_tile = aom_rb_read_literal(rb, tile_bits); + } + if (*start_tile != pbi->next_start_tile) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "tg_start (%d) must be equal to %d", *start_tile, + pbi->next_start_tile); + return -1; + } + if (*start_tile > *end_tile) { + aom_internal_error( + &pbi->error, AOM_CODEC_CORRUPT_FRAME, + "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile, + *start_tile); + return -1; + } + if (*end_tile >= num_tiles) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "tg_end (%d) must be less than NumTiles (%d)", *end_tile, + num_tiles); + return -1; + } + pbi->next_start_tile = (*end_tile == num_tiles - 1) ? 0 : *end_tile + 1; + + return ((rb->bit_offset - saved_bit_offset + 7) >> 3); +} + +// On success, returns the tile group OBU size. On failure, sets +// pbi->common.error.error_code and returns 0. +static uint32_t read_one_tile_group_obu( + AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int is_first_tg, + const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end, + int *is_last_tg, int tile_start_implicit) { + AV1_COMMON *const cm = &pbi->common; + int start_tile, end_tile; + int32_t header_size, tg_payload_size; + + assert((rb->bit_offset & 7) == 0); + assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data); + + header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile, + tile_start_implicit); + if (header_size == -1 || byte_alignment(cm, rb)) return 0; + data += header_size; + av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile, + end_tile, is_first_tg); + + tg_payload_size = (uint32_t)(*p_data_end - data); + + *is_last_tg = end_tile == cm->tiles.rows * cm->tiles.cols - 1; + return header_size + tg_payload_size; +} + +static void alloc_tile_list_buffer(AV1Decoder *pbi) { + // The resolution of the output frame is read out from the bitstream. The data + // are stored in the order of Y plane, U plane and V plane. As an example, for + // image format 4:2:0, the output frame of U plane and V plane is 1/4 of the + // output frame. 
+ AV1_COMMON *const cm = &pbi->common; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels = tile_height * MI_SIZE; + const int output_frame_width = + (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels; + const int output_frame_height = + (pbi->output_frame_height_in_tiles_minus_1 + 1) * tile_height_in_pixels; + // The output frame is used to store the decoded tile list. The decoded tile + // list has to fit into 1 output frame. + assert((pbi->tile_count_minus_1 + 1) <= + (pbi->output_frame_width_in_tiles_minus_1 + 1) * + (pbi->output_frame_height_in_tiles_minus_1 + 1)); + + // Allocate the tile list output buffer. + // Note: if cm->seq_params->use_highbitdepth is 1 and + // cm->seq_params->bit_depth is 8, we could allocate less memory, namely, 8 + // bits/pixel. + if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width, + output_frame_height, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, + (cm->seq_params->use_highbitdepth && + (cm->seq_params->bit_depth > AOM_BITS_8)), + 0, cm->features.byte_alignment, 0, 0)) + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate the tile list output buffer"); +} + +static void yv12_tile_copy(const YV12_BUFFER_CONFIG *src, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst, int hstart2, int vstart2, + int plane) { + const int src_stride = (plane > 0) ? src->strides[1] : src->strides[0]; + const int dst_stride = (plane > 0) ? dst->strides[1] : dst->strides[0]; + int row, col; + + assert(src->flags & YV12_FLAG_HIGHBITDEPTH); + assert(!(dst->flags & YV12_FLAG_HIGHBITDEPTH)); + + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src->buffers[plane] + vstart1 * src_stride + hstart1); + uint8_t *dst8 = dst->buffers[plane] + vstart2 * dst_stride + hstart2; + + for (row = vstart1; row < vend1; ++row) { + for (col = 0; col < (hend1 - hstart1); ++col) *dst8++ = (uint8_t)(*src16++); + src16 += src_stride - (hend1 - hstart1); + dst8 += dst_stride - (hend1 - hstart1); + } + return; +} + +static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, + int tile_idx) { + AV1_COMMON *const cm = &pbi->common; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels = tile_height * MI_SIZE; + const int ssy = cm->seq_params->subsampling_y; + const int ssx = cm->seq_params->subsampling_x; + const int num_planes = av1_num_planes(cm); + + YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf; + const int tr = tile_idx / (pbi->output_frame_width_in_tiles_minus_1 + 1); + const int tc = tile_idx % (pbi->output_frame_width_in_tiles_minus_1 + 1); + int plane; + + // Copy decoded tile to the tile list output buffer. + for (plane = 0; plane < num_planes; ++plane) { + const int shift_x = plane > 0 ? ssx : 0; + const int shift_y = plane > 0 ? 
ssy : 0; + const int h = tile_height_in_pixels >> shift_y; + const int w = tile_width_in_pixels >> shift_x; + + // src offset + int vstart1 = pbi->dec_tile_row * h; + int vend1 = vstart1 + h; + int hstart1 = pbi->dec_tile_col * w; + int hend1 = hstart1 + w; + // dst offset + int vstart2 = tr * h; + int hstart2 = tc * w; + + if (cm->seq_params->use_highbitdepth && + cm->seq_params->bit_depth == AOM_BITS_8) { + yv12_tile_copy(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2, plane); + } else { + switch (plane) { + case 0: + aom_yv12_partial_copy_y(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2); + break; + case 1: + aom_yv12_partial_copy_u(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2); + break; + case 2: + aom_yv12_partial_copy_v(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2); + break; + default: assert(0); + } + } + } +} + +// Only called while large_scale_tile = 1. +// +// On success, returns the tile list OBU size. On failure, sets +// pbi->common.error.error_code and returns 0. +static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, + int *frame_decoding_finished) { + AV1_COMMON *const cm = &pbi->common; + uint32_t tile_list_payload_size = 0; + const int num_tiles = cm->tiles.cols * cm->tiles.rows; + const int start_tile = 0; + const int end_tile = num_tiles - 1; + int i = 0; + + // Process the tile list info. + pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); + if (pbi->tile_count_minus_1 > MAX_TILES - 1) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + // Allocate output frame buffer for the tile list. + alloc_tile_list_buffer(pbi); + + uint32_t tile_list_info_bytes = 4; + tile_list_payload_size += tile_list_info_bytes; + data += tile_list_info_bytes; + + int tile_idx = 0; + for (i = 0; i <= pbi->tile_count_minus_1; i++) { + // Process 1 tile. + // Reset the bit reader. + rb->bit_offset = 0; + rb->bit_buffer = data; + + // Read out the tile info. + uint32_t tile_info_bytes = 5; + // Set reference for each tile. + int ref_idx = aom_rb_read_literal(rb, 8); + if (ref_idx >= MAX_EXTERNAL_REFERENCES) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1, + &pbi->ext_refs.refs[ref_idx]); + + pbi->dec_tile_row = aom_rb_read_literal(rb, 8); + pbi->dec_tile_col = aom_rb_read_literal(rb, 8); + if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 || + pbi->dec_tile_row >= cm->tiles.rows || + pbi->dec_tile_col >= cm->tiles.cols) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1; + data += tile_info_bytes; + if ((size_t)(data_end - data) < pbi->coded_tile_data_size) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size, + p_data_end, start_tile, end_tile, 0); + uint32_t tile_payload_size = (uint32_t)(*p_data_end - data); + + tile_list_payload_size += tile_info_bytes + tile_payload_size; + + // Update data ptr for next tile decoding. 
+ data = *p_data_end; + assert(data <= data_end); + + // Copy the decoded tile to the tile list output buffer. + copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx); + tile_idx++; + } + + *frame_decoding_finished = 1; + return tile_list_payload_size; +} + +// Returns the last nonzero byte index in 'data'. If there is no nonzero byte in +// 'data', returns -1. +static int get_last_nonzero_byte_index(const uint8_t *data, size_t sz) { + // Scan backward and return on the first nonzero byte. + int i = (int)sz - 1; + while (i >= 0 && data[i] == 0) { + --i; + } + return i; +} + +// Allocates metadata that was read and adds it to the decoders metadata array. +static void alloc_read_metadata(AV1Decoder *const pbi, + OBU_METADATA_TYPE metadata_type, + const uint8_t *data, size_t sz, + aom_metadata_insert_flags_t insert_flag) { + if (!pbi->metadata) { + pbi->metadata = aom_img_metadata_array_alloc(0); + if (!pbi->metadata) { + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate metadata array"); + } + } + aom_metadata_t *metadata = + aom_img_metadata_alloc(metadata_type, data, sz, insert_flag); + if (!metadata) { + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, + "Error allocating metadata"); + } + aom_metadata_t **metadata_array = + (aom_metadata_t **)realloc(pbi->metadata->metadata_array, + (pbi->metadata->sz + 1) * sizeof(metadata)); + if (!metadata_array) { + aom_img_metadata_free(metadata); + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, + "Error growing metadata array"); + } + pbi->metadata->metadata_array = metadata_array; + pbi->metadata->metadata_array[pbi->metadata->sz] = metadata; + pbi->metadata->sz++; +} + +// On failure, calls aom_internal_error() and does not return. +static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, + size_t sz) { + if (sz == 0) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "itu_t_t35_country_code is missing"); + } + int country_code_size = 1; + if (*data == 0xFF) { + if (sz == 1) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "itu_t_t35_country_code_extension_byte is missing"); + } + ++country_code_size; + } + int end_index = get_last_nonzero_byte_index(data, sz); + if (end_index < country_code_size) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "No trailing bits found in ITU-T T.35 metadata OBU"); + } + // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says: + // itu_t_t35_payload_bytes shall be bytes containing data registered as + // specified in Recommendation ITU-T T.35. + // Therefore the first trailing byte should be 0x80. + if (data[end_index] != 0x80) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "The last nonzero byte of the ITU-T T.35 metadata OBU " + "is 0x%02x, should be 0x80.", + data[end_index]); + } + alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, end_index, + AOM_MIF_ANY_FRAME); +} + +// On success, returns the number of bytes read from 'data'. On failure, calls +// aom_internal_error() and does not return. +static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data, + size_t sz) { + const size_t kHdrCllPayloadSize = 4; + if (sz < kHdrCllPayloadSize) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Incorrect HDR CLL metadata payload size"); + } + alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize, + AOM_MIF_ANY_FRAME); + return kHdrCllPayloadSize; +} + +// On success, returns the number of bytes read from 'data'. 
On failure, calls +// aom_internal_error() and does not return. +static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data, + size_t sz) { + const size_t kMdcvPayloadSize = 24; + if (sz < kMdcvPayloadSize) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Incorrect HDR MDCV metadata payload size"); + } + alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize, + AOM_MIF_ANY_FRAME); + return kMdcvPayloadSize; +} + +static void scalability_structure(struct aom_read_bit_buffer *rb) { + const int spatial_layers_cnt_minus_1 = aom_rb_read_literal(rb, 2); + const int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb); + const int spatial_layer_description_present_flag = aom_rb_read_bit(rb); + const int temporal_group_description_present_flag = aom_rb_read_bit(rb); + // scalability_structure_reserved_3bits must be set to zero and be ignored by + // decoders. + aom_rb_read_literal(rb, 3); + + if (spatial_layer_dimensions_present_flag) { + for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) { + aom_rb_read_literal(rb, 16); + aom_rb_read_literal(rb, 16); + } + } + if (spatial_layer_description_present_flag) { + for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) { + aom_rb_read_literal(rb, 8); + } + } + if (temporal_group_description_present_flag) { + const int temporal_group_size = aom_rb_read_literal(rb, 8); + for (int i = 0; i < temporal_group_size; i++) { + aom_rb_read_literal(rb, 3); + aom_rb_read_bit(rb); + aom_rb_read_bit(rb); + const int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3); + for (int j = 0; j < temporal_group_ref_cnt; j++) { + aom_rb_read_literal(rb, 8); + } + } + } +} + +static void read_metadata_scalability(struct aom_read_bit_buffer *rb) { + const int scalability_mode_idc = aom_rb_read_literal(rb, 8); + if (scalability_mode_idc == SCALABILITY_SS) { + scalability_structure(rb); + } +} + +static void read_metadata_timecode(struct aom_read_bit_buffer *rb) { + aom_rb_read_literal(rb, 5); // counting_type f(5) + const int full_timestamp_flag = + aom_rb_read_bit(rb); // full_timestamp_flag f(1) + aom_rb_read_bit(rb); // discontinuity_flag (f1) + aom_rb_read_bit(rb); // cnt_dropped_flag f(1) + aom_rb_read_literal(rb, 9); // n_frames f(9) + if (full_timestamp_flag) { + aom_rb_read_literal(rb, 6); // seconds_value f(6) + aom_rb_read_literal(rb, 6); // minutes_value f(6) + aom_rb_read_literal(rb, 5); // hours_value f(5) + } else { + const int seconds_flag = aom_rb_read_bit(rb); // seconds_flag f(1) + if (seconds_flag) { + aom_rb_read_literal(rb, 6); // seconds_value f(6) + const int minutes_flag = aom_rb_read_bit(rb); // minutes_flag f(1) + if (minutes_flag) { + aom_rb_read_literal(rb, 6); // minutes_value f(6) + const int hours_flag = aom_rb_read_bit(rb); // hours_flag f(1) + if (hours_flag) { + aom_rb_read_literal(rb, 5); // hours_value f(5) + } + } + } + } + // time_offset_length f(5) + const int time_offset_length = aom_rb_read_literal(rb, 5); + if (time_offset_length) { + // time_offset_value f(time_offset_length) + aom_rb_read_literal(rb, time_offset_length); + } +} + +// Returns the last nonzero byte in 'data'. If there is no nonzero byte in +// 'data', returns 0. +// +// Call this function to check the following requirement in the spec: +// This implies that when any payload data is present for this OBU type, at +// least one byte of the payload data (including the trailing bit) shall not +// be equal to 0. 
+static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) { + // Scan backward and return on the first nonzero byte. + size_t i = sz; + while (i != 0) { + --i; + if (data[i] != 0) return data[i]; + } + return 0; +} + +// Checks the metadata for correct syntax but ignores the parsed metadata. +// +// On success, returns the number of bytes read from 'data'. On failure, sets +// pbi->common.error.error_code and returns 0, or calls aom_internal_error() +// and does not return. +static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { + size_t type_length; + uint64_t type_value; + if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value; + if (metadata_type == 0 || metadata_type >= 6) { + // If metadata_type is reserved for future use or a user private value, + // ignore the entire OBU and just check trailing bits. + if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } + if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) { + // read_metadata_itut_t35() checks trailing bits. + read_metadata_itut_t35(pbi, data + type_length, sz - type_length); + return sz; + } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) { + size_t bytes_read = + type_length + + read_metadata_hdr_cll(pbi, data + type_length, sz - type_length); + if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) { + size_t bytes_read = + type_length + + read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length); + if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } + + struct aom_read_bit_buffer rb; + av1_init_read_bit_buffer(pbi, &rb, data + type_length, data + sz); + if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) { + read_metadata_scalability(&rb); + } else { + assert(metadata_type == OBU_METADATA_TYPE_TIMECODE); + read_metadata_timecode(&rb); + } + if (av1_check_trailing_bits(pbi, &rb) != 0) { + // pbi->error.error_code is already set. + return 0; + } + assert((rb.bit_offset & 7) == 0); + return type_length + (rb.bit_offset >> 3); +} + +// On success, returns 'sz'. On failure, sets pbi->common.error.error_code and +// returns 0. +static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data, + size_t sz) { + // The spec allows a padding OBU to be header-only (i.e., obu_size = 0). So + // check trailing bits only if sz > 0. + if (sz > 0) { + // The payload of a padding OBU is byte aligned. Therefore the first + // trailing byte should be 0x80. See https://crbug.com/aomedia/2393. + const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz); + if (last_nonzero_byte != 0x80) { + cm->error->error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + } + return sz; +} + +// On success, returns a boolean that indicates whether the decoding of the +// current frame is finished. On failure, sets pbi->error.error_code and +// returns -1. 
+int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end) { + AV1_COMMON *const cm = &pbi->common; + int frame_decoding_finished = 0; + int is_first_tg_obu_received = 1; + // Whenever pbi->seen_frame_header is set to 1, frame_header is set to the + // beginning of the frame_header_obu and frame_header_size is set to its + // size. This allows us to check if a redundant frame_header_obu is a copy + // of the previous frame_header_obu. + // + // Initialize frame_header to a dummy nonnull pointer, otherwise the Clang + // Static Analyzer in clang 7.0.1 will falsely warn that a null pointer is + // passed as an argument to a 'nonnull' parameter of memcmp(). The initial + // value will not be used. + const uint8_t *frame_header = data; + uint32_t frame_header_size = 0; + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + pbi->seen_frame_header = 0; + pbi->next_start_tile = 0; + pbi->num_tile_groups = 0; + + if (data_end < data) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + // Reset pbi->camera_frame_header_ready to 0 if cm->tiles.large_scale = 0. + if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0; + + // decode frame as a series of OBUs + while (!frame_decoding_finished && pbi->error.error_code == AOM_CODEC_OK) { + struct aom_read_bit_buffer rb; + size_t payload_size = 0; + size_t decoded_payload_size = 0; + size_t obu_payload_offset = 0; + size_t bytes_read = 0; + const size_t bytes_available = data_end - data; + + if (bytes_available == 0 && !pbi->seen_frame_header) { + *p_data_end = data; + pbi->error.error_code = AOM_CODEC_OK; + break; + } + + aom_codec_err_t status = + aom_read_obu_header_and_size(data, bytes_available, pbi->is_annexb, + &obu_header, &payload_size, &bytes_read); + + if (status != AOM_CODEC_OK) { + pbi->error.error_code = status; + return -1; + } + + // Record obu size header information. + pbi->obu_size_hdr.data = data + obu_header.size; + pbi->obu_size_hdr.size = bytes_read - obu_header.size; + + // Note: aom_read_obu_header_and_size() takes care of checking that this + // doesn't cause 'data' to advance past 'data_end'. + data += bytes_read; + + if ((size_t)(data_end - data) < payload_size) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + cm->temporal_layer_id = obu_header.temporal_layer_id; + cm->spatial_layer_id = obu_header.spatial_layer_id; + + if (obu_header.type != OBU_TEMPORAL_DELIMITER && + obu_header.type != OBU_SEQUENCE_HEADER) { + // don't decode obu if it's not in current operating mode + if (!is_obu_in_current_operating_point(pbi, &obu_header)) { + data += payload_size; + continue; + } + } + + av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size); + + switch (obu_header.type) { + case OBU_TEMPORAL_DELIMITER: + decoded_payload_size = read_temporal_delimiter_obu(); + if (pbi->seen_frame_header) { + // A new temporal unit has started, but the frame in the previous + // temporal unit is incomplete. + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + break; + case OBU_SEQUENCE_HEADER: + decoded_payload_size = read_sequence_header_obu(pbi, &rb); + if (pbi->error.error_code != AOM_CODEC_OK) return -1; + // The sequence header should not change in the middle of a frame. 
+ if (pbi->sequence_header_changed && pbi->seen_frame_header) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + break; + case OBU_FRAME_HEADER: + case OBU_REDUNDANT_FRAME_HEADER: + case OBU_FRAME: + if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) { + if (!pbi->seen_frame_header) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } else { + // OBU_FRAME_HEADER or OBU_FRAME. + if (pbi->seen_frame_header) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + // Only decode first frame header received + if (!pbi->seen_frame_header || + (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) { + frame_header_size = read_frame_header_obu( + pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME); + frame_header = data; + pbi->seen_frame_header = 1; + if (!pbi->ext_tile_debug && cm->tiles.large_scale) + pbi->camera_frame_header_ready = 1; + } else { + // Verify that the frame_header_obu is identical to the original + // frame_header_obu. + if (frame_header_size > payload_size || + memcmp(data, frame_header, frame_header_size) != 0) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + assert(rb.bit_offset == 0); + rb.bit_offset = 8 * frame_header_size; + } + + decoded_payload_size = frame_header_size; + pbi->frame_header_size = frame_header_size; + cm->cur_frame->temporal_id = obu_header.temporal_layer_id; + cm->cur_frame->spatial_id = obu_header.spatial_layer_id; + + if (cm->show_existing_frame) { + if (obu_header.type == OBU_FRAME) { + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return -1; + } + frame_decoding_finished = 1; + pbi->seen_frame_header = 0; + + if (cm->show_frame && + !cm->seq_params->order_hint_info.enable_order_hint) { + ++cm->current_frame.frame_number; + } + break; + } + + // In large scale tile coding, decode the common camera frame header + // before any tile list OBU. + if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) { + frame_decoding_finished = 1; + // Skip the rest of the frame data. + decoded_payload_size = payload_size; + // Update data_end. + *p_data_end = data_end; + break; + } + + if (obu_header.type != OBU_FRAME) break; + obu_payload_offset = frame_header_size; + // Byte align the reader before reading the tile group. + // byte_alignment() has set pbi->error.error_code if it returns -1. + if (byte_alignment(cm, &rb)) return -1; + AOM_FALLTHROUGH_INTENDED; // fall through to read tile group. + case OBU_TILE_GROUP: + if (!pbi->seen_frame_header) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + if (obu_payload_offset > payload_size) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + decoded_payload_size += read_one_tile_group_obu( + pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset, + data + payload_size, p_data_end, &frame_decoding_finished, + obu_header.type == OBU_FRAME); + if (pbi->error.error_code != AOM_CODEC_OK) return -1; + is_first_tg_obu_received = 0; + if (frame_decoding_finished) { + pbi->seen_frame_header = 0; + pbi->next_start_tile = 0; + } + pbi->num_tile_groups++; + break; + case OBU_METADATA: + decoded_payload_size = read_metadata(pbi, data, payload_size); + if (pbi->error.error_code != AOM_CODEC_OK) return -1; + break; + case OBU_TILE_LIST: + if (CONFIG_NORMAL_TILE_MODE) { + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return -1; + } + + // This OBU type is purely for the large scale tile coding mode. + // The common camera frame header has to be already decoded. 
+ if (!pbi->camera_frame_header_ready) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + cm->tiles.large_scale = 1; + av1_set_single_tile_decoding_mode(cm); + decoded_payload_size = + read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size, + p_data_end, &frame_decoding_finished); + if (pbi->error.error_code != AOM_CODEC_OK) return -1; + break; + case OBU_PADDING: + decoded_payload_size = read_padding(cm, data, payload_size); + if (pbi->error.error_code != AOM_CODEC_OK) return -1; + break; + default: + // Skip unrecognized OBUs + if (payload_size > 0 && + get_last_nonzero_byte(data, payload_size) == 0) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + decoded_payload_size = payload_size; + break; + } + + // Check that the signalled OBU size matches the actual amount of data read + if (decoded_payload_size > payload_size) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + // If there are extra padding bytes, they should all be zero + while (decoded_payload_size < payload_size) { + uint8_t padding_byte = data[decoded_payload_size++]; + if (padding_byte != 0) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + + data += payload_size; + } + + if (pbi->error.error_code != AOM_CODEC_OK) return -1; + return frame_decoding_finished; +} diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h new file mode 100644 index 0000000000..d8ebe368e6 --- /dev/null +++ b/third_party/aom/av1/decoder/obu.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_OBU_H_ +#define AOM_AV1_DECODER_OBU_H_ + +#include "aom/aom_codec.h" +#include "av1/decoder/decoder.h" + +// Try to decode one frame from a buffer. +// Returns 1 if we decoded a frame, +// 0 if we didn't decode a frame but that's okay +// (eg, if there was a frame but we skipped it), +// or -1 on error +int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end); + +aom_codec_err_t aom_get_num_layers_from_operating_point_idc( + int operating_point_idc, unsigned int *number_spatial_layers, + unsigned int *number_temporal_layers); + +#endif // AOM_AV1_DECODER_OBU_H_ diff --git a/third_party/aom/av1/encoder/allintra_vis.c b/third_party/aom/av1/encoder/allintra_vis.c new file mode 100644 index 0000000000..8dcef5fc85 --- /dev/null +++ b/third_party/aom/av1/encoder/allintra_vis.c @@ -0,0 +1,1055 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_config.h" + +#if CONFIG_TFLITE +#include "tensorflow/lite/c/c_api.h" +#include "av1/encoder/deltaq4_model.c" +#endif + +#include "av1/common/common_data.h" +#include "av1/common/enums.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/encoder/allintra_vis.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/rdopt_utils.h" + +#define MB_WIENER_PRED_BLOCK_SIZE BLOCK_128X128 +#define MB_WIENER_PRED_BUF_STRIDE 128 + +void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td) { + const int is_high_bitdepth = is_cur_buf_hbd(&td->mb.e_mbd); + assert(MB_WIENER_PRED_BLOCK_SIZE < BLOCK_SIZES_ALL); + const int buf_width = block_size_wide[MB_WIENER_PRED_BLOCK_SIZE]; + const int buf_height = block_size_high[MB_WIENER_PRED_BLOCK_SIZE]; + assert(buf_width == MB_WIENER_PRED_BUF_STRIDE); + const size_t buf_size = + (buf_width * buf_height * sizeof(*td->wiener_tmp_pred_buf)) + << is_high_bitdepth; + CHECK_MEM_ERROR(cm, td->wiener_tmp_pred_buf, aom_memalign(32, buf_size)); +} + +void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td) { + aom_free(td->wiener_tmp_pred_buf); + td->wiener_tmp_pred_buf = NULL; +} + +void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + + // This block size is also used to determine number of workers in + // multi-threading. If it is changed, one needs to change it accordingly in + // "compute_num_ai_workers()". + cpi->weber_bsize = BLOCK_8X8; + + if (cpi->oxcf.enable_rate_guide_deltaq) { + if (cpi->mb_weber_stats && cpi->prep_rate_estimates && + cpi->ext_rate_distribution) + return; + } else { + if (cpi->mb_weber_stats) return; + } + + CHECK_MEM_ERROR(cm, cpi->mb_weber_stats, + aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols, + sizeof(*cpi->mb_weber_stats))); + + if (cpi->oxcf.enable_rate_guide_deltaq) { + CHECK_MEM_ERROR( + cm, cpi->prep_rate_estimates, + aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols, + sizeof(*cpi->prep_rate_estimates))); + + CHECK_MEM_ERROR( + cm, cpi->ext_rate_distribution, + aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols, + sizeof(*cpi->ext_rate_distribution))); + } +} + +static int64_t get_satd(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + const int mi_step = mi_size_wide[cpi->weber_bsize]; + int mb_stride = cpi->frame_info.mi_cols; + int mb_count = 0; + int64_t satd = 0; + + for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { + for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + + satd += cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)] + .satd; + ++mb_count; + } + } + + if (mb_count) satd = (int)(satd / mb_count); + satd = AOMMAX(1, satd); + + return (int)satd; +} + +static int64_t get_sse(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + const int mi_step = mi_size_wide[cpi->weber_bsize]; + int mb_stride = cpi->frame_info.mi_cols; + int mb_count = 0; + int64_t distortion = 0; + + for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { + for (int col = mi_col; 
col < mi_col + mi_wide; col += mi_step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + + distortion += + cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)] + .distortion; + ++mb_count; + } + } + + if (mb_count) distortion = (int)(distortion / mb_count); + distortion = AOMMAX(1, distortion); + + return (int)distortion; +} + +static double get_max_scale(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int mi_step = mi_size_wide[cpi->weber_bsize]; + int mb_stride = cpi->frame_info.mi_cols; + double min_max_scale = 10.0; + + for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { + for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + WeberStats *weber_stats = + &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]; + if (weber_stats->max_scale < 1.0) continue; + if (weber_stats->max_scale < min_max_scale) + min_max_scale = weber_stats->max_scale; + } + } + return min_max_scale; +} + +static int get_window_wiener_var(AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + const int mi_step = mi_size_wide[cpi->weber_bsize]; + int sb_wiener_var = 0; + int mb_stride = cpi->frame_info.mi_cols; + int mb_count = 0; + double base_num = 1; + double base_den = 1; + double base_reg = 1; + + for (int row = mi_row; row < mi_row + mi_high; row += mi_step) { + for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + + WeberStats *weber_stats = + &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]; + + base_num += ((double)weber_stats->distortion) * + sqrt((double)weber_stats->src_variance) * + weber_stats->rec_pix_max; + + base_den += fabs( + weber_stats->rec_pix_max * sqrt((double)weber_stats->src_variance) - + weber_stats->src_pix_max * sqrt((double)weber_stats->rec_variance)); + + base_reg += sqrt((double)weber_stats->distortion) * + sqrt((double)weber_stats->src_pix_max) * 0.1; + ++mb_count; + } + } + + sb_wiener_var = + (int)(((base_num + base_reg) / (base_den + base_reg)) / mb_count); + sb_wiener_var = AOMMAX(1, sb_wiener_var); + + return (int)sb_wiener_var; +} + +static int get_var_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + int sb_wiener_var = get_window_wiener_var(cpi, bsize, mi_row, mi_col); + + if (mi_row >= (mi_high / 2)) { + sb_wiener_var = + AOMMIN(sb_wiener_var, + get_window_wiener_var(cpi, bsize, mi_row - mi_high / 2, mi_col)); + } + if (mi_row <= (cm->mi_params.mi_rows - mi_high - (mi_high / 2))) { + sb_wiener_var = + AOMMIN(sb_wiener_var, + get_window_wiener_var(cpi, bsize, mi_row + mi_high / 2, mi_col)); + } + if (mi_col >= (mi_wide / 2)) { + sb_wiener_var = + AOMMIN(sb_wiener_var, + get_window_wiener_var(cpi, bsize, mi_row, mi_col - mi_wide / 2)); + } + if (mi_col <= (cm->mi_params.mi_cols - mi_wide - (mi_wide / 2))) { + sb_wiener_var = + AOMMIN(sb_wiener_var, + get_window_wiener_var(cpi, bsize, mi_row, mi_col + mi_wide / 2)); + } + + return sb_wiener_var; +} + +static 
int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + int rate_cost = 1; + + for (int idx = 0; idx < eob; ++idx) { + int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += (int)(log1p(abs_level) / log(2.0)) + 1 + (abs_level > 0); + } + + return (rate_cost << AV1_PROB_COST_SHIFT); +} + +void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x, + MACROBLOCKD *xd, const int mi_row, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + double *sum_rec_distortion, + double *sum_est_rate, uint8_t *pred_buffer) { + AV1_COMMON *const cm = &cpi->common; + uint8_t *buffer = cpi->source->y_buffer; + int buf_stride = cpi->source->y_stride; + MB_MODE_INFO mbmi; + memset(&mbmi, 0, sizeof(mbmi)); + MB_MODE_INFO *mbmi_ptr = &mbmi; + xd->mi = &mbmi_ptr; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int block_size = tx_size_wide[tx_size]; + const int coeff_count = block_size * block_size; + const int mb_step = mi_size_wide[bsize]; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + const MultiThreadInfo *const mt_info = &cpi->mt_info; + const AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt; + AV1EncRowMultiThreadSync *const intra_row_mt_sync = + &cpi->ppi->intra_row_mt_sync; + const int mi_cols = cm->mi_params.mi_cols; + const int mt_thread_id = mi_row / mb_step; + // TODO(chengchen): test different unit step size + const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; + const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step; + int mt_unit_col = 0; + const int is_high_bitdepth = is_cur_buf_hbd(xd); + + uint8_t *dst_buffer = pred_buffer; + const int dst_buffer_stride = MB_WIENER_PRED_BUF_STRIDE; + + if (is_high_bitdepth) { + uint16_t *pred_buffer_16 = (uint16_t *)pred_buffer; + dst_buffer = CONVERT_TO_BYTEPTR(pred_buffer_16); + } + + for (int mi_col = 0; mi_col < mi_cols; mi_col += mb_step) { + if (mi_col % mt_unit_step == 0) { + intra_mt->intra_sync_read_ptr(intra_row_mt_sync, mt_thread_id, + mt_unit_col); +#if CONFIG_MULTITHREAD + const int num_workers = + AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers); + if (num_workers > 1) { + const AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + pthread_mutex_lock(enc_row_mt->mutex_); + const bool exit = enc_row_mt->mb_wiener_mt_exit; + pthread_mutex_unlock(enc_row_mt->mutex_); + // Stop further processing in case any worker has encountered an error. + if (exit) break; + } +#endif + } + + PREDICTION_MODE best_mode = DC_PRED; + int best_intra_cost = INT_MAX; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width, + AOMMIN(mi_row + mi_height, cm->mi_params.mi_rows), + AOMMIN(mi_col + mi_width, cm->mi_params.mi_cols)); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], + av1_num_planes(cm)); + xd->mi[0]->bsize = bsize; + xd->mi[0]->motion_mode = SIMPLE_TRANSLATION; + // Set above and left mbmi to NULL as they are not available in the + // preprocessing stage. + // They are used to detemine intra edge filter types in intra prediction. 
+ if (xd->up_available) { + xd->above_mbmi = NULL; + } + if (xd->left_available) { + xd->left_mbmi = NULL; + } + uint8_t *mb_buffer = + buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE; + for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END; + ++mode) { + // TODO(chengchen): Here we use src instead of reconstructed frame as + // the intra predictor to make single and multithread version match. + // Ideally we want to use the reconstructed. + av1_predict_intra_block( + xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter, + block_size, block_size, tx_size, mode, 0, 0, FILTER_INTRA_MODES, + mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0); + av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, dst_buffer, dst_buffer_stride); + av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff); + int intra_cost = aom_satd(coeff, coeff_count); + if (intra_cost < best_intra_cost) { + best_intra_cost = intra_cost; + best_mode = mode; + } + } + + av1_predict_intra_block( + xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter, + block_size, block_size, tx_size, best_mode, 0, 0, FILTER_INTRA_MODES, + mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0); + av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, dst_buffer, dst_buffer_stride); + av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff); + + const struct macroblock_plane *const p = &x->plane[0]; + uint16_t eob; + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + QUANT_PARAM quant_param; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, + scan_order, &quant_param); + } else { + av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, + scan_order, &quant_param); + } +#else + av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, scan_order, + &quant_param); +#endif // CONFIG_AV1_HIGHBITDEPTH + + if (cpi->oxcf.enable_rate_guide_deltaq) { + const int rate_cost = rate_estimator(qcoeff, eob, tx_size); + cpi->prep_rate_estimates[(mi_row / mb_step) * cpi->frame_info.mi_cols + + (mi_col / mb_step)] = rate_cost; + } + + av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer, + dst_buffer_stride, eob, 0); + WeberStats *weber_stats = + &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols + + (mi_col / mb_step)]; + + weber_stats->rec_pix_max = 1; + weber_stats->rec_variance = 0; + weber_stats->src_pix_max = 1; + weber_stats->src_variance = 0; + weber_stats->distortion = 0; + + int64_t src_mean = 0; + int64_t rec_mean = 0; + int64_t dist_mean = 0; + + for (int pix_row = 0; pix_row < block_size; ++pix_row) { + for (int pix_col = 0; pix_col < block_size; ++pix_col) { + int src_pix, rec_pix; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer); + uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer); + src_pix = src[pix_row * buf_stride + pix_col]; + rec_pix = rec[pix_row * dst_buffer_stride + pix_col]; + } else { + src_pix = mb_buffer[pix_row * buf_stride + pix_col]; + rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col]; + } +#else + src_pix = mb_buffer[pix_row * buf_stride + pix_col]; + rec_pix = dst_buffer[pix_row * dst_buffer_stride 
+ pix_col]; +#endif + src_mean += src_pix; + rec_mean += rec_pix; + dist_mean += src_pix - rec_pix; + weber_stats->src_variance += src_pix * src_pix; + weber_stats->rec_variance += rec_pix * rec_pix; + weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix); + weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix); + weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix); + } + } + + if (cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) { + *sum_rec_distortion += weber_stats->distortion; + int est_block_rate = 0; + int64_t est_block_dist = 0; + model_rd_sse_fn[MODELRD_LEGACY](cpi, x, bsize, 0, weber_stats->distortion, + pix_num, &est_block_rate, + &est_block_dist); + *sum_est_rate += est_block_rate; + } + + weber_stats->src_variance -= (src_mean * src_mean) / pix_num; + weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num; + weber_stats->distortion -= (dist_mean * dist_mean) / pix_num; + weber_stats->satd = best_intra_cost; + + qcoeff[0] = 0; + int max_scale = 0; + for (int idx = 1; idx < coeff_count; ++idx) { + const int abs_qcoeff = abs(qcoeff[idx]); + max_scale = AOMMAX(max_scale, abs_qcoeff); + } + weber_stats->max_scale = max_scale; + + if ((mi_col + mb_step) % mt_unit_step == 0 || + (mi_col + mb_step) >= mi_cols) { + intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id, + mt_unit_col, mt_unit_cols); + ++mt_unit_col; + } + } + // Set the pointer to null since mbmi is only allocated inside this function. + xd->mi = NULL; +} + +static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion, + double *sum_est_rate) { + MACROBLOCK *x = &cpi->td.mb; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const int mb_step = mi_size_wide[bsize]; + DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); + for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) { + av1_calc_mb_wiener_var_row(cpi, x, xd, mi_row, src_diff, coeff, qcoeff, + dqcoeff, sum_rec_distortion, sum_est_rate, + cpi->td.wiener_tmp_pred_buf); + } +} + +static int64_t estimate_wiener_var_norm(AV1_COMP *const cpi, + const BLOCK_SIZE norm_block_size) { + const AV1_COMMON *const cm = &cpi->common; + int64_t norm_factor = 1; + assert(norm_block_size >= BLOCK_16X16 && norm_block_size <= BLOCK_128X128); + const int norm_step = mi_size_wide[norm_block_size]; + double sb_wiener_log = 0; + double sb_count = 0; + for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) { + for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) { + const int sb_wiener_var = + get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col); + const int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col); + const int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col); + const double scaled_satd = (double)satd / sqrt((double)sse); + sb_wiener_log += scaled_satd * log(sb_wiener_var); + sb_count += scaled_satd; + } + } + if (sb_count > 0) norm_factor = (int64_t)(exp(sb_wiener_log / sb_count)); + norm_factor = AOMMAX(1, norm_factor); + + return norm_factor; +} + +static void automatic_intra_tools_off(AV1_COMP *cpi, + const double sum_rec_distortion, + const double sum_est_rate) { + if (!cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) return; + + // Thresholds + const int high_quality_qindex = 128; + const double high_quality_bpp = 2.0; + const double 
high_quality_dist_per_pix = 4.0; + + AV1_COMMON *const cm = &cpi->common; + const int qindex = cm->quant_params.base_qindex; + const double dist_per_pix = + (double)sum_rec_distortion / (cm->width * cm->height); + // The estimate bpp is not accurate, an empirical constant 100 is divided. + const double estimate_bpp = sum_est_rate / (cm->width * cm->height * 100); + + if (qindex < high_quality_qindex && estimate_bpp > high_quality_bpp && + dist_per_pix < high_quality_dist_per_pix) { + cpi->oxcf.intra_mode_cfg.enable_smooth_intra = 0; + cpi->oxcf.intra_mode_cfg.enable_paeth_intra = 0; + cpi->oxcf.intra_mode_cfg.enable_cfl_intra = 0; + cpi->oxcf.intra_mode_cfg.enable_diagonal_intra = 0; + } +} + +static void ext_rate_guided_quantization(AV1_COMP *cpi) { + // Calculation uses 8x8. + const int mb_step = mi_size_wide[cpi->weber_bsize]; + // Accumulate to 16x16, step size is in the unit of mi. + const int block_step = 4; + + const char *filename = cpi->oxcf.rate_distribution_info; + FILE *pfile = fopen(filename, "r"); + if (pfile == NULL) { + assert(pfile != NULL); + return; + } + + double ext_rate_sum = 0.0; + for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) { + for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) { + float val; + const int fields_converted = fscanf(pfile, "%f", &val); + if (fields_converted != 1) { + assert(fields_converted == 1); + fclose(pfile); + return; + } + ext_rate_sum += val; + cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols + + (col / mb_step)] = val; + } + } + fclose(pfile); + + int uniform_rate_sum = 0; + for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) { + for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) { + int rate_sum = 0; + for (int r = 0; r < block_step; r += mb_step) { + for (int c = 0; c < block_step; c += mb_step) { + const int mi_row = row + r; + const int mi_col = col + c; + rate_sum += cpi->prep_rate_estimates[(mi_row / mb_step) * + cpi->frame_info.mi_cols + + (mi_col / mb_step)]; + } + } + uniform_rate_sum += rate_sum; + } + } + + const double scale = uniform_rate_sum / ext_rate_sum; + cpi->ext_rate_scale = scale; +} + +void av1_set_mb_wiener_variance(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + if (aom_realloc_frame_buffer( + &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->image_pyramid_levels, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td); + cpi->norm_wiener_variance = 0; + + MACROBLOCK *x = &cpi->td.mb; + MACROBLOCKD *xd = &x->e_mbd; + // xd->mi needs to be setup since it is used in av1_frame_init_quantizer. 
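The Weber statistics above accumulate raw moments per block and subtract the mean terms afterwards, i.e. N*variance = sum(x^2) - (sum(x))^2 / N. A minimal standalone sketch of that identity (illustrative data and names, not the encoder's buffers):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int pix[4] = { 10, 12, 9, 13 };
  int64_t sum = 0, sum_sq = 0;
  const int n = 4;
  for (int i = 0; i < n; ++i) {
    sum += pix[i];
    sum_sq += pix[i] * pix[i];
  }
  // Un-normalized variance (N * var), matching the form kept in WeberStats.
  printf("%lld\n", (long long)(sum_sq - (sum * sum) / n));  // prints 10
  return 0;
}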
+ MB_MODE_INFO mbmi; + memset(&mbmi, 0, sizeof(mbmi)); + MB_MODE_INFO *mbmi_ptr = &mbmi; + xd->mi = &mbmi_ptr; + cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level; + av1_frame_init_quantizer(cpi); + + double sum_rec_distortion = 0.0; + double sum_est_rate = 0.0; + + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int num_workers = + AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers); + AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt; + intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read_dummy; + intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write_dummy; + // Calculate differential contrast for each block for the entire image. + // TODO(chengchen): properly accumulate the distortion and rate in + // av1_calc_mb_wiener_var_mt(). Until then, call calc_mb_wiener_var() if + // auto_intra_tools_off is true. + if (num_workers > 1 && !cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) { + intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read; + intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write; + av1_calc_mb_wiener_var_mt(cpi, num_workers, &sum_rec_distortion, + &sum_est_rate); + } else { + calc_mb_wiener_var(cpi, &sum_rec_distortion, &sum_est_rate); + } + + // Determine whether to turn off several intra coding tools. + automatic_intra_tools_off(cpi, sum_rec_distortion, sum_est_rate); + + // Read external rate distribution and use it to guide delta quantization + if (cpi->oxcf.enable_rate_guide_deltaq) ext_rate_guided_quantization(cpi); + + const BLOCK_SIZE norm_block_size = cm->seq_params->sb_size; + cpi->norm_wiener_variance = estimate_wiener_var_norm(cpi, norm_block_size); + const int norm_step = mi_size_wide[norm_block_size]; + + double sb_wiener_log = 0; + double sb_count = 0; + for (int its_cnt = 0; its_cnt < 2; ++its_cnt) { + sb_wiener_log = 0; + sb_count = 0; + for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) { + for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += norm_step) { + int sb_wiener_var = + get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col); + + double beta = (double)cpi->norm_wiener_variance / sb_wiener_var; + double min_max_scale = AOMMAX( + 1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col)); + + beta = AOMMIN(beta, 4); + beta = AOMMAX(beta, 0.25); + + if (beta < 1 / min_max_scale) continue; + + sb_wiener_var = (int)(cpi->norm_wiener_variance / beta); + + int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col); + int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col); + double scaled_satd = (double)satd / sqrt((double)sse); + sb_wiener_log += scaled_satd * log(sb_wiener_var); + sb_count += scaled_satd; + } + } + + if (sb_count > 0) + cpi->norm_wiener_variance = (int64_t)(exp(sb_wiener_log / sb_count)); + cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance); + } + + // Set the pointer to null since mbmi is only allocated inside this function. + xd->mi = NULL; + aom_free_frame_buffer(&cm->cur_frame->buf); + av1_dealloc_mb_wiener_var_pred_buf(&cpi->td); +} + +static int get_rate_guided_quantizer(AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + // Calculation uses 8x8. 
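Both estimate_wiener_var_norm() and the two-pass normalization loop below reduce the per-superblock values with a SATD-weighted geometric mean, exp(sum(w * log(v)) / sum(w)). A small self-contained sketch of that reduction (function name and data are illustrative):

#include <math.h>
#include <stdio.h>

static double weighted_geo_mean(const double *v, const double *w, int n) {
  double log_sum = 0.0, w_sum = 0.0;
  for (int i = 0; i < n; ++i) {
    log_sum += w[i] * log(v[i]);
    w_sum += w[i];
  }
  return w_sum > 0.0 ? exp(log_sum / w_sum) : 1.0;
}

int main(void) {
  const double var[] = { 100.0, 400.0, 1600.0 };  // per-SB wiener variances
  const double weight[] = { 1.0, 2.0, 1.0 };      // e.g. scaled SATD weights
  printf("%f\n", weighted_geo_mean(var, weight, 3));  // prints 400.0
  return 0;
}

The geometric mean keeps a few very large variances from dominating the norm the way an arithmetic mean would.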
+ const int mb_step = mi_size_wide[cpi->weber_bsize]; + // Accumulate to 16x16 + const int block_step = mi_size_wide[BLOCK_16X16]; + double sb_rate_hific = 0.0; + double sb_rate_uniform = 0.0; + for (int row = mi_row; row < mi_row + mi_size_wide[bsize]; + row += block_step) { + for (int col = mi_col; col < mi_col + mi_size_high[bsize]; + col += block_step) { + sb_rate_hific += + cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols + + (col / mb_step)]; + + for (int r = 0; r < block_step; r += mb_step) { + for (int c = 0; c < block_step; c += mb_step) { + const int this_row = row + r; + const int this_col = col + c; + sb_rate_uniform += + cpi->prep_rate_estimates[(this_row / mb_step) * + cpi->frame_info.mi_cols + + (this_col / mb_step)]; + } + } + } + } + sb_rate_hific *= cpi->ext_rate_scale; + + const double weight = 1.0; + const double rate_diff = + weight * (sb_rate_hific - sb_rate_uniform) / sb_rate_uniform; + double scale = pow(2, rate_diff); + + scale = scale * scale; + double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col)); + scale = 1.0 / AOMMIN(1.0 / scale, min_max_scale); + + AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + int offset = + av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, scale); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + const int max_offset = delta_q_info->delta_q_res * 10; + offset = AOMMIN(offset, max_offset - 1); + offset = AOMMAX(offset, -max_offset + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1); + + return qindex; +} + +int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + if (cpi->oxcf.enable_rate_guide_deltaq) { + return get_rate_guided_quantizer(cpi, bsize, mi_row, mi_col); + } + + AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + int sb_wiener_var = get_var_perceptual_ai(cpi, bsize, mi_row, mi_col); + int offset = 0; + double beta = (double)cpi->norm_wiener_variance / sb_wiener_var; + double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col)); + beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale); + + // Cap beta such that the delta q value is not much far away from the base q. + beta = AOMMIN(beta, 4); + beta = AOMMAX(beta, 0.25); + offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + offset = AOMMIN(offset, delta_q_info->delta_q_res * 20 - 1); + offset = AOMMAX(offset, -delta_q_info->delta_q_res * 20 + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1); + + return qindex; +} + +void av1_init_mb_ur_var_buffer(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + + if (cpi->mb_delta_q) return; + + CHECK_MEM_ERROR(cm, cpi->mb_delta_q, + aom_calloc(cpi->frame_info.mb_rows * cpi->frame_info.mb_cols, + sizeof(*cpi->mb_delta_q))); +} + +#if CONFIG_TFLITE +static int model_predict(BLOCK_SIZE block_size, int num_cols, int num_rows, + int bit_depth, uint8_t *y_buffer, int y_stride, + float *predicts0, float *predicts1) { + // Create the model and interpreter options. 
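Both quantizer helpers above end with the same clamping pattern: the delta-q offset is limited to a small multiple of delta_q_res (10 or 20 above) and the resulting qindex is kept inside the legal range. A sketch under the assumption MINQ = 0 and MAXQ = 255; helper names are illustrative:

static int clamp_int(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

static int apply_deltaq_offset(int base_qindex, int offset, int delta_q_res,
                               int res_multiple) {
  const int max_offset = delta_q_res * res_multiple;  // 10 or 20 above
  offset = clamp_int(offset, -max_offset + 1, max_offset - 1);
  int qindex = clamp_int(base_qindex + offset, 0, 255);
  // Keep a non-lossless base from being pushed down to lossless qindex 0.
  if (base_qindex > 0 && qindex < 1) qindex = 1;
  return qindex;
}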
+ TfLiteModel *model = + TfLiteModelCreate(av1_deltaq4_model_file, av1_deltaq4_model_fsize); + if (model == NULL) return 1; + + TfLiteInterpreterOptions *options = TfLiteInterpreterOptionsCreate(); + TfLiteInterpreterOptionsSetNumThreads(options, 2); + if (options == NULL) { + TfLiteModelDelete(model); + return 1; + } + + // Create the interpreter. + TfLiteInterpreter *interpreter = TfLiteInterpreterCreate(model, options); + if (interpreter == NULL) { + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + // Allocate tensors and populate the input tensor data. + TfLiteInterpreterAllocateTensors(interpreter); + TfLiteTensor *input_tensor = TfLiteInterpreterGetInputTensor(interpreter, 0); + if (input_tensor == NULL) { + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + size_t input_size = TfLiteTensorByteSize(input_tensor); + float *input_data = aom_calloc(input_size, 1); + if (input_data == NULL) { + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int row_offset = (row * num_mi_h) << 2; + const int col_offset = (col * num_mi_w) << 2; + + uint8_t *buf = y_buffer + row_offset * y_stride + col_offset; + int r = row_offset, pos = 0; + const float base = (float)((1 << bit_depth) - 1); + while (r < row_offset + (num_mi_h << 2)) { + for (int c = 0; c < (num_mi_w << 2); ++c) { + input_data[pos++] = bit_depth > 8 + ? (float)*CONVERT_TO_SHORTPTR(buf + c) / base + : (float)*(buf + c) / base; + } + buf += y_stride; + ++r; + } + TfLiteTensorCopyFromBuffer(input_tensor, input_data, input_size); + + // Execute inference. + if (TfLiteInterpreterInvoke(interpreter) != kTfLiteOk) { + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + // Extract the output tensor data. + const TfLiteTensor *output_tensor = + TfLiteInterpreterGetOutputTensor(interpreter, 0); + if (output_tensor == NULL) { + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + return 1; + } + + size_t output_size = TfLiteTensorByteSize(output_tensor); + float output_data[2]; + + TfLiteTensorCopyToBuffer(output_tensor, output_data, output_size); + predicts0[row * num_cols + col] = output_data[0]; + predicts1[row * num_cols + col] = output_data[1]; + } + } + + // Dispose of the model and interpreter objects. + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); + aom_free(input_data); + return 0; +} + +void av1_set_mb_ur_variance(AV1_COMP *cpi) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + uint8_t *y_buffer = cpi->source->y_buffer; + const int y_stride = cpi->source->y_stride; + const int block_size = cpi->common.seq_params->sb_size; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h; + + // TODO(sdeng): fit a better model_1; disable it at this time. 
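The input tensor above is filled with luma samples normalized to [0, 1] by dividing by (2^bit_depth - 1). A standalone sketch of that step (made-up sample values):

#include <stdio.h>

static float normalize_sample(int sample, int bit_depth) {
  const float base = (float)((1 << bit_depth) - 1);  // 255 or 1023
  return (float)sample / base;
}

int main(void) {
  printf("%f\n", normalize_sample(128, 8));   // ~0.502
  printf("%f\n", normalize_sample(512, 10));  // ~0.500
  return 0;
}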
+ float *mb_delta_q0, *mb_delta_q1, delta_q_avg0 = 0.0f; + CHECK_MEM_ERROR(cm, mb_delta_q0, + aom_calloc(num_rows * num_cols, sizeof(float))); + CHECK_MEM_ERROR(cm, mb_delta_q1, + aom_calloc(num_rows * num_cols, sizeof(float))); + + if (model_predict(block_size, num_cols, num_rows, bit_depth, y_buffer, + y_stride, mb_delta_q0, mb_delta_q1)) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Failed to call TFlite functions."); + } + + // Loop through each SB block. + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + delta_q_avg0 += mb_delta_q0[index]; + } + } + + delta_q_avg0 /= (float)(num_rows * num_cols); + + float scaling_factor; + const float cq_level = (float)cpi->oxcf.rc_cfg.cq_level / (float)MAXQ; + if (cq_level < delta_q_avg0) { + scaling_factor = cq_level / delta_q_avg0; + } else { + scaling_factor = 1.0f - (cq_level - delta_q_avg0) / (1.0f - delta_q_avg0); + } + + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + cpi->mb_delta_q[index] = + RINT((float)cpi->oxcf.q_cfg.deltaq_strength / 100.0f * (float)MAXQ * + scaling_factor * (mb_delta_q0[index] - delta_q_avg0)); + } + } + + aom_free(mb_delta_q0); + aom_free(mb_delta_q1); +} +#else // !CONFIG_TFLITE +void av1_set_mb_ur_variance(AV1_COMP *cpi) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + uint8_t *y_buffer = cpi->source->y_buffer; + const int y_stride = cpi->source->y_stride; + const int block_size = cpi->common.seq_params->sb_size; + + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h; + + int *mb_delta_q[2]; + CHECK_MEM_ERROR(cm, mb_delta_q[0], + aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[0]))); + CHECK_MEM_ERROR(cm, mb_delta_q[1], + aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[1]))); + + // Approximates the model change between current version (Spet 2021) and the + // baseline (July 2021). + const double model_change[] = { 3.0, 3.0 }; + // The following parameters are fitted from user labeled data. + const double a[] = { -24.50 * 4.0, -17.20 * 4.0 }; + const double b[] = { 0.004898, 0.003093 }; + const double c[] = { (29.932 + model_change[0]) * 4.0, + (42.100 + model_change[1]) * 4.0 }; + int delta_q_avg[2] = { 0, 0 }; + // Loop through each SB block. + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + double var = 0.0, num_of_var = 0.0; + const int index = row * num_cols + col; + + // Loop through each 8x8 block. 
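The non-TFLite path models the per-superblock delta-q as a * exp(-b * var) + c, where var is the geometric mean of the 8x8 block variances computed in the loop below. A quick standalone evaluation of model 0 with the constants given above:

#include <math.h>
#include <stdio.h>

int main(void) {
  const double a = -24.50 * 4.0;
  const double b = 0.004898;
  const double c = (29.932 + 3.0) * 4.0;  // includes the model_change term
  for (double var = 1.0; var <= 4096.0; var *= 16.0) {
    printf("var=%8.1f  delta_q=%6.1f\n", var, a * exp(-b * var) + c);
  }
  return 0;
}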
+ for (int mi_row = row * num_mi_h; + mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h; + mi_row += 2) { + for (int mi_col = col * num_mi_w; + mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w; + mi_col += 2) { + struct buf_2d buf; + const int row_offset_y = mi_row << 2; + const int col_offset_y = mi_col << 2; + + buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + unsigned int block_variance; + block_variance = av1_get_perpixel_variance_facade( + cpi, xd, &buf, BLOCK_8X8, AOM_PLANE_Y); + + block_variance = AOMMAX(block_variance, 1); + var += log((double)block_variance); + num_of_var += 1.0; + } + } + var = exp(var / num_of_var); + mb_delta_q[0][index] = RINT(a[0] * exp(-b[0] * var) + c[0]); + mb_delta_q[1][index] = RINT(a[1] * exp(-b[1] * var) + c[1]); + delta_q_avg[0] += mb_delta_q[0][index]; + delta_q_avg[1] += mb_delta_q[1][index]; + } + } + + delta_q_avg[0] = RINT((double)delta_q_avg[0] / (num_rows * num_cols)); + delta_q_avg[1] = RINT((double)delta_q_avg[1] / (num_rows * num_cols)); + + int model_idx; + double scaling_factor; + const int cq_level = cpi->oxcf.rc_cfg.cq_level; + if (cq_level < delta_q_avg[0]) { + model_idx = 0; + scaling_factor = (double)cq_level / delta_q_avg[0]; + } else if (cq_level < delta_q_avg[1]) { + model_idx = 2; + scaling_factor = + (double)(cq_level - delta_q_avg[0]) / (delta_q_avg[1] - delta_q_avg[0]); + } else { + model_idx = 1; + scaling_factor = (double)(MAXQ - cq_level) / (MAXQ - delta_q_avg[1]); + } + + const double new_delta_q_avg = + delta_q_avg[0] + scaling_factor * (delta_q_avg[1] - delta_q_avg[0]); + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + if (model_idx == 2) { + const double delta_q = + mb_delta_q[0][index] + + scaling_factor * (mb_delta_q[1][index] - mb_delta_q[0][index]); + cpi->mb_delta_q[index] = RINT((double)cpi->oxcf.q_cfg.deltaq_strength / + 100.0 * (delta_q - new_delta_q_avg)); + } else { + cpi->mb_delta_q[index] = RINT( + (double)cpi->oxcf.q_cfg.deltaq_strength / 100.0 * scaling_factor * + (mb_delta_q[model_idx][index] - delta_q_avg[model_idx])); + } + } + } + + aom_free(mb_delta_q[0]); + aom_free(mb_delta_q[1]); +} +#endif + +int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col) { + const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size; + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + if (base_qindex == MINQ || base_qindex == MAXQ) return base_qindex; + + const int num_mi_w = mi_size_wide[bsize]; + const int num_mi_h = mi_size_high[bsize]; + const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; + const int index = (mi_row / num_mi_h) * num_cols + (mi_col / num_mi_w); + const int delta_q = cpi->mb_delta_q[index]; + + int qindex = base_qindex + delta_q; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ + 1); + + return qindex; +} diff --git a/third_party/aom/av1/encoder/allintra_vis.h b/third_party/aom/av1/encoder/allintra_vis.h new file mode 100644 index 0000000000..0d34ce0841 --- /dev/null +++ b/third_party/aom/av1/encoder/allintra_vis.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ALLINTRA_VIS_H_ +#define AOM_AV1_ENCODER_ALLINTRA_VIS_H_ + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" + +#define MB_WIENER_MT_UNIT_SIZE BLOCK_64X64 + +void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi); + +void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x, + MACROBLOCKD *xd, const int mi_row, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + double *sum_rec_distortion, + double *sum_est_rate, uint8_t *pred_buffer); + +void av1_set_mb_wiener_variance(AV1_COMP *cpi); + +int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col); + +// User rating based mode +void av1_init_mb_ur_var_buffer(AV1_COMP *cpi); + +void av1_set_mb_ur_variance(AV1_COMP *cpi); + +int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col); + +#endif // AOM_AV1_ENCODER_ALLINTRA_VIS_H_ diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c new file mode 100644 index 0000000000..4cf6bd572d --- /dev/null +++ b/third_party/aom/av1/encoder/aq_complexity.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/encodeframe.h" +#include "av1/common/seg_common.h" +#include "av1/encoder/segmentation.h" +#include "aom_dsp/aom_dsp_common.h" + +#define AQ_C_SEGMENTS 5 +#define DEFAULT_AQ2_SEG 3 // Neutral Q segment +#define AQ_C_STRENGTHS 3 +static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 1.75, 1.25, 1.05, 1.00, 0.90 }, + { 2.00, 1.50, 1.15, 1.00, 0.85 }, + { 2.50, 1.75, 1.25, 1.00, 0.80 } +}; +static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 0.15, 0.30, 0.55, 2.00, 100.0 }, + { 0.20, 0.40, 0.65, 2.00, 100.0 }, + { 0.25, 0.50, 0.75, 2.00, 100.0 } +}; +static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { -4.0, -3.0, -2.0, 100.00, 100.0 }, + { -3.5, -2.5, -1.5, 100.00, 100.0 }, + { -3.0, -2.0, -1.0, 100.00, 100.0 } +}; + +static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) { + // Approximate base quatizer (truncated to int) + const int base_quant = av1_ac_quant_QTX(q_index, 0, bit_depth) / 4; + return (base_quant > 10) + (base_quant > 25); +} + +static bool is_frame_aq_enabled(const AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + + return frame_is_intra_only(cm) || cm->features.error_resilient_mode || + refresh_frame->alt_ref_frame || + (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref); +} + +// Segmentation only makes sense if the target bits per SB is above a threshold. +// Below this the overheads will usually outweigh any benefit. +static bool is_sb_aq_enabled(const AV1_COMP *const cpi) { + return cpi->rc.sb64_target_rate >= 256; +} + +void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + struct segmentation *const seg = &cm->seg; + const int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + + // Make SURE use of floating point in this function is safe. + + if (resolution_change) { + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_clearall_segfeatures(seg); + av1_disable_segmentation(seg); + return; + } + + if (is_frame_aq_enabled(cpi)) { + int segment; + const int aq_strength = + get_aq_c_strength(base_qindex, cm->seq_params->bit_depth); + + // Clear down the segment map. + memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG, + cm->mi_params.mi_rows * cm->mi_params.mi_cols); + + av1_clearall_segfeatures(seg); + + if (!is_sb_aq_enabled(cpi)) { + av1_disable_segmentation(seg); + return; + } + + av1_enable_segmentation(seg); + + // Default segment "Q" feature is disabled so it defaults to the baseline Q. + av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q); + + // Use some of the segments for in frame Q adjustment. + for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) { + int qindex_delta; + + if (segment == DEFAULT_AQ2_SEG) continue; + + qindex_delta = av1_compute_qdelta_by_rate( + cpi, cm->current_frame.frame_type, base_qindex, + aq_c_q_adj_factor[aq_strength][segment]); + + // For AQ complexity mode, we dont allow Q0 in a segment if the base + // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. 
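get_aq_c_strength() above buckets the approximate base quantizer (AC quant / 4) into one of AQ_C_STRENGTHS = 3 rows of the adjustment tables via two comparisons. A small sketch of that mapping (illustrative quantizer values):

#include <stdio.h>

static int strength_from_base_quant(int base_quant) {
  return (base_quant > 10) + (base_quant > 25);  // yields 0, 1 or 2
}

int main(void) {
  const int q[] = { 4, 18, 40 };
  for (int i = 0; i < 3; ++i) {
    printf("base_quant=%2d -> strength %d\n", q[i],
           strength_from_base_quant(q[i]));
  }
  return 0;
}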
+ if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { + qindex_delta = -base_qindex + 1; + } + if ((base_qindex + qindex_delta) > 0) { + av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); + av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + } + } + } +} + +#define DEFAULT_LV_THRESH 10.0 +#define MIN_DEFAULT_LV_THRESH 8.0 +// Select a segment for the current block. +// The choice of segment for a block depends on the ratio of the projected +// bits for the block vs a target average and its spatial complexity. +void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, + int mi_row, int mi_col, int projected_rate) { + if ((!is_frame_aq_enabled(cpi)) || (!is_sb_aq_enabled(cpi))) return; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col; + const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]); + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]); + int i; + unsigned char segment; + + // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). + // It is converted to bits << AV1_PROB_COST_SHIFT units. + const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis) + << AV1_PROB_COST_SHIFT; + const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size; + const int target_rate = (int)(num / denom); + double logvar; + double low_var_thresh; + const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex, + cm->seq_params->bit_depth); + + low_var_thresh = + (is_stat_consumption_stage_twopass(cpi)) + ? AOMMAX(exp(cpi->twopass_frame.mb_av_energy), MIN_DEFAULT_LV_THRESH) + : DEFAULT_LV_THRESH; + + av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs); + logvar = av1_log_block_var(cpi, mb, bs); + + segment = AQ_C_SEGMENTS - 1; // Just in case no break out below. + for (i = 0; i < AQ_C_SEGMENTS; ++i) { + // Test rate against a threshold value and variance against a threshold. + // Increasing segment number (higher variance and complexity) = higher Q. + if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) && + (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) { + segment = i; + break; + } + } + + // Fill in the entires in the segment map corresponding to this SB64. + const int mi_stride = cm->mi_params.mi_cols; + set_segment_id(cpi->enc_seg.map, mi_offset, xmis, ymis, mi_stride, segment); +} diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h new file mode 100644 index 0000000000..3421d74c93 --- /dev/null +++ b/third_party/aom/av1/encoder/aq_complexity.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ +#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/common/enums.h" + +struct AV1_COMP; +struct macroblock; + +// Select a segment for the current Block. 
+void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *, + BLOCK_SIZE bs, int mi_row, int mi_col, + int projected_rate); + +// This function sets up a set of segments with delta Q values around +// the baseline frame quantizer. +void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c new file mode 100644 index 0000000000..f48ff11e51 --- /dev/null +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "av1/common/pred_common.h" +#include "av1/common/seg_common.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/tokenize.h" +#include "aom_dsp/aom_dsp_common.h" + +CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) { + CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr)); + if (cr == NULL) return NULL; + + cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map)); + cr->counter_encode_maxq_scene_change = 0; + cr->percent_refresh_adjustment = 5; + cr->rate_ratio_qdelta_adjustment = 0.25; + if (cr->map == NULL) { + av1_cyclic_refresh_free(cr); + return NULL; + } + return cr; +} + +void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) { + if (cr != NULL) { + aom_free(cr->map); + aom_free(cr); + } +} + +// Check if this coding block, of size bsize, should be considered for refresh +// (lower-qp coding). Decision can be based on various factors, such as +// size of the coding block (i.e., below min_block size rejected), coding +// mode, and rate/distortion. +static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, + const MB_MODE_INFO *mbmi, int64_t rate, + int64_t dist, BLOCK_SIZE bsize, + int noise_level) { + MV mv = mbmi->mv[0].as_mv; + int is_compound = has_second_ref(mbmi); + // Reject the block for lower-qp coding for non-compound mode if + // projected distortion is above the threshold, and any of the following + // is true: + // 1) mode uses large mv + // 2) mode is an intra-mode + // Otherwise accept for refresh. + if (!is_compound && dist > cr->thresh_dist_sb && + (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh || + mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh || + !is_inter_block(mbmi))) + return CR_SEGMENT_ID_BASE; + else if ((is_compound && noise_level < kMedium) || + (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb && + is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 && + cr->rate_boost_fac > 10)) + // More aggressive delta-q for bigger blocks with zero motion. + return CR_SEGMENT_ID_BOOST2; + else + return CR_SEGMENT_ID_BOOST1; +} + +// Compute delta-q for the segment. 
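candidate_refresh_aq() above decides per block whether to withhold the refresh boost (large motion or intra with high projected distortion), apply the stronger boost (cheap, zero-motion inter blocks), or fall back to the normal boost. A simplified standalone sketch of that three-way decision; the compound/noise branch is omitted and all names are illustrative:

#include <stdint.h>

typedef enum { SEG_BASE = 0, SEG_BOOST1 = 1, SEG_BOOST2 = 2 } RefreshSeg;

static RefreshSeg classify_block(int mv_row, int mv_col, int is_intra,
                                 int zero_mv, int64_t rate, int64_t dist,
                                 int64_t thresh_rate, int64_t thresh_dist,
                                 int motion_thresh) {
  const int large_mv = mv_row > motion_thresh || mv_row < -motion_thresh ||
                       mv_col > motion_thresh || mv_col < -motion_thresh;
  if (dist > thresh_dist && (large_mv || is_intra)) return SEG_BASE;
  if (!is_intra && zero_mv && rate < thresh_rate) return SEG_BOOST2;
  return SEG_BOOST1;
}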
+static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) { + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int deltaq = av1_compute_qdelta_by_rate( + cpi, cpi->common.current_frame.frame_type, q, rate_factor); + if ((-deltaq) > cr->max_qdelta_perc * q / 100) { + deltaq = -cr->max_qdelta_perc * q / 100; + } + return deltaq; +} + +int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int mbs = cm->mi_params.MBs; + const int num4x4bl = mbs << 4; + // Weight for non-base segments: use actual number of blocks refreshed in + // previous/just encoded frame. Note number of blocks here is in 4x4 units. + double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl; + double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl; + if (cpi->rc.rtc_external_ratectrl) { + weight_segment1 = (double)(cr->percent_refresh * cm->mi_params.mi_rows * + cm->mi_params.mi_cols / 100) / + num4x4bl; + weight_segment2 = 0; + } + // Take segment weighted average for estimated bits. + const int estimated_bits = + (int)((1.0 - weight_segment1 - weight_segment2) * + av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) + + weight_segment1 * + av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1], + correction_factor) + + weight_segment2 * + av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2], + correction_factor)); + return estimated_bits; +} + +int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int bits_per_mb; + int num4x4bl = cm->mi_params.MBs << 4; + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + double weight_segment = + (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num4x4bl; + if (cpi->rc.rtc_external_ratectrl) { + weight_segment = (double)((cr->target_num_seg_blocks + + cr->percent_refresh * cm->mi_params.mi_rows * + cm->mi_params.mi_cols / 100) >> + 1) / + num4x4bl; + } + // Compute delta-q corresponding to qindex i. + int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + const int accurate_estimate = cpi->sf.hl_sf.accurate_bit_estimate; + // Take segment weighted average for bits per mb. 
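The two estimators above blend per-segment bit estimates by the fraction of the frame's 4x4 blocks assigned to each segment. The core arithmetic, as a small sketch with hypothetical inputs:

static double segment_weighted_bits(double w_seg1, double w_seg2,
                                    double bits_base, double bits_seg1,
                                    double bits_seg2) {
  // w_seg1/w_seg2: fractions of 4x4 blocks in segments 1 and 2; the remainder
  // is coded at the base qindex.
  return (1.0 - w_seg1 - w_seg2) * bits_base + w_seg1 * bits_seg1 +
         w_seg2 * bits_seg2;
}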
+ bits_per_mb = + (int)((1.0 - weight_segment) * + av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i, + correction_factor, accurate_estimate) + + weight_segment * av1_rc_bits_per_mb( + cpi, cm->current_frame.frame_type, i + deltaq, + correction_factor, accurate_estimate)); + return bits_per_mb; +} + +void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RUN_TYPE dry_run) { + int cdf_num; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int prev_segment_id = mbmi->segment_id; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw); + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); + + assert(cm->seg.enabled); + + if (!cr->skip_over4x4) { + mbmi->segment_id = + av1_get_spatial_seg_pred(cm, xd, &cdf_num, cr->skip_over4x4); + if (prev_segment_id != mbmi->segment_id) { + const int block_index = mi_row * cm->mi_params.mi_cols + mi_col; + const int mi_stride = cm->mi_params.mi_cols; + const uint8_t segment_id = mbmi->segment_id; + for (int mi_y = 0; mi_y < ymis; mi_y++) { + const int map_offset = block_index + mi_y * mi_stride; + memset(&cr->map[map_offset], 0, xmis); + memset(&cpi->enc_seg.map[map_offset], segment_id, xmis); + memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis); + } + } + } + if (!dry_run) { + if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1) + x->actual_num_seg1_blocks -= xmis * ymis; + else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2) + x->actual_num_seg2_blocks -= xmis * ymis; + } +} + +void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int64_t rate, int64_t dist, int skip, + RUN_TYPE dry_run) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw); + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_params.mi_cols + mi_col; + int noise_level = 0; + if (cpi->noise_estimate.enabled) noise_level = cpi->noise_estimate.level; + const int refresh_this_block = + candidate_refresh_aq(cr, mbmi, rate, dist, bsize, noise_level); + int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1; + // Default is to not update the refresh map. + int new_map_value = cr->map[block_index]; + + // If this block is labeled for refresh, check if we should reset the + // segment_id. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { + mbmi->segment_id = refresh_this_block; + // Reset segment_id if will be skipped. + if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE; + } + const uint8_t segment_id = mbmi->segment_id; + + // Update the cyclic refresh map, to be used for setting segmentation map + // for the next frame. If the block will be refreshed this frame, mark it + // as clean. The magnitude of the -ve influences how long before we consider + // it for refresh again. 
+ if (cyclic_refresh_segment_id_boosted(segment_id)) { + new_map_value = -cr->time_for_refresh; + } else if (refresh_this_block) { + // Else if it is accepted as candidate for refresh, and has not already + // been refreshed (marked as 1) then mark it as a candidate for cleanup + // for future time (marked as 0), otherwise don't update it. + if (cr->map[block_index] == 1) new_map_value = 0; + } else { + // Leave it marked as block that is not candidate for refresh. + new_map_value = 1; + } + + // Update entries in the cyclic refresh map with new_map_value, and + // copy mbmi->segment_id into global segmentation map. + const int mi_stride = cm->mi_params.mi_cols; + for (int mi_y = 0; mi_y < ymis; mi_y += sh) { + const int map_offset = block_index + mi_y * mi_stride; + memset(&cr->map[map_offset], new_map_value, xmis); + memset(&cpi->enc_seg.map[map_offset], segment_id, xmis); + memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis); + } + + // Accumulate cyclic refresh update counters. + if (!dry_run) { + if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST1) + x->actual_num_seg1_blocks += xmis * ymis; + else if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST2) + x->actual_num_seg2_blocks += xmis * ymis; + } +} + +// Initializes counters used for cyclic refresh. +void av1_init_cyclic_refresh_counters(MACROBLOCK *const x) { + x->actual_num_seg1_blocks = 0; + x->actual_num_seg2_blocks = 0; +} + +// Accumulate cyclic refresh counters. +void av1_accumulate_cyclic_refresh_counters( + CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x) { + cyclic_refresh->actual_num_seg1_blocks += x->actual_num_seg1_blocks; + cyclic_refresh->actual_num_seg2_blocks += x->actual_num_seg2_blocks; +} + +void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // Set minimum gf_interval for GF update to a multiple of the refresh period, + // with some max limit. Depending on past encoding stats, GF flag may be + // reset and update may not occur until next baseline_gf_interval. + const int gf_length_mult[2] = { 8, 4 }; + if (cr->percent_refresh > 0) + p_rc->baseline_gf_interval = + AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * + (100 / cr->percent_refresh), + MAX_GF_INTERVAL_RT); + else + p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT; + if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40) + p_rc->baseline_gf_interval = 16; +} + +// Update the segmentation map, and related quantities: cyclic refresh map, +// refresh sb_index, and target number of blocks to be refreshed. +// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to +// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock. +// Blocks labeled as BOOST1 may later get set to BOOST2 (during the +// encoding of the superblock). 
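The refresh map above acts as a per-block age counter: a boosted block is stamped with -time_for_refresh and counts back up toward 0, at which point it becomes a refresh candidate again, while non-candidates stay at 1. A minimal sketch of the aging step applied each frame (names are illustrative):

#include <stdint.h>

static void age_refresh_map(int8_t *map, int num_blocks) {
  for (int i = 0; i < num_blocks; ++i) {
    // Recently refreshed blocks (< 0) must wait before being refreshed again;
    // blocks at 0 are candidates, blocks at 1 are not.
    if (map[i] < 0) map[i]++;
  }
}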
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + unsigned char *const seg_map = cpi->enc_seg.map; + int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; + int xmis, ymis, x, y; + uint64_t sb_sad = 0; + uint64_t thresh_sad_low = 0; + uint64_t thresh_sad = INT64_MAX; + const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols; + const int mi_stride = mi_cols; + memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols); + sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; + sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; + sbs_in_frame = sb_cols * sb_rows; + // Number of target blocks to get the q delta (segment 1). + block_count = cr->percent_refresh * mi_rows * mi_cols / 100; + // Set the segmentation map: cycle through the superblocks, starting at + // cr->mb_index, and stopping when either block_count blocks have been found + // to be refreshed, or we have passed through whole frame. + if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0; + assert(cr->sb_index < sbs_in_frame); + i = cr->sb_index; + cr->last_sb_index = cr->sb_index; + cr->target_num_seg_blocks = 0; + do { + int sum_map = 0; + // Get the mi_row/mi_col corresponding to superblock index i. + int sb_row_index = (i / sb_cols); + int sb_col_index = i - sb_row_index * sb_cols; + int mi_row = sb_row_index * cm->seq_params->mib_size; + int mi_col = sb_col_index * cm->seq_params->mib_size; + assert(mi_row >= 0 && mi_row < mi_rows); + assert(mi_col >= 0 && mi_col < mi_cols); + bl_index = mi_row * mi_stride + mi_col; + // Loop through all MI blocks in superblock and update map. + xmis = AOMMIN(mi_cols - mi_col, cm->seq_params->mib_size); + ymis = AOMMIN(mi_rows - mi_row, cm->seq_params->mib_size); + if (cr->use_block_sad_scene_det && cpi->rc.frames_since_key > 30 && + cr->counter_encode_maxq_scene_change > 30 && + cpi->src_sad_blk_64x64 != NULL && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { + sb_sad = cpi->src_sad_blk_64x64[sb_col_index + sb_cols * sb_row_index]; + int scale = (cm->width * cm->height < 640 * 360) ? 6 : 8; + int scale_low = 2; + thresh_sad = (scale * 64 * 64); + thresh_sad_low = (scale_low * 64 * 64); + // For temporal layers: the base temporal layer (temporal_layer_id = 0) + // has larger frame separation (2 or 4 frames apart), so use larger sad + // thresholds to compensate for larger frame sad. The larger thresholds + // also increase the amount of refresh, which is needed for the base + // temporal layer. + if (cpi->svc.number_temporal_layers > 1 && + cpi->svc.temporal_layer_id == 0) { + thresh_sad <<= 4; + thresh_sad_low <<= 2; + } + } + // cr_map only needed at 8x8 blocks. + for (y = 0; y < ymis; y += 2) { + for (x = 0; x < xmis; x += 2) { + const int bl_index2 = bl_index + y * mi_stride + x; + // If the block is as a candidate for clean up then mark it + // for possible boost/refresh (segment 1). The segment id may get + // reset to 0 later if block gets coded anything other than low motion. + // If the block_sad (sb_sad) is very low label it for refresh anyway. + if (cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) { + sum_map += 4; + } else if (cr->map[bl_index2] < 0) { + cr->map[bl_index2]++; + } + } + } + // Enforce constant segment over superblock. + // If segment is at least half of superblock, set to 1. 
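cyclic_refresh_update_map() walks superblocks circularly, resuming where the previous frame stopped and wrapping until either the per-frame block budget is met or the whole frame has been visited. The traversal pattern, as a small sketch with a hypothetical per-superblock callback:

static int walk_superblocks(int start_sb, int sbs_in_frame, int block_budget,
                            int (*refresh_one)(int sb_index, void *ctx),
                            void *ctx) {
  int i = start_sb, refreshed = 0;
  do {
    refreshed += refresh_one(i, ctx);  // returns blocks marked in this SB
    if (++i == sbs_in_frame) i = 0;    // wrap around the frame
  } while (refreshed < block_budget && i != start_sb);
  return i;  // resume point (sb_index) for the next frame
}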
+ // Enforce that block sad (sb_sad) is not too high. + if (sum_map >= (xmis * ymis) >> 1 && sb_sad < thresh_sad) { + set_segment_id(seg_map, bl_index, xmis, ymis, mi_stride, + CR_SEGMENT_ID_BOOST1); + cr->target_num_seg_blocks += xmis * ymis; + } + i++; + if (i == sbs_in_frame) { + i = 0; + } + } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index); + cr->sb_index = i; + if (cr->target_num_seg_blocks == 0) { + // Disable segmentation, seg_map is already set to 0 above. + av1_disable_segmentation(&cm->seg); + } +} + +static int is_scene_change_detected(AV1_COMP *const cpi) { + return cpi->rc.high_source_sad; +} + +// Set cyclic refresh parameters. +void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { + // TODO(marpan): Parameters need to be tuned. + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + SVC *const svc = &cpi->svc; + const int qp_thresh = AOMMAX(16, rc->best_quality + 4); + const int qp_max_thresh = 118 * MAXQ >> 7; + const int scene_change_detected = is_scene_change_detected(cpi); + const int is_screen_content = + (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN); + + // A scene change or key frame marks the start of a cyclic refresh cycle. + const int frames_since_scene_change = + (cpi->ppi->use_svc || !is_screen_content) + ? cpi->rc.frames_since_key + : AOMMIN(cpi->rc.frames_since_key, + cr->counter_encode_maxq_scene_change); + + // Cases to reset the cyclic refresh adjustment parameters. + if (frame_is_intra_only(cm) || scene_change_detected || + cpi->ppi->rtc_ref.bias_recovery_frame) { + // Reset adaptive elements for intra only frames and scene changes. + cr->percent_refresh_adjustment = 5; + cr->rate_ratio_qdelta_adjustment = 0.25; + } + + // Although this segment feature for RTC is only used for + // blocks >= 8X8, for more efficient coding of the seg map + // cur_frame->seg_map needs to set at 4x4 along with the + // function av1_cyclic_reset_segment_skip(). Skipping over + // 4x4 will therefore have small bdrate loss (~0.2%), so + // we use it only for speed > 9 for now. + // Also if loop-filter deltas is applied via segment, then + // we need to set cr->skip_over4x4 = 1. + cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0; + + // should we enable cyclic refresh on this frame. + cr->apply_cyclic_refresh = 1; + if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) || + scene_change_detected || svc->temporal_layer_id > 0 || + svc->prev_number_spatial_layers != svc->number_spatial_layers || + p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || + (svc->number_spatial_layers > 1 && + svc->layer_context[svc->temporal_layer_id].is_key_frame) || + (frames_since_scene_change > 20 && + p_rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) || + (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 30 && + frames_since_scene_change > 40) || + cpi->ppi->rtc_ref.bias_recovery_frame) { + cr->apply_cyclic_refresh = 0; + return; + } + + // Increase the amount of refresh for #temporal_layers > 2 + if (svc->number_temporal_layers > 2) + cr->percent_refresh = 15; + else + cr->percent_refresh = 10 + cr->percent_refresh_adjustment; + + cr->max_qdelta_perc = 60; + cr->time_for_refresh = 0; + cr->use_block_sad_scene_det = + (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && + cm->seq_params->sb_size == BLOCK_64X64) + ? 
1 + : 0; + cr->motion_thresh = 32; + cr->rate_boost_fac = + (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 10 : 15; + + // Use larger delta-qp (increase rate_ratio_qdelta) for first few + // refresh cycles after a key frame (svc) or scene change (non svc). + // For non svc screen content, after a scene change gradually reduce + // this boost and supress it further if either of the previous two + // frames overshot. + if (cr->percent_refresh > 0) { + if (cpi->ppi->use_svc || !is_screen_content) { + if (frames_since_scene_change < + ((4 * svc->number_temporal_layers) * (100 / cr->percent_refresh))) { + cr->rate_ratio_qdelta = 3.0 + cr->rate_ratio_qdelta_adjustment; + } else { + cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment; + } + } else { + double distance_from_sc_factor = + AOMMIN(0.75, (int)(frames_since_scene_change / 10) * 0.1); + cr->rate_ratio_qdelta = + 3.0 + cr->rate_ratio_qdelta_adjustment - distance_from_sc_factor; + if ((frames_since_scene_change < 10) && + ((cpi->rc.rc_1_frame < 0) || (cpi->rc.rc_2_frame < 0))) { + cr->rate_ratio_qdelta -= 0.25; + } + } + } else { + cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment; + } + // Adjust some parameters for low resolutions. + if (cm->width * cm->height <= 352 * 288) { + if (cpi->svc.number_temporal_layers > 1) { + cr->motion_thresh = 32; + cr->rate_boost_fac = 13; + } else { + if (rc->avg_frame_bandwidth < 3000) { + cr->motion_thresh = 16; + cr->rate_boost_fac = 13; + } else { + cr->max_qdelta_perc = 50; + cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.0); + } + } + } + if (cpi->oxcf.rc_cfg.mode == AOM_VBR) { + // To be adjusted for VBR mode, e.g., based on gf period and boost. + // For now use smaller qp-delta (than CBR), no second boosted seg, and + // turn-off (no refresh) on golden refresh (since it's already boosted). + cr->percent_refresh = 10; + cr->rate_ratio_qdelta = 1.5; + cr->rate_boost_fac = 10; + if (cpi->refresh_frame.golden_frame) { + cr->percent_refresh = 0; + cr->rate_ratio_qdelta = 1.0; + } + } + if (rc->rtc_external_ratectrl) { + cr->actual_num_seg1_blocks = cr->percent_refresh * cm->mi_params.mi_rows * + cm->mi_params.mi_cols / 100; + cr->actual_num_seg2_blocks = 0; + } +} + +// Setup cyclic background refresh: set delta q and segmentation map. +void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + struct segmentation *const seg = &cm->seg; + const int scene_change_detected = is_scene_change_detected(cpi); + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + + // Set resolution_change flag: for svc only set it when the + // number of spatial layers has not changed. + const int resolution_change = + cm->prev_frame && + (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height) && + cpi->svc.prev_number_spatial_layers == cpi->svc.number_spatial_layers; + + if (resolution_change) av1_cyclic_refresh_reset_resize(cpi); + if (!cr->apply_cyclic_refresh) { + // Set segmentation map to 0 and disable. 
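For non-SVC screen content, the boost applied right after a scene change decays with distance from it, as set up above. A standalone sketch of that schedule (constants mirror the branch above; the overshoot-based extra reduction is omitted):

#include <stdio.h>

static double rate_ratio_qdelta_sched(int frames_since_sc, double adjustment) {
  double distance_factor = (frames_since_sc / 10) * 0.1;
  if (distance_factor > 0.75) distance_factor = 0.75;
  return 3.0 + adjustment - distance_factor;
}

int main(void) {
  for (int f = 0; f <= 100; f += 20) {
    printf("frames=%3d  ratio=%.2f\n", f, rate_ratio_qdelta_sched(f, 0.25));
  }
  return 0;
}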
+ unsigned char *const seg_map = cpi->enc_seg.map; + memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_disable_segmentation(&cm->seg); + if (frame_is_intra_only(cm) || scene_change_detected || + cpi->ppi->rtc_ref.bias_recovery_frame) { + cr->sb_index = 0; + cr->last_sb_index = 0; + cr->counter_encode_maxq_scene_change = 0; + cr->actual_num_seg1_blocks = 0; + cr->actual_num_seg2_blocks = 0; + } + return; + } else { + cr->counter_encode_maxq_scene_change++; + const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex, + cm->seq_params->bit_depth); + // Set rate threshold to some multiple (set to 2 for now) of the target + // rate (target is given by sb64_target_rate and scaled by 256). + cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2; + // Distortion threshold, quadratic in Q, scale factor to be adjusted. + // q will not exceed 457, so (q * q) is within 32bit; see: + // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[]. + cr->thresh_dist_sb = ((int64_t)(q * q)) << 2; + // For low-resoln or lower speeds, the rate/dist thresholds need to be + // tuned/updated. + if (cpi->oxcf.speed <= 7 || (cm->width * cm->height < 640 * 360)) { + cr->thresh_dist_sb = 0; + cr->thresh_rate_sb = INT64_MAX; + } + // Set up segmentation. + // Clear down the segment map. + av1_enable_segmentation(&cm->seg); + av1_clearall_segfeatures(seg); + + // Note: setting temporal_update has no effect, as the seg-map coding method + // (temporal or spatial) is determined in + // av1_choose_segmap_coding_method(), + // based on the coding cost of each method. For error_resilient mode on the + // last_frame_seg_map is set to 0, so if temporal coding is used, it is + // relative to 0 previous map. + // seg->temporal_update = 0; + + // Segment BASE "Q" feature is disabled so it defaults to the baseline Q. + av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q); + // Use segment BOOST1 for in-frame Q adjustment. + av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q); + // Use segment BOOST2 for more aggressive in-frame Q adjustment. + av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q); + + // Set the q delta for segment BOOST1. + const CommonQuantParams *const quant_params = &cm->quant_params; + int qindex_delta = + compute_deltaq(cpi, quant_params->base_qindex, cr->rate_ratio_qdelta); + cr->qindex_delta[1] = qindex_delta; + + // Compute rd-mult for segment BOOST1. + const int qindex2 = clamp( + quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta, + 0, MAXQ); + cr->rdmult = av1_compute_rd_mult( + qindex2, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); + + av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta); + + // Set a more aggressive (higher) q delta for segment BOOST2. + qindex_delta = compute_deltaq( + cpi, quant_params->base_qindex, + AOMMIN(CR_MAX_RATE_TARGET_RATIO, + 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta)); + cr->qindex_delta[2] = qindex_delta; + av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); + + // Update the segmentation and refresh map. 
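The refresh-candidate thresholds above scale linearly with the superblock's target rate and quadratically with Q (q stays below 457, so q*q is comfortably in range). The same arithmetic, as a small sketch:

#include <stdint.h>

static void refresh_thresholds(int sb64_target_rate, double q,
                               int64_t *thresh_rate, int64_t *thresh_dist) {
  *thresh_rate = (((int64_t)sb64_target_rate) << 8) << 2;  // 4x target bits
  *thresh_dist = ((int64_t)(q * q)) << 2;                  // 4 * q^2
}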
+ cyclic_refresh_update_map(cpi); + } +} + +int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) { + return cr->rdmult; +} + +void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + cr->sb_index = 0; + cr->last_sb_index = 0; + cpi->refresh_frame.golden_frame = true; + cr->apply_cyclic_refresh = 0; + cr->counter_encode_maxq_scene_change = 0; + cr->percent_refresh_adjustment = 5; + cr->rate_ratio_qdelta_adjustment = 0.25; +} + +int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int qindex = cpi->common.quant_params.base_qindex; + if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 && + cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh && + cpi->rc.frame_source_sad < 1000 && + qindex < 7 * (cpi->rc.worst_quality >> 3)) + return 1; + // More aggressive skip. + else if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad && + cpi->rc.frame_source_sad < 50000 && qindex < cpi->rc.worst_quality) + return 1; + return 0; +} diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h new file mode 100644 index 0000000000..10974f018b --- /dev/null +++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ + +#include "av1/common/blockd.h" +#include "av1/encoder/block.h" +#include "av1/encoder/tokenize.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The segment ids used in cyclic refresh: from base (no boost) to increasing +// boost (higher delta-qp). +#define CR_SEGMENT_ID_BASE 0 +#define CR_SEGMENT_ID_BOOST1 1 +#define CR_SEGMENT_ID_BOOST2 2 + +// Maximum rate target ratio for setting segment delta-qp. +#define CR_MAX_RATE_TARGET_RATIO 4.0 + +/*! + * \brief The structure of CYCLIC_REFRESH. + * \ingroup cyclic_refresh + */ +struct CYCLIC_REFRESH { + /*! + * Percentage of blocks per frame that are targeted as candidates + * for cyclic refresh. + */ + int percent_refresh; + + /*! + * Active adjustment delta for cyclic refresh for rate control. + */ + int percent_refresh_adjustment; + + /*! + * Maximum q-delta as percentage of base q. + */ + int max_qdelta_perc; + /*! + * Superblock starting index for cycling through the frame. + */ + int sb_index; + /*! + * Superblock cyclic refresh index from the last frame. + */ + int last_sb_index; + /*! + * Controls how long a block will need to wait to be refreshed again, in + * excess of the cycle time, i.e., in the case of all zero motion, the block + * will be refreshed every (100/percent_refresh + time_for_refresh) frames. + */ + int time_for_refresh; + /*! + * Target number of (4x4) blocks that are set for delta-q. + */ + int target_num_seg_blocks; + /*!
+ * Actual number of (4x4) blocks to which delta-q was applied, + * for segment 1. + */ + int actual_num_seg1_blocks; + /*! + * Actual number of (4x4) blocks to which delta-q was applied, + * for segment 2. + */ + int actual_num_seg2_blocks; + /*! + * RD mult. parameters for segment 1. + */ + int rdmult; + /*! + * Cyclic refresh map. + */ + int8_t *map; + /*! + * Threshold applied to the projected rate of the coding block, + * when deciding whether the block should be refreshed. + */ + int64_t thresh_rate_sb; + /*! + * Threshold applied to the projected distortion of the coding block, + * when deciding whether the block should be refreshed. + */ + int64_t thresh_dist_sb; + /*! + * Threshold applied to the motion vector (in units of 1/8 pel) of the + * coding block, when deciding whether the block should be refreshed. + */ + int16_t motion_thresh; + /*! + * Rate target ratio to set q delta. + */ + double rate_ratio_qdelta; + + /*! + * Active adjustment of qdelta rate ratio for enhanced rate control. + */ + double rate_ratio_qdelta_adjustment; + + /*! + * Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. + */ + int rate_boost_fac; + + /*!\cond */ + int qindex_delta[3]; + int apply_cyclic_refresh; + int skip_over4x4; + int counter_encode_maxq_scene_change; + int use_block_sad_scene_det; + /*!\endcond */ +}; + +struct AV1_COMP; + +typedef struct CYCLIC_REFRESH CYCLIC_REFRESH; + +CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols); + +void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr); + +/*!\brief Estimate the bits, incorporating the delta-q from the segments. + * + * For the just encoded frame, estimate the bits, incorporating the delta-q + * from non-base segment(s). Note this function is called in the postencode + * stage (from rc_update_rate_correction_factors()). + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] correction_factor rate correction factor + * + * \return Return the estimated bits at given q. + */ +int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi, + double correction_factor); + +/*!\brief Estimate the bits per mb, for given q = i and delta-q. + * + * Prior to encoding the frame, estimate the bits per mb, for a given q = i and + * a corresponding delta-q (for segment 1). This function is called from + * rc_regulate_q() to set the base qp index. Note: the segment map is set to + * either 0/CR_SEGMENT_ID_BASE (no refresh) or to 1/CR_SEGMENT_ID_BOOST1 + * (refresh) for each superblock, prior to encoding. + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] i q index + * \param[in] correction_factor rate correction factor + * + * \return Return the estimated bits for q = i and delta-q (segment 1). + */ +int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i, + double correction_factor); + +/*!\brief Update segment_id for blocks that are skipped. + * + * After encoding a given prediction block, of size bsize at (mi_row, mi_col), + * check if we should reset the segment_id based on skip_txfm, + * and update the cyclic_refresh map and segmentation counters.
+ * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] x Pointer to MACROBLOCK structure + * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE + * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE + * \param[in] bsize Block size + * \param[in] dry_run A code indicating whether it is part of the final + * pass for reconstructing the superblock + * + * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and + * the \c cm->cpi->enc_seg.map. + */ + +void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize, RUN_TYPE dry_run); + +/*!\brief Update segment_id for block based on mode selected. + * + * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), + * check if we should reset the segment_id (based on mode/motion/skip selected + * for that block) and update the cyclic_refresh map and segmentation map. + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] x Pointer to MACROBLOCK structure + * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE + * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE + * \param[in] bsize Block size + * \param[in] rate Projected block rate from pickmode + * \param[in] dist Projected block dist from pickmode + * \param[in] skip Skip flag set from pickmode + * \param[in] dry_run A code indicating whether it is part of the final + * pass for reconstructing the superblock + * + * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and + * the \c cm->cpi->enc_seg.map. + */ +void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi, + MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int64_t rate, int64_t dist, int skip, + RUN_TYPE dry_run); + +/*!\brief Initialize counters used for cyclic refresh. + * + * Initializes cyclic refresh counters actual_num_seg1_blocks and + * actual_num_seg2_blocks. + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] x Pointer to MACROBLOCK structure + * + * \remark Update the \c x->actual_num_seg1_blocks and the + * \c x->actual_num_seg2_blocks. + */ +void av1_init_cyclic_refresh_counters(MACROBLOCK *const x); + +/*!\brief Accumulate cyclic refresh counters. + * + * Accumulates cyclic refresh counters actual_num_seg1_blocks and + * actual_num_seg2_blocks from the MACROBLOCK structure to the CYCLIC_REFRESH + * structure. + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cyclic_refresh Pointer to CYCLIC_REFRESH structure + * \param[in] x Pointer to MACROBLOCK structure + * + * \remark Update the \c cyclic_refresh->actual_num_seg1_blocks and the + * \c cyclic_refresh->actual_num_seg2_blocks. + */ +void av1_accumulate_cyclic_refresh_counters( + CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x); + +/*!\brief Set golden frame update interval based on cyclic refresh. + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Returns the interval in \c cpi->rc.baseline_gf_interval. + */ +void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi); + +/*!\brief Set the global/frame level parameters for cyclic refresh. + * + * First call to the cyclic refresh, before encoding the frame.
+ * Sets the flag on whether cyclic refresh should be applied, sets + * the amount/percent of refresh, and the amount of boost applied to + * the two segments (set by rate_ratio_qdelta and rate_boost_fac). + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Updates the \c cpi->cyclic_refresh with the settings. + */ +void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi); + +/*!\brief Setup the cyclic background refresh. + * + * Set the delta q for the segment(s), and set the segmentation map. + * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Updates the \c cpi->cyclic_refresh with the cyclic refresh + * parameters and the \c cm->seg with the segmentation data. + */ +void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi); + +int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr); + +void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi); + +int av1_cyclic_refresh_disable_lf_cdef(struct AV1_COMP *const cpi); + +static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) { + return segment_id == CR_SEGMENT_ID_BOOST1 || + segment_id == CR_SEGMENT_ID_BOOST2; +} + +static INLINE int cyclic_refresh_segment_id(int segment_id) { + if (segment_id == CR_SEGMENT_ID_BOOST1) + return CR_SEGMENT_ID_BOOST1; + else if (segment_id == CR_SEGMENT_ID_BOOST2) + return CR_SEGMENT_ID_BOOST2; + else + return CR_SEGMENT_ID_BASE; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c new file mode 100644 index 0000000000..086928a118 --- /dev/null +++ b/third_party/aom/av1/encoder/aq_variance.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <math.h> + +#include "aom_ports/mem.h" + +#include "av1/encoder/aq_variance.h" +#include "av1/common/seg_common.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/dwt.h" + +static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0, + 0.9, .8, .7, .6 }; + +static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0, + 0.75, 1.0, 1.0, 1.0 }; +#define ENERGY_MIN (-4) +#define ENERGY_MAX (1) +#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) +#define ENERGY_IN_BOUNDS(energy) \ + assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) + +DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 }; + +DECLARE_ALIGNED(16, static const uint16_t, + av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; + +static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; + +#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] + +void av1_vaq_frame_setup(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + const int base_qindex = cm->quant_params.base_qindex; + struct segmentation *seg = &cm->seg; + int i; + + int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + int avg_energy = (int)(cpi->twopass_frame.mb_av_energy - 2); + double avg_ratio; + if (avg_energy > 7) avg_energy = 7; + if (avg_energy < 0) avg_energy = 0; + avg_ratio = rate_ratio[avg_energy]; + + if (resolution_change) { + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_clearall_segfeatures(seg); + av1_disable_segmentation(seg); + return; + } + if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || + refresh_frame->alt_ref_frame || + (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + cpi->vaq_refresh = 1; + + av1_enable_segmentation(seg); + av1_clearall_segfeatures(seg); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + // Set up avg segment id to be 1.0 and adjust the other segments around + // it. + int qindex_delta = + av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, + base_qindex, rate_ratio[i] / avg_ratio); + + // We don't allow qindex 0 in a segment if the base value is not 0. + // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { + qindex_delta = -base_qindex + 1; + } + + av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta); + av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } + } +} + +int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + // This function returns a score for the block's local variance as calculated + // by: sum of the log of the (4x4 variances) of each subblock of the current + // block (x,bs) + // * 32 / number of pixels in the block_size. + // This is used for segmentation to avoid situations in which a large + // block with a gentle gradient gets marked high variance even though each + // subblock has a low variance. This allows us to assign the same segment + // number for the same sorts of area regardless of how the partitioning goes. + + MACROBLOCKD *xd = &x->e_mbd; + double var = 0; + unsigned int sse; + int i, j; + + int right_overflow = + (xd->mb_to_right_edge < 0) ?
((-xd->mb_to_right_edge) >> 3) : 0; + int bottom_overflow = + (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0; + + const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow; + const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow; + + for (i = 0; i < bh; i += 4) { + for (j = 0; j < bw; j += 4) { + if (is_cur_buf_hbd(xd)) { + var += log1p(cpi->ppi->fn_ptr[BLOCK_4X4].vf( + x->plane[0].src.buf + i * x->plane[0].src.stride + j, + x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) / + 16.0); + } else { + var += log1p(cpi->ppi->fn_ptr[BLOCK_4X4].vf( + x->plane[0].src.buf + i * x->plane[0].src.stride + j, + x->plane[0].src.stride, av1_all_zeros, 0, &sse) / + 16.0); + } + } + } + // Use the average of the 4x4 log variances. The range for 8 bit is 0 - 9.704121561. + var /= (bw / 4 * bh / 4); + if (var > 7) var = 7; + + return (int)(var); +} + +int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, + int mi_row, int mi_col) { + // This function returns the average of the luma block. + unsigned int sum, avg, num_pix; + int r, c; + const int pic_w = cpi->common.width; + const int pic_h = cpi->common.height; + const int bw = MI_SIZE * mi_size_wide[bs]; + const int bh = MI_SIZE * mi_size_high[bs]; + const uint16_t *x16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf); + + sum = 0; + num_pix = 0; + avg = 0; + int row = mi_row << MI_SIZE_LOG2; + int col = mi_col << MI_SIZE_LOG2; + for (r = row; (r < (row + bh)) && (r < pic_h); r++) { + for (c = col; (c < (col + bw)) && (c < pic_w); c++) { + sum += *(x16 + r * x->plane[0].src.stride + c); + num_pix++; + } + } + if (num_pix != 0) { + avg = sum / num_pix; + } + return avg; +} + +#define DEFAULT_E_MIDPOINT 10.0 + +static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) { + MACROBLOCKD *xd = &x->e_mbd; + int stride = x->plane[0].src.stride; + uint8_t *buf = x->plane[0].src.buf; + const int num_8x8_cols = block_size_wide[bs] / 8; + const int num_8x8_rows = block_size_high[bs] / 8; + const int hbd = is_cur_buf_hbd(xd); + + int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows, + num_8x8_cols); + + return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; +} + +static double log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) { + unsigned int haar_sad = haar_ac_energy(x, bs); + return log1p(haar_sad); +} + +int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { + double energy, energy_midpoint; + energy_midpoint = (is_stat_consumption_stage_twopass(cpi)) + ?
cpi->twopass_frame.frame_avg_haar_energy + : DEFAULT_E_MIDPOINT; + energy = log_block_wavelet_energy(x, bs) - energy_midpoint; + return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); +} + +int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi, + int block_var_level) { + int rate_level; + const AV1_COMMON *const cm = &cpi->common; + + if (DELTA_Q_PERCEPTUAL_MODULATION == 1) { + ENERGY_IN_BOUNDS(block_var_level); + rate_level = SEGMENT_ID(block_var_level); + } else { + rate_level = block_var_level; + } + const int base_qindex = cm->quant_params.base_qindex; + int qindex_delta = + av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, base_qindex, + deltaq_rate_ratio[rate_level]); + + if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { + qindex_delta = -base_qindex + 1; + } + return base_qindex + qindex_delta; +} diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h new file mode 100644 index 0000000000..aa0535ad72 --- /dev/null +++ b/third_party/aom/av1/encoder/aq_variance.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_ +#define AOM_AV1_ENCODER_AQ_VARIANCE_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_vaq_frame_setup(AV1_COMP *cpi); + +int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); +int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs, + int mi_row, int mi_col); +int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi, + int block_var_level); +int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_ diff --git a/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c new file mode 100644 index 0000000000..91fc1e00a5 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#if defined(_MSC_VER) && !defined(__clang__) +#include <intrin.h> +#else +#include <arm_acle.h> +#endif + +#include <stddef.h> +#include <stdint.h> + +#include "config/aom_config.h" + +#define CRC_LOOP(op, crc, type, buf, len) \ + while ((len) >= sizeof(type)) { \ + (crc) = op((crc), *(type *)(buf)); \ + (len) -= sizeof(type); \ + buf += sizeof(type); \ + } + +#define CRC_SINGLE(op, crc, type, buf, len) \ + if ((len) >= sizeof(type)) { \ + (crc) = op((crc), *(type *)(buf)); \ + (len) -= sizeof(type); \ + buf += sizeof(type); \ + } + +/* Return 32-bit CRC for the input buffer. + * Polynomial is 0x1EDC6F41. + */ + +uint32_t av1_get_crc32c_value_arm_crc32(void *crc_calculator, uint8_t *p, + size_t len) { + (void)crc_calculator; + const uint8_t *buf = p; + uint32_t crc = 0xFFFFFFFF; + +#if !AOM_ARCH_AARCH64 + // Align input to 8-byte boundary (only necessary for 32-bit builds.) + while (len && ((uintptr_t)buf & 7)) { + crc = __crc32cb(crc, *buf++); + len--; + } +#endif + + CRC_LOOP(__crc32cd, crc, uint64_t, buf, len) + CRC_SINGLE(__crc32cw, crc, uint32_t, buf, len) + CRC_SINGLE(__crc32ch, crc, uint16_t, buf, len) + CRC_SINGLE(__crc32cb, crc, uint8_t, buf, len) + + return ~crc; +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c new file mode 100644 index 0000000000..26d06b46fe --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // By operating on unsigned integers we can store up to 4 squared diff in a + // 32-bit element before having to widen to 64 bits. + uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + // We can't do the same here as we're operating on signed integers, so we + // can only accumulate 2 squares.
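+ // (Each signed 16-bit square can be up to 2^30, so only two such products + // are accumulated per 32-bit lane before widening with vpadalq_s32.)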
+ int32x4_t ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + int32x4_t ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_s64x2(ssz_s64); + return (int64_t)horizontal_add_u64x2(err_u64); +} + +int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, + int block_size) { + uint64x2_t err_u64 = vdupq_n_u64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = vld1q_s16(coeff + 8); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // By operating on unsigned integers we can store up to 4 squared diff in a + // 32-bit element before having to widen to 64 bits. + uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return (int64_t)horizontal_add_u64x2(err_u64); +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c new file mode 100644 index 0000000000..63aad0b785 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/dot_sve.h" +#include "aom_dsp/arm/mem_neon.h" + +int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int64x2_t sqcoeff[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const int16x8_t diff0 = vsubq_s16(c0, d0); + const int16x8_t diff1 = vsubq_s16(c1, d1); + + error[0] = aom_sdotq_s16(error[0], diff0, diff0); + error[1] = aom_sdotq_s16(error[1], diff1, diff1); + sqcoeff[0] = aom_sdotq_s16(sqcoeff[0], c0, c0); + sqcoeff[1] = aom_sdotq_s16(sqcoeff[1], c1, c1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = vaddvq_s64(vaddq_s64(sqcoeff[0], sqcoeff[1])); + return vaddvq_s64(vaddq_s64(error[0], error[1])); +} + +int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff, + int block_size) { + if (block_size % 32 == 0) { + int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = vld1q_s16(coeff + 8); + const int16x8_t c2 = vld1q_s16(coeff + 16); + const int16x8_t c3 = vld1q_s16(coeff + 24); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + const int16x8_t d2 = vld1q_s16(dqcoeff + 16); + const int16x8_t d3 = vld1q_s16(dqcoeff + 24); + + const int16x8_t diff0 = vsubq_s16(c0, d0); + const int16x8_t diff1 = vsubq_s16(c1, d1); + const int16x8_t diff2 = vsubq_s16(c2, d2); + const int16x8_t diff3 = vsubq_s16(c3, d3); + + error[0] = aom_sdotq_s16(error[0], diff0, diff0); + error[1] = aom_sdotq_s16(error[1], diff1, diff1); + error[2] = aom_sdotq_s16(error[2], diff2, diff2); + error[3] = aom_sdotq_s16(error[3], diff3, diff3); + + coeff += 32; + dqcoeff += 32; + block_size -= 32; + } while (block_size != 0); + + error[0] = vaddq_s64(error[0], error[1]); + error[2] = vaddq_s64(error[2], error[3]); + error[0] = vaddq_s64(error[0], error[2]); + return vaddvq_s64(error[0]); + } + assert(block_size == 16); + + int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = vld1q_s16(coeff + 8); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + + const int16x8_t diff0 = vsubq_s16(c0, d0); + const int16x8_t diff1 = vsubq_s16(c1, d1); + + error[0] = aom_sdotq_s16(error[0], diff0, diff0); + error[1] = aom_sdotq_s16(error[1], diff1, diff1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return vaddvq_s64(vaddq_s64(error[0], error[1])); +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c new file mode 100644 index 0000000000..5148ee74a9 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c @@ -0,0 +1,3090 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "shift_neon.h" +#include "txfm_neon.h" + +#define TXFM_COS_BIT_MAX 13 + +// A note on butterfly helper naming: +// +// butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon +// e.g. butterfly_s32_s32_x4_0231_neon +// | | | ^ Weights are applied as indices 0, 2, 3, 1 +// | | | (see more detail below) +// | | ^ (int32)x4 input/output parameters +// | ^ 32-bit accumulators internally +// ^ 32-bit input/output parameters +// +// Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to +// avoid needing separate negation instructions. This is represented in the +// helper naming by referring to the lane index in the loaded tuple that each +// multiply is performed with: +// +// in0 in1 +// /---------- +// out0 | w0 w1 ==> out0 = in0 * w0 + in1 * w1 +// out1 | w2 w3 ==> out1 = in0 * w2 + in1 * w3 +// +// So for indices 0231 from the earlier example, we end up with: +// +// in0 in1 +// /------------------ +// out0 | (lane 0) (lane 2) ==> out0 = in0 * w0 + in1 * -w0 +// out1 | (lane 3) (lane 1) ==> out1 = in0 * (w0-1) + in1 * (1-w0) + +static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon( + const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1, + int32x4_t *out0, int32x4_t *out1) { + int32x4_t w0101 = vmovl_s16(w0101_s16); + int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0); + o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1); + int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1); + o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0); + *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX); + *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX); +} + +static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon( + const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1, + int32x4_t *out0, int32x4_t *out1) { + int32x4_t w0101 = vmovl_s16(w0101_s16); + int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0); + o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1); + int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1); + o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0); + *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX); + *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX); +} + +static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon( + const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1, + int32x4_t *out0, int32x4_t *out1) { + int32x4_t w0101 = vmovl_s16(w0101_s16); + int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1); + o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0); + int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0); + o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1); + *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX); + *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX); +} + +static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon( + const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1, + int32x4_t *out0, int32x4_t *out1) { + int32x4_t
w0101 = vmovl_s16(w0101_s16); + int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1); + o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0); + int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0); + o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1); + *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX); + *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX); +} + +#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \ + out0, out1) \ + do { \ + int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0); \ + u0 = vmlal_lane_s16(u0, in1, wvec, lane1); \ + int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2); \ + v0 = vmlal_lane_s16(v0, in1, wvec, lane3); \ + *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \ + *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \ + } while (0) + +static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon( + const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, + int16x4_t *out0, int16x4_t *out1) { + butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon( + const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, + int16x4_t *out0, int16x4_t *out1) { + butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon( + const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, + int16x4_t *out0, int16x4_t *out1) { + butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon( + const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1, + int16x4_t *out0, int16x4_t *out1) { + butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1); +} + +#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \ + out0, out1) \ + do { \ + int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0); \ + u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1); \ + int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0); \ + u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1); \ + int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2); \ + v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3); \ + int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2); \ + v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3); \ + const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \ + const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX); \ + const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \ + const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX); \ + *out0 = vcombine_s16(c0, c1); \ + *out1 = vcombine_s16(d0, d1); \ + } while (0) + +static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon( + const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, + int16x8_t *out0, int16x8_t *out1) { + butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon( + const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, + int16x8_t *out0, int16x8_t *out1) { + butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon( + const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1, + int16x8_t *out0, int16x8_t *out1) { + butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon( + const int16x4_t w0101, const int16x8_t in0, const int16x8_t 
in1, + int16x8_t *out0, int16x8_t *out1) { + butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1); +} + +static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out, + int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out, + int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8( + int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2, + const int stride, const int out_size) { + for (int i = 0; i < out_size; ++i) { + vst1q_s32(out + stride * i, in1[i]); + vst1q_s32(out + stride * i + 4, in2[i]); + } +} + +static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in, + const int stride, + int16x4_t *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = vld1_s16(in); + in += stride; + } +} + +static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride, + int16x8_t *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = vld1q_s16(in + i * stride); + } +} + +static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + vst1q_s32(out + i * stride, vmovl_s16(in[i])); + } +} + +static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i]))); + vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i]))); + } +} + +// A note on naming: +// round_shift_[sqrt2]_s16_s32_4x1_neon(...) 
+// | | | ^ 1 => a single vector +// | | | n => an array of vectors +// | | | ^ input/output vector element count +// | | ^ output type +// | ^ input type +// ^ multiplicand and shift identifier + +static AOM_FORCE_INLINE int16x4_t +round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) { + return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits); +} + +static AOM_FORCE_INLINE int16x8_t +round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) { + return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)), + round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a))); +} + +static AOM_FORCE_INLINE int16x4_t +round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) { + return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits); +} + +static AOM_FORCE_INLINE int16x8_t +round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) { + return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)), + round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a))); +} + +static AOM_FORCE_INLINE int32x4_t +round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) { + return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits); +} + +static AOM_FORCE_INLINE int32x4_t +round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) { + return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits); +} + +#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn) \ + static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \ + for (int i = 0; i < size; ++i) { \ + out[i] = fn(in[i]); \ + } \ + } + +ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t, + int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon) +ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t, + int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon) +ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t, + int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon) +ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t, + int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon) +ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t, + int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon) + +static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i])); + } +} + +static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + vst1q_s32(out + i * stride + 0, + round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i]))); + vst1q_s32(out + i * stride + 4, + round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i]))); + } +} + +static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + int32x4_t u[6], v[6]; + const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit)); + const int16x4_t u01 = vqadd_s16(input[0], input[1]); + + v[5] = vmull_lane_s16(input[2], sinpi, 2); + v[0] = vmull_lane_s16(input[1], sinpi, 1); + v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0); + v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3); + v[2] = vmull_lane_s16(u01, sinpi, 2); + v[3] = vmull_lane_s16(input[0], sinpi, 3); + v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0); + v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1); + + u[0] = vaddq_s32(v[0], v[1]); + u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2); + u[2] = vsubq_s32(v[3], v[4]); + u[3] = vsubq_s32(u[2], u[0]); + 
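+ // The vmlaq_n_s32() below adds 3 * v[5] (i.e. 3 * sinpi[2] * input[2]) to + // complete the final ADST4 output term.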
u[3] = vmlaq_n_s32(u[3], v[5], 3); + + output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX); + output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX); + output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX); + output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX); +} + +static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + + // stage 1-2 + int16x4_t x2[8]; + butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]); + butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]); + + // stage 3 + int16x4_t x3[8]; + x3[0] = vqadd_s16(input[0], x2[2]); + x3[1] = vqsub_s16(x2[3], input[7]); + x3[2] = vqsub_s16(input[0], x2[2]); + x3[3] = vqadd_s16(input[7], x2[3]); + x3[4] = vqsub_s16(x2[6], input[1]); + x3[5] = vqadd_s16(input[6], x2[7]); + x3[6] = vqadd_s16(input[1], x2[6]); + x3[7] = vqsub_s16(input[6], x2[7]); + + // stage 4 + int16x4_t x4[8]; + butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]); + + // stage 5 + int16x4_t x5[8]; + x5[0] = vqadd_s16(x3[0], x4[4]); + x5[1] = vqadd_s16(x3[1], x4[5]); + x5[2] = vqadd_s16(x3[2], x4[6]); + x5[3] = vqsub_s16(x4[7], x3[3]); + x5[4] = vqsub_s16(x3[0], x4[4]); + x5[5] = vqsub_s16(x3[1], x4[5]); + x5[6] = vqsub_s16(x3[2], x4[6]); + x5[7] = vqadd_s16(x3[3], x4[7]); + + // stage 6-7 + butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]); + butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]); + butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]); + butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]); +} + +static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + int32x4_t u_lo[4], u_hi[4]; + const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit)); + const int16x8_t u01 = vqaddq_s16(input[0], input[1]); + + u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1); + u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1); + + u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0); + u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0); + + u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3); + u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3); + + u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2); + u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2); + + u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2); + u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2); + + u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3); + u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3); + + u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0); + u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0); + + u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1); + 
u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1); + + u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2); + u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2); + + u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2); + u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2); + + u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]); + u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]); + + const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3); + u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2); + u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2); + + output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX), + vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX)); + output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX), + vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX)); + output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX), + vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX)); + output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX), + vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX)); +} + +static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]); + + int16x4_t in12a = vadd_s16(input[1], input[2]); + int16x4_t in12s = vsub_s16(input[1], input[2]); + int16x4_t in03a = vadd_s16(input[0], input[3]); + int16x4_t in03s = vsub_s16(input[0], input[3]); + + int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]); + int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]); + + int32x4_t u[4]; + u[0] = vaddq_s32(u0ad1, u0ad2); + u[1] = vsubq_s32(u0ad2, u0ad1); + u[2] = vmull_lane_s16(in12s, cospi16, 1); + u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0); + u[3] = vmull_lane_s16(in03s, cospi16, 1); + u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0); + + output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX); + output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX); + output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX); + output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX); +} + +// Butterfly pre-processing: +// e.g. n=4: +// out[0] = in[0] + in[3] +// out[1] = in[1] + in[2] +// out[2] = in[1] - in[2] +// out[3] = in[0] - in[3] + +static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input, + int16x4_t *output, + int n) { + for (int i = 0; i < n / 2; ++i) { + output[i] = vqadd_s16(input[i], input[n - i - 1]); + } + for (int i = 0; i < n / 2; ++i) { + output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]); + } +} + +static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input, + int16x8_t *output, + int n) { + for (int i = 0; i < n / 2; ++i) { + output[i] = vqaddq_s16(input[i], input[n - i - 1]); + } + for (int i = 0; i < n / 2; ++i) { + output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]); + } +} + +static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input, + int32x4_t *output, + int n) { + for (int i = 0; i < n / 2; ++i) { + output[i] = vqaddq_s32(input[i], input[n - i - 1]); + } + for (int i = 0; i < n / 2; ++i) { + output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]); + } +} + +// Butterfly post-processing: +// e.g. 
n=8: +// out[0] = in0[0] + in1[3]; +// out[1] = in0[1] + in1[2]; +// out[2] = in0[1] - in1[2]; +// out[3] = in0[0] - in1[3]; +// out[4] = in0[7] - in1[4]; +// out[5] = in0[6] - in1[5]; +// out[6] = in0[6] + in1[5]; +// out[7] = in0[7] + in1[4]; + +static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0, + const int16x4_t *in1, + int16x4_t *output, + int n) { + for (int i = 0; i < n / 4; ++i) { + output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[(3 * n) / 4 + i] = + vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); + } +} + +static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0, + const int16x8_t *in1, + int16x8_t *output, + int n) { + for (int i = 0; i < n / 4; ++i) { + output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[(3 * n) / 4 + i] = + vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); + } +} + +static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0, + const int32x4_t *in1, + int32x4_t *output, + int n) { + for (int i = 0; i < n / 4; ++i) { + output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[(3 * n) / 4 + i] = + vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); + } +} + +static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + + // stage 1 + int16x8_t x1[4]; + butterfly_dct_pre_s16_x8(input, x1, 4); + + // stage 2 + int16x8_t x2[4]; + butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]); + butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]); + + // stage 3 + output[0] = x2[0]; + output[1] = x2[2]; + output[2] = x2[1]; + output[3] = x2[3]; +} + +static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + + // stage 1 + int16x4_t x1[8]; + butterfly_dct_pre_s16_x4(input, x1, 8); + + // stage 2 + int16x4_t x2[8]; + butterfly_dct_pre_s16_x4(x1, x2, 4); + butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]); + + // stage 3 + int16x4_t x3[8]; + butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]); + butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], 
&output[6]); + butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4); + + // stage 4-5 + butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]); + butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]); +} + +static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + + // stage 1 + int16x8_t x1[8]; + butterfly_dct_pre_s16_x8(input, x1, 8); + + // stage 2 + int16x8_t x2[8]; + butterfly_dct_pre_s16_x8(x1, x2, 4); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]); + + // stage 3 + int16x8_t x3[8]; + butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]); + butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]); + butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4); + + // stage 4-5 + butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]); + butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]); +} + +static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + + // stage 1 + int16x4_t x1[16]; + butterfly_dct_pre_s16_x4(input, x1, 16); + + // stage 2 + int16x4_t x2[16]; + butterfly_dct_pre_s16_x4(x1, x2, 8); + butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]); + butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]); + + // stage 3 + int16x4_t x3[16]; + butterfly_dct_pre_s16_x4(x2, x3, 4); + butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]); + butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8); + + // stage 4 + int16x4_t x4[16]; + butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4], + &output[12]); + butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]); + butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]); + + // stage 5 + int16x4_t x5[16]; + butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]); + butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10], + &output[6]); + butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4); + butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4); + + // stage 6-7 + butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1], + &output[15]); + 
butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9], + &output[7]); + butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5], + &output[11]); + butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13], + &output[3]); +} + +static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + + // stage 1 + int16x8_t x1[16]; + butterfly_dct_pre_s16_x8(input, x1, 16); + + // stage 2 + int16x8_t x2[16]; + butterfly_dct_pre_s16_x8(x1, x2, 8); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]); + + // stage 3 + int16x8_t x3[16]; + butterfly_dct_pre_s16_x8(x2, x3, 4); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]); + butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8); + + // stage 4 + int16x8_t x4[16]; + butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4], + &output[12]); + butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]); + + // stage 5 + int16x8_t x5[16]; + butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]); + butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10], + &output[6]); + butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4); + butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4); + + // stage 6-7 + butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1], + &output[15]); + butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9], + &output[7]); + butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5], + &output[11]); + butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13], + &output[3]); +} + +static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const 
int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + + // stage 1 + int16x8_t x1[32]; + butterfly_dct_pre_s16_x8(input, x1, 32); + + // stage 2 + int16x8_t x2[32]; + butterfly_dct_pre_s16_x8(x1, x2, 16); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]); + + // stage 3 + int16x8_t x3[32]; + butterfly_dct_pre_s16_x8(x2, x3, 8); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]); + butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16); + + // stage 4 + int16x8_t x4[32]; + butterfly_dct_pre_s16_x8(x3, x4, 4); + butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]); + butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]); + + // stage 5 + int16x8_t x5[32]; + butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0], + &output[16]); + butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8], + &output[24]); + butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4); + butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]); + butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]); + butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8); + butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8); + + // stage 6 + int16x8_t x6[32]; + butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20], + &output[12]); + butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4); + butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4); + butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]); + butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]); + butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]); + + // stage 7 + int16x8_t x7[32]; + butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2], + &output[30]); + butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18], + &output[14]); + butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10], + &output[22]); + butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26], + &output[6]); + butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4); + butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4); + butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4); + 
butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4); + + butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1], + &output[31]); + butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17], + &output[15]); + butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9], + &output[23]); + butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25], + &output[7]); + butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5], + &output[27]); + butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21], + &output[11]); + butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13], + &output[19]); + butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29], + &output[3]); +} + +static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]); + const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]); + const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]); + const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]); + const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]); + const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]); + const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]); + const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + const int16x4_t cospi1 = vget_low_s16(cospi1_3); + const int16x4_t cospi3 = vget_high_s16(cospi1_3); + const int16x4_t cospi5 = vget_low_s16(cospi5_7); + const int16x4_t cospi7 = vget_high_s16(cospi5_7); + const int16x4_t cospi9 = vget_low_s16(cospi9_11); + const int16x4_t cospi11 = vget_high_s16(cospi9_11); + const int16x4_t cospi13 = vget_low_s16(cospi13_15); + const int16x4_t cospi15 = vget_high_s16(cospi13_15); + const int16x4_t cospi17 = vget_low_s16(cospi17_19); + const int16x4_t cospi19 = vget_high_s16(cospi17_19); + const int16x4_t cospi21 = vget_low_s16(cospi21_23); + const int16x4_t cospi23 = vget_high_s16(cospi21_23); + const int16x4_t cospi25 = vget_low_s16(cospi25_27); + const int16x4_t cospi27 = vget_high_s16(cospi25_27); + const int16x4_t cospi29 = vget_low_s16(cospi29_31); + const int16x4_t cospi31 = vget_high_s16(cospi29_31); + + // stage 
1 + int16x8_t x1[64]; + butterfly_dct_pre_s16_x8(input, x1, 64); + + // stage 2 + int16x8_t x2[64]; + butterfly_dct_pre_s16_x8(x1, x2, 32); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]); + butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]); + + // stage 3 + int16x8_t x3[64]; + butterfly_dct_pre_s16_x8(x2, x3, 16); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]); + butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32); + + // stage 4 + int16x8_t x4[64]; + butterfly_dct_pre_s16_x8(x3, x4, 8); + butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]); + butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]); + butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]); + butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]); + + // stage 5 + int16x8_t x5[64]; + butterfly_dct_pre_s16_x8(x4, x5, 4); + butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]); + butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8); + butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]); + butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]); + butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]); + butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]); + butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16); + butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16); + + // stage 6 + int16x8_t x6[64]; + butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]); + butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]); + butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4); + butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]); + butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]); + butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8); + butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8); + butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]); + butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]); + 
butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]); + butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]); + butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]); + butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]); + + // stage 7 + int16x8_t x7[64]; + butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]); + butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]); + butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4); + butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4); + butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]); + butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]); + butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]); + butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]); + butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8); + butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8); + butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8); + butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8); + + // stage 8 + int16x8_t x8[64]; + butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]); + butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]); + butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]); + butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]); + butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4); + butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4); + butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4); + butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4); + butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]); + butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]); + butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]); + butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]); + butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]); + butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]); + butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]); + butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]); + + // stage 9 + int16x8_t x9[64]; + butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]); + butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]); + butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]); + butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]); + butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]); + butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]); + butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]); + butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]); + butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4); + butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4); + butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4); + butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4); + butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4); + butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4); + butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4); + 
butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4); + + // stage 10 + butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1], + &output[63]); + butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33], + &output[31]); + butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17], + &output[47]); + butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49], + &output[15]); + butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9], + &output[55]); + butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41], + &output[23]); + butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25], + &output[39]); + butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57], + &output[7]); + butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5], + &output[59]); + butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37], + &output[27]); + butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21], + &output[43]); + butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53], + &output[11]); + butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13], + &output[51]); + butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45], + &output[19]); + butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29], + &output[35]); + butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61], + &output[3]); + + // stage 11 + output[0] = x6[0]; + output[2] = x9[16]; + output[4] = x8[8]; + output[6] = x9[24]; + output[8] = x7[4]; + output[10] = x9[20]; + output[12] = x8[12]; + output[14] = x9[28]; + output[16] = x6[2]; + output[18] = x9[18]; + output[20] = x8[10]; + output[22] = x9[26]; + output[24] = x7[6]; + output[26] = x9[22]; + output[28] = x8[14]; + output[30] = x9[30]; + output[32] = x6[1]; + output[34] = x9[17]; + output[36] = x8[9]; + output[38] = x9[25]; + output[40] = x7[5]; + output[42] = x9[21]; + output[44] = x8[13]; + output[46] = x9[29]; + output[48] = x6[3]; + output[50] = x9[19]; + output[52] = x8[11]; + output[54] = x9[27]; + output[56] = x7[7]; + output[58] = x9[23]; + output[60] = x8[15]; + output[62] = x9[31]; +} + +static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + + // stage 2 + int16x8_t x2[8]; + butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]); + butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]); + + // stage 3 + int16x8_t x3[8]; + x3[0] = vqaddq_s16(input[0], x2[2]); + x3[1] = vqsubq_s16(x2[3], input[7]); + x3[2] = vqsubq_s16(input[0], x2[2]); + x3[3] = vqaddq_s16(input[7], x2[3]); + x3[4] = vqsubq_s16(x2[6], input[1]); + x3[5] = vqaddq_s16(input[6], x2[7]); + x3[6] = vqaddq_s16(input[1], x2[6]); + x3[7] = vqsubq_s16(input[6], x2[7]); + + // stage 4 + butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]); + + // stage 5 + int16x8_t 
x5[8]; + x5[0] = vqaddq_s16(x3[0], x3[4]); + x5[1] = vqaddq_s16(x3[1], x3[5]); + x5[2] = vqaddq_s16(x3[2], x3[6]); + x5[3] = vqsubq_s16(x3[7], x3[3]); + x5[4] = vqsubq_s16(x3[0], x3[4]); + x5[5] = vqsubq_s16(x3[1], x3[5]); + x5[6] = vqsubq_s16(x3[2], x3[6]); + x5[7] = vqaddq_s16(x3[3], x3[7]); + + // stage 6 + butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]); + butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]); + butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]); + butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]); +} + +static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + + // stage 2 + int16x4_t x2[8]; + butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]); + butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]); + butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]); + butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]); + + // stage 3 + int16x4_t x3[16]; + x3[0] = vqadd_s16(input[0], x2[0]); + x3[1] = vqsub_s16(x2[1], input[15]); + x3[2] = vqsub_s16(input[0], x2[0]); + x3[3] = vqadd_s16(input[15], x2[1]); + x3[4] = vqsub_s16(x2[2], input[3]); + x3[5] = vqadd_s16(input[12], x2[3]); + x3[6] = vqadd_s16(input[3], x2[2]); + x3[7] = vqsub_s16(input[12], x2[3]); + x3[8] = vqsub_s16(x2[4], input[1]); + x3[9] = vqadd_s16(input[14], x2[5]); + x3[10] = vqadd_s16(input[1], x2[4]); + x3[11] = vqsub_s16(input[14], x2[5]); + x3[12] = vqadd_s16(input[2], x2[6]); + x3[13] = vqsub_s16(x2[7], input[13]); + x3[14] = vqsub_s16(input[2], x2[6]); + x3[15] = vqadd_s16(input[13], x2[7]); + + // stage 4 + butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]); + butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]); + butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]); + + // stage 5 + int16x4_t x5[16]; + x5[0] = vqadd_s16(x3[0], x3[4]); + x5[1] = vqadd_s16(x3[1], x3[5]); + x5[2] = vqadd_s16(x3[2], x3[6]); + x5[3] = vqsub_s16(x3[7], x3[3]); + x5[4] = vqsub_s16(x3[0], x3[4]); + x5[5] = vqsub_s16(x3[1], x3[5]); + x5[6] = vqsub_s16(x3[2], x3[6]); + x5[7] = vqadd_s16(x3[3], x3[7]); + x5[8] = vqadd_s16(x3[8], x3[12]); + x5[9] = vqadd_s16(x3[9], x3[13]); + x5[10] = vqsub_s16(x3[14], x3[10]); + x5[11] = 
vqadd_s16(x3[11], x3[15]); + x5[12] = vqsub_s16(x3[8], x3[12]); + x5[13] = vqsub_s16(x3[9], x3[13]); + x5[14] = vqadd_s16(x3[10], x3[14]); + x5[15] = vqsub_s16(x3[11], x3[15]); + + // stage 6 + butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]); + butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]); + butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]); + butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]); + + // stage 7 + int16x4_t x7[16]; + x7[0] = vqadd_s16(x5[0], x5[8]); + x7[1] = vqadd_s16(x5[1], x5[9]); + x7[2] = vqadd_s16(x5[2], x5[10]); + x7[3] = vqadd_s16(x5[3], x5[11]); + x7[4] = vqadd_s16(x5[4], x5[12]); + x7[5] = vqadd_s16(x5[5], x5[13]); + x7[6] = vqadd_s16(x5[6], x5[14]); + x7[7] = vqsub_s16(x5[15], x5[7]); + x7[8] = vqsub_s16(x5[0], x5[8]); + x7[9] = vqsub_s16(x5[1], x5[9]); + x7[10] = vqsub_s16(x5[2], x5[10]); + x7[11] = vqsub_s16(x5[3], x5[11]); + x7[12] = vqsub_s16(x5[4], x5[12]); + x7[13] = vqsub_s16(x5[5], x5[13]); + x7[14] = vqsub_s16(x5[6], x5[14]); + x7[15] = vqadd_s16(x5[7], x5[15]); + + // stage 8 + butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]); + butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13], + &output[2]); + butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11], + &output[4]); + butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]); + butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]); + butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5], + &output[10]); + butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3], + &output[12]); + butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14], + &output[1]); +} + +static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + + // stage 2 + int16x8_t x2[8]; + butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]); + butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]); + butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]); + butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]); + + // stage 3 + int16x8_t x3[16]; + x3[0] = vqaddq_s16(input[0], x2[0]); + x3[1] = vqsubq_s16(x2[1], input[15]); + x3[2] = vqsubq_s16(input[0], x2[0]); + x3[3] = vqaddq_s16(input[15], x2[1]); + x3[4] = vqsubq_s16(x2[2], input[3]); + x3[5] = 
vqaddq_s16(input[12], x2[3]); + x3[6] = vqaddq_s16(input[3], x2[2]); + x3[7] = vqsubq_s16(input[12], x2[3]); + x3[8] = vqsubq_s16(x2[4], input[1]); + x3[9] = vqaddq_s16(input[14], x2[5]); + x3[10] = vqaddq_s16(input[1], x2[4]); + x3[11] = vqsubq_s16(input[14], x2[5]); + x3[12] = vqaddq_s16(input[2], x2[6]); + x3[13] = vqsubq_s16(x2[7], input[13]); + x3[14] = vqsubq_s16(input[2], x2[6]); + x3[15] = vqaddq_s16(input[13], x2[7]); + + // stage 4 + butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]); + butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]); + butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]); + + // stage 5 + int16x8_t x5[16]; + x5[0] = vqaddq_s16(x3[0], x3[4]); + x5[1] = vqaddq_s16(x3[1], x3[5]); + x5[2] = vqaddq_s16(x3[2], x3[6]); + x5[3] = vqsubq_s16(x3[7], x3[3]); + x5[4] = vqsubq_s16(x3[0], x3[4]); + x5[5] = vqsubq_s16(x3[1], x3[5]); + x5[6] = vqsubq_s16(x3[2], x3[6]); + x5[7] = vqaddq_s16(x3[3], x3[7]); + x5[8] = vqaddq_s16(x3[8], x3[12]); + x5[9] = vqaddq_s16(x3[9], x3[13]); + x5[10] = vqsubq_s16(x3[14], x3[10]); + x5[11] = vqaddq_s16(x3[11], x3[15]); + x5[12] = vqsubq_s16(x3[8], x3[12]); + x5[13] = vqsubq_s16(x3[9], x3[13]); + x5[14] = vqaddq_s16(x3[10], x3[14]); + x5[15] = vqsubq_s16(x3[11], x3[15]); + + // stage 6 + butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]); + butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]); + butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]); + + // stage 7 + int16x8_t x7[16]; + x7[0] = vqaddq_s16(x5[0], x5[8]); + x7[1] = vqaddq_s16(x5[1], x5[9]); + x7[2] = vqaddq_s16(x5[2], x5[10]); + x7[3] = vqaddq_s16(x5[3], x5[11]); + x7[4] = vqaddq_s16(x5[4], x5[12]); + x7[5] = vqaddq_s16(x5[5], x5[13]); + x7[6] = vqaddq_s16(x5[6], x5[14]); + x7[7] = vqsubq_s16(x5[15], x5[7]); + x7[8] = vqsubq_s16(x5[0], x5[8]); + x7[9] = vqsubq_s16(x5[1], x5[9]); + x7[10] = vqsubq_s16(x5[2], x5[10]); + x7[11] = vqsubq_s16(x5[3], x5[11]); + x7[12] = vqsubq_s16(x5[4], x5[12]); + x7[13] = vqsubq_s16(x5[5], x5[13]); + x7[14] = vqsubq_s16(x5[6], x5[14]); + x7[15] = vqaddq_s16(x5[7], x5[15]); + + // stage 8 + butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]); + butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13], + &output[2]); + butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11], + &output[4]); + butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]); + butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]); + butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5], + &output[10]); + butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3], + &output[12]); + butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14], + &output[1]); +} + +static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input, + int16x4_t *const output, + const int cos_bit) { + (void)cos_bit; + round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4); +} + +static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input, + int16x8_t *const output, + const int cos_bit) { + (void)cos_bit; + round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4); +} + +static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input, + int16x4_t *output, int cos_bit) { + (void)cos_bit; + 
shift_left_1_s16_x4(input, output, 8); +} + +static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input, + int16x8_t *output, int cos_bit) { + (void)cos_bit; + shift_left_1_s16_x8(input, output, 8); +} + +static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input, + int16x4_t *output, + int cos_bit) { + (void)cos_bit; + round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16); +} + +static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input, + int16x8_t *output, + int cos_bit) { + (void)cos_bit; + round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16); +} + +static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input, + int16x8_t *output, + int cos_bit) { + (void)cos_bit; + shift_left_2_s16_x8(input, output, 32); +} + +#define TRANSFORM_COL(name, tw, n) \ + static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \ + int stride, int cos_bit) { \ + int16x##tw##_t buf0[n]; \ + load_buffer_s16_x##tw(input, stride, buf0, n); \ + shift_left_2_s16_x##tw(buf0, buf0, n); \ + name##_neon(buf0, output, cos_bit); \ + } + +TRANSFORM_COL(fadst4x4, 4, 4) +TRANSFORM_COL(fadst4x8, 4, 8) +TRANSFORM_COL(fadst4x16, 4, 16) +TRANSFORM_COL(fadst8x4, 8, 4) +TRANSFORM_COL(fadst8x8, 8, 8) +TRANSFORM_COL(fadst8x16, 8, 16) +TRANSFORM_COL(fdct4x4, 4, 4) +TRANSFORM_COL(fdct4x8, 4, 8) +TRANSFORM_COL(fdct4x16, 4, 16) +TRANSFORM_COL(fdct8x4, 8, 4) +TRANSFORM_COL(fdct8x8, 8, 8) +TRANSFORM_COL(fdct8x16, 8, 16) +TRANSFORM_COL(fdct8x32, 8, 32) +TRANSFORM_COL(fidentity4x4, 4, 4) +TRANSFORM_COL(fidentity4x8, 4, 8) +TRANSFORM_COL(fidentity4x16, 4, 16) +TRANSFORM_COL(fidentity8x4, 8, 4) +TRANSFORM_COL(fidentity8x8, 8, 8) +TRANSFORM_COL(fidentity8x16, 8, 16) +TRANSFORM_COL(fidentity8x32, 8, 32) + +#define TRANSFORM_ROW(name, tw, n) \ + static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \ + int stride, int cos_bit) { \ + int16x##tw##_t buf0[n]; \ + name##_neon(input, buf0, cos_bit); \ + store_buffer_s16_x##tw(buf0, output, stride, n); \ + } + +#define TRANSFORM_ROW_RECT(name, tw, n) \ + static void name##_row_rect_neon(const int16x##tw##_t *input, \ + int32_t *output, int stride, int cos_bit) { \ + int16x##tw##_t buf0[n]; \ + name##_neon(input, buf0, cos_bit); \ + store_rect_buffer_s16_x##tw(buf0, output, stride, n); \ + } + +TRANSFORM_ROW(fadst4x4, 4, 4) +TRANSFORM_ROW(fadst4x16, 4, 16) +TRANSFORM_ROW(fadst8x4, 8, 4) +TRANSFORM_ROW(fadst8x8, 8, 8) +TRANSFORM_ROW(fadst8x16, 8, 16) +TRANSFORM_ROW(fdct4x4, 4, 4) +TRANSFORM_ROW(fdct4x16, 4, 16) +TRANSFORM_ROW(fdct8x4, 8, 4) +TRANSFORM_ROW(fdct8x8, 8, 8) +TRANSFORM_ROW(fdct8x16, 8, 16) +TRANSFORM_ROW(fdct8x32, 8, 32) +TRANSFORM_ROW(fidentity4x4, 4, 4) +TRANSFORM_ROW(fidentity4x16, 4, 16) +TRANSFORM_ROW(fidentity8x4, 8, 4) +TRANSFORM_ROW(fidentity8x8, 8, 8) +TRANSFORM_ROW(fidentity8x16, 8, 16) +TRANSFORM_ROW(fidentity8x32, 8, 32) + +TRANSFORM_ROW_RECT(fadst4x8, 4, 8) +TRANSFORM_ROW_RECT(fadst8x4, 8, 4) +TRANSFORM_ROW_RECT(fadst8x8, 8, 8) +TRANSFORM_ROW_RECT(fadst8x16, 8, 16) +TRANSFORM_ROW_RECT(fdct4x8, 4, 8) +TRANSFORM_ROW_RECT(fdct8x4, 8, 4) +TRANSFORM_ROW_RECT(fdct8x8, 8, 8) +TRANSFORM_ROW_RECT(fdct8x16, 8, 16) +TRANSFORM_ROW_RECT(fdct8x32, 8, 32) +TRANSFORM_ROW_RECT(fidentity4x8, 4, 8) +TRANSFORM_ROW_RECT(fidentity8x4, 8, 4) +TRANSFORM_ROW_RECT(fidentity8x8, 8, 8) +TRANSFORM_ROW_RECT(fidentity8x16, 8, 16) +TRANSFORM_ROW_RECT(fidentity8x32, 8, 32) + +typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input, + int16x4_t *output, int cos_bit); +typedef void (*transform_1d_lbd_8_neon)(const 
int16x8_t *input, + int16x8_t *output, int cos_bit); + +typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input, + int16x4_t *output, int stride, + int cos_bit); +typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input, + int16x8_t *output, int stride, + int cos_bit); + +typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input, + int32_t *output, int stride, + int cos_bit); +typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input, + int32_t *output, int stride, + int cos_bit); + +static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = { + fdct4x8_col_neon, // DCT_DCT + fadst4x8_col_neon, // ADST_DCT + fdct4x8_col_neon, // DCT_ADST + fadst4x8_col_neon, // ADST_ADST + fadst4x8_col_neon, // FLIPADST_DCT + fdct4x8_col_neon, // DCT_FLIPADST + fadst4x8_col_neon, // FLIPADST_FLIPADST + fadst4x8_col_neon, // ADST_FLIPADST + fadst4x8_col_neon, // FLIPADST_ADST + fidentity4x8_col_neon, // IDTX + fdct4x8_col_neon, // V_DCT + fidentity4x8_col_neon, // H_DCT + fadst4x8_col_neon, // V_ADST + fidentity4x8_col_neon, // H_ADST + fadst4x8_col_neon, // V_FLIPADST + fidentity4x8_col_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = { + fdct8x4_row_neon, // DCT_DCT + fdct8x4_row_neon, // ADST_DCT + fadst8x4_row_neon, // DCT_ADST + fadst8x4_row_neon, // ADST_ADST + fdct8x4_row_neon, // FLIPADST_DCT + fadst8x4_row_neon, // DCT_FLIPADST + fadst8x4_row_neon, // FLIPADST_FLIPADST + fadst8x4_row_neon, // ADST_FLIPADST + fadst8x4_row_neon, // FLIPADST_ADST + fidentity8x4_row_neon, // IDTX + fidentity8x4_row_neon, // V_DCT + fdct8x4_row_neon, // H_DCT + fidentity8x4_row_neon, // V_ADST + fadst8x4_row_neon, // H_ADST + fidentity8x4_row_neon, // V_FLIPADST + fadst8x4_row_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = { + fdct8x4_row_rect_neon, // DCT_DCT + fdct8x4_row_rect_neon, // ADST_DCT + fadst8x4_row_rect_neon, // DCT_ADST + fadst8x4_row_rect_neon, // ADST_ADST + fdct8x4_row_rect_neon, // FLIPADST_DCT + fadst8x4_row_rect_neon, // DCT_FLIPADST + fadst8x4_row_rect_neon, // FLIPADST_FLIPADST + fadst8x4_row_rect_neon, // ADST_FLIPADST + fadst8x4_row_rect_neon, // FLIPADST_ADST + fidentity8x4_row_rect_neon, // IDTX + fidentity8x4_row_rect_neon, // V_DCT + fdct8x4_row_rect_neon, // H_DCT + fidentity8x4_row_rect_neon, // V_ADST + fadst8x4_row_rect_neon, // H_ADST + fidentity8x4_row_rect_neon, // V_FLIPADST + fadst8x4_row_rect_neon // H_FLIPADST +}; + +static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = { + fdct8x4_col_neon, // DCT_DCT + fadst8x4_col_neon, // ADST_DCT + fdct8x4_col_neon, // DCT_ADST + fadst8x4_col_neon, // ADST_ADST + fadst8x4_col_neon, // FLIPADST_DCT + fdct8x4_col_neon, // DCT_FLIPADST + fadst8x4_col_neon, // FLIPADST_FLIPADST + fadst8x4_col_neon, // ADST_FLIPADST + fadst8x4_col_neon, // FLIPADST_ADST + fidentity8x4_col_neon, // IDTX + fdct8x4_col_neon, // V_DCT + fidentity8x4_col_neon, // H_DCT + fadst8x4_col_neon, // V_ADST + fidentity8x4_col_neon, // H_ADST + fadst8x4_col_neon, // V_FLIPADST + fidentity8x4_col_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = { + fdct4x8_row_rect_neon, // DCT_DCT + fdct4x8_row_rect_neon, // ADST_DCT + fadst4x8_row_rect_neon, // DCT_ADST + fadst4x8_row_rect_neon, // ADST_ADST + fdct4x8_row_rect_neon, // FLIPADST_DCT + fadst4x8_row_rect_neon, // DCT_FLIPADST + fadst4x8_row_rect_neon, // FLIPADST_FLIPADST + fadst4x8_row_rect_neon, // ADST_FLIPADST + 
fadst4x8_row_rect_neon, // FLIPADST_ADST + fidentity4x8_row_rect_neon, // IDTX + fidentity4x8_row_rect_neon, // V_DCT + fdct4x8_row_rect_neon, // H_DCT + fidentity4x8_row_rect_neon, // V_ADST + fadst4x8_row_rect_neon, // H_ADST + fidentity4x8_row_rect_neon, // V_FLIPADST + fadst4x8_row_rect_neon // H_FLIPADST +}; + +static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_col_neon, // DCT_DCT + fadst8x8_col_neon, // ADST_DCT + fdct8x8_col_neon, // DCT_ADST + fadst8x8_col_neon, // ADST_ADST + fadst8x8_col_neon, // FLIPADST_DCT + fdct8x8_col_neon, // DCT_FLIPADST + fadst8x8_col_neon, // FLIPADST_FLIPADST + fadst8x8_col_neon, // ADST_FLIPADST + fadst8x8_col_neon, // FLIPADST_ADST + fidentity8x8_col_neon, // IDTX + fdct8x8_col_neon, // V_DCT + fidentity8x8_col_neon, // H_DCT + fadst8x8_col_neon, // V_ADST + fidentity8x8_col_neon, // H_ADST + fadst8x8_col_neon, // V_FLIPADST + fidentity8x8_col_neon, // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_row_neon, // DCT_DCT + fdct8x8_row_neon, // ADST_DCT + fadst8x8_row_neon, // DCT_ADST + fadst8x8_row_neon, // ADST_ADST + fdct8x8_row_neon, // FLIPADST_DCT + fadst8x8_row_neon, // DCT_FLIPADST + fadst8x8_row_neon, // FLIPADST_FLIPADST + fadst8x8_row_neon, // ADST_FLIPADST + fadst8x8_row_neon, // FLIPADST_ADST + fidentity8x8_row_neon, // IDTX + fidentity8x8_row_neon, // V_DCT + fdct8x8_row_neon, // H_DCT + fidentity8x8_row_neon, // V_ADST + fadst8x8_row_neon, // H_ADST + fidentity8x8_row_neon, // V_FLIPADST + fadst8x8_row_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = { + fdct8x8_row_rect_neon, // DCT_DCT + fdct8x8_row_rect_neon, // ADST_DCT + fadst8x8_row_rect_neon, // DCT_ADST + fadst8x8_row_rect_neon, // ADST_ADST + fdct8x8_row_rect_neon, // FLIPADST_DCT + fadst8x8_row_rect_neon, // DCT_FLIPADST + fadst8x8_row_rect_neon, // FLIPADST_FLIPADST + fadst8x8_row_rect_neon, // ADST_FLIPADST + fadst8x8_row_rect_neon, // FLIPADST_ADST + fidentity8x8_row_rect_neon, // IDTX + fidentity8x8_row_rect_neon, // V_DCT + fdct8x8_row_rect_neon, // H_DCT + fidentity8x8_row_rect_neon, // V_ADST + fadst8x8_row_rect_neon, // H_ADST + fidentity8x8_row_rect_neon, // V_FLIPADST + fadst8x8_row_rect_neon // H_FLIPADST +}; + +static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = { + fdct4x16_col_neon, // DCT_DCT + fadst4x16_col_neon, // ADST_DCT + fdct4x16_col_neon, // DCT_ADST + fadst4x16_col_neon, // ADST_ADST + fadst4x16_col_neon, // FLIPADST_DCT + fdct4x16_col_neon, // DCT_FLIPADST + fadst4x16_col_neon, // FLIPADST_FLIPADST + fadst4x16_col_neon, // ADST_FLIPADST + fadst4x16_col_neon, // FLIPADST_ADST + fidentity4x16_col_neon, // IDTX + fdct4x16_col_neon, // V_DCT + fidentity4x16_col_neon, // H_DCT + fadst4x16_col_neon, // V_ADST + fidentity4x16_col_neon, // H_ADST + fadst4x16_col_neon, // V_FLIPADST + fidentity4x16_col_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = { + fdct4x16_row_neon, // DCT_DCT + fdct4x16_row_neon, // ADST_DCT + fadst4x16_row_neon, // DCT_ADST + fadst4x16_row_neon, // ADST_ADST + fdct4x16_row_neon, // FLIPADST_DCT + fadst4x16_row_neon, // DCT_FLIPADST + fadst4x16_row_neon, // FLIPADST_FLIPADST + fadst4x16_row_neon, // ADST_FLIPADST + fadst4x16_row_neon, // FLIPADST_ADST + fidentity4x16_row_neon, // IDTX + fidentity4x16_row_neon, // V_DCT + fdct4x16_row_neon, // H_DCT + fidentity4x16_row_neon, // V_ADST + fadst4x16_row_neon, // H_ADST + fidentity4x16_row_neon, // 
V_FLIPADST + fadst4x16_row_neon // H_FLIPADST +}; + +static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_col_neon, // DCT_DCT + fadst8x16_col_neon, // ADST_DCT + fdct8x16_col_neon, // DCT_ADST + fadst8x16_col_neon, // ADST_ADST + fadst8x16_col_neon, // FLIPADST_DCT + fdct8x16_col_neon, // DCT_FLIPADST + fadst8x16_col_neon, // FLIPADST_FLIPADST + fadst8x16_col_neon, // ADST_FLIPADST + fadst8x16_col_neon, // FLIPADST_ADST + fidentity8x16_col_neon, // IDTX + fdct8x16_col_neon, // V_DCT + fidentity8x16_col_neon, // H_DCT + fadst8x16_col_neon, // V_ADST + fidentity8x16_col_neon, // H_ADST + fadst8x16_col_neon, // V_FLIPADST + fidentity8x16_col_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = { + fdct8x16_row_neon, // DCT_DCT + fdct8x16_row_neon, // ADST_DCT + fadst8x16_row_neon, // DCT_ADST + fadst8x16_row_neon, // ADST_ADST + fdct8x16_row_neon, // FLIPADST_DCT + fadst8x16_row_neon, // DCT_FLIPADST + fadst8x16_row_neon, // FLIPADST_FLIPADST + fadst8x16_row_neon, // ADST_FLIPADST + fadst8x16_row_neon, // FLIPADST_ADST + fidentity8x16_row_neon, // IDTX + fidentity8x16_row_neon, // V_DCT + fdct8x16_row_neon, // H_DCT + fidentity8x16_row_neon, // V_ADST + fadst8x16_row_neon, // H_ADST + fidentity8x16_row_neon, // V_FLIPADST + fadst8x16_row_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = { + fdct8x16_row_rect_neon, // DCT_DCT + fdct8x16_row_rect_neon, // ADST_DCT + fadst8x16_row_rect_neon, // DCT_ADST + fadst8x16_row_rect_neon, // ADST_ADST + fdct8x16_row_rect_neon, // FLIPADST_DCT + fadst8x16_row_rect_neon, // DCT_FLIPADST + fadst8x16_row_rect_neon, // FLIPADST_FLIPADST + fadst8x16_row_rect_neon, // ADST_FLIPADST + fadst8x16_row_rect_neon, // FLIPADST_ADST + fidentity8x16_row_rect_neon, // IDTX + fidentity8x16_row_rect_neon, // V_DCT + fdct8x16_row_rect_neon, // H_DCT + fidentity8x16_row_rect_neon, // V_ADST + fadst8x16_row_rect_neon, // H_ADST + fidentity8x16_row_rect_neon, // V_FLIPADST + fadst8x16_row_rect_neon // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = { + fdct8x32_row_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_row_neon, // IDTX + fidentity8x32_row_neon, // V_DCT + fdct8x32_row_neon, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = { + fdct8x32_row_rect_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_row_rect_neon, // IDTX + fidentity8x32_row_rect_neon, // V_DCT + fdct8x32_row_rect_neon, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = { + fdct8x32_col_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_col_neon, // IDTX + fdct8x32_col_neon, // V_DCT + fidentity8x32_col_neon, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static 
void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + + int16x4_t buf0[4], buf1[4]; + switch (tx_type) { + case DCT_DCT: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case ADST_DCT: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case DCT_ADST: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case ADST_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case FLIPADST_DCT: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case DCT_FLIPADST: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case FLIPADST_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case ADST_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case FLIPADST_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case IDTX: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case V_DCT: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_DCT: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case V_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_ADST: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case V_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_FLIPADST: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + } +} + +static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x4_t buf0[8]; + int16x8_t buf1[8]; + const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + col_txfm(input, buf0, stride, 13); + shift_right_1_round_s16_x4(buf0, buf0, 8); + transpose_arrays_s16_4x8(buf0, buf1); + + if (lr_flip) { + 
int16x8_t buf2[8]; + flip_buf_8_neon(buf1, buf2, 4); + row_txfm(buf2, output, 8, 13); + } else { + row_txfm(buf1, output, 8, 13); + } +} + +static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x4_t buf0[16]; + int16x8_t buf1[16]; + const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + col_txfm(input, buf0, stride, 13); + shift_right_1_round_s16_x4(buf0, buf0, 16); + transpose_arrays_s16_4x8(buf0, buf1); + transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + if (lr_flip) { + int16x8_t buf2[16]; + flip_buf_8_neon(buf1 + 8 * i, buf2, 4); + row_txfm(buf2, output + 8 * i, 16, 12); + } else { + int16x8_t *buf = buf1 + 8 * i; + row_txfm(buf, output + 8 * i, 16, 12); + } + } +} + +static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[8]; + int16x4_t buf1[8]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type]; + const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + col_txfm(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 4); + transpose_arrays_s16_8x4(buf0, buf1); + + if (lr_flip) { + int16x4_t buf2[8]; + flip_buf_4_neon(buf1, buf2, 8); + row_txfm(buf2, output, 4, 13); + } else { + row_txfm(buf1, output, 4, 13); + } +} + +static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + + int16x8_t buf0[8], buf1[8]; + + switch (tx_type) { + case DCT_DCT: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case ADST_DCT: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case DCT_ADST: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case ADST_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case FLIPADST_DCT: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case DCT_FLIPADST: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case FLIPADST_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case 
ADST_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case FLIPADST_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case IDTX: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case V_DCT: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_DCT: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case V_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_ADST: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case V_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_FLIPADST: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + } +} + +static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[16], buf1[16]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + col_txfm(input, buf0, stride, 13); + shift_right_2_round_s16_x8(buf0, buf0, 16); + transpose_arrays_s16_8x8(buf0, buf1); + transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + if (lr_flip) { + flip_buf_8_neon(buf1 + 8 * i, buf0, 8); + row_txfm(buf0, output + 8 * i, 16, 13); + } else { + int16x8_t *buf = buf1 + 8 * i; + row_txfm(buf, output + 8 * i, 16, 13); + } + } +} + +static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[32], buf1[32]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 32); + col_txfm(input, buf0, stride, 12); + shift_right_2_round_s16_x8(buf0, buf0, 32); + transpose_arrays_s16_8x8(buf0, buf1); + transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8); + transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16); + transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24); + + for (int i = 0; i < 4; i++) { + if (lr_flip) { + flip_buf_8_neon(buf1 + 8 * i, buf0, 8); + 
row_txfm(buf0, output + 8 * i, 32, 12); + } else { + int16x8_t *buf = buf1 + 8 * i; + row_txfm(buf, output + 8 * i, 32, 12); + } + } +} + +static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[16]; + int16x4_t buf1[16]; + int16x4_t buf2[16]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type]; + const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + for (int i = 0; i < 2; i++) { + col_txfm(input + 8 * i, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 4); + transpose_arrays_s16_8x4(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + flip_buf_4_neon(buf1, buf2, 16); + row_txfm(buf2, output, 4, 13); + } else { + row_txfm(buf1, output, 4, 13); + } +} + +static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[16], buf1[16]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + for (int i = 0; i < 2; i++) { + col_txfm(input + 8 * i, buf0, stride, 13); + shift_right_2_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + flip_buf_8_neon(buf1, buf0, 16); + row_txfm(buf0, output, 8, 13); + } else { + row_txfm(buf1, output, 8, 13); + } +} + +static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[16], buf1[32]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + for (int i = 0; i < 2; i++) { + col_txfm(input + 8 * i, buf0, stride, 13); + shift_right_2_round_s16_x8(buf0, buf0, 16); + transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i); + } + + for (int i = 0; i < 2; i++) { + if (lr_flip) { + flip_buf_8_neon(buf1 + 16 * i, buf0, 16); + row_txfm(buf0, output + 8 * i, 16, 12); + } else { + int16x8_t *buf = buf1 + 16 * i; + row_txfm(buf, output + 8 * i, 16, 12); + } + } +} + +static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[32], buf1[64]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type]; + + if (col_txfm == NULL || row_txfm == NULL) { + av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); + return; + } + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 32); + for (int i = 0; i < 2; i++) { + col_txfm(input + 8 * i, buf0, stride, 12); + shift_right_4_round_s16_x8(buf0, buf0, 32); + transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 
16 + 8 * i); + } + + for (int i = 0; i < 4; i++) { + if (lr_flip) { + flip_buf_8_neon(buf1 + 16 * i, buf0, 16); + row_txfm(buf0, output + 8 * i, 32, 13); + } else { + int16x8_t *buf = buf1 + 16 * i; + row_txfm(buf, output + 8 * i, 32, 13); + } + } +} + +static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[32], buf1[32]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm == NULL || row_txfm == NULL) { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + return; + } + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + for (int i = 0; i < 4; i++) { + col_txfm(input + 8 * i, buf0, stride, 13); + shift_right_2_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i); + } + + if (lr_flip) { + flip_buf_8_neon(buf1, buf0, 32); + row_txfm(buf0, output, 8, 12); + } else { + row_txfm(buf1, output, 8, 12); + } +} + +static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[32], buf1[64]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type]; + + if (col_txfm == NULL || row_txfm == NULL) { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + return; + } + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + for (int i = 0; i < 4; i++) { + col_txfm(input + 8 * i, buf0, stride, 13); + shift_right_4_round_s16_x8(buf0, buf0, 16); + transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i); + } + + for (int i = 0; i < 2; i++) { + if (lr_flip) { + flip_buf_8_neon(buf1 + 32 * i, buf0, 32); + row_txfm(buf0, output + 8 * i, 16, 13); + } else { + int16x8_t *buf = buf1 + 32 * i; + row_txfm(buf, output + 8 * i, 16, 13); + } + } +} + +static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[32], buf1[128]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; + const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm == NULL || row_txfm == NULL) { + av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); + return; + } + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 32); + for (int i = 0; i < 4; i++) { + col_txfm(input + 8 * i, buf0, stride, 12); + shift_right_4_round_s16_x8(buf0, buf0, 32); + transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i); + transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i); + } + + for (int i = 0; i < 4; i++) { + if (lr_flip) { + flip_buf_8_neon(buf1 + 32 * i, buf0, 32); + row_txfm(buf0, output + 8 * i, 32, 12); + } else { + int16x8_t *buf = buf1 + 32 * i; + row_txfm(buf, output + 8 * i, 32, 12); + } + } +} + +static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type 
== DCT_DCT); + int16x8_t buf0[64], buf1[128]; + const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon; + const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon; + + for (int i = 0; i < 8; i++) { + load_buffer_s16_x8(input + 8 * i, stride, buf0, 16); + shift_left_2_s16_x8(buf0, buf0, 16); + col_txfm(buf0, buf0, 13); + shift_right_4_round_s16_x8(buf0, buf0, 16); + for (int j = 0; j < 2; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); + } + } + + for (int i = 0; i < 2; i++) { + int16x8_t *buf = buf1 + 64 * i; + row_txfm(buf, buf, 12); + store_buffer_s16_x8(buf, output + 8 * i, 16, 32); + } + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + int16x8_t buf0[64], buf1[128]; + const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; + const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon; + + for (int i = 0; i < 2; i++) { + load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); + col_txfm(buf0, buf0, 13); + shift_right_2_round_s16_x8(buf0, buf0, 64); + for (int j = 0; j < 8; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i); + } + } + + for (int i = 0; i < 4; i++) { + int16x8_t *buf = buf1 + 16 * i; + row_txfm(buf, buf, 12); + store_buffer_s16_x8(buf, output + 8 * i, 32, 16); + } +} + +static void fdct32_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + + int32x4_t buf0[32]; + int32x4_t buf1[32]; + + // stage 1 + butterfly_dct_pre_s32_x4(input, buf1, 32); + + // stage 2 + butterfly_dct_pre_s32_x4(buf1, buf0, 16); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27], + &buf0[20]); + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26], + &buf0[21]); + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25], + &buf0[22]); + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24], + &buf0[23]); + buf0[28] = buf1[28]; + 
buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + butterfly_dct_pre_s32_x4(buf0, buf1, 8); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13], + &buf1[10]); + butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12], + &buf1[11]); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16); + + // stage 4 + butterfly_dct_pre_s32_x4(buf1, buf0, 4); + buf0[4] = buf1[4]; + butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]); + buf0[7] = buf1[7]; + butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29], + &buf0[18]); + butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28], + &buf0[19]); + butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27], + &buf0[20]); + butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26], + &buf0[21]); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]); + butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]); + butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4); + buf1[8] = buf0[8]; + butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14], + &buf1[9]); + butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13], + &buf1[10]); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8); + butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8); + + // stage 6 + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]); + butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]); + butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4); + butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4); + buf0[16] = buf1[16]; + butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30], + &buf0[17]); + butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29], + &buf0[18]); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26], + &buf0[21]); + butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25], + &buf0[22]); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8], + &buf1[15]); + butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9], + &buf1[14]); + butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10], + &buf1[13]); + butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11], + &buf1[12]); + butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4); + butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4); + butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4); + butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 
28, 4); + + // stage 8 + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16], + &buf0[31]); + butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17], + &buf0[30]); + butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18], + &buf0[29]); + butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19], + &buf0[28]); + butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20], + &buf0[27]); + butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21], + &buf0[26]); + butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22], + &buf0[25]); + butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23], + &buf0[24]); + + // stage 9 + output[0] = buf0[0]; + output[1] = buf0[16]; + output[2] = buf0[8]; + output[3] = buf0[24]; + output[4] = buf0[4]; + output[5] = buf0[20]; + output[6] = buf0[12]; + output[7] = buf0[28]; + output[8] = buf0[2]; + output[9] = buf0[18]; + output[10] = buf0[10]; + output[11] = buf0[26]; + output[12] = buf0[6]; + output[13] = buf0[22]; + output[14] = buf0[14]; + output[15] = buf0[30]; + output[16] = buf0[1]; + output[17] = buf0[17]; + output[18] = buf0[9]; + output[19] = buf0[25]; + output[20] = buf0[5]; + output[21] = buf0[21]; + output[22] = buf0[13]; + output[23] = buf0[29]; + output[24] = buf0[3]; + output[25] = buf0[19]; + output[26] = buf0[11]; + output[27] = buf0[27]; + output[28] = buf0[7]; + output[29] = buf0[23]; + output[30] = buf0[15]; + output[31] = buf0[31]; +} + +static void fdct64_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { + const int16_t *cospi = cospi_arr_q13(cos_bit); + + const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); + const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); + const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); + const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); + const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); + const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); + const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); + const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); + const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]); + const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]); + const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]); + const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]); + const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]); + const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]); + const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]); + const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]); + + const int16x4_t cospi32 = vget_low_s16(cospi32_16); + const int16x4_t cospi16 = vget_high_s16(cospi32_16); + const int16x4_t cospi8 = vget_low_s16(cospi8_24); + const int16x4_t cospi24 = vget_high_s16(cospi8_24); + const int16x4_t cospi4 = vget_low_s16(cospi4_12); + const int16x4_t cospi12 = vget_high_s16(cospi4_12); + const int16x4_t cospi20 = vget_low_s16(cospi20_28); + const int16x4_t cospi28 = vget_high_s16(cospi20_28); + const int16x4_t cospi2 = vget_low_s16(cospi2_6); + const int16x4_t cospi6 = vget_high_s16(cospi2_6); + const int16x4_t cospi10 = vget_low_s16(cospi10_14); + const int16x4_t cospi14 = 
vget_high_s16(cospi10_14); + const int16x4_t cospi18 = vget_low_s16(cospi18_22); + const int16x4_t cospi22 = vget_high_s16(cospi18_22); + const int16x4_t cospi26 = vget_low_s16(cospi26_30); + const int16x4_t cospi30 = vget_high_s16(cospi26_30); + const int16x4_t cospi1 = vget_low_s16(cospi1_3); + const int16x4_t cospi3 = vget_high_s16(cospi1_3); + const int16x4_t cospi5 = vget_low_s16(cospi5_7); + const int16x4_t cospi7 = vget_high_s16(cospi5_7); + const int16x4_t cospi9 = vget_low_s16(cospi9_11); + const int16x4_t cospi11 = vget_high_s16(cospi9_11); + const int16x4_t cospi13 = vget_low_s16(cospi13_15); + const int16x4_t cospi15 = vget_high_s16(cospi13_15); + const int16x4_t cospi17 = vget_low_s16(cospi17_19); + const int16x4_t cospi19 = vget_high_s16(cospi17_19); + const int16x4_t cospi21 = vget_low_s16(cospi21_23); + const int16x4_t cospi23 = vget_high_s16(cospi21_23); + const int16x4_t cospi25 = vget_low_s16(cospi25_27); + const int16x4_t cospi27 = vget_high_s16(cospi25_27); + const int16x4_t cospi29 = vget_low_s16(cospi29_31); + const int16x4_t cospi31 = vget_high_s16(cospi29_31); + + // stage 1 + int32x4_t x1[64]; + butterfly_dct_pre_s32_x4(input, x1, 64); + + // stage 2 + int32x4_t x2[64]; + butterfly_dct_pre_s32_x4(x1, x2, 32); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]); + butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]); + + // stage 3 + int32x4_t x3[64]; + butterfly_dct_pre_s32_x4(x2, x3, 16); + butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]); + butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]); + butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]); + butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]); + butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32); + + // stage 4 + int32x4_t x4[64]; + butterfly_dct_pre_s32_x4(x3, x4, 8); + butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]); + butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]); + butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16); + butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]); + butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]); + butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]); + butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]); + butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]); + butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]); + butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]); + butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]); + + // stage 5 + int32x4_t x5[64]; + butterfly_dct_pre_s32_x4(x4, x5, 4); + butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]); + butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8); + butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]); + butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], 
x4[19], &x5[28], &x5[19]); + butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]); + butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]); + butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16); + butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16); + + // stage 6 + int32x4_t x6[64]; + butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]); + butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]); + butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4); + butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]); + butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]); + butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8); + butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8); + butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]); + butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]); + butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]); + butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]); + butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]); + butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]); + butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]); + butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]); + + // stage 7 + int32x4_t x7[64]; + butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]); + butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]); + butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4); + butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4); + butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]); + butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]); + butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]); + butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]); + butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8); + butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8); + butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8); + butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8); + + // stage 8 + int32x4_t x8[64]; + butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]); + butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]); + butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]); + butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]); + butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4); + butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4); + butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4); + butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4); + butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]); + butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]); + butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]); + butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]); + butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]); + butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]); + butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]); + butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]); + + // stage 9 + int32x4_t x9[64]; + butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], 
&x9[31]); + butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]); + butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]); + butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]); + butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]); + butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]); + butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]); + butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]); + butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4); + butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4); + butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4); + butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4); + butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4); + butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4); + butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4); + butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4); + + // stage 10 + int32x4_t x10[64]; + butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]); + butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]); + butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]); + butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]); + butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]); + butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]); + butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]); + butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]); + butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]); + butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]); + butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]); + butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]); + butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]); + butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]); + butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]); + butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]); + + // stage 11, only store into the low 32 output indices. 
+ output[0] = x6[0]; + output[1] = x10[32]; + output[2] = x9[16]; + output[3] = x10[48]; + output[4] = x8[8]; + output[5] = x10[40]; + output[6] = x9[24]; + output[7] = x10[56]; + output[8] = x7[4]; + output[9] = x10[36]; + output[10] = x9[20]; + output[11] = x10[52]; + output[12] = x8[12]; + output[13] = x10[44]; + output[14] = x9[28]; + output[15] = x10[60]; + output[16] = x6[2]; + output[17] = x10[34]; + output[18] = x9[18]; + output[19] = x10[50]; + output[20] = x8[10]; + output[21] = x10[42]; + output[22] = x9[26]; + output[23] = x10[58]; + output[24] = x7[6]; + output[25] = x10[38]; + output[26] = x9[22]; + output[27] = x10[54]; + output[28] = x8[14]; + output[29] = x10[46]; + output[30] = x9[30]; + output[31] = x10[62]; +} + +static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + int16x8_t buf0[64], buf1[512]; + const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; + + for (int i = 0; i < 8; i++) { + load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); + col_txfm(buf0, buf0, 13); + shift_right_2_round_s16_x8(buf0, buf0, 64); + for (int j = 0; j < 4; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); + } + } + for (int i = 0; i < 4; i++) { + int32x4_t bufA[64]; + int32x4_t bufB[64]; + int16x8_t *buf = buf1 + 64 * i; + for (int j = 0; j < 64; ++j) { + bufA[j] = vmovl_s16(vget_low_s16(buf[j])); + bufB[j] = vmovl_s16(vget_high_s16(buf[j])); + } + fdct64_neon(bufA, bufA, 10); + fdct64_neon(bufB, bufB, 10); + shift_right_2_round_s32_x4(bufA, bufA, 32); + shift_right_2_round_s32_x4(bufB, bufB, 32); + store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + int16x8_t buf0[64], buf1[256]; + const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; + + for (int i = 0; i < 8; i++) { + col_txfm(input + 8 * i, buf0, stride, 12); + shift_right_4_round_s16_x8(buf0, buf0, 32); + for (int j = 0; j < 4; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < 4; i++) { + int32x4_t bufA[64]; + int32x4_t bufB[64]; + int16x8_t *buf = buf1 + 64 * i; + for (int j = 0; j < 64; ++j) { + bufA[j] = vmovl_s16(vget_low_s16(buf[j])); + bufB[j] = vmovl_s16(vget_high_s16(buf[j])); + } + fdct64_neon(bufA, bufA, 11); + fdct64_neon(bufB, bufB, 11); + shift_right_2_round_s32_x4(bufA, bufA, 32); + shift_right_2_round_s32_x4(bufB, bufB, 32); + round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); + round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32); + store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + int16x8_t buf0[64], buf1[256]; + const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; + + for (int i = 0; i < 4; i++) { + load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); + col_txfm(buf0, buf0, 13); + shift_right_2_round_s16_x8(buf0, buf0, 64); + for (int j = 0; j < 4; ++j) { + transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i); + } + } + + for (int i = 0; i < 4; i++) { + int32x4_t bufA[32]; + int32x4_t bufB[32]; + int16x8_t *buf = buf1 + 32 * i; + for (int j = 0; j < 32; ++j) { + bufA[j] = 
vmovl_s16(vget_low_s16(buf[j])); + bufB[j] = vmovl_s16(vget_high_s16(buf[j])); + } + fdct32_neon(bufA, bufA, 11); + fdct32_neon(bufB, bufB, 11); + shift_right_2_round_s32_x4(bufA, bufA, 32); + shift_right_2_round_s32_x4(bufB, bufB, 32); + round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); + round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32); + store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = { + lowbd_fwd_txfm2d_4x4_neon, // 4x4 transform + lowbd_fwd_txfm2d_8x8_neon, // 8x8 transform + lowbd_fwd_txfm2d_16x16_neon, // 16x16 transform + lowbd_fwd_txfm2d_32x32_neon, // 32x32 transform + lowbd_fwd_txfm2d_64x64_neon, // 64x64 transform + lowbd_fwd_txfm2d_4x8_neon, // 4x8 transform + lowbd_fwd_txfm2d_8x4_neon, // 8x4 transform + lowbd_fwd_txfm2d_8x16_neon, // 8x16 transform + lowbd_fwd_txfm2d_16x8_neon, // 16x8 transform + lowbd_fwd_txfm2d_16x32_neon, // 16x32 transform + lowbd_fwd_txfm2d_32x16_neon, // 32x16 transform + lowbd_fwd_txfm2d_32x64_neon, // 32x64 transform + lowbd_fwd_txfm2d_64x32_neon, // 64x32 transform + lowbd_fwd_txfm2d_4x16_neon, // 4x16 transform + lowbd_fwd_txfm2d_16x4_neon, // 16x4 transform + lowbd_fwd_txfm2d_8x32_neon, // 8x32 transform + lowbd_fwd_txfm2d_32x8_neon, // 32x8 transform + lowbd_fwd_txfm2d_16x64_neon, // 16x64 transform + lowbd_fwd_txfm2d_64x16_neon, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size]; + if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c new file mode 100644 index 0000000000..11d3def16b --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_config.h" + +#include "aom_dsp/arm/mem_neon.h" + +#include "av1/common/quant_common.h" +#include "av1/encoder/av1_quantize.h" + +static INLINE uint16x4_t quantize_4(const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, + int32x4_t v_round_s32, int log_scale) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_log_scale = vdupq_n_s32(log_scale); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) + const int32x4_t v_abs_coeff_scaled = + vshlq_s32(v_abs_coeff, vdupq_n_s32(1 + log_scale)); + const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32); + // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0 + const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32), + vreinterpretq_s32_u32(v_mask)); + // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale)); + const int32x4_t v_abs_qcoeff = + vqdmulhq_s32(vshlq_s32(v_tmp, v_log_scale), v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + // vshlq_s32 will shift right if shift value is negative. + const int32x4_t v_abs_dqcoeff = + vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale)); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Used to find eob.
+ const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0)); + return vmovn_u32(nz_qcoeff_mask); +} + +static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, + int16x8_t v_eobmax, + uint16x8_t v_mask) { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); + const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0)); + return vmaxq_s16(v_eobmax, v_nz_iscan); +} + +static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +#if AOM_ARCH_AARCH64 + return (uint16_t)vmaxvq_s16(v_eobmax); +#else + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + return (uint16_t)vget_lane_s16(v_eobmax_final, 0); +#endif +} + +void av1_highbd_quantize_fp_neon( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + (void)scan; + (void)zbin_ptr; + (void)quant_shift_ptr; + + const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_zero = vdup_n_s16(0); + const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero); + const int16x4_t v_round_no_scale = vld1_s16(round_ptr); + const int16x4_t v_round_log_scale = + vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); + const int16x4_t v_round = + vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + + // DC and first 3 AC + v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, + v_dequant_s32, v_round_s32, log_scale); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32, log_scale); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + count -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, + v_dequant_s32, v_round_s32, log_scale); + v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32, log_scale); + // Find the max lane eob for 8 coeffs. 
+ v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + count -= 8; + } while (count); + + *eob_ptr = get_max_eob(v_eobmax); +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c new file mode 100644 index 0000000000..d13cc65ae0 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "aom_dsp/arm/sum_neon.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +static int32x4_t k_means_multiply_add_neon(const int16x8_t a) { + const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a)); + const int32x4_t h = vmull_s16(vget_high_s16(a), vget_high_s16(a)); +#if AOM_ARCH_AARCH64 + return vpaddq_s32(l, h); +#else + const int32x2_t dl = vpadd_s32(vget_low_s32(l), vget_high_s32(l)); + const int32x2_t dh = vpadd_s32(vget_low_s32(h), vget_high_s32(h)); + return vcombine_s32(dl, dh); +#endif +} + +void av1_calc_indices_dim1_neon(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *total_dist, int n, + int k) { + int64x2_t sum = vdupq_n_s64(0); + int16x8_t cents[PALETTE_MAX_SIZE]; + for (int j = 0; j < k; ++j) { + cents[j] = vdupq_n_s16(centroids[j]); + } + + for (int i = 0; i < n; i += 8) { + const int16x8_t in = vld1q_s16(data); + uint16x8_t ind = vdupq_n_u16(0); + // Compute the distance to the first centroid. + int16x8_t dist_min = vabdq_s16(in, cents[0]); + + for (int j = 1; j < k; ++j) { + // Compute the distance to the centroid. + const int16x8_t dist = vabdq_s16(in, cents[j]); + // Compare to the minimal one. + const uint16x8_t cmp = vcgtq_s16(dist_min, dist); + dist_min = vminq_s16(dist_min, dist); + const uint16x8_t ind1 = vdupq_n_u16(j); + ind = vbslq_u16(cmp, ind1, ind); + } + if (total_dist) { + // Square, convert to 32 bit and add together. + const int32x4_t l = + vmull_s16(vget_low_s16(dist_min), vget_low_s16(dist_min)); + const int32x4_t sum32_tmp = + vmlal_s16(l, vget_high_s16(dist_min), vget_high_s16(dist_min)); + // Pairwise sum, convert to 64 bit and add to sum. + sum = vpadalq_s32(sum, sum32_tmp); + } + vst1_u8(indices, vmovn_u16(ind)); + indices += 8; + data += 8; + } + if (total_dist) { + *total_dist = horizontal_add_s64x2(sum); + } +} + +void av1_calc_indices_dim2_neon(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *total_dist, int n, + int k) { + int64x2_t sum = vdupq_n_s64(0); + uint32x4_t ind[2]; + int16x8_t cents[PALETTE_MAX_SIZE]; + for (int j = 0; j < k; ++j) { + const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1]; + const int16_t cxcy[8] = { cx, cy, cx, cy, cx, cy, cx, cy }; + cents[j] = vld1q_s16(cxcy); + } + + for (int i = 0; i < n; i += 8) { + for (int l = 0; l < 2; ++l) { + const int16x8_t in = vld1q_s16(data); + ind[l] = vdupq_n_u32(0); + // Compute the distance to the first centroid. + int16x8_t d1 = vsubq_s16(in, cents[0]); + int32x4_t dist_min = k_means_multiply_add_neon(d1); + + for (int j = 1; j < k; ++j) { + // Compute the distance to the centroid.
+ d1 = vsubq_s16(in, cents[j]); + const int32x4_t dist = k_means_multiply_add_neon(d1); + // Compare to the minimal one. + const uint32x4_t cmp = vcgtq_s32(dist_min, dist); + dist_min = vminq_s32(dist_min, dist); + const uint32x4_t ind1 = vdupq_n_u32(j); + ind[l] = vbslq_u32(cmp, ind1, ind[l]); + } + if (total_dist) { + // Pairwise sum, convert to 64 bit and add to sum. + sum = vpadalq_s32(sum, dist_min); + } + data += 8; + } + // Cast to 8 bit and store. + vst1_u8(indices, + vmovn_u16(vcombine_u16(vmovn_u32(ind[0]), vmovn_u32(ind[1])))); + indices += 8; + } + if (total_dist) { + *total_dist = horizontal_add_s64x2(sum); + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c new file mode 100644 index 0000000000..18cd0ce4c0 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/reconinter.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/av1_temporal_denoiser.h" + +// Compute the sum of all pixel differences of this MB. +static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { +#if AOM_ARCH_AARCH64 + return vaddlvq_s8(v_sum_diff_total); +#else + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); + const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210), + vget_low_s64(fedcba98_76543210)); + const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); + return sum_diff; +#endif +} + +// Denoise a 16x1 vector. +static INLINE int8x16_t denoiser_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, + const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, + const uint8x16_t v_delta_level_1_and_2, + const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) { + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. 
*/ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +static INLINE int8x16_t denoiser_adjust_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +// Denoise 8x8 and 8x16 blocks. +static int av1_denoiser_8xN_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude, + int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 
4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_height = block_size_high[bs] >> 1; + + int8x16_t v_sum_diff_total = vdupq_n_s8(0); + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + v_sum_diff_total = denoiser_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], + v_level1_threshold, v_level2_threshold, v_level3_threshold, + v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + v_sum_diff_total = denoiser_adjust_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = + vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = + vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + return FILTER_BLOCK; +} + +// Denoise 16x16, to 128x128 blocks. 
+static int av1_denoiser_NxM_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_width = block_size_wide[bs]; + const int b_height = block_size_high[bs]; + const int b_width_shift4 = b_width >> 4; + + int8x16_t v_sum_diff_total[8][8]; + int r, c, sum_diff = 0; + + for (r = 0; r < 8; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r] = vdupq_n_s8(0); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon( + sig, mc_running_avg_y, running_avg_y, v_level1_threshold, + v_level2_threshold, v_level3_threshold, v_level1_adjustment, + v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vdupq_n_u8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = + denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y, + k_delta, v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int av1_denoiser_filter_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. 
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return av1_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return av1_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } + return COPY_BLOCK; +} diff --git a/third_party/aom/av1/encoder/arm/neon/cnn_neon.c b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c new file mode 100644 index 0000000000..8e686260d0 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c @@ -0,0 +1,1144 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/av1_common_int.h" +#include "av1/encoder/cnn.h" +#include "av1/encoder/partition_cnn_weights.h" + +// The CNN weights used in av1_cnn_convolve_no_maxpool_padding_valid are +// declared (av1_intra_mode_cnn_partition_cnn_layer_[01234]_kernel) in +// partition_cnn_weights.h. However, to enable linear memory access, rearrange +// the weight tables here. 
+static const float weights_layer_1[] = { + 0.228403f, 0.031690f, -0.251710f, -0.046230f, 0.413294f, -0.236732f, + -0.038291f, 0.210766f, 0.427196f, -0.384319f, -0.439463f, 0.366015f, + 0.112263f, -0.144168f, -0.075017f, 0.119629f, 0.325200f, -0.678246f, + -0.370826f, -0.341362f, -0.503392f, 0.400884f, 0.465214f, -0.360847f, + 0.187100f, -0.190757f, -0.131906f, 0.121492f, -0.303556f, -0.007658f, + 0.380077f, -0.066394f, -0.016043f, -1.490730f, -0.120682f, 0.132062f, + 0.086185f, -0.042766f, -0.087069f, 0.029426f, 0.309583f, -0.029985f, + -0.297429f, -0.018139f, -0.688828f, 0.756607f, 0.706410f, -0.696826f, + -0.087793f, -0.023304f, -0.012332f, -0.018043f, -0.410268f, 0.352143f, + 0.391284f, -0.363178f, -0.295034f, 0.160246f, -0.149446f, 0.260145f, + -0.252249f, 0.190826f, 0.251206f, -0.270796f, -0.979219f, 0.884880f, + 0.962057f, -0.847601f, -0.011053f, 0.118765f, -0.028428f, -0.020138f, + 0.400274f, -0.382845f, -0.462766f, 0.390654f, 0.361223f, -0.320068f, + -0.372084f, 0.313196f, 0.241933f, -0.416614f, -0.008722f, -0.255078f, + 0.078730f, -0.381935f, -0.204577f, 0.159768f, 0.071853f, -0.126294f, + -0.036186f, -0.007900f, 0.380071f, -0.298882f, 0.387941f, -0.267350f, + -0.586802f, 0.477785f, -0.000013f, 0.197296f, -0.079154f, -0.005811f, + -0.044300f, -0.021192f, -0.020879f, -0.005265f, 0.082277f, -0.139132f, + -0.239237f, 0.440234f, -0.542342f, 0.378360f, -0.070974f, 0.272702f, + -0.278939f, -0.044948f, -0.134197f, -0.007172f, -0.353628f, -0.128091f, + 0.357458f, -0.037614f, -0.144983f, 0.220623f, -0.003394f, -0.070166f, + 0.200370f, -0.166037f, 0.224448f, -0.012990f, -0.098853f, 0.008613f, + -0.017669f, 0.070641f, 0.174530f, -0.119822f, -0.065096f, 0.118487f, + -0.024764f, -0.050466f, 0.066631f, -0.075896f, -0.062363f, 0.212604f, + -0.377322f, 0.306306f, -0.399733f, 0.238624f, 0.233571f, -0.344080f, + 0.462491f, -0.565210f, -0.035074f, -0.010459f, 0.084382f, 0.052294f, + 0.065714f, 0.013716f, 0.135036f, 0.000588f, 0.181079f, -0.566344f, + 0.395561f, -0.398509f, 0.450017f, -1.462710f, 1.138280f, -0.447774f, + 0.247936f, -0.417067f, 0.165997f, -0.458632f, -0.018527f, 0.308461f, + 0.541266f, 0.162257f, 0.601786f, -1.275840f, -0.373404f, -0.589747f, + 0.026539f, -0.219327f, 0.142972f, -0.018496f, 0.075204f, -0.775190f, + 0.237307f, -0.348252f, 0.117792f, -0.094332f, 0.363101f, -0.065025f, + 0.816662f, 0.590110f, 0.752202f, -0.308599f, 0.258337f, -0.842085f, + 0.695788f, -0.205615f, 0.093930f, -0.392536f, 0.463093f, -0.432456f, + 0.041660f, -0.827264f, 0.309128f, -0.354658f, 0.451957f, -1.406640f, + 0.773192f, -0.892943f, 0.134856f, -0.467808f, 0.306003f, -0.226560f, + 0.086865f, -0.104102f, 0.148098f, -0.082658f, 0.316655f, -1.028310f, + 0.741566f, -0.345326f, 0.052379f, -0.275613f, 0.191765f, -0.162391f, + 0.000976f, 0.093061f, 0.068649f, 0.033582f, 0.239727f, -0.647769f, + 0.218493f, -0.397120f, 0.268229f, -0.303424f, 0.185393f, -0.314189f, + 0.101728f, -0.163083f, -0.084989f, 0.136783f, -0.264346f, 0.465914f, + 0.220395f, -0.252968f, -0.326661f, 0.271483f, 0.374717f, -0.311570f, + -0.082119f, 0.020870f, 0.091975f, -0.030582f, -0.487148f, 0.198912f, + 0.024554f, -0.749363f, -0.102267f, 0.097787f, 0.141459f, -0.110706f, + 0.079467f, -0.082570f, -0.347567f, 0.341043f, -0.137871f, 0.112319f, + 0.064733f, -0.082869f, 0.269999f, -0.408184f, -0.183443f, 0.180608f, + 0.223345f, -0.357376f, -0.244593f, 0.355348f, -0.072701f, -0.034311f, + 0.096544f, 0.016407f, 0.417550f, -0.367772f, -0.484535f, 0.405977f, + 0.314243f, -0.099622f, -0.192218f, -0.012780f, 0.434551f, -0.399047f, + -0.531499f, 
0.484513f, -0.691352f, 0.872823f, 1.207720f, -1.377490f, + 0.006872f, -0.041453f, 0.007845f, 0.007463f, 0.467299f, -0.476372f, + -0.452606f, 0.452357f, 0.447332f, -0.365632f, -0.332435f, 0.300284f, + -0.290504f, 0.255410f, 0.310921f, -0.293717f, -0.616299f, 0.594207f, + 0.461347f, -0.449439f, 0.278455f, 0.285085f, -1.201340f, -0.016463f, + 0.549095f, 0.610375f, -4.608530f, -1.727390f, 0.150404f, -0.012846f, + -0.481148f, -0.182257f, 0.918796f, 0.213872f, 1.050410f, 0.681526f, + -0.458777f, -0.710395f, -2.347200f, -0.277197f, 0.213294f, 0.337551f, + -0.177710f, -0.152136f, 0.167666f, 0.308403f, -1.248500f, -0.565367f, + 0.122054f, 0.087874f, -0.476556f, -0.083548f, -0.358734f, -0.073131f, + -0.146320f, -2.241960f, 0.697639f, 0.545581f, -1.889700f, -0.267725f, + 0.433045f, 0.298224f, -0.338508f, 0.250226f, 0.405675f, 0.447201f, + -1.184690f, -0.473447f, 0.307403f, 0.711236f, -3.191560f, -1.663980f, + 0.165201f, 0.101360f, -0.624451f, -0.173269f, 0.089795f, 0.227478f, + -0.136664f, 0.007907f, 0.131079f, 0.605374f, -2.991620f, -1.723790f, + 0.082428f, 0.006781f, -0.348732f, -0.019271f, -0.032040f, -0.067078f, + -0.437166f, -0.144472f, 0.069844f, 0.194625f, -0.162284f, -0.374656f, + 0.056472f, -0.236524f, -0.114241f, -0.029161f, -0.222078f, -0.053435f, + -0.313938f, -0.555472f, 1.037550f, 0.689968f, 0.575694f, 0.065826f, + -0.659979f, -0.881351f, -0.626417f, -0.953975f, -0.576106f, -0.258708f, + 0.263004f, -0.229847f, 0.463835f, 1.390960f, -2.614480f, -1.272910f, + 0.065780f, -0.058603f, 0.015612f, 0.104703f, 0.198028f, 0.262792f, + 0.253616f, -0.079126f, -0.587381f, -0.739021f, -0.822676f, -0.795512f, + 0.193644f, 0.234643f, -0.034407f, 0.421478f, -0.572610f, -0.290714f, + -0.257803f, -0.644835f, -0.536938f, -0.375899f, -0.651077f, -0.522576f, + 0.562564f, 0.834616f, 0.513893f, 0.649689f, 0.356530f, 0.400716f, + 0.300606f, 0.290505f, 0.584608f, 0.671574f, 0.564584f, 0.419870f, + 0.062061f, 0.018263f, 0.009831f, 0.084103f, -0.128281f, -0.018818f, + -0.187244f, 0.067210f, 0.437147f, 0.442029f, 0.444939f, 0.226661f, + 0.541609f, 0.444280f, 0.302795f, 0.633026f, -0.180374f, 0.265197f, + 0.210404f, -0.118916f, -0.294013f, -0.692627f, -0.402347f, -0.356287f, + 0.387578f, 0.385496f, 0.789542f, 0.690396f, -0.203542f, -0.688546f, + 0.045319f, -0.448747f, -0.157148f, 0.152581f, 0.022360f, 0.058358f, + 0.593007f, 1.131860f, 0.289006f, 1.015560f, 0.144942f, -0.411577f, + 0.264794f, -0.085791f, 0.156996f, 0.200340f, 0.169264f, 0.267615f, + -0.361015f, -0.601842f, -0.442217f, -0.781086f, 0.112938f, 0.385305f, + 0.482454f, 0.470268f, 1.193390f, 0.589642f, 0.127638f, -0.640946f, + 0.540310f, 0.741498f, 0.686937f, 0.435879f, 0.534523f, 0.693119f, + 0.817577f, 0.783109f, 0.021681f, -0.004973f, 0.201236f, -0.086311f, + 0.028628f, 0.227871f, 0.462751f, 0.126832f, -0.389997f, -0.553965f, + -0.343953f, -0.448517f, 0.053129f, -0.115083f, 0.018138f, -0.067131f, + -0.293468f, -0.220700f, 0.074348f, -0.273153f, 0.263637f, 0.122049f, + 0.153025f, 0.076292f, 0.142320f, 0.286734f, 0.100542f, 0.308660f, + -0.759591f, -0.750938f, -0.788799f, -0.853076f, -0.588019f, -0.990063f, + -0.692327f, -0.722904f, 0.084736f, 0.151068f, 0.159606f, 0.147715f, + 1.610180f, 1.950330f, 1.765670f, 2.265110f, 0.008262f, 0.185584f, + 0.039337f, 0.164721f, 0.479446f, 0.314083f, 0.043969f, 0.291320f, + 0.003400f, -0.551190f, 0.060158f, -0.147591f, 0.089117f, 0.042994f, + 0.042802f, 0.127392f, -0.066172f, 0.078370f, 0.051408f, 0.014004f, + 0.086726f, 0.133334f, -0.046733f, 0.155100f, -0.118223f, -0.100778f, + -0.225245f, -0.460397f, 0.892644f, 
1.003770f, 0.405155f, 0.517477f, + 0.184585f, 0.279090f, -0.036477f, 0.198703f, 0.027139f, -0.055728f, + -0.022396f, -0.147319f, 2.275540f, 2.014990f, 2.296800f, 2.081730f, + -0.088713f, 0.105729f, -0.027871f, -0.095047f, 0.012429f, 0.014244f, + -0.014755f, -0.003017f, 1.332700f, 1.300040f, 1.464250f, 1.305030f, + 0.032568f, 0.118042f, 0.079632f, -0.089405f, 0.163905f, 0.146608f, + 0.026502f, 0.065307f, -0.056909f, -0.065052f, 0.069851f, -0.082958f, + 0.023419f, -0.026293f, 0.037616f, -0.048096f, -0.073701f, -0.208295f, + -0.782095f, 0.000523f, 0.374131f, 0.420946f, 0.466151f, 0.349651f, + -0.679275f, -0.745827f, -0.379918f, -0.900107f, 0.044070f, -0.347536f, + -1.224390f, 0.740113f, -0.779966f, 0.510920f, -0.968597f, -0.095630f, + 0.120805f, 0.676803f, -0.164827f, 0.172996f, -0.106720f, 0.197527f, + 0.337561f, 0.571094f, -0.279090f, -0.396697f, -0.253083f, -0.690170f, + -0.363291f, 0.516921f, 0.489391f, -0.920628f, 0.497572f, 0.483864f, + -0.125696f, -0.338123f, -0.041517f, -0.534630f, -0.388465f, -0.784554f, + 0.215227f, 0.055088f, 0.179638f, 0.086997f, 0.569313f, 0.572926f, + 0.137182f, -0.045485f, 0.118087f, 0.210383f, 0.212664f, 0.482443f, + 0.151921f, 0.307947f, -0.084656f, -0.386206f, 0.542277f, -0.207005f, + 0.073792f, -1.013240f, 0.303581f, 0.270527f, 0.265985f, 0.332702f, + 0.848609f, 0.686757f, 0.767212f, 0.316901f, -0.502460f, -0.567092f, + -0.484799f, -0.173350f, -0.426863f, 0.222375f, -0.200267f, -0.523758f, + 0.265180f, -0.175648f, -0.229754f, 0.148740f, 0.402515f, 0.028243f, + -0.366109f, 0.157232f, -0.131564f, 0.055136f, 0.211046f, -0.115542f, + 0.322379f, -0.137768f, -0.247832f, 0.070394f, 0.058530f, -0.295023f, + -0.196022f, -0.109097f, 0.261285f, -0.273585f, -0.240632f, 0.258326f, + -0.077364f, 0.071405f, -0.014766f, -0.008751f, -0.203622f, 0.177818f, + 0.116726f, -0.116735f, -0.723616f, -0.700154f, 0.145082f, -0.184949f, + -0.287076f, 0.150405f, 0.258075f, -0.157764f, -0.120909f, 0.105459f, + 0.113288f, -0.092963f, 0.328183f, -0.300115f, -0.361289f, 0.319792f, + -0.048875f, 0.135673f, 0.132539f, -0.162481f, 0.002109f, 0.065048f, + -0.135969f, 0.061558f, 1.510670f, -0.884925f, -0.827022f, 0.190311f, + -0.060088f, -0.033362f, 0.013354f, 0.002847f, 0.353479f, -0.462538f, + -0.319638f, 0.424484f, 0.199540f, -0.073843f, -0.140621f, 0.072133f, + -0.098662f, 0.070613f, 0.031150f, -0.021869f, -0.511253f, 0.503412f, + 0.565963f, -0.576146f, -1.081700f, 0.047670f, 0.266687f, 0.524804f, + -2.361150f, 0.147823f, 0.594717f, 0.956842f, -1.048220f, 0.127083f, + 0.079581f, 0.065419f, 0.176783f, 0.653953f, 0.260967f, 0.537892f, + -1.207580f, 0.245983f, -0.727067f, 0.071755f, -0.343025f, -0.173435f, + 0.215289f, 0.268578f, -1.158560f, 0.039263f, -0.132888f, 0.217132f, + -0.622195f, -0.071256f, 0.317333f, 0.157614f, -1.588250f, 0.316432f, + -0.736720f, -0.041698f, -1.959280f, 0.083451f, 0.570584f, 0.327620f, + -1.262200f, -0.026738f, 0.231198f, 0.326861f, -1.644200f, -0.143833f, + -0.079495f, 0.493026f, -2.488090f, -0.034046f, 0.165884f, 1.074260f, + -1.076980f, 0.248198f, -0.017987f, 0.421900f, -0.105860f, 0.076710f, + 0.002072f, 0.070264f, -1.734750f, 0.227145f, 0.209220f, 0.851459f, + -0.142369f, 0.066502f, 0.027816f, 0.044321f, -0.186591f, -0.100340f, + 0.115580f, 0.192252f, -0.892114f, 0.209531f, -0.308243f, 0.367968f, + -0.721770f, 0.220224f, -0.062744f, 0.133754f, 0.040416f, 0.190428f, + -0.035428f, 0.162974f, 0.116427f, 0.669393f, 0.278891f, 0.856676f, + 1.060390f, 0.936983f, 0.863355f, 0.990560f, -0.147111f, -0.217883f, + 0.355794f, -0.186530f, -0.275614f, -0.095719f, 0.167346f, 
0.359078f, + -0.079223f, -0.581596f, -0.213134f, -0.431123f, -0.516443f, -0.388628f, + -0.643821f, -0.202345f, 0.426230f, 0.516923f, 0.548131f, 0.555973f, + 0.022286f, 0.361170f, 0.980065f, 0.648400f, -0.056813f, -0.100310f, + -0.439481f, -0.166454f, 0.412449f, 0.509400f, 0.316208f, 0.470293f, + -0.827838f, -1.078380f, -1.047040f, -1.074560f, 0.274555f, -0.316736f, + 0.128818f, 0.228566f, -0.520967f, -0.731674f, -0.687887f, -0.536388f, + -0.031187f, 0.041404f, 0.047821f, 0.064397f, 0.054230f, 0.105059f, + -0.178671f, 0.176847f, -0.394797f, -0.260255f, -0.333734f, -0.162345f, + -0.444650f, -0.928438f, -0.705840f, -0.833162f, 0.306737f, 0.429699f, + 0.417298f, 0.478469f, 0.420903f, 0.676871f, 0.429677f, 0.616921f, + -0.805199f, -0.643391f, -0.304100f, 0.797599f, -0.172157f, 0.429085f, + -0.750676f, 0.149227f, -0.207898f, -0.022534f, -0.341448f, -0.247976f, + 0.095325f, -0.561120f, 0.599694f, -0.025236f, 0.292346f, -0.312001f, + 0.517478f, 0.301457f, -0.106415f, 0.226263f, -0.184163f, -0.114419f, + -0.322702f, 0.172541f, 0.445573f, 0.157213f, 0.670704f, 0.102174f, + -0.234667f, -0.293311f, 0.769852f, 0.038028f, -0.036741f, -0.228060f, + -0.253335f, 0.424054f, -0.597980f, 0.221007f, -0.114741f, -0.411557f, + -0.592201f, 0.442684f, 0.115491f, -0.106896f, -0.028110f, 0.354751f, + -0.248375f, 0.242570f, -0.155856f, 0.280528f, -0.198742f, 0.588725f, + 0.371065f, 0.078197f, 0.114706f, -0.448021f, 0.065255f, 0.133741f, + -0.227522f, -0.047339f, -0.052849f, 0.309480f, 0.597185f, 0.209182f, + 0.226108f, -0.601036f, -0.431672f, -0.172601f, -0.000174f, 0.194292f, + -0.133937f, 0.130676f, 0.059372f, 0.091381f, 0.098751f, -0.150996f, + 0.170514f, -0.085494f, 0.336576f, 0.484004f, 0.033862f, 0.277473f, + -0.231482f, -0.328385f, -0.332739f, -0.626957f, 0.510167f, 0.575861f, + 0.421494f, 0.482540f, -0.636377f, -0.864661f, -0.694180f, -0.420014f, + -0.132781f, 0.017599f, 0.003538f, 0.486934f, 0.133878f, -0.094622f, + 0.016132f, 0.010117f, 0.156680f, -0.022201f, -0.014621f, 0.228445f, + 0.190826f, 0.171580f, 0.579923f, 0.245428f, 0.322713f, 0.480101f, + 0.406320f, 0.412229f, 0.002334f, -0.022349f, 0.074571f, -0.043828f, + 0.290453f, 0.451749f, 0.530376f, 0.271879f, 0.095144f, 0.169450f, + 0.049482f, 0.114605f, -0.635634f, -0.700768f, -0.558538f, -0.537625f, + 0.190255f, -0.308237f, -0.053703f, 0.212489f, 0.056520f, -0.040019f, + 0.089822f, -0.014155f, -0.376004f, -0.448752f, -0.526717f, -0.571440f, + 0.116482f, 0.162321f, 0.147895f, 0.280527f, 0.159037f, -0.095958f, + 0.007931f, -0.086630f, 0.285625f, 0.514914f, 0.208908f, 0.519251f, + 0.309368f, 0.379777f, 0.350565f, 0.487487f, -0.541494f, -0.421836f, + -0.390001f, -0.500696f, -0.905736f, -0.150439f, -0.942304f, -0.566771f, + 0.484233f, 0.767417f, 0.410477f, 0.670196f, 0.070210f, 0.488836f, + 0.372805f, 0.197631f, 0.337892f, 0.524423f, 0.777219f, -0.260955f, + -0.112981f, -0.060088f, -0.200250f, -0.195671f, 0.007584f, 0.252096f, + 0.235511f, 0.366612f, -0.304979f, -0.211068f, -0.420683f, -0.085370f, + 0.085762f, -0.097549f, -0.802509f, -0.468079f, -0.192787f, -0.069670f, + -0.235162f, -0.077772f, -0.441671f, -0.348479f, -0.431434f, -0.108256f, + -0.133779f, 0.017032f, 0.001964f, -0.120647f, -0.187663f, -0.194985f, + -0.231742f, -0.175288f, -0.162639f, 0.245110f, 0.049951f, 0.104229f, + -0.159634f, -0.076545f, -0.022496f, -0.036532f, -0.147028f, -0.034215f, + 0.028213f, -0.059669f, -0.078259f, 0.062993f, -0.124066f, -0.137362f, + -0.129977f, -0.010532f, -0.049090f, -0.189401f, 0.495471f, 0.615778f, + 0.451437f, 0.803526f, 0.523532f, 0.841339f, 0.699528f, 
0.745129f, + 0.246264f, -0.198290f, -0.283620f, 0.189917f, -0.018306f, -0.419097f, + 0.280363f, -0.098085f, 0.138972f, -0.140867f, -0.117025f, 0.098585f, + 0.130979f, 0.268133f, -0.161731f, -0.176629f, -0.357677f, -0.126379f, + 0.553128f, -0.126821f, -0.001511f, -0.010081f, -0.031162f, 0.079203f, + -0.157731f, 0.072865f, 0.535830f, -0.529989f, -0.570075f, 0.295795f, + 0.595613f, -0.449278f, -0.669756f, 0.941452f, 0.356897f, -0.723720f, + -0.115203f, -0.134479f, 0.133048f, 0.109860f, -0.024250f, -0.049732f, + 0.020098f, 0.048356f, -0.048293f, 0.108754f, 0.062548f, -0.238315f, + 0.182700f, 0.312011f, -0.244377f, -0.118012f, 0.012276f, 0.006089f, + 0.098068f, -0.079280f, -0.423987f, -0.411931f, -0.027425f, 0.870280f, + 0.022825f, -0.024481f, -0.036320f, -0.111189f, 0.364539f, -0.244896f, + -0.373060f, 0.266345f, -0.141778f, 0.277549f, 0.059834f, -0.178242f, + -0.686222f, 0.594535f, 0.354546f, -0.272516f, 1.060730f, -1.059810f, + -0.948126f, 0.993267f, 0.116597f, -0.227574f, -0.436144f, -0.333309f, + -0.575746f, -0.828102f, 0.284561f, 0.351668f, -0.080164f, -0.762518f, + -0.511108f, -0.212855f, 0.293892f, -0.548664f, 0.072057f, 0.006748f, + 1.485110f, 0.124687f, 0.727211f, 1.557560f, -0.064383f, -0.022242f, + 0.002921f, -0.151505f, 0.270926f, 0.173632f, -0.640644f, 0.422410f, + -0.240699f, -0.361980f, -0.279864f, -0.055165f, -1.084140f, 0.231705f, + 0.366172f, -0.347698f, -0.097565f, -0.747227f, -0.243033f, 0.941545f, + -0.207460f, -0.353913f, 0.104303f, -0.403151f, 0.203177f, 0.335893f, + -0.229033f, 0.029096f, -0.409634f, -0.179599f, -0.442397f, 0.649114f, + 0.460774f, 0.170906f, -0.043857f, 0.402066f, -0.226896f, -0.199624f, + 0.016650f, 0.207894f, 0.056954f, 0.220329f, 0.374060f, 0.130361f, + -0.303960f, -0.078863f, 0.195410f, 0.729438f, 0.246818f, 0.287730f, + 0.484876f, 0.111488f, -0.168647f, -0.087878f, -0.070089f, -0.341329f, + -0.330280f, 0.259943f, -0.364205f, 0.256555f, -0.756804f, -0.086915f, + 0.777351f, 0.006136f, 0.110348f, 0.248743f, 0.209326f, -0.362741f, + -0.184416f, 0.422446f, 0.565193f, 0.310072f, -0.011212f, -0.765226f, + 0.039466f, 0.301288f, 0.172907f, -1.539450f, 0.606202f, 0.477469f, + 0.045894f, -0.222180f, -0.013192f, -0.064077f, -0.241551f, 0.192914f, + 0.028004f, -0.540538f, 0.437440f, 0.179087f, -0.753204f, -0.001374f, + 1.185930f, -0.151182f, 1.238580f, -1.389900f, 0.277954f, 0.422208f, + 0.041553f, -0.542284f, 0.139019f, -0.148580f, -0.130705f, 0.361830f, + 0.322953f, -0.092371f, 0.120180f, -0.355299f, -0.028057f, 0.128114f, + 0.250947f, -0.349926f, -0.684633f, 0.246175f, 0.186731f, -0.676313f, + 0.060535f, 0.333371f, -0.021172f, -0.421266f, -0.079650f, 0.031359f, + -0.303658f, -0.298286f, 0.119016f, 0.655585f, 0.200175f, -0.887182f, + -0.197539f, -0.318883f, -0.130250f, 0.522487f, -0.092616f, 0.405930f, + -0.281678f, 0.089728f, 0.081814f, -0.781745f, 0.348878f, 0.082274f, + -0.914136f, 1.098810f, 0.855321f, -1.078170f, -0.268018f, 0.246440f, + 0.238347f, -0.027228f, 0.074111f, -0.061197f, -0.063582f, 0.089462f, + -0.040347f, 0.117082f, 0.122772f, -0.162816f, -0.148668f, -0.342856f, + -0.495604f, -1.453630f, -0.045273f, -0.030463f, 0.043766f, 0.047978f, + 0.016910f, -0.009700f, 0.006288f, -0.042556f, 0.632896f, -0.845744f, + -0.516844f, 0.709439f, 0.486166f, -1.203050f, -0.978381f, 0.631876f, + 0.000705f, 0.123858f, -0.001187f, -0.172312f, -0.422668f, 0.241838f, + 0.437400f, -0.268186f, -0.513259f, 0.450209f, 0.542629f, -0.453810f, + -0.207119f, 0.072598f, 0.085066f, -0.018986f, -0.149512f, 0.149521f, + 0.182105f, -0.227200f, -0.363240f, 0.172670f, -0.502932f, 
0.689256f, + 0.093760f, -0.090207f, -0.066803f, 0.056759f, -0.002243f, -0.050662f, + -0.059324f, 0.152943f, -0.701150f, 0.712540f, 0.660349f, -0.654970f, + 0.351772f, -0.303383f, -0.311177f, 0.247653f, 0.013035f, 0.034648f, + -0.137832f, 0.041197f, 0.410265f, 0.345129f, 0.653338f, 0.047050f, + 0.140399f, 0.018613f, -0.012431f, -0.113632f, -0.029928f, 0.051564f, + -0.031349f, 0.151944f, -0.160340f, 0.326798f, -0.458067f, 0.636235f, + 0.243184f, 0.514072f, 2.414450f, 1.421980f, -0.001474f, -0.141389f, + -0.104817f, -0.141882f, -0.026395f, 0.053014f, 0.143885f, -0.207774f, + -0.563846f, -0.242514f, -0.436574f, -0.456796f, -0.520646f, 0.282550f, + -0.684924f, 0.061105f, -0.315884f, -0.392624f, 0.009805f, -0.256597f, + -0.146732f, 0.331039f, 0.362342f, 0.270851f, 0.067679f, -0.071331f, + -0.222423f, 0.081286f, -0.208192f, -0.193816f, -0.008201f, -0.309340f, + 0.167556f, 0.106071f, 0.172254f, -0.163790f, -0.142205f, -0.043182f, + 0.096145f, 0.145037f, -0.066015f, -0.073194f, 0.132237f, -0.088522f, + -0.044292f, -0.487128f, 0.033389f, -0.573548f, 0.185449f, 0.273593f, + 0.147503f, 0.457049f, -0.021539f, 0.090786f, 0.009147f, 0.000899f, + 0.018088f, 0.115791f, -0.079165f, 0.139388f, +}; + +static const float weights_layer_2[] = { + 0.153048f, 0.112901f, 0.136781f, 0.154580f, 0.091610f, 0.045165f, + 0.088490f, 0.116991f, -0.463766f, -0.596567f, -0.567008f, -0.630565f, + 0.141874f, 0.095726f, 0.175427f, 0.145027f, -0.969824f, -1.018190f, + -1.073300f, -1.041130f, -0.070545f, -0.123600f, -0.114967f, -0.169453f, + -0.267458f, -0.147730f, -0.161419f, -0.164894f, -0.117508f, -0.204389f, + -0.122695f, -0.163107f, -0.003903f, -0.030470f, -0.037433f, -0.059568f, + 0.138243f, 0.091019f, 0.160372f, 0.141650f, -0.544565f, -0.620004f, + -0.504503f, -0.429979f, -0.099491f, -0.096384f, -0.155265f, -0.188536f, + 0.084923f, 0.038345f, 0.066706f, 0.122083f, 0.267087f, 0.184419f, + 0.261478f, 0.255746f, -0.245894f, -0.114980f, -0.193880f, -0.227785f, + 0.087536f, 0.095712f, 0.106105f, 0.099353f, -0.059473f, -0.173247f, + -0.202386f, -0.076010f, 0.125928f, 0.100793f, 0.119638f, 0.129623f, + 0.136593f, 0.102984f, 0.156550f, 0.140558f, 0.122524f, 0.051596f, + 0.084164f, 0.123630f, 0.072542f, 0.096063f, 0.083236f, 0.087630f, + 0.025900f, 0.023738f, 0.036385f, 0.053077f, -0.029501f, 0.010544f, + -0.010026f, -0.051268f, 0.086302f, 0.109909f, 0.101385f, 0.127513f, + -0.031869f, 0.005340f, -0.056267f, -0.032955f, 0.032748f, 0.023162f, + 0.092118f, -0.001780f, -0.123612f, -0.183433f, -0.202377f, -0.317516f, + 0.129052f, 0.208112f, 0.145582f, 0.175502f, 0.018476f, 0.036349f, + 0.072417f, 0.061194f, 0.086985f, 0.117086f, 0.072465f, 0.129068f, + 0.020182f, 0.052114f, 0.017878f, 0.010478f, -0.001381f, -0.034644f, + 0.025135f, -0.037748f, 0.004973f, 0.024778f, 0.041816f, 0.032111f, + 0.080268f, 0.124998f, 0.105719f, 0.177047f, -0.072114f, -0.011864f, + -0.076846f, -0.089840f, 0.069993f, 0.089362f, 0.088035f, 0.120621f, + 0.065916f, 0.100946f, -0.006784f, -0.007751f, 0.122039f, 0.126482f, + 0.078629f, 0.140299f, 0.074034f, 0.092464f, 0.089798f, 0.108968f, + 0.075729f, 0.057128f, 0.013570f, 0.021195f, 0.068901f, 0.054022f, + 0.029781f, 0.031404f, -0.209998f, -0.208731f, -0.198310f, -0.212454f, + -0.579168f, -0.490190f, -0.607567f, -0.520541f, 0.083863f, 0.056612f, + 0.030366f, 0.061790f, -0.004874f, -0.057203f, -0.060429f, -0.049145f, + 0.080086f, 0.138602f, 0.223796f, 0.133279f, -0.495954f, -0.612093f, + -0.545393f, -0.562310f, 0.070672f, 0.037702f, 0.139013f, 0.080192f, + -0.111387f, -0.048165f, 0.074359f, -0.042125f, 0.113633f, 
0.106579f, + 0.042633f, 0.102734f, -0.068220f, 0.128423f, -0.181821f, -0.013260f, + -0.108563f, -0.138667f, -0.109304f, -0.131909f, -0.168667f, -0.126870f, + -0.132533f, -0.167096f, -0.184741f, -0.140890f, -0.125361f, -0.150632f, + 0.309013f, 0.364376f, 0.361102f, 0.271566f, 0.116552f, 0.091160f, + 0.096846f, 0.095954f, 0.046972f, 0.080489f, 0.028766f, -0.012223f, + 0.071379f, 0.041535f, -0.000668f, 0.033698f, -0.013493f, -0.027535f, + -0.025804f, -0.012267f, -0.097465f, -0.099232f, -0.208863f, -0.225201f, + -0.475608f, 0.077358f, -0.002872f, 0.163890f, -0.420298f, 0.072114f, + 0.121601f, -0.016727f, 0.573853f, -0.080196f, 0.193053f, 0.053012f, + -0.454179f, 0.058563f, 0.067265f, 0.141154f, 0.412541f, 0.086933f, + 0.030407f, -0.030413f, 0.478757f, -0.097731f, 0.277072f, -0.086393f, + 0.552604f, -0.334201f, 0.091765f, -0.270262f, -1.395060f, 0.271837f, + -0.005335f, 0.240499f, 0.175442f, -0.326329f, -0.019353f, -0.270338f, + -0.459273f, 0.096183f, 0.153046f, 0.135818f, 0.759028f, -0.177673f, + -0.099966f, 0.103363f, 0.697289f, -0.234184f, -0.048706f, -0.116099f, + -0.282575f, 0.025655f, -0.184759f, 0.040658f, -0.558267f, 0.214087f, + -0.095620f, 0.200522f, 0.278996f, 0.031959f, 0.122936f, -0.209196f, + -0.308217f, 0.092917f, 0.113269f, 0.136274f, -0.037046f, 0.017263f, + -0.194183f, 0.089133f, -0.161244f, 0.042799f, 0.030557f, 0.153545f, + -0.355048f, 0.070928f, -0.152852f, 0.102875f, -0.193649f, 0.007916f, + -0.062952f, 0.050602f, 0.073671f, 0.143045f, -5.978970f, -7.013850f, + 0.058713f, 0.076116f, 0.026445f, -0.056599f, -0.005966f, 0.032234f, + 0.006753f, -0.024528f, 0.120308f, 0.179939f, -6.624630f, -7.638680f, + 0.026359f, 0.020758f, 0.194274f, 0.051489f, -0.008491f, -0.028248f, + -0.061328f, -0.134423f, -0.103951f, -0.110877f, 0.042263f, 0.127016f, + 0.012473f, -0.008595f, 0.031357f, 0.087476f, -0.084022f, -0.015590f, + -0.313546f, 0.120072f, 0.123880f, 0.162148f, -6.596560f, -7.358830f, + 0.004797f, -0.003415f, 0.048455f, 0.026737f, -0.103702f, 0.034416f, + -0.003475f, -0.236827f, 0.005378f, 0.048413f, 0.054612f, -0.079359f, + 0.043707f, 0.001085f, 0.023380f, 0.007785f, 0.025938f, -0.052856f, + -0.033421f, 0.022643f, 0.034161f, 0.127681f, -5.019490f, -5.233580f, + -0.128630f, 0.087741f, -0.239834f, -0.377876f, 0.128082f, 0.142730f, + -0.086819f, -0.350927f, 0.089849f, 0.155776f, -6.155120f, -5.721720f, + 0.056110f, 0.008761f, 0.045579f, 0.016762f, -0.134076f, -0.101551f, + -0.096058f, -0.117146f, 0.003527f, -0.056942f, -0.005578f, 0.071287f, + 0.023776f, -0.028003f, -0.075390f, -0.191160f, -0.089672f, -0.104372f, + -0.104750f, -0.080813f, -0.249824f, -0.124479f, -0.243593f, -0.244284f, + -0.554911f, -0.549095f, -0.564693f, -0.475107f, -0.121771f, -0.143441f, + -0.171170f, -0.120920f, 0.109831f, 0.079708f, 0.327295f, 0.308907f, + -0.178785f, -0.428316f, -0.418882f, -0.366750f, -0.139296f, -0.129645f, + -0.081237f, -0.101533f, -0.006256f, -0.146756f, -0.322110f, -0.338865f, + -0.306085f, -0.319592f, -0.454803f, -0.363560f, -0.018557f, 0.006605f, + -0.131198f, -0.077708f, 0.138160f, 0.119611f, 0.271098f, 0.232168f, + 0.027812f, 0.035390f, -0.202503f, -0.091172f, -0.142020f, -0.159929f, + -0.106404f, -0.107433f, -0.381743f, -0.353222f, -0.484159f, -0.469926f, + -0.234659f, -0.315674f, -0.178327f, -0.213485f, -0.096207f, -0.190944f, + -0.118917f, -0.161288f, 0.015996f, 0.060737f, 0.051390f, 0.060876f, + 0.229289f, 0.282418f, 0.250945f, 0.197273f, 0.045131f, -0.008305f, + 0.072024f, 0.044547f, -0.050010f, 0.055504f, 0.001343f, -0.014445f, + 0.254909f, 0.309091f, 0.228249f, 0.274843f, 
0.089778f, -0.046581f, + 0.072714f, 0.126814f, -0.048931f, -0.045743f, -0.151333f, -0.004490f, + 0.179966f, 0.058150f, -0.178622f, -0.088159f, -0.074416f, -0.005821f, + -0.011799f, -0.002225f, -0.069361f, -0.098937f, -0.081575f, -0.034796f, + 0.253792f, 0.301039f, 0.219163f, 0.256027f, 0.058007f, -0.041431f, + 0.040674f, 0.009019f, -0.099670f, -0.099077f, -0.039437f, 0.017946f, + 0.060717f, 0.045796f, 0.109664f, 0.032138f, -0.071094f, 0.023697f, + 0.011335f, -0.030465f, 0.068677f, 0.039345f, -0.045078f, 0.084037f, + 0.135517f, 0.190417f, 0.175578f, 0.155286f, -0.044505f, 0.010826f, + 0.006717f, -0.134715f, 0.068022f, 0.110095f, 0.079966f, 0.034481f, + 0.185804f, 0.188273f, 0.227283f, 0.135935f, 0.033447f, 0.031571f, + -0.014766f, -0.024565f, 0.021792f, 0.017675f, -0.001333f, -0.040069f, + -0.049384f, -0.045256f, -0.014013f, -0.000107f, -0.096928f, -0.111495f, + -0.051225f, -0.060449f, 0.071446f, 0.017294f, -0.004822f, 0.006932f, + 0.020884f, 0.089425f, 0.061097f, -0.038708f, -0.184029f, -0.089541f, + -0.158035f, -0.214607f, -0.377947f, -0.318586f, -0.336977f, -0.323908f, + 0.181612f, 0.140018f, 0.233524f, 0.193366f, -0.254507f, -0.271902f, + -0.197144f, -0.119539f, 0.042162f, 0.000320f, 0.014708f, -0.014228f, + -0.081119f, -0.089326f, 0.001763f, 0.081009f, -0.142618f, -0.160650f, + -0.214597f, -0.202143f, -0.053495f, -0.012819f, -0.071468f, -0.010883f, + 0.072570f, 0.071507f, 0.091045f, 0.083155f, -0.271237f, -0.289211f, + -0.272345f, -0.299411f, 0.031697f, -0.029795f, -0.030045f, -0.013604f, + -0.106843f, -0.045212f, -0.122459f, -0.096936f, 0.059793f, 0.006157f, + 0.028092f, 0.040589f, -0.014560f, -0.008975f, -0.051404f, -0.014309f, + -0.016883f, 0.018332f, 0.040114f, 0.050348f, 0.044921f, -0.002445f, + -0.112396f, 0.014395f, 0.115160f, 0.145350f, -0.166814f, -0.121449f, + 0.155573f, -0.099446f, -0.161661f, 0.187251f, 0.004711f, 0.024318f, + -0.060871f, -0.028311f, -0.098274f, 0.322030f, -0.069242f, -0.153173f, + -0.227428f, -0.293965f, 0.228491f, 0.111413f, -1.354720f, -0.344235f, + 0.866715f, 0.872344f, 0.078789f, -0.384865f, 0.162388f, 0.109018f, + -0.191549f, -0.002638f, 0.305053f, 0.087337f, 0.066506f, -0.055810f, + -0.010984f, -0.056160f, -0.114617f, -0.058478f, 0.022059f, -0.124368f, + -0.130989f, 0.369432f, -0.248898f, -0.003955f, -0.021578f, 0.115991f, + -0.114163f, -0.065232f, 0.339857f, -0.225997f, 0.006282f, -0.125395f, + 0.235082f, -0.347785f, 0.662321f, -0.529182f, 0.153297f, -0.001326f, + -0.026725f, -0.024677f, -0.088065f, -0.116127f, 0.080896f, 0.212542f, + 0.208421f, 0.032047f, -0.211395f, 0.074997f, 0.096659f, 0.096423f, + -0.078643f, 0.106556f, -0.123860f, 0.075609f, 0.066008f, -0.097275f, + -1.000020f, -0.780154f, -0.856922f, -0.964007f, 0.083135f, -0.018922f, + -0.266214f, -0.151480f, 0.051538f, 0.017802f, 0.066774f, -0.021341f, + -0.869494f, -0.935252f, -0.895836f, -0.853871f, -0.160490f, 0.085850f, + -0.029670f, -0.056675f, 0.159989f, 0.166872f, 0.129970f, 0.194377f, + 0.153294f, 0.199593f, 0.037692f, 0.103391f, 0.029335f, -0.085324f, + -0.079326f, -0.077216f, 0.501561f, 0.366168f, 0.330196f, 0.296432f, + -0.977282f, -0.844295f, -1.014870f, -1.098990f, -0.099858f, -0.129552f, + 0.090051f, -0.013378f, 0.081330f, 0.194911f, 0.286501f, 0.177363f, + -0.148250f, -0.111700f, -0.243081f, -0.102918f, 0.161069f, -0.012655f, + -0.071722f, -0.020329f, -0.077828f, -0.041716f, 0.109247f, 0.062229f, + -0.759722f, -0.742756f, -0.563713f, -0.631187f, 0.005911f, 0.268154f, + -0.263769f, 0.087149f, -0.163623f, -0.359600f, -0.464577f, -0.369352f, + -0.515784f, -0.475822f, -0.523485f, 
-0.649813f, -0.112419f, -0.029285f, + 0.021061f, -0.041515f, 0.149133f, -0.254428f, 0.115776f, -0.061892f, + 0.103675f, -0.283363f, 0.005005f, 0.022034f, -0.178454f, 0.035836f, + -0.113702f, -0.217823f, 0.209407f, -0.296257f, 0.187976f, -0.157370f, + -0.127190f, 0.251780f, 0.055633f, 0.294111f, -0.067773f, 0.467190f, + -0.192625f, -0.071084f, -0.445284f, 0.511090f, -0.319728f, 0.267971f, + 0.494929f, -0.586727f, 0.454543f, -0.520675f, -0.085900f, 0.325989f, + -0.131006f, -0.069501f, 0.199927f, -0.218919f, 0.170055f, -0.106538f, + 0.133312f, 0.127629f, -0.561625f, 0.595666f, -0.090927f, 0.363348f, + -0.249246f, 0.063068f, -0.016458f, -0.291045f, -0.040509f, 0.017866f, + 0.304871f, -0.459214f, 0.214390f, -0.238740f, -0.456541f, 0.545848f, + -0.218026f, 0.202475f, 0.128490f, -0.036417f, 0.173885f, -0.049385f, + 0.235514f, -0.132587f, -0.015066f, 0.164638f, 0.196873f, -0.125330f, + 0.216912f, -0.109398f, 0.121602f, -0.209374f, 0.164400f, -0.123049f, + 0.195520f, -0.212932f, -0.015180f, -0.005784f, 0.049726f, -5.822150f, + 0.124536f, 0.040689f, -0.018560f, -3.155020f, 0.014690f, 0.076202f, + -0.154008f, 1.070630f, -0.071606f, 0.051026f, 0.138285f, -5.836340f, + 0.162173f, 0.085890f, -0.186166f, 0.093221f, 0.019240f, -0.017053f, + -0.090144f, 0.236254f, -0.125344f, 0.056235f, -0.089813f, -0.252281f, + -0.127406f, -0.155088f, 0.009972f, -0.066449f, 0.044222f, 0.025943f, + -0.164921f, 0.165463f, -0.001132f, -0.038386f, 0.115194f, -5.757100f, + 0.163386f, 0.061226f, 0.024626f, 0.132750f, 0.107279f, -0.001622f, + -0.107860f, -0.356009f, -0.138935f, -0.145173f, -0.061198f, -0.646138f, + 0.034279f, 0.078187f, 0.108138f, -0.490444f, 0.074719f, 0.034984f, + -0.109303f, 0.741785f, -0.066939f, 0.015558f, 0.114229f, -4.001080f, + 0.130772f, 0.044675f, -0.165162f, -0.274810f, -0.042987f, -0.048579f, + 0.156603f, -1.288370f, 0.076198f, 0.035065f, 0.032043f, -5.002520f, + 0.086900f, -0.010886f, 0.030850f, -0.782259f, 0.056211f, -0.097759f, + 0.118988f, 0.106638f, 0.091419f, 0.079920f, 0.062325f, 0.097116f, + 0.126035f, 0.122530f, -0.278299f, -0.083314f, -0.300563f, -0.197946f, + 0.081664f, 0.089925f, 0.074754f, 0.074628f, 0.102338f, 0.088845f, + 0.105841f, 0.102381f, 0.003087f, 0.061599f, 0.098326f, 0.040119f, + -0.005298f, -0.028834f, 0.059938f, -0.013668f, -0.585882f, -0.631436f, + -0.742673f, -0.736666f, 0.025071f, 0.066851f, 0.075046f, 0.091360f, + 0.099045f, 0.098261f, 0.106413f, 0.099487f, -0.016742f, -0.097334f, + -0.086152f, -0.212444f, -0.028043f, -0.007362f, 0.003914f, -0.055864f, + 0.034756f, 0.081361f, 0.080183f, 0.061319f, 0.193396f, 0.173716f, + 0.207765f, 0.231701f, -0.074565f, -0.073257f, -0.086470f, -0.083114f, + 0.081489f, 0.078477f, 0.033452f, 0.058835f, -0.069665f, -0.031691f, + -0.111255f, -0.167754f, 0.184179f, 0.174673f, 0.160288f, 0.190893f, + 0.110930f, 0.103495f, 0.098408f, 0.102918f, 0.053764f, 0.089994f, + 0.140308f, 0.124867f, 0.074176f, 0.117460f, -0.160775f, -0.144132f, + -0.099373f, -0.035913f, 0.081237f, 0.062247f, -0.166421f, 0.062125f, + 0.276479f, 0.060955f, 0.066627f, 0.455347f, 0.219953f, 0.109912f, + 0.273931f, 0.233153f, 0.102236f, 0.447606f, -0.352243f, 0.499236f, + -0.931206f, 0.248595f, 0.254047f, 0.061542f, 0.268804f, 0.309517f, + -0.084414f, -0.245828f, -0.144882f, -0.296579f, -0.091628f, -0.142202f, + -0.541764f, -0.407470f, 0.053481f, 0.238955f, 0.150188f, -0.060598f, + 0.196118f, -0.215617f, -0.086238f, -0.263420f, 0.206877f, 0.241788f, + -0.122544f, -0.448790f, 0.286917f, 0.112063f, -0.268408f, -0.041770f, + 0.089161f, 0.355811f, -0.078245f, -0.148490f, 
-0.407301f, -1.296870f, + -0.633421f, 0.124253f, 0.275402f, 0.223048f, 0.077016f, 0.160766f, + 0.115374f, 0.061053f, -0.231872f, -0.515052f, -0.278331f, -0.235912f, + -0.416372f, -0.284106f, -0.055942f, 0.110698f, -0.428288f, -0.298137f, + -0.018101f, 0.102677f, -0.019639f, 0.013479f, 0.038549f, 0.048682f, + 0.128684f, 0.116416f, 0.044852f, 0.008133f, 0.061597f, 0.083582f, + 0.014953f, 0.063716f, -0.155318f, -0.061732f, 0.084855f, 0.129505f, + 0.068249f, 0.193775f, -0.088631f, -0.446398f, -0.075710f, -0.061327f, + 0.278715f, 0.540366f, 0.618715f, 0.538374f, -0.037843f, 0.062370f, + -0.033184f, 0.119901f, -0.008641f, -0.064789f, 0.087498f, 0.043486f, + 0.247085f, 0.419992f, 0.299935f, 0.234276f, 0.089283f, 0.070357f, + 0.068888f, 0.134311f, 0.109823f, 0.072431f, 0.081676f, 0.091366f, + -1.707980f, -2.213110f, -2.149930f, -1.556870f, 0.226598f, 0.191675f, + 0.192207f, 0.159566f, -0.070194f, -0.136070f, -0.015172f, -0.204272f, + -0.162191f, -0.043313f, -0.158007f, -0.227210f, 0.040398f, 0.043014f, + 0.039439f, -0.035439f, 0.245558f, 0.439691f, 0.219659f, 0.138210f, + -0.048129f, 0.004954f, -0.102860f, -0.185376f, 0.035548f, 0.006821f, + 0.079199f, 0.032901f, 0.039218f, 0.068113f, 0.023075f, -0.037582f, + 0.225181f, 0.164562f, 0.106718f, 0.032684f, 0.013402f, 0.018797f, + 0.076606f, 0.046512f, -0.070024f, 0.099921f, -0.051231f, 0.074167f, + 0.173313f, 0.220212f, 0.142665f, 0.069809f, -0.195130f, -0.007912f, + -0.006764f, -0.063687f, 0.306374f, 0.402035f, 0.273759f, 0.449469f, + 0.114597f, 0.210745f, 0.355326f, 0.271307f, -0.109943f, -0.171912f, + -0.070726f, -0.128932f, 0.138770f, 0.164971f, 0.308516f, 0.332536f, + 0.081537f, 0.096939f, 0.054136f, 0.052226f, 0.109489f, 0.010223f, + 0.168072f, -0.106279f, 0.525568f, 0.704816f, 0.588942f, 0.473398f, + 0.149497f, 0.120835f, 0.080049f, 0.151340f, -0.182038f, -0.191091f, + -0.196505f, -0.198309f, -0.801819f, -1.441620f, -1.107780f, -1.025650f, + 0.035750f, 0.018049f, -0.029033f, -0.067255f, 0.192049f, 0.009664f, + -0.043741f, 0.051557f, 0.082815f, 0.069547f, -0.073379f, 0.010584f, + 0.192128f, 0.208586f, 0.141904f, 0.100763f, 0.046183f, 0.044776f, + -0.033611f, -0.005812f, 0.012966f, 0.030301f, 0.100665f, 0.103641f, + -0.294776f, -0.361573f, -0.420156f, -0.388743f, 0.239287f, 0.191975f, + 0.089644f, 0.117591f, 0.069563f, 0.021480f, 0.100287f, 0.174159f, + -0.013571f, 0.090960f, 0.010232f, -0.034760f, -0.077205f, 0.060632f, + -0.145527f, -0.391110f, -0.143052f, -0.236448f, -0.103902f, -0.188463f, + 0.071311f, -0.080171f, 0.021987f, 0.041767f, -0.419487f, -0.515479f, + -0.205470f, -0.732132f, 0.150901f, 0.107202f, 0.156307f, 0.143672f, + 0.474682f, 0.178137f, 0.150063f, 0.414515f, 0.559891f, 0.697019f, + 0.541231f, 0.505310f, -0.478101f, -0.444267f, -0.586539f, -0.445996f, + -0.451873f, -0.530085f, -0.447980f, -0.364955f, 0.372435f, 0.318894f, + 0.351211f, 0.193961f, 0.212295f, 0.212842f, 0.220003f, 0.243743f, + -0.388628f, -0.789620f, -0.536618f, -0.430691f, 0.247004f, 0.266489f, + 0.261033f, 0.263692f, 0.050089f, 0.048958f, 0.065207f, 0.120180f, + -0.526230f, -0.481969f, -0.422411f, -0.272292f, 0.155593f, 0.229614f, + 0.139579f, 0.171805f, -0.251924f, -0.302067f, -0.126157f, -0.346650f, + -1.195450f, -1.281100f, -0.987911f, -1.478440f, 0.285667f, 0.284802f, + 0.301887f, 0.259556f, -0.194127f, -0.090440f, -0.257959f, -0.259572f, + -0.012273f, -0.049993f, -0.099431f, 0.012506f, 0.081526f, 0.166279f, + 0.042594f, 0.185121f, 0.148830f, 0.073161f, 0.201728f, 0.125747f, + -0.295065f, -0.187585f, -0.333066f, -0.312291f, 0.253458f, 0.321585f, + 0.178844f, 
0.219944f, -0.763475f, -0.943374f, -0.816825f, -0.709901f, + -0.166132f, 0.129186f, 0.015405f, -0.065623f, -0.246006f, -0.340385f, + -0.118155f, -0.384905f, -0.233883f, -0.400666f, -0.228597f, -0.228428f, + -0.559083f, -0.377784f, -0.541458f, -0.542870f, 0.067400f, 0.122987f, + 0.180901f, 0.186004f, -0.482910f, -0.424823f, -0.477831f, -0.394719f, + 0.091558f, 0.049248f, 0.049370f, 0.160429f, 0.133641f, 0.096625f, + 0.104429f, 0.100782f, -0.238252f, -0.221459f, -0.196974f, -0.250393f, + -3.071750f, -2.418450f, -0.861410f, -1.051580f, 0.071263f, 0.118014f, + -0.028430f, -0.072073f, -0.074463f, 0.034168f, 0.044089f, -0.091109f, + -3.153840f, -2.945850f, -1.977360f, -1.498850f, -0.083429f, 0.131835f, + -0.063865f, -0.065785f, -0.069346f, -0.015520f, -0.119551f, 0.044881f, + -0.105280f, 0.127516f, 0.005255f, -0.142777f, 0.061055f, -0.117250f, + 0.020454f, 0.157879f, -0.213812f, -0.151783f, 0.028583f, 0.137759f, + -3.248250f, -3.005940f, -1.510540f, -1.475390f, 0.081874f, -0.171465f, + -0.135690f, -0.001989f, -0.227574f, -0.132799f, -0.359742f, -0.137197f, + 0.066324f, 0.039194f, -0.050857f, 0.095166f, 0.044475f, 0.011221f, + 0.054904f, 0.061414f, -0.039189f, 0.123751f, -0.017171f, -0.008494f, + -2.598220f, -2.832670f, -1.622030f, -1.201990f, 0.154313f, -0.021436f, + 0.042190f, 0.143947f, -0.090623f, 0.086853f, 0.143137f, 0.099821f, + -1.732820f, -1.429730f, -0.775125f, -0.648036f, 0.082176f, 0.079448f, + -0.040575f, 0.024511f, -0.064105f, -0.117122f, -0.190323f, -0.182589f, + -0.076430f, -0.095615f, -0.112513f, -0.101581f, 0.143037f, 0.148180f, + 0.430958f, 0.359225f, 0.001403f, -0.080541f, -0.295001f, -0.156706f, + 0.426623f, 0.475597f, 0.455210f, 0.454352f, 0.074365f, 0.099440f, + 0.066348f, -0.007078f, 0.008335f, -0.097116f, -0.133687f, -0.110535f, + 0.204145f, 0.281478f, 0.078886f, 0.112857f, -0.103620f, -0.068247f, + 0.191147f, 0.227593f, -0.011816f, -0.058755f, -0.149477f, -0.101828f, + 0.079878f, 0.304949f, 0.557555f, 0.305288f, -0.150955f, -0.118610f, + 0.052073f, 0.064707f, -0.121728f, -0.151132f, -0.193987f, -0.175046f, + 0.043655f, 0.105270f, -0.120715f, -0.040976f, 0.047776f, -0.004443f, + 0.149606f, 0.111240f, -0.047502f, -0.064146f, -0.151858f, -0.151872f, + -0.160207f, -0.113846f, -0.081585f, -0.006708f, -0.203760f, -0.068597f, + -0.179979f, -0.127779f, -0.062460f, -0.064513f, -0.121479f, -0.111122f, + -0.212384f, -0.229157f, -0.283428f, -0.184891f, +}; + +static const float weights_layer_3[] = { + -0.039388f, 0.033048f, -0.113003f, -0.011642f, 0.170478f, 0.145713f, + 0.040189f, -0.280129f, -0.049050f, -0.043788f, -0.157425f, 0.323829f, + -0.250725f, -0.166349f, 0.101650f, -0.049690f, 0.205606f, 0.281131f, + 0.623204f, 0.993452f, -0.015115f, -0.138995f, 0.009473f, 0.157673f, + -0.024687f, -0.067214f, 0.125566f, -0.317619f, 0.057002f, 0.031202f, + -0.018167f, 0.068542f, 0.011609f, -0.020233f, -0.000428f, -0.035956f, + -0.843274f, -0.800587f, -0.214917f, -0.221250f, 0.031255f, -0.077330f, + -0.074902f, -0.063979f, -0.055562f, 0.679495f, 0.146609f, 1.315330f, + -0.118399f, -0.034539f, -0.050377f, 0.172867f, -0.204607f, -0.034930f, + 0.176014f, 0.089747f, -0.003889f, 0.044980f, 0.002386f, -0.141723f, + -0.035828f, -0.204701f, 0.099813f, 0.123580f, 0.209851f, -0.110989f, + -0.043655f, -0.461118f, -0.139664f, 0.026855f, -0.081714f, 0.207623f, + 0.089942f, 0.253082f, 0.680568f, 0.811360f, -0.090528f, -0.116818f, + -0.432361f, -0.075588f, -0.269924f, -0.276810f, -0.289192f, -0.282570f, + 0.245566f, 0.267216f, 0.238622f, 0.286528f, -0.157605f, -0.200401f, + -0.138924f, -0.185006f, 
0.215203f, 0.203316f, 0.209532f, 0.293135f, + 0.928046f, 0.733323f, -0.094120f, 0.036918f, -0.126643f, -0.083371f, + -0.147530f, -0.153195f, 0.097097f, 0.101852f, 0.109160f, 0.105129f, + -0.051869f, -0.064359f, -0.073469f, -0.059591f, 0.102431f, 0.109444f, + 0.113614f, 0.105617f, 0.383311f, 0.325783f, 0.393234f, 0.382508f, + 0.194720f, 0.189672f, 0.217477f, 0.177786f, 0.326461f, 0.114789f, + 0.317061f, 0.048291f, -0.061143f, -0.134641f, -0.067895f, -0.108446f, + 0.082592f, 0.029918f, -0.006580f, 0.015533f, -0.053583f, -0.055540f, + -0.063395f, -0.023157f, -0.064955f, -0.073981f, -0.115452f, -0.086626f, + -0.036616f, 0.008454f, 0.012029f, -0.008039f, -0.207395f, -0.216419f, + -0.205363f, -0.249099f, 0.343308f, 0.413215f, -0.009918f, -0.109978f, + -0.059711f, -0.045089f, -0.029130f, -0.038483f, -0.070323f, -0.099409f, + -0.008849f, -0.063527f, 0.175963f, 0.185335f, 0.149151f, 0.199997f, + -0.027516f, -0.039812f, -0.027760f, -0.047910f, -0.007337f, 0.071065f, + 0.086225f, 0.125539f, 0.151390f, 0.215488f, 0.203450f, 0.045380f, + 0.095761f, 0.107809f, 0.103918f, 0.122383f, 0.116287f, 0.135455f, + 0.115446f, 0.155673f, -0.044648f, -0.027455f, -0.015473f, -0.026657f, + 0.089852f, 0.077459f, 0.077631f, 0.082507f, -0.102761f, -0.054669f, + -0.132223f, -0.024768f, 0.111573f, 0.060467f, 0.107883f, 0.056621f, + 0.219357f, -0.161153f, 0.074379f, -0.118743f, -0.169931f, -0.153995f, + -0.220003f, -0.200186f, 0.032318f, -0.060687f, -0.087550f, -0.038022f, + 0.026633f, -0.005534f, 0.029532f, 0.027081f, 0.011926f, 0.058412f, + 0.010631f, 0.003068f, -0.014911f, 0.063070f, 0.065271f, 0.089550f, + 0.012885f, 0.005320f, -0.037494f, -0.019849f, -0.009624f, -0.059090f, + -0.021222f, -0.088033f, -0.055261f, -0.055113f, -0.047598f, -0.055478f, + -0.023648f, -0.046827f, -0.036572f, -0.057655f, 0.104194f, 0.179800f, + 0.175751f, 0.192851f, -0.016950f, -0.073650f, -0.028592f, -0.088219f, + 0.011130f, 0.061825f, 0.025643f, 0.034183f, 0.095548f, 0.001457f, + -0.132869f, 0.032981f, -0.140178f, -0.105343f, -0.161799f, -0.161983f, + 0.177746f, 0.132903f, 0.135627f, 0.152489f, -0.012532f, -0.068747f, + -0.085849f, -0.095434f, 0.087037f, 0.139497f, 0.111899f, 0.100189f, + -0.024649f, -0.092003f, 0.020783f, -0.115807f, 0.092039f, 0.093943f, + 0.109466f, 0.049639f, -0.133727f, 0.128430f, -0.050546f, 0.190632f, + 0.123733f, 0.082305f, 0.114878f, 0.122572f, 0.201618f, 0.137588f, + 0.065582f, 0.125161f, -0.095179f, -0.120719f, -0.127126f, -0.101961f, + -0.118120f, -0.104833f, -0.179632f, -0.131764f, -0.138096f, -0.147861f, + -0.131512f, -0.153905f, -0.201816f, -0.206641f, -0.196707f, -0.160013f, + -0.212605f, -0.093998f, -0.186258f, -0.076137f, -0.065340f, -0.006969f, + -0.071383f, -0.075005f, +}; + +static const float weights_layer_4[] = { + -0.016102f, -0.022836f, 0.624049f, 0.273485f, 0.222800f, -0.290175f, + -0.518415f, 0.413484f, -0.264495f, 0.498083f, -0.450145f, -0.106419f, + 0.095103f, -0.187451f, 0.145933f, -0.371542f, -0.088871f, 0.184017f, + -0.429625f, -0.110882f, 0.292781f, 0.289588f, 0.185127f, 0.326017f, + -0.432009f, -0.342663f, -0.312206f, 0.004004f, -1.114290f, 0.028497f, + -0.264944f, -0.419611f, 0.046336f, 0.138232f, -0.869528f, 0.425557f, + -0.954838f, -0.186830f, -0.464622f, -0.757107f, -0.432686f, -0.125978f, + -0.402633f, -0.172266f, -0.041749f, -0.822238f, -0.118486f, 0.238617f, + -0.198037f, 0.146347f, 0.405257f, 0.513303f, -0.078876f, -0.300385f, + -0.010293f, -0.183962f, 0.155738f, 0.186797f, -0.086814f, 0.000179f, + 0.123467f, 0.362523f, 0.068805f, 0.371834f, 0.038122f, -0.117867f, + -0.120445f, 
-0.422322f, -0.131402f, 0.285449f, 0.038957f, 0.008844f, + -0.020197f, 0.187723f, 0.190433f, 0.146532f, -0.091068f, -0.270865f, + -0.194231f, -0.226777f, 0.013548f, 0.248351f, 0.537685f, 0.056316f, + -0.171540f, -0.003865f, 0.406439f, 0.126507f, 0.192780f, 0.149335f, + -0.149602f, 0.255202f, -0.015426f, 0.032335f, -1.791330f, -0.894602f, + -0.196641f, -0.282846f, -0.391100f, -0.040969f, 0.049934f, 0.056348f, + -0.041426f, -0.075159f, -0.658335f, -0.827270f, -0.175029f, -0.427235f, + 0.311201f, 0.560413f, 0.363408f, 0.374580f, -0.433531f, -0.180580f, + 0.142142f, 0.194768f, -0.054118f, -0.376541f, -0.366185f, -0.308782f, + -0.273143f, -0.074097f, 0.009000f, -0.182198f, -0.015616f, -0.003882f, + -0.174340f, -0.354866f, 0.527972f, 0.348355f, 0.091381f, -0.419828f, + -0.530529f, 0.159899f, -0.511867f, -0.104237f, -0.286079f, -0.659039f, + -0.266596f, -0.256557f, -0.600437f, -0.446333f, -0.229629f, 0.024931f, + -0.143716f, -0.415754f, -0.003760f, -0.107195f, -0.666165f, -0.697312f, + -0.650255f, -0.703877f, 0.243402f, 0.426710f, 0.217210f, 0.260255f, + 0.027416f, 0.163147f, 0.132188f, 0.142374f, 0.558627f, 0.065717f, + 0.382781f, -1.192240f, 0.195492f, 0.028439f, 0.278252f, -0.491806f, + 0.497701f, -0.448835f, -0.245079f, -0.014336f, -0.174907f, -0.409633f, + 0.207548f, 0.433813f, 0.459889f, 0.431728f, 0.605050f, 0.485520f, + 0.218548f, 0.437307f, 0.027023f, -0.204251f, 0.012100f, 0.150677f, + -1.097980f, 0.086866f, -1.293130f, -0.372575f, -0.876264f, -0.021818f, + 0.322864f, -0.231043f, -0.271608f, 0.132782f, -0.314895f, 0.396800f, + 0.262788f, -0.317212f, -0.666308f, 0.830742f, 0.319409f, -0.564373f, + -0.178656f, 0.306993f, 0.265634f, -0.332480f, -0.491514f, -0.186745f, + -0.063044f, -0.009321f, 0.074944f, -0.372082f, -0.029479f, 0.081548f, + 0.028172f, -0.233148f, -0.337938f, -0.087695f, 0.596556f, 0.559530f, + 0.139332f, 0.107223f, -0.190915f, 0.137401f, -0.150625f, -0.225484f, + -0.191344f, -0.232535f, 0.126510f, 0.296323f, -0.547901f, -0.653080f, + 0.358514f, 0.726289f, -0.421725f, -0.243620f, 0.236206f, 0.390823f, + -0.076560f, -0.282329f, -0.012460f, -0.428484f, 0.349469f, 0.394629f, + 0.421537f, 0.219632f, -0.117550f, -0.087894f, 0.077155f, 0.016000f, + -0.289137f, -0.092937f, -0.014518f, -0.027111f, 0.210329f, -0.159678f, + 0.013288f, -0.039268f, 0.008112f, 0.003152f, 0.030084f, -0.039859f, + 0.322028f, -0.407797f, 0.447087f, -0.381562f, 0.529297f, -0.520298f, + 0.562865f, -0.616878f, 0.689389f, 0.754262f, 0.138475f, 0.750697f, + -0.760157f, -0.383740f, 0.074219f, 0.556257f, 0.087827f, -0.511826f, + -0.305507f, -0.638214f, 0.114833f, -0.444022f, 0.526612f, -0.604984f, + -0.100415f, 0.037824f, -0.106264f, 0.337615f, 0.070743f, 0.031129f, + 0.281954f, 0.176144f, -0.032833f, -0.073902f, -0.285492f, -0.803803f, + -0.015589f, 0.186077f, -0.033351f, 0.517269f, -1.878800f, -1.685210f, + -0.416581f, 0.158476f, -0.071929f, -0.624353f, -0.122069f, -0.075065f, + 0.311816f, 0.506305f, 0.383896f, 0.259450f, -0.308232f, -0.094221f, + -0.421885f, -0.293573f, +}; + +static const float weights_layer_5[] = { + 0.131894f, 0.078431f, 0.323121f, -0.230680f, -0.684740f, 0.020895f, + 0.364983f, 0.121656f, 0.132448f, -0.731198f, 0.071148f, 0.739642f, + 0.318437f, -0.033021f, -1.037080f, 0.135335f, 0.383582f, 0.287332f, + 0.054042f, -0.825482f, 0.418533f, 0.305606f, 0.041549f, 0.432422f, + -0.826878f, -0.593536f, 0.105657f, 0.125357f, 0.408567f, -0.293338f, + 0.233905f, -0.039609f, 0.547727f, -0.435806f, 0.036160f, 0.220275f, + -0.020337f, -0.619403f, -0.455858f, 0.681455f, 0.543846f, -0.495084f, + 
0.251496f, -0.085686f, 0.091395f, -0.476696f, 0.453628f, -0.109663f, + 0.383493f, -0.456563f, -0.212935f, 0.020567f, -0.719564f, -0.377813f, + -0.737511f, 0.765965f, 0.624309f, -0.063679f, -0.055681f, -0.475969f, + -0.069902f, 0.725690f, 0.641094f, 0.439922f, -0.111544f, -0.309061f, + 0.280091f, 0.381416f, 0.481168f, 0.483543f, -0.901267f, -0.499230f, + 0.043449f, -0.372395f, 0.021216f, -0.002200f, -0.524089f, -0.071485f, + -0.273974f, -0.462654f, 0.042369f, -0.138679f, -0.330060f, 0.021886f, + -0.306075f, -0.011130f, -0.260224f, -0.288435f, -0.104039f, -0.183563f, + 0.118990f, -0.531160f, 0.339632f, -0.028374f, 0.159084f, -0.008824f, + -0.791388f, 0.245242f, 0.356510f, 0.469867f, -0.396949f, -0.476146f, + -0.168472f, 1.068400f, 0.474629f, -0.117554f, -0.142453f, -0.306604f, + 0.348525f, -0.111929f, -0.435384f, 0.019952f, -0.260185f, 0.373376f, + 0.109729f, -0.639168f, 0.033392f, -0.082573f, -0.196018f, 0.301637f, + -0.124210f, -0.202515f, -1.221920f, -0.253690f, -0.144864f, 0.287753f, + -0.161206f, -0.213246f, 0.373968f, 0.141397f, -0.248237f, 0.283090f, + -0.008977f, -0.172960f, -0.234146f, -0.720014f, -0.322451f, 0.181083f, + 0.310659f, -0.422646f, -0.719994f, -0.354339f, 0.352739f, 0.230923f, + 0.427013f, -0.660316f, 0.232140f, 0.685896f, 0.660208f, 0.225748f, + -0.918750f, -0.650790f, -0.674525f, -0.450305f, -0.152529f, 0.498480f, + 0.895092f, 0.688242f, 0.669057f, 0.612669f, 0.593484f, 0.318204f, + -0.169294f, 0.388789f, -0.529777f, -0.219706f, -0.044916f, 0.161697f, + -0.145288f, 0.196153f, -0.022212f, -0.434209f, -0.208115f, -0.117745f, + -0.279029f, -0.009506f, 0.137474f, 0.330148f, 0.439258f, 0.345879f, + -0.845131f, -0.215713f, 0.094463f, 0.638604f, 0.882254f, -0.964082f, + -0.383920f, 0.292645f, 0.266341f, 0.747473f, -0.645631f, -0.538896f, + -0.319764f, 0.521880f, 0.460091f, -0.470898f, -0.778283f, -0.061622f, + -0.142433f, 0.210520f, 0.804197f, 0.285840f, -0.138414f, -0.381846f, + -0.499991f, 0.223648f, 0.439025f, 0.321508f, -0.099560f, -0.622893f, + 0.750925f, 0.740994f, 0.140405f, 0.074631f, -0.270223f, -0.829049f, + -0.753355f, -0.258015f, 0.006285f, -0.730573f, -1.107390f, -0.538015f, + -1.005520f, -0.724115f, -0.440183f, -0.395239f, 0.508768f, 0.204620f, + -0.267331f, 0.001740f, -0.838709f, 0.659333f, 0.043739f, -0.024099f, + 0.262431f, 0.252433f, -0.265215f, 0.057289f, -0.428192f, -0.114350f, + -0.011475f, 0.463995f, 0.668833f, -0.604556f, -0.122780f, -0.441645f, + 0.145769f, 0.310450f, -1.003500f, 0.936069f, 0.516604f, -0.643386f, + -0.518571f, 0.306130f, 0.337387f, 0.583400f, -0.366025f, -0.560035f, + -0.262332f, 0.465242f, 0.964332f, -0.545410f, -0.637428f, -0.202695f, + 0.378931f, 0.834604f, 0.000970f, -0.553303f, -0.562879f, 0.221665f, + 0.395160f, 0.446281f, -0.184394f, -0.591780f, 0.170595f, 1.164390f, + 0.227068f, -0.150910f, -0.393690f, -0.131151f, 0.309956f, -0.413518f, + -0.768334f, -0.548975f, 0.245384f, -0.256904f, -0.514790f, -0.102616f, + -0.347625f, 0.420456f, 0.037804f, -0.283200f, -0.578815f, 0.319282f, + 0.674622f, -0.011791f, -0.339329f, 0.466705f, 0.563444f, 0.409660f, + 0.445784f, -0.899507f, -0.605116f, 0.622438f, 0.427385f, -0.062509f, + 0.666570f, 0.057105f, 0.357894f, -0.811016f, -0.421715f, -0.458397f, + 0.288955f, 0.005857f, 0.236331f, 0.107957f, 0.587276f, -0.375800f, + 0.323799f, -0.623363f, 0.254122f, -0.198478f, -0.098436f, -0.282531f, + 0.452453f, -0.163349f, -0.413382f, -0.448732f, -0.528770f, -0.457449f, + -0.619619f, -0.265919f, -0.042760f, 0.438730f, 0.501798f, -0.403851f, + 0.519564f, 0.817314f, 0.366203f, 0.492610f, 0.546929f, 
0.853094f, + 0.289000f, 0.453941f, -0.076152f, 0.007226f, -0.183717f, -0.506252f, + -0.599989f, -0.576006f, 0.746488f, 0.631466f, -0.475599f, -0.334991f, + -0.879614f, 0.918957f, 0.473471f, -0.043781f, -0.688234f, -0.925875f, + -0.188081f, 0.050918f, 0.116855f, 0.221413f, -0.066680f, -0.674395f, + -0.481985f, 0.247368f, 0.271129f, 0.637979f, -1.006970f, -0.855441f, + 0.144874f, 0.507424f, 1.506960f, -0.338910f, 0.398203f, 0.738000f, + 0.263193f, -0.425908f, 0.358271f, -1.072900f, -0.816209f, -0.425519f, + 0.264373f, 0.694014f, 0.036333f, 0.635532f, 0.518856f, 0.047585f, + -0.854817f, -0.138202f, 0.006811f, -0.052020f, -0.468498f, 0.489080f, + -0.105778f, 0.357038f, -0.782875f, 0.649049f, -0.562652f, -0.544392f, + -0.328526f, -0.402121f, -0.263172f, -0.668459f, -0.526702f, -0.395829f, + 0.190986f, 0.307766f, -1.001830f, -0.293051f, 0.283334f, 0.572450f, + 0.906095f, -1.144300f, 0.180989f, 0.421092f, 0.684571f, 0.527276f, + -0.122287f, 0.575067f, 0.675221f, 0.755029f, 0.094957f, 0.481403f, + 0.825155f, 0.755035f, 0.641420f, 0.034497f, 0.518783f, 0.283800f, + 0.293733f, -0.074778f, -0.268720f, 0.798921f, 0.317714f, -0.236391f, + -0.375071f, -0.414600f, 0.223413f, -0.349044f, -0.191033f, -0.391779f, + -0.596894f, -0.378608f, -0.185920f, -0.822171f, -0.754962f, -0.167706f, + 0.755378f, 0.671847f, 0.969414f, 0.793048f, 1.078610f, -0.418963f, + 0.367648f, 0.217645f, 0.294232f, 0.113027f, 0.060312f, -0.327488f, + -0.305035f, -0.243600f, -0.020588f, -0.326324f, -0.417534f, -0.425868f, + -0.404614f, -0.346750f, -0.339145f, -0.348094f, -0.527290f, -0.617825f, + -0.258342f, -0.200753f, -0.249779f, -0.321039f, -0.023117f, -0.004167f, + -0.206788f, -0.612420f, -0.646428f, -0.548969f, -0.158875f, 0.213814f, + -0.084040f, -0.217365f, -0.511895f, -0.653285f, 0.440971f, 0.455591f, + -0.123900f, 0.134097f, -0.251241f, 0.682463f, 0.740614f, 0.991212f, + 0.565984f, 0.592690f, +}; + +static INLINE float32x4_t add_f32x4_x4(const float32x4_t a[4]) { + float32x4_t sum01 = vaddq_f32(a[0], a[1]); + float32x4_t sum23 = vaddq_f32(a[2], a[3]); + return vaddq_f32(sum01, sum23); +} + +static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 2 && filter_width == 2); + assert(skip_width == 2 && skip_height == 2); + assert(in_width >= 16); + const int in_size = in_height * in_width; + + do { + const float32x4_t bias_v = vdupq_n_f32(bias[0]); + const float *weight_ptr0 = weights; + const float *in_ptr0 = *input; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + const float *weight_ptr1 = weight_ptr0; + const float *in_ptr2 = in_ptr1; + int k = 0; + float32x4_t sum0[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + float32x4_t sum1[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + + do { + const float32x4_t weights0 = vld1q_f32(weight_ptr1); + const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); + const float32x2_t weights0_lo = vget_low_f32(weights0); + const float32x2_t weights0_hi = vget_high_f32(weights0); + const float32x2_t weights1_lo = vget_low_f32(weights1); + const float32x2_t weights1_hi = vget_high_f32(weights1); + + const float32x4x2_t 
in0_lo_0 = vld2q_f32(in_ptr2); + const float32x4x2_t in0_hi_0 = vld2q_f32(in_ptr2 + in_stride); + const float32x4x2_t in1_lo_0 = vld2q_f32(in_ptr2 + in_size); + const float32x4x2_t in1_hi_0 = + vld2q_f32(in_ptr2 + in_size + in_stride); + + sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[0], weights0_lo, 0); + sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[1], weights0_lo, 1); + + sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[0], weights0_hi, 0); + sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[1], weights0_hi, 1); + + sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[0], weights1_lo, 0); + sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[1], weights1_lo, 1); + + sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[0], weights1_hi, 0); + sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[1], weights1_hi, 1); + + const float32x4x2_t in0_lo_1 = vld2q_f32(in_ptr2 + 8); + const float32x4x2_t in0_hi_1 = vld2q_f32(in_ptr2 + in_stride + 8); + const float32x4x2_t in1_lo_1 = vld2q_f32(in_ptr2 + in_size + 8); + const float32x4x2_t in1_hi_1 = + vld2q_f32(in_ptr2 + in_size + in_stride + 8); + + sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[0], weights0_lo, 0); + sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[1], weights0_lo, 1); + + sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[0], weights0_hi, 0); + sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[1], weights0_hi, 1); + + sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[0], weights1_lo, 0); + sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[1], weights1_lo, 1); + + sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[0], weights1_hi, 0); + sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[1], weights1_hi, 1); + + weight_ptr1 += 8; + in_ptr2 += 2 * in_size; + k += 2; + } while (k < in_channels); + + vst1q_f32(out_ptr1, add_f32x4_x4(sum0)); + vst1q_f32(out_ptr1 + 4, add_f32x4_x4(sum1)); + + out_ptr1 += 8; + in_ptr1 += 8 * skip_width; + w += 8 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++bias; + ++output; + weights += in_channels * filter_height * filter_width; + } while (++start_idx < out_channels); +} + +static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 2 && filter_width == 2); + assert(skip_width == 2 && skip_height == 2); + assert(in_width == 8); + const int in_size = in_height * in_width; + do { + const float32x4_t bias_v = vdupq_n_f32(*bias); + const float *weight_ptr0 = weights; + const float *in_ptr0 = *input; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + const float *weight_ptr1 = weight_ptr0; + const float *in_ptr2 = in_ptr1; + int k = 0; + float32x4_t sum[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + + do { + const float32x4_t weights0 = vld1q_f32(weight_ptr1); + const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); + const float32x2_t weights0_lo = vget_low_f32(weights0); + const float32x2_t weights0_hi = vget_high_f32(weights0); + const float32x2_t weights1_lo = vget_low_f32(weights1); + const float32x2_t weights1_hi = vget_high_f32(weights1); + + const 
float32x4x2_t in0_lo = vld2q_f32(in_ptr2); + const float32x4x2_t in0_hi = vld2q_f32(in_ptr2 + in_stride); + const float32x4x2_t in1_lo = vld2q_f32(in_ptr2 + in_size); + const float32x4x2_t in1_hi = vld2q_f32(in_ptr2 + in_size + in_stride); + + sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[0], weights0_lo, 0); + sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[1], weights0_lo, 1); + + sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[0], weights0_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[1], weights0_hi, 1); + + sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[0], weights1_lo, 0); + sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[1], weights1_lo, 1); + + sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[0], weights1_hi, 0); + sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[1], weights1_hi, 1); + + weight_ptr1 += 8; + in_ptr2 += 2 * in_size; + k += 2; + } while (k < in_channels); + + vst1q_f32(out_ptr1, add_f32x4_x4(sum)); + + out_ptr1 += 4; + in_ptr1 += 4 * skip_width; + w += 4 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++bias; + ++output; + weights += in_channels * filter_height * filter_width; + } while (++start_idx < out_channels); +} + +static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 5 && filter_width == 5); + assert(skip_width == 4 && skip_height == 4); + assert(in_width >= 16); + assert(in_channels == 1); + (void)in_channels; + + do { + const float32x4_t bias_v = vdupq_n_f32(*bias); + const float *in_ptr0 = *input; + const float *weights_ptr0 = weights; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + float32x4_t sum[2] = { bias_v, vdupq_n_f32(0) }; + + const float32x4_t weight_0_3 = vld1q_f32(weights_ptr0); + const float32x4_t weight_4_7 = vld1q_f32(weights_ptr0 + 4); + const float32x4_t weight_8_11 = vld1q_f32(weights_ptr0 + 8); + const float32x4_t weight_12_15 = vld1q_f32(weights_ptr0 + 12); + const float32x4_t weight_16_19 = vld1q_f32(weights_ptr0 + 16); + const float32x4_t weight_20_23 = vld1q_f32(weights_ptr0 + 20); + + const float32x2_t weight_0_3_lo = vget_low_f32(weight_0_3); + const float32x2_t weight_0_3_hi = vget_high_f32(weight_0_3); + const float32x2_t weight_4_7_lo = vget_low_f32(weight_4_7); + const float32x2_t weight_4_7_hi = vget_high_f32(weight_4_7); + const float32x2_t weight_8_11_lo = vget_low_f32(weight_8_11); + const float32x2_t weight_8_11_hi = vget_high_f32(weight_8_11); + const float32x2_t weight_12_15_lo = vget_low_f32(weight_12_15); + const float32x2_t weight_12_15_hi = vget_high_f32(weight_12_15); + const float32x2_t weight_16_19_lo = vget_low_f32(weight_16_19); + const float32x2_t weight_16_19_hi = vget_high_f32(weight_16_19); + const float32x2_t weight_20_23_lo = vget_low_f32(weight_20_23); + const float32x2_t weight_20_23_hi = vget_high_f32(weight_20_23); + + const float32x4x4_t in0 = vld4q_f32(in_ptr1 + 0 * in_stride); + const float32x4x4_t in1 = vld4q_f32(in_ptr1 + 1 * in_stride); + const float32x4x4_t in2 = vld4q_f32(in_ptr1 + 2 * in_stride); + const float32x4x4_t in3 = vld4q_f32(in_ptr1 + 3 
* in_stride); + const float32x4x4_t in4 = vld4q_f32(in_ptr1 + 4 * in_stride); + + const float32x4_t in0_4 = vextq_f32( + in0.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 0 * in_stride)), 1); + const float32x4_t in1_4 = vextq_f32( + in1.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 1 * in_stride)), 1); + const float32x4_t in2_4 = vextq_f32( + in2.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 2 * in_stride)), 1); + const float32x4_t in3_4 = vextq_f32( + in3.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 3 * in_stride)), 1); + const float32x4_t in4_4 = vextq_f32( + in4.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 4 * in_stride)), 1); + + // Kernel row 0. + sum[0] = vmlaq_lane_f32(sum[0], in0.val[0], weight_0_3_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0.val[1], weight_0_3_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in0.val[2], weight_0_3_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0.val[3], weight_0_3_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in0_4, weight_4_7_lo, 0); + + // Kernel row 1. + sum[1] = vmlaq_lane_f32(sum[1], in1.val[0], weight_4_7_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in1.val[1], weight_4_7_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in1.val[2], weight_4_7_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in1.val[3], weight_8_11_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in1_4, weight_8_11_lo, 1); + + // Kernel row 2. + sum[0] = vmlaq_lane_f32(sum[0], in2.val[0], weight_8_11_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in2.val[1], weight_8_11_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in2.val[2], weight_12_15_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in2.val[3], weight_12_15_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in2_4, weight_12_15_hi, 0); + + // Kernel row 3. + sum[1] = vmlaq_lane_f32(sum[1], in3.val[0], weight_12_15_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in3.val[1], weight_16_19_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in3.val[2], weight_16_19_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in3.val[3], weight_16_19_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in3_4, weight_16_19_hi, 1); + + // Kernel row 4. + sum[0] = vmlaq_lane_f32(sum[0], in4.val[0], weight_20_23_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in4.val[1], weight_20_23_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in4.val[2], weight_20_23_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in4.val[3], weight_20_23_hi, 1); + sum[0] = vmlaq_f32(sum[0], vdupq_n_f32(*(weights_ptr0 + 24)), in4_4); + + vst1q_f32(out_ptr1, vaddq_f32(sum[0], sum[1])); + + out_ptr1 += 4; + in_ptr1 += 4 * skip_width; + w += 4 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++output; + ++bias; + weights += 25; + } while (++start_idx < out_channels); +} + +// Neon variant of av1_cnn_convolve_no_maxpool_padding_valid_c(). +// As per the current encoder, av1_cnn_convolve function gets called for +// block size equal to 64x64. av1_cnn_convolve() uses layer config values +// set by av1_intra_mode_cnn_partition_cnn_config. The following are a few +// details related to each layer's config parameters. +// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht +// 0 64x64 16x16 5 5 4 4 +// 1 16x16 8x8 2 2 2 2 +// 2 8x8 4x4 2 2 2 2 +// 3 4x4 2x2 2 2 2 2 +// 4 2x2 1x1 2 2 2 2 +// Here, +// filter_wd = filter_width and filter_ht = filter_height, +// skip_wd = skip_width and skip_ht = skip_height. 
+void av1_cnn_convolve_no_maxpool_padding_valid_neon( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, + int start_idx, int cstep, int channel_step) { + assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) || + !layer_config->maxpool); + assert(layer_config->filter_height > 1 || layer_config->filter_width > 1); + assert(layer_config->pad == PADDING_VALID); + assert(channel_step == 1); + assert(cstep == layer_config->in_channels * layer_config->out_channels); + + if (layer_config->filter_width == 5 && layer_config->filter_height == 5 && + layer_config->skip_width == 4 && layer_config->skip_height == 4) { + av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights_layer_5); + } else if (layer_config->filter_width == 2 && + layer_config->filter_height == 2 && + layer_config->skip_width == 2 && layer_config->skip_height == 2) { + const float *weights = weights_layer_1; + if (layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[2].output_num) { + weights = weights_layer_2; + } else if ((layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[3] + .output_num)) { + weights = weights_layer_3; + } else if ((layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[4] + .output_num)) { + weights = weights_layer_4; + } + if (in_width >= 16) { + av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights); + } else if (in_width == 8) { + av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights); + } else { + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, channel_step); + } + } else { + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c new file mode 100644 index 0000000000..582863a27c --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c @@ -0,0 +1,646 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> +#include <assert.h> +#include <math.h> + +#include "config/aom_config.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/encodetxb.h" + +void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = height + TX_PAD_HOR; + memset(levels - TX_PAD_TOP * stride, 0, + sizeof(*levels) * TX_PAD_TOP * stride); + memset(levels + stride * width, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); + + const int32x4_t zeros = vdupq_n_s32(0); + int i = 0; + uint8_t *ls = levels; + const tran_low_t *cf = coeff; + if (height == 4) { + do { + const int32x4_t coeffA = vld1q_s32(cf); + const int32x4_t coeffB = vld1q_s32(cf + height); + const int16x8_t coeffAB = + vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB)); + const int16x8_t absAB = vqabsq_s16(coeffAB); + const int8x8_t absABs = vqmovn_s16(absAB); +#if AOM_ARCH_AARCH64 + const int8x16_t absAB8 = + vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros))); + const uint8x16_t lsAB = + vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros)); +#else + const int32x2x2_t absAB8 = + vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros)); + const uint8x16_t lsAB = + vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1])); +#endif + vst1q_u8(ls, lsAB); + ls += (stride << 1); + cf += (height << 1); + i += 2; + } while (i < width); + } else if (height == 8) { + do { + const int16x8_t coeffAB = load_tran_low_to_s16q(cf); + const int16x8_t absAB = vqabsq_s16(coeffAB); + const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8( + vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros)))); + vst1q_u8(ls, absAB8); + ls += stride; + cf += height; + i += 1; + } while (i < width); + } else { + do { + int j = 0; + do { + const int16x8_t coeffAB = load_tran_low_to_s16q(cf); + const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8); + const int16x8_t absAB = vqabsq_s16(coeffAB); + const int16x8_t absCD = vqabsq_s16(coeffCD); + const uint8x16_t absABCD = vreinterpretq_u8_s8( + vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD))); + vst1q_u8((ls + j), absABCD); + j += 16; + cf += 16; + } while (j < height); + *(int32_t *)(ls + height) = 0; + ls += stride; + i += 1; + } while (i < width); + } +} + +// get_4_nz_map_contexts_2d coefficients: +static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = { + { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 }, + { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 } +}; + +// get_4_nz_map_contexts_hor coefficients: +/* clang-format off */ +#define SIG_COEF_CONTEXTS_2D_X4_051010 \ + (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \ + ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24)) +/* clang-format on */ + +// get_4_nz_map_contexts_ver coefficients: +static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = { + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 +}; + +// get_8_coeff_contexts_2d coefficients: +// if (width == 8) +static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = { + { 0, 1, 6, 6, 21, 21, 21, 21, 1,
6, 6, 21, 21, 21, 21, 21 }, + { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 } +}; +// if (width < 8) +static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = { + { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 }, + { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 } +}; + +// if (width > 8) +static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = { + { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 } +}; + +// get_4_nz_map_contexts_ver coefficients: +static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = { + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 +}; + +// get_16n_coeff_contexts_2d coefficients: +// real_width == real_height +static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = { + { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } +}; + +// real_width < real_height +static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = { + { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } +}; + +// real_width > real_height +static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = { + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 }, + { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, + { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } +}; + +// get_16n_coeff_contexts_hor coefficients: +static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = { + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 +}; + +// end of coefficients declaration area + +static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src, + const int byte_stride) { +#if AOM_ARCH_AARCH64 + uint32x4_t v_data = vld1q_u32((uint32_t *)src); + v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1); + v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2); + v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3); + + return vreinterpretq_u8_u32(v_data); +#else + return load_unaligned_u8q(src, byte_stride); +#endif +} + +static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src, + const int byte_stride) { +#if AOM_ARCH_AARCH64 + uint64x2_t v_data = vld1q_u64((uint64_t *)src); + v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1); + + return vreinterpretq_u8_u64(v_data); +#else + 
uint8x8_t v_data_low = vld1_u8(src); + uint8x8_t v_data_high = vld1_u8(src + byte_stride); + + return vcombine_u8(v_data_low, v_data_high); +#endif +} + +static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src, + const int byte_stride) { + (void)byte_stride; + return vld1q_u8(src); +} + +static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride, + const ptrdiff_t *const offsets, + uint8x16_t *const level) { + level[0] = load_8bit_4x4_to_1_reg(&src[1], stride); + level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride); + level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride); + level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride); + level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride); +} + +static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride, + const ptrdiff_t *const offsets, + uint8x16_t *const level) { + level[0] = load_8bit_8x2_to_1_reg(&src[1], stride); + level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride); + level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride); + level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride); + level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride); +} + +static INLINE void load_levels_16x1x5(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + uint8x16_t *const level) { + level[0] = load_8bit_16x1_to_1_reg(&src[1], stride); + level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride); + level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride); + level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride); + level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride); +} + +static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) { + const uint8x16_t const_3 = vdupq_n_u8(3); + const uint8x16_t const_4 = vdupq_n_u8(4); + uint8x16_t count; + + count = vminq_u8(level[0], const_3); + level[1] = vminq_u8(level[1], const_3); + level[2] = vminq_u8(level[2], const_3); + level[3] = vminq_u8(level[3], const_3); + level[4] = vminq_u8(level[4], const_3); + count = vaddq_u8(count, level[1]); + count = vaddq_u8(count, level[2]); + count = vaddq_u8(count, level[3]); + count = vaddq_u8(count, level[4]); + + count = vrshrq_n_u8(count, 1); + count = vminq_u8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const uint8x16_t pos_to_offset_large = vdupq_n_u8(21); + + uint8x16_t pos_to_offset = + (width == 4) ? 
vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]); + + uint8x16_t count; + uint8x16_t level[5]; + uint8_t *cc = coeff_contexts; + + assert(!(width % 4)); + + int col = width; + do { + load_levels_4x4x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + col -= 4; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + + const uint8x16_t pos_to_offset = + vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010)); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 4)); + + int col = width; + do { + load_levels_4x4x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 4)); + + int col = width; + do { + load_levels_4x4x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + uint8_t *cc = coeff_contexts; + uint8x16_t count; + uint8x16_t level[5]; + uint8x16_t pos_to_offset[3]; + + assert(!(width % 2)); + + if (width == 8) { + pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]); + pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]); + } else if (width < 8) { + pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]); + pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]); + } else { + pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]); + pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]); + } + pos_to_offset[2] = vdupq_n_u8(21); + + int col = width; + do { + load_levels_8x2x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset[0]); + vst1q_u8(cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + col -= 2; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + + const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 2)); + + int col = width; + do { + load_levels_8x2x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void 
get_8_coeff_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0), + vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5)); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(width % 2)); + + int col = width; + do { + load_levels_8x2x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, + const int real_width, + const int real_height, + const int width, const int height, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + uint8_t *cc = coeff_contexts; + int col = width; + uint8x16_t pos_to_offset[5]; + uint8x16_t pos_to_offset_large[3]; + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(height % 16)); + + pos_to_offset_large[2] = vdupq_n_u8(21); + if (real_width == real_height) { + pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]); + pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]); + pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]); + pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]); + pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = + pos_to_offset_large[2]; + } else if (real_width < real_height) { + pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]); + pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]); + pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = + vld1q_u8(c_16_po_2d_g[2]); + pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; + } else { // real_width > real_height + pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]); + pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]); + pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]); + pos_to_offset[4] = pos_to_offset_large[2]; + pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16); + } + + do { + int h = height; + + do { + load_levels_16x1x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset[0]); + vst1q_u8(cc, count); + levels += 16; + cc += 16; + h -= 16; + pos_to_offset[0] = pos_to_offset_large[0]; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + pos_to_offset[2] = pos_to_offset[3]; + pos_to_offset[3] = pos_to_offset[4]; + pos_to_offset_large[0] = pos_to_offset_large[1]; + pos_to_offset_large[1] = pos_to_offset_large[2]; + levels += TX_PAD_HOR; + } while (--col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + + const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(height % 16)); + + int col = width; + do { + uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver); + + int h = height; + do { + load_levels_16x1x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset); + vst1q_u8(coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 16; + 
coeff_contexts += 16; + h -= 16; + } while (h); + + levels += TX_PAD_HOR; + } while (--col); +} + +static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + uint8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + + uint8x16_t pos_to_offset[3]; + uint8x16_t count; + uint8x16_t level[5]; + + assert(!(height % 16)); + + pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0); + pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5); + pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); + + int col = width; + do { + int h = height; + do { + load_levels_16x1x5(levels, stride, offsets, level); + count = get_coeff_contexts_kernel(level); + count = vaddq_u8(count, pos_to_offset[0]); + vst1q_u8(coeff_contexts, count); + levels += 16; + coeff_contexts += 16; + h -= 16; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += TX_PAD_HOR; + } while (--col); +} + +// Note: levels[] must be in the range [0, 127], inclusive. +void av1_get_nz_map_contexts_neon(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int last_idx = eob - 1; + if (!last_idx) { + coeff_contexts[0] = 0; + return; + } + + uint8_t *const coefficients = (uint8_t *const)coeff_contexts; + + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int stride = height + TX_PAD_HOR; + ptrdiff_t offsets[3]; + + /* coeff_contexts must be 16 byte aligned. */ + assert(!((intptr_t)coeff_contexts & 0xf)); + + if (tx_class == TX_CLASS_2D) { + offsets[0] = 0 * stride + 2; + offsets[1] = 1 * stride + 1; + offsets[2] = 2 * stride + 0; + + if (height == 4) { + get_4_nz_map_contexts_2d(levels, width, offsets, coefficients); + } else if (height == 8) { + get_8_coeff_contexts_2d(levels, width, offsets, coefficients); + } else { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coefficients); + } + } else if (tx_class == TX_CLASS_HORIZ) { + offsets[0] = 2 * stride; + offsets[1] = 3 * stride; + offsets[2] = 4 * stride; + if (height == 4) { + get_4_nz_map_contexts_hor(levels, width, offsets, coefficients); + } else if (height == 8) { + get_8_coeff_contexts_hor(levels, width, offsets, coefficients); + } else { + get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients); + } + } else { // TX_CLASS_VERT + offsets[0] = 2; + offsets[1] = 3; + offsets[2] = 4; + if (height == 4) { + get_4_nz_map_contexts_ver(levels, width, offsets, coefficients); + } else if (height == 8) { + get_8_coeff_contexts_ver(levels, width, offsets, coefficients); + } else { + get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients); + } + } + + const int bhl = get_txb_bhl(tx_size); + const int pos = scan[last_idx]; + if (last_idx <= (width << bhl) / 8) + coeff_contexts[pos] = 1; + else if (last_idx <= (width << bhl) / 4) + coeff_contexts[pos] = 2; + else + coeff_contexts[pos] = 3; +} diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c new file mode 100644 index 0000000000..aa64a38902 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c @@ -0,0 +1,2619 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "shift_neon.h" +#include "txfm_neon.h" + +static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in, + int32x4_t *out) { + // This is not quite the same as the other transposes defined in + // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is + // unused by the following row transform. + for (int j = 0; j < 8; ++j) { + for (int i = 0; i < 16; ++i) { + transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i); + } + } +} + +// A note on butterfly helper naming: +// +// butterfly_[weight_indices]_neon +// e.g. butterfly_0321_neon +// ^ Weights are applied as indices 0, 3, 2, 1 +// (see more detail below) +// +// Weight indices are treated as an index into the 4-tuple of the weight +// itself, plus related and negated constants: w=(w0, 1-w0, -w0, w0-1). +// This is then represented in the helper naming by referring to the lane index +// in the loaded tuple that each multiply is performed with: +// +// in0 in1 +// /------------ +// out0 | w[0] w[1] ==> out0 = in0 * w[0] + in1 * w[1] +// out1 | w[2] w[3] ==> out1 = in0 * w[2] + in1 * w[3] +// +// So for indices 0321 from the earlier example, we end up with: +// +// in0 in1 +// /------------------ +// out0 | (lane 0) (lane 3) ==> out0 = in0 * w0 + in1 * (w0-1) +// out1 | (lane 2) (lane 1) ==> out1 = in0 * -w0 + in1 * (1-w0) + +#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit) \ + do { \ + int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } }; \ + int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \ + x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2); \ + *out = vrshlq_s32(x, v_bit); \ + } while (false) + +static AOM_FORCE_INLINE void butterfly_0112_neon( + const int32_t *cospi, const int widx0, const int32x4_t n0, + const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, + const int32x4_t v_bit) { + int32x2_t w01 = vld1_s32(cospi + 2 * widx0); + butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit); + butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit); +} + +static AOM_FORCE_INLINE void butterfly_2312_neon( + const int32_t *cospi, const int widx0, const int32x4_t n0, + const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, + const int32x4_t v_bit) { + int32x2_t w01 = vld1_s32(cospi + 2 * widx0); + butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit); + butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit); +} + +static AOM_FORCE_INLINE void butterfly_0332_neon( + const int32_t *cospi, const int widx0, const int32x4_t n0, + const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, + const int32x4_t v_bit) { + int32x2_t w01 = vld1_s32(cospi + 2 * widx0); + butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit); + butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit); +} + +static AOM_FORCE_INLINE void 
butterfly_0130_neon( + const int32_t *cospi, const int widx0, const int32x4_t n0, + const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, + const int32x4_t v_bit) { + int32x2_t w01 = vld1_s32(cospi + 2 * widx0); + butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit); + butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit); +} + +static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon( + const int32_t *cospi, const int32x4_t n0, const int32x4_t n1, + int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { + int32x2_t w01 = vld1_s32(cospi + 2 * 32); + butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit); + butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit); +} + +static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon( + const int32_t *cospi, const int32x4_t n0, const int32x4_t n1, + int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { + int32x2_t w01 = vld1_s32(cospi + 2 * 32); + butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit); + butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit); +} + +static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input, + int32x4_t *output, + const int size) { + const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2); + int i = 0; + do { + const int32x4_t r1 = vmulq_s32(input[i], sqrt2); + output[i] = vrshrq_n_s32(r1, NewSqrt2Bits); + } while (++i < size); +} + +static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon( + const int32x4_t *input, int32x4_t *output, const int size) { + const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2); + int i = 0; + do { + const int32x4_t r0 = vrshrq_n_s32(input[i], 2); + const int32x4_t r1 = vmulq_s32(r0, sqrt2); + output[i] = vrshrq_n_s32(r1, NewSqrt2Bits); + } while (++i < size); +} + +#define LOAD_BUFFER_4XH(h) \ + static AOM_FORCE_INLINE void load_buffer_4x##h( \ + const int16_t *input, int32x4_t *in, int stride, int fliplr) { \ + if (fliplr) { \ + for (int i = 0; i < (h); ++i) { \ + int16x4_t a = vld1_s16(input + i * stride); \ + a = vrev64_s16(a); \ + in[i] = vshll_n_s16(a, 2); \ + } \ + } else { \ + for (int i = 0; i < (h); ++i) { \ + int16x4_t a = vld1_s16(input + i * stride); \ + in[i] = vshll_n_s16(a, 2); \ + } \ + } \ + } + +// AArch32 does not permit the argument to vshll_n_s16 to be zero, so need to +// avoid the expression even though the compiler can prove that the code path +// is never taken if `shift == 0`. +#define shift_left_long_s16(a, shift) \ + ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 
1 : (shift))) + +#define LOAD_BUFFER_WXH(w, h, shift) \ + static AOM_FORCE_INLINE void load_buffer_##w##x##h( \ + const int16_t *input, int32x4_t *in, int stride, int fliplr) { \ + assert(w >= 8); \ + if (fliplr) { \ + for (int i = 0; i < (h); ++i) { \ + for (int j = 0; j < (w) / 8; ++j) { \ + int16x8_t a = vld1q_s16(input + i * stride + j * 8); \ + a = vrev64q_s16(a); \ + int j2 = (w) / 8 - j - 1; \ + in[i + (h) * (2 * j2 + 0)] = \ + shift_left_long_s16(vget_high_s16(a), (shift)); \ + in[i + (h) * (2 * j2 + 1)] = \ + shift_left_long_s16(vget_low_s16(a), (shift)); \ + } \ + } \ + } else { \ + for (int i = 0; i < (h); ++i) { \ + for (int j = 0; j < (w) / 8; ++j) { \ + int16x8_t a = vld1q_s16(input + i * stride + j * 8); \ + in[i + (h) * (2 * j + 0)] = \ + shift_left_long_s16(vget_low_s16(a), (shift)); \ + in[i + (h) * (2 * j + 1)] = \ + shift_left_long_s16(vget_high_s16(a), (shift)); \ + } \ + } \ + } \ + } + +LOAD_BUFFER_4XH(4) +LOAD_BUFFER_4XH(8) +LOAD_BUFFER_4XH(16) +LOAD_BUFFER_4XH(32) +LOAD_BUFFER_WXH(8, 8, 2) +LOAD_BUFFER_WXH(16, 16, 2) +LOAD_BUFFER_WXH(32, 64, 0) +LOAD_BUFFER_WXH(64, 32, 2) +LOAD_BUFFER_WXH(64, 64, 0) + +#if !CONFIG_REALTIME_ONLY +LOAD_BUFFER_WXH(16, 64, 0) +LOAD_BUFFER_WXH(64, 16, 2) +#endif // !CONFIG_REALTIME_ONLY + +#define STORE_BUFFER_WXH(w, h) \ + static AOM_FORCE_INLINE void store_buffer_##w##x##h( \ + const int32x4_t *in, int32_t *out, int stride) { \ + for (int i = 0; i < (w); ++i) { \ + for (int j = 0; j < (h) / 4; ++j) { \ + vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \ + } \ + } \ + } + +STORE_BUFFER_WXH(4, 4) +STORE_BUFFER_WXH(8, 4) +STORE_BUFFER_WXH(8, 8) +STORE_BUFFER_WXH(16, 4) +STORE_BUFFER_WXH(16, 16) +STORE_BUFFER_WXH(32, 4) +STORE_BUFFER_WXH(32, 32) +STORE_BUFFER_WXH(64, 32) + +#if !CONFIG_REALTIME_ONLY +STORE_BUFFER_WXH(16, 32) +STORE_BUFFER_WXH(64, 16) +#endif // !CONFIG_REALTIME_ONLY + +static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in, + int32x4_t *out, int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]); + const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]); + + const int32x4_t a0 = vaddq_s32(in[0], in[3]); + const int32x4_t a1 = vsubq_s32(in[0], in[3]); + const int32x4_t a2 = vaddq_s32(in[1], in[2]); + const int32x4_t a3 = vsubq_s32(in[1], in[2]); + + const int32x4_t b0 = vmulq_s32(a0, cospi32); + const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1); + const int32x4_t b2 = vmulq_s32(a2, cospi32); + const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1); + + const int32x4_t c0 = vaddq_s32(b0, b2); + const int32x4_t c1 = vsubq_s32(b0, b2); + const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0); + const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0); + + const int32x4_t v_bit = vdupq_n_s32(-bit); + const int32x4_t d0 = vrshlq_s32(c0, v_bit); + const int32x4_t d1 = vrshlq_s32(c1, v_bit); + const int32x4_t d2 = vrshlq_s32(c2, v_bit); + const int32x4_t d3 = vrshlq_s32(c3, v_bit); + + out[0] = d0; + out[1] = d2; + out[2] = d1; + out[3] = d3; +} + +static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in, + int32x4_t *out, int bit) { + const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1); + + const int32x4_t a0 = vaddq_s32(in[0], in[1]); + const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0); + const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1); + const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0); + + const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1); + 
const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0); + const int32x4_t b2 = vsubq_s32(a0, in[3]); + + const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1); + const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1); + const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0); + + const int32x4_t d0 = vaddq_s32(c0, a3); + const int32x4_t d1 = vsubq_s32(c1, a3); + const int32x4_t d2 = vsubq_s32(c1, c0); + + const int32x4_t e0 = vaddq_s32(d2, a3); + + const int32x4_t v_bit = vdupq_n_s32(-bit); + out[0] = vrshlq_s32(d0, v_bit); + out[1] = vrshlq_s32(c2, v_bit); + out[2] = vrshlq_s32(d1, v_bit); + out[3] = vrshlq_s32(e0, v_bit); +} + +static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in, + int32x4_t *out, + int bit) { + (void)bit; + int32x4_t fact = vdupq_n_s32(NewSqrt2); + + for (int i = 0; i < 4; i++) { + const int32x4_t a_low = vmulq_s32(in[i], fact); + out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits); + } +} + +void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff, + int input_stride, TX_TYPE tx_type, int bd) { + (void)bd; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4); + + // Workspace for column/row-wise transforms. + int32x4_t buf[4]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case ADST_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case DCT_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case ADST_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 1); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 1); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 1); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, 
av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case IDTX: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case V_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case H_DCT: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case V_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case H_ADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case V_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 0); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + case H_FLIPADST: + load_buffer_4x4(input, buf, input_stride, 1); + highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + transpose_arrays_s32_4x4(buf, buf); + highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); + store_buffer_4x4(buf, coeff, /*stride=*/4); + break; + default: assert(0); + } +} + +// Butterfly pre-processing: +// e.g. n=4: +// out[0] = in[0] + in[3] +// out[1] = in[1] + in[2] +// out[2] = in[1] - in[2] +// out[3] = in[0] - in[3] + +static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input, + int32x4_t *output, int n) { + for (int i = 0; i < n / 2; ++i) { + output[i] = vaddq_s32(input[i], input[n - i - 1]); + } + for (int i = 0; i < n / 2; ++i) { + output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]); + } +} + +// Butterfly post-processing: +// e.g. 
n=8: +// out[0] = in0[0] + in1[3]; +// out[1] = in0[1] + in1[2]; +// out[2] = in0[1] - in1[2]; +// out[3] = in0[0] - in1[3]; +// out[4] = in0[7] - in1[4]; +// out[5] = in0[6] - in1[5]; +// out[6] = in0[6] + in1[5]; +// out[7] = in0[7] + in1[4]; + +static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0, + const int32x4_t *in1, + int32x4_t *output, int n) { + for (int i = 0; i < n / 4; ++i) { + output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]); + } + for (int i = 0; i < n / 4; ++i) { + output[(3 * n) / 4 + i] = + vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); + } +} + +static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in, + int32x4_t *out, int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + + // stage 1 + int32x4_t a[8]; + butterfly_dct_pre(in, a, 8); + + // stage 2 + int32x4_t b[8]; + butterfly_dct_pre(a, b, 4); + butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit); + + // stage 3 + int32x4_t c[8]; + butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit); + butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit); + butterfly_dct_post(a + 4, b + 4, c + 4, 4); + + // stage 4-5 + butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit); + butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit); + + out[0] = c[0]; + out[2] = c[2]; + out[4] = c[1]; + out[6] = c[3]; +} + +static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in, + int32x4_t *out, int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + + int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; + int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; + + // stage 0-1 + u0 = in[0]; + u1 = in[7]; + u2 = in[3]; + u3 = in[4]; + u4 = in[1]; + u5 = in[6]; + u6 = in[2]; + u7 = in[5]; + + // stage 2 + v0 = u0; + v1 = u1; + butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit); + v4 = u4; + v5 = u5; + butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit); + + // stage 3 + u0 = vaddq_s32(v0, v2); + u1 = vsubq_s32(v3, v1); + u2 = vsubq_s32(v0, v2); + u3 = vaddq_s32(v1, v3); + u4 = vsubq_s32(v6, v4); + u5 = vaddq_s32(v5, v7); + u6 = vaddq_s32(v4, v6); + u7 = vsubq_s32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit); + butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit); + + // stage 5 + u0 = vaddq_s32(v0, v4); + u1 = vaddq_s32(v1, v5); + u2 = vaddq_s32(v2, v6); + u3 = vsubq_s32(v7, v3); + u4 = vsubq_s32(v0, v4); + u5 = vsubq_s32(v1, v5); + u6 = vsubq_s32(v2, v6); + u7 = vaddq_s32(v3, v7); + + // stage 6 + butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit); + butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit); + butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit); + butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit); + + // stage 7 + out[0] = v1; + out[1] = v6; + out[2] = v3; + out[3] = v4; + out[4] = v5; + out[5] = v2; + out[6] = v7; + out[7] = v0; +} + +static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in, + int32x4_t *out, + int bit) { + (void)bit; + out[0] = vshlq_n_s32(in[0], 1); + out[1] = vshlq_n_s32(in[1], 1); + out[2] = vshlq_n_s32(in[2], 1); + out[3] = vshlq_n_s32(in[3], 1); + out[4] = vshlq_n_s32(in[4], 
1); + out[5] = vshlq_n_s32(in[5], 1); + out[6] = vshlq_n_s32(in[6], 1); + out[7] = vshlq_n_s32(in[7], 1); +} + +static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in, + int32x4_t *out, int bit, + int howmany) { + const int stride = 8; + int i = 0; + do { + highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in, + int32x4_t *out, int bit, + int howmany) { + const int stride = 8; + int i = 0; + do { + highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in, + int32x4_t *out, int bit, + int howmany) { + (void)bit; + const int stride = 8; + int i = 0; + do { + highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + + // Workspaces for column/row-wise transforms. + int32x4_t buf0[16], buf1[16]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case ADST_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case DCT_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case ADST_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, buf0, stride, 1); + highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, buf0, stride, 1); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, 
av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, buf0, stride, 1); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case IDTX: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case V_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case H_DCT: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case V_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case H_ADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case V_FLIPADST: + load_buffer_8x8(input, buf0, stride, 0); + highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + case H_FLIPADST: + load_buffer_8x8(input, buf0, stride, 1); + highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_8x8(buf0, buf1); + highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); + store_buffer_8x8(buf1, coeff, /*stride=*/8); + break; + default: assert(0); + } +} + +static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out, + int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + + int32x4_t u[16], v[16]; + + // stage 1 + butterfly_dct_pre(in, u, 16); + + // stage 2 + butterfly_dct_pre(u, v, 8); + v[8] = u[8]; + v[9] = u[9]; + butterfly_cospi32_0002_neon(cospi, u[13], u[10], 
&v[13], &v[10], v_bit); + butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + butterfly_dct_pre(v, u, 4); + u[4] = v[4]; + butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit); + u[7] = v[7]; + butterfly_dct_post(v + 8, v + 8, u + 8, 8); + + // stage 4 + butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit); + butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit); + butterfly_dct_post(u + 4, u + 4, v + 4, 4); + v[8] = u[8]; + butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit); + butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit); + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit); + butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit); + butterfly_dct_post(v + 8, v + 8, u + 8, 4); + butterfly_dct_post(v + 12, v + 12, u + 12, 4); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit); + butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit); + butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit); + butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit); + + out[0] = v[0]; + out[1] = v[8]; + out[2] = v[4]; + out[3] = v[12]; + out[4] = v[2]; + out[5] = v[10]; + out[6] = v[6]; + out[7] = v[14]; + out[8] = v[1]; + out[9] = v[9]; + out[10] = v[5]; + out[11] = v[13]; + out[12] = v[3]; + out[13] = v[11]; + out[14] = v[7]; + out[15] = v[15]; +} + +static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out, + int bit) { + const int32_t *const cospi = cospi_arr_s32(bit); + const int32x4_t v_bit = vdupq_n_s32(-bit); + + int32x4_t u[16], v[16]; + + // stage 0-1 + u[0] = in[0]; + u[1] = in[15]; + u[2] = in[7]; + u[3] = in[8]; + u[4] = in[3]; + u[5] = in[12]; + u[6] = in[4]; + u[7] = in[11]; + u[8] = in[1]; + u[9] = in[14]; + u[10] = in[6]; + u[11] = in[9]; + u[12] = in[2]; + u[13] = in[13]; + u[14] = in[5]; + u[15] = in[10]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit); + v[4] = u[4]; + v[5] = u[5]; + butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit); + v[8] = u[8]; + v[9] = u[9]; + butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit); + v[12] = u[12]; + v[13] = u[13]; + butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit); + + // stage 3 + u[0] = vaddq_s32(v[0], v[2]); + u[1] = vsubq_s32(v[3], v[1]); + u[2] = vsubq_s32(v[0], v[2]); + u[3] = vaddq_s32(v[1], v[3]); + u[4] = vsubq_s32(v[6], v[4]); + u[5] = vaddq_s32(v[5], v[7]); + u[6] = vaddq_s32(v[4], v[6]); + u[7] = vsubq_s32(v[5], v[7]); + u[8] = vsubq_s32(v[10], v[8]); + u[9] = vaddq_s32(v[9], v[11]); + u[10] = vaddq_s32(v[8], v[10]); + u[11] = vsubq_s32(v[9], v[11]); + u[12] = vaddq_s32(v[12], v[14]); + u[13] = vsubq_s32(v[15], v[13]); + u[14] = vsubq_s32(v[12], v[14]); + u[15] = vaddq_s32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit); + butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], 
v_bit); + butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit); + + // stage 5 + u[0] = vaddq_s32(v[0], v[4]); + u[1] = vaddq_s32(v[1], v[5]); + u[2] = vaddq_s32(v[2], v[6]); + u[3] = vsubq_s32(v[7], v[3]); + u[4] = vsubq_s32(v[0], v[4]); + u[5] = vsubq_s32(v[1], v[5]); + u[6] = vsubq_s32(v[2], v[6]); + u[7] = vaddq_s32(v[3], v[7]); + u[8] = vaddq_s32(v[8], v[12]); + u[9] = vaddq_s32(v[9], v[13]); + u[10] = vsubq_s32(v[14], v[10]); + u[11] = vaddq_s32(v[11], v[15]); + u[12] = vsubq_s32(v[8], v[12]); + u[13] = vsubq_s32(v[9], v[13]); + u[14] = vaddq_s32(v[10], v[14]); + u[15] = vsubq_s32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit); + butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit); + butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit); + butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit); + + // stage 7 + u[0] = vaddq_s32(v[0], v[8]); + u[1] = vaddq_s32(v[1], v[9]); + u[2] = vaddq_s32(v[2], v[10]); + u[3] = vaddq_s32(v[3], v[11]); + u[4] = vaddq_s32(v[4], v[12]); + u[5] = vaddq_s32(v[5], v[13]); + u[6] = vaddq_s32(v[6], v[14]); + u[7] = vsubq_s32(v[15], v[7]); + u[8] = vsubq_s32(v[0], v[8]); + u[9] = vsubq_s32(v[1], v[9]); + u[10] = vsubq_s32(v[2], v[10]); + u[11] = vsubq_s32(v[3], v[11]); + u[12] = vsubq_s32(v[4], v[12]); + u[13] = vsubq_s32(v[5], v[13]); + u[14] = vsubq_s32(v[6], v[14]); + u[15] = vaddq_s32(v[7], v[15]); + + // stage 8 + butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit); + butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit); + butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit); + butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit); + butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit); + butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit); + butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit); + butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit); + + // stage 9 + out[0] = v[1]; + out[1] = v[14]; + out[2] = v[3]; + out[3] = v[12]; + out[4] = v[5]; + out[5] = v[10]; + out[6] = v[7]; + out[7] = v[8]; + out[8] = v[9]; + out[9] = v[6]; + out[10] = v[11]; + out[11] = v[4]; + out[12] = v[13]; + out[13] = v[2]; + out[14] = v[15]; + out[15] = v[0]; +} + +static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out, + int bit) { + (void)bit; + const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2); + const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1)); + + for (int i = 0; i < 16; i++) { + int32x4_t a = vmulq_s32(in[i], fact); + a = vaddq_s32(a, offset); + out[i] = vshrq_n_s32(a, NewSqrt2Bits); + } +} + +static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, + const int howmany) { + const int stride = 16; + int i = 0; + do { + highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, + int howmany) { + const int stride = 16; + int i = 0; + do { + highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); +} + +static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out, + int bit, int howmany) { + const int stride = 16; + int i = 0; + do { + highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit); + } while (++i < howmany); 
+} + +void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + + // Workspaces for column/row-wise transforms. + int32x4_t buf0[64], buf1[64]; + + switch (tx_type) { + case DCT_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case ADST_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case DCT_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case ADST_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case FLIPADST_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case DCT_FLIPADST: + load_buffer_16x16(input, buf0, stride, 1); + highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case FLIPADST_FLIPADST: + load_buffer_16x16(input, buf0, stride, 1); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case ADST_FLIPADST: + load_buffer_16x16(input, buf0, stride, 1); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case FLIPADST_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case IDTX: + load_buffer_16x16(input, buf0, stride, 0); + 
highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case V_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case H_DCT: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case V_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case H_ADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case V_FLIPADST: + load_buffer_16x16(input, buf0, stride, 0); + highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + case H_FLIPADST: + load_buffer_16x16(input, buf0, stride, 1); + highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); + shift_right_2_round_s32_x4(buf0, buf0, 64); + transpose_arrays_s32_16x16(buf0, buf1); + highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); + store_buffer_16x16(buf1, coeff, /*stride=*/16); + break; + default: assert(0); + } +} + +typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out, + int stride, int bit, int lr_flip); +typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in, + int32x4_t *out, int stride, + int bit, int lr_flip, + int howmany, int hm_stride); + +typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out, + int bit, int stride); +typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in, + int32_t *out, int bit, + int howmany, int hm_stride, + int stride); + +// Construct component kernels that include the load_buffer and store_buffer +// stages to avoid the need to spill loaded data to the stack between these and +// the txfm kernel calls. +// The TRANSFORM_*_ONE cases are only ever called in situations where the +// howmany parameter would be one, so no need for the loop at all in these +// cases. 
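+// As an illustrative expansion of the macros defined below (names taken +// directly from this file), TRANSFORM_COL_ONE(fdct8, 8) produces: +// +//   static void highbd_fdct8_col_neon(const int16_t *input, int32x4_t *output, +//                                     int stride, int cos_bit, int lr_flip) { +//     int32x4_t buf0[8]; +//     load_buffer_4x8(input, buf0, stride, lr_flip); +//     highbd_fdct8_x4_neon(buf0, output, cos_bit); +//   } +// +// so the loaded rows stay live in registers between the load and the 1D +// transform call.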
+ +#define TRANSFORM_COL_ONE(name, n) \ + static void highbd_##name##_col_neon(const int16_t *input, \ + int32x4_t *output, int stride, \ + int cos_bit, int lr_flip) { \ + int32x4_t buf0[n]; \ + load_buffer_4x##n(input, buf0, stride, lr_flip); \ + highbd_##name##_x4_neon(buf0, output, cos_bit); \ + } + +#define TRANSFORM_COL_MANY(name, n) \ + static void highbd_##name##_col_many_neon( \ + const int16_t *input, int32x4_t *output, int stride, int cos_bit, \ + int lr_flip, int howmany, int hm_stride) { \ + int i = 0; \ + do { \ + int32x4_t buf0[n]; \ + load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip); \ + highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit); \ + } while (++i < howmany); \ + } + +#define TRANSFORM_ROW_ONE(name, n) \ + static void highbd_##name##_row_neon( \ + const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \ + int32x4_t buf0[n]; \ + highbd_##name##_x4_neon(input, buf0, cos_bit); \ + store_buffer_##n##x4(buf0, output, stride); \ + } + +#define TRANSFORM_ROW_RECT_ONE(name, n) \ + static void highbd_##name##_row_rect_neon( \ + const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \ + int32x4_t buf0[n]; \ + highbd_##name##_x4_neon(input, buf0, cos_bit); \ + round_rect_array_s32_neon(buf0, buf0, (n)); \ + store_buffer_##n##x4(buf0, output, stride); \ + } + +#define TRANSFORM_ROW_MANY(name, n) \ + static void highbd_##name##_row_many_neon( \ + const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \ + int hm_stride, int stride) { \ + int i = 0; \ + do { \ + int32x4_t buf0[n]; \ + highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \ + store_buffer_##n##x4(buf0, output + 4 * i, stride); \ + } while (++i < howmany); \ + } + +#define TRANSFORM_ROW_RECT_MANY(name, n) \ + static void highbd_##name##_row_rect_many_neon( \ + const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \ + int hm_stride, int stride) { \ + int i = 0; \ + do { \ + int32x4_t buf0[n]; \ + highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \ + round_rect_array_s32_neon(buf0, buf0, (n)); \ + store_buffer_##n##x4(buf0, output + 4 * i, stride); \ + } while (++i < howmany); \ + } + +TRANSFORM_COL_ONE(fdct8, 8) +TRANSFORM_COL_ONE(fadst8, 8) +TRANSFORM_COL_ONE(fidentity8, 8) + +TRANSFORM_COL_MANY(fdct4, 4) +TRANSFORM_COL_MANY(fdct8, 8) +TRANSFORM_COL_MANY(fdct16, 16) +TRANSFORM_COL_MANY(fadst4, 4) +TRANSFORM_COL_MANY(fadst8, 8) +TRANSFORM_COL_MANY(fadst16, 16) +TRANSFORM_COL_MANY(fidentity4, 4) +TRANSFORM_COL_MANY(fidentity8, 8) +TRANSFORM_COL_MANY(fidentity16, 16) + +TRANSFORM_ROW_ONE(fdct16, 16) +TRANSFORM_ROW_ONE(fadst16, 16) +TRANSFORM_ROW_ONE(fidentity16, 16) + +TRANSFORM_ROW_RECT_ONE(fdct8, 8) +TRANSFORM_ROW_RECT_ONE(fadst8, 8) +TRANSFORM_ROW_RECT_ONE(fidentity8, 8) + +#if !CONFIG_REALTIME_ONLY +TRANSFORM_ROW_MANY(fdct4, 4) +TRANSFORM_ROW_MANY(fdct8, 8) +TRANSFORM_ROW_MANY(fadst4, 4) +TRANSFORM_ROW_MANY(fadst8, 8) +TRANSFORM_ROW_MANY(fidentity4, 4) +TRANSFORM_ROW_MANY(fidentity8, 8) +#endif + +TRANSFORM_ROW_RECT_MANY(fdct4, 4) +TRANSFORM_ROW_RECT_MANY(fdct8, 8) +TRANSFORM_ROW_RECT_MANY(fdct16, 16) +TRANSFORM_ROW_RECT_MANY(fadst4, 4) +TRANSFORM_ROW_RECT_MANY(fadst8, 8) +TRANSFORM_ROW_RECT_MANY(fadst16, 16) +TRANSFORM_ROW_RECT_MANY(fidentity4, 4) +TRANSFORM_ROW_RECT_MANY(fidentity8, 8) +TRANSFORM_ROW_RECT_MANY(fidentity16, 16) + +static const fwd_transform_1d_col_many_neon + col_highbd_txfm8_xn_arr[TX_TYPES] = { + highbd_fdct8_col_many_neon, // DCT_DCT + highbd_fadst8_col_many_neon, // ADST_DCT + highbd_fdct8_col_many_neon, 
// DCT_ADST + highbd_fadst8_col_many_neon, // ADST_ADST + highbd_fadst8_col_many_neon, // FLIPADST_DCT + highbd_fdct8_col_many_neon, // DCT_FLIPADST + highbd_fadst8_col_many_neon, // FLIPADST_FLIPADST + highbd_fadst8_col_many_neon, // ADST_FLIPADST + highbd_fadst8_col_many_neon, // FLIPADST_ADST + highbd_fidentity8_col_many_neon, // IDTX + highbd_fdct8_col_many_neon, // V_DCT + highbd_fidentity8_col_many_neon, // H_DCT + highbd_fadst8_col_many_neon, // V_ADST + highbd_fidentity8_col_many_neon, // H_ADST + highbd_fadst8_col_many_neon, // V_FLIPADST + highbd_fidentity8_col_many_neon // H_FLIPADST + }; + +static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = { + highbd_fdct8_col_neon, // DCT_DCT + highbd_fadst8_col_neon, // ADST_DCT + highbd_fdct8_col_neon, // DCT_ADST + highbd_fadst8_col_neon, // ADST_ADST + highbd_fadst8_col_neon, // FLIPADST_DCT + highbd_fdct8_col_neon, // DCT_FLIPADST + highbd_fadst8_col_neon, // FLIPADST_FLIPADST + highbd_fadst8_col_neon, // ADST_FLIPADST + highbd_fadst8_col_neon, // FLIPADST_ADST + highbd_fidentity8_col_neon, // IDTX + highbd_fdct8_col_neon, // V_DCT + highbd_fidentity8_col_neon, // H_DCT + highbd_fadst8_col_neon, // V_ADST + highbd_fidentity8_col_neon, // H_ADST + highbd_fadst8_col_neon, // V_FLIPADST + highbd_fidentity8_col_neon // H_FLIPADST +}; + +static const fwd_transform_1d_col_many_neon + col_highbd_txfm16_xn_arr[TX_TYPES] = { + highbd_fdct16_col_many_neon, // DCT_DCT + highbd_fadst16_col_many_neon, // ADST_DCT + highbd_fdct16_col_many_neon, // DCT_ADST + highbd_fadst16_col_many_neon, // ADST_ADST + highbd_fadst16_col_many_neon, // FLIPADST_DCT + highbd_fdct16_col_many_neon, // DCT_FLIPADST + highbd_fadst16_col_many_neon, // FLIPADST_FLIPADST + highbd_fadst16_col_many_neon, // ADST_FLIPADST + highbd_fadst16_col_many_neon, // FLIPADST_ADST + highbd_fidentity16_col_many_neon, // IDTX + highbd_fdct16_col_many_neon, // V_DCT + highbd_fidentity16_col_many_neon, // H_DCT + highbd_fadst16_col_many_neon, // V_ADST + highbd_fidentity16_col_many_neon, // H_ADST + highbd_fadst16_col_many_neon, // V_FLIPADST + highbd_fidentity16_col_many_neon // H_FLIPADST + }; + +static const fwd_transform_1d_col_many_neon + col_highbd_txfm4_xn_arr[TX_TYPES] = { + highbd_fdct4_col_many_neon, // DCT_DCT + highbd_fadst4_col_many_neon, // ADST_DCT + highbd_fdct4_col_many_neon, // DCT_ADST + highbd_fadst4_col_many_neon, // ADST_ADST + highbd_fadst4_col_many_neon, // FLIPADST_DCT + highbd_fdct4_col_many_neon, // DCT_FLIPADST + highbd_fadst4_col_many_neon, // FLIPADST_FLIPADST + highbd_fadst4_col_many_neon, // ADST_FLIPADST + highbd_fadst4_col_many_neon, // FLIPADST_ADST + highbd_fidentity4_col_many_neon, // IDTX + highbd_fdct4_col_many_neon, // V_DCT + highbd_fidentity4_col_many_neon, // H_DCT + highbd_fadst4_col_many_neon, // V_ADST + highbd_fidentity4_col_many_neon, // H_ADST + highbd_fadst4_col_many_neon, // V_FLIPADST + highbd_fidentity4_col_many_neon // H_FLIPADST + }; + +static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = { + highbd_fdct16_row_neon, // DCT_DCT + highbd_fdct16_row_neon, // ADST_DCT + highbd_fadst16_row_neon, // DCT_ADST + highbd_fadst16_row_neon, // ADST_ADST + highbd_fdct16_row_neon, // FLIPADST_DCT + highbd_fadst16_row_neon, // DCT_FLIPADST + highbd_fadst16_row_neon, // FLIPADST_FLIPADST + highbd_fadst16_row_neon, // ADST_FLIPADST + highbd_fadst16_row_neon, // FLIPADST_ADST + highbd_fidentity16_row_neon, // IDTX + highbd_fidentity16_row_neon, // V_DCT + highbd_fdct16_row_neon, // H_DCT + 
highbd_fidentity16_row_neon, // V_ADST + highbd_fadst16_row_neon, // H_ADST + highbd_fidentity16_row_neon, // V_FLIPADST + highbd_fadst16_row_neon // H_FLIPADST +}; + +static const fwd_transform_1d_row_many_neon + row_rect_highbd_txfm16_xn_arr[TX_TYPES] = { + highbd_fdct16_row_rect_many_neon, // DCT_DCT + highbd_fdct16_row_rect_many_neon, // ADST_DCT + highbd_fadst16_row_rect_many_neon, // DCT_ADST + highbd_fadst16_row_rect_many_neon, // ADST_ADST + highbd_fdct16_row_rect_many_neon, // FLIPADST_DCT + highbd_fadst16_row_rect_many_neon, // DCT_FLIPADST + highbd_fadst16_row_rect_many_neon, // FLIPADST_FLIPADST + highbd_fadst16_row_rect_many_neon, // ADST_FLIPADST + highbd_fadst16_row_rect_many_neon, // FLIPADST_ADST + highbd_fidentity16_row_rect_many_neon, // IDTX + highbd_fidentity16_row_rect_many_neon, // V_DCT + highbd_fdct16_row_rect_many_neon, // H_DCT + highbd_fidentity16_row_rect_many_neon, // V_ADST + highbd_fadst16_row_rect_many_neon, // H_ADST + highbd_fidentity16_row_rect_many_neon, // V_FLIPADST + highbd_fadst16_row_rect_many_neon // H_FLIPADST + }; + +#if !CONFIG_REALTIME_ONLY +static const fwd_transform_1d_row_many_neon + row_highbd_txfm8_xn_arr[TX_TYPES] = { + highbd_fdct8_row_many_neon, // DCT_DCT + highbd_fdct8_row_many_neon, // ADST_DCT + highbd_fadst8_row_many_neon, // DCT_ADST + highbd_fadst8_row_many_neon, // ADST_ADST + highbd_fdct8_row_many_neon, // FLIPADST_DCT + highbd_fadst8_row_many_neon, // DCT_FLIPADST + highbd_fadst8_row_many_neon, // FLIPADST_FLIPADST + highbd_fadst8_row_many_neon, // ADST_FLIPADST + highbd_fadst8_row_many_neon, // FLIPADST_ADST + highbd_fidentity8_row_many_neon, // IDTX + highbd_fidentity8_row_many_neon, // V_DCT + highbd_fdct8_row_many_neon, // H_DCT + highbd_fidentity8_row_many_neon, // V_ADST + highbd_fadst8_row_many_neon, // H_ADST + highbd_fidentity8_row_many_neon, // V_FLIPADST + highbd_fadst8_row_many_neon // H_FLIPADST + }; +#endif + +static const fwd_transform_1d_row_many_neon + row_rect_highbd_txfm8_xn_arr[TX_TYPES] = { + highbd_fdct8_row_rect_many_neon, // DCT_DCT + highbd_fdct8_row_rect_many_neon, // ADST_DCT + highbd_fadst8_row_rect_many_neon, // DCT_ADST + highbd_fadst8_row_rect_many_neon, // ADST_ADST + highbd_fdct8_row_rect_many_neon, // FLIPADST_DCT + highbd_fadst8_row_rect_many_neon, // DCT_FLIPADST + highbd_fadst8_row_rect_many_neon, // FLIPADST_FLIPADST + highbd_fadst8_row_rect_many_neon, // ADST_FLIPADST + highbd_fadst8_row_rect_many_neon, // FLIPADST_ADST + highbd_fidentity8_row_rect_many_neon, // IDTX + highbd_fidentity8_row_rect_many_neon, // V_DCT + highbd_fdct8_row_rect_many_neon, // H_DCT + highbd_fidentity8_row_rect_many_neon, // V_ADST + highbd_fadst8_row_rect_many_neon, // H_ADST + highbd_fidentity8_row_rect_many_neon, // V_FLIPADST + highbd_fadst8_row_rect_many_neon // H_FLIPADST + }; + +static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = { + highbd_fdct8_row_rect_neon, // DCT_DCT + highbd_fdct8_row_rect_neon, // ADST_DCT + highbd_fadst8_row_rect_neon, // DCT_ADST + highbd_fadst8_row_rect_neon, // ADST_ADST + highbd_fdct8_row_rect_neon, // FLIPADST_DCT + highbd_fadst8_row_rect_neon, // DCT_FLIPADST + highbd_fadst8_row_rect_neon, // FLIPADST_FLIPADST + highbd_fadst8_row_rect_neon, // ADST_FLIPADST + highbd_fadst8_row_rect_neon, // FLIPADST_ADST + highbd_fidentity8_row_rect_neon, // IDTX + highbd_fidentity8_row_rect_neon, // V_DCT + highbd_fdct8_row_rect_neon, // H_DCT + highbd_fidentity8_row_rect_neon, // V_ADST + highbd_fadst8_row_rect_neon, // H_ADST + highbd_fidentity8_row_rect_neon, // 
V_FLIPADST + highbd_fadst8_row_rect_neon // H_FLIPADST +}; + +#if !CONFIG_REALTIME_ONLY +static const fwd_transform_1d_row_many_neon + row_highbd_txfm4_xn_arr[TX_TYPES] = { + highbd_fdct4_row_many_neon, // DCT_DCT + highbd_fdct4_row_many_neon, // ADST_DCT + highbd_fadst4_row_many_neon, // DCT_ADST + highbd_fadst4_row_many_neon, // ADST_ADST + highbd_fdct4_row_many_neon, // FLIPADST_DCT + highbd_fadst4_row_many_neon, // DCT_FLIPADST + highbd_fadst4_row_many_neon, // FLIPADST_FLIPADST + highbd_fadst4_row_many_neon, // ADST_FLIPADST + highbd_fadst4_row_many_neon, // FLIPADST_ADST + highbd_fidentity4_row_many_neon, // IDTX + highbd_fidentity4_row_many_neon, // V_DCT + highbd_fdct4_row_many_neon, // H_DCT + highbd_fidentity4_row_many_neon, // V_ADST + highbd_fadst4_row_many_neon, // H_ADST + highbd_fidentity4_row_many_neon, // V_FLIPADST + highbd_fadst4_row_many_neon // H_FLIPADST + }; +#endif + +static const fwd_transform_1d_row_many_neon + row_rect_highbd_txfm4_xn_arr[TX_TYPES] = { + highbd_fdct4_row_rect_many_neon, // DCT_DCT + highbd_fdct4_row_rect_many_neon, // ADST_DCT + highbd_fadst4_row_rect_many_neon, // DCT_ADST + highbd_fadst4_row_rect_many_neon, // ADST_ADST + highbd_fdct4_row_rect_many_neon, // FLIPADST_DCT + highbd_fadst4_row_rect_many_neon, // DCT_FLIPADST + highbd_fadst4_row_rect_many_neon, // FLIPADST_FLIPADST + highbd_fadst4_row_rect_many_neon, // ADST_FLIPADST + highbd_fadst4_row_rect_many_neon, // FLIPADST_ADST + highbd_fidentity4_row_rect_many_neon, // IDTX + highbd_fidentity4_row_rect_many_neon, // V_DCT + highbd_fdct4_row_rect_many_neon, // H_DCT + highbd_fidentity4_row_rect_many_neon, // V_ADST + highbd_fadst4_row_rect_many_neon, // H_ADST + highbd_fidentity4_row_rect_many_neon, // V_FLIPADST + highbd_fadst4_row_rect_many_neon // H_FLIPADST + }; + +static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { + const int32_t *const cospi = cospi_arr_s32(cos_bit); + const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit); + + // Workspaces for intermediate transform steps. 
+ int32x4_t buf0[32]; + int32x4_t buf1[32]; + + // stage 1 + butterfly_dct_pre(input, buf1, 32); + + // stage 2 + butterfly_dct_pre(buf1, buf0, 16); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20], + v_cos_bit); + butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21], + v_cos_bit); + butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22], + v_cos_bit); + butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23], + v_cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + butterfly_dct_pre(buf0, buf1, 8); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10], + v_cos_bit); + butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11], + v_cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16); + + // stage 4 + butterfly_dct_pre(buf1, buf0, 4); + buf0[4] = buf1[4]; + butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5], + v_cos_bit); + buf0[7] = buf1[7]; + butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18], + v_cos_bit); + butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19], + v_cos_bit); + butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27], + v_cos_bit); + butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26], + v_cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1], + v_cos_bit); + butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3], + v_cos_bit); + butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4); + buf1[8] = buf0[8]; + butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9], + v_cos_bit); + butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13], + v_cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8); + butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8); + + // stage 6 + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + + butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7], + v_cos_bit); + butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17], + v_cos_bit); + butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29], + v_cos_bit); + butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4); + butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4); + buf0[16] = buf1[16]; + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + + butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6], + v_cos_bit); + butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21], + v_cos_bit); + butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22], + v_cos_bit); + + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + 
buf1[7] = buf0[7]; + butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15], + v_cos_bit); + butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14], + v_cos_bit); + butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13], + v_cos_bit); + butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12], + v_cos_bit); + butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4); + butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4); + butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4); + butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4); + + // stage 8 + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31], + v_cos_bit); + butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30], + v_cos_bit); + butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29], + v_cos_bit); + butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28], + v_cos_bit); + butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27], + v_cos_bit); + butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26], + v_cos_bit); + butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25], + v_cos_bit); + butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24], + v_cos_bit); + + // stage 9 + output[0] = buf0[0]; + output[1] = buf0[16]; + output[2] = buf0[8]; + output[3] = buf0[24]; + output[4] = buf0[4]; + output[5] = buf0[20]; + output[6] = buf0[12]; + output[7] = buf0[28]; + output[8] = buf0[2]; + output[9] = buf0[18]; + output[10] = buf0[10]; + output[11] = buf0[26]; + output[12] = buf0[6]; + output[13] = buf0[22]; + output[14] = buf0[14]; + output[15] = buf0[30]; + output[16] = buf0[1]; + output[17] = buf0[17]; + output[18] = buf0[9]; + output[19] = buf0[25]; + output[20] = buf0[5]; + output[21] = buf0[21]; + output[22] = buf0[13]; + output[23] = buf0[29]; + output[24] = buf0[3]; + output[25] = buf0[19]; + output[26] = buf0[11]; + output[27] = buf0[27]; + output[28] = buf0[7]; + output[29] = buf0[23]; + output[30] = buf0[15]; + output[31] = buf0[31]; +} + +static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output, + int8_t cos_bit) { + const int32_t *const cospi = cospi_arr_s32(cos_bit); + const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit); + + // stage 1 + int32x4_t x1[64]; + butterfly_dct_pre(input, x1, 64); + + // stage 2 + int32x4_t x2[64]; + butterfly_dct_pre(x1, x2, 32); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit); + butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit); + butterfly_0112_neon(cospi, 
32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + int32x4_t x3[64]; + butterfly_dct_pre(x2, x3, 16); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit); + butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit); + butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit); + butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32); + + // stage 4 + int32x4_t x4[64]; + butterfly_dct_pre(x3, x4, 8); + x4[8] = x3[8]; + x4[9] = x3[9]; + butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit); + butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit); + butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit); + butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit); + butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit); + butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit); + butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit); + butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit); + butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + int32x4_t x5[64]; + butterfly_dct_pre(x4, x5, 4); + x5[4] = x4[4]; + butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit); + x5[7] = x4[7]; + butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8); + x5[16] = x4[16]; + x5[17] = x4[17]; + butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit); + butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit); + butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit); + butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16); + butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16); + + // stage 6 + int32x4_t x6[64]; + butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit); + butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit); + butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4); + x6[8] = x5[8]; + butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit); + butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8); + butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8); + x6[32] = x5[32]; + x6[33] = x5[33]; + butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], 
v_cos_bit); + butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit); + butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit); + butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit); + butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit); + butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit); + butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + int32x4_t x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit); + butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit); + butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4); + butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4); + x7[16] = x6[16]; + butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit); + butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit); + butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8); + butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8); + butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8); + butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8); + + // stage 8 + int32x4_t x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + + butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit); + butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit); + butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit); + butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit); + butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4); + butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4); + butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4); + butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4); + x8[32] = x7[32]; + butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit); + butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit); + butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit); + butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit); + butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + int32x4_t x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + 
x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit); + butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit); + butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit); + butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit); + butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit); + butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit); + butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit); + butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit); + butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4); + butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4); + butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4); + butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4); + butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4); + butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4); + butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4); + butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4); + + // stage 10 + int32x4_t x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit); + butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit); + butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit); + butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit); + butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit); + butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit); + butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit); + butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit); + butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit); + butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit); + butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit); + butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit); + butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit); + butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit); + butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit); + butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = 
x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} + +static void highbd_fidentity32_x4_neon(const int32x4_t *input, + int32x4_t *output, int cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; i++) { + output[i] = vshlq_n_s32(input[i], 2); + } +} + +TRANSFORM_COL_MANY(fdct32, 32) +TRANSFORM_COL_MANY(fidentity32, 32) + +static const fwd_transform_1d_col_many_neon + col_highbd_txfm32_x4_arr[TX_TYPES] = { + highbd_fdct32_col_many_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + highbd_fidentity32_col_many_neon, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST + }; + +TRANSFORM_ROW_MANY(fdct32, 32) +TRANSFORM_ROW_MANY(fidentity32, 32) + +static const fwd_transform_1d_row_many_neon + row_highbd_txfm32_x4_arr[TX_TYPES] = { + highbd_fdct32_row_many_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + highbd_fidentity32_row_many_neon, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST + }; + +TRANSFORM_ROW_RECT_MANY(fdct32, 32) +TRANSFORM_ROW_RECT_MANY(fidentity32, 32) + +static const fwd_transform_1d_row_many_neon + row_rect_highbd_txfm32_x4_arr[TX_TYPES] = { + highbd_fdct32_row_rect_many_neon, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + highbd_fidentity32_row_rect_many_neon, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST + }; + +void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm8_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm16_xn_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[2][1]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + 
ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + + // Column-wise transform. + int32x4_t buf0[32]; + if (lr_flip) { + col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4, + /*hm_stride=*/-8); + } else { + col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4, + /*hm_stride=*/8); + } + shift_right_2_round_s32_x4(buf0, buf0, 32); + + int32x4_t buf1[32]; + transpose_arrays_s32_16x8(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8); +} + +void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm16_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm8_xn_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[1][2]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + + // Column-wise transform. + int32x4_t buf0[32]; + if (lr_flip) { + col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2, + /*hm_stride=*/-16); + } else { + col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2, + /*hm_stride=*/16); + } + shift_right_2_round_s32_x4(buf0, buf0, 32); + + int32x4_t buf1[32]; + transpose_arrays_s32_8x16(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16); +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + int bitcol = av1_fwd_cos_bit_col[0][2]; + int bitrow = av1_fwd_cos_bit_row[0][2]; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm16_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_highbd_txfm4_xn_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + + // Column-wise transform. + int32x4_t buf0[16]; + if (lr_flip) { + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1, + /*hm_stride=*/0); + } else { + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1, + /*hm_stride=*/0); + } + shift_right_1_round_s32_x4(buf0, buf0, 16); + + int32x4_t buf1[16]; + transpose_arrays_s32_4x16(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16); +} +#endif + +void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + int bitcol = av1_fwd_cos_bit_col[2][0]; + int bitrow = av1_fwd_cos_bit_row[2][0]; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm4_xn_arr[tx_type]; + const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + + // Column-wise transform. + int32x4_t buf0[16]; + if (lr_flip) { + col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4, + /*hm_stride=*/-4); + } else { + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4, + /*hm_stride=*/4); + } + + shift_right_1_round_s32_x4(buf0, buf0, 16); + transpose_arrays_s32_4x16(buf0, buf0); + + // Row-wise transform. 
+ row_txfm(buf0, coeff, bitrow, /*stride=*/4); +} + +void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm32_x4_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm16_xn_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[2][3]; + int bitrow = av1_fwd_cos_bit_row[2][3]; + + // Column-wise transform. + int32x4_t buf0[128]; + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4, + /*hm_stride=*/32); + shift_right_4_round_s32_x4(buf0, buf0, 128); + + int32x4_t buf1[128]; + transpose_arrays_s32_16x32(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32); +} + +void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + int bitcol = av1_fwd_cos_bit_col[3][4]; + int bitrow = av1_fwd_cos_bit_row[3][4]; + + // Column-wise transform. + int32x4_t buf0[512]; + load_buffer_32x64(input, buf0, stride, 0); + for (int i = 0; i < 8; i++) { + highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol); + } + shift_right_2_round_s32_x4(buf0, buf0, 512); + + int32x4_t buf1[512]; + transpose_arrays_s32_32x64(buf0, buf1); + + // Row-wise transform. + for (int i = 0; i < 16; i++) { + highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow); + } + round_shift2_rect_array_s32_neon(buf1, buf1, 512); + store_buffer_32x32(buf1, coeff, /*stride=*/32); +} + +void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + int bitcol = av1_fwd_cos_bit_col[4][3]; + int bitrow = av1_fwd_cos_bit_row[4][3]; + + // Column-wise transform. + int32x4_t buf0[512]; + load_buffer_64x32(input, buf0, stride, 0); + for (int i = 0; i < 16; i++) { + highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol); + } + shift_right_4_round_s32_x4(buf0, buf0, 512); + + int32x4_t buf1[512]; + transpose_arrays_s32_64x32(buf0, buf1); + + // Row-wise transform. + for (int i = 0; i < 8; i++) { + highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow); + } + round_shift2_rect_array_s32_neon(buf1, buf1, 512); + store_buffer_64x32(buf1, coeff, /*stride=*/32); +} + +void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm16_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm32_x4_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[3][2]; + int bitrow = av1_fwd_cos_bit_row[3][2]; + + // Column-wise transform. + int32x4_t buf0[128]; + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8, + /*hm_stride=*/16); + shift_right_4_round_s32_x4(buf0, buf0, 128); + + int32x4_t buf1[128]; + transpose_arrays_s32_32x16(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16); +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm32_x4_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_highbd_txfm8_xn_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[1][3]; + int bitrow = av1_fwd_cos_bit_row[1][3]; + + // Column-wise transform. 
+ int32x4_t buf0[64]; + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2, + /*hm_stride=*/32); + shift_right_2_round_s32_x4(buf0, buf0, 64); + + int32x4_t buf1[64]; + transpose_arrays_s32_8x32(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32); +} + +void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm8_xn_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_highbd_txfm32_x4_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[3][1]; + int bitrow = av1_fwd_cos_bit_row[3][1]; + + // Column-wise transform. + int32x4_t buf0[64]; + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8, + /*hm_stride=*/8); + shift_right_2_round_s32_x4(buf0, buf0, 64); + + int32x4_t buf1[64]; + transpose_arrays_s32_32x8(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8); +} +#endif + +void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + int bitcol = av1_fwd_cos_bit_col[0][1]; + int bitrow = av1_fwd_cos_bit_row[0][1]; + const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_rect_highbd_txfm4_xn_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); + + // Column-wise transform. + int32x4_t buf0[8]; + col_txfm(input, buf0, stride, bitcol, lr_flip); + shift_right_1_round_s32_x4(buf0, buf0, 8); + + int32x4_t buf1[8]; + transpose_arrays_s32_4x8(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8); +} + +void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const int bitcol = av1_fwd_cos_bit_col[1][0]; + const int bitrow = av1_fwd_cos_bit_row[1][0]; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm4_xn_arr[tx_type]; + const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); + + // Column-wise transform. + int32x4_t buf0[8]; + if (lr_flip) { + col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2, + /*hm_stride=*/-4); + } else { + col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2, + /*hm_stride=*/4); + } + + shift_right_1_round_s32_x4(buf0, buf0, 8); + + int32x4_t buf1[8]; + transpose_arrays_s32_8x4(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, coeff, bitrow, /*stride=*/4); +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const int bitcol = av1_fwd_cos_bit_col[2][4]; + const int bitrow = av1_fwd_cos_bit_row[2][4]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 64); + + // Column-wise transform. 
+ int32x4_t buf0[256]; + load_buffer_16x64(input, buf0, stride, lr_flip); + for (int i = 0; i < 4; i++) { + highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol); + } + shift_right_2_round_s32_x4(buf0, buf0, 256); + + int32x4_t buf1[256]; + transpose_arrays_s32_16x64(buf0, buf1); + + // Row-wise transform. + highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8); + store_buffer_16x32(buf1, coeff, /*stride=*/32); +} + +void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + (void)bd; + const int bitcol = av1_fwd_cos_bit_col[4][2]; + const int bitrow = av1_fwd_cos_bit_row[4][2]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); + + // Column-wise transform. + int32x4_t buf0[256]; + load_buffer_64x16(input, buf0, stride, lr_flip); + highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16); + shift_right_4_round_s32_x4(buf0, buf0, 256); + + int32x4_t buf1[256]; + transpose_arrays_s32_64x16(buf0, buf1); + + // Row-wise transform. + for (int i = 0; i < 4; i++) { + highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow); + } + store_buffer_64x16(buf1, coeff, /*stride=*/16); + memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff)); +} +#endif + +void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const fwd_transform_1d_col_many_neon col_txfm = + col_highbd_txfm32_x4_arr[tx_type]; + const fwd_transform_1d_row_many_neon row_txfm = + row_highbd_txfm32_x4_arr[tx_type]; + + // Column-wise transform. + int32x4_t buf0[256]; + col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8, + /*hm_stride=*/32); + shift_right_4_round_s32_x4(buf0, buf0, 256); + + int32x4_t buf1[256]; + transpose_arrays_s32_32x32(buf0, buf1); + + // Row-wise transform. + row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32, + /*stride=*/32); +} + +void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + + // Column-wise transform. + int32x4_t buf0[1024]; + load_buffer_64x64(input, buf0, stride, 0); + for (int col = 0; col < 16; col++) { + highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13); + } + shift_right_2_round_s32_x4(buf0, buf0, 1024); + + int32x4_t buf1[1024]; + transpose_arrays_s32_64x64(buf0, buf1); + + // Row-wise transform. + for (int col = 0; col < 8; col++) { + highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10); + } + shift_right_2_round_s32_x4(buf1, buf1, 512); + store_buffer_64x32(buf1, output, /*stride=*/32); +} diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c new file mode 100644 index 0000000000..47b5f5cfb7 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c @@ -0,0 +1,1207 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> +#include <assert.h> +#include <stdint.h> + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/encoder/arm/neon/pickrst_neon.h" +#include "av1/encoder/pickrst.h" + +static INLINE void highbd_calc_proj_params_r0_r1_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t h11_lo = vdupq_n_s64(0); + int64x2_t h11_hi = vdupq_n_s64(0); + int64x2_t h01_lo = vdupq_n_s64(0); + int64x2_t h01_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + int64x2_t c1_lo = vdupq_n_s64(0); + int64x2_t c1_hi = vdupq_n_s64(0); + + do { + const uint16_t *src_ptr = src; + const uint16_t *dat_ptr = dat; + int32_t *flt0_ptr = flt0; + int32_t *flt1_ptr = flt1; + int w = width; + + do { + uint16x8_t s = vld1q_u16(src_ptr); + uint16x8_t d = vld1q_u16(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + int32x4_t f1_lo = vld1q_s32(flt1_ptr); + int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); + + int32x4_t u_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); + int32x4_t u_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); + int32x4_t s_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); + int32x4_t s_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); + s_lo = vsubq_s32(s_lo, u_lo); + s_hi = vsubq_s32(s_hi, u_hi); + + f0_lo = vsubq_s32(f0_lo, u_lo); + f0_hi = vsubq_s32(f0_hi, u_hi); + f1_lo = vsubq_s32(f1_lo, u_lo); + f1_hi = vsubq_s32(f1_hi, u_hi); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); + h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); + h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); + h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); + + h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); + h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); + h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); + h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); + c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); + c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); + c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + flt1_ptr += 8; + w -= 8; + } while (w != 0); + + src += src_stride; + dat += dat_stride; + flt0
+= flt0_stride; + flt1 += flt1_stride; + } while (--height != 0); + + H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; + H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; + H[1][0] = H[0][1]; + C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; + C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; +} + +static INLINE void highbd_calc_proj_params_r0_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + + do { + const uint16_t *src_ptr = src; + const uint16_t *dat_ptr = dat; + int32_t *flt0_ptr = flt0; + int w = width; + + do { + uint16x8_t s = vld1q_u16(src_ptr); + uint16x8_t d = vld1q_u16(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + + int32x4_t u_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS)); + int32x4_t u_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); + int32x4_t s_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); + int32x4_t s_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); + s_lo = vsubq_s32(s_lo, u_lo); + s_hi = vsubq_s32(s_hi, u_hi); + + f0_lo = vsubq_s32(f0_lo, u_lo); + f0_hi = vsubq_s32(f0_hi, u_hi); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + w -= 8; + } while (w != 0); + + src += src_stride; + dat += dat_stride; + flt0 += flt0_stride; + } while (--height != 0); + + H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; +} + +static INLINE void highbd_calc_proj_params_r1_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + + int64x2_t h11_lo = vdupq_n_s64(0); + int64x2_t h11_hi = vdupq_n_s64(0); + int64x2_t c1_lo = vdupq_n_s64(0); + int64x2_t c1_hi = vdupq_n_s64(0); + + do { + const uint16_t *src_ptr = src; + const uint16_t *dat_ptr = dat; + int32_t *flt1_ptr = flt1; + int w = width; + + do { + uint16x8_t s = vld1q_u16(src_ptr); + uint16x8_t d = vld1q_u16(dat_ptr); + int32x4_t f1_lo = vld1q_s32(flt1_ptr); + int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); + + int32x4_t u_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), 
SGRPROJ_RST_BITS)); + int32x4_t u_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS)); + int32x4_t s_lo = + vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS)); + int32x4_t s_hi = vreinterpretq_s32_u32( + vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS)); + s_lo = vsubq_s32(s_lo, u_lo); + s_hi = vsubq_s32(s_hi, u_hi); + + f1_lo = vsubq_s32(f1_lo, u_lo); + f1_hi = vsubq_s32(f1_hi, u_hi); + + h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); + h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); + h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); + h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); + + c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); + c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); + c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); + c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt1_ptr += 8; + w -= 8; + } while (w != 0); + + src += src_stride; + dat += dat_stride; + flt1 += flt1_stride; + } while (--height != 0); + + H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; + C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; +} + +// The function calls 3 subfunctions for the following cases : +// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements +// of C and H need to be computed. +// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. +void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +static INLINE int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b) } }; + return vreinterpretq_s16_u8(vqtbl2q_u8(table, idx)); +#else + uint8x8x4_t table = { { vreinterpret_u8_s16(vget_low_s16(a)), + vreinterpret_u8_s16(vget_high_s16(a)), + vreinterpret_u8_s16(vget_low_s16(b)), + vreinterpret_u8_s16(vget_high_s16(b)) } }; + return vreinterpretq_s16_u8(vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), + vtbl4_u8(table, vget_high_u8(idx)))); +#endif +} + +static INLINE int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c, + uint8x16_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x3_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b), + vreinterpretq_u8_s16(c) } }; + return vreinterpretq_s16_u8(vqtbl3q_u8(table, idx)); +#else + // This is a specific implementation working only for compute stats with + // wiener_win == 5. 
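+  // For that case the byte indices in the shuffle tables never reference the
+  // upper half of `c`, so two overlapping 24-byte tables (a_lo, a_hi, b_lo)
+  // and (b_lo, b_hi, c_lo) cover every lane; indices into the second table
+  // are rebased by 16 below.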
+ uint8x8x3_t table_lo = { { vreinterpret_u8_s16(vget_low_s16(a)), + vreinterpret_u8_s16(vget_high_s16(a)), + vreinterpret_u8_s16(vget_low_s16(b)) } }; + uint8x8x3_t table_hi = { { vreinterpret_u8_s16(vget_low_s16(b)), + vreinterpret_u8_s16(vget_high_s16(b)), + vreinterpret_u8_s16(vget_low_s16(c)) } }; + return vreinterpretq_s16_u8(vcombine_u8( + vtbl3_u8(table_lo, vget_low_u8(idx)), + vtbl3_u8(table_hi, vsub_u8(vget_high_u8(idx), vdup_n_u8(16))))); +#endif +} + +static INLINE int64_t div_shift_s64(int64_t x, int power) { + return (x < 0 ? x + (1ll << power) - 1 : x) >> power; +} + +// The M matrix is accumulated in a bitdepth-dependent number of steps to +// speed up the computation. This function computes the final M from the +// accumulated (src_s64) and the residual parts (src_s32). It also transposes +// the result as the output needs to be column-major. +static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int shift) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift); + } + } +} + +// The resulting H is a column-major matrix accumulated from the transposed +// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single +// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This +// function transforms back to the originally expected format (double +// transpose). The H matrix is accumulated in a bitdepth-dependent number of +// steps to speed up the computation. This function computes the final H from +// the accumulated (src_s64) and the residual parts (src_s32). The computed H is +// only an upper triangle matrix, this function also fills the lower triangle of +// the resulting matrix. +static INLINE void update_H(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int stride, int shift) { + // For a simplified theoretical 3x3 case where `wiener_win` is 3 and + // `wiener_win2` is 9, the M matrix is 3x3: + // 0, 3, 6 + // 1, 4, 7 + // 2, 5, 8 + // + // This is viewed as a vector to compute H (9x9) by vector outer product: + // 0, 3, 6, 1, 4, 7, 2, 5, 8 + // + // Double transpose and upper triangle remapping for 3x3 -> 9x9 case: + // 0, 3, 6, 1, 4, 7, 2, 5, 8, + // 3, 30, 33, 12, 31, 34, 21, 32, 35, + // 6, 33, 60, 15, 42, 61, 24, 51, 62, + // 1, 12, 15, 10, 13, 16, 11, 14, 17, + // 4, 31, 42, 13, 40, 43, 22, 41, 44, + // 7, 34, 61, 16, 43, 70, 25, 52, 71, + // 2, 21, 24, 11, 22, 25, 20, 23, 26, + // 5, 32, 51, 14, 41, 52, 23, 50, 53, + // 8, 35, 62, 17, 44, 71, 26, 53, 80, + const int wiener_win2 = wiener_win * wiener_win; + + // Loop through the indices according to the remapping above, along the + // columns: + // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ..., + // wiener_win - 1, wiener_win - 1 + wiener_win, ... + // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int i = 0; i < wiener_win; ++i) { + for (int j = i; j < wiener_win2; j += wiener_win) { + // These two inner loops are the same as the two outer loops, but running + // along rows instead of columns. For the 3x3 case `l` will be: + // 0, 3, 6, 1, 4, 7, 2, 5, 8. 
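+      // Note that dst is written strictly sequentially while tr_idx gathers
+      // from the upper triangle of the row-major accumulators.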
+ for (int k = 0; k < wiener_win; ++k) { + for (int l = k; l < wiener_win2; l += wiener_win) { + // The nominal double transpose indexing would be: + // int idx = stride * j + l; + // However we need the upper-right triangle, it is easy with some + // min/max operations. + int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l); + + // Resulting matrix is filled by combining the 64-bit and the residual + // 32-bit matrices together with scaling. + *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift); + } + } + } + } +} + +// Load 7x7 matrix into 7 128-bit vectors from consecutive rows, the last load +// address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src, + ptrdiff_t stride) { + dst[0] = vld1q_s16(src); + src += stride; + dst[1] = vld1q_s16(src); + src += stride; + dst[2] = vld1q_s16(src); + src += stride; + dst[3] = vld1q_s16(src); + src += stride; + dst[4] = vld1q_s16(src); + src += stride; + dst[5] = vld1q_s16(src); + src += stride; + dst[6] = vld1q_s16(src - 1); +} + +static INLINE void highbd_compute_stats_win7_neon( + const uint16_t *dgd, const uint16_t *src, int avg, int width, int height, + int dgd_stride, int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7 + // matrices. 
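+  // Each row of 16 byte indices gathers 8 int16 lanes from two adjacent dgd
+  // rows viewed as one 32-byte table. For example, the first row selects
+  // elements 0..6 of the first dgd row followed by element 0 of the second,
+  // i.e. the first 8 of the 49 consecutive `dgd - avg` values for the left
+  // pixel; rows 7..12 do the same for the right pixel, shifted by one column.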
+ // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7_highbd[192]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, + 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 10, 11, 12, 13, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, + 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats7_highbd + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats7_highbd + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats7_highbd + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats7_highbd + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats7_highbd + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats7_highbd + 80); + const uint8x16_t lut6 = vld1q_u8(shuffle_stats7_highbd + 96); + const uint8x16_t lut7 = vld1q_u8(shuffle_stats7_highbd + 112); + const uint8x16_t lut8 = vld1q_u8(shuffle_stats7_highbd + 128); + const uint8x16_t lut9 = vld1q_u8(shuffle_stats7_highbd + 144); + const uint8x16_t lut10 = vld1q_u8(shuffle_stats7_highbd + 160); + const uint8x16_t lut11 = vld1q_u8(shuffle_stats7_highbd + 176); + + // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results + // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can + // be as high as 32768/2048/128 for the compute stats. + const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1; + int acc_cnt = acc_cnt_max; + const int src_next = src_stride - width; + const int dgd_next = dgd_stride - width; + const int16x8_t avg_s16 = vdupq_n_s16(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + int16x8_t dgd_rows[7]; + load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6; + dgd += 2; + + dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16); + dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16); + dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16); + dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16); + dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16); + dgd_rows[5] = vsubq_s16(dgd_rows[5], avg_s16); + dgd_rows[6] = vsubq_s16(dgd_rows[6], avg_s16); + + // Re-arrange the combined 8x7 matrix to have the 2 whole 7x7 matrices (1 + // for each of the 2 pixels) separated into distinct int16x8_t[6] arrays. + // These arrays contain 48 elements of the 49 (7x7). Compute `dgd - avg` + // for both buffers. Each DGD_AVG buffer contains 49 consecutive elements. 
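+      // The 49th value of each window (position (6, 6)) does not fit into the
+      // six 8-lane vectors and is stored separately below via dgd_ptr.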
+ int16x8_t dgd_avg0[6]; + int16x8_t dgd_avg1[6]; + + dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut6); + dgd_avg0[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + dgd_avg1[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut7); + dgd_avg0[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + dgd_avg1[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut8); + dgd_avg0[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut3); + dgd_avg1[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut9); + dgd_avg0[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut4); + dgd_avg1[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut10); + dgd_avg0[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut5); + dgd_avg1[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut11); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]); + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]); + + // The remaining last (49th) elements of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + DGD_AVG1[48] = dgd_ptr[7] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 7 * 7. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3], + dgd_avg1[3]); + update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4], + dgd_avg1[4]); + update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5], + dgd_avg1[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 49 * 49. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. 
At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += + DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48]; + + // Accumulate into 64-bit after a bit depth dependent number of iterations + // to prevent overflow. + if (--acc_cnt == 0) { + acc_cnt = acc_cnt_max; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4); + + // Last element of the row is computed separately. + lh[48] += lh32[48]; + lh32[48] = 0; + + lh += WIENER_WIN2_ALIGN2; + lh32 += WIENER_WIN2_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + int16x8_t dgd_rows[7]; + load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6; + ++dgd; + + // Re-arrange the combined 8x7 matrix to have a whole 7x7 matrix tightly + // packed into a int16x8_t[6] array. This array contains 48 elements of + // the 49 (7x7). Compute `dgd - avg` for the whole buffer. The DGD_AVG + // buffer contains 49 consecutive elements. + int16x8_t dgd_avg0[6]; + + dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16); + dgd_avg0[1] = vsubq_s16(tbl2q(dgd_rows[1], dgd_rows[2], lut1), avg_s16); + dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[2], dgd_rows[3], lut2), avg_s16); + dgd_avg0[3] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut3), avg_s16); + dgd_avg0[4] = vsubq_s16(tbl2q(dgd_rows[4], dgd_rows[5], lut4), avg_s16); + dgd_avg0[5] = vsubq_s16(tbl2q(dgd_rows[5], dgd_rows[6], lut5), avg_s16); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + + // The remaining last (49th) element of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. 
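+      // In scalar terms the vector helpers below perform
+      //   M_s32[i] += DGD_AVG0[i] * src_avg0   for i = 0..47,
+      // with element 48 accumulated as a scalar afterwards.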
+ int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]); + update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]); + update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[48] += DGD_AVG0[48] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 49 * 49. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work column-major matrices, so we + // accumulate into a row-major matrix H_s32. At the end of the algorithm a + // double transpose transformation will convert H_s32 back to the expected + // output layout. + update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48); + + // The last element of the triangle of H_s32 matrix can be computed as + // scalar more efficiently. + H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + int bit_depth_shift = bit_depth - AOM_BITS_8; + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, bit_depth_shift); + + update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, bit_depth_shift); +} + +// Load 5x5 matrix into 5 128-bit vectors from consecutive rows, the last load +// address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_s16_6x5(int16x8_t dst[5], const int16_t *src, + ptrdiff_t stride) { + dst[0] = vld1q_s16(src); + src += stride; + dst[1] = vld1q_s16(src); + src += stride; + dst[2] = vld1q_s16(src); + src += stride; + dst[3] = vld1q_s16(src); + src += stride; + dst[4] = vld1q_s16(src - 3); +} + +static void highbd_compute_stats_win5_neon(const uint16_t *dgd, + const uint16_t *src, int avg, + int width, int height, + int dgd_stride, int src_stride, + int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, + H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, + H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x3 matrix with consecutive elements from 5x5 + // matrix. 
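+  // Byte indices 0..15 address the first source vector, 16..31 the second and
+  // 32..47 the third. The first three rows build the 24 packed values for the
+  // left pixel and the last three for the right pixel; the middle row of each
+  // set spans three dgd rows, which is why it needs the 3-table lookup tbl3q.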
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5_highbd[96]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21, + 6, 7, 8, 9, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 32, 33, + 2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 34, 35, + 4, 5, 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, + }; + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats5_highbd + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats5_highbd + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats5_highbd + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats5_highbd + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80); + + // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results + // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can + // be as high as 32768/2048/128 for the compute stats. + const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1; + int acc_cnt = acc_cnt_max; + const int src_next = src_stride - width; + const int dgd_next = dgd_stride - width; + const int16x8_t avg_s16 = vdupq_n_s16(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + int16x8_t dgd_rows[5]; + load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4; + dgd += 2; + + dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16); + dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16); + dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16); + dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16); + dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16); + + // Re-arrange the combined 6x5 matrix to have the 2 whole 5x5 matrices (1 + // for each of the 2 pixels) separated into distinct int16x8_t[3] arrays. + // These arrays contain 24 elements of the 25 (5x5). Compute `dgd - avg` + // for both buffers. Each DGD_AVG buffer contains 25 consecutive elements. + int16x8_t dgd_avg0[3]; + int16x8_t dgd_avg1[3]; + + dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut3); + dgd_avg0[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1); + dgd_avg1[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut4); + dgd_avg0[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut2); + dgd_avg1[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut5); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + + // The remaining last (25th) elements of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + DGD_AVG1[24] = dgd_ptr[5] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 5 * 5. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. 
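+      // In scalar terms the vector helpers below perform
+      //   M_s32[i] += DGD_AVG0[i] * src_avg0 + DGD_AVG1[i] * src_avg1
+      // for i = 0..23, with element 24 accumulated as a scalar afterwards.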
+ int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 25 * 25. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24]; + + // Accumulate into 64-bit after a bit depth dependent number of iterations + // to prevent overflow. + if (--acc_cnt == 0) { + acc_cnt = acc_cnt_max; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4); + + // Last element of the row is computed separately. + lh[24] += lh32[24]; + lh32[24] = 0; + + lh += WIENER_WIN2_REDUCED_ALIGN2; + lh32 += WIENER_WIN2_REDUCED_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + int16x8_t dgd_rows[5]; + load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4; + ++dgd; + + // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5 + // matrix tightly packed into a int16x8_t[3] array. This array contains + // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer. + // The DGD_AVG buffer contains 25 consecutive elements. 
+ int16x8_t dgd_avg0[3]; + + dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16); + dgd_avg0[1] = vsubq_s16( + tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1), avg_s16); + dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut2), avg_s16); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + + // The remaining last (25th) element of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + DGD_AVG1[24] = dgd_ptr[5] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[24] += DGD_AVG0[24] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 25 * 25. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. 
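+      // WIENER_WIN2_REDUCED_ALIGN2 is the padded row stride of H_s32, so the
+      // index below addresses the (24, 24) diagonal element directly.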
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + int bit_depth_shift = bit_depth - AOM_BITS_8; + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, bit_depth_shift); + + update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2, + bit_depth_shift); +} + +static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride, + int width, int height) { + assert(width > 0); + assert(height > 0); + + uint64x2_t sum_u64 = vdupq_n_u64(0); + uint64_t sum = 0; + + int h = height; + do { + uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int w = width; + const uint16_t *row = src; + while (w >= 32) { + uint16x8_t s0 = vld1q_u16(row + 0); + uint16x8_t s1 = vld1q_u16(row + 8); + uint16x8_t s2 = vld1q_u16(row + 16); + uint16x8_t s3 = vld1q_u16(row + 24); + + s0 = vaddq_u16(s0, s1); + s2 = vaddq_u16(s2, s3); + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + sum_u32[1] = vpadalq_u16(sum_u32[1], s2); + + row += 32; + w -= 32; + } + + if (w >= 16) { + uint16x8_t s0 = vld1q_u16(row + 0); + uint16x8_t s1 = vld1q_u16(row + 8); + + s0 = vaddq_u16(s0, s1); + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + + row += 16; + w -= 16; + } + + if (w >= 8) { + uint16x8_t s0 = vld1q_u16(row); + sum_u32[1] = vpadalq_u16(sum_u32[1], s0); + + row += 8; + w -= 8; + } + + if (w >= 4) { + uint16x8_t s0 = vcombine_u16(vld1_u16(row), vdup_n_u16(0)); + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + + row += 4; + w -= 4; + } + + while (w-- > 0) { + sum += *row++; + } + + sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0], sum_u32[1])); + + src += src_stride; + } while (--h != 0); + + return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width)); +} + +void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED); + + const int wiener_halfwin = wiener_win >> 1; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const int height = v_end - v_start; + const int width = h_end - h_start; + + const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride; + const uint16_t *src_start = src + h_start + v_start * src_stride; + + // The wiener window will slide along the dgd frame, centered on each pixel. + // For the top left pixel and all the pixels on the side of the frame this + // means half of the window will be outside of the frame. As such the actual + // buffer that we need to subtract the avg from will be 2 * wiener_halfwin + // wider and 2 * wiener_halfwin higher than the original dgd buffer. 
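+  // For example, with WIENER_WIN == 7 the half window is 3, so dgd_win below
+  // starts 3 rows above and 3 columns to the left of dgd_start.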
+ const int vert_offset = v_start - wiener_halfwin; + const int horiz_offset = h_start - wiener_halfwin; + const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; + + uint16_t avg = highbd_find_average_neon(dgd_start, dgd_stride, width, height); + + if (wiener_win == WIENER_WIN) { + highbd_compute_stats_win7_neon(dgd_win, src_start, avg, width, height, + dgd_stride, src_stride, M, H, bit_depth); + } else { + highbd_compute_stats_win5_neon(dgd_win, src_start, avg, width, height, + dgd_stride, src_stride, M, H, bit_depth); + } +} + +int64_t av1_highbd_pixel_proj_error_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t sse = 0; + int64x2_t sse_s64 = vdupq_n_s64(0); + + if (params->r[0] > 0 && params->r[1] > 0) { + int32x2_t xq_v = vld1_s32(xq); + int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4); + + do { + int j = 0; + int32x4_t sse_s32 = vdupq_n_s32(0); + + do { + const uint16x8_t d = vld1q_u16(&dat[j]); + const uint16x8_t s = vld1q_u16(&src[j]); + int32x4_t flt0_0 = vld1q_s32(&flt0[j]); + int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]); + int32x4_t flt1_0 = vld1q_s32(&flt1[j]); + int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]); + + int32x4_t d_s32_lo = vreinterpretq_s32_u32( + vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); + int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16( + vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); + + int32x4_t v0 = vsubq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), + d_s32_lo); + int32x4_t v1 = vsubq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), + d_s32_hi); + + v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0); + v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0); + v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1); + v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), + vreinterpretq_s16_u16(vsubq_u16(d, s))); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); + + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]); + v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4); + int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; + sse += ((int64_t)e * e); + } + + sse_s64 = vpadalq_s32(sse_s64, sse_s32); + + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } while (--height != 0); + } else if (params->r[0] > 0 || params->r[1] > 0) { + int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; + int32x4_t xq_v = vdupq_n_s32(xq_active); + + do { + int j = 0; + int32x4_t sse_s32 = vdupq_n_s32(0); + do { + const uint16x8_t d0 = vld1q_u16(&dat[j]); + const uint16x8_t s0 = vld1q_u16(&src[j]); + int32x4_t flt0_0 = vld1q_s32(&flt[j]); + int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]); + + uint16x8_t d_u16 = vshlq_n_u16(d0, 4); + int32x4_t sub0 = vreinterpretq_s32_u32( + vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16))); + int32x4_t sub1 = vreinterpretq_s32_u32( + vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16))); + + int32x4_t v0 = vmlaq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0, + xq_v); + int32x4_t v1 = vmlaq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1, + xq_v); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), + vreinterpretq_s16_u16(vsubq_u16(d0, s0))); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); + + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + v += xq_active * (int32_t)((uint32_t)flt[j] - (uint16_t)(dat[k] << 4)); + const int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; + sse += ((int64_t)e * e); + } + + sse_s64 = vpadalq_s32(sse_s64, sse_s32); + + dat += dat_stride; + flt += flt_stride; + src += src_stride; + } while (--height != 0); + } else { + do { + int j = 0; + + do { + const uint16x8_t d = vld1q_u16(&dat[j]); + const uint16x8_t s = vld1q_u16(&src[j]); + + uint16x8_t diff = vabdq_u16(d, s); + uint16x4_t diff_lo = vget_low_u16(diff); + uint16x4_t diff_hi = vget_high_u16(diff); + + uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo); + uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi); + + sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo)); + sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi)); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t e = dat[k] - src[k]; + sse += e * e; + } + + dat += dat_stride; + src += src_stride; + } while (--height != 0); + } + + sse += horizontal_add_s64x2(sse_s64); + return sse; +} diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c new file mode 100644 index 0000000000..4bf7ae6ce4 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/av1_rtcd.h" +#include "aom_dsp/arm/sum_neon.h" + +int64_t av1_highbd_block_error_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + const int shift = 2 * (bd - 8); + const int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int32x4_t c = vld1q_s32(coeff); + const int32x4_t d = vld1q_s32(dqcoeff); + + const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d)); + + err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff)); + err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff)); + + ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c)); + ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c)); + + coeff += 4; + dqcoeff += 4; + block_size -= 4; + } while (block_size != 0); + + *ssz = (horizontal_add_s64x2(ssz_s64) + rounding) >> shift; + return ((int64_t)horizontal_add_u64x2(err_u64) + rounding) >> shift; +} diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c new file mode 100644 index 0000000000..88e176f56c --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c @@ -0,0 +1,562 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" +#include "aom_dsp/mathutils.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +static INLINE void get_squared_error( + const uint16_t *frame1, const uint32_t stride1, const uint16_t *frame2, + const uint32_t stride2, const uint32_t block_width, + const uint32_t block_height, uint32_t *frame_sse, + const unsigned int dst_stride) { + uint32_t *dst = frame_sse; + + uint32_t i = 0; + do { + uint32_t j = 0; + do { + uint16x8_t s = vld1q_u16(frame1 + i * stride1 + j); + uint16x8_t r = vld1q_u16(frame2 + i * stride2 + j); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint32x4_t sse_lo = + vmull_u16(vget_low_u16(abs_diff), vget_low_u16(abs_diff)); + uint32x4_t sse_hi = + vmull_u16(vget_high_u16(abs_diff), vget_high_u16(abs_diff)); + + vst1q_u32(dst + j, sse_lo); + vst1q_u32(dst + j + 4, sse_hi); + + j += 8; + } while (j < block_width); + + dst += dst_stride; + i++; + } while (i < block_height); +} + +static uint32_t sum_kernel5x5_mask_single(const uint32x4_t vsrc[5][2], + const uint32x4_t mask_single) { + uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask_single); + vsums = vmlaq_u32(vsums, vsrc[1][0], mask_single); + vsums = vmlaq_u32(vsums, vsrc[2][0], mask_single); + vsums = vmlaq_u32(vsums, vsrc[3][0], mask_single); + vsums = vmlaq_u32(vsums, vsrc[4][0], mask_single); + return horizontal_add_u32x4(vsums); +} + +static uint32x4_t sum_kernel5x5_mask_double(const uint32x4_t vsrc[5][2], + const uint32x4_t mask1, + const uint32x4_t mask2) { + uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask1); + vsums = vmlaq_u32(vsums, vsrc[1][0], mask1); + vsums = vmlaq_u32(vsums, vsrc[2][0], mask1); + vsums = vmlaq_u32(vsums, vsrc[3][0], mask1); + vsums = vmlaq_u32(vsums, vsrc[4][0], mask1); + vsums = vmlaq_u32(vsums, vsrc[0][1], mask2); + vsums = vmlaq_u32(vsums, vsrc[1][1], mask2); + vsums = vmlaq_u32(vsums, vsrc[2][1], mask2); + vsums = vmlaq_u32(vsums, vsrc[3][1], mask2); + vsums = vmlaq_u32(vsums, vsrc[4][1], mask2); + return vsums; +} + +static void highbd_apply_temporal_filter( + const uint16_t *frame, const unsigned int stride, + const uint32_t block_width, const uint32_t block_height, + const int *subblock_mses, unsigned int *accumulator, uint16_t *count, + const uint32_t *frame_sse, const uint32_t frame_sse_stride, + const uint32_t *luma_sse_sum, const double inv_num_ref_pixels, + const double decay_factor, const double inv_factor, + const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl, + int bd) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_neon[BH][BW] = { 0 }; + const int half_window = TF_WINDOW_LENGTH >> 1; + + uint32x4_t vsrc[5][2] = { 0 }; + const uint32x4_t k0000 = vdupq_n_u32(0); + const uint32x4_t k1111 = vdupq_n_u32(1); + const uint32_t k3110_u32[4] = { 0, 1, 1, 3 }; + const uint32_t k2111_u32[4] = { 1, 1, 1, 2 }; + const uint32_t k1112_u32[4] = { 2, 1, 1, 1 }; + const uint32_t k0113_u32[4] = { 3, 1, 1, 0 }; + const uint32x4_t k3110 = vld1q_u32(k3110_u32); + const uint32x4_t k2111 = vld1q_u32(k2111_u32); + const uint32x4_t k1112 = vld1q_u32(k1112_u32); + const uint32x4_t k0113 = vld1q_u32(k0113_u32); + + uint32x4_t vmask1[4], vmask2[4]; + vmask1[0] = k1111; + vmask2[0] = vextq_u32(k1111, k0000, 3); + vmask1[1] = vextq_u32(k0000, k1111, 3); + vmask2[1] = vextq_u32(k1111, k0000, 2); + vmask1[2] = vextq_u32(k0000, k1111, 
2); + vmask2[2] = vextq_u32(k1111, k0000, 1); + vmask1[3] = vextq_u32(k0000, k1111, 1); + vmask2[3] = k1111; + + uint32_t row = 0; + do { + uint32_t col = 0; + const uint32_t *src = frame_sse + row * frame_sse_stride; + if (row == 0) { + vsrc[2][0] = vld1q_u32(src); + vsrc[3][0] = vld1q_u32(src + frame_sse_stride); + vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride); + + // First 2 rows of the 5x5 matrix are padded from the 1st. + vsrc[0][0] = vsrc[2][0]; + vsrc[1][0] = vsrc[2][0]; + } else if (row == 1) { + vsrc[1][0] = vld1q_u32(src - frame_sse_stride); + vsrc[2][0] = vld1q_u32(src); + vsrc[3][0] = vld1q_u32(src + frame_sse_stride); + vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride); + + // First row of the 5x5 matrix are padded from the 1st. + vsrc[0][0] = vsrc[1][0]; + } else if (row == block_height - 2) { + vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride); + vsrc[1][0] = vld1q_u32(src - frame_sse_stride); + vsrc[2][0] = vld1q_u32(src); + vsrc[3][0] = vld1q_u32(src + frame_sse_stride); + + // Last row of the 5x5 matrix are padded from the one before. + vsrc[4][0] = vsrc[3][0]; + } else if (row == block_height - 1) { + vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride); + vsrc[1][0] = vld1q_u32(src - frame_sse_stride); + vsrc[2][0] = vld1q_u32(src); + + // Last 2 rows of the 5x5 matrix are padded from the 3rd. + vsrc[3][0] = vsrc[2][0]; + vsrc[4][0] = vsrc[2][0]; + } else { + vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride); + vsrc[1][0] = vld1q_u32(src - frame_sse_stride); + vsrc[2][0] = vld1q_u32(src); + vsrc[3][0] = vld1q_u32(src + frame_sse_stride); + vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride); + } + + acc_5x5_neon[row][0] = sum_kernel5x5_mask_single(vsrc, k0113); + acc_5x5_neon[row][1] = sum_kernel5x5_mask_single(vsrc, k1112); + + col += 4; + src += 4; + // Traverse 4 columns at a time + do { + if (row == 0) { + vsrc[2][1] = vld1q_u32(src); + vsrc[3][1] = vld1q_u32(src + frame_sse_stride); + vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride); + + // First 2 rows of the 5x5 matrix are padded from the 1st. + vsrc[0][1] = vsrc[2][1]; + vsrc[1][1] = vsrc[2][1]; + } else if (row == 1) { + vsrc[1][1] = vld1q_u32(src - frame_sse_stride); + vsrc[2][1] = vld1q_u32(src); + vsrc[3][1] = vld1q_u32(src + frame_sse_stride); + vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride); + + // First row of the 5x5 matrix are padded from the 1st. + vsrc[0][1] = vsrc[1][1]; + } else if (row == block_height - 2) { + vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride); + vsrc[1][1] = vld1q_u32(src - frame_sse_stride); + vsrc[2][1] = vld1q_u32(src); + vsrc[3][1] = vld1q_u32(src + frame_sse_stride); + + // Last row of the 5x5 matrix are padded from the one before. + vsrc[4][1] = vsrc[3][1]; + } else if (row == block_height - 1) { + vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride); + vsrc[1][1] = vld1q_u32(src - frame_sse_stride); + vsrc[2][1] = vld1q_u32(src); + + // Last 2 rows of the 5x5 matrix are padded from the 3rd. 
+ vsrc[3][1] = vsrc[2][1]; + vsrc[4][1] = vsrc[2][1]; + } else { + vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride); + vsrc[1][1] = vld1q_u32(src - frame_sse_stride); + vsrc[2][1] = vld1q_u32(src); + vsrc[3][1] = vld1q_u32(src + frame_sse_stride); + vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride); + } + + uint32x4_t sums[4]; + sums[0] = sum_kernel5x5_mask_double(vsrc, vmask1[0], vmask2[0]); + sums[1] = sum_kernel5x5_mask_double(vsrc, vmask1[1], vmask2[1]); + sums[2] = sum_kernel5x5_mask_double(vsrc, vmask1[2], vmask2[2]); + sums[3] = sum_kernel5x5_mask_double(vsrc, vmask1[3], vmask2[3]); + vst1q_u32(&acc_5x5_neon[row][col - half_window], + horizontal_add_4d_u32x4(sums)); + + vsrc[0][0] = vsrc[0][1]; + vsrc[1][0] = vsrc[1][1]; + vsrc[2][0] = vsrc[2][1]; + vsrc[3][0] = vsrc[3][1]; + vsrc[4][0] = vsrc[4][1]; + + src += 4; + col += 4; + } while (col <= block_width - 4); + + acc_5x5_neon[row][col - half_window] = + sum_kernel5x5_mask_single(vsrc, k2111); + acc_5x5_neon[row][col - half_window + 1] = + sum_kernel5x5_mask_single(vsrc, k3110); + + row++; + } while (row < block_height); + + // Perform filtering. + if (tf_wgt_calc_lvl == 0) { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + // Scale down the difference for high bit depth input. + const uint32_t diff_sse = + (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } else { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + // Scale down the difference for high bit depth input. + const uint32_t diff_sse = + (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. 
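+        // Same weighting as the branch above, but the cheaper
+        // approx_exp()/iroundpf() pair replaces exp() when tf_wgt_calc_lvl is
+        // non-zero.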
+ double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } +} + +void av1_highbd_apply_temporal_filter_neon( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + assert(is_high_bitdepth); + + // Block information. + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + // Frame information. + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint32_t frame_sse[BW * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = + frame_to_filter->strides[plane == AOM_PLANE_Y ? 
0 : 1]; + const uint32_t frame_sse_stride = plane_w; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint16_t *ref = + CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + const int ww = frame_sse_stride + << ss_x_shift; // Width of Y-plane. + luma_sse_sum[i * BW + j] += frame_sse[yy * ww + xx]; + } + } + } + } + } + get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w, + plane_h, frame_sse, frame_sse_stride); + + highbd_apply_temporal_filter( + pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses, + accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride, + luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd); + + plane_offset += plane_h * plane_w; + } +} + +double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src, + int height, int width, + int stride, + int bitdepth, + int edge_thresh) { + uint16x8_t thresh = vdupq_n_u16(edge_thresh); + uint64x2_t acc = vdupq_n_u64(0); + // Count is in theory positive as it counts the number of times we're under + // the threshold, but it will be counted negatively in order to make best use + // of the vclt instruction, which sets every bit of a lane to 1 when the + // condition is true. + int32x4_t count = vdupq_n_s32(0); + int final_count = 0; + uint64_t final_acc = 0; + const uint16_t *src_start = src + stride + 1; + int h = 1; + + do { + int w = 1; + const uint16_t *src_ptr = src_start; + + while (w <= (width - 1) - 8) { + uint16x8_t mat[3][3]; + mat[0][0] = vld1q_u16(src_ptr - stride - 1); + mat[0][1] = vld1q_u16(src_ptr - stride); + mat[0][2] = vld1q_u16(src_ptr - stride + 1); + mat[1][0] = vld1q_u16(src_ptr - 1); + mat[1][1] = vld1q_u16(src_ptr); + mat[1][2] = vld1q_u16(src_ptr + 1); + mat[2][0] = vld1q_u16(src_ptr + stride - 1); + mat[2][1] = vld1q_u16(src_ptr + stride); + mat[2][2] = vld1q_u16(src_ptr + stride + 1); + + // Compute Sobel gradients. 
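The vector loops that follow evaluate the same per-pixel test as the scalar tail at the end of this function, eight and then four pixels at a time. As a reference point, the scalar computation is roughly the sketch below; the helper name is illustrative and the rounding mirrors ROUND_POWER_OF_TWO.

#include <stdint.h>
#include <stdlib.h>

// Reference-only: Sobel edge test followed by Laplacian accumulation for one
// pixel of the 3x3 neighbourhood m. Only smooth pixels (gradient under the
// threshold) contribute to the noise estimate.
static void noise_accumulate_pixel_sketch(int m[3][3], int bitdepth,
                                          int edge_thresh, int64_t *acc,
                                          int *count) {
  const int gx = (m[0][0] - m[0][2]) + (m[2][0] - m[2][2]) +
                 2 * (m[1][0] - m[1][2]);
  const int gy = (m[0][0] - m[2][0]) + (m[0][2] - m[2][2]) +
                 2 * (m[0][1] - m[2][1]);
  const int shift = bitdepth - 8;
  const int ga = (abs(gx) + abs(gy) + ((1 << shift) >> 1)) >> shift;
  if (ga < edge_thresh) {
    const int v = 4 * m[1][1] - 2 * (m[0][1] + m[2][1] + m[1][0] + m[1][2]) +
                  (m[0][0] + m[0][2] + m[2][0] + m[2][2]);
    *acc += (abs(v) + ((1 << shift) >> 1)) >> shift;
    *count += 1;
  }
}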
+ uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]); + uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]); + gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0])); + gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2])); + + uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]); + uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]); + gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1])); + gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1])); + + uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb); + ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth)); + + // Check which vector elements are under the threshold. The Laplacian is + // then unconditionnally computed and we accumulate zeros if we're not + // under the threshold. This is much faster than using an if statement. + uint16x8_t thresh_u16 = vcltq_u16(ga, thresh); + + uint16x8_t center = vshlq_n_u16(mat[1][1], 2); + + uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]); + uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]); + uint16x8_t adj = vaddq_u16(adj0, adj1); + adj = vaddq_u16(adj, adj); + + uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]); + uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]); + uint16x8_t diag = vaddq_u16(diag0, diag1); + + uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj); + v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16); + uint32x4_t v_u32 = vpaddlq_u16(v); + + acc = vpadalq_u32(acc, v_u32); + // Add -1 for each lane where the gradient is under the threshold. + count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16)); + + w += 8; + src_ptr += 8; + } + + if (w <= (width - 1) - 4) { + uint16x4_t mat[3][3]; + mat[0][0] = vld1_u16(src_ptr - stride - 1); + mat[0][1] = vld1_u16(src_ptr - stride); + mat[0][2] = vld1_u16(src_ptr - stride + 1); + mat[1][0] = vld1_u16(src_ptr - 1); + mat[1][1] = vld1_u16(src_ptr); + mat[1][2] = vld1_u16(src_ptr + 1); + mat[2][0] = vld1_u16(src_ptr + stride - 1); + mat[2][1] = vld1_u16(src_ptr + stride); + mat[2][2] = vld1_u16(src_ptr + stride + 1); + + // Compute Sobel gradients. + uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]); + uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]); + gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0])); + gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2])); + + uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]); + uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]); + gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1])); + gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1])); + + uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb); + ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth)); + + // Check which vector elements are under the threshold. The Laplacian is + // then unconditionnally computed and we accumulate zeros if we're not + // under the threshold. This is much faster than using an if statement. + uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh)); + + uint16x4_t center = vshl_n_u16(mat[1][1], 2); + + uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]); + uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]); + uint16x4_t adj = vadd_u16(adj0, adj1); + adj = vadd_u16(adj, adj); + + uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]); + uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]); + uint16x4_t diag = vadd_u16(diag0, diag1); + + uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj); + v = vand_u16(v, thresh_u16); + uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth))); + + acc = vpadalq_u32(acc, v_u32); + // Add -1 for each lane where the gradient is under the threshold. 
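As a self-contained aside, the compare-and-mask idiom used in this loop can be sketched as below (AArch64-only horizontal adds for brevity; the helper name is illustrative). The comparison produces all-ones lanes, so the same mask both filters the Laplacian values and, reinterpreted as signed, contributes -1 per matching lane.

#include <arm_neon.h>

// Sketch only: count lanes under the threshold and sum the surviving values
// without any branches.
static int count_and_sum_under_thresh_sketch(uint16x8_t values,
                                             uint16_t threshold,
                                             uint32_t *sum_under) {
  const uint16x8_t mask = vcltq_u16(values, vdupq_n_u16(threshold));
  const uint16x8_t kept = vandq_u16(values, mask);  // zero out edge lanes
  *sum_under = vaddvq_u32(vpaddlq_u16(kept));
  // Each all-ones lane reads as -1 in the signed view, so negating the
  // horizontal sum gives the number of lanes under the threshold.
  return -vaddvq_s16(vreinterpretq_s16_u16(mask));
}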
+      count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16));
+
+      w += 4;
+      src_ptr += 4;
+    }
+
+    while (w < width - 1) {
+      int mat[3][3];
+      mat[0][0] = *(src_ptr - stride - 1);
+      mat[0][1] = *(src_ptr - stride);
+      mat[0][2] = *(src_ptr - stride + 1);
+      mat[1][0] = *(src_ptr - 1);
+      mat[1][1] = *(src_ptr);
+      mat[1][2] = *(src_ptr + 1);
+      mat[2][0] = *(src_ptr + stride - 1);
+      mat[2][1] = *(src_ptr + stride);
+      mat[2][2] = *(src_ptr + stride + 1);
+
+      // Compute Sobel gradients.
+      const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8);
+
+      // Accumulate Laplacian.
+      const int is_under = ga < edge_thresh;
+      const int v = 4 * mat[1][1] -
+                    2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                    (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+      final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under;
+      final_count += is_under;
+
+      src_ptr++;
+      w++;
+    }
+    src_start += stride;
+  } while (++h < height - 1);
+
+  // We counted negatively, so subtract to get the final value.
+  final_count -= horizontal_add_s32x4(count);
+  final_acc += horizontal_add_u64x2(acc);
+  return (final_count < 16)
+             ? -1.0
+             : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
new file mode 100644
index 0000000000..6cf835a243
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
+  int32x4x2_t b0 =
+      vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+  int16x4x2_t c0 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+  int16x4x2_t c1 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+  out[0] = c0.val[0];
+  out[1] = c0.val[1];
+  out[2] = c1.val[0];
+  out[3] = c1.val[1];
+}
+
+void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride) {
+  // Load the 4x4 source in transposed form.
+  int16x4_t a1, b1, c1, d1, e;
+  a1 = vld1_s16(&input[0]);
+  b1 = vld1_s16(&input[1 * stride]);
+  c1 = vld1_s16(&input[2 * stride]);
+  d1 = vld1_s16(&input[3 * stride]);
+
+  // WHT.
+
+  // Row transforms.
+  a1 = vadd_s16(a1, b1);
+  d1 = vsub_s16(d1, c1);
+  e = vhsub_s16(a1, d1);
+  b1 = vsub_s16(e, b1);
+  c1 = vsub_s16(e, c1);
+  a1 = vsub_s16(a1, c1);
+  d1 = vadd_s16(d1, b1);
+
+  int16x8_t x[2];
+  x[0] = vcombine_s16(a1, c1);
+  x[1] = vcombine_s16(d1, b1);
+
+  int16x4_t s[4];
+  transpose4x4(x, s);
+
+  a1 = s[0];
+  b1 = s[1];
+  c1 = s[2];
+  d1 = s[3];
+
+  // Row transforms.
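The second pass below applies the same 4-point Walsh-Hadamard butterfly to the transposed data. In scalar form the butterfly is roughly the sketch below (reference only, not part of the patch); the outputs come out in the order a, c, d, b, matching the vcombine and store order used in this function.

#include <stdint.h>

// Reference-only scalar version of the 4-point WHT butterfly used twice in
// av1_fwht4x4_neon.
static void fwht4_1d_sketch(const int16_t in[4], int16_t out[4]) {
  int16_t a = in[0], b = in[1], c = in[2], d = in[3];
  a = (int16_t)(a + b);
  d = (int16_t)(d - c);
  const int16_t e = (int16_t)((a - d) >> 1);  // vhsub_s16 in the vector code
  b = (int16_t)(e - b);
  c = (int16_t)(e - c);
  a = (int16_t)(a - c);
  d = (int16_t)(d + b);
  out[0] = a;
  out[1] = c;
  out[2] = d;
  out[3] = b;
}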
+ a1 = vadd_s16(a1, b1); + d1 = vsub_s16(d1, c1); + e = vhsub_s16(a1, d1); + b1 = vsub_s16(e, b1); + c1 = vsub_s16(e, c1); + a1 = vsub_s16(a1, c1); + d1 = vadd_s16(d1, b1); + + vst1q_s32(&output[0], vshll_n_s16(a1, UNIT_QUANT_SHIFT)); + vst1q_s32(&output[4], vshll_n_s16(c1, UNIT_QUANT_SHIFT)); + vst1q_s32(&output[8], vshll_n_s16(d1, UNIT_QUANT_SHIFT)); + vst1q_s32(&output[12], vshll_n_s16(b1, UNIT_QUANT_SHIFT)); +} diff --git a/third_party/aom/av1/encoder/arm/neon/ml_neon.c b/third_party/aom/av1/encoder/arm/neon/ml_neon.c new file mode 100644 index 0000000000..be6ddfd763 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/ml_neon.c @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "av1/encoder/ml.h" + +static void nn_activate8(float32x4_t *out_h, float32x4_t *out_l, + const float32x4_t *zero) { + *out_h = vmaxq_f32(*out_h, *zero); + *out_l = vmaxq_f32(*out_l, *zero); +} + +static void nn_activate4(float32x4_t *x, const float32x4_t *zero) { + *x = vmaxq_f32(*x, *zero); +} + +#define CLAMP_0(x) (x = x > 0 ? x : 0) + +static void nn_propagate_8to1(int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes, bool output_layer) { + const float32x4_t zero = vdupq_n_f32(0); + float32x4_t vadd = zero; + float total = *layer_bias; + + for (int in = 0; in < num_inputs; in += 8) { + const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]); + const float32x4_t inputs_l = vld1q_f32(&inputs[in]); + + const float32x4_t weights_h = vld1q_f32(&weights[in + 4]); + const float32x4_t weights_l = vld1q_f32(&weights[in]); + + vadd = vmlaq_f32(vadd, inputs_h, weights_h); + vadd = vmlaq_f32(vadd, inputs_l, weights_l); + } +#if AOM_ARCH_AARCH64 + total += vaddvq_f32(vadd); +#else + float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd)); + vadd_lo = vpadd_f32(vadd_lo, vadd_lo); + total += vget_lane_f32(vadd_lo, 0); +#endif + + if (!output_layer) CLAMP_0(total); + *output_nodes = total; +} + +static void nn_propagate_xto1(int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes) { + float32x4_t vadd = vdupq_n_f32(0); + + float total = *layer_bias; + int j = num_inputs; + int in = 0; + while (j > 7) { + const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]); + const float32x4_t inputs_l = vld1q_f32(&inputs[in]); + + const float32x4_t weights_h = vld1q_f32(&weights[in + 4]); + const float32x4_t weights_l = vld1q_f32(&weights[in]); + + vadd = vmlaq_f32(vadd, inputs_h, weights_h); + vadd = vmlaq_f32(vadd, inputs_l, weights_l); + in += 8; + j -= 8; + } + +#if AOM_ARCH_AARCH64 + total += vaddvq_f32(vadd); + +#else + float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd)); + vadd_lo = vpadd_f32(vadd_lo, vadd_lo); + total += vget_lane_f32(vadd_lo, 0); +#endif + for (; in < num_inputs; in++) total += weights[in] * inputs[in]; + + *output_nodes = 
CLAMP_0(total); +} + +static void nn_propagate_xsto1(int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes) { + float total = *layer_bias; +#if AOM_ARCH_AARCH64 + const float32x4_t v_inputs = vld1q_f32(inputs); + const float32x4_t v_weights = vld1q_f32(weights); + const float32x4_t vadd = vmulq_f32(v_inputs, v_weights); + total += vaddvq_f32(vadd); + int in = 4; +#else + int in = 0; +#endif + for (; in < num_inputs; in++) total += weights[in] * inputs[in]; + + *output_nodes = CLAMP_0(total); +} + +static void nn_propagate_4to1(int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes, bool output_layer) { + const float32x4_t zero = vdupq_n_f32(0); + float32x4_t vadd = zero; + float total = *layer_bias; + + for (int in = 0; in < num_inputs; in += 4) { + const float32x4_t v_inputs = vld1q_f32(&inputs[in]); + const float32x4_t v_weights = vld1q_f32(&weights[in]); + vadd = vmlaq_f32(vadd, v_inputs, v_weights); + } + +#if AOM_ARCH_AARCH64 + total += vaddvq_f32(vadd); +#else + float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd)); + vadd_lo = vpadd_f32(vadd_lo, vadd_lo); + total += vget_lane_f32(vadd_lo, 0); +#endif + + if (!output_layer) CLAMP_0(total); + *output_nodes = total; +} + +static void nn_propagate_4to4(int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes, bool output_layer) { + float32x4_t outputs = vld1q_f32(layer_bias); + const float32x4_t zero = vdupq_n_f32(0); + + float32x4_t mul0[2] = { zero, zero }; + float32x4_t mul1[2] = { zero, zero }; + for (int in = 0; in < num_inputs; in += 4) { + const float32x4_t v_input = vld1q_f32(&inputs[in]); + + for (int i = 0; i < 2; i++) { + const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]); + mul0[i] = vmlaq_f32(mul0[i], weight0, v_input); + const float32x4_t weight1 = + vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]); + mul1[i] = vmlaq_f32(mul1[i], weight1, v_input); + } + } + for (int i = 0; i < 2; i++) +#if AOM_ARCH_AARCH64 + mul0[i] = vpaddq_f32(mul0[i], mul1[i]); + const float32x4_t hh = vpaddq_f32(mul0[0], mul0[1]); +#else + mul0[i] = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])), + vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i]))); + const float32x4_t hh = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])), + vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1]))); +#endif + + outputs = vaddq_f32(outputs, hh); + if (!output_layer) nn_activate4(&outputs, &zero); + vst1q_f32(output_nodes, outputs); +} + +static void nn_propagate_4to8(const int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes, bool output_layer) { + float32x4_t out_h = vld1q_f32(&layer_bias[4]); + float32x4_t out_l = vld1q_f32(layer_bias); + const float32x4_t zero = vdupq_n_f32(0); + float32x4_t mul0[4] = { zero, zero, zero, zero }; + float32x4_t mul1[4] = { zero, zero, zero, zero }; + + for (int in = 0; in < num_inputs; in += 4) { + const float32x4_t v_input = vld1q_f32(&inputs[in]); + for (int i = 0; i < 4; i++) { + const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]); + const float32x4_t weight1 = + vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]); + mul0[i] = vmlaq_f32(mul0[i], v_input, weight0); + mul1[i] = vmlaq_f32(mul1[i], v_input, weight1); + } + } + for 
(int i = 0; i < 4; i++) +#if AOM_ARCH_AARCH64 + mul0[i] = vpaddq_f32(mul0[i], mul1[i]); + const float32x4_t hh0 = vpaddq_f32(mul0[0], mul0[1]); + const float32x4_t hh1 = vpaddq_f32(mul0[2], mul0[3]); +#else + mul0[i] = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])), + vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i]))); + const float32x4_t hh0 = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])), + vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1]))); + const float32x4_t hh1 = + vcombine_f32(vpadd_f32(vget_low_f32(mul0[2]), vget_high_f32(mul0[2])), + vpadd_f32(vget_low_f32(mul0[3]), vget_high_f32(mul0[3]))); +#endif + + out_h = vaddq_f32(out_h, hh1); + out_l = vaddq_f32(out_l, hh0); + + if (!output_layer) nn_activate8(&out_h, &out_l, &zero); + vst1q_f32(&output_nodes[4], out_h); + vst1q_f32(output_nodes, out_l); +} + +static void nn_propagate_8to4(const int num_inputs, const float *const inputs, + const float *const weights, + const float *layer_bias, + float *const output_nodes, bool output_layer) { + float32x4_t outputs = vld1q_f32(layer_bias); + const float32x4_t zero = vdupq_n_f32(0); + float32x4_t add[4] = { zero, zero, zero, zero }; + for (int in = 0; in < num_inputs; in += 8) { + const float32x4_t inputs_l = vld1q_f32(&inputs[in]); + const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]); + + for (int i = 0; i < 4; i++) { + const float32x4_t weight_l = vld1q_f32(&weights[in + i * num_inputs]); + const float32x4_t weight_h = vld1q_f32(&weights[in + i * num_inputs + 4]); + add[i] = vmlaq_f32(add[i], inputs_l, weight_l); + add[i] = vmlaq_f32(add[i], inputs_h, weight_h); + } + } +#if AOM_ARCH_AARCH64 + const float32x4_t hadd_h = vpaddq_f32(add[2], add[3]); + const float32x4_t hadd_l = vpaddq_f32(add[0], add[1]); + const float32x4_t haddhadd = vpaddq_f32(hadd_l, hadd_h); +#else + const float32x4_t hadd_h = + vcombine_f32(vpadd_f32(vget_low_f32(add[2]), vget_high_f32(add[2])), + vpadd_f32(vget_low_f32(add[3]), vget_high_f32(add[3]))); + const float32x4_t hadd_l = + vcombine_f32(vpadd_f32(vget_low_f32(add[0]), vget_high_f32(add[0])), + vpadd_f32(vget_low_f32(add[1]), vget_high_f32(add[1]))); + const float32x4_t haddhadd = + vcombine_f32(vpadd_f32(vget_low_f32(hadd_l), vget_high_f32(hadd_l)), + vpadd_f32(vget_low_f32(hadd_h), vget_high_f32(hadd_h))); +#endif + + outputs = vaddq_f32(outputs, haddhadd); + if (!output_layer) nn_activate4(&outputs, &zero); + vst1q_f32(output_nodes, outputs); +} + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_neon(const float *input_nodes, + const NN_CONFIG *const nn_config, int reduce_prec, + float *const output) { + float buf[2][NN_MAX_NODES_PER_LAYER]; + int buf_index = 0; + int num_inputs = nn_config->num_inputs; + // Hidden layers, except the final iteration is the output layer. + for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + bool output_layer = (layer == nn_config->num_hidden_layers); + float *const output_nodes = output_layer ? output : buf[buf_index]; + const int num_outputs = output_layer ? 
nn_config->num_outputs + : nn_config->num_hidden_nodes[layer]; + + if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { + for (int out = 0; out < num_outputs; out += 8) { + nn_propagate_4to8(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + nn_propagate_8to4(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + nn_propagate_4to4(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs % 8 == 0) { + for (int out = 0; out < num_outputs; out++) { + nn_propagate_8to1(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs % 4 == 0) { + for (int out = 0; out < num_outputs; out++) { + nn_propagate_4to1(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out], output_layer); + } + } else if (num_inputs > 8) { + for (int out = 0; out < num_outputs; out++) { + nn_propagate_xto1(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out]); + } + } else if (num_inputs >= 4) { + for (int out = 0; out < num_outputs; out++) { + nn_propagate_xsto1(num_inputs, input_nodes, + &layer_weights[out * num_inputs], &layer_bias[out], + &output_nodes[out]); + } + } else { + for (int node = 0; node < num_outputs; ++node) { + float val = layer_bias[node]; + for (int i = 0; i < num_inputs; ++i) + val += layer_weights[node * num_inputs + i] * input_nodes[i]; + // ReLU as activation function. + val = val > 0.0f ? val : 0.0f; // Could use AOMMAX(). + output_nodes[node] = val; + } + } + input_nodes = output_nodes; + num_inputs = num_outputs; + buf_index = 1 - buf_index; + } + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); +} diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c new file mode 100644 index 0000000000..2e4761f9a4 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c @@ -0,0 +1,1217 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+int64_t av1_lowbd_pixel_proj_error_neon(
+    const uint8_t *src, int width, int height, int src_stride,
+    const uint8_t *dat, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int64_t sse = 0;
+  int64x2_t sse_s64 = vdupq_n_s64(0);
+
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    int32x2_t xq_v = vld1_s32(xq);
+    int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS);
+
+    do {
+      int j = 0;
+      int32x4_t sse_s32 = vdupq_n_s32(0);
+
+      do {
+        const uint8x8_t d = vld1_u8(&dat[j]);
+        const uint8x8_t s = vld1_u8(&src[j]);
+        int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
+        int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
+        int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
+        int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);
+
+        int32x4_t offset =
+            vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
+        int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0);
+        int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0);
+
+        v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
+        v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);
+
+        int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d));
+        v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16),
+                            vreinterpret_s16_s32(xq_sum_v), 0);
+        v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16),
+                            vreinterpret_s16_s32(xq_sum_v), 0);
+
+        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+        int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
+        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
+        int16x4_t e_lo = vget_low_s16(e);
+        int16x4_t e_hi = vget_high_s16(e);
+
+        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+        j += 8;
+      } while (j <= width - 8);
+
+      for (int k = j; k < width; ++k) {
+        int32_t u = (dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) +
+                    xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]);
+        int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+        sse += e * e;
+      }
+
+      sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+    } while (--height != 0);
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+    int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+    int flt_stride = (params->r[0] > 0) ?
flt0_stride : flt1_stride; + int32x2_t xq_v = vdup_n_s32(xq_active); + + do { + int32x4_t sse_s32 = vdupq_n_s32(0); + int j = 0; + + do { + const uint8x8_t d = vld1_u8(&dat[j]); + const uint8x8_t s = vld1_u8(&src[j]); + int32x4_t flt_0 = vld1q_s32(&flt[j]); + int32x4_t flt_1 = vld1q_s32(&flt[j + 4]); + int16x8_t d_s16 = + vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + + int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16)); + int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16)); + + int32x4_t offset = + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)); + int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0); + int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s)); + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); + + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t u = dat[k] << SGRPROJ_RST_BITS; + int32_t v = xq_active * (flt[k] - u); + int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) + + dat[k] - src[k]; + sse += e * e; + } + + sse_s64 = vpadalq_s32(sse_s64, sse_s32); + + dat += dat_stride; + src += src_stride; + flt += flt_stride; + } while (--height != 0); + } else { + uint32x4_t sse_s32 = vdupq_n_u32(0); + + do { + int j = 0; + + do { + const uint8x16_t d = vld1q_u8(&dat[j]); + const uint8x16_t s = vld1q_u8(&src[j]); + + uint8x16_t diff = vabdq_u8(d, s); + uint8x8_t diff_lo = vget_low_u8(diff); + uint8x8_t diff_hi = vget_high_u8(diff); + + sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_lo, diff_lo)); + sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_hi, diff_hi)); + + j += 16; + } while (j <= width - 16); + + for (int k = j; k < width; ++k) { + int32_t e = dat[k] - src[k]; + sse += e * e; + } + + dat += dat_stride; + src += src_stride; + } while (--height != 0); + + sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_s32)); + } + + sse += horizontal_add_s64x2(sse_s64); + return sse; +} + +// We can accumulate up to 65536 8-bit multiplication results in 32-bit. We are +// processing 2 pixels at a time, so the accumulator max can be as high as 32768 +// for the compute stats. +#define STAT_ACCUMULATOR_MAX 32768 + +static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { a, b } }; + return vqtbl2_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vtbl4_u8(table, idx); +#endif +} + +static INLINE uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { a, b } }; + return vqtbl2q_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), + vtbl4_u8(table, vget_high_u8(idx))); +#endif +} + +// The M matrix is accumulated in STAT_ACCUMULATOR_MAX steps to speed-up the +// computation. This function computes the final M from the accumulated +// (src_s64) and the residual parts (src_s32). It also transposes the result as +// the output needs to be column-major. 
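Before acc_transpose_M below, a scalar sketch of the two-level accumulation that it and update_H finalize. The helper is illustrative; the arithmetic restates the STAT_ACCUMULATOR_MAX reasoning above: every product of two (sample - avg) values fits in 16 bits of magnitude, so a 32-bit residual can absorb STAT_ACCUMULATOR_MAX iterations (two products each on the 2-pixel paths) before it has to be flushed into the 64-bit total.

#include <stdint.h>

// Illustrative only: accumulate 32-bit partial sums and periodically flush
// them into a 64-bit total so the 32-bit accumulators cannot overflow.
static void accumulate_with_flush_sketch(int64_t *total,
                                         const int32_t *products, int n,
                                         int flush_every) {
  int32_t residual = 0;
  int pending = 0;
  for (int i = 0; i < n; ++i) {
    residual += products[i];
    if (++pending == flush_every) {
      *total += residual;
      residual = 0;
      pending = 0;
    }
  }
  *total += residual;  // Final partial flush, as the transpose/update paths do.
}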
+static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int scale) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; + } + } +} + +// The resulting H is a column-major matrix accumulated from the transposed +// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single +// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This +// function transforms back to the originally expected format (double +// transpose). The H matrix is accumulated in STAT_ACCUMULATOR_MAX steps to +// speed-up the computation. This function computes the final H from the +// accumulated (src_s64) and the residual parts (src_s32). The computed H is +// only an upper triangle matrix, this function also fills the lower triangle of +// the resulting matrix. +static void update_H(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, int stride, + int scale) { + // For a simplified theoretical 3x3 case where `wiener_win` is 3 and + // `wiener_win2` is 9, the M matrix is 3x3: + // 0, 3, 6 + // 1, 4, 7 + // 2, 5, 8 + // + // This is viewed as a vector to compute H (9x9) by vector outer product: + // 0, 3, 6, 1, 4, 7, 2, 5, 8 + // + // Double transpose and upper triangle remapping for 3x3 -> 9x9 case: + // 0, 3, 6, 1, 4, 7, 2, 5, 8, + // 3, 30, 33, 12, 31, 34, 21, 32, 35, + // 6, 33, 60, 15, 42, 61, 24, 51, 62, + // 1, 12, 15, 10, 13, 16, 11, 14, 17, + // 4, 31, 42, 13, 40, 43, 22, 41, 44, + // 7, 34, 61, 16, 43, 70, 25, 52, 71, + // 2, 21, 24, 11, 22, 25, 20, 23, 26, + // 5, 32, 51, 14, 41, 52, 23, 50, 53, + // 8, 35, 62, 17, 44, 71, 26, 53, 80, + const int wiener_win2 = wiener_win * wiener_win; + + // Loop through the indices according to the remapping above, along the + // columns: + // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ..., + // wiener_win - 1, wiener_win - 1 + wiener_win, ... + // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int i = 0; i < wiener_win; ++i) { + for (int j = i; j < wiener_win2; j += wiener_win) { + // These two inner loops are the same as the two outer loops, but running + // along rows instead of columns. For the 3x3 case `l` will be: + // 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int k = 0; k < wiener_win; ++k) { + for (int l = k; l < wiener_win2; l += wiener_win) { + // The nominal double transpose indexing would be: + // int idx = stride * j + l; + // However we need the upper-triangle indices, it is easy with some + // min/max operations. + int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l); + + // Resulting matrix is filled by combining the 64-bit and the residual + // 32-bit matrices together with scaling. + *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; + } + } + } + } +} + +// Load 7x7 matrix into 3 and a half 128-bit vectors from consecutive rows, the +// last load address is offset to prevent out-of-bounds access. 
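One note on the loader that follows: the seventh row of the window only has seven valid bytes, so its 8-byte load is started one byte early and the shuffle tables defined below simply index that row with a one-lane offset. In scalar terms the bounds-safe trick is just the sketch below (illustrative only).

#include <stdint.h>
#include <string.h>

// Sketch: read the last 7-pixel row through an 8-byte copy that starts one
// byte early, so it never reads past the end of the buffer; row[0..6] land in
// dst[1..7] and the leading byte is ignored by the shuffle indices.
static void load_last_row_offset_sketch(uint8_t dst[8], const uint8_t *row) {
  memcpy(dst, row - 1, 8);
}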
+static INLINE void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src, + ptrdiff_t stride) { + dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[2] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0)); +} + +static INLINE void compute_stats_win7_neon(const uint8_t *dgd, + const uint8_t *src, int width, + int height, int dgd_stride, + int src_stride, int avg, int64_t *M, + int64_t *H, int downsample_factor) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7 + // matrices. + // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = { + 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, + 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, + 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, + 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, + 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, + 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80); + + int acc_cnt = STAT_ACCUMULATOR_MAX; + const int src_next = downsample_factor * src_stride - width; + const int dgd_next = downsample_factor * dgd_stride - width; + const uint8x8_t avg_u8 = vdup_n_u8(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + uint8x16_t dgd_rows[4]; + load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 6; + dgd += 2; + + // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7 + // matrices (1 for each of the 2 pixels) separated into distinct + // int16x8_t[6] arrays. These arrays contain 48 elements of the 49 (7x7). + // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49 + // consecutive elements. 
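The (dgd - avg) terms below are produced with a widening subtract against a vector of avg. A minimal sketch of that idiom: the subtraction wraps in the unsigned domain, and reinterpreting the result as signed 16-bit recovers the intended negative differences.

#include <arm_neon.h>

// Sketch: form (dgd - avg) as signed 16-bit lanes for eight pixels at once.
static int16x8_t dgd_minus_avg_sketch(uint8x8_t dgd, uint8_t avg) {
  return vreinterpretq_s16_u16(vsubl_u8(dgd, vdup_n_u8(avg)));
}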
+ int16x8_t dgd_avg0[6]; + int16x8_t dgd_avg1[6]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg1[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8)); + dgd_avg1[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8)); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4); + + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg0[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + dgd_avg1[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8)); + dgd_avg1[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8)); + + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]); + + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5); + + dgd_avg0[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg0[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + dgd_avg1[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8)); + dgd_avg1[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8)); + + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]); + vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]); + + // The remaining last (49th) elements of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + DGD_AVG1[48] = dgd_ptr[7] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 7 * 7. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3], + dgd_avg1[3]); + update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4], + dgd_avg1[4]); + update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5], + dgd_avg1[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. 
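In scalar terms, the six update_M_2pixels calls above, together with the element-48 line just below, amount to the sketch that follows. The helper body here is assumed (update_M_2pixels itself is declared in pickrst_neon.h and not shown in this file), but it matches what the surrounding comments describe.

#include <stdint.h>

// Sketch: row-major accumulation of the cross-correlation M for a pair of
// output pixels; M_s32 is only transposed into the column-major M at the end.
static void update_M_2pixels_sketch(int32_t *M_s32, int src_avg0, int src_avg1,
                                    const int16_t *dgd_avg0,
                                    const int16_t *dgd_avg1, int n) {
  for (int idx = 0; idx < n; ++idx) {
    M_s32[idx] += src_avg0 * dgd_avg0[idx] + src_avg1 * dgd_avg1[idx];
  }
}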
+ M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 49 * 49. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += + DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48]; + + // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent + // overflow. + if (--acc_cnt == 0) { + acc_cnt = STAT_ACCUMULATOR_MAX; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4); + + // Last element of the row is computed separately. + lh[48] += lh32[48]; + lh32[48] = 0; + + lh += WIENER_WIN2_ALIGN2; + lh32 += WIENER_WIN2_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + uint8x16_t dgd_rows[4]; + load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 6; + ++dgd; + + // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7 + // matrix tightly packed into a int16x8_t[6] array. This array contains + // 48 elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer. + // The DGD_AVG buffer contains 49 consecutive elements. + int16x8_t dgd_avg0[6]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg0[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + dgd_avg0[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg0[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + + // The remaining last (49th) element of `dgd - avg`. 
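Stepping back to the H update described above: conceptually, each update_H_* helper accumulates the outer product of the flattened window with itself, visiting only the upper triangle of the symmetric matrix. The sketch below is only that conceptual scalar form; the real helpers are declared in pickrst_neon.h, work on 4-element aligned tiles, and leave element 48 (or 24 for the 5x5 window) to the scalar line that follows each call. The odd-pixel path then continues below.

#include <stdint.h>

// Conceptual sketch of the per-pixel auto-covariance update: only j <= l is
// accumulated; update_H later mirrors the upper triangle into the lower one.
static void update_H_1pixel_sketch(int32_t *H_s32, const int16_t *dgd_avg,
                                   int stride, int n) {
  for (int j = 0; j < n; ++j) {
    for (int l = j; l < n; ++l) {
      H_s32[j * stride + l] += dgd_avg[j] * dgd_avg[l];
    }
  }
}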
+ DGD_AVG0[48] = dgd_ptr[6] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]); + update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]); + update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[48] += DGD_AVG0[48] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 49 * 49. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work column-major matrices, so we + // accumulate into a row-major matrix H_s32. At the end of the algorithm a + // double transpose transformation will convert H_s32 back to the expected + // output layout. + update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48); + + // The last element of the triangle of H_s32 matrix can be computed as + // scalar more efficiently. + H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor); + + update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor); +} + +// Load 5x5 matrix into 2 and a half 128-bit vectors from consecutive rows, the +// last load address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src, + ptrdiff_t stride) { + dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0)); +} + +static INLINE void compute_stats_win5_neon(const uint8_t *dgd, + const uint8_t *src, int width, + int height, int dgd_stride, + int src_stride, int avg, int64_t *M, + int64_t *H, int downsample_factor) { + // Matrix names are capitalized to help readability. 
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, + H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, + H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x3 matrix with consecutive elements from two 5x5 + // matrices. + // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = { + 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, + 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25, + 9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32); + + int acc_cnt = STAT_ACCUMULATOR_MAX; + const int src_next = downsample_factor * src_stride - width; + const int dgd_next = downsample_factor * dgd_stride - width; + const uint8x8_t avg_u8 = vdup_n_u8(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + uint8x16_t dgd_rows[3]; + load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 4; + dgd += 2; + + // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5 + // matrices (1 for each of the 2 pixels) separated into distinct + // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5). + // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25 + // consecutive elements. + int16x8_t dgd_avg0[3]; + int16x8_t dgd_avg1[3]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1); + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg1[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg1[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + dgd_avg1[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + + vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + + // The remaining last (25th) elements of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + DGD_AVG1[24] = dgd_ptr[5] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 5 * 5. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. 
As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 25 * 25. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24]; + + // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent + // overflow. + if (--acc_cnt == 0) { + acc_cnt = STAT_ACCUMULATOR_MAX; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4); + + // Last element of the row is computed separately. + lh[24] += lh32[24]; + lh32[24] = 0; + + lh += WIENER_WIN2_REDUCED_ALIGN2; + lh32 += WIENER_WIN2_REDUCED_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + uint8x16_t dgd_rows[3]; + load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 4; + ++dgd; + + // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5 + // matrix tightly packed into a int16x8_t[3] array. This array contains + // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer. + // The DGD_AVG buffer contains 25 consecutive elements. 
+ int16x8_t dgd_avg0[3]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2)); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8)); + + vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + + // The remaining last (25th) element of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[24] += DGD_AVG0[24] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 25 * 25. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work column-major matrices, so we + // accumulate into a row-major matrix H_s32. At the end of the algorithm a + // double transpose transformation will convert H_s32 back to the expected + // output layout. + update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor); + + update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2, + downsample_factor); +} + +static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride, + int width, int height) { + uint64_t sum = 0; + + if (width >= 16) { + int h = 0; + // We can accumulate up to 257 8-bit values in a 16-bit value, given + // that each 16-bit vector has 8 elements, that means we can process up to + // int(257*8/width) rows before we need to widen to 32-bit vector + // elements. + int h_overflow = 257 * 8 / width; + int h_limit = height > h_overflow ? 
h_overflow : height; + uint32x4_t avg_u32 = vdupq_n_u32(0); + do { + uint16x8_t avg_u16 = vdupq_n_u16(0); + do { + int j = width; + const uint8_t *src_ptr = src; + do { + uint8x16_t s = vld1q_u8(src_ptr); + avg_u16 = vpadalq_u8(avg_u16, s); + j -= 16; + src_ptr += 16; + } while (j >= 16); + if (j >= 8) { + uint8x8_t s = vld1_u8(src_ptr); + avg_u16 = vaddw_u8(avg_u16, s); + j -= 8; + src_ptr += 8; + } + // Scalar tail case. + while (j > 0) { + sum += src[width - j]; + j--; + } + src += src_stride; + } while (++h < h_limit); + avg_u32 = vpadalq_u16(avg_u32, avg_u16); + + h_limit += h_overflow; + h_limit = height > h_overflow ? h_overflow : height; + } while (h < height); + return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) / + (width * height)); + } + if (width >= 8) { + int h = 0; + // We can accumulate up to 257 8-bit values in a 16-bit value, given + // that each 16-bit vector has 4 elements, that means we can process up to + // int(257*4/width) rows before we need to widen to 32-bit vector + // elements. + int h_overflow = 257 * 4 / width; + int h_limit = height > h_overflow ? h_overflow : height; + uint32x2_t avg_u32 = vdup_n_u32(0); + do { + uint16x4_t avg_u16 = vdup_n_u16(0); + do { + int j = width; + const uint8_t *src_ptr = src; + uint8x8_t s = vld1_u8(src_ptr); + avg_u16 = vpadal_u8(avg_u16, s); + j -= 8; + src_ptr += 8; + // Scalar tail case. + while (j > 0) { + sum += src[width - j]; + j--; + } + src += src_stride; + } while (++h < h_limit); + avg_u32 = vpadal_u16(avg_u32, avg_u16); + + h_limit += h_overflow; + h_limit = height > h_overflow ? h_overflow : height; + } while (h < height); + return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) / + (width * height)); + } + int i = height; + do { + int j = 0; + do { + sum += src[j]; + } while (++j < width); + src += src_stride; + } while (--i != 0); + return (uint8_t)(sum / (width * height)); +} + +void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); + assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4); + (void)dgd_avg; + (void)src_avg; + + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = wiener_win >> 1; + const int width = h_end - h_start; + const int height = v_end - v_start; + + const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride; + const uint8_t *src_start = src + h_start + v_start * src_stride; + + // The wiener window will slide along the dgd frame, centered on each pixel. + // For the top left pixel and all the pixels on the side of the frame this + // means half of the window will be outside of the frame. As such the actual + // buffer that we need to subtract the avg from will be 2 * wiener_halfwin + // wider and 2 * wiener_halfwin higher than the original dgd buffer. + const int vert_offset = v_start - wiener_halfwin; + const int horiz_offset = h_start - wiener_halfwin; + const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; + + uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height); + + // Since the height is not necessarily a multiple of the downsample factor, + // the last line of src will be scaled according to how many rows remain. + int downsample_factor = + use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + + int downsampled_height = height / downsample_factor; + int downsample_remainder = height % downsample_factor; + + memset(M, 0, wiener_win2 * sizeof(*M)); + memset(H, 0, wiener_win2 * wiener_win2 * sizeof(*H)); + + // Calculate the M and H matrices for the normal and downsampled cases. + if (downsampled_height > 0) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_neon(dgd_win, src_start, width, downsampled_height, + dgd_stride, src_stride, avg, M, H, + downsample_factor); + } else { + compute_stats_win5_neon(dgd_win, src_start, width, downsampled_height, + dgd_stride, src_stride, avg, M, H, + downsample_factor); + } + } + + // Accumulate the remaining last rows in the downsampled case. + if (downsample_remainder > 0) { + int remainder_offset = height - downsample_remainder; + if (wiener_win == WIENER_WIN) { + compute_stats_win7_neon(dgd_win + remainder_offset * dgd_stride, + src_start + remainder_offset * src_stride, width, + 1, dgd_stride, src_stride, avg, M, H, + downsample_remainder); + } else { + compute_stats_win5_neon(dgd_win + remainder_offset * dgd_stride, + src_start + remainder_offset * src_stride, width, + 1, dgd_stride, src_stride, avg, M, H, + downsample_remainder); + } + } +} + +static INLINE void calc_proj_params_r0_r1_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t h11_lo = vdupq_n_s64(0); + int64x2_t h11_hi = vdupq_n_s64(0); + int64x2_t h01_lo = vdupq_n_s64(0); + int64x2_t h01_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + int64x2_t c1_lo = vdupq_n_s64(0); + int64x2_t c1_hi = vdupq_n_s64(0); + + do { + const uint8_t *src_ptr = src8; + const uint8_t *dat_ptr = dat8; + int32_t *flt0_ptr = flt0; + int32_t *flt1_ptr = flt1; + int w = width; + + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t d = vld1_u8(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + int32x4_t f1_lo = vld1q_s32(flt1_ptr); + int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); + + int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); + + int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); + int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); + f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); + f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); + f1_lo = vsubw_s16(f1_lo, vget_low_s16(u)); + f1_hi = vsubw_s16(f1_hi, vget_high_s16(u)); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); + h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); + h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); + h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); + + h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo)); + h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo)); + h01_hi = 
vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi)); + h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); + c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); + c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); + c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + flt1_ptr += 8; + w -= 8; + } while (w != 0); + + src8 += src_stride; + dat8 += dat_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } while (--height != 0); + + H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size; + H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; + H[1][0] = H[0][1]; + C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; + C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; +} + +static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + + int64x2_t h00_lo = vdupq_n_s64(0); + int64x2_t h00_hi = vdupq_n_s64(0); + int64x2_t c0_lo = vdupq_n_s64(0); + int64x2_t c0_hi = vdupq_n_s64(0); + + do { + const uint8_t *src_ptr = src8; + const uint8_t *dat_ptr = dat8; + int32_t *flt0_ptr = flt0; + int w = width; + + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t d = vld1_u8(dat_ptr); + int32x4_t f0_lo = vld1q_s32(flt0_ptr); + int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4); + + int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); + + int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); + int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); + f0_lo = vsubw_s16(f0_lo, vget_low_s16(u)); + f0_hi = vsubw_s16(f0_hi, vget_high_s16(u)); + + h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo)); + h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo)); + h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi)); + h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi)); + + c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo)); + c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo)); + c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi)); + c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt0_ptr += 8; + w -= 8; + } while (w != 0); + + src8 += src_stride; + dat8 += dat_stride; + flt0 += flt0_stride; + } while (--height != 0); + + H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size; + C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size; +} + +static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + assert(width % 8 == 0); + const int size = width * height; + + int64x2_t h11_lo = 
vdupq_n_s64(0); + int64x2_t h11_hi = vdupq_n_s64(0); + int64x2_t c1_lo = vdupq_n_s64(0); + int64x2_t c1_hi = vdupq_n_s64(0); + + do { + const uint8_t *src_ptr = src8; + const uint8_t *dat_ptr = dat8; + int32_t *flt1_ptr = flt1; + int w = width; + + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t d = vld1_u8(dat_ptr); + int32x4_t f1_lo = vld1q_s32(flt1_ptr); + int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4); + + int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS)); + + int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u)); + int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u)); + f1_lo = vsubw_s16(f1_lo, vget_low_s16(u)); + f1_hi = vsubw_s16(f1_hi, vget_high_s16(u)); + + h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo)); + h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo)); + h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi)); + h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi)); + + c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo)); + c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo)); + c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi)); + c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi)); + + src_ptr += 8; + dat_ptr += 8; + flt1_ptr += 8; + w -= 8; + } while (w != 0); + + src8 += src_stride; + dat8 += dat_stride; + flt1 += flt1_stride; + } while (--height != 0); + + H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size; + C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size; +} + +// The function calls 3 subfunctions for the following cases : +// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements +// of C and H need to be computed. +// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. +void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], + int64_t C[2], const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride, + flt1, flt1_stride, H, C); + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h new file mode 100644 index 0000000000..7b72dca34d --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ +#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ + +#include + +#include "av1/common/restoration.h" + +// Aligned sizes for Wiener filters. +#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2) +#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3) +#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED)) +#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2) +#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3) + +// Compute 8 values of M (cross correlation) for a single source pixel and +// accumulate. +static INLINE void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg, + int16x8_t dgd_avg) { + int32x4_t lo = vld1q_s32(M_s32 + 0); + int32x4_t hi = vld1q_s32(M_s32 + 4); + + lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg); + hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg); + + vst1q_s32(M_s32 + 0, lo); + vst1q_s32(M_s32 + 4, hi); +} + +// Compute 8 values of M (cross correlation) for two source pixels and +// accumulate. +static INLINE void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0, + int16x4_t src_avg1, int16x8_t dgd_avg0, + int16x8_t dgd_avg1) { + int32x4_t lo = vld1q_s32(M_s32 + 0); + int32x4_t hi = vld1q_s32(M_s32 + 4); + + lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0); + hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0); + lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1); + hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1); + + vst1q_s32(M_s32 + 0, lo); + vst1q_s32(M_s32 + 4, hi); +} + +static INLINE void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg, + int width, int height) { + for (int i = 0; i < height; i += 4) { + int16x4_t di = vld1_s16(dgd_avg + i); + + for (int j = i; j < width; j += 4) { + int16x4_t dj = vld1_s16(dgd_avg + j); + int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j); + + h0 = vmlal_lane_s16(h0, dj, di, 0); + h1 = vmlal_lane_s16(h1, dj, di, 1); + h2 = vmlal_lane_s16(h2, dj, di, 2); + h3 = vmlal_lane_s16(h3, dj, di, 3); + + vst1q_s32(H_s32 + 0 * width + j, h0); + vst1q_s32(H_s32 + 1 * width + j, h1); + vst1q_s32(H_s32 + 2 * width + j, h2); + vst1q_s32(H_s32 + 3 * width + j, h3); + } + H_s32 += 4 * width; + } +} + +static INLINE void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, + const int16_t *dgd_avg1) { + for (int i = 0; i < 24; i += 4) { + int16x4_t di0 = vld1_s16(dgd_avg0 + i); + int16x4_t di1 = vld1_s16(dgd_avg1 + i); + + for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) { + int16x4_t dj0 = vld1_s16(dgd_avg0 + j); + int16x4_t dj1 = vld1_s16(dgd_avg1 + j); + int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j); + + h0 = vmlal_lane_s16(h0, dj0, di0, 0); + h0 = vmlal_lane_s16(h0, dj1, di1, 0); + h1 = vmlal_lane_s16(h1, dj0, di0, 1); + h1 = vmlal_lane_s16(h1, dj1, di1, 1); + h2 = vmlal_lane_s16(h2, dj0, di0, 2); + h2 = vmlal_lane_s16(h2, dj1, di1, 2); + h3 = vmlal_lane_s16(h3, dj0, di0, 3); + h3 = 
vmlal_lane_s16(h3, dj1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3); + } + H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2; + } +} + +static INLINE void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, + const int16_t *dgd_avg1) { + for (int i = 0; i < 48; i += 4) { + int16x4_t di0 = vld1_s16(dgd_avg0 + i); + int16x4_t di1 = vld1_s16(dgd_avg1 + i); + + int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i); + + h0 = vmlal_lane_s16(h0, di0, di0, 0); + h0 = vmlal_lane_s16(h0, di1, di1, 0); + h1 = vmlal_lane_s16(h1, di0, di0, 1); + h1 = vmlal_lane_s16(h1, di1, di1, 1); + h2 = vmlal_lane_s16(h2, di0, di0, 2); + h2 = vmlal_lane_s16(h2, di1, di1, 2); + h3 = vmlal_lane_s16(h3, di0, di0, 3); + h3 = vmlal_lane_s16(h3, di1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3); + + for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) { + int16x4_t dj0 = vld1_s16(dgd_avg0 + j); + int16x4_t dj1 = vld1_s16(dgd_avg1 + j); + h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j); + h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j); + h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j); + h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j); + + h0 = vmlal_lane_s16(h0, dj0, di0, 0); + h0 = vmlal_lane_s16(h0, dj1, di1, 0); + h1 = vmlal_lane_s16(h1, dj0, di0, 1); + h1 = vmlal_lane_s16(h1, dj1, di1, 1); + h2 = vmlal_lane_s16(h2, dj0, di0, 2); + h2 = vmlal_lane_s16(h2, dj1, di1, 2); + h3 = vmlal_lane_s16(h3, dj0, di0, 3); + h3 = vmlal_lane_s16(h3, dj1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3); + } + H_s32 += 4 * WIENER_WIN2_ALIGN2; + } +} + +// Widen 32-bit src data and accumulate into 64-bit dst. Clear src data. +static INLINE void accumulate_and_clear(int64_t *dst, int32_t *src, + int length) { + do { + int32x4_t s32 = vld1q_s32(src); + vst1q_s32(src, vdupq_n_s32(0)); + src += 4; + + int64x2_t d_lo = vld1q_s64(dst + 0); + int64x2_t d_hi = vld1q_s64(dst + 2); + + d_lo = vaddw_s32(d_lo, vget_low_s32(s32)); + d_hi = vaddw_s32(d_hi, vget_high_s32(s32)); + + vst1q_s64(dst + 0, d_lo); + vst1q_s64(dst + 2, d_hi); + + dst += 4; + length -= 4; + } while (length > 0); +} + +#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c new file mode 100644 index 0000000000..c3b57ce206 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c @@ -0,0 +1,928 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_mem/aom_mem.h" + +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rd.h" + +static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +#if AOM_ARCH_AARCH64 + return (uint16_t)vmaxvq_s16(v_eobmax); +#else + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + return (uint16_t)vget_lane_s16(v_eobmax_final, 0); +#endif +} + +static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan, + int16x8_t v_eobmax, + uint16x8_t v_mask) { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); + const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0)); + return vmaxq_s16(v_eobmax, v_nz_iscan); +} + +static INLINE uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + int16x8_t v_quant, int16x8_t v_dequant, + int16x8_t v_round, int16x8_t v_zero) { + const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1); + const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff); + store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff); + return v_nz_mask; +} + +void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
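+ // A short note on the fixed-point math used by quantize_fp_8() above:
+ // vqdmulhq_s16(a, b) returns saturate((2 * a * b) >> 16), so the extra
+ // vshrq_n_s16(..., 1) yields (a * b) >> 16, i.e.
+ // tmp32 = ((abs_coeff + round) * quant) >> 16, matching the scalar path.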
+ const int16x8_t v_zero = vdupq_n_s16(0); + int16x8_t v_quant = vld1q_s16(quant_ptr); + int16x8_t v_dequant = vld1q_s16(dequant_ptr); + int16x8_t v_round = vld1q_s16(round_ptr); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + uint16x8_t v_nz_mask; + // process dc and the first seven ac coeffs + v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero); + v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask); + // overwrite the dc constants with ac constants + v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1); + v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1); + v_round = vdupq_lane_s16(vget_low_s16(v_round), 1); + + count -= 8; + // now process the rest of the ac coeffs + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero); + v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); + count -= 8; + } while (count > 0); + *eob_ptr = get_max_eob(v_eobmax_76543210); +} + +static INLINE uint16x8_t quantize_lp_8(const int16_t *coeff_ptr, + int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, int16x8_t v_quant, + int16x8_t v_dequant, int16x8_t v_round, + int16x8_t v_zero) { + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1); + const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + vst1q_s16(qcoeff_ptr, v_qcoeff); + vst1q_s16(dqcoeff_ptr, v_dqcoeff); + return v_nz_mask; +} + +void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
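+ // Note that get_max_lane_eob() stores iscan + 1 for non-zero lanes and 0
+ // for zero lanes, so starting from -1 an all-zero block resolves to an eob
+ // of 0 and no final "+ 1" is needed on the result of get_max_eob().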
+ const int16x8_t v_zero = vdupq_n_s16(0); + int16x8_t v_quant = vld1q_s16(quant_ptr); + int16x8_t v_dequant = vld1q_s16(dequant_ptr); + int16x8_t v_round = vld1q_s16(round_ptr); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + uint16x8_t v_nz_mask; + intptr_t count = n_coeffs; + + // process dc and the first seven ac coeffs + v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero); + v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); + // overwrite the dc constants with ac constants + v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1); + v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1); + v_round = vdupq_lane_s16(vget_low_s16(v_round), 1); + + count -= 8; + // now process the rest of the ac coeffs + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero); + v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); + count -= 8; + } while (count != 0); + *eob_ptr = get_max_eob(v_eobmax_76543210); +} + +static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, + int16x8_t v_round, int16x8_t v_zero, int log_scale) { + const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1); + const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale)); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); + const uint16x8_t v_mask = + vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1)); + // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0 + const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round), + vreinterpretq_s16_u16(v_mask)); + const int16x8_t v_tmp2 = + vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant); + const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff = + vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign); + // Multiplying by dequant here will use all 16 bits. Cast to unsigned before + // shifting right. (vshlq_s16 will shift right if shift value is negative) + const uint16x8_t v_abs_dqcoeff = + vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), + vdupq_n_s16(-log_scale)); + const int16x8_t v_dqcoeff = + vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign), + v_coeff_sign); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); + return v_nz_mask; +} + +static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant, + int16x8_t v_round, int16x8_t v_zero) { + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); + const uint16x8_t v_mask = + vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1), + vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2)); + // abs_coeff = vmask ? 
(int64_t)abs_coeff + log_scaled_round : 0 + const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round), + vreinterpretq_s16_u16(v_mask)); + // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); + const int16x8_t v_tmp2 = + vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1), + vreinterpretq_s16_u16(vshrq_n_u16( + vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14))); + const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff = + vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign); + // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale; + const int16x8_t v_abs_dqcoeff = + vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13), + vreinterpretq_s16_u16(vshrq_n_u16( + vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2))); + const int16x8_t v_dqcoeff = + vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); + return v_nz_mask; +} + +static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan, + int log_scale) { + const int16x8_t v_zero = vdupq_n_s16(0); + int16x8_t v_quant = vld1q_s16(quant_ptr); + int16x8_t v_dequant = vld1q_s16(dequant_ptr); + const int16x8_t v_round_no_scale = vld1q_s16(round_ptr); + int16x8_t v_round = + vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale))); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + intptr_t non_zero_count = n_coeffs; + + assert(n_coeffs > 16); + // Pre-scan pass + const int16x8_t v_dequant_scaled = + vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale))); + const int16x8_t v_zbin_s16 = + vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1); + intptr_t i = n_coeffs; + do { + const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8); + const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16); + const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a); + const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b); + const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16); + const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16); + // If the coefficient is in the base ZBIN range, then discard. 
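+ // v_zbin_s16 above is dequant_ptr[1] >> (1 + log_scale), i.e. the AC skip
+ // threshold; a group of 16 trailing coefficients is dropped from
+ // non_zero_count only when no lane of either mask is set, and the memset
+ // calls below then zero their qcoeff/dqcoeff entries in one go.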
+ if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) { + non_zero_count -= 16; + } else { + break; + } + i -= 16; + } while (i > 0); + + const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count; + memset(qcoeff_ptr + non_zero_count, 0, + remaining_zcoeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr + non_zero_count, 0, + remaining_zcoeffs * sizeof(*dqcoeff_ptr)); + + // process dc and the first seven ac coeffs + uint16x8_t v_nz_mask; + if (log_scale == 2) { + v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant, v_dequant, v_round, v_zero); + } else { + v_nz_mask = + quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero, log_scale); + } + v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); + // overwrite the dc constants with ac constants + v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1); + v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1); + v_round = vdupq_lane_s16(vget_low_s16(v_round), 1); + + for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + if (log_scale == 2) { + v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant, v_dequant, v_round, v_zero); + } else { + v_nz_mask = + quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant, + v_dequant, v_round, v_zero, log_scale); + } + v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask); + } + *eob_ptr = get_max_eob(v_eobmax_76543210); +} + +void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, + iscan, 1); +} + +void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, + iscan, 2); +} + +void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + (void)quant_shift_ptr; + (void)scan; + + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + const int16x8_t zero = vdupq_n_s16(0); + int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); + + int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]); + int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); + int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); + int16x8_t vquant_shift = 
vdupq_n_s16(quant_shift_ptr[1]); + + int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + int16x8_t v_abs = vabsq_s16(v_coeff); + + vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); + + uint16x8_t vcond = vcgeq_s16(v_abs, vzbins); + uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + vround = vsetq_lane_s16(round_ptr[0], vround, 0); + vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); + + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); + + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); + store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); + int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); + + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); + store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); + + vround = vsetq_lane_s16(round_ptr[1], vround, 0); + vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[0]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + + for (int i = 8; i < n_coeffs; i += 8) { + v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + v_coeff_sign = vshrq_n_s16(v_coeff, 15); + v_abs = vabsq_s16(v_coeff); + vcond = vcgeq_s16(v_abs, vzbins); + + nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + + vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); + store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); + int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); + store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[i]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + } + *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; +} + +#define QM_MULL_SHIFT(x0, x1) \ + vreinterpretq_s16_u16(vorrq_u16( \ + vreinterpretq_u16_s16(vshlq_n_s16( \ + vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \ + vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS))) + +static void aom_quantize_b_helper_16x16_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + (void)scan; + + uint16x8_t vwt, viwt; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + const int16x8_t zero = vdupq_n_s16(0); + int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); + + int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]); + int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); + int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); + int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); + + int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + int16x8_t v_abs = vabsq_s16(v_coeff); + vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); + uint16x8_t vcond; + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[0])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + vround = vsetq_lane_s16(round_ptr[0], vround, 0); + vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); + + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); + store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[0])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); + store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); + + vround = vsetq_lane_s16(round_ptr[1], vround, 0); + vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[0]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + + for (int i = 8; i < n_coeffs; i += 8) { + v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + v_coeff_sign = vshrq_n_s16(v_coeff, 15); + v_abs = vabsq_s16(v_coeff); + + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[i])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + int16x8_t vtmp = vqaddq_s16(v_abs, 
vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); + store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[i])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); + store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[i]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + } + *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; +} + +static void aom_quantize_b_helper_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + (void)scan; + + uint16x8_t vwt, viwt; + const int log_scale = 1; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + const int16x8_t zero = vdupq_n_s16(0); + int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); + const int16x8_t v_log_scale = v_eobmax_76543210; + + int16x8_t vzbins = vdupq_n_s16(zbins[1]), + vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale)); + int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); + int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); + int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); + + int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + int16x8_t v_abs = vabsq_s16(v_coeff); + vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); + uint16x8_t vcond; + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[0])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + vround = + vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0); + vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); + + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, 
load_tran_low_to_s16q(&qcoeff_ptr[0])); + store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[0])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( + vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); + store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); + + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + vround = + vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0); + vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[0]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + + for (int i = 8; i < n_coeffs; i += 8) { + v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + v_coeff_sign = vshrq_n_s16(v_coeff, 15); + v_abs = vabsq_s16(v_coeff); + + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[i])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift); + + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); + store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[i])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( + vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); + store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[i]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + } + *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; +} + +static void aom_quantize_b_helper_64x64_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + (void)scan; + + uint16x8_t vwt, viwt; + const int log_scale = 2; + const int16x8_t v_log_scale = + vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE)); + + const int zbins[2] = { 
ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + const int16x8_t zero = vdupq_n_s16(0); + int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero)); + int16x8_t v_ones = vnegq_s16(v_eobmax_76543210); + + int16x8_t vzbins = vdupq_n_s16(zbins[1]), + vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale)); + int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]); + int16x8_t vquant = vdupq_n_s16(quant_ptr[1]); + int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]); + + int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + int16x8_t v_abs = vabsq_s16(v_coeff); + vzbins = vsetq_lane_s16(zbins[0], vzbins, 0); + uint16x8_t vcond; + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[0])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + vround = + vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0); + vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0); + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + int16x8_t ones = + vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones); + vtmp2 = + vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0])); + store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[0])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( + vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); + v_deq_abs = + vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0])); + store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask); + + vround = + vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0); + vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0); + vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0); + vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[0]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + vzbins = vsetq_lane_s16(zbins[1], vzbins, 0); + + for (int i = 8; i < n_coeffs; i += 8) { + v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + v_coeff_sign = vshrq_n_s16(v_coeff, 15); + v_abs = vabsq_s16(v_coeff); + + if (qm_ptr == NULL) { + vcond = vcgeq_s16(v_abs, vzbins); + } else { + vwt = vmovl_u8(vld1_u8(&qm_ptr[i])); + vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins); + } + nz_check 
= vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0); + if (nz_check) { + int16x8_t vtmp = vqaddq_s16(v_abs, vround); + + int16x8_t vtmp2; + if (qm_ptr == NULL) { + vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1); + } else { + vtmp2 = QM_MULL_SHIFT(vtmp, vwt); + vtmp2 = vaddq_s16(vtmp2, vtmp); + } + + int16x8_t ones = + vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones); + vtmp2 = + vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones); + int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign); + int16x8_t coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i])); + store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask); + + if (iqm_ptr != NULL) { + viwt = vmovl_u8(vld1_u8(&iqm_ptr[i])); + vdequant = QM_MULL_SHIFT(vdequant, viwt); + } + int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16( + vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale)); + v_deq_abs = + vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs); + vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign); + coeff_nz_mask = + vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i])); + store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask); + + uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero); + const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond); + int16x8_t v_iscan = vld1q_s16(&iscan[i]); + vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210)); + v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210); + } + } + *eob_ptr = get_max_eob(v_eobmax_76543210) + 1; +} + +void aom_quantize_b_helper_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + switch (log_scale) { // log_scale for AV1 encoder can be only 0, 1, 2 + case 0: + aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, qm_ptr, iqm_ptr); + break; + case 1: + aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, qm_ptr, iqm_ptr); + break; + case 2: + aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, qm_ptr, iqm_ptr); + break; + } +} + +void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 1); +} + +void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t 
*eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 2); +} diff --git a/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c new file mode 100644 index 0000000000..7d3bd4c606 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include + +#include "av1/encoder/rdopt.h" +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +// Process horizontal and vertical correlations in a 4x4 block of pixels. +// We actually use the 4x4 pixels to calculate correlations corresponding to +// the top-left 3x3 pixels, so this function must be called with 1x1 overlap, +// moving the window along/down by 3 pixels at a time. +INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, + int32x4_t *xy_sum_32, + int32x4_t *xz_sum_32, + int32x4_t *x_sum_32, + int32x4_t *x2_sum_32) { + // Pixels in this 4x4 [ a b c d ] + // are referred to as: [ e f g h ] + // [ i j k l ] + // [ m n o p ] + + const int16x4_t pixelsa_2_lo = vld1_s16(diff + (0 * stride)); + const int16x4_t pixelsa_2_sli = + vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_2_lo), 16)); + const int16x4_t pixelsb_2_lo = vld1_s16(diff + (1 * stride)); + const int16x4_t pixelsb_2_sli = + vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_2_lo), 16)); + const int16x4_t pixelsa_1_lo = vld1_s16(diff + (2 * stride)); + const int16x4_t pixelsa_1_sli = + vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_1_lo), 16)); + const int16x4_t pixelsb_1_lo = vld1_s16(diff + (3 * stride)); + const int16x4_t pixelsb_1_sli = + vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_1_lo), 16)); + + const int16x8_t slli_a = vcombine_s16(pixelsa_1_sli, pixelsa_2_sli); + + *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_1_lo, pixelsa_1_sli); + *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_2_lo, pixelsa_2_sli); + *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsb_2_lo, pixelsb_2_sli); + + *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_1_sli); + *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_2_sli, pixelsb_2_sli); + *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_2_sli); + + // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k + // (sum up every element in slli_a and swap_b) + *x_sum_32 = vpadalq_s16(*x_sum_32, slli_a); + *x_sum_32 = vaddw_s16(*x_sum_32, pixelsb_2_sli); + + // Also sum their squares + *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_1_sli, pixelsa_1_sli); + *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_2_sli, pixelsa_2_sli); + *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsb_2_sli, pixelsb_2_sli); +} + +void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y 
- right neighbour pixel + // z - below neighbour pixel + // w - down-right neighbour pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t x_sum = 0, x2_sum = 0; + int32x4_t zero = vdupq_n_s32(0); + int64x2_t v_x_sum = vreinterpretq_s64_s32(zero); + int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero); + int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero); + int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero); + // Process horizontal and vertical correlations through the body in 4x4 + // blocks. This excludes the final row and column and possibly one extra + // column depending how 3 divides into width and height + + for (int i = 0; i <= height - 4; i += 3) { + int32x4_t xy_sum_32 = zero; + int32x4_t xz_sum_32 = zero; + int32x4_t x_sum_32 = zero; + int32x4_t x2_sum_32 = zero; + for (int j = 0; j <= width - 4; j += 3) { + horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, + &xz_sum_32, &x_sum_32, &x2_sum_32); + } + v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32); + v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32); + v_x_sum = vpadalq_s32(v_x_sum, x_sum_32); + v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32); + } +#if AOM_ARCH_AARCH64 + xy_sum = vaddvq_s64(v_xy_sum); + xz_sum = vaddvq_s64(v_xz_sum); + x2_sum = vaddvq_s64(v_x2_sum); + x_sum = vaddvq_s64(v_x_sum); +#else + xy_sum = vget_lane_s64( + vadd_s64(vget_low_s64(v_xy_sum), vget_high_s64(v_xy_sum)), 0); + xz_sum = vget_lane_s64( + vadd_s64(vget_low_s64(v_xz_sum), vget_high_s64(v_xz_sum)), 0); + x2_sum = vget_lane_s64( + vadd_s64(vget_low_s64(v_x2_sum), vget_high_s64(v_x2_sum)), 0); + x_sum = + vget_lane_s64(vadd_s64(vget_low_s64(v_x_sum), vget_high_s64(v_x_sum)), 0); +#endif + // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols + int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; + + // Do we have 2 rows remaining or just the one? Note that width and height + // are powers of 2, so each modulo 3 must be 1 or 2. 
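+  // (Illustrative example: for height 16, 16 % 3 == 1, so a single final row
+  // is left over and the first branch below runs; for height 32, 32 % 3 == 2,
+  // so two rows remain and the second branch runs.)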
+ if (height % 3 == 1) { // Just horiz corrs on the final row + const int16_t x0 = diff[(height - 1) * stride]; + x_sum += x0; + x_finalrow += x0; + x2_sum += x0 * x0; + x2_finalrow += x0 * x0; + if (width >= 8) { + int32x4_t v_y_sum = zero; + int32x4_t v_y2_sum = zero; + int32x4_t v_xy_sum_a = zero; + int k = width - 1; + int j = 0; + while ((k - 8) > 0) { + const int16x8_t v_x = vld1q_s16(&diff[(height - 1) * stride + j]); + const int16x8_t v_y = vld1q_s16(&diff[(height - 1) * stride + j + 1]); + const int16x4_t v_x_lo = vget_low_s16(v_x); + const int16x4_t v_x_hi = vget_high_s16(v_x); + const int16x4_t v_y_lo = vget_low_s16(v_y); + const int16x4_t v_y_hi = vget_high_s16(v_y); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); + v_y_sum = vpadalq_s16(v_y_sum, v_y); + k -= 8; + j += 8; + } + + const int16x8_t v_l = vld1q_s16(&diff[(height - 1) * stride] + j); + const int16x8_t v_x = + vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7), + vreinterpretq_s16_s32(zero), 1); + const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1); + const int16x4_t v_x_lo = vget_low_s16(v_x); + const int16x4_t v_x_hi = vget_high_s16(v_x); + const int16x4_t v_y_lo = vget_low_s16(v_y); + const int16x4_t v_y_hi = vget_high_s16(v_y); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); + const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y); + const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a); +#if AOM_ARCH_AARCH64 + const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum); + xy_sum += vaddvq_s64(v_xy_sum2); + const int32_t y = vaddvq_s32(v_y_sum_a); + const int64_t y2 = vaddvq_s64(v_y2_sum_a); +#else + xy_sum += vget_lane_s64( + vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0); + const int64x2_t v_y_a = vpaddlq_s32(v_y_sum_a); + const int64_t y = + vget_lane_s64(vadd_s64(vget_low_s64(v_y_a), vget_high_s64(v_y_a)), 0); + const int64x2_t v_y2_sum_b = vpaddlq_s32(v_y2_sum); + int64_t y2 = vget_lane_s64( + vadd_s64(vget_low_s64(v_y2_sum_b), vget_high_s64(v_y2_sum_b)), 0); +#endif + x_sum += y; + x2_sum += y2; + x_finalrow += y; + x2_finalrow += y2; + } else { + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 1) * stride + j]; + const int16_t y = diff[(height - 1) * stride + j + 1]; + xy_sum += x * y; + x_sum += y; + x2_sum += y * y; + x_finalrow += y; + x2_finalrow += y * y; + } + } + } else { // Two rows remaining to do + const int16_t x0 = diff[(height - 2) * stride]; + const int16_t z0 = diff[(height - 1) * stride]; + x_sum += x0 + z0; + x2_sum += x0 * x0 + z0 * z0; + x_finalrow += z0; + x2_finalrow += z0 * z0; + if (width >= 8) { + int32x4_t v_y2_sum = zero; + int32x4_t v_w2_sum = zero; + int32x4_t v_xy_sum_a = zero; + int32x4_t v_xz_sum_a = zero; + int32x4_t v_x_sum_a = zero; + int32x4_t v_w_sum = zero; + int k = width - 1; + int j = 0; + while ((k - 8) > 0) { + const int16x8_t v_x = vld1q_s16(&diff[(height - 2) * stride + j]); + const int16x8_t v_y = vld1q_s16(&diff[(height - 2) * stride + j + 1]); + const int16x8_t v_z = vld1q_s16(&diff[(height - 1) * stride + j]); + const int16x8_t v_w = vld1q_s16(&diff[(height - 1) * stride + j + 1]); + + const int16x4_t v_x_lo = vget_low_s16(v_x); + const int16x4_t v_y_lo = vget_low_s16(v_y); + const int16x4_t 
v_z_lo = vget_low_s16(v_z); + const int16x4_t v_w_lo = vget_low_s16(v_w); + const int16x4_t v_x_hi = vget_high_s16(v_x); + const int16x4_t v_y_hi = vget_high_s16(v_y); + const int16x4_t v_z_hi = vget_high_s16(v_z); + const int16x4_t v_w_hi = vget_high_s16(v_w); + + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi); + + v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo); + v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi); + + v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo); + v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); + + v_w_sum = vpadalq_s16(v_w_sum, v_w); + v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y); + v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w); + + k -= 8; + j += 8; + } + const int16x8_t v_l = vld1q_s16(&diff[(height - 2) * stride] + j); + const int16x8_t v_x = + vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7), + vreinterpretq_s16_s32(zero), 1); + const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1); + const int16x8_t v_l_2 = vld1q_s16(&diff[(height - 1) * stride] + j); + const int16x8_t v_z = + vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l_2, 7), + vreinterpretq_s16_s32(zero), 1); + const int16x8_t v_w = vextq_s16(v_l_2, vreinterpretq_s16_s32(zero), 1); + + const int16x4_t v_x_lo = vget_low_s16(v_x); + const int16x4_t v_y_lo = vget_low_s16(v_y); + const int16x4_t v_z_lo = vget_low_s16(v_z); + const int16x4_t v_w_lo = vget_low_s16(v_w); + const int16x4_t v_x_hi = vget_high_s16(v_x); + const int16x4_t v_y_hi = vget_high_s16(v_y); + const int16x4_t v_z_hi = vget_high_s16(v_z); + const int16x4_t v_w_hi = vget_high_s16(v_w); + + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo); + v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi); + + v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo); + v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi); + + v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo); + v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo); + v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi); + + v_w_sum = vpadalq_s16(v_w_sum, v_w); + v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y); + v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w); + +#if AOM_ARCH_AARCH64 + xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a)); + xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a)); + x_sum += vaddvq_s32(v_x_sum_a); + x_finalrow += vaddvq_s32(v_w_sum); + int64_t y2 = vaddvq_s64(vpaddlq_s32(v_y2_sum)); + int64_t w2 = vaddvq_s64(vpaddlq_s32(v_w2_sum)); +#else + const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a); + xy_sum += vget_lane_s64( + vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0); + const int64x2_t v_xz_sum2 = vpaddlq_s32(v_xz_sum_a); + xz_sum += vget_lane_s64( + vadd_s64(vget_low_s64(v_xz_sum2), vget_high_s64(v_xz_sum2)), 0); + const int64x2_t v_x_sum2 = vpaddlq_s32(v_x_sum_a); + x_sum += vget_lane_s64( + vadd_s64(vget_low_s64(v_x_sum2), vget_high_s64(v_x_sum2)), 0); + const int64x2_t v_w_sum_a = vpaddlq_s32(v_w_sum); + x_finalrow += vget_lane_s64( + vadd_s64(vget_low_s64(v_w_sum_a), vget_high_s64(v_w_sum_a)), 0); + const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum); + int64_t y2 = vget_lane_s64( + vadd_s64(vget_low_s64(v_y2_sum_a), vget_high_s64(v_y2_sum_a)), 0); + const 
int64x2_t v_w2_sum_a = vpaddlq_s32(v_w2_sum); + int64_t w2 = vget_lane_s64( + vadd_s64(vget_low_s64(v_w2_sum_a), vget_high_s64(v_w2_sum_a)), 0); +#endif + x2_sum += y2 + w2; + x2_finalrow += w2; + } else { + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 2) * stride + j]; + const int16_t y = diff[(height - 2) * stride + j + 1]; + const int16_t z = diff[(height - 1) * stride + j]; + const int16_t w = diff[(height - 1) * stride + j + 1]; + + // Horizontal and vertical correlations for the penultimate row: + xy_sum += x * y; + xz_sum += x * z; + + // Now just horizontal correlations for the final row: + xy_sum += z * w; + + x_sum += y + w; + x2_sum += y * y + w * w; + x_finalrow += w; + x2_finalrow += w * w; + } + } + } + + // Do we have 2 columns remaining or just the one? + if (width % 3 == 1) { // Just vert corrs on the final col + const int16_t x0 = diff[width - 1]; + x_sum += x0; + x_finalcol += x0; + x2_sum += x0 * x0; + x2_finalcol += x0 * x0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 1]; + xz_sum += x * z; + x_finalcol += z; + x2_finalcol += z * z; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 2 : 3)) { + x_sum += z; + x2_sum += z * z; + } + } + } else { // Two cols remaining + const int16_t x0 = diff[width - 2]; + const int16_t y0 = diff[width - 1]; + x_sum += x0 + y0; + x2_sum += x0 * x0 + y0 * y0; + x_finalcol += y0; + x2_finalcol += y0 * y0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 2]; + const int16_t y = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 2]; + const int16_t w = diff[(i + 1) * stride + width - 1]; + + // Horizontal and vertical correlations for the penultimate col: + // Skip these on the last iteration of this loop if we also had two + // rows remaining, otherwise the final horizontal and vertical correlation + // get erroneously processed twice + if (i < height - 2 || height % 3 == 1) { + xy_sum += x * y; + xz_sum += x * z; + } + + x_finalcol += w; + x2_finalcol += w * w; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 
2 : 3)) { + x_sum += z + w; + x2_sum += z * z + w * w; + } + + // Now just vertical correlations for the final column: + xz_sum += y * w; + } + } + + // Calculate the simple sums and squared-sums + int64_t x_firstrow = 0, x_firstcol = 0; + int64_t x2_firstrow = 0, x2_firstcol = 0; + + if (width >= 8) { + int32x4_t v_x_firstrow = zero; + int32x4_t v_x2_firstrow = zero; + for (int j = 0; j < width; j += 8) { + const int16x8_t v_diff = vld1q_s16(diff + j); + const int16x4_t v_diff_lo = vget_low_s16(v_diff); + const int16x4_t v_diff_hi = vget_high_s16(v_diff); + v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff); + v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo); + v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi); + } +#if AOM_ARCH_AARCH64 + x_firstrow += vaddvq_s32(v_x_firstrow); + x2_firstrow += vaddvq_s32(v_x2_firstrow); +#else + const int64x2_t v_x_firstrow_64 = vpaddlq_s32(v_x_firstrow); + x_firstrow += vget_lane_s64( + vadd_s64(vget_low_s64(v_x_firstrow_64), vget_high_s64(v_x_firstrow_64)), + 0); + const int64x2_t v_x2_firstrow_64 = vpaddlq_s32(v_x2_firstrow); + x2_firstrow += vget_lane_s64(vadd_s64(vget_low_s64(v_x2_firstrow_64), + vget_high_s64(v_x2_firstrow_64)), + 0); +#endif + } else { + for (int j = 0; j < width; ++j) { + x_firstrow += diff[j]; + x2_firstrow += diff[j] * diff[j]; + } + } + for (int i = 0; i < height; ++i) { + x_firstcol += diff[i * stride]; + x2_firstcol += diff[i * stride] * diff[i * stride]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c new file mode 100644 index 0000000000..3d17723224 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/arm/mem_neon.h" + +#include "av1/encoder/reconinter_enc.h" + +void aom_upsampled_pred_neon(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref, int ref_stride, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter_params = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + if (width > 8) { + assert(width % 16 == 0); + int i = height; + do { + int j = 0; + do { + uint8x16_t r = vld1q_u8(ref + j); + vst1q_u8(comp_pred + j, r); + j += 16; + } while (j < width); + ref += ref_stride; + comp_pred += width; + } while (--i != 0); + } else if (width == 8) { + int i = height; + do { + uint8x8_t r = vld1_u8(ref); + vst1_u8(comp_pred, r); + ref += ref_stride; + comp_pred += width; + } while (--i != 0); + } else { + assert(width == 4); + int i = height / 2; + do { + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + vst1_u8(comp_pred, r); + ref += 2 * ref_stride; + comp_pred += 2 * width; + } while (--i != 0); + } + } else if (!subpel_y_q3) { + const int16_t *const filter_x = + av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1); + aom_convolve8_horiz(ref, ref_stride, comp_pred, width, filter_x, 16, NULL, + -1, width, height); + } else if (!subpel_x_q3) { + const int16_t *const filter_y = + av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1); + aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, filter_y, + 16, width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + im_block[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + + const int16_t *const filter_x = + av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1); + const int16_t *const filter_y = + av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1); + + const int im_stride = MAX_SB_SIZE; + const int im_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + SUBPEL_TAPS; + + const int ref_vert_offset = ref_stride * ((SUBPEL_TAPS >> 1) - 1); + const int im_vert_offset = im_stride * ((filter_params->taps >> 1) - 1); + + assert(im_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz(ref - ref_vert_offset, ref_stride, im_block, + MAX_SB_SIZE, filter_x, 16, NULL, -1, 
width, im_height); + aom_convolve8_vert(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred, width, + NULL, -1, filter_y, 16, width, height); + } +} + +void aom_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd, + const AV1_COMMON *const cm, int mi_row, + int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + + aom_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, width); +} + +void aom_dist_wtd_comp_avg_upsampled_pred_neon( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + + aom_dist_wtd_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, + width, jcp_param); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd, + const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? 
dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + if (width > 4) { + assert(width % 8 == 0); + int i = height; + do { + int j = 0; + do { + uint16x8_t r = vld1q_u16(ref + j); + vst1q_u16(comp_pred + j, r); + j += 8; + } while (j < width); + ref += ref_stride; + comp_pred += width; + } while (--i != 0); + } else if (width == 4) { + int i = height; + do { + uint16x4_t r = vld1_u16(ref); + vst1_u16(comp_pred, r); + ref += ref_stride; + comp_pred += width; + } while (--i != 0); + } else { + assert(width == 2); + int i = height / 2; + do { + uint16x4_t r = load_u16_2x2(ref, ref_stride); + store_u16x2_strided_x2(comp_pred, width, r); + ref += 2 * ref_stride; + comp_pred += 2 * width; + } while (--i != 0); + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz_neon(ref8, ref_stride, comp_pred8, width, kernel, + 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert_neon(ref8, ref_stride, comp_pred8, width, NULL, + -1, kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz_neon( + ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride, + CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert_neon( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_neon( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, + ref_stride, bd, subpel_search); + + aom_highbd_comp_avg_pred_neon(comp_pred8, pred8, width, height, comp_pred8, + width); +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int 
subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search) { + aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, + ref_stride, bd, subpel_search); + + aom_highbd_dist_wtd_comp_avg_pred_neon(comp_pred8, pred8, width, height, + comp_pred8, width, jcp_param); +} + +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/encoder/arm/neon/shift_neon.h b/third_party/aom/av1/encoder/arm/neon/shift_neon.h new file mode 100644 index 0000000000..d73aef2f25 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/shift_neon.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_ +#define AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_ + +#include + +#include "aom/aom_integer.h" // For AOM_INLINE. + +#define SHIFT_LOOP_HELPER(name, type, intrinsic, arg) \ + static AOM_INLINE void name(const type *in, type *out, int size) { \ + int i = 0; \ + do { \ + out[i] = intrinsic(in[i], arg); \ + } while (++i < size); \ + } + +SHIFT_LOOP_HELPER(shift_left_2_s16_x4, int16x4_t, vshl_n_s16, 2) +SHIFT_LOOP_HELPER(shift_left_2_s16_x8, int16x8_t, vshlq_n_s16, 2) +SHIFT_LOOP_HELPER(shift_left_2_s32_x4, int32x4_t, vshlq_n_s32, 2) +SHIFT_LOOP_HELPER(shift_right_2_round_s16_x8, int16x8_t, vrshrq_n_s16, 2) +SHIFT_LOOP_HELPER(shift_right_2_round_s32_x4, int32x4_t, vrshrq_n_s32, 2) +SHIFT_LOOP_HELPER(shift_right_4_round_s16_x8, int16x8_t, vrshrq_n_s16, 4) +SHIFT_LOOP_HELPER(shift_right_4_round_s32_x4, int32x4_t, vrshrq_n_s32, 4) + +// Addition instructions have slightly better performance compared to shift +// instructions on some micro-architectures, so use these for shifts by one. + +SHIFT_LOOP_HELPER(shift_left_1_s16_x4, int16x4_t, vadd_s16, in[i]) +SHIFT_LOOP_HELPER(shift_left_1_s16_x8, int16x8_t, vaddq_s16, in[i]) +SHIFT_LOOP_HELPER(shift_right_1_round_s16_x4, int16x4_t, vrhadd_s16, + vdup_n_s16(0)) +SHIFT_LOOP_HELPER(shift_right_1_round_s16_x8, int16x8_t, vrhaddq_s16, + vdupq_n_s16(0)) +SHIFT_LOOP_HELPER(shift_right_1_round_s32_x4, int32x4_t, vrhaddq_s32, + vdupq_n_s32(0)) + +#undef SHIFT_LOOP_HELPER + +#endif // AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_ diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c new file mode 100644 index 0000000000..986f143864 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c @@ -0,0 +1,548 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" +#include "aom_dsp/mathutils.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +// For the squared error buffer, add padding for 4 samples. +#define SSE_STRIDE (BW + 4) + +// When using vld1q_u16_x4 compilers may insert an alignment hint of 256 bits. +DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = { + 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, + 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, + 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, + 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF +}; + +static INLINE void get_squared_error( + const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2, + const uint32_t stride2, const uint32_t block_width, + const uint32_t block_height, uint16_t *frame_sse, + const unsigned int dst_stride) { + uint16_t *dst = frame_sse; + + uint32_t i = 0; + do { + uint32_t j = 0; + do { + uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j); + uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j); + + uint8x16_t abs_diff = vabdq_u8(s, r); + uint16x8_t sse_lo = + vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); + uint16x8_t sse_hi = + vmull_u8(vget_high_u8(abs_diff), vget_high_u8(abs_diff)); + + vst1q_u16(dst + j + 2, sse_lo); + vst1q_u16(dst + j + 10, sse_hi); + + j += 16; + } while (j < block_width); + + dst += dst_stride; + } while (++i < block_height); +} + +static INLINE uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col, + const uint32_t block_width) { + uint16x8_t s = vld1q_u16(src); + + if (col == 0) { + const uint16_t lane2 = vgetq_lane_u16(s, 2); + s = vsetq_lane_u16(lane2, s, 0); + s = vsetq_lane_u16(lane2, s, 1); + } else if (col >= block_width - 4) { + const uint16_t lane5 = vgetq_lane_u16(s, 5); + s = vsetq_lane_u16(lane5, s, 6); + s = vsetq_lane_u16(lane5, s, 7); + } + return s; +} + +static void apply_temporal_filter( + const uint8_t *frame, const unsigned int stride, const uint32_t block_width, + const uint32_t block_height, const int *subblock_mses, + unsigned int *accumulator, uint16_t *count, const uint16_t *frame_sse, + const uint32_t *luma_sse_sum, const double inv_num_ref_pixels, + const double decay_factor, const double inv_factor, + const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_neon[BH][BW]; + const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask); + + // Traverse 4 columns at a time - first and last two columns need padding. + for (uint32_t col = 0; col < block_width; col += 4) { + uint16x8_t vsrc[5]; + const uint16_t *src = frame_sse + col; + + // Load and pad (for first and last two columns) 3 rows from the top. + for (int i = 2; i < 5; i++) { + vsrc[i] = load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } + + // Pad the top 2 rows. 
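+    // (The 5x5 window centred on row 0 would also need the two rows above the
+    // block; they are padded with copies of the first in-block row, mirroring
+    // the column padding done in load_and_pad.)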
+ vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (unsigned int row = 0; row < block_height; row++) { + for (int i = 0; i < 4; i++) { + uint32x4_t vsum = vdupq_n_u32(0); + for (int j = 0; j < 5; j++) { + vsum = vpadalq_u16(vsum, vandq_u16(vsrc[j], vmask.val[i])); + } + acc_5x5_neon[row][col + i] = horizontal_add_u32x4(vsum); + } + + // Push all rows in the sliding window up one. + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + if (row <= block_height - 4) { + // Load next row into the bottom of the sliding window. + vsrc[4] = load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } else { + // Pad the bottom 2 rows. + vsrc[4] = vsrc[3]; + } + } + } + + // Perform filtering. + if (tf_wgt_calc_lvl == 0) { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } else { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } +} + +void av1_apply_temporal_filter_neon( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!"); + assert(!is_high_bitdepth && "Only support low bit-depth with Neon!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + // Block information. + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + // Frame information. 
+ const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint16_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = + frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. 
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; + } + } + } + } + } + + get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w, + plane_h, frame_sse, SSE_STRIDE); + + apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h, + subblock_mses, accum + plane_offset, + count + plane_offset, frame_sse, luma_sse_sum, + inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + + plane_offset += plane_h * plane_w; + } +} + +double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height, + int width, int stride, + int edge_thresh) { + uint16x8_t thresh = vdupq_n_u16(edge_thresh); + uint32x4_t acc = vdupq_n_u32(0); + // Count is in theory positive as it counts the number of times we're under + // the threshold, but it will be counted negatively in order to make best use + // of the vclt instruction, which sets every bit of a lane to 1 when the + // condition is true. + int32x4_t count = vdupq_n_s32(0); + int final_count = 0; + int64_t final_acc = 0; + const uint8_t *src_start = src + stride + 1; + int h = 1; + + do { + int w = 1; + const uint8_t *src_ptr = src_start; + + while (w <= (width - 1) - 16) { + uint8x16_t mat[3][3]; + mat[0][0] = vld1q_u8(src_ptr - stride - 1); + mat[0][1] = vld1q_u8(src_ptr - stride); + mat[0][2] = vld1q_u8(src_ptr - stride + 1); + mat[1][0] = vld1q_u8(src_ptr - 1); + mat[1][1] = vld1q_u8(src_ptr); + mat[1][2] = vld1q_u8(src_ptr + 1); + mat[2][0] = vld1q_u8(src_ptr + stride - 1); + mat[2][1] = vld1q_u8(src_ptr + stride); + mat[2][2] = vld1q_u8(src_ptr + stride + 1); + + // Compute Sobel gradients. + uint16x8_t gxa_lo = + vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[2][0])); + uint16x8_t gxa_hi = + vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[2][0])); + uint16x8_t gxb_lo = + vaddl_u8(vget_low_u8(mat[0][2]), vget_low_u8(mat[2][2])); + uint16x8_t gxb_hi = + vaddl_u8(vget_high_u8(mat[0][2]), vget_high_u8(mat[2][2])); + gxa_lo = vaddq_u16( + gxa_lo, vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][0]))); + gxa_hi = vaddq_u16( + gxa_hi, vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][0]))); + gxb_lo = vaddq_u16( + gxb_lo, vaddl_u8(vget_low_u8(mat[1][2]), vget_low_u8(mat[1][2]))); + gxb_hi = vaddq_u16( + gxb_hi, vaddl_u8(vget_high_u8(mat[1][2]), vget_high_u8(mat[1][2]))); + + uint16x8_t gya_lo = + vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2])); + uint16x8_t gya_hi = + vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2])); + uint16x8_t gyb_lo = + vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2])); + uint16x8_t gyb_hi = + vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2])); + gya_lo = vaddq_u16( + gya_lo, vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[0][1]))); + gya_hi = vaddq_u16( + gya_hi, vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[0][1]))); + gyb_lo = vaddq_u16( + gyb_lo, vaddl_u8(vget_low_u8(mat[2][1]), vget_low_u8(mat[2][1]))); + gyb_hi = vaddq_u16( + gyb_hi, vaddl_u8(vget_high_u8(mat[2][1]), vget_high_u8(mat[2][1]))); + + uint16x8_t ga_lo = vabaq_u16(vabdq_u16(gxa_lo, gxb_lo), gya_lo, gyb_lo); + uint16x8_t ga_hi = vabaq_u16(vabdq_u16(gxa_hi, gxb_hi), gya_hi, gyb_hi); + + // Check which vector elements are under the threshold. The Laplacian is + // then unconditionally computed and we accumulate zeros if we're not + // under the threshold. This is much faster than using an if statement. 
+ uint16x8_t thresh_u16_lo = vcltq_u16(ga_lo, thresh); + uint16x8_t thresh_u16_hi = vcltq_u16(ga_hi, thresh); + + uint16x8_t center_lo = vshll_n_u8(vget_low_u8(mat[1][1]), 2); + uint16x8_t center_hi = vshll_n_u8(vget_high_u8(mat[1][1]), 2); + + uint16x8_t adj0_lo = + vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[2][1])); + uint16x8_t adj0_hi = + vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[2][1])); + uint16x8_t adj1_lo = + vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][2])); + uint16x8_t adj1_hi = + vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][2])); + uint16x8_t adj_lo = vaddq_u16(adj0_lo, adj1_lo); + adj_lo = vaddq_u16(adj_lo, adj_lo); + uint16x8_t adj_hi = vaddq_u16(adj0_hi, adj1_hi); + adj_hi = vaddq_u16(adj_hi, adj_hi); + + uint16x8_t diag0_lo = + vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2])); + uint16x8_t diag0_hi = + vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2])); + uint16x8_t diag1_lo = + vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2])); + uint16x8_t diag1_hi = + vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2])); + uint16x8_t diag_lo = vaddq_u16(diag0_lo, diag1_lo); + uint16x8_t diag_hi = vaddq_u16(diag0_hi, diag1_hi); + + uint16x8_t v_lo = vaddq_u16(center_lo, diag_lo); + v_lo = vabdq_u16(v_lo, adj_lo); + uint16x8_t v_hi = vaddq_u16(center_hi, diag_hi); + v_hi = vabdq_u16(v_hi, adj_hi); + + acc = vpadalq_u16(acc, vandq_u16(v_lo, thresh_u16_lo)); + acc = vpadalq_u16(acc, vandq_u16(v_hi, thresh_u16_hi)); + + // Add -1 for each lane where the gradient is under the threshold. + count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_lo)); + count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_hi)); + + w += 16; + src_ptr += 16; + } + + if (w <= (width - 1) - 8) { + uint8x8_t mat[3][3]; + mat[0][0] = vld1_u8(src_ptr - stride - 1); + mat[0][1] = vld1_u8(src_ptr - stride); + mat[0][2] = vld1_u8(src_ptr - stride + 1); + mat[1][0] = vld1_u8(src_ptr - 1); + mat[1][1] = vld1_u8(src_ptr); + mat[1][2] = vld1_u8(src_ptr + 1); + mat[2][0] = vld1_u8(src_ptr + stride - 1); + mat[2][1] = vld1_u8(src_ptr + stride); + mat[2][2] = vld1_u8(src_ptr + stride + 1); + + // Compute Sobel gradients. + uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]); + uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]); + gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0])); + gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2])); + + uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]); + uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]); + gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1])); + gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1])); + + uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb); + + // Check which vector elements are under the threshold. The Laplacian is + // then unconditionally computed and we accumulate zeros if we're not + // under the threshold. This is much faster than using an if statement. + uint16x8_t thresh_u16 = vcltq_u16(ga, thresh); + + uint16x8_t center = vshll_n_u8(mat[1][1], 2); + + uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]); + uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]); + uint16x8_t adj = vaddq_u16(adj0, adj1); + adj = vaddq_u16(adj, adj); + + uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]); + uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]); + uint16x8_t diag = vaddq_u16(diag0, diag1); + + uint16x8_t v = vaddq_u16(center, diag); + v = vabdq_u16(v, adj); + + acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16)); + // Add -1 for each lane where the gradient is under the threshold. 
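+      // (vcltq_u16 sets a qualifying lane to 0xFFFF, which reinterpreted as
+      // int16_t is -1, so the pairwise accumulate below decrements the count
+      // once per lane; the sign is corrected when final_count is formed.)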
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16)); + + w += 8; + src_ptr += 8; + } + + if (w <= (width - 1) - 4) { + uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0)); + uint8x8_t mat[3][3]; + mat[0][0] = load_u8_4x1(src_ptr - stride - 1); + mat[0][1] = load_u8_4x1(src_ptr - stride); + mat[0][2] = load_u8_4x1(src_ptr - stride + 1); + mat[1][0] = load_u8_4x1(src_ptr - 1); + mat[1][1] = load_u8_4x1(src_ptr); + mat[1][2] = load_u8_4x1(src_ptr + 1); + mat[2][0] = load_u8_4x1(src_ptr + stride - 1); + mat[2][1] = load_u8_4x1(src_ptr + stride); + mat[2][2] = load_u8_4x1(src_ptr + stride + 1); + + // Compute Sobel gradients. + uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]); + uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]); + gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0])); + gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2])); + + uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]); + uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]); + gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1])); + gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1])); + + uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb); + + // Check which vector elements are under the threshold. The Laplacian is + // then unconditionally computed and we accumulate zeros if we're not + // under the threshold. This is much faster than using an if statement. + uint16x8_t thresh_u16 = vandq_u16(vcltq_u16(ga, thresh), mask); + + uint16x8_t center = vshll_n_u8(mat[1][1], 2); + + uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]); + uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]); + uint16x8_t adj = vaddq_u16(adj0, adj1); + adj = vaddq_u16(adj, adj); + + uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]); + uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]); + uint16x8_t diag = vaddq_u16(diag0, diag1); + + uint16x8_t v = vaddq_u16(center, diag); + v = vabdq_u16(v, adj); + + acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16)); + // Add -1 for each lane where the gradient is under the threshold. + count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16)); + + w += 4; + src_ptr += 4; + } + + while (w < width - 1) { + int mat[3][3]; + mat[0][0] = *(src_ptr - stride - 1); + mat[0][1] = *(src_ptr - stride); + mat[0][2] = *(src_ptr - stride + 1); + mat[1][0] = *(src_ptr - 1); + mat[1][1] = *(src_ptr); + mat[1][2] = *(src_ptr + 1); + mat[2][0] = *(src_ptr + stride - 1); + mat[2][1] = *(src_ptr + stride); + mat[2][2] = *(src_ptr + stride + 1); + + // Compute Sobel gradients. + const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) + + 2 * (mat[1][0] - mat[1][2]); + const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) + + 2 * (mat[0][1] - mat[2][1]); + const int ga = abs(gx) + abs(gy); + + // Accumulate Laplacian. + const int is_under = ga < edge_thresh; + const int v = 4 * mat[1][1] - + 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) + + (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]); + final_acc += abs(v) * is_under; + final_count += is_under; + + src_ptr++; + w++; + } + src_start += stride; + } while (++h < height - 1); + + // We counted negatively, so subtract to get the final value. + final_count -= horizontal_add_s32x4(count); + final_acc += horizontal_long_add_u32x4(acc); + return (final_count < 16) + ? 
-1.0 + : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2; +} diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c new file mode 100644 index 0000000000..5a52e701a2 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" +#include "aom_dsp/mathutils.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" + +// For the squared error buffer, add padding for 4 samples. +#define SSE_STRIDE (BW + 4) + +// clang-format off + +DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, + 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, + 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, + 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; + +// clang-format on + +static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1, + const uint8_t *frame2, const uint32_t stride2, + const uint32_t block_width, + const uint32_t block_height, + uint8_t *frame_abs_diff, + const unsigned int dst_stride) { + uint8_t *dst = frame_abs_diff; + + uint32_t i = 0; + do { + uint32_t j = 0; + do { + uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j); + uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j); + uint8x16_t abs_diff = vabdq_u8(s, r); + vst1q_u8(dst + j + 2, abs_diff); + j += 16; + } while (j < block_width); + + dst += dst_stride; + } while (++i < block_height); +} + +static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col, + const uint32_t block_width) { + uint8x8_t s = vld1_u8(src); + + if (col == 0) { + const uint8_t lane2 = vget_lane_u8(s, 2); + s = vset_lane_u8(lane2, s, 0); + s = vset_lane_u8(lane2, s, 1); + } else if (col >= block_width - 4) { + const uint8_t lane5 = vget_lane_u8(s, 5); + s = vset_lane_u8(lane5, s, 6); + s = vset_lane_u8(lane5, s, 7); + } + return vcombine_u8(s, s); +} + +static void apply_temporal_filter( + const uint8_t *frame, const unsigned int stride, const uint32_t block_width, + const uint32_t block_height, const int *subblock_mses, + unsigned int *accumulator, uint16_t *count, const uint8_t *frame_abs_diff, + const uint32_t *luma_sse_sum, const double inv_num_ref_pixels, + const double decay_factor, const double inv_factor, + const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_neon[BH][BW]; + const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask); + + // Traverse 4 columns at a time - first and last two columns need padding. 
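+  // (Each 8-byte half of a mask vector keeps a run of five consecutive bytes,
+  // offset by one per output column, so the masked vdotq_u32 sums below give
+  // two partial 5-wide sums per column, which vpaddq_u32 then combines into
+  // one squared-error total per column.)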
+ for (uint32_t col = 0; col < block_width; col += 4) { + uint8x16_t vsrc[5][2]; + const uint8_t *src = frame_abs_diff + col; + + // Load, pad (for first and last two columns) and mask 3 rows from the top. + for (int i = 2; i < 5; i++) { + const uint8x16_t s = load_and_pad(src, col, block_width); + vsrc[i][0] = vandq_u8(s, vmask.val[0]); + vsrc[i][1] = vandq_u8(s, vmask.val[1]); + src += SSE_STRIDE; + } + + // Pad the top 2 rows. + vsrc[0][0] = vsrc[2][0]; + vsrc[0][1] = vsrc[2][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + + for (unsigned int row = 0; row < block_height; row++) { + uint32x4_t sum_01 = vdupq_n_u32(0); + uint32x4_t sum_23 = vdupq_n_u32(0); + + sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]); + sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]); + sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]); + sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]); + sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]); + + sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]); + sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]); + sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]); + sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]); + sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]); + + vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23)); + + // Push all rows in the sliding window up one. + for (int i = 0; i < 4; i++) { + vsrc[i][0] = vsrc[i + 1][0]; + vsrc[i][1] = vsrc[i + 1][1]; + } + + if (row <= block_height - 4) { + // Load next row into the bottom of the sliding window. + uint8x16_t s = load_and_pad(src, col, block_width); + vsrc[4][0] = vandq_u8(s, vmask.val[0]); + vsrc[4][1] = vandq_u8(s, vmask.val[1]); + src += SSE_STRIDE; + } else { + // Pad the bottom 2 rows. + vsrc[4][0] = vsrc[3][0]; + vsrc[4][1] = vsrc[3][1]; + } + } + } + + // Perform filtering. + if (tf_wgt_calc_lvl == 0) { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } else { + for (unsigned int i = 0, k = 0; i < block_height; i++) { + for (unsigned int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame[i * stride + j]; + const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + // Compute filter weight. 
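+        // (Equivalently: weight = iroundpf(approx_exp(-min(combined_error *
+        // d_factor[subblock_idx] * decay_factor, 7)) * TF_WEIGHT_SCALE), so a
+        // zero error gives approximately the maximum weight TF_WEIGHT_SCALE.)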
+ double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + accumulator[k] += weight * pixel_value; + count[k] += weight; + } + } + } +} + +void av1_apply_temporal_filter_neon_dotprod( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!"); + assert(!is_high_bitdepth && "Only support low bit-depth with Neon!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + // Block information. + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + // Frame information. + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = + frame_to_filter->strides[plane == AOM_PLANE_Y ? 
0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + luma_sse_sum[i * BW + j] += + (frame_abs_diff[yy * SSE_STRIDE + xx + 2] * + frame_abs_diff[yy * SSE_STRIDE + xx + 2]); + } + } + } + } + } + + get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w, + plane_h, frame_abs_diff, SSE_STRIDE); + + apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h, + subblock_mses, accum + plane_offset, + count + plane_offset, frame_abs_diff, luma_sse_sum, + inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + + plane_offset += plane_h * plane_w; + } +} diff --git a/third_party/aom/av1/encoder/arm/neon/txfm_neon.h b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h new file mode 100644 index 0000000000..635364f46a --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ +#define AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ + +#include "aom/aom_integer.h" // For AOM_INLINE. + +static AOM_INLINE void ud_adjust_input_and_stride(int ud_flip, + const int16_t **input, + int *stride, int out_size) { + if (ud_flip) { + *input = *input + (out_size - 1) * *stride; + *stride = -*stride; + } +} + +#endif // AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_ diff --git a/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c new file mode 100644 index 0000000000..1b35269b33 --- /dev/null +++ b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * See av1_wedge_sse_from_residuals_c for details of the parameters and + * computation. + */ +uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + assert(N % 64 == 0); + + uint64x2_t v_csse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + int i = 0; + do { + int32x4_t sum[4]; + int32x4_t sse[2]; + int16x4_t sum_s16[4]; + + const int16x8_t r1_l = vld1q_s16(r1 + i); + const int16x8_t r1_h = vld1q_s16(r1 + i + 8); + const int16x8_t d_l = vld1q_s16(d + i); + const int16x8_t d_h = vld1q_s16(d + i + 8); + // The following three lines are a bit inelegant compared to using a pair + // of vmovl_u8()... but it forces the compiler to emit a ZIP1, ZIP2 pair - + // which can be executed in parallel with the subsequent SSHL instructions. + // (SSHL can only be executed on half of the Neon pipes in modern Arm + // cores, whereas ZIP1/2 can be executed on all of them.) + const uint8x16x2_t m_u16 = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0)); + const int16x8_t m_l = vreinterpretq_s16_u8(m_u16.val[0]); + const int16x8_t m_h = vreinterpretq_s16_u8(m_u16.val[1]); + + sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS); + sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS); + sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS); + sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS); + + sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l)); + sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l)); + sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h)); + sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h)); + + sum_s16[0] = vqmovn_s32(sum[0]); + sum_s16[1] = vqmovn_s32(sum[1]); + sum_s16[2] = vqmovn_s32(sum[2]); + sum_s16[3] = vqmovn_s32(sum[3]); + + sse[0] = vmull_s16(sum_s16[0], sum_s16[0]); + sse[1] = vmull_s16(sum_s16[2], sum_s16[2]); + sse[0] = vmlal_s16(sse[0], sum_s16[1], sum_s16[1]); + sse[1] = vmlal_s16(sse[1], sum_s16[3], sum_s16[3]); + + v_csse[0] = vpadalq_u32(v_csse[0], vreinterpretq_u32_s32(sse[0])); + v_csse[1] = vpadalq_u32(v_csse[1], vreinterpretq_u32_s32(sse[1])); + + i += 16; + } while (i < N); + + uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1])); + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + do { + int16x8_t ds_l = vld1q_s16(ds); + int16x8_t ds_h = vld1q_s16(ds + 8); + + int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m)); + int16x8_t m_l = vmovl_s8(vget_low_s8(m_s8)); + int16x8_t m_h = vmovl_s8(vget_high_s8(m_s8)); + + acc[0] = vmlal_s16(acc[0], vget_low_s16(ds_l), vget_low_s16(m_l)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(ds_l), vget_high_s16(m_l)); + acc[2] = vmlal_s16(acc[2], 
vget_low_s16(ds_h), vget_low_s16(m_h)); + acc[3] = vmlal_s16(acc[3], vget_high_s16(ds_h), vget_high_s16(m_h)); + + ds += 16; + m += 16; + N -= 16; + } while (N != 0); + + int64x2_t sum = vpaddlq_s32(acc[0]); + sum = vpadalq_s32(sum, acc[1]); + sum = vpadalq_s32(sum, acc[2]); + sum = vpadalq_s32(sum, acc[3]); + + return (horizontal_add_s64x2(sum) > limit); +} + +void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr, + const int16_t *b_ptr, int N) { + do { + int16x8_t a = vld1q_s16(a_ptr); + int16x8_t b = vld1q_s16(b_ptr); + + int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a)); + int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a)); + + sq_lo = vmlsl_s16(sq_lo, vget_low_s16(b), vget_low_s16(b)); + sq_hi = vmlsl_s16(sq_hi, vget_high_s16(b), vget_high_s16(b)); + + int16x8_t res = vcombine_s16(vqmovn_s32(sq_lo), vqmovn_s32(sq_hi)); + + vst1q_s16(d_ptr, res); + + d_ptr += 8; + a_ptr += 8; + b_ptr += 8; + N -= 8; + } while (N != 0); +} diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c new file mode 100644 index 0000000000..6601c19ab3 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c @@ -0,0 +1,1885 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "av1/encoder/av1_fwd_txfm1d.h" +#include "av1/common/av1_txfm.h" + +void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 4; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[4]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[3]; + bf1[1] = input[1] + input[2]; + bf1[2] = -input[2] + input[1]; + bf1[3] = -input[3] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[2]; + bf1[2] = bf0[1]; + bf1[3] = bf0[3]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 8; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[7]; + bf1[1] = input[1] + input[6]; + bf1[2] = input[2] + input[5]; + bf1[3] = input[3] + input[4]; + bf1[4] = -input[4] + input[3]; + bf1[5] = 
-input[5] + input[2]; + bf1[6] = -input[6] + input[1]; + bf1[7] = -input[7] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[4]; + bf1[2] = bf0[2]; + bf1[3] = bf0[6]; + bf1[4] = bf0[1]; + bf1[5] = bf0[5]; + bf1[6] = bf0[3]; + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[15]; + bf1[1] = input[1] + input[14]; + bf1[2] = input[2] + input[13]; + bf1[3] = input[3] + input[12]; + bf1[4] = input[4] + input[11]; + bf1[5] = input[5] + input[10]; + bf1[6] = input[6] + input[9]; + bf1[7] = input[7] + input[8]; + bf1[8] = -input[8] + input[7]; + bf1[9] = -input[9] + input[6]; + bf1[10] = -input[10] + input[5]; + bf1[11] = -input[11] + input[4]; + bf1[12] = -input[12] + input[3]; + bf1[13] = -input[13] + input[2]; + bf1[14] = -input[14] + input[1]; + bf1[15] = -input[15] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = 
bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[8]; + bf1[2] = bf0[4]; + bf1[3] = bf0[12]; + bf1[4] = bf0[2]; + bf1[5] = bf0[10]; + bf1[6] = bf0[6]; + bf1[7] = bf0[14]; + bf1[8] = bf0[1]; + bf1[9] = bf0[9]; + bf1[10] = 
bf0[5]; + bf1[11] = bf0[13]; + bf1[12] = bf0[3]; + bf1[13] = bf0[11]; + bf1[14] = bf0[7]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 32; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[31]; + bf1[1] = input[1] + input[30]; + bf1[2] = input[2] + input[29]; + bf1[3] = input[3] + input[28]; + bf1[4] = input[4] + input[27]; + bf1[5] = input[5] + input[26]; + bf1[6] = input[6] + input[25]; + bf1[7] = input[7] + input[24]; + bf1[8] = input[8] + input[23]; + bf1[9] = input[9] + input[22]; + bf1[10] = input[10] + input[21]; + bf1[11] = input[11] + input[20]; + bf1[12] = input[12] + input[19]; + bf1[13] = input[13] + input[18]; + bf1[14] = input[14] + input[17]; + bf1[15] = input[15] + input[16]; + bf1[16] = -input[16] + input[15]; + bf1[17] = -input[17] + input[14]; + bf1[18] = -input[18] + input[13]; + bf1[19] = -input[19] + input[12]; + bf1[20] = -input[20] + input[11]; + bf1[21] = -input[21] + input[10]; + bf1[22] = -input[22] + input[9]; + bf1[23] = -input[23] + input[8]; + bf1[24] = -input[24] + input[7]; + bf1[25] = -input[25] + input[6]; + bf1[26] = -input[26] + input[5]; + bf1[27] = -input[27] + input[4]; + bf1[28] = -input[28] + input[3]; + bf1[29] = -input[29] + input[2]; + bf1[30] = -input[30] + input[1]; + bf1[31] = -input[31] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], 
bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + 
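+  // av1_range_check_buf() only verifies that the stage outputs fit within the
+  // bit width given by stage_range[stage]; it does not modify the data.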
av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = 
half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[16]; + bf1[2] = bf0[8]; + bf1[3] = bf0[24]; + bf1[4] = bf0[4]; + bf1[5] = bf0[20]; + bf1[6] = bf0[12]; + bf1[7] = bf0[28]; + bf1[8] = bf0[2]; + bf1[9] = bf0[18]; + bf1[10] = bf0[10]; + bf1[11] = bf0[26]; + bf1[12] = bf0[6]; + bf1[13] = bf0[22]; + bf1[14] = bf0[14]; + bf1[15] = bf0[30]; + bf1[16] = bf0[1]; + bf1[17] = bf0[17]; + bf1[18] = bf0[9]; + bf1[19] = bf0[25]; + bf1[20] = bf0[5]; + bf1[21] = bf0[21]; + bf1[22] = bf0[13]; + bf1[23] = bf0[29]; + bf1[24] = bf0[3]; + bf1[25] = bf0[19]; + bf1[26] = bf0[11]; + bf1[27] = bf0[27]; + bf1[28] = bf0[7]; + bf1[29] = bf0[23]; + bf1[30] = bf0[15]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t x0, x1, x2, x3; + int32_t s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 0 + av1_range_check_buf(0, input, input, 4, stage_range[0]); + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + // stage 1 + s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]); + s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]); + s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]); + s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]); + s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]); + s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]); + s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]); + s7 = range_check_value(x0 + x1, stage_range[1]); + + // stage 2 + s7 = range_check_value(s7 - x3, stage_range[2]); + + // stage 3 + x0 = range_check_value(s0 + s2, bit + stage_range[3]); + x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]); + x2 = range_check_value(s1 - s3, bit + stage_range[3]); + x3 = range_check_value(s4, bit + stage_range[3]); + + // stage 4 + x0 = range_check_value(x0 + s5, bit + stage_range[4]); + x2 = range_check_value(x2 + s6, bit + stage_range[4]); + + // stage 5 + s0 = range_check_value(x0 + x3, bit + stage_range[5]); + s1 = range_check_value(x1, bit + stage_range[5]); + s2 = range_check_value(x2 - x3, bit + stage_range[5]); + s3 = range_check_value(x2 - x0, bit + stage_range[5]); + + // stage 6 + s3 = range_check_value(s3 + x3, bit + stage_range[6]); + + // 1-D transform scaling factor is sqrt(2). 
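+  // round_shift() by cos_bit removes the 2^cos_bit fixed-point scaling of the
+  // sinpi constants applied in stage 1, leaving only the sqrt(2) factor noted
+  // above.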
+ output[0] = round_shift(s0, bit); + output[1] = round_shift(s1, bit); + output[2] = round_shift(s2, bit); + output[3] = round_shift(s3, bit); + av1_range_check_buf(6, input, output, 4, stage_range[6]); +} + +void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 8; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[7]; + bf1[2] = -input[3]; + bf1[3] = input[4]; + bf1[4] = -input[1]; + bf1[5] = input[6]; + bf1[6] = input[2]; + bf1[7] = -input[5]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[6]; + bf1[2] = bf0[3]; + bf1[3] = bf0[4]; + bf1[4] = bf0[5]; + bf1[5] = bf0[2]; + bf1[6] = bf0[7]; + bf1[7] = bf0[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst16(const int32_t *input, int32_t 
*output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[15]; + bf1[2] = -input[7]; + bf1[3] = input[8]; + bf1[4] = -input[3]; + bf1[5] = input[12]; + bf1[6] = input[4]; + bf1[7] = -input[11]; + bf1[8] = -input[1]; + bf1[9] = input[14]; + bf1[10] = input[6]; + bf1[11] = -input[9]; + bf1[12] = input[2]; + bf1[13] = -input[13]; + bf1[14] = -input[5]; + bf1[15] = input[10]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + bf1[8] = bf0[8] + bf0[10]; + bf1[9] = bf0[9] + bf0[11]; + bf1[10] = bf0[8] - bf0[10]; + bf1[11] = bf0[9] - bf0[11]; + bf1[12] = bf0[12] + bf0[14]; + bf1[13] = bf0[13] + bf0[15]; + bf1[14] = bf0[12] - bf0[14]; + bf1[15] = bf0[13] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + bf1[8] = bf0[8] + bf0[12]; + bf1[9] = bf0[9] + bf0[13]; + bf1[10] = bf0[10] + bf0[14]; + bf1[11] = bf0[11] + bf0[15]; + bf1[12] = bf0[8] - bf0[12]; + bf1[13] = bf0[9] - bf0[13]; + bf1[14] = 
bf0[10] - bf0[14]; + bf1[15] = bf0[11] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[8]; + bf1[1] = bf0[1] + bf0[9]; + bf1[2] = bf0[2] + bf0[10]; + bf1[3] = bf0[3] + bf0[11]; + bf1[4] = bf0[4] + bf0[12]; + bf1[5] = bf0[5] + bf0[13]; + bf1[6] = bf0[6] + bf0[14]; + bf1[7] = bf0[7] + bf0[15]; + bf1[8] = bf0[0] - bf0[8]; + bf1[9] = bf0[1] - bf0[9]; + bf1[10] = bf0[2] - bf0[10]; + bf1[11] = bf0[3] - bf0[11]; + bf1[12] = bf0[4] - bf0[12]; + bf1[13] = bf0[5] - bf0[13]; + bf1[14] = bf0[6] - bf0[14]; + bf1[15] = bf0[7] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[14]; + bf1[2] = bf0[3]; + bf1[3] = bf0[12]; + bf1[4] = bf0[5]; + bf1[5] = bf0[10]; + bf1[6] = bf0[7]; + bf1[7] = bf0[8]; + bf1[8] = bf0[9]; + bf1[9] = bf0[6]; + bf1[10] = bf0[11]; + bf1[11] = bf0[4]; + bf1[12] = bf0[13]; + bf1[13] = bf0[2]; + bf1[14] = bf0[15]; + bf1[15] = bf0[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 4; ++i) + output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 
32); + av1_range_check_buf(0, input, output, 4, stage_range[0]); +} + +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; + av1_range_check_buf(0, input, output, 8, stage_range[0]); +} + +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 16; ++i) + output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); + av1_range_check_buf(0, input, output, 16, stage_range[0]); +} + +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; + av1_range_check_buf(0, input, output, 32, stage_range[0]); +} + +void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 64; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[63]; + bf1[1] = input[1] + input[62]; + bf1[2] = input[2] + input[61]; + bf1[3] = input[3] + input[60]; + bf1[4] = input[4] + input[59]; + bf1[5] = input[5] + input[58]; + bf1[6] = input[6] + input[57]; + bf1[7] = input[7] + input[56]; + bf1[8] = input[8] + input[55]; + bf1[9] = input[9] + input[54]; + bf1[10] = input[10] + input[53]; + bf1[11] = input[11] + input[52]; + bf1[12] = input[12] + input[51]; + bf1[13] = input[13] + input[50]; + bf1[14] = input[14] + input[49]; + bf1[15] = input[15] + input[48]; + bf1[16] = input[16] + input[47]; + bf1[17] = input[17] + input[46]; + bf1[18] = input[18] + input[45]; + bf1[19] = input[19] + input[44]; + bf1[20] = input[20] + input[43]; + bf1[21] = input[21] + input[42]; + bf1[22] = input[22] + input[41]; + bf1[23] = input[23] + input[40]; + bf1[24] = input[24] + input[39]; + bf1[25] = input[25] + input[38]; + bf1[26] = input[26] + input[37]; + bf1[27] = input[27] + input[36]; + bf1[28] = input[28] + input[35]; + bf1[29] = input[29] + input[34]; + bf1[30] = input[30] + input[33]; + bf1[31] = input[31] + input[32]; + bf1[32] = -input[32] + input[31]; + bf1[33] = -input[33] + input[30]; + bf1[34] = -input[34] + input[29]; + bf1[35] = -input[35] + input[28]; + bf1[36] = -input[36] + input[27]; + bf1[37] = -input[37] + input[26]; + bf1[38] = -input[38] + input[25]; + bf1[39] = -input[39] + input[24]; + bf1[40] = -input[40] + input[23]; + bf1[41] = -input[41] + input[22]; + bf1[42] = -input[42] + input[21]; + bf1[43] = -input[43] + input[20]; + bf1[44] = -input[44] + input[19]; + bf1[45] = -input[45] + input[18]; + bf1[46] = -input[46] + input[17]; + bf1[47] = -input[47] + input[16]; + bf1[48] = -input[48] + input[15]; + bf1[49] = -input[49] + input[14]; + bf1[50] = -input[50] + input[13]; + bf1[51] = -input[51] + input[12]; + bf1[52] = -input[52] + input[11]; + bf1[53] = -input[53] + input[10]; + bf1[54] = -input[54] + input[9]; + bf1[55] = -input[55] + input[8]; + bf1[56] = -input[56] + input[7]; + bf1[57] = -input[57] + input[6]; + bf1[58] = -input[58] + input[5]; + bf1[59] = -input[59] + input[4]; + bf1[60] = -input[60] + input[3]; + bf1[61] = -input[61] + input[2]; + bf1[62] = -input[62] + input[1]; + bf1[63] = -input[63] + input[0]; + av1_range_check_buf(stage, input, bf1, size, 
stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[31]; + bf1[1] = bf0[1] + bf0[30]; + bf1[2] = bf0[2] + bf0[29]; + bf1[3] = bf0[3] + bf0[28]; + bf1[4] = bf0[4] + bf0[27]; + bf1[5] = bf0[5] + bf0[26]; + bf1[6] = bf0[6] + bf0[25]; + bf1[7] = bf0[7] + bf0[24]; + bf1[8] = bf0[8] + bf0[23]; + bf1[9] = bf0[9] + bf0[22]; + bf1[10] = bf0[10] + bf0[21]; + bf1[11] = bf0[11] + bf0[20]; + bf1[12] = bf0[12] + bf0[19]; + bf1[13] = bf0[13] + bf0[18]; + bf1[14] = bf0[14] + bf0[17]; + bf1[15] = bf0[15] + bf0[16]; + bf1[16] = -bf0[16] + bf0[15]; + bf1[17] = -bf0[17] + bf0[14]; + bf1[18] = -bf0[18] + bf0[13]; + bf1[19] = -bf0[19] + bf0[12]; + bf1[20] = -bf0[20] + bf0[11]; + bf1[21] = -bf0[21] + bf0[10]; + bf1[22] = -bf0[22] + bf0[9]; + bf1[23] = -bf0[23] + bf0[8]; + bf1[24] = -bf0[24] + bf0[7]; + bf1[25] = -bf0[25] + bf0[6]; + bf1[26] = -bf0[26] + bf0[5]; + bf1[27] = -bf0[27] + bf0[4]; + bf1[28] = -bf0[28] + bf0[3]; + bf1[29] = -bf0[29] + bf0[2]; + bf1[30] = -bf0[30] + bf0[1]; + bf1[31] = -bf0[31] + bf0[0]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], 
cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[47]; + bf1[33] = bf0[33] + bf0[46]; + bf1[34] = bf0[34] + bf0[45]; + bf1[35] = bf0[35] + bf0[44]; + bf1[36] = bf0[36] + bf0[43]; + bf1[37] = bf0[37] + bf0[42]; + bf1[38] = bf0[38] + bf0[41]; + bf1[39] = bf0[39] + bf0[40]; + bf1[40] = -bf0[40] + bf0[39]; + bf1[41] = -bf0[41] + bf0[38]; + bf1[42] = -bf0[42] + bf0[37]; + bf1[43] = -bf0[43] + bf0[36]; + bf1[44] = -bf0[44] + bf0[35]; + bf1[45] = -bf0[45] + bf0[34]; + bf1[46] = -bf0[46] + bf0[33]; + bf1[47] = -bf0[47] + bf0[32]; + bf1[48] = -bf0[48] + bf0[63]; + bf1[49] = -bf0[49] + bf0[62]; + bf1[50] = -bf0[50] + bf0[61]; + bf1[51] = -bf0[51] + bf0[60]; + bf1[52] = -bf0[52] + bf0[59]; + bf1[53] = -bf0[53] + bf0[58]; + bf1[54] = -bf0[54] + bf0[57]; + bf1[55] = -bf0[55] + bf0[56]; + bf1[56] = bf0[56] + bf0[55]; + bf1[57] = bf0[57] + bf0[54]; + bf1[58] = bf0[58] + bf0[53]; + bf1[59] = bf0[59] + bf0[52]; + bf1[60] = bf0[60] + bf0[51]; + bf1[61] = bf0[61] + bf0[50]; + bf1[62] = bf0[62] + bf0[49]; + bf1[63] = bf0[63] + bf0[48]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); + bf1[53] = 
half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[39]; + bf1[33] = bf0[33] + bf0[38]; + bf1[34] = bf0[34] + bf0[37]; + bf1[35] = bf0[35] + bf0[36]; + bf1[36] = -bf0[36] + bf0[35]; + bf1[37] = -bf0[37] + bf0[34]; + bf1[38] = -bf0[38] + bf0[33]; + bf1[39] = -bf0[39] + bf0[32]; + bf1[40] = -bf0[40] + bf0[47]; + bf1[41] = -bf0[41] + bf0[46]; + bf1[42] = -bf0[42] + bf0[45]; + bf1[43] = -bf0[43] + bf0[44]; + bf1[44] = bf0[44] + bf0[43]; + bf1[45] = bf0[45] + bf0[42]; + bf1[46] = bf0[46] + bf0[41]; + bf1[47] = bf0[47] + bf0[40]; + bf1[48] = bf0[48] + bf0[55]; + bf1[49] = bf0[49] + bf0[54]; + bf1[50] = bf0[50] + bf0[53]; + bf1[51] = bf0[51] + bf0[52]; + bf1[52] = -bf0[52] + bf0[51]; + bf1[53] = -bf0[53] + bf0[50]; + bf1[54] = -bf0[54] + bf0[49]; + bf1[55] = -bf0[55] + bf0[48]; + bf1[56] = -bf0[56] + bf0[63]; + bf1[57] = -bf0[57] + bf0[62]; + bf1[58] = -bf0[58] + bf0[61]; + bf1[59] = -bf0[59] + bf0[60]; + bf1[60] = bf0[60] + bf0[59]; + bf1[61] = bf0[61] + bf0[58]; + bf1[62] = bf0[62] + bf0[57]; + bf1[63] = bf0[63] + bf0[56]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; 
+ bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], 
cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[35]; + bf1[33] = bf0[33] + bf0[34]; + bf1[34] = -bf0[34] + bf0[33]; + bf1[35] = -bf0[35] + bf0[32]; + bf1[36] = -bf0[36] + bf0[39]; + bf1[37] = -bf0[37] + bf0[38]; + bf1[38] = bf0[38] + bf0[37]; + bf1[39] = bf0[39] + bf0[36]; + bf1[40] = bf0[40] + bf0[43]; + bf1[41] = bf0[41] + bf0[42]; + bf1[42] = -bf0[42] + bf0[41]; + bf1[43] = -bf0[43] + bf0[40]; + bf1[44] = -bf0[44] + bf0[47]; + bf1[45] = -bf0[45] + bf0[46]; + bf1[46] = bf0[46] + bf0[45]; + bf1[47] = bf0[47] + bf0[44]; + bf1[48] = bf0[48] + bf0[51]; + bf1[49] = bf0[49] + bf0[50]; + bf1[50] = -bf0[50] + bf0[49]; + bf1[51] = -bf0[51] + bf0[48]; + bf1[52] = -bf0[52] + bf0[55]; + bf1[53] = -bf0[53] + bf0[54]; + bf1[54] = bf0[54] + bf0[53]; + bf1[55] = bf0[55] + bf0[52]; + bf1[56] = bf0[56] + bf0[59]; + bf1[57] = bf0[57] + bf0[58]; + bf1[58] = -bf0[58] + bf0[57]; + bf1[59] = -bf0[59] + bf0[56]; + bf1[60] = -bf0[60] + bf0[63]; + bf1[61] = -bf0[61] + bf0[62]; + bf1[62] = bf0[62] + bf0[61]; + bf1[63] = bf0[63] + bf0[60]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[52], bf0[50], 
cospi[12], bf0[45], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + bf1[32] = bf0[32] + bf0[33]; + bf1[33] = -bf0[33] + bf0[32]; + bf1[34] = -bf0[34] + bf0[35]; + bf1[35] = bf0[35] + bf0[34]; + bf1[36] = bf0[36] + bf0[37]; + bf1[37] = -bf0[37] + bf0[36]; + bf1[38] = -bf0[38] + bf0[39]; + bf1[39] = bf0[39] + bf0[38]; + bf1[40] = bf0[40] + bf0[41]; + bf1[41] = -bf0[41] + bf0[40]; + bf1[42] = -bf0[42] + bf0[43]; + bf1[43] = bf0[43] + bf0[42]; + bf1[44] = bf0[44] + bf0[45]; + bf1[45] = -bf0[45] + bf0[44]; + bf1[46] = -bf0[46] + bf0[47]; + bf1[47] = bf0[47] + bf0[46]; + bf1[48] = bf0[48] + bf0[49]; + bf1[49] = -bf0[49] + bf0[48]; + bf1[50] = -bf0[50] + bf0[51]; + bf1[51] = bf0[51] + bf0[50]; + bf1[52] = bf0[52] + bf0[53]; + bf1[53] = -bf0[53] + bf0[52]; + bf1[54] = -bf0[54] + bf0[55]; + bf1[55] = bf0[55] + bf0[54]; + bf1[56] = bf0[56] + bf0[57]; + bf1[57] = -bf0[57] + bf0[56]; + bf1[58] = -bf0[58] + bf0[59]; + bf1[59] = bf0[59] + bf0[58]; + bf1[60] = bf0[60] + bf0[61]; + bf1[61] = -bf0[61] + bf0[60]; + bf1[62] = -bf0[62] + bf0[63]; + bf1[63] = bf0[63] + bf0[62]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + 
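Every butterfly line in these stages reduces to one primitive, half_btf(w0, in0, w1, in1, cos_bit): a weighted sum of two lanes whose weights come from the cospi[] table (cosines stored with cos_bit fractional bits, selected by cospi_arr(cos_bit)), rounded back down by cos_bit bits. A minimal standalone sketch of that arithmetic, as an approximation only; the library helper additionally range-checks and rounds negative sums symmetrically:

#include <stdint.h>

/* (w0*in0 + w1*in1) / 2^cos_bit with simple round-to-nearest.
 * w0/w1 are entries of the cospi[] table, i.e. cosines scaled by 2^cos_bit. */
static int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int cos_bit) {
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return (int32_t)((sum + ((int64_t)1 << (cos_bit - 1))) >> cos_bit);
}

With cos_bit = 12 the table entry for cos(pi/4) is 2896 (if I recall the Q12 table correctly), so half_btf_sketch(2896, x, 2896, y, 12) is roughly (x + y) / sqrt(2), the unit-gain rotation these stages are built from.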
bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit); + bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); + bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[32]; + bf1[2] = bf0[16]; + bf1[3] = bf0[48]; + bf1[4] = bf0[8]; + bf1[5] = bf0[40]; + bf1[6] = bf0[24]; + bf1[7] = bf0[56]; + bf1[8] = bf0[4]; + bf1[9] = bf0[36]; + bf1[10] = bf0[20]; + bf1[11] = bf0[52]; + bf1[12] = bf0[12]; + bf1[13] = bf0[44]; + bf1[14] = bf0[28]; + bf1[15] = bf0[60]; + bf1[16] = bf0[2]; + bf1[17] = bf0[34]; + bf1[18] = bf0[18]; + bf1[19] = bf0[50]; + bf1[20] = bf0[10]; + bf1[21] = bf0[42]; + bf1[22] = bf0[26]; + bf1[23] = bf0[58]; + bf1[24] = bf0[6]; + bf1[25] = bf0[38]; + bf1[26] = bf0[22]; + bf1[27] = bf0[54]; + bf1[28] = bf0[14]; + bf1[29] = bf0[46]; + bf1[30] = bf0[30]; + bf1[31] = bf0[62]; + bf1[32] = 
bf0[1]; + bf1[33] = bf0[33]; + bf1[34] = bf0[17]; + bf1[35] = bf0[49]; + bf1[36] = bf0[9]; + bf1[37] = bf0[41]; + bf1[38] = bf0[25]; + bf1[39] = bf0[57]; + bf1[40] = bf0[5]; + bf1[41] = bf0[37]; + bf1[42] = bf0[21]; + bf1[43] = bf0[53]; + bf1[44] = bf0[13]; + bf1[45] = bf0[45]; + bf1[46] = bf0[29]; + bf1[47] = bf0[61]; + bf1[48] = bf0[3]; + bf1[49] = bf0[35]; + bf1[50] = bf0[19]; + bf1[51] = bf0[51]; + bf1[52] = bf0[11]; + bf1[53] = bf0[43]; + bf1[54] = bf0[27]; + bf1[55] = bf0[59]; + bf1[56] = bf0[7]; + bf1[57] = bf0[39]; + bf1[58] = bf0[23]; + bf1[59] = bf0[55]; + bf1[60] = bf0[15]; + bf1[61] = bf0[47]; + bf1[62] = bf0[31]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h new file mode 100644 index 0000000000..9ef54fe4de --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h new file mode 100644 index 0000000000..2777cc25bc --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#include "av1/common/enums.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +extern const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL]; +extern const int8_t av1_fwd_cos_bit_col[5][5]; +extern const int8_t av1_fwd_cos_bit_row[5][5]; +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c new file mode 100644 index 0000000000..12a9535a7c --- /dev/null +++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" + +static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT4: return av1_fdct4; + case TXFM_TYPE_DCT8: return av1_fdct8; + case TXFM_TYPE_DCT16: return av1_fdct16; + case TXFM_TYPE_DCT32: return av1_fdct32; + case TXFM_TYPE_DCT64: return av1_fdct64; + case TXFM_TYPE_ADST4: return av1_fadst4; + case TXFM_TYPE_ADST8: return av1_fadst8; + case TXFM_TYPE_ADST16: return av1_fadst16; + case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c; + case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c; + case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c; + case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c; + default: assert(0); return NULL; + } +} + +void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, int bd) { + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1; + } + + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1; + } +} + +static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output, + const int stride, const TXFM_2D_FLIP_CFG *cfg, + int32_t *buf, int bd) { + int c, r; + // Note when assigning txfm_size_col, we use the txfm_size from the + // row configuration and vice versa. This is intentionally done to + // accurately perform rectangular transforms. 
When the transform is + // rectangular, the number of columns will be the same as the + // txfm_size stored in the row cfg struct. It will make no difference + // for square transforms. + const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + // use output buffer as temp buffer + int32_t *temp_in = output; + int32_t *temp_out = output + txfm_size_row; + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip upside down + temp_in[r] = input[(txfm_size_row - r - 1) * stride + c]; + } + av1_round_shift_array(temp_in, txfm_size_row, -shift[0]); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + buf[r * txfm_size_col + c] = temp_out[r]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip from left to right + buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; + } + } + + DECLARE_ALIGNED(16, int32_t, row_buffer[MAX_TX_SIZE]); + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + txfm_func_row(buf + r * txfm_size_col, row_buffer, cos_bit_row, + stage_range_row); + av1_round_shift_array(row_buffer, txfm_size_col, -shift[2]); + if (abs(rect_type) == 1) { + // Multiply everything by Sqrt2 if the transform is rectangular and the + // size difference is a factor of 2. 
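The Sqrt2 factor mentioned in the comment above is applied in fixed point: to the best of my recollection av1_txfm.h defines NewSqrt2Bits = 12 and NewSqrt2 = 5793 = round(sqrt(2) * 4096), so each row output of a 2:1 rectangle is multiplied by 5793 and rounded back by 12 bits to preserve unit gain. A small self-contained sketch of that step (constants assumed, not authoritative):

#include <stdint.h>

// Multiply by sqrt(2) in Q12 fixed point with rounding, as the loop below does.
static int32_t scale_by_sqrt2(int32_t v) {
  const int32_t kNewSqrt2 = 5793;  // assumed: round(sqrt(2) * 4096)
  const int kNewSqrt2Bits = 12;    // assumed
  return (int32_t)(((int64_t)v * kNewSqrt2 + (1 << (kNewSqrt2Bits - 1))) >>
                   kNewSqrt2Bits);
}

scale_by_sqrt2(100) evaluates to 141, i.e. roughly 100 * 1.414.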
+ for (c = 0; c < txfm_size_col; ++c) { + row_buffer[c] = + round_shift((int64_t)row_buffer[c] * NewSqrt2, NewSqrt2Bits); + } + } + for (c = 0; c < txfm_size_col; ++c) { + output[c * txfm_size_row + r] = row_buffer[c]; + } + } +} + +void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[4 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg); + 
fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 64]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out top-right 32x32 area. + for (int col = 0; col < 32; ++col) { + memset(output + col * 64 + 32, 0, 32 * sizeof(*output)); + } + // Zero out the bottom 64x32 area. + memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output)); + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int col = 1; col < 32; ++col) { + memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out right 32x32 area. + for (int col = 0; col < 32; ++col) { + memset(output + col * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int col = 1; col < 32; ++col) { + memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 32x32 area. + memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out right 32x16 area. + for (int row = 0; row < 16; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x16 indices. + for (int row = 1; row < 16; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); + // Note: no repacking needed here. 
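Taken together, the 64-point wrappers above share one convention: AV1 codes only the low-frequency 32-wide / 32-high corner of a 64-point transform, so each wrapper zeroes the discarded coefficients and, whenever a row shrinks from 64 to 32 entries, packs the survivors contiguously so later stages can treat the buffer as dense. A standalone sketch of that zero-and-repack step on a rows x 64 buffer (illustrative only; the real wrappers differ in which dimension is 64 and in how the 2-D output is laid out):

#include <stdint.h>
#include <string.h>

// Keep the first keep_cols entries of the first keep_rows rows, zero the rest,
// then pack the kept rows back to back (memcpy is safe: for r >= 1 the source
// and destination ranges never overlap).
static void zero_and_repack_64(int32_t *buf, int rows, int keep_rows,
                               int keep_cols) {
  for (int r = 0; r < keep_rows; ++r)
    memset(buf + r * 64 + keep_cols, 0, (64 - keep_cols) * sizeof(*buf));
  memset(buf + keep_rows * 64, 0,
         (size_t)(rows - keep_rows) * 64 * sizeof(*buf));
  for (int r = 1; r < keep_rows; ++r)
    memcpy(buf + r * keep_cols, buf + r * 64, keep_cols * sizeof(*buf));
}

Called with rows = 64, keep_rows = 32, keep_cols = 32 this reproduces the three loops in av1_fwd_txfm2d_64x64_c above.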
+} + +static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 }; +static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 }; +static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 }; +static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 }; + +const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL] = { + fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32, + fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16, + fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64, + fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32, + fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16, +}; + +const int8_t av1_fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 13, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 13, 12, 13 }, + { 0, 13, 13, 12, 13 }, + { 0, 0, 13, 12, 13 } + }; + +const int8_t av1_fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 12, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 12, 13, 12 }, + { 0, 12, 13, 12, 11 }, + { 0, 0, 12, 11, 10 } + }; + +static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 }; +static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 }; +static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 }; +static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 }; +static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10, + 11, 11, 11, 11, 11, 11 }; + +static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 }; +static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 }; +static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 }; + +static const int8_t fidtx4_range_mult2[1] = { 1 }; +static const int8_t fidtx8_range_mult2[1] = { 2 }; +static const int8_t fidtx16_range_mult2[1] = { 3 }; +static const int8_t fidtx32_range_mult2[1] = { 4 }; + +static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = { + fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2, + fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2, + fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2, + fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2 +}; + +static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) { + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + + const int8_t *const range_mult2_col = + fwd_txfm_range_mult2_list[cfg->txfm_type_col]; + const int stage_num_col = cfg->stage_num_col; + // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow. 
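One note on the *_range_mult2 tables defined above: as I read them, each entry stores twice the worst-case log2 growth after the corresponding 1-D stage (doubling keeps the half-bit growth of some stages in integers), and the (x + 1) >> 1 used below is simply a round-up halve that converts the doubled value back into whole bits of headroom. A tiny illustration under that assumption:

#include <stdint.h>

// ceil(range_mult2 / 2) = bits of headroom needed after a stage.
static int8_t range_bits(int8_t range_mult2) { return (range_mult2 + 1) >> 1; }
// e.g. fdct8_range_mult2[] = {0, 2, 4, 5, 5, 5} maps to {0, 1, 2, 3, 3, 3}.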
+ for (int i = 0; i < stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) + cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1; + + const int8_t *const range_mult2_row = + fwd_txfm_range_mult2_list[cfg->txfm_type_row]; + const int stage_num_row = cfg->stage_num_row; + // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow. + for (int i = 0; i < stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + cfg->stage_range_row[i] = + (range_mult2_col[stage_num_col - 1] + range_mult2_row[i] + 1) >> 1; + } +} + +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + cfg->shift = av1_fwd_txfm_shift_ls[tx_size]; + cfg->cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + assert(cfg->txfm_type_col != TXFM_TYPE_INVALID); + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + assert(cfg->txfm_type_row != TXFM_TYPE_INVALID); + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; + set_fwd_txfm_non_scale_range(cfg); +} diff --git a/third_party/aom/av1/encoder/av1_ml_partition_models.h b/third_party/aom/av1/encoder/av1_ml_partition_models.h new file mode 100644 index 0000000000..2572b138d5 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_ml_partition_models.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_ +#define AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// TODO(kyslov): Replace with proper weights after training AV1 models + +#define FEATURES 6 +static const float av1_var_part_nn_weights_64_layer0[FEATURES * 8] = { + 0.35755366f, 0.86281112f, -0.20871686f, 0.0409634f, 0.97305766f, + 0.75510254f, 0.04860447f, 0.77095283f, -0.44105278f, -0.3755049f, + -0.08456618f, 1.1821136f, -0.73956301f, 1.30016453f, 0.45566902f, + 0.4742967f, 0.44213975f, 0.4876028f, 0.26720522f, -0.34429858f, + -0.25148252f, -0.49623932f, -0.46747941f, -0.36656624f, 0.10213375f, + 0.60262819f, -0.54788715f, -0.27272022f, 1.0995462f, -0.36338376f, + -0.64836313f, 0.16057039f, 1.02782791f, 0.9985311f, 0.90607883f, + 0.80570411f, -0.07750863f, -0.74006402f, 1.72839526f, 1.72355343f, + 1.69288916f, 1.59102043f, 0.14140216f, -1.47262839f, 0.4262519f, + -0.33805936f, -0.02449707f, 0.67203692f +}; + +static const float av1_var_part_nn_bias_64_layer0[8] = { + 0.39995694f, 0.65593756f, 1.12876737f, 1.28790576f, + 0.53468556f, 0.3177908f, -0.74388266f, -1.81131248f +}; + +static const float av1_var_part_nn_weights_64_layer1[8] = { + -1.31174053f, 0.69696917f, 0.78721456f, 0.45326379f, + 0.79258322f, 1.74626188f, -5.41831f, 3.33887435f +}; + +static const float av1_var_part_nn_bias_64_layer1[1] = { -0.90951047f }; + +static const float av1_var_part_means_64[FEATURES] = { + 5.36750249f, 11.58023127f, 0.25550964f, 0.23809917f, 0.24650665f, 0.22117687f +}; +static const float av1_var_part_vars_64[FEATURES] = { + 0.89599769f, 2.2686018f, 0.02568608f, 0.02523411f, 0.02443085f, 0.01922085f +}; + +static const NN_CONFIG av1_var_part_nnconfig_64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_var_part_nn_weights_64_layer0, + av1_var_part_nn_weights_64_layer1, + }, + { + av1_var_part_nn_bias_64_layer0, + av1_var_part_nn_bias_64_layer1, + }, +}; + +static const float av1_var_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.97886049f, -1.66262011f, 0.94902798f, 0.7080922f, 0.91181186f, + 0.35222601f, -0.04428585f, 0.42086472f, -0.0206325f, -0.77937809f, + -0.70947522f, -1.24463119f, 0.23739497f, -1.34327359f, 0.01024804f, + 0.4544633f, -0.96907661f, 0.67279522f, 0.23180693f, 1.54063368f, + -0.15700707f, 0.18597331f, 0.34167589f, 0.40736558f, 0.69213366f, + -1.33584593f, 1.21190814f, 1.26725267f, 1.21284802f, 1.26611399f, + 0.17546514f, -0.30248399f, -1.32589316f, -1.37432674f, -1.37423023f, + -1.26890855f, 0.12166347f, -0.94565678f, -1.47475267f, -0.69279948f, + -0.10166587f, -0.23489881f, 0.57123565f, 0.80051137f, -1.28411946f, + -1.36576732f, -1.30257508f, -1.30575106f +}; + +static const float av1_var_part_nn_bias_32_layer0[8] = { + -1.6301435f, 0.61879037f, -1.68612662f, 1.66960165f, + -0.0838243f, 0.32253287f, -0.65755282f, 0.96661531f +}; + +static const float av1_var_part_nn_weights_32_layer1[8] = { + 1.99257161f, 0.7331492f, 1.33539961f, 1.13501456f, + -2.21154528f, 1.85858542f, -0.85565298f, -1.96410246f +}; + +static const float av1_var_part_nn_bias_32_layer1[1] = { -0.14880827f }; + +static const float av1_var_part_means_32[FEATURES] = { + 5.36360686f, 9.88421868f, 0.23543671f, 0.23621205f, 0.23409667f, 0.22855539f +}; + +static const float av1_var_part_vars_32[FEATURES] = { + 0.89077225f, 2.32312894f, 0.02167654f, 0.02392842f, 0.02466495f, 0.02047641f +}; + +static const NN_CONFIG av1_var_part_nnconfig_32 = { + FEATURES, // 
num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_var_part_nn_weights_32_layer0, + av1_var_part_nn_weights_32_layer1, + }, + { + av1_var_part_nn_bias_32_layer0, + av1_var_part_nn_bias_32_layer1, + }, +}; + +static const float av1_var_part_nn_weights_16_layer0[FEATURES * 8] = { + 0.45118305f, -0.22068295f, 0.4604435f, -0.1446326f, -0.15765035f, + 0.42260198f, -0.0945916f, 0.49544996f, 0.62781567f, -0.41564372f, + -0.39103292f, 0.44407624f, 0.48382613f, -0.85424238f, -0.00961433f, + 0.25383582f, 0.14403897f, 0.00901859f, -0.83201967f, -0.19323284f, + 0.59271213f, 0.69487457f, 0.6897112f, 0.62768521f, 0.9204492f, + -1.42448347f, -0.16491054f, -0.10114424f, -0.1069687f, -0.11289049f, + 0.26290832f, -0.41850393f, 0.17239733f, 0.41770622f, 0.43725942f, + 0.19362467f, -0.35955731f, -0.899446f, 0.49726389f, 0.66569571f, + 0.65893982f, 0.53199654f, -0.1158694f, -0.26472603f, 0.4155923f, + 0.15059544f, 0.09596755f, 0.26247133f +}; + +static const float av1_var_part_nn_bias_16_layer0[8] = { + 1.64486321f, -0.11851574f, 1.29322833f, -0.61193136f, + 0.33027532f, 1.04197232f, -0.80716674f, 0.88681233f +}; + +static const float av1_var_part_nn_weights_16_layer1[8] = { + -1.02832118f, 0.72800106f, -0.42904783f, 1.44490586f, + -1.03888227f, -0.9023916f, -1.51543102f, -0.43059521f +}; + +static const float av1_var_part_nn_bias_16_layer1[1] = { -0.85087946f }; + +static const float av1_var_part_means_16[FEATURES] = { + 5.32551326f, 8.218448f, 0.21954822f, 0.22808377f, 0.23019798f, 0.22320699f +}; + +static const float av1_var_part_vars_16[FEATURES] = { 0.86806032f, 2.39938956f, + 0.01958579f, 0.02437927f, + 0.02420755f, 0.0192003f }; + +static const NN_CONFIG av1_var_part_nnconfig_16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + av1_var_part_nn_weights_16_layer0, + av1_var_part_nn_weights_16_layer1, + }, + { + av1_var_part_nn_bias_16_layer0, + av1_var_part_nn_bias_16_layer1, + }, +}; + +#undef FEATURES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_ diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.c b/third_party/aom/av1/encoder/av1_noise_estimate.c new file mode 100644 index 0000000000..25007bb6d4 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_noise_estimate.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/av1_noise_estimate.h" +#include "av1/encoder/encoder.h" +#if CONFIG_AV1_TEMPORAL_DENOISING +#include "av1/encoder/av1_temporal_denoiser.h" +#endif + +#if CONFIG_AV1_TEMPORAL_DENOISING +// For SVC: only do noise estimation on top spatial layer. 
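Stepping back to the av1_ml_partition_models.h tables above: each NN_CONFIG describes a tiny fully connected network (6 inputs, one 8-node hidden layer, 1 output) whose weights are evaluated by av1_nn_predict during partition search. A rough sketch of how such a model is evaluated, under two assumptions that should be checked against ml.c, namely node-major weight layout and ReLU on the hidden layer:

// Evaluate a 6 -> 8 -> 1 dense model: hidden = relu(W0*x + b0), out = W1*hidden + b1.
static float tiny_nn_eval(const float in[6], const float w0[8 * 6],
                          const float b0[8], const float w1[8], float b1) {
  float hidden[8];
  for (int n = 0; n < 8; ++n) {
    float v = b0[n];
    for (int i = 0; i < 6; ++i) v += w0[n * 6 + i] * in[i];
    hidden[n] = v > 0.0f ? v : 0.0f;  // ReLU (assumed)
  }
  float out = b1;
  for (int n = 0; n < 8; ++n) out += w1[n] * hidden[n];
  return out;
}

The av1_var_part_means_* / av1_var_part_vars_* arrays are presumably used by the caller to normalize the raw features before they reach the network.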
+static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) { + return (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); +} +#endif + +void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { + const int64_t area = (int64_t)width * height; + ne->enabled = 0; + ne->level = (area < 1280 * 720) ? kLowLow : kLow; + ne->value = 0; + ne->count = 0; + ne->thresh = 90; + ne->last_w = 0; + ne->last_h = 0; + if (area >= 1920 * 1080) { + ne->thresh = 200; + } else if (area >= 1280 * 720) { + ne->thresh = 140; + } else if (area >= 640 * 360) { + ne->thresh = 115; + } + ne->num_frames_estimate = 15; + ne->adapt_thresh = (3 * ne->thresh) >> 1; +} + +static int enable_noise_estimation(AV1_COMP *const cpi) { + const int resize_pending = is_frame_resize_pending(cpi); + +#if CONFIG_AV1_HIGHBITDEPTH + if (cpi->common.seq_params->use_highbitdepth) return 0; +#endif +// Enable noise estimation if denoising is on. +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->common.width >= 320 && cpi->common.height >= 180) + return 1; +#endif + // Only allow noise estimate under certain encoding mode. + // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original. + // Not enabled for SVC mode and screen_content_mode. + // Not enabled for low resolutions. + if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->oxcf.rc_cfg.mode == AOM_CBR && + cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && + resize_pending == 0 && !cpi->ppi->use_svc && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && + cpi->common.width * cpi->common.height >= 640 * 360) + return 1; + else + return 0; +} + +#if CONFIG_AV1_TEMPORAL_DENOISING +static void copy_frame(YV12_BUFFER_CONFIG *const dest, + const YV12_BUFFER_CONFIG *const src) { + const uint8_t *srcbuf = src->y_buffer; + uint8_t *destbuf = dest->y_buffer; + + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + + for (int r = 0; r < dest->y_height; ++r) { + memcpy(destbuf, srcbuf, dest->y_width); + destbuf += dest->y_stride; + srcbuf += src->y_stride; + } +} +#endif // CONFIG_AV1_TEMPORAL_DENOISING + +NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { + int noise_level = kLowLow; + if (ne->value > (ne->thresh << 1)) { + noise_level = kHigh; + } else { + if (ne->value > ne->thresh) + noise_level = kMedium; + else if (ne->value > (ne->thresh >> 1)) + noise_level = kLow; + else + noise_level = kLowLow; + } + return noise_level; +} + +void av1_update_noise_estimate(AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + NOISE_ESTIMATE *const ne = &cpi->noise_estimate; + const int low_res = (cm->width <= 352 && cm->height <= 288); + // Estimate of noise level every frame_period frames. + int frame_period = 8; + int thresh_consec_zeromv = 2; + int frame_counter = cm->current_frame.frame_number; + // Estimate is between current source and last source. + YV12_BUFFER_CONFIG *last_source = cpi->last_source; +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { + last_source = &cpi->denoiser.last_source; + // Tune these thresholds for different resolutions when denoising is + // enabled. 
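For reference, av1_noise_estimate_extract_level above maps the running estimate into four bands relative to the resolution-dependent threshold chosen in av1_noise_estimate_init; restated as a standalone sketch:

// 0 = kLowLow, 1 = kLow, 2 = kMedium, 3 = kHigh (same banding as the source).
static int noise_band(int value, int thresh) {
  if (value > 2 * thresh) return 3;
  if (value > thresh) return 2;
  if (value > thresh / 2) return 1;
  return 0;
}
// With the 1080p threshold of 200: noise_band(450, 200) == 3 (kHigh) and
// noise_band(150, 200) == 1 (kLow).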
+ if (cm->width > 640 && cm->width <= 1920) { + thresh_consec_zeromv = 2; + } + } +#endif + ne->enabled = enable_noise_estimation(cpi); + if (cpi->svc.number_spatial_layers > 1) + frame_counter = cpi->svc.current_superframe; + if (!ne->enabled || frame_counter % frame_period != 0 || + last_source == NULL || + (cpi->svc.number_spatial_layers == 1 && + (ne->last_w != cm->width || ne->last_h != cm->height))) { +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + copy_frame(&cpi->denoiser.last_source, cpi->source); +#endif + if (last_source != NULL) { + ne->last_w = cm->width; + ne->last_h = cm->height; + } + return; + } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 && + cpi->rc.frames_since_key > cpi->svc.number_spatial_layers && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) { + // Force noise estimation to 0 and denoiser off if content has high motion. + ne->level = kLowLow; + ne->count = 0; + ne->num_frames_estimate = 10; +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->svc.current_superframe > 1) { + av1_denoiser_set_noise_level(cpi, ne->level); + copy_frame(&cpi->denoiser.last_source, cpi->source); + } +#endif + return; + } else { + unsigned int bin_size = 100; + unsigned int hist[MAX_VAR_HIST_BINS] = { 0 }; + unsigned int hist_avg[MAX_VAR_HIST_BINS]; + unsigned int max_bin = 0; + unsigned int max_bin_count = 0; + unsigned int bin_cnt; + BLOCK_SIZE bsize = BLOCK_16X16; + // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have + // been encoded as zero/small mv at least x consecutive frames, compute + // the variance to update estimate of noise in the source. + const uint8_t *src_y = cpi->source->y_buffer; + const int src_ystride = cpi->source->y_stride; + const uint8_t *last_src_y = last_source->y_buffer; + const int last_src_ystride = last_source->y_stride; + int mi_row, mi_col; + int num_low_motion = 0; + int frame_low_motion = 1; + for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += 2) { + for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += 2) { + int bl_index = + (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1); + if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv) + num_low_motion++; + } + } + if (num_low_motion < + (((3 * (mi_params->mi_rows * mi_params->mi_cols) >> 2)) >> 3)) + frame_low_motion = 0; + for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) { + // 16x16 blocks, 1/4 sample of frame. + if (mi_row % 8 == 0 && mi_col % 8 == 0 && + mi_row < mi_params->mi_rows - 3 && + mi_col < mi_params->mi_cols - 3) { + int bl_index = + (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1); + int bl_index1 = bl_index + 1; + int bl_index2 = bl_index + (mi_params->mi_cols >> 1); + int bl_index3 = bl_index2 + 1; + int consec_zeromv = + AOMMIN(cpi->consec_zero_mv[bl_index], + AOMMIN(cpi->consec_zero_mv[bl_index1], + AOMMIN(cpi->consec_zero_mv[bl_index2], + cpi->consec_zero_mv[bl_index3]))); + // Only consider blocks that are likely steady background. i.e, have + // been encoded as zero/low motion x (= thresh_consec_zeromv) frames + // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all + // 4 sub-blocks for 16x16 block. And exclude this frame if + // high_source_sad is true (i.e., scene/content change). 
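The measurement taken inside that condition is a plain variance between co-located 16x16 luma blocks of the current and previous source frames, computed by the optimized cpi->ppi->fn_ptr[bsize].vf kernel. A reference-style sketch of the same arithmetic (hypothetical helper, not the library function):

#include <stdint.h>

// var = sum(d^2) - sum(d)^2 / 256, where d = cur - last over a 16x16 block.
static unsigned int var16x16_sketch(const uint8_t *cur, int cur_stride,
                                    const uint8_t *last, int last_stride,
                                    unsigned int *sse) {
  int64_t sum = 0;
  int64_t sse64 = 0;
  for (int r = 0; r < 16; ++r) {
    for (int c = 0; c < 16; ++c) {
      const int d = cur[r * cur_stride + c] - last[r * last_stride + c];
      sum += d;
      sse64 += d * d;
    }
  }
  *sse = (unsigned int)sse64;
  return (unsigned int)(sse64 - (sum * sum) / 256);
}

On a block that has been static for several frames the motion term vanishes and this value is dominated by sensor noise, which is why only blocks with a long zero-motion history feed the histogram.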
+ if (frame_low_motion && consec_zeromv > thresh_consec_zeromv && + !cpi->rc.high_source_sad) { + unsigned int sse; + // Compute variance between co-located blocks from current and + // last input frames. + unsigned int variance = cpi->ppi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); + unsigned int hist_index = variance / bin_size; + if (hist_index < MAX_VAR_HIST_BINS) + hist[hist_index]++; + else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1)) + hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail + } + } + src_y += 4; + last_src_y += 4; + } + src_y += (src_ystride << 2) - (mi_params->mi_cols << 2); + last_src_y += (last_src_ystride << 2) - (mi_params->mi_cols << 2); + } + ne->last_w = cm->width; + ne->last_h = cm->height; + // Adjust histogram to account for effect that histogram flattens + // and shifts to zero as scene darkens. + if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) { + hist[0] = 0; + hist[1] >>= 2; + hist[2] >>= 2; + hist[3] >>= 2; + hist[4] >>= 1; + hist[5] >>= 1; + hist[6] = 3 * hist[6] >> 1; + hist[MAX_VAR_HIST_BINS - 1] >>= 1; + } + + // Average hist[] and find largest bin + for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) { + if (bin_cnt == 0) + hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3; + else if (bin_cnt == MAX_VAR_HIST_BINS - 1) + hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2; + else if (bin_cnt == MAX_VAR_HIST_BINS - 2) + hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + + (hist[bin_cnt + 1] >> 1) + 2) >> + 2; + else + hist_avg[bin_cnt] = + (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >> + 2; + + if (hist_avg[bin_cnt] > max_bin_count) { + max_bin_count = hist_avg[bin_cnt]; + max_bin = bin_cnt; + } + } + // Scale by 40 to work with existing thresholds + ne->value = (int)((3 * ne->value + max_bin * 40) >> 2); + // Quickly increase VNR strength when the noise level increases suddenly. + if (ne->level < kMedium && ne->value > ne->adapt_thresh) { + ne->count = ne->num_frames_estimate; + } else { + ne->count++; + } + if (ne->count == ne->num_frames_estimate) { + // Reset counter and check noise level condition. + ne->num_frames_estimate = 30; + ne->count = 0; + ne->level = av1_noise_estimate_extract_level(ne); +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + av1_denoiser_set_noise_level(cpi, ne->level); +#endif + } + } +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + copy_frame(&cpi->denoiser.last_source, cpi->source); +#endif +} diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.h b/third_party/aom/av1/encoder/av1_noise_estimate.h new file mode 100644 index 0000000000..85530666f6 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_noise_estimate.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_ +#define AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_ + +#include "av1/encoder/block.h" +#include "aom_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_VAR_HIST_BINS 20 + +typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL; + +typedef struct noise_estimate { + int enabled; + NOISE_LEVEL level; + int value; + int thresh; + int adapt_thresh; + int count; + int last_w; + int last_h; + int num_frames_estimate; +} NOISE_ESTIMATE; + +struct AV1_COMP; + +void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height); + +NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne); + +void av1_update_noise_estimate(struct AV1_COMP *const cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_ diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c new file mode 100644 index 0000000000..110d17f434 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_quantize.c @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" +#include "aom_ports/mem.h" + +#include "av1/common/idct.h" +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rd.h" + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + *eob_ptr = 0; +} + +int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], + const int16_t dequant_ptr[2], + const int16_t round_ptr[2], int log_scale, + const int16_t *scan, int coeff_count, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr)); + const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; + int eob = 0; + for (int i = 0; i < coeff_count; i++) { + const int rc = scan[i]; + const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if ((abs_coeff << (1 + log_scale)) >= thresh) { + abs_coeff = clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = + (tmp32 * dequant_ptr[rc != 0]) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + } + if (tmp32) eob = i + 1; + } + return 
eob; +} + +static void quantize_fp_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, int log_scale) { + int i, eob = -1; + const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (qm_ptr == NULL && iqm_ptr == NULL) { + *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr, + log_scale, scan, (int)n_coeffs, + coeff_ptr, qcoeff_ptr, dqcoeff_ptr); + } else { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + abs_coeff += rounding[rc != 0]; + abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + + if (tmp32) eob = i; + } + *eob_ptr = eob + 1; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_quantize_fp_helper_c( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, int log_scale) { + int i; + int eob = -1; + const int shift = 16 - log_scale; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + if (qm_ptr || iqm_ptr) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const int coeff_sign = AOMSIGN(coeff); + const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int abs_qcoeff = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + const int64_t tmp = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + abs_qcoeff = + (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = i; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } else { + const int log_scaled_round_arr[2] = { + ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale), + }; + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int rc01 = (rc != 0); + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int log_scaled_round = log_scaled_round_arr[rc01]; + if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) { + const int quant = quant_ptr[rc01]; + const int dequant = dequant_ptr[rc01]; + const int64_t tmp = (int64_t)abs_coeff + log_scaled_round; + const int abs_qcoeff = (int)((tmp * quant) >> shift); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + if (abs_qcoeff) eob = i; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)iscan; + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
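Each pass of the loop below applies the same scalar operation: add the rounding offset, multiply by the Q16 quantizer, drop 16 bits, restore the sign, and reconstruct through the dequant step. A worked example with hypothetical quantizer values (not taken from any real quant table):

#include <stdint.h>

// One coefficient through the low-precision path:
// q = ((|c| + round) * quant) >> 16 with the sign restored, dq = q * dequant.
static void quantize_one(int16_t coeff, int16_t round, int16_t quant,
                         int16_t dequant, int16_t *q, int16_t *dq) {
  const int sign = coeff < 0 ? -1 : 1;
  int tmp = coeff * sign + round;
  if (tmp > 32767) tmp = 32767;  // same INT16_MAX clamp as the source
  tmp = (tmp * quant) >> 16;
  *q = (int16_t)(tmp * sign);
  *dq = (int16_t)(*q * dequant);
}

With coeff = -100, round = 8, quant = 20480 (about 0.3125 in Q16) and dequant = 3, this yields q = -33 and dq = -99, a reconstruction error of 1; a tmp of zero is what keeps eob from advancing for small coefficients.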
+ for (int i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) eob = i; + } + *eob_ptr = eob + 1; +} + +void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qm_ptr != NULL && iqm_ptr != NULL) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 2: + av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + default: assert(0); + } + } +} + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#if !CONFIG_REALTIME_ONLY + if (qparam->use_quant_b_adapt) { + // TODO(sarahparker) These quantize_b optimizations need SIMD + // implementations + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { 
+ switch (qparam->log_scale) { + case 0: + aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX, + p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); + break; + case 1: + aom_quantize_b_32x32_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_quantize_b_64x64_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + return; + } +#endif // !CONFIG_REALTIME_ONLY + + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 2: + aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + default: assert(0); + } + } +} + +static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp; + int eob = -1; + int32_t tmp32; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + (void)sc; + assert(qparam->log_scale >= 0 && qparam->log_scale < (3)); + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0], + eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qm_ptr != NULL && iqm_ptr != NULL) { + highbd_quantize_fp_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qparam->log_scale); + } +} + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; +#if !CONFIG_REALTIME_ONLY + if (qparam->use_quant_b_adapt) { + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_highbd_quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_highbd_quantize_b_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 1: + aom_highbd_quantize_b_32x32_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + return; + } +#endif // !CONFIG_REALTIME_ONLY + + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_highbd_quantize_b_helper_c( + 
coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + aom_highbd_quantize_b_32x32( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } +} + +static INLINE void highbd_quantize_dc( + const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[0]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + const int64_t tmpw = tmp * wt; + const int abs_qcoeff = + (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int dequant = + (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + (void)sc; + + highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr, + qparam->log_scale); +} + +void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + int log_scale) { + highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, + log_scale); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void invert_quant(int16_t *quant, int16_t *shift, int d) { + uint32_t t; + int l, m; + t = d; + l = get_msb(t); + m = 1 + (1 << (16 + l)) / d; + *quant = 
(int16_t)(m - (1 << 16)); + *shift = 1 << (16 - l); +} + +static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { + const int quant = av1_dc_quant_QTX(q, 0, bit_depth); + switch (bit_depth) { + case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); + case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); + case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} + +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq) { + int i, q, quant_QTX; + + for (q = 0; q < QINDEX_RANGE; q++) { + const int qzbin_factor = get_qzbin_factor(q, bit_depth); + const int qrounding_factor = q == 0 ? 64 : 48; + + for (i = 0; i < 2; ++i) { + const int qrounding_factor_fp = 64; + // y quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, 0, bit_depth); + invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], + quant_QTX); + quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->y_dequant_QTX[q][i] = quant_QTX; + + // u quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth); + invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i], + quant_QTX); + quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->u_dequant_QTX[q][i] = quant_QTX; + + // v quantizer with TX scale + quant_QTX = i == 0 ? 
av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth); + invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i], + quant_QTX); + quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->v_dequant_QTX[q][i] = quant_QTX; + } + + for (i = 2; i < 8; i++) { // 8: SIMD width + quants->y_quant[q][i] = quants->y_quant[q][1]; + quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1]; + quants->y_round_fp[q][i] = quants->y_round_fp[q][1]; + quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; + quants->y_zbin[q][i] = quants->y_zbin[q][1]; + quants->y_round[q][i] = quants->y_round[q][1]; + deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1]; + + quants->u_quant[q][i] = quants->u_quant[q][1]; + quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1]; + quants->u_round_fp[q][i] = quants->u_round_fp[q][1]; + quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1]; + quants->u_zbin[q][i] = quants->u_zbin[q][1]; + quants->u_round[q][i] = quants->u_round[q][1]; + deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1]; + + quants->v_quant[q][i] = quants->v_quant[q][1]; + quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1]; + quants->v_round_fp[q][i] = quants->v_round_fp[q][1]; + quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1]; + quants->v_zbin[q][i] = quants->v_zbin[q][1]; + quants->v_round[q][i] = quants->v_round[q][1]; + deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1]; + } + } +} + +static INLINE bool deltaq_params_have_changed( + const DeltaQuantParams *prev_deltaq_params, + const CommonQuantParams *quant_params) { + return (prev_deltaq_params->y_dc_delta_q != quant_params->y_dc_delta_q || + prev_deltaq_params->u_dc_delta_q != quant_params->u_dc_delta_q || + prev_deltaq_params->v_dc_delta_q != quant_params->v_dc_delta_q || + prev_deltaq_params->u_ac_delta_q != quant_params->u_ac_delta_q || + prev_deltaq_params->v_ac_delta_q != quant_params->v_ac_delta_q); +} + +void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params, + const CommonQuantParams *quant_params, + aom_bit_depth_t bit_depth) { + DeltaQuantParams *const prev_deltaq_params = + &enc_quant_dequant_params->prev_deltaq_params; + + // Re-initialize the quantizer only if any of the dc/ac deltaq parameters + // change. + if (!deltaq_params_have_changed(prev_deltaq_params, quant_params)) return; + QUANTS *const quants = &enc_quant_dequant_params->quants; + Dequants *const dequants = &enc_quant_dequant_params->dequants; + av1_build_quantizer(bit_depth, quant_params->y_dc_delta_q, + quant_params->u_dc_delta_q, quant_params->u_ac_delta_q, + quant_params->v_dc_delta_q, quant_params->v_ac_delta_q, + quants, dequants); + + // Record the state of deltaq parameters. 
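+  // Caching them lets the early-return check above skip the relatively
+  // expensive av1_build_quantizer() call when none of the deltas change
+  // between frames.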
+ prev_deltaq_params->y_dc_delta_q = quant_params->y_dc_delta_q; + prev_deltaq_params->u_dc_delta_q = quant_params->u_dc_delta_q; + prev_deltaq_params->v_dc_delta_q = quant_params->v_dc_delta_q; + prev_deltaq_params->u_ac_delta_q = quant_params->u_ac_delta_q; + prev_deltaq_params->v_ac_delta_q = quant_params->v_ac_delta_q; +} + +void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params, + int qindex, MACROBLOCK *x) { + const QUANTS *const quants = &enc_quant_dequant_params->quants; + const Dequants *const dequants = &enc_quant_dequant_params->dequants; + x->qindex = qindex; + x->seg_skip_block = + 0; // TODO(angiebird): Find a proper place to init this variable. + + // Y + x->plane[0].quant_QTX = quants->y_quant[qindex]; + x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex]; + x->plane[0].round_fp_QTX = quants->y_round_fp[qindex]; + x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex]; + x->plane[0].zbin_QTX = quants->y_zbin[qindex]; + x->plane[0].round_QTX = quants->y_round[qindex]; + x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex]; + + // U + x->plane[1].quant_QTX = quants->u_quant[qindex]; + x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex]; + x->plane[1].round_fp_QTX = quants->u_round_fp[qindex]; + x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex]; + x->plane[1].zbin_QTX = quants->u_zbin[qindex]; + x->plane[1].round_QTX = quants->u_round[qindex]; + x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex]; + + // V + x->plane[2].quant_QTX = quants->v_quant[qindex]; + x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex]; + x->plane[2].round_fp_QTX = quants->v_round_fp[qindex]; + x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex]; + x->plane[2].zbin_QTX = quants->v_zbin[qindex]; + x->plane[2].round_QTX = quants->v_round[qindex]; + x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex]; +} + +void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id, + MACROBLOCKD *xd) { + const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id); + const int qmlevel_y = + use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1; + const int qmlevel_u = + use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1; + const int qmlevel_v = + use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1; + const int qmlevel_ls[MAX_MB_PLANE] = { qmlevel_y, qmlevel_u, qmlevel_v }; + for (int i = 0; i < MAX_MB_PLANE; ++i) { + const int qmlevel = qmlevel_ls[i]; + memcpy(&xd->plane[i].seg_qmatrix[segment_id], + quant_params->gqmatrix[qmlevel][i], + sizeof(quant_params->gqmatrix[qmlevel][i])); + memcpy(&xd->plane[i].seg_iqmatrix[segment_id], + quant_params->giqmatrix[qmlevel][i], + sizeof(quant_params->giqmatrix[qmlevel][i])); + } +} + +void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x, + int segment_id, const int do_update) { + const AV1_COMMON *const cm = &cpi->common; + const CommonQuantParams *const quant_params = &cm->quant_params; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + int qindex_rd; + + const int current_qindex = AOMMAX( + 0, + AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag + ? 
quant_params->base_qindex + x->delta_qindex + : quant_params->base_qindex)); + const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex); + + if (cpi->oxcf.sb_qp_sweep) { + const int current_rd_qindex = + AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag + ? quant_params->base_qindex + + x->rdmult_delta_qindex + : quant_params->base_qindex)); + qindex_rd = av1_get_qindex(&cm->seg, segment_id, current_rd_qindex); + } else { + qindex_rd = qindex; + } + + const int qindex_rdmult = qindex_rd + quant_params->y_dc_delta_q; + const int rdmult = av1_compute_rd_mult( + qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); + + const int qindex_change = x->qindex != qindex; + if (qindex_change || do_update) { + av1_set_q_index(&cpi->enc_quant_dequant_params, qindex, x); + } + + MACROBLOCKD *const xd = &x->e_mbd; + if ((segment_id != x->prev_segment_id) || + av1_use_qmatrix(quant_params, xd, segment_id)) { + av1_set_qmatrix(quant_params, segment_id, xd); + } + + x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); + + av1_set_error_per_bit(&x->errorperbit, rdmult); + av1_set_sad_per_bit(cpi, &x->sadperbit, qindex_rd); + + x->prev_segment_id = segment_id; +} + +void av1_frame_init_quantizer(AV1_COMP *cpi) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + x->prev_segment_id = -1; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 1); +} + +static int adjust_hdr_cb_deltaq(int base_qindex) { + double baseQp = base_qindex / QP_SCALE_FACTOR; + const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET; + const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR; + int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5)); + dqpCb = AOMMIN(0, dqpCb); + dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR); + return dqpCb; +} + +static int adjust_hdr_cr_deltaq(int base_qindex) { + double baseQp = base_qindex / QP_SCALE_FACTOR; + const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET; + const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR; + int dqpCr = (int)(dcrQP + (dcrQP < 0 ? -0.5 : 0.5)); + dqpCr = AOMMIN(0, dqpCr); + dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR); + return dqpCr; +} + +void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel, + int q, int enable_chroma_deltaq, int enable_hdr_deltaq) { + // quantizer has to be reinitialized with av1_init_quantizer() if any + // delta_q changes. 
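+  // Note: when delta_q signalling is enabled, base_qindex is clamped to at
+  // least 1 below, since the bitstream only signals delta_q when
+  // base_q_idx > 0.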
+ CommonQuantParams *quant_params = &cm->quant_params; + quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q); + quant_params->y_dc_delta_q = 0; + + if (enable_chroma_deltaq) { + // TODO(aomedia:2717): need to design better delta + quant_params->u_dc_delta_q = 2; + quant_params->u_ac_delta_q = 2; + quant_params->v_dc_delta_q = 2; + quant_params->v_ac_delta_q = 2; + } else { + quant_params->u_dc_delta_q = 0; + quant_params->u_ac_delta_q = 0; + quant_params->v_dc_delta_q = 0; + quant_params->v_ac_delta_q = 0; + } + + // following section 8.3.2 in T-REC-H.Sup15 document + // to apply to AV1 qindex in the range of [0, 255] + if (enable_hdr_deltaq) { + int dqpCb = adjust_hdr_cb_deltaq(quant_params->base_qindex); + int dqpCr = adjust_hdr_cr_deltaq(quant_params->base_qindex); + quant_params->u_dc_delta_q = quant_params->u_ac_delta_q = dqpCb; + quant_params->v_dc_delta_q = quant_params->v_ac_delta_q = dqpCr; + if (dqpCb != dqpCr) { + cm->seq_params->separate_uv_delta_q = 1; + } + } + + quant_params->qmatrix_level_y = + aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel); + quant_params->qmatrix_level_u = + aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q, + min_qmlevel, max_qmlevel); + + if (!cm->seq_params->separate_uv_delta_q) + quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; + else + quant_params->qmatrix_level_v = + aom_get_qmlevel(quant_params->base_qindex + quant_params->v_ac_delta_q, + min_qmlevel, max_qmlevel); +} + +// Table that converts 0-63 Q-range values passed in outside to the Qindex +// range used internally. +static const int quantizer_to_qindex[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, + 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, + 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, + 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, + 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255, +}; + +int av1_quantizer_to_qindex(int quantizer) { + return quantizer_to_qindex[quantizer]; +} + +int av1_qindex_to_quantizer(int qindex) { + int quantizer; + + for (quantizer = 0; quantizer < 64; ++quantizer) + if (quantizer_to_qindex[quantizer] >= qindex) return quantizer; + + return 63; +} diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h new file mode 100644 index 0000000000..040973376d --- /dev/null +++ b/third_party/aom/av1/encoder/av1_quantize.h @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_ +#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_ + +#include "config/aom_config.h" + +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/encoder/block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct QUANT_PARAM { + int log_scale; + TX_SIZE tx_size; + const qm_val_t *qmatrix; + const qm_val_t *iqmatrix; + int use_quant_b_adapt; + int use_optimize_b; + int xform_quant_idx; +} QUANT_PARAM; + +typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +// The QUANTS structure is used only for internal quantizer setup in +// av1_quantize.c. +// All of its fields use the same coefficient shift/scaling at TX. +typedef struct { + // 0: dc 1: ac 2-8: ac repeated to SIMD width + DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); + + // TODO(jingning): in progress of re-working the quantization. will decide + // if we want to deprecate the current use of y_quant. + DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]); + + DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]); +} QUANTS; + +// The Dequants structure is used only for internal quantizer setup in +// av1_quantize.c. +// Fields are suffixed according to whether or not they're expressed in +// the same coefficient shift/precision as TX or a fixed Q3 format. +typedef struct { + DECLARE_ALIGNED(16, int16_t, + y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width +} Dequants; + +// The DeltaQuantParams structure holds the dc/ac deltaq parameters. +typedef struct { + int y_dc_delta_q; + int u_dc_delta_q; + int u_ac_delta_q; + int v_dc_delta_q; + int v_ac_delta_q; +} DeltaQuantParams; + +typedef struct { + // Quantization parameters for internal quantizer setup. + QUANTS quants; + // Dequantization parameters for internal quantizer setup. + Dequants dequants; + // Deltaq parameters to track the state of the dc/ac deltaq parameters in + // cm->quant_params. It is used to decide whether the quantizer tables need + // to be re-initialized. 
+ DeltaQuantParams prev_deltaq_params; +} EncQuantDequantParams; + +struct AV1_COMP; +struct AV1Common; + +void av1_frame_init_quantizer(struct AV1_COMP *cpi); + +void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x, + int segment_id, const int do_update); + +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq); + +void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params, + const CommonQuantParams *quant_params, + aom_bit_depth_t bit_depth); + +void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel, + int max_qmlevel, int q, int enable_chroma_deltaq, + int enable_hdr_deltaq); + +int av1_quantizer_to_qindex(int quantizer); + +int av1_qindex_to_quantizer(int qindex); + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr); + +/*!\brief Quantize transform coefficients without using qmatrix + * + * quant_ptr, dequant_ptr and round_ptr are size 2 arrays, + * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs. + * + * \param[in] quant_ptr 16-bit fixed point representation of inverse + * quantize step size, i.e. 2^16/dequant + * \param[in] dequant_ptr quantize step size + * \param[in] round_ptr rounding + * \param[in] log_scale the relative log scale of the transform + * coefficients + * \param[in] scan scan[i] indicates the position of ith to-be-coded + * coefficient + * \param[in] coeff_count number of coefficients + * \param[out] qcoeff_ptr quantized coefficients + * \param[out] dqcoeff_ptr dequantized coefficients + * + * \return The last non-zero coefficient's scan index plus 1 + */ +int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], + const int16_t dequant_ptr[2], + const int16_t round_ptr[2], int log_scale, + const int16_t *scan, int coeff_count, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr); + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +/*!\brief Update quantize parameters in MACROBLOCK + * + * \param[in] enc_quant_dequant_params This parameter cached the quantize and + * dequantize parameters for all q + * indices. + * \param[in] qindex Quantize index used for the current + * superblock. + * \param[out] x A superblock data structure for + * encoder. + */ +void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params, + int qindex, MACROBLOCK *x); + +/*!\brief Update quantize matrix in MACROBLOCKD based on segment id + * + * \param[in] quant_params Quantize parameters used by encoder and decoder + * \param[in] segment_id Segment id. + * \param[out] xd A superblock data structure used by encoder and + * decoder. 
+ */ +void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id, + MACROBLOCKD *xd); + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_ diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.c b/third_party/aom/av1/encoder/av1_temporal_denoiser.c new file mode 100644 index 0000000000..3012df6311 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" +#include "av1/common/reconinter.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/av1_temporal_denoiser.h" +#include "av1/encoder/encoder.h" + +#ifdef OUTPUT_YUV_DENOISED +static void make_grayscale(YV12_BUFFER_CONFIG *yuv); +#endif + +static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + return 3 + (increase_denoising ? 1 : 0); +} + +static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 4; +} + +static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 625; +} + +static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40); +} + +static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, + int motion_magnitude) { + if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) { + if (increase_denoising) + return (1 << num_pels_log2_lookup[bs]) << 2; + else + return 0; + } else { + return (1 << num_pels_log2_lookup[bs]) << 4; + } +} + +static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) { + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2); +} + +// TODO(kyslov): If increase_denoising is enabled in the future, +// we might need to update the code for calculating 'total_adj' in +// case the C code is not bit-exact with corresponding sse2 code. 
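+// Summary of the filter below: a first pass moves each source pixel toward
+// the motion-compensated average (taking it exactly when the difference is
+// within absdiff_thresh), accumulating the net adjustment. If |total_adj|
+// stays within total_adj_strong_thresh the block is filtered as is; otherwise
+// a second, dampened pass pulls the result back toward the source, and the
+// block is filtered only if the remaining adjustment fits the weak threshold,
+// else the source block is copied. For example, for a 16x16 block with
+// increase_denoising == 0, absdiff_thresh is 3 and the strong threshold is
+// 256 * 2 = 512.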
+int av1_denoiser_filter_c(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + int r, c; + const uint8_t *sig_start = sig; + const uint8_t *mc_avg_start = mc_avg; + uint8_t *avg_start = avg; + int diff, adj, absdiff, delta; + int adj_val[] = { 3, 4, 6 }; + int total_adj = 0; + int shift_inc = 1; + + // If motion_magnitude is small, making the denoiser more aggressive by + // increasing the adjustment for each level. Add another increment for + // blocks that are labeled for increase denoising. + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { + if (increase_denoising) { + shift_inc = 2; + } + adj_val[0] += shift_inc; + adj_val[1] += shift_inc; + adj_val[2] += shift_inc; + } + + // First attempt to apply a strong temporal denoising filter. + for (r = 0; r < block_size_high[bs]; ++r) { + for (c = 0; c < block_size_wide[bs]; ++c) { + diff = mc_avg[c] - sig[c]; + absdiff = abs(diff); + + if (absdiff <= absdiff_thresh(bs, increase_denoising)) { + avg[c] = mc_avg[c]; + total_adj += diff; + } else { + switch (absdiff) { + case 4: + case 5: + case 6: + case 7: adj = adj_val[0]; break; + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: adj = adj_val[1]; break; + default: adj = adj_val[2]; + } + if (diff > 0) { + avg[c] = AOMMIN(UINT8_MAX, sig[c] + adj); + total_adj += adj; + } else { + avg[c] = AOMMAX(0, sig[c] - adj); + total_adj -= adj; + } + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // If the strong filter did not modify the signal too much, we're all set. + if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + + // Otherwise, we try to dampen the filter if the delta is not too high. + delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >> + num_pels_log2_lookup[bs]) + + 1; + + if (delta >= delta_thresh(bs, increase_denoising)) { + return COPY_BLOCK; + } + + mc_avg = mc_avg_start; + avg = avg_start; + sig = sig_start; + for (r = 0; r < block_size_high[bs]; ++r) { + for (c = 0; c < block_size_wide[bs]; ++c) { + diff = mc_avg[c] - sig[c]; + adj = abs(diff); + if (adj > delta) { + adj = delta; + } + if (diff > 0) { + // Diff positive means we made positive adjustment above + // (in first try/attempt), so now make negative adjustment to bring + // denoised signal down. + avg[c] = AOMMAX(0, avg[c] - adj); + total_adj -= adj; + } else { + // Diff negative means we made negative adjustment above + // (in first try/attempt), so now make positive adjustment to bring + // denoised signal up. 
+ avg[c] = AOMMIN(UINT8_MAX, avg[c] + adj); + total_adj += adj; + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // We can use the filter if it has been sufficiently dampened + if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + return COPY_BLOCK; +} + +static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row, + int mi_col) { + return framebuf + (stride * mi_row << 2) + (mi_col << 2); +} + +static AV1_DENOISER_DECISION perform_motion_compensation( + AV1_COMMON *const cm, AV1_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, + int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, + int motion_magnitude, int *zeromv_filter, int num_spatial_layers, int width, + int lst_fb_idx, int gld_fb_idx, int use_svc, int spatial_layer, + int use_gf_temporal_ref) { + const int sse_diff = (ctx->newmv_sse == UINT_MAX) + ? 0 + : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); + int frame; + int denoise_layer_idx = 0; + MACROBLOCKD *filter_mbd = &mb->e_mbd; + MB_MODE_INFO *mi = filter_mbd->mi[0]; + MB_MODE_INFO saved_mi; + int i; + struct buf_2d saved_dst[MAX_MB_PLANE]; + struct buf_2d saved_pre[MAX_MB_PLANE]; + // const RefBuffer *saved_block_refs[2]; + MV_REFERENCE_FRAME saved_frame; + + frame = ctx->best_reference_frame; + + saved_mi = *mi; + + // Avoid denoising small blocks. When noise > kDenLow or frame width > 480, + // denoise 16x16 blocks. + if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 || + (bs == BLOCK_16X16 && width > 480 && + denoiser->denoising_level <= kDenLow)) + return COPY_BLOCK; + + // If the best reference frame uses inter-prediction and there is enough of a + // difference in sum-squared-error, use it. + if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME && + sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { + mi->ref_frame[0] = ctx->best_reference_frame; + mi->mode = ctx->best_sse_inter_mode; + mi->mv[0] = ctx->best_sse_mv; + } else { + // Otherwise, use the zero reference frame. + frame = ctx->best_zeromv_reference_frame; + ctx->newmv_sse = ctx->zeromv_sse; + // Bias to last reference. + if ((num_spatial_layers > 1 && !use_gf_temporal_ref) || + frame == ALTREF_FRAME || + (frame == GOLDEN_FRAME && use_gf_temporal_ref) || + (frame != LAST_FRAME && + ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) || + denoiser->denoising_level >= kDenHigh))) { + frame = LAST_FRAME; + ctx->newmv_sse = ctx->zeromv_lastref_sse; + } + mi->ref_frame[0] = frame; + mi->mode = GLOBALMV; + mi->mv[0].as_int = 0; + ctx->best_sse_inter_mode = GLOBALMV; + ctx->best_sse_mv.as_int = 0; + *zeromv_filter = 1; + if (denoiser->denoising_level > kDenMedium) { + motion_magnitude = 0; + } + } + + saved_frame = frame; + // When using SVC, we need to map REF_FRAME to the frame buffer index. + if (use_svc) { + if (frame == LAST_FRAME) + frame = lst_fb_idx + 1; + else if (frame == GOLDEN_FRAME) + frame = gld_fb_idx + 1; + // Shift for the second spatial layer. + if (num_spatial_layers - spatial_layer == 2) + frame = frame + denoiser->num_ref_frames; + denoise_layer_idx = num_spatial_layers - spatial_layer - 1; + } + + // Force copy (no denoise, copy source in denoised buffer) if + // running_avg_y[frame] is NULL. 
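+  // With SVC the selected reference slot may not have been allocated yet for
+  // this layer (buffers are (re)allocated lazily in av1_denoiser_realloc_svc).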
+ if (denoiser->running_avg_y[frame].buffer_alloc == NULL) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + if (motion_magnitude > (noise_motion_thresh(bs, increase_denoising) << 3)) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + + // We will restore these after motion compensation. + for (i = 0; i < MAX_MB_PLANE; ++i) { + saved_pre[i] = filter_mbd->plane[i].pre[0]; + saved_dst[i] = filter_mbd->plane[i].dst; + } + + // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser + // struct. + set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); + av1_setup_pre_planes(filter_mbd, 0, &(denoiser->running_avg_y[frame]), mi_row, + mi_col, filter_mbd->block_ref_scale_factors[0], 1); + av1_setup_dst_planes(filter_mbd->plane, bs, + &(denoiser->mc_running_avg_y[denoise_layer_idx]), mi_row, + mi_col, 0, 1); + + av1_enc_build_inter_predictor_y(filter_mbd, mi_row, mi_col); + + // Restore everything to its original state + *mi = saved_mi; + for (i = 0; i < MAX_MB_PLANE; ++i) { + filter_mbd->plane[i].pre[0] = saved_pre[i]; + filter_mbd->plane[i].dst = saved_dst[i]; + } + + return FILTER_BLOCK; +} + +void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, + BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, + AV1_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref) { + int mv_col, mv_row; + int motion_magnitude = 0; + int zeromv_filter = 0; + AV1_DENOISER *denoiser = &cpi->denoiser; + AV1_DENOISER_DECISION decision = COPY_BLOCK; + + const int shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? denoiser->num_ref_frames + : 0; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift]; + const int denoise_layer_index = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index]; + uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + + uint8_t *mc_avg_start = + block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); + struct buf_2d src = mb->plane[0].src; + int increase_denoising = 0; + int last_is_reference = cpi->ref_frame_flags & AOM_LAST_FLAG; + mv_col = ctx->best_sse_mv.as_mv.col; + mv_row = ctx->best_sse_mv.as_mv.row; + motion_magnitude = mv_row * mv_row + mv_col * mv_col; + + if (denoiser->denoising_level == kDenHigh) increase_denoising = 1; + + // Copy block if LAST_FRAME is not a reference. + // Last doesn't always exist when SVC layers are dynamically changed, e.g. top + // spatial layer doesn't have last reference when it's brought up for the + // first time on the fly. 
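+  // Outcome of the decision below: FILTER_BLOCK writes the denoised running
+  // average back into the source block, while COPY_BLOCK refreshes the running
+  // average from the (unfiltered) source instead.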
+ if (last_is_reference && denoiser->denoising_level >= kDenLow && + !ctx->sb_skip_denoising) + decision = perform_motion_compensation( + &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, + motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers, + cpi->source->y_width, cpi->ppi->rtc_ref.ref_idx[0], + cpi->ppi->rtc_ref.ref_idx[3], cpi->ppi->use_svc, + cpi->svc.spatial_layer_id, use_gf_temporal_ref); + + if (decision == FILTER_BLOCK) { + decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start, + mc_avg.y_stride, avg_start, avg.y_stride, + increase_denoising, bs, motion_magnitude); + } + + if (decision == FILTER_BLOCK) { + aom_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, + block_size_wide[bs], block_size_high[bs]); + } else { // COPY_BLOCK + aom_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, + block_size_wide[bs], block_size_high[bs]); + } + *denoiser_decision = decision; + if (decision == FILTER_BLOCK && zeromv_filter == 1) + *denoiser_decision = FILTER_ZEROMV_BLOCK; +} + +static void copy_frame(YV12_BUFFER_CONFIG *const dest, + const YV12_BUFFER_CONFIG *const src) { + int r; + const uint8_t *srcbuf = src->y_buffer; + uint8_t *destbuf = dest->y_buffer; + + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + + for (r = 0; r < dest->y_height; ++r) { + memcpy(destbuf, srcbuf, dest->y_width); + destbuf += dest->y_stride; + srcbuf += src->y_stride; + } +} + +static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, + YV12_BUFFER_CONFIG *const src) { + uint8_t *tmp_buf = dest->y_buffer; + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + dest->y_buffer = src->y_buffer; + src->y_buffer = tmp_buf; +} + +void av1_denoiser_update_frame_info( + AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref, + struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame, + int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx, int resized, + int svc_refresh_denoiser_buffers, int second_spatial_layer) { + const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; + // Copy source into denoised reference buffers on KEY_FRAME or + // if the just encoded frame was resized. For SVC, copy source if the base + // spatial layer was key frame. + if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset || + svc_refresh_denoiser_buffers) { + int i; + // Start at 1 so as not to overwrite the INTRA_FRAME + for (i = 1; i < denoiser->num_ref_frames; ++i) { + if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) + copy_frame(&denoiser->running_avg_y[i + shift], &src); + } + denoiser->reset = 0; + return; + } + + if (rtc_ref->set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) + copy_frame(&denoiser->running_avg_y[i + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } else { + // If more than one refresh occurs, must copy frame buffer. 
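+    // A buffer swap is only safe when exactly one slot takes the new denoised
+    // frame; with multiple refreshes each refreshed slot needs its own copy of
+    // the INTRA_FRAME buffer.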
+ if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > + 1) { + if (refresh_alt_ref_frame) { + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } else { + if (refresh_alt_ref_frame) { + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } + } +} + +void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { + ctx->zeromv_sse = INT64_MAX; + ctx->newmv_sse = INT64_MAX; + ctx->zeromv_lastref_sse = INT64_MAX; + ctx->best_sse_mv.as_int = 0; +} + +void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse, + PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx) { + if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) { + ctx->zeromv_sse = sse; + ctx->best_zeromv_reference_frame = mi->ref_frame[0]; + if (mi->ref_frame[0] == LAST_FRAME) ctx->zeromv_lastref_sse = sse; + } + + if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) { + ctx->newmv_sse = sse; + ctx->best_sse_inter_mode = mode; + ctx->best_sse_mv = mi->mv[0]; + ctx->best_reference_frame = mi->ref_frame[0]; + } +} + +static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm, + AV1_DENOISER *denoiser, int fb_idx) { + int fail = 0; + if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { + fail = aom_alloc_frame_buffer( + &denoiser->running_avg_y[fb_idx], cm->width, cm->height, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, 0, 0); + if (fail) { + av1_denoiser_free(denoiser); + return 1; + } + } + return 0; +} + +int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser, + struct RTC_REF *rtc_ref, struct SVC *svc, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx) { + int fail = 0; + if (rtc_ref->set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (cm->current_frame.frame_type == KEY_FRAME || + rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) { + fail = av1_denoiser_realloc_svc_helper(cm, denoiser, + i + 1 + svc_buf_shift); + } + } + } else { + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in + // the denoiser. 
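+      // (Index 0 of running_avg_y is reserved for the current denoised
+      // INTRA_FRAME buffer, so reference slot i maps to i + 1.)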
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = av1_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = av1_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + } + return 0; +} + +int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, int use_highbitdepth, int border) { + int i, layer, fail, init_num_ref_frames; + const int legacy_byte_alignment = 0; + int num_layers = 1; + int scaled_width = width; + int scaled_height = height; + if (use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + av1_get_layer_resolution(width, height, lc->scaling_factor_num, + lc->scaling_factor_den, &scaled_width, + &scaled_height); + // For SVC: only denoise at most 2 spatial (highest) layers. + if (noise_sen >= 2) + // Denoise from one spatial layer below the top. + svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 2, 0); + else + // Only denoise the top spatial layer. + svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 1, 0); + num_layers = svc->number_spatial_layers - svc->first_layer_denoise; + } + assert(denoiser != NULL); + denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; + init_num_ref_frames = use_svc ? REF_FRAMES : NONSVC_REF_FRAMES; + denoiser->num_layers = num_layers; + CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + aom_calloc(denoiser->num_ref_frames * num_layers, + sizeof(denoiser->running_avg_y[0]))); + CHECK_MEM_ERROR( + cm, denoiser->mc_running_avg_y, + aom_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); + + for (layer = 0; layer < num_layers; ++layer) { + const int denoise_width = (layer == 0) ? width : scaled_width; + const int denoise_height = (layer == 0) ? height : scaled_height; + for (i = 0; i < init_num_ref_frames; ++i) { + fail = aom_alloc_frame_buffer( + &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], + denoise_width, denoise_height, ssx, ssy, use_highbitdepth, border, + legacy_byte_alignment, 0, 0); + if (fail) { + av1_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } + + fail = aom_alloc_frame_buffer( + &denoiser->mc_running_avg_y[layer], denoise_width, denoise_height, ssx, + ssy, use_highbitdepth, border, legacy_byte_alignment, 0, 0); + if (fail) { + av1_denoiser_free(denoiser); + return 1; + } + } + + // denoiser->last_source only used for noise_estimation, so only for top + // layer. 
+ fail = aom_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, + use_highbitdepth, border, legacy_byte_alignment, + 0, 0); + if (fail) { + av1_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + denoiser->frame_buffer_initialized = 1; + denoiser->denoising_level = kDenMedium; + denoiser->prev_denoising_level = kDenMedium; + denoiser->reset = 0; + denoiser->current_denoiser_frame = 0; + return 0; +} + +void av1_denoiser_free(AV1_DENOISER *denoiser) { + int i; + if (denoiser == NULL) { + return; + } + denoiser->frame_buffer_initialized = 0; + for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { + aom_free_frame_buffer(&denoiser->running_avg_y[i]); + } + aom_free(denoiser->running_avg_y); + denoiser->running_avg_y = NULL; + + for (i = 0; i < denoiser->num_layers; ++i) { + aom_free_frame_buffer(&denoiser->mc_running_avg_y[i]); + } + + aom_free(denoiser->mc_running_avg_y); + denoiser->mc_running_avg_y = NULL; + aom_free_frame_buffer(&denoiser->last_source); +} + +// TODO(kyslov) Enable when SVC temporal denosing is implemented +#if 0 +static void force_refresh_longterm_ref(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // If long term reference is used, force refresh of that slot, so + // denoiser buffer for long term reference stays in sync. + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->refresh_alt_ref_frame = 1; + } +} +#endif + +void av1_denoiser_set_noise_level(AV1_COMP *const cpi, int noise_level) { + AV1_DENOISER *const denoiser = &cpi->denoiser; + denoiser->denoising_level = noise_level; + if (denoiser->denoising_level > kDenLowLow && + denoiser->prev_denoising_level == kDenLowLow) { + denoiser->reset = 1; +// TODO(kyslov) Enable when SVC temporal denosing is implemented +#if 0 + force_refresh_longterm_ref(cpi); +#endif + } else { + denoiser->reset = 0; + } + denoiser->prev_denoising_level = denoiser->denoising_level; +} + +// Scale/increase the partition threshold +// for denoiser speed-up. +int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level, + CONTENT_STATE_SB content_state, + int temporal_layer_id) { + if ((content_state.source_sad_nonrd <= kLowSad && + content_state.low_sumdiff) || + (content_state.source_sad_nonrd == kHighSad && + content_state.low_sumdiff) || + (content_state.lighting_change && !content_state.low_sumdiff) || + (noise_level == kDenHigh) || (temporal_layer_id != 0)) { + int64_t scaled_thr = + (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2; + return scaled_thr; + } else { + return (5 * threshold) >> 2; + } +} + +// Scale/increase the ac skip threshold for +// denoiser speed-up. +int64_t av1_scale_acskip_thresh(int64_t threshold, + AV1_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id) { + if (noise_level >= kDenLow && abs_sumdiff < 5) + threshold *= (noise_level == kDenLow) ? 2 + : (temporal_layer_id == 2) ? 
10 + : 6; + return threshold; +} + +void av1_denoiser_reset_on_first_frame(AV1_COMP *const cpi) { + if (/*av1_denoise_svc_non_key(cpi) &&*/ + cpi->denoiser.current_denoiser_frame == 0) { + cpi->denoiser.reset = 1; +// TODO(kyslov) Enable when SVC temporal denosing is implemented +#if 0 + force_refresh_longterm_ref(cpi); +#endif + } +} + +void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + SVC *const svc = &cpi->svc; + + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->denoiser.denoising_level > kDenLowLow) { + int svc_refresh_denoiser_buffers = 0; + int denoise_svc_second_layer = 0; + FRAME_TYPE frame_type = cm->current_frame.frame_type == INTRA_ONLY_FRAME + ? KEY_FRAME + : cm->current_frame.frame_type; + cpi->denoiser.current_denoiser_frame++; + const int resize_pending = is_frame_resize_pending(cpi); + + if (cpi->ppi->use_svc) { +// TODO(kyslov) Enable when SVC temporal denosing is implemented +#if 0 + const int svc_buf_shift = + svc->number_spatial_layers - svc->spatial_layer_id == 2 + ? cpi->denoiser.num_ref_frames + : 0; + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc_refresh_denoiser_buffers = + lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id]; + denoise_svc_second_layer = + svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0; + // Check if we need to allocate extra buffers in the denoiser + // for refreshed frames. + if (av1_denoiser_realloc_svc(cm, &cpi->denoiser, rtc_ref, + svc, svc_buf_shift, + cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); +#endif + } + av1_denoiser_update_frame_info( + &cpi->denoiser, *cpi->source, rtc_ref, svc, frame_type, + cpi->refresh_frame.alt_ref_frame, cpi->refresh_frame.golden_frame, 1, + rtc_ref->ref_idx[6], rtc_ref->ref_idx[3], rtc_ref->ref_idx[0], + resize_pending, svc_refresh_denoiser_buffers, denoise_svc_second_layer); + } +} + +#ifdef OUTPUT_YUV_DENOISED +static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { + int r, c; + uint8_t *u = yuv->u_buffer; + uint8_t *v = yuv->v_buffer; + + for (r = 0; r < yuv->uv_height; ++r) { + for (c = 0; c < yuv->uv_width; ++c) { + u[c] = UINT8_MAX / 2; + v[c] = UINT8_MAX / 2; + } + u += yuv->uv_stride; + v += yuv->uv_stride; + } +} + +void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { + unsigned char *src = s->y_buffer; + int h = s->y_crop_height; + + do { + fwrite(src, s->y_width, 1, yuv_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); +} +#endif diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.h b/third_party/aom/av1/encoder/av1_temporal_denoiser.h new file mode 100644 index 0000000000..14dcccce69 --- /dev/null +++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_ +#define AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_ + +#include "av1/encoder/block.h" +#include "aom_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MOTION_MAGNITUDE_THRESHOLD (8 * 3) + +// Denoiser is used in non svc real-time mode which does not use alt-ref, so no +// need to allocate for it, and hence we need MAX_REF_FRAME - 1 +#define NONSVC_REF_FRAMES REF_FRAMES - 1 + +// Number of frame buffers when SVC is used. [0] for current denoised buffer and +// [1..8] for REF_FRAMES +#define SVC_REF_FRAMES 9 + +typedef enum av1_denoiser_decision { + COPY_BLOCK, + FILTER_BLOCK, + FILTER_ZEROMV_BLOCK +} AV1_DENOISER_DECISION; + +typedef enum av1_denoiser_level { + kDenLowLow, + kDenLow, + kDenMedium, + kDenHigh +} AV1_DENOISER_LEVEL; + +typedef struct av1_denoiser { + YV12_BUFFER_CONFIG *running_avg_y; + YV12_BUFFER_CONFIG *mc_running_avg_y; + YV12_BUFFER_CONFIG last_source; + int frame_buffer_initialized; + int reset; + int num_ref_frames; + int num_layers; + unsigned int current_denoiser_frame; + AV1_DENOISER_LEVEL denoising_level; + AV1_DENOISER_LEVEL prev_denoising_level; +} AV1_DENOISER; + +typedef struct { + int64_t zero_last_cost_orig; + unsigned int *ref_frame_cost; + int_mv (*frame_mv)[REF_FRAMES]; + int reuse_inter_pred; + TX_SIZE best_tx_size; + PREDICTION_MODE best_mode; + MV_REFERENCE_FRAME best_ref_frame; + int_interpfilters best_pred_filter; + uint8_t best_mode_skip_txfm; +} AV1_PICKMODE_CTX_DEN; + +struct AV1_COMP; +struct SVC; +struct RTC_REF; + +void av1_denoiser_update_frame_info( + AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref, + struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame, + int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx, int resized, + int svc_refresh_denoiser_buffers, int second_spatial_layer); + +void av1_denoiser_denoise(struct AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, + AV1_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref); + +void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); + +void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse, + PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx); + +int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser, + struct RTC_REF *rtc, struct SVC *svc, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx); + +int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, int use_highbitdepth, int border); + +#if CONFIG_AV1_TEMPORAL_DENOISING +// This function is used by both c and sse2 denoiser implementations. +// Define it as a static function within the scope where av1_denoiser.h +// is referenced. 
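+// For example, a 16x16 block has 1 << 8 = 256 pels, so the threshold below
+// works out to 256 * 3 = 768 when increase_denoising is set and 512 otherwise.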
+static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs, + int increase_denoising) { + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2); +} +#endif + +void av1_denoiser_free(AV1_DENOISER *denoiser); + +void av1_denoiser_set_noise_level(struct AV1_COMP *const cpi, int noise_level); + +void av1_denoiser_reset_on_first_frame(struct AV1_COMP *const cpi); + +int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level, + CONTENT_STATE_SB content_state, + int temporal_layer_id); + +int64_t av1_scale_acskip_thresh(int64_t threshold, + AV1_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id); + +void av1_denoiser_update_ref_frame(struct AV1_COMP *const cpi); + +void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_ diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c new file mode 100644 index 0000000000..219784fedf --- /dev/null +++ b/third_party/aom/av1/encoder/bitstream.c @@ -0,0 +1,4248 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom/aom_encoder.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_dsp/bitwriter_buffer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" +#include "aom_ports/mem_ops.h" +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#include "av1/common/cdef.h" +#include "av1/common/cfl.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" +#include "av1/common/tile_common.h" + +#include "av1/encoder/bitstream.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/palette.h" +#include "av1/encoder/pickrst.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/tokenize.h" + +#define ENC_MISMATCH_DEBUG 0 +#define SETUP_TIME_OH_CONST 5 // Setup time overhead constant per worker +#define JOB_DISP_TIME_OH_CONST 1 // Job dispatch time overhead per tile + +static INLINE void write_uniform(aom_writer *w, int n, int v) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (l == 0) return; + if (v < m) { + aom_write_literal(w, v, l - 1); + } else { + aom_write_literal(w, m + ((v - m) >> 1), l - 1); + aom_write_literal(w, (v - m) & 1, 1); + } +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void loop_restoration_write_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx, + aom_writer *const w, int plane, FRAME_COUNTS *counts); +#endif + +static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx, + const 
MB_MODE_INFO *mi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, + PREDICTION_MODE mode, + aom_writer *w) { + assert(!is_intrabc_block(mi)); + (void)mi; + aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi), + INTRA_MODES); +} + +static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode, + FRAME_CONTEXT *ec_ctx, + const int16_t mode_ctx) { + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + + aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2); + + if (mode != NEWMV) { + const int16_t zeromv_ctx = + (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2); + + if (mode != GLOBALMV) { + int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2); + } + } +} + +static AOM_INLINE void write_drl_idx( + FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { + assert(mbmi->ref_mv_idx < 3); + + const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; + if (new_mv) { + int idx; + for (idx = 0; idx < 2; ++idx) { + if (mbmi_ext_frame->ref_mv_count > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx); + + aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx], + 2); + if (mbmi->ref_mv_idx == idx) return; + } + } + return; + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + int idx; + // TODO(jingning): Temporary solution to compensate the NEARESTMV offset. + for (idx = 1; idx < 3; ++idx) { + if (mbmi_ext_frame->ref_mv_count > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx); + aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1), + ec_ctx->drl_cdf[drl_ctx], 2); + if (mbmi->ref_mv_idx == (idx - 1)) return; + } + } + return; + } +} + +static AOM_INLINE void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w, + PREDICTION_MODE mode, + const int16_t mode_ctx) { + assert(is_inter_compound_mode(mode)); + aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode), + xd->tile_ctx->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_MODES); +} + +static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, + int blk_row, int blk_col, + aom_writer *w) { + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + const int max_blocks_high = max_block_high(xd, mbmi->bsize, 0); + const int max_blocks_wide = max_block_wide(xd, mbmi->bsize, 0); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (depth == MAX_VARTX_DEPTH) { + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + return; + } + + const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, + mbmi->bsize, tx_size); + const int txb_size_index = + av1_get_txb_size_index(mbmi->bsize, blk_row, blk_col); + const int write_txfm_partition = + tx_size == mbmi->inter_tx_size[txb_size_index]; + if (write_txfm_partition) { + aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2); + + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + // TODO(yuec): set correct txfm partition update for qttx + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + + aom_write_symbol(w, 1, 
ec_ctx->txfm_partition_cdf[ctx], 2); + + if (sub_txs == TX_4X4) { + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, sub_txs, tx_size); + return; + } + + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetc = blk_col + col; + write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w); + } + } + } +} + +static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + if (block_signals_txsize(bsize)) { + const TX_SIZE tx_size = mbmi->tx_size; + const int tx_size_ctx = get_tx_size_context(xd); + const int depth = tx_size_to_depth(tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + + assert(depth >= 0 && depth <= max_depths); + assert(!is_inter_block(mbmi)); + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); + + aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + max_depths + 1); + } +} + +static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, + uint8_t segment_id, const MB_MODE_INFO *mi, + aom_writer *w) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int skip_txfm = mi->skip_txfm; + const int ctx = av1_get_skip_txfm_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, skip_txfm, ec_ctx->skip_txfm_cdfs[ctx], 2); + return skip_txfm; + } +} + +static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd, + uint8_t segment_id, const MB_MODE_INFO *mi, + aom_writer *w) { + if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + const int skip_mode = mi->skip_mode; + if (!is_comp_ref_allowed(mi->bsize)) { + assert(!skip_mode); + return 0; + } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. + assert(!skip_mode); + return 0; + } + const int ctx = av1_get_skip_mode_context(xd); + aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2); + return skip_mode; +} + +static AOM_INLINE void write_is_inter(const AV1_COMMON *cm, + const MACROBLOCKD *xd, uint8_t segment_id, + aom_writer *w, const int is_inter) { + if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(is_inter); + return; + } + const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2); + } +} + +static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + aom_writer *w) { + MOTION_MODE last_motion_mode_allowed = + cm->features.switchable_motion_mode + ? 
motion_mode_allowed(cm->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + assert(mbmi->motion_mode <= last_motion_mode_allowed); + switch (last_motion_mode_allowed) { + case SIMPLE_TRANSLATION: break; + case OBMC_CAUSAL: + aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, + xd->tile_ctx->obmc_cdf[mbmi->bsize], 2); + break; + default: + aom_write_symbol(w, mbmi->motion_mode, + xd->tile_ctx->motion_mode_cdf[mbmi->bsize], + MOTION_MODES); + } +} + +static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd, + int delta_qindex, aom_writer *w) { + int sign = delta_qindex < 0; + int abs = sign ? -delta_qindex : delta_qindex; + int rem_bits, thr; + int smallval = abs < DELTA_Q_SMALL ? 1 : 0; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf, + DELTA_Q_PROBS + 1); + + if (!smallval) { + rem_bits = get_msb(abs - 1); + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits - 1, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm, + const MACROBLOCKD *xd, int lf_id, + int delta_lflevel, + int delta_lf_multi, aom_writer *w) { + int sign = delta_lflevel < 0; + int abs = sign ? -delta_lflevel : delta_lflevel; + int rem_bits, thr; + int smallval = abs < DELTA_LF_SMALL ? 1 : 0; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + (void)cm; + + if (delta_lf_multi) { + assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT + : FRAME_LF_COUNT - 2)); + aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), + ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1); + } else { + aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, + DELTA_LF_PROBS + 1); + } + + if (!smallval) { + rem_bits = get_msb(abs - 1); + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits - 1, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +static AOM_INLINE void pack_map_tokens(aom_writer *w, const TokenExtra **tp, + int n, int num, MapCdf map_pb_cdf) { + const TokenExtra *p = *tp; + const int palette_size_idx = n - PALETTE_MIN_SIZE; + write_uniform(w, n, p->token); // The first color index. + ++p; + --num; + for (int i = 0; i < num; ++i) { + assert((p->color_ctx >= 0) && + (p->color_ctx < PALETTE_COLOR_INDEX_CONTEXTS)); + aom_cdf_prob *color_map_cdf = map_pb_cdf[palette_size_idx][p->color_ctx]; + aom_write_symbol(w, p->token, color_map_cdf, n); + ++p; + } + *tp = p; +} + +static AOM_INLINE void pack_txb_tokens( + aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TokenExtra **tp, + const TokenExtra *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block, + int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) { + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE plane_tx_size = + plane ? 
av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + + if (tx_size == plane_tx_size || plane) { + av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block, tx_size); +#if CONFIG_RD_DEBUG + TOKEN_STATS tmp_token_stats; + init_token_stats(&tmp_token_stats); + token_stats->cost += tmp_token_stats.cost; +#endif + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + + assert(bsw > 0 && bsh > 0); + + for (int r = 0; r < row_end; r += bsh) { + const int offsetr = blk_row + r; + for (int c = 0; c < col_end; c += bsw) { + const int offsetc = blk_col + c; + pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize, + bit_depth, block, offsetr, offsetc, sub_txs, + token_stats); + block += step; + } + } + } +} + +static INLINE void set_spatial_segment_id( + const CommonModeInfoParams *const mi_params, uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, int mi_col, uint8_t segment_id) { + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); + + const int mi_stride = mi_params->mi_cols; + + set_segment_id(segment_ids, mi_offset, xmis, ymis, mi_stride, segment_id); +} + +int av1_neg_interleave(int x, int ref, int max) { + assert(x < max); + const int diff = x - ref; + if (!ref) return x; + if (ref >= (max - 1)) return -x + max - 1; + if (2 * ref < max) { + if (abs(diff) <= ref) { + if (diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return x; + } else { + if (abs(diff) < (max - ref)) { + if (diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return (max - x) - 1; + } +} + +static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, + const MB_MODE_INFO *const mbmi, + aom_writer *w, + const struct segmentation *seg, + struct segmentation_probs *segp, + int skip_txfm) { + if (!seg->enabled || !seg->update_map) return; + + AV1_COMMON *const cm = &cpi->common; + int cdf_num; + const uint8_t pred = av1_get_spatial_seg_pred( + cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (skip_txfm) { + // Still need to transmit tx size for intra blocks even if skip_txfm is + // true. Changing segment_id may make the tx size become invalid, e.g + // changing from lossless to lossy. 
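+    // Nothing is signalled in this case: the spatially predicted segment id
+    // is adopted instead, so the encoder-side maps and mbmi->segment_id are
+    // updated below to stay in sync with what the decoder will infer.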
+ assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment); + + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize, + mi_row, mi_col, pred); + set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->bsize, + mi_row, mi_col, pred); + /* mbmi is read only but we need to update segment_id */ + ((MB_MODE_INFO *)mbmi)->segment_id = pred; + return; + } + + const int coded_id = + av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1); + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS); + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize, + mi_row, mi_col, mbmi->segment_id); +} + +#define WRITE_REF_BIT(bname, pname) \ + aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2) + +// This function encodes the reference frame +static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm, + const MACROBLOCKD *xd, aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + const uint8_t segment_id = mbmi->segment_id; + + // If segment level coding of this signal is disabled... + // or the segment allows multiple reference frame options + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == + get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == LAST_FRAME); + } else { + // does the feature use compound prediction or not + // (if not specified at the frame/segment level) + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + if (is_comp_ref_allowed(mbmi->bsize)) + aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2); + } else { + assert((!is_compound) == + (cm->current_frame.reference_mode == SINGLE_REFERENCE)); + } + + if (is_compound) { + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? 
UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; + aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd), + 2); + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = mbmi->ref_frame[0] == BWDREF_FRAME; + WRITE_REF_BIT(bit, uni_comp_ref_p); + + if (!bit) { + assert(mbmi->ref_frame[0] == LAST_FRAME); + const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || + mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit1, uni_comp_ref_p1); + if (bit1) { + const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, uni_comp_ref_p2); + } + } else { + assert(mbmi->ref_frame[1] == ALTREF_FRAME); + } + + return; + } + + assert(comp_ref_type == BIDIR_COMP_REFERENCE); + + const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME); + WRITE_REF_BIT(bit, comp_ref_p); + + if (!bit) { + const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME; + WRITE_REF_BIT(bit1, comp_ref_p1); + } else { + const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, comp_ref_p2); + } + + const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; + WRITE_REF_BIT(bit_bwd, comp_bwdref_p); + + if (!bit_bwd) { + WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1); + } + + } else { + const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME && + mbmi->ref_frame[0] >= BWDREF_FRAME); + WRITE_REF_BIT(bit0, single_ref_p1); + + if (bit0) { + const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; + WRITE_REF_BIT(bit1, single_ref_p2); + + if (!bit1) { + WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6); + } + } else { + const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[0] == GOLDEN_FRAME); + WRITE_REF_BIT(bit2, single_ref_p3); + + if (!bit2) { + const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; + WRITE_REF_BIT(bit3, single_ref_p4); + } else { + const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; + WRITE_REF_BIT(bit4, single_ref_p5); + } + } + } + } +} + +static AOM_INLINE void write_filter_intra_mode_info( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + aom_writer *w) { + if (av1_filter_intra_allowed(cm, mbmi)) { + aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra, + xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2); + if (mbmi->filter_intra_mode_info.use_filter_intra) { + const FILTER_INTRA_MODE mode = + mbmi->filter_intra_mode_info.filter_intra_mode; + aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf, + FILTER_INTRA_MODES); + } + } +} + +static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta, + aom_cdf_prob *cdf) { + aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf, + 2 * MAX_ANGLE_DELTA + 1); +} + +static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm, + ThreadData *td, aom_writer *w) { + const MACROBLOCKD *xd = &td->mb.e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!av1_is_interp_needed(xd)) { + int_interpfilters filters = av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->features.interp_filter)); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + return; + } + if (cm->features.interp_filter == SWITCHABLE) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], + SWITCHABLE_FILTERS); + ++td->interp_filter_selected[filter]; + 
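+      // With dual_filter disabled a single shared filter applies to both
+      // directions, so only the first loop iteration signals anything.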
if (cm->seq_params->enable_dual_filter == 0) return; + } + } +} + +// Transmit color values with delta encoding. Write the first value as +// literal, and the deltas between each value and the previous one. "min_val" is +// the smallest possible value of the deltas. +static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num, + int bit_depth, int min_val, + aom_writer *w) { + if (num <= 0) return; + assert(colors[0] < (1 << bit_depth)); + aom_write_literal(w, colors[0], bit_depth); + if (num == 1) return; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + memset(deltas, 0, sizeof(deltas)); + for (int i = 1; i < num; ++i) { + assert(colors[i] < (1 << bit_depth)); + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + const int min_bits = bit_depth - 3; + int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; + aom_write_literal(w, bits - min_bits, 2); + for (int i = 0; i < num - 1; ++i) { + aom_write_literal(w, deltas[i] - min_val, bits); + range -= deltas[i]; + bits = AOMMIN(bits, av1_ceil_log2(range)); + } +} + +// Transmit luma palette color values. First signal if each color in the color +// cache is used. Those colors that are not in the cache are transmitted with +// delta encoding. +static AOM_INLINE void write_palette_colors_y( + const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[0]; + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + assert(n_in_cache + n_out_cache == n); + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w); +} + +// Write chroma palette color values. U channel is handled similarly to the luma +// channel. For v channel, either use delta encoding or transmit raw values +// directly, whichever costs less. +static AOM_INLINE void write_palette_colors_uv( + const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[1]; + const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; + const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; + // U channel colors. + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w); + + // V channel colors. Don't use color cache as the colors are not sorted. 
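+  // Worked example: with bit_depth 8, n = 4 and bits_v = 3, delta coding
+  // costs 2 + 8 + (3 + 1) * 3 = 22 bits (one bit less for every zero delta),
+  // versus 8 * 4 = 32 bits for raw values, so delta coding would be chosen.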
+ const int max_val = 1 << bit_depth; + int zero_count = 0, min_bits_v = 0; + int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int rate_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int rate_using_raw = bit_depth * n; + if (rate_using_delta < rate_using_raw) { // delta encoding + assert(colors_v[0] < (1 << bit_depth)); + aom_write_bit(w, 1); + aom_write_literal(w, bits_v - min_bits_v, 2); + aom_write_literal(w, colors_v[0], bit_depth); + for (int i = 1; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); + if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit. + aom_write_literal(w, 0, bits_v); + continue; + } + const int delta = abs((int)colors_v[i] - colors_v[i - 1]); + const int sign_bit = colors_v[i] < colors_v[i - 1]; + if (delta <= max_val - delta) { + aom_write_literal(w, delta, bits_v); + aom_write_bit(w, sign_bit); + } else { + aom_write_literal(w, max_val - delta, bits_v); + aom_write_bit(w, !sign_bit); + } + } + } else { // Transmit raw values. + aom_write_bit(w, 0); + for (int i = 0; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); + aom_write_literal(w, colors_v[i], bit_depth); + } + } +} + +static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm, + const MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + aom_writer *w) { + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE bsize = mbmi->bsize; + assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd); + aom_write_symbol( + w, n > 0, + xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2); + if (n > 0) { + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_y_size_cdf[bsize_ctx], + PALETTE_SIZES); + write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w); + } + } + + const int uv_dc_pred = + num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref; + if (uv_dc_pred) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + aom_write_symbol(w, n > 0, + xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2); + if (n > 0) { + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], + PALETTE_SIZES); + write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w); + } + } +} + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, + TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const FeatureFlags *const features = &cm->features; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 && + ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip_txfm && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter, features->reduced_tx_set_used); + const int eset = + get_ext_tx_set(tx_size, is_inter, features->reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and there + // is no need to send the tx_type + 
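+    // For inter blocks the symbol context below depends only on the tx set
+    // and square tx size; for intra blocks it additionally depends on the
+    // (filter-)intra prediction direction.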
assert(eset > 0); + assert(av1_ext_tx_used[tx_set_type][tx_type]); + if (is_inter) { + aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type]); + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = + fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; + else + intra_dir = mbmi->mode; + aom_write_symbol( + w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir], + av1_num_ext_tx_set[tx_set_type]); + } + } +} + +static AOM_INLINE void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, + BLOCK_SIZE bsize, + PREDICTION_MODE mode, + aom_writer *w) { + aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]], + INTRA_MODES); +} + +static AOM_INLINE void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, + UV_PREDICTION_MODE uv_mode, + PREDICTION_MODE y_mode, + CFL_ALLOWED_TYPE cfl_allowed, + aom_writer *w) { + aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed); +} + +static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, + uint8_t idx, int8_t joint_sign, + aom_writer *w) { + aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS); + // Magnitudes are only signaled for nonzero codes. + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + aom_write_symbol(w, CFL_IDX_U(idx), cdf_u, CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE); + } +} + +static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, + aom_writer *w, int skip) { + if (cm->features.coded_lossless || cm->features.allow_intrabc) return; + + // At the start of a superblock, mark that we haven't yet written CDEF + // strengths for any of the CDEF units contained in this superblock. + const int sb_mask = (cm->seq_params->mib_size - 1); + const int mi_row_in_sb = (xd->mi_row & sb_mask); + const int mi_col_in_sb = (xd->mi_col & sb_mask); + if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { + xd->cdef_transmitted[0] = xd->cdef_transmitted[1] = + xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false; + } + + // CDEF unit size is 64x64 irrespective of the superblock size. + const int cdef_size = 1 << (6 - MI_SIZE_LOG2); + + // Find index of this CDEF unit in this superblock. + const int index_mask = cdef_size; + const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); + const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); + const int index = (cm->seq_params->sb_size == BLOCK_128X128) + ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb + : 0; + + // Write CDEF strength to the first non-skip coding block in this CDEF unit. + if (!xd->cdef_transmitted[index] && !skip) { + // CDEF strength for this CDEF unit needs to be stored in the MB_MODE_INFO + // of the 1st block in this CDEF unit. 
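+    // Illustration: with 4x4 mi units cdef_size is 1 << 4 = 16, so the mask
+    // below (~15) rounds mi_row/mi_col down to the top-left block of the
+    // containing 64x64 CDEF unit.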
+ const int first_block_mask = ~(cdef_size - 1); + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int grid_idx = + get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask, + xd->mi_col & first_block_mask); + const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx]; + aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits); + xd->cdef_transmitted[index] = true; + } +} + +static AOM_INLINE void write_inter_segment_id( + AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w, + const struct segmentation *const seg, struct segmentation_probs *const segp, + int skip, int preskip) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + AV1_COMMON *const cm = &cpi->common; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (seg->update_map) { + if (preskip) { + if (!seg->segid_preskip) return; + } else { + if (seg->segid_preskip) return; + if (skip) { + write_segment_id(cpi, xd, mbmi, w, seg, segp, 1); + if (seg->temporal_update) mbmi->seg_id_predicted = 0; + return; + } + } + if (seg->temporal_update) { + const int pred_flag = mbmi->seg_id_predicted; + aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); + aom_write_symbol(w, pred_flag, pred_cdf, 2); + if (!pred_flag) { + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); + } + if (pred_flag) { + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, + mbmi->bsize, mi_row, mi_col, mbmi->segment_id); + } + } else { + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); + } + } +} + +// If delta q is present, writes delta_q index. +// Also writes delta_q loop filter levels, if present. +static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm, + MACROBLOCKD *const xd, int skip, + aom_writer *w) { + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + + if (delta_q_info->delta_q_present_flag) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + const int super_block_upper_left = + ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); + + if ((bsize != cm->seq_params->sb_size || skip == 0) && + super_block_upper_left) { + assert(mbmi->current_qindex > 0); + const int reduced_delta_qindex = + (mbmi->current_qindex - xd->current_base_qindex) / + delta_q_info->delta_q_res; + write_delta_qindex(xd, reduced_delta_qindex, w); + xd->current_base_qindex = mbmi->current_qindex; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + int reduced_delta_lflevel = + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / + delta_q_info->delta_lf_res; + write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, 1, w); + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } + } else { + int reduced_delta_lflevel = + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / + delta_q_info->delta_lf_res; + write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, 0, w); + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + } +} + +static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm, + MACROBLOCKD *const xd, + int is_keyframe, + aom_writer *w) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->bsize; + + // Y mode. 
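+  // Key frames context the luma mode on the above/left neighbour modes;
+  // non-key frames use a cdf selected only by block-size group.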
+ if (is_keyframe) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); + } else { + write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w); + } + + // Y angle delta. + const int use_angle_delta = av1_use_angle_delta(bsize); + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + ec_ctx->angle_delta_cdf[mode - V_PRED]); + } + + // UV mode and UV angle delta. + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) + write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); + if (use_angle_delta && av1_is_directional_mode(intra_mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[intra_mode - V_PRED]); + } + } + + // Palette. + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + write_palette_mode_info(cm, xd, mbmi, w); + } + + // Filter intra. + write_filter_intra_mode_info(cm, xd, mbmi, w); +} + +static INLINE int16_t mode_context_analyzer( + const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) { + if (rf[1] <= INTRA_FRAME) return mode_context; + + const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK; + const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + + const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( + newmv_ctx, COMP_NEWMV_CTXS - 1)]; + return comp_ctx; +} + +static INLINE int_mv get_ref_mv_from_stack( + int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; + + if (ref_frame[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext_frame->ref_mv_count + ? 
curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext_frame->global_mvs[ref_frame_type]; +} + +static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + return get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + x->mbmi_ext_frame); +} + +static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame; + const PREDICTION_MODE mode = mbmi->mode; + const uint8_t segment_id = mbmi->segment_id; + const BLOCK_SIZE bsize = mbmi->bsize; + const int allow_hp = cm->features.allow_high_precision_mv; + const int is_inter = is_inter_block(mbmi); + const int is_compound = has_second_ref(mbmi); + int ref; + + write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1); + + write_skip_mode(cm, xd, segment_id, mbmi, w); + + assert(IMPLIES(mbmi->skip_mode, mbmi->skip_txfm)); + const int skip = + mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w); + + write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0); + + write_cdef(cm, xd, w, skip); + + write_delta_q_params(cm, xd, skip, w); + + if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); + + if (mbmi->skip_mode) return; + + if (!is_inter) { + write_intra_prediction_modes(cm, xd, 0, w); + } else { + int16_t mode_ctx; + + av1_collect_neighbors_ref_counts(xd); + + write_ref_frames(cm, xd, w); + + mode_ctx = + mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame); + + // If segment skip is not enabled code the mode. 
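+    // Within that block the drl (dynamic reference list) index is only coded
+    // for the new-mv modes and modes with a near-mv component; every other
+    // mode implies ref_mv_idx == 0.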
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (is_inter_compound_mode(mode)) + write_inter_compound_mode(xd, w, mode, mode_ctx); + else if (is_inter_singleref_mode(mode)) + write_inter_mode(w, mode, ec_ctx, mode_ctx); + + if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode)) + write_drl_idx(ec_ctx, mbmi, mbmi_ext_frame, w); + else + assert(mbmi->ref_mv_idx == 0); + } + + if (mode == NEWMV || mode == NEW_NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, ref); + av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, 1); + av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, 0); + av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + } + + if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE && + cpi->common.seq_params->enable_interintra_compound && + is_interintra_allowed(mbmi)) { + const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; + const int bsize_group = size_group_lookup[bsize]; + aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2); + if (interintra) { + aom_write_symbol(w, mbmi->interintra_mode, + ec_ctx->interintra_mode_cdf[bsize_group], + INTERINTRA_MODES); + if (av1_is_wedge_used(bsize)) { + aom_write_symbol(w, mbmi->use_wedge_interintra, + ec_ctx->wedge_interintra_cdf[bsize], 2); + if (mbmi->use_wedge_interintra) { + aom_write_symbol(w, mbmi->interintra_wedge_index, + ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); + } + } + } + } + + if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w); + + // First write idx to indicate current compound inter prediction mode group + // Group A (0): dist_wtd_comp, compound_average + // Group B (1): interintra, compound_diffwtd, wedge + if (has_second_ref(mbmi)) { + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params->enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + aom_write_symbol(w, mbmi->comp_group_idx, + ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2); + } else { + assert(mbmi->comp_group_idx == 0); + } + + if (mbmi->comp_group_idx == 0) { + if (mbmi->compound_idx) + assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); + + if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + aom_write_symbol(w, mbmi->compound_idx, + ec_ctx->compound_index_cdf[comp_index_ctx], 2); + } else { + assert(mbmi->compound_idx == 1); + } + } else { + assert(cpi->common.current_frame.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + // compound_diffwtd, wedge + assert(mbmi->interinter_comp.type == COMPOUND_WEDGE || + mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) + aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE, + ec_ctx->compound_type_cdf[bsize], + MASKED_COMPOUND_TYPES); + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + 
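+          // A wedge compound is described by a wedge shape index plus a one
+          // bit sign choosing which side of the partition takes which
+          // predictor.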
assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + aom_write_symbol(w, mbmi->interinter_comp.wedge_index, + ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); + aom_write_bit(w, mbmi->interinter_comp.wedge_sign); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + aom_write_literal(w, mbmi->interinter_comp.mask_type, + MAX_DIFFWTD_MASK_BITS); + } + } + } + write_mb_interp_filter(cm, td, w); + } +} + +static AOM_INLINE void write_intrabc_info( + MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int use_intrabc = is_intrabc_block(mbmi); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2); + if (use_intrabc) { + assert(mbmi->mode == DC_PRED); + assert(mbmi->uv_mode == UV_DC_PRED); + assert(mbmi->motion_mode == SIMPLE_TRANSLATION); + int_mv dv_ref = mbmi_ext_frame->ref_mv_stack[0].this_mv; + av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); + } +} + +static AOM_INLINE void write_mb_modes_kf( + AV1_COMP *cpi, MACROBLOCKD *xd, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + if (seg->segid_preskip && seg->update_map) + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); + + const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w); + + if (!seg->segid_preskip && seg->update_map) + write_segment_id(cpi, xd, mbmi, w, seg, segp, skip); + + write_cdef(cm, xd, w, skip); + + write_delta_q_params(cm, xd, skip, w); + + if (av1_allow_intrabc(cm)) { + write_intrabc_info(xd, mbmi_ext_frame, w); + if (is_intrabc_block(mbmi)) return; + } + + write_intra_prediction_modes(cm, xd, 1, w); +} + +#if CONFIG_RD_DEBUG +static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) { + printf("\nmi->mi_row == %d\n", mi->mi_row); + printf("&& mi->mi_col == %d\n", mi->mi_col); + printf("&& mi->bsize == %d\n", mi->bsize); + printf("&& mi->tx_size == %d\n", mi->tx_size); + printf("&& mi->mode == %d\n", mi->mode); +} + +static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, + int plane) { + if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { + printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", + plane, rd_stats->txb_coeff_cost[plane], token_stats->cost); + return 1; + } + return 0; +} +#endif + +#if ENC_MISMATCH_DEBUG +static AOM_INLINE void enc_dump_logs( + const AV1_COMMON *const cm, + const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) { + const MB_MODE_INFO *const mbmi = *( + cm->mi_params.mi_grid_base + (mi_row * cm->mi_params.mi_stride + mi_col)); + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = + mbmi_ext_info->frame_base + get_mi_ext_idx(mi_row, mi_col, + cm->mi_params.mi_alloc_bsize, + mbmi_ext_info->stride); + if (is_inter_block(mbmi)) { +#define FRAME_TO_CHECK 11 + if (cm->current_frame.frame_number == FRAME_TO_CHECK && + cm->show_frame == 1) { + const BLOCK_SIZE bsize = mbmi->bsize; + + int_mv mv[2] = { 0 }; + const int is_comp_ref = has_second_ref(mbmi); + + for (int ref = 0; ref < 1 + is_comp_ref; ++ref) + mv[ref].as_mv = mbmi->mv[ref].as_mv; + + if (!is_comp_ref) { + mv[1].as_int = 0; + } + + const int16_t mode_ctx = + is_comp_ref ? 
0 + : mode_context_analyzer(mbmi_ext_frame->mode_context, + mbmi->ref_frame); + + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + int16_t zeromv_ctx = -1; + int16_t refmv_ctx = -1; + + if (mbmi->mode != NEWMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) + refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + } + + printf( + "=== ENCODER ===: " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " + "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode, + mbmi->mode, bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, + mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, + zeromv_ctx, refmv_ctx, mbmi->tx_size); + } + } +} +#endif // ENC_MISMATCH_DEBUG + +static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &td->mb.e_mbd; + MB_MODE_INFO *m = xd->mi[0]; + + if (frame_is_intra_only(cm)) { + write_mb_modes_kf(cpi, xd, td->mb.mbmi_ext_frame, w); + } else { + // has_subpel_mv_component needs the ref frame buffers set up to look + // up if they are scaled. has_subpel_mv_component is in turn needed by + // write_switchable_interp_filter, which is called by pack_inter_mode_mvs. + set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]); + +#if ENC_MISMATCH_DEBUG + enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col); +#endif // ENC_MISMATCH_DEBUG + + pack_inter_mode_mvs(cpi, td, w); + } +} + +static AOM_INLINE void write_inter_txb_coeff( + AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi, + aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end, + TOKEN_STATS *token_stats, const int row, const int col, int *block, + const int plane) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = mbmi->bsize; + assert(bsize < BLOCK_SIZES_ALL); + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + const int bkw = tx_size_wide_unit[max_tx_size]; + const int bkh = tx_size_high_unit[max_tx_size]; + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, ss_x, ss_y); + const int num_4x4_w = mi_size_wide[plane_bsize]; + const int num_4x4_h = mi_size_high[plane_bsize]; + const int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + const int mu_blocks_high = mi_size_high[max_unit_bsize]; + const int unit_height = AOMMIN(mu_blocks_high + (row >> ss_y), num_4x4_h); + const int unit_width = AOMMIN(mu_blocks_wide + (col >> ss_x), num_4x4_w); + for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) { + for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) { + pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, + cm->seq_params->bit_depth, *block, blk_row, blk_col, + max_tx_size, token_stats); + *block += step; + } + } +} + +static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x, + aom_writer *w, const 
TokenExtra **tok, + const TokenExtra *const tok_end) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + + assert(!mbmi->skip_txfm); + + const int is_inter = is_inter_block(mbmi); + if (!is_inter) { + av1_write_intra_coeffs_mb(cm, x, w, bsize); + } else { + int block[MAX_MB_PLANE] = { 0 }; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + TOKEN_STATS token_stats; + init_token_stats(&token_stats); + + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + assert(max_unit_bsize == get_plane_block_size(BLOCK_64X64, + xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide); + mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high); + + const int num_planes = av1_num_planes(cm); + for (int row = 0; row < num_4x4_h; row += mu_blocks_high) { + for (int col = 0; col < num_4x4_w; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row, + col, &block[plane], plane); + } + } + } +#if CONFIG_RD_DEBUG + for (int plane = 0; plane < num_planes; ++plane) { + if (mbmi->bsize >= BLOCK_8X8 && + rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { + dump_mode_info(mbmi); + assert(0); + } + } +#endif // CONFIG_RD_DEBUG + } +} + +static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td, + const TileInfo *const tile, aom_writer *w, + const TokenExtra **tok, + const TokenExtra *const tok_end, + int mi_row, int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *xd = &td->mb.e_mbd; + FRAME_CONTEXT *tile_ctx = xd->tile_ctx; + const int grid_idx = mi_row * mi_params->mi_stride + mi_col; + xd->mi = mi_params->mi_grid_base + grid_idx; + td->mb.mbmi_ext_frame = + cpi->mbmi_ext_info.frame_base + + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, + cpi->mbmi_ext_info.stride); + xd->tx_type_map = mi_params->tx_type_map + grid_idx; + xd->tx_type_map_stride = mi_params->mi_stride; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + assert(bsize <= cm->seq_params->sb_size || + (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL)); + + const int bh = mi_size_high[bsize]; + const int bw = mi_size_wide[bsize]; + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + write_mbmi_b(cpi, td, w); + + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) { + const uint8_t palette_size_plane = + mbmi->palette_mode_info.palette_size[plane]; + assert(!mbmi->skip_mode || !palette_size_plane); + if (palette_size_plane > 0) { + assert(mbmi->use_intrabc == 0); + assert(av1_allow_palette(cm->features.allow_screen_content_tools, + mbmi->bsize)); + assert(!plane || xd->is_chroma_ref); + int rows, cols; + av1_get_block_dimensions(mbmi->bsize, plane, xd, NULL, NULL, &rows, + &cols); + assert(*tok < tok_end); + MapCdf map_pb_cdf = plane ? 
tile_ctx->palette_uv_color_index_cdf + : tile_ctx->palette_y_color_index_cdf; + pack_map_tokens(w, tok, palette_size_plane, rows * cols, map_pb_cdf); + } + } + + const int is_inter_tx = is_inter_block(mbmi); + const int skip_txfm = mbmi->skip_txfm; + const uint8_t segment_id = mbmi->segment_id; + if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + !(is_inter_tx && skip_txfm) && !xd->lossless[segment_id]) { + if (is_inter_tx) { // This implies skip flag is 0. + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0); + const int txbh = tx_size_high_unit[max_tx_size]; + const int txbw = tx_size_wide_unit[max_tx_size]; + const int width = mi_size_wide[bsize]; + const int height = mi_size_high[bsize]; + for (int idy = 0; idy < height; idy += txbh) { + for (int idx = 0; idx < width; idx += txbw) { + write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w); + } + } + } else { + write_selected_tx_size(xd, w); + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd); + } + } else { + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, + skip_txfm && is_inter_tx, xd); + } + + if (!mbmi->skip_txfm) { + int start = aom_tell_size(w); + + write_tokens_b(cpi, &td->mb, w, tok, tok_end); + + const int end = aom_tell_size(w); + td->coefficient_size += end - start; + } +} + +static AOM_INLINE void write_partition(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, + int mi_row, int mi_col, PARTITION_TYPE p, + BLOCK_SIZE bsize, aom_writer *w) { + const int is_partition_point = bsize >= BLOCK_8X8; + + if (!is_partition_point) return; + + const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols; + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!has_rows && !has_cols) { + assert(p == PARTITION_SPLIT); + return; + } + + if (has_rows && has_cols) { + aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], + partition_cdf_length(bsize)); + } else if (!has_rows && has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); + aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); + } else { + assert(has_rows && !has_cols); + assert(p == PARTITION_SPLIT || p == PARTITION_VERT); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); + aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); + } +} + +static AOM_INLINE void write_modes_sb( + AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile, + aom_writer *const w, const TokenExtra **tok, + const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *const xd = &td->mb.e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int hbs = mi_size_wide[bsize] / 2; + const int quarter_step = mi_size_wide[bsize] / 4; + int i; + const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + +#if !CONFIG_REALTIME_ONLY + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1; + + // Skip some unnecessary work if loop restoration is 
disabled + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1)) { + const int rstride = cm->rst_info[plane].horz_units; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = rcol + rrow * rstride; + loop_restoration_write_sb_coeffs(cm, xd, runit_idx, w, plane, + td->counts); + } + } + } + } +#endif + + write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); + switch (partition) { + case PARTITION_NONE: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + break; + case PARTITION_HORZ: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_row + hbs < mi_params->mi_rows) + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_col + hbs < mi_params->mi_cols) + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs, + subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col, + subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, + subsize); + break; + case PARTITION_HORZ_A: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= mi_params->mi_rows) break; + + write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= mi_params->mi_cols) break; + + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col); + } + break; + default: assert(0); + } + + // update partition context + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +// Populate token pointers appropriately based on token_info. 
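// Illustration (not from the aom sources): if the token buffer was never
// allocated, both pointers come back NULL; otherwise *tok and *tok_end
// bracket the tokens of one superblock row of one tile, i.e. the half-open
// range [start, start + count) taken from
// tplist[tile_row][tile_col][sb_row_in_tile].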
+static AOM_INLINE void get_token_pointers(const TokenInfo *token_info, + const int tile_row, int tile_col, + const int sb_row_in_tile, + const TokenExtra **tok, + const TokenExtra **tok_end) { + if (!is_token_info_allocated(token_info)) { + *tok = NULL; + *tok_end = NULL; + return; + } + *tok = token_info->tplist[tile_row][tile_col][sb_row_in_tile].start; + *tok_end = + *tok + token_info->tplist[tile_row][tile_col][sb_row_in_tile].count; +} + +static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td, + const TileInfo *const tile, + aom_writer *const w, int tile_row, + int tile_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &td->mb.e_mbd; + const int mi_row_start = tile->mi_row_start; + const int mi_row_end = tile->mi_row_end; + const int mi_col_start = tile->mi_col_start; + const int mi_col_end = tile->mi_col_end; + const int num_planes = av1_num_planes(cm); + + av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row); + av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd); + + if (cpi->common.delta_q_info.delta_q_present_flag) { + xd->current_base_qindex = cpi->common.quant_params.base_qindex; + if (cpi->common.delta_q_info.delta_lf_present_flag) { + av1_reset_loop_filter_delta(xd, num_planes); + } + } + + for (int mi_row = mi_row_start; mi_row < mi_row_end; + mi_row += cm->seq_params->mib_size) { + const int sb_row_in_tile = + (mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2; + const TokenInfo *token_info = &cpi->token_info; + const TokenExtra *tok; + const TokenExtra *tok_end; + get_token_pointers(token_info, tile_row, tile_col, sb_row_in_tile, &tok, + &tok_end); + + av1_zero_left_context(xd); + + for (int mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += cm->seq_params->mib_size) { + td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col, + cm->seq_params->sb_size); + } + assert(tok == tok_end); + } +} + +static AOM_INLINE void encode_restoration_mode( + AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + assert(!cm->features.all_lossless); + if (!cm->seq_params->enable_restoration) return; + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = &cm->rst_info[p]; + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; + } + switch (rsi->frame_restoration_type) { + case RESTORE_NONE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_WIENER: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_SGRPROJ: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 1); + break; + case RESTORE_SWITCHABLE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 1); + break; + default: assert(0); + } + } + if (!all_none) { + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 
128 : 64; + + RestorationInfo *rsi = &cm->rst_info[0]; + + assert(rsi->restoration_unit_size >= sb_size); + assert(RESTORATION_UNITSIZE_MAX == 256); + + if (sb_size == 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 64); + } + if (rsi->restoration_unit_size > 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 128); + } + } + + if (num_planes > 1) { + int s = + AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); + if (s && !chroma_none) { + aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size || + cm->rst_info[1].restoration_unit_size == + (cm->rst_info[0].restoration_unit_size >> s)); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } else if (!s) { + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } + } +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void write_wiener_filter(int wiener_win, + const WienerInfo *wiener_info, + WienerInfo *ref_wiener_info, + aom_writer *wb) { + if (wiener_win == WIENER_WIN) + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + else + assert(wiener_info->vfilter[0] == 0 && + wiener_info->vfilter[WIENER_WIN - 1] == 0); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + if (wiener_win == WIENER_WIN) + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + else + assert(wiener_info->hfilter[0] == 0 && + wiener_info->hfilter[WIENER_WIN - 1] == 0); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); +} + +static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info, + aom_writer *wb) { + aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS); + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + assert(sgrproj_info->xqd[0] == 0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } else if (params->r[1] == 0) { + aom_write_primitive_refsubexpfin( + wb, 
SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + } else { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } + + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); +} + +static AOM_INLINE void loop_restoration_write_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx, + aom_writer *const w, int plane, FRAME_COUNTS *counts) { + const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx]; + const RestorationInfo *rsi = cm->rst_info + plane; + RestorationType frame_rtype = rsi->frame_restoration_type; + assert(frame_rtype != RESTORE_NONE); + + (void)counts; + assert(!cm->features.all_lossless); + + const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN; + WienerInfo *ref_wiener_info = &xd->wiener_info[plane]; + SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane]; + RestorationType unit_rtype = rui->restoration_type; + + if (frame_rtype == RESTORE_SWITCHABLE) { + aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); +#if CONFIG_ENTROPY_STATS + ++counts->switchable_restore[unit_rtype]; +#endif + switch (unit_rtype) { + case RESTORE_WIENER: +#if DEBUG_LR_COSTING + assert(!memcmp( + ref_wiener_info, + &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx].wiener_info, + sizeof(*ref_wiener_info))); +#endif + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); + break; + case RESTORE_SGRPROJ: +#if DEBUG_LR_COSTING + assert(!memcmp(&ref_sgrproj_info->xqd, + &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx] + .sgrproj_info.xqd, + sizeof(ref_sgrproj_info->xqd))); +#endif + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); + break; + default: assert(unit_rtype == RESTORE_NONE); break; + } + } else if (frame_rtype == RESTORE_WIENER) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->wiener_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->wiener_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { +#if DEBUG_LR_COSTING + assert( + !memcmp(ref_wiener_info, + &lr_ref_params[RESTORE_WIENER][plane][runit_idx].wiener_info, + sizeof(*ref_wiener_info))); +#endif + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); + } + } else if (frame_rtype == RESTORE_SGRPROJ) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->sgrproj_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { +#if DEBUG_LR_COSTING + assert(!memcmp( + &ref_sgrproj_info->xqd, + &lr_ref_params[RESTORE_SGRPROJ][plane][runit_idx].sgrproj_info.xqd, + sizeof(ref_sgrproj_info->xqd))); +#endif + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +// Only write out the ref delta section if any of the elements +// will signal a delta. 
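// Illustration (not from the aom sources): the current deltas are compared
// against those stored with the primary reference frame, or against the
// defaults when there is no primary reference. If every ref_delta and
// mode_delta matches, the function below returns false and
// encode_loopfilter() then writes a single 0 bit instead of the per-delta
// update flags.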
+static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) { + struct loopfilter *lf = &cm->lf; + if (!lf->mode_ref_delta_update) { + return 0; + } + const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); + int8_t last_ref_deltas[REF_FRAMES]; + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (buf == NULL) { + av1_set_default_ref_deltas(last_ref_deltas); + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); + memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); + } + for (int i = 0; i < REF_FRAMES; i++) { + if (lf->ref_deltas[i] != last_ref_deltas[i]) { + return true; + } + } + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { + if (lf->mode_deltas[i] != last_mode_deltas[i]) { + return true; + } + } + return false; +} + +static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + assert(!cm->features.coded_lossless); + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + struct loopfilter *lf = &cm->lf; + + // Encode the loop filter level and type + aom_wb_write_literal(wb, lf->filter_level[0], 6); + aom_wb_write_literal(wb, lf->filter_level[1], 6); + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + aom_wb_write_literal(wb, lf->filter_level_u, 6); + aom_wb_write_literal(wb, lf->filter_level_v, 6); + } + } + aom_wb_write_literal(wb, lf->sharpness_level, 3); + + aom_wb_write_bit(wb, lf->mode_ref_delta_enabled); + + // Write out loop filter deltas applied at the MB level based on mode or + // ref frame (if they are enabled), only if there is information to write. + int meaningful = is_mode_ref_delta_meaningful(cm); + aom_wb_write_bit(wb, meaningful); + if (!meaningful) { + return; + } + + const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); + int8_t last_ref_deltas[REF_FRAMES]; + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (buf == NULL) { + av1_set_default_ref_deltas(last_ref_deltas); + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); + memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); + } + for (int i = 0; i < REF_FRAMES; i++) { + const int delta = lf->ref_deltas[i]; + const int changed = delta != last_ref_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); + } + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { + const int delta = lf->mode_deltas[i]; + const int changed = delta != last_mode_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); + } +} + +static AOM_INLINE void encode_cdef(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + assert(!cm->features.coded_lossless); + if (!cm->seq_params->enable_cdef) return; + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int i; + aom_wb_write_literal(wb, cm->cdef_info.cdef_damping - 3, 2); + aom_wb_write_literal(wb, cm->cdef_info.cdef_bits, 2); + for (i = 0; i < cm->cdef_info.nb_cdef_strengths; i++) { + aom_wb_write_literal(wb, cm->cdef_info.cdef_strengths[i], + CDEF_STRENGTH_BITS); + if (num_planes > 1) + aom_wb_write_literal(wb, cm->cdef_info.cdef_uv_strengths[i], + CDEF_STRENGTH_BITS); + } +} + +static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb, + int delta_q) { + if (delta_q != 0) { + aom_wb_write_bit(wb, 1); + aom_wb_write_inv_signed_literal(wb, delta_q, 6); + } else { + aom_wb_write_bit(wb, 0); + } +} + +static 
AOM_INLINE void encode_quantization( + const CommonQuantParams *const quant_params, int num_planes, + bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS); + write_delta_q(wb, quant_params->y_dc_delta_q); + if (num_planes > 1) { + int diff_uv_delta = + (quant_params->u_dc_delta_q != quant_params->v_dc_delta_q) || + (quant_params->u_ac_delta_q != quant_params->v_ac_delta_q); + if (separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta); + write_delta_q(wb, quant_params->u_dc_delta_q); + write_delta_q(wb, quant_params->u_ac_delta_q); + if (diff_uv_delta) { + write_delta_q(wb, quant_params->v_dc_delta_q); + write_delta_q(wb, quant_params->v_ac_delta_q); + } + } + aom_wb_write_bit(wb, quant_params->using_qmatrix); + if (quant_params->using_qmatrix) { + aom_wb_write_literal(wb, quant_params->qmatrix_level_y, QM_LEVEL_BITS); + aom_wb_write_literal(wb, quant_params->qmatrix_level_u, QM_LEVEL_BITS); + if (!separate_uv_delta_q) + assert(quant_params->qmatrix_level_u == quant_params->qmatrix_level_v); + else + aom_wb_write_literal(wb, quant_params->qmatrix_level_v, QM_LEVEL_BITS); + } +} + +static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + int i, j; + struct segmentation *seg = &cm->seg; + + aom_wb_write_bit(wb, seg->enabled); + if (!seg->enabled) return; + + // Write update flags + if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) { + aom_wb_write_bit(wb, seg->update_map); + if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update); + aom_wb_write_bit(wb, seg->update_data); + } + + // Segmentation data + if (seg->update_data) { + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < SEG_LVL_MAX; j++) { + const int active = segfeature_active(seg, i, j); + aom_wb_write_bit(wb, active); + if (active) { + const int data_max = av1_seg_feature_data_max(j); + const int data_min = -data_max; + const int ubits = get_unsigned_bits(data_max); + const int data = clamp(get_segdata(seg, i, j), data_min, data_max); + + if (av1_is_segfeature_signed(j)) { + aom_wb_write_inv_signed_literal(wb, data, ubits); + } else { + aom_wb_write_literal(wb, data, ubits); + } + } + } + } + } +} + +static AOM_INLINE void write_frame_interp_filter( + InterpFilter filter, struct aom_write_bit_buffer *wb) { + aom_wb_write_bit(wb, filter == SWITCHABLE); + if (filter != SWITCHABLE) + aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS); +} + +// Same function as write_uniform but writing to uncompresses header wb +static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, + int v) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (l == 0) return; + if (v < m) { + aom_wb_write_literal(wb, v, l - 1); + } else { + aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); + aom_wb_write_literal(wb, (v - m) & 1, 1); + } +} + +static AOM_INLINE void write_tile_info_max_tile( + const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { + int width_sb = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); + int height_sb = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + int size_sb, i; + const CommonTileParams *const tiles = &cm->tiles; + + aom_wb_write_bit(wb, tiles->uniform_spacing); + + if (tiles->uniform_spacing) { + int ones = tiles->log2_cols - tiles->min_log2_cols; + while (ones--) { + aom_wb_write_bit(wb, 1); + } + if (tiles->log2_cols < tiles->max_log2_cols) { + aom_wb_write_bit(wb, 0); + } + + // rows 
+ ones = tiles->log2_rows - tiles->min_log2_rows; + while (ones--) { + aom_wb_write_bit(wb, 1); + } + if (tiles->log2_rows < tiles->max_log2_rows) { + aom_wb_write_bit(wb, 0); + } + } else { + // Explicit tiles with configurable tile widths and heights + // columns + for (i = 0; i < tiles->cols; i++) { + size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + wb_write_uniform(wb, AOMMIN(width_sb, tiles->max_width_sb), size_sb - 1); + width_sb -= size_sb; + } + assert(width_sb == 0); + + // rows + for (i = 0; i < tiles->rows; i++) { + size_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; + wb_write_uniform(wb, AOMMIN(height_sb, tiles->max_height_sb), + size_sb - 1); + height_sb -= size_sb; + } + assert(height_sb == 0); + } +} + +static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + write_tile_info_max_tile(cm, wb); + + *saved_wb = *wb; + if (cm->tiles.rows * cm->tiles.cols > 1) { + // tile id used for cdf update + aom_wb_write_literal(wb, 0, cm->tiles.log2_cols + cm->tiles.log2_rows); + // Number of bytes in tile size - 1 + aom_wb_write_literal(wb, 3, 2); + } +} + +static AOM_INLINE void write_ext_tile_info( + const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + // This information is stored as a separate byte. + int mod = wb->bit_offset % CHAR_BIT; + if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod); + assert(aom_wb_is_byte_aligned(wb)); + + *saved_wb = *wb; + if (cm->tiles.rows * cm->tiles.cols > 1) { + // Note that the last item in the uncompressed header is the data + // describing tile configuration. + // Number of bytes in tile column size - 1 + aom_wb_write_literal(wb, 0, 2); + // Number of bytes in tile size - 1 + aom_wb_write_literal(wb, 0, 2); + } +} + +static INLINE int find_identical_tile( + const int tile_row, const int tile_col, + TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) { + const MV32 candidate_offset[1] = { { 1, 0 } }; + const uint8_t *const cur_tile_data = + tile_buffers[tile_row][tile_col].data + 4; + const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size; + + int i; + + if (tile_row == 0) return 0; + + // (TODO: yunqingwang) For now, only above tile is checked and used. + // More candidates such as left tile can be added later. + for (i = 0; i < 1; i++) { + int row_offset = candidate_offset[0].row; + int col_offset = candidate_offset[0].col; + int row = tile_row - row_offset; + int col = tile_col - col_offset; + const uint8_t *tile_data; + TileBufferEnc *candidate; + + if (row < 0 || col < 0) continue; + + const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data); + + // Read out tile-copy-mode bit: + if ((tile_hdr >> 31) == 1) { + // The candidate is a copy tile itself: the offset is stored in bits + // 30 through 24 inclusive. 
+ row_offset += (tile_hdr >> 24) & 0x7f; + row = tile_row - row_offset; + } + + candidate = &tile_buffers[row][col]; + + if (row_offset >= 128 || candidate->size != cur_tile_size) continue; + + tile_data = candidate->data + 4; + + if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue; + + // Identical tile found + assert(row_offset > 0); + return row_offset; + } + + // No identical tile found + return 0; +} + +static AOM_INLINE void write_render_size(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + const int scaling_active = av1_resize_scaled(cm); + aom_wb_write_bit(wb, scaling_active); + if (scaling_active) { + aom_wb_write_literal(wb, cm->render_width - 1, 16); + aom_wb_write_literal(wb, cm->render_height - 1, 16); + } +} + +static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + const SequenceHeader *const seq_params = cm->seq_params; + if (!seq_params->enable_superres) { + assert(cm->superres_scale_denominator == SCALE_NUMERATOR); + return; + } + + // First bit is whether to scale or not + if (cm->superres_scale_denominator == SCALE_NUMERATOR) { + aom_wb_write_bit(wb, 0); // no scaling + } else { + aom_wb_write_bit(wb, 1); // scaling, write scale factor + assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN); + assert(cm->superres_scale_denominator < + SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS)); + aom_wb_write_literal( + wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN, + SUPERRES_SCALE_BITS); + } +} + +static AOM_INLINE void write_frame_size(const AV1_COMMON *cm, + int frame_size_override, + struct aom_write_bit_buffer *wb) { + const int coded_width = cm->superres_upscaled_width - 1; + const int coded_height = cm->superres_upscaled_height - 1; + + if (frame_size_override) { + const SequenceHeader *seq_params = cm->seq_params; + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + aom_wb_write_literal(wb, coded_width, num_bits_width); + aom_wb_write_literal(wb, coded_height, num_bits_height); + } + + write_superres_scale(cm, wb); + write_render_size(cm, wb); +} + +static AOM_INLINE void write_frame_size_with_refs( + const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { + int found = 0; + + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame); + + if (cfg != NULL) { + found = cm->superres_upscaled_width == cfg->y_crop_width && + cm->superres_upscaled_height == cfg->y_crop_height; + found &= cm->render_width == cfg->render_width && + cm->render_height == cfg->render_height; + } + aom_wb_write_bit(wb, found); + if (found) { + write_superres_scale(cm, wb); + break; + } + } + + if (!found) { + int frame_size_override = 1; // Always equal to 1 in this function + write_frame_size(cm, frame_size_override, wb); + } +} + +static AOM_INLINE void write_profile(BITSTREAM_PROFILE profile, + struct aom_write_bit_buffer *wb) { + assert(profile >= PROFILE_0 && profile < MAX_PROFILES); + aom_wb_write_literal(wb, profile, PROFILE_BITS); +} + +static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { + // Profile 0/1: [0] for 8-bit, [1] for 10-bit + // Profile 2: [0] for 8-bit, [10] for 10-bit, [11] for 12-bit + aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ?
0 : 1); + if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) { + aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1); + } +} + +static AOM_INLINE void write_color_config( + const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { + write_bitdepth(seq_params, wb); + const int is_monochrome = seq_params->monochrome; + // monochrome bit + if (seq_params->profile != PROFILE_1) + aom_wb_write_bit(wb, is_monochrome); + else + assert(!is_monochrome); + if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED && + seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED && + seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) { + aom_wb_write_bit(wb, 0); // No color description present + } else { + aom_wb_write_bit(wb, 1); // Color description present + aom_wb_write_literal(wb, seq_params->color_primaries, 8); + aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8); + aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8); + } + if (is_monochrome) { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, seq_params->color_range); + return; + } + if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + assert(seq_params->profile == PROFILE_1 || + (seq_params->profile == PROFILE_2 && + seq_params->bit_depth == AOM_BITS_12)); + } else { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, seq_params->color_range); + if (seq_params->profile == PROFILE_0) { + // 420 only + assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1); + } else if (seq_params->profile == PROFILE_1) { + // 444 only + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + } else if (seq_params->profile == PROFILE_2) { + if (seq_params->bit_depth == AOM_BITS_12) { + // 420, 444 or 422 + aom_wb_write_bit(wb, seq_params->subsampling_x); + if (seq_params->subsampling_x == 0) { + assert(seq_params->subsampling_y == 0 && + "4:4:0 subsampling not allowed in AV1"); + } else { + aom_wb_write_bit(wb, seq_params->subsampling_y); + } + } else { + // 422 only + assert(seq_params->subsampling_x == 1 && + seq_params->subsampling_y == 0); + } + } + if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + } + if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) { + aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2); + } + } + aom_wb_write_bit(wb, seq_params->separate_uv_delta_q); +} + +static AOM_INLINE void write_timing_info_header( + const aom_timing_info_t *const timing_info, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick, 32); + aom_wb_write_unsigned_literal(wb, timing_info->time_scale, 32); + aom_wb_write_bit(wb, timing_info->equal_picture_interval); + if (timing_info->equal_picture_interval) { + aom_wb_write_uvlc(wb, timing_info->num_ticks_per_picture - 1); + } +} + +static AOM_INLINE void write_decoder_model_info( + const aom_dec_model_info_t *const decoder_model_info, + struct aom_write_bit_buffer *wb) { + aom_wb_write_literal( + wb, decoder_model_info->encoder_decoder_buffer_delay_length - 1, 5); + aom_wb_write_unsigned_literal( + wb, decoder_model_info->num_units_in_decoding_tick, 32); + 
aom_wb_write_literal(wb, decoder_model_info->buffer_removal_time_length - 1, + 5); + aom_wb_write_literal( + wb, decoder_model_info->frame_presentation_time_length - 1, 5); +} + +static AOM_INLINE void write_dec_model_op_parameters( + const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay, + buffer_delay_length); + aom_wb_write_unsigned_literal(wb, op_params->encoder_buffer_delay, + buffer_delay_length); + aom_wb_write_bit(wb, op_params->low_delay_mode_flag); +} + +static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal( + wb, cm->frame_presentation_time, + cm->seq_params->decoder_model_info.frame_presentation_time_length); +} + +static AOM_INLINE void write_film_grain_params( + const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) { + const AV1_COMMON *const cm = &cpi->common; + const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params; + aom_wb_write_bit(wb, pars->apply_grain); + if (!pars->apply_grain) return; + + aom_wb_write_literal(wb, pars->random_seed, 16); + + if (cm->current_frame.frame_type == INTER_FRAME) + aom_wb_write_bit(wb, pars->update_parameters); + + if (!pars->update_parameters) { + int ref_frame, ref_idx; + for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) { + ref_idx = get_ref_frame_map_idx(cm, ref_frame); + assert(ref_idx != INVALID_IDX); + const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx]; + if (buf->film_grain_params_present && + aom_check_grain_params_equiv(pars, &buf->film_grain_params)) { + break; + } + } + assert(ref_frame < REF_FRAMES); + aom_wb_write_literal(wb, ref_idx, 3); + return; + } + + // Scaling functions parameters + aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14 + for (int i = 0; i < pars->num_y_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8); + } + + if (!cm->seq_params->monochrome) { + aom_wb_write_bit(wb, pars->chroma_scaling_from_luma); + } else { + assert(!pars->chroma_scaling_from_luma); + } + + if (cm->seq_params->monochrome || pars->chroma_scaling_from_luma || + ((cm->seq_params->subsampling_x == 1) && + (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) { + assert(pars->num_cb_points == 0 && pars->num_cr_points == 0); + } else { + aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10 + for (int i = 0; i < pars->num_cb_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8); + } + + aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10 + for (int i = 0; i < pars->num_cr_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8); + } + } + + aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value + + // AR coefficients + // Only sent if the corresponsing scaling function has + // more than 0 points + + aom_wb_write_literal(wb, pars->ar_coeff_lag, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8); + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 
0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8); + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8); + + aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2); // 8 + value + + aom_wb_write_literal(wb, pars->grain_scale_shift, 2); + + if (pars->num_cb_points) { + aom_wb_write_literal(wb, pars->cb_mult, 8); + aom_wb_write_literal(wb, pars->cb_luma_mult, 8); + aom_wb_write_literal(wb, pars->cb_offset, 9); + } + + if (pars->num_cr_points) { + aom_wb_write_literal(wb, pars->cr_mult, 8); + aom_wb_write_literal(wb, pars->cr_luma_mult, 8); + aom_wb_write_literal(wb, pars->cr_offset, 9); + } + + aom_wb_write_bit(wb, pars->overlap_flag); + + aom_wb_write_bit(wb, pars->clip_to_restricted_range); +} + +static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { + (void)seq_params; + (void)wb; + assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]); + assert(seq_params->mib_size == 1 << seq_params->mib_size_log2); + assert(seq_params->sb_size == BLOCK_128X128 || + seq_params->sb_size == BLOCK_64X64); + aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); +} + +static AOM_INLINE void write_sequence_header( + const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4); + aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4); + aom_wb_write_literal(wb, seq_params->max_frame_width - 1, + seq_params->num_bits_width); + aom_wb_write_literal(wb, seq_params->max_frame_height - 1, + seq_params->num_bits_height); + + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction. 
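// Worked example (illustrative values, not from the aom sources): with
// frame_id_length = 15 and delta_frame_id_length = 14, the two writes below
// emit 14 - 2 = 12 in four bits and 15 - 14 - 1 = 0 in three bits, so the
// lengths recovered by the decoder satisfy
// delta_frame_id_length < frame_id_length by construction.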
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4); + aom_wb_write_literal( + wb, + seq_params->frame_id_length - seq_params->delta_frame_id_length - 1, + 3); + } + } + + write_sb_size(seq_params, wb); + + aom_wb_write_bit(wb, seq_params->enable_filter_intra); + aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter); + + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->enable_interintra_compound); + aom_wb_write_bit(wb, seq_params->enable_masked_compound); + aom_wb_write_bit(wb, seq_params->enable_warped_motion); + aom_wb_write_bit(wb, seq_params->enable_dual_filter); + + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint); + + if (seq_params->order_hint_info.enable_order_hint) { + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp); + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs); + } + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_screen_content_tools); + } + if (seq_params->force_screen_content_tools > 0) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_integer_mv); + } + } else { + assert(seq_params->force_integer_mv == 2); + } + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_literal( + wb, seq_params->order_hint_info.order_hint_bits_minus_1, 3); + } + + aom_wb_write_bit(wb, seq_params->enable_superres); + aom_wb_write_bit(wb, seq_params->enable_cdef); + aom_wb_write_bit(wb, seq_params->enable_restoration); +} + +static AOM_INLINE void write_global_motion_params( + const WarpedMotionParams *params, const WarpedMotionParams *ref_params, + struct aom_write_bit_buffer *wb, int allow_hp) { + const TransformationType type = params->wmtype; + + // As a workaround for an AV1 spec bug, we avoid choosing TRANSLATION + // type models. Check here that we don't accidentally pick one somehow. + // See comments in gm_get_motion_vector() for details on the bug we're + // working around here + assert(type != TRANSLATION); + + aom_wb_write_bit(wb, type != IDENTITY); + if (type != IDENTITY) { + aom_wb_write_bit(wb, type == ROTZOOM); + if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION); + } + + if (type >= ROTZOOM) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + } + + if (type >= AFFINE) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_prec_diff = (type == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff), + (params->wmmat[0] >> trans_prec_diff)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff), + (params->wmmat[1] >> trans_prec_diff)); + } +} + +static AOM_INLINE void write_global_motion(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + int frame; + for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + const WarpedMotionParams *ref_params = + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; + write_global_motion_params(&cm->global_motion[frame], ref_params, wb, + cm->features.allow_high_precision_mv); + // TODO(sarahparker, debargha): The logic in the commented out code below + // does not work currently and causes mismatches when resize is on. + // Fix it before turning the optimization back on. + /* + YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame); + if (cpi->source->y_crop_width == ref_buf->y_crop_width && + cpi->source->y_crop_height == ref_buf->y_crop_height) { + write_global_motion_params(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], wb, + cm->features.allow_high_precision_mv); + } else { + assert(cm->global_motion[frame].wmtype == IDENTITY && + "Invalid warp type for frames of different resolutions"); + } + */ + /* + printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n", + cm->current_frame.frame_number, cm->show_frame, frame, + cm->global_motion[frame].wmmat[0], + cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2], + cm->global_motion[frame].wmmat[3]); + */ + } +} + +static int check_frame_refs_short_signaling(AV1_COMMON *const cm, + bool enable_ref_short_signaling) { + // In the rtc case, when res < 360p and speed >= 9, we turn on + // frame_refs_short_signaling if it won't break the decoder. + if (enable_ref_short_signaling) { + const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + const int base = + 1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + + const int order_hint_group_cur = + cm->current_frame.display_order_hint / base; + const int order_hint_group_gld = + cm->ref_frame_map[gld_map_idx]->display_order_hint / base; + const int relative_dist = cm->current_frame.order_hint - + cm->ref_frame_map[gld_map_idx]->order_hint; + + // If the current frame and the GOLDEN frame are in the same order_hint + // group, and they are not too far apart (i.e., at most 64 frames), then return 1. + if (order_hint_group_cur == order_hint_group_gld && relative_dist >= 0 && + relative_dist <= 64) { + return 1; + } + return 0; + } + + // Check whether all references are distinct frames. + const RefCntBuffer *seen_bufs[INTER_REFS_PER_FRAME] = { NULL }; + int num_refs = 0; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + int seen = 0; + for (int i = 0; i < num_refs; i++) { + if (seen_bufs[i] == buf) { + seen = 1; + break; + } + } + if (!seen) seen_bufs[num_refs++] = buf; + } + } + + // We only turn on frame_refs_short_signaling when all references are + // distinct. + if (num_refs < INTER_REFS_PER_FRAME) { + // This indicates that more than one reference frame points to + // the same reference buffer, i.e. two or more references are duplicates.
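// Note (not part of the upstream patch): INTER_REFS_PER_FRAME is 7 (LAST,
// LAST2, LAST3, GOLDEN, BWDREF, ALTREF2, ALTREF), so num_refs can only fall
// short of it when at least two of those references resolved to the same
// RefCntBuffer in the loop above.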
+ return 0; + } + + // Check whether the encoder side ref frame choices are aligned with that to + // be derived at the decoder side. + int remapped_ref_idx_decoder[REF_FRAMES]; + + const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + + // Set up the frame refs mapping indexes according to the + // frame_refs_short_signaling policy. + av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx); + + // We only turn on frame_refs_short_signaling when the encoder side decision + // on ref frames is identical to that at the decoder side. + int frame_refs_short_signaling = 1; + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) { + // Compare the buffer index between two reference frames indexed + // respectively by the encoder and the decoder side decisions. + RefCntBuffer *ref_frame_buf_new = NULL; + if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) { + ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]]; + } + if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) { + frame_refs_short_signaling = 0; + break; + } + } + +#if 0 // For debug + printf("\nFrame=%d: \n", cm->current_frame.frame_number); + printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + printf("enc_ref(map_idx=%d)=%d, vs. " + "dec_ref(map_idx=%d)=%d\n", + get_ref_frame_map_idx(cm, ref_frame), ref_frame, + cm->remapped_ref_idx[ref_frame - LAST_FRAME], + ref_frame); + } +#endif // 0 + + return frame_refs_short_signaling; +} + +// New function based on HLS R18 +static AOM_INLINE void write_uncompressed_header_obu( + AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + const CommonQuantParams *quant_params = &cm->quant_params; + CurrentFrame *const current_frame = &cm->current_frame; + FeatureFlags *const features = &cm->features; + + if (!cpi->sf.rt_sf.enable_ref_short_signaling || + !seq_params->order_hint_info.enable_order_hint || + seq_params->order_hint_info.enable_ref_frame_mvs) { + current_frame->frame_refs_short_signaling = 0; + } else { + current_frame->frame_refs_short_signaling = 1; + } + + if (seq_params->still_picture) { + assert(cm->show_existing_frame == 0); + assert(cm->show_frame == 1); + assert(current_frame->frame_type == KEY_FRAME); + } + if (!seq_params->reduced_still_picture_hdr) { + if (encode_show_existing_frame(cm)) { + aom_wb_write_bit(wb, 1); // show_existing_frame + aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); + + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) { + write_tu_pts_info(cm, wb); + } + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + aom_wb_write_literal(wb, display_frame_id, frame_id_len); + } + return; + } else { + aom_wb_write_bit(wb, 0); // show_existing_frame + } + + aom_wb_write_literal(wb, current_frame->frame_type, 2); + + aom_wb_write_bit(wb, cm->show_frame); + if (cm->show_frame) { + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) + write_tu_pts_info(cm, wb); + } else { + aom_wb_write_bit(wb, cm->showable_frame); + } + if 
(frame_is_sframe(cm)) { + assert(features->error_resilient_mode); + } else if (!(current_frame->frame_type == KEY_FRAME && cm->show_frame)) { + aom_wb_write_bit(wb, features->error_resilient_mode); + } + } + aom_wb_write_bit(wb, features->disable_cdf_update); + + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, features->allow_screen_content_tools); + } else { + assert(features->allow_screen_content_tools == + seq_params->force_screen_content_tools); + } + + if (features->allow_screen_content_tools) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, features->cur_frame_force_integer_mv); + } else { + assert(features->cur_frame_force_integer_mv == + seq_params->force_integer_mv); + } + } else { + assert(features->cur_frame_force_integer_mv == 0); + } + + int frame_size_override_flag = 0; + + if (seq_params->reduced_still_picture_hdr) { + assert(cm->superres_upscaled_width == seq_params->max_frame_width && + cm->superres_upscaled_height == seq_params->max_frame_height); + } else { + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; + aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + } + + if (cm->superres_upscaled_width > seq_params->max_frame_width || + cm->superres_upscaled_height > seq_params->max_frame_height) { + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Frame dimensions are larger than the maximum values"); + } + + frame_size_override_flag = + frame_is_sframe(cm) + ? 1 + : (cm->superres_upscaled_width != seq_params->max_frame_width || + cm->superres_upscaled_height != seq_params->max_frame_height); + if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag); + + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_literal( + wb, current_frame->order_hint, + seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + + if (!features->error_resilient_mode && !frame_is_intra_only(cm)) { + aom_wb_write_literal(wb, features->primary_ref_frame, PRIMARY_REF_BITS); + } + } + + if (seq_params->decoder_model_info_present_flag) { + aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present); + if (cpi->ppi->buffer_removal_time_present) { + for (int op_num = 0; + op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { + if (seq_params->op_params[op_num].decoder_model_param_present_flag) { + if (seq_params->operating_point_idc[op_num] == 0 || + ((seq_params->operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1 && + (seq_params->operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1)) { + aom_wb_write_unsigned_literal( + wb, cm->buffer_removal_times[op_num], + seq_params->decoder_model_info.buffer_removal_time_length); + cm->buffer_removal_times[op_num]++; + if (cm->buffer_removal_times[op_num] == 0) { + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "buffer_removal_time overflowed"); + } + } + } + } + } + } + + // Shown keyframes and switch-frames automatically refreshes all reference + // frames. For all other frame types, we need to write refresh_frame_flags. 
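// Note (not part of the upstream patch): for a shown key frame or an S-frame
// the decoder infers refresh_frame_flags == 0xFF (refresh all), which is why
// the REF_FRAMES-bit field below is only coded for unshown key frames, inter
// frames and intra-only frames.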
+ if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) || + current_frame->frame_type == INTER_FRAME || + current_frame->frame_type == INTRA_ONLY_FRAME) + aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES); + + if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) { + // Write all ref frame order hints if error_resilient_mode == 1 + if (features->error_resilient_mode && + seq_params->order_hint_info.enable_order_hint) { + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + aom_wb_write_literal( + wb, cm->ref_frame_map[ref_idx]->order_hint, + seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + } + } + } + + if (current_frame->frame_type == KEY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !features->allow_intrabc); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, features->allow_intrabc); + } else { + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !features->allow_intrabc); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, features->allow_intrabc); + } else if (current_frame->frame_type == INTER_FRAME || + frame_is_sframe(cm)) { + MV_REFERENCE_FRAME ref_frame; + + // NOTE: Error resilient mode turns off frame_refs_short_signaling + // automatically. +#define FRAME_REFS_SHORT_SIGNALING 0 +#if FRAME_REFS_SHORT_SIGNALING + current_frame->frame_refs_short_signaling = + seq_params->order_hint_info.enable_order_hint; +#endif // FRAME_REFS_SHORT_SIGNALING + + if (current_frame->frame_refs_short_signaling) { + // In rtc case when cpi->sf.rt_sf.enable_ref_short_signaling is true, + // we turn on frame_refs_short_signaling when the current frame and + // golden frame are in the same order_hint group, and their relative + // distance is <= 64 (in order to be decodable). + + // For other cases, an example solution for encoder-side + // implementation on frame_refs_short_signaling is also provided in + // this function, where frame_refs_short_signaling is only turned on + // when the encoder side decision on ref frames is identical to that + // at the decoder side. 
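// Illustration (not from the aom sources): with short signaling enabled, only
// the LAST_FRAME and GOLDEN_FRAME map indices are coded (REF_FRAMES_LOG2 bits
// each, see below); the remaining references are re-derived on both sides by
// av1_set_frame_refs(), which is the consistency that
// check_frame_refs_short_signaling() verifies for the non-rtc path.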
+ + current_frame->frame_refs_short_signaling = + check_frame_refs_short_signaling( + cm, cpi->sf.rt_sf.enable_ref_short_signaling); + } + + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling); + + if (current_frame->frame_refs_short_signaling) { + const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME); + aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2); + + const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2); + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX); + if (!current_frame->frame_refs_short_signaling) + aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame), + REF_FRAMES_LOG2); + if (seq_params->frame_id_numbers_present_flag) { + int i = get_ref_frame_map_idx(cm, ref_frame); + int frame_id_len = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int delta_frame_id_minus_1 = + ((cm->current_frame_id - cm->ref_frame_id[i] + + (1 << frame_id_len)) % + (1 << frame_id_len)) - + 1; + if (delta_frame_id_minus_1 < 0 || + delta_frame_id_minus_1 >= (1 << diff_len)) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Invalid delta_frame_id_minus_1"); + } + aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); + } + } + + if (!features->error_resilient_mode && frame_size_override_flag) { + write_frame_size_with_refs(cm, wb); + } else { + write_frame_size(cm, frame_size_override_flag, wb); + } + + if (!features->cur_frame_force_integer_mv) + aom_wb_write_bit(wb, features->allow_high_precision_mv); + write_frame_interp_filter(features->interp_filter, wb); + aom_wb_write_bit(wb, features->switchable_motion_mode); + if (frame_might_allow_ref_frame_mvs(cm)) { + aom_wb_write_bit(wb, features->allow_ref_frame_mvs); + } else { + assert(features->allow_ref_frame_mvs == 0); + } + } + } + + const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && + !(features->disable_cdf_update); + if (cm->tiles.large_scale) + assert(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); + + if (might_bwd_adapt) { + aom_wb_write_bit( + wb, features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); + } + + write_tile_info(cm, saved_wb, wb); + encode_quantization(quant_params, av1_num_planes(cm), + cm->seq_params->separate_uv_delta_q, wb); + encode_segmentation(cm, wb); + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0); + if (quant_params->base_qindex > 0) { + aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag); + if (delta_q_info->delta_q_present_flag) { + aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2); + xd->current_base_qindex = quant_params->base_qindex; + if (features->allow_intrabc) + assert(delta_q_info->delta_lf_present_flag == 0); + else + aom_wb_write_bit(wb, delta_q_info->delta_lf_present_flag); + if (delta_q_info->delta_lf_present_flag) { + aom_wb_write_literal(wb, get_msb(delta_q_info->delta_lf_res), 2); + aom_wb_write_bit(wb, delta_q_info->delta_lf_multi); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + } + + if (features->all_lossless) { + assert(!av1_superres_scaled(cm)); + } else { + if (!features->coded_lossless) { + encode_loopfilter(cm, wb); + encode_cdef(cm, wb); + } + encode_restoration_mode(cm, wb); + } + + // Write TX mode + if (features->coded_lossless) + 
assert(features->tx_mode == ONLY_4X4); + else + aom_wb_write_bit(wb, features->tx_mode == TX_MODE_SELECT); + + if (!frame_is_intra_only(cm)) { + const int use_hybrid_pred = + current_frame->reference_mode == REFERENCE_MODE_SELECT; + + aom_wb_write_bit(wb, use_hybrid_pred); + } + + if (current_frame->skip_mode_info.skip_mode_allowed) + aom_wb_write_bit(wb, current_frame->skip_mode_info.skip_mode_flag); + + if (frame_might_allow_warped_motion(cm)) + aom_wb_write_bit(wb, features->allow_warped_motion); + else + assert(!features->allow_warped_motion); + + aom_wb_write_bit(wb, features->reduced_tx_set_used); + + if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); + + if (seq_params->film_grain_params_present && + (cm->show_frame || cm->showable_frame)) + write_film_grain_params(cpi, wb); + + if (cm->tiles.large_scale) write_ext_tile_info(cm, saved_wb, wb); +} + +static int choose_size_bytes(uint32_t size, int spare_msbs) { + // Choose the number of bytes required to represent size, without + // using the 'spare_msbs' number of most significant bits. + + // Make sure we will fit in 4 bytes to start with.. + if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1; + + // Normalise to 32 bits + size <<= spare_msbs; + + if (size >> 24 != 0) + return 4; + else if (size >> 16 != 0) + return 3; + else if (size >> 8 != 0) + return 2; + else + return 1; +} + +static AOM_INLINE void mem_put_varsize(uint8_t *const dst, const int sz, + const int val) { + switch (sz) { + case 1: dst[0] = (uint8_t)(val & 0xff); break; + case 2: mem_put_le16(dst, val); break; + case 3: mem_put_le24(dst, val); break; + case 4: mem_put_le32(dst, val); break; + default: assert(0 && "Invalid size"); break; + } +} + +static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst, + const uint32_t data_size, const uint32_t max_tile_size, + const uint32_t max_tile_col_size, + int *const tile_size_bytes, + int *const tile_col_size_bytes) { + // Choose the tile size bytes (tsb) and tile column size bytes (tcsb) + int tsb; + int tcsb; + + if (tiles->large_scale) { + // The top bit in the tile size field indicates tile copy mode, so we + // have 1 less bit to code the tile size + tsb = choose_size_bytes(max_tile_size, 1); + tcsb = choose_size_bytes(max_tile_col_size, 0); + } else { + tsb = choose_size_bytes(max_tile_size, 0); + tcsb = 4; // This is ignored + (void)max_tile_col_size; + } + + assert(tsb > 0); + assert(tcsb > 0); + + *tile_size_bytes = tsb; + *tile_col_size_bytes = tcsb; + if (tsb == 4 && tcsb == 4) return data_size; + + uint32_t wpos = 0; + uint32_t rpos = 0; + + if (tiles->large_scale) { + int tile_row; + int tile_col; + + for (tile_col = 0; tile_col < tiles->cols; tile_col++) { + // All but the last column has a column header + if (tile_col < tiles->cols - 1) { + uint32_t tile_col_size = mem_get_le32(dst + rpos); + rpos += 4; + + // Adjust the tile column size by the number of bytes removed + // from the tile size fields. + tile_col_size -= (4 - tsb) * tiles->rows; + + mem_put_varsize(dst + wpos, tcsb, tile_col_size); + wpos += tcsb; + } + + for (tile_row = 0; tile_row < tiles->rows; tile_row++) { + // All, including the last row has a header + uint32_t tile_header = mem_get_le32(dst + rpos); + rpos += 4; + + // If this is a copy tile, we need to shift the MSB to the + // top bit of the new width, and there is no data to copy. 
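Editorial aside (not part of the patch): choose_size_bytes() and mem_put_varsize() above implement the size-field compaction that remux_tiles() relies on. The following self-contained sketch re-implements the width selection under the same rules purely for illustration; sketch_choose_size_bytes is a hypothetical stand-in, not the library function.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Pick the smallest little-endian field width (1..4 bytes) that can hold
// `size` once `spare_msbs` bits are reserved for flags (e.g. the copy-tile
// bit in large-scale tile mode).
static int sketch_choose_size_bytes(uint32_t size, int spare_msbs) {
  if (spare_msbs > 0 && (size >> (32 - spare_msbs)) != 0) return -1;
  size <<= spare_msbs;
  if (size >> 24) return 4;
  if (size >> 16) return 3;
  if (size >> 8) return 2;
  return 1;
}

int main(void) {
  // A 300-byte tile needs a 2-byte size field; reserving the top (copy) bit
  // does not change that here because 300 << 1 still fits in 16 bits.
  assert(sketch_choose_size_bytes(300, 0) == 2);
  assert(sketch_choose_size_bytes(300, 1) == 2);
  // A ~70000-byte tile needs 3 bytes.
  assert(sketch_choose_size_bytes(70000, 0) == 3);
  printf("done\n");
  return 0;
}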
+ if (tile_header >> 31 != 0) { + if (tsb < 4) tile_header >>= 32 - 8 * tsb; + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + } else { + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + + tile_header += AV1_MIN_TILE_SIZE_BYTES; + memmove(dst + wpos, dst + rpos, tile_header); + rpos += tile_header; + wpos += tile_header; + } + } + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; + } + const int n_tiles = tiles->cols * tiles->rows; + int n; + + for (n = 0; n < n_tiles; n++) { + int tile_size; + + if (n == n_tiles - 1) { + tile_size = data_size - rpos; + } else { + tile_size = mem_get_le32(dst + rpos); + rpos += 4; + mem_put_varsize(dst + wpos, tsb, tile_size); + tile_size += AV1_MIN_TILE_SIZE_BYTES; + wpos += tsb; + } + + memmove(dst + wpos, dst + rpos, tile_size); + + rpos += tile_size; + wpos += tile_size; + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; +} + +uint32_t av1_write_obu_header(AV1LevelParams *const level_params, + int *frame_header_count, OBU_TYPE obu_type, + int obu_extension, uint8_t *const dst) { + if (level_params->keep_level_stats && + (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER)) + ++(*frame_header_count); + + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + aom_wb_write_literal(&wb, 0, 1); // forbidden bit. + aom_wb_write_literal(&wb, (int)obu_type, 4); + aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1); + aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field + aom_wb_write_literal(&wb, 0, 1); // reserved + + if (obu_extension) { + aom_wb_write_literal(&wb, obu_extension & 0xFF, 8); + } + + size = aom_wb_bytes_written(&wb); + return size; +} + +int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, + uint8_t *dest) { + const size_t offset = obu_header_size; + size_t coded_obu_size = 0; + const uint32_t obu_size = (uint32_t)obu_payload_size; + assert(obu_size == obu_payload_size); + + if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + return AOM_CODEC_OK; +} + +size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size, + uint8_t *data) { + const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); + const size_t move_dst_offset = length_field_size + obu_header_size; + const size_t move_src_offset = obu_header_size; + const size_t move_size = obu_payload_size; + memmove(data + move_dst_offset, data + move_src_offset, move_size); + return length_field_size; +} + +static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) { + if (aom_wb_is_byte_aligned(wb)) { + aom_wb_write_literal(wb, 0x80, 8); + } else { + // assumes that the other bits are already 0s + aom_wb_write_bit(wb, 1); + } +} + +static AOM_INLINE void write_bitstream_level(AV1_LEVEL seq_level_idx, + struct aom_write_bit_buffer *wb) { + assert(is_valid_seq_level_idx(seq_level_idx)); + aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); +} + +uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, + uint8_t *const dst) { + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + write_profile(seq_params->profile, &wb); + + // Still picture or not + aom_wb_write_bit(&wb, seq_params->still_picture); + assert(IMPLIES(!seq_params->still_picture, + !seq_params->reduced_still_picture_hdr)); + // whether to use reduced still picture header + aom_wb_write_bit(&wb, seq_params->reduced_still_picture_hdr); + + if 
(seq_params->reduced_still_picture_hdr) { + assert(seq_params->timing_info_present == 0); + assert(seq_params->decoder_model_info_present_flag == 0); + assert(seq_params->display_model_info_present_flag == 0); + write_bitstream_level(seq_params->seq_level_idx[0], &wb); + } else { + aom_wb_write_bit( + &wb, seq_params->timing_info_present); // timing info present flag + + if (seq_params->timing_info_present) { + // timing_info + write_timing_info_header(&seq_params->timing_info, &wb); + aom_wb_write_bit(&wb, seq_params->decoder_model_info_present_flag); + if (seq_params->decoder_model_info_present_flag) { + write_decoder_model_info(&seq_params->decoder_model_info, &wb); + } + } + aom_wb_write_bit(&wb, seq_params->display_model_info_present_flag); + aom_wb_write_literal(&wb, seq_params->operating_points_cnt_minus_1, + OP_POINTS_CNT_MINUS_1_BITS); + int i; + for (i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { + aom_wb_write_literal(&wb, seq_params->operating_point_idc[i], + OP_POINTS_IDC_BITS); + write_bitstream_level(seq_params->seq_level_idx[i], &wb); + if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) + aom_wb_write_bit(&wb, seq_params->tier[i]); + if (seq_params->decoder_model_info_present_flag) { + aom_wb_write_bit( + &wb, seq_params->op_params[i].decoder_model_param_present_flag); + if (seq_params->op_params[i].decoder_model_param_present_flag) { + write_dec_model_op_parameters( + &seq_params->op_params[i], + seq_params->decoder_model_info + .encoder_decoder_buffer_delay_length, + &wb); + } + } + if (seq_params->display_model_info_present_flag) { + aom_wb_write_bit( + &wb, seq_params->op_params[i].display_model_param_present_flag); + if (seq_params->op_params[i].display_model_param_present_flag) { + assert(seq_params->op_params[i].initial_display_delay >= 1); + assert(seq_params->op_params[i].initial_display_delay <= 10); + aom_wb_write_literal( + &wb, seq_params->op_params[i].initial_display_delay - 1, 4); + } + } + } + } + write_sequence_header(seq_params, &wb); + + write_color_config(seq_params, &wb); + + aom_wb_write_bit(&wb, seq_params->film_grain_params_present); + + add_trailing_bits(&wb); + + size = aom_wb_bytes_written(&wb); + return size; +} + +static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd, + struct aom_write_bit_buffer *saved_wb, + uint8_t *const dst, + int append_trailing_bits) { + struct aom_write_bit_buffer wb = { dst, 0 }; + write_uncompressed_header_obu(cpi, xd, saved_wb, &wb); + if (append_trailing_bits) add_trailing_bits(&wb); + return aom_wb_bytes_written(&wb); +} + +static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile, + int end_tile, int tiles_log2, + int tile_start_and_end_present_flag) { + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + if (!tiles_log2) return size; + + aom_wb_write_bit(&wb, tile_start_and_end_present_flag); + + if (tile_start_and_end_present_flag) { + aom_wb_write_literal(&wb, start_tile, tiles_log2); + aom_wb_write_literal(&wb, end_tile, tiles_log2); + } + + size = aom_wb_bytes_written(&wb); + return size; +} + +extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size, + const char *filename); + +typedef struct { + uint32_t tg_hdr_size; + uint32_t frame_header_size; +} LargeTileFrameOBU; + +// Initialize OBU header for large scale tile case. 
+static uint32_t init_large_scale_tile_obu_header( + AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb, + LargeTileFrameOBU *lst_obu) { + AV1LevelParams *const level_params = &cpi->ppi->level_params; + CurrentFrame *const current_frame = &cpi->common.current_frame; + // For large_scale_tile case, we always have only one tile group, so it can + // be written as an OBU_FRAME. + const OBU_TYPE obu_type = OBU_FRAME; + lst_obu->tg_hdr_size = av1_write_obu_header( + level_params, &cpi->frame_header_count, obu_type, 0, *data); + *data += lst_obu->tg_hdr_size; + + const uint32_t frame_header_size = + write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, saved_wb, *data, 0); + *data += frame_header_size; + lst_obu->frame_header_size = frame_header_size; + // (yunqing) This test ensures the correctness of large scale tile coding. + if (cpi->oxcf.tile_cfg.enable_ext_tile_debug) { + char fn[20] = "./fh"; + fn[4] = current_frame->frame_number / 100 + '0'; + fn[5] = (current_frame->frame_number % 100) / 10 + '0'; + fn[6] = (current_frame->frame_number % 10) + '0'; + fn[7] = '\0'; + av1_print_uncompressed_frame_header(*data - frame_header_size, + frame_header_size, fn); + } + return frame_header_size; +} + +// Write total buffer size and related information into the OBU header for large +// scale tile case. +static void write_large_scale_tile_obu_size( + const CommonTileParams *const tiles, uint8_t *const dst, uint8_t *data, + struct aom_write_bit_buffer *saved_wb, LargeTileFrameOBU *const lst_obu, + int have_tiles, uint32_t *total_size, int max_tile_size, + int max_tile_col_size) { + int tile_size_bytes = 0; + int tile_col_size_bytes = 0; + if (have_tiles) { + *total_size = remux_tiles( + tiles, data, *total_size - lst_obu->frame_header_size, max_tile_size, + max_tile_col_size, &tile_size_bytes, &tile_col_size_bytes); + *total_size += lst_obu->frame_header_size; + } + + // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write + // current tile group size before tile data(include tile column header). + // Tile group size doesn't include the bytes storing tg size. + *total_size += lst_obu->tg_hdr_size; + const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size; + const size_t length_field_size = + av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); + if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) != + AOM_CODEC_OK) + assert(0); + + *total_size += (uint32_t)length_field_size; + saved_wb->bit_buffer += length_field_size; + + // Now fill in the gaps in the uncompressed header. + if (have_tiles) { + assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2); + + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + } +} + +// Store information on each large scale tile in the OBU header. 
+static void write_large_scale_tile_obu( + AV1_COMP *const cpi, uint8_t *const dst, LargeTileFrameOBU *const lst_obu, + int *const largest_tile_id, uint32_t *total_size, const int have_tiles, + unsigned int *const max_tile_size, unsigned int *const max_tile_col_size) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + + TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + unsigned int tile_size = 0; + + av1_reset_pack_bs_thread_data(&cpi->td); + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + TileInfo tile_info; + const int is_last_col = (tile_col == tile_cols - 1); + const uint32_t col_offset = *total_size; + + av1_tile_set_col(&tile_info, cm, tile_col); + + // The last column does not have a column header + if (!is_last_col) *total_size += 4; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + const int data_offset = have_tiles ? 4 : 0; + const int tile_idx = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + av1_tile_set_row(&tile_info, cm, tile_row); + aom_writer mode_bc; + + buf->data = dst + *total_size + lst_obu->tg_hdr_size; + + // Is CONFIG_EXT_TILE = 1, every tile in the row has a header, + // even for the last one, unless no tiling is used at all. + *total_size += data_offset; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + mode_bc.allow_update_cdf = !tiles->large_scale; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; + aom_start_encode(&mode_bc, buf->data + data_offset); + write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col); + if (aom_stop_encode(&mode_bc) < 0) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Error writing modes"); + } + tile_size = mode_bc.pos; + buf->size = tile_size; + + // Record the maximum tile size we see, so we can compact headers later. + if (tile_size > *max_tile_size) { + *max_tile_size = tile_size; + *largest_tile_id = tile_cols * tile_row + tile_col; + } + + if (have_tiles) { + // tile header: size of this tile, or copy offset + uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES; + const int tile_copy_mode = + ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256) ? 1 + : 0; + + // If tile_copy_mode = 1, check if this tile is a copy tile. + // Very low chances to have copy tiles on the key frames, so don't + // search on key frames to reduce unnecessary search. + if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) { + const int identical_tile_offset = + find_identical_tile(tile_row, tile_col, tile_buffers); + + // Indicate a copy-tile by setting the most significant bit. + // The row-offset to copy from is stored in the highest byte. + // remux_tiles will move these around later + if (identical_tile_offset > 0) { + tile_size = 0; + tile_header = identical_tile_offset | 0x80; + tile_header <<= 24; + } + } + + mem_put_le32(buf->data, (MEM_VALUE_T)tile_header); + } + + *total_size += tile_size; + } + if (!is_last_col) { + uint32_t col_size = *total_size - col_offset - 4; + mem_put_le32(dst + col_offset + lst_obu->tg_hdr_size, col_size); + + // Record the maximum tile column size we see. + *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); + } + } + av1_accumulate_pack_bs_thread_data(cpi, &cpi->td); +} + +// Packs information in the obu header for large scale tiles. 
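Editorial aside (not part of the patch): the copy-tile header produced in write_large_scale_tile_obu() above, and later compacted by remux_tiles(), can be traced with a couple of constants. A minimal sketch, assuming a 1-byte tile size field after remuxing:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Encoder side: a tile identical to the tile two rows above is marked by
  // setting the MSB and keeping the row offset in the top byte of the header.
  const uint32_t identical_tile_offset = 2;
  const uint32_t tile_header = (identical_tile_offset | 0x80) << 24;
  assert(tile_header == 0x82000000 && (tile_header >> 31) != 0);

  // remux_tiles() side: when the size field shrinks to tsb bytes, the
  // flag/offset byte is shifted down so it still lands in the top byte.
  const int tsb = 1;  // assumed: 1-byte tile size fields after remuxing
  const uint32_t remuxed = tile_header >> (32 - 8 * tsb);
  assert(remuxed == 0x82);
  printf("copy-tile header after remux: 0x%02x\n", remuxed);
  return 0;
}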
+static INLINE uint32_t pack_large_scale_tiles_in_tg_obus( + AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, int *const largest_tile_id) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + uint32_t total_size = 0; + unsigned int max_tile_size = 0; + unsigned int max_tile_col_size = 0; + const int have_tiles = tiles->cols * tiles->rows > 1; + uint8_t *data = dst; + + LargeTileFrameOBU lst_obu; + + total_size += + init_large_scale_tile_obu_header(cpi, &data, saved_wb, &lst_obu); + + write_large_scale_tile_obu(cpi, dst, &lst_obu, largest_tile_id, &total_size, + have_tiles, &max_tile_size, &max_tile_col_size); + + write_large_scale_tile_obu_size(tiles, dst, data, saved_wb, &lst_obu, + have_tiles, &total_size, max_tile_size, + max_tile_col_size); + + return total_size; +} + +// Writes obu, tile group and uncompressed headers to bitstream. +void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd, + PackBSParams *const pack_bs_params, + const int tile_idx) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size; + const int tg_size = + (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg; + + // Write Tile group, frame and OBU header + // A new tile group begins at this tile. Write the obu header and + // tile group header + const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP; + *curr_tg_hdr_size = av1_write_obu_header( + &cpi->ppi->level_params, &cpi->frame_header_count, obu_type, + pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr); + pack_bs_params->obu_header_size = *curr_tg_hdr_size; + + if (cpi->num_tg == 1) + *curr_tg_hdr_size += write_frame_header_obu( + cpi, xd, pack_bs_params->saved_wb, + pack_bs_params->tile_data_curr + *curr_tg_hdr_size, 0); + *curr_tg_hdr_size += write_tile_group_header( + pack_bs_params->tile_data_curr + *curr_tg_hdr_size, tile_idx, + AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1), + (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1); + *pack_bs_params->total_size += *curr_tg_hdr_size; +} + +// Pack tile data in the bitstream with tile_group, frame +// and OBU header. +void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td, + PackBSParams *const pack_bs_params) { + aom_writer mode_bc; + AV1_COMMON *const cm = &cpi->common; + int tile_row = pack_bs_params->tile_row; + int tile_col = pack_bs_params->tile_col; + uint32_t *const total_size = pack_bs_params->total_size; + TileInfo tile_info; + av1_tile_set_col(&tile_info, cm, tile_col); + av1_tile_set_row(&tile_info, cm, tile_row); + mode_bc.allow_update_cdf = 1; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; + + unsigned int tile_size; + + const int num_planes = av1_num_planes(cm); + av1_reset_loop_restoration(&td->mb.e_mbd, num_planes); + + pack_bs_params->buf.data = pack_bs_params->dst + *total_size; + + // The last tile of the tile group does not have a header. 
+ if (!pack_bs_params->is_last_tile_in_tg) *total_size += 4; + + // Pack tile data + aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size); + write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col); + if (aom_stop_encode(&mode_bc) < 0) { + aom_internal_error(td->mb.e_mbd.error_info, AOM_CODEC_ERROR, + "Error writing modes"); + } + tile_size = mode_bc.pos; + assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); + + pack_bs_params->buf.size = tile_size; + + // Write tile size + if (!pack_bs_params->is_last_tile_in_tg) { + // size of this tile + mem_put_le32(pack_bs_params->buf.data, tile_size - AV1_MIN_TILE_SIZE_BYTES); + } +} + +void av1_write_last_tile_info( + AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, + struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, + uint8_t *curr_tg_start, uint32_t *const total_size, + uint8_t **tile_data_start, int *const largest_tile_id, + int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) { + // write current tile group size + const uint32_t obu_payload_size = + (uint32_t)(*curr_tg_data_size) - obu_header_size; + const size_t length_field_size = + av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, + curr_tg_start) != AOM_CODEC_OK) { + assert(0); + } + *curr_tg_data_size += (int)length_field_size; + *total_size += (uint32_t)length_field_size; + *tile_data_start += length_field_size; + if (cpi->num_tg == 1) { + // if this tg is combined with the frame header then update saved + // frame header base offset according to length field size + saved_wb->bit_buffer += length_field_size; + } + + if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) { + // Make room for a duplicate Frame Header OBU. + memmove(curr_tg_start + fh_info->total_length, curr_tg_start, + *curr_tg_data_size); + + // Insert a copy of the Frame Header OBU. + memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length); + + // Force context update tile to be the first tile in error + // resilient mode as the duplicate frame headers will have + // context_update_tile_id set to 0 + *largest_tile_id = 0; + + // Rewrite the OBU header to change the OBU type to Redundant Frame + // Header. + av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count, + OBU_REDUNDANT_FRAME_HEADER, obu_extn_header, + &curr_tg_start[fh_info->obu_header_byte_offset]); + + *curr_tg_data_size += (int)(fh_info->total_length); + *total_size += (uint32_t)(fh_info->total_length); + } + *is_first_tg = 0; +} + +void av1_reset_pack_bs_thread_data(ThreadData *const td) { + td->coefficient_size = 0; + td->max_mv_magnitude = 0; + av1_zero(td->interp_filter_selected); +} + +void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi, + ThreadData const *td) { + int do_max_mv_magnitude_update = 1; + cpi->rc.coefficient_size += td->coefficient_size; + + // Disable max_mv_magnitude update for parallel frames based on update flag. + if (!cpi->do_frame_data_update) do_max_mv_magnitude_update = 0; + + if (cpi->sf.mv_sf.auto_mv_step_size && do_max_mv_magnitude_update) + cpi->mv_search_params.max_mv_magnitude = + AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude); + + for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++) + cpi->common.cur_frame->interp_filter_selected[filter] += + td->interp_filter_selected[filter]; +} + +// Store information related to each default tile in the OBU header. 
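Editorial aside (not part of the patch): av1_write_last_tile_info() above leans on the av1_obu_memmove()/av1_write_uleb_obu_size() pattern defined earlier in this file: shift the payload right by the size of the LEB128 length field, then write that length field just after the OBU header. A self-contained sketch of the idea follows; sketch_uleb_encode is a hypothetical stand-in for aom_uleb_encode, and the 2-byte header plus 200-byte payload are made-up numbers.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Hypothetical minimal LEB128 encoder (stand-in for aom_uleb_encode).
static size_t sketch_uleb_encode(uint64_t value, uint8_t *out) {
  size_t n = 0;
  do {
    uint8_t byte = value & 0x7f;
    value >>= 7;
    if (value) byte |= 0x80;
    out[n++] = byte;
  } while (value);
  return n;
}

int main(void) {
  // The buffer already holds [2-byte OBU header][200-byte payload].
  uint8_t buf[512] = { 0 };
  const size_t obu_header_size = 2, obu_payload_size = 200;

  // Step 1 (av1_obu_memmove pattern): make room for the length field between
  // header and payload. A payload of 200 bytes needs a 2-byte LEB128 field.
  uint8_t field[8];
  const size_t length_field_size = sketch_uleb_encode(obu_payload_size, field);
  assert(length_field_size == 2);
  memmove(buf + obu_header_size + length_field_size, buf + obu_header_size,
          obu_payload_size);

  // Step 2 (av1_write_uleb_obu_size pattern): write the payload size just
  // after the OBU header; 200 encodes as 0xc8 0x01.
  memcpy(buf + obu_header_size, field, length_field_size);
  printf("length field: 0x%02x 0x%02x\n", (unsigned)buf[2], (unsigned)buf[3]);
  return 0;
}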
+static void write_tile_obu( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + // Fixed size tile groups for the moment + const int num_tg_hdrs = cpi->num_tg; + const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; + int tile_count = 0; + size_t curr_tg_data_size = 0; + uint8_t *tile_data_curr = dst; + int new_tg = 1; + int is_first_tg = 1; + + av1_reset_pack_bs_thread_data(&cpi->td); + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + + int is_last_tile_in_tg = 0; + if (new_tg) { + tile_data_curr = dst + *total_size; + tile_count = 0; + } + tile_count++; + + if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) + is_last_tile_in_tg = 1; + + xd->tile_ctx = &this_tile->tctx; + + // PackBSParams stores all parameters required to pack tile and header + // info. + PackBSParams pack_bs_params; + pack_bs_params.dst = dst; + pack_bs_params.curr_tg_hdr_size = 0; + pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg; + pack_bs_params.new_tg = new_tg; + pack_bs_params.obu_extn_header = obu_extn_header; + pack_bs_params.obu_header_size = 0; + pack_bs_params.saved_wb = saved_wb; + pack_bs_params.tile_col = tile_col; + pack_bs_params.tile_row = tile_row; + pack_bs_params.tile_data_curr = tile_data_curr; + pack_bs_params.total_size = total_size; + + if (new_tg) + av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx); + + av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params); + + if (new_tg) { + curr_tg_data_size = pack_bs_params.curr_tg_hdr_size; + *tile_data_start += pack_bs_params.curr_tg_hdr_size; + *obu_header_size = pack_bs_params.obu_header_size; + new_tg = 0; + } + if (is_last_tile_in_tg) new_tg = 1; + + curr_tg_data_size += + (pack_bs_params.buf.size + (is_last_tile_in_tg ? 0 : 4)); + + if (pack_bs_params.buf.size > *max_tile_size) { + *largest_tile_id = tile_idx; + *max_tile_size = (unsigned int)pack_bs_params.buf.size; + } + + if (is_last_tile_in_tg) + av1_write_last_tile_info(cpi, fh_info, saved_wb, &curr_tg_data_size, + tile_data_curr, total_size, tile_data_start, + largest_tile_id, &is_first_tg, + *obu_header_size, obu_extn_header); + *total_size += (uint32_t)pack_bs_params.buf.size; + } + } + av1_accumulate_pack_bs_thread_data(cpi, &cpi->td); +} + +// Write total buffer size and related information into the OBU header for +// default tile case. +static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + int largest_tile_id, uint32_t *const total_size, + unsigned int max_tile_size, + uint32_t obu_header_size, + uint8_t *tile_data_start) { + const CommonTileParams *const tiles = &cpi->common.tiles; + + // Fill in context_update_tile_id indicating the tile to use for the + // cdf update. 
The encoder currently sets it to the largest tile + // (but this is up to the encoder) + aom_wb_overwrite_literal(saved_wb, largest_tile_id, + (tiles->log2_cols + tiles->log2_rows)); + // If there is more than one tile group, tile_size_bytes takes the default + // value 4 and does not need to be set. For a single tile group it is set in + // the section below. + if (cpi->num_tg != 1) return; + int tile_size_bytes = 4, unused; + const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst); + const uint32_t tile_data_size = *total_size - tile_data_offset; + + *total_size = remux_tiles(tiles, tile_data_start, tile_data_size, + max_tile_size, 0, &tile_size_bytes, &unused); + *total_size += tile_data_offset; + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + + // Update the OBU length if remux_tiles() reduced the size. + uint64_t payload_size; + size_t length_field_size; + int res = + aom_uleb_decode(dst + obu_header_size, *total_size - obu_header_size, + &payload_size, &length_field_size); + assert(res == 0); + (void)res; + + const uint64_t new_payload_size = + *total_size - obu_header_size - length_field_size; + if (new_payload_size != payload_size) { + size_t new_length_field_size; + res = aom_uleb_encode(new_payload_size, length_field_size, + dst + obu_header_size, &new_length_field_size); + assert(res == 0); + if (new_length_field_size < length_field_size) { + const size_t src_offset = obu_header_size + length_field_size; + const size_t dst_offset = obu_header_size + new_length_field_size; + memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size); + *total_size -= (int)(length_field_size - new_length_field_size); + } + } +} + +// As per experiments, single-threaded bitstream packing is better for frames +// with a smaller bitstream size, because the setup overhead of the +// multithreaded path would exceed the time needed to pack such a small +// bitstream. This function computes the required number of workers based on +// the setup overhead and the job-dispatch overhead for the given tiles and +// available workers.
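Editorial aside (not part of the patch): the heuristic described in the comment above is implemented by calc_pack_bs_mt_workers(), which follows immediately. As a worked example of its scoring, with purely illustrative overhead constants (the real SETUP_TIME_OH_CONST and JOB_DISP_TIME_OH_CONST are defined elsewhere in bitstream.c and are not reproduced here):

#include <stdio.h>

int main(void) {
  // Assumed, illustrative values only.
  const float setup_oh = 5000.0f;     // stand-in for SETUP_TIME_OH_CONST
  const float job_disp_oh = 1250.0f;  // stand-in for JOB_DISP_TIME_OH_CONST
  const float frame_abs_sum_level = 200000.0f;
  const int num_tiles = 8, avail_workers = 4;

  int best_workers = 1;
  float best_score = 0.0f;
  for (int w = avail_workers; w > 1; w--) {
    // Same shape as the scoring below: work shared across workers minus the
    // per-worker setup cost and the per-worker share of job-dispatch cost.
    const float score = frame_abs_sum_level * (w - 1) / w - w * setup_oh -
                        (float)num_tiles * job_disp_oh / w;
    printf("%d workers -> score %.0f\n", w, score);
    if (score > best_score) {
      best_score = score;
      best_workers = w;
    }
  }
  printf("chosen workers: %d\n", best_workers);  // 4, for these numbers
  return 0;
}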
+int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles, + int avail_workers, bool pack_bs_mt_enabled) { + if (!pack_bs_mt_enabled) return 1; + + uint64_t frame_abs_sum_level = 0; + + for (int idx = 0; idx < num_tiles; idx++) + frame_abs_sum_level += tile_data[idx].abs_sum_level; + + int ideal_num_workers = 1; + const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST; + float max_sum = 0.0; + + for (int num_workers = avail_workers; num_workers > 1; num_workers--) { + const float fas_per_worker_const = + ((float)(num_workers - 1) / num_workers) * frame_abs_sum_level; + const float setup_time_const = (float)num_workers * SETUP_TIME_OH_CONST; + const float this_sum = fas_per_worker_const - setup_time_const - + job_disp_time_const / num_workers; + + if (this_sum > max_sum) { + max_sum = this_sum; + ideal_num_workers = num_workers; + } + } + return ideal_num_workers; +} + +static INLINE uint32_t pack_tiles_in_tg_obus( + AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id) { + const CommonTileParams *const tiles = &cpi->common.tiles; + uint32_t total_size = 0; + unsigned int max_tile_size = 0; + uint32_t obu_header_size = 0; + uint8_t *tile_data_start = dst; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int num_tiles = tile_rows * tile_cols; + + const int num_workers = calc_pack_bs_mt_workers( + cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS], + cpi->mt_info.pack_bs_mt_enabled); + + if (num_workers > 1) { + av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header, + fh_info, largest_tile_id, &max_tile_size, + &obu_header_size, &tile_data_start, num_workers); + } else { + write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header, + fh_info, largest_tile_id, &max_tile_size, &obu_header_size, + &tile_data_start); + } + + if (num_tiles > 1) + write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &total_size, + max_tile_size, obu_header_size, tile_data_start); + return total_size; +} + +static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + uint8_t obu_extension_header, + const FrameHeaderInfo *fh_info, + int *const largest_tile_id) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + *largest_tile_id = 0; + + // Select the coding strategy (temporal or spatial) + if (cm->seg.enabled && cm->seg.update_map) { + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + cm->seg.temporal_update = 0; + } else { + cm->seg.temporal_update = 1; + if (cpi->td.rd_counts.seg_tmp_pred_cost[0] < + cpi->td.rd_counts.seg_tmp_pred_cost[1]) + cm->seg.temporal_update = 0; + } + } + + if (tiles->large_scale) + return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb, + largest_tile_id); + + return pack_tiles_in_tg_obus(cpi, dst, saved_wb, obu_extension_header, + fh_info, largest_tile_id); +} + +static size_t av1_write_metadata_obu(const aom_metadata_t *metadata, + uint8_t *const dst) { + size_t coded_metadata_size = 0; + const uint64_t metadata_type = (uint64_t)metadata->type; + if (aom_uleb_encode(metadata_type, sizeof(metadata_type), dst, + &coded_metadata_size) != 0) { + return 0; + } + memcpy(dst + coded_metadata_size, metadata->payload, metadata->sz); + // Add trailing bits. 
+ dst[coded_metadata_size + metadata->sz] = 0x80; + return (uint32_t)(coded_metadata_size + metadata->sz + 1); +} + +static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) { + if (!cpi->source) return 0; + AV1_COMMON *const cm = &cpi->common; + aom_metadata_array_t *arr = cpi->source->metadata; + if (!arr) return 0; + size_t obu_header_size = 0; + size_t obu_payload_size = 0; + size_t total_bytes_written = 0; + size_t length_field_size = 0; + for (size_t i = 0; i < arr->sz; i++) { + aom_metadata_t *current_metadata = arr->metadata_array[i]; + if (current_metadata && current_metadata->payload) { + if ((cm->current_frame.frame_type == KEY_FRAME && + current_metadata->insert_flag == AOM_MIF_KEY_FRAME) || + (cm->current_frame.frame_type != KEY_FRAME && + current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) || + current_metadata->insert_flag == AOM_MIF_ANY_FRAME) { + obu_header_size = av1_write_obu_header(&cpi->ppi->level_params, + &cpi->frame_header_count, + OBU_METADATA, 0, dst); + obu_payload_size = + av1_write_metadata_obu(current_metadata, dst + obu_header_size); + length_field_size = + av1_obu_memmove(obu_header_size, obu_payload_size, dst); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) == + AOM_CODEC_OK) { + const size_t obu_size = obu_header_size + obu_payload_size; + dst += obu_size + length_field_size; + total_bytes_written += obu_size + length_field_size; + } else { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Error writing metadata OBU size"); + } + } + } + } + return total_bytes_written; +} + +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, + int *const largest_tile_id) { + uint8_t *data = dst; + uint32_t data_size; + AV1_COMMON *const cm = &cpi->common; + AV1LevelParams *const level_params = &cpi->ppi->level_params; + uint32_t obu_header_size = 0; + uint32_t obu_payload_size = 0; + FrameHeaderInfo fh_info = { NULL, 0, 0 }; + const uint8_t obu_extension_header = + cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0; + + // If no non-zero delta_q has been used, reset delta_q_present_flag + if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { + cm->delta_q_info.delta_q_present_flag = 0; + } + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_reset_write(); +#endif + + cpi->frame_header_count = 0; + + // The TD is now written outside the frame encode loop + + // write sequence header obu at each key frame or intra_only frame, + // preceded by 4-byte size + if (cm->current_frame.frame_type == INTRA_ONLY_FRAME || + cm->current_frame.frame_type == KEY_FRAME) { + obu_header_size = av1_write_obu_header( + level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data); + obu_payload_size = + av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size); + const size_t length_field_size = + av1_obu_memmove(obu_header_size, obu_payload_size, data); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + data += obu_header_size + obu_payload_size + length_field_size; + } + + // write metadata obus before the frame obu that has the show_frame flag set + if (cm->show_frame) data += av1_write_metadata_array(cpi, data); + + const int write_frame_header = + (cpi->num_tg > 1 || encode_show_existing_frame(cm)); + struct aom_write_bit_buffer saved_wb = { NULL, 0 }; + size_t length_field = 0; + if (write_frame_header) { + // Write Frame Header OBU. 
+ fh_info.frame_header = data; + obu_header_size = + av1_write_obu_header(level_params, &cpi->frame_header_count, + OBU_FRAME_HEADER, obu_extension_header, data); + obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb, + data + obu_header_size, 1); + + length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + fh_info.obu_header_byte_offset = 0; + fh_info.total_length = obu_header_size + obu_payload_size + length_field; + data += fh_info.total_length; + } + + if (encode_show_existing_frame(cm)) { + data_size = 0; + } else { + // Since length_field is determined adaptively after frame header + // encoding, saved_wb must be adjusted accordingly. + if (saved_wb.bit_buffer != NULL) { + saved_wb.bit_buffer += length_field; + } + + // Each tile group obu will be preceded by 4-byte size of the tile group + // obu + data_size = write_tiles_in_tg_obus( + cpi, data, &saved_wb, obu_extension_header, &fh_info, largest_tile_id); + } + data += data_size; + *size = data - dst; + return AOM_CODEC_OK; +} diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h new file mode 100644 index 0000000000..12e8a630db --- /dev/null +++ b/third_party/aom/av1/encoder/bitstream.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_BITSTREAM_H_ +#define AOM_AV1_ENCODER_BITSTREAM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" +#include "av1/encoder/level.h" +#include "aom_dsp/bitwriter.h" + +struct aom_write_bit_buffer; +struct AV1_COMP; +struct ThreadData; + +/*!\cond */ + +// Stores the location and size of a tile's data in the bitstream. 
Used for +// later identifying identical tiles +typedef struct { + uint8_t *data; + size_t size; +} TileBufferEnc; + +typedef struct { + uint8_t *frame_header; + size_t obu_header_byte_offset; + size_t total_length; +} FrameHeaderInfo; + +typedef struct { + struct aom_write_bit_buffer *saved_wb; // Bit stream buffer writer structure + TileBufferEnc buf; // Structure to hold bitstream buffer and size + uint32_t *total_size; // Running total of bytes written to the output bitstream + uint8_t *dst; // Base address of tile bitstream buffer + uint8_t *tile_data_curr; // Base address of tile-group bitstream buffer + size_t tile_buf_size; // Available bitstream buffer for the tile in bytes + uint8_t obu_extn_header; // OBU extension header byte (zero when absent) + uint32_t obu_header_size; // Size of the OBU header + int curr_tg_hdr_size; // Size of the obu, tg, frame headers + int tile_size_mi; // Tile size in mi units + int tile_row; // Row index of the current tile + int tile_col; // Column index of the current tile + int is_last_tile_in_tg; // Flag to indicate last tile in a tile-group + int new_tg; // Flag to indicate starting of a new tile-group +} PackBSParams; + +typedef struct { + uint64_t abs_sum_level; + uint16_t tile_idx; +} PackBSTileOrder; + +// Pack bitstream data for pack bitstream multi-threading. +typedef struct { +#if CONFIG_MULTITHREAD + // Mutex lock used while dispatching jobs. + pthread_mutex_t *mutex_; +#endif + // Tile order structure of pack bitstream multithreading. + PackBSTileOrder pack_bs_tile_order[MAX_TILES]; + + // Index of next job to be processed. + int next_job_idx; + // Initialized to false, set to true by the worker thread that encounters an + // error in order to abort the processing of other worker threads. + bool pack_bs_mt_exit; +} AV1EncPackBSSync; + +/*!\endcond */ + +// Writes only the OBU Sequence Header payload, and returns the size of the +// payload written to 'dst'. This function does not write the OBU header, the +// optional extension, or the OBU size to 'dst'. +uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, + uint8_t *const dst); + +// Writes the OBU header byte, and the OBU header extension byte when +// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'. +uint32_t av1_write_obu_header(AV1LevelParams *const level_params, + int *frame_header_count, OBU_TYPE obu_type, + int obu_extension, uint8_t *const dst); + +int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, + uint8_t *dest); + +// Pack tile data in the bitstream with tile_group, frame +// and OBU header.
+void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td, + PackBSParams *const pack_bs_params); + +void av1_write_last_tile_info( + struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, + struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, + uint8_t *curr_tg_start, uint32_t *const total_size, + uint8_t **tile_data_start, int *const largest_tile_id, + int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header); + +/*!\brief Pack the bitstream for one frame + * + * \ingroup high_level_algo + * \callgraph + */ +int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t *size, + int *const largest_tile_id); + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, + TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w); + +void av1_reset_pack_bs_thread_data(struct ThreadData *const td); + +void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi, + struct ThreadData const *td); + +void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi, + MACROBLOCKD *const xd, + PackBSParams *const pack_bs_params, + const int tile_idx); + +int av1_neg_interleave(int x, int ref, int max); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_BITSTREAM_H_ diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h new file mode 100644 index 0000000000..33d2d8c2a0 --- /dev/null +++ b/third_party/aom/av1/encoder/block.h @@ -0,0 +1,1515 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*! \file + * Declares various structs used to encode the current partition block. + */ +#ifndef AOM_AV1_ENCODER_BLOCK_H_ +#define AOM_AV1_ENCODER_BLOCK_H_ + +#include "av1/common/blockd.h" +#include "av1/common/entropymv.h" +#include "av1/common/entropy.h" +#include "av1/common/enums.h" +#include "av1/common/mvref_common.h" + +#include "av1/encoder/enc_enums.h" +#include "av1/encoder/mcomp_structs.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/partition_cnn_weights.h" +#endif + +#include "av1/encoder/hash_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//! Minimum linear dimension of a tpl block +#define MIN_TPL_BSIZE_1D 16 +//! Maximum number of tpl block in a super block +#define MAX_TPL_BLK_IN_SB (MAX_SB_SIZE / MIN_TPL_BSIZE_1D) +//! Number of txfm hash records kept for the partition block. +#define RD_RECORD_BUFFER_LEN 8 + +/*! Maximum value taken by transform type probabilities */ +#define MAX_TX_TYPE_PROB 1024 + +//! Compute color sensitivity index for given plane +#define COLOR_SENS_IDX(plane) ((plane)-1) + +//! 
Enable timer statistics of mode search in non-rd +#define COLLECT_NONRD_PICK_MODE_STAT 0 + +/*!\cond */ +#if COLLECT_NONRD_PICK_MODE_STAT +#include "aom_ports/aom_timer.h" + +typedef struct _mode_search_stat_nonrd { + int32_t num_blocks[BLOCK_SIZES]; + int64_t total_block_times[BLOCK_SIZES]; + int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT]; + int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t ms_time[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t ifs_time[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t model_rd_time[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t txfm_time[BLOCK_SIZES][MB_MODE_COUNT]; + struct aom_usec_timer timer1; + struct aom_usec_timer timer2; + struct aom_usec_timer bsize_timer; +} mode_search_stat_nonrd; +#endif // COLLECT_NONRD_PICK_MODE_STAT +/*!\endcond */ + +/*! \brief Superblock level encoder info + * + * SuperblockEnc stores superblock level information used by the encoder for + * more efficient encoding. Currently this is mostly used to store TPL data + * for the current superblock. + */ +typedef struct { + //! Maximum partition size for the sb. + BLOCK_SIZE min_partition_size; + //! Minimum partition size for the sb. + BLOCK_SIZE max_partition_size; + + /***************************************************************************** + * \name TPL Info + * + * Information gathered from tpl_model at tpl block precision for the + * superblock to speed up the encoding process.. + ****************************************************************************/ + /**@{*/ + //! Number of TPL blocks in this superblock. + int tpl_data_count; + //! TPL's estimate of inter cost for each tpl block. + int64_t tpl_inter_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB]; + //! TPL's estimate of tpl cost for each tpl block. + int64_t tpl_intra_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB]; + //! Motion vectors found by TPL model for each tpl block. + int_mv tpl_mv[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB][INTER_REFS_PER_FRAME]; + //! TPL's stride for the arrays in this struct. + int tpl_stride; + /**@}*/ +} SuperBlockEnc; + +/*! \brief Stores the best performing modes. + */ +typedef struct { + //! The mbmi used to reconstruct the winner mode. + MB_MODE_INFO mbmi; + //! Rdstats of the winner mode. + RD_STATS rd_cost; + //! Rdcost of the winner mode + int64_t rd; + //! Luma rate of the winner mode. + int rate_y; + //! Chroma rate of the winner mode. + int rate_uv; + //! The color map needed to reconstruct palette mode. + uint8_t color_index_map[MAX_SB_SQUARE]; + //! The current winner mode. + THR_MODES mode_index; +} WinnerModeStats; + +/*! \brief Each source plane of the current macroblock + * + * This struct also stores the txfm buffers and quantizer settings. + */ +typedef struct macroblock_plane { + //! Stores source - pred so the txfm can be computed later + int16_t *src_diff; + //! Dequantized coefficients + tran_low_t *dqcoeff; + //! Quantized coefficients + tran_low_t *qcoeff; + //! Transformed coefficients + tran_low_t *coeff; + //! Location of the end of qcoeff (end of block). + uint16_t *eobs; + //! Contexts used to code the transform coefficients. + uint8_t *txb_entropy_ctx; + //! A buffer containing the source frame. + struct buf_2d src; + + /*! \name Quantizer Settings + * + * \attention These are used/accessed only in the quantization process. + * RDO does not and *must not* depend on any of these values. 
+ * All values below share the coefficient scale/shift used in TX. + */ + /**@{*/ + //! Quantization step size used by AV1_XFORM_QUANT_FP. + const int16_t *quant_fp_QTX; + //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_FP. + const int16_t *round_fp_QTX; + //! Quantization step size used by AV1_XFORM_QUANT_B. + const int16_t *quant_QTX; + //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_B. + const int16_t *round_QTX; + //! Scale factor to shift coefficients toward zero. Only used by QUANT_B. + const int16_t *quant_shift_QTX; + //! Size of the quantization bin around 0. Only Used by QUANT_B + const int16_t *zbin_QTX; + //! Dequantizer + const int16_t *dequant_QTX; + /**@}*/ +} MACROBLOCK_PLANE; + +/*! \brief Costs for encoding the coefficients within a level. + * + * Covers everything including txb_skip, eob, dc_sign, + */ +typedef struct { + //! Cost to skip txfm for the current txfm block. + int txb_skip_cost[TXB_SKIP_CONTEXTS][2]; + /*! \brief Cost for encoding the base_eob of a level. + * + * Decoder uses base_eob to derive the base_level as base_eob := base_eob+1. + */ + int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3]; + /*! \brief Cost for encoding the base level of a coefficient. + * + * Decoder derives coeff_base as coeff_base := base_eob + 1. + */ + int base_cost[SIG_COEF_CONTEXTS][8]; + /*! \brief Cost for encoding the last non-zero coefficient. + * + * Eob is derived from eob_extra at the decoder as eob := eob_extra + 1 + */ + int eob_extra_cost[EOB_COEF_CONTEXTS][2]; + //! Cost for encoding the dc_sign + int dc_sign_cost[DC_SIGN_CONTEXTS][2]; + //! Cost for encoding an increment to the coefficient + int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1]; +} LV_MAP_COEFF_COST; + +/*! \brief Costs for encoding the eob. + */ +typedef struct { + //! eob_cost. + int eob_cost[2][11]; +} LV_MAP_EOB_COST; + +/*! \brief Stores the transforms coefficients for the whole superblock. + */ +typedef struct { + //! The transformed coefficients. + tran_low_t *tcoeff[MAX_MB_PLANE]; + //! Where the transformed coefficients end. + uint16_t *eobs[MAX_MB_PLANE]; + /*! \brief Transform block entropy contexts. + * + * Each element is used as a bit field. + * - Bits 0~3: txb_skip_ctx + * - Bits 4~5: dc_sign_ctx. + */ + uint8_t *entropy_ctx[MAX_MB_PLANE]; +} CB_COEFF_BUFFER; + +/*! \brief Extended mode info derived from mbmi. + */ +typedef struct { + // TODO(angiebird): Reduce the buffer size according to sb_type + //! The reference mv list for the current block. + CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE]; + //! The weights used to compute the ref mvs. + uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE]; + //! Number of ref mvs in the drl. + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; + //! Global mvs + int_mv global_mvs[REF_FRAMES]; + //! Context used to encode the current mode. + int16_t mode_context[MODE_CTX_REF_FRAMES]; +} MB_MODE_INFO_EXT; + +/*! \brief Stores best extended mode information at frame level. + * + * The frame level in here is used in bitstream preparation stage. The + * information in \ref MB_MODE_INFO_EXT are copied to this struct to save + * memory. + */ +typedef struct { + //! \copydoc MB_MODE_INFO_EXT::ref_mv_stack + CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE]; + //! \copydoc MB_MODE_INFO_EXT::weight + uint16_t weight[USABLE_REF_MV_STACK_SIZE]; + //! 
\copydoc MB_MODE_INFO_EXT::ref_mv_count + uint8_t ref_mv_count; + // TODO(Ravi/Remya): Reduce the buffer size of global_mvs + //! \copydoc MB_MODE_INFO_EXT::global_mvs + int_mv global_mvs[REF_FRAMES]; + //! \copydoc MB_MODE_INFO_EXT::mode_context + int16_t mode_context; + //! Offset of current coding block's coeff buffer relative to the sb. + uint16_t cb_offset[PLANE_TYPES]; +} MB_MODE_INFO_EXT_FRAME; + +/*! \brief Inter-mode txfm results for a partition block. + */ +typedef struct { + //! Txfm size used if the current mode is intra mode. + TX_SIZE tx_size; + //! Txfm sizes used if the current mode is inter mode. + TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + //! Map showing which txfm block skips the txfm process. + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + //! Map showing the txfm types for each block. + uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + //! Rd_stats for the whole partition block. + RD_STATS rd_stats; + //! Hash value of the current record. + uint32_t hash_value; +} MB_RD_INFO; + +/*! \brief Hash records of the inter-mode transform results + * + * Hash records of the inter-mode transform results for a whole partition block + * based on the residue. Since this operates on the partition block level, this + * can give us a whole txfm partition tree. + */ +typedef struct { + /*! Circular buffer that stores the inter-mode txfm results of a partition + * block. + */ + MB_RD_INFO mb_rd_info[RD_RECORD_BUFFER_LEN]; + //! Index to insert the newest rd record. + int index_start; + //! Number of info stored in this record. + int num; + //! Hash function + CRC32C crc_calculator; +} MB_RD_RECORD; + +//! Number of compound rd stats +#define MAX_COMP_RD_STATS 64 +/*! \brief Rdcost stats in compound mode. + */ +typedef struct { + //! Rate of the compound modes. + int32_t rate[COMPOUND_TYPES]; + //! Distortion of the compound modes. + int64_t dist[COMPOUND_TYPES]; + //! Estimated rate of the compound modes. + int32_t model_rate[COMPOUND_TYPES]; + //! Estimated distortion of the compound modes. + int64_t model_dist[COMPOUND_TYPES]; + //! Rate need to send the mask type. + int comp_rs2[COMPOUND_TYPES]; + //! Motion vector for each predictor. + int_mv mv[2]; + //! Ref frame for each predictor. + MV_REFERENCE_FRAME ref_frames[2]; + //! Current prediction mode. + PREDICTION_MODE mode; + //! Current interpolation filter. + int_interpfilters filter; + //! Refmv index in the drl. + int ref_mv_idx; + //! Whether the predictors are GLOBALMV. + int is_global[2]; + //! Current parameters for interinter mode. + INTERINTER_COMPOUND_DATA interinter_comp; +} COMP_RD_STATS; + +/*! \brief Contains buffers used to speed up rdopt for obmc. + * + * See the comments for calc_target_weighted_pred for details. + */ +typedef struct { + /*! \brief A new source weighted with the above and left predictors. + * + * Used to efficiently construct multiple obmc predictors during rdopt. + */ + int32_t *wsrc; + /*! \brief A new mask constructed from the original horz/vert mask. + * + * \copydetails wsrc + */ + int32_t *mask; + /*! \brief Prediction from the up predictor. + * + * Used to build the obmc predictor. + */ + uint8_t *above_pred; + /*! \brief Prediction from the up predictor. + * + * \copydetails above_pred + */ + uint8_t *left_pred; +} OBMCBuffer; + +/*! \brief Contains color maps used in palette mode. + */ +typedef struct { + //! The best color map found. + uint8_t best_palette_color_map[MAX_PALETTE_SQUARE]; + //! A temporary buffer used for k-means clustering. 
+ int16_t kmeans_data_buf[2 * MAX_PALETTE_SQUARE]; +} PALETTE_BUFFER; + +/*! \brief Contains buffers used by av1_compound_type_rd() + * + * For sizes and alignment of these arrays, refer to + * alloc_compound_type_rd_buffers() function. + */ +typedef struct { + //! First prediction. + uint8_t *pred0; + //! Second prediction. + uint8_t *pred1; + //! Source - first prediction. + int16_t *residual1; + //! Second prediction - first prediction. + int16_t *diff10; + //! Backup of the best segmentation mask. + uint8_t *tmp_best_mask_buf; +} CompoundTypeRdBuffers; + +/*! \brief Holds some parameters related to partitioning schemes in AV1. + */ +// TODO(chiyotsai@google.com): Consolidate this with SIMPLE_MOTION_DATA_TREE +typedef struct { +#if !CONFIG_REALTIME_ONLY + // The following 4 parameters are used for cnn-based partitioning on intra + // frame. + /*! \brief Current index on the partition block quad tree. + * + * Used to index into the cnn buffer for partition decision. + */ + int quad_tree_idx; + //! Whether the CNN buffer contains valid output. + int cnn_output_valid; + //! A buffer used by our segmentation CNN for intra-frame partitioning. + float cnn_buffer[CNN_OUT_BUF_SIZE]; + //! log of the quantization parameter of the ancestor BLOCK_64X64. + float log_q; +#endif + + /*! \brief Variance of the subblocks in the superblock. + * + * This is used by rt mode for variance based partitioning. + * The indices corresponds to the following block sizes: + * - 0 - 128x128 + * - 1-2 - 128x64 + * - 3-4 - 64x128 + * - 5-8 - 64x64 + * - 9-16 - 64x32 + * - 17-24 - 32x64 + * - 25-40 - 32x32 + * - 41-104 - 16x16 + */ + uint8_t variance_low[105]; +} PartitionSearchInfo; + +/*!\cond */ +enum { + /** + * Do not prune transform depths. + */ + TX_PRUNE_NONE = 0, + /** + * Prune largest transform (depth 0) based on NN model. + */ + TX_PRUNE_LARGEST = 1, + /** + * Prune split transforms (depth>=1) based on NN model. + */ + TX_PRUNE_SPLIT = 2, +} UENUM1BYTE(TX_PRUNE_TYPE); +/*!\endcond */ + +/*! \brief Defines the parameters used to perform txfm search. + * + * For the most part, this determines how various speed features are used. + */ +typedef struct { + /*! \brief Whether to limit the intra txfm search type to the default txfm. + * + * This could either be a result of either sequence parameter or speed + * features. + */ + int use_default_intra_tx_type; + + /*! Probability threshold used for conditionally forcing tx type*/ + int default_inter_tx_type_prob_thresh; + + //! Whether to prune 2d transforms based on 1d transform results. + int prune_2d_txfm_mode; + + /*! \brief Variable from \ref WinnerModeParams based on current eval mode. + * + * See the documentation for \ref WinnerModeParams for more detail. + */ + unsigned int coeff_opt_thresholds[2]; + /*! \copydoc coeff_opt_thresholds */ + unsigned int tx_domain_dist_threshold; + /*! \copydoc coeff_opt_thresholds */ + TX_SIZE_SEARCH_METHOD tx_size_search_method; + /*! \copydoc coeff_opt_thresholds */ + unsigned int use_transform_domain_distortion; + /*! \copydoc coeff_opt_thresholds */ + unsigned int skip_txfm_level; + + /*! \brief How to search for the optimal tx_size + * + * If ONLY_4X4, use TX_4X4; if TX_MODE_LARGEST, use the largest tx_size for + * the current partition block; if TX_MODE_SELECT, search through the whole + * tree. + * + * \attention + * Although this looks suspicious similar to a bitstream element, this + * tx_mode_search_type is only used internally by the encoder, and is *not* + * written to the bitstream. 
It determines what kind of tx_mode would be
+   * searched. For example, we might set it to TX_MODE_LARGEST to find a good
+   * candidate, then code it as TX_MODE_SELECT.
+   */
+  TX_MODE tx_mode_search_type;
+
+  /*!
+   * Determines whether a block can be predicted as transform skip or DC only
+   * based on residual mean and variance.
+   * Type 0 : No skip block or DC only block prediction
+   * Type 1 : Prediction of skip block based on residual mean and variance
+   * Type 2 : Prediction of skip block or DC only block based on residual mean
+   * and variance
+   */
+  unsigned int predict_dc_level;
+
+  /*!
+   * Whether or not we should use the quantization matrix as weights for PSNR
+   * during RD search.
+   */
+  int use_qm_dist_metric;
+
+  /*!
+   * Keep track of previous mode evaluation stage type. This will be used to
+   * reset mb rd hash record when mode evaluation type changes.
+   */
+  int mode_eval_type;
+
+#if !CONFIG_REALTIME_ONLY
+  //! Indicates the transform depths for which RD evaluation is skipped.
+  TX_PRUNE_TYPE nn_prune_depths_for_intra_tx;
+
+  /*! \brief Indicates if NN model should be invoked to prune transform depths.
+   *
+   * Used to signal whether NN model should be evaluated to prune the R-D
+   * evaluation of specific transform depths.
+   */
+  bool enable_nn_prune_intra_tx_depths;
+#endif
+} TxfmSearchParams;
+
+/*!\cond */
+#define MAX_NUM_8X8_TXBS ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1))
+#define MAX_NUM_16X16_TXBS ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2))
+#define MAX_NUM_32X32_TXBS ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3))
+#define MAX_NUM_64X64_TXBS ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4))
+/*!\endcond */
+
+/*! \brief Stores various encoding/search decisions related to txfm search.
+ *
+ * This struct contains a cache of previous txfm results, and some buffers for
+ * the current txfm decision.
+ */
+typedef struct {
+  //! Whether to skip transform and quantization on a partition block level.
+  uint8_t skip_txfm;
+
+  /*! \brief Whether to skip transform and quantization on a txfm block level.
+   *
+   * Skips transform and quantization on a transform block level inside the
+   * current partition block. Each element of this array is used as a bit-field.
+   * So, for example, if we are skipping on the luma plane, then the last bit
+   * would be set to 1.
+   */
+  uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+  /*! \brief Transform types inside the partition block.
+   *
+   * Keeps a record of what kind of transform to use for each of the transform
+   * blocks inside the partition block.
+   * \attention The buffer here is *never* directly used. Instead, this just
+   * allocates the memory for MACROBLOCKD::tx_type_map during rdopt on the
+   * partition block. So if we need to save memory, we could move the allocation
+   * to pick_sb_mode instead.
+   */
+  uint8_t tx_type_map_[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+  //! Txfm hash records of inter-modes.
+  MB_RD_RECORD *mb_rd_record;
+
+  /*! \brief Number of txb splits.
+   *
+   * Keep track of how many times we've used split tx partition for transform
+   * blocks. Somewhat misleadingly, this parameter doesn't actually keep track
+   * of the count of the current block. Instead, it's a cumulative count across
+   * the whole frame. The main usage is that if txb_split_count is zero, then
+   * we can signal TX_MODE_LARGEST at frame level.
+   */
+  // TODO(chiyotsai@google.com): Move this to a more appropriate location such
+  // as ThreadData.
+  unsigned int txb_split_count;
+#if CONFIG_SPEED_STATS
+  //! For debugging.
Used to check how many txfm searches we are doing. + unsigned int tx_search_count; +#endif // CONFIG_SPEED_STATS +} TxfmSearchInfo; +#undef MAX_NUM_8X8_TXBS +#undef MAX_NUM_16X16_TXBS +#undef MAX_NUM_32X32_TXBS +#undef MAX_NUM_64X64_TXBS + +/*! \brief Holds the entropy costs for various modes sent to the bitstream. + * + * \attention This does not include the costs for mv and transformed + * coefficients. + */ +typedef struct { + /***************************************************************************** + * \name Partition Costs + ****************************************************************************/ + /**@{*/ + //! Cost for coding the partition. + int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + /**@}*/ + + /***************************************************************************** + * \name Intra Costs: General + ****************************************************************************/ + /**@{*/ + //! Luma mode cost for inter frame. + int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES]; + //! Luma mode cost for intra frame. + int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + //! Chroma mode cost + int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; + //! filter_intra_cost + int filter_intra_cost[BLOCK_SIZES_ALL][2]; + //! filter_intra_mode_cost + int filter_intra_mode_cost[FILTER_INTRA_MODES]; + //! angle_delta_cost + int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; + + //! Rate rate associated with each alpha codeword + int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE]; + /**@}*/ + + /***************************************************************************** + * \name Intra Costs: Screen Contents + ****************************************************************************/ + /**@{*/ + //! intrabc_cost + int intrabc_cost[2]; + + //! palette_y_size_cost + int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + //! palette_uv_size_cost + int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + //! palette_y_color_cost + int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + //! palette_uv_color_cost + int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + //! palette_y_mode_cost + int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + //! palette_uv_mode_cost + int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2]; + /**@}*/ + + /***************************************************************************** + * \name Inter Costs: MV Modes + ****************************************************************************/ + /**@{*/ + //! skip_mode_cost + int skip_mode_cost[SKIP_MODE_CONTEXTS][2]; + //! newmv_mode_cost + int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2]; + //! zeromv_mode_cost + int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2]; + //! refmv_mode_cost + int refmv_mode_cost[REFMV_MODE_CONTEXTS][2]; + //! drl_mode_cost0 + int drl_mode_cost0[DRL_MODE_CONTEXTS][2]; + /**@}*/ + + /***************************************************************************** + * \name Inter Costs: Ref Frame Types + ****************************************************************************/ + /**@{*/ + //! single_ref_cost + int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2]; + //! comp_inter_cost + int comp_inter_cost[COMP_INTER_CONTEXTS][2]; + //! comp_ref_type_cost + int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(COMP_REFERENCE_TYPES)]; + //! 
uni_comp_ref_cost + int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + /*! \brief Cost for signaling ref_frame[0] in bidir-comp mode + * + * Includes LAST_FRAME, LAST2_FRAME, LAST3_FRAME, and GOLDEN_FRAME. + */ + int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2]; + /*! \brief Cost for signaling ref_frame[1] in bidir-comp mode + * + * Includes ALTREF_FRAME, ALTREF2_FRAME, and BWDREF_FRAME. + */ + int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2]; + /**@}*/ + + /***************************************************************************** + * \name Inter Costs: Compound Types + ****************************************************************************/ + /**@{*/ + //! intra_inter_cost + int intra_inter_cost[INTRA_INTER_CONTEXTS][2]; + //! inter_compound_mode_cost + int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; + //! compound_type_cost + int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; + //! wedge_idx_cost + int wedge_idx_cost[BLOCK_SIZES_ALL][16]; + //! interintra_cost + int interintra_cost[BLOCK_SIZE_GROUPS][2]; + //! wedge_interintra_cost + int wedge_interintra_cost[BLOCK_SIZES_ALL][2]; + //! interintra_mode_cost + int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; + /**@}*/ + + /***************************************************************************** + * \name Inter Costs: Compound Masks + ****************************************************************************/ + /**@{*/ + //! comp_idx_cost + int comp_idx_cost[COMP_INDEX_CONTEXTS][2]; + //! comp_group_idx_cost + int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2]; + /**@}*/ + + /***************************************************************************** + * \name Inter Costs: Motion Modes/Filters + ****************************************************************************/ + /**@{*/ + //! motion_mode_cost + int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES]; + //! motion_mode_cost1 + int motion_mode_cost1[BLOCK_SIZES_ALL][2]; + //! switchable_interp_costs + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; + /**@}*/ + + /***************************************************************************** + * \name Txfm Mode Costs + ****************************************************************************/ + /**@{*/ + //! skip_txfm_cost + int skip_txfm_cost[SKIP_CONTEXTS][2]; + //! tx_size_cost + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; + //! txfm_partition_cost + int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2]; + //! inter_tx_type_costs + int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; + //! intra_tx_type_costs + int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [TX_TYPES]; + /**@}*/ + + /***************************************************************************** + * \name Restoration Mode Costs + ****************************************************************************/ + /**@{*/ + //! switchable_restore_cost + int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES]; + //! wiener_restore_cost + int wiener_restore_cost[2]; + //! sgrproj_restore_cost + int sgrproj_restore_cost[2]; + /**@}*/ + + /***************************************************************************** + * \name Segmentation Mode Costs + ****************************************************************************/ + /**@{*/ + //! tmp_pred_cost + int tmp_pred_cost[SEG_TEMPORAL_PRED_CTXS][2]; + //! 
spatial_pred_cost + int spatial_pred_cost[SPATIAL_PREDICTION_PROBS][MAX_SEGMENTS]; + /**@}*/ +} ModeCosts; + +/*! \brief Holds mv costs for encoding and motion search. + */ +typedef struct { + /***************************************************************************** + * \name Encoding Costs + * Here are the entropy costs needed to encode a given mv. + * \ref nmv_cost_alloc and \ref nmv_cost_hp_alloc are two arrays that holds + * the memory for holding the mv cost. But since the motion vectors can be + * negative, we shift them to the middle and store the resulting pointer in + * \ref nmv_cost and \ref nmv_cost_hp for easier referencing. Finally, \ref + * mv_cost_stack points to the \ref nmv_cost with the mv precision we are + * currently working with. In essence, only \ref mv_cost_stack is needed for + * motion search, the other can be considered private. + ****************************************************************************/ + /**@{*/ + //! Costs for coding the zero components. + int nmv_joint_cost[MV_JOINTS]; + + //! Allocates memory for 1/4-pel motion vector costs. + int nmv_cost_alloc[2][MV_VALS]; + //! Allocates memory for 1/8-pel motion vector costs. + int nmv_cost_hp_alloc[2][MV_VALS]; + //! Points to the middle of \ref nmv_cost_alloc + int *nmv_cost[2]; + //! Points to the middle of \ref nmv_cost_hp_alloc + int *nmv_cost_hp[2]; + //! Points to the nmv_cost_hp in use. + int **mv_cost_stack; + /**@}*/ +} MvCosts; + +/*! \brief Holds mv costs for intrabc. + */ +typedef struct { + /*! Costs for coding the joint mv. */ + int joint_mv[MV_JOINTS]; + + /*! \brief Cost of transmitting the actual motion vector. + * dv_costs_alloc[0][i] is the cost of motion vector with horizontal + * component (mv_row) equal to i - MV_MAX. dv_costs_alloc[1][i] is the cost of + * motion vector with vertical component (mv_col) equal to i - MV_MAX. + */ + int dv_costs_alloc[2][MV_VALS]; + + /*! Points to the middle of \ref dv_costs_alloc. */ + int *dv_costs[2]; +} IntraBCMVCosts; + +/*! \brief Holds the costs needed to encode the coefficients + */ +typedef struct { + //! Costs for coding the coefficients. + LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES]; + //! Costs for coding the eobs. + LV_MAP_EOB_COST eob_costs[7][2]; +} CoeffCosts; + +/*!\cond */ +// 4: NEAREST, NEW, NEAR, GLOBAL +#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4) +/*!\endcond */ +struct inter_modes_info; + +/*! \brief Holds the motion samples for warp motion model estimation + */ +typedef struct { + //! Number of samples. + int num; + //! Sample locations in current frame. + int pts[16]; + //! Sample location in the reference frame. + int pts_inref[16]; +} WARP_SAMPLE_INFO; + +/*!\cond */ +typedef enum { + kZeroSad = 0, + kVeryLowSad = 1, + kLowSad = 2, + kMedSad = 3, + kHighSad = 4 +} SOURCE_SAD; + +typedef struct { + //! SAD levels in non-rd path + SOURCE_SAD source_sad_nonrd; + //! SAD levels in rd-path for var-based part qindex thresholds + SOURCE_SAD source_sad_rd; + int lighting_change; + int low_sumdiff; +} CONTENT_STATE_SB; + +// Structure to hold pixel level gradient info. +typedef struct { + uint16_t abs_dx_abs_dy_sum; + int8_t hist_bin_idx; + bool is_dx_zero; +} PixelLevelGradientInfo; + +// Structure to hold the variance and log(1 + variance) for 4x4 sub-blocks. +typedef struct { + double log_var; + int var; +} Block4x4VarInfo; + +#ifndef NDEBUG +typedef struct SetOffsetsLoc { + int mi_row; + int mi_col; + BLOCK_SIZE bsize; +} SetOffsetsLoc; +#endif // NDEBUG + +/*!\endcond */ + +/*! 
\brief Encoder's parameters related to the current coding block. + * + * This struct contains most of the information the encoder needs to encode the + * current coding block. This includes the src and pred buffer, a copy of the + * decoder's view of the current block, the txfm coefficients. This struct also + * contains various buffers and data used to speed up the encoding process. + */ +typedef struct macroblock { + /***************************************************************************** + * \name Source, Buffers and Decoder + ****************************************************************************/ + /**@{*/ + /*! \brief Each of the encoding plane. + * + * An array holding the src buffer for each of plane of the current block. It + * also contains the txfm and quantized txfm coefficients. + */ + struct macroblock_plane plane[MAX_MB_PLANE]; + + /*! \brief Decoder's view of current coding block. + * + * Contains the encoder's copy of what the decoder sees in the current block. + * Most importantly, this struct contains pointers to mbmi that is used in + * final bitstream packing. + */ + MACROBLOCKD e_mbd; + + /*! \brief Derived coding information. + * + * Contains extra information not transmitted in the bitstream but are + * derived. For example, this contains the stack of ref_mvs. + */ + MB_MODE_INFO_EXT mbmi_ext; + + /*! \brief Finalized mbmi_ext for the whole frame. + * + * Contains the finalized info in mbmi_ext that gets used at the frame level + * for bitstream packing. + */ + MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame; + + //! Entropy context for the current row. + FRAME_CONTEXT *row_ctx; + /*! \brief Entropy context for the current tile. + * + * This context will be used to update color_map_cdf pointer which would be + * used during pack bitstream. For single thread and tile-multithreading case + * this pointer will be same as xd->tile_ctx, but for the case of row-mt: + * xd->tile_ctx will point to a temporary context while tile_pb_ctx will point + * to the accurate tile context. + */ + FRAME_CONTEXT *tile_pb_ctx; + + /*! \brief Buffer of transformed coefficients + * + * Points to cb_coef_buff in the AV1_COMP struct, which contains the finalized + * coefficients. This is here to conveniently copy the best coefficients to + * frame level for bitstream packing. Since CB_COEFF_BUFFER is allocated on a + * superblock level, we need to combine it with cb_offset to get the proper + * position for the current coding block. + */ + CB_COEFF_BUFFER *cb_coef_buff; + //! Offset of current coding block's coeff buffer relative to the sb. + uint16_t cb_offset[PLANE_TYPES]; + + //! Modified source and masks used for fast OBMC search. + OBMCBuffer obmc_buffer; + //! Buffer to store the best palette map. + PALETTE_BUFFER *palette_buffer; + //! Buffer used for compound_type_rd(). + CompoundTypeRdBuffers comp_rd_buffer; + //! Buffer to store convolution during averaging process in compound mode. + CONV_BUF_TYPE *tmp_conv_dst; + + /*! \brief Temporary buffer to hold prediction. + * + * Points to a buffer that is used to hold temporary prediction results. This + * is used in two ways: + * - This is a temporary buffer used to ping-pong the prediction in + * handle_inter_mode. + * - xd->tmp_obmc_bufs also points to this buffer, and is used in ombc + * prediction. 
+ */ + uint8_t *tmp_pred_bufs[2]; + /**@}*/ + + /***************************************************************************** + * \name Rdopt Costs + ****************************************************************************/ + /**@{*/ + /*! \brief Quantization index for the current partition block. + * + * This is used to as the index to find quantization parameter for luma and + * chroma transformed coefficients. + */ + int qindex; + + /*! \brief Difference between frame-level qindex and current qindex. + * + * This is used to track whether a non-zero delta for qindex is used at least + * once in the current frame. + */ + int delta_qindex; + + /*! \brief Difference between frame-level qindex and qindex used to + * compute rdmult (lambda). + * + * rdmult_delta_qindex is assigned the same as delta_qindex before qp sweep. + * During qp sweep, delta_qindex is changed and used to calculate the actual + * quant params, while rdmult_delta_qindex remains the same, and is used to + * calculate the rdmult in "set_deltaq_rdmult". + */ + int rdmult_delta_qindex; + + /*! \brief Current qindex (before being adjusted by delta_q_res) used to + * derive rdmult_delta_qindex. + */ + int rdmult_cur_qindex; + + /*! \brief Rate-distortion multiplier. + * + * The rd multiplier used to determine the rate-distortion trade-off. This is + * roughly proportional to the inverse of q-index for a given frame, but this + * can be manipulated for better rate-control. For example, in tune_ssim + * mode, this is scaled by a factor related to the variance of the current + * block. + */ + int rdmult; + + //! Intra only, per sb rd adjustment. + int intra_sb_rdmult_modifier; + + //! Superblock level distortion propagation factor. + double rb; + + //! Energy in the current source coding block. Used to calculate \ref rdmult + int mb_energy; + //! Energy in the current source superblock. Used to calculate \ref rdmult + int sb_energy_level; + + //! The rate needed to signal a mode to the bitstream. + ModeCosts mode_costs; + + //! The rate needed to encode a new motion vector to the bitstream and some + //! multipliers for motion search. + MvCosts *mv_costs; + + /*! The rate needed to encode a new motion vector to the bitstream in intrabc + * mode. + */ + IntraBCMVCosts *dv_costs; + + //! The rate needed to signal the txfm coefficients to the bitstream. + CoeffCosts coeff_costs; + /**@}*/ + + /***************************************************************************** + * \name Rate to Distortion Multipliers + ****************************************************************************/ + /**@{*/ + //! A multiplier that converts mv cost to l2 error. + int errorperbit; + //! A multiplier that converts mv cost to l1 error. + int sadperbit; + /**@}*/ + + /****************************************************************************** + * \name Segmentation + *****************************************************************************/ + /**@{*/ + /*! \brief Skip mode for the segment + * + * A syntax element of the segmentation mode. In skip_block mode, all mvs are + * set 0 and all txfms are skipped. + */ + int seg_skip_block; + + /*! \brief Number of segment 1 blocks + * Actual number of (4x4) blocks that were applied delta-q, + * for segment 1. + */ + int actual_num_seg1_blocks; + + /*!\brief Number of segment 2 blocks + * Actual number of (4x4) blocks that were applied delta-q, + * for segment 2. 
+ */ + int actual_num_seg2_blocks; + + /*!\brief Number of zero motion vectors + */ + int cnt_zeromv; + + /*!\brief Flag to force zeromv-skip at superblock level, for nonrd path. + * + * 0/1 imply zeromv-skip is disabled/enabled. 2 implies that the blocks + * in the superblock may be marked as zeromv-skip at block level. + */ + int force_zeromv_skip_for_sb; + + /*!\brief Flag to force zeromv-skip at block level, for nonrd path. + */ + int force_zeromv_skip_for_blk; + + /*! \brief Previous segment id for which qmatrices were updated. + * This is used to bypass setting of qmatrices if no change in qindex. + */ + int prev_segment_id; + /**@}*/ + + /***************************************************************************** + * \name Superblock + ****************************************************************************/ + /**@{*/ + //! Information on a whole superblock level. + // TODO(chiyotsai@google.com): Refactor this out of macroblock + SuperBlockEnc sb_enc; + + /*! \brief Characteristics of the current superblock. + * + * Characteristics like whether the block has high sad, low sad, etc. This is + * only used by av1 realtime mode. + */ + CONTENT_STATE_SB content_state_sb; + /**@}*/ + + /***************************************************************************** + * \name Reference Frame Search + ****************************************************************************/ + /**@{*/ + /*! \brief Sum absolute distortion of the predicted mv for each ref frame. + * + * This is used to measure how viable a reference frame is. + */ + int pred_mv_sad[REF_FRAMES]; + /*! \brief The minimum of \ref pred_mv_sad. + * + * Index 0 stores the minimum \ref pred_mv_sad across past reference frames. + * Index 1 stores the minimum \ref pred_mv_sad across future reference frames. + */ + int best_pred_mv_sad[2]; + //! The sad of the 1st mv ref (nearest). + int pred_mv0_sad[REF_FRAMES]; + //! The sad of the 2nd mv ref (near). + int pred_mv1_sad[REF_FRAMES]; + + /*! \brief Disables certain ref frame pruning based on tpl. + * + * Determines whether a given ref frame is "good" based on data from the TPL + * model. If so, this stops selective_ref frame from pruning the given ref + * frame at block level. + */ + uint8_t tpl_keep_ref_frame[REF_FRAMES]; + + /*! \brief Warp motion samples buffer. + * + * Store the motion samples used for warp motion. + */ + WARP_SAMPLE_INFO warp_sample_info[REF_FRAMES]; + + /*! \brief Reference frames picked by the square subblocks in a superblock. + * + * Keeps track of ref frames that are selected by square partition blocks + * within a superblock, in MI resolution. They can be used to prune ref frames + * for rectangular blocks. + */ + int picked_ref_frames_mask[MAX_MIB_SIZE * MAX_MIB_SIZE]; + + /*! \brief Prune ref frames in real-time mode. + * + * Determines whether to prune reference frames in real-time mode. For the + * most part, this is the same as nonrd_prune_ref_frame_search in + * cpi->sf.rt_sf.nonrd_prune_ref_frame_search, but this can be selectively + * turned off if the only frame available is GOLDEN_FRAME. + */ + int nonrd_prune_ref_frame_search; + /**@}*/ + + /***************************************************************************** + * \name Partition Search + ****************************************************************************/ + /**@{*/ + //! Stores some partition-search related buffers. + PartitionSearchInfo part_search_info; + + /*! \brief Whether to disable some features to force a mode in current block. 
+   *
+   * In some cases, our speed features can be overly aggressive and remove all
+   * mode searches in the superblock. When this happens, we set
+   * must_find_valid_partition to 1 to reduce the number of speed features, and
+   * recode the superblock again.
+   */
+  int must_find_valid_partition;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Prediction Mode Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Inter skip mode.
+   *
+   * Skip mode tries to use the closest forward and backward references for
+   * inter prediction. Skip here means to skip transmitting the reference
+   * frames, not to be confused with skip_txfm.
+   */
+  int skip_mode;
+
+  /*! \brief Factors used for rd-thresholding.
+   *
+   * Determines an rd threshold used to decide whether to continue searching
+   * the current mode. If the current best rd is already <= threshold, then we
+   * skip the current mode.
+   */
+  int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+
+  /*! \brief Tracks the winner modes in the current coding block.
+   *
+   * Winner mode is a two-pass strategy to find the best prediction mode. In the
+   * first pass, we search the prediction modes with a limited set of txfm
+   * options, and keep the top modes. These modes are called the winner modes.
+   * In the second pass, we retry the winner modes with more thorough txfm
+   * options.
+   */
+  WinnerModeStats *winner_mode_stats;
+  //! Tracks how many winner modes there are.
+  int winner_mode_count;
+
+  /*! \brief The model used for rd-estimation to avoid txfm
+   *
+   * These are for inter_mode_rd_model_estimation, which is another two pass
+   * approach. In this speed feature, we collect data in the first couple frames
+   * to build an rd model to estimate the rdcost of a prediction model based on
+   * the residue error. Once enough data is collected, this speed feature uses
+   * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass to find the best transform for the mode.
+   * Determines if one would go with reduced complexity transform block
+   * search model to select prediction modes, or full complexity model
+   * to select transform kernel.
+   */
+  TXFM_RD_MODEL rd_model;
+
+  /*! \brief Stores the inter mode information needed to build an rd model.
+   *
+   * These are for inter_mode_rd_model_estimation, which is another two pass
+   * approach. In this speed feature, we collect data in the first couple frames
+   * to build an rd model to estimate the rdcost of a prediction model based on
+   * the residue error. Once enough data is collected, this speed feature uses
+   * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass to find the best transform for the mode.
+   */
+  // TODO(any): try to consolidate this speed feature with winner mode
+  // processing.
+  struct inter_modes_info *inter_modes_info;
+
+  //! How to blend the compound predictions.
+  uint8_t compound_idx;
+
+  //! A cache of results of compound type search so they can be reused later.
+  COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
+  //! The idx for the latest compound mode in the cache \ref comp_rd_stats.
+  int comp_rd_stats_idx;
+
+  /*! \brief Whether to recompute the luma prediction.
+   *
+   * In interpolation search, we can usually skip recalculating the luma
+   * prediction because it is already calculated by a previous predictor.
+   * This flag signifies that some modes might have been skipped, so we need to
+   * rebuild the prediction.
+   */
+  int recalc_luma_mc_data;
+
+  /*! \brief Data structure to speed up intrabc search.
+   *
+   * Contains the hash table, hash function, and buffer used for intrabc.
+   */
+  IntraBCHashInfo intrabc_hash_info;
+
+  /*! \brief Whether to reuse the mode stored in mb_mode_cache. */
+  int use_mb_mode_cache;
+  /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and
+   * \ref av1_rd_pick_inter_mode. */
+  const MB_MODE_INFO *mb_mode_cache;
+  /*! \brief Pointer to the buffer which caches gradient information.
+   *
+   * Pointer to the array of structures to store gradient information of each
+   * pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel level
+   * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+   */
+  PixelLevelGradientInfo *pixel_gradient_info;
+  /*! \brief Flags indicating the availability of cached gradient info. */
+  bool is_sb_gradient_cached[PLANE_TYPES];
+
+  /*! \brief Flag to reuse predicted samples of inter block. */
+  bool reuse_inter_pred;
+  /**@}*/
+
+  /*****************************************************************************
+   * \name MV Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Context used to determine the initial step size in motion search.
+   *
+   * This context is defined as the \f$l_\inf\f$ norm of the best ref_mvs for
+   * each frame.
+   */
+  unsigned int max_mv_context[REF_FRAMES];
+
+  /*! \brief Limit for the range of motion vectors.
+   *
+   * These define limits to motion vector components to prevent them from
+   * extending outside the UMV borders.
+   */
+  FullMvLimits mv_limits;
+
+  /*! \brief Buffer for storing the search site config.
+   *
+   * When resize mode or super resolution mode is on, the stride of the
+   * reference frame does not always match what's specified in \ref
+   * MotionVectorSearchParams::search_site_cfg. When this happens, we update
+   * search_site_cfg_buf here and use it for motion search.
+   */
+  search_site_config search_site_cfg_buf[NUM_DISTINCT_SEARCH_METHODS];
+  /**@}*/
+
+  /*****************************************************************************
+   * \name Txfm Search
+   ****************************************************************************/
+  /**@{*/
+  /*! \brief Parameters that control how txfm search is done.
+   *
+   * Stores various txfm search related parameters such as txfm_type, txfm_size,
+   * trellis eob search, etc.
+   */
+  TxfmSearchParams txfm_search_params;
+
+  /*! \brief Results of the txfm searches that have been done.
+   *
+   * Caches old txfm search results and keeps the current txfm decisions to
+   * facilitate rdopt.
+   */
+  TxfmSearchInfo txfm_search_info;
+
+  /*! \brief Whether there is strong color activity.
+   *
+   * Used in REALTIME coding mode to enhance the visual quality at the boundary
+   * of moving color objects.
+   */
+  uint8_t color_sensitivity_sb[MAX_MB_PLANE - 1];
+  //! Color sensitivity flag for the superblock for golden reference.
+  uint8_t color_sensitivity_sb_g[MAX_MB_PLANE - 1];
+  //! Color sensitivity flag for the superblock for altref reference.
+  uint8_t color_sensitivity_sb_alt[MAX_MB_PLANE - 1];
+  //! Color sensitivity flag for the coding block.
+  uint8_t color_sensitivity[MAX_MB_PLANE - 1];
+  //! Coding block distortion value for uv/color, minimum over the inter modes.
+  int64_t min_dist_inter_uv;
+
+  //!
The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane + // so we can keep dqcoeff of the best tx_type. + tran_low_t *dqcoeff_buf; + /**@}*/ + + /***************************************************************************** + * \name Misc + ****************************************************************************/ + /**@{*/ + //! Variance of the source frame. + unsigned int source_variance; + //! Flag to indicate coding block is zero sad. + int block_is_zero_sad; + //! Flag to indicate superblock ME in variance partition is determined to be + // good/reliable, and so the superblock MV will be tested in the + // nonrd_pickmode. This is only used for LAST_FRAME. + int sb_me_partition; + //! Flag to indicate to test the superblock MV for the coding block in the + // nonrd_pickmode. + int sb_me_block; + //! Motion vector from superblock MV derived from int_pro_motion() in + // the variance_partitioning. + int_mv sb_me_mv; + //! SSE of the current predictor. + unsigned int pred_sse[REF_FRAMES]; + //! Prediction for ML based partition. +#if CONFIG_RT_ML_PARTITIONING + DECLARE_ALIGNED(16, uint8_t, est_pred[128 * 128]); +#endif + /**@}*/ + + /*! \brief NONE partition evaluated for merge. + * + * In variance based partitioning scheme, NONE & SPLIT partitions are + * evaluated to check the SPLIT can be merged as NONE. This flag signifies the + * partition is evaluated in the scheme. + */ + int try_merge_partition; + + /*! \brief Pointer to buffer which caches sub-block variances in a superblock. + * + * Pointer to the array of structures to store source variance information of + * each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to + * store source variance and log of source variance of each 4x4 sub-block. + */ + Block4x4VarInfo *src_var_info_of_4x4_sub_blocks; +#ifndef NDEBUG + /*! \brief A hash to make sure av1_set_offsets is called */ + SetOffsetsLoc last_set_offsets_loc; +#endif // NDEBUG + +#if COLLECT_NONRD_PICK_MODE_STAT + mode_search_stat_nonrd ms_stat_nonrd; +#endif // COLLECT_NONRD_PICK_MODE_STAT + + /*!\brief Number of pixels in current thread that choose palette mode in the + * fast encoding stage for screen content tool detemination. + */ + int palette_pixels; + + /*!\brief Pointer to the structure which stores the statistics used by + * sb-level multi-pass encoding. + */ + struct SB_FIRST_PASS_STATS *sb_stats_cache; + + /*!\brief Pointer to the structure which stores the statistics used by + * first-pass when superblock is searched twice consecutively. + */ + struct SB_FIRST_PASS_STATS *sb_fp_stats; + +#if CONFIG_PARTITION_SEARCH_ORDER + /*!\brief Pointer to RD_STATS structure to be used in + * av1_rd_partition_search(). + */ + RD_STATS *rdcost; +#endif // CONFIG_PARTITION_SEARCH_ORDER +} MACROBLOCK; +#undef SINGLE_REF_MODES + +/*!\cond */ +// Zeroes out 'n_stats' elements in the array x->winner_mode_stats. +// It only zeroes out what is necessary in 'color_index_map' (just the block +// size, not the whole array). +static INLINE void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats, + WinnerModeStats *stats) { + // When winner mode stats are not required, the memory allocation is avoided + // for x->winner_mode_stats. The stats pointer will be NULL in such cases. 
+ if (stats == NULL) return; + + const int block_height = block_size_high[bsize]; + const int block_width = block_size_wide[bsize]; + for (int i = 0; i < n_stats; ++i) { + WinnerModeStats *const stat = &stats[i]; + memset(&stat->mbmi, 0, sizeof(stat->mbmi)); + memset(&stat->rd_cost, 0, sizeof(stat->rd_cost)); + memset(&stat->rd, 0, sizeof(stat->rd)); + memset(&stat->rate_y, 0, sizeof(stat->rate_y)); + memset(&stat->rate_uv, 0, sizeof(stat->rate_uv)); + // Do not reset the whole array as it is CPU intensive. + memset(&stat->color_index_map, 0, + block_width * block_height * sizeof(stat->color_index_map[0])); + memset(&stat->mode_index, 0, sizeof(stat->mode_index)); + } +} + +static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { + static const char LUT[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 1, // BLOCK_4X8 + 1, // BLOCK_8X4 + 0, // BLOCK_8X8 + 1, // BLOCK_8X16 + 1, // BLOCK_16X8 + 0, // BLOCK_16X16 + 1, // BLOCK_16X32 + 1, // BLOCK_32X16 + 0, // BLOCK_32X32 + 1, // BLOCK_32X64 + 1, // BLOCK_64X32 + 0, // BLOCK_64X64 + 0, // BLOCK_64X128 + 0, // BLOCK_128X64 + 0, // BLOCK_128X128 + 1, // BLOCK_4X16 + 1, // BLOCK_16X4 + 1, // BLOCK_8X32 + 1, // BLOCK_32X8 + 1, // BLOCK_16X64 + 1, // BLOCK_64X16 + }; + + return LUT[bsize]; +} + +static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + return is_rect_tx_allowed_bsize(mbmi->bsize) && + !xd->lossless[mbmi->segment_id]; +} + +static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { + TX_SIZE ctx_size = max_txsize_rect_lookup[bsize]; + int depth = 0; + while (tx_size != ctx_size) { + depth++; + ctx_size = sub_tx_size_map[ctx_size]; + assert(depth <= MAX_TX_DEPTH); + } + return depth; +} + +static INLINE void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx, + int skip) { + if (skip) + txb_skip[blk_idx] |= 1UL << plane; + else + txb_skip[blk_idx] &= ~(1UL << plane); +#ifndef NDEBUG + // Set chroma planes to uninitialized states when luma is set to check if + // it will be set later + if (plane == 0) { + txb_skip[blk_idx] |= 1UL << (1 + 4); + txb_skip[blk_idx] |= 1UL << (2 + 4); + } + + // Clear the initialization checking bit + txb_skip[blk_idx] &= ~(1UL << (plane + 4)); +#endif +} + +static INLINE int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) { +#ifndef NDEBUG + // Check if this is initialized + assert(!(txb_skip[blk_idx] & (1UL << (plane + 4)))); + + // The magic number is 0x77, this is to test if there is garbage data + assert((txb_skip[blk_idx] & 0x88) == 0); +#endif + return (txb_skip[blk_idx] >> plane) & 1; +} + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_BLOCK_H_ diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c new file mode 100644 index 0000000000..6ad2ddaf25 --- /dev/null +++ b/third_party/aom/av1/encoder/blockiness.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/av1_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/common.h" +#include "av1/common/filter.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +static int horizontal_filter(const uint8_t *s) { + return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; +} + +static int vertical_filter(const uint8_t *s, int p) { + return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6; +} + +static int variance(int sum, int sum_squared, int size) { + return sum_squared / size - (sum / size) * (sum / size); +} +// Calculate a blockiness level for a vertical block edge. +// This function returns a new blockiness metric that's defined as + +// p0 p1 p2 p3 +// q0 q1 q2 q3 +// block edge -> +// r0 r1 r2 r3 +// s0 s1 s2 s3 + +// blockiness = p0*-2+q0*6+r0*-6+s0*2 + +// p1*-2+q1*6+r1*-6+s1*2 + +// p2*-2+q2*6+r2*-6+s2*2 + +// p3*-2+q3*6+r3*-6+s3*2 ; + +// reconstructed_blockiness = abs(blockiness from reconstructed buffer - +// blockiness from source buffer,0) +// +// I make the assumption that flat blocks are much more visible than high +// contrast blocks. As such, I scale the result of the blockiness calc +// by dividing the blockiness by the variance of the pixels on either side +// of the edge as follows: +// var_0 = (q0^2+q1^2+q2^2+q3^2) - ((q0 + q1 + q2 + q3) / 4 )^2 +// var_1 = (r0^2+r1^2+r2^2+r3^2) - ((r0 + r1 + r2 + r3) / 4 )^2 +// The returned blockiness is the scaled value +// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ; +static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, s += sp, r += rp) { + s_blockiness += horizontal_filter(s); + r_blockiness += horizontal_filter(r); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-1]; + sum_sq_1 += s[-1] * s[-1]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// Calculate a blockiness level for a horizontal block edge +// same as above. +static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, ++s, ++r) { + s_blockiness += vertical_filter(s, sp); + r_blockiness += vertical_filter(r, rp); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-sp]; + sum_sq_1 += s[-sp] * s[-sp]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// This function returns the blockiness for the entire frame currently by +// looking at all borders in steps of 4. 
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+                          const unsigned char *img2, int img2_pitch, int width,
+                          int height) {
+  double blockiness = 0;
+  int i, j;
+  for (i = 0; i < height;
+       i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+    for (j = 0; j < width; j += 4) {
+      if (i > 0 && i < height && j > 0 && j < width) {
+        blockiness +=
+            blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+        blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j,
+                                            img2_pitch, 4);
+      }
+    }
+  }
+  blockiness /= width * height / 16;
+  return blockiness;
+}
diff --git a/third_party/aom/av1/encoder/cnn.c b/third_party/aom/av1/encoder/cnn.c
new file mode 100644
index 0000000000..598b362753
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
+#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
+
+typedef struct {
+  const float **input;
+  int in_width;
+  int in_height;
+  int in_stride;
+  const CNN_LAYER_CONFIG *layer_config;
+  float **output;
+  int out_stride;
+  int start_idx;
+  int th_step;
+} CONVOLVE_OPS;
+
+static INLINE float softsign(float x) { return x / (fabsf(x) + 1.0f); }
+
+static INLINE float relu(float x) { return (x < 0) ?
0 : x; } + +typedef struct { + int allocsize; + int channels; + int width, height, stride; + float *buf[CNN_MAX_CHANNELS]; +} TENSOR; + +static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); } + +static void free_tensor(TENSOR *tensor) { + if (tensor->allocsize) { + aom_free(tensor->buf[0]); + tensor->buf[0] = NULL; + tensor->allocsize = 0; + } +} + +static bool realloc_tensor(TENSOR *tensor, int channels, int width, + int height) { + const int newallocsize = channels * width * height; + if (tensor->allocsize < newallocsize) { + free_tensor(tensor); + tensor->buf[0] = + (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize); + if (!tensor->buf[0]) return false; + tensor->allocsize = newallocsize; + } + tensor->width = width; + tensor->height = height; + tensor->stride = width; + tensor->channels = channels; + for (int c = 1; c < channels; ++c) + tensor->buf[c] = &tensor->buf[0][c * width * height]; + return true; +} + +static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset, + TENSOR *dst) { + assert(src->width == dst->width); + assert(src->height == dst->height); + assert(copy_channels <= src->channels); + if (src->stride == dst->width && dst->stride == dst->width) { + for (int c = 0; c < copy_channels; ++c) { + memcpy(dst->buf[dst_offset + c], src->buf[c], + sizeof(*dst->buf[0]) * src->width * src->height); + } + } else { + for (int c = 0; c < copy_channels; ++c) { + for (int r = 0; r < dst->height; ++r) { + memcpy(&dst->buf[dst_offset + c][r * dst->stride], + &src->buf[c][r * src->stride], + dst->width * sizeof(*dst->buf[c])); + } + } + } +} + +static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS], + int channels, int width, int height, int stride) { + tensor->allocsize = 0; + tensor->channels = channels; + tensor->width = width; + tensor->height = height; + tensor->stride = stride; + if (buf) { + for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c]; + } else { + for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL; + } +} + +static void swap_tensor(TENSOR *t1, TENSOR *t2) { + TENSOR t = *t1; + *t1 = *t2; + *t2 = t; +} + +// The concatenated tensor goes into dst with first the channels in +// original dst followed by the channels in the src +static bool concat_tensor(const TENSOR *src, TENSOR *dst) { + assert(src->width == dst->width); + assert(src->height == dst->height); + + const int dst_channels = dst->channels; + const int channels = dst->channels + src->channels; + const int newallocsize = channels * dst->width * dst->height; + if (dst->allocsize < newallocsize) { + TENSOR t; + init_tensor(&t); + // allocate new buffers and copy first the dst channels + if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false; + copy_tensor(dst, dst->channels, 0, &t); + // Swap the tensors and free the old buffers + swap_tensor(dst, &t); + free_tensor(&t); + } + for (int c = 1; c < channels; ++c) + dst->buf[c] = &dst->buf[0][c * dst->width * dst->height]; + // Copy the channels in src after the first dst_channels channels. 
+ copy_tensor(src, src->channels, dst_channels, dst); + return true; +} + +int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { + return (t1->width == t2->width && t1->height == t2->height); +} + +int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { + return (t1->channels == t2->channels && t1->width == t2->width && + t1->height == t2->height); +} + +void av1_find_cnn_layer_output_size(int in_width, int in_height, + const CNN_LAYER_CONFIG *layer_config, + int *out_width, int *out_height) { + assert(layer_config->skip_width > 0); + assert(layer_config->skip_height > 0); + if (!layer_config->deconvolve) { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + case PADDING_SAME_REPLICATE: + *out_width = (in_width + layer_config->skip_width - 1) / + layer_config->skip_width; + *out_height = (in_height + layer_config->skip_height - 1) / + layer_config->skip_height; + break; + case PADDING_VALID: + *out_width = + (in_width - layer_config->filter_width + layer_config->skip_width) / + layer_config->skip_width; + *out_height = (in_height - layer_config->filter_height + + layer_config->skip_height) / + layer_config->skip_height; + break; + default: assert(0 && "Unknown padding type"); + } + } else { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + case PADDING_SAME_REPLICATE: + *out_width = in_width * layer_config->skip_width; + *out_height = in_height * layer_config->skip_height; + break; + case PADDING_VALID: + *out_width = (in_width - 1) * layer_config->skip_width + + layer_config->filter_width; + *out_height = (in_height - 1) * layer_config->skip_height + + layer_config->filter_height; + break; + default: assert(0 && "Unknown padding type"); + } + } +} + +void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, + int channels_per_branch[]) { + int branch = layer_config->branch; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + if (layer_config->branch_copy_type == BRANCH_INPUT) { + channels_per_branch[b] = layer_config->in_channels; + } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + channels_per_branch[b] = layer_config->out_channels; + } else if (layer_config->branch_copy_type == BRANCH_COMBINED) { + channels_per_branch[b] = layer_config->out_channels; + for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { + if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { + assert(channels_per_branch[c] > 0); + channels_per_branch[b] += channels_per_branch[c]; + } + } + } + } + } + channels_per_branch[branch] = layer_config->out_channels; + for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { + if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { + assert(channels_per_branch[c] > 0); + channels_per_branch[branch] += channels_per_branch[c]; + } + } +} + +#if CONFIG_DEBUG +static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) { + const int num_layers = cnn_config->num_layers; + const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config; + + for (int idx = 0; idx < num_layers; idx++) { + if (layer_configs[idx].output_num != -1) { + return 1; + } + } + return 0; +} +#endif + +void av1_find_cnn_output_size(int in_width, int in_height, + const CNN_CONFIG *cnn_config, int *out_width, + int *out_height, int *out_channels) { + int channels_per_branch[CNN_MAX_BRANCHES] = { 0 }; + int i_width[CNN_MAX_BRANCHES] = { 0 }; + int i_height[CNN_MAX_BRANCHES] = { 0 }; + i_width[0] = in_width + 
cnn_config->ext_width * 2; + i_height[0] = in_height + cnn_config->ext_height * 2; + +#if CONFIG_DEBUG + assert(cnn_has_at_least_one_output(cnn_config)); +#endif + + for (int i = 0; i < cnn_config->num_layers; ++i) { + const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i]; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + const int branch = layer_config->branch; + int o_width = 0, o_height = 0; + + if (layer_config->branch_copy_type == BRANCH_INPUT) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + assert(i_width[branch] > 0 && i_height[branch] > 0); + i_width[b] = i_width[branch]; + i_height[b] = i_height[branch]; + } + } + } + + av1_find_cnn_layer_output_size(i_width[branch], i_height[branch], + layer_config, &o_width, &o_height); + i_width[branch] = o_width; + i_height[branch] = o_height; + + if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + i_width[b] = o_width; + i_height[b] = o_height; + } + } + } + + find_cnn_out_channels(layer_config, channels_per_branch); + + const int output_num = layer_config->output_num; + if (output_num != -1) { // Current layer is an output layer + out_width[output_num] = o_width; + out_height[output_num] = o_height; + out_channels[output_num] = channels_per_branch[layer_config->branch]; + } + } +} + +static INLINE int get_start_shift_convolve(int width, int filt_width, + int stride) { + const int mod = (width % stride); + const int filt_off = (filt_width - 1) / 2; + const int dif = (mod ? mod - 1 : stride - 1); + return AOMMIN((dif + (filt_width % 2)) / 2, filt_off); +} + +void av1_cnn_add_c(float **output, int channels, int width, int height, + int stride, const float **add) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] += add[c][i * stride + j]; + } +} + +void av1_cnn_activate_c(float **output, int channels, int width, int height, + int stride, ACTIVATION layer_activation) { + if (layer_activation == RELU) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = relu(output[c][i * stride + j]); + } + } else if (layer_activation == SOFTSIGN) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = softsign(output[c][i * stride + j]); + } + } else if (layer_activation == SIGMOID) { + assert(0 && "Sigmoid has not been supported in CNN."); // TO DO + } else if (layer_activation != NONE) { + assert(0 && "Unknown activation type"); + } +} + +static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor, + const CNN_LAYER_CONFIG *layer_config, + int branch, TENSOR branch_output[]) { + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + // Copy layer's active tensor to output tensor of branch b if set in + // mask. The output becomes the input of the first layer of the branch + // because the layer of the branch is not the first layer. + int copy_channels = branch_config->channels_to_copy > 0 + ? 
branch_config->channels_to_copy + : layer_active_tensor->channels; + if (!realloc_tensor(&branch_output[b], copy_channels, + layer_active_tensor->width, + layer_active_tensor->height)) { + return false; + } + copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]); + } + } + return true; +} + +// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height +// greater than 1 and padding equal to PADDING_SAME_ZERO. +static void convolve_maxpool_padding_zero( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + const int cstep, const int filter_width_half, + const int filter_height_half) { + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) { + for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = hh + l - filter_height_half; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = ww + m - filter_width_half; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } +} + +// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height +// greater than 1 and padding equal to PADDING_SAME_REPLICATE. +static void convolve_maxpool_padding_replicate( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + const int cstep, const int filter_width_half, + const int filter_height_half) { + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) { + for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = + CLAMPINDEX(hh + l - filter_height_half, in_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = + CLAMPINDEX(ww + m - filter_width_half, in_width); + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } +} + +// CNNConvolve specific to maxpool set as 1, either skip_width or skip_height +// greater than 1 and padding equal to PADDING_VALID. 
+static void convolve_maxpool_padding_valid( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + const int cstep) { + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1; + h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1; + w += layer_config->skip_width, ++v) { + for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = hh + l; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = ww + m; + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } +} + +// CNNConvolve specific to maxpool set as 0 with filter_height and filter_width +// equal to 1. +static void convolve_element_wise(const float **input, int in_width, + int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, + float **output, int out_stride, int start_idx, + int step) { + const int start_h = get_start_shift_convolve( + in_height, layer_config->filter_height, layer_config->skip_height); + const int start_w = + get_start_shift_convolve(in_width, layer_config->filter_width, + layer_config->skip_width) + + start_idx * layer_config->skip_width; + const int out_w_step = AOMMAX(step, 1); + const int in_w_step = layer_config->skip_width * out_w_step; + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = start_h, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + const int in_h = h * in_stride; + const int out_h = u * out_stride + start_idx; + for (int w = start_w, out_index = out_h; w < in_width; + w += in_w_step, out_index += out_w_step) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + sum += layer_config->weights[k * layer_config->out_channels + i] * + input[k][in_h + w]; + } + output[i][out_index] = sum; + } + } + } +} + +// CNNConvolve specific to maxpool set as 0 and padding equal to +// PADDING_SAME_ZERO. +static void convolve_no_maxpool_padding_zero( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int filter_width_half, + const int filter_height_half, const int ii_shift, const int jj_shift, + const int channel_step) { + const int start_h = get_start_shift_convolve( + in_height, layer_config->filter_height, layer_config->skip_height); + const int start_w = get_start_shift_convolve( + in_width, layer_config->filter_width, layer_config->skip_width); + const int end_ii_shift = filter_height_half + 1; + const int end_jj_shift = filter_width_half + 1; + // *_filter_margin stores the number of pixels along a dimension in the + // intersection of the complement of the image in the extended image + // and the filter. 
+ const int top_filter_margin = layer_config->filter_width * ii_shift; + const int right_filter_margin = end_jj_shift - in_width; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + for (int h = start_h, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int top_cstep = + AOMMAX(0, top_filter_margin - h * layer_config->filter_width) * + cstep + + i; + const int start_ii = AOMMAX(0, h - ii_shift); + const int end_ii = AOMMIN(in_height, h + end_ii_shift); + for (int w = start_w, out_index = out_h; w < in_width; + w += layer_config->skip_width, ++out_index) { + const int left_cstep = AOMMAX(0, jj_shift - w) * cstep; + const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep; + const int start_jj = AOMMAX(0, w - jj_shift); + const int end_jj = AOMMIN(in_width, w + end_jj_shift); + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + top_cstep; + for (int ii = start_ii; ii < end_ii; ++ii) { + off += left_cstep; + for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) { + sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; + } + off += right_cstep; + } + } + output[i][out_index] = sum; + } + } + } +} + +// CNNConvolve specific to maxpool set as 0 and padding equal to +// PADDING_SAME_REPLICATE. +static void convolve_no_maxpool_padding_replicate( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int ii_shift, const int jj_shift, + const int channel_step) { + // h and w are shifted to an offset coordinate system to reduce in-loop + // computation. + const int start_h = + get_start_shift_convolve(in_height, layer_config->filter_height, + layer_config->skip_height) - + ii_shift; + const int start_w = + get_start_shift_convolve(in_width, layer_config->filter_width, + layer_config->skip_width) - + jj_shift; + const int end_h = in_height - ii_shift; + const int end_w = in_width - jj_shift; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + for (int h = start_h, u = 0; h < end_h; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int upper_ii_index = layer_config->filter_height + h; + for (int w = start_w, out_index = out_h; w < end_w; + w += layer_config->skip_width, ++out_index) { + const int upper_jj_index = layer_config->filter_width + w; + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int ii = h; ii < upper_ii_index; ++ii) { + const int clamped_ii = CLAMPINDEX(ii, in_height); + for (int jj = w; jj < upper_jj_index; ++jj) { + const int clamped_jj = CLAMPINDEX(jj, in_width); + assert(clamped_ii >= 0 && clamped_ii < in_height && + clamped_jj >= 0 && clamped_jj < in_width); + sum += layer_config->weights[off] * + input[k][clamped_ii * in_stride + clamped_jj]; + off += cstep; + } + } + } + output[i][out_index] = sum; + } + } + } +} + +// CNNConvolve specific to maxpool set as 0 and padding equal to +// PADDING_VALID. 
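+// start_idx and channel_step partition the output channels across workers:
+// a worker computes channels start_idx, start_idx + channel_step,
+// start_idx + 2 * channel_step, ... The single-threaded path uses
+// start_idx = 0 and channel_step = 1.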
+void av1_cnn_convolve_no_maxpool_padding_valid_c( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, + int start_idx, int cstep, int channel_step) { + assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) || + !layer_config->maxpool); + assert(layer_config->filter_height > 1 || layer_config->filter_width > 1); + assert(layer_config->pad == PADDING_VALID); + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int upper_ii_index = layer_config->filter_height + h; + for (int w = 0, out_index = out_h; + w < in_width - layer_config->filter_width + 1; + w += layer_config->skip_width, ++out_index) { + const int upper_jj_index = layer_config->filter_width + w; + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int ii = h; ii < upper_ii_index; ++ii) { + for (int jj = w; jj < upper_jj_index; ++jj) { + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * input[k][ii * in_stride + jj]; + off += cstep; + } + } + } + output[i][out_index] = sum; + } + } + } +} + +static void av1_cnn_convolve(const float **input, int in_width, int in_height, + int in_stride, + const CNN_LAYER_CONFIG *layer_config, + float **output, int out_stride, int start_idx, + int step) { + assert(!layer_config->deconvolve); + const int cstep = layer_config->in_channels * layer_config->out_channels; + const int filter_height_half = layer_config->filter_height >> 1; + const int filter_width_half = layer_config->filter_width >> 1; + const int channel_step = AOMMAX(step, 1); + + if (layer_config->maxpool && + (layer_config->skip_height > 1 || layer_config->skip_width > 1)) { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + convolve_maxpool_padding_zero(input, in_width, in_height, in_stride, + layer_config, output, out_stride, cstep, + filter_width_half, filter_height_half); + break; + case PADDING_SAME_REPLICATE: + convolve_maxpool_padding_replicate( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, cstep, filter_width_half, filter_height_half); + break; + case PADDING_VALID: + convolve_maxpool_padding_valid(input, in_width, in_height, in_stride, + layer_config, output, out_stride, cstep); + break; + default: assert(0 && "Unknown padding type"); + } + } else { + // Results in element-wise matrix multiplication. 
+ if (layer_config->filter_height == 1 && layer_config->filter_width == 1) { + convolve_element_wise(input, in_width, in_height, in_stride, layer_config, + output, out_stride, start_idx, step); + return; + } + const int ii_shift = + filter_height_half - (layer_config->filter_height - 1) % 2; + const int jj_shift = + filter_width_half - (layer_config->filter_width - 1) % 2; + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + convolve_no_maxpool_padding_zero( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, filter_width_half, filter_height_half, + ii_shift, jj_shift, channel_step); + break; + case PADDING_SAME_REPLICATE: + convolve_no_maxpool_padding_replicate( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step); + break; + case PADDING_VALID: + av1_cnn_convolve_no_maxpool_padding_valid( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, channel_step); + break; + default: assert(0 && "Unknown padding type"); + } + } +} + +static int convolve_layer(void *arg1, void *arg2) { + const CONVOLVE_OPS *convolve_ops = arg1; + (void)arg2; + av1_cnn_convolve( + convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height, + convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output, + convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step); + return 1; +} + +static void convolve_layer_mt(const float **input, int in_width, int in_height, + int in_stride, + const CNN_LAYER_CONFIG *layer_config, + const CNN_THREAD_DATA *thread_data, + float **output, int out_stride) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + const int num_workers = thread_data->num_workers; + assert(thread_data->workers); + + CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS]; + for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { + AVxWorker *const worker = &thread_data->workers[th]; + winterface->reset(worker); + + CONVOLVE_OPS convolve_op = { input, in_width, in_height, + in_stride, layer_config, output, + out_stride, th, num_workers }; + convolve_ops[th] = convolve_op; + worker->hook = convolve_layer; + worker->data1 = &(convolve_ops[th]); + worker->data2 = NULL; + + // Start convolving. + if (th == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait until all workers have finished. 
+ for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { + winterface->sync(&thread_data->workers[th]); + } +} + +static INLINE int get_start_shift_deconvolve(int filt_width, int stride) { + const int dif = AOMMAX(filt_width - stride, 0); + return dif / 2; +} + +void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, + int stride, const float *gamma, const float *beta, + const float *mean, const float *std) { + assert(gamma && beta && beta && std && "batchnorm has null parameter!"); + for (int ch = 0; ch < channels; ch++) { + const float ch_gamma = gamma[ch]; + const float ch_beta = beta[ch]; + const float ch_mean = mean[ch]; + const float ch_std = std[ch]; + float *image_row = image[ch]; + + for (int row = 0; row < height; row++) { + for (int col = 0; col < width; col++) { + image_row[col] = + ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta; + } + image_row += stride; + } + } +} + +void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_LAYER_CONFIG *layer_config, + float **output, int out_stride) { + assert(layer_config->deconvolve); + + const int cstep = layer_config->in_channels * layer_config->out_channels; + + int out_width = 0; + int out_height = 0; + av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width, + &out_height); + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = + u - l + + get_start_shift_deconvolve(layer_config->filter_height, + layer_config->skip_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = + v - m + + get_start_shift_deconvolve(layer_config->filter_width, + layer_config->skip_width); + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = h / layer_config->skip_height; + const int jj = w / layer_config->skip_width; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + case PADDING_SAME_REPLICATE: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = + u - l + + get_start_shift_deconvolve(layer_config->filter_height, + layer_config->skip_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = + v - m + + get_start_shift_deconvolve(layer_config->filter_width, + layer_config->skip_width); + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = + CLAMPINDEX(h / layer_config->skip_height, in_height); + const int jj = + CLAMPINDEX(w / layer_config->skip_width, in_width); + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = 
sum; + } + } + } + break; + case PADDING_VALID: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = u - l; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = v - m; + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = h / layer_config->skip_height; + const int jj = w / layer_config->skip_width; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + default: assert(0 && "Unknown padding type"); + } +} + +bool av1_cnn_predict_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output_struct) { + bool success = false; + TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } }; + TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } }; + + float **output[CNN_MAX_BRANCHES]; + const int *out_chs = output_struct->output_channels; + output[0] = output_struct->output_buffer; + for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) { + output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1]; + } + + int i_width = in_width; + int i_height = in_height; + int o_width = 0, o_height = 0; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + init_tensor(&tensor1[b]); + init_tensor(&tensor2[b]); + } + + const int *out_stride = output_struct->output_strides; + for (int layer = 0; layer < cnn_config->num_layers; ++layer) { + const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer]; + const int branch = layer_config->branch; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + + // Allocate input tensor + if (layer == 0) { // First layer + assert(branch == 0); // First layer must be primary branch + assign_tensor(&tensor1[branch], (float **)input, + layer_config->in_channels, in_width, in_height, in_stride); + } else { // Non-first layer + // Swap tensor1 and tensor2 + swap_tensor(&tensor1[branch], &tensor2[branch]); + + i_width = tensor1[branch].width; + i_height = tensor1[branch].height; + } + + // Allocate output tensor + av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width, + &o_height); + const int output_num = layer_config->output_num; + if (output_num == -1) { // Non-output layer + if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width, + o_height)) { + goto Error; + } + } else { // Output layer + free_tensor(&tensor2[branch]); + assign_tensor(&tensor2[branch], output[output_num], + layer_config->out_channels, o_width, o_height, + out_stride[output_num]); + } + + // If we are combining branches make sure that the branch to combine + // is different from the current branch. 
+ assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC, + !(branch_config->branches_to_combine & (1 << branch)))); + + if (layer_config->branch_copy_type == BRANCH_INPUT) { + if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config, + branch, tensor2)) { + goto Error; + } + } + // Check consistency of input and output channels + assert(tensor1[branch].channels == layer_config->in_channels); + assert(tensor2[branch].channels == layer_config->out_channels); + + // Convolve/Deconvolve + if (!cnn_config->layer_config[layer].deconvolve) { + if (thread_data->num_workers > 1) { + convolve_layer_mt((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, thread_data, + tensor2[branch].buf, tensor2[branch].stride); + } else { + av1_cnn_convolve((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, + tensor2[branch].buf, tensor2[branch].stride, 0, 1); + } + } else { + av1_cnn_deconvolve((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, + tensor2[branch].buf, tensor2[branch].stride); + } + + if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config, + branch, tensor2)) { + goto Error; + } + } + + // Add tensors from other branches if needed + if (layer_config->branch_combine_type == BRANCH_ADD) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch])); + av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, (const float **)tensor2[b].buf); + } + } + } + + // Non-linearity + av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, layer_config->activation); + + if (layer_config->bn_params.bn_gamma) { + av1_cnn_batchnorm( + tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width, + tensor2[branch].height, tensor2[branch].stride, + layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta, + layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std); + } + + // Concatenate tensors + if (layer_config->branch_combine_type == BRANCH_CAT) { + if (output_num == -1) { // Non-output layer + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + assert(tensor2[b].channels > 0); + if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error; + } + } + } else { // Output layer + const int existing_channels = tensor2[branch].channels; + int num_chs = existing_channels; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + // Needed only to assign the new channel buffers + num_chs += tensor2[b].channels; + } + } + assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width, + o_height, out_stride[output_num]); + + num_chs = existing_channels; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + // Needed 
only to assign the new channel buffers + copy_tensor(&tensor2[b], tensor2[b].channels, num_chs, + &tensor2[branch]); + num_chs += tensor2[b].channels; + } + } + } + } + + if (layer_config->branch_copy_type == BRANCH_COMBINED) { + if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config, + branch, tensor2)) { + goto Error; + } + } + } + + success = true; +Error: + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + free_tensor(&tensor1[b]); + free_tensor(&tensor2[b]); + } + return success; +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height, + int stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output) { + const float max_val = 255.0; + + const int in_width = width + 2 * cnn_config->ext_width; + const int in_height = height + 2 * cnn_config->ext_height; + const int in_channels = cnn_config->layer_config[0].in_channels; + float *inputs[CNN_MAX_CHANNELS]; + float *input_ = + (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); + if (!input_) return false; + const int in_stride = in_width; + + for (int c = 0; c < in_channels; ++c) { + inputs[c] = input_ + c * in_stride * in_height; + float *input = + inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; + + if (cnn_config->strict_bounds) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + // extend left and right + for (int i = 0; i < height; ++i) { + for (int j = -cnn_config->ext_width; j < 0; ++j) + input[i * in_stride + j] = input[i * in_stride]; + for (int j = width; j < width + cnn_config->ext_width; ++j) + input[i * in_stride + j] = input[i * in_stride + width - 1]; + } + // extend top and bottom + for (int i = -cnn_config->ext_height; i < 0; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[-cnn_config->ext_width], in_width * sizeof(*input)); + for (int i = height; i < height + cnn_config->ext_height; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[(height - 1) * in_stride - cnn_config->ext_width], + in_width * sizeof(*input)); + } else { + for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; + ++i) + for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; + ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + } + } + bool success = av1_cnn_predict((const float **)inputs, in_width, in_height, + in_stride, cnn_config, thread_data, output); + + aom_free(input_); + return success; +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height, + int stride, + const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + int bit_depth, + CNN_MULTI_OUT *output) { + const float max_val = (float)((1 << bit_depth) - 1); + + const int in_width = width + 2 * cnn_config->ext_width; + const int in_height = height + 2 * cnn_config->ext_height; + const int in_channels = cnn_config->layer_config[0].in_channels; + float *inputs[CNN_MAX_CHANNELS]; + float *input_ = + (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); + if (!input_) return false; + const int in_stride = in_width; + + for (int c = 0; c < in_channels; ++c) { + inputs[c] = input_ + c * in_stride * 
in_height;
+    float *input =
+        inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+    if (cnn_config->strict_bounds) {
+      for (int i = 0; i < height; ++i)
+        for (int j = 0; j < width; ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+      // extend left and right
+      for (int i = 0; i < height; ++i) {
+        for (int j = -cnn_config->ext_width; j < 0; ++j)
+          input[i * in_stride + j] = input[i * in_stride];
+        for (int j = width; j < width + cnn_config->ext_width; ++j)
+          input[i * in_stride + j] = input[i * in_stride + width - 1];
+      }
+      // extend top and bottom
+      for (int i = -cnn_config->ext_height; i < 0; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[-cnn_config->ext_width], in_width * sizeof(*input));
+      for (int i = height; i < height + cnn_config->ext_height; ++i)
+        memcpy(&input[i * in_stride - cnn_config->ext_width],
+               &input[(height - 1) * in_stride - cnn_config->ext_width],
+               in_width * sizeof(*input));
+    } else {
+      for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+           ++i)
+        for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+             ++j)
+          input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+    }
+  }
+
+  bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+                                 in_stride, cnn_config, thread_data, output);
+
+  aom_free(input_);
+  return success;
+}
diff --git a/third_party/aom/av1/encoder/cnn.h b/third_party/aom/av1/encoder/cnn.h
new file mode 100644
index 0000000000..df6401f73f
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CNN_H_
+#define AOM_AV1_ENCODER_CNN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_util/aom_thread.h"
+#include "config/av1_rtcd.h"
+
+struct AV1Common;
+
+#define CNN_MAX_HIDDEN_LAYERS 64
+#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)
+#define CNN_MAX_CHANNELS 256
+#define CNN_MAX_BRANCHES 4
+#define CNN_MAX_THREADS 32
+
+#define NO_BRANCH_CONFIG \
+  { 0, 0, 0 }
+#define NO_BN_PARAMS \
+  { NULL, NULL, NULL, NULL }
+
+enum {
+  PADDING_SAME_ZERO,       // tensorflow's SAME padding with pixels outside
+                           // the image area assumed to be 0 (default)
+  PADDING_SAME_REPLICATE,  // tensorflow's SAME padding with pixels outside
+                           // the image area replicated from closest edge
+  PADDING_VALID            // tensorflow's VALID padding
+} UENUM1BYTE(PADDING_TYPE);
+
+// enum { NONE, RELU, SOFTSIGN } UENUM1BYTE(ACTIVATION);
+
+// Times when the input tensor may be copied to branches given in input_to_branches.
+// BRANCH_NO_COPY: doesn't copy any tensor.
+// BRANCH_INPUT: copies the input tensor to branches.
+// BRANCH_OUTPUT: copies the convolved tensor to branches.
+// BRANCH_COMBINED: copies the combined (after convolving and branch combining)
+// tensor. If no combinations happen at this layer, then this option
+// has the same effect as BRANCH_OUTPUT.
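+// For example, a layer on branch 0 with branch_copy_type == BRANCH_INPUT and
+// branch_config.input_to_branches == 0x2 makes its input tensor available as
+// the starting tensor of branch 1.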
+enum {
+  BRANCH_NO_COPY,
+  BRANCH_INPUT,
+  BRANCH_OUTPUT,
+  BRANCH_COMBINED
+} UENUM1BYTE(BRANCH_COPY);
+
+// Types of combining branches with output of current layer:
+// BRANCH_NOC: no branch combining
+// BRANCH_ADD: Add previously stored branch tensor to output of layer
+// BRANCH_CAT: Concatenate branch tensor to output of layer
+enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE);
+
+// The parameters used to scale each channel in batch
+// normalization. The processing is done on a per-channel basis.
+// e.g. bn_mean[c] is the mean for all pixels in channel c. This
+// is always applied after activation. The output is given by
+// out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c] where
+// norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c]
+// here we assume that the effect of variance_epsilon is already
+// taken into account when bn_std is calculated. The pointers
+// need to be either all zero or all valid. If all zero, then
+// batchnorm is disabled, else batchnorm is applied.
+struct CNN_BATCHNORM_PARAMS {
+  const float *bn_gamma;
+  const float *bn_beta;
+  const float *bn_mean;
+  const float *bn_std;
+};
+
+struct CNN_BRANCH_CONFIG {
+  int input_to_branches;    // If nonzero, copy the active tensor of the current
+                            // layer and store it for future use in the branches
+                            // specified in the field as a binary mask. For
+                            // example, if input_to_branches == 0x06, the
+                            // input tensor to the current branch is copied to
+                            // branches 1 and 2 (where 0 represents the primary
+                            // branch). One restriction is that the mask
+                            // cannot indicate copying to the current branch.
+  int channels_to_copy;     // Number of channels to copy to the branches
+                            // given in input_to_branches. If greater than 0,
+                            // only this many channels are copied; otherwise
+                            // all channels of the active tensor are copied.
+  int branches_to_combine;  // mask of branches to combine with output of
+                            // current layer, if
+                            // branch_combine_type != BRANCH_NOC.
+                            // For example, if branches_to_combine = 0x0A,
+                            // it means that branches 1 and 3 are combined
+                            // with the current branch.
+};
+
+struct CNN_LAYER_CONFIG {
+  int in_channels;
+  int filter_width;
+  int filter_height;
+  int out_channels;
+  int skip_width;
+  int skip_height;
+  int maxpool;            // whether to use maxpool or not (only effective when
+                          // skip_width or skip_height are > 1)
+  const float *weights;   // array of length filter_height x filter_width x
+                          // in_channels x out_channels where the inner-most
+                          // scan is out_channels and the outer-most scan is
+                          // filter_height.
+  const float *bias;      // array of length out_channels
+  PADDING_TYPE pad;       // padding type
+  ACTIVATION activation;  // the activation function to use after convolution
+  int deconvolve;         // whether this is a deconvolution layer.
+                          // 0: If skip_width or skip_height are > 1, then we
+                          // reduce resolution
+                          // 1: If skip_width or skip_height are > 1, then we
+                          // increase resolution
+  int branch;             // branch index in [0, CNN_MAX_BRANCHES - 1], where
+                          // 0 refers to the primary branch.
+  BRANCH_COPY branch_copy_type;
+  BRANCH_COMBINE branch_combine_type;
+  struct CNN_BRANCH_CONFIG branch_config;
+  struct CNN_BATCHNORM_PARAMS
+      bn_params;   // A struct that contains the parameters
+                   // used for batch normalization.
+  int output_num;  // The output buffer idx to which the layer output is
+                   // written. Set to -1 to disable writing it to the output. In
+                   // the case that branch_combine_type is BRANCH_CAT, all
+                   // concatenated channels will be written to output.
In the + // case of BRANCH_ADD, the output will be the result of + // summation. +}; + +struct CNN_CONFIG { + int num_layers; // number of CNN layers ( = number of hidden layers + 1) + int is_residue; // whether the output activation is a residue + int ext_width, ext_height; // extension horizontally and vertically + int strict_bounds; // whether the input bounds are strict or not. + // If strict, the extension area is filled by + // replication; if not strict, image data is + // assumed available beyond the bounds. + CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS]; +}; + +struct CNN_THREAD_DATA { + int num_workers; + AVxWorker *workers; +}; + +struct CNN_MULTI_OUT { + int num_outputs; + const int *output_channels; + const int *output_strides; + float **output_buffer; +}; + +// Function to return size of output +void av1_find_cnn_output_size(int in_width, int in_height, + const CNN_CONFIG *cnn_config, int *out_width, + int *out_height, int *out_channels); + +// Function to return output width and output height of given layer. +void av1_find_cnn_layer_output_size(int in_width, int in_height, + const CNN_LAYER_CONFIG *layer_config, + int *out_width, int *out_height); + +// Prediction functions from set of input image buffers. This function supports +// CNN with multiple outputs. +bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height, + int stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + struct CNN_MULTI_OUT *output); +bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height, + int stride, + const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + int bit_depth, CNN_MULTI_OUT *output); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_CNN_H_ diff --git a/third_party/aom/av1/encoder/compound_type.c b/third_party/aom/av1/encoder/compound_type.c new file mode 100644 index 0000000000..3b0ee88241 --- /dev/null +++ b/third_party/aom/av1/encoder/compound_type.c @@ -0,0 +1,1678 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/common/pred_common.h" +#include "av1/encoder/compound_type.h" +#include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tx_search.h" + +typedef int64_t (*pick_interinter_mask_type)( + const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, + const uint8_t *const p0, const uint8_t *const p1, + const int16_t *const residual1, const int16_t *const diff10, + uint64_t *best_sse); + +// Checks if characteristics of search match +static INLINE int is_comp_rd_match(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const COMP_RD_STATS *st, + const MB_MODE_INFO *const mi, + int32_t *comp_rate, int64_t *comp_dist, + int32_t *comp_model_rate, + int64_t *comp_model_dist, int *comp_rs2) { + // TODO(ranjit): Ensure that compound type search use regular filter always + // and check if following check can be removed + // Check if interp filter matches with previous case + if (st->filter.as_int != mi->interp_filters.as_int) return 0; + + const MACROBLOCKD *const xd = &x->e_mbd; + // Match MV and reference indices + for (int i = 0; i < 2; ++i) { + if ((st->ref_frames[i] != mi->ref_frame[i]) || + (st->mv[i].as_int != mi->mv[i].as_int)) { + return 0; + } + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]]; + if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0; + } + + int reuse_data[COMPOUND_TYPES] = { 1, 1, 0, 0 }; + // For compound wedge, reuse data if newmv search is disabled when NEWMV is + // present or if NEWMV is not present in either of the directions + if ((!have_newmv_in_inter_mode(mi->mode) && + !have_newmv_in_inter_mode(st->mode)) || + (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) + reuse_data[COMPOUND_WEDGE] = 1; + // For compound diffwtd, reuse data if fast search is enabled (no newmv search + // when NEWMV is present) or if NEWMV is not present in either of the + // directions + if (cpi->sf.inter_sf.enable_fast_compound_mode_search || + (!have_newmv_in_inter_mode(mi->mode) && + !have_newmv_in_inter_mode(st->mode))) + reuse_data[COMPOUND_DIFFWTD] = 1; + + // Store the stats for the different compound types + for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_TYPES; + comp_type++) { + if (reuse_data[comp_type]) { + comp_rate[comp_type] = st->rate[comp_type]; + comp_dist[comp_type] = st->dist[comp_type]; + comp_model_rate[comp_type] = st->model_rate[comp_type]; + comp_model_dist[comp_type] = st->model_dist[comp_type]; + comp_rs2[comp_type] = st->comp_rs2[comp_type]; + } + } + return 1; +} + +// Checks if similar compound type search case is accounted earlier +// If found, returns relevant rd data +static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi, + const MACROBLOCK *x, + const MB_MODE_INFO *const mbmi, + int32_t *comp_rate, int64_t *comp_dist, + int32_t *comp_model_rate, + int64_t *comp_model_dist, int *comp_rs2, + int *match_index) { + for (int j = 0; j < x->comp_rd_stats_idx; ++j) { + if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate, + comp_dist, comp_model_rate, comp_model_dist, + comp_rs2)) { + *match_index = j; + return 1; + } + } + return 0; // no match result found +} + +static INLINE bool enable_wedge_search( + MACROBLOCK *const x, const unsigned int disable_wedge_var_thresh) { + // Enable wedge search if source variance and edge strength are above + // the thresholds. 
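+  // (Only the source variance threshold is applied below.)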
+ return x->source_variance > disable_wedge_var_thresh; +} + +static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x, + const AV1_COMP *const cpi) { + return enable_wedge_search( + x, cpi->sf.inter_sf.disable_interinter_wedge_var_thresh) && + cpi->oxcf.comp_type_cfg.enable_interinter_wedge; +} + +static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x, + const AV1_COMP *const cpi) { + return enable_wedge_search( + x, cpi->sf.inter_sf.disable_interintra_wedge_var_thresh) && + cpi->oxcf.comp_type_cfg.enable_interintra_wedge; +} + +static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, + const BLOCK_SIZE bsize, const uint8_t *pred0, + int stride0, const uint8_t *pred1, + int stride1) { + static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = { + // 4X4 + BLOCK_INVALID, + // 4X8, 8X4, 8X8 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, + // 8X16, 16X8, 16X16 + BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, + // 16X32, 32X16, 32X32 + BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, + // 32X64, 64X32, 64X64 + BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, + // 64x128, 128x64, 128x128 + BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, + // 4X16, 16X4, 8X32 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, + // 32X8, 16X64, 64X16 + BLOCK_16X4, BLOCK_8X32, BLOCK_32X8 + }; + const struct macroblock_plane *const p = &x->plane[0]; + const uint8_t *src = p->src.buf; + int src_stride = p->src.stride; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int bw_by2 = bw >> 1; + const int bh_by2 = bh >> 1; + uint32_t esq[2][2]; + int64_t tl, br; + + const BLOCK_SIZE f_index = split_qtr[bsize]; + assert(f_index != BLOCK_INVALID); + + if (is_cur_buf_hbd(&x->e_mbd)) { + pred0 = CONVERT_TO_BYTEPTR(pred0); + pred1 = CONVERT_TO_BYTEPTR(pred1); + } + + // Residual variance computation over relevant quandrants in order to + // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1), + // BR = sum(2nd,3rd,4th) quadrants of (pred1 - pred0) + // The 2nd and 3rd quadrants cancel out in TL + BR + // Hence TL + BR = 1st quadrant of (pred0-pred1) + 4th of (pred1-pred0) + // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants) + // for all codebooks; experiment with other quadrant combinations for + // 0, 90 and 135 degrees also. 
+ cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); + cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, + pred0 + bh_by2 * stride0 + bw_by2, stride0, + &esq[0][1]); + cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]); + cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, + pred1 + bh_by2 * stride1 + bw_by2, stride0, + &esq[1][1]); + + tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]); + br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]); + return (tl + br > 0); +} + +// Choose the best wedge index and sign +static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, + const BLOCK_SIZE bsize, const uint8_t *const p0, + const int16_t *const residual1, + const int16_t *const diff10, + int8_t *const best_wedge_sign, + int8_t *const best_wedge_index, uint64_t *best_sse) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = bw * bh; + assert(N >= 64); + int rate; + int64_t dist; + int64_t rd, best_rd = INT64_MAX; + int8_t wedge_index; + int8_t wedge_sign; + const int8_t wedge_types = get_wedge_types_lookup(bsize); + const uint8_t *mask; + uint64_t sse; + const int hbd = is_cur_buf_hbd(xd); + const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; + + DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0 +#if CONFIG_AV1_HIGHBITDEPTH + if (hbd) { + aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p0), bw); + } else { + aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw); + } +#else + (void)hbd; + aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw); +#endif + + int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) - + (int64_t)aom_sum_squares_i16(residual1, N)) * + (1 << WEDGE_WEIGHT_BITS) / 2; + int16_t *ds = residual0; + + av1_wedge_compute_delta_squares(ds, residual0, residual1, N); + + for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize); + + wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit); + + mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + // int rate2; + // int64_t dist2; + // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2); + // printf("sse %"PRId64": leagacy: %d %"PRId64", curvfit %d %"PRId64"\n", + // sse, rate, dist, rate2, dist2); dist = dist2; + // rate = rate2; + + rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index]; + rd = RDCOST(x->rdmult, rate, dist); + + if (rd < best_rd) { + *best_wedge_index = wedge_index; + *best_wedge_sign = wedge_sign; + best_rd = rd; + *best_sse = sse; + } + } + + return best_rd - + RDCOST(x->rdmult, + x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0); +} + +// Choose the best wedge index the specified sign +static int64_t pick_wedge_fixed_sign( + const AV1_COMP *const cpi, const MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int16_t *const residual1, + const int16_t *const diff10, const int8_t wedge_sign, + int8_t *const best_wedge_index, uint64_t *best_sse) { + const MACROBLOCKD *const xd = &x->e_mbd; + + const int bw = block_size_wide[bsize]; + 
const int bh = block_size_high[bsize]; + const int N = bw * bh; + assert(N >= 64); + int rate; + int64_t dist; + int64_t rd, best_rd = INT64_MAX; + int8_t wedge_index; + const int8_t wedge_types = get_wedge_types_lookup(bsize); + const uint8_t *mask; + uint64_t sse; + const int hbd = is_cur_buf_hbd(xd); + const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; + for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index]; + rd = RDCOST(x->rdmult, rate, dist); + + if (rd < best_rd) { + *best_wedge_index = wedge_index; + best_rd = rd; + *best_sse = sse; + } + } + return best_rd - + RDCOST(x->rdmult, + x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0); +} + +static int64_t pick_interinter_wedge( + const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, const uint8_t *const p1, + const int16_t *const residual1, const int16_t *const diff10, + uint64_t *best_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int bw = block_size_wide[bsize]; + + int64_t rd; + int8_t wedge_index = -1; + int8_t wedge_sign = 0; + + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + assert(cpi->common.seq_params->enable_masked_compound); + + if (cpi->sf.inter_sf.fast_wedge_sign_estimate) { + wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); + rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign, + &wedge_index, best_sse); + } else { + rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign, + &wedge_index, best_sse); + } + + mbmi->interinter_comp.wedge_sign = wedge_sign; + mbmi->interinter_comp.wedge_index = wedge_index; + return rd; +} + +static int64_t pick_interinter_seg(const AV1_COMP *const cpi, + MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1, + const int16_t *const residual1, + const int16_t *const diff10, + uint64_t *best_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = 1 << num_pels_log2_lookup[bsize]; + int rate; + int64_t dist; + DIFFWTD_MASK_TYPE cur_mask_type; + int64_t best_rd = INT64_MAX; + DIFFWTD_MASK_TYPE best_mask_type = 0; + const int hbd = is_cur_buf_hbd(xd); + const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; + DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); + uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask }; + // try each mask type and its inverse + for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { + // build mask and inverse +#if CONFIG_AV1_HIGHBITDEPTH + if (hbd) + av1_build_compound_diffwtd_mask_highbd( + tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, + CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); + else + av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, + p0, bw, p1, bw, bh, bw); +#else + (void)hbd; + av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, p0, + bw, p1, bw, bh, bw); +#endif // CONFIG_AV1_HIGHBITDEPTH + + // compute rd for mask + uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, + tmp_mask[cur_mask_type], N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + const int64_t rd0 = RDCOST(x->rdmult, rate, dist); + + if (rd0 < best_rd) { + best_mask_type = cur_mask_type; + best_rd = rd0; + *best_sse = sse; + } + } + mbmi->interinter_comp.mask_type = best_mask_type; + if (best_mask_type == DIFFWTD_38_INV) { + memcpy(xd->seg_mask, seg_mask, N * 2); + } + return best_rd; +} + +static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1) { + const MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(av1_is_wedge_used(bsize)); + assert(cpi->common.seq_params->enable_interintra_compound); + + const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 + DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p1), bw); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw, + CONVERT_TO_BYTEPTR(p0), bw); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); + } +#else + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); +#endif + int8_t wedge_index = -1; + uint64_t sse; + int64_t rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, + &wedge_index, &sse); + + mbmi->interintra_wedge_index = wedge_index; + return rd; +} + +static AOM_INLINE void get_inter_predictors_masked_compound( + MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1, + int16_t *residual1, int16_t *diff10, int *strides) { + MACROBLOCKD *xd = &x->e_mbd; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0, + strides); + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1, + strides); + const struct buf_2d *const src = &x->plane[0].src; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(*preds1), bw); + aom_highbd_subtract_block(bh, bw, diff10, bw, 
CONVERT_TO_BYTEPTR(*preds1), + bw, CONVERT_TO_BYTEPTR(*preds0), bw); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, + bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); + } +#else + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); +#endif +} + +// Computes the rd cost for the given interintra mode and updates the best +static INLINE void compute_best_interintra_mode( + const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, + MACROBLOCK *const x, const int *const interintra_mode_cost, + const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf, + INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd, + INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + int rate; + uint8_t skip_txfm_sb; + int64_t dist, skip_sse_sb; + const int bw = block_size_wide[bsize]; + mbmi->interintra_mode = interintra_mode; + int rmode = interintra_mode_cost[interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist, + &skip_txfm_sb, &skip_sse_sb, NULL, + NULL, NULL); + int64_t rd = RDCOST(x->rdmult, rate + rmode, dist); + if (rd < *best_interintra_rd) { + *best_interintra_rd = rd; + *best_interintra_mode = mbmi->interintra_mode; + } +} + +static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, + MACROBLOCK *x, int64_t ref_best_rd, + RD_STATS *rd_stats) { + MACROBLOCKD *const xd = &x->e_mbd; + if (ref_best_rd < 0) return INT64_MAX; + av1_subtract_plane(x, bs, 0); + const int64_t rd = av1_estimate_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, + max_txsize_rect_lookup[bs]); + if (rd != INT64_MAX) { + const int skip_ctx = av1_get_skip_txfm_context(xd); + if (rd_stats->skip_txfm) { + const int s1 = x->mode_costs.skip_txfm_cost[skip_ctx][1]; + rd_stats->rate = s1; + } else { + const int s0 = x->mode_costs.skip_txfm_cost[skip_ctx][0]; + rd_stats->rate += s0; + } + } + return rd; +} + +// Computes the rd_threshold for smooth interintra rd search. 
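+// The threshold is derived from ref_best_rd and then reduced by the rate-only
+// RD cost of the mode signalling bits, so that estimate_yrd_for_sb() can
+// terminate early once the bound is exceeded.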
+static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x, + int total_mode_rate, + int64_t ref_best_rd) { + const int64_t rd_thresh = get_rd_thresh_from_best_rd( + ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT), + INTER_INTRA_RD_THRESH_SCALE); + const int64_t mode_rd = RDCOST(x->rdmult, total_mode_rate, 0); + return (rd_thresh - mode_rd); +} + +// Computes the best wedge interintra mode +static AOM_INLINE int64_t compute_best_wedge_interintra( + const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, + MACROBLOCK *const x, const int *const interintra_mode_cost, + const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_, + int *best_mode, int *best_wedge_index, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int bw = block_size_wide[bsize]; + int64_t best_interintra_rd_wedge = INT64_MAX; + int64_t best_total_rd = INT64_MAX; + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) { + mbmi->interintra_mode = mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + const int rate_overhead = + interintra_mode_cost[mode] + + x->mode_costs.wedge_idx_cost[bsize][mbmi->interintra_wedge_index]; + const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0); + if (total_rd < best_total_rd) { + best_total_rd = total_rd; + best_interintra_rd_wedge = rd; + *best_mode = mbmi->interintra_mode; + *best_wedge_index = mbmi->interintra_wedge_index; + } + } + return best_interintra_rd_wedge; +} + +static int handle_smooth_inter_intra_mode( + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + MB_MODE_INFO *mbmi, int64_t ref_best_rd, int *rate_mv, + INTERINTRA_MODE *best_interintra_mode, int64_t *best_rd, + int *best_mode_rate, const BUFFER_SET *orig_dst, uint8_t *tmp_buf, + uint8_t *intrapred, HandleInterModeArgs *args) { + MACROBLOCKD *xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + const int *const interintra_mode_cost = + mode_costs->interintra_mode_cost[size_group_lookup[bsize]]; + const AV1_COMMON *const cm = &cpi->common; + const int bw = block_size_wide[bsize]; + + mbmi->use_wedge_interintra = 0; + + if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 || + *best_interintra_mode == INTERINTRA_MODES) { + int64_t best_interintra_rd = INT64_MAX; + for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; + ++cur_mode) { + if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + cur_mode == II_SMOOTH_PRED) + continue; + compute_best_interintra_mode( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, tmp_buf, + best_interintra_mode, &best_interintra_rd, cur_mode, bsize); + } + args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode; + } + assert(IMPLIES(!cpi->oxcf.comp_type_cfg.enable_smooth_interintra, + *best_interintra_mode != II_SMOOTH_PRED)); + // Recompute prediction if required + bool interintra_mode_reuse = cpi->sf.inter_sf.reuse_inter_intra_mode || + *best_interintra_mode != INTERINTRA_MODES; + if (interintra_mode_reuse || *best_interintra_mode != INTERINTRA_MODES - 1) { + mbmi->interintra_mode = *best_interintra_mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + + // Compute rd cost for best smooth_interintra + RD_STATS rd_stats; + const int 
is_wedge_used = av1_is_wedge_used(bsize); + const int rmode = + interintra_mode_cost[*best_interintra_mode] + + (is_wedge_used ? mode_costs->wedge_interintra_cost[bsize][0] : 0); + const int total_mode_rate = rmode + *rate_mv; + const int64_t rd_thresh = compute_rd_thresh(x, total_mode_rate, ref_best_rd); + int64_t rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist); + } else { + return IGNORE_MODE; + } + *best_rd = rd; + *best_mode_rate = rmode; + // Return early if best rd not good enough + if (ref_best_rd < INT64_MAX && + (*best_rd >> INTER_INTRA_RD_THRESH_SHIFT) * INTER_INTRA_RD_THRESH_SCALE > + ref_best_rd) { + return IGNORE_MODE; + } + return 0; +} + +static int handle_wedge_inter_intra_mode( + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + MB_MODE_INFO *mbmi, int *rate_mv, INTERINTRA_MODE *best_interintra_mode, + int64_t *best_rd, const BUFFER_SET *orig_dst, uint8_t *tmp_buf_, + uint8_t *tmp_buf, uint8_t *intrapred_, uint8_t *intrapred, + HandleInterModeArgs *args, int *tmp_rate_mv, int *rate_overhead, + int_mv *tmp_mv, int64_t best_rd_no_wedge) { + MACROBLOCKD *xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + const int *const interintra_mode_cost = + mode_costs->interintra_mode_cost[size_group_lookup[bsize]]; + const AV1_COMMON *const cm = &cpi->common; + const int bw = block_size_wide[bsize]; + const int try_smooth_interintra = + cpi->oxcf.comp_type_cfg.enable_smooth_interintra; + + mbmi->use_wedge_interintra = 1; + + if (!cpi->sf.inter_sf.fast_interintra_wedge_search) { + // Exhaustive search of all wedge and mode combinations. + int best_mode = 0; + int best_wedge_index = 0; + *best_rd = compute_best_wedge_interintra( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, tmp_buf_, + &best_mode, &best_wedge_index, bsize); + mbmi->interintra_mode = best_mode; + mbmi->interintra_wedge_index = best_wedge_index; + if (best_mode != INTERINTRA_MODES - 1) { + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + } + } else if (!try_smooth_interintra) { + if (*best_interintra_mode == INTERINTRA_MODES) { + mbmi->interintra_mode = INTERINTRA_MODES - 1; + *best_interintra_mode = INTERINTRA_MODES - 1; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + // Pick wedge mask based on INTERINTRA_MODES - 1 + *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + // Find the best interintra mode for the chosen wedge mask + for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; + ++cur_mode) { + compute_best_interintra_mode( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, + tmp_buf, best_interintra_mode, best_rd, cur_mode, bsize); + } + args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode; + mbmi->interintra_mode = *best_interintra_mode; + + // Recompute prediction if required + if (*best_interintra_mode != INTERINTRA_MODES - 1) { + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + } + } else { + // Pick wedge mask for the best interintra mode (reused) + mbmi->interintra_mode = *best_interintra_mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + } else { + // Pick wedge mask for the best interintra mode from smooth_interintra + *best_rd = 
pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + + *rate_overhead = + interintra_mode_cost[mbmi->interintra_mode] + + mode_costs->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] + + mode_costs->wedge_interintra_cost[bsize][1]; + *best_rd += RDCOST(x->rdmult, *rate_overhead + *rate_mv, 0); + + int64_t rd = INT64_MAX; + const int_mv mv0 = mbmi->mv[0]; + // Refine motion vector for NEWMV case. + if (have_newmv_in_inter_mode(mbmi->mode)) { + int rate_sum; + uint8_t skip_txfm_sb; + int64_t dist_sum, skip_sse_sb; + // get negative of mask + const uint8_t *mask = + av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize); + av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv->as_mv, intrapred, + mask, bw, tmp_rate_mv, 0); + if (mbmi->mv[0].as_int != tmp_mv->as_int) { + mbmi->mv[0].as_int = tmp_mv->as_int; + // Set ref_frame[1] to NONE_FRAME temporarily so that the intra + // predictor is not calculated again in av1_enc_build_inter_predictor(). + mbmi->ref_frame[1] = NONE_FRAME; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + mbmi->ref_frame[1] = INTRA_FRAME; + av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf, + xd->plane[AOM_PLANE_Y].dst.stride, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb, + &skip_sse_sb, NULL, NULL, NULL); + rd = + RDCOST(x->rdmult, *tmp_rate_mv + *rate_overhead + rate_sum, dist_sum); + } + } + if (rd >= *best_rd) { + tmp_mv->as_int = mv0.as_int; + *tmp_rate_mv = *rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + RD_STATS rd_stats; + const int64_t mode_rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv, 0); + const int64_t tmp_rd_thresh = best_rd_no_wedge - mode_rd; + rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv + rd_stats.rate, + rd_stats.dist); + } else { + if (*best_rd == INT64_MAX) return IGNORE_MODE; + } + *best_rd = rd; + return 0; +} + +int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, int64_t ref_best_rd, + int *rate_mv, int *tmp_rate2, + const BUFFER_SET *orig_dst) { + const int try_smooth_interintra = + cpi->oxcf.comp_type_cfg.enable_smooth_interintra; + + const int is_wedge_used = av1_is_wedge_used(bsize); + const int try_wedge_interintra = + is_wedge_used && enable_wedge_interintra_search(x, cpi); + + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int bw = block_size_wide[bsize]; + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); + uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Single reference inter prediction + mbmi->ref_frame[1] = NONE_FRAME; + xd->plane[0].dst.buf = tmp_buf; + xd->plane[0].dst.stride = bw; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + const int num_planes = av1_num_planes(cm); + + // Restore the buffers for intra prediction + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = INTRA_FRAME; + 
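A minimal sketch (not part of the upstream patch) of the rd-threshold gating used by the inter-intra search above: compute_rd_thresh() scales the best rd found so far and subtracts the mode-signalling cost, and handle_smooth_inter_intra_mode() abandons inter-intra when the scaled candidate rd already exceeds ref_best_rd. EX_SHIFT and EX_SCALE below are made-up stand-ins for INTER_INTRA_RD_THRESH_SHIFT / INTER_INTRA_RD_THRESH_SCALE, whose real values are defined elsewhere in the encoder headers.

#include <stdint.h>
#include <stdio.h>

#define EX_SHIFT 4 /* stand-in for INTER_INTRA_RD_THRESH_SHIFT */
#define EX_SCALE 9 /* stand-in for INTER_INTRA_RD_THRESH_SCALE */

int main(void) {
  const int64_t ref_best_rd = 120000; /* best rd among modes already tried */
  const int64_t best_rd = 250000;     /* rd of the smooth inter-intra mode */

  /* Same shape as the early return in handle_smooth_inter_intra_mode():
   * scale the candidate rd and compare it against the reference best. */
  if (ref_best_rd < INT64_MAX &&
      (best_rd >> EX_SHIFT) * EX_SCALE > ref_best_rd) {
    printf("prune inter-intra: %lld > %lld\n",
           (long long)((best_rd >> EX_SHIFT) * EX_SCALE),
           (long long)ref_best_rd);
  } else {
    printf("keep refining (wedge search may follow)\n");
  }
  return 0;
}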
INTERINTRA_MODE best_interintra_mode = + args->inter_intra_mode[mbmi->ref_frame[0]]; + + // Compute smooth_interintra + int64_t best_interintra_rd_nowedge = INT64_MAX; + int best_mode_rate = INT_MAX; + if (try_smooth_interintra) { + int ret = handle_smooth_inter_intra_mode( + cpi, x, bsize, mbmi, ref_best_rd, rate_mv, &best_interintra_mode, + &best_interintra_rd_nowedge, &best_mode_rate, orig_dst, tmp_buf, + intrapred, args); + if (ret == IGNORE_MODE) { + return IGNORE_MODE; + } + } + + // Compute wedge interintra + int64_t best_interintra_rd_wedge = INT64_MAX; + const int_mv mv0 = mbmi->mv[0]; + int_mv tmp_mv = mv0; + int tmp_rate_mv = 0; + int rate_overhead = 0; + if (try_wedge_interintra) { + int ret = handle_wedge_inter_intra_mode( + cpi, x, bsize, mbmi, rate_mv, &best_interintra_mode, + &best_interintra_rd_wedge, orig_dst, tmp_buf_, tmp_buf, intrapred_, + intrapred, args, &tmp_rate_mv, &rate_overhead, &tmp_mv, + best_interintra_rd_nowedge); + if (ret == IGNORE_MODE) { + return IGNORE_MODE; + } + } + + if (best_interintra_rd_nowedge == INT64_MAX && + best_interintra_rd_wedge == INT64_MAX) { + return IGNORE_MODE; + } + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->mv[0].as_int = tmp_mv.as_int; + *tmp_rate2 += tmp_rate_mv - *rate_mv; + *rate_mv = tmp_rate_mv; + best_mode_rate = rate_overhead; + } else if (try_smooth_interintra && try_wedge_interintra) { + // If smooth was best, but we over-wrote the values when evaluating the + // wedge mode, we need to recompute the smooth values. + mbmi->use_wedge_interintra = 0; + mbmi->interintra_mode = best_interintra_mode; + mbmi->mv[0].as_int = mv0.as_int; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + *tmp_rate2 += best_mode_rate; + + if (num_planes > 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_U, num_planes - 1); + } + return 0; +} + +// Computes the valid compound_types to be evaluated +static INLINE int compute_valid_comp_types(MACROBLOCK *x, + const AV1_COMP *const cpi, + BLOCK_SIZE bsize, + int masked_compound_used, + int mode_search_mask, + COMPOUND_TYPE *valid_comp_types) { + const AV1_COMMON *cm = &cpi->common; + int valid_type_count = 0; + int comp_type, valid_check; + int8_t enable_masked_type[MASKED_COMPOUND_TYPES] = { 0, 0 }; + + const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE)); + const int try_distwtd_comp = + ((mode_search_mask & (1 << COMPOUND_DISTWTD)) && + cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 && + cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); + + // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases + for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD; + comp_type++) { + valid_check = + (comp_type == COMPOUND_AVERAGE) ? 
try_average_comp : try_distwtd_comp; + if (valid_check && is_interinter_compound_used(comp_type, bsize)) + valid_comp_types[valid_type_count++] = comp_type; + } + // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases + if (masked_compound_used) { + // enable_masked_type[0] corresponds to COMPOUND_WEDGE + // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD + enable_masked_type[0] = enable_wedge_interinter_search(x, cpi); + enable_masked_type[1] = cpi->oxcf.comp_type_cfg.enable_diff_wtd_comp; + for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD; + comp_type++) { + if ((mode_search_mask & (1 << comp_type)) && + is_interinter_compound_used(comp_type, bsize) && + enable_masked_type[comp_type - COMPOUND_WEDGE]) + valid_comp_types[valid_type_count++] = comp_type; + } + } + return valid_type_count; +} + +// Calculates the cost for compound type mask +static INLINE void calc_masked_type_cost( + const ModeCosts *mode_costs, BLOCK_SIZE bsize, int comp_group_idx_ctx, + int comp_index_ctx, int masked_compound_used, int *masked_type_cost) { + av1_zero_array(masked_type_cost, COMPOUND_TYPES); + // Account for group index cost when wedge and/or diffwtd prediction are + // enabled + if (masked_compound_used) { + // Compound group index of average and distwtd is 0 + // Compound group index of wedge and diffwtd is 1 + masked_type_cost[COMPOUND_AVERAGE] += + mode_costs->comp_group_idx_cost[comp_group_idx_ctx][0]; + masked_type_cost[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_AVERAGE]; + masked_type_cost[COMPOUND_WEDGE] += + mode_costs->comp_group_idx_cost[comp_group_idx_ctx][1]; + masked_type_cost[COMPOUND_DIFFWTD] += masked_type_cost[COMPOUND_WEDGE]; + } + + // Compute the cost to signal compound index/type + masked_type_cost[COMPOUND_AVERAGE] += + mode_costs->comp_idx_cost[comp_index_ctx][1]; + masked_type_cost[COMPOUND_DISTWTD] += + mode_costs->comp_idx_cost[comp_index_ctx][0]; + masked_type_cost[COMPOUND_WEDGE] += mode_costs->compound_type_cost[bsize][0]; + masked_type_cost[COMPOUND_DIFFWTD] += + mode_costs->compound_type_cost[bsize][1]; +} + +// Updates mbmi structure with the relevant compound type info +static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi, + COMPOUND_TYPE cur_type) { + mbmi->interinter_comp.type = cur_type; + mbmi->comp_group_idx = (cur_type >= COMPOUND_WEDGE); + mbmi->compound_idx = (cur_type != COMPOUND_DISTWTD); +} + +// When match is found, populate the compound type data +// and calculate the rd cost using the stored stats and +// update the mbmi appropriately. 
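An illustrative sketch (not part of the upstream patch) of the flag mapping that update_mbmi_for_compound_type() above derives: the two masked types share comp_group_idx = 1, and compound_idx is 0 only for the distance-weighted average. The EX_ enum below simply mirrors the order COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD used throughout this file.

#include <stdio.h>

enum { EX_COMPOUND_AVERAGE, EX_COMPOUND_DISTWTD, EX_COMPOUND_WEDGE, EX_COMPOUND_DIFFWTD };

int main(void) {
  const char *names[] = { "AVERAGE", "DISTWTD", "WEDGE", "DIFFWTD" };
  for (int type = EX_COMPOUND_AVERAGE; type <= EX_COMPOUND_DIFFWTD; ++type) {
    const int comp_group_idx = (type >= EX_COMPOUND_WEDGE); /* 1 for masked types */
    const int compound_idx = (type != EX_COMPOUND_DISTWTD); /* 0 only for DISTWTD */
    printf("%-7s comp_group_idx=%d compound_idx=%d\n", names[type],
           comp_group_idx, compound_idx);
  }
  return 0;
}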
+static INLINE int populate_reuse_comp_type_data( + const MACROBLOCK *x, MB_MODE_INFO *mbmi, + BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate, + int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd, + int match_index) { + const int winner_comp_type = + x->comp_rd_stats[match_index].interinter_comp.type; + if (comp_rate[winner_comp_type] == INT_MAX) + return best_type_stats->best_compmode_interinter_cost; + update_mbmi_for_compound_type(mbmi, winner_comp_type); + mbmi->interinter_comp = x->comp_rd_stats[match_index].interinter_comp; + *rd = RDCOST( + x->rdmult, + comp_rs2[winner_comp_type] + *rate_mv + comp_rate[winner_comp_type], + comp_dist[winner_comp_type]); + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + return comp_rs2[winner_comp_type]; +} + +// Updates rd cost and relevant compound type data for the best compound type +static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd, + BEST_COMP_TYPE_STATS *best_type_stats, + int64_t best_rd_cur, + int64_t comp_model_rd_cur, int rs2) { + *rd = best_rd_cur; + best_type_stats->comp_best_model_rd = comp_model_rd_cur; + best_type_stats->best_compound_data = mbmi->interinter_comp; + best_type_stats->best_compmode_interinter_cost = rs2; +} + +// Updates best_mv for masked compound types +static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi, + int_mv *best_mv, int *best_tmp_rate_mv, + int tmp_rate_mv) { + *best_tmp_rate_mv = tmp_rate_mv; + best_mv[0].as_int = mbmi->mv[0].as_int; + best_mv[1].as_int = mbmi->mv[1].as_int; +} + +static INLINE void save_comp_rd_search_stat( + MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate, + const int64_t *comp_dist, const int32_t *comp_model_rate, + const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) { + const int offset = x->comp_rd_stats_idx; + if (offset < MAX_COMP_RD_STATS) { + COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset; + memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate)); + memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist)); + memcpy(rd_stats->model_rate, comp_model_rate, sizeof(rd_stats->model_rate)); + memcpy(rd_stats->model_dist, comp_model_dist, sizeof(rd_stats->model_dist)); + memcpy(rd_stats->comp_rs2, comp_rs2, sizeof(rd_stats->comp_rs2)); + memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv)); + memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames)); + rd_stats->mode = mbmi->mode; + rd_stats->filter = mbmi->interp_filters; + rd_stats->ref_mv_idx = mbmi->ref_mv_idx; + const MACROBLOCKD *const xd = &x->e_mbd; + for (int i = 0; i < 2; ++i) { + const WarpedMotionParams *const wm = + &xd->global_motion[mbmi->ref_frame[i]]; + rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype); + } + memcpy(&rd_stats->interinter_comp, &mbmi->interinter_comp, + sizeof(rd_stats->interinter_comp)); + ++x->comp_rd_stats_idx; + } +} + +static INLINE int get_interinter_compound_mask_rate( + const ModeCosts *const mode_costs, const MB_MODE_INFO *const mbmi) { + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD + if (compound_type == COMPOUND_WEDGE) { + return av1_is_wedge_used(mbmi->bsize) + ? 
av1_cost_literal(1) + + mode_costs + ->wedge_idx_cost[mbmi->bsize] + [mbmi->interinter_comp.wedge_index] + : 0; + } else { + assert(compound_type == COMPOUND_DIFFWTD); + return av1_cost_literal(1); + } +} + +// Takes a backup of rate, distortion and model_rd for future reuse +static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate, + int64_t *comp_dist, int32_t *comp_model_rate, + int64_t *comp_model_dist, int rate_sum, + int64_t dist_sum, RD_STATS *rd_stats, + int *comp_rs2, int rs2) { + comp_rate[cur_type] = rd_stats->rate; + comp_dist[cur_type] = rd_stats->dist; + comp_model_rate[cur_type] = rate_sum; + comp_model_dist[cur_type] = dist_sum; + comp_rs2[cur_type] = rs2; +} + +static INLINE int save_mask_search_results(const PREDICTION_MODE this_mode, + const int reuse_level) { + if (reuse_level || (this_mode == NEW_NEWMV)) + return 1; + else + return 0; +} + +static INLINE int prune_mode_by_skip_rd(const AV1_COMP *const cpi, + MACROBLOCK *x, MACROBLOCKD *xd, + const BLOCK_SIZE bsize, + int64_t ref_skip_rd, int mode_rate) { + int eval_txfm = 1; + const int txfm_rd_gate_level = + get_txfm_rd_gate_level(cpi->common.seq_params->enable_masked_compound, + cpi->sf.inter_sf.txfm_rd_gate_level, bsize, + TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0); + // Check if the mode is good enough based on skip rd + if (txfm_rd_gate_level) { + int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize); + int64_t skip_rd = RDCOST(x->rdmult, mode_rate, (sse_y << 4)); + eval_txfm = + check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, txfm_rd_gate_level, 1); + } + return eval_txfm; +} + +static int64_t masked_compound_type_rd( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2, + int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, + uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides, + int mode_rate, int64_t rd_thresh, int *calc_pred_masked_compound, + int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate, + int64_t *comp_model_dist, const int64_t comp_best_model_rd, + int64_t *const comp_model_rd_cur, int *comp_rs2, int64_t ref_skip_rd) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int64_t best_rd_cur = INT64_MAX; + int64_t rd = INT64_MAX; + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD + assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD); + int rate_sum; + uint8_t tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge, + pick_interinter_seg }; + + // TODO(any): Save pred and mask calculation as well into records. However + // this may increase memory requirements as compound segment mask needs to be + // stored in each record. 
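A small sketch (not part of the upstream patch) of the rate units used by get_interinter_compound_mask_rate() above. Rates are fixed-point bits: av1_cost_literal(n), defined later in this patch in av1/encoder/cost.h, charges n bits at probability 1/2, i.e. n << AV1_PROB_COST_SHIFT. A wedge therefore pays one literal bit plus a CDF-derived wedge-index cost, while COMPOUND_DIFFWTD pays a single literal bit. The wedge_idx_cost value below is an invented example; EX_AV1_PROB_COST_SHIFT mirrors the constant from cost.h.

#include <stdio.h>

#define EX_AV1_PROB_COST_SHIFT 9
#define ex_cost_literal(n) ((n) * (1 << EX_AV1_PROB_COST_SHIFT))

int main(void) {
  const int wedge_idx_cost = 1700; /* example value as read from a CDF table */
  const int wedge_rate = ex_cost_literal(1) + wedge_idx_cost;
  const int diffwtd_rate = ex_cost_literal(1);
  printf("wedge mask rate   = %d (1/512-bit units)\n", wedge_rate);
  printf("diffwtd mask rate = %d\n", diffwtd_rate);
  return 0;
}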
+ if (*calc_pred_masked_compound) { + get_inter_predictors_masked_compound(x, bsize, preds0, preds1, residual1, + diff10, strides); + *calc_pred_masked_compound = 0; + } + if (compound_type == COMPOUND_WEDGE) { + unsigned int sse; + if (is_cur_buf_hbd(xd)) + (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides, + CONVERT_TO_BYTEPTR(*preds1), *strides, + &sse); + else + (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, + &sse); + const unsigned int mse = + ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]); + // If two predictors are very similar, skip wedge compound mode search + if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) { + *comp_model_rd_cur = INT64_MAX; + return INT64_MAX; + } + } + // Function pointer to pick the appropriate mask + // compound_type == COMPOUND_WEDGE, calls pick_interinter_wedge() + // compound_type == COMPOUND_DIFFWTD, calls pick_interinter_seg() + uint64_t cur_sse = UINT64_MAX; + best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE]( + cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse); + *rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0); + assert(cur_sse != UINT64_MAX); + int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4)); + + // Although the true rate_mv might be different after motion search, but it + // is unlikely to be the best mode considering the transform rd cost and other + // mode overhead cost + int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0); + if (mode_rd > rd_thresh) { + *comp_model_rd_cur = INT64_MAX; + return INT64_MAX; + } + + // Check if the mode is good enough based on skip rd + // TODO(nithya): Handle wedge_newmv_search if extending for lower speed + // setting + const int txfm_rd_gate_level = + get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound, + cpi->sf.inter_sf.txfm_rd_gate_level, bsize, + TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0); + if (txfm_rd_gate_level) { + int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur, + txfm_rd_gate_level, 1); + if (!eval_txfm) { + *comp_model_rd_cur = INT64_MAX; + return INT64_MAX; + } + } + + // Compute cost if matching record not found, else, reuse data + if (comp_rate[compound_type] == INT_MAX) { + // Check whether new MV search for wedge is to be done + int wedge_newmv_search = + have_newmv_in_inter_mode(this_mode) && + (compound_type == COMPOUND_WEDGE) && + (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search); + + // Search for new MV if needed and build predictor + if (wedge_newmv_search) { + *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, + bsize, this_mode); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } else { + *out_rate_mv = rate_mv; + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, + preds1, strides); + } + // Get the RD cost from model RD + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, + &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); + *comp_model_rd_cur = rd; + // Override with best if current is worse than best for new MV + if (wedge_newmv_search) { + if (rd >= best_rd_cur) { + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + *out_rate_mv = rate_mv; + 
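A standalone sketch (not part of the upstream patch) of the predictor-similarity check above that skips the wedge search: the SSE between the two single-reference predictors is normalised to a per-pixel MSE, and a wedge mask is not worth searching when the predictors are nearly identical. The block size and SSE values below are invented, and EX_ROUND_POWER_OF_TWO is a local copy of the rounding-shift helper the library uses.

#include <stdint.h>
#include <stdio.h>

#define EX_ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

int main(void) {
  const unsigned int sse = 5800; /* SSE between the two predictors */
  const int num_pels_log2 = 10;  /* e.g. a 32x32 block: 1024 pixels */
  const int have_newmv = 0;
  const unsigned int mse = EX_ROUND_POWER_OF_TWO(sse, num_pels_log2);
  /* Same test as the code above: always skip below 8, and below 64 when no
   * new motion vector will be searched for this mode. */
  const int skip_wedge = (mse < 8) || (!have_newmv && mse < 64);
  printf("mse=%u -> %s wedge search\n", mse, skip_wedge ? "skip" : "run");
  return 0;
}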
av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, + strides, preds1, strides); + *comp_model_rd_cur = best_rd_cur; + } + } + if (cpi->sf.inter_sf.prune_comp_type_by_model_rd && + (*comp_model_rd_cur > comp_best_model_rd) && + comp_best_model_rd != INT64_MAX) { + *comp_model_rd_cur = INT64_MAX; + return INT64_MAX; + } + // Compute RD cost for the current type + RD_STATS rd_stats; + const int64_t tmp_mode_rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0); + const int64_t tmp_rd_thresh = rd_thresh - tmp_mode_rd; + rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = + RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist); + // Backup rate and distortion for future reuse + backup_stats(compound_type, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, rate_sum, dist_sum, &rd_stats, comp_rs2, + *rs2); + } + } else { + // Reuse data as matching record is found + assert(comp_dist[compound_type] != INT64_MAX); + // When disable_interinter_wedge_newmv_search is set, motion refinement is + // disabled. Hence rate and distortion can be reused in this case as well + assert(IMPLIES((have_newmv_in_inter_mode(this_mode) && + (compound_type == COMPOUND_WEDGE)), + cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)); + assert(mbmi->mv[0].as_int == cur_mv[0].as_int); + assert(mbmi->mv[1].as_int == cur_mv[1].as_int); + *out_rate_mv = rate_mv; + // Calculate RD cost based on stored stats + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type], + comp_dist[compound_type]); + // Recalculate model rdcost with the updated rate + *comp_model_rd_cur = + RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_model_rate[compound_type], + comp_model_dist[compound_type]); + } + return rd; +} + +// scaling values to be used for gating wedge/compound segment based on best +// approximate rd +static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 }; +static int comp_type_rd_threshold_div[3] = { 3, 16, 16 }; + +int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + HandleInterModeArgs *args, BLOCK_SIZE bsize, + int_mv *cur_mv, int mode_search_mask, + int masked_compound_used, const BUFFER_SET *orig_dst, + const BUFFER_SET *tmp_dst, + const CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t ref_skip_rd, int *is_luma_interp_done, + int64_t rd_thresh) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + int ref_frame = av1_ref_frame_type(mbmi->ref_frame); + const int bw = block_size_wide[bsize]; + int rs2; + int_mv best_mv[2]; + int best_tmp_rate_mv = *rate_mv; + BEST_COMP_TYPE_STATS best_type_stats; + // Initializing BEST_COMP_TYPE_STATS + best_type_stats.best_compound_data.type = COMPOUND_AVERAGE; + best_type_stats.best_compmode_interinter_cost = 0; + best_type_stats.comp_best_model_rd = INT64_MAX; + + uint8_t *preds0[1] = { buffers->pred0 }; + uint8_t *preds1[1] = { buffers->pred1 }; + int strides[1] = { bw }; + int tmp_rate_mv; + COMPOUND_TYPE cur_type; + // Local array to store the mask cost for different compound types + int masked_type_cost[COMPOUND_TYPES]; + + int calc_pred_masked_compound = 1; + int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX }; + int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int comp_rs2[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int32_t 
comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, + INT_MAX }; + int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX }; + int match_index = 0; + const int match_found = + find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, comp_rs2, &match_index); + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + *rd = INT64_MAX; + + // Local array to store the valid compound types to be evaluated in the core + // loop + COMPOUND_TYPE valid_comp_types[COMPOUND_TYPES] = { + COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD + }; + int valid_type_count = 0; + // compute_valid_comp_types() returns the number of valid compound types to be + // evaluated and populates the same in the local array valid_comp_types[]. + // It also sets the flag 'try_average_and_distwtd_comp' + valid_type_count = compute_valid_comp_types( + x, cpi, bsize, masked_compound_used, mode_search_mask, valid_comp_types); + + // The following context indices are independent of compound type + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); + const int comp_index_ctx = get_comp_index_context(cm, xd); + + // Populates masked_type_cost local array for the 4 compound types + calc_masked_type_cost(&x->mode_costs, bsize, comp_group_idx_ctx, + comp_index_ctx, masked_compound_used, masked_type_cost); + + int64_t comp_model_rd_cur = INT64_MAX; + int64_t best_rd_cur = ref_best_rd; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // If the match is found, calculate the rd cost using the + // stored stats and update the mbmi appropriately. + if (match_found && cpi->sf.inter_sf.reuse_compound_type_decision) { + return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv, + comp_rate, comp_dist, comp_rs2, + rate_mv, rd, match_index); + } + + // If COMPOUND_AVERAGE is not valid, use the spare buffer + if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + + // Loop over valid compound types + for (int i = 0; i < valid_type_count; i++) { + cur_type = valid_comp_types[i]; + + if (args->cmp_mode[ref_frame] == COMPOUND_AVERAGE) { + if (cur_type == COMPOUND_WEDGE) continue; + } + + comp_model_rd_cur = INT64_MAX; + tmp_rate_mv = *rate_mv; + best_rd_cur = INT64_MAX; + ref_best_rd = AOMMIN(ref_best_rd, *rd); + update_mbmi_for_compound_type(mbmi, cur_type); + rs2 = masked_type_cost[cur_type]; + + int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); + if (mode_rd >= ref_best_rd) continue; + + // Derive the flags to indicate enabling/disabling of MV refinement process. 
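An illustrative sketch (not part of the upstream patch) of the caching pattern set up just above: per-compound-type rate/distortion arrays are initialised to INT_MAX / INT64_MAX, so comp_rate[type] == INT_MAX means "not computed yet". masked_compound_type_rd() and the reuse path fill the arrays via backup_stats() and read them back instead of repeating the transform search. The concrete numbers below are invented.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define EX_COMPOUND_TYPES 4

int main(void) {
  int32_t comp_rate[EX_COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
  int64_t comp_dist[EX_COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
                                           INT64_MAX };
  const int type = 2; /* e.g. the wedge compound type */

  if (comp_rate[type] == INT_MAX) {
    /* No cached result: run the (expensive) estimate once and back it up,
     * mirroring what backup_stats() does. */
    comp_rate[type] = 1234;
    comp_dist[type] = 56789;
    printf("computed and cached stats for type %d\n", type);
  } else {
    /* Cached result available: reuse it, mirroring the reuse branch of
     * masked_compound_type_rd(). */
    printf("reused rate=%d dist=%lld\n", (int)comp_rate[type],
           (long long)comp_dist[type]);
  }
  return 0;
}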
+ const int enable_fast_compound_mode_search = + cpi->sf.inter_sf.enable_fast_compound_mode_search; + const bool skip_mv_refinement_for_avg_distwtd = + enable_fast_compound_mode_search == 3 || + (enable_fast_compound_mode_search == 2 && (this_mode != NEW_NEWMV)); + const bool skip_mv_refinement_for_diffwtd = + (!enable_fast_compound_mode_search && cur_type == COMPOUND_DIFFWTD); + + // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD + if (cur_type < COMPOUND_WEDGE) { + if (skip_mv_refinement_for_avg_distwtd) { + int rate_sum; + uint8_t tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + + // Reuse data if matching record is found + if (comp_rate[cur_type] == INT_MAX) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1; + // Compute RD cost for the current type + RD_STATS est_rd_stats; + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd; + int64_t est_rd = INT64_MAX; + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + // Evaluate further if skip rd is low enough + if (eval_txfm) { + est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, + &est_rd_stats); + } + if (est_rd != INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + comp_model_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + // Backup rate and distortion for future reuse + backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, rate_sum, dist_sum, &est_rd_stats, + comp_rs2, rs2); + } + } else { + // Calculate RD cost based on stored stats + assert(comp_dist[cur_type] != INT64_MAX); + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type], + comp_dist[cur_type]); + // Recalculate model rdcost with the updated rate + comp_model_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type], + comp_model_dist[cur_type]); + } + } else { + tmp_rate_mv = *rate_mv; + if (have_newmv_in_inter_mode(this_mode)) { + InterPredParams inter_pred_params; + av1_dist_wtd_comp_weight_assign( + &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset, + &inter_pred_params.conv_params.bck_offset, + &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1); + int mask_value = inter_pred_params.conv_params.fwd_offset * 4; + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); + tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, + bsize, this_mode); + } + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1; + + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + if (eval_txfm) { + RD_STATS est_rd_stats; + estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats); + + best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + } + + // use spare buffer for following compound type try + if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + } else if (cur_type == COMPOUND_WEDGE) { + int best_mask_index = 0; + int best_wedge_sign = 0; + int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] }; + int best_rs2 = 0; + int best_rate_mv = *rate_mv; + int wedge_mask_size = 
get_wedge_types_lookup(bsize); + int need_mask_search = args->wedge_index == -1; + int wedge_newmv_search = + have_newmv_in_inter_mode(this_mode) && + !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search; + + if (need_mask_search && !wedge_newmv_search) { + // short cut repeated single reference block build + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, + preds0, strides); + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, + preds1, strides); + } + + for (int wedge_mask = 0; wedge_mask < wedge_mask_size && need_mask_search; + ++wedge_mask) { + for (int wedge_sign = 0; wedge_sign < 2; ++wedge_sign) { + tmp_rate_mv = *rate_mv; + mbmi->interinter_comp.wedge_index = wedge_mask; + mbmi->interinter_comp.wedge_sign = wedge_sign; + rs2 = masked_type_cost[cur_type]; + rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + + mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); + if (mode_rd >= ref_best_rd / 2) continue; + + if (wedge_newmv_search) { + tmp_rate_mv = av1_interinter_compound_motion_search( + cpi, x, cur_mv, bsize, this_mode); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, + bsize, AOM_PLANE_Y, AOM_PLANE_Y); + } else { + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, + strides, preds1, strides); + } + + RD_STATS est_rd_stats; + int64_t this_rd_cur = INT64_MAX; + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + if (eval_txfm) { + this_rd_cur = estimate_yrd_for_sb( + cpi, bsize, x, AOMMIN(best_rd_cur, ref_best_rd), &est_rd_stats); + } + if (this_rd_cur < INT64_MAX) { + this_rd_cur = + RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + if (this_rd_cur < best_rd_cur) { + best_mask_index = wedge_mask; + best_wedge_sign = wedge_sign; + best_rd_cur = this_rd_cur; + tmp_mv[0] = mbmi->mv[0]; + tmp_mv[1] = mbmi->mv[1]; + best_rate_mv = tmp_rate_mv; + best_rs2 = rs2; + } + } + // Consider the asymmetric partitions for oblique angle only if the + // corresponding symmetric partition is the best so far. + // Note: For horizontal and vertical types, both symmetric and + // asymmetric partitions are always considered. + if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) { + // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16] + // correspond to symmetric partitions of the 4 oblique angles, the + // next 4 entries correspond to the vertical/horizontal + // symmetric/asymmetric partitions and the last 8 entries correspond + // to the asymmetric partitions of oblique types. + const int idx_before_asym_oblique = 7; + const int last_oblique_sym_idx = 3; + if (wedge_mask == idx_before_asym_oblique) { + if (best_mask_index > last_oblique_sym_idx) { + break; + } else { + // Asymmetric (Index-1) map for the corresponding oblique masks. 
+ // WEDGE_OBLIQUE27: sym - 0, asym - 8, 9 + // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13 + // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15 + // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11 + const int asym_mask_idx[4] = { 7, 11, 13, 9 }; + wedge_mask = asym_mask_idx[best_mask_index]; + wedge_mask_size = wedge_mask + 3; + } + } + } + } + + if (need_mask_search) { + if (save_mask_search_results( + this_mode, cpi->sf.inter_sf.reuse_mask_search_results)) { + args->wedge_index = best_mask_index; + args->wedge_sign = best_wedge_sign; + } + } else { + mbmi->interinter_comp.wedge_index = args->wedge_index; + mbmi->interinter_comp.wedge_sign = args->wedge_sign; + rs2 = masked_type_cost[cur_type]; + rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + + if (wedge_newmv_search) { + tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, + bsize, this_mode); + } + + best_mask_index = args->wedge_index; + best_wedge_sign = args->wedge_sign; + tmp_mv[0] = mbmi->mv[0]; + tmp_mv[1] = mbmi->mv[1]; + best_rate_mv = tmp_rate_mv; + best_rs2 = masked_type_cost[cur_type]; + best_rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + best_rs2 + *rate_mv); + if (eval_txfm) { + RD_STATS est_rd_stats; + estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats); + best_rd_cur = + RDCOST(x->rdmult, best_rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + } + + mbmi->interinter_comp.wedge_index = best_mask_index; + mbmi->interinter_comp.wedge_sign = best_wedge_sign; + mbmi->mv[0] = tmp_mv[0]; + mbmi->mv[1] = tmp_mv[1]; + tmp_rate_mv = best_rate_mv; + rs2 = best_rs2; + } else if (skip_mv_refinement_for_diffwtd) { + int_mv tmp_mv[2]; + int best_mask_index = 0; + rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + + int need_mask_search = args->diffwtd_index == -1; + + for (int mask_index = 0; mask_index < 2 && need_mask_search; + ++mask_index) { + tmp_rate_mv = *rate_mv; + mbmi->interinter_comp.mask_type = mask_index; + if (have_newmv_in_inter_mode(this_mode)) { + // hard coded number for diff wtd + int mask_value = mask_index == 0 ? 38 : 26; + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); + tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, + bsize, this_mode); + } + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + RD_STATS est_rd_stats; + int64_t this_rd_cur = INT64_MAX; + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + if (eval_txfm) { + this_rd_cur = + estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats); + } + if (this_rd_cur < INT64_MAX) { + this_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + + if (this_rd_cur < best_rd_cur) { + best_rd_cur = this_rd_cur; + best_mask_index = mbmi->interinter_comp.mask_type; + tmp_mv[0] = mbmi->mv[0]; + tmp_mv[1] = mbmi->mv[1]; + } + } + + if (need_mask_search) { + if (save_mask_search_results(this_mode, 0)) + args->diffwtd_index = best_mask_index; + } else { + mbmi->interinter_comp.mask_type = args->diffwtd_index; + rs2 = masked_type_cost[cur_type]; + rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); + + int mask_value = mbmi->interinter_comp.mask_type == 0 ? 
38 : 26; + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); + + if (have_newmv_in_inter_mode(this_mode)) { + tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, + bsize, this_mode); + } + best_mask_index = mbmi->interinter_comp.mask_type; + tmp_mv[0] = mbmi->mv[0]; + tmp_mv[1] = mbmi->mv[1]; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + RD_STATS est_rd_stats; + int64_t this_rd_cur = INT64_MAX; + int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd, + rs2 + *rate_mv); + if (eval_txfm) { + this_rd_cur = + estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats); + } + if (this_rd_cur < INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + } + } + + mbmi->interinter_comp.mask_type = best_mask_index; + mbmi->mv[0] = tmp_mv[0]; + mbmi->mv[1] = tmp_mv[1]; + } else { + // Handle masked compound types + bool eval_masked_comp_type = true; + if (*rd != INT64_MAX) { + // Factors to control gating of compound type selection based on best + // approximate rd so far + const int max_comp_type_rd_threshold_mul = + comp_type_rd_threshold_mul[cpi->sf.inter_sf + .prune_comp_type_by_comp_avg]; + const int max_comp_type_rd_threshold_div = + comp_type_rd_threshold_div[cpi->sf.inter_sf + .prune_comp_type_by_comp_avg]; + // Evaluate COMPOUND_WEDGE / COMPOUND_DIFFWTD if approximated cost is + // within threshold + const int64_t approx_rd = ((*rd / max_comp_type_rd_threshold_div) * + max_comp_type_rd_threshold_mul); + if (approx_rd >= ref_best_rd) eval_masked_comp_type = false; + } + + if (eval_masked_comp_type) { + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh); + best_rd_cur = masked_compound_type_rd( + cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, + &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10, + strides, rd_stats->rate, tmp_rd_thresh, &calc_pred_masked_compound, + comp_rate, comp_dist, comp_model_rate, comp_model_dist, + best_type_stats.comp_best_model_rd, &comp_model_rd_cur, comp_rs2, + ref_skip_rd); + } + } + + // Update stats for best compound type + if (best_rd_cur < *rd) { + update_best_info(mbmi, rd, &best_type_stats, best_rd_cur, + comp_model_rd_cur, rs2); + if (have_newmv_in_inter_mode(this_mode)) + update_mask_best_mv(mbmi, best_mv, &best_tmp_rate_mv, tmp_rate_mv); + } + // reset to original mvs for next iteration + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } + + mbmi->comp_group_idx = + (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 
0 : 1; + mbmi->compound_idx = + !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD); + mbmi->interinter_comp = best_type_stats.best_compound_data; + + if (have_newmv_in_inter_mode(this_mode)) { + mbmi->mv[0].as_int = best_mv[0].as_int; + mbmi->mv[1].as_int = best_mv[1].as_int; + rd_stats->rate += best_tmp_rate_mv - *rate_mv; + *rate_mv = best_tmp_rate_mv; + } + + if (this_mode == NEW_NEWMV) + args->cmp_mode[ref_frame] = mbmi->interinter_comp.type; + + restore_dst_buf(xd, *orig_dst, 1); + if (!match_found) + save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, cur_mv, comp_rs2); + return best_type_stats.best_compmode_interinter_cost; +} diff --git a/third_party/aom/av1/encoder/compound_type.h b/third_party/aom/av1/encoder/compound_type.h new file mode 100644 index 0000000000..a028a35093 --- /dev/null +++ b/third_party/aom/av1/encoder/compound_type.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_COMPOUND_TYPE_H_ +#define AOM_AV1_ENCODER_COMPOUND_TYPE_H_ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/interp_search.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Structure to store the compound type related stats for best compound type +typedef struct { + INTERINTER_COMPOUND_DATA best_compound_data; + int64_t comp_best_model_rd; + int best_compmode_interinter_cost; +} BEST_COMP_TYPE_STATS; + +#define IGNORE_MODE -1 +// Searches for the best inter-intra mode. Returns IGNORE_MODE if no good mode +// is found, 0 otherwise. +int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, int64_t ref_best_rd, + int *rate_mv, int *tmp_rate2, + const BUFFER_SET *orig_dst); + +int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + HandleInterModeArgs *args, BLOCK_SIZE bsize, + int_mv *cur_mv, int mode_search_mask, + int masked_compound_used, const BUFFER_SET *orig_dst, + const BUFFER_SET *tmp_dst, + const CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t ref_skip_rd, int *is_luma_interp_done, + int64_t rd_thresh); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_COMPOUND_TYPE_H_ diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c new file mode 100644 index 0000000000..aafe55d2d0 --- /dev/null +++ b/third_party/aom/av1/encoder/context_tree.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rd.h" +#include + +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx) { + dst_ctx->mic = src_ctx->mic; + dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best; + + dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk; + dst_ctx->skippable = src_ctx->skippable; +#if CONFIG_INTERNAL_STATS + dst_ctx->best_mode_index = src_ctx->best_mode_index; +#endif // CONFIG_INTERNAL_STATS + + memcpy(dst_ctx->blk_skip, src_ctx->blk_skip, + sizeof(uint8_t) * src_ctx->num_4x4_blk); + av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map, + src_ctx->num_4x4_blk); + + dst_ctx->rd_stats = src_ctx->rd_stats; + dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready; +} + +void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params, + PC_TREE_SHARED_BUFFERS *shared_bufs, + struct aom_internal_error_info *error) { + const int num_planes = seq_params->monochrome ? 1 : MAX_MB_PLANE; + const int max_sb_square_y = 1 << num_pels_log2_lookup[seq_params->sb_size]; + const int max_sb_square_uv = max_sb_square_y >> (seq_params->subsampling_x + + seq_params->subsampling_y); + for (int i = 0; i < num_planes; i++) { + const int max_num_pix = + (i == AOM_PLANE_Y) ? max_sb_square_y : max_sb_square_uv; + AOM_CHECK_MEM_ERROR(error, shared_bufs->coeff_buf[i], + aom_memalign(32, max_num_pix * sizeof(tran_low_t))); + AOM_CHECK_MEM_ERROR(error, shared_bufs->qcoeff_buf[i], + aom_memalign(32, max_num_pix * sizeof(tran_low_t))); + AOM_CHECK_MEM_ERROR(error, shared_bufs->dqcoeff_buf[i], + aom_memalign(32, max_num_pix * sizeof(tran_low_t))); + } +} + +void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs) { + for (int i = 0; i < 3; i++) { + aom_free(shared_bufs->coeff_buf[i]); + aom_free(shared_bufs->qcoeff_buf[i]); + aom_free(shared_bufs->dqcoeff_buf[i]); + shared_bufs->coeff_buf[i] = NULL; + shared_bufs->qcoeff_buf[i] = NULL; + shared_bufs->dqcoeff_buf[i] = NULL; + } +} + +PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi, + BLOCK_SIZE bsize, + PC_TREE_SHARED_BUFFERS *shared_bufs) { + PICK_MODE_CONTEXT *volatile ctx = NULL; + const AV1_COMMON *const cm = &cpi->common; + struct aom_internal_error_info error; + + if (setjmp(error.jmp)) { + av1_free_pmc(ctx, av1_num_planes(cm)); + return NULL; + } + error.setjmp = 1; + + AOM_CHECK_MEM_ERROR(&error, ctx, aom_calloc(1, sizeof(*ctx))); + ctx->rd_mode_is_ready = 0; + + const int num_planes = av1_num_planes(cm); + const int num_pix = block_size_wide[bsize] * block_size_high[bsize]; + const int num_blk = num_pix / 16; + + AOM_CHECK_MEM_ERROR(&error, ctx->blk_skip, + aom_calloc(num_blk, sizeof(*ctx->blk_skip))); + AOM_CHECK_MEM_ERROR(&error, ctx->tx_type_map, + aom_calloc(num_blk, sizeof(*ctx->tx_type_map))); + ctx->num_4x4_blk = num_blk; + + for (int i = 0; i < num_planes; ++i) { + ctx->coeff[i] = shared_bufs->coeff_buf[i]; + ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i]; + ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i]; + AOM_CHECK_MEM_ERROR(&error, ctx->eobs[i], + aom_memalign(32, num_blk * sizeof(*ctx->eobs[i]))); + AOM_CHECK_MEM_ERROR( + &error, ctx->txb_entropy_ctx[i], + aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i]))); + } + + if (num_pix <= MAX_PALETTE_SQUARE) { + for (int i = 0; i < 2; ++i) { + if (cm->features.allow_screen_content_tools) { 
+ AOM_CHECK_MEM_ERROR( + &error, ctx->color_index_map[i], + aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); + } else { + ctx->color_index_map[i] = NULL; + } + } + } + + av1_invalid_rd_stats(&ctx->rd_stats); + + return ctx; +} + +void av1_reset_pmc(PICK_MODE_CONTEXT *ctx) { + av1_zero_array(ctx->blk_skip, ctx->num_4x4_blk); + av1_zero_array(ctx->tx_type_map, ctx->num_4x4_blk); + av1_invalid_rd_stats(&ctx->rd_stats); +} + +void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes) { + if (ctx == NULL) return; + + aom_free(ctx->blk_skip); + ctx->blk_skip = NULL; + aom_free(ctx->tx_type_map); + for (int i = 0; i < num_planes; ++i) { + ctx->coeff[i] = NULL; + ctx->qcoeff[i] = NULL; + ctx->dqcoeff[i] = NULL; + aom_free(ctx->eobs[i]); + ctx->eobs[i] = NULL; + aom_free(ctx->txb_entropy_ctx[i]); + ctx->txb_entropy_ctx[i] = NULL; + } + + for (int i = 0; i < 2; ++i) { + if (ctx->color_index_map[i]) { + aom_free(ctx->color_index_map[i]); + ctx->color_index_map[i] = NULL; + } + } + + aom_free(ctx); +} + +PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize) { + PC_TREE *pc_tree = aom_calloc(1, sizeof(*pc_tree)); + if (pc_tree == NULL) return NULL; + + pc_tree->partitioning = PARTITION_NONE; + pc_tree->block_size = bsize; + + return pc_tree; +} + +#define FREE_PMC_NODE(CTX) \ + do { \ + av1_free_pmc(CTX, num_planes); \ + CTX = NULL; \ + } while (0) + +void av1_free_pc_tree_recursive(PC_TREE *pc_tree, int num_planes, int keep_best, + int keep_none, + PARTITION_SEARCH_TYPE partition_search_type) { + if (pc_tree == NULL) return; + + // Avoid freeing of extended partitions as they are not supported when + // partition_search_type is VAR_BASED_PARTITION. + if (partition_search_type == VAR_BASED_PARTITION && !keep_best && + !keep_none) { + FREE_PMC_NODE(pc_tree->none); + + for (int i = 0; i < 2; ++i) { + FREE_PMC_NODE(pc_tree->horizontal[i]); + FREE_PMC_NODE(pc_tree->vertical[i]); + } + +#if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY + for (int i = 0; i < 3; ++i) { + assert(pc_tree->horizontala[i] == NULL); + assert(pc_tree->horizontalb[i] == NULL); + assert(pc_tree->verticala[i] == NULL); + assert(pc_tree->verticalb[i] == NULL); + } + for (int i = 0; i < 4; ++i) { + assert(pc_tree->horizontal4[i] == NULL); + assert(pc_tree->vertical4[i] == NULL); + } +#endif + + for (int i = 0; i < 4; ++i) { + if (pc_tree->split[i] != NULL) { + av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0, + partition_search_type); + pc_tree->split[i] = NULL; + } + } + aom_free(pc_tree); + return; + } + + const PARTITION_TYPE partition = pc_tree->partitioning; + + if (!keep_none && (!keep_best || (partition != PARTITION_NONE))) + FREE_PMC_NODE(pc_tree->none); + + for (int i = 0; i < 2; ++i) { + if (!keep_best || (partition != PARTITION_HORZ)) + FREE_PMC_NODE(pc_tree->horizontal[i]); + if (!keep_best || (partition != PARTITION_VERT)) + FREE_PMC_NODE(pc_tree->vertical[i]); + } +#if !CONFIG_REALTIME_ONLY + for (int i = 0; i < 3; ++i) { + if (!keep_best || (partition != PARTITION_HORZ_A)) + FREE_PMC_NODE(pc_tree->horizontala[i]); + if (!keep_best || (partition != PARTITION_HORZ_B)) + FREE_PMC_NODE(pc_tree->horizontalb[i]); + if (!keep_best || (partition != PARTITION_VERT_A)) + FREE_PMC_NODE(pc_tree->verticala[i]); + if (!keep_best || (partition != PARTITION_VERT_B)) + FREE_PMC_NODE(pc_tree->verticalb[i]); + } + for (int i = 0; i < 4; ++i) { + if (!keep_best || (partition != PARTITION_HORZ_4)) + FREE_PMC_NODE(pc_tree->horizontal4[i]); + if (!keep_best || (partition != PARTITION_VERT_4)) + 
FREE_PMC_NODE(pc_tree->vertical4[i]); + } +#endif + if (!keep_best || (partition != PARTITION_SPLIT)) { + for (int i = 0; i < 4; ++i) { + if (pc_tree->split[i] != NULL) { + av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0, + partition_search_type); + pc_tree->split[i] = NULL; + } + } + } + + if (!keep_best && !keep_none) aom_free(pc_tree); +} + +int av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) { + // The structure 'sms_tree' is used to store the simple motion search data for + // partition pruning in inter frames. Hence, the memory allocations and + // initializations related to it are avoided for allintra encoding mode. + if (cpi->oxcf.kf_cfg.key_freq_max == 0) return 0; + + AV1_COMMON *const cm = &cpi->common; + const int stat_generation_stage = is_stat_generation_stage(cpi); + const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; + const int tree_nodes = + av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); + int sms_tree_index = 0; + SIMPLE_MOTION_DATA_TREE *this_sms; + int square_index = 1; + int nodes; + + aom_free(td->sms_tree); + td->sms_tree = + (SIMPLE_MOTION_DATA_TREE *)aom_calloc(tree_nodes, sizeof(*td->sms_tree)); + if (!td->sms_tree) return -1; + this_sms = &td->sms_tree[0]; + + if (!stat_generation_stage) { + const int leaf_factor = is_sb_size_128 ? 4 : 1; + const int leaf_nodes = 256 * leaf_factor; + + // Sets up all the leaf nodes in the tree. + for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) { + SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index]; + tree->block_size = square[0]; + } + + // Each node has 4 leaf nodes, fill each block_size level of the tree + // from leafs to the root. + for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { + for (int i = 0; i < nodes; ++i) { + SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index]; + tree->block_size = square[square_index]; + for (int j = 0; j < 4; j++) tree->split[j] = this_sms++; + ++sms_tree_index; + } + ++square_index; + } + } else { + // Allocation for firstpass/LAP stage + // TODO(Mufaddal): refactor square_index to use a common block_size macro + // from firstpass.c + SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index]; + square_index = 2; + tree->block_size = square[square_index]; + } + + // Set up the root node for the largest superblock size + td->sms_root = &td->sms_tree[tree_nodes - 1]; + return 0; +} + +void av1_free_sms_tree(ThreadData *td) { + aom_free(td->sms_tree); + td->sms_tree = NULL; +} diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h new file mode 100644 index 0000000000..0be7ccbb54 --- /dev/null +++ b/third_party/aom/av1/encoder/context_tree.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_ +#define AOM_AV1_ENCODER_CONTEXT_TREE_H_ + +#include "config/aom_config.h" + +#include "av1/common/blockd.h" +#include "av1/encoder/block.h" +#include "av1/encoder/speed_features.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_PRIMARY; +struct AV1_COMP; +struct AV1Common; +struct ThreadData; + +typedef struct { + tran_low_t *coeff_buf[MAX_MB_PLANE]; + tran_low_t *qcoeff_buf[MAX_MB_PLANE]; + tran_low_t *dqcoeff_buf[MAX_MB_PLANE]; +} PC_TREE_SHARED_BUFFERS; + +// Structure to hold snapshot of coding context during the mode picking process +typedef struct PICK_MODE_CONTEXT { + MB_MODE_INFO mic; + MB_MODE_INFO_EXT_FRAME mbmi_ext_best; + uint8_t *color_index_map[2]; + uint8_t *blk_skip; + + tran_low_t *coeff[MAX_MB_PLANE]; + tran_low_t *qcoeff[MAX_MB_PLANE]; + tran_low_t *dqcoeff[MAX_MB_PLANE]; + uint16_t *eobs[MAX_MB_PLANE]; + uint8_t *txb_entropy_ctx[MAX_MB_PLANE]; + uint8_t *tx_type_map; + + int num_4x4_blk; + // For current partition, only if all Y, U, and V transform blocks' + // coefficients are quantized to 0, skippable is set to 1. + int skippable; +#if CONFIG_INTERNAL_STATS + THR_MODES best_mode_index; +#endif // CONFIG_INTERNAL_STATS + RD_STATS rd_stats; + + int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has + // been made. +#if CONFIG_AV1_TEMPORAL_DENOISING + int64_t newmv_sse; + int64_t zeromv_sse; + int64_t zeromv_lastref_sse; + PREDICTION_MODE best_sse_inter_mode; + int_mv best_sse_mv; + MV_REFERENCE_FRAME best_reference_frame; + MV_REFERENCE_FRAME best_zeromv_reference_frame; + int sb_skip_denoising; +#endif +} PICK_MODE_CONTEXT; + +typedef struct PC_TREE { + PARTITION_TYPE partitioning; + BLOCK_SIZE block_size; + PICK_MODE_CONTEXT *none; + PICK_MODE_CONTEXT *horizontal[2]; + PICK_MODE_CONTEXT *vertical[2]; +#if !CONFIG_REALTIME_ONLY + PICK_MODE_CONTEXT *horizontala[3]; + PICK_MODE_CONTEXT *horizontalb[3]; + PICK_MODE_CONTEXT *verticala[3]; + PICK_MODE_CONTEXT *verticalb[3]; + PICK_MODE_CONTEXT *horizontal4[4]; + PICK_MODE_CONTEXT *vertical4[4]; +#endif + struct PC_TREE *split[4]; + int index; +} PC_TREE; + +typedef struct SIMPLE_MOTION_DATA_TREE { + BLOCK_SIZE block_size; + PARTITION_TYPE partitioning; + struct SIMPLE_MOTION_DATA_TREE *split[4]; + + // Simple motion search_features + FULLPEL_MV start_mvs[REF_FRAMES]; + unsigned int sms_none_feat[2]; + unsigned int sms_rect_feat[8]; + int sms_none_valid; + int sms_rect_valid; +} SIMPLE_MOTION_DATA_TREE; + +void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params, + PC_TREE_SHARED_BUFFERS *shared_bufs, + struct aom_internal_error_info *error); +void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs); + +PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize); +void av1_free_pc_tree_recursive(PC_TREE *tree, int num_planes, int keep_best, + int keep_none, + PARTITION_SEARCH_TYPE partition_search_type); + +PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi, + BLOCK_SIZE bsize, + PC_TREE_SHARED_BUFFERS *shared_bufs); +void av1_reset_pmc(PICK_MODE_CONTEXT *ctx); +void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes); +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx); + +static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = { + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128, +}; + +static AOM_INLINE int av1_get_pc_tree_nodes(const int is_sb_size_128, + int stat_generation_stage) { + const int tree_nodes_inc = is_sb_size_128 ? 
1024 : 0; + const int tree_nodes = + stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1); + return tree_nodes; +} + +// Returns 0 on success, -1 on memory allocation failure. +int av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td); +void av1_free_sms_tree(struct ThreadData *td); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_ diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c new file mode 100644 index 0000000000..323e2aed58 --- /dev/null +++ b/third_party/aom/av1/encoder/cost.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include + +#include "av1/encoder/cost.h" +#include "av1/common/entropy.h" + +// round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255. +const uint16_t av1_prob_cost[128] = { + 512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435, + 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361, + 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294, + 289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232, + 228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175, + 171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, + 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, + 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26, + 23, 20, 18, 15, 12, 9, 6, 3, +}; + +void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, + const int *inv_map) { + int i; + aom_cdf_prob prev_cdf = 0; + for (i = 0;; ++i) { + aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf; + p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15; + prev_cdf = AOM_ICDF(cdf[i]); + + if (inv_map) + costs[inv_map[i]] = av1_cost_symbol(p15); + else + costs[i] = av1_cost_symbol(p15); + + // Stop once we reach the end of the CDF + if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break; + } +} diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h new file mode 100644 index 0000000000..be0241a820 --- /dev/null +++ b/third_party/aom/av1/encoder/cost.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_COST_H_ +#define AOM_AV1_ENCODER_COST_H_ + +#include "aom_dsp/prob.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern const uint16_t av1_prob_cost[128]; + +// The factor to scale from cost in bits to cost in av1_prob_cost units. 
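A quick standalone check (not part of the upstream patch) of the formula quoted above for av1_prob_cost[]: the cost of a symbol with probability p15 / 2^15 is -log2(p15 / 2^15) bits, stored in 1/2^AV1_PROB_COST_SHIFT bit units. av1_cost_symbol() below computes the same quantity in fixed point by normalising p15 into the table's [128, 255] range and adding one literal bit per shift; this sketch recomputes it directly in floating point (link with -lm).

#include <math.h>
#include <stdio.h>

#define EX_SHIFT 9 /* AV1_PROB_COST_SHIFT */

static int ex_cost_from_prob(int p15) {
  /* p15 is the 15-bit probability of the coded symbol (CDF_PROB_TOP = 1 << 15). */
  return (int)lround(-log2(p15 / 32768.0) * (1 << EX_SHIFT));
}

int main(void) {
  /* A 50% symbol costs exactly one bit = 512 units; 25% costs two bits. */
  printf("p=1/2 -> %d units\n", ex_cost_from_prob(16384)); /* 512 */
  printf("p=1/4 -> %d units\n", ex_cost_from_prob(8192));  /* 1024 */
  printf("p=1/8 -> %d units\n", ex_cost_from_prob(4096));  /* 1536 */
  return 0;
}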
+#define AV1_PROB_COST_SHIFT 9 + +// Cost of coding an n bit literal, using 128 (i.e. 50%) probability +// for each bit. +#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) + +// Calculate the cost of a symbol with probability p15 / 2^15 +static INLINE int av1_cost_symbol(aom_cdf_prob p15) { + // p15 can be out of range [1, CDF_PROB_TOP - 1]. Clamping it, so that the + // following cost calculation works correctly. Otherwise, if p15 = + // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong. + p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1); + assert(0 < p15 && p15 < CDF_PROB_TOP); + const int shift = CDF_PROB_BITS - 1 - get_msb(p15); + const int prob = get_prob(p15 << shift, CDF_PROB_TOP); + assert(prob >= 128); + return av1_prob_cost[prob - 128] + av1_cost_literal(shift); +} + +void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, + const int *inv_map); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_COST_H_ diff --git a/third_party/aom/av1/encoder/deltaq4_model.c b/third_party/aom/av1/encoder/deltaq4_model.c new file mode 100644 index 0000000000..60a7e6d2cf --- /dev/null +++ b/third_party/aom/av1/encoder/deltaq4_model.c @@ -0,0 +1,7776 @@ +/* Embedded file: model.tflite */ +const int av1_deltaq4_model_fsize = 101032; +const unsigned char av1_deltaq4_model_file[101032] = { + 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x14, 0x00, 0x20, 0x00, 0x1c, + 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, + 0x00, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x7e, 0x01, 0x00, 0xd0, 0x7e, 0x01, 0x00, + 0x24, 0x8a, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, + 0x00, 0x00, 0x00, 0x6a, 0x80, 0xfe, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x1c, 0x00, + 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, + 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb4, 0xff, 0xff, 0xff, 0x14, + 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x65, + 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x04, 0x00, 0x00, 0x00, 0xca, 0x81, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, + 0x00, 0x08, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74, 0x69, + 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00, 0x17, 0x00, + 0x00, 0x00, 0xfc, 0x7d, 0x01, 0x00, 0xf4, 0x7d, 0x01, 0x00, 0xdc, 0x7d, 0x01, + 0x00, 0x84, 0x7d, 0x01, 0x00, 0xf4, 0x7c, 0x01, 0x00, 0xa4, 0x7c, 0x01, 0x00, + 0x74, 0x7c, 0x01, 0x00, 0x5c, 0x7c, 0x01, 0x00, 0x4c, 0x5c, 0x00, 0x00, 0xbc, + 0x5b, 0x00, 0x00, 0x8c, 0x5a, 0x00, 0x00, 0x7c, 0x48, 0x00, 0x00, 0x6c, 0x00, + 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, + 0x00, 0x4c, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, + 0x00, 0x00, 0x00, 0x7e, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x50, 0x77, 0xfe, 0xff, 0x54, 0x77, 0xfe, 0xff, + 0x58, 0x77, 0xfe, 0xff, 0x5c, 0x77, 0xfe, 0xff, 0x60, 0x77, 0xfe, 0xff, 0x64, + 0x77, 0xfe, 0xff, 0x68, 0x77, 0xfe, 0xff, 0x6c, 0x77, 0xfe, 0xff, 0x70, 0x77, + 0xfe, 0xff, 0xbe, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, + 0x00, 0x3e, 0x84, 0xfc, 0x3b, 0xef, 0x95, 0x2f, 0xbd, 0xd3, 0x21, 0x96, 0xbd, + 0x11, 0x9a, 0xc6, 0x3d, 0xd9, 0x7e, 0x0c, 0xbe, 0xcb, 0xd2, 0x8c, 0xbb, 0x60, + 0xf5, 0x92, 0xbd, 0x70, 0xce, 0x9e, 0x3d, 0x26, 0x67, 0xc4, 0x3d, 0x9b, 0x2a, + 0x8b, 0x3b, 0x3b, 0xdd, 0x2a, 0xbd, 0xf9, 0x09, 0x8a, 0xbd, 0x1b, 0xae, 0xd7, + 0x3c, 0xbf, 0x39, 0x87, 0xbd, 0x4c, 0x9e, 0xe2, 0x3d, 0x50, 0x9c, 0xe7, 0xbd, + 0x1e, 0x58, 0x57, 0x3d, 0x38, 0x8c, 0x58, 0xbd, 0x48, 0x9f, 0x4a, 0x3d, 0xcb, + 0x1c, 0x93, 0xbd, 0xeb, 0xb8, 0x5a, 0xbc, 0x63, 0x04, 0x4b, 0xbd, 0x9b, 0x76, + 0xa8, 0x3d, 0x20, 0xb4, 0x69, 0x3d, 0xee, 0xcc, 0xe5, 0x3a, 0x4f, 0x40, 0x02, + 0x3e, 0x21, 0x2e, 0x03, 0x3e, 0x25, 0x77, 0x99, 0xbd, 0xf5, 0xa1, 0xd0, 0x3c, + 0xc5, 0x15, 0xeb, 0x3c, 0x58, 0xb5, 0xb7, 0x3c, 0x80, 0x63, 0x33, 0xbd, 0xc9, + 0x66, 0x63, 0xbd, 0xf6, 0xef, 0xb8, 0xbd, 0xd7, 0xbf, 0x9f, 0x3b, 0x93, 0x68, + 0x35, 0x3d, 0x60, 0xfc, 0xf3, 0xbd, 0xed, 0xd9, 0x35, 0xbd, 0x57, 0xef, 0x8a, + 0x3d, 0x31, 0x97, 0xa4, 0x3d, 0x8e, 0x55, 0xe2, 0x3d, 0x27, 0xa5, 0xe9, 0x3d, + 0x36, 0x26, 0x67, 0xbc, 0xeb, 0xd1, 0x9e, 0xbd, 0xc7, 0xcd, 0x37, 0x3d, 0x31, + 0xfc, 0xce, 0x3d, 0x5e, 0xe3, 0x96, 0xbd, 0xeb, 0x24, 0x4d, 0x3c, 0xe6, 0x00, + 0xe2, 0xbd, 0x9b, 0x00, 0x17, 0xbd, 0xee, 0x9f, 0xc4, 0xbd, 0x6a, 0xcd, 0xba, + 0xbc, 0x2c, 0x2b, 0x97, 0xbd, 0x8a, 0x02, 0x68, 0xbc, 0xc3, 0x46, 0x9f, 0xbd, + 0x85, 0x3d, 0xc2, 0x3d, 0xbc, 0x16, 0x22, 0x3c, 0xf1, 0xca, 0xdf, 0x3d, 0xaf, + 0xef, 0xbc, 0x3c, 0x4c, 0xde, 0xe8, 0xbd, 0x5c, 0x5a, 0xc9, 0xbb, 0x35, 0xe5, + 0xc1, 0x3d, 0x14, 0xc7, 0xba, 0xbc, 0x05, 0xfb, 0x1d, 0x3d, 0x61, 0x23, 0xb7, + 0xbb, 0x17, 0x50, 0xb0, 0xbd, 0x14, 0x5b, 0xf4, 0xbd, 0xb1, 0x4d, 0x40, 0x3d, + 0x7e, 0x3d, 0xd8, 0x3d, 0x35, 0x2e, 0x90, 0x3d, 0x93, 0xcd, 0x0d, 0xbe, 0x8d, + 0x60, 0x70, 0x3d, 0x4a, 0x7c, 0xf2, 0x3c, 0x07, 0x2a, 0x7f, 0x3d, 0x2c, 0xab, + 0xd8, 0x3d, 0xb3, 0x1f, 0x1d, 0xbd, 0x44, 0x69, 0xf7, 0x3c, 0x71, 0xfd, 0x5e, + 0x3c, 0xc8, 0x14, 0x28, 0x3d, 0x71, 0x2e, 0x0c, 0x3b, 0x7f, 0xa3, 0xb5, 0x3d, + 0x55, 0x5c, 0x07, 0x3e, 0x0f, 0xf0, 0x3b, 0x3c, 0xd9, 0xc2, 0xbd, 0xbc, 0x71, + 0xaa, 0xc5, 0xbb, 0xa3, 0x86, 0xc7, 0x3d, 0xcf, 0x37, 0x95, 0xbd, 0x09, 0x63, + 0xc3, 0x3d, 0x0c, 0x01, 0x4e, 0xbd, 0xf1, 0xf9, 0x8d, 0x3d, 0xe2, 0x98, 0x45, + 0x3d, 0x76, 0xbc, 0x3b, 0x3d, 0x2a, 0xa2, 0x47, 0x3d, 0x8c, 0x1d, 0xae, 0xbd, + 0x5f, 0x35, 0x8c, 0xbd, 0x17, 0xeb, 0x05, 0x3d, 0x75, 0x62, 0xdb, 0xbd, 0x37, + 0xf8, 0xea, 0x3d, 0xf8, 0xa6, 0x6c, 0xbd, 0x8a, 0x86, 0x03, 0x3d, 0x67, 0x6c, + 0x8d, 0xbd, 0x58, 0xaf, 0xc5, 0xbd, 0x36, 0x51, 0x14, 0xbe, 0x60, 0xac, 0xe3, + 0x3d, 0x86, 0x4f, 0xf4, 0x3c, 0xf6, 0xa3, 0x29, 0x3d, 0xc3, 0x1d, 0x9a, 0x3c, + 0x44, 0xdc, 0x0e, 0xbc, 0x6b, 0x97, 0x8f, 0x3c, 0xc9, 0x3d, 0x88, 0xbc, 0x74, + 0x90, 0x9d, 0x3d, 0x0f, 0x02, 0xec, 0xbd, 0x12, 0xec, 0xb2, 0x3d, 0x6c, 0x32, + 0x31, 0x3d, 0x0b, 0x84, 0x35, 0x3d, 0xfc, 0xc2, 0x3c, 0x3d, 0x59, 0xdf, 0x16, + 0x3d, 0x8e, 0x29, 0xee, 0x3d, 0x83, 0xc3, 0xb7, 0xbd, 0x66, 0xbd, 0x84, 0xbd, + 0xb7, 0x49, 0x1b, 0x3d, 0x3f, 0xc1, 0x4a, 0x3d, 0x1a, 0x7d, 0xdf, 0x3d, 0xee, + 0x12, 0xb1, 0x3c, 0x29, 0x47, 0xe6, 0xbd, 0xd6, 0x04, 0xd6, 0x3d, 0xc2, 0x31, + 0x6f, 0xbd, 0xb0, 0x2c, 0x3e, 0xbd, 0x20, 0xd8, 0x43, 0xbd, 0x2d, 0x0c, 0x26, + 0xbd, 0x23, 0x47, 0x06, 0xbe, 0xb9, 0xd2, 0xb9, 0xbd, 
0x7b, 0xef, 0xc8, 0x3d, + 0x23, 0x06, 0x06, 0x3d, 0x65, 0xc6, 0x45, 0xbd, 0x20, 0xc9, 0x24, 0xbc, 0xf7, + 0x2b, 0xf5, 0x3d, 0x41, 0x91, 0x15, 0xbd, 0x90, 0xbe, 0x0f, 0x3d, 0xe8, 0x94, + 0x8c, 0xbd, 0xdf, 0x96, 0x72, 0x3c, 0x8d, 0xb4, 0xed, 0x3d, 0x33, 0xf0, 0xb3, + 0xbd, 0x60, 0x49, 0xbc, 0xbd, 0x32, 0xf2, 0xd5, 0x3d, 0x3e, 0x3e, 0x6b, 0xbd, + 0xb4, 0x31, 0x09, 0x3e, 0xc6, 0x40, 0xfb, 0xbc, 0x75, 0x1a, 0x88, 0xbd, 0xbf, + 0x13, 0xb2, 0xbd, 0xe3, 0x78, 0xc4, 0xba, 0x68, 0xfc, 0x10, 0x3e, 0x27, 0x4c, + 0xf5, 0x3c, 0xfc, 0x68, 0x27, 0x3d, 0xb2, 0x2c, 0xe0, 0x3c, 0x6e, 0x4f, 0x9a, + 0xbb, 0xbb, 0x9f, 0xa1, 0xbd, 0x91, 0x7b, 0x9a, 0xbc, 0x17, 0x21, 0x52, 0xba, + 0x39, 0x8e, 0x4c, 0xbd, 0x03, 0xf5, 0xe5, 0x3d, 0x3a, 0x22, 0xcd, 0xbd, 0x90, + 0x1c, 0x78, 0xbd, 0x3f, 0xb1, 0x8d, 0xbd, 0xfc, 0x77, 0x25, 0xbe, 0x48, 0x9a, + 0xfd, 0x3c, 0xca, 0x6a, 0xa2, 0x3d, 0x45, 0xd6, 0x7a, 0xbd, 0xce, 0x9d, 0xbf, + 0x3d, 0x94, 0x1c, 0xbe, 0xbd, 0xcc, 0xc4, 0x83, 0xbc, 0xe9, 0xc7, 0xf3, 0xbc, + 0xdc, 0x31, 0x19, 0x39, 0x3a, 0x36, 0xea, 0x3d, 0x40, 0xa6, 0x72, 0xbd, 0x66, + 0xeb, 0x85, 0xb9, 0x68, 0xa0, 0x97, 0xbd, 0xa7, 0xeb, 0xa9, 0x3c, 0x4d, 0x79, + 0xf9, 0x3c, 0x55, 0x67, 0xb2, 0x3c, 0x80, 0x2a, 0x8f, 0xbd, 0xd5, 0x70, 0x17, + 0x3b, 0x41, 0xfb, 0xed, 0xbd, 0xae, 0xfe, 0x0e, 0xbd, 0x6d, 0x06, 0xd6, 0xbc, + 0x90, 0xc9, 0xd1, 0x3d, 0xb4, 0x6c, 0x19, 0x3b, 0xa3, 0x4f, 0x11, 0x3c, 0xb1, + 0x71, 0xc1, 0xbd, 0xcc, 0x5b, 0x20, 0xbc, 0x7a, 0xb5, 0xe9, 0x3d, 0x6f, 0x8c, + 0x95, 0x3d, 0x10, 0x56, 0x79, 0xbd, 0x45, 0x06, 0x69, 0x3c, 0xe4, 0x89, 0x9f, + 0xbd, 0xad, 0x43, 0x82, 0xbd, 0x7a, 0x1f, 0xbd, 0xbd, 0xbb, 0x25, 0x9b, 0x3c, + 0x27, 0xdc, 0x0f, 0xbe, 0x42, 0x7b, 0xe1, 0x3d, 0xaa, 0xd9, 0xcb, 0xbd, 0xa4, + 0xdf, 0x0e, 0x3e, 0xdd, 0x57, 0xbe, 0xbd, 0xf0, 0xb7, 0x87, 0xbd, 0xbb, 0x8a, + 0x73, 0xbd, 0x20, 0x8b, 0xb5, 0x3c, 0xb3, 0xac, 0x57, 0xbd, 0x4a, 0x5c, 0x68, + 0x3d, 0x46, 0xc5, 0x6e, 0x3b, 0x44, 0xd8, 0x22, 0xbd, 0xc8, 0x88, 0x93, 0xbd, + 0x71, 0x42, 0xd3, 0xbc, 0x80, 0x60, 0xf6, 0xbc, 0xe0, 0xb7, 0x04, 0x3d, 0xcb, + 0x28, 0xf7, 0xbd, 0xfd, 0x2e, 0x9d, 0xbd, 0xd8, 0x81, 0x5b, 0x3d, 0x90, 0x88, + 0x06, 0xbd, 0xb1, 0x2d, 0x8b, 0xbc, 0x74, 0x4d, 0x80, 0xbd, 0x1b, 0xce, 0x54, + 0x3d, 0xd3, 0xea, 0x89, 0xbd, 0x7a, 0x0a, 0xc6, 0x3c, 0x8b, 0x33, 0xa2, 0x3d, + 0x68, 0xe5, 0x8b, 0x3d, 0xcf, 0x19, 0x63, 0xbd, 0x50, 0x05, 0xc1, 0xbd, 0x2b, + 0x1f, 0xc4, 0xbc, 0x9f, 0xed, 0xaf, 0xbd, 0xc6, 0x72, 0x07, 0xbb, 0xc1, 0x58, + 0xa2, 0x3d, 0xf6, 0x27, 0x43, 0xbc, 0xa1, 0x5b, 0x36, 0x3d, 0x6b, 0x6b, 0x20, + 0x3d, 0x03, 0xb0, 0xfb, 0xbd, 0xf9, 0xf7, 0x9b, 0xbd, 0x9a, 0xbf, 0x92, 0x3d, + 0xa2, 0x0c, 0x5c, 0x3d, 0xd2, 0xc2, 0x73, 0xbd, 0x5c, 0xd3, 0xac, 0x3d, 0x9f, + 0x28, 0xa6, 0x3d, 0x23, 0xf4, 0x46, 0xbd, 0xf5, 0xfe, 0x6b, 0x3d, 0x2d, 0x03, + 0x56, 0x3d, 0x0c, 0x21, 0xe8, 0x3c, 0x6f, 0xdb, 0xe5, 0xbd, 0xd4, 0x8c, 0xe3, + 0xbd, 0xdf, 0x9d, 0x62, 0x3d, 0x38, 0xa0, 0xd1, 0xbd, 0x67, 0x9e, 0x8d, 0xbc, + 0xab, 0x78, 0x46, 0x3d, 0xf8, 0x88, 0x8e, 0xbc, 0x5a, 0x87, 0xd3, 0xbd, 0x40, + 0xba, 0xab, 0xbd, 0x45, 0xf8, 0x9a, 0x3d, 0x77, 0x60, 0x49, 0xbd, 0xa5, 0x29, + 0x98, 0xbc, 0xf9, 0xa7, 0x6b, 0x3d, 0xf8, 0x57, 0x1b, 0x3e, 0xf9, 0x7f, 0xcb, + 0x3d, 0xc8, 0x38, 0x3f, 0xbb, 0x0e, 0x77, 0xd9, 0x3d, 0xa9, 0x8f, 0xca, 0x3d, + 0x78, 0xbc, 0x92, 0x3d, 0xde, 0xe4, 0x31, 0xbc, 0x7f, 0x35, 0xec, 0x3d, 0x0b, + 0x98, 0x5c, 0x3d, 0x3a, 0x86, 0xa0, 0x3d, 0x9d, 0xb7, 0xad, 0xbd, 0x42, 0x3c, + 0xc2, 0xbc, 0x26, 0x4b, 0x7b, 0x3d, 0xbe, 0x8b, 0x0a, 0xb9, 0x28, 0x3e, 0xc5, + 0x3d, 0xef, 0xac, 0xbb, 0xbd, 0xb3, 0xcc, 0x69, 0xbd, 0xb9, 0xff, 0x07, 0x3d, + 0x30, 
0xf6, 0x26, 0x3d, 0xa9, 0x18, 0xe6, 0x3d, 0x85, 0x72, 0xdb, 0xbd, 0xda, + 0x6e, 0xa1, 0x3d, 0x3b, 0x16, 0xf7, 0x3c, 0xb1, 0x3d, 0x96, 0xbd, 0xd9, 0x88, + 0xeb, 0x3b, 0x52, 0x76, 0x9a, 0xbd, 0xb9, 0x81, 0x1a, 0xbd, 0x81, 0x94, 0x96, + 0xbc, 0xd4, 0x4b, 0xe8, 0x3d, 0x0f, 0x6c, 0xe4, 0xbc, 0xc0, 0xbd, 0xab, 0x3c, + 0x1b, 0xdd, 0x76, 0x3c, 0x98, 0x18, 0xae, 0xbd, 0xfb, 0x1a, 0x6f, 0xbd, 0x72, + 0x50, 0x83, 0xbd, 0x46, 0x0b, 0x12, 0xbc, 0x64, 0x93, 0xf2, 0x3d, 0x1f, 0xad, + 0x71, 0x3b, 0xcf, 0x26, 0x77, 0xbd, 0x8b, 0x31, 0x2d, 0xbd, 0x0d, 0xb7, 0x54, + 0x3b, 0x5b, 0x00, 0xc4, 0x3d, 0x57, 0x4c, 0x58, 0x3d, 0x11, 0x4c, 0x15, 0x3d, + 0x1a, 0xfc, 0xa2, 0xbc, 0xf2, 0xed, 0xea, 0x3d, 0x9e, 0xad, 0xf7, 0xbd, 0x47, + 0x8d, 0x41, 0x3d, 0xce, 0xc5, 0x96, 0xbb, 0x2a, 0x72, 0xa0, 0xbd, 0x93, 0x27, + 0x9a, 0xbd, 0x3f, 0xcb, 0xef, 0xbb, 0xb5, 0xa5, 0x1e, 0x3d, 0xd6, 0x2a, 0xfd, + 0xbc, 0xf5, 0xe0, 0xd4, 0xbc, 0xa1, 0x7d, 0x9d, 0x3d, 0xbb, 0x60, 0x22, 0xbd, + 0x32, 0x15, 0x16, 0x3e, 0x80, 0x77, 0xb7, 0xbc, 0xba, 0x1c, 0xa4, 0xbd, 0x45, + 0xb7, 0x0b, 0xbd, 0x6a, 0x33, 0x9a, 0x3d, 0xfc, 0x27, 0xab, 0xbc, 0x10, 0xcd, + 0x2c, 0x3e, 0xb3, 0xf1, 0xa5, 0x3d, 0x03, 0xf7, 0xa3, 0x3c, 0x25, 0x0c, 0xe1, + 0x3c, 0xc4, 0x82, 0xaa, 0xbd, 0x3a, 0x4a, 0x15, 0x3c, 0x5c, 0x56, 0x9e, 0x3d, + 0x96, 0x52, 0xee, 0x3d, 0x67, 0xf7, 0x96, 0x3d, 0x3e, 0xb0, 0xd6, 0xbd, 0x6e, + 0xbd, 0x8e, 0xbd, 0x16, 0xb3, 0x85, 0x3d, 0x84, 0xca, 0x6e, 0xbd, 0x0f, 0xfc, + 0x40, 0x3d, 0x2d, 0xe0, 0xdc, 0x3d, 0xc1, 0xa1, 0xde, 0x39, 0x30, 0x79, 0xe7, + 0x3d, 0x0a, 0xab, 0xba, 0x3d, 0x35, 0x57, 0xc7, 0xbd, 0x7e, 0x38, 0xa1, 0x3d, + 0xe3, 0x25, 0x60, 0x3d, 0x47, 0xbd, 0x56, 0x3d, 0x62, 0xcf, 0xf6, 0x3d, 0xad, + 0x06, 0xd5, 0xbd, 0x41, 0xda, 0xe8, 0x3a, 0x81, 0xcb, 0xbb, 0x3d, 0xce, 0x38, + 0x4c, 0xbc, 0x17, 0xc0, 0x88, 0xbd, 0x12, 0x25, 0xd7, 0xbd, 0x3b, 0xf5, 0x9b, + 0xbd, 0x4e, 0xa0, 0xb1, 0xbc, 0xa1, 0x8c, 0x9c, 0x3d, 0xc5, 0x2f, 0xb3, 0x3d, + 0xe0, 0xc2, 0x08, 0x3e, 0x0b, 0xcc, 0x2f, 0x3d, 0x87, 0x3f, 0x1d, 0x3e, 0x76, + 0xcd, 0xc3, 0xbd, 0x4f, 0x1d, 0xd4, 0xbd, 0x65, 0x6f, 0x00, 0x3e, 0x95, 0x4f, + 0x9a, 0x3d, 0xa2, 0x66, 0x28, 0xbd, 0xaf, 0x81, 0x90, 0x3d, 0x16, 0x50, 0xde, + 0x3b, 0x65, 0xec, 0xe3, 0xbd, 0x47, 0x6c, 0x34, 0xbc, 0xae, 0xe8, 0xe5, 0xbd, + 0x5b, 0x7c, 0xa6, 0xbb, 0x1d, 0x4d, 0x8d, 0xbc, 0xb1, 0x7a, 0x1d, 0x3e, 0xbf, + 0x37, 0xe6, 0xbc, 0x7b, 0x0c, 0x70, 0x3d, 0x09, 0x57, 0xe2, 0x3d, 0x10, 0x4a, + 0x35, 0xbc, 0x5d, 0x58, 0xf5, 0xbc, 0xb9, 0x89, 0xa1, 0x3d, 0x6a, 0xb2, 0x68, + 0xbd, 0xf4, 0xf6, 0x03, 0x3e, 0xf1, 0xc6, 0x3a, 0xbd, 0xf5, 0x3b, 0xe2, 0x3d, + 0x3a, 0xd2, 0x4a, 0x3d, 0xe7, 0xb8, 0x9e, 0xbd, 0x18, 0xe7, 0xd9, 0x3c, 0x1d, + 0x95, 0x8e, 0x3d, 0xde, 0x6f, 0x9e, 0xbc, 0xae, 0x7d, 0x0f, 0x3e, 0xb0, 0xf3, + 0x04, 0x3d, 0xe0, 0xdc, 0x6b, 0x3d, 0x02, 0x2c, 0xee, 0xbd, 0x7c, 0xb2, 0x9f, + 0xbd, 0xae, 0x94, 0xc3, 0x3c, 0x82, 0xba, 0xab, 0x3d, 0x07, 0x80, 0xde, 0x3c, + 0x75, 0xec, 0xb3, 0xbd, 0x34, 0x42, 0x74, 0xbd, 0x44, 0xce, 0x7a, 0x3d, 0x21, + 0xac, 0x28, 0xbe, 0xb1, 0xbb, 0x14, 0xbd, 0xe2, 0xe1, 0xdb, 0x3c, 0x41, 0x82, + 0xc7, 0x3d, 0x3e, 0x0f, 0x9c, 0xbd, 0x92, 0x4e, 0x97, 0x3d, 0x69, 0x45, 0xf2, + 0x3d, 0xc3, 0x86, 0xc4, 0xbb, 0x57, 0x0f, 0xb1, 0x3d, 0x8c, 0xa7, 0xc6, 0x3d, + 0x27, 0xe2, 0xf3, 0xbc, 0xdd, 0x31, 0x44, 0xbd, 0x94, 0x2c, 0x29, 0xbc, 0xe6, + 0xeb, 0xd1, 0xbd, 0x74, 0xf9, 0x02, 0x3d, 0x43, 0x51, 0x92, 0xbd, 0x38, 0xb8, + 0x72, 0x3d, 0x73, 0xd3, 0x89, 0xbc, 0x06, 0x13, 0xdb, 0x3d, 0x75, 0xc5, 0xb2, + 0x3b, 0x9a, 0xe9, 0x95, 0xbc, 0xd2, 0x6a, 0x05, 0x3e, 0x65, 0xc5, 0xa3, 0x3d, + 0x59, 0x09, 0x72, 0xbd, 0x93, 0x0e, 
0x85, 0xbc, 0x0d, 0x55, 0x6b, 0xbd, 0x55, + 0x64, 0x16, 0xbd, 0x50, 0x04, 0x9f, 0x3d, 0x93, 0x37, 0x14, 0xbd, 0xe9, 0x24, + 0x58, 0x3d, 0x04, 0x8e, 0xe9, 0xbd, 0xe4, 0x6e, 0x2b, 0xbd, 0x43, 0xbc, 0xba, + 0xbd, 0x80, 0xa1, 0xc3, 0xbd, 0x32, 0x81, 0xf5, 0xbd, 0x94, 0x5a, 0x10, 0x3d, + 0xfb, 0x5d, 0x27, 0x3c, 0xd7, 0x26, 0xc5, 0x3d, 0xf5, 0xc3, 0x4b, 0x3d, 0x32, + 0xca, 0xdc, 0x3d, 0xb2, 0xe8, 0x35, 0xbc, 0xb2, 0x47, 0xb9, 0xbd, 0xfa, 0x59, + 0x29, 0xbe, 0xab, 0x6f, 0x0a, 0x3e, 0x81, 0xa5, 0x10, 0xbd, 0x73, 0x96, 0x99, + 0xbd, 0x39, 0x77, 0x23, 0xbc, 0xa8, 0x50, 0xf8, 0xbd, 0x4c, 0x1d, 0xdd, 0xbd, + 0xf8, 0xf5, 0xb9, 0xbd, 0x65, 0x4e, 0x12, 0x3e, 0xc0, 0xa1, 0x7a, 0xbd, 0x16, + 0x33, 0x27, 0x3d, 0xc4, 0xc6, 0x31, 0x3b, 0x0e, 0xcd, 0x48, 0xbd, 0xd2, 0x7f, + 0xb4, 0xbd, 0x2c, 0x3a, 0x8b, 0x3c, 0x6f, 0x43, 0x59, 0x3d, 0x4e, 0x8a, 0x52, + 0x3d, 0x91, 0x68, 0xc4, 0x3d, 0xa2, 0x78, 0x16, 0xbd, 0xe5, 0x2c, 0x60, 0x3d, + 0x7f, 0x73, 0x8f, 0x3d, 0x9f, 0x70, 0x09, 0xbe, 0xf2, 0xf2, 0x05, 0x3c, 0x1e, + 0x58, 0x98, 0x3d, 0xec, 0xfc, 0x03, 0x3e, 0x88, 0xbf, 0x56, 0xbd, 0x2b, 0xc8, + 0x99, 0xbd, 0x9e, 0x13, 0x9a, 0xbc, 0x4f, 0x72, 0xca, 0xbd, 0x79, 0x6e, 0xef, + 0x3d, 0x87, 0xc3, 0x80, 0xbc, 0xe7, 0xef, 0x05, 0x3d, 0xc7, 0x99, 0x0a, 0x3d, + 0x17, 0x7c, 0x56, 0x3d, 0x01, 0xab, 0xd3, 0xbd, 0x48, 0x8b, 0xa2, 0xbd, 0x06, + 0xad, 0xcc, 0xbc, 0xf0, 0xf5, 0x6d, 0xbd, 0x6a, 0x67, 0x0c, 0xbe, 0x7e, 0x2e, + 0x6e, 0x3d, 0x53, 0x50, 0x29, 0xbd, 0x8c, 0x40, 0xb3, 0x3d, 0x5c, 0x9a, 0x0f, + 0xbd, 0xe9, 0x4e, 0x0a, 0x3e, 0x4d, 0x05, 0xac, 0x3d, 0xf9, 0x1a, 0x8e, 0x3d, + 0x0d, 0x69, 0xa6, 0xbd, 0x88, 0x94, 0x60, 0x3d, 0x48, 0x2a, 0x8a, 0xbb, 0x5a, + 0x5d, 0x39, 0x3d, 0x88, 0x56, 0xc8, 0x3c, 0xb8, 0x91, 0x93, 0x3a, 0x64, 0x69, + 0x8b, 0x3d, 0x4b, 0x48, 0x43, 0xbd, 0xb8, 0x91, 0xa7, 0xbd, 0x92, 0x96, 0xe5, + 0x3d, 0x4c, 0x62, 0xd6, 0x3d, 0xa6, 0x7a, 0x88, 0xbd, 0x6c, 0xdb, 0xc6, 0x3d, + 0x1c, 0x4d, 0xab, 0x3d, 0xe0, 0x1d, 0x57, 0x3c, 0x2a, 0xa3, 0x0c, 0x3d, 0xac, + 0xff, 0xe8, 0xbb, 0x12, 0x86, 0x89, 0xbd, 0xc6, 0x68, 0xd3, 0xbd, 0xe7, 0xb0, + 0xa6, 0xbc, 0x3c, 0xd2, 0xfa, 0xbb, 0xf2, 0xd6, 0xda, 0xbd, 0x80, 0x95, 0xc5, + 0xbd, 0x0a, 0x19, 0x93, 0xbd, 0x94, 0xc1, 0xe4, 0xbd, 0xdd, 0x20, 0x18, 0x3e, + 0xb3, 0x48, 0xba, 0xbd, 0xdd, 0x6b, 0x86, 0xbd, 0x3d, 0xbc, 0xb1, 0xbd, 0xbe, + 0xc1, 0x7f, 0xbc, 0xfc, 0x54, 0x83, 0x3d, 0xb5, 0x4e, 0x1e, 0xbd, 0x5f, 0x54, + 0xc3, 0x3c, 0xe4, 0x2e, 0x0a, 0x3e, 0xc9, 0x05, 0x05, 0x3d, 0xc7, 0x8d, 0x2c, + 0xbc, 0x37, 0x21, 0xc2, 0xbc, 0xea, 0x7e, 0x96, 0x3d, 0x64, 0x7a, 0xca, 0x3d, + 0xcb, 0xcf, 0xc8, 0x3b, 0x5a, 0xd4, 0x00, 0xbe, 0x5f, 0x49, 0xd0, 0x3d, 0xbe, + 0x56, 0x15, 0x3e, 0x3f, 0x1d, 0x9e, 0xbd, 0xd4, 0x91, 0xa9, 0x3d, 0xf1, 0xea, + 0x4b, 0xbb, 0x78, 0x4a, 0xa5, 0x3c, 0xc2, 0x9b, 0xac, 0xbd, 0x8c, 0xd3, 0x94, + 0xbd, 0xb1, 0x52, 0x94, 0xbd, 0x55, 0xdd, 0x0d, 0xbe, 0x93, 0x2e, 0xa1, 0x3d, + 0x31, 0x1e, 0xe0, 0x3c, 0xaf, 0xba, 0x6c, 0x3d, 0x8e, 0xec, 0x8f, 0xbd, 0x38, + 0x79, 0xd2, 0xbc, 0x21, 0x7e, 0x9d, 0x3d, 0xbb, 0x21, 0xeb, 0x3d, 0x6e, 0x68, + 0xec, 0x3d, 0xc2, 0xf4, 0xb6, 0xbd, 0x80, 0xe2, 0x91, 0xbc, 0x45, 0xa5, 0x8f, + 0xbb, 0xf8, 0xb2, 0xc7, 0xbd, 0xe4, 0x47, 0x3a, 0xbd, 0xa2, 0x4f, 0xe9, 0xbd, + 0xcc, 0x37, 0x53, 0x3c, 0x51, 0x03, 0x4f, 0x3d, 0x35, 0xa2, 0xfa, 0x3d, 0xea, + 0x64, 0x7b, 0xbc, 0xbf, 0x49, 0xfb, 0x3d, 0x3d, 0x8e, 0x7b, 0x3b, 0x9c, 0x4b, + 0x35, 0xbd, 0x62, 0xf1, 0x10, 0xbe, 0xac, 0xd2, 0xd8, 0xbd, 0x80, 0x00, 0x9d, + 0x3d, 0xcc, 0x19, 0xaf, 0xbc, 0x97, 0x73, 0xdb, 0x3d, 0x6d, 0xb6, 0xf3, 0x3d, + 0x19, 0xe7, 0x7a, 0xbd, 0xcf, 0xba, 0xc6, 0x3c, 0x77, 0xfc, 0x23, 
0x3d, 0xd6, + 0xfe, 0x3f, 0x3d, 0x73, 0xf2, 0xdb, 0xbd, 0x3d, 0x21, 0x95, 0xbb, 0x58, 0xb8, + 0x86, 0xbd, 0x01, 0x3c, 0x6f, 0x3d, 0xaf, 0x2e, 0x3e, 0xbd, 0x7b, 0x6d, 0x73, + 0xbd, 0x33, 0xe2, 0x5f, 0xbc, 0x64, 0x5f, 0xdb, 0xbd, 0x31, 0xf5, 0xb6, 0xbd, + 0xfc, 0x90, 0xd4, 0xbd, 0x25, 0xd8, 0xc4, 0xbd, 0x38, 0xdf, 0xb9, 0x3d, 0x89, + 0x14, 0x8b, 0x3d, 0x8d, 0x05, 0x2c, 0xbd, 0x20, 0xb8, 0xa3, 0xbc, 0xaf, 0x68, + 0x12, 0x3d, 0xce, 0x53, 0xb0, 0xbd, 0xca, 0x8a, 0x95, 0x3d, 0x11, 0x84, 0x8a, + 0x3d, 0x6d, 0xbd, 0x67, 0xbb, 0xe8, 0xd5, 0x76, 0xbc, 0xac, 0xc8, 0xfb, 0xbd, + 0xa9, 0x8b, 0xa4, 0xbb, 0x3e, 0x3a, 0xba, 0x3d, 0xe2, 0xa5, 0x50, 0x3d, 0xf0, + 0x4d, 0x81, 0x3b, 0x96, 0x79, 0x31, 0xbd, 0x87, 0xaf, 0xe5, 0x3a, 0x27, 0xb7, + 0xa5, 0x3d, 0xd4, 0x71, 0xb5, 0xbd, 0x95, 0x06, 0xd1, 0xbd, 0x82, 0x3d, 0x1c, + 0xbc, 0xdc, 0xe4, 0x6e, 0x3d, 0x21, 0xcf, 0x80, 0xbc, 0xbe, 0xc7, 0xb7, 0xbc, + 0x21, 0x87, 0x3c, 0x3d, 0x11, 0x3a, 0x67, 0xbd, 0xa5, 0xd3, 0xe8, 0xbd, 0x9a, + 0xb7, 0xc2, 0x3d, 0x2e, 0xa7, 0x86, 0xbc, 0xbe, 0x03, 0x26, 0xbc, 0x5e, 0x12, + 0x08, 0xbe, 0x1d, 0xd9, 0xf8, 0xbd, 0xf3, 0x79, 0xe4, 0xbd, 0x38, 0xaa, 0x04, + 0x3e, 0x98, 0x40, 0xa7, 0x3d, 0xfa, 0xd9, 0xce, 0xbd, 0x08, 0x73, 0x16, 0xb9, + 0xd6, 0x47, 0x2c, 0x3d, 0x08, 0xb5, 0x8b, 0xbd, 0x04, 0x66, 0x70, 0x3c, 0x9f, + 0xe6, 0xe4, 0xbd, 0x7f, 0xcd, 0xa5, 0x3b, 0x5b, 0x92, 0x8b, 0xbd, 0x29, 0x55, + 0x19, 0xbd, 0x79, 0x98, 0x26, 0x3d, 0x32, 0x3d, 0xc3, 0xb9, 0x29, 0x8a, 0x05, + 0xbe, 0xe8, 0x61, 0x92, 0x3d, 0x4f, 0x64, 0xa9, 0x3d, 0x00, 0x9a, 0xa0, 0xbd, + 0x34, 0xcc, 0xd8, 0x3c, 0xcd, 0x8a, 0xaf, 0x3d, 0x69, 0xc6, 0x5c, 0x3c, 0xe0, + 0x76, 0xd3, 0x3d, 0x49, 0x6a, 0x79, 0x3b, 0x33, 0x10, 0xbd, 0x3c, 0xe9, 0x47, + 0x2a, 0xbd, 0x7f, 0xb4, 0x3e, 0xbb, 0x80, 0xd2, 0x18, 0xbe, 0xf3, 0x5c, 0x90, + 0xbd, 0x0b, 0x88, 0xaf, 0xbd, 0x24, 0x0c, 0x94, 0xbd, 0xfd, 0xa9, 0xa1, 0xbd, + 0x40, 0xc9, 0x82, 0xbd, 0x24, 0x56, 0xa0, 0x3c, 0xa0, 0x3e, 0x09, 0x3e, 0x30, + 0x93, 0xc7, 0x3d, 0x03, 0xa3, 0x0c, 0x3c, 0x88, 0xdc, 0x96, 0x3d, 0xac, 0x34, + 0xc7, 0xbd, 0x64, 0xb0, 0xe5, 0x3d, 0x61, 0x56, 0xc8, 0x3d, 0x08, 0x55, 0x99, + 0x3d, 0xb5, 0xa9, 0x56, 0xbd, 0xfb, 0x4f, 0x95, 0xbd, 0xe9, 0xeb, 0x55, 0x3d, + 0xbf, 0x4c, 0xdf, 0xbd, 0xbf, 0x4a, 0x12, 0xbb, 0x93, 0x9d, 0x65, 0xbd, 0x26, + 0xd0, 0xce, 0x3d, 0x89, 0x19, 0x64, 0xbd, 0x91, 0x3d, 0x3f, 0x3d, 0x23, 0x3a, + 0x3b, 0xbd, 0xc8, 0x9d, 0x20, 0xbc, 0xa1, 0x2c, 0xff, 0xbb, 0x8c, 0x39, 0xb2, + 0x3b, 0xf3, 0xbe, 0x86, 0x3d, 0xa3, 0xfa, 0xcc, 0xbd, 0x3d, 0x3c, 0x07, 0xbe, + 0xd4, 0xb4, 0xa7, 0xbd, 0x94, 0xfc, 0x71, 0x3d, 0x8b, 0xe6, 0x2e, 0x3d, 0x94, + 0x30, 0x41, 0xbd, 0xb3, 0x63, 0x18, 0x3d, 0xbf, 0x35, 0x3c, 0xbb, 0x4c, 0xaa, + 0xd9, 0xbd, 0x20, 0x83, 0xa1, 0x3d, 0xdb, 0xca, 0x49, 0x3c, 0x1d, 0xbb, 0xac, + 0xbb, 0x3c, 0xea, 0x1c, 0xbc, 0x5b, 0xc3, 0xd1, 0x3d, 0x15, 0xd3, 0xc9, 0xbd, + 0xb9, 0x30, 0x12, 0xbb, 0xe3, 0x34, 0xde, 0xbd, 0xa0, 0x31, 0xeb, 0xbd, 0xc2, + 0x64, 0xe2, 0x3d, 0xb2, 0xfd, 0xf4, 0xbd, 0x45, 0xa5, 0xbe, 0x3c, 0xa1, 0x40, + 0x56, 0xbd, 0x52, 0x01, 0xed, 0x3d, 0xd0, 0x6b, 0xfc, 0xbd, 0xef, 0x73, 0xb2, + 0xbd, 0x03, 0xa0, 0xcd, 0xbd, 0x24, 0x69, 0xbe, 0x3c, 0x76, 0xcd, 0x9e, 0x3d, + 0xbe, 0xcb, 0x3b, 0x3d, 0x55, 0x49, 0x4e, 0xbd, 0x99, 0xe9, 0xd5, 0xbc, 0x9c, + 0x73, 0x88, 0x3c, 0x9a, 0x64, 0x75, 0xbd, 0x53, 0x89, 0xb2, 0xbd, 0x73, 0xa4, + 0xb9, 0x3d, 0xa8, 0x68, 0xf3, 0xbd, 0x2a, 0xf3, 0x89, 0xbd, 0x8d, 0x63, 0x85, + 0x3c, 0xbb, 0x72, 0x63, 0x3d, 0x29, 0x8a, 0xe8, 0xbd, 0x87, 0x03, 0xab, 0x3d, + 0xbf, 0x88, 0x44, 0xbd, 0x74, 0x28, 0xae, 0xbd, 0xf7, 0xe8, 0x87, 0xbd, 0x16, + 0x46, 0x04, 0xbd, 
0x87, 0xf6, 0xcf, 0xbd, 0x8b, 0x67, 0x44, 0xbd, 0xac, 0xd4, + 0xa5, 0xbd, 0xed, 0x0b, 0xf2, 0xbd, 0x20, 0x9e, 0xf5, 0xbd, 0xc1, 0xbd, 0x70, + 0x3d, 0xae, 0xfe, 0x77, 0x3d, 0x27, 0x07, 0x82, 0xbd, 0xbe, 0x56, 0x19, 0xbd, + 0xae, 0x94, 0xc9, 0xbd, 0x7a, 0x52, 0xc6, 0xbd, 0x4e, 0x64, 0x4d, 0x3c, 0xf7, + 0xe4, 0x18, 0x3d, 0xef, 0x06, 0xa4, 0xbd, 0x8c, 0xad, 0xa8, 0xbd, 0xab, 0xcc, + 0x62, 0xbc, 0x4a, 0x7c, 0x09, 0xba, 0x01, 0x0d, 0x2b, 0xbd, 0x3d, 0x77, 0xb6, + 0x3b, 0xd3, 0x48, 0xc8, 0x3d, 0x89, 0xcf, 0x05, 0x3e, 0xdb, 0x48, 0x92, 0x3d, + 0x1e, 0xa5, 0xc9, 0x3c, 0xc7, 0xad, 0x74, 0x3d, 0x66, 0x26, 0x4e, 0xbd, 0x8f, + 0x4c, 0x85, 0x3d, 0xe2, 0x14, 0xe3, 0x3d, 0xad, 0x90, 0x2b, 0xbd, 0xcd, 0x7c, + 0xf4, 0x3d, 0xe6, 0xae, 0x98, 0x3c, 0xa6, 0x86, 0x66, 0x3c, 0x18, 0x11, 0x1f, + 0xbc, 0xb8, 0xe5, 0xa3, 0xbc, 0xea, 0xd7, 0x47, 0xbd, 0x39, 0x8a, 0xbb, 0x3d, + 0x1c, 0x27, 0x4c, 0xba, 0x50, 0x9a, 0x4b, 0xbd, 0xda, 0x55, 0x5c, 0xbd, 0xa7, + 0xd6, 0xb4, 0x3d, 0x40, 0x3f, 0xa0, 0xbd, 0x26, 0xa7, 0xba, 0xbd, 0x4c, 0xc0, + 0x5c, 0x3d, 0x5c, 0xe1, 0x96, 0x3d, 0x50, 0xd9, 0x36, 0xbb, 0x8b, 0xf8, 0x7e, + 0xbb, 0xb4, 0x9c, 0xf0, 0x3d, 0x88, 0xf4, 0xa8, 0xbd, 0x92, 0x72, 0x0e, 0xbd, + 0x18, 0xc1, 0xa0, 0x3c, 0x78, 0x3f, 0xc6, 0xbd, 0xfa, 0xec, 0xe8, 0xbd, 0xa4, + 0xbc, 0x3d, 0xbd, 0x47, 0x9d, 0xc6, 0xbc, 0x8e, 0x10, 0x4b, 0x3d, 0x18, 0x89, + 0x51, 0xbd, 0x26, 0xd5, 0x9b, 0xbd, 0xb9, 0xbb, 0x0a, 0xbe, 0xa7, 0x0f, 0x8f, + 0x3d, 0x62, 0x63, 0x4b, 0xbb, 0xfe, 0x46, 0x56, 0xbd, 0x64, 0xcc, 0xbb, 0x3d, + 0x85, 0x17, 0x52, 0x3d, 0x08, 0xa8, 0x0e, 0x3d, 0x75, 0xdc, 0x4c, 0xbd, 0xf9, + 0xc3, 0x92, 0x3d, 0xe0, 0x13, 0x84, 0x3d, 0xa1, 0x30, 0xe8, 0xbd, 0x2d, 0x2b, + 0xd0, 0xbd, 0x68, 0x62, 0x91, 0xbc, 0x32, 0xd7, 0xd3, 0xbb, 0xac, 0xd6, 0xdb, + 0x3d, 0x0d, 0x70, 0xe9, 0xbd, 0xed, 0xea, 0x69, 0x3d, 0xa4, 0xa3, 0x99, 0x3d, + 0x60, 0xa0, 0xcd, 0xbd, 0xd8, 0x9b, 0x20, 0x3c, 0x29, 0x39, 0xaf, 0x3d, 0xd3, + 0x2d, 0x2e, 0x3d, 0x10, 0xd7, 0x60, 0x3d, 0x2b, 0x82, 0xb1, 0xbd, 0x3d, 0x6b, + 0x94, 0xbd, 0x73, 0xa6, 0x24, 0x3d, 0x33, 0x6b, 0xf9, 0xbd, 0x94, 0xe1, 0xac, + 0x3d, 0xdf, 0x2c, 0x77, 0x3d, 0x82, 0x66, 0xa0, 0x3c, 0x9d, 0x7c, 0xd1, 0xbd, + 0x67, 0x66, 0x39, 0x3d, 0x1b, 0xb4, 0x5e, 0x3d, 0x0a, 0x50, 0x7f, 0x3d, 0x1a, + 0x08, 0x6c, 0x3d, 0x6c, 0x55, 0xac, 0xbd, 0x27, 0x4d, 0x04, 0xbc, 0x28, 0x6e, + 0x54, 0x3c, 0x8d, 0x2e, 0x95, 0xbd, 0x56, 0x25, 0xd5, 0x3a, 0x8d, 0xf8, 0xde, + 0xbd, 0x53, 0xd6, 0xe0, 0x3c, 0x09, 0xfc, 0x3f, 0x3d, 0x95, 0x29, 0xbe, 0xba, + 0x9b, 0x98, 0xa6, 0x3d, 0xfd, 0xd1, 0xe1, 0x3d, 0x00, 0x2a, 0x04, 0xbe, 0x06, + 0x73, 0x8b, 0xbd, 0x1e, 0x77, 0xcd, 0x3d, 0xf3, 0x47, 0x01, 0xbe, 0x41, 0x8d, + 0xd2, 0xbc, 0x98, 0xba, 0x02, 0xbe, 0x14, 0x4e, 0x84, 0xbc, 0x7b, 0xee, 0xc1, + 0x3d, 0x5c, 0x1f, 0x5f, 0xbd, 0x66, 0x1e, 0xd4, 0xbd, 0xa7, 0x18, 0x51, 0x3d, + 0xaa, 0xbb, 0x7f, 0x3b, 0x9a, 0x15, 0x33, 0x3d, 0xcd, 0x6b, 0x8d, 0x3d, 0x9c, + 0x73, 0x6d, 0xbd, 0x76, 0x3e, 0x54, 0x3c, 0x3d, 0x4f, 0xe4, 0x3d, 0x89, 0xaf, + 0xf9, 0x3d, 0x0f, 0x5f, 0x8b, 0xbd, 0x5d, 0xcc, 0x9c, 0xbd, 0x8b, 0x08, 0xf1, + 0xbd, 0xe3, 0xc3, 0x04, 0xbd, 0x5f, 0x0b, 0xf8, 0x3d, 0x4f, 0xd8, 0xaf, 0x3d, + 0x2f, 0xff, 0x3e, 0x3d, 0x07, 0xf0, 0x5f, 0xbb, 0xcd, 0x6b, 0xbd, 0xbd, 0x0a, + 0x80, 0xee, 0x3d, 0x58, 0xa2, 0xbd, 0x3c, 0xa6, 0x43, 0xf9, 0xbc, 0x7e, 0x76, + 0xbb, 0x3d, 0x0b, 0x75, 0x11, 0xb9, 0x7c, 0x78, 0x46, 0x3d, 0xe9, 0xf0, 0x73, + 0x3d, 0x6d, 0x01, 0x50, 0xbc, 0x6f, 0x55, 0x80, 0x3d, 0x88, 0x5d, 0xd4, 0xbc, + 0x20, 0x61, 0x94, 0xbd, 0xbd, 0x32, 0xa3, 0x3c, 0x91, 0x29, 0xb3, 0xbd, 0x7a, + 0x60, 0x62, 0xbc, 0xd8, 0x67, 0x99, 0xbb, 0xea, 
0xd6, 0x4a, 0xbd, 0xb2, 0xb3, + 0x14, 0xbd, 0x15, 0x9f, 0xf6, 0x3d, 0xc4, 0x35, 0xbe, 0xbd, 0xc6, 0x0b, 0x63, + 0x3d, 0x43, 0x76, 0x43, 0xbd, 0x4f, 0x5e, 0x18, 0xbc, 0x6b, 0xac, 0xb1, 0x3d, + 0x4e, 0xca, 0xd8, 0xbd, 0x2f, 0xef, 0xc3, 0x3d, 0x96, 0xc3, 0x48, 0x3c, 0x1c, + 0x73, 0x17, 0x3d, 0x56, 0x34, 0xfb, 0x3c, 0x25, 0xa7, 0xb2, 0x3d, 0x29, 0x5e, + 0xac, 0x3d, 0xdd, 0x3b, 0x80, 0x3d, 0x5a, 0xec, 0x37, 0x3c, 0xdc, 0xf9, 0x92, + 0x3b, 0x66, 0x0b, 0xc6, 0xbd, 0x75, 0x09, 0xfc, 0xbc, 0x55, 0xd9, 0xea, 0xbd, + 0x01, 0xed, 0x7a, 0x3c, 0x90, 0x7d, 0x5e, 0xbd, 0xb8, 0x38, 0xc9, 0x3d, 0xb8, + 0x23, 0xa6, 0x3d, 0xb8, 0x83, 0x01, 0x3e, 0xe8, 0x22, 0xda, 0x3c, 0x66, 0xf5, + 0x92, 0x3d, 0x82, 0xe0, 0x87, 0x3c, 0x6f, 0xa1, 0x6e, 0x3d, 0x27, 0xca, 0xaf, + 0x3c, 0x7f, 0x68, 0xd6, 0xbd, 0x38, 0x98, 0x93, 0x3d, 0x4d, 0xdc, 0x5e, 0x3d, + 0xc8, 0xb8, 0xb2, 0x3d, 0xab, 0xeb, 0x8a, 0xbb, 0x39, 0x48, 0xbb, 0xbd, 0x17, + 0xe6, 0x0f, 0x3d, 0x57, 0x79, 0xea, 0xbc, 0xb2, 0x5e, 0xdb, 0x3d, 0x0c, 0x19, + 0xc7, 0xbd, 0xeb, 0x33, 0x2b, 0x3d, 0x4b, 0x15, 0xf6, 0x3d, 0x96, 0x9b, 0xa1, + 0xbc, 0x5c, 0xc8, 0x03, 0xbd, 0x88, 0x56, 0x21, 0x3e, 0x85, 0x0c, 0xa5, 0x3c, + 0x85, 0xcb, 0xf4, 0xbd, 0x61, 0x03, 0x4d, 0x3c, 0xf1, 0xf4, 0x8c, 0xbd, 0x7b, + 0x39, 0x34, 0x3b, 0xf4, 0xa2, 0x47, 0xbc, 0x10, 0x2d, 0xfc, 0xbd, 0xe8, 0xdd, + 0xe6, 0x3c, 0xa5, 0x7c, 0x85, 0x3c, 0x3f, 0xcd, 0xeb, 0xbc, 0x42, 0x94, 0xba, + 0xbd, 0x50, 0x23, 0xe3, 0xbd, 0x92, 0xf6, 0xa7, 0xbd, 0x5c, 0x36, 0xd0, 0xbd, + 0x27, 0x9e, 0x18, 0x3e, 0x33, 0x9a, 0xe8, 0xbc, 0x80, 0x3a, 0x5d, 0x3d, 0xd0, + 0xdc, 0x9c, 0xbd, 0xa3, 0x93, 0x51, 0xbd, 0x36, 0xab, 0x7a, 0x3d, 0x74, 0x9c, + 0x63, 0x3d, 0x1c, 0x19, 0x9b, 0xbd, 0xa6, 0x10, 0xb4, 0xbd, 0xf4, 0x80, 0xb4, + 0xbc, 0xd3, 0x9c, 0xd2, 0xbc, 0x6d, 0x1b, 0x68, 0xbd, 0x31, 0x6a, 0xfd, 0xbd, + 0xdc, 0xa4, 0x82, 0xbd, 0xa7, 0xe7, 0x37, 0xbd, 0x5c, 0xd1, 0x07, 0xbd, 0x4e, + 0x82, 0x15, 0xbc, 0x31, 0x43, 0x16, 0x3e, 0xe2, 0xf3, 0x1e, 0x3e, 0x62, 0x22, + 0x14, 0x3e, 0x27, 0x65, 0x0d, 0x39, 0xaa, 0x9e, 0x8f, 0x3d, 0xdd, 0x59, 0x4c, + 0x3c, 0x4a, 0xc5, 0xc5, 0xbd, 0x4a, 0xa5, 0xc7, 0x3b, 0xb9, 0x73, 0xcc, 0x3d, + 0x10, 0x62, 0x5c, 0x3c, 0x87, 0xd8, 0xb2, 0xbd, 0x15, 0x50, 0xf8, 0x3d, 0xd7, + 0x7f, 0x91, 0xbd, 0xf4, 0x07, 0xfb, 0x3c, 0x93, 0x09, 0xae, 0xbc, 0x54, 0x19, + 0x76, 0x3a, 0x42, 0x4f, 0xbe, 0xbc, 0x6a, 0xef, 0xee, 0x3d, 0x98, 0x97, 0xb7, + 0x3d, 0x33, 0x07, 0x3c, 0xbd, 0xe0, 0xc2, 0x46, 0x3c, 0x33, 0x5f, 0x80, 0x3c, + 0x4d, 0x5e, 0xff, 0xbc, 0x4e, 0x02, 0xe8, 0xbc, 0x1f, 0x5b, 0xcd, 0xbc, 0x2d, + 0x41, 0x8a, 0x3d, 0x2d, 0xeb, 0x5e, 0xbd, 0xff, 0x53, 0xb0, 0x3d, 0x7c, 0x37, + 0xb0, 0x3c, 0x0b, 0xc9, 0x87, 0xbd, 0x32, 0xd1, 0xe6, 0xbb, 0xc0, 0x2f, 0xcf, + 0x3d, 0x42, 0x5e, 0xb5, 0x3d, 0xd4, 0xbf, 0x36, 0xbd, 0x26, 0xd8, 0xf1, 0xbd, + 0xf3, 0x8b, 0xc2, 0x3d, 0x1d, 0xd9, 0xe7, 0xbb, 0xab, 0xf9, 0x16, 0x3d, 0x13, + 0x82, 0x93, 0x3d, 0x5e, 0xab, 0xbc, 0xbd, 0x57, 0xf5, 0x2f, 0x3c, 0x86, 0x19, + 0x96, 0x3c, 0x17, 0xb1, 0x3e, 0x3d, 0xcd, 0xfd, 0x72, 0xbd, 0xae, 0x8d, 0xbf, + 0x3c, 0x5e, 0x94, 0x5c, 0x3d, 0x16, 0x67, 0x88, 0x3d, 0xf1, 0xcb, 0x43, 0xbd, + 0xc5, 0x5e, 0x6b, 0xbd, 0xa0, 0xc2, 0xdb, 0x3d, 0x94, 0x36, 0x11, 0xbd, 0x26, + 0xb6, 0xb2, 0xbd, 0xe6, 0x9d, 0x93, 0xbd, 0x66, 0x04, 0x5e, 0xbd, 0xed, 0xfe, + 0xaf, 0xbb, 0xbc, 0x70, 0x50, 0x3d, 0x0a, 0xeb, 0xd0, 0xbd, 0x3d, 0x06, 0xb5, + 0x3d, 0xa7, 0x77, 0x31, 0xbd, 0x5f, 0x4b, 0xa6, 0xbd, 0x9b, 0x0f, 0x96, 0xbc, + 0x7e, 0x02, 0xd4, 0xbc, 0x39, 0x52, 0xc4, 0xbd, 0xc3, 0x4e, 0x09, 0x3e, 0x5c, + 0xc9, 0x48, 0x3d, 0xa4, 0x28, 0x36, 0xbd, 0xe3, 0xa7, 0x31, 0x3b, 0xdd, 0x29, + 
0xf4, 0x3d, 0x30, 0x52, 0x76, 0x3d, 0x10, 0xa8, 0x27, 0x3c, 0x0c, 0x16, 0x56, + 0x3d, 0x84, 0xd6, 0x1a, 0xbd, 0x34, 0xea, 0xaa, 0x3c, 0x8b, 0xaa, 0x50, 0xbc, + 0x02, 0x56, 0xc2, 0x3c, 0xee, 0x61, 0xe8, 0xbd, 0xf2, 0xaa, 0xb0, 0x3d, 0x22, + 0xd5, 0x23, 0x3e, 0x2d, 0x7d, 0x62, 0xbd, 0x8a, 0x95, 0x6d, 0xbc, 0x6a, 0xaf, + 0xb4, 0xbb, 0x34, 0x65, 0xad, 0x3d, 0x14, 0xff, 0xda, 0xbd, 0x43, 0xdc, 0x04, + 0xbd, 0x26, 0xed, 0xa8, 0xbd, 0x97, 0xc7, 0xc3, 0x3d, 0x76, 0x2d, 0xd3, 0xbc, + 0xe1, 0xc3, 0xbd, 0xbd, 0x75, 0x52, 0xca, 0x3c, 0x84, 0xfa, 0x13, 0x3c, 0x2e, + 0xea, 0x00, 0xbd, 0xb9, 0xbc, 0xcf, 0x3d, 0xcb, 0x67, 0x65, 0xbd, 0xda, 0x95, + 0xac, 0xbd, 0x51, 0x71, 0xed, 0x3c, 0xaf, 0xe1, 0x2c, 0xbd, 0xbf, 0x09, 0x2c, + 0xba, 0xd1, 0xdc, 0xab, 0xbd, 0x60, 0xab, 0x71, 0xbc, 0x10, 0xa2, 0x2b, 0xbd, + 0xb7, 0xba, 0x8f, 0xbd, 0x5e, 0x4b, 0x18, 0x3d, 0x4f, 0x72, 0xa6, 0xbc, 0xbb, + 0x54, 0xc5, 0x3d, 0x2a, 0x54, 0xeb, 0xbd, 0x5b, 0x2e, 0x67, 0xbd, 0xc0, 0xd2, + 0x61, 0x3b, 0x30, 0x8d, 0x34, 0x3d, 0xaa, 0x2e, 0xfe, 0xbc, 0x37, 0xa2, 0x7b, + 0xbd, 0xb0, 0x0d, 0x7c, 0xbd, 0x05, 0x3f, 0x39, 0x3d, 0x52, 0xfc, 0xb2, 0x3d, + 0xe8, 0x4a, 0xe6, 0xbd, 0x49, 0x3f, 0xd0, 0x3c, 0x1d, 0x43, 0x1a, 0xbd, 0x52, + 0xcc, 0xc7, 0x3d, 0x6a, 0x3f, 0x72, 0x3b, 0x47, 0x6e, 0xdb, 0xbd, 0x6b, 0x97, + 0xc2, 0xbd, 0xa0, 0x78, 0xe5, 0xbc, 0x01, 0xb0, 0xd8, 0xbc, 0xd0, 0x9f, 0x9f, + 0xbc, 0x51, 0x99, 0x79, 0x3d, 0xf1, 0xd4, 0x1d, 0x3b, 0xe6, 0x19, 0x78, 0x3c, + 0xb0, 0x8a, 0x8e, 0xbd, 0x90, 0xfc, 0xc9, 0x3d, 0x91, 0xe7, 0x85, 0x3d, 0xdd, + 0xe2, 0x09, 0x3d, 0xb6, 0xf7, 0x5a, 0xbd, 0x26, 0xe8, 0xdc, 0xbd, 0x42, 0xca, + 0x18, 0xbd, 0x2a, 0x1d, 0xb4, 0xbd, 0x83, 0x0b, 0xf1, 0x3a, 0xbd, 0x7b, 0x15, + 0x3c, 0xf1, 0x7b, 0xa6, 0xbd, 0x55, 0xe4, 0x4d, 0xbd, 0xed, 0x07, 0xf8, 0xbc, + 0xf3, 0x73, 0xa0, 0x3d, 0x75, 0x8a, 0xc5, 0xbd, 0x44, 0x2f, 0x7f, 0x3d, 0x35, + 0x6c, 0x87, 0x3c, 0x61, 0x2c, 0x4b, 0xbc, 0x67, 0xde, 0x7d, 0xbd, 0x17, 0xaf, + 0xe9, 0x3c, 0xaa, 0xd5, 0x0c, 0x3d, 0x98, 0xf5, 0xd8, 0xbc, 0x86, 0xa5, 0x2c, + 0xbb, 0xad, 0x8e, 0x43, 0x3d, 0xd2, 0x59, 0xbd, 0xbd, 0x94, 0xc9, 0x69, 0xbd, + 0x15, 0xa0, 0x81, 0x3d, 0x18, 0x49, 0x1e, 0x3d, 0xe7, 0xd7, 0xb5, 0xbd, 0x1f, + 0x20, 0x10, 0xbd, 0xb0, 0x8b, 0xe0, 0xbd, 0xe0, 0x7c, 0x46, 0x3d, 0x1f, 0xc6, + 0x5c, 0xbd, 0xbc, 0xc1, 0x1b, 0x3d, 0xc1, 0x1c, 0xc5, 0xbd, 0xf3, 0x52, 0x48, + 0xbb, 0x39, 0x79, 0x86, 0x3d, 0x72, 0xbd, 0x36, 0x3c, 0xa5, 0xd7, 0x95, 0xbd, + 0x73, 0xe0, 0x13, 0x3c, 0xe4, 0x9a, 0x50, 0xbd, 0x90, 0x58, 0x93, 0xbd, 0x3d, + 0x9e, 0xac, 0x3d, 0x57, 0x08, 0xbb, 0x3d, 0x4e, 0xaf, 0x84, 0xbd, 0xdc, 0x16, + 0xbc, 0xbd, 0x51, 0x1a, 0xbf, 0x3d, 0x62, 0x61, 0x97, 0x3d, 0x7a, 0xeb, 0x45, + 0x3d, 0xa1, 0x27, 0xe7, 0x3d, 0x20, 0xcb, 0x45, 0xbd, 0xc3, 0x36, 0xda, 0x3d, + 0xa2, 0x88, 0x48, 0x3d, 0x7c, 0x0d, 0x0d, 0x3b, 0x00, 0xa8, 0xaf, 0xbd, 0xda, + 0x09, 0x51, 0xbd, 0xbd, 0xb3, 0x99, 0xbc, 0x6e, 0x40, 0x6a, 0xbd, 0x31, 0xdb, + 0x71, 0x3c, 0x14, 0x0e, 0x0b, 0xbd, 0xe8, 0x4f, 0xae, 0xbd, 0xbb, 0xf3, 0xd4, + 0x3d, 0xad, 0xdb, 0x8d, 0x3c, 0x72, 0x12, 0x66, 0xbd, 0x1f, 0xea, 0x98, 0xbd, + 0xf7, 0xd0, 0x68, 0x3d, 0x47, 0x27, 0x13, 0x3d, 0xe9, 0x9d, 0xa2, 0xbd, 0x01, + 0x07, 0xa9, 0x3d, 0x81, 0xa9, 0xa2, 0x3c, 0x54, 0x75, 0xb5, 0xbc, 0xbc, 0x9f, + 0x8e, 0x3c, 0xdd, 0x55, 0x8c, 0x3c, 0xf6, 0x8f, 0xdc, 0x3d, 0x63, 0x45, 0xe7, + 0x3c, 0xc2, 0x06, 0x48, 0x3c, 0x63, 0x7a, 0xe9, 0xbd, 0xb0, 0x14, 0x3f, 0x3d, + 0x1b, 0x99, 0xe4, 0xbd, 0x0d, 0xa5, 0x89, 0x3d, 0x5d, 0x1e, 0xc4, 0xbd, 0x9b, + 0x12, 0x8e, 0x3d, 0x47, 0xa7, 0xb6, 0xbc, 0xc7, 0x3f, 0xf3, 0xbd, 0x82, 0x32, + 0x8f, 0xbd, 0xed, 0x11, 0xbe, 
0x3d, 0xe4, 0x1e, 0xc6, 0xbc, 0x9d, 0x73, 0xee, + 0xbd, 0xce, 0x18, 0xe3, 0xbd, 0x3f, 0x2c, 0x90, 0xbd, 0xc6, 0x82, 0xad, 0x3d, + 0xa4, 0x9e, 0xf1, 0xbd, 0x6e, 0x4f, 0xe7, 0x3d, 0x63, 0x8b, 0x28, 0xbd, 0x0a, + 0x66, 0x80, 0xbd, 0xa0, 0xa5, 0x84, 0xbd, 0xb0, 0xce, 0xbb, 0xbd, 0x72, 0xba, + 0xa1, 0xbd, 0x42, 0x55, 0xa6, 0xbd, 0x36, 0x00, 0xce, 0x3d, 0x11, 0x44, 0xbc, + 0x3b, 0xb4, 0x63, 0xa9, 0x3d, 0x07, 0x61, 0x9b, 0x3d, 0x50, 0xb7, 0xb3, 0xbd, + 0xe1, 0xcc, 0x74, 0xbd, 0xa1, 0x8e, 0x6c, 0x3d, 0xa6, 0x54, 0xb6, 0xbd, 0xce, + 0xde, 0xb4, 0x3c, 0x29, 0xd3, 0x31, 0xbc, 0x74, 0x1c, 0x78, 0xbd, 0xa7, 0xa4, + 0x25, 0xbb, 0x01, 0xe0, 0x85, 0x3d, 0x67, 0xc7, 0xbd, 0xbc, 0xae, 0xdb, 0x3a, + 0xbd, 0xaa, 0x9c, 0xdd, 0xbd, 0x7a, 0x65, 0xaa, 0xbc, 0x11, 0x1d, 0x53, 0xbd, + 0xc0, 0xf8, 0x3a, 0xbd, 0x50, 0xd4, 0x84, 0xbc, 0x3b, 0x49, 0x7f, 0xbd, 0x44, + 0x79, 0xde, 0x3d, 0xb9, 0x83, 0xfb, 0x3d, 0x12, 0x34, 0x8d, 0xbd, 0x0a, 0x31, + 0xf0, 0x3c, 0x16, 0x71, 0x4e, 0xbd, 0xc4, 0x6a, 0x5f, 0x3d, 0x5a, 0xbe, 0x7e, + 0x3d, 0xca, 0x56, 0xe7, 0xbc, 0xe7, 0xa1, 0xb8, 0xbd, 0xf7, 0xac, 0x17, 0x3d, + 0xf1, 0x7c, 0x83, 0xbd, 0xe4, 0x5f, 0xec, 0xbd, 0x18, 0x92, 0xa9, 0xbb, 0x71, + 0x9a, 0x3d, 0xbd, 0xd1, 0x18, 0x20, 0xbd, 0x94, 0xfa, 0xbd, 0x3d, 0x2f, 0x1f, + 0x85, 0xbd, 0xc1, 0xc3, 0xa3, 0x3d, 0x36, 0xdb, 0x96, 0x3d, 0xa5, 0xae, 0x4e, + 0xbc, 0xaa, 0x11, 0x9c, 0xbd, 0x44, 0xa2, 0x95, 0x3d, 0xe7, 0x39, 0x73, 0x3b, + 0x1d, 0x57, 0x86, 0xbd, 0x14, 0x17, 0xa7, 0xbd, 0xaf, 0xc3, 0x09, 0xbd, 0x2f, + 0x90, 0x20, 0xbd, 0x08, 0x91, 0x9c, 0x3c, 0x88, 0x0c, 0xd1, 0x3d, 0x56, 0x99, + 0x9d, 0xbd, 0xb3, 0x75, 0xb2, 0x3d, 0xa1, 0x04, 0x59, 0xbb, 0x44, 0x0a, 0x6f, + 0x3b, 0x5a, 0x42, 0xce, 0xbd, 0x1b, 0x3b, 0x91, 0x3d, 0x14, 0xb8, 0xdf, 0xbd, + 0x85, 0x51, 0x8c, 0xbc, 0xa7, 0xd5, 0x5f, 0x3d, 0xe7, 0x88, 0x61, 0xbd, 0x97, + 0x11, 0xd9, 0x39, 0x5c, 0x0b, 0x6d, 0xbd, 0xe4, 0xe3, 0xb1, 0xbd, 0xeb, 0xfe, + 0xeb, 0xbd, 0xd3, 0x37, 0x66, 0x3c, 0x4b, 0x72, 0x49, 0xbd, 0x12, 0x06, 0xbf, + 0x3b, 0x12, 0x40, 0x77, 0x3d, 0x7c, 0x9d, 0x92, 0x3d, 0xb2, 0xcd, 0xad, 0x3d, + 0xb2, 0xe3, 0x65, 0x3d, 0x91, 0x55, 0xbd, 0x3c, 0x31, 0x00, 0xc0, 0xbd, 0xc9, + 0x3b, 0x46, 0x3d, 0x51, 0xd9, 0xa6, 0x3d, 0xb9, 0xcb, 0xaf, 0xbd, 0xf8, 0x85, + 0xd4, 0xbd, 0x47, 0x6f, 0xf2, 0xbd, 0x70, 0xd4, 0x13, 0x3d, 0x2c, 0x38, 0x55, + 0x3d, 0x61, 0x11, 0xd7, 0x3d, 0x62, 0x90, 0xed, 0xbc, 0xd0, 0x71, 0x79, 0xbd, + 0xc5, 0xc9, 0x87, 0xbd, 0x6d, 0x23, 0x96, 0xbc, 0xc1, 0x06, 0x9b, 0xbd, 0xc8, + 0x2d, 0xfc, 0xbc, 0x79, 0x8d, 0xb8, 0xbd, 0xb3, 0x32, 0xca, 0xbc, 0x17, 0x71, + 0xd3, 0xbd, 0x51, 0x07, 0xc6, 0xbc, 0x59, 0x04, 0x49, 0x3d, 0x15, 0x14, 0x8a, + 0xbd, 0xd0, 0xae, 0xa4, 0xbd, 0x4c, 0x5f, 0xdd, 0x3d, 0xb5, 0x52, 0xbc, 0x3b, + 0x4d, 0xca, 0x3f, 0xbd, 0x85, 0x21, 0xb0, 0xbd, 0x9e, 0x8b, 0xc3, 0xbd, 0x51, + 0xd9, 0xa8, 0x3d, 0x53, 0x49, 0xd1, 0x3c, 0x35, 0x6f, 0xe3, 0xbd, 0x7f, 0xe2, + 0x9e, 0xbd, 0x42, 0xd8, 0x14, 0xbd, 0x00, 0x6f, 0x19, 0x3d, 0xe1, 0x4e, 0x53, + 0x3d, 0xda, 0xc8, 0x66, 0xbd, 0xf1, 0x51, 0xea, 0xbd, 0x8a, 0x7f, 0xbb, 0x3d, + 0xa6, 0x85, 0x10, 0xbd, 0x4e, 0xcc, 0xd7, 0x3d, 0x8b, 0x94, 0xad, 0xbd, 0xaa, + 0x92, 0x92, 0xbc, 0xdb, 0xcd, 0x3a, 0x3d, 0x43, 0x71, 0x99, 0x3d, 0xa0, 0xeb, + 0xe1, 0x3d, 0xbe, 0x5e, 0xe3, 0x3c, 0x43, 0x28, 0x98, 0xbd, 0x04, 0x2b, 0x96, + 0xbd, 0xc6, 0x1a, 0x21, 0xbb, 0xce, 0xba, 0xd3, 0xbd, 0x57, 0xee, 0x04, 0x3d, + 0x87, 0xf6, 0x8a, 0xbb, 0xda, 0x72, 0x99, 0x3d, 0xcb, 0x2f, 0x8a, 0x3d, 0x1f, + 0x20, 0xb5, 0xbd, 0xbe, 0x1f, 0x1e, 0xbd, 0x17, 0x5e, 0x84, 0xbd, 0xfd, 0xce, + 0xb2, 0xbd, 0xfc, 0xcc, 0x74, 0x3d, 0x66, 0x53, 0xca, 0x3c, 
0x35, 0x5e, 0x9e, + 0x3d, 0x6c, 0x9b, 0xb4, 0x3d, 0x08, 0xbd, 0x90, 0x3d, 0x45, 0xc0, 0xc1, 0xbd, + 0x83, 0x2c, 0xd3, 0xbc, 0x85, 0xa9, 0x81, 0xbc, 0xa4, 0x47, 0xbc, 0x3d, 0xc2, + 0xc6, 0x91, 0xbb, 0x45, 0xf7, 0x51, 0x3d, 0x7c, 0x74, 0x32, 0x3d, 0x64, 0x6d, + 0x67, 0xbd, 0xaf, 0x34, 0x37, 0x3d, 0xea, 0xb0, 0x95, 0xbd, 0xe6, 0x42, 0x22, + 0x3d, 0xe4, 0x2b, 0xf9, 0xbd, 0x27, 0x85, 0x8c, 0xbc, 0x57, 0x16, 0xd4, 0x3d, + 0x0d, 0x41, 0xb9, 0xbc, 0xde, 0xf7, 0xb3, 0xbc, 0xb1, 0x86, 0x5a, 0x3d, 0x16, + 0x06, 0x99, 0x3d, 0x36, 0x5c, 0xf2, 0x3d, 0x96, 0x49, 0xfc, 0xbd, 0xd0, 0xda, + 0x0b, 0xbd, 0x74, 0x35, 0xfd, 0x3d, 0x3c, 0x9d, 0x12, 0xbd, 0x88, 0xae, 0xc0, + 0xbd, 0xd6, 0xe7, 0x5e, 0x3d, 0x31, 0x3f, 0xba, 0xbd, 0x0a, 0x05, 0xb9, 0xbd, + 0x8d, 0xe3, 0x35, 0xbd, 0x83, 0xd0, 0x26, 0xbd, 0x04, 0xba, 0x97, 0xbc, 0x46, + 0x99, 0xbf, 0xbd, 0xa1, 0x44, 0x75, 0x3b, 0xb8, 0x9b, 0x07, 0x3e, 0x32, 0xe6, + 0xd5, 0xbd, 0xc0, 0x9f, 0xf3, 0x3d, 0x7f, 0x4f, 0x36, 0xbc, 0x42, 0xda, 0xe3, + 0x3d, 0x3b, 0xb2, 0x5c, 0x3c, 0x97, 0x30, 0xd7, 0x3d, 0x51, 0xe8, 0xea, 0xbc, + 0x6e, 0x73, 0x4d, 0x3d, 0x2f, 0x77, 0xb5, 0x3b, 0x0b, 0x79, 0xc1, 0x3c, 0x2f, + 0xd9, 0x8c, 0xbd, 0x0e, 0x78, 0xbf, 0xbd, 0x3c, 0xec, 0x84, 0x3d, 0x59, 0xa9, + 0xaa, 0xbd, 0x35, 0xdc, 0xe4, 0xbd, 0x91, 0xcf, 0x2e, 0x3d, 0x3c, 0x17, 0x0d, + 0xbc, 0x10, 0xd0, 0xf9, 0x3d, 0xab, 0xca, 0xf9, 0xbd, 0x4b, 0xd7, 0x9b, 0x3d, + 0xd0, 0x10, 0xc9, 0xbd, 0x11, 0x82, 0x05, 0x3e, 0xd0, 0x14, 0x21, 0xbd, 0x6d, + 0x61, 0x99, 0xbd, 0xae, 0x85, 0x7a, 0xbd, 0x67, 0xc0, 0x86, 0xbb, 0x1e, 0xd0, + 0xbf, 0x3d, 0x92, 0x46, 0xf8, 0xbc, 0x0d, 0xad, 0xa1, 0x3c, 0xea, 0x8d, 0xd0, + 0x3c, 0x61, 0x10, 0x49, 0x3c, 0x8a, 0x7e, 0xe9, 0xbc, 0x31, 0x95, 0xdf, 0xb9, + 0xb5, 0x03, 0x0d, 0x3d, 0x0b, 0xf5, 0xd9, 0xbb, 0xba, 0x95, 0x8f, 0xbd, 0x7c, + 0x81, 0xde, 0xbd, 0xfc, 0x64, 0xcb, 0x3d, 0x0e, 0x80, 0x2c, 0x3d, 0x64, 0xa8, + 0x0b, 0x3d, 0x58, 0xd7, 0xcc, 0xbc, 0x06, 0x10, 0x81, 0x3d, 0xd6, 0x24, 0x2f, + 0xbe, 0x2f, 0x77, 0x4e, 0xbd, 0x53, 0x72, 0x1a, 0xbd, 0xc1, 0x05, 0x6e, 0x3d, + 0x0b, 0x99, 0x8e, 0xbd, 0x30, 0x10, 0x04, 0xbd, 0xc3, 0x1c, 0x00, 0xbd, 0xf1, + 0x16, 0xba, 0xbd, 0x00, 0x43, 0x03, 0xbc, 0xb8, 0x2d, 0xf4, 0x3c, 0x18, 0x18, + 0x4d, 0x3d, 0x70, 0x7c, 0x99, 0xb9, 0x49, 0xef, 0xd2, 0xbc, 0x8a, 0xa4, 0x11, + 0x3d, 0xe4, 0x8b, 0x5b, 0xbc, 0x16, 0xc1, 0x8c, 0xb9, 0x71, 0xa4, 0x37, 0x3d, + 0xb2, 0xa4, 0xb0, 0x3c, 0x79, 0x6c, 0x8a, 0x3d, 0xb6, 0x86, 0x96, 0x3c, 0x06, + 0xd1, 0x58, 0xbd, 0xae, 0x40, 0x92, 0xbc, 0x4c, 0x63, 0xa7, 0x3d, 0xac, 0x67, + 0xb4, 0xbd, 0x5b, 0xda, 0x17, 0xbd, 0xeb, 0xfc, 0x09, 0x3d, 0x44, 0x95, 0x68, + 0x3c, 0x03, 0xee, 0xd7, 0x3d, 0x57, 0x9f, 0xc2, 0x3d, 0x9c, 0xa6, 0xe7, 0x3b, + 0xff, 0x8e, 0xcd, 0xbc, 0x22, 0x41, 0xf7, 0x3c, 0x19, 0xe0, 0x1d, 0xbd, 0xae, + 0xcc, 0xe2, 0x3b, 0x70, 0xb1, 0x9f, 0x3d, 0xd8, 0x1d, 0xb7, 0x3d, 0xa1, 0xde, + 0x4d, 0x3c, 0x12, 0xb6, 0x08, 0x3e, 0x1d, 0x9c, 0xbf, 0x3d, 0xd8, 0x48, 0x4a, + 0xbb, 0x07, 0xd1, 0x5e, 0xbd, 0xd3, 0x82, 0xb1, 0x3d, 0x82, 0xef, 0x8d, 0x3d, + 0x40, 0x79, 0xe5, 0xbc, 0x3f, 0x85, 0x8b, 0x3d, 0x6a, 0xa3, 0xa7, 0xbd, 0xed, + 0xd4, 0xaf, 0xbd, 0x15, 0xf2, 0x96, 0xbd, 0x16, 0x8b, 0xf2, 0xbc, 0xdc, 0x5f, + 0xc8, 0xbd, 0xef, 0x46, 0xb3, 0xbd, 0x41, 0x7a, 0x8c, 0xbd, 0x24, 0xfe, 0x62, + 0xbd, 0xdf, 0xab, 0x89, 0xbb, 0xa9, 0x9c, 0xd6, 0x3d, 0xf5, 0xc0, 0x2c, 0x3d, + 0x20, 0x81, 0xef, 0x3d, 0x1d, 0x1f, 0xd8, 0x3d, 0xe3, 0xea, 0xb7, 0xbc, 0xe5, + 0x98, 0xb7, 0x3d, 0x97, 0x67, 0x48, 0x3d, 0x42, 0x5e, 0x10, 0xbe, 0x52, 0xdd, + 0xb2, 0xbd, 0x79, 0x0f, 0x60, 0x3d, 0x7e, 0xc5, 0x1c, 0x3d, 0x9b, 0x47, 0x8a, + 0xbd, 0xfe, 
0x5a, 0x90, 0xba, 0xb3, 0x60, 0x7e, 0xbd, 0x59, 0x16, 0x7e, 0xbd, + 0xb6, 0xb7, 0x01, 0x3d, 0x0d, 0x3c, 0xed, 0xbc, 0x0d, 0x44, 0x3c, 0xbb, 0x77, + 0x3f, 0xf6, 0xbc, 0x74, 0x91, 0xb9, 0x3d, 0x15, 0xa6, 0x38, 0xbd, 0x6f, 0xa1, + 0x39, 0x3d, 0xc8, 0x2e, 0xd8, 0x3d, 0x70, 0xf9, 0x7c, 0xbc, 0x17, 0x9c, 0xa5, + 0x3a, 0xfd, 0x15, 0x0a, 0x3d, 0x55, 0x8c, 0xa7, 0x3d, 0xff, 0x06, 0x22, 0xbd, + 0x2d, 0x31, 0x15, 0xbe, 0x70, 0x92, 0x92, 0xbd, 0x29, 0x8a, 0x0d, 0x3b, 0x6b, + 0xca, 0x3d, 0xbd, 0xf2, 0xe1, 0x28, 0xbc, 0x36, 0x7a, 0x44, 0xbc, 0xea, 0x62, + 0xd9, 0x3a, 0xd2, 0xdd, 0x9e, 0xbc, 0xda, 0xce, 0x16, 0xbe, 0x79, 0x5e, 0x97, + 0x3b, 0x26, 0x34, 0x38, 0xbd, 0x77, 0x5d, 0x97, 0x3c, 0xc6, 0xcb, 0x84, 0xbd, + 0xed, 0xa4, 0xda, 0x3d, 0xd2, 0x4f, 0x6d, 0xbc, 0x35, 0x16, 0xdc, 0xbd, 0xea, + 0xfb, 0x08, 0xbe, 0x84, 0xea, 0x1e, 0xbd, 0x0e, 0x3a, 0x60, 0xb8, 0x4f, 0x4b, + 0x0a, 0xbe, 0xfe, 0x33, 0x87, 0x3d, 0x63, 0x5e, 0x8d, 0x3d, 0x68, 0x29, 0x17, + 0x3e, 0xa5, 0x25, 0x8f, 0xbc, 0x0a, 0x09, 0x78, 0xbd, 0x43, 0x98, 0x6d, 0xbd, + 0x98, 0xa8, 0xa0, 0xbd, 0x7c, 0xa3, 0x13, 0x3d, 0xd4, 0xb8, 0x6d, 0xbc, 0x20, + 0x1f, 0xc5, 0xbc, 0x06, 0xb5, 0x16, 0x3e, 0xcd, 0x4d, 0x90, 0xbd, 0xb8, 0xcc, + 0xd4, 0x3d, 0xbd, 0xe9, 0xd1, 0xbd, 0x90, 0x68, 0xcf, 0x3d, 0xa7, 0xc6, 0x08, + 0xbe, 0x1c, 0xe5, 0x5c, 0xbd, 0x6e, 0x56, 0xa6, 0x3d, 0x74, 0x4f, 0xa5, 0x3d, + 0x96, 0x2b, 0x5a, 0x3d, 0xbe, 0xc6, 0x9b, 0xbd, 0x94, 0x33, 0x18, 0x3d, 0x57, + 0x1a, 0x6b, 0xbd, 0xd7, 0x3d, 0x03, 0xbe, 0x6a, 0x36, 0x65, 0xbd, 0x13, 0x36, + 0xbf, 0x3d, 0x82, 0x9a, 0x0a, 0x3d, 0x3c, 0x1d, 0xca, 0xbd, 0x0c, 0x40, 0x0e, + 0xbe, 0x3f, 0x94, 0xae, 0xbd, 0x1f, 0x7e, 0x89, 0x3d, 0xe3, 0xbf, 0x30, 0xbe, + 0x7a, 0x48, 0x23, 0x3a, 0xe5, 0x0e, 0x5d, 0x3d, 0x91, 0xd3, 0xf2, 0x3d, 0xb6, + 0xef, 0x4a, 0xbd, 0xd4, 0xb3, 0x08, 0xbe, 0xa9, 0xba, 0xac, 0x3d, 0x31, 0x40, + 0x86, 0x3d, 0xc2, 0xc7, 0x04, 0xbe, 0x7c, 0x3b, 0xdb, 0x3d, 0x11, 0x25, 0x04, + 0xbd, 0x3f, 0x5d, 0xf3, 0xbc, 0xc2, 0x3f, 0xfb, 0x3c, 0x12, 0xac, 0xf4, 0xbd, + 0xa7, 0xc4, 0x32, 0x3c, 0xc9, 0xea, 0xe3, 0x3c, 0x7d, 0xda, 0x36, 0x3c, 0x43, + 0x55, 0x09, 0x3e, 0x5f, 0xd8, 0x22, 0xbd, 0x33, 0xf5, 0x29, 0x3e, 0xb8, 0x23, + 0x8a, 0xbc, 0xfb, 0x3f, 0x52, 0xbe, 0xec, 0x1c, 0x79, 0x3d, 0x09, 0x9e, 0x24, + 0xbd, 0x5b, 0x3c, 0xd3, 0xbd, 0x9f, 0x0b, 0x1f, 0x3e, 0x1f, 0xa2, 0xfc, 0xbd, + 0x3b, 0x42, 0x9b, 0x3b, 0x0a, 0xae, 0xc4, 0xbc, 0x8b, 0xc8, 0xa7, 0x3d, 0x88, + 0xaa, 0x9b, 0xbd, 0xaa, 0x37, 0xb6, 0x3d, 0x0d, 0x6a, 0x15, 0x3d, 0x47, 0xa8, + 0x87, 0x3d, 0x53, 0xb1, 0xe3, 0x3d, 0xf7, 0x63, 0x0e, 0x3c, 0x37, 0x70, 0x8e, + 0xbc, 0xc5, 0x5c, 0x32, 0xbe, 0x72, 0x7a, 0xd5, 0x3d, 0xcb, 0xac, 0xc7, 0xbd, + 0x6f, 0xf1, 0x3a, 0xbd, 0x74, 0x40, 0x99, 0x3d, 0x35, 0x16, 0x88, 0xbc, 0xb4, + 0x80, 0x14, 0x3e, 0x0b, 0x98, 0xd9, 0x3c, 0xa7, 0x98, 0x17, 0xbc, 0x6e, 0xd0, + 0x60, 0xbb, 0xd9, 0xc2, 0x8f, 0x3d, 0xea, 0x37, 0xe1, 0xbd, 0x00, 0x42, 0xfd, + 0x3d, 0xde, 0xb0, 0x3a, 0x3d, 0x4f, 0xe2, 0x50, 0x3c, 0x76, 0x9f, 0x42, 0xbd, + 0x73, 0x18, 0x4e, 0xbe, 0x9b, 0xfd, 0x69, 0xbd, 0x69, 0xb2, 0x88, 0xbc, 0x6a, + 0x13, 0x3e, 0xbd, 0x29, 0xf0, 0x0c, 0x3c, 0x1f, 0x81, 0x18, 0x3d, 0x03, 0x2e, + 0x0c, 0x3e, 0xff, 0xf1, 0x4a, 0xbc, 0xb7, 0x9c, 0x14, 0xbe, 0xd5, 0x52, 0xce, + 0xbd, 0xf6, 0x45, 0xf0, 0x3d, 0x8d, 0xc8, 0x55, 0xbd, 0x8f, 0xf0, 0x88, 0x3d, + 0x8c, 0x8f, 0x20, 0xbd, 0x38, 0x7c, 0x4d, 0x3e, 0x6d, 0xba, 0x95, 0xbd, 0xdc, + 0x7b, 0x0d, 0xbe, 0x3d, 0xbf, 0x2d, 0x3c, 0xee, 0xf6, 0xcb, 0x3c, 0x42, 0x85, + 0x2e, 0x3d, 0x43, 0x4c, 0xb3, 0x3d, 0xe6, 0x70, 0x91, 0xbd, 0x58, 0x98, 0xfd, + 0x3d, 0x70, 0x75, 0x52, 0xbd, 0xb7, 0x44, 
0x34, 0xbe, 0x62, 0x65, 0xdc, 0xbd, + 0xb8, 0xc7, 0x83, 0x3c, 0x0d, 0x0a, 0xaa, 0xbd, 0x09, 0xcb, 0x92, 0x3c, 0xbd, + 0x5d, 0xc7, 0xb9, 0x3a, 0x4e, 0xa6, 0xbd, 0xd8, 0xfb, 0xa6, 0xbd, 0xcd, 0xfc, + 0x72, 0xbe, 0x12, 0xdc, 0x4d, 0xbd, 0x0a, 0x7c, 0x5d, 0x3d, 0x8c, 0xce, 0x7a, + 0x3d, 0xe8, 0x3d, 0x83, 0xbd, 0x0d, 0x6c, 0x9e, 0x3d, 0x14, 0xb3, 0x3c, 0x3d, + 0x05, 0x0e, 0xdf, 0x3d, 0xf7, 0x27, 0xb7, 0xbd, 0xa3, 0x18, 0x08, 0x3d, 0x54, + 0xdb, 0x6a, 0x3c, 0x93, 0x1a, 0x80, 0xbd, 0xf9, 0x13, 0x05, 0x3e, 0xd9, 0x61, + 0x87, 0x3d, 0x08, 0xa5, 0x9b, 0xbd, 0x70, 0x5d, 0xc9, 0xbc, 0x9b, 0x99, 0x94, + 0xbd, 0xc5, 0x6e, 0xd4, 0xbd, 0xc8, 0x60, 0xad, 0x3d, 0x29, 0x62, 0x05, 0xbd, + 0x83, 0xd8, 0xc1, 0xbd, 0xa2, 0x72, 0xf1, 0x3d, 0x57, 0x3f, 0x2e, 0xbb, 0xb8, + 0x1a, 0xcf, 0xbc, 0xc3, 0xda, 0x96, 0xbd, 0xd3, 0xbc, 0x81, 0xbd, 0xca, 0x52, + 0xa1, 0xbb, 0xe8, 0xaf, 0x6a, 0x3d, 0x49, 0xaa, 0xf8, 0x3c, 0x5f, 0x2a, 0x9a, + 0xbd, 0xcb, 0x12, 0x6b, 0xbd, 0xc9, 0x4a, 0x8f, 0xbc, 0xce, 0x3c, 0xfd, 0x3d, + 0x71, 0x17, 0xed, 0x3d, 0x54, 0x40, 0xea, 0xbd, 0xcb, 0x7f, 0x2d, 0xbd, 0x2c, + 0x13, 0x86, 0x3d, 0xcd, 0x8c, 0x44, 0xbd, 0xe4, 0x65, 0xa6, 0xbb, 0x06, 0x81, + 0x04, 0x3d, 0x64, 0x45, 0x8e, 0x3d, 0xef, 0x80, 0x22, 0xbd, 0x35, 0x90, 0xaa, + 0xbd, 0x02, 0xb6, 0x48, 0x3d, 0x76, 0xba, 0x39, 0x3d, 0xf3, 0xce, 0x66, 0xbd, + 0x3f, 0x8e, 0xf1, 0xbd, 0x2a, 0x81, 0x0e, 0xbd, 0x82, 0x05, 0x0b, 0x3e, 0x7b, + 0xdb, 0x2f, 0x3d, 0x86, 0xe3, 0xba, 0x3d, 0xac, 0x47, 0x17, 0x3e, 0xcb, 0x96, + 0x8f, 0x3c, 0x3b, 0x58, 0xe7, 0xbd, 0x38, 0x64, 0x46, 0xbe, 0x9e, 0x73, 0x88, + 0xbd, 0x0f, 0xf0, 0x8e, 0xbd, 0xc1, 0x4c, 0x00, 0xbd, 0x70, 0xbb, 0x54, 0xbd, + 0x74, 0x55, 0x20, 0x3b, 0x1f, 0x22, 0x8d, 0x3d, 0xc9, 0x1d, 0xce, 0x3c, 0xad, + 0x53, 0x3f, 0x3d, 0x7e, 0xd8, 0xb2, 0x3d, 0x9e, 0xc0, 0xf5, 0x3d, 0x79, 0x01, + 0x32, 0xbd, 0x49, 0x13, 0x2e, 0x3d, 0xff, 0x7a, 0xce, 0x3d, 0xb5, 0xbc, 0x46, + 0x3d, 0x43, 0xa5, 0xc8, 0xbd, 0xf2, 0x4d, 0xd3, 0x3b, 0x78, 0x3e, 0x39, 0x3d, + 0x2c, 0x01, 0xc7, 0xbd, 0x5d, 0x5b, 0x8d, 0xbd, 0xb1, 0x3b, 0xa3, 0xbd, 0x1f, + 0x70, 0x6e, 0x3c, 0x62, 0x07, 0x58, 0xbd, 0x29, 0xd9, 0xc8, 0xba, 0x13, 0xa6, + 0xd3, 0xbd, 0xc1, 0x45, 0xbf, 0xbc, 0x3e, 0x9f, 0xea, 0xbc, 0x7c, 0x4d, 0xcc, + 0x3d, 0x6c, 0x0c, 0x2e, 0xbd, 0xcf, 0xa0, 0x9a, 0x3b, 0x83, 0x9e, 0xfa, 0xbd, + 0x77, 0x21, 0xaa, 0x3d, 0xcf, 0x18, 0xf5, 0xbd, 0xfe, 0x30, 0x79, 0x3d, 0x24, + 0x33, 0x4d, 0x3d, 0xf7, 0x5f, 0x54, 0x3d, 0xda, 0x9d, 0xc9, 0xbd, 0x28, 0x08, + 0x16, 0x3d, 0x53, 0x5a, 0xf6, 0xbc, 0xa5, 0x86, 0x84, 0xbd, 0x91, 0x39, 0xc5, + 0xbc, 0x54, 0x2b, 0xda, 0xbd, 0x49, 0x34, 0xae, 0xbd, 0x9d, 0xad, 0x3a, 0xbd, + 0x43, 0x59, 0xf1, 0x3d, 0x5c, 0xef, 0x06, 0x3e, 0xc7, 0xe0, 0x32, 0x3d, 0x43, + 0xb3, 0x87, 0x3d, 0x12, 0x6c, 0x02, 0xbe, 0x9c, 0xdc, 0x02, 0x3e, 0x22, 0xcc, + 0x1b, 0xbe, 0x46, 0x37, 0xe8, 0x3d, 0xf0, 0x11, 0x3b, 0xbd, 0x0d, 0x62, 0x51, + 0x3d, 0x8b, 0x64, 0x2f, 0x3d, 0x57, 0x97, 0x5e, 0x3d, 0x53, 0xdd, 0xd6, 0x3c, + 0x00, 0xf5, 0xfb, 0xbc, 0x6f, 0x83, 0xea, 0x3b, 0xec, 0x88, 0x20, 0xbb, 0xe5, + 0x7f, 0xe6, 0x3d, 0xe6, 0xc4, 0xb5, 0x3d, 0x05, 0x76, 0x0f, 0xbe, 0x4a, 0x2f, + 0x61, 0xbd, 0xa0, 0x69, 0xe2, 0x3d, 0xab, 0xc9, 0xb4, 0x3d, 0xeb, 0xd7, 0x88, + 0xbc, 0x8f, 0x65, 0xfb, 0xbd, 0xc5, 0xca, 0x93, 0xbc, 0x1f, 0xe5, 0xa9, 0x3d, + 0x0b, 0x34, 0x06, 0x3e, 0xbd, 0x9e, 0xe1, 0x3d, 0x58, 0x9d, 0xec, 0xbd, 0x60, + 0x28, 0xe3, 0xbc, 0x62, 0x2e, 0x85, 0x3d, 0xec, 0x10, 0xb6, 0x3d, 0xd4, 0x0e, + 0x55, 0x3d, 0x6a, 0xd9, 0x22, 0xbd, 0xa4, 0x2c, 0xb0, 0xbd, 0x8f, 0x8c, 0x8b, + 0x3d, 0x05, 0xa0, 0xbb, 0x3d, 0x7b, 0xf7, 0xc0, 0x3d, 0xca, 0x2f, 0x90, 
0xbc, + 0x07, 0x79, 0xe3, 0xbd, 0x8b, 0x7d, 0x83, 0xbd, 0xfe, 0x8a, 0x93, 0xbc, 0xc0, + 0xe9, 0xd0, 0x3d, 0xfb, 0x88, 0x76, 0xbc, 0x2d, 0x4b, 0x99, 0x3c, 0x69, 0x04, + 0xd3, 0x3c, 0xb6, 0xd2, 0x88, 0x3d, 0xeb, 0xe2, 0x71, 0xbd, 0xa8, 0xb5, 0x98, + 0x3d, 0x08, 0x79, 0xea, 0xbd, 0x7c, 0x53, 0x03, 0xbd, 0xb1, 0xda, 0xf9, 0xbd, + 0xf1, 0x53, 0x83, 0xbc, 0xa0, 0xb3, 0x49, 0xbd, 0x7c, 0x79, 0x07, 0x3c, 0x68, + 0x60, 0x21, 0x3c, 0xb1, 0x1f, 0x38, 0x3d, 0x5d, 0x0c, 0x4e, 0x3d, 0x36, 0x83, + 0x62, 0x3c, 0x87, 0x96, 0x22, 0xbd, 0xd2, 0x3a, 0x09, 0x3c, 0xa2, 0x6e, 0x7a, + 0xbd, 0x54, 0xc7, 0x31, 0xbc, 0x3a, 0x58, 0x1e, 0xbd, 0x51, 0x31, 0x94, 0x3d, + 0x28, 0x85, 0xde, 0xbc, 0x52, 0x0e, 0xce, 0xbd, 0x79, 0x6a, 0xfb, 0xbd, 0x0f, + 0x76, 0x14, 0xbd, 0xb4, 0xf0, 0xb3, 0x3c, 0x30, 0x4e, 0xab, 0xbd, 0xbc, 0x21, + 0x2a, 0x3d, 0xa7, 0x29, 0x93, 0x3d, 0x05, 0x5e, 0x79, 0x3c, 0xc0, 0xdc, 0x93, + 0xbd, 0x8c, 0x46, 0xd3, 0x3d, 0x6d, 0xef, 0x21, 0x3d, 0xcd, 0x62, 0xe5, 0x3d, + 0xf2, 0x5f, 0xbc, 0xbd, 0xec, 0xb5, 0x6e, 0x3d, 0x8f, 0xdd, 0xd1, 0x3c, 0xb6, + 0x13, 0x93, 0xbd, 0x1e, 0x1d, 0x0a, 0x3e, 0xfe, 0x00, 0x0a, 0x3d, 0xfe, 0xea, + 0x70, 0x3c, 0x1e, 0x69, 0x94, 0xbd, 0x54, 0x92, 0xdf, 0x3d, 0x8d, 0xc4, 0xe3, + 0xbd, 0xa8, 0x26, 0xc1, 0x3d, 0x90, 0x69, 0x97, 0x3d, 0x5f, 0xf7, 0x21, 0x3e, + 0xd8, 0xf4, 0x13, 0x3d, 0x8e, 0x0f, 0x2a, 0x3d, 0x1a, 0xf3, 0xe8, 0x3d, 0xb1, + 0x70, 0x75, 0xbd, 0x3d, 0x10, 0x87, 0x3d, 0xf2, 0x55, 0x8f, 0xbd, 0x7f, 0x15, + 0x07, 0xbe, 0xe0, 0x3c, 0xba, 0x3d, 0x6d, 0x1f, 0xc2, 0xbc, 0xd6, 0xbf, 0x2c, + 0xbd, 0x01, 0x4c, 0x87, 0x3c, 0xd8, 0xe5, 0x93, 0x3d, 0x6e, 0x5a, 0x12, 0x3d, + 0xff, 0x3a, 0xd1, 0x3d, 0xfa, 0x05, 0x0a, 0x3d, 0x5a, 0xce, 0xa3, 0xbc, 0xc5, + 0x2b, 0xd8, 0x3d, 0x98, 0xb3, 0xce, 0xbd, 0x6b, 0x72, 0x90, 0x3d, 0xa7, 0x35, + 0xbb, 0xbd, 0xe2, 0xcb, 0xae, 0xbc, 0x8e, 0xe3, 0x74, 0x3d, 0xcd, 0x32, 0xcf, + 0xbd, 0x76, 0x8d, 0x1d, 0x3d, 0x27, 0xc5, 0x0c, 0xbe, 0x27, 0x7e, 0x6c, 0xbd, + 0x54, 0xf1, 0xdb, 0x3d, 0x39, 0x03, 0xed, 0xbc, 0xd7, 0x4b, 0xe1, 0x3a, 0x19, + 0x67, 0x90, 0x3d, 0xf5, 0x03, 0x89, 0x3d, 0x31, 0x9d, 0xd4, 0x3a, 0x06, 0x9d, + 0x05, 0x3e, 0xde, 0xaf, 0x63, 0xbd, 0xed, 0xfe, 0x54, 0x3c, 0xdd, 0x40, 0xc5, + 0xbd, 0xf5, 0x54, 0x0d, 0xbc, 0x3e, 0xaa, 0xcd, 0x3c, 0x08, 0x18, 0xbf, 0xbd, + 0x79, 0x2e, 0x90, 0xbd, 0x15, 0xe3, 0x8a, 0x3d, 0x7b, 0x54, 0x7c, 0xbd, 0x85, + 0x07, 0xd0, 0x3d, 0xfb, 0x39, 0x01, 0xbd, 0x12, 0x57, 0xf0, 0xbd, 0x56, 0x7c, + 0x8d, 0xbd, 0xae, 0x9e, 0xaf, 0x3c, 0x90, 0xc3, 0x85, 0x3d, 0x9c, 0x00, 0x88, + 0x3d, 0x1f, 0x9a, 0x8f, 0xbd, 0x80, 0xef, 0xc4, 0xb9, 0x60, 0xba, 0x5b, 0xbd, + 0x05, 0x25, 0xd8, 0x3c, 0x76, 0x60, 0x6d, 0x3d, 0xc5, 0xf0, 0xe1, 0x3c, 0x0d, + 0x00, 0xf7, 0x3d, 0x57, 0xb7, 0x24, 0x3d, 0x2c, 0x11, 0x06, 0xbe, 0x48, 0x15, + 0x5b, 0xbd, 0x0c, 0x67, 0x22, 0xbd, 0xc9, 0x10, 0x07, 0x3c, 0x69, 0x42, 0xbb, + 0xbd, 0x5b, 0x32, 0xb8, 0xbd, 0x62, 0x5e, 0x35, 0xbd, 0xfc, 0xe1, 0x22, 0xbd, + 0xff, 0xb3, 0x51, 0xbd, 0x6e, 0x4d, 0x2d, 0x3c, 0xfb, 0xca, 0xc5, 0xbd, 0x15, + 0x16, 0x32, 0x3d, 0x50, 0xff, 0xbe, 0xbd, 0xf7, 0x84, 0x5e, 0xbb, 0x27, 0xa2, + 0x17, 0x3c, 0x83, 0x85, 0xda, 0xbd, 0xd3, 0x8f, 0xd8, 0x3d, 0x19, 0xd4, 0x9d, + 0xbd, 0x05, 0x56, 0xbd, 0x3b, 0x80, 0x5c, 0x8d, 0xbd, 0x02, 0x07, 0x01, 0x3e, + 0x46, 0x0a, 0xd0, 0x3c, 0x28, 0x0a, 0x74, 0x3d, 0x45, 0xd8, 0x9c, 0x3d, 0x51, + 0x8c, 0xe1, 0x3d, 0x94, 0x9d, 0x44, 0xbc, 0x1a, 0xfd, 0x6d, 0x3d, 0x6a, 0xa7, + 0x00, 0x3e, 0x03, 0xb0, 0xa5, 0xbd, 0x84, 0xb6, 0x94, 0x3c, 0x6e, 0x1b, 0xd2, + 0xbd, 0xff, 0xcf, 0xbd, 0xbd, 0x7f, 0x7c, 0x6c, 0xbd, 0xa0, 0xb0, 0x4a, 0xbd, + 0x8c, 0xfc, 0xca, 0xbc, 
0xf4, 0xa1, 0x81, 0xbd, 0x22, 0xad, 0xe2, 0x3c, 0xfa, + 0x91, 0xaf, 0x3d, 0xf4, 0x2e, 0x19, 0xbd, 0x0b, 0x57, 0x71, 0xbc, 0x21, 0xca, + 0x8d, 0x3c, 0xee, 0x8c, 0x2b, 0x3a, 0x46, 0x1a, 0xc1, 0xbb, 0x51, 0xbe, 0x2c, + 0xbd, 0xc0, 0x3f, 0x40, 0x3d, 0xb2, 0xbb, 0x96, 0x3d, 0x88, 0x43, 0x23, 0xbe, + 0x26, 0xd9, 0xe8, 0xbd, 0xf7, 0xfc, 0x9d, 0xbd, 0x4e, 0xf6, 0xd3, 0xbc, 0x2a, + 0xda, 0xba, 0xbd, 0xe1, 0x21, 0xe1, 0x3d, 0x81, 0xea, 0x2e, 0xbd, 0xde, 0xaa, + 0xd2, 0xbb, 0xde, 0x20, 0xbe, 0x3d, 0x15, 0x2f, 0x44, 0x3d, 0x37, 0x58, 0x6e, + 0xbd, 0xcd, 0x34, 0x4c, 0xbb, 0x8d, 0xad, 0x08, 0xbc, 0xd9, 0xe2, 0x21, 0x3d, + 0xfe, 0x8b, 0xab, 0x3d, 0xa2, 0x7f, 0x47, 0xbd, 0xad, 0xbe, 0xe3, 0xbc, 0x5f, + 0x5d, 0x20, 0x3d, 0xa7, 0xa7, 0x19, 0xbe, 0x27, 0x1b, 0x8a, 0xbd, 0x2e, 0xcf, + 0x4d, 0x3d, 0x68, 0x43, 0xb0, 0x3d, 0x54, 0xe8, 0xec, 0x3b, 0x5f, 0x47, 0x57, + 0xbd, 0xde, 0x1b, 0xc4, 0x3d, 0xd2, 0x08, 0xfa, 0xbb, 0x23, 0x97, 0xe5, 0x3d, + 0xb3, 0x70, 0x6b, 0x3d, 0x33, 0x68, 0x2a, 0xbc, 0xbb, 0xc7, 0xb5, 0xbd, 0x31, + 0xe2, 0xcd, 0xbd, 0xe3, 0x77, 0x44, 0x3d, 0xb1, 0xf5, 0x60, 0x3d, 0x03, 0x24, + 0xf7, 0xbd, 0x6c, 0x04, 0xb0, 0x3c, 0xba, 0x53, 0xa9, 0xbd, 0xcb, 0x94, 0x03, + 0xbe, 0x19, 0x25, 0xfc, 0xbb, 0x8d, 0xaf, 0xe5, 0x3d, 0x95, 0xec, 0xa3, 0x3d, + 0xca, 0x8d, 0xcb, 0xbd, 0x71, 0x02, 0xee, 0x3c, 0x31, 0x55, 0xdf, 0xbd, 0x85, + 0xd6, 0x69, 0x3d, 0xa1, 0xd8, 0x1d, 0x3d, 0xd6, 0x60, 0x12, 0xbb, 0x46, 0x47, + 0x46, 0x3d, 0x75, 0xf9, 0x97, 0x3d, 0x4c, 0xd5, 0x87, 0x3d, 0xc4, 0x77, 0xb7, + 0x3c, 0x0a, 0xd5, 0x08, 0x3d, 0x7f, 0x4d, 0x74, 0xbd, 0xdd, 0x0e, 0x07, 0xbe, + 0x0d, 0xb1, 0x51, 0xbb, 0x95, 0xf0, 0xa7, 0x3d, 0x8d, 0xdc, 0xe7, 0xbd, 0x11, + 0x22, 0xd1, 0x3d, 0x81, 0xad, 0x8c, 0x3d, 0x51, 0x36, 0x1e, 0x3d, 0xe3, 0x75, + 0x01, 0x3e, 0xa1, 0xd1, 0x9a, 0x3d, 0x4f, 0xd4, 0xc4, 0x3d, 0x50, 0x2a, 0x61, + 0x3c, 0x9a, 0xd5, 0xbd, 0xbd, 0x37, 0xd1, 0xd5, 0x3c, 0xd5, 0x83, 0x8e, 0x3d, + 0xbd, 0x05, 0xb6, 0xbb, 0x52, 0x6b, 0x66, 0x3d, 0x25, 0xcb, 0x0c, 0xbe, 0x3a, + 0xff, 0xd3, 0xbd, 0xaf, 0xdc, 0xb3, 0xbd, 0xde, 0xdf, 0x06, 0x3d, 0x91, 0x0f, + 0xc8, 0xbd, 0x62, 0xa1, 0x8f, 0xbc, 0x1c, 0x36, 0x40, 0x3c, 0x7d, 0x4f, 0xfa, + 0x3d, 0x99, 0x76, 0xd5, 0x3d, 0xc3, 0x21, 0x5c, 0xbb, 0x61, 0x54, 0x52, 0xbc, + 0xc4, 0x07, 0x9b, 0xbd, 0xb3, 0x00, 0x44, 0xbc, 0xbe, 0x1b, 0x06, 0xbd, 0x35, + 0x4c, 0x5d, 0x3d, 0x6b, 0x45, 0x17, 0xbd, 0x10, 0xd6, 0xe5, 0xbd, 0x40, 0x57, + 0x83, 0x3d, 0x62, 0xd1, 0x64, 0xbd, 0x79, 0x90, 0xbd, 0xbc, 0xce, 0xf0, 0x07, + 0x3e, 0xc0, 0xbd, 0xaf, 0x3d, 0x88, 0xe1, 0x84, 0xbd, 0xf0, 0xdb, 0x4c, 0x3d, + 0x17, 0x35, 0x02, 0x3b, 0x30, 0x1c, 0xed, 0xbd, 0x4f, 0xfc, 0xda, 0x3d, 0x92, + 0x80, 0x87, 0xbc, 0x02, 0x74, 0x1a, 0xbe, 0xdc, 0xb1, 0xb3, 0xbd, 0x6c, 0x01, + 0xc0, 0xbc, 0x8f, 0x2d, 0x8c, 0x3d, 0xf5, 0x96, 0xc0, 0xbd, 0x77, 0xbc, 0x7f, + 0xbd, 0x8a, 0x64, 0xf1, 0x3c, 0xb7, 0x6c, 0xb4, 0xbd, 0x1c, 0x6f, 0x84, 0x3d, + 0xa1, 0xd5, 0xc0, 0xbd, 0xbf, 0x63, 0xd4, 0x3d, 0xd6, 0xd7, 0xe7, 0x3d, 0x89, + 0x1e, 0x64, 0x3c, 0xf3, 0x81, 0xbe, 0xbd, 0xb3, 0x57, 0xe9, 0xbd, 0x84, 0x5e, + 0x9a, 0x3d, 0x77, 0x22, 0x01, 0xbe, 0x53, 0xa3, 0xb8, 0xbd, 0xc0, 0x62, 0xff, + 0x3b, 0x9a, 0xfb, 0xbd, 0x3d, 0x13, 0x1a, 0xeb, 0x3b, 0x3b, 0x96, 0x78, 0x3d, + 0xfc, 0xc6, 0x93, 0x3d, 0xfc, 0x33, 0x92, 0x3d, 0xcc, 0xc1, 0x62, 0xbd, 0x63, + 0x7c, 0x77, 0xbd, 0x69, 0x92, 0x05, 0xbd, 0xbd, 0xee, 0xb8, 0x3a, 0xa2, 0x9d, + 0x0e, 0xbe, 0xf3, 0xba, 0xed, 0xbd, 0x2f, 0x6a, 0xaa, 0x3d, 0x77, 0x4a, 0xc6, + 0x3d, 0x4f, 0xe7, 0xa8, 0x3d, 0x1e, 0x3f, 0xbb, 0xbd, 0xae, 0x6c, 0xb8, 0xbc, + 0x75, 0xf1, 0x6d, 0xbd, 0xc1, 0x5d, 0x11, 0xbe, 0x2b, 
0xe2, 0x4f, 0xbd, 0x54, + 0x21, 0xf6, 0x3b, 0x5c, 0xe2, 0x96, 0x3c, 0xbe, 0xe8, 0x2e, 0x3d, 0x38, 0x39, + 0x93, 0x3c, 0xc3, 0x50, 0xbc, 0x3d, 0x67, 0x1d, 0xc4, 0x3d, 0xe6, 0x29, 0x56, + 0xbc, 0x4d, 0x70, 0x4d, 0x3c, 0xd2, 0xca, 0xc4, 0xbd, 0xa1, 0x30, 0x3b, 0xbd, + 0x97, 0x9b, 0xb5, 0xbd, 0x65, 0x99, 0x9b, 0xbd, 0xb5, 0x65, 0xb7, 0xbd, 0x51, + 0xe1, 0x9a, 0xbd, 0x2f, 0x56, 0x4a, 0xbb, 0x9c, 0x68, 0x98, 0xbd, 0x36, 0x75, + 0x73, 0xbd, 0x19, 0xe1, 0x83, 0xbd, 0x37, 0x69, 0xee, 0x3d, 0xe7, 0xd1, 0xad, + 0xbd, 0x3b, 0x29, 0x95, 0xbd, 0xcd, 0x10, 0x75, 0x3d, 0xb4, 0x82, 0xc2, 0xbc, + 0x72, 0xd7, 0x91, 0x3d, 0xc8, 0x77, 0x49, 0xbd, 0x96, 0x67, 0x4d, 0xbd, 0xc5, + 0x75, 0x98, 0xbd, 0x96, 0x67, 0xcc, 0x3d, 0xba, 0x7a, 0x1e, 0xbe, 0x30, 0x3a, + 0x02, 0x3d, 0xc1, 0xf8, 0x78, 0x3d, 0x46, 0xfc, 0xc1, 0x3d, 0x99, 0x3c, 0xc5, + 0xbd, 0xbc, 0x69, 0x39, 0x3d, 0x7f, 0x95, 0xf0, 0x3b, 0x50, 0x78, 0x57, 0xbd, + 0xfa, 0xf7, 0xa9, 0xbc, 0xb2, 0xae, 0x2b, 0x3c, 0x22, 0x75, 0x0d, 0x3e, 0x63, + 0xaa, 0x03, 0x3d, 0xfa, 0x00, 0xd7, 0x3d, 0xc3, 0xcb, 0x60, 0x3c, 0xab, 0xf2, + 0x61, 0x3c, 0x1b, 0x9a, 0x38, 0xbd, 0x1a, 0x33, 0xef, 0xbd, 0x9e, 0x11, 0xc5, + 0x3d, 0xf5, 0xb1, 0x99, 0xbc, 0x65, 0xee, 0x5e, 0xbc, 0xde, 0x02, 0xe8, 0xbd, + 0xef, 0x87, 0x58, 0x3d, 0x0e, 0x01, 0xcf, 0x3d, 0x51, 0xf7, 0xcb, 0xbc, 0x9e, + 0x48, 0x50, 0xbd, 0xd2, 0xc8, 0x88, 0xbc, 0x56, 0x0a, 0x18, 0x3e, 0x49, 0xa6, + 0xce, 0xbd, 0x9d, 0x8d, 0xf4, 0x3d, 0xd9, 0x71, 0x7e, 0x3d, 0x49, 0xcb, 0x67, + 0x3d, 0x3d, 0x4f, 0xdb, 0x3c, 0x8c, 0x3b, 0xaa, 0xbd, 0xce, 0xc4, 0x1f, 0x3d, + 0xda, 0x94, 0xaa, 0x3c, 0x4c, 0xae, 0x89, 0x3d, 0xac, 0x7e, 0x8d, 0x3d, 0xff, + 0xfe, 0xf7, 0x3d, 0x89, 0xba, 0xbd, 0xbd, 0x98, 0xc1, 0x5c, 0x3d, 0x9a, 0xcf, + 0x1b, 0xba, 0xdb, 0x22, 0xf3, 0x3d, 0x3a, 0xa6, 0x58, 0xbd, 0x6b, 0x7d, 0x2b, + 0x3d, 0x22, 0x6f, 0xa2, 0xbd, 0x95, 0xf3, 0x07, 0x3e, 0x14, 0xfb, 0x7a, 0x3d, + 0xda, 0x56, 0x40, 0xbd, 0x85, 0xe7, 0xcf, 0xbd, 0x7f, 0x4c, 0xb8, 0x3c, 0xf0, + 0x6d, 0xc1, 0xbd, 0xb1, 0x01, 0xbd, 0x3d, 0xb4, 0xc0, 0xc0, 0xbd, 0x4f, 0x5f, + 0xca, 0xbd, 0x4e, 0x96, 0xe1, 0x3d, 0x92, 0x0a, 0xa6, 0x3d, 0xd6, 0xd9, 0xb7, + 0x3d, 0x8b, 0x52, 0xa8, 0x3d, 0xa9, 0xe6, 0xb4, 0xbc, 0x16, 0x49, 0xc0, 0x3b, + 0xed, 0x64, 0xd1, 0x3d, 0xf1, 0xaf, 0x20, 0xbc, 0x8f, 0x44, 0xd9, 0x3b, 0xc0, + 0x7a, 0xb4, 0x3d, 0x31, 0xb6, 0x15, 0xbe, 0x82, 0x8e, 0x62, 0xbd, 0xb3, 0x93, + 0x1e, 0xbd, 0xae, 0x33, 0x8c, 0xbd, 0x82, 0xf3, 0xa6, 0x3c, 0xd2, 0x41, 0xb2, + 0xbc, 0x58, 0x37, 0xce, 0x3d, 0xb9, 0xd2, 0xce, 0x3d, 0x99, 0x90, 0x69, 0x3d, + 0xc3, 0x4b, 0xc8, 0x3d, 0xba, 0xfa, 0xcb, 0x3d, 0xee, 0x4a, 0xfe, 0xbc, 0x24, + 0xc5, 0x3c, 0xbd, 0x5a, 0x95, 0xb3, 0xbd, 0xb1, 0xc0, 0x1f, 0xbd, 0x61, 0x53, + 0xb4, 0x3c, 0x2e, 0x79, 0xc7, 0xbd, 0xd6, 0x70, 0x9d, 0xbd, 0x9d, 0xe7, 0x16, + 0x3d, 0x4f, 0xe9, 0xa9, 0xbc, 0x7d, 0xbb, 0x7c, 0xbd, 0xf0, 0xdf, 0xe9, 0xbc, + 0x66, 0xc4, 0x3f, 0xbd, 0xfc, 0xd3, 0x20, 0xbd, 0xd3, 0x4f, 0x36, 0xbd, 0x72, + 0x8d, 0xec, 0x3d, 0x79, 0xbc, 0xaa, 0x3d, 0x69, 0x95, 0xe7, 0x3d, 0x46, 0xb6, + 0xcc, 0xbc, 0xdd, 0x97, 0x70, 0xbd, 0x96, 0x31, 0x0c, 0xbe, 0x48, 0x86, 0xeb, + 0x3d, 0x74, 0xf6, 0xa3, 0x3c, 0xe8, 0x26, 0xa1, 0x3d, 0xe3, 0xdd, 0x70, 0xbd, + 0xcf, 0xbd, 0x02, 0x3c, 0x13, 0x3e, 0xbc, 0xbd, 0x69, 0xad, 0x05, 0xbd, 0xc0, + 0xad, 0x53, 0x3c, 0xb6, 0x7c, 0xb2, 0xbd, 0x27, 0xc3, 0xfd, 0xbc, 0x5f, 0x42, + 0xc5, 0x3d, 0x2f, 0x17, 0xd6, 0x3d, 0xb2, 0x68, 0xda, 0xbd, 0x95, 0xe5, 0x4f, + 0x3c, 0xae, 0x99, 0xe4, 0x3d, 0x8f, 0x5c, 0xde, 0xbd, 0xf1, 0x87, 0x02, 0xbb, + 0x17, 0x17, 0x7a, 0x3d, 0x75, 0x72, 0x1f, 0x3d, 0x70, 0x34, 0xa4, 0xbd, 0x43, + 0x2a, 
0xb2, 0x3d, 0xd9, 0x5a, 0xc7, 0x3d, 0xa5, 0x58, 0xc6, 0x3d, 0xa3, 0xb8, + 0x76, 0xbd, 0x5b, 0xf5, 0x27, 0x3c, 0x58, 0xfa, 0x60, 0x3c, 0xcc, 0x2e, 0xd4, + 0x3d, 0x71, 0xc3, 0x54, 0x3c, 0x75, 0xe3, 0x6b, 0x3d, 0x29, 0xf3, 0x9a, 0x3d, + 0x9d, 0x62, 0x8b, 0xbd, 0xcd, 0xa8, 0x9f, 0xbd, 0xee, 0xaa, 0xbf, 0x3c, 0xd7, + 0xe4, 0x20, 0xbd, 0x9f, 0x2c, 0xa4, 0x3c, 0x3a, 0x5e, 0x76, 0xbd, 0x9b, 0xcb, + 0x07, 0x3e, 0x3e, 0x33, 0x34, 0x3d, 0x69, 0x57, 0x26, 0x3c, 0xf5, 0x54, 0xef, + 0xbd, 0xf5, 0x3d, 0xe9, 0xbd, 0x8e, 0xed, 0x2b, 0x3d, 0x86, 0xf8, 0xb2, 0x3c, + 0xb2, 0x7f, 0x45, 0x3d, 0xe1, 0x4f, 0xbd, 0x3c, 0xa7, 0xc8, 0x91, 0xbd, 0xea, + 0x4c, 0xc5, 0x3d, 0x7a, 0x60, 0x7c, 0x3d, 0xce, 0x3e, 0xb6, 0x3d, 0xc3, 0x22, + 0x52, 0xbd, 0xbf, 0x54, 0xd3, 0xbc, 0xc7, 0xe0, 0xe1, 0xbd, 0x08, 0x86, 0xc8, + 0x3c, 0x98, 0x6c, 0xc3, 0xbd, 0xe6, 0xe1, 0x25, 0xbd, 0xdb, 0x07, 0x53, 0xbb, + 0xbd, 0x04, 0x5f, 0xbd, 0x12, 0xfd, 0xe6, 0xbd, 0x2d, 0x0f, 0xe8, 0x3d, 0x9e, + 0x08, 0x47, 0x3d, 0x93, 0xc8, 0xdc, 0xbd, 0x97, 0x91, 0xc9, 0xbd, 0xbd, 0x45, + 0x88, 0xbd, 0x45, 0x8e, 0x0b, 0xbe, 0x8f, 0xb7, 0xd1, 0xbd, 0x9b, 0x3c, 0xc2, + 0x3c, 0x04, 0xc5, 0xda, 0xba, 0xce, 0x19, 0x9a, 0x3d, 0xaf, 0xee, 0x25, 0x3e, + 0xdf, 0x56, 0x48, 0xbd, 0x9d, 0x42, 0x02, 0x3e, 0x2c, 0x6a, 0xef, 0x3c, 0x25, + 0x99, 0x07, 0x3c, 0x74, 0xa1, 0xca, 0x3c, 0xae, 0x08, 0x9e, 0x3c, 0xe5, 0xec, + 0x25, 0xbd, 0x63, 0x8f, 0xd5, 0x3d, 0xf3, 0x4a, 0xc5, 0xbc, 0xab, 0x02, 0x53, + 0xbd, 0x3e, 0xec, 0x5e, 0x3d, 0xea, 0xf2, 0x8f, 0x3d, 0xb9, 0xa3, 0x91, 0xbd, + 0xa9, 0x34, 0x93, 0xbd, 0xd4, 0x95, 0x78, 0x3d, 0x84, 0x2b, 0x04, 0x3e, 0xe7, + 0x61, 0x87, 0x3d, 0x41, 0x40, 0xe9, 0x3d, 0x3f, 0xea, 0xdc, 0xbc, 0xc9, 0xfd, + 0xa4, 0x3d, 0xf6, 0xd5, 0x69, 0x3d, 0xa5, 0x93, 0x99, 0xbb, 0x21, 0x84, 0x76, + 0x3d, 0xaa, 0xf2, 0x52, 0x3d, 0xbb, 0x3d, 0x9f, 0xbd, 0xd3, 0xd6, 0x6c, 0x3d, + 0xe6, 0xb2, 0xcc, 0xbc, 0x18, 0x3b, 0x30, 0x3d, 0x25, 0xcf, 0xc5, 0xbc, 0xe0, + 0xfd, 0xb4, 0x3c, 0x5c, 0x92, 0x6b, 0x3d, 0xa8, 0x01, 0x17, 0x3d, 0xf6, 0xed, + 0xa2, 0xbd, 0x42, 0x7b, 0xec, 0x3d, 0x8e, 0x87, 0xd7, 0x3d, 0xfa, 0x30, 0xb7, + 0x3d, 0x54, 0x66, 0x38, 0xbd, 0x68, 0xb5, 0xa9, 0xbd, 0x30, 0x1e, 0x7d, 0x3d, + 0x93, 0xf4, 0xd5, 0xbc, 0x69, 0x6a, 0x98, 0xbd, 0x8f, 0x2b, 0x4f, 0xbd, 0xd3, + 0x99, 0x9a, 0xbd, 0x9b, 0x72, 0xfe, 0xbc, 0xaf, 0xc3, 0xad, 0xbd, 0xe2, 0xdf, + 0xde, 0x3c, 0xdc, 0x3e, 0xd3, 0x3d, 0x46, 0xb7, 0x92, 0xbd, 0x22, 0xd0, 0x21, + 0xbd, 0x7a, 0x5e, 0xae, 0x3c, 0xb6, 0x91, 0xa4, 0x3d, 0xba, 0xda, 0x8f, 0xbc, + 0xad, 0xb4, 0x18, 0x3b, 0xb1, 0x16, 0x9c, 0xbd, 0x2f, 0xf7, 0x89, 0xbd, 0x89, + 0x33, 0xba, 0xbd, 0x03, 0x89, 0x61, 0xbd, 0xa8, 0x17, 0x50, 0xbd, 0xf5, 0xfe, + 0x1a, 0x3d, 0xd2, 0x25, 0x02, 0x3d, 0xbb, 0xc9, 0x67, 0xbd, 0xc8, 0x32, 0xe0, + 0x3d, 0x8e, 0xb2, 0x9e, 0xbd, 0x57, 0x57, 0x2a, 0xbc, 0xb4, 0xc4, 0x76, 0x3d, + 0xfd, 0x46, 0x11, 0x3b, 0x38, 0x45, 0xe8, 0x3a, 0x90, 0x49, 0xc6, 0xbd, 0xc3, + 0x50, 0x0b, 0xbe, 0x19, 0xca, 0xd9, 0x3d, 0x17, 0x4d, 0xe0, 0x3d, 0x68, 0x36, + 0x3f, 0xbc, 0x3a, 0x6e, 0xda, 0xbd, 0x50, 0xd8, 0xde, 0x3d, 0x6f, 0x09, 0x29, + 0xbe, 0x9d, 0x50, 0x03, 0xbd, 0x9a, 0x25, 0xf6, 0xbd, 0x43, 0xa2, 0xbc, 0x3d, + 0x9a, 0x55, 0xa5, 0x3d, 0xa9, 0x0d, 0x2f, 0xbd, 0x5c, 0x8e, 0x22, 0xbd, 0x2e, + 0xc1, 0x58, 0xbd, 0x5a, 0x05, 0x2c, 0xbd, 0xec, 0x19, 0xa1, 0xbd, 0xd7, 0x75, + 0x7b, 0x3d, 0x9a, 0xcf, 0x82, 0x3c, 0x46, 0xc6, 0xff, 0x3c, 0x37, 0xc8, 0xca, + 0x3d, 0xa0, 0xb7, 0x28, 0x3d, 0xaa, 0xb5, 0x2f, 0x3d, 0xaa, 0xa3, 0x9e, 0xbb, + 0x01, 0x2b, 0xd6, 0xbd, 0xa5, 0x6d, 0xb1, 0x3d, 0x2c, 0x3d, 0x97, 0xbc, 0x63, + 0xfb, 0x18, 0xbe, 0xb9, 0xa9, 0xcb, 
0x3d, 0xb0, 0x7d, 0xb4, 0x3d, 0x22, 0x6a, + 0x65, 0x3d, 0x7a, 0xaf, 0xf5, 0xba, 0xed, 0x29, 0x0e, 0x3d, 0x5c, 0xd5, 0x6f, + 0xbd, 0xbe, 0xd9, 0xa0, 0xbc, 0x05, 0x8b, 0xe2, 0x3c, 0x35, 0xec, 0x8b, 0xbc, + 0xa9, 0x59, 0x0d, 0x3c, 0x0b, 0x4c, 0x56, 0x3c, 0x39, 0x59, 0xad, 0xbd, 0x41, + 0x06, 0xe3, 0xbd, 0xb1, 0xcd, 0xaa, 0x3d, 0xa8, 0xcc, 0xa1, 0xbd, 0x35, 0x63, + 0x36, 0xbd, 0x44, 0xf9, 0x43, 0x3c, 0xee, 0x2c, 0xdb, 0x3c, 0x79, 0xd4, 0x78, + 0x3d, 0x81, 0x34, 0x96, 0x3d, 0xc0, 0x43, 0xda, 0x3b, 0x9f, 0x9c, 0x0b, 0xbd, + 0xaf, 0x07, 0xac, 0x3d, 0xcf, 0xe3, 0xf0, 0x3c, 0x44, 0x9b, 0xf8, 0x3d, 0xd4, + 0x1f, 0x4e, 0xbd, 0xa6, 0xab, 0x9f, 0x3d, 0xcb, 0xd4, 0x30, 0x3d, 0x4b, 0xd4, + 0x17, 0x3d, 0x7e, 0xf2, 0x3d, 0x3b, 0x47, 0x47, 0xac, 0x3b, 0x2f, 0xda, 0xa8, + 0xbd, 0xb0, 0x53, 0xde, 0xbd, 0x2e, 0x06, 0xdc, 0x3d, 0x9a, 0x92, 0x9a, 0xbd, + 0x86, 0xf9, 0xf2, 0xbd, 0xb0, 0x9b, 0xd6, 0xbd, 0x8f, 0x36, 0x53, 0x3d, 0x09, + 0x68, 0x99, 0x3d, 0x25, 0xbb, 0xeb, 0x3d, 0x76, 0x5e, 0xfb, 0xbc, 0x24, 0x11, + 0x05, 0xbd, 0xcf, 0xaf, 0xb7, 0xbd, 0x97, 0xcd, 0x65, 0xbd, 0xeb, 0x59, 0xf7, + 0xb8, 0x95, 0x28, 0xb1, 0xbc, 0xff, 0xba, 0x91, 0xbd, 0x58, 0x33, 0xf0, 0x3c, + 0x42, 0x68, 0xd9, 0xbd, 0xa7, 0x71, 0x95, 0xbb, 0x41, 0x0b, 0x6a, 0x3d, 0xe4, + 0x83, 0x06, 0x3d, 0xae, 0x90, 0xa0, 0xbd, 0xfe, 0xf5, 0x27, 0xbd, 0x7f, 0xdc, + 0xb4, 0x3d, 0x32, 0xf0, 0x75, 0xbd, 0x99, 0xfa, 0x7b, 0x3d, 0x5f, 0xca, 0x7a, + 0x3d, 0xd9, 0x7e, 0x49, 0xbd, 0x7f, 0x2b, 0x5b, 0x3d, 0x02, 0x92, 0x46, 0xbb, + 0x20, 0x77, 0x5b, 0x3c, 0x57, 0xa6, 0xd1, 0x3a, 0x74, 0x68, 0xb2, 0xbd, 0xa2, + 0x4c, 0x0a, 0xbe, 0xb9, 0xcf, 0x43, 0xbd, 0xd6, 0x2e, 0x2d, 0xbc, 0x0f, 0x5d, + 0xde, 0x3d, 0xfc, 0xdc, 0x1c, 0xb9, 0x6d, 0x7b, 0x91, 0xbc, 0x33, 0x39, 0x97, + 0x3d, 0x37, 0xcf, 0x1f, 0x3d, 0xb3, 0x0b, 0xe3, 0x3d, 0x45, 0xbe, 0xa0, 0x3d, + 0xda, 0x7c, 0x0e, 0x3d, 0x66, 0xd7, 0x25, 0xbd, 0xa7, 0xe0, 0x0f, 0x3d, 0xd2, + 0x48, 0x8f, 0xbc, 0x2b, 0xbd, 0x9a, 0x3d, 0xf9, 0xe3, 0xd9, 0x3d, 0x0d, 0x1e, + 0xf3, 0x3c, 0x12, 0xc5, 0xfe, 0xbc, 0x59, 0x75, 0x9f, 0x3c, 0x76, 0x0e, 0x46, + 0xbd, 0xa3, 0x5d, 0xb9, 0x3d, 0x8c, 0x5a, 0xc9, 0x3c, 0xb5, 0x90, 0xbd, 0x3d, + 0xe5, 0xaa, 0x42, 0x3d, 0xaf, 0x43, 0x9b, 0xbd, 0x50, 0x0e, 0xc9, 0xbc, 0xea, + 0x53, 0x75, 0x3d, 0xfd, 0x0d, 0x4b, 0x3d, 0x7d, 0xc8, 0x17, 0x3d, 0xdd, 0xf0, + 0xb5, 0xbd, 0x00, 0x53, 0xf4, 0xba, 0xa6, 0x3a, 0x54, 0xbd, 0x7f, 0x57, 0x5f, + 0xbd, 0x00, 0x98, 0x56, 0xbd, 0xe6, 0x33, 0xbe, 0x3c, 0xe2, 0x66, 0x96, 0x3c, + 0x41, 0x08, 0x88, 0x3c, 0x66, 0x40, 0x88, 0xbd, 0xfd, 0x89, 0xbb, 0x3d, 0xa6, + 0xde, 0x99, 0x3a, 0xa4, 0x22, 0xf4, 0x3c, 0x94, 0xbc, 0xaf, 0xbd, 0x94, 0x01, + 0xcd, 0xbd, 0x89, 0x93, 0x0d, 0x3d, 0x74, 0x5a, 0xdf, 0x3b, 0x5b, 0x0a, 0xce, + 0xbd, 0xee, 0x6d, 0x87, 0x3d, 0x7c, 0x6a, 0xb0, 0x3d, 0x6d, 0xb0, 0x7b, 0x3c, + 0x6f, 0xb8, 0x4e, 0x3d, 0x06, 0x6a, 0x25, 0xbd, 0x7c, 0xb9, 0xcc, 0x3d, 0xf5, + 0x54, 0xb0, 0xbd, 0xf3, 0xf9, 0xe1, 0xbd, 0xcf, 0x6d, 0x91, 0x3c, 0x8d, 0x15, + 0xa4, 0x3c, 0x15, 0xa1, 0x86, 0x3d, 0x47, 0x35, 0xc3, 0xbd, 0x34, 0xa8, 0x16, + 0xbd, 0x11, 0xda, 0x49, 0x3d, 0x45, 0xb4, 0x61, 0x3d, 0x41, 0x15, 0xbf, 0xbc, + 0xd4, 0x07, 0xfa, 0x3d, 0xb0, 0x3a, 0x18, 0x3d, 0xda, 0x7f, 0x69, 0xbd, 0x6b, + 0xec, 0x9f, 0xbd, 0x6e, 0xfc, 0xe6, 0x3d, 0xc9, 0x5d, 0xb4, 0x3d, 0xa2, 0x1d, + 0x12, 0xbc, 0x51, 0x23, 0xce, 0xbd, 0x0a, 0x20, 0x86, 0xbc, 0xc4, 0x1f, 0xbe, + 0x3d, 0x18, 0x10, 0x6a, 0x3d, 0xe1, 0x58, 0x9f, 0x3c, 0x22, 0x7f, 0xc9, 0xbc, + 0x1a, 0xed, 0x1e, 0xbe, 0x47, 0x93, 0x87, 0x3c, 0x4d, 0x77, 0x31, 0xbc, 0xf9, + 0x29, 0xb2, 0x3d, 0xa9, 0xb3, 0x77, 0xbd, 0x43, 0x16, 0x0a, 0x3d, 
0x88, 0x2f, + 0x98, 0x3d, 0x3b, 0x7c, 0x2b, 0x3d, 0xfc, 0x29, 0x07, 0x3e, 0xa6, 0x27, 0x93, + 0xbd, 0x5a, 0xa8, 0x13, 0xbe, 0xa8, 0xb8, 0x88, 0xbd, 0x9b, 0x64, 0xc5, 0xbc, + 0xef, 0xb1, 0xe6, 0x3d, 0x33, 0x47, 0xc3, 0x38, 0x56, 0x92, 0x7b, 0xbd, 0x87, + 0x81, 0xc7, 0x3c, 0x94, 0xe2, 0x21, 0x3c, 0xc2, 0x28, 0x75, 0x3d, 0xb7, 0x6f, + 0x8b, 0xbd, 0x2b, 0xdd, 0x09, 0xbc, 0x1f, 0xb9, 0xbc, 0xbd, 0xd6, 0xef, 0x90, + 0xbd, 0x52, 0xc7, 0xa5, 0xbc, 0xf7, 0x2c, 0x4d, 0x3c, 0xc7, 0xfe, 0x94, 0x3c, + 0x24, 0x12, 0x46, 0xbc, 0x95, 0x3b, 0x59, 0x3c, 0x64, 0x96, 0xd7, 0xbc, 0xb3, + 0x3c, 0xc7, 0xbd, 0xe6, 0x41, 0xbc, 0x3d, 0x70, 0xd8, 0x5c, 0x3b, 0xe2, 0x16, + 0x88, 0xbd, 0x21, 0x12, 0xfc, 0x3d, 0xbd, 0x55, 0x1e, 0xbe, 0x3a, 0xf9, 0x1f, + 0xbd, 0x59, 0xd3, 0x27, 0xbd, 0x14, 0x3b, 0xd7, 0x3d, 0x13, 0xf9, 0x66, 0x3d, + 0x79, 0x92, 0x77, 0xbd, 0x9a, 0x35, 0x63, 0x3d, 0x07, 0xf2, 0x75, 0xbc, 0xc1, + 0x6f, 0x73, 0x3d, 0x0f, 0x02, 0xc2, 0x3c, 0xd0, 0x45, 0x0c, 0x3d, 0x37, 0x87, + 0x5e, 0x3d, 0x03, 0x9e, 0xce, 0x3d, 0x2b, 0x90, 0x13, 0xbd, 0xf4, 0x1a, 0xc5, + 0xbd, 0xdf, 0x42, 0xdb, 0x3d, 0x47, 0x02, 0x58, 0xbd, 0x0f, 0x74, 0x1a, 0xbd, + 0x1d, 0x5f, 0x05, 0x3d, 0x99, 0x81, 0xff, 0xbc, 0x56, 0x85, 0xb3, 0x3d, 0xac, + 0x62, 0x17, 0xbd, 0xaa, 0x30, 0xc3, 0x3d, 0xdc, 0x53, 0x0f, 0xbe, 0x9b, 0x95, + 0x49, 0x3d, 0xf8, 0x4e, 0xa7, 0x3d, 0x76, 0x74, 0x10, 0xbd, 0x2c, 0xe0, 0x9c, + 0x3d, 0x7b, 0xc1, 0xc7, 0xbd, 0x15, 0x39, 0xe6, 0x3d, 0x52, 0xb3, 0xff, 0xbd, + 0x72, 0x77, 0xd3, 0x3d, 0x6a, 0xc4, 0xfb, 0x3c, 0x27, 0x15, 0x5b, 0x3d, 0xba, + 0xa2, 0x6b, 0xbd, 0x2b, 0xbc, 0x02, 0x3e, 0x6c, 0x7c, 0xda, 0x3c, 0x24, 0xa1, + 0x61, 0xbb, 0xfb, 0x9b, 0xc9, 0xbc, 0x20, 0xcb, 0x93, 0xbc, 0x95, 0x98, 0x6c, + 0xbd, 0x96, 0x34, 0xda, 0x3d, 0x5b, 0xa3, 0xe1, 0xbc, 0x71, 0xff, 0x07, 0x3d, + 0x5e, 0x18, 0xd0, 0xbd, 0xc1, 0x9e, 0x26, 0x3e, 0x8b, 0x3d, 0x9c, 0x3d, 0x90, + 0xe5, 0x84, 0x3d, 0x0d, 0xaa, 0x37, 0x3b, 0x99, 0x2d, 0xf6, 0x3c, 0x40, 0x23, + 0xca, 0x3d, 0x1c, 0x56, 0xb4, 0xbd, 0xa9, 0x04, 0x97, 0xbd, 0x41, 0xa7, 0x9e, + 0x3a, 0xb3, 0xfe, 0xb9, 0xbd, 0xf9, 0x34, 0x02, 0xbd, 0x44, 0x97, 0xb4, 0xbd, + 0x67, 0x43, 0x80, 0xbd, 0xb0, 0xce, 0x36, 0xbd, 0x28, 0x48, 0xa2, 0x3d, 0x32, + 0x52, 0xd3, 0x3d, 0x2a, 0xd4, 0x12, 0x3e, 0x8e, 0x41, 0xd5, 0x3c, 0x5e, 0x6b, + 0x64, 0xbd, 0x19, 0x1a, 0xee, 0xbd, 0x91, 0xf3, 0xb1, 0xbb, 0x9e, 0x4f, 0x9b, + 0x3d, 0x50, 0x3a, 0x9d, 0x3d, 0x25, 0xbc, 0xb5, 0xbd, 0xf7, 0xd6, 0x7b, 0x3d, + 0x69, 0x87, 0x94, 0xbb, 0xed, 0x33, 0x31, 0xbd, 0x8f, 0xf3, 0xaa, 0xbd, 0x5b, + 0x0b, 0xc0, 0x3d, 0xd9, 0xac, 0x60, 0xbd, 0x24, 0xa6, 0x9c, 0x3d, 0xfb, 0x17, + 0x3f, 0x3d, 0x49, 0x6a, 0x97, 0x3d, 0x02, 0xe9, 0xef, 0xbd, 0x44, 0xbe, 0xb5, + 0xbc, 0x61, 0x77, 0x94, 0xbb, 0x9e, 0x6d, 0xe1, 0xbc, 0xfa, 0x8c, 0xf2, 0xbc, + 0x9c, 0xfc, 0x45, 0xbd, 0xed, 0x91, 0xde, 0xbd, 0xcd, 0xa8, 0xe7, 0x3d, 0x4e, + 0x05, 0x10, 0xbe, 0x33, 0x4d, 0xa1, 0x3c, 0x01, 0x95, 0x91, 0x3d, 0x33, 0xf9, + 0x13, 0xbd, 0x78, 0x50, 0x03, 0xbd, 0x7f, 0xa1, 0xd7, 0xbd, 0x0f, 0xe3, 0x92, + 0x3d, 0x46, 0x19, 0x9e, 0x3d, 0xa8, 0xa7, 0x06, 0xbc, 0x0e, 0x64, 0xa6, 0x3d, + 0xb4, 0x52, 0xe8, 0xbd, 0x87, 0xc6, 0x8f, 0xbd, 0x50, 0x8c, 0xbf, 0xbb, 0x76, + 0x39, 0x34, 0x3d, 0xd2, 0x2f, 0x0b, 0xbd, 0xf4, 0xa3, 0x51, 0xbd, 0xb0, 0x28, + 0x7d, 0xbd, 0x83, 0x61, 0x57, 0x3d, 0xca, 0x95, 0xb5, 0x3d, 0xdc, 0x22, 0x32, + 0xbc, 0x58, 0xb3, 0x69, 0xbd, 0x09, 0x10, 0x79, 0x3c, 0x3c, 0x79, 0x35, 0xbd, + 0xa0, 0x99, 0xa9, 0xbd, 0xdf, 0x93, 0x18, 0x3e, 0x6f, 0x5f, 0xad, 0x3d, 0xb2, + 0x0b, 0x8e, 0xbd, 0xf5, 0xf2, 0xaa, 0x3d, 0xf2, 0x2e, 0xa9, 0xbd, 0xf6, 0xe2, + 0x23, 0x3d, 0x17, 
0xa2, 0xaf, 0x3d, 0xd9, 0x35, 0x8e, 0xbd, 0xf1, 0x8d, 0x08, + 0x3e, 0xcc, 0x76, 0xb4, 0xbd, 0x71, 0xb4, 0xc9, 0xbd, 0x00, 0x10, 0xd4, 0xbc, + 0xbe, 0x87, 0xf0, 0x3c, 0xe8, 0x15, 0xad, 0xbd, 0xfb, 0x2e, 0x5e, 0xbd, 0x6f, + 0x3b, 0x99, 0xbc, 0x77, 0xc7, 0xe5, 0xbd, 0xf4, 0x52, 0x03, 0xbe, 0x74, 0x7b, + 0x00, 0xbe, 0xe8, 0x51, 0x8c, 0x3d, 0xe1, 0x8d, 0x1c, 0xbc, 0x3d, 0x3c, 0x16, + 0x3d, 0x94, 0x51, 0xd5, 0x3d, 0xff, 0x2e, 0xb0, 0x3d, 0xf5, 0x3c, 0xaa, 0xbc, + 0x39, 0x6b, 0xb2, 0x3d, 0x1f, 0x8b, 0x44, 0x3d, 0xe4, 0xa4, 0xa8, 0x3d, 0xa9, + 0xbc, 0x81, 0x3d, 0x67, 0x10, 0x83, 0xbd, 0x03, 0x1b, 0x08, 0x3d, 0xed, 0xef, + 0x29, 0x3d, 0x46, 0x38, 0x58, 0xbc, 0x98, 0x03, 0xa3, 0x3d, 0x7d, 0xd6, 0x34, + 0xbd, 0x36, 0xbd, 0xf7, 0x3d, 0xe7, 0xf9, 0x5d, 0xbd, 0x9c, 0x88, 0x87, 0x3d, + 0x85, 0x7d, 0xa3, 0x3d, 0x81, 0x29, 0x75, 0xbc, 0xca, 0x17, 0x97, 0x3d, 0xbf, + 0xd1, 0x04, 0x3e, 0xc9, 0x18, 0xfa, 0x3b, 0x0f, 0x59, 0xc3, 0x3d, 0x40, 0xa6, + 0x05, 0xbd, 0x5e, 0x98, 0x8d, 0x3c, 0x8f, 0x73, 0xff, 0x3c, 0xb2, 0x58, 0xde, + 0xbc, 0x97, 0x10, 0x04, 0xbd, 0x2d, 0xd2, 0x1c, 0x3d, 0xac, 0x03, 0x6e, 0xbd, + 0xa8, 0x9a, 0xa8, 0x3d, 0x1c, 0x0e, 0x41, 0x3d, 0x30, 0x7a, 0xab, 0xbd, 0xec, + 0x58, 0x14, 0xbd, 0xac, 0xe9, 0x9e, 0xbb, 0x0b, 0x14, 0x02, 0x3d, 0xac, 0x78, + 0x00, 0x3e, 0xa1, 0xb6, 0xc2, 0xbd, 0x04, 0x51, 0x91, 0xbc, 0x57, 0x51, 0xf1, + 0xbd, 0x95, 0x42, 0x49, 0x3d, 0x91, 0x54, 0xa2, 0x3c, 0xbd, 0x0f, 0x03, 0xbe, + 0x0a, 0xf8, 0x17, 0xbd, 0xbb, 0x25, 0x14, 0x3d, 0xf2, 0x00, 0x19, 0xbd, 0x79, + 0xea, 0x85, 0xbd, 0x4a, 0xf9, 0xb6, 0xbc, 0x4f, 0x1c, 0x34, 0xbc, 0x2e, 0x3e, + 0x31, 0x3d, 0xe3, 0x63, 0x5e, 0xbd, 0x63, 0xf1, 0xaf, 0x3d, 0x4e, 0xee, 0xaa, + 0x3d, 0x91, 0xc0, 0xcc, 0xbc, 0xc3, 0x43, 0xb2, 0xbc, 0xab, 0x9d, 0x54, 0xbd, + 0x0b, 0x92, 0xa3, 0xbc, 0xc5, 0xe0, 0xf6, 0x3d, 0xb5, 0x2d, 0x52, 0xbd, 0x89, + 0x8d, 0xf0, 0xbd, 0xd4, 0x40, 0x0c, 0xbe, 0x88, 0xf8, 0xaa, 0x3d, 0xc6, 0x0d, + 0x10, 0x3d, 0xe0, 0x7d, 0xcb, 0xbc, 0x14, 0x58, 0xba, 0x3a, 0x11, 0x9d, 0x24, + 0xbd, 0x14, 0x54, 0x03, 0x3b, 0x2c, 0xb4, 0x7d, 0x3c, 0x5a, 0x71, 0x99, 0xbd, + 0x5d, 0xa3, 0xa3, 0xbd, 0xfc, 0xd0, 0xe5, 0x39, 0x4a, 0x6c, 0xf8, 0xbd, 0x81, + 0x0e, 0xab, 0x3d, 0x0d, 0x40, 0x9a, 0x3d, 0x89, 0xff, 0x07, 0x3d, 0xd4, 0x8c, + 0x97, 0x3b, 0x8a, 0x7a, 0xc5, 0x3c, 0xbb, 0xbf, 0xe3, 0x3a, 0xcb, 0x47, 0x41, + 0x3d, 0x80, 0x8d, 0x29, 0x3d, 0x16, 0xe7, 0xf6, 0xbc, 0x01, 0x5f, 0xc0, 0x3d, + 0xf1, 0x20, 0xe3, 0xbc, 0xec, 0x9f, 0x29, 0x3e, 0x8f, 0x46, 0x8d, 0x3d, 0x20, + 0x99, 0xe9, 0x3c, 0x90, 0x04, 0x00, 0x3e, 0x35, 0xda, 0xba, 0xbd, 0x6c, 0xc5, + 0x5b, 0x3d, 0x9a, 0x42, 0x41, 0xbd, 0x1a, 0x84, 0x6f, 0x3d, 0x94, 0xc4, 0x0c, + 0xbd, 0x08, 0x43, 0x8a, 0x3d, 0xd8, 0xdb, 0xa4, 0x3d, 0xac, 0xc6, 0xa8, 0x3d, + 0xa5, 0xf4, 0xff, 0xb9, 0xdc, 0x01, 0x58, 0xbc, 0x43, 0x37, 0xf0, 0x3d, 0xed, + 0x73, 0x3b, 0xbd, 0x8d, 0x1f, 0x00, 0x3c, 0x4c, 0x89, 0x71, 0x3d, 0xb0, 0xbf, + 0x4e, 0x3d, 0x1e, 0x61, 0x83, 0xbd, 0x82, 0xf6, 0x02, 0xbe, 0x3c, 0x97, 0xf9, + 0x3d, 0x06, 0x96, 0x97, 0x3d, 0x5c, 0x13, 0xd7, 0xbd, 0xce, 0x77, 0x88, 0xbd, + 0x26, 0x76, 0xba, 0x3c, 0x46, 0x28, 0xc4, 0x3d, 0x35, 0x72, 0x8d, 0x3c, 0x3e, + 0x63, 0x81, 0xbd, 0x06, 0x13, 0x9b, 0x3d, 0xf9, 0x80, 0x20, 0x3d, 0x9c, 0xfb, + 0x94, 0x3c, 0x50, 0x2c, 0x16, 0xbd, 0xdb, 0x7d, 0x59, 0xbd, 0x7a, 0xa8, 0x8d, + 0x3d, 0x8b, 0x56, 0x94, 0xbd, 0xa5, 0x49, 0x8b, 0x3d, 0x76, 0xae, 0x99, 0xbc, + 0x6e, 0x40, 0x84, 0x3d, 0xe0, 0x5a, 0x40, 0xbd, 0x33, 0xb8, 0x0b, 0xbd, 0x96, + 0x14, 0x25, 0x3c, 0x3e, 0x5c, 0x78, 0xbd, 0x31, 0x40, 0x06, 0x3e, 0x05, 0x0b, + 0xb7, 0x3c, 0x24, 0x3e, 0xe5, 0xbd, 0x94, 0x06, 
0x12, 0x3d, 0x14, 0x07, 0x96, + 0xbd, 0x14, 0x1d, 0x80, 0xbd, 0xfc, 0xd3, 0x66, 0xbd, 0xfa, 0xef, 0x67, 0x3d, + 0x62, 0x1e, 0x9f, 0x3c, 0x27, 0x05, 0x2a, 0xbc, 0xbb, 0x0b, 0xa2, 0x3d, 0x07, + 0x02, 0xaf, 0x3d, 0xcb, 0x9d, 0xc9, 0x3d, 0xbe, 0x5c, 0x15, 0x3b, 0x73, 0xc6, + 0x92, 0xbd, 0x70, 0x29, 0xe4, 0x3d, 0x46, 0xa2, 0xb2, 0xbc, 0x56, 0xb8, 0xe1, + 0x3d, 0x82, 0xf9, 0x0d, 0xbd, 0x9b, 0x59, 0xa8, 0xbd, 0x42, 0x59, 0x98, 0x3d, + 0xae, 0x31, 0x22, 0xbd, 0x0d, 0xa2, 0x1f, 0x3e, 0xc8, 0xfd, 0x58, 0xbc, 0x4e, + 0xd4, 0xca, 0x3d, 0xbd, 0x39, 0x81, 0xbd, 0x7c, 0x0a, 0x25, 0x3e, 0xdb, 0x88, + 0x7f, 0x3c, 0xf1, 0x64, 0x07, 0x3e, 0xd2, 0x99, 0x1d, 0x3d, 0x2c, 0xc9, 0xb0, + 0xbd, 0x7a, 0xe0, 0x9d, 0xbc, 0x9e, 0x93, 0x19, 0x3d, 0x7f, 0xfd, 0xd2, 0xbc, + 0xec, 0x44, 0xd5, 0x3d, 0x69, 0x81, 0xbf, 0x3d, 0x9e, 0xff, 0xac, 0x3c, 0x60, + 0x6b, 0x6a, 0xbd, 0xe6, 0x22, 0x48, 0xbd, 0x3b, 0xc4, 0xa3, 0xbd, 0x0c, 0xd3, + 0xf5, 0x3c, 0x08, 0x03, 0x62, 0x3c, 0x5c, 0x46, 0x16, 0x3e, 0xd3, 0x2a, 0xce, + 0x3c, 0xfc, 0x31, 0xa8, 0x3d, 0xbd, 0x02, 0x95, 0x3c, 0xe8, 0xc7, 0x7a, 0x3c, + 0xff, 0xc5, 0xf8, 0x3c, 0x3a, 0xb0, 0x79, 0x3b, 0xe6, 0xfd, 0x37, 0xbd, 0x5e, + 0xd3, 0x06, 0x3e, 0x21, 0x21, 0xe8, 0x3c, 0xa1, 0x6f, 0xf1, 0x3d, 0xa6, 0xc2, + 0x54, 0x3d, 0x9c, 0xae, 0x9c, 0x3d, 0xcb, 0xfd, 0x0a, 0x3c, 0x3e, 0x2e, 0x00, + 0xbd, 0xdc, 0xf2, 0x4b, 0xbd, 0x7a, 0xdf, 0xbd, 0x3d, 0xbd, 0x27, 0x8b, 0x3c, + 0x1c, 0x12, 0x2d, 0xbd, 0xf9, 0xf3, 0x28, 0x3e, 0x4c, 0x90, 0xb3, 0xbd, 0x49, + 0xfc, 0x84, 0x3d, 0x2e, 0xc1, 0x82, 0x3d, 0x54, 0xc7, 0x62, 0x3d, 0xcb, 0x24, + 0xf9, 0x3d, 0xf4, 0x6a, 0x2b, 0x3c, 0x38, 0x27, 0x1c, 0xbd, 0x05, 0xf1, 0xf5, + 0x3d, 0xc0, 0x87, 0xa2, 0x3d, 0x7e, 0x5c, 0x92, 0x3d, 0xef, 0x33, 0xad, 0x3d, + 0x34, 0xff, 0x43, 0x3d, 0x87, 0x47, 0xc6, 0x3d, 0x58, 0x18, 0x76, 0xbd, 0x1d, + 0x74, 0x9e, 0x3d, 0xae, 0x41, 0xb1, 0xbc, 0x7d, 0x42, 0x94, 0xbd, 0x37, 0x01, + 0x66, 0x3d, 0xb4, 0x18, 0x96, 0xbd, 0x69, 0x31, 0xc4, 0x3c, 0xe7, 0x09, 0x00, + 0xbe, 0x46, 0x1a, 0x2b, 0xbd, 0x76, 0xd4, 0x7b, 0xbd, 0x48, 0xcd, 0xfc, 0x3b, + 0xf9, 0x98, 0xf6, 0xbc, 0x33, 0x91, 0x2c, 0xbe, 0xe1, 0x08, 0xf5, 0xbd, 0xb0, + 0xcd, 0x79, 0x3d, 0xd3, 0x1d, 0x0f, 0x3e, 0x5a, 0x9f, 0x13, 0xbd, 0x7d, 0x6b, + 0x44, 0x3c, 0xcf, 0x14, 0x38, 0x3d, 0xe3, 0xfb, 0x47, 0x3d, 0x37, 0x1e, 0x2f, + 0x3c, 0x89, 0xa0, 0xb2, 0xbd, 0x89, 0x21, 0x81, 0xbd, 0x04, 0xda, 0xc5, 0x3d, + 0xa7, 0xa8, 0x16, 0xbc, 0x07, 0x2e, 0xc1, 0xbb, 0x8c, 0x6f, 0xc2, 0x3c, 0x3b, + 0x0c, 0x03, 0xbd, 0x74, 0xc2, 0xa5, 0x3d, 0x3f, 0xeb, 0xb2, 0xbd, 0x2f, 0x66, + 0x94, 0xbd, 0x4f, 0x30, 0xab, 0xbd, 0xc4, 0xdd, 0x45, 0x3d, 0x4a, 0xb7, 0x48, + 0x3d, 0x55, 0x77, 0x26, 0x3e, 0xbe, 0x1c, 0x96, 0xbb, 0x5b, 0xca, 0x62, 0xbd, + 0xcf, 0x1e, 0xd3, 0x3c, 0xa7, 0x0e, 0xb9, 0xbd, 0x67, 0x75, 0x2b, 0xbd, 0x26, + 0x12, 0xd5, 0xbc, 0xb6, 0x0f, 0xc0, 0xbd, 0x12, 0xab, 0x23, 0x3d, 0xf6, 0x23, + 0xb2, 0x3d, 0x3f, 0x71, 0x83, 0x3d, 0x2a, 0x08, 0x95, 0xbc, 0xd8, 0x6e, 0xdc, + 0xbd, 0x1c, 0x85, 0xa6, 0xbd, 0xc4, 0xbc, 0x52, 0xbd, 0xa8, 0xe0, 0x9c, 0x3d, + 0xf8, 0xa9, 0xe5, 0x3d, 0xfe, 0xbd, 0x9c, 0x3d, 0x9d, 0x62, 0xc3, 0x3c, 0xe6, + 0x95, 0xd6, 0xbc, 0x08, 0x07, 0x68, 0xbc, 0x99, 0x7b, 0xe4, 0xbd, 0xcf, 0x18, + 0xb0, 0x3d, 0xdb, 0x65, 0x8e, 0xbd, 0x47, 0x34, 0xa9, 0xbd, 0x65, 0xab, 0x0a, + 0xbe, 0xb3, 0x57, 0x24, 0xbe, 0x1f, 0xce, 0xa2, 0xbc, 0xd2, 0x8a, 0xb7, 0xbc, + 0x1e, 0xd4, 0x53, 0x3d, 0xec, 0x02, 0x14, 0xbd, 0xd7, 0xc2, 0x05, 0x3d, 0x05, + 0xe3, 0xcb, 0xbc, 0x18, 0xc7, 0x9d, 0x3d, 0x99, 0x69, 0x0a, 0xbe, 0xee, 0x58, + 0xa1, 0x3d, 0xae, 0xa3, 0x36, 0xbe, 0x5c, 0x5d, 0x9c, 0xbd, 0x39, 0xfb, 0x00, + 
0xbd, 0x38, 0xcd, 0x70, 0xbd, 0x2f, 0x77, 0xf2, 0xbd, 0x8a, 0x7d, 0x74, 0xbd, + 0x4b, 0x08, 0x7b, 0xbd, 0x42, 0xaf, 0x4a, 0xba, 0x56, 0x2e, 0x80, 0xbd, 0x81, + 0x9b, 0xb9, 0x3d, 0xf0, 0x6d, 0x86, 0x3c, 0xfe, 0x53, 0x82, 0xbd, 0xb8, 0xac, + 0x56, 0xbd, 0xf7, 0xc9, 0x14, 0x3d, 0xea, 0xe6, 0x1f, 0xbd, 0x9f, 0x23, 0xd0, + 0xbd, 0x73, 0xd5, 0x6a, 0x3d, 0x24, 0xdb, 0xba, 0xbd, 0xf5, 0xf1, 0xda, 0xbc, + 0xe6, 0x8b, 0x34, 0xbd, 0x6c, 0x15, 0x8a, 0x3c, 0x26, 0x05, 0x63, 0x3d, 0x27, + 0xc2, 0x8b, 0xbd, 0x62, 0xb2, 0x83, 0x3d, 0x71, 0x11, 0x50, 0xbc, 0x67, 0x3d, + 0xe4, 0x3d, 0xa5, 0x3d, 0x59, 0xbd, 0x18, 0xa4, 0x70, 0x3c, 0x6b, 0x86, 0x9c, + 0x3d, 0xa6, 0xe4, 0xbf, 0x3d, 0x3a, 0x8f, 0xe2, 0xbd, 0xd7, 0xf8, 0x71, 0x3d, + 0x1d, 0x46, 0x00, 0xbd, 0x3c, 0x59, 0xc0, 0xbc, 0x1f, 0x60, 0x50, 0xbd, 0x91, + 0xe2, 0xe6, 0xbd, 0x4c, 0x72, 0xb6, 0xbd, 0x49, 0x1e, 0xba, 0x3d, 0xdd, 0x1e, + 0x77, 0xbc, 0x35, 0x26, 0xab, 0x3c, 0x63, 0x83, 0xd7, 0xbd, 0x41, 0x6f, 0xa8, + 0x3d, 0x6d, 0xf0, 0x50, 0xbd, 0xdc, 0x5f, 0x2f, 0xbd, 0x73, 0x67, 0xce, 0xbc, + 0x10, 0x47, 0x0b, 0xbd, 0xdc, 0x85, 0x41, 0x3c, 0xcd, 0x61, 0xc9, 0xbd, 0x9d, + 0x79, 0x77, 0x3d, 0xbd, 0xe5, 0xb5, 0xbd, 0xa4, 0x88, 0xf7, 0xbd, 0x43, 0xf7, + 0x5e, 0x3b, 0x95, 0x23, 0x26, 0xbd, 0x39, 0x1e, 0xa7, 0x3d, 0x60, 0xd5, 0x2e, + 0xbd, 0x78, 0xa7, 0x1b, 0x3d, 0xad, 0x5b, 0xcd, 0x3d, 0x73, 0xba, 0x9d, 0xbd, + 0xb7, 0xe0, 0x91, 0x3d, 0xa7, 0x90, 0x8e, 0x3d, 0x12, 0x0d, 0x11, 0x3d, 0x6d, + 0xf8, 0x9b, 0xbd, 0x7d, 0xd4, 0xdf, 0x3d, 0x67, 0x4c, 0xa3, 0x3d, 0x21, 0x33, + 0x88, 0xbc, 0xc8, 0xd2, 0xc7, 0xbd, 0x93, 0xea, 0x80, 0xbd, 0x4d, 0xe7, 0x42, + 0xbd, 0x0b, 0x43, 0xfb, 0xbc, 0xb0, 0x8c, 0x7f, 0xbc, 0x16, 0x83, 0xc3, 0x3d, + 0x42, 0xd0, 0x86, 0xbd, 0x7f, 0x6f, 0xa6, 0x3d, 0xed, 0xee, 0x4c, 0x3d, 0xc9, + 0x3e, 0x03, 0x3d, 0x72, 0x47, 0x9e, 0xbd, 0x2f, 0x66, 0xda, 0x3d, 0x3d, 0x45, + 0x80, 0x3b, 0x3c, 0xab, 0xa6, 0xbd, 0x73, 0xe8, 0x9f, 0xbd, 0xf6, 0x76, 0xc2, + 0xbd, 0x18, 0xaf, 0xb4, 0x3d, 0x94, 0x94, 0x9f, 0xbd, 0x46, 0xcd, 0xad, 0xbd, + 0xdb, 0xe6, 0x87, 0xbd, 0x67, 0x03, 0x07, 0x3d, 0x05, 0xc2, 0x84, 0xbc, 0xb7, + 0x1f, 0x8d, 0xbd, 0x19, 0x72, 0xa1, 0x3d, 0xd8, 0xa5, 0x52, 0x3d, 0x63, 0x90, + 0x03, 0xbd, 0xf5, 0xe3, 0xcd, 0x3d, 0xd8, 0xfb, 0x9c, 0x3d, 0x74, 0xd7, 0x06, + 0xbd, 0x8c, 0xb5, 0xdd, 0xbd, 0x20, 0x07, 0xba, 0xbd, 0x83, 0xa1, 0xd2, 0x3d, + 0x4c, 0x58, 0xe3, 0x3d, 0x31, 0x7d, 0xe1, 0xbd, 0x29, 0x06, 0xa1, 0xbd, 0x64, + 0xa9, 0x2e, 0xbd, 0x79, 0x6c, 0xb5, 0xbd, 0x8f, 0xe5, 0xac, 0x3d, 0x68, 0xc1, + 0xc3, 0x3c, 0xd5, 0xa7, 0xf2, 0xbd, 0x2e, 0x24, 0x40, 0xbd, 0xd6, 0x39, 0xe7, + 0x3d, 0xe0, 0xaf, 0x02, 0xbd, 0xe1, 0xd6, 0xe1, 0xbd, 0xfa, 0xa0, 0x25, 0x3d, + 0x26, 0xe8, 0x57, 0x3d, 0xa5, 0x58, 0xf6, 0xbd, 0xd2, 0x32, 0x0f, 0xbd, 0x8e, + 0xa1, 0x8d, 0x3c, 0xb6, 0x98, 0xce, 0xbc, 0x71, 0x96, 0xfa, 0xbc, 0xe2, 0x69, + 0x35, 0x3c, 0x3d, 0x07, 0x21, 0x3d, 0xc1, 0x9f, 0x8a, 0x3d, 0x0a, 0x9e, 0x64, + 0xbd, 0x3b, 0x91, 0x57, 0xbb, 0x99, 0x41, 0x8c, 0x3d, 0xcf, 0x60, 0x8f, 0xbd, + 0x5e, 0xe6, 0x25, 0xbd, 0xec, 0x60, 0xb0, 0xbd, 0xcf, 0xd7, 0x87, 0x3d, 0x1a, + 0x3f, 0x4e, 0xbd, 0xd7, 0xbf, 0x78, 0xbd, 0xe3, 0x77, 0xd9, 0x3d, 0x81, 0xd8, + 0x81, 0xbd, 0x52, 0x2a, 0xd3, 0x3d, 0xc1, 0x32, 0x80, 0xbd, 0xaa, 0xbf, 0x9d, + 0x3d, 0xbf, 0x21, 0x3b, 0x3d, 0x30, 0x5e, 0x9e, 0xbd, 0xfa, 0xf3, 0xda, 0xbc, + 0x41, 0xeb, 0x9c, 0xbd, 0x71, 0x88, 0xd3, 0xbc, 0xf1, 0x4c, 0x00, 0xbd, 0x38, + 0xd5, 0x2f, 0x3c, 0xcd, 0xd9, 0x3e, 0x3d, 0xf4, 0xf8, 0xa4, 0x3d, 0xbc, 0x2f, + 0x0e, 0xbd, 0x28, 0x35, 0x34, 0x3d, 0x3a, 0x20, 0x5c, 0x3d, 0x97, 0x22, 0xdb, + 0xbd, 0x75, 0xd3, 0x5f, 0xbd, 
0xf9, 0x3b, 0x66, 0xbd, 0x4a, 0x18, 0xe7, 0xbb, + 0x4e, 0x21, 0x5d, 0xbd, 0x9c, 0x6c, 0x45, 0xbd, 0x2c, 0xb8, 0xe7, 0x3c, 0x65, + 0xbf, 0x45, 0x3d, 0x15, 0xbb, 0xa5, 0xbd, 0x7e, 0x1c, 0xba, 0xbd, 0xfa, 0x2d, + 0xfc, 0x3c, 0xc2, 0xfb, 0x20, 0xbd, 0x62, 0xc3, 0xa6, 0xbd, 0xae, 0x66, 0xc1, + 0x3b, 0x8e, 0x5e, 0x29, 0xbd, 0x1a, 0x5d, 0x27, 0xbd, 0xce, 0x36, 0xaf, 0xbd, + 0x6d, 0x03, 0xdd, 0x3d, 0xb5, 0x5d, 0x95, 0x3c, 0xd2, 0x9d, 0x60, 0xbd, 0xf0, + 0xb5, 0x60, 0xbc, 0x80, 0x21, 0x34, 0xbd, 0xf1, 0x05, 0xc8, 0x3b, 0x2c, 0x2a, + 0x2f, 0x3e, 0x99, 0x23, 0x3c, 0x3d, 0x73, 0x2f, 0xe4, 0x3d, 0xc8, 0x22, 0xce, + 0x3d, 0xbf, 0x98, 0xad, 0xbd, 0xa5, 0xb2, 0xd4, 0xbd, 0x6d, 0xca, 0x3b, 0xbe, + 0xd1, 0xa0, 0x95, 0x3c, 0xa0, 0xed, 0xe1, 0x3b, 0x8c, 0x5d, 0x6f, 0x3d, 0x10, + 0x04, 0x88, 0xbd, 0x76, 0x62, 0xe7, 0x3d, 0x53, 0x28, 0x8c, 0xbd, 0x7b, 0x4f, + 0x5d, 0xbd, 0x2e, 0x69, 0x8b, 0x3c, 0xe7, 0x7f, 0x79, 0x3c, 0x2e, 0xe5, 0xbf, + 0x3c, 0x56, 0x90, 0xf6, 0xbc, 0x8a, 0xc6, 0x3b, 0x3d, 0x86, 0xbf, 0xb8, 0xbd, + 0xe6, 0xf7, 0xd7, 0xbc, 0xc5, 0x96, 0xcb, 0x3d, 0x48, 0xe0, 0x9a, 0xbd, 0xd8, + 0xe1, 0x45, 0xbd, 0xa7, 0x00, 0xd7, 0xbd, 0xda, 0x57, 0x1c, 0xbc, 0x8e, 0x49, + 0x40, 0x3d, 0x8b, 0x52, 0x0a, 0x3d, 0xe2, 0xe8, 0x1b, 0xbd, 0x74, 0xd1, 0x0f, + 0x3e, 0x17, 0x20, 0xc1, 0x3d, 0x3a, 0xbe, 0x8a, 0xbd, 0xa4, 0xd5, 0xca, 0x3c, + 0x4f, 0x17, 0x82, 0xbc, 0x1f, 0xea, 0x09, 0xbd, 0x8e, 0xcb, 0xd0, 0x3d, 0x9c, + 0x1a, 0x36, 0xbd, 0x99, 0xee, 0x5b, 0xbd, 0x5c, 0x1d, 0x10, 0xbe, 0x9e, 0x99, + 0x22, 0x3d, 0x8f, 0x8f, 0xda, 0x3c, 0x42, 0xa7, 0x2e, 0x3d, 0x37, 0x33, 0x03, + 0xbe, 0x11, 0x7b, 0x8f, 0xbd, 0xb8, 0xa1, 0x7e, 0x3d, 0x31, 0x04, 0x62, 0x3d, + 0x93, 0x03, 0xfe, 0x3b, 0x59, 0x82, 0xa0, 0xbd, 0x07, 0xb8, 0x24, 0x3d, 0x7a, + 0x45, 0xf2, 0x3d, 0xab, 0xf4, 0xd7, 0xbd, 0x2f, 0xbd, 0xc6, 0x3d, 0xb2, 0x1c, + 0x47, 0x3d, 0xbe, 0xf6, 0xb2, 0x3d, 0xe2, 0xd0, 0x92, 0xbd, 0x0d, 0xec, 0xb2, + 0xbd, 0x40, 0x5c, 0xc0, 0xbd, 0xa8, 0xf7, 0x0e, 0x3c, 0xef, 0x56, 0xb1, 0xbd, + 0x91, 0x09, 0x4f, 0xbd, 0x47, 0x51, 0xcc, 0x3d, 0xcd, 0x6d, 0x85, 0xbd, 0xfe, + 0xb2, 0x6f, 0xbd, 0x3f, 0x9b, 0xec, 0x3c, 0x64, 0x20, 0x98, 0xbb, 0x82, 0x78, + 0x09, 0x3d, 0x2f, 0xbf, 0xe7, 0xbc, 0x5d, 0x5e, 0x01, 0xbd, 0x0c, 0xca, 0x4b, + 0x3d, 0xf2, 0xa2, 0x89, 0xbd, 0xa6, 0x59, 0x54, 0x3d, 0x62, 0x46, 0x04, 0x3c, + 0x99, 0x2f, 0x48, 0xbd, 0x22, 0x21, 0x1b, 0xbd, 0x07, 0x3b, 0xb4, 0xbd, 0x88, + 0x42, 0x0a, 0x3e, 0x7e, 0x29, 0xc3, 0xbb, 0xab, 0x7a, 0x86, 0x3d, 0xe7, 0x26, + 0xc0, 0x3c, 0xac, 0x99, 0x0f, 0xbd, 0x6e, 0xdb, 0x74, 0x3d, 0xba, 0x02, 0xdb, + 0x3d, 0x3c, 0x38, 0xae, 0x3d, 0xdf, 0x34, 0xe1, 0xbd, 0x53, 0xa6, 0x26, 0xbe, + 0x26, 0xa7, 0x82, 0x3d, 0x7b, 0x0f, 0x03, 0xbe, 0x85, 0xb6, 0xaa, 0xbc, 0xc5, + 0x08, 0xbf, 0x3c, 0x4f, 0xd1, 0xa8, 0xbb, 0x9f, 0x58, 0xa6, 0x3c, 0x51, 0xdc, + 0xfb, 0x3d, 0x2e, 0x30, 0xab, 0xbd, 0x38, 0x19, 0x19, 0x3c, 0xa2, 0x6a, 0x7c, + 0x3d, 0x1d, 0x52, 0xd5, 0xbc, 0x15, 0x5f, 0xb3, 0x3b, 0x9b, 0xd8, 0x75, 0xbd, + 0x5f, 0xa1, 0x13, 0xbd, 0xdc, 0xc7, 0xfd, 0xbb, 0x44, 0x9b, 0x73, 0xbd, 0x41, + 0x1d, 0x82, 0xbd, 0xa7, 0x0b, 0x15, 0x3c, 0x87, 0x91, 0x80, 0x3c, 0x74, 0x55, + 0xab, 0xbd, 0xf4, 0xb6, 0x3d, 0x3b, 0xa7, 0x2c, 0xcd, 0xbd, 0x19, 0xa5, 0x96, + 0xbc, 0xea, 0x8f, 0xfa, 0x3d, 0x98, 0x47, 0x12, 0xbd, 0xfc, 0x40, 0x62, 0x3d, + 0x72, 0x61, 0xa0, 0xbd, 0x79, 0x4d, 0x71, 0x3d, 0x2f, 0x4a, 0x89, 0x3d, 0xb8, + 0xdc, 0x98, 0x3d, 0x66, 0x46, 0x6f, 0x3d, 0xa2, 0xf2, 0x0d, 0x3d, 0x36, 0xf5, + 0xd4, 0x3c, 0xb9, 0xe5, 0x88, 0x3d, 0xa4, 0x93, 0x05, 0x3e, 0x64, 0x7e, 0x18, + 0xbe, 0xb6, 0x47, 0x76, 0x3d, 0x8e, 0x31, 0xca, 0x3d, 0x2f, 
0x72, 0xf3, 0x3d, + 0x73, 0x45, 0x0d, 0x3e, 0xf4, 0x52, 0xfa, 0xbc, 0x40, 0x37, 0x88, 0xbd, 0x44, + 0x13, 0xae, 0xbc, 0x25, 0x7e, 0x0a, 0xbd, 0xbe, 0x26, 0x45, 0xbd, 0x2c, 0xf1, + 0x37, 0x3d, 0x29, 0xbd, 0x9f, 0xbd, 0xcb, 0xff, 0x1c, 0xbd, 0x62, 0xf2, 0xa0, + 0xba, 0x20, 0x57, 0xa8, 0xbc, 0xaa, 0xc1, 0x9c, 0xbd, 0xfb, 0xd0, 0x3b, 0x3d, + 0xe2, 0xae, 0x3f, 0x3d, 0x41, 0x4d, 0x93, 0x3d, 0x28, 0x11, 0xcc, 0x3d, 0x52, + 0x6e, 0x06, 0x3e, 0x8f, 0x9b, 0xc0, 0x3d, 0x40, 0xb0, 0xa4, 0xbc, 0xb0, 0x45, + 0x86, 0x3d, 0xc9, 0x85, 0x40, 0xbd, 0xfa, 0xdb, 0xe3, 0xbd, 0xf3, 0x0e, 0x9b, + 0x3d, 0x48, 0x39, 0x03, 0xbe, 0xc4, 0xfc, 0x2f, 0xbd, 0xb9, 0xbf, 0xbe, 0x3d, + 0xd9, 0x2f, 0x11, 0xbd, 0x71, 0x6a, 0x75, 0x3c, 0x89, 0x2b, 0xc2, 0xbd, 0x21, + 0x82, 0xd4, 0xbd, 0x36, 0xcc, 0xf5, 0x3d, 0xa3, 0x91, 0x3d, 0x3d, 0x16, 0xd1, + 0x7d, 0xbd, 0x40, 0xba, 0x75, 0x3b, 0x5a, 0x82, 0xfa, 0x3d, 0xc1, 0x09, 0xaf, + 0x3d, 0x1e, 0x44, 0xa3, 0x3d, 0xd7, 0x2a, 0x37, 0xbd, 0xd9, 0x72, 0xcc, 0x3d, + 0x58, 0x58, 0x9a, 0xbd, 0xea, 0x90, 0x35, 0xbc, 0x0e, 0x69, 0x92, 0x3c, 0x68, + 0x7e, 0x5c, 0xbc, 0x0a, 0xba, 0x55, 0x3d, 0x7e, 0xd4, 0xb9, 0x3b, 0x45, 0x5b, + 0xe7, 0xbd, 0x6b, 0xe6, 0xd5, 0xbc, 0xbc, 0x3e, 0x14, 0xbd, 0xe8, 0xb5, 0x09, + 0x3d, 0xbd, 0xde, 0xaf, 0x3d, 0xcf, 0x2d, 0x94, 0xbd, 0x12, 0x0f, 0xac, 0x3d, + 0x21, 0x99, 0xc2, 0xbd, 0x45, 0x93, 0x0d, 0x3d, 0x8a, 0x1e, 0xe4, 0x3d, 0xe8, + 0xfe, 0xb2, 0x3d, 0x0e, 0x69, 0xb8, 0xbd, 0xab, 0x2a, 0x91, 0xbc, 0x02, 0x24, + 0x8f, 0xbd, 0xef, 0x96, 0xa7, 0x3b, 0x39, 0x39, 0xda, 0xbd, 0x31, 0x03, 0xcd, + 0x3d, 0xe5, 0xf7, 0x4c, 0x3c, 0xca, 0x45, 0x3f, 0x3c, 0xb4, 0xf6, 0x8c, 0xbd, + 0x4a, 0x36, 0x4f, 0x3c, 0x5c, 0xe7, 0x56, 0x3d, 0xe3, 0x81, 0xd6, 0xbd, 0x44, + 0x9d, 0x3d, 0xbd, 0xb2, 0xf5, 0xe2, 0x3d, 0xaa, 0xd0, 0xff, 0xbc, 0x49, 0x86, + 0x4b, 0x3d, 0x79, 0x40, 0x51, 0xbd, 0x60, 0xd2, 0x91, 0xbd, 0x9d, 0x61, 0x26, + 0xbe, 0x32, 0x82, 0xe5, 0x3d, 0xa3, 0x28, 0xc5, 0xbc, 0x3f, 0x02, 0x08, 0xbd, + 0x9b, 0xe8, 0xca, 0x3d, 0xb4, 0x34, 0xed, 0x3c, 0x48, 0x7f, 0xea, 0x3d, 0xd6, + 0x07, 0xa1, 0xbd, 0xf9, 0xad, 0x18, 0x3c, 0xba, 0x0d, 0x8b, 0x3d, 0xa6, 0x13, + 0x0f, 0x3e, 0x25, 0xfc, 0x99, 0x3c, 0xc4, 0x8e, 0xc1, 0x3c, 0xfe, 0xa2, 0x14, + 0x3d, 0x0f, 0x96, 0xd5, 0xbc, 0x21, 0x99, 0xbb, 0xbc, 0xd7, 0x9c, 0xd1, 0x3d, + 0x14, 0xd2, 0xa2, 0x3d, 0x8b, 0x64, 0xd9, 0xbd, 0x11, 0x36, 0xa2, 0x3c, 0xec, + 0xbe, 0x24, 0xbd, 0x9f, 0x0f, 0x2a, 0x3d, 0x9d, 0xd5, 0xa6, 0xbd, 0xba, 0xe4, + 0x83, 0xbd, 0xc1, 0xce, 0x45, 0xbd, 0x4a, 0x99, 0x8c, 0xbd, 0xa0, 0x8d, 0x99, + 0x3b, 0xf1, 0x4b, 0x7a, 0xbc, 0x9d, 0x76, 0xd1, 0xbd, 0x65, 0x96, 0xd5, 0x3d, + 0x65, 0xd5, 0x0a, 0xbd, 0x03, 0xb9, 0x60, 0x3c, 0xbe, 0xb3, 0x0e, 0xbe, 0xf3, + 0x86, 0xf3, 0x3d, 0x28, 0xc1, 0x0f, 0x3d, 0x88, 0x69, 0xc0, 0xbc, 0x0e, 0x06, + 0x7e, 0x3d, 0x42, 0x82, 0xa5, 0x3d, 0x28, 0x95, 0x1b, 0x3d, 0xb7, 0x6d, 0xac, + 0xbd, 0xe0, 0xc9, 0x14, 0xbd, 0x5c, 0xf4, 0xb3, 0x3d, 0x74, 0x9e, 0xd4, 0xbd, + 0x8d, 0x9a, 0xed, 0x3c, 0x9c, 0xe3, 0x01, 0x3d, 0x08, 0x0d, 0xc5, 0xbd, 0xc5, + 0xba, 0xa7, 0xbd, 0xf2, 0xf8, 0x30, 0x3c, 0x41, 0x3c, 0xa8, 0x3d, 0x15, 0x63, + 0x60, 0xbd, 0x31, 0x27, 0xc6, 0xbc, 0x61, 0x0f, 0xe8, 0xbd, 0xcf, 0x0c, 0xbb, + 0xbc, 0xf5, 0x06, 0xbd, 0x3d, 0x99, 0x20, 0xb4, 0x3c, 0x5c, 0x27, 0x2d, 0xbd, + 0x5f, 0x29, 0x4b, 0xbd, 0xe6, 0x17, 0xef, 0x3d, 0x9c, 0x60, 0x84, 0xbd, 0x6a, + 0x76, 0xce, 0x3d, 0xf7, 0x48, 0x92, 0x3d, 0x6a, 0x72, 0xa3, 0x3d, 0x07, 0x7e, + 0x04, 0x3e, 0x71, 0x2a, 0xa8, 0x3d, 0x9a, 0x94, 0x74, 0x3d, 0x78, 0x1b, 0xf6, + 0x3d, 0x98, 0x1e, 0xfd, 0xbc, 0x3a, 0xf5, 0xc4, 0x39, 0x5f, 0x45, 0xc6, 0x3d, + 0x14, 0xc4, 
0x8b, 0x3d, 0xea, 0x0c, 0x16, 0xbd, 0x43, 0x08, 0x98, 0x3c, 0x42, + 0x6d, 0x04, 0x3d, 0x8f, 0x4f, 0xc5, 0xbd, 0x88, 0x9e, 0x35, 0xbd, 0xfd, 0x1d, + 0xfc, 0xbc, 0x82, 0x9f, 0xa5, 0x3c, 0xfe, 0xe2, 0x30, 0xbc, 0x6a, 0x80, 0xf1, + 0x3c, 0xc0, 0x61, 0x39, 0x3d, 0xcd, 0x81, 0x08, 0xbe, 0x6f, 0xa9, 0xa9, 0xbd, + 0x51, 0x50, 0x2b, 0xba, 0xaa, 0xd4, 0xa1, 0xbd, 0x13, 0x64, 0xdf, 0xbd, 0xa4, + 0xd4, 0x5c, 0xbc, 0x2d, 0x83, 0xad, 0xbd, 0xc3, 0x31, 0x07, 0x3d, 0x7d, 0x7a, + 0x97, 0xbc, 0xa7, 0x23, 0xf7, 0xbd, 0x61, 0x7f, 0xda, 0xbd, 0x1d, 0x39, 0xd4, + 0xbd, 0x0b, 0x50, 0x8f, 0xbc, 0xfc, 0xa2, 0x06, 0x3e, 0x7b, 0x0e, 0x90, 0x3d, + 0xf8, 0xa0, 0x9d, 0xbd, 0x25, 0x0f, 0x6d, 0x3d, 0xae, 0x7f, 0xb7, 0xbc, 0xe9, + 0x1f, 0x10, 0xbe, 0x5b, 0x7f, 0x52, 0xbd, 0xe5, 0x86, 0x0d, 0xbd, 0x03, 0x12, + 0x58, 0x3c, 0xee, 0x04, 0xaa, 0xbd, 0x08, 0x85, 0x0a, 0x3d, 0x73, 0x0b, 0x93, + 0xbd, 0x4c, 0x42, 0x0d, 0xbd, 0xe9, 0xa4, 0x7f, 0x3d, 0x3b, 0x8a, 0xa8, 0x3c, + 0xa6, 0x4d, 0x88, 0x3d, 0x44, 0xe9, 0x1e, 0x3c, 0x05, 0x39, 0xd0, 0x3d, 0x09, + 0xc4, 0xc7, 0x3b, 0xdb, 0x43, 0x88, 0xbd, 0xb2, 0x44, 0x9d, 0x3d, 0x00, 0x42, + 0x13, 0xbe, 0x25, 0x15, 0x9a, 0x3d, 0xee, 0x5d, 0x9d, 0x3d, 0x04, 0x63, 0x5b, + 0xbb, 0x67, 0x1c, 0x9e, 0x3d, 0xe1, 0x8e, 0xb4, 0x3d, 0x68, 0xae, 0x8c, 0x3d, + 0x1a, 0xdc, 0xac, 0x3d, 0xdb, 0x00, 0x86, 0x3d, 0x60, 0xb7, 0x07, 0xbd, 0x92, + 0x7c, 0xbc, 0xbd, 0x47, 0xb6, 0x8f, 0x3c, 0x16, 0x03, 0xc1, 0x3d, 0xbb, 0x65, + 0x94, 0x3d, 0x0c, 0x98, 0x05, 0xbd, 0xf1, 0xe1, 0xc2, 0x3d, 0xb5, 0xf2, 0x01, + 0xbe, 0xf2, 0xe0, 0x01, 0x3d, 0xb4, 0x4a, 0xa5, 0x3d, 0x7c, 0x67, 0x97, 0x3d, + 0xa4, 0xbe, 0x52, 0x3d, 0x17, 0x60, 0x1c, 0x3d, 0x95, 0x83, 0x5b, 0xbc, 0x33, + 0x59, 0xd3, 0xbd, 0x45, 0x05, 0xf7, 0xbd, 0xa5, 0x82, 0xbe, 0x3d, 0x91, 0xc4, + 0x46, 0x3d, 0x5c, 0x4b, 0x27, 0xb8, 0x32, 0xe3, 0xf9, 0x3c, 0xdf, 0xcb, 0xcc, + 0x3d, 0xc3, 0x94, 0x6f, 0xbd, 0x10, 0xa2, 0xec, 0x3d, 0x2e, 0xaf, 0x09, 0xbc, + 0x49, 0x91, 0x8d, 0x3d, 0x6e, 0xc8, 0xc5, 0xbc, 0x45, 0x0e, 0x66, 0xbc, 0x37, + 0xd6, 0xfd, 0xbc, 0x2a, 0xea, 0x81, 0xbd, 0xf7, 0xc2, 0xc2, 0x3d, 0x12, 0x27, + 0x6b, 0x3c, 0x97, 0x69, 0xf3, 0x3b, 0xc8, 0xb7, 0xa6, 0xbc, 0xd6, 0xdf, 0x96, + 0xbc, 0xe0, 0x8a, 0x1b, 0x3e, 0xe3, 0x34, 0xc5, 0x3c, 0x96, 0xcd, 0x12, 0xbe, + 0xcd, 0x75, 0x5a, 0x3c, 0x81, 0xd5, 0xd6, 0xbd, 0x2f, 0x97, 0x6e, 0xbd, 0x92, + 0x28, 0x45, 0xbc, 0x81, 0xaf, 0xce, 0x3d, 0xc3, 0x35, 0xd3, 0x3d, 0x97, 0x1f, + 0x99, 0x3c, 0x48, 0xb6, 0x5b, 0x3d, 0x98, 0x96, 0x9d, 0x3d, 0xed, 0x0a, 0xa3, + 0x3c, 0x5e, 0x72, 0xe5, 0xbb, 0xad, 0x65, 0xaa, 0xbd, 0x16, 0x57, 0x8c, 0xbd, + 0x4a, 0x37, 0x6b, 0xbd, 0x18, 0x35, 0xbe, 0xbd, 0xa8, 0xaa, 0x07, 0xbd, 0xbe, + 0xcb, 0xf5, 0xbb, 0xbe, 0x69, 0xad, 0x3c, 0x1f, 0x82, 0x54, 0x3d, 0x32, 0xbe, + 0x87, 0xbd, 0x67, 0x54, 0x41, 0x3d, 0x46, 0xb6, 0x2e, 0xbd, 0x04, 0xb2, 0x75, + 0x3c, 0xb8, 0xf0, 0xcd, 0xbc, 0x63, 0x01, 0x7f, 0x3d, 0x92, 0xb6, 0x84, 0xbd, + 0x43, 0x6b, 0xe0, 0x3d, 0x4a, 0xa8, 0xb3, 0x3c, 0x05, 0x93, 0x8f, 0xbd, 0xca, + 0xa0, 0x84, 0x3d, 0x84, 0x4b, 0x27, 0x3e, 0x68, 0xce, 0xe2, 0xbd, 0x30, 0x5d, + 0x22, 0x3d, 0xa3, 0x3c, 0xc0, 0x3d, 0xc3, 0xa5, 0x37, 0xbd, 0xc8, 0xb2, 0xa3, + 0x3d, 0x79, 0xee, 0x82, 0x3d, 0xc6, 0xb3, 0xab, 0x3a, 0x72, 0xa4, 0x65, 0xbb, + 0x5c, 0x20, 0xa7, 0x3d, 0xdd, 0xd9, 0xe5, 0xba, 0xbe, 0xcb, 0x9d, 0xbd, 0xdc, + 0x19, 0xc5, 0xbd, 0xa8, 0x93, 0xc8, 0x3d, 0x4d, 0x2f, 0x1a, 0x3d, 0x24, 0x73, + 0xa2, 0x3d, 0x11, 0xb1, 0x08, 0x3e, 0x8a, 0x27, 0xcf, 0x3d, 0xb6, 0xee, 0xab, + 0xbd, 0x1f, 0xd7, 0xe1, 0x3d, 0x5d, 0xcf, 0x5f, 0xbd, 0x8e, 0xa9, 0xb0, 0x3c, + 0x86, 0xb9, 0x31, 0x3d, 0xd7, 0xa8, 0x92, 
0xbd, 0x7f, 0x37, 0xd0, 0x3d, 0x4c, + 0xbb, 0xb6, 0x3d, 0xa4, 0x4d, 0x09, 0xbd, 0xc5, 0x8e, 0x0f, 0xbd, 0xbf, 0x27, + 0xa8, 0xbd, 0x62, 0x94, 0xb2, 0x3d, 0x2d, 0x35, 0xe8, 0x3d, 0xd5, 0x78, 0xee, + 0xbd, 0x2a, 0x5b, 0x5a, 0xbd, 0x72, 0x89, 0x4d, 0x3d, 0x7f, 0x5b, 0xfd, 0xb8, + 0x11, 0x80, 0x58, 0xbd, 0x69, 0xa9, 0xbc, 0xbc, 0xdb, 0xe9, 0xd3, 0xbc, 0x45, + 0x3b, 0xf5, 0xbc, 0xa6, 0x28, 0xc5, 0x3d, 0xe2, 0x48, 0x31, 0x3d, 0x49, 0xab, + 0x36, 0x3b, 0xca, 0xd2, 0xc6, 0xbc, 0x29, 0x1f, 0x5a, 0x3d, 0x90, 0xe6, 0x3b, + 0xbd, 0xf7, 0x5f, 0xa0, 0x3d, 0xb7, 0xc1, 0x91, 0x3d, 0x18, 0xcc, 0xc4, 0x3c, + 0x0a, 0xc0, 0x8a, 0xbd, 0x2a, 0x5e, 0x63, 0xbd, 0xa1, 0x2f, 0xb7, 0xbc, 0xf2, + 0xfb, 0xac, 0x3b, 0xa4, 0xed, 0x17, 0x3d, 0xc1, 0x09, 0x59, 0xbd, 0xe9, 0xf7, + 0xf4, 0x3d, 0xad, 0xe5, 0x8f, 0xbd, 0xa9, 0x9e, 0xd0, 0x3d, 0x0a, 0x98, 0x40, + 0xbd, 0xbc, 0x1f, 0x95, 0x3d, 0x0b, 0x17, 0xf0, 0x3c, 0x64, 0x3f, 0x60, 0xbd, + 0xc0, 0xb2, 0xc7, 0x3b, 0x42, 0x3f, 0x62, 0x3c, 0x6a, 0x39, 0x8c, 0xbd, 0xbf, + 0x72, 0xfd, 0xbd, 0x47, 0x3d, 0xd1, 0xbd, 0x7c, 0x0b, 0x6d, 0x3d, 0xf3, 0x4a, + 0xda, 0xbc, 0xce, 0x57, 0x9d, 0x3d, 0xf0, 0x13, 0x53, 0x3b, 0x94, 0x39, 0x31, + 0x3d, 0x3d, 0xa7, 0x3f, 0xbd, 0xfa, 0x3e, 0x6b, 0x3d, 0xfb, 0x19, 0xa9, 0x3d, + 0x07, 0xfc, 0x5e, 0xbd, 0xfa, 0x47, 0xd3, 0x3d, 0xd6, 0x83, 0x9a, 0xbd, 0x2c, + 0xa9, 0x14, 0x3e, 0x01, 0xb5, 0x7e, 0x3d, 0x27, 0xfb, 0x00, 0x3a, 0x7d, 0xe5, + 0x35, 0xbd, 0x68, 0x50, 0x05, 0xbc, 0x87, 0xdb, 0x19, 0x3d, 0xbe, 0x2e, 0xe3, + 0x3d, 0xe4, 0x41, 0x07, 0xbd, 0x53, 0x57, 0xcc, 0xb9, 0x28, 0x92, 0x96, 0x3d, + 0xb6, 0x14, 0xa4, 0xbc, 0xad, 0x84, 0x69, 0x3c, 0x19, 0xe4, 0xde, 0xbd, 0x3b, + 0xad, 0x04, 0xbe, 0xd9, 0xe3, 0xbc, 0x3d, 0x5b, 0x59, 0xd3, 0x3d, 0x00, 0x12, + 0xcc, 0xbd, 0x2d, 0x0c, 0x8a, 0xbd, 0xc6, 0x1c, 0x79, 0x3d, 0x03, 0xf3, 0x14, + 0xbc, 0xb7, 0x28, 0xa6, 0x3d, 0x28, 0x0d, 0xa5, 0xbd, 0xa9, 0x8e, 0x32, 0x3b, + 0x60, 0xef, 0x30, 0x3d, 0x21, 0x9f, 0x68, 0xbc, 0x13, 0x02, 0x83, 0xbc, 0x21, + 0x90, 0x9e, 0x3c, 0x78, 0xfa, 0xf4, 0xbc, 0xf9, 0x40, 0x6e, 0x3a, 0x11, 0xdb, + 0x05, 0x3e, 0xc1, 0xb7, 0xff, 0x3b, 0x04, 0x47, 0x65, 0xbd, 0x6b, 0x8a, 0x85, + 0xbd, 0x30, 0xd5, 0x95, 0x3d, 0x3c, 0x4a, 0x92, 0x3d, 0xa6, 0x20, 0x11, 0x3d, + 0x03, 0xd8, 0xb1, 0x3c, 0x7d, 0x1e, 0x0b, 0xbd, 0xe9, 0x0a, 0x92, 0x3d, 0x7e, + 0x9d, 0xb8, 0x3c, 0xb5, 0x1e, 0x6d, 0x3d, 0x6d, 0x4e, 0x6f, 0x3d, 0xbc, 0x1e, + 0xdc, 0x3c, 0x2e, 0x87, 0xa0, 0x3d, 0x2d, 0x00, 0x5c, 0xb8, 0x8f, 0xfb, 0xb3, + 0xbd, 0x9e, 0x36, 0x08, 0x3d, 0xa4, 0x19, 0xe0, 0xbb, 0x5f, 0xc0, 0xb7, 0xbb, + 0xc7, 0x3c, 0x78, 0x3d, 0x53, 0xe4, 0x65, 0x3d, 0xca, 0xdf, 0xc9, 0x3d, 0x18, + 0x8b, 0x27, 0xbd, 0x19, 0x05, 0xa6, 0x3d, 0x23, 0xa2, 0xa2, 0x3d, 0xc2, 0x4b, + 0xac, 0xbd, 0x1b, 0x23, 0xd7, 0xbd, 0xc2, 0x53, 0x97, 0x3d, 0x2e, 0xb2, 0x45, + 0xbd, 0x73, 0x7b, 0xbc, 0xbd, 0x33, 0xfc, 0x47, 0xbc, 0x0b, 0x36, 0x91, 0x3d, + 0xaa, 0x1e, 0x0b, 0xbd, 0xc8, 0x3a, 0xda, 0x3c, 0x22, 0x29, 0xc5, 0x3d, 0x62, + 0x18, 0xf3, 0x3c, 0x75, 0x25, 0xc1, 0xbc, 0xe8, 0x19, 0xb8, 0x3d, 0x30, 0x46, + 0x47, 0x3d, 0x22, 0x80, 0x9f, 0xbc, 0x59, 0xcc, 0xcf, 0x3d, 0x00, 0x51, 0x95, + 0xbc, 0x8b, 0x00, 0xbf, 0xbc, 0xf5, 0xca, 0x89, 0xbd, 0xca, 0x56, 0xe4, 0x3d, + 0x7f, 0x86, 0x24, 0x3e, 0x23, 0xd7, 0x14, 0x3d, 0xe2, 0x8f, 0xa7, 0xbc, 0x1d, + 0x6d, 0xb3, 0x3c, 0xa4, 0x8a, 0x85, 0xbd, 0x4a, 0x36, 0x40, 0xbd, 0x20, 0xa4, + 0xa7, 0xbd, 0xfe, 0x10, 0xa3, 0xbc, 0xa3, 0x3b, 0xce, 0x3d, 0x88, 0x99, 0x12, + 0xbd, 0x3d, 0x58, 0xd5, 0xbd, 0x76, 0xe5, 0x7f, 0x3c, 0x87, 0xa0, 0x68, 0xbd, + 0x8a, 0xd4, 0xb7, 0xbd, 0xdb, 0x68, 0x6f, 0x3c, 0x22, 0x84, 0x2e, 0xbc, 
0x94, + 0x63, 0xa6, 0xbc, 0x35, 0xa4, 0xa9, 0x3d, 0x17, 0xec, 0x0d, 0xbd, 0xd4, 0x25, + 0x9b, 0xbd, 0xf1, 0x84, 0x04, 0xbd, 0x3a, 0x19, 0xdd, 0x3d, 0xd8, 0xba, 0xb1, + 0x3d, 0xb2, 0xb7, 0x21, 0xbd, 0xeb, 0x7e, 0x19, 0x3d, 0xb9, 0xd3, 0xb9, 0x3b, + 0xa5, 0x6a, 0x88, 0xbd, 0xdc, 0x78, 0x99, 0xbd, 0xf4, 0x9f, 0xc4, 0x3d, 0x23, + 0xfe, 0x49, 0xbb, 0xbe, 0xa0, 0x98, 0xbb, 0x05, 0xe8, 0x84, 0xbd, 0x0e, 0x24, + 0x20, 0x3d, 0x30, 0x96, 0x80, 0xbd, 0xd8, 0x1e, 0xef, 0x3c, 0x0a, 0xad, 0xfe, + 0x3d, 0xa3, 0xaa, 0x3b, 0xbd, 0x24, 0xd1, 0xb9, 0xbd, 0xfd, 0xb4, 0xd6, 0x3c, + 0xe7, 0xfe, 0xe9, 0xbb, 0xf7, 0xd6, 0xaa, 0x3c, 0xa5, 0x35, 0xc1, 0xbc, 0x39, + 0xbd, 0x00, 0xbe, 0x19, 0xed, 0x3b, 0x3d, 0x7f, 0x4e, 0x99, 0x3d, 0x09, 0x63, + 0xe3, 0xbd, 0x74, 0xc3, 0x73, 0xbd, 0xb7, 0x7d, 0xa4, 0x3d, 0x68, 0x37, 0x50, + 0xbd, 0xb0, 0xb0, 0xe8, 0xbd, 0x28, 0x4f, 0xa7, 0xbd, 0x22, 0x85, 0x9e, 0xbd, + 0x32, 0xce, 0x12, 0x3e, 0x60, 0x47, 0xbb, 0x3c, 0xdb, 0xa8, 0xc6, 0x3d, 0x50, + 0xcf, 0x0c, 0x3d, 0x4b, 0x7d, 0x9c, 0x3b, 0xa9, 0xeb, 0xb9, 0xbd, 0x07, 0x97, + 0x13, 0x3c, 0xbe, 0x6b, 0x8f, 0xbd, 0x9c, 0xb3, 0xa9, 0x3d, 0x64, 0xd6, 0x96, + 0xbd, 0x75, 0x6a, 0xc4, 0x3c, 0x20, 0xb6, 0x7e, 0x3d, 0x9b, 0x0e, 0x0c, 0x3e, + 0xf3, 0xd5, 0xc5, 0x3d, 0x54, 0xb8, 0xdf, 0xbd, 0x12, 0x6e, 0xf2, 0x3a, 0x7b, + 0xe4, 0xaa, 0x3c, 0xe3, 0x7c, 0xb5, 0xbd, 0xe6, 0x11, 0x05, 0x3d, 0xc6, 0x65, + 0xa2, 0x3d, 0x95, 0x9e, 0x0c, 0x3d, 0x7f, 0xfe, 0xea, 0xbc, 0x22, 0x51, 0xcf, + 0x3b, 0x7b, 0xdd, 0x98, 0xbc, 0x6e, 0x2f, 0xba, 0xbc, 0xb3, 0x8e, 0xe6, 0xbd, + 0x5e, 0x5e, 0x76, 0x3d, 0x3e, 0xd4, 0xaf, 0xbd, 0x25, 0xbc, 0xa8, 0x3d, 0xb0, + 0xd0, 0x81, 0x3c, 0x4c, 0x3f, 0x52, 0x3c, 0x10, 0xd7, 0x13, 0xbd, 0xd0, 0x83, + 0x02, 0x3e, 0xd3, 0x03, 0xa5, 0x3d, 0xeb, 0xa7, 0xca, 0xbd, 0x91, 0x09, 0x1b, + 0x3d, 0x7a, 0x8c, 0xbf, 0x3c, 0x89, 0x04, 0xdb, 0xbd, 0xf8, 0xfc, 0x56, 0xbd, + 0x8a, 0x66, 0x36, 0x3d, 0x42, 0x8f, 0x6e, 0xbd, 0xc9, 0x79, 0x87, 0x3d, 0xbf, + 0xfb, 0x26, 0x3d, 0x56, 0xeb, 0xbc, 0xbb, 0x3b, 0xa7, 0x17, 0x3d, 0x17, 0x46, + 0x27, 0x3d, 0x87, 0xfb, 0xb4, 0x3d, 0x09, 0x7b, 0x9d, 0xbc, 0xf4, 0xdc, 0x30, + 0x3d, 0xca, 0xee, 0xf7, 0xbd, 0x08, 0x73, 0xec, 0x3d, 0x60, 0xed, 0x24, 0x3d, + 0x77, 0xa3, 0x26, 0x3c, 0x07, 0x95, 0xe2, 0x3c, 0x27, 0x2f, 0xde, 0x3c, 0xd3, + 0x8a, 0x94, 0xbc, 0x58, 0x57, 0xaa, 0xbd, 0x86, 0xdd, 0x0d, 0x3d, 0x29, 0x14, + 0x56, 0x3d, 0x94, 0xdf, 0xa8, 0x3d, 0x33, 0x86, 0xbd, 0x3d, 0xb2, 0x8a, 0x7b, + 0x3c, 0x8d, 0x7b, 0x26, 0xbc, 0x2f, 0x59, 0xb8, 0xbd, 0x65, 0xc2, 0x87, 0xbd, + 0xd3, 0x4b, 0x76, 0x3d, 0x16, 0x20, 0x22, 0x3d, 0xb9, 0xef, 0x62, 0x3b, 0xda, + 0x3b, 0x6b, 0x3d, 0xce, 0x75, 0x59, 0x3d, 0x90, 0xde, 0x33, 0x3d, 0x77, 0x8b, + 0xf7, 0x3d, 0x98, 0xfd, 0xa0, 0xbd, 0xcc, 0xa0, 0xd2, 0x3d, 0xec, 0x73, 0x84, + 0xbd, 0x2c, 0x7a, 0x34, 0x3c, 0xbd, 0x44, 0x07, 0x3e, 0xd8, 0xf6, 0x74, 0xbd, + 0x0a, 0x72, 0x8c, 0xbd, 0xad, 0xd3, 0xd5, 0xbd, 0x78, 0xf7, 0xc9, 0x3d, 0x28, + 0xef, 0x5f, 0x3d, 0x01, 0xbf, 0x80, 0xbd, 0xcc, 0xd6, 0x01, 0xbd, 0x37, 0x34, + 0x75, 0xbd, 0x4a, 0x00, 0x87, 0x3d, 0x4c, 0xd9, 0x4c, 0xbb, 0xcd, 0x86, 0x42, + 0xbd, 0x7b, 0xef, 0x1a, 0x3d, 0x98, 0x2b, 0x3a, 0x3d, 0x97, 0x7a, 0x18, 0x3c, + 0xd0, 0x24, 0xe6, 0xbd, 0xcd, 0xc5, 0xc2, 0x3c, 0x8d, 0x69, 0x7f, 0xbc, 0xed, + 0xef, 0x88, 0xbd, 0x54, 0x72, 0xd6, 0x3d, 0xc4, 0x5b, 0xba, 0x3d, 0x13, 0xd9, + 0x1d, 0xbd, 0xa9, 0x69, 0xd5, 0x3d, 0xf6, 0xab, 0x4b, 0x3d, 0xaf, 0x3c, 0xab, + 0x3d, 0xad, 0x17, 0x02, 0x3d, 0xfe, 0x82, 0x97, 0xbd, 0xe7, 0x5b, 0xca, 0x3d, + 0x0d, 0x04, 0x1b, 0x3d, 0x6a, 0x95, 0xb5, 0x3d, 0xa7, 0x5f, 0xc5, 0x3d, 0x57, + 0xf4, 0xdc, 0x3d, 0x25, 
0xf3, 0xa2, 0xbd, 0xad, 0x96, 0xd3, 0x3d, 0x16, 0xb7, + 0x2f, 0xbe, 0x61, 0x4c, 0xaa, 0x3d, 0x71, 0x82, 0xcc, 0x3d, 0x44, 0x36, 0xbb, + 0x3d, 0xba, 0x8f, 0xca, 0xbc, 0xe0, 0xa3, 0x63, 0x3c, 0xfa, 0x02, 0xb3, 0xbd, + 0x0a, 0xcf, 0x00, 0xbe, 0x4b, 0xce, 0x7e, 0xbd, 0xe9, 0x90, 0xcf, 0x3b, 0x32, + 0x0d, 0xa9, 0xbd, 0x54, 0x4d, 0x42, 0x3d, 0x30, 0x36, 0x32, 0x3d, 0x04, 0xa6, + 0xb2, 0xbd, 0x79, 0x05, 0x0a, 0x3e, 0xbb, 0x45, 0xe6, 0x3c, 0xfd, 0xf6, 0x79, + 0x3d, 0x1c, 0x9f, 0x1d, 0x3d, 0xe5, 0x27, 0x97, 0x3c, 0x31, 0xf4, 0x02, 0xbd, + 0x30, 0x19, 0x45, 0x3d, 0xa4, 0x54, 0x06, 0x3d, 0x94, 0x4d, 0xb9, 0xbd, 0x3b, + 0x21, 0xdf, 0xbd, 0xbb, 0x79, 0x1f, 0xbd, 0x41, 0x34, 0x9f, 0x3d, 0x02, 0x58, + 0xb8, 0x3d, 0xe1, 0xb2, 0x03, 0xbe, 0x5e, 0x71, 0x29, 0x3d, 0x9e, 0xf7, 0xbf, + 0xbd, 0xc7, 0x01, 0x75, 0xbd, 0x0d, 0xe3, 0x14, 0xbd, 0x38, 0x23, 0xa3, 0x3d, + 0x93, 0xbc, 0xaa, 0xbd, 0xc9, 0x19, 0x91, 0x3d, 0xcb, 0xba, 0x69, 0x3d, 0xfc, + 0xfa, 0xd7, 0x3d, 0x95, 0xd9, 0x38, 0xbd, 0x4e, 0x3f, 0x75, 0x3d, 0x73, 0xdb, + 0x15, 0xbe, 0xdf, 0x76, 0x8d, 0x3d, 0x0f, 0xb1, 0x13, 0x3d, 0x90, 0x32, 0x24, + 0x3e, 0x3a, 0x17, 0xf9, 0xbd, 0xcd, 0xd1, 0x38, 0xbd, 0x27, 0xf4, 0x9b, 0xbd, + 0x10, 0x6c, 0xa3, 0xbc, 0x1e, 0x12, 0x42, 0x3d, 0xee, 0x38, 0xff, 0xbc, 0xb4, + 0x28, 0x2e, 0x3d, 0xba, 0x69, 0xbd, 0xbc, 0x7c, 0x69, 0xbb, 0xbc, 0x1a, 0xe8, + 0xde, 0xbd, 0xd8, 0xa2, 0x17, 0x3c, 0xb8, 0x9e, 0xb6, 0xbb, 0xae, 0x5e, 0x96, + 0x3c, 0x4f, 0xbb, 0x03, 0xbd, 0x8f, 0x72, 0xb4, 0xbc, 0x94, 0x57, 0xd7, 0x3d, + 0xf5, 0xe3, 0xaf, 0xbc, 0xa4, 0x0c, 0x0d, 0xbd, 0x13, 0xbb, 0x83, 0x3d, 0x62, + 0x06, 0xda, 0x3d, 0xb7, 0xa5, 0x1c, 0x3e, 0x90, 0xd8, 0x86, 0xbd, 0xf5, 0x7e, + 0xd0, 0xbd, 0x8b, 0x5e, 0xcb, 0xbd, 0x0e, 0x81, 0xf5, 0xbd, 0xfe, 0xf3, 0xe4, + 0xbc, 0xe2, 0xc9, 0xd6, 0xbc, 0x4c, 0xa9, 0xc8, 0x3b, 0x04, 0xd2, 0x49, 0xbc, + 0xf0, 0xb2, 0xa5, 0xbd, 0xc7, 0xd6, 0xea, 0x3d, 0xa6, 0xa6, 0x77, 0x3d, 0xdf, + 0x24, 0x03, 0x3d, 0x05, 0x9e, 0x86, 0xbd, 0xce, 0x27, 0x31, 0x3d, 0x46, 0x54, + 0xa4, 0x3d, 0x27, 0x9b, 0x35, 0xbd, 0x28, 0x86, 0x68, 0xbb, 0x2c, 0x1e, 0xc1, + 0xbd, 0xda, 0x7e, 0xa2, 0x3b, 0xa6, 0xe6, 0xe9, 0x3d, 0x8a, 0xcf, 0x0f, 0x3d, + 0x5e, 0xf0, 0x6f, 0xbd, 0xa0, 0xc6, 0xb1, 0xbb, 0x08, 0xc6, 0x77, 0xbc, 0x6d, + 0x17, 0x16, 0xbd, 0xf5, 0xc6, 0x21, 0x3d, 0x70, 0x2a, 0x11, 0xbd, 0x3f, 0x5a, + 0x6c, 0xbd, 0xfb, 0xd9, 0xbc, 0x3d, 0x91, 0x33, 0xb4, 0x3c, 0xc1, 0xc7, 0x84, + 0x3d, 0xd9, 0xca, 0x41, 0xbd, 0xd8, 0x5d, 0xec, 0x3d, 0x17, 0xe2, 0x94, 0x3d, + 0xbf, 0x3f, 0x04, 0xbe, 0x24, 0xa8, 0x66, 0xbd, 0xc4, 0xcd, 0xc0, 0x3d, 0x07, + 0xce, 0x9e, 0xbd, 0x67, 0x5d, 0xe0, 0x3d, 0x9e, 0xdd, 0x1c, 0xbe, 0x77, 0xe5, + 0x5c, 0x3d, 0x98, 0x1f, 0xaf, 0x3d, 0x8a, 0xfd, 0x02, 0x3e, 0x9f, 0x9a, 0xba, + 0xbc, 0x40, 0xe9, 0xbb, 0x3c, 0x4e, 0x51, 0x10, 0xbc, 0xc6, 0xcc, 0x81, 0x3d, + 0x83, 0x18, 0x78, 0xbc, 0x7f, 0x25, 0xe8, 0xbd, 0x2e, 0xa6, 0xcb, 0x3c, 0x2f, + 0x8c, 0x3e, 0x3c, 0x38, 0xdc, 0x67, 0xbb, 0x57, 0xf8, 0xbd, 0x3d, 0xa2, 0x4b, + 0x13, 0x3e, 0x6d, 0x76, 0x64, 0x3d, 0xcf, 0x5e, 0x98, 0x3c, 0x09, 0xc1, 0x8a, + 0x3c, 0x42, 0x2b, 0x82, 0x3d, 0xa3, 0x83, 0x4a, 0x3d, 0xe3, 0x74, 0xb9, 0xbb, + 0x26, 0xf8, 0x62, 0x3d, 0xd6, 0x4d, 0xa4, 0xbc, 0x68, 0x44, 0x13, 0x3d, 0x3b, + 0x7d, 0x54, 0x3d, 0xf4, 0xdf, 0x8c, 0x3d, 0xef, 0x72, 0xcf, 0xbd, 0x4e, 0xd6, + 0x85, 0x3c, 0x6a, 0x11, 0x38, 0xbc, 0xa5, 0xec, 0x83, 0xbd, 0x23, 0x95, 0x86, + 0xbd, 0x93, 0xa0, 0xbf, 0x3c, 0x91, 0xc5, 0x11, 0xbd, 0x96, 0x1b, 0x23, 0x3d, + 0xbc, 0x6d, 0x00, 0x3d, 0x55, 0xb7, 0x9d, 0x3d, 0x44, 0x45, 0x8d, 0x3c, 0x83, + 0x34, 0x19, 0xbd, 0x1c, 0x2e, 0xbe, 0xbd, 0xfb, 0x4b, 
0xd5, 0x3c, 0x25, 0xec, + 0xd9, 0xba, 0xe0, 0xcd, 0xa9, 0x3d, 0x72, 0x99, 0xa1, 0x3d, 0xa6, 0xa1, 0x91, + 0xbd, 0xc8, 0x70, 0x39, 0xbd, 0x33, 0x54, 0x24, 0x3d, 0x80, 0x25, 0xd8, 0x3c, + 0x3c, 0x36, 0xdb, 0x3b, 0x04, 0x22, 0x3c, 0xbd, 0xc8, 0x81, 0xfb, 0x3d, 0x89, + 0x15, 0xe1, 0x3d, 0xa5, 0x9d, 0x17, 0xbd, 0x68, 0xad, 0x64, 0xbd, 0xad, 0xbd, + 0x59, 0xbc, 0xfc, 0x1a, 0xa5, 0xbd, 0xf5, 0x88, 0x44, 0x3d, 0x53, 0xa7, 0x9b, + 0x3d, 0x2e, 0x00, 0x93, 0xbd, 0xbd, 0xb1, 0xb9, 0x3c, 0x61, 0x54, 0xc8, 0x3c, + 0xe3, 0xe9, 0xd7, 0x3d, 0x78, 0xe2, 0xe0, 0x3d, 0x6c, 0xe0, 0x08, 0xbe, 0x80, + 0xc2, 0xaf, 0x3d, 0x2a, 0x5c, 0x10, 0xbd, 0x60, 0xcb, 0xf0, 0x3d, 0x7a, 0xa1, + 0xf0, 0xbb, 0x02, 0x56, 0xa9, 0x3d, 0x11, 0xf1, 0x1c, 0x3c, 0x39, 0xec, 0xa9, + 0xbd, 0x73, 0xfd, 0x24, 0xbd, 0xd5, 0x86, 0x8c, 0x3d, 0xdc, 0x85, 0x21, 0x3c, + 0xa7, 0x6f, 0xf6, 0x3d, 0xe0, 0x6b, 0x0c, 0xbd, 0x08, 0x15, 0xf2, 0x3d, 0xd6, + 0x6a, 0xed, 0x3d, 0xda, 0xc1, 0x51, 0xbd, 0x27, 0x6e, 0x11, 0xbe, 0xbe, 0x8f, + 0xcf, 0xbc, 0xa9, 0xf1, 0x05, 0x3d, 0xa1, 0x30, 0x8d, 0xbd, 0x35, 0x5e, 0x97, + 0xbd, 0xee, 0x02, 0x9d, 0xbc, 0xf8, 0xba, 0xe9, 0xbd, 0x61, 0xe1, 0xb5, 0xbd, + 0xaa, 0x6d, 0x0c, 0xbd, 0xeb, 0x1f, 0x5d, 0xbd, 0x17, 0x11, 0xda, 0x3c, 0xe3, + 0x75, 0x55, 0xbd, 0x8b, 0x40, 0x4a, 0x3d, 0xb2, 0x5b, 0x17, 0xbd, 0xc2, 0xbb, + 0x66, 0xbd, 0x42, 0x20, 0xf7, 0x3d, 0x05, 0x75, 0xff, 0xbd, 0xce, 0xd3, 0xca, + 0x3c, 0x76, 0x10, 0xbb, 0x3d, 0x66, 0xa2, 0xcc, 0xbc, 0x96, 0x30, 0xf7, 0xba, + 0xad, 0xa8, 0x16, 0xbc, 0x32, 0x10, 0x77, 0x3b, 0x98, 0xde, 0x1f, 0xbd, 0xc7, + 0xd6, 0x72, 0x3d, 0x33, 0xea, 0xe1, 0x3d, 0xb5, 0x5d, 0x8d, 0x3c, 0xfe, 0xf1, + 0x64, 0x3d, 0x3f, 0xe1, 0x88, 0x3c, 0x0d, 0xa2, 0x92, 0x3d, 0x52, 0x90, 0x20, + 0xbd, 0xcd, 0x17, 0x88, 0xbd, 0xf7, 0xf1, 0x7b, 0x3d, 0x55, 0xbe, 0x9c, 0x3b, + 0x1a, 0x3f, 0xd1, 0x3c, 0x46, 0xbe, 0x0d, 0x3d, 0x53, 0xd7, 0xd9, 0x3d, 0xda, + 0x58, 0xb5, 0xbc, 0x3a, 0x41, 0x78, 0xbd, 0x78, 0xc0, 0x54, 0xbd, 0x3c, 0x27, + 0x10, 0x3e, 0x16, 0x00, 0xe9, 0x3b, 0x6e, 0xcd, 0xc5, 0x3d, 0xd9, 0xf0, 0x82, + 0x3d, 0x44, 0x3e, 0x82, 0x3d, 0xde, 0x31, 0x83, 0x3d, 0x10, 0x32, 0x4e, 0xbd, + 0x13, 0x46, 0xd7, 0xbd, 0x60, 0xa0, 0xbb, 0xbc, 0x33, 0xc9, 0xb0, 0xbd, 0x8d, + 0x52, 0xfb, 0x3d, 0x5e, 0xa7, 0x07, 0x3d, 0x05, 0xd7, 0xb7, 0x3d, 0x34, 0x8c, + 0x71, 0x3d, 0xcf, 0x5d, 0x66, 0xbd, 0x2a, 0x61, 0x1c, 0x3d, 0xa5, 0xa5, 0x70, + 0xbd, 0xd2, 0xb9, 0x67, 0x3b, 0x9e, 0x63, 0x5a, 0x3d, 0xbe, 0xea, 0xd4, 0xbc, + 0x57, 0xe9, 0xb5, 0x3d, 0x03, 0xe4, 0xa6, 0x3d, 0xc4, 0x6b, 0xb3, 0x3d, 0x6e, + 0x60, 0x9f, 0x3d, 0xac, 0x31, 0xa0, 0x3d, 0xcf, 0xcc, 0xb5, 0x3d, 0xd0, 0x80, + 0xd6, 0x3d, 0xb9, 0x3f, 0x96, 0xbd, 0x2d, 0x17, 0x17, 0xbb, 0x6f, 0xf2, 0xe4, + 0xbd, 0x17, 0x51, 0x6e, 0x3d, 0xc2, 0xe2, 0xc2, 0x3d, 0xfe, 0x71, 0x59, 0x3d, + 0x0e, 0x1c, 0x78, 0xbd, 0xc9, 0xc7, 0xbc, 0xbd, 0x40, 0xb0, 0xa8, 0x3d, 0xbf, + 0xff, 0x42, 0xbd, 0xe4, 0x2e, 0x67, 0x3d, 0xca, 0x73, 0x81, 0xbd, 0x0b, 0x0d, + 0xf3, 0x3d, 0xce, 0x97, 0x70, 0x3d, 0xe9, 0x59, 0xe9, 0x3d, 0x45, 0x22, 0x73, + 0xbd, 0x24, 0xb8, 0xdf, 0x3d, 0x96, 0xbb, 0x3f, 0x3c, 0x02, 0xed, 0x65, 0x3d, + 0x84, 0x40, 0x25, 0x3c, 0x6c, 0xc5, 0xd2, 0x3c, 0xea, 0x38, 0x4a, 0x3d, 0xf9, + 0xa2, 0xc9, 0x3d, 0x6f, 0x30, 0xbc, 0x3a, 0x2d, 0xd5, 0x81, 0xbd, 0xd2, 0xae, + 0xa3, 0xbb, 0x8e, 0x91, 0xe7, 0x3c, 0x28, 0x6b, 0xc4, 0xbd, 0xf3, 0x0c, 0xbf, + 0xbc, 0x66, 0xf8, 0xd3, 0x3b, 0x6d, 0x3e, 0x01, 0x3d, 0xf3, 0xbf, 0xc2, 0xbc, + 0x0d, 0xc5, 0x6f, 0xbd, 0xb7, 0x9b, 0x9c, 0x3d, 0xeb, 0x79, 0x88, 0x3d, 0x81, + 0x8a, 0x7d, 0xbc, 0xde, 0x8b, 0x14, 0x3d, 0xa4, 0x3f, 0x7d, 0x3d, 0xb4, 0x27, + 0xa9, 
0x3d, 0xb7, 0x75, 0x51, 0x3d, 0xff, 0x73, 0x85, 0x3d, 0x3f, 0xf3, 0x51, + 0x3d, 0xe6, 0xdd, 0xe2, 0xbb, 0x83, 0xc7, 0x65, 0xbd, 0x6a, 0x16, 0xb6, 0xbd, + 0xcf, 0xe8, 0x90, 0x3d, 0x5b, 0xc8, 0xad, 0xbc, 0xa1, 0x27, 0x29, 0xbd, 0x57, + 0xbd, 0x3d, 0x3d, 0x61, 0x4e, 0x41, 0xbc, 0x21, 0x2f, 0x29, 0x3d, 0x55, 0x0b, + 0xba, 0x3d, 0xaa, 0x67, 0xf3, 0xba, 0x7d, 0x60, 0xe4, 0x3d, 0xab, 0xe7, 0x20, + 0xbd, 0x01, 0x71, 0x9f, 0x3d, 0x5a, 0xd5, 0x95, 0xbd, 0x2f, 0x75, 0xd5, 0x3d, + 0x7c, 0x91, 0xf6, 0x3d, 0xaa, 0xd6, 0x0c, 0x3d, 0x6d, 0x1c, 0xd9, 0xbd, 0xb4, + 0x4e, 0x82, 0xbc, 0x3f, 0x5a, 0x1a, 0x3b, 0xb4, 0x94, 0xfb, 0x3d, 0x0a, 0x71, + 0x3c, 0xbd, 0x97, 0xba, 0x12, 0xbc, 0xfd, 0x3d, 0x33, 0xbd, 0xa3, 0x4d, 0x01, + 0x3e, 0x54, 0xe2, 0x33, 0xbd, 0x8d, 0x32, 0x5d, 0x3d, 0x92, 0x84, 0xcb, 0x3d, + 0x91, 0x67, 0xde, 0xbd, 0x4b, 0xfd, 0xc7, 0xbd, 0x4b, 0x11, 0x04, 0xbe, 0x3e, + 0xde, 0xac, 0x3d, 0xe4, 0x9e, 0x3c, 0x3d, 0x5e, 0x7d, 0xfb, 0x3d, 0xfd, 0x4d, + 0xae, 0x3d, 0x63, 0xcf, 0x6f, 0xbd, 0xa0, 0x4f, 0x8b, 0x3d, 0x46, 0x2c, 0x84, + 0xbd, 0xda, 0x69, 0x11, 0x3b, 0xca, 0x5b, 0x1c, 0xbd, 0x59, 0x23, 0x26, 0x3e, + 0x16, 0xb1, 0x68, 0xbd, 0x1c, 0xd4, 0x98, 0xbd, 0x9c, 0x91, 0x6e, 0xbd, 0xa5, + 0xc6, 0x55, 0xbc, 0xd0, 0xf3, 0xcc, 0xbd, 0xe8, 0x91, 0xe0, 0xbd, 0xdf, 0xe3, + 0xb4, 0x3d, 0x04, 0x77, 0xc2, 0xbd, 0xcc, 0x21, 0xda, 0xbd, 0x7d, 0xed, 0x1d, + 0x3d, 0x1c, 0xa9, 0x0f, 0x3e, 0x25, 0x19, 0x67, 0x3d, 0xcc, 0x29, 0x65, 0xbd, + 0x34, 0x00, 0xdd, 0x3d, 0xe3, 0x04, 0x15, 0xbd, 0x79, 0xb8, 0x50, 0xbd, 0x98, + 0x5b, 0x44, 0xbc, 0x32, 0x55, 0xd1, 0x3d, 0x19, 0x20, 0x2a, 0xbd, 0xbd, 0x28, + 0xb6, 0x3c, 0x33, 0xf4, 0xc4, 0xbb, 0x95, 0x26, 0x9f, 0xbb, 0x93, 0xb7, 0x7f, + 0x3d, 0x16, 0xbc, 0x5f, 0x3d, 0x0a, 0x14, 0x82, 0x3c, 0x3a, 0x40, 0x12, 0x3e, + 0x99, 0x9c, 0xbe, 0x3c, 0x6c, 0x22, 0x72, 0x3d, 0xb3, 0x18, 0x10, 0xbe, 0x2b, + 0x6f, 0x4b, 0x3d, 0xaf, 0x83, 0x90, 0x3c, 0x67, 0x6b, 0x57, 0x3d, 0xae, 0xba, + 0x1d, 0xbd, 0x42, 0x58, 0xda, 0xbd, 0xcd, 0x16, 0xc6, 0xbd, 0x28, 0x11, 0xa1, + 0xbd, 0xc3, 0xfa, 0x6b, 0x3d, 0xff, 0x35, 0xc4, 0x3d, 0xca, 0x54, 0x9d, 0x3d, + 0x65, 0xc0, 0x0a, 0x3d, 0xbe, 0xbd, 0x73, 0xbc, 0xee, 0xf8, 0xfb, 0x3a, 0x88, + 0xcf, 0x2c, 0x3d, 0xa4, 0x2d, 0xb9, 0x3d, 0x30, 0xbf, 0x9c, 0xbd, 0x16, 0xf6, + 0x97, 0x3c, 0x72, 0xf4, 0x12, 0x3d, 0x4c, 0xc6, 0x01, 0xbd, 0x68, 0x2e, 0xc0, + 0xbd, 0x38, 0xd4, 0x2c, 0x3d, 0xe6, 0xb4, 0xbf, 0x3d, 0xf5, 0x15, 0x66, 0xbd, + 0x29, 0x0f, 0x83, 0x3d, 0x44, 0x2b, 0xb0, 0x3d, 0xa1, 0x53, 0xeb, 0x3d, 0xc6, + 0x86, 0x8a, 0x3d, 0xe0, 0x36, 0x48, 0xbd, 0x29, 0xff, 0x22, 0xbd, 0xff, 0x33, + 0xae, 0x3d, 0xa2, 0x5b, 0x13, 0xbd, 0x1d, 0x6f, 0x9e, 0x3d, 0x0e, 0x6d, 0x09, + 0x3d, 0x7f, 0x06, 0x01, 0xbe, 0xc8, 0x08, 0xc7, 0x3d, 0xc2, 0xe8, 0xae, 0x3d, + 0xe6, 0x4a, 0xc7, 0x3d, 0x29, 0x40, 0xb3, 0x3d, 0xb5, 0x99, 0x83, 0xbd, 0xa4, + 0x23, 0x8f, 0x3d, 0x4a, 0xa2, 0x9c, 0x3d, 0x0d, 0xe2, 0x04, 0x3d, 0x40, 0xff, + 0x07, 0x3d, 0xa4, 0x8c, 0x30, 0x3d, 0x75, 0x00, 0x1c, 0x3d, 0x45, 0x9b, 0x02, + 0x3e, 0xb2, 0xce, 0x2e, 0x3d, 0x16, 0x9d, 0x3f, 0xbd, 0x8e, 0xf1, 0x1b, 0xbc, + 0x9b, 0x59, 0x04, 0xbd, 0xae, 0xd7, 0xd3, 0x3d, 0x2b, 0x15, 0x05, 0x3b, 0x12, + 0xec, 0x5d, 0x3c, 0x30, 0xe9, 0xea, 0x3d, 0x58, 0xe5, 0xe4, 0xbd, 0x9b, 0x54, + 0x86, 0xbd, 0xf0, 0x47, 0x4e, 0xbd, 0x21, 0xa7, 0xef, 0x3b, 0x89, 0xf9, 0x23, + 0x3d, 0xec, 0x14, 0x48, 0xbd, 0xfc, 0x86, 0x20, 0x3e, 0x08, 0x69, 0x95, 0x3d, + 0x26, 0x08, 0xb6, 0xbd, 0xd9, 0xe2, 0xb3, 0xbd, 0x27, 0x6f, 0xf0, 0x3d, 0x9d, + 0xc4, 0x1c, 0xbe, 0x1a, 0x6e, 0x22, 0x3d, 0xc5, 0xe3, 0x68, 0x3d, 0x45, 0x2d, + 0x8a, 0xbb, 0xbe, 0xf3, 0x84, 0x3d, 
0x63, 0xef, 0x10, 0x3d, 0x54, 0xfa, 0xde, + 0x3c, 0x57, 0x4c, 0xc4, 0x3d, 0xa7, 0x44, 0x8b, 0xbd, 0x9e, 0xf0, 0x33, 0xbd, + 0x9a, 0x6c, 0x89, 0x3d, 0x6c, 0xc9, 0x21, 0xbe, 0x0e, 0x60, 0x9d, 0xbd, 0xd9, + 0x35, 0x1f, 0xbd, 0x0d, 0x4f, 0x9a, 0x3d, 0xd4, 0x24, 0xca, 0x3d, 0xc4, 0x5c, + 0x45, 0xbd, 0x28, 0x24, 0xea, 0x3c, 0xee, 0xea, 0xef, 0xbd, 0x4d, 0xae, 0x89, + 0x3d, 0x91, 0x99, 0x79, 0xbc, 0xb6, 0x1b, 0xc2, 0x3d, 0xcb, 0x8d, 0xb4, 0xbc, + 0x63, 0xaa, 0x7f, 0xbd, 0x19, 0xbc, 0xe6, 0xbc, 0x82, 0x28, 0x4e, 0xbd, 0xf4, + 0x7a, 0xbc, 0x3d, 0xe4, 0xe7, 0xcd, 0xbd, 0x2c, 0xe3, 0xda, 0xbd, 0xc6, 0x98, + 0xec, 0x3d, 0xd7, 0xfc, 0xf8, 0xbc, 0xd4, 0x80, 0x76, 0x3d, 0xbf, 0x17, 0x3e, + 0xbd, 0x20, 0x69, 0x48, 0x3a, 0x1c, 0x2c, 0xa2, 0x3d, 0xc2, 0x8b, 0x95, 0x3d, + 0xc4, 0xb5, 0xa9, 0x3d, 0x43, 0x5b, 0xde, 0xbc, 0xf1, 0x1e, 0x0f, 0xbd, 0x52, + 0x3e, 0xbb, 0x3d, 0xff, 0xaf, 0xfd, 0x3d, 0x66, 0x65, 0x59, 0x3d, 0x03, 0x95, + 0x55, 0x3d, 0x97, 0x22, 0x04, 0xbe, 0xcb, 0x24, 0x32, 0xbd, 0xf3, 0x26, 0xa5, + 0xbd, 0xaa, 0xd3, 0xdb, 0xbc, 0x75, 0x5b, 0x41, 0xbd, 0x2e, 0x2c, 0xc4, 0x3d, + 0xd5, 0x98, 0xc4, 0x3c, 0xa3, 0x19, 0x01, 0x3c, 0x4e, 0x3f, 0x3c, 0x3d, 0xea, + 0xee, 0x2d, 0xbd, 0x3f, 0x97, 0x13, 0xbc, 0xed, 0xdd, 0x55, 0x3d, 0x49, 0xba, + 0xfb, 0xbd, 0x5c, 0xbd, 0xc9, 0xbd, 0xe8, 0x9f, 0xad, 0x3d, 0x9c, 0x26, 0x32, + 0xbd, 0xf6, 0xfa, 0x15, 0xbe, 0x09, 0x88, 0xc0, 0xbd, 0xe2, 0xcc, 0xaf, 0xbd, + 0xdb, 0x22, 0x56, 0x3d, 0x78, 0x3f, 0x0f, 0xbc, 0x50, 0xe5, 0x93, 0xbd, 0x55, + 0x90, 0x09, 0x3d, 0xac, 0xec, 0x6d, 0xbd, 0x93, 0x0e, 0xce, 0xbc, 0x5b, 0xde, + 0x85, 0x3d, 0x08, 0x1d, 0x4b, 0x3d, 0x8f, 0x16, 0xf4, 0xbd, 0x89, 0xf8, 0x83, + 0xbd, 0x65, 0xf3, 0xf8, 0xbc, 0xe3, 0x37, 0x09, 0x3b, 0x37, 0x89, 0x91, 0xbc, + 0x69, 0xea, 0x2f, 0xbd, 0x2c, 0xf2, 0xbf, 0x3c, 0xd0, 0x57, 0xa7, 0x3d, 0xae, + 0x94, 0xbf, 0x3d, 0x15, 0x1d, 0x63, 0x3d, 0x53, 0x20, 0x4b, 0xbd, 0x4f, 0xf2, + 0x00, 0x3e, 0x29, 0x36, 0x54, 0xbd, 0x49, 0x2d, 0x8c, 0xbd, 0x29, 0xbc, 0xb6, + 0x3d, 0x08, 0xc4, 0xc7, 0x3d, 0xb6, 0x3d, 0xf9, 0xbd, 0x84, 0x0f, 0xa1, 0x3d, + 0xe8, 0x20, 0xb1, 0xbd, 0x8b, 0xf6, 0xa8, 0xbd, 0x51, 0xec, 0x75, 0x3d, 0x85, + 0xeb, 0x13, 0xbe, 0x5c, 0xe5, 0x4f, 0x3d, 0xe5, 0x90, 0xf3, 0xbc, 0x5a, 0xb0, + 0x39, 0xbd, 0xbf, 0x7a, 0x63, 0x3d, 0xa4, 0x35, 0x08, 0x3e, 0xae, 0x8a, 0xa6, + 0xbd, 0x4d, 0x53, 0x46, 0xbd, 0x8e, 0xb0, 0x46, 0xbc, 0x9d, 0x94, 0x15, 0x3d, + 0x6d, 0xdc, 0x62, 0x3c, 0x75, 0x33, 0x29, 0x3d, 0x61, 0xba, 0x3d, 0x3d, 0x0a, + 0xdb, 0x72, 0xbc, 0x18, 0x43, 0xdb, 0xbc, 0xb0, 0xca, 0x83, 0xbc, 0x33, 0x9b, + 0x12, 0xbe, 0xdb, 0x85, 0xb2, 0xbd, 0xe1, 0x52, 0xc7, 0xbd, 0xd6, 0xbc, 0x12, + 0xbd, 0x19, 0x0f, 0x90, 0xbc, 0x75, 0xb0, 0x4c, 0x3d, 0x91, 0x46, 0xd2, 0x3b, + 0xae, 0x95, 0x0e, 0x3d, 0x51, 0xa0, 0x74, 0x3d, 0x9b, 0x73, 0x90, 0xba, 0xec, + 0x61, 0x85, 0x3c, 0xaa, 0x01, 0xb7, 0x3d, 0x83, 0x19, 0x96, 0xbd, 0xeb, 0x6f, + 0xce, 0x3c, 0x46, 0x50, 0x15, 0xbe, 0x4c, 0x9d, 0xe2, 0xbb, 0xee, 0x86, 0x59, + 0xbb, 0xd9, 0xea, 0x8c, 0x3d, 0x5e, 0x80, 0x96, 0x3b, 0x9e, 0x36, 0xf2, 0x3d, + 0xfc, 0x4e, 0xa8, 0x3c, 0x67, 0x32, 0xb0, 0x3d, 0x93, 0xf9, 0x1a, 0x3d, 0x71, + 0x3b, 0xaa, 0xbd, 0xd4, 0xcf, 0x34, 0x3d, 0x93, 0x11, 0x84, 0xbd, 0x76, 0x9c, + 0xc7, 0x3d, 0x6b, 0xee, 0xd5, 0xbd, 0xb6, 0x03, 0xd8, 0x3d, 0xb8, 0x56, 0x53, + 0xbd, 0x61, 0x89, 0xab, 0xbd, 0x69, 0x71, 0x46, 0xbc, 0x79, 0x31, 0x81, 0xbd, + 0xa0, 0xaa, 0x9d, 0xbc, 0xab, 0x17, 0x0c, 0x3d, 0x31, 0xb8, 0x0a, 0x3d, 0xc3, + 0x40, 0xb4, 0xbd, 0xab, 0xb6, 0x97, 0x3d, 0xc1, 0x3a, 0x47, 0x3d, 0x31, 0xdc, + 0xdb, 0xbc, 0xb4, 0x23, 0x60, 0xbc, 0x9d, 0x47, 0x93, 0x3d, 0xc9, 
0x69, 0xa1, + 0x3d, 0xbb, 0x2f, 0x7a, 0x3d, 0x07, 0x8d, 0x91, 0x3d, 0x20, 0xdb, 0xca, 0x3d, + 0xf8, 0x44, 0xd3, 0xbd, 0x68, 0xfc, 0x66, 0xbc, 0xfa, 0xab, 0x29, 0x3d, 0xcb, + 0xb6, 0xa4, 0x3d, 0x9e, 0xbd, 0x06, 0x3d, 0xd1, 0x54, 0xb1, 0x3d, 0x06, 0x7e, + 0xcb, 0xbd, 0x24, 0x71, 0xc4, 0x3d, 0x08, 0x17, 0x40, 0x3d, 0x7a, 0xf7, 0xae, + 0xbd, 0xc0, 0x66, 0xc1, 0xbd, 0xfa, 0x2a, 0x22, 0xbd, 0xf0, 0x3d, 0xd2, 0xbc, + 0x2e, 0xc7, 0x71, 0xbd, 0xc5, 0x4f, 0xd0, 0xbd, 0xf7, 0x68, 0x85, 0xbd, 0xab, + 0xeb, 0x92, 0xbd, 0x5e, 0xb7, 0xe8, 0xbd, 0x66, 0xc1, 0xef, 0xbd, 0xb7, 0x07, + 0x06, 0xbd, 0x5b, 0x2f, 0x40, 0x3d, 0xd6, 0xb0, 0xa8, 0xbd, 0xb8, 0x1a, 0xe8, + 0x3d, 0x9f, 0xb7, 0xc4, 0x3d, 0x3c, 0xb5, 0x8f, 0xbd, 0x23, 0x9f, 0xbc, 0x3d, + 0xfd, 0x90, 0x88, 0xbd, 0xa2, 0xa9, 0x27, 0xbc, 0x41, 0xe4, 0xd7, 0xbd, 0x29, + 0x97, 0x07, 0xbd, 0xff, 0x72, 0x04, 0x3c, 0x56, 0x5a, 0x34, 0xbd, 0xf4, 0x8a, + 0x9d, 0xbd, 0x7e, 0x5d, 0x83, 0xbd, 0xd2, 0x00, 0x4e, 0x3d, 0xbe, 0x7e, 0x5d, + 0x3d, 0x03, 0xd1, 0x38, 0xbd, 0xb2, 0x2b, 0xbc, 0xbd, 0x04, 0xa8, 0x4d, 0x3d, + 0xa8, 0x0b, 0xaa, 0xbd, 0x84, 0x50, 0xac, 0xbd, 0x09, 0xef, 0xbf, 0xbc, 0xfa, + 0xb8, 0xb2, 0xbd, 0xeb, 0x7e, 0xd9, 0x3d, 0x54, 0x08, 0xda, 0xbd, 0x21, 0x24, + 0x61, 0xbd, 0xae, 0x1e, 0xae, 0xbd, 0xb4, 0x50, 0x3a, 0xbc, 0x2e, 0x07, 0xe9, + 0xbd, 0xec, 0xb1, 0x9d, 0xbd, 0x88, 0x5d, 0xca, 0xbc, 0x0c, 0x8a, 0x8c, 0x3d, + 0x58, 0x56, 0xf9, 0x3c, 0x57, 0x0f, 0xe7, 0x3d, 0xd4, 0xd9, 0x1c, 0xbd, 0x87, + 0xfe, 0x38, 0xbd, 0x1c, 0x08, 0x17, 0xbd, 0x72, 0xbb, 0xc1, 0xbc, 0x5b, 0xa9, + 0xf7, 0xba, 0xf2, 0xd5, 0x34, 0xbd, 0x71, 0x2f, 0x4b, 0xbd, 0x6a, 0xd6, 0xab, + 0xbd, 0x07, 0x81, 0xcd, 0x3d, 0x03, 0xf0, 0x2e, 0x3d, 0xcd, 0x20, 0xd4, 0xbd, + 0x0e, 0xf4, 0x3f, 0xbc, 0xf3, 0xed, 0xe1, 0x3d, 0xf6, 0xc4, 0x82, 0x3d, 0x0b, + 0x42, 0x48, 0x3d, 0xf9, 0xcd, 0x87, 0x3d, 0x91, 0x7d, 0x49, 0x3b, 0x9a, 0xc7, + 0x28, 0xbd, 0xf6, 0x02, 0xc3, 0x3d, 0x6e, 0x82, 0xa4, 0xbd, 0x41, 0x1f, 0xe7, + 0x3d, 0x44, 0x06, 0x76, 0x3d, 0x3b, 0xbc, 0xc1, 0x3b, 0x20, 0xf7, 0x7c, 0xbd, + 0x0d, 0x0d, 0xe0, 0xbd, 0x2b, 0xa5, 0xc5, 0x3d, 0x51, 0x84, 0x6f, 0xbd, 0xd0, + 0x24, 0x22, 0x3d, 0x33, 0x68, 0xb7, 0x3d, 0x37, 0x88, 0x87, 0x3d, 0x24, 0x04, + 0x98, 0xbd, 0x1b, 0xba, 0x04, 0xbd, 0x48, 0x09, 0xdf, 0x3b, 0xac, 0x9e, 0x3c, + 0xbd, 0x4b, 0xbf, 0x2c, 0x3c, 0x07, 0xba, 0xf4, 0xbd, 0x6e, 0x91, 0x84, 0x3d, + 0x99, 0x5a, 0x7e, 0x3c, 0x21, 0x9e, 0xeb, 0x3c, 0xde, 0x69, 0x18, 0x3d, 0x1f, + 0x8f, 0xaa, 0x3d, 0x09, 0x55, 0x08, 0xbd, 0x42, 0xf3, 0xe5, 0xbd, 0x61, 0x6b, + 0x82, 0xbd, 0xe1, 0xe2, 0xd2, 0x3d, 0x3f, 0xd1, 0xb6, 0x3d, 0xf9, 0xf5, 0xc7, + 0xbd, 0x47, 0x47, 0x90, 0xbd, 0x74, 0xa3, 0x42, 0xbd, 0xa5, 0xda, 0x3e, 0x3d, + 0xaf, 0x45, 0xc1, 0x3d, 0x68, 0x46, 0xe5, 0xbd, 0x79, 0x83, 0x31, 0x3d, 0x7e, + 0xd3, 0xce, 0x3c, 0xea, 0x30, 0xca, 0xbd, 0x00, 0xb0, 0xae, 0x3b, 0x66, 0x91, + 0xde, 0xbd, 0x0e, 0x11, 0xc0, 0xbd, 0xd0, 0x6a, 0x41, 0xbd, 0x6d, 0x7a, 0x8e, + 0xbd, 0x0a, 0xe2, 0x70, 0x3d, 0x7b, 0x4d, 0xcf, 0x3d, 0x2c, 0x2b, 0x3d, 0xbd, + 0x7e, 0xc3, 0x6f, 0xbd, 0xd0, 0x38, 0xac, 0x3c, 0xac, 0x35, 0xd0, 0xbd, 0x88, + 0x08, 0xe3, 0xbd, 0x78, 0x27, 0xbf, 0x3d, 0x80, 0x1e, 0xf8, 0xbc, 0x52, 0x7a, + 0x84, 0xbc, 0x77, 0x84, 0xbb, 0xbc, 0x22, 0xdf, 0x2b, 0x3d, 0xa8, 0x16, 0xe9, + 0xbd, 0xec, 0xab, 0xda, 0x3b, 0xb9, 0x2f, 0x9b, 0x3d, 0x28, 0x97, 0xd6, 0x3d, + 0x08, 0xde, 0x2c, 0xbc, 0x8a, 0x6c, 0x29, 0x3d, 0xdd, 0xfe, 0xa4, 0xbc, 0x13, + 0xb3, 0x4e, 0xbc, 0x4f, 0x72, 0x81, 0xbc, 0x33, 0x6c, 0xcc, 0x3d, 0x1c, 0xbc, + 0x76, 0xbc, 0xfd, 0xd7, 0x8f, 0xbd, 0x99, 0xfd, 0x53, 0xbd, 0x2c, 0x76, 0x80, + 0xbd, 0x65, 0x2e, 
0x1d, 0xbd, 0x9d, 0xd5, 0x8e, 0x3d, 0xeb, 0x16, 0xac, 0x3d, + 0xa6, 0x14, 0x3d, 0x3d, 0x75, 0x14, 0x97, 0x3d, 0x5e, 0x11, 0xf5, 0xbc, 0xca, + 0x20, 0x46, 0xbb, 0xb1, 0x04, 0xa1, 0xbd, 0x90, 0xcd, 0x3a, 0x3d, 0x70, 0xaf, + 0x01, 0xbe, 0x9d, 0xe3, 0xb2, 0xbd, 0xc3, 0xdf, 0x99, 0x3d, 0x20, 0x09, 0xab, + 0x3d, 0x35, 0x91, 0x06, 0xbd, 0x10, 0x3a, 0xa0, 0xbc, 0xc2, 0xd1, 0xad, 0x3d, + 0x60, 0x90, 0xe4, 0x3d, 0x9f, 0x47, 0xfd, 0x3c, 0x84, 0xa1, 0x5f, 0x3d, 0x06, + 0x5e, 0xf0, 0x3c, 0xab, 0x8c, 0x07, 0xbc, 0xf4, 0x6c, 0x16, 0x3d, 0x64, 0x06, + 0x04, 0xbe, 0xa8, 0x16, 0x85, 0x3d, 0xea, 0x1a, 0xa1, 0xbd, 0x0d, 0xb4, 0xdc, + 0xbd, 0xf4, 0x77, 0xc0, 0xbc, 0x5d, 0x03, 0x28, 0xbd, 0x29, 0x7d, 0xcc, 0xbc, + 0xae, 0x19, 0x9f, 0x3d, 0x09, 0x2a, 0xcd, 0x3d, 0xa4, 0x58, 0xaa, 0xbd, 0x6d, + 0xb8, 0xa9, 0x3c, 0xa1, 0xb7, 0xe6, 0xbd, 0xa9, 0x41, 0x9a, 0xbd, 0x69, 0xa4, + 0xab, 0x3c, 0xdd, 0x32, 0xa9, 0x3d, 0x19, 0x90, 0xd4, 0x3d, 0x52, 0xa8, 0xea, + 0xbd, 0x1e, 0x3d, 0xd4, 0x39, 0x84, 0x91, 0x03, 0xbe, 0xc9, 0x63, 0x3f, 0x3d, + 0x81, 0x1e, 0xe0, 0x3d, 0x05, 0xc5, 0x95, 0xbd, 0x2e, 0x1d, 0xc9, 0xbd, 0xf2, + 0x9c, 0x7c, 0xbc, 0x69, 0x19, 0xdb, 0xbc, 0x09, 0x3d, 0x6f, 0xbd, 0x58, 0x94, + 0xf8, 0x3d, 0x2c, 0x78, 0xb6, 0x3d, 0x96, 0xbe, 0xf8, 0x3d, 0x98, 0x4e, 0xb6, + 0x3d, 0x1a, 0xa0, 0x90, 0x3d, 0xa3, 0xeb, 0xd2, 0xbd, 0x4c, 0xfb, 0x2d, 0xbd, + 0xcb, 0xca, 0xa8, 0xbc, 0xa7, 0xca, 0x80, 0xbd, 0x65, 0xe2, 0x87, 0xbd, 0x9d, + 0x9a, 0x25, 0x3c, 0xc7, 0xf2, 0xcc, 0x3c, 0x38, 0x81, 0x48, 0xbd, 0xd3, 0x83, + 0xea, 0x3d, 0x4f, 0x72, 0xad, 0xbd, 0x6d, 0xef, 0x3f, 0xbc, 0x22, 0xc7, 0xbf, + 0xbc, 0xb6, 0x25, 0x64, 0x3c, 0x82, 0x76, 0x53, 0xbd, 0xd7, 0x9a, 0x89, 0x3c, + 0x01, 0xa7, 0x40, 0x3d, 0xbe, 0x03, 0x69, 0xbd, 0x5c, 0x79, 0x0e, 0xbe, 0xeb, + 0x87, 0x9f, 0xbd, 0x14, 0xa6, 0xad, 0x3c, 0x78, 0x6b, 0x25, 0x3d, 0xea, 0xa0, + 0xd7, 0x3d, 0x19, 0xb6, 0x22, 0xbd, 0xc6, 0xf6, 0xba, 0xbc, 0xe9, 0xd6, 0xe4, + 0x3c, 0x55, 0x68, 0x2a, 0xbd, 0xc0, 0x4c, 0xb0, 0xbc, 0xf5, 0xa5, 0x01, 0x3e, + 0x59, 0x9a, 0xd0, 0xbd, 0x4a, 0xb2, 0xfc, 0x3d, 0x3a, 0x59, 0x8f, 0x3d, 0x4a, + 0x0a, 0xb4, 0xbd, 0x7d, 0xc4, 0x63, 0x3d, 0xb6, 0xb8, 0xb9, 0x3d, 0xb0, 0x95, + 0x81, 0x3c, 0x2f, 0x7a, 0x32, 0x3d, 0x32, 0x87, 0xe4, 0xbc, 0xf0, 0xfc, 0xd5, + 0x3d, 0xfc, 0xe6, 0xf1, 0x3d, 0x04, 0x66, 0x98, 0x3c, 0x14, 0x23, 0x72, 0x3c, + 0xfe, 0x50, 0x95, 0x3d, 0xdf, 0xe6, 0x4c, 0x3d, 0x84, 0x80, 0x8e, 0x3d, 0x13, + 0xe8, 0x4c, 0xbd, 0xd4, 0xca, 0x83, 0xbd, 0x20, 0x86, 0xb0, 0xbd, 0xed, 0x66, + 0x89, 0x3c, 0x6a, 0x59, 0x19, 0xbd, 0xc2, 0x32, 0xc3, 0xbd, 0x04, 0x3f, 0x8d, + 0xbc, 0x51, 0xcc, 0x23, 0xbc, 0xb4, 0x4f, 0xa3, 0xbc, 0x30, 0x98, 0xc8, 0x3d, + 0x29, 0xaa, 0xd4, 0xbb, 0x5c, 0x7d, 0x88, 0xbd, 0x3a, 0xe9, 0xa9, 0xbd, 0xc3, + 0x4f, 0x40, 0xbd, 0x2d, 0x12, 0x49, 0xbd, 0x9e, 0x4e, 0x9a, 0xbd, 0xf1, 0xa9, + 0x84, 0xbd, 0x29, 0x09, 0x94, 0x3d, 0x98, 0x3c, 0xf0, 0x3d, 0x5f, 0xfe, 0x2a, + 0xbd, 0xd8, 0xa8, 0x46, 0xbd, 0xa1, 0xc8, 0x1c, 0xbb, 0x12, 0x3d, 0xbc, 0x3d, + 0x38, 0x39, 0x51, 0x3c, 0x3a, 0x00, 0x95, 0x3d, 0xd8, 0x2e, 0x67, 0x3c, 0x48, + 0x7e, 0xe0, 0xbd, 0x8c, 0x90, 0x79, 0x3c, 0xf2, 0x3d, 0x50, 0x3d, 0xbc, 0x2f, + 0xa1, 0x3c, 0xf9, 0xf0, 0x8a, 0x3d, 0x0e, 0x11, 0x30, 0x3c, 0x7c, 0xc8, 0xf8, + 0x3c, 0xe0, 0x88, 0x10, 0x3d, 0x4b, 0xaa, 0xbe, 0xbd, 0xa4, 0x0a, 0x5b, 0x3d, + 0xe2, 0x3c, 0x94, 0x3d, 0xdd, 0x36, 0x95, 0xbd, 0xc7, 0x70, 0x89, 0xbd, 0x95, + 0xe7, 0x89, 0x3d, 0x91, 0x0e, 0x23, 0x3c, 0xfe, 0x32, 0x4f, 0x3b, 0xd4, 0x79, + 0xc2, 0x3d, 0x52, 0xab, 0xb4, 0xbd, 0xb3, 0x98, 0xd2, 0x3d, 0xb8, 0x70, 0x88, + 0xbd, 0x2e, 0x3e, 0x77, 0x3d, 0xb5, 0x44, 0x00, 
0x3d, 0xb4, 0xe9, 0x59, 0x3d, + 0xae, 0x3b, 0x9d, 0x3d, 0x3d, 0x89, 0x36, 0x3d, 0x22, 0x67, 0x9b, 0xbb, 0xca, + 0xca, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0xcf, 0x02, + 0xcf, 0x3d, 0x6b, 0xe2, 0x84, 0x3d, 0x62, 0xaa, 0xdc, 0x3d, 0xdf, 0x55, 0xef, + 0x3b, 0xc1, 0x2b, 0x41, 0xbd, 0x6e, 0x82, 0xb3, 0xbd, 0x08, 0xc0, 0x6c, 0xbd, + 0x7c, 0xb9, 0x10, 0xbe, 0x97, 0x76, 0xbb, 0xbc, 0xa3, 0x52, 0x00, 0xbe, 0xd9, + 0x90, 0x32, 0xbe, 0xac, 0x38, 0x62, 0x3d, 0x6c, 0xdc, 0xae, 0xbc, 0x2a, 0x7d, + 0x01, 0xbe, 0x2f, 0xf8, 0x30, 0xbd, 0x8f, 0x24, 0x45, 0xbe, 0x0c, 0x74, 0x1f, + 0xbe, 0x5e, 0x0b, 0x0f, 0xbd, 0xf7, 0xb6, 0xc5, 0x3d, 0xe9, 0x3c, 0xbb, 0xbd, + 0x61, 0x11, 0x19, 0x3d, 0x68, 0xf0, 0x44, 0x3e, 0x26, 0x64, 0x95, 0x3c, 0xa1, + 0xde, 0x54, 0x3d, 0x25, 0x8b, 0x14, 0x3e, 0x0f, 0xed, 0xfe, 0x3b, 0x1b, 0x37, + 0xf4, 0xbd, 0x9e, 0x28, 0xbd, 0x3d, 0x26, 0x5c, 0xca, 0x3d, 0xbb, 0xad, 0x02, + 0x3d, 0x1f, 0xc1, 0x25, 0x3e, 0x85, 0x0a, 0x39, 0xbe, 0xfa, 0xc3, 0xf7, 0xbd, + 0xda, 0x75, 0xc6, 0xbd, 0x06, 0x2d, 0x4a, 0x3c, 0x1a, 0xc1, 0x94, 0xbd, 0xb0, + 0x62, 0xa0, 0xbd, 0x63, 0x0c, 0x0e, 0xbe, 0xf3, 0x67, 0x01, 0xbe, 0xd9, 0x42, + 0x48, 0xbe, 0xaa, 0xf0, 0xf6, 0xbd, 0xc7, 0xa6, 0x39, 0xbe, 0xf6, 0xef, 0xb2, + 0x3d, 0xe6, 0x6f, 0xd7, 0xbd, 0x14, 0x4f, 0xfb, 0xbc, 0x7f, 0xb1, 0x86, 0x3d, + 0xcc, 0xca, 0xd9, 0xbd, 0x34, 0x6f, 0x3e, 0xbc, 0x90, 0x24, 0xe8, 0x3d, 0xda, + 0x5a, 0xf9, 0x3d, 0x78, 0xc9, 0xf0, 0xbd, 0x1e, 0x50, 0xa5, 0x3d, 0xce, 0xed, + 0x6d, 0xbd, 0x65, 0x3b, 0x62, 0xbd, 0x52, 0x36, 0x3d, 0xbd, 0xf8, 0x54, 0x70, + 0x3d, 0x01, 0x85, 0x39, 0x3c, 0x57, 0xf0, 0xa8, 0xbc, 0xf5, 0x69, 0xda, 0xbd, + 0xd5, 0x00, 0xda, 0x3d, 0x47, 0x0a, 0xe6, 0x3d, 0xf1, 0xed, 0xae, 0xbd, 0x1b, + 0x51, 0x93, 0x3d, 0x25, 0x8d, 0x1e, 0x3e, 0x65, 0x36, 0x24, 0x3e, 0xab, 0x4e, + 0x3b, 0xbe, 0x73, 0x91, 0x7b, 0x3d, 0x79, 0x2a, 0xa6, 0x3c, 0x6e, 0x13, 0x29, + 0x3e, 0xae, 0x98, 0x8b, 0x3d, 0x61, 0xec, 0x36, 0xbe, 0xee, 0xd9, 0x8a, 0x3d, + 0xe8, 0xd8, 0xff, 0xbd, 0x87, 0xae, 0x13, 0xbe, 0x45, 0x02, 0xae, 0x3d, 0xbc, + 0x03, 0x94, 0xbd, 0xf6, 0x5b, 0x17, 0xbe, 0x3c, 0x46, 0x15, 0x3e, 0x99, 0xe3, + 0x3b, 0x3e, 0x6c, 0x0a, 0x82, 0xbd, 0x67, 0xb1, 0xb4, 0x3c, 0x68, 0xc6, 0x0a, + 0x3e, 0x7f, 0xe1, 0xa5, 0x3d, 0x38, 0x5c, 0x61, 0x3e, 0x0d, 0x37, 0xdd, 0xbd, + 0x14, 0xae, 0xff, 0xbc, 0x00, 0xba, 0x97, 0x3d, 0x61, 0xf4, 0xd7, 0x3c, 0xb9, + 0x7e, 0x0b, 0xbe, 0x87, 0xa5, 0x59, 0xbc, 0x01, 0x95, 0x19, 0x3c, 0x3e, 0xf3, + 0x72, 0xbd, 0x8b, 0x32, 0x0e, 0xbe, 0x8e, 0x5c, 0x30, 0x3e, 0xd1, 0x09, 0x10, + 0x3e, 0xfb, 0xc9, 0x13, 0x3e, 0x82, 0x6f, 0xe2, 0x3d, 0x71, 0xd7, 0xc8, 0xbd, + 0x57, 0x14, 0xbb, 0xbd, 0x0f, 0x10, 0x40, 0x3d, 0xa6, 0x30, 0x1e, 0x3d, 0xc8, + 0x3f, 0x4a, 0x3e, 0x06, 0xe9, 0x15, 0xbd, 0x8a, 0x87, 0x11, 0x3e, 0xe2, 0xa4, + 0x0b, 0xbe, 0xe5, 0x96, 0x3d, 0x3e, 0x5e, 0x78, 0x0c, 0x3e, 0x32, 0x79, 0x7a, + 0xba, 0x24, 0x9f, 0x1f, 0xbe, 0xe1, 0x2d, 0xc3, 0xbc, 0xdf, 0x43, 0xb4, 0xbd, + 0xb1, 0x00, 0xde, 0x3d, 0x7e, 0x34, 0x4b, 0xbe, 0xeb, 0x21, 0xdd, 0xbd, 0xbe, + 0x43, 0xe2, 0xbd, 0x4b, 0x49, 0x9f, 0x3d, 0xa3, 0xd0, 0x8e, 0x3d, 0xdf, 0x84, + 0x17, 0xbe, 0x12, 0x0b, 0xc8, 0xbd, 0xcb, 0x0e, 0x64, 0xbd, 0xdd, 0x25, 0x83, + 0xbd, 0xa0, 0x78, 0x1b, 0x3e, 0x2e, 0x77, 0x1e, 0xbe, 0x94, 0x81, 0xc8, 0xbd, + 0x8d, 0x3e, 0xba, 0xbd, 0xff, 0xe9, 0x32, 0x3e, 0xb0, 0x76, 0xb9, 0xbd, 0xfd, + 0x8a, 0x71, 0xbd, 0xab, 0xf3, 0x4c, 0xbc, 0x0c, 0xa0, 0x0c, 0x3e, 0xa2, 0x36, + 0xb2, 0xbc, 0x1b, 0x34, 0xb2, 0xbd, 0x44, 0x18, 0x8c, 0xbd, 0xa3, 0xe3, 0x83, + 0xbd, 0x45, 0x8c, 0xae, 0xbd, 0x4e, 0x7d, 0x09, 0xbe, 0xdf, 0x58, 0x19, 0xbd, + 
0xae, 0x8f, 0x5f, 0x3d, 0xa7, 0x36, 0x80, 0xbd, 0xfb, 0x12, 0x22, 0x3e, 0x25, + 0x11, 0x99, 0xbb, 0x51, 0xc9, 0x4a, 0x3d, 0x99, 0x68, 0x32, 0x3e, 0x44, 0xcc, + 0x7a, 0xbc, 0xa8, 0x46, 0xb7, 0x3d, 0x5f, 0xbb, 0x8a, 0xbd, 0xd3, 0xbb, 0x3a, + 0x3e, 0x46, 0x2c, 0x89, 0x3d, 0x26, 0xcb, 0x79, 0x3d, 0xe1, 0x45, 0x40, 0xbd, + 0x01, 0xc4, 0xe3, 0x3d, 0x42, 0x18, 0x24, 0x3e, 0x34, 0x73, 0x19, 0x3e, 0x00, + 0x53, 0xb7, 0x3d, 0x33, 0x6d, 0xf8, 0x3c, 0x2c, 0x5d, 0x3f, 0xbd, 0x85, 0xa9, + 0x1b, 0xbe, 0x18, 0xda, 0xb8, 0xbc, 0xaa, 0x92, 0xb4, 0x3d, 0x53, 0x65, 0x43, + 0x3e, 0x4f, 0xda, 0x03, 0xbd, 0xba, 0x8e, 0x40, 0xbe, 0xc1, 0x11, 0xb8, 0xbb, + 0x3e, 0x07, 0x66, 0x3e, 0xb8, 0x25, 0xe0, 0x3c, 0x7f, 0x4d, 0x0f, 0xbd, 0x35, + 0x57, 0xaa, 0xbd, 0xe5, 0x8b, 0xec, 0xbd, 0x70, 0xda, 0x08, 0xbc, 0x03, 0xc2, + 0xf5, 0xbb, 0xa5, 0x57, 0x83, 0xbd, 0xf1, 0x0b, 0x74, 0x3e, 0x9a, 0x63, 0x5a, + 0xbd, 0x8f, 0xb3, 0xa1, 0xbb, 0xe3, 0x0a, 0xd1, 0x3c, 0xa8, 0xc3, 0xfd, 0x3d, + 0x58, 0x80, 0x04, 0xbe, 0xfb, 0xca, 0xe0, 0x3d, 0x01, 0x75, 0x04, 0xbe, 0xbe, + 0xa9, 0x55, 0xbd, 0x59, 0x90, 0xff, 0xbd, 0x6a, 0xf0, 0x64, 0xbd, 0x89, 0xdc, + 0x1d, 0xbe, 0xb8, 0x8f, 0x26, 0xbd, 0x3b, 0x31, 0xc8, 0xbd, 0x2c, 0x3d, 0x88, + 0xbd, 0x48, 0xea, 0x0f, 0xbd, 0xce, 0x3f, 0x22, 0x3d, 0x8b, 0x31, 0xe7, 0x3d, + 0xa1, 0x13, 0x55, 0xbd, 0x2a, 0x96, 0xcc, 0x3d, 0xa1, 0xd9, 0xcf, 0x3d, 0x9f, + 0x0f, 0xcf, 0x3c, 0xac, 0x8b, 0xa4, 0xbc, 0x88, 0x69, 0xb6, 0x3d, 0x35, 0x40, + 0xc8, 0x3d, 0x5a, 0x6e, 0x23, 0xbe, 0x5f, 0xd9, 0x17, 0xbe, 0x4b, 0x8e, 0x9f, + 0xbd, 0x44, 0xeb, 0x15, 0xbe, 0xe9, 0x93, 0xba, 0x3d, 0x4b, 0x93, 0x08, 0xbe, + 0x79, 0x4d, 0x09, 0x3e, 0x5a, 0x98, 0x6d, 0xbd, 0x02, 0x95, 0x24, 0xbe, 0x80, + 0x67, 0x9d, 0xbd, 0xd2, 0x10, 0x1f, 0xbe, 0x64, 0xd2, 0x62, 0xbd, 0x01, 0x92, + 0x09, 0x3e, 0x96, 0x6e, 0xca, 0xbd, 0x62, 0x32, 0xf3, 0xbd, 0xe1, 0x10, 0x50, + 0x3d, 0x61, 0x3e, 0xdc, 0x3d, 0x7e, 0x6e, 0xd5, 0xbd, 0xf4, 0xea, 0x1f, 0x3e, + 0x2a, 0xd2, 0x10, 0xbd, 0x04, 0xa4, 0xdd, 0x3b, 0x7f, 0x19, 0x50, 0xbd, 0xad, + 0x49, 0x0e, 0x3e, 0x63, 0x14, 0xe3, 0x3d, 0x6f, 0x2d, 0x99, 0x3d, 0x4a, 0x0b, + 0x08, 0xbe, 0xd6, 0x54, 0xdd, 0xbd, 0xfb, 0x6b, 0x9e, 0xbd, 0xc0, 0x42, 0xe9, + 0xbd, 0xba, 0xef, 0x40, 0xbb, 0x9c, 0x44, 0xc5, 0x3d, 0x1e, 0x3a, 0xde, 0xbd, + 0xce, 0x6d, 0xef, 0x3d, 0x92, 0x4d, 0xf6, 0xbd, 0xa3, 0xc5, 0x0c, 0xbe, 0x74, + 0x63, 0xd8, 0xbd, 0xff, 0xd4, 0x11, 0x3e, 0x02, 0x10, 0x28, 0xbd, 0x86, 0xf5, + 0x4f, 0x3d, 0x6a, 0xfb, 0xc6, 0x3d, 0x6d, 0x29, 0x1f, 0xbe, 0xa4, 0x55, 0xab, + 0x3d, 0xaa, 0xc8, 0xc7, 0x3d, 0xf4, 0xec, 0x59, 0x3d, 0xd1, 0x44, 0x75, 0x3d, + 0xe6, 0x18, 0x3c, 0x3e, 0xd7, 0x83, 0xb5, 0x3d, 0xdc, 0xa3, 0xb1, 0xbd, 0xbb, + 0xa7, 0x73, 0xbd, 0x03, 0x00, 0x3c, 0x3d, 0x3b, 0x59, 0x8d, 0xbd, 0x27, 0x1f, + 0x07, 0xbe, 0x46, 0x5f, 0xcf, 0xbd, 0x5b, 0xf5, 0x13, 0xbe, 0xe9, 0xa9, 0x1b, + 0x3e, 0x05, 0x6e, 0x0e, 0x3e, 0xd2, 0xa7, 0xad, 0xbc, 0x55, 0xda, 0x12, 0x3e, + 0xd4, 0xd5, 0xcc, 0xbd, 0x5e, 0x0d, 0x33, 0xbe, 0x5f, 0xfa, 0x99, 0xbd, 0xa1, + 0xd4, 0x96, 0xbd, 0x7b, 0xec, 0x08, 0x3d, 0xf0, 0x43, 0x04, 0xbe, 0xd6, 0x6a, + 0x3e, 0x3d, 0x9c, 0x4c, 0xa5, 0xbd, 0xc1, 0x25, 0xeb, 0x3c, 0x00, 0x84, 0x7f, + 0xbd, 0x8e, 0x5b, 0x2d, 0xbd, 0x5a, 0x0d, 0x93, 0x3c, 0x14, 0x09, 0x5e, 0x3d, + 0x0e, 0x7c, 0x25, 0x3d, 0x4b, 0x3f, 0x0f, 0xbe, 0xad, 0x31, 0xd8, 0xbd, 0x81, + 0xa4, 0x66, 0xbd, 0x25, 0x37, 0x32, 0xbe, 0x64, 0x42, 0x6f, 0x3d, 0x9c, 0xdb, + 0xc2, 0x3d, 0x1f, 0x78, 0xcc, 0x3c, 0x45, 0xa8, 0x0c, 0x3e, 0xe8, 0x27, 0xe3, + 0x3d, 0xbf, 0xb1, 0xff, 0x3d, 0x3e, 0x13, 0xc6, 0x3d, 0xf2, 0x5b, 0x64, 0x3d, + 0xf1, 0xf8, 0x16, 0x3e, 0x24, 
0x46, 0x40, 0x3d, 0xa1, 0x7e, 0x99, 0x3c, 0x6d, + 0x30, 0x1e, 0xbe, 0x04, 0xdd, 0x2a, 0xbe, 0x03, 0x25, 0x20, 0xbd, 0x07, 0xf4, + 0x74, 0xbc, 0xc8, 0x71, 0x03, 0xbd, 0x46, 0xf3, 0xd9, 0xbc, 0x33, 0x6d, 0xbb, + 0xbd, 0xbd, 0x8a, 0xd5, 0x3d, 0x68, 0xbd, 0x9e, 0xbc, 0x1c, 0x26, 0x09, 0xbe, + 0x0f, 0x3c, 0x9d, 0xbd, 0xde, 0x13, 0x53, 0xbd, 0x73, 0xe9, 0x90, 0x3d, 0xdc, + 0x50, 0xef, 0x3c, 0x6f, 0x00, 0x32, 0xbc, 0x42, 0x79, 0x18, 0x3e, 0xa8, 0xe4, + 0xb3, 0xbd, 0x04, 0x2f, 0x6e, 0xbd, 0x41, 0xb2, 0x51, 0x3e, 0x56, 0x54, 0xe7, + 0x3d, 0x0c, 0x44, 0xbb, 0xbd, 0xa4, 0xce, 0x8b, 0x3c, 0xad, 0x8a, 0xec, 0x3d, + 0xf7, 0xc9, 0x44, 0xbd, 0xc5, 0xdc, 0x2a, 0x3b, 0xde, 0x9e, 0xb6, 0x3d, 0x20, + 0x2c, 0x1c, 0xbe, 0x04, 0x0c, 0x9f, 0xbd, 0x41, 0x5f, 0xd4, 0xbd, 0x76, 0x92, + 0x06, 0xbe, 0x6a, 0x98, 0x30, 0xbe, 0xc4, 0xa0, 0xd3, 0x3c, 0x38, 0x33, 0xf5, + 0xbd, 0x94, 0x28, 0x0d, 0xbd, 0x42, 0x60, 0x1e, 0x3d, 0xfd, 0x72, 0xca, 0x3d, + 0xee, 0xf6, 0x0d, 0x3e, 0x35, 0xb3, 0x27, 0x3e, 0x15, 0xde, 0x08, 0xbe, 0x34, + 0xc4, 0x8b, 0xbd, 0x4a, 0x4f, 0x9a, 0x3d, 0x87, 0x8f, 0x06, 0xbc, 0x68, 0x43, + 0x10, 0xbd, 0x36, 0x40, 0xb6, 0xbc, 0xf2, 0xad, 0x82, 0xbd, 0xc5, 0xef, 0x13, + 0xbe, 0x4c, 0x38, 0xcd, 0xbd, 0x4a, 0xdf, 0x9d, 0x3c, 0x9d, 0xb0, 0x9a, 0x3d, + 0xe8, 0xf7, 0xd4, 0x3d, 0x9d, 0x50, 0x34, 0x3d, 0xc9, 0x92, 0xdf, 0x3d, 0x20, + 0x66, 0xeb, 0x3d, 0x54, 0x5c, 0x85, 0xbd, 0x2d, 0x0e, 0xc6, 0x3d, 0x90, 0xea, + 0x64, 0xbd, 0xcd, 0xa5, 0x5c, 0xbd, 0x77, 0x8d, 0x7b, 0x3d, 0xf7, 0xda, 0x98, + 0xbd, 0xc2, 0x98, 0xcb, 0x3d, 0x79, 0xa4, 0x2d, 0x3d, 0x52, 0x42, 0x15, 0x3e, + 0xc5, 0x68, 0x47, 0xbd, 0xbf, 0xa0, 0xe7, 0xbd, 0xbf, 0xa4, 0xbd, 0x3b, 0x6f, + 0xe3, 0x05, 0xbd, 0xd3, 0xda, 0xdb, 0xbd, 0x40, 0x3a, 0xa8, 0xbd, 0x87, 0x88, + 0x36, 0xbe, 0xaf, 0x1d, 0xe5, 0x3d, 0xf6, 0xe8, 0x2e, 0xbe, 0xbc, 0x78, 0x9b, + 0x3d, 0x8b, 0x27, 0xf6, 0xbd, 0x18, 0x45, 0xef, 0xbd, 0x8c, 0x3f, 0x3e, 0x3e, + 0x94, 0x69, 0x16, 0xbe, 0x4f, 0xce, 0x48, 0xbe, 0x0c, 0xfa, 0x0b, 0xbc, 0x01, + 0x50, 0x37, 0x3e, 0x87, 0x13, 0x0b, 0xbe, 0xd0, 0xb1, 0x38, 0x3e, 0x71, 0x2c, + 0xa1, 0x3d, 0x4a, 0x15, 0xb4, 0xbd, 0x80, 0x28, 0x2b, 0xbd, 0xc7, 0x3d, 0x7e, + 0x3c, 0xe5, 0xe1, 0xf1, 0x3d, 0x43, 0x56, 0x2c, 0x3d, 0x18, 0xba, 0x20, 0xbe, + 0x4e, 0x30, 0x8d, 0x3d, 0x0b, 0x52, 0x20, 0x3b, 0x2d, 0xbc, 0x48, 0xbd, 0xf8, + 0xff, 0xcf, 0xbb, 0x34, 0xb2, 0xaf, 0x3c, 0xea, 0xad, 0xf0, 0x3d, 0xed, 0xbd, + 0x8d, 0x3d, 0x41, 0x8c, 0xde, 0xbd, 0xb0, 0xb4, 0x32, 0x3e, 0xf8, 0x16, 0x2e, + 0xbe, 0x0c, 0x4a, 0x8c, 0x3d, 0x89, 0x92, 0x13, 0x3e, 0x8b, 0xd2, 0xbb, 0xbd, + 0xf5, 0xce, 0x0f, 0x3e, 0x31, 0x82, 0x7b, 0xbb, 0x7f, 0xac, 0x0e, 0x3e, 0x9f, + 0xe7, 0x0a, 0xbe, 0x5b, 0xef, 0x2b, 0x3d, 0xa9, 0x7f, 0x0d, 0x3e, 0xa4, 0xc0, + 0xde, 0x3d, 0xde, 0x0d, 0xbc, 0xbc, 0x59, 0x6f, 0x81, 0x3a, 0x46, 0x0c, 0x1b, + 0xbe, 0xd0, 0xba, 0xf5, 0xbc, 0xe5, 0x6d, 0x1d, 0x3e, 0x31, 0x08, 0x5a, 0x3d, + 0xab, 0x1c, 0xb5, 0xbc, 0xe7, 0xaa, 0x18, 0x3e, 0xaa, 0xcc, 0x14, 0x3e, 0x4e, + 0x1e, 0x08, 0xbd, 0xfc, 0x9f, 0xbe, 0xbd, 0x44, 0x7b, 0x2b, 0xbe, 0xf1, 0xfa, + 0x90, 0x3c, 0xa4, 0x75, 0x16, 0xbe, 0x27, 0x3b, 0x05, 0xbe, 0xf3, 0x41, 0xde, + 0xbd, 0xb9, 0x96, 0x10, 0xbd, 0xd0, 0x44, 0x6a, 0x3b, 0x5b, 0x04, 0x02, 0xbe, + 0x3c, 0xf7, 0x41, 0xbd, 0xe6, 0xaf, 0x06, 0xbe, 0x52, 0x74, 0x08, 0x3e, 0xda, + 0x81, 0x54, 0x3d, 0xcd, 0xe8, 0xbc, 0x3d, 0xf8, 0x07, 0xdc, 0x3d, 0x84, 0x6f, + 0xd8, 0xbd, 0xe0, 0x65, 0x2a, 0x3e, 0x04, 0xae, 0xe1, 0xbd, 0x34, 0xd5, 0x27, + 0xbd, 0x5c, 0xb4, 0x70, 0xbd, 0x0d, 0x68, 0xfa, 0x3d, 0x04, 0xb0, 0xc5, 0xbd, + 0xa0, 0xf7, 0x87, 0x3d, 0xdc, 0x08, 0x18, 0x3e, 0x86, 0xb9, 
0x0f, 0xbe, 0x21, + 0x03, 0x75, 0x3d, 0x2b, 0x4f, 0x15, 0xbd, 0x3c, 0x86, 0x8e, 0xbc, 0xc7, 0xd0, + 0x73, 0x3d, 0xe0, 0x50, 0x37, 0x3c, 0xd6, 0x8d, 0xce, 0x3d, 0x3b, 0x42, 0x1b, + 0x3e, 0xa9, 0xfc, 0x29, 0x3e, 0xe4, 0x58, 0x1d, 0x3d, 0x5d, 0xab, 0x3b, 0xbe, + 0x28, 0x32, 0x07, 0xbd, 0x54, 0x37, 0x9c, 0x3d, 0xd4, 0xdd, 0x04, 0x3d, 0x28, + 0xe1, 0xad, 0xbc, 0x98, 0x0e, 0x13, 0x3e, 0xae, 0x57, 0x2a, 0xbe, 0xc4, 0xf0, + 0x70, 0xbd, 0xf9, 0x8d, 0x0d, 0xbe, 0x5e, 0x46, 0x17, 0xbe, 0x90, 0x6a, 0xbc, + 0x3d, 0x12, 0xa1, 0xf3, 0xbd, 0x0f, 0xf9, 0x88, 0xbd, 0x60, 0xd9, 0x2f, 0xbd, + 0x07, 0x99, 0xa2, 0xbd, 0x0b, 0xa5, 0x1b, 0xbc, 0x92, 0x9d, 0xaf, 0xbc, 0x37, + 0xf5, 0x5a, 0x3c, 0x88, 0xf0, 0xcf, 0x3d, 0x96, 0xdd, 0x54, 0x3d, 0x2f, 0xd2, + 0x0a, 0x3e, 0xe5, 0xbd, 0x46, 0x3c, 0xd2, 0x65, 0xcb, 0xbd, 0x19, 0x00, 0x0b, + 0xbe, 0xd6, 0xf6, 0xb0, 0x3d, 0x39, 0xc2, 0x14, 0x3e, 0x44, 0x63, 0x3f, 0x3e, + 0x4a, 0x6c, 0x1d, 0x3e, 0xf3, 0x6a, 0xe1, 0xbc, 0x31, 0xa5, 0x28, 0xbe, 0x54, + 0x4d, 0x49, 0xbd, 0xd4, 0xbf, 0x64, 0xbd, 0xec, 0x58, 0xbc, 0xbd, 0xff, 0xc6, + 0xd0, 0x3c, 0xb7, 0xf1, 0xa7, 0x3d, 0x55, 0x15, 0x26, 0xbd, 0xe6, 0x14, 0xe2, + 0x3c, 0x6b, 0x28, 0x05, 0x3e, 0x83, 0xaf, 0xbc, 0xbd, 0xc6, 0xb7, 0x6a, 0x3d, + 0x6f, 0xa9, 0x01, 0x3e, 0x93, 0x78, 0x62, 0xb9, 0x23, 0x46, 0x3f, 0xbd, 0x89, + 0xbd, 0x88, 0x3d, 0x4d, 0xeb, 0xa0, 0x3d, 0x5e, 0x68, 0x74, 0xbd, 0x3d, 0xe2, + 0x86, 0xbd, 0x11, 0x15, 0x62, 0xbd, 0x01, 0xde, 0xc8, 0xbd, 0xf0, 0x96, 0xc0, + 0xbd, 0xf4, 0x9d, 0xff, 0xbd, 0x04, 0xcb, 0x80, 0x3c, 0x4f, 0x43, 0x35, 0x3d, + 0x65, 0x45, 0x6c, 0x3d, 0x45, 0x55, 0xaa, 0xbc, 0xe1, 0x1a, 0x59, 0x3d, 0x4c, + 0x54, 0x20, 0xbe, 0x35, 0xaf, 0xe3, 0x3d, 0xd2, 0x5e, 0xae, 0xbd, 0xa7, 0xaa, + 0x15, 0x3e, 0xea, 0x3c, 0xe9, 0x3c, 0xa4, 0xc9, 0x08, 0xbe, 0xca, 0xec, 0x82, + 0x3b, 0x8b, 0x49, 0xfa, 0xbd, 0x9d, 0x1e, 0x8b, 0xbc, 0x1b, 0xb4, 0xed, 0xbd, + 0x1d, 0xbe, 0xc9, 0x3d, 0x8c, 0xdf, 0x2a, 0xbe, 0x8c, 0xba, 0xe3, 0x3d, 0x1f, + 0xa2, 0x14, 0x3d, 0x61, 0xf2, 0xcf, 0xba, 0xd5, 0x67, 0x88, 0xbd, 0xa7, 0xd0, + 0x5d, 0x3e, 0x71, 0x6e, 0xfd, 0x3d, 0xd5, 0xcf, 0x02, 0xbd, 0x0c, 0x25, 0xb5, + 0x3c, 0xa6, 0x27, 0x90, 0x3c, 0x86, 0x80, 0x1c, 0x3e, 0x41, 0x4f, 0x02, 0xbe, + 0xe1, 0x7a, 0x28, 0x3e, 0xef, 0xf7, 0x96, 0xbd, 0x0f, 0x11, 0xd3, 0x3d, 0xd9, + 0x11, 0x00, 0x3e, 0x77, 0x16, 0x98, 0x3d, 0x6a, 0xbc, 0x03, 0xbe, 0xbc, 0x2b, + 0xc9, 0xbd, 0xc0, 0xc5, 0x99, 0x3d, 0xf4, 0x17, 0xc9, 0x3d, 0x37, 0xc7, 0xea, + 0x3d, 0xd0, 0x01, 0x29, 0xbe, 0xae, 0xfd, 0x37, 0xbd, 0x7a, 0xce, 0xba, 0xbc, + 0x7d, 0x16, 0x19, 0x3e, 0x2b, 0x5f, 0x32, 0x3a, 0x54, 0x01, 0x96, 0xbd, 0xd6, + 0xb6, 0x73, 0x3c, 0x8f, 0x5c, 0xa9, 0x3c, 0x67, 0x4e, 0xac, 0x3d, 0x52, 0x49, + 0xab, 0x3d, 0x05, 0x07, 0x29, 0x3e, 0x43, 0x4c, 0x28, 0xbe, 0x0c, 0x1a, 0x12, + 0xbe, 0x05, 0x18, 0x3c, 0x3c, 0x29, 0x0f, 0x22, 0x3e, 0xf3, 0x49, 0x54, 0x3e, + 0xbf, 0xcd, 0x46, 0x3d, 0xea, 0x9f, 0x53, 0x3d, 0xf6, 0xcc, 0xb5, 0x3d, 0x80, + 0x51, 0x9e, 0x3d, 0xff, 0xc1, 0x69, 0x3d, 0x94, 0x19, 0x41, 0xbd, 0x7b, 0x33, + 0x75, 0x3c, 0x9e, 0x51, 0x2f, 0x3e, 0x58, 0x6e, 0x21, 0x3c, 0x46, 0x38, 0x22, + 0x3e, 0x73, 0xf9, 0x15, 0xbe, 0xfa, 0x12, 0x04, 0xbe, 0xaf, 0x1d, 0x1e, 0xbe, + 0xad, 0x03, 0x11, 0xbe, 0xb3, 0xa7, 0x07, 0x3d, 0x4b, 0x76, 0x58, 0xbd, 0x68, + 0xaa, 0x21, 0xbe, 0x18, 0xb3, 0x24, 0xbe, 0x59, 0xa7, 0x9d, 0xbd, 0x8a, 0x64, + 0x92, 0x3d, 0xf4, 0xe8, 0x00, 0xbe, 0xed, 0xd4, 0x85, 0x3c, 0x77, 0x84, 0xf0, + 0xbd, 0x3f, 0x0d, 0x37, 0x3e, 0x2c, 0x42, 0x64, 0x3c, 0x5b, 0x23, 0x27, 0x3e, + 0x3e, 0xc6, 0xb0, 0x3d, 0x1c, 0xba, 0xfe, 0xbc, 0xcf, 0xde, 0xb4, 0xbc, 0x97, + 0x05, 0x1c, 
0xbd, 0x0d, 0xa5, 0x92, 0xbb, 0x6a, 0x79, 0x50, 0x3e, 0x62, 0x30, + 0x19, 0x3e, 0xd7, 0x23, 0x02, 0x3e, 0x9d, 0xc1, 0x7e, 0x3d, 0xb5, 0x03, 0x9c, + 0xbd, 0x7b, 0xc5, 0x72, 0x3d, 0xc3, 0xd4, 0x22, 0xbe, 0x55, 0x27, 0x63, 0x3d, + 0xb7, 0x8f, 0x2e, 0xbe, 0x18, 0xe1, 0xbd, 0xbd, 0xa9, 0x10, 0xf0, 0xbd, 0x51, + 0xd4, 0x4d, 0x3d, 0x62, 0x08, 0xe2, 0x3d, 0x3b, 0xf4, 0x5e, 0x3d, 0xa1, 0xeb, + 0xb4, 0x3d, 0xed, 0x6f, 0x72, 0x3d, 0x1c, 0x3b, 0xba, 0xbd, 0x56, 0xa6, 0xc8, + 0xbd, 0x1e, 0x39, 0x3b, 0xbe, 0x83, 0xc7, 0xb4, 0x3d, 0x04, 0xe6, 0xd6, 0x3d, + 0x2a, 0x2c, 0x91, 0x3d, 0x78, 0x72, 0x9f, 0x3d, 0x62, 0xf9, 0xdd, 0xbd, 0x21, + 0x97, 0x28, 0xbe, 0x52, 0xaa, 0x06, 0x3e, 0x55, 0x9e, 0x26, 0xbe, 0xb0, 0x2a, + 0x4f, 0xbd, 0x72, 0x66, 0xeb, 0x3c, 0xa8, 0x84, 0xed, 0x3d, 0x02, 0xca, 0xaf, + 0xbd, 0xbd, 0x90, 0x64, 0xbd, 0x91, 0xd5, 0x81, 0xbd, 0xcd, 0x4a, 0x24, 0x3e, + 0x57, 0x13, 0x44, 0xbd, 0x35, 0x93, 0x1b, 0xbb, 0x9e, 0x75, 0xe0, 0x3d, 0x86, + 0xfb, 0x25, 0xbe, 0x7a, 0xe1, 0xe5, 0x3d, 0x15, 0x97, 0x28, 0x3d, 0xa5, 0x78, + 0xe4, 0x3d, 0x22, 0xf8, 0x0d, 0x3d, 0x18, 0xbb, 0xcb, 0xbc, 0xfc, 0x53, 0x99, + 0xbd, 0xd5, 0x40, 0xcc, 0xbd, 0x2e, 0x47, 0xf6, 0x3d, 0xd0, 0x5c, 0x1c, 0xbb, + 0xac, 0x38, 0xb3, 0x3c, 0x25, 0xfd, 0x8e, 0x3c, 0xd0, 0xc9, 0x4c, 0xbd, 0x37, + 0xc4, 0xfe, 0xbd, 0x1d, 0xca, 0x17, 0xbe, 0x54, 0x50, 0x8f, 0xbd, 0xc1, 0xfb, + 0xed, 0xbd, 0xb9, 0x2f, 0x24, 0x3e, 0xc0, 0x6d, 0x1c, 0xbe, 0xe2, 0xd7, 0x95, + 0x3d, 0x21, 0xa6, 0x7c, 0x3d, 0x1b, 0x02, 0x3c, 0x3d, 0xc6, 0x73, 0x4b, 0x3d, + 0x28, 0x7a, 0xcf, 0x3d, 0x6c, 0x4f, 0xf5, 0x3c, 0x0a, 0x47, 0x88, 0xbd, 0xe1, + 0xc9, 0x39, 0xbe, 0x0d, 0x2d, 0x04, 0x3c, 0x80, 0xf8, 0xd7, 0xbb, 0x8e, 0xa6, + 0xf3, 0xbd, 0x10, 0x3c, 0xe1, 0x3d, 0xde, 0x10, 0xb2, 0xbd, 0x9c, 0x3f, 0x46, + 0xbd, 0xd4, 0x42, 0x01, 0x3e, 0x63, 0x0f, 0x82, 0x3d, 0xab, 0x71, 0xe9, 0xbd, + 0x06, 0xe4, 0x11, 0x3e, 0x12, 0x15, 0x0a, 0xbe, 0x46, 0x0a, 0x5a, 0xbd, 0x83, + 0xff, 0x9a, 0xbc, 0xe4, 0x96, 0xdc, 0xbd, 0xc7, 0xaf, 0x7a, 0x3d, 0x64, 0x84, + 0xbe, 0x3d, 0x90, 0x0c, 0x04, 0xbd, 0xb4, 0x26, 0xb1, 0xbc, 0x35, 0xf6, 0x23, + 0x3e, 0x81, 0x0c, 0x89, 0xbd, 0x8a, 0xe7, 0xd7, 0xbc, 0x3b, 0xce, 0xa5, 0x3d, + 0xc1, 0x40, 0x83, 0x3d, 0x44, 0x14, 0x9a, 0x3d, 0xeb, 0x57, 0xbe, 0x3c, 0xde, + 0x7c, 0x01, 0x3d, 0xa0, 0x13, 0xe4, 0xbc, 0x54, 0xae, 0xca, 0x3d, 0x9d, 0xd5, + 0xc7, 0x3b, 0x59, 0x7b, 0xfc, 0xbd, 0xae, 0x12, 0x00, 0x3e, 0x79, 0xac, 0x07, + 0x3e, 0x40, 0x9b, 0x83, 0xbd, 0x7b, 0xb9, 0xeb, 0xbb, 0x12, 0x58, 0xf6, 0x3d, + 0x10, 0x80, 0x8c, 0xbd, 0x73, 0x18, 0xc8, 0xbd, 0x5e, 0x85, 0xbc, 0xbd, 0xf4, + 0x7c, 0xd0, 0xbd, 0x3b, 0x06, 0x66, 0xbd, 0x88, 0xaf, 0x82, 0xbc, 0x43, 0x81, + 0x80, 0x3d, 0x03, 0x7a, 0x20, 0x3e, 0xc1, 0x44, 0xd1, 0x3c, 0x2f, 0xa0, 0x76, + 0x3d, 0x63, 0x3e, 0x06, 0x3c, 0x80, 0xb6, 0xa4, 0x3d, 0x6d, 0x3d, 0x20, 0x3e, + 0xee, 0xe4, 0xb3, 0x3d, 0x3f, 0xb3, 0xfc, 0x3c, 0x66, 0x46, 0x52, 0x3e, 0x93, + 0x86, 0x14, 0xbd, 0x1f, 0x77, 0x8e, 0xbd, 0x99, 0x66, 0x88, 0x3c, 0xbb, 0xb7, + 0xc1, 0x3d, 0x30, 0x43, 0xcd, 0xbd, 0xd6, 0x81, 0xbe, 0x39, 0x60, 0x9d, 0x21, + 0xbe, 0x77, 0xb4, 0x16, 0x3e, 0x50, 0x6b, 0x88, 0xbb, 0xbe, 0x2a, 0xe1, 0xbc, + 0x7e, 0xfb, 0x13, 0xbe, 0x04, 0xd2, 0x01, 0x3e, 0xd7, 0xf2, 0xfb, 0xbd, 0xa1, + 0x97, 0xa5, 0x3d, 0x51, 0xb1, 0x1d, 0x3e, 0xa6, 0xe9, 0x11, 0x3e, 0x28, 0xe3, + 0xb0, 0xbc, 0xd6, 0xd7, 0xcf, 0xbd, 0xf7, 0x89, 0x10, 0x3e, 0x2d, 0x9d, 0x0b, + 0xbe, 0x08, 0x0a, 0x0e, 0xbd, 0xc7, 0x1e, 0x08, 0x3d, 0x18, 0x40, 0xad, 0xbd, + 0xef, 0x48, 0x05, 0xbd, 0xf6, 0xc0, 0x23, 0xbe, 0xf6, 0x7d, 0xa6, 0x3d, 0x05, + 0xb5, 0x6c, 0x3d, 0x7f, 0x05, 0xd4, 0xbd, 
0xd5, 0x2a, 0x1f, 0x3e, 0x60, 0x90, + 0xee, 0xbd, 0x82, 0x03, 0x26, 0xbd, 0x27, 0x9d, 0x05, 0xbd, 0x2d, 0x05, 0x9c, + 0x3c, 0xa0, 0x72, 0xef, 0x3d, 0x4a, 0xd9, 0xad, 0x3d, 0x9f, 0x2a, 0x46, 0xbd, + 0x47, 0x6e, 0xfb, 0xbc, 0x43, 0x4b, 0xde, 0xbd, 0xf0, 0x40, 0x97, 0x3d, 0xd9, + 0xf7, 0xe1, 0xbd, 0xbd, 0xae, 0xce, 0x3c, 0x79, 0xae, 0x8c, 0xbd, 0x34, 0xc9, + 0x34, 0xbe, 0x99, 0x0a, 0xae, 0xbd, 0xae, 0xe2, 0xe9, 0x3d, 0xe7, 0x97, 0xf7, + 0x3d, 0xd1, 0x30, 0x05, 0x3e, 0x14, 0xd3, 0x0c, 0x3d, 0xcd, 0x90, 0x63, 0x3d, + 0x50, 0xac, 0x27, 0xbd, 0x06, 0x6c, 0x30, 0xbe, 0x31, 0x20, 0xa1, 0xbd, 0xf3, + 0x98, 0x87, 0x3d, 0x31, 0x34, 0xac, 0xbd, 0x2e, 0xc3, 0xb3, 0xbb, 0xec, 0xb6, + 0x4d, 0xbd, 0x6f, 0x2c, 0x02, 0xbc, 0xcc, 0xcb, 0x80, 0xbd, 0x7b, 0x15, 0x29, + 0xbe, 0x8f, 0xb6, 0x8b, 0x3c, 0xca, 0x8b, 0x51, 0xbd, 0x64, 0x5f, 0x45, 0xbd, + 0x0f, 0xa3, 0xa4, 0x3d, 0xed, 0x79, 0x9c, 0xbd, 0x31, 0xa0, 0xbb, 0x3d, 0xe9, + 0x06, 0x26, 0x3e, 0x85, 0x78, 0x21, 0x3e, 0x81, 0x35, 0xcd, 0xbd, 0x05, 0x31, + 0x11, 0xbe, 0x9d, 0x19, 0xde, 0xbd, 0x9a, 0xd3, 0x11, 0xbe, 0x58, 0xa7, 0xff, + 0xbc, 0x9f, 0x4a, 0x29, 0x3d, 0xda, 0x56, 0x8c, 0xbc, 0xf6, 0xf9, 0x79, 0x3d, + 0x11, 0xbe, 0x82, 0x3d, 0xda, 0x43, 0x04, 0x3e, 0xed, 0xce, 0xe1, 0x3d, 0x3a, + 0x95, 0x3a, 0x3d, 0x56, 0x31, 0x4e, 0x3d, 0x82, 0x65, 0xbd, 0x3b, 0x4c, 0x6f, + 0xa8, 0xbc, 0xa4, 0xa1, 0x25, 0xbc, 0xad, 0x79, 0x2f, 0xbe, 0x73, 0xac, 0x2b, + 0x3e, 0x2d, 0x80, 0x3f, 0xbd, 0x97, 0xee, 0x80, 0xbd, 0xd8, 0x02, 0x77, 0x3d, + 0xb2, 0xcb, 0x9b, 0x3d, 0x7c, 0x94, 0xc9, 0xbd, 0xce, 0xd1, 0xdd, 0x3d, 0x12, + 0xef, 0x8b, 0x3d, 0x3a, 0xbe, 0x08, 0x3e, 0x73, 0x80, 0x1d, 0xbe, 0x2f, 0xdb, + 0x2d, 0xbe, 0x58, 0x7d, 0xd7, 0xbd, 0x44, 0x0f, 0xae, 0x3d, 0xd6, 0xe7, 0x3d, + 0x3e, 0xe0, 0x3a, 0xad, 0x3c, 0x7b, 0x10, 0x19, 0x3e, 0x1b, 0x4e, 0x78, 0xbd, + 0x3f, 0xf3, 0x07, 0xbe, 0x8c, 0xcc, 0xf7, 0xbd, 0x5a, 0x20, 0xb9, 0xbd, 0x53, + 0x04, 0x34, 0x3d, 0x6b, 0xcf, 0x24, 0x3e, 0x32, 0x1b, 0xc2, 0xbd, 0x92, 0x01, + 0xee, 0x3c, 0x79, 0x75, 0xd8, 0xbd, 0xdf, 0x4b, 0x0a, 0x3c, 0xf3, 0x93, 0xce, + 0x3d, 0x76, 0xf7, 0x31, 0xbd, 0xd7, 0x71, 0x17, 0xbe, 0xac, 0xed, 0x1f, 0xbe, + 0xb5, 0x4d, 0x46, 0x3d, 0xb0, 0xb9, 0x0b, 0xbe, 0x02, 0xb8, 0x9f, 0x3d, 0x7d, + 0x42, 0x28, 0xbe, 0x65, 0x07, 0xc7, 0x3d, 0xb2, 0xd4, 0xb5, 0x3d, 0x28, 0x07, + 0xd3, 0x3c, 0x55, 0x93, 0x2c, 0xbe, 0x79, 0x7c, 0x29, 0x3e, 0x59, 0x10, 0x0a, + 0xbe, 0x9d, 0x0a, 0x08, 0xbd, 0xa3, 0x61, 0x5d, 0x3d, 0xf8, 0xb5, 0xde, 0xbb, + 0x54, 0x24, 0xa7, 0x3d, 0xe3, 0xe4, 0x32, 0xbe, 0x20, 0x3b, 0x3d, 0xbe, 0x48, + 0x67, 0xc2, 0xbd, 0x3c, 0x7b, 0x2b, 0xbd, 0x69, 0xee, 0x56, 0xbd, 0xa9, 0x90, + 0xcb, 0x3d, 0xff, 0xf1, 0xa7, 0xbd, 0xa9, 0xd8, 0x43, 0xbd, 0xb8, 0xcd, 0xb7, + 0x3c, 0xcd, 0xfb, 0xbb, 0x3d, 0xd6, 0x26, 0x8a, 0xbd, 0x45, 0xa4, 0x81, 0x3d, + 0xd2, 0xc9, 0x29, 0x3e, 0xdb, 0xf4, 0xdd, 0xbd, 0x93, 0x95, 0xa9, 0x3d, 0x11, + 0xbb, 0x12, 0x3e, 0xdf, 0xf4, 0xcd, 0xbd, 0xb9, 0xde, 0x82, 0x3c, 0xdf, 0x26, + 0x76, 0x3d, 0xb6, 0x47, 0x32, 0xbe, 0x91, 0x0f, 0x6f, 0x3b, 0x56, 0x16, 0x4c, + 0xbe, 0x77, 0x77, 0x00, 0xbe, 0x2c, 0x1f, 0xd1, 0xbd, 0xf6, 0x43, 0x12, 0x3e, + 0xd8, 0x7c, 0x16, 0x3e, 0x26, 0xec, 0x0c, 0xbe, 0xaf, 0x69, 0xe0, 0x3d, 0x5a, + 0x3b, 0xdf, 0x3d, 0xbb, 0x0f, 0x99, 0x3d, 0xe2, 0x32, 0x2b, 0xbd, 0xf3, 0x1e, + 0x1d, 0x3e, 0x9e, 0xdc, 0xf3, 0x3c, 0x77, 0x8b, 0xf7, 0xbd, 0x46, 0xb5, 0x48, + 0xbc, 0x28, 0xce, 0xbd, 0x3c, 0x22, 0x68, 0x1a, 0x3e, 0x92, 0x40, 0xf0, 0x3c, + 0x35, 0xf1, 0xbe, 0xbd, 0x8d, 0xed, 0xd0, 0x3d, 0x93, 0x67, 0x5e, 0xbd, 0xc8, + 0xa3, 0xb0, 0xbd, 0x83, 0x61, 0x2f, 0x3d, 0x39, 0xce, 0x81, 0x3b, 0xa5, 
0x87, + 0x1d, 0x3e, 0xe0, 0x8f, 0x38, 0x3c, 0xce, 0x6f, 0x26, 0x3d, 0x09, 0x7f, 0x9a, + 0x3d, 0x6c, 0x04, 0x8f, 0xbd, 0x31, 0x13, 0x9c, 0xbb, 0xab, 0xbc, 0x3f, 0xbd, + 0xe1, 0x11, 0xc2, 0xbd, 0x47, 0xa8, 0x3a, 0x3d, 0x76, 0xc5, 0x0b, 0xbe, 0x0d, + 0x71, 0xff, 0x3d, 0x30, 0x8e, 0x41, 0x3d, 0xdc, 0xf6, 0x2d, 0xbe, 0x1a, 0x84, + 0x1f, 0x3d, 0xe2, 0xd4, 0x09, 0x3e, 0xe7, 0x1f, 0x1d, 0xbd, 0x20, 0x25, 0x26, + 0x3d, 0x68, 0x8f, 0x61, 0x3d, 0xe7, 0xdf, 0x1f, 0xbe, 0xad, 0x57, 0x1b, 0xbe, + 0x3e, 0xec, 0x1b, 0xbe, 0x6f, 0xe4, 0x09, 0xbe, 0x87, 0x7d, 0xb5, 0xbc, 0xce, + 0x89, 0x07, 0x3d, 0x8a, 0x34, 0xbe, 0x3b, 0x7a, 0x7d, 0x24, 0x3e, 0xde, 0xc8, + 0xfa, 0x3d, 0xa4, 0xc7, 0x9e, 0xbd, 0x5b, 0x97, 0xf0, 0xbd, 0x16, 0xf7, 0x3b, + 0xbe, 0x91, 0xad, 0x27, 0x3e, 0x06, 0x69, 0xf3, 0xbd, 0x6d, 0xb9, 0xe6, 0xbd, + 0xfc, 0xa1, 0x33, 0x3e, 0x73, 0x47, 0xd4, 0xbd, 0xd1, 0x35, 0xc0, 0x3d, 0x74, + 0x47, 0x12, 0x3d, 0x2d, 0x04, 0x23, 0x3d, 0xfc, 0xc6, 0x1b, 0x3d, 0x75, 0x18, + 0x0e, 0xbe, 0xa5, 0x96, 0x55, 0x3c, 0xb8, 0x10, 0xad, 0xbc, 0x93, 0x9b, 0xde, + 0xbd, 0x9f, 0xa2, 0xf4, 0x3d, 0xb8, 0x21, 0xf6, 0xba, 0xd7, 0x96, 0x09, 0xbd, + 0x2a, 0x6c, 0xd9, 0xbd, 0xb1, 0x32, 0x45, 0x3d, 0xc0, 0x16, 0x94, 0xbd, 0x78, + 0xac, 0x97, 0xbd, 0x97, 0xd4, 0xdf, 0xbd, 0x68, 0x97, 0x36, 0xbd, 0x28, 0xce, + 0x2f, 0x3d, 0x12, 0x02, 0x3d, 0xbd, 0x5b, 0x8f, 0x23, 0x3d, 0xf5, 0xc3, 0xda, + 0xba, 0xa6, 0x72, 0x41, 0x3e, 0x27, 0xa9, 0xcd, 0xbd, 0x9c, 0x9a, 0x3c, 0x3d, + 0xf2, 0x7f, 0x45, 0x3e, 0x1c, 0x9f, 0x40, 0x3e, 0xa9, 0xdf, 0x74, 0x3c, 0x6a, + 0x72, 0x6e, 0xbd, 0x46, 0x83, 0xa5, 0x3d, 0x3b, 0x67, 0x6c, 0x3c, 0xfc, 0x84, + 0x2a, 0x3d, 0x3c, 0xf4, 0x35, 0x3e, 0xb4, 0x2c, 0x79, 0xbd, 0x43, 0xb9, 0xd6, + 0x3d, 0xe6, 0xae, 0x13, 0xbd, 0xeb, 0x77, 0xd0, 0xbd, 0x31, 0x51, 0xbe, 0x3d, + 0x5f, 0x2e, 0x23, 0x3c, 0x7a, 0xbe, 0x15, 0x3e, 0x4b, 0x59, 0xdc, 0xbd, 0xa0, + 0x8f, 0xe7, 0xbd, 0x76, 0xa8, 0xf3, 0xbd, 0x88, 0x1c, 0x74, 0x3d, 0x85, 0x4d, + 0xdd, 0xbd, 0x45, 0x96, 0x36, 0xbd, 0xe8, 0x39, 0x98, 0x3d, 0xbe, 0x82, 0xf9, + 0x3d, 0x1d, 0xdb, 0x2d, 0x3b, 0x6f, 0xac, 0x63, 0xbd, 0x8c, 0xc8, 0xe1, 0xbd, + 0xcf, 0x49, 0x73, 0xbd, 0x8a, 0xdd, 0xe3, 0xbd, 0xf8, 0x00, 0x19, 0xbd, 0x17, + 0xe8, 0xdf, 0xbd, 0xba, 0x22, 0x5b, 0x3c, 0xf1, 0x54, 0x21, 0xbe, 0x7b, 0x38, + 0x58, 0xbd, 0x48, 0x88, 0x67, 0xbd, 0x5e, 0xe2, 0x6c, 0x3d, 0xa5, 0x44, 0x20, + 0xbe, 0x69, 0x7f, 0xbf, 0xbc, 0x7c, 0xfa, 0x25, 0x3e, 0xc1, 0xd9, 0xd5, 0xbd, + 0x46, 0x87, 0x75, 0xbd, 0x13, 0x1c, 0x01, 0xbd, 0xe5, 0xc3, 0x19, 0xbb, 0x2d, + 0xc8, 0x30, 0xbe, 0xad, 0xd8, 0xf2, 0x3d, 0xd9, 0x37, 0x14, 0xbd, 0xd2, 0xb5, + 0x9a, 0x3d, 0xf4, 0x37, 0x8d, 0x3c, 0x2f, 0x8f, 0xc0, 0x3d, 0x8e, 0xe9, 0xc5, + 0xbd, 0xf5, 0x4d, 0x21, 0xbe, 0xfd, 0x9a, 0xaa, 0xbd, 0x91, 0xb6, 0x00, 0xbe, + 0xf0, 0x0d, 0xbf, 0x3c, 0xe4, 0x94, 0xed, 0x3d, 0x64, 0xbe, 0x8d, 0x3c, 0x27, + 0xcf, 0x2f, 0x3e, 0x22, 0xa5, 0xf1, 0x3d, 0x96, 0xf2, 0xbf, 0xbd, 0x62, 0xde, + 0xe5, 0xbd, 0x4b, 0x4a, 0x89, 0x3d, 0x7a, 0x3c, 0x1d, 0x3e, 0xfc, 0x83, 0xab, + 0xbc, 0x0f, 0x00, 0x2e, 0xbe, 0xd5, 0xd1, 0x93, 0x3d, 0x32, 0x51, 0xca, 0xbd, + 0x27, 0x77, 0x31, 0xbd, 0x6e, 0xe6, 0xe2, 0x3d, 0xdd, 0xb0, 0x03, 0xbe, 0xd7, + 0xec, 0xe5, 0xbd, 0x97, 0x8e, 0x82, 0x3b, 0x7b, 0xaf, 0x03, 0xbe, 0xbe, 0x24, + 0xc3, 0x3d, 0x1e, 0x4c, 0x51, 0x3e, 0x07, 0x32, 0x10, 0x3e, 0xac, 0xdb, 0x01, + 0xbe, 0xef, 0x14, 0x38, 0x3e, 0x1b, 0xbb, 0x73, 0x3d, 0x6a, 0x42, 0x35, 0xbd, + 0x79, 0x72, 0x13, 0xbe, 0x05, 0x8c, 0xe9, 0x3d, 0xc1, 0x57, 0xe5, 0x3b, 0x50, + 0x38, 0x71, 0x3d, 0x47, 0xb5, 0xe4, 0xbd, 0x0f, 0x18, 0x01, 0xbe, 0xd6, 0x1c, + 0x76, 0x3b, 0x99, 0x36, 
0x1c, 0xbe, 0x6d, 0xee, 0x1a, 0x3d, 0x2d, 0xcb, 0x39, + 0xbd, 0xc0, 0x54, 0x24, 0x3e, 0xcb, 0x5b, 0xfb, 0x3c, 0x8d, 0xc8, 0x85, 0x3a, + 0x10, 0xcb, 0xd6, 0x3c, 0xfd, 0x81, 0xd8, 0x3c, 0xc7, 0xab, 0x1b, 0xba, 0xf5, + 0xe1, 0xb5, 0xbd, 0x7a, 0x09, 0xfc, 0x3d, 0x98, 0x7b, 0x6b, 0xbd, 0x31, 0x74, + 0x46, 0xbe, 0x13, 0x26, 0x02, 0x3e, 0x67, 0x37, 0x03, 0xbe, 0x68, 0x29, 0xc4, + 0xbd, 0x8a, 0xc5, 0x8b, 0xbd, 0x50, 0x23, 0x22, 0xbc, 0x6d, 0x99, 0xf5, 0x3d, + 0x01, 0x6c, 0xc5, 0xbd, 0xd6, 0xce, 0x14, 0xbe, 0x29, 0xd4, 0xef, 0xbd, 0x7c, + 0xe1, 0x8b, 0x3c, 0x8f, 0x04, 0xd6, 0xbc, 0x29, 0xf1, 0x60, 0x3c, 0x02, 0x1a, + 0x2c, 0x3b, 0x76, 0x21, 0x00, 0xbe, 0x16, 0x98, 0x66, 0xbd, 0x2a, 0x64, 0x3f, + 0xbd, 0xbf, 0x81, 0x24, 0x3d, 0x30, 0x34, 0x27, 0x3e, 0x90, 0xee, 0x9b, 0x3d, + 0xe1, 0x6c, 0xdd, 0x3c, 0x25, 0x40, 0x25, 0x3e, 0xc0, 0x85, 0x57, 0x3b, 0x16, + 0xa8, 0x4f, 0x3e, 0xa9, 0xfb, 0x48, 0xbd, 0x38, 0x1c, 0xf8, 0x3b, 0x7a, 0x4a, + 0xb0, 0xbd, 0x29, 0xe7, 0xf3, 0xbd, 0xa5, 0x5c, 0x42, 0x3d, 0xab, 0x54, 0x09, + 0x3e, 0x94, 0x68, 0x75, 0x3d, 0x24, 0x37, 0x03, 0xbe, 0x4e, 0xba, 0x09, 0x3e, + 0x16, 0xba, 0x09, 0x3e, 0xbd, 0x97, 0x00, 0xbe, 0x92, 0xe4, 0x95, 0xbd, 0x74, + 0xf5, 0x9f, 0xbd, 0x40, 0x16, 0x81, 0x3d, 0x83, 0x4c, 0x26, 0x3e, 0x61, 0xd1, + 0x25, 0x3e, 0xfb, 0x74, 0x1d, 0xbe, 0x9b, 0x9f, 0x0f, 0x3d, 0xe8, 0x7e, 0x10, + 0x3d, 0x9e, 0xb0, 0x15, 0x3d, 0x34, 0xe6, 0xee, 0x3d, 0xaf, 0xef, 0xf0, 0xbb, + 0xaa, 0x06, 0x24, 0xbe, 0x43, 0x5e, 0xdb, 0x3d, 0x10, 0xd8, 0xa4, 0x3d, 0x6e, + 0xc9, 0x0c, 0xbd, 0x1c, 0xfe, 0xa9, 0x3d, 0xf0, 0xf3, 0x31, 0x3d, 0x38, 0xf5, + 0x7e, 0xba, 0x24, 0x31, 0xe0, 0x3d, 0x6e, 0xf2, 0xa2, 0x3d, 0xbe, 0x8b, 0xd4, + 0xbd, 0x65, 0xc3, 0x25, 0x3c, 0xa3, 0xde, 0x67, 0xba, 0x41, 0xe9, 0x13, 0xbe, + 0x83, 0xd0, 0x02, 0xbd, 0x8b, 0x91, 0x3a, 0x3d, 0x29, 0x20, 0x4c, 0xbc, 0xfc, + 0x3f, 0xcd, 0xbd, 0x5a, 0x01, 0xae, 0xbd, 0x6c, 0x48, 0x1e, 0xbe, 0xe0, 0x29, + 0x80, 0x3d, 0x18, 0x74, 0xa0, 0xbd, 0x2a, 0xeb, 0xbd, 0x39, 0x28, 0xe6, 0x2e, + 0xbe, 0x4b, 0x70, 0x59, 0x3d, 0xd7, 0xcf, 0xd7, 0xbc, 0x34, 0x77, 0xa5, 0x3c, + 0xef, 0x6d, 0x58, 0xbb, 0x31, 0xcc, 0xde, 0xbb, 0xf6, 0xe6, 0xc2, 0xbd, 0x8b, + 0xee, 0x14, 0x3e, 0xf3, 0x70, 0x12, 0xbe, 0x88, 0x93, 0xae, 0xbd, 0x57, 0xd4, + 0xfc, 0x3d, 0x48, 0x74, 0x36, 0x3e, 0xb5, 0xcb, 0x08, 0xbe, 0x32, 0x08, 0xbe, + 0xbd, 0x95, 0xe2, 0x2e, 0xbd, 0x6c, 0xa0, 0xc3, 0x3d, 0x83, 0xdb, 0xc4, 0x3a, + 0xc8, 0x25, 0xf0, 0x3d, 0x8a, 0x78, 0x0f, 0x3e, 0xed, 0xd4, 0x02, 0xbc, 0xd4, + 0x18, 0xad, 0xbd, 0x70, 0x10, 0xbf, 0xbd, 0x9f, 0x8e, 0x1c, 0xbe, 0x41, 0xdf, + 0xf2, 0x3d, 0x20, 0x72, 0x45, 0x3d, 0x7f, 0x52, 0x16, 0xbe, 0xd7, 0xf4, 0x25, + 0xbe, 0x6d, 0x3f, 0x3d, 0x3e, 0xd4, 0xb0, 0x26, 0xbe, 0x23, 0x8c, 0x87, 0x3d, + 0x6c, 0x4e, 0xb9, 0xbc, 0x67, 0x6c, 0x44, 0x3c, 0x35, 0x7b, 0xde, 0x3d, 0x19, + 0x66, 0xd7, 0x3d, 0x1c, 0xc9, 0xc2, 0x3d, 0xf1, 0xee, 0xba, 0xbd, 0xa3, 0xe1, + 0xc8, 0x3d, 0xf5, 0xf9, 0x82, 0x3c, 0x3d, 0x0e, 0x81, 0x3d, 0xea, 0xc7, 0x5d, + 0x3d, 0x19, 0x63, 0x25, 0x3e, 0x59, 0x2f, 0x13, 0xbd, 0xf2, 0x44, 0xeb, 0x3d, + 0xf0, 0xb5, 0xf1, 0xbc, 0x85, 0x77, 0x03, 0x3d, 0xda, 0x66, 0x11, 0xbd, 0xef, + 0xae, 0x1b, 0x3d, 0xe1, 0x4f, 0x94, 0xbd, 0x25, 0x17, 0x56, 0xbd, 0x74, 0x34, + 0x0c, 0x3e, 0xf8, 0x12, 0x88, 0x3d, 0x96, 0x08, 0x97, 0xbd, 0x04, 0xb9, 0x75, + 0xbc, 0x72, 0x9f, 0x8e, 0x3d, 0x0d, 0xf3, 0x7d, 0xbd, 0x51, 0xe7, 0x56, 0xbc, + 0x93, 0x6d, 0x08, 0xbe, 0xa7, 0xd8, 0x09, 0x3e, 0x80, 0xd5, 0xa8, 0xbd, 0x40, + 0x03, 0xd1, 0x3c, 0xe2, 0x44, 0x1f, 0xbd, 0x3e, 0x1f, 0xd6, 0xbd, 0x9f, 0x62, + 0xe7, 0x3c, 0xf7, 0x6d, 0xae, 0xbd, 0xf4, 0x14, 0xf6, 
0x3a, 0x54, 0x99, 0xea, + 0x3b, 0x9c, 0xab, 0xf7, 0xbd, 0x74, 0x21, 0xdd, 0x3d, 0x87, 0x18, 0x95, 0xbd, + 0x49, 0x55, 0x0c, 0xbe, 0xd6, 0xdc, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, + 0x01, 0x00, 0x00, 0x5a, 0xd4, 0xee, 0x3d, 0x38, 0x39, 0x64, 0x3e, 0x55, 0xb4, + 0x79, 0x3d, 0x1d, 0xa3, 0xb9, 0x3d, 0xb9, 0x79, 0xe0, 0x3b, 0x30, 0xff, 0xd1, + 0x3d, 0x7a, 0x3b, 0x2d, 0xbd, 0x18, 0x51, 0x07, 0xbe, 0x5c, 0x31, 0x3d, 0x3e, + 0x46, 0x0f, 0x51, 0xbe, 0x29, 0x32, 0x13, 0x3e, 0x7c, 0x11, 0xf3, 0xbd, 0x3a, + 0xbd, 0x4a, 0xbd, 0x56, 0xb3, 0xce, 0xbd, 0x37, 0xd0, 0xf6, 0x3d, 0xd5, 0x9b, + 0xd8, 0x3d, 0xa8, 0xbc, 0x5a, 0xbe, 0x1b, 0x22, 0x0e, 0xbc, 0x03, 0x98, 0xf9, + 0x3d, 0x64, 0xf4, 0x47, 0x3e, 0xa2, 0xb5, 0x2f, 0xbe, 0x70, 0x7a, 0x89, 0xbe, + 0x9c, 0x58, 0x60, 0x3e, 0x71, 0xac, 0x25, 0xbe, 0x17, 0x1c, 0x01, 0x3e, 0x48, + 0x73, 0x93, 0xbd, 0x0d, 0x92, 0xa3, 0x3d, 0xf1, 0xff, 0x62, 0xbe, 0x56, 0xe9, + 0x71, 0xbe, 0x09, 0xf7, 0x96, 0xbe, 0x91, 0x7a, 0x0a, 0x3e, 0xc1, 0x6d, 0x88, + 0x3c, 0x6c, 0xd0, 0x4f, 0xbe, 0x71, 0x75, 0x99, 0xbd, 0x7d, 0x92, 0x01, 0xbe, + 0x35, 0x21, 0x96, 0xbe, 0xd9, 0x0e, 0x2d, 0x3e, 0x63, 0x17, 0x8b, 0x3d, 0x53, + 0x6d, 0xb7, 0x3c, 0xb9, 0x06, 0x20, 0x3d, 0xdf, 0x56, 0x11, 0x3e, 0xc4, 0xcd, + 0xa9, 0x3c, 0x7d, 0x0a, 0x3b, 0x3e, 0xd6, 0x23, 0x7f, 0xbc, 0xaf, 0x06, 0xc4, + 0xbc, 0xe0, 0xe3, 0x63, 0xbd, 0x34, 0x50, 0x2a, 0x3e, 0x1f, 0xff, 0x4c, 0x3e, + 0x34, 0x98, 0x79, 0xbe, 0x4c, 0xbd, 0x18, 0x3e, 0x5b, 0x8b, 0x0f, 0x3e, 0x33, + 0x44, 0x34, 0xbd, 0xd6, 0xd7, 0x90, 0xbe, 0x51, 0x5e, 0x55, 0x3d, 0x46, 0x2b, + 0x54, 0xbe, 0xd8, 0x49, 0x30, 0xbe, 0x45, 0xb3, 0x72, 0xbe, 0x93, 0x18, 0xcd, + 0x3d, 0x86, 0xe1, 0x73, 0xbd, 0x94, 0x56, 0xf3, 0x3d, 0x0a, 0x54, 0xd7, 0xbd, + 0x01, 0xd9, 0x98, 0x3e, 0xd5, 0x11, 0x01, 0xbb, 0x69, 0x07, 0x62, 0xbe, 0x81, + 0x33, 0x03, 0xbb, 0x98, 0xf9, 0x9f, 0x3c, 0xe8, 0x77, 0x96, 0x3e, 0x3a, 0xc2, + 0x73, 0x3e, 0xa1, 0x45, 0x35, 0xbe, 0xea, 0x1c, 0x86, 0xbc, 0xad, 0x90, 0x45, + 0xbe, 0x0b, 0xd2, 0x03, 0x3d, 0x02, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0xa1, 0xc6, 0xcd, 0xbe, 0x46, 0xa7, 0xbd, 0x3e, 0x7c, + 0xe3, 0x00, 0x3f, 0x13, 0x8d, 0xb6, 0xbe, 0x21, 0x72, 0x8b, 0x3e, 0x16, 0x68, + 0x68, 0x3e, 0x05, 0xb7, 0xb6, 0xbe, 0xa0, 0xd3, 0xd4, 0x3e, 0x98, 0x82, 0x83, + 0xbd, 0x8c, 0xb1, 0xe2, 0x3d, 0xd6, 0x94, 0x82, 0x3e, 0x07, 0x6a, 0x70, 0xbe, + 0x6b, 0x74, 0x0b, 0x3f, 0xd8, 0xf5, 0x3d, 0x3e, 0xfb, 0xf3, 0x19, 0xbd, 0x2c, + 0x72, 0xbf, 0x3e, 0xff, 0x95, 0x49, 0x3d, 0xee, 0x70, 0x78, 0x3e, 0xb0, 0x3f, + 0x58, 0x3d, 0x78, 0xea, 0x9d, 0xbe, 0x53, 0x1d, 0x15, 0x3f, 0x0d, 0xfc, 0xbe, + 0xbe, 0xad, 0x10, 0x07, 0xbf, 0xb4, 0x11, 0x87, 0xbe, 0x20, 0x92, 0x62, 0x3e, + 0x58, 0x61, 0xbd, 0x3e, 0xea, 0x54, 0x4a, 0xbd, 0xbd, 0x55, 0xce, 0xbe, 0x12, + 0x48, 0xa2, 0x3e, 0xe0, 0x74, 0x90, 0x3d, 0xce, 0x80, 0xf5, 0x3e, 0xa5, 0xb7, + 0x15, 0x3f, 0x8e, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, + 0x00, 0x2c, 0xcf, 0x79, 0xbd, 0x8c, 0x37, 0x5a, 0xbc, 0x00, 0x4c, 0x6f, 0x3c, + 0x14, 0x0b, 0x8e, 0x3d, 0xa8, 0xc3, 0x12, 0x3c, 0x10, 0x9f, 0xa5, 0xbb, 0xe8, + 0x7e, 0x17, 0xbd, 0x43, 0x60, 0x74, 0xbd, 0xc6, 0x62, 0x6f, 0x3d, 0x88, 0x83, + 0x6c, 0xbd, 0xf7, 0xf2, 0x36, 0xbd, 0xb7, 0x11, 0x81, 0xbd, 0x69, 0x1c, 0x30, + 0xbd, 0xde, 0xd0, 0x4e, 0x3c, 0xa4, 0x9f, 0x6e, 0xbc, 0x06, 0xd8, 0xd6, 0xbc, + 0x21, 0x75, 0x5f, 0xbd, 0x68, 0x6f, 0x0c, 0xbc, 0xbd, 0x21, 0xcf, 0xbb, 0x20, + 0x31, 0xb0, 0x3b, 0x88, 0xa3, 0x32, 0x3c, 0xa0, 0xec, 0x56, 0x3d, 0x19, 0xfd, + 0xf8, 0x3c, 0x99, 0xd1, 0x75, 0x3d, 0x99, 0x54, 0x3d, 0x3c, 0x4d, 0x0f, 0x12, + 0x3b, 
0x34, 0xf2, 0x37, 0xbd, 0xaa, 0x3b, 0x85, 0xbb, 0x23, 0xfe, 0xde, 0xbb, + 0x8a, 0xe4, 0x21, 0x3c, 0xbd, 0x46, 0x8d, 0x3d, 0xd8, 0xf0, 0x03, 0x3d, 0xfa, + 0xb6, 0xb6, 0x3c, 0xb8, 0x2e, 0xc9, 0xbc, 0xac, 0x52, 0x4a, 0xbd, 0xd2, 0x5d, + 0x00, 0x3c, 0x7d, 0x64, 0x6f, 0xbd, 0xe6, 0x47, 0x77, 0x3d, 0xe0, 0x29, 0xbe, + 0x3b, 0x5a, 0xb3, 0xee, 0xbc, 0x40, 0x76, 0xe3, 0xbb, 0x18, 0xf0, 0x8b, 0x3c, + 0xbc, 0x5f, 0x3a, 0x3d, 0x47, 0xdd, 0x08, 0x3d, 0x0b, 0xae, 0x39, 0xbc, 0xa1, + 0xca, 0xd9, 0xbc, 0xf8, 0x6b, 0x92, 0xbc, 0xf8, 0x2b, 0x42, 0x3d, 0xef, 0x4c, + 0x14, 0xbd, 0x64, 0xd7, 0x4b, 0xbd, 0x22, 0x18, 0x18, 0x3c, 0x20, 0xf8, 0x29, + 0xbd, 0x00, 0x5d, 0xdd, 0x3a, 0x56, 0x0c, 0x5f, 0xbd, 0x47, 0x5d, 0x84, 0xbd, + 0x5e, 0xea, 0xa1, 0x3c, 0xc4, 0x53, 0x89, 0xbd, 0x53, 0xde, 0x4d, 0xbc, 0xe7, + 0xc7, 0x88, 0xbc, 0x35, 0xef, 0x56, 0x3d, 0x45, 0x2c, 0xb4, 0x3c, 0xd8, 0x97, + 0x7b, 0xbd, 0x17, 0xec, 0x89, 0x3d, 0xe1, 0x90, 0x45, 0x3d, 0x89, 0xf2, 0x3f, + 0xbd, 0xf1, 0x11, 0xff, 0xbb, 0x1b, 0x6f, 0x03, 0xbd, 0xf7, 0xf7, 0x3d, 0x3b, + 0xc4, 0x7d, 0x91, 0x3c, 0x44, 0x07, 0x0b, 0x3d, 0x4a, 0xc0, 0x6f, 0x3d, 0x79, + 0x51, 0x8f, 0x3d, 0x66, 0x5e, 0x41, 0x3d, 0xf1, 0x9b, 0x8c, 0xbd, 0x38, 0xb9, + 0xca, 0x3c, 0xe3, 0xf8, 0xe8, 0x3c, 0xcd, 0xce, 0x8f, 0xbb, 0xe4, 0xe9, 0x6b, + 0x3c, 0x92, 0xd8, 0x39, 0x3d, 0xbe, 0x6d, 0x52, 0xbd, 0x38, 0xed, 0x4a, 0xbd, + 0x68, 0xd4, 0x28, 0xbc, 0x6f, 0x16, 0x67, 0xbd, 0xd7, 0x55, 0x8a, 0x3d, 0xe0, + 0x69, 0xb0, 0xbb, 0xfa, 0x9c, 0x93, 0xbd, 0x14, 0xe4, 0x21, 0x3d, 0x96, 0x1c, + 0x7b, 0x3d, 0x4c, 0x31, 0x34, 0x3c, 0xa8, 0x41, 0x5c, 0x3c, 0x90, 0xe5, 0x8c, + 0x3d, 0x11, 0x9f, 0x98, 0x3c, 0xf0, 0x3d, 0x16, 0x3d, 0x53, 0xd1, 0x91, 0xbd, + 0x50, 0xc5, 0xef, 0x3c, 0x25, 0x52, 0x83, 0x3c, 0x9e, 0xce, 0x1f, 0x3c, 0x91, + 0xa7, 0x0c, 0xbd, 0xb8, 0x95, 0x03, 0x3c, 0x7a, 0x4c, 0x35, 0x3d, 0x8e, 0xc4, + 0x44, 0x3d, 0x1c, 0x66, 0x2c, 0x3d, 0x00, 0x89, 0x40, 0xba, 0xe1, 0xa3, 0x83, + 0x3d, 0x68, 0xf2, 0x2b, 0xbd, 0x30, 0xd4, 0xde, 0x3b, 0xcf, 0xa1, 0xbc, 0x3c, + 0x24, 0x79, 0x39, 0xbd, 0xe5, 0xf4, 0xb7, 0xbc, 0x79, 0x8d, 0x25, 0x3c, 0x95, + 0xb6, 0x38, 0x3d, 0xd8, 0xc2, 0x74, 0x3c, 0xaa, 0x8e, 0x80, 0xbd, 0x0d, 0x74, + 0xf3, 0x3c, 0x73, 0x5b, 0x98, 0xbc, 0x00, 0x64, 0x5e, 0xbc, 0x44, 0x82, 0xcb, + 0x3c, 0x5a, 0x25, 0x53, 0xbd, 0xe2, 0xd0, 0x93, 0xbd, 0x3b, 0x7a, 0x77, 0xbd, + 0x93, 0x3e, 0xd4, 0x3c, 0x39, 0x81, 0x28, 0xbd, 0x54, 0xd5, 0xef, 0x3c, 0x6c, + 0x29, 0xe1, 0x3c, 0x69, 0xc8, 0x09, 0x3d, 0x83, 0xb3, 0x36, 0xbd, 0x90, 0xe1, + 0xd4, 0xbb, 0x95, 0xa7, 0x1a, 0xbd, 0x39, 0xf5, 0x2b, 0xbc, 0x0c, 0xdf, 0x64, + 0xbd, 0x74, 0xec, 0xdc, 0xbc, 0x20, 0xc6, 0x3b, 0x3d, 0x40, 0x20, 0x46, 0x3c, + 0x18, 0x09, 0x3f, 0xbd, 0x96, 0x4c, 0xdc, 0xbc, 0x98, 0x98, 0x8d, 0xbd, 0xb4, + 0xdd, 0x27, 0xbd, 0x74, 0x45, 0xbb, 0x3c, 0x49, 0xd9, 0x08, 0xbd, 0x8e, 0x06, + 0xa8, 0x3b, 0x91, 0x10, 0xb4, 0x3c, 0xf8, 0x58, 0xf3, 0xbc, 0x06, 0xe9, 0x5e, + 0x3d, 0x14, 0xc8, 0x26, 0x3d, 0xc5, 0xf7, 0x20, 0xbb, 0x6b, 0x78, 0xc0, 0x3c, + 0xae, 0x64, 0x7f, 0x3c, 0xbb, 0xbf, 0x8b, 0x3c, 0x82, 0x4e, 0x0c, 0xbd, 0xb0, + 0xd0, 0xdf, 0xbc, 0xfe, 0x53, 0x97, 0xbc, 0x8a, 0x9e, 0x24, 0xbd, 0xdf, 0x79, + 0x84, 0x3d, 0x7e, 0xff, 0x8e, 0xbd, 0x66, 0x7b, 0xda, 0x3c, 0xb0, 0xdd, 0x8d, + 0xbd, 0xab, 0x91, 0xbb, 0xbc, 0x23, 0x20, 0xb0, 0xbc, 0xbe, 0x43, 0x3f, 0xbd, + 0x64, 0x80, 0xda, 0x3c, 0x32, 0x00, 0xde, 0x3c, 0xb2, 0x8a, 0x86, 0x3c, 0x68, + 0x45, 0x05, 0x3d, 0x8b, 0x7c, 0xd8, 0x3b, 0x68, 0x97, 0xe7, 0x3c, 0x82, 0x8d, + 0x6b, 0x3d, 0xa6, 0x53, 0x2d, 0x3d, 0xc0, 0x43, 0x23, 0x3c, 0xaa, 0xe6, 0x2d, + 0xbd, 0x34, 0x06, 0x57, 0xbc, 0xfc, 
0x9f, 0x0c, 0xbd, 0x42, 0x77, 0xc6, 0x3c, + 0x51, 0x7a, 0x70, 0x3c, 0xe5, 0xe4, 0x7c, 0x3d, 0x86, 0x00, 0x67, 0xbd, 0x95, + 0xb8, 0x37, 0xbd, 0xdd, 0x7a, 0x8d, 0x3d, 0x97, 0x08, 0xa9, 0x3c, 0xfd, 0xb6, + 0x09, 0x3d, 0xdc, 0xb7, 0x81, 0x3d, 0xe0, 0x6c, 0x68, 0xbc, 0x79, 0x9b, 0x03, + 0xbd, 0xb8, 0xc7, 0x78, 0xbb, 0x94, 0x60, 0x0f, 0x3d, 0x3b, 0x0e, 0x80, 0x3d, + 0x11, 0xe6, 0x80, 0x3d, 0xb3, 0xab, 0x86, 0x3d, 0xed, 0xe6, 0x9d, 0xbc, 0xd8, + 0xeb, 0xd9, 0xbc, 0xaa, 0x62, 0x80, 0x3d, 0x12, 0xc5, 0x00, 0x3d, 0x2b, 0x4b, + 0x23, 0xbc, 0xc7, 0x31, 0xff, 0xbc, 0xe4, 0x95, 0xdb, 0x3b, 0xa7, 0x90, 0x66, + 0x3c, 0xd3, 0x65, 0xdb, 0xbc, 0x50, 0xe3, 0x47, 0x3d, 0xd4, 0x25, 0x84, 0xbd, + 0x5a, 0xd5, 0xae, 0xbc, 0x90, 0x5e, 0xba, 0x3c, 0x8c, 0x60, 0x90, 0xbd, 0xfc, + 0x57, 0x4c, 0x3d, 0x99, 0x08, 0x7d, 0xbd, 0x9f, 0xac, 0x3b, 0x3c, 0x1c, 0xb1, + 0x61, 0xbc, 0x6a, 0xb5, 0x33, 0xbc, 0x10, 0xb0, 0x28, 0x3c, 0x89, 0x5d, 0x9f, + 0x3c, 0xd2, 0x80, 0x84, 0xbc, 0xb4, 0xb1, 0xd5, 0xba, 0x41, 0x1e, 0xa0, 0x3c, + 0xd1, 0xd9, 0xd0, 0xbb, 0x04, 0xda, 0xd2, 0x3c, 0x58, 0x46, 0x90, 0xbc, 0xc1, + 0x5c, 0x19, 0xbc, 0x01, 0x66, 0x2c, 0xbd, 0xad, 0xdc, 0x88, 0xbd, 0x32, 0xab, + 0xb6, 0xbc, 0x14, 0x1f, 0x0b, 0x3d, 0x87, 0xf0, 0x69, 0x3d, 0x55, 0x30, 0x26, + 0xbd, 0x2e, 0x3a, 0x05, 0xbd, 0xda, 0x08, 0x0e, 0xbd, 0xef, 0x31, 0x57, 0xbd, + 0x0e, 0x44, 0x13, 0xbd, 0x53, 0x11, 0x29, 0xbd, 0x00, 0xd2, 0xea, 0x3a, 0x47, + 0x72, 0xae, 0xbc, 0x54, 0x4a, 0x4d, 0xbd, 0x8a, 0x13, 0x2b, 0xbd, 0xa3, 0xaf, + 0x92, 0x3d, 0x68, 0x15, 0x0d, 0x3c, 0x18, 0x17, 0x35, 0x3c, 0xb8, 0xf2, 0x6a, + 0x3c, 0x15, 0xf8, 0xb2, 0x3c, 0x1d, 0x9d, 0xcd, 0x3c, 0xd3, 0x90, 0x81, 0xbd, + 0x51, 0xe8, 0x21, 0x3d, 0x74, 0x43, 0xa9, 0x3c, 0x00, 0x0b, 0xa0, 0x3c, 0x8e, + 0x69, 0xfb, 0xba, 0x81, 0x27, 0xfa, 0x3c, 0x6b, 0x7c, 0xf5, 0xbc, 0x61, 0x68, + 0x84, 0x3d, 0xe4, 0x1a, 0x6b, 0xbd, 0xd0, 0xe9, 0xc8, 0x3c, 0x26, 0xff, 0x47, + 0xbd, 0x64, 0xb7, 0xe9, 0x3b, 0xf3, 0xad, 0x36, 0x3d, 0x8a, 0x00, 0x3f, 0xbd, + 0x94, 0x41, 0xcf, 0xbc, 0x01, 0xba, 0x55, 0x3d, 0x8c, 0x08, 0x36, 0xbd, 0xa4, + 0x6b, 0x1a, 0x3d, 0x59, 0xfd, 0x83, 0x3d, 0xcc, 0xdd, 0x60, 0xbd, 0x59, 0xc2, + 0xfe, 0xbc, 0xa6, 0x99, 0x2a, 0x3d, 0xbd, 0x45, 0x8b, 0x3d, 0xe2, 0x5e, 0x8c, + 0x3d, 0x18, 0x83, 0x87, 0xbc, 0x10, 0x63, 0xda, 0x3b, 0x58, 0xa1, 0xc2, 0x3c, + 0x78, 0xfa, 0x78, 0x3c, 0xfc, 0x33, 0xf0, 0x3c, 0xc4, 0xab, 0x5b, 0xbd, 0xde, + 0x4b, 0x07, 0x3d, 0x53, 0x76, 0x1b, 0xbd, 0xee, 0xd8, 0x86, 0x3d, 0x7f, 0xd6, + 0x7c, 0xbd, 0x68, 0xb5, 0x8e, 0x3c, 0x49, 0xdd, 0xd5, 0xbc, 0x83, 0x63, 0xed, + 0xbb, 0x4e, 0x00, 0x91, 0xbd, 0x69, 0xce, 0xd5, 0xbb, 0x2f, 0x57, 0x71, 0xbc, + 0x9a, 0xc3, 0x8f, 0xbd, 0x65, 0x27, 0x47, 0x3d, 0x2d, 0x6b, 0x77, 0xbd, 0xdd, + 0x54, 0x43, 0xbc, 0xf7, 0x1f, 0xe8, 0xbc, 0x12, 0x8f, 0x87, 0xbd, 0x4f, 0xcf, + 0x2f, 0x3d, 0x15, 0x51, 0x4b, 0xbd, 0x9d, 0x1f, 0x86, 0x3d, 0x68, 0x35, 0x58, + 0xbd, 0x16, 0xe4, 0x4e, 0xbd, 0xd0, 0x03, 0x91, 0xbd, 0x39, 0xc6, 0x90, 0x3c, + 0xdd, 0xbb, 0x0a, 0xbd, 0x58, 0x1b, 0x33, 0xbd, 0x55, 0x86, 0x91, 0xbd, 0x48, + 0xe7, 0x90, 0xbc, 0xf4, 0x14, 0x3f, 0xbc, 0xc0, 0x75, 0x9e, 0xba, 0x7e, 0x8f, + 0xa8, 0xbc, 0x8c, 0x2b, 0x55, 0x3d, 0x54, 0x4b, 0x70, 0xbd, 0x56, 0x74, 0x52, + 0x3d, 0x6d, 0xf4, 0x02, 0x3b, 0x7d, 0x46, 0x5c, 0x3b, 0x76, 0xf4, 0x0c, 0xbd, + 0xac, 0xa2, 0x1d, 0xbd, 0x5c, 0x63, 0xe2, 0xbc, 0x64, 0x4d, 0x31, 0x3c, 0xf9, + 0x3e, 0x3f, 0x3d, 0xed, 0x12, 0x2c, 0xbd, 0xc8, 0x12, 0xb0, 0xbc, 0x4d, 0x90, + 0x8f, 0x3d, 0x1d, 0xef, 0x89, 0x3d, 0xf0, 0x4f, 0x93, 0xbd, 0x88, 0x79, 0xd8, + 0x3c, 0x74, 0x42, 0x1f, 0xbd, 0xba, 0x43, 0x90, 0x3c, 0xd5, 0x7e, 
0xe3, 0xbc, + 0x71, 0x49, 0x7b, 0xbd, 0x5d, 0x36, 0x16, 0x3d, 0x91, 0xb8, 0x22, 0xbd, 0xd4, + 0x0e, 0x1e, 0x3d, 0xaa, 0x17, 0x2d, 0x3c, 0xca, 0x4d, 0xb9, 0x3b, 0x8a, 0x9d, + 0x01, 0x3d, 0x60, 0xcf, 0xc3, 0xbb, 0xc4, 0xc0, 0x00, 0x3b, 0x6d, 0xeb, 0x09, + 0xbd, 0x88, 0x55, 0x9e, 0xbc, 0x04, 0x54, 0xc3, 0xbc, 0x00, 0x93, 0xf2, 0x3a, + 0xe2, 0x88, 0x6e, 0x3d, 0xa0, 0xdb, 0xd4, 0xbc, 0x12, 0x3b, 0xa4, 0x3b, 0x5d, + 0x20, 0x88, 0x3d, 0xb4, 0xe5, 0xdc, 0xbc, 0x93, 0xf0, 0x70, 0xbc, 0xf6, 0x1a, + 0x31, 0xbd, 0xe0, 0xc3, 0x75, 0x3c, 0xbc, 0x2b, 0x96, 0x3c, 0x5b, 0x81, 0x44, + 0xbd, 0x6e, 0x2f, 0xab, 0xbc, 0x4c, 0x4e, 0x82, 0x3d, 0x6c, 0x17, 0x9b, 0xbc, + 0x70, 0x5a, 0x16, 0xbc, 0x70, 0x5e, 0x10, 0x3c, 0x81, 0xf0, 0x7d, 0xbd, 0x55, + 0xca, 0x3d, 0x3d, 0xca, 0x75, 0xa2, 0xbc, 0x7f, 0xc2, 0xe2, 0xbb, 0xc4, 0x59, + 0x82, 0x3d, 0xbd, 0xde, 0xd0, 0xbc, 0xe6, 0x4c, 0x3a, 0x3d, 0x62, 0xc7, 0x62, + 0x3d, 0x3e, 0xd2, 0xc1, 0xba, 0xeb, 0xae, 0xb3, 0xbb, 0x39, 0xf0, 0xa2, 0x3c, + 0xd0, 0xa2, 0x18, 0xbd, 0x65, 0xea, 0x99, 0x3b, 0xd0, 0x01, 0x8d, 0xbc, 0x34, + 0x0c, 0x84, 0xbd, 0xc3, 0x10, 0x3f, 0xbd, 0xb0, 0x26, 0xc4, 0x3b, 0xde, 0xc4, + 0x2e, 0x3d, 0xb4, 0x3f, 0xe5, 0x3c, 0x80, 0x6d, 0xda, 0x3b, 0xd3, 0x01, 0x8f, + 0x3d, 0x7b, 0x2e, 0x70, 0x3b, 0x95, 0x55, 0x51, 0xbd, 0xc2, 0x13, 0x4a, 0x3d, + 0x70, 0xd8, 0x4a, 0x3d, 0x6d, 0xf3, 0xc7, 0xbb, 0x40, 0x46, 0xe8, 0x3c, 0x71, + 0x53, 0x85, 0x3a, 0xea, 0x87, 0xf9, 0x3c, 0xb0, 0xb0, 0xf5, 0x3c, 0xf2, 0x2a, + 0x58, 0x3d, 0xe8, 0xd7, 0xc4, 0x3c, 0x57, 0xd9, 0xc8, 0x3c, 0xf3, 0x05, 0x79, + 0xbd, 0x9c, 0x0e, 0xf5, 0xbb, 0xcd, 0xaa, 0x1b, 0xbc, 0x42, 0xa2, 0x22, 0x3d, + 0x3e, 0x81, 0xe3, 0x3c, 0x66, 0x13, 0x2a, 0xbd, 0x6d, 0xfd, 0x8f, 0x3d, 0xd3, + 0x64, 0xab, 0x3c, 0x1e, 0x94, 0xba, 0x3c, 0x68, 0x42, 0x45, 0xbd, 0x4c, 0x0e, + 0xaf, 0xbc, 0x90, 0xbf, 0x7e, 0x3d, 0x6f, 0x71, 0x91, 0x3d, 0xc3, 0xb6, 0x80, + 0x3d, 0x3a, 0xbd, 0x32, 0xbd, 0x08, 0x63, 0x11, 0xbc, 0xec, 0xf4, 0x08, 0x3d, + 0x60, 0x5c, 0xcc, 0x3b, 0x66, 0x5b, 0x59, 0xbd, 0xb9, 0xcb, 0x8d, 0xbd, 0xfd, + 0x30, 0x54, 0x3d, 0x2e, 0xaa, 0x0f, 0xbc, 0x80, 0x26, 0x1a, 0xbb, 0x47, 0x43, + 0x19, 0xbd, 0x2c, 0x5d, 0xb8, 0x3c, 0x6c, 0xa6, 0xe8, 0x3c, 0xec, 0x3c, 0xcb, + 0xbc, 0x61, 0x53, 0xa4, 0x3c, 0x68, 0xf1, 0x0a, 0x3c, 0x9c, 0x5f, 0x30, 0x3d, + 0x5b, 0x39, 0xb8, 0xbc, 0xd2, 0x8d, 0x99, 0xbc, 0xe7, 0x1e, 0x31, 0xbd, 0x61, + 0x4e, 0x2c, 0xbd, 0x11, 0xeb, 0xb3, 0xbc, 0x80, 0x2e, 0x0b, 0xbc, 0x57, 0xbf, + 0x75, 0x3c, 0xbb, 0xd3, 0x2b, 0x3d, 0xba, 0xc5, 0x1b, 0x3d, 0x43, 0x78, 0x80, + 0x3d, 0xeb, 0x30, 0x0a, 0x3c, 0xf7, 0xf8, 0x04, 0x3d, 0x1f, 0x88, 0x17, 0xbd, + 0x7c, 0x55, 0xf0, 0xbc, 0x4a, 0x93, 0x3c, 0x3d, 0x7a, 0x12, 0x5c, 0xbd, 0x54, + 0x6b, 0x42, 0xbd, 0xa0, 0x16, 0xd8, 0x3b, 0x20, 0x3e, 0x3b, 0x3b, 0x3c, 0xde, + 0x72, 0xbd, 0x68, 0x37, 0x68, 0xbd, 0x37, 0x55, 0x97, 0xbb, 0x19, 0x7b, 0x43, + 0xbd, 0x82, 0xce, 0x8a, 0xbd, 0xcf, 0xc2, 0x88, 0xbd, 0x30, 0xde, 0xd8, 0x3b, + 0xf1, 0xc1, 0xa9, 0x3c, 0x68, 0x51, 0x2d, 0x3d, 0x76, 0xd5, 0xac, 0x3c, 0xb8, + 0x4b, 0x78, 0xbb, 0x0f, 0x1c, 0x5d, 0xbd, 0xf7, 0x31, 0x25, 0xbd, 0x72, 0x4c, + 0x91, 0x3d, 0x6e, 0x4f, 0x51, 0x3d, 0xb4, 0x9b, 0x21, 0xbd, 0x03, 0x73, 0xdd, + 0xbc, 0x38, 0x49, 0x4f, 0x3c, 0xb8, 0xc7, 0x4f, 0x3d, 0x6a, 0x17, 0x0a, 0xba, + 0xf4, 0x4f, 0xcd, 0x3c, 0x93, 0x14, 0x86, 0xbd, 0xde, 0x1e, 0x31, 0x3c, 0x57, + 0x45, 0xf1, 0x3c, 0x53, 0xc3, 0x7c, 0x3d, 0xc8, 0x1a, 0xd8, 0x3c, 0x85, 0xf4, + 0x8d, 0x3d, 0xf2, 0xaa, 0x46, 0x3d, 0xa6, 0x5c, 0x73, 0x3d, 0xf8, 0x5a, 0x3c, + 0x3d, 0xd0, 0x85, 0xaf, 0x3c, 0x60, 0x1f, 0xa0, 0x3c, 0xef, 0xcb, 0x45, 0xbd, + 0x68, 0xc2, 0x24, 
0x3d, 0x25, 0x65, 0x14, 0x3b, 0x0c, 0x01, 0x67, 0x3d, 0x43, + 0x57, 0x65, 0xbd, 0x50, 0x8f, 0xec, 0x3b, 0x88, 0xf5, 0x16, 0x3d, 0xde, 0xa3, + 0xe2, 0xbc, 0x92, 0x11, 0xfb, 0x3c, 0x35, 0x93, 0x26, 0x3d, 0x96, 0xe4, 0x70, + 0x3d, 0x30, 0xea, 0x40, 0x3c, 0x50, 0x65, 0x37, 0x3c, 0x56, 0xf8, 0x84, 0xbd, + 0x36, 0xc0, 0x8e, 0x3d, 0x58, 0x45, 0x6b, 0xbd, 0x46, 0xcc, 0x5e, 0xbc, 0x41, + 0x2a, 0x4f, 0xbd, 0x5f, 0xce, 0x80, 0xbb, 0xfb, 0x75, 0xae, 0xbc, 0x19, 0xe3, + 0x0b, 0xbd, 0x54, 0x3e, 0x8a, 0x3c, 0x41, 0x54, 0xb7, 0x39, 0x8f, 0xb4, 0x80, + 0x3d, 0xfb, 0x42, 0x00, 0x3d, 0x5e, 0x0b, 0x19, 0xbd, 0x5d, 0x03, 0xb5, 0x3c, + 0xd8, 0x30, 0x78, 0x3c, 0x3e, 0xef, 0x90, 0xbc, 0xe0, 0x2c, 0xdb, 0x3b, 0x0a, + 0x5a, 0xfc, 0xbc, 0x24, 0x7e, 0x90, 0xbd, 0x1a, 0xd4, 0x1b, 0x3d, 0x10, 0x0a, + 0x87, 0x3d, 0xa3, 0x8c, 0x3b, 0xbd, 0x3f, 0x54, 0xda, 0xbc, 0x0f, 0x59, 0xd8, + 0x3b, 0xbe, 0xea, 0xea, 0x3c, 0x39, 0x2d, 0x7e, 0xbd, 0x19, 0xa0, 0x73, 0xba, + 0x3c, 0xc5, 0x60, 0xbd, 0x57, 0x9e, 0x70, 0xbd, 0xdc, 0x65, 0xfb, 0x3b, 0xbc, + 0x13, 0x32, 0xbd, 0xa4, 0xd0, 0x81, 0xbd, 0x5f, 0x74, 0x85, 0x3d, 0x1a, 0xf5, + 0x58, 0x3d, 0xa3, 0x35, 0x7c, 0x3d, 0xb3, 0x3d, 0x87, 0x3c, 0x83, 0xc6, 0x6b, + 0x3d, 0xff, 0xe3, 0x8e, 0x3d, 0x97, 0xab, 0x01, 0xbd, 0x7c, 0xd4, 0x85, 0x3d, + 0xa0, 0xbd, 0x83, 0xbc, 0x04, 0x12, 0x41, 0x3d, 0x9e, 0x3d, 0x57, 0xbd, 0xa2, + 0x37, 0xc1, 0x3c, 0xf2, 0xa6, 0x81, 0xbd, 0xe0, 0xde, 0xe6, 0xbc, 0xa0, 0x4b, + 0xd4, 0xbb, 0xe8, 0x33, 0xd8, 0xbc, 0x9a, 0x4c, 0x55, 0x3d, 0x16, 0xc0, 0x91, + 0xbd, 0x28, 0xa0, 0x1e, 0x3c, 0xfc, 0xc7, 0x5f, 0xbc, 0xc1, 0x5e, 0x95, 0x3c, + 0xc4, 0x85, 0xa0, 0x3c, 0xf5, 0x01, 0xd7, 0xbc, 0xf3, 0x15, 0xcc, 0xbb, 0x52, + 0x0c, 0x2c, 0xbd, 0xea, 0xdf, 0x7b, 0x3d, 0x06, 0xe0, 0x26, 0xbc, 0x7a, 0x9a, + 0x8d, 0xbd, 0x9c, 0xdb, 0xac, 0x3c, 0x4b, 0xfa, 0x2f, 0x3d, 0xe4, 0x93, 0xf1, + 0x3c, 0x89, 0xe5, 0x91, 0xbd, 0xda, 0x41, 0x28, 0xbd, 0x52, 0x6f, 0x58, 0x3d, + 0x89, 0x2f, 0x43, 0xbd, 0x74, 0xe4, 0x00, 0xbd, 0x59, 0xd4, 0x26, 0xbd, 0x97, + 0x79, 0xa9, 0x3c, 0xb0, 0x62, 0x9f, 0xb9, 0xbc, 0xac, 0x04, 0x3d, 0x5c, 0xce, + 0x3d, 0xbd, 0x15, 0x58, 0x67, 0xbd, 0x0a, 0xce, 0xf4, 0xbc, 0x3a, 0x8f, 0x01, + 0xbd, 0x50, 0xd2, 0x73, 0xbc, 0x8e, 0x54, 0x16, 0xbc, 0xea, 0xd7, 0x3c, 0x3d, + 0xf0, 0xbe, 0xd7, 0x3c, 0x1a, 0x3d, 0x82, 0xbd, 0xba, 0x91, 0x2f, 0x3d, 0x10, + 0xb0, 0x92, 0xbd, 0xf8, 0x36, 0x1c, 0x3d, 0x50, 0x2a, 0x8f, 0xbd, 0xb0, 0x09, + 0x5e, 0x3d, 0x3b, 0xc8, 0x8f, 0xba, 0xf4, 0xce, 0x92, 0xbd, 0x38, 0xc4, 0x78, + 0xbd, 0xe0, 0x8c, 0x5c, 0xbc, 0x98, 0x6b, 0x8b, 0x3d, 0x16, 0x7f, 0x4a, 0x3d, + 0x18, 0xc0, 0xfe, 0xbc, 0x66, 0xbb, 0x4b, 0xbd, 0x90, 0xb6, 0xe1, 0x3b, 0x98, + 0xca, 0x8c, 0x3c, 0x05, 0xfe, 0xec, 0xbc, 0x58, 0x1c, 0x17, 0x3d, 0x37, 0x17, + 0x80, 0x3d, 0x41, 0x6e, 0x14, 0x3d, 0xee, 0x95, 0xcb, 0xbb, 0x1a, 0x56, 0x1f, + 0xbd, 0xae, 0xc7, 0x2c, 0x3c, 0x28, 0x3a, 0x80, 0x3b, 0x00, 0x13, 0x76, 0xbc, + 0x69, 0xaf, 0x5e, 0xbc, 0x80, 0xcc, 0x02, 0xbd, 0xa8, 0xea, 0x04, 0xba, 0xb8, + 0xae, 0x09, 0x3d, 0xb3, 0x0d, 0x8d, 0x3d, 0xc0, 0x22, 0x84, 0xba, 0x04, 0x62, + 0x5c, 0xbd, 0xd8, 0x28, 0x09, 0x3c, 0x68, 0xd3, 0x41, 0x3c, 0x62, 0x52, 0x1e, + 0x3d, 0x99, 0x42, 0x03, 0xbd, 0x3b, 0x4b, 0xd9, 0xba, 0x68, 0x5e, 0x32, 0xbd, + 0x8b, 0x9e, 0x26, 0xbb, 0x9c, 0xd7, 0xcd, 0x3c, 0x4e, 0xdc, 0x16, 0x3d, 0x42, + 0x1a, 0x07, 0x3d, 0xbb, 0xa6, 0x96, 0xbb, 0xf4, 0x47, 0x59, 0xbc, 0x13, 0xa3, + 0xa1, 0xbc, 0x8f, 0x58, 0x0f, 0xbc, 0x88, 0xd1, 0x1d, 0xbd, 0xe0, 0x0f, 0xfb, + 0x3c, 0x81, 0xd3, 0x90, 0x3d, 0xe0, 0x4b, 0x4f, 0xbd, 0x3f, 0x4a, 0x80, 0x3d, + 0x3a, 0x63, 0x67, 0x3d, 0xe2, 0xee, 0x1e, 0x3c, 
0xf8, 0x65, 0xdd, 0x3b, 0x1c, + 0x30, 0x09, 0xbd, 0xe9, 0x2f, 0xdb, 0xbc, 0x94, 0x36, 0x55, 0xbd, 0x2c, 0xa4, + 0x95, 0x3a, 0x78, 0x24, 0x2f, 0x3d, 0xc7, 0x9c, 0x44, 0xbd, 0xb5, 0x09, 0x10, + 0xbd, 0x7d, 0x10, 0x49, 0xbd, 0x60, 0xd3, 0x43, 0x3c, 0xef, 0x67, 0x05, 0xbd, + 0x0a, 0x1d, 0x6c, 0x3d, 0xaa, 0x4d, 0x0c, 0x3d, 0x84, 0xfc, 0x8a, 0xbc, 0x0d, + 0xf7, 0x65, 0xbd, 0x5c, 0x71, 0x93, 0xbc, 0xd8, 0xe9, 0x2a, 0x3d, 0x1d, 0xd9, + 0xc6, 0xbc, 0xd6, 0xeb, 0x70, 0xbd, 0xef, 0x92, 0x41, 0xbd, 0x4a, 0xd3, 0x83, + 0xbd, 0x1e, 0xf1, 0x74, 0x3b, 0xa3, 0xb4, 0x1e, 0xbc, 0x4f, 0x0c, 0x12, 0x3d, + 0x69, 0xf6, 0x25, 0x3d, 0x5a, 0x52, 0x35, 0x3d, 0xb5, 0x14, 0x37, 0x3d, 0x2b, + 0xf9, 0x2d, 0xbd, 0xb8, 0xc6, 0x12, 0x3d, 0x2e, 0xeb, 0xf8, 0xbb, 0x31, 0xe0, + 0x43, 0xbd, 0x37, 0x68, 0xf4, 0x3b, 0x4e, 0xd7, 0x55, 0xbd, 0xf2, 0x8f, 0x06, + 0x3d, 0xa3, 0xe0, 0x8a, 0x3d, 0x47, 0xcb, 0x91, 0x3d, 0xc3, 0xaa, 0x1c, 0xbd, + 0x43, 0x44, 0x24, 0x3d, 0x5a, 0xcc, 0x30, 0xbd, 0x72, 0xbe, 0x27, 0x3c, 0xfc, + 0xd5, 0xbe, 0x3c, 0x34, 0x0e, 0x3f, 0x3d, 0xdc, 0x3d, 0x7b, 0xbc, 0x64, 0xe1, + 0xa9, 0x3c, 0x00, 0x61, 0x80, 0x3b, 0x19, 0xd4, 0x82, 0xbd, 0x41, 0xef, 0x8c, + 0x3d, 0x90, 0x50, 0x11, 0xbd, 0x0d, 0x32, 0x8d, 0x3d, 0x56, 0x78, 0x5f, 0x3c, + 0x71, 0x44, 0x6c, 0x3d, 0x21, 0xe4, 0x22, 0x3d, 0x31, 0xfd, 0xb4, 0xbb, 0xcc, + 0x10, 0x7e, 0x3c, 0x7a, 0xb4, 0x06, 0x3d, 0xc5, 0xde, 0x22, 0xbc, 0xd2, 0x57, + 0xfe, 0x3c, 0x30, 0x95, 0x81, 0xbd, 0x00, 0x6d, 0xde, 0x39, 0xfd, 0x2b, 0x3f, + 0x3d, 0x8f, 0xe7, 0xf4, 0x3b, 0x2b, 0xf8, 0xa3, 0xbc, 0xcf, 0x7c, 0x4e, 0x3d, + 0x86, 0xee, 0xf7, 0x3c, 0x20, 0x5a, 0x22, 0xbb, 0x1a, 0xa9, 0x62, 0xbd, 0x0f, + 0x24, 0x7f, 0x3d, 0x74, 0x7e, 0x00, 0x3d, 0x24, 0xd2, 0xcb, 0xbc, 0x06, 0xc6, + 0x44, 0xbd, 0xe1, 0x53, 0xa3, 0x3c, 0x7d, 0x24, 0x08, 0x3d, 0xf6, 0x9f, 0x23, + 0xbd, 0x3f, 0xb0, 0x84, 0xbd, 0xb0, 0xbb, 0xbc, 0x3c, 0x74, 0x6c, 0x22, 0xbc, + 0x0b, 0x32, 0x50, 0xbd, 0x81, 0x6f, 0x8b, 0x3d, 0x98, 0x37, 0xc3, 0x3c, 0xfd, + 0x30, 0x08, 0xbd, 0x11, 0x42, 0x01, 0xbd, 0xd6, 0x91, 0x16, 0x3c, 0x6e, 0xf1, + 0xc2, 0x3a, 0xed, 0x4b, 0x8c, 0xbd, 0x51, 0x70, 0x34, 0xbd, 0x2a, 0x7e, 0x1c, + 0x3b, 0x5a, 0x96, 0xcd, 0x37, 0x9a, 0x8e, 0xf8, 0x3c, 0xce, 0x8a, 0x6d, 0x3d, + 0x62, 0xb2, 0x38, 0x3d, 0x70, 0x0a, 0xbe, 0xbc, 0xd0, 0x3f, 0x66, 0xbc, 0xf4, + 0xfe, 0x24, 0x3d, 0xbe, 0xf9, 0x89, 0x3c, 0xa0, 0x2b, 0xc1, 0xbc, 0x02, 0x6d, + 0x41, 0x3c, 0xa4, 0x00, 0x14, 0xbd, 0xbc, 0xa1, 0xd1, 0x3b, 0xbc, 0x27, 0xa6, + 0x3c, 0xc8, 0x08, 0xfd, 0xbc, 0xa1, 0x0e, 0x9c, 0xbc, 0xa1, 0x28, 0x07, 0xbc, + 0x33, 0xf3, 0x71, 0x3c, 0x96, 0xed, 0x1f, 0x3d, 0xf6, 0x6d, 0x5e, 0xbd, 0x30, + 0x7c, 0x12, 0xbc, 0xf2, 0xaf, 0x7b, 0x3d, 0x56, 0xfa, 0x36, 0xbd, 0x7a, 0x6f, + 0x3a, 0x3d, 0x40, 0x65, 0x8f, 0x3c, 0x2c, 0xa1, 0x4f, 0xbc, 0x80, 0x0f, 0x7b, + 0x3b, 0xaf, 0xc3, 0xf2, 0x3c, 0xae, 0x39, 0x8a, 0xbd, 0xd5, 0xf6, 0x42, 0xbd, + 0x12, 0x9c, 0x33, 0x3d, 0x88, 0x27, 0x4d, 0x3d, 0x61, 0x05, 0x1e, 0xbd, 0x02, + 0xcd, 0x04, 0xbd, 0xe8, 0x6f, 0xe1, 0x3c, 0xf8, 0xd2, 0x73, 0x3d, 0xb9, 0xa3, + 0x61, 0xbd, 0x64, 0x01, 0x92, 0x3c, 0x4f, 0x8e, 0x21, 0xbc, 0x8b, 0xf5, 0x18, + 0x3d, 0xce, 0x3b, 0x77, 0x3d, 0x8d, 0x0e, 0x97, 0x3a, 0x30, 0xfc, 0x85, 0x3c, + 0x1f, 0x24, 0x8e, 0x3a, 0xca, 0xdd, 0x4e, 0x3d, 0x5f, 0x7c, 0xfe, 0x3b, 0x84, + 0xdf, 0x2d, 0x3d, 0x7a, 0x5c, 0x8c, 0x3d, 0x90, 0xf3, 0x79, 0xbc, 0x4f, 0x99, + 0x17, 0xbd, 0x30, 0xb1, 0xd2, 0xbb, 0x1c, 0x5a, 0x32, 0xbd, 0xd4, 0x8c, 0xd9, + 0x3c, 0x08, 0x56, 0xec, 0x3c, 0xf0, 0xcf, 0x64, 0xbd, 0xf0, 0x2a, 0xf1, 0xbb, + 0x28, 0x09, 0x0c, 0xbc, 0x0f, 0xf7, 0x8d, 0xbd, 0x86, 0x8f, 0x59, 0xbd, 0xfa, + 
0xbf, 0x52, 0xbd, 0x76, 0x65, 0x4c, 0xbd, 0x79, 0xaa, 0x16, 0xbd, 0x9e, 0x6f, + 0xa7, 0xbc, 0xac, 0x9e, 0x8f, 0xbd, 0x5a, 0xfc, 0x7b, 0xbd, 0x90, 0xe3, 0x20, + 0x3d, 0xd0, 0x2b, 0x81, 0x3d, 0xc1, 0xbf, 0x85, 0x3d, 0x48, 0x79, 0x44, 0x3d, + 0x3e, 0x7b, 0x6d, 0x3d, 0x2b, 0x83, 0x11, 0x3d, 0x45, 0x84, 0x38, 0x3d, 0xbd, + 0x6d, 0x47, 0xb8, 0xe9, 0x7c, 0x29, 0xbd, 0x51, 0xd2, 0xc9, 0x3c, 0x77, 0x53, + 0xf0, 0x3b, 0xca, 0xc2, 0x17, 0xbd, 0xb2, 0xbc, 0x13, 0x3d, 0xbc, 0x58, 0xf9, + 0x3c, 0xed, 0x65, 0xed, 0x3c, 0x05, 0xdd, 0x8e, 0xbc, 0x0f, 0xa5, 0x96, 0xbc, + 0xd2, 0x96, 0x00, 0x3d, 0x90, 0xfe, 0x5c, 0x3d, 0x1f, 0x18, 0x90, 0xbd, 0x68, + 0xbb, 0xc8, 0x3c, 0x86, 0xae, 0xbb, 0xbc, 0x8a, 0x69, 0xea, 0xbc, 0x28, 0x6a, + 0x7c, 0x3c, 0x32, 0x5f, 0x70, 0x3d, 0xdd, 0x12, 0xd4, 0xba, 0xca, 0x54, 0x56, + 0xbd, 0x46, 0x94, 0x3f, 0xbd, 0x28, 0x3e, 0xa6, 0x3c, 0x93, 0x06, 0x43, 0xbd, + 0x58, 0xc7, 0xf0, 0x3c, 0x5d, 0x14, 0xa9, 0xbb, 0x58, 0x98, 0xc8, 0xbc, 0x89, + 0x34, 0x8d, 0x3d, 0x39, 0x90, 0x7b, 0x3d, 0x66, 0x18, 0x63, 0x3d, 0x60, 0x47, + 0x4d, 0x3b, 0x1d, 0x50, 0x6c, 0xbd, 0x55, 0x74, 0x27, 0x3d, 0x11, 0xf1, 0x66, + 0xbd, 0x14, 0xe6, 0x90, 0x3d, 0xdf, 0x99, 0x88, 0x3d, 0x9b, 0xc6, 0x67, 0x3d, + 0x16, 0xca, 0xd3, 0xbc, 0x79, 0xad, 0x87, 0x3d, 0x52, 0x56, 0x7b, 0x3d, 0x6e, + 0x19, 0x14, 0xbc, 0x12, 0x02, 0x26, 0x3d, 0xaf, 0x26, 0x1b, 0xbd, 0x5e, 0x09, + 0x8c, 0xbd, 0xa2, 0x3c, 0x5f, 0x3d, 0x60, 0x7e, 0x7d, 0xbd, 0x10, 0xc0, 0x85, + 0xbd, 0x70, 0x15, 0xc4, 0x3b, 0xe0, 0xfa, 0xf8, 0x3b, 0xe6, 0x2e, 0x00, 0x3d, + 0xf7, 0xd5, 0x1f, 0x3d, 0x48, 0x70, 0x60, 0x3d, 0x2a, 0x3a, 0xed, 0xbc, 0xfd, + 0x05, 0x26, 0xbc, 0x67, 0xf0, 0xee, 0x3a, 0x7e, 0x6e, 0x46, 0x3d, 0x57, 0x87, + 0x90, 0x3d, 0x22, 0xdb, 0x65, 0xbd, 0x70, 0xad, 0x7a, 0x3c, 0xa6, 0xb5, 0xc3, + 0x3c, 0xd4, 0xfa, 0x12, 0x3c, 0x4e, 0x84, 0x2f, 0xbd, 0x00, 0x37, 0x63, 0xbb, + 0xfb, 0x25, 0x41, 0xbc, 0x38, 0xa5, 0x84, 0x3d, 0x8a, 0xd7, 0x5a, 0xbd, 0x11, + 0xf7, 0xd6, 0xbb, 0xd1, 0x99, 0x22, 0xbd, 0xc8, 0xfc, 0x83, 0x3c, 0xd8, 0x91, + 0xd8, 0xbc, 0xa6, 0xf0, 0x3f, 0xbd, 0x08, 0x4d, 0x3b, 0x3d, 0xdd, 0x56, 0x4c, + 0xbd, 0xeb, 0x23, 0x8d, 0xbd, 0x23, 0x09, 0xcc, 0x3c, 0xbb, 0x3d, 0x8a, 0x3d, + 0x47, 0xb9, 0x75, 0xbd, 0x69, 0x75, 0x82, 0x3d, 0x30, 0x78, 0x86, 0x3c, 0x0c, + 0xc2, 0xd6, 0xbc, 0x2a, 0x22, 0x51, 0x3d, 0x9c, 0xfa, 0x3b, 0xbc, 0x00, 0x4b, + 0xbf, 0x39, 0x10, 0x58, 0xe6, 0xbb, 0x22, 0xa4, 0x47, 0x3d, 0x8b, 0xd1, 0x6f, + 0x3c, 0xf3, 0x8b, 0x23, 0xbd, 0xad, 0x67, 0x71, 0xbd, 0xa4, 0xbb, 0x71, 0xbc, + 0x68, 0x9d, 0x36, 0x3d, 0x79, 0xda, 0x00, 0x3d, 0x30, 0x88, 0x15, 0x3d, 0xc4, + 0x55, 0xab, 0x3c, 0xd0, 0xbe, 0x4f, 0x3d, 0x43, 0xa2, 0x8b, 0x3d, 0xc0, 0x0b, + 0x27, 0xbc, 0xfe, 0x35, 0x91, 0xbd, 0x27, 0x33, 0x5b, 0xbc, 0xc5, 0x00, 0x91, + 0xb9, 0x3e, 0x30, 0x74, 0xbd, 0x1c, 0x92, 0x70, 0xbd, 0xfe, 0x13, 0x56, 0xbb, + 0x63, 0x1b, 0x84, 0x3d, 0x24, 0x9a, 0xa1, 0x3c, 0x93, 0x78, 0x83, 0xbc, 0x29, + 0xb2, 0xce, 0x3c, 0x05, 0x6f, 0x8f, 0x3d, 0xe8, 0xb4, 0x3b, 0xbd, 0x12, 0x90, + 0x8e, 0x3d, 0x58, 0x6a, 0x76, 0xbd, 0xee, 0x8f, 0x90, 0xbd, 0x1e, 0x98, 0xde, + 0xbc, 0x88, 0x22, 0x40, 0x3d, 0x1b, 0x7f, 0x87, 0xbd, 0x3e, 0x25, 0x5e, 0x3d, + 0x38, 0xf3, 0x0c, 0xbc, 0x77, 0x6a, 0x8b, 0xbd, 0x0c, 0x98, 0x08, 0xbc, 0xbd, + 0x52, 0xf6, 0x3c, 0x2d, 0x2f, 0x03, 0xbd, 0x15, 0xbf, 0x91, 0x3d, 0xba, 0x41, + 0xef, 0xbc, 0xdf, 0x02, 0xab, 0xbc, 0xe4, 0xac, 0x7e, 0x3d, 0x9e, 0x8c, 0x51, + 0x3d, 0xcc, 0x12, 0x01, 0x3d, 0xfc, 0xfb, 0x1b, 0xbd, 0x75, 0x2b, 0x81, 0xbd, + 0x6a, 0xbf, 0x20, 0x3d, 0xbb, 0x3c, 0x77, 0xbd, 0xae, 0x2f, 0x74, 0xbd, 0x58, + 0x94, 0x53, 0xbd, 0xa0, 0xcf, 
0xd4, 0x3c, 0x68, 0x51, 0xd1, 0x3c, 0x1c, 0x40, + 0x22, 0xbd, 0x86, 0x62, 0x04, 0x3d, 0x9c, 0x10, 0x02, 0xbd, 0x5d, 0x31, 0x49, + 0xbb, 0x5d, 0x8e, 0xf5, 0xbc, 0xb8, 0xef, 0x44, 0xbc, 0x06, 0xe5, 0x50, 0xbd, + 0xe6, 0x33, 0x40, 0xbd, 0x20, 0x2e, 0x39, 0x3b, 0x00, 0x2f, 0x96, 0xbb, 0x75, + 0x2e, 0x80, 0xbd, 0x2c, 0x9f, 0x4e, 0x3d, 0xd0, 0x40, 0xf6, 0x3b, 0x2e, 0x56, + 0x8e, 0x3d, 0xcf, 0x00, 0x15, 0x3d, 0xae, 0x5d, 0xc7, 0x3b, 0x44, 0x47, 0x05, + 0x3d, 0x80, 0x19, 0x71, 0xbb, 0x8c, 0xce, 0x87, 0xbd, 0xd2, 0x30, 0x78, 0xbd, + 0xcc, 0x7b, 0x14, 0xbd, 0xf4, 0xb8, 0x91, 0xbd, 0xbe, 0x76, 0x64, 0x3d, 0xf9, + 0x7e, 0x80, 0x3d, 0xda, 0xf8, 0x13, 0xbd, 0x92, 0xd0, 0x11, 0xbd, 0x03, 0x64, + 0x55, 0xbc, 0x50, 0x1a, 0xe8, 0xbc, 0x97, 0xeb, 0x5e, 0xbd, 0x7c, 0xf8, 0x90, + 0x3d, 0xc4, 0x26, 0x4b, 0x3d, 0xc2, 0x04, 0x7d, 0xbd, 0x25, 0x41, 0x14, 0x3b, + 0xac, 0xc2, 0xdf, 0x3c, 0xda, 0x60, 0xd3, 0xbc, 0x1b, 0x00, 0x45, 0xbd, 0x7e, + 0x09, 0xac, 0xbc, 0x28, 0x65, 0xcb, 0xbc, 0xe6, 0xd0, 0xb2, 0xbc, 0xb8, 0xdf, + 0xae, 0x3c, 0xc8, 0xb7, 0xca, 0x3c, 0x98, 0x50, 0xa1, 0x3c, 0x5c, 0xa2, 0xa0, + 0xbc, 0x8c, 0x18, 0x56, 0x3d, 0xea, 0x98, 0x8e, 0xbd, 0xb5, 0xba, 0x49, 0x3b, + 0xff, 0x2b, 0xaf, 0x3c, 0x91, 0xf6, 0x49, 0xbd, 0x0a, 0x19, 0x4d, 0x3d, 0xa1, + 0x7e, 0x69, 0xbd, 0x6c, 0x77, 0x3e, 0xbc, 0xa0, 0x00, 0x6e, 0x3d, 0x81, 0xc6, + 0xb1, 0x3b, 0x8b, 0xbf, 0x40, 0xbd, 0x5e, 0x71, 0xf5, 0xbc, 0x74, 0x2c, 0x96, + 0xbc, 0x3d, 0x0c, 0x8b, 0xbd, 0x45, 0x9a, 0x8a, 0xbd, 0xdb, 0x49, 0xcb, 0x3c, + 0x9b, 0x5b, 0x10, 0x3d, 0xf5, 0x79, 0x45, 0x3d, 0x5a, 0x50, 0x86, 0xbd, 0xf9, + 0x2f, 0x7c, 0xbd, 0xf6, 0x3d, 0x19, 0xbd, 0x54, 0x10, 0x0c, 0x3b, 0xaf, 0x59, + 0x27, 0xbd, 0x1f, 0x75, 0x78, 0x3d, 0x10, 0xb2, 0x9a, 0xbc, 0xc3, 0xb1, 0x99, + 0xbc, 0xb4, 0x08, 0xac, 0x3c, 0x15, 0x41, 0x86, 0x3d, 0xc0, 0x2d, 0x46, 0xbb, + 0xc4, 0x49, 0x56, 0xbc, 0xef, 0x2e, 0x7b, 0xbd, 0x6c, 0xee, 0x14, 0x3d, 0x70, + 0xe7, 0x9c, 0x3c, 0x78, 0x7e, 0xfb, 0xbc, 0xf7, 0x06, 0x51, 0xbd, 0x52, 0xd4, + 0x1a, 0xbd, 0xb0, 0x2b, 0xeb, 0xbc, 0xad, 0xad, 0x4e, 0xbd, 0xa4, 0x7c, 0xe3, + 0x3c, 0x18, 0xa1, 0xd8, 0xbc, 0x6e, 0xa6, 0x8f, 0xbd, 0x79, 0x0d, 0xb7, 0xba, + 0xb2, 0x10, 0x10, 0x3d, 0xe6, 0xcf, 0x52, 0x3d, 0x8e, 0x88, 0x35, 0x3d, 0xdd, + 0x92, 0x8d, 0x3d, 0x54, 0x69, 0x83, 0xbc, 0xab, 0xa9, 0x88, 0xbd, 0xe0, 0xa7, + 0x1c, 0xbb, 0x86, 0x10, 0x2c, 0xbd, 0x24, 0xde, 0x18, 0x3d, 0x4a, 0x04, 0x87, + 0xbd, 0x42, 0x3c, 0x16, 0xbd, 0x62, 0x25, 0x90, 0xbd, 0xce, 0x01, 0x64, 0xbd, + 0x2c, 0x76, 0x6f, 0xbd, 0xd2, 0x15, 0x0b, 0xbd, 0x45, 0x72, 0x73, 0x3b, 0xeb, + 0x46, 0x02, 0xbd, 0x05, 0x12, 0x1c, 0xbd, 0xb8, 0x16, 0x22, 0xbd, 0xe5, 0x22, + 0x89, 0x3d, 0x8c, 0x8a, 0xf4, 0x3c, 0x40, 0x6b, 0xe4, 0x3a, 0x5c, 0xe2, 0x70, + 0xbd, 0x56, 0x08, 0x67, 0xbd, 0x5b, 0xec, 0x4d, 0x3d, 0xba, 0x4d, 0x2a, 0xbd, + 0xb9, 0x55, 0xa4, 0xbc, 0xb7, 0xd7, 0x39, 0x3d, 0xa0, 0x88, 0xfe, 0x3c, 0xbf, + 0x7d, 0x6b, 0xbd, 0xcd, 0xdf, 0xe3, 0xbc, 0x26, 0xa0, 0x3e, 0x3d, 0x19, 0x4b, + 0x17, 0x3d, 0x54, 0x84, 0xa7, 0xbc, 0x78, 0x9a, 0x6a, 0xbd, 0x80, 0xcc, 0xa7, + 0x3c, 0x58, 0x48, 0x3a, 0x3d, 0xd9, 0x9a, 0xe3, 0xbc, 0xe0, 0xa2, 0xb8, 0x3c, + 0x3f, 0x32, 0x4d, 0x3d, 0x8e, 0xa6, 0x80, 0xbc, 0x0f, 0xfc, 0xd6, 0xbb, 0x40, + 0x70, 0x8b, 0xbd, 0xe3, 0xa3, 0xf6, 0xbb, 0x40, 0x26, 0x33, 0xbb, 0x43, 0xb2, + 0x01, 0xbd, 0x2e, 0xf9, 0x27, 0xbd, 0x6c, 0xcf, 0x54, 0x3c, 0xae, 0xca, 0x4d, + 0x3c, 0x6e, 0x2d, 0x1d, 0x3a, 0x04, 0xda, 0x94, 0xbc, 0x2c, 0x2b, 0xc6, 0x3c, + 0x59, 0xc8, 0x1a, 0xbd, 0x80, 0x56, 0xcb, 0x3b, 0xf4, 0xce, 0xa1, 0x3c, 0x84, + 0xdd, 0xeb, 0x3c, 0x95, 0x36, 0x83, 0xbd, 0x60, 0xeb, 0x47, 
0x3d, 0x90, 0xf8, + 0x63, 0x3d, 0x8a, 0xc4, 0x6a, 0xbc, 0x40, 0x25, 0xa9, 0x3b, 0x7a, 0xfc, 0x65, + 0x3d, 0xe2, 0xcd, 0x33, 0x3d, 0x69, 0x80, 0xe5, 0xbc, 0xf7, 0xc5, 0x42, 0xbc, + 0x17, 0xf4, 0x31, 0xbd, 0xbe, 0xb3, 0x79, 0x3d, 0xff, 0xfc, 0x6c, 0x3d, 0xc5, + 0x04, 0x7d, 0xbc, 0xd9, 0x4f, 0x8e, 0x3d, 0xfe, 0xd3, 0x86, 0xbd, 0xcd, 0xeb, + 0x3f, 0x3d, 0xd8, 0x90, 0x2e, 0xbd, 0x56, 0x17, 0xbf, 0x3c, 0xbb, 0x23, 0x83, + 0xbd, 0x69, 0x4a, 0x43, 0x3d, 0x0a, 0x76, 0x5e, 0xbd, 0xee, 0x69, 0x8d, 0x3d, + 0x75, 0xda, 0x1c, 0x3c, 0xe8, 0xf7, 0xe0, 0xbc, 0x53, 0xbe, 0xda, 0xb8, 0xc2, + 0x03, 0x2e, 0xbd, 0xe4, 0xa0, 0x38, 0xbc, 0xbc, 0x5e, 0x3b, 0xbd, 0xfc, 0xfc, + 0xb7, 0x3c, 0xd4, 0xfb, 0x13, 0xbd, 0xf6, 0x8c, 0x44, 0x3d, 0x70, 0x13, 0x9d, + 0x3c, 0xf8, 0xb8, 0x11, 0xbc, 0xcc, 0x9b, 0x3b, 0xbd, 0xf7, 0x18, 0xe4, 0xbc, + 0x89, 0xc3, 0x31, 0x3d, 0xde, 0x7c, 0x32, 0xbd, 0x3c, 0xc7, 0x97, 0x3c, 0x2e, + 0xc0, 0xb8, 0xbc, 0xa2, 0xfe, 0x29, 0xbd, 0x17, 0xb2, 0x35, 0xbd, 0xaa, 0x83, + 0xdd, 0x3c, 0x1e, 0xfa, 0x83, 0x3d, 0xc6, 0x4c, 0x16, 0x3d, 0xfd, 0x0f, 0x29, + 0x3d, 0x2d, 0x90, 0xac, 0x3b, 0xfe, 0xe5, 0xc8, 0x3b, 0xac, 0x11, 0xc7, 0xbc, + 0x2d, 0xf3, 0xfa, 0x3c, 0x2a, 0x75, 0x81, 0xbd, 0x2d, 0x84, 0xb4, 0x3c, 0xfd, + 0xad, 0x66, 0xbc, 0xaa, 0x80, 0x2a, 0xbd, 0x58, 0x82, 0x8c, 0x3d, 0x75, 0x06, + 0x78, 0x3d, 0x1b, 0xdd, 0x21, 0xbc, 0x1c, 0x40, 0x38, 0x3d, 0xe0, 0xdc, 0x6e, + 0x3d, 0x50, 0xb8, 0x32, 0xbc, 0x80, 0x13, 0x4f, 0xbb, 0x32, 0x50, 0x6c, 0x3d, + 0xce, 0x1b, 0xf1, 0xbc, 0xd8, 0x20, 0x02, 0x3d, 0x43, 0x68, 0xa2, 0x3c, 0x9a, + 0x6c, 0x29, 0xbd, 0x8d, 0x90, 0x22, 0xbd, 0x14, 0xff, 0xe6, 0xbb, 0xb8, 0xcf, + 0xc1, 0x3c, 0xa6, 0x3b, 0x4a, 0x3d, 0xac, 0xad, 0x11, 0x3d, 0x60, 0x19, 0xc9, + 0x3c, 0x55, 0xae, 0xf1, 0xbc, 0x3d, 0xc0, 0x23, 0xbd, 0xa3, 0x00, 0xcd, 0xbb, + 0x44, 0x9e, 0x17, 0x3d, 0xc0, 0x31, 0xe2, 0x3a, 0x30, 0xdf, 0xf4, 0x3c, 0x31, + 0x09, 0x92, 0xbc, 0xa8, 0xbd, 0x66, 0x3c, 0xa5, 0x06, 0x4f, 0x3c, 0xdc, 0x2e, + 0x92, 0xbd, 0xfb, 0x54, 0x87, 0xb9, 0x9b, 0x34, 0x1f, 0x3d, 0xd8, 0xf7, 0xa7, + 0xbb, 0xff, 0x1d, 0x62, 0xbd, 0xe0, 0xf8, 0x3c, 0x3d, 0x85, 0x58, 0x8f, 0xbd, + 0x75, 0xf9, 0x62, 0xbd, 0xef, 0xf5, 0x7a, 0xbd, 0x58, 0x32, 0x86, 0x3d, 0x90, + 0x17, 0x29, 0x3c, 0x64, 0xcc, 0x4a, 0xbd, 0xf0, 0x07, 0xc1, 0xbc, 0x72, 0xdc, + 0x64, 0xbd, 0x68, 0x3e, 0x2e, 0x3c, 0x38, 0x6d, 0x60, 0xbd, 0x46, 0x1f, 0x59, + 0x3d, 0xd0, 0xa7, 0x3e, 0x3d, 0x77, 0x1d, 0x49, 0x3d, 0xcb, 0xed, 0x7f, 0xbd, + 0xd8, 0x47, 0x40, 0x3c, 0x00, 0xf0, 0xee, 0x39, 0xcc, 0xea, 0x57, 0x3d, 0x10, + 0x1d, 0x8a, 0xbd, 0xb9, 0x55, 0x5f, 0xbd, 0x17, 0x3c, 0x66, 0xbc, 0x02, 0xb8, + 0x06, 0xbd, 0x5f, 0xfb, 0x16, 0xbd, 0x58, 0x15, 0x8c, 0x3d, 0x18, 0x99, 0x5f, + 0x3d, 0x5f, 0x73, 0xb3, 0xbc, 0x61, 0x73, 0x63, 0x3d, 0x61, 0xf2, 0x7b, 0xbc, + 0xbd, 0x2b, 0xad, 0x3a, 0xda, 0x99, 0x5c, 0xbd, 0x81, 0xd1, 0xd0, 0x3c, 0xf0, + 0xf9, 0xb0, 0x3c, 0x84, 0x54, 0x68, 0x3c, 0x24, 0x10, 0x84, 0x3d, 0x4d, 0xec, + 0xa2, 0x3b, 0xd3, 0xab, 0x1e, 0xbd, 0xbd, 0x4d, 0x84, 0x3d, 0xd0, 0xd9, 0xb6, + 0x3c, 0x84, 0xdc, 0x71, 0xbd, 0x84, 0x4a, 0x03, 0x3d, 0x54, 0xb8, 0xc6, 0x3c, + 0x0a, 0x84, 0x0e, 0x3d, 0xdc, 0xfe, 0x64, 0xbd, 0xa6, 0xc2, 0x19, 0x3d, 0xd1, + 0x79, 0x4c, 0x3c, 0x7c, 0x16, 0xbd, 0x3c, 0xc1, 0x7d, 0x3c, 0xbc, 0xb2, 0xe7, + 0x94, 0xbc, 0xf0, 0x46, 0x69, 0xbc, 0x2d, 0x5f, 0x68, 0x3c, 0xbc, 0x78, 0x44, + 0xbd, 0xcf, 0x27, 0x97, 0xbd, 0x03, 0xfb, 0x4b, 0xbd, 0x0c, 0xc4, 0xcd, 0xbc, + 0xd7, 0xc5, 0x11, 0xbd, 0x6b, 0xe3, 0xf5, 0xbb, 0xda, 0x4d, 0x75, 0x3d, 0xb0, + 0xf1, 0x39, 0xbd, 0x02, 0x4e, 0x00, 0xbd, 0xcf, 0x22, 0x81, 0x3d, 0x48, 0x54, + 0x10, 0xbd, 
0x93, 0x8c, 0x42, 0x3a, 0x62, 0x1e, 0x18, 0x3d, 0xb5, 0x1d, 0x8d, + 0x3d, 0xbe, 0x37, 0x54, 0xbc, 0x9e, 0xa3, 0x92, 0xbc, 0x6a, 0x91, 0x7b, 0x3d, + 0xc5, 0x13, 0x8c, 0xbb, 0x30, 0x93, 0x55, 0xbd, 0x01, 0x29, 0x2b, 0xbd, 0xd4, + 0x57, 0x3a, 0xbd, 0xaf, 0xbc, 0xed, 0x3c, 0x65, 0xfe, 0x66, 0xbd, 0x2c, 0x98, + 0x11, 0x3d, 0x6e, 0xcf, 0x7c, 0xbd, 0xbe, 0xb4, 0x49, 0x3d, 0x17, 0x7c, 0x4f, + 0xbc, 0x13, 0xfc, 0x28, 0x3d, 0x28, 0xca, 0x2b, 0xbd, 0xdf, 0x3e, 0xa3, 0x3b, + 0x7e, 0xf4, 0x99, 0xbd, 0x9d, 0x89, 0x35, 0xbc, 0x70, 0x4c, 0x8a, 0xbd, 0xf9, + 0x58, 0x3a, 0xbd, 0x6f, 0xa9, 0x4f, 0x3d, 0x30, 0xce, 0x59, 0xbc, 0x52, 0xd4, + 0x41, 0xbd, 0x0d, 0x88, 0x2d, 0xbd, 0x94, 0xe1, 0x30, 0x3d, 0x7a, 0x53, 0xcd, + 0xbb, 0x2d, 0xcc, 0x75, 0x3c, 0x18, 0x30, 0x24, 0x3d, 0xfb, 0xa8, 0x07, 0x3d, + 0xa8, 0x1f, 0x19, 0xbc, 0xdf, 0x0a, 0x1c, 0x3d, 0x76, 0x06, 0x31, 0x3d, 0x6c, + 0x40, 0x82, 0x3c, 0x72, 0xb0, 0x82, 0xbd, 0x10, 0xae, 0x67, 0x3d, 0x00, 0x02, + 0xb5, 0x3a, 0x0a, 0xcd, 0x29, 0x3d, 0x7a, 0xf4, 0x27, 0x3c, 0x9d, 0xe2, 0x75, + 0xbd, 0x1e, 0xcd, 0x09, 0x3c, 0xa7, 0x3e, 0x25, 0xbd, 0x90, 0xb7, 0x8b, 0xbd, + 0xac, 0x2e, 0x6c, 0x3c, 0x22, 0x59, 0x79, 0x3d, 0xaf, 0x3b, 0x02, 0xba, 0x40, + 0xb8, 0x2c, 0x3d, 0xe8, 0x48, 0x6e, 0x3d, 0x13, 0xdb, 0x2f, 0x3b, 0x89, 0x0e, + 0x82, 0x3c, 0xdf, 0xe9, 0xc4, 0xbc, 0xc9, 0x26, 0x19, 0xbc, 0x67, 0x6b, 0x50, + 0x3d, 0xc0, 0x4c, 0x10, 0xbd, 0x30, 0xa9, 0x40, 0x3c, 0x12, 0x2f, 0xb1, 0x3c, + 0x3e, 0x0e, 0x00, 0xbd, 0xe9, 0x1b, 0x6f, 0xbd, 0xe4, 0x4b, 0x81, 0xbd, 0x93, + 0xc1, 0x7f, 0x3d, 0xb7, 0x8d, 0x04, 0xbd, 0x68, 0x33, 0x29, 0xbc, 0xa4, 0x5e, + 0x60, 0x3d, 0x23, 0xc0, 0x0a, 0xbd, 0xf0, 0x22, 0x80, 0xbd, 0x79, 0xea, 0x47, + 0x3d, 0x10, 0x77, 0x87, 0x3d, 0xc1, 0xfb, 0x19, 0xbd, 0x9c, 0xf7, 0x7c, 0x3d, + 0x27, 0x74, 0xb9, 0xbc, 0xc6, 0xea, 0x25, 0x3d, 0x54, 0xbc, 0xa4, 0x3c, 0x88, + 0x18, 0x36, 0x3d, 0x74, 0xd5, 0xd3, 0x3c, 0x68, 0x6e, 0x24, 0x3d, 0x36, 0xb4, + 0x49, 0x3d, 0x3e, 0x98, 0x2c, 0xbd, 0x99, 0x3e, 0x47, 0xbd, 0x21, 0xac, 0x15, + 0x3d, 0xef, 0x4f, 0x26, 0xbd, 0xb4, 0x49, 0x3f, 0xbd, 0xf5, 0xbc, 0x0a, 0xbd, + 0x04, 0x05, 0x6f, 0x3d, 0xf1, 0x5f, 0x15, 0x3d, 0xca, 0x51, 0x3f, 0x3d, 0xc2, + 0x88, 0x3a, 0xbd, 0x40, 0xeb, 0xbf, 0x3c, 0x4c, 0x13, 0xb6, 0x3c, 0xe6, 0x26, + 0xfe, 0x3c, 0xda, 0xab, 0x95, 0xbd, 0xd8, 0xcf, 0x81, 0x3d, 0xa2, 0x19, 0x53, + 0xbd, 0x5d, 0x5e, 0x0d, 0xbd, 0xfe, 0x6b, 0x36, 0x3d, 0xfb, 0x27, 0x4c, 0xbd, + 0x36, 0x92, 0x43, 0xbd, 0x94, 0xee, 0x45, 0xbc, 0x8a, 0x6d, 0xe4, 0x3c, 0xa8, + 0xb1, 0x52, 0xbc, 0x1f, 0x82, 0x88, 0xbb, 0x73, 0x6b, 0x53, 0xbd, 0x56, 0xc3, + 0x6f, 0x3d, 0x78, 0x17, 0x4a, 0x3d, 0xf2, 0x2e, 0x77, 0xbd, 0x2e, 0xae, 0x2a, + 0x3d, 0xa0, 0xd4, 0xa8, 0x3c, 0xe0, 0xb4, 0xd8, 0x3c, 0x24, 0x6d, 0x6a, 0xbd, + 0x16, 0xd2, 0x58, 0xbd, 0x56, 0xf5, 0x5d, 0x3b, 0xae, 0xdb, 0x76, 0xbd, 0x16, + 0x9a, 0x9a, 0xbd, 0x7c, 0x79, 0x51, 0x3d, 0x72, 0x5b, 0xa7, 0xbc, 0xce, 0xbf, + 0x62, 0x3d, 0xab, 0xd8, 0x23, 0x3d, 0x7e, 0xfd, 0x23, 0x3d, 0x0c, 0x3d, 0x6b, + 0x3d, 0x6c, 0x2f, 0x87, 0x3c, 0x1e, 0x26, 0x00, 0xbc, 0xc3, 0x94, 0x6f, 0xbd, + 0xb3, 0x7d, 0x24, 0xbd, 0x2a, 0xfb, 0x71, 0x3d, 0xee, 0x5a, 0xeb, 0xbc, 0x6c, + 0x3e, 0x60, 0xbd, 0x6c, 0x46, 0xf5, 0x3c, 0x83, 0xe3, 0x17, 0x3b, 0xe6, 0x15, + 0x32, 0xbd, 0x45, 0xba, 0x05, 0xbd, 0x18, 0x9a, 0x72, 0x3d, 0x45, 0x9c, 0x83, + 0xbd, 0x08, 0x2b, 0x5e, 0x3d, 0x75, 0xea, 0xe8, 0xbc, 0x81, 0xb6, 0x84, 0x3b, + 0x4b, 0xf4, 0x16, 0xbd, 0x90, 0xf4, 0x16, 0x3d, 0x2b, 0x95, 0x53, 0xbc, 0x53, + 0x27, 0x4b, 0xbd, 0x00, 0x6c, 0xe7, 0x3b, 0x62, 0xbd, 0x83, 0xbd, 0xd8, 0x6f, + 0x87, 0x3c, 0x3c, 0x17, 0x65, 0x3c, 0x3b, 
0x64, 0x7e, 0x3d, 0xbd, 0x05, 0x09, + 0xbd, 0x7f, 0x37, 0x88, 0xbd, 0x63, 0x0e, 0x98, 0xbd, 0x03, 0x67, 0x71, 0x3c, + 0x02, 0x06, 0xe5, 0x39, 0xe4, 0x9f, 0xe7, 0x3b, 0x93, 0x66, 0x93, 0xbd, 0xc6, + 0xcd, 0x7c, 0xbd, 0xde, 0xaf, 0x20, 0x3d, 0xd2, 0x18, 0x54, 0x3c, 0xac, 0xeb, + 0x62, 0xbd, 0x93, 0xf7, 0xa2, 0x3c, 0x4c, 0x4b, 0x00, 0x3d, 0x38, 0x67, 0x3d, + 0xbd, 0x81, 0xcb, 0xa2, 0x3c, 0x9b, 0xd5, 0x90, 0x3c, 0x35, 0x26, 0x0f, 0x3c, + 0xcb, 0x77, 0x45, 0xbd, 0x38, 0xe0, 0x48, 0xbd, 0x96, 0x9e, 0x1d, 0x3b, 0x7c, + 0x3f, 0xaf, 0xbc, 0xef, 0x49, 0xac, 0xbc, 0x07, 0x74, 0xcc, 0x3c, 0xc0, 0x22, + 0x42, 0xbb, 0x5b, 0x72, 0x62, 0x3d, 0xd0, 0x55, 0x95, 0xbd, 0xf7, 0x7d, 0x82, + 0x3d, 0x90, 0x79, 0xd9, 0x3b, 0xd0, 0xa1, 0x96, 0x3c, 0xbf, 0x32, 0x8a, 0x3d, + 0xbd, 0xf0, 0x57, 0x3d, 0x5f, 0xf9, 0x3b, 0x3c, 0x4f, 0xea, 0x86, 0x3d, 0xbb, + 0x72, 0xaa, 0x3c, 0x42, 0x3b, 0x4c, 0x3d, 0x86, 0x1d, 0x86, 0x3c, 0x90, 0xc6, + 0x2a, 0xbd, 0x4f, 0x86, 0x76, 0x3d, 0x92, 0x79, 0x3d, 0x3d, 0x0d, 0x95, 0x92, + 0x3d, 0xbf, 0x77, 0x4e, 0x3d, 0x8b, 0x45, 0x03, 0xbd, 0x95, 0x0c, 0xff, 0xbc, + 0x62, 0x35, 0x11, 0xbb, 0xbd, 0x74, 0x28, 0x3d, 0xaf, 0x87, 0x7f, 0xbd, 0x8e, + 0xb8, 0x06, 0xbd, 0x0f, 0xbd, 0x3e, 0x3d, 0xe6, 0xd4, 0x41, 0xbd, 0x80, 0x81, + 0xac, 0x3c, 0x7a, 0xec, 0x82, 0xbc, 0x01, 0xac, 0x93, 0xbd, 0xe8, 0xba, 0xb3, + 0xbb, 0xcf, 0x47, 0x8f, 0xbb, 0x11, 0x6f, 0x57, 0x3d, 0x74, 0xf5, 0x9d, 0x3c, + 0x67, 0x6e, 0x01, 0xbd, 0xa6, 0x8c, 0x8f, 0xbd, 0xe4, 0x48, 0x30, 0xbd, 0x80, + 0xa7, 0x88, 0xbb, 0x48, 0x69, 0xea, 0x3c, 0x20, 0x78, 0x14, 0x3b, 0x18, 0xc4, + 0xca, 0xbc, 0xd6, 0x83, 0xcb, 0x3c, 0x88, 0x63, 0xd1, 0x3c, 0x02, 0x3a, 0x1b, + 0xbc, 0x02, 0x15, 0x13, 0x3c, 0xbe, 0x71, 0xf0, 0xbb, 0xe1, 0x3c, 0x12, 0xbd, + 0xa6, 0x23, 0x33, 0x3c, 0xc8, 0x04, 0xee, 0x3c, 0x78, 0x7e, 0x4d, 0x3c, 0x7f, + 0xd1, 0x95, 0xbc, 0xa3, 0x48, 0x22, 0x3c, 0x6d, 0x33, 0x77, 0xbd, 0xfc, 0x4f, + 0xc7, 0xbc, 0x8c, 0x5c, 0x8c, 0xbd, 0x98, 0x32, 0x02, 0xbd, 0x5f, 0x37, 0x00, + 0x3d, 0x41, 0xea, 0x7f, 0x3d, 0x4b, 0x38, 0x77, 0xbc, 0x47, 0x90, 0x92, 0xbd, + 0x56, 0x10, 0x1f, 0xbd, 0x10, 0x70, 0x8e, 0xbb, 0x0a, 0x99, 0x7a, 0x3c, 0x46, + 0x4c, 0x7d, 0x3d, 0xc0, 0x71, 0x6d, 0x3d, 0xd8, 0x3f, 0x28, 0x3d, 0x84, 0xe3, + 0x2b, 0x3d, 0x31, 0xdc, 0x55, 0xbd, 0x6e, 0x0a, 0x34, 0x3d, 0x10, 0xff, 0x85, + 0x3c, 0x72, 0x7b, 0x1d, 0xbd, 0x7f, 0xf5, 0xb4, 0xbb, 0xfb, 0xef, 0x87, 0x3d, + 0xb5, 0x8a, 0x4f, 0x3c, 0x20, 0xd7, 0x40, 0xbd, 0x17, 0x2c, 0x38, 0xbd, 0xcb, + 0xd4, 0x6d, 0x3d, 0x3c, 0x24, 0x7a, 0xbd, 0xb3, 0x3d, 0x92, 0xbd, 0x18, 0xbe, + 0x99, 0xba, 0x29, 0xe3, 0x42, 0xbc, 0xf7, 0x2c, 0x8f, 0xbd, 0x34, 0xd9, 0xc7, + 0x3c, 0xac, 0x8c, 0x99, 0xbd, 0x40, 0xe4, 0xa5, 0x3c, 0x8d, 0xcf, 0x3d, 0x3d, + 0x81, 0xe9, 0x3e, 0x3d, 0x7a, 0xbb, 0x3f, 0x3d, 0xc7, 0x9b, 0x25, 0xbc, 0x84, + 0x26, 0xc3, 0xbb, 0x52, 0x3f, 0x7a, 0x3d, 0x7b, 0xdb, 0x69, 0xbd, 0x99, 0x0e, + 0x71, 0xbd, 0x4c, 0xb5, 0xa5, 0x3b, 0xcf, 0x2f, 0xfd, 0xbb, 0x6b, 0x5b, 0x0c, + 0x3b, 0x9e, 0xeb, 0x04, 0xbc, 0x00, 0x9d, 0xdc, 0xbb, 0x10, 0xc2, 0xc0, 0x3c, + 0x08, 0xa2, 0x31, 0xbd, 0xc0, 0x3c, 0xf9, 0x3a, 0xad, 0xd5, 0x55, 0xbd, 0x11, + 0xea, 0xf3, 0x3c, 0x80, 0x63, 0xfa, 0x3a, 0x30, 0x82, 0x48, 0x3b, 0x58, 0x5f, + 0x2c, 0xbd, 0xd4, 0x00, 0x83, 0xbd, 0x12, 0x38, 0x8a, 0xbd, 0xd2, 0xdf, 0x1e, + 0x3c, 0xd0, 0x71, 0x1b, 0x3d, 0x92, 0x5f, 0x56, 0xbd, 0x51, 0x29, 0x94, 0xbd, + 0x40, 0x81, 0x92, 0xbd, 0x04, 0x93, 0x82, 0xbd, 0x8c, 0xf7, 0x84, 0x3d, 0x8a, + 0x96, 0x85, 0xbd, 0x2a, 0x93, 0x3b, 0xba, 0xc7, 0x7c, 0x3b, 0xbd, 0xb0, 0x3d, + 0x50, 0x3d, 0xa0, 0xcb, 0x42, 0x3d, 0xad, 0x3c, 0x16, 0xbc, 0x59, 0xaa, 
0x30, + 0xbd, 0xcd, 0x10, 0x91, 0xbc, 0xe8, 0xea, 0x35, 0xbd, 0x53, 0x63, 0x36, 0xbd, + 0xa9, 0x85, 0x82, 0x3c, 0x23, 0xbd, 0x36, 0xbd, 0x25, 0x81, 0xe9, 0x3c, 0x76, + 0x54, 0x6d, 0x3d, 0xc1, 0x4f, 0x69, 0xbd, 0x55, 0x6c, 0x8f, 0x3d, 0xd5, 0x0a, + 0x7d, 0xbd, 0x48, 0xbe, 0xd2, 0x3c, 0x5b, 0xce, 0x84, 0x3d, 0xaa, 0x8e, 0x46, + 0xbc, 0x9c, 0x93, 0xc9, 0x3c, 0x66, 0xb1, 0x45, 0x3d, 0xf1, 0xc0, 0x90, 0xbc, + 0x2d, 0x09, 0x22, 0x3d, 0xcc, 0x52, 0x20, 0x3d, 0xaa, 0xec, 0x70, 0x3d, 0x3a, + 0xbd, 0xac, 0xbb, 0x70, 0x69, 0x81, 0x3d, 0x43, 0x3f, 0x8b, 0xbc, 0x46, 0x6a, + 0x04, 0xbd, 0xac, 0x25, 0x5a, 0xbd, 0xc2, 0xb9, 0x74, 0xbd, 0x35, 0x78, 0xeb, + 0x3c, 0xe2, 0x31, 0x54, 0xbd, 0xa0, 0xb1, 0xfe, 0x3c, 0xaf, 0xd2, 0xf8, 0x3c, + 0x00, 0x44, 0x82, 0x3a, 0x70, 0xcc, 0x91, 0xbd, 0x82, 0x1f, 0x57, 0xbd, 0xc2, + 0xe4, 0x03, 0x3d, 0xd0, 0xbd, 0x80, 0xbd, 0x7a, 0xde, 0x41, 0xbd, 0xe9, 0xf4, + 0x3b, 0x3c, 0xf9, 0x96, 0x1a, 0xbd, 0xe2, 0x2e, 0x46, 0xbd, 0xae, 0xbd, 0x34, + 0xbd, 0xb4, 0xa2, 0x8c, 0xbc, 0xa8, 0x0e, 0x30, 0xbd, 0x56, 0xf8, 0x33, 0xbd, + 0xce, 0x69, 0x35, 0x3d, 0x52, 0x2f, 0xeb, 0xbc, 0x9f, 0xe0, 0x0f, 0xbd, 0xc9, + 0x34, 0x29, 0xbd, 0x43, 0x26, 0x1e, 0x3d, 0xc8, 0x03, 0x05, 0x3c, 0x0f, 0x46, + 0x97, 0x3c, 0x18, 0x4c, 0x0c, 0xbd, 0xb8, 0xf9, 0x1c, 0xbd, 0xbd, 0x84, 0x86, + 0xbd, 0xbe, 0x50, 0xb1, 0xbc, 0x26, 0x15, 0x57, 0x3c, 0xca, 0x9f, 0x77, 0xbc, + 0xc0, 0xea, 0xca, 0xba, 0x23, 0xde, 0x41, 0xbd, 0x9d, 0xb4, 0x5c, 0xbd, 0x46, + 0x03, 0x30, 0xbd, 0xd0, 0xb3, 0x37, 0x3d, 0xfd, 0xe6, 0x3e, 0x3d, 0x8a, 0x0e, + 0x6a, 0xbd, 0xf8, 0x91, 0x64, 0x3d, 0xb4, 0x0b, 0x76, 0x3d, 0xf2, 0x94, 0x5f, + 0x3d, 0x98, 0xe6, 0x78, 0x3c, 0xc4, 0xab, 0x1e, 0xbd, 0xdd, 0xb6, 0x77, 0xbd, + 0x56, 0x1e, 0x8c, 0x3d, 0x0f, 0xee, 0x15, 0xbd, 0x42, 0xb6, 0x92, 0xbd, 0x2c, + 0xea, 0x96, 0xbc, 0x90, 0xc4, 0x30, 0xbd, 0x2e, 0xdc, 0xc8, 0xbb, 0xe4, 0x79, + 0xb0, 0xbc, 0x2e, 0xe6, 0x08, 0x3d, 0x74, 0x81, 0x34, 0x3d, 0xc0, 0xd5, 0x48, + 0xbc, 0xd3, 0xf2, 0x3c, 0xbd, 0x34, 0x47, 0xef, 0x3c, 0x9a, 0xcb, 0xe5, 0x3c, + 0xe0, 0x94, 0xef, 0xba, 0x80, 0x36, 0x23, 0xbc, 0x08, 0xf9, 0x35, 0xbd, 0x0f, + 0x9d, 0x99, 0xbd, 0x71, 0xdf, 0x2e, 0xbd, 0xb5, 0xa6, 0x78, 0xbd, 0xfa, 0xa8, + 0x69, 0x3d, 0x97, 0xc3, 0xda, 0xbb, 0x37, 0x74, 0xdf, 0x3c, 0x7f, 0xc2, 0x88, + 0xbd, 0x53, 0x20, 0xbe, 0x3b, 0x9c, 0x7a, 0xd9, 0x3c, 0xa9, 0x4b, 0x01, 0xbd, + 0xfb, 0xf7, 0x00, 0xbd, 0xd5, 0xda, 0x41, 0x3d, 0x9d, 0x2a, 0x82, 0x3d, 0x9a, + 0x03, 0x01, 0x3d, 0x38, 0xa7, 0x1b, 0x3d, 0x40, 0x75, 0xef, 0x3c, 0x4a, 0xdc, + 0x1b, 0xbc, 0xd1, 0x1a, 0x41, 0x3d, 0x04, 0xee, 0x74, 0x3d, 0xdb, 0x3f, 0x71, + 0xbd, 0x86, 0xc4, 0x22, 0x3d, 0x99, 0x74, 0x78, 0xbc, 0x48, 0x90, 0x54, 0xbd, + 0x88, 0xae, 0xf9, 0x3c, 0x4f, 0xbe, 0x10, 0x3d, 0x7d, 0x35, 0x68, 0xbd, 0xb3, + 0xf9, 0x3d, 0x3d, 0x1b, 0x89, 0x85, 0xbb, 0x85, 0x05, 0xae, 0x3c, 0xfd, 0x18, + 0x5b, 0xbd, 0x2d, 0xfa, 0x7f, 0xbd, 0x6e, 0xad, 0x8c, 0xbd, 0x67, 0x72, 0x28, + 0x3d, 0x2c, 0x8b, 0x9a, 0x3c, 0xb3, 0x94, 0x57, 0xbd, 0xa4, 0x3e, 0xa8, 0xbc, + 0xa6, 0x6a, 0x06, 0x3d, 0xf8, 0x03, 0x33, 0x3d, 0x56, 0xb0, 0x7a, 0xbd, 0x47, + 0x97, 0x68, 0xbc, 0xd0, 0x17, 0x7a, 0xbd, 0xe8, 0xab, 0x7d, 0xbd, 0xec, 0x67, + 0xf9, 0xbb, 0x3d, 0x92, 0x83, 0xbd, 0x36, 0xa4, 0x00, 0xbd, 0x00, 0x1b, 0x45, + 0x3a, 0x39, 0x13, 0x88, 0xbd, 0x05, 0x63, 0x26, 0x3c, 0x53, 0x7b, 0xc9, 0x3c, + 0x67, 0x97, 0x7a, 0xbb, 0xfe, 0x71, 0xd6, 0xbc, 0x24, 0x84, 0x1e, 0xbd, 0x02, + 0xa3, 0x76, 0x3d, 0xff, 0x16, 0x69, 0x3d, 0x80, 0xf0, 0x21, 0x3d, 0x90, 0x11, + 0x48, 0xbd, 0xc8, 0xa9, 0x3f, 0xbd, 0xc8, 0x06, 0x25, 0xbd, 0xaa, 0xfe, 0x96, + 0xbd, 0xa4, 0xbe, 0x57, 
0xbc, 0x6e, 0x82, 0x1d, 0x3d, 0xd6, 0xfa, 0x66, 0xbb, + 0x9a, 0x25, 0x20, 0x3d, 0xa3, 0x94, 0x27, 0xbb, 0x23, 0x2f, 0xcd, 0x3c, 0x5e, + 0xa4, 0x4e, 0x3d, 0x2a, 0x3b, 0x09, 0xbd, 0x4a, 0x40, 0x6f, 0x3d, 0xfe, 0xd8, + 0xe4, 0x3c, 0xab, 0xce, 0x56, 0xbd, 0x1d, 0x9a, 0x65, 0x3d, 0xb6, 0xf5, 0x76, + 0xbd, 0x88, 0x3d, 0x52, 0x3d, 0x0f, 0x1c, 0x50, 0xbd, 0x1d, 0x0d, 0x6a, 0x3d, + 0x99, 0x66, 0x98, 0xbd, 0x6e, 0xe2, 0xb9, 0x3c, 0x4c, 0x26, 0x82, 0xbd, 0xe2, + 0x3f, 0x65, 0xbd, 0x09, 0xa4, 0x8a, 0x3c, 0x19, 0x7d, 0x7d, 0xbd, 0xe6, 0xf8, + 0x1d, 0xbd, 0xfc, 0xe2, 0xee, 0xbc, 0x1d, 0xab, 0x89, 0x3d, 0x8e, 0xb4, 0xfe, + 0xbc, 0x68, 0x9c, 0x83, 0x3c, 0xf7, 0xa9, 0x0b, 0xbd, 0x3c, 0xed, 0x92, 0x3c, + 0x90, 0x72, 0xa5, 0x3c, 0x02, 0xd9, 0x69, 0xbd, 0xa9, 0x64, 0x2a, 0xbb, 0x6d, + 0x20, 0xf5, 0xbc, 0x0e, 0x44, 0x37, 0xbd, 0xc7, 0xf0, 0xde, 0x3c, 0xb6, 0xdb, + 0x71, 0x3d, 0xea, 0x6b, 0xda, 0xbc, 0xc8, 0x8f, 0x1d, 0xbd, 0xb9, 0x43, 0x05, + 0xbd, 0x6c, 0x4a, 0x78, 0xbc, 0xc0, 0xc3, 0x82, 0x3b, 0x4b, 0x41, 0x49, 0xbd, + 0xc1, 0xfc, 0xcb, 0x3b, 0x93, 0x21, 0x8d, 0xbd, 0xcf, 0x67, 0x7a, 0xbd, 0x58, + 0x9d, 0xdb, 0x3c, 0xd3, 0x71, 0x03, 0x3d, 0xaf, 0x55, 0x84, 0x3d, 0x71, 0x0c, + 0x5d, 0xbd, 0x4c, 0x19, 0x89, 0x3c, 0x7f, 0x29, 0x8b, 0x3d, 0xf6, 0xcd, 0xa9, + 0x3c, 0xaa, 0x00, 0x4c, 0x3d, 0x2b, 0xaa, 0x19, 0xbc, 0x93, 0xde, 0x16, 0xb9, + 0xda, 0xaf, 0x90, 0xbb, 0xf6, 0xde, 0x48, 0x3d, 0x00, 0x08, 0x29, 0x3b, 0xb2, + 0xe0, 0x82, 0xbc, 0x84, 0xf3, 0x40, 0xbc, 0xd4, 0x75, 0x08, 0x3d, 0x88, 0xe7, + 0x64, 0xbd, 0x68, 0xd6, 0x95, 0x3c, 0x1b, 0x70, 0x3f, 0x3d, 0x64, 0xfa, 0xfd, + 0xbc, 0xfc, 0x82, 0x61, 0x3d, 0x8e, 0x6e, 0x11, 0xbd, 0x0a, 0x0a, 0x9f, 0xbc, + 0xb5, 0x1d, 0x68, 0x3c, 0x7d, 0x9f, 0x86, 0x3d, 0xe6, 0x3f, 0x83, 0x3d, 0xf9, + 0xd6, 0xfe, 0x3c, 0x68, 0x0c, 0x61, 0xbd, 0x65, 0x33, 0x27, 0x3d, 0x2c, 0xcf, + 0x68, 0x3d, 0xb0, 0xc0, 0x14, 0xbd, 0xb0, 0xb2, 0x81, 0x3d, 0xc0, 0x9c, 0x89, + 0xbc, 0xae, 0x60, 0x8e, 0xbd, 0x92, 0xdd, 0x91, 0xbd, 0xc9, 0x0b, 0x85, 0x3d, + 0xa4, 0x00, 0xb1, 0xbc, 0x80, 0x9d, 0xf8, 0x3c, 0x1d, 0xc1, 0x98, 0xbd, 0x3e, + 0x88, 0xcd, 0x3c, 0x67, 0xc9, 0x66, 0x3c, 0x00, 0x46, 0x64, 0xba, 0x80, 0x3e, + 0x19, 0xbd, 0x18, 0xe0, 0x20, 0x3c, 0x50, 0xcb, 0xc0, 0x3b, 0xe3, 0xf3, 0x8c, + 0xbc, 0xac, 0x02, 0xd6, 0x3c, 0xca, 0x7a, 0x45, 0x3d, 0x95, 0xab, 0x47, 0xbd, + 0xe6, 0x14, 0x55, 0x3d, 0x88, 0x82, 0x09, 0x3d, 0x1c, 0x74, 0x91, 0x3c, 0xbf, + 0x00, 0x2f, 0x3c, 0x8c, 0xfc, 0x96, 0xbd, 0xcb, 0xa8, 0x9e, 0xbb, 0xb5, 0x6b, + 0x42, 0x3d, 0x0f, 0xed, 0x99, 0xbd, 0x6a, 0x9e, 0x45, 0xba, 0x50, 0xa3, 0x2d, + 0xbc, 0x6a, 0x95, 0x52, 0x3d, 0x18, 0x66, 0xd7, 0xbb, 0x65, 0x63, 0x7c, 0xbd, + 0xfe, 0xa8, 0xe1, 0xbc, 0x48, 0x89, 0x50, 0xbd, 0x64, 0x1d, 0xbe, 0x3c, 0x54, + 0xe9, 0x07, 0x3d, 0x2f, 0x27, 0x2b, 0x3d, 0x55, 0x02, 0x00, 0x3d, 0xb2, 0xbe, + 0x53, 0xbd, 0xd8, 0x03, 0x72, 0xbd, 0xd4, 0x63, 0x69, 0x3d, 0x1c, 0x9b, 0x7c, + 0xbd, 0x87, 0x6b, 0x83, 0xbd, 0xc8, 0x0e, 0x0f, 0xbd, 0xed, 0x88, 0x30, 0xbd, + 0xce, 0x02, 0x31, 0xbd, 0xae, 0xdd, 0x17, 0xbd, 0x03, 0x61, 0x43, 0xbd, 0xcf, + 0xd3, 0x03, 0xbd, 0x56, 0x0b, 0x57, 0xbd, 0x85, 0x33, 0x0d, 0xbd, 0x36, 0x8f, + 0x0b, 0xbd, 0x8e, 0x7d, 0x2c, 0xbc, 0x99, 0x21, 0x40, 0xbd, 0x9b, 0xf2, 0x62, + 0xbb, 0xcc, 0xaf, 0x3f, 0x3d, 0x3f, 0xc0, 0xab, 0x3c, 0xc1, 0x4d, 0x27, 0x3c, + 0x4b, 0x78, 0x30, 0x3d, 0x04, 0x65, 0xfe, 0x3b, 0xbe, 0x78, 0xb0, 0xbc, 0x9a, + 0xb9, 0xe8, 0xbc, 0x58, 0x9c, 0x5d, 0x3d, 0x95, 0x93, 0x65, 0x3d, 0xd9, 0xa8, + 0x41, 0xbd, 0x91, 0xb5, 0x36, 0x3d, 0x48, 0xc5, 0x84, 0xbd, 0xf8, 0x98, 0x3c, + 0x3c, 0x07, 0x2e, 0x96, 0xbd, 0xf2, 0xa1, 0x2b, 0xba, 
0xdc, 0xa1, 0x10, 0xbd, + 0x3a, 0xa4, 0xdb, 0xbc, 0x03, 0x75, 0x63, 0xbd, 0x5f, 0x46, 0x3d, 0x3a, 0x75, + 0x7d, 0x56, 0x3d, 0x68, 0x12, 0xa8, 0xbc, 0x03, 0xf5, 0x98, 0xbd, 0xe0, 0x3c, + 0xe7, 0xbc, 0x90, 0xb6, 0xbb, 0xbb, 0x48, 0x0e, 0x08, 0x3d, 0x68, 0x30, 0x35, + 0x3c, 0xb4, 0x17, 0xcf, 0x3c, 0xf9, 0xd9, 0xf8, 0x3c, 0xc8, 0x7e, 0x09, 0xbc, + 0x84, 0xde, 0x45, 0xbd, 0xfe, 0xad, 0xf7, 0xbc, 0xdb, 0x10, 0x8b, 0xbd, 0x65, + 0xac, 0x40, 0x3d, 0x2f, 0xc7, 0x12, 0x3c, 0x60, 0x81, 0x62, 0x3d, 0x96, 0xbd, + 0xf6, 0x3c, 0xee, 0x7e, 0x80, 0x3d, 0x76, 0x78, 0x25, 0x3d, 0xec, 0x17, 0x1b, + 0xbc, 0x17, 0xa7, 0x2f, 0xbd, 0x5c, 0x17, 0x4e, 0x3d, 0x92, 0x4e, 0x99, 0xbb, + 0xe6, 0xec, 0x1d, 0xbd, 0xcf, 0xd4, 0x15, 0x3d, 0x36, 0x68, 0xcb, 0x3c, 0x05, + 0xd3, 0x68, 0x3c, 0x4d, 0x37, 0x96, 0x3c, 0x85, 0x4b, 0x98, 0x3b, 0x3e, 0xf9, + 0x6a, 0x3d, 0x42, 0xd5, 0x85, 0xbc, 0x35, 0xf1, 0x48, 0xbd, 0xae, 0x5a, 0x69, + 0x3b, 0xfc, 0xc3, 0x81, 0xbd, 0x3d, 0xe3, 0x71, 0xbd, 0xdb, 0x3b, 0x18, 0xbd, + 0x40, 0x90, 0x26, 0xbd, 0x5d, 0xef, 0x80, 0xbc, 0x94, 0x89, 0x9a, 0xbc, 0x96, + 0x7a, 0x33, 0xbd, 0x94, 0x61, 0x71, 0x3d, 0xe6, 0xaf, 0x5a, 0x3d, 0x5f, 0x3d, + 0x6a, 0x3b, 0x22, 0xcf, 0x23, 0xbc, 0xb1, 0x6f, 0x4b, 0xbb, 0x9a, 0x4b, 0xbe, + 0x3c, 0xd7, 0x02, 0x95, 0xbc, 0xb5, 0xfa, 0x4b, 0xbd, 0x8d, 0x7e, 0x85, 0xbc, + 0x12, 0x0b, 0x3c, 0x3d, 0xa5, 0x2c, 0xfc, 0xbb, 0xb0, 0xcc, 0xb2, 0xbb, 0xf2, + 0x03, 0x4a, 0xbd, 0x87, 0xe3, 0x1d, 0xbd, 0xcc, 0xd7, 0xed, 0x3c, 0x16, 0x63, + 0x73, 0xbc, 0x18, 0x4e, 0x47, 0x3d, 0x70, 0x95, 0x37, 0xbd, 0xfb, 0xdd, 0xc4, + 0x3c, 0x3d, 0x65, 0xfb, 0x3c, 0x96, 0xa0, 0x84, 0x3d, 0x60, 0x19, 0xff, 0xbb, + 0xa4, 0xbf, 0x4b, 0x3c, 0x5b, 0x63, 0x03, 0xbd, 0x8d, 0x86, 0xcb, 0xbb, 0x62, + 0xee, 0x76, 0xbd, 0x9c, 0x16, 0x73, 0x3d, 0x4f, 0xd8, 0x81, 0x3d, 0xe2, 0x7d, + 0xba, 0xbc, 0xd6, 0x7a, 0xb4, 0x3b, 0x61, 0x45, 0x87, 0x3d, 0xe1, 0x5e, 0x8a, + 0xbd, 0xfc, 0x1f, 0xc0, 0xbc, 0xc0, 0x87, 0x14, 0xbd, 0x3d, 0x53, 0x16, 0x3d, + 0x86, 0x91, 0x17, 0x3c, 0xa6, 0x1a, 0x71, 0xbc, 0xe7, 0x57, 0xf9, 0xbc, 0x27, + 0x13, 0x87, 0x3d, 0x98, 0x4e, 0x02, 0x3d, 0xe5, 0x9d, 0x13, 0x3d, 0x89, 0xbf, + 0x2e, 0x3c, 0xa0, 0x5f, 0x21, 0x3b, 0x80, 0xc1, 0xf4, 0x3b, 0x14, 0x22, 0x2a, + 0xbc, 0x33, 0xd3, 0x93, 0x3c, 0xd7, 0x3d, 0x6e, 0x3d, 0x2e, 0xcd, 0x81, 0xbd, + 0x71, 0xa3, 0x45, 0xbd, 0xde, 0xd6, 0x4f, 0x3d, 0xb7, 0xe7, 0x41, 0xbd, 0x27, + 0x86, 0xd6, 0x3c, 0x6b, 0x72, 0x85, 0x3d, 0x6d, 0x89, 0x11, 0xbd, 0x21, 0x7b, + 0x1a, 0xbd, 0x18, 0xf1, 0x38, 0xbd, 0xc3, 0xf7, 0xb1, 0x3c, 0xd7, 0xa0, 0x8e, + 0xbd, 0x6e, 0x16, 0x24, 0x3d, 0xc2, 0x2b, 0x2f, 0x3d, 0xc8, 0x1c, 0x82, 0x3c, + 0x53, 0x30, 0x24, 0xbc, 0xd9, 0x49, 0x1f, 0xbd, 0xea, 0x81, 0x3f, 0x3d, 0xc4, + 0xb7, 0x1a, 0x3d, 0xc3, 0x0a, 0x0b, 0xbd, 0x29, 0x5d, 0x88, 0x3d, 0x3f, 0xb6, + 0x9f, 0xbc, 0x97, 0x16, 0x72, 0xbd, 0x67, 0x40, 0xa4, 0xbc, 0x67, 0x64, 0x59, + 0xbc, 0xd0, 0x90, 0xfd, 0xbc, 0x48, 0xa3, 0x1b, 0xbd, 0x5f, 0x6c, 0xf2, 0x3c, + 0xe4, 0x81, 0x97, 0xbd, 0x2b, 0xe9, 0x86, 0x3d, 0x6c, 0xa1, 0x06, 0xbd, 0xa8, + 0x7c, 0x2a, 0x3c, 0x07, 0xca, 0x8d, 0x3b, 0x1f, 0x0c, 0x21, 0xbd, 0xb0, 0x7f, + 0x90, 0xbd, 0xe5, 0x3f, 0x17, 0x3d, 0x03, 0x58, 0x43, 0xbd, 0xe7, 0x24, 0x42, + 0xbd, 0xdd, 0xf2, 0x95, 0xbd, 0x58, 0xd0, 0xd9, 0x3c, 0xa9, 0xbe, 0x00, 0x3d, + 0x40, 0x4c, 0x97, 0xbd, 0x06, 0x0f, 0x63, 0xbd, 0x44, 0x04, 0x42, 0xbd, 0x69, + 0xfa, 0xd6, 0xbb, 0x40, 0x95, 0xca, 0xba, 0xba, 0x29, 0x80, 0xbd, 0x40, 0x04, + 0x8f, 0xbd, 0x9b, 0xd2, 0x71, 0xbd, 0x16, 0x0f, 0x36, 0xbd, 0xcf, 0xe9, 0x77, + 0x3d, 0x00, 0x20, 0xe2, 0xb8, 0x77, 0xed, 0x89, 0xba, 0x27, 0x9d, 0x7d, 0xbd, + 0x8b, 
0x7d, 0xa1, 0x3c, 0xaf, 0x02, 0x41, 0xbd, 0x76, 0x0a, 0x80, 0xbd, 0xc5, + 0xbe, 0x0c, 0x3c, 0x65, 0xbc, 0x53, 0x3c, 0x23, 0x57, 0x71, 0x3d, 0x4c, 0x69, + 0xad, 0x3c, 0xe6, 0x35, 0x70, 0xbd, 0x4a, 0x71, 0x0f, 0x3d, 0x60, 0x74, 0x60, + 0xbd, 0x00, 0x21, 0xff, 0xbc, 0x2e, 0x9e, 0x15, 0xbd, 0x5b, 0xfa, 0xfb, 0xbc, + 0x70, 0x17, 0xe6, 0x3c, 0xb8, 0x5a, 0x03, 0x3d, 0x26, 0x71, 0x82, 0x3d, 0x40, + 0xf1, 0xe2, 0xbb, 0xad, 0xa1, 0x7d, 0xbd, 0xbb, 0x38, 0xb0, 0xbc, 0xa8, 0x2e, + 0x18, 0x3d, 0x29, 0xe4, 0x01, 0xbd, 0x3d, 0xed, 0x75, 0xbc, 0xc1, 0x90, 0x09, + 0x3d, 0x7a, 0x35, 0xf9, 0xbc, 0x0a, 0x1f, 0x8e, 0xbc, 0x7b, 0x9e, 0x05, 0xbc, + 0x00, 0xe1, 0x18, 0x3c, 0x90, 0xf1, 0xc1, 0xbc, 0xbc, 0xfc, 0x87, 0x3d, 0x28, + 0x2a, 0x48, 0x3c, 0xcf, 0x41, 0xf4, 0xbc, 0xa3, 0x20, 0x7a, 0xbd, 0x58, 0x65, + 0x0c, 0x3b, 0x5b, 0x8e, 0xd7, 0xbc, 0x09, 0x03, 0x87, 0x3d, 0xfa, 0xcf, 0xaa, + 0xbc, 0x12, 0x45, 0x83, 0xbd, 0x29, 0x24, 0x89, 0xbd, 0x77, 0x6e, 0x98, 0xbd, + 0x50, 0xf7, 0x91, 0xbb, 0x3e, 0x17, 0x86, 0x3c, 0xcf, 0x82, 0x54, 0x3d, 0x12, + 0x48, 0xff, 0xbb, 0xa8, 0x39, 0xa6, 0x3c, 0x57, 0xfc, 0xb4, 0xbc, 0xc5, 0x25, + 0x30, 0xbd, 0xcd, 0xbc, 0x04, 0xbd, 0x10, 0x87, 0xb4, 0xbc, 0x16, 0x7b, 0x6e, + 0xbd, 0xba, 0x00, 0x5f, 0xbd, 0xf8, 0x14, 0xac, 0x3c, 0xdf, 0x4d, 0x88, 0xbd, + 0x2e, 0xd2, 0xb6, 0xbc, 0x8e, 0x7a, 0x8e, 0xbd, 0xac, 0xdb, 0xe2, 0x3c, 0x7b, + 0x12, 0x8b, 0x3d, 0x03, 0xe2, 0x91, 0xbd, 0x43, 0xac, 0x3c, 0xbc, 0x5a, 0xc7, + 0x52, 0x3d, 0x5e, 0xec, 0x40, 0x3d, 0x1a, 0xb0, 0x1f, 0xbc, 0x1d, 0x9c, 0x92, + 0xbd, 0xd3, 0x03, 0xfd, 0x3c, 0xdd, 0x22, 0x0a, 0xbb, 0xe2, 0x2a, 0x89, 0x3d, + 0x94, 0xb6, 0xd4, 0xbb, 0x74, 0x26, 0xb8, 0xbc, 0xc6, 0x7a, 0x35, 0xbd, 0xa8, + 0xb7, 0x8e, 0xbd, 0xbe, 0x94, 0x36, 0xbd, 0x22, 0xc0, 0x03, 0xbd, 0x40, 0xb4, + 0xe5, 0x3a, 0x53, 0xb5, 0x14, 0xbc, 0xac, 0x00, 0x3a, 0xbc, 0xb3, 0xd9, 0xee, + 0x3c, 0xb5, 0x7c, 0xae, 0xbb, 0xd6, 0xb2, 0x75, 0x3c, 0x2f, 0x0e, 0x1a, 0xbd, + 0xf0, 0xb2, 0x47, 0xbd, 0xad, 0x36, 0x50, 0xbb, 0x19, 0x86, 0x36, 0xbd, 0xb4, + 0x02, 0xe4, 0xbc, 0xe2, 0x37, 0x10, 0x3d, 0x17, 0xcb, 0x86, 0xbd, 0x33, 0x35, + 0x5e, 0x3c, 0x63, 0xfe, 0x8f, 0x3d, 0x8e, 0x91, 0x6c, 0xbd, 0xf8, 0x55, 0x6f, + 0x3c, 0x60, 0xc0, 0xb6, 0x3c, 0x09, 0x23, 0x8d, 0xbd, 0x75, 0xae, 0x89, 0x3d, + 0x4e, 0xb2, 0x76, 0x3d, 0xbc, 0x52, 0x57, 0xbd, 0x5c, 0xf2, 0xde, 0xbc, 0x5a, + 0xc5, 0xc5, 0xbc, 0x01, 0xbf, 0x1a, 0xbd, 0xc4, 0x10, 0x37, 0xbd, 0xe9, 0xe5, + 0x7a, 0x3b, 0xa0, 0x03, 0x58, 0xbd, 0x4f, 0xe4, 0x66, 0x3d, 0xbd, 0xc0, 0xa8, + 0xbc, 0xd0, 0x05, 0xb9, 0x3c, 0xd3, 0xb7, 0xd9, 0x3c, 0xf2, 0x28, 0x2d, 0x3d, + 0x69, 0x78, 0x38, 0xbd, 0x55, 0x58, 0x49, 0xbc, 0xc5, 0x5b, 0xc2, 0x3c, 0x67, + 0x0d, 0x40, 0x3d, 0x02, 0xec, 0x2b, 0x3d, 0x60, 0x6a, 0xac, 0x3c, 0x6a, 0x9c, + 0x65, 0x3d, 0x19, 0x18, 0x4d, 0xbd, 0x05, 0xaf, 0xbd, 0xbc, 0x22, 0x2b, 0x54, + 0xbd, 0x1d, 0x0c, 0xd9, 0xbc, 0x0a, 0xf7, 0xfd, 0x3a, 0x5a, 0x18, 0x23, 0x3d, + 0xeb, 0xfc, 0x84, 0xbd, 0xaf, 0x71, 0x0c, 0xbc, 0x98, 0x72, 0x5e, 0x3c, 0x18, + 0x8b, 0x88, 0x3c, 0xa4, 0x1d, 0x8f, 0xbb, 0x3c, 0x3d, 0xbf, 0xbc, 0x18, 0x7a, + 0xc7, 0x3c, 0x2e, 0x1c, 0x77, 0xbd, 0x50, 0x47, 0x55, 0x3c, 0x5c, 0xa7, 0x23, + 0xbc, 0x0c, 0x4e, 0xda, 0x3c, 0x00, 0x25, 0x7f, 0x3d, 0xdc, 0xbd, 0x85, 0xbd, + 0xee, 0x84, 0x91, 0xbc, 0x0b, 0xcb, 0x81, 0x3d, 0x7a, 0x5f, 0x04, 0xbc, 0xde, + 0x3d, 0x7b, 0xbb, 0x05, 0xa9, 0x79, 0x3d, 0x6c, 0x47, 0x2e, 0xbd, 0x9a, 0x8c, + 0x7c, 0x3d, 0xee, 0xc6, 0x93, 0xbd, 0xaf, 0xd0, 0xd9, 0xbc, 0x33, 0x14, 0x3c, + 0xbd, 0xe3, 0x36, 0x6e, 0x3d, 0x0b, 0x9a, 0x55, 0xbc, 0xe9, 0x83, 0x84, 0x3d, + 0xd6, 0xb4, 0x6c, 0x3d, 0xc4, 0xea, 
0xd4, 0x3c, 0x48, 0xb4, 0x20, 0x3d, 0x6e, + 0xc9, 0x53, 0x3d, 0x4e, 0x95, 0xbb, 0xbc, 0x15, 0x0c, 0x86, 0x3d, 0xdc, 0x7a, + 0x40, 0xbd, 0x98, 0x24, 0x6d, 0xbc, 0x2f, 0xea, 0x8a, 0xbd, 0x78, 0x00, 0xb4, + 0x3c, 0x8f, 0x53, 0x52, 0x3d, 0xc2, 0xfb, 0x11, 0x3d, 0x10, 0x7e, 0x81, 0x3c, + 0xae, 0xf3, 0x3e, 0x3d, 0x34, 0x8d, 0xeb, 0x3c, 0x72, 0x86, 0xd6, 0xbc, 0xd5, + 0x02, 0xad, 0x3b, 0x9d, 0x1c, 0x41, 0xbd, 0xda, 0x6b, 0x23, 0x3d, 0xaf, 0xa0, + 0x2b, 0x3d, 0x91, 0xd9, 0x5c, 0x3d, 0xce, 0x13, 0x4c, 0xbd, 0xa8, 0x7a, 0x4a, + 0x3d, 0xfd, 0xc5, 0x29, 0xbd, 0xff, 0xa6, 0x50, 0xbd, 0x9d, 0x04, 0x43, 0x3d, + 0x49, 0x9f, 0x82, 0xbd, 0xe0, 0x8c, 0x87, 0xbd, 0xb7, 0xb5, 0x64, 0xbd, 0x5e, + 0x55, 0x27, 0x3d, 0x8d, 0xde, 0x41, 0x3d, 0x19, 0x6b, 0x23, 0xbc, 0x6f, 0x71, + 0xf6, 0x3c, 0x04, 0x56, 0x24, 0x3d, 0xb8, 0x20, 0x3a, 0x3c, 0x97, 0xb4, 0x91, + 0xbd, 0x87, 0xf5, 0x6d, 0x3d, 0x80, 0x5b, 0x9d, 0x3c, 0x70, 0x4c, 0xad, 0x3b, + 0xff, 0x49, 0x81, 0x3d, 0x88, 0x14, 0x89, 0xbc, 0x72, 0xde, 0x25, 0xbd, 0x62, + 0xa9, 0x21, 0x3d, 0x94, 0x43, 0x59, 0xbc, 0xb1, 0x5a, 0x92, 0x3d, 0x9d, 0x57, + 0x6b, 0x3c, 0x5d, 0xa8, 0x8d, 0x3d, 0xd7, 0xf7, 0x08, 0x3d, 0x1c, 0x07, 0xe3, + 0xbc, 0xdd, 0xfc, 0xb5, 0xbc, 0xbc, 0xca, 0x84, 0x3d, 0x5c, 0x9e, 0x18, 0xbd, + 0xd5, 0x6d, 0x86, 0x3d, 0x42, 0x2b, 0x58, 0x3c, 0x0a, 0xc6, 0x33, 0x3d, 0x2c, + 0x1e, 0xf6, 0xbc, 0xb8, 0x48, 0x46, 0xbd, 0x26, 0xd6, 0x88, 0xbd, 0xd8, 0x45, + 0x2e, 0x3d, 0x7f, 0x28, 0x4f, 0x3d, 0x52, 0x42, 0x40, 0xbc, 0xad, 0xc8, 0x45, + 0xbd, 0xaa, 0x1c, 0x27, 0xbd, 0x32, 0x83, 0x72, 0xbb, 0xd2, 0xc5, 0x33, 0x3b, + 0x1e, 0x2f, 0x6f, 0x3d, 0x9e, 0x5c, 0x1c, 0x3d, 0x2d, 0xfb, 0xc5, 0xbc, 0x3d, + 0x12, 0x68, 0x3b, 0xb4, 0x98, 0xe9, 0x3c, 0xb9, 0xbd, 0xdf, 0x3a, 0xe0, 0xac, + 0x2c, 0x3d, 0x10, 0x5c, 0x87, 0x3c, 0x80, 0xd6, 0x2d, 0xba, 0x18, 0x73, 0x94, + 0x3c, 0xb8, 0x3c, 0x39, 0xbc, 0x48, 0x64, 0xda, 0x3c, 0x54, 0xdf, 0x05, 0x3d, + 0x04, 0x35, 0xdf, 0x3c, 0xdb, 0xf8, 0xfb, 0xba, 0xc3, 0x2d, 0xc1, 0xb8, 0x0e, + 0x8c, 0xd1, 0x3c, 0x4f, 0x12, 0x14, 0x3d, 0x50, 0xbc, 0x7d, 0xbc, 0xc7, 0x20, + 0x88, 0xbd, 0x79, 0x45, 0x2f, 0xbd, 0x77, 0x83, 0x55, 0xbc, 0x42, 0x7e, 0x95, + 0xbd, 0x9d, 0xfb, 0x4d, 0xbd, 0x92, 0xcc, 0x89, 0xbd, 0x84, 0x1d, 0x03, 0xbd, + 0x1f, 0xe1, 0x86, 0xbb, 0xca, 0xee, 0x4e, 0x3c, 0x15, 0x39, 0x55, 0xbd, 0x94, + 0x4b, 0x87, 0xbd, 0xf3, 0xf0, 0x0d, 0xbd, 0x4d, 0x17, 0x7b, 0x3d, 0xe5, 0x0b, + 0x95, 0xbc, 0x10, 0x50, 0x20, 0xbd, 0x60, 0x74, 0x7c, 0xbd, 0x50, 0x76, 0xad, + 0xbc, 0xdd, 0x59, 0x89, 0x3c, 0xa1, 0xcc, 0x10, 0x3d, 0x23, 0x4c, 0x37, 0x3c, + 0x50, 0x0e, 0xa6, 0x3c, 0x02, 0x0e, 0x24, 0xbd, 0x9d, 0x9f, 0x40, 0xbd, 0xba, + 0xe1, 0x51, 0xbd, 0x9e, 0xe5, 0x2a, 0xbd, 0x44, 0x07, 0xc8, 0x3c, 0xc0, 0x11, + 0x85, 0x3c, 0x1c, 0xde, 0x40, 0xbd, 0x34, 0xd3, 0xe3, 0x3c, 0xf1, 0xae, 0xdb, + 0xbc, 0xea, 0xbb, 0xf0, 0xbc, 0x32, 0x81, 0xb7, 0x3c, 0x1b, 0xe9, 0x4f, 0xbd, + 0x47, 0xd3, 0xb7, 0xbc, 0xc4, 0x4b, 0xe7, 0xbc, 0xf3, 0x52, 0x3b, 0x3d, 0x10, + 0xb8, 0xb6, 0x3b, 0x0b, 0xb8, 0x33, 0xbc, 0xb1, 0xba, 0x29, 0x3d, 0x93, 0xfc, + 0x00, 0xbd, 0xdf, 0x63, 0x30, 0xbd, 0xac, 0x1d, 0x1e, 0x3d, 0x52, 0xf7, 0x15, + 0xbd, 0x7f, 0xea, 0x53, 0xbd, 0x29, 0xe4, 0x2f, 0xbc, 0x5e, 0xf0, 0xb7, 0x3c, + 0xb1, 0xff, 0x09, 0xbd, 0xc9, 0x0f, 0xae, 0x3c, 0x5a, 0xc0, 0x06, 0xbd, 0x34, + 0x15, 0x10, 0xbd, 0x76, 0xea, 0x95, 0xbc, 0x60, 0xd8, 0x2d, 0x3c, 0x4c, 0x12, + 0x77, 0xbc, 0x2d, 0xb6, 0x88, 0x3d, 0x7f, 0x15, 0xe4, 0x3c, 0xb0, 0xef, 0xf0, + 0xbc, 0x79, 0x32, 0x1c, 0xbd, 0x4d, 0xbc, 0x4b, 0xbd, 0xae, 0x6d, 0x64, 0x3d, + 0x0c, 0x44, 0x82, 0xbc, 0x15, 0x4f, 0x3e, 0xbd, 0x86, 0x54, 0xab, 
0xbc, 0x78, + 0xea, 0x0d, 0xbd, 0x73, 0xc6, 0x87, 0xbd, 0x06, 0xed, 0x32, 0xbd, 0xfd, 0x03, + 0x8a, 0xbd, 0x89, 0x8b, 0x30, 0xbd, 0x40, 0x73, 0x0d, 0xbd, 0xcf, 0x80, 0x84, + 0xbd, 0x3c, 0x00, 0x69, 0xbd, 0xeb, 0x8a, 0xf8, 0x3b, 0xc1, 0xa4, 0x93, 0xbd, + 0x25, 0x74, 0x69, 0xbd, 0x11, 0xe5, 0x00, 0x3d, 0x2d, 0xa0, 0x01, 0x3d, 0xf9, + 0x7d, 0x02, 0xbc, 0x55, 0x26, 0x30, 0x3d, 0xad, 0xf7, 0x50, 0x3c, 0xd6, 0xb1, + 0x68, 0x3d, 0xce, 0x49, 0x71, 0xbd, 0xcf, 0xde, 0xaa, 0x3b, 0x5d, 0x6e, 0x91, + 0xbd, 0xb4, 0xf1, 0x1a, 0xbd, 0xc7, 0xeb, 0xc2, 0x3c, 0x50, 0x74, 0xd4, 0xbb, + 0xe8, 0x25, 0x1f, 0x3d, 0xdb, 0x0a, 0x8e, 0xbc, 0x9d, 0x5d, 0x73, 0xbd, 0x70, + 0xce, 0x01, 0xbc, 0xc4, 0x22, 0x84, 0x3d, 0x80, 0x3b, 0x1d, 0x3c, 0x3d, 0xfa, + 0x15, 0xbd, 0x45, 0xd7, 0x9a, 0xbd, 0x4d, 0xa2, 0x4e, 0xbd, 0x41, 0x6e, 0x96, + 0xbc, 0xbf, 0xe4, 0x6c, 0x3d, 0x90, 0x3c, 0x21, 0x3d, 0x99, 0x76, 0x83, 0x3c, + 0xe1, 0xb9, 0x6f, 0x3d, 0x24, 0xb9, 0xcf, 0xbc, 0xc0, 0x33, 0xee, 0xbb, 0x8d, + 0xa6, 0xf0, 0xbc, 0x40, 0x81, 0x3f, 0x3d, 0x43, 0x82, 0x7e, 0x3c, 0xfa, 0x13, + 0x7a, 0x3d, 0x91, 0xcd, 0x0a, 0xbc, 0x80, 0x3e, 0x61, 0x3d, 0x65, 0xef, 0x56, + 0xbd, 0x44, 0x57, 0x90, 0xbd, 0xb4, 0x86, 0x7a, 0x3c, 0x70, 0xf5, 0xbd, 0x3c, + 0x90, 0x5c, 0xdc, 0x3c, 0x13, 0xe5, 0xeb, 0xbc, 0x30, 0x7a, 0x48, 0x3d, 0xfa, + 0x4c, 0xbe, 0x3c, 0x4d, 0x35, 0x2e, 0xbd, 0x32, 0x33, 0xdb, 0xbc, 0xab, 0x4c, + 0x0a, 0xbd, 0x12, 0x58, 0xad, 0xbc, 0x20, 0x07, 0x0c, 0x3c, 0xbc, 0xb5, 0xa6, + 0x3c, 0xb6, 0x70, 0x8f, 0xbd, 0xbc, 0x9a, 0x57, 0x3d, 0xb3, 0x6f, 0x82, 0xbd, + 0x52, 0xb9, 0x5c, 0x3c, 0x0d, 0x71, 0xd9, 0x3c, 0x18, 0x70, 0x0a, 0x3d, 0x80, + 0x7b, 0x0a, 0x3b, 0xee, 0x75, 0x27, 0xbc, 0x63, 0x74, 0x56, 0xbd, 0xf0, 0x20, + 0x5f, 0x3b, 0xfb, 0x77, 0x1e, 0xba, 0xb8, 0x6c, 0xee, 0x3c, 0x01, 0xd0, 0xef, + 0x3c, 0xb2, 0x68, 0x12, 0xbd, 0x51, 0xf6, 0x3c, 0xbd, 0x12, 0xb0, 0x2e, 0xbd, + 0x11, 0xfd, 0x5e, 0xbd, 0x48, 0xea, 0xb4, 0xbc, 0xce, 0xca, 0x88, 0x3d, 0x38, + 0x57, 0x40, 0x3d, 0x11, 0xfa, 0x8b, 0x3d, 0xc0, 0x34, 0x36, 0x3d, 0xe4, 0x82, + 0x8e, 0xbd, 0xbd, 0x95, 0x59, 0xbd, 0xf0, 0x8b, 0x43, 0xbd, 0x93, 0x9b, 0x0a, + 0xbc, 0xb7, 0x99, 0x4d, 0x3c, 0x46, 0x42, 0x1d, 0x3d, 0x00, 0x19, 0x3a, 0xbd, + 0x1c, 0xd3, 0x5a, 0xbd, 0xff, 0x09, 0x02, 0xbd, 0xa1, 0x01, 0x8e, 0x3d, 0xc3, + 0x9e, 0xd8, 0xbb, 0x28, 0xb5, 0x2d, 0x3d, 0x56, 0x9c, 0x16, 0x3d, 0x78, 0xe6, + 0x1e, 0xbc, 0x06, 0x56, 0x14, 0x3d, 0xbc, 0x3f, 0x88, 0xbd, 0x34, 0x45, 0x94, + 0xbc, 0xfb, 0xb1, 0x0a, 0xbd, 0x67, 0x87, 0x90, 0xbd, 0x4d, 0x75, 0x27, 0xbd, + 0x9f, 0xc8, 0x60, 0x3b, 0x02, 0xc4, 0xb0, 0xbc, 0x54, 0x5b, 0x5f, 0xbd, 0xe3, + 0x43, 0xff, 0xbc, 0xf6, 0xf7, 0x39, 0xbc, 0x99, 0x4c, 0x82, 0xbd, 0xda, 0x99, + 0xa9, 0x3b, 0x6a, 0xd5, 0xee, 0xbc, 0x1e, 0xc1, 0x93, 0xbd, 0xc2, 0x21, 0x52, + 0xbc, 0x52, 0xfc, 0x06, 0xbc, 0x70, 0x59, 0x85, 0xbd, 0x5d, 0xbd, 0x8a, 0xbd, + 0xe2, 0x10, 0x77, 0x3d, 0x36, 0x83, 0x90, 0xbd, 0x66, 0x9f, 0x90, 0xbc, 0x30, + 0x78, 0x4c, 0x3d, 0xd4, 0x2c, 0x8b, 0x3c, 0xe0, 0x8b, 0x4e, 0xbc, 0x31, 0x0f, + 0x80, 0xbd, 0x4a, 0xb7, 0x5b, 0xbd, 0x52, 0xd0, 0x1a, 0xbd, 0x5c, 0x20, 0xe3, + 0x3c, 0x5a, 0x77, 0x29, 0xbd, 0x90, 0x0b, 0x00, 0xbd, 0x62, 0x10, 0x4c, 0x3d, + 0x40, 0x52, 0x58, 0x3c, 0x18, 0x5e, 0x46, 0x3c, 0xc6, 0x6b, 0x37, 0x3d, 0x17, + 0x5c, 0x90, 0x3d, 0x28, 0x6c, 0xfd, 0xbc, 0x7e, 0x4b, 0x28, 0xbd, 0x86, 0x7b, + 0x1d, 0xbd, 0x2b, 0x78, 0x83, 0x3d, 0x48, 0x65, 0x53, 0x3d, 0x91, 0x41, 0x7b, + 0xbd, 0x0a, 0x32, 0x65, 0xbd, 0x80, 0xb5, 0x83, 0xbd, 0x93, 0x10, 0x8b, 0x3d, + 0x40, 0xc2, 0x9b, 0x3a, 0xe8, 0xe9, 0xcc, 0x3c, 0xb8, 0xf5, 0x00, 0x3d, 0x2a, + 0x60, 0x70, 0x3d, 
0xbb, 0xa9, 0x18, 0xbd, 0xbf, 0xca, 0x76, 0xbd, 0xf4, 0x83, + 0xda, 0xbc, 0xcc, 0x89, 0xeb, 0x3c, 0xa0, 0x01, 0x27, 0xbb, 0x90, 0x98, 0x1e, + 0x3d, 0x2d, 0x7a, 0x91, 0xbd, 0x00, 0x8e, 0x71, 0xbd, 0xc7, 0x30, 0x1a, 0xbd, + 0x22, 0xe9, 0x3d, 0x3d, 0x1a, 0xb3, 0x46, 0x3d, 0xbe, 0x20, 0x5a, 0x3d, 0x02, + 0x34, 0x0b, 0xbd, 0x8d, 0x91, 0x5c, 0xbd, 0x84, 0xeb, 0xdc, 0xbc, 0xaa, 0x4b, + 0xd6, 0xbc, 0xab, 0xd1, 0x91, 0x3d, 0xb8, 0x2c, 0x95, 0x3c, 0x0c, 0xf7, 0x59, + 0x3d, 0xc9, 0xea, 0x8e, 0xbd, 0x23, 0xb1, 0x83, 0xbd, 0x27, 0x20, 0x85, 0xbd, + 0x40, 0xdb, 0xaa, 0x3a, 0x4c, 0x7b, 0x48, 0xbc, 0x00, 0x62, 0x9d, 0x3b, 0xaf, + 0xeb, 0x83, 0x3d, 0xe0, 0x4e, 0x1d, 0x3b, 0x90, 0xf9, 0xdc, 0xbc, 0xd6, 0x49, + 0x60, 0x3d, 0x4e, 0x96, 0x66, 0x3d, 0xbe, 0x9e, 0x9b, 0xbc, 0xec, 0x9e, 0xff, + 0x3c, 0xd0, 0xa1, 0x0b, 0x3d, 0xb4, 0x2d, 0x39, 0x3d, 0x28, 0x62, 0x9a, 0x3c, + 0xce, 0xdc, 0x67, 0x3d, 0xe8, 0xb6, 0x68, 0x3c, 0xb6, 0x37, 0x87, 0xbd, 0xee, + 0xd3, 0x67, 0x3d, 0x18, 0xfb, 0x31, 0x3c, 0x27, 0x89, 0x26, 0xbd, 0x30, 0x9e, + 0xc0, 0x3c, 0xd0, 0x5b, 0x30, 0xbd, 0x90, 0x96, 0x33, 0x3c, 0x1e, 0xf8, 0x20, + 0xbd, 0x48, 0xa2, 0xa2, 0x3c, 0x2e, 0x6b, 0x3f, 0xbd, 0x32, 0x37, 0x1e, 0x3d, + 0x10, 0x9e, 0x26, 0xbd, 0x1c, 0xd5, 0x60, 0xbd, 0xf5, 0x5f, 0x06, 0xbd, 0x87, + 0xff, 0x71, 0xbd, 0x1d, 0xba, 0x8c, 0xbd, 0x00, 0xe0, 0x8c, 0xba, 0x20, 0x94, + 0x0d, 0xbc, 0x5a, 0x15, 0x84, 0xbc, 0x36, 0x58, 0x50, 0x3d, 0x7a, 0x21, 0x5c, + 0x3d, 0x78, 0x57, 0x39, 0xbd, 0x8d, 0x3b, 0x59, 0xbd, 0x90, 0x90, 0x80, 0xbb, + 0xf0, 0x93, 0xbe, 0x3b, 0x50, 0x34, 0xe1, 0xbb, 0xc0, 0xac, 0xd3, 0xba, 0x42, + 0x75, 0xb4, 0xbc, 0x38, 0xaa, 0x30, 0xbd, 0xa6, 0x79, 0x49, 0x3d, 0xfc, 0xd2, + 0x37, 0xbc, 0xe0, 0x0d, 0xd6, 0xbb, 0xc1, 0x2d, 0x73, 0xbd, 0x4a, 0xf1, 0x5b, + 0xbd, 0xd4, 0x0c, 0x82, 0x3c, 0xce, 0x51, 0x0c, 0xbd, 0xe0, 0x9c, 0x4e, 0xbd, + 0x3e, 0x98, 0x6a, 0x3d, 0x7e, 0xbf, 0x27, 0x3d, 0x00, 0xb2, 0x6f, 0xbd, 0x0c, + 0xcd, 0x4d, 0x3d, 0xfa, 0x7b, 0x22, 0x3d, 0x18, 0x3f, 0x02, 0xbc, 0xa4, 0x1a, + 0xb7, 0xbc, 0xe2, 0xf5, 0x45, 0x3d, 0xf0, 0x66, 0xe6, 0xbb, 0xd2, 0x56, 0x54, + 0x3d, 0x72, 0xff, 0x64, 0x3d, 0x68, 0xbf, 0x41, 0x3d, 0x8c, 0xa8, 0x39, 0xbd, + 0x4b, 0x80, 0x88, 0x3d, 0x40, 0x05, 0x8f, 0x3c, 0x9a, 0x58, 0x6b, 0xbd, 0xb6, + 0xc7, 0x58, 0xbd, 0x66, 0x73, 0x12, 0x3d, 0x9c, 0x2b, 0x50, 0xbd, 0xc8, 0x47, + 0x7d, 0xbc, 0xb7, 0x6a, 0x04, 0xbd, 0xe6, 0x6a, 0x23, 0x3d, 0xdb, 0x11, 0x1f, + 0xbd, 0x60, 0x1d, 0x5e, 0xbc, 0x80, 0x70, 0x72, 0xbd, 0x08, 0xed, 0x51, 0x3c, + 0xb8, 0x35, 0x0c, 0xbc, 0x2e, 0xef, 0x47, 0x3d, 0xd0, 0xfb, 0xdf, 0x3b, 0xee, + 0xea, 0x5c, 0x3d, 0x52, 0xa6, 0x7f, 0x3d, 0x1c, 0xd4, 0x92, 0x3c, 0x0c, 0xe1, + 0xe3, 0x3c, 0x0b, 0x0e, 0x8b, 0x3d, 0x1e, 0x6f, 0x20, 0x3d, 0xee, 0xf3, 0x45, + 0xbd, 0x28, 0xef, 0xfc, 0x3c, 0x48, 0x19, 0x8c, 0xbd, 0x02, 0x87, 0x7f, 0xbd, + 0x6c, 0xc1, 0x4b, 0x3d, 0x30, 0x88, 0x72, 0xbc, 0x00, 0xb2, 0xce, 0x39, 0x68, + 0x2f, 0xf1, 0xbc, 0x00, 0xa0, 0x3b, 0xb8, 0x0c, 0x90, 0x7b, 0xbd, 0xd0, 0x97, + 0x45, 0xbd, 0xf6, 0xf5, 0x5d, 0x3d, 0x50, 0x0b, 0x0e, 0x3c, 0x48, 0x51, 0xf9, + 0x3c, 0xb7, 0xe4, 0x4d, 0xbd, 0xca, 0x8d, 0xcf, 0xbc, 0x49, 0x0d, 0x88, 0xbd, + 0xb1, 0x3c, 0x8f, 0x3d, 0xef, 0x72, 0x8a, 0x3d, 0x90, 0x23, 0x02, 0x3d, 0xe8, + 0x60, 0x05, 0x3c, 0xc0, 0x9f, 0xb6, 0xba, 0xd5, 0x57, 0x03, 0xbd, 0x22, 0xae, + 0x66, 0x3d, 0x61, 0x03, 0x8b, 0xbd, 0xcc, 0x23, 0xea, 0xbc, 0x80, 0x58, 0x4f, + 0x3c, 0x60, 0xea, 0xd0, 0x3b, 0xae, 0x19, 0x2e, 0xbd, 0x5e, 0xee, 0xb5, 0xbc, + 0x50, 0x19, 0x18, 0x3c, 0x6d, 0xd7, 0x78, 0xbd, 0x40, 0xcb, 0xe9, 0xbc, 0xea, + 0x76, 0x53, 0xbd, 0x2c, 0x0e, 0x6b, 0xbc, 0xd8, 
0xd6, 0x6a, 0x3c, 0xe0, 0x3d, + 0x80, 0xbd, 0x80, 0x36, 0xf1, 0xba, 0x30, 0x30, 0x51, 0x3c, 0x40, 0x41, 0xa3, + 0xba, 0xc8, 0xe8, 0x80, 0xbd, 0x72, 0x33, 0x67, 0x3d, 0xdd, 0x7d, 0x0c, 0xbd, + 0x1c, 0xcf, 0xbe, 0x3c, 0x8c, 0x1d, 0x8f, 0xbd, 0x4c, 0x5a, 0x3a, 0x3d, 0xa0, + 0x35, 0xff, 0x3b, 0x50, 0xb8, 0xea, 0xbb, 0x58, 0x63, 0x26, 0xbc, 0x70, 0x33, + 0x0c, 0xbc, 0x58, 0xbb, 0x09, 0xbc, 0x1a, 0xd0, 0xf6, 0xbc, 0x02, 0xb0, 0x08, + 0x3d, 0x4c, 0x72, 0xa7, 0x3c, 0x10, 0xa0, 0xa7, 0x3b, 0x7c, 0xab, 0x3f, 0x3d, + 0x12, 0x95, 0xc6, 0xbc, 0x58, 0xe5, 0xac, 0xbc, 0x80, 0xbc, 0x56, 0x3b, 0x00, + 0xd2, 0xda, 0xbb, 0x26, 0xff, 0xaa, 0xbc, 0xf2, 0xdc, 0x71, 0x3d, 0x30, 0xaf, + 0x85, 0xbb, 0x88, 0xf9, 0x14, 0x3d, 0x50, 0x89, 0xc5, 0xbb, 0xc0, 0xd0, 0xf1, + 0x3b, 0x95, 0xf2, 0x7b, 0xbd, 0x66, 0x43, 0xfa, 0xbc, 0xa0, 0x68, 0xf3, 0xbb, + 0x60, 0xa0, 0xdc, 0x3c, 0x0e, 0x67, 0x6e, 0x3d, 0xdd, 0xec, 0x8a, 0xbd, 0xca, + 0x1e, 0x8f, 0xbd, 0x64, 0x84, 0x6c, 0xbd, 0xee, 0x7b, 0x7a, 0xbd, 0xd2, 0xdc, + 0x97, 0xbc, 0x84, 0x44, 0x77, 0xbd, 0xf8, 0xec, 0x0e, 0xbd, 0xea, 0x25, 0x03, + 0x3d, 0x8e, 0x42, 0x27, 0xbd, 0x31, 0x0b, 0x87, 0x3d, 0xba, 0x5e, 0x31, 0xbd, + 0x74, 0xee, 0xa5, 0x3c, 0xb5, 0xa1, 0x83, 0x3d, 0x48, 0x87, 0xad, 0x3c, 0x5c, + 0xc4, 0x04, 0xbd, 0xe6, 0xe7, 0x4e, 0x3d, 0x24, 0xa4, 0xb2, 0xbc, 0x02, 0x4a, + 0x8d, 0xbd, 0xfa, 0x96, 0x92, 0xbd, 0xf8, 0x1e, 0xaf, 0x3c, 0x80, 0xdb, 0xfe, + 0x3a, 0x20, 0x48, 0xff, 0xbb, 0xf2, 0xdd, 0x63, 0x3d, 0x2c, 0x12, 0xaf, 0x3c, + 0x8a, 0x05, 0xcf, 0xbc, 0xd8, 0x3a, 0x23, 0x3d, 0x2b, 0x32, 0x89, 0xbd, 0xd0, + 0xff, 0x8b, 0x3b, 0x58, 0xd1, 0x13, 0xbd, 0x00, 0xac, 0x96, 0x3a, 0x8a, 0x92, + 0x33, 0x3d, 0x1c, 0xdb, 0x2f, 0xbc, 0x8a, 0x30, 0x69, 0xbd, 0x80, 0xcc, 0x7a, + 0x3b, 0x88, 0xaa, 0x7b, 0xbd, 0x03, 0xda, 0x8e, 0xbd, 0x10, 0x40, 0xfe, 0x3b, + 0x74, 0x92, 0x0b, 0x3d, 0x54, 0x61, 0x7e, 0xbd, 0xdd, 0x2f, 0x75, 0xbd, 0xa8, + 0xcd, 0x52, 0x3c, 0x20, 0xf1, 0x57, 0x3d, 0x98, 0x18, 0x05, 0xbc, 0x86, 0x14, + 0x3a, 0x3d, 0xf0, 0xa5, 0x94, 0x3b, 0x13, 0xd7, 0x8b, 0x3d, 0xbe, 0x38, 0x1e, + 0x3d, 0xe6, 0xa2, 0x8d, 0xbc, 0xc0, 0x39, 0xdf, 0x3c, 0xf8, 0x3f, 0x8b, 0xbd, + 0xc9, 0x86, 0x8a, 0x3d, 0x51, 0xa4, 0x6d, 0xbd, 0x7b, 0xe0, 0x82, 0x3d, 0x50, + 0x6e, 0x6d, 0x3c, 0xd0, 0x15, 0x60, 0xbd, 0x46, 0xec, 0x06, 0xbd, 0x50, 0x8b, + 0x0f, 0x3d, 0x8e, 0x36, 0xab, 0xbc, 0x7f, 0x46, 0x74, 0xbd, 0x4e, 0x2b, 0x63, + 0xbd, 0x6e, 0xdf, 0x2c, 0x3d, 0xee, 0x87, 0x60, 0x3d, 0x4e, 0x24, 0x6e, 0xbd, + 0x06, 0xbf, 0x7d, 0x3d, 0x40, 0xf6, 0x25, 0x3c, 0xba, 0xea, 0x01, 0x3d, 0x29, + 0x4f, 0x8c, 0xbd, 0xf3, 0x02, 0x8b, 0xbd, 0x7c, 0x06, 0x30, 0xbd, 0xda, 0x97, + 0x1e, 0x3d, 0xad, 0x89, 0x8b, 0xbd, 0x90, 0x78, 0xd1, 0x3b, 0x2c, 0x75, 0xb5, + 0x3c, 0x41, 0x04, 0x40, 0xbd, 0x52, 0x9d, 0x08, 0x3d, 0xf4, 0x53, 0xbf, 0x3c, + 0x48, 0x82, 0x16, 0x3c, 0x3a, 0xa1, 0x72, 0x3d, 0xc8, 0x73, 0x32, 0x3d, 0x5a, + 0x20, 0x20, 0x3d, 0x08, 0xb1, 0x48, 0x3d, 0x46, 0x6e, 0x73, 0x3d, 0x59, 0x17, + 0x0f, 0xbd, 0xb8, 0xa7, 0x01, 0x3c, 0x10, 0x53, 0x46, 0x3c, 0x27, 0xc2, 0x3f, + 0xbd, 0x77, 0x6b, 0x91, 0x3d, 0xa8, 0x1c, 0xec, 0x3c, 0xfd, 0x09, 0x92, 0xbd, + 0x1c, 0x87, 0x89, 0xbd, 0x60, 0x10, 0xdc, 0xbb, 0x00, 0x40, 0xd1, 0x36, 0x48, + 0xb3, 0x28, 0x3c, 0xc8, 0xb3, 0x94, 0x3c, 0xfa, 0x6c, 0x8e, 0xbc, 0x98, 0x5b, + 0x68, 0xbc, 0x32, 0xc1, 0x3b, 0x3d, 0xb7, 0xd5, 0x81, 0x3d, 0x48, 0xb6, 0x10, + 0x3d, 0x5c, 0x95, 0x58, 0xbd, 0xf6, 0xb9, 0x00, 0xbd, 0xaa, 0xbe, 0x51, 0xbd, + 0x2e, 0xbc, 0x70, 0x3d, 0xc8, 0x89, 0x06, 0x3c, 0x00, 0x00, 0x41, 0xb9, 0x31, + 0x3e, 0x10, 0xbd, 0xf0, 0x26, 0x14, 0xbc, 0x98, 0xfc, 0xf2, 0x3c, 0xf3, 0x6d, + 
0x27, 0xbd, 0xd0, 0xdd, 0x2e, 0xbc, 0xee, 0x5b, 0x92, 0xbd, 0xc6, 0x4c, 0x24, + 0x3d, 0x3c, 0x5e, 0x01, 0x3d, 0x6a, 0xe6, 0x26, 0xbd, 0x90, 0xd6, 0x1f, 0x3c, + 0xbc, 0x88, 0xcd, 0x3c, 0xb0, 0xad, 0xee, 0x3c, 0xd4, 0xc5, 0xdf, 0x3c, 0xa6, + 0x0f, 0xe7, 0xbc, 0x51, 0x99, 0x84, 0x3d, 0xc4, 0x84, 0x6a, 0xbc, 0xa8, 0xb6, + 0x5c, 0xbc, 0x00, 0xba, 0x3a, 0x39, 0x28, 0x4f, 0x59, 0x3d, 0x80, 0x55, 0x45, + 0xba, 0x48, 0x20, 0x84, 0xbc, 0x3f, 0xfd, 0x90, 0x3d, 0x74, 0x17, 0x82, 0xbd, + 0x93, 0xd5, 0x26, 0xbd, 0xc0, 0x02, 0xbf, 0xbc, 0x42, 0xdf, 0x24, 0x3d, 0x0e, + 0xac, 0xd5, 0xbc, 0x42, 0xcc, 0x7a, 0xbd, 0xd0, 0x21, 0xf6, 0x3b, 0x88, 0x2e, + 0x63, 0xbd, 0x08, 0xdd, 0xc4, 0xbc, 0x08, 0xa7, 0x6b, 0x3c, 0x17, 0x07, 0x83, + 0xbd, 0x31, 0xfd, 0x81, 0x3d, 0x68, 0xb0, 0x3f, 0x3c, 0xec, 0x78, 0xc0, 0xbc, + 0x40, 0x91, 0x3b, 0x3c, 0x80, 0x96, 0xbf, 0x3a, 0x94, 0xed, 0xa7, 0x3c, 0xb0, + 0xf7, 0x2a, 0x3c, 0x00, 0x90, 0xc6, 0x37, 0xb4, 0x0d, 0x89, 0xbd, 0xd0, 0x28, + 0xb0, 0xbb, 0xf0, 0x65, 0x06, 0x3c, 0xcd, 0xc8, 0x8d, 0x3d, 0x66, 0xa5, 0x6f, + 0x3d, 0x36, 0x46, 0x4c, 0x3d, 0x00, 0x80, 0x67, 0x36, 0xaf, 0x78, 0x20, 0xbd, + 0xce, 0x83, 0x08, 0x3d, 0x7f, 0x32, 0x84, 0xbd, 0x23, 0x80, 0x8e, 0x3d, 0xb4, + 0xa5, 0x56, 0x3d, 0xe4, 0xc2, 0x10, 0xbd, 0xc0, 0xf4, 0xe9, 0xba, 0xa6, 0x4e, + 0x6d, 0x3d, 0x04, 0x19, 0xad, 0xbc, 0x0c, 0xf2, 0x38, 0x3d, 0xc6, 0x2c, 0x29, + 0xbd, 0xba, 0x51, 0x5c, 0x3d, 0x20, 0x92, 0xae, 0x3c, 0x68, 0x55, 0xf7, 0x3c, + 0x40, 0x10, 0x08, 0x3d, 0x86, 0x95, 0x62, 0x3d, 0x36, 0xef, 0x80, 0xbd, 0xd8, + 0x21, 0x37, 0xbd, 0x28, 0x37, 0x93, 0xbc, 0x20, 0xb5, 0x35, 0x3b, 0x2f, 0x41, + 0x86, 0xbd, 0xf0, 0xf4, 0xfd, 0xbc, 0x3e, 0xa1, 0x8a, 0xbd, 0x38, 0xf3, 0x8f, + 0xbd, 0x15, 0xd9, 0x6e, 0xbd, 0xb8, 0xd9, 0x4b, 0x3d, 0x6e, 0x7c, 0x61, 0xbd, + 0x00, 0x0e, 0x4d, 0xbb, 0xf8, 0xa5, 0x58, 0xbc, 0x20, 0x15, 0xb6, 0x3b, 0xa0, + 0x58, 0x09, 0x3b, 0xed, 0x15, 0x72, 0xbd, 0x00, 0xc6, 0x1a, 0x3a, 0x90, 0xdf, + 0x44, 0x3d, 0x70, 0xb4, 0x28, 0xbd, 0x66, 0x55, 0x7d, 0xbd, 0x94, 0x94, 0x84, + 0x3c, 0x49, 0xde, 0x32, 0xbd, 0x32, 0x47, 0x13, 0x3d, 0x2e, 0x3b, 0x4a, 0xbd, + 0x8a, 0x6d, 0x53, 0xbd, 0x88, 0x9e, 0x8b, 0xbc, 0xfe, 0x9b, 0xd0, 0xbc, 0xf0, + 0xb2, 0x16, 0x3c, 0x8c, 0x8a, 0x85, 0x3c, 0xd5, 0x73, 0x8b, 0xbd, 0xd6, 0xd6, + 0x02, 0xbd, 0x70, 0x96, 0x22, 0x3d, 0x8a, 0x4b, 0x1c, 0x3d, 0x80, 0x91, 0xeb, + 0x3a, 0x80, 0x29, 0x95, 0x3c, 0x71, 0xf1, 0x8d, 0x3d, 0x3e, 0x5e, 0x5e, 0xbd, + 0xd2, 0x53, 0x63, 0x3d, 0x0b, 0xcb, 0x8d, 0xbd, 0x58, 0x76, 0x5f, 0xbc, 0xc2, + 0xe8, 0x02, 0x3d, 0x9c, 0x96, 0x99, 0x3c, 0xbc, 0xe8, 0x96, 0x3c, 0xff, 0x05, + 0x45, 0xbd, 0x48, 0xa6, 0x02, 0x3d, 0x83, 0x34, 0x87, 0xbd, 0xe4, 0x9a, 0x47, + 0x3d, 0xd8, 0x5f, 0xc5, 0x3c, 0x0c, 0x1c, 0xee, 0xbc, 0x3e, 0x65, 0x46, 0x3d, + 0xe5, 0xd2, 0x10, 0xbd, 0x00, 0x98, 0x9a, 0xbb, 0x06, 0x89, 0x8d, 0xbc, 0xb8, + 0x08, 0xc5, 0xbc, 0x9e, 0xeb, 0xbd, 0xbc, 0x98, 0x4b, 0x78, 0xbd, 0x7d, 0x8a, + 0x7d, 0xbd, 0x00, 0x70, 0xf6, 0x39, 0xe0, 0x0c, 0xba, 0x3b, 0xa2, 0xf4, 0xdf, + 0xbc, 0xca, 0x61, 0x79, 0xbd, 0x44, 0x6f, 0xa3, 0xbc, 0x3c, 0x56, 0xe1, 0x3c, + 0x90, 0xfd, 0x3c, 0xbd, 0x71, 0x08, 0x35, 0xbd, 0xde, 0x28, 0x6b, 0xbd, 0xae, + 0xe2, 0x36, 0x3d, 0xe7, 0x04, 0x1e, 0xbd, 0x94, 0x0b, 0x1a, 0x3d, 0x3a, 0x8f, + 0x26, 0x3d, 0x40, 0xbe, 0x07, 0xbc, 0x10, 0x36, 0x8d, 0xbd, 0x40, 0x7b, 0x06, + 0x3b, 0xd8, 0x7b, 0x2c, 0x3d, 0x4f, 0x09, 0x59, 0xbd, 0x28, 0xc9, 0xeb, 0x3c, + 0x1c, 0xee, 0x7c, 0xbc, 0xf0, 0x79, 0x19, 0x3c, 0xf8, 0x06, 0x72, 0x3c, 0xe0, + 0x83, 0xb5, 0x3b, 0xc8, 0xca, 0x47, 0x3c, 0x88, 0x99, 0x0c, 0x3d, 0xe6, 0x5f, + 0xaf, 0xbc, 0x14, 0x1b, 0x4f, 
0xbc, 0x13, 0x70, 0x80, 0xbd, 0xdd, 0x13, 0x18, + 0xbd, 0x4e, 0xae, 0xe3, 0xbc, 0xaa, 0x98, 0x7d, 0x3d, 0x00, 0xf9, 0x2f, 0x3c, + 0xdd, 0xd1, 0x8c, 0x3d, 0x28, 0x5c, 0x3c, 0x3d, 0x90, 0x81, 0x38, 0x3d, 0x3a, + 0xf4, 0x5d, 0x3d, 0xc2, 0x24, 0x53, 0x3d, 0x00, 0x34, 0x42, 0xbb, 0x32, 0xc8, + 0x78, 0x3d, 0x7a, 0x94, 0xe6, 0xbc, 0x76, 0x8f, 0x80, 0xbc, 0x83, 0xca, 0x8b, + 0x3d, 0x62, 0xfb, 0x78, 0x3d, 0xe9, 0x00, 0x90, 0x3d, 0xe8, 0x9b, 0x1c, 0xbd, + 0x66, 0xd9, 0x8d, 0xbd, 0xa2, 0xe7, 0x73, 0x3d, 0xd8, 0xb6, 0xb9, 0xbc, 0xa0, + 0x55, 0x70, 0x3b, 0x08, 0x5b, 0x00, 0x3c, 0xb4, 0xd0, 0x58, 0xbd, 0xe4, 0x3b, + 0x52, 0xbd, 0xb0, 0x22, 0x3d, 0x3d, 0x4a, 0x4f, 0x81, 0xbd, 0x48, 0xf0, 0x6a, + 0x3c, 0x61, 0xf4, 0x65, 0xbd, 0x34, 0x4e, 0x00, 0x3d, 0xd1, 0x71, 0x3c, 0xbd, + 0x8e, 0x3e, 0x70, 0x3d, 0x55, 0x7a, 0x27, 0xbd, 0x68, 0x22, 0xd5, 0xbc, 0x59, + 0x71, 0x90, 0xbd, 0xc8, 0xb0, 0x60, 0x3c, 0x74, 0x5b, 0x36, 0xbd, 0xdc, 0x16, + 0xbf, 0x3c, 0x62, 0x7a, 0xe3, 0xbc, 0x00, 0x21, 0x8e, 0xba, 0x1e, 0x0d, 0x08, + 0xbd, 0xa3, 0x7a, 0x07, 0xbd, 0xb4, 0x92, 0xee, 0x3c, 0x8d, 0xd2, 0x81, 0x3d, + 0x40, 0xc6, 0x98, 0x3c, 0x78, 0xc1, 0x69, 0x3c, 0x36, 0x9a, 0x72, 0x3d, 0xd2, + 0xfa, 0xe3, 0xbc, 0x42, 0x4c, 0x0e, 0x3d, 0x97, 0x2c, 0x88, 0x3d, 0x78, 0x6f, + 0x13, 0xbc, 0x40, 0x90, 0x7a, 0x3b, 0x66, 0x40, 0x95, 0xbc, 0xb8, 0xe6, 0x33, + 0x3d, 0x64, 0x0c, 0xf1, 0x3c, 0xb3, 0xc0, 0x1f, 0xbd, 0x67, 0x03, 0x03, 0xbd, + 0xe4, 0x7c, 0xfb, 0x3c, 0x7e, 0x22, 0x0e, 0xbd, 0xd6, 0x60, 0x8d, 0xbd, 0xcc, + 0xa2, 0x2c, 0xbd, 0x00, 0xa4, 0xd6, 0x39, 0xf8, 0x7d, 0x8d, 0xbd, 0xe4, 0x27, + 0x9a, 0xbc, 0xd8, 0x19, 0x61, 0xbd, 0xb8, 0x49, 0x54, 0xbd, 0x70, 0xcb, 0xd3, + 0x3b, 0x49, 0xe1, 0x89, 0x3d, 0x06, 0x6c, 0x78, 0x3d, 0xc0, 0xbe, 0x82, 0x3c, + 0x4d, 0x99, 0x8f, 0x3d, 0xd8, 0x0d, 0xe6, 0x3c, 0x4e, 0x2d, 0x60, 0x3d, 0x1c, + 0xab, 0x99, 0x3c, 0x66, 0xc6, 0xcc, 0xbc, 0x28, 0x76, 0x0b, 0xbc, 0x7b, 0x6e, + 0x90, 0x3d, 0x3b, 0x2f, 0x1c, 0xbd, 0x60, 0x1e, 0x83, 0x3b, 0xc8, 0x88, 0xfd, + 0x3c, 0x00, 0x48, 0xa8, 0x3c, 0x40, 0x3d, 0xd4, 0x3b, 0xa4, 0x83, 0xfc, 0x3c, + 0x3c, 0xe7, 0xd8, 0x3c, 0xfe, 0xaa, 0x6f, 0x3d, 0xbb, 0x22, 0x90, 0xbd, 0xd6, + 0xf5, 0x29, 0x3d, 0x8e, 0x7e, 0x65, 0x3d, 0xae, 0x3b, 0xe4, 0xbc, 0xea, 0x04, + 0x54, 0x3d, 0x64, 0x22, 0x1f, 0x3d, 0x24, 0x95, 0x90, 0x3c, 0xcd, 0x7b, 0x21, + 0xbd, 0xd0, 0xf8, 0xb9, 0x3b, 0x26, 0xf8, 0x28, 0xbd, 0x6a, 0x37, 0x5b, 0x3d, + 0x6e, 0x7e, 0x70, 0x3d, 0xa0, 0x90, 0xec, 0x3c, 0x00, 0x8e, 0x0d, 0xbb, 0xe0, + 0xbe, 0x5b, 0xbb, 0x58, 0xf6, 0x9c, 0x3c, 0xbe, 0x59, 0xc0, 0xbc, 0x64, 0x78, + 0xa4, 0x3c, 0x79, 0xfb, 0x86, 0x3d, 0x60, 0x6c, 0x85, 0xbc, 0xba, 0x44, 0x18, + 0xbd, 0x5e, 0xea, 0x6a, 0xbd, 0x6c, 0xf4, 0x36, 0xbd, 0xee, 0xd4, 0x4c, 0xbd, + 0xa2, 0x17, 0x16, 0x3d, 0x98, 0x59, 0xb9, 0x3c, 0x90, 0x41, 0x3d, 0x3c, 0x66, + 0x14, 0x06, 0x3d, 0x40, 0xa2, 0x17, 0xbb, 0xdd, 0x83, 0x75, 0xbd, 0x2c, 0x19, + 0x8f, 0x3c, 0xfe, 0xde, 0x49, 0xbd, 0x57, 0x3d, 0x85, 0x3d, 0x1c, 0xb3, 0xef, + 0xbc, 0x58, 0xdb, 0x3f, 0xbd, 0x0e, 0x38, 0x20, 0x3d, 0x80, 0xbf, 0xa7, 0x3a, + 0xf0, 0xe2, 0x91, 0xbd, 0xcc, 0x0f, 0x0a, 0x3d, 0xc7, 0xad, 0x4d, 0xbd, 0x64, + 0x33, 0x69, 0xbd, 0xc0, 0xc0, 0xd7, 0xbb, 0xb0, 0x16, 0x83, 0xbd, 0xd0, 0xbf, + 0x3c, 0x3d, 0x11, 0x62, 0x87, 0x3d, 0x68, 0x04, 0x0f, 0x3d, 0x6e, 0xee, 0x2a, + 0x3d, 0xb8, 0x70, 0x37, 0xbc, 0x62, 0x76, 0x7e, 0x3d, 0x84, 0xbc, 0xa0, 0x3c, + 0xc0, 0xc9, 0x26, 0xbd, 0x82, 0x1a, 0x85, 0xbd, 0x80, 0x55, 0x8e, 0xbd, 0xe4, + 0xdb, 0x48, 0x3d, 0x60, 0xa5, 0xd6, 0x3b, 0x39, 0x18, 0x92, 0x3d, 0x36, 0x5a, + 0x6c, 0xbd, 0xe8, 0x77, 0xcb, 0x3c, 0x48, 0x9e, 0x12, 0x3d, 
0x3b, 0x40, 0x91, + 0xbd, 0x00, 0xe0, 0xf6, 0x38, 0xd6, 0xa0, 0x2f, 0xbd, 0xe0, 0xe2, 0x0f, 0xbc, + 0xf4, 0x85, 0x50, 0x3d, 0x64, 0xf7, 0x9b, 0x3c, 0xdc, 0x72, 0x53, 0x3d, 0x28, + 0x0b, 0x45, 0xbc, 0x4e, 0xb5, 0x3f, 0xbd, 0x34, 0x7a, 0xea, 0x3c, 0x58, 0xe1, + 0x71, 0x3c, 0x60, 0x5b, 0xf8, 0xbc, 0xf8, 0x3d, 0x52, 0x3c, 0xd0, 0xdc, 0x67, + 0xbd, 0xee, 0x2d, 0x0c, 0x3d, 0x70, 0x47, 0xb0, 0x3c, 0x70, 0x7c, 0x29, 0x3d, + 0xf4, 0x97, 0xc9, 0x3c, 0x74, 0x63, 0x32, 0x3d, 0x6c, 0x17, 0x94, 0x3c, 0x87, + 0xdc, 0x7a, 0xbd, 0xb6, 0xf5, 0x7c, 0x3d, 0x62, 0xd2, 0xe7, 0xbc, 0x99, 0xa5, + 0x50, 0xbd, 0x4c, 0xa2, 0xb1, 0xbc, 0xf0, 0x38, 0xdd, 0xbb, 0xac, 0x44, 0x3f, + 0xbd, 0x34, 0xb7, 0x06, 0x3d, 0xf6, 0x65, 0x25, 0x3d, 0xdb, 0x01, 0x1e, 0xbd, + 0x68, 0xee, 0x19, 0xbc, 0x4c, 0xdd, 0x8a, 0x3c, 0xe0, 0xe4, 0x14, 0xbc, 0x9e, + 0x6f, 0x21, 0x3d, 0x18, 0xd1, 0x59, 0x3d, 0x0c, 0xdd, 0xe1, 0xbc, 0x84, 0xa1, + 0xe6, 0x3c, 0x5c, 0x56, 0xfa, 0x3c, 0xc4, 0x30, 0x8d, 0x3c, 0x9c, 0xba, 0x12, + 0xbd, 0xe0, 0x85, 0xbf, 0xbc, 0x00, 0x1d, 0x62, 0xbb, 0xe4, 0x7a, 0x13, 0x3d, + 0x36, 0x6c, 0x07, 0x3d, 0x88, 0xb1, 0x2a, 0x3c, 0x06, 0xba, 0x16, 0xbd, 0x24, + 0x12, 0xaf, 0x3c, 0x7c, 0x97, 0x3b, 0xbc, 0xe4, 0x3d, 0x2e, 0xbd, 0x8c, 0x86, + 0xa9, 0xbc, 0x6c, 0x70, 0x06, 0x3d, 0x0b, 0x2c, 0x76, 0xbd, 0x72, 0x24, 0xe8, + 0xbc, 0x22, 0xeb, 0x70, 0x3d, 0xf0, 0xfb, 0x7b, 0x3c, 0x62, 0x51, 0x08, 0xbd, + 0x52, 0x97, 0x88, 0xbd, 0x58, 0x8d, 0x76, 0x3c, 0x3c, 0x79, 0xf1, 0x3c, 0x6c, + 0x9b, 0xbd, 0xbc, 0xa4, 0xf4, 0xe9, 0x3c, 0x80, 0x4d, 0x22, 0x3a, 0x78, 0x12, + 0x81, 0x3c, 0x9a, 0xc5, 0x4a, 0x3d, 0xfa, 0x9b, 0x4a, 0x3d, 0x0c, 0x20, 0x7f, + 0xbd, 0x36, 0x46, 0x06, 0xbd, 0x60, 0x13, 0xbd, 0xbb, 0x8e, 0x08, 0x92, 0xbc, + 0xca, 0x25, 0x1c, 0x3d, 0xb2, 0x84, 0x3f, 0x3d, 0x98, 0x3f, 0x47, 0x3d, 0x58, + 0x18, 0x4b, 0x3d, 0x60, 0x91, 0x63, 0xbb, 0xa2, 0x5c, 0xea, 0xbc, 0xc4, 0x8e, + 0x86, 0x3c, 0x5c, 0x76, 0x91, 0xbd, 0x10, 0xa2, 0x1d, 0xbc, 0xe0, 0xcb, 0xb5, + 0xbb, 0x50, 0xd2, 0xe2, 0x3c, 0x98, 0xbd, 0x88, 0xbd, 0x00, 0xd8, 0x0f, 0x39, + 0x72, 0x33, 0x20, 0x3d, 0x00, 0x13, 0xbd, 0x39, 0xae, 0xc3, 0xd1, 0xbc, 0xec, + 0x7e, 0xb8, 0xbc, 0x78, 0xb4, 0x90, 0xbc, 0xc2, 0x01, 0x68, 0x3d, 0x40, 0x0a, + 0x4f, 0xbb, 0xb7, 0xe6, 0x87, 0x3d, 0x35, 0xe8, 0x85, 0x3d, 0x94, 0x2a, 0xe6, + 0x3c, 0xd8, 0x5c, 0x69, 0x3c, 0x20, 0x8e, 0xc2, 0xbb, 0x4c, 0xa2, 0x92, 0x3c, + 0xd6, 0xc7, 0x73, 0x3d, 0xf8, 0x0c, 0xb8, 0x3c, 0x40, 0x90, 0xb9, 0x3a, 0x2e, + 0x2b, 0x31, 0x3d, 0x18, 0xf5, 0x8a, 0x3c, 0x91, 0x95, 0x5b, 0xbd, 0xc0, 0xfa, + 0xc8, 0x3a, 0x72, 0xf1, 0xa9, 0xbc, 0x36, 0x77, 0x48, 0xbd, 0x73, 0x0d, 0x6c, + 0xbd, 0x70, 0x22, 0xe4, 0xbb, 0x88, 0x5c, 0x28, 0x3d, 0xc6, 0x18, 0x3e, 0x3d, + 0x94, 0x3c, 0xd1, 0xbc, 0x7f, 0x43, 0x15, 0xbd, 0xee, 0x0d, 0x9e, 0xbc, 0x62, + 0xff, 0x29, 0x3d, 0xf0, 0x56, 0xf2, 0x3b, 0x22, 0x3f, 0x4e, 0x3d, 0xb6, 0x94, + 0x39, 0xbd, 0x9e, 0xf1, 0x45, 0xbd, 0x87, 0xdb, 0x85, 0x3d, 0xd8, 0x35, 0x65, + 0x3c, 0xcc, 0x13, 0x8a, 0x3c, 0x44, 0x89, 0x64, 0xbc, 0xe6, 0xb5, 0x2a, 0xbd, + 0x28, 0x4f, 0x69, 0x3c, 0x36, 0x45, 0x53, 0x3d, 0x3a, 0xd2, 0xfe, 0xbc, 0xce, + 0xa8, 0xa2, 0xbc, 0x8a, 0x16, 0x7d, 0xbd, 0xc2, 0xd5, 0xd9, 0xbc, 0xa0, 0x4a, + 0x87, 0xbd, 0x9e, 0xc2, 0x2c, 0x3d, 0xfc, 0x3a, 0xaf, 0x3c, 0x9e, 0x10, 0x40, + 0xbd, 0xe0, 0x3a, 0x82, 0x3b, 0x0c, 0xe4, 0xfc, 0x3c, 0xd8, 0x07, 0x57, 0xbd, + 0xba, 0x34, 0x91, 0xbd, 0xc6, 0x42, 0x51, 0x3d, 0xc0, 0xe9, 0xe1, 0x3b, 0x9c, + 0x4a, 0x2a, 0xbc, 0xc6, 0x92, 0x7b, 0x3d, 0x12, 0x9f, 0x59, 0xbd, 0x0c, 0x62, + 0xfd, 0xbc, 0x6c, 0x1a, 0xe6, 0x3c, 0x72, 0x2c, 0x4b, 0x3d, 0x7a, 0xa5, 0x3b, + 0xbd, 0xfa, 
0x37, 0x7b, 0x3d, 0xc0, 0xf0, 0x87, 0xbc, 0x28, 0xd1, 0x5a, 0x3c, + 0xd7, 0x35, 0x6b, 0xbd, 0x7e, 0x9c, 0x6f, 0x3d, 0x1a, 0xf6, 0x23, 0xbd, 0x66, + 0x3b, 0xa2, 0xbc, 0x00, 0xb5, 0x5d, 0xba, 0xbb, 0xc3, 0x52, 0xbd, 0x24, 0x0d, + 0x14, 0x3d, 0x6f, 0x6f, 0x7d, 0xbd, 0x74, 0x88, 0x90, 0xbd, 0xda, 0x8a, 0x68, + 0xbd, 0xb4, 0xe0, 0x5f, 0xbc, 0xb8, 0x32, 0x88, 0xbd, 0x13, 0xc0, 0x81, 0x3d, + 0x2c, 0x07, 0x2e, 0xbd, 0xd0, 0x8a, 0x8a, 0x3b, 0xe2, 0x9e, 0x8a, 0xbd, 0x60, + 0x09, 0x8a, 0x3b, 0xd5, 0x6b, 0x92, 0xbd, 0x90, 0x61, 0x50, 0x3d, 0x62, 0x32, + 0x0f, 0xbd, 0x9b, 0x7c, 0x6f, 0xbd, 0x10, 0x7c, 0xa3, 0x3c, 0x80, 0x22, 0xcc, + 0xbb, 0x20, 0xc6, 0x3a, 0x3d, 0x40, 0xcb, 0x3f, 0x3b, 0xca, 0xa4, 0xdd, 0xbc, + 0xc0, 0x36, 0xbf, 0x3c, 0x40, 0x4f, 0x85, 0x3b, 0x13, 0x52, 0x6c, 0xbd, 0x6b, + 0xa9, 0x6f, 0xbd, 0x58, 0x41, 0x5d, 0xbc, 0xa8, 0x0e, 0x82, 0x3c, 0x7c, 0x92, + 0xf5, 0x3c, 0xfa, 0xd8, 0x5a, 0xbd, 0xcc, 0x79, 0x54, 0x3d, 0xc4, 0x8f, 0x2a, + 0xbc, 0x78, 0xec, 0xdb, 0x3c, 0xf0, 0x95, 0xa9, 0x3b, 0x78, 0x9d, 0xf6, 0xbc, + 0x53, 0x59, 0x55, 0xbd, 0x08, 0x4e, 0xca, 0x3c, 0xcc, 0x95, 0xbb, 0x3c, 0xe4, + 0x91, 0xb4, 0xbc, 0xfb, 0x9d, 0x86, 0xbd, 0x08, 0x68, 0x3f, 0xbc, 0x5d, 0x1b, + 0x84, 0xbd, 0xd0, 0xc8, 0x83, 0x3b, 0x4a, 0x39, 0x54, 0x3d, 0x3c, 0x6e, 0xb6, + 0xbc, 0x70, 0xdd, 0x1b, 0x3c, 0xf4, 0xfc, 0x21, 0xbd, 0x68, 0x25, 0x5e, 0x3c, + 0x01, 0xfc, 0x8e, 0xbd, 0x60, 0xe5, 0x2a, 0x3b, 0x98, 0x51, 0x23, 0xbc, 0x00, + 0xef, 0x0a, 0xba, 0xfc, 0x95, 0x1f, 0xbc, 0xf4, 0x89, 0x55, 0x3d, 0x76, 0x2e, + 0x29, 0x3d, 0xdb, 0x02, 0x86, 0x3d, 0x64, 0xaa, 0x31, 0xbc, 0x7c, 0x3a, 0x9c, + 0xbc, 0x00, 0xf2, 0x64, 0xbd, 0x86, 0xf3, 0x51, 0xbd, 0xc0, 0x2f, 0x9a, 0x3a, + 0xf2, 0xf2, 0xd3, 0xbc, 0x1e, 0x43, 0xcb, 0xbc, 0x6d, 0x44, 0x92, 0x3d, 0x40, + 0xc6, 0x90, 0xba, 0xaa, 0xc9, 0x3e, 0xbd, 0x02, 0xc1, 0x5b, 0x3d, 0x66, 0xeb, + 0x1e, 0x3d, 0xf2, 0x34, 0x63, 0xbd, 0xea, 0xba, 0x66, 0x3d, 0xee, 0x8c, 0x1a, + 0x3d, 0x3b, 0xb9, 0x1e, 0xbd, 0x0a, 0xd2, 0x13, 0x3d, 0xa0, 0xaf, 0x3e, 0x3c, + 0xc0, 0x24, 0x83, 0x3c, 0x90, 0x69, 0xf0, 0xbb, 0x1f, 0x73, 0x86, 0x3d, 0x9d, + 0x21, 0x77, 0xbd, 0x45, 0x4f, 0x8c, 0x3d, 0x40, 0x6d, 0xfe, 0x3c, 0xcb, 0xa5, + 0x8d, 0xbd, 0x00, 0x8d, 0xe5, 0x39, 0x56, 0x9b, 0x55, 0x3d, 0x26, 0x49, 0x5a, + 0xbd, 0x66, 0x93, 0x7a, 0x3d, 0x80, 0x29, 0x4f, 0xba, 0xff, 0xff, 0x82, 0xbd, + 0x50, 0xf9, 0x65, 0x3c, 0x28, 0xa6, 0xb5, 0xbc, 0xdf, 0x70, 0x54, 0xbd, 0x17, + 0xd1, 0x8e, 0xbd, 0x00, 0x3a, 0xb9, 0x3b, 0x26, 0x45, 0x86, 0xbc, 0xad, 0x85, + 0x33, 0xbd, 0x94, 0x78, 0x32, 0x3d, 0x70, 0xcb, 0xa1, 0x3b, 0x40, 0xe5, 0x21, + 0x3d, 0x32, 0xd5, 0xc2, 0xbc, 0xf8, 0x3d, 0x27, 0x3d, 0x28, 0xc0, 0x39, 0xbc, + 0xac, 0xc8, 0x7a, 0xbc, 0xe6, 0xc2, 0xd4, 0xbc, 0x91, 0x81, 0x5c, 0xbd, 0xe1, + 0x6a, 0x90, 0xbd, 0xa9, 0xc8, 0x1d, 0xbd, 0x00, 0x94, 0xcb, 0xb9, 0xe0, 0x0d, + 0x31, 0x3c, 0x00, 0x2a, 0xbe, 0xbb, 0x9a, 0x1e, 0x2a, 0xbd, 0x06, 0xef, 0x7f, + 0x3d, 0xc0, 0xcc, 0x0d, 0x3c, 0xd6, 0x50, 0x74, 0xbd, 0x10, 0x24, 0xcd, 0x3b, + 0x22, 0x4f, 0x0c, 0xbd, 0xc8, 0xf2, 0xaa, 0x3c, 0x9e, 0x84, 0xc8, 0xbc, 0x80, + 0xf2, 0x4e, 0x3c, 0x0c, 0x38, 0x77, 0xbd, 0x6c, 0xab, 0x63, 0xbd, 0xb7, 0x31, + 0x11, 0xbd, 0x25, 0x39, 0x84, 0x3d, 0x31, 0x0b, 0x91, 0x3d, 0xe3, 0x1d, 0x08, + 0xbd, 0x92, 0xb6, 0x1b, 0xbd, 0x65, 0xca, 0x88, 0x3d, 0x1c, 0x62, 0x2c, 0xbd, + 0xda, 0x7b, 0x73, 0x3d, 0xff, 0xbb, 0x85, 0xbd, 0xc4, 0xc7, 0x51, 0x3d, 0x98, + 0xd2, 0x6f, 0xbd, 0x70, 0xa4, 0xe9, 0x3c, 0x74, 0x65, 0xd7, 0x3c, 0x18, 0xdd, + 0x5e, 0x3c, 0x78, 0x1d, 0x04, 0x3d, 0x2c, 0xef, 0x43, 0xbd, 0x48, 0x7d, 0x5e, + 0xbd, 0xd6, 0x02, 0x9f, 0xbc, 0x80, 0x29, 
0xa1, 0x3c, 0x70, 0x64, 0x54, 0x3d, + 0x3e, 0xe0, 0x50, 0x3d, 0xd3, 0x7d, 0x2e, 0xbd, 0x64, 0xdf, 0x55, 0xbd, 0x72, + 0x47, 0x8c, 0xbd, 0xfb, 0x45, 0x12, 0xbd, 0xd6, 0x49, 0x9d, 0xbc, 0xca, 0xd5, + 0x67, 0x3d, 0x50, 0xb9, 0xf4, 0x3c, 0x93, 0xca, 0x1f, 0xbd, 0xa7, 0xe1, 0x8f, + 0xbd, 0xcc, 0x00, 0x52, 0x3d, 0x07, 0xd3, 0x20, 0xbd, 0xd0, 0x26, 0x82, 0xbc, + 0x2a, 0x6e, 0x69, 0x3d, 0x0c, 0x67, 0x70, 0xbd, 0xaa, 0x35, 0xe9, 0xbc, 0xae, + 0x97, 0xba, 0xbc, 0xea, 0x69, 0x3d, 0xbd, 0x28, 0xa0, 0x6f, 0xbc, 0x2a, 0x6a, + 0x67, 0x3d, 0x50, 0xd0, 0x6e, 0x3c, 0x16, 0x90, 0x06, 0x3d, 0x4a, 0xdf, 0x3f, + 0x3d, 0xa0, 0x4e, 0x07, 0x3d, 0x48, 0x0d, 0x55, 0xbd, 0x50, 0x0b, 0xc6, 0xbc, + 0xc4, 0xf3, 0x47, 0xbd, 0x90, 0x09, 0xb3, 0xbb, 0x20, 0xe9, 0x7f, 0xbd, 0xbf, + 0x2e, 0x86, 0xbd, 0xba, 0xcf, 0x74, 0x3d, 0x86, 0xd8, 0xf6, 0xbc, 0x20, 0x65, + 0x57, 0x3d, 0x82, 0xc5, 0x50, 0xbd, 0xac, 0x70, 0x41, 0x3d, 0x0e, 0xb0, 0x40, + 0xbd, 0x4c, 0x30, 0x39, 0xbd, 0x80, 0xa0, 0xe5, 0x3c, 0x20, 0xc2, 0x86, 0xbb, + 0xb8, 0x3d, 0x8c, 0x3c, 0xdf, 0x7e, 0x5f, 0xbd, 0xe0, 0xfd, 0x37, 0x3b, 0x0b, + 0x70, 0x15, 0xbd, 0x00, 0xc1, 0x97, 0xba, 0x9a, 0x38, 0x56, 0xbd, 0x32, 0x67, + 0xdb, 0xbc, 0x4a, 0x22, 0x38, 0x3d, 0x12, 0x1c, 0x7f, 0x3d, 0x88, 0x38, 0xee, + 0x3c, 0x0a, 0x76, 0x61, 0x3d, 0x6d, 0xd7, 0x0a, 0xbd, 0xba, 0xb0, 0x3c, 0x3d, + 0x28, 0xbe, 0x91, 0xbc, 0xa8, 0x3e, 0x0b, 0x3c, 0x54, 0x53, 0xb7, 0x3c, 0x50, + 0x41, 0x57, 0x3c, 0xb4, 0x5d, 0x9b, 0x3c, 0x04, 0xb9, 0x18, 0xbd, 0xa8, 0xd5, + 0x9c, 0xbc, 0x7c, 0x5f, 0x15, 0xbd, 0x64, 0xf3, 0x0d, 0x3d, 0x17, 0x85, 0x90, + 0x3d, 0x5d, 0xf4, 0x51, 0xbd, 0x97, 0x93, 0x30, 0xbd, 0x40, 0x65, 0xe6, 0xbb, + 0x20, 0xa7, 0xc3, 0x3c, 0x10, 0xb1, 0x90, 0x3c, 0xc8, 0x2f, 0x36, 0x3c, 0x6b, + 0x38, 0x8e, 0xbd, 0xd6, 0x6c, 0x62, 0x3d, 0x94, 0x52, 0x4b, 0xbd, 0x48, 0xe5, + 0x15, 0x3d, 0x48, 0x7a, 0x3f, 0x3d, 0x60, 0xb0, 0xdf, 0xbb, 0xc2, 0x53, 0x05, + 0xbd, 0xc0, 0xaa, 0x94, 0x3a, 0xf2, 0xef, 0x68, 0xbd, 0xb0, 0x4d, 0x46, 0xbc, + 0xa0, 0xdc, 0x0e, 0x3b, 0x9c, 0x99, 0x5d, 0xbd, 0xd0, 0x37, 0x63, 0xbd, 0x61, + 0x02, 0x03, 0xbd, 0x80, 0x26, 0x51, 0x3a, 0xa0, 0xab, 0xb5, 0xbb, 0x65, 0x1e, + 0x8d, 0x3d, 0xa0, 0x46, 0xc6, 0x3c, 0x00, 0x48, 0xa3, 0x3c, 0x4d, 0xdf, 0x84, + 0x3d, 0x1c, 0xf1, 0x34, 0xbd, 0x1a, 0xb0, 0x00, 0x3d, 0x86, 0x6e, 0x5a, 0x3d, + 0x02, 0xfe, 0x8b, 0xbd, 0x0e, 0x96, 0x32, 0x3d, 0xe6, 0x1e, 0x91, 0xbc, 0x8a, + 0xe9, 0x6b, 0xbd, 0x4c, 0x53, 0x38, 0x3d, 0x39, 0xf5, 0x90, 0xbd, 0x66, 0x81, + 0x7e, 0x3d, 0xec, 0x33, 0xaa, 0xbc, 0x3e, 0xc4, 0x5c, 0x3d, 0xd8, 0x19, 0x87, + 0xbc, 0x70, 0xd6, 0x52, 0x3d, 0x00, 0x6a, 0xab, 0x3a, 0xda, 0x41, 0x81, 0xbc, + 0xf0, 0xbd, 0xe3, 0x3c, 0x38, 0x66, 0x1e, 0x3c, 0x62, 0x7d, 0x8e, 0xbd, 0xa5, + 0x2a, 0x15, 0xbd, 0xf6, 0x6a, 0x72, 0x3d, 0x72, 0x22, 0x33, 0x3d, 0x8c, 0xb7, + 0x8e, 0xbd, 0xe2, 0xf8, 0x6a, 0xbd, 0x01, 0x40, 0x35, 0xbd, 0xb3, 0xe4, 0x79, + 0xbd, 0xdc, 0xb4, 0x65, 0xbc, 0x3d, 0x74, 0x91, 0x3d, 0x94, 0x0a, 0xe8, 0x3c, + 0x16, 0x25, 0x57, 0xbd, 0xd6, 0x05, 0x0b, 0x3d, 0x16, 0x2b, 0x5f, 0x3d, 0x38, + 0x59, 0xcd, 0xbc, 0x8c, 0x9f, 0x0e, 0x3d, 0xac, 0x67, 0x9c, 0x3c, 0x00, 0xe1, + 0xb3, 0x39, 0x1c, 0x2e, 0xf8, 0x3c, 0xed, 0xfd, 0x80, 0x3d, 0xc6, 0x8b, 0x2b, + 0xbd, 0x08, 0x4d, 0xe0, 0x3c, 0xff, 0x55, 0x85, 0x3d, 0x3c, 0xd0, 0xe9, 0x3c, + 0x30, 0x7c, 0x79, 0x3c, 0xd0, 0xf7, 0x8c, 0x3b, 0x82, 0xe9, 0x7d, 0xbd, 0x54, + 0x3f, 0x46, 0x3d, 0xb8, 0x88, 0xc0, 0x3c, 0xc8, 0xf4, 0x35, 0xbc, 0xe9, 0x19, + 0x85, 0x3d, 0x01, 0x5f, 0x62, 0xbd, 0xea, 0x7f, 0x0f, 0x3d, 0xf8, 0x73, 0x42, + 0xbd, 0x41, 0x97, 0x8f, 0x3d, 0x13, 0xec, 0x80, 0x3d, 0xe7, 0xa8, 0x40, 
0xbd, + 0x08, 0x47, 0x4b, 0x3c, 0x80, 0xce, 0x77, 0xbc, 0xb6, 0x2d, 0x4f, 0xbd, 0xe0, + 0xa7, 0x0b, 0x3b, 0xda, 0xb6, 0x76, 0x3d, 0xc8, 0xce, 0x14, 0x3c, 0xe0, 0xbf, + 0x20, 0xbb, 0x10, 0xa1, 0x94, 0x3b, 0x02, 0x4e, 0x3f, 0x3d, 0xa0, 0xe9, 0x0c, + 0xbc, 0x6a, 0x57, 0x2b, 0xbd, 0x22, 0x09, 0x1d, 0xbd, 0xa8, 0xa6, 0x4c, 0x3c, + 0x21, 0x7d, 0x40, 0xbd, 0x91, 0xdf, 0x87, 0x3d, 0x65, 0xe4, 0x05, 0xbd, 0xdc, + 0xd6, 0x84, 0xbd, 0x22, 0x49, 0x79, 0x3d, 0xf4, 0xf7, 0x40, 0xbc, 0x2c, 0x16, + 0x86, 0xbc, 0xa8, 0x26, 0x40, 0x3d, 0xaa, 0x89, 0xa9, 0xbc, 0xc4, 0x74, 0xc5, + 0xbc, 0x3c, 0x76, 0x83, 0xbc, 0x2b, 0xf7, 0x90, 0x3d, 0xa8, 0x0c, 0x6f, 0xbc, + 0xdc, 0x96, 0x2c, 0x3d, 0xe0, 0x71, 0x88, 0x3c, 0x66, 0x9f, 0x2a, 0xbd, 0xf1, + 0x10, 0x82, 0x3d, 0x41, 0x73, 0x41, 0xbd, 0x7e, 0x2c, 0x21, 0xbd, 0xf0, 0xea, + 0x08, 0x3c, 0x54, 0xb4, 0x2a, 0xbc, 0xf6, 0xf5, 0x64, 0xbd, 0x46, 0xf9, 0x2a, + 0xbd, 0x54, 0xa4, 0x29, 0x3d, 0x1e, 0x79, 0xee, 0xbc, 0xf5, 0x8b, 0x83, 0x3d, + 0x30, 0x04, 0x10, 0x3d, 0x14, 0x83, 0x4e, 0x3d, 0x67, 0x9f, 0x62, 0xbd, 0x00, + 0x01, 0x10, 0xbd, 0x96, 0xc8, 0x2c, 0x3d, 0x3f, 0x58, 0x8e, 0x3d, 0x34, 0xeb, + 0xe1, 0x3c, 0x12, 0x5d, 0x87, 0xbc, 0x0b, 0x23, 0x80, 0x3d, 0x0a, 0x55, 0x81, + 0xbd, 0xc2, 0x80, 0x16, 0xbd, 0x58, 0xa6, 0x7a, 0x3c, 0xec, 0x9a, 0xf1, 0x3c, + 0xf0, 0x0e, 0xaa, 0x3c, 0xe2, 0x06, 0x9a, 0xbc, 0x20, 0x57, 0xec, 0xbb, 0xe8, + 0x5b, 0xc6, 0x3c, 0x40, 0x51, 0x3b, 0x3c, 0x47, 0xf6, 0x8e, 0x3d, 0x6e, 0xc5, + 0x06, 0xbd, 0xac, 0xf6, 0x2b, 0x3d, 0xec, 0x29, 0x05, 0x3d, 0x76, 0xd9, 0x2e, + 0x3d, 0x7c, 0x02, 0x40, 0xbc, 0x5e, 0x98, 0x8b, 0xbc, 0x20, 0xf8, 0x8b, 0x3c, + 0xcc, 0x04, 0x59, 0xbc, 0xd7, 0xfe, 0x8a, 0x3d, 0xda, 0xed, 0x1a, 0xbd, 0x82, + 0x45, 0x9b, 0xbc, 0xfc, 0xa0, 0x7b, 0xbc, 0x14, 0x19, 0x0a, 0x3d, 0x7c, 0x3a, + 0x7d, 0xbd, 0x46, 0x32, 0x91, 0xbd, 0xc0, 0xea, 0x8b, 0x3c, 0x0e, 0x44, 0x78, + 0x3d, 0x96, 0x53, 0x2a, 0x3d, 0x3a, 0xbb, 0x79, 0x3d, 0x1f, 0xe3, 0x19, 0xbd, + 0x56, 0xbb, 0x67, 0x3d, 0x44, 0x48, 0x86, 0x3c, 0x33, 0x5f, 0x8e, 0xbd, 0xc0, + 0x86, 0x8c, 0xbc, 0xb0, 0x2a, 0x8e, 0x3b, 0x20, 0xd2, 0x8f, 0xbd, 0x16, 0x08, + 0x67, 0x3d, 0x4a, 0xc7, 0x67, 0x3d, 0x50, 0x7c, 0xfd, 0xbc, 0xb0, 0xc1, 0x3f, + 0xbd, 0xc0, 0x77, 0xde, 0x3b, 0x98, 0x6b, 0x98, 0xbc, 0x10, 0x91, 0xa0, 0x3b, + 0x80, 0x9a, 0xed, 0x3c, 0xdd, 0xc9, 0x82, 0x3d, 0x2c, 0x20, 0x4d, 0x3d, 0x05, + 0xe9, 0x78, 0xbd, 0x44, 0xae, 0xcd, 0x3c, 0xd8, 0x92, 0x81, 0x3c, 0x57, 0xa3, + 0x77, 0xbd, 0xbe, 0x2e, 0x65, 0xbd, 0x74, 0xfc, 0x41, 0x3d, 0xa2, 0x99, 0x7b, + 0x3d, 0xe0, 0x55, 0x98, 0x3b, 0xe4, 0xdf, 0xa5, 0x3c, 0xcf, 0x0c, 0x16, 0xbd, + 0x68, 0x3f, 0x78, 0xbd, 0xbe, 0xe3, 0x4e, 0x3d, 0xf4, 0x7f, 0x4a, 0x3d, 0xaa, + 0x64, 0x3b, 0xbd, 0xa7, 0xe7, 0x83, 0xbd, 0xe0, 0x45, 0x60, 0x3b, 0x41, 0x1e, + 0x0c, 0xbd, 0x14, 0xa6, 0x90, 0xbd, 0x71, 0x37, 0x5f, 0xbd, 0x72, 0x90, 0xb8, + 0xbc, 0xc6, 0x6e, 0x3b, 0xbd, 0x4d, 0x5e, 0xe0, 0xbc, 0x40, 0x74, 0x5b, 0xbb, + 0xb2, 0x61, 0x06, 0x3d, 0xc8, 0xd6, 0xc1, 0x3c, 0xa9, 0x80, 0x85, 0xbd, 0x76, + 0xe9, 0x20, 0x3d, 0x1a, 0xcc, 0x80, 0x3d, 0x39, 0x17, 0xdf, 0xbc, 0xe1, 0x45, + 0x8c, 0x3c, 0x67, 0x35, 0x48, 0x3d, 0x9d, 0x17, 0x76, 0xbd, 0x38, 0xa6, 0xb2, + 0xba, 0xad, 0x55, 0xaf, 0x3c, 0xf4, 0x50, 0x5e, 0x3d, 0x02, 0x7b, 0xd9, 0xba, + 0x0a, 0x74, 0x0f, 0xbd, 0xa9, 0x69, 0x54, 0x3d, 0x3e, 0xa8, 0x6c, 0x3d, 0xcc, + 0xde, 0x27, 0xbd, 0x4f, 0x51, 0xa7, 0xbb, 0xbf, 0x78, 0x26, 0xbd, 0x66, 0xcc, + 0x84, 0xbd, 0xce, 0x30, 0xcd, 0xbc, 0xab, 0x28, 0x60, 0x3d, 0x97, 0xdb, 0x31, + 0xbd, 0x6f, 0x6f, 0xc3, 0x3b, 0xe0, 0x7e, 0x8c, 0xbd, 0x06, 0xe2, 0xc0, 0xbc, + 0xce, 0x5b, 0x7a, 0xbd, 
0xa5, 0xfb, 0xe1, 0xbc, 0xbd, 0x3b, 0x44, 0xbd, 0x90, + 0xa1, 0xbd, 0x3b, 0xc9, 0xba, 0x34, 0xbc, 0x5f, 0xab, 0x08, 0xbd, 0xf8, 0x5a, + 0x5f, 0x3c, 0x23, 0xbe, 0x8c, 0x3d, 0xbc, 0x19, 0xad, 0xbc, 0xb1, 0xd8, 0x19, + 0xbd, 0x33, 0x7a, 0x85, 0x3d, 0xa5, 0x19, 0xc7, 0x3b, 0x83, 0x55, 0x83, 0xbc, + 0x9d, 0x63, 0x08, 0x3d, 0x36, 0x98, 0x1c, 0x3d, 0x20, 0x2d, 0x2d, 0xbc, 0x6b, + 0xc3, 0x68, 0xbd, 0xbc, 0x22, 0xb6, 0x3c, 0x93, 0xdb, 0xc0, 0x3a, 0x88, 0x17, + 0xdf, 0x3c, 0x0d, 0x0d, 0x2c, 0xbd, 0xc0, 0x40, 0x60, 0x3b, 0xea, 0xf9, 0x3f, + 0xbd, 0x0d, 0xd7, 0x03, 0xbd, 0x45, 0x08, 0x68, 0xbd, 0xb3, 0xa4, 0xe9, 0xbc, + 0xfd, 0xe9, 0x5f, 0x3d, 0x4c, 0x45, 0x0c, 0x3d, 0xff, 0xdb, 0xa3, 0xbc, 0x12, + 0x16, 0x88, 0xbd, 0x70, 0x42, 0xe5, 0xbc, 0x60, 0xda, 0x1c, 0x3c, 0x2b, 0x55, + 0xf8, 0x3b, 0x07, 0x82, 0x87, 0x3c, 0x08, 0x94, 0x83, 0xbd, 0x66, 0xf3, 0x44, + 0x3d, 0x0b, 0xed, 0x10, 0x3c, 0x1b, 0x7e, 0x8f, 0xbd, 0xbe, 0x4c, 0xb5, 0xbc, + 0xc4, 0x84, 0x26, 0x3d, 0x80, 0x5f, 0x6a, 0xbc, 0xb8, 0x41, 0x29, 0x3d, 0xfa, + 0xbc, 0x4a, 0x3d, 0xbe, 0x44, 0x47, 0xbc, 0xc1, 0x9b, 0x21, 0x3d, 0x33, 0xb8, + 0xd7, 0xbc, 0x54, 0xe6, 0x53, 0x3d, 0xd8, 0x95, 0x3d, 0xbd, 0x2b, 0x4d, 0x90, + 0x3d, 0x0c, 0x3c, 0x3a, 0xbc, 0x6c, 0x41, 0x24, 0xbd, 0x31, 0xfd, 0x66, 0xbd, + 0x43, 0x29, 0x4a, 0x3d, 0x00, 0x8d, 0xc3, 0xb9, 0x20, 0xd6, 0xe2, 0xbb, 0xb7, + 0xf6, 0x22, 0xbd, 0xe9, 0xd7, 0x3f, 0x3d, 0x8d, 0xb7, 0xf7, 0x3c, 0x2b, 0x56, + 0x8b, 0x3d, 0xa6, 0xa7, 0x70, 0xbd, 0xdf, 0x62, 0x56, 0x3d, 0xe9, 0x4b, 0xb0, + 0x3c, 0x40, 0xb6, 0x04, 0x3c, 0x34, 0x8c, 0x04, 0xbd, 0xb9, 0x1a, 0x1b, 0x3d, + 0x25, 0xbc, 0x05, 0xbd, 0x3d, 0x10, 0x1c, 0xbd, 0x77, 0x24, 0x8c, 0xbd, 0x53, + 0x9b, 0xdf, 0x3b, 0x80, 0xc9, 0x53, 0x3d, 0x40, 0xc7, 0x6c, 0xbc, 0x00, 0xb3, + 0xbe, 0xba, 0xe5, 0xe9, 0x89, 0x3d, 0xb0, 0x72, 0x88, 0xbd, 0xcd, 0x2d, 0x0c, + 0xbd, 0x27, 0x35, 0x07, 0xbd, 0x6b, 0x6a, 0x49, 0xbd, 0x99, 0x9b, 0x51, 0xbd, + 0x1c, 0x94, 0x51, 0x3c, 0x78, 0x26, 0x6a, 0xbd, 0xc2, 0x3e, 0x04, 0x3d, 0xf3, + 0x19, 0x16, 0xbd, 0x9c, 0xb7, 0x0b, 0xbd, 0xb8, 0x3d, 0xf9, 0x3c, 0x69, 0xdb, + 0x14, 0x3d, 0x0a, 0xe3, 0x0f, 0xbd, 0x1a, 0xd5, 0x80, 0xbd, 0xed, 0x79, 0x8d, + 0x3c, 0x1b, 0x21, 0x00, 0xbb, 0x9a, 0x88, 0x0e, 0x3d, 0xc0, 0x1c, 0x66, 0x3d, + 0x60, 0x74, 0x82, 0xbd, 0x7b, 0x96, 0x1c, 0x3d, 0x53, 0x16, 0x49, 0x3d, 0xeb, + 0xfc, 0x8d, 0x3d, 0xb0, 0x52, 0x32, 0x3c, 0xa0, 0xa5, 0x5a, 0xbd, 0xfe, 0xf7, + 0x9c, 0xbc, 0x19, 0x78, 0x4a, 0x3c, 0x78, 0xd1, 0xc2, 0x3c, 0xb4, 0x51, 0x91, + 0xbd, 0x47, 0x08, 0x76, 0xbd, 0x7e, 0x70, 0x02, 0x3d, 0x8b, 0x90, 0x80, 0xbd, + 0xc0, 0xad, 0x10, 0xbd, 0xc6, 0x2e, 0x4d, 0xbd, 0x0e, 0xe4, 0x0b, 0x3d, 0x9e, + 0x8e, 0x8f, 0x3b, 0xd6, 0x81, 0x8a, 0xbd, 0xb9, 0x43, 0x05, 0xbd, 0xfd, 0xb4, + 0x3d, 0xbd, 0x69, 0x1b, 0xa9, 0xbb, 0x0b, 0xb6, 0x88, 0xbd, 0xe3, 0x8f, 0x64, + 0x3d, 0xd9, 0xda, 0x4d, 0x3c, 0xa8, 0xa9, 0x66, 0xbd, 0x87, 0x10, 0x23, 0x3d, + 0xf6, 0x03, 0x3b, 0x3d, 0xa4, 0xcb, 0x83, 0x3c, 0x36, 0xd0, 0x2a, 0xbd, 0x22, + 0x31, 0x27, 0x3d, 0xf0, 0xfb, 0x18, 0x3d, 0x8e, 0xa1, 0x04, 0x3d, 0x67, 0x0e, + 0x67, 0xbc, 0x77, 0x07, 0x90, 0x3d, 0xaf, 0x11, 0x72, 0x3d, 0x7b, 0xdd, 0x80, + 0x3d, 0x18, 0xd2, 0x6e, 0xbc, 0x0c, 0xfa, 0x5e, 0xbd, 0xe8, 0x92, 0xaf, 0xbc, + 0x8f, 0x89, 0xe9, 0x3c, 0x15, 0x06, 0x1d, 0x3c, 0x02, 0x7f, 0x81, 0x3d, 0x88, + 0xe0, 0x0f, 0xbd, 0x16, 0x6a, 0xab, 0xbc, 0xc4, 0x1f, 0xdf, 0x3c, 0x38, 0xab, + 0x4b, 0x3c, 0x40, 0xfd, 0x83, 0x3b, 0x71, 0x9a, 0x52, 0xbd, 0x90, 0x3f, 0x04, + 0xbd, 0xe4, 0x23, 0x81, 0x3d, 0x4a, 0xaa, 0x39, 0xbd, 0xc1, 0xb6, 0x7c, 0x3d, + 0xa4, 0xb4, 0x2d, 0x3d, 0x3c, 0x8b, 0xea, 0x3b, 0xf3, 
0x93, 0x8e, 0x3d, 0x9b, + 0xea, 0x87, 0xbc, 0x25, 0x22, 0x91, 0xbd, 0xeb, 0x03, 0x1a, 0x3d, 0xde, 0xb3, + 0x41, 0x3d, 0xb3, 0x03, 0x59, 0xbd, 0x98, 0xea, 0x1d, 0xbd, 0xaf, 0x46, 0xd9, + 0xbc, 0xc0, 0x55, 0x3e, 0xbd, 0x4d, 0xe2, 0x45, 0x3d, 0x85, 0xa0, 0x44, 0x3c, + 0x00, 0xe5, 0x3e, 0xbd, 0x6f, 0x4e, 0x4b, 0xbb, 0xe1, 0xcd, 0x86, 0x3c, 0x90, + 0xaa, 0x08, 0xbd, 0xb6, 0xb9, 0x7a, 0x3d, 0x45, 0x80, 0x5c, 0x3d, 0xda, 0x7b, + 0x28, 0xbd, 0x4e, 0x73, 0xc1, 0xbc, 0x8b, 0xff, 0x1b, 0x3d, 0xe0, 0xad, 0x71, + 0xbc, 0x5c, 0xa3, 0xd3, 0xbc, 0x93, 0x08, 0x85, 0x3d, 0xce, 0x42, 0x3a, 0x3d, + 0x31, 0x10, 0x86, 0x3d, 0x28, 0x95, 0x86, 0x3a, 0x81, 0x0e, 0x39, 0xbd, 0xa6, + 0xb2, 0x57, 0x3d, 0x97, 0xab, 0xf8, 0xbc, 0x53, 0x5b, 0x9f, 0xbc, 0x79, 0x78, + 0x54, 0x3d, 0xdc, 0x5b, 0x8b, 0x3d, 0xf5, 0xe7, 0x2d, 0x3d, 0xe7, 0x23, 0xa4, + 0xbc, 0x6a, 0xff, 0x83, 0x3d, 0x53, 0xe7, 0x48, 0x3d, 0x27, 0x3c, 0x8c, 0x3d, + 0x44, 0xdf, 0x74, 0xbd, 0x58, 0xe8, 0xf3, 0xbc, 0x4c, 0x9f, 0x57, 0x3c, 0x6c, + 0xb6, 0x95, 0x3c, 0xbd, 0x8e, 0x65, 0x3d, 0x11, 0x3e, 0xcb, 0x3c, 0x88, 0x0e, + 0x02, 0xbd, 0x68, 0x1c, 0x8d, 0xbb, 0xe9, 0xaa, 0x81, 0x3d, 0x00, 0xcc, 0x35, + 0xbd, 0x4f, 0x0b, 0x8f, 0xbd, 0xa4, 0xaa, 0x40, 0xbc, 0x0a, 0x00, 0xac, 0xbc, + 0xe2, 0x2a, 0x40, 0xbd, 0xc3, 0xff, 0x05, 0xbd, 0x09, 0xbe, 0x65, 0xbd, 0xe6, + 0xde, 0x7e, 0xbd, 0x30, 0x36, 0x17, 0x3c, 0x50, 0x30, 0x0e, 0xbc, 0x64, 0x36, + 0xfa, 0x3c, 0x9d, 0x5a, 0x85, 0xbb, 0x50, 0x2c, 0x65, 0xbc, 0x90, 0x5a, 0xae, + 0xbb, 0x37, 0xe6, 0x41, 0xbd, 0xfd, 0x21, 0xf7, 0xbc, 0xb5, 0x91, 0x8b, 0xbb, + 0x15, 0xaa, 0xbe, 0x3c, 0x86, 0x46, 0x78, 0xbd, 0xd4, 0x41, 0xf8, 0xbc, 0xf2, + 0xb7, 0xe4, 0x3c, 0x1b, 0x84, 0x5a, 0x3c, 0x5a, 0xc8, 0x5e, 0x3d, 0x74, 0xad, + 0xa8, 0x3c, 0x71, 0xbe, 0xa0, 0xbc, 0x9b, 0xaf, 0x2b, 0x3d, 0x43, 0x1b, 0x69, + 0xbd, 0xb3, 0xe7, 0x88, 0x3d, 0xbd, 0xe2, 0x5c, 0x3d, 0x6b, 0xa4, 0x35, 0xbd, + 0xe9, 0xbc, 0x8f, 0xbd, 0x16, 0xc0, 0x74, 0x3d, 0x92, 0xb9, 0x4c, 0x3d, 0x5d, + 0xee, 0x91, 0x3c, 0x74, 0xda, 0x1d, 0xbd, 0xda, 0x42, 0x5a, 0xbb, 0x70, 0x1b, + 0xbc, 0x3c, 0xc3, 0x23, 0xd9, 0xba, 0x6c, 0xf4, 0xa4, 0x3c, 0x9c, 0x95, 0x0a, + 0x3d, 0xb8, 0x03, 0x9e, 0x3c, 0x05, 0x7b, 0x84, 0x3d, 0x88, 0x24, 0x29, 0x3d, + 0x6e, 0xb3, 0x72, 0x3d, 0x36, 0x31, 0x62, 0x3c, 0xea, 0x27, 0x24, 0xbd, 0x6d, + 0xf3, 0xe5, 0x3c, 0x2e, 0x24, 0x1f, 0x3d, 0x69, 0x95, 0x6b, 0xbd, 0xa6, 0xdf, + 0x42, 0xba, 0xdd, 0x6e, 0x90, 0xbd, 0xb3, 0x52, 0x00, 0xbd, 0xbe, 0x22, 0x02, + 0x3d, 0xbf, 0x61, 0x80, 0xbd, 0x8d, 0xde, 0x82, 0x3d, 0xf4, 0x40, 0x28, 0x3d, + 0x7b, 0xeb, 0xb7, 0xba, 0xe1, 0x73, 0x94, 0x3c, 0xae, 0x7f, 0x12, 0xba, 0x02, + 0xf0, 0x40, 0xbb, 0xf1, 0xb7, 0x05, 0x3d, 0x0d, 0xbb, 0x6b, 0xbd, 0xe2, 0x4f, + 0x12, 0xbd, 0x0a, 0x66, 0x09, 0xbd, 0xb7, 0xe9, 0x8f, 0x3d, 0x0d, 0x7c, 0x14, + 0x3d, 0x11, 0xf4, 0xbe, 0xba, 0x09, 0x4d, 0x38, 0xbd, 0x80, 0x94, 0x41, 0x3a, + 0xd3, 0x89, 0xc2, 0x3c, 0xd8, 0x3a, 0x3d, 0x3c, 0x28, 0x00, 0x5f, 0xbc, 0xc4, + 0x2a, 0x91, 0xbc, 0x50, 0x98, 0xe6, 0xbc, 0xfa, 0x52, 0x16, 0x3d, 0x3c, 0xb5, + 0x87, 0x3d, 0xed, 0xcf, 0x70, 0x3c, 0x78, 0x9e, 0x72, 0xbb, 0x93, 0x6b, 0x23, + 0x3d, 0xf0, 0xaf, 0x64, 0xbd, 0xce, 0xd7, 0x5e, 0xbd, 0x6c, 0x20, 0x7b, 0xbc, + 0xd0, 0x7a, 0xe0, 0xbb, 0x60, 0xfd, 0xef, 0x3b, 0x95, 0xe5, 0x5f, 0xbd, 0xdf, + 0x49, 0x33, 0x3c, 0x11, 0x3d, 0x80, 0x3d, 0xd4, 0x04, 0xc8, 0x3c, 0x58, 0xc0, + 0x41, 0xbd, 0x50, 0x35, 0x63, 0x3d, 0xd2, 0x8a, 0xc8, 0xbc, 0x67, 0xf0, 0x8b, + 0xbd, 0x69, 0x02, 0x55, 0x3d, 0x0c, 0xa1, 0x76, 0xbd, 0xa8, 0x5e, 0x05, 0xbb, + 0xd0, 0xc3, 0x16, 0x3d, 0x78, 0x7f, 0x23, 0xbc, 0x59, 0x25, 0x5c, 0xbd, 0xb4, + 0xaf, 
0x36, 0xbd, 0x26, 0xc1, 0xd0, 0xb9, 0xa3, 0xb9, 0x54, 0x3d, 0xd3, 0x99, + 0xea, 0xbc, 0x56, 0x87, 0xfc, 0xbc, 0x86, 0x17, 0x16, 0xbd, 0x80, 0x75, 0x17, + 0xbd, 0xe9, 0xe9, 0x26, 0xbd, 0x73, 0xd9, 0x7f, 0xbd, 0x78, 0xf7, 0x08, 0x3d, + 0xb4, 0x6e, 0x24, 0x3d, 0xdb, 0x78, 0x04, 0x3d, 0x91, 0x4e, 0x5e, 0x3d, 0x93, + 0x73, 0x86, 0x3d, 0xd5, 0xc8, 0x41, 0xbd, 0x18, 0x68, 0x79, 0x3d, 0x1e, 0x5e, + 0x74, 0xbd, 0x05, 0x92, 0x43, 0x3d, 0xed, 0xd7, 0xcb, 0x3c, 0x90, 0x04, 0x48, + 0xbd, 0x2a, 0x81, 0x59, 0xbd, 0xa6, 0xf8, 0x8f, 0xbd, 0x21, 0x1b, 0x82, 0x3d, + 0x47, 0x2f, 0x03, 0xbd, 0x49, 0x8a, 0xea, 0x3b, 0x82, 0x20, 0x29, 0x3d, 0x3e, + 0x06, 0x0a, 0x3b, 0x0d, 0xe3, 0x93, 0x3c, 0x3f, 0xb2, 0x83, 0x3d, 0x57, 0x42, + 0xe4, 0x3b, 0x02, 0x82, 0xde, 0xbc, 0x75, 0x96, 0x0a, 0xbd, 0x66, 0xb5, 0x0a, + 0x3d, 0x11, 0xed, 0x8d, 0xbd, 0xc5, 0x7c, 0x61, 0xbd, 0x85, 0xde, 0x56, 0xbc, + 0x2f, 0x3e, 0x41, 0xbd, 0x65, 0x92, 0x70, 0x3d, 0x10, 0x6d, 0xd8, 0xbb, 0x6e, + 0x7b, 0x45, 0x3d, 0xe0, 0xcd, 0x58, 0x3d, 0x5a, 0xa0, 0x6c, 0xbd, 0x25, 0x13, + 0x2f, 0xbd, 0x95, 0xcf, 0x6b, 0xbd, 0x42, 0x36, 0x20, 0xbc, 0x3c, 0x82, 0x47, + 0x3c, 0x71, 0xef, 0x16, 0x3c, 0x50, 0xa2, 0xb8, 0xba, 0x7e, 0xc4, 0x61, 0x3c, + 0xa6, 0xc5, 0x78, 0xbd, 0xb9, 0x33, 0x32, 0xbd, 0x47, 0x60, 0x81, 0x3d, 0x58, + 0xd9, 0x16, 0x3d, 0x3a, 0x50, 0x7a, 0xbd, 0x47, 0xc7, 0x15, 0x3d, 0x00, 0xca, + 0x8a, 0xbd, 0x6f, 0x8f, 0x83, 0xbd, 0x7b, 0x4f, 0x58, 0xba, 0x30, 0x8f, 0x43, + 0xbd, 0xd1, 0x28, 0xd6, 0xbb, 0x20, 0x94, 0xf7, 0xbc, 0x84, 0xef, 0x25, 0xbd, + 0x06, 0x79, 0x6f, 0x3d, 0xdb, 0x3e, 0xcd, 0x3c, 0xc7, 0xce, 0x79, 0x3d, 0x23, + 0x71, 0x97, 0xbc, 0x5c, 0x5c, 0x38, 0x3d, 0xc8, 0xb6, 0x03, 0xbd, 0xd6, 0x31, + 0xc6, 0xbc, 0x33, 0xe1, 0xd0, 0xbb, 0x66, 0xf2, 0xd5, 0xbc, 0xe2, 0x07, 0x49, + 0x3d, 0x2c, 0x67, 0xc9, 0xbc, 0x71, 0xd2, 0x41, 0xbd, 0x1a, 0xb4, 0x81, 0x3c, + 0xf0, 0x27, 0x7d, 0x3d, 0xca, 0xcc, 0xd5, 0xbc, 0x3f, 0x3e, 0x30, 0xbd, 0x50, + 0xe1, 0x26, 0xba, 0x53, 0x7d, 0x00, 0x3d, 0x8e, 0x75, 0x4d, 0x3b, 0x0a, 0x56, + 0x20, 0x3d, 0x61, 0xaf, 0xf4, 0xbc, 0x55, 0x41, 0x98, 0xbc, 0x16, 0x66, 0x13, + 0x3d, 0x40, 0x96, 0x67, 0xbd, 0x40, 0x3a, 0x0b, 0xbd, 0xbe, 0x16, 0x88, 0xbc, + 0x54, 0xd1, 0x56, 0xbd, 0xd5, 0xa2, 0xba, 0xbb, 0x97, 0x30, 0x1f, 0xbb, 0x37, + 0x2d, 0x18, 0xbd, 0xe7, 0xe3, 0x8e, 0xbd, 0x82, 0x9b, 0x29, 0x3c, 0x8f, 0x41, + 0x24, 0xbd, 0xa2, 0x55, 0x8f, 0x3b, 0x25, 0xa4, 0x18, 0x3c, 0xb6, 0xee, 0xe7, + 0x3c, 0x3a, 0x0b, 0x12, 0xbd, 0x27, 0xfb, 0xb4, 0xb9, 0x70, 0x41, 0x0a, 0xbc, + 0xe8, 0x8b, 0x62, 0xbd, 0x04, 0x95, 0xc5, 0x3c, 0xa4, 0x51, 0x46, 0xbd, 0x42, + 0x1e, 0x65, 0xbd, 0x4f, 0x3d, 0x4a, 0x3d, 0x6f, 0x9d, 0x19, 0x3d, 0xb8, 0xdb, + 0x8c, 0xbd, 0x9a, 0xfe, 0x23, 0x3c, 0x0c, 0x8a, 0x58, 0x3d, 0xe2, 0x61, 0x62, + 0xbd, 0x1f, 0xee, 0x64, 0x3c, 0x0c, 0xb0, 0x9a, 0x3b, 0xe8, 0x9f, 0xf7, 0xbc, + 0x54, 0xf9, 0xef, 0xbc, 0xbb, 0x3b, 0x57, 0x3a, 0xcc, 0x92, 0xa6, 0x3c, 0xfa, + 0x7f, 0xf0, 0x3c, 0x92, 0x0c, 0x03, 0x3d, 0xc4, 0xa7, 0x0b, 0xbd, 0x3d, 0xf1, + 0x8b, 0xbd, 0x6a, 0x7a, 0x4c, 0xbd, 0xfe, 0x96, 0xdc, 0x3c, 0xf8, 0x93, 0x99, + 0x3b, 0xe4, 0xd7, 0x70, 0x3d, 0x72, 0x25, 0x4f, 0x3d, 0xc0, 0xa1, 0x80, 0xbd, + 0xb8, 0xac, 0x50, 0x3d, 0x87, 0x18, 0x87, 0xbc, 0xcc, 0xe2, 0x01, 0xbd, 0x70, + 0x67, 0xfb, 0xbb, 0xda, 0x29, 0x7c, 0x3d, 0xe6, 0xf0, 0x67, 0x3d, 0x98, 0xd8, + 0x0e, 0x3d, 0xe8, 0xf6, 0x45, 0xbd, 0xcc, 0x76, 0x57, 0xbd, 0x12, 0xec, 0x02, + 0x3d, 0x02, 0x73, 0xbf, 0x3c, 0xea, 0x67, 0x9e, 0x3a, 0x29, 0x29, 0x1f, 0x3d, + 0x19, 0x65, 0x2a, 0x3d, 0x9c, 0x3a, 0x86, 0x3d, 0xd8, 0xcd, 0x15, 0xbd, 0xf3, + 0xed, 0x75, 0xbd, 0xa6, 0x30, 0xff, 
0xbc, 0x87, 0x2e, 0xc7, 0x3c, 0xe6, 0x41, + 0xb9, 0x3c, 0x38, 0xf9, 0xb0, 0x3c, 0x49, 0x88, 0x8c, 0xbd, 0xf2, 0x2b, 0x70, + 0x3d, 0x3d, 0x58, 0xec, 0x3b, 0xa2, 0x59, 0x3a, 0x3c, 0x3f, 0x5f, 0x3a, 0x3d, + 0x5f, 0xb9, 0x48, 0xbd, 0x09, 0x9a, 0xc5, 0x3b, 0x12, 0x63, 0x84, 0xbd, 0x11, + 0x76, 0x5e, 0x3d, 0x4f, 0xa0, 0x84, 0x3d, 0x90, 0x8b, 0x29, 0xbd, 0x03, 0xcc, + 0x2c, 0xbd, 0xbe, 0x89, 0x8f, 0xbd, 0xa5, 0x7a, 0x81, 0x3d, 0x54, 0xa8, 0xd0, + 0x3c, 0x54, 0x70, 0x9d, 0xbb, 0x4a, 0xe4, 0xb9, 0xbc, 0x94, 0x65, 0xfe, 0xbc, + 0x3c, 0xef, 0xac, 0x3c, 0x4c, 0x87, 0x16, 0xbd, 0x0a, 0xda, 0x85, 0xbc, 0x89, + 0x04, 0x88, 0x3d, 0xb6, 0xe7, 0x19, 0x3d, 0x38, 0x06, 0x08, 0xbd, 0x37, 0x6c, + 0x3d, 0xbd, 0x75, 0x70, 0x09, 0x3d, 0x13, 0x5c, 0x7f, 0xbd, 0xe2, 0x25, 0xfb, + 0x3c, 0x74, 0xe4, 0x06, 0x3d, 0xd8, 0xcb, 0x82, 0x3d, 0xbc, 0xa0, 0xeb, 0xbc, + 0xaf, 0xb1, 0x8e, 0xbd, 0x30, 0x53, 0xdc, 0x3b, 0x4b, 0x94, 0x84, 0x3d, 0xc9, + 0x6d, 0xcd, 0x3c, 0xd1, 0x47, 0x8e, 0x3d, 0x5e, 0x1a, 0x15, 0xbc, 0x0b, 0xe3, + 0xb2, 0x3c, 0x4c, 0x7f, 0xfb, 0x3c, 0x6e, 0x6d, 0x53, 0x3d, 0xdc, 0xa5, 0x8d, + 0x3d, 0x71, 0x25, 0x85, 0xbd, 0xc8, 0xa9, 0x17, 0xbc, 0xe1, 0xcd, 0xf3, 0xbc, + 0xbd, 0xc5, 0x5f, 0xbd, 0xde, 0xbc, 0x07, 0x3d, 0x2a, 0x50, 0x91, 0x3c, 0x12, + 0x64, 0x9a, 0x3b, 0x54, 0x8b, 0x02, 0x3d, 0x2d, 0x77, 0x8b, 0xbd, 0x83, 0x37, + 0x82, 0x3d, 0x5f, 0xdb, 0x50, 0xbd, 0xba, 0xe6, 0x63, 0x3d, 0x2d, 0x97, 0x21, + 0x3d, 0xfe, 0xba, 0x80, 0x3d, 0xe4, 0xc2, 0x39, 0xbd, 0x8d, 0x37, 0x94, 0x3c, + 0x8d, 0xe8, 0xb0, 0xbc, 0x0e, 0xbc, 0xa9, 0xbc, 0xbb, 0xfb, 0xb1, 0xbb, 0xff, + 0xdb, 0x13, 0xbd, 0x15, 0x1e, 0x1f, 0xbd, 0xe6, 0x81, 0x51, 0xbd, 0xf1, 0x39, + 0xaf, 0xbc, 0x86, 0x69, 0x68, 0xbd, 0x33, 0x5c, 0xe8, 0x3c, 0x25, 0xd3, 0x5d, + 0xbd, 0x77, 0xf4, 0x0e, 0xbd, 0x5f, 0x4b, 0xec, 0x3c, 0xc4, 0x6c, 0xfc, 0x3c, + 0x39, 0x1e, 0xc9, 0x3c, 0x2c, 0xdc, 0x6f, 0xbd, 0xf0, 0xdd, 0x5b, 0x3c, 0xba, + 0x58, 0x63, 0x3d, 0x20, 0xb8, 0x9c, 0x3b, 0x58, 0x4e, 0xb6, 0xbc, 0x47, 0x2d, + 0xc4, 0xbc, 0x0c, 0x5b, 0x6b, 0x3d, 0x00, 0x18, 0xed, 0xb9, 0x96, 0xa9, 0x9e, + 0x3c, 0x42, 0x5c, 0x4a, 0xbb, 0x94, 0x9f, 0x85, 0xbd, 0x10, 0xdd, 0xcd, 0x3c, + 0x47, 0x98, 0x8c, 0xbd, 0x28, 0x33, 0x6f, 0xbd, 0x6c, 0x52, 0x21, 0x3d, 0x41, + 0x5c, 0x45, 0x3c, 0xf7, 0x7c, 0x36, 0xbd, 0x6d, 0xf5, 0xdb, 0xbc, 0x30, 0x95, + 0x87, 0x3d, 0xed, 0x8a, 0x8f, 0xbd, 0x79, 0x78, 0x88, 0xbd, 0x0c, 0x54, 0x1c, + 0xbc, 0x82, 0xa3, 0xa7, 0x3b, 0x1f, 0xcf, 0x76, 0xbd, 0x71, 0x23, 0x8b, 0x3c, + 0x01, 0xc3, 0x87, 0x3d, 0x54, 0xb5, 0xe5, 0x3c, 0x3e, 0x2f, 0x17, 0xbd, 0x99, + 0xb5, 0x13, 0x3d, 0x69, 0xf7, 0xad, 0x3c, 0xb1, 0x19, 0x13, 0xbc, 0x0e, 0xf8, + 0x5b, 0xbd, 0x74, 0x52, 0x82, 0x3d, 0x7a, 0x5f, 0xfd, 0xbb, 0x2b, 0x17, 0x15, + 0xbd, 0x05, 0x3c, 0x72, 0xbd, 0x18, 0xbd, 0xb9, 0xba, 0xaf, 0x8e, 0xc5, 0xbc, + 0x7a, 0x8f, 0xc3, 0xbb, 0xd9, 0x64, 0x14, 0xbd, 0x97, 0xdf, 0x55, 0x3d, 0x99, + 0x96, 0xac, 0xba, 0x4f, 0x5c, 0x84, 0x3d, 0xa4, 0x57, 0x27, 0x3d, 0xf8, 0x8e, + 0x81, 0xbd, 0xf8, 0xef, 0x55, 0x3c, 0x0e, 0x2d, 0x59, 0xbd, 0xf1, 0xeb, 0x52, + 0x3a, 0x06, 0xde, 0x94, 0x3c, 0x53, 0x8e, 0x17, 0xbd, 0x5d, 0x25, 0x86, 0x3c, + 0x1c, 0x8c, 0x8b, 0xbc, 0x32, 0xa0, 0x1c, 0x3d, 0x2e, 0xb3, 0x53, 0x3d, 0x2e, + 0x1c, 0x3f, 0x3d, 0x38, 0xb0, 0xf1, 0x3c, 0x95, 0xc2, 0x55, 0xbb, 0x74, 0x05, + 0x39, 0xbd, 0x4a, 0xa6, 0x27, 0x3b, 0xb3, 0x63, 0xd8, 0x3c, 0xd6, 0x03, 0x83, + 0x3d, 0x24, 0x65, 0x49, 0xbd, 0x18, 0x9e, 0xee, 0x3c, 0x26, 0xf0, 0x85, 0xbd, + 0xfc, 0xd0, 0x67, 0xbd, 0x43, 0xca, 0x12, 0xbd, 0xb1, 0xec, 0x03, 0x3d, 0x00, + 0x1e, 0x74, 0x3c, 0xb5, 0x32, 0xa6, 0xbc, 0x3d, 0x56, 0x65, 0x3d, 
0x8b, 0x0e, + 0xa9, 0xbc, 0x03, 0x1e, 0x91, 0x3d, 0x64, 0x8f, 0x88, 0x3d, 0x1c, 0x50, 0xb5, + 0xbc, 0xe4, 0xb3, 0x05, 0xbd, 0x2c, 0x4f, 0x59, 0xbd, 0x29, 0x30, 0x23, 0xbd, + 0x0c, 0x23, 0x56, 0xbd, 0x7d, 0x77, 0x82, 0xbc, 0x45, 0x1a, 0xa4, 0x3c, 0xb7, + 0x9c, 0x0f, 0xbc, 0xc5, 0x76, 0xd8, 0xbc, 0x7f, 0x4f, 0x78, 0xbd, 0xb4, 0x07, + 0x82, 0x3c, 0x56, 0xcc, 0x6a, 0xbd, 0xc3, 0x11, 0x29, 0x3c, 0xa5, 0xf6, 0x7a, + 0x3d, 0x8a, 0x88, 0xc4, 0x3c, 0x00, 0xf8, 0xa2, 0xbc, 0x30, 0x08, 0x50, 0xbd, + 0x59, 0xcf, 0xb1, 0xbc, 0xd1, 0xba, 0x52, 0xbd, 0xc0, 0xe8, 0xbe, 0x3b, 0xc3, + 0xb8, 0xfe, 0xbc, 0x22, 0xc5, 0x84, 0xbd, 0xef, 0x51, 0xbd, 0x3a, 0x75, 0x42, + 0xc8, 0xbc, 0x1a, 0x32, 0x88, 0x3d, 0x2a, 0x26, 0xc2, 0xbc, 0x66, 0x17, 0x2a, + 0xbd, 0x1d, 0x0f, 0x7f, 0x3d, 0x55, 0x2f, 0x8f, 0x3b, 0x01, 0x47, 0x8c, 0x3d, + 0x3a, 0x01, 0x18, 0x3d, 0xca, 0xa0, 0xea, 0xbc, 0x3e, 0x16, 0x34, 0xbd, 0xe8, + 0xf7, 0x75, 0x3c, 0x20, 0xee, 0x49, 0x3c, 0x6a, 0xc1, 0x3b, 0xbd, 0xa0, 0x98, + 0x5c, 0xbd, 0x60, 0x8e, 0x94, 0x3b, 0xa2, 0x9b, 0x8a, 0x3d, 0x10, 0x4d, 0x4f, + 0x3d, 0x87, 0xe4, 0x45, 0xbd, 0xb6, 0x17, 0xdd, 0x3b, 0xee, 0x06, 0x71, 0xbd, + 0xca, 0xb4, 0xe0, 0x3c, 0xd4, 0x9d, 0x0b, 0xbd, 0xba, 0x3a, 0x21, 0x3d, 0x6c, + 0xfd, 0xaa, 0x3c, 0x35, 0x20, 0x61, 0xbd, 0x20, 0x51, 0x52, 0x3d, 0x96, 0xcc, + 0x29, 0xbd, 0x9f, 0x99, 0x22, 0x3d, 0x06, 0x2d, 0xdb, 0xba, 0xdb, 0xf1, 0x90, + 0x3c, 0xf9, 0x05, 0x06, 0x3d, 0xdf, 0x02, 0xcb, 0x3c, 0x02, 0xb8, 0xf8, 0xbc, + 0x70, 0x14, 0x50, 0xbd, 0x51, 0xdc, 0x88, 0x3d, 0xa8, 0xa5, 0xd6, 0xbc, 0x69, + 0xd7, 0x8e, 0x3d, 0xbe, 0x91, 0x86, 0xbd, 0x5d, 0x93, 0x12, 0xbd, 0x7c, 0x23, + 0x60, 0xbd, 0xb2, 0x55, 0xb7, 0x3c, 0x38, 0xb8, 0x0e, 0x3d, 0x88, 0x86, 0x0e, + 0x3c, 0x9a, 0x4b, 0x0d, 0x3d, 0x00, 0xfa, 0x1a, 0x3b, 0xb8, 0x59, 0xbf, 0x3c, + 0xbe, 0xa8, 0xea, 0x3c, 0xfc, 0xf4, 0xf3, 0x3c, 0xbf, 0x69, 0x17, 0x3d, 0x82, + 0xe6, 0x84, 0xbd, 0x9d, 0xde, 0x3e, 0xbd, 0x3a, 0x02, 0x5b, 0xbd, 0x04, 0x34, + 0x8b, 0xbd, 0x83, 0x26, 0xc5, 0x3c, 0x71, 0x0c, 0x17, 0x3d, 0x44, 0x33, 0x5a, + 0xbd, 0xe0, 0x15, 0xe4, 0x3b, 0xd9, 0x25, 0x80, 0xbd, 0xbb, 0xac, 0x56, 0xbd, + 0x54, 0x26, 0x6f, 0xbd, 0x30, 0x23, 0xa2, 0x3b, 0x08, 0x7c, 0x27, 0xbd, 0xba, + 0x00, 0xde, 0xbc, 0x80, 0x47, 0x8f, 0xbd, 0xca, 0x52, 0x17, 0xbd, 0xf0, 0x9a, + 0x0a, 0x3d, 0xe9, 0x6a, 0xea, 0x3b, 0x12, 0xaa, 0x65, 0x3d, 0x3e, 0x1a, 0x49, + 0x3d, 0x3b, 0x68, 0x30, 0xbd, 0xfb, 0x34, 0x3d, 0x3d, 0x0c, 0x21, 0xe3, 0x3c, + 0x13, 0x68, 0x67, 0xbb, 0xe5, 0xaf, 0x8b, 0xbd, 0xfe, 0x2b, 0x00, 0xbd, 0x5e, + 0x1e, 0x4a, 0xbd, 0xb2, 0x94, 0x70, 0x3d, 0xa0, 0x7e, 0x47, 0x3b, 0xde, 0xa9, + 0xef, 0xbc, 0x84, 0x2f, 0x1a, 0x3a, 0x26, 0xb6, 0xf8, 0x3c, 0xe4, 0xab, 0xd9, + 0xbc, 0xa8, 0x0b, 0x87, 0xbd, 0x70, 0x2c, 0xbd, 0x3c, 0x32, 0xb2, 0x8c, 0x3c, + 0xce, 0x0f, 0x34, 0xba, 0xc7, 0xc9, 0x3b, 0xbd, 0x22, 0xdb, 0xf3, 0xbc, 0x8d, + 0x4e, 0x48, 0xbd, 0xf0, 0x63, 0x53, 0x3d, 0x04, 0xd6, 0xc7, 0x3b, 0xfa, 0x40, + 0x6c, 0xbd, 0x22, 0xfb, 0x80, 0x38, 0xe9, 0x8c, 0x0e, 0x3c, 0xc4, 0x60, 0x27, + 0x3d, 0xaa, 0xcf, 0x60, 0x3d, 0xfe, 0x59, 0x08, 0x3d, 0x6e, 0x69, 0x43, 0xbd, + 0xcb, 0xa1, 0x03, 0xbd, 0x16, 0x47, 0x72, 0x3d, 0xc1, 0x37, 0x5d, 0x3d, 0x53, + 0x6f, 0x8b, 0xbd, 0x50, 0x99, 0x18, 0x3d, 0x65, 0x92, 0x89, 0x3d, 0x12, 0x80, + 0x94, 0xbd, 0x8d, 0x1d, 0x21, 0xbd, 0x6e, 0xc6, 0x69, 0x3d, 0x18, 0x1d, 0x23, + 0x3d, 0x3e, 0x2b, 0x00, 0x3d, 0xe4, 0x71, 0x4f, 0xbd, 0xfb, 0xc5, 0x0e, 0xbd, + 0x6e, 0x24, 0x47, 0x3d, 0x34, 0xf0, 0x50, 0x3c, 0x3f, 0x38, 0x89, 0x3d, 0xb5, + 0x84, 0x41, 0xbc, 0xb8, 0xdc, 0x56, 0x3d, 0x3b, 0x56, 0x60, 0xbc, 0x5a, 0x3b, + 0x58, 0x3d, 0x86, 
0x56, 0x6d, 0xbd, 0x4f, 0x33, 0x43, 0x3d, 0x7e, 0x6c, 0x7d, + 0x3c, 0xb9, 0x4c, 0x8b, 0x3d, 0x00, 0x88, 0x3f, 0x3a, 0x3a, 0xb8, 0xc1, 0x3c, + 0x02, 0x18, 0x30, 0x3d, 0x6b, 0xb4, 0x4c, 0xbd, 0x0d, 0xd8, 0x3c, 0x3d, 0x9a, + 0x25, 0x61, 0xbd, 0x87, 0x7b, 0xa7, 0xbc, 0x76, 0x8e, 0x06, 0xbb, 0x47, 0xf9, + 0x73, 0xbd, 0x80, 0xfa, 0x28, 0xbb, 0xd4, 0xd1, 0x76, 0xbd, 0x9a, 0xcb, 0x29, + 0xbd, 0xf6, 0x0f, 0xe5, 0xbc, 0x6d, 0xeb, 0x4f, 0xbd, 0x46, 0xe8, 0x69, 0xbc, + 0x9a, 0x72, 0x69, 0x3d, 0x55, 0x19, 0x86, 0xbd, 0xba, 0x77, 0x0f, 0x3d, 0x4d, + 0xf6, 0x64, 0x3d, 0xf4, 0xf6, 0x19, 0x3d, 0xc3, 0x53, 0x4a, 0x3d, 0x83, 0xc4, + 0x7f, 0x3c, 0xb6, 0xcb, 0x53, 0xbd, 0xc5, 0x99, 0x83, 0xbd, 0xa9, 0xcb, 0x4e, + 0xbd, 0xbc, 0xc0, 0xf3, 0x3c, 0xc3, 0x45, 0x2c, 0x3d, 0x6a, 0x2f, 0x93, 0xbd, + 0x8d, 0x05, 0x67, 0x3d, 0xec, 0x6f, 0x3a, 0x3d, 0xf5, 0x47, 0x5a, 0x3d, 0xca, + 0xa6, 0x79, 0x3d, 0x16, 0x97, 0x7d, 0xbd, 0x53, 0x30, 0x52, 0x3d, 0x07, 0x81, + 0x52, 0x3d, 0xf7, 0xae, 0xa6, 0xbc, 0xa3, 0xc2, 0xa4, 0xbc, 0x5c, 0xd8, 0x23, + 0xbd, 0xc5, 0x77, 0x50, 0x3d, 0x28, 0x78, 0x47, 0x3c, 0xe7, 0xe2, 0x04, 0xbd, + 0xcc, 0x6f, 0x83, 0xbd, 0x4c, 0x2b, 0xfc, 0xbc, 0x42, 0xf8, 0xf6, 0x3c, 0x03, + 0x7c, 0x87, 0x3d, 0x2d, 0x4d, 0x80, 0xbd, 0x08, 0x59, 0x65, 0x3d, 0x2b, 0x4a, + 0x3a, 0xbd, 0xae, 0xec, 0x68, 0x3d, 0x1e, 0x42, 0x85, 0xbd, 0xd6, 0x06, 0x6a, + 0x3d, 0x6e, 0xfe, 0x65, 0xbd, 0x77, 0xef, 0xb0, 0x3c, 0x81, 0xb1, 0x48, 0x3c, + 0x86, 0x4b, 0x57, 0xbd, 0x1e, 0x45, 0x82, 0x3c, 0x9b, 0x6c, 0x0f, 0xbd, 0xeb, + 0x5f, 0x1c, 0xbd, 0xc3, 0x49, 0x3b, 0x3d, 0x5b, 0x31, 0x7b, 0xbd, 0xee, 0xcb, + 0x0c, 0xbd, 0x49, 0xa6, 0xa7, 0x3c, 0x89, 0x96, 0x73, 0xbd, 0x4d, 0xcf, 0x89, + 0x3d, 0xec, 0x73, 0xe1, 0x3b, 0x0e, 0x74, 0x0b, 0x3c, 0xc4, 0x52, 0xe1, 0xbc, + 0xf9, 0x15, 0x5f, 0x3d, 0x4a, 0x6c, 0x6c, 0xbd, 0x1d, 0x1d, 0xc7, 0xbb, 0xa2, + 0x11, 0x26, 0x3d, 0x92, 0xa6, 0x00, 0xbd, 0xe8, 0x29, 0x52, 0x3d, 0x6c, 0x9f, + 0xc3, 0x3c, 0xa9, 0xf6, 0xea, 0xbc, 0x0b, 0xce, 0x84, 0x3d, 0x3a, 0x7a, 0x83, + 0x3d, 0x95, 0x99, 0xff, 0x3c, 0x26, 0xc1, 0xae, 0xbc, 0x4c, 0x73, 0xab, 0x3c, + 0x10, 0x47, 0x5f, 0xbd, 0x6c, 0x99, 0xab, 0x3c, 0x40, 0x91, 0xee, 0x3a, 0x30, + 0xe9, 0x43, 0xbd, 0xd8, 0xdf, 0xed, 0x3c, 0x93, 0xd4, 0x98, 0xbc, 0x05, 0xf8, + 0x8c, 0x3d, 0x8d, 0x54, 0x89, 0xbd, 0x29, 0x6a, 0x5a, 0xbd, 0x54, 0x2f, 0x2d, + 0xbd, 0x11, 0x76, 0x90, 0xbd, 0x62, 0x24, 0xdf, 0x3c, 0x1f, 0x0c, 0x92, 0xbd, + 0x87, 0xb7, 0x06, 0xbd, 0x28, 0x1b, 0x92, 0xbd, 0x41, 0xb6, 0x19, 0xbd, 0x90, + 0xa9, 0xc8, 0xbc, 0x10, 0x06, 0xa2, 0x3c, 0x9b, 0x59, 0x72, 0x3d, 0x9f, 0x9b, + 0xc4, 0x3c, 0xc2, 0x44, 0xb9, 0xbb, 0xe4, 0x46, 0x90, 0x3d, 0xe9, 0x54, 0x40, + 0xbd, 0x18, 0xdd, 0xc8, 0xbc, 0xff, 0x78, 0x44, 0xbd, 0x6e, 0xaa, 0x92, 0xbc, + 0x76, 0xaa, 0x31, 0x3c, 0x37, 0x94, 0xe8, 0xbc, 0x2b, 0x84, 0xf6, 0x3c, 0xce, + 0x29, 0x8f, 0xbc, 0x37, 0xdc, 0xaf, 0x3c, 0x40, 0x76, 0xbd, 0x3c, 0xd6, 0x49, + 0x50, 0x3d, 0x48, 0x72, 0x36, 0xbd, 0xc7, 0x51, 0x63, 0xbd, 0x04, 0x47, 0x70, + 0xbc, 0x02, 0x99, 0x7c, 0xbc, 0x83, 0xb4, 0x44, 0xbd, 0x1d, 0x3b, 0x83, 0xbd, + 0x55, 0xe3, 0x41, 0x3d, 0x2c, 0x05, 0xcf, 0x3a, 0x52, 0x65, 0x2f, 0x3d, 0x8e, + 0x0d, 0x2d, 0x3d, 0x59, 0x13, 0x43, 0xbd, 0xe6, 0x6e, 0xf3, 0x3c, 0xc3, 0xfc, + 0xac, 0x3c, 0x82, 0x9e, 0x5f, 0xbc, 0x07, 0xd9, 0x6f, 0xbd, 0xf0, 0xf1, 0x9d, + 0x3b, 0x09, 0xcd, 0x07, 0xbd, 0x99, 0xc1, 0x87, 0x3d, 0xfa, 0xef, 0x73, 0x3d, + 0xe5, 0x18, 0xfc, 0x3c, 0xbc, 0x08, 0x06, 0x3d, 0x5e, 0x91, 0x90, 0xbd, 0x9c, + 0x69, 0xf7, 0x3b, 0x71, 0x14, 0xef, 0xbc, 0x90, 0x77, 0xf9, 0x3c, 0x4c, 0x17, + 0x6e, 0xbd, 0x59, 0x66, 0xe5, 0xbb, 0x6d, 0x0b, 
0x5f, 0xbc, 0x8a, 0xde, 0x57, + 0x3d, 0xdf, 0x37, 0x84, 0xbd, 0x6a, 0x62, 0x7b, 0x3d, 0x19, 0x4c, 0xc5, 0xbc, + 0xf0, 0x81, 0x2b, 0x3d, 0x0c, 0xe8, 0x3f, 0xbd, 0x2c, 0xac, 0x36, 0xbd, 0x2a, + 0x6a, 0x2e, 0x3d, 0x90, 0xcc, 0x94, 0xbb, 0x07, 0xfd, 0x28, 0xbd, 0x5e, 0x9f, + 0xb7, 0x3b, 0xcc, 0xf7, 0x83, 0xbd, 0x2e, 0x4f, 0xa0, 0xbc, 0x06, 0x60, 0xcc, + 0x3c, 0xc6, 0xbf, 0x5d, 0x3c, 0x48, 0x40, 0x6b, 0xbd, 0x69, 0x48, 0x03, 0x3d, + 0x75, 0x47, 0x48, 0x3d, 0xc4, 0x2f, 0x0f, 0x3d, 0x2d, 0xa5, 0x6e, 0xbd, 0x5a, + 0x05, 0x41, 0xbd, 0x7c, 0x10, 0xff, 0x3c, 0x2c, 0x2e, 0x78, 0xbd, 0x16, 0x4f, + 0x7d, 0x3d, 0xcf, 0x20, 0x5f, 0x3d, 0xd7, 0x5c, 0x87, 0xbd, 0x96, 0x63, 0x1e, + 0xbc, 0x2b, 0xf3, 0x8c, 0xbc, 0x6e, 0x52, 0x00, 0xbd, 0xb0, 0xb0, 0x47, 0x3d, + 0x6e, 0x8c, 0xa2, 0xbc, 0x26, 0xa4, 0xbd, 0x3c, 0x50, 0xfb, 0xc4, 0xbc, 0x16, + 0xc5, 0xe2, 0x3c, 0x34, 0xbe, 0xba, 0xbc, 0x58, 0x77, 0x06, 0xbc, 0xb6, 0x0f, + 0x02, 0x3d, 0x00, 0xc0, 0x67, 0xbd, 0x19, 0x7b, 0x0f, 0xbd, 0xdf, 0xca, 0x42, + 0xbd, 0x28, 0x6b, 0x5d, 0xbd, 0xe8, 0x7b, 0x0b, 0x3d, 0x0f, 0xd3, 0x9b, 0xbc, + 0x0e, 0x94, 0x3c, 0x3d, 0x56, 0xcd, 0x32, 0xbd, 0x39, 0x73, 0x82, 0xbd, 0x32, + 0x4b, 0x06, 0xbd, 0x77, 0xbe, 0x35, 0xbd, 0x4f, 0x03, 0x0b, 0x3d, 0x40, 0x14, + 0x8b, 0x3d, 0xe0, 0x32, 0x60, 0xbd, 0x4f, 0xd0, 0x85, 0x3d, 0x0f, 0xfc, 0x74, + 0xbc, 0xa1, 0xfc, 0xfa, 0xbb, 0x83, 0x11, 0x49, 0x3b, 0x48, 0x21, 0x1b, 0xbc, + 0x4d, 0x36, 0xe6, 0xbc, 0x27, 0x47, 0x6c, 0xbc, 0x6f, 0x04, 0x37, 0xbd, 0xc6, + 0x57, 0x6a, 0x3d, 0xa0, 0x16, 0x4d, 0x3b, 0x1a, 0xeb, 0x55, 0x3d, 0x6e, 0x5f, + 0x2d, 0xbd, 0xde, 0xff, 0x65, 0xbd, 0x68, 0x46, 0x49, 0x3c, 0x3c, 0x27, 0x3c, + 0xbd, 0xfd, 0xdc, 0x0e, 0xbd, 0xb9, 0xff, 0x24, 0xbd, 0xf0, 0x8f, 0x5c, 0xbd, + 0xa8, 0x9d, 0x32, 0x3d, 0x5c, 0x6d, 0x4d, 0xbd, 0x0d, 0xc2, 0x47, 0x3d, 0xf5, + 0xe0, 0x8b, 0x3c, 0x4e, 0xd4, 0xfb, 0xbc, 0x2f, 0xef, 0x7d, 0x3d, 0x0d, 0xbf, + 0x03, 0x3d, 0x54, 0x6e, 0x16, 0x3d, 0x51, 0x8b, 0x85, 0xbd, 0xac, 0x6b, 0x19, + 0xbb, 0x2e, 0x99, 0x9e, 0x3c, 0xd9, 0xa5, 0x35, 0x3d, 0x90, 0x56, 0x59, 0x3d, + 0xda, 0xee, 0x7c, 0x3d, 0x63, 0x87, 0x1b, 0xbb, 0x12, 0x90, 0x39, 0xbd, 0x4b, + 0xb8, 0x39, 0x3d, 0x3f, 0x49, 0x94, 0xbc, 0xeb, 0x8f, 0x80, 0x3d, 0x8a, 0x9f, + 0x81, 0xbd, 0xdb, 0x11, 0x0c, 0x3d, 0x13, 0x28, 0x29, 0x3d, 0x70, 0x84, 0xfc, + 0xbc, 0x48, 0x74, 0x10, 0x3c, 0xcc, 0xb3, 0x30, 0xbd, 0x48, 0x07, 0x16, 0x3c, + 0x5d, 0x4f, 0x19, 0xbd, 0x2b, 0x80, 0xf7, 0xbb, 0x16, 0x87, 0x08, 0xbd, 0x07, + 0x00, 0x88, 0x3d, 0x12, 0x69, 0x44, 0x3d, 0x18, 0x31, 0x0d, 0x3c, 0x57, 0xd3, + 0x06, 0x3d, 0x24, 0x3d, 0x07, 0x3d, 0xcc, 0x07, 0x7f, 0x3d, 0xab, 0x2a, 0x79, + 0xbd, 0x7e, 0x3c, 0x79, 0xbd, 0xa9, 0x22, 0xfb, 0xbc, 0x3d, 0xa3, 0x3f, 0x3d, + 0x9b, 0x63, 0x40, 0x3c, 0x8f, 0xd5, 0x9b, 0x3c, 0x38, 0x24, 0x2b, 0x3d, 0x73, + 0x53, 0x02, 0x3d, 0xf4, 0xe3, 0xfb, 0x3c, 0xab, 0x4b, 0x81, 0x3d, 0x6c, 0x44, + 0x17, 0x3d, 0xe9, 0xbe, 0x8e, 0x3d, 0x79, 0xc1, 0x23, 0x3c, 0x19, 0xfd, 0x91, + 0x3c, 0xf9, 0xea, 0x83, 0x3c, 0x5a, 0xee, 0x86, 0x3c, 0xa7, 0x51, 0x2f, 0xbd, + 0x4a, 0xa1, 0x43, 0x3d, 0xf7, 0xc3, 0xdd, 0x3b, 0x41, 0x5d, 0x48, 0xbd, 0x91, + 0x94, 0x92, 0xbd, 0x76, 0xb0, 0x87, 0x3d, 0xad, 0x39, 0x8e, 0x3d, 0xa0, 0x5a, + 0xc3, 0xbb, 0x13, 0xd2, 0x42, 0xbd, 0x93, 0x32, 0x41, 0xbc, 0x02, 0x56, 0x91, + 0xbd, 0x6e, 0x37, 0x12, 0xbd, 0x70, 0x73, 0xe7, 0x3b, 0x85, 0xd7, 0x78, 0x3b, + 0xb0, 0xfb, 0x3f, 0xbd, 0x44, 0xb8, 0x2e, 0xbd, 0xcd, 0x1c, 0x92, 0xbd, 0x78, + 0xee, 0xe1, 0xbc, 0xb4, 0x56, 0x52, 0xbd, 0xa6, 0xbd, 0x62, 0x3d, 0xdc, 0x38, + 0xe8, 0xbc, 0x30, 0xaf, 0x68, 0x3c, 0xe0, 0x72, 0x05, 0xbc, 0x06, 0xad, 0xd5, + 
0x3b, 0xd9, 0x62, 0x23, 0x3d, 0xf8, 0xa2, 0xee, 0xbc, 0x44, 0x13, 0x07, 0x3d, + 0x04, 0xcc, 0xf2, 0x3a, 0xce, 0x3f, 0x2c, 0x3d, 0x25, 0x8b, 0x28, 0x3c, 0x55, + 0xd2, 0x7a, 0xbc, 0x19, 0x6f, 0x83, 0x3d, 0x62, 0xaa, 0x32, 0xbd, 0xf2, 0x19, + 0x1c, 0xbc, 0x54, 0xc3, 0x8b, 0xbd, 0xdd, 0xeb, 0x52, 0x3c, 0x2a, 0xc7, 0x7c, + 0x3d, 0x04, 0xf0, 0xb9, 0x3b, 0xe8, 0x91, 0x84, 0x3d, 0x8d, 0xa2, 0xa3, 0x3c, + 0x01, 0xde, 0x7d, 0xbd, 0x14, 0xf3, 0x25, 0xbd, 0xde, 0x87, 0x8e, 0xbd, 0x6b, + 0x3b, 0x85, 0x3d, 0x02, 0x85, 0x84, 0xbd, 0x6b, 0x77, 0x6d, 0xbc, 0xb6, 0x9a, + 0x53, 0x3d, 0x0f, 0xb3, 0xaa, 0xbb, 0x13, 0x69, 0x55, 0xbd, 0x65, 0x98, 0x57, + 0xbd, 0xef, 0x9c, 0xb2, 0xbc, 0xd2, 0x02, 0xd4, 0x3c, 0x8e, 0xca, 0x27, 0x3d, + 0x64, 0xc8, 0x42, 0xbd, 0xca, 0x34, 0x39, 0xbd, 0xec, 0x45, 0x78, 0xbc, 0xe3, + 0xe3, 0x15, 0xbd, 0xad, 0x80, 0x30, 0x3d, 0xa3, 0xc8, 0x12, 0xbd, 0x11, 0x8e, + 0x40, 0x3d, 0x9a, 0x5f, 0x29, 0xbc, 0xbe, 0xc0, 0x8e, 0xbd, 0x2e, 0x01, 0x05, + 0xba, 0xde, 0x16, 0x2d, 0x3d, 0xce, 0xc7, 0x68, 0x3d, 0x08, 0x78, 0x4b, 0x3d, + 0xb9, 0xc7, 0x8f, 0xbd, 0x99, 0x7d, 0x71, 0x3d, 0x20, 0x52, 0x85, 0x3b, 0x8e, + 0x86, 0xcc, 0xbc, 0x18, 0x1e, 0x1e, 0x3d, 0x06, 0x84, 0x35, 0x3d, 0xd8, 0x65, + 0x71, 0xbd, 0xb1, 0x95, 0x1e, 0x3d, 0xa8, 0x12, 0x4f, 0x3d, 0xf0, 0x82, 0x6b, + 0x3c, 0x82, 0x05, 0x05, 0xbd, 0x78, 0x40, 0xef, 0x3c, 0xea, 0xf1, 0x91, 0xbd, + 0x06, 0x99, 0x82, 0x3d, 0x65, 0x80, 0x81, 0xbc, 0xc7, 0xd2, 0x98, 0xbc, 0x1b, + 0xab, 0x8c, 0x3b, 0x8d, 0xe6, 0xa2, 0x3c, 0x5a, 0xb0, 0xe8, 0xbc, 0x74, 0x5c, + 0x65, 0x3c, 0x53, 0x81, 0x88, 0x3d, 0x77, 0xe4, 0x83, 0xbd, 0x05, 0x68, 0x3f, + 0xbd, 0x7f, 0xa0, 0x34, 0xbd, 0x23, 0xc6, 0x57, 0xbd, 0xe8, 0x03, 0x4c, 0xbd, + 0xef, 0x5a, 0x91, 0x3c, 0x85, 0x78, 0x46, 0xbd, 0xc3, 0x5f, 0x2e, 0xbd, 0x38, + 0x74, 0x09, 0x3d, 0x71, 0x8d, 0x2a, 0xbd, 0x7c, 0xb3, 0x40, 0x3d, 0x26, 0xf6, + 0x72, 0xbd, 0x84, 0xfa, 0x4f, 0xbd, 0x34, 0x53, 0xa7, 0x3c, 0x2c, 0x63, 0x6f, + 0x3d, 0xe4, 0xa4, 0x29, 0xbd, 0x00, 0x17, 0x21, 0xbb, 0x82, 0x9e, 0x6f, 0x3d, + 0x8a, 0x61, 0x8d, 0xbd, 0xc4, 0xd7, 0x45, 0x3d, 0x20, 0x1a, 0xce, 0x3c, 0x86, + 0x39, 0x27, 0xbd, 0xf1, 0x45, 0x1f, 0xbd, 0xe0, 0x3e, 0xd4, 0x3c, 0x8a, 0x80, + 0x70, 0xbc, 0x80, 0xae, 0xd4, 0x3c, 0x04, 0x93, 0x0a, 0x3d, 0xff, 0x3c, 0x78, + 0x3d, 0x31, 0x0e, 0x48, 0x3c, 0x20, 0xa8, 0x89, 0xbd, 0x98, 0x75, 0x07, 0xbc, + 0x68, 0xa1, 0x71, 0x3d, 0xe0, 0xe8, 0x8e, 0xbc, 0xe9, 0x29, 0x19, 0x3d, 0x79, + 0x7c, 0x4f, 0xbc, 0x90, 0x98, 0xd5, 0x3c, 0x3b, 0xec, 0x1c, 0xbd, 0x36, 0x46, + 0x84, 0xb9, 0x18, 0x09, 0x8a, 0xbc, 0x84, 0xce, 0x0d, 0xbc, 0xb8, 0x2c, 0xa8, + 0x3c, 0x20, 0x84, 0x18, 0xbc, 0xa0, 0x54, 0x72, 0xbd, 0x5f, 0xd9, 0x82, 0xbd, + 0xe7, 0x32, 0x69, 0xbc, 0x58, 0xf3, 0x30, 0xbc, 0x12, 0xff, 0x89, 0x3b, 0x38, + 0xb3, 0x50, 0x3c, 0x5c, 0xf7, 0x48, 0x3c, 0x40, 0xb3, 0xb9, 0x3c, 0x08, 0x01, + 0x2b, 0x3d, 0xcb, 0x34, 0xc0, 0xbc, 0x9c, 0x64, 0x51, 0xbd, 0x58, 0x1a, 0x2f, + 0xbd, 0x4a, 0x45, 0x8a, 0xbc, 0x6a, 0x88, 0xe3, 0x3b, 0xf2, 0xe0, 0x74, 0x3d, + 0x08, 0xa7, 0x2d, 0xbd, 0x73, 0x61, 0x17, 0xbd, 0xf0, 0xee, 0xce, 0xbc, 0xda, + 0xbc, 0x20, 0xbd, 0x57, 0x27, 0xc6, 0x3c, 0x3c, 0xfc, 0xb2, 0x3d, 0xf9, 0x52, + 0x72, 0x3d, 0x98, 0x21, 0x23, 0x3a, 0x64, 0x0e, 0x39, 0xbd, 0x3c, 0x50, 0xff, + 0xbd, 0xf0, 0xb9, 0x36, 0xbd, 0xff, 0xe2, 0xa3, 0x3d, 0x1c, 0xad, 0x24, 0xbd, + 0x17, 0x26, 0x4b, 0x3d, 0x32, 0xdb, 0xca, 0x3b, 0xc6, 0x04, 0x3c, 0x3d, 0x3c, + 0x98, 0x9c, 0x3d, 0xd7, 0xd3, 0x80, 0xbc, 0x30, 0x4e, 0xd9, 0x3c, 0xff, 0xc1, + 0x21, 0x3d, 0x66, 0xcc, 0xa5, 0xbc, 0x61, 0x87, 0x98, 0x3d, 0x98, 0x20, 0x32, + 0x3d, 0xec, 0xf1, 0x87, 0xbd, 
0x40, 0x73, 0xb9, 0xbd, 0xed, 0x67, 0x98, 0x3d, + 0x82, 0xde, 0x83, 0x3c, 0xef, 0xb3, 0xe9, 0x3c, 0xf6, 0xd1, 0x2f, 0x3d, 0xb6, + 0xa2, 0x6c, 0xbd, 0xfa, 0x55, 0x87, 0xbd, 0x5e, 0x0d, 0x4b, 0xbd, 0x52, 0x83, + 0x1b, 0x3d, 0x38, 0xa3, 0x32, 0xbd, 0x68, 0xa3, 0xd0, 0x3c, 0x6b, 0x9b, 0x0e, + 0xbd, 0xe8, 0x58, 0x83, 0x3b, 0xac, 0xf2, 0x1d, 0x3d, 0xdc, 0x01, 0xfe, 0xbb, + 0x45, 0xd1, 0x37, 0x3d, 0x7d, 0x74, 0x10, 0x3d, 0x39, 0x6f, 0x42, 0xbd, 0x1f, + 0x11, 0xd3, 0xbc, 0x58, 0x36, 0x98, 0x3d, 0xe6, 0x99, 0x19, 0xbd, 0x2e, 0x3f, + 0x44, 0x3c, 0x04, 0xd0, 0x08, 0xbd, 0x9e, 0x8c, 0x74, 0xbc, 0x73, 0x43, 0xeb, + 0xbc, 0xa2, 0x01, 0x9b, 0xbd, 0x30, 0x8a, 0x29, 0xbd, 0x4d, 0xe1, 0x50, 0xbd, + 0xc8, 0x2a, 0x1d, 0x3d, 0x2d, 0x12, 0x7d, 0x3d, 0xdd, 0x75, 0x24, 0xbc, 0xd7, + 0x2b, 0x48, 0x3c, 0x84, 0x77, 0xf0, 0x3c, 0xf8, 0x69, 0x8a, 0x3d, 0x0d, 0x62, + 0x23, 0x3d, 0x8d, 0x2a, 0x65, 0x3d, 0x33, 0xc6, 0xce, 0x3b, 0x34, 0xb9, 0x97, + 0x3b, 0xf3, 0x86, 0xe2, 0xbb, 0x5d, 0x2a, 0x53, 0xbd, 0xea, 0x2b, 0x9a, 0xba, + 0xbf, 0xd8, 0x91, 0xbc, 0x3d, 0x5f, 0xfa, 0xbc, 0x04, 0x71, 0x82, 0x3d, 0x02, + 0x09, 0xbe, 0x3d, 0xa2, 0xb3, 0xad, 0x3c, 0x6c, 0x47, 0x28, 0xbd, 0xce, 0xd6, + 0x16, 0xbd, 0x95, 0x44, 0xff, 0x3c, 0x6c, 0x62, 0x82, 0x3d, 0x2a, 0x15, 0xba, + 0xbc, 0xc1, 0xa7, 0x83, 0xbb, 0x69, 0x42, 0x7c, 0xbd, 0x03, 0x6e, 0x01, 0x3d, + 0xd9, 0x8c, 0x1b, 0xbd, 0xc7, 0x85, 0xdc, 0x3c, 0x76, 0x04, 0x4d, 0x3d, 0x99, + 0x3b, 0x69, 0x3c, 0xee, 0x8a, 0x6f, 0x3d, 0x2c, 0xb5, 0x34, 0xbd, 0x95, 0xc2, + 0x32, 0xbd, 0x34, 0x5b, 0x8a, 0x3c, 0x0d, 0x52, 0x44, 0xbb, 0xe8, 0xfd, 0xe3, + 0xbc, 0x6c, 0x8f, 0x6c, 0x3d, 0x22, 0xe9, 0xce, 0xbc, 0x38, 0x1d, 0xa4, 0x3d, + 0x37, 0xb9, 0xcc, 0xbb, 0x58, 0x8e, 0xbb, 0xbc, 0x13, 0x85, 0x8d, 0x3d, 0x7b, + 0x10, 0x9d, 0xbd, 0xb0, 0x74, 0x20, 0xbd, 0xbf, 0x6b, 0x24, 0xbc, 0x0b, 0xb2, + 0x6f, 0xbd, 0xbe, 0x9c, 0xae, 0x3d, 0x64, 0xfc, 0x34, 0x3d, 0x84, 0x44, 0x59, + 0x3b, 0xc5, 0x97, 0xb6, 0xbc, 0x25, 0x1b, 0x42, 0xbd, 0x1c, 0x64, 0x59, 0x3d, + 0x00, 0x12, 0x82, 0x3d, 0x64, 0xac, 0x91, 0x3b, 0x3b, 0xae, 0x6b, 0xbd, 0x18, + 0x6c, 0xd0, 0x3d, 0x9e, 0xea, 0x60, 0x3d, 0xf3, 0xf6, 0x49, 0xbd, 0xd3, 0xfc, + 0x5b, 0xbc, 0xe5, 0x37, 0x64, 0x3c, 0xbe, 0x33, 0x9c, 0xbc, 0x0e, 0x7a, 0x70, + 0xbd, 0xf7, 0x19, 0x32, 0xbd, 0x7a, 0x54, 0xac, 0xbd, 0x94, 0x9a, 0x45, 0xbc, + 0xb6, 0xa0, 0x55, 0x3d, 0x72, 0x8b, 0x81, 0x3d, 0xec, 0xf7, 0x1d, 0x3c, 0x7c, + 0xc0, 0x65, 0xbd, 0x21, 0x3d, 0xa8, 0x3d, 0xfe, 0x98, 0x91, 0xbc, 0xfc, 0x4e, + 0x99, 0xbd, 0xd5, 0x77, 0xa0, 0xbd, 0x9a, 0xec, 0x0b, 0x3d, 0xc2, 0xc5, 0x2e, + 0xbd, 0x58, 0x39, 0x9b, 0x3d, 0x1a, 0x19, 0x4e, 0xbd, 0x32, 0x1e, 0x11, 0xbd, + 0xe2, 0x81, 0x2f, 0xbd, 0x72, 0x93, 0x82, 0x3d, 0xb5, 0x33, 0x96, 0x3d, 0xfd, + 0x32, 0x31, 0xbd, 0xf0, 0x5e, 0x7b, 0xbd, 0x37, 0x76, 0x4d, 0xbd, 0x5e, 0xa1, + 0x9a, 0x3d, 0x58, 0xb2, 0x89, 0xbd, 0xc0, 0x61, 0x93, 0x3a, 0x12, 0xf4, 0x7a, + 0x3d, 0xad, 0xe5, 0x32, 0xba, 0xf3, 0xfe, 0x75, 0x3d, 0xbd, 0xec, 0x57, 0xbd, + 0x4d, 0x5b, 0x09, 0x3d, 0x27, 0x1d, 0x1b, 0xbd, 0x26, 0x5e, 0x77, 0xbc, 0x33, + 0xd7, 0x30, 0xbd, 0x93, 0xde, 0x6d, 0xbd, 0xfe, 0xdd, 0x6f, 0x3d, 0x07, 0x21, + 0xad, 0x3d, 0xb6, 0xfb, 0x77, 0x3d, 0xc7, 0xd4, 0x12, 0x3d, 0xee, 0xd1, 0x1a, + 0x3b, 0x57, 0x6a, 0xdf, 0xbc, 0x9a, 0x69, 0x98, 0xbd, 0x18, 0xb5, 0x8b, 0xbd, + 0x3f, 0x2a, 0x1b, 0xbc, 0xba, 0x61, 0x4e, 0x3d, 0xf7, 0xfc, 0x15, 0x3d, 0x15, + 0x6a, 0x89, 0x3d, 0x0c, 0x26, 0x12, 0xbd, 0x3c, 0x56, 0x75, 0x3d, 0x31, 0x95, + 0x49, 0x3c, 0x80, 0x89, 0x27, 0xbd, 0xc5, 0xc8, 0x2d, 0xba, 0xd4, 0xb2, 0x99, + 0x3d, 0xbd, 0xfe, 0x19, 0xbd, 0x88, 0x62, 0x88, 0x3d, 0x1a, 
0xea, 0xb6, 0x3d, + 0x06, 0xc5, 0x95, 0xbd, 0xbe, 0x0c, 0x2d, 0xbd, 0x09, 0x1b, 0x59, 0x3d, 0xf7, + 0xd4, 0xbe, 0xba, 0x23, 0x7e, 0x0d, 0xbd, 0x3f, 0x6a, 0x9f, 0x3c, 0x29, 0x6c, + 0x86, 0x3c, 0x50, 0x53, 0xad, 0xbc, 0x4d, 0x7e, 0xd5, 0xbd, 0xd2, 0xac, 0x6b, + 0x3d, 0xfd, 0xc0, 0x8d, 0xbd, 0x96, 0xc2, 0x3f, 0x3d, 0xc7, 0x50, 0x9d, 0xbc, + 0xf8, 0x74, 0xa7, 0xbc, 0x20, 0xcb, 0xbe, 0xbd, 0x39, 0xaa, 0x5d, 0x3d, 0x53, + 0x49, 0x99, 0xbc, 0xfe, 0x92, 0xca, 0xbd, 0xf2, 0x46, 0x75, 0xbd, 0x71, 0xfe, + 0x6e, 0xbd, 0x9f, 0x2f, 0x59, 0xbd, 0x0b, 0xe7, 0x3f, 0xbc, 0xad, 0x3f, 0x80, + 0x3d, 0xec, 0x4d, 0x81, 0xbd, 0x53, 0x8f, 0x8a, 0x3d, 0xfb, 0x2c, 0x54, 0x3d, + 0x20, 0x2c, 0x57, 0xbd, 0xc1, 0xeb, 0xe2, 0xba, 0x98, 0xed, 0x46, 0x3d, 0x6a, + 0x20, 0xc1, 0x3c, 0x54, 0x95, 0x2c, 0xbd, 0xac, 0xc1, 0x2b, 0x3c, 0x29, 0x2a, + 0xf8, 0xbd, 0x4e, 0x69, 0x7f, 0x3d, 0x17, 0x04, 0x29, 0xbd, 0xf2, 0xbb, 0xeb, + 0xbb, 0xf1, 0x49, 0x40, 0x3d, 0x00, 0x69, 0x01, 0x3d, 0x8d, 0x53, 0x64, 0x3d, + 0xb7, 0x21, 0x0b, 0xbd, 0x43, 0xc5, 0xc7, 0xbd, 0x1b, 0xa3, 0x48, 0x3d, 0xcb, + 0x7c, 0x09, 0xbd, 0x20, 0xcb, 0x6e, 0xbb, 0x94, 0x3f, 0x2e, 0x3d, 0xf7, 0x32, + 0x72, 0xbd, 0x9a, 0x1e, 0x40, 0xbd, 0x5b, 0xf3, 0x47, 0x3d, 0x02, 0xea, 0x77, + 0xba, 0x63, 0xf3, 0xe8, 0x3c, 0xac, 0x35, 0x06, 0xbd, 0xbd, 0x03, 0x4c, 0xbd, + 0x11, 0xf6, 0x92, 0x3d, 0x1b, 0x1a, 0x64, 0x3d, 0x51, 0x88, 0x58, 0xbc, 0x61, + 0xbf, 0x83, 0xbd, 0xdd, 0x44, 0x73, 0xbd, 0xe7, 0xe5, 0xd0, 0x3c, 0xc9, 0x5f, + 0x87, 0x3d, 0xec, 0x20, 0xbe, 0x3d, 0xd9, 0x21, 0x0f, 0x3d, 0xf9, 0xdd, 0xe7, + 0xbc, 0xf3, 0x32, 0x91, 0xbd, 0x71, 0xb6, 0x4a, 0x3d, 0x29, 0x35, 0x86, 0x3d, + 0xba, 0xf4, 0x40, 0xbd, 0x1c, 0x2b, 0x17, 0xbd, 0x70, 0xfb, 0x3c, 0xbd, 0xed, + 0x3e, 0xdf, 0xbc, 0x60, 0xf1, 0x3d, 0x3d, 0x53, 0x6e, 0x87, 0xbd, 0x0f, 0x52, + 0x3d, 0x3d, 0x58, 0xd1, 0x47, 0xbd, 0xab, 0x7f, 0xc3, 0x3c, 0x3d, 0x5d, 0xa8, + 0xbd, 0xe9, 0x7f, 0x11, 0xbd, 0x88, 0x93, 0x50, 0xbd, 0xf2, 0xd2, 0x0f, 0x3d, + 0x24, 0x59, 0x90, 0x3a, 0x99, 0x86, 0x8b, 0xbd, 0x27, 0x21, 0x5f, 0xbd, 0xf4, + 0xa1, 0x80, 0x3d, 0x0b, 0xbb, 0x89, 0x3c, 0xbc, 0xda, 0x79, 0x3d, 0xe8, 0x9b, + 0x56, 0xbc, 0x42, 0xca, 0xf1, 0x3c, 0x74, 0xe2, 0x86, 0x3c, 0xe4, 0x85, 0x0f, + 0x3d, 0x07, 0x57, 0x2e, 0x3d, 0x41, 0x24, 0x85, 0x3d, 0x48, 0x7e, 0x08, 0xbd, + 0x91, 0xa8, 0xdd, 0x3c, 0x8c, 0xe1, 0xb7, 0xbc, 0x04, 0xae, 0x2f, 0x3d, 0xe4, + 0x63, 0xa2, 0x3c, 0x6e, 0x28, 0x06, 0xbc, 0x8d, 0xd9, 0x67, 0xbd, 0x88, 0x14, + 0x43, 0x3d, 0xe5, 0x9a, 0xde, 0x3c, 0x45, 0x3e, 0x9d, 0x3d, 0x03, 0x22, 0xcb, + 0xbc, 0x71, 0x92, 0x7c, 0x3d, 0xf7, 0xc6, 0x0d, 0x3d, 0xfb, 0x47, 0xa4, 0x3d, + 0x45, 0x18, 0x91, 0xbd, 0xda, 0x0b, 0x79, 0xbc, 0x18, 0x17, 0x71, 0xbd, 0xa2, + 0x74, 0x4e, 0xbd, 0xd7, 0xdb, 0x46, 0x3d, 0x35, 0x53, 0xbb, 0x3c, 0x0c, 0x62, + 0x0f, 0xbc, 0xe9, 0x2d, 0xdf, 0xbd, 0x33, 0xc7, 0x60, 0x3c, 0x18, 0x74, 0xa8, + 0x3c, 0xa3, 0x75, 0x87, 0xbd, 0x7b, 0x58, 0xf3, 0xbd, 0x30, 0xcd, 0xfa, 0x3c, + 0x35, 0xbd, 0x9c, 0xbd, 0x93, 0xcf, 0xdb, 0xbc, 0xc2, 0x35, 0xd9, 0xbc, 0x5e, + 0x5a, 0x06, 0x3d, 0x3d, 0x8b, 0x39, 0xbd, 0xb7, 0x5d, 0x33, 0xbc, 0x50, 0xca, + 0xb8, 0x3c, 0x8b, 0x71, 0xfb, 0x3c, 0x80, 0x8e, 0x2a, 0x3d, 0xa0, 0x72, 0x80, + 0xbc, 0x08, 0x4a, 0x00, 0xbd, 0x9b, 0x6f, 0xd2, 0x3b, 0xda, 0x83, 0xf9, 0xbc, + 0xed, 0x0c, 0x0b, 0x3c, 0x5d, 0x80, 0x40, 0xbc, 0x84, 0x40, 0x25, 0xbd, 0x52, + 0x1e, 0x03, 0x3d, 0x53, 0xd4, 0x54, 0x3c, 0x0b, 0x6b, 0xda, 0x3c, 0xcc, 0x67, + 0x17, 0x3b, 0x58, 0x05, 0xe5, 0xba, 0x63, 0x8d, 0x95, 0x3c, 0xc6, 0xa5, 0x5a, + 0x3d, 0xdf, 0x29, 0x23, 0xbd, 0x4b, 0x72, 0x9b, 0x3d, 0xef, 0x78, 0x4b, 0xbd, + 0xa5, 0x08, 
0xb7, 0xbd, 0x9c, 0xb5, 0x78, 0xbc, 0xdf, 0x0c, 0x88, 0x3d, 0x07, + 0xab, 0x19, 0x3d, 0xdc, 0xad, 0xc9, 0xbd, 0x5e, 0x37, 0x4f, 0x3d, 0xe6, 0x99, + 0x77, 0xbd, 0x12, 0x5f, 0x48, 0xbc, 0x89, 0x82, 0xf2, 0x3b, 0x86, 0x89, 0x44, + 0x3c, 0x66, 0x1b, 0xb7, 0xbc, 0x2f, 0x07, 0xd0, 0x3b, 0xb5, 0x85, 0x76, 0xb9, + 0xb2, 0xc4, 0x11, 0xbd, 0x5b, 0x02, 0x30, 0xbd, 0xed, 0xed, 0xee, 0x3c, 0x77, + 0xbd, 0x24, 0xbb, 0x36, 0xe9, 0x97, 0xbd, 0x2a, 0xe1, 0x6d, 0x3d, 0x75, 0x29, + 0xaf, 0x3d, 0xff, 0x38, 0xac, 0xbb, 0x76, 0x6d, 0xe4, 0xbc, 0xf8, 0x03, 0x15, + 0xbd, 0x6f, 0x3d, 0x9a, 0xbc, 0x6b, 0x64, 0x1f, 0x3d, 0xa6, 0x7c, 0x6f, 0xbd, + 0xa7, 0x60, 0x83, 0x3c, 0xe1, 0xa5, 0x53, 0xbd, 0x04, 0x4f, 0xb6, 0xbc, 0xe7, + 0x0b, 0x28, 0x3d, 0x4c, 0x15, 0xa9, 0xbc, 0x68, 0x90, 0x73, 0xbb, 0x77, 0x3e, + 0x8e, 0x3c, 0xdd, 0x42, 0x0c, 0xbd, 0x07, 0x7d, 0x22, 0xbd, 0x35, 0x15, 0x82, + 0xbd, 0xed, 0x56, 0xe0, 0x3c, 0xfa, 0x8d, 0x7e, 0x3d, 0xab, 0xb5, 0x85, 0xbd, + 0x8c, 0x4b, 0xa4, 0xbc, 0xe5, 0xee, 0x53, 0xbc, 0x9e, 0x26, 0x4f, 0xbd, 0xaa, + 0xdf, 0x63, 0xbd, 0xd2, 0x48, 0x11, 0x3c, 0xd6, 0x9c, 0x58, 0x3d, 0xa9, 0x90, + 0x00, 0x3d, 0x9b, 0xfa, 0x8c, 0x3b, 0x2a, 0x97, 0x1d, 0x3d, 0x37, 0xe9, 0x3e, + 0xbd, 0x51, 0xd8, 0xf0, 0xbd, 0x92, 0x65, 0x2b, 0xbd, 0x06, 0x73, 0x21, 0x3c, + 0x85, 0x89, 0xad, 0x3d, 0x50, 0x07, 0x60, 0x3d, 0x01, 0x61, 0x9a, 0x3d, 0xcf, + 0xba, 0x9c, 0x3d, 0x7c, 0x6f, 0x69, 0x3d, 0x20, 0x79, 0x71, 0xbd, 0xc8, 0x59, + 0xd1, 0xbc, 0x2f, 0x68, 0x1e, 0xbd, 0xb2, 0xed, 0x87, 0xbd, 0x3e, 0xe7, 0xa0, + 0xba, 0xb1, 0xf0, 0xd0, 0x3c, 0x1c, 0xf1, 0xdd, 0xbc, 0xb0, 0x4a, 0x83, 0xbb, + 0xb5, 0x00, 0x55, 0xbc, 0xc6, 0x63, 0x0b, 0x3d, 0xa8, 0x88, 0x2f, 0x3d, 0x3c, + 0x6e, 0xd7, 0x3c, 0x68, 0x1d, 0x14, 0xbc, 0xac, 0xd1, 0x37, 0x3d, 0x7f, 0xb7, + 0x66, 0x3d, 0xca, 0xd0, 0xc7, 0xbb, 0x72, 0x5a, 0x91, 0x3d, 0x64, 0x09, 0xaf, + 0x3c, 0xea, 0x7a, 0x0d, 0xbb, 0x87, 0xd8, 0x4f, 0xbb, 0x88, 0xdf, 0xa5, 0x3c, + 0x1a, 0xd5, 0x73, 0xbc, 0x55, 0x5b, 0xce, 0x3a, 0xff, 0x62, 0x16, 0x3d, 0xb9, + 0x06, 0xa8, 0xbd, 0xbc, 0x96, 0xc0, 0xbc, 0x77, 0x06, 0x17, 0xbc, 0xe9, 0xdf, + 0x7e, 0xba, 0x94, 0x5f, 0xcd, 0x3b, 0x7b, 0x66, 0xf2, 0xbc, 0xc3, 0xdf, 0x7d, + 0xbd, 0x9c, 0x07, 0x0e, 0xbd, 0xaa, 0x4e, 0x0a, 0xbd, 0x42, 0x2d, 0x7f, 0x3c, + 0x6f, 0x45, 0xb9, 0x3c, 0x6a, 0xf4, 0x2c, 0xbd, 0x66, 0x01, 0x23, 0xbd, 0x5a, + 0x2e, 0x12, 0xbc, 0x00, 0x0c, 0xc4, 0xbd, 0x56, 0xf3, 0xd9, 0xbc, 0x57, 0x20, + 0x14, 0xbd, 0x8f, 0xae, 0xbd, 0x3c, 0x0a, 0x85, 0xbb, 0xbd, 0x51, 0x63, 0x28, + 0xbd, 0xc3, 0x45, 0x19, 0xbd, 0x1a, 0xc0, 0x66, 0x3d, 0x58, 0xac, 0x77, 0xbd, + 0x2e, 0xb6, 0xdc, 0xbc, 0xaa, 0x45, 0xe6, 0xbc, 0x06, 0xba, 0x43, 0xbd, 0x71, + 0x36, 0xac, 0x3d, 0xf5, 0xcb, 0x96, 0x3d, 0x5b, 0x32, 0x58, 0xba, 0x6a, 0xe8, + 0xe0, 0xb9, 0x39, 0xb6, 0xbe, 0x3c, 0x56, 0xcc, 0xc5, 0x3b, 0x6b, 0xde, 0xad, + 0xbc, 0x6c, 0xd9, 0xf4, 0xbc, 0xb2, 0xe9, 0x43, 0x3d, 0xf9, 0xd2, 0x1b, 0xbc, + 0xb1, 0x0f, 0x19, 0x3d, 0xb3, 0xe0, 0x05, 0x3b, 0xdd, 0x85, 0xa8, 0x3d, 0x92, + 0x70, 0xc0, 0xbc, 0xaf, 0xa0, 0x22, 0xbd, 0x9f, 0x05, 0x33, 0xbd, 0x4a, 0xe4, + 0xa8, 0x3c, 0x80, 0xf3, 0xc9, 0xba, 0x9f, 0x4c, 0x31, 0xbd, 0x5e, 0x75, 0xa4, + 0xbc, 0x4e, 0xa3, 0x73, 0xbd, 0x32, 0x14, 0x96, 0xbd, 0xf1, 0xc8, 0xb1, 0x3c, + 0xa6, 0x72, 0x15, 0xbd, 0x06, 0xbc, 0x4c, 0x3d, 0xd6, 0x84, 0x96, 0x3b, 0xbd, + 0x95, 0x27, 0x3d, 0x89, 0x66, 0xd8, 0x3c, 0x14, 0xc8, 0xf8, 0xbc, 0x48, 0xc6, + 0x2a, 0x3d, 0x68, 0x7c, 0xa4, 0x3d, 0x0b, 0xfe, 0x48, 0x3d, 0x03, 0x4e, 0xa0, + 0x3c, 0x14, 0xeb, 0x9e, 0x3d, 0x54, 0x79, 0x17, 0xbd, 0x8d, 0xe5, 0x44, 0x3c, + 0x89, 0xb2, 0x14, 0xbc, 0x37, 0x64, 0x98, 
0x3d, 0xd5, 0x7d, 0x54, 0xbd, 0x82, + 0x97, 0x92, 0xbd, 0x97, 0x4c, 0x7c, 0x3b, 0xf8, 0x3f, 0x2b, 0x3d, 0xa2, 0x52, + 0xc8, 0x3c, 0x67, 0x7b, 0x49, 0xbd, 0x8b, 0xdc, 0x84, 0xbc, 0xfc, 0xd2, 0x1c, + 0xbd, 0x50, 0x53, 0x8d, 0xbb, 0xa7, 0x93, 0xfe, 0xbc, 0xab, 0xb3, 0xff, 0xbc, + 0xb0, 0x0d, 0x12, 0x3c, 0x90, 0xde, 0x69, 0x3d, 0x19, 0x4a, 0x31, 0x3d, 0xba, + 0x86, 0xbe, 0xbd, 0xf0, 0xd1, 0x6f, 0xbd, 0x2a, 0x37, 0xa2, 0x3c, 0xba, 0x72, + 0x91, 0xbc, 0x69, 0xfe, 0x8f, 0xbb, 0xb4, 0xe0, 0x26, 0x3d, 0x9e, 0x8e, 0x6f, + 0x3d, 0x28, 0x1c, 0xa4, 0xbc, 0xeb, 0x11, 0x0b, 0x3d, 0xd3, 0x1a, 0x27, 0x3c, + 0x89, 0x93, 0xa3, 0x3d, 0x22, 0xbf, 0x46, 0x3d, 0xe2, 0x27, 0xe5, 0xbc, 0xa1, + 0x10, 0x8a, 0xbc, 0xe9, 0x93, 0x65, 0xbd, 0xef, 0x81, 0xce, 0x3c, 0x0c, 0x10, + 0x44, 0x3c, 0xdc, 0x0d, 0x15, 0xbd, 0x8d, 0x3b, 0x09, 0x3d, 0xc2, 0xe2, 0x35, + 0xbd, 0xc3, 0xde, 0x09, 0x3c, 0x68, 0xc5, 0x8f, 0x3d, 0xa2, 0xb3, 0x38, 0x3d, + 0x94, 0xa6, 0x66, 0x3c, 0x5f, 0x15, 0x79, 0x3d, 0x74, 0x80, 0x7e, 0x3d, 0x00, + 0xb6, 0xb0, 0xbb, 0xdb, 0xb6, 0x98, 0xbb, 0x8c, 0x1a, 0xb7, 0xbc, 0xa0, 0xf9, + 0x7e, 0x3c, 0x66, 0x95, 0x47, 0x3d, 0xca, 0x33, 0xf0, 0xbc, 0xde, 0x00, 0xfa, + 0x3b, 0x57, 0x05, 0xfb, 0xbb, 0xfc, 0x7f, 0xcb, 0xbc, 0x31, 0x1c, 0x11, 0x3d, + 0x16, 0xe4, 0xfd, 0x3b, 0x3d, 0xd5, 0xb5, 0x3c, 0x8c, 0xd4, 0x69, 0xbd, 0x40, + 0x7f, 0x87, 0xbb, 0x26, 0x9d, 0x77, 0xbc, 0x6b, 0xa7, 0xde, 0x3c, 0xf4, 0xd2, + 0x00, 0x3c, 0xff, 0x0d, 0xbc, 0x3c, 0xab, 0xfb, 0x6f, 0x3d, 0x5a, 0x15, 0x8b, + 0x3b, 0x05, 0x27, 0x77, 0x3d, 0xd8, 0xa8, 0x54, 0x3d, 0xa7, 0xf2, 0x01, 0x3d, + 0x20, 0x41, 0x70, 0x3c, 0x19, 0x99, 0xfd, 0xbc, 0xc0, 0xea, 0x48, 0x3d, 0xd7, + 0x09, 0x26, 0x3b, 0x79, 0x58, 0x6b, 0x3d, 0x2b, 0x43, 0x2e, 0xbd, 0x58, 0x06, + 0x76, 0x3c, 0xc3, 0x4a, 0x8c, 0x3d, 0x4b, 0x5b, 0x62, 0x3d, 0xb2, 0xff, 0x1f, + 0xbd, 0xeb, 0x73, 0x08, 0x3d, 0x39, 0xd4, 0x77, 0xbd, 0xfc, 0x94, 0x83, 0xbc, + 0x0e, 0x0d, 0x6c, 0x3d, 0x5c, 0x29, 0x73, 0x3d, 0x96, 0xc4, 0x92, 0xba, 0x00, + 0x64, 0x97, 0xbd, 0x3b, 0x52, 0x3a, 0xbd, 0x3a, 0x2d, 0x91, 0xbd, 0x62, 0x65, + 0x97, 0xbd, 0x72, 0xde, 0xd2, 0xbd, 0x1d, 0x30, 0x00, 0xbd, 0x74, 0x93, 0x95, + 0xbd, 0xae, 0x2c, 0xd7, 0xbc, 0xe3, 0xae, 0x27, 0x3d, 0x67, 0x7f, 0x0b, 0x3c, + 0xfc, 0xcf, 0x74, 0xbc, 0x7f, 0x2b, 0x74, 0x3d, 0x00, 0x49, 0xa2, 0xba, 0x13, + 0xfa, 0x0e, 0xbd, 0x7e, 0xfe, 0x9f, 0xbc, 0xa6, 0x05, 0xc7, 0xbb, 0xc2, 0xa7, + 0x2a, 0xbc, 0xb3, 0x63, 0x9b, 0x3a, 0x9c, 0x14, 0x0e, 0x3d, 0x82, 0xc6, 0xb0, + 0xbc, 0xc1, 0x25, 0xc0, 0x3c, 0x03, 0x95, 0x45, 0xbd, 0x61, 0xb6, 0x50, 0xbd, + 0xf8, 0x77, 0xea, 0x3a, 0x9d, 0xa7, 0xaa, 0x3a, 0xf2, 0x18, 0x1d, 0xbd, 0x42, + 0x15, 0x94, 0x3d, 0x7e, 0x0e, 0x47, 0xbd, 0xa5, 0x82, 0x84, 0x3d, 0xed, 0xbe, + 0x3b, 0x3d, 0x3b, 0xdc, 0x2e, 0xbd, 0x5c, 0x8c, 0x4b, 0xbd, 0x37, 0xbc, 0x99, + 0xbb, 0xb7, 0x55, 0x54, 0x3d, 0x8e, 0x6d, 0xa8, 0xbd, 0x09, 0x3c, 0x3f, 0x3d, + 0x83, 0x0e, 0x3a, 0xbd, 0x8f, 0x1f, 0x91, 0x3d, 0x8b, 0x2b, 0x33, 0xbd, 0x92, + 0x57, 0x58, 0x3d, 0x71, 0xcd, 0x27, 0xbd, 0xcf, 0x53, 0x30, 0x3d, 0x20, 0x81, + 0x64, 0x3d, 0x50, 0x82, 0x60, 0xbd, 0x98, 0x46, 0x2f, 0x3d, 0x32, 0x95, 0x28, + 0xbd, 0x70, 0xf5, 0x71, 0x3c, 0x9d, 0x96, 0xb0, 0xbc, 0x5b, 0x59, 0x56, 0xbd, + 0x10, 0x59, 0x90, 0x3d, 0xc0, 0x1e, 0xbb, 0x3c, 0x5c, 0x37, 0x9d, 0x3d, 0xbd, + 0x75, 0x61, 0x3d, 0xcf, 0x8b, 0x84, 0xbc, 0xb2, 0x23, 0x46, 0x3d, 0x0a, 0x82, + 0x02, 0x3d, 0xaf, 0xd4, 0x8e, 0xbb, 0x60, 0x87, 0xca, 0x3c, 0xdb, 0x73, 0x1a, + 0xbd, 0x52, 0xa2, 0x09, 0x3d, 0xa2, 0x5b, 0x4a, 0xbd, 0x1d, 0x5d, 0xa0, 0xbb, + 0x30, 0x20, 0x7e, 0xbd, 0x84, 0x2a, 0x78, 0xbd, 0x74, 0x5f, 0x6a, 0xbd, 
0xa5, + 0x1a, 0xa5, 0xbd, 0xa8, 0x46, 0x92, 0x3c, 0xe5, 0x7e, 0x50, 0xbd, 0xc1, 0x19, + 0x4b, 0x3c, 0x1a, 0x20, 0x71, 0x3d, 0xa1, 0xa7, 0x48, 0xbc, 0xc3, 0xa7, 0xeb, + 0x3c, 0xd4, 0x58, 0x6c, 0xbd, 0x06, 0x40, 0x08, 0x3d, 0x07, 0x97, 0x93, 0x3d, + 0x36, 0xb8, 0x5c, 0xbd, 0x69, 0x31, 0xc4, 0x3d, 0x5d, 0x20, 0x62, 0xbc, 0x73, + 0x3a, 0xbf, 0xbc, 0xea, 0xff, 0x3f, 0x3d, 0x39, 0x07, 0xec, 0x3c, 0xeb, 0x30, + 0xb4, 0xbb, 0x0b, 0x38, 0x72, 0xbd, 0x12, 0x71, 0xfd, 0xbc, 0xc5, 0x09, 0x82, + 0x3b, 0x5d, 0x51, 0x84, 0xbd, 0xff, 0x16, 0x49, 0xbd, 0x5e, 0xd1, 0x13, 0xbd, + 0xd8, 0xaf, 0x96, 0x3c, 0xea, 0x7c, 0x7e, 0xbd, 0x9b, 0x71, 0x1c, 0x3d, 0xe0, + 0xff, 0xaf, 0xbc, 0xac, 0x24, 0x57, 0x3d, 0x8a, 0xf8, 0x49, 0x3d, 0x24, 0xfd, + 0xbc, 0xbc, 0x46, 0x2c, 0xac, 0xbd, 0xc8, 0xdf, 0x63, 0xbc, 0x61, 0xc6, 0x2e, + 0xbd, 0x9d, 0xec, 0xd9, 0xbc, 0xb1, 0x44, 0x86, 0xbd, 0x85, 0x38, 0x47, 0x3d, + 0x7b, 0x49, 0x5a, 0xbd, 0xb0, 0x9c, 0xee, 0xbc, 0x03, 0x6f, 0x33, 0xbd, 0x55, + 0x8c, 0x23, 0xbc, 0xd5, 0xcc, 0x82, 0xbc, 0x82, 0xc2, 0xcc, 0xbc, 0xac, 0x00, + 0x85, 0x3c, 0xf6, 0xf5, 0x70, 0x3d, 0xb0, 0x0f, 0x03, 0x37, 0xa3, 0xfd, 0x5a, + 0xbd, 0x13, 0x57, 0x38, 0x3c, 0x25, 0xe4, 0xea, 0xbc, 0x1a, 0xb8, 0x0e, 0x3c, + 0x80, 0x95, 0x20, 0xbb, 0x84, 0x35, 0x36, 0x3d, 0x27, 0x0c, 0x1f, 0xbd, 0x4e, + 0x46, 0x8d, 0x3d, 0xa4, 0xb0, 0xef, 0x3c, 0xe1, 0xf5, 0xce, 0xbc, 0x34, 0x54, + 0x9d, 0xbc, 0x9f, 0x03, 0xd9, 0x3b, 0x22, 0xe9, 0xed, 0xbc, 0xd3, 0x7d, 0x30, + 0xbd, 0xb8, 0x86, 0x1f, 0xbc, 0xed, 0xc3, 0x44, 0x3d, 0xbf, 0x32, 0xa1, 0x39, + 0x74, 0xe5, 0x38, 0xbd, 0xa3, 0xe4, 0x6c, 0xbd, 0x56, 0x19, 0x33, 0xbd, 0x17, + 0x60, 0xbd, 0xbc, 0xd5, 0xec, 0x4a, 0x3c, 0xa2, 0x27, 0xa4, 0x3d, 0x50, 0xea, + 0x77, 0xbd, 0x5a, 0xb3, 0x91, 0x39, 0xf3, 0xc2, 0x19, 0x3d, 0xd2, 0xb9, 0x4f, + 0xbd, 0x60, 0x90, 0x81, 0x3d, 0xbf, 0x14, 0x60, 0xbd, 0x7a, 0xdd, 0x62, 0x3c, + 0x43, 0x4c, 0xa5, 0xbb, 0xad, 0x1c, 0xe1, 0xbc, 0xc8, 0x0b, 0x15, 0x3d, 0xe1, + 0xbd, 0x0f, 0x3d, 0xc6, 0x1f, 0x92, 0x3d, 0xdf, 0x9a, 0x86, 0xbd, 0x08, 0x1a, + 0xed, 0x3c, 0xfa, 0x1f, 0x00, 0x3c, 0x90, 0x94, 0x1b, 0x3d, 0x4a, 0x1c, 0x25, + 0xbd, 0x79, 0xe4, 0xff, 0xbc, 0xdf, 0xeb, 0x91, 0x3d, 0x43, 0x22, 0x81, 0x3d, + 0x1f, 0x1c, 0xa2, 0xbd, 0x54, 0xaf, 0x48, 0xbd, 0xbb, 0x7d, 0x4a, 0x3c, 0x32, + 0xcd, 0x6a, 0x3d, 0xc0, 0x75, 0x8b, 0x3d, 0x9a, 0xad, 0x67, 0x3c, 0xd1, 0xe6, + 0x30, 0xbd, 0x85, 0x2b, 0x33, 0x3c, 0xee, 0x90, 0x69, 0x3b, 0x7b, 0xdc, 0x96, + 0xbd, 0x38, 0x29, 0xad, 0x3b, 0xd8, 0x2b, 0xff, 0xbb, 0x72, 0x62, 0x57, 0x3c, + 0x55, 0x29, 0x86, 0x3d, 0xc7, 0x7c, 0x90, 0xbd, 0xfa, 0xa6, 0x71, 0xbd, 0x7f, + 0x51, 0x15, 0x3c, 0x7a, 0x11, 0x61, 0xbd, 0xd8, 0xd1, 0x64, 0x3b, 0xbc, 0x7e, + 0x8e, 0x3c, 0x06, 0x60, 0xe6, 0x3b, 0x1a, 0xd8, 0x43, 0x3d, 0x9b, 0xa8, 0x99, + 0xbd, 0x30, 0x98, 0x17, 0x3d, 0x82, 0xd8, 0x7a, 0xbd, 0xca, 0x23, 0x14, 0x3d, + 0x45, 0x6d, 0x18, 0xbd, 0x0d, 0x33, 0x8d, 0x3c, 0xd9, 0x88, 0xb5, 0xbc, 0x9c, + 0x01, 0xc6, 0x3b, 0xc2, 0x52, 0xe5, 0x3c, 0xc6, 0xbf, 0x5a, 0x3d, 0xa8, 0x06, + 0x1f, 0xbd, 0x1f, 0xaf, 0x4e, 0x3d, 0x84, 0x35, 0xca, 0xbd, 0x50, 0xc8, 0xee, + 0x3c, 0x64, 0xe8, 0x35, 0xbd, 0xbc, 0x23, 0x31, 0x3d, 0x36, 0x1d, 0xbf, 0xbd, + 0x7c, 0x88, 0x94, 0xbc, 0x0f, 0x8f, 0x1b, 0x3d, 0x08, 0x54, 0x81, 0x3c, 0x12, + 0x2f, 0x8a, 0xbd, 0xd7, 0x70, 0x3c, 0xbc, 0xb8, 0x2a, 0x50, 0x3d, 0xc8, 0xed, + 0x0e, 0xbd, 0xb7, 0xa3, 0x54, 0x3d, 0xc9, 0x64, 0x6c, 0xbc, 0x89, 0x83, 0x25, + 0xbd, 0xef, 0x72, 0x3b, 0x3b, 0xeb, 0xf8, 0xec, 0x3b, 0xe6, 0x5e, 0x0b, 0xbc, + 0xd4, 0xc0, 0xf5, 0xbc, 0x8a, 0x04, 0x92, 0x3d, 0xe8, 0x04, 0x39, 0xbd, 0x0f, + 0x74, 0xea, 0x3c, 0xfc, 
0x8b, 0x01, 0xbc, 0xb2, 0xe0, 0x73, 0x3d, 0xc8, 0xa1, + 0xea, 0x3c, 0x99, 0xfe, 0x4f, 0x3d, 0xde, 0x4f, 0x36, 0xbd, 0x73, 0xe5, 0x76, + 0xbd, 0x8b, 0xd2, 0xdb, 0x3b, 0x96, 0x72, 0x79, 0x3c, 0xd0, 0x9b, 0x14, 0x3d, + 0x3d, 0x6f, 0x6a, 0x3d, 0x21, 0x55, 0x16, 0x3d, 0xeb, 0x2a, 0x91, 0x3d, 0x8c, + 0xd0, 0x33, 0xbd, 0x45, 0xdd, 0x54, 0xbd, 0x7e, 0x94, 0x90, 0xbc, 0xd4, 0x4c, + 0x8b, 0x3c, 0x4a, 0x6b, 0x19, 0x3d, 0x9e, 0x42, 0xeb, 0x3c, 0x7d, 0xf2, 0x4f, + 0x3d, 0x17, 0x4f, 0xab, 0x3c, 0x28, 0x37, 0xa1, 0x3c, 0x6d, 0xb8, 0x88, 0xbd, + 0xc1, 0xe3, 0x1e, 0xbd, 0x8f, 0x8c, 0x60, 0x3d, 0xe9, 0x88, 0x93, 0x3c, 0x54, + 0x12, 0x8e, 0x3d, 0x04, 0x68, 0xcb, 0xbc, 0x6e, 0xbf, 0xb0, 0xb9, 0xba, 0x8b, + 0x16, 0x3d, 0x3a, 0x30, 0xd5, 0x39, 0x89, 0x43, 0x89, 0x3c, 0x89, 0x8c, 0xc0, + 0x3b, 0x93, 0x98, 0xd9, 0xbd, 0xc5, 0x26, 0x3e, 0xbd, 0x2a, 0x4f, 0xa9, 0xbb, + 0x35, 0xa6, 0xe6, 0xbc, 0xeb, 0x89, 0x1f, 0x3d, 0xea, 0x85, 0xb7, 0xbc, 0xa7, + 0x52, 0xbb, 0xbc, 0x02, 0xda, 0x86, 0x3d, 0x82, 0xad, 0xfd, 0xba, 0x01, 0x20, + 0x2f, 0xbd, 0xb8, 0x8c, 0x9d, 0xbd, 0x9c, 0xbd, 0x1b, 0x3d, 0x1d, 0xad, 0xe6, + 0x3c, 0xac, 0x48, 0x6b, 0x3c, 0xdd, 0x13, 0xcb, 0xbd, 0xee, 0xcd, 0x8a, 0xbd, + 0x8b, 0x33, 0x7c, 0x3d, 0xc5, 0x0a, 0x2a, 0x3d, 0x13, 0x49, 0x77, 0x3d, 0x7e, + 0x78, 0xd1, 0xbd, 0xd3, 0x18, 0x3c, 0x3c, 0xb7, 0xaa, 0xb1, 0xbc, 0x54, 0x3a, + 0xce, 0xbc, 0x86, 0x08, 0x97, 0xbd, 0x04, 0x21, 0x01, 0xbc, 0x72, 0xa8, 0x65, + 0x3d, 0x71, 0x0b, 0xf3, 0x3b, 0x14, 0x9e, 0x88, 0x3c, 0x9c, 0xc6, 0x90, 0x3d, + 0x1d, 0xdb, 0x37, 0xbd, 0x8e, 0x9e, 0x59, 0x3c, 0xf6, 0xa9, 0x1a, 0xbd, 0xfd, + 0xec, 0x19, 0x3d, 0xa3, 0x01, 0x5a, 0xbd, 0xcc, 0xe7, 0x15, 0xbd, 0x26, 0xe6, + 0x51, 0x3d, 0xeb, 0x5f, 0x8d, 0x3d, 0x93, 0x7a, 0x73, 0x3c, 0x94, 0x02, 0x10, + 0x3d, 0x5d, 0x7e, 0xa7, 0x3c, 0x52, 0x78, 0x12, 0xbd, 0xe2, 0xfb, 0x44, 0x3d, + 0xb8, 0xdf, 0xa4, 0x3c, 0x84, 0x3d, 0x0e, 0xbd, 0xad, 0xae, 0x0e, 0x3c, 0x52, + 0xda, 0x1e, 0x3d, 0xfe, 0x93, 0x92, 0xbd, 0xe8, 0xe3, 0xde, 0xbd, 0x7a, 0xdc, + 0xd9, 0xbc, 0xc3, 0xb0, 0x68, 0x3d, 0x58, 0x56, 0x25, 0xbd, 0x3a, 0x61, 0xdc, + 0xbc, 0x71, 0xa2, 0xbc, 0x3c, 0x1b, 0xab, 0x30, 0x3d, 0x2a, 0x68, 0xbd, 0xbb, + 0x5e, 0xaf, 0x8b, 0xbd, 0xb4, 0x4d, 0x30, 0x3d, 0xa0, 0x46, 0x72, 0x3d, 0x4e, + 0xd2, 0x10, 0x3d, 0x71, 0x47, 0x4e, 0xbd, 0xe5, 0xd4, 0xe6, 0xbc, 0x25, 0x05, + 0x87, 0x3c, 0x33, 0x85, 0xec, 0x3c, 0x84, 0x58, 0x5f, 0xbd, 0xb0, 0xfa, 0xc0, + 0xbd, 0xc0, 0xdb, 0x87, 0xba, 0xa0, 0x30, 0x13, 0x3d, 0x84, 0x01, 0xe2, 0xbc, + 0xee, 0x8d, 0xa1, 0x3c, 0xc8, 0x8c, 0x24, 0x3c, 0x2b, 0x33, 0xf0, 0x3c, 0xc5, + 0xdd, 0x55, 0x3c, 0x89, 0x7c, 0xa5, 0xbc, 0x3b, 0x39, 0x19, 0xbd, 0xed, 0x0d, + 0x74, 0x3d, 0x98, 0xdf, 0x24, 0xbc, 0xdd, 0xdc, 0x38, 0xbd, 0xab, 0x9f, 0x75, + 0x3b, 0xd7, 0x20, 0xf3, 0x3c, 0x96, 0xa3, 0x78, 0x3c, 0x58, 0x44, 0x90, 0xbd, + 0x21, 0xcb, 0xf2, 0x3b, 0x18, 0x22, 0x58, 0xbd, 0x7c, 0x1c, 0x1b, 0xbd, 0xdc, + 0x4d, 0x19, 0xbd, 0xff, 0x68, 0x35, 0xbb, 0x34, 0xc5, 0x5e, 0x3c, 0x48, 0x3a, + 0x90, 0xbd, 0xa1, 0x84, 0xa7, 0x3c, 0x96, 0xc6, 0x46, 0xbd, 0x20, 0x22, 0xb3, + 0xbc, 0x16, 0x95, 0x18, 0x3d, 0x84, 0xa2, 0x5e, 0x3d, 0x78, 0x3a, 0x29, 0xbd, + 0x37, 0x9a, 0x5a, 0xbd, 0x93, 0x8b, 0x80, 0x3d, 0x25, 0xff, 0x49, 0xbd, 0xf0, + 0x1e, 0x8c, 0xbb, 0xde, 0xa1, 0x48, 0x3d, 0x58, 0x67, 0x2d, 0x3d, 0x09, 0x18, + 0x26, 0x3d, 0x37, 0x68, 0x85, 0x3d, 0xa0, 0x28, 0x70, 0x3d, 0x33, 0xf5, 0x9f, + 0xbc, 0x81, 0xcc, 0x97, 0xbd, 0x75, 0x24, 0x45, 0xbd, 0x60, 0x45, 0x29, 0x3d, + 0x6b, 0x87, 0x25, 0xbd, 0x67, 0xd9, 0xb5, 0xbc, 0x15, 0xcb, 0x01, 0xbd, 0x39, + 0xa5, 0xc6, 0xbd, 0xd2, 0xbe, 0xb9, 0xbd, 0x7c, 0x53, 
0x20, 0xbd, 0x1a, 0x64, + 0xb4, 0xbd, 0x5a, 0xc1, 0x1d, 0x3d, 0xdf, 0xdd, 0x50, 0xbc, 0x8e, 0x86, 0x2b, + 0x3d, 0x20, 0xeb, 0x4d, 0x3d, 0x9a, 0xf8, 0x88, 0x3d, 0x92, 0xf1, 0x5e, 0xbd, + 0x24, 0xb3, 0xd8, 0xbb, 0x19, 0xbc, 0xd9, 0xbc, 0x8d, 0x97, 0x8f, 0xbd, 0x6d, + 0xf5, 0x7b, 0x3c, 0xfe, 0x33, 0x66, 0xbc, 0x35, 0x64, 0xfa, 0x3b, 0xe6, 0x00, + 0x9d, 0xbc, 0xd6, 0x9c, 0x63, 0xbd, 0x02, 0xff, 0x8e, 0xbd, 0x10, 0xa1, 0x23, + 0xbd, 0x93, 0x33, 0x0f, 0xbd, 0x59, 0xfc, 0x1b, 0x3d, 0x43, 0x0c, 0x7f, 0x3d, + 0x06, 0xbd, 0x96, 0x3d, 0xe1, 0x5b, 0x9f, 0xbc, 0x44, 0x05, 0xf8, 0x3c, 0x1c, + 0x60, 0xec, 0xbd, 0x33, 0x7f, 0x8c, 0xbd, 0x93, 0xcb, 0x0c, 0xbc, 0xc0, 0x8d, + 0x0e, 0xbb, 0x16, 0x45, 0x65, 0xbd, 0x76, 0x93, 0x88, 0xbd, 0x49, 0xd0, 0xb3, + 0xbd, 0xeb, 0x0e, 0x56, 0xbd, 0x8f, 0x1a, 0xab, 0x3d, 0x30, 0xde, 0x72, 0xb8, + 0xcf, 0xc7, 0x1d, 0xbd, 0x12, 0xc3, 0x31, 0xbd, 0x6e, 0x1d, 0x47, 0xbd, 0xb3, + 0x0f, 0x8c, 0x3d, 0x31, 0x82, 0x80, 0x3d, 0x44, 0xc4, 0x6b, 0xbc, 0x07, 0x28, + 0x5a, 0x3d, 0xa3, 0x3c, 0x3d, 0xbd, 0x13, 0x5c, 0x6a, 0x3d, 0x1c, 0x3f, 0x11, + 0x3d, 0x50, 0xac, 0xb5, 0xbc, 0x9f, 0x0e, 0xd9, 0x3c, 0x55, 0xfb, 0xde, 0xbc, + 0x6b, 0x4f, 0x6a, 0xbd, 0x38, 0x5f, 0x3f, 0x3b, 0x5a, 0x26, 0x98, 0xbc, 0x32, + 0x8c, 0x36, 0x3d, 0x78, 0x0a, 0x73, 0x3c, 0x7f, 0xd4, 0x51, 0x3d, 0x69, 0xdb, + 0x97, 0x3d, 0x52, 0x37, 0x80, 0x3d, 0x9b, 0x10, 0x88, 0xbd, 0xc0, 0xbf, 0x90, + 0xbd, 0x43, 0x84, 0x44, 0x3d, 0x12, 0x73, 0xc8, 0xbc, 0x84, 0xe0, 0x42, 0x3d, + 0xf5, 0x79, 0xd2, 0xbc, 0x88, 0x3b, 0x05, 0x3d, 0xf6, 0x10, 0xf3, 0x3b, 0x73, + 0x77, 0x8d, 0x3d, 0x92, 0xf0, 0x77, 0x3d, 0xd4, 0xcd, 0x55, 0xbd, 0x44, 0x7c, + 0x88, 0xbd, 0x3b, 0xe3, 0x5f, 0xbd, 0x0c, 0x35, 0x87, 0x3c, 0x09, 0x68, 0xf0, + 0x3c, 0x60, 0x3e, 0x47, 0x3a, 0xf6, 0x12, 0xb2, 0xbd, 0x2b, 0xe9, 0x9d, 0x3d, + 0x8e, 0x7c, 0x97, 0xbc, 0xb1, 0x05, 0x2e, 0xbc, 0x99, 0x6b, 0x14, 0xbd, 0xb2, + 0xa1, 0x85, 0x3d, 0x1c, 0xd1, 0x31, 0x3d, 0x18, 0xe6, 0xf5, 0x3c, 0xa7, 0x25, + 0x5a, 0x3c, 0xe0, 0x75, 0x9e, 0xbd, 0x1b, 0xe1, 0x69, 0xbd, 0x1b, 0x22, 0xc0, + 0x3d, 0xc4, 0x04, 0x8e, 0x3d, 0x92, 0x7f, 0x9d, 0x3d, 0xd3, 0xf3, 0x80, 0xbb, + 0x69, 0x7a, 0x58, 0x3c, 0xd5, 0xc2, 0x92, 0xbc, 0x26, 0x08, 0xa2, 0xbd, 0x9f, + 0xe8, 0x45, 0x3d, 0x10, 0xc9, 0x44, 0x3d, 0x7e, 0xac, 0x61, 0x3d, 0x88, 0xa8, + 0xf1, 0x3c, 0xa2, 0xd1, 0x87, 0xbd, 0x8c, 0xa7, 0xd1, 0xbc, 0x77, 0x21, 0x86, + 0xbd, 0x3b, 0x5a, 0xaa, 0x3d, 0x27, 0x8b, 0xb7, 0x3d, 0xe2, 0x8c, 0x39, 0x3d, + 0x16, 0x70, 0xc0, 0xbc, 0x45, 0xcc, 0x81, 0xbd, 0xfd, 0x54, 0x09, 0x3d, 0x7f, + 0x19, 0x0d, 0x3c, 0x0a, 0xfe, 0x39, 0xbd, 0xaf, 0x91, 0x66, 0xbd, 0x1c, 0xf9, + 0xa3, 0x3d, 0x6d, 0xfa, 0xa7, 0x3b, 0x55, 0x1d, 0xa2, 0x3d, 0xd4, 0x1c, 0x8a, + 0x3d, 0x21, 0xeb, 0xbd, 0xbc, 0xd7, 0x77, 0x45, 0xbc, 0x2b, 0xb9, 0x37, 0xbd, + 0x7b, 0x7c, 0xbd, 0xbd, 0x59, 0xa0, 0x92, 0xbd, 0xb9, 0x28, 0x2f, 0xbd, 0x1c, + 0xb6, 0x8c, 0xbc, 0x48, 0x52, 0x58, 0xbd, 0x90, 0x67, 0xa3, 0x3b, 0x92, 0xff, + 0x79, 0x3d, 0x55, 0x80, 0x9d, 0x3c, 0x68, 0x54, 0x98, 0xbd, 0xc6, 0xff, 0xbc, + 0xbc, 0x76, 0xb5, 0x72, 0xbd, 0x00, 0x62, 0x86, 0xbd, 0x6b, 0x01, 0xe3, 0xbc, + 0x42, 0x03, 0x6e, 0xbd, 0xd6, 0xe1, 0x7d, 0xbd, 0xcd, 0xed, 0x8b, 0x3c, 0x67, + 0x9d, 0x49, 0x3d, 0x6a, 0xe8, 0x31, 0x3d, 0xfd, 0x25, 0x4c, 0x3d, 0x87, 0x12, + 0xe8, 0xbb, 0x31, 0x54, 0x92, 0xbc, 0xbe, 0xab, 0x98, 0xbb, 0x85, 0x6c, 0xf7, + 0x3b, 0xb8, 0x0e, 0xbc, 0xbc, 0xf8, 0xea, 0x9a, 0x3d, 0x36, 0x13, 0xe2, 0xbc, + 0x9f, 0xd7, 0x6d, 0x3d, 0x4f, 0x0a, 0xb1, 0x3d, 0xba, 0x5c, 0x6b, 0xbd, 0xae, + 0x73, 0x60, 0xbc, 0x61, 0xf2, 0x8b, 0x3c, 0x90, 0x4c, 0x7b, 0xbd, 0x50, 0xef, + 0xe9, 
0xbd, 0x54, 0x83, 0x99, 0xbc, 0x8f, 0xd5, 0x4d, 0x3d, 0x6b, 0x02, 0x37, + 0x3d, 0xc8, 0xe7, 0x84, 0x3d, 0x4e, 0x73, 0x87, 0x3d, 0x7a, 0xcc, 0xaa, 0x3c, + 0x0e, 0xde, 0x26, 0xbd, 0xef, 0xfb, 0xc8, 0xbd, 0x96, 0xe9, 0x11, 0xbd, 0xd2, + 0xd6, 0x26, 0xbc, 0x01, 0xea, 0x72, 0xbd, 0xf4, 0xb7, 0xad, 0xbb, 0x5b, 0xe7, + 0x9e, 0x3d, 0xe6, 0xa1, 0x06, 0xbe, 0x4d, 0xa9, 0xd4, 0x3c, 0x83, 0xc9, 0xdf, + 0x3c, 0x31, 0x26, 0x85, 0x3c, 0x4d, 0x25, 0xcf, 0xbb, 0x6c, 0xea, 0x91, 0x3d, + 0xb3, 0x55, 0x5d, 0x3c, 0x7f, 0x1d, 0x70, 0xbd, 0x0d, 0x6f, 0x85, 0x3d, 0xbe, + 0xe6, 0x35, 0xbd, 0x0f, 0x5b, 0x02, 0xbc, 0x1e, 0xad, 0x60, 0xbd, 0xeb, 0x48, + 0x4c, 0x3d, 0x73, 0x67, 0xaf, 0x3c, 0xda, 0x33, 0x03, 0x3d, 0xd9, 0xa3, 0x0d, + 0xbb, 0x6e, 0x31, 0x11, 0x3d, 0xb3, 0x7e, 0xfc, 0x3c, 0xc4, 0x86, 0x49, 0x3c, + 0x0a, 0x52, 0x0b, 0x3d, 0x68, 0x25, 0xae, 0x3d, 0xe0, 0x16, 0x02, 0x3d, 0xc0, + 0x47, 0x3f, 0xbd, 0x98, 0x55, 0x70, 0x3c, 0x1a, 0xbb, 0x38, 0x3d, 0xcf, 0x31, + 0xe4, 0xbc, 0xe0, 0x45, 0x39, 0xbd, 0x7c, 0xa1, 0x3f, 0xbd, 0xcc, 0x5b, 0x91, + 0xbd, 0x55, 0x28, 0x59, 0x3a, 0x75, 0xdc, 0x02, 0xbd, 0xd8, 0x0d, 0xfe, 0xbb, + 0x38, 0x7f, 0x92, 0xbd, 0x0f, 0xeb, 0x83, 0xbc, 0xcf, 0xe7, 0x0c, 0xbd, 0xb5, + 0xf8, 0x59, 0x3d, 0xfc, 0xd4, 0xcf, 0xbb, 0xa3, 0x75, 0x8a, 0x3d, 0xac, 0xe9, + 0x8e, 0xbd, 0x4a, 0xf9, 0x71, 0x3d, 0xee, 0x83, 0x32, 0xbc, 0x7c, 0x78, 0xa0, + 0xbd, 0x87, 0x86, 0x6a, 0xbd, 0x1a, 0x3c, 0xe4, 0xbc, 0x89, 0x4a, 0xa1, 0x3d, + 0xa0, 0x39, 0xdd, 0x3c, 0x93, 0xa3, 0x93, 0x3c, 0xdd, 0x08, 0xa2, 0x3d, 0x9a, + 0x87, 0x98, 0xbd, 0xe6, 0x5a, 0x32, 0xbd, 0xeb, 0x4d, 0xea, 0xbb, 0x48, 0xda, + 0x6b, 0x3c, 0x36, 0x23, 0x82, 0x3d, 0x80, 0x78, 0x90, 0x3d, 0x0e, 0x4c, 0x1b, + 0xbd, 0xb9, 0x3c, 0x54, 0x3d, 0x5f, 0x8b, 0xf5, 0xbb, 0x54, 0x40, 0x54, 0xbd, + 0x35, 0x04, 0x8e, 0xbc, 0x38, 0xcf, 0xe0, 0x3b, 0x2f, 0xf6, 0x55, 0xbd, 0xe0, + 0xed, 0x7e, 0x3c, 0x84, 0x12, 0x9c, 0x3d, 0x74, 0x34, 0xfb, 0xbc, 0x02, 0xd9, + 0x93, 0xbd, 0xff, 0x27, 0xa8, 0xbd, 0x83, 0xf3, 0xaf, 0xbb, 0x99, 0x16, 0x7d, + 0x3d, 0xc6, 0xd9, 0x32, 0xbd, 0xb1, 0xa4, 0xbd, 0xbc, 0xd2, 0x1c, 0x5b, 0x3d, + 0xb3, 0xdb, 0x31, 0x3d, 0xe4, 0x10, 0x03, 0x3c, 0x29, 0xb0, 0x0b, 0xbd, 0x16, + 0x47, 0x9b, 0x3d, 0x75, 0x6b, 0xfd, 0xbc, 0x09, 0x92, 0xac, 0x3c, 0x12, 0x2c, + 0x07, 0x3d, 0x5a, 0xb3, 0xa0, 0x3c, 0xc9, 0x3d, 0x21, 0xbd, 0xc1, 0x80, 0x6d, + 0xbd, 0xa9, 0x20, 0x9c, 0x3d, 0xf5, 0x5b, 0x07, 0xbe, 0x9a, 0x76, 0x6f, 0xbd, + 0xd5, 0x11, 0xff, 0x3d, 0x58, 0xda, 0xd4, 0x3c, 0x18, 0x2f, 0xb9, 0x3d, 0xd4, + 0xa0, 0x6c, 0xbd, 0x4d, 0xe5, 0x2b, 0xbc, 0x97, 0x9d, 0x5f, 0xbc, 0x55, 0xe6, + 0x9b, 0xbd, 0x61, 0xee, 0xb3, 0x3c, 0x24, 0x06, 0xbf, 0x3c, 0xc2, 0x90, 0x09, + 0xbd, 0x91, 0xaf, 0x63, 0x3d, 0xde, 0x86, 0x7b, 0x3c, 0xca, 0x42, 0x0d, 0x3c, + 0x5f, 0xda, 0xcd, 0xbc, 0x7b, 0x27, 0x13, 0x3d, 0xf9, 0xd1, 0x14, 0x3c, 0xb6, + 0x83, 0x4a, 0x3d, 0x37, 0x74, 0x63, 0xbd, 0xbb, 0x85, 0x40, 0xbd, 0x3e, 0x15, + 0x13, 0x3d, 0x00, 0xe1, 0x22, 0xbd, 0xef, 0xdd, 0x63, 0xbd, 0x95, 0xdb, 0xa6, + 0x3c, 0xf4, 0xc1, 0x86, 0xbd, 0xfd, 0xf0, 0xe5, 0x3c, 0x84, 0xc1, 0x69, 0xbd, + 0xe4, 0x85, 0xf5, 0x3c, 0x18, 0xfa, 0x79, 0xbd, 0xe3, 0xd5, 0x2e, 0xbd, 0x32, + 0x90, 0x8f, 0xbc, 0x40, 0xfa, 0x08, 0xbc, 0xa4, 0x5f, 0xcb, 0xbc, 0x5a, 0xa7, + 0x3f, 0x3d, 0x09, 0x40, 0x23, 0x3d, 0x7b, 0x17, 0x0e, 0xbd, 0x6e, 0x70, 0xb9, + 0x3b, 0xc7, 0x3d, 0x4d, 0xbd, 0xe9, 0x57, 0x5d, 0x3d, 0x5c, 0x02, 0x91, 0x3c, + 0xc8, 0x08, 0x31, 0xbd, 0x09, 0xea, 0xe3, 0x3c, 0x14, 0x23, 0xf6, 0x3c, 0x95, + 0xd1, 0x22, 0xbd, 0xba, 0x27, 0xce, 0x3c, 0xb2, 0x59, 0x42, 0xbd, 0x29, 0x50, + 0x6d, 0x3d, 0x20, 0xe5, 0x10, 0xbd, 
0xc2, 0x68, 0x5a, 0xbd, 0x04, 0x6e, 0x81, + 0xbd, 0xd6, 0xc7, 0xa4, 0xbc, 0x16, 0x22, 0x33, 0x3d, 0x80, 0xbf, 0x70, 0x3c, + 0xbf, 0x62, 0x02, 0xbd, 0xdd, 0x19, 0x28, 0xbd, 0x8d, 0x5c, 0x60, 0x3d, 0x96, + 0xb4, 0x24, 0xbd, 0x9a, 0xb5, 0x6e, 0xbd, 0x52, 0xb5, 0x81, 0x3d, 0xf3, 0x49, + 0x85, 0xbd, 0x4a, 0x65, 0xcc, 0x3c, 0x06, 0xca, 0x13, 0xbd, 0x18, 0x94, 0x07, + 0x3d, 0xde, 0x60, 0x45, 0x3c, 0x7a, 0x2d, 0x69, 0x3d, 0x7e, 0xc6, 0xba, 0xbc, + 0xff, 0xcf, 0x64, 0x3d, 0x3e, 0x22, 0x98, 0xbd, 0xe1, 0x87, 0xc8, 0x3c, 0xec, + 0x54, 0x90, 0xbd, 0x60, 0x0b, 0x09, 0x3d, 0x5e, 0xc7, 0x95, 0x3c, 0x54, 0x1c, + 0x5b, 0x3b, 0xac, 0x77, 0xfe, 0x3c, 0x4c, 0x43, 0xea, 0xbc, 0xe4, 0x4d, 0xb3, + 0x3c, 0xab, 0x96, 0x20, 0xbd, 0xf7, 0x8a, 0x48, 0xbd, 0xcc, 0xcb, 0x70, 0x3d, + 0x25, 0x01, 0x91, 0xbc, 0x9c, 0x9a, 0x96, 0x3c, 0x9c, 0x7d, 0x56, 0x3d, 0x3e, + 0x2b, 0x47, 0xbd, 0x44, 0x48, 0x15, 0xbd, 0x38, 0x4e, 0xc1, 0x3c, 0x9e, 0x72, + 0x05, 0x3d, 0xe9, 0xbd, 0x44, 0xbc, 0x96, 0xdd, 0x6f, 0x3d, 0x17, 0x2b, 0x4e, + 0x3c, 0x21, 0x91, 0x4c, 0x3d, 0x2f, 0x87, 0x8e, 0xbd, 0xf2, 0xd2, 0x31, 0x3d, + 0x47, 0x07, 0xad, 0xbc, 0x41, 0x54, 0x89, 0x3c, 0xee, 0xa9, 0x4d, 0x3d, 0xf2, + 0xb1, 0x80, 0x3d, 0x6a, 0xd9, 0x78, 0xbd, 0x55, 0x4a, 0x32, 0xbd, 0xd1, 0xd8, + 0x44, 0x3d, 0xda, 0x72, 0x7d, 0x3d, 0xa1, 0xd1, 0xbc, 0x3b, 0x7a, 0xf4, 0x32, + 0xbd, 0xf0, 0x44, 0x84, 0x3d, 0xd3, 0x0b, 0x8c, 0x3d, 0xd9, 0xc8, 0x58, 0xbd, + 0xdd, 0x2c, 0x7c, 0x3d, 0x49, 0x3e, 0x8f, 0x3d, 0x39, 0xbd, 0x95, 0xbd, 0x99, + 0x46, 0x25, 0x3d, 0x63, 0xfe, 0x20, 0xbd, 0x0a, 0x1d, 0x62, 0xbc, 0x4b, 0xae, + 0x3b, 0xbc, 0x3c, 0x28, 0x84, 0xbc, 0x79, 0x24, 0x25, 0xbd, 0x62, 0x6b, 0x56, + 0xbd, 0xe9, 0x9a, 0x88, 0x3d, 0xd6, 0x9f, 0x85, 0xbc, 0xad, 0xf6, 0x51, 0xbd, + 0xc2, 0x72, 0x85, 0x3d, 0xf6, 0x0d, 0x89, 0xbd, 0x3e, 0x76, 0xca, 0x39, 0x90, + 0x96, 0x89, 0x3d, 0xa1, 0x6e, 0x25, 0xbd, 0x4b, 0xbd, 0x18, 0x3c, 0x0e, 0x05, + 0x69, 0xbc, 0x03, 0x9e, 0x76, 0x3d, 0xa3, 0xae, 0x67, 0x3d, 0xc4, 0x38, 0x5a, + 0x3d, 0x8c, 0x9d, 0x53, 0xbd, 0x35, 0x24, 0x42, 0xbd, 0x36, 0xfa, 0xcf, 0x3c, + 0xe8, 0x09, 0x0f, 0xbd, 0xe9, 0x6e, 0x15, 0xbd, 0x51, 0x03, 0x1b, 0xbd, 0xf7, + 0x1d, 0x32, 0x3d, 0x08, 0xfc, 0x2f, 0xbd, 0x9d, 0x4c, 0x65, 0x3d, 0x9d, 0xf0, + 0x98, 0xbb, 0xb0, 0xba, 0x0d, 0xbc, 0x64, 0xee, 0x03, 0xbb, 0x92, 0x82, 0x16, + 0xbc, 0xa5, 0xa0, 0x94, 0xbd, 0xd0, 0x1f, 0xf1, 0x3c, 0xeb, 0x06, 0x8c, 0xbb, + 0xb5, 0xc2, 0x64, 0x3c, 0x7e, 0x30, 0x55, 0x3c, 0x68, 0x89, 0x64, 0x3c, 0xec, + 0x1e, 0x9e, 0x3c, 0xf0, 0xc9, 0x57, 0x3d, 0xfe, 0x25, 0x0c, 0xbd, 0x2f, 0xb4, + 0x0b, 0x3c, 0x32, 0x76, 0x7a, 0xbd, 0xd2, 0x15, 0xea, 0xba, 0xc0, 0xc9, 0x45, + 0xbd, 0xb7, 0xda, 0x48, 0xbc, 0x5e, 0x85, 0x6c, 0x3c, 0xbc, 0xda, 0x84, 0xbc, + 0xc6, 0x56, 0x35, 0xbd, 0x21, 0xfd, 0x7d, 0x3d, 0xbf, 0x0c, 0x0f, 0x3b, 0xc2, + 0x28, 0xa4, 0xbc, 0xad, 0xa3, 0xe7, 0xbb, 0x77, 0xd9, 0x55, 0x3d, 0x6d, 0x5a, + 0x21, 0xbc, 0x3f, 0xa0, 0xd9, 0xbc, 0x1b, 0x86, 0x85, 0x3d, 0x38, 0x2f, 0x1f, + 0xbd, 0xd5, 0xa5, 0x43, 0x3d, 0xdb, 0x04, 0x8d, 0xbd, 0xbc, 0x0d, 0x25, 0x3d, + 0xf5, 0x71, 0x86, 0x3d, 0xa8, 0x4e, 0x88, 0xbd, 0xca, 0xab, 0x24, 0x3c, 0x8d, + 0x03, 0xda, 0x3c, 0xad, 0x77, 0x19, 0xbc, 0x2e, 0x7c, 0xf5, 0x3c, 0x75, 0x45, + 0x6e, 0x3d, 0x9b, 0x9f, 0x80, 0xbd, 0x1d, 0xce, 0x85, 0x3d, 0xb6, 0xbe, 0x86, + 0xbc, 0xc0, 0x1c, 0x55, 0xbb, 0xd0, 0xc7, 0x5c, 0xbd, 0x1f, 0x60, 0x64, 0x3c, + 0x4f, 0x04, 0x60, 0xbd, 0x04, 0xc9, 0x64, 0x3d, 0x0a, 0xbb, 0x10, 0x3b, 0x08, + 0x41, 0x92, 0xbd, 0xac, 0x5b, 0x15, 0xbd, 0x44, 0xe8, 0x27, 0x3b, 0x9c, 0x98, + 0x0c, 0x3d, 0x09, 0x52, 0x7a, 0x3d, 0x33, 0xe4, 0xcd, 0xbc, 0xda, 
0x48, 0x17, + 0xbd, 0x26, 0xe5, 0x5d, 0xbb, 0x2f, 0xfc, 0x69, 0xbd, 0x9f, 0xfd, 0x54, 0x3d, + 0x1d, 0x45, 0x07, 0xbd, 0x86, 0x69, 0x91, 0x3c, 0x9e, 0x1a, 0xbe, 0xbc, 0xfa, + 0xf4, 0x5e, 0x3d, 0xb5, 0x9d, 0x00, 0xbd, 0xe0, 0xfd, 0x90, 0x3c, 0x3a, 0xac, + 0xc9, 0xbc, 0x11, 0xa7, 0xb0, 0xbb, 0x3e, 0x18, 0xa8, 0x3c, 0x79, 0x2e, 0x55, + 0xbd, 0xe0, 0xb2, 0xfd, 0xbb, 0x72, 0xb0, 0x5d, 0xbc, 0xe1, 0xd9, 0x6f, 0x3d, + 0xd5, 0x3a, 0x9f, 0xbc, 0xc8, 0x8f, 0x1a, 0xbd, 0x18, 0x60, 0x3b, 0x3c, 0xc0, + 0x90, 0x24, 0xbc, 0x78, 0xb6, 0x50, 0x3d, 0x84, 0xc6, 0x81, 0xbd, 0x98, 0x2d, + 0x46, 0x3d, 0x7f, 0x8a, 0x3b, 0x3d, 0x03, 0xd9, 0x7f, 0x3d, 0x50, 0x04, 0xae, + 0x3c, 0xaf, 0xae, 0x6b, 0xbd, 0xcd, 0x34, 0x48, 0xbd, 0xbd, 0x05, 0xa8, 0x3c, + 0x84, 0xc8, 0x3f, 0xbd, 0xcb, 0x46, 0x89, 0x3d, 0x92, 0x2b, 0x16, 0x3d, 0x98, + 0xfb, 0xcd, 0xbc, 0x80, 0x5b, 0x43, 0xbd, 0xac, 0x5e, 0x78, 0x3c, 0xd6, 0xbf, + 0x7e, 0x3b, 0x32, 0xec, 0x81, 0x3b, 0xce, 0xab, 0xf1, 0x3b, 0xb2, 0xd7, 0x86, + 0xbc, 0xb1, 0xe3, 0x09, 0x3d, 0x4f, 0xc6, 0xa5, 0xbc, 0x4c, 0x1b, 0x89, 0x3c, + 0xd6, 0x09, 0x2b, 0x3d, 0x61, 0x67, 0x4a, 0xbc, 0x7a, 0x5e, 0x87, 0xbc, 0x6c, + 0x32, 0x55, 0x3c, 0x6b, 0xe0, 0xa7, 0xba, 0x41, 0xc8, 0xb5, 0xbc, 0x94, 0x54, + 0x64, 0xbc, 0x81, 0xb6, 0x33, 0x3d, 0x3a, 0x05, 0x59, 0x3d, 0x42, 0x25, 0x46, + 0xbd, 0xfc, 0xda, 0x8c, 0xbd, 0x17, 0x64, 0x87, 0x3d, 0x55, 0x39, 0x61, 0x3d, + 0x4f, 0xcf, 0x25, 0xbd, 0xfc, 0x4d, 0x26, 0x3c, 0x7c, 0x18, 0xd8, 0x3c, 0x4f, + 0x1b, 0x5c, 0x3d, 0x3a, 0x09, 0xcd, 0x3c, 0x27, 0x4a, 0x00, 0x3d, 0x1c, 0xb7, + 0xb7, 0xbc, 0x0a, 0x1b, 0x38, 0xbc, 0x88, 0x6d, 0x2f, 0x3d, 0x96, 0xdf, 0x6a, + 0xbd, 0x7e, 0x7e, 0xa0, 0xb9, 0x10, 0x23, 0x10, 0xbc, 0xec, 0x6b, 0xbf, 0x3c, + 0x1a, 0x8e, 0x7a, 0xbc, 0x68, 0xb1, 0x7c, 0x3d, 0xb0, 0xcc, 0x30, 0xbd, 0xec, + 0x59, 0xef, 0x3c, 0x8d, 0xd5, 0x41, 0x3b, 0x82, 0xa1, 0xec, 0xbc, 0x29, 0x35, + 0x51, 0xbd, 0x6e, 0x6e, 0x91, 0xbc, 0xf9, 0x6d, 0x2a, 0x3d, 0x5d, 0x97, 0x17, + 0x3d, 0xcb, 0xad, 0x29, 0x3c, 0xc4, 0x47, 0x41, 0x3d, 0x40, 0x7c, 0x6a, 0xbc, + 0xa6, 0x09, 0x1e, 0x3d, 0x14, 0x9c, 0xf2, 0xbc, 0x70, 0x31, 0x5d, 0x3c, 0xd1, + 0x54, 0x70, 0xbc, 0xd8, 0x58, 0xdd, 0x3a, 0x65, 0x21, 0x6a, 0xbd, 0x64, 0x81, + 0x99, 0xbd, 0x51, 0x5a, 0x64, 0x3c, 0x8c, 0xa6, 0x90, 0x3c, 0xe6, 0xb6, 0x2a, + 0xbd, 0x3d, 0x2a, 0x15, 0xbd, 0x82, 0xbe, 0x8d, 0xbc, 0x65, 0x32, 0x68, 0xbd, + 0x0a, 0x5d, 0x6d, 0xbc, 0x24, 0x8c, 0xd6, 0xbc, 0x70, 0x4d, 0xe7, 0x3c, 0x06, + 0x58, 0x01, 0x3c, 0x22, 0xd2, 0x58, 0x3d, 0x62, 0x60, 0x88, 0x3c, 0xfc, 0xe6, + 0x12, 0x3d, 0x31, 0x59, 0xdb, 0x3c, 0x5d, 0xfb, 0x96, 0xbc, 0xb6, 0x50, 0x7f, + 0x3b, 0xd7, 0x01, 0x37, 0x3d, 0x6a, 0x71, 0xc4, 0xbc, 0x8d, 0x28, 0xc9, 0x3c, + 0x33, 0x39, 0x4f, 0xbb, 0x14, 0x14, 0x1b, 0x3d, 0x32, 0x36, 0x62, 0xbd, 0xa7, + 0xf1, 0x89, 0x3d, 0xc4, 0x12, 0x13, 0x3d, 0xf3, 0x79, 0xde, 0x3c, 0xc0, 0x39, + 0xb3, 0xbb, 0x36, 0xb5, 0x54, 0xbd, 0x04, 0xf2, 0xcc, 0xbc, 0x45, 0x14, 0xf8, + 0x3a, 0x4b, 0x1d, 0x55, 0xbd, 0x13, 0x35, 0xc6, 0xbc, 0x7a, 0x92, 0x1b, 0xbd, + 0x71, 0xb0, 0x3b, 0xbd, 0xfe, 0x84, 0x2f, 0xbd, 0xd4, 0x64, 0x60, 0x3d, 0xa7, + 0x0b, 0xb7, 0xbb, 0xd1, 0xc7, 0x8a, 0xbd, 0x21, 0x20, 0x78, 0x3d, 0x1b, 0x25, + 0x77, 0x3d, 0x5e, 0x06, 0x20, 0xbd, 0x7d, 0xfa, 0xe0, 0xbc, 0x5b, 0x2b, 0x38, + 0x3d, 0x8c, 0x10, 0x90, 0xbd, 0xbe, 0xc0, 0xb2, 0x3c, 0x5a, 0x88, 0x94, 0xbd, + 0x80, 0x87, 0x94, 0x3c, 0x73, 0xed, 0x81, 0xbd, 0x73, 0x42, 0x3f, 0xba, 0xdc, + 0xf8, 0x4e, 0x3d, 0x9a, 0xd4, 0x8d, 0xbc, 0x3a, 0x6f, 0x72, 0xbc, 0x37, 0xe8, + 0x06, 0x3d, 0xbb, 0x35, 0x61, 0x3d, 0x64, 0xc6, 0x4a, 0x3d, 0xee, 0x94, 0x13, + 0xb9, 0xc0, 0x4b, 
0xaf, 0xba, 0x60, 0x4b, 0x42, 0x3d, 0x40, 0x88, 0xb1, 0x3c, + 0xc6, 0x61, 0x6c, 0x3d, 0x92, 0xd0, 0x40, 0x3d, 0x32, 0xc0, 0x8d, 0xbd, 0x90, + 0x66, 0xc2, 0xbc, 0x52, 0x1f, 0x14, 0xbd, 0x03, 0x9d, 0x23, 0x3d, 0x81, 0x60, + 0xe1, 0x3c, 0xe3, 0x31, 0x5f, 0x3d, 0x38, 0xbc, 0x52, 0x3d, 0x23, 0x3e, 0x3b, + 0xbd, 0xf6, 0x53, 0x8e, 0xbd, 0xc9, 0xb1, 0x88, 0xbd, 0x02, 0x0c, 0xc6, 0xbc, + 0x2e, 0x6d, 0x26, 0xbd, 0xe2, 0x88, 0x87, 0xbd, 0x45, 0x45, 0x28, 0x3d, 0xbc, + 0x73, 0xd7, 0xba, 0x17, 0x1e, 0x15, 0xbc, 0xa6, 0x0c, 0x9c, 0xbc, 0x5a, 0x74, + 0x63, 0x3d, 0x05, 0x28, 0xf6, 0x3c, 0xe5, 0xda, 0x4d, 0xbd, 0x02, 0x69, 0x42, + 0xbd, 0x8a, 0xb0, 0x2c, 0x3d, 0x27, 0x22, 0x07, 0x3d, 0x6a, 0x7a, 0x08, 0x3b, + 0x88, 0xb6, 0x03, 0x3d, 0x80, 0xad, 0xac, 0xbb, 0xc9, 0x67, 0x6d, 0xbb, 0x80, + 0xf0, 0x8d, 0xbd, 0x53, 0x78, 0x85, 0x3d, 0x14, 0x99, 0x24, 0xbb, 0x86, 0x7c, + 0x0c, 0x3d, 0xbe, 0xff, 0x79, 0x3d, 0x01, 0x39, 0xb4, 0x3c, 0x19, 0x42, 0x52, + 0x3c, 0x4d, 0x8b, 0x73, 0x3d, 0xb4, 0x6b, 0xf1, 0x3a, 0x6e, 0x53, 0xb4, 0xbc, + 0x09, 0x88, 0x11, 0xbd, 0xdf, 0x5e, 0x86, 0xbd, 0x10, 0xdc, 0x5a, 0xbd, 0x6b, + 0xb3, 0x3a, 0xbd, 0x7e, 0x23, 0x84, 0xbd, 0x95, 0x50, 0x8c, 0xbd, 0xd1, 0x50, + 0x93, 0x3c, 0x5f, 0x43, 0x67, 0x3a, 0x92, 0xc2, 0x91, 0xbd, 0xbe, 0xb0, 0x4e, + 0xbd, 0x8c, 0xeb, 0x36, 0xbd, 0x4e, 0x0e, 0x82, 0xbd, 0xc5, 0x15, 0x0b, 0xbd, + 0x1c, 0x66, 0x5a, 0xbd, 0xf6, 0xe4, 0x19, 0x3b, 0x4d, 0x1c, 0x07, 0x3d, 0x70, + 0x1f, 0x24, 0x3d, 0x59, 0x80, 0x3b, 0xbd, 0x8e, 0x9e, 0xae, 0xbb, 0x11, 0x6f, + 0x8f, 0x3b, 0x5f, 0xc9, 0x74, 0xbd, 0x36, 0x65, 0x2b, 0x3c, 0x43, 0xb4, 0xcf, + 0x3c, 0x7f, 0xbf, 0x18, 0x3d, 0x91, 0x58, 0x16, 0xbd, 0x72, 0xc4, 0xf3, 0xbc, + 0x80, 0xd3, 0x8a, 0x3b, 0x95, 0x0e, 0xe7, 0x3c, 0xdd, 0x17, 0x1d, 0x3d, 0x55, + 0x74, 0x98, 0xbd, 0x5c, 0x6b, 0x1e, 0xbc, 0x02, 0x65, 0x61, 0xba, 0x01, 0x7f, + 0x81, 0xbc, 0x97, 0x95, 0x73, 0xbd, 0xd8, 0x60, 0xfd, 0xbc, 0xd4, 0x64, 0x8a, + 0x3a, 0xe5, 0x81, 0x24, 0x3c, 0xfd, 0x2b, 0x14, 0x3d, 0x60, 0x49, 0xff, 0x3b, + 0x6f, 0x63, 0x33, 0xbd, 0xe0, 0x83, 0x4b, 0xbd, 0xed, 0x7a, 0x10, 0x3d, 0x5b, + 0x26, 0x33, 0x3d, 0x03, 0xff, 0x2d, 0x3d, 0xcd, 0xca, 0x42, 0xbd, 0x4c, 0x09, + 0x3f, 0x3d, 0xcb, 0xcb, 0x95, 0xbc, 0xff, 0x04, 0x18, 0x3c, 0x99, 0x48, 0x6c, + 0xbd, 0xb6, 0x3f, 0x04, 0x3a, 0x68, 0x3d, 0x67, 0x3c, 0x71, 0xd9, 0x7a, 0xbc, + 0x88, 0x7d, 0x02, 0x3c, 0x0f, 0xfa, 0x3b, 0xbd, 0x78, 0x64, 0xfc, 0x3c, 0xab, + 0x8c, 0x37, 0x3d, 0x08, 0x19, 0xcf, 0xbc, 0x03, 0xe0, 0x85, 0xbd, 0x1b, 0xaf, + 0x79, 0xbd, 0x92, 0x9e, 0x67, 0x3d, 0x31, 0x3e, 0x94, 0xbd, 0xe8, 0xd1, 0x1f, + 0xbd, 0x4d, 0xa1, 0xcb, 0x3c, 0x9f, 0xc0, 0xf7, 0x3c, 0xa8, 0x88, 0xe1, 0xbc, + 0xf7, 0x13, 0x8b, 0x3c, 0x77, 0x1b, 0xfe, 0xbc, 0x11, 0xf0, 0x4d, 0x3d, 0x02, + 0x73, 0xff, 0xbc, 0x20, 0x4b, 0x2f, 0x3d, 0x50, 0x14, 0x28, 0x3c, 0xa2, 0x0a, + 0xc1, 0xbc, 0xb3, 0xf6, 0xe1, 0xbc, 0x32, 0x98, 0xa1, 0x3c, 0x3f, 0xef, 0xcc, + 0x3b, 0xd6, 0xbf, 0x37, 0xbd, 0x4e, 0x0a, 0x15, 0x3d, 0xfd, 0x81, 0x24, 0xbd, + 0x62, 0x05, 0x43, 0x3d, 0x4b, 0x8d, 0xb5, 0xbc, 0x0e, 0xe7, 0x7c, 0x3d, 0xd1, + 0x64, 0x88, 0xbd, 0xca, 0x03, 0xd3, 0xbb, 0xc9, 0xaa, 0x9f, 0xbb, 0xb5, 0x0e, + 0xbf, 0xbc, 0x48, 0x82, 0xe7, 0x3c, 0xa1, 0x4b, 0x10, 0x3d, 0x40, 0x51, 0x68, + 0xbb, 0xc0, 0x36, 0xc4, 0x3c, 0xcc, 0xd9, 0x37, 0xbc, 0xec, 0x40, 0xcf, 0x3c, + 0xb2, 0x38, 0x52, 0xbd, 0x15, 0xe7, 0x0c, 0xbd, 0x52, 0xea, 0x59, 0x3c, 0xcf, + 0xe3, 0xd1, 0xbc, 0x9e, 0xb7, 0x94, 0xbc, 0x1a, 0x13, 0xc8, 0x3c, 0x04, 0x51, + 0xa0, 0x3b, 0x7f, 0xb4, 0x32, 0x3d, 0x5e, 0x43, 0x5a, 0x3d, 0x8b, 0x6d, 0x98, + 0xba, 0xa4, 0x70, 0x47, 0x3d, 0xe6, 0x23, 0x60, 
0x3d, 0x48, 0xf3, 0x8b, 0xbc, + 0x85, 0xfe, 0x60, 0x3d, 0x33, 0x94, 0xc7, 0xbc, 0xdd, 0xbf, 0x80, 0xbd, 0x31, + 0x98, 0xbb, 0x3b, 0x76, 0x70, 0x8a, 0x3c, 0x72, 0xc5, 0x4e, 0x3c, 0x31, 0x53, + 0x20, 0x3d, 0xcd, 0xda, 0x03, 0x3b, 0x8c, 0xc0, 0x3d, 0x3d, 0x9c, 0xaa, 0x90, + 0xbd, 0xb5, 0x9f, 0xab, 0x3c, 0x45, 0x77, 0x31, 0xbd, 0xea, 0x85, 0x8e, 0xbd, + 0x15, 0x6d, 0x8b, 0xbc, 0xb9, 0x98, 0xb1, 0xbc, 0x09, 0x9b, 0xff, 0x3c, 0x1e, + 0xcf, 0x3c, 0x3d, 0x3c, 0xe3, 0x2a, 0xbd, 0x2a, 0xff, 0x20, 0x3d, 0xbb, 0x1c, + 0x4a, 0x3b, 0x8f, 0x19, 0x83, 0xbd, 0xad, 0x9f, 0xe5, 0x3c, 0x43, 0x3d, 0x44, + 0x3d, 0xaa, 0xb9, 0xe3, 0x3c, 0x8c, 0xd1, 0x86, 0x3d, 0xfa, 0x93, 0x7c, 0x3d, + 0x31, 0xe5, 0x67, 0xbc, 0x3f, 0x25, 0x8a, 0xbd, 0x90, 0x91, 0x5e, 0x3b, 0xbf, + 0xd8, 0xfe, 0xbc, 0x68, 0xaa, 0x85, 0x3c, 0xb3, 0xb6, 0x07, 0xbd, 0x6f, 0x51, + 0x91, 0xbd, 0x3c, 0x5d, 0xc8, 0xbc, 0xba, 0xf5, 0xd3, 0xbb, 0x8d, 0x90, 0xd5, + 0xbc, 0x02, 0x78, 0x2f, 0xbc, 0x12, 0x94, 0x10, 0x3d, 0xb2, 0x26, 0x82, 0xbd, + 0x49, 0x2a, 0x70, 0x3d, 0x9c, 0xf4, 0x67, 0xbd, 0x8d, 0x33, 0xf3, 0xbc, 0x22, + 0xa0, 0xc3, 0x3c, 0x38, 0xb2, 0x31, 0x3d, 0x71, 0xe9, 0x87, 0xbd, 0x7c, 0xc5, + 0x96, 0xbd, 0x5b, 0x13, 0xa5, 0xbc, 0x2d, 0x8a, 0x8a, 0x3d, 0x80, 0xc2, 0x24, + 0x3d, 0x1e, 0xc5, 0x74, 0x3d, 0xec, 0x3a, 0xca, 0x3c, 0x37, 0xb4, 0x00, 0xbc, + 0x29, 0xe2, 0x0c, 0x3d, 0xbc, 0x36, 0x20, 0x3d, 0x58, 0x3a, 0x5f, 0x3d, 0x8a, + 0xe4, 0x24, 0xbd, 0x22, 0x99, 0x45, 0xbd, 0xbe, 0xef, 0x0d, 0xbd, 0xbe, 0xae, + 0x0f, 0xbc, 0xe1, 0xe9, 0x4e, 0x3c, 0xd2, 0xed, 0x54, 0xbd, 0x62, 0xcb, 0x7d, + 0x3c, 0xc8, 0xe4, 0x0d, 0xbc, 0x61, 0xaa, 0xa8, 0x3b, 0x68, 0x56, 0x92, 0xbb, + 0x83, 0xb3, 0x25, 0xbd, 0x0a, 0x28, 0x39, 0xbd, 0x9d, 0xd4, 0x13, 0x3c, 0x5c, + 0x3c, 0x27, 0x3d, 0x34, 0x21, 0x30, 0x3d, 0x9d, 0xac, 0x54, 0xbd, 0xaa, 0xe8, + 0x60, 0x3d, 0xb4, 0xaf, 0xe5, 0x3c, 0xb0, 0x22, 0x1d, 0x3d, 0x9c, 0x7e, 0x64, + 0x3d, 0x3e, 0xd9, 0x7b, 0x3d, 0x55, 0x9e, 0x46, 0x3d, 0x47, 0xf9, 0xfe, 0x3a, + 0x00, 0xf0, 0x79, 0xbc, 0x49, 0x93, 0xd5, 0xbb, 0x98, 0x75, 0x29, 0xbc, 0xfb, + 0xdc, 0x37, 0xbd, 0x9a, 0x0e, 0x65, 0x3d, 0x7a, 0x74, 0x93, 0xbd, 0x39, 0x83, + 0xba, 0x3c, 0x20, 0xa3, 0x94, 0xbd, 0xbf, 0x32, 0x18, 0xbc, 0xbd, 0x90, 0x19, + 0x3c, 0x31, 0xbe, 0x94, 0xbd, 0x1f, 0xd5, 0x9b, 0x3a, 0x09, 0xa3, 0x44, 0xbd, + 0xe4, 0x91, 0xae, 0xbc, 0x98, 0x84, 0x73, 0xbd, 0xe6, 0x64, 0x70, 0x3d, 0xcc, + 0x0d, 0x01, 0xbd, 0xb0, 0xd6, 0xce, 0x3c, 0x2a, 0x8b, 0x78, 0xbd, 0x51, 0x8a, + 0xcd, 0x3c, 0x76, 0x3b, 0x0b, 0x3b, 0x85, 0xe3, 0x76, 0xbd, 0xad, 0x98, 0x6f, + 0x3d, 0xf8, 0xa1, 0x92, 0xbd, 0x22, 0xb9, 0x24, 0xbd, 0x81, 0xf4, 0x62, 0xbd, + 0xeb, 0x97, 0x83, 0x3d, 0x0d, 0xa9, 0x91, 0x3a, 0x62, 0x88, 0x0c, 0xbc, 0x99, + 0x64, 0x48, 0x3d, 0x0b, 0x11, 0x80, 0xba, 0x94, 0xe3, 0x70, 0xbc, 0xa3, 0x42, + 0x56, 0x3c, 0x1c, 0x41, 0xec, 0x3c, 0x68, 0x56, 0x29, 0x3c, 0x50, 0x4a, 0x05, + 0x3d, 0xfa, 0x33, 0x37, 0x3d, 0x5d, 0x7c, 0x8d, 0x3d, 0xa8, 0x02, 0x3f, 0x3c, + 0xa6, 0x1d, 0x68, 0x3d, 0x41, 0x3b, 0x76, 0x3d, 0x29, 0xa1, 0x56, 0xbd, 0xbd, + 0x90, 0x7c, 0x3b, 0xd9, 0x96, 0x62, 0xbd, 0xf2, 0x15, 0xd8, 0xbc, 0xad, 0x62, + 0x38, 0x3d, 0x19, 0xc7, 0x0d, 0x3d, 0xda, 0xcc, 0xf8, 0x3b, 0x63, 0xaf, 0x84, + 0xbd, 0x42, 0x94, 0x3f, 0xbc, 0x60, 0x67, 0x83, 0x3d, 0x13, 0xdb, 0xa8, 0x3c, + 0x8f, 0xcb, 0x5e, 0x3d, 0x97, 0x69, 0x14, 0xbd, 0xd5, 0x52, 0x97, 0x3c, 0x28, + 0xb2, 0x09, 0xbb, 0xd0, 0x5c, 0x0f, 0x3d, 0x08, 0x01, 0x38, 0xbd, 0x2a, 0xd1, + 0x75, 0xbd, 0xb6, 0x48, 0x5e, 0xbd, 0xe6, 0x3a, 0x40, 0x3d, 0x91, 0x52, 0xb5, + 0x3c, 0xe6, 0xe6, 0x2f, 0x3d, 0x7b, 0x0a, 0x0b, 0x3d, 0x05, 0xa6, 0xf1, 0xbb, + 
0xe5, 0x14, 0x12, 0x3c, 0x70, 0x4a, 0x61, 0xbd, 0xc0, 0xd5, 0x77, 0x3c, 0xea, + 0x92, 0x4e, 0x3d, 0xe8, 0xea, 0x7a, 0x3c, 0x85, 0xec, 0x8d, 0xbc, 0x1f, 0x06, + 0x3a, 0x3d, 0x24, 0x7d, 0x43, 0x3c, 0x3b, 0xfb, 0x4e, 0x3d, 0x10, 0xdb, 0x26, + 0xbc, 0x3c, 0xe4, 0x44, 0x3d, 0x5f, 0x54, 0xe6, 0x3c, 0x32, 0x15, 0xdf, 0xbc, + 0x07, 0x77, 0x1f, 0x3d, 0x68, 0x58, 0xea, 0x3c, 0xbe, 0x48, 0x90, 0xbc, 0x42, + 0x47, 0x35, 0x3d, 0x21, 0x06, 0x7d, 0xbd, 0x96, 0xd4, 0x67, 0x3c, 0x17, 0x5e, + 0x79, 0x3b, 0xd0, 0x09, 0x93, 0xbd, 0xaf, 0x34, 0x3d, 0x3d, 0xc6, 0xd3, 0x8f, + 0xbc, 0xae, 0x06, 0x0c, 0x3c, 0x84, 0xeb, 0x04, 0xbd, 0x44, 0xf4, 0x2e, 0xbd, + 0xad, 0x8d, 0x61, 0x3c, 0xb0, 0x1e, 0xaf, 0xb9, 0xb6, 0xd3, 0x57, 0xbc, 0x78, + 0x89, 0x97, 0x3c, 0x39, 0xa2, 0x41, 0xbd, 0x1c, 0xb3, 0x30, 0xbd, 0x44, 0xc4, + 0x90, 0x3c, 0xa3, 0x43, 0x03, 0xbd, 0xe0, 0xe2, 0xc4, 0xbb, 0xf0, 0xf3, 0x4d, + 0x3c, 0x6c, 0xf3, 0x85, 0x3d, 0x8f, 0xa9, 0x56, 0xbd, 0x36, 0x75, 0x5c, 0x3d, + 0x7e, 0x57, 0x89, 0x3c, 0x3a, 0xb8, 0x29, 0x3c, 0x2c, 0x10, 0x40, 0xbd, 0x5f, + 0x74, 0x32, 0xbd, 0xaf, 0x9e, 0x09, 0xbd, 0x60, 0xe4, 0x4b, 0xbd, 0x49, 0xb4, + 0xd7, 0x3c, 0xa0, 0x1f, 0x31, 0xbd, 0xd6, 0x5e, 0xde, 0x3c, 0x4e, 0xb1, 0xdb, + 0xbc, 0x98, 0x5a, 0x1e, 0x3d, 0x03, 0xe2, 0xa0, 0xba, 0x76, 0xc1, 0x63, 0xbd, + 0xbd, 0x03, 0xcf, 0x3c, 0xde, 0x4d, 0x22, 0x3d, 0x6a, 0x58, 0x5c, 0xbb, 0xc3, + 0xb8, 0x19, 0xbd, 0xf3, 0x01, 0x8f, 0x3d, 0x40, 0x62, 0xdc, 0x3b, 0x58, 0x64, + 0xa0, 0xbc, 0xdc, 0xd4, 0x6d, 0x3d, 0x62, 0x98, 0x1d, 0xbd, 0x96, 0x88, 0x4d, + 0x3b, 0x0e, 0xab, 0x46, 0x3d, 0xcb, 0xee, 0xce, 0x3b, 0xc5, 0x27, 0xe2, 0xbb, + 0xe4, 0xe4, 0x1c, 0x3d, 0x75, 0x86, 0x08, 0xbd, 0xf0, 0xce, 0x1c, 0x3d, 0xcb, + 0x9d, 0x7a, 0x3d, 0x24, 0x56, 0x42, 0xbc, 0x3a, 0x7f, 0xc4, 0xbc, 0x6e, 0xfd, + 0x6e, 0x3d, 0xa1, 0x3f, 0x80, 0x3d, 0xfb, 0x13, 0xc9, 0xbc, 0x5f, 0x8f, 0xb9, + 0x3c, 0xe3, 0xde, 0x94, 0xbd, 0x9f, 0x88, 0x88, 0xbd, 0x79, 0x27, 0x71, 0x3d, + 0xeb, 0xc8, 0x36, 0x3d, 0xe7, 0x2c, 0x9e, 0xbc, 0xb1, 0x19, 0x4d, 0xbd, 0x1e, + 0x82, 0x79, 0x3d, 0x75, 0xfe, 0x94, 0xbd, 0xdc, 0xd7, 0x96, 0xbd, 0x3a, 0x57, + 0x84, 0x3d, 0x70, 0xcd, 0x09, 0xbd, 0x08, 0xd9, 0x01, 0xbd, 0xa6, 0x1a, 0x85, + 0x3d, 0x5e, 0x34, 0xec, 0xbc, 0x3c, 0x0f, 0xa6, 0xbc, 0x0a, 0xc2, 0x6f, 0x3d, + 0x72, 0x1c, 0x89, 0x3d, 0xb0, 0x55, 0x12, 0xbd, 0x71, 0x87, 0x1f, 0x3d, 0x03, + 0xf0, 0x07, 0x3c, 0x52, 0x7d, 0x29, 0x3d, 0xe0, 0x13, 0x55, 0xbc, 0xe0, 0xac, + 0xbb, 0x3c, 0x36, 0x1f, 0x58, 0x3d, 0x34, 0x2f, 0xe3, 0x3c, 0xb5, 0xb7, 0x89, + 0xbc, 0x06, 0xfa, 0x93, 0xbd, 0xe7, 0x2e, 0x20, 0xbc, 0xc8, 0x71, 0x4c, 0x3d, + 0x03, 0x3b, 0xf6, 0xbb, 0x1c, 0xf7, 0x24, 0x3d, 0x88, 0x07, 0x09, 0x3d, 0xa6, + 0x16, 0xde, 0xbc, 0xd4, 0xfa, 0xf5, 0xbc, 0x2e, 0x35, 0x3f, 0x3d, 0x22, 0x36, + 0x5c, 0xbd, 0x99, 0xea, 0x90, 0x3d, 0x7c, 0xfd, 0xe6, 0x3c, 0xda, 0x89, 0x2e, + 0x3d, 0xea, 0x83, 0x39, 0x3c, 0xe2, 0x35, 0x12, 0x3d, 0xa6, 0xee, 0x46, 0x3d, + 0x7b, 0x4e, 0x36, 0xbd, 0x0a, 0x6d, 0xd1, 0x3b, 0x90, 0x59, 0x08, 0xbc, 0x3e, + 0xee, 0x86, 0x3b, 0x18, 0x92, 0x13, 0x3d, 0x71, 0xd5, 0x69, 0x3c, 0x5f, 0xc2, + 0x8d, 0xbd, 0xb0, 0x51, 0x81, 0x3c, 0x5a, 0x81, 0x9e, 0x3c, 0xcf, 0xae, 0x13, + 0x3d, 0xa4, 0x0d, 0x54, 0x3d, 0xb6, 0x82, 0x77, 0x3d, 0x6a, 0x20, 0xf7, 0xbc, + 0x60, 0xcc, 0x56, 0xbd, 0x45, 0x8f, 0x23, 0xbd, 0x92, 0x5c, 0x69, 0xbc, 0x8d, + 0xb5, 0x5d, 0xbd, 0x39, 0x60, 0x29, 0xbc, 0x06, 0x25, 0x6b, 0x3c, 0xad, 0x40, + 0x32, 0xbd, 0xcd, 0xbe, 0xf3, 0xbc, 0x7e, 0xd6, 0x74, 0x3d, 0x2e, 0x72, 0x63, + 0x3d, 0xc3, 0xaa, 0x0c, 0xbd, 0x74, 0xfc, 0x6a, 0xbd, 0xff, 0xa6, 0x7b, 0x3d, + 0xa8, 0x4f, 0xec, 0xbc, 0x8a, 
0x91, 0x39, 0xbd, 0xd1, 0xa4, 0x7b, 0x3d, 0xff, + 0x3a, 0x99, 0x3b, 0xe9, 0xd2, 0x4e, 0xbd, 0xc6, 0x84, 0x1e, 0x3d, 0xe7, 0x73, + 0xdf, 0xbc, 0x88, 0xfb, 0x08, 0x3d, 0xf9, 0x98, 0xa2, 0xbc, 0x41, 0x1d, 0x8d, + 0x3d, 0xe6, 0x32, 0x38, 0x3d, 0x5f, 0xea, 0x1a, 0xbd, 0xce, 0x8f, 0x92, 0xbd, + 0xea, 0x1f, 0x69, 0x3d, 0x5b, 0x6e, 0x58, 0xbc, 0x6d, 0xfc, 0x2d, 0x3d, 0xa9, + 0x01, 0x83, 0x3d, 0xbc, 0xdb, 0x53, 0x3d, 0x70, 0xea, 0x72, 0xbd, 0xa4, 0xc0, + 0xae, 0xbc, 0x80, 0x8a, 0x54, 0x3a, 0x4a, 0x00, 0x80, 0xbc, 0x4a, 0x66, 0x78, + 0xbc, 0xbe, 0x62, 0x79, 0xbd, 0xe8, 0x24, 0x84, 0xbc, 0x0d, 0xef, 0x0f, 0x3d, + 0xa9, 0xa6, 0x26, 0x3d, 0xb8, 0x68, 0x83, 0xbd, 0xe2, 0x7b, 0x27, 0xbd, 0xdc, + 0xda, 0x80, 0xbd, 0x5e, 0x50, 0x88, 0xbd, 0x76, 0x41, 0x8d, 0x3d, 0xee, 0x0a, + 0x95, 0xbc, 0xc4, 0x0b, 0x41, 0x3c, 0x6e, 0x16, 0xe0, 0xbc, 0xb2, 0x34, 0x58, + 0x3d, 0x65, 0xd4, 0x06, 0x3d, 0x8a, 0x8a, 0x18, 0xbd, 0x99, 0xdd, 0x47, 0x3d, + 0x2b, 0xec, 0x00, 0x3d, 0xc3, 0xb1, 0xad, 0xb9, 0xf9, 0x57, 0x77, 0x3c, 0xae, + 0xc6, 0x8a, 0xbd, 0x55, 0x51, 0x43, 0x3d, 0x34, 0xd3, 0x1b, 0xbd, 0xda, 0x9e, + 0x47, 0x3d, 0xe5, 0x3a, 0x1f, 0x3d, 0x6d, 0xf2, 0x59, 0x3d, 0x14, 0x27, 0xb7, + 0xbc, 0xb0, 0x72, 0x8f, 0x3d, 0xbe, 0x91, 0x83, 0xbd, 0xbb, 0x8f, 0x39, 0xbd, + 0x40, 0x7f, 0x7e, 0xbd, 0x2d, 0x3e, 0x86, 0x3b, 0xca, 0x43, 0x29, 0xbc, 0xe2, + 0xb8, 0x4d, 0x3d, 0x48, 0x31, 0x85, 0xbd, 0xcb, 0x54, 0x1b, 0x3d, 0xb4, 0xc8, + 0x56, 0x3d, 0x09, 0x2f, 0x1d, 0x3d, 0xca, 0x8f, 0x10, 0x3d, 0xe1, 0x8d, 0x4c, + 0x3a, 0xdb, 0x4d, 0xd2, 0xbc, 0x4a, 0xc7, 0xd1, 0xbc, 0xc8, 0x03, 0xfa, 0x3c, + 0x4e, 0x3f, 0xa4, 0xbc, 0x5f, 0x9e, 0x90, 0xbd, 0x13, 0x82, 0xc0, 0x3c, 0x59, + 0x55, 0x54, 0x3c, 0xb6, 0x95, 0xa5, 0xbb, 0xef, 0x59, 0xa4, 0x3b, 0x7e, 0x93, + 0x1e, 0xbd, 0xaf, 0x49, 0x81, 0xbc, 0xe7, 0xd1, 0xc6, 0xbb, 0xc0, 0xa3, 0xc9, + 0x3b, 0x53, 0xa9, 0x77, 0xbb, 0xfa, 0x26, 0x74, 0xbc, 0x06, 0x1b, 0x63, 0x3d, + 0xe4, 0x90, 0x0a, 0xbd, 0x64, 0x50, 0x31, 0x3d, 0xff, 0x66, 0x82, 0x3d, 0x9d, + 0x1c, 0x06, 0xbd, 0x38, 0x29, 0x40, 0xbd, 0x6f, 0xea, 0x89, 0x3d, 0xdc, 0x8a, + 0x3f, 0xbd, 0xd1, 0x88, 0x02, 0x3d, 0x2f, 0x23, 0x27, 0x3c, 0x9c, 0x85, 0x56, + 0x3d, 0x41, 0xc7, 0x41, 0xbd, 0x67, 0x51, 0x49, 0x3c, 0x5f, 0x41, 0xf9, 0xbb, + 0x15, 0x37, 0xdb, 0xbc, 0x51, 0x7a, 0xd9, 0x3a, 0x05, 0xc0, 0x90, 0xbd, 0x8f, + 0xdb, 0x84, 0xbd, 0x3a, 0xc1, 0x48, 0xb9, 0x22, 0x3c, 0xfb, 0x3c, 0x7d, 0xf5, + 0x14, 0xbd, 0x26, 0xe6, 0x53, 0xbc, 0xde, 0x94, 0xa0, 0xbc, 0xd9, 0xc4, 0x5e, + 0x3d, 0xd4, 0xcf, 0xa6, 0xba, 0xfa, 0x43, 0x18, 0xbd, 0xee, 0x62, 0x19, 0xbd, + 0xfb, 0x61, 0x66, 0xbb, 0x1e, 0x8b, 0x82, 0xbd, 0x26, 0xec, 0x87, 0xbd, 0xc2, + 0xf6, 0x04, 0x3d, 0x2b, 0x2e, 0xe4, 0xbc, 0x60, 0xa6, 0x4e, 0x3d, 0x21, 0x99, + 0x5c, 0x3d, 0xdd, 0xde, 0x37, 0x3d, 0x8e, 0xfc, 0xf5, 0x3c, 0x6d, 0x33, 0xc2, + 0x39, 0x48, 0xea, 0x34, 0x3d, 0x79, 0x3e, 0x85, 0xbd, 0x20, 0xb1, 0x3d, 0xbb, + 0xdc, 0xe9, 0x64, 0xbc, 0xd2, 0xac, 0x4a, 0xbd, 0x1a, 0x4a, 0x8d, 0xbd, 0xb5, + 0xa2, 0xf3, 0x3c, 0xcd, 0x54, 0xb6, 0xbc, 0xc1, 0x9b, 0x2c, 0x3c, 0xd0, 0xea, + 0xad, 0xbc, 0x3f, 0xbc, 0x7f, 0x3c, 0xde, 0xe3, 0xe9, 0xbc, 0x1e, 0x28, 0x6f, + 0xbc, 0xd1, 0xce, 0xfe, 0xbc, 0xcc, 0x16, 0x21, 0x3d, 0x2a, 0x10, 0x18, 0xbd, + 0x5e, 0x73, 0xe9, 0xbb, 0xb3, 0x67, 0xa1, 0xbb, 0x94, 0x7d, 0x0d, 0x3c, 0x1d, + 0x67, 0x3b, 0xbd, 0xa9, 0xb9, 0x84, 0x3c, 0xe1, 0xc1, 0x89, 0xba, 0x49, 0x7f, + 0x91, 0xbd, 0x47, 0xf8, 0x57, 0xbc, 0x00, 0x6a, 0x24, 0x3d, 0x61, 0x71, 0x6f, + 0x3c, 0xd7, 0x6e, 0x4e, 0xbc, 0x07, 0xda, 0x60, 0xbb, 0x2d, 0xd9, 0x8e, 0x3d, + 0x0d, 0x9d, 0xc5, 0x3b, 0x50, 0x74, 0xe2, 0xbc, 0xaf, 0x90, 
0x2d, 0xbd, 0xce, + 0x93, 0x2a, 0x3d, 0x56, 0xee, 0xee, 0xbc, 0x62, 0x58, 0x0a, 0x3d, 0x25, 0x7c, + 0x64, 0x3d, 0x23, 0x8d, 0x80, 0x3d, 0x3b, 0xfd, 0x55, 0xbd, 0x8f, 0x71, 0xe2, + 0xbc, 0x9c, 0xae, 0x07, 0x3d, 0x0e, 0xe4, 0xdd, 0xbc, 0x93, 0xc9, 0xd7, 0x3c, + 0x87, 0x9c, 0xe5, 0xbb, 0xa3, 0xd5, 0x5d, 0x3d, 0x23, 0xdb, 0x3a, 0xbd, 0x67, + 0xb3, 0x1a, 0x3d, 0x9e, 0xa1, 0x6b, 0x3d, 0x93, 0x17, 0xc2, 0xbc, 0x0c, 0xb7, + 0x33, 0xbd, 0xc0, 0xba, 0xeb, 0xbc, 0x16, 0x2c, 0x4d, 0xbd, 0xed, 0x60, 0x78, + 0x3c, 0x54, 0xa3, 0x93, 0xbd, 0x62, 0xa6, 0x8a, 0xbd, 0xdc, 0x16, 0x25, 0xbd, + 0xa9, 0xaf, 0x76, 0xbd, 0xab, 0x3c, 0x5d, 0xbd, 0xcf, 0x78, 0x9c, 0x3c, 0x74, + 0xf2, 0x97, 0x3c, 0xaa, 0x5d, 0x3b, 0x3d, 0x9c, 0xd2, 0xef, 0x3c, 0xd8, 0x6a, + 0x37, 0x3c, 0x44, 0xd2, 0xb9, 0xbc, 0x41, 0x5d, 0x7e, 0x3d, 0x74, 0x3c, 0x7d, + 0xbd, 0x40, 0x08, 0x0c, 0xbd, 0xbb, 0xc3, 0x04, 0xbd, 0xd7, 0xd3, 0x5d, 0xbd, + 0x41, 0xe7, 0x7c, 0x3d, 0x65, 0x20, 0x6f, 0x3b, 0x4e, 0xef, 0x81, 0x3a, 0xae, + 0xe0, 0x5d, 0xbd, 0x3f, 0xfb, 0x82, 0xbd, 0xf1, 0xc5, 0x58, 0xbd, 0x96, 0xab, + 0x45, 0x3b, 0x97, 0x5f, 0xcd, 0x3b, 0x39, 0x48, 0x5b, 0x3b, 0x6d, 0xf0, 0x28, + 0xbd, 0x08, 0xcc, 0x9f, 0x3c, 0x21, 0xd5, 0x2b, 0xbd, 0xc1, 0xe3, 0x1c, 0x3d, + 0x86, 0x52, 0xb4, 0x3c, 0x02, 0xd4, 0xc6, 0xbc, 0xbe, 0xab, 0x27, 0xbd, 0x18, + 0x8f, 0x84, 0x3c, 0x7d, 0x47, 0x2e, 0x3d, 0x0a, 0x58, 0x9c, 0x3b, 0x52, 0x72, + 0xe4, 0xbc, 0x98, 0x57, 0x5e, 0x3c, 0x24, 0xf1, 0x04, 0xbc, 0x3b, 0xec, 0x0f, + 0xbd, 0xf5, 0x54, 0x13, 0x3d, 0x6f, 0xf9, 0x80, 0x3c, 0x80, 0x19, 0xa2, 0xbc, + 0xfa, 0x89, 0x35, 0x3d, 0xd8, 0x61, 0x82, 0x3c, 0x21, 0x81, 0x8b, 0x3d, 0x40, + 0x2d, 0x65, 0xbc, 0xc6, 0x21, 0x61, 0x3d, 0x51, 0x3d, 0xa9, 0xbc, 0x47, 0x12, + 0x55, 0x3d, 0x7e, 0x85, 0x71, 0xbd, 0x22, 0x14, 0x05, 0x3d, 0x94, 0x35, 0x97, + 0xbd, 0x3c, 0x00, 0x86, 0xbd, 0x3a, 0x46, 0x5f, 0x3d, 0x18, 0x14, 0x06, 0xbd, + 0xb4, 0xea, 0x8c, 0xbd, 0xdc, 0x2e, 0xfe, 0x3b, 0x21, 0x96, 0x3d, 0xbd, 0x3a, + 0xf6, 0x8b, 0xbc, 0x3a, 0x3b, 0x6d, 0xbb, 0x39, 0x87, 0x13, 0x3c, 0x15, 0xbc, + 0x92, 0xbd, 0x24, 0xb7, 0x13, 0x3d, 0x9c, 0x66, 0x7a, 0xbd, 0x6b, 0xf2, 0x41, + 0xbd, 0x1d, 0x15, 0x6a, 0xbc, 0x20, 0x2a, 0x73, 0x3d, 0x25, 0x95, 0x40, 0x3d, + 0x23, 0x8f, 0x90, 0xbd, 0xd6, 0x95, 0xa7, 0xbc, 0xbe, 0xce, 0x4f, 0x3d, 0xaf, + 0xe0, 0x3f, 0x3d, 0x1b, 0x9f, 0x47, 0x3c, 0x57, 0x37, 0x14, 0x3d, 0x33, 0x06, + 0x86, 0x3d, 0xe5, 0x3c, 0x77, 0x3d, 0x60, 0x46, 0x95, 0x3b, 0xee, 0xd2, 0x97, + 0xbc, 0x38, 0x20, 0x9c, 0x3c, 0xe6, 0x90, 0xdf, 0xba, 0x77, 0x4f, 0x30, 0x3d, + 0x54, 0x87, 0x03, 0x3d, 0x86, 0x7c, 0x25, 0x3d, 0xdb, 0x5a, 0x18, 0x3d, 0x60, + 0x84, 0xf9, 0xbc, 0x84, 0x3c, 0xd0, 0xbc, 0xe9, 0x8c, 0x87, 0xbb, 0x39, 0xb9, + 0x81, 0x3d, 0x2e, 0x3e, 0x67, 0x3d, 0x5d, 0x57, 0xf8, 0xba, 0x60, 0x31, 0x38, + 0x3c, 0xf4, 0x31, 0x02, 0xbd, 0x31, 0x10, 0x98, 0x3c, 0x85, 0x28, 0x16, 0x3d, + 0xc5, 0xcd, 0xef, 0x3c, 0x92, 0x8d, 0x59, 0x3d, 0x6a, 0x54, 0x27, 0xbc, 0x72, + 0x4a, 0xf7, 0xbc, 0x0d, 0x8d, 0x81, 0x3d, 0xbd, 0x74, 0x8f, 0xbd, 0x80, 0xed, + 0x5c, 0x3b, 0xbe, 0x52, 0x7e, 0x3d, 0x49, 0x3f, 0x28, 0xbd, 0xcc, 0xc5, 0xea, + 0xbc, 0x2f, 0x46, 0x6b, 0xbd, 0x05, 0xd4, 0x0c, 0xbc, 0x41, 0x09, 0x02, 0x3d, + 0x2e, 0xa8, 0x53, 0xbc, 0xc7, 0x56, 0x56, 0xbd, 0xc2, 0x01, 0x88, 0xbd, 0x7a, + 0x9c, 0x6f, 0x3d, 0x3c, 0x49, 0x1c, 0x3d, 0x2b, 0x80, 0xe3, 0x3b, 0x43, 0x27, + 0x7d, 0x3d, 0x91, 0xa0, 0x58, 0x3d, 0xdb, 0x70, 0x76, 0xbc, 0xc4, 0xfa, 0x04, + 0xbd, 0x5e, 0x76, 0xcc, 0x3b, 0x0a, 0xcf, 0xc0, 0xbc, 0xfa, 0x3f, 0x08, 0xbd, + 0x26, 0x65, 0xaa, 0x3c, 0x2f, 0xec, 0x37, 0x3d, 0xa0, 0xae, 0x51, 0x3d, 0xbd, + 0x0e, 0x4e, 
0x3d, 0x4d, 0x36, 0xae, 0xbc, 0xf1, 0xc8, 0x3f, 0xbd, 0x79, 0xe5, + 0x84, 0xbc, 0xac, 0x19, 0xf7, 0x3b, 0x5f, 0x52, 0x70, 0xbd, 0x46, 0x15, 0x01, + 0xbd, 0x17, 0xb1, 0xb1, 0x3c, 0x2e, 0x19, 0x87, 0xbd, 0x0c, 0xe6, 0x98, 0x3c, + 0x35, 0xd0, 0x22, 0xbd, 0xe3, 0x8f, 0x8a, 0xbd, 0x23, 0x8b, 0xfa, 0x3c, 0x01, + 0x67, 0x80, 0x3d, 0x6c, 0x9e, 0xb2, 0x3a, 0x6b, 0xbe, 0x8b, 0x3d, 0x74, 0x68, + 0xdb, 0x3c, 0x4c, 0x13, 0xae, 0xbc, 0x94, 0xfe, 0x50, 0xbd, 0xdc, 0x7e, 0x2f, + 0x3d, 0x78, 0x0a, 0x6e, 0xbc, 0x0e, 0x2b, 0xe9, 0xbc, 0x3b, 0x4b, 0x08, 0x3d, + 0x4d, 0x1a, 0x3d, 0xbd, 0x55, 0x7e, 0x51, 0xbb, 0x15, 0xa6, 0xb4, 0xbc, 0xac, + 0x1b, 0x86, 0xbb, 0x8a, 0x27, 0x22, 0x3d, 0x39, 0xc8, 0x34, 0xbc, 0x65, 0x0e, + 0x1a, 0xbb, 0x4c, 0x08, 0xdb, 0x3b, 0x60, 0x75, 0x2d, 0xbc, 0x25, 0xba, 0x64, + 0xbc, 0x8c, 0x05, 0x70, 0x3d, 0x0e, 0xdc, 0xaa, 0xbc, 0x63, 0x17, 0x03, 0x3d, + 0x03, 0x9d, 0x36, 0x3c, 0xe3, 0xf5, 0x6e, 0x3d, 0x01, 0xf8, 0x12, 0xbd, 0x15, + 0x62, 0xb3, 0x3c, 0xe1, 0x20, 0x1f, 0x3d, 0xbd, 0x41, 0x8d, 0x3d, 0x7b, 0x02, + 0x47, 0x3d, 0x8e, 0x9c, 0x93, 0xbc, 0x82, 0xa1, 0x81, 0xbd, 0xb9, 0x59, 0x6e, + 0x3c, 0xc6, 0x93, 0x07, 0xbd, 0x4c, 0x87, 0x44, 0x3d, 0x6a, 0x66, 0x49, 0xbd, + 0x80, 0xd5, 0x4b, 0xbb, 0x70, 0xd5, 0x09, 0x3c, 0x20, 0x85, 0x06, 0x3c, 0x7e, + 0xd6, 0x42, 0x3d, 0x5d, 0x10, 0x01, 0x3c, 0x71, 0xbe, 0x6c, 0xbc, 0xcc, 0xba, + 0x2d, 0xbd, 0xbf, 0xf6, 0x90, 0xbd, 0x59, 0xb8, 0x8c, 0x3d, 0x4a, 0xe8, 0x87, + 0xbc, 0xee, 0xd3, 0xd1, 0x3c, 0xde, 0xdd, 0xa6, 0xbb, 0x26, 0x06, 0x6a, 0xbc, + 0x1f, 0xa2, 0x88, 0xbd, 0x00, 0x6c, 0x24, 0xbb, 0x36, 0xf0, 0x00, 0x3c, 0x1e, + 0x54, 0x86, 0xbb, 0x55, 0x5e, 0x01, 0xbc, 0x3e, 0x0e, 0xe8, 0x3c, 0xbd, 0x02, + 0x70, 0xbb, 0x8e, 0xb9, 0x85, 0x3d, 0x8e, 0x8a, 0x5d, 0xbb, 0xa4, 0x21, 0x13, + 0x3d, 0xd1, 0x77, 0x16, 0xbc, 0x40, 0x95, 0x1d, 0x3c, 0x58, 0x2f, 0xbb, 0x3c, + 0xf5, 0x88, 0x86, 0xbb, 0xa0, 0x02, 0x83, 0xbd, 0x93, 0xb8, 0x0a, 0x3c, 0xfd, + 0x65, 0xe2, 0xbb, 0x24, 0x21, 0x11, 0x3d, 0xc6, 0x89, 0x8c, 0xbd, 0xc3, 0xa9, + 0x7a, 0xbd, 0x43, 0xcf, 0x81, 0xbd, 0xde, 0x81, 0x58, 0xbd, 0x3d, 0x35, 0x23, + 0x3d, 0xbe, 0x81, 0x90, 0xbd, 0xd3, 0xd2, 0xbb, 0x3c, 0x60, 0x68, 0xe5, 0xbc, + 0x25, 0x64, 0xa8, 0xbb, 0x8e, 0x5e, 0x4e, 0xbd, 0xc3, 0xa4, 0xd3, 0xbc, 0xb0, + 0x99, 0xf7, 0xbc, 0x2d, 0x56, 0x17, 0xbd, 0x44, 0x65, 0x2b, 0x3d, 0xa7, 0x80, + 0x05, 0xbd, 0xfc, 0xe1, 0x02, 0x3d, 0x65, 0xa7, 0x68, 0x3d, 0x52, 0x5d, 0x8b, + 0xbd, 0x6a, 0x9e, 0x83, 0xbd, 0xd4, 0xac, 0x1a, 0xbc, 0x3e, 0x6b, 0x7d, 0xbc, + 0xeb, 0xff, 0x40, 0xbd, 0xcd, 0xd2, 0x21, 0x3d, 0x7e, 0xf1, 0x70, 0xbd, 0x9b, + 0xc6, 0x6a, 0xbb, 0x1e, 0xb9, 0x20, 0x3d, 0xfd, 0x9b, 0x61, 0xbd, 0x57, 0xf3, + 0x5a, 0xbd, 0x5d, 0xbe, 0xbb, 0x3b, 0xd3, 0xc8, 0x50, 0xbd, 0x38, 0x8a, 0x5e, + 0xbd, 0x86, 0x65, 0x57, 0x3d, 0x02, 0xc7, 0x85, 0xbd, 0x95, 0x0a, 0x80, 0x3d, + 0x08, 0xcd, 0x66, 0x3c, 0x68, 0x38, 0x3d, 0x3c, 0xad, 0x64, 0x12, 0xbd, 0x20, + 0x0d, 0xcc, 0x3c, 0x63, 0x2c, 0x3f, 0x3d, 0xf6, 0xe1, 0xdc, 0x3c, 0x5f, 0xa6, + 0x35, 0x3d, 0x7b, 0xf6, 0x68, 0xbd, 0x9e, 0x65, 0xd2, 0x3c, 0x13, 0x63, 0x9d, + 0xbb, 0xd6, 0x42, 0x51, 0xbc, 0xa2, 0xc5, 0x52, 0xbc, 0x6a, 0x3d, 0x3f, 0x3d, + 0xa6, 0xde, 0xf8, 0xbc, 0x01, 0xa1, 0x5b, 0x3d, 0x8d, 0xdf, 0x16, 0xbd, 0x62, + 0x4d, 0x35, 0xba, 0x22, 0xca, 0x30, 0xbd, 0x50, 0x22, 0x72, 0xbc, 0xf1, 0xaa, + 0x96, 0xbd, 0x52, 0xf4, 0xd9, 0x3c, 0x08, 0x89, 0x6d, 0x3d, 0x90, 0x97, 0xa9, + 0x3c, 0x20, 0x9d, 0x0b, 0x3c, 0x47, 0x97, 0xf5, 0xbc, 0x7f, 0xc1, 0x3c, 0x3d, + 0x77, 0xa7, 0xeb, 0x3b, 0xe2, 0x0c, 0x77, 0x3d, 0xca, 0x57, 0x3e, 0x3d, 0x16, + 0x46, 0x38, 0xbd, 0x15, 0xde, 0x87, 0x3d, 
0x10, 0x09, 0x0a, 0xbd, 0xa0, 0xfa, + 0x56, 0x3b, 0xba, 0x6c, 0x2f, 0x3d, 0x0f, 0xb9, 0x70, 0x3c, 0x35, 0xb8, 0x8c, + 0xbd, 0x88, 0xad, 0xc5, 0xbc, 0xb2, 0x0b, 0x40, 0xbd, 0x63, 0x62, 0x80, 0xbd, + 0xb4, 0xd9, 0x78, 0x3c, 0x91, 0x49, 0x8a, 0xbd, 0x59, 0x3c, 0x47, 0x3d, 0xb1, + 0xb7, 0x3a, 0xbd, 0x0f, 0x07, 0xea, 0x3b, 0xca, 0x89, 0x50, 0xbd, 0xf6, 0x2c, + 0x27, 0xbd, 0x3f, 0xf7, 0x37, 0x3c, 0x1c, 0x12, 0x23, 0x3c, 0x6d, 0x88, 0x97, + 0xbd, 0x06, 0x09, 0x66, 0x3d, 0x40, 0xac, 0x80, 0xbc, 0xac, 0xea, 0x7c, 0xbd, + 0x7e, 0xfb, 0x1a, 0x3d, 0x11, 0xd1, 0x65, 0x3d, 0x56, 0x13, 0xee, 0xbc, 0xa5, + 0xe1, 0x69, 0xbd, 0x47, 0xff, 0x45, 0xbc, 0x20, 0xba, 0x2e, 0xbd, 0xff, 0x15, + 0x48, 0xbc, 0x01, 0xd5, 0x8f, 0x3d, 0x42, 0x0f, 0x37, 0x3c, 0x68, 0xbc, 0xcc, + 0x3c, 0xf4, 0x1e, 0x39, 0xbd, 0x00, 0x6c, 0x07, 0xb9, 0xe4, 0x6e, 0xb2, 0x3c, + 0x9b, 0x53, 0x88, 0xbd, 0x20, 0xf2, 0xef, 0xbc, 0xd3, 0xf3, 0x8e, 0x3d, 0xbc, + 0xe9, 0xa6, 0xbc, 0xa3, 0xb6, 0x6b, 0xbc, 0x73, 0xeb, 0xdd, 0xbc, 0xdf, 0xa3, + 0x04, 0xbd, 0x1a, 0x9f, 0x21, 0x3c, 0x1d, 0xb7, 0x89, 0xbb, 0x28, 0x66, 0x85, + 0xbc, 0xf9, 0x7f, 0x95, 0xbd, 0x4c, 0x07, 0xfa, 0xbc, 0x52, 0x7d, 0x29, 0x3d, + 0x66, 0x78, 0x24, 0xbc, 0xd4, 0x70, 0xfa, 0xbc, 0x20, 0xdb, 0x02, 0xbd, 0x51, + 0x27, 0x09, 0xbd, 0xb6, 0xb6, 0x42, 0x3d, 0x37, 0xa4, 0x3f, 0xbd, 0xfc, 0x30, + 0xb2, 0xbb, 0x2b, 0xa7, 0xb7, 0x3c, 0x77, 0xf6, 0x2e, 0x3d, 0x4e, 0x18, 0x6c, + 0x3d, 0xb0, 0xb9, 0xe4, 0x3c, 0xa6, 0xce, 0x89, 0xbd, 0x18, 0x9a, 0xc2, 0x3c, + 0x8d, 0xdc, 0x51, 0xbd, 0x50, 0x09, 0x0a, 0x3d, 0xd8, 0x90, 0x6c, 0xbc, 0x28, + 0x48, 0x96, 0xbc, 0x50, 0x5f, 0x62, 0xbc, 0x8b, 0xbc, 0x82, 0xbd, 0xb0, 0x24, + 0xce, 0x3b, 0x54, 0xb0, 0x4b, 0x3c, 0xd8, 0x02, 0x59, 0x3c, 0x0b, 0x7d, 0xa0, + 0x3c, 0x2a, 0x6f, 0xfa, 0xbc, 0x51, 0xf4, 0x0a, 0xbd, 0xe5, 0xdd, 0x45, 0x3d, + 0x69, 0xcb, 0x5f, 0x3d, 0x59, 0xee, 0x1b, 0x3d, 0x15, 0x0c, 0x6d, 0x3d, 0xb4, + 0xe8, 0x3a, 0x3c, 0xd6, 0x4c, 0x71, 0x3d, 0x2c, 0x6c, 0x5f, 0xbc, 0x23, 0xc7, + 0x96, 0x3c, 0x90, 0xfd, 0xef, 0xb9, 0x80, 0x9a, 0xce, 0xbc, 0xc8, 0xa7, 0xfa, + 0xbc, 0x3f, 0x84, 0x4d, 0xbc, 0xb9, 0x1e, 0x63, 0x3d, 0x91, 0xff, 0x16, 0xbd, + 0xe4, 0x6d, 0x65, 0xbc, 0xbb, 0x19, 0x69, 0xbc, 0xf0, 0xba, 0xfe, 0xbc, 0xbb, + 0xe6, 0x30, 0x3d, 0x12, 0x3a, 0x4d, 0x3d, 0x08, 0xa7, 0x79, 0x3d, 0x37, 0x6c, + 0x88, 0x3d, 0xb4, 0x66, 0xf1, 0xba, 0xb8, 0x48, 0xcc, 0xbc, 0x61, 0xb9, 0x1d, + 0xbd, 0x8a, 0x51, 0x45, 0xbd, 0x2e, 0x8a, 0x59, 0x3d, 0x88, 0xe0, 0x7d, 0xbd, + 0x53, 0xc6, 0x8e, 0xbd, 0x0e, 0x7b, 0x5a, 0x3d, 0x13, 0xc2, 0xcb, 0xbc, 0x57, + 0xcd, 0x8b, 0xbd, 0x60, 0x8c, 0x4e, 0xbd, 0xe2, 0x03, 0x07, 0x3d, 0x5f, 0x0d, + 0x80, 0x3c, 0x5f, 0xc8, 0x3d, 0x3d, 0x89, 0x06, 0xc8, 0x3c, 0x17, 0x2b, 0x88, + 0x3d, 0xf6, 0x31, 0x63, 0x3d, 0x51, 0x2b, 0x60, 0xbd, 0xc9, 0x26, 0x67, 0xbd, + 0x02, 0x8e, 0x4f, 0xbd, 0xbd, 0x67, 0x20, 0x3d, 0x53, 0xfa, 0x64, 0xbb, 0x27, + 0x16, 0x28, 0xbd, 0x45, 0x52, 0xfb, 0xbb, 0x66, 0x53, 0x8d, 0x3c, 0x0c, 0x18, + 0x74, 0xbc, 0x60, 0x98, 0x19, 0x3d, 0xd2, 0x7c, 0x3c, 0x3d, 0x77, 0x65, 0x90, + 0xbc, 0x69, 0x1e, 0x3e, 0xbd, 0x04, 0x22, 0x7f, 0xbc, 0x7c, 0x5d, 0x2c, 0xbc, + 0x51, 0xb3, 0x1f, 0xbc, 0xc4, 0xaf, 0xbf, 0xbc, 0xa8, 0xc5, 0x59, 0x3c, 0xfe, + 0x08, 0x62, 0x3d, 0x7c, 0x3a, 0x56, 0x3d, 0x4a, 0xaf, 0x38, 0x3d, 0xd9, 0x9e, + 0x26, 0xbd, 0x48, 0xc2, 0x16, 0xbc, 0x6e, 0xcc, 0xec, 0xbc, 0x05, 0x78, 0x0e, + 0xbc, 0xd2, 0x5c, 0x51, 0xbd, 0x44, 0x63, 0x6b, 0x3d, 0x7c, 0xfd, 0xca, 0xbb, + 0x62, 0xda, 0x30, 0x3c, 0xc4, 0xcc, 0x61, 0x3d, 0xdc, 0xa6, 0x34, 0xbd, 0xff, + 0x8f, 0x24, 0xbc, 0x68, 0x37, 0xf6, 0xbc, 0xd1, 0x4d, 0x25, 0xbd, 0x33, 
0x6e, + 0x91, 0x3c, 0x60, 0x57, 0x6b, 0x3d, 0x04, 0xf7, 0x34, 0xbd, 0x90, 0xe7, 0x30, + 0x3d, 0x8e, 0x22, 0x65, 0xbd, 0x62, 0xcf, 0xb6, 0x3c, 0xce, 0x5d, 0x9f, 0x3c, + 0xa0, 0x0a, 0x43, 0xbd, 0x1e, 0x7b, 0x56, 0xbd, 0x1f, 0x6a, 0x93, 0xbd, 0x60, + 0x5e, 0x39, 0x3d, 0x4d, 0x17, 0x8e, 0xbd, 0x28, 0x00, 0xad, 0x3c, 0x79, 0xd0, + 0xab, 0xbb, 0x15, 0xf3, 0x1a, 0xbd, 0x28, 0x13, 0x05, 0x3c, 0x90, 0x55, 0x20, + 0x3d, 0x98, 0x9b, 0xc4, 0x3c, 0x32, 0x5f, 0x86, 0xbd, 0x6d, 0xf8, 0x52, 0xbd, + 0xcc, 0x28, 0xae, 0x3c, 0x96, 0xc7, 0x81, 0x3d, 0x04, 0x2e, 0x5b, 0xbc, 0xdd, + 0xce, 0xb2, 0x3c, 0x14, 0x5d, 0x67, 0x3d, 0x74, 0xe8, 0x77, 0x3d, 0x2e, 0xf5, + 0x51, 0x3d, 0x21, 0x78, 0x7a, 0xbd, 0x62, 0xea, 0x6a, 0xbd, 0x36, 0x1c, 0xf4, + 0xbc, 0xd0, 0x98, 0xda, 0x3b, 0x26, 0x14, 0x8a, 0xbd, 0xf2, 0xa4, 0x67, 0xbd, + 0xb2, 0xa7, 0x39, 0xbd, 0x93, 0xa6, 0xd6, 0x3c, 0xe1, 0xa9, 0xe4, 0x3b, 0x49, + 0xca, 0x3f, 0x3d, 0x07, 0xe3, 0x64, 0x3d, 0x1e, 0xf5, 0x4d, 0xbd, 0x4e, 0xc3, + 0x8a, 0xbd, 0x88, 0xf9, 0xf8, 0x3c, 0xc6, 0x2a, 0xba, 0xbc, 0x56, 0xd7, 0xb1, + 0xbc, 0xbd, 0xff, 0x10, 0x3c, 0xfe, 0x3d, 0x16, 0xbd, 0x88, 0xdd, 0x5f, 0x3c, + 0x66, 0xd4, 0x50, 0xbd, 0xe2, 0x59, 0x62, 0x3d, 0x1c, 0xdf, 0xac, 0x3c, 0xc2, + 0x72, 0xb7, 0xbc, 0xe2, 0x19, 0x4d, 0xbd, 0xc1, 0xbb, 0xa1, 0x3c, 0xf2, 0x8f, + 0x24, 0x3d, 0x2f, 0xb1, 0xeb, 0xbc, 0xa7, 0xe6, 0x13, 0xbd, 0x4c, 0x51, 0x7c, + 0xbd, 0x23, 0x87, 0x3e, 0xbd, 0x65, 0x03, 0x86, 0x3b, 0x5d, 0x13, 0x15, 0x3d, + 0x44, 0x77, 0x96, 0xba, 0xe9, 0x74, 0x0a, 0x3d, 0xb4, 0xd0, 0x59, 0xbd, 0x4c, + 0x9a, 0x22, 0x3d, 0x82, 0x1b, 0x85, 0x3d, 0x09, 0x1e, 0xf9, 0x3c, 0x20, 0xcf, + 0x97, 0xbd, 0xf9, 0x46, 0x0e, 0xbd, 0xba, 0x0d, 0x82, 0x3d, 0xf6, 0xf1, 0xd7, + 0x3c, 0x8e, 0x08, 0xf8, 0xbc, 0x4d, 0xbf, 0x22, 0xbd, 0xd0, 0x25, 0x8a, 0x3c, + 0xa8, 0x71, 0x2e, 0xbd, 0xd9, 0xaa, 0x24, 0x3a, 0x48, 0x85, 0x6c, 0xbd, 0x90, + 0x0e, 0x8c, 0x3c, 0x3c, 0x45, 0x50, 0x3d, 0x71, 0xab, 0x65, 0x3d, 0x60, 0x38, + 0xdb, 0x3b, 0x9b, 0x94, 0x81, 0xbd, 0xc0, 0xaa, 0xb3, 0xbc, 0xc8, 0x46, 0x93, + 0xbc, 0x3a, 0x19, 0xea, 0xbc, 0x16, 0xab, 0x36, 0xbc, 0x20, 0x52, 0x74, 0xbd, + 0xbd, 0x3b, 0x75, 0x3d, 0xea, 0xef, 0xc3, 0xbc, 0x54, 0xbe, 0x26, 0xbd, 0x88, + 0x03, 0x6c, 0x3d, 0xa0, 0x3e, 0x4a, 0x3d, 0x46, 0x60, 0x0a, 0x3d, 0xf9, 0x88, + 0x59, 0x3d, 0xa2, 0x8a, 0x87, 0xbd, 0xde, 0x60, 0x48, 0x3d, 0xc6, 0x87, 0x60, + 0x3d, 0x05, 0x18, 0x3d, 0xbc, 0xa8, 0x15, 0x01, 0x3d, 0x68, 0x46, 0x41, 0xbd, + 0x7f, 0x8e, 0x58, 0x3d, 0xc6, 0xa4, 0xf6, 0x3c, 0x22, 0xbc, 0x73, 0x3d, 0xe8, + 0x2d, 0x83, 0x3c, 0x97, 0x7f, 0x8b, 0xbb, 0xe6, 0x83, 0x81, 0xbc, 0x42, 0x79, + 0x5b, 0x3d, 0x62, 0xfb, 0xd4, 0x3b, 0xf3, 0x51, 0x06, 0xbd, 0xb0, 0x65, 0x79, + 0x3d, 0xbc, 0x83, 0xdc, 0x3c, 0xbe, 0xbd, 0x8c, 0x3d, 0x64, 0xdf, 0x13, 0x3d, + 0x1f, 0xa8, 0x44, 0xbd, 0x1e, 0x7f, 0x87, 0xbc, 0x15, 0x05, 0x6c, 0xbd, 0x43, + 0x6b, 0x75, 0xbd, 0x38, 0x5a, 0x64, 0x3d, 0xb8, 0x35, 0x2c, 0x3c, 0x93, 0x41, + 0xd5, 0xb9, 0xf4, 0x66, 0x79, 0xbc, 0xd9, 0xda, 0xae, 0xbc, 0xd6, 0x82, 0xd4, + 0x3b, 0x48, 0x9e, 0x3e, 0xbd, 0x0c, 0x2c, 0xb7, 0xbc, 0xba, 0x9c, 0x2f, 0xbd, + 0x9c, 0x53, 0x4f, 0x3d, 0xf5, 0x5f, 0xe6, 0x3c, 0x60, 0x8e, 0x1f, 0x3b, 0xa6, + 0x27, 0x4a, 0xbd, 0xe5, 0x82, 0x9b, 0x3c, 0xb7, 0xe1, 0x84, 0x3d, 0x13, 0x34, + 0x34, 0xbc, 0x58, 0xca, 0x09, 0x3d, 0xe2, 0x9f, 0x70, 0x3d, 0x7b, 0x73, 0xa1, + 0xbc, 0xdb, 0x26, 0x08, 0xbd, 0xc0, 0x46, 0xce, 0xba, 0xfc, 0xde, 0xe1, 0x3c, + 0xf5, 0xd5, 0xbc, 0x3c, 0x03, 0x9b, 0x16, 0x3d, 0x61, 0xda, 0x16, 0xbd, 0x9c, + 0x34, 0x15, 0xbd, 0x6c, 0xae, 0x50, 0xbd, 0xc0, 0x47, 0x89, 0xbd, 0xf0, 0xff, + 0x52, 0x3d, 0xa2, 0xf2, 
0x01, 0x3d, 0x7c, 0x68, 0x1a, 0x3d, 0x70, 0x77, 0x58, + 0xbd, 0x62, 0xb8, 0xb3, 0x3c, 0xd8, 0x2e, 0x07, 0xbc, 0xe6, 0x32, 0x8b, 0x3d, + 0x6b, 0xa2, 0x53, 0x3d, 0x12, 0xfa, 0x55, 0xbd, 0x7d, 0x83, 0x28, 0x3d, 0x92, + 0xa8, 0x73, 0xbd, 0xd5, 0xd5, 0x9c, 0x3c, 0xe5, 0x93, 0x83, 0x3c, 0xf9, 0xc8, + 0xb3, 0xbc, 0xfb, 0x27, 0x78, 0xbd, 0xa6, 0x7d, 0x5b, 0x3d, 0x9c, 0x51, 0x4d, + 0x3d, 0x25, 0x60, 0x4b, 0x3d, 0xba, 0x91, 0x96, 0xb9, 0xd7, 0xaf, 0xc3, 0x3c, + 0x34, 0x25, 0x3c, 0x3d, 0x3a, 0x04, 0x3a, 0x3d, 0x86, 0xb2, 0x30, 0x3c, 0x90, + 0xcf, 0x46, 0x3d, 0x96, 0xee, 0xe2, 0xbc, 0x9c, 0x30, 0xa7, 0x3c, 0x56, 0xe3, + 0x5a, 0xbd, 0x2f, 0xb6, 0x23, 0x3d, 0xda, 0x3e, 0x3c, 0xbd, 0x6e, 0xa0, 0x5c, + 0x3d, 0x28, 0xe0, 0x6e, 0xbd, 0x1a, 0x52, 0x34, 0x3d, 0xb8, 0xcd, 0x27, 0xbc, + 0x4a, 0xb4, 0x22, 0x3d, 0x1c, 0xd7, 0x64, 0xbc, 0x8f, 0xd9, 0x1d, 0xbd, 0xa2, + 0x1e, 0x17, 0x3d, 0x78, 0xed, 0xe2, 0x3c, 0x82, 0x5e, 0x0d, 0x3c, 0x93, 0x9d, + 0x58, 0xbd, 0x35, 0x43, 0x8a, 0xbd, 0xbd, 0xa6, 0xdf, 0x3c, 0x11, 0xc3, 0x3b, + 0x3d, 0x6c, 0xad, 0x58, 0xbd, 0x2e, 0x39, 0x1f, 0x3d, 0x45, 0x7d, 0x00, 0x3a, + 0xa9, 0xb2, 0x5b, 0x3d, 0x00, 0x38, 0x81, 0x38, 0xaa, 0x9f, 0xc9, 0x3a, 0xaa, + 0x79, 0x73, 0xbd, 0x39, 0x7b, 0xf7, 0x3b, 0xc4, 0x9f, 0x4e, 0xbd, 0xa1, 0x0c, + 0x64, 0x3a, 0x9b, 0x06, 0x5f, 0xbd, 0x32, 0x21, 0x6d, 0xbd, 0xbe, 0x94, 0x4e, + 0x3d, 0x7c, 0x40, 0xf9, 0x3c, 0xc8, 0xac, 0xca, 0x3c, 0x30, 0x76, 0x50, 0xbd, + 0x08, 0x66, 0x93, 0xbd, 0x0b, 0x4c, 0xb9, 0x3c, 0x8e, 0xef, 0x26, 0x3d, 0xe3, + 0x00, 0x68, 0x3d, 0x51, 0x3a, 0x84, 0xbd, 0x54, 0xac, 0xb3, 0xbc, 0x95, 0x17, + 0x91, 0xbd, 0x04, 0xf2, 0x31, 0x3d, 0x48, 0xbb, 0x20, 0x3c, 0xf3, 0x82, 0x88, + 0xbd, 0xdd, 0x5e, 0x4e, 0xbd, 0x95, 0x9e, 0x45, 0xbd, 0x62, 0xce, 0x51, 0xbd, + 0xa3, 0x8b, 0x3b, 0x3d, 0x40, 0xdb, 0x85, 0x3d, 0x33, 0xdc, 0xc1, 0xbc, 0xa7, + 0xb6, 0x7d, 0xbd, 0xd3, 0x99, 0x40, 0xbc, 0x6b, 0x63, 0x18, 0x3d, 0x73, 0x2f, + 0x63, 0xbc, 0xf8, 0xa2, 0x4a, 0xbc, 0xa5, 0x0b, 0x76, 0x3d, 0xd5, 0x88, 0x79, + 0x3d, 0x97, 0x41, 0x98, 0x3c, 0xe8, 0x20, 0x16, 0x3d, 0xcc, 0x47, 0x78, 0xbd, + 0xfd, 0x9a, 0xae, 0x3c, 0xf2, 0xe2, 0x8a, 0xbd, 0x07, 0xd1, 0x19, 0x3d, 0xd4, + 0xef, 0x68, 0xbc, 0x82, 0x5d, 0x51, 0x3d, 0x0c, 0x61, 0xc8, 0xba, 0xc1, 0xd5, + 0x36, 0xbd, 0xf2, 0x3c, 0x1d, 0x3d, 0x86, 0xdf, 0x65, 0x3d, 0x04, 0x4c, 0x87, + 0x3d, 0xe9, 0x46, 0x91, 0x3d, 0xc0, 0x63, 0x33, 0xbc, 0x7c, 0xd0, 0xbf, 0x3c, + 0xe8, 0xfe, 0x55, 0xbd, 0x18, 0x50, 0x53, 0x3c, 0x51, 0x99, 0xb0, 0xbb, 0x50, + 0x90, 0xec, 0x3b, 0x3d, 0x3a, 0x69, 0xbd, 0x6e, 0x49, 0x09, 0xbc, 0x74, 0x12, + 0xde, 0xbc, 0xad, 0x0c, 0x87, 0x3c, 0x35, 0x8f, 0x41, 0x3d, 0x5e, 0xa8, 0x3b, + 0xbd, 0x28, 0x85, 0x61, 0x3d, 0xfe, 0xb2, 0xe1, 0x3b, 0xec, 0xbb, 0x0e, 0x3d, + 0x04, 0xe3, 0x05, 0x3d, 0x10, 0xeb, 0x07, 0xbd, 0x63, 0x3a, 0x68, 0x3d, 0x55, + 0x9c, 0x49, 0x3b, 0x58, 0xdc, 0x62, 0x3d, 0x33, 0x78, 0x03, 0x3d, 0x0f, 0xc8, + 0x7a, 0xbd, 0xa3, 0x94, 0x83, 0xbd, 0xf7, 0x86, 0x5d, 0xbd, 0xcb, 0xd6, 0x82, + 0x3d, 0xcb, 0x78, 0x82, 0xbd, 0xcb, 0x8b, 0x46, 0xbc, 0x44, 0xff, 0x75, 0xbd, + 0x63, 0xc6, 0x48, 0x3d, 0x50, 0x1b, 0x14, 0xbc, 0x57, 0xd1, 0xe1, 0x3c, 0x60, + 0xa8, 0xe2, 0x3c, 0x00, 0xa0, 0xf8, 0xb9, 0x9c, 0x9f, 0x24, 0x3d, 0x10, 0x2c, + 0x4a, 0x3c, 0x90, 0xdf, 0xbc, 0xbc, 0x9e, 0xae, 0xa4, 0xbc, 0xf7, 0x31, 0x66, + 0xbd, 0x1e, 0x83, 0x14, 0x3c, 0x9b, 0xaa, 0x91, 0x3b, 0x91, 0x24, 0x11, 0xbd, + 0x54, 0x0b, 0x90, 0x3b, 0x30, 0xa4, 0x64, 0x3d, 0x69, 0xa8, 0x81, 0x3d, 0x5e, + 0x35, 0x03, 0xbb, 0xcc, 0xce, 0xa6, 0x3c, 0x2f, 0x18, 0xfd, 0xbc, 0x50, 0x81, + 0xe2, 0xbb, 0x40, 0x4b, 0x16, 0x3d, 0xc0, 0x66, 0x63, 
0xbd, 0x5f, 0xcd, 0x9b, + 0xbc, 0x2f, 0xf8, 0x25, 0xbd, 0xa0, 0x4d, 0x7a, 0x3c, 0x81, 0x0c, 0x5a, 0xbd, + 0x54, 0xa9, 0x6a, 0x3d, 0xc0, 0x3b, 0x3c, 0xbd, 0xb4, 0x63, 0xfb, 0x3c, 0x26, + 0x9c, 0x11, 0x3d, 0x06, 0xea, 0xa3, 0xbc, 0x3f, 0x44, 0x92, 0xbc, 0x00, 0x88, + 0x6f, 0x3b, 0xd8, 0x6f, 0x36, 0xbd, 0xe0, 0xad, 0x89, 0x3d, 0x52, 0xfb, 0x72, + 0x3d, 0x64, 0x05, 0x64, 0xbc, 0xd7, 0x2a, 0x57, 0xbd, 0x02, 0x49, 0xad, 0xbc, + 0x38, 0xf1, 0x2d, 0xbd, 0x8a, 0x2e, 0x8b, 0x3d, 0x39, 0x44, 0x12, 0xbd, 0xfc, + 0xa0, 0xb8, 0xbc, 0x32, 0x17, 0x8a, 0xbd, 0x7e, 0xbf, 0x6b, 0x3d, 0x32, 0x76, + 0xad, 0xbc, 0xb0, 0x21, 0x58, 0x3d, 0x62, 0xf5, 0x59, 0x3d, 0xb3, 0x5f, 0x98, + 0x3c, 0xa4, 0x02, 0x2c, 0x3b, 0x59, 0x69, 0x97, 0xbd, 0x70, 0xcf, 0x91, 0x3b, + 0x6b, 0xc3, 0x47, 0xbd, 0x10, 0xfe, 0xd4, 0xbc, 0x08, 0x93, 0xd1, 0x3b, 0xf5, + 0xe9, 0x14, 0xbd, 0x9a, 0x9c, 0x7b, 0x3d, 0x15, 0x75, 0x54, 0x3d, 0x09, 0xbf, + 0x57, 0xbc, 0xbf, 0x09, 0x29, 0xbb, 0xf5, 0x6d, 0x91, 0xbd, 0xb8, 0x41, 0xbd, + 0x3c, 0x80, 0x60, 0x6e, 0x3c, 0xab, 0xf2, 0x4f, 0xbd, 0x81, 0x36, 0x79, 0x3d, + 0x6a, 0x5a, 0x85, 0xbd, 0xf2, 0xac, 0x36, 0x3d, 0x92, 0x7c, 0xc0, 0xbc, 0x00, + 0x12, 0x06, 0x3c, 0xfe, 0x9c, 0x66, 0x3d, 0xa0, 0xf3, 0xbb, 0xbb, 0x37, 0xb0, + 0x74, 0xbd, 0x18, 0xb1, 0x10, 0xbd, 0x82, 0xd7, 0xe2, 0xbc, 0x87, 0xee, 0x14, + 0x3d, 0xe9, 0x2a, 0x40, 0xbd, 0xe3, 0x0d, 0x53, 0x3c, 0x5c, 0x02, 0x93, 0x3c, + 0x25, 0x0f, 0x49, 0xbd, 0x88, 0xd8, 0x3f, 0x3d, 0x58, 0xf0, 0x39, 0xbd, 0xe3, + 0x0a, 0x3b, 0xbd, 0xeb, 0x61, 0x01, 0x3d, 0xb4, 0xa0, 0x6b, 0xbd, 0x1d, 0x4b, + 0x90, 0xbd, 0xb2, 0x31, 0x34, 0xbd, 0xaa, 0x20, 0xad, 0x3a, 0xd5, 0x1e, 0x3a, + 0xbd, 0xf4, 0x05, 0x38, 0x3d, 0x1b, 0xb2, 0x46, 0xbc, 0x2c, 0xd7, 0x3e, 0x3d, + 0xec, 0x98, 0xc7, 0x3c, 0xe7, 0xd3, 0x21, 0xbd, 0x07, 0x35, 0x60, 0xbd, 0x2b, + 0xb9, 0xfd, 0xbc, 0x9b, 0x69, 0x36, 0x3d, 0xdf, 0xdf, 0x6f, 0xbd, 0x5a, 0x80, + 0x81, 0xbd, 0x9b, 0x67, 0xf2, 0x3b, 0x20, 0x94, 0xde, 0xbb, 0xc5, 0xfc, 0x29, + 0xbd, 0x0c, 0x34, 0x30, 0xbd, 0x50, 0xbb, 0xc9, 0xbc, 0x92, 0x32, 0x93, 0xbc, + 0x12, 0xf9, 0x69, 0xbd, 0x1c, 0x84, 0x3a, 0xbc, 0x88, 0x93, 0x84, 0xbd, 0x07, + 0x7e, 0xb5, 0x3c, 0xe6, 0xb8, 0x4a, 0x3d, 0xde, 0x7c, 0x55, 0x3d, 0x16, 0x69, + 0xf0, 0xbc, 0x91, 0x57, 0x5b, 0xbd, 0xa2, 0x4a, 0x26, 0x3d, 0x5b, 0xdc, 0xaf, + 0xba, 0xe8, 0x30, 0xe1, 0xbc, 0xf8, 0x97, 0x21, 0x3d, 0x00, 0x3e, 0x11, 0x3c, + 0x92, 0x1c, 0xb1, 0xbc, 0xce, 0x5f, 0xa3, 0x3c, 0x2d, 0x13, 0x88, 0xbd, 0xbc, + 0x64, 0xbc, 0x3c, 0xd1, 0x47, 0x97, 0xbb, 0xf2, 0x46, 0x55, 0x3d, 0x70, 0x6e, + 0x09, 0x3d, 0x6b, 0x66, 0x93, 0xbd, 0x26, 0xf4, 0xcb, 0xbc, 0x59, 0xb5, 0x84, + 0xbc, 0x13, 0x19, 0x8d, 0x3d, 0x35, 0xf3, 0x3e, 0xbc, 0x9d, 0xf8, 0x78, 0x3d, + 0x75, 0x6d, 0x4f, 0x3d, 0xd4, 0x8a, 0xd7, 0x3c, 0x74, 0x49, 0x0d, 0xbd, 0x40, + 0x3d, 0xcd, 0x3a, 0xa2, 0xb6, 0x64, 0x3d, 0x73, 0xc5, 0x90, 0x3d, 0x5b, 0x4e, + 0x85, 0xbd, 0xf6, 0x1b, 0x64, 0x3d, 0x15, 0x44, 0xbf, 0xbc, 0x4c, 0xb6, 0x0e, + 0x3d, 0xaf, 0x91, 0x06, 0xbc, 0xa0, 0xc6, 0xdf, 0x3c, 0xb7, 0xb5, 0x66, 0x3d, + 0x23, 0x0d, 0x68, 0xbd, 0xcf, 0x9f, 0xe9, 0xbc, 0xcd, 0xa5, 0x1f, 0xbd, 0x92, + 0x3c, 0x5b, 0x3d, 0x0c, 0x92, 0x57, 0x3d, 0x73, 0xa2, 0x2e, 0xbd, 0x4a, 0xeb, + 0x23, 0xbc, 0x6b, 0xa1, 0x3c, 0xba, 0xd2, 0x19, 0xbb, 0xbc, 0x44, 0x55, 0x29, + 0xbd, 0xcd, 0x07, 0x34, 0xbd, 0xbf, 0xaa, 0xf9, 0xba, 0x18, 0x7b, 0x8a, 0xbc, + 0x4a, 0xe1, 0x5d, 0x3d, 0x28, 0x1b, 0x38, 0x3c, 0xfd, 0x1b, 0xd0, 0x3b, 0xdd, + 0x1c, 0x92, 0xbb, 0xf4, 0x64, 0x31, 0x3c, 0x82, 0x22, 0x44, 0x3d, 0x22, 0xd5, + 0x0c, 0xbd, 0x63, 0x1f, 0x24, 0xbd, 0xd0, 0xe3, 0x03, 0x3c, 0xfc, 0x32, 0x22, + 0xbc, 
0x26, 0x4e, 0xba, 0xbc, 0xf2, 0x18, 0xa8, 0xbc, 0x1d, 0xb1, 0x43, 0xbc, + 0x4b, 0x52, 0x17, 0xbd, 0xe1, 0xf7, 0x05, 0x3d, 0xdb, 0xfb, 0xd9, 0x3c, 0x0b, + 0x58, 0x8e, 0xbc, 0xc1, 0x1f, 0x81, 0x3d, 0xa0, 0x6f, 0x36, 0xbd, 0x52, 0xec, + 0x57, 0xbd, 0x6a, 0x3b, 0x06, 0xbd, 0xb5, 0x5b, 0x9c, 0xbc, 0x08, 0xb1, 0x32, + 0xbc, 0xc0, 0xde, 0x85, 0xbd, 0x2d, 0xd5, 0xd2, 0x3c, 0xa6, 0x1d, 0x14, 0xbc, + 0x8d, 0x5e, 0xd8, 0x3c, 0x83, 0x8e, 0xcf, 0xbc, 0xa0, 0xc2, 0x83, 0xbd, 0xce, + 0x5f, 0x3b, 0xbd, 0x60, 0xbc, 0x7d, 0xbc, 0x8e, 0x9c, 0x7f, 0xbd, 0xb3, 0x61, + 0x0b, 0xbd, 0x1c, 0x2b, 0xc9, 0x3c, 0xbc, 0xb7, 0x6f, 0x3c, 0x61, 0x58, 0xda, + 0xbc, 0xcc, 0x72, 0x23, 0x3c, 0x28, 0x64, 0x61, 0x3c, 0x5a, 0x19, 0x42, 0x3d, + 0xb0, 0x39, 0x13, 0x3c, 0xe6, 0x3a, 0xf7, 0xbc, 0xc4, 0xaf, 0xc4, 0x3c, 0xd2, + 0x14, 0xd0, 0xbc, 0x1a, 0x00, 0xb8, 0xbc, 0xf9, 0x9e, 0x23, 0xbd, 0xdf, 0x82, + 0x6a, 0xbd, 0x7a, 0xc2, 0x18, 0xbc, 0xbf, 0xb0, 0x11, 0xbc, 0x2d, 0x48, 0x5b, + 0xbd, 0xff, 0xff, 0x46, 0x3c, 0x6c, 0x6c, 0x36, 0x3c, 0xec, 0x21, 0x8a, 0xbd, + 0x02, 0x85, 0xe0, 0x3c, 0xdf, 0x2e, 0x42, 0xbd, 0xf0, 0xa5, 0x24, 0x3d, 0x0a, + 0xd1, 0x00, 0x3d, 0x58, 0x44, 0xb3, 0x3c, 0xc9, 0xe4, 0x33, 0x39, 0xba, 0x0f, + 0xb9, 0xbc, 0xba, 0x18, 0x64, 0x3c, 0x9e, 0xc4, 0x50, 0xbc, 0x5f, 0x96, 0x4c, + 0x3d, 0xbc, 0xdc, 0x61, 0x3d, 0xba, 0xaf, 0x38, 0x3d, 0xf1, 0x21, 0x89, 0x3d, + 0x60, 0x95, 0x05, 0x3c, 0xc6, 0xb2, 0x6e, 0xbc, 0x5f, 0x2d, 0x21, 0xbd, 0xee, + 0x52, 0x23, 0x3d, 0x3c, 0xc0, 0x1d, 0xbc, 0x3e, 0xcd, 0x84, 0x3d, 0x00, 0xc5, + 0xa8, 0x39, 0x06, 0x5b, 0x4a, 0xbd, 0xec, 0x4b, 0x1b, 0xbd, 0x05, 0x4c, 0x17, + 0xbd, 0x18, 0x01, 0x56, 0x3c, 0xcd, 0x05, 0x87, 0xbd, 0xe4, 0x37, 0x41, 0xbc, + 0xdc, 0x36, 0x84, 0x3d, 0xa1, 0xd7, 0x09, 0x3d, 0x44, 0xf4, 0x63, 0xbd, 0x56, + 0x62, 0x78, 0xbd, 0x12, 0x57, 0x3b, 0xbd, 0x43, 0xcd, 0x71, 0xbb, 0xa3, 0xf6, + 0x10, 0x3d, 0x3a, 0x9f, 0xff, 0xbc, 0x6f, 0xdd, 0x8d, 0x3d, 0xb3, 0xd7, 0x08, + 0xbd, 0x3e, 0x97, 0x76, 0x3d, 0x99, 0x60, 0x02, 0xbd, 0x08, 0x27, 0x8d, 0x3d, + 0xf1, 0x51, 0x29, 0x3d, 0x48, 0x9d, 0xfe, 0x3c, 0x97, 0xb9, 0x72, 0xbd, 0x35, + 0x21, 0xab, 0xbc, 0xc3, 0x96, 0x69, 0x3c, 0x05, 0x44, 0x05, 0x3d, 0x80, 0x79, + 0x75, 0x3a, 0x94, 0x62, 0xfe, 0x3b, 0x47, 0xb4, 0x64, 0x3c, 0xbb, 0x50, 0x29, + 0xbd, 0xe9, 0xb8, 0x6e, 0xbd, 0x2e, 0xab, 0x26, 0xbc, 0x54, 0x42, 0xb6, 0xbc, + 0x08, 0xdb, 0x22, 0xbd, 0xae, 0x42, 0x78, 0x3d, 0x3c, 0xba, 0x2c, 0xbc, 0x46, + 0xf1, 0x6e, 0x3d, 0xed, 0xb1, 0x88, 0xbd, 0x96, 0x2c, 0x75, 0x3d, 0x26, 0x69, + 0x90, 0xbd, 0x9b, 0x7b, 0x77, 0xbc, 0x9a, 0xbc, 0x05, 0xbd, 0x85, 0xb1, 0x19, + 0xbd, 0xb8, 0x33, 0x8b, 0xbd, 0xfa, 0xa3, 0x8b, 0xbc, 0xc6, 0x36, 0xf2, 0x3c, + 0x4e, 0x81, 0xa2, 0xbc, 0xa7, 0x85, 0x73, 0xbd, 0xca, 0xe5, 0x93, 0xbc, 0xc8, + 0x3d, 0x0e, 0x3d, 0x75, 0x3c, 0x00, 0xbd, 0x28, 0x32, 0x0e, 0x3d, 0x8f, 0x29, + 0x04, 0xbc, 0x0c, 0x29, 0x37, 0xbd, 0x47, 0x11, 0x83, 0xbd, 0x82, 0x57, 0x2a, + 0xbd, 0x45, 0x1f, 0x6b, 0xbc, 0x66, 0xaf, 0x7d, 0xbd, 0xa8, 0x5a, 0x25, 0xbd, + 0x96, 0xc0, 0x14, 0x3b, 0xba, 0xf0, 0x1b, 0xbd, 0xe0, 0x71, 0x44, 0xbb, 0x9c, + 0x09, 0xb9, 0xbc, 0x45, 0xda, 0x77, 0x3c, 0x2b, 0x5d, 0x80, 0x3d, 0xaa, 0xf0, + 0x21, 0x3d, 0xa0, 0x25, 0x31, 0x3d, 0x34, 0xc8, 0x3b, 0xbd, 0x90, 0x50, 0xf6, + 0xbc, 0x53, 0xed, 0x04, 0x3a, 0x26, 0xf8, 0x6e, 0x3d, 0x6d, 0x73, 0x0f, 0x3d, + 0xe8, 0xac, 0x43, 0x3d, 0xf1, 0x03, 0x8a, 0x3c, 0xc4, 0x94, 0x3d, 0x3d, 0x3c, + 0x89, 0x8b, 0x3d, 0x62, 0x99, 0x0f, 0x3d, 0xb6, 0x30, 0x8d, 0x3c, 0xfa, 0x8f, + 0x25, 0x3c, 0x4c, 0x45, 0xd2, 0xbc, 0x00, 0x5d, 0xc0, 0x3c, 0xae, 0x8d, 0x6c, + 0xbd, 0xcb, 0xa3, 0x92, 0xbd, 0xc4, 
0x1e, 0xbb, 0xbc, 0x63, 0xf8, 0xaa, 0x3c, + 0xd7, 0x7c, 0x81, 0x3d, 0xbf, 0x33, 0x41, 0x3c, 0x80, 0x59, 0x69, 0xbb, 0x0a, + 0x75, 0x37, 0xbd, 0x29, 0xdc, 0x1b, 0xbd, 0x10, 0x1f, 0x46, 0xbd, 0xee, 0xb4, + 0x5d, 0x3d, 0xfa, 0x40, 0x95, 0xbd, 0x02, 0xd8, 0x19, 0xbd, 0xa8, 0xd0, 0xf0, + 0xbc, 0x0a, 0xb8, 0xc4, 0x3c, 0x68, 0xa8, 0x11, 0xbd, 0x24, 0x4f, 0x3e, 0x3d, + 0x39, 0x99, 0x90, 0xbd, 0x7c, 0x43, 0x13, 0xbd, 0x86, 0xe5, 0x8f, 0xbd, 0xa4, + 0x16, 0xb4, 0xbc, 0xa0, 0xe9, 0xf2, 0x3c, 0x91, 0x68, 0x5d, 0xbd, 0x51, 0x92, + 0x85, 0x3d, 0xd2, 0x4d, 0x35, 0xbd, 0xc7, 0x44, 0x3e, 0xbd, 0x20, 0xf6, 0xe0, + 0x3c, 0x6b, 0x38, 0x35, 0x3d, 0xd2, 0x2b, 0x2a, 0xbb, 0xc8, 0xbf, 0x0c, 0xbd, + 0xec, 0xd6, 0xfc, 0x3b, 0x1c, 0xae, 0xa9, 0xbc, 0x28, 0x65, 0xb3, 0x3c, 0xdf, + 0x29, 0x98, 0xbc, 0x11, 0x52, 0xbd, 0x3c, 0x4d, 0x7d, 0xac, 0x3c, 0x95, 0xcb, + 0x09, 0xbc, 0xc5, 0xc5, 0xf8, 0xbc, 0xe6, 0x99, 0x3f, 0x3c, 0xb0, 0x51, 0xfd, + 0xbc, 0x88, 0x6b, 0xe0, 0xbc, 0xaa, 0x84, 0x83, 0xbd, 0x98, 0x79, 0x8d, 0x3c, + 0xda, 0x5f, 0xf2, 0x3c, 0xb3, 0xcc, 0x7a, 0x3d, 0xc9, 0x55, 0x08, 0x3d, 0xd1, + 0x83, 0x33, 0x3d, 0x6c, 0xc1, 0x66, 0xbc, 0x80, 0xf9, 0x62, 0xba, 0xe4, 0xd5, + 0x88, 0xbd, 0x60, 0x31, 0xd2, 0xbc, 0x2b, 0x89, 0x86, 0x3d, 0x1b, 0x1e, 0x53, + 0xbd, 0xfa, 0x0c, 0x07, 0xbd, 0x50, 0xe8, 0xb5, 0xbc, 0x4f, 0xc6, 0x65, 0xbd, + 0xef, 0x09, 0x75, 0xbd, 0xd5, 0x47, 0x0c, 0xbd, 0xcc, 0x4e, 0x89, 0xbd, 0x9c, + 0x69, 0xe3, 0x3c, 0x52, 0xea, 0x9d, 0xbc, 0x01, 0x0e, 0x86, 0xbc, 0x2a, 0x61, + 0x72, 0xbd, 0x85, 0xbc, 0x87, 0x3d, 0x21, 0xf7, 0x42, 0x3d, 0x0b, 0x60, 0x23, + 0xbd, 0x0f, 0x0f, 0xed, 0xbc, 0x7d, 0x05, 0xd2, 0xbc, 0x6e, 0x5e, 0x5f, 0xbd, + 0x36, 0x52, 0x92, 0xbd, 0x7e, 0x96, 0x05, 0xbb, 0x6e, 0x51, 0x98, 0x3a, 0xe5, + 0x11, 0x19, 0xbd, 0x00, 0xcf, 0x84, 0xbb, 0x61, 0x5e, 0xed, 0x3c, 0x60, 0xcf, + 0x50, 0xbb, 0xce, 0xbe, 0x07, 0x3c, 0x5c, 0x81, 0x20, 0x3d, 0x45, 0x85, 0xf6, + 0xbc, 0x1d, 0xb7, 0x91, 0x3d, 0x38, 0x08, 0x59, 0x3c, 0x28, 0x93, 0x4b, 0x3d, + 0x3a, 0xc4, 0x87, 0xbd, 0x44, 0x7f, 0x04, 0xbd, 0xdd, 0x17, 0x81, 0x3d, 0xbe, + 0x94, 0x48, 0x3d, 0x88, 0x6a, 0xce, 0xba, 0x93, 0x5b, 0x20, 0x3d, 0xab, 0x05, + 0x90, 0xbd, 0xf9, 0x71, 0xc4, 0x3c, 0x6c, 0xd4, 0x7a, 0x3d, 0x4a, 0x2d, 0x20, + 0x3d, 0x94, 0xd7, 0x88, 0x3d, 0x82, 0xb5, 0x87, 0xbd, 0x55, 0x15, 0xec, 0x3b, + 0xc0, 0x09, 0xe4, 0xba, 0x31, 0x50, 0xfc, 0x3c, 0x25, 0x49, 0x6e, 0x3c, 0x5c, + 0x79, 0x92, 0xbc, 0xed, 0xab, 0x14, 0xbd, 0x24, 0x3e, 0xaa, 0x3c, 0x98, 0x43, + 0x58, 0x3d, 0x2f, 0x00, 0x62, 0x3d, 0x3c, 0x09, 0x2d, 0x3d, 0xe3, 0x27, 0x85, + 0x3c, 0x7a, 0x37, 0x06, 0x3d, 0x49, 0xe6, 0x62, 0xbd, 0x71, 0x53, 0x94, 0xbd, + 0xc4, 0xeb, 0xd0, 0xbb, 0xd8, 0xed, 0x11, 0x3c, 0xfe, 0x75, 0x8c, 0xbc, 0xc4, + 0xeb, 0x16, 0xbd, 0xb8, 0xb8, 0xf7, 0x3c, 0x30, 0x85, 0xaa, 0xbb, 0xcb, 0x9f, + 0x16, 0xbd, 0x1d, 0xed, 0x8d, 0x3d, 0x0f, 0xf3, 0x08, 0xbd, 0x8e, 0x3c, 0x13, + 0x3d, 0xc4, 0x04, 0x74, 0x3d, 0x60, 0xeb, 0x35, 0xbd, 0xe7, 0xcf, 0x38, 0x3d, + 0x12, 0xde, 0xaf, 0x3c, 0xca, 0x71, 0x04, 0x3d, 0x1c, 0xd8, 0xeb, 0x3c, 0xc6, + 0xfc, 0xb3, 0x3c, 0xa0, 0x37, 0x5a, 0x3d, 0xbe, 0xcc, 0x59, 0x3c, 0x4c, 0x95, + 0x9a, 0xbc, 0xa6, 0xff, 0xa8, 0x3b, 0xcd, 0x7d, 0x7d, 0xbd, 0x5c, 0xe7, 0xba, + 0x3c, 0xf9, 0x97, 0x02, 0xbd, 0x3a, 0xd3, 0x80, 0xbd, 0xcd, 0xbe, 0x97, 0xbd, + 0x3b, 0x0d, 0x35, 0xba, 0x76, 0x27, 0x44, 0x3d, 0x63, 0xae, 0x8a, 0x3d, 0x03, + 0x4c, 0x68, 0xbd, 0xe5, 0x9d, 0x0f, 0xbc, 0x6f, 0x5d, 0x45, 0xbb, 0x48, 0x3a, + 0x74, 0x3d, 0x85, 0xfa, 0x37, 0xbd, 0x31, 0xf5, 0x1c, 0x3d, 0x0b, 0x19, 0x52, + 0xbd, 0x00, 0xcd, 0x9e, 0xb9, 0xdb, 0xe5, 0x84, 0xbd, 0x83, 0xf1, 
0x7f, 0xbd, + 0xb7, 0x44, 0x63, 0xbd, 0x44, 0x0a, 0x98, 0xbd, 0x60, 0xd8, 0x23, 0xbb, 0xd1, + 0x69, 0x61, 0xbd, 0x71, 0x41, 0x5a, 0xbd, 0x2f, 0xd9, 0x70, 0xbd, 0xc3, 0xb8, + 0xd3, 0x3c, 0x38, 0xa7, 0x99, 0x3c, 0xe0, 0xa0, 0x21, 0xbd, 0xd2, 0x90, 0xa8, + 0xb8, 0xff, 0xae, 0x32, 0x3c, 0x65, 0x1a, 0x0d, 0x3d, 0xa6, 0xd0, 0x39, 0xbd, + 0xdd, 0xb4, 0x18, 0xbd, 0xb0, 0xa0, 0xbc, 0x3c, 0xa0, 0xe4, 0x8b, 0x3d, 0x90, + 0xe6, 0x25, 0x3d, 0x7c, 0x20, 0x5d, 0x3d, 0x74, 0x50, 0xda, 0xbb, 0x4a, 0xe0, + 0x70, 0x3d, 0x02, 0x36, 0x13, 0x3d, 0xaa, 0xab, 0x05, 0xbd, 0xec, 0xda, 0x10, + 0xbd, 0xd1, 0x40, 0x35, 0xbd, 0xd2, 0x14, 0x3a, 0xbd, 0xd6, 0x7f, 0x06, 0xbd, + 0x55, 0xf8, 0x31, 0x3d, 0xea, 0xc4, 0x5c, 0x3d, 0xd6, 0x89, 0x52, 0x3d, 0x68, + 0xe6, 0x44, 0x3d, 0xd5, 0x64, 0x20, 0xbd, 0x18, 0x41, 0xc8, 0x3c, 0x10, 0xfa, + 0x44, 0x3d, 0x30, 0x39, 0x20, 0xbc, 0x27, 0x26, 0x85, 0x3d, 0x9e, 0x02, 0x48, + 0x3d, 0x59, 0xbb, 0xad, 0xbc, 0x67, 0x3c, 0xe3, 0xbc, 0xcc, 0x6e, 0x4b, 0xbd, + 0x08, 0xf9, 0x1c, 0xbd, 0x50, 0x02, 0xa8, 0x3c, 0x77, 0x8c, 0x21, 0xbd, 0x1b, + 0x8e, 0x0c, 0x3c, 0x0a, 0xe3, 0x76, 0x3d, 0x60, 0xa0, 0xa6, 0xbc, 0x30, 0x1d, + 0x2c, 0x3d, 0x89, 0xab, 0x57, 0xbd, 0x39, 0xdf, 0x8e, 0x3b, 0x4e, 0xd0, 0x81, + 0x3d, 0x6f, 0xc7, 0x0c, 0x3d, 0xb8, 0x21, 0x12, 0x3d, 0x32, 0xe6, 0x5a, 0x3d, + 0x26, 0xbf, 0x64, 0x3c, 0xa8, 0xaf, 0x35, 0x3d, 0x0e, 0x6e, 0xb4, 0xbc, 0x78, + 0x59, 0xa8, 0x3c, 0xd1, 0xca, 0x5c, 0xbd, 0x3a, 0x40, 0x53, 0x3d, 0x30, 0x50, + 0x0c, 0xbc, 0x11, 0xd3, 0x35, 0xbd, 0x06, 0x5b, 0x89, 0xbd, 0x2e, 0xe3, 0x63, + 0x3d, 0xc5, 0xdc, 0x0e, 0xbd, 0x60, 0x04, 0x2d, 0xbb, 0xae, 0xfb, 0x42, 0x3d, + 0x83, 0x52, 0xcd, 0xbc, 0x20, 0x53, 0x06, 0x3d, 0xd5, 0xc6, 0x38, 0x3c, 0xa7, + 0xa9, 0xf4, 0xbc, 0x9b, 0x2d, 0x89, 0x3d, 0x70, 0x74, 0x83, 0x3c, 0x06, 0x87, + 0xe7, 0x3b, 0x97, 0xa3, 0x92, 0x3c, 0x38, 0x5f, 0xf7, 0x3c, 0xdf, 0x71, 0x3b, + 0xbd, 0xfe, 0x14, 0x4d, 0x3d, 0x0a, 0x42, 0xb8, 0xbc, 0xb4, 0xf6, 0x2f, 0x3c, + 0x33, 0xe6, 0x94, 0xbd, 0x26, 0x39, 0x71, 0xbd, 0x10, 0xf4, 0x6e, 0xbd, 0xe4, + 0x3f, 0x09, 0xbd, 0x35, 0xe6, 0xb7, 0x3c, 0x9b, 0x3a, 0x10, 0xbd, 0x4d, 0x58, + 0x43, 0xbd, 0x3e, 0x25, 0x2c, 0xbd, 0x38, 0xdc, 0x4f, 0x3c, 0x06, 0xf5, 0xff, + 0xbc, 0x33, 0x3e, 0x81, 0xbd, 0x27, 0x99, 0x8e, 0xbb, 0x27, 0xc9, 0x68, 0xbd, + 0xce, 0x6c, 0x81, 0x3c, 0x0e, 0xab, 0x67, 0xbd, 0x50, 0x8a, 0x2f, 0x3c, 0x30, + 0x32, 0x37, 0x3d, 0x49, 0xd1, 0x0e, 0xbd, 0x60, 0xe2, 0x38, 0x3d, 0xf8, 0xd0, + 0x9f, 0x3c, 0x3e, 0x8a, 0x0d, 0x3d, 0x7e, 0x2f, 0x6a, 0xbd, 0xe8, 0x0f, 0xab, + 0x3b, 0x6e, 0x3d, 0x49, 0xbd, 0xba, 0xdd, 0x00, 0x3d, 0x80, 0x40, 0xdc, 0x3b, + 0x18, 0x06, 0x76, 0x3d, 0x48, 0xe5, 0x6d, 0x3d, 0xca, 0xcf, 0xa9, 0xbc, 0x3c, + 0xb8, 0x50, 0xbc, 0x70, 0xbf, 0x76, 0x3c, 0x0c, 0xbc, 0x1c, 0x3d, 0x59, 0x70, + 0xf3, 0xbc, 0x21, 0xaa, 0x83, 0xbc, 0xf6, 0x67, 0x4f, 0xbd, 0x86, 0xa6, 0x71, + 0x3c, 0x69, 0xd6, 0x48, 0x3c, 0x50, 0x60, 0x56, 0x3d, 0x9c, 0x25, 0x50, 0xbd, + 0x10, 0x27, 0x76, 0x3c, 0x98, 0x24, 0x7b, 0xbd, 0x6c, 0xb9, 0x01, 0xbc, 0xe6, + 0xea, 0x85, 0x3d, 0x0e, 0xa0, 0xf5, 0x3b, 0xb4, 0xb3, 0x0e, 0x3d, 0xe2, 0xc0, + 0xa1, 0x3c, 0x4c, 0x2c, 0xf6, 0xbc, 0xc8, 0x58, 0x25, 0x3c, 0xd0, 0x2c, 0xeb, + 0x3c, 0xa8, 0x0f, 0xfa, 0x3c, 0x50, 0xc1, 0xd6, 0xbb, 0x42, 0x81, 0x4d, 0xbd, + 0x37, 0x4c, 0x88, 0xbd, 0xf4, 0x1a, 0xd2, 0xbc, 0x94, 0xb7, 0xaf, 0xbb, 0xaf, + 0xeb, 0x0f, 0x3d, 0xed, 0x56, 0xa3, 0x3c, 0x5e, 0x0a, 0x87, 0x3d, 0x5c, 0x4a, + 0x64, 0xbc, 0x37, 0x90, 0x62, 0x3c, 0x57, 0xcd, 0xbb, 0x3b, 0x50, 0x0c, 0x76, + 0xbd, 0x1c, 0x48, 0x87, 0xbc, 0x38, 0x8a, 0x4e, 0x3c, 0xda, 0x2b, 0x3a, 0x3d, + 0xba, 0x1a, 0x81, 
0xbc, 0x29, 0xca, 0xba, 0x3c, 0x78, 0x39, 0x2b, 0xbd, 0xd4, + 0x80, 0xe2, 0xbb, 0x08, 0x96, 0x95, 0x3c, 0x55, 0x08, 0x50, 0x3c, 0xbd, 0xed, + 0x15, 0xbd, 0xd0, 0xeb, 0xe5, 0xbb, 0xa5, 0x5a, 0x22, 0xbc, 0x6c, 0xe7, 0x8f, + 0xbc, 0x63, 0x73, 0xb2, 0x3c, 0xc0, 0xae, 0x13, 0x3c, 0x54, 0xbd, 0x6f, 0xbd, + 0x9e, 0x5a, 0x60, 0x3d, 0x62, 0xe8, 0x34, 0x3d, 0x38, 0x91, 0x24, 0x3d, 0x10, + 0xac, 0x03, 0x3c, 0x04, 0xc0, 0x83, 0xbd, 0x16, 0x48, 0x7e, 0xbd, 0x64, 0x7a, + 0x40, 0xbc, 0x52, 0xcf, 0x4a, 0x3d, 0xa1, 0x54, 0x1f, 0xb9, 0x61, 0x19, 0x8c, + 0x3d, 0x08, 0xfa, 0x5a, 0xbd, 0x2a, 0xf5, 0x67, 0x3d, 0xb3, 0xcc, 0x12, 0xbd, + 0xc3, 0x2a, 0x65, 0x3d, 0x06, 0xbb, 0x41, 0xbd, 0xfc, 0xc0, 0x09, 0xbd, 0x2c, + 0xdf, 0xa7, 0xbc, 0xb7, 0xfe, 0x5d, 0xbd, 0xcb, 0x10, 0xa3, 0xbb, 0x75, 0xc3, + 0xcd, 0x3c, 0x2b, 0xd5, 0x0e, 0x3d, 0x11, 0x1c, 0x83, 0x3d, 0x71, 0xdc, 0xb2, + 0xbc, 0xda, 0xe1, 0x86, 0xbd, 0x39, 0xf2, 0x50, 0x3c, 0x40, 0x25, 0x50, 0x3b, + 0x18, 0x17, 0x43, 0xbc, 0x6b, 0xa6, 0x88, 0x3c, 0x60, 0x10, 0x5d, 0xbd, 0x0e, + 0x88, 0xa1, 0x3c, 0xa6, 0xd3, 0xe4, 0xbc, 0x11, 0x76, 0x88, 0xbc, 0x1e, 0x07, + 0x6c, 0x3d, 0xa6, 0x6e, 0x1b, 0x3d, 0xc0, 0x30, 0x30, 0x3d, 0xf2, 0x34, 0x8d, + 0xbd, 0xc0, 0xe2, 0x18, 0x3b, 0xce, 0xef, 0x83, 0xbc, 0xe7, 0x31, 0x0e, 0xbd, + 0xd1, 0xf1, 0x8b, 0xbd, 0xba, 0x6e, 0x3e, 0xbc, 0xc7, 0x45, 0x08, 0xbd, 0x57, + 0x7e, 0x56, 0x3d, 0x6d, 0xaf, 0x68, 0xbd, 0xef, 0x94, 0x28, 0xbd, 0x65, 0xf5, + 0xa5, 0x3c, 0xea, 0x2c, 0x43, 0xbd, 0x5c, 0xc6, 0x5d, 0x3c, 0x3e, 0x7e, 0x3f, + 0xbd, 0xd4, 0xa5, 0x7c, 0xbd, 0x14, 0x39, 0x35, 0xbd, 0xc5, 0x8a, 0x08, 0xbd, + 0x7e, 0xc0, 0x0c, 0x3d, 0x45, 0xbb, 0x84, 0x3c, 0x0d, 0x10, 0x6f, 0x39, 0x81, + 0x04, 0x4b, 0x3c, 0x5b, 0x45, 0xff, 0x3c, 0xab, 0xd1, 0x74, 0xbd, 0x98, 0x8a, + 0x38, 0x3c, 0xe3, 0xc7, 0xa9, 0x3c, 0x8b, 0x12, 0x7f, 0xbd, 0x6f, 0xb7, 0xc5, + 0x3a, 0x95, 0x7e, 0xaf, 0x3c, 0x50, 0xc8, 0xc5, 0x3b, 0xf9, 0x02, 0x89, 0xbd, + 0x6e, 0x63, 0xa2, 0xbc, 0x0c, 0x74, 0x32, 0x3d, 0xea, 0x32, 0x79, 0x3d, 0x0e, + 0x34, 0x91, 0xbd, 0xa1, 0x87, 0xec, 0xbc, 0x1c, 0xd4, 0x17, 0x3d, 0xe1, 0xb0, + 0x74, 0x3d, 0xe9, 0x8e, 0xc6, 0x3c, 0x8a, 0x62, 0x55, 0xbc, 0x51, 0x37, 0x95, + 0xbd, 0x2b, 0xc8, 0xbd, 0xbc, 0x8e, 0xe4, 0xef, 0xbc, 0x11, 0x49, 0x0d, 0x3d, + 0xe8, 0xcc, 0x16, 0x3d, 0xc6, 0xa8, 0xc8, 0x3c, 0x98, 0x01, 0x88, 0x3c, 0xbd, + 0x8e, 0x46, 0xbd, 0xab, 0x7d, 0xd4, 0xbc, 0x7a, 0xde, 0xb6, 0xbc, 0xf9, 0x44, + 0xcd, 0xbc, 0xad, 0xae, 0x13, 0xbc, 0x8d, 0xb5, 0x21, 0xbd, 0x48, 0xfb, 0x05, + 0xbc, 0x1d, 0x6d, 0x84, 0x3d, 0x4c, 0x32, 0x8a, 0x3c, 0xa8, 0xe9, 0x69, 0x3c, + 0xa6, 0xba, 0x1b, 0xbd, 0xe5, 0xfa, 0x12, 0x3d, 0xea, 0xea, 0x11, 0x3d, 0xa4, + 0xa1, 0x10, 0xbd, 0x0c, 0x0e, 0xad, 0x3d, 0x04, 0xeb, 0x1c, 0xbd, 0xe5, 0x6d, + 0x0f, 0xbd, 0x1e, 0x40, 0xea, 0x3d, 0xfa, 0xc5, 0x36, 0x3d, 0x7a, 0xd3, 0x34, + 0xbd, 0xe2, 0xe5, 0x4b, 0xbd, 0x27, 0x35, 0xf0, 0xbd, 0x60, 0x53, 0xc6, 0xbc, + 0xb4, 0x7c, 0x0b, 0xbd, 0x0c, 0xc1, 0xbd, 0x39, 0x4b, 0xfb, 0x67, 0x3c, 0x4c, + 0x65, 0xc4, 0x3c, 0x23, 0x9d, 0x88, 0x3c, 0x7c, 0x7e, 0xa0, 0x3b, 0x7f, 0xd2, + 0x94, 0x3b, 0x45, 0xd2, 0x24, 0x3d, 0x00, 0xd4, 0xf5, 0xbb, 0x13, 0xf0, 0x99, + 0x3d, 0xd6, 0x36, 0xa0, 0x3a, 0x28, 0xb0, 0x5d, 0x3d, 0x9f, 0xf9, 0x81, 0xbd, + 0x42, 0x4b, 0x98, 0x3d, 0x29, 0x10, 0x7d, 0x3d, 0x8e, 0xe9, 0xf5, 0xbc, 0xfb, + 0xc1, 0x91, 0xbc, 0x71, 0xda, 0xe2, 0xbc, 0x1e, 0x75, 0x3b, 0xbd, 0xbe, 0x22, + 0x2f, 0x3d, 0xfa, 0xb6, 0x27, 0xba, 0x8c, 0x36, 0x86, 0x3c, 0x45, 0x63, 0xcf, + 0xbc, 0x13, 0x05, 0x5e, 0xbc, 0xba, 0xc5, 0x24, 0xbd, 0xcd, 0x6d, 0x0b, 0x3c, + 0x5d, 0xe6, 0x00, 0x3b, 0x82, 0xbb, 0xcf, 0xbc, 
0xdb, 0x1f, 0x31, 0xbd, 0x91, + 0x32, 0x95, 0xbc, 0x81, 0xff, 0x0b, 0xba, 0xa7, 0xe4, 0x0f, 0x3d, 0x50, 0xd4, + 0x2c, 0x3d, 0x4c, 0x82, 0x27, 0x3c, 0x54, 0x76, 0x69, 0x3c, 0xef, 0x41, 0x53, + 0xbb, 0x7b, 0x88, 0x26, 0xbd, 0xfa, 0x19, 0x51, 0x3d, 0x83, 0xe9, 0x89, 0xbd, + 0x96, 0xa7, 0x4a, 0x3d, 0x87, 0xf0, 0xe6, 0xbc, 0x2b, 0x59, 0x61, 0xbc, 0x4a, + 0x9a, 0x7d, 0x3d, 0x7c, 0x95, 0x54, 0x38, 0xa6, 0x6e, 0x69, 0x3d, 0xf3, 0x84, + 0x27, 0xbd, 0x84, 0x7f, 0x26, 0x3c, 0xc3, 0xe1, 0x58, 0x3b, 0xa7, 0x2d, 0xa5, + 0x3d, 0x13, 0x70, 0x2a, 0xbd, 0xae, 0x66, 0x1f, 0x3d, 0x6d, 0x44, 0xff, 0xbc, + 0x66, 0x10, 0xb2, 0x3c, 0x94, 0xd5, 0x98, 0xb9, 0x00, 0xc8, 0xef, 0x3d, 0x5c, + 0x00, 0x2f, 0xbc, 0xd7, 0xb1, 0xf6, 0x3c, 0x1b, 0xdb, 0xe1, 0x3c, 0xaa, 0x78, + 0xe0, 0x3c, 0xb5, 0xe8, 0xd1, 0x3c, 0xda, 0x9e, 0x39, 0xbc, 0xe4, 0x90, 0x84, + 0xbc, 0x42, 0x92, 0x6f, 0xbd, 0xdd, 0xd7, 0x8a, 0x3d, 0xd3, 0x62, 0x90, 0x3c, + 0x1c, 0x20, 0x52, 0x3d, 0x1e, 0x29, 0x72, 0xbd, 0xf4, 0x8e, 0x1c, 0x3d, 0xd9, + 0xda, 0xaf, 0xbc, 0x60, 0x11, 0x8e, 0xbb, 0x71, 0xc1, 0xbf, 0xbc, 0xec, 0x7f, + 0x3d, 0x3c, 0xe5, 0x10, 0x3d, 0xbd, 0x1a, 0xbf, 0x69, 0x3d, 0x3f, 0x56, 0x0b, + 0xbb, 0x19, 0x64, 0x9d, 0x3c, 0xe1, 0x00, 0x05, 0x3d, 0x4f, 0x77, 0x8e, 0x3d, + 0x0f, 0x4d, 0x35, 0x3d, 0xe5, 0x6d, 0x4d, 0xbd, 0x9d, 0xb6, 0x58, 0x3c, 0x64, + 0x44, 0x30, 0xba, 0x08, 0xe8, 0xaa, 0x3c, 0x73, 0xe7, 0x0b, 0x3d, 0x71, 0x00, + 0x8c, 0x3d, 0x1a, 0xd9, 0xeb, 0x3c, 0xde, 0x78, 0xf2, 0xbb, 0xe5, 0x50, 0xcb, + 0x3d, 0x03, 0x80, 0x7f, 0x3b, 0xb4, 0xf7, 0x1a, 0x3d, 0x32, 0xf5, 0xb0, 0x3d, + 0x1c, 0x38, 0xe5, 0x3c, 0xb1, 0x72, 0x05, 0x3d, 0xc3, 0x92, 0xcf, 0x3c, 0xdc, + 0x7b, 0x0c, 0xbe, 0x95, 0x0b, 0xfc, 0x3c, 0x5f, 0x34, 0x18, 0x3d, 0xc2, 0x08, + 0x19, 0xbd, 0x25, 0xd4, 0x7b, 0x3d, 0x1e, 0xca, 0x88, 0xbd, 0x57, 0x5f, 0x9a, + 0x3d, 0x57, 0x98, 0x80, 0x3d, 0x20, 0x7d, 0xdd, 0x3c, 0xdf, 0xb3, 0x65, 0x3d, + 0x88, 0xde, 0x8d, 0xbd, 0x45, 0x90, 0x9d, 0x3d, 0x8a, 0xf8, 0xfa, 0xbc, 0xdf, + 0xe2, 0xef, 0xb9, 0x21, 0x8d, 0x5a, 0xbc, 0x3e, 0x45, 0x17, 0x3c, 0x11, 0x8d, + 0x8d, 0xbd, 0xb9, 0xd3, 0x2b, 0xb9, 0xd1, 0x2b, 0x24, 0xbc, 0x7e, 0x0e, 0x00, + 0x3b, 0xfd, 0xc2, 0x2e, 0xbd, 0x80, 0x7d, 0x0d, 0x3d, 0x91, 0x8a, 0x49, 0x3d, + 0xba, 0x7e, 0x10, 0x3d, 0xc3, 0x56, 0x2a, 0x3d, 0x1a, 0x4d, 0x6e, 0x3d, 0x20, + 0x44, 0x90, 0x3c, 0x2f, 0xd8, 0x79, 0x3d, 0x7b, 0x5c, 0xab, 0x3d, 0x64, 0xa5, + 0xe1, 0x3c, 0x26, 0x94, 0x31, 0x3d, 0xcc, 0xaf, 0xec, 0xbd, 0xc0, 0x25, 0x4b, + 0xbd, 0xd1, 0x06, 0x87, 0x3d, 0x97, 0x3c, 0x44, 0xbd, 0x9c, 0x81, 0xc2, 0xbc, + 0x0a, 0xd3, 0x1a, 0xbd, 0x0d, 0xe3, 0x00, 0xbd, 0x08, 0x6e, 0x53, 0xbd, 0x67, + 0x84, 0x1a, 0x3d, 0xeb, 0xd0, 0x2f, 0x3d, 0x76, 0xea, 0x46, 0x3b, 0x3e, 0x6e, + 0xbe, 0xbc, 0xf3, 0x6a, 0x11, 0x3d, 0x13, 0xed, 0xb8, 0x3c, 0xc1, 0x4f, 0x9a, + 0x3d, 0xd6, 0x9a, 0x31, 0xbd, 0xcc, 0x51, 0x0e, 0x3d, 0x60, 0x8c, 0x89, 0x3d, + 0x66, 0xc1, 0x41, 0xbd, 0x75, 0x80, 0xa2, 0x3d, 0x40, 0xbb, 0x5c, 0x3b, 0x6f, + 0xb6, 0x90, 0x3d, 0xb7, 0x62, 0x02, 0x3c, 0x54, 0x75, 0x78, 0x3d, 0x3d, 0x29, + 0xaf, 0x3d, 0x53, 0x5f, 0x97, 0x3d, 0xaf, 0x83, 0x91, 0xbc, 0xc9, 0x29, 0x55, + 0x3d, 0xda, 0x00, 0x82, 0xbb, 0x8d, 0xcd, 0x2e, 0x3d, 0x9d, 0xcb, 0x88, 0xbd, + 0x4d, 0x93, 0x3d, 0xbd, 0x55, 0xb8, 0x66, 0xbd, 0x98, 0xf2, 0x4e, 0xbc, 0xf9, + 0xe0, 0x28, 0xbc, 0x6f, 0x30, 0x2d, 0x3d, 0xd8, 0xe6, 0x9e, 0x3d, 0x81, 0xcf, + 0x31, 0xbd, 0x31, 0x50, 0x45, 0xbd, 0x90, 0x9e, 0x2f, 0xbd, 0x4b, 0x9a, 0x9a, + 0x3d, 0x2f, 0x1a, 0xb3, 0xbc, 0x05, 0x59, 0x9b, 0xbc, 0xa6, 0x4f, 0x9b, 0xbc, + 0x24, 0x10, 0x9e, 0xbd, 0x91, 0x8e, 0xa5, 0x3c, 0x0c, 0x2a, 0x43, 0x3d, 0x85, + 
0x85, 0x87, 0xbd, 0x00, 0x61, 0x36, 0xbd, 0x10, 0xb9, 0x43, 0xbc, 0x58, 0x2c, + 0x24, 0x3b, 0xb7, 0x4f, 0x80, 0x3d, 0x46, 0x0f, 0x29, 0xbd, 0x76, 0x68, 0x44, + 0xbd, 0x57, 0xcf, 0x18, 0xbd, 0x24, 0x15, 0x94, 0x3d, 0x13, 0x57, 0x98, 0x3d, + 0x5e, 0xd6, 0x9c, 0x3d, 0xa0, 0x16, 0x9e, 0x3d, 0x66, 0x87, 0x83, 0xbd, 0x19, + 0x6d, 0x8b, 0x3d, 0x24, 0x60, 0x9a, 0xbc, 0x00, 0x60, 0xea, 0xbb, 0xba, 0x09, + 0x5f, 0xbd, 0xdc, 0xdd, 0xaa, 0x3b, 0x95, 0x08, 0xe9, 0xbc, 0x82, 0x0c, 0xc6, + 0x3c, 0x19, 0xb1, 0xda, 0xbc, 0x80, 0x2e, 0x4b, 0x3c, 0xed, 0xab, 0x29, 0x3d, + 0x17, 0x38, 0x51, 0x3d, 0x52, 0xa3, 0xef, 0x3c, 0xfd, 0x1c, 0x88, 0xbc, 0x40, + 0x9f, 0x3a, 0x3c, 0x87, 0x8a, 0xbe, 0xbc, 0xe5, 0xf4, 0x2a, 0xbd, 0x01, 0x1f, + 0x32, 0x3d, 0x2c, 0xbf, 0x3d, 0xbc, 0x33, 0xd3, 0xf9, 0xbb, 0xc4, 0x58, 0x2d, + 0xbd, 0x5d, 0xa3, 0x8f, 0x3d, 0x27, 0x5d, 0x90, 0xbc, 0xcf, 0x00, 0x82, 0x3d, + 0x0b, 0x65, 0xa7, 0x3d, 0x52, 0x11, 0xff, 0xbc, 0x37, 0xca, 0x18, 0xbd, 0xb9, + 0x2f, 0x9d, 0x3c, 0x36, 0x90, 0x68, 0x3d, 0x85, 0x61, 0x6b, 0x3d, 0x27, 0xb0, + 0x89, 0xbc, 0xcb, 0xb5, 0xac, 0xbb, 0xf4, 0x4b, 0x79, 0xbc, 0x34, 0x73, 0xe7, + 0xbc, 0x81, 0x9b, 0x86, 0x3c, 0x58, 0xc2, 0xce, 0x3c, 0x0a, 0x63, 0x2c, 0xbd, + 0xf6, 0xd3, 0xcf, 0xbd, 0xea, 0xf1, 0x01, 0xbd, 0x7a, 0x64, 0xe0, 0xbc, 0x12, + 0x3a, 0x28, 0x3d, 0x98, 0xe9, 0x98, 0x3d, 0x95, 0xf1, 0xa8, 0xbc, 0x88, 0xb4, + 0x2a, 0x3d, 0x81, 0xdf, 0xc4, 0xbc, 0x62, 0xb8, 0xfb, 0xbc, 0x46, 0xd2, 0x90, + 0xbd, 0x74, 0x0a, 0xc4, 0x3c, 0x8e, 0x57, 0x6f, 0x3d, 0xf9, 0xea, 0x78, 0x3d, + 0xdc, 0x6e, 0x62, 0xbd, 0x46, 0xe2, 0x16, 0xbd, 0xa6, 0x36, 0x37, 0xbd, 0xf5, + 0x36, 0x35, 0xbd, 0x9a, 0x4f, 0xb8, 0xbc, 0xf2, 0xab, 0x15, 0x3c, 0xee, 0x55, + 0xd7, 0x3b, 0xfa, 0xd0, 0x1c, 0xbd, 0xd4, 0x6b, 0x97, 0xbc, 0x91, 0x57, 0x51, + 0xbd, 0x7c, 0xc9, 0x64, 0x3d, 0xf8, 0x29, 0xcd, 0xbc, 0x75, 0x65, 0x67, 0x3d, + 0xaa, 0xd9, 0xa3, 0x3c, 0x55, 0xff, 0x8f, 0x3c, 0x7c, 0x18, 0x46, 0xbd, 0x92, + 0x18, 0x2c, 0x3d, 0x3a, 0x9f, 0x8a, 0xbc, 0xee, 0xd4, 0x05, 0x3d, 0x37, 0x03, + 0xaa, 0xbd, 0xe9, 0x50, 0x07, 0xbe, 0x1a, 0x94, 0x18, 0x3d, 0x79, 0x69, 0x03, + 0xbd, 0x7f, 0xc8, 0xd4, 0xbc, 0x25, 0xa7, 0x86, 0x3a, 0x17, 0xf1, 0x00, 0x3c, + 0xfd, 0x40, 0x10, 0x3d, 0x6e, 0x29, 0xf7, 0x3c, 0x05, 0xb0, 0x38, 0xbd, 0x7e, + 0x44, 0x5a, 0xbc, 0x0e, 0xdf, 0x66, 0x3d, 0x08, 0x9d, 0x10, 0xbc, 0xff, 0x12, + 0x8e, 0xbb, 0x01, 0x3f, 0x67, 0xbc, 0x6e, 0xa6, 0x4f, 0x3d, 0xca, 0x07, 0x63, + 0xbd, 0x97, 0x61, 0x4b, 0x3d, 0x71, 0x21, 0x34, 0x3d, 0x4f, 0xa2, 0x6d, 0x3d, + 0x8f, 0xf5, 0xe8, 0xbd, 0x72, 0x55, 0x4b, 0xbd, 0xee, 0xb2, 0xe9, 0xbc, 0xf2, + 0x49, 0xa7, 0x3d, 0x89, 0x22, 0xf5, 0x3c, 0xd8, 0x73, 0xcb, 0x3d, 0xbb, 0x15, + 0x81, 0x3d, 0x33, 0xf1, 0x5c, 0x3d, 0xa7, 0x30, 0x96, 0xbd, 0x4b, 0x2c, 0x58, + 0xbd, 0x34, 0x05, 0x00, 0x3d, 0xbd, 0x81, 0x92, 0x3d, 0x67, 0x5b, 0x5f, 0xbc, + 0xb4, 0x1e, 0xe6, 0xbd, 0x7c, 0x56, 0x00, 0x3c, 0x7c, 0x6d, 0xa8, 0x3c, 0x9b, + 0x21, 0xbd, 0xbb, 0x71, 0xf4, 0x48, 0xbd, 0xf8, 0xe1, 0x87, 0xbd, 0xd7, 0x4f, + 0xaf, 0xbc, 0x08, 0xef, 0xd9, 0x3c, 0x3e, 0x7b, 0x24, 0x3c, 0xa8, 0xcc, 0xe7, + 0x3c, 0xf0, 0xa0, 0x4a, 0xbd, 0x45, 0xbf, 0x39, 0xbd, 0x4e, 0xb6, 0xd6, 0x3c, + 0xfb, 0xfb, 0x49, 0x3d, 0xdd, 0x90, 0x4e, 0x3c, 0x0c, 0xb0, 0x83, 0x3d, 0x2d, + 0x83, 0x42, 0x3c, 0x1f, 0x45, 0xeb, 0xbb, 0xd3, 0x7e, 0xf2, 0x3b, 0x4d, 0x22, + 0xa6, 0xbd, 0x40, 0x45, 0x5c, 0xbb, 0x8c, 0xa5, 0x1c, 0xbd, 0x57, 0xd9, 0x86, + 0x3d, 0x45, 0xfc, 0x4e, 0x3d, 0xc5, 0x64, 0x24, 0x3d, 0xc9, 0xf4, 0x27, 0x3c, + 0xc7, 0x86, 0x08, 0x3d, 0x9c, 0x3c, 0x13, 0x3b, 0xab, 0x69, 0x12, 0x3d, 0x0d, + 0xfa, 0x80, 0x3d, 0x6b, 0x86, 
0x15, 0xbd, 0x93, 0x11, 0x1e, 0xbd, 0x70, 0x3b, + 0x02, 0x3b, 0x50, 0x75, 0x06, 0xbd, 0x61, 0xe8, 0x7b, 0xbc, 0x5a, 0x15, 0xa7, + 0x3d, 0x47, 0x26, 0x0b, 0x3c, 0xb8, 0x03, 0x98, 0x3c, 0xce, 0xcc, 0x8e, 0x3d, + 0x12, 0x6c, 0xba, 0xbc, 0xca, 0x74, 0x5f, 0xbd, 0x84, 0x45, 0xd6, 0x3d, 0x2a, + 0xc6, 0xb3, 0xbc, 0x75, 0x88, 0x53, 0x3d, 0x44, 0xc0, 0x37, 0x3c, 0x69, 0x7c, + 0x59, 0x3d, 0xc1, 0xa5, 0xe5, 0xbc, 0x61, 0xc0, 0x9f, 0x3c, 0xbc, 0x7d, 0x7e, + 0xbc, 0x9c, 0x18, 0x79, 0xbd, 0x09, 0x70, 0x16, 0x3d, 0xdd, 0x36, 0x0b, 0x3d, + 0xcc, 0xba, 0xc8, 0x3c, 0xe6, 0xae, 0x18, 0xbc, 0xd6, 0x1a, 0x20, 0xbd, 0x43, + 0x22, 0x24, 0xbc, 0xcc, 0x3e, 0xd4, 0x3c, 0xe2, 0x43, 0x1a, 0xbb, 0x02, 0x94, + 0xd5, 0x3c, 0x24, 0x73, 0x3d, 0x3d, 0x4d, 0x1c, 0xce, 0x3c, 0x94, 0xea, 0x4a, + 0x3d, 0x33, 0x7a, 0x09, 0x3d, 0xf4, 0xcc, 0x66, 0xbd, 0x13, 0xb9, 0x9e, 0xbd, + 0x98, 0xbe, 0xb4, 0xbc, 0x19, 0x14, 0x21, 0x3d, 0x97, 0xca, 0x50, 0x3d, 0x8f, + 0x3f, 0x2f, 0xbc, 0x69, 0x98, 0x25, 0x3d, 0x55, 0x13, 0x80, 0xbc, 0xef, 0x2e, + 0x82, 0x3d, 0x24, 0xea, 0x71, 0xbd, 0x84, 0x97, 0x32, 0xbd, 0xb0, 0xaa, 0xaf, + 0x3c, 0xfa, 0x13, 0x9b, 0x3d, 0x56, 0xa5, 0x2b, 0x3d, 0x03, 0x06, 0x2d, 0xbc, + 0x6c, 0x24, 0x39, 0xbd, 0x46, 0x80, 0x29, 0x3d, 0x64, 0xdb, 0x61, 0xbb, 0x85, + 0x2a, 0x22, 0xbd, 0x9f, 0x47, 0xc1, 0x3d, 0x71, 0xc5, 0x85, 0xbd, 0x00, 0x31, + 0x9c, 0xb9, 0xc4, 0xd0, 0x2e, 0xbd, 0x08, 0x5d, 0x36, 0x3d, 0x41, 0x70, 0x3f, + 0xbd, 0x01, 0xc0, 0x87, 0x3c, 0x05, 0xf1, 0x37, 0xbc, 0xaf, 0x5d, 0xd4, 0xbb, + 0x10, 0xa9, 0x1c, 0x3d, 0xb8, 0xa9, 0x62, 0xba, 0xae, 0x29, 0x71, 0x3d, 0x51, + 0x57, 0x73, 0xbc, 0x05, 0x0a, 0xb8, 0xbd, 0xe3, 0x38, 0xa1, 0xbd, 0x3d, 0x08, + 0x13, 0x3d, 0x54, 0x69, 0x80, 0xbd, 0xe9, 0x65, 0x60, 0xbd, 0x2e, 0x02, 0x88, + 0x3d, 0x00, 0xdf, 0x58, 0xbb, 0xde, 0x06, 0x35, 0xbd, 0x1e, 0x3f, 0x0a, 0xbd, + 0x35, 0xe2, 0x15, 0xbd, 0xa6, 0xe3, 0x99, 0x3d, 0x42, 0x8e, 0x2e, 0xbd, 0x9b, + 0x10, 0x97, 0xbd, 0xd9, 0x36, 0xca, 0x3b, 0x27, 0x9f, 0x5c, 0xbd, 0xb8, 0x0c, + 0x25, 0xbd, 0x61, 0xe3, 0x8e, 0x3d, 0x8b, 0x23, 0xa5, 0xbc, 0xf4, 0xda, 0x47, + 0xbd, 0x30, 0x95, 0xac, 0x3c, 0xe1, 0xb0, 0xab, 0xbd, 0xb0, 0x5a, 0x15, 0x3d, + 0x58, 0x7e, 0x35, 0x3d, 0x13, 0xeb, 0x48, 0xbc, 0x00, 0xe6, 0x80, 0x3c, 0x39, + 0x59, 0x21, 0xbb, 0xca, 0xf7, 0xbe, 0x3d, 0x2a, 0xb9, 0x37, 0x3d, 0x26, 0x13, + 0x80, 0x3d, 0x9e, 0xbd, 0xc7, 0x3c, 0xb6, 0xd6, 0x50, 0xbd, 0xa6, 0x52, 0x82, + 0x3d, 0x39, 0xa3, 0x81, 0xb9, 0xe3, 0xb2, 0xf8, 0xbd, 0xc5, 0x84, 0x54, 0xbd, + 0xba, 0xea, 0x27, 0x3d, 0x1e, 0xce, 0xcf, 0x3c, 0x0d, 0xd3, 0x6f, 0x3c, 0xa7, + 0xce, 0x87, 0xbc, 0x67, 0xe3, 0x5e, 0xbd, 0xf6, 0xdc, 0x3b, 0x3d, 0xca, 0x8f, + 0x23, 0xbd, 0x69, 0x20, 0x9e, 0x3b, 0x32, 0x59, 0x2e, 0x3d, 0x12, 0x32, 0x09, + 0xbd, 0xa1, 0xc3, 0x2a, 0x3c, 0x68, 0x2a, 0x6b, 0xbc, 0xf7, 0xbf, 0x92, 0xbc, + 0x97, 0x8c, 0x97, 0x3d, 0x8e, 0xc6, 0x74, 0x3c, 0x04, 0x01, 0x47, 0x3c, 0x6b, + 0x51, 0xf0, 0x3d, 0x0e, 0xf6, 0x3b, 0x3b, 0xee, 0xeb, 0x5d, 0x3d, 0x98, 0x69, + 0x9b, 0x3c, 0xb5, 0x47, 0xfc, 0xbc, 0x5e, 0x56, 0x40, 0xbc, 0x15, 0x4e, 0xad, + 0xbb, 0x84, 0xcf, 0x96, 0x3c, 0xe3, 0x32, 0xbe, 0xbc, 0x36, 0xcd, 0xc8, 0x3d, + 0x70, 0xb8, 0x97, 0x3d, 0xd9, 0xc3, 0x28, 0xbd, 0x6c, 0xec, 0x7b, 0x3d, 0xbf, + 0x32, 0xc6, 0xbd, 0x98, 0x0d, 0x0f, 0xbe, 0x32, 0xaa, 0x95, 0x3d, 0x6e, 0x2c, + 0xfd, 0xbc, 0x10, 0x45, 0xc1, 0xbb, 0x4d, 0x8b, 0x03, 0x3d, 0xe4, 0x05, 0xde, + 0xbc, 0x0d, 0x7c, 0xbe, 0x3c, 0x07, 0x24, 0x77, 0x3d, 0x98, 0xb0, 0x2a, 0x3c, + 0x21, 0xc9, 0xa3, 0x3c, 0x1a, 0x6d, 0x69, 0x3d, 0x33, 0xf6, 0xeb, 0xbc, 0x40, + 0x77, 0x90, 0x3d, 0x6c, 0xf5, 0x99, 0x3c, 0x42, 0x69, 0x08, 
0x3d, 0x9b, 0x3f, + 0xde, 0xbc, 0xe0, 0x71, 0x04, 0xbd, 0x6a, 0xcd, 0xfe, 0xbb, 0x77, 0xd6, 0xb3, + 0x3d, 0xf9, 0xb4, 0xcc, 0x3b, 0x6a, 0x1c, 0x70, 0x3d, 0x10, 0x34, 0x15, 0xbc, + 0x82, 0x15, 0x3a, 0x3d, 0xa8, 0xa6, 0x02, 0x3d, 0x06, 0x03, 0xaa, 0x3d, 0x15, + 0x2c, 0xe6, 0xbc, 0xac, 0xf0, 0xdc, 0x3c, 0xa7, 0x3b, 0xef, 0xbc, 0x7a, 0xa7, + 0x93, 0x3d, 0xaf, 0x46, 0x87, 0x3c, 0xf9, 0x13, 0x76, 0xbb, 0x30, 0x99, 0x15, + 0xbd, 0x36, 0xd1, 0x8f, 0xbc, 0xc9, 0x26, 0xaf, 0x3d, 0xc0, 0xa3, 0x5b, 0x3c, + 0x69, 0x65, 0x84, 0xbd, 0x1e, 0x30, 0x81, 0x3d, 0xb4, 0xbc, 0x22, 0x3d, 0x16, + 0x60, 0x52, 0x3d, 0x5e, 0xfe, 0x6a, 0xbc, 0x16, 0x65, 0x34, 0xbd, 0xfe, 0xab, + 0xf0, 0x3c, 0xe1, 0xfd, 0x90, 0x3d, 0xd4, 0x61, 0x6a, 0xbd, 0x55, 0xd1, 0x85, + 0xbd, 0x87, 0x6f, 0x66, 0xbd, 0x29, 0x4a, 0x8d, 0x3a, 0xec, 0x8f, 0x91, 0x3d, + 0x07, 0x75, 0x5a, 0x3b, 0x95, 0x09, 0x27, 0x3b, 0x25, 0x10, 0xd3, 0x3d, 0xde, + 0xfe, 0x0b, 0xbd, 0xe8, 0xd4, 0xc4, 0x3c, 0x4e, 0xda, 0x7d, 0x3c, 0x54, 0xb5, + 0xe8, 0xba, 0x69, 0x46, 0x40, 0x3d, 0xd1, 0xd6, 0x48, 0x3c, 0xfa, 0xb9, 0x87, + 0x39, 0x5a, 0x17, 0x20, 0xbc, 0xd5, 0x9b, 0x66, 0x3d, 0x19, 0x23, 0xac, 0x3c, + 0x56, 0x76, 0x5a, 0xbd, 0x7e, 0x50, 0x3c, 0xbc, 0x02, 0x8b, 0x17, 0xbd, 0x42, + 0x85, 0xc6, 0xbd, 0x06, 0x12, 0x9f, 0x3d, 0xad, 0x96, 0xc7, 0xbb, 0xd9, 0xfc, + 0xff, 0xbb, 0xb9, 0x86, 0x71, 0x3c, 0xc7, 0xf6, 0x3f, 0xbd, 0xc2, 0x39, 0xf7, + 0x3a, 0x25, 0xcb, 0xf0, 0x3c, 0xfe, 0x25, 0xb0, 0xbb, 0xd3, 0x39, 0x02, 0x3d, + 0xf8, 0xa3, 0x08, 0xbd, 0xba, 0xf2, 0x4e, 0xbd, 0x53, 0x83, 0x46, 0xbd, 0xae, + 0x06, 0x06, 0x3d, 0x69, 0xf3, 0x8f, 0x3d, 0xd3, 0x57, 0x35, 0x3c, 0x05, 0x92, + 0xb9, 0x3c, 0x60, 0x8e, 0x5b, 0x3b, 0xab, 0x7a, 0x8d, 0xbc, 0xf6, 0xdf, 0x87, + 0xbd, 0x0d, 0xc5, 0x81, 0x3d, 0xec, 0x93, 0x5f, 0x3d, 0xf6, 0x54, 0x85, 0x3d, + 0x86, 0xb3, 0x16, 0xbc, 0x7d, 0x95, 0x97, 0x3d, 0xff, 0xd8, 0x0c, 0x3d, 0x21, + 0x38, 0x6e, 0xbd, 0x68, 0xfc, 0x83, 0x3d, 0x5c, 0x54, 0x1b, 0xbc, 0x26, 0x1d, + 0x03, 0x3d, 0xd8, 0xaa, 0x90, 0xbd, 0xa9, 0x58, 0x0b, 0x3b, 0x02, 0x4e, 0x40, + 0xbd, 0xdc, 0x76, 0xe0, 0xbb, 0x14, 0x2e, 0x24, 0x3d, 0xbb, 0x6b, 0xfe, 0x3b, + 0xfd, 0xb5, 0x99, 0xbd, 0x4b, 0x2b, 0x0e, 0xbd, 0x2f, 0xc8, 0x69, 0xbd, 0xff, + 0xf0, 0x04, 0x3d, 0x46, 0x9c, 0x13, 0x3c, 0x74, 0x89, 0x2e, 0x3d, 0xbe, 0x6e, + 0x52, 0xbd, 0x59, 0x23, 0x34, 0x3d, 0x72, 0x3a, 0x3e, 0xbd, 0xf8, 0x03, 0x7a, + 0x3d, 0x8e, 0xab, 0x74, 0x3c, 0x6e, 0x5e, 0x82, 0x3d, 0x16, 0x5b, 0x25, 0x3c, + 0x56, 0x2c, 0xe7, 0xbd, 0x19, 0x4d, 0xc0, 0x3d, 0x8a, 0xb3, 0xdb, 0xbd, 0x34, + 0xe5, 0x67, 0xbc, 0x0f, 0x5d, 0x35, 0x3d, 0xad, 0xad, 0x94, 0x3d, 0xa5, 0xc3, + 0xba, 0xba, 0xb4, 0x7f, 0x02, 0x3e, 0xde, 0xcd, 0x8d, 0x3d, 0xc3, 0xa4, 0xa4, + 0xbd, 0x7e, 0x1b, 0x37, 0x3d, 0xde, 0xb4, 0x91, 0xbd, 0x78, 0xf2, 0x62, 0xbd, + 0x25, 0x4f, 0x60, 0xbd, 0x4e, 0xd2, 0x25, 0xbd, 0xd3, 0xc3, 0xe8, 0xbb, 0x7f, + 0x00, 0x68, 0x3d, 0x7a, 0x9c, 0x1e, 0xbd, 0x17, 0x70, 0x81, 0x3c, 0xda, 0xb3, + 0x68, 0x3d, 0xab, 0xf3, 0xb4, 0xbc, 0x46, 0x70, 0x16, 0xbd, 0x22, 0xe5, 0x82, + 0x3d, 0x75, 0x02, 0x5a, 0x3d, 0xb5, 0xce, 0x86, 0xbd, 0x20, 0x29, 0xa8, 0xbb, + 0xe5, 0x29, 0x95, 0xbd, 0x63, 0x0c, 0x5f, 0xbd, 0x42, 0x39, 0x99, 0xbc, 0x27, + 0xd6, 0x82, 0xbb, 0x33, 0x1c, 0xda, 0xbc, 0x93, 0x96, 0x76, 0x3d, 0xd3, 0x8c, + 0xd3, 0xbd, 0x75, 0x39, 0xe1, 0x3d, 0x42, 0x5b, 0x98, 0xbd, 0x5a, 0xc4, 0x4f, + 0x3d, 0x3b, 0xb0, 0x14, 0xbd, 0xfc, 0x99, 0x4b, 0xbc, 0xd4, 0x88, 0x13, 0xbb, + 0x6c, 0xca, 0xc4, 0x3d, 0xd4, 0xdc, 0xb1, 0x3d, 0x62, 0x2a, 0x8d, 0x3c, 0xd8, + 0x1b, 0xb7, 0x3c, 0x0b, 0x8d, 0xba, 0xbb, 0x78, 0x25, 0x5c, 0xbd, 0xb9, 0xc6, + 0xbb, 0xba, 
0x26, 0x58, 0xc5, 0xbd, 0x5d, 0x48, 0xb7, 0xbd, 0x71, 0x0d, 0x0e, + 0x3d, 0xa8, 0xa7, 0x54, 0xbd, 0x88, 0xfe, 0x84, 0xbc, 0x0b, 0x64, 0x1b, 0xbc, + 0xba, 0xaa, 0x8e, 0x3c, 0x89, 0x54, 0xa5, 0xbc, 0xde, 0x32, 0x9c, 0x3c, 0x90, + 0x13, 0x66, 0xbd, 0xb2, 0x5e, 0x11, 0xbd, 0xd0, 0x5e, 0xfb, 0xbb, 0x2e, 0x6c, + 0x8c, 0xbd, 0x09, 0x4b, 0x2f, 0xbc, 0xa8, 0x5d, 0x27, 0xbd, 0xad, 0xd8, 0x2e, + 0x3d, 0x78, 0x5e, 0xf0, 0x3c, 0x8e, 0xc0, 0x12, 0x3d, 0x49, 0xb5, 0xca, 0xbd, + 0x1b, 0x2e, 0xb0, 0x3d, 0xeb, 0x3c, 0x8b, 0xbd, 0xe2, 0x4b, 0xd6, 0xbc, 0x14, + 0xdf, 0xc3, 0x3c, 0x42, 0x9c, 0x87, 0x3c, 0xb7, 0x90, 0x18, 0x3d, 0xcb, 0x8a, + 0xd8, 0x3d, 0xc1, 0x0c, 0x97, 0x3d, 0x35, 0xe8, 0xd3, 0x3c, 0xb1, 0x05, 0x28, + 0x3d, 0x03, 0xd2, 0xbc, 0x3d, 0x56, 0xce, 0x44, 0x3d, 0x9f, 0xbf, 0x24, 0x3d, + 0x21, 0x81, 0x81, 0xbd, 0xc0, 0xa2, 0xda, 0xbd, 0x50, 0x42, 0x27, 0x3d, 0x5f, + 0xb2, 0xb9, 0x3c, 0x04, 0x67, 0x6c, 0x3d, 0xce, 0x89, 0x2c, 0xbd, 0x08, 0x2d, + 0x4b, 0x3c, 0x88, 0x86, 0xf7, 0x3c, 0xcd, 0x8e, 0x94, 0x3d, 0x5a, 0x47, 0x6f, + 0x3d, 0x67, 0xf4, 0xa2, 0xbd, 0xe3, 0x50, 0x91, 0xbd, 0xde, 0x9e, 0x84, 0x3d, + 0xb3, 0x05, 0xbf, 0x3c, 0x10, 0x17, 0x34, 0x3d, 0xf4, 0x1f, 0x0e, 0xbd, 0x47, + 0xb9, 0x49, 0x3d, 0xb1, 0x61, 0x10, 0x3d, 0x2a, 0x64, 0x90, 0xbd, 0x1e, 0xc9, + 0xb8, 0x3c, 0x7d, 0x23, 0xb8, 0xbd, 0x19, 0x60, 0x85, 0x3d, 0x44, 0xb5, 0x4d, + 0xbd, 0x05, 0x79, 0xec, 0x3b, 0xea, 0x1e, 0x21, 0xbd, 0xeb, 0x34, 0x59, 0x3d, + 0x50, 0xa9, 0x00, 0x3d, 0x72, 0xf1, 0x4c, 0xb9, 0x98, 0x35, 0xc1, 0x3d, 0xbb, + 0x18, 0x36, 0x3d, 0x19, 0x70, 0x62, 0xbd, 0xc5, 0xae, 0x75, 0x3d, 0x27, 0x77, + 0xec, 0xbc, 0xab, 0x6d, 0xe1, 0xbd, 0x75, 0x4a, 0xae, 0x3c, 0x2d, 0xea, 0x18, + 0xbb, 0xdc, 0x0e, 0x7b, 0x3d, 0xb2, 0x28, 0x24, 0xbd, 0x69, 0xd2, 0x78, 0xbd, + 0xed, 0x29, 0x5f, 0xbc, 0xd9, 0x6e, 0x44, 0x3d, 0x3c, 0x6c, 0x87, 0xbd, 0xa5, + 0xdf, 0x96, 0xbc, 0x1c, 0x4c, 0x35, 0x3d, 0x54, 0x97, 0x57, 0xbd, 0xe9, 0x88, + 0x40, 0xbd, 0x6d, 0x9d, 0x71, 0x3c, 0x3f, 0x74, 0xaf, 0xbb, 0x41, 0xfa, 0x4b, + 0x3d, 0x20, 0xe8, 0x7a, 0xbc, 0xe4, 0x37, 0xbe, 0xbd, 0xfa, 0xa2, 0x44, 0xbc, + 0x2a, 0x3c, 0x61, 0xbd, 0xec, 0x0f, 0x0c, 0x3d, 0xd7, 0xef, 0x82, 0xbd, 0x0b, + 0xe4, 0xd2, 0xbc, 0xd2, 0x57, 0x04, 0x3c, 0xa8, 0x6e, 0xce, 0x3d, 0x3c, 0xd8, + 0xa4, 0x3b, 0x1d, 0x19, 0x45, 0xbd, 0xd6, 0x4d, 0x70, 0x3c, 0xed, 0x12, 0xf0, + 0xbc, 0x1f, 0xc6, 0x4c, 0x3c, 0xeb, 0x27, 0x8e, 0xbc, 0x6a, 0xf8, 0x4f, 0x3d, + 0xcf, 0x2c, 0xe3, 0xbd, 0x3b, 0xc9, 0x05, 0xbb, 0xe0, 0xfa, 0xfd, 0x3c, 0xfe, + 0xb8, 0xfb, 0xbc, 0x84, 0xd9, 0x8b, 0x3d, 0xad, 0x88, 0x00, 0x3d, 0x21, 0xfa, + 0x47, 0x3d, 0xf6, 0x17, 0x0d, 0xbd, 0xc5, 0x0c, 0xf1, 0x3c, 0xec, 0x3c, 0x13, + 0xbd, 0x1a, 0x06, 0x4b, 0xbd, 0x76, 0x04, 0xa4, 0xbc, 0x89, 0x87, 0x92, 0x3d, + 0xd2, 0xc6, 0xaf, 0x3d, 0xb1, 0xb1, 0x12, 0x3d, 0x99, 0xa4, 0x23, 0x3d, 0x25, + 0x73, 0x75, 0x3b, 0x18, 0x34, 0xa1, 0xbd, 0xc0, 0x90, 0xa5, 0x3d, 0xaa, 0xa8, + 0x14, 0xbd, 0x6c, 0xbc, 0xf3, 0x3c, 0x8a, 0x47, 0x51, 0xbc, 0xab, 0xfc, 0x2a, + 0x3d, 0xc8, 0xb7, 0x68, 0x3d, 0xff, 0xbf, 0x72, 0x3d, 0x38, 0x39, 0x95, 0x3d, + 0xdc, 0x49, 0x94, 0xbc, 0xbd, 0xce, 0x90, 0x3c, 0xcd, 0x13, 0x35, 0x3d, 0xd4, + 0xd9, 0x51, 0xbd, 0x16, 0xde, 0xfb, 0xbc, 0xc7, 0x00, 0xb9, 0xbd, 0x38, 0x8e, + 0x2e, 0xbc, 0xcb, 0xce, 0x5e, 0x3d, 0x44, 0x22, 0x7a, 0x3c, 0x70, 0x0a, 0x93, + 0x3d, 0x9c, 0x88, 0x81, 0x3a, 0x02, 0x89, 0x01, 0xbd, 0x52, 0x9b, 0x50, 0xbc, + 0xc7, 0x6f, 0x46, 0x3c, 0x41, 0xb4, 0x57, 0x3d, 0x79, 0x89, 0xd2, 0x3b, 0x20, + 0xab, 0x75, 0x3b, 0x40, 0xf2, 0xea, 0x3c, 0x8f, 0x29, 0x8c, 0x3d, 0xb0, 0x20, + 0x45, 0xbd, 0xf4, 0x67, 0x8c, 0x3d, 0xbf, 
0x3f, 0x9d, 0x3c, 0xa7, 0x71, 0x01, + 0xbd, 0x37, 0x6b, 0x02, 0xbc, 0x68, 0xc4, 0x2a, 0x3d, 0x43, 0x60, 0x9b, 0xbc, + 0x72, 0xb9, 0x73, 0xbd, 0x90, 0xc4, 0x13, 0x3c, 0xba, 0xbf, 0x50, 0xbb, 0x86, + 0x75, 0x78, 0xbd, 0x2e, 0xaf, 0x69, 0xbc, 0xdb, 0x89, 0xbc, 0x3d, 0x05, 0x7f, + 0xa8, 0xbd, 0x42, 0x5f, 0x02, 0x3d, 0xe1, 0x3c, 0x12, 0xbd, 0xfd, 0xdf, 0x41, + 0x3d, 0x2e, 0xda, 0xe3, 0xbb, 0x80, 0x3c, 0x5f, 0xbd, 0x26, 0x2b, 0x1f, 0xbd, + 0xa8, 0xed, 0xd5, 0x3c, 0xa6, 0x84, 0xf1, 0x3c, 0xbe, 0xd2, 0x9a, 0xbb, 0x5b, + 0x04, 0x61, 0x3d, 0x2b, 0xe5, 0x06, 0xbd, 0xc9, 0xb8, 0x85, 0x3c, 0x64, 0x7a, + 0xc7, 0x3d, 0x4c, 0x12, 0xc9, 0x3c, 0x69, 0x12, 0x63, 0xbd, 0x88, 0x73, 0xbf, + 0x3c, 0xfc, 0x66, 0x50, 0xbb, 0x64, 0x31, 0x9a, 0xbd, 0xeb, 0x81, 0x8d, 0x3d, + 0x7e, 0x4e, 0xc5, 0x3c, 0x15, 0x80, 0x96, 0x3d, 0xb9, 0x1f, 0x65, 0xbd, 0xe3, + 0x99, 0xda, 0xbd, 0x94, 0x02, 0x4a, 0x3c, 0xbf, 0x7b, 0x26, 0x3d, 0x20, 0xae, + 0x9d, 0xbb, 0x84, 0x49, 0x1e, 0x3d, 0x88, 0x11, 0x17, 0x3d, 0x45, 0x77, 0x73, + 0x3c, 0x76, 0x33, 0xaa, 0x3c, 0x28, 0x4d, 0x4b, 0x3d, 0x49, 0x89, 0x37, 0x3c, + 0x3f, 0xe6, 0x92, 0xbd, 0xc8, 0x39, 0xa0, 0x3c, 0xd6, 0xff, 0x0a, 0x3b, 0xb4, + 0xef, 0xad, 0xbd, 0xdb, 0x17, 0x19, 0x3c, 0x9a, 0x54, 0x7c, 0xbd, 0xe7, 0x50, + 0xcc, 0x3c, 0x91, 0xeb, 0x75, 0xbd, 0x9a, 0x45, 0xac, 0x3d, 0xd3, 0x80, 0x4d, + 0xbd, 0x17, 0x6c, 0x19, 0x3c, 0x47, 0xb1, 0x1f, 0xbd, 0xef, 0x17, 0x1d, 0xbd, + 0xa2, 0xc8, 0x58, 0xbc, 0xf9, 0xc6, 0x81, 0xbb, 0x70, 0xfc, 0xa1, 0x3b, 0x70, + 0x74, 0x38, 0x3d, 0xb9, 0x93, 0x6c, 0x3d, 0xb5, 0x22, 0x89, 0x3d, 0xa8, 0x15, + 0xed, 0xbb, 0xee, 0x0c, 0xac, 0xbc, 0xbf, 0xca, 0xbe, 0xbc, 0x8e, 0x0d, 0xbf, + 0xbd, 0xfb, 0x0c, 0x92, 0x3c, 0x3d, 0x1e, 0x61, 0xbd, 0xe1, 0xb2, 0x08, 0xbd, + 0xcd, 0xab, 0x75, 0xbb, 0xc5, 0x1a, 0x2f, 0x3d, 0x4f, 0x02, 0x92, 0x3c, 0x8f, + 0x47, 0x20, 0x3d, 0x33, 0xac, 0xc3, 0x3d, 0xc9, 0xdc, 0xbd, 0xbc, 0x68, 0x6e, + 0xb4, 0x3b, 0x32, 0x32, 0xdc, 0x3d, 0xd8, 0xff, 0x92, 0x3d, 0xb3, 0xa4, 0x6f, + 0xbd, 0xf0, 0xbe, 0x13, 0xbd, 0xff, 0xf5, 0xdf, 0xbd, 0x67, 0xeb, 0x94, 0x3c, + 0xb2, 0xe8, 0x57, 0xbb, 0x92, 0x3f, 0xdc, 0xbb, 0xe3, 0x5f, 0x6b, 0x3c, 0x02, + 0xcc, 0x6c, 0xbd, 0x25, 0xa1, 0x57, 0xbd, 0x22, 0x01, 0x82, 0x3d, 0xc3, 0xcf, + 0xb2, 0x3c, 0xed, 0x35, 0x56, 0xbb, 0xe3, 0xf0, 0x8c, 0x3d, 0xdb, 0xf1, 0xb1, + 0xbc, 0xaa, 0xe4, 0xc2, 0x3b, 0x53, 0x9c, 0xf6, 0xbc, 0x15, 0x86, 0x92, 0x3d, + 0xe4, 0xf9, 0x39, 0x3d, 0x09, 0xa5, 0xa8, 0xbc, 0x6e, 0x89, 0xd1, 0xbc, 0x47, + 0xd4, 0x7b, 0x3c, 0x7b, 0xff, 0xab, 0x3c, 0x15, 0x58, 0x8d, 0xbd, 0x7b, 0x21, + 0xac, 0x3c, 0xda, 0xe5, 0xad, 0xbc, 0x8b, 0xfc, 0xd8, 0xbc, 0x8c, 0xe1, 0x0e, + 0xbc, 0x36, 0x43, 0xc6, 0x3d, 0xfa, 0x15, 0x8b, 0xbc, 0xb8, 0xd0, 0x07, 0x3d, + 0xd9, 0x12, 0x9c, 0x3c, 0x81, 0x20, 0x4f, 0xbd, 0xd8, 0x7f, 0x18, 0x3b, 0x38, + 0xd4, 0x33, 0xbc, 0x00, 0x0f, 0xe2, 0xbd, 0x25, 0xa8, 0xf2, 0x3c, 0x87, 0xa6, + 0x96, 0xbd, 0x84, 0xc3, 0xa8, 0x3c, 0xf4, 0x7a, 0x8b, 0x3c, 0xfd, 0xbd, 0x55, + 0xbc, 0x45, 0x00, 0x97, 0xbd, 0x81, 0x3a, 0xbd, 0x3b, 0x21, 0x43, 0x30, 0xbd, + 0x94, 0x58, 0xa5, 0x3b, 0x30, 0x2f, 0x12, 0xbd, 0xcb, 0xd3, 0x32, 0x3d, 0x36, + 0xd2, 0x7c, 0xbd, 0xf2, 0x77, 0x49, 0x3d, 0x87, 0xdd, 0x87, 0xbc, 0x3d, 0x1a, + 0x02, 0x3d, 0x5a, 0x1b, 0xc1, 0x3c, 0x04, 0xaf, 0x33, 0xbd, 0x84, 0x02, 0x1d, + 0x3d, 0x47, 0x7d, 0x21, 0xbd, 0x46, 0xc4, 0x24, 0x3d, 0x8f, 0x16, 0x27, 0x3d, + 0xce, 0x48, 0x22, 0x3d, 0xd9, 0x6b, 0xa3, 0x3c, 0x31, 0x91, 0xbb, 0x3c, 0xef, + 0x24, 0x88, 0xbb, 0x1e, 0x6e, 0x41, 0xbd, 0x81, 0xea, 0x80, 0x3d, 0xa6, 0xa7, + 0xf2, 0x3d, 0x74, 0xcf, 0xd7, 0x3c, 0x4c, 0x85, 0xf6, 0xbc, 0x57, 0xac, 
0x0f, + 0x3c, 0x1c, 0x44, 0x53, 0xbd, 0x44, 0x55, 0x35, 0x3d, 0x14, 0x45, 0x11, 0x3d, + 0x0d, 0xfa, 0xff, 0xbc, 0xe0, 0xef, 0x32, 0x3d, 0x6c, 0x60, 0xac, 0x3b, 0xd2, + 0xe0, 0xab, 0xbb, 0x77, 0x02, 0x3f, 0xbd, 0xcd, 0x77, 0x44, 0x3d, 0x4f, 0x8c, + 0x3e, 0xbd, 0x74, 0xd6, 0x5a, 0xbd, 0x33, 0xb6, 0xf2, 0xbc, 0x94, 0xe4, 0x0e, + 0x3b, 0x6c, 0x9b, 0xa9, 0x3a, 0x61, 0xd7, 0xea, 0xbc, 0xf6, 0x70, 0xe9, 0x3c, + 0x06, 0x81, 0xeb, 0xbc, 0x51, 0x88, 0x47, 0xbb, 0x6c, 0xfb, 0x6d, 0x3d, 0x0a, + 0x9d, 0x29, 0xbb, 0xa0, 0x45, 0x36, 0x3c, 0xe5, 0xd9, 0xb8, 0x3c, 0x09, 0xf4, + 0x09, 0xbd, 0x2a, 0x13, 0x54, 0xbc, 0xad, 0xb0, 0xa3, 0x3d, 0x5a, 0x07, 0xff, + 0x3c, 0x18, 0x10, 0xc9, 0x3c, 0x15, 0xf6, 0x07, 0xbd, 0x05, 0x70, 0x60, 0x3d, + 0xb5, 0xbd, 0x50, 0x3d, 0xeb, 0xe1, 0x11, 0x3d, 0xdf, 0x70, 0x40, 0xbd, 0x51, + 0x6f, 0x67, 0xbd, 0x61, 0xbf, 0xd0, 0x3c, 0x39, 0x5e, 0x14, 0xbd, 0xae, 0x58, + 0xa1, 0x3d, 0xa2, 0x03, 0x88, 0x3d, 0x85, 0x40, 0x89, 0xbd, 0x3e, 0x4f, 0x21, + 0x3c, 0x8b, 0x40, 0xcf, 0x3c, 0xa8, 0x0d, 0x76, 0x3d, 0x2f, 0x57, 0xf4, 0x3b, + 0x78, 0x71, 0x8f, 0x3c, 0x15, 0x80, 0x72, 0x3d, 0x35, 0xc6, 0xe6, 0xbc, 0x1e, + 0xdb, 0x8d, 0x3d, 0xc1, 0x52, 0x58, 0x3d, 0x1e, 0x0c, 0x37, 0x3d, 0x68, 0xdd, + 0x25, 0x3d, 0x1a, 0x65, 0x59, 0xbc, 0x22, 0xe3, 0x8b, 0x3d, 0x29, 0xb2, 0x44, + 0xbd, 0x56, 0x71, 0x34, 0xbd, 0x1c, 0x3f, 0x7c, 0xbb, 0x88, 0x17, 0x72, 0xbc, + 0xbb, 0xb5, 0xae, 0x3c, 0xdd, 0x7b, 0xd5, 0x3c, 0xd3, 0x2f, 0x93, 0x3d, 0x07, + 0x46, 0x38, 0x3d, 0x55, 0x2b, 0x47, 0x3d, 0xd2, 0x5c, 0xda, 0x3d, 0xa4, 0x8e, + 0x80, 0x3d, 0xe6, 0xdb, 0xc9, 0x3c, 0xf3, 0x2d, 0x3f, 0xbd, 0x66, 0x10, 0xd1, + 0xbd, 0xde, 0xa5, 0xda, 0x3c, 0xab, 0x8c, 0xe4, 0x3c, 0x85, 0x1c, 0xc0, 0x3c, + 0xba, 0xe5, 0x95, 0xbd, 0x25, 0x50, 0x92, 0x3c, 0x25, 0x15, 0xc9, 0xba, 0x43, + 0xdc, 0x63, 0xbc, 0x65, 0xd6, 0x07, 0x3d, 0x87, 0x8c, 0x0e, 0xbc, 0x0d, 0x90, + 0x87, 0x3d, 0x9a, 0x0e, 0x4a, 0x3d, 0x67, 0x54, 0x4a, 0x3d, 0x63, 0x8b, 0x24, + 0xbd, 0x56, 0x2c, 0xcf, 0xbc, 0x28, 0x2a, 0x23, 0x3d, 0xc6, 0x80, 0xa3, 0xbc, + 0x66, 0xe5, 0x09, 0xbd, 0x69, 0xdb, 0x93, 0x3d, 0x00, 0xc7, 0x7e, 0xbd, 0xe0, + 0x18, 0x06, 0x3d, 0x02, 0xb9, 0x77, 0xbd, 0x43, 0x60, 0x55, 0x3c, 0x46, 0x45, + 0xa4, 0x3d, 0xb1, 0x0a, 0xac, 0x3c, 0x8a, 0xc5, 0x8e, 0x3d, 0xf6, 0x60, 0x31, + 0xbc, 0x9b, 0x2d, 0xb0, 0x3a, 0xc3, 0xc4, 0x4a, 0xbd, 0x96, 0x31, 0x82, 0xbd, + 0x4e, 0x50, 0x59, 0x3c, 0x2f, 0xf7, 0xd4, 0xbd, 0x18, 0xc1, 0x2b, 0xbd, 0xb8, + 0x26, 0x9d, 0x3c, 0xd6, 0x9c, 0x3b, 0xbd, 0xb6, 0xdd, 0x11, 0xbd, 0x4e, 0x51, + 0xd9, 0x3b, 0xbd, 0xfd, 0x3b, 0xbd, 0xe2, 0xe9, 0x35, 0xbc, 0x0d, 0xb1, 0x9c, + 0x3c, 0x02, 0x6e, 0xab, 0x3c, 0xc9, 0x70, 0x25, 0x3c, 0xae, 0xe4, 0x60, 0xbd, + 0x11, 0xc2, 0x49, 0x3d, 0x9b, 0x09, 0xaf, 0xbc, 0xbc, 0x74, 0x75, 0x3c, 0x38, + 0x61, 0x16, 0x3d, 0x0c, 0x99, 0x94, 0x3d, 0x01, 0x83, 0x03, 0xbb, 0xc5, 0x45, + 0x1b, 0x3d, 0x82, 0xab, 0x6f, 0x3c, 0xe1, 0x41, 0xce, 0x3c, 0x86, 0xd5, 0x79, + 0xbd, 0x0e, 0x6c, 0x69, 0x3d, 0xcf, 0xbb, 0x87, 0x3d, 0x65, 0x17, 0xb4, 0xbc, + 0xca, 0x64, 0x07, 0x3e, 0x7d, 0x34, 0xca, 0x3d, 0x40, 0x0d, 0xfb, 0x3c, 0x0e, + 0xea, 0xc2, 0x3c, 0x06, 0x26, 0x88, 0xbc, 0xed, 0x76, 0x84, 0x3d, 0xca, 0x92, + 0xa4, 0xbc, 0x4c, 0x98, 0x74, 0xbd, 0x62, 0x77, 0xdb, 0xbd, 0x97, 0xba, 0x87, + 0x3d, 0xe9, 0x05, 0x95, 0xbd, 0xcc, 0xfd, 0x99, 0x3d, 0x36, 0x01, 0x0b, 0xbd, + 0x23, 0x33, 0x7d, 0x3d, 0x2f, 0xba, 0x5c, 0x3d, 0xaa, 0xed, 0xb2, 0xbc, 0xfc, + 0xe7, 0x97, 0x3d, 0xaa, 0x40, 0x7d, 0x3d, 0x2a, 0x5f, 0x5e, 0x3d, 0x51, 0x91, + 0x7d, 0xbd, 0xc8, 0xf8, 0x2a, 0x3d, 0x7b, 0x8c, 0x2f, 0x3d, 0x35, 0xe0, 0xb9, + 0xbb, 0xc4, 0x0b, 0x56, 
0xbd, 0xcf, 0xd0, 0xb8, 0x3c, 0xf7, 0xef, 0x61, 0x3d, + 0xf5, 0x33, 0x9a, 0x3d, 0x07, 0xd8, 0xf0, 0xbc, 0x34, 0x49, 0x61, 0xbd, 0x7c, + 0x0c, 0x74, 0xbd, 0x0c, 0x85, 0xf7, 0xbc, 0xeb, 0x13, 0xdd, 0xbc, 0x70, 0x3a, + 0xd1, 0x3c, 0xd0, 0x31, 0xe1, 0x3d, 0xbf, 0xb4, 0x90, 0xbd, 0x6c, 0x8a, 0x4f, + 0xbc, 0x89, 0x66, 0x29, 0xbc, 0x5d, 0x8a, 0x18, 0xbd, 0xa4, 0x2b, 0x91, 0xbd, + 0x6a, 0x8d, 0x2b, 0xb9, 0x44, 0x9f, 0xf1, 0xbd, 0xe3, 0x9a, 0x87, 0x3c, 0x3c, + 0x77, 0x5c, 0x3d, 0x1b, 0x6f, 0x50, 0xbd, 0x43, 0x9e, 0x41, 0xbd, 0x13, 0x6f, + 0x5d, 0x3d, 0x44, 0x7f, 0x67, 0x3c, 0xf5, 0x9e, 0x31, 0x3c, 0xc0, 0x48, 0x8b, + 0x3d, 0x48, 0xc4, 0xd0, 0xbc, 0x80, 0x20, 0x17, 0x3a, 0x4c, 0x44, 0x42, 0x3b, + 0xcd, 0x50, 0x0e, 0x3d, 0xf8, 0xdd, 0x6a, 0x3d, 0xa7, 0xa4, 0x57, 0x3c, 0x5c, + 0x60, 0x94, 0x3c, 0xd4, 0x6e, 0x34, 0xbc, 0xa3, 0xa2, 0x8e, 0xbd, 0x88, 0xe0, + 0xad, 0x3d, 0xdb, 0xd6, 0x9f, 0xbd, 0x14, 0xcb, 0x61, 0xbd, 0x02, 0x50, 0x7f, + 0xbd, 0xb9, 0x4c, 0x9d, 0x3d, 0x0d, 0x5a, 0x88, 0x3d, 0x8b, 0x0a, 0x06, 0x3c, + 0xdf, 0x17, 0x8e, 0x3d, 0x75, 0x07, 0x0c, 0x3d, 0x5d, 0xd3, 0x52, 0xbd, 0x22, + 0x56, 0x0b, 0x3a, 0x62, 0x34, 0xcb, 0xbc, 0x55, 0x58, 0xaa, 0x3c, 0x72, 0x28, + 0xa3, 0xbd, 0x60, 0x8d, 0x3f, 0xbc, 0x5b, 0xaa, 0x51, 0xbb, 0xa8, 0x60, 0x31, + 0xbd, 0x8c, 0xc5, 0xfb, 0x3c, 0x90, 0x97, 0x3f, 0xbc, 0x94, 0x3a, 0x45, 0xbd, + 0xb5, 0xc1, 0x8d, 0xbd, 0x07, 0xd0, 0x08, 0x3d, 0x47, 0x05, 0xe2, 0xbb, 0x69, + 0x2e, 0x16, 0x3d, 0xd0, 0x2d, 0x50, 0xbd, 0xd3, 0x88, 0x9e, 0x3d, 0x2f, 0x19, + 0xbb, 0xbc, 0x20, 0x1f, 0xa4, 0x3d, 0x38, 0x4e, 0x9c, 0xbc, 0x71, 0x5a, 0x6e, + 0x3c, 0x47, 0x9a, 0x49, 0x3d, 0x7a, 0x7b, 0x07, 0x3a, 0x54, 0xf5, 0xcd, 0x3d, + 0x54, 0xb0, 0xde, 0x3c, 0xb0, 0xbd, 0x1b, 0x3c, 0x31, 0x85, 0x2c, 0xbd, 0xda, + 0x03, 0xe4, 0xbb, 0x9e, 0xf5, 0x87, 0x3d, 0xef, 0x15, 0x41, 0x3d, 0x82, 0x56, + 0xa3, 0x3d, 0xfa, 0x31, 0x5e, 0xbd, 0xf2, 0x5e, 0x5f, 0xbb, 0x1c, 0xda, 0x9f, + 0x3d, 0x45, 0x09, 0x71, 0xbc, 0x37, 0x80, 0x9a, 0x3b, 0x5a, 0x7a, 0xfd, 0xbc, + 0x37, 0x4f, 0x1a, 0xbe, 0xfa, 0x30, 0xeb, 0xbc, 0xa9, 0xd5, 0x74, 0xbd, 0x18, + 0xad, 0x9b, 0xbc, 0x00, 0xc4, 0xce, 0x3a, 0x98, 0x58, 0x19, 0x3c, 0xf0, 0x22, + 0xa1, 0x3b, 0x84, 0xfa, 0x08, 0xbd, 0x6f, 0xfe, 0x96, 0x3d, 0xe3, 0xc4, 0x90, + 0x3d, 0xa0, 0xc8, 0x5a, 0xbc, 0x97, 0x7f, 0xc2, 0xbc, 0xea, 0xcc, 0xcc, 0x3c, + 0xae, 0xb0, 0x9c, 0xbc, 0x49, 0xdf, 0x97, 0xbc, 0xdd, 0x01, 0x18, 0xbd, 0x66, + 0x26, 0xa7, 0xbc, 0x2a, 0x3d, 0x59, 0xbd, 0x93, 0x1b, 0x1a, 0x3d, 0xd9, 0x46, + 0xcc, 0x3c, 0x00, 0xf0, 0x34, 0x3a, 0x99, 0x3d, 0xc0, 0xbc, 0x08, 0xb1, 0x09, + 0x3c, 0xbe, 0xfb, 0x79, 0x3d, 0xa9, 0x90, 0x86, 0xbd, 0xa2, 0x17, 0x8f, 0xbd, + 0x30, 0x94, 0x8a, 0xbb, 0xd9, 0xd7, 0x82, 0x3d, 0xe4, 0xea, 0x2f, 0xbd, 0x7e, + 0x59, 0x73, 0xbd, 0x46, 0x73, 0xe2, 0xbc, 0xe0, 0xd4, 0x42, 0xbc, 0x3c, 0x6c, + 0xdf, 0x3c, 0x08, 0xce, 0xf9, 0x3c, 0xfc, 0xe4, 0x79, 0xbd, 0xac, 0x5c, 0x4f, + 0xbd, 0x60, 0x67, 0x12, 0xbb, 0xb2, 0xcf, 0xbf, 0xbc, 0xe2, 0x7c, 0x31, 0xbd, + 0xb6, 0xc7, 0x18, 0x3d, 0xdc, 0x89, 0x90, 0xbd, 0x0c, 0xf7, 0x99, 0xbc, 0xa0, + 0x2a, 0x3c, 0xbd, 0x92, 0x1b, 0x38, 0x3d, 0x34, 0xe9, 0x86, 0xbd, 0x69, 0x76, + 0x6d, 0xbd, 0x76, 0x2b, 0x6e, 0x3d, 0x70, 0x53, 0x3f, 0x3d, 0x22, 0xe5, 0x4c, + 0x3d, 0x52, 0x57, 0xfc, 0xbc, 0xf8, 0x6b, 0x31, 0xbd, 0xb4, 0xb1, 0xa3, 0x3c, + 0x10, 0x0c, 0x60, 0x3c, 0xbc, 0x80, 0x85, 0xbd, 0xe6, 0x9f, 0x78, 0xbd, 0x00, + 0x20, 0x90, 0xba, 0xbc, 0x54, 0x5d, 0xbd, 0x6c, 0xd7, 0xc5, 0xbc, 0x87, 0x6b, + 0x87, 0x3d, 0x0a, 0x34, 0x0c, 0x3d, 0x44, 0xe5, 0x47, 0xbd, 0xe0, 0xd3, 0x05, + 0x3b, 0x23, 0x83, 0x11, 0xbd, 0xab, 0x22, 0x8c, 0xbd, 
0x48, 0x17, 0xe9, 0x3c, + 0xbd, 0x8a, 0x89, 0x3d, 0xc0, 0x3a, 0x71, 0x3b, 0x08, 0x52, 0x61, 0x3c, 0x40, + 0xb4, 0x6d, 0x3c, 0xa0, 0x6a, 0xa0, 0x3b, 0x00, 0xc4, 0xb9, 0x39, 0x74, 0x71, + 0xa8, 0x3c, 0x13, 0xa7, 0x90, 0xbd, 0x04, 0xb5, 0xb4, 0xbc, 0x70, 0x36, 0x31, + 0x3c, 0x28, 0x25, 0x0f, 0x3c, 0xfc, 0x08, 0x46, 0xbd, 0x80, 0xa0, 0xa5, 0xba, + 0xe2, 0x11, 0x6f, 0xbd, 0x39, 0xf0, 0x31, 0xbd, 0xd8, 0xbe, 0x2f, 0xbd, 0x68, + 0x21, 0x4d, 0xbd, 0x64, 0x1b, 0x8e, 0xbd, 0x80, 0xd4, 0x78, 0xba, 0x92, 0x81, + 0x5a, 0xbd, 0xf4, 0xf9, 0x57, 0xbd, 0x80, 0x59, 0xa2, 0x3c, 0x22, 0xe6, 0xde, + 0xbc, 0x91, 0xdf, 0x87, 0xbd, 0x3a, 0xea, 0x22, 0xbd, 0xba, 0xf7, 0x75, 0x3d, + 0xba, 0x8a, 0x0c, 0x3d, 0x81, 0xa7, 0x8d, 0xbd, 0x90, 0xee, 0x50, 0xbd, 0x14, + 0xa3, 0x90, 0xbd, 0xdc, 0xdf, 0x81, 0x3c, 0x4a, 0xb5, 0x66, 0xbd, 0x10, 0xa0, + 0x94, 0x3b, 0x9a, 0x12, 0x2d, 0xbd, 0xda, 0x60, 0x42, 0xbd, 0xea, 0x9f, 0xb0, + 0xbc, 0x38, 0xfc, 0x02, 0x3d, 0xa6, 0x08, 0x04, 0x3d, 0x23, 0xf6, 0x03, 0xbd, + 0xa2, 0x7a, 0x63, 0x3d, 0x26, 0xca, 0x36, 0x3d, 0x96, 0xd3, 0x0d, 0x3d, 0x3f, + 0xfd, 0x89, 0x3d, 0x08, 0xa3, 0x24, 0xbd, 0x28, 0x10, 0x57, 0xbc, 0xbb, 0xb9, + 0x83, 0x3d, 0x50, 0x2b, 0xb5, 0x3b, 0x9c, 0x94, 0x19, 0xbc, 0xc4, 0x4d, 0x9a, + 0xbc, 0x91, 0xf8, 0x0d, 0xbd, 0x63, 0x13, 0x7d, 0xbd, 0xed, 0xd0, 0x02, 0xbd, + 0x1c, 0x10, 0x85, 0xbd, 0x00, 0xca, 0x36, 0x3c, 0xc8, 0x17, 0x7a, 0x3c, 0x24, + 0x32, 0xc7, 0xbc, 0x88, 0x75, 0xa5, 0x3c, 0x2e, 0x18, 0x39, 0xbd, 0xd4, 0xa9, + 0xfb, 0x3c, 0x8c, 0x61, 0x48, 0x3d, 0x40, 0x34, 0xb1, 0xba, 0xb7, 0xec, 0x83, + 0x3d, 0x7c, 0x1d, 0x5a, 0x3d, 0x30, 0x5c, 0x91, 0x3c, 0xcb, 0x9d, 0x85, 0x3d, + 0x74, 0xa8, 0x35, 0x3d, 0x93, 0x54, 0x76, 0xbd, 0xa3, 0xb8, 0x8c, 0xbd, 0xf3, + 0x38, 0x8d, 0xbd, 0x45, 0x41, 0x8d, 0xbd, 0xb0, 0x35, 0x2c, 0x3d, 0x79, 0x2f, + 0x91, 0x3d, 0x1c, 0xa0, 0xde, 0xbc, 0x26, 0xd7, 0x53, 0xbd, 0xec, 0x6e, 0x11, + 0x3d, 0x1c, 0x44, 0x8f, 0x3c, 0x2b, 0x97, 0x2b, 0xbd, 0x78, 0x4e, 0x62, 0xbc, + 0x4a, 0x20, 0xe3, 0xbc, 0x2e, 0x7e, 0xd5, 0xbc, 0x34, 0xe0, 0xcc, 0xbc, 0x00, + 0xd9, 0x05, 0x3d, 0x6e, 0xe3, 0xd8, 0xbc, 0x32, 0x01, 0x51, 0x3d, 0x57, 0x4a, + 0x83, 0x3d, 0x98, 0x90, 0x4c, 0xbd, 0x0d, 0x8e, 0x8b, 0x3d, 0x76, 0x2c, 0x32, + 0x3d, 0x6a, 0x76, 0x91, 0xbd, 0xc8, 0xf9, 0x85, 0x3c, 0x40, 0x2b, 0x80, 0x3a, + 0xe0, 0x00, 0xe3, 0xbb, 0x00, 0x06, 0x79, 0xb9, 0x27, 0xbd, 0x8f, 0x3d, 0xce, + 0x76, 0x2c, 0x3d, 0x56, 0x63, 0xd7, 0xbc, 0x30, 0x52, 0xf0, 0xbb, 0x69, 0x1f, + 0x85, 0xbd, 0x7e, 0xdb, 0x64, 0xbd, 0x85, 0xd6, 0x87, 0x3d, 0x92, 0xc0, 0x70, + 0x3d, 0x4c, 0x7a, 0x78, 0xbc, 0x6c, 0x7d, 0x2b, 0xbd, 0x6f, 0x2b, 0x85, 0x3d, + 0x98, 0x48, 0x39, 0xbd, 0x8c, 0x9d, 0xce, 0x3c, 0x08, 0xf9, 0x5c, 0xbc, 0xe8, + 0x5a, 0xcd, 0x3c, 0x88, 0xb0, 0x3c, 0x3d, 0xf8, 0x88, 0x4e, 0xbd, 0x30, 0x8f, + 0x38, 0x3c, 0xba, 0xa1, 0xc9, 0xbc, 0xba, 0xdc, 0x6d, 0x3d, 0xc0, 0x39, 0x5a, + 0xbb, 0xa6, 0x2d, 0x1d, 0x3d, 0x04, 0xde, 0xe4, 0x3c, 0x24, 0x67, 0x4f, 0xbd, + 0xde, 0xc0, 0x7c, 0x3d, 0x31, 0x68, 0x09, 0xbd, 0x01, 0x59, 0x80, 0xbd, 0x13, + 0x09, 0x91, 0x3d, 0xc8, 0xdd, 0x18, 0x3d, 0x2b, 0x88, 0x91, 0x3d, 0x50, 0xef, + 0x80, 0x3c, 0xec, 0x4a, 0x65, 0xbc, 0xb0, 0xca, 0x0a, 0x3d, 0x48, 0x1f, 0x29, + 0xbd, 0x56, 0xe9, 0x3a, 0x3d, 0xd0, 0x9c, 0x67, 0xbc, 0xe0, 0x47, 0xdb, 0xbc, + 0xd8, 0x70, 0x4a, 0xbd, 0x86, 0x63, 0x39, 0xbd, 0xfb, 0x2a, 0x10, 0xbd, 0xbc, + 0xfb, 0x42, 0xbd, 0xdc, 0x59, 0xe4, 0xbc, 0x2e, 0x08, 0x5f, 0xbd, 0x34, 0xb6, + 0xe1, 0x3c, 0x76, 0x68, 0x22, 0x3d, 0x18, 0x3d, 0x14, 0x3c, 0xa5, 0xa2, 0x8b, + 0xbd, 0x9c, 0x97, 0x87, 0xbd, 0xbd, 0x22, 0x87, 0x3d, 0x20, 0x18, 0x57, 0x3c, + 0xb6, 
0x45, 0x5e, 0x3d, 0xa4, 0x1e, 0x63, 0xbd, 0x88, 0x1f, 0x68, 0x3c, 0xe0, + 0x00, 0x4f, 0x3d, 0x34, 0xe0, 0x5a, 0xbc, 0xd4, 0xd3, 0x61, 0xbc, 0x40, 0x8f, + 0x14, 0xbb, 0xae, 0x4e, 0x94, 0xbc, 0x8d, 0x80, 0x61, 0xbd, 0x11, 0xcc, 0x85, + 0x3d, 0xb4, 0x7b, 0x24, 0xbd, 0x3e, 0x81, 0x15, 0x3d, 0xaa, 0xe5, 0x85, 0xbd, + 0xa0, 0xa4, 0x2c, 0xbb, 0x02, 0x5e, 0x25, 0x3d, 0x5d, 0x8b, 0x37, 0xbd, 0xa1, + 0xb0, 0x25, 0xbd, 0x4a, 0xa5, 0x6b, 0x3d, 0xd3, 0x4a, 0x92, 0x3d, 0x40, 0x57, + 0x06, 0x3d, 0x20, 0xdd, 0x30, 0x3b, 0xb0, 0x9e, 0xd3, 0x3c, 0x62, 0xb5, 0xd8, + 0xbc, 0xa0, 0xec, 0x93, 0xbb, 0x20, 0xc4, 0x7a, 0x3b, 0xc0, 0x64, 0xfe, 0x3b, + 0xcb, 0xb4, 0x90, 0x3d, 0x3f, 0x87, 0x8c, 0x3d, 0xfa, 0x94, 0x21, 0x3d, 0x9c, + 0xc3, 0x03, 0x3d, 0xc2, 0x4f, 0x8d, 0xbc, 0x22, 0x1e, 0xd2, 0xbc, 0xa0, 0xd5, + 0x66, 0xbc, 0xba, 0xf8, 0xcd, 0xbc, 0x7f, 0x26, 0x60, 0xbd, 0x6c, 0x27, 0x90, + 0x3c, 0xf4, 0xd5, 0x85, 0x3c, 0xc0, 0x88, 0x3c, 0xbb, 0x8e, 0x17, 0x9d, 0xbc, + 0x34, 0xb8, 0xef, 0x3c, 0x78, 0x16, 0xbd, 0x3c, 0x41, 0x5e, 0x90, 0xbd, 0x3e, + 0x1c, 0x40, 0x3d, 0xeb, 0xf2, 0x8c, 0x3d, 0xd4, 0xb2, 0xa8, 0xbc, 0x0a, 0xae, + 0x29, 0x3d, 0x40, 0x78, 0x1c, 0xbb, 0x60, 0xfb, 0xd1, 0x3c, 0x9d, 0xd0, 0x84, + 0x3d, 0x8a, 0xcc, 0x08, 0x3d, 0x72, 0x4d, 0x41, 0x3d, 0xa9, 0x49, 0x50, 0xbd, + 0x92, 0x44, 0x1c, 0x3d, 0xc8, 0x15, 0x5f, 0xbd, 0x1a, 0xda, 0xb6, 0xbc, 0xb4, + 0x03, 0xd1, 0x3c, 0xdc, 0x8e, 0xb0, 0x3c, 0x88, 0x61, 0x7a, 0xbc, 0xb0, 0xab, + 0xc4, 0xbb, 0xa2, 0x9f, 0x35, 0xbd, 0xac, 0xc1, 0x1e, 0xbd, 0x78, 0xd0, 0x54, + 0x3d, 0x22, 0x03, 0xa9, 0xbc, 0x00, 0x71, 0x30, 0xbb, 0x30, 0xaa, 0xc8, 0x3b, + 0xa9, 0x9c, 0x35, 0xbd, 0x00, 0xb3, 0x09, 0xbb, 0x40, 0x51, 0x2e, 0x3c, 0xc8, + 0xb4, 0x23, 0x3c, 0x6d, 0xf4, 0x06, 0xbd, 0xaa, 0x77, 0x6f, 0x3d, 0xce, 0xc4, + 0xb1, 0xbc, 0x6f, 0x91, 0x8b, 0x3d, 0x5f, 0xc4, 0x8a, 0x3d, 0xe4, 0x1f, 0xac, + 0x3c, 0x4c, 0xc1, 0x89, 0x3c, 0x4c, 0x09, 0x5d, 0xbd, 0x38, 0x91, 0x3e, 0x3c, + 0xe0, 0x15, 0x30, 0xbd, 0x60, 0x09, 0xd2, 0x3c, 0xe0, 0x4f, 0x35, 0xbb, 0xe8, + 0xf2, 0xdf, 0xbc, 0x40, 0xa5, 0xcc, 0xba, 0x28, 0xaa, 0x04, 0xbc, 0xb4, 0x3b, + 0x3d, 0xbc, 0xa8, 0xbc, 0x9d, 0x3c, 0x22, 0x77, 0x51, 0x3d, 0xd3, 0x53, 0x48, + 0xbd, 0x80, 0x2a, 0x2c, 0x3b, 0x4e, 0x95, 0x79, 0x3d, 0x9c, 0x2c, 0x52, 0xbd, + 0xac, 0x7e, 0xd9, 0x3c, 0x76, 0xd7, 0x78, 0x3d, 0x00, 0xe8, 0x78, 0xbd, 0x2e, + 0x63, 0x0f, 0x3d, 0xeb, 0x59, 0x14, 0xbd, 0x84, 0xd4, 0x1c, 0xbc, 0x1d, 0x54, + 0x1a, 0xbd, 0xe0, 0x16, 0x5c, 0xbb, 0x5c, 0xf1, 0x48, 0x3d, 0x94, 0x95, 0x59, + 0xbc, 0x48, 0x14, 0x37, 0xbd, 0x3e, 0x60, 0x76, 0x3d, 0xb4, 0x88, 0xdb, 0x3c, + 0x24, 0xf3, 0x8b, 0xbc, 0xb8, 0x6e, 0x0f, 0x3d, 0x00, 0x2c, 0xda, 0x3a, 0x79, + 0x80, 0x88, 0x3d, 0x58, 0xf7, 0x26, 0x3c, 0x10, 0x19, 0x45, 0x3d, 0xf9, 0xba, + 0x6a, 0xbd, 0x0e, 0x30, 0x43, 0x3d, 0xe0, 0x09, 0x68, 0x3b, 0x51, 0x84, 0x8f, + 0xbd, 0x6a, 0xa1, 0x7a, 0xbd, 0xbc, 0x1c, 0x72, 0xbd, 0x94, 0xf7, 0x75, 0xbd, + 0xc8, 0x32, 0x69, 0xbd, 0xf5, 0x29, 0x1e, 0xbd, 0x00, 0xe7, 0x59, 0x3a, 0x90, + 0x9c, 0x84, 0xbd, 0x5c, 0x5f, 0x2f, 0xbd, 0x50, 0x8c, 0x95, 0xbb, 0x00, 0x13, + 0x85, 0xbd, 0x26, 0xab, 0x7f, 0xbd, 0xc8, 0x91, 0x2a, 0xbc, 0x34, 0xda, 0xd2, + 0xbc, 0x2c, 0xb7, 0x4b, 0x3d, 0x73, 0xe4, 0x2b, 0xbd, 0x48, 0x46, 0x8f, 0xbd, + 0x0c, 0xa7, 0x36, 0xbd, 0x58, 0x23, 0x9f, 0x3c, 0xec, 0x5b, 0x2e, 0x3d, 0x28, + 0xde, 0x34, 0xbd, 0x00, 0xd5, 0x8e, 0x3b, 0x76, 0xa2, 0x76, 0x3d, 0x64, 0xe8, + 0x4d, 0x3d, 0x47, 0xc2, 0x82, 0xbd, 0x90, 0x0c, 0x8b, 0xbd, 0x9c, 0x98, 0x1a, + 0x3d, 0x74, 0xd4, 0xd1, 0xbc, 0xd6, 0x3b, 0x78, 0x3d, 0x88, 0xad, 0x04, 0xbd, + 0x5c, 0x4e, 0xbf, 0x3c, 0x20, 0xd8, 
0x5b, 0x3c, 0x68, 0x77, 0x0e, 0xbc, 0xc0, + 0x8a, 0xc8, 0x3b, 0x00, 0x68, 0x5d, 0xba, 0x4c, 0x05, 0x30, 0x3d, 0x20, 0xb7, + 0x56, 0x3d, 0xa0, 0x6e, 0xef, 0x3c, 0xb4, 0x50, 0x1c, 0x3d, 0x5c, 0x0f, 0x68, + 0xbd, 0xf7, 0x3c, 0x53, 0xbd, 0x96, 0xa5, 0x0c, 0x3d, 0x3a, 0x6c, 0x07, 0x3d, + 0xa0, 0x60, 0x2c, 0xbd, 0x20, 0xaf, 0xbf, 0xbc, 0x00, 0x2d, 0x05, 0xbb, 0xe0, + 0x97, 0x4b, 0x3b, 0x32, 0xdc, 0x37, 0x3d, 0xe2, 0x39, 0x54, 0xbd, 0x2a, 0xde, + 0xeb, 0xbc, 0x1e, 0x8b, 0x6d, 0x3d, 0x0c, 0x92, 0xd6, 0xbc, 0xec, 0x48, 0x19, + 0xbc, 0x23, 0xd9, 0x90, 0xbd, 0x84, 0x8b, 0x83, 0xbd, 0xc8, 0x8c, 0x7c, 0x3c, + 0xfe, 0xca, 0x7d, 0xbd, 0x06, 0xb7, 0x69, 0x3d, 0x34, 0x35, 0xb0, 0x3c, 0x52, + 0x14, 0x56, 0xbd, 0xf4, 0xf3, 0x43, 0xbd, 0x34, 0x5e, 0xbf, 0xbc, 0x9c, 0x32, + 0x1e, 0x3d, 0xa0, 0x4d, 0xe0, 0x3b, 0x00, 0x68, 0x5d, 0xb8, 0x9e, 0x47, 0x7b, + 0x3d, 0xe1, 0xcd, 0x8b, 0x3d, 0xb8, 0x10, 0x8f, 0xbc, 0xc8, 0x30, 0x28, 0x3c, + 0xec, 0x42, 0x28, 0x3d, 0xfe, 0xea, 0x8a, 0xbd, 0x36, 0x76, 0x1a, 0xbd, 0xfa, + 0x9c, 0xca, 0xbc, 0x10, 0xe9, 0x82, 0xbd, 0x72, 0x8b, 0x7b, 0x3d, 0x46, 0x75, + 0x1c, 0xbd, 0x5a, 0xb9, 0x06, 0xbd, 0x6c, 0xa7, 0x25, 0xbc, 0x6a, 0x37, 0xd3, + 0xbc, 0xbc, 0x78, 0x85, 0x3c, 0x98, 0xb7, 0x01, 0x3d, 0x3c, 0xb7, 0x0d, 0x3d, + 0x3c, 0x57, 0x21, 0xbc, 0x28, 0xfb, 0xa7, 0x3c, 0x18, 0x3f, 0x49, 0x3c, 0x81, + 0x34, 0x8d, 0xbd, 0xb4, 0xfb, 0x6e, 0xbd, 0x60, 0x97, 0x95, 0x3c, 0xac, 0xdd, + 0x86, 0xbc, 0xd8, 0x6e, 0xda, 0x3c, 0xd8, 0xd9, 0x3d, 0x3d, 0x90, 0xa6, 0xea, + 0x3c, 0x40, 0x67, 0x3f, 0x3d, 0x3a, 0x43, 0x69, 0x3d, 0x0a, 0x20, 0x5e, 0x3d, + 0x33, 0x91, 0x12, 0xbd, 0xb4, 0xc5, 0x31, 0xbd, 0x0e, 0x96, 0x45, 0x3d, 0xc6, + 0x22, 0x37, 0xbd, 0x7c, 0x12, 0x44, 0x3d, 0xc9, 0x61, 0x8a, 0x3d, 0x1c, 0x66, + 0x44, 0x3d, 0xa2, 0x51, 0x30, 0x3d, 0xc8, 0xdb, 0xd9, 0x3c, 0xd3, 0xfb, 0x8e, + 0xbd, 0x08, 0x6a, 0x91, 0xbd, 0xea, 0x2e, 0x48, 0xbd, 0x60, 0x5b, 0x22, 0xbb, + 0x06, 0x39, 0x53, 0x3d, 0x84, 0xb4, 0x0b, 0xbd, 0xa0, 0x77, 0xfa, 0x3b, 0x84, + 0xaf, 0xaa, 0x3c, 0x47, 0xd2, 0x86, 0xbd, 0xe3, 0xef, 0x43, 0xbd, 0x36, 0x8d, + 0x16, 0x3d, 0x85, 0xa6, 0x85, 0x3d, 0x8e, 0xda, 0xa0, 0xbc, 0xc3, 0x58, 0x80, + 0xbd, 0x93, 0x30, 0x0f, 0xbd, 0x0c, 0x85, 0xcf, 0xbc, 0xc0, 0x8c, 0x2a, 0x3c, + 0x02, 0xe2, 0x0d, 0xbd, 0xe9, 0xf8, 0x8c, 0xbd, 0x15, 0x8d, 0x8b, 0x3d, 0xf3, + 0x1f, 0x8b, 0xbd, 0x0f, 0xa0, 0x80, 0xbd, 0xee, 0x04, 0x63, 0x3d, 0xb4, 0x7a, + 0xf6, 0xbc, 0x60, 0x5b, 0x2e, 0xbc, 0x04, 0x6d, 0x42, 0x3d, 0x8a, 0xfc, 0x1c, + 0x3d, 0x52, 0xb0, 0x27, 0x3d, 0xe8, 0xf9, 0x35, 0xbd, 0xd4, 0xc2, 0x1b, 0x3d, + 0x00, 0x3a, 0x0b, 0xbb, 0x80, 0x7e, 0x4b, 0x3c, 0x06, 0xba, 0x3e, 0xbd, 0x70, + 0xc9, 0x35, 0xbd, 0xe0, 0x8b, 0x9d, 0xbb, 0x16, 0x05, 0x2f, 0xbd, 0xa0, 0xeb, + 0x03, 0x3c, 0x40, 0x3e, 0x95, 0xbc, 0xea, 0x76, 0x73, 0xbd, 0x90, 0xb0, 0xe8, + 0x3c, 0x3e, 0x61, 0x42, 0xbd, 0x17, 0x02, 0x8d, 0xbd, 0x42, 0x66, 0x1d, 0x3d, + 0xfe, 0x31, 0x68, 0x3d, 0x52, 0x8e, 0x30, 0xbd, 0x6b, 0xca, 0x10, 0xbd, 0xbd, + 0xcc, 0x80, 0xbd, 0x38, 0x91, 0x53, 0xbd, 0x90, 0xd7, 0xd3, 0x3c, 0x00, 0x0c, + 0xf4, 0x3b, 0x82, 0xf5, 0x3f, 0xbd, 0xb2, 0xa9, 0x04, 0x3d, 0x62, 0x67, 0x5c, + 0x3d, 0x86, 0xab, 0x91, 0xbc, 0xc2, 0x2b, 0xe8, 0xbc, 0x3a, 0x8a, 0x67, 0xbd, + 0xcc, 0x83, 0xdb, 0x3c, 0xf0, 0x8a, 0x03, 0x3c, 0x94, 0x78, 0x53, 0x3d, 0x9c, + 0x1b, 0xd4, 0x3c, 0xdb, 0xf9, 0x89, 0x3d, 0x40, 0xa5, 0x10, 0x3b, 0x89, 0xed, + 0x80, 0xbd, 0x6e, 0xb8, 0x57, 0xbd, 0x12, 0xc2, 0xcf, 0xbc, 0x44, 0x32, 0xb1, + 0x3c, 0xd5, 0xed, 0x34, 0xbd, 0x5e, 0x6c, 0x5c, 0xbd, 0x68, 0x69, 0x85, 0x3c, + 0x30, 0xdb, 0xb6, 0xbb, 0x00, 0x7f, 0xe0, 0x3c, 0x80, 0x24, 0x1e, 
0x3b, 0x78, + 0x6f, 0x81, 0xbc, 0x3a, 0x27, 0x1b, 0x3d, 0x7f, 0xb5, 0x8a, 0xbd, 0xbb, 0xc1, + 0x8e, 0x3d, 0xa8, 0x7e, 0x69, 0x3c, 0x00, 0x80, 0x47, 0xbb, 0x21, 0xb9, 0x15, + 0xbd, 0x14, 0x0b, 0x8e, 0x3c, 0xa2, 0x1b, 0x55, 0x3d, 0x28, 0xea, 0x5b, 0xbd, + 0x10, 0x9a, 0x43, 0x3d, 0x40, 0xf6, 0x8a, 0x3a, 0x58, 0xb1, 0x92, 0xbc, 0x5c, + 0x0a, 0x4e, 0xbd, 0x10, 0xec, 0x1f, 0xbd, 0xa8, 0x31, 0xa7, 0x3c, 0x60, 0xfa, + 0x9f, 0xbb, 0xf0, 0x04, 0xa3, 0xbb, 0xc4, 0xd8, 0x5f, 0xbd, 0xba, 0x5f, 0x66, + 0xbd, 0x52, 0x94, 0x97, 0xbc, 0x1a, 0x9b, 0x22, 0xbd, 0xaa, 0x28, 0x59, 0x3d, + 0xaa, 0x06, 0x64, 0xbd, 0xe7, 0xc2, 0x83, 0xbd, 0xd0, 0x3d, 0xd0, 0xbc, 0x00, + 0x8c, 0xa3, 0x39, 0xd0, 0x27, 0x0c, 0xbc, 0x40, 0x8f, 0x79, 0xbc, 0x9e, 0x32, + 0x7f, 0x3d, 0xac, 0x9b, 0xfd, 0xbc, 0xb1, 0x17, 0x91, 0x3d, 0xa8, 0xca, 0x4e, + 0x3d, 0x40, 0xc3, 0xb7, 0x3a, 0xc0, 0x8e, 0x78, 0xbb, 0x3f, 0x3c, 0x83, 0x3d, + 0x47, 0xdc, 0x81, 0xbd, 0x5b, 0xe6, 0x1c, 0xbd, 0x70, 0xe3, 0xc8, 0xbc, 0x70, + 0x12, 0xd6, 0xbb, 0x0c, 0xb6, 0xe3, 0x3c, 0x88, 0x2a, 0x22, 0x3c, 0xd6, 0xbf, + 0x8d, 0xbd, 0xde, 0x15, 0x20, 0x3d, 0x76, 0x83, 0x3e, 0xbd, 0x85, 0x35, 0x80, + 0x3d, 0xc1, 0x0b, 0x87, 0x3d, 0xbf, 0x64, 0x18, 0xbd, 0x80, 0x22, 0x68, 0x3b, + 0xc4, 0xb0, 0xb0, 0x3c, 0xa2, 0xf2, 0x4f, 0xbd, 0xb6, 0x63, 0x04, 0x3d, 0xc0, + 0x4a, 0xc9, 0x3c, 0x36, 0x66, 0xc0, 0xbc, 0x64, 0x7a, 0x4c, 0x3d, 0xc1, 0x5b, + 0x8c, 0x3d, 0xae, 0xa2, 0x41, 0x3d, 0x66, 0x93, 0x01, 0x3d, 0x6c, 0xb7, 0x37, + 0xbd, 0x8c, 0x03, 0x28, 0xbd, 0x7c, 0xf6, 0x69, 0xbd, 0xa2, 0xe7, 0x0d, 0xbd, + 0xb0, 0xf3, 0x41, 0x3d, 0xc0, 0xbf, 0xc4, 0x3b, 0xe2, 0x58, 0x46, 0xbd, 0x02, + 0xb4, 0x60, 0x3d, 0xa2, 0xf8, 0x29, 0x3d, 0x90, 0xf7, 0xc8, 0x3b, 0xee, 0xad, + 0x43, 0x3d, 0x1b, 0x51, 0x12, 0xbd, 0xee, 0xc3, 0x91, 0xbd, 0x20, 0xad, 0x58, + 0x3c, 0xc6, 0x54, 0x3a, 0x3d, 0xea, 0xba, 0x60, 0xbd, 0x7e, 0x31, 0x22, 0x3d, + 0x98, 0xe6, 0x80, 0xbd, 0x00, 0x41, 0x29, 0x3b, 0x85, 0xec, 0x8c, 0x3d, 0x7a, + 0x8e, 0x3e, 0x3d, 0x42, 0x31, 0xfc, 0xbc, 0x58, 0x3c, 0x08, 0x3c, 0xdc, 0x04, + 0xb5, 0xbc, 0x9e, 0xbf, 0x0f, 0xbd, 0x70, 0xad, 0x2a, 0xbc, 0x6c, 0x83, 0x8c, + 0xbc, 0x6a, 0xd4, 0x6c, 0xbd, 0x62, 0x1b, 0x8e, 0xbc, 0x94, 0x48, 0x1f, 0xbd, + 0x35, 0xe0, 0x3d, 0xbd, 0x60, 0x91, 0x88, 0x3b, 0x6c, 0x16, 0x07, 0x3d, 0x30, + 0xa0, 0x93, 0x3b, 0x3c, 0xec, 0x5e, 0xbc, 0x66, 0xbf, 0x51, 0xbd, 0xfc, 0x42, + 0x47, 0x3d, 0x78, 0x73, 0x71, 0x3c, 0x62, 0x96, 0x89, 0xbd, 0x50, 0x2b, 0xca, + 0x3c, 0x98, 0xc5, 0x21, 0x3c, 0xbb, 0x4b, 0x19, 0xbd, 0x36, 0x22, 0x75, 0x3d, + 0x44, 0x6e, 0x7d, 0xbd, 0xec, 0x88, 0x8d, 0x3c, 0xa8, 0x57, 0x0e, 0x3c, 0x96, + 0x97, 0x01, 0x3d, 0x1c, 0x9c, 0x59, 0x3d, 0xc4, 0x0b, 0x31, 0x3d, 0x60, 0xf0, + 0x6c, 0xbc, 0xb8, 0xa9, 0xb4, 0x3c, 0xd8, 0xbb, 0x33, 0xbc, 0x98, 0x35, 0x99, + 0x3c, 0xd2, 0x49, 0x3d, 0xbd, 0xe6, 0xc9, 0x5b, 0x3d, 0x42, 0xf7, 0x41, 0x3d, + 0xda, 0x13, 0x37, 0xbd, 0x96, 0x91, 0x94, 0xbc, 0xb8, 0xde, 0x89, 0x3c, 0xda, + 0x37, 0x08, 0xbd, 0x20, 0xda, 0x3e, 0x3c, 0xda, 0xe8, 0x61, 0xbd, 0x70, 0x8a, + 0x29, 0x3d, 0x18, 0xa4, 0x8f, 0xbd, 0x20, 0xee, 0x56, 0x3c, 0x70, 0xc3, 0xc8, + 0xbc, 0x5c, 0xf4, 0x99, 0x3c, 0x54, 0xd5, 0x4b, 0xbd, 0x88, 0xcf, 0x6a, 0x3c, + 0xa5, 0xc7, 0x1c, 0xbd, 0x10, 0x98, 0xb3, 0xbb, 0x9a, 0xe0, 0x86, 0xbd, 0x3e, + 0x34, 0x87, 0xbd, 0xfa, 0x36, 0x7d, 0x3d, 0x40, 0x64, 0xfe, 0xbc, 0xd0, 0x4f, + 0x67, 0xbd, 0x21, 0xda, 0x72, 0xbd, 0x2e, 0x02, 0x38, 0xbd, 0xc6, 0xd9, 0xff, + 0xbc, 0x1a, 0x30, 0xb9, 0xbc, 0x58, 0xea, 0x58, 0x3c, 0xb1, 0xb7, 0x03, 0xbd, + 0x80, 0x5b, 0xfc, 0x3a, 0x43, 0x60, 0x80, 0x3d, 0xa8, 0x67, 0x4a, 0xbd, 0x68, + 0xd8, 0x3e, 0x3c, 
0xf0, 0xe8, 0x2a, 0x3c, 0x68, 0x26, 0x3f, 0xbd, 0x28, 0x26, + 0x73, 0xbd, 0x38, 0xe5, 0x24, 0x3d, 0x00, 0xb0, 0xa1, 0xba, 0x7e, 0x0f, 0x18, + 0xbd, 0x35, 0x0d, 0x7c, 0xbd, 0x14, 0xa7, 0x3f, 0x3d, 0x16, 0x49, 0x0e, 0x3d, + 0x2e, 0xd8, 0x90, 0xbd, 0x50, 0xc3, 0x21, 0xbd, 0xd4, 0x13, 0x44, 0x3d, 0x70, + 0x10, 0xfd, 0x3b, 0x7b, 0x43, 0x87, 0x3d, 0x64, 0xb7, 0xf9, 0x3c, 0xd6, 0xc6, + 0xb7, 0xbc, 0x00, 0xd8, 0xbb, 0x3b, 0xe0, 0x1b, 0x42, 0xbb, 0x68, 0x5c, 0xcf, + 0xbc, 0xea, 0xfb, 0x8e, 0xbd, 0xdc, 0x09, 0x33, 0x3d, 0x80, 0xef, 0xb9, 0x3c, + 0x00, 0xde, 0x92, 0xb9, 0x31, 0x42, 0x08, 0xbd, 0x80, 0x6d, 0x40, 0x3b, 0x80, + 0xab, 0x20, 0x3d, 0xc0, 0x60, 0xc3, 0xba, 0x0b, 0xb6, 0x5e, 0xbd, 0xd4, 0x28, + 0x3e, 0xbd, 0x47, 0x7b, 0x87, 0x3d, 0x81, 0x52, 0x84, 0x3d, 0x90, 0x8e, 0xc2, + 0x3c, 0x04, 0x5b, 0xf3, 0xbc, 0x70, 0xa9, 0xea, 0x3c, 0x55, 0x55, 0x4d, 0xbd, + 0x52, 0x8b, 0x59, 0xbd, 0xf2, 0xeb, 0x56, 0x3d, 0x1e, 0xc7, 0x3f, 0x3d, 0xe0, + 0x52, 0xa3, 0x3b, 0x16, 0x93, 0x9d, 0xbc, 0x28, 0xeb, 0x36, 0x3d, 0x70, 0x4c, + 0x1d, 0x3d, 0x8d, 0x81, 0x14, 0xbd, 0xb0, 0x22, 0xa0, 0xbb, 0x50, 0xfa, 0x87, + 0x3c, 0x33, 0xc6, 0x2d, 0xbd, 0xd3, 0xd8, 0x85, 0x3d, 0xe8, 0xfd, 0x15, 0x3c, + 0x20, 0x79, 0xe4, 0x3b, 0xb0, 0xd4, 0x4f, 0xbd, 0x24, 0xe9, 0xb5, 0x3c, 0xba, + 0x47, 0x27, 0x3d, 0x23, 0xef, 0x02, 0xbd, 0xf0, 0xac, 0x31, 0x3d, 0x62, 0xde, + 0xdd, 0xbc, 0x2c, 0xa0, 0x29, 0x3d, 0xa5, 0xec, 0x85, 0x3d, 0xa9, 0x1b, 0x8d, + 0x3d, 0x2c, 0x6c, 0xa2, 0xbc, 0xf0, 0xc7, 0x37, 0xbc, 0x6c, 0xf7, 0xc5, 0xbc, + 0xf4, 0x1d, 0x1c, 0xbc, 0x20, 0x3c, 0xc9, 0x3b, 0x9d, 0xff, 0x0b, 0xbd, 0x10, + 0xa3, 0x53, 0x3d, 0x64, 0xbb, 0xc9, 0xbc, 0xfc, 0x8d, 0xe8, 0xbc, 0x20, 0x1f, + 0x5a, 0x3c, 0x11, 0xe2, 0x17, 0xbd, 0xe0, 0x37, 0x97, 0x3b, 0x88, 0x44, 0x2a, + 0xbd, 0x88, 0x79, 0x4c, 0xbd, 0xa8, 0x9e, 0x0d, 0x3c, 0x15, 0x54, 0x8c, 0x3d, + 0xcb, 0x9b, 0x87, 0x3d, 0x18, 0xdd, 0x07, 0xbd, 0x2b, 0x33, 0x81, 0xbd, 0xb2, + 0x57, 0x2e, 0xbd, 0x18, 0xc5, 0x2b, 0xbd, 0x88, 0x10, 0x91, 0xbd, 0x66, 0x69, + 0x15, 0x3d, 0x98, 0x6c, 0xf7, 0x3c, 0x10, 0x05, 0x07, 0xbc, 0x44, 0x3b, 0xc6, + 0xbc, 0x30, 0x43, 0xa8, 0x3b, 0x5b, 0xd8, 0x38, 0xbd, 0x66, 0x01, 0xe8, 0xbc, + 0x36, 0xef, 0xaf, 0xbc, 0x88, 0x76, 0x24, 0x3c, 0x3a, 0x71, 0x5d, 0x3d, 0x30, + 0xa0, 0x38, 0xbc, 0x04, 0x86, 0xf5, 0xbc, 0x30, 0xdc, 0x7c, 0x3c, 0x0c, 0x37, + 0x2f, 0xbd, 0x80, 0xa4, 0x1f, 0xba, 0x2c, 0xa1, 0x2f, 0xbd, 0xb0, 0xb7, 0xa0, + 0x3c, 0x37, 0xb1, 0x14, 0xbd, 0xb6, 0x07, 0x54, 0xbd, 0xb0, 0xbf, 0xd7, 0xbc, + 0x6c, 0xc8, 0x2c, 0x3d, 0x2c, 0x09, 0x31, 0x3d, 0x04, 0x69, 0xe4, 0xbc, 0xa0, + 0x5e, 0x7a, 0xbb, 0x90, 0x52, 0xb3, 0x3c, 0x4e, 0x6b, 0x84, 0xbd, 0xcc, 0x7e, + 0x25, 0x3d, 0x30, 0x08, 0x99, 0xbb, 0x00, 0x08, 0xfc, 0x3b, 0xaa, 0xf0, 0x66, + 0x3d, 0x13, 0xa5, 0x8a, 0x3d, 0xc8, 0x1c, 0xad, 0xbc, 0xf1, 0x48, 0x82, 0x3d, + 0x7d, 0x18, 0x80, 0xbd, 0x14, 0x52, 0xa6, 0x3c, 0x10, 0x21, 0x9c, 0xbb, 0xfc, + 0xda, 0x31, 0xbc, 0x0e, 0x65, 0xd2, 0xbc, 0x74, 0x2a, 0xcd, 0xbc, 0xb6, 0xb6, + 0x64, 0x3d, 0x24, 0x32, 0x55, 0x3d, 0x8e, 0xc7, 0xbc, 0xbc, 0x94, 0x15, 0x89, + 0x3c, 0x72, 0x1e, 0x3b, 0x3d, 0xb0, 0x0e, 0x25, 0x3c, 0xf8, 0x00, 0xad, 0x3c, + 0xc1, 0xb3, 0x92, 0xbd, 0xce, 0xcf, 0x33, 0x3d, 0xe8, 0xec, 0x6a, 0x3c, 0x9e, + 0x76, 0x9c, 0xbc, 0x4e, 0x5f, 0x29, 0xbd, 0x7c, 0xa7, 0x88, 0x3c, 0x00, 0xf3, + 0xbf, 0x3c, 0x10, 0x12, 0x26, 0x3c, 0xf4, 0x7c, 0x4b, 0x3d, 0x90, 0x83, 0xec, + 0xbb, 0xb6, 0x48, 0x92, 0xbd, 0x5c, 0x63, 0x47, 0x3d, 0x3f, 0xb2, 0x71, 0xbd, + 0x60, 0x1f, 0x7e, 0xbc, 0xbc, 0xff, 0x9a, 0xbc, 0x96, 0x17, 0xb2, 0xbc, 0x78, + 0x09, 0x0a, 0x3c, 0xa5, 0xbb, 0x8d, 0x3d, 0x80, 
0x7e, 0xbd, 0x3a, 0x8c, 0x61, + 0x8f, 0xbd, 0x70, 0x44, 0x19, 0x3d, 0xde, 0x63, 0x4b, 0x3d, 0x00, 0x61, 0x0b, + 0xbb, 0x36, 0x70, 0x32, 0xbd, 0xc6, 0x8f, 0x71, 0x3d, 0xf0, 0xf7, 0xa0, 0xbc, + 0x00, 0x80, 0x01, 0xb8, 0xe4, 0xc6, 0x93, 0x3c, 0x08, 0xd4, 0x3b, 0x3c, 0x96, + 0x32, 0x40, 0x3d, 0xb8, 0x22, 0x31, 0x3d, 0x4a, 0xd9, 0x6f, 0x3d, 0x28, 0x10, + 0x2c, 0xbc, 0x94, 0x4b, 0x9c, 0xbc, 0x90, 0x38, 0x57, 0x3d, 0xa4, 0x0d, 0x81, + 0xbc, 0x90, 0xa5, 0xb6, 0x3c, 0x9d, 0xfe, 0x78, 0xbd, 0x3c, 0x24, 0x19, 0x3d, + 0xa8, 0x56, 0x0c, 0x3d, 0x6b, 0xec, 0x54, 0xbd, 0x10, 0x49, 0x94, 0xbb, 0x80, + 0x25, 0xe9, 0x3c, 0xe4, 0xb5, 0xe2, 0xbc, 0x68, 0xb2, 0x10, 0x3d, 0x6a, 0x13, + 0xe0, 0xbc, 0x3a, 0x69, 0x44, 0xbd, 0x18, 0x3f, 0xfc, 0x3c, 0x6e, 0x08, 0x60, + 0x3d, 0x5e, 0x5b, 0xa2, 0xbc, 0x7c, 0xbd, 0x81, 0xbd, 0xf0, 0xf9, 0xd6, 0x3b, + 0xfa, 0x80, 0x14, 0xbd, 0xdb, 0xb0, 0x8d, 0xbd, 0xb0, 0x41, 0xe5, 0x3b, 0xe0, + 0x03, 0xe3, 0x3c, 0xf4, 0x88, 0x07, 0xbd, 0x52, 0x89, 0xd0, 0xbc, 0x90, 0x90, + 0x10, 0x3d, 0x9c, 0xc3, 0x3e, 0x3d, 0x2f, 0x07, 0x09, 0xbd, 0x7e, 0x67, 0xf6, + 0xbc, 0xde, 0x88, 0xe1, 0xbc, 0xbe, 0x4b, 0x08, 0xbd, 0xac, 0xc1, 0x24, 0x3d, + 0x5e, 0xd5, 0x3c, 0x3d, 0x80, 0x9e, 0x01, 0xbc, 0xa6, 0xdb, 0xc7, 0xbc, 0xbb, + 0x37, 0x83, 0xbd, 0x34, 0x71, 0x50, 0x3d, 0x10, 0x46, 0x2d, 0xbd, 0x71, 0x50, + 0x67, 0xbd, 0x20, 0x2e, 0x15, 0xbb, 0xaa, 0x05, 0x74, 0x3d, 0xc1, 0xb5, 0x79, + 0xbd, 0x21, 0xaa, 0x44, 0xbd, 0xda, 0xbd, 0x0c, 0xbd, 0xb1, 0xee, 0x8c, 0x3d, + 0x54, 0x83, 0x83, 0xbd, 0x5e, 0xe5, 0x75, 0x3d, 0x52, 0x3d, 0x73, 0x3d, 0x40, + 0xf3, 0xd4, 0x3c, 0x9a, 0x1a, 0x78, 0x3d, 0x85, 0x49, 0x62, 0xbd, 0x6b, 0x57, + 0x91, 0x3d, 0x30, 0xd7, 0x3f, 0x3d, 0xed, 0x16, 0x3f, 0xbd, 0xd0, 0xf4, 0x85, + 0xbb, 0x47, 0x5e, 0x1e, 0xbd, 0x70, 0xe9, 0x87, 0x3c, 0x87, 0x5d, 0x80, 0xbd, + 0xa0, 0x7a, 0xb6, 0xbb, 0x03, 0x86, 0x84, 0xbd, 0x50, 0x4c, 0x74, 0x3c, 0x85, + 0x86, 0x80, 0x3d, 0x00, 0xe2, 0x56, 0xbb, 0x7e, 0xb0, 0x16, 0xbd, 0x10, 0xa9, + 0x80, 0xbd, 0xe0, 0x8b, 0x47, 0x3d, 0x19, 0x07, 0x68, 0xbd, 0x4e, 0xd8, 0x70, + 0x3d, 0xa8, 0x10, 0x2a, 0x3d, 0x22, 0x23, 0x96, 0xbc, 0x92, 0xe3, 0x72, 0xbd, + 0xb8, 0x0f, 0x13, 0x3d, 0x16, 0xc3, 0x53, 0x3d, 0xa4, 0x95, 0x41, 0x3d, 0x02, + 0xc3, 0x6f, 0x3d, 0x48, 0x02, 0xac, 0xbc, 0x40, 0x53, 0x6d, 0x3b, 0xf4, 0x2a, + 0x19, 0xbc, 0x10, 0x1f, 0xc2, 0xbb, 0x21, 0xb8, 0x69, 0xbd, 0x97, 0x8c, 0x8a, + 0x3d, 0x38, 0x13, 0xb4, 0x3c, 0xf1, 0x0d, 0x8d, 0x3d, 0x00, 0x69, 0x30, 0x3d, + 0x38, 0x92, 0xf9, 0x3c, 0xb5, 0xff, 0x8a, 0x3d, 0x15, 0x27, 0x91, 0x3d, 0x96, + 0xd4, 0x00, 0x3d, 0x66, 0xde, 0x1c, 0x3d, 0x7c, 0x48, 0x40, 0x3d, 0x08, 0x06, + 0xf2, 0x3c, 0x8e, 0xfe, 0x71, 0x3d, 0x90, 0xa1, 0xc6, 0xbb, 0x88, 0x57, 0x05, + 0x3c, 0x80, 0x92, 0x6d, 0x3a, 0x80, 0x99, 0xc9, 0xba, 0x0f, 0x0f, 0x33, 0xbd, + 0x76, 0xfc, 0x31, 0x3d, 0xd8, 0x9f, 0x23, 0xbd, 0x8c, 0x07, 0x07, 0xbd, 0x68, + 0x38, 0x5e, 0x3c, 0xf0, 0x39, 0xbf, 0xbc, 0x6c, 0x16, 0xfc, 0x3c, 0x94, 0xf2, + 0xb4, 0xbc, 0x20, 0x52, 0xc4, 0xbb, 0xb7, 0x3f, 0x02, 0xbd, 0x78, 0x48, 0x61, + 0xbd, 0x48, 0xad, 0x6b, 0xbd, 0xcd, 0xb1, 0x8c, 0x3d, 0x20, 0x28, 0xcd, 0x3c, + 0xb4, 0x49, 0x53, 0x3d, 0x30, 0x59, 0x06, 0x3c, 0xda, 0xea, 0x83, 0xbd, 0xf8, + 0xe2, 0x16, 0xbd, 0x96, 0xc3, 0x77, 0x3d, 0x2c, 0x90, 0xf6, 0x3c, 0x94, 0x78, + 0x4d, 0xbc, 0x75, 0x0d, 0x2f, 0xbd, 0xa2, 0x00, 0xa7, 0xbc, 0x32, 0xec, 0x7c, + 0x3d, 0x6c, 0x7a, 0x5a, 0xbc, 0x7e, 0x59, 0x58, 0x3d, 0x60, 0x65, 0x91, 0x3b, + 0x28, 0x8b, 0x75, 0xbd, 0x22, 0xa7, 0x7b, 0x3d, 0xc4, 0xdd, 0x39, 0x3d, 0xe4, + 0x54, 0xa3, 0xbc, 0xb6, 0x39, 0x30, 0x3d, 0x38, 0x91, 0x35, 0x3c, 0xd0, 0xb9, + 
0x10, 0x3c, 0x4c, 0x8a, 0xab, 0x3c, 0x04, 0x8d, 0x0e, 0xbd, 0x20, 0xc2, 0xcb, + 0x3b, 0x32, 0xbe, 0x58, 0xbd, 0xec, 0x4e, 0x03, 0x3d, 0xf0, 0x59, 0xee, 0x3c, + 0x18, 0x48, 0x0d, 0xbc, 0xa0, 0xfd, 0xe6, 0xbb, 0x8c, 0x9c, 0x4b, 0x3d, 0xa8, + 0xe8, 0x13, 0x3c, 0x14, 0xb9, 0x4e, 0xbd, 0xe6, 0xbf, 0x03, 0x3d, 0xf0, 0x7a, + 0xdd, 0xbc, 0xc8, 0x1b, 0x91, 0xbc, 0x9b, 0x2a, 0x24, 0xbd, 0x98, 0x93, 0x01, + 0xbc, 0x1a, 0x0c, 0x34, 0x3d, 0xfe, 0xfa, 0xa3, 0xbc, 0x7c, 0x82, 0xbd, 0x3c, + 0x70, 0x96, 0xe8, 0x3c, 0xa6, 0x08, 0x67, 0x3d, 0x48, 0x11, 0x68, 0xbc, 0x90, + 0xfb, 0x58, 0xbd, 0x91, 0x9e, 0x8b, 0xbd, 0x4b, 0xd8, 0x87, 0xbd, 0x6a, 0x90, + 0x63, 0x3d, 0x36, 0xa5, 0x20, 0x3d, 0x30, 0x61, 0x3d, 0x3d, 0x56, 0x99, 0x11, + 0xbd, 0xce, 0xff, 0x70, 0x3d, 0xd5, 0x52, 0x3d, 0xbd, 0x44, 0x1e, 0x92, 0x3c, + 0x6e, 0xb4, 0x44, 0xbd, 0x42, 0xeb, 0xec, 0xbc, 0xa2, 0xea, 0x85, 0xbc, 0x40, + 0x48, 0x01, 0x3b, 0x52, 0xcd, 0x75, 0x3d, 0xe9, 0xa7, 0x08, 0xbd, 0x61, 0x2e, + 0x0c, 0xbd, 0x06, 0xda, 0x24, 0x3d, 0xce, 0xfc, 0xf7, 0xbc, 0x62, 0xab, 0x7d, + 0x3d, 0x2f, 0x02, 0x89, 0xbd, 0xea, 0x05, 0x48, 0xbd, 0xea, 0x7c, 0x7b, 0xbd, + 0x80, 0x05, 0x8c, 0xba, 0xba, 0x77, 0x3d, 0xbd, 0xfa, 0xee, 0x34, 0xbd, 0xd2, + 0x24, 0x28, 0x3d, 0x30, 0xb2, 0x40, 0xbd, 0x52, 0x8b, 0x18, 0x3d, 0xe3, 0xfc, + 0x8b, 0x3d, 0x58, 0x86, 0x65, 0xbc, 0x64, 0x1e, 0xa8, 0xbc, 0xba, 0xc7, 0x75, + 0x3d, 0xdb, 0xb4, 0x80, 0x3d, 0x07, 0x16, 0x67, 0xbd, 0x84, 0x95, 0x6d, 0xbc, + 0x11, 0xb3, 0x1e, 0xbd, 0x40, 0x9b, 0x56, 0xbb, 0x7e, 0x66, 0x57, 0x3d, 0xca, + 0x1c, 0x5e, 0x3d, 0x20, 0xef, 0xe5, 0x3b, 0xd3, 0x0f, 0x2e, 0xbd, 0x8a, 0xdf, + 0x81, 0xbd, 0x58, 0xc9, 0x0f, 0x3d, 0xbc, 0x54, 0x63, 0xbd, 0x60, 0x24, 0x85, + 0xbd, 0x5a, 0xa5, 0xda, 0xbc, 0x12, 0x87, 0x01, 0x3d, 0xf6, 0xc0, 0x96, 0xbc, + 0x78, 0x46, 0x1d, 0x3d, 0xb6, 0x90, 0x62, 0xbd, 0xc0, 0x43, 0x94, 0x3b, 0xf0, + 0xed, 0xce, 0xbb, 0xb8, 0x25, 0x14, 0xbc, 0xf4, 0x5c, 0x20, 0xbc, 0xd8, 0x5b, + 0x1c, 0x3d, 0x44, 0xcb, 0x4c, 0xbc, 0x2e, 0xf6, 0x36, 0x3d, 0x94, 0xa7, 0xe6, + 0xbc, 0xd8, 0xac, 0x4f, 0x3c, 0x06, 0x78, 0x11, 0x3d, 0xe6, 0x53, 0x14, 0x3d, + 0x3b, 0x4b, 0x25, 0xbd, 0x03, 0xb6, 0x88, 0xbd, 0xd0, 0xc2, 0x2b, 0x3c, 0xc5, + 0xf9, 0x12, 0xbd, 0x78, 0x6f, 0xf5, 0x3c, 0xc6, 0xc0, 0x63, 0x3d, 0x60, 0xd4, + 0xa9, 0x3c, 0x1b, 0x87, 0x92, 0x3d, 0x70, 0x70, 0x35, 0xbd, 0xb8, 0xaa, 0x17, + 0x3d, 0xec, 0x13, 0xde, 0xbc, 0x04, 0xc8, 0x8c, 0x3c, 0x3c, 0xcd, 0xf4, 0x3c, + 0x66, 0x81, 0x4b, 0x3d, 0x3e, 0x59, 0x8b, 0xbd, 0xb8, 0xab, 0x04, 0x3c, 0xdc, + 0x9a, 0xd8, 0x3c, 0x00, 0x22, 0x4d, 0x3d, 0x08, 0x10, 0x93, 0x3c, 0x64, 0x64, + 0x7e, 0xbc, 0x32, 0xd1, 0x00, 0x3d, 0xfc, 0x6a, 0x2a, 0xbd, 0x04, 0x05, 0xa8, + 0x3c, 0x4c, 0xb2, 0xc3, 0x3c, 0x57, 0x68, 0x0d, 0xbd, 0x18, 0x0f, 0x6e, 0xbd, + 0x31, 0x3c, 0x0d, 0xbd, 0xa0, 0xef, 0xe0, 0xbb, 0x5a, 0xa3, 0xf2, 0xbc, 0xb3, + 0xcd, 0x88, 0x3d, 0x0c, 0x86, 0x6e, 0xbc, 0x78, 0x6a, 0x14, 0xbc, 0x51, 0x9b, + 0x2e, 0xbd, 0x45, 0x0b, 0x22, 0xbd, 0xf0, 0x38, 0x9e, 0x3c, 0x53, 0x6c, 0x87, + 0x3d, 0x00, 0x20, 0x2d, 0x3a, 0x40, 0xea, 0xd2, 0xba, 0xcd, 0x35, 0x88, 0xbd, + 0xb2, 0xad, 0x62, 0x3d, 0xf6, 0x83, 0xb9, 0xbc, 0x92, 0xb4, 0x4b, 0x3d, 0xe6, + 0x0e, 0x86, 0xbc, 0x55, 0x4e, 0x85, 0x3d, 0x7e, 0x89, 0x05, 0x3d, 0xa1, 0xb1, + 0x83, 0x3d, 0x7c, 0x7c, 0xf5, 0x3c, 0xdb, 0x2e, 0x8c, 0xbd, 0x98, 0x94, 0x5c, + 0xbd, 0x0c, 0xfd, 0xb9, 0xbc, 0x40, 0x7e, 0xa5, 0x3c, 0xc0, 0x1e, 0xd6, 0x3a, + 0x88, 0x80, 0x1d, 0x3c, 0x48, 0x6f, 0xfe, 0x3c, 0x2a, 0x7a, 0xde, 0xbc, 0x9c, + 0x7d, 0x1a, 0xbd, 0x70, 0xd8, 0x1b, 0x3c, 0xa8, 0x27, 0x75, 0xbd, 0x92, 0x9a, + 0x53, 0x3d, 0xb3, 0x0a, 0x8b, 
0x3d, 0xd0, 0xe2, 0x10, 0x3c, 0xb0, 0x82, 0x9d, + 0x3b, 0x38, 0x23, 0x10, 0x3c, 0xc0, 0xfb, 0xab, 0xbb, 0x7a, 0xff, 0x77, 0xbd, + 0x3f, 0x50, 0x91, 0x3d, 0x30, 0x33, 0x01, 0x3c, 0x48, 0x28, 0x43, 0x3d, 0xd4, + 0x59, 0xac, 0xbc, 0xa3, 0xa9, 0x0d, 0xbd, 0x1c, 0x90, 0x52, 0xbd, 0x40, 0xa7, + 0x57, 0x3c, 0x94, 0x79, 0x28, 0xbd, 0xf0, 0x27, 0x9b, 0x3c, 0x02, 0x37, 0x7d, + 0x3d, 0x14, 0x5b, 0x94, 0xbc, 0xde, 0x3f, 0x2c, 0xbd, 0x06, 0xe5, 0x2b, 0xbd, + 0x58, 0x3a, 0x01, 0xbd, 0xda, 0x88, 0xa5, 0xbc, 0x27, 0x42, 0x08, 0xbd, 0x30, + 0x39, 0xd1, 0x3b, 0xdc, 0xf2, 0xb6, 0xbc, 0x78, 0xe4, 0xe9, 0x3c, 0x56, 0xdd, + 0x8c, 0xbc, 0x20, 0xbf, 0x17, 0x3d, 0x8a, 0x7a, 0x5e, 0xbd, 0x6a, 0x3e, 0xac, + 0xbc, 0xb2, 0x0d, 0x7b, 0x3d, 0x02, 0x11, 0xae, 0xbc, 0x8c, 0x5a, 0x14, 0x3d, + 0xba, 0x7e, 0xa6, 0xbc, 0xdc, 0x76, 0x0c, 0x3d, 0xfc, 0x09, 0x5a, 0x3d, 0x4e, + 0x8d, 0x8b, 0xbd, 0xd4, 0x0c, 0xa3, 0xbc, 0x7f, 0x0e, 0x8f, 0xbd, 0x20, 0x38, + 0x62, 0xbb, 0xe0, 0x57, 0xf8, 0xbb, 0x00, 0x7b, 0x12, 0xba, 0x5c, 0x6f, 0xbe, + 0x3c, 0x40, 0xc3, 0x2a, 0x3b, 0xf4, 0xe3, 0xb4, 0x3c, 0xda, 0x17, 0x4d, 0x3d, + 0xd0, 0xca, 0x1e, 0x3d, 0x80, 0x09, 0xaa, 0x3c, 0xce, 0x89, 0x5d, 0x3d, 0x24, + 0x5d, 0x0f, 0x3d, 0xa0, 0x6d, 0x44, 0x3c, 0x0e, 0x09, 0x92, 0xbc, 0x00, 0xde, + 0x57, 0x3c, 0x91, 0x01, 0x73, 0xbd, 0x5e, 0x90, 0x1a, 0x3d, 0x4c, 0xf8, 0xd6, + 0x3c, 0xf8, 0x9a, 0x91, 0xbd, 0xe2, 0x1c, 0x5d, 0xbd, 0x80, 0xde, 0x76, 0x3b, + 0xd6, 0x26, 0x2c, 0x3d, 0x00, 0xd0, 0x39, 0xbc, 0xfc, 0x5d, 0xee, 0xbc, 0x7a, + 0xdc, 0x83, 0xbc, 0x3b, 0x14, 0x81, 0x3d, 0x30, 0x85, 0xf3, 0x3c, 0x0e, 0x0d, + 0x85, 0xbd, 0x86, 0x9f, 0xcf, 0xbc, 0x32, 0xf9, 0xfa, 0xbc, 0xdc, 0x92, 0x8e, + 0xbd, 0xf0, 0xf2, 0x45, 0x3c, 0xb2, 0xcd, 0x31, 0xbd, 0x40, 0x13, 0xcc, 0xba, + 0x81, 0x90, 0x0b, 0xbd, 0xf5, 0xd9, 0x7d, 0xbd, 0x74, 0xf2, 0xc1, 0xbc, 0x8e, + 0xb9, 0x2b, 0x3d, 0xb0, 0xef, 0x7e, 0xbd, 0x00, 0x57, 0x81, 0x3c, 0xc2, 0x40, + 0x76, 0xbd, 0xaf, 0xe7, 0x08, 0xbd, 0x02, 0x79, 0x26, 0x3d, 0x77, 0x1f, 0x2f, + 0xbd, 0x20, 0x66, 0x1c, 0x3c, 0x28, 0x56, 0xc2, 0x3c, 0xe8, 0x78, 0x0e, 0x3c, + 0xb8, 0x4e, 0x2c, 0xbc, 0xd0, 0x97, 0x26, 0xbc, 0x5e, 0x8f, 0x3b, 0x3d, 0x30, + 0xff, 0x28, 0x3c, 0x91, 0x25, 0x92, 0x3d, 0x20, 0xd1, 0x20, 0xbc, 0x24, 0xb8, + 0x23, 0xbd, 0xfc, 0xca, 0x55, 0xbc, 0xf8, 0x46, 0xf0, 0x3c, 0xf7, 0x15, 0x88, + 0x3d, 0x96, 0x4a, 0x78, 0x3d, 0x40, 0xdb, 0xce, 0xba, 0x50, 0x38, 0xed, 0x3b, + 0x3a, 0xfd, 0x00, 0x3d, 0x40, 0x1d, 0x3d, 0xbb, 0x8a, 0xd6, 0xae, 0xbc, 0x10, + 0x55, 0x7a, 0xbd, 0x91, 0x66, 0x59, 0x3d, 0x40, 0x74, 0xd5, 0xbc, 0x76, 0x92, + 0xb9, 0xbc, 0xa0, 0x5c, 0x4d, 0x3d, 0x59, 0xd0, 0x4a, 0x3d, 0x65, 0xa7, 0x5e, + 0xbd, 0x45, 0x6b, 0xea, 0x3d, 0x2b, 0x08, 0xdf, 0x3c, 0xb3, 0x37, 0x6e, 0x3d, + 0xfa, 0xad, 0xe0, 0xbc, 0xc3, 0xd2, 0x01, 0xbe, 0x24, 0x15, 0x90, 0x3d, 0x42, + 0xd3, 0xc4, 0x3c, 0x2b, 0xd6, 0x00, 0x3c, 0x9b, 0xf7, 0xcc, 0x3d, 0x7c, 0xc1, + 0x37, 0x3d, 0x4c, 0x98, 0xb6, 0x3d, 0x65, 0xac, 0x04, 0x3d, 0xbe, 0x0d, 0xf6, + 0x3c, 0x0a, 0x47, 0xb9, 0xbd, 0xa0, 0x2d, 0x4f, 0x3b, 0x44, 0x5d, 0xd1, 0xbc, + 0x3c, 0x8b, 0x82, 0x3d, 0xf8, 0xf9, 0x02, 0xbd, 0x21, 0xa7, 0x39, 0xbd, 0xa2, + 0x22, 0x82, 0x3d, 0xda, 0x8a, 0xb9, 0xbd, 0x6c, 0x42, 0x95, 0xbc, 0x98, 0x7b, + 0x9a, 0x3d, 0x1d, 0x34, 0x40, 0xbd, 0x68, 0xfa, 0x6f, 0x3c, 0xd6, 0x23, 0xa0, + 0x3d, 0x5a, 0xe0, 0x71, 0x3d, 0xda, 0xb5, 0x20, 0xbd, 0x0d, 0x43, 0xe0, 0x3c, + 0x77, 0xeb, 0x0c, 0x3d, 0x97, 0x10, 0xf9, 0x3c, 0xdb, 0xd9, 0xe6, 0x3a, 0xcb, + 0xff, 0x63, 0xbd, 0x75, 0x4f, 0xbf, 0xb9, 0x69, 0x4a, 0x20, 0xbd, 0xa2, 0xbf, + 0x56, 0x3d, 0xcc, 0xfe, 0x0e, 0xbe, 0xbe, 0xe9, 0x2e, 0x3d, 
0x32, 0x25, 0x5d, + 0xbd, 0x77, 0x8a, 0x43, 0xbd, 0xc8, 0x8d, 0x4d, 0x3d, 0xd7, 0x87, 0xe4, 0x3c, + 0xc4, 0xf1, 0x50, 0x3d, 0x1a, 0xb6, 0x1a, 0x3d, 0x70, 0x13, 0x0f, 0x3c, 0xeb, + 0x1e, 0x6f, 0xbc, 0x4a, 0x22, 0x12, 0x3d, 0x7b, 0xe9, 0xcd, 0x3c, 0x1a, 0x2d, + 0x93, 0xbd, 0x21, 0xcd, 0x4b, 0xbd, 0x52, 0x94, 0x21, 0x3d, 0x1c, 0xb7, 0x0e, + 0xbd, 0x15, 0xea, 0x0c, 0xbd, 0x55, 0x60, 0xb0, 0x3b, 0xb4, 0x1d, 0xd0, 0x3d, + 0x43, 0xa2, 0x7b, 0xbd, 0xc9, 0x7b, 0x12, 0xbd, 0x64, 0x4f, 0x87, 0xbd, 0xea, + 0x0f, 0x8c, 0x3d, 0x07, 0x3a, 0xbb, 0xbd, 0xa8, 0xb6, 0x62, 0xbd, 0x74, 0xe8, + 0x84, 0x3d, 0xc2, 0x72, 0x6a, 0x3d, 0x58, 0xba, 0x67, 0xbb, 0x31, 0xf4, 0xb2, + 0x3d, 0x04, 0x0e, 0x92, 0xbd, 0xd4, 0x9f, 0x7a, 0x3d, 0x81, 0xd4, 0x89, 0xbc, + 0xe5, 0xe2, 0xe7, 0xbd, 0xb2, 0xd7, 0x51, 0xbd, 0x64, 0x57, 0x52, 0xbd, 0xb4, + 0x3f, 0x73, 0xbc, 0x22, 0x15, 0x4e, 0x3d, 0xe9, 0xf0, 0x4c, 0x3d, 0x05, 0x9b, + 0xfa, 0xbc, 0x28, 0xc4, 0xa1, 0x3d, 0xd2, 0x16, 0x51, 0x3d, 0xa0, 0x9f, 0x8f, + 0xbb, 0xc9, 0x02, 0x82, 0x3d, 0x13, 0x45, 0x84, 0x3c, 0x0a, 0x79, 0xc9, 0x3c, + 0xb9, 0x89, 0x19, 0xbd, 0x57, 0x1f, 0x86, 0xbb, 0xaa, 0xfa, 0xa0, 0x3d, 0x27, + 0x94, 0x00, 0xbd, 0x95, 0xf0, 0x86, 0xbd, 0x70, 0x37, 0x81, 0xbc, 0x0a, 0x32, + 0x09, 0x3d, 0x18, 0x6d, 0x18, 0xbd, 0x16, 0x40, 0x7e, 0x3d, 0x69, 0xfb, 0xaa, + 0xbc, 0x31, 0x93, 0x17, 0xbd, 0x3e, 0xc6, 0x59, 0xbc, 0x17, 0xc8, 0xe7, 0x3c, + 0x9e, 0x08, 0xc3, 0x3c, 0x79, 0x41, 0x12, 0x3d, 0xc8, 0xc2, 0x37, 0xbc, 0x3f, + 0xc1, 0x8f, 0xbd, 0xd9, 0x75, 0x94, 0xbd, 0x8c, 0xc3, 0x97, 0x3d, 0x36, 0xad, + 0x1b, 0xbe, 0x28, 0x9f, 0x80, 0xbc, 0x79, 0x5c, 0x84, 0xbc, 0x20, 0x29, 0x6b, + 0x3d, 0xe1, 0xad, 0xd1, 0xbb, 0xa4, 0x2c, 0x08, 0x3d, 0x6e, 0x13, 0x52, 0xbd, + 0x4c, 0x51, 0x60, 0x3d, 0xc0, 0xae, 0x92, 0x3d, 0xd3, 0x90, 0x35, 0xbd, 0x04, + 0x9e, 0x5f, 0xbd, 0x8c, 0xad, 0xee, 0xbc, 0x6f, 0x0b, 0x3e, 0x3d, 0xfb, 0x15, + 0x1c, 0x3c, 0x2f, 0x67, 0x98, 0xbb, 0x90, 0x7f, 0x9f, 0x3d, 0x21, 0x97, 0x2a, + 0xbc, 0xa0, 0x67, 0x9d, 0xbd, 0x5d, 0x64, 0x18, 0x3d, 0xaf, 0x36, 0xd9, 0x3b, + 0xe0, 0x06, 0xdc, 0x3c, 0xd0, 0x51, 0x8e, 0x3c, 0x48, 0x40, 0x56, 0x3d, 0xac, + 0x63, 0xb2, 0xbc, 0x63, 0x31, 0xf6, 0xbc, 0x48, 0x65, 0x07, 0x3d, 0x9c, 0x92, + 0x8d, 0xbd, 0x5c, 0xbb, 0x96, 0xbc, 0xa7, 0xdc, 0x07, 0x3c, 0xc4, 0xe5, 0xd8, + 0x3c, 0xb9, 0xea, 0x11, 0x3c, 0x10, 0x39, 0x13, 0x3a, 0x18, 0x34, 0x28, 0xbd, + 0xf4, 0x41, 0x6c, 0x3c, 0x25, 0x46, 0x12, 0xbd, 0xf9, 0x23, 0x3f, 0x3d, 0xfc, + 0x1d, 0xd9, 0x3d, 0x68, 0xc6, 0xa9, 0xbc, 0x97, 0x32, 0x1c, 0xbd, 0x3f, 0x51, + 0xbf, 0x3d, 0x7e, 0xd5, 0x3c, 0x3c, 0xda, 0x77, 0xcb, 0xbd, 0x10, 0x52, 0xb6, + 0xbc, 0xd8, 0xbd, 0x9b, 0x3d, 0x43, 0xd7, 0x7c, 0x3d, 0x4c, 0x78, 0xb2, 0xbc, + 0x7c, 0xda, 0xc9, 0xbc, 0x31, 0x8c, 0x4d, 0x3d, 0x82, 0x0e, 0xcb, 0xbc, 0xed, + 0xf9, 0xe8, 0x3b, 0xa8, 0x08, 0x4b, 0x3d, 0x38, 0x3c, 0x4a, 0xbd, 0x1d, 0xd9, + 0x0f, 0xbd, 0xd6, 0x17, 0x86, 0x3b, 0xa1, 0x90, 0xab, 0x3d, 0x91, 0xcc, 0x8f, + 0xbd, 0x07, 0xfa, 0x39, 0x3d, 0x11, 0x95, 0x03, 0x3d, 0x29, 0x0f, 0x31, 0xbc, + 0x87, 0xab, 0x3c, 0x3d, 0xc8, 0xe5, 0x5c, 0xb9, 0x44, 0x79, 0x44, 0xbd, 0x6d, + 0x4c, 0x90, 0xbc, 0x86, 0x90, 0xa5, 0xbc, 0x47, 0x61, 0x39, 0xbe, 0xf9, 0xeb, + 0x17, 0x3b, 0xea, 0x28, 0xe4, 0xbc, 0x79, 0x88, 0x12, 0xbc, 0x7a, 0x61, 0xdd, + 0x3d, 0x7f, 0xfe, 0x49, 0x3d, 0x78, 0x92, 0x5c, 0xbd, 0x6d, 0xe2, 0xa4, 0x3b, + 0x68, 0x57, 0x27, 0xbd, 0x61, 0x22, 0xaf, 0x3c, 0x02, 0x98, 0x6e, 0x3d, 0x74, + 0x02, 0xbb, 0x3d, 0x33, 0x4d, 0x24, 0xbd, 0x3e, 0x93, 0x81, 0xbc, 0xb2, 0x1e, + 0x1f, 0x3d, 0xb5, 0x79, 0x64, 0x3b, 0xbc, 0xfb, 0xf6, 0xbc, 0x61, 0x0c, 0xcd, + 0xbd, 0xc1, 
0x64, 0x08, 0x3c, 0x6f, 0x3d, 0x27, 0xbd, 0x10, 0xd3, 0xdb, 0xbc, + 0xe4, 0xb6, 0xd2, 0x3b, 0x51, 0x12, 0x81, 0x3d, 0x37, 0xee, 0x87, 0xbc, 0xdd, + 0x80, 0xaf, 0x39, 0x90, 0x85, 0xaf, 0x3d, 0x80, 0x5f, 0x12, 0xbc, 0xcb, 0x3c, + 0x63, 0xbd, 0x81, 0x3c, 0x85, 0x3d, 0x10, 0xe7, 0x54, 0xbc, 0xa6, 0xb7, 0x98, + 0xbc, 0x07, 0x98, 0x2f, 0x3d, 0x70, 0x80, 0x28, 0xbe, 0x7a, 0xe5, 0x77, 0x3d, + 0x0b, 0x81, 0x51, 0xbd, 0xb1, 0xdf, 0x35, 0xbc, 0xd2, 0xf7, 0x0b, 0x3d, 0xbe, + 0x9e, 0x02, 0xbd, 0xa2, 0xc0, 0x03, 0x3d, 0x97, 0xf5, 0x2f, 0xbb, 0xc6, 0x6b, + 0x13, 0xbd, 0x81, 0xbc, 0xe8, 0xbb, 0x2a, 0x57, 0x63, 0x3d, 0x49, 0x18, 0x51, + 0xbc, 0xd7, 0x9e, 0x44, 0xbd, 0x51, 0x59, 0xb8, 0x3b, 0x5b, 0x9b, 0x86, 0x3c, + 0x1d, 0x63, 0x8a, 0x3d, 0x15, 0xc7, 0x94, 0xbd, 0x43, 0xc8, 0x05, 0xbd, 0x7b, + 0xc8, 0x26, 0x3d, 0xdc, 0x03, 0xbd, 0x3c, 0xa0, 0x16, 0x2b, 0xbd, 0x33, 0x15, + 0xfa, 0x3c, 0xfe, 0xce, 0x91, 0xbc, 0x0f, 0x1e, 0xe3, 0x3b, 0x01, 0x19, 0x2b, + 0xbd, 0x26, 0xff, 0x53, 0x3c, 0x4f, 0x22, 0x91, 0xbb, 0xf6, 0x4f, 0x84, 0xbd, + 0xc5, 0xf6, 0x8a, 0x3d, 0x76, 0xcf, 0x90, 0xbd, 0x4d, 0x0e, 0xb7, 0x3d, 0x90, + 0x1f, 0xd0, 0xbc, 0xd8, 0xa6, 0x7c, 0xbd, 0x39, 0xa0, 0x70, 0x3c, 0x33, 0x14, + 0x91, 0xbd, 0xa4, 0x66, 0x12, 0xbb, 0xfd, 0x3b, 0x4e, 0x3d, 0x87, 0x72, 0x0c, + 0x3d, 0xa1, 0x1b, 0x7b, 0xbc, 0xe0, 0x0f, 0xb5, 0xbc, 0x74, 0x49, 0x42, 0xbd, + 0x61, 0x8f, 0x34, 0x3d, 0x40, 0x4a, 0xb0, 0xbc, 0x19, 0xf3, 0x14, 0x3d, 0x5c, + 0xd5, 0x8a, 0x3d, 0x4e, 0xd1, 0x54, 0x3d, 0xd8, 0x0b, 0x0d, 0x3d, 0x04, 0x61, + 0x85, 0x3d, 0x7e, 0x9e, 0x33, 0x3d, 0xd7, 0x75, 0xcb, 0x3b, 0x71, 0x7a, 0x89, + 0xbb, 0xb5, 0x56, 0x62, 0xbd, 0x00, 0xe5, 0x87, 0xbc, 0x84, 0x92, 0xca, 0xbc, + 0xf4, 0x15, 0xbb, 0xbc, 0xe7, 0xae, 0xc5, 0x3a, 0x8a, 0x96, 0x98, 0x3c, 0x55, + 0xb6, 0x9a, 0xbc, 0x59, 0x6f, 0x2c, 0x3d, 0x5b, 0x3b, 0x14, 0x3c, 0xd7, 0xb4, + 0xa6, 0x3b, 0x3f, 0x09, 0x21, 0x3d, 0x64, 0xfc, 0x54, 0x3c, 0x03, 0xd5, 0xf4, + 0xbc, 0x06, 0x74, 0xb6, 0xbd, 0xd5, 0x70, 0x0b, 0xbd, 0xa6, 0xf8, 0x4b, 0x3c, + 0xea, 0x46, 0x32, 0xbd, 0xb4, 0x06, 0x3b, 0x3c, 0xc2, 0xa8, 0x0d, 0xbb, 0x12, + 0x60, 0x6f, 0x3c, 0x20, 0xca, 0x10, 0x3c, 0x05, 0xcc, 0xa6, 0xbc, 0x7a, 0xdd, + 0xdf, 0xbb, 0xcc, 0x65, 0x9e, 0x3c, 0x02, 0x81, 0xe3, 0x3c, 0x58, 0x15, 0x90, + 0x3d, 0x80, 0x4a, 0xb2, 0xbd, 0xd3, 0x92, 0x8d, 0x3d, 0xc8, 0x03, 0xd9, 0xbc, + 0xc9, 0xce, 0x49, 0xbd, 0x57, 0xb1, 0x87, 0xbc, 0xf8, 0xc8, 0xb9, 0x3d, 0xb5, + 0x6a, 0x02, 0xbd, 0x60, 0xe3, 0x24, 0x3d, 0xb3, 0xdd, 0x4d, 0x3d, 0x87, 0x6d, + 0x0e, 0xbd, 0xea, 0x2d, 0x67, 0xbd, 0x62, 0x3b, 0xa9, 0xbc, 0xd1, 0x23, 0x79, + 0x3d, 0x27, 0x90, 0x1a, 0x3d, 0xfa, 0xf4, 0xa3, 0x3c, 0x88, 0xf8, 0x76, 0xbd, + 0x48, 0x27, 0x4e, 0xbd, 0xad, 0xe7, 0x6d, 0x3c, 0xbd, 0x3f, 0xba, 0x3d, 0x6a, + 0x30, 0xb8, 0xbd, 0x2e, 0x5c, 0xc7, 0xbb, 0x76, 0x8f, 0x85, 0xbc, 0x9d, 0x0f, + 0x48, 0x3d, 0xae, 0x8b, 0xa4, 0x3d, 0x72, 0xca, 0x36, 0x3d, 0xcd, 0xab, 0xad, + 0xbc, 0xf4, 0x68, 0x11, 0xbd, 0xe4, 0xf0, 0x20, 0x39, 0x85, 0x8d, 0x52, 0xbd, + 0x73, 0x80, 0x89, 0x3d, 0x3e, 0x97, 0x11, 0xbd, 0x44, 0xe7, 0x13, 0x3d, 0x25, + 0xc3, 0x68, 0x3d, 0x4f, 0x88, 0x1c, 0x3d, 0x51, 0x5f, 0x86, 0xbc, 0xce, 0x97, + 0xfb, 0xbc, 0x0e, 0x5c, 0x11, 0xbd, 0x00, 0x0f, 0x05, 0x3d, 0x8c, 0x5a, 0xe2, + 0x3c, 0xdb, 0x30, 0x8c, 0x3d, 0x69, 0xac, 0xd6, 0x3c, 0xb6, 0x26, 0x22, 0x3d, + 0x11, 0x74, 0x72, 0xbd, 0x85, 0xc5, 0x4e, 0x3b, 0x9c, 0x72, 0x9e, 0x3d, 0xa6, + 0x49, 0x25, 0xbd, 0x9e, 0x77, 0x23, 0x3c, 0x01, 0xbf, 0x35, 0xbc, 0xf9, 0x0a, + 0x06, 0xbd, 0x66, 0xc8, 0x70, 0xbd, 0xb9, 0x54, 0x80, 0x3d, 0x70, 0x83, 0xd1, + 0xbc, 0x7b, 0x7a, 0xd5, 0xbc, 0x72, 0x5e, 
0x1e, 0xbd, 0x7d, 0xb0, 0x24, 0x3d, + 0x88, 0x95, 0x3b, 0x3d, 0xb9, 0xc0, 0x4f, 0xbc, 0xf6, 0xf0, 0xcc, 0x3c, 0x6e, + 0x8d, 0x20, 0x3c, 0x0e, 0xe0, 0x8f, 0xbd, 0xfe, 0xd6, 0x2f, 0xbe, 0x40, 0x5e, + 0x05, 0x3c, 0x43, 0x3c, 0x1f, 0x3d, 0x2b, 0xfe, 0x63, 0xbd, 0xac, 0xfc, 0x78, + 0x3d, 0x89, 0xc7, 0x7b, 0xbd, 0xf8, 0x57, 0x38, 0xbd, 0x27, 0xf8, 0x9f, 0x3c, + 0xfe, 0xbe, 0x93, 0xbc, 0xa7, 0x0b, 0x52, 0xbc, 0xf9, 0xc1, 0xae, 0x3c, 0x84, + 0xf4, 0x6a, 0xbc, 0x3c, 0xcf, 0xf6, 0xba, 0x16, 0x08, 0x95, 0xbc, 0xcf, 0xf0, + 0x57, 0xbd, 0x5e, 0x93, 0x98, 0xbd, 0x84, 0x6a, 0xb4, 0x3d, 0xf6, 0x01, 0xe7, + 0xbc, 0x52, 0x9a, 0x85, 0xbc, 0x25, 0x22, 0x99, 0x3d, 0x00, 0xa0, 0x87, 0xbb, + 0xf8, 0xb5, 0x0e, 0xbc, 0xcd, 0xd6, 0x3d, 0x3d, 0x01, 0x80, 0x2d, 0xbe, 0xf5, + 0xcb, 0x94, 0x3d, 0x65, 0x93, 0x7f, 0xbc, 0x90, 0x42, 0x98, 0x3c, 0x1c, 0x10, + 0x13, 0x3d, 0xed, 0xb4, 0x8e, 0x3d, 0xdb, 0xd9, 0x01, 0xbd, 0x18, 0xe6, 0x8b, + 0x3c, 0x64, 0x69, 0x60, 0x3b, 0x63, 0x00, 0x1c, 0xbd, 0xe4, 0x57, 0x43, 0x3d, + 0xac, 0x16, 0xdc, 0x3d, 0x3d, 0x41, 0x3d, 0xbd, 0x18, 0xcb, 0x34, 0xbd, 0x28, + 0x93, 0x06, 0x3b, 0xf2, 0x17, 0x02, 0xbd, 0x2d, 0x29, 0x07, 0xbd, 0xde, 0xd1, + 0x88, 0xbc, 0xd8, 0x1e, 0x86, 0x3d, 0xda, 0xd2, 0xe3, 0xbb, 0xb6, 0xd8, 0x66, + 0xbd, 0xe9, 0xbd, 0x91, 0x3d, 0xd2, 0xf8, 0xa1, 0x3d, 0xce, 0x41, 0x1f, 0x3d, + 0x33, 0x84, 0xfa, 0xbc, 0xa7, 0x81, 0x8f, 0x3c, 0xe2, 0xf0, 0xda, 0xbc, 0x8d, + 0x67, 0x2a, 0x3d, 0xee, 0x5c, 0xef, 0x3d, 0x00, 0xf6, 0x3c, 0xbb, 0xcd, 0xa3, + 0x70, 0x3d, 0x3a, 0x58, 0x89, 0x3d, 0x03, 0xe3, 0x15, 0xbe, 0xfc, 0x75, 0x10, + 0x3c, 0xcc, 0xc4, 0x23, 0xbc, 0xd8, 0x48, 0x1f, 0x3c, 0xb2, 0x7c, 0xa1, 0x3a, + 0x7f, 0x0b, 0xda, 0x3d, 0x0d, 0xd0, 0x03, 0x3d, 0xf3, 0xca, 0xd9, 0x3b, 0x72, + 0x97, 0x1a, 0x3c, 0x5c, 0x19, 0xfa, 0xbd, 0xaa, 0x5d, 0x12, 0x3d, 0x75, 0xda, + 0x58, 0x3d, 0xec, 0x05, 0xb1, 0x3c, 0x6a, 0x21, 0xd9, 0xbc, 0x1d, 0x2c, 0x8c, + 0x3c, 0xfa, 0x2f, 0x1e, 0xbd, 0x93, 0x81, 0x98, 0xba, 0x42, 0x27, 0x62, 0xbd, + 0x1a, 0xe3, 0xa5, 0x3d, 0x17, 0x24, 0x18, 0xbc, 0x73, 0x8a, 0x24, 0xbd, 0xea, + 0x88, 0x92, 0xbc, 0x9d, 0x8d, 0xf7, 0xbc, 0xb4, 0xa6, 0xc8, 0xbd, 0xa0, 0xdd, + 0x8e, 0xbd, 0x4c, 0x81, 0x72, 0x3d, 0x59, 0x67, 0x48, 0xbd, 0x23, 0x21, 0xb3, + 0x3c, 0x6a, 0xc5, 0x43, 0x3d, 0x13, 0x50, 0x85, 0x3d, 0x0a, 0xd5, 0xb9, 0x3c, + 0xf3, 0xe6, 0x2b, 0xbd, 0x32, 0x6c, 0xe6, 0xbc, 0x11, 0x7c, 0x05, 0x3d, 0x99, + 0xeb, 0x48, 0xbc, 0x7d, 0x87, 0x35, 0xbd, 0x8b, 0x42, 0x5f, 0x3d, 0xae, 0x56, + 0x10, 0x3d, 0x02, 0x1e, 0x96, 0x3d, 0xf7, 0x64, 0xab, 0x3d, 0x66, 0xc3, 0xa2, + 0x3c, 0xe6, 0x36, 0xd8, 0xbc, 0x8c, 0xaa, 0x29, 0x3d, 0x52, 0x0b, 0x8b, 0xbc, + 0xce, 0x93, 0xef, 0xbc, 0xd9, 0x9b, 0x2c, 0xbd, 0x4a, 0x7a, 0xe6, 0x3c, 0xa1, + 0xdb, 0xaa, 0x3d, 0xfe, 0xac, 0x77, 0x3c, 0xd0, 0x02, 0xe2, 0xbc, 0x1c, 0xec, + 0xef, 0xbc, 0xe0, 0x92, 0xad, 0xbd, 0x46, 0xe8, 0x02, 0x3d, 0xd0, 0x99, 0x45, + 0x3b, 0x8a, 0xbc, 0x3f, 0xbd, 0x02, 0x86, 0x84, 0xbd, 0x34, 0xfb, 0xc3, 0xbd, + 0x71, 0xb4, 0xb7, 0x3d, 0xc0, 0x74, 0x42, 0xbb, 0xba, 0xef, 0x5d, 0xbc, 0x2b, + 0xd3, 0x21, 0x3c, 0x5a, 0xa2, 0xe4, 0xbc, 0x9f, 0xa9, 0x80, 0xbd, 0xa0, 0x48, + 0xb3, 0x3d, 0x39, 0xbb, 0xa4, 0xbd, 0xa9, 0x25, 0xb4, 0x3d, 0xb7, 0x12, 0xf3, + 0xbc, 0x25, 0x61, 0x37, 0xbd, 0xb9, 0x66, 0x80, 0x3d, 0xcd, 0xce, 0xcf, 0x3d, + 0x9f, 0xd0, 0x90, 0xbc, 0xd7, 0xbd, 0xf4, 0x3c, 0x20, 0x96, 0x8e, 0xbd, 0xd9, + 0xdf, 0x00, 0xbe, 0x8c, 0xf9, 0x5d, 0xbc, 0x58, 0xf0, 0x1e, 0x3d, 0xee, 0xec, + 0x2f, 0xbd, 0x32, 0x6b, 0x46, 0xbd, 0x72, 0x10, 0x2e, 0x3d, 0x33, 0x5a, 0x09, + 0xbd, 0x43, 0x78, 0x14, 0x3d, 0x33, 0xde, 0xa1, 0xbd, 0xcd, 0x6e, 0x35, 
0x3c, + 0x05, 0x48, 0x22, 0xbd, 0x5b, 0x57, 0x80, 0x3d, 0x66, 0x64, 0xd7, 0x3b, 0x26, + 0xf1, 0x1a, 0x3c, 0x81, 0x24, 0x8a, 0xbd, 0x00, 0x84, 0x5e, 0xbd, 0xbc, 0xc0, + 0xdc, 0x3b, 0x74, 0x77, 0xa3, 0x3d, 0x8a, 0x55, 0xe3, 0x3c, 0x84, 0x75, 0x2e, + 0x3d, 0x45, 0x17, 0x3c, 0x3d, 0xcf, 0xd9, 0x62, 0xbd, 0x6e, 0x1c, 0xd2, 0x3c, + 0x6e, 0xe1, 0x21, 0xbe, 0x36, 0xf2, 0x95, 0x3d, 0x44, 0x50, 0x00, 0xba, 0x87, + 0x5b, 0xc8, 0xbc, 0xeb, 0xe0, 0xbd, 0x3d, 0x92, 0x7c, 0xff, 0x3c, 0x34, 0x97, + 0x32, 0x3d, 0x8f, 0x57, 0x73, 0x3d, 0x70, 0xfe, 0x5b, 0x3c, 0xba, 0x43, 0xee, + 0xbc, 0xa8, 0x7b, 0x06, 0x3c, 0xfc, 0x87, 0x8f, 0x3d, 0xf2, 0xd6, 0x43, 0xbd, + 0x18, 0x3c, 0x11, 0xbc, 0x1e, 0xc3, 0x62, 0x3c, 0x46, 0x98, 0x9e, 0x3c, 0x5a, + 0x90, 0xc4, 0xbc, 0xe6, 0x6b, 0x72, 0xbd, 0xce, 0x30, 0xa7, 0x3d, 0x81, 0xa2, + 0x10, 0xbd, 0x4e, 0x75, 0x24, 0x3d, 0xff, 0x9d, 0xea, 0xbc, 0x25, 0x08, 0x92, + 0x3c, 0x50, 0x0a, 0xf0, 0xbb, 0xf0, 0x91, 0x8d, 0xbc, 0x4c, 0xd8, 0xc8, 0x3c, + 0x16, 0xbb, 0x5d, 0xbd, 0x24, 0x8d, 0x32, 0x3d, 0x75, 0x67, 0x64, 0x3d, 0xe0, + 0x67, 0x46, 0x3b, 0xbc, 0x93, 0xbb, 0x3c, 0xd2, 0x74, 0x17, 0xbd, 0x45, 0x88, + 0x21, 0xbe, 0x4d, 0x15, 0x95, 0x3d, 0x41, 0x5c, 0xe7, 0xbb, 0xc9, 0x97, 0xfd, + 0xbc, 0x3b, 0xe2, 0x0f, 0xbd, 0x57, 0x38, 0xab, 0x3d, 0x13, 0x12, 0xeb, 0x3c, + 0x92, 0x5d, 0x4f, 0x3d, 0xf0, 0x1f, 0xbf, 0xbc, 0x37, 0x63, 0xf7, 0xbc, 0xa8, + 0x76, 0x32, 0x3c, 0x97, 0xd3, 0xc9, 0xbc, 0x28, 0x83, 0x5b, 0x3d, 0xe2, 0x0f, + 0x90, 0xbd, 0x31, 0x0b, 0x8a, 0xbd, 0x04, 0x7c, 0xd5, 0xbc, 0x16, 0x5d, 0xa7, + 0x3a, 0x54, 0x36, 0x4f, 0xbd, 0x4d, 0xae, 0x64, 0x3d, 0xfd, 0x4c, 0x94, 0xbc, + 0x72, 0x3f, 0x96, 0xbc, 0x41, 0xd7, 0xfa, 0x3b, 0x52, 0x45, 0x03, 0xbc, 0x1f, + 0x50, 0xa6, 0xbd, 0x28, 0xb9, 0x78, 0x3c, 0x16, 0xa5, 0x77, 0x3c, 0xf2, 0x4e, + 0xa1, 0x3c, 0x84, 0xb6, 0x84, 0xbd, 0xc5, 0x78, 0xdc, 0x3c, 0xb4, 0xd1, 0x27, + 0xbd, 0x04, 0x20, 0x8d, 0xbd, 0xa0, 0x12, 0x36, 0x3c, 0xce, 0xb5, 0x31, 0xbe, + 0x4b, 0xfd, 0x44, 0xbc, 0xe3, 0x38, 0x00, 0xbd, 0xca, 0x35, 0x60, 0x3c, 0xc6, + 0xe4, 0x93, 0xb6, 0xc9, 0x84, 0xc0, 0x3a, 0xb3, 0x53, 0x88, 0x3d, 0x08, 0x37, + 0x0b, 0x3c, 0xd9, 0x6d, 0x00, 0xbb, 0x54, 0x22, 0xcc, 0xbb, 0x3c, 0x72, 0xa7, + 0xbc, 0x39, 0xbd, 0xc0, 0x3d, 0xc7, 0xb5, 0x0a, 0x3b, 0xe3, 0xbc, 0x38, 0xbc, + 0x0d, 0x1c, 0x1f, 0xbc, 0xbc, 0x5b, 0x42, 0xbc, 0xf3, 0x43, 0xb2, 0x3c, 0x5e, + 0x7e, 0xc3, 0xbc, 0x40, 0xbf, 0x47, 0x3c, 0xe7, 0x7d, 0x3e, 0xbc, 0x30, 0xf4, + 0x13, 0xbc, 0x5f, 0x8d, 0xd1, 0x3c, 0xe1, 0x93, 0xe7, 0xbc, 0x73, 0x12, 0x87, + 0xbc, 0x52, 0xb6, 0x9d, 0x3b, 0xf6, 0xda, 0x8d, 0x3d, 0x6b, 0xb8, 0x03, 0x3c, + 0x58, 0x8e, 0x25, 0xbd, 0x7b, 0xaa, 0x8a, 0xbc, 0x75, 0xd1, 0x84, 0x3d, 0x0e, + 0x90, 0xcd, 0xbc, 0x17, 0x0e, 0x8b, 0x3d, 0x87, 0x5e, 0x04, 0xbd, 0xe5, 0x99, + 0x9b, 0xbc, 0x0a, 0xdd, 0x3b, 0x3d, 0x22, 0xc9, 0x83, 0xbc, 0xb8, 0x42, 0x3f, + 0x3d, 0x86, 0x99, 0x90, 0x3d, 0x41, 0x4e, 0xa2, 0x3d, 0xf0, 0x89, 0x4f, 0xbd, + 0xa6, 0x28, 0x75, 0xbd, 0xea, 0xf1, 0x56, 0xbd, 0x96, 0xb0, 0x9b, 0xbc, 0x01, + 0x85, 0xb5, 0x3d, 0xcf, 0x71, 0x4c, 0x3d, 0x98, 0xf9, 0x6d, 0xbc, 0xc8, 0x59, + 0x38, 0xbd, 0x12, 0x6f, 0x7b, 0x3d, 0x61, 0xac, 0xf1, 0xbb, 0xd4, 0x32, 0x4a, + 0x3d, 0x92, 0x25, 0x45, 0x3d, 0x53, 0x88, 0x6d, 0xbd, 0xa0, 0x69, 0xda, 0xbb, + 0xf2, 0xf2, 0xda, 0x3b, 0xf3, 0x4d, 0x84, 0xbc, 0x61, 0x96, 0xda, 0x3c, 0xa3, + 0x9c, 0x9a, 0x3b, 0x70, 0x04, 0x93, 0xbb, 0x11, 0x0f, 0xe7, 0xbc, 0x06, 0x52, + 0x86, 0xbd, 0x0f, 0xf5, 0x6c, 0xbd, 0xe1, 0x4c, 0x8d, 0x3d, 0x59, 0x20, 0xa0, + 0xbd, 0xf8, 0x29, 0x94, 0x3d, 0x3f, 0x89, 0x86, 0xbd, 0x15, 0x66, 0x15, 0xbd, + 0xad, 0x80, 0xdf, 0x3c, 
0x5b, 0xd4, 0x6c, 0xbc, 0x2c, 0x5f, 0x60, 0x3c, 0x2b, + 0x82, 0xd5, 0x3c, 0x3f, 0x7e, 0x14, 0xbd, 0x6c, 0xe8, 0xaf, 0xbb, 0xee, 0x8b, + 0x27, 0xbd, 0xa0, 0xa8, 0x20, 0xbd, 0xe8, 0x39, 0x54, 0xbc, 0x9b, 0x57, 0xb7, + 0x3d, 0x6a, 0x42, 0x81, 0x3d, 0xd3, 0x09, 0x10, 0xbd, 0x95, 0xd4, 0x3a, 0x3d, + 0x48, 0xe1, 0xb8, 0xbc, 0xf4, 0x91, 0xa0, 0xbd, 0x8e, 0x67, 0x5e, 0xbd, 0x3b, + 0x3d, 0xa0, 0x3d, 0x82, 0x2e, 0x85, 0x3d, 0x10, 0x91, 0x8c, 0xbb, 0x63, 0xb7, + 0x75, 0xbd, 0xf5, 0xd8, 0x35, 0xbd, 0xea, 0x58, 0x11, 0xbb, 0xc4, 0x87, 0xe5, + 0xbc, 0xb4, 0x14, 0xce, 0x3d, 0x86, 0x00, 0x0b, 0x3c, 0x91, 0x4b, 0xb2, 0xbd, + 0xa9, 0x2e, 0x93, 0x3d, 0xc3, 0x3a, 0xc3, 0xbb, 0x7c, 0x8a, 0x83, 0xbd, 0xd2, + 0xb1, 0x2e, 0xbd, 0xbb, 0x27, 0xa9, 0xbd, 0xa7, 0x9f, 0x41, 0x3d, 0x0a, 0x47, + 0x15, 0xbd, 0xeb, 0x11, 0xca, 0x3c, 0xfe, 0x0d, 0xef, 0xbc, 0x71, 0x53, 0x52, + 0x3d, 0x0b, 0x4b, 0x44, 0x3c, 0x9d, 0xbf, 0x10, 0xbb, 0xf9, 0x31, 0xe6, 0x3c, + 0x97, 0x60, 0xbd, 0xbd, 0x8c, 0x40, 0x87, 0x3c, 0x30, 0x66, 0x18, 0x3d, 0x1a, + 0x2b, 0xcd, 0x3c, 0x52, 0x92, 0x7e, 0xbd, 0x58, 0xee, 0x02, 0x3d, 0x0a, 0x85, + 0xf7, 0xbc, 0x76, 0x75, 0x7f, 0xbd, 0xff, 0x11, 0xde, 0x3b, 0x5b, 0x43, 0x4b, + 0x3d, 0xa2, 0x53, 0x3f, 0xbd, 0x90, 0xf3, 0x42, 0xbd, 0x5b, 0xb9, 0x1e, 0x3d, + 0x43, 0x66, 0x46, 0xbc, 0x3e, 0x79, 0x7f, 0xbd, 0x24, 0xa8, 0xa0, 0xbd, 0xd5, + 0xb2, 0xd2, 0x3c, 0xf6, 0x82, 0x7d, 0x3b, 0x52, 0x09, 0x4e, 0xbd, 0x23, 0x30, + 0xfa, 0x3d, 0x62, 0xb4, 0x72, 0x3d, 0xa6, 0x3c, 0x98, 0x3c, 0x20, 0x3f, 0xdd, + 0xbb, 0xb0, 0xfa, 0x4f, 0xbd, 0x0f, 0x36, 0x24, 0xbb, 0x19, 0xbc, 0x7d, 0xbd, + 0x8d, 0xab, 0x2e, 0x3d, 0x1e, 0x67, 0x61, 0x3d, 0x8a, 0x39, 0x61, 0xbb, 0xb1, + 0xa0, 0x01, 0xbc, 0x0d, 0x75, 0x64, 0xbc, 0x89, 0xd7, 0x84, 0xbd, 0x1f, 0x26, + 0xa6, 0xbd, 0x7a, 0x67, 0x62, 0x3d, 0x3d, 0x4d, 0x06, 0xbb, 0xff, 0xe4, 0x92, + 0x3d, 0x32, 0x12, 0x95, 0xbc, 0x4b, 0x2e, 0x8b, 0xbc, 0x8b, 0x4a, 0x14, 0x3c, + 0xea, 0x08, 0x81, 0xbd, 0xb3, 0x3e, 0xb3, 0xbd, 0x96, 0x40, 0xef, 0x3c, 0xc6, + 0xf4, 0x83, 0xbd, 0x70, 0x8a, 0xad, 0xbc, 0x28, 0x6d, 0x26, 0xbd, 0x0e, 0x8f, + 0x89, 0x3a, 0xbc, 0x30, 0xc8, 0xbd, 0x81, 0x3c, 0x22, 0xbd, 0x19, 0x06, 0xb4, + 0x3d, 0x2a, 0xbf, 0x2a, 0x3d, 0xc9, 0xd4, 0x00, 0xbd, 0x74, 0x7d, 0x9b, 0x3b, + 0xc5, 0x7a, 0x13, 0xbd, 0xbf, 0x24, 0x18, 0xbc, 0x63, 0x21, 0xfd, 0x3c, 0x8f, + 0x45, 0xf6, 0xbd, 0xf6, 0xb7, 0x85, 0x3c, 0x49, 0xc7, 0xee, 0xbb, 0x31, 0x16, + 0x9c, 0x3d, 0x86, 0x9e, 0x44, 0x3d, 0x97, 0x25, 0x99, 0x3d, 0x33, 0x23, 0xa6, + 0x3d, 0x7f, 0x66, 0x2b, 0x3d, 0xbd, 0xe9, 0x43, 0x3d, 0x11, 0x56, 0x76, 0xbc, + 0x30, 0x7c, 0x87, 0xbb, 0xfe, 0xae, 0xfb, 0xb8, 0x4c, 0x48, 0x47, 0xbd, 0x74, + 0x13, 0x8b, 0xbd, 0x26, 0x22, 0x87, 0x3d, 0x22, 0xb0, 0x87, 0x3d, 0x9f, 0xc6, + 0x74, 0xbd, 0x7a, 0x47, 0x70, 0x3c, 0xe0, 0x41, 0x8b, 0x3d, 0xfb, 0xa2, 0x43, + 0xbc, 0x63, 0x0d, 0x21, 0xbd, 0x8a, 0x60, 0x36, 0xbb, 0x54, 0xe8, 0x59, 0x3c, + 0x21, 0xd4, 0xa9, 0x3b, 0x00, 0x5b, 0x20, 0x3d, 0x61, 0x25, 0x72, 0x3d, 0x39, + 0x8d, 0x3b, 0x3d, 0x5e, 0xcd, 0x4f, 0x3d, 0xa0, 0x47, 0x0c, 0xbd, 0x34, 0xc9, + 0x09, 0x3d, 0xb8, 0x59, 0xa2, 0xbc, 0x9a, 0xa3, 0x82, 0x3d, 0x1b, 0xd4, 0x1f, + 0xbe, 0xa4, 0x45, 0x9d, 0x3d, 0x9e, 0x03, 0xc6, 0x3c, 0x0c, 0x23, 0x30, 0x3d, + 0x9c, 0xb4, 0xec, 0xbb, 0xf8, 0x66, 0x9c, 0xbc, 0x6c, 0x32, 0x7e, 0x3d, 0x4b, + 0x32, 0x51, 0x3d, 0x64, 0x32, 0x75, 0x3d, 0x1b, 0xc9, 0xd1, 0x3c, 0x98, 0xac, + 0x05, 0x3d, 0x4a, 0x99, 0x74, 0x3b, 0x40, 0x86, 0x41, 0xbd, 0xf6, 0xa7, 0x03, + 0xbd, 0x95, 0x47, 0x23, 0x3c, 0x78, 0xf3, 0x0c, 0x3d, 0xf4, 0x66, 0xdc, 0x3b, + 0x4d, 0x45, 0xbf, 0xbb, 0x65, 0x4b, 0x73, 0xbc, 0x51, 
0x10, 0x8c, 0x3c, 0x5e, + 0x5a, 0x67, 0x3d, 0xd7, 0x47, 0x82, 0x3d, 0xdc, 0x32, 0x9c, 0xbc, 0xe4, 0xa5, + 0x87, 0xbd, 0xc2, 0xd2, 0xc4, 0xbd, 0x08, 0xbe, 0x6e, 0x3d, 0xa8, 0x8b, 0xf1, + 0x3c, 0x10, 0xc0, 0xb1, 0xbc, 0x12, 0x09, 0x88, 0x3d, 0x3f, 0x54, 0x25, 0x3d, + 0x11, 0x70, 0x26, 0x3b, 0xdd, 0x48, 0x18, 0x3c, 0x01, 0x3c, 0xee, 0xbd, 0x4f, + 0x63, 0x36, 0xbc, 0xea, 0x7e, 0x3f, 0x3d, 0x86, 0x4d, 0x45, 0x3d, 0x4b, 0x63, + 0x70, 0xbc, 0x32, 0xdf, 0xc0, 0x3d, 0x50, 0x3c, 0x13, 0x3c, 0x0e, 0x61, 0xa3, + 0x3d, 0xe8, 0xc5, 0x37, 0xbd, 0x3b, 0xd7, 0x01, 0xbd, 0x20, 0x1b, 0x89, 0xbc, + 0x70, 0x18, 0xee, 0xbc, 0x3e, 0xeb, 0xfa, 0xbb, 0x18, 0xda, 0xda, 0x3c, 0xd6, + 0x82, 0x19, 0xbd, 0xf1, 0x7e, 0x88, 0xbd, 0x39, 0x1d, 0xb8, 0xbb, 0x67, 0x98, + 0x1c, 0x3d, 0x72, 0x83, 0x90, 0x3d, 0xd3, 0x17, 0x6b, 0xbd, 0xcc, 0x55, 0xa8, + 0x3c, 0x18, 0x2e, 0x2c, 0xbd, 0x08, 0xc4, 0x34, 0x3c, 0xf8, 0x8f, 0x51, 0xbd, + 0x88, 0x62, 0xfe, 0x3c, 0xbc, 0xe0, 0xb1, 0xbc, 0x09, 0x93, 0x88, 0xbb, 0x95, + 0x9c, 0xda, 0x3c, 0x83, 0xda, 0x3a, 0xbd, 0xb8, 0x82, 0x81, 0x3c, 0x39, 0xa8, + 0x8a, 0xbd, 0x8b, 0xb0, 0x31, 0xbb, 0x4a, 0x2c, 0x07, 0xbe, 0xec, 0x84, 0x9b, + 0x3c, 0xc9, 0x97, 0x56, 0x3d, 0x3d, 0xce, 0x97, 0xbd, 0xa6, 0xe3, 0xbc, 0x3d, + 0x91, 0xc4, 0x0f, 0x3d, 0x35, 0xe9, 0xd1, 0xbc, 0x10, 0x48, 0x17, 0x3c, 0x9a, + 0x86, 0x86, 0xbd, 0x08, 0x63, 0xf9, 0xbc, 0xb0, 0xb0, 0x98, 0x3c, 0x3e, 0x7e, + 0x4e, 0x3d, 0xe0, 0x6f, 0x73, 0xbc, 0xa5, 0x9e, 0x03, 0xbd, 0x7c, 0x39, 0x53, + 0x39, 0x6d, 0x86, 0x40, 0xba, 0x1d, 0x71, 0x86, 0x3d, 0x62, 0xec, 0x9d, 0x3c, + 0x03, 0x1e, 0x29, 0x3d, 0xbd, 0xbf, 0xd2, 0xbd, 0xce, 0x1c, 0x0c, 0x3d, 0x7f, + 0xb3, 0x9c, 0x3d, 0x93, 0xa6, 0xa1, 0xbc, 0xb9, 0xf4, 0x6b, 0xbd, 0x17, 0xce, + 0x40, 0xbd, 0x33, 0x15, 0x00, 0x3d, 0xd3, 0x33, 0x9c, 0x3d, 0x01, 0xc6, 0xec, + 0x3c, 0x65, 0x42, 0xba, 0x3c, 0x33, 0x73, 0xec, 0xbc, 0x47, 0xf8, 0x00, 0x3d, + 0xd1, 0x1b, 0x66, 0x3d, 0x10, 0x9b, 0x0b, 0xbe, 0xe6, 0x45, 0x48, 0xbd, 0x90, + 0x46, 0xbd, 0x3c, 0x29, 0xe0, 0xb5, 0xbc, 0x50, 0x42, 0x6a, 0x3d, 0x00, 0x37, + 0x9e, 0x3d, 0xc1, 0x54, 0xa0, 0x3c, 0x00, 0x3c, 0x2f, 0xbb, 0x05, 0x4f, 0xa7, + 0xbc, 0x3d, 0x86, 0x68, 0xbd, 0x24, 0x65, 0x51, 0xbc, 0xff, 0x74, 0x21, 0x3d, + 0x81, 0x5d, 0x25, 0x3d, 0x5d, 0xd0, 0x7a, 0xbd, 0x37, 0xb1, 0x40, 0xbd, 0xf0, + 0xfd, 0x3d, 0x3d, 0x1e, 0xb2, 0x2a, 0xbc, 0x62, 0x35, 0x9e, 0xbd, 0xeb, 0x65, + 0x51, 0xbc, 0x6f, 0xf6, 0x9a, 0xbd, 0x82, 0x5b, 0x81, 0xbc, 0xd7, 0x8a, 0x29, + 0x3d, 0x5a, 0x89, 0x81, 0xbb, 0x6d, 0xf8, 0xe0, 0x3c, 0xa6, 0x56, 0x3c, 0x3d, + 0x9d, 0xc6, 0x49, 0xbc, 0xdf, 0x38, 0x79, 0x3c, 0x51, 0x74, 0x4e, 0x3d, 0x02, + 0xb4, 0x2e, 0xbd, 0x6e, 0x2c, 0x52, 0xbd, 0x98, 0x05, 0x96, 0x3c, 0x5e, 0xef, + 0x12, 0x3d, 0xa9, 0x44, 0x29, 0xbd, 0x29, 0xcf, 0x47, 0x3d, 0x08, 0x33, 0xa3, + 0xbd, 0xc7, 0xe5, 0x26, 0x3c, 0x16, 0xf0, 0xc7, 0xbc, 0x89, 0xde, 0xa2, 0x3a, + 0x57, 0x77, 0xb9, 0x3b, 0xa0, 0x30, 0x9d, 0x3c, 0xd9, 0xf8, 0x91, 0xbc, 0xdc, + 0xac, 0x41, 0x3c, 0xc9, 0xe5, 0x1a, 0xbd, 0x66, 0xcc, 0x89, 0x3d, 0xae, 0x83, + 0x95, 0xbd, 0xf6, 0x92, 0xd3, 0x3c, 0x6a, 0x9a, 0xf7, 0x3c, 0xb4, 0xf9, 0x7c, + 0xbb, 0x79, 0xd8, 0x99, 0xbc, 0x82, 0x88, 0xb6, 0xbc, 0xf7, 0xdf, 0xb3, 0x3d, + 0x57, 0xa6, 0xa7, 0xbd, 0x2e, 0x22, 0xd9, 0xbc, 0xd6, 0x67, 0x91, 0xbc, 0x54, + 0x25, 0x32, 0x3d, 0xc3, 0x91, 0x93, 0xbd, 0x1d, 0x77, 0x33, 0x3b, 0x56, 0xc9, + 0x8b, 0x3d, 0xbf, 0xe2, 0x21, 0x3c, 0xf5, 0x88, 0x80, 0xbd, 0xee, 0x4f, 0xd8, + 0xbc, 0xbf, 0x1c, 0x83, 0xbd, 0xa4, 0x91, 0x61, 0x3d, 0xdc, 0xc1, 0x74, 0x3d, + 0xb4, 0x4d, 0x90, 0xbd, 0x80, 0x3d, 0xbb, 0x3c, 0x27, 0x03, 0xa2, 0xbb, 0x7e, + 0x7e, 
0xd9, 0x3c, 0xf4, 0x18, 0x5f, 0xbc, 0xb1, 0xde, 0x83, 0x3d, 0xd5, 0xee, + 0x20, 0xbd, 0xbe, 0xa8, 0x7a, 0xbc, 0x01, 0x94, 0x03, 0xbd, 0x27, 0xa8, 0xfc, + 0xbd, 0x72, 0x14, 0x56, 0x3d, 0x79, 0x46, 0x0d, 0xbc, 0x69, 0x23, 0xd1, 0x3c, + 0x3b, 0x33, 0x49, 0x3d, 0x8d, 0xef, 0x18, 0x3b, 0xe9, 0xe1, 0x8f, 0xbd, 0x4f, + 0x45, 0x05, 0x3d, 0x28, 0x80, 0x49, 0x3c, 0xbd, 0x49, 0x18, 0x3d, 0xfd, 0xd4, + 0x86, 0x3c, 0xcc, 0x56, 0xa6, 0x3c, 0x37, 0x8e, 0xef, 0x3a, 0x57, 0x1e, 0x5f, + 0x3d, 0xc2, 0xef, 0x68, 0xbc, 0x24, 0xc0, 0xbe, 0xbd, 0x9c, 0xfd, 0xa0, 0x3b, + 0x48, 0x3b, 0x5d, 0x3d, 0xcf, 0xe0, 0x2c, 0xbd, 0x49, 0x51, 0xa7, 0x3d, 0x65, + 0xcf, 0x7a, 0xbc, 0x27, 0x68, 0x4c, 0xbd, 0x00, 0xed, 0x99, 0xbc, 0x2a, 0xac, + 0x5d, 0xbd, 0x6b, 0x5c, 0x9a, 0x3c, 0x71, 0xb7, 0x51, 0x3c, 0x1a, 0x04, 0x60, + 0xbd, 0x4b, 0xb8, 0x42, 0x3d, 0xf6, 0x92, 0x4f, 0x3d, 0xcb, 0x7a, 0xc4, 0x3c, + 0xc2, 0x1f, 0x85, 0x3d, 0xbf, 0x4c, 0x3b, 0x3b, 0x52, 0x04, 0x9a, 0xbd, 0x3a, + 0x5c, 0x29, 0x3d, 0x5f, 0x4e, 0xb1, 0x3d, 0xfc, 0x4e, 0x87, 0xbc, 0x59, 0x10, + 0xaa, 0x3d, 0x99, 0xff, 0x43, 0x3d, 0x20, 0x80, 0x8e, 0x3c, 0x79, 0x81, 0x3e, + 0xbd, 0xfe, 0x38, 0xab, 0xbd, 0x3d, 0x72, 0xad, 0x3d, 0x18, 0xa1, 0x64, 0xbd, + 0xa0, 0x6e, 0xb0, 0xbb, 0x19, 0x6b, 0x00, 0x3d, 0x6b, 0x7b, 0x15, 0xbc, 0x45, + 0xb5, 0xa6, 0xbd, 0xef, 0x81, 0x05, 0xbd, 0x9f, 0xe8, 0x37, 0x3d, 0x71, 0xbe, + 0xb6, 0xbc, 0x22, 0x55, 0xd6, 0xbc, 0x0d, 0x9b, 0xcf, 0x3c, 0x47, 0xa3, 0x92, + 0x3d, 0xfd, 0x13, 0x74, 0x3d, 0x4f, 0xef, 0x53, 0x3d, 0x8b, 0xeb, 0x0f, 0xbd, + 0xf9, 0x86, 0x00, 0x3d, 0xb8, 0xd1, 0x68, 0xbc, 0x68, 0xa4, 0x1c, 0xbd, 0x96, + 0x27, 0x01, 0x3d, 0x28, 0x65, 0x4a, 0x3d, 0xef, 0xa3, 0x41, 0xbd, 0xdd, 0xd4, + 0xac, 0x3c, 0x24, 0x42, 0x48, 0x3d, 0x55, 0x49, 0x99, 0x39, 0x7a, 0x2f, 0xde, + 0xbc, 0x7f, 0xff, 0x94, 0x3d, 0x76, 0x44, 0x14, 0xbd, 0xea, 0xa9, 0x05, 0x3d, + 0xd1, 0xa5, 0x2c, 0x3d, 0xfa, 0x4f, 0x0c, 0xbd, 0xda, 0x0a, 0x6d, 0xbd, 0x52, + 0x92, 0x47, 0x3d, 0x8b, 0x87, 0x8b, 0x3d, 0xd0, 0x89, 0x48, 0xbd, 0xaa, 0xbe, + 0x03, 0x3d, 0xa0, 0x14, 0x6d, 0xbd, 0x20, 0x3a, 0x80, 0x3d, 0x08, 0x2f, 0x86, + 0xbd, 0xf9, 0xfd, 0xa4, 0xbd, 0xde, 0xd5, 0x92, 0xbc, 0xcd, 0x8a, 0x64, 0x3d, + 0x48, 0xd0, 0x6c, 0x3d, 0x6a, 0xa3, 0xfa, 0xbc, 0xc3, 0xc7, 0x36, 0xbd, 0xb1, + 0x87, 0x2e, 0xbd, 0x3b, 0x6c, 0x9e, 0x3d, 0x56, 0x18, 0x1a, 0xbe, 0x9e, 0xd1, + 0xf5, 0x3c, 0xb9, 0xfe, 0xc3, 0xbc, 0x46, 0xbc, 0x40, 0xbd, 0x94, 0x3a, 0x48, + 0x3d, 0xbc, 0x4e, 0xbb, 0x3d, 0xa0, 0x7b, 0x94, 0xbc, 0xd8, 0xeb, 0x91, 0x3d, + 0x95, 0xa1, 0x99, 0xbd, 0xf4, 0x73, 0x9c, 0x3b, 0x23, 0x2d, 0x8e, 0x3d, 0x46, + 0x9c, 0xa5, 0xbb, 0x61, 0x13, 0x50, 0xbd, 0xad, 0x99, 0xf8, 0x3c, 0xd2, 0xac, + 0x7d, 0xbd, 0xc1, 0xb2, 0x6d, 0xbc, 0xf7, 0xde, 0x9f, 0xbd, 0x60, 0x72, 0x15, + 0x3d, 0x69, 0xaf, 0xa2, 0x3d, 0xfd, 0x72, 0x79, 0x3d, 0xd0, 0xc0, 0xa1, 0xbb, + 0x80, 0x21, 0x4f, 0x3d, 0xbc, 0x91, 0x0a, 0xbc, 0x23, 0xa3, 0xee, 0xbc, 0xd0, + 0x1a, 0xbb, 0xbd, 0x2a, 0x71, 0x35, 0x3d, 0x21, 0x26, 0x66, 0x3d, 0xb4, 0x17, + 0x89, 0xbb, 0x54, 0x4f, 0x80, 0xbc, 0x47, 0x10, 0xf3, 0xbc, 0x22, 0x75, 0x6c, + 0x3d, 0xb1, 0x75, 0x00, 0x3d, 0xe2, 0xf4, 0xf5, 0xbd, 0xbe, 0xbc, 0x7b, 0x3d, + 0xe3, 0x01, 0xc1, 0xbc, 0x05, 0x25, 0x82, 0xbb, 0x3f, 0x02, 0x5d, 0xbb, 0xa9, + 0xc1, 0x5a, 0x3d, 0xea, 0xe4, 0x5e, 0x3c, 0x96, 0xd6, 0xa5, 0x3c, 0xcb, 0x77, + 0xa4, 0x3c, 0xb2, 0x4f, 0x06, 0xbd, 0x84, 0xc3, 0x2c, 0xbd, 0x48, 0xdc, 0x9d, + 0x3b, 0xdb, 0xd6, 0xbb, 0xbc, 0xc8, 0xdf, 0x98, 0xbc, 0x29, 0x14, 0x31, 0x3d, + 0x6f, 0xfa, 0x4f, 0xbd, 0x7c, 0xb4, 0xaa, 0xbd, 0xe0, 0xeb, 0x2e, 0xbd, 0x53, + 0x3f, 0xc4, 0x3d, 0xbc, 0xcb, 0x38, 
0x3d, 0x30, 0x45, 0x30, 0x3c, 0xf0, 0xc1, + 0x0c, 0xbd, 0xb3, 0x20, 0x39, 0xbd, 0x80, 0xe2, 0x8b, 0x3b, 0x35, 0x31, 0x05, + 0xbd, 0xf5, 0xaa, 0x49, 0xbc, 0x7d, 0x08, 0x0a, 0x3d, 0xdd, 0x96, 0x84, 0xbc, + 0x0f, 0xb9, 0x4c, 0x3d, 0x49, 0xea, 0x86, 0x3d, 0xc9, 0xd0, 0x75, 0xbb, 0xcd, + 0x9b, 0xd1, 0x3d, 0x7a, 0x5e, 0x6f, 0xbd, 0x4a, 0x2e, 0xc0, 0xba, 0x3b, 0x7d, + 0x7d, 0xbd, 0x2b, 0x8f, 0xfe, 0xbb, 0x2a, 0xf4, 0xce, 0x3d, 0xf6, 0xfc, 0x06, + 0xbc, 0xdd, 0x02, 0x4a, 0x3c, 0x71, 0x3c, 0x03, 0xbd, 0x03, 0x9a, 0x90, 0xbd, + 0x76, 0xb7, 0xb3, 0xbd, 0xa2, 0xd1, 0x47, 0xbd, 0xc1, 0x56, 0x6e, 0x3d, 0xff, + 0x97, 0x57, 0x3d, 0x50, 0x57, 0xe6, 0xbc, 0x8f, 0xb3, 0x3d, 0xbd, 0x75, 0x8e, + 0x80, 0xbd, 0xc7, 0x6c, 0x43, 0xbc, 0xaa, 0xe3, 0x9d, 0xbd, 0x6f, 0xe4, 0x1d, + 0x3d, 0x3a, 0x57, 0x98, 0x3c, 0x6c, 0x08, 0x5c, 0x3d, 0xeb, 0xd2, 0xa5, 0xbb, + 0xf7, 0x60, 0x08, 0xbc, 0x72, 0x03, 0x3b, 0xbd, 0xe7, 0xc1, 0x8f, 0x3d, 0xb6, + 0x1f, 0x98, 0x3d, 0x59, 0xff, 0x88, 0x3d, 0x51, 0xe9, 0x73, 0xbc, 0x1f, 0x91, + 0xa5, 0x3d, 0x3b, 0x64, 0x17, 0xbd, 0x5b, 0xa5, 0x80, 0x3d, 0x03, 0x38, 0x85, + 0x3d, 0xbe, 0x27, 0x90, 0xbd, 0x4e, 0x87, 0xa3, 0xbc, 0xc1, 0xbb, 0x22, 0xbc, + 0x8b, 0x25, 0xd0, 0xbb, 0x6a, 0x2f, 0x1d, 0x3d, 0x0a, 0xdd, 0x48, 0x3d, 0x0b, + 0x37, 0x37, 0x3d, 0x2a, 0x68, 0x1a, 0x3d, 0xc8, 0x85, 0x4a, 0x3d, 0x0a, 0xa5, + 0x03, 0x3c, 0xd2, 0x41, 0x12, 0x3d, 0x25, 0xc3, 0x24, 0x3b, 0x1a, 0x95, 0x33, + 0x3d, 0xbf, 0xfd, 0xd7, 0x3c, 0xce, 0xff, 0x6e, 0xbc, 0x91, 0xc5, 0x0f, 0x3c, + 0x7e, 0x5f, 0x64, 0xbd, 0x64, 0x7d, 0x1c, 0xbd, 0x42, 0x2d, 0xba, 0x3d, 0x99, + 0x69, 0xa5, 0x3c, 0x39, 0x7d, 0x72, 0xbd, 0x6a, 0xbf, 0x8f, 0x3b, 0xaa, 0x43, + 0x02, 0x3d, 0xb7, 0xb7, 0x35, 0xbd, 0x97, 0xaf, 0x6c, 0x3c, 0x62, 0x39, 0xd6, + 0xbc, 0x33, 0xd6, 0x85, 0x3d, 0x4c, 0x50, 0x47, 0x3d, 0x26, 0x4b, 0x57, 0x3d, + 0xf8, 0x80, 0x15, 0x3c, 0x9e, 0x69, 0x05, 0xbc, 0xa4, 0x13, 0xb5, 0x3d, 0x41, + 0x17, 0xda, 0xbd, 0x48, 0x79, 0x2b, 0xbb, 0xb4, 0x86, 0xcc, 0xbb, 0xad, 0x20, + 0x95, 0xbd, 0x20, 0xf5, 0x01, 0x3e, 0x23, 0x9e, 0x9b, 0x3d, 0xdb, 0xfe, 0x38, + 0x3b, 0x23, 0x42, 0x57, 0x3b, 0x42, 0x99, 0x59, 0x3d, 0xf2, 0x9d, 0xba, 0xbd, + 0x92, 0xe5, 0x5d, 0x3d, 0x20, 0x17, 0x07, 0xbb, 0xf0, 0x57, 0x08, 0x3d, 0x7d, + 0xed, 0x91, 0xbc, 0x2e, 0xc4, 0x8d, 0xbd, 0xdb, 0x15, 0xc2, 0x3c, 0xaa, 0xc3, + 0xe6, 0xbb, 0x90, 0x5d, 0xb4, 0xbc, 0xee, 0xaa, 0x9a, 0x3d, 0x74, 0x6d, 0x22, + 0xbb, 0x00, 0x65, 0xc2, 0xb9, 0x37, 0x30, 0x07, 0xbd, 0x85, 0xbd, 0x60, 0xbb, + 0x2b, 0x40, 0xd7, 0x3c, 0xca, 0x82, 0x33, 0xbd, 0x29, 0xb2, 0x81, 0x3d, 0x08, + 0xee, 0xd5, 0x3c, 0x28, 0x34, 0xdf, 0x3c, 0x3d, 0x41, 0x67, 0xbd, 0x0c, 0x1e, + 0xf7, 0x3c, 0x9c, 0x86, 0xe4, 0x3c, 0x36, 0x7c, 0x07, 0x3d, 0xc7, 0x27, 0x04, + 0xbd, 0x45, 0xcb, 0x77, 0x3d, 0xcf, 0x66, 0x14, 0xbd, 0x29, 0xae, 0x3f, 0xbd, + 0x70, 0x86, 0x25, 0xbc, 0x08, 0xc9, 0xa6, 0x3c, 0x70, 0xa3, 0xa8, 0xbb, 0xbe, + 0x82, 0x49, 0x3d, 0x13, 0xa1, 0x73, 0xbd, 0xd5, 0x6c, 0x35, 0xbd, 0x98, 0xfa, + 0x3a, 0x3c, 0xff, 0x0c, 0xe2, 0xb9, 0x37, 0xe9, 0xf2, 0xbb, 0x78, 0x2d, 0x89, + 0xbd, 0xec, 0x2c, 0x88, 0xbc, 0x97, 0x7f, 0x2e, 0x3d, 0x9e, 0x32, 0x88, 0xbd, + 0x17, 0xdb, 0x20, 0xbd, 0xde, 0xbd, 0xc7, 0x3b, 0x30, 0x01, 0xf4, 0x3c, 0xf8, + 0x47, 0x05, 0xbd, 0xab, 0x0c, 0xdf, 0x3c, 0x8b, 0xdc, 0xa5, 0x3c, 0x62, 0x53, + 0x78, 0xbd, 0xf1, 0x6e, 0x56, 0x3d, 0x1e, 0xf2, 0x79, 0x3d, 0x0a, 0xce, 0x9b, + 0xbc, 0x18, 0xed, 0xaf, 0x3c, 0xd1, 0x1d, 0x8a, 0x3d, 0x78, 0xe8, 0x6e, 0x3c, + 0x1d, 0x2a, 0x84, 0x3d, 0x90, 0xb3, 0x80, 0x3d, 0x26, 0x1f, 0x74, 0x3d, 0x14, + 0xc6, 0x79, 0xbb, 0x37, 0x9d, 0x18, 0x3d, 0x1a, 0x28, 0x86, 0x3d, 
0x8b, 0x8e, + 0x0f, 0xbd, 0x50, 0x3e, 0x82, 0xbc, 0x6f, 0x35, 0x70, 0xbd, 0xa5, 0xa6, 0x88, + 0x3d, 0xb6, 0xe7, 0x2a, 0xbd, 0x57, 0x46, 0x0a, 0x3d, 0xd6, 0xba, 0x34, 0xbd, + 0xc2, 0xf8, 0xc1, 0xbc, 0x2e, 0xe5, 0x30, 0xbd, 0xd5, 0x76, 0x85, 0x3d, 0xb4, + 0xeb, 0x88, 0xbd, 0xb5, 0x44, 0x40, 0x3d, 0x08, 0x9a, 0x8f, 0xbd, 0xe4, 0xa2, + 0xdf, 0x3c, 0x40, 0x83, 0xaf, 0x3a, 0xe0, 0xfb, 0x20, 0x3b, 0x84, 0xc3, 0xf1, + 0x3c, 0x13, 0x24, 0x88, 0xbd, 0x03, 0x21, 0x4a, 0xbd, 0xd6, 0x14, 0x39, 0x3d, + 0x10, 0x2c, 0x84, 0xbd, 0x47, 0xe0, 0xed, 0xbc, 0x8e, 0xfd, 0x91, 0xbc, 0x0e, + 0x42, 0x93, 0xbc, 0xe4, 0x43, 0x6b, 0x3d, 0x96, 0xc7, 0x36, 0x3d, 0xb0, 0xc2, + 0xac, 0xbb, 0x28, 0x29, 0x74, 0x3d, 0xf0, 0x10, 0xb5, 0xbb, 0x09, 0x5e, 0x6c, + 0x3d, 0xc3, 0xa9, 0x97, 0x3c, 0x4f, 0xc1, 0x9c, 0x3c, 0x4e, 0xc4, 0xf0, 0x3c, + 0x4e, 0x42, 0xfa, 0xbc, 0x9a, 0x53, 0x79, 0x3c, 0x9e, 0xc3, 0xd8, 0xbc, 0xfe, + 0x1e, 0x57, 0x3c, 0xa2, 0xec, 0x3f, 0xba, 0xfa, 0x34, 0x12, 0x3d, 0x43, 0x1c, + 0xd4, 0x3c, 0xf3, 0x3f, 0xa5, 0x3a, 0xda, 0xa7, 0x96, 0xbd, 0x6a, 0x5f, 0x2a, + 0x3d, 0xbd, 0x83, 0xd3, 0xbb, 0xb8, 0x9c, 0x5b, 0xbd, 0x67, 0xbb, 0x2d, 0x3c, + 0x44, 0x9a, 0xb0, 0xbc, 0x5c, 0x1b, 0xe6, 0x3c, 0x10, 0xfd, 0x67, 0xbd, 0x3b, + 0x8e, 0x94, 0xbd, 0xf3, 0x97, 0xca, 0xbb, 0x3a, 0xae, 0x3f, 0x3c, 0xd2, 0xbe, + 0x81, 0x3d, 0xd7, 0x2c, 0x86, 0xbd, 0x48, 0xc8, 0xbf, 0xbc, 0x00, 0x15, 0x5e, + 0xbc, 0x43, 0x09, 0x1d, 0x3d, 0x3d, 0xe7, 0x75, 0xbd, 0x38, 0xe4, 0x5f, 0x3c, + 0x8f, 0xe1, 0x09, 0x3d, 0xab, 0xa4, 0x16, 0xbd, 0x69, 0x15, 0x35, 0x3d, 0x6d, + 0x6a, 0x20, 0xbd, 0xa1, 0xd2, 0x9b, 0xbb, 0x89, 0xfb, 0xd1, 0x3c, 0x91, 0x05, + 0x82, 0x3d, 0x5c, 0x10, 0x3c, 0xbd, 0x7e, 0x4d, 0x5d, 0x3d, 0x5a, 0xac, 0x44, + 0xbc, 0xe5, 0x82, 0xfd, 0xbc, 0xd7, 0xc2, 0x82, 0xbd, 0xe7, 0xd3, 0x5f, 0x3d, + 0x3e, 0x16, 0x1e, 0x3d, 0x72, 0xcf, 0x9c, 0xbd, 0xf9, 0x44, 0xa2, 0xbc, 0x1c, + 0x64, 0x69, 0xba, 0x9e, 0xc1, 0x01, 0x3c, 0x07, 0xc9, 0x81, 0xbd, 0x18, 0x75, + 0x25, 0xbd, 0x12, 0x0b, 0xfd, 0xbc, 0x00, 0x54, 0xd5, 0x38, 0x73, 0x47, 0x85, + 0xbd, 0xaa, 0x08, 0x68, 0x3d, 0xa5, 0xf5, 0xa8, 0xbc, 0xd7, 0xea, 0x16, 0x3d, + 0x38, 0x81, 0x2a, 0xbd, 0xb0, 0x44, 0x45, 0x3d, 0xe6, 0x66, 0x71, 0x3d, 0x39, + 0x4d, 0x58, 0xbc, 0x6c, 0xd5, 0xbc, 0xbc, 0x40, 0x65, 0xab, 0x3c, 0x92, 0x4f, + 0x83, 0x3d, 0x46, 0xb4, 0x83, 0x3d, 0xf3, 0x7b, 0x5e, 0xbd, 0x8f, 0x77, 0x98, + 0xbc, 0x28, 0xd3, 0xe2, 0xbc, 0xa8, 0x94, 0xdc, 0xbc, 0xdc, 0x3a, 0x03, 0x39, + 0x6e, 0xd2, 0x81, 0x3c, 0x49, 0x64, 0xb8, 0xbc, 0xdb, 0x96, 0x03, 0xbd, 0xeb, + 0x90, 0x4c, 0x3d, 0xcc, 0xc7, 0x45, 0xbc, 0xca, 0xbc, 0x4a, 0xbd, 0xcc, 0xf4, + 0x90, 0x3c, 0x1e, 0x78, 0x93, 0x3b, 0xe8, 0x46, 0x68, 0xbd, 0x02, 0xe7, 0x78, + 0xbc, 0x95, 0x12, 0x48, 0xbd, 0x36, 0xd3, 0x60, 0xbd, 0x0b, 0x6a, 0x1c, 0x3d, + 0x9c, 0xa6, 0xb4, 0x3c, 0x20, 0xe6, 0xca, 0x3c, 0x52, 0x5e, 0x97, 0xbd, 0xe8, + 0x0f, 0x10, 0xbd, 0x01, 0xe8, 0x51, 0xbd, 0xf1, 0x2a, 0x0e, 0xbd, 0x1d, 0x03, + 0x85, 0x3a, 0x00, 0x7f, 0x50, 0x3d, 0x5a, 0x91, 0xd7, 0xbc, 0xc5, 0x55, 0x3b, + 0x3d, 0xd6, 0x47, 0x8a, 0xbd, 0x2d, 0x40, 0x80, 0x3d, 0x49, 0x84, 0xd9, 0xbb, + 0x2c, 0x7d, 0x5a, 0x3d, 0x94, 0x2d, 0xcd, 0x3c, 0x84, 0xe9, 0x90, 0xbd, 0x67, + 0xf2, 0x95, 0xbd, 0xf6, 0x29, 0x12, 0xbd, 0x7b, 0x2e, 0x64, 0x3d, 0xf5, 0x42, + 0x01, 0xbd, 0x42, 0x57, 0x2b, 0x3d, 0x0d, 0xd5, 0x99, 0xbd, 0xdf, 0xd5, 0x4b, + 0xbd, 0xc4, 0x97, 0x4a, 0xbd, 0xb1, 0xb5, 0xa0, 0x3c, 0x97, 0xa5, 0x13, 0xbb, + 0xda, 0x02, 0x11, 0x3d, 0x6e, 0x22, 0xce, 0xbb, 0x9f, 0x3e, 0xf0, 0x3c, 0x92, + 0x5d, 0xb5, 0xbc, 0xda, 0x5e, 0x45, 0x3d, 0x53, 0x93, 0x0a, 0x3d, 0xa4, 0xf0, + 0x8b, 0x3c, 0x4a, 
0x4c, 0x04, 0x3d, 0x76, 0xc7, 0x8e, 0x3c, 0x55, 0xba, 0x39, + 0x3c, 0xa5, 0xed, 0x8c, 0xbd, 0x16, 0x33, 0x80, 0xbd, 0x32, 0xd7, 0x3b, 0x3d, + 0x07, 0xe9, 0x62, 0xbd, 0x6e, 0x01, 0x76, 0x3d, 0x42, 0x8b, 0x5e, 0xbd, 0x30, + 0x56, 0x07, 0x3d, 0x2c, 0x8b, 0xdb, 0xbc, 0xaf, 0xff, 0x8f, 0xbd, 0xf3, 0x4a, + 0x5d, 0xbd, 0xb0, 0x52, 0xb7, 0x3b, 0x29, 0x47, 0x9c, 0xbc, 0x5a, 0x8d, 0x30, + 0xbd, 0x71, 0xf8, 0x07, 0x3d, 0xc0, 0x46, 0x27, 0xbd, 0x93, 0x7d, 0x89, 0xbc, + 0xd2, 0x61, 0x39, 0x3d, 0x8d, 0x18, 0x69, 0x3c, 0x43, 0xd6, 0x18, 0xbc, 0x00, + 0x37, 0x0f, 0xba, 0x68, 0x4c, 0x4a, 0x3d, 0x4a, 0x6d, 0x6c, 0xbd, 0x63, 0x4a, + 0x7c, 0xbc, 0x0e, 0xed, 0x6b, 0xbd, 0x43, 0xc3, 0x97, 0xbd, 0xd0, 0x48, 0xa4, + 0xbb, 0xb4, 0x48, 0xa0, 0x3c, 0x89, 0x3c, 0x89, 0xbd, 0x00, 0xa7, 0xb4, 0x39, + 0xe2, 0xd3, 0x5e, 0x3d, 0x19, 0x2b, 0x10, 0xbc, 0x46, 0xef, 0x9a, 0xbd, 0x1c, + 0x32, 0xac, 0x3c, 0xe2, 0x57, 0x4b, 0x3d, 0xf7, 0x44, 0x41, 0x3d, 0x84, 0x06, + 0x89, 0xbc, 0x20, 0xf0, 0xb7, 0x3b, 0x3a, 0x7b, 0x50, 0x3d, 0xc0, 0xe4, 0x59, + 0xbd, 0x06, 0x58, 0x19, 0x3d, 0x80, 0x23, 0xe1, 0x3b, 0xe2, 0xdc, 0x8c, 0xbd, + 0xdc, 0x0a, 0x84, 0x3d, 0x96, 0xfe, 0x23, 0xbb, 0x45, 0x27, 0x40, 0xbd, 0x5d, + 0xc4, 0x0f, 0x3d, 0xcc, 0xe2, 0xab, 0xbc, 0x64, 0xec, 0xf8, 0xbc, 0x5e, 0x9d, + 0x1f, 0xbd, 0xa4, 0x84, 0x16, 0xbd, 0x26, 0x34, 0x99, 0xbd, 0xeb, 0x94, 0x91, + 0x3d, 0xae, 0x2b, 0x25, 0x3d, 0x7d, 0x8a, 0x2c, 0x3d, 0x65, 0xdb, 0xa1, 0xbc, + 0xb9, 0x5c, 0x2a, 0x3d, 0xe4, 0x06, 0x1d, 0xbb, 0xb6, 0xca, 0x17, 0x3d, 0xc8, + 0xd8, 0x12, 0x3d, 0x5c, 0xf3, 0x28, 0xbd, 0x44, 0x6b, 0x85, 0xbc, 0xa0, 0x1c, + 0x05, 0x3b, 0x1e, 0x13, 0x49, 0x3d, 0xd0, 0xbc, 0x07, 0x3d, 0xe4, 0xe8, 0x33, + 0x3c, 0xe1, 0xbe, 0x4c, 0x3d, 0xcf, 0xa9, 0x0d, 0x3c, 0x52, 0x61, 0x62, 0x3d, + 0x2e, 0x19, 0x63, 0x3d, 0xbe, 0x72, 0x86, 0x3d, 0x20, 0x7b, 0x34, 0x3c, 0xa0, + 0x1b, 0x6d, 0xbb, 0xbe, 0xdf, 0xd9, 0x3a, 0x6b, 0xae, 0x4e, 0x3d, 0x3b, 0x38, + 0x7d, 0xbd, 0xa1, 0xee, 0x3b, 0x3d, 0x51, 0x91, 0x37, 0x3b, 0x26, 0x34, 0xe4, + 0xbc, 0x13, 0x50, 0x8c, 0xbd, 0x5b, 0x2d, 0x52, 0xbd, 0xb3, 0xf6, 0x5d, 0xbc, + 0x82, 0x69, 0x3f, 0xbb, 0xf3, 0x6b, 0x14, 0x3d, 0xe8, 0x54, 0x9a, 0x3c, 0x42, + 0xa5, 0x35, 0x3d, 0x99, 0x10, 0x0b, 0xbc, 0x87, 0x55, 0x2d, 0xbd, 0x1f, 0x1a, + 0x16, 0xbd, 0x99, 0xaa, 0x16, 0xbc, 0x1a, 0x04, 0x3e, 0xbd, 0x62, 0x5f, 0x12, + 0x3d, 0xea, 0x90, 0x18, 0x3d, 0x32, 0x9f, 0x17, 0x3d, 0x1c, 0x6f, 0xba, 0x3c, + 0xce, 0xe2, 0x13, 0x3d, 0x47, 0xa2, 0xdb, 0xbc, 0xf7, 0x85, 0x4f, 0xbd, 0x24, + 0x60, 0xc8, 0xbc, 0xea, 0x00, 0x5e, 0xbd, 0x08, 0x73, 0x58, 0x3d, 0xf3, 0x42, + 0x85, 0xbd, 0x0e, 0xcd, 0x91, 0xbd, 0x3c, 0xba, 0xb1, 0xbc, 0x48, 0x41, 0x01, + 0x3d, 0xb1, 0xcf, 0x64, 0x3d, 0x6f, 0x25, 0x9a, 0xbc, 0xda, 0xaa, 0xce, 0x3c, + 0x22, 0x5f, 0x62, 0x3d, 0xf9, 0x36, 0x9b, 0xbd, 0x85, 0x6f, 0x81, 0x3d, 0x22, + 0xd8, 0x2e, 0xbd, 0x72, 0x49, 0x19, 0xbd, 0x21, 0x3c, 0xb9, 0xba, 0xc5, 0x69, + 0x8a, 0xbd, 0x68, 0xec, 0x08, 0xbd, 0xd9, 0x7e, 0x06, 0xbd, 0x0e, 0xa4, 0x36, + 0x3d, 0x9e, 0xbb, 0x65, 0xbd, 0xaf, 0x04, 0x81, 0x3d, 0x07, 0xa0, 0x7b, 0xbd, + 0xa7, 0x30, 0x51, 0xbd, 0x15, 0x8e, 0x05, 0x3c, 0xe0, 0x7a, 0x7c, 0x3c, 0x43, + 0x90, 0x04, 0x3d, 0x00, 0xf1, 0x4b, 0xbb, 0xe0, 0xe9, 0x29, 0x3b, 0x6f, 0x91, + 0x1d, 0xbd, 0xff, 0xc5, 0xd0, 0x3c, 0x6b, 0x02, 0xe3, 0x3c, 0xba, 0x1f, 0x53, + 0xbc, 0x0e, 0xd5, 0x7e, 0x3d, 0x54, 0xe0, 0x97, 0xbc, 0x00, 0x7a, 0xf2, 0xb9, + 0x66, 0x00, 0x84, 0x3d, 0x62, 0x17, 0x08, 0xbd, 0x5a, 0x30, 0x46, 0x3d, 0x75, + 0xb1, 0x37, 0xbd, 0x6f, 0x28, 0x55, 0x3c, 0xe0, 0xc4, 0x82, 0xbd, 0xfc, 0xf5, + 0xb2, 0xbc, 0x96, 0xdc, 0x0a, 0xbb, 0x83, 0x2a, 
0x91, 0x3c, 0x29, 0x21, 0x40, + 0x3d, 0xff, 0x1f, 0x9c, 0xbd, 0x82, 0xb2, 0x5d, 0x3d, 0x8e, 0x14, 0x2c, 0x3d, + 0xec, 0xb2, 0xed, 0xbc, 0xb8, 0xa0, 0x3a, 0xbc, 0x66, 0x70, 0x11, 0xbc, 0x49, + 0xa6, 0xd0, 0xbc, 0x55, 0x34, 0x14, 0xbc, 0xb4, 0x65, 0x80, 0x3d, 0x76, 0x98, + 0x87, 0xbd, 0x23, 0x3d, 0xa2, 0x3c, 0xaa, 0xc5, 0x7e, 0x3d, 0xb7, 0x41, 0x91, + 0xbd, 0x9f, 0xe6, 0x80, 0xbd, 0x20, 0x0a, 0x13, 0x3c, 0xc8, 0xa0, 0xf3, 0x3c, + 0x51, 0xf3, 0x04, 0x3d, 0x61, 0x7e, 0x0c, 0x3d, 0xbe, 0x25, 0x47, 0x3d, 0x25, + 0x2b, 0x2b, 0x3d, 0xa9, 0x7a, 0x3f, 0xbd, 0xc2, 0xd4, 0xe3, 0xbc, 0x67, 0xc5, + 0x79, 0x3d, 0x10, 0x4b, 0xb0, 0x3c, 0xb8, 0xd1, 0x87, 0x3c, 0xd3, 0x7b, 0x54, + 0xbd, 0x81, 0x81, 0xcc, 0x3c, 0x85, 0x81, 0x15, 0x3d, 0xaa, 0xa8, 0xb0, 0x3b, + 0x4b, 0x90, 0xae, 0x3c, 0xaa, 0x38, 0x0f, 0x3d, 0x92, 0x82, 0x0a, 0xbd, 0xfd, + 0x99, 0x51, 0x3d, 0x90, 0x87, 0x0b, 0xbd, 0xc6, 0x71, 0x58, 0xbd, 0x4f, 0x17, + 0x86, 0x38, 0x03, 0x9a, 0x00, 0xbd, 0xeb, 0xae, 0x34, 0xbd, 0xab, 0x28, 0x19, + 0x3b, 0xc5, 0x48, 0x6c, 0xbd, 0x4a, 0xa3, 0x7c, 0xbd, 0x1f, 0xe7, 0x00, 0x3c, + 0xf4, 0xd8, 0xd8, 0x3c, 0xbc, 0x01, 0x59, 0xbd, 0xa9, 0x77, 0xb5, 0xbb, 0x67, + 0xc3, 0x82, 0x3d, 0x37, 0xd8, 0x8c, 0x3d, 0xea, 0x92, 0x59, 0x3d, 0x30, 0x97, + 0x31, 0x3d, 0x36, 0xb9, 0x23, 0xbb, 0x98, 0x99, 0x7f, 0xbd, 0x0b, 0xfd, 0x8e, + 0xbc, 0x80, 0xc6, 0x5c, 0xbd, 0xb2, 0xf0, 0x76, 0x3d, 0x7e, 0x01, 0xe5, 0xbc, + 0x0a, 0x94, 0x08, 0x3d, 0xb2, 0x9b, 0x7b, 0xbd, 0xdc, 0x27, 0x6b, 0xbd, 0x32, + 0x1e, 0x41, 0x3d, 0x4b, 0xd8, 0x8a, 0xbd, 0xe6, 0xdc, 0xd5, 0x3c, 0x72, 0xfd, + 0x09, 0xbd, 0x33, 0x80, 0xc5, 0xba, 0xbc, 0xdd, 0xc0, 0x3b, 0xf4, 0x31, 0x9a, + 0xbd, 0x29, 0x45, 0xd9, 0x3c, 0x02, 0x33, 0xd8, 0xbc, 0x97, 0x48, 0x73, 0x3d, + 0x7f, 0x13, 0x88, 0xbd, 0x9b, 0xed, 0x40, 0xbd, 0xae, 0x86, 0x7d, 0xbd, 0xea, + 0xa5, 0x4a, 0x3b, 0x8d, 0xd4, 0xd8, 0x3c, 0x57, 0xc1, 0x28, 0xbc, 0x6a, 0xb8, + 0x15, 0x3d, 0x30, 0xb0, 0xdc, 0xbb, 0x71, 0x34, 0x05, 0xbd, 0x39, 0x9c, 0x8a, + 0x3d, 0x98, 0xdd, 0x45, 0xbc, 0xf1, 0xcc, 0xcb, 0xbc, 0xe1, 0xf6, 0xd8, 0x3c, + 0xae, 0xb9, 0x18, 0xbb, 0x67, 0x50, 0x82, 0x3d, 0x20, 0x71, 0x82, 0x3d, 0x0e, + 0x45, 0x4a, 0xbd, 0x30, 0x86, 0xbe, 0xbb, 0x60, 0xc7, 0x07, 0x3d, 0xdb, 0xf7, + 0x04, 0xbd, 0x9a, 0xc3, 0xb2, 0xbc, 0xe0, 0x58, 0xf5, 0xbc, 0x12, 0x0a, 0x48, + 0x3d, 0xf7, 0x85, 0x2e, 0x3d, 0xab, 0x2b, 0xe6, 0x3b, 0xed, 0x4c, 0x15, 0xbc, + 0x99, 0x4b, 0xb1, 0xbc, 0xa1, 0x82, 0x09, 0x3d, 0x8b, 0x84, 0x09, 0xbd, 0x85, + 0x5a, 0x38, 0xbb, 0x83, 0xc7, 0x80, 0xbd, 0xfe, 0xf3, 0x67, 0xbd, 0x6e, 0x25, + 0x6f, 0x3d, 0x00, 0xa4, 0xf8, 0xbc, 0x3a, 0x24, 0x17, 0xbc, 0xb2, 0x0d, 0x8a, + 0x3c, 0x87, 0xac, 0x69, 0x3d, 0xcd, 0x5f, 0x89, 0xbc, 0x9e, 0x08, 0x7d, 0xbd, + 0x4c, 0xa4, 0xa0, 0xbc, 0x63, 0x21, 0x2c, 0x3d, 0x5a, 0x78, 0x71, 0xbd, 0xa2, + 0xe8, 0x71, 0x3d, 0x2b, 0xc9, 0xc1, 0xbb, 0x6f, 0x4f, 0x78, 0xbd, 0xa9, 0xee, + 0xdf, 0x3c, 0x3c, 0xe2, 0xb3, 0xbc, 0x64, 0xa2, 0x7d, 0xbc, 0xcc, 0x2c, 0x35, + 0x3d, 0xfd, 0x8c, 0x86, 0x3d, 0xe9, 0x57, 0xf3, 0x3c, 0xc1, 0x84, 0x82, 0x3d, + 0x8e, 0x7a, 0x6c, 0xbd, 0xf1, 0x40, 0x04, 0x3d, 0x7e, 0x17, 0x5b, 0x3d, 0x74, + 0xba, 0x83, 0x3a, 0x6f, 0x01, 0x86, 0xbd, 0x62, 0x58, 0x69, 0xbd, 0x33, 0xcd, + 0x07, 0x3d, 0x6e, 0xc5, 0x8c, 0xbd, 0x5a, 0x4c, 0x99, 0x3c, 0x87, 0xb8, 0xf0, + 0x3c, 0xc1, 0x64, 0x8a, 0x3c, 0x4c, 0x69, 0x23, 0xbd, 0x93, 0x75, 0x80, 0x3d, + 0x54, 0x27, 0x87, 0xbd, 0xdc, 0x3e, 0x62, 0x3d, 0x9e, 0xdb, 0x43, 0xbc, 0x03, + 0xd4, 0x65, 0xbd, 0x4c, 0xb6, 0x59, 0x3d, 0xc4, 0xa1, 0xe8, 0xbc, 0xf3, 0xdc, + 0x87, 0x3d, 0xf5, 0x34, 0x82, 0xbc, 0x4e, 0x2d, 0xe2, 0x3b, 0xd6, 0x1e, 0x3d, + 
0xbd, 0xea, 0x0c, 0x83, 0x3d, 0x34, 0x3e, 0x20, 0xbd, 0xb6, 0x87, 0x77, 0x3c, + 0x9c, 0x9a, 0xe4, 0xba, 0x48, 0x21, 0xa5, 0xbc, 0xb3, 0x81, 0x89, 0x3d, 0xf4, + 0x2c, 0x49, 0x3d, 0x98, 0xb5, 0xd6, 0xbc, 0x88, 0xdb, 0x30, 0xbd, 0xa4, 0x2f, + 0x88, 0xbc, 0x67, 0xc1, 0xb6, 0xbc, 0x8e, 0xba, 0xb8, 0xbc, 0xdd, 0x22, 0xc2, + 0x3c, 0xaf, 0x08, 0x8f, 0x3b, 0xa5, 0x85, 0xcb, 0xbc, 0x26, 0x24, 0x2c, 0x3d, + 0x2c, 0x73, 0x35, 0x3c, 0xf9, 0xb2, 0xaf, 0xbb, 0xf2, 0x50, 0x2f, 0xbd, 0x15, + 0x10, 0x31, 0x3c, 0x75, 0xdb, 0x67, 0x3d, 0x5c, 0xe2, 0xfe, 0x3c, 0x51, 0xe0, + 0x8d, 0x3d, 0x1c, 0x25, 0xb9, 0x3c, 0xcf, 0x20, 0x80, 0x3d, 0x5c, 0x61, 0xdf, + 0x3c, 0x9a, 0x2e, 0x5d, 0x3d, 0x4d, 0x63, 0xd8, 0x3c, 0x23, 0x0e, 0x32, 0xbc, + 0x6a, 0xaa, 0x61, 0x3d, 0xa3, 0x74, 0x86, 0xbd, 0x60, 0x32, 0x73, 0x3b, 0xe3, + 0x8b, 0x73, 0xbc, 0x6d, 0x26, 0x40, 0x3d, 0x8c, 0xbb, 0xbf, 0xbb, 0x4f, 0x89, + 0xf9, 0x3c, 0x6a, 0xfe, 0x0b, 0x3d, 0x43, 0x89, 0x3f, 0xbd, 0xe6, 0x1f, 0xda, + 0xbc, 0xdf, 0x48, 0x36, 0xbd, 0xd8, 0x5a, 0x8f, 0xbd, 0x58, 0x20, 0xfc, 0x3c, + 0xec, 0xc0, 0x69, 0x3d, 0xc9, 0x17, 0x06, 0xbd, 0xc1, 0x2b, 0xd9, 0x3b, 0xba, + 0x7f, 0x73, 0x3a, 0xde, 0xd4, 0xbd, 0xbc, 0x9f, 0x94, 0xd6, 0x3c, 0xfe, 0xb3, + 0x56, 0x3c, 0xbd, 0xda, 0xd0, 0xbc, 0x9c, 0x13, 0x6c, 0xbc, 0x10, 0x12, 0xab, + 0x3c, 0x94, 0x9f, 0x1d, 0xbd, 0x78, 0xbb, 0x9d, 0x3c, 0x6c, 0xca, 0x00, 0xbd, + 0x4c, 0xb7, 0xb8, 0x3c, 0x09, 0x38, 0xd3, 0x3c, 0x4c, 0x70, 0x91, 0x3c, 0xe9, + 0x6b, 0x26, 0xbc, 0x57, 0x19, 0xa4, 0x3c, 0xd2, 0xf7, 0x54, 0x3d, 0x0f, 0x9a, + 0x48, 0x3d, 0xd0, 0xe2, 0x8f, 0x3b, 0x58, 0x63, 0x13, 0x3c, 0x81, 0xda, 0x1b, + 0xbd, 0x77, 0x24, 0x83, 0x3c, 0xd7, 0x64, 0xc7, 0x3b, 0xb0, 0xf6, 0x6b, 0xbc, + 0x8a, 0xaa, 0x62, 0x3d, 0xa4, 0x13, 0xbb, 0xbc, 0xe8, 0x06, 0xb3, 0x3c, 0xb1, + 0x41, 0x77, 0x3d, 0x1c, 0xac, 0xe0, 0x3c, 0x40, 0x0f, 0x25, 0x3c, 0x89, 0xc0, + 0x54, 0x3c, 0xec, 0x1d, 0x7a, 0x3d, 0x41, 0x1e, 0x31, 0x3d, 0x51, 0x3e, 0x26, + 0x3d, 0x00, 0x55, 0x39, 0xbd, 0x2e, 0x9d, 0x7f, 0x3d, 0x2f, 0xe9, 0x4d, 0xbd, + 0x46, 0x85, 0x35, 0xbd, 0xa2, 0x67, 0xf8, 0x3c, 0x16, 0x0f, 0x82, 0xbd, 0xcd, + 0x48, 0x9a, 0x3b, 0x62, 0xd9, 0x08, 0x3d, 0x67, 0x0f, 0x5a, 0xbc, 0xd0, 0x09, + 0x56, 0xbc, 0x31, 0x38, 0xda, 0xbc, 0x67, 0xf7, 0xa1, 0xbc, 0x8c, 0x2a, 0x79, + 0xbd, 0xb3, 0xf5, 0xb1, 0xbc, 0xe8, 0xf4, 0x8b, 0xbd, 0x5f, 0x45, 0x11, 0xbd, + 0x9f, 0x79, 0x1e, 0xbd, 0xf5, 0xbf, 0x86, 0x3d, 0x4e, 0xd8, 0xed, 0xbc, 0xcd, + 0x66, 0x5b, 0x3c, 0x4a, 0x74, 0x8f, 0x3b, 0xe3, 0x98, 0x4f, 0x3d, 0x0d, 0x54, + 0x91, 0xbb, 0x24, 0xb6, 0x1b, 0x3d, 0xd8, 0x0d, 0xb7, 0xbc, 0x04, 0x76, 0x31, + 0xbd, 0x10, 0x43, 0x11, 0xbd, 0x0e, 0xc2, 0x02, 0xbd, 0x88, 0x66, 0x43, 0x3c, + 0xb5, 0xda, 0x95, 0xbb, 0x07, 0x09, 0x28, 0xbd, 0x22, 0xcc, 0x19, 0xbd, 0xf0, + 0x47, 0xfe, 0x3c, 0x10, 0x43, 0xfb, 0xbc, 0x5f, 0x5f, 0x2c, 0x3d, 0xfb, 0xce, + 0x18, 0xbc, 0xcd, 0x87, 0x6a, 0x3d, 0xee, 0xf6, 0x61, 0xbd, 0x37, 0x86, 0x12, + 0x3d, 0x4c, 0x01, 0xb7, 0x3c, 0x8c, 0x44, 0x19, 0xbd, 0xc1, 0x3d, 0xa6, 0x3c, + 0xcd, 0xf1, 0x5e, 0xbb, 0x9e, 0xe0, 0x41, 0x3d, 0x8c, 0xfb, 0x95, 0xbd, 0xa7, + 0x04, 0xc1, 0xbb, 0xcc, 0xf0, 0x25, 0xbd, 0x1c, 0x72, 0x81, 0x3c, 0x76, 0xf2, + 0x6d, 0x3d, 0x3b, 0xf9, 0x86, 0x3d, 0xc2, 0xbe, 0x4a, 0x3d, 0x5d, 0x80, 0x5a, + 0xbd, 0x63, 0x28, 0x3b, 0xbd, 0xb4, 0xb7, 0x5e, 0x3d, 0x04, 0x5b, 0x57, 0x3d, + 0x64, 0xac, 0x56, 0xbd, 0xb6, 0x67, 0x35, 0xbd, 0xb1, 0xc7, 0x0b, 0x3d, 0x0c, + 0xae, 0x2d, 0x3d, 0xcc, 0x4c, 0x7d, 0xbc, 0x2f, 0x01, 0x34, 0x3d, 0xa8, 0x4e, + 0x63, 0x3d, 0xa3, 0xad, 0xb8, 0xbc, 0x32, 0x0c, 0x25, 0xbd, 0x66, 0x15, 0xab, + 0xbc, 0x8a, 0x1a, 0x10, 0x3d, 
0xca, 0xcb, 0x46, 0x3d, 0x4a, 0xe5, 0xfe, 0x3c, + 0x4a, 0xcc, 0xa6, 0x3c, 0x2e, 0x05, 0x4f, 0xbb, 0x31, 0xef, 0x62, 0xbc, 0xa0, + 0xeb, 0x7c, 0xbd, 0x49, 0x9b, 0x13, 0x3d, 0x07, 0x55, 0x82, 0x3d, 0xca, 0x81, + 0x1d, 0xbd, 0x67, 0xc0, 0x52, 0x3b, 0xae, 0xd6, 0x0d, 0x3d, 0x53, 0x79, 0x70, + 0xbd, 0x9c, 0x93, 0xa8, 0xbc, 0x5b, 0xbb, 0x58, 0x3d, 0x73, 0x1d, 0x0b, 0xbd, + 0xe8, 0xe9, 0x0f, 0x3d, 0x3b, 0xda, 0xbd, 0xbb, 0x66, 0x91, 0x80, 0x3d, 0x46, + 0xcc, 0xe8, 0xbc, 0x86, 0xe3, 0x32, 0x3d, 0x37, 0x9f, 0x5f, 0xbc, 0x9a, 0x06, + 0x19, 0xbd, 0xec, 0xb6, 0x78, 0xbd, 0xd9, 0xd5, 0x49, 0xbd, 0xe8, 0xf9, 0x59, + 0x3c, 0x48, 0x30, 0x8c, 0x3c, 0x03, 0x1d, 0x8a, 0x3d, 0x4d, 0x47, 0xc6, 0x3c, + 0x77, 0x88, 0x9d, 0xbd, 0x3e, 0xf0, 0x63, 0xbd, 0x83, 0x92, 0x2b, 0xbd, 0x9a, + 0xb0, 0x05, 0x3d, 0xee, 0x10, 0x86, 0x3c, 0xf1, 0xb2, 0x92, 0xbd, 0x2a, 0x0e, + 0x3f, 0xbd, 0x6c, 0xfc, 0xbb, 0xbb, 0x62, 0xee, 0x16, 0x3a, 0xf8, 0xdb, 0xa1, + 0x3c, 0x1c, 0xce, 0x43, 0xbd, 0xd3, 0xbf, 0x64, 0xbd, 0xe6, 0xb9, 0xc4, 0x3c, + 0x43, 0x6b, 0x63, 0x3c, 0xe8, 0xbd, 0x87, 0x3c, 0x95, 0x2d, 0x29, 0x3d, 0x10, + 0xbd, 0x7a, 0xbc, 0x26, 0xe3, 0x8e, 0xbd, 0xa1, 0x64, 0x70, 0xbd, 0xf7, 0x22, + 0x8f, 0x3d, 0x68, 0x73, 0x95, 0xbc, 0x33, 0x1c, 0xdb, 0xbc, 0x95, 0x44, 0x11, + 0x3d, 0xc5, 0x6c, 0x86, 0xbd, 0xf8, 0x9b, 0x8a, 0xbd, 0x48, 0xba, 0x13, 0x3c, + 0x6a, 0x54, 0x28, 0xbd, 0xd0, 0xaa, 0x15, 0xbd, 0x32, 0x4e, 0x56, 0x3d, 0x8e, + 0x65, 0x4b, 0x3d, 0x62, 0x4d, 0x76, 0xbc, 0x65, 0x5f, 0x05, 0x3d, 0x40, 0xb5, + 0xb5, 0xbb, 0x1a, 0xd6, 0x83, 0x3d, 0x9d, 0xea, 0xa7, 0x3b, 0x73, 0x19, 0x59, + 0x3c, 0xb2, 0x83, 0x25, 0xbd, 0x38, 0x93, 0x9e, 0x3c, 0x95, 0xe2, 0x7a, 0x3c, + 0xc6, 0x09, 0x95, 0xbd, 0xfe, 0x8a, 0x84, 0x3d, 0x09, 0x99, 0x8c, 0x3d, 0x3d, + 0xb5, 0x0e, 0xbd, 0x1e, 0x91, 0x8c, 0xbd, 0xc1, 0x52, 0xce, 0x3c, 0xc2, 0xa5, + 0x88, 0xbd, 0x9c, 0x3f, 0x97, 0xbd, 0x79, 0x5b, 0xd3, 0x3c, 0x20, 0xf6, 0xfd, + 0x3c, 0xcf, 0x37, 0x5f, 0x3c, 0x41, 0xc8, 0x6e, 0xbd, 0xa4, 0xde, 0xf8, 0x3c, + 0xe6, 0x88, 0x19, 0xbc, 0xe3, 0x00, 0x01, 0x3d, 0xa7, 0x4e, 0x1e, 0xbd, 0xb8, + 0xa1, 0x65, 0xbd, 0xbf, 0xfd, 0x81, 0xbd, 0xf0, 0x80, 0xe8, 0xbb, 0x3c, 0x62, + 0xdc, 0x3c, 0x02, 0x96, 0x70, 0x3d, 0x05, 0x55, 0x7d, 0xbd, 0x66, 0xb3, 0x15, + 0x3d, 0xa7, 0x8e, 0x16, 0xbd, 0xf5, 0xcf, 0x06, 0x3d, 0x5b, 0x78, 0xdf, 0xbc, + 0x54, 0xcc, 0x2c, 0xbd, 0xdc, 0x15, 0xc6, 0xbc, 0xeb, 0xaf, 0x87, 0x3d, 0x3b, + 0x65, 0x95, 0xbd, 0x52, 0x02, 0x65, 0x3d, 0x0a, 0x99, 0x0a, 0xbc, 0x6a, 0xfd, + 0x67, 0x3d, 0x00, 0x53, 0x3e, 0xbd, 0xa0, 0xbe, 0xe4, 0xbc, 0xaa, 0x76, 0xf4, + 0x3c, 0xd9, 0x22, 0x3c, 0xbd, 0x28, 0xa2, 0x3b, 0x3b, 0x44, 0x27, 0x7e, 0xbd, + 0xb3, 0xd4, 0xa8, 0x3c, 0xb3, 0x30, 0x29, 0x3b, 0xd0, 0x0f, 0x3b, 0x3b, 0x74, + 0x3e, 0x8a, 0xbd, 0x2f, 0x61, 0x1f, 0xbd, 0x58, 0x65, 0x4a, 0xbd, 0xd7, 0xb7, + 0xf8, 0xbc, 0xfd, 0x91, 0x25, 0xbd, 0xfd, 0xd2, 0x39, 0xbd, 0x49, 0xa6, 0x82, + 0x3d, 0xd8, 0x60, 0x04, 0x3d, 0xf8, 0x76, 0xac, 0x3c, 0x18, 0x61, 0x2d, 0xbc, + 0xd6, 0xf2, 0x0b, 0xbd, 0x18, 0x53, 0x01, 0x3c, 0xac, 0x10, 0xb7, 0x3c, 0x22, + 0xab, 0xd0, 0xbc, 0x40, 0x50, 0x3b, 0x3a, 0xf4, 0x70, 0x44, 0xbd, 0xb8, 0xaa, + 0x81, 0xbd, 0x09, 0x70, 0x8f, 0x3c, 0x51, 0x00, 0xc5, 0xbc, 0x41, 0x17, 0xb8, + 0xbc, 0xd2, 0xe1, 0x07, 0xbd, 0x58, 0xa0, 0x95, 0xbd, 0x7d, 0x24, 0x4b, 0xbd, + 0x47, 0x50, 0x5f, 0x3d, 0x4a, 0x41, 0x1e, 0x3d, 0xc1, 0x38, 0x21, 0xbd, 0xbd, + 0x82, 0x13, 0x3d, 0xdb, 0xe8, 0x4d, 0xbd, 0x76, 0x8d, 0x1d, 0xbc, 0x96, 0x2f, + 0x72, 0x3d, 0xa9, 0x4c, 0x56, 0xbd, 0xe3, 0x39, 0x79, 0x3d, 0xf2, 0xaa, 0x0e, + 0x3d, 0xee, 0xfa, 0x27, 0x3d, 0x70, 0x0c, 0x24, 0x3c, 0x3c, 
0xf8, 0x7e, 0xbd, + 0xc2, 0x3b, 0x55, 0xbb, 0x83, 0x9c, 0xcc, 0x3b, 0x52, 0x0f, 0x5d, 0x3d, 0x86, + 0x3f, 0x3a, 0xbc, 0xf0, 0xbb, 0xbc, 0xbb, 0xe0, 0xff, 0xaf, 0x3c, 0x12, 0xca, + 0x22, 0x3c, 0xd4, 0x78, 0x41, 0xbc, 0xc9, 0xaa, 0x1f, 0xbd, 0x7c, 0x59, 0x9e, + 0x3a, 0x1a, 0x15, 0x4d, 0xbc, 0x25, 0x53, 0xfa, 0xbc, 0x6e, 0xbb, 0x82, 0xbc, + 0xc2, 0x7d, 0x8d, 0x3c, 0xa8, 0x73, 0x19, 0xbd, 0x04, 0x34, 0x4c, 0xbc, 0xbb, + 0x37, 0x5e, 0x3d, 0xb8, 0xc0, 0x30, 0x3d, 0xac, 0x71, 0x9d, 0xbd, 0xf8, 0x58, + 0x2a, 0x3b, 0xd0, 0x94, 0xa4, 0x3b, 0xeb, 0x76, 0x5a, 0xbc, 0xcf, 0x43, 0x94, + 0x3c, 0x48, 0x10, 0x66, 0x3d, 0x35, 0xee, 0x78, 0xbc, 0x29, 0x9a, 0x64, 0x3c, + 0x39, 0x2a, 0x27, 0x3d, 0xab, 0x94, 0x8a, 0x3d, 0xb2, 0x3c, 0x0f, 0xbd, 0x76, + 0x7f, 0x46, 0xbd, 0x68, 0xb2, 0x96, 0xbc, 0x98, 0xa2, 0x61, 0x3d, 0x97, 0x72, + 0x92, 0xbd, 0xde, 0xac, 0x51, 0xbd, 0x03, 0xb8, 0x74, 0x3d, 0xb5, 0x3b, 0x8a, + 0xbc, 0x70, 0xbf, 0x42, 0xbd, 0xf0, 0x0f, 0xf9, 0x3b, 0xb6, 0x4d, 0xc5, 0x3c, + 0x16, 0xeb, 0x72, 0x3d, 0x90, 0x81, 0xcd, 0xbb, 0x00, 0x8b, 0x0b, 0xbc, 0xb1, + 0x02, 0xa5, 0x3c, 0xee, 0xa7, 0x7d, 0xbd, 0xf0, 0x26, 0x0e, 0xbd, 0x1c, 0xb0, + 0x52, 0xbd, 0x80, 0xdd, 0x2f, 0xbd, 0x43, 0xbb, 0xeb, 0xbc, 0xf9, 0xa6, 0xd1, + 0xbc, 0xb1, 0x67, 0x29, 0xbd, 0xaa, 0xee, 0xf4, 0x3b, 0xc4, 0xab, 0x59, 0xbd, + 0xb8, 0x83, 0x36, 0x3d, 0x20, 0xfc, 0x60, 0x3b, 0x28, 0xdd, 0x59, 0xbd, 0x5c, + 0x16, 0xd1, 0xbc, 0x00, 0xbc, 0xcb, 0xbc, 0x9f, 0x8e, 0x62, 0xbc, 0x8e, 0xde, + 0x53, 0xbd, 0xec, 0x4f, 0x26, 0x3d, 0xde, 0x94, 0x46, 0xbd, 0x50, 0x30, 0x0e, + 0x3c, 0x20, 0xef, 0x7b, 0xbd, 0x83, 0x86, 0x38, 0x3c, 0x5a, 0xff, 0x1f, 0xbd, + 0x61, 0x3e, 0xd5, 0xbc, 0x0b, 0xac, 0x65, 0x3c, 0xfd, 0x06, 0xa5, 0x3c, 0x2c, + 0x94, 0x47, 0xbd, 0xe2, 0xc3, 0x7e, 0x3d, 0x40, 0xac, 0x67, 0x3d, 0xa4, 0x7a, + 0x77, 0xbc, 0xfc, 0x13, 0xe7, 0x3c, 0x56, 0x69, 0x80, 0x3d, 0x27, 0x58, 0x18, + 0x3d, 0x1e, 0x95, 0x0e, 0x3d, 0x3f, 0xa8, 0x41, 0x3d, 0x0f, 0xbb, 0x16, 0xbd, + 0x45, 0x72, 0x89, 0xbd, 0xf1, 0xd2, 0xfb, 0x3c, 0x8f, 0x6b, 0x65, 0x3d, 0x50, + 0x8a, 0x05, 0x3c, 0x99, 0x24, 0x90, 0xbd, 0xc8, 0x4d, 0x4f, 0x3d, 0x80, 0xb8, + 0xd2, 0x3b, 0xe5, 0x51, 0xae, 0x3b, 0x25, 0x33, 0x2a, 0xbd, 0x05, 0x12, 0xd7, + 0x3c, 0xc2, 0x1b, 0x33, 0x3c, 0x5f, 0x8d, 0x07, 0xbc, 0x79, 0x60, 0x26, 0x3d, + 0xf7, 0x63, 0x83, 0x3d, 0x88, 0xb4, 0xc7, 0xbc, 0x40, 0x5d, 0xb0, 0xba, 0x6e, + 0xaf, 0x39, 0xbd, 0x50, 0x93, 0xf3, 0x3c, 0xc4, 0x3b, 0x53, 0x3c, 0xf9, 0x8b, + 0x60, 0xbd, 0x74, 0x4e, 0xbd, 0x3c, 0x40, 0xe6, 0xdd, 0x3c, 0x30, 0x78, 0x18, + 0x3d, 0xaa, 0xed, 0x76, 0x3d, 0xd7, 0x20, 0x4b, 0x3d, 0x30, 0x08, 0xd1, 0x3c, + 0x52, 0xf0, 0x61, 0x3d, 0x75, 0xea, 0x6a, 0x3d, 0x93, 0xef, 0xeb, 0x3c, 0x35, + 0xad, 0x96, 0xbd, 0xca, 0x41, 0x21, 0x3d, 0x59, 0x18, 0x1e, 0x3d, 0x2c, 0xa8, + 0x81, 0xbd, 0x7e, 0xdb, 0xd7, 0x3c, 0xfc, 0x7e, 0x1b, 0xbd, 0x26, 0x25, 0x86, + 0x3d, 0xa9, 0x58, 0x9b, 0xbd, 0x0a, 0xef, 0xfa, 0xbc, 0xfe, 0x74, 0x74, 0x3d, + 0xb0, 0x51, 0x80, 0xbd, 0x29, 0x42, 0x88, 0x3a, 0x56, 0xe7, 0x8c, 0xbb, 0x16, + 0x5f, 0x43, 0x3d, 0x5b, 0x1d, 0x4c, 0x3c, 0xae, 0x9d, 0xbd, 0xbb, 0xbc, 0xcf, + 0x44, 0xbc, 0x78, 0x8d, 0x6c, 0x3d, 0x30, 0x99, 0x2c, 0x3d, 0x52, 0x17, 0x9e, + 0xbc, 0x3d, 0x52, 0x18, 0xbd, 0xfa, 0xcc, 0xb4, 0x3c, 0x9d, 0x56, 0x8d, 0x3d, + 0x7e, 0xa0, 0x18, 0x3d, 0x88, 0x7b, 0x94, 0xbd, 0xe8, 0x02, 0xc7, 0xbc, 0x08, + 0x22, 0x37, 0x3c, 0x18, 0x3b, 0x5d, 0xbd, 0xa4, 0xbb, 0xb4, 0x3c, 0xb0, 0x8d, + 0x06, 0x3d, 0xe8, 0xf4, 0xb0, 0xbb, 0xb4, 0x8b, 0x31, 0xbc, 0xf8, 0xdf, 0xf4, + 0x3c, 0x29, 0x19, 0x80, 0xbb, 0x29, 0x4c, 0x60, 0x3c, 0x4b, 0x11, 0x93, 0xbd, + 0x4b, 0xbd, 
0x66, 0xbd, 0x62, 0x8e, 0x88, 0x3c, 0xfe, 0xa2, 0x37, 0x3d, 0x41, + 0xe1, 0x36, 0xbd, 0xbe, 0x7b, 0xc1, 0x3b, 0x6c, 0xff, 0xba, 0x3c, 0x8f, 0xae, + 0xab, 0xbc, 0x7b, 0x37, 0xd5, 0xbc, 0x0d, 0xac, 0x18, 0xbd, 0xf2, 0xcb, 0x1d, + 0x3d, 0xbb, 0xb0, 0x30, 0x3c, 0xbb, 0x1a, 0x41, 0x3b, 0x5b, 0x36, 0x11, 0xbd, + 0x96, 0xb3, 0x86, 0x3d, 0x0b, 0xcb, 0xf9, 0x3c, 0x5c, 0x23, 0x60, 0xbc, 0x62, + 0xe1, 0x33, 0xbd, 0x10, 0x91, 0x5e, 0x3d, 0xdf, 0xc8, 0x6c, 0xbd, 0xe7, 0x19, + 0x60, 0x3d, 0x87, 0xa0, 0x5b, 0x3c, 0x8a, 0xc5, 0x65, 0x3d, 0x6c, 0x2e, 0x31, + 0x3d, 0x99, 0xc7, 0x1a, 0x3d, 0xe8, 0xe6, 0x6f, 0x3c, 0x10, 0x95, 0xd9, 0x3b, + 0x1d, 0xdd, 0x19, 0xbd, 0xdc, 0xfe, 0x32, 0x3d, 0x83, 0x85, 0x05, 0x3d, 0xd8, + 0x24, 0x16, 0x3d, 0xf7, 0x73, 0x20, 0xbd, 0x77, 0x07, 0xc4, 0x3c, 0xdf, 0xd0, + 0x92, 0x3c, 0x1a, 0x7d, 0x2c, 0xba, 0xb0, 0x19, 0xe8, 0xbc, 0x9e, 0x97, 0xec, + 0xbb, 0x33, 0xb2, 0xb1, 0x3c, 0x89, 0xde, 0x81, 0xbd, 0x9d, 0xae, 0x57, 0xbc, + 0x31, 0xd9, 0xbb, 0x3c, 0xa0, 0x2d, 0x27, 0x3d, 0x00, 0x99, 0x43, 0x3c, 0x2e, + 0x32, 0x9d, 0xbc, 0xa2, 0x6d, 0x81, 0x3d, 0x38, 0xce, 0xc3, 0xbc, 0x8e, 0xd7, + 0x7a, 0x3d, 0x2a, 0x89, 0x00, 0xbc, 0x2e, 0x52, 0x9f, 0xbc, 0x20, 0x47, 0x4d, + 0xbd, 0xd9, 0x79, 0x5f, 0x3d, 0x09, 0x2c, 0x97, 0x3c, 0x9c, 0x28, 0x5f, 0x3b, + 0x9d, 0xd3, 0x65, 0x3d, 0x44, 0x63, 0xbb, 0xbc, 0x0c, 0xfe, 0xc0, 0x3c, 0x71, + 0xfa, 0x08, 0xbd, 0x40, 0x4a, 0xac, 0x3b, 0xca, 0x9d, 0x7a, 0x3d, 0xbd, 0x1c, + 0x52, 0xbd, 0xc8, 0x90, 0x0e, 0x3d, 0x6b, 0x89, 0xbd, 0xbc, 0xa0, 0x74, 0x77, + 0x3c, 0x8a, 0xe4, 0x44, 0xbd, 0x5f, 0x81, 0x56, 0x3c, 0x39, 0x9a, 0xc9, 0xbc, + 0x33, 0xf4, 0x07, 0xbd, 0x48, 0xe0, 0x94, 0xbd, 0x3f, 0xfc, 0xdf, 0xbc, 0x41, + 0x3e, 0xa9, 0x3c, 0x18, 0x06, 0x0e, 0x3c, 0xfb, 0xb9, 0xe2, 0x3c, 0x12, 0x14, + 0x26, 0xbc, 0x8b, 0x15, 0x97, 0xbd, 0x43, 0xc8, 0x23, 0xbd, 0x8e, 0x30, 0xf7, + 0x3a, 0x4c, 0xdc, 0x4f, 0xbd, 0x52, 0x50, 0x3c, 0xbc, 0xda, 0x70, 0x1b, 0x3d, + 0xfc, 0xbc, 0x3a, 0x3d, 0x76, 0x5a, 0x39, 0xbd, 0x48, 0xc3, 0x50, 0x3d, 0xf9, + 0xd3, 0x81, 0xbd, 0x1e, 0xdf, 0x09, 0xbd, 0xd3, 0xa3, 0x7a, 0x3d, 0x71, 0x42, + 0x6b, 0xbd, 0x7e, 0x3a, 0x4e, 0x3d, 0xd0, 0x26, 0xc5, 0xbb, 0xde, 0x7d, 0x2d, + 0x3d, 0xc0, 0xda, 0xd8, 0xba, 0x18, 0x43, 0x63, 0x3c, 0xb5, 0x93, 0xb6, 0x3c, + 0xc7, 0xee, 0x49, 0xbd, 0xb2, 0x73, 0x47, 0xbd, 0xa6, 0x66, 0x3b, 0x3d, 0xea, + 0xa2, 0x04, 0xbd, 0xde, 0x2b, 0x44, 0x3d, 0x41, 0x80, 0xee, 0x3c, 0x11, 0xbe, + 0x72, 0x3c, 0x46, 0xdf, 0x63, 0xbc, 0x4d, 0xc3, 0xfb, 0xbc, 0x3d, 0xbc, 0x86, + 0x3d, 0xf7, 0xad, 0x02, 0xbd, 0x7d, 0xb7, 0x0f, 0xbd, 0x99, 0x8c, 0x51, 0x3c, + 0x85, 0xce, 0x50, 0xbd, 0x0d, 0xe0, 0x41, 0x3d, 0x3a, 0xb3, 0x21, 0xbb, 0xd0, + 0x0b, 0xdd, 0xbb, 0x94, 0x62, 0x25, 0xbd, 0xc0, 0xab, 0xd1, 0xbc, 0xf0, 0xf6, + 0x89, 0xbb, 0xbe, 0x10, 0xb9, 0xbc, 0x68, 0x2e, 0x3a, 0x3c, 0x22, 0x34, 0x20, + 0xbd, 0x4d, 0xd9, 0x75, 0xbc, 0x74, 0x5d, 0x00, 0x3d, 0xf3, 0xd5, 0x5e, 0x3d, + 0x7c, 0x61, 0xcc, 0xbc, 0x56, 0x76, 0x13, 0x3d, 0xda, 0x68, 0xe3, 0x3b, 0xa3, + 0xa1, 0x89, 0x3d, 0xd0, 0xfa, 0x16, 0x3d, 0xf1, 0x86, 0x48, 0x3c, 0x71, 0x81, + 0x83, 0x3b, 0x31, 0x30, 0x2a, 0xbd, 0x4e, 0xc0, 0xd6, 0x3c, 0xe6, 0xf3, 0xfd, + 0xba, 0x6d, 0x46, 0x96, 0x3c, 0x60, 0xcc, 0x67, 0xbd, 0x11, 0x9c, 0xc6, 0x3c, + 0xa8, 0x63, 0x21, 0xbd, 0xdb, 0xb3, 0x70, 0xbc, 0x42, 0x46, 0x38, 0xbd, 0x88, + 0x73, 0x00, 0xbc, 0x48, 0x5e, 0x4e, 0x3d, 0x2d, 0x95, 0x26, 0xbd, 0xa0, 0x22, + 0xb3, 0x3c, 0x56, 0xfb, 0x91, 0xbd, 0x51, 0x13, 0x06, 0x3c, 0x85, 0x69, 0x8a, + 0x3d, 0x23, 0xf8, 0x89, 0xbd, 0x61, 0x24, 0xd3, 0xbc, 0x28, 0xd0, 0x0a, 0x3c, + 0xe9, 0x4e, 0x85, 0x3d, 0xde, 0x12, 0x93, 
0xbb, 0x18, 0x55, 0xdd, 0x3b, 0x57, + 0xc2, 0x22, 0xbd, 0x85, 0x3f, 0x0a, 0xbd, 0x9d, 0x49, 0x86, 0x3d, 0x50, 0x01, + 0x8f, 0x3b, 0x2c, 0xbf, 0xf5, 0xbc, 0x6b, 0xec, 0x04, 0x3c, 0x92, 0x0e, 0x9b, + 0xbc, 0xfc, 0xe0, 0x28, 0xbd, 0x16, 0xeb, 0x9d, 0xbb, 0x20, 0xde, 0xf9, 0x3c, + 0x58, 0x77, 0x06, 0xbd, 0x5c, 0x2a, 0x92, 0xbc, 0x62, 0x8d, 0xf6, 0xbc, 0x88, + 0xcc, 0xa3, 0xbb, 0x60, 0xbf, 0xdb, 0x3c, 0x2c, 0xcb, 0x69, 0xbd, 0xe3, 0xcf, + 0x89, 0xbb, 0x35, 0xad, 0x81, 0xbd, 0xf1, 0x3d, 0x3d, 0xbd, 0x05, 0x62, 0x81, + 0x3d, 0x4e, 0xbe, 0x4d, 0x3c, 0x7e, 0xbf, 0x85, 0x3d, 0xfb, 0xc4, 0x23, 0xbb, + 0xd8, 0x1b, 0x78, 0x3d, 0x1d, 0xd7, 0x9d, 0xbd, 0x5d, 0x69, 0x15, 0x3d, 0xb6, + 0x7a, 0x93, 0xbc, 0x8c, 0xf1, 0xdf, 0xbc, 0xec, 0xfa, 0x2b, 0x3d, 0x40, 0xda, + 0x86, 0x3a, 0x1c, 0x0e, 0x2f, 0xbd, 0x38, 0x71, 0x4c, 0x3d, 0x68, 0x87, 0x9a, + 0xbd, 0x12, 0x86, 0x91, 0xbd, 0x60, 0x8f, 0x95, 0xbd, 0xd0, 0xe1, 0xf4, 0xbc, + 0xa2, 0x77, 0x3f, 0x3d, 0xc0, 0xcd, 0xa1, 0x3c, 0xa2, 0x69, 0x6e, 0xbd, 0xba, + 0xc9, 0x79, 0x3d, 0x6d, 0x05, 0xec, 0xbc, 0xb0, 0x63, 0x57, 0x3d, 0xfa, 0x05, + 0xd4, 0xbc, 0xb2, 0xd2, 0x93, 0x3b, 0x7e, 0x40, 0x09, 0xbd, 0xf0, 0x2e, 0xd6, + 0x3c, 0x00, 0x7b, 0x69, 0xbd, 0x6e, 0x10, 0x29, 0xbd, 0x69, 0x91, 0x92, 0xbb, + 0x90, 0x9e, 0x38, 0x3d, 0x99, 0x1b, 0x69, 0xbd, 0x32, 0xd2, 0x49, 0x3d, 0x9d, + 0xa4, 0x5d, 0xbd, 0x8b, 0x8e, 0x20, 0xbd, 0xcf, 0x0b, 0x92, 0xbd, 0x3c, 0xb7, + 0xfb, 0x3c, 0xdf, 0xf9, 0x58, 0x3d, 0xa7, 0xf0, 0x3e, 0xbb, 0x6c, 0x7e, 0xbd, + 0x3c, 0x83, 0xdf, 0x12, 0x3d, 0x37, 0x97, 0x84, 0x3d, 0xe0, 0x4e, 0x36, 0x3d, + 0xf6, 0x06, 0x90, 0xbd, 0x07, 0xc0, 0xce, 0x3c, 0xb1, 0xc0, 0x49, 0x3d, 0x7b, + 0x76, 0x02, 0x3c, 0x29, 0x97, 0x93, 0x3b, 0x16, 0x46, 0x45, 0xbd, 0x10, 0xb1, + 0x92, 0x3b, 0x26, 0x69, 0x45, 0x3d, 0x1e, 0x1a, 0x6d, 0x3d, 0x60, 0x9f, 0xe3, + 0x3b, 0x07, 0xab, 0x5f, 0x3d, 0x65, 0xce, 0x35, 0xbd, 0x61, 0x0d, 0x43, 0xbd, + 0x56, 0xa7, 0x79, 0x3d, 0x61, 0x67, 0x37, 0x3d, 0x26, 0xf4, 0x90, 0xbd, 0x73, + 0x2e, 0x1b, 0x3d, 0x39, 0x48, 0xe2, 0xb9, 0x57, 0x1e, 0x32, 0x3d, 0xaa, 0x2d, + 0x16, 0x3c, 0xae, 0x6a, 0x94, 0xbc, 0xc1, 0x8b, 0x1e, 0xbd, 0xf1, 0x42, 0x4f, + 0xbd, 0x6d, 0x34, 0x66, 0x3d, 0xc2, 0x39, 0x6a, 0xbd, 0x6e, 0x02, 0xab, 0x3c, + 0xa8, 0x60, 0x3d, 0xbd, 0x69, 0x24, 0x93, 0xbd, 0xd2, 0x91, 0x8a, 0xbd, 0xfe, + 0xa0, 0x30, 0xbd, 0xbd, 0x15, 0x28, 0xbd, 0x00, 0x1c, 0x02, 0x3a, 0x2e, 0xe2, + 0x5b, 0xbb, 0xda, 0x90, 0x4d, 0x3d, 0x56, 0xc4, 0xd3, 0xbc, 0x25, 0xb8, 0x6d, + 0x3d, 0x89, 0xe0, 0x47, 0x3d, 0x60, 0x4b, 0x04, 0xbb, 0x00, 0xd5, 0xdc, 0x39, + 0x33, 0xc0, 0x7e, 0x3d, 0xce, 0x0c, 0x51, 0xbd, 0xb2, 0x49, 0xf0, 0xbc, 0xc8, + 0x62, 0xa2, 0xbc, 0xdc, 0x45, 0x2a, 0x3d, 0x5e, 0xe2, 0x1b, 0xbd, 0xa6, 0x02, + 0x9a, 0xbd, 0xe2, 0xf0, 0x89, 0xbd, 0xff, 0x15, 0xa8, 0xbc, 0xc2, 0x94, 0xb9, + 0x3c, 0x8a, 0x28, 0x8b, 0xbc, 0x27, 0x32, 0x7d, 0x3d, 0x2b, 0x24, 0x75, 0xbd, + 0xc1, 0x7f, 0x05, 0xbd, 0x8b, 0x7f, 0x28, 0xbd, 0xa4, 0xd9, 0x9a, 0xbc, 0x03, + 0xc7, 0x23, 0xbc, 0xac, 0xd5, 0x6d, 0xbc, 0xfb, 0xf5, 0x70, 0xbc, 0x5c, 0x28, + 0x5c, 0xbd, 0xf5, 0xa5, 0x54, 0x3d, 0xc4, 0x5f, 0x87, 0xbd, 0x28, 0x92, 0x51, + 0x3c, 0x10, 0xc1, 0x87, 0x3d, 0x00, 0xeb, 0x1c, 0x3c, 0x9a, 0x6a, 0x52, 0x3d, + 0x95, 0xc5, 0x1a, 0x3d, 0x9d, 0x84, 0x9b, 0x3c, 0x56, 0x33, 0xda, 0xbc, 0x28, + 0x01, 0x64, 0x3d, 0xb1, 0x80, 0x4f, 0xbd, 0x50, 0x61, 0x89, 0xbd, 0xe0, 0x1f, + 0x30, 0xbb, 0x63, 0x5a, 0x86, 0x3d, 0x06, 0x30, 0x56, 0x3d, 0xc6, 0x8e, 0x4e, + 0xbd, 0xd1, 0xb8, 0xc6, 0xbc, 0xc6, 0x6c, 0xf4, 0xbc, 0x6c, 0x6f, 0x21, 0x3d, + 0xea, 0x45, 0x86, 0x3c, 0xe7, 0x7b, 0x1c, 0xbd, 0xba, 0x38, 0x54, 0xbd, 
0xa4, + 0x78, 0x82, 0x3d, 0xdc, 0x98, 0x18, 0xbc, 0xa0, 0x85, 0x0d, 0x3d, 0x9e, 0xe7, + 0x55, 0xbd, 0x8e, 0x64, 0x30, 0x3d, 0xda, 0xf4, 0x48, 0x3d, 0x69, 0xdc, 0xe8, + 0x3c, 0x68, 0xc7, 0x0d, 0xbd, 0xdf, 0x7e, 0xb4, 0x3c, 0x3a, 0x30, 0x57, 0x3d, + 0xc5, 0x7a, 0x1a, 0xbc, 0x42, 0xa7, 0x8c, 0x3d, 0xb1, 0x9c, 0x4f, 0x3d, 0xa0, + 0x74, 0x36, 0xbc, 0x7e, 0x74, 0x25, 0x3d, 0xc8, 0x7c, 0x48, 0x3d, 0x7f, 0x68, + 0x55, 0x3c, 0xa6, 0x62, 0xf8, 0xbc, 0x16, 0x5b, 0x2d, 0x3d, 0x79, 0x57, 0x6a, + 0xbd, 0x86, 0xf0, 0x8b, 0xbc, 0x20, 0x1c, 0x3f, 0x3c, 0x92, 0x3d, 0x20, 0x3d, + 0x40, 0x29, 0x7b, 0xbd, 0x32, 0x88, 0x5b, 0x3d, 0x28, 0x79, 0x2c, 0x3c, 0xeb, + 0x80, 0xe3, 0x3c, 0xe5, 0x28, 0xa1, 0x3c, 0x95, 0xbb, 0x88, 0x3d, 0x1b, 0xa9, + 0x95, 0xbc, 0xb0, 0x35, 0x5b, 0x3d, 0x02, 0xbd, 0x8e, 0xbc, 0x62, 0xe7, 0x1d, + 0xbd, 0xad, 0xe5, 0xca, 0x3c, 0x6f, 0x93, 0x3f, 0xb9, 0x51, 0x7d, 0x48, 0xbd, + 0x06, 0x75, 0x68, 0x3d, 0xa7, 0x08, 0x7b, 0xbd, 0x5e, 0xeb, 0x73, 0xba, 0xa1, + 0x83, 0x31, 0x3d, 0xcd, 0x92, 0x55, 0x3c, 0x88, 0xdb, 0x3f, 0xbd, 0x67, 0x9c, + 0x35, 0x3d, 0xa9, 0x4b, 0x14, 0x3d, 0x94, 0x6b, 0x6c, 0xbc, 0x6c, 0xa8, 0xe7, + 0x3c, 0xc0, 0x02, 0xf7, 0xbb, 0xcb, 0xbc, 0x85, 0x3a, 0xf1, 0x91, 0xf0, 0xbc, + 0x72, 0x77, 0x83, 0x3d, 0x68, 0xab, 0x30, 0x3d, 0xa0, 0x17, 0x96, 0xbc, 0x7d, + 0xe6, 0x19, 0xbd, 0x18, 0x2c, 0x22, 0x3d, 0x88, 0x14, 0xaa, 0x3c, 0x40, 0x4d, + 0xb3, 0xbc, 0x4c, 0xc2, 0x7a, 0xbc, 0xf8, 0x68, 0x53, 0x3c, 0x16, 0x1d, 0xc6, + 0xbb, 0x2f, 0x2c, 0x71, 0xbd, 0xa3, 0x55, 0x80, 0x3d, 0x96, 0x18, 0x07, 0x3d, + 0x34, 0xa8, 0xa1, 0xbc, 0x2b, 0x39, 0x58, 0x3d, 0x23, 0xc6, 0x68, 0x3d, 0x46, + 0x84, 0x55, 0x3d, 0x0d, 0xd6, 0x3e, 0x3c, 0x2e, 0xc2, 0x0d, 0x3d, 0x88, 0x20, + 0x26, 0x3c, 0x44, 0x1b, 0x23, 0x3d, 0x7f, 0x54, 0x8b, 0xbd, 0xda, 0xa3, 0x54, + 0xbd, 0x9e, 0xad, 0x32, 0x3d, 0x17, 0x7c, 0x78, 0x3d, 0xcd, 0x11, 0x9f, 0xbc, + 0x2c, 0x53, 0x57, 0x3b, 0x1a, 0x5a, 0x0a, 0xbd, 0x6d, 0x40, 0x67, 0x3d, 0x52, + 0xb6, 0x56, 0x3d, 0x1c, 0x07, 0x96, 0xbd, 0xb0, 0x1c, 0x14, 0xbd, 0xc3, 0xda, + 0x2b, 0x3c, 0x7a, 0x02, 0x61, 0x3d, 0xbd, 0x9f, 0x2a, 0xbd, 0x72, 0xf9, 0xbf, + 0xbc, 0x79, 0xfe, 0xa3, 0x3c, 0xfc, 0x45, 0x43, 0xbd, 0x9e, 0xd3, 0x7b, 0x3d, + 0x70, 0x3a, 0x6e, 0xbd, 0x78, 0xdc, 0x30, 0x3c, 0x93, 0x36, 0x67, 0x3d, 0x63, + 0x08, 0x84, 0x3d, 0x5e, 0x4f, 0x40, 0x3a, 0xc5, 0xd9, 0xc1, 0x3c, 0xea, 0x6b, + 0x31, 0x3d, 0x1e, 0xf8, 0xdc, 0xbb, 0x0b, 0x30, 0xfd, 0xbc, 0xc6, 0xf2, 0x87, + 0x3d, 0xc5, 0xc9, 0xc7, 0x3c, 0x98, 0x0c, 0xba, 0x3b, 0xcf, 0x1a, 0x8d, 0xbd, + 0x90, 0xa5, 0xe1, 0xbb, 0x16, 0xc3, 0x64, 0x3d, 0x03, 0x3a, 0x95, 0x3c, 0xaa, + 0x98, 0x32, 0xbd, 0x95, 0xa5, 0x95, 0xbd, 0xde, 0x9e, 0x88, 0x3a, 0xbb, 0x39, + 0x8e, 0xbd, 0x3d, 0xf1, 0x30, 0x3d, 0x6e, 0x57, 0x8c, 0x3d, 0xf3, 0x90, 0x25, + 0xbd, 0xf8, 0x97, 0x2e, 0xbd, 0x21, 0xf3, 0x1b, 0x3d, 0x34, 0xd9, 0x5d, 0xbc, + 0x24, 0x60, 0x23, 0xbc, 0x32, 0x24, 0xa6, 0x3b, 0x01, 0xf1, 0x61, 0xbd, 0x69, + 0x3b, 0xaa, 0x3c, 0x54, 0xf0, 0x53, 0xbd, 0x40, 0x67, 0x64, 0x3b, 0x00, 0x84, + 0xa1, 0xbb, 0xda, 0xb5, 0x6e, 0x3d, 0x0f, 0xfb, 0x3d, 0xbc, 0xf9, 0xf3, 0x0c, + 0xbd, 0x5b, 0x52, 0xd1, 0xbb, 0x43, 0xf7, 0x04, 0xbd, 0xf9, 0x67, 0x7c, 0x3d, + 0x36, 0xed, 0x30, 0xbd, 0xcf, 0x53, 0x62, 0x3c, 0x03, 0xbb, 0x79, 0xbd, 0x6d, + 0xc8, 0x40, 0x3d, 0xc5, 0x5c, 0x19, 0x3d, 0x0e, 0xd5, 0x2d, 0xbd, 0x2d, 0x89, + 0x92, 0x3d, 0xf3, 0xcc, 0x15, 0x3d, 0xe2, 0x92, 0x9e, 0xbc, 0x44, 0x74, 0x8e, + 0xbd, 0x6b, 0x27, 0x96, 0xbd, 0x86, 0xcb, 0xe8, 0x3c, 0xab, 0xda, 0x99, 0xbb, + 0xf6, 0x99, 0x19, 0xbb, 0xe8, 0xb3, 0x49, 0x3d, 0xa4, 0x79, 0x85, 0x3c, 0x4f, + 0xb4, 0xf5, 0xbc, 0x5c, 
0x1a, 0xa9, 0xbc, 0xa7, 0x63, 0x1f, 0xbd, 0x33, 0xff, + 0x46, 0xbd, 0x39, 0x7f, 0x97, 0xbd, 0xd8, 0x75, 0x85, 0xbd, 0x55, 0x97, 0x94, + 0xbc, 0x3e, 0x73, 0xb0, 0x3c, 0xf8, 0xb8, 0xee, 0x3c, 0xa0, 0xe4, 0x6e, 0x3b, + 0x00, 0xde, 0x54, 0x3b, 0x3b, 0x2d, 0x90, 0xbc, 0xae, 0xd9, 0x89, 0xbd, 0x65, + 0x3d, 0xf9, 0x3c, 0x5f, 0x64, 0x8a, 0xbd, 0x88, 0x25, 0x7c, 0xbb, 0x8c, 0x64, + 0x35, 0xbc, 0x63, 0x28, 0x0c, 0x3d, 0x2d, 0x9c, 0xde, 0xbb, 0x62, 0x5c, 0x96, + 0xbc, 0x12, 0x3c, 0x35, 0x3d, 0x50, 0x11, 0xcc, 0x3b, 0x56, 0x1a, 0x80, 0xbd, + 0xd0, 0x1a, 0x98, 0xba, 0x88, 0xe4, 0x58, 0x3d, 0x09, 0xc2, 0x9e, 0x3b, 0xce, + 0xc4, 0x3c, 0xbc, 0x88, 0x46, 0x09, 0xbd, 0xea, 0xde, 0x04, 0x3c, 0xd4, 0x45, + 0x5d, 0xbd, 0x18, 0x90, 0x7e, 0x3d, 0x99, 0x67, 0x91, 0x3d, 0x8d, 0x01, 0xd7, + 0xbc, 0x61, 0xdc, 0x6b, 0x3d, 0x36, 0x17, 0x96, 0x3c, 0x7e, 0x27, 0x6f, 0x3d, + 0x52, 0xcb, 0xf7, 0x3c, 0xfc, 0x54, 0x75, 0xbc, 0x36, 0xbd, 0x25, 0x3d, 0x86, + 0xd1, 0x7b, 0xbd, 0x5c, 0x19, 0x12, 0x3d, 0xda, 0xfb, 0x03, 0x3d, 0xee, 0x5f, + 0x37, 0xbd, 0xd4, 0x39, 0x34, 0xbd, 0xb4, 0x2f, 0x8b, 0xbd, 0x29, 0xd4, 0x99, + 0xbd, 0x4e, 0x31, 0x4a, 0x3c, 0x3a, 0x73, 0x7b, 0x3d, 0x97, 0x99, 0xac, 0xbb, + 0x77, 0xe4, 0xac, 0xbc, 0x0c, 0x31, 0xc3, 0xbb, 0xd7, 0xdb, 0x85, 0x3d, 0x31, + 0x4d, 0xd5, 0xbb, 0xb8, 0x71, 0xda, 0x3c, 0x7c, 0x01, 0x5a, 0x3d, 0x32, 0xe9, + 0x57, 0x3d, 0x6f, 0xd9, 0x7a, 0x3d, 0x38, 0x6a, 0x77, 0xbc, 0x7b, 0x63, 0x5c, + 0xbd, 0x8c, 0xe0, 0x02, 0xbd, 0xf2, 0x35, 0x47, 0x3d, 0x93, 0x0e, 0x59, 0xbd, + 0xf8, 0xfa, 0x63, 0x3d, 0x1c, 0x59, 0x49, 0xbd, 0x48, 0x00, 0x3c, 0xbc, 0x52, + 0xd8, 0x14, 0x3d, 0xc3, 0x56, 0x42, 0x3c, 0x7d, 0x74, 0xa9, 0x3c, 0x15, 0x40, + 0x83, 0x3d, 0x9c, 0x8d, 0xe2, 0xbc, 0x47, 0xdb, 0x86, 0x3d, 0xcc, 0x7f, 0x2d, + 0xbd, 0x39, 0xdd, 0x8f, 0x3d, 0xe8, 0xe7, 0x0c, 0x3c, 0xc0, 0xc6, 0xfa, 0x3a, + 0x5e, 0x6c, 0x85, 0xbd, 0xae, 0x8d, 0x79, 0x3d, 0x29, 0x90, 0xd8, 0x3c, 0x09, + 0x17, 0x85, 0xbc, 0x4d, 0xf9, 0x71, 0xbd, 0x74, 0xa6, 0xf3, 0xbb, 0xf0, 0x65, + 0xee, 0xbc, 0x42, 0x45, 0x7b, 0x3d, 0xdc, 0x2b, 0x5e, 0xbd, 0x35, 0x5f, 0x3f, + 0x3d, 0x10, 0x00, 0xdd, 0x3b, 0xb8, 0xd0, 0x94, 0xbc, 0xe8, 0xb4, 0xcc, 0xbc, + 0xb3, 0x71, 0x2d, 0x3c, 0x00, 0x36, 0xc0, 0x3c, 0x3e, 0x20, 0x1e, 0xbd, 0x0e, + 0xdf, 0x62, 0x3c, 0x55, 0xdc, 0x44, 0x3d, 0x27, 0x0e, 0x3a, 0xbc, 0x6b, 0xd4, + 0x8c, 0x3c, 0xcc, 0xcc, 0x7f, 0xbd, 0xd4, 0x43, 0x3d, 0xbd, 0x5b, 0xac, 0x58, + 0x3c, 0xf0, 0x58, 0xd2, 0xbc, 0x49, 0x1d, 0x38, 0x3d, 0x09, 0x7c, 0x1d, 0xbd, + 0x7a, 0x5b, 0x00, 0xbd, 0xe4, 0x6e, 0xf0, 0x3c, 0x4a, 0xd3, 0x56, 0x3d, 0x28, + 0x12, 0x8d, 0xbc, 0xbe, 0x44, 0x65, 0x3d, 0x0a, 0xd4, 0x16, 0xbc, 0xb0, 0x96, + 0x16, 0xbd, 0xfa, 0xf1, 0x8d, 0x3d, 0x41, 0xd6, 0x74, 0x3d, 0xb5, 0x79, 0x85, + 0xbd, 0x5d, 0xfb, 0x8e, 0xbc, 0xd8, 0x46, 0x86, 0xba, 0x2f, 0xa2, 0x8b, 0xbd, + 0xd8, 0x91, 0x90, 0xbc, 0xf7, 0x73, 0xe6, 0xbc, 0x6c, 0x45, 0xac, 0x3c, 0xe4, + 0xbe, 0x60, 0xbc, 0x4b, 0x18, 0x7f, 0x3d, 0x1f, 0xb0, 0x39, 0x3c, 0xc0, 0x64, + 0x71, 0x3d, 0x2f, 0x99, 0x3e, 0xbd, 0xa8, 0x87, 0x2f, 0x3d, 0xdc, 0xb3, 0x94, + 0xbd, 0xfa, 0xe2, 0x8c, 0xbd, 0x28, 0xb5, 0x2a, 0x3c, 0xa3, 0x13, 0x31, 0xbd, + 0xe6, 0xae, 0xfc, 0xbc, 0x98, 0xb6, 0x68, 0xbd, 0x41, 0xdf, 0x66, 0x3b, 0xde, + 0xc5, 0x2e, 0xbd, 0x24, 0x8c, 0x4c, 0xbd, 0xdb, 0x77, 0xe8, 0x3b, 0xc0, 0x23, + 0xc1, 0xbc, 0x50, 0xcb, 0x98, 0xbc, 0x44, 0x4b, 0x32, 0x3d, 0xd0, 0xd5, 0xf9, + 0xbc, 0x40, 0x77, 0xea, 0x3b, 0xaf, 0x97, 0xbc, 0x3c, 0x9f, 0x07, 0x8d, 0x3d, + 0x26, 0xc4, 0x87, 0xbc, 0x48, 0xff, 0x1b, 0x3d, 0x90, 0x07, 0xc0, 0x3b, 0xa0, + 0xeb, 0x61, 0xbb, 0x61, 0x90, 0x8c, 0x3d, 0x46, 0x0b, 
0x89, 0xbd, 0x61, 0x99, + 0x09, 0xbd, 0x27, 0xb3, 0x3a, 0xbc, 0xad, 0x56, 0xff, 0xbc, 0xa6, 0xaf, 0x7f, + 0x3d, 0x50, 0x1d, 0x09, 0xbd, 0x82, 0xfd, 0xcd, 0xbc, 0x31, 0x6c, 0x4d, 0x3d, + 0x6d, 0xe8, 0x8c, 0x3c, 0x59, 0x5e, 0xb7, 0xbb, 0xa8, 0x14, 0x49, 0x3d, 0x86, + 0xe4, 0x89, 0xbc, 0x41, 0xc7, 0x0c, 0xbd, 0xf5, 0x84, 0x80, 0x3d, 0x31, 0x71, + 0x88, 0x3d, 0x3b, 0xcf, 0x84, 0xbd, 0x4f, 0xc3, 0x89, 0x3d, 0x24, 0x62, 0x21, + 0xbd, 0xb0, 0xc2, 0xdb, 0x3b, 0xf8, 0xc8, 0x46, 0xbd, 0xa5, 0xe0, 0x89, 0x3d, + 0x89, 0x41, 0x29, 0x3c, 0x90, 0xbd, 0xe7, 0x3c, 0x78, 0xc9, 0x42, 0xbc, 0x1f, + 0xd6, 0x82, 0x3d, 0xfb, 0xcd, 0x87, 0xbd, 0x2a, 0xd2, 0x24, 0xbd, 0x86, 0x49, + 0x6d, 0xbd, 0x62, 0x20, 0xc8, 0xba, 0xb0, 0xc4, 0xec, 0xbc, 0xdf, 0x68, 0xb4, + 0x3a, 0xe3, 0x0f, 0xe7, 0x3c, 0x41, 0xd5, 0x2e, 0xbd, 0xd4, 0xd6, 0x7c, 0xbd, + 0xb6, 0xd8, 0x2f, 0x3d, 0x2e, 0x95, 0xf2, 0xbc, 0x7c, 0xa4, 0xd0, 0xbc, 0x84, + 0x63, 0x61, 0x3d, 0xfe, 0x1c, 0x26, 0x3d, 0x29, 0x38, 0x6e, 0x3c, 0xff, 0xb9, + 0x12, 0xbd, 0xbc, 0xc6, 0x8d, 0x3d, 0xe1, 0xf5, 0x94, 0xbd, 0xd6, 0x91, 0x86, + 0xbd, 0x88, 0xb9, 0x58, 0xbc, 0x50, 0x18, 0xb0, 0xbb, 0x95, 0x6f, 0x84, 0x3d, + 0xd1, 0x02, 0x2c, 0xbd, 0xdd, 0xec, 0x00, 0x3d, 0x2c, 0x87, 0x33, 0x3c, 0x83, + 0xae, 0x83, 0xbd, 0xf9, 0xfc, 0xc7, 0x3b, 0x54, 0x47, 0x34, 0xbc, 0xdc, 0xeb, + 0x44, 0xbc, 0xc1, 0x33, 0x1f, 0xbd, 0x2e, 0xa0, 0xe7, 0xbc, 0x18, 0x92, 0x5b, + 0xbc, 0x75, 0xee, 0x48, 0x3d, 0xcf, 0xe5, 0x29, 0x3c, 0xdd, 0xfb, 0xcd, 0xbc, + 0x1e, 0xfe, 0x15, 0xbd, 0xfa, 0x83, 0x24, 0xbd, 0x74, 0xa7, 0x1b, 0x3d, 0x79, + 0x43, 0xf6, 0x3c, 0xc1, 0x09, 0xcc, 0xbb, 0x23, 0xce, 0x51, 0x3d, 0x90, 0xbd, + 0x6d, 0xbd, 0xd3, 0x87, 0xa9, 0x3c, 0xa6, 0x5c, 0x6b, 0x3d, 0x30, 0xbc, 0xd0, + 0xbb, 0x43, 0x24, 0x71, 0xbd, 0xf1, 0xc3, 0x69, 0xbc, 0xcc, 0x77, 0x5d, 0xbd, + 0xf5, 0x11, 0x95, 0xbd, 0x90, 0x17, 0xc7, 0xbc, 0x44, 0x6c, 0x85, 0xbd, 0xeb, + 0x43, 0xd6, 0x3c, 0xe3, 0x8d, 0x8b, 0x3d, 0xbf, 0x68, 0x3d, 0xbd, 0x6d, 0x69, + 0x86, 0xbd, 0xb5, 0x14, 0x8f, 0xbd, 0xe9, 0x70, 0x0c, 0xbc, 0x97, 0x30, 0x78, + 0x3d, 0xd2, 0x1f, 0x57, 0xbd, 0x08, 0xe4, 0x28, 0x3d, 0x34, 0x1f, 0xf3, 0xbc, + 0x18, 0xb7, 0x66, 0xbc, 0x00, 0x60, 0x30, 0x3c, 0xc1, 0x3d, 0x1f, 0xbd, 0x26, + 0x9a, 0x85, 0x3d, 0xc6, 0x32, 0x88, 0xbd, 0x36, 0x33, 0x5c, 0xbd, 0x81, 0xb7, + 0x89, 0xbd, 0x9f, 0x29, 0xeb, 0xbb, 0xe3, 0x50, 0x3d, 0x3d, 0x24, 0x66, 0x88, + 0xbd, 0xcc, 0xc0, 0x0d, 0x3d, 0xd2, 0xa9, 0x92, 0x3c, 0x54, 0x72, 0x02, 0x3d, + 0xd5, 0x3b, 0x90, 0xbb, 0x3d, 0x9f, 0x63, 0xbd, 0xed, 0xbe, 0x18, 0xbd, 0x59, + 0xec, 0x6e, 0x3b, 0x28, 0xf2, 0x29, 0xbc, 0xc7, 0xce, 0xab, 0x3c, 0xf4, 0xc8, + 0x79, 0xbd, 0x7c, 0x71, 0x30, 0x3d, 0x75, 0xbb, 0x80, 0xbc, 0x5c, 0xc6, 0x6b, + 0xbd, 0x61, 0x73, 0x3c, 0x3d, 0x74, 0x82, 0x33, 0xbd, 0xd2, 0x32, 0x79, 0x3c, + 0x9c, 0x80, 0xb6, 0xbb, 0xef, 0xee, 0x5f, 0x3d, 0xf8, 0x07, 0x30, 0xbd, 0xb1, + 0x7f, 0x2f, 0xbd, 0xc2, 0x76, 0x36, 0xbd, 0x9e, 0x38, 0xa3, 0x3c, 0x7c, 0x4e, + 0x47, 0xbc, 0x48, 0xce, 0x1a, 0x3d, 0xfc, 0xcd, 0xc2, 0x3c, 0x65, 0xb0, 0x07, + 0x3d, 0x51, 0x39, 0x1c, 0x3d, 0x27, 0x56, 0x87, 0x3d, 0x63, 0x07, 0xdd, 0x3c, + 0x2b, 0xd5, 0x82, 0x3d, 0xb0, 0x9d, 0x85, 0xbd, 0xc5, 0x43, 0xf0, 0x3c, 0x19, + 0x0c, 0x95, 0x3b, 0x28, 0x64, 0x6b, 0xbd, 0x8e, 0x23, 0x09, 0xbd, 0xfa, 0x58, + 0xfc, 0x3b, 0x40, 0xca, 0x5d, 0x3c, 0xa0, 0xbe, 0x58, 0xbd, 0xb1, 0x3b, 0x91, + 0xbd, 0xd1, 0x73, 0xf0, 0x3a, 0x1d, 0x07, 0x31, 0x3d, 0x7d, 0x80, 0x07, 0x3d, + 0xda, 0x52, 0x44, 0x3c, 0x78, 0x62, 0x58, 0x3c, 0x8d, 0x84, 0x01, 0x3d, 0x66, + 0x36, 0x76, 0xbd, 0x68, 0xd0, 0x03, 0xbc, 0x43, 0x54, 0x56, 0x3c, 0xae, 0xac, + 0x59, 
0x3d, 0x36, 0xce, 0x48, 0xbd, 0xd4, 0xc1, 0x65, 0xbc, 0xd9, 0xee, 0x34, + 0x3c, 0x80, 0x4c, 0x66, 0xba, 0x88, 0xe1, 0x3c, 0x3c, 0xc8, 0xb7, 0x04, 0x3d, + 0x90, 0xdf, 0xdf, 0x3c, 0x20, 0x76, 0x1c, 0x3b, 0xfb, 0x80, 0x1e, 0x3d, 0x7e, + 0xbd, 0x19, 0x3d, 0x1f, 0x28, 0x96, 0xbb, 0x19, 0xa6, 0x3c, 0x3c, 0x3f, 0xc7, + 0xf9, 0xbc, 0x4a, 0xc2, 0x1a, 0xbd, 0xd5, 0xa0, 0x86, 0xbd, 0x3a, 0xc8, 0xd6, + 0x3c, 0xc3, 0x1a, 0x5a, 0x3d, 0x1a, 0x8c, 0x91, 0xbd, 0xd0, 0x10, 0x67, 0x3d, + 0x42, 0x5b, 0x16, 0x3d, 0xa3, 0xd2, 0x5b, 0xbc, 0x6c, 0xa0, 0xb6, 0x3c, 0x65, + 0xe2, 0x1d, 0xbd, 0x9a, 0xdf, 0x0e, 0xbd, 0xc0, 0x74, 0xcf, 0x3b, 0x84, 0xe1, + 0xc1, 0x3c, 0x2a, 0xed, 0x60, 0x3d, 0xe3, 0x10, 0xe4, 0xbc, 0x3f, 0xcc, 0x8b, + 0xbd, 0x95, 0xa5, 0x8b, 0x3d, 0xd8, 0xc3, 0x00, 0xbd, 0x85, 0x56, 0x75, 0x3d, + 0xac, 0x3a, 0x5b, 0x3d, 0x6a, 0x5d, 0xed, 0xbb, 0xbb, 0xd3, 0xd5, 0x3c, 0xac, + 0xb0, 0x3f, 0x3d, 0x70, 0x1a, 0x6b, 0x3c, 0x70, 0xca, 0x28, 0x3c, 0xa2, 0x71, + 0xde, 0xbc, 0x00, 0x22, 0x77, 0x3a, 0x43, 0x45, 0x21, 0xbd, 0x17, 0xa9, 0x34, + 0x3d, 0x4d, 0x49, 0x2d, 0xbd, 0xb5, 0xd6, 0x8b, 0x3d, 0x84, 0xa5, 0xbd, 0xbc, + 0x9d, 0x7f, 0x02, 0xbd, 0x85, 0x08, 0x80, 0xbd, 0xff, 0x2d, 0x8f, 0xbc, 0x04, + 0x5f, 0x3b, 0xbd, 0xba, 0xce, 0x17, 0xbd, 0xf3, 0xfc, 0x80, 0x3d, 0xe1, 0x9c, + 0x8c, 0xbd, 0xaf, 0x1c, 0xc6, 0x3c, 0x77, 0x31, 0x12, 0x3d, 0xde, 0x28, 0x49, + 0xbd, 0x0d, 0xe3, 0x1f, 0xbd, 0x2a, 0x71, 0x30, 0xbc, 0x1e, 0x04, 0x35, 0x3d, + 0x08, 0x0a, 0xad, 0x3b, 0xe9, 0x97, 0x98, 0xbc, 0x26, 0xe3, 0x00, 0x3c, 0xbe, + 0xf9, 0xbb, 0xbc, 0x77, 0x23, 0x34, 0xbd, 0x55, 0x69, 0x61, 0x3d, 0xc4, 0xb9, + 0x8d, 0xbd, 0x5f, 0x82, 0x81, 0x3d, 0x68, 0xff, 0x16, 0xbc, 0x2c, 0xa2, 0x91, + 0xbc, 0x67, 0x62, 0x78, 0xbd, 0x76, 0x32, 0x13, 0x3d, 0x68, 0x26, 0x2b, 0x3d, + 0x1a, 0xbb, 0xdc, 0xbc, 0xae, 0x91, 0x84, 0x3d, 0xc0, 0xfe, 0x8d, 0xbd, 0xfe, + 0x28, 0x88, 0xbc, 0x02, 0x43, 0x0e, 0xbc, 0x0b, 0x35, 0x69, 0xbb, 0xb4, 0xf8, + 0x8b, 0xbd, 0xad, 0x86, 0x6e, 0xbd, 0x5c, 0x92, 0x19, 0xbd, 0x03, 0x18, 0x59, + 0xbd, 0x58, 0x48, 0x55, 0xbc, 0x2e, 0xaf, 0x4d, 0x3d, 0x70, 0x1a, 0x59, 0xbc, + 0x63, 0xf3, 0x3d, 0xbd, 0x97, 0xcd, 0x8f, 0xbd, 0x4b, 0x2b, 0x75, 0x3d, 0x78, + 0xf6, 0x78, 0xbd, 0x40, 0x84, 0x01, 0xbd, 0x04, 0xb6, 0x05, 0xbd, 0x21, 0xa7, + 0xf7, 0x3c, 0x9e, 0x08, 0xc5, 0x3c, 0x3b, 0xde, 0xa8, 0xbc, 0x04, 0x81, 0x85, + 0x3c, 0x7d, 0x36, 0xd2, 0x3c, 0x02, 0xf0, 0xd0, 0xbc, 0xcb, 0xe0, 0x68, 0x3d, + 0xb3, 0x19, 0x89, 0xbd, 0x39, 0xf7, 0x5f, 0x3d, 0x6a, 0x8f, 0x05, 0xbc, 0x7c, + 0xc8, 0x91, 0xbc, 0xec, 0xc4, 0x93, 0x3c, 0xa0, 0x62, 0x3a, 0xbb, 0x59, 0xfc, + 0x1a, 0xbd, 0xc9, 0xcd, 0x95, 0xbd, 0x57, 0xc3, 0x5b, 0xbb, 0x67, 0x2f, 0xe4, + 0x3c, 0x13, 0xcc, 0xa5, 0x3c, 0x1d, 0x6c, 0x39, 0xbc, 0x50, 0x64, 0x83, 0x3c, + 0x50, 0x6d, 0x5b, 0xbc, 0xda, 0x2a, 0xcd, 0x3c, 0x09, 0xb3, 0x96, 0xbd, 0x91, + 0x4f, 0x34, 0x3d, 0x33, 0xd0, 0x17, 0xbd, 0x1d, 0x22, 0x86, 0xbd, 0x9c, 0x1e, + 0x0d, 0xbd, 0xd4, 0x2b, 0x9c, 0xba, 0x67, 0xb5, 0xa7, 0xbc, 0x0f, 0xe2, 0x76, + 0xbd, 0x4b, 0xb9, 0x71, 0x3d, 0x69, 0xa9, 0x9c, 0xbc, 0x30, 0x44, 0x47, 0x3d, + 0xf0, 0xdc, 0x95, 0x3c, 0xe2, 0x1d, 0x22, 0xbd, 0xaa, 0xb5, 0x58, 0xbd, 0x9d, + 0x59, 0x7d, 0xbd, 0xa4, 0x92, 0x95, 0x3c, 0x40, 0xaa, 0x8d, 0xbd, 0xf0, 0x3e, + 0xb4, 0x3c, 0xc2, 0x03, 0x2a, 0xbd, 0xb0, 0xc5, 0x29, 0xbd, 0xc0, 0x7c, 0x42, + 0xbd, 0xea, 0x99, 0x7e, 0x3d, 0xd6, 0xbc, 0x15, 0x3d, 0xb9, 0xda, 0x37, 0xbd, + 0xd0, 0x21, 0x9e, 0x3c, 0x79, 0x2e, 0xab, 0xbb, 0x73, 0x17, 0xcd, 0xbc, 0x7c, + 0x01, 0xe3, 0x3c, 0xb7, 0xb8, 0xf2, 0x3c, 0x11, 0x4b, 0x45, 0x3d, 0x87, 0x86, + 0x9a, 0x3c, 0x2c, 0x70, 0x57, 0xbd, 
0x55, 0xdf, 0x1d, 0xbd, 0xf5, 0x86, 0xa6, + 0xbc, 0x21, 0x96, 0x49, 0xbd, 0x36, 0x4c, 0x75, 0xbd, 0xc9, 0x1c, 0xa0, 0x3c, + 0x5d, 0xba, 0x26, 0x3d, 0xd6, 0x56, 0x02, 0x3d, 0x69, 0x90, 0x12, 0xbc, 0x08, + 0x5b, 0x0f, 0xbd, 0x81, 0xce, 0x92, 0xbc, 0x3a, 0xb8, 0x5f, 0x3d, 0x7a, 0xaf, + 0xe7, 0x3c, 0x4d, 0x4b, 0x60, 0xbc, 0x78, 0xc0, 0x6c, 0xbd, 0x85, 0x6f, 0xe7, + 0x3c, 0xaa, 0xc1, 0xb3, 0x3c, 0x8b, 0xe4, 0xb7, 0x3c, 0xdd, 0xd0, 0x39, 0x3d, + 0x48, 0x49, 0x1b, 0x3d, 0xe2, 0x74, 0x28, 0xbd, 0x86, 0x4a, 0x47, 0x3d, 0x30, + 0x77, 0xad, 0x3b, 0xe0, 0xa8, 0x0e, 0xbc, 0xec, 0x36, 0xd1, 0x3c, 0xe3, 0x01, + 0x8f, 0xbd, 0x56, 0x6c, 0x34, 0xbd, 0x8a, 0x99, 0x20, 0xbb, 0xb1, 0x89, 0x12, + 0x3d, 0xea, 0x43, 0x39, 0xbd, 0x26, 0x16, 0xd2, 0x3c, 0xe2, 0x88, 0xc8, 0x3c, + 0x63, 0x15, 0xa0, 0x3c, 0x8d, 0x95, 0x3a, 0x3d, 0x86, 0x69, 0x26, 0xbd, 0x4c, + 0x38, 0xdb, 0x3b, 0xe0, 0xfa, 0x49, 0x3d, 0x62, 0xdf, 0xb4, 0xbc, 0x6a, 0xe4, + 0x89, 0xbc, 0x63, 0x50, 0x6d, 0x3d, 0xfa, 0x35, 0x46, 0xbd, 0xcb, 0xcb, 0x8c, + 0xbc, 0x46, 0x94, 0x66, 0x3d, 0xdd, 0xf8, 0xa2, 0xbc, 0x00, 0x34, 0x8c, 0x3d, + 0x0a, 0xa1, 0x05, 0x3d, 0x73, 0x92, 0x91, 0xbd, 0x64, 0x3e, 0xf4, 0xbc, 0xcd, + 0x5a, 0xa4, 0xbc, 0xe6, 0xce, 0x4b, 0x3d, 0x68, 0xb0, 0xcf, 0xbc, 0x38, 0xd3, + 0xe2, 0x3b, 0xfd, 0x03, 0x38, 0xbd, 0x11, 0xc0, 0x92, 0xbd, 0xa8, 0x82, 0x50, + 0x3d, 0x2a, 0x9a, 0xaf, 0xbc, 0x0e, 0xea, 0x7b, 0x3d, 0x11, 0xf4, 0x95, 0xbc, + 0x34, 0xed, 0xb6, 0x3c, 0x2b, 0x26, 0x6f, 0xbd, 0x15, 0xad, 0x7c, 0x3d, 0x19, + 0xc6, 0xed, 0x3c, 0x00, 0xf8, 0x81, 0xbd, 0x74, 0x82, 0x63, 0xbd, 0x62, 0x76, + 0x53, 0xbd, 0x48, 0x4f, 0x78, 0x3d, 0x76, 0x0e, 0x5c, 0xbb, 0x24, 0x30, 0x30, + 0xbd, 0x86, 0x0a, 0x14, 0x3d, 0x08, 0x29, 0xb3, 0xbc, 0xef, 0x7c, 0x2a, 0xbd, + 0x90, 0xb8, 0x09, 0x3d, 0x47, 0x45, 0x66, 0xbc, 0x30, 0x23, 0xb7, 0xbc, 0x8f, + 0xd2, 0x5e, 0x3d, 0x31, 0x72, 0x33, 0x3d, 0x26, 0xdc, 0x88, 0xbd, 0xeb, 0x0b, + 0x24, 0xbc, 0x14, 0x3c, 0xe9, 0xbc, 0x38, 0xc6, 0xd3, 0x3c, 0x55, 0xd6, 0x09, + 0xbd, 0xe5, 0xf7, 0x21, 0xbb, 0x7d, 0x03, 0x0d, 0x3d, 0xe9, 0x91, 0xd6, 0xbb, + 0x00, 0x90, 0xe4, 0x3a, 0x21, 0x2c, 0x1a, 0x3d, 0x0c, 0xe1, 0x82, 0x3c, 0x0a, + 0xb6, 0x38, 0x3d, 0x6c, 0x03, 0xe9, 0x3c, 0x83, 0x86, 0x05, 0x3d, 0x01, 0x6e, + 0x86, 0x3d, 0x99, 0xc2, 0x47, 0xbd, 0x27, 0x07, 0x57, 0x3d, 0xed, 0xd2, 0x59, + 0x3d, 0x0f, 0xa1, 0x0a, 0xbc, 0x12, 0x62, 0x6c, 0x3d, 0x16, 0x50, 0xf8, 0x3b, + 0x00, 0xf3, 0xdc, 0x3c, 0x5c, 0x4e, 0xa6, 0xbc, 0xfa, 0x73, 0x42, 0x3c, 0xd2, + 0x38, 0x8a, 0xbd, 0x35, 0x94, 0x8d, 0xbc, 0x69, 0x22, 0x3e, 0xbd, 0x83, 0xec, + 0x6f, 0xbc, 0xb6, 0x37, 0xb4, 0x3c, 0xf1, 0xa7, 0x83, 0x3d, 0x62, 0xbc, 0x82, + 0x3d, 0x88, 0x5d, 0xb8, 0xbc, 0xdd, 0x4d, 0x96, 0xbc, 0xaa, 0x38, 0x23, 0xbd, + 0x88, 0x3f, 0x4d, 0xbc, 0xc5, 0x2d, 0xfc, 0x3c, 0x78, 0x63, 0x20, 0x3d, 0xe5, + 0x87, 0x88, 0x3d, 0x08, 0xed, 0x77, 0xbc, 0x38, 0xef, 0x85, 0xbc, 0x19, 0xc5, + 0x90, 0x3d, 0xba, 0xc7, 0x4e, 0x3d, 0xe4, 0xc2, 0xd6, 0x3c, 0xac, 0x97, 0x22, + 0xbc, 0xa4, 0x4d, 0x55, 0xbd, 0x02, 0x71, 0x8b, 0xbd, 0xce, 0x55, 0x86, 0x3d, + 0xf9, 0x00, 0x9c, 0xbc, 0xbc, 0x84, 0x51, 0x3d, 0x3c, 0xaa, 0x21, 0xbd, 0xb3, + 0x0f, 0x43, 0xbd, 0x15, 0x2e, 0x90, 0xbd, 0xa9, 0x5c, 0x7a, 0x3d, 0x11, 0x1e, + 0x4b, 0x3d, 0xc7, 0x35, 0xc9, 0xbc, 0x86, 0x61, 0x77, 0xbd, 0x5c, 0xbb, 0x21, + 0xbc, 0x39, 0x3c, 0x6d, 0x3d, 0xaa, 0xde, 0xdd, 0x3a, 0xe5, 0xad, 0x0b, 0xbd, + 0xd5, 0x2c, 0x8f, 0xbd, 0x9b, 0xd2, 0x40, 0xbc, 0xae, 0xd1, 0x27, 0x3d, 0xa4, + 0x43, 0x61, 0x3c, 0x96, 0x2f, 0x26, 0xbd, 0x4c, 0xdb, 0x50, 0xbd, 0xd0, 0xee, + 0x55, 0xbc, 0xa9, 0xdf, 0x62, 0x3d, 0xa9, 0xc7, 0x14, 0xbd, 0x02, 
0x65, 0x41, + 0x3b, 0xdc, 0x7c, 0x20, 0x3c, 0xb5, 0xb9, 0x89, 0x3d, 0x43, 0xc8, 0x8f, 0xbd, + 0xe5, 0x6b, 0x3e, 0x3c, 0xcb, 0x96, 0x8d, 0xbd, 0xe8, 0x9b, 0x7d, 0xbd, 0xad, + 0x41, 0x91, 0x3d, 0x84, 0x7b, 0xc2, 0x3c, 0xe9, 0xf8, 0x8c, 0x3c, 0x6d, 0x06, + 0xf1, 0xbb, 0xac, 0xcc, 0x43, 0x3d, 0x11, 0xd2, 0xe3, 0x3c, 0x69, 0xb6, 0x76, + 0xbc, 0x19, 0x3b, 0x71, 0xbd, 0x82, 0x8a, 0xb9, 0xbc, 0x28, 0x56, 0x3a, 0x3d, + 0xf6, 0x2b, 0x3c, 0x3d, 0x0f, 0x6e, 0xe1, 0xbb, 0x96, 0x11, 0x84, 0xbc, 0xae, + 0xf7, 0x81, 0x3d, 0xd2, 0xd1, 0x80, 0x3d, 0x97, 0xc3, 0xe6, 0xbc, 0x89, 0xe2, + 0x57, 0x3c, 0x3d, 0x6e, 0x8e, 0xbc, 0xca, 0x02, 0x4d, 0xbd, 0x62, 0x3c, 0xc1, + 0xbc, 0x16, 0x10, 0xed, 0xba, 0x3f, 0xe1, 0xef, 0x3c, 0x0a, 0x5c, 0xab, 0xbc, + 0x21, 0xad, 0xd1, 0xbb, 0xbc, 0xfe, 0x32, 0x3c, 0xac, 0x6c, 0x71, 0xbd, 0x15, + 0x98, 0x14, 0x3d, 0xb6, 0xee, 0x3a, 0x3c, 0x35, 0x4c, 0x87, 0x3d, 0xb6, 0xcd, + 0x4c, 0x3d, 0x10, 0xf7, 0xcc, 0x3b, 0xdb, 0x8a, 0x19, 0xbd, 0x00, 0x38, 0xdb, + 0xb8, 0xb3, 0x1b, 0x8e, 0xbd, 0x50, 0xa8, 0x41, 0xbd, 0x64, 0x53, 0x85, 0xbd, + 0x46, 0xcf, 0xcd, 0xbb, 0x65, 0xaf, 0xa4, 0x3c, 0x78, 0x82, 0x22, 0xbd, 0xb1, + 0xb2, 0x19, 0xbd, 0xaa, 0x2b, 0xe5, 0xbc, 0xb8, 0x9c, 0x3d, 0x3d, 0x30, 0x82, + 0x8c, 0x3c, 0xd9, 0x2c, 0x89, 0xbd, 0x27, 0x33, 0x8f, 0x3d, 0x20, 0x09, 0x87, + 0x3d, 0x50, 0x15, 0x05, 0xbd, 0x4b, 0xc1, 0x96, 0xbd, 0x82, 0x2a, 0x33, 0x3d, + 0xc1, 0x9b, 0x6c, 0xbd, 0xac, 0x51, 0x0c, 0xbd, 0xd7, 0xbc, 0x59, 0xbd, 0x69, + 0x2b, 0x37, 0x3c, 0xc0, 0xef, 0x26, 0xbd, 0xc8, 0xba, 0x59, 0x3c, 0xda, 0x1b, + 0x18, 0xbd, 0x11, 0xfb, 0x8b, 0x3d, 0xbf, 0xc8, 0x3d, 0xbd, 0x52, 0x1b, 0x00, + 0x3d, 0xe8, 0x9d, 0x4d, 0xba, 0xe4, 0x9d, 0x44, 0x3d, 0x87, 0x63, 0x06, 0xbd, + 0x76, 0xc3, 0x83, 0x3d, 0x32, 0xe3, 0x84, 0xbd, 0x5a, 0x34, 0x11, 0x3d, 0xe0, + 0xb2, 0x0e, 0xbd, 0xa8, 0x02, 0x8a, 0xbd, 0x9c, 0x92, 0x10, 0x3d, 0x47, 0xfd, + 0x90, 0xbd, 0x24, 0x45, 0x3c, 0x3d, 0x67, 0x62, 0x96, 0xbd, 0xbb, 0x91, 0x79, + 0xbd, 0x80, 0x99, 0x5b, 0xbd, 0x93, 0x7f, 0x83, 0xbd, 0x75, 0x82, 0x10, 0xbd, + 0x07, 0xb0, 0xa7, 0xbb, 0x5b, 0x41, 0x66, 0xbd, 0x82, 0xeb, 0x7a, 0xbc, 0x52, + 0xca, 0x57, 0xbd, 0x7e, 0xe3, 0x66, 0x3c, 0xab, 0x22, 0x68, 0xbd, 0x51, 0x4b, + 0xa9, 0xbc, 0x5e, 0x13, 0xa7, 0xbc, 0xe3, 0x6b, 0x88, 0xbb, 0x80, 0x4c, 0x02, + 0x3d, 0xf3, 0x3c, 0x59, 0xbd, 0xb2, 0x10, 0x7e, 0x3d, 0x1a, 0x9d, 0x13, 0xbd, + 0x8d, 0xd0, 0x5b, 0x3d, 0xca, 0x7a, 0x74, 0x3d, 0x16, 0x53, 0x4b, 0x3d, 0xc9, + 0x0a, 0x89, 0xbd, 0x44, 0x7e, 0x1b, 0xbc, 0x11, 0xca, 0xb2, 0xbc, 0x09, 0xe0, + 0x27, 0xbd, 0xe4, 0xed, 0xfb, 0x3c, 0xe4, 0x1a, 0xf9, 0xbc, 0x50, 0x47, 0x2e, + 0x3d, 0x1b, 0xed, 0x4e, 0x3d, 0x6d, 0x7c, 0x81, 0xbd, 0x72, 0x2a, 0xdc, 0xbc, + 0x6f, 0xa7, 0x59, 0x3d, 0xc0, 0xbd, 0x1e, 0xbc, 0xb2, 0xaf, 0xb9, 0xbc, 0x07, + 0x39, 0xba, 0xbc, 0xf4, 0x63, 0x46, 0xbd, 0x45, 0x7b, 0x1a, 0x3d, 0x79, 0xe9, + 0xf7, 0x3c, 0x9e, 0xba, 0xf0, 0xbc, 0xc1, 0x09, 0xbb, 0x3c, 0x0e, 0x21, 0x52, + 0xbc, 0xed, 0x78, 0x43, 0x3b, 0x73, 0x07, 0x62, 0x3d, 0x71, 0x92, 0x84, 0x3d, + 0x7b, 0x59, 0xb2, 0xbc, 0xe0, 0xba, 0x34, 0xbc, 0x0c, 0x23, 0x14, 0xbd, 0x93, + 0x93, 0x1f, 0xbd, 0xb7, 0x20, 0x6b, 0xbd, 0x8e, 0x60, 0x8c, 0xbd, 0x00, 0xe9, + 0x8c, 0x3d, 0xdf, 0xb4, 0xe1, 0xbb, 0xa0, 0x1a, 0xbf, 0xbc, 0xf6, 0x4c, 0x80, + 0x3c, 0x74, 0xeb, 0x18, 0x3d, 0x28, 0x64, 0x8c, 0x3c, 0xba, 0xbd, 0xd3, 0xbc, + 0x56, 0xc0, 0x6f, 0x3d, 0x09, 0x02, 0x88, 0xbd, 0x02, 0xd5, 0x58, 0x3d, 0xc1, + 0x57, 0x31, 0x3d, 0xfc, 0x52, 0x48, 0x3d, 0x61, 0xdc, 0x64, 0xbd, 0xa7, 0xc3, + 0x2b, 0x3d, 0x3b, 0xea, 0x13, 0xbc, 0x0e, 0xac, 0x3c, 0xbd, 0x7e, 0x92, 0x86, + 0x3c, 0xbf, 0x14, 
0x29, 0xbc, 0xf3, 0x91, 0x7f, 0x3d, 0xf1, 0x9a, 0xac, 0x3c, + 0xf8, 0xf5, 0x76, 0x3c, 0xa2, 0x0f, 0x86, 0xbd, 0xc3, 0xeb, 0xb7, 0x3a, 0xff, + 0x56, 0x6c, 0x3d, 0x1c, 0xcc, 0x5a, 0xbd, 0x97, 0x3f, 0x78, 0x3d, 0x92, 0xea, + 0x9d, 0xbc, 0xbc, 0x51, 0x6a, 0x3d, 0xc5, 0x44, 0x65, 0x3c, 0xbc, 0x66, 0x30, + 0x3d, 0x70, 0xe2, 0x26, 0xbd, 0x2e, 0xbe, 0x19, 0x3d, 0x5e, 0xf3, 0x82, 0x3d, + 0x32, 0x2f, 0x86, 0xbd, 0x53, 0x73, 0x81, 0x3d, 0x86, 0xef, 0xa2, 0xbc, 0xdb, + 0xda, 0x62, 0xbd, 0x82, 0x4e, 0xd3, 0xbc, 0x80, 0xed, 0x93, 0xba, 0x50, 0xc2, + 0xd6, 0x3b, 0x82, 0x22, 0xf1, 0xbc, 0x49, 0xd7, 0x7a, 0xbc, 0xe9, 0x00, 0x85, + 0x3d, 0xb7, 0x12, 0x4c, 0xbd, 0x90, 0x25, 0x08, 0xb9, 0x2e, 0x76, 0xcb, 0xbc, + 0x47, 0x11, 0x97, 0xbd, 0x06, 0x96, 0x2f, 0x3d, 0x44, 0x62, 0x65, 0x3d, 0xe7, + 0xa5, 0x1f, 0x3d, 0x2e, 0x9e, 0xbf, 0xbc, 0x00, 0xd8, 0x6c, 0xbc, 0x20, 0xd1, + 0x44, 0xbb, 0x19, 0x61, 0x32, 0x3c, 0xf4, 0x7a, 0x30, 0x3d, 0x11, 0x7b, 0xe4, + 0xbc, 0x6e, 0x1c, 0x50, 0x3b, 0x9b, 0x64, 0x64, 0xbd, 0x89, 0x52, 0x1f, 0x3d, + 0x65, 0x20, 0x2c, 0x3d, 0xb9, 0x45, 0xd7, 0x3c, 0xe8, 0x37, 0x8e, 0x3d, 0x40, + 0x5e, 0x50, 0x3c, 0x7a, 0x66, 0x68, 0xbd, 0x45, 0x1b, 0x31, 0xbd, 0xcb, 0x31, + 0x47, 0x3d, 0x2f, 0x4a, 0xb3, 0x3c, 0x97, 0x3d, 0xbc, 0xbc, 0x55, 0x24, 0x80, + 0xbd, 0x85, 0x56, 0x69, 0xbc, 0x0e, 0x0a, 0x34, 0x3d, 0xec, 0xe8, 0x54, 0xbd, + 0xeb, 0x92, 0x6d, 0xbd, 0xe2, 0x61, 0x41, 0x3c, 0xf3, 0x3c, 0x93, 0xbd, 0x10, + 0xea, 0xbd, 0xb7, 0x42, 0xec, 0x3b, 0xbd, 0x66, 0xe6, 0x80, 0xbd, 0x84, 0xd9, + 0x85, 0x3d, 0x2c, 0xd8, 0xac, 0x3c, 0x72, 0x8e, 0x48, 0x3c, 0x11, 0xa8, 0x9c, + 0xbc, 0x08, 0x31, 0x39, 0x3d, 0x0f, 0x3c, 0x7c, 0x3d, 0x58, 0xba, 0x25, 0x3d, + 0xce, 0x5f, 0x27, 0x3c, 0x7c, 0x7b, 0x65, 0x3d, 0x96, 0xd6, 0x1e, 0x3d, 0x48, + 0x03, 0x73, 0xbd, 0x84, 0x7a, 0x26, 0xbd, 0x92, 0x82, 0x72, 0xbd, 0xeb, 0x8a, + 0x0c, 0xbd, 0x84, 0xe7, 0x5f, 0xbd, 0x0b, 0x83, 0xfc, 0x3c, 0xfb, 0xed, 0x8e, + 0xbd, 0x52, 0xe2, 0x65, 0x3d, 0xd1, 0xa1, 0x4e, 0xbb, 0x5f, 0x41, 0xce, 0xbc, + 0x4b, 0x3d, 0x15, 0xbb, 0x20, 0xc8, 0x90, 0xbd, 0x29, 0xfb, 0x28, 0xbd, 0x04, + 0x06, 0x8a, 0xbd, 0x8a, 0x65, 0x30, 0x3d, 0x00, 0x49, 0x93, 0x3a, 0x6e, 0xb0, + 0x61, 0x3d, 0x94, 0xcc, 0x87, 0xbc, 0x10, 0x13, 0x3a, 0x3d, 0x5a, 0x7e, 0x7f, + 0xbd, 0x4c, 0x1f, 0xd7, 0xbc, 0x82, 0xb3, 0x1e, 0x3d, 0x7e, 0xca, 0x00, 0xbc, + 0xe7, 0x69, 0xe4, 0xbb, 0xd5, 0xad, 0x1f, 0x3d, 0xb6, 0x02, 0x72, 0x3d, 0x4b, + 0x4f, 0x91, 0xbc, 0x69, 0xd1, 0xd2, 0xbc, 0xf4, 0x42, 0xce, 0x3c, 0xf9, 0x95, + 0x8f, 0x3d, 0x5f, 0xd1, 0x52, 0x3c, 0xec, 0xd5, 0x67, 0x3d, 0x79, 0x25, 0x84, + 0xba, 0xf3, 0x43, 0x5f, 0x3d, 0x39, 0xdc, 0x2b, 0x3d, 0xc6, 0x40, 0x67, 0xbd, + 0xbb, 0xfa, 0x02, 0xbd, 0xf6, 0x13, 0x31, 0xbc, 0x1a, 0x8a, 0x5b, 0x3d, 0x28, + 0x8c, 0x3d, 0xba, 0xbd, 0x41, 0x46, 0x3d, 0xc8, 0xb7, 0x80, 0xbb, 0xd7, 0xc5, + 0x71, 0x3b, 0x2a, 0x9d, 0x51, 0xbd, 0xfb, 0xe8, 0x66, 0xbd, 0x49, 0x55, 0xad, + 0xbc, 0x80, 0x74, 0x36, 0xbd, 0x00, 0x48, 0xc7, 0xbc, 0xec, 0x9e, 0xf8, 0x3c, + 0x2d, 0x31, 0x7e, 0x3d, 0x5d, 0xdd, 0x94, 0xbd, 0xfd, 0xce, 0x57, 0x3d, 0xe2, + 0x28, 0x0b, 0xbc, 0x00, 0xec, 0x38, 0x3d, 0x88, 0x2f, 0xc9, 0xbc, 0xe8, 0x5d, + 0x69, 0x3d, 0xd8, 0x1a, 0x04, 0xbc, 0xa5, 0x91, 0x78, 0x3d, 0x4f, 0x30, 0x06, + 0xbc, 0xdf, 0x59, 0x51, 0x3d, 0x00, 0xb6, 0x8f, 0x3a, 0x9f, 0x7e, 0x76, 0xbd, + 0x66, 0xc5, 0x1d, 0x3d, 0x99, 0x26, 0x91, 0xbd, 0x82, 0x51, 0x8e, 0xbd, 0xf6, + 0xf9, 0x81, 0xbc, 0x60, 0x4a, 0x9d, 0x3c, 0x40, 0xfa, 0xf8, 0xbb, 0x96, 0x7a, + 0xf4, 0xbb, 0x8d, 0xfb, 0x02, 0xbd, 0xf0, 0xf1, 0xa8, 0x3c, 0xc9, 0xa7, 0x38, + 0xbd, 0x85, 0xc8, 0x4b, 0xbc, 0xc8, 0x56, 0x13, 
0x3d, 0x61, 0x4d, 0x88, 0xbd, + 0x4e, 0xe1, 0x42, 0x3d, 0xec, 0x20, 0x7c, 0xbc, 0x49, 0x1c, 0x91, 0x3d, 0x40, + 0xea, 0x8d, 0xbd, 0x90, 0xa9, 0x5b, 0xbd, 0xe1, 0x98, 0x8e, 0xbd, 0x2f, 0x06, + 0xed, 0xbc, 0xa9, 0xa1, 0xe0, 0x3c, 0x54, 0xa1, 0x76, 0xbd, 0x21, 0x88, 0x70, + 0xbd, 0x16, 0x25, 0x23, 0xbd, 0xb6, 0xdf, 0x4f, 0x3d, 0xaf, 0x39, 0x57, 0x3d, + 0x3f, 0xfa, 0x2a, 0xbd, 0xda, 0x39, 0xcf, 0x3c, 0xf6, 0x8b, 0x5e, 0x3d, 0x49, + 0x9e, 0xec, 0xbc, 0x5c, 0x6b, 0x7f, 0x3d, 0x38, 0xf8, 0x8a, 0xbc, 0x15, 0xc8, + 0x8a, 0xbd, 0xc9, 0xb5, 0x3f, 0x3d, 0x1c, 0xcd, 0x97, 0xbd, 0x3c, 0xa4, 0xb0, + 0xba, 0x85, 0x05, 0x18, 0xbc, 0x0b, 0xf9, 0x81, 0xbd, 0xa7, 0x64, 0x84, 0xbc, + 0x17, 0xa4, 0x86, 0x3d, 0x74, 0xbc, 0x6d, 0xbd, 0xbe, 0xaa, 0xe0, 0x3c, 0x70, + 0x71, 0x01, 0x3d, 0x34, 0x7c, 0x3b, 0x3d, 0xf7, 0xe5, 0x4a, 0x3d, 0x0b, 0x8a, + 0xe2, 0x3c, 0x3a, 0xce, 0x8c, 0xbd, 0xc3, 0x45, 0x17, 0xbc, 0x06, 0x14, 0x40, + 0xbd, 0xc8, 0x4e, 0x2a, 0x3d, 0x1e, 0x87, 0x38, 0x3d, 0x12, 0xe6, 0x8e, 0x3d, + 0x5d, 0x26, 0x24, 0xbc, 0x96, 0x16, 0x0e, 0xbb, 0xbd, 0x7b, 0xe7, 0xbb, 0xee, + 0xf1, 0x86, 0xbc, 0x21, 0x44, 0xe1, 0xba, 0x34, 0xc7, 0x76, 0xbd, 0x84, 0x41, + 0x0f, 0xba, 0x79, 0x2a, 0x77, 0x3d, 0xe0, 0x52, 0xce, 0x3c, 0xd3, 0xbd, 0x0c, + 0x3d, 0xff, 0x57, 0x8b, 0x3d, 0xc6, 0x60, 0xed, 0x3b, 0xfc, 0x72, 0x7f, 0xbd, + 0x18, 0xaa, 0x20, 0x3c, 0xcd, 0x28, 0x0d, 0x3d, 0x18, 0xf7, 0xdb, 0x3a, 0xd6, + 0x93, 0x6a, 0x3d, 0x46, 0x48, 0x55, 0xbd, 0x01, 0x2f, 0x7c, 0x3d, 0x75, 0x2d, + 0x80, 0x3c, 0x4c, 0x22, 0xd0, 0x3c, 0x17, 0x6d, 0x8b, 0xbb, 0x34, 0x25, 0xec, + 0xbc, 0x04, 0x8e, 0x56, 0x3d, 0xd8, 0xab, 0x88, 0x3d, 0x20, 0x51, 0x88, 0xbc, + 0x71, 0xdb, 0xd4, 0x3c, 0x41, 0xe5, 0x03, 0xbd, 0x28, 0x8d, 0x0c, 0x3c, 0xa1, + 0xe2, 0x7d, 0xbd, 0x10, 0xb2, 0xcd, 0x3c, 0x3b, 0xa9, 0xdf, 0xbc, 0x2d, 0x71, + 0x73, 0x3d, 0xfa, 0xcb, 0xd3, 0x3c, 0xb4, 0x04, 0x10, 0xbb, 0xca, 0xec, 0x8c, + 0xbd, 0xd1, 0x28, 0x9a, 0x3c, 0x0f, 0x12, 0x2f, 0x3d, 0x93, 0x67, 0x2a, 0x3d, + 0x94, 0x98, 0xb7, 0x3c, 0x8e, 0x0f, 0xae, 0xbc, 0xc6, 0x7c, 0xd9, 0x3c, 0xa0, + 0x4d, 0x3b, 0xbb, 0x20, 0xf7, 0xd5, 0x3c, 0x7b, 0xa2, 0x72, 0xbd, 0xc5, 0xb9, + 0xbd, 0x3c, 0x59, 0x61, 0x1e, 0x3d, 0x8b, 0x95, 0x8c, 0xbd, 0xbe, 0xbf, 0x9b, + 0xbc, 0x0f, 0x63, 0x7b, 0x3d, 0x92, 0x1a, 0x66, 0x3c, 0x4f, 0xef, 0xa0, 0x38, + 0x8c, 0x24, 0xd9, 0xbc, 0x7d, 0xfa, 0xf8, 0xbc, 0xde, 0xe7, 0x85, 0x3d, 0xa2, + 0xd6, 0x13, 0xbd, 0x5e, 0x38, 0x3d, 0xbd, 0xe7, 0x7e, 0xb0, 0x3d, 0xc5, 0x86, + 0xba, 0xbc, 0x49, 0x12, 0x93, 0xbd, 0x8e, 0x9e, 0xea, 0x3d, 0x48, 0x93, 0x84, + 0xbd, 0x33, 0x48, 0xc7, 0xbc, 0x23, 0x1f, 0x5f, 0x3d, 0x51, 0x20, 0xb5, 0xbb, + 0x93, 0xfa, 0x90, 0x3d, 0x99, 0xe1, 0x31, 0xbd, 0x82, 0x3e, 0x89, 0xbd, 0x99, + 0x5e, 0xe0, 0xbc, 0x0c, 0xc2, 0x03, 0x3d, 0xe2, 0x69, 0xb2, 0x3c, 0x3d, 0xdb, + 0x6e, 0xbd, 0x37, 0xd2, 0x36, 0x3c, 0x89, 0x66, 0x1e, 0xbd, 0xeb, 0x8a, 0x88, + 0x3d, 0x1a, 0x34, 0x3d, 0x3d, 0x84, 0x3a, 0x24, 0x3d, 0x2f, 0xd2, 0x78, 0xbd, + 0x45, 0x13, 0x82, 0x3d, 0x70, 0x07, 0x94, 0x3d, 0xf9, 0xc5, 0x7f, 0xbd, 0x40, + 0x1b, 0x04, 0xbd, 0x74, 0x6f, 0x3a, 0x3d, 0xa0, 0x7d, 0xf8, 0xbc, 0x7e, 0x95, + 0x61, 0x3d, 0xc0, 0x56, 0x5d, 0x3b, 0x16, 0xa4, 0x06, 0x3d, 0x4b, 0x46, 0xbf, + 0xbd, 0x64, 0x97, 0xe8, 0xbc, 0x79, 0xbd, 0x75, 0x3a, 0x50, 0xb6, 0x6a, 0x3c, + 0x7b, 0xcc, 0x29, 0x3c, 0xa8, 0x8f, 0x17, 0x3d, 0xf0, 0xf6, 0xbc, 0x3b, 0x48, + 0x26, 0x78, 0xbd, 0x96, 0x9b, 0xe4, 0x3b, 0x87, 0xe5, 0x70, 0x3c, 0x88, 0xf2, + 0xac, 0xbb, 0x79, 0x75, 0x05, 0x3c, 0x06, 0x38, 0xa5, 0x3d, 0x8b, 0x4e, 0x0a, + 0x3d, 0xf9, 0x2d, 0x95, 0x3d, 0x08, 0xca, 0x7f, 0x3d, 0xc7, 0x5e, 0x1c, 0x3d, + 
0xf2, 0xbc, 0x57, 0xbc, 0xc6, 0xaf, 0x5a, 0xbd, 0x7f, 0xc5, 0xc7, 0x3c, 0x69, + 0x5c, 0x00, 0x3c, 0x69, 0xaf, 0x8a, 0x3d, 0x60, 0x07, 0x01, 0x3d, 0xc3, 0x8f, + 0xff, 0x3a, 0xd5, 0x44, 0x1d, 0x3d, 0x66, 0x63, 0x2a, 0xbd, 0xe9, 0xd3, 0x9a, + 0xbd, 0x50, 0xc0, 0x0a, 0xbd, 0x32, 0x2d, 0xc6, 0xbc, 0xf0, 0xb1, 0xd4, 0xbb, + 0x48, 0xcc, 0xdc, 0x3a, 0xcd, 0x33, 0x6f, 0x3d, 0xea, 0x34, 0x95, 0xbd, 0xb8, + 0x4b, 0x2f, 0xbc, 0xe0, 0xa1, 0x0f, 0xbc, 0x0f, 0xee, 0x01, 0x3c, 0x5e, 0x3d, + 0x35, 0x3d, 0x6e, 0x51, 0x81, 0xbd, 0xfa, 0x8d, 0x8b, 0x3c, 0x51, 0xc5, 0x0a, + 0x3d, 0x8a, 0xa8, 0xc4, 0xbc, 0x66, 0x86, 0x19, 0xbd, 0x50, 0x08, 0x8e, 0x3d, + 0x22, 0x74, 0xdd, 0x3b, 0xdb, 0xf4, 0xea, 0x3a, 0xa1, 0x2d, 0x68, 0x3d, 0x7e, + 0x82, 0xc6, 0x3d, 0xe6, 0x89, 0x16, 0xbd, 0xe2, 0x72, 0x78, 0xbd, 0x25, 0xe0, + 0x82, 0xbd, 0xc2, 0x61, 0x66, 0x3c, 0xb2, 0x57, 0x66, 0x3d, 0x47, 0xa3, 0x40, + 0xbc, 0xf7, 0x00, 0x3e, 0xbd, 0x78, 0x7e, 0x42, 0x3d, 0xc3, 0x09, 0x83, 0x3d, + 0x1d, 0xac, 0x09, 0x3d, 0x37, 0xc0, 0xd7, 0x3b, 0xae, 0xbb, 0x34, 0xbd, 0x12, + 0x34, 0x95, 0x3d, 0xf8, 0x3f, 0x20, 0x3d, 0xa8, 0x30, 0x0b, 0xbd, 0x09, 0x71, + 0x02, 0xbd, 0xb7, 0xbc, 0x80, 0x3d, 0x9e, 0x24, 0x48, 0x3d, 0xbb, 0xe7, 0xa6, + 0x3d, 0x59, 0xd4, 0x28, 0xbd, 0x98, 0x85, 0x14, 0xbc, 0x25, 0xbe, 0xae, 0x3c, + 0x1b, 0x82, 0x85, 0x3c, 0x6c, 0x23, 0xc3, 0x3c, 0x7a, 0xe2, 0x03, 0xbd, 0x75, + 0x65, 0x3a, 0x3d, 0x9e, 0x34, 0x76, 0x3b, 0xe1, 0x36, 0x05, 0x3d, 0xd6, 0x9a, + 0x37, 0xbd, 0x66, 0x1c, 0x99, 0x3c, 0x9d, 0x65, 0x2a, 0xbd, 0xc3, 0xdd, 0x60, + 0xbc, 0x6c, 0xa8, 0x06, 0xbd, 0xb8, 0xb4, 0x85, 0xbd, 0xca, 0x5d, 0x65, 0x3c, + 0xe2, 0xce, 0xfa, 0x3c, 0x18, 0xe2, 0x29, 0x3d, 0x4a, 0xd0, 0x31, 0xbc, 0x78, + 0xd4, 0x52, 0x3d, 0x7a, 0x03, 0x47, 0x3d, 0x0e, 0x3a, 0xde, 0xbc, 0xd1, 0x1c, + 0x72, 0xbd, 0x39, 0xb2, 0x8c, 0xbd, 0x1a, 0x1c, 0xba, 0xbd, 0x20, 0x30, 0x5e, + 0x3b, 0x4b, 0x1f, 0x40, 0xbc, 0x70, 0x8b, 0xbd, 0x3c, 0x02, 0x15, 0x12, 0xbd, + 0x92, 0x7d, 0x52, 0xbd, 0x98, 0x66, 0x78, 0xbc, 0x73, 0x75, 0x74, 0x3d, 0x91, + 0x42, 0x88, 0x3d, 0x8a, 0x00, 0x26, 0xbd, 0xca, 0xd7, 0x86, 0x3d, 0xea, 0xcb, + 0x66, 0xbd, 0xb8, 0x28, 0x26, 0x3c, 0xd5, 0x36, 0x90, 0xbd, 0xfa, 0x19, 0x5a, + 0x3d, 0xb2, 0x02, 0x81, 0xbd, 0xe3, 0x63, 0x8d, 0x3d, 0xad, 0x2e, 0x0e, 0x3d, + 0x01, 0x74, 0x4b, 0xbd, 0xa3, 0x91, 0x08, 0x3d, 0x6d, 0xa0, 0x23, 0xbd, 0x84, + 0xbd, 0x0a, 0xbd, 0x28, 0x54, 0x95, 0xba, 0x1c, 0x4a, 0x2f, 0x3d, 0xf0, 0x67, + 0xaf, 0xbc, 0xcc, 0x1e, 0x18, 0x3d, 0xd5, 0xf0, 0x29, 0x3d, 0xd9, 0x19, 0x0a, + 0xbc, 0x91, 0xf8, 0x1c, 0xbc, 0xf0, 0x4b, 0x1a, 0x3d, 0xc8, 0xdc, 0x52, 0xbc, + 0x65, 0x2b, 0x6c, 0xbd, 0x9f, 0x08, 0x9a, 0xbd, 0x11, 0xd4, 0x9e, 0xbc, 0xb0, + 0xa3, 0x0d, 0x3c, 0x20, 0x50, 0xd7, 0x3c, 0x65, 0xfc, 0xb7, 0xbc, 0x43, 0xf5, + 0x0d, 0xbd, 0xb9, 0x3c, 0x2a, 0x3d, 0x66, 0xb3, 0x5b, 0x3d, 0x6d, 0x26, 0xa0, + 0x3d, 0x3a, 0xc0, 0x15, 0xbb, 0x67, 0x1b, 0x0b, 0x3c, 0x20, 0x72, 0xa6, 0xbd, + 0xe2, 0x14, 0xa5, 0xbc, 0x37, 0x10, 0x92, 0x3d, 0x24, 0x2d, 0x1c, 0x3d, 0x47, + 0xbd, 0x2b, 0xbd, 0x68, 0x0f, 0xa5, 0x3d, 0x96, 0x58, 0x98, 0x3d, 0x25, 0x20, + 0xd3, 0x3b, 0xc2, 0x1b, 0xbd, 0x3d, 0x17, 0x2a, 0xa5, 0xbb, 0x34, 0x7e, 0x47, + 0x3d, 0x36, 0xb6, 0xd0, 0x3b, 0x6a, 0xba, 0xf3, 0x3c, 0x54, 0x95, 0x25, 0xbd, + 0x99, 0x51, 0x81, 0x3d, 0xe6, 0x1b, 0x20, 0xbc, 0x2e, 0xc2, 0x3b, 0xbd, 0xb8, + 0xa6, 0x17, 0xbd, 0x86, 0x1f, 0xd7, 0x3c, 0x60, 0x69, 0x8d, 0x3d, 0x00, 0x02, + 0x76, 0xbd, 0x86, 0xdb, 0x85, 0x3b, 0x52, 0xb1, 0xd7, 0x3d, 0x7c, 0xd1, 0x4f, + 0xbd, 0xb0, 0xe7, 0x13, 0xbd, 0xee, 0xe2, 0x0f, 0x3d, 0x2e, 0x0a, 0x11, 0xbd, + 0x59, 0x7e, 0x04, 0xbd, 0xf1, 
0xdf, 0x10, 0xbc, 0x9f, 0xfd, 0x90, 0xbc, 0x0a, + 0xec, 0x47, 0x3c, 0x9b, 0x06, 0x5a, 0x3d, 0x0e, 0xe3, 0xee, 0xbc, 0x3b, 0xbf, + 0xc7, 0x3b, 0x1e, 0xc7, 0x17, 0xbd, 0x65, 0x6d, 0x75, 0x3c, 0x81, 0x92, 0xc3, + 0x3c, 0xee, 0x48, 0x9e, 0x3c, 0x6d, 0x2e, 0x4f, 0xbd, 0x42, 0x85, 0x64, 0xbd, + 0xe9, 0x0a, 0xbb, 0xbc, 0x73, 0x3f, 0x40, 0xbd, 0xbd, 0x8c, 0xae, 0x3b, 0x4a, + 0xae, 0x31, 0x3d, 0x9e, 0x39, 0xfd, 0x3c, 0xd7, 0x4e, 0xe0, 0xbd, 0xf6, 0x05, + 0x05, 0xbd, 0xbf, 0x61, 0x31, 0x3c, 0xba, 0x2f, 0x51, 0x3d, 0x16, 0xef, 0xdd, + 0x3c, 0x23, 0x64, 0x18, 0x3c, 0x44, 0x4b, 0xce, 0xbc, 0x13, 0xbd, 0xd7, 0xbc, + 0xc8, 0xc8, 0xb8, 0xbc, 0x76, 0x69, 0x19, 0xbd, 0x76, 0x51, 0x9c, 0xbd, 0xbe, + 0xbc, 0x7d, 0x3d, 0xa3, 0xa2, 0x74, 0x3d, 0xfe, 0xad, 0x06, 0x3c, 0x74, 0xb4, + 0x0f, 0x3b, 0x9f, 0x83, 0x8d, 0x3d, 0xa5, 0x84, 0x70, 0x3d, 0x99, 0xa1, 0xe6, + 0xbc, 0xf2, 0xf1, 0xbd, 0xbc, 0x29, 0xd8, 0x42, 0xbc, 0x48, 0xb0, 0xa7, 0x3c, + 0xce, 0x31, 0x0b, 0xbd, 0x8b, 0xef, 0x39, 0x3d, 0xc5, 0x28, 0xa4, 0x3c, 0xcd, + 0x1b, 0xb7, 0x3c, 0x3f, 0x50, 0x55, 0xbd, 0xf4, 0xa8, 0x9d, 0x3d, 0xe3, 0xdb, + 0xac, 0x3c, 0x5c, 0xae, 0x68, 0xbc, 0x8e, 0xf1, 0x0f, 0xbc, 0x17, 0x29, 0x87, + 0x3c, 0x19, 0x45, 0x23, 0xbd, 0xf0, 0x0f, 0x12, 0xbd, 0x06, 0x74, 0x8b, 0xbd, + 0x10, 0x65, 0x00, 0x3d, 0xa3, 0x9d, 0x8a, 0x3d, 0x1e, 0xf4, 0x3d, 0x3d, 0x4e, + 0x40, 0x7b, 0x3c, 0xa0, 0xc8, 0xf7, 0xbb, 0x2e, 0x19, 0x1a, 0xbc, 0x37, 0x47, + 0x36, 0xbd, 0x8b, 0x65, 0x6d, 0x3d, 0xc0, 0xcd, 0x21, 0xbd, 0x60, 0xb6, 0xa3, + 0xbb, 0xa9, 0x58, 0x42, 0xbc, 0x94, 0x1c, 0x73, 0xbd, 0x82, 0xa5, 0xad, 0xbc, + 0x51, 0xe5, 0xb5, 0x3d, 0xbd, 0xa1, 0x59, 0x3d, 0x13, 0x5b, 0xdb, 0xbc, 0x44, + 0xdc, 0xd3, 0xbc, 0xc8, 0x3f, 0xa5, 0x3d, 0x5d, 0x7c, 0x68, 0x3d, 0xcd, 0xb4, + 0xa7, 0xbc, 0x58, 0x2b, 0x48, 0x3d, 0xe6, 0x22, 0xf6, 0xbc, 0xde, 0x4b, 0x0b, + 0xbd, 0x71, 0x8f, 0x44, 0xbd, 0x8d, 0xa0, 0x17, 0xbd, 0xd3, 0xd3, 0x36, 0x3d, + 0x40, 0x04, 0x3c, 0xbd, 0x4a, 0xdf, 0x82, 0x3b, 0x23, 0x72, 0x20, 0x3d, 0xf5, + 0x84, 0x80, 0xbd, 0xf9, 0x1c, 0xf3, 0xbc, 0x84, 0xd9, 0x86, 0xbd, 0x28, 0x42, + 0x48, 0xbd, 0x90, 0xd7, 0x32, 0x3d, 0x80, 0x98, 0x01, 0xbc, 0x7f, 0x7a, 0x82, + 0xbd, 0x59, 0x12, 0xf3, 0x3c, 0x9b, 0x63, 0xaa, 0xbc, 0x5e, 0x84, 0xb5, 0xbd, + 0x95, 0x77, 0x90, 0x3d, 0xad, 0x26, 0xb4, 0xbd, 0xda, 0xfb, 0x0a, 0xbd, 0x44, + 0x70, 0x73, 0x3d, 0x70, 0x45, 0x41, 0x3d, 0xe6, 0x6b, 0x73, 0x3c, 0x93, 0x01, + 0x78, 0xbd, 0xc3, 0xda, 0xa2, 0x3d, 0x46, 0x41, 0x83, 0x3d, 0x16, 0x40, 0x32, + 0x3d, 0xa7, 0xfb, 0xa7, 0xbd, 0xc0, 0x57, 0x28, 0x3b, 0xd0, 0x2b, 0x84, 0xbc, + 0x85, 0x89, 0x88, 0x3d, 0xc4, 0xa3, 0x8f, 0xbc, 0xbb, 0xc6, 0x96, 0xbd, 0x7c, + 0xae, 0x36, 0xbd, 0xf8, 0x8b, 0x85, 0x3d, 0xfa, 0x35, 0xf5, 0x3c, 0xad, 0x86, + 0x63, 0xbc, 0x7c, 0xc1, 0x54, 0x3d, 0xad, 0xfc, 0x09, 0xbd, 0x3a, 0x1f, 0xf2, + 0x3c, 0xf4, 0x35, 0x65, 0x3c, 0xd0, 0x53, 0x38, 0xbd, 0x99, 0xf8, 0x36, 0x3d, + 0x95, 0xaf, 0x67, 0x3d, 0xd2, 0x76, 0x44, 0x3d, 0x03, 0x46, 0x82, 0x3d, 0xdc, + 0xe2, 0x53, 0xbd, 0x49, 0x59, 0x7b, 0xbd, 0x1c, 0x8b, 0xaf, 0x3a, 0x80, 0x30, + 0x27, 0xbd, 0xdb, 0x9c, 0x87, 0xbd, 0x8e, 0x09, 0x5c, 0x3d, 0x5e, 0x5d, 0x5d, + 0x3d, 0xcc, 0x97, 0xaa, 0xbb, 0x81, 0xe0, 0xb9, 0xbc, 0x61, 0x3a, 0x9a, 0x3b, + 0xc9, 0x99, 0x9f, 0x3d, 0x2d, 0x52, 0x10, 0xbd, 0x90, 0x0b, 0xa1, 0x3c, 0xaf, + 0x88, 0x81, 0xbd, 0xf4, 0x7a, 0x89, 0xbc, 0xb3, 0xe1, 0xc5, 0xbc, 0x8e, 0xe5, + 0x8a, 0xbd, 0x6d, 0xd9, 0x70, 0x3b, 0xdd, 0x1b, 0xa1, 0x3c, 0xdd, 0xeb, 0x42, + 0xbd, 0x01, 0xcb, 0xf2, 0x3c, 0x8e, 0x4f, 0xff, 0xbc, 0x28, 0x5e, 0x6a, 0xbc, + 0x3f, 0xff, 0x26, 0x3d, 0xc4, 0xfa, 0x87, 0xbc, 0xcb, 0x5e, 
0x32, 0xbd, 0x1f, + 0xb7, 0xd1, 0xbd, 0x40, 0xb6, 0x8b, 0x3c, 0x22, 0xf5, 0xa5, 0xbc, 0x5e, 0xa1, + 0xf7, 0xbc, 0x1a, 0x43, 0x11, 0x3d, 0xc9, 0xfe, 0x18, 0xbd, 0x34, 0x8b, 0x2f, + 0x3d, 0x2f, 0xe3, 0x8d, 0x3d, 0xaf, 0x7b, 0x69, 0xbd, 0x63, 0x9d, 0xac, 0x3d, + 0xce, 0x45, 0x50, 0xbd, 0xe1, 0x8f, 0x6b, 0xbd, 0x6e, 0xc6, 0x07, 0xbd, 0x58, + 0x1e, 0x12, 0x3c, 0x79, 0xdd, 0x06, 0x3d, 0xea, 0x26, 0x83, 0xbd, 0xaa, 0x63, + 0xce, 0x3d, 0x3a, 0xb3, 0x81, 0x3b, 0x35, 0x9a, 0xc6, 0x3c, 0x27, 0xc4, 0x59, + 0xbd, 0x74, 0x21, 0x30, 0x3d, 0xfe, 0x21, 0x8f, 0xbc, 0xb2, 0x86, 0x78, 0xbc, + 0xbb, 0x4f, 0xd7, 0xbd, 0xda, 0xfe, 0x2c, 0xbd, 0x7b, 0x99, 0x21, 0x3b, 0x61, + 0xe4, 0x68, 0xbd, 0x66, 0xfd, 0xb2, 0xba, 0xbe, 0x3d, 0x53, 0x3d, 0x53, 0x3f, + 0x5c, 0xbd, 0x5b, 0xf9, 0xc4, 0x3c, 0x1c, 0xa3, 0x6c, 0x3d, 0x61, 0x44, 0xfa, + 0x3c, 0x35, 0xb8, 0xd9, 0x3c, 0x6d, 0x40, 0xc8, 0xbc, 0xbf, 0x20, 0x2a, 0x3d, + 0x84, 0xbd, 0x80, 0x3c, 0x19, 0x27, 0x1c, 0x3d, 0xc8, 0xf0, 0x56, 0x3c, 0x74, + 0x85, 0x29, 0x3c, 0xce, 0x5a, 0x91, 0xbc, 0x1f, 0xc3, 0x89, 0xbc, 0x8a, 0xec, + 0x62, 0x3d, 0xd0, 0xc0, 0xd2, 0xbb, 0x29, 0x30, 0x36, 0x3d, 0x71, 0xd4, 0xaf, + 0x3c, 0x29, 0x52, 0xb9, 0xbc, 0x33, 0xc8, 0x2c, 0x3a, 0x97, 0x8e, 0x18, 0xbb, + 0xda, 0xa7, 0x28, 0xbd, 0xaf, 0x8c, 0xc1, 0xbc, 0x62, 0xbb, 0xc7, 0x3b, 0xda, + 0x12, 0xbb, 0xbc, 0x7a, 0xfb, 0x3a, 0xbd, 0x04, 0xc0, 0xe3, 0x3c, 0x0f, 0x84, + 0xdd, 0xbd, 0xa4, 0x83, 0x87, 0x3d, 0x38, 0x8b, 0x5f, 0xbd, 0x60, 0xb4, 0x98, + 0x3c, 0x99, 0xef, 0x5d, 0x3b, 0xda, 0x0b, 0x83, 0x3d, 0x49, 0xf9, 0x93, 0x3d, + 0xe4, 0x29, 0x51, 0xbd, 0x5e, 0x33, 0x4b, 0xbd, 0x7a, 0xc5, 0xd5, 0x3b, 0xc2, + 0xbc, 0x67, 0x3d, 0x89, 0xa1, 0x55, 0xbd, 0x91, 0x0f, 0x55, 0x3d, 0xf8, 0x89, + 0x82, 0xbd, 0x4c, 0xdc, 0xc6, 0xbc, 0xc9, 0xb0, 0x3e, 0xbd, 0x7c, 0x95, 0x25, + 0x3d, 0xa2, 0x9f, 0xe1, 0x3b, 0x17, 0xcf, 0x90, 0xbb, 0xd6, 0x9c, 0x47, 0x3b, + 0xf6, 0x12, 0x74, 0x3d, 0xba, 0x2e, 0xde, 0x3c, 0x3e, 0x06, 0x74, 0x3d, 0x32, + 0x23, 0x5e, 0xbc, 0x02, 0xf3, 0x88, 0xbd, 0x16, 0x5d, 0xdd, 0xbc, 0x50, 0x9b, + 0x0a, 0xbd, 0x8e, 0x56, 0xb9, 0xbc, 0xc8, 0x8b, 0x18, 0x3d, 0xfd, 0x15, 0x80, + 0x3d, 0x4c, 0x97, 0x5a, 0xbc, 0xe2, 0x63, 0xa4, 0xbc, 0xc3, 0x3d, 0x84, 0xbc, + 0x7e, 0xa2, 0x83, 0x3b, 0x6e, 0x8b, 0x4e, 0x3c, 0x24, 0xb4, 0xb3, 0xbb, 0x03, + 0x9e, 0xfd, 0x3b, 0xa4, 0x8b, 0x53, 0x3d, 0xbc, 0x81, 0x61, 0xbd, 0x59, 0xde, + 0x48, 0x3d, 0x21, 0x16, 0x61, 0xbd, 0x31, 0xbc, 0x1c, 0xbd, 0xfc, 0xe8, 0xf4, + 0x3c, 0x88, 0x36, 0x59, 0x3d, 0x12, 0x10, 0xf8, 0xbb, 0xe4, 0x7b, 0x5f, 0xbc, + 0xf0, 0x9d, 0x9e, 0x3c, 0xfb, 0x94, 0xdb, 0xbc, 0x54, 0x67, 0x65, 0xbc, 0x5e, + 0x6e, 0x3b, 0xbd, 0x12, 0x92, 0x59, 0x3c, 0xf3, 0x69, 0x8b, 0x3b, 0x78, 0x99, + 0xdd, 0x3c, 0x85, 0x31, 0x21, 0x3d, 0xe4, 0x6c, 0x33, 0x3d, 0x9c, 0x58, 0x87, + 0xbd, 0xd9, 0xf5, 0x31, 0xbc, 0xce, 0xac, 0xb9, 0x3d, 0x0e, 0x2c, 0x5c, 0x3d, + 0x6a, 0x94, 0xa9, 0x3d, 0x0e, 0xca, 0x4d, 0xbc, 0x68, 0x0f, 0x4d, 0xbd, 0xd5, + 0x31, 0xa6, 0xbc, 0xf1, 0xdc, 0x9b, 0x3d, 0x71, 0x4d, 0xfd, 0xbc, 0xcc, 0x43, + 0x1a, 0x3d, 0x1f, 0x4f, 0x51, 0x3d, 0xf0, 0x07, 0xa4, 0x3b, 0x1a, 0x75, 0x40, + 0x3d, 0xf6, 0xef, 0x13, 0x3d, 0x58, 0x08, 0x04, 0xbd, 0xf3, 0x55, 0x58, 0x3d, + 0x55, 0x7e, 0x6d, 0xbd, 0x96, 0x39, 0x78, 0xbd, 0x19, 0x7d, 0x7f, 0xbd, 0xc3, + 0x4a, 0x9a, 0xbd, 0x64, 0xad, 0x24, 0x3d, 0xc8, 0xab, 0x10, 0x3b, 0xa2, 0x7f, + 0x76, 0xbd, 0xdd, 0xb6, 0x2e, 0x3d, 0xdb, 0xbf, 0x88, 0x3d, 0x49, 0x2e, 0xbd, + 0xbb, 0xdb, 0xdc, 0x86, 0x3d, 0x06, 0xf9, 0x85, 0xbd, 0x3c, 0x44, 0x39, 0xbc, + 0x8b, 0x1c, 0x32, 0x3d, 0xf6, 0x3c, 0x7a, 0x3d, 0x68, 0x1f, 0x13, 0xbd, 0x1d, + 0x1c, 0xed, 
0x3c, 0xa8, 0x9b, 0x08, 0xbc, 0xe4, 0x25, 0xf6, 0xbc, 0xf6, 0xd8, + 0x19, 0xbd, 0x24, 0x39, 0x2f, 0xbd, 0x59, 0x25, 0x86, 0xbd, 0xbf, 0xf8, 0x78, + 0xbd, 0x33, 0xec, 0x93, 0xbd, 0x65, 0xdd, 0x55, 0xbd, 0x9d, 0x16, 0x05, 0xbd, + 0x69, 0xe6, 0x79, 0x3d, 0x64, 0xfd, 0xf0, 0xbc, 0xf7, 0xa3, 0x63, 0xbc, 0xb4, + 0x5f, 0xdb, 0xbc, 0x72, 0x22, 0x13, 0x3d, 0x0e, 0x28, 0x03, 0xbd, 0x64, 0x4b, + 0xad, 0x3c, 0xcb, 0x9c, 0x15, 0xbd, 0x58, 0x24, 0x55, 0x3d, 0x85, 0x90, 0x18, + 0xbc, 0x87, 0xb7, 0x95, 0x3d, 0x5e, 0xd9, 0x78, 0xbd, 0xa6, 0x19, 0x80, 0x3d, + 0xd3, 0xf6, 0x08, 0x3d, 0x8c, 0x74, 0x43, 0xbd, 0x06, 0x77, 0x8f, 0xbd, 0x68, + 0xc4, 0x6f, 0xbd, 0x6f, 0x45, 0x03, 0x3b, 0xb4, 0xf9, 0x9c, 0x3c, 0xe2, 0x85, + 0x8f, 0x3c, 0x3a, 0x70, 0x92, 0x3d, 0x06, 0xaa, 0x28, 0xbd, 0x51, 0x46, 0xc2, + 0xbd, 0x39, 0xf2, 0x8f, 0x3d, 0xda, 0xbd, 0x4e, 0x3d, 0x68, 0x6d, 0x57, 0xbc, + 0xb3, 0x41, 0x8b, 0x3d, 0xa8, 0x83, 0xa3, 0xbc, 0x3a, 0x05, 0xbf, 0xbc, 0x5b, + 0x8d, 0x6e, 0x3d, 0xfa, 0x17, 0x8b, 0xbd, 0xff, 0x33, 0x03, 0x3c, 0x4e, 0x35, + 0x6d, 0xbb, 0xf5, 0x98, 0x31, 0xbd, 0xfe, 0x46, 0x20, 0x3c, 0xb7, 0x91, 0x5d, + 0x3d, 0xa9, 0x64, 0x97, 0x3c, 0xd8, 0x6a, 0x59, 0xbd, 0x0b, 0xfb, 0x7c, 0x3d, + 0x05, 0xf1, 0x26, 0xbd, 0xd4, 0xfd, 0x2a, 0x3d, 0x70, 0xca, 0x1d, 0x3d, 0x76, + 0x80, 0xc7, 0xbc, 0xfa, 0x43, 0x7e, 0x3d, 0x6e, 0xda, 0xb6, 0x3c, 0x63, 0x63, + 0x25, 0xbd, 0x39, 0xad, 0x9c, 0xbc, 0x89, 0xa0, 0xbf, 0xbd, 0xc7, 0xd6, 0x19, + 0x3d, 0x36, 0x1d, 0x22, 0x3c, 0x11, 0x87, 0x8b, 0xbd, 0xa8, 0x59, 0x39, 0xbd, + 0xe4, 0x1d, 0x02, 0x3c, 0xf1, 0x0d, 0xf7, 0xbd, 0x16, 0x10, 0xb8, 0x3b, 0x03, + 0xfc, 0xa4, 0x3c, 0x32, 0x06, 0x8f, 0xbc, 0x47, 0x59, 0xa3, 0xbc, 0xac, 0x7f, + 0xda, 0xbc, 0x4b, 0x26, 0x80, 0x3d, 0x73, 0x33, 0x31, 0xbc, 0x83, 0x75, 0x98, + 0xbd, 0xb7, 0x95, 0x65, 0xbd, 0x64, 0x01, 0x21, 0xbd, 0xb8, 0x86, 0x8a, 0x3b, + 0xe5, 0x85, 0x4a, 0xbd, 0xe5, 0xc1, 0x45, 0xbc, 0x97, 0x00, 0xab, 0x3c, 0xb6, + 0x55, 0x1b, 0xbd, 0x41, 0xcb, 0x01, 0x3d, 0x3c, 0x4e, 0x2f, 0xbc, 0x4c, 0x54, + 0xad, 0x3c, 0x70, 0xec, 0x58, 0x3c, 0x57, 0x6e, 0xf9, 0x3c, 0xac, 0xa8, 0x28, + 0xbd, 0xea, 0x4c, 0xce, 0xbb, 0x5f, 0x87, 0x1d, 0xbd, 0x0d, 0xe2, 0x5c, 0x3d, + 0x1d, 0x21, 0x31, 0xbd, 0xf5, 0x47, 0xd7, 0xbd, 0xb5, 0xd5, 0x0c, 0xbd, 0x81, + 0x2b, 0xff, 0x3c, 0x40, 0x81, 0xd2, 0x3c, 0xc3, 0x64, 0x77, 0x3c, 0xd6, 0xdd, + 0xc9, 0xbc, 0xee, 0x42, 0x9e, 0xbc, 0x4a, 0xdb, 0x3c, 0x3d, 0xc2, 0x58, 0x82, + 0x3d, 0xfa, 0x36, 0x24, 0xbd, 0x36, 0x2e, 0x86, 0x3d, 0x68, 0xee, 0x5e, 0xbd, + 0x3c, 0x29, 0x1e, 0xbc, 0x80, 0x1f, 0x88, 0xbd, 0x27, 0xab, 0xb7, 0xbc, 0xce, + 0x18, 0xa7, 0xbd, 0xf6, 0x96, 0xa7, 0xbc, 0xde, 0x1b, 0x0a, 0xbd, 0x15, 0x9b, + 0x1d, 0x3c, 0x2e, 0xb4, 0x9d, 0x3d, 0x61, 0xba, 0xbe, 0xbc, 0xb8, 0xc8, 0x6a, + 0x3d, 0xcc, 0x06, 0xa8, 0xbd, 0x83, 0xae, 0x13, 0xbc, 0x3d, 0xb4, 0x4c, 0xbd, + 0xcc, 0xb5, 0x65, 0xbc, 0x0d, 0xad, 0x8b, 0x3c, 0x0e, 0x2f, 0x91, 0x3c, 0x1a, + 0xfa, 0x1e, 0x3d, 0xbf, 0xe3, 0xf8, 0x3c, 0x21, 0x8d, 0x8c, 0xbc, 0x30, 0x1b, + 0xcb, 0xbc, 0x34, 0x68, 0xf2, 0x3a, 0xed, 0x13, 0x0f, 0xbd, 0x66, 0x39, 0x61, + 0xbd, 0xee, 0x87, 0x42, 0x3d, 0xc0, 0x58, 0x69, 0xbc, 0x3e, 0xe4, 0xd5, 0x3c, + 0x46, 0x68, 0x30, 0xbd, 0x6c, 0x68, 0xad, 0x3c, 0x36, 0x63, 0x13, 0x3d, 0x0c, + 0xf5, 0xf7, 0xbc, 0x56, 0x99, 0x71, 0x3d, 0x4a, 0xba, 0x10, 0x3d, 0xfc, 0xba, + 0x3e, 0x3d, 0x5a, 0xd8, 0x82, 0x3d, 0x70, 0x17, 0x92, 0xbd, 0x0f, 0x9b, 0x77, + 0xbd, 0x06, 0x4d, 0x78, 0x3d, 0xcb, 0x90, 0x96, 0x3d, 0xa5, 0x6d, 0x04, 0xbd, + 0x4a, 0x4f, 0x0f, 0xbc, 0x83, 0x77, 0x3a, 0x3d, 0xdf, 0x43, 0x39, 0x3d, 0x17, + 0x17, 0xf7, 0x3c, 0x3d, 0x1a, 0x44, 0xbd, 
0x42, 0x1b, 0xdb, 0xbc, 0x1f, 0x26, + 0x82, 0xbd, 0xfd, 0x51, 0xa5, 0x3d, 0xc5, 0x70, 0x45, 0x3d, 0x00, 0x17, 0xa1, + 0x3c, 0xe1, 0x5c, 0x56, 0xbd, 0x57, 0x8c, 0xe6, 0xbc, 0x87, 0x07, 0xef, 0x3b, + 0x9b, 0x41, 0xbf, 0xbd, 0xa1, 0x85, 0xd5, 0x3c, 0x07, 0x20, 0x0a, 0xbd, 0xc0, + 0x19, 0xf3, 0xbb, 0x1f, 0xb5, 0xba, 0x3b, 0xa0, 0x79, 0x86, 0xbc, 0x62, 0x56, + 0x40, 0xbd, 0x51, 0xf1, 0xa8, 0x3c, 0x83, 0x80, 0x86, 0x3c, 0x18, 0x2b, 0x2d, + 0x3d, 0x8d, 0x66, 0xb6, 0x3c, 0x1d, 0xac, 0x2e, 0xbd, 0x91, 0xbc, 0x3e, 0xbd, + 0xfb, 0x80, 0x75, 0x3d, 0x7d, 0xa1, 0x54, 0xba, 0x0f, 0xd1, 0x2f, 0xbd, 0xcb, + 0x3a, 0x14, 0xbd, 0x76, 0xd3, 0x82, 0xbc, 0x15, 0x06, 0xf5, 0x39, 0xa4, 0xdb, + 0x6e, 0x3d, 0x42, 0x46, 0xb7, 0x3c, 0xa3, 0x20, 0x00, 0x3d, 0xfc, 0x4f, 0x2b, + 0xbd, 0x06, 0xb1, 0x7e, 0x3d, 0xf8, 0x37, 0xc9, 0xbc, 0x0d, 0x90, 0xd7, 0xbc, + 0xb7, 0x8e, 0x0e, 0x3d, 0x68, 0xd8, 0x1d, 0xbc, 0x57, 0xb5, 0x11, 0x3d, 0x68, + 0x20, 0x0b, 0x3d, 0x85, 0xda, 0x1e, 0xbd, 0xe0, 0xc0, 0x6b, 0xbd, 0x44, 0x69, + 0x96, 0xbd, 0xec, 0xbd, 0x38, 0xbc, 0x09, 0x65, 0x85, 0xbd, 0xb4, 0xf4, 0x57, + 0xbd, 0x35, 0xe4, 0xb2, 0xbc, 0xf7, 0x90, 0xd0, 0x3c, 0x78, 0xd1, 0x83, 0xbd, + 0xe7, 0x8d, 0x1b, 0xbd, 0x49, 0xa3, 0x94, 0x3d, 0x56, 0xf3, 0x44, 0xbd, 0xb2, + 0xce, 0x5e, 0x3d, 0x42, 0x8e, 0x37, 0xbd, 0x22, 0x3e, 0x79, 0xbd, 0xa0, 0x71, + 0x6c, 0x3d, 0x23, 0x13, 0xb3, 0xbb, 0x0d, 0x32, 0x21, 0x3c, 0x35, 0x5e, 0xfd, + 0xba, 0x0d, 0x0c, 0xbd, 0x3b, 0xcb, 0x0c, 0xaa, 0xbb, 0x33, 0xe8, 0x08, 0xbd, + 0x43, 0x7a, 0xa5, 0xbc, 0x15, 0x50, 0x89, 0x3d, 0xd1, 0x86, 0x5b, 0x3d, 0x2a, + 0xd8, 0x4c, 0x3d, 0xe1, 0x63, 0x19, 0xbc, 0xee, 0xf0, 0x6f, 0x3d, 0xfa, 0xc2, + 0x44, 0x3d, 0x88, 0x3c, 0x6b, 0xbd, 0xe3, 0x24, 0xbb, 0xbc, 0x4c, 0xe6, 0x21, + 0x3b, 0x47, 0xf2, 0xa1, 0xbc, 0x46, 0x96, 0xfd, 0x3c, 0x4c, 0x21, 0x86, 0xbd, + 0x32, 0x28, 0x83, 0xbc, 0x70, 0x39, 0xa0, 0xbd, 0x80, 0xca, 0x4d, 0xbd, 0xc4, + 0x91, 0x8d, 0xbc, 0xab, 0xae, 0x08, 0x3c, 0x54, 0xff, 0xb5, 0xbb, 0x76, 0xae, + 0xbe, 0x3c, 0xd8, 0xd1, 0xa5, 0x3d, 0x03, 0x0c, 0x44, 0x3d, 0x92, 0x96, 0x40, + 0xbd, 0xd5, 0xc5, 0x1f, 0x3d, 0xdf, 0x09, 0xc0, 0x3c, 0xfb, 0x0d, 0x5f, 0x3d, + 0xfd, 0x07, 0x04, 0x3d, 0x1c, 0x43, 0x9a, 0xbd, 0xd7, 0x14, 0x72, 0xbd, 0x2d, + 0x50, 0x84, 0xbd, 0x6a, 0x16, 0x7d, 0x38, 0xa6, 0xff, 0x90, 0x3d, 0x44, 0xb7, + 0xcc, 0x3c, 0x5d, 0x5f, 0x69, 0xbd, 0x92, 0x8d, 0x6d, 0x3d, 0xf9, 0x02, 0x99, + 0xbc, 0xe5, 0x7a, 0xc5, 0xbd, 0xde, 0x5c, 0x69, 0x3d, 0xee, 0xbf, 0xf4, 0x3c, + 0x92, 0x19, 0x96, 0x3d, 0xf3, 0x5b, 0x35, 0xbd, 0xf3, 0x90, 0x3b, 0x3d, 0x90, + 0xe2, 0xc2, 0xbc, 0x98, 0x91, 0xf9, 0xbc, 0x3b, 0x3b, 0x82, 0xbd, 0xb0, 0x85, + 0x30, 0x3d, 0x14, 0x12, 0xea, 0xbc, 0x21, 0x84, 0x8c, 0x3d, 0x93, 0xcd, 0x65, + 0x3d, 0xc9, 0x26, 0xda, 0xbc, 0xd5, 0xc3, 0x4e, 0x3c, 0xcc, 0x6e, 0x0f, 0x3d, + 0x8d, 0xaf, 0x47, 0x3c, 0x9c, 0xfa, 0xe1, 0x3c, 0x3c, 0xe0, 0x4c, 0x3d, 0x79, + 0x22, 0xed, 0x3c, 0xf4, 0x05, 0x3a, 0x3d, 0x59, 0xc0, 0x22, 0xbd, 0x5e, 0xaa, + 0xf8, 0xbc, 0xc4, 0xda, 0x22, 0x3c, 0x76, 0x88, 0xaf, 0x3c, 0x1c, 0xf4, 0x3b, + 0x3d, 0x4e, 0x6a, 0x1b, 0x3d, 0x60, 0xc7, 0x85, 0x3c, 0xb2, 0xc7, 0x75, 0x3d, + 0xbd, 0xe4, 0xbe, 0xbc, 0x54, 0x8e, 0x82, 0x3d, 0x36, 0x27, 0x6a, 0xbc, 0x0d, + 0x99, 0x00, 0xbd, 0x38, 0x5e, 0x9f, 0xbc, 0x9d, 0x49, 0xd6, 0x3d, 0xbb, 0x1a, + 0x85, 0x3d, 0x6f, 0x89, 0x9f, 0x3c, 0xc5, 0x0b, 0xa7, 0xbc, 0x9e, 0x5a, 0xfa, + 0xbc, 0xd3, 0x59, 0x50, 0xba, 0x3f, 0xc6, 0xbc, 0xbd, 0xb3, 0x9c, 0x12, 0xbd, + 0x05, 0x39, 0xd6, 0x3b, 0x58, 0x14, 0x0d, 0x3d, 0x63, 0x0e, 0x19, 0x3d, 0x69, + 0x9b, 0xa2, 0x3d, 0x68, 0x4d, 0x13, 0x3c, 0x06, 0x73, 0x64, 0xbd, 0x28, 
0x79, + 0x3c, 0xbd, 0x26, 0x23, 0x28, 0xbc, 0xb5, 0xa2, 0xa5, 0xba, 0xf6, 0x5f, 0x89, + 0xbc, 0x66, 0x2e, 0x79, 0xbd, 0x90, 0xee, 0x54, 0xbc, 0x99, 0xf4, 0x4e, 0x3c, + 0xdb, 0xdc, 0xd0, 0xbc, 0x3f, 0xed, 0x43, 0xbd, 0x03, 0xdf, 0xf4, 0x3c, 0x7d, + 0x40, 0x2b, 0x3c, 0xfb, 0x1d, 0x64, 0x3d, 0xcd, 0x1f, 0xb8, 0x3d, 0xb1, 0xb2, + 0x0f, 0x3d, 0x30, 0xf6, 0x38, 0xbd, 0x54, 0xef, 0x84, 0xbc, 0x2f, 0x3f, 0xac, + 0xbd, 0xe0, 0xe1, 0xc4, 0xbc, 0x49, 0x0a, 0x03, 0xbd, 0xb8, 0x78, 0x43, 0xbc, + 0xbf, 0xbc, 0x80, 0x3a, 0x1a, 0x41, 0x39, 0x3d, 0xd0, 0x5d, 0x8c, 0x3d, 0x8d, + 0x8f, 0x5e, 0xbc, 0xfd, 0x1b, 0xed, 0xbd, 0x22, 0x7c, 0x99, 0xbc, 0x4c, 0xb3, + 0x1d, 0xbc, 0x10, 0xbb, 0x1c, 0x3c, 0x19, 0x89, 0xd3, 0xbc, 0x2a, 0x64, 0x37, + 0x3d, 0x11, 0x87, 0x00, 0x3c, 0x39, 0x0d, 0x1c, 0x3d, 0xb8, 0xeb, 0xde, 0xbc, + 0x26, 0x9d, 0x05, 0xbd, 0x51, 0xca, 0x0d, 0xbd, 0xa9, 0xe0, 0xbc, 0x3c, 0xd6, + 0x01, 0x2d, 0xbd, 0x72, 0x14, 0xd3, 0x3c, 0xf2, 0x07, 0x81, 0x3c, 0xe4, 0xbb, + 0x00, 0x3d, 0x0b, 0x42, 0x09, 0x3b, 0x0e, 0x99, 0x71, 0xbd, 0x32, 0x91, 0x10, + 0xbd, 0xa0, 0x0b, 0x05, 0xbd, 0x7f, 0xf8, 0xf6, 0x3c, 0xd4, 0x72, 0xbd, 0x3c, + 0xdf, 0xcc, 0x8a, 0x3d, 0x0e, 0x3d, 0x24, 0x3d, 0x71, 0x5a, 0x52, 0xbd, 0xb6, + 0x11, 0xda, 0xbc, 0x5b, 0xec, 0x9c, 0x3d, 0x4a, 0x73, 0xfd, 0xbc, 0xc1, 0x2b, + 0x9f, 0xbd, 0x06, 0xed, 0x2f, 0xbd, 0x38, 0x4c, 0x53, 0x3d, 0x36, 0x8d, 0xc1, + 0x3c, 0x14, 0x26, 0xa3, 0xbd, 0x2d, 0x2f, 0x0a, 0xbb, 0xfd, 0x7d, 0xa5, 0xbd, + 0x10, 0xbe, 0xe4, 0x3b, 0x77, 0x22, 0x6a, 0x3d, 0xdd, 0x33, 0xc3, 0x3c, 0x3e, + 0x8e, 0xbb, 0xbd, 0x60, 0x54, 0x81, 0x3d, 0x02, 0xcf, 0x15, 0x3d, 0x06, 0x28, + 0xd5, 0x3d, 0xda, 0xb6, 0x6f, 0xbd, 0xf6, 0x93, 0x86, 0xbc, 0x98, 0x16, 0x45, + 0x3d, 0xdc, 0x9e, 0x47, 0x3c, 0x8b, 0x3a, 0x82, 0xbd, 0x11, 0x05, 0xb6, 0xbd, + 0x0e, 0x26, 0xc1, 0xbc, 0xe2, 0xdc, 0xab, 0x3d, 0x10, 0x6e, 0x84, 0x3d, 0x49, + 0x2f, 0x1c, 0xbb, 0x0e, 0x73, 0x7a, 0x3c, 0x82, 0x17, 0x29, 0x3d, 0x88, 0x40, + 0x91, 0x3b, 0x2d, 0xcd, 0xf3, 0xbc, 0xcc, 0x39, 0x37, 0xbd, 0xb0, 0x03, 0x17, + 0x3d, 0xb8, 0xd0, 0x22, 0x3d, 0xc6, 0x69, 0x90, 0x3c, 0x09, 0x0f, 0xc2, 0x3b, + 0x7a, 0x64, 0xcc, 0xbc, 0x26, 0x93, 0x22, 0x3d, 0xa3, 0xe0, 0x4b, 0xbd, 0x7d, + 0xca, 0x2f, 0xbb, 0xda, 0x26, 0x19, 0x3d, 0xe7, 0x88, 0x47, 0xbc, 0x4e, 0x0f, + 0x3b, 0x3d, 0xf8, 0x1c, 0x1c, 0x3d, 0xb4, 0x23, 0x8e, 0x3d, 0xaf, 0xa6, 0x10, + 0xbd, 0xfc, 0x9a, 0x9c, 0x3c, 0x35, 0x69, 0x9f, 0x3d, 0xe4, 0x5f, 0x8f, 0xbd, + 0xc7, 0xe3, 0x98, 0x3d, 0xab, 0xb8, 0xcc, 0x3b, 0x6a, 0xa9, 0x0f, 0xbd, 0x0d, + 0x8a, 0x6a, 0xbd, 0x1e, 0xec, 0x10, 0x3d, 0xa0, 0x13, 0xe8, 0x3b, 0xc0, 0x77, + 0x93, 0x3c, 0x3f, 0x03, 0x0b, 0x3d, 0xde, 0x40, 0xb4, 0x3c, 0xfc, 0xdb, 0x06, + 0xbd, 0xc3, 0x86, 0x90, 0x3d, 0x54, 0x89, 0x37, 0x3d, 0x55, 0xd4, 0x8d, 0xbd, + 0x39, 0x31, 0xb7, 0xbc, 0xab, 0x31, 0xc0, 0xbc, 0x60, 0x17, 0xdb, 0xbb, 0x49, + 0xa9, 0x2f, 0xbc, 0xbf, 0xcb, 0xd6, 0x3b, 0x83, 0x93, 0x16, 0x3d, 0xba, 0xdd, + 0x1b, 0xbd, 0xd1, 0x6a, 0x17, 0x3d, 0x45, 0x0f, 0x1d, 0xbd, 0xa3, 0xc1, 0xb5, + 0xbd, 0x88, 0x0e, 0x6e, 0x3d, 0x41, 0x5d, 0x06, 0x3d, 0xd8, 0xeb, 0xb4, 0x3c, + 0xe5, 0xc8, 0x88, 0xbb, 0x48, 0x65, 0x47, 0x3d, 0xff, 0xe8, 0xa6, 0xbd, 0x12, + 0x2a, 0x10, 0xbd, 0xd0, 0x90, 0x8b, 0x3d, 0x17, 0x08, 0xfc, 0xbc, 0x8e, 0xb4, + 0x9a, 0xbc, 0x70, 0x79, 0x3f, 0x3d, 0xd8, 0xad, 0x06, 0x3c, 0xf8, 0x4e, 0x81, + 0xbd, 0x82, 0xf1, 0x71, 0xbd, 0x9f, 0x19, 0xcc, 0xbd, 0xaf, 0x6a, 0x45, 0x3d, + 0x4e, 0x39, 0x25, 0x3d, 0x17, 0x43, 0x74, 0x3d, 0x52, 0x51, 0x53, 0xbd, 0x53, + 0x10, 0x5f, 0xbd, 0x5f, 0x60, 0xf7, 0x3c, 0xf4, 0x07, 0x6d, 0x3d, 0x68, 0x1d, + 0x29, 0x3d, 0xd6, 0xf7, 
0xad, 0xbc, 0x09, 0x0d, 0x8f, 0xbd, 0x17, 0xae, 0xd7, + 0x3c, 0x63, 0xf2, 0xc7, 0xbc, 0x4e, 0xa0, 0x05, 0xbd, 0x53, 0x3b, 0xc5, 0xbc, + 0x81, 0xf4, 0x82, 0x3d, 0x5e, 0xc9, 0x56, 0xbd, 0x32, 0xb8, 0xbd, 0xbc, 0xf2, + 0x3e, 0xc7, 0xbc, 0x76, 0x7f, 0x76, 0xbd, 0x19, 0x45, 0x13, 0xbd, 0xb9, 0x17, + 0x88, 0x3d, 0xef, 0x15, 0x68, 0xbd, 0x7a, 0xb8, 0xf6, 0x3a, 0xa8, 0x56, 0x72, + 0xbb, 0x96, 0x68, 0xce, 0x3d, 0x13, 0x43, 0x0a, 0xbd, 0x87, 0x3f, 0x91, 0x3c, + 0xd7, 0x12, 0x8b, 0x3b, 0x2f, 0x85, 0xbf, 0xbc, 0x33, 0xfc, 0x62, 0xbc, 0x5f, + 0xb3, 0x8f, 0xbc, 0x9f, 0x1a, 0xf5, 0xbc, 0x3b, 0x75, 0x68, 0x3d, 0x58, 0xae, + 0x3c, 0x3d, 0xe3, 0x00, 0x5d, 0x3d, 0xcf, 0x69, 0x9c, 0x3d, 0xdb, 0x20, 0xb3, + 0x39, 0x31, 0x1a, 0x7a, 0xbc, 0x11, 0x37, 0xd0, 0x3c, 0x1d, 0x5d, 0x84, 0x3d, + 0xb2, 0x5d, 0xe9, 0xbc, 0x24, 0x74, 0xe5, 0xbc, 0x86, 0x1d, 0xea, 0xbb, 0x65, + 0x94, 0x76, 0x3d, 0x9a, 0xb2, 0xeb, 0x3c, 0x62, 0x9f, 0x44, 0xbb, 0xca, 0x35, + 0xa8, 0xbc, 0x25, 0x51, 0x23, 0x3d, 0xa9, 0xac, 0x00, 0xbd, 0xb9, 0x13, 0xa6, + 0x3d, 0x3e, 0x3e, 0x10, 0xbc, 0x5f, 0x40, 0x8b, 0x3d, 0x75, 0xef, 0x70, 0x3b, + 0xf8, 0x66, 0xa4, 0x3c, 0x69, 0x24, 0x84, 0x3c, 0x2a, 0xd2, 0x76, 0xbc, 0x67, + 0xef, 0x9f, 0xbc, 0xe1, 0x67, 0xcb, 0xbc, 0xe1, 0x4c, 0xa9, 0xbd, 0x18, 0xb6, + 0x96, 0x3d, 0x29, 0xaa, 0x84, 0xbd, 0x80, 0x0d, 0x5b, 0x3d, 0x35, 0xe7, 0x02, + 0x3d, 0xea, 0xf8, 0x46, 0xbd, 0xba, 0x63, 0x42, 0x3d, 0x3e, 0x6d, 0x83, 0x3d, + 0x0d, 0x47, 0x3c, 0xbd, 0x79, 0xe3, 0xa1, 0x3c, 0x7b, 0x77, 0x17, 0xbd, 0x4d, + 0x55, 0x53, 0x3d, 0xc3, 0x91, 0x7e, 0xbd, 0x9b, 0x6b, 0x49, 0x3d, 0x30, 0xad, + 0xc7, 0xbc, 0xc1, 0x27, 0x3e, 0xbd, 0xea, 0xaf, 0x51, 0x3d, 0x12, 0x3a, 0x94, + 0xbc, 0xf1, 0x36, 0xf1, 0x3c, 0x6a, 0x5a, 0x93, 0x3b, 0x88, 0x1e, 0xb1, 0xbc, + 0x3c, 0x43, 0x37, 0xbd, 0x74, 0xda, 0x9a, 0xbd, 0x53, 0x3d, 0x7b, 0x3d, 0xe7, + 0x18, 0xdd, 0xbc, 0xba, 0x1b, 0xd9, 0xbc, 0xe8, 0x9a, 0x64, 0xbd, 0xca, 0x36, + 0x2b, 0x3d, 0xc6, 0x99, 0xbc, 0x3c, 0xa6, 0x76, 0x72, 0x3d, 0x59, 0x8a, 0xb5, + 0x3c, 0x07, 0xf8, 0xd7, 0x3d, 0xdd, 0xaf, 0x2a, 0xb8, 0x77, 0xac, 0xb7, 0x3c, + 0x53, 0xd6, 0x12, 0xbd, 0x19, 0x6c, 0x63, 0x3c, 0xe0, 0xf5, 0x32, 0xbd, 0x72, + 0xc2, 0xae, 0xbd, 0x04, 0x6b, 0x12, 0x3c, 0xea, 0x76, 0x99, 0x3d, 0x5e, 0x14, + 0x25, 0xbd, 0x16, 0x01, 0x01, 0xbc, 0x6d, 0x0e, 0xb8, 0x3d, 0x78, 0x70, 0x85, + 0x3b, 0x7b, 0xb9, 0x55, 0xbb, 0x59, 0xa4, 0x2f, 0x3d, 0xbb, 0xf1, 0x4e, 0xbc, + 0x6e, 0x1e, 0x6f, 0x3d, 0x6d, 0xd0, 0x82, 0x3d, 0xa1, 0x2a, 0x38, 0xbd, 0x82, + 0x0e, 0x81, 0x3d, 0x51, 0x1a, 0xe8, 0x3c, 0x78, 0x0f, 0xb2, 0xbc, 0xdb, 0x4a, + 0x9f, 0x3d, 0xeb, 0xf7, 0x5f, 0x3b, 0xf0, 0x3e, 0xe2, 0xbc, 0x9c, 0x11, 0x91, + 0x3c, 0xb0, 0xbd, 0x1a, 0x3c, 0xce, 0x3f, 0x1c, 0xbb, 0x0e, 0xe3, 0x0b, 0x3d, + 0x2e, 0x44, 0x15, 0x3d, 0x90, 0x12, 0xe8, 0x3c, 0x84, 0xb7, 0x46, 0x3d, 0x4f, + 0x51, 0x90, 0x3c, 0x5f, 0xee, 0xe8, 0x3c, 0x8f, 0xa8, 0xd2, 0xbb, 0x86, 0x20, + 0x7c, 0x3d, 0xe8, 0x1f, 0x48, 0xbc, 0xbb, 0x7f, 0x59, 0x3d, 0x62, 0xf1, 0x8a, + 0xbc, 0x94, 0x28, 0x0c, 0x3c, 0xdd, 0x8f, 0x1a, 0xbd, 0xad, 0x5a, 0xa8, 0x39, + 0x4d, 0x0c, 0x71, 0x3d, 0x96, 0xa2, 0x91, 0x3d, 0xe7, 0x9c, 0x69, 0xbc, 0x1f, + 0x9d, 0x0c, 0xbd, 0x6e, 0xbe, 0xe7, 0x3c, 0x97, 0x28, 0x35, 0xbd, 0x11, 0xb7, + 0x8c, 0xbd, 0x3b, 0xc0, 0xc1, 0x3c, 0x02, 0x96, 0xd7, 0x3c, 0x79, 0x02, 0x4d, + 0xbc, 0x6c, 0xad, 0xb7, 0x3c, 0x9a, 0xef, 0x29, 0x3d, 0xe9, 0x73, 0x9b, 0x3d, + 0x58, 0xd3, 0x17, 0x3d, 0xea, 0xcc, 0x2d, 0xbd, 0x64, 0x3a, 0x9e, 0xbd, 0x9a, + 0x8b, 0x3c, 0xbd, 0x4f, 0x97, 0x88, 0xbc, 0x1b, 0x18, 0x27, 0xbc, 0x22, 0xdc, + 0xde, 0xbd, 0xb4, 0xbe, 0x94, 0xba, 0x5a, 0xc7, 0xe0, 
0x3b, 0xe9, 0xd7, 0x07, + 0x3c, 0xcb, 0x47, 0xf2, 0x3c, 0x04, 0xca, 0x2f, 0x3d, 0x25, 0x4d, 0xd9, 0x3c, + 0xc1, 0xb9, 0x37, 0xbd, 0xa1, 0x9a, 0x0c, 0x3d, 0x78, 0xae, 0x88, 0xbd, 0x02, + 0xb5, 0x98, 0x3d, 0x63, 0x8b, 0x79, 0xbd, 0xab, 0xe4, 0xaa, 0x3d, 0x5a, 0x1e, + 0x02, 0xbc, 0x16, 0x17, 0x68, 0x3b, 0xf8, 0x36, 0x0d, 0x3b, 0x1f, 0x67, 0x8c, + 0xbd, 0xbc, 0x52, 0xe2, 0xbc, 0x2f, 0xee, 0xe2, 0xbb, 0x46, 0x45, 0x08, 0x3d, + 0xd2, 0xea, 0xc9, 0x3c, 0x00, 0xcc, 0x5c, 0x3d, 0x1e, 0x1f, 0x54, 0x3c, 0x10, + 0x3e, 0x8e, 0x3c, 0x1e, 0x6d, 0x5f, 0xbd, 0xfb, 0xdb, 0x64, 0x3d, 0x62, 0x27, + 0xb5, 0xbd, 0x0a, 0x8c, 0x51, 0xbd, 0x5e, 0x4d, 0xae, 0xbd, 0xd4, 0xd2, 0x65, + 0x3d, 0x88, 0xc4, 0xc0, 0x3c, 0x25, 0x97, 0xb9, 0xbb, 0x6d, 0x7c, 0x5b, 0x3d, + 0x42, 0x2f, 0x0e, 0xbb, 0x42, 0xfc, 0xb3, 0xba, 0x38, 0x1c, 0xae, 0xbc, 0x4d, + 0xba, 0x7a, 0xbd, 0x15, 0xf7, 0x9d, 0x3d, 0x51, 0xc4, 0x82, 0x3d, 0x70, 0xa9, + 0x47, 0x3d, 0x68, 0x1c, 0xdf, 0x3c, 0xef, 0x44, 0x71, 0x3c, 0xdf, 0x7d, 0x80, + 0x3d, 0x6c, 0x6c, 0xcd, 0xbc, 0x9b, 0xf2, 0x68, 0x3d, 0x61, 0x10, 0x64, 0x3d, + 0x31, 0x19, 0xda, 0x3c, 0xc3, 0x1c, 0xdc, 0xbb, 0xe1, 0x30, 0x13, 0xbc, 0x4d, + 0xd5, 0xaf, 0xbb, 0x39, 0xaa, 0x43, 0xbd, 0x9a, 0x51, 0x75, 0xbd, 0xc3, 0x2b, + 0x5e, 0x3c, 0x2f, 0x60, 0xed, 0x3c, 0x2a, 0x8e, 0x87, 0x3d, 0x0e, 0x88, 0x08, + 0xbd, 0xcb, 0x1a, 0xc2, 0x3b, 0x86, 0xdb, 0x44, 0xbd, 0x3c, 0xb2, 0xd8, 0xbc, + 0xd8, 0x5c, 0x2a, 0x3d, 0xf9, 0xb9, 0x06, 0xbd, 0xf6, 0x2f, 0x52, 0x3d, 0xda, + 0x46, 0xe9, 0x3b, 0xeb, 0x10, 0xd5, 0x3c, 0x5a, 0x5a, 0x70, 0x3b, 0x58, 0xd3, + 0x30, 0x3c, 0xb3, 0x7e, 0x00, 0xbd, 0x81, 0x37, 0x56, 0xbd, 0x0a, 0x66, 0x12, + 0xbd, 0xd7, 0xca, 0x80, 0xbd, 0x89, 0x4c, 0x52, 0x3d, 0x42, 0x49, 0xab, 0x3c, + 0x79, 0xe8, 0xa6, 0xbd, 0xa2, 0x35, 0xd5, 0xbd, 0xa3, 0x0c, 0x0e, 0xbd, 0x4f, + 0x10, 0x8a, 0x3d, 0xd4, 0xbe, 0x64, 0x3d, 0x38, 0x13, 0xfd, 0x3d, 0x86, 0xc8, + 0x82, 0xbd, 0xd2, 0x11, 0x46, 0x3d, 0xcc, 0x13, 0x6a, 0x3d, 0x29, 0x91, 0xe2, + 0xbc, 0x9a, 0x59, 0xc8, 0xbc, 0x6d, 0xd3, 0x79, 0xbd, 0x00, 0x17, 0xbd, 0x3d, + 0x2f, 0x3d, 0x13, 0xbd, 0xf2, 0x5e, 0x5a, 0x3d, 0x91, 0xd3, 0x22, 0xbc, 0x8d, + 0x7d, 0xdd, 0x3c, 0xcb, 0xd3, 0x47, 0x3d, 0x51, 0x39, 0x43, 0x3d, 0x8e, 0xba, + 0xb3, 0x3c, 0xcf, 0xdc, 0x5d, 0xbc, 0xe8, 0xf4, 0x69, 0xbd, 0x75, 0xed, 0x4a, + 0xbd, 0x3e, 0xa3, 0x52, 0x3d, 0x55, 0xbe, 0x6e, 0xbd, 0x84, 0x86, 0xb3, 0xbc, + 0x7d, 0x3b, 0x4f, 0xbd, 0xd0, 0x9c, 0x8f, 0xbb, 0xe4, 0x9f, 0x39, 0x3d, 0x10, + 0x5c, 0xf0, 0xbb, 0x64, 0x15, 0x82, 0xbc, 0x12, 0xf8, 0x45, 0x3d, 0xf6, 0xfc, + 0x40, 0x3d, 0x64, 0x01, 0x84, 0xbc, 0x4e, 0x97, 0x28, 0x3d, 0xc0, 0xb8, 0x30, + 0x3d, 0xf8, 0x94, 0x71, 0xbd, 0x59, 0x5a, 0x61, 0xbd, 0x9e, 0x55, 0x8d, 0xbd, + 0x00, 0x77, 0xfa, 0xbc, 0x9c, 0xbf, 0x17, 0x3d, 0x94, 0x7a, 0x4f, 0xbd, 0xb1, + 0xa6, 0x8f, 0xbd, 0xad, 0xc3, 0x8a, 0x3d, 0xf0, 0xca, 0x8b, 0x3c, 0x2a, 0xe4, + 0x2b, 0xbd, 0x34, 0x81, 0x44, 0xbd, 0x48, 0x55, 0x52, 0xbd, 0x2e, 0x7e, 0x63, + 0x3d, 0x3a, 0x07, 0x4e, 0x3d, 0xb0, 0xb9, 0x7a, 0x3c, 0x18, 0x7d, 0x6e, 0xbc, + 0x7a, 0x0e, 0x3c, 0xbd, 0xdc, 0x81, 0x8c, 0xbd, 0xc8, 0xa4, 0x71, 0x3c, 0xca, + 0x20, 0x28, 0x3d, 0x28, 0x36, 0xf6, 0x3c, 0x28, 0xef, 0x3c, 0x3d, 0x88, 0x83, + 0x3e, 0x3c, 0x74, 0x45, 0x34, 0x3d, 0x80, 0x11, 0x06, 0xba, 0x8c, 0xd1, 0x79, + 0xbc, 0x84, 0x71, 0x26, 0xbd, 0x98, 0x15, 0x15, 0x3c, 0x4a, 0x0e, 0x92, 0xbc, + 0x75, 0x17, 0x83, 0x3d, 0xfc, 0x9c, 0xc1, 0xbc, 0x4c, 0xe3, 0xb5, 0x3c, 0x10, + 0xc9, 0x23, 0x3c, 0xd0, 0xde, 0x1a, 0x3c, 0x22, 0x15, 0x92, 0xbd, 0xe6, 0x39, + 0x48, 0xbd, 0x16, 0x40, 0x91, 0xbd, 0x5c, 0xf1, 0xb4, 0x3c, 0x4a, 0xf7, 0xbc, + 0xbc, 
0x80, 0x48, 0x44, 0x3c, 0xc8, 0x47, 0x15, 0xbc, 0xcb, 0x39, 0x4d, 0xbd, + 0x04, 0xe1, 0xc0, 0x3c, 0x86, 0x40, 0x43, 0xbd, 0x3f, 0x39, 0x6a, 0xbd, 0x00, + 0xfd, 0x30, 0xbb, 0x18, 0x14, 0x60, 0xbc, 0xf0, 0x88, 0x12, 0x3d, 0x21, 0xf7, + 0x90, 0x3d, 0xfc, 0xcc, 0xa1, 0x3c, 0xa6, 0x1f, 0x2d, 0x3d, 0x0a, 0x14, 0x46, + 0xbd, 0x37, 0x3c, 0x5f, 0xbd, 0x32, 0x53, 0x94, 0xbc, 0x58, 0x51, 0xb1, 0xbc, + 0xd7, 0x03, 0x89, 0x3d, 0xfe, 0x03, 0x37, 0xbd, 0x9e, 0x06, 0x89, 0xbd, 0xbc, + 0xf6, 0x41, 0x3d, 0xf0, 0x87, 0x32, 0x3d, 0xdc, 0x11, 0xeb, 0xbc, 0x4a, 0x89, + 0x3b, 0x3d, 0xd2, 0xf1, 0x2b, 0x3d, 0x78, 0xcb, 0x38, 0xbc, 0x46, 0xda, 0xff, + 0xbc, 0xee, 0x9c, 0x8d, 0xbd, 0x14, 0x8e, 0xcd, 0xbc, 0x08, 0x6f, 0x05, 0x3d, + 0x00, 0xac, 0x8e, 0xbd, 0x90, 0xa2, 0x84, 0xbb, 0x9b, 0x36, 0x32, 0xbd, 0x2b, + 0x3f, 0x89, 0x3d, 0x80, 0x9a, 0x03, 0xbb, 0x06, 0xac, 0x17, 0x3d, 0xf8, 0x22, + 0x3f, 0xbd, 0x75, 0xae, 0x90, 0xbd, 0x76, 0xdd, 0x3e, 0xbd, 0x7c, 0x72, 0x92, + 0x3c, 0x4c, 0x38, 0x44, 0xbd, 0xba, 0x8f, 0x21, 0x3d, 0x00, 0x88, 0x7e, 0xbb, + 0xdc, 0xd2, 0x92, 0x3c, 0x1a, 0x45, 0x77, 0x3d, 0x54, 0xa1, 0x50, 0xbc, 0x44, + 0xea, 0x2d, 0x3d, 0x8e, 0xbd, 0x1d, 0x3d, 0x1b, 0xb9, 0x88, 0x3d, 0x20, 0xc4, + 0x8b, 0xbd, 0x43, 0x9e, 0x05, 0xbd, 0x80, 0x93, 0x4a, 0x3d, 0x02, 0xb3, 0x8a, + 0xbd, 0x40, 0x5c, 0xbb, 0x3b, 0x54, 0x22, 0x37, 0xbd, 0x04, 0xd5, 0xed, 0xbc, + 0xae, 0xce, 0x87, 0xbd, 0x0c, 0x0f, 0xe3, 0xbc, 0xc1, 0x1f, 0x48, 0xbd, 0x68, + 0x6a, 0x9a, 0x3c, 0xd0, 0x0b, 0x8f, 0x3c, 0xc8, 0x5c, 0x00, 0x3d, 0x60, 0xf9, + 0xd5, 0xbb, 0x57, 0x9a, 0x88, 0xbd, 0xf2, 0x1a, 0x8d, 0xbd, 0x52, 0x69, 0x63, + 0x3d, 0xb8, 0x69, 0x89, 0x3c, 0x56, 0xfb, 0x0a, 0x3d, 0x00, 0xc3, 0x10, 0xba, + 0x0e, 0xcd, 0x56, 0xbd, 0x1a, 0xf7, 0x61, 0x3d, 0xf8, 0x95, 0x8b, 0xbd, 0x3c, + 0x34, 0x14, 0xbd, 0xed, 0xc6, 0x8f, 0x3d, 0xee, 0xc2, 0x1c, 0x3d, 0xa0, 0x9d, + 0x04, 0xbb, 0xfd, 0x06, 0x56, 0xbd, 0xa0, 0xe7, 0x12, 0x3b, 0xae, 0x01, 0xbd, + 0xbc, 0xb0, 0x52, 0x16, 0x3d, 0x00, 0x9e, 0x97, 0xba, 0x40, 0xaf, 0x58, 0x3d, + 0xa4, 0x80, 0x97, 0x3c, 0xa0, 0x07, 0x22, 0x3b, 0x59, 0x3b, 0x01, 0xbd, 0x83, + 0x64, 0x87, 0x3d, 0x0e, 0xfd, 0x96, 0xbc, 0x3a, 0xf8, 0x7b, 0xbd, 0x7d, 0x61, + 0x0a, 0xbd, 0xe2, 0x4c, 0x58, 0xbd, 0xc0, 0x1b, 0x81, 0xbb, 0x70, 0x48, 0x0b, + 0x3d, 0x5a, 0x4c, 0x94, 0xbc, 0x6a, 0x49, 0x5b, 0x3d, 0x58, 0x79, 0x7a, 0x3c, + 0x54, 0xe4, 0x10, 0xbd, 0x0f, 0x05, 0x8c, 0x3d, 0x00, 0x70, 0xb3, 0xba, 0xfe, + 0x52, 0xec, 0xbc, 0x80, 0x87, 0xe5, 0x3b, 0x76, 0x35, 0x7f, 0x3d, 0x20, 0x23, + 0x36, 0x3b, 0x48, 0xe0, 0x16, 0x3d, 0x0e, 0xdb, 0x53, 0x3d, 0x76, 0x7d, 0xcb, + 0xbc, 0x79, 0xf8, 0x5c, 0xbd, 0x8a, 0x7c, 0x39, 0x3d, 0x8c, 0x87, 0x1d, 0x3d, + 0x3a, 0x32, 0x08, 0xbd, 0x54, 0xa9, 0x6a, 0xbc, 0x22, 0xad, 0xad, 0xbc, 0xd2, + 0x4b, 0x68, 0x3d, 0x86, 0x89, 0xee, 0xbc, 0x42, 0xee, 0x7d, 0x3d, 0x56, 0x9e, + 0x46, 0x3d, 0x58, 0xcd, 0xd0, 0x3c, 0xb4, 0x6d, 0x9f, 0x3c, 0x0c, 0x5b, 0x20, + 0xbd, 0x40, 0xe8, 0x2c, 0x3b, 0x23, 0xd1, 0x80, 0x3d, 0xee, 0x0f, 0xc8, 0xbc, + 0x1c, 0x52, 0xd5, 0x3c, 0x68, 0x8d, 0x63, 0xbc, 0x9c, 0xb3, 0x37, 0xbd, 0x0c, + 0x04, 0xde, 0x3c, 0x50, 0x20, 0x93, 0x3b, 0xac, 0xef, 0xf6, 0x3c, 0xac, 0x6e, + 0x93, 0xbc, 0x92, 0x06, 0x64, 0x3d, 0x28, 0xdd, 0x74, 0x3c, 0xf7, 0x67, 0x86, + 0x3d, 0x2c, 0x86, 0x43, 0x3d, 0x30, 0x55, 0x89, 0xbd, 0xa0, 0xf0, 0xd7, 0xbb, + 0xe4, 0x7f, 0x05, 0x3d, 0x18, 0xf7, 0x3f, 0x3c, 0x46, 0xaf, 0xcb, 0xbc, 0x80, + 0xf0, 0xb3, 0x3b, 0xdc, 0xe9, 0x81, 0x3c, 0xef, 0x3f, 0x5c, 0xbd, 0xfe, 0xb8, + 0xa1, 0xbc, 0x90, 0x44, 0x41, 0x3c, 0x4e, 0xc8, 0x30, 0xbd, 0x63, 0x6e, 0x72, + 0xbd, 0xbc, 0x52, 0xbf, 0xbc, 0x7c, 
0x04, 0x47, 0xbd, 0x4c, 0xe3, 0x4e, 0xbd, + 0x34, 0x8b, 0x36, 0x3d, 0xd1, 0xf2, 0x33, 0xbd, 0x16, 0x48, 0x09, 0x3d, 0x8c, + 0x31, 0x00, 0xbd, 0xd9, 0x91, 0x8e, 0xbd, 0xf2, 0x8d, 0x64, 0xbd, 0x48, 0x20, + 0xbf, 0xbc, 0x60, 0x89, 0x53, 0x3b, 0x00, 0x96, 0x71, 0x3a, 0x44, 0x6e, 0x8c, + 0xbd, 0x90, 0x6b, 0x7d, 0xbd, 0x64, 0x71, 0xa6, 0x3c, 0x52, 0x23, 0x70, 0x3d, + 0xf3, 0x05, 0x80, 0x3d, 0xb4, 0xe2, 0x68, 0xbd, 0x20, 0x6f, 0xf9, 0x3b, 0x60, + 0x31, 0x2c, 0x3d, 0x30, 0x78, 0x4b, 0xbd, 0xd8, 0xae, 0x23, 0xbc, 0x40, 0xea, + 0xc5, 0x3a, 0xd0, 0xe7, 0x86, 0xbd, 0xa0, 0x57, 0x47, 0x3d, 0x70, 0x78, 0xab, + 0x3b, 0x1c, 0xab, 0xb1, 0xbc, 0x2a, 0x75, 0x5d, 0xbd, 0xd0, 0xd1, 0x26, 0xbd, + 0x90, 0x93, 0x3a, 0xbd, 0xb4, 0x8a, 0xe9, 0xbc, 0xac, 0xf1, 0xa5, 0xbc, 0x10, + 0xa3, 0xa7, 0xbb, 0x02, 0xb2, 0x73, 0xbd, 0x2e, 0x27, 0xb7, 0xbc, 0xd0, 0x0c, + 0x92, 0xbd, 0x0e, 0x8e, 0x77, 0x3d, 0x5a, 0x78, 0x0a, 0x3d, 0xf4, 0xa9, 0xc5, + 0x3c, 0x82, 0x8a, 0x15, 0x3d, 0x3d, 0x25, 0x13, 0xbd, 0x7e, 0x35, 0x12, 0xbd, + 0x2a, 0xd2, 0x6e, 0x3d, 0x78, 0x60, 0xcb, 0xbc, 0x70, 0x92, 0x81, 0xbd, 0xca, + 0x3f, 0x2f, 0xbd, 0x3b, 0x71, 0x67, 0xbd, 0x80, 0x79, 0x83, 0xba, 0xc6, 0x2a, + 0x47, 0x3d, 0x86, 0x99, 0x72, 0x3d, 0x6c, 0x59, 0x8f, 0x3c, 0x73, 0x59, 0x14, + 0xbd, 0x23, 0x83, 0x82, 0x3d, 0x94, 0x4d, 0x8b, 0xbd, 0x9c, 0x05, 0x2f, 0xbd, + 0x60, 0xae, 0x57, 0x3d, 0x95, 0x1c, 0x86, 0x3d, 0x26, 0xaf, 0x78, 0x3d, 0x47, + 0x4b, 0x4e, 0xbd, 0x96, 0xfd, 0x75, 0x3d, 0xb2, 0x63, 0x35, 0x3d, 0xc0, 0x00, + 0xa3, 0x3b, 0x12, 0x16, 0x3d, 0x3d, 0x8e, 0xd2, 0x56, 0xbd, 0x02, 0xff, 0xec, + 0xbc, 0x96, 0x20, 0xcc, 0xbc, 0xf4, 0x61, 0x0b, 0x3d, 0x20, 0x12, 0x58, 0x3b, + 0x5a, 0xa3, 0x4c, 0x3d, 0x80, 0x86, 0x64, 0x3b, 0x0e, 0x77, 0x70, 0x3d, 0xd0, + 0x7b, 0xe8, 0xbb, 0x92, 0x2d, 0x20, 0xbd, 0xc8, 0x33, 0x6f, 0xbc, 0xf8, 0x0f, + 0x76, 0x3c, 0x3a, 0xea, 0x36, 0x3d, 0xc0, 0x6c, 0x47, 0x3b, 0x00, 0x3b, 0x98, + 0xbc, 0x88, 0x52, 0x3b, 0x3c, 0xa8, 0x58, 0x54, 0x3c, 0x5a, 0xff, 0x4f, 0x3d, + 0xfe, 0x26, 0x5e, 0x3d, 0x7c, 0x39, 0x8e, 0xbc, 0x96, 0x37, 0x75, 0x3d, 0xbd, + 0x95, 0x86, 0xbd, 0x6b, 0x40, 0x91, 0x3d, 0x40, 0x14, 0x3a, 0xbb, 0xf0, 0xe0, + 0x0f, 0xbc, 0xeb, 0x23, 0x82, 0x3d, 0xe0, 0x7c, 0x8e, 0x3b, 0x60, 0x71, 0x11, + 0xbc, 0x3e, 0x89, 0x2c, 0xbd, 0x9a, 0x0a, 0x7f, 0xbd, 0xe8, 0x86, 0xcd, 0x3c, + 0xd4, 0x1d, 0xfe, 0x3c, 0xc6, 0x1f, 0x63, 0x3d, 0xe8, 0x6a, 0x2d, 0x3c, 0xec, + 0xb5, 0x02, 0x3d, 0x78, 0xcb, 0xe0, 0xbc, 0x74, 0x19, 0x64, 0xbc, 0xf0, 0xf7, + 0x69, 0xbc, 0x11, 0x97, 0x92, 0xbd, 0xe2, 0x89, 0x8b, 0xbd, 0x36, 0xe1, 0xa2, + 0xbc, 0x38, 0x7d, 0xb2, 0xbc, 0xf4, 0x26, 0x16, 0x3d, 0x70, 0x40, 0x90, 0xbd, + 0xe0, 0x0a, 0x70, 0x3c, 0x86, 0xb8, 0x35, 0x3d, 0x67, 0xd7, 0x8d, 0x3d, 0xd0, + 0xdc, 0x17, 0xbc, 0x10, 0xf7, 0xcd, 0xbb, 0xfe, 0x64, 0x59, 0x3d, 0x34, 0xf3, + 0x3c, 0xbd, 0x40, 0xfe, 0xae, 0xba, 0xd1, 0x87, 0x85, 0x3d, 0x10, 0x58, 0x65, + 0xbd, 0x66, 0xaf, 0x5d, 0xbd, 0x42, 0x56, 0x5d, 0x3d, 0x7c, 0xce, 0x5f, 0xbd, + 0xc0, 0x38, 0x96, 0x3a, 0x33, 0x59, 0x90, 0x3d, 0x06, 0x1a, 0xa6, 0xbc, 0xd4, + 0xb0, 0x83, 0x3c, 0xa8, 0xf4, 0x07, 0x3c, 0xa5, 0x8f, 0x90, 0x3d, 0x36, 0xd8, + 0xc0, 0xbc, 0xf0, 0xf5, 0x31, 0x3d, 0x30, 0x56, 0x88, 0xbd, 0x3c, 0x96, 0x05, + 0xbd, 0x89, 0xc2, 0x89, 0x3d, 0x19, 0x10, 0x06, 0xbd, 0xa2, 0xaa, 0x63, 0x3d, + 0x5e, 0x9b, 0x76, 0xbd, 0xa5, 0x57, 0x8c, 0x3d, 0x48, 0xe9, 0x2a, 0x3c, 0xe0, + 0xd9, 0x3a, 0x3b, 0xd3, 0x1c, 0x7f, 0xbd, 0x8c, 0x60, 0x21, 0xbc, 0x38, 0xc1, + 0x67, 0xbc, 0xf0, 0x83, 0x62, 0x3c, 0x58, 0xcb, 0x3f, 0x3d, 0xc7, 0xd9, 0x83, + 0x3d, 0x3e, 0xf5, 0x90, 0xbd, 0xeb, 0xb8, 0x8b, 0xbd, 0x0a, 0x86, 
0x05, 0x3d, + 0x61, 0xb6, 0x39, 0xbd, 0x56, 0x8f, 0x04, 0x3d, 0x19, 0xbd, 0x33, 0xbd, 0x24, + 0xd1, 0x50, 0x3d, 0xd0, 0x14, 0xf8, 0x3c, 0x2c, 0x43, 0x49, 0x3d, 0x98, 0xa1, + 0x53, 0xbc, 0xc2, 0x43, 0x26, 0x3d, 0x8e, 0xed, 0xff, 0xbc, 0xb7, 0x58, 0x75, + 0xbd, 0x00, 0xb7, 0x85, 0x3a, 0x8c, 0xb1, 0x83, 0xbc, 0x08, 0x40, 0x92, 0xbd, + 0x35, 0x28, 0x08, 0xbd, 0x30, 0x4f, 0x84, 0x3c, 0x34, 0x0b, 0x22, 0xbc, 0x30, + 0x1a, 0x07, 0x3c, 0xaa, 0xd6, 0x87, 0xbd, 0xa2, 0xfd, 0x7d, 0xbd, 0xfe, 0xa0, + 0xb7, 0xbc, 0xa2, 0x0a, 0x33, 0x3d, 0x10, 0x60, 0xe4, 0xbb, 0x64, 0x49, 0x10, + 0xbd, 0xf4, 0xd0, 0x48, 0xbc, 0x12, 0x7a, 0x38, 0x3d, 0x28, 0xb9, 0xee, 0xbc, + 0x05, 0xbe, 0x50, 0xbd, 0xce, 0x2f, 0xd5, 0xbc, 0x04, 0x8f, 0x39, 0xbd, 0xa8, + 0x16, 0x0c, 0xbd, 0x64, 0xe1, 0x79, 0xbc, 0xd4, 0x20, 0x8c, 0x3c, 0x28, 0x73, + 0x1c, 0x3d, 0x20, 0x66, 0x97, 0x3c, 0x66, 0x6e, 0xc1, 0xbc, 0x6d, 0xfc, 0x91, + 0xbd, 0xc5, 0x79, 0x89, 0xbd, 0xd0, 0x3c, 0x90, 0x3c, 0xfc, 0x19, 0x55, 0xbd, + 0x72, 0x96, 0x80, 0xbd, 0x80, 0x81, 0x46, 0x3d, 0xea, 0x10, 0x30, 0x3d, 0x00, + 0xdc, 0xe2, 0x3b, 0x44, 0x30, 0x78, 0xbc, 0x3a, 0x5b, 0x39, 0x3d, 0x00, 0x8d, + 0x8c, 0xbb, 0x70, 0x9f, 0x3b, 0xbc, 0x1c, 0xa9, 0x5c, 0xbc, 0x04, 0xa9, 0xe4, + 0xbc, 0x3a, 0xd9, 0x39, 0x3d, 0xa0, 0x11, 0xfd, 0x3c, 0x76, 0x3b, 0xf9, 0xbc, + 0xb9, 0xdd, 0x6f, 0xbd, 0xf5, 0xcb, 0x91, 0xbd, 0xee, 0x45, 0x5d, 0xbd, 0x13, + 0x1c, 0x8d, 0xbd, 0x10, 0xb7, 0xb6, 0x3b, 0x60, 0xc8, 0x77, 0x3b, 0x70, 0x4d, + 0xbf, 0xbb, 0x38, 0x4f, 0x80, 0xbd, 0xa9, 0x6b, 0x92, 0xbd, 0x78, 0x8e, 0x7e, + 0x3c, 0x70, 0xd1, 0x6e, 0x3c, 0x79, 0x4c, 0x85, 0xbd, 0xcc, 0xac, 0x2b, 0x3d, + 0x49, 0x46, 0x5f, 0xbd, 0x68, 0x60, 0x6d, 0xbc, 0x50, 0x53, 0xe4, 0x3b, 0x35, + 0x39, 0x81, 0x3d, 0xf0, 0x01, 0x12, 0x3c, 0x4c, 0x27, 0x8b, 0xbd, 0xce, 0x8d, + 0x71, 0x3d, 0xcc, 0x9a, 0x8e, 0xbd, 0x9e, 0x6f, 0xcd, 0xbc, 0xea, 0x23, 0x19, + 0x3d, 0xac, 0xed, 0x95, 0x3c, 0x76, 0x32, 0x68, 0x3d, 0x08, 0xcc, 0x58, 0x3c, + 0xc8, 0xe2, 0xcc, 0x3c, 0xf1, 0x85, 0x81, 0x3d, 0x06, 0xdc, 0x6b, 0x3d, 0x16, + 0x15, 0xf0, 0xbc, 0xda, 0x56, 0x4e, 0x3d, 0x58, 0x5c, 0x90, 0xbc, 0xe4, 0x79, + 0x37, 0xbd, 0x40, 0x1b, 0x6a, 0xbd, 0x00, 0x4e, 0x63, 0x3b, 0xbc, 0xfc, 0x35, + 0x3d, 0xe6, 0x87, 0xf9, 0xbc, 0xb0, 0xfc, 0x0c, 0x3d, 0x96, 0x7f, 0x53, 0xbd, + 0x1e, 0xe1, 0x04, 0x3d, 0x10, 0x11, 0x87, 0x3c, 0xce, 0xd1, 0x42, 0x3d, 0x1c, + 0x27, 0xca, 0xbc, 0xd8, 0x71, 0xfa, 0x3c, 0xea, 0xce, 0x76, 0x3d, 0x2c, 0x0e, + 0xbc, 0x3c, 0x9b, 0x96, 0x48, 0xbd, 0x60, 0x7b, 0x93, 0xbb, 0x8a, 0x69, 0xa8, + 0xbc, 0xc0, 0xcd, 0x79, 0x3c, 0xd0, 0xe0, 0x87, 0xbd, 0xe6, 0x91, 0x53, 0xbd, + 0x96, 0xe0, 0x03, 0x3d, 0x8b, 0x7a, 0x81, 0xbd, 0x16, 0x64, 0x80, 0xbd, 0x84, + 0xac, 0x87, 0x3c, 0xf8, 0xb7, 0xfc, 0xbc, 0x63, 0x2a, 0x38, 0xbd, 0x5a, 0x71, + 0x35, 0xbd, 0xda, 0xff, 0x49, 0xbd, 0x50, 0xcd, 0xdb, 0xbb, 0xc0, 0x85, 0x37, + 0xbb, 0x2a, 0x21, 0x35, 0x3d, 0xb6, 0x59, 0xcc, 0xbc, 0x10, 0x02, 0xe7, 0x3b, + 0x78, 0xf5, 0x54, 0xbc, 0xb0, 0x3c, 0x58, 0x3c, 0xf4, 0x96, 0x59, 0x3d, 0x10, + 0xd7, 0xd2, 0xbb, 0x1a, 0x0c, 0x79, 0x3d, 0x48, 0x2c, 0x6b, 0x3c, 0xc0, 0x44, + 0x89, 0xbb, 0x5c, 0xf0, 0xa3, 0x3c, 0xd0, 0x1c, 0x07, 0x3d, 0x02, 0xcd, 0x94, + 0xbc, 0xa8, 0x51, 0x99, 0xbc, 0xc0, 0xb9, 0x40, 0x3c, 0xe0, 0x85, 0x86, 0x3c, + 0x74, 0x77, 0x9f, 0x3c, 0x15, 0xe0, 0x71, 0xbd, 0x00, 0xf1, 0xfc, 0xb9, 0x50, + 0x39, 0x11, 0x3c, 0xb7, 0x13, 0x81, 0x3d, 0x60, 0x31, 0xe5, 0x3c, 0x8c, 0x42, + 0xf6, 0xbc, 0x4c, 0x34, 0x8a, 0xbc, 0xb8, 0x26, 0xe6, 0x3c, 0xf4, 0x56, 0x69, + 0xbc, 0xcc, 0xb4, 0xa1, 0x3c, 0xf0, 0x8e, 0x48, 0xbd, 0xcb, 0xab, 0x91, 0xbd, + 0x00, 0xc4, 0x5e, 
0xbb, 0xdd, 0xf5, 0x8c, 0x3d, 0xc8, 0x1a, 0x8a, 0x3c, 0x1c, + 0x9c, 0xda, 0xbc, 0x89, 0x6e, 0x83, 0x3d, 0x00, 0x6e, 0x3c, 0x39, 0x80, 0x82, + 0xd0, 0x3a, 0x00, 0x09, 0xc2, 0xb9, 0x04, 0x06, 0x38, 0xbc, 0x0a, 0x7a, 0xf7, + 0xbc, 0x50, 0xac, 0x1d, 0x3c, 0x9e, 0xd8, 0xfa, 0xbc, 0xea, 0xed, 0x71, 0xbd, + 0x7f, 0xf6, 0x0a, 0xbd, 0x20, 0x2d, 0x30, 0x3b, 0xd0, 0x7c, 0x96, 0x3b, 0x2e, + 0x61, 0x3f, 0x3d, 0xb0, 0x0a, 0x2d, 0x3d, 0x80, 0xac, 0x47, 0xbb, 0x7a, 0x9e, + 0xe6, 0xbc, 0x50, 0x90, 0x44, 0x3c, 0x0d, 0x23, 0x8e, 0xbd, 0x00, 0x3a, 0x59, + 0x3a, 0x12, 0xa5, 0x52, 0xbd, 0xbc, 0x90, 0xac, 0x3c, 0x00, 0x77, 0xe1, 0x3a, + 0x83, 0x27, 0x8a, 0xbd, 0x40, 0xcd, 0xb0, 0xbc, 0x6a, 0xf8, 0x22, 0x3d, 0xc0, + 0xfe, 0xc8, 0xbb, 0x52, 0x28, 0x63, 0x3d, 0xb2, 0xd2, 0xbe, 0xbc, 0x80, 0x68, + 0x42, 0xbc, 0xa4, 0x31, 0x58, 0xbc, 0xae, 0xda, 0x3a, 0xbd, 0xcb, 0xd7, 0x80, + 0xbd, 0x32, 0x43, 0x60, 0x3d, 0x52, 0xc1, 0xa9, 0xbc, 0x18, 0x3a, 0x2d, 0x3c, + 0x8e, 0x17, 0x5f, 0xbd, 0x9d, 0xcc, 0x85, 0x3d, 0x5c, 0x7c, 0x12, 0x3d, 0xde, + 0x24, 0x78, 0x3d, 0xec, 0xba, 0x16, 0x3d, 0xd1, 0xb1, 0x3d, 0xbd, 0xf0, 0x7f, + 0xe3, 0x3c, 0xe0, 0xf7, 0xef, 0xbb, 0x28, 0x65, 0x18, 0xbd, 0x7a, 0x38, 0x48, + 0x3d, 0xad, 0xff, 0x81, 0xbd, 0x72, 0xe6, 0x69, 0x3d, 0x98, 0x35, 0x08, 0xbd, + 0x16, 0xb5, 0x3a, 0xbd, 0x26, 0x18, 0x52, 0xbd, 0xc4, 0xb5, 0xc9, 0x3c, 0xbc, + 0xcc, 0x93, 0x3c, 0x6e, 0x74, 0xc9, 0xbc, 0xae, 0x05, 0x14, 0x3d, 0x96, 0x6c, + 0x78, 0x3d, 0x48, 0xe7, 0x7a, 0xbc, 0xe2, 0x8b, 0x65, 0xbd, 0xda, 0x9c, 0x97, + 0xbc, 0xbc, 0xc8, 0xab, 0x3c, 0xf0, 0xb1, 0x5f, 0xbd, 0xbe, 0x43, 0x3d, 0x3d, + 0xf8, 0xc7, 0x81, 0xbd, 0xd0, 0xc7, 0xcd, 0x3c, 0xfe, 0x77, 0x72, 0xbd, 0x32, + 0x3c, 0x7c, 0x3d, 0xfa, 0x2e, 0x84, 0xbc, 0x4c, 0xbc, 0x04, 0x3d, 0xc6, 0x29, + 0x8f, 0xbd, 0x4c, 0x07, 0xb8, 0x3c, 0x51, 0xb8, 0x45, 0xbd, 0x4c, 0x84, 0x7b, + 0xbd, 0x8e, 0x26, 0x3e, 0xbd, 0x48, 0xcc, 0x96, 0xbc, 0xb0, 0x59, 0x32, 0x3d, + 0xd6, 0x47, 0xba, 0xbc, 0xf9, 0x32, 0x81, 0x3d, 0xb0, 0xb8, 0x88, 0xbb, 0x80, + 0x93, 0xfd, 0x3a, 0x4a, 0x8d, 0x39, 0x3d, 0x88, 0x34, 0xa1, 0x3c, 0x20, 0x3b, + 0x53, 0x3b, 0x10, 0x26, 0x35, 0x3d, 0x50, 0xab, 0x77, 0xbc, 0x89, 0x68, 0x69, + 0xbd, 0x56, 0xd0, 0x15, 0x3d, 0x56, 0x3f, 0x3e, 0xbd, 0xa0, 0x94, 0xb5, 0x3c, + 0xa9, 0x10, 0x90, 0xbd, 0xfa, 0xe9, 0x48, 0xbd, 0x66, 0x62, 0x6a, 0x3d, 0xdc, + 0x51, 0xb0, 0x3c, 0x20, 0x13, 0x4d, 0xbd, 0x40, 0xbf, 0xe5, 0xba, 0x50, 0x61, + 0x9e, 0x3b, 0xa0, 0xbd, 0xeb, 0xbc, 0xd9, 0x55, 0x48, 0xbd, 0x4c, 0xbf, 0x0e, + 0xbd, 0x80, 0x28, 0x20, 0x3b, 0xea, 0x77, 0x72, 0x3d, 0x08, 0xd6, 0x02, 0x3d, + 0x7b, 0x14, 0x42, 0xbd, 0x8c, 0x7f, 0x91, 0x3c, 0x82, 0xe4, 0x16, 0xbd, 0x30, + 0x61, 0xaf, 0x3c, 0xd2, 0x5c, 0x5a, 0xbd, 0xc0, 0x16, 0x69, 0x3b, 0xe9, 0x5b, + 0x84, 0x3d, 0x49, 0xc3, 0x7e, 0xbd, 0x90, 0x7f, 0xf7, 0x3c, 0x3e, 0xd5, 0x85, + 0xbd, 0x38, 0xb7, 0x43, 0x3c, 0x4e, 0x4d, 0xc0, 0xbc, 0x00, 0x78, 0xea, 0x3a, + 0x32, 0xb2, 0x92, 0xbd, 0xb0, 0xc3, 0x1d, 0x3c, 0x90, 0xc2, 0x23, 0x3c, 0x80, + 0x14, 0xc5, 0x3b, 0x00, 0xf1, 0x87, 0xbc, 0x26, 0xf4, 0x8a, 0xbd, 0x10, 0xa6, + 0x9a, 0x3b, 0x78, 0x8b, 0x72, 0xbd, 0x85, 0xef, 0x12, 0xbd, 0xd8, 0x93, 0x02, + 0x3d, 0x80, 0x8b, 0xca, 0x3a, 0x18, 0x72, 0x17, 0xbc, 0x65, 0x2d, 0x83, 0x3d, + 0xfb, 0xe9, 0x81, 0x3d, 0x60, 0xf3, 0x46, 0xbd, 0xb4, 0xab, 0x1a, 0xbc, 0x30, + 0x0c, 0xf9, 0x3c, 0xb6, 0xc5, 0x63, 0xbd, 0x8e, 0x20, 0xdd, 0xbc, 0x5c, 0x18, + 0x97, 0xbc, 0x10, 0x42, 0x43, 0x3d, 0x11, 0xab, 0x84, 0x3d, 0xec, 0xcf, 0x30, + 0x3d, 0x38, 0x0e, 0x6a, 0x3c, 0x3e, 0x40, 0xd9, 0xbc, 0xce, 0x14, 0x14, 0x3d, + 0x5c, 0xe6, 0x71, 0xbc, 0xf8, 0xd8, 0xf2, 0x3c, 
0x98, 0x96, 0x21, 0xbc, 0xbe, + 0xdb, 0x18, 0xbd, 0xe6, 0x7f, 0x28, 0xbd, 0xab, 0x56, 0x23, 0xbd, 0xc2, 0x40, + 0x8e, 0xbd, 0x8c, 0x92, 0xc3, 0x3c, 0xd4, 0x0a, 0x13, 0xbd, 0xbe, 0x25, 0x05, + 0x3d, 0x12, 0x58, 0x0d, 0x3d, 0xd7, 0x65, 0x79, 0xbd, 0x9c, 0x54, 0x4e, 0x3d, + 0x02, 0x2a, 0x40, 0x3d, 0xef, 0xcd, 0x01, 0xbd, 0x11, 0x5c, 0x92, 0x3d, 0xb0, + 0x03, 0x95, 0x3c, 0xa0, 0x08, 0x19, 0x3b, 0x79, 0xad, 0x8c, 0x3d, 0x19, 0x93, + 0x7a, 0xbd, 0x40, 0xfa, 0xc6, 0xbb, 0x68, 0xb6, 0xa8, 0x3c, 0x45, 0x29, 0x8d, + 0xbd, 0x90, 0x3e, 0x13, 0xbc, 0x1a, 0x2d, 0x70, 0x3d, 0xc1, 0xdd, 0x6a, 0xbd, + 0x50, 0x75, 0x01, 0xbd, 0xc1, 0x8d, 0x91, 0xbd, 0xdd, 0x3f, 0x84, 0xbd, 0xa3, + 0xc6, 0x8d, 0x3d, 0xce, 0x23, 0x5b, 0x3d, 0x7e, 0xfb, 0x7d, 0x3d, 0xd5, 0xf4, + 0x23, 0xbd, 0x4c, 0x65, 0x8d, 0xbc, 0xb0, 0x76, 0x89, 0xbd, 0x28, 0xc4, 0x82, + 0xbd, 0x40, 0x70, 0x71, 0x3b, 0xfa, 0x55, 0x8e, 0xbc, 0x40, 0x08, 0xf0, 0x3a, + 0x02, 0x81, 0x56, 0x3d, 0xfe, 0x51, 0xf8, 0xbc, 0x1a, 0xcd, 0x91, 0xbd, 0xfb, + 0x66, 0x7b, 0xbd, 0xb0, 0xbb, 0xf2, 0xbc, 0xbb, 0x24, 0x23, 0xbd, 0x5c, 0x6c, + 0x6d, 0xbd, 0x08, 0xa0, 0x8b, 0x3c, 0xb7, 0x93, 0x1d, 0xbd, 0x74, 0x9f, 0x21, + 0x3d, 0x1c, 0x43, 0x33, 0xbd, 0x66, 0x2c, 0x1c, 0xbd, 0xfe, 0xf5, 0x11, 0xbd, + 0x10, 0x32, 0xef, 0xbc, 0x40, 0x70, 0x6f, 0xbb, 0xa1, 0xca, 0x8f, 0x3d, 0x12, + 0x42, 0x13, 0x3d, 0x38, 0x2e, 0xf3, 0x3c, 0x16, 0x69, 0x77, 0x3d, 0x6d, 0xa9, + 0x1e, 0xbd, 0xdc, 0xf5, 0xba, 0xbc, 0xc4, 0xe8, 0x1f, 0xbd, 0xfc, 0xc7, 0x08, + 0x3d, 0x8c, 0x9a, 0x28, 0x3d, 0x80, 0xbb, 0x14, 0x3b, 0xce, 0x47, 0x68, 0x3d, + 0xd3, 0x75, 0x10, 0xbd, 0x30, 0x9e, 0xb1, 0x3b, 0x48, 0x08, 0x80, 0x3c, 0x53, + 0xbe, 0x7e, 0xbd, 0x54, 0xdd, 0x5c, 0xbd, 0x89, 0x15, 0x77, 0xbd, 0x20, 0x13, + 0x00, 0x3b, 0xab, 0x6a, 0x15, 0xbd, 0x70, 0x62, 0x0b, 0xbc, 0xb6, 0x69, 0x44, + 0x3d, 0x9e, 0x71, 0x44, 0x3d, 0xfb, 0x84, 0x1e, 0xbd, 0xc8, 0x25, 0x3e, 0xbc, + 0xa8, 0x9e, 0xa6, 0x3c, 0xa0, 0x0c, 0x0b, 0x3d, 0x48, 0xe7, 0xb1, 0xbc, 0x2f, + 0xfc, 0x8a, 0x3d, 0xbc, 0x2a, 0x27, 0xbc, 0x80, 0x69, 0x38, 0x3c, 0xa0, 0x89, + 0xb4, 0xbb, 0x10, 0xb6, 0x56, 0xbc, 0x80, 0xaa, 0x37, 0x3b, 0xbd, 0x66, 0x1d, + 0xbd, 0xb9, 0x3e, 0x6c, 0xbd, 0x14, 0xc1, 0x1e, 0x3d, 0x10, 0xd3, 0xa5, 0x3b, + 0x1c, 0x9a, 0x43, 0xbc, 0xa0, 0xb3, 0xdd, 0xbc, 0xf8, 0x82, 0xb8, 0x3c, 0xc8, + 0x76, 0x1b, 0x3d, 0x7e, 0x2b, 0x5c, 0x3d, 0x20, 0xd8, 0x7f, 0xbd, 0x88, 0xe0, + 0xa0, 0x3c, 0x1c, 0x48, 0x26, 0x3d, 0x50, 0x53, 0x1e, 0x3c, 0xf0, 0x07, 0x54, + 0x3c, 0xc9, 0xde, 0x05, 0xbd, 0x2c, 0x34, 0x84, 0x3c, 0xa8, 0x30, 0x1b, 0x3c, + 0x6c, 0xa1, 0x3c, 0xbd, 0x00, 0x58, 0xc1, 0xb8, 0xf0, 0xd4, 0xf9, 0x3b, 0xf0, + 0xb3, 0x2e, 0x3d, 0x14, 0xe3, 0x4f, 0x3d, 0x70, 0x0b, 0x73, 0x3c, 0x8b, 0xca, + 0x89, 0xbd, 0x9c, 0xd8, 0x85, 0x3c, 0x9c, 0x34, 0x4b, 0xbc, 0xf5, 0x38, 0x71, + 0xbd, 0x01, 0xe5, 0x84, 0x3d, 0xd4, 0xde, 0x25, 0xbc, 0x80, 0xc0, 0xb1, 0xbb, + 0x80, 0xca, 0xfc, 0x3b, 0x78, 0xe0, 0x2d, 0xbd, 0xda, 0x90, 0x29, 0xbd, 0x3a, + 0xdb, 0x37, 0xbd, 0x00, 0x81, 0xa1, 0xbb, 0x3a, 0xcb, 0x71, 0xbd, 0x1c, 0x8e, + 0x29, 0xbc, 0x68, 0x0a, 0x5f, 0xbc, 0x0f, 0x86, 0x91, 0xbd, 0x98, 0x61, 0x62, + 0x3c, 0x82, 0x06, 0x4e, 0xbd, 0xa0, 0x7a, 0x35, 0x3b, 0xfa, 0xbc, 0x31, 0x3d, + 0xee, 0x18, 0x3a, 0x3d, 0xe0, 0xf0, 0x9d, 0xbb, 0x87, 0xba, 0x8f, 0x3d, 0x0e, + 0x75, 0x24, 0x3d, 0x92, 0xf6, 0x77, 0x3d, 0x78, 0xda, 0x72, 0xbc, 0xe4, 0x5c, + 0x55, 0xbc, 0xe3, 0xbf, 0x87, 0x3d, 0x74, 0x55, 0x5c, 0xbd, 0x88, 0x2b, 0x0b, + 0xbc, 0x68, 0xd5, 0x21, 0x3d, 0x0a, 0x05, 0x94, 0xbc, 0x5f, 0xb7, 0x8a, 0x3d, + 0x48, 0x83, 0x5c, 0x3c, 0x08, 0x83, 0x77, 0xbc, 0xc4, 0x31, 0xd6, 0x3c, 0xb8, + 
0x48, 0x52, 0x3c, 0x00, 0xcb, 0xda, 0x3b, 0x32, 0x6a, 0x5f, 0xbd, 0x76, 0x7f, + 0x8f, 0xbd, 0xc0, 0xb7, 0xb2, 0x3c, 0x91, 0x5e, 0x1d, 0xbd, 0x92, 0x5d, 0x62, + 0x3d, 0x9c, 0x2b, 0x65, 0xbd, 0x3e, 0xe5, 0x2a, 0x3d, 0x29, 0xb7, 0x81, 0xbd, + 0x74, 0xa2, 0xda, 0x3c, 0x1a, 0xcb, 0x15, 0x3d, 0x56, 0x35, 0x60, 0x3d, 0x50, + 0x4a, 0x4f, 0xbc, 0xb2, 0x3c, 0x73, 0x3d, 0x88, 0x39, 0x71, 0xbd, 0xa0, 0x73, + 0x7d, 0xbd, 0x18, 0x14, 0xac, 0x3c, 0xa8, 0x1a, 0x57, 0x3d, 0x00, 0x3a, 0x77, + 0xbc, 0x2a, 0xd5, 0x93, 0xbc, 0x7e, 0x27, 0x41, 0x3d, 0xa0, 0x96, 0x19, 0x3d, + 0x18, 0x3e, 0xe5, 0x3c, 0x56, 0xda, 0x0d, 0x3d, 0xb2, 0x5f, 0x1d, 0x3d, 0x0c, + 0x27, 0xd6, 0x3c, 0xc6, 0x34, 0x89, 0xbd, 0x84, 0xe7, 0x65, 0xbd, 0xfc, 0x87, + 0xba, 0x3c, 0xd6, 0x7b, 0x3b, 0xbd, 0xe8, 0xf4, 0x49, 0xbd, 0x70, 0x19, 0x0d, + 0x3c, 0x5a, 0x0c, 0x18, 0x3d, 0xe6, 0x0e, 0x26, 0x3d, 0x12, 0xa0, 0x61, 0xbd, + 0xec, 0xa3, 0x26, 0x3d, 0xf4, 0xef, 0xe0, 0x3c, 0xdd, 0xc0, 0x88, 0xbd, 0x08, + 0x87, 0x0e, 0x3d, 0x2b, 0xb7, 0x18, 0xbd, 0xe6, 0xd5, 0x1f, 0xbd, 0x38, 0xc1, + 0x37, 0x3c, 0x88, 0x9a, 0x74, 0xbd, 0x04, 0xce, 0x04, 0x3d, 0x00, 0x5c, 0xab, + 0xbc, 0xbd, 0x47, 0x4b, 0xbd, 0xf0, 0xc1, 0x33, 0xbc, 0x2c, 0x4d, 0xca, 0x3c, + 0x84, 0xfd, 0xed, 0xbc, 0x6c, 0xf2, 0x2c, 0x3d, 0x1b, 0x24, 0x87, 0x3d, 0x7a, + 0x67, 0x8f, 0xbc, 0x84, 0xab, 0x50, 0xbc, 0x84, 0xd2, 0x0b, 0x3d, 0x18, 0x03, + 0x03, 0x3d, 0x80, 0x54, 0x01, 0x3d, 0xbc, 0x41, 0xd8, 0x3c, 0x60, 0xe4, 0x34, + 0x3d, 0x3d, 0xfb, 0x26, 0xbd, 0xcc, 0x6f, 0x1f, 0x3d, 0xc0, 0xb0, 0x30, 0xbb, + 0x7f, 0xb2, 0x83, 0xbd, 0x8f, 0xed, 0x91, 0x3d, 0xa0, 0xe6, 0xe2, 0xbb, 0xfa, + 0x94, 0x67, 0x3d, 0x70, 0xd4, 0x69, 0xbd, 0x80, 0xba, 0xed, 0x3c, 0xce, 0x26, + 0xb8, 0xbc, 0xfe, 0xd9, 0x1c, 0x3d, 0xae, 0x09, 0x0e, 0x3d, 0x4f, 0x3d, 0x52, + 0xbd, 0x87, 0xde, 0x62, 0xbd, 0x02, 0x63, 0xff, 0xbc, 0x70, 0x60, 0xbd, 0x3b, + 0x3c, 0x3f, 0xe7, 0x3c, 0x9c, 0x9c, 0x34, 0xbd, 0x82, 0xcf, 0x82, 0xbd, 0xa2, + 0xdb, 0x39, 0x3d, 0x70, 0x89, 0xe8, 0x3c, 0xad, 0x61, 0x80, 0xbd, 0xd8, 0x58, + 0x34, 0xbd, 0xf6, 0x79, 0x5f, 0xbd, 0xd0, 0x9b, 0xc6, 0x3c, 0x02, 0x91, 0x0f, + 0x3d, 0x90, 0xe4, 0xc1, 0x3b, 0xff, 0xa7, 0x8e, 0x3d, 0x99, 0x07, 0x92, 0xbd, + 0x30, 0x36, 0xe4, 0x3b, 0xf0, 0xd6, 0x38, 0xbd, 0xea, 0x6d, 0x2d, 0xbd, 0x0e, + 0x11, 0xf6, 0xbc, 0x80, 0x5b, 0x53, 0x3b, 0x1c, 0x44, 0x41, 0x3d, 0xab, 0x98, + 0x7b, 0xbd, 0x20, 0x36, 0x71, 0x3b, 0x87, 0x93, 0x20, 0xbd, 0xb0, 0x35, 0x27, + 0xbd, 0xd2, 0x2b, 0x75, 0x3d, 0x90, 0x12, 0xdc, 0xbc, 0x06, 0x6c, 0x2b, 0x3d, + 0xe0, 0x86, 0x20, 0xbb, 0x9d, 0xdd, 0x88, 0x3d, 0xec, 0xe2, 0x19, 0x3d, 0x70, + 0x76, 0xb4, 0x3c, 0x0e, 0x49, 0x42, 0xbd, 0x34, 0x9c, 0xe3, 0x3c, 0xe0, 0x1d, + 0xf8, 0xbb, 0xfc, 0x83, 0xc2, 0xbc, 0xdc, 0xe1, 0x8d, 0xbc, 0x04, 0x9b, 0xa7, + 0x3c, 0x54, 0x5a, 0xfc, 0x3c, 0x80, 0x63, 0x14, 0xba, 0xcc, 0x46, 0x08, 0x3d, + 0x46, 0xf5, 0x2b, 0x3d, 0xe0, 0x8b, 0x48, 0x3d, 0xa0, 0x99, 0xfd, 0x3b, 0x41, + 0x57, 0x87, 0x3d, 0xe4, 0xcb, 0x56, 0xbd, 0x1f, 0xa4, 0x3f, 0xbd, 0xac, 0x66, + 0x85, 0x3c, 0xaa, 0x3a, 0x55, 0x3d, 0x32, 0x06, 0x29, 0x3d, 0x9a, 0xb8, 0x5a, + 0xbd, 0x00, 0xfc, 0xbb, 0xba, 0xd7, 0x80, 0x86, 0x3d, 0xb4, 0x7c, 0xf5, 0x3c, + 0xac, 0xf4, 0x36, 0x3d, 0x82, 0xef, 0x65, 0x3d, 0x49, 0x63, 0x5c, 0xbd, 0x66, + 0xe0, 0x8f, 0xbd, 0x42, 0x66, 0x28, 0x3d, 0xfc, 0xec, 0x08, 0x3d, 0x0a, 0x9c, + 0x1e, 0x3d, 0x65, 0x3c, 0x45, 0xbd, 0x73, 0x4f, 0x88, 0x3d, 0xec, 0x1e, 0xbf, + 0xbc, 0xee, 0xa7, 0x55, 0x3d, 0x10, 0x84, 0x57, 0x3c, 0xd4, 0x12, 0xdf, 0x3c, + 0xa8, 0x8f, 0x8f, 0xbd, 0x56, 0x80, 0x89, 0xbd, 0x08, 0xc5, 0x09, 0xbc, 0xfd, + 0x84, 0x22, 0xbd, 0xb2, 0x0a, 
0x66, 0x3d, 0x0a, 0x86, 0x61, 0x3d, 0x79, 0xf8, + 0x81, 0xbd, 0x7a, 0x81, 0x49, 0xbd, 0x88, 0x62, 0x7f, 0x3c, 0x8c, 0x81, 0x71, + 0xbd, 0x42, 0x9e, 0x86, 0xbd, 0x30, 0x5d, 0xf6, 0x3b, 0x6c, 0xc0, 0x29, 0xbc, + 0x88, 0x30, 0xdf, 0xbc, 0xda, 0xed, 0xf4, 0xbc, 0x98, 0x29, 0x34, 0xbd, 0xc0, + 0x10, 0xbe, 0x3a, 0x9b, 0x69, 0x8c, 0x3d, 0x40, 0x02, 0x98, 0xba, 0x2b, 0x85, + 0x76, 0xbd, 0x0c, 0xfd, 0xd3, 0x3c, 0x62, 0x37, 0x08, 0x3d, 0x0a, 0xe3, 0xe9, + 0xbc, 0x80, 0x1c, 0xc9, 0x3a, 0x54, 0x4b, 0x39, 0xbc, 0x28, 0xae, 0x7a, 0x3c, + 0x60, 0xd7, 0xe9, 0x3b, 0x08, 0xbe, 0x52, 0xbd, 0x04, 0x99, 0x3d, 0xbd, 0xd0, + 0xd2, 0x13, 0xbd, 0x1a, 0x86, 0x8e, 0xbc, 0xeb, 0xaa, 0x6a, 0xbd, 0x00, 0x23, + 0xa3, 0xb9, 0xc8, 0x76, 0x77, 0xbc, 0x36, 0x45, 0x72, 0xbd, 0xe4, 0xd7, 0x8a, + 0xbc, 0xfd, 0xfa, 0x8c, 0x3d, 0x2b, 0xc3, 0x07, 0xbd, 0x6d, 0xd0, 0x87, 0x3d, + 0xec, 0xa4, 0xde, 0x3c, 0x92, 0x4b, 0x65, 0x3d, 0x20, 0x6c, 0x2c, 0xbd, 0x00, + 0xb7, 0x0c, 0x3b, 0x96, 0x7f, 0x4b, 0x3d, 0xec, 0xe9, 0xdb, 0xbc, 0xaa, 0x06, + 0x3b, 0x3d, 0x20, 0x8c, 0x33, 0x3d, 0xe1, 0x03, 0x18, 0xbd, 0xe0, 0xa5, 0x0a, + 0xbc, 0x30, 0x1d, 0x5f, 0x3c, 0xfc, 0x28, 0x6d, 0xbd, 0x43, 0x41, 0x90, 0x3d, + 0x58, 0x87, 0x30, 0x3c, 0xdd, 0x8c, 0x60, 0xbd, 0xec, 0x2a, 0xba, 0xbc, 0xf2, + 0x9d, 0xa9, 0xbc, 0x30, 0xb0, 0x06, 0x3c, 0x68, 0x3e, 0x53, 0x3c, 0x78, 0xab, + 0xff, 0xbc, 0xa8, 0x34, 0x0d, 0xbc, 0x4e, 0x3f, 0x01, 0x3d, 0x00, 0x96, 0x44, + 0x3b, 0x2c, 0xa3, 0xda, 0x3c, 0xba, 0xc4, 0x2e, 0xbd, 0x72, 0xbd, 0x2f, 0x3d, + 0xfc, 0x1b, 0x7d, 0xbc, 0x9e, 0xbf, 0x7e, 0x3d, 0x02, 0x94, 0x19, 0x3d, 0x94, + 0x36, 0x4f, 0x3d, 0xf1, 0xee, 0x68, 0xbd, 0x54, 0x9c, 0x87, 0x3c, 0xfa, 0x3e, + 0x7e, 0x3d, 0x02, 0xec, 0x84, 0xbc, 0x12, 0xe7, 0x89, 0xbd, 0xa4, 0x90, 0xa6, + 0x3c, 0x3c, 0x7a, 0x89, 0xbc, 0x86, 0x5d, 0x54, 0x3d, 0xa4, 0xad, 0x53, 0xbc, + 0x32, 0xc5, 0x00, 0x3d, 0x1e, 0x53, 0x0b, 0x3d, 0xef, 0xae, 0x02, 0xbd, 0x7c, + 0xd8, 0x03, 0x3d, 0x38, 0x0e, 0xa5, 0xbc, 0x51, 0xc4, 0x83, 0x3d, 0x66, 0xcb, + 0x8f, 0xbd, 0xa6, 0xfe, 0xb6, 0xbc, 0xa4, 0xb1, 0x97, 0x3c, 0x00, 0xad, 0xb2, + 0x3a, 0x0f, 0xb7, 0x33, 0xbd, 0x37, 0x1f, 0x6f, 0xbd, 0x57, 0x39, 0x8c, 0x3d, + 0x54, 0xe4, 0xb7, 0xbc, 0x1e, 0x63, 0x52, 0xbd, 0x00, 0x3b, 0x43, 0xbd, 0x50, + 0x48, 0xf1, 0xbb, 0x18, 0x01, 0x81, 0xbd, 0x90, 0x1c, 0xaf, 0xbc, 0x06, 0xf8, + 0x7d, 0xbd, 0xf0, 0xe0, 0xa5, 0xbc, 0x08, 0x06, 0xc3, 0x3c, 0x22, 0xff, 0x83, + 0xbc, 0x4c, 0xef, 0x88, 0xbd, 0x36, 0xf2, 0x77, 0x3d, 0x54, 0x3b, 0xd4, 0xbc, + 0xa7, 0xa2, 0x8e, 0x3d, 0xac, 0xb2, 0x99, 0x3c, 0x10, 0x08, 0x88, 0xbb, 0x81, + 0x58, 0x8d, 0xbd, 0xf8, 0x25, 0x29, 0xbd, 0x1c, 0x0f, 0x26, 0xbd, 0x8e, 0x7a, + 0x81, 0xbd, 0x5c, 0x14, 0x8d, 0xbd, 0x81, 0xdd, 0x8f, 0xbd, 0xc8, 0xa2, 0x5f, + 0xbc, 0xc0, 0x48, 0xda, 0xba, 0xfe, 0x26, 0x14, 0x3d, 0xe2, 0x9a, 0x89, 0xbd, + 0x66, 0x8d, 0x59, 0x3d, 0xd8, 0xf8, 0x45, 0x3d, 0x0b, 0xb1, 0x04, 0xbd, 0x7a, + 0x32, 0xdd, 0xbc, 0x00, 0x01, 0x24, 0xbb, 0xc5, 0x97, 0x87, 0xbd, 0x7c, 0xea, + 0x46, 0x3d, 0x85, 0xc1, 0x81, 0x3d, 0xe8, 0x63, 0x24, 0x3d, 0x5d, 0xb3, 0x84, + 0xbd, 0xca, 0xa4, 0x04, 0x3d, 0xea, 0xe8, 0xf0, 0xbc, 0xdc, 0x41, 0x05, 0xbd, + 0xe8, 0x40, 0x4c, 0xbd, 0xb0, 0xb7, 0x2d, 0x3d, 0xa9, 0x0c, 0x1f, 0xbd, 0xd0, + 0x50, 0x97, 0x3b, 0x3f, 0x9c, 0x0f, 0xbd, 0xac, 0xa8, 0x59, 0xbd, 0xdb, 0x76, + 0x87, 0x3d, 0x08, 0xd7, 0x52, 0x3c, 0xc8, 0xf0, 0x1c, 0x3d, 0xec, 0xc1, 0x4a, + 0x3d, 0x44, 0x87, 0x81, 0x3c, 0xbe, 0x6f, 0x13, 0x3d, 0x80, 0x36, 0x49, 0x3c, + 0xae, 0xea, 0x73, 0x3d, 0x70, 0xd3, 0x2d, 0x3d, 0xde, 0xbb, 0x9d, 0xbc, 0xaa, + 0xba, 0x32, 0x3d, 0x7b, 0xc1, 0x3c, 0xbd, 0x42, 0x4e, 0x5f, 
0xbd, 0x9a, 0xd4, + 0x75, 0xbd, 0x52, 0x8d, 0x4a, 0x3d, 0xb4, 0x42, 0x8f, 0x3c, 0x20, 0x32, 0x92, + 0xbc, 0x39, 0x52, 0x0a, 0xbd, 0xd8, 0xf6, 0x21, 0xbd, 0x8b, 0x5e, 0x26, 0xbd, + 0x42, 0x45, 0x5b, 0xbd, 0x06, 0x86, 0x7f, 0xbd, 0x65, 0x5a, 0x57, 0xbd, 0x78, + 0x0a, 0x41, 0xbd, 0x5d, 0x12, 0x89, 0xbd, 0x40, 0x70, 0x34, 0xbc, 0xa0, 0x15, + 0x43, 0xbb, 0x76, 0xc5, 0x48, 0x3d, 0x40, 0x0b, 0x36, 0x3d, 0x40, 0x3a, 0x3f, + 0x3b, 0x58, 0xc4, 0xa3, 0x3c, 0x70, 0xdc, 0xdf, 0x3c, 0x50, 0x13, 0x1c, 0x3d, + 0xc0, 0x6d, 0xcc, 0xbb, 0x62, 0xc7, 0x32, 0xbd, 0x15, 0x3f, 0x8b, 0x3d, 0xb5, + 0x5b, 0x14, 0xbd, 0xf1, 0x00, 0x3f, 0xbd, 0x90, 0xe9, 0x53, 0x3c, 0xae, 0xa0, + 0x1f, 0xbd, 0x54, 0x4f, 0xc8, 0xbc, 0x7c, 0x0b, 0x3a, 0xbc, 0x96, 0x74, 0x38, + 0x3d, 0xa6, 0x9b, 0x3f, 0xbd, 0xf4, 0xfd, 0x88, 0xbc, 0x18, 0x1c, 0x97, 0xbc, + 0xc8, 0xcf, 0xea, 0x3c, 0xd9, 0x76, 0x8c, 0x3d, 0x3e, 0x07, 0x87, 0xbc, 0xa8, + 0xb5, 0x3f, 0x3c, 0x74, 0x96, 0x79, 0xbd, 0x30, 0xfc, 0x4e, 0x3c, 0x60, 0x75, + 0x25, 0x3d, 0x28, 0xd6, 0x7a, 0x3c, 0x38, 0xf6, 0x3e, 0x3c, 0x90, 0xd8, 0xf6, + 0xbc, 0x0a, 0x8b, 0x78, 0x3d, 0x94, 0x29, 0xc7, 0xbc, 0xa0, 0x3e, 0xe9, 0xbc, + 0x20, 0xfc, 0xa9, 0x3c, 0xde, 0xab, 0xd2, 0xbc, 0x97, 0x63, 0x8b, 0xbd, 0xa0, + 0xe7, 0x52, 0xbb, 0xa4, 0xf2, 0x36, 0xbc, 0x50, 0x49, 0xb9, 0xbb, 0x1f, 0x9e, + 0x88, 0x3d, 0x86, 0xea, 0x9d, 0xbc, 0x38, 0x1b, 0xf5, 0x3c, 0x46, 0xea, 0x1e, + 0xbd, 0x00, 0xad, 0x18, 0xba, 0x1e, 0x19, 0x6b, 0xbd, 0xa4, 0x1f, 0x90, 0x3c, + 0xf5, 0xb4, 0x42, 0xbd, 0x48, 0xf2, 0x1f, 0xbd, 0x26, 0x05, 0x12, 0x3d, 0x80, + 0x01, 0x58, 0xbd, 0xee, 0x98, 0x51, 0xbd, 0xb8, 0xcd, 0x96, 0xbc, 0x65, 0xbc, + 0x81, 0x3d, 0x90, 0x57, 0xcd, 0x3b, 0xa0, 0x9a, 0x30, 0x3c, 0xa6, 0xa4, 0x82, + 0xbd, 0x20, 0xa1, 0xc6, 0xbb, 0x95, 0x3a, 0x8c, 0xbd, 0x00, 0xa2, 0x72, 0x3c, + 0x00, 0xd6, 0x58, 0x3b, 0xc8, 0x1f, 0x7d, 0x3c, 0xf0, 0x98, 0xe1, 0xbb, 0x02, + 0x83, 0xe7, 0xbc, 0x9a, 0xc9, 0x67, 0x3d, 0xf5, 0x03, 0x90, 0xbd, 0x00, 0x9e, + 0x55, 0xba, 0x80, 0xa0, 0x05, 0x3b, 0x00, 0x53, 0x6d, 0x3c, 0x16, 0xc9, 0x6a, + 0x3d, 0x96, 0x11, 0x04, 0x3d, 0x10, 0x45, 0xff, 0xbb, 0xd2, 0x78, 0x2a, 0xbd, + 0xbb, 0xe1, 0x8d, 0xbd, 0x8c, 0x4a, 0xc7, 0xbc, 0x20, 0x1c, 0x23, 0x3d, 0x10, + 0xb3, 0xff, 0x3b, 0xd8, 0xec, 0x36, 0x3c, 0x64, 0xf1, 0xa7, 0x3d, 0x22, 0xd3, + 0xb0, 0xbd, 0xba, 0xd3, 0xc4, 0x3c, 0x7f, 0x35, 0x0a, 0x3d, 0xb1, 0xba, 0xc0, + 0x3d, 0x70, 0x6e, 0x10, 0x3c, 0x0b, 0x3f, 0x43, 0x3d, 0x75, 0x57, 0x4f, 0xbd, + 0xf7, 0xae, 0x5e, 0xbd, 0xd6, 0xc7, 0x9f, 0x3d, 0x15, 0x89, 0x08, 0x3d, 0x02, + 0x77, 0x49, 0x3c, 0x19, 0x3b, 0xc5, 0xbc, 0xa2, 0x8d, 0x43, 0xbd, 0x7b, 0x63, + 0x22, 0xbc, 0xb8, 0x4c, 0xbe, 0x3d, 0x98, 0x23, 0x2a, 0xbd, 0xd2, 0x49, 0x69, + 0xbd, 0x58, 0xae, 0x14, 0x3d, 0xdc, 0x52, 0x85, 0xbd, 0xd0, 0x91, 0xea, 0x3c, + 0x93, 0x04, 0x5c, 0x3d, 0xdf, 0xf9, 0x20, 0x3d, 0xd3, 0x87, 0x3f, 0xbd, 0xae, + 0xe4, 0x6a, 0x3c, 0xed, 0x34, 0x27, 0x3c, 0x79, 0x2d, 0x67, 0x3d, 0x63, 0xb8, + 0x57, 0xbc, 0x9f, 0x7f, 0x79, 0xbd, 0x44, 0x92, 0x9b, 0x3d, 0x60, 0x08, 0x40, + 0xbd, 0xde, 0x4c, 0x9c, 0x3c, 0xdd, 0x61, 0x21, 0x3c, 0x86, 0xd4, 0x15, 0xbd, + 0xf9, 0xd9, 0xe1, 0xbd, 0x40, 0xc7, 0x2f, 0x3d, 0xa7, 0x36, 0x89, 0x3d, 0x8a, + 0xdc, 0xa0, 0xbd, 0x5a, 0x12, 0x99, 0x3c, 0x8a, 0x63, 0xfa, 0xba, 0x77, 0x80, + 0xa2, 0xbd, 0x68, 0x8f, 0x19, 0xbc, 0x91, 0x17, 0xfc, 0x3c, 0xc7, 0x5f, 0xa0, + 0x3c, 0x21, 0x34, 0xf2, 0xbc, 0x09, 0x55, 0x1d, 0xbc, 0xcf, 0x87, 0x01, 0xbc, + 0xba, 0xe9, 0x8c, 0x3d, 0x07, 0xf7, 0x93, 0x3c, 0xe2, 0x86, 0x80, 0x3c, 0xd7, + 0xf7, 0x45, 0xbd, 0x8d, 0x5c, 0x55, 0x3d, 0x40, 0x89, 0x73, 0x3c, 0x7a, 0xe1, + 0x5c, 0x3c, 
0x6a, 0x34, 0xe7, 0xbc, 0x25, 0x79, 0xaa, 0x3a, 0x13, 0x23, 0xa1, + 0x3d, 0x4b, 0x1e, 0xe1, 0x3c, 0x49, 0xbb, 0xb5, 0xbc, 0xa6, 0x19, 0xa9, 0x3c, + 0x4e, 0xf1, 0x2a, 0x3d, 0x69, 0x81, 0xac, 0x3c, 0x00, 0x31, 0x46, 0x3c, 0x84, + 0x9b, 0x17, 0xbd, 0xa3, 0x50, 0x70, 0x3d, 0xf9, 0x6d, 0x91, 0xbd, 0x41, 0x1f, + 0xad, 0x3b, 0x9c, 0x7c, 0xa5, 0xbc, 0xd7, 0xa0, 0x8f, 0xbb, 0xfe, 0xeb, 0x05, + 0x3d, 0xc5, 0x31, 0xc5, 0x3a, 0x9a, 0x3c, 0x08, 0x3d, 0xc2, 0x6d, 0x27, 0xbd, + 0xa5, 0xc1, 0x7a, 0x3c, 0x4c, 0x25, 0x41, 0xbd, 0x3e, 0x6e, 0xd0, 0x3c, 0x6b, + 0x0e, 0x6d, 0x3d, 0xb4, 0x47, 0x86, 0x3c, 0x60, 0xc8, 0x03, 0x3d, 0x78, 0xb8, + 0xb3, 0x3d, 0xfb, 0x4b, 0x0d, 0x3d, 0x44, 0x4c, 0xc0, 0x3b, 0xd1, 0xa8, 0x33, + 0xbc, 0xf8, 0x4d, 0x8d, 0xbd, 0x3b, 0xeb, 0x15, 0xbd, 0x16, 0xef, 0x19, 0xbb, + 0x66, 0x45, 0x2c, 0xbd, 0x50, 0x0b, 0xab, 0xbb, 0x95, 0x0b, 0x06, 0xbd, 0x2c, + 0x1f, 0x33, 0xbd, 0xe4, 0xa5, 0xb7, 0x3a, 0xa0, 0xa0, 0xe4, 0xbc, 0x6c, 0x3b, + 0x65, 0x3d, 0x1e, 0xa8, 0x8b, 0x3b, 0xe0, 0xb7, 0x82, 0x3c, 0x3f, 0x77, 0x5b, + 0x3d, 0xd1, 0xd3, 0x0a, 0x3c, 0xdd, 0xbc, 0xaa, 0xbd, 0xb2, 0x81, 0x91, 0xbc, + 0x0f, 0xcb, 0x5d, 0x3d, 0x08, 0xa9, 0xf0, 0xbc, 0x9b, 0xc4, 0x0c, 0x3c, 0xf7, + 0x0d, 0x64, 0xbc, 0x1c, 0xa0, 0xa5, 0xbc, 0x5b, 0x1d, 0x2d, 0xbd, 0x03, 0x78, + 0x59, 0x3d, 0x1b, 0x8a, 0x13, 0x3d, 0xaa, 0x9c, 0x14, 0xbd, 0x57, 0xe2, 0xf1, + 0x3c, 0x5f, 0xaa, 0x58, 0x3d, 0x6c, 0x19, 0xb5, 0xbc, 0x20, 0xeb, 0x3c, 0x3d, + 0xe0, 0xda, 0xd5, 0x3c, 0x54, 0x6f, 0x6f, 0xbd, 0x91, 0x64, 0x82, 0x3d, 0xed, + 0xcd, 0x10, 0x3b, 0xec, 0x91, 0x1c, 0x3d, 0xad, 0xee, 0xc0, 0x3c, 0xb9, 0x84, + 0xb8, 0x3d, 0x67, 0xe4, 0x19, 0xba, 0xc5, 0xca, 0x00, 0x3b, 0xbc, 0x29, 0xcb, + 0xbc, 0xca, 0x3c, 0x20, 0xbd, 0x6e, 0xed, 0x2e, 0xbd, 0xd8, 0x47, 0x83, 0xbd, + 0x1f, 0x0b, 0x52, 0xbd, 0x10, 0x29, 0x29, 0x3c, 0xfa, 0x35, 0xd2, 0xbc, 0xbe, + 0x31, 0x1b, 0x3d, 0x9c, 0x28, 0xdc, 0xbc, 0xb7, 0x93, 0x70, 0xbb, 0x7b, 0xa8, + 0x83, 0xbc, 0xcb, 0xf0, 0x9a, 0x3c, 0x53, 0x7d, 0x31, 0xbd, 0x8a, 0x47, 0x4a, + 0x3c, 0xf2, 0xe7, 0x79, 0xbd, 0xe7, 0x10, 0x64, 0xbc, 0x69, 0xf1, 0xa9, 0xbc, + 0x5c, 0xfc, 0x9b, 0x3d, 0x5a, 0xcf, 0x14, 0x3d, 0xec, 0x08, 0x63, 0x3d, 0x69, + 0x0f, 0x99, 0xbd, 0x6a, 0x76, 0xeb, 0x3c, 0xbd, 0x2f, 0x8f, 0x3d, 0xa0, 0x54, + 0x8f, 0x3d, 0x7e, 0x08, 0x84, 0x3d, 0xba, 0x94, 0x42, 0x3d, 0x7c, 0xae, 0xf9, + 0xbd, 0x70, 0x32, 0x7f, 0x3c, 0x2f, 0xd3, 0x88, 0xbc, 0x9a, 0x1a, 0x49, 0x3d, + 0xf6, 0xed, 0x54, 0xbd, 0x7e, 0x15, 0x66, 0x3d, 0x81, 0x94, 0x7f, 0x3d, 0x4a, + 0xfb, 0x5f, 0x3c, 0xd7, 0x10, 0x3a, 0x3c, 0xf8, 0x02, 0x89, 0xbd, 0x9f, 0x9c, + 0xb9, 0xbc, 0x02, 0x4c, 0x5b, 0x3d, 0x80, 0xe7, 0x33, 0x3c, 0x55, 0x86, 0x99, + 0x3d, 0x9d, 0xa9, 0xad, 0xbd, 0x9e, 0x1b, 0x76, 0xbb, 0xb8, 0x62, 0x49, 0x3d, + 0x22, 0x21, 0x65, 0x3d, 0x22, 0x6d, 0x0f, 0x3d, 0x60, 0x23, 0x87, 0xbc, 0xc8, + 0xfc, 0x26, 0xbd, 0xc5, 0x47, 0x8c, 0xbd, 0x22, 0x6e, 0xe2, 0xbc, 0xf0, 0x78, + 0x2e, 0x3d, 0xa4, 0x7f, 0xa5, 0xbc, 0xf1, 0x41, 0xae, 0x3d, 0xa4, 0x08, 0x0b, + 0x3d, 0xe8, 0xbb, 0x1c, 0xbc, 0xf8, 0xdd, 0x85, 0xbc, 0x72, 0x87, 0xea, 0x3c, + 0x4a, 0xaa, 0x9a, 0x3d, 0x86, 0xdb, 0xb6, 0x3d, 0x0f, 0xb5, 0xd1, 0xba, 0xfc, + 0x88, 0x62, 0xbd, 0x08, 0x54, 0xfd, 0x3d, 0x35, 0xf8, 0x2e, 0xbd, 0x3b, 0xbb, + 0xc9, 0x3d, 0x9c, 0xb6, 0x57, 0x3d, 0x03, 0x65, 0x58, 0x3d, 0x13, 0xd0, 0x1d, + 0xbd, 0xbb, 0xb1, 0xbf, 0xbc, 0x78, 0x00, 0xde, 0xbc, 0x5c, 0xcb, 0x48, 0xbd, + 0xd3, 0xa1, 0x85, 0x3d, 0x08, 0x35, 0xf6, 0xbc, 0x4c, 0x66, 0x89, 0x3d, 0x09, + 0x92, 0xa6, 0xbc, 0x64, 0x99, 0x9e, 0xbd, 0xae, 0x80, 0x85, 0xbd, 0x99, 0xe0, + 0xe2, 0x3c, 0x8e, 0x75, 0x66, 0xbc, 0x1e, 
0x8c, 0xb9, 0xbd, 0x57, 0x43, 0xa8, + 0x3c, 0x31, 0x71, 0xac, 0xbc, 0xb5, 0x75, 0x01, 0x3d, 0x10, 0x39, 0x5c, 0xbd, + 0xa6, 0xf9, 0x7b, 0xbd, 0xf6, 0xea, 0x5d, 0x3d, 0xd3, 0x34, 0xc7, 0xbc, 0x4e, + 0xdc, 0x76, 0xbc, 0x7c, 0x98, 0x26, 0x3c, 0xfb, 0x7a, 0x27, 0xbd, 0x44, 0xe6, + 0x44, 0xbd, 0x26, 0xc5, 0xb2, 0x3d, 0xb1, 0x6e, 0xfa, 0xbd, 0x79, 0xcc, 0x29, + 0xbd, 0x08, 0xae, 0x46, 0xbc, 0x9d, 0x74, 0x67, 0x3d, 0xa3, 0xb6, 0x98, 0x3d, + 0x92, 0xae, 0x3f, 0xbc, 0xef, 0x8c, 0x90, 0x3d, 0xeb, 0x4c, 0x02, 0xbc, 0x21, + 0x7d, 0xe5, 0x3c, 0xd4, 0x6f, 0x47, 0xbd, 0x1a, 0xe8, 0x84, 0x3c, 0x0c, 0x96, + 0x85, 0xbd, 0xa9, 0x69, 0xa7, 0xbb, 0x8c, 0x1e, 0x82, 0xba, 0xff, 0x78, 0x04, + 0xbc, 0x25, 0xb9, 0xaa, 0xbd, 0x0b, 0x03, 0x48, 0xbc, 0xb3, 0xbb, 0x88, 0xbd, + 0x00, 0x26, 0xba, 0xbd, 0x82, 0x41, 0x81, 0x3d, 0xfa, 0x3d, 0xc7, 0x3c, 0x38, + 0x5c, 0x49, 0xbd, 0x0d, 0x4d, 0x3a, 0x3d, 0x67, 0x58, 0x0a, 0xbd, 0x7e, 0xf6, + 0x82, 0x3b, 0x1a, 0x7a, 0x7b, 0x3d, 0xba, 0xff, 0x84, 0x3c, 0x46, 0x87, 0x84, + 0x3c, 0xe8, 0x6c, 0x29, 0x3d, 0x8c, 0x6a, 0xac, 0xbc, 0x89, 0x34, 0x91, 0xbd, + 0xb9, 0xaf, 0xa6, 0x3c, 0xe0, 0x9e, 0xaf, 0xbc, 0xd2, 0x7a, 0x38, 0x3d, 0xac, + 0xbf, 0xc9, 0x3d, 0x73, 0xa1, 0x13, 0x3d, 0x7d, 0xe1, 0xf2, 0x3c, 0x73, 0xec, + 0xcf, 0x3b, 0xfd, 0x7b, 0x8e, 0x3d, 0x1e, 0xb2, 0xf3, 0xbc, 0xdc, 0x32, 0x03, + 0xbe, 0x5e, 0xfa, 0x1b, 0x3d, 0xdc, 0x1a, 0x25, 0x3d, 0x00, 0xcd, 0x48, 0xba, + 0x13, 0x9d, 0xbe, 0x3d, 0x2e, 0x05, 0x77, 0xbd, 0x17, 0x74, 0x9e, 0xbd, 0xae, + 0xc5, 0x62, 0x3c, 0x95, 0xf4, 0x59, 0x3d, 0x36, 0xd2, 0xa4, 0x3d, 0xab, 0x2b, + 0x84, 0xbc, 0x87, 0x89, 0x55, 0x3d, 0xd0, 0xde, 0x5d, 0xbc, 0xcd, 0xb0, 0xce, + 0xbc, 0x29, 0xa0, 0xc8, 0xbc, 0x8a, 0x0b, 0xf1, 0x3c, 0xb8, 0xce, 0x9c, 0x3c, + 0x14, 0xd1, 0x36, 0x3d, 0x50, 0x4b, 0x08, 0xbd, 0x85, 0x95, 0x4b, 0xbd, 0x31, + 0x9e, 0xcf, 0xbc, 0xff, 0x96, 0x83, 0x3d, 0x6c, 0x32, 0x15, 0x3c, 0x6d, 0xfd, + 0xb0, 0x3d, 0x05, 0xd8, 0x33, 0xbd, 0x1b, 0x74, 0x8d, 0xbd, 0xfb, 0x92, 0x21, + 0xbd, 0xde, 0x6c, 0x8f, 0xbc, 0xcc, 0x1e, 0x0f, 0xbd, 0xfa, 0xc4, 0xb8, 0xbb, + 0xc6, 0xe2, 0x1e, 0x3d, 0x9b, 0xd2, 0x99, 0xbb, 0x0f, 0x21, 0x5a, 0xbd, 0x32, + 0xb3, 0x8b, 0x3c, 0x08, 0x0c, 0x2e, 0x3b, 0x81, 0xda, 0x5f, 0xbd, 0x44, 0x42, + 0x81, 0x3c, 0x11, 0xf4, 0xb3, 0xbb, 0xf5, 0x91, 0xdd, 0xbd, 0x20, 0xdd, 0xb0, + 0x3b, 0x94, 0xc1, 0xe4, 0x3c, 0x7c, 0x2f, 0x5d, 0xbd, 0x8b, 0x1f, 0xf3, 0x3c, + 0xf7, 0xc1, 0xd1, 0xbd, 0x2e, 0x5f, 0x5d, 0xbd, 0x35, 0x2c, 0x92, 0x3b, 0x47, + 0x24, 0x34, 0x3d, 0x7f, 0x44, 0x71, 0x3d, 0x39, 0xd7, 0xfc, 0x3c, 0x60, 0x34, + 0x49, 0xbd, 0x70, 0xdc, 0x80, 0x3c, 0x3b, 0xe4, 0x5d, 0xbc, 0x7d, 0x7f, 0xe3, + 0x3c, 0x6d, 0x96, 0x2e, 0x3d, 0x7b, 0x5c, 0x15, 0x3d, 0xc3, 0x8f, 0x78, 0x3c, + 0x5b, 0x2f, 0x2d, 0xbc, 0x30, 0xfd, 0x3a, 0x3d, 0x79, 0x6a, 0xbb, 0x3d, 0x1a, + 0xb0, 0x4d, 0x3c, 0xe2, 0x91, 0x9a, 0x3b, 0x3c, 0x03, 0xa4, 0x3d, 0xa9, 0x2a, + 0x3a, 0xbd, 0xfc, 0xbb, 0x88, 0x3d, 0x16, 0x7f, 0x2a, 0x3c, 0xdd, 0xfc, 0x43, + 0x3d, 0x41, 0x34, 0x3f, 0x3d, 0x80, 0x68, 0x76, 0xbd, 0xbb, 0xab, 0xa9, 0x3d, + 0x4f, 0x4c, 0x17, 0x3d, 0xa3, 0x6e, 0x48, 0x3c, 0x24, 0xdf, 0xed, 0xbc, 0xa9, + 0xca, 0x8e, 0xbd, 0x28, 0x64, 0x51, 0x3d, 0x65, 0xea, 0x94, 0x3d, 0x80, 0xc3, + 0x08, 0x3b, 0xba, 0xc6, 0x38, 0x3d, 0xa3, 0x2f, 0x64, 0xba, 0x16, 0xc1, 0x28, + 0x3d, 0xfb, 0x5a, 0x4c, 0x3c, 0xd9, 0x21, 0x26, 0xbd, 0xb9, 0x19, 0xbd, 0x3d, + 0xba, 0x00, 0x59, 0x3c, 0xeb, 0x40, 0x14, 0xbc, 0x24, 0x37, 0xe9, 0xbc, 0x5e, + 0x99, 0xd0, 0xbc, 0x7c, 0xbc, 0x18, 0xbd, 0x71, 0x23, 0x56, 0x3d, 0xca, 0xa7, + 0x30, 0xbe, 0x37, 0x29, 0x5b, 0xbd, 0x73, 0xfa, 0x30, 0x3d, 0xb7, 0x67, 
0xcd, + 0xbc, 0x92, 0xa3, 0x54, 0x3c, 0xf8, 0x54, 0xaa, 0x3d, 0xba, 0x13, 0x8c, 0x3d, + 0x35, 0xa3, 0xa6, 0x3c, 0x11, 0x44, 0x1d, 0xbc, 0x56, 0xe4, 0x18, 0xbd, 0xd6, + 0x33, 0xab, 0x3c, 0x2c, 0x70, 0xa8, 0xbc, 0xa0, 0xd7, 0xc8, 0xb8, 0x56, 0xd9, + 0x69, 0x3d, 0xab, 0xaf, 0x5e, 0xbd, 0x09, 0xbf, 0xb1, 0xbd, 0xad, 0xf1, 0x50, + 0x3c, 0xe0, 0x69, 0x47, 0xbd, 0x21, 0x32, 0x2b, 0xbb, 0x66, 0x24, 0x90, 0xbd, + 0xf8, 0xca, 0xbf, 0xbc, 0x1f, 0x85, 0x02, 0xbd, 0xc9, 0x47, 0xa6, 0x3d, 0xaa, + 0xeb, 0x9b, 0xbc, 0xcf, 0x49, 0x88, 0xbd, 0x40, 0xf0, 0x4e, 0xbc, 0xe3, 0x45, + 0x16, 0x3d, 0xd4, 0x2e, 0xa4, 0xbc, 0xaf, 0xe6, 0x81, 0x3d, 0x62, 0xef, 0x2c, + 0xbc, 0x95, 0xea, 0x63, 0xbd, 0x33, 0x76, 0x9e, 0x3d, 0x16, 0xdf, 0xd6, 0xbd, + 0xa4, 0xb0, 0xde, 0x39, 0xee, 0xfc, 0x89, 0x3d, 0xbd, 0x48, 0xbe, 0x3b, 0xd1, + 0xbb, 0x31, 0xbc, 0x69, 0x1b, 0x26, 0xbd, 0xc1, 0x34, 0xec, 0x3c, 0x33, 0x47, + 0xd5, 0x3c, 0xd0, 0xfb, 0x5c, 0x3b, 0xec, 0x71, 0x27, 0xbc, 0x48, 0x88, 0x62, + 0x3c, 0x60, 0x89, 0x76, 0x3b, 0x4c, 0x07, 0xe8, 0x3c, 0xd5, 0xb4, 0x16, 0x3d, + 0x9d, 0x21, 0x9f, 0x3c, 0x9d, 0x78, 0xb3, 0xbd, 0xeb, 0x74, 0x21, 0xbd, 0xdb, + 0x5e, 0x75, 0xbd, 0x02, 0xf1, 0x9b, 0x3d, 0x50, 0x67, 0x30, 0xbc, 0xc4, 0xa7, + 0xe6, 0x3c, 0x77, 0x75, 0x6e, 0x3c, 0xfd, 0x7e, 0x9e, 0xbb, 0x79, 0xed, 0x77, + 0xbc, 0x18, 0x82, 0x40, 0x3d, 0x18, 0xd1, 0x93, 0x3d, 0x4a, 0xa2, 0x32, 0xbb, + 0x83, 0xd5, 0x51, 0x3c, 0xa1, 0x52, 0xd9, 0x38, 0x6a, 0x5e, 0xb4, 0x3d, 0x73, + 0xb2, 0x1f, 0xbd, 0x02, 0xe7, 0x06, 0xbd, 0x25, 0x20, 0x5c, 0xbd, 0x6a, 0x66, + 0x16, 0x3d, 0xef, 0x75, 0x7c, 0x3d, 0x4b, 0xa8, 0x89, 0x3d, 0x17, 0x5e, 0x82, + 0xbc, 0xd7, 0x41, 0x80, 0x3d, 0x67, 0x41, 0xaf, 0xbc, 0x93, 0x11, 0x9b, 0x3d, + 0x4a, 0x03, 0xb3, 0xbd, 0x0d, 0x82, 0x32, 0xbd, 0x39, 0x35, 0xee, 0xbc, 0x07, + 0x60, 0x87, 0xbd, 0x51, 0xb7, 0x4d, 0x3b, 0xe4, 0x6e, 0xbf, 0xbb, 0x24, 0x01, + 0x36, 0xbd, 0x24, 0x02, 0x10, 0xbd, 0xfe, 0x24, 0x4f, 0xbd, 0xaf, 0xc2, 0x34, + 0xbc, 0x21, 0x39, 0xd9, 0x3c, 0x80, 0x73, 0x88, 0x3c, 0x8e, 0xaf, 0x84, 0xbd, + 0x1e, 0x05, 0x8b, 0xbd, 0xd2, 0xa7, 0x0e, 0x3d, 0x53, 0xe6, 0x89, 0x3b, 0xf3, + 0xd7, 0xa7, 0x3d, 0x58, 0xf7, 0x29, 0x3d, 0xb1, 0x45, 0x9f, 0x3c, 0x3d, 0xf4, + 0x73, 0x3d, 0x73, 0xd2, 0x4d, 0xbd, 0x6f, 0x4a, 0x0f, 0x3d, 0xc1, 0x60, 0x95, + 0xbd, 0xf4, 0x0f, 0x8e, 0x3d, 0x83, 0x58, 0xed, 0xbd, 0x58, 0x39, 0x12, 0x3c, + 0x20, 0x58, 0x39, 0x3d, 0xf4, 0xc9, 0x14, 0x3d, 0x5f, 0xa1, 0x0a, 0x3d, 0xd0, + 0x80, 0x42, 0xbd, 0x2b, 0xc9, 0x35, 0xbd, 0xa5, 0xe0, 0xf9, 0xbc, 0x11, 0xe4, + 0x8b, 0x3c, 0x0f, 0x18, 0x33, 0xbd, 0xb7, 0x53, 0x8f, 0xbc, 0xa8, 0xfe, 0x4f, + 0xbd, 0x1f, 0x8d, 0xf9, 0x3b, 0x33, 0x31, 0xa6, 0x3d, 0xb7, 0x6d, 0x03, 0x3c, + 0x80, 0xaa, 0xda, 0xbd, 0x82, 0x6e, 0xc5, 0x3c, 0x22, 0xaa, 0xba, 0x3c, 0xfd, + 0xd9, 0xcd, 0x3c, 0x16, 0x60, 0x5a, 0x3c, 0x48, 0xdb, 0x36, 0x3d, 0x10, 0xf4, + 0x84, 0xbc, 0x78, 0xf4, 0x8c, 0x3d, 0x24, 0xd3, 0xf2, 0xbc, 0x8e, 0xac, 0x16, + 0xbd, 0x41, 0x7a, 0xf1, 0x3c, 0xd3, 0x25, 0x77, 0x3d, 0x26, 0xf2, 0x63, 0x3d, + 0x7a, 0xb2, 0xa0, 0x3d, 0x00, 0xbb, 0xa4, 0x3c, 0x11, 0xd2, 0xf7, 0xbc, 0x92, + 0x58, 0xa7, 0x3d, 0xa1, 0x9e, 0xaf, 0xbd, 0x38, 0xb3, 0x0b, 0x3c, 0xf3, 0xbb, + 0x62, 0x3c, 0x98, 0x07, 0x9c, 0x3d, 0xa3, 0x56, 0xba, 0xba, 0x1a, 0x8d, 0x95, + 0x3d, 0x13, 0x14, 0x7b, 0x3d, 0xfe, 0x05, 0xb3, 0x3d, 0xd2, 0x56, 0x01, 0x3c, + 0x9e, 0xad, 0x44, 0x3d, 0xc7, 0xd7, 0x98, 0x3c, 0x1e, 0xfb, 0x18, 0x3d, 0x58, + 0x4c, 0x53, 0xbc, 0xf2, 0x16, 0xf1, 0xbb, 0xae, 0x3a, 0xad, 0xbd, 0x3d, 0xdd, + 0x40, 0xbd, 0x9f, 0xa1, 0x9c, 0xbd, 0xb6, 0xb7, 0x09, 0xbc, 0x74, 0xc3, 0xbc, + 0xbd, 0x22, 0xf9, 0x61, 
0xbc, 0x71, 0x46, 0x80, 0xbc, 0x26, 0x48, 0x53, 0xbd, + 0x6a, 0xb7, 0x5d, 0x3d, 0xb9, 0xc9, 0x66, 0x3d, 0xaf, 0x27, 0x00, 0xbd, 0x24, + 0x28, 0xd3, 0x3a, 0x53, 0xfb, 0x5d, 0xbd, 0xf4, 0x8b, 0x8a, 0x3d, 0x80, 0x14, + 0x8e, 0xbd, 0x72, 0xcc, 0xa7, 0x3d, 0xd4, 0x5b, 0xff, 0xbc, 0xdf, 0x54, 0x43, + 0xbd, 0x6a, 0x25, 0xe1, 0x3b, 0xe2, 0xe9, 0x09, 0xbd, 0x55, 0xad, 0x63, 0xbd, + 0x14, 0xb6, 0xa9, 0x3b, 0x0c, 0xba, 0xd8, 0xbc, 0xc3, 0x6d, 0x53, 0xbd, 0x42, + 0xa5, 0x5f, 0xbd, 0x7b, 0x04, 0x22, 0xbd, 0x15, 0x56, 0x77, 0x3c, 0x53, 0x67, + 0xe6, 0xbc, 0x69, 0xe6, 0x89, 0x3c, 0x80, 0xcc, 0xbb, 0xbb, 0xea, 0x11, 0xb5, + 0x3d, 0x02, 0x35, 0xb6, 0x3b, 0x98, 0x78, 0x19, 0x3d, 0xae, 0x02, 0xdd, 0xbd, + 0x88, 0x78, 0x35, 0x3c, 0x30, 0x8b, 0x9d, 0xbd, 0xce, 0x4f, 0xad, 0xbd, 0x27, + 0xf3, 0xcf, 0x3c, 0xda, 0x15, 0x82, 0xbd, 0x50, 0x43, 0x86, 0x3c, 0xff, 0x0b, + 0xca, 0x3b, 0xec, 0x3f, 0xd1, 0xbc, 0x53, 0xc4, 0x15, 0x3d, 0x72, 0x9f, 0x12, + 0x3d, 0xcb, 0x3b, 0xcc, 0x3c, 0x90, 0xd2, 0x3a, 0x3d, 0x42, 0x53, 0x0d, 0xbc, + 0x46, 0x82, 0x93, 0x3d, 0xe9, 0x9a, 0xb1, 0xbd, 0x05, 0x99, 0x98, 0xbb, 0x52, + 0x17, 0x71, 0xbd, 0x6e, 0xb6, 0x8d, 0xbd, 0x0f, 0xe1, 0x66, 0xbd, 0x2b, 0x2f, + 0x1b, 0x3d, 0x97, 0x2f, 0xf4, 0xbc, 0xc0, 0xc0, 0x0f, 0x3d, 0xf3, 0x36, 0x6f, + 0x3d, 0x38, 0x99, 0x97, 0x3c, 0xca, 0x4a, 0xca, 0xbd, 0xe2, 0x66, 0x11, 0x3b, + 0xa8, 0xe8, 0x03, 0xbd, 0x60, 0xbf, 0x7e, 0xbb, 0x6d, 0x53, 0xb9, 0x3d, 0x50, + 0x02, 0x0c, 0x3c, 0xe3, 0x5f, 0xbb, 0xbd, 0xd1, 0xc0, 0xbd, 0xbc, 0x42, 0x35, + 0x89, 0x3d, 0x36, 0x8e, 0x9c, 0xbd, 0xac, 0x4a, 0x92, 0xbd, 0x7c, 0xb8, 0x65, + 0xbd, 0x77, 0xdd, 0x5e, 0xbd, 0x58, 0x55, 0x38, 0xbd, 0x2e, 0xa6, 0x67, 0x3c, + 0x7d, 0x81, 0x0b, 0xbd, 0x7b, 0xda, 0x92, 0x3d, 0x07, 0xec, 0x98, 0xbc, 0x6c, + 0x89, 0x35, 0xbd, 0x1b, 0x09, 0x0a, 0x3d, 0xca, 0x57, 0x27, 0x3c, 0xab, 0xff, + 0x2e, 0x3d, 0x97, 0xd7, 0x8d, 0xbd, 0xfa, 0x59, 0xb3, 0x3d, 0xb2, 0x38, 0x31, + 0x3d, 0xd2, 0x30, 0x2b, 0x3d, 0xa5, 0x8d, 0xa4, 0x3b, 0xc9, 0xca, 0xe4, 0x3c, + 0x0a, 0x75, 0x99, 0x3d, 0x3f, 0x85, 0x08, 0x3d, 0xff, 0x4e, 0x4e, 0x3d, 0x00, + 0xfb, 0x74, 0x3d, 0x90, 0x22, 0xb2, 0xbb, 0xed, 0xe6, 0x8c, 0xbb, 0x23, 0x48, + 0xe6, 0x3b, 0xfc, 0x6e, 0x62, 0xbd, 0xd5, 0x72, 0x58, 0x3d, 0xc8, 0x23, 0xce, + 0x3c, 0xf2, 0x1f, 0x3b, 0x3c, 0xd0, 0x69, 0xc6, 0x3b, 0x18, 0x15, 0x62, 0x3c, + 0xa8, 0x0a, 0x2b, 0x3d, 0x94, 0xed, 0x79, 0xbd, 0xf1, 0xff, 0x81, 0xbc, 0xb8, + 0x90, 0x3e, 0xbd, 0x4d, 0x8e, 0x25, 0x3d, 0x04, 0x91, 0xef, 0x3d, 0xb9, 0x57, + 0x17, 0x3d, 0x3a, 0xef, 0x01, 0xbd, 0xc4, 0x52, 0x59, 0xbc, 0x8a, 0x5e, 0x8e, + 0xbd, 0xe7, 0x23, 0xf5, 0xbc, 0x4f, 0xe7, 0x1f, 0xbd, 0x1f, 0x86, 0x82, 0xbc, + 0x1e, 0xf9, 0x53, 0x3d, 0xdf, 0x9c, 0x0a, 0x3c, 0xbf, 0xc9, 0xcc, 0x3c, 0xec, + 0xa1, 0x3e, 0xbc, 0x9c, 0x8e, 0x5e, 0x3a, 0xfd, 0xd8, 0x90, 0xbc, 0xe8, 0x4c, + 0xc7, 0xbc, 0xf2, 0x0f, 0x4b, 0x3a, 0x08, 0x9d, 0xbc, 0xbc, 0xab, 0x39, 0x4d, + 0x3d, 0xea, 0x3d, 0x6b, 0x3d, 0x5c, 0x84, 0x80, 0x3d, 0x7d, 0x95, 0xf8, 0xbc, + 0x70, 0xb2, 0x18, 0xbd, 0x2a, 0x02, 0x79, 0x3d, 0xe8, 0xd9, 0x3c, 0x3d, 0x67, + 0xaf, 0x29, 0x3d, 0x39, 0x45, 0x27, 0xbd, 0x0a, 0x7b, 0x12, 0xbd, 0xbb, 0xdc, + 0xe9, 0xbc, 0x73, 0x04, 0x83, 0xbd, 0x5d, 0xe4, 0x1c, 0xbd, 0xf0, 0x70, 0x29, + 0x3d, 0x87, 0x1e, 0x0d, 0xbd, 0x39, 0x86, 0xf0, 0x3c, 0xf5, 0x57, 0x3e, 0xbd, + 0xc8, 0x3c, 0x18, 0xbc, 0xf4, 0xa8, 0xa0, 0x3d, 0x5c, 0xa0, 0x6c, 0x3d, 0x02, + 0x7a, 0x7e, 0xbc, 0x0b, 0xb6, 0x6d, 0xbd, 0xb0, 0x9a, 0xa8, 0x3c, 0xee, 0x24, + 0x11, 0x3d, 0x54, 0x87, 0xf7, 0xbc, 0x57, 0x52, 0x70, 0xbd, 0x1e, 0x35, 0x46, + 0xbd, 0x38, 0x2d, 0x82, 0x3d, 0x9d, 0x1a, 0x3c, 0xbd, 
0x53, 0x7b, 0xa6, 0x3d, + 0x29, 0x4b, 0xab, 0x3d, 0x0c, 0x43, 0x2d, 0x3d, 0x1a, 0x12, 0x95, 0x3d, 0x3b, + 0xf1, 0x3e, 0x3d, 0x80, 0xf6, 0x8d, 0xbd, 0x1b, 0xb6, 0xb4, 0xbc, 0x98, 0x23, + 0x79, 0xbd, 0xb7, 0xf6, 0xc5, 0x3d, 0x10, 0xd5, 0x48, 0x3d, 0x58, 0x7c, 0x9f, + 0xbd, 0xa0, 0x5a, 0x16, 0xbd, 0x82, 0xfb, 0x8e, 0xbd, 0x0b, 0xec, 0xed, 0xbc, + 0x92, 0xb7, 0xa3, 0xbd, 0xd5, 0xfd, 0x85, 0xbd, 0x54, 0xc9, 0x20, 0x3d, 0xad, + 0xa1, 0x90, 0xbd, 0x83, 0xd6, 0xfb, 0xbc, 0xe2, 0x46, 0x43, 0x3b, 0xfe, 0xa6, + 0xbd, 0xb7, 0x8f, 0xd3, 0xaf, 0x3d, 0x75, 0xb9, 0x9d, 0x3d, 0xd5, 0xfc, 0x2a, + 0x3c, 0xc6, 0x7e, 0xd6, 0xbc, 0x08, 0xcd, 0x4c, 0xbd, 0xcf, 0x4f, 0x73, 0x3d, + 0x3e, 0x7f, 0xb7, 0xbc, 0xbc, 0xa9, 0xfd, 0xbc, 0xf4, 0x8b, 0xa6, 0xbc, 0x11, + 0x90, 0xd0, 0xbc, 0x47, 0xf7, 0x4d, 0x3c, 0xed, 0x09, 0x64, 0xbd, 0x61, 0x49, + 0x8d, 0xbc, 0xc8, 0xd3, 0x3c, 0x3d, 0x72, 0x23, 0x88, 0x3d, 0xc3, 0xa7, 0x2e, + 0x3d, 0x67, 0x01, 0x2d, 0xbd, 0xcc, 0x34, 0xa0, 0xbd, 0x7e, 0xc7, 0xf8, 0xbc, + 0x0c, 0xf5, 0xaf, 0xbb, 0x6e, 0xa6, 0x4f, 0x3d, 0xe2, 0xb9, 0x88, 0xbd, 0x87, + 0x6f, 0xf9, 0xbc, 0x82, 0x23, 0x16, 0x3c, 0x10, 0x0c, 0x69, 0x3b, 0xab, 0x02, + 0xe2, 0x3c, 0x57, 0x6a, 0x08, 0xba, 0x4e, 0xc7, 0x6a, 0x3d, 0x30, 0x86, 0x6d, + 0x3c, 0xee, 0xb3, 0x84, 0x3d, 0xf9, 0xc4, 0x3a, 0x3d, 0x6f, 0x21, 0x8d, 0xbb, + 0xef, 0x7e, 0xc1, 0x3b, 0x05, 0xca, 0x12, 0xbc, 0x8a, 0x77, 0x2b, 0xbd, 0x1e, + 0x23, 0x32, 0x3d, 0x32, 0x8b, 0x03, 0x3d, 0xd3, 0x33, 0x0a, 0xbd, 0x3f, 0xdd, + 0x59, 0xbd, 0x18, 0xfa, 0x00, 0x3d, 0x46, 0x0b, 0xdd, 0x3b, 0x96, 0x2b, 0x4c, + 0xbd, 0xc8, 0xcc, 0xa7, 0x3d, 0xe2, 0xad, 0x2e, 0x3d, 0xbc, 0x68, 0x54, 0x3d, + 0xcb, 0x88, 0xae, 0x3c, 0x00, 0xd8, 0x15, 0xbc, 0x18, 0x4b, 0xb5, 0xbd, 0x89, + 0x31, 0x93, 0xbd, 0x84, 0xd3, 0x57, 0x3d, 0x86, 0x2c, 0x6c, 0x3d, 0x18, 0x08, + 0xb1, 0x3d, 0x14, 0x61, 0xbc, 0xbc, 0x25, 0xa4, 0x27, 0xbd, 0xfa, 0xdd, 0xb7, + 0xbd, 0x81, 0xaf, 0x1d, 0xbc, 0x06, 0x91, 0x5d, 0x3d, 0x54, 0xfb, 0xc9, 0xbc, + 0x0b, 0x35, 0x9a, 0x3b, 0x48, 0x7f, 0x1c, 0xbd, 0xaa, 0x85, 0x54, 0x3d, 0x3e, + 0x43, 0xfe, 0xbb, 0xcb, 0xf9, 0xbf, 0x3b, 0x4b, 0x03, 0xed, 0x3c, 0xe0, 0x7f, + 0x85, 0x3d, 0xe2, 0x52, 0x82, 0x3d, 0x98, 0x11, 0x94, 0x3d, 0x39, 0x2d, 0x26, + 0x3c, 0xce, 0x96, 0x5e, 0xbd, 0x6c, 0x42, 0x31, 0xbd, 0xca, 0x90, 0xd4, 0x3b, + 0x66, 0xa9, 0xc0, 0xbd, 0x23, 0x2e, 0x8d, 0x3d, 0x26, 0xc8, 0x4a, 0xbc, 0x2a, + 0xbd, 0x09, 0xbd, 0x26, 0xa5, 0xe6, 0x3c, 0x1e, 0x7c, 0xaa, 0x3d, 0x1b, 0x52, + 0x15, 0x3d, 0xb2, 0xa4, 0x81, 0x3d, 0x73, 0x78, 0x8a, 0x3c, 0x60, 0x6d, 0x4a, + 0xbd, 0x60, 0xc1, 0x3b, 0xbc, 0x14, 0xc6, 0xfb, 0x3c, 0x48, 0x70, 0x05, 0xbd, + 0xc1, 0xa4, 0x98, 0x3d, 0x71, 0x0a, 0xc4, 0xbd, 0x25, 0xdd, 0x31, 0xbd, 0x99, + 0x3a, 0x94, 0xbd, 0xa1, 0x45, 0xbf, 0x3c, 0x54, 0x14, 0xbf, 0xbc, 0xfd, 0x98, + 0xd2, 0xbd, 0xca, 0x27, 0x87, 0xbd, 0x1a, 0x52, 0x3a, 0x3d, 0xc3, 0xcf, 0x42, + 0xbc, 0x4c, 0x2f, 0xe0, 0x3a, 0x96, 0x3f, 0x5e, 0x3b, 0xba, 0xc2, 0x1d, 0xbd, + 0xed, 0x26, 0x42, 0xbd, 0xf6, 0xe0, 0xb4, 0x3d, 0xbe, 0x39, 0x23, 0xbc, 0x05, + 0x9d, 0xba, 0x3c, 0xe9, 0x38, 0x2f, 0xbb, 0x15, 0x9c, 0xbb, 0x3d, 0x22, 0xca, + 0x66, 0x3c, 0x10, 0x16, 0xdb, 0xbc, 0x11, 0x3d, 0xda, 0x3d, 0xac, 0x48, 0x37, + 0xbd, 0xac, 0x3e, 0x08, 0xbd, 0x8b, 0xb1, 0x7f, 0x3d, 0xe7, 0x31, 0xa3, 0x3c, + 0xd5, 0xe9, 0xb6, 0x3d, 0x53, 0xc1, 0x19, 0xbd, 0x2f, 0xc2, 0x35, 0xbd, 0xf9, + 0xa6, 0xa2, 0xbd, 0x46, 0x22, 0x2b, 0x3d, 0x2a, 0x2c, 0x3b, 0xbd, 0xf3, 0x8e, + 0x07, 0x3c, 0xff, 0xb1, 0x09, 0xbd, 0xbd, 0x01, 0x0f, 0xbb, 0x04, 0x7f, 0x4a, + 0xbd, 0xb9, 0xca, 0x87, 0x3d, 0x4e, 0x96, 0x12, 0xbc, 0x7b, 0x9a, 0x7d, 0x3d, + 0x1b, 
0x48, 0x08, 0xbc, 0x1b, 0x36, 0x8a, 0x3d, 0xd1, 0x48, 0xe1, 0x3c, 0xb9, + 0xb0, 0x6f, 0x3d, 0x51, 0x6a, 0x83, 0xbb, 0xaa, 0xf0, 0xac, 0x3d, 0x61, 0xdb, + 0x43, 0xbd, 0x2e, 0xcf, 0xa2, 0x3d, 0xa6, 0x41, 0x89, 0x3d, 0x53, 0x86, 0xe1, + 0xbc, 0xda, 0x91, 0x9a, 0xbd, 0xba, 0xf7, 0x86, 0x3d, 0x8b, 0x8c, 0xab, 0xbd, + 0xa2, 0x2c, 0x6b, 0x3d, 0x31, 0x66, 0x83, 0x3c, 0xce, 0xd5, 0x0e, 0xbd, 0x35, + 0x29, 0x73, 0x3d, 0x9b, 0xf7, 0xb0, 0x3d, 0x51, 0x33, 0x21, 0x3d, 0x4c, 0xa1, + 0x4b, 0x3d, 0x58, 0xe3, 0xd5, 0xbc, 0x9f, 0xe4, 0x68, 0x3b, 0xed, 0x0b, 0x1e, + 0x3b, 0xc8, 0x06, 0x8c, 0x3c, 0x67, 0x47, 0x17, 0xbd, 0x63, 0xb4, 0xd1, 0xbc, + 0xf3, 0x34, 0x55, 0xbc, 0xde, 0x7b, 0x31, 0xbd, 0x17, 0x4e, 0x74, 0xba, 0x8b, + 0x65, 0x43, 0xbc, 0x01, 0xcc, 0xa0, 0x3d, 0xc7, 0x20, 0xa2, 0xbd, 0x63, 0x70, + 0x67, 0x3c, 0x65, 0xa0, 0x8d, 0x3d, 0xdf, 0xc9, 0x3d, 0xbc, 0x2f, 0xfa, 0x44, + 0x3b, 0xd2, 0xcf, 0x42, 0x3d, 0x9a, 0x40, 0x06, 0x3d, 0x67, 0x53, 0x4b, 0xbc, + 0x43, 0x50, 0x4a, 0x3c, 0x23, 0xb9, 0xa1, 0xbc, 0xad, 0x34, 0xe3, 0xbc, 0xac, + 0xc4, 0x4f, 0xbd, 0x4b, 0x40, 0xe5, 0xbb, 0xc3, 0xf1, 0x50, 0xbd, 0x98, 0x34, + 0x28, 0xbd, 0x28, 0xf8, 0xae, 0x3d, 0xd1, 0x27, 0x8f, 0x3c, 0xb4, 0x8c, 0x8b, + 0x3d, 0x73, 0xf2, 0x07, 0xbb, 0x65, 0x39, 0x61, 0xbd, 0x9a, 0x90, 0xcb, 0xbb, + 0x18, 0x2f, 0x8e, 0xbd, 0x65, 0xab, 0x4b, 0x3d, 0xd1, 0x40, 0x64, 0xbd, 0x10, + 0xdb, 0x83, 0xbd, 0x3b, 0x12, 0xa5, 0x3d, 0x31, 0x45, 0x78, 0x3d, 0xa4, 0xb1, + 0x26, 0x3d, 0xac, 0x10, 0x42, 0xbc, 0xbe, 0x62, 0xb3, 0xbd, 0x4e, 0x3d, 0x76, + 0x3c, 0x66, 0x0e, 0xde, 0xbc, 0x4f, 0x82, 0xd0, 0xbd, 0xf1, 0x86, 0x8e, 0xbd, + 0xf1, 0xe8, 0x37, 0x3c, 0xb7, 0xbb, 0x0e, 0x3d, 0x1c, 0xc4, 0x05, 0x3d, 0x15, + 0x50, 0x86, 0x3d, 0x81, 0x10, 0x92, 0x3b, 0x0a, 0xff, 0xed, 0x3c, 0x91, 0x9b, + 0xb3, 0xbb, 0xb5, 0xba, 0x26, 0xbc, 0x89, 0xef, 0x0f, 0x3d, 0x52, 0xde, 0x47, + 0x3d, 0x9d, 0x0f, 0x0c, 0x3d, 0x80, 0xee, 0xcb, 0xbd, 0xe2, 0xc7, 0x82, 0xbd, + 0x1a, 0xf6, 0x64, 0x3c, 0xaf, 0xa7, 0xbf, 0xbc, 0xfc, 0x41, 0x37, 0x3c, 0xf9, + 0x88, 0xfe, 0xbc, 0xdf, 0x47, 0x8d, 0xbc, 0x55, 0x09, 0x0b, 0xbd, 0x32, 0x50, + 0x00, 0xbd, 0x83, 0x62, 0xaf, 0xbc, 0xdc, 0xac, 0x5e, 0xbd, 0xb6, 0x22, 0x54, + 0xbd, 0x74, 0xd7, 0x00, 0x3c, 0xe3, 0x5a, 0xcb, 0xbc, 0xaa, 0x37, 0x25, 0xbd, + 0x64, 0x98, 0x5f, 0x3d, 0x81, 0xdf, 0x8b, 0x3c, 0x23, 0xef, 0x66, 0x3b, 0x84, + 0x67, 0x55, 0xbb, 0xd2, 0x11, 0x98, 0xbd, 0x2b, 0x15, 0x82, 0x3d, 0xeb, 0x1e, + 0xc6, 0x3c, 0x56, 0x83, 0xcb, 0xba, 0xd0, 0xc7, 0x2d, 0x3d, 0xd1, 0xcd, 0x0c, + 0x3d, 0xe4, 0x5c, 0x5a, 0xbc, 0x4a, 0xf3, 0x73, 0xbd, 0x43, 0xdc, 0xfe, 0x3c, + 0x00, 0xd6, 0x2f, 0x3d, 0x06, 0x22, 0x49, 0xbb, 0x4e, 0x45, 0x71, 0xbc, 0xb3, + 0x3c, 0x00, 0x3d, 0x1a, 0xae, 0x58, 0xbd, 0x15, 0x61, 0x92, 0x3d, 0x14, 0xb9, + 0xf8, 0xbc, 0x15, 0x2c, 0x1b, 0x3d, 0x31, 0x97, 0x3b, 0xbc, 0xe2, 0xe7, 0x18, + 0x3d, 0xcf, 0xf0, 0x1f, 0xbd, 0x7c, 0x1e, 0x0f, 0x3d, 0xb1, 0x27, 0x7f, 0xbd, + 0xb8, 0xdd, 0xb2, 0xbd, 0xcc, 0xc2, 0x44, 0x3d, 0x44, 0x5c, 0x06, 0xbd, 0x4f, + 0x6a, 0x4a, 0xbd, 0x43, 0x2c, 0x87, 0x3d, 0xb7, 0xe9, 0x48, 0xbd, 0x60, 0x01, + 0x07, 0xbd, 0x0b, 0xe4, 0x78, 0x3a, 0x92, 0x5d, 0x64, 0xbd, 0x7c, 0xcf, 0x81, + 0xbc, 0xe2, 0x59, 0xab, 0x3c, 0xf0, 0xbc, 0x68, 0xbc, 0xc3, 0x2d, 0x3d, 0x3d, + 0x27, 0xb2, 0xce, 0x3d, 0x44, 0x61, 0x0e, 0x3c, 0x94, 0x6d, 0x02, 0xbd, 0xe5, + 0x6f, 0xc2, 0x3c, 0x70, 0xab, 0x8a, 0x3a, 0x14, 0xab, 0x04, 0x3c, 0x9d, 0xd4, + 0xab, 0x3d, 0x0a, 0x7d, 0x64, 0x3c, 0x17, 0xb5, 0xce, 0x3b, 0x66, 0xbd, 0x24, + 0x3d, 0xed, 0xce, 0x77, 0xbd, 0xed, 0x6e, 0x7f, 0xbd, 0x70, 0xe8, 0x10, 0xbc, + 0x6a, 0x80, 0x37, 0x3d, 0x2d, 0x0b, 
0x83, 0x3d, 0x8e, 0x4b, 0x5e, 0xbd, 0xd6, + 0x38, 0x34, 0xbd, 0xce, 0xaf, 0x88, 0x3d, 0xef, 0x64, 0x10, 0xbc, 0xa0, 0x8b, + 0xac, 0xbd, 0x70, 0xa5, 0x50, 0x3c, 0x87, 0x3d, 0x83, 0x3d, 0x70, 0x63, 0x57, + 0xbd, 0xf3, 0x6a, 0x44, 0x3d, 0x3a, 0x49, 0xda, 0xbd, 0x1b, 0x74, 0xde, 0xbd, + 0x0d, 0xb2, 0x34, 0x3d, 0x04, 0x0f, 0x87, 0x3d, 0x04, 0xb1, 0x25, 0xbd, 0x5f, + 0x2c, 0x01, 0xbc, 0x9a, 0x55, 0x6b, 0x3b, 0xad, 0xdf, 0x5e, 0x3d, 0x7f, 0x85, + 0x2a, 0x3c, 0xfa, 0x88, 0xfa, 0xbc, 0x0d, 0x79, 0x8b, 0xbd, 0x01, 0x45, 0x73, + 0x3d, 0x11, 0xde, 0xb6, 0x3c, 0xcc, 0xb5, 0xa4, 0x3c, 0xe8, 0xc5, 0x67, 0xbc, + 0x66, 0x99, 0x92, 0x3d, 0x36, 0xb0, 0x79, 0xbd, 0x14, 0x41, 0xa7, 0x3d, 0xfe, + 0x98, 0xcf, 0x3c, 0x32, 0xf7, 0x0a, 0x3d, 0xa6, 0x4a, 0x45, 0x3d, 0x83, 0xa0, + 0x9e, 0x3d, 0x86, 0x2e, 0x71, 0x3d, 0x92, 0x9c, 0x4d, 0x3d, 0xed, 0x24, 0xeb, + 0xbc, 0x3e, 0xfe, 0xc0, 0xbc, 0xcd, 0x6e, 0x4f, 0x3c, 0x83, 0x86, 0xa5, 0xbd, + 0xa4, 0xd7, 0xa5, 0xbc, 0xe0, 0x9a, 0x38, 0x3d, 0xe2, 0x79, 0xcd, 0x3c, 0x4a, + 0xe2, 0xa1, 0x3c, 0x94, 0x66, 0xd1, 0xbc, 0xe6, 0xed, 0x9b, 0x3c, 0x68, 0xb1, + 0x41, 0x3b, 0x1b, 0x65, 0x0b, 0x3d, 0xdd, 0x50, 0xae, 0xbd, 0x29, 0xf9, 0xfc, + 0xbc, 0x33, 0xe6, 0x37, 0xbd, 0xb6, 0x53, 0xbb, 0x3c, 0x0c, 0x5e, 0xf6, 0x3d, + 0x75, 0xbb, 0xf6, 0xbc, 0xf8, 0xc6, 0x9a, 0x3d, 0x8f, 0xe5, 0xc4, 0x3c, 0x88, + 0xee, 0x33, 0xbc, 0x73, 0xb2, 0x87, 0x3c, 0xd4, 0xd8, 0x58, 0x3c, 0x15, 0x37, + 0x82, 0x3d, 0xc1, 0x4f, 0x38, 0xbc, 0xba, 0x8e, 0xf9, 0xbb, 0x7c, 0x56, 0xe0, + 0xbd, 0xca, 0x23, 0x94, 0xbc, 0x24, 0x41, 0xae, 0x3d, 0x89, 0x4e, 0x9a, 0x3c, + 0xcb, 0x28, 0xe3, 0x3c, 0xf1, 0xfa, 0x05, 0x3d, 0xe3, 0xa4, 0x80, 0xbd, 0x6f, + 0xda, 0x16, 0x3d, 0xc7, 0xee, 0x77, 0xbd, 0xa8, 0xe3, 0xb1, 0xbc, 0x6f, 0x70, + 0x90, 0xbc, 0x78, 0x35, 0x48, 0x3d, 0xac, 0xdb, 0x23, 0xbd, 0x4e, 0xbd, 0xe4, + 0xbb, 0x79, 0x88, 0xd0, 0xbb, 0xf2, 0xa9, 0xb6, 0xbd, 0x54, 0x46, 0x5d, 0xbd, + 0xc6, 0xb2, 0x95, 0x3d, 0xe6, 0x67, 0x52, 0x3d, 0xa6, 0x5d, 0x7f, 0xbd, 0x0b, + 0xe5, 0xad, 0x3b, 0x91, 0xf6, 0x0c, 0x3c, 0x33, 0x45, 0xab, 0xbc, 0xa7, 0x84, + 0xb3, 0xbc, 0xf5, 0xb0, 0x6c, 0x3c, 0x08, 0xc9, 0xb4, 0x3c, 0x61, 0x9d, 0x8b, + 0x3c, 0x0d, 0x19, 0x87, 0x3d, 0xaa, 0xbc, 0xd3, 0xbc, 0x85, 0x92, 0x8e, 0x3b, + 0xfc, 0x26, 0x49, 0xbd, 0x56, 0x7e, 0x7f, 0x3d, 0xf3, 0x85, 0x61, 0xbd, 0x8c, + 0x5b, 0xf0, 0x3c, 0x14, 0x09, 0x65, 0xbd, 0x66, 0x78, 0x38, 0xbb, 0x2c, 0x69, + 0x4d, 0xbd, 0x33, 0x31, 0x46, 0x3d, 0x6d, 0xb8, 0xa6, 0xbc, 0x69, 0x4e, 0xc3, + 0x3d, 0xc9, 0x54, 0x93, 0xbd, 0x1a, 0x80, 0x83, 0x3d, 0x06, 0x1b, 0xa8, 0x3c, + 0xf0, 0x64, 0x65, 0x3c, 0xae, 0xd7, 0xb2, 0x3d, 0x03, 0xc0, 0xf0, 0x3c, 0x9d, + 0xbf, 0x84, 0xbd, 0xa6, 0x60, 0xfd, 0xbd, 0x58, 0x27, 0x41, 0x3d, 0x3f, 0x70, + 0x9f, 0x3c, 0x13, 0x59, 0x37, 0xbd, 0x6b, 0x61, 0x4e, 0xbd, 0xb5, 0xf3, 0x26, + 0x39, 0x10, 0x99, 0xc5, 0x3c, 0x7c, 0xda, 0x28, 0x3d, 0x23, 0x7b, 0x78, 0x3b, + 0xa5, 0x5f, 0x1c, 0xbd, 0x8e, 0x82, 0xd0, 0x3c, 0x42, 0x5a, 0x29, 0x3d, 0x5c, + 0x7a, 0x1d, 0xb8, 0xf8, 0x4e, 0x3c, 0xbc, 0x24, 0xee, 0x52, 0x3b, 0x56, 0xfa, + 0x0b, 0x3d, 0xe2, 0xa4, 0xc4, 0x3b, 0xd1, 0x51, 0xe1, 0xbd, 0x22, 0xbb, 0x7f, + 0xbd, 0xd3, 0x54, 0x6d, 0x3d, 0x75, 0x61, 0xaa, 0x3d, 0x4a, 0xd4, 0x33, 0x3d, + 0x2d, 0x5f, 0x91, 0x3c, 0x38, 0xc6, 0xe3, 0xb9, 0x91, 0x94, 0x38, 0x3d, 0x87, + 0x92, 0xd5, 0x3c, 0xb3, 0x59, 0x34, 0xbd, 0x74, 0x48, 0x64, 0xbd, 0x90, 0xb1, + 0xba, 0x3c, 0xd1, 0x21, 0x97, 0x3c, 0xb9, 0x24, 0xa7, 0x3c, 0xa0, 0xe7, 0xe8, + 0xbd, 0xf1, 0xc5, 0x45, 0x3c, 0x93, 0x0e, 0x2e, 0x3d, 0x31, 0x84, 0xd5, 0xbc, + 0xd7, 0x86, 0xbf, 0x3c, 0x5b, 0xae, 0xb8, 0x3c, 0xc3, 0x7e, 0xf3, 
0xbc, 0xb1, + 0xd7, 0x0c, 0x3d, 0x2a, 0x33, 0xcc, 0x3d, 0x86, 0x09, 0x6b, 0x3d, 0xb6, 0xa4, + 0x97, 0x3d, 0x15, 0x03, 0x89, 0x3d, 0x5c, 0x5c, 0x85, 0x3d, 0x47, 0x39, 0x65, + 0x3d, 0xd2, 0x8b, 0x06, 0xbd, 0x6c, 0xed, 0x55, 0x3b, 0x30, 0xd5, 0x99, 0xbc, + 0x7d, 0x00, 0xb5, 0xbb, 0x54, 0xe8, 0x12, 0xbd, 0x8c, 0x6f, 0x3e, 0x3c, 0x07, + 0x15, 0x9a, 0x3d, 0xf2, 0x93, 0xa1, 0x3d, 0x0a, 0xf7, 0x7c, 0x3d, 0x89, 0xe9, + 0xc0, 0x3c, 0xc4, 0x63, 0x6d, 0x3d, 0x02, 0x6a, 0xa9, 0x3d, 0x85, 0x9b, 0x4b, + 0x3d, 0x20, 0x90, 0x99, 0x3c, 0xcd, 0xb5, 0x1f, 0x3d, 0x7f, 0x5e, 0x72, 0xbd, + 0x19, 0x42, 0x08, 0xbc, 0x4c, 0xd0, 0x60, 0xbd, 0x28, 0x45, 0x5d, 0xbd, 0x9f, + 0x9e, 0x95, 0xbd, 0xf8, 0x82, 0x82, 0xbd, 0x14, 0xd6, 0x3c, 0x3d, 0x55, 0x69, + 0x6e, 0x3d, 0x6e, 0xd1, 0x37, 0xbc, 0x6a, 0x72, 0x34, 0xbd, 0x67, 0x77, 0xa4, + 0xbc, 0xd0, 0xb2, 0xaa, 0x3d, 0xfa, 0xbb, 0x32, 0x3d, 0x5b, 0xfd, 0x1e, 0x3d, + 0x6b, 0x18, 0x8a, 0x3b, 0xd1, 0xe0, 0x3b, 0x3c, 0x0e, 0xaa, 0xb8, 0xbc, 0xd8, + 0x60, 0x73, 0x3d, 0x18, 0xea, 0xac, 0x3d, 0x0a, 0x98, 0x8c, 0xbd, 0xa8, 0xae, + 0x90, 0x3d, 0xa4, 0x92, 0x81, 0x3b, 0xfa, 0x7d, 0x67, 0x3d, 0xd1, 0x86, 0xad, + 0x3d, 0xa0, 0x03, 0x2e, 0xbc, 0xa7, 0x6d, 0xf7, 0x3c, 0x93, 0xfe, 0x81, 0x3d, + 0x55, 0x43, 0xdd, 0x3b, 0x9e, 0xc7, 0x19, 0x3d, 0xc1, 0x4e, 0x1e, 0x3d, 0x4a, + 0xb6, 0x3c, 0xbd, 0xae, 0x17, 0x16, 0xbd, 0xa1, 0xf5, 0x4d, 0xbd, 0x89, 0x2c, + 0x04, 0xbd, 0xd3, 0xeb, 0x93, 0x3d, 0x35, 0xae, 0x19, 0x3c, 0xf8, 0x48, 0xa5, + 0x3c, 0x94, 0x41, 0xf4, 0xbc, 0x67, 0x32, 0x41, 0xbd, 0x19, 0x2d, 0x38, 0x3d, + 0x57, 0x90, 0x6f, 0xbc, 0xea, 0xb3, 0x89, 0xbc, 0x73, 0x19, 0x5b, 0x3d, 0x9d, + 0x72, 0xae, 0x3d, 0xb9, 0x8b, 0x23, 0xbd, 0xa4, 0x13, 0x43, 0xbc, 0xd0, 0x4d, + 0x12, 0x3d, 0xd7, 0xa3, 0x38, 0xbd, 0xc9, 0xb4, 0xd5, 0x3d, 0x4b, 0x93, 0x24, + 0x3c, 0xd2, 0xfa, 0xe8, 0xbc, 0xdb, 0xa3, 0x0b, 0xbd, 0xc2, 0xdd, 0x5e, 0x3d, + 0x4c, 0x2c, 0xa5, 0xbd, 0xd2, 0x24, 0x77, 0xbd, 0x50, 0xd3, 0xa1, 0x3d, 0xca, + 0xe7, 0x00, 0x3a, 0xbf, 0x15, 0xed, 0xbc, 0x83, 0xc3, 0x60, 0x3d, 0xba, 0x44, + 0x82, 0x3d, 0xa4, 0x8d, 0x93, 0x3d, 0x7a, 0xdf, 0x92, 0xbd, 0x2e, 0x60, 0xcd, + 0x3b, 0x8a, 0xc9, 0x67, 0x3d, 0xbc, 0x59, 0x2e, 0xbd, 0xd6, 0x96, 0xb0, 0x3d, + 0x89, 0x2f, 0xd1, 0xbc, 0x18, 0xd2, 0x0c, 0xbc, 0xc4, 0xf8, 0x84, 0x3d, 0x50, + 0xc8, 0x52, 0xbd, 0xa8, 0xc1, 0x58, 0xbd, 0xa3, 0xe1, 0x26, 0x3d, 0x61, 0x05, + 0x00, 0x3d, 0x5d, 0xe9, 0x84, 0x3d, 0xc2, 0x44, 0x37, 0x3d, 0xfb, 0xf3, 0xb0, + 0xbc, 0x69, 0x4b, 0x6c, 0xbd, 0xa9, 0x6b, 0xa4, 0xbc, 0x77, 0x53, 0x84, 0x3c, + 0x12, 0x21, 0x0c, 0xbd, 0x0d, 0x59, 0x08, 0xbc, 0x44, 0xb6, 0x11, 0xbd, 0xaa, + 0xef, 0x8e, 0x3d, 0x4e, 0x39, 0x32, 0x3d, 0x40, 0x7f, 0x7a, 0xbd, 0xa8, 0x2d, + 0xbf, 0xbc, 0x3a, 0xff, 0x30, 0x3d, 0xff, 0x61, 0xbb, 0x3b, 0xc3, 0xdf, 0x96, + 0xbc, 0x22, 0x74, 0x53, 0xbd, 0x69, 0x07, 0x8a, 0xbd, 0x46, 0x58, 0xe0, 0x3c, + 0x91, 0x62, 0x31, 0xbd, 0x38, 0x57, 0x01, 0xbc, 0x09, 0x74, 0x93, 0xbc, 0x3e, + 0xb2, 0x8a, 0x3c, 0xd8, 0x12, 0x1d, 0xbd, 0xd7, 0xf6, 0xc2, 0xbc, 0x86, 0x55, + 0x11, 0x3c, 0x28, 0x0d, 0x70, 0x3d, 0x98, 0xa3, 0x8a, 0x3d, 0x7b, 0xf0, 0x93, + 0xbd, 0xc2, 0x7c, 0x0b, 0xbd, 0xfa, 0x05, 0xcc, 0x3c, 0x5f, 0x77, 0x19, 0x3d, + 0xe0, 0x09, 0xb3, 0x3c, 0x13, 0x77, 0x8a, 0xbc, 0x1f, 0x76, 0x36, 0x3c, 0xfb, + 0x4f, 0x97, 0x3d, 0x1f, 0xec, 0x31, 0x3d, 0xf9, 0x14, 0x79, 0x3d, 0x50, 0xab, + 0x92, 0xbd, 0xda, 0x3c, 0xf3, 0xba, 0x2f, 0x4d, 0x72, 0xbc, 0x0f, 0x3a, 0xc6, + 0x3c, 0x7e, 0xf5, 0x40, 0xbd, 0x0f, 0xf2, 0x87, 0xbd, 0xc9, 0x6e, 0xef, 0xbc, + 0x06, 0xec, 0xce, 0xbc, 0x3d, 0x26, 0x2b, 0xbd, 0x4a, 0x6a, 0x53, 0x3d, 0x1b, + 0x90, 0x1a, 0xbb, 
0x39, 0xb6, 0x23, 0x3d, 0xa2, 0xbd, 0x88, 0xbd, 0xd7, 0x0d, + 0x2a, 0xbc, 0xf5, 0xf6, 0x94, 0xbd, 0xf0, 0xd7, 0x52, 0xbc, 0x85, 0x99, 0x83, + 0xbd, 0xdd, 0xc4, 0x8c, 0xbd, 0xaa, 0x19, 0x4a, 0x3d, 0x26, 0x21, 0xec, 0x3c, + 0x0f, 0xe7, 0x1b, 0xbc, 0x39, 0x8e, 0xea, 0xbc, 0x03, 0xdc, 0x2f, 0xbd, 0x03, + 0x8c, 0x8c, 0x3d, 0xe4, 0xcb, 0x7f, 0xbc, 0xc6, 0xb9, 0xfd, 0x3b, 0x78, 0x5b, + 0x44, 0xbd, 0xd0, 0x3d, 0x89, 0xbc, 0xe0, 0xdb, 0xc2, 0xbc, 0x84, 0x8d, 0x39, + 0xbd, 0x9a, 0x7b, 0x9a, 0x3b, 0x5d, 0xb4, 0x88, 0xbc, 0xf3, 0xf0, 0x8e, 0xbd, + 0x27, 0x0c, 0x41, 0x3d, 0xe7, 0x60, 0xa0, 0x3c, 0x86, 0xb6, 0xa9, 0xbc, 0x15, + 0x55, 0x4f, 0xbd, 0xf4, 0x53, 0xfb, 0xbc, 0xdf, 0x4d, 0x0d, 0x3d, 0x06, 0x46, + 0x7d, 0xbd, 0x37, 0x4d, 0xb0, 0xbc, 0x7d, 0x65, 0x1e, 0xbd, 0x30, 0x1a, 0x00, + 0xbb, 0x16, 0x56, 0x28, 0xbd, 0xb4, 0xef, 0xdd, 0xbc, 0xcc, 0xbc, 0x40, 0xbd, + 0x95, 0xce, 0x84, 0xbd, 0x97, 0x26, 0x98, 0xbd, 0x86, 0x1f, 0x80, 0xbd, 0x64, + 0x16, 0x97, 0x3c, 0x9b, 0xd0, 0x22, 0x3c, 0x05, 0x08, 0x52, 0xbb, 0xd2, 0x11, + 0x8e, 0xbd, 0x3c, 0xa3, 0x8c, 0x3d, 0x4c, 0xdb, 0xa0, 0xbd, 0x24, 0xe2, 0x0a, + 0xbd, 0x24, 0x87, 0x69, 0x3c, 0x7c, 0x72, 0xb2, 0x3c, 0xda, 0xcd, 0x0c, 0x3d, + 0xd1, 0x51, 0x4c, 0x3d, 0xb6, 0xaf, 0x30, 0xbd, 0x07, 0xa0, 0x64, 0x3d, 0x09, + 0x30, 0x59, 0x3d, 0x68, 0xb3, 0x06, 0xbd, 0x01, 0x85, 0xe4, 0xbc, 0x10, 0x9f, + 0x2a, 0xbd, 0xe0, 0x85, 0x93, 0x3d, 0x71, 0xe0, 0x13, 0xbd, 0x28, 0x8b, 0x8e, + 0x3c, 0x53, 0x74, 0x71, 0xbc, 0x6a, 0x6d, 0xad, 0x3d, 0x88, 0xf7, 0x32, 0x3c, + 0xfb, 0xde, 0x41, 0x3c, 0x90, 0x33, 0x4c, 0xba, 0x89, 0xe4, 0x1d, 0x3c, 0x47, + 0x26, 0xb5, 0xbc, 0x5c, 0x9c, 0x9d, 0xbd, 0xd4, 0xe8, 0xdb, 0x3b, 0x7f, 0x88, + 0x99, 0x3d, 0x79, 0xd9, 0xb8, 0xbc, 0x76, 0x00, 0xb9, 0x3d, 0x74, 0x04, 0xb9, + 0xbc, 0xde, 0x84, 0x38, 0x3d, 0x5c, 0x38, 0x91, 0x3d, 0x80, 0x37, 0x04, 0xbd, + 0xfa, 0x1a, 0x34, 0x3d, 0x36, 0x16, 0x11, 0x3d, 0xf3, 0x66, 0x86, 0x3d, 0x84, + 0x83, 0x16, 0xbd, 0xec, 0x1a, 0x43, 0xbd, 0x06, 0xf8, 0x64, 0x3d, 0x96, 0x19, + 0x31, 0x3b, 0x75, 0x30, 0x9e, 0x3d, 0xf5, 0xfa, 0xd1, 0xbb, 0x96, 0xf3, 0xc8, + 0xbc, 0x84, 0x0f, 0x6d, 0xbd, 0xd1, 0x3e, 0x77, 0x3c, 0xbb, 0xb8, 0xf1, 0xbc, + 0x49, 0xf5, 0x70, 0x3d, 0x33, 0x33, 0x44, 0xbd, 0xc9, 0xca, 0xf5, 0x3c, 0x5d, + 0xe3, 0x2c, 0xbc, 0x06, 0x48, 0xb8, 0x3d, 0xfe, 0xac, 0x12, 0x3d, 0x1d, 0xd6, + 0x86, 0x3d, 0x54, 0xa5, 0x39, 0x3d, 0x4d, 0x88, 0xeb, 0x3c, 0x14, 0xe2, 0x3e, + 0x3c, 0xb5, 0xe9, 0xd3, 0xbc, 0x97, 0xe0, 0x7e, 0x3c, 0x9b, 0xa2, 0x5a, 0xbc, + 0x14, 0xab, 0x89, 0x3d, 0x4a, 0xdc, 0x93, 0x3d, 0xe8, 0xee, 0xb5, 0xbc, 0x5f, + 0x9a, 0x9b, 0x3b, 0x26, 0x69, 0x55, 0x3c, 0x7d, 0x50, 0x89, 0xbc, 0xe0, 0x93, + 0x8c, 0x3b, 0x44, 0xbc, 0x23, 0xbd, 0x47, 0x76, 0x85, 0x3d, 0xfd, 0x6a, 0x25, + 0x39, 0x3e, 0x57, 0x9c, 0x3d, 0x70, 0xdd, 0xd0, 0x3b, 0x40, 0xdf, 0x3b, 0x3d, + 0x47, 0x5c, 0xbd, 0xbc, 0x90, 0x3d, 0x33, 0xbd, 0xd8, 0xc6, 0x76, 0xbd, 0xf2, + 0xd8, 0x51, 0x3d, 0x17, 0x60, 0x9c, 0xbd, 0x32, 0x78, 0x1b, 0xbd, 0xb4, 0xef, + 0x70, 0x3d, 0xfa, 0x9d, 0xb6, 0x3b, 0x88, 0x5c, 0xe0, 0x3a, 0x47, 0x1b, 0xf8, + 0xbc, 0x3b, 0x66, 0xcb, 0xba, 0x30, 0xe1, 0x04, 0xbd, 0x58, 0xbe, 0x87, 0xbd, + 0xc2, 0xa5, 0x10, 0xbc, 0x48, 0x34, 0xa3, 0x3d, 0x44, 0xa4, 0x77, 0x3d, 0x7d, + 0xe5, 0x94, 0xba, 0x23, 0xd9, 0xa3, 0xbc, 0xf6, 0xf6, 0xc6, 0xbc, 0xea, 0xd8, + 0x31, 0xbd, 0x9f, 0x50, 0x24, 0x3d, 0xc8, 0x2a, 0x37, 0x3d, 0xaf, 0xe4, 0x82, + 0x3d, 0x28, 0x20, 0x70, 0x3d, 0xa3, 0x27, 0x52, 0x3d, 0xbd, 0x34, 0x8a, 0x3c, + 0x8c, 0x2c, 0xde, 0x3c, 0x35, 0xf4, 0x70, 0xbd, 0x35, 0x89, 0x19, 0x3d, 0x54, + 0x59, 0x46, 0xb9, 0xa6, 0xfb, 0xc0, 0xbc, 0x56, 
0x95, 0x8d, 0x3d, 0xd1, 0x4f, + 0x71, 0x3d, 0xe1, 0xe3, 0x9f, 0x3d, 0x05, 0xe2, 0x82, 0xbd, 0xb7, 0xcf, 0x06, + 0x3d, 0x02, 0x28, 0xa3, 0xbc, 0xd0, 0xcf, 0x48, 0x3d, 0x8e, 0x69, 0x3b, 0xbc, + 0x1e, 0x83, 0x14, 0xbb, 0x72, 0x67, 0x82, 0x3b, 0x64, 0x7d, 0xeb, 0xbc, 0x2a, + 0x76, 0xe5, 0xba, 0x6a, 0xd8, 0x3c, 0xbd, 0x10, 0xc0, 0x4c, 0x3d, 0x64, 0x44, + 0x64, 0x3d, 0xbe, 0xb4, 0x31, 0xbd, 0x0c, 0x43, 0x09, 0xbd, 0xa4, 0x6d, 0x8d, + 0xbd, 0xd0, 0xbf, 0x4a, 0x3d, 0x09, 0x76, 0x90, 0xbd, 0x29, 0x9c, 0x0b, 0x3d, + 0x7c, 0x61, 0x74, 0xbd, 0xb9, 0x1c, 0x1c, 0xbd, 0x09, 0x6d, 0xad, 0x3b, 0x3e, + 0xb4, 0x93, 0xbc, 0x1f, 0x5a, 0xa4, 0x3c, 0xe2, 0x7a, 0x89, 0xbd, 0x1c, 0x1d, + 0x49, 0x3c, 0x0c, 0xc3, 0x06, 0xbd, 0xf9, 0xe2, 0xd6, 0x3c, 0x1a, 0x44, 0x57, + 0xbd, 0x7a, 0xac, 0x50, 0x3d, 0x39, 0xe4, 0xc4, 0x3c, 0xfb, 0x1e, 0x04, 0x3d, + 0x8a, 0xf6, 0x53, 0xbd, 0xfc, 0xac, 0x62, 0xbc, 0x44, 0xcc, 0x20, 0x3d, 0xf6, + 0x5e, 0xa0, 0x3c, 0x88, 0x20, 0xcd, 0xba, 0x6b, 0xc7, 0x1c, 0xbd, 0x66, 0xd2, + 0x16, 0xbb, 0x8b, 0x02, 0x58, 0xbd, 0x17, 0x15, 0x83, 0x3d, 0xef, 0x6a, 0x84, + 0x3d, 0x00, 0x91, 0xd1, 0xba, 0x9a, 0xa6, 0x83, 0x3d, 0x6e, 0x12, 0x9c, 0xbd, + 0x4c, 0x00, 0x46, 0x3d, 0x08, 0x8e, 0xcf, 0x3b, 0x53, 0x98, 0xb9, 0xbc, 0x5c, + 0x33, 0x43, 0x3d, 0x05, 0x7b, 0x03, 0xbd, 0x82, 0x26, 0x35, 0xbd, 0xbf, 0x76, + 0x75, 0xbd, 0x08, 0x78, 0x49, 0xbd, 0xe1, 0x7e, 0x53, 0xbc, 0xf0, 0x64, 0xf2, + 0x3c, 0x56, 0xaf, 0x1a, 0x3d, 0x1c, 0x8f, 0x08, 0x3d, 0x11, 0xac, 0x91, 0xbd, + 0xe8, 0x21, 0x06, 0x3d, 0xf5, 0xbb, 0xdb, 0xbc, 0x0c, 0xc9, 0x81, 0xbd, 0x74, + 0x76, 0x83, 0xbd, 0x5e, 0xf3, 0x40, 0xbd, 0xd6, 0xbb, 0x98, 0x3d, 0x4b, 0x9a, + 0x93, 0x3c, 0x25, 0x64, 0x9d, 0xbd, 0xf4, 0xf4, 0x9e, 0xbc, 0x66, 0xbe, 0x2b, + 0xbb, 0xad, 0xa4, 0x82, 0x3c, 0x76, 0x08, 0x5d, 0xbd, 0x2c, 0xf4, 0x2f, 0xbd, + 0xb3, 0x5e, 0x84, 0x3d, 0x62, 0xad, 0x06, 0x3d, 0x6a, 0xe5, 0xea, 0xbc, 0xd8, + 0x06, 0x23, 0x3d, 0x85, 0x25, 0xeb, 0xbc, 0xa9, 0x01, 0xab, 0xbb, 0x28, 0xe4, + 0xf3, 0x3c, 0x9f, 0x9e, 0x8e, 0xbd, 0x3f, 0xe2, 0x2c, 0xbc, 0xe0, 0xfd, 0xc1, + 0x3c, 0x84, 0x67, 0xa7, 0xbb, 0xc5, 0x1d, 0xfc, 0xbc, 0xee, 0x05, 0x6b, 0xbd, + 0x9a, 0x29, 0xc9, 0xbc, 0x35, 0x9c, 0x0f, 0x3d, 0xff, 0xd3, 0x1c, 0xbd, 0x60, + 0x5c, 0x3d, 0xbd, 0x85, 0xf0, 0x81, 0x3d, 0xe6, 0x58, 0x0f, 0xbc, 0xda, 0x46, + 0x01, 0xbd, 0xe4, 0xae, 0x88, 0xbd, 0xe2, 0x4a, 0x47, 0xbd, 0x51, 0xf0, 0x7e, + 0xbd, 0x18, 0xc7, 0x82, 0x3d, 0x85, 0xf7, 0x26, 0x3d, 0x7f, 0xe0, 0xc0, 0xbc, + 0x28, 0xa7, 0x56, 0x3b, 0x86, 0xe9, 0x17, 0xbb, 0x75, 0xc7, 0x81, 0x3d, 0x0c, + 0x95, 0x19, 0xbc, 0x27, 0x0d, 0x62, 0xbd, 0xae, 0x2f, 0x14, 0x3b, 0xcf, 0x26, + 0x47, 0xbd, 0x75, 0xe8, 0x26, 0x3d, 0x99, 0x94, 0x48, 0x3d, 0xac, 0xe6, 0x3f, + 0x3d, 0x50, 0xa8, 0xee, 0x3c, 0x25, 0x3e, 0xef, 0xbc, 0x98, 0xfe, 0x37, 0xbc, + 0x05, 0x4b, 0x28, 0x3d, 0xa5, 0x42, 0xfc, 0x3c, 0x40, 0xda, 0x68, 0x3d, 0xf7, + 0x91, 0x35, 0x3d, 0xae, 0xa1, 0x1a, 0x3d, 0xeb, 0xc7, 0x1b, 0xbd, 0x98, 0x7d, + 0xb1, 0x3c, 0xf7, 0xe7, 0x0b, 0xbd, 0x72, 0x31, 0x47, 0x3d, 0x47, 0xeb, 0x85, + 0xbd, 0x4f, 0x71, 0x1f, 0xbc, 0xae, 0x19, 0x1b, 0xbd, 0x30, 0xc5, 0xd7, 0xbb, + 0x94, 0xbe, 0x05, 0x3d, 0x39, 0x66, 0x94, 0x3c, 0x68, 0xab, 0x65, 0xbc, 0x4a, + 0x43, 0xd3, 0xbc, 0x66, 0x6e, 0x22, 0x3d, 0x2c, 0xb6, 0x45, 0x3d, 0xec, 0xf0, + 0x09, 0xbd, 0x15, 0x84, 0xd6, 0x3c, 0x67, 0xb6, 0x5e, 0xbd, 0x48, 0xb9, 0x1b, + 0x3d, 0xef, 0x6b, 0x36, 0x3d, 0xfa, 0x9f, 0x60, 0x3c, 0xfb, 0x49, 0x8c, 0x3d, + 0x50, 0x0b, 0xfd, 0x3c, 0x43, 0x24, 0xf5, 0x3c, 0x48, 0xf5, 0x1c, 0x3d, 0x24, + 0xed, 0x55, 0xbd, 0x12, 0x2a, 0x33, 0xbd, 0x6f, 0x59, 0x3b, 0xbb, 0xeb, 0x66, + 
0xe0, 0xbc, 0x7b, 0x67, 0x60, 0xbb, 0x19, 0x8c, 0x85, 0x3c, 0x72, 0x71, 0x22, + 0x3b, 0x7f, 0xa1, 0x22, 0xbd, 0x9e, 0xcd, 0x04, 0x3d, 0x00, 0xf6, 0xff, 0xb9, + 0xdf, 0x8b, 0x16, 0xbd, 0xc1, 0x0c, 0xfd, 0x3c, 0x9b, 0xf9, 0x5b, 0xbd, 0x71, + 0x73, 0x8c, 0x3d, 0x0f, 0x55, 0x63, 0x3d, 0x20, 0xbf, 0xb9, 0x3c, 0xa3, 0xc5, + 0x85, 0x3d, 0xfd, 0x98, 0x2e, 0xbd, 0xb4, 0x02, 0x2e, 0xbc, 0xe2, 0x12, 0x46, + 0xbc, 0x90, 0x41, 0x6f, 0xbd, 0x0d, 0xc7, 0x68, 0x3d, 0x4e, 0x58, 0x4f, 0x3c, + 0xc0, 0xeb, 0x1d, 0xbb, 0x3d, 0xcb, 0x9f, 0xbd, 0x29, 0x0c, 0x7f, 0x3d, 0x8a, + 0x62, 0x4d, 0xbc, 0x01, 0x3c, 0x7b, 0x3d, 0x3c, 0x41, 0xb8, 0x3c, 0xa9, 0x70, + 0x53, 0x3d, 0x32, 0x94, 0xab, 0x3d, 0xdc, 0x75, 0x4c, 0x3d, 0xab, 0x5d, 0xd6, + 0xbc, 0xae, 0x74, 0x0a, 0xbd, 0x7f, 0xf5, 0xec, 0x3c, 0xff, 0x6e, 0x4c, 0xbd, + 0x0c, 0x65, 0x16, 0xbc, 0x4f, 0x2a, 0x58, 0x3c, 0xe2, 0x17, 0xa0, 0x3d, 0x6a, + 0x10, 0x83, 0xbc, 0xfc, 0x40, 0xc0, 0x3d, 0xbc, 0xa0, 0xad, 0xbc, 0xde, 0xdc, + 0x98, 0x3d, 0xaf, 0x54, 0x84, 0xbb, 0x64, 0xcd, 0xdf, 0x3c, 0xab, 0x93, 0x2c, + 0xbc, 0x44, 0x5c, 0x29, 0x3c, 0xac, 0x7f, 0x27, 0x3d, 0xb2, 0x34, 0xee, 0x3c, + 0x66, 0xf2, 0xd9, 0x3c, 0x4d, 0xaf, 0x86, 0x3d, 0xee, 0x79, 0x10, 0xbd, 0xa2, + 0x84, 0x31, 0xbd, 0xe2, 0xf9, 0x43, 0x3d, 0x26, 0x87, 0xf1, 0x3b, 0xf0, 0x3a, + 0x8f, 0xbd, 0x3e, 0x23, 0x5d, 0xbd, 0x75, 0x0a, 0x7c, 0x3d, 0x15, 0xe4, 0x5a, + 0xbd, 0x45, 0xb3, 0xb2, 0x3c, 0xe3, 0xc4, 0x36, 0x3d, 0x7d, 0x89, 0x9f, 0x3c, + 0x9e, 0x54, 0xaa, 0xbb, 0x89, 0x2e, 0x88, 0xbd, 0xad, 0xe0, 0x89, 0xbc, 0x69, + 0xe9, 0x66, 0xbd, 0x94, 0xa9, 0xf4, 0xbc, 0xb3, 0xde, 0x21, 0xbd, 0x0b, 0x5a, + 0x82, 0xbd, 0x55, 0x78, 0x00, 0x3d, 0x1f, 0x1d, 0xa2, 0xbd, 0x5c, 0xe4, 0x4b, + 0xbd, 0x63, 0x9e, 0xa6, 0xbd, 0x44, 0xdb, 0x75, 0xbd, 0x6a, 0xe7, 0xf3, 0xbc, + 0xdc, 0xa5, 0x2c, 0xbd, 0xc7, 0xcd, 0x8d, 0x3c, 0xd4, 0x97, 0x85, 0x3c, 0xc5, + 0x19, 0x4a, 0xbc, 0x48, 0x7d, 0x09, 0xbc, 0xd6, 0x74, 0x2c, 0xbd, 0x94, 0xb6, + 0xf9, 0x3c, 0xfd, 0x54, 0x8d, 0x3d, 0xdf, 0x85, 0x57, 0x3d, 0x82, 0x58, 0x67, + 0x3d, 0x67, 0x4a, 0xe8, 0xba, 0xec, 0xb0, 0xe9, 0x3c, 0x9a, 0xf0, 0x1f, 0x3d, + 0x80, 0xbc, 0x7e, 0xbd, 0x15, 0xe3, 0x16, 0x3d, 0x49, 0xb7, 0x33, 0xbc, 0x03, + 0xbe, 0x65, 0xbd, 0x6c, 0x41, 0x8b, 0x3d, 0x93, 0x68, 0x85, 0xbc, 0x50, 0x1a, + 0x50, 0xbd, 0x10, 0xbe, 0x7f, 0xbc, 0x15, 0x0c, 0x58, 0xbc, 0x48, 0xe9, 0x92, + 0xbd, 0x48, 0x67, 0x3e, 0xbc, 0x38, 0x60, 0x66, 0xbd, 0x76, 0xac, 0x9e, 0xbd, + 0x4d, 0xc9, 0x61, 0x3d, 0x0b, 0xa6, 0x9f, 0xbd, 0x8f, 0x08, 0xcb, 0x3c, 0x60, + 0x17, 0x35, 0x3d, 0x60, 0x75, 0x7a, 0x3c, 0x24, 0x97, 0x48, 0x3a, 0x64, 0x78, + 0x90, 0xbc, 0xf3, 0x93, 0xb8, 0xbb, 0x46, 0x84, 0x69, 0xbd, 0xd6, 0x71, 0x43, + 0x3d, 0xb4, 0x2b, 0x62, 0xbc, 0x47, 0x6b, 0x08, 0x3c, 0x0e, 0x23, 0xeb, 0xbc, + 0xf4, 0xc8, 0xb0, 0xbc, 0x3f, 0x17, 0xbe, 0xbc, 0x11, 0xc5, 0x99, 0x3d, 0x50, + 0x81, 0x15, 0x3d, 0x8e, 0xd8, 0x7d, 0x3d, 0xfd, 0x07, 0x8d, 0xbb, 0x7a, 0x46, + 0xea, 0x3c, 0x7d, 0xc9, 0x2c, 0x3d, 0x1e, 0x27, 0x2f, 0x3d, 0x67, 0x04, 0x05, + 0xbc, 0x8f, 0x0a, 0x71, 0xbc, 0x44, 0xcb, 0x78, 0xbc, 0x3b, 0x8e, 0x17, 0x3d, + 0x8c, 0x61, 0xf6, 0x3c, 0xdf, 0x7a, 0x54, 0x3d, 0x93, 0xe6, 0xaa, 0xbc, 0xef, + 0x19, 0xd2, 0xbc, 0xb8, 0xec, 0x13, 0x3d, 0xed, 0x16, 0x39, 0x3d, 0x7c, 0xb2, + 0xdc, 0x3c, 0x03, 0xf9, 0x84, 0xb9, 0xe7, 0xbd, 0x70, 0xbc, 0xea, 0x33, 0x77, + 0x3d, 0xa8, 0xd3, 0x55, 0x3c, 0x3b, 0x55, 0x04, 0x3c, 0x72, 0x75, 0x67, 0xbc, + 0xde, 0x63, 0x4b, 0xbc, 0x73, 0xc5, 0x01, 0xbd, 0x2e, 0x1b, 0x01, 0x3c, 0xb2, + 0xeb, 0x57, 0x3d, 0x81, 0xaa, 0x2d, 0xbd, 0x68, 0x5f, 0x1c, 0xbd, 0x0e, 0x36, + 0x77, 0x3d, 0xd9, 0xb5, 0x27, 
0x3c, 0x99, 0x74, 0x27, 0x3d, 0xae, 0x86, 0x74, + 0xbd, 0x57, 0x12, 0x0e, 0xbd, 0x37, 0x30, 0x2a, 0x3d, 0x5e, 0xf5, 0x3b, 0x3d, + 0x37, 0x81, 0x6f, 0x3d, 0xd3, 0xe7, 0x4b, 0xbd, 0x4a, 0x7f, 0x85, 0x3d, 0xce, + 0x31, 0x21, 0x3d, 0xda, 0xf8, 0x86, 0xbc, 0x5e, 0x6d, 0x1f, 0x3c, 0x80, 0x1b, + 0x06, 0x3b, 0xd7, 0x82, 0x5f, 0x3d, 0x74, 0xc0, 0x26, 0xbd, 0x1d, 0x0e, 0x8d, + 0xbc, 0x00, 0xfe, 0x06, 0x3d, 0x5f, 0x91, 0x79, 0xbd, 0x53, 0x7a, 0xee, 0xbc, + 0x64, 0x03, 0x41, 0x3d, 0x66, 0xa9, 0xfa, 0xba, 0x67, 0x37, 0x40, 0xbd, 0xd8, + 0x7f, 0x23, 0xbd, 0x1a, 0x9f, 0x03, 0xbc, 0x93, 0x26, 0x03, 0xbd, 0xeb, 0xf7, + 0x58, 0xbc, 0x04, 0xe4, 0xdc, 0xb9, 0xb6, 0xbb, 0x9b, 0x3b, 0x9e, 0x4b, 0x14, + 0x3d, 0x5a, 0x9a, 0xd4, 0xba, 0x59, 0xcd, 0x21, 0xbd, 0x00, 0xc3, 0x85, 0x3c, + 0xec, 0xbf, 0xf2, 0xbc, 0x0e, 0x59, 0x3a, 0xbd, 0xa7, 0x8f, 0x81, 0x3d, 0x11, + 0x2d, 0x63, 0xbd, 0x55, 0x42, 0xe8, 0xbc, 0x6b, 0x6e, 0x8c, 0x3c, 0xa3, 0x84, + 0x1d, 0xbd, 0x8c, 0xda, 0x4f, 0x3c, 0xb2, 0x36, 0xd1, 0x3c, 0x4f, 0x27, 0x71, + 0x3d, 0xf8, 0x32, 0x8c, 0x3c, 0x5c, 0xe8, 0x69, 0xbc, 0x42, 0xcb, 0x24, 0x3d, + 0x8f, 0xd8, 0x6b, 0xbd, 0x87, 0xd2, 0x9c, 0xbd, 0xc5, 0x3f, 0xb5, 0x3c, 0x08, + 0xfc, 0xf9, 0x3c, 0x5b, 0x21, 0x7e, 0x3d, 0xef, 0x06, 0x65, 0xbc, 0xda, 0x92, + 0x02, 0x3c, 0xb1, 0xf0, 0x99, 0xbc, 0x2e, 0x72, 0xe7, 0xbc, 0x32, 0x44, 0x6a, + 0xbd, 0xdd, 0xbb, 0x20, 0x3b, 0xa1, 0xbf, 0xa3, 0x3c, 0xd2, 0x4f, 0x9b, 0x3c, + 0xf8, 0x55, 0xbe, 0x3c, 0x35, 0xe3, 0x0a, 0x3d, 0xf0, 0x8a, 0x89, 0xbc, 0xd7, + 0xd7, 0x6f, 0x3d, 0x96, 0xd9, 0x70, 0xbd, 0x00, 0x50, 0x20, 0x39, 0x1f, 0xa7, + 0x17, 0x3d, 0x4f, 0x4f, 0xc3, 0xbb, 0xf6, 0x99, 0x40, 0xbd, 0x87, 0xd4, 0x2a, + 0xbd, 0x09, 0x54, 0x06, 0x3d, 0x87, 0x46, 0xf4, 0xbb, 0x9c, 0x12, 0x12, 0x3c, + 0x2f, 0xc9, 0xd1, 0x3c, 0x4c, 0x47, 0x4e, 0x3d, 0xf9, 0x77, 0x64, 0xbd, 0xd1, + 0xa5, 0x17, 0xbd, 0xf3, 0x5b, 0xdb, 0x3c, 0x98, 0x30, 0x55, 0x3d, 0x3f, 0x3d, + 0x37, 0xbd, 0x54, 0x12, 0xed, 0xbc, 0x30, 0x26, 0x1d, 0x3d, 0x72, 0x80, 0x8a, + 0x3d, 0xf1, 0xd7, 0x4c, 0xbd, 0xa9, 0xc7, 0x83, 0x3d, 0x86, 0xba, 0x93, 0xbd, + 0x6b, 0x0a, 0x90, 0xbd, 0x96, 0x8c, 0x64, 0xbd, 0x40, 0x70, 0xf1, 0x3a, 0xc0, + 0x39, 0x79, 0x3d, 0x27, 0xda, 0x24, 0xbc, 0x36, 0x2e, 0x3c, 0x3d, 0xb0, 0xbe, + 0x90, 0xbd, 0x20, 0x68, 0x14, 0xbc, 0x00, 0xa4, 0x3e, 0xbc, 0x85, 0xb9, 0x44, + 0xbd, 0xa2, 0x06, 0x52, 0xbd, 0x6e, 0xae, 0x4a, 0xbd, 0xbe, 0x73, 0x6c, 0xbd, + 0x49, 0xee, 0x3e, 0xbd, 0x36, 0x8a, 0xe0, 0x3c, 0x7f, 0x94, 0x8a, 0xbd, 0x19, + 0x1d, 0x11, 0xbd, 0x15, 0x3e, 0x55, 0xbd, 0x4b, 0xcd, 0x7b, 0x3d, 0x63, 0xd7, + 0x9f, 0xba, 0x83, 0xcb, 0x37, 0xbd, 0xa4, 0x4f, 0x21, 0xbd, 0xa5, 0xaf, 0xec, + 0xbc, 0xcd, 0x46, 0xae, 0xbd, 0xe8, 0x66, 0x9d, 0x3c, 0x7c, 0x84, 0xa6, 0xbc, + 0x85, 0xcc, 0x7f, 0x3d, 0xa5, 0x28, 0xa6, 0xbd, 0x2f, 0x3a, 0x55, 0xbc, 0xb4, + 0x8b, 0xc8, 0xbc, 0xd3, 0x90, 0x5e, 0x3d, 0x49, 0x79, 0x81, 0xbd, 0x50, 0xc3, + 0x79, 0xbc, 0x90, 0x04, 0x9b, 0xbd, 0x1e, 0xdb, 0x73, 0x3d, 0x97, 0x15, 0x7e, + 0x3c, 0x5f, 0xf6, 0x83, 0x3d, 0x1d, 0x20, 0x32, 0x3c, 0xda, 0x32, 0x7a, 0xbd, + 0x8f, 0xa0, 0x69, 0x3c, 0x20, 0xe0, 0x87, 0xbd, 0x08, 0xb7, 0x2f, 0x3d, 0x5e, + 0x6c, 0x26, 0xbd, 0xba, 0xa8, 0xbe, 0xbc, 0xb3, 0x9b, 0xb7, 0xbc, 0xc1, 0x3e, + 0x8e, 0x3d, 0x45, 0x90, 0x3f, 0xbd, 0x82, 0xee, 0x0c, 0x3d, 0x62, 0xe1, 0x38, + 0xbc, 0x30, 0x95, 0x8b, 0x3c, 0xc6, 0x6b, 0x58, 0x3d, 0x7c, 0xca, 0x06, 0xbd, + 0x03, 0xa3, 0x7b, 0x3d, 0x77, 0xef, 0x83, 0x3c, 0x24, 0xc7, 0x69, 0x3d, 0xf6, + 0xed, 0x35, 0xbd, 0xaa, 0x2d, 0x33, 0x3d, 0x71, 0x69, 0x72, 0x3c, 0xed, 0x0d, + 0x80, 0x3c, 0x02, 0x0d, 0x47, 0x3d, 0x30, 0x51, 0x86, 0xbc, 
0x0a, 0xad, 0x8d, + 0xbc, 0x80, 0xab, 0x1c, 0x3d, 0x68, 0x17, 0x3d, 0x3d, 0x47, 0x3c, 0x36, 0xbd, + 0x32, 0x58, 0xfb, 0x3c, 0x27, 0x47, 0x82, 0x3d, 0xb8, 0x9c, 0x92, 0xbc, 0xab, + 0xa8, 0xaf, 0xbb, 0x97, 0xb4, 0x7b, 0x3d, 0xdb, 0x16, 0xad, 0xbc, 0xa8, 0x50, + 0x8b, 0xbd, 0x50, 0x91, 0x4d, 0x3c, 0xe1, 0x69, 0x73, 0x3c, 0x62, 0x4f, 0x30, + 0xbd, 0x00, 0x70, 0x6a, 0x3c, 0x57, 0xbb, 0x8f, 0x3d, 0xe6, 0x60, 0x44, 0xbd, + 0x33, 0x5a, 0xc2, 0xbc, 0xe6, 0xae, 0x82, 0xbd, 0x1e, 0xad, 0x6e, 0xbd, 0xc9, + 0x43, 0x30, 0x3d, 0x30, 0x4a, 0x65, 0x3c, 0x79, 0x1d, 0xc7, 0x3c, 0x97, 0xab, + 0x1e, 0x3b, 0x95, 0x60, 0xd7, 0xbc, 0xcc, 0xed, 0xa1, 0xbc, 0xa3, 0x6d, 0x6b, + 0xbd, 0xd8, 0xc4, 0x30, 0x3c, 0xcf, 0x3e, 0x8b, 0xbc, 0x82, 0xd9, 0x0d, 0xbc, + 0x6b, 0x1f, 0xdb, 0xbc, 0xb7, 0x65, 0x76, 0xbd, 0x19, 0x3a, 0xfb, 0x3c, 0xe8, + 0x08, 0x08, 0xbd, 0x0b, 0xdb, 0x00, 0xbd, 0x4c, 0x51, 0x19, 0xbd, 0x2e, 0x6c, + 0x37, 0x3d, 0xc0, 0xdf, 0x1e, 0x3b, 0x64, 0x10, 0x49, 0x3d, 0x77, 0x9b, 0xca, + 0xbc, 0xca, 0x17, 0xfb, 0xbc, 0xe6, 0xa4, 0x92, 0x3d, 0xfd, 0x90, 0x77, 0x3d, + 0x82, 0x5e, 0x6b, 0x3d, 0xe5, 0x15, 0x3c, 0x3d, 0xc3, 0x45, 0xf9, 0xbb, 0x0c, + 0x61, 0x88, 0xbd, 0x26, 0xa1, 0x68, 0xbd, 0x67, 0x2c, 0x1e, 0xbd, 0x2b, 0xfe, + 0x3e, 0xbd, 0xb9, 0x45, 0x0b, 0xbd, 0x8e, 0x79, 0x09, 0xbd, 0x16, 0xdf, 0x45, + 0xbd, 0x52, 0xbb, 0x24, 0xbc, 0x84, 0x55, 0x78, 0xbd, 0xb7, 0x6d, 0x55, 0x3d, + 0xb8, 0xe4, 0x8a, 0x3d, 0xcc, 0x8e, 0x2d, 0xbd, 0xf8, 0x0a, 0x13, 0x3c, 0xda, + 0x22, 0x23, 0x3d, 0xee, 0x07, 0x1e, 0x3d, 0xee, 0x5c, 0x38, 0xbd, 0x1b, 0xfa, + 0xc1, 0xbc, 0x62, 0x88, 0x82, 0xbc, 0x9e, 0x6c, 0x39, 0xbd, 0xe8, 0xc8, 0x90, + 0xbd, 0xb2, 0xaf, 0x0e, 0xbd, 0x87, 0xc1, 0x61, 0xbc, 0x91, 0xcf, 0x21, 0x3b, + 0xaa, 0x52, 0x88, 0xbd, 0x2b, 0xcb, 0x8e, 0xbd, 0x42, 0x58, 0xb0, 0x3c, 0x72, + 0x3e, 0x9a, 0x3c, 0x1e, 0x92, 0x09, 0x3d, 0xc6, 0x67, 0x9a, 0xbd, 0xa0, 0xb0, + 0x29, 0x3b, 0x51, 0x6e, 0x0c, 0xbd, 0x88, 0x0d, 0x4d, 0xbd, 0x1c, 0xc3, 0xee, + 0x3c, 0x43, 0xfc, 0x61, 0x3d, 0x74, 0x13, 0x84, 0x3c, 0x10, 0xbc, 0xd4, 0x3c, + 0x8a, 0x20, 0x9d, 0x39, 0x0a, 0x33, 0xdd, 0x3b, 0xee, 0x75, 0x96, 0xbd, 0x77, + 0x4f, 0xa2, 0x3c, 0x1a, 0x55, 0xe4, 0xbc, 0x17, 0x4b, 0x5c, 0xbc, 0xe8, 0x22, + 0x5a, 0xbd, 0xcf, 0xa8, 0x46, 0x3c, 0x2e, 0x1d, 0x2c, 0xbd, 0x7c, 0x53, 0x62, + 0xbc, 0x4e, 0xdc, 0x25, 0x3d, 0x3c, 0x94, 0x4e, 0xbd, 0xba, 0x9a, 0x3b, 0xbd, + 0x32, 0x01, 0x02, 0x3d, 0x57, 0xd2, 0x80, 0x3d, 0x88, 0x7d, 0xb4, 0xbc, 0x81, + 0xbf, 0x7f, 0xbd, 0xf7, 0xbb, 0x89, 0x3d, 0xa0, 0xba, 0x30, 0x3d, 0x13, 0xd5, + 0x91, 0x3d, 0xc7, 0x59, 0x37, 0x3d, 0x3c, 0xc1, 0x95, 0xbd, 0x41, 0x62, 0x94, + 0xbc, 0x09, 0x66, 0x25, 0xbc, 0x4a, 0x10, 0x84, 0xbd, 0xf0, 0x61, 0x09, 0x3d, + 0x7c, 0xba, 0x6d, 0x3d, 0x43, 0x44, 0x60, 0x3d, 0xbc, 0x42, 0x2d, 0x3d, 0x09, + 0x6d, 0x2d, 0x3d, 0x3b, 0x61, 0xb1, 0x3c, 0xd7, 0xb2, 0x36, 0xbc, 0x10, 0xe9, + 0x06, 0xbd, 0xd4, 0x30, 0x64, 0x3d, 0x4e, 0xb2, 0x8d, 0xbc, 0x54, 0x0d, 0x24, + 0xbd, 0xb6, 0x13, 0xe8, 0x3c, 0xe1, 0xd2, 0xd3, 0x3c, 0xd2, 0xc8, 0x99, 0xbc, + 0x5c, 0x05, 0x75, 0x3d, 0x58, 0x19, 0x91, 0x3d, 0x66, 0x5b, 0x03, 0xbd, 0xf4, + 0x88, 0xbd, 0xbc, 0xff, 0x51, 0x93, 0xbc, 0xaa, 0xc8, 0x3e, 0x3d, 0x57, 0x16, + 0xbc, 0xba, 0xf4, 0xe1, 0xa0, 0xbd, 0x3a, 0x82, 0x94, 0xbd, 0x77, 0xfa, 0x86, + 0xbd, 0xa6, 0xfd, 0x84, 0xbb, 0x91, 0x28, 0xeb, 0xbb, 0x86, 0xfd, 0xca, 0xbc, + 0x7f, 0xd4, 0x10, 0xbc, 0xea, 0x09, 0x08, 0xbd, 0xbe, 0x9e, 0x23, 0xbc, 0x5a, + 0x6a, 0x4f, 0xbd, 0x00, 0xf1, 0x54, 0x3d, 0xf4, 0x72, 0xb8, 0xbc, 0x0a, 0xde, + 0x0f, 0x3d, 0x27, 0x61, 0x1b, 0x3d, 0xed, 0xb6, 0x49, 0xbd, 0x11, 0x6d, 0xfb, + 0x3c, 0x51, 
0x41, 0x75, 0x3d, 0x0b, 0x3b, 0x68, 0x3d, 0x1e, 0xb2, 0x6c, 0xbd, + 0xd0, 0x5a, 0xfe, 0x3c, 0x3d, 0xa0, 0x30, 0xbd, 0xc8, 0xf9, 0x89, 0x3c, 0x10, + 0x06, 0x72, 0x3d, 0xed, 0x61, 0xe1, 0x3a, 0x35, 0x65, 0x7e, 0x3d, 0x16, 0x6c, + 0x4d, 0x3d, 0x8a, 0xf6, 0x5a, 0x3d, 0x3e, 0x18, 0x64, 0x3d, 0x36, 0x9a, 0xbe, + 0x3c, 0x14, 0xa7, 0xba, 0xbc, 0x93, 0x98, 0xe3, 0x3c, 0x14, 0x13, 0x30, 0x3d, + 0xa8, 0x9a, 0x71, 0xbc, 0xd0, 0x9e, 0xfd, 0xbc, 0x10, 0x8b, 0xa7, 0xbd, 0xb9, + 0x47, 0x2f, 0x3d, 0x44, 0xff, 0x9c, 0xbd, 0x5b, 0x84, 0x3e, 0xbd, 0xc6, 0xa4, + 0xaa, 0x3c, 0x5b, 0xa9, 0x0e, 0xbd, 0x6b, 0xa6, 0x33, 0x3d, 0x65, 0x26, 0x46, + 0x3d, 0x8e, 0x5d, 0xdc, 0xbc, 0x62, 0xcf, 0x43, 0xbd, 0xfd, 0x0e, 0x86, 0x3d, + 0x52, 0xd5, 0xf3, 0x3c, 0x10, 0x00, 0x50, 0xbc, 0x55, 0xec, 0x6c, 0xbd, 0x9b, + 0x21, 0x46, 0x3d, 0xb3, 0xe4, 0x80, 0xbc, 0xa1, 0xf7, 0x84, 0xbd, 0x64, 0x01, + 0x4e, 0xbd, 0x01, 0xfb, 0x3e, 0xbc, 0x28, 0xfc, 0xac, 0xbc, 0x84, 0xf6, 0x17, + 0x3c, 0x69, 0x7c, 0xd9, 0xbc, 0x30, 0xb8, 0xfe, 0xbc, 0x0e, 0x3a, 0x87, 0xbd, + 0x88, 0xad, 0x93, 0xbd, 0xe1, 0x85, 0x8d, 0xbd, 0x42, 0x8c, 0x12, 0x3d, 0x41, + 0x59, 0x84, 0xbd, 0x1c, 0x0e, 0x70, 0xbb, 0xb0, 0x9e, 0xd3, 0xbc, 0x3c, 0x03, + 0xdb, 0xbb, 0xf4, 0x19, 0x01, 0x3d, 0x6f, 0x20, 0xc6, 0x3c, 0x77, 0xc0, 0xb4, + 0x3c, 0x4a, 0xa0, 0xa7, 0x3c, 0x1c, 0xaa, 0x2a, 0xbd, 0x49, 0x9b, 0x60, 0xbd, + 0x30, 0xff, 0xf9, 0xbc, 0x2f, 0x70, 0xc9, 0xbb, 0x72, 0x4b, 0x8f, 0xbd, 0x47, + 0xc6, 0x34, 0x3d, 0x18, 0x49, 0x21, 0x3c, 0x04, 0x19, 0x30, 0x3d, 0x74, 0xbe, + 0x7b, 0xbb, 0xbc, 0x92, 0x43, 0xbc, 0x6f, 0xb6, 0xdf, 0xbc, 0x20, 0xdb, 0x90, + 0x3c, 0x45, 0x29, 0x95, 0xbc, 0x4c, 0x9c, 0xa6, 0x3c, 0x2b, 0xbf, 0xe4, 0xbc, + 0xa9, 0x41, 0xff, 0xbc, 0x62, 0x15, 0xd4, 0x3c, 0x29, 0x60, 0x8e, 0xbd, 0x8d, + 0xce, 0x56, 0xbc, 0x84, 0x09, 0x41, 0x3d, 0x16, 0xb8, 0x35, 0x3d, 0x03, 0x5c, + 0x09, 0xbd, 0x82, 0xfe, 0x64, 0x3d, 0x16, 0x2e, 0x6d, 0xbd, 0xbf, 0x4b, 0x05, + 0xbd, 0x15, 0x9a, 0x28, 0xbd, 0x1d, 0x3d, 0x4f, 0xbd, 0x7c, 0x8a, 0x99, 0x3b, + 0xf9, 0x8c, 0x35, 0xbd, 0xef, 0xc2, 0x2a, 0xbd, 0xe6, 0xea, 0x85, 0xbc, 0xfd, + 0xf1, 0xde, 0x3b, 0xce, 0xb3, 0x5f, 0x3d, 0x2f, 0x4a, 0x30, 0xbc, 0xc5, 0xa1, + 0x09, 0xbd, 0x63, 0x5f, 0x5e, 0xbd, 0x44, 0xc9, 0xc2, 0xbc, 0xb6, 0x2a, 0xf8, + 0xbc, 0x58, 0x39, 0x34, 0x3d, 0x49, 0xbe, 0x5c, 0xbd, 0x45, 0xad, 0x1d, 0x3c, + 0x3f, 0x9f, 0x19, 0xbd, 0xfb, 0xef, 0x2e, 0x3c, 0xd5, 0xe8, 0x88, 0x3c, 0x13, + 0x36, 0x5c, 0xbd, 0x04, 0xeb, 0x78, 0x3c, 0x6e, 0x39, 0x64, 0x3d, 0xdc, 0x1e, + 0x70, 0x3d, 0x79, 0x43, 0x4d, 0x3d, 0xfd, 0x0f, 0x30, 0xbd, 0xd2, 0x88, 0x18, + 0x3d, 0x87, 0x62, 0xcc, 0x3c, 0x00, 0x39, 0x30, 0x3d, 0xba, 0xa0, 0xfa, 0xbc, + 0x00, 0x3d, 0x41, 0x3d, 0xed, 0xfa, 0x73, 0xbd, 0x0c, 0x09, 0x54, 0xbd, 0x77, + 0x2f, 0x5f, 0xbd, 0x01, 0x38, 0x7f, 0xbd, 0x98, 0x08, 0xee, 0xbc, 0x53, 0x34, + 0x48, 0xbc, 0x8a, 0x25, 0x72, 0xbc, 0xf3, 0x71, 0x70, 0xbd, 0x44, 0xdf, 0x1b, + 0x3d, 0xd8, 0x6e, 0x6f, 0xbd, 0xdf, 0x4d, 0x23, 0x3c, 0x9c, 0xfb, 0x21, 0x3d, + 0x72, 0xe1, 0xa4, 0xbc, 0x74, 0xc3, 0x2e, 0xbd, 0x63, 0x0c, 0x8a, 0xbc, 0x24, + 0x09, 0x6e, 0xbd, 0xbb, 0x68, 0x68, 0xbd, 0x7d, 0xd7, 0x6c, 0x3d, 0xd8, 0x63, + 0x63, 0x3c, 0x1a, 0x16, 0xdb, 0xbb, 0x86, 0x5e, 0x40, 0xbd, 0x50, 0x6d, 0x31, + 0xbb, 0xdd, 0xb6, 0x96, 0xbd, 0x19, 0x27, 0x56, 0xbd, 0xf3, 0xd5, 0x11, 0x3d, + 0x91, 0x8e, 0x68, 0x3d, 0xea, 0xed, 0x86, 0xbd, 0xd6, 0x51, 0x87, 0xbc, 0xfb, + 0x6c, 0x76, 0xbd, 0x50, 0x6f, 0x38, 0x3d, 0x9b, 0xa5, 0x71, 0xbd, 0x9b, 0x1f, + 0x16, 0xbd, 0x25, 0xee, 0x93, 0x3d, 0xa9, 0x05, 0xca, 0xbc, 0x9f, 0xee, 0x36, + 0xbd, 0x5c, 0x03, 0x28, 0x3d, 0x52, 0x3b, 
0xb1, 0x3c, 0xe3, 0x45, 0x13, 0x3d, + 0x38, 0xec, 0x82, 0xbd, 0xba, 0xc6, 0x5f, 0x3d, 0x18, 0xf7, 0x59, 0x3d, 0xc4, + 0x2f, 0x89, 0x3c, 0x3c, 0x23, 0xd1, 0xbc, 0x39, 0xa7, 0x28, 0x3d, 0x07, 0x78, + 0x17, 0xbc, 0x72, 0xe3, 0xaf, 0xbc, 0x15, 0x2e, 0x2d, 0x3d, 0x2c, 0x3d, 0xa3, + 0x3c, 0x33, 0x96, 0x18, 0xbd, 0xee, 0x47, 0x30, 0xbd, 0x56, 0xc0, 0x0e, 0xbd, + 0xae, 0x3b, 0x74, 0x3c, 0x79, 0x3e, 0x94, 0x3d, 0xee, 0x19, 0x3d, 0xbd, 0x8d, + 0x14, 0x7a, 0xbd, 0x49, 0xfa, 0x2e, 0x3d, 0x9a, 0x0e, 0x8e, 0xbd, 0x41, 0x87, + 0x45, 0x3c, 0x3b, 0x28, 0x66, 0xbd, 0x3d, 0xbd, 0x20, 0x3d, 0x60, 0x4e, 0x80, + 0xbd, 0x7a, 0x3c, 0x50, 0xbd, 0xaa, 0x0f, 0x9e, 0xbd, 0xa2, 0x81, 0x57, 0xbd, + 0x69, 0xf7, 0x27, 0x3d, 0x62, 0x88, 0x17, 0xbc, 0x47, 0x5d, 0xac, 0x3c, 0xe7, + 0x41, 0x31, 0xbd, 0xde, 0xec, 0x85, 0xbd, 0x74, 0xa1, 0x48, 0xbd, 0x80, 0x0d, + 0x2a, 0xbd, 0x5e, 0x67, 0x7e, 0x3c, 0x35, 0xa5, 0xc6, 0x3c, 0xc4, 0xeb, 0x89, + 0xbc, 0xcb, 0xa7, 0x97, 0x3c, 0x0f, 0xca, 0x68, 0x3c, 0xeb, 0x57, 0xea, 0xbc, + 0x88, 0xf8, 0xb3, 0x3c, 0x44, 0x92, 0xee, 0x3c, 0x89, 0xa1, 0x92, 0x3d, 0x61, + 0xa5, 0x23, 0x3a, 0x1e, 0x6c, 0x28, 0xbd, 0x18, 0x89, 0xa4, 0x3c, 0xd1, 0x26, + 0x47, 0x3b, 0x4a, 0x06, 0x80, 0x3c, 0x3a, 0x5f, 0x58, 0xbd, 0x6e, 0x1d, 0x77, + 0xbd, 0xe1, 0x43, 0x89, 0x3a, 0x41, 0xd0, 0x71, 0xbc, 0x90, 0x43, 0x40, 0xbd, + 0xa5, 0xc3, 0x3a, 0x3c, 0xc2, 0x45, 0xb1, 0xbb, 0xf1, 0x81, 0x32, 0x3d, 0x80, + 0x8e, 0x20, 0x3d, 0x0a, 0xbd, 0x14, 0x3d, 0xbb, 0x93, 0x3e, 0xbd, 0x50, 0x1f, + 0x5b, 0x3d, 0xb7, 0xd1, 0x99, 0xbd, 0xbe, 0x77, 0x4b, 0x3d, 0x5f, 0xd4, 0x58, + 0x3d, 0xdc, 0xab, 0xa4, 0x3c, 0x41, 0x6c, 0x78, 0xbd, 0xbd, 0x11, 0x71, 0x3c, + 0xc9, 0x97, 0x50, 0xbd, 0x93, 0xca, 0xe9, 0x3b, 0xec, 0x1b, 0xb4, 0xbc, 0xcf, + 0xb1, 0x48, 0x3c, 0x26, 0xd1, 0x99, 0x3c, 0x9b, 0xca, 0x26, 0xbd, 0xe0, 0xaf, + 0x2f, 0xbc, 0xef, 0x23, 0x84, 0xbd, 0x10, 0x75, 0xe1, 0x3b, 0xe6, 0x8c, 0x3c, + 0x3d, 0xad, 0x1a, 0x48, 0x3d, 0xfe, 0x04, 0x3f, 0x3d, 0xf2, 0x2f, 0xe0, 0xbc, + 0x98, 0x58, 0xe3, 0xbb, 0xe2, 0x78, 0x84, 0x3d, 0xde, 0x9e, 0x97, 0x3b, 0xe3, + 0x90, 0x35, 0xbd, 0xb9, 0xf5, 0x57, 0x3c, 0x29, 0x97, 0x18, 0x3c, 0xa7, 0xe6, + 0x02, 0x3d, 0x6e, 0xd3, 0x0b, 0x3d, 0x09, 0x9f, 0x51, 0xbd, 0xca, 0x5b, 0xac, + 0x3a, 0x38, 0xd9, 0x55, 0xbd, 0xc0, 0x50, 0x0b, 0x3d, 0x63, 0xe8, 0x69, 0xbd, + 0x96, 0xeb, 0x86, 0xbd, 0x43, 0x18, 0x26, 0x3d, 0x76, 0xab, 0xd8, 0x3a, 0xe3, + 0x0e, 0xb9, 0xbc, 0xed, 0xb2, 0x33, 0x3c, 0x67, 0x1d, 0x7c, 0xbd, 0x13, 0x39, + 0xa8, 0x3b, 0x4b, 0xa3, 0x39, 0xbd, 0x17, 0xb9, 0x44, 0xbd, 0x88, 0x76, 0x43, + 0xbd, 0xdd, 0x31, 0x61, 0xbd, 0x2d, 0x7d, 0xae, 0xbc, 0xe9, 0xb8, 0x05, 0x3d, + 0xdd, 0x80, 0x2a, 0xbd, 0x55, 0x66, 0x08, 0xbd, 0xea, 0x09, 0x8a, 0xbd, 0x13, + 0xd8, 0x0d, 0xbd, 0x7e, 0x9d, 0x5a, 0x3d, 0x08, 0x68, 0x8d, 0x3c, 0x02, 0x87, + 0xdc, 0x3c, 0xfb, 0x55, 0xda, 0xb9, 0xc4, 0x69, 0x71, 0xbd, 0xd1, 0x02, 0xf6, + 0xbc, 0x92, 0x01, 0x0c, 0x3d, 0xbb, 0x2c, 0x40, 0xbd, 0x82, 0x69, 0x97, 0x3d, + 0x2b, 0xda, 0x57, 0xbd, 0x7b, 0x9b, 0xe0, 0x3b, 0xff, 0xfd, 0x4b, 0xbd, 0x5c, + 0xa6, 0x2e, 0x3d, 0x40, 0xec, 0x85, 0xbd, 0x3b, 0x5d, 0x17, 0xbd, 0x52, 0x04, + 0x2c, 0xbd, 0x61, 0x00, 0x20, 0x3c, 0x65, 0x33, 0x28, 0xbc, 0x77, 0x76, 0x07, + 0x3d, 0x7a, 0xff, 0x32, 0x3b, 0xb9, 0x96, 0x59, 0xbd, 0xe0, 0xe1, 0x43, 0xbd, + 0x17, 0xa7, 0x6b, 0xbd, 0xf8, 0xa6, 0x4d, 0xbd, 0x4f, 0xc3, 0x9d, 0xbb, 0xfa, + 0x3a, 0x39, 0xbd, 0xe3, 0x59, 0x9a, 0xbd, 0xbd, 0xb9, 0x43, 0xbc, 0x21, 0xc4, + 0x0c, 0x3c, 0x3e, 0x70, 0x47, 0xbd, 0x42, 0xcf, 0x93, 0x3b, 0x9b, 0xe0, 0x34, + 0x3d, 0x00, 0x5d, 0xeb, 0x39, 0x5f, 0x65, 0x80, 0xbd, 0x37, 0x8a, 0x65, 
0x3d, + 0x0e, 0x1b, 0x67, 0xbc, 0xa0, 0x0a, 0x68, 0x3c, 0xc5, 0x6d, 0xf7, 0x3c, 0xe1, + 0x9d, 0x85, 0x3d, 0xa8, 0xe7, 0x69, 0xbd, 0x30, 0x9c, 0x36, 0xbd, 0xcf, 0x55, + 0xdf, 0x3c, 0x85, 0xe9, 0x4c, 0x3d, 0x3e, 0x03, 0x8a, 0xbd, 0x19, 0xe1, 0x86, + 0xbb, 0xa0, 0x51, 0xec, 0x3c, 0x11, 0xc9, 0x84, 0x3d, 0x48, 0xa9, 0x1d, 0x3d, + 0x1c, 0xd6, 0xee, 0x3b, 0x82, 0x07, 0x96, 0xbc, 0x33, 0x6b, 0xd0, 0x3c, 0x62, + 0x62, 0xb6, 0x3c, 0x4a, 0x35, 0x62, 0x3d, 0x10, 0x85, 0x66, 0xbd, 0xc9, 0xf5, + 0x53, 0xbc, 0x70, 0x4a, 0xfa, 0x3b, 0xa5, 0x21, 0x33, 0xbd, 0xe7, 0x07, 0x40, + 0x3b, 0x6d, 0xe3, 0x16, 0x3d, 0x11, 0xa2, 0xa7, 0x3a, 0x01, 0x73, 0x95, 0xbc, + 0x5c, 0xd1, 0x2e, 0xbd, 0x5c, 0x41, 0x00, 0xbd, 0x02, 0x40, 0x8a, 0x3d, 0x66, + 0xcf, 0x2b, 0x3d, 0x3d, 0x54, 0x8b, 0xbc, 0x1b, 0x25, 0x44, 0x3d, 0x56, 0xda, + 0x15, 0xbd, 0xfc, 0x0c, 0xc1, 0xbc, 0x4d, 0xcd, 0x5e, 0xbd, 0x40, 0x55, 0x2c, + 0x3d, 0xb9, 0xe6, 0xc5, 0xbc, 0x6b, 0x0d, 0xd2, 0xba, 0xd0, 0x10, 0x28, 0x3c, + 0x6b, 0xd8, 0x63, 0xbd, 0xf7, 0xed, 0xca, 0x3c, 0xa3, 0x63, 0x5a, 0x3b, 0x45, + 0x41, 0x8e, 0x3d, 0x48, 0x23, 0xd7, 0x3c, 0x71, 0xbb, 0xa8, 0x3c, 0xe2, 0x55, + 0x98, 0x3c, 0x27, 0xae, 0x5e, 0xbc, 0x06, 0x79, 0xb4, 0xbb, 0x8c, 0xdb, 0x13, + 0xbd, 0x7b, 0x59, 0x18, 0x3d, 0xbb, 0x91, 0xfc, 0xbc, 0x4b, 0x7d, 0x80, 0xbd, + 0x58, 0x76, 0x8a, 0x3c, 0x5f, 0x71, 0xa8, 0x3c, 0xb3, 0x8f, 0x89, 0xbd, 0xb4, + 0x4c, 0x64, 0xbd, 0xf9, 0x1a, 0x81, 0x3d, 0x8f, 0xa5, 0x90, 0xbd, 0x24, 0x93, + 0xbf, 0x3c, 0x1c, 0x73, 0x68, 0x3d, 0xa5, 0x53, 0x4a, 0xbd, 0xec, 0x40, 0x34, + 0xbd, 0xb2, 0x5f, 0x90, 0x3d, 0x0d, 0xe3, 0x11, 0x3d, 0x5b, 0x77, 0x91, 0x3d, + 0xe4, 0x5b, 0x8b, 0x3d, 0x99, 0x6e, 0x6a, 0xbd, 0x05, 0xcb, 0x99, 0xbd, 0xb5, + 0x26, 0x1f, 0xbd, 0xfd, 0xc3, 0x2f, 0xbd, 0xd2, 0x82, 0x96, 0x3d, 0x06, 0xf6, + 0x78, 0xbd, 0x8e, 0x08, 0x30, 0x3d, 0x16, 0x22, 0x6d, 0xbd, 0xda, 0x25, 0x4b, + 0x3d, 0xf7, 0x44, 0x43, 0xbc, 0xba, 0x20, 0xbc, 0xbc, 0x41, 0xd7, 0x04, 0xbc, + 0xe1, 0x62, 0x0d, 0xbd, 0x93, 0x78, 0x2f, 0xbd, 0x2a, 0xad, 0xd5, 0xbc, 0x13, + 0xd3, 0x6f, 0xbd, 0x88, 0xc4, 0x12, 0xbd, 0x49, 0x73, 0x84, 0xbd, 0xd6, 0x50, + 0x2c, 0x3d, 0xa9, 0xb7, 0x7d, 0xbd, 0x9a, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, + 0x00, 0x08, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, 0x04, 0x00, 0x00, + 0xae, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc0, + 0x02, 0x74, 0xbb, 0xc6, 0x58, 0x47, 0x39, 0x07, 0x36, 0x4d, 0x3c, 0xf5, 0x20, + 0xc5, 0x3c, 0xce, 0x88, 0x6c, 0x3a, 0xd2, 0x40, 0x7d, 0xbc, 0x2f, 0x7e, 0xf5, + 0x3a, 0x3d, 0xe1, 0x3e, 0xbc, 0xda, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x1d, 0xe1, 0xa3, 0xbc, 0xe7, 0x98, 0x88, 0x3c, 0xe4, + 0xc0, 0x49, 0x3b, 0xa6, 0x49, 0x38, 0x3c, 0x0e, 0x65, 0xbc, 0xbc, 0xd8, 0x59, + 0x73, 0xbc, 0x15, 0x66, 0x0a, 0xbd, 0x7c, 0x75, 0x24, 0xba, 0x37, 0xc4, 0x65, + 0x3c, 0x94, 0x0d, 0x84, 0x3c, 0x26, 0xcc, 0x87, 0x3c, 0x59, 0xea, 0x03, 0xbd, + 0x33, 0x39, 0x48, 0xbc, 0xac, 0x3e, 0x6d, 0x3c, 0xc7, 0x46, 0xb1, 0xbb, 0xcf, + 0xee, 0x07, 0x3d, 0x26, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x00, 0x7c, 0xe9, 0x43, 0x3c, 0xd3, 0x16, 0xd7, 0xbc, 0x15, 0x37, 0x4a, + 0xba, 0xa4, 0xad, 0x1c, 0x3c, 0x20, 0x66, 0x3b, 0xbb, 0x22, 0x84, 0x97, 0x3a, + 0xa5, 0x65, 0x86, 0x3c, 0x68, 0x0b, 0xf7, 0xbb, 0x52, 0xaf, 0x8c, 0x3b, 0xe1, + 0x81, 0x00, 0x3d, 0x3c, 0xf9, 0xd9, 0x3c, 0x96, 0xa8, 0x80, 0x3c, 0x94, 0xdf, + 0x21, 0x3c, 0xc7, 0x26, 0xd7, 0x3a, 0x96, 0xb2, 0x8c, 0x3c, 0x17, 0x29, 0x20, + 0x3c, 0xfa, 0xe0, 0x59, 0x3c, 0xf7, 0x08, 0x14, 0x3c, 0xad, 0x71, 0x61, 0x3c, + 0x2e, 0x73, 0x1a, 0xbc, 
0x0f, 0xd0, 0x55, 0xbb, 0xa8, 0xde, 0x68, 0x3c, 0xd9, + 0x86, 0x44, 0x3c, 0x54, 0x22, 0x05, 0xbc, 0x3c, 0x7a, 0x92, 0x3c, 0x70, 0x16, + 0x01, 0x3c, 0x69, 0x1e, 0xaf, 0xbb, 0xe8, 0x4b, 0xc5, 0xbc, 0x8b, 0xfd, 0x23, + 0x3c, 0xb8, 0x1e, 0xfd, 0xbc, 0x49, 0x11, 0x50, 0xbb, 0x2a, 0x7b, 0x9c, 0x3c, + 0xb2, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x6e, + 0x5f, 0x06, 0xba, 0xca, 0x9c, 0x99, 0xbb, 0x00, 0x00, 0x00, 0x00, 0xa4, 0x8a, + 0xfe, 0xba, 0x12, 0xed, 0xa7, 0x3c, 0xc0, 0x7d, 0x37, 0xbb, 0xa3, 0x8a, 0x30, + 0xbb, 0xd0, 0x95, 0x99, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9c, 0x1c, 0x3d, + 0x5c, 0x2a, 0x8e, 0xbb, 0x8c, 0xc0, 0x1a, 0xbb, 0x5b, 0xa1, 0xe5, 0x3b, 0x00, + 0x00, 0x00, 0x00, 0x6a, 0x50, 0xef, 0x3c, 0xdc, 0xbc, 0x9a, 0x3a, 0x00, 0x00, + 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0x08, 0x00, 0x00, 0x00, 0x6e, 0x6b, 0xdf, 0xbb, 0x54, 0xe6, 0xe6, 0x3c, + 0xd0, 0xf4, 0xff, 0xff, 0xd4, 0xf4, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x4d, + 0x4c, 0x49, 0x52, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x64, + 0x2e, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, + 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0, + 0x02, 0x00, 0x00, 0xa4, 0x02, 0x00, 0x00, 0xa8, 0x02, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, + 0x00, 0x38, 0x02, 0x00, 0x00, 0xd4, 0x01, 0x00, 0x00, 0x80, 0x01, 0x00, 0x00, + 0x3c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x8c, + 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5a, 0xfe, + 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x10, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x68, 0xf5, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x13, + 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x8e, 0xfe, + 0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x1c, 0x00, 0x00, + 0x00, 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, + 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x12, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, + 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x11, 0x00, + 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0xee, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x10, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xd0, + 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x7e, 0xff, 0xff, 0xff, 0x14, 
0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x6e, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e, + 0x00, 0x00, 0x00, 0x5e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x50, 0xff, 0xff, + 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x03, + 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, + 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x05, 0x34, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x17, 0x00, 0x10, 0x00, + 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, + 0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x28, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x13, 0x00, 0x0c, 0x00, 0x08, + 0x00, 0x07, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x00, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, + 0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0xc4, 0x07, 0x00, 0x00, 0x7c, 0x07, 0x00, + 0x00, 0x44, 0x07, 0x00, 0x00, 0x0c, 0x07, 0x00, 0x00, 0xd4, 0x06, 0x00, 0x00, + 0x88, 0x06, 0x00, 0x00, 0x2c, 0x06, 0x00, 0x00, 0xe0, 0x05, 0x00, 0x00, 0x8c, + 0x05, 0x00, 0x00, 0x38, 0x05, 0x00, 0x00, 0xe4, 0x04, 0x00, 0x00, 0x28, 0x04, + 0x00, 0x00, 0xb4, 0x03, 0x00, 0x00, 0xf8, 0x02, 0x00, 0x00, 0x84, 0x02, 0x00, + 0x00, 0xc8, 0x01, 0x00, 0x00, 0x54, 0x01, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xf8, 0xff, 0xff, 0x14, + 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x15, 0x00, + 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, + 0xff, 0x02, 0x00, 0x00, 0x00, 0x3c, 0xf8, 0xff, 0xff, 0x19, 0x00, 0x00, 0x00, + 0x53, 0x74, 0x61, 0x74, 0x65, 0x66, 0x75, 0x6c, 0x50, 0x61, 0x72, 0x74, 0x69, + 0x74, 0x69, 0x6f, 0x6e, 0x65, 0x64, 0x43, 0x61, 0x6c, 0x6c, 0x3a, 0x30, 0x00, + 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, + 0x00, 0xac, 0xf8, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x90, 0xf8, + 0xff, 0xff, 0x5b, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, + 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, + 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x3b, 0x73, 0x65, + 0x71, 
0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, + 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x52, 0x65, 0x6c, 0x75, + 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, + 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x42, + 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x40, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x3c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, + 0x04, 0x00, 0x00, 0x24, 0xf9, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x73, 0x65, + 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x66, + 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f, 0x52, 0x65, 0x73, + 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x80, 0x04, 0x00, 0x00, 0x9c, 0xf9, 0xff, 0xff, 0x14, 0x00, + 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, + 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x88, + 0xf9, 0xff, 0xff, 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, + 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, + 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x38, 0x2f, + 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, + 0x00, 0x00, 0x0c, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, + 0x00, 0x24, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0c, + 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xff, 0xff, 0x6e, 0x00, + 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, + 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, + 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, + 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, + 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43, + 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, + 0x32, 0x34, 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0xc4, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, + 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, + 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xb0, 0xfa, 0xff, 0xff, + 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, + 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, + 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x37, 0x2f, 0x4d, 0x61, 0x78, + 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x34, + 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 
0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, + 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0xff, 0xff, 0xff, 0xff, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x20, 0xfb, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, + 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, + 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x52, 0x65, + 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, + 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x32, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, + 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, + 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x43, 0x6f, 0x6e, 0x76, + 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, + 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0xec, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x1f, 0x00, + 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xd8, 0xfb, 0xff, 0xff, 0x27, 0x00, 0x00, + 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, + 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, + 0x32, 0x64, 0x5f, 0x31, 0x39, 0x36, 0x2f, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, + 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, + 0x00, 0x1f, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x5c, 0xfc, 0xff, 0xff, + 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0d, + 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, + 0xff, 0xff, 0x3e, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, + 0x00, 0x48, 0xfc, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, + 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, + 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, + 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x42, + 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, + 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, + 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, + 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x62, 0x69, + 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3e, + 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x56, 0xfd, + 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, + 0x00, 0x2c, 0x00, 0x00, 0x00, 0xe8, 0xfc, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, + 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43, + 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0xa6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0b, + 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x38, 0xfd, 0xff, 0xff, 
0x1f, 0x00, + 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, + 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10, + 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x00, 0x00, 0xf6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x0a, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x88, 0xfd, 0xff, 0xff, + 0x1f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, + 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, + 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, + 0x00, 0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x46, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, + 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xd8, 0xfd, + 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, + 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, + 0x31, 0x36, 0x34, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8e, 0xfe, + 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, + 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, + 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, + 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, + 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x80, 0x04, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00, + 0x13, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x2c, 0x00, 0x00, 0x00, 0x7c, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, + 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, + 0x33, 0x2f, 0x66, 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f, + 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x2e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xc0, 0xfe, 0xff, 0xff, + 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x31, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x00, 0x00, 0x62, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x05, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff, + 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x32, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x96, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x28, 0xff, 0xff, 0xff, + 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, + 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, + 0x00, 0x00, 0xca, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x5c, 0xff, 0xff, 0xff, + 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, + 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x00, 
0x00, 0x0e, 0x00, 0x14, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, + 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0, + 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, + 0x31, 0x36, 0x34, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x14, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0xff, 0xff, 0xff, 0xff, 0x40, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x22, + 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, + 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, + 0x32, 0x34, 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3a, 0x30, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x40, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, + 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, + 0x00, 0xdc, 0xff, 0xff, 0xff, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0xe8, 0xff, 0xff, 0xff, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0xf4, + 0xff, 0xff, 0xff, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x0c, 0x00, + 0x0c, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, + 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03 +}; diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c new file mode 100644 index 0000000000..2fab99dd8b --- /dev/null +++ b/third_party/aom/av1/encoder/dwt.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Note: block length must be even for this implementation
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++) * 2;
+    *b++ = *x - ((r + x[1] + 1) >> 1);
+    x++;
+  }
+  *a = (r = *x++) * 2;
+  *b = *x - r;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++);
+    *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+    x++;
+  }
+  *a = (r = *x++);
+  *b = (*x - r + 1) >> 1;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+                                          const uint8_t *x, int pitch_x,
+                                          tran_low_t *c, int pitch_c,
+                                          int dwt_scale_bits, int hbd) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  tran_low_t buffer[2 * DWT_MAX_LENGTH];
+
+  if (hbd) {
+    const uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  } else {
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  }
+
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];
+      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+                               int stride, int hbd) {
+  dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
+}
+
+static int haar_ac_sad(const tran_low_t *output, int bw, int bh, int stride) {
+  int acsad = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]);
+    }
+  return acsad;
+}
+
+static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride,
+                                       int hbd) {
+  tran_low_t output[64];
+
+  av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
+  return haar_ac_sad(output, 8, 8, 8);
+}
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+                                        int hbd, int num_8x8_rows,
+                                        int num_8x8_cols) {
+  int64_t wavelet_energy = 0;
+  for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
+    for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
+      wavelet_energy += haar_ac_sad_8x8_uint8_input(
+          input + c8 * 8 + r8 * 8 * stride, stride, hbd);
+    }
+  }
+  return wavelet_energy;
+}
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
new file mode 100644
index 0000000000..443b6bc12c
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_DWT_H_ +#define AOM_AV1_ENCODER_DWT_H_ + +#include "av1/common/common.h" +#include "av1/common/enums.h" + +#define DWT_MAX_LENGTH 64 + +void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output, + int stride, int hbd); + +int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride, + int hbd, int num_8x8_rows, + int num_8x8_cols); + +#endif // AOM_AV1_ENCODER_DWT_H_ diff --git a/third_party/aom/av1/encoder/enc_enums.h b/third_party/aom/av1/encoder/enc_enums.h new file mode 100644 index 0000000000..20cefa16a5 --- /dev/null +++ b/third_party/aom/av1/encoder/enc_enums.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_ +#define AOM_AV1_ENCODER_ENC_ENUMS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// This enumerator type needs to be kept aligned with the mode order in +// const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code. 
+enum { + THR_NEARESTMV, + THR_NEARESTL2, + THR_NEARESTL3, + THR_NEARESTB, + THR_NEARESTA2, + THR_NEARESTA, + THR_NEARESTG, + + THR_NEWMV, + THR_NEWL2, + THR_NEWL3, + THR_NEWB, + THR_NEWA2, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARL2, + THR_NEARL3, + THR_NEARB, + THR_NEARA2, + THR_NEARA, + THR_NEARG, + + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, + + THR_COMP_NEAREST_NEARESTLA, + THR_COMP_NEAREST_NEARESTL2A, + THR_COMP_NEAREST_NEARESTL3A, + THR_COMP_NEAREST_NEARESTGA, + THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTLA2, + THR_COMP_NEAREST_NEARESTL2A2, + THR_COMP_NEAREST_NEARESTL3A2, + THR_COMP_NEAREST_NEARESTGA2, + THR_COMP_NEAREST_NEARESTLL2, + THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, + THR_COMP_NEAREST_NEARESTBA, + + THR_COMP_NEAR_NEARLB, + THR_COMP_NEW_NEWLB, + THR_COMP_NEW_NEARESTLB, + THR_COMP_NEAREST_NEWLB, + THR_COMP_NEW_NEARLB, + THR_COMP_NEAR_NEWLB, + THR_COMP_GLOBAL_GLOBALLB, + + THR_COMP_NEAR_NEARLA, + THR_COMP_NEW_NEWLA, + THR_COMP_NEW_NEARESTLA, + THR_COMP_NEAREST_NEWLA, + THR_COMP_NEW_NEARLA, + THR_COMP_NEAR_NEWLA, + THR_COMP_GLOBAL_GLOBALLA, + + THR_COMP_NEAR_NEARL2A, + THR_COMP_NEW_NEWL2A, + THR_COMP_NEW_NEARESTL2A, + THR_COMP_NEAREST_NEWL2A, + THR_COMP_NEW_NEARL2A, + THR_COMP_NEAR_NEWL2A, + THR_COMP_GLOBAL_GLOBALL2A, + + THR_COMP_NEAR_NEARL3A, + THR_COMP_NEW_NEWL3A, + THR_COMP_NEW_NEARESTL3A, + THR_COMP_NEAREST_NEWL3A, + THR_COMP_NEW_NEARL3A, + THR_COMP_NEAR_NEWL3A, + THR_COMP_GLOBAL_GLOBALL3A, + + THR_COMP_NEAR_NEARGA, + THR_COMP_NEW_NEWGA, + THR_COMP_NEW_NEARESTGA, + THR_COMP_NEAREST_NEWGA, + THR_COMP_NEW_NEARGA, + THR_COMP_NEAR_NEWGA, + THR_COMP_GLOBAL_GLOBALGA, + + THR_COMP_NEAR_NEARL2B, + THR_COMP_NEW_NEWL2B, + THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEW_NEARL2B, + THR_COMP_NEAR_NEWL2B, + THR_COMP_GLOBAL_GLOBALL2B, + + THR_COMP_NEAR_NEARL3B, + THR_COMP_NEW_NEWL3B, + THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEW_NEARL3B, + THR_COMP_NEAR_NEWL3B, + THR_COMP_GLOBAL_GLOBALL3B, + + THR_COMP_NEAR_NEARGB, + THR_COMP_NEW_NEWGB, + THR_COMP_NEW_NEARESTGB, + THR_COMP_NEAREST_NEWGB, + THR_COMP_NEW_NEARGB, + THR_COMP_NEAR_NEWGB, + THR_COMP_GLOBAL_GLOBALGB, + + THR_COMP_NEAR_NEARLA2, + THR_COMP_NEW_NEWLA2, + THR_COMP_NEW_NEARESTLA2, + THR_COMP_NEAREST_NEWLA2, + THR_COMP_NEW_NEARLA2, + THR_COMP_NEAR_NEWLA2, + THR_COMP_GLOBAL_GLOBALLA2, + + THR_COMP_NEAR_NEARL2A2, + THR_COMP_NEW_NEWL2A2, + THR_COMP_NEW_NEARESTL2A2, + THR_COMP_NEAREST_NEWL2A2, + THR_COMP_NEW_NEARL2A2, + THR_COMP_NEAR_NEWL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, + + THR_COMP_NEAR_NEARL3A2, + THR_COMP_NEW_NEWL3A2, + THR_COMP_NEW_NEARESTL3A2, + THR_COMP_NEAREST_NEWL3A2, + THR_COMP_NEW_NEARL3A2, + THR_COMP_NEAR_NEWL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, + + THR_COMP_NEAR_NEARGA2, + THR_COMP_NEW_NEWGA2, + THR_COMP_NEW_NEARESTGA2, + THR_COMP_NEAREST_NEWGA2, + THR_COMP_NEW_NEARGA2, + THR_COMP_NEAR_NEWGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_COMP_NEAR_NEARLL2, + THR_COMP_NEW_NEWLL2, + THR_COMP_NEW_NEARESTLL2, + THR_COMP_NEAREST_NEWLL2, + THR_COMP_NEW_NEARLL2, + THR_COMP_NEAR_NEWLL2, + THR_COMP_GLOBAL_GLOBALLL2, + + THR_COMP_NEAR_NEARLL3, + THR_COMP_NEW_NEWLL3, + THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEW_NEARLL3, + THR_COMP_NEAR_NEWLL3, + THR_COMP_GLOBAL_GLOBALLL3, + + THR_COMP_NEAR_NEARLG, + THR_COMP_NEW_NEWLG, + THR_COMP_NEW_NEARESTLG, + THR_COMP_NEAREST_NEWLG, + 
THR_COMP_NEW_NEARLG,
+  THR_COMP_NEAR_NEWLG,
+  THR_COMP_GLOBAL_GLOBALLG,
+
+  THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEWBA,
+  THR_COMP_NEW_NEARESTBA,
+  THR_COMP_NEAREST_NEWBA,
+  THR_COMP_NEW_NEARBA,
+  THR_COMP_NEAR_NEWBA,
+  THR_COMP_GLOBAL_GLOBALBA,
+
+  THR_DC,
+  THR_PAETH,
+  THR_SMOOTH,
+  THR_SMOOTH_V,
+  THR_SMOOTH_H,
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D203_PRED,
+  THR_D157_PRED,
+  THR_D67_PRED,
+  THR_D113_PRED,
+  THR_D45_PRED,
+
+  MAX_MODES,
+  SINGLE_REF_MODE_START = THR_NEARESTMV,
+  SINGLE_REF_MODE_END = THR_COMP_NEAREST_NEARESTLA,
+  NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
+  THR_MODE_START = THR_NEARESTMV,
+  THR_MODE_END = MAX_MODES,
+  THR_INTER_MODE_START = THR_MODE_START,
+  THR_INTER_MODE_END = THR_DC,
+  THR_INVALID = 255
+} UENUM1BYTE(THR_MODES);
+
+enum {
+  THR_LAST,
+  THR_LAST2,
+  THR_LAST3,
+  THR_BWDR,
+  THR_ALTR2,
+  THR_GOLD,
+  THR_ALTR,
+
+  THR_COMP_LA,
+  THR_COMP_L2A,
+  THR_COMP_L3A,
+  THR_COMP_GA,
+
+  THR_COMP_LB,
+  THR_COMP_L2B,
+  THR_COMP_L3B,
+  THR_COMP_GB,
+
+  THR_COMP_LA2,
+  THR_COMP_L2A2,
+  THR_COMP_L3A2,
+  THR_COMP_GA2,
+
+  THR_INTRA,
+
+  MAX_REFS
+} UENUM1BYTE(THR_MODES_SUB8X8);
+
+enum {
+  FULL_TXFM_RD,
+  LOW_TXFM_RD,
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {
+  USE_FULL_RD = 0,
+  USE_FAST_RD,
+  USE_LARGESTALL,
+} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENC_ENUMS_H_
diff --git a/third_party/aom/av1/encoder/encode_strategy.c b/third_party/aom/av1/encoder/encode_strategy.c
new file mode 100644
index 0000000000..35ca83c3f4
--- /dev/null
+++ b/third_party/aom/av1/encoder/encode_strategy.c
@@ -0,0 +1,1767 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "av1/common/blockd.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/temporal_filter.h"
+#if CONFIG_THREE_PASS
+#include "av1/encoder/thirdpass.h"
+#endif // CONFIG_THREE_PASS
+#include "av1/encoder/tpl_model.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ?
0 : 1) + +static INLINE void set_refresh_frame_flags( + RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref, + bool refresh_arf) { + refresh_frame->golden_frame = refresh_gf; + refresh_frame->bwd_ref_frame = refresh_bwdref; + refresh_frame->alt_ref_frame = refresh_arf; +} + +void av1_configure_buffer_updates(AV1_COMP *const cpi, + RefreshFrameInfo *const refresh_frame, + const FRAME_UPDATE_TYPE type, + const REFBUF_STATE refbuf_state, + int force_refresh_all) { + // NOTE(weitinglin): Should we define another function to take care of + // cpi->rc.is_$Source_Type to make this function as it is in the comment? + const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = + &cpi->ext_flags.refresh_frame; + cpi->rc.is_src_frame_alt_ref = 0; + + switch (type) { + case KF_UPDATE: + set_refresh_frame_flags(refresh_frame, true, true, true); + break; + + case LF_UPDATE: + set_refresh_frame_flags(refresh_frame, false, false, false); + break; + + case GF_UPDATE: + set_refresh_frame_flags(refresh_frame, true, false, false); + break; + + case OVERLAY_UPDATE: + if (refbuf_state == REFBUF_RESET) + set_refresh_frame_flags(refresh_frame, true, true, true); + else + set_refresh_frame_flags(refresh_frame, true, false, false); + + cpi->rc.is_src_frame_alt_ref = 1; + break; + + case ARF_UPDATE: + // NOTE: BWDREF does not get updated along with ALTREF_FRAME. + if (refbuf_state == REFBUF_RESET) + set_refresh_frame_flags(refresh_frame, true, true, true); + else + set_refresh_frame_flags(refresh_frame, false, false, true); + + break; + + case INTNL_OVERLAY_UPDATE: + set_refresh_frame_flags(refresh_frame, false, false, false); + cpi->rc.is_src_frame_alt_ref = 1; + break; + + case INTNL_ARF_UPDATE: + set_refresh_frame_flags(refresh_frame, false, true, false); + break; + + default: assert(0); break; + } + + if (ext_refresh_frame_flags->update_pending && + (!is_stat_generation_stage(cpi))) { + set_refresh_frame_flags(refresh_frame, + ext_refresh_frame_flags->golden_frame, + ext_refresh_frame_flags->bwd_ref_frame, + ext_refresh_frame_flags->alt_ref_frame); + GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (ext_refresh_frame_flags->golden_frame) + gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE; + if (ext_refresh_frame_flags->alt_ref_frame) + gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE; + if (ext_refresh_frame_flags->bwd_ref_frame) + gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE; + } + + if (force_refresh_all) + set_refresh_frame_flags(refresh_frame, true, true, true); +} + +static void set_additional_frame_flags(const AV1_COMMON *const cm, + unsigned int *const frame_flags) { + if (frame_is_intra_only(cm)) { + *frame_flags |= FRAMEFLAGS_INTRAONLY; + } + if (frame_is_sframe(cm)) { + *frame_flags |= FRAMEFLAGS_SWITCH; + } + if (cm->features.error_resilient_mode) { + *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT; + } +} + +static void set_ext_overrides(AV1_COMMON *const cm, + EncodeFrameParams *const frame_params, + ExternalFlags *const ext_flags) { + // Overrides the defaults with the externally supplied values with + // av1_update_reference() and av1_update_entropy() calls + // Note: The overrides are valid only for the next frame passed + // to av1_encode_lowlevel() + + if (ext_flags->use_s_frame) { + frame_params->frame_type = S_FRAME; + } + + if (ext_flags->refresh_frame_context_pending) { + cm->features.refresh_frame_context = ext_flags->refresh_frame_context; + ext_flags->refresh_frame_context_pending = 0; + } + cm->features.allow_ref_frame_mvs = 
ext_flags->use_ref_frame_mvs; + + frame_params->error_resilient_mode = ext_flags->use_error_resilient; + // A keyframe is already error resilient and keyframes with + // error_resilient_mode interferes with the use of show_existing_frame + // when forward reference keyframes are enabled. + frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME; + // For bitstream conformance, s-frames must be error-resilient + frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME; +} + +static int choose_primary_ref_frame( + AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) { + const AV1_COMMON *const cm = &cpi->common; + + const int intra_only = frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME; + if (intra_only || frame_params->error_resilient_mode || + cpi->ext_flags.use_primary_ref_none) { + return PRIMARY_REF_NONE; + } + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode) { + int wanted_fb = cpi->ppi->gf_group.primary_ref_idx[cpi->gf_frame_index]; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) + return ref_frame - LAST_FRAME; + } + + return PRIMARY_REF_NONE; + } +#endif // !CONFIG_REALTIME_ONLY + + // In large scale case, always use Last frame's frame contexts. + // Note(yunqing): In other cases, primary_ref_frame is chosen based on + // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls + // frame bit allocation. + if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME); + + if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) + return av1_svc_primary_ref_frame(cpi); + + // Find the most recent reference frame with the same reference type as the + // current frame + const int current_ref_type = get_current_frame_ref_type(cpi); + int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type]; +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + if (gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) { + int frame_level = gf_group->frame_parallel_level[cpi->gf_frame_index]; + // Book keep wanted_fb of frame_parallel_level 1 frame in an FP2 set. + if (frame_level == 1) { + cpi->wanted_fb = wanted_fb; + } + // Use the wanted_fb of level 1 frame in an FP2 for a level 2 frame in the + // set. + if (frame_level == 2 && + gf_group->update_type[cpi->gf_frame_index - 1] == INTNL_ARF_UPDATE) { + assert(gf_group->frame_parallel_level[cpi->gf_frame_index - 1] == 1); + wanted_fb = cpi->wanted_fb; + } + } + } +#endif // CONFIG_FPMT_TEST + int primary_ref_frame = PRIMARY_REF_NONE; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) { + primary_ref_frame = ref_frame - LAST_FRAME; + } + } + + return primary_ref_frame; +} + +static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) { + TimeStamps *time_stamps = &cpi->time_stamps; + int64_t this_duration; + int step = 0; + + // Clear down mmx registers + + if (cpi->ppi->use_svc && cpi->ppi->rtc_ref.set_ref_frame_config && + cpi->svc.number_spatial_layers > 1) { + // ts_start is the timestamp for the current frame and ts_end is the + // expected next timestamp given the duration passed into codec_encode(). 
+ // See the setting in encoder_encode() in av1_cx_iface.c: + // ts_start = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol), + // ts_end = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + + // duration). So the difference ts_end - ts_start is the duration passed + // in by the user. For spatial layers SVC set the framerate based directly + // on the duration, and bypass the adjustments below. + this_duration = ts_end - ts_start; + if (this_duration > 0) { + cpi->new_framerate = 10000000.0 / this_duration; + av1_new_framerate(cpi, cpi->new_framerate); + time_stamps->prev_ts_start = ts_start; + time_stamps->prev_ts_end = ts_end; + return; + } + } + + if (ts_start == time_stamps->first_ts_start) { + this_duration = ts_end - ts_start; + step = 1; + } else { + int64_t last_duration = + time_stamps->prev_ts_end - time_stamps->prev_ts_start; + + this_duration = ts_end - time_stamps->prev_ts_end; + + // do a step update if the duration changes by 10% + if (last_duration) + step = (int)((this_duration - last_duration) * 10 / last_duration); + } + + if (this_duration) { + if (step) { + cpi->new_framerate = 10000000.0 / this_duration; + av1_new_framerate(cpi, cpi->new_framerate); + } else { + // Average this frame's rate into the last second's average + // frame rate. If we haven't seen 1 second yet, then average + // over the whole interval seen. + const double interval = + AOMMIN((double)(ts_end - time_stamps->first_ts_start), 10000000.0); + double avg_duration = 10000000.0 / cpi->framerate; + avg_duration *= (interval - avg_duration + this_duration); + avg_duration /= interval; + cpi->new_framerate = (10000000.0 / avg_duration); + // For parallel frames update cpi->framerate with new_framerate + // during av1_post_encode_updates() + double framerate = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->framerate + : cpi->new_framerate; + av1_new_framerate(cpi, framerate); + } + } + + time_stamps->prev_ts_start = ts_start; + time_stamps->prev_ts_end = ts_end; +} + +// Determine whether there is a forced keyframe pending in the lookahead buffer +int is_forced_keyframe_pending(struct lookahead_ctx *lookahead, + const int up_to_index, + const COMPRESSOR_STAGE compressor_stage) { + for (int i = 0; i <= up_to_index; i++) { + const struct lookahead_entry *e = + av1_lookahead_peek(lookahead, i, compressor_stage); + if (e == NULL) { + // We have reached the end of the lookahead buffer and not early-returned + // so there isn't a forced key-frame pending. + return -1; + } else if (e->flags == AOM_EFLAG_FORCE_KF) { + return i; + } else { + continue; + } + } + return -1; // Never reached +} + +// Check if we should encode an ARF or internal ARF. If not, try a LAST +// Do some setup associated with the chosen source +// temporal_filtered, flush, and frame_update_type are outputs. +// Return the frame source, or NULL if we couldn't find one +static struct lookahead_entry *choose_frame_source( + AV1_COMP *const cpi, int *const flush, int *pop_lookahead, + struct lookahead_entry **last_source, int *const show_frame) { + AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + struct lookahead_entry *source = NULL; + + // Source index in lookahead buffer. 
+ int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; + + // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q + if (src_index && + (is_forced_keyframe_pending(cpi->ppi->lookahead, src_index, + cpi->compressor_stage) != -1) && + cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) { + src_index = 0; + *flush = 1; + } + + // If the current frame is arf, then we should not pop from the lookahead + // buffer. If the current frame is not arf, then pop it. This assumes the + // first frame in the GF group is not arf. May need to change if it is not + // true. + *pop_lookahead = (src_index == 0); + // If this is a key frame and keyframe filtering is enabled with overlay, + // then do not pop. + if (*pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 && + gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && + !is_stat_generation_stage(cpi) && cpi->ppi->lookahead) { + if (cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz && + (*flush || + cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz == + cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].pop_sz)) { + *pop_lookahead = 0; + } + } + + // LAP stage does not have ARFs or forward key-frames, + // hence, always pop_lookahead here. + if (is_stat_generation_stage(cpi)) { + *pop_lookahead = 1; + src_index = 0; + } + + *show_frame = *pop_lookahead; + +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE) { +#else + { +#endif // CONFIG_FPMT_TEST + // Future frame in parallel encode set + if (gf_group->src_offset[cpi->gf_frame_index] != 0 && + !is_stat_generation_stage(cpi)) + src_index = gf_group->src_offset[cpi->gf_frame_index]; + } + if (*show_frame) { + // show frame, pop from buffer + // Get last frame source. + if (cm->current_frame.frame_number > 0) { + *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1, + cpi->compressor_stage); + } + // Read in the source frame. + source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, + cpi->compressor_stage); + } else { + // no show frames are arf frames + source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, + cpi->compressor_stage); + if (source != NULL) { + cm->showable_frame = 1; + } + } + return source; +} + +// Don't allow a show_existing_frame to coincide with an error resilient or +// S-Frame. An exception can be made in the case of a keyframe, since it does +// not depend on any previous frames. +static int allow_show_existing(const AV1_COMP *const cpi, + unsigned int frame_flags) { + if (cpi->common.current_frame.frame_number == 0) return 0; + + const struct lookahead_entry *lookahead_src = + av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage); + if (lookahead_src == NULL) return 1; + + const int is_error_resilient = + cpi->oxcf.tool_cfg.error_resilient_mode || + (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT); + const int is_s_frame = cpi->oxcf.kf_cfg.enable_sframe || + (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME); + const int is_key_frame = + (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY); + return !(is_error_resilient || is_s_frame) || is_key_frame; +} + +// Update frame_flags to tell the encoder's caller what sort of frame was +// encoded. 
+static void update_frame_flags(const AV1_COMMON *const cm, + const RefreshFrameInfo *const refresh_frame, + unsigned int *frame_flags) { + if (encode_show_existing_frame(cm)) { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN; + *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF; + *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF; + *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY; + return; + } + + if (refresh_frame->golden_frame) { + *frame_flags |= FRAMEFLAGS_GOLDEN; + } else { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN; + } + + if (refresh_frame->alt_ref_frame) { + *frame_flags |= FRAMEFLAGS_ALTREF; + } else { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF; + } + + if (refresh_frame->bwd_ref_frame) { + *frame_flags |= FRAMEFLAGS_BWDREF; + } else { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF; + } + + if (cm->current_frame.frame_type == KEY_FRAME) { + *frame_flags |= FRAMEFLAGS_KEY; + } else { + *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY; + } +} + +#define DUMP_REF_FRAME_IMAGES 0 + +#if DUMP_REF_FRAME_IMAGES == 1 +static int dump_one_image(AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *const ref_buf, + char *file_name) { + int h; + FILE *f_ref = NULL; + + if (ref_buf == NULL) { + printf("Frame data buffer is NULL.\n"); + return AOM_CODEC_MEM_ERROR; + } + + if ((f_ref = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return AOM_CODEC_MEM_ERROR; + } + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + + fclose(f_ref); + + return AOM_CODEC_OK; +} + +static void dump_ref_frame_images(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + char file_name[256] = ""; + snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv", + cm->current_frame.frame_number, ref_frame); + dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name); + } +} +#endif // DUMP_REF_FRAME_IMAGES == 1 + +int av1_get_refresh_ref_frame_map(int refresh_frame_flags) { + int ref_map_index; + + for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index) + if ((refresh_frame_flags >> ref_map_index) & 1) break; + + if (ref_map_index == REF_FRAMES) ref_map_index = INVALID_IDX; + return ref_map_index; +} + +static int get_free_ref_map_index(RefFrameMapPair ref_map_pairs[REF_FRAMES]) { + for (int idx = 0; idx < REF_FRAMES; ++idx) + if (ref_map_pairs[idx].disp_order == -1) return idx; + return INVALID_IDX; +} + +static int get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int update_arf, GF_GROUP *gf_group, int gf_index, + int enable_refresh_skip, int cur_frame_disp) { + int arf_count = 0; + int oldest_arf_order = INT32_MAX; + int oldest_arf_idx = -1; + + int oldest_frame_order = INT32_MAX; + int oldest_idx = -1; + + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; + if (ref_pair.disp_order == -1) continue; + const int frame_order = ref_pair.disp_order; + const int reference_frame_level = ref_pair.pyr_level; + // Keep future frames and three closest previous frames in output order. 
+ if (frame_order > cur_frame_disp - 3) continue; + + if (enable_refresh_skip) { + int skip_frame = 0; + // Prevent refreshing a frame in gf_group->skip_frame_refresh. + for (int i = 0; i < REF_FRAMES; i++) { + int frame_to_skip = gf_group->skip_frame_refresh[gf_index][i]; + if (frame_to_skip == INVALID_IDX) break; + if (frame_order == frame_to_skip) { + skip_frame = 1; + break; + } + } + if (skip_frame) continue; + } + + // Keep track of the oldest level 1 frame if the current frame is also level + // 1. + if (reference_frame_level == 1) { + // If there are more than 2 level 1 frames in the reference list, + // discard the oldest. + if (frame_order < oldest_arf_order) { + oldest_arf_order = frame_order; + oldest_arf_idx = map_idx; + } + arf_count++; + continue; + } + + // Update the overall oldest reference frame. + if (frame_order < oldest_frame_order) { + oldest_frame_order = frame_order; + oldest_idx = map_idx; + } + } + if (update_arf && arf_count > 2) return oldest_arf_idx; + if (oldest_idx >= 0) return oldest_idx; + if (oldest_arf_idx >= 0) return oldest_arf_idx; + if (oldest_idx == -1) { + assert(arf_count > 2 && enable_refresh_skip); + return oldest_arf_idx; + } + assert(0 && "No valid refresh index found"); + return -1; +} + +// Computes the reference refresh index for INTNL_ARF_UPDATE frame. +int av1_calc_refresh_idx_for_intnl_arf( + AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int gf_index) { + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + + // Search for the open slot to store the current frame. + int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs); + + // Use a free slot if available. + if (free_fb_index != INVALID_IDX) { + return free_fb_index; + } else { + int enable_refresh_skip = !is_one_pass_rt_params(cpi); + int refresh_idx = + get_refresh_idx(ref_frame_map_pairs, 0, gf_group, gf_index, + enable_refresh_skip, gf_group->display_idx[gf_index]); + return refresh_idx; + } +} + +int av1_get_refresh_frame_flags( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, + FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) { + const AV1_COMMON *const cm = &cpi->common; + const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = + &cpi->ext_flags.refresh_frame; + + GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (gf_group->refbuf_state[gf_index] == REFBUF_RESET) + return SELECT_ALL_BUF_SLOTS; + + // TODO(jingning): Deprecate the following operations. + // Switch frames and shown key-frames overwrite all reference slots + if (frame_params->frame_type == S_FRAME) return SELECT_ALL_BUF_SLOTS; + + // show_existing_frames don't actually send refresh_frame_flags so set the + // flags to 0 to keep things consistent. 
+ if (frame_params->show_existing_frame) return 0; + + const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + if (is_frame_droppable(rtc_ref, ext_refresh_frame_flags)) return 0; + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode && + cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) { + int new_fb_map_idx = cpi->ppi->gf_group.update_ref_idx[gf_index]; + if (new_fb_map_idx == INVALID_IDX) return 0; + return 1 << new_fb_map_idx; + } +#endif // !CONFIG_REALTIME_ONLY + + int refresh_mask = 0; + if (ext_refresh_frame_flags->update_pending) { + if (rtc_ref->set_ref_frame_config || + use_rtc_reference_structure_one_layer(cpi)) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + int ref_frame_map_idx = rtc_ref->ref_idx[i]; + refresh_mask |= rtc_ref->refresh[ref_frame_map_idx] + << ref_frame_map_idx; + } + return refresh_mask; + } + // Unfortunately the encoder interface reflects the old refresh_*_frame + // flags so we have to replicate the old refresh_frame_flags logic here in + // order to preserve the behaviour of the flag overrides. + int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->last_frame << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->bwd_ref_frame + << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->alt2_ref_frame + << ref_frame_map_idx; + + if (frame_update_type == OVERLAY_UPDATE) { + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->golden_frame + << ref_frame_map_idx; + } else { + ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->golden_frame + << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_refresh_frame_flags->alt_ref_frame + << ref_frame_map_idx; + } + return refresh_mask; + } + + // Search for the open slot to store the current frame. + int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs); + + // No refresh necessary for these frame types. + if (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE) + return refresh_mask; + + // If there is an open slot, refresh that one instead of replacing a + // reference. 
+ if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + return refresh_mask; + } + const int enable_refresh_skip = !is_one_pass_rt_params(cpi); + const int update_arf = frame_update_type == ARF_UPDATE; + const int refresh_idx = + get_refresh_idx(ref_frame_map_pairs, update_arf, &cpi->ppi->gf_group, + gf_index, enable_refresh_skip, cur_disp_order); + return 1 << refresh_idx; +} + +#if !CONFIG_REALTIME_ONLY +void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + + av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size); + + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); + + set_mi_offsets(&cm->mi_params, xd, 0, 0); +} + +// Apply temporal filtering to source frames and encode the filtered frame. +// If the current frame does not require filtering, this function is identical +// to av1_encode() except that tpl is not performed. +static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, + EncodeFrameInput *const frame_input, + const EncodeFrameParams *const frame_params, + EncodeFrameResults *const frame_results) { +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2) start_timing(cpi, denoise_and_encode_time); +#endif + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + const int is_second_arf = + av1_gop_is_second_arf(gf_group, cpi->gf_frame_index); + + // Decide whether to apply temporal filtering to the source frame. + int apply_filtering = + av1_is_temporal_filter_on(oxcf) && !is_stat_generation_stage(cpi); + if (update_type != KF_UPDATE && update_type != ARF_UPDATE && !is_second_arf) { + apply_filtering = 0; + } + if (apply_filtering) { + if (frame_params->frame_type == KEY_FRAME) { + // TODO(angiebird): Move the noise level check to av1_tf_info_filtering. + // Decide whether it is allowed to perform key frame filtering + int allow_kf_filtering = oxcf->kf_cfg.enable_keyframe_filtering && + !frame_params->show_existing_frame && + !is_lossless_requested(&oxcf->rc_cfg); + if (allow_kf_filtering) { + double y_noise_level = 0.0; + av1_estimate_noise_level( + frame_input->source, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y, + cm->seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD); + apply_filtering = y_noise_level > 0; + } else { + apply_filtering = 0; + } + // If we are doing kf filtering, set up a few things. + if (apply_filtering) { + av1_setup_past_independence(cm); + } + } else if (is_second_arf) { + apply_filtering = cpi->sf.hl_sf.second_alt_ref_filtering; + } + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time); +#endif + // Save the pointer to the original source image. + YV12_BUFFER_CONFIG *source_buffer = frame_input->source; + // apply filtering to frame + if (apply_filtering) { + int show_existing_alt_ref = 0; + FRAME_DIFF frame_diff; + int top_index = 0; + int bottom_index = 0; + const int q_index = av1_rc_pick_q_and_bounds( + cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, + cpi->gf_frame_index, &bottom_index, &top_index); + + // TODO(bohanli): figure out why we need frame_type in cm here. 
+ cm->current_frame.frame_type = frame_params->frame_type; + if (update_type == KF_UPDATE || update_type == ARF_UPDATE) { + YV12_BUFFER_CONFIG *tf_buf = av1_tf_info_get_filtered_buf( + &cpi->ppi->tf_info, cpi->gf_frame_index, &frame_diff); + if (tf_buf != NULL) { + frame_input->source = tf_buf; + show_existing_alt_ref = av1_check_show_filtered_frame( + tf_buf, &frame_diff, q_index, cm->seq_params->bit_depth); + if (show_existing_alt_ref) { + cpi->common.showable_frame |= 1; + } else { + cpi->common.showable_frame = 0; + } + } + if (gf_group->frame_type[cpi->gf_frame_index] != KEY_FRAME) { + cpi->ppi->show_existing_alt_ref = show_existing_alt_ref; + } + } + + if (is_second_arf) { + // Allocate the memory for tf_buf_second_arf buffer, only when it is + // required. + int ret = aom_realloc_frame_buffer( + &cpi->ppi->tf_info.tf_buf_second_arf, oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->image_pyramid_levels, 0); + if (ret) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate tf_buf_second_arf"); + + YV12_BUFFER_CONFIG *tf_buf_second_arf = + &cpi->ppi->tf_info.tf_buf_second_arf; + // We didn't apply temporal filtering for second arf ahead in + // av1_tf_info_filtering(). + const int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; + // Right now, we are still using tf_buf_second_arf due to + // implementation complexity. + // TODO(angiebird): Reuse tf_info->tf_buf here. + av1_temporal_filter(cpi, arf_src_index, cpi->gf_frame_index, &frame_diff, + tf_buf_second_arf); + show_existing_alt_ref = av1_check_show_filtered_frame( + tf_buf_second_arf, &frame_diff, q_index, cm->seq_params->bit_depth); + if (show_existing_alt_ref) { + aom_extend_frame_borders(tf_buf_second_arf, av1_num_planes(cm)); + frame_input->source = tf_buf_second_arf; + } + // Currently INTNL_ARF_UPDATE only do show_existing. + cpi->common.showable_frame |= 1; + } + + // Copy source metadata to the temporal filtered frame + if (source_buffer->metadata && + aom_copy_metadata_to_frame_buffer(frame_input->source, + source_buffer->metadata)) { + aom_internal_error( + cm->error, AOM_CODEC_MEM_ERROR, + "Failed to copy source metadata to the temporal filtered frame"); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2) end_timing(cpi, apply_filtering_time); +#endif + + int set_mv_params = frame_params->frame_type == KEY_FRAME || + update_type == ARF_UPDATE || update_type == GF_UPDATE; + cm->show_frame = frame_params->show_frame; + cm->current_frame.frame_type = frame_params->frame_type; + // TODO(bohanli): Why is this? what part of it is necessary? + av1_set_frame_size(cpi, cm->width, cm->height); + if (set_mv_params) av1_set_mv_search_params(cpi); + +#if CONFIG_RD_COMMAND + if (frame_params->frame_type == KEY_FRAME) { + char filepath[] = "rd_command.txt"; + av1_read_rd_command(filepath, &cpi->rd_command); + } +#endif // CONFIG_RD_COMMAND + if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi)) { + // perform tpl after filtering + int allow_tpl = + oxcf->gf_cfg.lag_in_frames > 1 && oxcf->algo_cfg.enable_tpl_model; + if (gf_group->size > MAX_LENGTH_TPL_FRAME_STATS) { + allow_tpl = 0; + } + if (frame_params->frame_type != KEY_FRAME) { + // In rare case, it's possible to have non ARF/GF update_type here. 
+ // We should set allow_tpl to zero in the situation + allow_tpl = + allow_tpl && (update_type == ARF_UPDATE || update_type == GF_UPDATE || + (cpi->use_ducky_encode && + cpi->ducky_encode_info.frame_info.gop_mode == + DUCKY_ENCODE_GOP_MODE_RCL)); + } + + if (allow_tpl) { + if (!cpi->skip_tpl_setup_stats) { + av1_tpl_preload_rc_estimate(cpi, frame_params); + av1_tpl_setup_stats(cpi, 0, frame_params); +#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS + assert(cpi->gf_frame_index == 0); + av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data, + gf_group, cm->seq_params->bit_depth); +#endif + } + } else { + av1_init_tpl_stats(&cpi->ppi->tpl_data); + } +#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + if (cpi->oxcf.pass == AOM_RC_SECOND_PASS && + cpi->second_pass_log_stream != NULL) { + TPL_INFO *tpl_info; + AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info))); + av1_pack_tpl_info(tpl_info, gf_group, &cpi->ppi->tpl_data); + av1_write_tpl_info(tpl_info, cpi->second_pass_log_stream, + cpi->common.error); + aom_free(tpl_info); + } +#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + } + + if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + // Set frame_input source to true source for psnr calculation. + if (apply_filtering && is_psnr_calc_enabled(cpi)) { + cpi->source = av1_realloc_and_scale_if_required( + cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0, + false, true, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + cpi->unscaled_source = source_buffer; + } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2) end_timing(cpi, denoise_and_encode_time); +#endif + return AOM_CODEC_OK; +} +#endif // !CONFIG_REALTIME_ONLY + +/*!\cond */ +// Struct to keep track of relevant reference frame data. +typedef struct { + int map_idx; + int disp_order; + int pyr_level; + int used; +} RefBufMapData; +/*!\endcond */ + +// Comparison function to sort reference frames in ascending display order. +static int compare_map_idx_pair_asc(const void *a, const void *b) { + if (((RefBufMapData *)a)->disp_order == ((RefBufMapData *)b)->disp_order) { + return 0; + } else if (((const RefBufMapData *)a)->disp_order > + ((const RefBufMapData *)b)->disp_order) { + return 1; + } else { + return -1; + } +} + +// Checks to see if a particular reference frame is already in the reference +// frame map. +static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) { + for (int i = 0; i < n_frames; i++) { + if (disp_order == map[i].disp_order) return 1; + } + return 0; +} + +// Add a reference buffer index to a named reference slot. +static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx, + int frame) { + remapped_ref_idx[frame - LAST_FRAME] = ref->map_idx; + ref->used = 1; +} + +// Threshold dictating when we are allowed to start considering +// leaving lowest level frames unmapped. +#define LOW_LEVEL_FRAMES_TR 5 + +// Find which reference buffer should be left out of the named mapping. +// This is because there are 8 reference buffers and only 7 named slots. 
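+// Illustrative example (display orders are hypothetical): with the current
+// frame at display order 16 and unused buffers at display orders {2, 8, 12,
+// 18}, the buffer at display order 2 is the farthest away and is the one left
+// without a named slot, unless it is a lowest-pyramid-level frame and fewer
+// than LOW_LEVEL_FRAMES_TR such frames exist (see the threshold above).
+// Nothing needs to be left out while seven or fewer buffers are in use.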
+static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs, + int n_min_level_refs, int min_level, + int cur_frame_disp) { + int max_dist = 0; + int unmapped_idx = -1; + if (n_bufs <= ALTREF_FRAME) return; + for (int i = 0; i < n_bufs; i++) { + if (buffer_map[i].used) continue; + if (buffer_map[i].pyr_level != min_level || + n_min_level_refs >= LOW_LEVEL_FRAMES_TR) { + int dist = abs(cur_frame_disp - buffer_map[i].disp_order); + if (dist > max_dist) { + max_dist = dist; + unmapped_idx = i; + } + } + } + assert(unmapped_idx >= 0 && "Unmapped reference not found"); + buffer_map[unmapped_idx].used = 1; +} + +void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int cur_frame_disp, const AV1_COMP *cpi, int gf_index, + int is_parallel_encode, + int remapped_ref_idx[REF_FRAMES]) { + int buf_map_idx = 0; + + // Initialize reference frame mappings. + for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX; + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode && + cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) { + for (int rf = LAST_FRAME; rf < REF_FRAMES; ++rf) { + if (cpi->ppi->gf_group.ref_frame_list[gf_index][rf] != INVALID_IDX) { + remapped_ref_idx[rf - LAST_FRAME] = + cpi->ppi->gf_group.ref_frame_list[gf_index][rf]; + } + } + + int valid_rf_idx = 0; + static const int ref_frame_type_order[REF_FRAMES - LAST_FRAME] = { + GOLDEN_FRAME, ALTREF_FRAME, LAST_FRAME, BWDREF_FRAME, + ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME + }; + for (int i = 0; i < REF_FRAMES - LAST_FRAME; i++) { + int rf = ref_frame_type_order[i]; + if (remapped_ref_idx[rf - LAST_FRAME] != INVALID_IDX) { + valid_rf_idx = remapped_ref_idx[rf - LAST_FRAME]; + break; + } + } + + for (int i = 0; i < REF_FRAMES; ++i) { + if (remapped_ref_idx[i] == INVALID_IDX) { + remapped_ref_idx[i] = valid_rf_idx; + } + } + + return; + } +#endif // !CONFIG_REALTIME_ONLY + + RefBufMapData buffer_map[REF_FRAMES]; + int n_bufs = 0; + memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0])); + int min_level = MAX_ARF_LAYERS; + int max_level = 0; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + int skip_ref_unmapping = 0; + int is_one_pass_rt = is_one_pass_rt_params(cpi); + + // Go through current reference buffers and store display order, pyr level, + // and map index. + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + // Get reference frame buffer. + RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; + if (ref_pair.disp_order == -1) continue; + const int frame_order = ref_pair.disp_order; + // Avoid duplicates. + if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue; + const int reference_frame_level = ref_pair.pyr_level; + + // Keep track of the lowest and highest levels that currently exist. + if (reference_frame_level < min_level) min_level = reference_frame_level; + if (reference_frame_level > max_level) max_level = reference_frame_level; + + buffer_map[n_bufs].map_idx = map_idx; + buffer_map[n_bufs].disp_order = frame_order; + buffer_map[n_bufs].pyr_level = reference_frame_level; + buffer_map[n_bufs].used = 0; + n_bufs++; + } + + // Sort frames in ascending display order. + qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc); + + int n_min_level_refs = 0; + int closest_past_ref = -1; + int golden_idx = -1; + int altref_idx = -1; + + // Find the GOLDEN_FRAME and BWDREF_FRAME. + // Also collect various stats about the reference frames for the remaining + // mappings. 
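+  // Illustrative example (display orders are hypothetical): scanning from the
+  // highest display order downwards with the current frame at 16 and
+  // lowest-level buffers at {0, 8, 32}, the closest past buffer (8) is saved
+  // for GOLDEN_FRAME and the furthest future buffer (32) for ALTREF_FRAME; a
+  // buffer sitting exactly at display order 16 (the show_existing_frame case)
+  // is mapped straight to BWDREF_FRAME.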
+ for (int i = n_bufs - 1; i >= 0; i--) { + if (buffer_map[i].pyr_level == min_level) { + // Keep track of the number of lowest level frames. + n_min_level_refs++; + if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 && + remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) { + // Save index for GOLDEN. + golden_idx = i; + } else if (buffer_map[i].disp_order > cur_frame_disp && + altref_idx == -1 && + remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) { + // Save index for ALTREF. + altref_idx = i; + } + } else if (buffer_map[i].disp_order == cur_frame_disp) { + // Map the BWDREF_FRAME if this is the show_existing_frame. + add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME); + } + + // During parallel encodes of lower layer frames, exclude the first frame + // (frame_parallel_level 1) from being used for the reference assignment of + // the second frame (frame_parallel_level 2). + if (!is_one_pass_rt && gf_group->frame_parallel_level[gf_index] == 2 && + gf_group->frame_parallel_level[gf_index - 1] == 1 && + gf_group->update_type[gf_index - 1] == INTNL_ARF_UPDATE) { + assert(gf_group->update_type[gf_index] == INTNL_ARF_UPDATE); +#if CONFIG_FPMT_TEST + is_parallel_encode = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE) + ? is_parallel_encode + : 0; +#endif // CONFIG_FPMT_TEST + // If parallel cpis are active, use ref_idx_to_skip, else, use display + // index. + assert(IMPLIES(is_parallel_encode, cpi->ref_idx_to_skip != INVALID_IDX)); + assert(IMPLIES(!is_parallel_encode, + gf_group->skip_frame_as_ref[gf_index] != INVALID_IDX)); + buffer_map[i].used = is_parallel_encode + ? (buffer_map[i].map_idx == cpi->ref_idx_to_skip) + : (buffer_map[i].disp_order == + gf_group->skip_frame_as_ref[gf_index]); + // In case a ref frame is excluded from being used during assignment, + // skip the call to set_unmapped_ref(). Applicable in steady state. + if (buffer_map[i].used) skip_ref_unmapping = 1; + } + + // Keep track of where the frames change from being past frames to future + // frames. + if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0) + closest_past_ref = i; + } + + // Do not map GOLDEN and ALTREF based on their pyramid level if all reference + // frames have the same level. + if (n_min_level_refs <= n_bufs) { + // Map the GOLDEN_FRAME. + if (golden_idx > -1) + add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME); + // Map the ALTREF_FRAME. + if (altref_idx > -1) + add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME); + } + + // Find the buffer to be excluded from the mapping. + if (!skip_ref_unmapping) + set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level, + cur_frame_disp); + + // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME. + for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer + // in decreasing ouptut order relative to current picture. 
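+    // Illustrative example (display orders are hypothetical): with the
+    // current frame at 10 and unmapped past buffers at {3, 7, 9}, LAST_FRAME
+    // receives 9, LAST2_FRAME receives 7 and LAST3_FRAME receives 3.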
+ int next_buf_max = 0; + int next_disp_order = INT_MIN; + for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used && + buffer_map[buf_map_idx].disp_order < cur_frame_disp && + buffer_map[buf_map_idx].disp_order > next_disp_order) { + next_disp_order = buffer_map[buf_map_idx].disp_order; + next_buf_max = buf_map_idx; + } + } + buf_map_idx = next_buf_max; + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME. + for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer + // in increasing ouptut order relative to current picture. + int next_buf_max = 0; + int next_disp_order = INT_MAX; + for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used && + buffer_map[buf_map_idx].disp_order > cur_frame_disp && + buffer_map[buf_map_idx].disp_order < next_disp_order) { + next_disp_order = buffer_map[buf_map_idx].disp_order; + next_buf_max = buf_map_idx; + } + } + buf_map_idx = next_buf_max; + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Place remaining past frames. + buf_map_idx = closest_past_ref; + for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer. + for (; buf_map_idx >= 0; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used) break; + } + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Place remaining future frames. + buf_map_idx = n_bufs - 1; + for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer. + for (; buf_map_idx > closest_past_ref; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used) break; + } + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Fill any slots that are empty (should only happen for the first 7 frames). 
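+  // Illustrative example: at the start of a sequence only a few physical
+  // buffers are populated, so any named slot that is still INVALID_IDX at
+  // this point falls back to map index 0 rather than being left unmapped.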
+ for (int i = 0; i < REF_FRAMES; ++i) + if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0; +} + +int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, + uint8_t *const dest, unsigned int *frame_flags, + int64_t *const time_stamp, int64_t *const time_end, + const aom_rational64_t *const timestamp_ratio, + int *const pop_lookahead, int flush) { + AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + ExternalFlags *const ext_flags = &cpi->ext_flags; + GFConfig *const gf_cfg = &oxcf->gf_cfg; + + EncodeFrameInput frame_input; + EncodeFrameParams frame_params; + EncodeFrameResults frame_results; + memset(&frame_input, 0, sizeof(frame_input)); + memset(&frame_params, 0, sizeof(frame_params)); + memset(&frame_results, 0, sizeof(frame_results)); + +#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + VBR_RATECTRL_INFO *vbr_rc_info = &cpi->vbr_rc_info; + if (oxcf->pass == AOM_RC_THIRD_PASS && vbr_rc_info->ready == 0) { + THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF]; + av1_open_second_pass_log(cpi, 1); + FILE *second_pass_log_stream = cpi->second_pass_log_stream; + fseek(second_pass_log_stream, 0, SEEK_END); + size_t file_size = ftell(second_pass_log_stream); + rewind(second_pass_log_stream); + size_t read_size = 0; + while (read_size < file_size) { + THIRD_PASS_GOP_INFO gop_info; + struct aom_internal_error_info *error = cpi->common.error; + // Read in GOP information from the second pass file. + av1_read_second_pass_gop_info(second_pass_log_stream, &gop_info, error); + TPL_INFO *tpl_info; + AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info))); + av1_read_tpl_info(tpl_info, second_pass_log_stream, error); + // Read in per-frame info from second-pass encoding + av1_read_second_pass_per_frame_info(second_pass_log_stream, frame_info, + gop_info.num_frames, error); + av1_vbr_rc_append_tpl_info(vbr_rc_info, tpl_info); + read_size = ftell(second_pass_log_stream); + aom_free(tpl_info); + } + av1_close_second_pass_log(cpi); + if (cpi->oxcf.rc_cfg.mode == AOM_Q) { + vbr_rc_info->base_q_index = cpi->oxcf.rc_cfg.cq_level; + av1_vbr_rc_compute_q_indices( + vbr_rc_info->base_q_index, vbr_rc_info->total_frame_count, + vbr_rc_info->qstep_ratio_list, cm->seq_params->bit_depth, + vbr_rc_info->q_index_list); + } else { + vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q( + vbr_rc_info->total_bit_budget, cm->seq_params->bit_depth, + vbr_rc_info->scale_factors, vbr_rc_info->total_frame_count, + vbr_rc_info->update_type_list, vbr_rc_info->qstep_ratio_list, + vbr_rc_info->txfm_stats_list, vbr_rc_info->q_index_list, NULL); + } + vbr_rc_info->ready = 1; +#if CONFIG_RATECTRL_LOG + rc_log_record_chunk_info(&cpi->rc_log, vbr_rc_info->base_q_index, + vbr_rc_info->total_frame_count); +#endif // CONFIG_RATECTRL_LOG + } +#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + + // Check if we need to stuff more src frames + if (flush == 0) { + int srcbuf_size = + av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); + int pop_size = + av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage); + + // Continue buffering look ahead buffer. 
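+    // Illustrative example (lag value is hypothetical): with a lookahead lag
+    // of 35 frames, this path keeps returning -1 (no output packet yet) until
+    // enough source frames have been buffered for the lookahead to start
+    // popping.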
+ if (srcbuf_size < pop_size) return -1; + } + + if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) { +#if !CONFIG_REALTIME_ONLY + if (flush && oxcf->pass == AOM_RC_FIRST_PASS && + !cpi->ppi->twopass.first_pass_done) { + av1_end_first_pass(cpi); /* get last stats packet */ + cpi->ppi->twopass.first_pass_done = 1; + } +#endif + return -1; + } + + // TODO(sarahparker) finish bit allocation for one pass pyramid + if (has_no_stats_stage(cpi)) { + gf_cfg->gf_max_pyr_height = + AOMMIN(gf_cfg->gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS); + gf_cfg->gf_min_pyr_height = + AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height); + } + + // Allocation of mi buffers. + alloc_mb_mode_info_buffers(cpi); + + cpi->skip_tpl_setup_stats = 0; +#if !CONFIG_REALTIME_ONLY + if (oxcf->pass != AOM_RC_FIRST_PASS) { + TplParams *const tpl_data = &cpi->ppi->tpl_data; + if (tpl_data->tpl_stats_pool[0] == NULL) { + av1_setup_tpl_buffers(cpi->ppi, &cm->mi_params, oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, 0, + oxcf->gf_cfg.lag_in_frames); + } + } + cpi->twopass_frame.this_frame = NULL; + const int use_one_pass_rt_params = is_one_pass_rt_params(cpi); + if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_get_second_pass_params_time); +#endif + + // Initialise frame_level_rate_correction_factors with value previous + // to the parallel frames. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + for (int i = 0; i < RATE_FACTOR_LEVELS; i++) { + cpi->rc.frame_level_rate_correction_factors[i] = +#if CONFIG_FPMT_TEST + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) + ? cpi->ppi->p_rc.temp_rate_correction_factors[i] + : +#endif // CONFIG_FPMT_TEST + cpi->ppi->p_rc.rate_correction_factors[i]; + } + } + + // copy mv_stats from ppi to frame_level cpi. + cpi->mv_stats = cpi->ppi->mv_stats; + av1_get_second_pass_params(cpi, &frame_params, *frame_flags); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_get_second_pass_params_time); +#endif + } +#endif + + if (!is_stat_generation_stage(cpi)) { + // TODO(jingning): fwd key frame always uses show existing frame? + if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE && + gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { + frame_params.show_existing_frame = 1; + } else { + frame_params.show_existing_frame = + (cpi->ppi->show_existing_alt_ref && + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) || + gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE; + } + frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags); + + // Special handling to reset 'show_existing_frame' in case of dropped + // frames. + if (oxcf->rc_cfg.drop_frames_water_mark && + (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE)) { + // During the encode of an OVERLAY_UPDATE/INTNL_OVERLAY_UPDATE frame, loop + // over the gf group to check if the corresponding + // ARF_UPDATE/INTNL_ARF_UPDATE frame was dropped. 
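+      // Illustrative example (indices are hypothetical): if the ARF at gf
+      // index 3 shares display index 7 with this overlay and was dropped by
+      // the rate controller, the overlay can no longer point at the missing
+      // filtered frame, so show_existing_frame is cleared and the overlay is
+      // flagged as dropped as well (cpi->is_dropped_frame).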
+ int cur_disp_idx = gf_group->display_idx[cpi->gf_frame_index]; + for (int idx = 0; idx < cpi->gf_frame_index; idx++) { + if (cur_disp_idx == gf_group->display_idx[idx]) { + assert(IMPLIES( + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE, + gf_group->update_type[idx] == ARF_UPDATE)); + assert(IMPLIES(gf_group->update_type[cpi->gf_frame_index] == + INTNL_OVERLAY_UPDATE, + gf_group->update_type[idx] == INTNL_ARF_UPDATE)); + // Reset show_existing_frame and set cpi->is_dropped_frame to true if + // the frame was dropped during its first encode. + if (gf_group->is_frame_dropped[idx]) { + frame_params.show_existing_frame = 0; + assert(!cpi->is_dropped_frame); + cpi->is_dropped_frame = true; + } + break; + } + } + } + + // Reset show_existing_alt_ref decision to 0 after it is used. + if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) { + cpi->ppi->show_existing_alt_ref = 0; + } + } else { + frame_params.show_existing_frame = 0; + } + + struct lookahead_entry *source = NULL; + struct lookahead_entry *last_source = NULL; + if (frame_params.show_existing_frame) { + source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage); + *pop_lookahead = 1; + frame_params.show_frame = 1; + } else { + source = choose_frame_source(cpi, &flush, pop_lookahead, &last_source, + &frame_params.show_frame); + } + + if (source == NULL) { // If no source was found, we can't encode a frame. +#if !CONFIG_REALTIME_ONLY + if (flush && oxcf->pass == AOM_RC_FIRST_PASS && + !cpi->ppi->twopass.first_pass_done) { + av1_end_first_pass(cpi); /* get last stats packet */ + cpi->ppi->twopass.first_pass_done = 1; + } +#endif + return -1; + } + + // reset src_offset to allow actual encode call for this frame to get its + // source. + gf_group->src_offset[cpi->gf_frame_index] = 0; + + // Source may be changed if temporal filtered later. + frame_input.source = &source->img; + if ((cpi->ppi->use_svc || cpi->rc.prev_frame_is_dropped) && + last_source != NULL) + av1_svc_set_last_source(cpi, &frame_input, &last_source->img); + else + frame_input.last_source = last_source != NULL ? &last_source->img : NULL; + frame_input.ts_duration = source->ts_end - source->ts_start; + // Save unfiltered source. It is used in av1_get_second_pass_params(). + cpi->unfiltered_source = frame_input.source; + + *time_stamp = source->ts_start; + *time_end = source->ts_end; + if (source->ts_start < cpi->time_stamps.first_ts_start) { + cpi->time_stamps.first_ts_start = source->ts_start; + cpi->time_stamps.prev_ts_end = source->ts_start; + } + + av1_apply_encoding_flags(cpi, source->flags); + *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? 
FRAMEFLAGS_KEY : 0; + +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + cpi->framerate = cpi->temp_framerate; + } + } +#endif // CONFIG_FPMT_TEST + + // Shown frames and arf-overlay frames need frame-rate considering + if (frame_params.show_frame) + adjust_frame_rate(cpi, source->ts_start, source->ts_end); + + if (!frame_params.show_existing_frame) { + if (cpi->film_grain_table) { + cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup( + cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */, + &cm->film_grain_params); + } else { + cm->cur_frame->film_grain_params_present = + cm->seq_params->film_grain_params_present; + } + // only one operating point supported now + const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp); + if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR; + + cm->frame_presentation_time = (uint32_t)pts64; + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_get_one_pass_rt_params_time); +#endif +#if CONFIG_REALTIME_ONLY + av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input, + *frame_flags); + if (use_rtc_reference_structure_one_layer(cpi)) + av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0); +#else + if (use_one_pass_rt_params) { + av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input, + *frame_flags); + if (use_rtc_reference_structure_one_layer(cpi)) + av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0); + } +#endif +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_get_one_pass_rt_params_time); +#endif + + FRAME_UPDATE_TYPE frame_update_type = + get_frame_update_type(gf_group, cpi->gf_frame_index); + + if (frame_params.show_existing_frame && + frame_params.frame_type != KEY_FRAME) { + // Force show-existing frames to be INTER, except forward keyframes + frame_params.frame_type = INTER_FRAME; + } + + // Per-frame encode speed. In theory this can vary, but things may have + // been written assuming speed-level will not change within a sequence, so + // this parameter should be used with caution. + frame_params.speed = oxcf->speed; + +#if !CONFIG_REALTIME_ONLY + // Set forced key frames when necessary. For two-pass encoding / lap mode, + // this is already handled by av1_get_second_pass_params. However when no + // stats are available, we still need to check if the new frame is a keyframe. + // For one pass rt, this is already checked in av1_get_one_pass_rt_params. + if (!use_one_pass_rt_params && + (is_stat_generation_stage(cpi) || has_no_stats_stage(cpi))) { + // Current frame is coded as a key-frame for any of the following cases: + // 1) First frame of a video + // 2) For all-intra frame encoding + // 3) When a key-frame is forced + const int kf_requested = + (cm->current_frame.frame_number == 0 || + oxcf->kf_cfg.key_freq_max == 0 || (*frame_flags & FRAMEFLAGS_KEY)); + if (kf_requested && frame_update_type != OVERLAY_UPDATE && + frame_update_type != INTNL_OVERLAY_UPDATE) { + frame_params.frame_type = KEY_FRAME; + } else if (is_stat_generation_stage(cpi)) { + // For stats generation, set the frame type to inter here. 
+ frame_params.frame_type = INTER_FRAME; + } + } +#endif + + // Work out some encoding parameters specific to the pass: + if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_update_parameters(cpi); + } else if (is_stat_generation_stage(cpi)) { + cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg); + } else if (is_stat_consumption_stage(cpi)) { +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif +#if TXCOEFF_COST_TIMER + cm->txcoeff_cost_timer = 0; + cm->txcoeff_cost_count = 0; +#endif + } + + if (!is_stat_generation_stage(cpi)) + set_ext_overrides(cm, &frame_params, ext_flags); + + // Shown keyframes and S frames refresh all reference buffers + const int force_refresh_all = + ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) || + frame_params.frame_type == S_FRAME) && + !frame_params.show_existing_frame; + + av1_configure_buffer_updates( + cpi, &frame_params.refresh_frame, frame_update_type, + gf_group->refbuf_state[cpi->gf_frame_index], force_refresh_all); + + if (!is_stat_generation_stage(cpi)) { + const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME]; + + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(cpi, ref_frame_map_pairs); + const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; + const int cur_frame_disp = + cpi->common.current_frame.frame_number + order_offset; + + int get_ref_frames = 0; +#if CONFIG_FPMT_TEST + get_ref_frames = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0; +#endif // CONFIG_FPMT_TEST + if (get_ref_frames || + gf_group->frame_parallel_level[cpi->gf_frame_index] == 0) { + if (!ext_flags->refresh_frame.update_pending) { + av1_get_ref_frames(ref_frame_map_pairs, cur_frame_disp, cpi, + cpi->gf_frame_index, 1, cm->remapped_ref_idx); + } else if (cpi->ppi->rtc_ref.set_ref_frame_config || + use_rtc_reference_structure_one_layer(cpi)) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) + cm->remapped_ref_idx[i] = cpi->ppi->rtc_ref.ref_idx[i]; + } + } + + // Get the reference frames + bool has_ref_frames = false; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const RefCntBuffer *ref_frame = + get_ref_frame_buf(cm, ref_frame_priority_order[i]); + ref_frame_buf[i] = ref_frame != NULL ? &ref_frame->buf : NULL; + if (ref_frame != NULL) has_ref_frames = true; + } + if (!has_ref_frames && (frame_params.frame_type == INTER_FRAME || + frame_params.frame_type == S_FRAME)) { + return AOM_CODEC_ERROR; + } + + // Work out which reference frame slots may be used. + frame_params.ref_frame_flags = + get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), ref_frame_buf, + ext_flags->ref_frame_flags); + + // Set primary_ref_frame of non-reference frames as PRIMARY_REF_NONE. + if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) { + frame_params.primary_ref_frame = PRIMARY_REF_NONE; + } else { + frame_params.primary_ref_frame = + choose_primary_ref_frame(cpi, &frame_params); + } + + frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; + + // Call av1_get_refresh_frame_flags() if refresh index not available. 
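+    // Illustrative example (slot number is hypothetical): when the refresh
+    // slot was already chosen up front (refresh_idx_available), say slot 4,
+    // the flags reduce to 1 << 4; otherwise the slot-selection logic in
+    // av1_get_refresh_frame_flags() decides which buffer, if any, to
+    // overwrite.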
+ if (!cpi->refresh_idx_available) { + frame_params.refresh_frame_flags = av1_get_refresh_frame_flags( + cpi, &frame_params, frame_update_type, cpi->gf_frame_index, + cur_frame_disp, ref_frame_map_pairs); + } else { + assert(cpi->ref_refresh_index != INVALID_IDX); + frame_params.refresh_frame_flags = (1 << cpi->ref_refresh_index); + } + + // Make the frames marked as is_frame_non_ref to non-reference frames. + if (gf_group->is_frame_non_ref[cpi->gf_frame_index]) + frame_params.refresh_frame_flags = 0; + + frame_params.existing_fb_idx_to_show = INVALID_IDX; + // Find the frame buffer to show based on display order. + if (frame_params.show_existing_frame) { + for (int frame = 0; frame < REF_FRAMES; frame++) { + const RefCntBuffer *const buf = cm->ref_frame_map[frame]; + if (buf == NULL) continue; + const int frame_order = (int)buf->display_order_hint; + if (frame_order == cur_frame_disp) + frame_params.existing_fb_idx_to_show = frame; + } + } + } + + // The way frame_params->remapped_ref_idx is setup is a placeholder. + // Currently, reference buffer assignment is done by update_ref_frame_map() + // which is called by high-level strategy AFTER encoding a frame. It + // modifies cm->remapped_ref_idx. If you want to use an alternative method + // to determine reference buffer assignment, just put your assignments into + // frame_params->remapped_ref_idx here and they will be used when encoding + // this frame. If frame_params->remapped_ref_idx is setup independently of + // cm->remapped_ref_idx then update_ref_frame_map() will have no effect. + memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx, + REF_FRAMES * sizeof(*cm->remapped_ref_idx)); + + cpi->td.mb.rdmult_delta_qindex = cpi->td.mb.delta_qindex = 0; + + if (!frame_params.show_existing_frame) { + cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm; + } + + const int is_intra_frame = frame_params.frame_type == KEY_FRAME || + frame_params.frame_type == INTRA_ONLY_FRAME; + FeatureFlags *const features = &cm->features; + if (!is_stat_generation_stage(cpi) && + (oxcf->pass == AOM_RC_ONE_PASS || oxcf->pass >= AOM_RC_SECOND_PASS) && + is_intra_frame) { + av1_set_screen_content_options(cpi, features); + } + +#if CONFIG_REALTIME_ONLY + if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#else + if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME && + gf_cfg->lag_in_frames == 0) { + if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + } else if (denoise_and_encode(cpi, dest, &frame_input, &frame_params, + &frame_results) != AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#endif // CONFIG_REALTIME_ONLY + + // This is used in rtc temporal filter case. Use true source in the PSNR + // calculation. 
+ if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf && + cpi->common.current_frame.frame_type != KEY_FRAME) { + assert(cpi->orig_source.buffer_alloc_sz > 0); + cpi->source = &cpi->orig_source; + } + + if (!is_stat_generation_stage(cpi)) { + // First pass doesn't modify reference buffer assignment or produce frame + // flags + update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags); + set_additional_frame_flags(cm, frame_flags); + } + +#if !CONFIG_REALTIME_ONLY +#if TXCOEFF_COST_TIMER + if (!is_stat_generation_stage(cpi)) { + cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer; + fprintf(stderr, + "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld " + "in us\n", + cm->txcoeff_cost_count, cm->txcoeff_cost_timer, + cm->cum_txcoeff_cost_timer); + } +#endif +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_TUNE_VMAF + if (!is_stat_generation_stage(cpi) && + (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) { + av1_update_vmaf_curve(cpi); + } +#endif + + // Unpack frame_results: + *size = frame_results.size; + + // Leave a signal for a higher level caller about if this frame is droppable + if (*size > 0) { + cpi->droppable = + is_frame_droppable(&cpi->ppi->rtc_ref, &ext_flags->refresh_frame); + } + + // For SVC, or when frame-dropper is enabled: + // keep track of the (unscaled) source corresponding to the refresh of LAST + // reference (base temporal layer - TL0). Copy only for the + // top spatial enhancement layer so all spatial layers of the next + // superframe have last_source to be aligned with previous TL0 superframe. + // Avoid cases where resolution changes for unscaled source (top spatial + // layer). Only needs to be done for frame that are encoded (size > 0). + if (*size > 0 && + (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->svc.temporal_layer_id == 0 && + cpi->unscaled_source->y_width == cpi->svc.source_last_TL0.y_width && + cpi->unscaled_source->y_height == cpi->svc.source_last_TL0.y_height) { + aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0); + aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0); + aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0); + } + + return AOM_CODEC_OK; +} diff --git a/third_party/aom/av1/encoder/encode_strategy.h b/third_party/aom/av1/encoder/encode_strategy.h new file mode 100644 index 0000000000..c1d14d134c --- /dev/null +++ b/third_party/aom/av1/encoder/encode_strategy.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Declares frame encoding functions. 
+ */
+#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+/*!\brief Implement high-level encode strategy
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function will implement high-level encode strategy, choosing frame type,
+ * frame placement, etc. It populates an EncodeFrameParams struct with the
+ * results of these decisions and then encodes the frame. The caller should use
+ * the output parameters *time_stamp and *time_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in]  cpi              Top-level encoder structure
+ * \param[in]  size             Bitstream size
+ * \param[in]  dest             Bitstream output
+ * \param[in]  frame_flags      Flags to decide how to encode the frame
+ * \param[out] time_stamp       Time stamp of the frame
+ * \param[out] time_end         Time end
+ * \param[in]  timestamp_ratio  Time base
+ * \param[in]  pop_lookahead    Decide to pop the source frame from queue
+ * \param[in]  flush            Decide to encode one frame or the rest of frames
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+                        uint8_t *const dest, unsigned int *frame_flags,
+                        int64_t *const time_stamp, int64_t *const time_end,
+                        const aom_rational64_t *const timestamp_ratio,
+                        int *const pop_lookahead, int flush);
+
+/*!\cond */
+// Set individual buffer update flags based on frame reference type.
+// force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all
+// refresh_*_frame flags to be set, because we refresh all buffers in this case.
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  RefreshFrameInfo *const refresh_frame,
+                                  const FRAME_UPDATE_TYPE type,
+                                  const REFBUF_STATE refbuf_state,
+                                  int force_refresh_all);
+
+int av1_get_refresh_frame_flags(
+    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+    FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+    RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]);
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
+
+/*!\brief Obtain indices of reference frames in ref_frame_map
+ *
+ * \callgraph
+ * \callergraph
+ *
+ * \param[out] remapped_ref_idx  An array for storing indices of reference
+ *                               frames. The index is used to retrieve a
+ *                               reference frame buffer from ref_frame_map
+ *                               in AV1Common.
+ */
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+                        int cur_frame_disp, const AV1_COMP *cpi, int gf_index,
+                        int is_parallel_encode,
+                        int remapped_ref_idx[REF_FRAMES]);
+
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                               const int up_to_index,
+                               const COMPRESSOR_STAGE compressor_stage);
+
+static AOM_INLINE int is_frame_droppable(
+    const RTC_REF *const rtc_ref,
+    const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
+  // Droppable frame is only used by external refresh flags. VoD setting won't
+  // trigger its use case.
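+  // Illustrative example (flag values are hypothetical): with update_pending
+  // set and only the last_frame refresh flag raised, the frame still updates
+  // a reference buffer and is therefore not droppable; only when every
+  // refresh flag is clear may the caller drop it.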
+ if (rtc_ref->set_ref_frame_config) + return rtc_ref->non_reference_frame; + else if (ext_refresh_frame_flags->update_pending) + return !(ext_refresh_frame_flags->alt_ref_frame || + ext_refresh_frame_flags->alt2_ref_frame || + ext_refresh_frame_flags->bwd_ref_frame || + ext_refresh_frame_flags->golden_frame || + ext_refresh_frame_flags->last_frame); + else + return 0; +} + +static AOM_INLINE int get_current_frame_ref_type(const AV1_COMP *const cpi) { + // We choose the reference "type" of this frame from the flags which indicate + // which reference frames will be refreshed by it. More than one of these + // flags may be set, so the order here implies an order of precedence. This is + // just used to choose the primary_ref_frame (as the most recent reference + // buffer of the same reference-type as the current frame). + + switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) { + case 0: return 0; + case 1: return 1; + case MAX_ARF_LAYERS: + case MAX_ARF_LAYERS + 1: return 4; + default: return 7; + } +} + +int av1_calc_refresh_idx_for_intnl_arf( + AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int gf_index); +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_ diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c new file mode 100644 index 0000000000..e2213a8355 --- /dev/null +++ b/third_party/aom/av1/encoder/encodeframe.c @@ -0,0 +1,2408 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_ports/mem.h" +#include "aom_ports/aom_timer.h" + +#if CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_MISMATCH_DEBUG + +#include "av1/common/cfl.h" +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/idct.h" +#include "av1/common/mv.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconintra.h" +#include "av1/common/reconinter.h" +#include "av1/common/seg_common.h" +#include "av1/common/tile_common.h" +#include "av1/common/warped_motion.h" + +#include "av1/encoder/allintra_vis.h" +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/global_motion_facade.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/ml.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/partition_strategy.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/partition_model_weights.h" +#endif +#include "av1/encoder/partition_search.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/tokenize.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/var_based_part.h" + +#if CONFIG_TUNE_VMAF +#include "av1/encoder/tune_vmaf.h" +#endif + +/*!\cond */ +// This is used as a reference when computing the source variance for the +// purposes of activity masking. +// Eventually this should be replaced by custom no-reference routines, +// which will be faster. 
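+// The tables below are flat blocks of the mid-grey value (128, scaled up for
+// 10- and 12-bit content). Using such a constant block as the "reference" in
+// the standard variance kernels makes the offset cancel out, e.g. for an
+// N-pixel block
+//   var = sum((src - 128)^2) - (sum(src - 128))^2 / N
+// equals the source block's own (unnormalized) variance regardless of the
+// constant, so the existing kernels double as no-reference variance
+// estimators.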
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +#if CONFIG_AV1_HIGHBITDEPTH +static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 
* 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16 +}; +#endif // CONFIG_AV1_HIGHBITDEPTH +/*!\endcond */ + +// For the given bit depth, returns a constant array used to assist the +// calculation of source block variance, which will then be used to decide +// adaptive quantizers. +static const uint8_t *get_var_offs(int use_hbd, int bd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + assert(bd == 8 || bd == 10 || bd == 12); + const int off_index = (bd - 8) >> 1; + static const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8, + AV1_HIGH_VAR_OFFS_10, + AV1_HIGH_VAR_OFFS_12 }; + return CONVERT_TO_BYTEPTR(high_var_offs[off_index]); + } +#else + (void)use_hbd; + (void)bd; + assert(!use_hbd); +#endif + assert(bd == 8); + return AV1_VAR_OFFS; +} + +void av1_init_rtc_counters(MACROBLOCK *const x) { + av1_init_cyclic_refresh_counters(x); + x->cnt_zeromv = 0; +} + +void av1_accumulate_rtc_counters(AV1_COMP *cpi, const MACROBLOCK *const x) { + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) + av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh, x); + cpi->rc.cnt_zeromv += x->cnt_zeromv; +} + +unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi, + const MACROBLOCKD *xd, + const struct buf_2d *ref, + BLOCK_SIZE bsize, int plane, + int use_hbd) { + const int subsampling_x = xd->plane[plane].subsampling_x; + const int subsampling_y = xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + unsigned int sse; + const unsigned int var = cpi->ppi->fn_ptr[plane_bsize].vf( + ref->buf, ref->stride, get_var_offs(use_hbd, xd->bd), 0, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[plane_bsize]); +} + +unsigned int av1_get_perpixel_variance_facade(const AV1_COMP *cpi, + const MACROBLOCKD *xd, + const struct buf_2d *ref, + BLOCK_SIZE bsize, int plane) { + const int use_hbd = is_cur_buf_hbd(xd); + return av1_get_perpixel_variance(cpi, xd, ref, bsize, plane, use_hbd); +} + +void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, const int num_planes, + BLOCK_SIZE bsize) { + // Set current frame pointer. + x->e_mbd.cur_buf = src; + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) { + const int is_uv = i > 0; + setup_pred_plane( + &x->plane[i].src, bsize, src->buffers[i], src->crop_widths[is_uv], + src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, NULL, + x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); + } +} + +#if !CONFIG_REALTIME_ONLY +/*!\brief Assigns different quantization parameters to each super + * block based on its TPL weight. + * + * \ingroup tpl_modelling + * + * \param[in] cpi Top level encoder instance structure + * \param[in,out] td Thread data structure + * \param[in,out] x Macro block level data for this block. 
+ * \param[in] tile_info Tile infromation / identification + * \param[in] mi_row Block row (in "MI_SIZE" units) index + * \param[in] mi_col Block column (in "MI_SIZE" units) index + * \param[out] num_planes Number of image planes (e.g. Y,U,V) + * + * \remark No return value but updates macroblock and thread data + * related to the q / q delta to be used. + */ +static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, + MACROBLOCK *const x, + const TileInfo *const tile_info, + int mi_row, int mi_col, int num_planes) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + assert(delta_q_info->delta_q_present_flag); + + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + // Delta-q modulation based on variance + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); + + const int delta_q_res = delta_q_info->delta_q_res; + int current_qindex = cm->quant_params.base_qindex; + if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.qp_mode == + DUCKY_ENCODE_FRAME_MODE_QINDEX) { + const int sb_row = mi_row >> cm->seq_params->mib_size_log2; + const int sb_col = mi_col >> cm->seq_params->mib_size_log2; + const int sb_cols = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); + const int sb_index = sb_row * sb_cols + sb_col; + current_qindex = + cpi->ducky_encode_info.frame_info.superblock_encode_qindex[sb_index]; + } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) { + if (DELTA_Q_PERCEPTUAL_MODULATION == 1) { + const int block_wavelet_energy_level = + av1_block_wavelet_energy_level(cpi, x, sb_size); + x->sb_energy_level = block_wavelet_energy_level; + current_qindex = av1_compute_q_from_energy_level_deltaq_mode( + cpi, block_wavelet_energy_level); + } else { + const int block_var_level = av1_log_block_var(cpi, x, sb_size); + x->sb_energy_level = block_var_level; + current_qindex = + av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level); + } + } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_OBJECTIVE && + cpi->oxcf.algo_cfg.enable_tpl_model) { + // Setup deltaq based on tpl stats + current_qindex = + av1_get_q_for_deltaq_objective(cpi, td, NULL, sb_size, mi_row, mi_col); + } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI) { + current_qindex = av1_get_sbq_perceptual_ai(cpi, sb_size, mi_row, mi_col); + } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) { + current_qindex = av1_get_sbq_user_rating_based(cpi, mi_row, mi_col); + } else if (cpi->oxcf.q_cfg.enable_hdr_deltaq) { + current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col); + } + + x->rdmult_cur_qindex = current_qindex; + MACROBLOCKD *const xd = &x->e_mbd; + const int adjusted_qindex = av1_adjust_q_from_delta_q_res( + delta_q_res, xd->current_base_qindex, current_qindex); + if (cpi->use_ducky_encode) { + assert(adjusted_qindex == current_qindex); + } + current_qindex = adjusted_qindex; + + x->delta_qindex = current_qindex - cm->quant_params.base_qindex; + x->rdmult_delta_qindex = x->delta_qindex; + + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + xd->mi[0]->current_qindex = current_qindex; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0); + + // keep track of any non-zero delta-q used + td->deltaq_used |= (x->delta_qindex != 0); + + if (cpi->oxcf.tool_cfg.enable_deltalf_mode) { + const int delta_lf_res = delta_q_info->delta_lf_res; + const int lfmask = ~(delta_lf_res - 
1); + const int delta_lf_from_base = + ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask); + const int8_t delta_lf = + (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + const int mib_size = cm->seq_params->mib_size; + + // pre-set the delta lf for loop filter. Note that this value is set + // before mi is assigned for each block in current superblock + for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) { + for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k); + mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf; + } + } + } + } +} + +static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, + int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCK *x = &td->mb; + const int frame_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + + av1_zero(x->tpl_keep_ref_frame); + + if (!av1_tpl_stats_ready(tpl_data, frame_idx)) return; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; + if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; + + const int is_overlay = + cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE; + if (is_overlay) { + memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame)); + return; + } + + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 }; + const int step = 1 << block_mis_log2; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + + const int mi_row_end = + AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + AOMMIN(coded_to_superres_mi(mi_col + mi_size_wide[sb_size], + cm->superres_scale_denominator), + mi_cols_sr); + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + for (int row = mi_row; row < mi_row_end; row += row_step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { + const TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 }; + // Find the winner ref frame idx for the current block + int64_t best_inter_cost = this_stats->pred_error[0]; + int best_rf_idx = 0; + for (int idx = 1; idx < INTER_REFS_PER_FRAME; ++idx) { + if ((this_stats->pred_error[idx] < best_inter_cost) && + (this_stats->pred_error[idx] != 0)) { + best_inter_cost = this_stats->pred_error[idx]; + best_rf_idx = idx; + } + } + // tpl_pred_error is the pred_error reduction of best_ref w.r.t. + // LAST_FRAME. 
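+      // Worked example (costs are hypothetical): if pred_error for LAST_FRAME
+      // is 1000 and the best reference's pred_error is 400, the entry becomes
+      // 400 - 1000 = -600; accumulating these per-block deltas into
+      // inter_cost[] below ranks how much each reference helps over
+      // LAST_FRAME.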
+ tpl_pred_error[best_rf_idx] = this_stats->pred_error[best_rf_idx] - + this_stats->pred_error[LAST_FRAME - 1]; + + for (int rf_idx = 1; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) + inter_cost[rf_idx] += tpl_pred_error[rf_idx]; + } + } + + int rank_index[INTER_REFS_PER_FRAME - 1]; + for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { + rank_index[idx] = idx + 1; + for (int i = idx; i > 0; --i) { + if (inter_cost[rank_index[i - 1]] > inter_cost[rank_index[i]]) { + const int tmp = rank_index[i - 1]; + rank_index[i - 1] = rank_index[i]; + rank_index[i] = tmp; + } + } + } + + x->tpl_keep_ref_frame[INTRA_FRAME] = 1; + x->tpl_keep_ref_frame[LAST_FRAME] = 1; + + int cutoff_ref = 0; + for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { + x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 1; + if (idx > 2) { + if (!cutoff_ref) { + // If the predictive coding gains are smaller than the previous more + // relevant frame over certain amount, discard this frame and all the + // frames afterwards. + if (llabs(inter_cost[rank_index[idx]]) < + llabs(inter_cost[rank_index[idx - 1]]) / 8 || + inter_cost[rank_index[idx]] == 0) + cutoff_ref = 1; + } + + if (cutoff_ref) x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 0; + } + } +} + +static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col) { + const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; + const int orig_rdmult = cpi->rd.RDMULT; + + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int gf_group_index = cpi->gf_frame_index; + if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ && + cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 && + cpi->ppi->gf_group.update_type[gf_group_index] == ARF_UPDATE) { + const int dr = + av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult); + x->rdmult = dr; + } +} +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_RT_ML_PARTITIONING +// Get a prediction(stored in x->est_pred) for the whole superblock. 
+static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + MACROBLOCKD *xd = &x->e_mbd; + + // TODO(kyslov) Extend to 128x128 + assert(cm->seq_params->sb_size == BLOCK_64X64); + + av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + + if (!is_key_frame) { + MB_MODE_INFO *mi = xd->mi[0]; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + + assert(yv12 != NULL); + + av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + get_ref_scale_factors(cm, LAST_FRAME), 1); + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NONE; + mi->bsize = BLOCK_64X64; + mi->mv[0].as_int = 0; + mi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + + xd->plane[0].dst.buf = x->est_pred; + xd->plane[0].dst.stride = 64; + av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); + } else { +#if CONFIG_AV1_HIGHBITDEPTH + switch (xd->bd) { + case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break; + case 10: + memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0])); + break; + case 12: + memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0])); + break; + } +#else + memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +} +#endif // CONFIG_RT_ML_PARTITIONING + +#define AVG_CDF_WEIGHT_LEFT 3 +#define AVG_CDF_WEIGHT_TOP_RIGHT 1 + +/*!\brief Encode a superblock (minimal RD search involved) + * + * \ingroup partition_search + * Encodes the superblock by a pre-determined partition pattern, only minor + * rd-based searches are allowed to adjust the initial pattern. It is only used + * by realtime encoding. + */ +static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + const int mi_row, const int mi_col, + const int seg_skip) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + const SPEED_FEATURES *const sf = &cpi->sf; + const TileInfo *const tile_info = &tile_data->tile_info; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + PC_TREE *const pc_root = td->pc_root; + +#if CONFIG_RT_ML_PARTITIONING + if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) { + RD_STATS dummy_rdc; + get_estimated_pred(cpi, tile_info, x, mi_row, mi_col); + av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, pc_root); + return; + } +#endif + // Set the partition + if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || + (sf->rt_sf.use_fast_fixed_part && + x->content_state_sb.source_sad_nonrd < kMedSad)) { + // set a fixed-size partition + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size; + if (sf->rt_sf.use_fast_fixed_part && + x->content_state_sb.source_sad_nonrd < kLowSad) { + bsize_select = BLOCK_64X64; + } + const BLOCK_SIZE bsize = seg_skip ? 
sb_size : bsize_select; + av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { + // set a variance-based partition + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); + } + assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || + sf->part_sf.partition_search_type == VAR_BASED_PARTITION); + set_cb_offsets(td->mb.cb_offset, 0, 0); + + // Initialize the flag to skip cdef to 1. + if (sf->rt_sf.skip_cdef_sb) { + const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1; + // If 128x128 block is used, we need to set the flag for all 4 64x64 sub + // "blocks". + for (int r = 0; r < block64_in_sb; ++r) { + for (int c = 0; c < block64_in_sb; ++c) { + const int idx_in_sb = + r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64; + if (mi[idx_in_sb]) mi[idx_in_sb]->cdef_strength = 1; + } + } + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, nonrd_use_partition_time); +#endif + av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + pc_root); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, nonrd_use_partition_time); +#endif +} + +// This function initializes the stats for encode_rd_sb. +static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td, + const TileDataEnc *tile_data, + SIMPLE_MOTION_DATA_TREE *sms_root, + RD_STATS *rd_cost, int mi_row, int mi_col, + int gather_tpl_data) { + const AV1_COMMON *cm = &cpi->common; + const TileInfo *tile_info = &tile_data->tile_info; + MACROBLOCK *x = &td->mb; + + const SPEED_FEATURES *sf = &cpi->sf; + const int use_simple_motion_search = + (sf->part_sf.simple_motion_search_split || + sf->part_sf.simple_motion_search_prune_rect || + sf->part_sf.simple_motion_search_early_term_none || + sf->part_sf.ml_early_term_after_part_split_level) && + !frame_is_intra_only(cm); + if (use_simple_motion_search) { + av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_root, + mi_row, mi_col); + } + +#if !CONFIG_REALTIME_ONLY + if (!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && + cpi->oxcf.gf_cfg.lag_in_frames == 0)) { + init_ref_frame_space(cpi, td, mi_row, mi_col); + x->sb_energy_level = 0; + x->part_search_info.cnn_output_valid = 0; + if (gather_tpl_data) { + if (cm->delta_q_info.delta_q_present_flag) { + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes); + av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col); + } + + // TODO(jingning): revisit this function. 
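+      // The "&& (0)" in the condition below keeps this call compiled but
+      // permanently disabled, so adjust_rdmult_tpl_model() is never reached
+      // from here.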
+ if (cpi->oxcf.algo_cfg.enable_tpl_model && (0)) { + adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col); + } + } + } +#else + (void)tile_info; + (void)mi_row; + (void)mi_col; + (void)gather_tpl_data; +#endif + + x->reuse_inter_pred = false; + x->txfm_search_params.mode_eval_type = DEFAULT_EVAL; + reset_mb_rd_record(x->txfm_search_info.mb_rd_record); + av1_zero(x->picked_ref_frames_mask); + av1_invalid_rd_stats(rd_cost); +} + +#if !CONFIG_REALTIME_ONLY +static void sb_qp_sweep_init_quantizers(AV1_COMP *cpi, ThreadData *td, + const TileDataEnc *tile_data, + SIMPLE_MOTION_DATA_TREE *sms_tree, + RD_STATS *rd_cost, int mi_row, + int mi_col, int delta_qp_ofs) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const TileInfo *tile_info = &tile_data->tile_info; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + assert(delta_q_info->delta_q_present_flag); + const int delta_q_res = delta_q_info->delta_q_res; + + const SPEED_FEATURES *sf = &cpi->sf; + const int use_simple_motion_search = + (sf->part_sf.simple_motion_search_split || + sf->part_sf.simple_motion_search_prune_rect || + sf->part_sf.simple_motion_search_early_term_none || + sf->part_sf.ml_early_term_after_part_split_level) && + !frame_is_intra_only(cm); + if (use_simple_motion_search) { + av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_tree, + mi_row, mi_col); + } + + int current_qindex = x->rdmult_cur_qindex + delta_qp_ofs; + + MACROBLOCKD *const xd = &x->e_mbd; + current_qindex = av1_adjust_q_from_delta_q_res( + delta_q_res, xd->current_base_qindex, current_qindex); + + x->delta_qindex = current_qindex - cm->quant_params.base_qindex; + + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + xd->mi[0]->current_qindex = current_qindex; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0); + + // keep track of any non-zero delta-q used + td->deltaq_used |= (x->delta_qindex != 0); + + if (cpi->oxcf.tool_cfg.enable_deltalf_mode) { + const int delta_lf_res = delta_q_info->delta_lf_res; + const int lfmask = ~(delta_lf_res - 1); + const int delta_lf_from_base = + ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask); + const int8_t delta_lf = + (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + const int mib_size = cm->seq_params->mib_size; + + // pre-set the delta lf for loop filter. 
Note that this value is set + // before mi is assigned for each block in current superblock + for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) { + for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k); + mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf; + } + } + } + } + + x->reuse_inter_pred = false; + x->txfm_search_params.mode_eval_type = DEFAULT_EVAL; + reset_mb_rd_record(x->txfm_search_info.mb_rd_record); + av1_zero(x->picked_ref_frames_mask); + av1_invalid_rd_stats(rd_cost); +} + +static int sb_qp_sweep(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, + SIMPLE_MOTION_DATA_TREE *sms_tree, + SB_FIRST_PASS_STATS *sb_org_stats) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + RD_STATS rdc_winner, cur_rdc; + av1_invalid_rd_stats(&rdc_winner); + + int best_qindex = td->mb.rdmult_delta_qindex; + const int start = cm->current_frame.frame_type == KEY_FRAME ? -20 : -12; + const int end = cm->current_frame.frame_type == KEY_FRAME ? 20 : 12; + const int step = cm->delta_q_info.delta_q_res; + + for (int sweep_qp_delta = start; sweep_qp_delta <= end; + sweep_qp_delta += step) { + sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_tree, &cur_rdc, mi_row, + mi_col, sweep_qp_delta); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + const int backup_current_qindex = + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; + + av1_reset_mbmi(&cm->mi_params, bsize, mi_row, mi_col); + av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col); + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex; + + td->pc_root = av1_alloc_pc_tree_node(bsize); + if (!td->pc_root) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, + &cur_rdc, cur_rdc, td->pc_root, sms_tree, NULL, + SB_DRY_PASS, NULL); + + if ((rdc_winner.rdcost > cur_rdc.rdcost) || + (abs(sweep_qp_delta) < abs(best_qindex - x->rdmult_delta_qindex) && + rdc_winner.rdcost == cur_rdc.rdcost)) { + rdc_winner = cur_rdc; + best_qindex = x->rdmult_delta_qindex + sweep_qp_delta; + } + } + + return best_qindex; +} +#endif //! CONFIG_REALTIME_ONLY + +/*!\brief Encode a superblock (RD-search-based) + * + * \ingroup partition_search + * Conducts partition search for a superblock, based on rate-distortion costs, + * from scratch or adjusting from a pre-calculated partition pattern. 
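+ * Depending on the partition search type, the superblock is either encoded
+ * from a variance-based or fixed partitioning that is then refined, or it
+ * goes through the full recursive RD partition search, optionally preceded
+ * by a per-superblock delta-q sweep and optionally run in two passes.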
+ */ +static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + const int mi_row, const int mi_col, + const int seg_skip) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const SPEED_FEATURES *const sf = &cpi->sf; + const TileInfo *const tile_info = &tile_data->tile_info; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int num_planes = av1_num_planes(cm); + int dummy_rate; + int64_t dummy_dist; + RD_STATS dummy_rdc; + SIMPLE_MOTION_DATA_TREE *const sms_root = td->sms_root; + +#if CONFIG_REALTIME_ONLY + (void)seg_skip; +#endif // CONFIG_REALTIME_ONLY + + init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col, + 1); + + // Encode the superblock + if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { + // partition search starting from a variance-based partition + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_use_partition_time); +#endif + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, td->pc_root); + av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, + sf->part_sf.partition_search_type); + td->pc_root = NULL; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_use_partition_time); +#endif + } +#if !CONFIG_REALTIME_ONLY + else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) { + // partition search by adjusting a fixed-size partition + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + seg_skip ? sb_size : sf->part_sf.fixed_partition_size; + av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, td->pc_root); + av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, + sf->part_sf.partition_search_type); + td->pc_root = NULL; + } else { + // The most exhaustive recursive partition search + SuperBlockEnc *sb_enc = &x->sb_enc; + // No stats for overlay frames. Exclude key frame. + av1_get_tpl_stats_sb(cpi, sb_size, mi_row, mi_col, sb_enc); + + // Reset the tree for simple motion search data + av1_reset_simple_motion_tree_partition(sms_root, sb_size); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif + + // Estimate the maximum square partition block size, which will be used + // as the starting block size for partitioning the sb + set_max_min_partition_size(sb_enc, cpi, x, sf, sb_size, mi_row, mi_col); + + // The superblock can be searched only once, or twice consecutively for + // better quality. Note that the meaning of passes here is different from + // the general concept of 1-pass/2-pass encoders. + const int num_passes = + cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 
2 : 1; + + if (cpi->oxcf.sb_qp_sweep && + !(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && + cpi->oxcf.gf_cfg.lag_in_frames == 0) && + cm->delta_q_info.delta_q_present_flag) { + AOM_CHECK_MEM_ERROR( + x->e_mbd.error_info, td->mb.sb_stats_cache, + (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_stats_cache))); + av1_backup_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row, + mi_col); + assert(x->rdmult_delta_qindex == x->delta_qindex); + + const int best_qp_diff = + sb_qp_sweep(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, sms_root, + td->mb.sb_stats_cache) - + x->rdmult_delta_qindex; + + sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_root, &dummy_rdc, + mi_row, mi_col, best_qp_diff); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + const int backup_current_qindex = + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; + + av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col); + av1_restore_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row, + mi_col); + + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = + backup_current_qindex; + aom_free(td->mb.sb_stats_cache); + td->mb.sb_stats_cache = NULL; + } + if (num_passes == 1) { +#if CONFIG_PARTITION_SEARCH_ORDER + if (cpi->ext_part_controller.ready && !frame_is_intra_only(cm)) { + av1_reset_part_sf(&cpi->sf.part_sf); + av1_reset_sf_for_ext_part(cpi); + RD_STATS this_rdc; + av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row, + mi_col, sb_size, &this_rdc); + } else { + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + &dummy_rdc, dummy_rdc, td->pc_root, sms_root, + NULL, SB_SINGLE_PASS, NULL); + } +#else + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, + SB_SINGLE_PASS, NULL); +#endif // CONFIG_PARTITION_SEARCH_ORDER + } else { + // First pass + AOM_CHECK_MEM_ERROR( + x->e_mbd.error_info, td->mb.sb_fp_stats, + (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_fp_stats))); + av1_backup_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row, + mi_col); + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, + SB_DRY_PASS, NULL); + + // Second pass + init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, + mi_col, 0); + av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col); + av1_reset_simple_motion_tree_partition(sms_root, sb_size); + + av1_restore_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row, + mi_col); + + td->pc_root = av1_alloc_pc_tree_node(sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL, + SB_WET_PASS, NULL); + aom_free(td->mb.sb_fp_stats); + td->mb.sb_fp_stats = NULL; + } + + // Reset to 0 so that it wouldn't be used elsewhere mistakenly. 
+ sb_enc->tpl_data_count = 0; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif + } +#endif // !CONFIG_REALTIME_ONLY + + // Update the inter rd model + // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile. + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && + cm->tiles.cols == 1 && cm->tiles.rows == 1) { + av1_inter_mode_data_fit(tile_data, x->rdmult); + } +} + +// Check if the cost update of symbols mode, coeff and dv are tile or off. +static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off( + const AV1_COMP *const cpi) { + const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf; + + return (inter_sf->coeff_cost_upd_level <= INTERNAL_COST_UPD_TILE && + inter_sf->mode_cost_upd_level <= INTERNAL_COST_UPD_TILE && + cpi->sf.intra_sf.dv_cost_upd_level <= INTERNAL_COST_UPD_TILE); +} + +// When row-mt is enabled and cost update frequencies are set to off/tile, +// processing of current SB can start even before processing of top-right SB +// is finished. This function checks if it is sufficient to wait for top SB +// to finish processing before current SB starts processing. +static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) { + const MODE mode = cpi->oxcf.mode; + if (mode == GOOD) return 0; + + if (mode == ALLINTRA) + return is_mode_coeff_dv_upd_freq_tile_or_off(cpi); + else if (mode == REALTIME) + return (is_mode_coeff_dv_upd_freq_tile_or_off(cpi) && + cpi->sf.inter_sf.mv_cost_upd_level <= INTERNAL_COST_UPD_TILE); + else + return 0; +} + +/*!\brief Calculate source SAD at superblock level using 64x64 block source SAD + * + * \ingroup partition_search + * \callgraph + * \callergraph + */ +static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row, + int mi_col) { + if (cpi->src_sad_blk_64x64 == NULL) return UINT64_MAX; + + const AV1_COMMON *const cm = &cpi->common; + const int blk_64x64_in_mis = (cm->seq_params->sb_size == BLOCK_128X128) + ? 
(cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; + const int num_blk_64x64_cols = + (cm->mi_params.mi_cols + blk_64x64_in_mis - 1) / blk_64x64_in_mis; + const int num_blk_64x64_rows = + (cm->mi_params.mi_rows + blk_64x64_in_mis - 1) / blk_64x64_in_mis; + const int blk_64x64_col_index = mi_col / blk_64x64_in_mis; + const int blk_64x64_row_index = mi_row / blk_64x64_in_mis; + uint64_t curr_sb_sad = UINT64_MAX; + const uint64_t *const src_sad_blk_64x64_data = + &cpi->src_sad_blk_64x64[blk_64x64_col_index + + blk_64x64_row_index * num_blk_64x64_cols]; + if (cm->seq_params->sb_size == BLOCK_128X128 && + blk_64x64_col_index + 1 < num_blk_64x64_cols && + blk_64x64_row_index + 1 < num_blk_64x64_rows) { + // Calculate SB source SAD by accumulating source SAD of 64x64 blocks in the + // superblock + curr_sb_sad = src_sad_blk_64x64_data[0] + src_sad_blk_64x64_data[1] + + src_sad_blk_64x64_data[num_blk_64x64_cols] + + src_sad_blk_64x64_data[num_blk_64x64_cols + 1]; + } else if (cm->seq_params->sb_size == BLOCK_64X64) { + curr_sb_sad = src_sad_blk_64x64_data[0]; + } + return curr_sb_sad; +} + +/*!\brief Determine whether grading content can be skipped based on sad stat + * + * \ingroup partition_search + * \callgraph + * \callergraph + */ +static AOM_INLINE bool is_calc_src_content_needed(AV1_COMP *cpi, + MACROBLOCK *const x, + int mi_row, int mi_col) { + if (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) + return true; + const uint64_t curr_sb_sad = get_sb_source_sad(cpi, mi_row, mi_col); + if (curr_sb_sad == UINT64_MAX) return true; + if (curr_sb_sad == 0) { + x->content_state_sb.source_sad_nonrd = kZeroSad; + return false; + } + AV1_COMMON *const cm = &cpi->common; + bool do_calc_src_content = true; + + if (cpi->oxcf.speed < 9) return do_calc_src_content; + + // TODO(yunqing): Tune/validate the thresholds for 128x128 SB size. + if (AOMMIN(cm->width, cm->height) < 360) { + // Derive Average 64x64 block source SAD from SB source SAD + const uint64_t avg_64x64_blk_sad = + (cm->seq_params->sb_size == BLOCK_128X128) ? ((curr_sb_sad + 2) >> 2) + : curr_sb_sad; + + // The threshold is determined based on kLowSad and kHighSad threshold and + // test results. + const uint64_t thresh_low = 15000; + const uint64_t thresh_high = 40000; + + if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high) { + do_calc_src_content = false; + // Note: set x->content_state_sb.source_sad_rd as well if this is extended + // to RTC rd path. 
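+      // Example: a 128x128 superblock with source SAD 100000 averages 25000
+      // per 64x64 block, which lies inside (15000, 40000), so this branch
+      // marks it kMedSad and skips the full grading.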
+ x->content_state_sb.source_sad_nonrd = kMedSad; + } + } + + return do_calc_src_content; +} + +/*!\brief Determine whether grading content is needed based on sf and frame stat + * + * \ingroup partition_search + * \callgraph + * \callergraph + */ +// TODO(any): consolidate sfs to make interface cleaner +static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi, + MACROBLOCK *const x, + TileDataEnc *tile_data, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + if (cm->current_frame.frame_type == KEY_FRAME || + (cpi->ppi->use_svc && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) { + assert(x->content_state_sb.source_sad_nonrd == kMedSad); + assert(x->content_state_sb.source_sad_rd == kMedSad); + return; + } + bool calc_src_content = false; + + if (cpi->sf.rt_sf.source_metrics_sb_nonrd) { + if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0) { + calc_src_content = is_calc_src_content_needed(cpi, x, mi_row, mi_col); + } else { + x->content_state_sb.source_sad_nonrd = kZeroSad; + } + } else if ((cpi->sf.rt_sf.var_part_based_on_qidx >= 1) && + (cm->width * cm->height <= 352 * 288)) { + if (cpi->rc.frame_source_sad > 0) + calc_src_content = true; + else + x->content_state_sb.source_sad_rd = kZeroSad; + } + if (calc_src_content) + av1_source_content_sb(cpi, x, tile_data, mi_row, mi_col); +} + +/*!\brief Encode a superblock row by breaking it into superblocks + * + * \ingroup partition_search + * \callgraph + * \callergraph + * Do partition and mode search for an sb row: one row of superblocks filling up + * the width of the current tile. + */ +static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, int mi_row, + TokenExtra **tp) { + AV1_COMMON *const cm = &cpi->common; + const TileInfo *const tile_info = &tile_data->tile_info; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; + bool row_mt_enabled = mt_info->row_mt_enabled; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int mib_size = cm->seq_params->mib_size; + const int mib_size_log2 = cm->seq_params->mib_size_log2; + const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; + const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_row_time); +#endif + + // Initialize the left context for the new SB row + av1_zero_left_context(xd); + + // Reset delta for quantizer and loof filters at the beginning of every tile + if (mi_row == tile_info->mi_row_start || row_mt_enabled) { + if (cm->delta_q_info.delta_q_present_flag) + xd->current_base_qindex = cm->quant_params.base_qindex; + if (cm->delta_q_info.delta_lf_present_flag) { + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + + reset_thresh_freq_fact(x); + + // Code each SB in the row + for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0; + mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) { + // In realtime/allintra mode and when frequency of cost updates is off/tile, + // wait for the top superblock to finish encoding. Otherwise, wait for the + // top-right superblock to finish encoding. 
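+    // delay_wait_for_top_right_sb() returns 1 in those cases; subtracting it
+    // from the sync-read column relaxes the dependency from the top-right
+    // superblock to the superblock directly above.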
+ enc_row_mt->sync_read_ptr( + row_mt_sync, sb_row, sb_col_in_tile - delay_wait_for_top_right_sb(cpi)); + +#if CONFIG_MULTITHREAD + if (row_mt_enabled) { + pthread_mutex_lock(enc_row_mt->mutex_); + const bool row_mt_exit = enc_row_mt->row_mt_exit; + pthread_mutex_unlock(enc_row_mt->mutex_); + // Exit in case any worker has encountered an error. + if (row_mt_exit) return; + } +#endif + + const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled; + if (update_cdf && (tile_info->mi_row_start != mi_row)) { + if ((tile_info->mi_col_start == mi_col)) { + // restore frame context at the 1st column sb + memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx)); + } else { + // update context + int wt_left = AVG_CDF_WEIGHT_LEFT; + int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT; + if (tile_info->mi_col_end > (mi_col + mib_size)) + av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, + wt_left, wt_tr); + else + av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1, + wt_left, wt_tr); + } + } + + // Update the rate cost tables for some symbols + av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col); + + // Reset color coding related parameters + av1_zero(x->color_sensitivity_sb); + av1_zero(x->color_sensitivity_sb_g); + av1_zero(x->color_sensitivity_sb_alt); + av1_zero(x->color_sensitivity); + x->content_state_sb.source_sad_nonrd = kMedSad; + x->content_state_sb.source_sad_rd = kMedSad; + x->content_state_sb.lighting_change = 0; + x->content_state_sb.low_sumdiff = 0; + x->force_zeromv_skip_for_sb = 0; + x->sb_me_block = 0; + x->sb_me_partition = 0; + x->sb_me_mv.as_int = 0; + + if (cpi->oxcf.mode == ALLINTRA) { + x->intra_sb_rdmult_modifier = 128; + } + + xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv; + x->source_variance = UINT_MAX; + td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + + // Get segment id and skip flag + const struct segmentation *const seg = &cm->seg; + int seg_skip = 0; + if (seg->enabled) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + const uint8_t segment_id = + map ? 
get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col) + : 0; + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + } + + produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col); + + init_src_var_info_of_4x4_sub_blocks(cpi, x->src_var_info_of_4x4_sub_blocks, + sb_size); + + // Grade the temporal variation of the sb, the grade will be used to decide + // fast mode search strategy for coding blocks + grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col); + + // encode the superblock + if (use_nonrd_mode) { + encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip); + } else { + encode_rd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip); + } + + // Update the top-right context in row_mt coding + if (update_cdf && (tile_info->mi_row_end > (mi_row + mib_size))) { + if (sb_cols_in_tile == 1) + memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx)); + else if (sb_col_in_tile >= 1) + memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx, + sizeof(*xd->tile_ctx)); + } + enc_row_mt->sync_write_ptr(row_mt_sync, sb_row, sb_col_in_tile, + sb_cols_in_tile); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_row_time); +#endif +} + +static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + + // Copy data over into macro block data structures. + av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, + cm->seq_params->sb_size); + + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); +} + +void av1_alloc_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + av1_row_mt_mem_dealloc(cpi); + + aom_free(cpi->tile_data); + cpi->allocated_tiles = 0; + enc_row_mt->allocated_tile_cols = 0; + enc_row_mt->allocated_tile_rows = 0; + + CHECK_MEM_ERROR( + cm, cpi->tile_data, + aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); + + cpi->allocated_tiles = tile_cols * tile_rows; + enc_row_mt->allocated_tile_cols = tile_cols; + enc_row_mt->allocated_tile_rows = tile_rows; + for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + av1_zero(this_tile->row_mt_sync); + this_tile->row_ctx = NULL; + } + } +} + +void av1_init_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + TokenInfo *const token_info = &cpi->token_info; + TokenExtra *pre_tok = token_info->tile_tok[0][0]; + TokenList *tplist = token_info->tplist[0][0]; + unsigned int tile_tok = 0; + int tplist_count = 0; + + if (!is_stat_generation_stage(cpi) && + cm->features.allow_screen_content_tools) { + // Number of tokens for which token info needs to be allocated. + unsigned int tokens_required = + get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols, + MAX_SB_SIZE_LOG2, num_planes); + // Allocate/reallocate memory for token related info if the number of tokens + // required is more than the number of tokens already allocated. 
This could + // occur in case of the following: + // 1) If the memory is not yet allocated + // 2) If the frame dimensions have changed + const bool realloc_tokens = tokens_required > token_info->tokens_allocated; + if (realloc_tokens) { + free_token_info(token_info); + alloc_token_info(cm, token_info, tokens_required); + pre_tok = token_info->tile_tok[0][0]; + tplist = token_info->tplist[0][0]; + } + } + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *const tile_info = &tile_data->tile_info; + av1_tile_init(tile_info, cm, tile_row, tile_col); + tile_data->firstpass_top_mv = kZeroMv; + tile_data->abs_sum_level = 0; + + if (is_token_info_allocated(token_info)) { + token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; + pre_tok = token_info->tile_tok[tile_row][tile_col]; + tile_tok = allocated_tokens( + tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, + num_planes); + token_info->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = token_info->tplist[tile_row][tile_col]; + tplist_count = av1_get_sb_rows_in_tile(cm, tile_info); + } + tile_data->allow_update_cdf = !cm->tiles.large_scale; + tile_data->allow_update_cdf = tile_data->allow_update_cdf && + !cm->features.disable_cdf_update && + !delay_wait_for_top_right_sb(cpi); + tile_data->tctx = *cm->fc; + } + } +} + +// Populate the start palette token info prior to encoding an SB row. +static AOM_INLINE void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info, + int tile_row, int tile_col, int mi_row, + TokenExtra **tp) { + const TokenInfo *token_info = &cpi->token_info; + if (!is_token_info_allocated(token_info)) return; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col]; + const int sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; + + get_start_tok(cpi, tile_row, tile_col, mi_row, tp, + cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes); + assert(tplist != NULL); + tplist[sb_row_in_tile].start = *tp; +} + +// Populate the token count after encoding an SB row. 
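+// The recorded count is the number of TokenExtra entries emitted for the SB
+// row, i.e. the distance from the row's start pointer (set in
+// get_token_start) to the final token pointer.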
+static AOM_INLINE void populate_token_count(AV1_COMP *cpi, + const TileInfo *tile_info, + int tile_row, int tile_col, + int mi_row, TokenExtra *tok) { + const TokenInfo *token_info = &cpi->token_info; + if (!is_token_info_allocated(token_info)) return; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + TokenList *const tplist = token_info->tplist[tile_row][tile_col]; + const int sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; + const int tile_mb_cols = + (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; + const int num_mb_rows_in_sb = + ((1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; + tplist[sb_row_in_tile].count = + (unsigned int)(tok - tplist[sb_row_in_tile].start); + + assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <= + get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, + cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, + num_planes)); + + (void)num_planes; + (void)tile_mb_cols; + (void)num_mb_rows_in_sb; +} + +/*!\brief Encode a superblock row + * + * \ingroup partition_search + */ +void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TokenExtra *tok = NULL; + + get_token_start(cpi, tile_info, tile_row, tile_col, mi_row, &tok); + + encode_sb_row(cpi, td, this_tile, mi_row, &tok); + + populate_token_count(cpi, tile_info, tile_row, tile_col, mi_row, tok); +} + +/*!\brief Encode a tile + * + * \ingroup partition_search + */ +void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col) { + AV1_COMMON *const cm = &cpi->common; + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + if (!cpi->sf.rt_sf.use_nonrd_pick_mode) av1_inter_mode_data_init(this_tile); + + av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start, + tile_info->mi_col_end, tile_row); + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, + &td->mb.e_mbd); + + if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra) + cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); + + if (td->mb.txfm_search_info.mb_rd_record != NULL) { + av1_crc32c_calculator_init( + &td->mb.txfm_search_info.mb_rd_record->crc_calculator); + } + + for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += cm->seq_params->mib_size) { + av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); + } + this_tile->abs_sum_level = td->abs_sum_level; +} + +/*!\brief Break one frame into tiles and encode the tiles + * + * \ingroup partition_search + * + * \param[in] cpi Top-level encoder structure + */ +static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + + MACROBLOCK *const mb = &cpi->td.mb; + assert(IMPLIES(cpi->tile_data == NULL, + cpi->allocated_tiles < tile_cols * tile_rows)); + if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + av1_alloc_mb_data(cpi, mb); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * 
cm->tiles.cols + tile_col]; + cpi->td.intrabc_used = 0; + cpi->td.deltaq_used = 0; + cpi->td.abs_sum_level = 0; + cpi->td.rd_counts.seg_tmp_pred_cost[0] = 0; + cpi->td.rd_counts.seg_tmp_pred_cost[1] = 0; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + cpi->td.mb.tile_pb_ctx = &this_tile->tctx; + av1_init_rtc_counters(&cpi->td.mb); + cpi->td.mb.palette_pixels = 0; + av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); + if (!frame_is_intra_only(&cpi->common)) + av1_accumulate_rtc_counters(cpi, &cpi->td.mb); + cpi->palette_pixel_num += cpi->td.mb.palette_pixels; + cpi->intrabc_used |= cpi->td.intrabc_used; + cpi->deltaq_used |= cpi->td.deltaq_used; + } + } + + av1_dealloc_mb_data(mb, av1_num_planes(cm)); +} + +// Set the relative distance of a reference frame w.r.t. current frame +static AOM_INLINE void set_rel_frame_dist( + const AV1_COMMON *const cm, RefFrameDistanceInfo *const ref_frame_dist_info, + const int ref_frame_flags) { + MV_REFERENCE_FRAME ref_frame; + int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX; + ref_frame_dist_info->nearest_past_ref = NONE_FRAME; + ref_frame_dist_info->nearest_future_ref = NONE_FRAME; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = 0; + if (ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + int dist = av1_encoder_get_relative_dist( + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME], + cm->current_frame.display_order_hint); + ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = dist; + // Get the nearest ref_frame in the past + if (abs(dist) < min_past_dist && dist < 0) { + ref_frame_dist_info->nearest_past_ref = ref_frame; + min_past_dist = abs(dist); + } + // Get the nearest ref_frame in the future + if (dist < min_future_dist && dist > 0) { + ref_frame_dist_info->nearest_future_ref = ref_frame; + min_future_dist = dist; + } + } + } +} + +static INLINE int refs_are_one_sided(const AV1_COMMON *cm) { + assert(!frame_is_intra_only(cm)); + + int one_sided_refs = 1; + const int cur_display_order_hint = cm->current_frame.display_order_hint; + for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + if (buf == NULL) continue; + if (av1_encoder_get_relative_dist(buf->display_order_hint, + cur_display_order_hint) > 0) { + one_sided_refs = 0; // bwd reference + break; + } + } + return one_sided_refs; +} + +static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm, + int ref_order_hint[2]) { + const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + ref_order_hint[0] = ref_order_hint[1] = 0; + if (!skip_mode_info->skip_mode_allowed) return; + + const RefCntBuffer *const buf_0 = + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0); + const RefCntBuffer *const buf_1 = + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1); + assert(buf_0 != NULL && buf_1 != NULL); + + ref_order_hint[0] = buf_0->order_hint; + ref_order_hint[1] = buf_1->order_hint; +} + +static int check_skip_mode_enabled(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + + av1_setup_skip_mode_allowed(cm); + if (!cm->current_frame.skip_mode_info.skip_mode_allowed) return 0; + + // Turn off skip mode if the temporal distances of the reference pair to the + // current frame are different by more than 1 frame. 
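+  // Example: with one reference two frames in the past and one a single frame
+  // in the future, the distances are 2 and 1; |2 - 1| = 1, so this check does
+  // not disable skip mode.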
+ const int cur_offset = (int)cm->current_frame.order_hint; + int ref_offset[2]; + get_skip_mode_ref_offsets(cm, ref_offset); + const int cur_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info, + cur_offset, ref_offset[0]); + const int cur_to_ref1 = abs(get_relative_dist( + &cm->seq_params->order_hint_info, cur_offset, ref_offset[1])); + if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0; + + // High Latency: Turn off skip mode if all refs are fwd. + if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0; + + const int ref_frame[2] = { + cm->current_frame.skip_mode_info.ref_frame_idx_0 + LAST_FRAME, + cm->current_frame.skip_mode_info.ref_frame_idx_1 + LAST_FRAME + }; + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[0]]) || + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]])) + return 0; + + return 1; +} + +static AOM_INLINE void set_default_interp_skip_flags( + const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) { + const int num_planes = av1_num_planes(cm); + interp_search_flags->default_interp_skip_flags = + (num_planes == 1) ? INTERP_SKIP_LUMA_EVAL_CHROMA + : INTERP_SKIP_LUMA_SKIP_CHROMA; +} + +static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) { + if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp || + cpi->sf.inter_sf.disable_onesided_comp) && + cpi->all_one_sided_refs) { + // Disable all compound references + cpi->prune_ref_frame_mask = (1 << MODE_CTX_REF_FRAMES) - (1 << REF_FRAMES); + } else if (!cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->sf.inter_sf.selective_ref_frame >= 2) { + AV1_COMMON *const cm = &cpi->common; + const int cur_frame_display_order_hint = + cm->current_frame.display_order_hint; + unsigned int *ref_display_order_hint = + cm->cur_frame->ref_display_order_hint; + const int arf2_dist = av1_encoder_get_relative_dist( + ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME], + cur_frame_display_order_hint); + const int bwd_dist = av1_encoder_get_relative_dist( + ref_display_order_hint[BWDREF_FRAME - LAST_FRAME], + cur_frame_display_order_hint); + + for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_idx); + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) || + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]])) { + continue; + } + + if (!cpi->all_one_sided_refs) { + int ref_dist[2]; + for (int i = 0; i < 2; ++i) { + ref_dist[i] = av1_encoder_get_relative_dist( + ref_display_order_hint[rf[i] - LAST_FRAME], + cur_frame_display_order_hint); + } + + // One-sided compound is used only when all reference frames are + // one-sided. + if ((ref_dist[0] > 0) == (ref_dist[1] > 0)) { + cpi->prune_ref_frame_mask |= 1 << ref_idx; + } + } + + if (cpi->sf.inter_sf.selective_ref_frame >= 4 && + (rf[0] == ALTREF2_FRAME || rf[1] == ALTREF2_FRAME) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) { + // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references. 
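+        // A positive relative distance here means the reference follows the
+        // current frame in display order.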
+ if (arf2_dist > 0 && bwd_dist > 0 && bwd_dist <= arf2_dist) { + // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer + // reference to the current frame than ALTREF2_FRAME + cpi->prune_ref_frame_mask |= 1 << ref_idx; + } + } + } + } +} + +static int allow_deltaq_mode(AV1_COMP *cpi) { +#if !CONFIG_REALTIME_ONLY + AV1_COMMON *const cm = &cpi->common; + BLOCK_SIZE sb_size = cm->seq_params->sb_size; + int sbs_wide = mi_size_wide[sb_size]; + int sbs_high = mi_size_high[sb_size]; + + int64_t delta_rdcost = 0; + for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sbs_high) { + for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sbs_wide) { + int64_t this_delta_rdcost = 0; + av1_get_q_for_deltaq_objective(cpi, &cpi->td, &this_delta_rdcost, sb_size, + mi_row, mi_col); + delta_rdcost += this_delta_rdcost; + } + } + return delta_rdcost < 0; +#else + (void)cpi; + return 1; +#endif // !CONFIG_REALTIME_ONLY +} + +#define FORCE_ZMV_SKIP_128X128_BLK_DIFF 10000 +#define FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF 4 + +// Populates block level thresholds for force zeromv-skip decision +static void populate_thresh_to_force_zeromv_skip(AV1_COMP *cpi) { + if (cpi->sf.rt_sf.part_early_exit_zeromv == 0) return; + + // Threshold for forcing zeromv-skip decision is as below: + // For 128x128 blocks, threshold is 10000 and per pixel threshold is 0.6103. + // For 64x64 blocks, threshold is 5000 and per pixel threshold is 1.221 + // allowing slightly higher error for smaller blocks. + // Per Pixel Threshold of 64x64 block Area of 64x64 block 1 1 + // ------------------------------------=sqrt(---------------------)=sqrt(-)=- + // Per Pixel Threshold of 128x128 block Area of 128x128 block 4 2 + // Thus, per pixel thresholds for blocks of size 32x32, 16x16,... can be + // chosen as 2.442, 4.884,.... As the per pixel error tends to be higher for + // small blocks, the same is clipped to 4. 
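+  // Example: BLOCK_64X64 gets 10000 * sqrt(4096 / 16384) = 5000 (~1.22 per
+  // pixel); BLOCK_16X16 would get 1250 (~4.9 per pixel) and is therefore
+  // clipped to 4 * 256 = 1024.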
+ const unsigned int thresh_exit_128x128_part = FORCE_ZMV_SKIP_128X128_BLK_DIFF; + const int num_128x128_pix = + block_size_wide[BLOCK_128X128] * block_size_high[BLOCK_128X128]; + + for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) { + const int num_block_pix = block_size_wide[bsize] * block_size_high[bsize]; + + // Calculate the threshold for zeromv-skip decision based on area of the + // partition + unsigned int thresh_exit_part_blk = + (unsigned int)(thresh_exit_128x128_part * + sqrt((double)num_block_pix / num_128x128_pix) + + 0.5); + thresh_exit_part_blk = AOMMIN( + thresh_exit_part_blk, + (unsigned int)(FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF * num_block_pix)); + cpi->zeromv_skip_thresh_exit_part[bsize] = thresh_exit_part_blk; + } +} + +static void free_block_hash_buffers(uint32_t *block_hash_values[2][2], + int8_t *is_block_same[2][3]) { + for (int k = 0; k < 2; ++k) { + for (int j = 0; j < 2; ++j) { + aom_free(block_hash_values[k][j]); + } + + for (int j = 0; j < 3; ++j) { + aom_free(is_block_same[k][j]); + } + } +} + +/*!\brief Encoder setup(only for the current frame), encoding, and recontruction + * for a single frame + * + * \ingroup high_level_algo + */ +static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { + ThreadData *const td = &cpi->td; + MACROBLOCK *const x = &td->mb; + AV1_COMMON *const cm = &cpi->common; + CommonModeInfoParams *const mi_params = &cm->mi_params; + FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &x->e_mbd; + RD_COUNTS *const rdc = &cpi->td.rd_counts; +#if CONFIG_FPMT_TEST + FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs; + FrameProbInfo *const temp_frame_probs_simulation = + &cpi->ppi->temp_frame_probs_simulation; +#endif + FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs; + IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const DELTAQ_MODE deltaq_mode = oxcf->q_cfg.deltaq_mode; + int i; + + if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { + mi_params->setup_mi(mi_params); + } + + set_mi_offsets(mi_params, xd, 0, 0); + + av1_zero(*td->counts); + av1_zero(rdc->tx_type_used); + av1_zero(rdc->obmc_used); + av1_zero(rdc->warped_used); + av1_zero(rdc->seg_tmp_pred_cost); + + // Reset the flag. + cpi->intrabc_used = 0; + // Need to disable intrabc when superres is selected + if (av1_superres_scaled(cm)) { + features->allow_intrabc = 0; + } + + features->allow_intrabc &= (oxcf->kf_cfg.enable_intrabc); + + if (features->allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int warped_probability = +#if CONFIG_FPMT_TEST + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE + ? temp_frame_probs->warped_probs[update_type] + : +#endif // CONFIG_FPMT_TEST + frame_probs->warped_probs[update_type]; + if (warped_probability < cpi->sf.inter_sf.prune_warped_prob_thresh) + features->allow_warped_motion = 0; + } + + int hash_table_created = 0; + if (!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) && + !cpi->sf.rt_sf.use_nonrd_pick_mode) { + // TODO(any): move this outside of the recoding loop to avoid recalculating + // the hash table. 
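+    // Hash values are built bottom-up: 2x2 blocks first, then each doubling
+    // of block size up to the superblock size; rows are added to the intraBC
+    // hash table once the block size reaches the MI allocation block size.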
+ // add to hash table + const int pic_width = cpi->source->y_crop_width; + const int pic_height = cpi->source->y_crop_height; + uint32_t *block_hash_values[2][2] = { { NULL } }; + int8_t *is_block_same[2][3] = { { NULL } }; + int k, j; + bool error = false; + + for (k = 0; k < 2 && !error; ++k) { + for (j = 0; j < 2; ++j) { + block_hash_values[k][j] = (uint32_t *)aom_malloc( + sizeof(*block_hash_values[0][0]) * pic_width * pic_height); + if (!block_hash_values[k][j]) { + error = true; + break; + } + } + + for (j = 0; j < 3 && !error; ++j) { + is_block_same[k][j] = (int8_t *)aom_malloc( + sizeof(*is_block_same[0][0]) * pic_width * pic_height); + if (!is_block_same[k][j]) error = true; + } + } + + av1_hash_table_init(intrabc_hash_info); + if (error || + !av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) { + free_block_hash_buffers(block_hash_values, is_block_same); + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating intrabc_hash_table and buffers"); + } + hash_table_created = 1; + av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source, + block_hash_values[0], is_block_same[0]); + // Hash data generated for screen contents is used for intraBC ME + const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize]; + const int max_sb_size = + (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)); + int src_idx = 0; + for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) { + const int dst_idx = !src_idx; + av1_generate_block_hash_value( + intrabc_hash_info, cpi->source, size, block_hash_values[src_idx], + block_hash_values[dst_idx], is_block_same[src_idx], + is_block_same[dst_idx]); + if (size >= min_alloc_size) { + if (!av1_add_to_hash_map_by_row_with_precal_data( + &intrabc_hash_info->intrabc_hash_table, + block_hash_values[dst_idx], is_block_same[dst_idx][2], + pic_width, pic_height, size)) { + error = true; + break; + } + } + } + + free_block_hash_buffers(block_hash_values, is_block_same); + + if (error) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error adding data to intrabc_hash_table"); + } + } + + const CommonQuantParams *quant_params = &cm->quant_params; + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = + cm->seg.enabled ? 
av1_get_qindex(&cm->seg, i, quant_params->base_qindex) + : quant_params->base_qindex; + xd->lossless[i] = + qindex == 0 && quant_params->y_dc_delta_q == 0 && + quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 && + quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0; + if (xd->lossless[i]) cpi->enc_seg.has_lossless_segment = 1; + xd->qindex[i] = qindex; + if (xd->lossless[i]) { + cpi->optimize_seg_arr[i] = NO_TRELLIS_OPT; + } else { + cpi->optimize_seg_arr[i] = cpi->sf.rd_sf.optimize_coefficients; + } + } + features->coded_lossless = is_coded_lossless(cm, xd); + features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm); + + // Fix delta q resolution for the moment + + cm->delta_q_info.delta_q_res = 0; + if (cpi->use_ducky_encode) { + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_DUCKY_ENCODE; + } else if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) { + if (deltaq_mode == DELTA_Q_OBJECTIVE) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE; + else if (deltaq_mode == DELTA_Q_PERCEPTUAL) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + else if (deltaq_mode == DELTA_Q_PERCEPTUAL_AI) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + else if (deltaq_mode == DELTA_Q_USER_RATING_BASED) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + else if (deltaq_mode == DELTA_Q_HDR) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + // Set delta_q_present_flag before it is used for the first time + cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES; + cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q; + + // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q + // is used for ineligible frames. That effectively will turn off row_mt + // usage. Note objective delta_q and tpl eligible frames are only altref + // frames currently. 
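+    // Below, LF_UPDATE frames drop the delta-q signalling under
+    // DELTA_Q_OBJECTIVE, and allow_deltaq_mode() then keeps it only when the
+    // accumulated per-superblock rd-cost estimate is negative.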
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (cm->delta_q_info.delta_q_present_flag) { + if (deltaq_mode == DELTA_Q_OBJECTIVE && + gf_group->update_type[cpi->gf_frame_index] == LF_UPDATE) + cm->delta_q_info.delta_q_present_flag = 0; + + if (deltaq_mode == DELTA_Q_OBJECTIVE && + cm->delta_q_info.delta_q_present_flag) { + cm->delta_q_info.delta_q_present_flag &= allow_deltaq_mode(cpi); + } + } + + // Reset delta_q_used flag + cpi->deltaq_used = 0; + + cm->delta_q_info.delta_lf_present_flag = + cm->delta_q_info.delta_q_present_flag && + oxcf->tool_cfg.enable_deltalf_mode; + cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI; + + // update delta_q_present_flag and delta_lf_present_flag based on + // base_qindex + cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0; + cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0; + } else if (cpi->cyclic_refresh->apply_cyclic_refresh || + cpi->svc.number_temporal_layers == 1) { + cpi->cyclic_refresh->actual_num_seg1_blocks = 0; + cpi->cyclic_refresh->actual_num_seg2_blocks = 0; + } + cpi->rc.cnt_zeromv = 0; + + av1_frame_init_quantizer(cpi); + init_encode_frame_mb_context(cpi); + set_default_interp_skip_flags(cm, &cpi->interp_search_flags); + + if (cm->prev_frame && cm->prev_frame->seg.enabled) + cm->last_frame_seg_map = cm->prev_frame->seg_map; + else + cm->last_frame_seg_map = NULL; + if (features->allow_intrabc || features->coded_lossless) { + av1_set_default_ref_deltas(cm->lf.ref_deltas); + av1_set_default_mode_deltas(cm->lf.mode_deltas); + } else if (cm->prev_frame) { + memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES); + memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); + } + memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES); + memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); + + cpi->all_one_sided_refs = + frame_is_intra_only(cm) ? 0 : refs_are_one_sided(cm); + + cpi->prune_ref_frame_mask = 0; + // Figure out which ref frames can be skipped at frame level. + setup_prune_ref_frame_mask(cpi); + + x->txfm_search_info.txb_split_count = 0; +#if CONFIG_SPEED_STATS + x->txfm_search_info.tx_search_count = 0; +#endif // CONFIG_SPEED_STATS + +#if !CONFIG_REALTIME_ONLY +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_compute_global_motion_time); +#endif + av1_compute_global_motion_facade(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_compute_global_motion_time); +#endif +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_setup_motion_field_time); +#endif + av1_calculate_ref_frame_side(cm); + if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_setup_motion_field_time); +#endif + + cm->current_frame.skip_mode_info.skip_mode_flag = + check_skip_mode_enabled(cpi); + + // Initialization of skip mode cost depends on the value of + // 'skip_mode_flag'. This initialization happens in the function + // av1_fill_mode_rates(), which is in turn called in + // av1_initialize_rd_consts(). Thus, av1_initialize_rd_consts() + // has to be called after 'skip_mode_flag' is initialized. 
+ av1_initialize_rd_consts(cpi); + av1_set_sad_per_bit(cpi, &x->sadperbit, quant_params->base_qindex); + populate_thresh_to_force_zeromv_skip(cpi); + + enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy; + enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy; + mt_info->row_mt_enabled = 0; + mt_info->pack_bs_mt_enabled = AOMMIN(mt_info->num_mod_workers[MOD_PACK_BS], + cm->tiles.cols * cm->tiles.rows) > 1; + + if (oxcf->row_mt && (mt_info->num_workers > 1)) { + mt_info->row_mt_enabled = 1; + enc_row_mt->sync_read_ptr = av1_row_mt_sync_read; + enc_row_mt->sync_write_ptr = av1_row_mt_sync_write; + av1_encode_tiles_row_mt(cpi); + } else { + if (AOMMIN(mt_info->num_workers, cm->tiles.cols * cm->tiles.rows) > 1) { + av1_encode_tiles_mt(cpi); + } else { + // Preallocate the pc_tree for realtime coding to reduce the cost of + // memory allocation. + const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; + if (use_nonrd_mode) { + td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + } else { + td->pc_root = NULL; + } + + encode_tiles(cpi); + av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + td->pc_root = NULL; + } + } + + // If intrabc is allowed but never selected, reset the allow_intrabc flag. + if (features->allow_intrabc && !cpi->intrabc_used) { + features->allow_intrabc = 0; + } + if (features->allow_intrabc) { + cm->delta_q_info.delta_lf_present_flag = 0; + } + + if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { + cm->delta_q_info.delta_q_present_flag = 0; + } + + // Set the transform size appropriately before bitstream creation + const MODE_EVAL_TYPE eval_type = + cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch + ? WINNER_MODE_EVAL + : DEFAULT_EVAL; + const TX_SIZE_SEARCH_METHOD tx_search_type = + cpi->winner_mode_params.tx_size_search_methods[eval_type]; + assert(oxcf->txfm_cfg.enable_tx64 || tx_search_type != USE_LARGESTALL); + features->tx_mode = select_tx_mode(cm, tx_search_type); + + // Retain the frame level probability update conditions for parallel frames. + // These conditions will be consumed during postencode stage to update the + // probability. 
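+ // The update decisions below are cached per recode pass (indexed by cpi->num_frame_recode) so that frame-parallel encodes can apply them during the postencode stage.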
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + cpi->do_update_frame_probs_txtype[cpi->num_frame_recode] = + cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats; + cpi->do_update_frame_probs_obmc[cpi->num_frame_recode] = + (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX); + cpi->do_update_frame_probs_warp[cpi->num_frame_recode] = + (features->allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0); + cpi->do_update_frame_probs_interpfilter[cpi->num_frame_recode] = + (cm->current_frame.frame_type != KEY_FRAME && + cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && + features->interp_filter == SWITCHABLE); + } + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats || + ((cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != + INT_MAX) && + (cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != 0))) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + for (i = 0; i < TX_SIZES_ALL; i++) { + int sum = 0; + int j; + int left = MAX_TX_TYPE_PROB; + + for (j = 0; j < TX_TYPES; j++) + sum += cpi->td.rd_counts.tx_type_used[i][j]; + + for (j = TX_TYPES - 1; j >= 0; j--) { + int update_txtype_frameprobs = 1; + const int new_prob = + sum ? MAX_TX_TYPE_PROB * cpi->td.rd_counts.tx_type_used[i][j] / sum + : (j ? 0 : MAX_TX_TYPE_PROB); +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == + 0) { + int prob = + (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = + prob; + // Copy temp_frame_probs_simulation to temp_frame_probs + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->tx_type_probs[update_type_idx][i][j] = + temp_frame_probs_simulation + ->tx_type_probs[update_type_idx][i][j]; + } + } + update_txtype_frameprobs = 0; + } +#endif // CONFIG_FPMT_TEST + // Track the frame probabilities of parallel encode frames to update + // during postencode stage. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + update_txtype_frameprobs = 0; + cpi->frame_new_probs[cpi->num_frame_recode] + .tx_type_probs[update_type][i][j] = new_prob; + } + if (update_txtype_frameprobs) { + int prob = + (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->tx_type_probs[update_type][i][j] = prob; + } + } + } + } + + if (cm->seg.enabled) { + cm->seg.temporal_update = 1; + if (rdc->seg_tmp_pred_cost[0] < rdc->seg_tmp_pred_cost[1]) + cm->seg.temporal_update = 0; + } + + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + int sum = 0; + int update_obmc_frameprobs = 1; + for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j]; + + const int new_prob = + sum ? 
128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0; +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + temp_frame_probs_simulation->obmc_probs[update_type][i] = + (temp_frame_probs_simulation->obmc_probs[update_type][i] + + new_prob) >> + 1; + // Copy temp_frame_probs_simulation to temp_frame_probs + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->obmc_probs[update_type_idx][i] = + temp_frame_probs_simulation->obmc_probs[update_type_idx][i]; + } + } + update_obmc_frameprobs = 0; + } +#endif // CONFIG_FPMT_TEST + // Track the frame probabilities of parallel encode frames to update + // during postencode stage. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + update_obmc_frameprobs = 0; + cpi->frame_new_probs[cpi->num_frame_recode].obmc_probs[update_type][i] = + new_prob; + } + if (update_obmc_frameprobs) { + frame_probs->obmc_probs[update_type][i] = + (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; + } + } + } + + if (features->allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int update_warp_frameprobs = 1; + int sum = 0; + for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i]; + const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0; +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + temp_frame_probs_simulation->warped_probs[update_type] = + (temp_frame_probs_simulation->warped_probs[update_type] + + new_prob) >> + 1; + // Copy temp_frame_probs_simulation to temp_frame_probs + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->warped_probs[update_type_idx] = + temp_frame_probs_simulation->warped_probs[update_type_idx]; + } + } + update_warp_frameprobs = 0; + } +#endif // CONFIG_FPMT_TEST + // Track the frame probabilities of parallel encode frames to update + // during postencode stage. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + update_warp_frameprobs = 0; + cpi->frame_new_probs[cpi->num_frame_recode].warped_probs[update_type] = + new_prob; + } + if (update_warp_frameprobs) { + frame_probs->warped_probs[update_type] = + (frame_probs->warped_probs[update_type] + new_prob) >> 1; + } + } + + if (cm->current_frame.frame_type != KEY_FRAME && + cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && + features->interp_filter == SWITCHABLE) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + int sum = 0; + int j; + int left = 1536; + + for (j = 0; j < SWITCHABLE_FILTERS; j++) { + sum += cpi->td.counts->switchable_interp[i][j]; + } + + for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) { + int update_interpfilter_frameprobs = 1; + const int new_prob = + sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum + : (j ? 
0 : 1536); +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == + 0) { + int prob = (temp_frame_probs_simulation + ->switchable_interp_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + temp_frame_probs_simulation + ->switchable_interp_probs[update_type][i][j] = prob; + // Copy temp_frame_probs_simulation to temp_frame_probs + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] = + temp_frame_probs_simulation + ->switchable_interp_probs[update_type_idx][i][j]; + } + } + update_interpfilter_frameprobs = 0; + } +#endif // CONFIG_FPMT_TEST + // Track the frame probabilities of parallel encode frames to update + // during postencode stage. + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + update_interpfilter_frameprobs = 0; + cpi->frame_new_probs[cpi->num_frame_recode] + .switchable_interp_probs[update_type][i][j] = new_prob; + } + if (update_interpfilter_frameprobs) { + int prob = (frame_probs->switchable_interp_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->switchable_interp_probs[update_type][i][j] = prob; + } + } + } + } + if (hash_table_created) { + av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table); + } +} + +/*!\brief Setup reference frame buffers and encode a frame + * + * \ingroup high_level_algo + * \callgraph + * \callergraph + * + * \param[in] cpi Top-level encoder structure + */ +void av1_encode_frame(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + FeatureFlags *const features = &cm->features; + RD_COUNTS *const rdc = &cpi->td.rd_counts; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + // Indicates whether or not to use a default reduced set for ext-tx + // rather than the potential full set of 16 transforms + features->reduced_tx_set_used = oxcf->txfm_cfg.reduced_tx_type_set; + + // Make sure segment_id is no larger than last_active_segid. + if (cm->seg.enabled && cm->seg.update_map) { + const int mi_rows = cm->mi_params.mi_rows; + const int mi_cols = cm->mi_params.mi_cols; + const int last_active_segid = cm->seg.last_active_segid; + uint8_t *map = cpi->enc_seg.map; + for (int mi_row = 0; mi_row < mi_rows; ++mi_row) { + for (int mi_col = 0; mi_col < mi_cols; ++mi_col) { + map[mi_col] = AOMMIN(map[mi_col], last_active_segid); + } + map += mi_cols; + } + } + + av1_setup_frame_buf_refs(cm); + enforce_max_ref_frames(cpi, &cpi->ref_frame_flags, + cm->cur_frame->ref_display_order_hint, + cm->current_frame.display_order_hint); + set_rel_frame_dist(&cpi->common, &cpi->ref_frame_dist_info, + cpi->ref_frame_flags); + av1_setup_frame_sign_bias(cm); + + // If global motion is enabled, then every buffer which is used as either + // a source or a ref frame should have an image pyramid allocated. 
+ // Check here so that issues can be caught early in debug mode +#if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY + if (cpi->image_pyramid_levels > 0) { + assert(cpi->source->y_pyramid); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + assert(buf->buf.y_pyramid); + } + } + } +#endif // !defined(NDEBUG) && !CONFIG_REALTIME_ONLY + +#if CONFIG_MISMATCH_DEBUG + mismatch_reset_frame(av1_num_planes(cm)); +#endif + + rdc->newmv_or_intra_blocks = 0; + cpi->palette_pixel_num = 0; + + if (cpi->sf.hl_sf.frame_parameter_update || + cpi->sf.rt_sf.use_comp_ref_nonrd) { + if (frame_is_intra_only(cm)) + current_frame->reference_mode = SINGLE_REFERENCE; + else + current_frame->reference_mode = REFERENCE_MODE_SELECT; + + features->interp_filter = SWITCHABLE; + if (cm->tiles.large_scale) features->interp_filter = EIGHTTAP_REGULAR; + + features->switchable_motion_mode = is_switchable_motion_mode_allowed( + features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc); + + rdc->compound_ref_used_flag = 0; + rdc->skip_mode_used_flag = 0; + + encode_frame_internal(cpi); + + if (current_frame->reference_mode == REFERENCE_MODE_SELECT) { + // Use a flag that includes 4x4 blocks + if (rdc->compound_ref_used_flag == 0) { + current_frame->reference_mode = SINGLE_REFERENCE; +#if CONFIG_ENTROPY_STATS + av1_zero(cpi->td.counts->comp_inter); +#endif // CONFIG_ENTROPY_STATS + } + } + // Re-check on the skip mode status as reference mode may have been + // changed. + SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info; + if (frame_is_intra_only(cm) || + current_frame->reference_mode == SINGLE_REFERENCE) { + skip_mode_info->skip_mode_allowed = 0; + skip_mode_info->skip_mode_flag = 0; + } + if (skip_mode_info->skip_mode_flag && rdc->skip_mode_used_flag == 0) + skip_mode_info->skip_mode_flag = 0; + + if (!cm->tiles.large_scale) { + if (features->tx_mode == TX_MODE_SELECT && + cpi->td.mb.txfm_search_info.txb_split_count == 0) + features->tx_mode = TX_MODE_LARGEST; + } + } else { + // This is needed if real-time speed setting is changed on the fly + // from one using compound prediction to one using single reference. + if (current_frame->reference_mode == REFERENCE_MODE_SELECT) + current_frame->reference_mode = SINGLE_REFERENCE; + encode_frame_internal(cpi); + } +} diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h new file mode 100644 index 0000000000..ce32fb47e6 --- /dev/null +++ b/third_party/aom/av1/encoder/encodeframe.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_ +#define AOM_AV1_ENCODER_ENCODEFRAME_H_ + +#include "aom/aom_integer.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +#include "av1/encoder/global_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DELTA_Q_PERCEPTUAL_MODULATION \ + 1 // 0: variance based + // 1: wavelet AC energy based + +struct macroblock; +struct yv12_buffer_config; +struct AV1_COMP; +struct ThreadData; + +void av1_init_rtc_counters(struct macroblock *const x); + +void av1_accumulate_rtc_counters(struct AV1_COMP *cpi, + const struct macroblock *const x); + +void av1_setup_src_planes(struct macroblock *x, + const struct yv12_buffer_config *src, int mi_row, + int mi_col, const int num_planes, BLOCK_SIZE bsize); + +void av1_encode_frame(struct AV1_COMP *cpi); + +void av1_alloc_tile_data(struct AV1_COMP *cpi); +void av1_init_tile_data(struct AV1_COMP *cpi); +void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, + int tile_col); +void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_ diff --git a/third_party/aom/av1/encoder/encodeframe_utils.c b/third_party/aom/av1/encoder/encodeframe_utils.c new file mode 100644 index 0000000000..949837184a --- /dev/null +++ b/third_party/aom/av1/encoder/encodeframe_utils.c @@ -0,0 +1,1775 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common_data.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/rdopt.h" + +void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + + const BLOCK_SIZE bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 1.0; + + // To avoid overflow of 'geom_mean_of_scale', bsize_base must be at least + // BLOCK_8X8. + // + // For bsize=BLOCK_128X128 and bsize_base=BLOCK_8X8, the loop below would + // iterate 256 times. Considering the maximum value of + // cpi->ssim_rdmult_scaling_factors (see av1_set_mb_ssim_rdmult_scaling()), + // geom_mean_of_scale can go up to 4.8323^256, which is within DBL_MAX + // (maximum value a double data type can hold). If bsize_base is modified to + // BLOCK_4X4 (minimum possible block size), geom_mean_of_scale can go up + // to 4.8323^1024 and exceed DBL_MAX, resulting in data overflow. 
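+ // With bsize_base equal to BLOCK_16X16 here, a 128x128 block visits at most 64 scale factors in the loop below.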
+ assert(bsize_base >= BLOCK_8X8); + assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM); + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + assert(cpi->ssim_rdmult_scaling_factors[index] != 0.0); + geom_mean_of_scale *= cpi->ssim_rdmult_scaling_factors[index]; + num_of_mi += 1.0; + } + } + geom_mean_of_scale = pow(geom_mean_of_scale, (1.0 / num_of_mi)); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + av1_set_error_per_bit(errorperbit, *rdmult); +} + +#if CONFIG_SALIENCY_MAP +void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi, + int *errorperbit, const BLOCK_SIZE bsize, + const int mi_row, const int mi_col, + int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + const int num_mi_w = mi_size_wide[bsize]; + const int num_mi_h = mi_size_high[bsize]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + + *rdmult = + (int)(*rdmult * cpi->sm_scaling_factor[(mi_row / num_mi_h) * num_cols + + (mi_col / num_mi_w)]); + + *rdmult = AOMMAX(*rdmult, 0); + av1_set_error_per_bit(errorperbit, *rdmult); +} +#endif + +// TODO(angiebird): Move these function to tpl_model.c +#if !CONFIG_REALTIME_ONLY +// Return the end column for the current superblock, in unit of TPL blocks. +static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col, + int num_mi_w) { + // Find the start column of this superblock. + const int sb_mi_col_start = (mi_col >> cm->seq_params->mib_size_log2) + << cm->seq_params->mib_size_log2; + // Same but in superres upscaled dimension. + const int sb_mi_col_start_sr = + coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator); + // Width of this superblock in mi units. + const int sb_mi_width = mi_size_wide[cm->seq_params->sb_size]; + // Same but in superres upscaled dimension. + const int sb_mi_width_sr = + coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator); + // Superblock end in mi units. + const int sb_mi_end = sb_mi_col_start_sr + sb_mi_width_sr; + // Superblock end in TPL units. 
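+ // Round up so that a partially covered TPL block is still counted.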
+ return (sb_mi_end + num_mi_w - 1) / num_mi_w; +} + +int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + int deltaq_rdmult = set_rdmult(cpi, x, -1); + if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult; + if (cm->superres_scale_denominator != SCALE_NUMERATOR) return deltaq_rdmult; + if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult; + if (x->rb == 0) return deltaq_rdmult; + + TplParams *const tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + int tpl_stride = tpl_frame->stride; + double intra_cost_base = 0; + double mc_dep_cost_base = 0; + double cbcmp_base = 0; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + + for (int row = mi_row; row < mi_row + mi_high; row += step) { + for (int col = mi_col; col < mi_col + mi_wide; col += step) { + if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols) + continue; + + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + + double cbcmp = (double)this_stats->srcrf_dist; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); + intra_cost_base += log(dist_scaled) * cbcmp; + mc_dep_cost_base += log(3 * dist_scaled + mc_dep_delta) * cbcmp; + cbcmp_base += cbcmp; + } + } + + if (cbcmp_base == 0) return deltaq_rdmult; + + double rk = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base); + deltaq_rdmult = (int)(deltaq_rdmult * (rk / x->rb)); + + return AOMMAX(deltaq_rdmult, 1); +} + +int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int orig_rdmult) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + const int deltaq_rdmult = set_rdmult(cpi, x, -1); + if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) + return deltaq_rdmult; + if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult; + + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int block_mi_width_sr = + coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator); + + const BLOCK_SIZE bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (block_mi_width_sr + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + // This is required because the end col of superblock may be off by 1 in case + // of superres. 
+ const int sb_bcol_end = get_superblock_tpl_column_end(cm, mi_col, num_mi_w); + int row, col; + double base_block_count = 0.0; + double geom_mean_of_scale = 0.0; + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col_sr / num_mi_h; + col < num_cols && col < mi_col_sr / num_mi_h + num_bcols && + col < sb_bcol_end; + ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]); + base_block_count += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count); + int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5); + rdmult = AOMMAX(rdmult, 0); + av1_set_error_per_bit(&x->errorperbit, rdmult); +#if !CONFIG_RD_COMMAND + if (bsize == cm->seq_params->sb_size) { + const int rdmult_sb = set_rdmult(cpi, x, -1); + assert(rdmult_sb == rdmult); + (void)rdmult_sb; + } +#endif // !CONFIG_RD_COMMAND + return rdmult; +} +#endif // !CONFIG_REALTIME_ONLY + +static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts, + const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + + // Only allow the 3 valid SWITCHABLE_FILTERS. + assert(filter < SWITCHABLE_FILTERS); + ++counts->switchable_interp[ctx][filter]; + } +} + +// This function will copy the best reference mode information from +// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT. +static INLINE void copy_mbmi_ext_frame_to_mbmi_ext( + MB_MODE_INFO_EXT *mbmi_ext, + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) { + memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight, + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context; + mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count; + memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +void av1_update_state(const AV1_COMP *const cpi, ThreadData *td, + const PICK_MODE_CONTEXT *const ctx, int mi_row, + int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) { + int i, x_idx, y; + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const MB_MODE_INFO *const mi = &ctx->mic; + MB_MODE_INFO *const mi_addr = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; + assert(bsize < BLOCK_SIZES_ALL); + const int bw = mi_size_wide[mi->bsize]; + const int bh = mi_size_high[mi->bsize]; + const int mis = mi_params->mi_stride; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + + assert(mi->bsize == bsize); + + *mi_addr = *mi; + copy_mbmi_ext_frame_to_mbmi_ext(&x->mbmi_ext, &ctx->mbmi_ext_best, + av1_ref_frame_type(ctx->mic.ref_frame)); + + memcpy(txfm_info->blk_skip, ctx->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + + txfm_info->skip_txfm = ctx->rd_stats.skip_txfm; + + xd->tx_type_map = ctx->tx_type_map; 
+ xd->tx_type_map_stride = mi_size_wide[bsize]; + // If not dry_run, copy the transform type data into the frame level buffer. + // Encoder will fetch tx types when writing bitstream. + if (!dry_run) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx; + const int mi_stride = mi_params->mi_stride; + for (int blk_row = 0; blk_row < bh; ++blk_row) { + av1_copy_array(tx_type_map + blk_row * mi_stride, + xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw); + } + xd->tx_type_map = tx_type_map; + xd->tx_type_map_stride = mi_stride; + } + + // If segmentation in use + if (seg->enabled) { + // For in frame complexity AQ copy the segment id from the segment map. + if (cpi->oxcf.q_cfg.aq_mode == COMPLEXITY_AQ) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + mi_addr->segment_id = + map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0; + } + // Else for cyclic refresh mode update the segment map, set the segment id + // and then update the quantizer. + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + !cpi->rc.rtc_external_ratectrl) { + av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize, + ctx->rd_stats.rate, ctx->rd_stats.dist, + txfm_info->skip_txfm, dry_run); + } + if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd)) + mi_addr->uv_mode = UV_DC_PRED; + + if (!dry_run && !mi_addr->skip_txfm) { + int cdf_num; + const uint8_t spatial_pred = av1_get_spatial_seg_pred( + cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4); + const uint8_t coded_id = av1_neg_interleave( + mi_addr->segment_id, spatial_pred, seg->last_active_segid + 1); + int64_t spatial_cost = x->mode_costs.spatial_pred_cost[cdf_num][coded_id]; + td->rd_counts.seg_tmp_pred_cost[0] += spatial_cost; + + const int pred_segment_id = + cm->last_frame_seg_map + ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row, + mi_col) + : 0; + const int use_tmp_pred = pred_segment_id == mi_addr->segment_id; + const uint8_t tmp_pred_ctx = av1_get_pred_context_seg_id(xd); + td->rd_counts.seg_tmp_pred_cost[1] += + x->mode_costs.tmp_pred_cost[tmp_pred_ctx][use_tmp_pred]; + if (!use_tmp_pred) { + td->rd_counts.seg_tmp_pred_cost[1] += spatial_cost; + } + } + } + + // Count zero motion vector. + if (!dry_run && !frame_is_intra_only(cm)) { + const MV mv = mi->mv[0].as_mv; + if (is_inter_block(mi) && mi->ref_frame[0] == LAST_FRAME && + abs(mv.row) < 8 && abs(mv.col) < 8) { + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); + // Accumulate low_content_frame. 
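+ // cnt_zeromv is tracked in 4x4 block units: stepping two mi rows per iteration and adding bw << 1 approximates the bw * ymis area of this block.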
+ for (int mi_y = 0; mi_y < ymis; mi_y += 2) x->cnt_zeromv += bw << 1; + } + } + + for (i = 0; i < num_planes; ++i) { + p[i].coeff = ctx->coeff[i]; + p[i].qcoeff = ctx->qcoeff[i]; + p[i].dqcoeff = ctx->dqcoeff[i]; + p[i].eobs = ctx->eobs[i]; + p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + } + for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; + // Restore the coding context of the MB to that that was in place + // when the mode was picked for it + + const int cols = + AOMMIN((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width, mi_width); + const int rows = AOMMIN( + (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height, mi_height); + for (y = 0; y < rows; y++) { + for (x_idx = 0; x_idx < cols; x_idx++) xd->mi[x_idx + y * mis] = mi_addr; + } + + if (cpi->oxcf.q_cfg.aq_mode) + av1_init_plane_quantizers(cpi, x, mi_addr->segment_id, 0); + + if (dry_run) return; + +#if CONFIG_INTERNAL_STATS + { + unsigned int *const mode_chosen_counts = + (unsigned int *)cpi->mode_chosen_counts; // Cast const away. + if (frame_is_intra_only(cm)) { + static const int kf_mode_index[] = { + THR_DC /*DC_PRED*/, + THR_V_PRED /*V_PRED*/, + THR_H_PRED /*H_PRED*/, + THR_D45_PRED /*D45_PRED*/, + THR_D135_PRED /*D135_PRED*/, + THR_D113_PRED /*D113_PRED*/, + THR_D157_PRED /*D157_PRED*/, + THR_D203_PRED /*D203_PRED*/, + THR_D67_PRED /*D67_PRED*/, + THR_SMOOTH, /*SMOOTH_PRED*/ + THR_SMOOTH_V, /*SMOOTH_V_PRED*/ + THR_SMOOTH_H, /*SMOOTH_H_PRED*/ + THR_PAETH /*PAETH_PRED*/, + }; + ++mode_chosen_counts[kf_mode_index[mi_addr->mode]]; + } else { + // Note how often each mode chosen as best + ++mode_chosen_counts[ctx->best_mode_index]; + } + } +#endif + if (!frame_is_intra_only(cm)) { + if (is_inter_block(mi) && cm->features.interp_filter == SWITCHABLE) { + // When the frame interp filter is SWITCHABLE, several cases that always + // use the default type (EIGHTTAP_REGULAR) are described in + // av1_is_interp_needed(). Here, we should keep the counts for all + // applicable blocks, so the frame filter resetting decision in + // fix_interp_filter() is made correctly. 
+ update_filter_type_count(td->counts, xd, mi_addr); + } + } + + const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col); + const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row); + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) + av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); +} + +void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts, + PREDICTION_MODE mode, int16_t mode_context) { + (void)counts; + + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + if (mode == NEWMV) { +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][0]; +#endif + update_cdf(fc->newmv_cdf[mode_ctx], 0, 2); + return; + } + +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][1]; +#endif + update_cdf(fc->newmv_cdf[mode_ctx], 1, 2); + + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mode == GLOBALMV) { +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][0]; +#endif + update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2); + return; + } + +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][1]; +#endif + update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2); + + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; +#if CONFIG_ENTROPY_STATS + ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; +#endif + update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2); +} + +static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + FRAME_COUNTS *counts) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const BLOCK_SIZE bsize = mbmi->bsize; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + (void)counts; + + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); + +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0]; +#endif + update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx], + n > 0, 2); + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } + + if (mbmi->uv_mode == UV_DC_PRED) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0]; +#endif + update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2); + + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } +} + +void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts, + MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, const int intraonly) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const PREDICTION_MODE y_mode = mbmi->mode; + (void)counts; + const BLOCK_SIZE bsize = mbmi->bsize; + + if (intraonly) { +#if CONFIG_ENTROPY_STATS + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; + ++counts->kf_y_mode[above_ctx][left_ctx][y_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES); + } else { +#if 
CONFIG_ENTROPY_STATS + ++counts->y_mode[size_group_lookup[bsize]][y_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES); + } + + if (av1_filter_intra_allowed(cm, mbmi)) { + const int use_filter_intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra; +#if CONFIG_ENTROPY_STATS + ++counts->filter_intra[mbmi->bsize][use_filter_intra_mode]; + if (use_filter_intra_mode) { + ++counts + ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode]; + } +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->filter_intra_cdfs[mbmi->bsize], use_filter_intra_mode, 2); + if (use_filter_intra_mode) { + update_cdf(fc->filter_intra_mode_cdf, + mbmi->filter_intra_mode_info.filter_intra_mode, + FILTER_INTRA_MODES); + } + } + if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[mbmi->mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA]; +#endif + update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED], + mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + + if (!xd->is_chroma_ref) return; + + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); +#if CONFIG_ENTROPY_STATS + ++counts->uv_mode[cfl_allowed][y_mode][uv_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode, + UV_INTRA_MODES - !cfl_allowed); + if (uv_mode == UV_CFL_PRED) { + const int8_t joint_sign = mbmi->cfl_alpha_signs; + const uint8_t idx = mbmi->cfl_alpha_idx; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_sign[joint_sign]; +#endif + update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS); + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)]; +#endif + update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)]; +#endif + update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE); + } + } + const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); + if (av1_is_directional_mode(intra_mode) && av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[intra_mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA]; +#endif + update_cdf(fc->angle_delta_cdf[intra_mode - V_PRED], + mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + update_palette_cdf(xd, mbmi, counts); + } +} + +void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { + MACROBLOCKD *xd = &x->e_mbd; + int p; + const int num_4x4_blocks_wide = mi_size_wide[bsize]; + const int num_4x4_blocks_high = mi_size_high[bsize]; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + for (p = 0; p < num_planes; p++) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; + memcpy( + xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), + ctx->a + num_4x4_blocks_wide * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + 
memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), + ctx->l + num_4x4_blocks_high * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } + memcpy(xd->above_partition_context + mi_col, ctx->sa, + sizeof(*xd->above_partition_context) * mi_width); + memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl, + sizeof(xd->left_partition_context[0]) * mi_height); + xd->above_txfm_context = ctx->p_ta; + xd->left_txfm_context = ctx->p_tl; + memcpy(xd->above_txfm_context, ctx->ta, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(xd->left_txfm_context, ctx->tl, + sizeof(*xd->left_txfm_context) * mi_height); +} + +void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { + const MACROBLOCKD *xd = &x->e_mbd; + int p; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + + // buffer the above/left context information of the block in search. + for (p = 0; p < num_planes; ++p) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; + memcpy( + ctx->a + mi_width * p, + xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), + (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x); + memcpy(ctx->l + mi_height * p, + xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), + (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y); + } + memcpy(ctx->sa, xd->above_partition_context + mi_col, + sizeof(*xd->above_partition_context) * mi_width); + memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK), + sizeof(xd->left_partition_context[0]) * mi_height); + memcpy(ctx->ta, xd->above_txfm_context, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(ctx->tl, xd->left_txfm_context, + sizeof(*xd->left_txfm_context) * mi_height); + ctx->p_ta = xd->above_txfm_context; + ctx->p_tl = xd->left_txfm_context; +} + +static void set_partial_sb_partition(const AV1_COMMON *const cm, + MB_MODE_INFO *mi, int bh_in, int bw_in, + int mi_rows_remaining, + int mi_cols_remaining, BLOCK_SIZE bsize, + MB_MODE_INFO **mib) { + int bh = bh_in; + int r, c; + for (r = 0; r < cm->seq_params->mib_size; r += bh) { + int bw = bw_in; + for (c = 0; c < cm->seq_params->mib_size; c += bw) { + const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c); + const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c); + mib[grid_index] = mi + mi_index; + mib[grid_index]->bsize = find_partition_size( + bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw); + } + } +} + +// This function attempts to set all mode info entries in a given superblock +// to the same block partition size. +// However, at the bottom and right borders of the image the requested size +// may not be allowed in which case this code attempts to choose the largest +// allowable partition. 
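+// When the whole superblock is inside the image the requested size is applied directly; otherwise set_partial_sb_partition() picks the largest size that fits each remaining region.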
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + MB_MODE_INFO **mib, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_rows_remaining = tile->mi_row_end - mi_row; + const int mi_cols_remaining = tile->mi_col_end - mi_col; + MB_MODE_INFO *const mi_upper_left = + mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col); + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; + + assert(bsize >= mi_params->mi_alloc_bsize && + "Attempted to use bsize < mi_params->mi_alloc_bsize"); + assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); + + // Apply the requested partition size to the SB if it is all "in image" + if ((mi_cols_remaining >= cm->seq_params->mib_size) && + (mi_rows_remaining >= cm->seq_params->mib_size)) { + for (int block_row = 0; block_row < cm->seq_params->mib_size; + block_row += bh) { + for (int block_col = 0; block_col < cm->seq_params->mib_size; + block_col += bw) { + const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col); + const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col); + mib[grid_index] = mi_upper_left + mi_index; + mib[grid_index]->bsize = bsize; + } + } + } else { + // Else this is a partial SB. + set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining, + mi_cols_remaining, bsize, mib); + } +} + +int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + assert(bsize >= BLOCK_8X8); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + + for (int i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= cm->mi_params.mi_rows) || + (mi_col + x_idx >= cm->mi_params.mi_cols)) + return 0; + if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) != + PARTITION_NONE && + subsize != BLOCK_8X8) + return 0; + } + return 1; +} + +#if !CONFIG_REALTIME_ONLY +int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int orig_rdmult) { + AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + + if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) { + return orig_rdmult; + } + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { + return orig_rdmult; + } + +#ifndef NDEBUG + int mi_count = 0; +#endif + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int step = 1 << block_mis_log2; + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + for (int row = mi_row; row < mi_row + mi_high; row += 
row_step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { + if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue; + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost += this_stats->recrf_dist << RDDIV_BITS; + mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; +#ifndef NDEBUG + mi_count++; +#endif + } + } + assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); + + double beta = 1.0; + if (mc_dep_cost > 0 && intra_cost > 0) { + const double r0 = cpi->rd.r0; + const double rk = (double)intra_cost / mc_dep_cost; + beta = (r0 / rk); + } + + int rdmult = av1_get_adaptive_rdmult(cpi, beta); + + rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2); + rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2); + + rdmult = AOMMAX(1, rdmult); + + return rdmult; +} + +// Checks to see if a super block is on a horizontal image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) { + int top_edge = 0; + int bottom_edge = cpi->common.mi_params.mi_rows; + int is_active_h_edge = 0; + + // For two pass account for any formatting bars detected. + if (is_stat_consumption_stage_twopass(cpi)) { + const AV1_COMMON *const cm = &cpi->common; + const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats( + &cpi->ppi->twopass, cm->current_frame.display_order_hint); + if (this_frame_stats == NULL) return AOM_CODEC_ERROR; + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB row. + top_edge += (int)(this_frame_stats->inactive_zone_rows * 4); + + bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4); + bottom_edge = AOMMAX(top_edge, bottom_edge); + } + + if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) || + ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) { + is_active_h_edge = 1; + } + return is_active_h_edge; +} + +// Checks to see if a super block is on a vertical image edge. +// In most cases this is the "real" edge unless there are formatting +// bars embedded in the stream. +int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { + int left_edge = 0; + int right_edge = cpi->common.mi_params.mi_cols; + int is_active_v_edge = 0; + + // For two pass account for any formatting bars detected. + if (is_stat_consumption_stage_twopass(cpi)) { + const AV1_COMMON *const cm = &cpi->common; + const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats( + &cpi->ppi->twopass, cm->current_frame.display_order_hint); + if (this_frame_stats == NULL) return AOM_CODEC_ERROR; + + // The inactive region is specified in MBs not mi units. + // The image edge is in the following MB row. 
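+ // A 16x16 MB spans 4 mi (4x4) units, hence the factor of 4 in the conversion below.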
+ left_edge += (int)(this_frame_stats->inactive_zone_cols * 4); + + right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4); + right_edge = AOMMAX(left_edge, right_edge); + } + + if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || + ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { + is_active_v_edge = 1; + } + return is_active_v_edge; +} + +void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, SuperBlockEnc *sb_enc) { + sb_enc->tpl_data_count = 0; + + if (!cpi->oxcf.algo_cfg.enable_tpl_model) return; + if (cpi->common.current_frame.frame_type == KEY_FRAME) return; + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE) + return; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + + AV1_COMMON *const cm = &cpi->common; + const int gf_group_index = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + if (!av1_tpl_stats_ready(tpl_data, gf_group_index)) return; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + + int mi_count = 0; + int count = 0; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + // mi_cols_sr is mi_cols at superres case. + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + // TPL store unit size is not the same as the motion estimation unit size. + // Here always use motion estimation size to avoid getting repetitive inter/ + // intra cost. + const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d); + assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]); + const int row_step = mi_size_high[tpl_bsize]; + const int col_step_sr = coded_to_superres_mi(mi_size_wide[tpl_bsize], + cm->superres_scale_denominator); + + // Stride is only based on SB size, and we fill in values for every 16x16 + // block in a SB. + sb_enc->tpl_stride = (mi_col_end_sr - mi_col_sr) / col_step_sr; + + for (int row = mi_row; row < mi_row + mi_high; row += row_step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { + assert(count < MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); + // Handle partial SB, so that no invalid values are used later. 
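+ // Out-of-frame positions are filled with INT64_MAX costs and INVALID_MV so that later consumers ignore them.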
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) { + sb_enc->tpl_inter_cost[count] = INT64_MAX; + sb_enc->tpl_intra_cost[count] = INT64_MAX; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + sb_enc->tpl_mv[count][i].as_int = INVALID_MV; + } + count++; + continue; + } + + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + sb_enc->tpl_inter_cost[count] = this_stats->inter_cost + << TPL_DEP_COST_SCALE_LOG2; + sb_enc->tpl_intra_cost[count] = this_stats->intra_cost + << TPL_DEP_COST_SCALE_LOG2; + memcpy(sb_enc->tpl_mv[count], this_stats->mv, sizeof(this_stats->mv)); + mi_count++; + count++; + } + } + + assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); + sb_enc->tpl_data_count = mi_count; +} + +// analysis_type 0: Use mc_dep_cost and intra_cost +// analysis_type 1: Use count of best inter predictor chosen +// analysis_type 2: Use cost reduction from intra to inter for best inter +// predictor chosen +int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td, + int64_t *delta_dist, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + double intra_cost = 0; + double mc_dep_reg = 0; + double mc_dep_cost = 0; + double cbcmp_base = 1; + double srcrf_dist = 0; + double srcrf_sse = 0; + double srcrf_rate = 0; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int base_qindex = cm->quant_params.base_qindex; + + if (tpl_idx >= MAX_TPL_FRAME_IDX) return base_qindex; + + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + if (!tpl_frame->is_valid) return base_qindex; + +#ifndef NDEBUG + int mi_count = 0; +#endif + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int step = 1 << block_mis_log2; + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + for (int row = mi_row; row < mi_row + mi_high; row += row_step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) { + if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue; + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + double cbcmp = (double)this_stats->srcrf_dist; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); + intra_cost += log(dist_scaled) * cbcmp; + mc_dep_cost += log(dist_scaled + mc_dep_delta) * cbcmp; + mc_dep_reg += log(3 * dist_scaled + mc_dep_delta) * cbcmp; + srcrf_dist += (double)(this_stats->srcrf_dist << RDDIV_BITS); + srcrf_sse += (double)(this_stats->srcrf_sse << RDDIV_BITS); + srcrf_rate += (double)(this_stats->srcrf_rate << TPL_DEP_COST_SCALE_LOG2); +#ifndef NDEBUG + mi_count++; +#endif + cbcmp_base += cbcmp; + } + } + assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); + + int offset = 0; + double beta = 1.0; + double 
rk; + if (mc_dep_cost > 0 && intra_cost > 0) { + const double r0 = cpi->rd.r0; + rk = exp((intra_cost - mc_dep_cost) / cbcmp_base); + td->mb.rb = exp((intra_cost - mc_dep_reg) / cbcmp_base); + beta = (r0 / rk); + assert(beta > 0.0); + } else { + return base_qindex; + } + offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta); + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); + offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + int frm_qstep = av1_dc_quant_QTX(base_qindex, 0, cm->seq_params->bit_depth); + int sbs_qstep = + av1_dc_quant_QTX(base_qindex, offset, cm->seq_params->bit_depth); + + if (delta_dist) { + double sbs_dist = srcrf_dist * pow((double)sbs_qstep / frm_qstep, 2.0); + double sbs_rate = srcrf_rate * ((double)frm_qstep / sbs_qstep); + sbs_dist = AOMMIN(sbs_dist, srcrf_sse); + *delta_dist = (int64_t)((sbs_dist - srcrf_dist) / rk); + *delta_dist += RDCOST(tpl_frame->base_rdmult, 4 * 256, 0); + *delta_dist += RDCOST(tpl_frame->base_rdmult, sbs_rate - srcrf_rate, 0); + } + return qindex; +} + +#if !DISABLE_HDR_LUMA_DELTAQ +// offset table defined in Table3 of T-REC-H.Sup15 document. +static const int hdr_thres[HDR_QP_LEVELS + 1] = { 0, 301, 367, 434, 501, 567, + 634, 701, 767, 834, 1024 }; + +static const int hdr10_qp_offset[HDR_QP_LEVELS] = { 3, 2, 1, 0, -1, + -2, -3, -4, -5, -6 }; +#endif + +int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + assert(cm->seq_params->bit_depth == AOM_BITS_10); + +#if DISABLE_HDR_LUMA_DELTAQ + (void)x; + (void)bsize; + (void)mi_row; + (void)mi_col; + return cm->quant_params.base_qindex; +#else + // calculate pixel average + const int block_luma_avg = av1_log_block_avg(cpi, x, bsize, mi_row, mi_col); + // adjust offset based on average of the pixel block + int offset = 0; + for (int i = 0; i < HDR_QP_LEVELS; i++) { + if (block_luma_avg >= hdr_thres[i] && block_luma_avg < hdr_thres[i + 1]) { + offset = (int)(hdr10_qp_offset[i] * QP_SCALE_FACTOR); + break; + } + } + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); + offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + return qindex; +#endif +} +#endif // !CONFIG_REALTIME_ONLY + +void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree, + BLOCK_SIZE bsize) { + if (sms_tree == NULL) return; + sms_tree->partitioning = PARTITION_NONE; + + if (bsize >= BLOCK_8X8) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int idx = 0; idx < 4; ++idx) + av1_reset_simple_motion_tree_partition(sms_tree->split[idx], subsize); + } +} + +// Record the ref frames that have been selected by square partition blocks. 
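+// Each mi position in the superblock accumulates a bitmask of the picked reference types; the mask buffer uses a fixed stride of 32 mi units.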
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type, + BLOCK_SIZE bsize, int mib_size, + int mi_row, int mi_col) { + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + const int sb_size_mask = mib_size - 1; + const int mi_row_in_sb = mi_row & sb_size_mask; + const int mi_col_in_sb = mi_col & sb_size_mask; + const int mi_size = mi_size_wide[bsize]; + for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) { + for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) { + x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type; + } + } +} + +static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr, + int num_cdfs, int cdf_stride, int nsymbs, + int wt_left, int wt_tr) { + for (int i = 0; i < num_cdfs; i++) { + for (int j = 0; j <= nsymbs; j++) { + cdf_ptr_left[i * cdf_stride + j] = + (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left + + (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr + + ((wt_left + wt_tr) / 2)) / + (wt_left + wt_tr)); + assert(cdf_ptr_left[i * cdf_stride + j] >= 0 && + cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP); + } + } +} + +#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \ + AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs)) + +#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left; \ + aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr; \ + int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \ + wt_left, wt_tr); \ + } while (0) + +static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left, + int wt_tr) { + AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf, + MV_CLASSES); + AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf, + nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf, + nmv_tr->comps[i].class0_hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf, + CLASS0_SIZE); + AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2); + } +} + +// In case of row-based multi-threading of encoder, since we always +// keep a top - right sync, we can average the top - right SB's CDFs and +// the left SB's CDFs and use the same for current SB's encoding to +// improve the performance. This function facilitates the averaging +// of CDF and used only when row-mt is enabled in encoder. 
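+// Illustrative note (hedged; not from upstream libaom): avg_cdf_symbol()
+// above computes a rounded weighted mean of each pair of CDF entries,
+//   avg = (left * wt_left + tr * wt_tr + (wt_left + wt_tr) / 2)
+//         / (wt_left + wt_tr).
+// For example, with wt_left = 3, wt_tr = 1, left = 10000 and tr = 14000 the
+// merged entry is (30000 + 14000 + 2) / 4 = 11000, i.e. the result leans
+// toward whichever neighbour the caller weights more heavily (here the left
+// superblock).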
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr, + int wt_left, int wt_tr) { + AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2); + AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2); + AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2); + AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5); + AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6); + AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7); + AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8); + AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9); + AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10); + AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11); + AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3); + AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4); + AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE); + AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2); + AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2); + AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2); + AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2); + AVERAGE_CDF(ctx_left->inter_compound_mode_cdf, + ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf, + MASKED_COMPOUND_TYPES); + AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16); + AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2); + AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2); + AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf, + INTERINTRA_MODES); + AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES); + AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2); + AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf, + PALETTE_SIZES); + AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf, + PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j], + ctx_tr->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j], + ctx_tr->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + } + AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2); + AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2); + AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2); + AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2); + AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2); + AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2); + AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2); + AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2); + AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2); + AVERAGE_CDF(ctx_left->skip_txfm_cdfs, ctx_tr->skip_txfm_cdfs, 2); + AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2); + avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr); + avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr); + 
AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2); + AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2); + AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf, + ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2); + AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf, + FILTER_INTRA_MODES); + AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); + AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2); + AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2); + AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES); + AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0], + UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES)); + AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i < 4) { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4, + CDF_SIZE(10)); + } else if (i < 16) { + AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10); + } else { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8, + CDF_SIZE(10)); + } + } + AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf, + SWITCHABLE_FILTERS); + AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES); + AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf, + 2 * MAX_ANGLE_DELTA + 1); + AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1); + AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i], + DELTA_LF_PROBS + 1); + } + AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2, + CDF_SIZE(TX_TYPES)); + AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS); + AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf, + CFL_ALPHABET_SIZE); +} + +// Check neighbor blocks' motion information. +static int check_neighbor_blocks(MB_MODE_INFO **mi, int mi_stride, + const TileInfo *const tile_info, int mi_row, + int mi_col) { + int is_above_low_motion = 1; + int is_left_low_motion = 1; + const int thr = 24; + + // Check above block. + if (mi_row > tile_info->mi_row_start) { + const MB_MODE_INFO *above_mbmi = mi[-mi_stride]; + const int_mv above_mv = above_mbmi->mv[0]; + if (above_mbmi->mode >= INTRA_MODE_END && + (abs(above_mv.as_mv.row) > thr || abs(above_mv.as_mv.col) > thr)) + is_above_low_motion = 0; + } + + // Check left block. 
+ if (mi_col > tile_info->mi_col_start) { + const MB_MODE_INFO *left_mbmi = mi[-1]; + const int_mv left_mv = left_mbmi->mv[0]; + if (left_mbmi->mode >= INTRA_MODE_END && + (abs(left_mv.as_mv.row) > thr || abs(left_mv.as_mv.col) > thr)) + is_left_low_motion = 0; + } + + return (is_above_low_motion && is_left_low_motion); +} + +// Check this block's motion in a fast way. +static int fast_detect_non_zero_motion(AV1_COMP *cpi, const uint8_t *src_y, + int src_ystride, + const uint8_t *last_src_y, + int last_src_ystride, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const BLOCK_SIZE bsize = cm->seq_params->sb_size; + unsigned int blk_sad = INT_MAX; + if (cpi->src_sad_blk_64x64 != NULL) { + const int sb_size_by_mb = (bsize == BLOCK_128X128) + ? (cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; + const int sb_cols = + (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; + const int sbi_col = mi_col / sb_size_by_mb; + const int sbi_row = mi_row / sb_size_by_mb; + blk_sad = (unsigned int)cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols]; + } else { + blk_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); + } + + // Search 4 1-away points. + const uint8_t *const search_pos[4] = { + last_src_y - last_src_ystride, + last_src_y - 1, + last_src_y + 1, + last_src_y + last_src_ystride, + }; + unsigned int sad_arr[4]; + cpi->ppi->fn_ptr[bsize].sdx4df(src_y, src_ystride, search_pos, + last_src_ystride, sad_arr); + + blk_sad = (blk_sad * 5) >> 3; + return (blk_sad < sad_arr[0] && blk_sad < sad_arr[1] && + blk_sad < sad_arr[2] && blk_sad < sad_arr[3]); +} + +// Grade the temporal variation of the source by comparing the current sb and +// its collocated block in the last frame. +void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, + int mi_row, int mi_col) { + if (cpi->last_source->y_width != cpi->source->y_width || + cpi->last_source->y_height != cpi->source->y_height) + return; +#if CONFIG_AV1_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return; +#endif + + unsigned int tmp_sse; + unsigned int tmp_variance; + const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size; + uint8_t *src_y = cpi->source->y_buffer; + const int src_ystride = cpi->source->y_stride; + const int src_offset = src_ystride * (mi_row << 2) + (mi_col << 2); + uint8_t *last_src_y = cpi->last_source->y_buffer; + const int last_src_ystride = cpi->last_source->y_stride; + const int last_src_offset = last_src_ystride * (mi_row << 2) + (mi_col << 2); + uint64_t avg_source_sse_threshold_verylow = 10000; // ~1.5*1.5*(64*64) + uint64_t avg_source_sse_threshold_low[2] = { 100000, // ~5*5*(64*64) + 36000 }; // ~3*3*(64*64) + + uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64) + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + avg_source_sse_threshold_high = avg_source_sse_threshold_high << 1; + avg_source_sse_threshold_low[0] = avg_source_sse_threshold_low[0] << 1; + avg_source_sse_threshold_verylow = avg_source_sse_threshold_verylow << 1; + } + uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5 + src_y += src_offset; + last_src_y += last_src_offset; + tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); + // rd thresholds + if (tmp_sse < avg_source_sse_threshold_low[1]) + x->content_state_sb.source_sad_rd = kLowSad; + + // nonrd thresholds + if (tmp_sse == 0) { + x->content_state_sb.source_sad_nonrd = kZeroSad; + return; + } + if 
(tmp_sse < avg_source_sse_threshold_verylow) + x->content_state_sb.source_sad_nonrd = kVeryLowSad; + else if (tmp_sse < avg_source_sse_threshold_low[0]) + x->content_state_sb.source_sad_nonrd = kLowSad; + else if (tmp_sse > avg_source_sse_threshold_high) + x->content_state_sb.source_sad_nonrd = kHighSad; + + // Detect large lighting change. + // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) + if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh) + x->content_state_sb.lighting_change = 1; + if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1)) + x->content_state_sb.low_sumdiff = 1; + + if (!cpi->sf.rt_sf.use_rtc_tf || cpi->rc.high_source_sad || + cpi->rc.frame_source_sad > 20000 || cpi->svc.number_spatial_layers > 1) + return; + + // In-place temporal filter. If psnr calculation is enabled, we store the + // source for that. + AV1_COMMON *const cm = &cpi->common; + // Calculate n*mean^2 + const unsigned int nmean2 = tmp_sse - tmp_variance; + const int ac_q_step = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, + cm->seq_params->bit_depth); + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const int avg_q_step = av1_ac_quant_QTX(p_rc->avg_frame_qindex[INTER_FRAME], + 0, cm->seq_params->bit_depth); + + const unsigned int threshold = + (cpi->sf.rt_sf.use_rtc_tf == 1) + ? (clamp(avg_q_step, 250, 1000)) * ac_q_step + : 250 * ac_q_step; + + // TODO(yunqing): use a weighted sum instead of averaging in filtering. + if (tmp_variance <= threshold && nmean2 <= 15) { + // Check neighbor blocks. If neighbor blocks aren't low-motion blocks, + // skip temporal filtering for this block. + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + const TileInfo *const tile_info = &tile_data->tile_info; + const int is_neighbor_blocks_low_motion = check_neighbor_blocks( + mi, cm->mi_params.mi_stride, tile_info, mi_row, mi_col); + if (!is_neighbor_blocks_low_motion) return; + + // Only consider 64x64 SB for now. Need to extend to 128x128 for large SB + // size. + // Test several nearby points. If non-zero mv exists, don't do temporal + // filtering. 
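+// Illustrative note (hedged; not from upstream libaom): the nearby-point test
+// described above is fast_detect_non_zero_motion(), called just below. It
+// scales the collocated SAD by 5/8 ((blk_sad * 5) >> 3), so e.g. a collocated
+// SAD of 800 becomes 500 and the block counts as (near) zero motion only if
+// all four one-pel-offset SADs exceed 500. When that holds, the in-place
+// filter further below averages the two sources per pixel:
+// src[j] = (last_src[j] + src[j]) >> 1.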
+ const int is_this_blk_low_motion = fast_detect_non_zero_motion( + cpi, src_y, src_ystride, last_src_y, last_src_ystride, mi_row, mi_col); + + if (!is_this_blk_low_motion) return; + + const int shift_x[2] = { 0, cpi->source->subsampling_x }; + const int shift_y[2] = { 0, cpi->source->subsampling_y }; + const uint8_t h = block_size_high[bsize]; + const uint8_t w = block_size_wide[bsize]; + + for (int plane = 0; plane < av1_num_planes(cm); ++plane) { + uint8_t *src = cpi->source->buffers[plane]; + const int src_stride = cpi->source->strides[plane != 0]; + uint8_t *last_src = cpi->last_source->buffers[plane]; + const int last_src_stride = cpi->last_source->strides[plane != 0]; + src += src_stride * (mi_row << (2 - shift_y[plane != 0])) + + (mi_col << (2 - shift_x[plane != 0])); + last_src += last_src_stride * (mi_row << (2 - shift_y[plane != 0])) + + (mi_col << (2 - shift_x[plane != 0])); + + for (int i = 0; i < (h >> shift_y[plane != 0]); ++i) { + for (int j = 0; j < (w >> shift_x[plane != 0]); ++j) { + src[j] = (last_src[j] + src[j]) >> 1; + } + src += src_stride; + last_src += last_src_stride; + } + } + } +} + +// Memset the mbmis at the current superblock to 0 +void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size, + int mi_row, int mi_col) { + // size of sb in unit of mi (BLOCK_4X4) + const int sb_size_mi = mi_size_wide[sb_size]; + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + // size of sb in unit of allocated mi size + const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d; + assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 && + "mi is not allocated as a multiple of sb!"); + assert(mi_params->mi_stride % sb_size_mi == 0 && + "mi_grid_base is not allocated as a multiple of sb!"); + + const int mi_rows = mi_size_high[sb_size]; + for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) { + assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) < + mi_params->mi_stride); + const int mi_grid_idx = + get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col); + const int alloc_mi_idx = + get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col); + memset(&mi_params->mi_grid_base[mi_grid_idx], 0, + sb_size_mi * sizeof(*mi_params->mi_grid_base)); + memset(&mi_params->tx_type_map[mi_grid_idx], 0, + sb_size_mi * sizeof(*mi_params->tx_type_map)); + if (cur_mi_row % mi_alloc_size_1d == 0) { + memset(&mi_params->mi_alloc[alloc_mi_idx], 0, + sb_size_alloc_mi * sizeof(*mi_params->mi_alloc)); + } + } +} + +void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi, + ThreadData *td, const TileDataEnc *tile_data, + int mi_row, int mi_col) { + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const TileInfo *tile_info = &tile_data->tile_info; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); + + sb_fp_stats->rd_count = td->rd_counts; + sb_fp_stats->split_count = x->txfm_search_info.txb_split_count; + + sb_fp_stats->fc = *td->counts; + + // Don't copy in row_mt case, otherwise run into data race. No behavior change + // in row_mt case. 
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models, + sizeof(sb_fp_stats->inter_mode_rd_models)); + } + + memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact, + sizeof(sb_fp_stats->thresh_freq_fact)); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + sb_fp_stats->current_qindex = + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; + +#if CONFIG_INTERNAL_STATS + memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts, + sizeof(sb_fp_stats->mode_chosen_counts)); +#endif // CONFIG_INTERNAL_STATS +} + +void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi, + ThreadData *td, TileDataEnc *tile_data, int mi_row, + int mi_col) { + MACROBLOCK *x = &td->mb; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + + av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, + num_planes); + + td->rd_counts = sb_fp_stats->rd_count; + x->txfm_search_info.txb_split_count = sb_fp_stats->split_count; + + *td->counts = sb_fp_stats->fc; + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models, + sizeof(sb_fp_stats->inter_mode_rd_models)); + } + + memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact, + sizeof(sb_fp_stats->thresh_freq_fact)); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = + sb_fp_stats->current_qindex; + +#if CONFIG_INTERNAL_STATS + memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts, + sizeof(sb_fp_stats->mode_chosen_counts)); +#endif // CONFIG_INTERNAL_STATS +} + +/*! Checks whether to skip updating the entropy cost based on tile info. + * + * This function contains the common code used to skip the cost update of coeff, + * mode, mv and dv symbols. + */ +static int skip_cost_update(const SequenceHeader *seq_params, + const TileInfo *const tile_info, const int mi_row, + const int mi_col, + INTERNAL_COST_UPDATE_TYPE upd_level) { + if (upd_level == INTERNAL_COST_UPD_SB) return 0; + if (upd_level == INTERNAL_COST_UPD_OFF) return 1; + + // upd_level is at most as frequent as each sb_row in a tile. + if (mi_col != tile_info->mi_col_start) return 1; + + if (upd_level == INTERNAL_COST_UPD_SBROW_SET) { + const int mib_size_log2 = seq_params->mib_size_log2; + const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; + const int sb_size = seq_params->mib_size * MI_SIZE; + const int tile_height = + (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE; + // When upd_level = INTERNAL_COST_UPD_SBROW_SET, the cost update happens + // once for 2, 4 sb rows for sb size 128, sb size 64 respectively. However, + // as the update will not be equally spaced in smaller resolutions making + // it equally spaced by calculating (mv_num_rows_cost_update) the number of + // rows after which the cost update should happen. + const int sb_size_update_freq_map[2] = { 2, 4 }; + const int update_freq_sb_rows = + sb_size_update_freq_map[sb_size != MAX_SB_SIZE]; + const int update_freq_num_rows = sb_size * update_freq_sb_rows; + // Round-up the division result to next integer. 
+ const int num_updates_per_tile = + (tile_height + update_freq_num_rows - 1) / update_freq_num_rows; + const int num_rows_update_per_tile = num_updates_per_tile * sb_size; + // Round-up the division result to next integer. + const int num_sb_rows_per_update = + (tile_height + num_rows_update_per_tile - 1) / num_rows_update_per_tile; + if ((sb_row % num_sb_rows_per_update) != 0) return 1; + } + return 0; +} + +// Checks for skip status of mv cost update. +static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + const AV1_COMMON *cm = &cpi->common; + // For intra frames, mv cdfs are not updated during the encode. Hence, the mv + // cost calculation is skipped in this case. + if (frame_is_intra_only(cm)) return 1; + + return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.inter_sf.mv_cost_upd_level); +} + +// Checks for skip status of dv cost update. +static int skip_dv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + const AV1_COMMON *cm = &cpi->common; + // Intrabc is only applicable to intra frames. So skip if intrabc is not + // allowed. + if (!av1_allow_intrabc(cm) || is_stat_generation_stage(cpi)) { + return 1; + } + + return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.intra_sf.dv_cost_upd_level); +} + +// Update the rate costs of some symbols according to the frequency directed +// by speed features +void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, + const TileInfo *const tile_info, const int mi_row, + const int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + + if (cm->features.disable_cdf_update) { + return; + } + + switch (cpi->sf.inter_sf.coeff_cost_upd_level) { + case INTERNAL_COST_UPD_OFF: + case INTERNAL_COST_UPD_TILE: // Tile level + break; + case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile + case INTERNAL_COST_UPD_SBROW: // SB row level in tile + case INTERNAL_COST_UPD_SB: // SB level + if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.inter_sf.coeff_cost_upd_level)) + break; + av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes); + break; + default: assert(0); + } + + switch (cpi->sf.inter_sf.mode_cost_upd_level) { + case INTERNAL_COST_UPD_OFF: + case INTERNAL_COST_UPD_TILE: // Tile level + break; + case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile + case INTERNAL_COST_UPD_SBROW: // SB row level in tile + case INTERNAL_COST_UPD_SB: // SB level + if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.inter_sf.mode_cost_upd_level)) + break; + av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx); + break; + default: assert(0); + } + + switch (cpi->sf.inter_sf.mv_cost_upd_level) { + case INTERNAL_COST_UPD_OFF: + case INTERNAL_COST_UPD_TILE: // Tile level + break; + case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile + case INTERNAL_COST_UPD_SBROW: // SB row level in tile + case INTERNAL_COST_UPD_SB: // SB level + // Checks for skip status of mv cost update. 
+ if (skip_mv_cost_update(cpi, tile_info, mi_row, mi_col)) break; + av1_fill_mv_costs(&xd->tile_ctx->nmvc, + cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, x->mv_costs); + break; + default: assert(0); + } + + switch (cpi->sf.intra_sf.dv_cost_upd_level) { + case INTERNAL_COST_UPD_OFF: + case INTERNAL_COST_UPD_TILE: // Tile level + break; + case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile + case INTERNAL_COST_UPD_SBROW: // SB row level in tile + case INTERNAL_COST_UPD_SB: // SB level + // Checks for skip status of dv cost update. + if (skip_dv_cost_update(cpi, tile_info, mi_row, mi_col)) break; + av1_fill_dv_costs(&xd->tile_ctx->ndvc, x->dv_costs); + break; + default: assert(0); + } +} + +void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes) { + for (int plane = 0; plane < num_planes; ++plane) { + aom_free(mb->plane[plane].src_diff); + mb->plane[plane].src_diff = NULL; + } +} + +void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) { + const int num_planes = av1_num_planes(cm); +#ifndef NDEBUG + for (int plane = 0; plane < num_planes; ++plane) { + assert(!mb->plane[plane].src_diff); + } +#endif + for (int plane = 0; plane < num_planes; ++plane) { + const int subsampling_xy = + plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y + : 0; + const int sb_size = MAX_SB_SQUARE >> subsampling_xy; + CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff, + (int16_t *)aom_memalign( + 32, sizeof(*mb->plane[plane].src_diff) * sb_size)); + } +} diff --git a/third_party/aom/av1/encoder/encodeframe_utils.h b/third_party/aom/av1/encoder/encodeframe_utils.h new file mode 100644 index 0000000000..14c71b8802 --- /dev/null +++ b/third_party/aom/av1/encoder/encodeframe_utils.h @@ -0,0 +1,595 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ +#define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ + +#include "aom_ports/aom_timer.h" + +#include "av1/common/reconinter.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/rdopt.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define WRITE_FEATURE_TO_FILE 0 + +#define FEATURE_SIZE_SMS_SPLIT_FAST 6 +#define FEATURE_SIZE_SMS_SPLIT 17 +#define FEATURE_SIZE_SMS_PRUNE_PART 25 +#define FEATURE_SIZE_SMS_TERM_NONE 28 +#define FEATURE_SIZE_FP_SMS_TERM_NONE 20 +#define FEATURE_SIZE_MAX_MIN_PART_PRED 13 +#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4 + +#define FEATURE_SMS_NONE_FLAG 1 +#define FEATURE_SMS_SPLIT_FLAG (1 << 1) +#define FEATURE_SMS_RECT_FLAG (1 << 2) + +#define FEATURE_SMS_PRUNE_PART_FLAG \ + (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG) +#define FEATURE_SMS_SPLIT_MODEL_FLAG \ + (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG) + +// Number of sub-partitions in rectangular partition types. +#define SUB_PARTITIONS_RECT 2 + +// Number of sub-partitions in split partition type. +#define SUB_PARTITIONS_SPLIT 4 + +// Number of sub-partitions in AB partition types. 
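+// Illustrative note (hedged; not from upstream libaom): an AB partition
+// covers the block with three sub-blocks; e.g. PARTITION_HORZ_A uses two
+// square sub-blocks in the top half and one 2:1 rectangle in the bottom half,
+// while PARTITION_HORZ_B swaps the halves (one rectangle on top, two squares
+// below). Hence the value 3 below.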
+#define SUB_PARTITIONS_AB 3 + +// Number of sub-partitions in 4-way partition types. +#define SUB_PARTITIONS_PART4 4 + +// 4part partition types. +enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES); + +// AB partition types. +enum { + HORZ_A = 0, + HORZ_B, + VERT_A, + VERT_B, + NUM_AB_PARTS +} UENUM1BYTE(AB_PART_TYPE); + +// Rectangular partition types. +enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE); + +// Structure to keep win flags for HORZ and VERT partition evaluations. +typedef struct { + int rect_part_win[NUM_RECT_PARTS]; +} RD_RECT_PART_WIN_INFO; + +enum { PICK_MODE_RD = 0, PICK_MODE_NONRD }; + +enum { + SB_SINGLE_PASS, // Single pass encoding: all ctxs get updated normally + SB_DRY_PASS, // First pass of multi-pass: does not update the ctxs + SB_WET_PASS // Second pass of multi-pass: finalize and update the ctx +} UENUM1BYTE(SB_MULTI_PASS_MODE); + +typedef struct { + ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE]; + PARTITION_CONTEXT sa[MAX_MIB_SIZE]; + PARTITION_CONTEXT sl[MAX_MIB_SIZE]; + TXFM_CONTEXT *p_ta; + TXFM_CONTEXT *p_tl; + TXFM_CONTEXT ta[MAX_MIB_SIZE]; + TXFM_CONTEXT tl[MAX_MIB_SIZE]; +} RD_SEARCH_MACROBLOCK_CONTEXT; + +// This struct is used to store the statistics used by sb-level multi-pass +// encoding. Currently, this is only used to make a copy of the state before we +// perform the first pass +typedef struct SB_FIRST_PASS_STATS { + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_COUNTS rd_count; + + int split_count; + FRAME_COUNTS fc; + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; + int current_qindex; + +#if CONFIG_INTERNAL_STATS + unsigned int mode_chosen_counts[MAX_MODES]; +#endif // CONFIG_INTERNAL_STATS +} SB_FIRST_PASS_STATS; + +// This structure contains block size related +// variables for use in rd_pick_partition(). +typedef struct { + // Half of block width to determine block edge. + int mi_step; + + // Block row and column indices. + int mi_row; + int mi_col; + + // Block edge row and column indices. + int mi_row_edge; + int mi_col_edge; + + // Block width of current partition block. + int width; + + // Block width of minimum partition size allowed. + int min_partition_size_1d; + + // Flag to indicate if partition is 8x8 or higher size. + int bsize_at_least_8x8; + + // Indicates edge blocks in frame. + int has_rows; + int has_cols; + + // Block size of current partition. + BLOCK_SIZE bsize; + + // Size of current sub-partition. + BLOCK_SIZE subsize; + + // Size of split partition. + BLOCK_SIZE split_bsize2; +} PartitionBlkParams; + +#if CONFIG_COLLECT_PARTITION_STATS +typedef struct PartitionTimingStats { + // Tracks the number of partition decision used in the current call to \ref + // av1_rd_pick_partition + int partition_decisions[EXT_PARTITION_TYPES]; + // Tracks the number of partition_block searched in the current call to \ref + // av1_rd_pick_partition + int partition_attempts[EXT_PARTITION_TYPES]; + // Tracks the time spent on each partition search in the current call to \ref + // av1_rd_pick_partition + int64_t partition_times[EXT_PARTITION_TYPES]; + // Tracks the rdcost spent on each partition search in the current call to + // \ref av1_rd_pick_partition + int64_t partition_rdcost[EXT_PARTITION_TYPES]; + // Timer used to time the partitions. 
+ struct aom_usec_timer timer; + // Whether the timer is on + int timer_is_on; +} PartitionTimingStats; +#endif // CONFIG_COLLECT_PARTITION_STATS + +// Structure holding state variables for partition search. +typedef struct { + // Intra partitioning related info. + PartitionSearchInfo *intra_part_info; + + // Parameters related to partition block size. + PartitionBlkParams part_blk_params; + + // Win flags for HORZ and VERT partition evaluations. + RD_RECT_PART_WIN_INFO split_part_rect_win[SUB_PARTITIONS_SPLIT]; + + // RD cost for the current block of given partition type. + RD_STATS this_rdc; + + // RD cost summed across all blocks of partition type. + RD_STATS sum_rdc; + + // Array holding partition type cost. + int tmp_partition_cost[PARTITION_TYPES]; + + // Pointer to partition cost buffer + int *partition_cost; + + // RD costs for different partition types. + int64_t none_rd; + int64_t split_rd[SUB_PARTITIONS_SPLIT]; + // RD costs for rectangular partitions. + // rect_part_rd[0][i] is the RD cost of ith partition index of PARTITION_HORZ. + // rect_part_rd[1][i] is the RD cost of ith partition index of PARTITION_VERT. + int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT]; + + // Flags indicating if the corresponding partition was winner or not. + // Used to bypass similar blocks during AB partition evaluation. + int is_split_ctx_is_ready[2]; + int is_rect_ctx_is_ready[NUM_RECT_PARTS]; + + // If true, skips the rest of partition evaluation at the current bsize level. + int terminate_partition_search; + + // If false, skips rdopt on PARTITION_NONE. + int partition_none_allowed; + + // If partition_rect_allowed[HORZ] is false, skips searching PARTITION_HORZ, + // PARTITION_HORZ_A, PARTITIO_HORZ_B, PARTITION_HORZ_4. Same holds for VERT. + int partition_rect_allowed[NUM_RECT_PARTS]; + + // If false, skips searching rectangular partition unless some logic related + // to edge detection holds. + int do_rectangular_split; + + // If false, skips searching PARTITION_SPLIT. + int do_square_split; + + // If true, prunes the corresponding PARTITION_HORZ/PARTITION_VERT. Note that + // this does not directly affect the extended partitions, so this can be used + // to prune out PARTITION_HORZ/PARTITION_VERT while still allowing rdopt of + // PARTITION_HORZ_AB4, etc. + int prune_rect_part[NUM_RECT_PARTS]; + + // Chroma subsampling in x and y directions. + int ss_x; + int ss_y; + + // Partition plane context index. + int pl_ctx_idx; + + // This flag will be set if best partition is found from the search. + bool found_best_partition; + +#if CONFIG_COLLECT_PARTITION_STATS + PartitionTimingStats part_timing_stats; +#endif // CONFIG_COLLECT_PARTITION_STATS +} PartitionSearchState; + +static AOM_INLINE void av1_disable_square_split_partition( + PartitionSearchState *part_state) { + part_state->do_square_split = 0; +} + +// Disables all possible rectangular splits. This includes PARTITION_AB4 as they +// depend on the corresponding partition_rect_allowed. +static AOM_INLINE void av1_disable_rect_partitions( + PartitionSearchState *part_state) { + part_state->do_rectangular_split = 0; + part_state->partition_rect_allowed[HORZ] = 0; + part_state->partition_rect_allowed[VERT] = 0; +} + +// Disables all possible splits so that only PARTITION_NONE *might* be allowed. 
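+// Illustrative note (hedged sketch; not from upstream libaom): partition
+// pruning logic typically drives the search through these helpers rather
+// than by writing the flags directly; a hypothetical pruning step might read
+//
+//   if (prune_to_none_only)              // illustrative names only
+//     av1_disable_all_splits(&part_state);
+//   else if (force_square_split_only)
+//     av1_set_square_split_only(&part_state);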
+static AOM_INLINE void av1_disable_all_splits( + PartitionSearchState *part_state) { + av1_disable_square_split_partition(part_state); + av1_disable_rect_partitions(part_state); +} + +static AOM_INLINE void av1_set_square_split_only( + PartitionSearchState *part_state) { + part_state->partition_none_allowed = 0; + part_state->do_square_split = 1; + av1_disable_rect_partitions(part_state); +} + +static AOM_INLINE bool av1_blk_has_rows_and_cols( + const PartitionBlkParams *blk_params) { + return blk_params->has_rows && blk_params->has_cols; +} + +static AOM_INLINE bool av1_is_whole_blk_in_frame( + const PartitionBlkParams *blk_params, + const CommonModeInfoParams *mi_params) { + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + return mi_row + mi_size_high[bsize] <= mi_params->mi_rows && + mi_col + mi_size_wide[bsize] <= mi_params->mi_cols; +} + +static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + int dual_filter) { + for (int dir = 0; dir < 2; ++dir) { + if (dir && !dual_filter) break; + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter, + SWITCHABLE_FILTERS); + } +} + +static AOM_INLINE int set_rdmult(const AV1_COMP *const cpi, + const MACROBLOCK *const x, int segment_id) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const CommonQuantParams *quant_params = &cm->quant_params; + const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; + const FRAME_UPDATE_TYPE update_type = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + + int qindex; + if (segment_id >= 0) { + qindex = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex); + } else { + qindex = quant_params->base_qindex + x->rdmult_delta_qindex + + quant_params->y_dc_delta_q; + } + + return av1_compute_rd_mult( + qindex, bit_depth, update_type, layer_depth, boost_index, frame_type, + cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi)); +} + +static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) { + return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32); +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, + int frm) { + assert(frm >= 0); + if (frm < 0 || + p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) { + return NULL; + } + + return &p->stats_buf_ctx->stats_in_start[frm]; +} + +int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int orig_rdmult); + +int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step); + +int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step); + +void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, SuperBlockEnc *sb_enc); + +int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td, + int64_t *delta_dist, BLOCK_SIZE bsize, + int mi_row, int mi_col); + +int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col); + +int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, 
const int mi_row, + const int mi_col); + +int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int orig_rdmult); +#endif // !CONFIG_REALTIME_ONLY + +void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult); + +#if CONFIG_SALIENCY_MAP +void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi, + int *errorperbit, const BLOCK_SIZE bsize, + const int mi_row, const int mi_col, + int *const rdmult); +#endif + +void av1_update_state(const AV1_COMP *const cpi, ThreadData *td, + const PICK_MODE_CONTEXT *const ctx, int mi_row, + int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run); + +void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts, + PREDICTION_MODE mode, int16_t mode_context); + +void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts, + MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, const int intraonly); + +void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes); + +void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes); + +void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + MB_MODE_INFO **mib, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize); + +void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree, + BLOCK_SIZE bsize); + +void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type, + BLOCK_SIZE bsize, int mib_size, + int mi_row, int mi_col); + +void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr, + int wt_left, int wt_tr); + +void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, + int mi_row, int mi_col); + +void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size, + int mi_row, int mi_col); + +void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi, + ThreadData *td, const TileDataEnc *tile_data, + int mi_row, int mi_col); + +void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi, + ThreadData *td, TileDataEnc *tile_data, int mi_row, + int mi_col); + +void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, + const TileInfo *const tile_info, const int mi_row, + const int mi_col); + +void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes); + +static AOM_INLINE void av1_dealloc_mb_data(struct macroblock *mb, + int num_planes) { + aom_free(mb->txfm_search_info.mb_rd_record); + mb->txfm_search_info.mb_rd_record = NULL; + + aom_free(mb->inter_modes_info); + mb->inter_modes_info = NULL; + + av1_dealloc_src_diff_buf(mb, num_planes); + + aom_free(mb->e_mbd.seg_mask); + mb->e_mbd.seg_mask = NULL; + + aom_free(mb->winner_mode_stats); + mb->winner_mode_stats = NULL; + + aom_free(mb->dqcoeff_buf); + mb->dqcoeff_buf = NULL; +} + +static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi, + struct macroblock *mb) { + const SPEED_FEATURES *sf = &cpi->sf; + // The winner_mode_stats buffer is not required in these cases. 
+ if (is_stat_generation_stage(cpi) || + (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode) || + (sf->winner_mode_sf.multi_winner_mode_type == MULTI_WINNER_MODE_OFF)) + return; + + const AV1_COMMON *cm = &cpi->common; + const int winner_mode_count = + winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type]; + CHECK_MEM_ERROR(cm, mb->winner_mode_stats, + (WinnerModeStats *)aom_malloc( + winner_mode_count * sizeof(mb->winner_mode_stats[0]))); +} + +void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb); + +static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi, + struct macroblock *mb) { + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *sf = &cpi->sf; + if (!sf->rt_sf.use_nonrd_pick_mode) { + // Memory for mb_rd_record is allocated only when use_mb_rd_hash sf is + // enabled. + if (sf->rd_sf.use_mb_rd_hash) + CHECK_MEM_ERROR(cm, mb->txfm_search_info.mb_rd_record, + (MB_RD_RECORD *)aom_malloc(sizeof(MB_RD_RECORD))); + if (!frame_is_intra_only(cm)) + CHECK_MEM_ERROR( + cm, mb->inter_modes_info, + (InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info))); + } + + av1_alloc_src_diff_buf(cm, mb); + + CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0]))); + + allocate_winner_mode_stats(cpi, mb); + + const int max_sb_square_y = 1 + << num_pels_log2_lookup[cm->seq_params->sb_size]; + CHECK_MEM_ERROR( + cm, mb->dqcoeff_buf, + (tran_low_t *)aom_memalign(32, max_sb_square_y * sizeof(tran_low_t))); +} + +// This function will compute the number of reference frames to be disabled +// based on selective_ref_frame speed feature. +static AOM_INLINE unsigned int get_num_refs_to_disable( + const AV1_COMP *cpi, const int *ref_frame_flags, + const unsigned int *ref_display_order_hint, + unsigned int cur_frame_display_index) { + unsigned int num_refs_to_disable = 0; + if (cpi->sf.inter_sf.selective_ref_frame >= 3) { + num_refs_to_disable++; + if (cpi->sf.inter_sf.selective_ref_frame >= 6) { + // Disable LAST2_FRAME and ALTREF2_FRAME + num_refs_to_disable += 2; + } else if (cpi->sf.inter_sf.selective_ref_frame == 5 && + *ref_frame_flags & av1_ref_frame_flag_list[LAST2_FRAME]) { + const int last2_frame_dist = av1_encoder_get_relative_dist( + ref_display_order_hint[LAST2_FRAME - LAST_FRAME], + cur_frame_display_index); + // Disable LAST2_FRAME if it is a temporally distant frame + if (abs(last2_frame_dist) > 2) { + num_refs_to_disable++; + } +#if !CONFIG_REALTIME_ONLY + else if (is_stat_consumption_stage_twopass(cpi)) { + const FIRSTPASS_STATS *const this_frame_stats = + read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index); + const double coded_error_per_mb = this_frame_stats->coded_error; + // Disable LAST2_FRAME if the coded error of the current frame based on + // first pass stats is very low. 
+ if (coded_error_per_mb < 100.0) num_refs_to_disable++; + } +#endif // CONFIG_REALTIME_ONLY + } + } + return num_refs_to_disable; +} + +static INLINE int get_max_allowed_ref_frames( + const AV1_COMP *cpi, const int *ref_frame_flags, + const unsigned int *ref_display_order_hint, + unsigned int cur_frame_display_index) { + const unsigned int max_reference_frames = + cpi->oxcf.ref_frm_cfg.max_reference_frames; + const unsigned int num_refs_to_disable = get_num_refs_to_disable( + cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index); + const unsigned int max_allowed_refs_for_given_speed = + INTER_REFS_PER_FRAME - num_refs_to_disable; + return AOMMIN(max_allowed_refs_for_given_speed, max_reference_frames); +} + +// Enforce the number of references for each arbitrary frame based on user +// options and speed. +static AOM_INLINE void enforce_max_ref_frames( + AV1_COMP *cpi, int *ref_frame_flags, + const unsigned int *ref_display_order_hint, + unsigned int cur_frame_display_index) { + MV_REFERENCE_FRAME ref_frame; + int total_valid_refs = 0; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + total_valid_refs++; + } + } + + const int max_allowed_refs = get_max_allowed_ref_frames( + cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index); + + for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) { + const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i]; + + if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) { + continue; + } + + switch (ref_frame_to_disable) { + case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break; + case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break; + case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break; + case BWDREF_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break; + default: assert(0); + } + --total_valid_refs; + } + assert(total_valid_refs <= max_allowed_refs); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c new file mode 100644 index 0000000000..c78761dd98 --- /dev/null +++ b/third_party/aom/av1/encoder/encodemb.c @@ -0,0 +1,866 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "av1/common/cfl.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/scan.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/txb_rdopt.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" + +void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride) { + assert(rows >= 4 && cols >= 4); +#if CONFIG_AV1_HIGHBITDEPTH + if (bd_info.use_highbitdepth_buf) { + aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, + pred8, pred_stride); + return; + } +#endif + (void)bd_info; + aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, + pred_stride); +} + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + const int diff_stride = block_size_wide[plane_bsize]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int tx1d_width = tx_size_wide[tx_size]; + const int tx1d_height = tx_size_high[tx_size]; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; + av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride, + src, src_stride, dst, dst_stride); +} + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) { + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + assert(plane_bsize < BLOCK_SIZES_ALL); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + + av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride); +} + +int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + const int eob = p->eobs[block]; + const int segment_id = xd->mi[0]->segment_id; + + if (eob == 0 || !cpi->optimize_seg_arr[segment_id] || + xd->lossless[segment_id]) { + *rate_cost = av1_cost_skip_txb(&x->coeff_costs, txb_ctx, plane, tx_size); + return eob; + } + + return av1_optimize_txb(cpi, x, plane, block, tx_size, tx_type, txb_ctx, + rate_cost, cpi->oxcf.algo_cfg.sharpness); +} + +// Hyper-parameters for dropout optimization, based on following logics. 
+// TODO(yjshen): These settings are tuned by experiments. They may still be +// optimized for better performance. +// (1) Coefficients which are large enough will ALWAYS be kept. +const tran_low_t DROPOUT_COEFF_MAX = 2; // Max dropout-able coefficient. +// (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is +// NOT required. For example, `5 0 0 0 7` is treated as two continuous +// coefficients if three zeros do not fulfill the dropout condition. +const int DROPOUT_CONTINUITY_MAX = 2; // Max dropout-able continuous coeff. +// (3) Dropout operation is NOT applicable to blocks with large or small +// quantization index. +const int DROPOUT_Q_MAX = 128; +const int DROPOUT_Q_MIN = 16; +// (4) Recall that dropout optimization will forcibly set some quantized +// coefficients to zero. The key logic on determining whether a coefficient +// should be dropped is to check the number of continuous zeros before AND +// after this coefficient. The exact number of zeros for judgement depends +// on block size and quantization index. More concretely, block size +// determines the base number of zeros, while quantization index determines +// the multiplier. Intuitively, larger block requires more zeros and larger +// quantization index also requires more zeros (more information is lost +// when using larger quantization index). +const int DROPOUT_BEFORE_BASE_MAX = 32; // Max base number for leading zeros. +const int DROPOUT_BEFORE_BASE_MIN = 16; // Min base number for leading zeros. +const int DROPOUT_AFTER_BASE_MAX = 32; // Max base number for trailing zeros. +const int DROPOUT_AFTER_BASE_MIN = 16; // Min base number for trailing zeros. +const int DROPOUT_MULTIPLIER_MAX = 8; // Max multiplier on number of zeros. +const int DROPOUT_MULTIPLIER_MIN = 2; // Min multiplier on number of zeros. +const int DROPOUT_MULTIPLIER_Q_BASE = 32; // Base Q to compute multiplier. + +void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, + TX_TYPE tx_type, int qindex) { + const int tx_width = tx_size_wide[tx_size]; + const int tx_height = tx_size_high[tx_size]; + + // Early return if `qindex` is out of range. + if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) { + return; + } + + // Compute number of zeros used for dropout judgement. + const int base_size = AOMMAX(tx_width, tx_height); + const int multiplier = CLIP(qindex / DROPOUT_MULTIPLIER_Q_BASE, + DROPOUT_MULTIPLIER_MIN, DROPOUT_MULTIPLIER_MAX); + const int dropout_num_before = + multiplier * + CLIP(base_size, DROPOUT_BEFORE_BASE_MIN, DROPOUT_BEFORE_BASE_MAX); + const int dropout_num_after = + multiplier * + CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX); + + av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, dropout_num_before, + dropout_num_after); +} + +void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, TX_TYPE tx_type, + int dropout_num_before, int dropout_num_after) { + const struct macroblock_plane *const p = &mb->plane[plane]; + tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); + tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + const int max_eob = av1_get_max_eob(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + + // Early return if there are not enough non-zero coefficients. 
+ if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before || + max_eob <= dropout_num_before + dropout_num_after) { + return; + } + + int count_zeros_before = 0; + int count_zeros_after = 0; + int count_nonzeros = 0; + // Index of the first non-zero coefficient after sufficient number of + // continuous zeros. If equals to `-1`, it means number of leading zeros + // hasn't reach `dropout_num_before`. + int idx = -1; + int eob = 0; // New end of block. + + for (int i = 0; i < p->eobs[block]; ++i) { + const int scan_idx = scan_order->scan[i]; + if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) { + // Keep large coefficients. + count_zeros_before = 0; + count_zeros_after = 0; + idx = -1; + eob = i + 1; + } else if (qcoeff[scan_idx] == 0) { // Count zeros. + if (idx == -1) { + ++count_zeros_before; + } else { + ++count_zeros_after; + } + } else { // Count non-zeros. + if (count_zeros_before >= dropout_num_before) { + idx = (idx == -1) ? i : idx; + ++count_nonzeros; + } else { + count_zeros_before = 0; + eob = i + 1; + } + } + + // Handle continuity. + if (count_nonzeros > DROPOUT_CONTINUITY_MAX) { + count_zeros_before = 0; + count_zeros_after = 0; + count_nonzeros = 0; + idx = -1; + eob = i + 1; + } + + // Handle the trailing zeros after original end of block. + if (idx != -1 && i == p->eobs[block] - 1) { + count_zeros_after += (max_eob - p->eobs[block]); + } + + // Set redundant coefficients to zeros if needed. + if (count_zeros_after >= dropout_num_after) { + for (int j = idx; j <= i; ++j) { + qcoeff[scan_order->scan[j]] = 0; + dqcoeff[scan_order->scan[j]] = 0; + } + count_zeros_before += (i - idx + 1); + count_zeros_after = 0; + count_nonzeros = 0; + } else if (i == p->eobs[block] - 1) { + eob = i + 1; + } + } + + if (eob != p->eobs[block]) { + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, eob); + } +} + +// Settings for optimization type. NOTE: To set optimization type for all intra +// frames, both `KEY_BLOCK_OPT_TYPE` and `INTRA_BLOCK_OPT_TYPE` should be set. +// TODO(yjshen): These settings are hard-coded and look okay for now. They +// should be made configurable later. +// Blocks of key frames ONLY. +const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +// Blocks of intra frames (key frames EXCLUSIVE). +const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +// Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default +// if trellis optimization is on for inter frames.) 
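+// Illustrative note (hedged sketch; not from upstream libaom): taken
+// together, the three constants pick the optimization type per frame type; a
+// hypothetical selection step would look like
+//   OPT_TYPE opt = frame_is_key ? KEY_BLOCK_OPT_TYPE
+//                : frame_is_intra_only ? INTRA_BLOCK_OPT_TYPE
+//                                      : INTER_BLOCK_OPT_TYPE;
+// where frame_is_key / frame_is_intra_only stand in for the encoder's actual
+// frame-type checks.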
+const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; + +enum { + QUANT_FUNC_LOWBD = 0, + QUANT_FUNC_HIGHBD = 1, + QUANT_FUNC_TYPES = 2 +} UENUM1BYTE(QUANT_FUNC); + +#if CONFIG_AV1_HIGHBITDEPTH +static AV1_QUANT_FACADE + quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = { + { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade }, + { av1_quantize_b_facade, av1_highbd_quantize_b_facade }, + { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade }, + { NULL, NULL } + }; +#else +static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = { + av1_quantize_fp_facade, av1_quantize_b_facade, av1_quantize_dc_facade, NULL +}; +#endif + +// Computes the transform for DC only blocks +void av1_xform_dc_only(MACROBLOCK *x, int plane, int block, + TxfmParam *txfm_param, int64_t per_px_mean) { + assert(per_px_mean != INT64_MAX); + const struct macroblock_plane *const p = &x->plane[plane]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + const int n_coeffs = av1_get_max_eob(txfm_param->tx_size); + memset(coeff, 0, sizeof(*coeff) * n_coeffs); + coeff[0] = + (tran_low_t)((per_px_mean * dc_coeff_scale[txfm_param->tx_size]) >> 12); +} + +void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param, + const QUANT_PARAM *qparam) { + av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, txfm_param); + av1_quant(x, plane, block, txfm_param, qparam); +} + +void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TxfmParam *txfm_param) { + const struct macroblock_plane *const p = &x->plane[plane]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + const int diff_stride = block_size_wide[plane_bsize]; + + const int src_offset = (blk_row * diff_stride + blk_col); + const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2]; + + av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param, + const QUANT_PARAM *qparam) { + const struct macroblock_plane *const p = &x->plane[plane]; + const SCAN_ORDER *const scan_order = + get_scan(txfm_param->tx_size, txfm_param->tx_type); + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const qcoeff = p->qcoeff + block_offset; + tran_low_t *const dqcoeff = p->dqcoeff + block_offset; + uint16_t *const eob = &p->eobs[block]; + + if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) { + const int n_coeffs = av1_get_max_eob(txfm_param->tx_size); + if (LIKELY(!x->seg_skip_block)) { +#if CONFIG_AV1_HIGHBITDEPTH + quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd]( + coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam); +#else + quant_func_list[qparam->xform_quant_idx]( + coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam); +#endif + } else { + av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob); + } + } + // use_optimize_b is true means av1_optimze_b will be called, + // thus cannot update entropy ctx now (performed in optimize_b) + if (qparam->use_optimize_b) { + p->txb_entropy_ctx[block] = 0; + } else { + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, *eob); + } +} + +void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size, + TX_TYPE tx_type, TxfmParam *txfm_param) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO 
*const mbmi = xd->mi[0]; + + txfm_param->tx_type = tx_type; + txfm_param->tx_size = tx_size; + txfm_param->lossless = xd->lossless[mbmi->segment_id]; + txfm_param->tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter_block(mbmi), cm->features.reduced_tx_set_used); + + txfm_param->bd = xd->bd; + txfm_param->is_hbd = is_cur_buf_hbd(xd); +} +void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx, + int use_quant_b_adapt, QUANT_PARAM *qparam) { + qparam->log_scale = av1_get_tx_scale(tx_size); + qparam->tx_size = tx_size; + + qparam->use_quant_b_adapt = use_quant_b_adapt; + + // TODO(bohanli): optimize_b and quantization idx has relationship, + // but is kind of buried and complicated in different encoding stages. + // Should have a unified function to derive quant_idx, rather than + // determine and pass in the quant_idx + qparam->use_optimize_b = use_optimize_b; + qparam->xform_quant_idx = xform_quant_idx; + + qparam->qmatrix = NULL; + qparam->iqmatrix = NULL; +} +void av1_setup_qmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type, QUANT_PARAM *qparam) { + qparam->qmatrix = av1_get_qmatrix(quant_params, xd, plane, tx_size, tx_type); + qparam->iqmatrix = + av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type); +} + +static void encode_block(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg, + RUN_TYPE dry_run) { + (void)dry_run; + struct encode_b_args *const args = arg; + const AV1_COMP *const cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + uint8_t *dst; + ENTROPY_CONTEXT *a, *l; + int dummy_rate_cost = 0; + + const int bw = mi_size_wide[plane_bsize]; + dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + + a = &args->ta[blk_col]; + l = &args->tl[blk_row]; + + TX_TYPE tx_type = DCT_DCT; + const int blk_skip_idx = blk_row * bw + blk_col; + if (!is_blk_skip(x->txfm_search_info.blk_skip, plane, blk_skip_idx) && + !mbmi->skip_mode) { + tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + TxfmParam txfm_param; + QUANT_PARAM quant_param; + const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run); + int quant_idx; + if (use_trellis) + quant_idx = AV1_XFORM_QUANT_FP; + else + quant_idx = + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; + av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param); + av1_setup_quant(tx_size, use_trellis, quant_idx, + cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + // Whether trellis or dropout optimization is required for inter frames. 
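+    // For reference, the checks below reduce the hard-coded OPT_TYPE setting
+    // to two flags:
+    //   NONE_OPT            -> do_trellis = 0, do_dropout = 0
+    //   TRELLIS_OPT         -> do_trellis = 1, do_dropout = 0
+    //   DROPOUT_OPT         -> do_trellis = 0, do_dropout = 1
+    //   TRELLIS_DROPOUT_OPT -> do_trellis = 1, do_dropout = 1
+    // With the default TRELLIS_DROPOUT_OPT, dropout still only runs when
+    // use_optimize_b is false (see the guard further below), matching the
+    // NOTE above that dropout is disabled when trellis optimization is used
+    // for inter blocks.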
+ const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT || + INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; + const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT || + INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; + + if (quant_param.use_optimize_b && do_trellis) { + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, + &dummy_rate_cost); + } + if (!quant_param.use_optimize_b && do_dropout) { + av1_dropout_qcoeff(x, plane, block, tx_size, tx_type, + cm->quant_params.base_qindex); + } + } else { + p->eobs[block] = 0; + p->txb_entropy_ctx[block] = 0; + } + + av1_set_txb_context(x, plane, block, tx_size, a, l); + + if (p->eobs[block]) { + // As long as any YUV plane has non-zero quantized transform coefficients, + // mbmi->skip_txfm flag is set to 0. + mbmi->skip_txfm = 0; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + pd->dst.stride, p->eobs[block], + cm->features.reduced_tx_set_used); + } else { + // Only when YUV planes all have zero quantized transform coefficients, + // mbmi->skip_txfm flag is set to 1. + mbmi->skip_txfm &= 1; + } + + // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0 + // case. It is possible that certain collision in hash index would cause + // the assertion failure. To further optimize the rate-distortion + // performance, we need to re-visit this part and enable this assert + // again. + if (p->eobs[block] == 0 && plane == 0) { +#if 0 + if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ && + args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) { + // TODO(jingning,angiebird,huisu@google.com): enable txk_check when + // enable_optimize_b is true to detect potential RD bug. + const uint8_t disable_txk_check = args->enable_optimize_b; + if (!disable_txk_check) { + assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] == + DCT_DCT); + } + } +#endif + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + int pixel_c, pixel_r; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + int blk_w = block_size_wide[bsize]; + int blk_h = block_size_high[bsize]; + mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col, + blk_row, pd->subsampling_x, pd->subsampling_y); + mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint, + plane, pixel_c, pixel_r, blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif +} + +static void encode_block_inter(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg, RUN_TYPE dry_run) { + struct encode_b_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = + plane ? 
av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + if (!plane) { + assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] && + tx_size_high[tx_size] >= tx_size_high[plane_tx_size]); + } + + if (tx_size == plane_tx_size || plane) { + encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg, + dry_run); + } else { + assert(tx_size < TX_SIZES_ALL); + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); + assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); + // This is the square transform block partition entry point. + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + assert(bsw > 0 && bsh > 0); + + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { + const int offsetc = blk_col + col; + + encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, + arg, dry_run); + block += step; + } + } + } +} + +void av1_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + // Call visit() directly with zero offsets if the current block size is the + // same as the transform block size. + if (plane_bsize == tx_bsize) { + visit(plane, 0, 0, 0, plane_bsize, tx_size, arg); + return; + } + const uint8_t txw_unit = tx_size_wide_unit[tx_size]; + const uint8_t txh_unit = tx_size_high_unit[tx_size]; + const int step = txw_unit * txh_unit; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); + const int mu_blocks_wide = + AOMMIN(mi_size_wide[max_unit_bsize], max_blocks_wide); + const int mu_blocks_high = + AOMMIN(mi_size_high[max_unit_bsize], max_blocks_high); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + int i = 0; + for (int r = 0; r < max_blocks_high; r += mu_blocks_high) { + const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high); + // Skip visiting the sub blocks that are wholly within the UMV. 
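+    // Illustrative walk-through (assuming no UMV clipping): a 128x128 luma
+    // block with 16x16 transforms is covered as four 64x64 units; within each
+    // unit the 16 transform blocks are visited in raster order, and `i` (the
+    // block index handed to visit()) advances by step = 4 * 4 = 16 per call,
+    // ending at 32 * 32 = 1024 after all 64 visits.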
+ for (int c = 0; c < max_blocks_wide; c += mu_blocks_wide) { + const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide); + for (int blk_row = r; blk_row < unit_height; blk_row += txh_unit) { + for (int blk_col = c; blk_col < unit_width; blk_col += txw_unit) { + visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg); + i += step; + } + } + } + } + // Check if visit() is invoked at least once. + assert(i >= 1); +} + +typedef struct encode_block_pass1_args { + AV1_COMP *cpi; + MACROBLOCK *x; +} encode_block_pass1_args; + +static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + encode_block_pass1_args *args = (encode_block_pass1_args *)arg; + AV1_COMP *cpi = args->cpi; + AV1_COMMON *cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + + uint8_t *dst; + dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt, + &quant_param); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT, + &quant_param); + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + if (p->eobs[block] > 0) { + txfm_param.eob = p->eobs[block]; + if (txfm_param.is_hbd) { + av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); + return; + } + av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); + } +} + +void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) { + encode_block_pass1_args args = { cpi, x }; + av1_subtract_plane(x, bsize, 0); + av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, + encode_block_pass1, &args); +} + +void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RUN_TYPE dry_run) { + assert(bsize < BLOCK_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + // In the current encoder implementation, for inter blocks, + // only when YUV planes all have zero quantized transform coefficients, + // mbmi->skip_txfm flag is set to 1. + // For intra blocks, this flag is set to 0 since skipped blocks are so rare + // that transmitting skip_txfm = 1 is very expensive. + // mbmi->skip_txfm is init to 1, and will be modified in encode_block() based + // on transform, quantization, and (if exists) trellis optimization. 
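+  // Concretely, encode_block() clears the flag as soon as any transform block
+  // in any plane keeps a non-zero eob after quantization (and any trellis or
+  // dropout optimization); the flag only stays 1 when every block quantizes
+  // to all zeros.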
+ mbmi->skip_txfm = 1; + if (x->txfm_search_info.skip_txfm) return; + + struct optimize_ctx ctx; + struct encode_b_args arg = { + cpi, x, &ctx, NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id] + }; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int subsampling_x = pd->subsampling_x; + const int subsampling_y = pd->subsampling_y; + if (plane && !xd->is_chroma_ref) break; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; + const int bw = mi_size_wide[txb_size]; + const int bh = mi_size_high[txb_size]; + int block = 0; + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]); + av1_subtract_plane(x, plane_bsize, plane); + arg.ta = ctx.ta[plane]; + arg.tl = ctx.tl[plane]; + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, subsampling_x, subsampling_y); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); + mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); + + for (int idy = 0; idy < mi_height; idy += mu_blocks_high) { + for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) { + int blk_row, blk_col; + const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); + const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); + for (blk_row = idy; blk_row < unit_height; blk_row += bh) { + for (blk_col = idx; blk_col < unit_width; blk_col += bw) { + encode_block_inter(plane, block, blk_row, blk_col, plane_bsize, + max_tx_size, &arg, dry_run); + block += step; + } + } + } + } + } +} + +static void encode_block_intra_and_set_context(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, + arg); + + struct encode_b_args *const args = arg; + MACROBLOCK *x = args->x; + ENTROPY_CONTEXT *a = &args->ta[blk_col]; + ENTROPY_CONTEXT *l = &args->tl[blk_row]; + av1_set_txb_context(x, plane, block, tx_size, a, l); +} + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct encode_b_args *const args = arg; + const AV1_COMP *const cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + PLANE_TYPE plane_type = get_plane_type(plane); + uint16_t *eob = &p->eobs[block]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + int dummy_rate_cost = 0; + + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); + + TX_TYPE tx_type = DCT_DCT; + const int bw = mi_size_wide[plane_bsize]; + if (plane == 0 && 
is_blk_skip(x->txfm_search_info.blk_skip, plane, + blk_row * bw + blk_col)) { + *eob = 0; + p->txb_entropy_ctx[block] = 0; + } else { + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + + const ENTROPY_CONTEXT *a = &args->ta[blk_col]; + const ENTROPY_CONTEXT *l = &args->tl[blk_row]; + tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + TxfmParam txfm_param; + QUANT_PARAM quant_param; + const int use_trellis = + is_trellis_used(args->enable_optimize_b, args->dry_run); + int quant_idx; + if (use_trellis) + quant_idx = AV1_XFORM_QUANT_FP; + else + quant_idx = + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; + + av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param); + av1_setup_quant(tx_size, use_trellis, quant_idx, + cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + // Whether trellis or dropout optimization is required for key frames and + // intra frames. + const bool do_trellis = (frame_is_intra_only(cm) && + (KEY_BLOCK_OPT_TYPE == TRELLIS_OPT || + KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) || + (!frame_is_intra_only(cm) && + (INTRA_BLOCK_OPT_TYPE == TRELLIS_OPT || + INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)); + const bool do_dropout = (frame_is_intra_only(cm) && + (KEY_BLOCK_OPT_TYPE == DROPOUT_OPT || + KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) || + (!frame_is_intra_only(cm) && + (INTRA_BLOCK_OPT_TYPE == DROPOUT_OPT || + INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)); + + if (quant_param.use_optimize_b && do_trellis) { + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, + &dummy_rate_cost); + } + if (do_dropout) { + av1_dropout_qcoeff(x, plane, block, tx_size, tx_type, + cm->quant_params.base_qindex); + } + } + + if (*eob) { + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + dst_stride, *eob, + cm->features.reduced_tx_set_used); + } + + // TODO(jingning): Temporarily disable txk_type check for eob=0 case. + // It is possible that certain collision in hash index would cause + // the assertion failure. To further optimize the rate-distortion + // performance, we need to re-visit this part and enable this assert + // again. + if (*eob == 0 && plane == 0) { +#if 0 + if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ + && args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) { + assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col)] == + DCT_DCT); + } +#endif + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + + // For intra mode, skipped blocks are so rare that transmitting + // skip_txfm = 1 is very expensive. 
+  mbmi->skip_txfm = 0;
+
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+  }
+}
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+                                  TRELLIS_OPT_TYPE enable_optimize_b) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  if (plane && !xd->is_chroma_ref) return;
+
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
+  ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
+  struct encode_b_args arg = {
+    cpi, x, NULL, ta, tl, dry_run, enable_optimize_b
+  };
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+  if (enable_optimize_b) {
+    av1_get_entropy_contexts(plane_bsize, pd, ta, tl);
+  }
+  av1_foreach_transformed_block_in_plane(
+      xd, plane_bsize, plane, encode_block_intra_and_set_context, &arg);
+}
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
new file mode 100644
index 0000000000..f97bf8f517
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+  AV1_XFORM_QUANT_FP = 0,
+  AV1_XFORM_QUANT_B = 1,
+  AV1_XFORM_QUANT_DC = 2,
+  AV1_XFORM_QUANT_SKIP_QUANT,
+  AV1_XFORM_QUANT_TYPES,
+} UENUM1BYTE(AV1_XFORM_QUANT);
+
+// TODO(any): Merge OPT_TYPE and TRELLIS_OPT_TYPE
+// Available optimization types to optimize the quantized coefficients.
+enum {
+  NONE_OPT = 0,            // No optimization.
+  TRELLIS_OPT = 1,         // Trellis optimization. See `av1_optimize_b()`.
+  DROPOUT_OPT = 2,         // Dropout optimization. See `av1_dropout_qcoeff()`.
+  TRELLIS_DROPOUT_OPT = 3  // Perform dropout after trellis optimization.
+} UENUM1BYTE(OPT_TYPE);
+
+enum {
+  NO_TRELLIS_OPT,          // No trellis optimization
+  FULL_TRELLIS_OPT,        // Trellis optimization in all stages
+  FINAL_PASS_TRELLIS_OPT,  // Trellis optimization in only the final encode pass
+  NO_ESTIMATE_YRD_TRELLIS_OPT  // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
+
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
+};
+
+struct encode_b_args {
+  const struct AV1_COMP *cpi;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+  RUN_TYPE dry_run;
+  TRELLIS_OPT_TYPE enable_optimize_b;
+};
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg);
+
+void av1_encode_sby_pass1(struct AV1_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize);
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+                     TX_TYPE tx_type, TxfmParam *txfm_param);
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+                     int use_quant_b_adapt, QUANT_PARAM *qparam);
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+                       const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                       TX_TYPE tx_type, QUANT_PARAM *qparam);
+
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+                       TxfmParam *txfm_param, int64_t per_px_mean);
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                     int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+                     const QUANT_PARAM *qparam);
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+               BLOCK_SIZE plane_bsize, TxfmParam *txfm_param);
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+               const QUANT_PARAM *qparam);
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int *rate_cost);
+
+// This function can be used as (i) a further optimization to reduce the
+// redundancy of quantized coefficients (a.k.a., `qcoeff`) after trellis
+// optimization, or (ii) an alternative to trellis optimization in high-speed
+// compression mode (e.g., real-time mode under speed-6) due to its LOW time
+// complexity. The rationale is to drop out possibly redundant quantized
+// coefficients that sit among a run of zeros. NOTE: This algorithm is not as
+// accurate as trellis optimization since the hyper-parameters are hard-coded
+// rather than searched dynamically. More adaptive logic may improve the
+// performance. This function may be applied to all or only part of the blocks.
+// Inputs:
+//   mb: Pointer to the MACROBLOCK to perform dropout on.
+//   plane: Index of the plane to which the target block belongs.
+//   block: Index of the target block.
+//   tx_size: Transform size of the target block.
+//   tx_type: Transform type of the target block. This field is particularly
+//            used to find out the scan order of the block.
+//   qindex: Quantization index used for the target block. In general, all
+//           blocks in the same plane share the same quantization index. This
+//           field is particularly used to determine how many zeros should be
+//           used to drop out a coefficient.
+// Returns: +// Nothing will be returned, but `qcoeff`, `dqcoeff`, `eob`, as well as +// `txb_entropy_ctx`, which `mb` points to, may be modified by this function. +void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, + TX_TYPE tx_type, int qindex); +// Same as above, with the number of zeroes needed before/after a coeff to drop +// it explicitly passed in, instead of being derived from qindex. +void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, TX_TYPE tx_type, + int dropout_num_before, int dropout_num_after); + +void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride); + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size); + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane); + +static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l) { + const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block]; + memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a)); + memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l)); +} + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); + +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, + TRELLIS_OPT_TYPE enable_optimize_b); + +static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b, + RUN_TYPE dry_run) { + if (optimize_b == NO_TRELLIS_OPT) return false; + if (optimize_b == FINAL_PASS_TRELLIS_OPT && dry_run != OUTPUT_ENABLED) + return false; + return true; +} + +// Scaling terms (precision of 12 bits) to perform tx-size specific +// normalization that is used in DCT_DCT forward transform. +// For transform blocks of 1:2 and 2:1 - sqrt(2) normalization is used +// For transform blocks of 1:4 and 4:1 - factor of 2 is used +// For transform blocks TX_8x8 and below - an additional factor of 2 is used +// For transform blocks max(width,height)=64 - currently not supported + +static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = { + 1024, 2048, 4096, 4096, 0, 1448, 1448, 2896, 2896, 2896, + 2896, 0, 0, 2048, 2048, 4096, 4096, 0, 0 +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEMB_H_ diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c new file mode 100644 index 0000000000..7cae72c159 --- /dev/null +++ b/third_party/aom/av1/encoder/encodemv.c @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "av1/common/common.h" +#include "av1/common/entropymode.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemv.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/bitops.h" + +static void update_mv_component_stats(int comp, nmv_component *mvcomp, + MvSubpelPrecision precision) { + assert(comp != 0); + int offset; + const int sign = comp < 0; + const int mag = sign ? -comp : comp; + const int mv_class = av1_get_mv_class(mag - 1, &offset); + const int d = offset >> 3; // int mv data + const int fr = (offset >> 1) & 3; // fractional mv data + const int hp = offset & 1; // high precision mv data + + // Sign + update_cdf(mvcomp->sign_cdf, sign, 2); + + // Class + update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES); + + // Integer bits + if (mv_class == MV_CLASS_0) { + update_cdf(mvcomp->class0_cdf, d, CLASS0_SIZE); + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (int i = 0; i < n; ++i) + update_cdf(mvcomp->bits_cdf[i], (d >> i) & 1, 2); + } + // Fractional bits + if (precision > MV_SUBPEL_NONE) { + aom_cdf_prob *fp_cdf = + mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf; + update_cdf(fp_cdf, fr, MV_FP_SIZE); + } + + // High precision bit + if (precision > MV_SUBPEL_LOW_PRECISION) { + aom_cdf_prob *hp_cdf = + mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf; + update_cdf(hp_cdf, hp, 2); + } +} + +void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, + MvSubpelPrecision precision) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + + update_cdf(mvctx->joints_cdf, j, MV_JOINTS); + + if (mv_joint_vertical(j)) + update_mv_component_stats(diff.row, &mvctx->comps[0], precision); + + if (mv_joint_horizontal(j)) + update_mv_component_stats(diff.col, &mvctx->comps[1], precision); +} + +static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, + MvSubpelPrecision precision) { + assert(comp != 0); + int offset; + const int sign = comp < 0; + const int mag = sign ? -comp : comp; + const int mv_class = av1_get_mv_class(mag - 1, &offset); + const int d = offset >> 3; // int mv data + const int fr = (offset >> 1) & 3; // fractional mv data + const int hp = offset & 1; // high precision mv data + + // Sign + aom_write_symbol(w, sign, mvcomp->sign_cdf, 2); + + // Class + aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES); + + // Integer bits + if (mv_class == MV_CLASS_0) { + aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE); + } else { + int i; + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (i = 0; i < n; ++i) + aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2); + } + // Fractional bits + if (precision > MV_SUBPEL_NONE) { + aom_write_symbol( + w, fr, + mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, + MV_FP_SIZE); + } + + // High precision bit + if (precision > MV_SUBPEL_LOW_PRECISION) + aom_write_symbol( + w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, + 2); +} + +/* TODO(siekyleb@amazon.com): This function writes MV_VALS ints or 128 KiB. This + * is more than most L1D caches and is a significant chunk of L2. Write + * SIMD that uses streaming writes to avoid loading all of that into L1, or + * just don't update the larger component costs every time this called + * (or both). 
+ */ +void av1_build_nmv_component_cost_table(int *mvcost, + const nmv_component *const mvcomp, + MvSubpelPrecision precision) { + int i, j, v, o, mantissa; + int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE]; + int bits_cost[MV_OFFSET_BITS][2]; + int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE] = { 0 }, + fp_cost[MV_FP_SIZE] = { 0 }; + int class0_hp_cost[2] = { 0 }, hp_cost[2] = { 0 }; + + av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL); + av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL); + av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL); + for (i = 0; i < MV_OFFSET_BITS; ++i) { + av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL); + } + + if (precision > MV_SUBPEL_NONE) { + for (i = 0; i < CLASS0_SIZE; ++i) + av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], + NULL); + av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL); + } + + if (precision > MV_SUBPEL_LOW_PRECISION) { + av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL); + av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL); + } + + // Instead of accumulating the cost of each vector component's bits + // individually, compute the costs based on smaller vectors. Costs for + // [2^exp, 2 * 2^exp - 1] are calculated based on [0, 2^exp - 1] + // respectively. Offsets are maintained to swap both 1) class costs when + // treated as a complete vector component with the highest set bit when + // treated as a mantissa (significand) and 2) leading zeros to account for + // the current exponent. + + // Cost offsets + int cost_swap[MV_OFFSET_BITS] = { 0 }; + // Delta to convert positive vector to negative vector costs + int negate_sign = sign_cost[1] - sign_cost[0]; + + // Initialize with offsets to swap the class costs with the costs of the + // highest set bit. + for (i = 1; i < MV_OFFSET_BITS; ++i) { + cost_swap[i] = bits_cost[i - 1][1]; + if (i > CLASS0_BITS) cost_swap[i] -= class_cost[i - CLASS0_BITS]; + } + + // Seed the fractional costs onto the output (overwritten latter). + for (o = 0; o < MV_FP_SIZE; ++o) { + int hp; + for (hp = 0; hp < 2; ++hp) { + v = 2 * o + hp + 1; + mvcost[v] = fp_cost[o] + hp_cost[hp] + sign_cost[0]; + } + } + + mvcost[0] = 0; + // Fill the costs for each exponent's vectors, using the costs set in the + // previous exponents. + for (i = 0; i < MV_OFFSET_BITS; ++i) { + const int exponent = (2 * MV_FP_SIZE) << i; + + int class = 0; + if (i >= CLASS0_BITS) { + class = class_cost[i - CLASS0_BITS + 1]; + } + + // Iterate through mantissas, keeping track of the location + // of the highest set bit for the mantissa. + // To be clear: in the outer loop, the position of the highest set bit + // (exponent) is tracked and, in this loop, the highest set bit of the + // mantissa is tracked. 
+ mantissa = 0; + for (j = 0; j <= i; ++j) { + for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) { + int cost = mvcost[mantissa + 1] + class + cost_swap[j]; + v = exponent + mantissa + 1; + mvcost[v] = cost; + mvcost[-v] = cost + negate_sign; + } + cost_swap[j] += bits_cost[i][0]; + } + } + + // Special case to avoid buffer overrun + { + int exponent = (2 * MV_FP_SIZE) << MV_OFFSET_BITS; + int class = class_cost[MV_CLASSES - 1]; + mantissa = 0; + for (j = 0; j < MV_OFFSET_BITS; ++j) { + for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) { + int cost = mvcost[mantissa + 1] + class + cost_swap[j]; + v = exponent + mantissa + 1; + mvcost[v] = cost; + mvcost[-v] = cost + negate_sign; + } + } + // At this point: mantissa = exponent >> 1 + + // Manually calculate the final cost offset + int cost_swap_hi = + bits_cost[MV_OFFSET_BITS - 1][1] - class_cost[MV_CLASSES - 2]; + for (; mantissa < exponent - 1; ++mantissa) { + int cost = mvcost[mantissa + 1] + class + cost_swap_hi; + v = exponent + mantissa + 1; + mvcost[v] = cost; + mvcost[-v] = cost + negate_sign; + } + } + + // Fill costs for class0 vectors, overwriting previous placeholder values + // used for calculating the costs of the larger vectors. + for (i = 0; i < CLASS0_SIZE; ++i) { + const int top = i * 2 * MV_FP_SIZE; + for (o = 0; o < MV_FP_SIZE; ++o) { + int hp; + int cost = class0_fp_cost[i][o] + class_cost[0] + class0_cost[i]; + for (hp = 0; hp < 2; ++hp) { + v = top + 2 * o + hp + 1; + mvcost[v] = cost + class0_hp_cost[hp] + sign_cost[0]; + mvcost[-v] = cost + class0_hp_cost[hp] + sign_cost[1]; + } + } + } +} + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, + const MV *ref, nmv_context *mvctx, int usehp) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + // If the mv_diff is zero, then we should have used near or nearest instead. + assert(j != MV_JOINT_ZERO); + if (cpi->common.features.cur_frame_force_integer_mv) { + usehp = MV_SUBPEL_NONE; + } + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); + + // If auto_mv_step_size is enabled then keep track of the largest + // motion vector component used. + if (cpi->sf.mv_sf.auto_mv_step_size) { + int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3; + td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude); + } +} + +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx) { + // DV and ref DV should not have sub-pel. 
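+  // Motion/displacement vectors are stored in 1/8-pel units, so a full-pel
+  // intra block copy vector must have its three low bits clear; that is what
+  // the asserts below check.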
+ assert((mv->col & 7) == 0); + assert((mv->row & 7) == 0); + assert((ref->col & 7) == 0); + assert((ref->row & 7) == 0); + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE); +} + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *ctx, + MvSubpelPrecision precision) { + av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL); + av1_build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision); + av1_build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision); +} + +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = + mbmi_ext->ref_mv_stack[ref_frame_type]; + + if (ref_frame[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type] + ? curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext->global_mvs[ref_frame_type]; +} + +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + &x->mbmi_ext); +} + +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer) { + const int ref_idx = 0; + MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext); + lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer); + *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext); + lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer); +} diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h new file mode 100644 index 0000000000..c39001a5a2 --- /dev/null +++ b/third_party/aom/av1/encoder/encodemv.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_ENCODEMV_H_ +#define AOM_AV1_ENCODER_ENCODEMV_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, + const MV *ref, nmv_context *mvctx, int usehp); + +void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, + MvSubpelPrecision precision); + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *mvctx, + MvSubpelPrecision precision); +void av1_build_nmv_component_cost_table(int *mvcost, + const nmv_component *const mvcomp, + MvSubpelPrecision precision); + +void av1_update_mv_count(ThreadData *td); + +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx); +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx); +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext); +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer); + +static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { + // row: Z col: Z | MV_JOINT_ZERO (0) + // row: Z col: NZ | MV_JOINT_HNZVZ (1) + // row: NZ col: Z | MV_JOINT_HZVNZ (2) + // row: NZ col: NZ | MV_JOINT_HNZVNZ (3) + return (!!mv->col) | ((!!mv->row) << 1); +} + +static INLINE int av1_mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} + +// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0. +static INLINE uint8_t av1_log_in_base_2(unsigned int n) { + // get_msb() is only valid when n != 0. + return n == 0 ? 0 : get_msb(n); +} + +static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { + assert(z >= 0); + const MV_CLASS_TYPE c = (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3); + assert(c <= MV_CLASS_10); + if (offset) *offset = z - av1_mv_class_base(c); + return c; +} + +static INLINE int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm, + MACROBLOCK *const x) { + (void)cm; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + if (this_mode == NEW_NEWMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); + if (mbmi->mv[0].as_int == ref_mv_0.as_int || + mbmi->mv[1].as_int == ref_mv_1.as_int) { + return 0; + } + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); + if (mbmi->mv[1].as_int == ref_mv_1.as_int) { + return 0; + } + } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + if (mbmi->mv[0].as_int == ref_mv_0.as_int) { + return 0; + } + } else if (this_mode == NEWMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + if (mbmi->mv[0].as_int == ref_mv_0.as_int) { + return 0; + } + } + return 1; +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEMV_H_ diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c new file mode 100644 index 0000000000..4732ad435b --- /dev/null +++ b/third_party/aom/av1/encoder/encoder.c @@ -0,0 +1,5409 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include +#include + +#include "av1/common/scale.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aomcx.h" + +#if CONFIG_DENOISE +#include "aom_dsp/grain_table.h" +#include "aom_dsp/noise_util.h" +#include "aom_dsp/noise_model.h" +#endif +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_dsp/psnr.h" +#if CONFIG_INTERNAL_STATS +#include "aom_dsp/ssim.h" +#endif +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_scale/aom_scale.h" +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#include "av1/common/alloccommon.h" +#include "av1/common/filter.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/resize.h" +#include "av1/common/tile_common.h" + +#include "av1/encoder/allintra_vis.h" +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/dwt.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/encoder_utils.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/hash_motion.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/intra_mode_search.h" +#include "av1/encoder/mv_prec.h" +#include "av1/encoder/pass2_strategy.h" +#include "av1/encoder/pickcdef.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/pickrst.h" +#include "av1/encoder/random.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rc_utils.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#if CONFIG_SALIENCY_MAP +#include "av1/encoder/saliency_map.h" +#endif +#include "av1/encoder/segmentation.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/superres_scale.h" +#include "av1/encoder/thirdpass.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/var_based_part.h" + +#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 + +// #define OUTPUT_YUV_REC +#ifdef OUTPUT_YUV_REC +FILE *yuv_rec_file; +#define FILE_NAME_LEN 100 +#endif + +#ifdef OUTPUT_YUV_DENOISED +FILE *yuv_denoised_file = NULL; +#endif + +static INLINE void Scale2Ratio(AOM_SCALING_MODE mode, int *hr, int *hs) { + switch (mode) { + case AOME_NORMAL: + *hr = 1; + *hs = 1; + break; + case AOME_FOURFIVE: + *hr = 4; + *hs = 5; + break; + case AOME_THREEFIVE: + *hr = 3; + *hs = 5; + break; + case AOME_THREEFOUR: + *hr = 3; + *hs = 4; + break; + case AOME_ONEFOUR: + *hr = 1; + *hs = 4; + break; + case AOME_ONEEIGHT: + *hr = 1; + *hs = 8; + break; + case AOME_ONETWO: + *hr = 1; + *hs = 2; + break; + case AOME_TWOTHREE: + *hr = 2; + *hs = 3; + break; + case AOME_ONETHREE: + *hr = 1; + *hs = 3; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } +} + +int av1_set_active_map(AV1_COMP *cpi, unsigned char 
*new_map_16x16, int rows, + int cols) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + if (rows == mi_params->mb_rows && cols == mi_params->mb_cols) { + unsigned char *const active_map_4x4 = cpi->active_map.map; + const int mi_rows = mi_params->mi_rows; + const int mi_cols = mi_params->mi_cols; + const int row_scale = mi_size_high_log2[BLOCK_16X16]; + const int col_scale = mi_size_wide_log2[BLOCK_16X16]; + cpi->active_map.update = 0; + assert(mi_rows % 2 == 0); + assert(mi_cols % 2 == 0); + if (new_map_16x16) { + for (int r = 0; r < (mi_rows >> row_scale); ++r) { + for (int c = 0; c < (mi_cols >> col_scale); ++c) { + const uint8_t val = new_map_16x16[r * cols + c] + ? AM_SEGMENT_ID_ACTIVE + : AM_SEGMENT_ID_INACTIVE; + active_map_4x4[(2 * r + 0) * mi_cols + (c + 0)] = val; + active_map_4x4[(2 * r + 0) * mi_cols + (c + 1)] = val; + active_map_4x4[(2 * r + 1) * mi_cols + (c + 0)] = val; + active_map_4x4[(2 * r + 1) * mi_cols + (c + 1)] = val; + } + } + cpi->active_map.enabled = 1; + } + return 0; + } + + return -1; +} + +int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + if (rows == mi_params->mb_rows && cols == mi_params->mb_cols && + new_map_16x16) { + unsigned char *const seg_map_8x8 = cpi->enc_seg.map; + const int mi_rows = mi_params->mi_rows; + const int mi_cols = mi_params->mi_cols; + const int row_scale = mi_size_high_log2[BLOCK_16X16]; + const int col_scale = mi_size_wide_log2[BLOCK_16X16]; + assert(mi_rows % 2 == 0); + assert(mi_cols % 2 == 0); + + memset(new_map_16x16, !cpi->active_map.enabled, rows * cols); + if (cpi->active_map.enabled) { + for (int r = 0; r < (mi_rows >> row_scale); ++r) { + for (int c = 0; c < (mi_cols >> col_scale); ++c) { + // Cyclic refresh segments are considered active despite not having + // AM_SEGMENT_ID_ACTIVE + uint8_t temp = 0; + temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 0)] != + AM_SEGMENT_ID_INACTIVE; + temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 1)] != + AM_SEGMENT_ID_INACTIVE; + temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 0)] != + AM_SEGMENT_ID_INACTIVE; + temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 1)] != + AM_SEGMENT_ID_INACTIVE; + new_map_16x16[r * cols + c] |= temp; + } + } + } + return 0; + } + + return -1; +} + +void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage) { + bool is_allintra = usage == ALLINTRA; + + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_me_luts(); + if (!is_allintra) av1_init_wedge_masks(); + if (!is_allintra || end_usage != AOM_Q) av1_rc_init_minq_luts(); +} + +void av1_new_framerate(AV1_COMP *cpi, double framerate) { + cpi->framerate = framerate < 0.1 ? 30 : framerate; + av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); +} + +double av1_get_compression_ratio(const AV1_COMMON *const cm, + size_t encoded_frame_size) { + const int upscaled_width = cm->superres_upscaled_width; + const int height = cm->height; + const int64_t luma_pic_size = (int64_t)upscaled_width * height; + const SequenceHeader *const seq_params = cm->seq_params; + const BITSTREAM_PROFILE profile = seq_params->profile; + const int pic_size_profile_factor = + profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36); + encoded_frame_size = + (encoded_frame_size > 129 ? 
encoded_frame_size - 128 : 1); + const int64_t uncompressed_frame_size = + (luma_pic_size * pic_size_profile_factor) >> 3; + return (double)uncompressed_frame_size / encoded_frame_size; +} + +static void auto_tile_size_balancing(AV1_COMMON *const cm, int num_sbs, + int num_tiles_lg, int tile_col_row) { + CommonTileParams *const tiles = &cm->tiles; + int i, start_sb; + int size_sb = num_sbs >> num_tiles_lg; + int res_sbs = num_sbs - (size_sb << num_tiles_lg); + int num_tiles = 1 << num_tiles_lg; + int inc_index = num_tiles - res_sbs; + + tiles->uniform_spacing = 0; + + for (i = 0, start_sb = 0; start_sb < num_sbs && i < MAX_TILE_COLS; ++i) { + if (i == inc_index) ++size_sb; + if (tile_col_row) + tiles->col_start_sb[i] = start_sb; + else + tiles->row_start_sb[i] = start_sb; + + start_sb += AOMMIN(size_sb, tiles->max_width_sb); + } + + if (tile_col_row) { + tiles->cols = i; + tiles->col_start_sb[i] = num_sbs; + } else { + tiles->rows = i; + tiles->row_start_sb[i] = num_sbs; + } +} + +static void set_tile_info(AV1_COMMON *const cm, + const TileConfig *const tile_cfg) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + int i, start_sb; + + av1_get_tile_limits(cm); + + int sb_cols = + CEIL_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2); + // configure tile columns + if (tile_cfg->tile_width_count == 0 || tile_cfg->tile_height_count == 0) { + tiles->uniform_spacing = 1; + tiles->log2_cols = AOMMAX(tile_cfg->tile_columns, tiles->min_log2_cols); + // Add a special case to handle super resolution + sb_cols = coded_to_superres_mi(sb_cols, cm->superres_scale_denominator); + int min_log2_cols = 0; + for (; (tiles->max_width_sb << min_log2_cols) <= sb_cols; ++min_log2_cols) { + } + tiles->log2_cols = AOMMAX(tiles->log2_cols, min_log2_cols); + + tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols); + } else if (tile_cfg->tile_widths[0] < 0) { + auto_tile_size_balancing(cm, sb_cols, tile_cfg->tile_columns, 1); + } else { + int size_sb, j = 0; + tiles->uniform_spacing = 0; + for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) { + tiles->col_start_sb[i] = start_sb; + size_sb = tile_cfg->tile_widths[j++]; + if (j >= tile_cfg->tile_width_count) j = 0; + start_sb += AOMMIN(size_sb, tiles->max_width_sb); + } + tiles->cols = i; + tiles->col_start_sb[i] = sb_cols; + } + av1_calculate_tile_cols(seq_params, mi_params->mi_rows, mi_params->mi_cols, + tiles); + + // configure tile rows + int sb_rows = + CEIL_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2); + if (tiles->uniform_spacing) { + tiles->log2_rows = AOMMAX(tile_cfg->tile_rows, tiles->min_log2_rows); + tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows); + } else if (tile_cfg->tile_heights[0] < 0) { + auto_tile_size_balancing(cm, sb_rows, tile_cfg->tile_rows, 0); + } else { + int size_sb, j = 0; + for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) { + tiles->row_start_sb[i] = start_sb; + size_sb = tile_cfg->tile_heights[j++]; + if (j >= tile_cfg->tile_height_count) j = 0; + start_sb += AOMMIN(size_sb, tiles->max_height_sb); + } + tiles->rows = i; + tiles->row_start_sb[i] = sb_rows; + } + av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles); +} + +void av1_update_frame_size(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + // Setup mi_params here in case we need more mi's. 
+ CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->set_mb_mi(mi_params, cm->width, cm->height, + cpi->sf.part_sf.default_min_partition_size); + + av1_init_macroblockd(cm, xd); + + if (!cpi->ppi->seq_params_locked) + set_sb_size(cm->seq_params, + av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, + cpi->ppi->number_spatial_layers)); + + set_tile_info(cm, &cpi->oxcf.tile_cfg); +} + +static INLINE int does_level_match(int width, int height, double fps, + int lvl_width, int lvl_height, + double lvl_fps, int lvl_dim_mult) { + const int64_t lvl_luma_pels = (int64_t)lvl_width * lvl_height; + const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps; + const int64_t luma_pels = (int64_t)width * height; + const double display_sample_rate = luma_pels * fps; + return luma_pels <= lvl_luma_pels && + display_sample_rate <= lvl_display_sample_rate && + width <= lvl_width * lvl_dim_mult && + height <= lvl_height * lvl_dim_mult; +} + +static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width, + int height, double init_framerate) { + SequenceHeader *const seq_params = &ppi->seq_params; + const AV1LevelParams *const level_params = &ppi->level_params; + // TODO(any): This is a placeholder function that only addresses dimensions + // and max display sample rates. + // Need to add checks for max bit rate, max decoded luma sample rate, header + // rate, etc. that are not covered by this function. + AV1_LEVEL level = SEQ_LEVEL_MAX; + if (does_level_match(width, height, init_framerate, 512, 288, 30.0, 4)) { + level = SEQ_LEVEL_2_0; + } else if (does_level_match(width, height, init_framerate, 704, 396, 30.0, + 4)) { + level = SEQ_LEVEL_2_1; + } else if (does_level_match(width, height, init_framerate, 1088, 612, 30.0, + 4)) { + level = SEQ_LEVEL_3_0; + } else if (does_level_match(width, height, init_framerate, 1376, 774, 30.0, + 4)) { + level = SEQ_LEVEL_3_1; + } else if (does_level_match(width, height, init_framerate, 2048, 1152, 30.0, + 3)) { + level = SEQ_LEVEL_4_0; + } else if (does_level_match(width, height, init_framerate, 2048, 1152, 60.0, + 3)) { + level = SEQ_LEVEL_4_1; + } else if (does_level_match(width, height, init_framerate, 4096, 2176, 30.0, + 2)) { + level = SEQ_LEVEL_5_0; + } else if (does_level_match(width, height, init_framerate, 4096, 2176, 60.0, + 2)) { + level = SEQ_LEVEL_5_1; + } else if (does_level_match(width, height, init_framerate, 4096, 2176, 120.0, + 2)) { + level = SEQ_LEVEL_5_2; + } else if (does_level_match(width, height, init_framerate, 8192, 4352, 30.0, + 2)) { + level = SEQ_LEVEL_6_0; + } else if (does_level_match(width, height, init_framerate, 8192, 4352, 60.0, + 2)) { + level = SEQ_LEVEL_6_1; + } else if (does_level_match(width, height, init_framerate, 8192, 4352, 120.0, + 2)) { + level = SEQ_LEVEL_6_2; + } +#if CONFIG_CWG_C013 + // TODO(bohanli): currently target level is only working for the 0th operating + // point, so scalable coding is not supported. + else if (level_params->target_seq_level_idx[0] >= SEQ_LEVEL_7_0 && + level_params->target_seq_level_idx[0] <= SEQ_LEVEL_8_3) { + // Only use level 7.x to 8.x when explicitly asked to. 
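+    // For example, a hypothetical 15360x8640 input at 30 fps has
+    // luma_pels = 132,710,400 <= 16384 * 8704 = 142,606,336 and a display
+    // sample rate below 16384 * 8704 * 30.0, so the first check below
+    // matches and the level becomes SEQ_LEVEL_7_0.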
+ if (does_level_match(width, height, init_framerate, 16384, 8704, 30.0, 2)) { + level = SEQ_LEVEL_7_0; + } else if (does_level_match(width, height, init_framerate, 16384, 8704, + 60.0, 2)) { + level = SEQ_LEVEL_7_1; + } else if (does_level_match(width, height, init_framerate, 16384, 8704, + 120.0, 2)) { + level = SEQ_LEVEL_7_2; + } else if (does_level_match(width, height, init_framerate, 32768, 17408, + 30.0, 2)) { + level = SEQ_LEVEL_8_0; + } else if (does_level_match(width, height, init_framerate, 32768, 17408, + 60.0, 2)) { + level = SEQ_LEVEL_8_1; + } else if (does_level_match(width, height, init_framerate, 32768, 17408, + 120.0, 2)) { + level = SEQ_LEVEL_8_2; + } + } +#endif + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + assert(is_valid_seq_level_idx(level_params->target_seq_level_idx[i]) || + level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS); + // If a higher target level is specified, it is then used rather than the + // inferred one from resolution and framerate. + seq_params->seq_level_idx[i] = + level_params->target_seq_level_idx[i] < SEQ_LEVELS && + level_params->target_seq_level_idx[i] > level + ? level_params->target_seq_level_idx[i] + : level; + // Set the maximum parameters for bitrate and buffer size for this profile, + // level, and tier + seq_params->op_params[i].bitrate = av1_max_level_bitrate( + seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]); + // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the + // check + if (seq_params->op_params[i].bitrate == 0) + aom_internal_error( + &ppi->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of profile, level, and tier."); + // Buffer size in bits/s is bitrate in bits/s * 1 s + seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; + } +} + +void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, + const AV1EncoderConfig *oxcf, + int disable_frame_id_numbers) { + SequenceHeader *const seq = &ppi->seq_params; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; + const ToolCfg *const tool_cfg = &oxcf->tool_cfg; + + seq->still_picture = + !tool_cfg->force_video_mode && (oxcf->input_cfg.limit == 1); + seq->reduced_still_picture_hdr = + seq->still_picture && !tool_cfg->full_still_picture_hdr; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + seq->order_hint_info.enable_order_hint = tool_cfg->enable_order_hint; + seq->frame_id_numbers_present_flag = + !seq->reduced_still_picture_hdr && + !oxcf->tile_cfg.enable_large_scale_tile && + tool_cfg->error_resilient_mode && !disable_frame_id_numbers; + if (seq->reduced_still_picture_hdr) { + seq->order_hint_info.enable_order_hint = 0; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + } + seq->order_hint_info.order_hint_bits_minus_1 = + seq->order_hint_info.enable_order_hint + ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 + : -1; + + seq->max_frame_width = frm_dim_cfg->forced_max_frame_width + ? frm_dim_cfg->forced_max_frame_width + : frm_dim_cfg->width; + seq->max_frame_height = frm_dim_cfg->forced_max_frame_height + ? frm_dim_cfg->forced_max_frame_height + : frm_dim_cfg->height; + seq->num_bits_width = + (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1; + seq->num_bits_height = + (seq->max_frame_height > 1) ? 
get_msb(seq->max_frame_height - 1) + 1 : 1; + assert(seq->num_bits_width <= 16); + assert(seq->num_bits_height <= 16); + + seq->frame_id_length = FRAME_ID_LENGTH; + seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH; + + seq->enable_dual_filter = tool_cfg->enable_dual_filter; + seq->order_hint_info.enable_dist_wtd_comp = + oxcf->comp_type_cfg.enable_dist_wtd_comp; + seq->order_hint_info.enable_dist_wtd_comp &= + seq->order_hint_info.enable_order_hint; + seq->order_hint_info.enable_ref_frame_mvs = tool_cfg->ref_frame_mvs_present; + seq->order_hint_info.enable_ref_frame_mvs &= + seq->order_hint_info.enable_order_hint; + seq->enable_superres = oxcf->superres_cfg.enable_superres; + seq->enable_cdef = tool_cfg->cdef_control != CDEF_NONE ? 1 : 0; + seq->enable_restoration = tool_cfg->enable_restoration; + seq->enable_warped_motion = oxcf->motion_mode_cfg.enable_warped_motion; + seq->enable_interintra_compound = tool_cfg->enable_interintra_comp; + seq->enable_masked_compound = oxcf->comp_type_cfg.enable_masked_comp; + seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter; + seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra; + + set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height, + oxcf->input_cfg.init_framerate); + + if (seq->operating_points_cnt_minus_1 == 0) { + seq->operating_point_idc[0] = 0; + } else { + // Set operating_point_idc[] such that the i=0 point corresponds to the + // highest quality operating point (all layers), and subsequent + // operarting points (i > 0) are lower quality corresponding to + // skip decoding enhancement layers (temporal first). + int i = 0; + assert(seq->operating_points_cnt_minus_1 == + (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1)); + for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) { + for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) { + seq->operating_point_idc[i] = + (~(~0u << (ppi->number_spatial_layers - sl)) << 8) | + ~(~0u << (ppi->number_temporal_layers - tl)); + i++; + } + } + } +} + +static void init_config_sequence(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf) { + SequenceHeader *const seq_params = &ppi->seq_params; + const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; + const ColorCfg *const color_cfg = &oxcf->color_cfg; + + ppi->use_svc = 0; + ppi->number_spatial_layers = 1; + ppi->number_temporal_layers = 1; + + seq_params->profile = oxcf->profile; + seq_params->bit_depth = oxcf->tool_cfg.bit_depth; + seq_params->use_highbitdepth = oxcf->use_highbitdepth; + seq_params->color_primaries = color_cfg->color_primaries; + seq_params->transfer_characteristics = color_cfg->transfer_characteristics; + seq_params->matrix_coefficients = color_cfg->matrix_coefficients; + seq_params->monochrome = oxcf->tool_cfg.enable_monochrome; + seq_params->chroma_sample_position = color_cfg->chroma_sample_position; + seq_params->color_range = color_cfg->color_range; + seq_params->timing_info_present = dec_model_cfg->timing_info_present; + seq_params->timing_info.num_units_in_display_tick = + dec_model_cfg->timing_info.num_units_in_display_tick; + seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale; + seq_params->timing_info.equal_picture_interval = + dec_model_cfg->timing_info.equal_picture_interval; + seq_params->timing_info.num_ticks_per_picture = + dec_model_cfg->timing_info.num_ticks_per_picture; + + seq_params->display_model_info_present_flag = + dec_model_cfg->display_model_info_present_flag; + 
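+  // Note: when timing info is signalled, one display tick lasts
+  // num_units_in_display_tick / time_scale seconds; e.g. a hypothetical
+  // time_scale of 30000 with num_units_in_display_tick of 1001 corresponds
+  // to 29.97 fps when equal_picture_interval is set.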
seq_params->decoder_model_info_present_flag = + dec_model_cfg->decoder_model_info_present_flag; + if (dec_model_cfg->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + seq_params->decoder_model_info.num_units_in_decoding_tick = + dec_model_cfg->num_units_in_decoding_tick; + ppi->buffer_removal_time_present = 1; + av1_set_aom_dec_model_info(&seq_params->decoder_model_info); + av1_set_dec_model_op_parameters(&seq_params->op_params[0]); + } else if (seq_params->timing_info_present && + seq_params->timing_info.equal_picture_interval && + !seq_params->decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + av1_set_resource_availability_parameters(&seq_params->op_params[0]); + } else { + seq_params->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } + + if (seq_params->monochrome) { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 1; + } else if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + seq_params->subsampling_x = 0; + seq_params->subsampling_y = 0; + } else { + if (seq_params->profile == 0) { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 1; + } else if (seq_params->profile == 1) { + seq_params->subsampling_x = 0; + seq_params->subsampling_y = 0; + } else { + if (seq_params->bit_depth == AOM_BITS_12) { + seq_params->subsampling_x = oxcf->input_cfg.chroma_subsampling_x; + seq_params->subsampling_y = oxcf->input_cfg.chroma_subsampling_y; + } else { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 0; + } + } + } + av1_change_config_seq(ppi, oxcf, NULL); +} + +static void init_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + + cpi->oxcf = *oxcf; + cpi->framerate = oxcf->input_cfg.init_framerate; + + cm->width = oxcf->frm_dim_cfg.width; + cm->height = oxcf->frm_dim_cfg.height; + cpi->is_dropped_frame = false; + + alloc_compressor_data(cpi); + + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; + cpi->frame_size_related_setup_done = false; + + // Single thread case: use counts in common. + cpi->td.counts = &cpi->counts; + + // Init SVC parameters. + cpi->svc.number_spatial_layers = 1; + cpi->svc.number_temporal_layers = 1; + cm->spatial_layer_id = 0; + cm->temporal_layer_id = 0; + // Init rtc_ref parameters. 
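+  // These defaults mean the application has not (yet) supplied an explicit
+  // per-frame reference structure; RTC/SVC callers typically override them
+  // later through the SVC reference-frame-config controls.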
+ cpi->ppi->rtc_ref.set_ref_frame_config = 0; + cpi->ppi->rtc_ref.non_reference_frame = 0; + cpi->ppi->rtc_ref.ref_frame_comp[0] = 0; + cpi->ppi->rtc_ref.ref_frame_comp[1] = 0; + cpi->ppi->rtc_ref.ref_frame_comp[2] = 0; + + // change includes all joint functionality + av1_change_config(cpi, oxcf, false); + + cpi->ref_frame_flags = 0; + + // Reset resize pending flags + resize_pending_params->width = 0; + resize_pending_params->height = 0; + + // Setup identity scale factor + av1_setup_scale_factors_for_frame(&cm->sf_identity, 1, 1, 1, 1); + + init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx); + + av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); +} + +void av1_change_config_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf, + bool *is_sb_size_changed) { + SequenceHeader *const seq_params = &ppi->seq_params; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; + const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; + const ColorCfg *const color_cfg = &oxcf->color_cfg; + + if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile; + seq_params->bit_depth = oxcf->tool_cfg.bit_depth; + seq_params->color_primaries = color_cfg->color_primaries; + seq_params->transfer_characteristics = color_cfg->transfer_characteristics; + seq_params->matrix_coefficients = color_cfg->matrix_coefficients; + seq_params->monochrome = oxcf->tool_cfg.enable_monochrome; + seq_params->chroma_sample_position = color_cfg->chroma_sample_position; + seq_params->color_range = color_cfg->color_range; + + assert(IMPLIES(seq_params->profile <= PROFILE_1, + seq_params->bit_depth <= AOM_BITS_10)); + + seq_params->timing_info_present = dec_model_cfg->timing_info_present; + seq_params->timing_info.num_units_in_display_tick = + dec_model_cfg->timing_info.num_units_in_display_tick; + seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale; + seq_params->timing_info.equal_picture_interval = + dec_model_cfg->timing_info.equal_picture_interval; + seq_params->timing_info.num_ticks_per_picture = + dec_model_cfg->timing_info.num_ticks_per_picture; + + seq_params->display_model_info_present_flag = + dec_model_cfg->display_model_info_present_flag; + seq_params->decoder_model_info_present_flag = + dec_model_cfg->decoder_model_info_present_flag; + if (dec_model_cfg->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + seq_params->decoder_model_info.num_units_in_decoding_tick = + dec_model_cfg->num_units_in_decoding_tick; + ppi->buffer_removal_time_present = 1; + av1_set_aom_dec_model_info(&seq_params->decoder_model_info); + av1_set_dec_model_op_parameters(&seq_params->op_params[0]); + } else if (seq_params->timing_info_present && + seq_params->timing_info.equal_picture_interval && + !seq_params->decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + av1_set_resource_availability_parameters(&seq_params->op_params[0]); + } else { + seq_params->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } + + av1_update_film_grain_parameters_seq(ppi, oxcf); + + int sb_size = seq_params->sb_size; + // Superblock size should not be updated after the first key frame. 
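+  // Note: AV1 only defines 64x64 and 128x128 superblocks, so
+  // av1_select_sb_size() returns either BLOCK_64X64 or BLOCK_128X128; the
+  // sb_size saved above is compared against the new selection below to
+  // report a superblock-size change to the caller.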
+ if (!ppi->seq_params_locked) { + set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width, + frm_dim_cfg->height, + ppi->number_spatial_layers)); + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) + seq_params->tier[i] = (oxcf->tier_mask >> i) & 1; + } + if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size) + *is_sb_size_changed = true; + + // Init sequence level coding tools + // This should not be called after the first key frame. + if (!ppi->seq_params_locked) { + seq_params->operating_points_cnt_minus_1 = + (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) + ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1 + : 0; + av1_init_seq_coding_tools( + ppi, oxcf, ppi->use_svc || ppi->rtc_ref.set_ref_frame_config); + } + seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr; + +#if CONFIG_AV1_HIGHBITDEPTH + highbd_set_var_fns(ppi); +#endif + + set_primary_rc_buffer_sizes(oxcf, ppi); +} + +void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf, + bool is_sb_size_changed) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + MACROBLOCK *const x = &cpi->td.mb; + AV1LevelParams *const level_params = &cpi->ppi->level_params; + RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + FeatureFlags *const features = &cm->features; + + // in case of LAP, lag in frames is set according to number of lap buffers + // calculated at init time. This stores and restores LAP's lag in frames to + // prevent override by new cfg. + int lap_lag_in_frames = -1; + if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) { + lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames; + } + + cpi->oxcf = *oxcf; + + av1_update_film_grain_parameters(cpi, oxcf); + + // When user provides superres_mode = AOM_SUPERRES_AUTO, we still initialize + // superres mode for current encoding = AOM_SUPERRES_NONE. This is to ensure + // that any analysis (e.g. TPL) happening outside the main encoding loop still + // happens at full resolution. + // This value will later be set appropriately just before main encoding loop. + cpi->superres_mode = oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO + ? AOM_SUPERRES_NONE + : oxcf->superres_cfg.superres_mode; // default + x->e_mbd.bd = (int)seq_params->bit_depth; + x->e_mbd.global_motion = cm->global_motion; + + memcpy(level_params->target_seq_level_idx, cpi->oxcf.target_seq_level_idx, + sizeof(level_params->target_seq_level_idx)); + level_params->keep_level_stats = 0; + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + if (level_params->target_seq_level_idx[i] < SEQ_LEVELS || + level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS) { + level_params->keep_level_stats |= 1u << i; + if (!level_params->level_info[i]) { + CHECK_MEM_ERROR(cm, level_params->level_info[i], + aom_calloc(1, sizeof(*level_params->level_info[i]))); + } + } + } + + // TODO(huisu@): level targeting currently only works for the 0th operating + // point, so scalable coding is not supported yet. + if (level_params->target_seq_level_idx[0] < SEQ_LEVELS) { + // Adjust encoder config in order to meet target level. 
+ config_target_level(cpi, level_params->target_seq_level_idx[0], + seq_params->tier[0]); + } + + if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) { + p_rc->baseline_gf_interval = FIXED_GF_INTERVAL; + } else if (!is_one_pass_rt_params(cpi) || + cm->current_frame.frame_number == 0) { + // For rtc mode: logic for setting the baseline_gf_interval is done + // in av1_get_one_pass_rt_params(), and it should not be reset here in + // change_config(), unless after init_config (first frame). + p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; + } + + refresh_frame->golden_frame = false; + refresh_frame->bwd_ref_frame = false; + + features->refresh_frame_context = + (oxcf->tool_cfg.frame_parallel_decoding_mode) + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->tile_cfg.enable_large_scale_tile) + features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + + if (x->palette_buffer == NULL) { + CHECK_MEM_ERROR(cm, x->palette_buffer, + aom_memalign(16, sizeof(*x->palette_buffer))); + } + + if (x->tmp_conv_dst == NULL) { + CHECK_MEM_ERROR( + cm, x->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst))); + x->e_mbd.tmp_conv_dst = x->tmp_conv_dst; + } + // The buffers 'tmp_pred_bufs[]' and 'comp_rd_buffer' are used in inter frames + // to store intermediate inter mode prediction results and are not required + // for allintra encoding mode. Hence, the memory allocations for these buffers + // are avoided for allintra encoding mode. + if (cpi->oxcf.kf_cfg.key_freq_max != 0) { + if (x->comp_rd_buffer.pred0 == NULL) + alloc_compound_type_rd_buffers(cm->error, &x->comp_rd_buffer); + + for (int i = 0; i < 2; ++i) { + if (x->tmp_pred_bufs[i] == NULL) { + CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i], + aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*x->tmp_pred_bufs[i]))); + x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i]; + } + } + } + + av1_reset_segment_features(cm); + + av1_set_high_precision_mv(cpi, 1, 0); + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + p_rc->bits_off_target = + AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size); + p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size); + + // Set up frame rate and related parameters rate control values. + av1_new_framerate(cpi, cpi->framerate); + + // Set absolute upper and lower quality limits + rc->worst_quality = rc_cfg->worst_allowed_q; + rc->best_quality = rc_cfg->best_allowed_q; + + // If lossless has been requested make sure average Q accumulators are reset. + if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { + int i; + for (i = 0; i < FRAME_TYPES; ++i) { + p_rc->avg_frame_qindex[i] = 0; + } + } + + features->interp_filter = + oxcf->tile_cfg.enable_large_scale_tile ? 
EIGHTTAP_REGULAR : SWITCHABLE; + features->switchable_motion_mode = is_switchable_motion_mode_allowed( + features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc); + + if (frm_dim_cfg->render_width > 0 && frm_dim_cfg->render_height > 0) { + cm->render_width = frm_dim_cfg->render_width; + cm->render_height = frm_dim_cfg->render_height; + } else { + cm->render_width = frm_dim_cfg->width; + cm->render_height = frm_dim_cfg->height; + } + cm->width = frm_dim_cfg->width; + cm->height = frm_dim_cfg->height; + + if (cm->width > cpi->data_alloc_width || + cm->height > cpi->data_alloc_height || is_sb_size_changed) { + av1_free_context_buffers(cm); + av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); + av1_free_sms_tree(&cpi->td); + av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm)); + cpi->td.firstpass_ctx = NULL; + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; + cpi->frame_size_related_setup_done = false; + } + av1_update_frame_size(cpi); + + rc->is_src_frame_alt_ref = 0; + + if (!cpi->ppi->rtc_ref.set_ref_frame_config) + cpi->ext_flags.refresh_frame.update_pending = 0; + cpi->ext_flags.refresh_frame_context_pending = 0; + + if (cpi->ppi->use_svc) + av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth); + + check_reset_rc_flag(cpi); + + // restore the value of lag_in_frame for LAP stage. + if (lap_lag_in_frames != -1) { + cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; + } + +#if CONFIG_REALTIME_ONLY + assert(!oxcf->tool_cfg.enable_global_motion); + cpi->image_pyramid_levels = 0; +#else + if (oxcf->tool_cfg.enable_global_motion) { + cpi->image_pyramid_levels = + global_motion_pyr_levels[default_global_motion_method]; + } else { + cpi->image_pyramid_levels = 0; + } +#endif // CONFIG_REALTIME_ONLY +} + +static INLINE void init_frame_info(FRAME_INFO *frame_info, + const AV1_COMMON *const cm) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = cm->seq_params; + frame_info->frame_width = cm->width; + frame_info->frame_height = cm->height; + frame_info->mi_cols = mi_params->mi_cols; + frame_info->mi_rows = mi_params->mi_rows; + frame_info->mb_cols = mi_params->mb_cols; + frame_info->mb_rows = mi_params->mb_rows; + frame_info->num_mbs = mi_params->MBs; + frame_info->bit_depth = seq_params->bit_depth; + frame_info->subsampling_x = seq_params->subsampling_x; + frame_info->subsampling_y = seq_params->subsampling_y; +} + +static INLINE void init_frame_index_set(FRAME_INDEX_SET *frame_index_set) { + frame_index_set->show_frame_count = 0; +} + +static INLINE void update_counters_for_show_frame(AV1_COMP *const cpi) { + assert(cpi->common.show_frame); + cpi->frame_index_set.show_frame_count++; + cpi->common.current_frame.frame_number++; +} + +AV1_PRIMARY *av1_create_primary_compressor( + struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers, + const AV1EncoderConfig *oxcf) { + AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY)); + if (!ppi) return NULL; + av1_zero(*ppi); + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. 
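+  // Note: while error.setjmp is 1, aom_internal_error() longjmp()s back to
+  // this setjmp(), so any allocation failure reported below unwinds here,
+  // tears down the partially built AV1_PRIMARY and returns NULL.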
+ if (setjmp(ppi->error.jmp)) { + ppi->error.setjmp = 0; + av1_remove_primary_compressor(ppi); + return 0; + } + ppi->error.setjmp = 1; + + ppi->seq_params_locked = 0; + ppi->lap_enabled = num_lap_buffers > 0; + ppi->output_pkt_list = pkt_list_head; + ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS; + ppi->frames_left = oxcf->input_cfg.limit; + ppi->num_fp_contexts = 1; + + init_config_sequence(ppi, oxcf); + +#if CONFIG_ENTROPY_STATS + av1_zero(ppi->aggregate_fc); +#endif // CONFIG_ENTROPY_STATS + + av1_primary_rc_init(oxcf, &ppi->p_rc); + + // For two pass and lag_in_frames > 33 in LAP. + ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2; + if (ppi->lap_enabled) { + if ((num_lap_buffers < + (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) && + num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) { + /* + * For lag in frames >= 19 and <33, enable scenecut + * with limited future frame prediction. + */ + ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1; + } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) { + // Disable scenecut when lag_in_frames < 19. + ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT; + } + } + +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \ + ppi->fn_ptr[BT].sdf = SDF; \ + ppi->fn_ptr[BT].sdaf = SDAF; \ + ppi->fn_ptr[BT].vf = VF; \ + ppi->fn_ptr[BT].svf = SVF; \ + ppi->fn_ptr[BT].svaf = SVAF; \ + ppi->fn_ptr[BT].sdx4df = SDX4DF; \ + ppi->fn_ptr[BT].jsdaf = JSDAF; \ + ppi->fn_ptr[BT].jsvaf = JSVAF; \ + ppi->fn_ptr[BT].sdx3df = SDX3DF; + +// Realtime mode doesn't use 4x rectangular blocks. +#if !CONFIG_REALTIME_ONLY + BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16, + aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16, + aom_sad4x16x4d, aom_sad4x16x3d, aom_dist_wtd_sad4x16_avg, + aom_dist_wtd_sub_pixel_avg_variance4x16) + + BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4, + aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, + aom_sad16x4x4d, aom_sad16x4x3d, aom_dist_wtd_sad16x4_avg, + aom_dist_wtd_sub_pixel_avg_variance16x4) + + BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32, + aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, + aom_sad8x32x4d, aom_sad8x32x3d, aom_dist_wtd_sad8x32_avg, + aom_dist_wtd_sub_pixel_avg_variance8x32) + + BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8, + aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, + aom_sad32x8x4d, aom_sad32x8x3d, aom_dist_wtd_sad32x8_avg, + aom_dist_wtd_sub_pixel_avg_variance32x8) + + BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64, + aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, + aom_sad16x64x4d, aom_sad16x64x3d, aom_dist_wtd_sad16x64_avg, + aom_dist_wtd_sub_pixel_avg_variance16x64) + + BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16, + aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16, + aom_sad64x16x4d, aom_sad64x16x3d, aom_dist_wtd_sad64x16_avg, + aom_dist_wtd_sub_pixel_avg_variance64x16) +#endif // !CONFIG_REALTIME_ONLY + + BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128, + aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128, + aom_sad128x128x4d, aom_sad128x128x3d, aom_dist_wtd_sad128x128_avg, + aom_dist_wtd_sub_pixel_avg_variance128x128) + + BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64, + aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, + aom_sad128x64x4d, aom_sad128x64x3d, aom_dist_wtd_sad128x64_avg, + 
aom_dist_wtd_sub_pixel_avg_variance128x64) + + BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128, + aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, + aom_sad64x128x4d, aom_sad64x128x3d, aom_dist_wtd_sad64x128_avg, + aom_dist_wtd_sub_pixel_avg_variance64x128) + + BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16, + aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, + aom_sad32x16x4d, aom_sad32x16x3d, aom_dist_wtd_sad32x16_avg, + aom_dist_wtd_sub_pixel_avg_variance32x16) + + BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32, + aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, + aom_sad16x32x4d, aom_sad16x32x3d, aom_dist_wtd_sad16x32_avg, + aom_dist_wtd_sub_pixel_avg_variance16x32) + + BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32, + aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, + aom_sad64x32x4d, aom_sad64x32x3d, aom_dist_wtd_sad64x32_avg, + aom_dist_wtd_sub_pixel_avg_variance64x32) + + BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64, + aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, + aom_sad32x64x4d, aom_sad32x64x3d, aom_dist_wtd_sad32x64_avg, + aom_dist_wtd_sub_pixel_avg_variance32x64) + + BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32, + aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32, + aom_sad32x32x4d, aom_sad32x32x3d, aom_dist_wtd_sad32x32_avg, + aom_dist_wtd_sub_pixel_avg_variance32x32) + + BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64, + aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64, + aom_sad64x64x4d, aom_sad64x64x3d, aom_dist_wtd_sad64x64_avg, + aom_dist_wtd_sub_pixel_avg_variance64x64) + + BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16, + aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16, + aom_sad16x16x4d, aom_sad16x16x3d, aom_dist_wtd_sad16x16_avg, + aom_dist_wtd_sub_pixel_avg_variance16x16) + + BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8, + aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, + aom_sad16x8x4d, aom_sad16x8x3d, aom_dist_wtd_sad16x8_avg, + aom_dist_wtd_sub_pixel_avg_variance16x8) + + BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16, + aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, + aom_sad8x16x4d, aom_sad8x16x3d, aom_dist_wtd_sad8x16_avg, + aom_dist_wtd_sub_pixel_avg_variance8x16) + + BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8, + aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d, + aom_sad8x8x3d, aom_dist_wtd_sad8x8_avg, + aom_dist_wtd_sub_pixel_avg_variance8x8) + + BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4, + aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d, + aom_sad8x4x3d, aom_dist_wtd_sad8x4_avg, + aom_dist_wtd_sub_pixel_avg_variance8x4) + + BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8, + aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d, + aom_sad4x8x3d, aom_dist_wtd_sad4x8_avg, + aom_dist_wtd_sub_pixel_avg_variance4x8) + + BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4, + aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d, + aom_sad4x4x3d, aom_dist_wtd_sad4x4_avg, + aom_dist_wtd_sub_pixel_avg_variance4x4) + +#if !CONFIG_REALTIME_ONLY +#define OBFP(BT, OSDF, OVF, OSVF) \ + ppi->fn_ptr[BT].osdf = OSDF; \ + ppi->fn_ptr[BT].ovf = OVF; \ + ppi->fn_ptr[BT].osvf = OSVF; + + OBFP(BLOCK_128X128, 
aom_obmc_sad128x128, aom_obmc_variance128x128, + aom_obmc_sub_pixel_variance128x128) + OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64, + aom_obmc_sub_pixel_variance128x64) + OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128, + aom_obmc_sub_pixel_variance64x128) + OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64, + aom_obmc_sub_pixel_variance64x64) + OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32, + aom_obmc_sub_pixel_variance64x32) + OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64, + aom_obmc_sub_pixel_variance32x64) + OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32, + aom_obmc_sub_pixel_variance32x32) + OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16, + aom_obmc_sub_pixel_variance32x16) + OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32, + aom_obmc_sub_pixel_variance16x32) + OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16, + aom_obmc_sub_pixel_variance16x16) + OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8, + aom_obmc_sub_pixel_variance16x8) + OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16, + aom_obmc_sub_pixel_variance8x16) + OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8, + aom_obmc_sub_pixel_variance8x8) + OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8, + aom_obmc_sub_pixel_variance4x8) + OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4, + aom_obmc_sub_pixel_variance8x4) + OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4, + aom_obmc_sub_pixel_variance4x4) + OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16, + aom_obmc_sub_pixel_variance4x16) + OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4, + aom_obmc_sub_pixel_variance16x4) + OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32, + aom_obmc_sub_pixel_variance8x32) + OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8, + aom_obmc_sub_pixel_variance32x8) + OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64, + aom_obmc_sub_pixel_variance16x64) + OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16, + aom_obmc_sub_pixel_variance64x16) +#endif // !CONFIG_REALTIME_ONLY + +#define MBFP(BT, MCSDF, MCSVF) \ + ppi->fn_ptr[BT].msdf = MCSDF; \ + ppi->fn_ptr[BT].msvf = MCSVF; + + MBFP(BLOCK_128X128, aom_masked_sad128x128, + aom_masked_sub_pixel_variance128x128) + MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64) + MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128) + MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64) + MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32) + MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64) + MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32) + MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16) + MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32) + MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16) + MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8) + MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16) + MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8) + MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8) + MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4) + MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4) + +#if !CONFIG_REALTIME_ONLY + MBFP(BLOCK_4X16, aom_masked_sad4x16, 
aom_masked_sub_pixel_variance4x16) + MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4) + MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32) + MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8) + MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64) + MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16) +#endif + +#define SDSFP(BT, SDSF, SDSX4DF) \ + ppi->fn_ptr[BT].sdsf = SDSF; \ + ppi->fn_ptr[BT].sdsx4df = SDSX4DF; + + SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d) + SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d) + SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d) + SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d) + SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d) + + SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d) + SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d) + SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d) + + SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d) + SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d) + SDSFP(BLOCK_16X8, aom_sad_skip_16x8, aom_sad_skip_16x8x4d) + SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d) + SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d) + + SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d) + +#if !CONFIG_REALTIME_ONLY + SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d) + SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d) + SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d) + SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d) + SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d) +#endif +#undef SDSFP + +#if CONFIG_AV1_HIGHBITDEPTH + highbd_set_var_fns(ppi); +#endif + + { + // As cm->mi_params is a part of the frame level context (cpi), it is + // unavailable at this point. mi_params is created as a local temporary + // variable, to be passed into the functions used for allocating tpl + // buffers. The values in this variable are populated according to initial + // width and height of the frame. 
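+    // Sizing example (hypothetical 1920x1080 configuration): mi_cols = 480
+    // and mi_rows = 270 in 4x4 units; BLOCK_16X16 spans 4x4 mi units, so
+    // num_cols = (480 + 3) / 4 = 120 and num_rows = (270 + 3) / 4 = 68,
+    // i.e. 120 * 68 rdmult scaling factors are allocated.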
+ CommonModeInfoParams mi_params; + enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, + BLOCK_4X4); + + const BLOCK_SIZE bsize = BLOCK_16X16; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params.mi_cols + w - 1) / w; + const int num_rows = (mi_params.mi_rows + h - 1) / h; + AOM_CHECK_MEM_ERROR( + &ppi->error, ppi->tpl_sb_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*ppi->tpl_sb_rdmult_scaling_factors))); + +#if CONFIG_INTERNAL_STATS + ppi->b_calculate_blockiness = 1; + ppi->b_calculate_consistency = 1; + + for (int i = 0; i <= STAT_ALL; i++) { + ppi->psnr[0].stat[i] = 0; + ppi->psnr[1].stat[i] = 0; + + ppi->fastssim.stat[i] = 0; + ppi->psnrhvs.stat[i] = 0; + } + + ppi->psnr[0].worst = 100.0; + ppi->psnr[1].worst = 100.0; + ppi->worst_ssim = 100.0; + ppi->worst_ssim_hbd = 100.0; + + ppi->count[0] = 0; + ppi->count[1] = 0; + ppi->total_bytes = 0; + + if (ppi->b_calculate_psnr) { + ppi->total_sq_error[0] = 0; + ppi->total_samples[0] = 0; + ppi->total_sq_error[1] = 0; + ppi->total_samples[1] = 0; + ppi->total_recode_hits = 0; + ppi->summed_quality = 0; + ppi->summed_weights = 0; + ppi->summed_quality_hbd = 0; + ppi->summed_weights_hbd = 0; + } + + ppi->fastssim.worst = 100.0; + ppi->psnrhvs.worst = 100.0; + + if (ppi->b_calculate_blockiness) { + ppi->total_blockiness = 0; + ppi->worst_blockiness = 0.0; + } + + ppi->total_inconsistency = 0; + ppi->worst_consistency = 100.0; + if (ppi->b_calculate_consistency) { + AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars, + aom_malloc(sizeof(*ppi->ssim_vars) * 4 * + mi_params.mi_rows * mi_params.mi_cols)); + } +#endif + } + + ppi->error.setjmp = 0; + return ppi; +} + +AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf, + BufferPool *const pool, COMPRESSOR_STAGE stage, + int lap_lag_in_frames) { + AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP)); + + if (!cpi) return NULL; + + av1_zero(*cpi); + + cpi->ppi = ppi; + + AV1_COMMON *volatile const cm = &cpi->common; + cm->seq_params = &ppi->seq_params; + cm->error = + (struct aom_internal_error_info *)aom_calloc(1, sizeof(*cm->error)); + if (!cm->error) { + aom_free(cpi); + return NULL; + } + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + av1_remove_compressor(cpi); + return NULL; + } + + cm->error->setjmp = 1; + cpi->compressor_stage = stage; + + cpi->do_frame_data_update = true; + + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->free_mi = enc_free_mi; + mi_params->setup_mi = enc_setup_mi; + mi_params->set_mb_mi = + (oxcf->pass == AOM_RC_FIRST_PASS || cpi->compressor_stage == LAP_STAGE) + ? 
stat_stage_set_mb_mi + : enc_set_mb_mi; + + mi_params->mi_alloc_bsize = BLOCK_4X4; + + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); + CHECK_MEM_ERROR( + cm, cm->default_frame_context, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); + memset(cm->fc, 0, sizeof(*cm->fc)); + memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); + + cpi->common.buffer_pool = pool; + + init_config(cpi, oxcf); + if (cpi->compressor_stage == LAP_STAGE) { + cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; + } + + av1_rc_init(&cpi->oxcf, &cpi->rc); + + init_frame_info(&cpi->frame_info, cm); + init_frame_index_set(&cpi->frame_index_set); + + cm->current_frame.frame_number = 0; + cpi->rc.frame_number_encoded = 0; + cpi->rc.prev_frame_is_dropped = 0; + cpi->rc.max_consec_drop = INT_MAX; + cpi->rc.drop_count_consec = 0; + cm->current_frame_id = -1; + cpi->tile_data = NULL; + cpi->last_show_frame_buf = NULL; + realloc_segmentation_maps(cpi); + + cpi->refresh_frame.alt_ref_frame = false; + +#if CONFIG_SPEED_STATS + cpi->tx_search_count = 0; +#endif // CONFIG_SPEED_STATS + + cpi->time_stamps.first_ts_start = INT64_MAX; + +#ifdef OUTPUT_YUV_REC + yuv_rec_file = fopen("rec.yuv", "wb"); +#endif +#ifdef OUTPUT_YUV_DENOISED + yuv_denoised_file = fopen("denoised.yuv", "wb"); +#endif + +#if !CONFIG_REALTIME_ONLY + if (is_stat_consumption_stage(cpi)) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz); + + if (!cpi->ppi->lap_enabled) { + /*Re-initialize to stats buffer, populated by application in the case of + * two pass*/ + cpi->ppi->twopass.stats_buf_ctx->stats_in_start = + oxcf->twopass_stats_in.buf; + cpi->twopass_frame.stats_in = + cpi->ppi->twopass.stats_buf_ctx->stats_in_start; + cpi->ppi->twopass.stats_buf_ctx->stats_in_end = + &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1]; + + // The buffer size is packets - 1 because the last packet is total_stats. + av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, + oxcf->twopass_stats_in.buf, packets - 1); + av1_init_second_pass(cpi); + } else { + av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, NULL, 0); + av1_init_single_pass_lap(cpi); + } + } +#endif + + // The buffer "obmc_buffer" is used in inter frames for fast obmc search. + // Hence, the memory allocation for the same is avoided for allintra encoding + // mode. 
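+  // Note: kf_cfg.key_freq_max == 0 forces every frame to be a key frame
+  // (all-intra operation), which is why the inter-only obmc search buffer is
+  // allocated only when it is non-zero.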
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0) + alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm->error); + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], + (uint32_t *)aom_malloc( + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0]))); + + cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0; + + av1_set_speed_features_framesize_independent(cpi, oxcf->speed); + av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); + + int max_mi_cols = mi_params->mi_cols; + int max_mi_rows = mi_params->mi_rows; + if (oxcf->frm_dim_cfg.forced_max_frame_width) { + max_mi_cols = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_width); + } + if (oxcf->frm_dim_cfg.forced_max_frame_height) { + max_mi_rows = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_height); + } + + const int consec_zero_mv_alloc_size = (max_mi_rows * max_mi_cols) >> 2; + CHECK_MEM_ERROR( + cm, cpi->consec_zero_mv, + aom_calloc(consec_zero_mv_alloc_size, sizeof(*cpi->consec_zero_mv))); + cpi->consec_zero_mv_alloc_size = consec_zero_mv_alloc_size; + + cpi->mb_weber_stats = NULL; + cpi->mb_delta_q = NULL; + cpi->palette_pixel_num = 0; + cpi->scaled_last_source_available = 0; + + { + const BLOCK_SIZE bsize = BLOCK_16X16; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (max_mi_cols + w - 1) / w; + const int num_rows = (max_mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->ssim_rdmult_scaling_factors))); + CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->tpl_rdmult_scaling_factors))); + } + +#if CONFIG_TUNE_VMAF + { + const BLOCK_SIZE bsize = BLOCK_64X64; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params->mi_cols + w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->vmaf_info.rdmult_scaling_factors))); + for (int i = 0; i < MAX_ARF_LAYERS; i++) { + cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0; + cpi->vmaf_info.last_frame_ysse[i] = -1.0; + cpi->vmaf_info.last_frame_vmaf[i] = -1.0; + } + cpi->vmaf_info.original_qindex = -1; + cpi->vmaf_info.vmaf_model = NULL; + } +#endif + +#if CONFIG_TUNE_BUTTERAUGLI + { + const int w = mi_size_wide[butteraugli_rdo_bsize]; + const int h = mi_size_high[butteraugli_rdo_bsize]; + const int num_cols = (mi_params->mi_cols + w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR( + cm, cpi->butteraugli_info.rdmult_scaling_factors, + aom_malloc(num_rows * num_cols * + sizeof(*cpi->butteraugli_info.rdmult_scaling_factors))); + memset(&cpi->butteraugli_info.source, 0, + sizeof(cpi->butteraugli_info.source)); + memset(&cpi->butteraugli_info.resized_source, 0, + sizeof(cpi->butteraugli_info.resized_source)); + cpi->butteraugli_info.recon_set = false; + } +#endif + +#if CONFIG_SALIENCY_MAP + { + CHECK_MEM_ERROR(cm, cpi->saliency_map, + (uint8_t *)aom_calloc(cm->height * cm->width, + sizeof(*cpi->saliency_map))); + // Buffer initialization based on MIN_MIB_SIZE_LOG2 to ensure that + // cpi->sm_scaling_factor buffer is allocated big enough, since we have no + // idea of the actual superblock size we are going to use yet. 
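+    // Worked example, assuming MIN_MIB_SIZE_LOG2 is 4 (a 64x64 superblock of
+    // 16x16 mi units): a 1920x1080 frame has mi_cols = 480 and mi_rows = 270,
+    // so max_sb_cols = (480 + 15) / 16 = 30 and
+    // max_sb_rows = (270 + 15) / 16 = 17.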
+ const int min_mi_w_sb = (1 << MIN_MIB_SIZE_LOG2); + const int min_mi_h_sb = (1 << MIN_MIB_SIZE_LOG2); + const int max_sb_cols = + (cm->mi_params.mi_cols + min_mi_w_sb - 1) / min_mi_w_sb; + const int max_sb_rows = + (cm->mi_params.mi_rows + min_mi_h_sb - 1) / min_mi_h_sb; + CHECK_MEM_ERROR(cm, cpi->sm_scaling_factor, + (double *)aom_calloc(max_sb_rows * max_sb_cols, + sizeof(*cpi->sm_scaling_factor))); + } +#endif + +#if CONFIG_COLLECT_PARTITION_STATS + av1_zero(cpi->partition_stats); +#endif // CONFIG_COLLECT_PARTITION_STATS + + // Initialize the members of DeltaQuantParams with INT_MAX to ensure that + // the quantizer tables are correctly initialized using the default deltaq + // parameters when av1_init_quantizer is called for the first time. + DeltaQuantParams *const prev_deltaq_params = + &cpi->enc_quant_dequant_params.prev_deltaq_params; + prev_deltaq_params->y_dc_delta_q = INT_MAX; + prev_deltaq_params->u_dc_delta_q = INT_MAX; + prev_deltaq_params->v_dc_delta_q = INT_MAX; + prev_deltaq_params->u_ac_delta_q = INT_MAX; + prev_deltaq_params->v_ac_delta_q = INT_MAX; + + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + av1_qm_init(&cm->quant_params, av1_num_planes(cm)); + + av1_loop_filter_init(cm); + cm->superres_scale_denominator = SCALE_NUMERATOR; + cm->superres_upscaled_width = oxcf->frm_dim_cfg.width; + cm->superres_upscaled_height = oxcf->frm_dim_cfg.height; +#if !CONFIG_REALTIME_ONLY + av1_loop_restoration_precal(); +#endif + + cpi->third_pass_ctx = NULL; + if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) { + av1_init_thirdpass_ctx(cm, &cpi->third_pass_ctx, NULL); + } + + cpi->second_pass_log_stream = NULL; + cpi->use_ducky_encode = 0; + + cm->error->setjmp = 0; + return cpi; +} + +#if CONFIG_INTERNAL_STATS +#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T)) + +#define SNPRINT2(H, T, V) \ + snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) +#endif // CONFIG_INTERNAL_STATS + +void av1_remove_primary_compressor(AV1_PRIMARY *ppi) { + if (!ppi) return; +#if !CONFIG_REALTIME_ONLY + av1_tf_info_free(&ppi->tf_info); +#endif // !CONFIG_REALTIME_ONLY + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + aom_free(ppi->level_params.level_info[i]); + } + av1_lookahead_destroy(ppi->lookahead); + + aom_free(ppi->tpl_sb_rdmult_scaling_factors); + ppi->tpl_sb_rdmult_scaling_factors = NULL; + + TplParams *const tpl_data = &ppi->tpl_data; + aom_free(tpl_data->txfm_stats_list); + + for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { + aom_free(tpl_data->tpl_stats_pool[frame]); + aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]); + tpl_data->tpl_stats_pool[frame] = NULL; + } + +#if !CONFIG_REALTIME_ONLY + av1_tpl_dealloc(&tpl_data->tpl_mt_sync); +#endif + + av1_terminate_workers(ppi); + free_thread_data(ppi); + + aom_free(ppi->p_mt_info.tile_thr_data); + ppi->p_mt_info.tile_thr_data = NULL; + aom_free(ppi->p_mt_info.workers); + ppi->p_mt_info.workers = NULL; + ppi->p_mt_info.num_workers = 0; + + aom_free(ppi); +} + +void av1_remove_compressor(AV1_COMP *cpi) { + if (!cpi) return; +#if CONFIG_RATECTRL_LOG + if (cpi->oxcf.pass == 3) { + rc_log_show(&cpi->rc_log); + } +#endif // CONFIG_RATECTRL_LOG + + AV1_COMMON *cm = &cpi->common; + if (cm->current_frame.frame_number > 0) { +#if CONFIG_SPEED_STATS + if (!is_stat_generation_stage(cpi)) { + fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count); + } +#endif // CONFIG_SPEED_STATS + +#if CONFIG_COLLECT_PARTITION_STATS == 2 + if 
(!is_stat_generation_stage(cpi)) { + av1_print_fr_partition_timing_stats(&cpi->partition_stats, + "fr_part_timing_data.csv"); + } +#endif + } + +#if CONFIG_AV1_TEMPORAL_DENOISING + av1_denoiser_free(&(cpi->denoiser)); +#endif + + if (cm->error) { + // Help detect use after free of the error detail string. + memset(cm->error->detail, 'A', sizeof(cm->error->detail) - 1); + cm->error->detail[sizeof(cm->error->detail) - 1] = '\0'; + aom_free(cm->error); + } + aom_free(cpi->td.tctx); + MultiThreadInfo *const mt_info = &cpi->mt_info; +#if CONFIG_MULTITHREAD + pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_; + pthread_cond_t *const enc_row_mt_cond_ = mt_info->enc_row_mt.cond_; + pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_; + pthread_mutex_t *const tpl_error_mutex_ = mt_info->tpl_row_mt.mutex_; + pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_; + if (enc_row_mt_mutex_ != NULL) { + pthread_mutex_destroy(enc_row_mt_mutex_); + aom_free(enc_row_mt_mutex_); + } + if (enc_row_mt_cond_ != NULL) { + pthread_cond_destroy(enc_row_mt_cond_); + aom_free(enc_row_mt_cond_); + } + if (gm_mt_mutex_ != NULL) { + pthread_mutex_destroy(gm_mt_mutex_); + aom_free(gm_mt_mutex_); + } + if (tpl_error_mutex_ != NULL) { + pthread_mutex_destroy(tpl_error_mutex_); + aom_free(tpl_error_mutex_); + } + if (pack_bs_mt_mutex_ != NULL) { + pthread_mutex_destroy(pack_bs_mt_mutex_); + aom_free(pack_bs_mt_mutex_); + } +#endif + av1_row_mt_mem_dealloc(cpi); + + if (mt_info->num_workers > 1) { + av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync); + av1_loop_filter_dealloc(&mt_info->lf_row_sync); + av1_cdef_mt_dealloc(&mt_info->cdef_sync); +#if !CONFIG_REALTIME_ONLY + av1_loop_restoration_dealloc(&mt_info->lr_row_sync); + av1_tf_mt_dealloc(&mt_info->tf_sync); +#endif + } + + av1_free_thirdpass_ctx(cpi->third_pass_ctx); + + av1_close_second_pass_log(cpi); + + dealloc_compressor_data(cpi); + + av1_ext_part_delete(&cpi->ext_part_controller); + + av1_remove_common(cm); + + aom_free(cpi); + +#ifdef OUTPUT_YUV_REC + fclose(yuv_rec_file); +#endif + +#ifdef OUTPUT_YUV_DENOISED + fclose(yuv_denoised_file); +#endif +} + +static void generate_psnr_packet(AV1_COMP *cpi) { + struct aom_codec_cx_pkt pkt; + int i; + PSNR_STATS psnr; +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr, + bit_depth, in_bit_depth); +#else + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); +#endif + + for (i = 0; i < 4; ++i) { + pkt.data.psnr.samples[i] = psnr.samples[i]; + pkt.data.psnr.sse[i] = psnr.sse[i]; + pkt.data.psnr.psnr[i] = psnr.psnr[i]; + } + +#if CONFIG_AV1_HIGHBITDEPTH + if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && + (in_bit_depth < bit_depth)) { + for (i = 0; i < 4; ++i) { + pkt.data.psnr.samples_hbd[i] = psnr.samples_hbd[i]; + pkt.data.psnr.sse_hbd[i] = psnr.sse_hbd[i]; + pkt.data.psnr.psnr_hbd[i] = psnr.psnr_hbd[i]; + } + } +#endif + + pkt.kind = AOM_CODEC_PSNR_PKT; + aom_codec_pkt_list_add(cpi->ppi->output_pkt_list, &pkt); +} + +int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) { + if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1; + + *ext_ref_frame_flags = ref_frame_flags; + return 0; +} + +int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + 
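+  // Note: av1_num_planes() is 1 for monochrome sequences and 3 otherwise, so
+  // only the planes that actually exist are copied below.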
YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); + if (cfg) { + aom_yv12_copy_frame(cfg, sd, num_planes); + return 0; + } else { + return -1; + } +} + +int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx); + if (cfg) { + aom_yv12_copy_frame(sd, cfg, num_planes); + return 0; + } else { + return -1; + } +} + +#ifdef OUTPUT_YUV_REC +void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { + uint8_t *src = s->y_buffer; + int h = cm->height; + if (yuv_rec_file == NULL) return; + if (s->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer); + + do { + fwrite(src16, s->y_width, 2, yuv_rec_file); + src16 += s->y_stride; + } while (--h); + + src16 = CONVERT_TO_SHORTPTR(s->u_buffer); + h = s->uv_height; + + do { + fwrite(src16, s->uv_width, 2, yuv_rec_file); + src16 += s->uv_stride; + } while (--h); + + src16 = CONVERT_TO_SHORTPTR(s->v_buffer); + h = s->uv_height; + + do { + fwrite(src16, s->uv_width, 2, yuv_rec_file); + src16 += s->uv_stride; + } while (--h); + + fflush(yuv_rec_file); + return; + } + + do { + fwrite(src, s->y_width, 1, yuv_rec_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + fflush(yuv_rec_file); +} +#endif // OUTPUT_YUV_REC + +void av1_set_mv_search_params(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + const int max_mv_def = AOMMAX(cm->width, cm->height); + + // Default based on max resolution. + mv_search_params->mv_step_param = av1_init_search_range(max_mv_def); + + if (cpi->sf.mv_sf.auto_mv_step_size) { + if (frame_is_intra_only(cm)) { + // Initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame. + mv_search_params->max_mv_magnitude = max_mv_def; + } else { + // Use adaptive mv steps based on previous frame stats for show frames and + // internal arfs. + FRAME_UPDATE_TYPE cur_update_type = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; + int use_auto_mv_step = + (cm->show_frame || cur_update_type == INTNL_ARF_UPDATE) && + mv_search_params->max_mv_magnitude != -1 && + cpi->sf.mv_sf.auto_mv_step_size >= 2; + if (use_auto_mv_step) { + // Allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution. + mv_search_params->mv_step_param = av1_init_search_range( + AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude)); + } + // Reset max_mv_magnitude based on update flag. + if (cpi->do_frame_data_update) mv_search_params->max_mv_magnitude = -1; + } + } +} + +void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) { + const AV1_COMMON *const cm = &cpi->common; + const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + if (cm->seq_params->force_screen_content_tools != 2) { + features->allow_screen_content_tools = features->allow_intrabc = + cm->seq_params->force_screen_content_tools; + return; + } + + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + features->allow_screen_content_tools = 1; + features->allow_intrabc = cpi->oxcf.mode == REALTIME ? 
0 : 1; + cpi->is_screen_content_type = 1; + cpi->use_screen_content_tools = 1; + return; + } + + if (cpi->oxcf.mode == REALTIME) { + features->allow_screen_content_tools = features->allow_intrabc = 0; + return; + } + + // Screen content tools are not evaluated in non-RD encoding mode unless + // content type is not set explicitly, i.e., when + // cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN, use_nonrd_pick_mode = 1 + // and hybrid_intra_pickmode = 0. Hence, screen content detection is + // disabled. + if (cpi->sf.rt_sf.use_nonrd_pick_mode && + !cpi->sf.rt_sf.hybrid_intra_pickmode) { + features->allow_screen_content_tools = features->allow_intrabc = 0; + return; + } + + // Estimate if the source frame is screen content, based on the portion of + // blocks that have few luma colors. + const uint8_t *src = cpi->unfiltered_source->y_buffer; + assert(src != NULL); + const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; + const int stride = cpi->unfiltered_source->y_stride; + const int width = cpi->unfiltered_source->y_width; + const int height = cpi->unfiltered_source->y_height; + const int64_t area = (int64_t)width * height; + const int bd = cm->seq_params->bit_depth; + const int blk_w = 16; + const int blk_h = 16; + // These threshold values are selected experimentally. + const int color_thresh = 4; + const unsigned int var_thresh = 0; + // Counts of blocks with no more than color_thresh colors. + int64_t counts_1 = 0; + // Counts of blocks with no more than color_thresh colors and variance larger + // than var_thresh. + int64_t counts_2 = 0; + + for (int r = 0; r + blk_h <= height; r += blk_h) { + for (int c = 0; c + blk_w <= width; c += blk_w) { + int count_buf[1 << 8]; // Maximum (1 << 8) bins for hbd path. + const uint8_t *const this_src = src + r * stride + c; + int n_colors; + if (use_hbd) + av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL, + count_buf, &n_colors, NULL); + else + av1_count_colors(this_src, stride, blk_w, blk_h, count_buf, &n_colors); + if (n_colors > 1 && n_colors <= color_thresh) { + ++counts_1; + struct buf_2d buf; + buf.stride = stride; + buf.buf = (uint8_t *)this_src; + const unsigned int var = av1_get_perpixel_variance( + cpi, xd, &buf, BLOCK_16X16, AOM_PLANE_Y, use_hbd); + if (var > var_thresh) ++counts_2; + } + } + } + + // The threshold values are selected experimentally. + features->allow_screen_content_tools = counts_1 * blk_h * blk_w * 10 > area; + // IntraBC would force loop filters off, so we use more strict rules that also + // requires that the block has high variance. + features->allow_intrabc = features->allow_screen_content_tools && + counts_2 * blk_h * blk_w * 12 > area; + cpi->use_screen_content_tools = features->allow_screen_content_tools; + cpi->is_screen_content_type = + features->allow_intrabc || (counts_1 * blk_h * blk_w * 10 > area * 4 && + counts_2 * blk_h * blk_w * 30 > area); +} + +static void init_motion_estimation(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + const int aligned_width = (cm->width + 7) & ~7; + const int y_stride = + aom_calc_y_stride(aligned_width, cpi->oxcf.border_in_pixels); + const int y_stride_src = ((cpi->oxcf.frm_dim_cfg.width != cm->width || + cpi->oxcf.frm_dim_cfg.height != cm->height) || + av1_superres_scaled(cm)) + ? y_stride + : cpi->ppi->lookahead->buf->img.y_stride; + int fpf_y_stride = + cm->cur_frame != NULL ? 
cm->cur_frame->buf.y_stride : y_stride; + + // Update if search_site_cfg is uninitialized or the current frame has a new + // stride + const int should_update = + !mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride || + !mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][DIAMOND].stride || + (y_stride != + mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride); + + if (!should_update) { + return; + } + + // Initialization of search_site_cfg for NUM_DISTINCT_SEARCH_METHODS. + for (SEARCH_METHODS i = DIAMOND; i < NUM_DISTINCT_SEARCH_METHODS; i++) { + const int level = ((i == NSTEP_8PT) || (i == CLAMPED_DIAMOND)) ? 1 : 0; + av1_init_motion_compensation[i]( + &mv_search_params->search_site_cfg[SS_CFG_SRC][i], y_stride, level); + av1_init_motion_compensation[i]( + &mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][i], y_stride_src, + level); + } + + // First pass search site config initialization. + av1_init_motion_fpf(&mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND], + fpf_y_stride); + for (SEARCH_METHODS i = NSTEP; i < NUM_DISTINCT_SEARCH_METHODS; i++) { + memcpy(&mv_search_params->search_site_cfg[SS_CFG_FPF][i], + &mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND], + sizeof(search_site_config)); + } +} + +static void init_ref_frame_bufs(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int i; + if (cm->cur_frame) { + cm->cur_frame->ref_count--; + cm->cur_frame = NULL; + } + for (i = 0; i < REF_FRAMES; ++i) { + if (cm->ref_frame_map[i]) { + cm->ref_frame_map[i]->ref_count--; + cm->ref_frame_map[i] = NULL; + } + } +#ifndef NDEBUG + BufferPool *const pool = cm->buffer_pool; + for (i = 0; i < pool->num_frame_bufs; ++i) { + assert(pool->frame_bufs[i].ref_count == 0); + } +#endif +} + +// TODO(chengchen): consider renaming this function as it is necessary +// for the encoder to setup critical parameters, and it does not +// deal with initial width any longer. +aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + + if (!cpi->frame_size_related_setup_done || + seq_params->use_highbitdepth != use_highbitdepth || + seq_params->subsampling_x != subsampling_x || + seq_params->subsampling_y != subsampling_y) { + seq_params->subsampling_x = subsampling_x; + seq_params->subsampling_y = subsampling_y; + seq_params->use_highbitdepth = use_highbitdepth; + + av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); + av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed); + + if (!is_stat_generation_stage(cpi)) { +#if !CONFIG_REALTIME_ONLY + if (!av1_tf_info_alloc(&cpi->ppi->tf_info, cpi)) + return AOM_CODEC_MEM_ERROR; +#endif // !CONFIG_REALTIME_ONLY + } + init_ref_frame_bufs(cpi); + + init_motion_estimation(cpi); // TODO(agrange) This can be removed. 
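+ // cpi->initial_mbs is recorded here so that stats normalization can keep
+ // using the configured size's MB count when dynamic resizing is active
+ // (see calculate_frame_avg_haar_energy()).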
+ + cpi->initial_mbs = cm->mi_params.MBs; + cpi->frame_size_related_setup_done = true; + } + return AOM_CODEC_OK; +} + +#if CONFIG_AV1_TEMPORAL_DENOISING +static void setup_denoiser_buffer(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + if (cpi->oxcf.noise_sensitivity > 0 && + !cpi->denoiser.frame_buffer_initialized) { + if (av1_denoiser_alloc( + cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc, + cpi->oxcf.noise_sensitivity, cm->width, cm->height, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); + } +} +#endif + +// Returns 1 if the assigned width or height was <= 0. +static int set_size_literal(AV1_COMP *cpi, int width, int height) { + AV1_COMMON *cm = &cpi->common; + aom_codec_err_t err = av1_check_initial_width( + cpi, cm->seq_params->use_highbitdepth, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y); + if (err != AOM_CODEC_OK) { + aom_internal_error(cm->error, err, "av1_check_initial_width() failed"); + } + + if (width <= 0 || height <= 0) return 1; + + cm->width = width; + cm->height = height; + +#if CONFIG_AV1_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif + + if (cm->width > cpi->data_alloc_width || + cm->height > cpi->data_alloc_height) { + av1_free_context_buffers(cm); + av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); + av1_free_sms_tree(&cpi->td); + av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm)); + cpi->td.firstpass_ctx = NULL; + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; + cpi->frame_size_related_setup_done = false; + } + alloc_mb_mode_info_buffers(cpi); + av1_update_frame_size(cpi); + + return 0; +} + +void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int ref_frame; + + if (width != cm->width || height != cm->height) { + // There has been a change in the encoded frame size + set_size_literal(cpi, width, height); + // Recalculate 'all_lossless' in case super-resolution was (un)selected. + cm->features.all_lossless = + cm->features.coded_lossless && !av1_superres_scaled(cm); + + av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); +#if CONFIG_AV1_TEMPORAL_DENOISING + // Reset the denoiser on the resized frame. 
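+ // The denoiser buffers are allocated for the coded frame size, so a resize
+ // requires freeing them and re-allocating via setup_denoiser_buffer().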
+ if (cpi->oxcf.noise_sensitivity > 0) { + av1_denoiser_free(&(cpi->denoiser)); + setup_denoiser_buffer(cpi); + } +#endif + } + if (is_stat_consumption_stage(cpi)) { + av1_set_target_rate(cpi, cm->width, cm->height); + } + + alloc_frame_mvs(cm, cm->cur_frame); + + // Allocate above context buffers + CommonContexts *const above_contexts = &cm->above_contexts; + if (above_contexts->num_planes < av1_num_planes(cm) || + above_contexts->num_mi_cols < cm->mi_params.mi_cols || + above_contexts->num_tile_rows < cm->tiles.rows) { + av1_free_above_context_buffers(above_contexts); + if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, + cm->mi_params.mi_cols, + av1_num_planes(cm))) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + + AV1EncoderConfig *oxcf = &cpi->oxcf; + oxcf->border_in_pixels = av1_get_enc_border_size( + av1_is_resize_needed(oxcf), oxcf->kf_cfg.key_freq_max == 0, + cm->seq_params->sb_size); + + // Reset the frame pointers to the current frame size. + if (aom_realloc_frame_buffer( + &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->image_pyramid_levels, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + if (!is_stat_generation_stage(cpi)) av1_init_cdef_worker(cpi); + +#if !CONFIG_REALTIME_ONLY + if (is_restoration_used(cm)) { + for (int i = 0; i < num_planes; ++i) + cm->rst_info[i].frame_restoration_type = RESTORE_NONE; + + const bool is_sgr_enabled = !cpi->sf.lpf_sf.disable_sgr_filter; + av1_alloc_restoration_buffers(cm, is_sgr_enabled); + // Store the allocated restoration buffers in MT object. + if (cpi->ppi->p_mt_info.num_workers > 1) { + av1_init_lr_mt_buffers(cpi); + } + } +#endif + + init_motion_estimation(cpi); + + int has_valid_ref_frame = 0; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame); + av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width, + buf->buf.y_crop_height, cm->width, + cm->height); + has_valid_ref_frame |= av1_is_valid_scale(sf); + if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes); + } + } + if (!frame_is_intra_only(cm) && !has_valid_ref_frame) { + aom_internal_error( + cm->error, AOM_CODEC_CORRUPT_FRAME, + "Can't find at least one reference frame with valid size"); + } + + av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height, + cm->width, cm->height); + + set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); +} + +static INLINE int extend_borders_mt(const AV1_COMP *cpi, + MULTI_THREADED_MODULES stage, int plane) { + const AV1_COMMON *const cm = &cpi->common; + if (cpi->mt_info.num_mod_workers[stage] < 2) return 0; + switch (stage) { + // TODO(deepa.kg@ittiam.com): When cdef and loop-restoration are disabled, + // multi-thread frame border extension along with loop filter frame. + // As loop-filtering of a superblock row modifies the pixels of the + // above superblock row, border extension requires that loop filtering + // of the current and above superblock row is complete. 
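+ // Until that is resolved, the MOD_LPF case below always returns 0 and
+ // border extension is handled separately in extend_frame_borders().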
+ case MOD_LPF: return 0; + case MOD_CDEF: + return is_cdef_used(cm) && !cpi->ppi->rtc_ref.non_reference_frame && + !is_restoration_used(cm) && !av1_superres_scaled(cm); + case MOD_LR: + return is_restoration_used(cm) && + (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE); + default: assert(0); + } + return 0; +} + +/*!\brief Select and apply cdef filters and switchable restoration filters + * + * \ingroup high_level_algo + */ +static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm, + MACROBLOCKD *xd, int use_restoration, + int use_cdef, + unsigned int skip_apply_postproc_filters) { +#if !CONFIG_REALTIME_ONLY + if (use_restoration) + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0); +#else + (void)use_restoration; +#endif + + if (use_cdef) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, cdef_time); +#endif + const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF]; + // Find CDEF parameters + av1_cdef_search(cpi); + + // Apply the filter + if ((skip_apply_postproc_filters & SKIP_APPLY_CDEF) == 0) { + assert(!cpi->ppi->rtc_ref.non_reference_frame); + if (num_workers > 1) { + // Extension of frame borders is multi-threaded along with cdef. + const int do_extend_border = + extend_borders_mt(cpi, MOD_CDEF, /* plane */ 0); + av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker, + cpi->mt_info.workers, &cpi->mt_info.cdef_sync, + num_workers, av1_cdef_init_fb_row_mt, + do_extend_border); + } else { + av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, cdef_time); +#endif + } + + const int use_superres = av1_superres_scaled(cm); + if (use_superres) { + if ((skip_apply_postproc_filters & SKIP_APPLY_SUPERRES) == 0) { + av1_superres_post_encode(cpi); + } + } + +#if !CONFIG_REALTIME_ONLY +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loop_restoration_time); +#endif + if (use_restoration) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int num_workers = mt_info->num_mod_workers[MOD_LR]; + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1); + av1_pick_filter_restoration(cpi->source, cpi); + if ((skip_apply_postproc_filters & SKIP_APPLY_RESTORATION) == 0 && + (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) { + if (num_workers > 1) { + // Extension of frame borders is multi-threaded along with loop + // restoration filter. 
+ const int do_extend_border = 1; + av1_loop_restoration_filter_frame_mt( + &cm->cur_frame->buf, cm, 0, mt_info->workers, num_workers, + &mt_info->lr_row_sync, &cpi->lr_ctxt, do_extend_border); + } else { + av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0, + &cpi->lr_ctxt); + } + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loop_restoration_time); +#endif +#endif // !CONFIG_REALTIME_ONLY +} + +static void extend_frame_borders(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + // TODO(debargha): Fix mv search range on encoder side + for (int plane = 0; plane < av1_num_planes(cm); ++plane) { + const bool extend_border_done = extend_borders_mt(cpi, MOD_CDEF, plane) || + extend_borders_mt(cpi, MOD_LR, plane); + if (!extend_border_done) { + const YV12_BUFFER_CONFIG *const ybf = &cm->cur_frame->buf; + aom_extend_frame_borders_plane_row(ybf, plane, 0, + ybf->crop_heights[plane > 0]); + } + } +} + +/*!\brief Select and apply deblocking filters, cdef filters, and restoration + * filters. + * + * \ingroup high_level_algo + */ +static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int num_workers = mt_info->num_mod_workers[MOD_LPF]; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + cpi->td.mb.rdmult = cpi->rd.RDMULT; + + assert(IMPLIES(is_lossless_requested(&cpi->oxcf.rc_cfg), + cm->features.coded_lossless && cm->features.all_lossless)); + + const int use_loopfilter = + is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc; + const int use_cdef = is_cdef_used(cm); + const int use_superres = av1_superres_scaled(cm); + const int use_restoration = is_restoration_used(cm); + + const unsigned int skip_apply_postproc_filters = + derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef, + use_superres, use_restoration); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loop_filter_time); +#endif + if (use_loopfilter) { + av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick); + struct loopfilter *lf = &cm->lf; + if ((lf->filter_level[0] || lf->filter_level[1]) && + (skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0) { + assert(!cpi->ppi->rtc_ref.non_reference_frame); + // lpf_opt_level = 1 : Enables dual/quad loop-filtering. + // lpf_opt_level is set to 1 if transform size search depth in inter + // blocks is limited to one as quad loop filtering assumes that all the + // transform blocks within a 16x8/8x16/16x16 prediction block are of the + // same size. lpf_opt_level = 2 : Filters both chroma planes together, in + // addition to enabling dual/quad loop-filtering. This is enabled when lpf + // pick method is LPF_PICK_FROM_Q as u and v plane filter levels are + // equal. 
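+ // lpf_opt_level = 0 : No dual/quad or joint-chroma optimization; the default
+ // per-plane filtering path is used.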
+ int lpf_opt_level = get_lpf_opt_level(&cpi->sf); + av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0, + mt_info->workers, num_workers, + &mt_info->lf_row_sync, lpf_opt_level); + } + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loop_filter_time); +#endif + + cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef, + skip_apply_postproc_filters); +} + +static void update_motion_stat(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; + const int avg_cnt_zeromv = + 100 * cpi->rc.cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols); + if (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + rc->avg_frame_low_motion = + (rc->avg_frame_low_motion == 0) + ? avg_cnt_zeromv + : (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4; + // For SVC: set avg_frame_low_motion (only computed on top spatial layer) + // to all lower spatial layers. + if (cpi->ppi->use_svc && + svc->spatial_layer_id == svc->number_spatial_layers - 1) { + for (int i = 0; i < svc->number_spatial_layers - 1; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_frame_low_motion = rc->avg_frame_low_motion; + } + } + } +} + +/*!\brief Encode a frame without the recode loop, usually used in one-pass + * encoding and realtime coding. + * + * \ingroup high_level_algo + * + * \param[in] cpi Top-level encoder structure + * + * \return Returns a value to indicate if the encoding is done successfully. + * \retval #AOM_CODEC_OK + * \retval #AOM_CODEC_ERROR + */ +static int encode_without_recode(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg; + SVC *const svc = &cpi->svc; + const int resize_pending = is_frame_resize_pending(cpi); + int top_index = 0, bottom_index = 0, q = 0; + YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source; + InterpFilter filter_scaler = + cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id] + : EIGHTTAP_SMOOTH; + int phase_scaler = cpi->ppi->use_svc + ? svc->downsample_filter_phase[svc->spatial_layer_id] + : 0; + + set_size_independent_vars(cpi); + av1_setup_frame_size(cpi); + cm->prev_frame = get_primary_ref_frame_buf(cm); + av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + av1_set_mv_search_params(cpi); + + if (cm->current_frame.frame_number == 0 && + (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) && + cpi->svc.temporal_layer_id == 0) { + const SequenceHeader *seq_params = cm->seq_params; + if (aom_alloc_frame_buffer( + &cpi->svc.source_last_TL0, cpi->oxcf.frm_dim_cfg.width, + cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffer for source_last_TL0"); + } + } + + if (!cpi->ppi->use_svc) { + phase_scaler = 8; + // 2:1 scaling. + if ((cm->width << 1) == unscaled->y_crop_width && + (cm->height << 1) == unscaled->y_crop_height) { + filter_scaler = BILINEAR; + // For lower resolutions use eighttap_smooth. 
+ if (cm->width * cm->height <= 320 * 180) filter_scaler = EIGHTTAP_SMOOTH; + } else if ((cm->width << 2) == unscaled->y_crop_width && + (cm->height << 2) == unscaled->y_crop_height) { + // 4:1 scaling. + filter_scaler = EIGHTTAP_SMOOTH; + } else if ((cm->width << 2) == 3 * unscaled->y_crop_width && + (cm->height << 2) == 3 * unscaled->y_crop_height) { + // 4:3 scaling. + filter_scaler = EIGHTTAP_REGULAR; + } + } + + allocate_gradient_info_for_hog(cpi); + + allocate_src_var_of_4x4_sub_block_buf(cpi); + + const SPEED_FEATURES *sf = &cpi->sf; + if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) + variance_partition_alloc(cpi); + + if (cm->current_frame.frame_type == KEY_FRAME || + ((sf->inter_sf.extra_prune_warped && cpi->refresh_frame.golden_frame))) + copy_frame_prob_info(cpi); + +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame: \n"); +#endif + +#if CONFIG_TUNE_BUTTERAUGLI + if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { + av1_setup_butteraugli_rdmult(cpi); + } +#endif + + cpi->source = av1_realloc_and_scale_if_required( + cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true, + false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + if (frame_is_intra_only(cm) || resize_pending != 0) { + const int current_size = + (cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2; + if (cpi->consec_zero_mv && + (cpi->consec_zero_mv_alloc_size < current_size)) { + aom_free(cpi->consec_zero_mv); + cpi->consec_zero_mv_alloc_size = 0; + CHECK_MEM_ERROR(cm, cpi->consec_zero_mv, + aom_malloc(current_size * sizeof(*cpi->consec_zero_mv))); + cpi->consec_zero_mv_alloc_size = current_size; + } + assert(cpi->consec_zero_mv != NULL); + memset(cpi->consec_zero_mv, 0, current_size * sizeof(*cpi->consec_zero_mv)); + } + + if (cpi->scaled_last_source_available) { + cpi->last_source = &cpi->scaled_last_source; + cpi->scaled_last_source_available = 0; + } else if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler, + phase_scaler, true, false, cpi->oxcf.border_in_pixels, + cpi->image_pyramid_levels); + } + + if (cpi->sf.rt_sf.use_temporal_noise_estimate) { + av1_update_noise_estimate(cpi); + } + +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc) + av1_denoiser_reset_on_first_frame(cpi); +#endif + + // For 1 spatial layer encoding: if the (non-LAST) reference has different + // resolution from the source then disable that reference. This is to avoid + // significant increase in encode time from scaling the references in + // av1_scale_references. Note GOLDEN is forced to update on the (first/tigger) + // resized frame and ALTREF will be refreshed ~4 frames later, so both + // references become available again after few frames. + // For superres: don't disable golden reference. 
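+ // A mismatched reference is dropped by clearing its bit in
+ // cpi->ref_frame_flags below.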
+ if (svc->number_spatial_layers == 1) { + if (!cpi->oxcf.superres_cfg.enable_superres) { + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) { + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) + cpi->ref_frame_flags ^= AOM_GOLD_FLAG; + } + } + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) { + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, ALTREF_FRAME); + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) + cpi->ref_frame_flags ^= AOM_ALT_FLAG; + } + } + + int scale_references = 0; +#if CONFIG_FPMT_TEST + scale_references = + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0; +#endif // CONFIG_FPMT_TEST + if (scale_references || + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + if (!frame_is_intra_only(cm)) { + av1_scale_references(cpi, filter_scaler, phase_scaler, 1); + } + } + + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, + q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq); + av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + av1_set_variance_partition_thresholds(cpi, q, 0); + av1_setup_frame(cpi); + + // Check if this high_source_sad (scene/slide change) frame should be + // encoded at high/max QP, and if so, set the q and adjust some rate + // control parameters. + if (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ && + cpi->rc.high_source_sad) { + if (av1_encodedframe_overshoot_cbr(cpi, &q)) { + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, + q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq); + av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + av1_set_variance_partition_thresholds(cpi, q, 0); + if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cm->features.primary_ref_frame == PRIMARY_REF_NONE) + av1_setup_frame(cpi); + } + } + + if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) { + suppress_active_map(cpi); + av1_cyclic_refresh_setup(cpi); + } + av1_apply_active_map(cpi); + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + // This is for rtc temporal filtering case. + if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf && + cm->current_frame.frame_type != KEY_FRAME) { + const SequenceHeader *seq_params = cm->seq_params; + + if (cpi->orig_source.buffer_alloc_sz == 0 || + cpi->last_source->y_width != cpi->source->y_width || + cpi->last_source->y_height != cpi->source->y_height) { + // Allocate a source buffer to store the true source for psnr calculation. 
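+ // cpi->orig_source keeps an untouched copy of the input so that PSNR can be
+ // computed against the true source after the rtc temporal filter has
+ // modified cpi->source.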
+ if (aom_alloc_frame_buffer( + &cpi->orig_source, cpi->oxcf.frm_dim_cfg.width, + cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate scaled buffer"); + } + + aom_yv12_copy_y(cpi->source, &cpi->orig_source); + aom_yv12_copy_u(cpi->source, &cpi->orig_source); + aom_yv12_copy_v(cpi->source, &cpi->orig_source); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_encode_frame_time); +#endif + + // Set the motion vector precision based on mv stats from the last coded + // frame. + if (!frame_is_intra_only(cm)) av1_pick_and_set_high_precision_mv(cpi, q); + + // transform / motion compensation build reconstruction frame + av1_encode_frame(cpi); + + if (!cpi->rc.rtc_external_ratectrl && !frame_is_intra_only(cm)) + update_motion_stat(cpi); + + // Adjust the refresh of the golden (longer-term) reference based on QP + // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode. + if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR && + cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 && + svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl && + sf->rt_sf.gf_refresh_based_on_qp) + av1_adjust_gf_refresh_qp_one_pass_rt(cpi); + + // For non-svc: if scaling is required, copy scaled_source + // into scaled_last_source. + if (cm->current_frame.frame_number > 1 && !cpi->ppi->use_svc && + cpi->scaled_source.y_buffer != NULL && + cpi->scaled_last_source.y_buffer != NULL && + cpi->scaled_source.y_crop_width == cpi->scaled_last_source.y_crop_width && + cpi->scaled_source.y_crop_height == + cpi->scaled_last_source.y_crop_height && + (cm->width != cpi->unscaled_source->y_crop_width || + cm->height != cpi->unscaled_source->y_crop_height)) { + cpi->scaled_last_source_available = 1; + aom_yv12_copy_y(&cpi->scaled_source, &cpi->scaled_last_source); + aom_yv12_copy_u(&cpi->scaled_source, &cpi->scaled_last_source); + aom_yv12_copy_v(&cpi->scaled_source, &cpi->scaled_last_source); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_encode_frame_time); +#endif +#if CONFIG_INTERNAL_STATS + ++cpi->frame_recode_hits; +#endif + + return AOM_CODEC_OK; +} + +#if !CONFIG_REALTIME_ONLY + +/*!\brief Recode loop for encoding one frame. the purpose of encoding one frame + * for multiple times can be approaching a target bitrate or adjusting the usage + * of global motions. + * + * \ingroup high_level_algo + * + * \param[in] cpi Top-level encoder structure + * \param[in] size Bitstream size + * \param[in] dest Bitstream output + * + * \return Returns a value to indicate if the encoding is done successfully. + * \retval #AOM_CODEC_OK + * \retval -1 + * \retval #AOM_CODEC_ERROR + */ +static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + GlobalMotionInfo *const gm_info = &cpi->gm_info; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const QuantizationCfg *const q_cfg = &oxcf->q_cfg; + const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE); + // Must allow recode if minimum compression ratio is set. 
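+ // Meeting rc_cfg.min_cr relies on re-encoding at a higher q when a frame
+ // compresses too little, hence the implication asserted below.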
+ assert(IMPLIES(oxcf->rc_cfg.min_cr > 0, allow_recode)); + + set_size_independent_vars(cpi); + if (is_stat_consumption_stage_twopass(cpi) && + cpi->sf.interp_sf.adaptive_interp_filter_search) + cpi->interp_search_flags.interp_filter_search_mask = + av1_setup_interp_filter_search_mask(cpi); + + av1_setup_frame_size(cpi); + + if (av1_superres_in_recode_allowed(cpi) && + cpi->superres_mode != AOM_SUPERRES_NONE && + cm->superres_scale_denominator == SCALE_NUMERATOR) { + // Superres mode is currently enabled, but the denominator selected will + // disable superres. So no need to continue, as we will go through another + // recode loop for full-resolution after this anyway. + return -1; + } + + int top_index = 0, bottom_index = 0; + int q = 0, q_low = 0, q_high = 0; + av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + q_low = bottom_index; + q_high = top_index; + + av1_set_mv_search_params(cpi); + + allocate_gradient_info_for_hog(cpi); + + allocate_src_var_of_4x4_sub_block_buf(cpi); + + if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) + variance_partition_alloc(cpi); + + if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi); + +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame: \n"); +#endif + +#if !CONFIG_RD_COMMAND + // Determine whether to use screen content tools using two fast encoding. + if (!cpi->sf.hl_sf.disable_extra_sc_testing && !cpi->use_ducky_encode) + av1_determine_sc_tools_with_encoding(cpi, q); +#endif // !CONFIG_RD_COMMAND + +#if CONFIG_TUNE_VMAF + if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { + av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source); + } +#endif + +#if CONFIG_TUNE_BUTTERAUGLI + cpi->butteraugli_info.recon_set = false; + int original_q = 0; +#endif + + cpi->num_frame_recode = 0; + + // Loop variables + int loop = 0; + int loop_count = 0; + int overshoot_seen = 0; + int undershoot_seen = 0; + int low_cr_seen = 0; + int last_loop_allow_hp = 0; + + do { + loop = 0; + int do_mv_stats_collection = 1; + + // if frame was scaled calculate global_motion_search again if already + // done + if (loop_count > 0 && cpi->source && gm_info->search_done) { + if (cpi->source->y_crop_width != cm->width || + cpi->source->y_crop_height != cm->height) { + gm_info->search_done = 0; + } + } + cpi->source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0, + false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + +#if CONFIG_TUNE_BUTTERAUGLI + if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { + if (loop_count == 0) { + original_q = q; + // TODO(sdeng): different q here does not make big difference. Use a + // faster pass instead. + q = 96; + av1_setup_butteraugli_source(cpi); + } else { + q = original_q; + } + } +#endif + + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels, + cpi->image_pyramid_levels); + } + + int scale_references = 0; +#if CONFIG_FPMT_TEST + scale_references = + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 
1 : 0; +#endif // CONFIG_FPMT_TEST + if (scale_references || + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + if (!frame_is_intra_only(cm)) { + if (loop_count > 0) { + release_scaled_references(cpi); + } + av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0); + } + } + +#if CONFIG_TUNE_VMAF + if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { + cpi->vmaf_info.original_qindex = q; + q = av1_get_vmaf_base_qindex(cpi, q); + } +#endif + +#if CONFIG_RD_COMMAND + RD_COMMAND *rd_command = &cpi->rd_command; + RD_OPTION option = rd_command->option_ls[rd_command->frame_index]; + if (option == RD_OPTION_SET_Q || option == RD_OPTION_SET_Q_RDMULT) { + q = rd_command->q_index_ls[rd_command->frame_index]; + } +#endif // CONFIG_RD_COMMAND + +#if CONFIG_BITRATE_ACCURACY +#if CONFIG_THREE_PASS + if (oxcf->pass == AOM_RC_THIRD_PASS && cpi->vbr_rc_info.ready == 1) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); + if (frame_coding_idx < cpi->vbr_rc_info.total_frame_count) { + q = cpi->vbr_rc_info.q_index_list[frame_coding_idx]; + } else { + // TODO(angiebird): Investigate why sometimes there is an extra frame + // after the last GOP. + q = cpi->vbr_rc_info.base_q_index; + } + } +#else + if (cpi->vbr_rc_info.q_index_list_ready) { + q = cpi->vbr_rc_info.q_index_list[cpi->gf_frame_index]; + } +#endif // CONFIG_THREE_PASS +#endif // CONFIG_BITRATE_ACCURACY + +#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + // TODO(angiebird): Move this into a function. + if (oxcf->pass == AOM_RC_THIRD_PASS) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); + double qstep_ratio = cpi->vbr_rc_info.qstep_ratio_list[frame_coding_idx]; + FRAME_UPDATE_TYPE update_type = + cpi->vbr_rc_info.update_type_list[frame_coding_idx]; + rc_log_frame_encode_param(&cpi->rc_log, frame_coding_idx, qstep_ratio, q, + update_type); + } +#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + + if (cpi->use_ducky_encode) { + const DuckyEncodeFrameInfo *frame_info = + &cpi->ducky_encode_info.frame_info; + if (frame_info->qp_mode == DUCKY_ENCODE_FRAME_MODE_QINDEX) { + q = frame_info->q_index; + cm->delta_q_info.delta_q_present_flag = frame_info->delta_q_enabled; + } + } + + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, + q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq); + av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + + av1_set_variance_partition_thresholds(cpi, q, 0); + + // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n", + // cm->current_frame.frame_number, cm->show_frame, q, + // cm->current_frame.frame_type, cm->superres_scale_denominator); + + if (loop_count == 0) { + av1_setup_frame(cpi); + } else if (get_primary_ref_frame_buf(cm) == NULL) { + // Base q-index may have changed, so we need to assign proper default coef + // probs before every iteration. 
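+ // Default CDFs are selected by q-index, and with no primary reference frame
+ // there is no context to inherit, so both are re-seeded here.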
+ av1_default_coef_probs(cm); + av1_setup_frame_contexts(cm); + } + + if (q_cfg->aq_mode == VARIANCE_AQ) { + av1_vaq_frame_setup(cpi); + } else if (q_cfg->aq_mode == COMPLEXITY_AQ) { + av1_setup_in_frame_q_adj(cpi); + } + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_encode_frame_time); +#endif + // Set the motion vector precision based on mv stats from the last coded + // frame. + if (!frame_is_intra_only(cm)) { + av1_pick_and_set_high_precision_mv(cpi, q); + + // If the precision has changed during different iteration of the loop, + // then we need to reset the global motion vectors + if (loop_count > 0 && + cm->features.allow_high_precision_mv != last_loop_allow_hp) { + gm_info->search_done = 0; + } + last_loop_allow_hp = cm->features.allow_high_precision_mv; + } + + // transform / motion compensation build reconstruction frame + av1_encode_frame(cpi); + + // Disable mv_stats collection for parallel frames based on update flag. + if (!cpi->do_frame_data_update) do_mv_stats_collection = 0; + + // Reset the mv_stats in case we are interrupted by an intraframe or an + // overlay frame. + if (cpi->mv_stats.valid && do_mv_stats_collection) av1_zero(cpi->mv_stats); + + // Gather the mv_stats for the next frame + if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && + av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) { + av1_collect_mv_stats(cpi, q); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_encode_frame_time); +#endif + +#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND + const int do_dummy_pack = 1; +#else // CONFIG_BITRATE_ACCURACY + // Dummy pack of the bitstream using up to date stats to get an + // accurate estimate of output frame size to determine if we need + // to recode. 
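+ // The dummy pack below is needed when KF/ARF/GF recoding is allowed under a
+ // rate-targeted (non-AOM_Q) mode, or whenever a minimum compression ratio
+ // must be verified.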
+ const int do_dummy_pack = + (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF && + oxcf->rc_cfg.mode != AOM_Q) || + oxcf->rc_cfg.min_cr > 0; +#endif // CONFIG_BITRATE_ACCURACY + if (do_dummy_pack) { + av1_finalize_encoded_frame(cpi); + int largest_tile_id = 0; // Output from bitstream: unused here + rc->coefficient_size = 0; + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + // bits used for this frame + rc->projected_frame_size = (int)(*size) << 3; +#if CONFIG_RD_COMMAND + PSNR_STATS psnr; + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); + printf("q %d rdmult %d rate %d dist %" PRIu64 "\n", q, cpi->rd.RDMULT, + rc->projected_frame_size, psnr.sse[0]); + ++rd_command->frame_index; + if (rd_command->frame_index == rd_command->frame_count) { + return AOM_CODEC_ERROR; + } +#endif // CONFIG_RD_COMMAND + +#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + if (oxcf->pass == AOM_RC_THIRD_PASS) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); + rc_log_frame_entropy(&cpi->rc_log, frame_coding_idx, + rc->projected_frame_size, rc->coefficient_size); + } +#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + } + +#if CONFIG_TUNE_VMAF + if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { + q = cpi->vmaf_info.original_qindex; + } +#endif + if (allow_recode) { + // Update q and decide whether to do a recode loop + recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index, + bottom_index, &undershoot_seen, &overshoot_seen, + &low_cr_seen, loop_count); + } + +#if CONFIG_TUNE_BUTTERAUGLI + if (loop_count == 0 && oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { + loop = 1; + av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.4); + } +#endif + + if (cpi->use_ducky_encode) { + // Ducky encode currently does not support recode loop. + loop = 0; + } +#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND + loop = 0; // turn off recode loop when CONFIG_BITRATE_ACCURACY is on +#endif // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND + + if (loop) { + ++loop_count; + cpi->num_frame_recode = + (cpi->num_frame_recode < (NUM_RECODES_PER_FRAME - 1)) + ? (cpi->num_frame_recode + 1) + : (NUM_RECODES_PER_FRAME - 1); +#if CONFIG_INTERNAL_STATS + ++cpi->frame_recode_hits; +#endif + } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif + } while (loop); + + return AOM_CODEC_OK; +} +#endif // !CONFIG_REALTIME_ONLY + +// TODO(jingning, paulwilkins): Set up high grain level to test +// hardware decoders. Need to adapt the actual noise variance +// according to the difference between reconstructed frame and the +// source signal. 
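+// The parameters below synthesize a flat, low-strength grain pattern: one
+// scaling point of strength 100 at value 128 for each plane, no AR filtering
+// (ar_coeff_lag = 0), and a new random seed per frame.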
+static void set_grain_syn_params(AV1_COMMON *cm) { + aom_film_grain_t *film_grain_params = &cm->film_grain_params; + film_grain_params->apply_grain = 1; + film_grain_params->update_parameters = 1; + film_grain_params->random_seed = rand() & 0xffff; + + film_grain_params->num_y_points = 1; + film_grain_params->scaling_points_y[0][0] = 128; + film_grain_params->scaling_points_y[0][1] = 100; + + if (!cm->seq_params->monochrome) { + film_grain_params->num_cb_points = 1; + film_grain_params->scaling_points_cb[0][0] = 128; + film_grain_params->scaling_points_cb[0][1] = 100; + + film_grain_params->num_cr_points = 1; + film_grain_params->scaling_points_cr[0][0] = 128; + film_grain_params->scaling_points_cr[0][1] = 100; + } else { + film_grain_params->num_cb_points = 0; + film_grain_params->num_cr_points = 0; + } + + film_grain_params->chroma_scaling_from_luma = 0; + + film_grain_params->scaling_shift = 1; + film_grain_params->ar_coeff_lag = 0; + film_grain_params->ar_coeff_shift = 1; + film_grain_params->overlap_flag = 1; + film_grain_params->grain_scale_shift = 0; +} + +/*!\brief Recode loop or a single loop for encoding one frame, followed by + * in-loop deblocking filters, CDEF filters, and restoration filters. + * + * \ingroup high_level_algo + * \callgraph + * \callergraph + * + * \param[in] cpi Top-level encoder structure + * \param[in] size Bitstream size + * \param[in] dest Bitstream output + * \param[in] sse Total distortion of the frame + * \param[in] rate Total rate of the frame + * \param[in] largest_tile_id Tile id of the last tile + * + * \return Returns a value to indicate if the encoding is done successfully. + * \retval #AOM_CODEC_OK + * \retval #AOM_CODEC_ERROR + */ +static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size, + uint8_t *dest, int64_t *sse, + int64_t *rate, + int *largest_tile_id) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_or_without_recode_time); +#endif + for (int i = 0; i < NUM_RECODES_PER_FRAME; i++) { + cpi->do_update_frame_probs_txtype[i] = 0; + cpi->do_update_frame_probs_obmc[i] = 0; + cpi->do_update_frame_probs_warp[i] = 0; + cpi->do_update_frame_probs_interpfilter[i] = 0; + } + + cpi->do_update_vbr_bits_off_target_fast = 0; + int err; +#if CONFIG_REALTIME_ONLY + err = encode_without_recode(cpi); +#else + if (cpi->sf.hl_sf.recode_loop == DISALLOW_RECODE) + err = encode_without_recode(cpi); + else + err = encode_with_recode_loop(cpi, size, dest); +#endif +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_or_without_recode_time); +#endif + if (err != AOM_CODEC_OK) { + if (err == -1) { + // special case as described in encode_with_recode_loop(). + // Encoding was skipped. + err = AOM_CODEC_OK; + if (sse != NULL) *sse = INT64_MAX; + if (rate != NULL) *rate = INT64_MAX; + *largest_tile_id = 0; + } + return err; + } + +#ifdef OUTPUT_YUV_DENOISED + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) { + aom_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.running_avg_y[INTRA_FRAME]); + } +#endif + + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + + // Special case code to reduce pulsing when key frames are forced at a + // fixed interval. 
Note the reconstruction error if it is the frame before + // the force key frame + if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { +#if CONFIG_AV1_HIGHBITDEPTH + if (seq_params->use_highbitdepth) { + cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); + } else { + cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); + } +#else + cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); +#endif + } + + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = + seq_params->transfer_characteristics; + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = + seq_params->chroma_sample_position; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; + + if (!cpi->mt_info.pipeline_lpf_mt_with_enc) + set_postproc_filter_default_params(&cpi->common); + + if (!cm->features.allow_intrabc) { + loopfilter_frame(cpi, cm); + } + + if (cpi->oxcf.mode != ALLINTRA && !cpi->ppi->rtc_ref.non_reference_frame) { + extend_frame_borders(cpi); + } + +#ifdef OUTPUT_YUV_REC + aom_write_one_yuv_frame(cm, &cm->cur_frame->buf); +#endif + + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_FILM) { + set_grain_syn_params(cm); + } + + av1_finalize_encoded_frame(cpi); + // Build the bitstream +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_pack_bitstream_final_time); +#endif + cpi->rc.coefficient_size = 0; + if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_pack_bitstream_final_time); +#endif + + // Compute sse and rate. + if (sse != NULL) { +#if CONFIG_AV1_HIGHBITDEPTH + *sse = (seq_params->use_highbitdepth) + ? aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf) + : aom_get_y_sse(cpi->source, &cm->cur_frame->buf); +#else + *sse = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); +#endif + } + if (rate != NULL) { + const int64_t bits = (*size << 3); + *rate = (bits << 5); // To match scale. 
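+ // rate is consumed by the superres RD comparison in
+ // encode_with_and_without_superres() via RDCOST_DBL_WITH_NATIVE_BD_DIST.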
+ } + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode) { + PSNR_STATS psnr; + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); + DuckyEncodeFrameResult *frame_result = &cpi->ducky_encode_info.frame_result; + frame_result->global_order_idx = cm->cur_frame->display_order_hint; + frame_result->q_index = cm->quant_params.base_qindex; + frame_result->rdmult = cpi->rd.RDMULT; + frame_result->rate = (int)(*size) * 8; + frame_result->dist = psnr.sse[0]; + frame_result->psnr = psnr.psnr[0]; + } +#endif // !CONFIG_REALTIME_ONLY + + return AOM_CODEC_OK; +} + +static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, + uint8_t *dest, + int *largest_tile_id) { + const AV1_COMMON *const cm = &cpi->common; + assert(cm->seq_params->enable_superres); + assert(av1_superres_in_recode_allowed(cpi)); + aom_codec_err_t err = AOM_CODEC_OK; + av1_save_all_coding_context(cpi); + + int64_t sse1 = INT64_MAX; + int64_t rate1 = INT64_MAX; + int largest_tile_id1 = 0; + int64_t sse2 = INT64_MAX; + int64_t rate2 = INT64_MAX; + int largest_tile_id2; + double proj_rdcost1 = DBL_MAX; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const FRAME_UPDATE_TYPE update_type = + gf_group->update_type[cpi->gf_frame_index]; + const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; + + // Encode with superres. + if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_ALL) { + SuperResCfg *const superres_cfg = &cpi->oxcf.superres_cfg; + int64_t superres_sses[SCALE_NUMERATOR]; + int64_t superres_rates[SCALE_NUMERATOR]; + int superres_largest_tile_ids[SCALE_NUMERATOR]; + // Use superres for Key-frames and Alt-ref frames only. + if (update_type != OVERLAY_UPDATE && update_type != INTNL_OVERLAY_UPDATE) { + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + superres_cfg->superres_scale_denominator = denom; + superres_cfg->superres_kf_scale_denominator = denom; + const int this_index = denom - (SCALE_NUMERATOR + 1); + + cpi->superres_mode = AOM_SUPERRES_AUTO; // Super-res on for this loop. + err = encode_with_recode_loop_and_filter( + cpi, size, dest, &superres_sses[this_index], + &superres_rates[this_index], + &superres_largest_tile_ids[this_index]); + cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). + if (err != AOM_CODEC_OK) return err; + restore_all_coding_context(cpi); + } + // Reset. + superres_cfg->superres_scale_denominator = SCALE_NUMERATOR; + superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR; + } else { + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + const int this_index = denom - (SCALE_NUMERATOR + 1); + superres_sses[this_index] = INT64_MAX; + superres_rates[this_index] = INT64_MAX; + } + } + // Encode without superres. + assert(cpi->superres_mode == AOM_SUPERRES_NONE); + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2, + &largest_tile_id2); + if (err != AOM_CODEC_OK) return err; + + // Note: Both use common rdmult based on base qindex of fullres. + const int64_t rdmult = av1_compute_rd_mult_based_on_qindex( + bit_depth, update_type, cm->quant_params.base_qindex); + + // Find the best rdcost among all superres denoms. 
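+ // proj_rdcost1 tracks the cheapest superres candidate; it is compared
+ // against the full-resolution cost (proj_rdcost2) further below.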
+ int best_denom = -1; + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + const int this_index = denom - (SCALE_NUMERATOR + 1); + const int64_t this_sse = superres_sses[this_index]; + const int64_t this_rate = superres_rates[this_index]; + const int this_largest_tile_id = superres_largest_tile_ids[this_index]; + const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST( + rdmult, this_rate, this_sse, bit_depth); + if (this_rdcost < proj_rdcost1) { + sse1 = this_sse; + rate1 = this_rate; + largest_tile_id1 = this_largest_tile_id; + proj_rdcost1 = this_rdcost; + best_denom = denom; + } + } + const double proj_rdcost2 = + RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth); + // Re-encode with superres if it's better. + if (proj_rdcost1 < proj_rdcost2) { + restore_all_coding_context(cpi); + // TODO(urvang): We should avoid rerunning the recode loop by saving + // previous output+state, or running encode only for the selected 'q' in + // previous step. + // Again, temporarily force the best denom. + superres_cfg->superres_scale_denominator = best_denom; + superres_cfg->superres_kf_scale_denominator = best_denom; + int64_t sse3 = INT64_MAX; + int64_t rate3 = INT64_MAX; + cpi->superres_mode = + AOM_SUPERRES_AUTO; // Super-res on for this recode loop. + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3, + largest_tile_id); + cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). + assert(sse1 == sse3); + assert(rate1 == rate3); + assert(largest_tile_id1 == *largest_tile_id); + // Reset. + superres_cfg->superres_scale_denominator = SCALE_NUMERATOR; + superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR; + } else { + *largest_tile_id = largest_tile_id2; + } + } else { + assert(cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_DUAL); + cpi->superres_mode = + AOM_SUPERRES_AUTO; // Super-res on for this recode loop. + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1, + &largest_tile_id1); + cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). + if (err != AOM_CODEC_OK) return err; + restore_all_coding_context(cpi); + // Encode without superres. + assert(cpi->superres_mode == AOM_SUPERRES_NONE); + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2, + &largest_tile_id2); + if (err != AOM_CODEC_OK) return err; + + // Note: Both use common rdmult based on base qindex of fullres. + const int64_t rdmult = av1_compute_rd_mult_based_on_qindex( + bit_depth, update_type, cm->quant_params.base_qindex); + proj_rdcost1 = + RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, bit_depth); + const double proj_rdcost2 = + RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth); + // Re-encode with superres if it's better. + if (proj_rdcost1 < proj_rdcost2) { + restore_all_coding_context(cpi); + // TODO(urvang): We should avoid rerunning the recode loop by saving + // previous output+state, or running encode only for the selected 'q' in + // previous step. + int64_t sse3 = INT64_MAX; + int64_t rate3 = INT64_MAX; + cpi->superres_mode = + AOM_SUPERRES_AUTO; // Super-res on for this recode loop. + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3, + largest_tile_id); + cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res). 
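+ // The re-encode with the winning configuration is expected to reproduce the
+ // earlier pass's sse and rate exactly; the asserts below verify this.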
+ assert(sse1 == sse3); + assert(rate1 == rate3); + assert(largest_tile_id1 == *largest_tile_id); + } else { + *largest_tile_id = largest_tile_id2; + } + } + + return err; +} + +// Conditions to disable cdf_update mode in selective mode for real-time. +// Handle case for layers, scene change, and resizing. +static AOM_INLINE int selective_disable_cdf_rtc(const AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + // For single layer. + if (cpi->svc.number_spatial_layers == 1 && + cpi->svc.number_temporal_layers == 1) { + // Don't disable on intra_only, scene change (high_source_sad = 1), + // or resized frame. To avoid quality loss force enable at + // for ~30 frames after key or scene/slide change, and + // after 8 frames since last update if frame_source_sad > 0. + if (frame_is_intra_only(cm) || is_frame_resize_pending(cpi) || + rc->high_source_sad || rc->frames_since_key < 30 || + (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->counter_encode_maxq_scene_change < 30) || + (cpi->frames_since_last_update > 8 && cpi->rc.frame_source_sad > 0)) + return 0; + else + return 1; + } else if (cpi->svc.number_temporal_layers > 1) { + // Disable only on top temporal enhancement layer for now. + return cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1; + } + return 1; +} + +#if !CONFIG_REALTIME_ONLY +static void subtract_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame -= frame->frame; + section->weight -= frame->weight; + section->intra_error -= frame->intra_error; + section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy; + section->coded_error -= frame->coded_error; + section->sr_coded_error -= frame->sr_coded_error; + section->pcnt_inter -= frame->pcnt_inter; + section->pcnt_motion -= frame->pcnt_motion; + section->pcnt_second_ref -= frame->pcnt_second_ref; + section->pcnt_neutral -= frame->pcnt_neutral; + section->intra_skip_pct -= frame->intra_skip_pct; + section->inactive_zone_rows -= frame->inactive_zone_rows; + section->inactive_zone_cols -= frame->inactive_zone_cols; + section->MVr -= frame->MVr; + section->mvr_abs -= frame->mvr_abs; + section->MVc -= frame->MVc; + section->mvc_abs -= frame->mvc_abs; + section->MVrv -= frame->MVrv; + section->MVcv -= frame->MVcv; + section->mv_in_out_count -= frame->mv_in_out_count; + section->new_mv_count -= frame->new_mv_count; + section->count -= frame->count; + section->duration -= frame->duration; +} + +static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->ppi->twopass; + const FIRSTPASS_STATS *const total_stats = + twopass->stats_buf_ctx->total_stats; + + if (is_one_pass_rt_params(cpi) || + (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) || + (is_fp_wavelet_energy_invalid(total_stats) == 0)) + return; + + const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) + ? 
cpi->initial_mbs + : cpi->common.mi_params.MBs; + const YV12_BUFFER_CONFIG *const unfiltered_source = cpi->unfiltered_source; + const uint8_t *const src = unfiltered_source->y_buffer; + const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; + const int stride = unfiltered_source->y_stride; + const BLOCK_SIZE fp_block_size = + get_fp_block_size(cpi->is_screen_content_type); + const int fp_block_size_width = block_size_wide[fp_block_size]; + const int fp_block_size_height = block_size_high[fp_block_size]; + const int num_unit_cols = + get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width); + const int num_unit_rows = + get_num_blocks(unfiltered_source->y_crop_height, fp_block_size_height); + const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8); + const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8); + int64_t frame_avg_wavelet_energy = av1_haar_ac_sad_mxn_uint8_input( + src, stride, hbd, num_8x8_rows, num_8x8_cols); + + cpi->twopass_frame.frame_avg_haar_energy = + log1p((double)frame_avg_wavelet_energy / num_mbs); +} +#endif + +extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, + const char *filename); + +/*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack + * the bitstream + * + * \ingroup high_level_algo + * \callgraph + * \callergraph + * + * \param[in] cpi Top-level encoder structure + * \param[in] size Bitstream size + * \param[in] dest Bitstream output + * + * \return Returns a value to indicate if the encoding is done successfully. + * \retval #AOM_CODEC_OK + * \retval #AOM_CODEC_ERROR + */ +static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, + uint8_t *dest) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + CurrentFrame *const current_frame = &cm->current_frame; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + struct segmentation *const seg = &cm->seg; + FeatureFlags *const features = &cm->features; + const TileConfig *const tile_cfg = &oxcf->tile_cfg; + assert(cpi->source != NULL); + cpi->td.mb.e_mbd.cur_buf = cpi->source; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_to_data_rate_time); +#endif + +#if !CONFIG_REALTIME_ONLY + calculate_frame_avg_haar_energy(cpi); +#endif + + // frame type has been decided outside of this function call + cm->cur_frame->frame_type = current_frame->frame_type; + + cm->tiles.large_scale = tile_cfg->enable_large_scale_tile; + cm->tiles.single_tile_decoding = tile_cfg->enable_single_tile_decoding; + + features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm); + // features->allow_ref_frame_mvs needs to be written into the frame header + // while cm->tiles.large_scale is 1, therefore, "cm->tiles.large_scale=1" case + // is separated from frame_might_allow_ref_frame_mvs(). + features->allow_ref_frame_mvs &= !cm->tiles.large_scale; + + features->allow_warped_motion = oxcf->motion_mode_cfg.allow_warped_motion && + frame_might_allow_warped_motion(cm); + + cpi->last_frame_type = current_frame->frame_type; + + if (frame_is_intra_only(cm)) { + cpi->frames_since_last_update = 0; + } + + if (frame_is_sframe(cm)) { + GF_GROUP *gf_group = &cpi->ppi->gf_group; + // S frame will wipe out any previously encoded altref so we cannot place + // an overlay frame + gf_group->update_type[gf_group->size] = GF_UPDATE; + } + + if (encode_show_existing_frame(cm)) { +#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + // TODO(angiebird): Move this into a function. 
+ if (oxcf->pass == AOM_RC_THIRD_PASS) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index); + rc_log_frame_encode_param( + &cpi->rc_log, frame_coding_idx, 1, 255, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index]); + } +#endif + av1_finalize_encoded_frame(cpi); + // Build the bitstream + int largest_tile_id = 0; // Output from bitstream: unused here + cpi->rc.coefficient_size = 0; + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; + + if (seq_params->frame_id_numbers_present_flag && + current_frame->frame_type == KEY_FRAME) { + // Displaying a forward key-frame, so reset the ref buffer IDs + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + for (int i = 0; i < REF_FRAMES; i++) + cm->ref_frame_id[i] = display_frame_id; + } + +#if DUMP_RECON_FRAMES == 1 + // NOTE(zoeliu): For debug - Output the filtered reconstructed video. + av1_dump_filtered_recon_frames(cpi); +#endif // DUMP_RECON_FRAMES + + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + +#if CONFIG_AV1_TEMPORAL_DENOISING + av1_denoiser_update_ref_frame(cpi); +#endif + + // Since we allocate a spot for the OVERLAY frame in the gf group, we need + // to do post-encoding update accordingly. + av1_set_target_rate(cpi, cm->width, cm->height); + + if (is_psnr_calc_enabled(cpi)) { + cpi->source = + realloc_and_scale_source(cpi, cm->cur_frame->buf.y_crop_width, + cm->cur_frame->buf.y_crop_height); + } + +#if !CONFIG_REALTIME_ONLY + if (cpi->use_ducky_encode) { + PSNR_STATS psnr; + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr); + DuckyEncodeFrameResult *frame_result = + &cpi->ducky_encode_info.frame_result; + frame_result->global_order_idx = cm->cur_frame->display_order_hint; + frame_result->q_index = cm->quant_params.base_qindex; + frame_result->rdmult = cpi->rd.RDMULT; + frame_result->rate = (int)(*size) * 8; + frame_result->dist = psnr.sse[0]; + frame_result->psnr = psnr.psnr[0]; + } +#endif // !CONFIG_REALTIME_ONLY + + update_counters_for_show_frame(cpi); + return AOM_CODEC_OK; + } + + // Work out whether to force_integer_mv this frame + if (!is_stat_generation_stage(cpi) && + cpi->common.features.allow_screen_content_tools && + !frame_is_intra_only(cm) && !cpi->sf.rt_sf.use_nonrd_pick_mode) { + if (cpi->common.seq_params->force_integer_mv == 2) { + // Adaptive mode: see what previous frame encoded did + if (cpi->unscaled_last_source != NULL) { + features->cur_frame_force_integer_mv = av1_is_integer_mv( + cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info); + } else { + cpi->common.features.cur_frame_force_integer_mv = 0; + } + } else { + cpi->common.features.cur_frame_force_integer_mv = + cpi->common.seq_params->force_integer_mv; + } + } else { + cpi->common.features.cur_frame_force_integer_mv = 0; + } + + // This is used by av1_pack_bitstream. So this needs to be set in case of + // row-mt where the encoding code will use a temporary structure. + cpi->td.mb.e_mbd.cur_frame_force_integer_mv = + cpi->common.features.cur_frame_force_integer_mv; + + // Set default state for segment based loop filter update flags. + cm->lf.mode_ref_delta_update = 0; + + // Set various flags etc to special state if it is a key frame. + if (frame_is_intra_only(cm) || frame_is_sframe(cm)) { + // Reset the loop filter deltas and segmentation map. 
+ av1_reset_segment_features(cm); + + // If segmentation is enabled force a map update for key frames. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + } + } + if (tile_cfg->mtu == 0) { + cpi->num_tg = tile_cfg->num_tile_groups; + } else { + // Use a default value for the purposes of weighting costs in probability + // updates + cpi->num_tg = DEFAULT_MAX_NUM_TG; + } + + // For 1 pass CBR mode: check if we are dropping this frame. + if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR) { + // Always drop for spatial enhancement layer if layer bandwidth is 0. + // Otherwise check for frame-dropping based on buffer level in + // av1_rc_drop_frame(). + if ((cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.rc_cfg.target_bandwidth == 0) || + av1_rc_drop_frame(cpi)) { + cpi->is_dropped_frame = true; + } + if (cpi->is_dropped_frame) { + av1_setup_frame_size(cpi); + av1_set_mv_search_params(cpi); + av1_rc_postencode_update_drop_frame(cpi); + release_scaled_references(cpi); + cpi->ppi->gf_group.is_frame_dropped[cpi->gf_frame_index] = true; + // A dropped frame might not be shown but it always takes a slot in the gf + // group. Therefore, even when it is not shown, we still need to update + // the relevant frame counters. + if (cm->show_frame) { + update_counters_for_show_frame(cpi); + } + return AOM_CODEC_OK; + } + } + + if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM) { + av1_set_mb_ssim_rdmult_scaling(cpi); + } +#if CONFIG_SALIENCY_MAP + else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP && + !(cpi->source->flags & YV12_FLAG_HIGHBITDEPTH)) { + if (av1_set_saliency_map(cpi) == 0) { + return AOM_CODEC_MEM_ERROR; + } +#if !CONFIG_REALTIME_ONLY + double motion_ratio = av1_setup_motion_ratio(cpi); +#else + double motion_ratio = 1.0; +#endif + if (av1_setup_sm_rdmult_scaling_factor(cpi, motion_ratio) == 0) { + return AOM_CODEC_MEM_ERROR; + } + } +#endif +#if CONFIG_TUNE_VMAF + else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN || + oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { + av1_set_mb_vmaf_rdmult_scaling(cpi); + } +#endif + + if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI && + cpi->sf.rt_sf.use_nonrd_pick_mode == 0) { + av1_init_mb_wiener_var_buffer(cpi); + av1_set_mb_wiener_variance(cpi); + } + + if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) { + av1_init_mb_ur_var_buffer(cpi); + av1_set_mb_ur_variance(cpi); + } + +#if CONFIG_INTERNAL_STATS + memset(cpi->mode_chosen_counts, 0, + MAX_MODES * sizeof(*cpi->mode_chosen_counts)); +#endif + + if (seq_params->frame_id_numbers_present_flag) { + /* Non-normative definition of current_frame_id ("frame counter" with + * wraparound) */ + if (cm->current_frame_id == -1) { + int lsb, msb; + /* quasi-random initialization of current_frame_id for a key frame */ + if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) { + lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff; + msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff; + } else { + lsb = cpi->source->y_buffer[0] & 0xff; + msb = cpi->source->y_buffer[1] & 0xff; + } + cm->current_frame_id = + ((msb << 8) + lsb) % (1 << seq_params->frame_id_length); + + // S_frame is meant for stitching different streams of different + // resolutions together, so current_frame_id must be the + // same across different streams of the same content current_frame_id + // should be the same and not random. 
0x37 is a chosen number as start
+      // point
+      if (oxcf->kf_cfg.sframe_dist != 0) cm->current_frame_id = 0x37;
+    } else {
+      cm->current_frame_id =
+          (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) %
+          (1 << seq_params->frame_id_length);
+    }
+  }
+
+  switch (oxcf->algo_cfg.cdf_update_mode) {
+    case 0:  // No CDF update for any frames (4~6% compression loss).
+      features->disable_cdf_update = 1;
+      break;
+    case 1:  // Enable CDF update for all frames.
+      if (cpi->sf.rt_sf.disable_cdf_update_non_reference_frame &&
+          cpi->ppi->rtc_ref.non_reference_frame && cpi->rc.frames_since_key > 2)
+        features->disable_cdf_update = 1;
+      else if (cpi->sf.rt_sf.selective_cdf_update)
+        features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+      else
+        features->disable_cdf_update = 0;
+      break;
+    case 2:
+      // Strategically determine at which frames to do CDF update.
+      // Currently only enable CDF update for all-intra and no-show frames
+      // (1.5% compression loss) for good quality or allintra mode.
+      if (oxcf->mode == GOOD || oxcf->mode == ALLINTRA) {
+        features->disable_cdf_update =
+            (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+      } else {
+        features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+      }
+      break;
+  }
+
+  // Disable cdf update for the INTNL_ARF_UPDATE frame with
+  // frame_parallel_level 1.
+  if (!cpi->do_frame_data_update &&
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+    assert(cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1);
+    features->disable_cdf_update = 1;
+  }
+
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->oxcf.tool_cfg.enable_global_motion && !frame_is_intra_only(cm)) {
+    // Flush any stale global motion information, which may be left over
+    // from a previous frame
+    aom_invalidate_pyramid(cpi->source->y_pyramid);
+    av1_invalidate_corner_list(cpi->source->corners);
+  }
+#endif  // !CONFIG_REALTIME_ONLY
+
+  int largest_tile_id = 0;
+  if (av1_superres_in_recode_allowed(cpi)) {
+    if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    const aom_superres_mode orig_superres_mode = cpi->superres_mode;  // save
+    cpi->superres_mode = cpi->oxcf.superres_cfg.superres_mode;
+    if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL,
+                                           &largest_tile_id) != AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+    cpi->superres_mode = orig_superres_mode;  // restore
+  }
+
+  // Update reference frame ids for reference frames this frame will overwrite
+  if (seq_params->frame_id_numbers_present_flag) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      if ((current_frame->refresh_frame_flags >> i) & 1) {
+        cm->ref_frame_id[i] = cm->current_frame_id;
+      }
+    }
+  }
+
+  if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    cpi->svc.num_encoded_top_layer++;
+
+#if DUMP_RECON_FRAMES == 1
+  // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+  av1_dump_filtered_recon_frames(cpi);
+#endif  // DUMP_RECON_FRAMES
+
+  if (cm->seg.enabled) {
+    if (cm->seg.update_map == 0 && cm->last_frame_seg_map) {
+      memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
+             cm->cur_frame->mi_cols * cm->cur_frame->mi_rows *
+                 sizeof(*cm->cur_frame->seg_map));
+    }
+  }
+
+  int release_scaled_refs = 0;
+#if CONFIG_FPMT_TEST
+  release_scaled_refs =
+      (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ?
1 : 0; +#endif // CONFIG_FPMT_TEST + if (release_scaled_refs || + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) { + if (frame_is_intra_only(cm) == 0) { + release_scaled_references(cpi); + } + } +#if CONFIG_AV1_TEMPORAL_DENOISING + av1_denoiser_update_ref_frame(cpi); +#endif + + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + + if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + *cm->fc = cpi->tile_data[largest_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); + } + if (!cm->tiles.large_scale) { + cm->cur_frame->frame_context = *cm->fc; + } + + if (tile_cfg->enable_ext_tile_debug) { + // (yunqing) This test ensures the correctness of large scale tile coding. + if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) { + char fn[20] = "./fc"; + fn[4] = current_frame->frame_number / 100 + '0'; + fn[5] = (current_frame->frame_number % 100) / 10 + '0'; + fn[6] = (current_frame->frame_number % 10) + '0'; + fn[7] = '\0'; + av1_print_frame_contexts(cm->fc, fn); + } + } + + cpi->last_frame_type = current_frame->frame_type; + + if (cm->features.disable_cdf_update) { + cpi->frames_since_last_update++; + } else { + cpi->frames_since_last_update = 1; + } + + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->svc.prev_number_spatial_layers = cpi->svc.number_spatial_layers; + + // Clear the one shot update flags for segmentation map and mode/ref loop + // filter deltas. + cm->seg.update_map = 0; + cm->seg.update_data = 0; + cm->lf.mode_ref_delta_update = 0; + + if (cm->show_frame) { + update_counters_for_show_frame(cpi); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_to_data_rate_time); +#endif + + return AOM_CODEC_OK; +} + +int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, + const EncodeFrameInput *const frame_input, + const EncodeFrameParams *const frame_params, + EncodeFrameResults *const frame_results) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + + cpi->unscaled_source = frame_input->source; + cpi->source = frame_input->source; + cpi->unscaled_last_source = frame_input->last_source; + + current_frame->refresh_frame_flags = frame_params->refresh_frame_flags; + cm->features.error_resilient_mode = frame_params->error_resilient_mode; + cm->features.primary_ref_frame = frame_params->primary_ref_frame; + cm->current_frame.frame_type = frame_params->frame_type; + cm->show_frame = frame_params->show_frame; + cpi->ref_frame_flags = frame_params->ref_frame_flags; + cpi->speed = frame_params->speed; + cm->show_existing_frame = frame_params->show_existing_frame; + cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show; + + memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx, + REF_FRAMES * sizeof(*cm->remapped_ref_idx)); + + memcpy(&cpi->refresh_frame, &frame_params->refresh_frame, + sizeof(cpi->refresh_frame)); + + if (current_frame->frame_type == KEY_FRAME && + cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { + current_frame->frame_number = 0; + } + + current_frame->order_hint = + current_frame->frame_number + frame_params->order_offset; + + current_frame->display_order_hint = current_frame->order_hint; + current_frame->order_hint %= + (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1)); + + current_frame->pyramid_level = 
get_true_pyr_level(
+      cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index],
+      current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth);
+
+  if (is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    if (cpi->oxcf.q_cfg.use_fixed_qp_offsets)
+      av1_noop_first_pass_frame(cpi, frame_input->ts_duration);
+    else
+      av1_first_pass(cpi, frame_input->ts_duration);
+#endif
+  } else if (cpi->oxcf.pass == AOM_RC_ONE_PASS ||
+             cpi->oxcf.pass >= AOM_RC_SECOND_PASS) {
+    if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_DENOISE
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+                            int block_size, float noise_level,
+                            int64_t time_stamp, int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (!cpi->denoise_and_model) {
+    cpi->denoise_and_model = aom_denoise_and_model_alloc(
+        cm->seq_params->bit_depth, block_size, noise_level);
+    if (!cpi->denoise_and_model) {
+      aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+                    "Error allocating denoise and model");
+      return -1;
+    }
+  }
+  if (!cpi->film_grain_table) {
+    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+    if (!cpi->film_grain_table) {
+      aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+                    "Error allocating grain table");
+      return -1;
+    }
+    memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
+  }
+  if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
+                                &cm->film_grain_params,
+                                cpi->oxcf.enable_dnl_denoising)) {
+    if (cm->film_grain_params.apply_grain) {
+      aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
+                                  &cm->film_grain_params);
+    }
+  }
+  return 0;
+}
+#endif
+
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  int res = 0;
+  const int subsampling_x = sd->subsampling_x;
+  const int subsampling_y = sd->subsampling_y;
+  const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+#if CONFIG_TUNE_VMAF
+  if (!is_stat_generation_stage(cpi) &&
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
+    av1_vmaf_frame_preprocessing(cpi, sd);
+  }
+  if (!is_stat_generation_stage(cpi) &&
+      cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+    av1_vmaf_blk_preprocessing(cpi, sd);
+  }
+#endif
+
+#if CONFIG_INTERNAL_STATS
+  struct aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+  setup_denoiser_buffer(cpi);
+#endif
+
+#if CONFIG_DENOISE
+  // Even if denoise_noise_level is > 0, we don't need to denoise on pass
+  // 1 of 2 if enable_dnl_denoising is disabled since the 2nd pass will be
+  // encoding the original (non-denoised) frame.
+  if (cpi->oxcf.noise_level > 0 && !(cpi->oxcf.pass == AOM_RC_FIRST_PASS &&
+                                     !cpi->oxcf.enable_dnl_denoising)) {
+#if !CONFIG_REALTIME_ONLY
+    // Choose a synthetic noise level for still images for enhanced perceptual
+    // quality based on an estimated noise level in the source, but only if
+    // the noise level is set on the command line to > 0.
+    if (cpi->oxcf.mode == ALLINTRA) {
+      // No noise synthesis if source is very clean.
+      // Uses a low edge threshold to focus on smooth areas.
+      // Increase output noise setting a little compared to measured value.
+      double y_noise_level = 0.0;
+      av1_estimate_noise_level(sd, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+                               cm->seq_params->bit_depth, 16);
+      cpi->oxcf.noise_level = (float)(y_noise_level - 0.1);
+      cpi->oxcf.noise_level = (float)AOMMAX(0.0, cpi->oxcf.noise_level);
+      if (cpi->oxcf.noise_level > 0.0) {
+        cpi->oxcf.noise_level += (float)0.5;
+      }
+      cpi->oxcf.noise_level = (float)AOMMIN(5.0, cpi->oxcf.noise_level);
+    }
+#endif
+
+    if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
+                         cpi->oxcf.noise_level, time_stamp, end_time) < 0)
+      res = -1;
+  }
+#endif  // CONFIG_DENOISE
+
+  if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time,
+                         use_highbitdepth, cpi->image_pyramid_levels,
+                         frame_flags)) {
+    aom_set_error(cm->error, AOM_CODEC_ERROR, "av1_lookahead_push() failed");
+    res = -1;
+  }
+#if CONFIG_INTERNAL_STATS
+  aom_usec_timer_mark(&timer);
+  cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer);
+#endif
+
+  // Note: Regarding profile setting, the following checks are added to help
+  // choose a proper profile for the input video. The criterion is that each
+  // bitstream must be designated as the lowest profile that matches its
+  // content. E.g., a bitstream that contains 4:4:4 video must be designated as
+  // High Profile in the seq header, and likewise a bitstream that contains
+  // 4:2:2 video must be designated as Professional Profile in the sequence
+  // header.
+  if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
+      (subsampling_x != 1 || subsampling_y != 1)) {
+    aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+                  "Non-4:2:0 color format requires profile 1 or 2");
+    res = -1;
+  }
+  if ((seq_params->profile == PROFILE_1) &&
+      !(subsampling_x == 0 && subsampling_y == 0)) {
+    aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+                  "Profile 1 requires 4:4:4 color format");
+    res = -1;
+  }
+  if ((seq_params->profile == PROFILE_2) &&
+      (seq_params->bit_depth <= AOM_BITS_10) &&
+      !(subsampling_x == 1 && subsampling_y == 0)) {
+    aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+                  "Profile 2 bit-depth <= 10 requires 4:2:2 color format");
+    res = -1;
+  }
+
+  return res;
+}
+
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi) {
+  if (!ppi->cpi) return;
+
+  if (ppi->cpi->oxcf.pass != 1 &&
+      ppi->cpi->common.current_frame.frame_number > 0) {
+    fprintf(stderr, "Writing counts.stt\n");
+    FILE *f = fopen("counts.stt", "wb");
+    fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f);
+    fclose(f);
+  }
+}
+#endif  // CONFIG_ENTROPY_STATS
+
+#if CONFIG_INTERNAL_STATS
+extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+                                 const unsigned char *img2, int img2_pitch,
+                                 int width, int height);
+
+static void adjust_image_stat(double y, double u, double v, double all,
+                              ImageStat *s) {
+  s->stat[STAT_Y] += y;
+  s->stat[STAT_U] += u;
+  s->stat[STAT_V] += v;
+  s->stat[STAT_ALL] += all;
+  s->worst = AOMMIN(s->worst, all);
+}
+
+static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
+  AV1_COMMON *const cm = &cpi->common;
+  double samples = 0.0;
+  const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+  const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+
+  if (cpi->ppi->use_svc &&
+      cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+    return;
+
+#if CONFIG_INTER_STATS_ONLY
+  if (cm->current_frame.frame_type == KEY_FRAME) return;  // skip key frame
+#endif
+  cpi->bytes += frame_bytes;
+  if (cm->show_frame) {
+    const YV12_BUFFER_CONFIG *orig =
cpi->source; + const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; + double y, u, v, frame_all; + + ppi->count[0]++; + ppi->count[1]++; + if (cpi->ppi->b_calculate_psnr) { + PSNR_STATS psnr; + double weight[2] = { 0.0, 0.0 }; + double frame_ssim2[2] = { 0.0, 0.0 }; +#if CONFIG_AV1_HIGHBITDEPTH + aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); +#else + aom_calc_psnr(orig, recon, &psnr); +#endif + adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], + &(ppi->psnr[0])); + ppi->total_sq_error[0] += psnr.sse[0]; + ppi->total_samples[0] += psnr.samples[0]; + samples = psnr.samples[0]; + + aom_calc_ssim(orig, recon, bit_depth, in_bit_depth, + cm->seq_params->use_highbitdepth, weight, frame_ssim2); + + ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]); + ppi->summed_quality += frame_ssim2[0] * weight[0]; + ppi->summed_weights += weight[0]; + +#if CONFIG_AV1_HIGHBITDEPTH + // Compute PSNR based on stream bit depth + if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && + (in_bit_depth < bit_depth)) { + adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3], + psnr.psnr_hbd[0], &ppi->psnr[1]); + ppi->total_sq_error[1] += psnr.sse_hbd[0]; + ppi->total_samples[1] += psnr.samples_hbd[0]; + + ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]); + ppi->summed_quality_hbd += frame_ssim2[1] * weight[1]; + ppi->summed_weights_hbd += weight[1]; + } +#endif + +#if 0 + { + FILE *f = fopen("q_used.stt", "a"); + double y2 = psnr.psnr[1]; + double u2 = psnr.psnr[2]; + double v2 = psnr.psnr[3]; + double frame_psnr2 = psnr.psnr[0]; + fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", + cm->current_frame.frame_number, y2, u2, v2, + frame_psnr2, frame_ssim2); + fclose(f); + } +#endif + } + if (ppi->b_calculate_blockiness) { + if (!cm->seq_params->use_highbitdepth) { + const double frame_blockiness = + av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, + recon->y_stride, orig->y_width, orig->y_height); + ppi->worst_blockiness = AOMMAX(ppi->worst_blockiness, frame_blockiness); + ppi->total_blockiness += frame_blockiness; + } + + if (ppi->b_calculate_consistency) { + if (!cm->seq_params->use_highbitdepth) { + const double this_inconsistency = aom_get_ssim_metrics( + orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, + orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1); + + const double peak = (double)((1 << in_bit_depth) - 1); + const double consistency = + aom_sse_to_psnr(samples, peak, ppi->total_inconsistency); + if (consistency > 0.0) + ppi->worst_consistency = + AOMMIN(ppi->worst_consistency, consistency); + ppi->total_inconsistency += this_inconsistency; + } + } + } + + frame_all = + aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &ppi->fastssim); + frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs); + } +} + +void print_internal_stats(AV1_PRIMARY *ppi) { + if (!ppi->cpi) return; + AV1_COMP *const cpi = ppi->cpi; + + if (ppi->cpi->oxcf.pass != 1 && + ppi->cpi->common.current_frame.frame_number > 0) { + char headings[512] = { 0 }; + char results[512] = { 0 }; + FILE *f = fopen("opsnr.stt", "a"); + double time_encoded = + (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) / + 10000000.000; + double total_encode_time = + (ppi->total_time_receive_data + ppi->total_time_compress_data) / + 1000.000; + const double dr = + 
(double)ppi->total_bytes * (double)8 / (double)1000 / time_encoded; + const double peak = + (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1); + const double target_rate = + (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000; + const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); + + if (ppi->b_calculate_psnr) { + const double total_psnr = aom_sse_to_psnr( + (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]); + const double total_ssim = + 100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0); + snprintf(headings, sizeof(headings), + "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" + "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsrnY\tAPsnrCb\tAPsnrCr"); + snprintf(results, sizeof(results), + "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", + dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, + ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, + total_ssim, total_ssim, + ppi->fastssim.stat[STAT_ALL] / ppi->count[0], + ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst, + ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst, + ppi->psnr[0].stat[STAT_Y] / ppi->count[0], + ppi->psnr[0].stat[STAT_U] / ppi->count[0], + ppi->psnr[0].stat[STAT_V] / ppi->count[0]); + + if (ppi->b_calculate_blockiness) { + SNPRINT(headings, "\t Block\tWstBlck"); + SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]); + SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness); + } + + if (ppi->b_calculate_consistency) { + double consistency = + aom_sse_to_psnr((double)ppi->total_samples[0], peak, + (double)ppi->total_inconsistency); + + SNPRINT(headings, "\tConsist\tWstCons"); + SNPRINT2(results, "\t%7.3f", consistency); + SNPRINT2(results, "\t%7.3f", ppi->worst_consistency); + } + + SNPRINT(headings, "\t Time\tRcErr\tAbsErr"); + SNPRINT2(results, "\t%8.0f", total_encode_time); + SNPRINT2(results, " %7.2f", rate_err); + SNPRINT2(results, " %7.2f", fabs(rate_err)); + + SNPRINT(headings, "\tAPsnr611"); + SNPRINT2(results, " %7.3f", + (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] + + ppi->psnr[0].stat[STAT_V]) / + (ppi->count[0] * 8)); + +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = ppi->seq_params.bit_depth; + // Since cpi->source->flags is not available here, but total_samples[1] + // will be non-zero if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was + // true in compute_internal_stats + if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) { + const double peak_hbd = (double)((1 << bit_depth) - 1); + const double total_psnr_hbd = + aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd, + (double)ppi->total_sq_error[1]); + const double total_ssim_hbd = + 100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0); + SNPRINT(headings, + "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH" + " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH" + " AOMSSIMH VPSSIMPH WstSsimH"); + SNPRINT2(results, "\t%7.3f", + ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); + SNPRINT2(results, " %7.3f", total_psnr_hbd); + SNPRINT2(results, " %7.3f", + ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); + SNPRINT2(results, " %7.3f", total_psnr_hbd); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]); + SNPRINT2(results, " %7.3f", 
ppi->psnr[1].stat[STAT_V] / ppi->count[1]);
+        SNPRINT2(results, " %7.3f", ppi->psnr[1].worst);
+        SNPRINT2(results, " %7.3f", total_ssim_hbd);
+        SNPRINT2(results, " %7.3f", total_ssim_hbd);
+        SNPRINT2(results, " %7.3f", ppi->worst_ssim_hbd);
+      }
+#endif
+      fprintf(f, "%s\n", headings);
+      fprintf(f, "%s\n", results);
+    }
+
+    fclose(f);
+
+    aom_free(ppi->ssim_vars);
+    ppi->ssim_vars = NULL;
+  }
+}
+#endif  // CONFIG_INTERNAL_STATS
+
+static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  if (cpi->common.show_frame && cpi->rc.frames_to_key) {
+#if !CONFIG_REALTIME_ONLY
+    FIRSTPASS_INFO *firstpass_info = &cpi->ppi->twopass.firstpass_info;
+    if (firstpass_info->past_stats_count > FIRSTPASS_INFO_STATS_PAST_MIN) {
+      av1_firstpass_info_move_cur_index_and_pop(firstpass_info);
+    } else {
+      // When there are not enough past stats, we move the current
+      // index without popping the past stats
+      av1_firstpass_info_move_cur_index(firstpass_info);
+    }
+#endif
+    if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+      cpi->rc.frames_since_key++;
+      cpi->rc.frames_to_key--;
+      cpi->rc.frames_to_fwd_kf--;
+    }
+  }
+}
+
+static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is dropped.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+  if (cpi->common.show_frame ||
+      is_frame_droppable(&cpi->ppi->rtc_ref, &cpi->ext_flags.refresh_frame)) {
+    // Decrement count down till next gf
+    if (cpi->rc.frames_till_gf_update_due > 0)
+      cpi->rc.frames_till_gf_update_due--;
+  }
+}
+
+static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index ready for the next frame.
+  if (is_one_pass_rt_params(cpi) &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+    ++cpi->gf_frame_index;
+    // Reset gf_frame_index in case it reaches MAX_STATIC_GF_GROUP_LENGTH
+    // for real time encoding.
+    if (cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH)
+      cpi->gf_frame_index = 0;
+  } else {
+    ++cpi->gf_frame_index;
+  }
+}
+
+static void update_fb_of_context_type(const AV1_COMP *const cpi,
+                                      int *const fb_of_context_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int current_frame_ref_type = get_current_frame_ref_type(cpi);
+
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->ext_flags.use_primary_ref_none) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      fb_of_context_type[i] = -1;
+    }
+    fb_of_context_type[current_frame_ref_type] =
+        cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
+                       : get_ref_frame_map_idx(cm, ALTREF_FRAME);
+  }
+
+  if (!encode_show_existing_frame(cm)) {
+    // Refresh fb_of_context_type[]: see encoder.h for explanation
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      // All ref frames are refreshed, pick one that will live long enough
+      fb_of_context_type[current_frame_ref_type] = 0;
+    } else {
+      // If more than one frame is refreshed, it doesn't matter which one we
+      // pick so pick the first.
LST sometimes doesn't refresh any: this is ok + + for (int i = 0; i < REF_FRAMES; i++) { + if (cm->current_frame.refresh_frame_flags & (1 << i)) { + fb_of_context_type[current_frame_ref_type] = i; + break; + } + } + } + } +} + +static void update_rc_counts(AV1_COMP *cpi) { + update_keyframe_counters(cpi); + update_frames_till_gf_update(cpi); + update_gf_group_index(cpi); +} + +static void update_end_of_frame_stats(AV1_COMP *cpi) { + if (cpi->do_frame_data_update) { + // Store current frame loopfilter levels in ppi, if update flag is set. + if (!cpi->common.show_existing_frame) { + AV1_COMMON *const cm = &cpi->common; + struct loopfilter *const lf = &cm->lf; + cpi->ppi->filter_level[0] = lf->filter_level[0]; + cpi->ppi->filter_level[1] = lf->filter_level[1]; + cpi->ppi->filter_level_u = lf->filter_level_u; + cpi->ppi->filter_level_v = lf->filter_level_v; + } + } + // Store frame level mv_stats from cpi to ppi. + cpi->ppi->mv_stats = cpi->mv_stats; +} + +// Updates frame level stats related to global motion +static AOM_INLINE void update_gm_stats(AV1_COMP *cpi) { + FRAME_UPDATE_TYPE update_type = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; + int i, is_gm_present = 0; + + // Check if the current frame has any valid global motion model across its + // reference frames + for (i = 0; i < REF_FRAMES; i++) { + if (cpi->common.global_motion[i].wmtype != IDENTITY) { + is_gm_present = 1; + break; + } + } + int update_actual_stats = 1; +#if CONFIG_FPMT_TEST + update_actual_stats = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1; + if (!update_actual_stats) { + if (cpi->ppi->temp_valid_gm_model_found[update_type] == INT32_MAX) { + cpi->ppi->temp_valid_gm_model_found[update_type] = is_gm_present; + } else { + cpi->ppi->temp_valid_gm_model_found[update_type] |= is_gm_present; + } + int show_existing_between_parallel_frames = + (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == + INTNL_OVERLAY_UPDATE && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); + if (cpi->do_frame_data_update == 1 && + !show_existing_between_parallel_frames) { + for (i = 0; i < FRAME_UPDATE_TYPES; i++) { + cpi->ppi->valid_gm_model_found[i] = + cpi->ppi->temp_valid_gm_model_found[i]; + } + } + } +#endif + if (update_actual_stats) { + if (cpi->ppi->valid_gm_model_found[update_type] == INT32_MAX) { + cpi->ppi->valid_gm_model_found[update_type] = is_gm_present; + } else { + cpi->ppi->valid_gm_model_found[update_type] |= is_gm_present; + } + } +} + +void av1_post_encode_updates(AV1_COMP *const cpi, + const AV1_COMP_DATA *const cpi_data) { + AV1_PRIMARY *const ppi = cpi->ppi; + AV1_COMMON *const cm = &cpi->common; + + update_gm_stats(cpi); + +#if !CONFIG_REALTIME_ONLY + // Update the total stats remaining structure. + if (cpi->twopass_frame.this_frame != NULL && + ppi->twopass.stats_buf_ctx->total_left_stats) { + subtract_stats(ppi->twopass.stats_buf_ctx->total_left_stats, + cpi->twopass_frame.this_frame); + } +#endif + +#if CONFIG_OUTPUT_FRAME_SIZE + FILE *f = fopen("frame_sizes.csv", "a"); + fprintf(f, "%d,", 8 * (int)cpi_data->frame_size); + fprintf(f, "%d\n", cm->quant_params.base_qindex); + fclose(f); +#endif // CONFIG_OUTPUT_FRAME_SIZE + + if (!is_stat_generation_stage(cpi) && !cpi->is_dropped_frame) { + // Before calling refresh_reference_frames(), copy ppi->ref_frame_map_copy + // to cm->ref_frame_map for frame_parallel_level 2 frame in a parallel + // encode set of lower layer frames. 
+    // TODO(Remya): Move ref_frame_map from AV1_COMMON to AV1_PRIMARY to avoid
+    // copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 2 &&
+        ppi->gf_group.frame_parallel_level[cpi->gf_frame_index - 1] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index - 1] ==
+            INTNL_ARF_UPDATE) {
+      memcpy(cm->ref_frame_map, ppi->ref_frame_map_copy,
+             sizeof(cm->ref_frame_map));
+    }
+    refresh_reference_frames(cpi);
+    // For frame_parallel_level 1 frame in a parallel encode set of lower layer
+    // frames, store the updated cm->ref_frame_map in ppi->ref_frame_map_copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+      memcpy(ppi->ref_frame_map_copy, cm->ref_frame_map,
+             sizeof(cm->ref_frame_map));
+    }
+    av1_rc_postencode_update(cpi, cpi_data->frame_size);
+  }
+
+  if (cpi_data->pop_lookahead == 1) {
+    av1_lookahead_pop(cpi->ppi->lookahead, cpi_data->flush,
+                      cpi->compressor_stage);
+  }
+  if (cpi->common.show_frame) {
+    cpi->ppi->ts_start_last_show_frame = cpi_data->ts_frame_start;
+    cpi->ppi->ts_end_last_show_frame = cpi_data->ts_frame_end;
+  }
+  if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
+    // Initialize level info. at the beginning of each sequence.
+    if (cm->current_frame.frame_type == KEY_FRAME &&
+        ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+      av1_init_level_info(cpi);
+    }
+    av1_update_level_info(cpi, cpi_data->frame_size, cpi_data->ts_frame_start,
+                          cpi_data->ts_frame_end);
+  }
+
+  if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi);
+#endif
+    update_fb_of_context_type(cpi, ppi->fb_of_context_type);
+    update_rc_counts(cpi);
+    update_end_of_frame_stats(cpi);
+  }
+
+  if (cpi->oxcf.pass == AOM_RC_THIRD_PASS && cpi->third_pass_ctx) {
+    av1_pop_third_pass_info(cpi->third_pass_ctx);
+  }
+
+  if (ppi->rtc_ref.set_ref_frame_config) {
+    av1_svc_update_buffer_slot_refreshed(cpi);
+    av1_svc_set_reference_was_previous(cpi);
+  }
+
+  if (ppi->use_svc) av1_save_layer_context(cpi);
+
+  // Note *size = 0 indicates a dropped frame for which psnr is not calculated
+  if (ppi->b_calculate_psnr && cpi_data->frame_size > 0) {
+    if (cm->show_existing_frame ||
+        (!is_stat_generation_stage(cpi) && cm->show_frame)) {
+      generate_psnr_packet(cpi);
+    }
+  }
+
+#if CONFIG_INTERNAL_STATS
+  if (!is_stat_generation_stage(cpi)) {
+    compute_internal_stats(cpi, (int)cpi_data->frame_size);
+  }
+#endif  // CONFIG_INTERNAL_STATS
+
+  // Write frame info. Subtract 1 from frame index since it was incremented in
+  // update_rc_counts.
+  av1_write_second_pass_per_frame_info(cpi, cpi->gf_frame_index - 1);
+}
+
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  AV1_COMMON *const cm = &cpi->common;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
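+  // Any error raised during encoding via aom_internal_error() longjmp()s back
+  // to this point, and the stored error code is returned below.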
+ if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + return cm->error->error_code; + } + cm->error->setjmp = 1; + +#if CONFIG_INTERNAL_STATS + cpi->frame_recode_hits = 0; + cpi->time_compress_data = 0; + cpi->bytes = 0; +#endif +#if CONFIG_ENTROPY_STATS + if (cpi->compressor_stage == ENCODE_STAGE) { + av1_zero(cpi->counts); + } +#endif + +#if CONFIG_BITSTREAM_DEBUG + assert(cpi->oxcf.max_threads <= 1 && + "bitstream debug tool does not support multithreading"); + bitstream_queue_record_write(); + + if (cm->seq_params->order_hint_info.enable_order_hint) { + aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 + + cm->show_frame); + } else { + // This is currently used in RTC encoding. cm->show_frame is always 1. + aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number); + } +#endif + if (cpi->ppi->use_svc) { + av1_one_pass_cbr_svc_start_layer(cpi); + } + + cpi->is_dropped_frame = false; + cm->showable_frame = 0; + cpi_data->frame_size = 0; + cpi->available_bs_size = cpi_data->cx_data_sz; +#if CONFIG_INTERNAL_STATS + struct aom_usec_timer cmptimer; + aom_usec_timer_start(&cmptimer); +#endif + av1_set_high_precision_mv(cpi, 1, 0); + + // Normal defaults + cm->features.refresh_frame_context = + oxcf->tool_cfg.frame_parallel_decoding_mode + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->tile_cfg.enable_large_scale_tile) + cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + + if (assign_cur_frame_new_fb(cm) == NULL) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Failed to allocate new cur_frame"); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + // Accumulate 2nd pass time in 2-pass case or 1 pass time in 1-pass case. + if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) + start_timing(cpi, av1_encode_strategy_time); +#endif + + const int result = av1_encode_strategy( + cpi, &cpi_data->frame_size, cpi_data->cx_data, &cpi_data->lib_flags, + &cpi_data->ts_frame_start, &cpi_data->ts_frame_end, + cpi_data->timestamp_ratio, &cpi_data->pop_lookahead, cpi_data->flush); + +#if CONFIG_COLLECT_COMPONENT_TIMING + if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) + end_timing(cpi, av1_encode_strategy_time); + + // Print out timing information. + // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of + // show_existing_frame and lag-in-frames. + if ((cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) && + cpi->frame_component_time[0] > 100) { + int i; + uint64_t frame_total = 0, total = 0; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + FRAME_UPDATE_TYPE frame_update_type = + get_frame_update_type(gf_group, cpi->gf_frame_index); + + fprintf(stderr, + "\n Frame number: %d, Frame type: %s, Show Frame: %d, Frame Update " + "Type: %d, Q: %d\n", + cm->current_frame.frame_number, + get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame, + frame_update_type, cm->quant_params.base_qindex); + for (i = 0; i < kTimingComponents; i++) { + cpi->component_time[i] += cpi->frame_component_time[i]; + // Use av1_encode_strategy_time (i = 0) as the total time. 
+      if (i == 0) {
+        frame_total = cpi->frame_component_time[0];
+        total = cpi->component_time[0];
+      }
+      fprintf(stderr,
+              " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+              " us [%6.2f%%])\n",
+              get_component_name(i), cpi->frame_component_time[i],
+              (float)((float)cpi->frame_component_time[i] * 100.0 /
+                      (float)frame_total),
+              cpi->component_time[i],
+              (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+      cpi->frame_component_time[i] = 0;
+    }
+  }
+#endif
+
+  // Reset the flag to 0 after encoding.
+  cpi->rc.use_external_qp_one_pass = 0;
+
+  if (result == -1) {
+    cm->error->setjmp = 0;
+    // Returning -1 indicates no frame encoded; more input is required
+    return -1;
+  }
+  if (result != AOM_CODEC_OK) {
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Failed to encode frame");
+  }
+#if CONFIG_INTERNAL_STATS
+  aom_usec_timer_mark(&cmptimer);
+  cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
+#endif  // CONFIG_INTERNAL_STATS
+
+#if CONFIG_SPEED_STATS
+  if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
+    cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count;
+    cpi->td.mb.txfm_search_info.tx_search_count = 0;
+  }
+#endif  // CONFIG_SPEED_STATS
+
+  cm->error->setjmp = 0;
+  return AOM_CODEC_OK;
+}
+
+// Populates cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set. Also sets the bitmask 'ref_buffers_used_map'.
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map) {
+  AV1_COMMON *cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, ref_frame);
+
+      if (ref == NULL) {
+        cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+        continue;
+      }
+
+      // FPMT does not support scaling yet.
+      assert(ref->y_crop_width == cm->width &&
+             ref->y_crop_height == cm->height);
+
+      RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+      cpi->scaled_ref_buf[ref_frame - 1] = buf;
+      for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) {
+        if (&cm->buffer_pool->frame_bufs[i] == buf) {
+          *ref_buffers_used_map |= (1 << i);
+        }
+      }
+    } else {
+      if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+    }
+  }
+}
+
+// Increments the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+                                          int ref_buffers_used_map) {
+  for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) {
+    if (ref_buffers_used_map & (1 << i)) {
+      ++buffer_pool->frame_bufs[i].ref_count;
+    }
+  }
+}
+
+// Releases cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set.
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi) {
+  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+    if (buf != NULL) {
+      cpi->scaled_ref_buf[i] = NULL;
+    }
+  }
+}
+
+// Decrements the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
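+// This is the counterpart of av1_increment_scaled_ref_counts_fpmt() and is
+// expected to be called with the same ref_buffers_used_map bitmask.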
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool, + int ref_buffers_used_map) { + for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) { + if (ref_buffers_used_map & (1 << i)) { + --buffer_pool->frame_bufs[i].ref_count; + } + } +} + +// Initialize parallel frame contexts with screen content decisions. +void av1_init_sc_decisions(AV1_PRIMARY *const ppi) { + AV1_COMP *const first_cpi = ppi->cpi; + for (int i = 1; i < ppi->num_fp_contexts; ++i) { + AV1_COMP *cur_cpi = ppi->parallel_cpi[i]; + cur_cpi->common.features.allow_screen_content_tools = + first_cpi->common.features.allow_screen_content_tools; + cur_cpi->common.features.allow_intrabc = + first_cpi->common.features.allow_intrabc; + cur_cpi->use_screen_content_tools = first_cpi->use_screen_content_tools; + cur_cpi->is_screen_content_type = first_cpi->is_screen_content_type; + } +} + +AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi, + AV1_COMP_DATA *const first_cpi_data) { + int cpi_idx = 0; + + // Loop over parallel_cpi to find the cpi that processed the current + // gf_frame_index ahead of time. + for (int i = 1; i < ppi->num_fp_contexts; i++) { + if (ppi->cpi->gf_frame_index == ppi->parallel_cpi[i]->gf_frame_index) { + cpi_idx = i; + break; + } + } + + assert(cpi_idx > 0); + assert(!ppi->parallel_cpi[cpi_idx]->common.show_existing_frame); + + // Release the previously-used frame-buffer. + if (ppi->cpi->common.cur_frame != NULL) { + --ppi->cpi->common.cur_frame->ref_count; + ppi->cpi->common.cur_frame = NULL; + } + + // Swap the appropriate parallel_cpi with the parallel_cpi[0]. + ppi->cpi = ppi->parallel_cpi[cpi_idx]; + ppi->parallel_cpi[cpi_idx] = ppi->parallel_cpi[0]; + ppi->parallel_cpi[0] = ppi->cpi; + + // Copy appropriate parallel_frames_data to local data. + { + AV1_COMP_DATA *data = &ppi->parallel_frames_data[cpi_idx - 1]; + assert(data->frame_size > 0); + assert(first_cpi_data->cx_data_sz > data->frame_size); + + first_cpi_data->lib_flags = data->lib_flags; + first_cpi_data->ts_frame_start = data->ts_frame_start; + first_cpi_data->ts_frame_end = data->ts_frame_end; + memcpy(first_cpi_data->cx_data, data->cx_data, data->frame_size); + first_cpi_data->frame_size = data->frame_size; + if (ppi->cpi->common.show_frame) { + first_cpi_data->pop_lookahead = 1; + } + } + + return ppi->cpi; +} + +// Initialises frames belonging to a parallel encode set. +int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data, + AV1_PRIMARY *const ppi, + int *ref_buffers_used_map) { + AV1_COMP *const first_cpi = ppi->cpi; + GF_GROUP *const gf_group = &ppi->gf_group; + int gf_index_start = first_cpi->gf_frame_index; + assert(gf_group->frame_parallel_level[gf_index_start] == 1); + int parallel_frame_count = 0; + int cur_frame_num = first_cpi->common.current_frame.frame_number; + int show_frame_count = first_cpi->frame_index_set.show_frame_count; + int frames_since_key = first_cpi->rc.frames_since_key; + int frames_to_key = first_cpi->rc.frames_to_key; + int frames_to_fwd_kf = first_cpi->rc.frames_to_fwd_kf; + int cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[gf_index_start]; + const FIRSTPASS_STATS *stats_in = first_cpi->twopass_frame.stats_in; + + assert(*ref_buffers_used_map == 0); + + // Release the previously used frame-buffer by a frame_parallel_level 1 frame. 
+ if (first_cpi->common.cur_frame != NULL) { + --first_cpi->common.cur_frame->ref_count; + first_cpi->common.cur_frame = NULL; + } + + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + RefFrameMapPair first_ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(first_cpi, first_ref_frame_map_pairs); + memcpy(ref_frame_map_pairs, first_ref_frame_map_pairs, + sizeof(RefFrameMapPair) * REF_FRAMES); + + // Store the reference refresh index of frame_parallel_level 1 frame in a + // parallel encode set of lower layer frames. + if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) { + first_cpi->ref_refresh_index = av1_calc_refresh_idx_for_intnl_arf( + first_cpi, ref_frame_map_pairs, gf_index_start); + assert(first_cpi->ref_refresh_index != INVALID_IDX && + first_cpi->ref_refresh_index < REF_FRAMES); + first_cpi->refresh_idx_available = true; + // Update ref_frame_map_pairs. + ref_frame_map_pairs[first_cpi->ref_refresh_index].disp_order = + gf_group->display_idx[gf_index_start]; + ref_frame_map_pairs[first_cpi->ref_refresh_index].pyr_level = + gf_group->layer_depth[gf_index_start]; + } + + // Set do_frame_data_update flag as false for frame_parallel_level 1 frame. + first_cpi->do_frame_data_update = false; + if (gf_group->arf_src_offset[gf_index_start] == 0) { + first_cpi->time_stamps.prev_ts_start = ppi->ts_start_last_show_frame; + first_cpi->time_stamps.prev_ts_end = ppi->ts_end_last_show_frame; + } + + av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, first_cpi, + gf_index_start, 1, first_cpi->common.remapped_ref_idx); + + av1_scale_references_fpmt(first_cpi, ref_buffers_used_map); + parallel_frame_count++; + + // Iterate through the GF_GROUP to find the remaining frame_parallel_level 2 + // frames which are part of the current parallel encode set and initialize the + // required cpi elements. + for (int i = gf_index_start + 1; i < gf_group->size; i++) { + // Update frame counters if previous frame was show frame or show existing + // frame. 
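+    // (An arf_src_offset of 0 means the previous GF group entry coded a frame
+    // that is displayed at its position.)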
+ if (gf_group->arf_src_offset[i - 1] == 0) { + cur_frame_num++; + show_frame_count++; + if (frames_to_fwd_kf <= 0) + frames_to_fwd_kf = first_cpi->oxcf.kf_cfg.fwd_kf_dist; + if (frames_to_key) { + frames_since_key++; + frames_to_key--; + frames_to_fwd_kf--; + } + stats_in++; + } + cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[i]; + if (gf_group->frame_parallel_level[i] == 2) { + AV1_COMP *cur_cpi = ppi->parallel_cpi[parallel_frame_count]; + AV1_COMP_DATA *cur_cpi_data = + &ppi->parallel_frames_data[parallel_frame_count - 1]; + cur_cpi->gf_frame_index = i; + cur_cpi->framerate = first_cpi->framerate; + cur_cpi->common.current_frame.frame_number = cur_frame_num; + cur_cpi->common.current_frame.frame_type = gf_group->frame_type[i]; + cur_cpi->frame_index_set.show_frame_count = show_frame_count; + cur_cpi->rc.frames_since_key = frames_since_key; + cur_cpi->rc.frames_to_key = frames_to_key; + cur_cpi->rc.frames_to_fwd_kf = frames_to_fwd_kf; + cur_cpi->rc.active_worst_quality = first_cpi->rc.active_worst_quality; + cur_cpi->rc.avg_frame_bandwidth = first_cpi->rc.avg_frame_bandwidth; + cur_cpi->rc.max_frame_bandwidth = first_cpi->rc.max_frame_bandwidth; + cur_cpi->rc.min_frame_bandwidth = first_cpi->rc.min_frame_bandwidth; + cur_cpi->rc.intervals_till_gf_calculate_due = + first_cpi->rc.intervals_till_gf_calculate_due; + cur_cpi->mv_search_params.max_mv_magnitude = + first_cpi->mv_search_params.max_mv_magnitude; + if (gf_group->update_type[cur_cpi->gf_frame_index] == INTNL_ARF_UPDATE) { + cur_cpi->common.lf.mode_ref_delta_enabled = 1; + } + cur_cpi->do_frame_data_update = false; + // Initialize prev_ts_start and prev_ts_end for show frame(s) and show + // existing frame(s). + if (gf_group->arf_src_offset[i] == 0) { + // Choose source of prev frame. + int src_index = gf_group->src_offset[i]; + struct lookahead_entry *prev_source = av1_lookahead_peek( + ppi->lookahead, src_index - 1, cur_cpi->compressor_stage); + // Save timestamps of prev frame. + cur_cpi->time_stamps.prev_ts_start = prev_source->ts_start; + cur_cpi->time_stamps.prev_ts_end = prev_source->ts_end; + } + cur_cpi->time_stamps.first_ts_start = + first_cpi->time_stamps.first_ts_start; + + memcpy(cur_cpi->common.ref_frame_map, first_cpi->common.ref_frame_map, + sizeof(first_cpi->common.ref_frame_map)); + cur_cpi_data->lib_flags = 0; + cur_cpi_data->timestamp_ratio = first_cpi_data->timestamp_ratio; + cur_cpi_data->flush = first_cpi_data->flush; + cur_cpi_data->frame_size = 0; + if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) { + // If the first frame in a parallel encode set is INTNL_ARF_UPDATE + // frame, initialize lib_flags of frame_parallel_level 2 frame in the + // set with that of frame_parallel_level 1 frame. + cur_cpi_data->lib_flags = first_cpi_data->lib_flags; + // Store the reference refresh index of frame_parallel_level 2 frame in + // a parallel encode set of lower layer frames. + cur_cpi->ref_refresh_index = + av1_calc_refresh_idx_for_intnl_arf(cur_cpi, ref_frame_map_pairs, i); + cur_cpi->refresh_idx_available = true; + // Skip the reference frame which will be refreshed by + // frame_parallel_level 1 frame in a parallel encode set of lower layer + // frames. 
+ cur_cpi->ref_idx_to_skip = first_cpi->ref_refresh_index; + } else { + cur_cpi->ref_idx_to_skip = INVALID_IDX; + cur_cpi->ref_refresh_index = INVALID_IDX; + cur_cpi->refresh_idx_available = false; + } + cur_cpi->twopass_frame.stats_in = stats_in; + + av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, cur_cpi, i, + 1, cur_cpi->common.remapped_ref_idx); + av1_scale_references_fpmt(cur_cpi, ref_buffers_used_map); + parallel_frame_count++; + } + + // Set do_frame_data_update to true for the last frame_parallel_level 2 + // frame in the current parallel encode set. + if (i == (gf_group->size - 1) || + (gf_group->frame_parallel_level[i + 1] == 0 && + (gf_group->update_type[i + 1] == ARF_UPDATE || + gf_group->update_type[i + 1] == INTNL_ARF_UPDATE)) || + gf_group->frame_parallel_level[i + 1] == 1) { + ppi->parallel_cpi[parallel_frame_count - 1]->do_frame_data_update = true; + break; + } + } + + av1_increment_scaled_ref_counts_fpmt(first_cpi->common.buffer_pool, + *ref_buffers_used_map); + + // Return the number of frames in the parallel encode set. + return parallel_frame_count; +} + +int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { + AV1_COMMON *cm = &cpi->common; + if (!cm->show_frame) { + return -1; + } else { + int ret; + if (cm->cur_frame != NULL && !cpi->oxcf.algo_cfg.skip_postproc_filtering) { + *dest = cm->cur_frame->buf; + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = cm->width >> cm->seq_params->subsampling_x; + dest->uv_height = cm->height >> cm->seq_params->subsampling_y; + ret = 0; + } else { + ret = -1; + } + return ret; + } +} + +int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) { + if (cpi->last_show_frame_buf == NULL || + cpi->oxcf.algo_cfg.skip_postproc_filtering) + return -1; + + *frame = cpi->last_show_frame_buf->buf; + return 0; +} + +aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error->error_code; +} + +int av1_set_internal_size(AV1EncoderConfig *const oxcf, + ResizePendingParams *resize_pending_params, + AOM_SCALING_MODE horiz_mode, + AOM_SCALING_MODE vert_mode) { + int hr = 0, hs = 0, vr = 0, vs = 0; + + // Checks for invalid AOM_SCALING_MODE values. 
+ if (horiz_mode > AOME_ONETHREE || vert_mode > AOME_ONETHREE) return -1; + + Scale2Ratio(horiz_mode, &hr, &hs); + Scale2Ratio(vert_mode, &vr, &vs); + + // always go to the next whole number + resize_pending_params->width = (hs - 1 + oxcf->frm_dim_cfg.width * hr) / hs; + resize_pending_params->height = (vs - 1 + oxcf->frm_dim_cfg.height * vr) / vs; + + if (horiz_mode != AOME_NORMAL || vert_mode != AOME_NORMAL) { + oxcf->resize_cfg.resize_mode = RESIZE_FIXED; + oxcf->algo_cfg.enable_tpl_model = 0; + } + return 0; +} + +int av1_get_quantizer(AV1_COMP *cpi) { + return cpi->common.quant_params.base_qindex; +} + +int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) { + size_t output_size = 0; + size_t total_bytes_read = 0; + size_t remaining_size = *frame_size; + uint8_t *buff_ptr = buffer; + + // go through each OBUs + while (total_bytes_read < *frame_size) { + uint8_t saved_obu_header[2]; + uint64_t obu_payload_size; + size_t length_of_payload_size; + size_t length_of_obu_size; + uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1; + size_t obu_bytes_read = obu_header_size; // bytes read for current obu + + // save the obu header (1 or 2 bytes) + memmove(saved_obu_header, buff_ptr, obu_header_size); + // clear the obu_has_size_field + saved_obu_header[0] = saved_obu_header[0] & (~0x2); + + // get the payload_size and length of payload_size + if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size, + &obu_payload_size, &length_of_payload_size) != 0) { + return AOM_CODEC_ERROR; + } + obu_bytes_read += length_of_payload_size; + + // calculate the length of size of the obu header plus payload + length_of_obu_size = + aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size)); + + // move the rest of data to new location + memmove(buff_ptr + length_of_obu_size + obu_header_size, + buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read); + obu_bytes_read += (size_t)obu_payload_size; + + // write the new obu size + const uint64_t obu_size = obu_header_size + obu_payload_size; + size_t coded_obu_size; + if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + // write the saved (modified) obu_header following obu size + memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size); + + total_bytes_read += obu_bytes_read; + remaining_size -= obu_bytes_read; + buff_ptr += length_of_obu_size + obu_size; + output_size += length_of_obu_size + (size_t)obu_size; + } + + *frame_size = output_size; + return AOM_CODEC_OK; +} + +static void rtc_set_updates_ref_frame_config( + ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags, + RTC_REF *const rtc_ref) { + ext_refresh_frame_flags->update_pending = 1; + ext_refresh_frame_flags->last_frame = rtc_ref->refresh[rtc_ref->ref_idx[0]]; + ext_refresh_frame_flags->golden_frame = rtc_ref->refresh[rtc_ref->ref_idx[3]]; + ext_refresh_frame_flags->bwd_ref_frame = + rtc_ref->refresh[rtc_ref->ref_idx[4]]; + ext_refresh_frame_flags->alt2_ref_frame = + rtc_ref->refresh[rtc_ref->ref_idx[5]]; + ext_refresh_frame_flags->alt_ref_frame = + rtc_ref->refresh[rtc_ref->ref_idx[6]]; + rtc_ref->non_reference_frame = 1; + for (int i = 0; i < REF_FRAMES; i++) { + if (rtc_ref->refresh[i] == 1) { + rtc_ref->non_reference_frame = 0; + break; + } + } +} + +static int rtc_set_references_external_ref_frame_config(AV1_COMP *cpi) { + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). 
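+  // Build the reference bitmask: start from AOM_REFFRAME_ALL (all seven
+  // reference bits set) and clear bit i for each reference slot that the
+  // external ref-frame config leaves unused. For example, with only
+  // LAST_FRAME and GOLDEN_FRAME enabled the returned mask is
+  // AOM_LAST_FLAG | AOM_GOLD_FLAG.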
+ int ref = AOM_REFFRAME_ALL; + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + if (!cpi->ppi->rtc_ref.reference[i]) ref ^= (1 << i); + } + return ref; +} + +void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { + // TODO(yunqingwang): For what references to use, external encoding flags + // should be consistent with internal reference frame selection. Need to + // ensure that there is not conflict between the two. In AV1 encoder, the + // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3, + // GOLDEN, BWDREF, ALTREF2. + + ExternalFlags *const ext_flags = &cpi->ext_flags; + ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = + &ext_flags->refresh_frame; + ext_flags->ref_frame_flags = AOM_REFFRAME_ALL; + if (flags & + (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2)) { + int ref = AOM_REFFRAME_ALL; + + if (flags & AOM_EFLAG_NO_REF_LAST) ref ^= AOM_LAST_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG; + + if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_REF_ARF) { + ref ^= AOM_ALT_FLAG; + ref ^= AOM_BWD_FLAG; + ref ^= AOM_ALT2_FLAG; + } else { + if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG; + if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG; + } + + av1_use_as_reference(&ext_flags->ref_frame_flags, ref); + } else { + if (cpi->ppi->rtc_ref.set_ref_frame_config) { + int ref = rtc_set_references_external_ref_frame_config(cpi); + av1_use_as_reference(&ext_flags->ref_frame_flags, ref); + } + } + + if (flags & + (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) { + int upd = AOM_REFFRAME_ALL; + + // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag. 
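+    // As a worked example, flags = AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF
+    // clears the GOLDEN, ALTREF, BWDREF and ALTREF2 bits from upd, so of the
+    // refresh flags assigned below only last_frame ends up set.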
+ if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG; + + if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_UPD_ARF) { + upd ^= AOM_ALT_FLAG; + upd ^= AOM_BWD_FLAG; + upd ^= AOM_ALT2_FLAG; + } + + ext_refresh_frame_flags->last_frame = (upd & AOM_LAST_FLAG) != 0; + ext_refresh_frame_flags->golden_frame = (upd & AOM_GOLD_FLAG) != 0; + ext_refresh_frame_flags->alt_ref_frame = (upd & AOM_ALT_FLAG) != 0; + ext_refresh_frame_flags->bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0; + ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0; + ext_refresh_frame_flags->update_pending = 1; + } else { + if (cpi->ppi->rtc_ref.set_ref_frame_config) + rtc_set_updates_ref_frame_config(ext_refresh_frame_flags, + &cpi->ppi->rtc_ref); + else + ext_refresh_frame_flags->update_pending = 0; + } + + ext_flags->use_ref_frame_mvs = cpi->oxcf.tool_cfg.enable_ref_frame_mvs & + ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0); + ext_flags->use_error_resilient = cpi->oxcf.tool_cfg.error_resilient_mode | + ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0); + ext_flags->use_s_frame = + cpi->oxcf.kf_cfg.enable_sframe | ((flags & AOM_EFLAG_SET_S_FRAME) != 0); + ext_flags->use_primary_ref_none = + (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0; + + if (flags & AOM_EFLAG_NO_UPD_ENTROPY) { + update_entropy(&ext_flags->refresh_frame_context, + &ext_flags->refresh_frame_context_pending, 0); + } +} + +aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) { + if (!ppi) return NULL; + + uint8_t header_buf[512] = { 0 }; + const uint32_t sequence_header_size = + av1_write_sequence_header_obu(&ppi->seq_params, &header_buf[0]); + assert(sequence_header_size <= sizeof(header_buf)); + if (sequence_header_size == 0) return NULL; + + const size_t obu_header_size = 1; + const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size); + const size_t payload_offset = obu_header_size + size_field_size; + + if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; + memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); + + if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count, + OBU_SEQUENCE_HEADER, 0, + &header_buf[0]) != obu_header_size) { + return NULL; + } + + size_t coded_size_field_size = 0; + if (aom_uleb_encode(sequence_header_size, size_field_size, + &header_buf[obu_header_size], + &coded_size_field_size) != 0) { + return NULL; + } + assert(coded_size_field_size == size_field_size); + + aom_fixed_buf_t *global_headers = + (aom_fixed_buf_t *)malloc(sizeof(*global_headers)); + if (!global_headers) return NULL; + + const size_t global_header_buf_size = + obu_header_size + size_field_size + sequence_header_size; + + global_headers->buf = malloc(global_header_buf_size); + if (!global_headers->buf) { + free(global_headers); + return NULL; + } + + memcpy(global_headers->buf, &header_buf[0], global_header_buf_size); + global_headers->sz = global_header_buf_size; + return global_headers; +} diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h new file mode 100644 index 0000000000..5f6f67eda8 --- /dev/null +++ b/third_party/aom/av1/encoder/encoder.h @@ -0,0 +1,4512 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Declares top-level encoder structures and functions. + */ +#ifndef AOM_AV1_ENCODER_ENCODER_H_ +#define AOM_AV1_ENCODER_ENCODER_H_ + +#include +#include + +#include "config/aom_config.h" + +#include "aom/aomcx.h" + +#include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/entropymode.h" +#include "av1/common/enums.h" +#include "av1/common/reconintra.h" +#include "av1/common/resize.h" +#include "av1/common/thread_common.h" +#include "av1/common/timing.h" + +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/block.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/external_partition.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/global_motion.h" +#include "av1/encoder/level.h" +#include "av1/encoder/lookahead.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/pickcdef.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/svc_layercontext.h" +#include "av1/encoder/temporal_filter.h" +#include "av1/encoder/thirdpass.h" +#include "av1/encoder/tokenize.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/av1_noise_estimate.h" +#include "av1/encoder/bitstream.h" + +#if CONFIG_INTERNAL_STATS +#include "aom_dsp/ssim.h" +#endif +#include "aom_dsp/variance.h" +#if CONFIG_DENOISE +#include "aom_dsp/noise_model.h" +#endif +#if CONFIG_TUNE_VMAF +#include "av1/encoder/tune_vmaf.h" +#endif +#if CONFIG_AV1_TEMPORAL_DENOISING +#include "av1/encoder/av1_temporal_denoiser.h" +#endif +#if CONFIG_TUNE_BUTTERAUGLI +#include "av1/encoder/tune_butteraugli.h" +#endif + +#include "aom/internal/aom_codec_internal.h" +#include "aom_util/aom_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// TODO(yunqing, any): Added suppression tag to quiet Doxygen warnings. Need to +// adjust it while we work on documentation. +/*!\cond */ +// Number of frames required to test for scene cut detection +#define SCENE_CUT_KEY_TEST_INTERVAL 16 + +// Lookahead index threshold to enable temporal filtering for second arf. +#define TF_LOOKAHEAD_IDX_THR 7 + +#define HDR_QP_LEVELS 10 +#define CHROMA_CB_QP_SCALE 1.04 +#define CHROMA_CR_QP_SCALE 1.04 +#define CHROMA_QP_SCALE -0.46 +#define CHROMA_QP_OFFSET 9.26 +#define QP_SCALE_FACTOR 2.0 +#define DISABLE_HDR_LUMA_DELTAQ 1 + +// Rational number with an int64 numerator +// This structure holds a fractional value +typedef struct aom_rational64 { + int64_t num; // fraction numerator + int den; // fraction denominator +} aom_rational64_t; // alias for struct aom_rational + +enum { + // Good Quality Fast Encoding. The encoder balances quality with the amount of + // time it takes to encode the output. Speed setting controls how fast. + GOOD, + // Realtime Fast Encoding. Will force some restrictions on bitrate + // constraints. + REALTIME, + // All intra mode. All the frames are coded as intra frames. 
+ ALLINTRA +} UENUM1BYTE(MODE); + +enum { + FRAMEFLAGS_KEY = 1 << 0, + FRAMEFLAGS_GOLDEN = 1 << 1, + FRAMEFLAGS_BWDREF = 1 << 2, + // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME + FRAMEFLAGS_ALTREF = 1 << 3, + FRAMEFLAGS_INTRAONLY = 1 << 4, + FRAMEFLAGS_SWITCH = 1 << 5, + FRAMEFLAGS_ERROR_RESILIENT = 1 << 6, +} UENUM1BYTE(FRAMETYPE_FLAGS); + +#if CONFIG_FPMT_TEST +enum { + PARALLEL_ENCODE = 0, + PARALLEL_SIMULATION_ENCODE, + NUM_FPMT_TEST_ENCODES +} UENUM1BYTE(FPMT_TEST_ENC_CFG); +#endif // CONFIG_FPMT_TEST +// 0 level frames are sometimes used for rate control purposes, but for +// reference mapping purposes, the minimum level should be 1. +#define MIN_PYR_LEVEL 1 +static INLINE int get_true_pyr_level(int frame_level, int frame_order, + int max_layer_depth) { + if (frame_order == 0) { + // Keyframe case + return MIN_PYR_LEVEL; + } else if (frame_level == MAX_ARF_LAYERS) { + // Leaves + return max_layer_depth; + } else if (frame_level == (MAX_ARF_LAYERS + 1)) { + // Altrefs + return MIN_PYR_LEVEL; + } + return AOMMAX(MIN_PYR_LEVEL, frame_level); +} + +enum { + NO_AQ = 0, + VARIANCE_AQ = 1, + COMPLEXITY_AQ = 2, + CYCLIC_REFRESH_AQ = 3, + AQ_MODE_COUNT // This should always be the last member of the enum +} UENUM1BYTE(AQ_MODE); +enum { + NO_DELTA_Q = 0, + DELTA_Q_OBJECTIVE = 1, // Modulation to improve objective quality + DELTA_Q_PERCEPTUAL = 2, // Modulation to improve video perceptual quality + DELTA_Q_PERCEPTUAL_AI = 3, // Perceptual quality opt for all intra mode + DELTA_Q_USER_RATING_BASED = 4, // User rating based delta q mode + DELTA_Q_HDR = 5, // QP adjustment based on HDR block pixel average + DELTA_Q_MODE_COUNT // This should always be the last member of the enum +} UENUM1BYTE(DELTAQ_MODE); + +enum { + RESIZE_NONE = 0, // No frame resizing allowed. + RESIZE_FIXED = 1, // All frames are coded at the specified scale. + RESIZE_RANDOM = 2, // All frames are coded at a random scale. + RESIZE_DYNAMIC = 3, // Frames coded at lower scale based on rate control. + RESIZE_MODES +} UENUM1BYTE(RESIZE_MODE); + +enum { + SS_CFG_SRC = 0, + SS_CFG_LOOKAHEAD = 1, + SS_CFG_FPF = 2, + SS_CFG_TOTAL = 3 +} UENUM1BYTE(SS_CFG_OFFSET); + +enum { + DISABLE_SCENECUT, // For LAP, lag_in_frames < 19 + ENABLE_SCENECUT_MODE_1, // For LAP, lag_in_frames >=19 and < 33 + ENABLE_SCENECUT_MODE_2 // For twopass and LAP - lag_in_frames >=33 +} UENUM1BYTE(SCENECUT_MODE); + +#define MAX_VBR_CORPUS_COMPLEXITY 10000 + +typedef enum { + MOD_FP, // First pass + MOD_TF, // Temporal filtering + MOD_TPL, // TPL + MOD_GME, // Global motion estimation + MOD_ENC, // Encode stage + MOD_LPF, // Deblocking loop filter + MOD_CDEF_SEARCH, // CDEF search + MOD_CDEF, // CDEF frame + MOD_LR, // Loop restoration filtering + MOD_PACK_BS, // Pack bitstream + MOD_FRAME_ENC, // Frame Parallel encode + MOD_AI, // All intra + NUM_MT_MODULES +} MULTI_THREADED_MODULES; + +/*!\endcond */ + +/*!\enum COST_UPDATE_TYPE + * \brief This enum controls how often the entropy costs should be updated. + * \warning In case of any modifications/additions done to the enum + * COST_UPDATE_TYPE, the enum INTERNAL_COST_UPDATE_TYPE needs to be updated as + * well. + */ +typedef enum { + COST_UPD_SB, /*!< Update every sb. */ + COST_UPD_SBROW, /*!< Update every sb rows inside a tile. */ + COST_UPD_TILE, /*!< Update every tile. */ + COST_UPD_OFF, /*!< Turn off cost updates. */ + NUM_COST_UPDATE_TYPES, /*!< Number of cost update types. 
*/ +} COST_UPDATE_TYPE; + +/*!\enum LOOPFILTER_CONTROL + * \brief This enum controls to which frames loopfilter is applied. + */ +typedef enum { + LOOPFILTER_NONE = 0, /*!< Disable loopfilter on all frames. */ + LOOPFILTER_ALL = 1, /*!< Enable loopfilter for all frames. */ + LOOPFILTER_REFERENCE = 2, /*!< Disable loopfilter on non reference frames. */ + LOOPFILTER_SELECTIVELY = + 3, /*!< Disable loopfilter on frames with low motion. */ +} LOOPFILTER_CONTROL; + +/*!\enum SKIP_APPLY_POSTPROC_FILTER + * \brief This enum controls the application of post-processing filters on a + * reconstructed frame. + */ +typedef enum { + SKIP_APPLY_RESTORATION = 1 << 0, + SKIP_APPLY_SUPERRES = 1 << 1, + SKIP_APPLY_CDEF = 1 << 2, + SKIP_APPLY_LOOPFILTER = 1 << 3, +} SKIP_APPLY_POSTPROC_FILTER; + +/*! + * \brief Encoder config related to resize. + */ +typedef struct { + /*! + * Indicates the frame resize mode to be used by the encoder. + */ + RESIZE_MODE resize_mode; + /*! + * Indicates the denominator for resize of inter frames, assuming 8 as the + * numerator. Its value ranges between 8-16. + */ + uint8_t resize_scale_denominator; + /*! + * Indicates the denominator for resize of key frames, assuming 8 as the + * numerator. Its value ranges between 8-16. + */ + uint8_t resize_kf_scale_denominator; +} ResizeCfg; + +/*! + * \brief Encoder config for coding block partitioning. + */ +typedef struct { + /*! + * Flag to indicate if rectanguar partitions should be enabled. + */ + bool enable_rect_partitions; + /*! + * Flag to indicate if AB partitions should be enabled. + */ + bool enable_ab_partitions; + /*! + * Flag to indicate if 1:4 / 4:1 partitions should be enabled. + */ + bool enable_1to4_partitions; + /*! + * Indicates the minimum partition size that should be allowed. Both width and + * height of a partition cannot be smaller than the min_partition_size. + */ + BLOCK_SIZE min_partition_size; + /*! + * Indicates the maximum partition size that should be allowed. Both width and + * height of a partition cannot be larger than the max_partition_size. + */ + BLOCK_SIZE max_partition_size; +} PartitionCfg; + +/*! + * \brief Encoder flags for intra prediction. + */ +typedef struct { + /*! + * Flag to indicate if intra edge filtering process should be enabled. + */ + bool enable_intra_edge_filter; + /*! + * Flag to indicate if recursive filtering based intra prediction should be + * enabled. + */ + bool enable_filter_intra; + /*! + * Flag to indicate if smooth intra prediction modes should be enabled. + */ + bool enable_smooth_intra; + /*! + * Flag to indicate if PAETH intra prediction mode should be enabled. + */ + bool enable_paeth_intra; + /*! + * Flag to indicate if CFL uv intra mode should be enabled. + */ + bool enable_cfl_intra; + /*! + * Flag to indicate if directional modes should be enabled. + */ + bool enable_directional_intra; + /*! + * Flag to indicate if the subset of directional modes from D45 to D203 intra + * should be enabled. Has no effect if directional modes are disabled. + */ + bool enable_diagonal_intra; + /*! + * Flag to indicate if delta angles for directional intra prediction should be + * enabled. + */ + bool enable_angle_delta; + /*! + * Flag to indicate whether to automatically turn off several intral coding + * tools. + * This flag is only used when "--deltaq-mode=3" is true. + * When set to 1, the encoder will analyze the reconstruction quality + * as compared to the source image in the preprocessing pass. 
+ * If the recontruction quality is considered high enough, we disable + * the following intra coding tools, for better encoding speed: + * "--enable_smooth_intra", + * "--enable_paeth_intra", + * "--enable_cfl_intra", + * "--enable_diagonal_intra". + */ + bool auto_intra_tools_off; +} IntraModeCfg; + +/*! + * \brief Encoder flags for transform sizes and types. + */ +typedef struct { + /*! + * Flag to indicate if 64-pt transform should be enabled. + */ + bool enable_tx64; + /*! + * Flag to indicate if flip and identity transform types should be enabled. + */ + bool enable_flip_idtx; + /*! + * Flag to indicate if rectangular transform should be enabled. + */ + bool enable_rect_tx; + /*! + * Flag to indicate whether or not to use a default reduced set for ext-tx + * rather than the potential full set of 16 transforms. + */ + bool reduced_tx_type_set; + /*! + * Flag to indicate if transform type for intra blocks should be limited to + * DCT_DCT. + */ + bool use_intra_dct_only; + /*! + * Flag to indicate if transform type for inter blocks should be limited to + * DCT_DCT. + */ + bool use_inter_dct_only; + /*! + * Flag to indicate if intra blocks should use default transform type + * (mode-dependent) only. + */ + bool use_intra_default_tx_only; + /*! + * Flag to indicate if transform size search should be enabled. + */ + bool enable_tx_size_search; +} TxfmSizeTypeCfg; + +/*! + * \brief Encoder flags for compound prediction modes. + */ +typedef struct { + /*! + * Flag to indicate if distance-weighted compound type should be enabled. + */ + bool enable_dist_wtd_comp; + /*! + * Flag to indicate if masked (wedge/diff-wtd) compound type should be + * enabled. + */ + bool enable_masked_comp; + /*! + * Flag to indicate if smooth interintra mode should be enabled. + */ + bool enable_smooth_interintra; + /*! + * Flag to indicate if difference-weighted compound type should be enabled. + */ + bool enable_diff_wtd_comp; + /*! + * Flag to indicate if inter-inter wedge compound type should be enabled. + */ + bool enable_interinter_wedge; + /*! + * Flag to indicate if inter-intra wedge compound type should be enabled. + */ + bool enable_interintra_wedge; +} CompoundTypeCfg; + +/*! + * \brief Encoder config related to frame super-resolution. + */ +typedef struct { + /*! + * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH + * mode is used for inter frames. + */ + int superres_qthresh; + /*! + * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH + * mode is used for key frames. + */ + int superres_kf_qthresh; + /*! + * Indicates the denominator of the fraction that specifies the ratio between + * the superblock width before and after upscaling for inter frames. The + * numerator of this fraction is equal to the constant SCALE_NUMERATOR. + */ + uint8_t superres_scale_denominator; + /*! + * Indicates the denominator of the fraction that specifies the ratio between + * the superblock width before and after upscaling for key frames. The + * numerator of this fraction is equal to the constant SCALE_NUMERATOR. + */ + uint8_t superres_kf_scale_denominator; + /*! + * Indicates the Super-resolution mode to be used by the encoder. + */ + aom_superres_mode superres_mode; + /*! + * Flag to indicate if super-resolution should be enabled for the sequence. + */ + bool enable_superres; +} SuperResCfg; + +/*! + * \brief Encoder config related to the coding of key frames. + */ +typedef struct { + /*! + * Indicates the minimum distance to a key frame. 
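+   * Setting this equal to key_freq_max results in a fixed keyframe interval.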
+ */ + int key_freq_min; + + /*! + * Indicates the maximum distance to a key frame. + */ + int key_freq_max; + + /*! + * Indicates if temporal filtering should be applied on keyframe. + */ + int enable_keyframe_filtering; + + /*! + * Indicates the number of frames after which a frame may be coded as an + * S-Frame. + */ + int sframe_dist; + + /*! + * Indicates how an S-Frame should be inserted. + * 1: the considered frame will be made into an S-Frame only if it is an + * altref frame. 2: the next altref frame will be made into an S-Frame. + */ + int sframe_mode; + + /*! + * Indicates if encoder should autodetect cut scenes and set the keyframes. + */ + bool auto_key; + + /*! + * Indicates the forward key frame distance. + */ + int fwd_kf_dist; + + /*! + * Indicates if forward keyframe reference should be enabled. + */ + bool fwd_kf_enabled; + + /*! + * Indicates if S-Frames should be enabled for the sequence. + */ + bool enable_sframe; + + /*! + * Indicates if intra block copy prediction mode should be enabled or not. + */ + bool enable_intrabc; +} KeyFrameCfg; + +/*! + * \brief Encoder rate control configuration parameters + */ +typedef struct { + /*!\cond */ + // BUFFERING PARAMETERS + /*!\endcond */ + /*! + * Indicates the amount of data that will be buffered by the decoding + * application prior to beginning playback, and is expressed in units of + * time(milliseconds). + */ + int64_t starting_buffer_level_ms; + /*! + * Indicates the amount of data that the encoder should try to maintain in the + * decoder's buffer, and is expressed in units of time(milliseconds). + */ + int64_t optimal_buffer_level_ms; + /*! + * Indicates the maximum amount of data that may be buffered by the decoding + * application, and is expressed in units of time(milliseconds). + */ + int64_t maximum_buffer_size_ms; + + /*! + * Indicates the bandwidth to be used in bits per second. + */ + int64_t target_bandwidth; + + /*! + * Indicates average complexity of the corpus in single pass vbr based on + * LAP. 0 indicates that corpus complexity vbr mode is disabled. + */ + unsigned int vbr_corpus_complexity_lap; + /*! + * Indicates the maximum allowed bitrate for any intra frame as % of bitrate + * target. + */ + unsigned int max_intra_bitrate_pct; + /*! + * Indicates the maximum allowed bitrate for any inter frame as % of bitrate + * target. + */ + unsigned int max_inter_bitrate_pct; + /*! + * Indicates the percentage of rate boost for golden frame in CBR mode. + */ + unsigned int gf_cbr_boost_pct; + /*! + * min_cr / 100 indicates the target minimum compression ratio for each + * frame. + */ + unsigned int min_cr; + /*! + * Indicates the frame drop threshold. + */ + int drop_frames_water_mark; + /*! + * under_shoot_pct indicates the tolerance of the VBR algorithm to + * undershoot and is used as a trigger threshold for more aggressive + * adaptation of Q. It's value can range from 0-100. + */ + int under_shoot_pct; + /*! + * over_shoot_pct indicates the tolerance of the VBR algorithm to overshoot + * and is used as a trigger threshold for more aggressive adaptation of Q. + * It's value can range from 0-1000. + */ + int over_shoot_pct; + /*! + * Indicates the maximum qindex that can be used by the quantizer i.e. the + * worst quality qindex. + */ + int worst_allowed_q; + /*! + * Indicates the minimum qindex that can be used by the quantizer i.e. the + * best quality qindex. + */ + int best_allowed_q; + /*! + * Indicates the Constant/Constrained Quality level. + */ + int cq_level; + /*! 
+ * Indicates if the encoding mode is vbr, cbr, constrained quality or + * constant quality. + */ + enum aom_rc_mode mode; + /*! + * Indicates the bias (expressed on a scale of 0 to 100) for determining + * target size for the current frame. The value 0 indicates the optimal CBR + * mode value should be used, and 100 indicates the optimal VBR mode value + * should be used. + */ + int vbrbias; + /*! + * Indicates the minimum bitrate to be used for a single frame as a percentage + * of the target bitrate. + */ + int vbrmin_section; + /*! + * Indicates the maximum bitrate to be used for a single frame as a percentage + * of the target bitrate. + */ + int vbrmax_section; +} RateControlCfg; + +/*!\cond */ +typedef struct { + // Indicates the number of frames lag before encoding is started. + int lag_in_frames; + // Indicates the minimum gf/arf interval to be used. + int min_gf_interval; + // Indicates the maximum gf/arf interval to be used. + int max_gf_interval; + // Indicates the minimum height for GF group pyramid structure to be used. + int gf_min_pyr_height; + // Indicates the maximum height for GF group pyramid structure to be used. + int gf_max_pyr_height; + // Indicates if automatic set and use of altref frames should be enabled. + bool enable_auto_arf; + // Indicates if automatic set and use of (b)ackward (r)ef (f)rames should be + // enabled. + bool enable_auto_brf; +} GFConfig; + +typedef struct { + // Indicates the number of tile groups. + unsigned int num_tile_groups; + // Indicates the MTU size for a tile group. If mtu is non-zero, + // num_tile_groups is set to DEFAULT_MAX_NUM_TG. + unsigned int mtu; + // Indicates the number of tile columns in log2. + int tile_columns; + // Indicates the number of tile rows in log2. + int tile_rows; + // Indicates the number of widths in the tile_widths[] array. + int tile_width_count; + // Indicates the number of heights in the tile_heights[] array. + int tile_height_count; + // Indicates the tile widths, and may be empty. + int tile_widths[MAX_TILE_COLS]; + // Indicates the tile heights, and may be empty. + int tile_heights[MAX_TILE_ROWS]; + // Indicates if large scale tile coding should be used. + bool enable_large_scale_tile; + // Indicates if single tile decoding mode should be enabled. + bool enable_single_tile_decoding; + // Indicates if EXT_TILE_DEBUG should be enabled. + bool enable_ext_tile_debug; +} TileConfig; + +typedef struct { + // Indicates the width of the input frame. + int width; + // Indicates the height of the input frame. + int height; + // If forced_max_frame_width is non-zero then it is used to force the maximum + // frame width written in write_sequence_header(). + int forced_max_frame_width; + // If forced_max_frame_width is non-zero then it is used to force the maximum + // frame height written in write_sequence_header(). + int forced_max_frame_height; + // Indicates the frame width after applying both super-resolution and resize + // to the coded frame. + int render_width; + // Indicates the frame height after applying both super-resolution and resize + // to the coded frame. + int render_height; +} FrameDimensionCfg; + +typedef struct { + // Indicates if warped motion should be enabled. + bool enable_warped_motion; + // Indicates if warped motion should be evaluated or not. + bool allow_warped_motion; + // Indicates if OBMC motion should be enabled. + bool enable_obmc; +} MotionModeCfg; + +typedef struct { + // Timing info for each frame. 
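+  // (aom_timing_info_t, declared in aom/aom_encoder.h, carries the display
+  // tick and time scale fields signalled in the sequence header.)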
+ aom_timing_info_t timing_info; + // Indicates the number of time units of a decoding clock. + uint32_t num_units_in_decoding_tick; + // Indicates if decoder model information is present in the coded sequence + // header. + bool decoder_model_info_present_flag; + // Indicates if display model information is present in the coded sequence + // header. + bool display_model_info_present_flag; + // Indicates if timing info for each frame is present. + bool timing_info_present; +} DecoderModelCfg; + +typedef struct { + // Indicates the update frequency for coeff costs. + COST_UPDATE_TYPE coeff; + // Indicates the update frequency for mode costs. + COST_UPDATE_TYPE mode; + // Indicates the update frequency for mv costs. + COST_UPDATE_TYPE mv; + // Indicates the update frequency for dv costs. + COST_UPDATE_TYPE dv; +} CostUpdateFreq; + +typedef struct { + // Indicates the maximum number of reference frames allowed per frame. + unsigned int max_reference_frames; + // Indicates if the reduced set of references should be enabled. + bool enable_reduced_reference_set; + // Indicates if one-sided compound should be enabled. + bool enable_onesided_comp; +} RefFrameCfg; + +typedef struct { + // Indicates the color space that should be used. + aom_color_primaries_t color_primaries; + // Indicates the characteristics of transfer function to be used. + aom_transfer_characteristics_t transfer_characteristics; + // Indicates the matrix coefficients to be used for the transfer function. + aom_matrix_coefficients_t matrix_coefficients; + // Indicates the chroma 4:2:0 sample position info. + aom_chroma_sample_position_t chroma_sample_position; + // Indicates if a limited color range or full color range should be used. + aom_color_range_t color_range; +} ColorCfg; + +typedef struct { + // Indicates if extreme motion vector unit test should be enabled or not. + unsigned int motion_vector_unit_test; + // Indicates if superblock multipass unit test should be enabled or not. + unsigned int sb_multipass_unit_test; +} UnitTestCfg; + +typedef struct { + // Indicates the file path to the VMAF model. + const char *vmaf_model_path; + // Indicates the path to the film grain parameters. + const char *film_grain_table_filename; + // Indicates the visual tuning metric. + aom_tune_metric tuning; + // Indicates if the current content is screen or default type. + aom_tune_content content; + // Indicates the film grain parameters. + int film_grain_test_vector; + // Indicates the in-block distortion metric to use. + aom_dist_metric dist_metric; +} TuneCfg; + +typedef struct { + // Indicates the framerate of the input video. + double init_framerate; + // Indicates the bit-depth of the input video. + unsigned int input_bit_depth; + // Indicates the maximum number of frames to be encoded. + unsigned int limit; + // Indicates the chrome subsampling x value. + unsigned int chroma_subsampling_x; + // Indicates the chrome subsampling y value. + unsigned int chroma_subsampling_y; +} InputCfg; + +typedef struct { + // If true, encoder will use fixed QP offsets, that are either: + // - Given by the user, and stored in 'fixed_qp_offsets' array, OR + // - Picked automatically from cq_level. + int use_fixed_qp_offsets; + // Indicates the minimum flatness of the quantization matrix. + int qm_minlevel; + // Indicates the maximum flatness of the quantization matrix. + int qm_maxlevel; + // Indicates if adaptive quantize_b should be enabled. + int quant_b_adapt; + // Indicates the Adaptive Quantization mode to be used. 
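+  // (one of NO_AQ, VARIANCE_AQ, COMPLEXITY_AQ or CYCLIC_REFRESH_AQ from the
+  // AQ_MODE enum above.)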
+ AQ_MODE aq_mode; + // Indicates the delta q mode to be used. + DELTAQ_MODE deltaq_mode; + // Indicates the delta q mode strength. + DELTAQ_MODE deltaq_strength; + // Indicates if delta quantization should be enabled in chroma planes. + bool enable_chroma_deltaq; + // Indicates if delta quantization should be enabled for hdr video + bool enable_hdr_deltaq; + // Indicates if encoding with quantization matrices should be enabled. + bool using_qm; +} QuantizationCfg; + +/*!\endcond */ +/*! + * \brief Algorithm configuration parameters. + */ +typedef struct { + /*! + * Controls the level at which rate-distortion optimization of transform + * coefficients favours sharpness in the block. Has no impact on RD when set + * to zero (default). For values 1-7, eob and skip block optimization are + * avoided and rdmult is adjusted in favour of block sharpness. + */ + int sharpness; + + /*! + * Indicates the trellis optimization mode of quantized coefficients. + * 0: disabled + * 1: enabled + * 2: enabled for rd search + * 3: true for estimate yrd search + */ + int disable_trellis_quant; + + /*! + * The maximum number of frames used to create an arf. + */ + int arnr_max_frames; + + /*! + * The temporal filter strength for arf used when creating ARFs. + */ + int arnr_strength; + + /*! + * Indicates the CDF update mode + * 0: no update + * 1: update on every frame(default) + * 2: selectively update + */ + uint8_t cdf_update_mode; + + /*! + * Indicates if RDO based on frame temporal dependency should be enabled. + */ + bool enable_tpl_model; + + /*! + * Indicates if coding of overlay frames for filtered ALTREF frames is + * enabled. + */ + bool enable_overlay; + + /*! + * Controls loop filtering + * 0: Loop filter is disabled for all frames + * 1: Loop filter is enabled for all frames + * 2: Loop filter is disabled for non-reference frames + * 3: Loop filter is disables for the frames with low motion + */ + LOOPFILTER_CONTROL loopfilter_control; + + /*! + * Indicates if the application of post-processing filters should be skipped + * on reconstructed frame. + */ + bool skip_postproc_filtering; +} AlgoCfg; +/*!\cond */ + +typedef struct { + // Indicates the codec bit-depth. + aom_bit_depth_t bit_depth; + // Indicates the superblock size that should be used by the encoder. + aom_superblock_size_t superblock_size; + // Indicates if loopfilter modulation should be enabled. + bool enable_deltalf_mode; + // Indicates how CDEF should be applied. + CDEF_CONTROL cdef_control; + // Indicates if loop restoration filter should be enabled. + bool enable_restoration; + // When enabled, video mode should be used even for single frame input. + bool force_video_mode; + // Indicates if the error resiliency features should be enabled. + bool error_resilient_mode; + // Indicates if frame parallel decoding feature should be enabled. + bool frame_parallel_decoding_mode; + // Indicates if the input should be encoded as monochrome. + bool enable_monochrome; + // When enabled, the encoder will use a full header even for still pictures. + // When disabled, a reduced header is used for still pictures. + bool full_still_picture_hdr; + // Indicates if dual interpolation filters should be enabled. + bool enable_dual_filter; + // Indicates if frame order hint should be enabled or not. + bool enable_order_hint; + // Indicates if ref_frame_mvs should be enabled at the sequence level. + bool ref_frame_mvs_present; + // Indicates if ref_frame_mvs should be enabled at the frame level. 
+ bool enable_ref_frame_mvs; + // Indicates if interintra compound mode is enabled. + bool enable_interintra_comp; + // Indicates if global motion should be enabled. + bool enable_global_motion; + // Indicates if palette should be enabled. + bool enable_palette; +} ToolCfg; + +/*!\endcond */ +/*! + * \brief Main encoder configuration data structure. + */ +typedef struct AV1EncoderConfig { + /*!\cond */ + // Configuration related to the input video. + InputCfg input_cfg; + + // Configuration related to frame-dimensions. + FrameDimensionCfg frm_dim_cfg; + + /*!\endcond */ + /*! + * Encoder algorithm configuration. + */ + AlgoCfg algo_cfg; + + /*! + * Configuration related to key-frames. + */ + KeyFrameCfg kf_cfg; + + /*! + * Rate control configuration + */ + RateControlCfg rc_cfg; + /*!\cond */ + + // Configuration related to Quantization. + QuantizationCfg q_cfg; + + // Internal frame size scaling. + ResizeCfg resize_cfg; + + // Frame Super-Resolution size scaling. + SuperResCfg superres_cfg; + + /*!\endcond */ + /*! + * stats_in buffer contains all of the stats packets produced in the first + * pass, concatenated. + */ + aom_fixed_buf_t twopass_stats_in; + /*!\cond */ + + // Configuration related to encoder toolsets. + ToolCfg tool_cfg; + + // Configuration related to Group of frames. + GFConfig gf_cfg; + + // Tile related configuration parameters. + TileConfig tile_cfg; + + // Configuration related to Tune. + TuneCfg tune_cfg; + + // Configuration related to color. + ColorCfg color_cfg; + + // Configuration related to decoder model. + DecoderModelCfg dec_model_cfg; + + // Configuration related to reference frames. + RefFrameCfg ref_frm_cfg; + + // Configuration related to unit tests. + UnitTestCfg unit_test_cfg; + + // Flags related to motion mode. + MotionModeCfg motion_mode_cfg; + + // Flags related to intra mode search. + IntraModeCfg intra_mode_cfg; + + // Flags related to transform size/type. + TxfmSizeTypeCfg txfm_cfg; + + // Flags related to compound type. + CompoundTypeCfg comp_type_cfg; + + // Partition related information. + PartitionCfg part_cfg; + + // Configuration related to frequency of cost update. + CostUpdateFreq cost_upd_freq; + +#if CONFIG_DENOISE + // Indicates the noise level. + float noise_level; + // Indicates the the denoisers block size. + int noise_block_size; + // Indicates whether to apply denoising to the frame to be encoded + int enable_dnl_denoising; +#endif + +#if CONFIG_AV1_TEMPORAL_DENOISING + // Noise sensitivity. + int noise_sensitivity; +#endif + // Bit mask to specify which tier each of the 32 possible operating points + // conforms to. + unsigned int tier_mask; + + // Indicates the number of pixels off the edge of a reference frame we're + // allowed to go when forming an inter prediction. + int border_in_pixels; + + // Indicates the maximum number of threads that may be used by the encoder. + int max_threads; + + // Indicates the speed preset to be used. + int speed; + + // Indicates the target sequence level index for each operating point(OP). + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + + // Indicates the bitstream profile to be used. + BITSTREAM_PROFILE profile; + + /*!\endcond */ + /*! + * Indicates the current encoder pass : + * AOM_RC_ONE_PASS = One pass encode, + * AOM_RC_FIRST_PASS = First pass of multiple-pass + * AOM_RC_SECOND_PASS = Second pass of multiple-pass + * AOM_RC_THIRD_PASS = Third pass of multiple-pass + */ + enum aom_enc_pass pass; + /*!\cond */ + + // Total number of encoding passes. 
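+  // (1 for single-pass encoding, 2 or 3 for multi-pass; the pass field above
+  // identifies the pass currently being run.)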
+ int passes; + + // the name of the second pass output file when passes > 2 + const char *two_pass_output; + + // the name of the second pass log file when passes > 2 + const char *second_pass_log; + + // Indicates if the encoding is GOOD or REALTIME. + MODE mode; + + // Indicates if row-based multi-threading should be enabled or not. + bool row_mt; + + // Indicates if frame parallel multi-threading should be enabled or not. + bool fp_mt; + + // Indicates if 16bit frame buffers are to be used i.e., the content is > + // 8-bit. + bool use_highbitdepth; + + // Indicates the bitstream syntax mode. 0 indicates bitstream is saved as + // Section 5 bitstream, while 1 indicates the bitstream is saved in Annex - B + // format. + bool save_as_annexb; + + // The path for partition stats reading and writing, used in the experiment + // CONFIG_PARTITION_SEARCH_ORDER. + const char *partition_info_path; + + // The flag that indicates whether we use an external rate distribution to + // guide adaptive quantization. It requires --deltaq-mode=3. The rate + // distribution map file name is stored in |rate_distribution_info|. + unsigned int enable_rate_guide_deltaq; + + // The input file of rate distribution information used in all intra mode + // to determine delta quantization. + const char *rate_distribution_info; + + // Exit the encoder when it fails to encode to a given level. + int strict_level_conformance; + + // Max depth for the GOP after a key frame + int kf_max_pyr_height; + + // A flag to control if we enable the superblock qp sweep for a given lambda + int sb_qp_sweep; + /*!\endcond */ +} AV1EncoderConfig; + +/*!\cond */ +static INLINE int is_lossless_requested(const RateControlCfg *const rc_cfg) { + return rc_cfg->best_allowed_q == 0 && rc_cfg->worst_allowed_q == 0; +} +/*!\endcond */ + +/*! + * \brief Encoder-side probabilities for pruning of various AV1 tools + */ +typedef struct { + /*! + * obmc_probs[i][j] is the probability of OBMC being the best motion mode for + * jth block size and ith frame update type, averaged over past frames. If + * obmc_probs[i][j] < thresh, then OBMC search is pruned. + */ + int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL]; + + /*! + * warped_probs[i] is the probability of warped motion being the best motion + * mode for ith frame update type, averaged over past frames. If + * warped_probs[i] < thresh, then warped motion search is pruned. + */ + int warped_probs[FRAME_UPDATE_TYPES]; + + /*! + * tx_type_probs[i][j][k] is the probability of kth tx_type being the best + * for jth transform size and ith frame update type, averaged over past + * frames. If tx_type_probs[i][j][k] < thresh, then transform search for that + * type is pruned. + */ + int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES]; + + /*! + * switchable_interp_probs[i][j][k] is the probability of kth interpolation + * filter being the best for jth filter context and ith frame update type, + * averaged over past frames. If switchable_interp_probs[i][j][k] < thresh, + * then interpolation filter search is pruned for that case. 
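+   * For example, switchable_interp_probs[ARF_UPDATE][ctx][EIGHTTAP_REGULAR]
+   * is the probability, averaged over past frames, that the regular 8-tap
+   * filter is the best choice in filter context ctx for ARF updates.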
+ */ + int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; +} FrameProbInfo; + +/*!\cond */ + +typedef struct FRAME_COUNTS { +// Note: This structure should only contain 'unsigned int' fields, or +// aggregates built solely from 'unsigned int' fields/elements +#if CONFIG_ENTROPY_STATS + unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES]; + unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; + unsigned int cfl_sign[CFL_JOINT_SIGNS]; + unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE]; + unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2]; + unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_y_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int palette_uv_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2]; + unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][2]; + unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2]; + unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS] + [2]; + unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2]; + unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5]; + unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6]; + unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7]; + unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8]; + unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9]; + unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10]; + unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11]; + unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [LEVEL_CONTEXTS][BR_CDF_SIZE]; + unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2]; + unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1]; + unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2]; + unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2]; + unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2]; + unsigned int drl_mode[DRL_MODE_CONTEXTS][2]; + unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; + unsigned int wedge_idx[BLOCK_SIZES_ALL][16]; + unsigned int interintra[BLOCK_SIZE_GROUPS][2]; + unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; + unsigned int wedge_interintra[BLOCK_SIZES_ALL][2]; + unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; + unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES]; + unsigned int obmc[BLOCK_SIZES_ALL][2]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2]; + unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2]; + unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2]; + unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2]; + unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2]; + unsigned 
int intrabc[2]; + + unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2]; + unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1]; + unsigned int skip_mode[SKIP_MODE_CONTEXTS][2]; + unsigned int skip_txfm[SKIP_CONTEXTS][2]; + unsigned int compound_index[COMP_INDEX_CONTEXTS][2]; + unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2]; + unsigned int delta_q[DELTA_Q_PROBS][2]; + unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2]; + unsigned int delta_lf[DELTA_LF_PROBS][2]; + + unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; + unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [TX_TYPES]; + unsigned int filter_intra_mode[FILTER_INTRA_MODES]; + unsigned int filter_intra[BLOCK_SIZES_ALL][2]; + unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES]; + unsigned int wiener_restore[2]; + unsigned int sgrproj_restore[2]; +#endif // CONFIG_ENTROPY_STATS + + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; +} FRAME_COUNTS; + +#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 + +typedef struct { + int ready; + double a; + double b; + double dist_mean; + double ld_mean; + double sse_mean; + double sse_sse_mean; + double sse_ld_mean; + int num; + double dist_sum; + double ld_sum; + double sse_sum; + double sse_sse_sum; + double sse_ld_sum; +} InterModeRdModel; + +typedef struct { + int idx; + int64_t rd; +} RdIdxPair; +// TODO(angiebird): This is an estimated size. We still need to figure what is +// the maximum number of modes. +#define MAX_INTER_MODES 1024 +// TODO(any): rename this struct to something else. There is already another +// struct called inter_mode_info, which makes this terribly confusing. +/*!\endcond */ +/*! + * \brief Struct used to hold inter mode data for fast tx search. + * + * This struct is used to perform a full transform search only on winning + * candidates searched with an estimate for transform coding RD. + */ +typedef struct inter_modes_info { + /*! + * The number of inter modes for which data was stored in each of the + * following arrays. + */ + int num; + /*! + * Mode info struct for each of the candidate modes. + */ + MB_MODE_INFO mbmi_arr[MAX_INTER_MODES]; + /*! + * The rate for each of the candidate modes. + */ + int mode_rate_arr[MAX_INTER_MODES]; + /*! + * The sse of the predictor for each of the candidate modes. + */ + int64_t sse_arr[MAX_INTER_MODES]; + /*! + * The estimated rd of the predictor for each of the candidate modes. + */ + int64_t est_rd_arr[MAX_INTER_MODES]; + /*! + * The rate and mode index for each of the candidate modes. + */ + RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES]; + /*! + * The full rd stats for each of the candidate modes. + */ + RD_STATS rd_cost_arr[MAX_INTER_MODES]; + /*! + * The full rd stats of luma only for each of the candidate modes. + */ + RD_STATS rd_cost_y_arr[MAX_INTER_MODES]; + /*! + * The full rd stats of chroma only for each of the candidate modes. + */ + RD_STATS rd_cost_uv_arr[MAX_INTER_MODES]; +} InterModesInfo; + +/*!\cond */ +typedef struct { + // TODO(kyslov): consider changing to 64bit + + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 32x32 (with 4x4 avg). + // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32 + // * 32 = 2^26). 
For high bitdepth we need to consider changing this to 64 bit + uint32_t sum_square_error; + int32_t sum_error; + int log2_count; + int variance; +} VPartVar; + +typedef struct { + VPartVar none; + VPartVar horz[2]; + VPartVar vert[2]; +} VPVariance; + +typedef struct { + VPVariance part_variances; + VPartVar split[4]; +} VP4x4; + +typedef struct { + VPVariance part_variances; + VP4x4 split[4]; +} VP8x8; + +typedef struct { + VPVariance part_variances; + VP8x8 split[4]; +} VP16x16; + +typedef struct { + VPVariance part_variances; + VP16x16 split[4]; +} VP32x32; + +typedef struct { + VPVariance part_variances; + VP32x32 split[4]; +} VP64x64; + +typedef struct { + VPVariance part_variances; + VP64x64 *split; +} VP128x128; + +/*!\endcond */ + +/*! + * \brief Thresholds for variance based partitioning. + */ +typedef struct { + /*! + * If block variance > threshold, then that block is forced to split. + * thresholds[0] - threshold for 128x128; + * thresholds[1] - threshold for 64x64; + * thresholds[2] - threshold for 32x32; + * thresholds[3] - threshold for 16x16; + * thresholds[4] - threshold for 8x8; + */ + int64_t thresholds[5]; + + /*! + * MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual + * minmax > threshold_minmax, the 16x16 is forced to split. + */ + int64_t threshold_minmax; +} VarBasedPartitionInfo; + +/*! + * \brief Encoder parameters for synchronization of row based multi-threading + */ +typedef struct { +#if CONFIG_MULTITHREAD + /** + * \name Synchronization objects for top-right dependency. + */ + /**@{*/ + pthread_mutex_t *mutex_; /*!< Mutex lock object */ + pthread_cond_t *cond_; /*!< Condition variable */ + /**@}*/ +#endif // CONFIG_MULTITHREAD + /*! + * Buffer to store the superblock whose encoding is complete. + * num_finished_cols[i] stores the number of superblocks which finished + * encoding in the ith superblock row. + */ + int *num_finished_cols; + /*! + * Denotes the superblock interval at which conditional signalling should + * happen. Also denotes the minimum number of extra superblocks of the top row + * to be complete to start encoding the current superblock. A value of 1 + * indicates top-right dependency. + */ + int sync_range; + /*! + * Denotes the additional number of superblocks in the previous row to be + * complete to start encoding the current superblock when intraBC tool is + * enabled. This additional top-right delay is required to satisfy the + * hardware constraints for intraBC tool when row multithreading is enabled. + */ + int intrabc_extra_top_right_sb_delay; + /*! + * Number of superblock rows. + */ + int rows; + /*! + * The superblock row (in units of MI blocks) to be processed next. + */ + int next_mi_row; + /*! + * Number of threads processing the current tile. + */ + int num_threads_working; +} AV1EncRowMultiThreadSync; + +/*!\cond */ + +// TODO(jingning) All spatially adaptive variables should go to TileDataEnc. 
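+
+// Illustrative helper (hypothetical, for exposition only): maps a square
+// block size to its index in VarBasedPartitionInfo::thresholds[], following
+// the ordering documented above (128x128 -> index 0 down to 8x8 -> index 4).
+static INLINE int var_part_threshold_idx(BLOCK_SIZE bsize) {
+  switch (bsize) {
+    case BLOCK_128X128: return 0;
+    case BLOCK_64X64: return 1;
+    case BLOCK_32X32: return 2;
+    case BLOCK_16X16: return 3;
+    case BLOCK_8X8: return 4;
+    default: return -1;  // Non-square or unsupported sizes have no threshold.
+  }
+}
+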
+typedef struct TileDataEnc { + TileInfo tile_info; + DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + FRAME_CONTEXT *row_ctx; + uint64_t abs_sum_level; + uint8_t allow_update_cdf; + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + AV1EncRowMultiThreadSync row_mt_sync; + MV firstpass_top_mv; +} TileDataEnc; + +typedef struct RD_COUNTS { + int compound_ref_used_flag; + int skip_mode_used_flag; + int tx_type_used[TX_SIZES_ALL][TX_TYPES]; + int obmc_used[BLOCK_SIZES_ALL][2]; + int warped_used[2]; + int newmv_or_intra_blocks; + uint64_t seg_tmp_pred_cost[2]; +} RD_COUNTS; + +typedef struct ThreadData { + MACROBLOCK mb; + MvCosts *mv_costs_alloc; + IntraBCMVCosts *dv_costs_alloc; + RD_COUNTS rd_counts; + FRAME_COUNTS *counts; + PC_TREE_SHARED_BUFFERS shared_coeff_buf; + SIMPLE_MOTION_DATA_TREE *sms_tree; + SIMPLE_MOTION_DATA_TREE *sms_root; + uint32_t *hash_value_buffer[2][2]; + OBMCBuffer obmc_buffer; + PALETTE_BUFFER *palette_buffer; + CompoundTypeRdBuffers comp_rd_buffer; + CONV_BUF_TYPE *tmp_conv_dst; + uint64_t abs_sum_level; + uint8_t *tmp_pred_bufs[2]; + uint8_t *wiener_tmp_pred_buf; + int intrabc_used; + int deltaq_used; + int coefficient_size; + int max_mv_magnitude; + int interp_filter_selected[SWITCHABLE]; + FRAME_CONTEXT *tctx; + VP64x64 *vt64x64; + int32_t num_64x64_blocks; + PICK_MODE_CONTEXT *firstpass_ctx; + TemporalFilterData tf_data; + TplBuffers tpl_tmp_buffers; + TplTxfmStats tpl_txfm_stats; + GlobalMotionData gm_data; + // Pointer to the array of structures to store gradient information of each + // pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level + // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV). + PixelLevelGradientInfo *pixel_gradient_info; + // Pointer to the array of structures to store source variance information of + // each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to + // store source variance and log of source variance of each 4x4 sub-block + // for subsequent retrieval. + Block4x4VarInfo *src_var_info_of_4x4_sub_blocks; + // Pointer to pc tree root. + PC_TREE *pc_root; +} ThreadData; + +struct EncWorkerData; + +/*!\endcond */ + +/*! + * \brief Encoder data related to row-based multi-threading + */ +typedef struct { + /*! + * Number of tile rows for which row synchronization memory is allocated. + */ + int allocated_tile_rows; + /*! + * Number of tile cols for which row synchronization memory is allocated. + */ + int allocated_tile_cols; + /*! + * Number of rows for which row synchronization memory is allocated + * per tile. During first-pass/look-ahead stage this equals the + * maximum number of macroblock rows in a tile. During encode stage, + * this equals the maximum number of superblock rows in a tile. + */ + int allocated_rows; + /*! + * Number of columns for which entropy context memory is allocated + * per tile. During encode stage, this equals the maximum number of + * superblock columns in a tile minus 1. The entropy context memory + * is not allocated during first-pass/look-ahead stage. + */ + int allocated_cols; + + /*! + * thread_id_to_tile_id[i] indicates the tile id assigned to the ith thread. + */ + int thread_id_to_tile_id[MAX_NUM_THREADS]; + + /*! + * num_tile_cols_done[i] indicates the number of tile columns whose encoding + * is complete in the ith superblock row. + */ + int *num_tile_cols_done; + + /*! + * Number of superblock rows in a frame for which 'num_tile_cols_done' is + * allocated. + */ + int allocated_sb_rows; + + /*! 
+ * Initialized to false, set to true by the worker thread that encounters an + * error in order to abort the processing of other worker threads. + */ + bool row_mt_exit; + + /*! + * Initialized to false, set to true during first pass encoding by the worker + * thread that encounters an error in order to abort the processing of other + * worker threads. + */ + bool firstpass_mt_exit; + + /*! + * Initialized to false, set to true in cal_mb_wiener_var_hook() by the worker + * thread that encounters an error in order to abort the processing of other + * worker threads. + */ + bool mb_wiener_mt_exit; + +#if CONFIG_MULTITHREAD + /*! + * Mutex lock used while dispatching jobs. + */ + pthread_mutex_t *mutex_; + /*! + * Condition variable used to dispatch loopfilter jobs. + */ + pthread_cond_t *cond_; +#endif + + /** + * \name Row synchronization related function pointers. + */ + /**@{*/ + /*! + * Reader. + */ + void (*sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int); + /*! + * Writer. + */ + void (*sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int); + /**@}*/ +} AV1EncRowMultiThreadInfo; + +/*! + * \brief Encoder data related to multi-threading for allintra deltaq-mode=3 + */ +typedef struct { +#if CONFIG_MULTITHREAD + /*! + * Mutex lock used while dispatching jobs. + */ + pthread_mutex_t *mutex_; + /*! + * Condition variable used to dispatch loopfilter jobs. + */ + pthread_cond_t *cond_; +#endif + + /** + * \name Row synchronization related function pointers for all intra mode + */ + /**@{*/ + /*! + * Reader. + */ + void (*intra_sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int); + /*! + * Writer. + */ + void (*intra_sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int); + /**@}*/ +} AV1EncAllIntraMultiThreadInfo; + +/*! + * \brief Max number of recodes used to track the frame probabilities. + */ +#define NUM_RECODES_PER_FRAME 10 + +/*! + * \brief Max number of frames that can be encoded in a parallel encode set. + */ +#define MAX_PARALLEL_FRAMES 4 + +/*! + * \brief Buffers to be backed up during parallel encode set to be restored + * later. + */ +typedef struct RestoreStateBuffers { + /*! + * Backup of original CDEF srcbuf. + */ + uint16_t *cdef_srcbuf; + + /*! + * Backup of original CDEF colbuf. + */ + uint16_t *cdef_colbuf[MAX_MB_PLANE]; + + /*! + * Backup of original LR rst_tmpbuf. + */ + int32_t *rst_tmpbuf; + + /*! + * Backup of original LR rlbs. + */ + RestorationLineBuffers *rlbs; +} RestoreStateBuffers; + +/*! + * \brief Parameters related to restoration types. + */ +typedef struct { + /*! + * Stores the best coefficients for Wiener restoration. + */ + WienerInfo wiener; + + /*! + * Stores the best coefficients for Sgrproj restoration. + */ + SgrprojInfo sgrproj; + + /*! + * The rtype to use for this unit given a frame rtype as index. Indices: + * WIENER, SGRPROJ, SWITCHABLE. + */ + RestorationType best_rtype[RESTORE_TYPES - 1]; +} RestUnitSearchInfo; + +/*! + * \brief Structure to hold search parameter per restoration unit and + * intermediate buffer of Wiener filter used in pick filter stage of Loop + * restoration. + */ +typedef struct { + /*! + * Array of pointers to 'RestUnitSearchInfo' which holds data related to + * restoration types. + */ + RestUnitSearchInfo *rusi[MAX_MB_PLANE]; + + /*! + * Buffer used to hold dgd-avg data during SIMD call of Wiener filter. + */ + int16_t *dgd_avg; +} AV1LrPickStruct; + +/*! + * \brief Primary Encoder parameters related to multi-threading. + */ +typedef struct PrimaryMultiThreadInfo { + /*! 
+ * Number of workers created for multi-threading. + */ + int num_workers; + + /*! + * Number of workers used for different MT modules. + */ + int num_mod_workers[NUM_MT_MODULES]; + + /*! + * Synchronization object used to launch job in the worker thread. + */ + AVxWorker *workers; + + /*! + * Data specific to each worker in encoder multi-threading. + * tile_thr_data[i] stores the worker data of the ith thread. + */ + struct EncWorkerData *tile_thr_data; + + /*! + * CDEF row multi-threading data. + */ + AV1CdefWorkerData *cdef_worker; + + /*! + * Primary(Level 1) Synchronization object used to launch job in the worker + * thread. + */ + AVxWorker *p_workers[MAX_PARALLEL_FRAMES]; + + /*! + * Number of primary workers created for multi-threading. + */ + int p_num_workers; + + /*! + * Tracks the number of workers in encode stage multi-threading. + */ + int prev_num_enc_workers; +} PrimaryMultiThreadInfo; + +/*! + * \brief Encoder parameters related to multi-threading. + */ +typedef struct MultiThreadInfo { + /*! + * Number of workers created for multi-threading. + */ + int num_workers; + + /*! + * Number of workers used for different MT modules. + */ + int num_mod_workers[NUM_MT_MODULES]; + + /*! + * Synchronization object used to launch job in the worker thread. + */ + AVxWorker *workers; + + /*! + * Data specific to each worker in encoder multi-threading. + * tile_thr_data[i] stores the worker data of the ith thread. + */ + struct EncWorkerData *tile_thr_data; + + /*! + * When set, indicates that row based multi-threading of the encoder is + * enabled. + */ + bool row_mt_enabled; + + /*! + * When set, indicates that multi-threading for bitstream packing is enabled. + */ + bool pack_bs_mt_enabled; + + /*! + * Encoder row multi-threading data. + */ + AV1EncRowMultiThreadInfo enc_row_mt; + + /*! + * Encoder multi-threading data for allintra mode in the preprocessing stage + * when --deltaq-mode=3. + */ + AV1EncAllIntraMultiThreadInfo intra_mt; + + /*! + * Tpl row multi-threading data. + */ + AV1TplRowMultiThreadInfo tpl_row_mt; + + /*! + * Loop Filter multi-threading object. + */ + AV1LfSync lf_row_sync; + + /*! + * Loop Restoration multi-threading object. + */ + AV1LrSync lr_row_sync; + + /*! + * Pack bitstream multi-threading object. + */ + AV1EncPackBSSync pack_bs_sync; + + /*! + * Global Motion multi-threading object. + */ + AV1GlobalMotionSync gm_sync; + + /*! + * Temporal Filter multi-threading object. + */ + AV1TemporalFilterSync tf_sync; + + /*! + * CDEF search multi-threading object. + */ + AV1CdefSync cdef_sync; + + /*! + * Pointer to CDEF row multi-threading data for the frame. + */ + AV1CdefWorkerData *cdef_worker; + + /*! + * Buffers to be stored/restored before/after parallel encode. + */ + RestoreStateBuffers restore_state_buf; + + /*! + * In multi-threaded realtime encoding with row-mt enabled, pipeline + * loop-filtering after encoding. + */ + int pipeline_lpf_mt_with_enc; +} MultiThreadInfo; + +/*!\cond */ + +typedef struct ActiveMap { + int enabled; + int update; + unsigned char *map; +} ActiveMap; + +/*!\endcond */ + +/*! + * \brief Encoder info used for decision on forcing integer motion vectors. + */ +typedef struct { + /*! + * cs_rate_array[i] is the fraction of blocks in a frame which either match + * with the collocated block or are smooth, where i is the rate_index. + */ + double cs_rate_array[32]; + /*! + * rate_index is used to index cs_rate_array. + */ + int rate_index; + /*! + * rate_size is the total number of entries populated in cs_rate_array. 
+ */ + int rate_size; +} ForceIntegerMVInfo; + +/*!\cond */ + +#if CONFIG_INTERNAL_STATS +// types of stats +enum { + STAT_Y, + STAT_U, + STAT_V, + STAT_ALL, + NUM_STAT_TYPES // This should always be the last member of the enum +} UENUM1BYTE(StatType); + +typedef struct IMAGE_STAT { + double stat[NUM_STAT_TYPES]; + double worst; +} ImageStat; +#endif // CONFIG_INTERNAL_STATS + +typedef struct { + int ref_count; + YV12_BUFFER_CONFIG buf; +} EncRefCntBuffer; + +/*!\endcond */ + +/*! + * \brief Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level + * + * This is used for bitstream preparation. + */ +typedef struct { + /*! + * frame_base[mi_row * stride + mi_col] stores the mode information of + * block (mi_row,mi_col). + */ + MB_MODE_INFO_EXT_FRAME *frame_base; + /*! + * Size of frame_base buffer. + */ + int alloc_size; + /*! + * Stride of frame_base buffer. + */ + int stride; +} MBMIExtFrameBufferInfo; + +/*!\cond */ + +#if CONFIG_COLLECT_PARTITION_STATS +typedef struct FramePartitionTimingStats { + int partition_decisions[6][EXT_PARTITION_TYPES]; + int partition_attempts[6][EXT_PARTITION_TYPES]; + int64_t partition_times[6][EXT_PARTITION_TYPES]; + + int partition_redo; +} FramePartitionTimingStats; +#endif // CONFIG_COLLECT_PARTITION_STATS + +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "aom_ports/aom_timer.h" +// Adjust the following to add new components. +enum { + av1_encode_strategy_time, + av1_get_one_pass_rt_params_time, + av1_get_second_pass_params_time, + denoise_and_encode_time, + apply_filtering_time, + av1_tpl_setup_stats_time, + encode_frame_to_data_rate_time, + encode_with_or_without_recode_time, + loop_filter_time, + cdef_time, + loop_restoration_time, + av1_pack_bitstream_final_time, + av1_encode_frame_time, + av1_compute_global_motion_time, + av1_setup_motion_field_time, + encode_sb_row_time, + + rd_pick_partition_time, + rd_use_partition_time, + choose_var_based_partitioning_time, + av1_prune_partitions_time, + none_partition_search_time, + split_partition_search_time, + rectangular_partition_search_time, + ab_partitions_search_time, + rd_pick_4partition_time, + encode_sb_time, + + rd_pick_sb_modes_time, + av1_rd_pick_intra_mode_sb_time, + av1_rd_pick_inter_mode_sb_time, + set_params_rd_pick_inter_mode_time, + skip_inter_mode_time, + handle_inter_mode_time, + evaluate_motion_mode_for_winner_candidates_time, + do_tx_search_time, + handle_intra_mode_time, + refine_winner_mode_tx_time, + av1_search_palette_mode_time, + handle_newmv_time, + compound_type_rd_time, + interpolation_filter_search_time, + motion_mode_rd_time, + + nonrd_use_partition_time, + pick_sb_modes_nonrd_time, + hybrid_intra_mode_search_time, + nonrd_pick_inter_mode_sb_time, + encode_b_nonrd_time, + + kTimingComponents, +} UENUM1BYTE(TIMING_COMPONENT); + +static INLINE char const *get_component_name(int index) { + switch (index) { + case av1_encode_strategy_time: return "av1_encode_strategy_time"; + case av1_get_one_pass_rt_params_time: + return "av1_get_one_pass_rt_params_time"; + case av1_get_second_pass_params_time: + return "av1_get_second_pass_params_time"; + case denoise_and_encode_time: return "denoise_and_encode_time"; + case apply_filtering_time: return "apply_filtering_time"; + case av1_tpl_setup_stats_time: return "av1_tpl_setup_stats_time"; + case encode_frame_to_data_rate_time: + return "encode_frame_to_data_rate_time"; + case encode_with_or_without_recode_time: + return "encode_with_or_without_recode_time"; + case loop_filter_time: return "loop_filter_time"; + case cdef_time: 
return "cdef_time"; + case loop_restoration_time: return "loop_restoration_time"; + case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time"; + case av1_encode_frame_time: return "av1_encode_frame_time"; + case av1_compute_global_motion_time: + return "av1_compute_global_motion_time"; + case av1_setup_motion_field_time: return "av1_setup_motion_field_time"; + case encode_sb_row_time: return "encode_sb_row_time"; + + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_use_partition_time: return "rd_use_partition_time"; + case choose_var_based_partitioning_time: + return "choose_var_based_partitioning_time"; + case av1_prune_partitions_time: return "av1_prune_partitions_time"; + case none_partition_search_time: return "none_partition_search_time"; + case split_partition_search_time: return "split_partition_search_time"; + case rectangular_partition_search_time: + return "rectangular_partition_search_time"; + case ab_partitions_search_time: return "ab_partitions_search_time"; + case rd_pick_4partition_time: return "rd_pick_4partition_time"; + case encode_sb_time: return "encode_sb_time"; + + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case av1_rd_pick_intra_mode_sb_time: + return "av1_rd_pick_intra_mode_sb_time"; + case av1_rd_pick_inter_mode_sb_time: + return "av1_rd_pick_inter_mode_sb_time"; + case set_params_rd_pick_inter_mode_time: + return "set_params_rd_pick_inter_mode_time"; + case skip_inter_mode_time: return "skip_inter_mode_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case evaluate_motion_mode_for_winner_candidates_time: + return "evaluate_motion_mode_for_winner_candidates_time"; + case do_tx_search_time: return "do_tx_search_time"; + case handle_intra_mode_time: return "handle_intra_mode_time"; + case refine_winner_mode_tx_time: return "refine_winner_mode_tx_time"; + case av1_search_palette_mode_time: return "av1_search_palette_mode_time"; + case handle_newmv_time: return "handle_newmv_time"; + case compound_type_rd_time: return "compound_type_rd_time"; + case interpolation_filter_search_time: + return "interpolation_filter_search_time"; + case motion_mode_rd_time: return "motion_mode_rd_time"; + + case nonrd_use_partition_time: return "nonrd_use_partition_time"; + case pick_sb_modes_nonrd_time: return "pick_sb_modes_nonrd_time"; + case hybrid_intra_mode_search_time: return "hybrid_intra_mode_search_time"; + case nonrd_pick_inter_mode_sb_time: return "nonrd_pick_inter_mode_sb_time"; + case encode_b_nonrd_time: return "encode_b_nonrd_time"; + + default: assert(0); + } + return "error"; +} +#endif + +// The maximum number of internal ARFs except ALTREF_FRAME +#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1) + +/*!\endcond */ + +/*! + * \brief Parameters related to global motion search + */ +typedef struct { + /*! + * Flag to indicate if global motion search needs to be rerun. + */ + bool search_done; + + /*! + * Array of pointers to the frame buffers holding the reference frames. + * ref_buf[i] stores the pointer to the reference frame of the ith + * reference frame type. + */ + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES]; + + /*! + * Holds the number of valid reference frames in past and future directions + * w.r.t. the current frame. num_ref_frames[i] stores the total number of + * valid reference frames in 'i' direction. + */ + int num_ref_frames[MAX_DIRECTIONS]; + + /*! 
+ * Array of structures which store the valid reference frames in past and
+ * future directions and their corresponding distance from the source frame.
+ * reference_frames[i][j] holds the jth valid reference frame type in the
+ * direction 'i' and its temporal distance from the source frame.
+ */
+ FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1];
+
+ /**
+ * \name Dimensions for which segment map is allocated.
+ */
+ /**@{*/
+ int segment_map_w; /*!< segment map width */
+ int segment_map_h; /*!< segment map height */
+ /**@}*/
+} GlobalMotionInfo;
+
+/*!
+ * \brief Flags related to interpolation filter search
+ */
+typedef struct {
+ /*!
+ * Stores the default value of the skip flag depending on chroma format.
+ * Set as 1 for monochrome and 3 for other color formats.
+ */
+ int default_interp_skip_flags;
+ /*!
+ * Filter mask to allow certain interp_filter type.
+ */
+ uint16_t interp_filter_search_mask;
+} InterpSearchFlags;
+
+/*!
+ * \brief Parameters for motion vector search process
+ */
+typedef struct {
+ /*!
+ * Largest MV component used in a frame.
+ * The value from the previous frame is used to set the full pixel search
+ * range for the current frame.
+ */
+ int max_mv_magnitude;
+ /*!
+ * Parameter indicating initial search window to be used in full-pixel search.
+ * Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+ */
+ int mv_step_param;
+ /*!
+ * Pointer to sub-pixel search function.
+ * In encoder: av1_find_best_sub_pixel_tree
+ * av1_find_best_sub_pixel_tree_pruned
+ * av1_find_best_sub_pixel_tree_pruned_more
+ * In MV unit test: av1_return_max_sub_pixel_mv
+ * av1_return_min_sub_pixel_mv
+ */
+ fractional_mv_step_fp *find_fractional_mv_step;
+ /*!
+ * Search site configuration for full-pel MV search.
+ * search_site_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop, simple
+ * motion search.
+ * search_site_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC and temporal filter.
+ * search_site_cfg[SS_CFG_FPF]: Used during first pass and lookahead.
+ */
+ search_site_config search_site_cfg[SS_CFG_TOTAL][NUM_DISTINCT_SEARCH_METHODS];
+} MotionVectorSearchParams;
+
+/*!
+ * \brief Refresh frame flags for different types of frames.
+ *
+ * If the refresh flag is true for a particular reference frame, after the
+ * current frame is encoded, the reference frame gets refreshed (updated) to
+ * be the current frame. Note: Usually at most one flag will be set to true at
+ * a time. But, for key-frames, all flags are set to true at once.
+ */
+typedef struct {
+ bool golden_frame; /*!< Refresh flag for golden frame */
+ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+} RefreshFrameInfo;
+
+/*!
+ * \brief Desired dimensions for an externally triggered resize.
+ *
+ * When resize is triggered externally, the desired dimensions are stored in
+ * this struct until used in the next frame to be coded. These values are
+ * effective only for one frame and are reset after they are used.
+ */
+typedef struct {
+ int width; /*!< Desired resized width */
+ int height; /*!< Desired resized height */
+} ResizePendingParams;
+
+/*!
+ * \brief Reference frame distance related variables.
+ */
+typedef struct {
+ /*!
+ * True relative distance of reference frames w.r.t. the current frame.
+ */
+ int ref_relative_dist[INTER_REFS_PER_FRAME];
+ /*!
+ * The nearest reference w.r.t. current frame in the past.
+ */
+ int8_t nearest_past_ref;
+ /*!
+ * The nearest reference w.r.t. current frame in the future.
+ */
+ int8_t nearest_future_ref;
+} RefFrameDistanceInfo;
+
+/*!
+ * \brief Parameters used for winner mode processing.
+ *
+ * This is a basic two pass approach: in the first pass, we reduce the number of
+ * transform searches based on some thresholds during the rdopt process to find
+ * the "winner mode". In the second pass, we perform a more thorough tx search
+ * on the winner mode.
+ * There are some arrays in the struct, and their indices are used in the
+ * following manner:
+ * Index 0: Default mode evaluation, Winner mode processing is not applicable
+ * (e.g., IntraBC).
+ * Index 1: Mode evaluation.
+ * Index 2: Winner mode evaluation.
+ * Indices 1 and 2 are only used when the respective speed feature is on.
+ */
+typedef struct {
+ /*!
+ * Threshold to determine if trellis optimization is to be enabled
+ * based on:
+ * 0 : dist threshold
+ * 1 : satd threshold
+ * Corresponds to enable_winner_mode_for_coeff_opt speed feature.
+ */
+ unsigned int coeff_opt_thresholds[MODE_EVAL_TYPES][2];
+
+ /*!
+ * Determines the tx size search method during rdopt.
+ * Corresponds to enable_winner_mode_for_tx_size_srch speed feature.
+ */
+ TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
+
+ /*!
+ * Controls how often we should approximate prediction error with tx
+ * coefficients. If it's 0, then never. If 1, then it's during the tx_type
+ * search only. If 2, then always.
+ * Corresponds to tx_domain_dist_level speed feature.
+ */
+ unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
+
+ /*!
+ * Threshold to approximate pixel domain distortion with transform domain
+ * distortion. This is only used if use_transform_domain_distortion is on.
+ * Corresponds to enable_winner_mode_for_use_tx_domain_dist speed feature.
+ */
+ unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+ /*!
+ * Controls how often we should try to skip the transform process based on
+ * result from dct.
+ * Corresponds to use_skip_flag_prediction speed feature.
+ */
+ unsigned int skip_txfm_level[MODE_EVAL_TYPES];
+
+ /*!
+ * Predict DC only txfm blocks for default, mode and winner mode evaluation.
+ * Index 0: Default mode evaluation, Winner mode processing is not applicable.
+ * Index 1: Mode evaluation, Index 2: Winner mode evaluation.
+ */
+ unsigned int predict_dc_level[MODE_EVAL_TYPES];
+} WinnerModeParams;
+
+/*!
+ * \brief Frame refresh flags set by the external interface.
+ *
+ * Flags set by external interface to determine which reference buffers are
+ * refreshed by this frame. When set, the encoder will update the particular
+ * reference frame buffer with the contents of the current frame.
+ */
+typedef struct {
+ bool last_frame; /*!< Refresh flag for last frame */
+ bool golden_frame; /*!< Refresh flag for golden frame */
+ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+ bool alt2_ref_frame; /*!< Refresh flag for alt2-ref frame */
+ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+ /*!
+ * Flag indicating if the update of refresh frame flags is pending.
+ */
+ bool update_pending;
+} ExtRefreshFrameFlagsInfo;
+
+/*!
+ * \brief Flags signalled by the external interface at frame level.
+ */
+typedef struct {
+ /*!
+ * Bit mask to disable certain reference frame types.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * Frame refresh flags set by the external interface.
+ */
+ ExtRefreshFrameFlagsInfo refresh_frame;
+
+ /*!
+ * Flag to enable the update of frame contexts at the end of a frame decode.
+ */
+ bool refresh_frame_context;
+
+ /*!
+ * Flag to indicate that update of refresh_frame_context from external + * interface is pending. + */ + bool refresh_frame_context_pending; + + /*! + * Flag to enable temporal MV prediction. + */ + bool use_ref_frame_mvs; + + /*! + * Indicates whether the current frame is to be coded as error resilient. + */ + bool use_error_resilient; + + /*! + * Indicates whether the current frame is to be coded as s-frame. + */ + bool use_s_frame; + + /*! + * Indicates whether the current frame's primary_ref_frame is set to + * PRIMARY_REF_NONE. + */ + bool use_primary_ref_none; +} ExternalFlags; + +/*!\cond */ + +typedef struct { + // Some misc info + int high_prec; + int q; + int order; + + // MV counters + int inter_count; + int intra_count; + int default_mvs; + int mv_joint_count[4]; + int last_bit_zero; + int last_bit_nonzero; + + // Keep track of the rates + int total_mv_rate; + int hp_total_mv_rate; + int lp_total_mv_rate; + + // Texture info + int horz_text; + int vert_text; + int diag_text; + + // Whether the current struct contains valid data + int valid; +} MV_STATS; + +typedef struct WeberStats { + int64_t mb_wiener_variance; + int64_t src_variance; + int64_t rec_variance; + int16_t src_pix_max; + int16_t rec_pix_max; + int64_t distortion; + int64_t satd; + double max_scale; +} WeberStats; + +typedef struct { + struct loopfilter lf; + CdefInfo cdef_info; + YV12_BUFFER_CONFIG copy_buffer; + RATE_CONTROL rc; + MV_STATS mv_stats; +} CODING_CONTEXT; + +typedef struct { + int frame_width; + int frame_height; + int mi_rows; + int mi_cols; + int mb_rows; + int mb_cols; + int num_mbs; + aom_bit_depth_t bit_depth; + int subsampling_x; + int subsampling_y; +} FRAME_INFO; + +/*! + * \brief This structure stores different types of frame indices. + */ +typedef struct { + int show_frame_count; +} FRAME_INDEX_SET; + +/*!\endcond */ + +/*! + * \brief Segmentation related information for the current frame. + */ +typedef struct { + /*! + * 3-bit number containing the segment affiliation for each 4x4 block in the + * frame. map[y * stride + x] contains the segment id of the 4x4 block at + * (x,y) position. + */ + uint8_t *map; + /*! + * Flag to indicate if current frame has lossless segments or not. + * 1: frame has at least one lossless segment. + * 0: frame has no lossless segments. + */ + bool has_lossless_segment; +} EncSegmentationInfo; + +/*! + * \brief Frame time stamps. + */ +typedef struct { + /*! + * Start time stamp of the previous frame + */ + int64_t prev_ts_start; + /*! + * End time stamp of the previous frame + */ + int64_t prev_ts_end; + /*! + * Start time stamp of the first frame + */ + int64_t first_ts_start; +} TimeStamps; + +/*! + * Pointers to the memory allocated for frame level transform coeff related + * info. + */ +typedef struct { + /*! + * Pointer to the transformed coefficients buffer. + */ + tran_low_t *tcoeff; + /*! + * Pointer to the eobs buffer. + */ + uint16_t *eobs; + /*! + * Pointer to the entropy_ctx buffer. 
+ */ + uint8_t *entropy_ctx; +} CoeffBufferPool; + +#if !CONFIG_REALTIME_ONLY +/*!\cond */ +// DUCKY_ENCODE_FRAME_MODE is c version of EncodeFrameMode +enum { + DUCKY_ENCODE_FRAME_MODE_NONE, // Let native AV1 determine q index and rdmult + DUCKY_ENCODE_FRAME_MODE_QINDEX, // DuckyEncode determines q index and AV1 + // determines rdmult + DUCKY_ENCODE_FRAME_MODE_QINDEX_RDMULT, // DuckyEncode determines q index and + // rdmult +} UENUM1BYTE(DUCKY_ENCODE_FRAME_MODE); + +enum { + DUCKY_ENCODE_GOP_MODE_NONE, // native AV1 decides GOP + DUCKY_ENCODE_GOP_MODE_RCL, // rate control lib decides GOP +} UENUM1BYTE(DUCKY_ENCODE_GOP_MODE); + +typedef struct DuckyEncodeFrameInfo { + DUCKY_ENCODE_FRAME_MODE qp_mode; + DUCKY_ENCODE_GOP_MODE gop_mode; + int q_index; + int rdmult; + // These two arrays are equivalent to std::vector + int *superblock_encode_qindex; + int *superblock_encode_rdmult; + int delta_q_enabled; +} DuckyEncodeFrameInfo; + +typedef struct DuckyEncodeFrameResult { + int global_order_idx; + int q_index; + int rdmult; + int rate; + int64_t dist; + double psnr; +} DuckyEncodeFrameResult; + +typedef struct DuckyEncodeInfo { + DuckyEncodeFrameInfo frame_info; + DuckyEncodeFrameResult frame_result; +} DuckyEncodeInfo; +/*!\endcond */ +#endif + +/*!\cond */ +typedef struct RTC_REF { + /*! + * LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + * BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + */ + int reference[INTER_REFS_PER_FRAME]; + int ref_idx[INTER_REFS_PER_FRAME]; + int refresh[REF_FRAMES]; + int set_ref_frame_config; + int non_reference_frame; + int ref_frame_comp[3]; + int gld_idx_1layer; + /*! + * Frame number of the last frame that refreshed the buffer slot. + */ + unsigned int buffer_time_index[REF_FRAMES]; + /*! + * Spatial layer id of the last frame that refreshed the buffer slot. + */ + unsigned char buffer_spatial_layer[REF_FRAMES]; + /*! + * Flag to indicate whether closest reference was the previous frame. + */ + bool reference_was_previous_frame; + /*! + * Flag to indicate this frame is based on longer term reference only, + * for recovery from past loss, and it should be biased for improved coding. + */ + bool bias_recovery_frame; +} RTC_REF; +/*!\endcond */ + +/*! + * \brief Structure to hold data corresponding to an encoded frame. + */ +typedef struct AV1_COMP_DATA { + /*! + * Buffer to store packed bitstream data of a frame. + */ + unsigned char *cx_data; + + /*! + * Allocated size of the cx_data buffer. + */ + size_t cx_data_sz; + + /*! + * Size of data written in the cx_data buffer. + */ + size_t frame_size; + + /*! + * Flags for the frame. + */ + unsigned int lib_flags; + + /*! + * Time stamp for start of frame. + */ + int64_t ts_frame_start; + + /*! + * Time stamp for end of frame. + */ + int64_t ts_frame_end; + + /*! + * Flag to indicate flush call. + */ + int flush; + + /*! + * Time base for sequence. + */ + const aom_rational64_t *timestamp_ratio; + + /*! + * Decide to pop the source for this frame from input buffer queue. + */ + int pop_lookahead; + + /*! + * Display order hint of frame whose packed data is in cx_data buffer. + */ + int frame_display_order_hint; +} AV1_COMP_DATA; + +/*! + * \brief Top level primary encoder structure + */ +typedef struct AV1_PRIMARY { + /*! + * Array of frame level encoder stage top level structures + */ + struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES]; + + /*! + * Array of structures to hold data of frames encoded in a given parallel + * encode set. 
+ */ + struct AV1_COMP_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1]; +#if CONFIG_FPMT_TEST + /*! + * Flag which enables/disables simulation path for fpmt unit test. + * 0 - FPMT integration + * 1 - FPMT simulation + */ + FPMT_TEST_ENC_CFG fpmt_unit_test_cfg; + + /*! + * Temporary variable simulating the delayed frame_probability update. + */ + FrameProbInfo temp_frame_probs; + + /*! + * Temporary variable holding the updated frame probability across + * frames. Copy its value to temp_frame_probs for frame_parallel_level 0 + * frames or last frame in parallel encode set. + */ + FrameProbInfo temp_frame_probs_simulation; + + /*! + * Temporary variable simulating the delayed update of valid global motion + * model across frames. + */ + int temp_valid_gm_model_found[FRAME_UPDATE_TYPES]; +#endif // CONFIG_FPMT_TEST + /*! + * Copy of cm->ref_frame_map maintained to facilitate sequential update of + * ref_frame_map by lower layer depth frames encoded ahead of time in a + * parallel encode set. + */ + RefCntBuffer *ref_frame_map_copy[REF_FRAMES]; + + /*! + * Start time stamp of the last encoded show frame + */ + int64_t ts_start_last_show_frame; + + /*! + * End time stamp of the last encoded show frame + */ + int64_t ts_end_last_show_frame; + + /*! + * Number of frame level contexts(cpis) + */ + int num_fp_contexts; + + /*! + * Loopfilter levels of the previous encoded frame. + */ + int filter_level[2]; + + /*! + * Chrominance component loopfilter level of the previous encoded frame. + */ + int filter_level_u; + + /*! + * Chrominance component loopfilter level of the previous encoded frame. + */ + int filter_level_v; + + /*! + * Encode stage top level structure + * During frame parallel encode, this is the same as parallel_cpi[0] + */ + struct AV1_COMP *cpi; + + /*! + * Lookahead processing stage top level structure + */ + struct AV1_COMP *cpi_lap; + + /*! + * Look-ahead context. + */ + struct lookahead_ctx *lookahead; + + /*! + * Sequence parameters have been transmitted already and locked + * or not. Once locked av1_change_config cannot change the seq + * parameters. + */ + int seq_params_locked; + + /*! + * Pointer to internal utility functions that manipulate aom_codec_* data + * structures. + */ + struct aom_codec_pkt_list *output_pkt_list; + + /*! + * When set, indicates that internal ARFs are enabled. + */ + int internal_altref_allowed; + + /*! + * Tell if OVERLAY frame shows existing alt_ref frame. + */ + int show_existing_alt_ref; + + /*! + * Information related to a gf group. + */ + GF_GROUP gf_group; + + /*! + * Track prior gf group state. + */ + GF_STATE gf_state; + + /*! + * Flag indicating whether look ahead processing (LAP) is enabled. + */ + int lap_enabled; + + /*! + * Parameters for AV1 bitstream levels. + */ + AV1LevelParams level_params; + + /*! + * Calculates PSNR on each frame when set to 1. + */ + int b_calculate_psnr; + + /*! + * Number of frames left to be encoded, is 0 if limit is not set. + */ + int frames_left; + + /*! + * Information related to two pass encoding. + */ + TWO_PASS twopass; + + /*! + * Rate control related parameters. + */ + PRIMARY_RATE_CONTROL p_rc; + + /*! + * Info and resources used by temporal filtering. + */ + TEMPORAL_FILTER_INFO tf_info; + /*! + * Elements part of the sequence header, that are applicable for all the + * frames in the video. + */ + SequenceHeader seq_params; + + /*! + * Indicates whether to use SVC. + */ + int use_svc; + + /*! + * If true, buffer removal times are present. + */ + bool buffer_removal_time_present; + + /*! 
+ * Number of temporal layers: may be > 1 for SVC (scalable video coding).
+ */
+ unsigned int number_temporal_layers;
+
+ /*!
+ * Number of spatial layers: may be > 1 for SVC (scalable video coding).
+ */
+ unsigned int number_spatial_layers;
+
+ /*!
+ * Code and details about current error status.
+ */
+ struct aom_internal_error_info error;
+
+ /*!
+ * Function pointers to variants of sse/sad/variance computation functions.
+ * fn_ptr[i] indicates the list of function pointers corresponding to block
+ * size i.
+ */
+ aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+ /*!
+ * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+ * the ith 16 x 16 block in raster scan order.
+ */
+ double *tpl_sb_rdmult_scaling_factors;
+
+ /*!
+ * Parameters related to tpl.
+ */
+ TplParams tpl_data;
+
+ /*!
+ * Motion vector stats of the previous encoded frame.
+ */
+ MV_STATS mv_stats;
+
+#if CONFIG_INTERNAL_STATS
+ /*!\cond */
+ uint64_t total_time_receive_data;
+ uint64_t total_time_compress_data;
+
+ unsigned int total_mode_chosen_counts[MAX_MODES];
+
+ int count[2];
+ uint64_t total_sq_error[2];
+ uint64_t total_samples[2];
+ ImageStat psnr[2];
+
+ double total_blockiness;
+ double worst_blockiness;
+
+ int total_bytes;
+ double summed_quality;
+ double summed_weights;
+ double summed_quality_hbd;
+ double summed_weights_hbd;
+ unsigned int total_recode_hits;
+ double worst_ssim;
+ double worst_ssim_hbd;
+
+ ImageStat fastssim;
+ ImageStat psnrhvs;
+
+ int b_calculate_blockiness;
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
+ /*!\endcond */
+#endif
+
+#if CONFIG_ENTROPY_STATS
+ /*!
+ * Aggregates frame counts for the sequence.
+ */
+ FRAME_COUNTS aggregate_fc;
+#endif // CONFIG_ENTROPY_STATS
+
+ /*!
+ * For each type of reference frame, this contains the index of a reference
+ * frame buffer for a reference frame of the same type. We use this to
+ * choose our primary reference frame (which is the most recent reference
+ * frame of the same type as the current frame).
+ */
+ int fb_of_context_type[REF_FRAMES];
+
+ /*!
+ * Primary Multi-threading parameters.
+ */
+ PrimaryMultiThreadInfo p_mt_info;
+
+ /*!
+ * Probabilities for pruning of various AV1 tools.
+ */
+ FrameProbInfo frame_probs;
+
+ /*!
+ * Indicates if a valid global motion model has been found in the different
+ * frame update types of a GF group.
+ * valid_gm_model_found[i] indicates if a valid global motion model has been
+ * found in the frame update type with enum value equal to i.
+ */
+ int valid_gm_model_found[FRAME_UPDATE_TYPES];
+
+ /*!
+ * Struct for the reference structure for RTC.
+ */
+ RTC_REF rtc_ref;
+
+ /*!
+ * Struct for all intra mode row multi threading in the preprocess stage
+ * when --deltaq-mode=3.
+ */
+ AV1EncRowMultiThreadSync intra_row_mt_sync;
+} AV1_PRIMARY;
+
+/*!
+ * \brief Top level encoder structure.
+ */
+typedef struct AV1_COMP {
+ /*!
+ * Pointer to top level primary encoder structure
+ */
+ AV1_PRIMARY *ppi;
+
+ /*!
+ * Quantization and dequantization parameters for internal quantizer setup
+ * in the encoder.
+ */
+ EncQuantDequantParams enc_quant_dequant_params;
+
+ /*!
+ * Structure holding thread specific variables.
+ */
+ ThreadData td;
+
+ /*!
+ * Statistics collected at frame level.
+ */
+ FRAME_COUNTS counts;
+
+ /*!
+ * Holds buffer storing mode information at 4x4/8x8 level.
+ */
+ MBMIExtFrameBufferInfo mbmi_ext_info;
+
+ /*!
+ * Buffer holding the transform block related information. + * coeff_buffer_base[i] stores the transform block related information of the + * ith superblock in raster scan order. + */ + CB_COEFF_BUFFER *coeff_buffer_base; + + /*! + * Structure holding pointers to frame level memory allocated for transform + * block related information. + */ + CoeffBufferPool coeff_buffer_pool; + + /*! + * Structure holding variables common to encoder and decoder. + */ + AV1_COMMON common; + + /*! + * Encoder configuration related parameters. + */ + AV1EncoderConfig oxcf; + + /*! + * Stores the trellis optimization type at segment level. + * optimize_seg_arr[i] stores the trellis opt type for ith segment. + */ + TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS]; + + /*! + * Pointer to the frame buffer holding the source frame to be used during the + * current stage of encoding. It can be the raw input, temporally filtered + * input or scaled input. + */ + YV12_BUFFER_CONFIG *source; + + /*! + * Pointer to the frame buffer holding the last raw source frame. + * last_source is NULL for the following cases: + * 1) First frame + * 2) Alt-ref frames + * 3) All frames for all-intra frame encoding. + */ + YV12_BUFFER_CONFIG *last_source; + + /*! + * Pointer to the frame buffer holding the unscaled source frame. + * It can be either the raw input or temporally filtered input. + */ + YV12_BUFFER_CONFIG *unscaled_source; + + /*! + * Frame buffer holding the resized source frame (cropping / superres). + */ + YV12_BUFFER_CONFIG scaled_source; + + /*! + * Pointer to the frame buffer holding the unscaled last source frame. + */ + YV12_BUFFER_CONFIG *unscaled_last_source; + + /*! + * Frame buffer holding the resized last source frame. + */ + YV12_BUFFER_CONFIG scaled_last_source; + + /*! + * Pointer to the original source frame. This is used to determine if the + * content is screen. + */ + YV12_BUFFER_CONFIG *unfiltered_source; + + /*! + * Frame buffer holding the orig source frame for PSNR calculation in rtc tf + * case. + */ + YV12_BUFFER_CONFIG orig_source; + + /*! + * Skip tpl setup when tpl data from gop length decision can be reused. + */ + int skip_tpl_setup_stats; + + /*! + * Scaling factors used in the RD multiplier modulation. + * TODO(sdeng): consider merge the following arrays. + * tpl_rdmult_scaling_factors is a temporary buffer used to store the + * intermediate scaling factors which are used in the calculation of + * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the + * intermediate scaling factor of the ith 16 x 16 block in raster scan order. + */ + double *tpl_rdmult_scaling_factors; + + /*! + * Temporal filter context. + */ + TemporalFilterCtx tf_ctx; + + /*! + * Pointer to CDEF search context. + */ + CdefSearchCtx *cdef_search_ctx; + + /*! + * Variables related to forcing integer mv decisions for the current frame. + */ + ForceIntegerMVInfo force_intpel_info; + + /*! + * Pointer to the buffer holding the scaled reference frames. + * scaled_ref_buf[i] holds the scaled reference frame of type i. + */ + RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME]; + + /*! + * Pointer to the buffer holding the last show frame. + */ + RefCntBuffer *last_show_frame_buf; + + /*! + * Refresh frame flags for golden, bwd-ref and alt-ref frames. + */ + RefreshFrameInfo refresh_frame; + + /*! + * Flag to reduce the number of reference frame buffers used in rt. + */ + int rt_reduce_num_ref_buffers; + + /*! + * Flags signalled by the external interface at frame level. 
+ */
+ ExternalFlags ext_flags;
+
+ /*!
+ * Temporary frame buffer used to store the non-loop filtered reconstructed
+ * frame during the search of loop filter level.
+ */
+ YV12_BUFFER_CONFIG last_frame_uf;
+
+ /*!
+ * Temporary frame buffer used to store the loop restored frame during loop
+ * restoration search.
+ */
+ YV12_BUFFER_CONFIG trial_frame_rst;
+
+ /*!
+ * Ambient reconstruction err target for force key frames.
+ */
+ int64_t ambient_err;
+
+ /*!
+ * Parameters related to rate distortion optimization.
+ */
+ RD_OPT rd;
+
+ /*!
+ * Temporary coding context used to save and restore when encoding with and
+ * without super-resolution.
+ */
+ CODING_CONTEXT coding_context;
+
+ /*!
+ * Parameters related to global motion search.
+ */
+ GlobalMotionInfo gm_info;
+
+ /*!
+ * Parameters related to winner mode processing.
+ */
+ WinnerModeParams winner_mode_params;
+
+ /*!
+ * Frame time stamps.
+ */
+ TimeStamps time_stamps;
+
+ /*!
+ * Rate control related parameters.
+ */
+ RATE_CONTROL rc;
+
+ /*!
+ * Frame rate of the video.
+ */
+ double framerate;
+
+ /*!
+ * Bitmask indicating which reference buffers may be referenced by this frame.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * speed is passed as a per-frame parameter into the encoder.
+ */
+ int speed;
+
+ /*!
+ * sf contains fine-grained config set internally based on speed.
+ */
+ SPEED_FEATURES sf;
+
+ /*!
+ * Parameters for motion vector search process.
+ */
+ MotionVectorSearchParams mv_search_params;
+
+ /*!
+ * When set, indicates that all reference frames are forward references,
+ * i.e., all the reference frames are output before the current frame.
+ */
+ int all_one_sided_refs;
+
+ /*!
+ * Segmentation related information for current frame.
+ */
+ EncSegmentationInfo enc_seg;
+
+ /*!
+ * Parameters related to cyclic refresh aq-mode.
+ */
+ CYCLIC_REFRESH *cyclic_refresh;
+ /*!
+ * Parameters related to active map. Active maps indicate
+ * if there is any activity on a 4x4 block basis.
+ */
+ ActiveMap active_map;
+
+ /*!
+ * The frame processing order within a GOP.
+ */
+ unsigned char gf_frame_index;
+
+#if CONFIG_INTERNAL_STATS
+ /*!\cond */
+ uint64_t time_compress_data;
+
+ unsigned int mode_chosen_counts[MAX_MODES];
+ int bytes;
+ unsigned int frame_recode_hits;
+ /*!\endcond */
+#endif
+
+#if CONFIG_SPEED_STATS
+ /*!
+ * For debugging: number of transform searches we have performed.
+ */
+ unsigned int tx_search_count;
+#endif // CONFIG_SPEED_STATS
+
+ /*!
+ * When set, indicates that the frame is droppable, i.e., this frame
+ * does not update any reference buffers.
+ */
+ int droppable;
+
+ /*!
+ * Stores the frame parameters during encoder initialization.
+ */
+ FRAME_INFO frame_info;
+
+ /*!
+ * Stores different types of frame indices.
+ */
+ FRAME_INDEX_SET frame_index_set;
+
+ /*!
+ * Stores the cm->width in the last call of alloc_compressor_data(). Helps
+ * determine whether compressor data should be reallocated when cm->width
+ * changes.
+ */
+ int data_alloc_width;
+
+ /*!
+ * Stores the cm->height in the last call of alloc_compressor_data(). Helps
+ * determine whether compressor data should be reallocated when cm->height
+ * changes.
+ */
+ int data_alloc_height;
+
+ /*!
+ * Number of MBs in the full-size frame; to be used to
+ * normalize the firstpass stats. This will differ from the
+ * number of MBs in the current frame when the frame is
+ * scaled.
+ */
+ int initial_mbs;
+
+ /*!
+ * Flag to indicate whether the frame size information has been
+ * set up and propagated to associated allocations.
+ */ + bool frame_size_related_setup_done; + + /*! + * The width of the frame that is lastly encoded. + * It is updated in the function "encoder_encode()". + */ + int last_coded_width; + + /*! + * The height of the frame that is lastly encoded. + * It is updated in the function "encoder_encode()". + */ + int last_coded_height; + + /*! + * Resize related parameters. + */ + ResizePendingParams resize_pending_params; + + /*! + * Pointer to struct holding adaptive data/contexts/models for the tile during + * encoding. + */ + TileDataEnc *tile_data; + /*! + * Number of tiles for which memory has been allocated for tile_data. + */ + int allocated_tiles; + + /*! + * Structure to store the palette token related information. + */ + TokenInfo token_info; + + /*! + * VARIANCE_AQ segment map refresh. + */ + int vaq_refresh; + + /*! + * Thresholds for variance based partitioning. + */ + VarBasedPartitionInfo vbp_info; + + /*! + * Number of recodes in the frame. + */ + int num_frame_recode; + + /*! + * Current frame probability of parallel frames, across recodes. + */ + FrameProbInfo frame_new_probs[NUM_RECODES_PER_FRAME]; + + /*! + * Retain condition for transform type frame_probability calculation + */ + int do_update_frame_probs_txtype[NUM_RECODES_PER_FRAME]; + + /*! + * Retain condition for obmc frame_probability calculation + */ + int do_update_frame_probs_obmc[NUM_RECODES_PER_FRAME]; + + /*! + * Retain condition for warped motion frame_probability calculation + */ + int do_update_frame_probs_warp[NUM_RECODES_PER_FRAME]; + + /*! + * Retain condition for interpolation filter frame_probability calculation + */ + int do_update_frame_probs_interpfilter[NUM_RECODES_PER_FRAME]; + +#if CONFIG_FPMT_TEST + /*! + * Temporary variable for simulation. + * Previous frame's framerate. + */ + double temp_framerate; +#endif + /*! + * Updated framerate for the current parallel frame. + * cpi->framerate is updated with new_framerate during + * post encode updates for parallel frames. + */ + double new_framerate; + + /*! + * Retain condition for fast_extra_bits calculation. + */ + int do_update_vbr_bits_off_target_fast; + + /*! + * Multi-threading parameters. + */ + MultiThreadInfo mt_info; + + /*! + * Specifies the frame to be output. It is valid only if show_existing_frame + * is 1. When show_existing_frame is 0, existing_fb_idx_to_show is set to + * INVALID_IDX. + */ + int existing_fb_idx_to_show; + + /*! + * A flag to indicate if intrabc is ever used in current frame. + */ + int intrabc_used; + + /*! + * Mark which ref frames can be skipped for encoding current frame during RDO. + */ + int prune_ref_frame_mask; + + /*! + * Loop Restoration context. + */ + AV1LrStruct lr_ctxt; + + /*! + * Loop Restoration context used during pick stage. + */ + AV1LrPickStruct pick_lr_ctxt; + + /*! + * Pointer to list of tables with film grain parameters. + */ + aom_film_grain_table_t *film_grain_table; + +#if CONFIG_DENOISE + /*! + * Pointer to structure holding the denoised image buffers and the helper + * noise models. + */ + struct aom_denoise_and_model_t *denoise_and_model; +#endif + + /*! + * Flags related to interpolation filter search. + */ + InterpSearchFlags interp_search_flags; + + /*! + * Turn on screen content tools flag. + * Note that some videos are not screen content videos, but + * screen content tools could also improve coding efficiency. + * For example, videos with large flat regions, gaming videos that look + * like natural videos. + */ + int use_screen_content_tools; + + /*! 
+ * A flag to indicate "real" screen content videos.
+ * For example, screen shares, screen editing.
+ * When this flag is true, |use_screen_content_tools| must also be true.
+ * In addition, rate control strategy is adjusted when this flag is true.
+ */
+ int is_screen_content_type;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ /*!
+ * Accumulates the partition timing stat over the whole frame.
+ */
+ FramePartitionTimingStats partition_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ /*!
+ * component_time[] is initialized to zero when the encoder starts.
+ */
+ uint64_t component_time[kTimingComponents];
+ /*!
+ * Stores timing for individual components between calls of start_timing()
+ * and end_timing().
+ */
+ struct aom_usec_timer component_timer[kTimingComponents];
+ /*!
+ * frame_component_time[] is initialized to zero at the beginning of each
+ * frame.
+ */
+ uint64_t frame_component_time[kTimingComponents];
+#endif
+
+ /*!
+ * Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+ */
+ int frame_header_count;
+
+ /*!
+ * Whether any non-zero delta_q was actually used.
+ */
+ int deltaq_used;
+
+ /*!
+ * Reference frame distance related variables.
+ */
+ RefFrameDistanceInfo ref_frame_dist_info;
+
+ /*!
+ * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+ * the ith 16 x 16 block in raster scan order. This scaling factor is used for
+ * RD multiplier modulation when SSIM tuning is enabled.
+ */
+ double *ssim_rdmult_scaling_factors;
+
+#if CONFIG_TUNE_VMAF
+ /*!
+ * Parameters for VMAF tuning.
+ */
+ TuneVMAFInfo vmaf_info;
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ /*!
+ * Parameters for Butteraugli tuning.
+ */
+ TuneButteraugliInfo butteraugli_info;
+#endif
+
+ /*!
+ * Parameters for scalable video coding.
+ */
+ SVC svc;
+
+ /*!
+ * Indicates whether current processing stage is encode stage or LAP stage.
+ */
+ COMPRESSOR_STAGE compressor_stage;
+
+ /*!
+ * Frame type of the last frame. May be used in some heuristics for speeding
+ * up the encoding.
+ */
+ FRAME_TYPE last_frame_type;
+
+ /*!
+ * Number of tile-groups.
+ */
+ int num_tg;
+
+ /*!
+ * Super-resolution mode currently being used by the encoder.
+ * This may or may not be the same as the user-supplied mode in
+ * oxcf->superres_mode (when we are recoding to try multiple options for
+ * example).
+ */
+ aom_superres_mode superres_mode;
+
+ /*!
+ * First pass related data.
+ */
+ FirstPassData firstpass_data;
+
+ /*!
+ * Temporal Noise Estimate
+ */
+ NOISE_ESTIMATE noise_estimate;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ /*!
+ * Temporal Denoiser
+ */
+ AV1_DENOISER denoiser;
+#endif
+
+ /*!
+ * Count of how many consecutive times a block uses small/zeromv for encoding,
+ * tracked at an 8x8 block granularity.
+ */
+ uint8_t *consec_zero_mv;
+
+ /*!
+ * Allocated memory size for |consec_zero_mv|.
+ */
+ int consec_zero_mv_alloc_size;
+
+ /*!
+ * Block size of first pass encoding
+ */
+ BLOCK_SIZE fp_block_size;
+
+ /*!
+ * The counter of encoded super blocks, used to differentiate block names.
+ * This number starts from 0 and increases whenever a super block is encoded.
+ */
+ int sb_counter;
+
+ /*!
+ * Available bitstream buffer size in bytes
+ */
+ size_t available_bs_size;
+
+ /*!
+ * The controller of the external partition model.
+ * It is used to do partition type selection based on external models.
+ */
+ ExtPartController ext_part_controller;
+
+ /*!
+ * Motion vector stats of the current encoded frame, used to update the
+ * ppi->mv_stats during postencode.
+ */ + MV_STATS mv_stats; + /*! + * Stores the reference refresh index for the current frame. + */ + int ref_refresh_index; + + /*! + * A flag to indicate if the reference refresh index is available for the + * current frame. + */ + bool refresh_idx_available; + + /*! + * Reference frame index corresponding to the frame to be excluded from being + * used as a reference by frame_parallel_level 2 frame in a parallel + * encode set of lower layer frames. + */ + int ref_idx_to_skip; +#if CONFIG_FPMT_TEST + /*! + * Stores the wanted frame buffer index for choosing primary ref frame by a + * frame_parallel_level 2 frame in a parallel encode set of lower layer + * frames. + */ + + int wanted_fb; +#endif // CONFIG_FPMT_TEST + + /*! + * A flag to indicate frames that will update their data to the primary + * context at the end of the encode. It is set for non-parallel frames and the + * last frame in encode order in a given parallel encode set. + */ + bool do_frame_data_update; + +#if CONFIG_RD_COMMAND + /*! + * A structure for assigning external q_index / rdmult for experiments + */ + RD_COMMAND rd_command; +#endif // CONFIG_RD_COMMAND + + /*! + * Buffer to store MB variance after Wiener filter. + */ + WeberStats *mb_weber_stats; + + /*! + * Buffer to store rate cost estimates for each macro block (8x8) in the + * preprocessing stage used in allintra mode. + */ + int *prep_rate_estimates; + + /*! + * Buffer to store rate cost estimates for each 16x16 block read + * from an external file, used in allintra mode. + */ + double *ext_rate_distribution; + + /*! + * The scale that equals sum_rate_uniform_quantizer / sum_ext_rate. + */ + double ext_rate_scale; + + /*! + * Buffer to store MB variance after Wiener filter. + */ + BLOCK_SIZE weber_bsize; + + /*! + * Frame level Wiener filter normalization. + */ + int64_t norm_wiener_variance; + + /*! + * Buffer to store delta-q values for delta-q mode 4. + */ + int *mb_delta_q; + + /*! + * Flag to indicate that current frame is dropped. + */ + bool is_dropped_frame; + +#if CONFIG_BITRATE_ACCURACY + /*! + * Structure stores information needed for bitrate accuracy experiment. + */ + VBR_RATECTRL_INFO vbr_rc_info; +#endif + +#if CONFIG_RATECTRL_LOG + /*! + * Structure stores information of rate control decisions. + */ + RATECTRL_LOG rc_log; +#endif // CONFIG_RATECTRL_LOG + + /*! + * Frame level twopass status and control data + */ + TWO_PASS_FRAME twopass_frame; + + /*! + * Context needed for third pass encoding. + */ + THIRD_PASS_DEC_CTX *third_pass_ctx; + + /*! + * File pointer to second pass log + */ + FILE *second_pass_log_stream; + + /*! + * Buffer to store 64x64 SAD + */ + uint64_t *src_sad_blk_64x64; + + /*! + * SSE between the current frame and the reconstructed last frame + * It is only used for CBR mode. + * It is not used if the reference frame has a different frame size. + */ + uint64_t rec_sse; + + /*! + * A flag to indicate whether the encoder is controlled by DuckyEncode or not. + * 1:yes 0:no + */ + int use_ducky_encode; + +#if !CONFIG_REALTIME_ONLY + /*! A structure that facilitates the communication between DuckyEncode and AV1 + * encoder. + */ + DuckyEncodeInfo ducky_encode_info; +#endif // CONFIG_REALTIME_ONLY + // + /*! + * Frames since last frame with cdf update. + */ + int frames_since_last_update; + + /*! + * Block level thresholds to force zeromv-skip at partition level. + */ + unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL]; + + /*! 
+ * Number of downsampling pyramid levels to allocate for each frame + * This is currently only used for global motion + */ + int image_pyramid_levels; + +#if CONFIG_SALIENCY_MAP + /*! + * Pixel level saliency map for each frame. + */ + uint8_t *saliency_map; + + /*! + * Superblock level rdmult scaling factor driven by saliency map. + */ + double *sm_scaling_factor; +#endif + + /*! + * Number of pixels that choose palette mode for luma in the + * fast encoding pass in av1_determine_sc_tools_with_encoding(). + */ + int palette_pixel_num; + + /*! + * Flag to indicate scaled_last_source is available, + * so scaling is not needed for last_source. + */ + int scaled_last_source_available; +} AV1_COMP; + +/*! + * \brief Input frames and last input frame + */ +typedef struct EncodeFrameInput { + /*!\cond */ + YV12_BUFFER_CONFIG *source; + YV12_BUFFER_CONFIG *last_source; + int64_t ts_duration; + /*!\endcond */ +} EncodeFrameInput; + +/*! + * \brief contains per-frame encoding parameters decided upon by + * av1_encode_strategy() and passed down to av1_encode(). + */ +typedef struct EncodeFrameParams { + /*! + * Is error resilient mode enabled + */ + int error_resilient_mode; + /*! + * Frame type (eg KF vs inter frame etc) + */ + FRAME_TYPE frame_type; + + /*!\cond */ + int primary_ref_frame; + int order_offset; + + /*!\endcond */ + /*! + * Should the current frame be displayed after being decoded + */ + int show_frame; + + /*!\cond */ + int refresh_frame_flags; + + int show_existing_frame; + int existing_fb_idx_to_show; + + /*!\endcond */ + /*! + * Bitmask of which reference buffers may be referenced by this frame. + */ + int ref_frame_flags; + + /*! + * Reference buffer assignment for this frame. + */ + int remapped_ref_idx[REF_FRAMES]; + + /*! + * Flags which determine which reference buffers are refreshed by this + * frame. + */ + RefreshFrameInfo refresh_frame; + + /*! + * Speed level to use for this frame: Bigger number means faster. 
+ */
+ int speed;
+} EncodeFrameParams;
+
+/*!\cond */
+
+// EncodeFrameResults contains information about the result of encoding a
+// single frame
+typedef struct {
+ size_t size; // Size of resulting bitstream
+} EncodeFrameResults;
+
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage);
+
+struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf,
+ BufferPool *const pool,
+ COMPRESSOR_STAGE stage,
+ int lap_lag_in_frames);
+
+struct AV1_PRIMARY *av1_create_primary_compressor(
+ struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+ const AV1EncoderConfig *oxcf);
+
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi);
+
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi);
+#endif
+#if CONFIG_INTERNAL_STATS
+void print_internal_stats(AV1_PRIMARY *ppi);
+#endif
+
+void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+ bool *sb_size_changed);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+ bool sb_size_changed);
+
+aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+ int subsampling_x, int subsampling_y);
+
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+ const AV1EncoderConfig *oxcf, int use_svc);
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+ const AV1_COMP_DATA *const cpi_data);
+
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map);
+
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map);
+
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi);
+
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map);
+
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi);
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data);
+
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+ AV1_PRIMARY *const ppi,
+ int *ref_buffers_used_map);
+/*!\endcond */
+
+/*!\brief Obtain the raw frame data
+ *
+ * \ingroup high_level_algo
+ * This function receives the raw frame data from input.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] frame_flags Flags to decide how to encode the frame
+ * \param[in,out] sd Contains raw frame data
+ * \param[in] time_stamp Time stamp of the frame
+ * \param[in] end_time_stamp End time stamp
+ *
+ * \return Returns a value to indicate if the frame data is received
+ * successfully.
+ * \note The caller can assume that a copy of this frame is made and not just a
+ * copy of the pointer.
+ */
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time_stamp);
+
+/*!\brief Encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function encodes the raw frame data, and outputs the frame bit stream
+ * to the designated buffer. The caller should use the output parameters
+ * cpi_data->ts_frame_start and cpi_data->ts_frame_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in,out] cpi_data Data corresponding to a frame encode
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * No frame encoded; more input is required.
+ * \retval "A nonzero (positive) aom_codec_err_t code"
+ * The encoding failed with the error.
Sets the error code and error message + * in \c cpi->common.error. + */ +int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data); + +/*!\brief Run 1-pass/2-pass encoding + * + * \ingroup high_level_algo + * \callgraph + * \callergraph + */ +int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, + const EncodeFrameInput *const frame_input, + const EncodeFrameParams *const frame_params, + EncodeFrameResults *const frame_results); + +/*!\cond */ +int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest); + +int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame); + +aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd); + +int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags); + +int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd); + +int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd); + +void av1_set_frame_size(AV1_COMP *cpi, int width, int height); + +void av1_set_mv_search_params(AV1_COMP *cpi); + +int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols); + +int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols); + +int av1_set_internal_size(AV1EncoderConfig *const oxcf, + ResizePendingParams *resize_pending_params, + AOM_SCALING_MODE horiz_mode, + AOM_SCALING_MODE vert_mode); + +int av1_get_quantizer(struct AV1_COMP *cpi); + +int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size); + +void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td); + +void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td); + +// Set screen content options. +// This function estimates whether to use screen content tools, by counting +// the portion of blocks that have few luma colors. +// Modifies: +// cpi->commom.features.allow_screen_content_tools +// cpi->common.features.allow_intrabc +// cpi->use_screen_content_tools +// cpi->is_screen_content_type +// However, the estimation is not accurate and may misclassify videos. +// A slower but more accurate approach that determines whether to use screen +// content tools is employed later. See av1_determine_sc_tools_with_encoding(). +void av1_set_screen_content_options(struct AV1_COMP *cpi, + FeatureFlags *features); + +void av1_update_frame_size(AV1_COMP *cpi); + +typedef struct { + int pyr_level; + int disp_order; +} RefFrameMapPair; + +static INLINE void init_ref_map_pair( + AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) { + if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) { + memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES); + return; + } + memset(ref_frame_map_pairs, 0, sizeof(*ref_frame_map_pairs) * REF_FRAMES); + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + // Get reference frame buffer. + const RefCntBuffer *const buf = cpi->common.ref_frame_map[map_idx]; + if (ref_frame_map_pairs[map_idx].disp_order == -1) continue; + if (buf == NULL) { + ref_frame_map_pairs[map_idx].disp_order = -1; + ref_frame_map_pairs[map_idx].pyr_level = -1; + continue; + } else if (buf->ref_count > 1) { + // Once the keyframe is coded, the slots in ref_frame_map will all + // point to the same frame. In that case, all subsequent pointers + // matching the current are considered "free" slots. This will find + // the next occurrence of the current pointer if ref_count indicates + // there are multiple instances of it and mark it as free. 
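+ // Illustrative scenario (hypothetical buffer state, not taken from any
+ // particular input): right after a keyframe is coded, every ref_frame_map
+ // slot may point at the keyframe's RefCntBuffer, so its ref_count > 1. The
+ // scan below then keeps the first matching slot as the real entry and marks
+ // the later duplicate slots with disp_order = -1 and pyr_level = -1 so that
+ // subsequent reference assignment treats them as free.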
+ for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) { + const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2]; + if (buf2 == buf) { + ref_frame_map_pairs[idx2].disp_order = -1; + ref_frame_map_pairs[idx2].pyr_level = -1; + } + } + } + ref_frame_map_pairs[map_idx].disp_order = (int)buf->display_order_hint; + ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level; + } +} + +#if CONFIG_FPMT_TEST +static AOM_INLINE void calc_frame_data_update_flag( + GF_GROUP *const gf_group, int gf_frame_index, + bool *const do_frame_data_update) { + *do_frame_data_update = true; + // Set the flag to false for all frames in a given parallel encode set except + // the last frame in the set with frame_parallel_level = 2. + if (gf_group->frame_parallel_level[gf_frame_index] == 1) { + *do_frame_data_update = false; + } else if (gf_group->frame_parallel_level[gf_frame_index] == 2) { + // Check if this is the last frame in the set with frame_parallel_level = 2. + for (int i = gf_frame_index + 1; i < gf_group->size; i++) { + if ((gf_group->frame_parallel_level[i] == 0 && + (gf_group->update_type[i] == ARF_UPDATE || + gf_group->update_type[i] == INTNL_ARF_UPDATE)) || + gf_group->frame_parallel_level[i] == 1) { + break; + } else if (gf_group->frame_parallel_level[i] == 2) { + *do_frame_data_update = false; + break; + } + } + } +} +#endif + +// av1 uses 10,000,000 ticks/second as time stamp +#define TICKS_PER_SEC 10000000LL + +static INLINE int64_t +timebase_units_to_ticks(const aom_rational64_t *timestamp_ratio, int64_t n) { + return n * timestamp_ratio->num / timestamp_ratio->den; +} + +static INLINE int64_t +ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) { + int64_t round = timestamp_ratio->num / 2; + if (round > 0) --round; + return (n * timestamp_ratio->den + round) / timestamp_ratio->num; +} + +static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const FRAME_UPDATE_TYPE update_type = + gf_group->update_type[cpi->gf_frame_index]; + + return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE || + update_type == GF_UPDATE; +} + +// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD. +static INLINE int av1_use_hash_me(const AV1_COMP *const cpi) { + return (cpi->common.features.allow_screen_content_tools && + cpi->common.features.allow_intrabc && + frame_is_intra_only(&cpi->common)); +} + +static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf( + const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + return buf != NULL ? &buf->buf : NULL; +} + +static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) { + assert(buf != NULL); + ensure_mv_buffer(buf, cm); + buf->width = cm->width; + buf->height = cm->height; +} + +// Get the allocated token size for a tile. It does the same calculation as in +// the frame token allocation. 
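+// Note: 'mi' units are 4x4 luma samples, so the ROUND_POWER_OF_TWO(..., 2)
+// below converts the tile's mi extent into 16x16 macroblock rows/columns,
+// the units expected by get_token_alloc().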
+static INLINE unsigned int allocated_tokens(const TileInfo *tile, + int sb_size_log2, int num_planes) { + int tile_mb_rows = + ROUND_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start, 2); + int tile_mb_cols = + ROUND_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start, 2); + + return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes); +} + +static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col, + int mi_row, TokenExtra **tok, int sb_size_log2, + int num_planes) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + const int tile_mb_cols = + (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; + const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2; + + *tok = cpi->token_info.tile_tok[tile_row][tile_col] + + get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes); +} + +void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags); + +#define ALT_MIN_LAG 3 +static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) { + return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf; +} + +static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) { + return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) && + (gf_cfg->gf_min_pyr_height == 0); +} + +// Helper function to compute number of blocks on either side of the frame. +static INLINE int get_num_blocks(const int frame_length, const int mb_length) { + return (frame_length + mb_length - 1) / mb_length; +} + +// Check if statistics generation stage +static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) { + assert(IMPLIES(cpi->compressor_stage == LAP_STAGE, + cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->ppi->lap_enabled)); + return (cpi->oxcf.pass == AOM_RC_FIRST_PASS || + (cpi->compressor_stage == LAP_STAGE)); +} +// Check if statistics consumption stage +static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) { + return (cpi->oxcf.pass >= AOM_RC_SECOND_PASS); +} + +// Check if statistics consumption stage +static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) { + return (is_stat_consumption_stage_twopass(cpi) || + (cpi->oxcf.pass == AOM_RC_ONE_PASS && + (cpi->compressor_stage == ENCODE_STAGE) && cpi->ppi->lap_enabled)); +} + +// Decide whether 'dv_costs' need to be allocated/stored during the encoding. +static AOM_INLINE bool av1_need_dv_costs(const AV1_COMP *const cpi) { + return !cpi->sf.rt_sf.use_nonrd_pick_mode && + av1_allow_intrabc(&cpi->common) && !is_stat_generation_stage(cpi); +} + +/*!\endcond */ +/*!\brief Check if the current stage has statistics + * + *\ingroup two_pass_algo + * + * \param[in] cpi Top - level encoder instance structure + * + * \return 0 if no stats for current stage else 1 + */ +static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) { + assert( + IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE)); + return (cpi->oxcf.pass == AOM_RC_ONE_PASS && !cpi->ppi->lap_enabled); +} + +/*!\cond */ + +static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) { + return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && + cpi->oxcf.gf_cfg.lag_in_frames == 0; +} + +// Use default/internal reference structure for single-layer RTC. 
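+// "Single-layer" here means one spatial and one temporal layer, with no
+// user-supplied reference frame configuration (rtc_ref.set_ref_frame_config).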
+static INLINE int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) { + return is_one_pass_rt_params(cpi) && cpi->ppi->number_spatial_layers == 1 && + cpi->ppi->number_temporal_layers == 1 && + !cpi->ppi->rtc_ref.set_ref_frame_config; +} + +// Function return size of frame stats buffer +static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) { + /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */ + return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer); +} + +// TODO(zoeliu): To set up cpi->oxcf.gf_cfg.enable_auto_brf + +static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd, + MV_REFERENCE_FRAME ref0, + MV_REFERENCE_FRAME ref1) { + xd->block_ref_scale_factors[0] = + get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1); + xd->block_ref_scale_factors[1] = + get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1); +} + +static INLINE int get_chessboard_index(int frame_index) { + return frame_index & 0x1; +} + +static INLINE const int *cond_cost_list_const(const struct AV1_COMP *cpi, + const int *cost_list) { + const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE && + cpi->sf.mv_sf.use_fullpel_costlist; + return use_cost_list ? cost_list : NULL; +} + +static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) { + const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE && + cpi->sf.mv_sf.use_fullpel_costlist; + return use_cost_list ? cost_list : NULL; +} + +// Compression ratio of current frame. +double av1_get_compression_ratio(const AV1_COMMON *const cm, + size_t encoded_frame_size); + +void av1_new_framerate(AV1_COMP *cpi, double framerate); + +void av1_setup_frame_size(AV1_COMP *cpi); + +#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) + +// Returns 1 if a frame is scaled and 0 otherwise. +static INLINE int av1_resize_scaled(const AV1_COMMON *cm) { + return cm->superres_upscaled_width != cm->render_width || + cm->superres_upscaled_height != cm->render_height; +} + +static INLINE int av1_frame_scaled(const AV1_COMMON *cm) { + return av1_superres_scaled(cm) || av1_resize_scaled(cm); +} + +// Don't allow a show_existing_frame to coincide with an error resilient +// frame. An exception can be made for a forward keyframe since it has no +// previous dependencies. +static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) { + return cm->show_existing_frame && (!cm->features.error_resilient_mode || + cm->current_frame.frame_type == KEY_FRAME); +} + +// Get index into the 'cpi->mbmi_ext_info.frame_base' array for the given +// 'mi_row' and 'mi_col'. +static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col, + const BLOCK_SIZE mi_alloc_bsize, + const int mbmi_ext_stride) { + const int mi_ext_size_1d = mi_size_wide[mi_alloc_bsize]; + const int mi_ext_row = mi_row / mi_ext_size_1d; + const int mi_ext_col = mi_col / mi_ext_size_1d; + return mi_ext_row * mbmi_ext_stride + mi_ext_col; +} + +// Lighter version of set_offsets that only sets the mode info +// pointers. 
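+// In addition to the common mi offsets, this points x->mbmi_ext_frame at the
+// frame-level ext-mode-info entry covering (mi_row, mi_col).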
+static INLINE void set_mode_info_offsets( + const CommonModeInfoParams *const mi_params, + const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, int mi_col) { + set_mi_offsets(mi_params, xd, mi_row, mi_col); + const int ext_idx = get_mi_ext_idx(mi_row, mi_col, mi_params->mi_alloc_bsize, + mbmi_ext_info->stride); + x->mbmi_ext_frame = mbmi_ext_info->frame_base + ext_idx; +} + +// Check to see if the given partition size is allowed for a specified number +// of mi block rows and columns remaining in the image. +// If not then return the largest allowed partition size +static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, + int cols_left, int *bh, int *bw) { + int int_size = (int)bsize; + if (rows_left <= 0 || cols_left <= 0) { + return AOMMIN(bsize, BLOCK_8X8); + } else { + for (; int_size > 0; int_size -= 3) { + *bh = mi_size_high[int_size]; + *bw = mi_size_wide[int_size]; + if ((*bh <= rows_left) && (*bw <= cols_left)) { + break; + } + } + } + return (BLOCK_SIZE)int_size; +} + +static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + +// When more than 'max_allowed_refs' are available, we reduce the number of +// reference frames one at a time based on this order. +static const MV_REFERENCE_FRAME disable_order[] = { + LAST3_FRAME, + LAST2_FRAME, + ALTREF2_FRAME, + BWDREF_FRAME, +}; + +static const MV_REFERENCE_FRAME + ref_frame_priority_order[INTER_REFS_PER_FRAME] = { + LAST_FRAME, ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME, + ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME, + }; + +static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf, + const int use_one_pass_rt_params, + const YV12_BUFFER_CONFIG **ref_frames, + const int ext_ref_frame_flags) { + // cpi->ext_flags.ref_frame_flags allows certain reference types to be + // disabled by the external interface. These are set by + // av1_apply_encoding_flags(). Start with what the external interface allows, + // then suppress any reference types which we have found to be duplicates. + int flags = ext_ref_frame_flags; + + for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) { + const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i]; + // If this_ref has appeared before, mark the corresponding ref frame as + // invalid. For one_pass_rt mode, only disable GOLDEN_FRAME if it's the + // same as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd). + int index = + (use_one_pass_rt_params && ref_frame_priority_order[i] == GOLDEN_FRAME) + ? (1 + sf->rt_sf.use_nonrd_altref_frame) + : i; + for (int j = 0; j < index; ++j) { + // If this_ref has appeared before (same as the reference corresponding + // to lower index j), remove it as a reference only if that reference + // (for index j) is actually used as a reference. + if (this_ref == ref_frames[j] && + (flags & (1 << (ref_frame_priority_order[j] - 1)))) { + flags &= ~(1 << (ref_frame_priority_order[i] - 1)); + break; + } + } + } + return flags; +} + +// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon +// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this +// function, the memory must be freed by the caller. Both the buf member of the +// aom_fixed_buf_t, and the aom_fixed_buf_t pointer itself must be freed. Memory +// returned must be freed via call to free(). +// +// Note: The OBU returned is in Low Overhead Bitstream Format. 
Specifically, +// the obu_has_size_field bit is set, and the buffer contains the obu_size +// field. +aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi); + +#define MAX_GFUBOOST_FACTOR 10.0 +#define MIN_GFUBOOST_FACTOR 4.0 + +static INLINE int is_frame_tpl_eligible(const GF_GROUP *const gf_group, + uint8_t index) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[index]; + return update_type == ARF_UPDATE || update_type == GF_UPDATE || + update_type == KF_UPDATE; +} + +static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group, + int selective_ref_frame, + int prune_ref_frames, + int gf_index) { + return (selective_ref_frame > 0) && (prune_ref_frames > 0) && + !is_frame_tpl_eligible(gf_group, gf_index); +} + +// Get update type of the current frame. +static INLINE FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group, + int gf_frame_index) { + return gf_group->update_type[gf_frame_index]; +} + +static INLINE int av1_pixels_to_mi(int pixels) { + return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2; +} + +static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + + return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) && + cm->show_frame; +} + +static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) { + const ResizePendingParams *const resize_pending_params = + &cpi->resize_pending_params; + return (resize_pending_params->width && resize_pending_params->height && + (cpi->common.width != resize_pending_params->width || + cpi->common.height != resize_pending_params->height)); +} + +// Check if loop filter is used. +static INLINE int is_loopfilter_used(const AV1_COMMON *const cm) { + return !cm->features.coded_lossless && !cm->tiles.large_scale; +} + +// Check if CDEF is used. +static INLINE int is_cdef_used(const AV1_COMMON *const cm) { + return cm->seq_params->enable_cdef && !cm->features.coded_lossless && + !cm->tiles.large_scale; +} + +// Check if loop restoration filter is used. +static INLINE int is_restoration_used(const AV1_COMMON *const cm) { + return cm->seq_params->enable_restoration && !cm->features.all_lossless && + !cm->tiles.large_scale; +} + +// Checks if post-processing filters need to be applied. +// NOTE: This function decides if the application of different post-processing +// filters on the reconstructed frame can be skipped at the encoder side. +// However the computation of different filter parameters that are signaled in +// the bitstream is still required. +static INLINE unsigned int derive_skip_apply_postproc_filters( + const AV1_COMP *cpi, int use_loopfilter, int use_cdef, int use_superres, + int use_restoration) { + // Though CDEF parameter selection should be dependent on + // deblocked/loop-filtered pixels for cdef_pick_method <= + // CDEF_FAST_SEARCH_LVL5, CDEF strength values are calculated based on the + // pixel values that are not loop-filtered in svc real-time encoding mode. + // Hence this case is handled separately using the condition below. + if (cpi->ppi->rtc_ref.non_reference_frame) + return (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF); + + if (!cpi->oxcf.algo_cfg.skip_postproc_filtering || cpi->ppi->b_calculate_psnr) + return 0; + assert(cpi->oxcf.mode == ALLINTRA); + + // The post-processing filters are applied one after the other in the + // following order: deblocking->cdef->superres->restoration. In case of + // ALLINTRA encoding, the reconstructed frame is not used as a reference + // frame. 
Hence, the application of these filters can be skipped when + // 1. filter parameters of the subsequent stages are not dependent on the + // filtered output of the current stage or + // 2. subsequent filtering stages are disabled + if (use_restoration) return SKIP_APPLY_RESTORATION; + if (use_superres) return SKIP_APPLY_SUPERRES; + if (use_cdef) { + // CDEF parameter selection is not dependent on the deblocked frame if + // cdef_pick_method is CDEF_PICK_FROM_Q. Hence the application of deblocking + // filters and cdef filters can be skipped in this case. + return (cpi->sf.lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q && + use_loopfilter) + ? (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF) + : SKIP_APPLY_CDEF; + } + if (use_loopfilter) return SKIP_APPLY_LOOPFILTER; + + // If we reach here, all post-processing stages are disabled, so none need to + // be skipped. + return 0; +} + +static INLINE void set_postproc_filter_default_params(AV1_COMMON *cm) { + struct loopfilter *const lf = &cm->lf; + CdefInfo *const cdef_info = &cm->cdef_info; + RestorationInfo *const rst_info = cm->rst_info; + + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + cdef_info->cdef_bits = 0; + cdef_info->cdef_strengths[0] = 0; + cdef_info->nb_cdef_strengths = 1; + cdef_info->cdef_uv_strengths[0] = 0; + rst_info[0].frame_restoration_type = RESTORE_NONE; + rst_info[1].frame_restoration_type = RESTORE_NONE; + rst_info[2].frame_restoration_type = RESTORE_NONE; +} + +static INLINE int is_inter_tx_size_search_level_one( + const TX_SPEED_FEATURES *tx_sf) { + return (tx_sf->inter_tx_size_search_init_depth_rect >= 1 && + tx_sf->inter_tx_size_search_init_depth_sqr >= 1); +} + +static INLINE int get_lpf_opt_level(const SPEED_FEATURES *sf) { + int lpf_opt_level = 0; + if (is_inter_tx_size_search_level_one(&sf->tx_sf)) + lpf_opt_level = (sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 
2 : 1; + return lpf_opt_level; +} + +// Enable switchable motion mode only if warp and OBMC tools are allowed +static INLINE bool is_switchable_motion_mode_allowed(bool allow_warped_motion, + bool enable_obmc) { + return (allow_warped_motion || enable_obmc); +} + +#if CONFIG_AV1_TEMPORAL_DENOISING +static INLINE int denoise_svc(const struct AV1_COMP *const cpi) { + return (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); +} +#endif + +#if CONFIG_COLLECT_PARTITION_STATS == 2 +static INLINE void av1_print_fr_partition_timing_stats( + const FramePartitionTimingStats *part_stats, const char *filename) { + FILE *f = fopen(filename, "w"); + if (!f) { + return; + } + + fprintf(f, "bsize,redo,"); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "decision_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "attempt_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "time_%d,", part); + } + fprintf(f, "\n"); + + static const int bsizes[6] = { 128, 64, 32, 16, 8, 4 }; + + for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) { + fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]); + } + fprintf(f, "\n"); + } + fclose(f); +} +#endif // CONFIG_COLLECT_PARTITION_STATS == 2 + +#if CONFIG_COLLECT_PARTITION_STATS +static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) { + assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || + bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 || + bsize == BLOCK_4X4); + switch (bsize) { + case BLOCK_128X128: return 0; + case BLOCK_64X64: return 1; + case BLOCK_32X32: return 2; + case BLOCK_16X16: return 3; + case BLOCK_8X8: return 4; + case BLOCK_4X4: return 5; + default: assert(0 && "Invalid bsize for partition_stats."); return -1; + } +} +#endif // CONFIG_COLLECT_PARTITION_STATS + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(AV1_COMP *cpi, int component) { + aom_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(AV1_COMP *cpi, int component) { + aom_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + aom_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + case 2: return "INTRA_ONLY_FRAME"; + case 3: return "S_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + +/*!\endcond */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODER_H_ diff --git a/third_party/aom/av1/encoder/encoder_alloc.h b/third_party/aom/av1/encoder/encoder_alloc.h new file mode 100644 index 0000000000..ce48496d48 --- /dev/null +++ b/third_party/aom/av1/encoder/encoder_alloc.h @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODER_ALLOC_H_ +#define AOM_AV1_ENCODER_ENCODER_ALLOC_H_ + +#include "av1/encoder/block.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/global_motion_facade.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/pickcdef.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static AOM_INLINE void dealloc_context_buffers_ext( + MBMIExtFrameBufferInfo *mbmi_ext_info) { + aom_free(mbmi_ext_info->frame_base); + mbmi_ext_info->frame_base = NULL; + mbmi_ext_info->alloc_size = 0; +} + +static AOM_INLINE void alloc_context_buffers_ext( + AV1_COMMON *cm, MBMIExtFrameBufferInfo *mbmi_ext_info) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int mi_alloc_rows = + (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + const int mi_alloc_cols = + (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols; + + if (new_ext_mi_size > mbmi_ext_info->alloc_size) { + dealloc_context_buffers_ext(mbmi_ext_info); + CHECK_MEM_ERROR( + cm, mbmi_ext_info->frame_base, + aom_malloc(new_ext_mi_size * sizeof(*mbmi_ext_info->frame_base))); + mbmi_ext_info->alloc_size = new_ext_mi_size; + } + // The stride needs to be updated regardless of whether new allocation + // happened or not. + mbmi_ext_info->stride = mi_alloc_cols; +} + +static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + CommonModeInfoParams *const mi_params = &cm->mi_params; + + // Setup mi_params + mi_params->set_mb_mi(mi_params, cm->width, cm->height, + cpi->sf.part_sf.default_min_partition_size); + + if (!is_stat_generation_stage(cpi)) av1_alloc_txb_buf(cpi); + + aom_free(cpi->td.mv_costs_alloc); + cpi->td.mv_costs_alloc = NULL; + // Avoid the memory allocation of 'mv_costs_alloc' for allintra encoding + // mode. + if (cpi->oxcf.kf_cfg.key_freq_max != 0) { + CHECK_MEM_ERROR(cm, cpi->td.mv_costs_alloc, + (MvCosts *)aom_calloc(1, sizeof(*cpi->td.mv_costs_alloc))); + cpi->td.mb.mv_costs = cpi->td.mv_costs_alloc; + } + + av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf, + cm->error); + if (av1_setup_sms_tree(cpi, &cpi->td)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate SMS tree"); + } + cpi->td.firstpass_ctx = + av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf); + if (!cpi->td.firstpass_ctx) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); +} + +// Allocate mbmi buffers which are used to store mode information at block +// level. 
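+// The extended mbmi buffers are only needed for the actual encode stage, so
+// their allocation is skipped during the first-pass/LAP stats-generation
+// stage.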
+static AOM_INLINE void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + if (av1_alloc_context_buffers(cm, cm->width, cm->height, + cpi->sf.part_sf.default_min_partition_size)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + + if (!is_stat_generation_stage(cpi)) + alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); +} + +static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + CommonModeInfoParams *const mi_params = &cm->mi_params; + + // Create the encoder segmentation map and set all entries to 0 + aom_free(cpi->enc_seg.map); + CHECK_MEM_ERROR(cm, cpi->enc_seg.map, + aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1)); + + // Create a map used for cyclic background refresh. + if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh); + CHECK_MEM_ERROR( + cm, cpi->cyclic_refresh, + av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols)); + + // Create a map used to mark inactive areas. + aom_free(cpi->active_map.map); + CHECK_MEM_ERROR(cm, cpi->active_map.map, + aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1)); +} + +static AOM_INLINE void alloc_obmc_buffers( + OBMCBuffer *obmc_buffer, struct aom_internal_error_info *error) { + AOM_CHECK_MEM_ERROR( + error, obmc_buffer->wsrc, + (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc))); + AOM_CHECK_MEM_ERROR( + error, obmc_buffer->mask, + (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->mask))); + AOM_CHECK_MEM_ERROR( + error, obmc_buffer->above_pred, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->above_pred))); + AOM_CHECK_MEM_ERROR( + error, obmc_buffer->left_pred, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred))); +} + +static AOM_INLINE void release_obmc_buffers(OBMCBuffer *obmc_buffer) { + aom_free(obmc_buffer->mask); + aom_free(obmc_buffer->above_pred); + aom_free(obmc_buffer->left_pred); + aom_free(obmc_buffer->wsrc); + + obmc_buffer->mask = NULL; + obmc_buffer->above_pred = NULL; + obmc_buffer->left_pred = NULL; + obmc_buffer->wsrc = NULL; +} + +static AOM_INLINE void alloc_compound_type_rd_buffers( + struct aom_internal_error_info *error, CompoundTypeRdBuffers *const bufs) { + AOM_CHECK_MEM_ERROR( + error, bufs->pred0, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0))); + AOM_CHECK_MEM_ERROR( + error, bufs->pred1, + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1))); + AOM_CHECK_MEM_ERROR( + error, bufs->residual1, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1))); + AOM_CHECK_MEM_ERROR( + error, bufs->diff10, + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10))); + AOM_CHECK_MEM_ERROR(error, bufs->tmp_best_mask_buf, + (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE * + sizeof(*bufs->tmp_best_mask_buf))); +} + +static AOM_INLINE void release_compound_type_rd_buffers( + CompoundTypeRdBuffers *const bufs) { + aom_free(bufs->pred0); + aom_free(bufs->pred1); + aom_free(bufs->residual1); + aom_free(bufs->diff10); + aom_free(bufs->tmp_best_mask_buf); + av1_zero(*bufs); // Set all pointers to NULL for safety. 
+} + +static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + TokenInfo *token_info = &cpi->token_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int num_planes = av1_num_planes(cm); + dealloc_context_buffers_ext(&cpi->mbmi_ext_info); + + aom_free(cpi->tile_data); + cpi->tile_data = NULL; + cpi->allocated_tiles = 0; + enc_row_mt->allocated_tile_cols = 0; + enc_row_mt->allocated_tile_rows = 0; + + // Delete sementation map + aom_free(cpi->enc_seg.map); + cpi->enc_seg.map = NULL; + + av1_cyclic_refresh_free(cpi->cyclic_refresh); + cpi->cyclic_refresh = NULL; + + aom_free(cpi->active_map.map); + cpi->active_map.map = NULL; + + aom_free(cpi->ssim_rdmult_scaling_factors); + cpi->ssim_rdmult_scaling_factors = NULL; + + aom_free(cpi->tpl_rdmult_scaling_factors); + cpi->tpl_rdmult_scaling_factors = NULL; + +#if CONFIG_TUNE_VMAF + aom_free(cpi->vmaf_info.rdmult_scaling_factors); + cpi->vmaf_info.rdmult_scaling_factors = NULL; + aom_close_vmaf_model(cpi->vmaf_info.vmaf_model); +#endif + +#if CONFIG_TUNE_BUTTERAUGLI + aom_free(cpi->butteraugli_info.rdmult_scaling_factors); + cpi->butteraugli_info.rdmult_scaling_factors = NULL; + aom_free_frame_buffer(&cpi->butteraugli_info.source); + aom_free_frame_buffer(&cpi->butteraugli_info.resized_source); +#endif + +#if CONFIG_SALIENCY_MAP + aom_free(cpi->saliency_map); + aom_free(cpi->sm_scaling_factor); +#endif + + release_obmc_buffers(&cpi->td.mb.obmc_buffer); + + aom_free(cpi->td.mv_costs_alloc); + cpi->td.mv_costs_alloc = NULL; + aom_free(cpi->td.dv_costs_alloc); + cpi->td.dv_costs_alloc = NULL; + + aom_free(cpi->td.mb.sb_stats_cache); + cpi->td.mb.sb_stats_cache = NULL; + + aom_free(cpi->td.mb.sb_fp_stats); + cpi->td.mb.sb_fp_stats = NULL; + +#if CONFIG_PARTITION_SEARCH_ORDER + aom_free(cpi->td.mb.rdcost); + cpi->td.mb.rdcost = NULL; +#endif + + av1_free_pc_tree_recursive(cpi->td.pc_root, num_planes, 0, 0, + cpi->sf.part_sf.partition_search_type); + cpi->td.pc_root = NULL; + + for (int i = 0; i < 2; i++) + for (int j = 0; j < 2; j++) { + aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]); + cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL; + } + + av1_hash_table_destroy(&cpi->td.mb.intrabc_hash_info.intrabc_hash_table); + + aom_free(cm->tpl_mvs); + cm->tpl_mvs = NULL; + + aom_free(cpi->td.pixel_gradient_info); + cpi->td.pixel_gradient_info = NULL; + + aom_free(cpi->td.src_var_info_of_4x4_sub_blocks); + cpi->td.src_var_info_of_4x4_sub_blocks = NULL; + + aom_free(cpi->td.vt64x64); + cpi->td.vt64x64 = NULL; + + av1_free_pmc(cpi->td.firstpass_ctx, num_planes); + cpi->td.firstpass_ctx = NULL; + + const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth; + // This call ensures that the buffers allocated by tf_alloc_and_reset_data() + // in av1_temporal_filter() for single-threaded encode are freed in case an + // error is encountered during temporal filtering (due to early termination + // tf_dealloc_data() in av1_temporal_filter() would not be invoked). + tf_dealloc_data(&cpi->td.tf_data, is_highbitdepth); + + // This call ensures that tpl_tmp_buffers for single-threaded encode are freed + // in case of an error during tpl. + tpl_dealloc_temp_buffers(&cpi->td.tpl_tmp_buffers); + + // This call ensures that the global motion (gm) data buffers for + // single-threaded encode are freed in case of an error during gm. 
+ gm_dealloc_data(&cpi->td.gm_data); + + // This call ensures that CDEF search context buffers are deallocated in case + // of an error during cdef search. + av1_cdef_dealloc_data(cpi->cdef_search_ctx); + aom_free(cpi->cdef_search_ctx); + cpi->cdef_search_ctx = NULL; + + av1_dealloc_mb_data(&cpi->td.mb, num_planes); + + av1_dealloc_mb_wiener_var_pred_buf(&cpi->td); + + av1_free_txb_buf(cpi); + av1_free_context_buffers(cm); + + aom_free_frame_buffer(&cpi->last_frame_uf); +#if !CONFIG_REALTIME_ONLY + av1_free_restoration_buffers(cm); + av1_free_firstpass_data(&cpi->firstpass_data); +#endif + + if (!is_stat_generation_stage(cpi)) { + av1_free_cdef_buffers(cm, &cpi->ppi->p_mt_info.cdef_worker, + &cpi->mt_info.cdef_sync); + } + + for (int plane = 0; plane < num_planes; plane++) { + aom_free(cpi->pick_lr_ctxt.rusi[plane]); + cpi->pick_lr_ctxt.rusi[plane] = NULL; + } + aom_free(cpi->pick_lr_ctxt.dgd_avg); + cpi->pick_lr_ctxt.dgd_avg = NULL; + + aom_free_frame_buffer(&cpi->trial_frame_rst); + aom_free_frame_buffer(&cpi->scaled_source); + aom_free_frame_buffer(&cpi->scaled_last_source); + aom_free_frame_buffer(&cpi->orig_source); + aom_free_frame_buffer(&cpi->svc.source_last_TL0); + + free_token_info(token_info); + + av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); + av1_free_sms_tree(&cpi->td); + + aom_free(cpi->td.mb.palette_buffer); + release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer); + aom_free(cpi->td.mb.tmp_conv_dst); + for (int j = 0; j < 2; ++j) { + aom_free(cpi->td.mb.tmp_pred_bufs[j]); + } + +#if CONFIG_DENOISE + if (cpi->denoise_and_model) { + aom_denoise_and_model_free(cpi->denoise_and_model); + cpi->denoise_and_model = NULL; + } +#endif + if (cpi->film_grain_table) { + aom_film_grain_table_free(cpi->film_grain_table); + aom_free(cpi->film_grain_table); + cpi->film_grain_table = NULL; + } + + if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi); + aom_free(cpi->svc.layer_context); + cpi->svc.layer_context = NULL; + + aom_free(cpi->consec_zero_mv); + cpi->consec_zero_mv = NULL; + cpi->consec_zero_mv_alloc_size = 0; + + aom_free(cpi->src_sad_blk_64x64); + cpi->src_sad_blk_64x64 = NULL; + + aom_free(cpi->mb_weber_stats); + cpi->mb_weber_stats = NULL; + + if (cpi->oxcf.enable_rate_guide_deltaq) { + aom_free(cpi->prep_rate_estimates); + cpi->prep_rate_estimates = NULL; + + aom_free(cpi->ext_rate_distribution); + cpi->ext_rate_distribution = NULL; + } + + aom_free(cpi->mb_delta_q); + cpi->mb_delta_q = NULL; +} + +static AOM_INLINE void allocate_gradient_info_for_hog(AV1_COMP *cpi) { + if (!is_gradient_caching_for_hog_enabled(cpi)) return; + + PixelLevelGradientInfo *pixel_gradient_info = cpi->td.pixel_gradient_info; + if (!pixel_gradient_info) { + const AV1_COMMON *const cm = &cpi->common; + const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome; + CHECK_MEM_ERROR( + cm, pixel_gradient_info, + aom_malloc(sizeof(*pixel_gradient_info) * plane_types * MAX_SB_SQUARE)); + cpi->td.pixel_gradient_info = pixel_gradient_info; + } + + cpi->td.mb.pixel_gradient_info = pixel_gradient_info; +} + +static AOM_INLINE void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) { + if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return; + + Block4x4VarInfo *source_variance_info = + cpi->td.src_var_info_of_4x4_sub_blocks; + if (!source_variance_info) { + const AV1_COMMON *const cm = &cpi->common; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size]; + CHECK_MEM_ERROR(cm, source_variance_info, + 
aom_malloc(sizeof(*source_variance_info) * mi_count_in_sb)); + cpi->td.src_var_info_of_4x4_sub_blocks = source_variance_info; + } + + cpi->td.mb.src_var_info_of_4x4_sub_blocks = source_variance_info; +} + +static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4; + if (cpi->td.vt64x64) { + if (num_64x64_blocks != cpi->td.num_64x64_blocks) { + aom_free(cpi->td.vt64x64); + cpi->td.vt64x64 = NULL; + } + } + if (!cpi->td.vt64x64) { + CHECK_MEM_ERROR(cm, cpi->td.vt64x64, + aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks)); + cpi->td.num_64x64_blocks = num_64x64_blocks; + } +} + +static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source( + AV1_COMP *cpi, int scaled_width, int scaled_height) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + if (scaled_width == cpi->unscaled_source->y_crop_width && + scaled_height == cpi->unscaled_source->y_crop_height) { + return cpi->unscaled_source; + } + + if (aom_realloc_frame_buffer( + &cpi->scaled_source, scaled_width, scaled_height, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, NULL, NULL, NULL, + cpi->image_pyramid_levels, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to reallocate scaled source buffer"); + assert(cpi->scaled_source.y_crop_width == scaled_width); + assert(cpi->scaled_source.y_crop_height == scaled_height); + if (!av1_resize_and_extend_frame_nonnormative( + cpi->unscaled_source, &cpi->scaled_source, + (int)cm->seq_params->bit_depth, num_planes)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to reallocate buffers during resize"); + return &cpi->scaled_source; +} + +// Deallocate allocated thread_data. +static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) { + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + const int num_tf_workers = + AOMMIN(p_mt_info->num_mod_workers[MOD_TF], p_mt_info->num_workers); + const int num_tpl_workers = + AOMMIN(p_mt_info->num_mod_workers[MOD_TPL], p_mt_info->num_workers); + const int is_highbitdepth = ppi->seq_params.use_highbitdepth; + const int num_planes = ppi->seq_params.monochrome ? 
1 : MAX_MB_PLANE; + for (int t = 1; t < p_mt_info->num_workers; ++t) { + EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t]; + thread_data->td = thread_data->original_td; + ThreadData *const td = thread_data->td; + if (!td) continue; + aom_free(td->tctx); + aom_free(td->palette_buffer); + aom_free(td->tmp_conv_dst); + release_compound_type_rd_buffers(&td->comp_rd_buffer); + for (int j = 0; j < 2; ++j) { + aom_free(td->tmp_pred_bufs[j]); + } + aom_free(td->pixel_gradient_info); + aom_free(td->src_var_info_of_4x4_sub_blocks); + release_obmc_buffers(&td->obmc_buffer); + aom_free(td->vt64x64); + + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + aom_free(td->hash_value_buffer[x][y]); + td->hash_value_buffer[x][y] = NULL; + } + } + aom_free(td->mv_costs_alloc); + td->mv_costs_alloc = NULL; + aom_free(td->dv_costs_alloc); + td->dv_costs_alloc = NULL; + aom_free(td->counts); + av1_free_pmc(td->firstpass_ctx, num_planes); + td->firstpass_ctx = NULL; + av1_free_shared_coeff_buffer(&td->shared_coeff_buf); + av1_free_sms_tree(td); + // This call ensures that the buffers allocated by tf_alloc_and_reset_data() + // in prepare_tf_workers() for MT encode are freed in case an error is + // encountered during temporal filtering (due to early termination + // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be + // invoked). + if (t < num_tf_workers) tf_dealloc_data(&td->tf_data, is_highbitdepth); + // This call ensures that tpl_tmp_buffers for MT encode are freed in case of + // an error during tpl. + if (t < num_tpl_workers) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers); + // This call ensures that the buffers in gm_data for MT encode are freed in + // case of an error during gm. + gm_dealloc_data(&td->gm_data); + av1_dealloc_mb_data(&td->mb, num_planes); + aom_free(td->mb.sb_stats_cache); + td->mb.sb_stats_cache = NULL; + aom_free(td->mb.sb_fp_stats); + td->mb.sb_fp_stats = NULL; +#if CONFIG_PARTITION_SEARCH_ORDER + aom_free(td->mb.rdcost); + td->mb.rdcost = NULL; +#endif + av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, SEARCH_PARTITION); + td->pc_root = NULL; + av1_dealloc_mb_wiener_var_pred_buf(td); + aom_free(td); + thread_data->td = NULL; + thread_data->original_td = NULL; + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODER_ALLOC_H_ diff --git a/third_party/aom/av1/encoder/encoder_utils.c b/third_party/aom/av1/encoder/encoder_utils.c new file mode 100644 index 0000000000..c35873d207 --- /dev/null +++ b/third_party/aom/av1/encoder/encoder_utils.c @@ -0,0 +1,1503 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom/aomcx.h" + +#include "av1/encoder/bitstream.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/encoder_utils.h" +#include "av1/encoder/grain_test_vectors.h" +#include "av1/encoder/mv_prec.h" +#include "av1/encoder/rc_utils.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/superres_scale.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/var_based_part.h" + +#if CONFIG_TUNE_VMAF +#include "av1/encoder/tune_vmaf.h" +#endif + +#define MIN_BOOST_COMBINE_FACTOR 4.0 +#define MAX_BOOST_COMBINE_FACTOR 12.0 + +const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = { + { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 }, + { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 }, + { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 }, + { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 }, + { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 }, + { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 }, + { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 }, + { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 }, + { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 }, + { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 }, + { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 }, + { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 }, + { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 }, + { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 }, + { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 }, + { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 }, + { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 }, + { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, + { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 }, + { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 }, + { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 }, + { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 }, + { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 }, + { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 }, + { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 }, + { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 }, + { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 }, + { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 }, + { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 }, + { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 }, + { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 }, + { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 }, + { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 }, + { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 }, + { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 }, + { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 }, + { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 }, + { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, + { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 }, + { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 }, + { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 }, + { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 }, + { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 }, + { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 }, + { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 }, + { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 }, + { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 }, + { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 }, + { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 }, + { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } +}; + +const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 106, 90, 90, 97, 67, 59, 70, 28, + 30, 38, 16, 16, 16, 0, 0, 44, 50, 26, 25 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 98, 93, 97, 68, 82, 85, 33, 30, + 33, 16, 16, 16, 16, 0, 0, 43, 37, 26, 16 }, + { 0, 0, 0, 91, 80, 76, 78, 55, 49, 24, 16, + 16, 16, 16, 16, 16, 0, 0, 29, 45, 16, 38 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 103, 89, 89, 89, 62, 63, 76, 34, + 35, 32, 19, 16, 16, 0, 0, 49, 55, 29, 19 } +}; + +const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64, + 64, 64, 64 }; + +// TODO(yunqing): the default probs can be trained later from better +// performance. 
+const int default_switchable_interp_probs[FRAME_UPDATE_TYPES] + [SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS] = { + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } } + }; + +static void configure_static_seg_features(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + struct segmentation *const seg = &cm->seg; + + double avg_q; +#if CONFIG_FPMT_TEST + avg_q = ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) && + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)) + ? cpi->ppi->p_rc.temp_avg_q + : cpi->ppi->p_rc.avg_q; +#else + avg_q = cpi->ppi->p_rc.avg_q; +#endif + + int high_q = (int)(avg_q > 48.0); + int qi_delta; + + // Disable and clear down for KF + if (cm->current_frame.frame_type == KEY_FRAME) { + // Clear down the global segmentation map + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + seg->update_map = 0; + seg->update_data = 0; + + // Disable segmentation + av1_disable_segmentation(seg); + + // Clear down the segment features. 
+ av1_clearall_segfeatures(seg); + } else if (cpi->refresh_frame.alt_ref_frame) { + // If this is an alt ref frame + // Clear down the global segmentation map + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + seg->update_map = 0; + seg->update_data = 0; + + // Disable segmentation and individual segment features by default + av1_disable_segmentation(seg); + av1_clearall_segfeatures(seg); + + // If segmentation was enabled set those features needed for the + // arf itself. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + + qi_delta = av1_compute_qdelta(rc, avg_q, avg_q * 0.875, + cm->seq_params->bit_depth); + av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + } + } else if (seg->enabled) { + // All other frames if segmentation has been enabled + + // First normal frame in a valid gf or alt ref group + if (rc->frames_since_golden == 0) { + // Set up segment features for normal frames in an arf group + // Disable segmentation and clear down features if alt ref + // is not active for this group + + av1_disable_segmentation(seg); + + memset(cpi->enc_seg.map, 0, + cm->mi_params.mi_rows * cm->mi_params.mi_cols); + + seg->update_map = 0; + seg->update_data = 0; + + av1_clearall_segfeatures(seg); + } else if (rc->is_src_frame_alt_ref) { + // Special case where we are coding over the top of a previous + // alt ref frame. + // Segment coding disabled for compred testing + + // Enable ref frame features for segment 0 as well + av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + + // All mbs should use ALTREF_FRAME + av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); + av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + + // Skip all MBs if high Q (0,0 mv and skip coeffs) + if (high_q) { + av1_enable_segfeature(seg, 0, SEG_LVL_SKIP); + av1_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + // Enable data update + seg->update_data = 1; + } else { + // All other frames. + + // No updates.. leave things as they are. 
+ seg->update_map = 0; + seg->update_data = 0; + } + } +} + +void av1_apply_active_map(AV1_COMP *cpi) { + struct segmentation *const seg = &cpi->common.seg; + unsigned char *const seg_map = cpi->enc_seg.map; + const unsigned char *const active_map = cpi->active_map.map; + int i; + + assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); + + if (frame_is_intra_only(&cpi->common)) { + cpi->active_map.enabled = 0; + cpi->active_map.update = 1; + } + + if (cpi->active_map.update) { + if (cpi->active_map.enabled) { + const int num_mis = + cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; + for (i = 0; i < num_mis; ++i) + if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; + av1_enable_segmentation(seg); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); + + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V, + -MAX_LOOP_FILTER); + } else { + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); + if (seg->enabled) { + seg->update_data = 1; + seg->update_map = 1; + } + } + cpi->active_map.update = 0; + } +} + +#if !CONFIG_REALTIME_ONLY +static void process_tpl_stats_frame(AV1_COMP *cpi) { + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + AV1_COMMON *const cm = &cpi->common; + + assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size)); + + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + + if (tpl_frame->is_valid) { + int tpl_stride = tpl_frame->stride; + double intra_cost_base = 0; + double mc_dep_cost_base = 0; + double cbcmp_base = 1; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + for (int row = 0; row < cm->mi_params.mi_rows; row += row_step) { + for (int col = 0; col < mi_cols_sr; col += col_step_sr) { + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + double cbcmp = (double)(this_stats->srcrf_dist); + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); + intra_cost_base += log(dist_scaled) * cbcmp; + mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp; + cbcmp_base += cbcmp; + } + } + + if (mc_dep_cost_base == 0) { + tpl_frame->is_valid = 0; + } else { + cpi->rd.r0 = exp((intra_cost_base - mc_dep_cost_base) / 
cbcmp_base); + if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { + if (cpi->ppi->lap_enabled) { + double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval); + const int gfu_boost = get_gfu_boost_from_r0_lap( + min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0, + cpi->ppi->p_rc.num_stats_required_for_gfu_boost); + // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost, + // gfu_boost); + cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost( + min_boost_factor, MAX_BOOST_COMBINE_FACTOR, + cpi->ppi->p_rc.gfu_boost, gfu_boost, + cpi->ppi->p_rc.num_stats_used_for_gfu_boost); + } else { + // TPL may only look at a subset of frame in the gf group when the + // speed feature 'reduce_num_frames' is on, which affects the r0 + // calcuation. Thus, to compensate for TPL not using all frames a + // factor to adjust r0 is used. + const int gfu_boost = + (int)(200.0 * cpi->ppi->tpl_data.r0_adjust_factor / cpi->rd.r0); + cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost( + MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, + cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key); + } + } + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, + int *top_index) { + AV1_COMMON *const cm = &cpi->common; + + // Setup variables that depend on the dimensions of the frame. + av1_set_speed_features_framesize_dependent(cpi, cpi->speed); + +#if !CONFIG_REALTIME_ONLY + GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (cpi->oxcf.algo_cfg.enable_tpl_model && + av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) { + process_tpl_stats_frame(cpi); + av1_tpl_rdmult_setup(cpi); + } +#endif + + // Decide q and q bounds. + *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index, + bottom_index, top_index); + +#if !CONFIG_REALTIME_ONLY + if (cpi->oxcf.rc_cfg.mode == AOM_Q && + cpi->ppi->tpl_data.tpl_frame[cpi->gf_frame_index].is_valid && + !is_lossless_requested(&cpi->oxcf.rc_cfg)) { + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + const int tpl_q = av1_tpl_get_q_index( + &cpi->ppi->tpl_data, cpi->gf_frame_index, cpi->rc.active_worst_quality, + cm->seq_params->bit_depth); + *q = clamp(tpl_q, rc_cfg->best_allowed_q, rc_cfg->worst_allowed_q); + *top_index = *bottom_index = *q; + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE) + cpi->ppi->p_rc.arf_q = *q; + } + + if (cpi->oxcf.q_cfg.use_fixed_qp_offsets && cpi->oxcf.rc_cfg.mode == AOM_Q) { + if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { + const double qratio_grad = + cpi->ppi->p_rc.baseline_gf_interval > 20 ? 
0.2 : 0.3; + const double qstep_ratio = + 0.2 + + (1.0 - (double)cpi->rc.active_worst_quality / MAXQ) * qratio_grad; + *q = av1_get_q_index_from_qstep_ratio( + cpi->rc.active_worst_quality, qstep_ratio, cm->seq_params->bit_depth); + *top_index = *bottom_index = *q; + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == GF_UPDATE) + cpi->ppi->p_rc.arf_q = *q; + } else if (gf_group->layer_depth[cpi->gf_frame_index] < + gf_group->max_layer_depth) { + int this_height = gf_group->layer_depth[cpi->gf_frame_index]; + int arf_q = cpi->ppi->p_rc.arf_q; + while (this_height > 1) { + arf_q = (arf_q + cpi->oxcf.rc_cfg.cq_level + 1) / 2; + --this_height; + } + *top_index = *bottom_index = *q = arf_q; + } + } +#endif + + // Configure experimental use of segmentation for enhanced coding of + // static regions if indicated. + // Only allowed in the second pass of a two pass encode, as it requires + // lagged coding, and if the relevant speed feature flag is set. + if (is_stat_consumption_stage_twopass(cpi) && + cpi->sf.hl_sf.static_segmentation) + configure_static_seg_features(cpi); +} + +static void reset_film_grain_chroma_params(aom_film_grain_t *pars) { + pars->num_cr_points = 0; + pars->cr_mult = 0; + pars->cr_luma_mult = 0; + memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr)); + memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr)); + pars->num_cb_points = 0; + pars->cb_mult = 0; + pars->cb_luma_mult = 0; + pars->chroma_scaling_from_luma = 0; + memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb)); + memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb)); +} + +void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf) { + SequenceHeader *const seq_params = &ppi->seq_params; + const TuneCfg *const tune_cfg = &oxcf->tune_cfg; + + if (tune_cfg->film_grain_test_vector || tune_cfg->film_grain_table_filename || + tune_cfg->content == AOM_CONTENT_FILM) { + seq_params->film_grain_params_present = 1; + } else { +#if CONFIG_DENOISE + seq_params->film_grain_params_present = (oxcf->noise_level > 0); +#else + seq_params->film_grain_params_present = 0; +#endif + } +} + +void av1_update_film_grain_parameters(struct AV1_COMP *cpi, + const AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + const TuneCfg *const tune_cfg = &oxcf->tune_cfg; + + if (cpi->film_grain_table) { + aom_film_grain_table_free(cpi->film_grain_table); + aom_free(cpi->film_grain_table); + cpi->film_grain_table = NULL; + } + + if (tune_cfg->film_grain_test_vector) { + if (cm->current_frame.frame_type == KEY_FRAME) { + memcpy(&cm->film_grain_params, + film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1, + sizeof(cm->film_grain_params)); + if (oxcf->tool_cfg.enable_monochrome) + reset_film_grain_chroma_params(&cm->film_grain_params); + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; + if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) { + cm->film_grain_params.clip_to_restricted_range = 0; + } + } + } else if (tune_cfg->film_grain_table_filename) { + CHECK_MEM_ERROR(cm, cpi->film_grain_table, + aom_calloc(1, sizeof(*cpi->film_grain_table))); + + aom_film_grain_table_read(cpi->film_grain_table, + tune_cfg->film_grain_table_filename, cm->error); + } else if (tune_cfg->content == AOM_CONTENT_FILM) { + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; + if (oxcf->tool_cfg.enable_monochrome) + 
reset_film_grain_chroma_params(&cm->film_grain_params); + if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) + cm->film_grain_params.clip_to_restricted_range = 0; + } else { + memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); + } +} + +void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, + const int phase, const int use_optimized_scaler) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1). + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + BufferPool *const pool = cm->buffer_pool; + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, ref_frame); + + if (ref == NULL) { + cpi->scaled_ref_buf[ref_frame - 1] = NULL; + continue; + } + + // For RTC-SVC: if force_zero_mode_spatial_ref is enabled, check if the + // motion search can be skipped for the references: last, golden, altref. + // If so, we can skip scaling that reference. + if (cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref && + cpi->ppi->rtc_ref.set_ref_frame_config) { + if (ref_frame == LAST_FRAME && cpi->svc.skip_mvsearch_last) continue; + if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_mvsearch_gf) continue; + if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref) + continue; + } + // For RTC with superres on: golden reference only needs to be scaled + // if it was refreshed in previous frame. + if (is_one_pass_rt_params(cpi) && + cpi->oxcf.superres_cfg.enable_superres && ref_frame == GOLDEN_FRAME && + cpi->rc.frame_num_last_gf_refresh < + (int)cm->current_frame.frame_number - 1) { + continue; + } + + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + // Replace the reference buffer with a copy having a thicker border, + // if the reference buffer is higher resolution than the current + // frame, and the border is thin. + if ((ref->y_crop_width > cm->width || + ref->y_crop_height > cm->height) && + ref->border < AOM_BORDER_IN_PIXELS) { + RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame); + if (aom_yv12_realloc_with_new_border( + &ref_fb->buf, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, cpi->image_pyramid_levels, + num_planes) != 0) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + } + int force_scaling = 0; + RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1]; + if (new_fb == NULL) { + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + } + force_scaling = 1; + new_fb = &pool->frame_bufs[new_fb_idx]; + } + + if (force_scaling || new_fb->buf.y_crop_width != cm->width || + new_fb->buf.y_crop_height != cm->height) { + if (aom_realloc_frame_buffer( + &new_fb->buf, cm->width, cm->height, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) { + if (force_scaling) { + // Release the reference acquired in the get_free_fb() call above. 
+ --new_fb->ref_count; + } + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + bool has_optimized_scaler = av1_has_optimized_scaler( + ref->y_crop_width, ref->y_crop_height, new_fb->buf.y_crop_width, + new_fb->buf.y_crop_height); + if (num_planes > 1) { + has_optimized_scaler = + has_optimized_scaler && + av1_has_optimized_scaler( + ref->uv_crop_width, ref->uv_crop_height, + new_fb->buf.uv_crop_width, new_fb->buf.uv_crop_height); + } +#if CONFIG_AV1_HIGHBITDEPTH + if (use_optimized_scaler && has_optimized_scaler && + cm->seq_params->bit_depth == AOM_BITS_8) { + av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, + num_planes); + } else if (!av1_resize_and_extend_frame_nonnormative( + ref, &new_fb->buf, (int)cm->seq_params->bit_depth, + num_planes)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffer during resize"); + } +#else + if (use_optimized_scaler && has_optimized_scaler) { + av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, + num_planes); + } else if (!av1_resize_and_extend_frame_nonnormative( + ref, &new_fb->buf, (int)cm->seq_params->bit_depth, + num_planes)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate buffer during resize"); + } +#endif + cpi->scaled_ref_buf[ref_frame - 1] = new_fb; + alloc_frame_mvs(cm, new_fb); + } + } else { + RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame); + buf->buf.y_crop_width = ref->y_crop_width; + buf->buf.y_crop_height = ref->y_crop_height; + cpi->scaled_ref_buf[ref_frame - 1] = buf; + ++buf->ref_count; + } + } else { + if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL; + } + } +} + +BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, + int height, int number_spatial_layers) { + if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) { + return BLOCK_64X64; + } + if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) { + return BLOCK_128X128; + } +#if CONFIG_TFLITE + if (oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) return BLOCK_64X64; +#endif + // Force 64x64 superblock size to increase resolution in perceptual + // AQ mode. + if (oxcf->mode == ALLINTRA && + (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI || + oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED)) { + return BLOCK_64X64; + } + assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); + + if (number_spatial_layers > 1 || + oxcf->resize_cfg.resize_mode != RESIZE_NONE) { + // Use the configured size (top resolution) for spatial layers or + // on resize. + return AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) > 720 + ? BLOCK_128X128 + : BLOCK_64X64; + } else if (oxcf->mode == REALTIME) { + if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) { + const TileConfig *const tile_cfg = &oxcf->tile_cfg; + const int num_tiles = + (1 << tile_cfg->tile_columns) * (1 << tile_cfg->tile_rows); + // For multi-thread encode: if the number of (128x128) superblocks + // per tile is low use 64X64 superblock. + if (oxcf->row_mt == 1 && oxcf->max_threads >= 4 && + oxcf->max_threads >= num_tiles && AOMMIN(width, height) > 720 && + (width * height) / (128 * 128 * num_tiles) <= 38) + return BLOCK_64X64; + else + return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64; + } else { + return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64; + } + } + + // TODO(any): Possibly could improve this with a heuristic. 
+ // When superres / resize is on, 'cm->width / height' can change between + // calls, so we don't apply this heuristic there. + // Things break if superblock size changes between the first pass and second + // pass encoding, which is why this heuristic is not configured as a + // speed-feature. + if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE && + oxcf->resize_cfg.resize_mode == RESIZE_NONE) { + int is_480p_or_lesser = AOMMIN(width, height) <= 480; + if (oxcf->speed >= 1 && is_480p_or_lesser) return BLOCK_64X64; + + // For 1080p and lower resolutions, choose SB size adaptively based on + // resolution and speed level for multi-thread encode. + int is_1080p_or_lesser = AOMMIN(width, height) <= 1080; + if (!is_480p_or_lesser && is_1080p_or_lesser && oxcf->mode == GOOD && + oxcf->row_mt == 1 && oxcf->max_threads > 1 && oxcf->speed >= 5) + return BLOCK_64X64; + + // For allintra encode, since the maximum partition size is set to 32X32 for + // speed>=6, superblock size is set to 64X64 instead of 128X128. This + // improves the multithread performance due to reduction in top right delay + // and thread sync wastage. Currently, this setting is selectively enabled + // only for speed>=9 and resolutions less than 4k since cost update + // frequency is set to INTERNAL_COST_UPD_OFF in these cases. + const int is_4k_or_larger = AOMMIN(width, height) >= 2160; + if (oxcf->mode == ALLINTRA && oxcf->speed >= 9 && !is_4k_or_larger) + return BLOCK_64X64; + } + return BLOCK_128X128; +} + +void av1_setup_frame(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. 
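+ // A minimal sketch of that convention (hypothetical helper and variable
+ // names, not actual libaom code):
+ //   int ctx = 0;  // mandated default for key / intra-only / error-resilient
+ //   if (!frame_is_intra_only(cm) && !cm->features.error_resilient_mode)
+ //     ctx = is_altref_frame ? 1 : 0;  // encoder policy for other inter frames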
+ + if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cpi->ext_flags.use_primary_ref_none) { + av1_setup_past_independence(cm); + } + + if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) || + frame_is_sframe(cm)) { + if (!cpi->ppi->seq_params_locked) { + set_sb_size(cm->seq_params, + av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, + cpi->ppi->number_spatial_layers)); + } + } else { + const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm); + if (primary_ref_buf == NULL) { + av1_setup_past_independence(cm); + cm->seg.update_map = 1; + cm->seg.update_data = 1; + } else { + *cm->fc = primary_ref_buf->frame_context; + } + } + + av1_zero(cm->cur_frame->interp_filter_selected); + cm->prev_frame = get_primary_ref_frame_buf(cm); + cpi->vaq_refresh = 0; +} + +#if !CONFIG_REALTIME_ONLY +static int get_interp_filter_selected(const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref, + InterpFilter ifilter) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + if (buf == NULL) return 0; + return buf->interp_filter_selected[ifilter]; +} + +uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + int ref_total[REF_FRAMES] = { 0 }; + uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK; + + if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_frame.alt_ref_frame) + return mask; + + for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter); + } + } + int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] + + ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] + + ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]); + + for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30; + if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) { + int filter_score = + get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10; + if (filter_score < ref_total_total) { + DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter; + reset_interp_filter_allowed_mask(&mask, filt_type); + } + } + } + return mask; +} + +#define STRICT_PSNR_DIFF_THRESH 0.9 +// Encode key frame with/without screen content tools to determine whether +// screen content tools should be enabled for this key frame group or not. +// The first encoding is without screen content tools. +// The second encoding is with screen content tools. +// We compare the psnr and frame size to make the decision. 
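+// For example, with STRICT_PSNR_DIFF_THRESH at 0.9 dB: if the pass without the
+// tools reaches 40.0 dB and the pass with them reaches 41.1 dB, the 1.1 dB gain
+// enables the tools outright. A smaller gain such as 0.5 dB can still enable
+// them when palette mode covers enough of the frame, e.g. a palette ratio of
+// 0.02 gives 0.5 / 0.02 = 25 > 4.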
+static void screen_content_tools_determination( + AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision, + const int allow_intrabc_orig_decision, + const int use_screen_content_tools_orig_decision, + const int is_screen_content_type_orig_decision, const int pass, + int *projected_size_pass, PSNR_STATS *psnr) { + AV1_COMMON *const cm = &cpi->common; + FeatureFlags *const features = &cm->features; + +#if CONFIG_FPMT_TEST + projected_size_pass[pass] = + ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) && + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)) + ? cpi->ppi->p_rc.temp_projected_frame_size + : cpi->rc.projected_frame_size; +#else + projected_size_pass[pass] = cpi->rc.projected_frame_size; +#endif + +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass], + bit_depth, in_bit_depth); +#else + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]); +#endif + if (pass != 1) return; + + const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0]; + // Calculate % of palette mode to be chosen in a frame from mode decision. + const double palette_ratio = + (double)cpi->palette_pixel_num / (double)(cm->height * cm->width); + const int psnr_diff_is_large = (psnr_diff > STRICT_PSNR_DIFF_THRESH); + const int ratio_is_large = + ((palette_ratio >= 0.0001) && ((psnr_diff / palette_ratio) > 4)); + const int is_sc_encoding_much_better = (psnr_diff_is_large || ratio_is_large); + if (is_sc_encoding_much_better) { + // Use screen content tools, if we get coding gain. + features->allow_screen_content_tools = 1; + features->allow_intrabc = cpi->intrabc_used; + cpi->use_screen_content_tools = 1; + cpi->is_screen_content_type = 1; + } else { + // Use original screen content decision. + features->allow_screen_content_tools = + allow_screen_content_tools_orig_decision; + features->allow_intrabc = allow_intrabc_orig_decision; + cpi->use_screen_content_tools = use_screen_content_tools_orig_decision; + cpi->is_screen_content_type = is_screen_content_type_orig_decision; + } +} + +// Set some encoding parameters to make the encoding process fast. +// A fixed block partition size, and a large q is used. +static void set_encoding_params_for_screen_content(AV1_COMP *cpi, + const int pass) { + AV1_COMMON *const cm = &cpi->common; + if (pass == 0) { + // In the first pass, encode without screen content tools. + // Use a high q, and a fixed block size for fast encoding. + cm->features.allow_screen_content_tools = 0; + cm->features.allow_intrabc = 0; + cpi->use_screen_content_tools = 0; + cpi->sf.part_sf.partition_search_type = FIXED_PARTITION; + cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32; + return; + } + assert(pass == 1); + // In the second pass, encode with screen content tools. + // Use a high q, and a fixed block size for fast encoding. + cm->features.allow_screen_content_tools = 1; + // TODO(chengchen): turn intrabc on could lead to data race issue. + // cm->allow_intrabc = 1; + cpi->use_screen_content_tools = 1; + cpi->sf.part_sf.partition_search_type = FIXED_PARTITION; + cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32; +} + +// Determines whether to use screen content tools for the key frame group. +// This function modifies "cm->features.allow_screen_content_tools", +// "cm->features.allow_intrabc" and "cpi->use_screen_content_tools". 
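+// Pass 0 of the trial encodes the key frame without screen content tools and
+// pass 1 encodes it with them. Both passes use fixed BLOCK_32X32 partitions
+// and a high quantizer (q_orig raised to at least 244 unless lossless coding
+// is requested), so the comparison isolates the effect of the tools themselves.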
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) { + AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const QuantizationCfg *const q_cfg = &oxcf->q_cfg; + // Variables to help determine if we should allow screen content tools. + int projected_size_pass[3] = { 0 }; + PSNR_STATS psnr[3]; + const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME; + const int allow_screen_content_tools_orig_decision = + cm->features.allow_screen_content_tools; + const int allow_intrabc_orig_decision = cm->features.allow_intrabc; + const int use_screen_content_tools_orig_decision = + cpi->use_screen_content_tools; + const int is_screen_content_type_orig_decision = cpi->is_screen_content_type; + // Turn off the encoding trial for forward key frame and superres. + if (cpi->sf.rt_sf.use_nonrd_pick_mode || oxcf->kf_cfg.fwd_kf_enabled || + cpi->superres_mode != AOM_SUPERRES_NONE || oxcf->mode == REALTIME || + use_screen_content_tools_orig_decision || !is_key_frame) { + return; + } + + // TODO(chengchen): multiple encoding for the lossless mode is time consuming. + // Find a better way to determine whether screen content tools should be used + // for lossless coding. + // Use a high q and a fixed partition to do quick encoding. + const int q_for_screen_content_quick_run = + is_lossless_requested(&oxcf->rc_cfg) ? q_orig : AOMMAX(q_orig, 244); + const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type; + const BLOCK_SIZE fixed_partition_block_size_orig = + cpi->sf.part_sf.fixed_partition_size; + + // Setup necessary params for encoding, including frame source, etc. + + cpi->source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, + 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, + cpi->image_pyramid_levels); + } + + av1_setup_frame(cpi); + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + // The two encoding passes aim to help determine whether to use screen + // content tools, with a high q and fixed partition. 
+ for (int pass = 0; pass < 2; ++pass) { + set_encoding_params_for_screen_content(cpi, pass); + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, + q_for_screen_content_quick_run, + q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq); + av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + + av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run, + 0); + // transform / motion compensation build reconstruction frame + av1_encode_frame(cpi); + // Screen content decision + screen_content_tools_determination( + cpi, allow_screen_content_tools_orig_decision, + allow_intrabc_orig_decision, use_screen_content_tools_orig_decision, + is_screen_content_type_orig_decision, pass, projected_size_pass, psnr); + } + + // Set partition speed feature back. + cpi->sf.part_sf.partition_search_type = partition_search_type_orig; + cpi->sf.part_sf.fixed_partition_size = fixed_partition_block_size_orig; + + // Free token related info if screen content coding tools are not enabled. + if (!cm->features.allow_screen_content_tools) + free_token_info(&cpi->token_info); +} +#endif // CONFIG_REALTIME_ONLY + +static void fix_interp_filter(InterpFilter *const interp_filter, + const FRAME_COUNTS *const counts) { + if (*interp_filter == SWITCHABLE) { + // Check to see if only one of the filters is actually used + int count[SWITCHABLE_FILTERS] = { 0 }; + int num_filters_used = 0; + for (int i = 0; i < SWITCHABLE_FILTERS; ++i) { + for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) + count[i] += counts->switchable_interp[j][i]; + num_filters_used += (count[i] > 0); + } + if (num_filters_used == 1) { + // Only one filter is used. So set the filter at frame level + for (int i = 0; i < SWITCHABLE_FILTERS; ++i) { + if (count[i]) { + *interp_filter = i; + break; + } + } + } + } +} + +void av1_finalize_encoded_frame(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + + if (!cm->seq_params->reduced_still_picture_hdr && + encode_show_existing_frame(cm)) { + RefCntBuffer *const frame_to_show = + cm->ref_frame_map[cpi->existing_fb_idx_to_show]; + + if (frame_to_show == NULL) { + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer does not contain a reconstructed frame"); + } + assert(frame_to_show->ref_count > 0); + assign_frame_buffer_p(&cm->cur_frame, frame_to_show); + } + + if (!encode_show_existing_frame(cm) && + cm->seq_params->film_grain_params_present && + (cm->show_frame || cm->showable_frame)) { + // Copy the current frame's film grain params to the its corresponding + // RefCntBuffer slot. + cm->cur_frame->film_grain_params = cm->film_grain_params; + + // We must update the parameters if this is not an INTER_FRAME + if (current_frame->frame_type != INTER_FRAME) + cm->cur_frame->film_grain_params.update_parameters = 1; + + // Iterate the random seed for the next frame. 
+ cm->film_grain_params.random_seed += 3381; + if (cm->film_grain_params.random_seed == 0) + cm->film_grain_params.random_seed = 7391; + } + + // Initialise all tiles' contexts from the global frame context + for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) { + for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) { + const int tile_idx = tile_row * cm->tiles.cols + tile_col; + cpi->tile_data[tile_idx].tctx = *cm->fc; + } + } + + if (!frame_is_intra_only(cm)) + fix_interp_filter(&cm->features.interp_filter, cpi->td.counts); +} + +int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture, + const YV12_BUFFER_CONFIG *last_picture, + ForceIntegerMVInfo *const force_intpel_info) { + // check use hash ME + int k; + + const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE; + const double threshold_current = 0.8; + const double threshold_average = 0.95; + const int max_history_size = 32; + int T = 0; // total block + int C = 0; // match with collocated block + int S = 0; // smooth region but not match with collocated block + + const int pic_width = cur_picture->y_width; + const int pic_height = cur_picture->y_height; + for (int i = 0; i + block_size <= pic_height; i += block_size) { + for (int j = 0; j + block_size <= pic_width; j += block_size) { + const int x_pos = j; + const int y_pos = i; + int match = 1; + T++; + + // check whether collocated block match with current + uint8_t *p_cur = cur_picture->y_buffer; + uint8_t *p_ref = last_picture->y_buffer; + int stride_cur = cur_picture->y_stride; + int stride_ref = last_picture->y_stride; + p_cur += (y_pos * stride_cur + x_pos); + p_ref += (y_pos * stride_ref + x_pos); + + if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur); + uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref); + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p16_cur[tmpX] != p16_ref[tmpX]) { + match = 0; + } + } + p16_cur += stride_cur; + p16_ref += stride_ref; + } + } else { + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p_cur[tmpX] != p_ref[tmpX]) { + match = 0; + } + } + p_cur += stride_cur; + p_ref += stride_ref; + } + } + + if (match) { + C++; + continue; + } + + if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos, + y_pos) || + av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) { + S++; + continue; + } + } + } + + assert(T > 0); + double cs_rate = ((double)(C + S)) / ((double)(T)); + + force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate; + + force_intpel_info->rate_index = + (force_intpel_info->rate_index + 1) % max_history_size; + force_intpel_info->rate_size++; + force_intpel_info->rate_size = + AOMMIN(force_intpel_info->rate_size, max_history_size); + + if (cs_rate < threshold_current) { + return 0; + } + + if (C == T) { + return 1; + } + + double cs_average = 0.0; + + for (k = 0; k < force_intpel_info->rate_size; k++) { + cs_average += force_intpel_info->cs_rate_array[k]; + } + cs_average /= force_intpel_info->rate_size; + + if (cs_average < threshold_average) { + return 0; + } + + if ((T - C - S) < 0) { + return 1; + } + + if (cs_average > 1.01) { + return 1; + } + + return 0; +} + +void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + uint8_t *y_buffer = cpi->source->y_buffer; 
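+ // Outline of the computation below: the 8x8 source variance is averaged over
+ // each 16x16 block, mapped through the fitted curve
+ // 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222 (range ~[17.49, 84.53]),
+ // and each result is then divided by the geometric mean over all blocks so
+ // that the final rdmult scaling factors (roughly [0.21, 4.83]) have a
+ // geometric mean of 1.0.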
+ const int y_stride = cpi->source->y_stride; + const int block_size = BLOCK_16X16; + + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h; + double log_sum = 0.0; + + // Loop through each 16x16 block. + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + double var = 0.0, num_of_var = 0.0; + const int index = row * num_cols + col; + + // Loop through each 8x8 block. + for (int mi_row = row * num_mi_h; + mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h; + mi_row += 2) { + for (int mi_col = col * num_mi_w; + mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w; + mi_col += 2) { + struct buf_2d buf; + const int row_offset_y = mi_row << 2; + const int col_offset_y = mi_col << 2; + + buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + var += av1_get_perpixel_variance_facade(cpi, xd, &buf, BLOCK_8X8, + AOM_PLANE_Y); + num_of_var += 1.0; + } + } + var = var / num_of_var; + + // Curve fitting with an exponential model on all 16x16 blocks from the + // midres dataset. + var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222; + + // As per the above computation, var will be in the range of + // [17.492222, 84.527656], assuming the data type is of infinite + // precision. The following assert conservatively checks if var is in the + // range of [17.0, 85.0] to avoid any issues due to the precision of the + // relevant data type. + assert(var > 17.0 && var < 85.0); + cpi->ssim_rdmult_scaling_factors[index] = var; + log_sum += log(var); + } + } + + // As log_sum holds the geometric mean, it will be in the range + // [17.492222, 84.527656]. Hence, in the below loop, the value of + // cpi->ssim_rdmult_scaling_factors[index] would be in the range + // [0.2069, 4.8323]. + log_sum = exp(log_sum / (double)(num_rows * num_cols)); + + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + cpi->ssim_rdmult_scaling_factors[index] /= log_sum; + } + } +} + +// Coding context that only needs to be saved when recode loop includes +// filtering (deblocking, CDEF, superres post-encode upscale and/or loop +// restoraton). +static void save_extra_coding_context(AV1_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + AV1_COMMON *cm = &cpi->common; + + cc->lf = cm->lf; + cc->cdef_info = cm->cdef_info; + cc->rc = cpi->rc; + cc->mv_stats = cpi->ppi->mv_stats; +} + +void av1_save_all_coding_context(AV1_COMP *cpi) { + save_extra_coding_context(cpi); + if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi); +} + +#if DUMP_RECON_FRAMES == 1 + +// NOTE(zoeliu): For debug - Output the filtered reconstructed video. 
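+// The dump is appended to /tmp/enc_filtered_recon.yuv as planar 4:2:0 data
+// (Y plane followed by U and V at half resolution); no-show frames are skipped.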
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const CurrentFrame *const current_frame = &cm->current_frame; + const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf; + + if (recon_buf == NULL) { + printf("Frame %d is not ready.\n", current_frame->frame_number); + return; + } + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + printf( + "\n***Frame=%d (frame_offset=%d, show_frame=%d, " + "show_existing_frame=%d) " + "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[", + current_frame->frame_number, current_frame->order_hint, cm->show_frame, + cm->show_existing_frame); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + const int ref_offset = buf != NULL ? (int)buf->order_hint : -1; + printf(" %d(%c)", ref_offset, + (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N'); + } + printf(" ]\n"); + + if (!cm->show_frame) { + printf("Frame %d is a no show frame, so no image dump.\n", + current_frame->frame_number); + return; + } + + int h; + char file_name[256] = "/tmp/enc_filtered_recon.yuv"; + FILE *f_recon = NULL; + + if (current_frame->frame_number == 0) { + if ((f_recon = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return; + } + } else { + if ((f_recon = fopen(file_name, "ab")) == NULL) { + printf("Unable to open file %s to append.\n", file_name); + return; + } + } + printf( + "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, " + "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " + "refresh_alt_ref_frame=%d, " + "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", + current_frame->frame_number, cpi->gf_frame_index, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], + current_frame->order_hint, cm->show_frame, cm->show_existing_frame, + cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame, + recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height); +#if 0 + int ref_frame; + printf("get_ref_frame_map_idx: ["); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + printf(" %d", get_ref_frame_map_idx(cm, ref_frame)); + printf(" ]\n"); +#endif // 0 + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width, + f_recon); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + + fclose(f_recon); +} +#endif // DUMP_RECON_FRAMES diff --git a/third_party/aom/av1/encoder/encoder_utils.h b/third_party/aom/av1/encoder/encoder_utils.h new file mode 100644 index 0000000000..113f62aa59 --- /dev/null +++ b/third_party/aom/av1/encoder/encoder_utils.h @@ -0,0 +1,1141 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODER_UTILS_H_ +#define AOM_AV1_ENCODER_ENCODER_UTILS_H_ + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define AM_SEGMENT_ID_INACTIVE 7 +#define AM_SEGMENT_ID_ACTIVE 0 +#define DUMP_RECON_FRAMES 0 + +extern const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL] + [TX_TYPES]; + +extern const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL]; + +extern const int default_warped_probs[FRAME_UPDATE_TYPES]; + +extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES] + [SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; + +// Mark all inactive blocks as active. Other segmentation features may be set +// so memset cannot be used, instead only inactive blocks should be reset. +static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) { + unsigned char *const seg_map = cpi->enc_seg.map; + int i; + const int num_mis = + cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; + if (cpi->active_map.enabled || cpi->active_map.update) + for (i = 0; i < num_mis; ++i) + if (seg_map[i] == AM_SEGMENT_ID_INACTIVE) + seg_map[i] = AM_SEGMENT_ID_ACTIVE; +} + +// Returns 'size' in the number of Mode Info (MI) units. 'size' is either the +// width or height. +static AOM_INLINE int size_in_mi(int size) { + // Ensure that the decoded width and height are both multiples of + // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if + // subsampling is used). + // This simplifies the implementation of various experiments, + // eg. cdef, which operates on units of 8x8 luma pixels. 
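+ // For example, a decoded width of 1921 luma pixels is aligned up to 1928 and
+ // converted to 1928 >> MI_SIZE_LOG2 = 482 MI units (each MI unit is 4x4 luma).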
+ const int aligned_size = ALIGN_POWER_OF_TWO(size, 3); + return aligned_size >> MI_SIZE_LOG2; +} + +static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height) { + mi_params->mi_cols = size_in_mi(width); + mi_params->mi_rows = size_in_mi(height); + mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); + + mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2); + mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2); + mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols; + + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + mi_params->mi_alloc_stride = + (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + + assert(mi_size_wide[mi_params->mi_alloc_bsize] == + mi_size_high[mi_params->mi_alloc_bsize]); +} + +static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) { + aom_free(mi_params->mi_alloc); + mi_params->mi_alloc = NULL; + mi_params->mi_alloc_size = 0; + aom_free(mi_params->mi_grid_base); + mi_params->mi_grid_base = NULL; + mi_params->mi_grid_size = 0; + aom_free(mi_params->tx_type_map); + mi_params->tx_type_map = NULL; +} + +static AOM_INLINE void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height, + BLOCK_SIZE min_partition_size) { + mi_params->mi_alloc_bsize = min_partition_size; + + set_mb_mi(mi_params, width, height); +} + +static AOM_INLINE void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, + int width, int height, + BLOCK_SIZE min_partition_size) { + (void)min_partition_size; + mi_params->mi_alloc_bsize = BLOCK_16X16; + + set_mb_mi(mi_params, width, height); +} + +static AOM_INLINE void enc_setup_mi(CommonModeInfoParams *mi_params) { + const int mi_grid_size = + mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); + memset(mi_params->mi_alloc, 0, + mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc)); + memset(mi_params->mi_grid_base, 0, + mi_grid_size * sizeof(*mi_params->mi_grid_base)); + memset(mi_params->tx_type_map, 0, + mi_grid_size * sizeof(*mi_params->tx_type_map)); +} + +static AOM_INLINE void init_buffer_indices( + ForceIntegerMVInfo *const force_intpel_info, int *const remapped_ref_idx) { + int fb_idx; + for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) + remapped_ref_idx[fb_idx] = fb_idx; + force_intpel_info->rate_index = 0; + force_intpel_info->rate_size = 0; +} + +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \ + ppi->fn_ptr[BT].sdf = SDF; \ + ppi->fn_ptr[BT].sdaf = SDAF; \ + ppi->fn_ptr[BT].vf = VF; \ + ppi->fn_ptr[BT].svf = SVF; \ + ppi->fn_ptr[BT].svaf = SVAF; \ + ppi->fn_ptr[BT].sdx4df = SDX4DF; \ + ppi->fn_ptr[BT].sdx3df = SDX3DF; \ + ppi->fn_ptr[BT].jsdaf = JSDAF; \ + ppi->fn_ptr[BT].jsvaf = JSVAF; + +#define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \ + HIGHBD_BFP( \ + BLOCK_##WIDTH##X##HEIGHT, aom_highbd_sad##WIDTH##x##HEIGHT##_bits##BD, \ + aom_highbd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \ + aom_highbd_##BD##_variance##WIDTH##x##HEIGHT, \ + aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT, \ + aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT, \ + aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD, \ + aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD, \ + aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \ + aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT) + +#define MAKE_BFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ + int source_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return fnname(src_ptr, 
source_stride, ref_ptr, ref_stride); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \ + } + +#define MAKE_BFP_SADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 4; \ + } + +#define MAKE_BFP_SAD4D_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ + } + +#define MAKE_BFP_JSADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128) 
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x3d) + +#if !CONFIG_REALTIME_ONLY +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x3d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x3d) 
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x3d) +#endif + +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg) +#if !CONFIG_REALTIME_ONLY +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg) +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH + +#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ + ppi->fn_ptr[BT].msdf = MCSDF; \ + ppi->fn_ptr[BT].msvf = MCSVF; + +#define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD) \ + HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT, \ + aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \ + aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT) + +#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32) 
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4) +#if !CONFIG_REALTIME_ONLY +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16) +#endif +#endif + +#define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \ + ppi->fn_ptr[BT].sdsf = SDSF; \ + ppi->fn_ptr[BT].sdsx4df = SDSX4DF; + +#define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD) \ + HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT, \ + aom_highbd_sad_skip_##WIDTH##x##HEIGHT##_bits##BD, \ + aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d##_bits##BD) + +#define MAKE_SDSF_SKIP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return fnname(src, src_stride, ref, ref_stride); \ + } \ + static unsigned int fnname##_bits10(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return fnname(src, src_stride, ref, ref_stride) >> 2; \ + } \ + static unsigned int fnname##_bits12(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return fnname(src, src_stride, ref, ref_stride) >> 4; \ + } + +#define MAKE_SDSF_SKIP_SAD_4D_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x128) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x64) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x128) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x64) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x32) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x64) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x32) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x32) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x8) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x8) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x8) + +#if !CONFIG_REALTIME_ONLY +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x8) 
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x64) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x16) +MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x32) +#endif + +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x128x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x64x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x128x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x64x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x32x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x64x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x32x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x32x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x8x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x8x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x8x4d) + +#if !CONFIG_REALTIME_ONLY +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x8x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x64x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x16x4d) +MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d) +#endif +#endif + +#if !CONFIG_REALTIME_ONLY + +#if CONFIG_AV1_HIGHBITDEPTH +#define HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT) \ + HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \ + aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits8, \ + aom_highbd_8_obmc_variance##WIDTH##x##HEIGHT, \ + aom_highbd_8_obmc_sub_pixel_variance##WIDTH##x##HEIGHT) + +#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ + ppi->fn_ptr[BT].osdf = OSDF; \ + ppi->fn_ptr[BT].ovf = OVF; \ + ppi->fn_ptr[BT].osvf = OSVF; + +#define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD) \ + HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \ + aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits##BD, \ + aom_highbd_##BD##_obmc_variance##WIDTH##x##HEIGHT, \ + aom_highbd_##BD##_obmc_sub_pixel_variance##WIDTH##x##HEIGHT) + +#define MAKE_OBFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk); \ + } \ + static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 2; \ + } \ + static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 4; \ + } +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_AV1_HIGHBITDEPTH +#if !CONFIG_REALTIME_ONLY +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8) 
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) +#endif + +static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) { + SequenceHeader *const seq_params = &ppi->seq_params; + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { + case AOM_BITS_8: +#if !CONFIG_REALTIME_ONLY + HIGHBD_BFP_WRAPPER(64, 16, 8) + HIGHBD_BFP_WRAPPER(16, 64, 8) + HIGHBD_BFP_WRAPPER(32, 8, 8) + HIGHBD_BFP_WRAPPER(8, 32, 8) + HIGHBD_BFP_WRAPPER(16, 4, 8) + HIGHBD_BFP_WRAPPER(4, 16, 8) +#endif + HIGHBD_BFP_WRAPPER(32, 16, 8) + HIGHBD_BFP_WRAPPER(16, 32, 8) + HIGHBD_BFP_WRAPPER(64, 32, 8) + HIGHBD_BFP_WRAPPER(32, 64, 8) + HIGHBD_BFP_WRAPPER(32, 32, 8) + HIGHBD_BFP_WRAPPER(64, 64, 8) + HIGHBD_BFP_WRAPPER(16, 16, 8) + HIGHBD_BFP_WRAPPER(16, 8, 8) + HIGHBD_BFP_WRAPPER(8, 16, 8) + HIGHBD_BFP_WRAPPER(8, 8, 8) + HIGHBD_BFP_WRAPPER(8, 4, 8) + HIGHBD_BFP_WRAPPER(4, 8, 8) + HIGHBD_BFP_WRAPPER(4, 4, 8) + HIGHBD_BFP_WRAPPER(128, 128, 8) + HIGHBD_BFP_WRAPPER(128, 64, 8) + HIGHBD_BFP_WRAPPER(64, 128, 8) + + HIGHBD_MBFP_WRAPPER(128, 128, 8) + HIGHBD_MBFP_WRAPPER(128, 64, 8) + HIGHBD_MBFP_WRAPPER(64, 128, 8) + HIGHBD_MBFP_WRAPPER(64, 64, 8) + HIGHBD_MBFP_WRAPPER(64, 32, 8) + HIGHBD_MBFP_WRAPPER(32, 64, 8) + HIGHBD_MBFP_WRAPPER(32, 32, 8) + HIGHBD_MBFP_WRAPPER(32, 16, 8) + HIGHBD_MBFP_WRAPPER(16, 32, 8) + HIGHBD_MBFP_WRAPPER(16, 16, 8) + HIGHBD_MBFP_WRAPPER(8, 16, 8) + HIGHBD_MBFP_WRAPPER(16, 8, 8) + HIGHBD_MBFP_WRAPPER(8, 8, 8) + HIGHBD_MBFP_WRAPPER(4, 8, 8) + HIGHBD_MBFP_WRAPPER(8, 4, 8) + HIGHBD_MBFP_WRAPPER(4, 4, 8) +#if !CONFIG_REALTIME_ONLY + HIGHBD_MBFP_WRAPPER(64, 16, 8) + HIGHBD_MBFP_WRAPPER(16, 64, 8) + HIGHBD_MBFP_WRAPPER(32, 8, 8) + HIGHBD_MBFP_WRAPPER(8, 32, 8) + HIGHBD_MBFP_WRAPPER(16, 4, 8) + HIGHBD_MBFP_WRAPPER(4, 16, 8) +#endif + +// OBMC excluded from realtime only build. 
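+// For reference, each HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT) invocation below
+// expands via HIGHBD_OBFP into three fn_ptr assignments; e.g. for 128x128:
+//   ppi->fn_ptr[BLOCK_128X128].osdf = aom_highbd_obmc_sad128x128_bits8;
+//   ppi->fn_ptr[BLOCK_128X128].ovf = aom_highbd_8_obmc_variance128x128;
+//   ppi->fn_ptr[BLOCK_128X128].osvf =
+//       aom_highbd_8_obmc_sub_pixel_variance128x128;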
+#if !CONFIG_REALTIME_ONLY + HIGHBD_OBFP_WRAPPER_8(128, 128) + HIGHBD_OBFP_WRAPPER_8(128, 64) + HIGHBD_OBFP_WRAPPER_8(64, 128) + HIGHBD_OBFP_WRAPPER_8(64, 64) + HIGHBD_OBFP_WRAPPER_8(64, 32) + HIGHBD_OBFP_WRAPPER_8(32, 64) + HIGHBD_OBFP_WRAPPER_8(32, 32) + HIGHBD_OBFP_WRAPPER_8(32, 16) + HIGHBD_OBFP_WRAPPER_8(16, 32) + HIGHBD_OBFP_WRAPPER_8(16, 16) + HIGHBD_OBFP_WRAPPER_8(8, 16) + HIGHBD_OBFP_WRAPPER_8(16, 8) + HIGHBD_OBFP_WRAPPER_8(8, 8) + HIGHBD_OBFP_WRAPPER_8(4, 8) + HIGHBD_OBFP_WRAPPER_8(8, 4) + HIGHBD_OBFP_WRAPPER_8(4, 4) + HIGHBD_OBFP_WRAPPER_8(64, 16) + HIGHBD_OBFP_WRAPPER_8(16, 64) + HIGHBD_OBFP_WRAPPER_8(32, 8) + HIGHBD_OBFP_WRAPPER_8(8, 32) + HIGHBD_OBFP_WRAPPER_8(16, 4) + HIGHBD_OBFP_WRAPPER_8(4, 16) +#endif + + HIGHBD_SDSFP_WRAPPER(128, 128, 8) + HIGHBD_SDSFP_WRAPPER(128, 64, 8) + HIGHBD_SDSFP_WRAPPER(64, 128, 8) + HIGHBD_SDSFP_WRAPPER(64, 64, 8) + HIGHBD_SDSFP_WRAPPER(64, 32, 8) + HIGHBD_SDSFP_WRAPPER(32, 64, 8) + HIGHBD_SDSFP_WRAPPER(32, 32, 8) + HIGHBD_SDSFP_WRAPPER(32, 16, 8) + HIGHBD_SDSFP_WRAPPER(16, 32, 8) + HIGHBD_SDSFP_WRAPPER(16, 16, 8) + HIGHBD_SDSFP_WRAPPER(16, 8, 8) + HIGHBD_SDSFP_WRAPPER(8, 16, 8) + HIGHBD_SDSFP_WRAPPER(8, 8, 8) + HIGHBD_SDSFP_WRAPPER(4, 8, 8) +#if !CONFIG_REALTIME_ONLY + HIGHBD_SDSFP_WRAPPER(64, 16, 8) + HIGHBD_SDSFP_WRAPPER(32, 8, 8) + HIGHBD_SDSFP_WRAPPER(16, 64, 8) + HIGHBD_SDSFP_WRAPPER(8, 32, 8) + HIGHBD_SDSFP_WRAPPER(4, 16, 8) +#endif + break; + + case AOM_BITS_10: +#if !CONFIG_REALTIME_ONLY + HIGHBD_BFP_WRAPPER(64, 16, 10) + HIGHBD_BFP_WRAPPER(16, 64, 10) + HIGHBD_BFP_WRAPPER(32, 8, 10) + HIGHBD_BFP_WRAPPER(8, 32, 10) + HIGHBD_BFP_WRAPPER(16, 4, 10) + HIGHBD_BFP_WRAPPER(4, 16, 10) +#endif + HIGHBD_BFP_WRAPPER(32, 16, 10) + HIGHBD_BFP_WRAPPER(16, 32, 10) + HIGHBD_BFP_WRAPPER(64, 32, 10) + HIGHBD_BFP_WRAPPER(32, 64, 10) + HIGHBD_BFP_WRAPPER(32, 32, 10) + HIGHBD_BFP_WRAPPER(64, 64, 10) + HIGHBD_BFP_WRAPPER(16, 16, 10) + HIGHBD_BFP_WRAPPER(16, 8, 10) + HIGHBD_BFP_WRAPPER(8, 16, 10) + HIGHBD_BFP_WRAPPER(8, 8, 10) + HIGHBD_BFP_WRAPPER(8, 4, 10) + HIGHBD_BFP_WRAPPER(4, 8, 10) + HIGHBD_BFP_WRAPPER(4, 4, 10) + HIGHBD_BFP_WRAPPER(128, 128, 10) + HIGHBD_BFP_WRAPPER(128, 64, 10) + HIGHBD_BFP_WRAPPER(64, 128, 10) + + HIGHBD_MBFP_WRAPPER(128, 128, 10) + HIGHBD_MBFP_WRAPPER(128, 64, 10) + HIGHBD_MBFP_WRAPPER(64, 128, 10) + HIGHBD_MBFP_WRAPPER(64, 64, 10) + HIGHBD_MBFP_WRAPPER(64, 32, 10) + HIGHBD_MBFP_WRAPPER(32, 64, 10) + HIGHBD_MBFP_WRAPPER(32, 32, 10) + HIGHBD_MBFP_WRAPPER(32, 16, 10) + HIGHBD_MBFP_WRAPPER(16, 32, 10) + HIGHBD_MBFP_WRAPPER(16, 16, 10) + HIGHBD_MBFP_WRAPPER(8, 16, 10) + HIGHBD_MBFP_WRAPPER(16, 8, 10) + HIGHBD_MBFP_WRAPPER(8, 8, 10) + HIGHBD_MBFP_WRAPPER(4, 8, 10) + HIGHBD_MBFP_WRAPPER(8, 4, 10) + HIGHBD_MBFP_WRAPPER(4, 4, 10) +#if !CONFIG_REALTIME_ONLY + HIGHBD_MBFP_WRAPPER(64, 16, 10) + HIGHBD_MBFP_WRAPPER(16, 64, 10) + HIGHBD_MBFP_WRAPPER(32, 8, 10) + HIGHBD_MBFP_WRAPPER(8, 32, 10) + HIGHBD_MBFP_WRAPPER(16, 4, 10) + HIGHBD_MBFP_WRAPPER(4, 16, 10) +#endif + +// OBMC excluded from realtime only build. 
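+// Note: the *_bits10 wrappers bound in this case right-shift the raw
+// high-bitdepth SAD by 2 (and the *_bits12 wrappers by 4; see the
+// MAKE_*_WRAPPER macros above), which should keep distortion values on a
+// roughly comparable scale across bit depths.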
+#if !CONFIG_REALTIME_ONLY + HIGHBD_OBFP_WRAPPER(128, 128, 10) + HIGHBD_OBFP_WRAPPER(128, 64, 10) + HIGHBD_OBFP_WRAPPER(64, 128, 10) + HIGHBD_OBFP_WRAPPER(64, 64, 10) + HIGHBD_OBFP_WRAPPER(64, 32, 10) + HIGHBD_OBFP_WRAPPER(32, 64, 10) + HIGHBD_OBFP_WRAPPER(32, 32, 10) + HIGHBD_OBFP_WRAPPER(32, 16, 10) + HIGHBD_OBFP_WRAPPER(16, 32, 10) + HIGHBD_OBFP_WRAPPER(16, 16, 10) + HIGHBD_OBFP_WRAPPER(8, 16, 10) + HIGHBD_OBFP_WRAPPER(16, 8, 10) + HIGHBD_OBFP_WRAPPER(8, 8, 10) + HIGHBD_OBFP_WRAPPER(4, 8, 10) + HIGHBD_OBFP_WRAPPER(8, 4, 10) + HIGHBD_OBFP_WRAPPER(4, 4, 10) + HIGHBD_OBFP_WRAPPER(64, 16, 10) + HIGHBD_OBFP_WRAPPER(16, 64, 10) + HIGHBD_OBFP_WRAPPER(32, 8, 10) + HIGHBD_OBFP_WRAPPER(8, 32, 10) + HIGHBD_OBFP_WRAPPER(16, 4, 10) + HIGHBD_OBFP_WRAPPER(4, 16, 10) +#endif + + HIGHBD_SDSFP_WRAPPER(128, 128, 10) + HIGHBD_SDSFP_WRAPPER(128, 64, 10) + HIGHBD_SDSFP_WRAPPER(64, 128, 10) + HIGHBD_SDSFP_WRAPPER(64, 64, 10) + HIGHBD_SDSFP_WRAPPER(64, 32, 10) + HIGHBD_SDSFP_WRAPPER(32, 64, 10) + HIGHBD_SDSFP_WRAPPER(32, 32, 10) + HIGHBD_SDSFP_WRAPPER(32, 16, 10) + HIGHBD_SDSFP_WRAPPER(16, 32, 10) + HIGHBD_SDSFP_WRAPPER(16, 16, 10) + HIGHBD_SDSFP_WRAPPER(16, 8, 10) + HIGHBD_SDSFP_WRAPPER(8, 16, 10) + HIGHBD_SDSFP_WRAPPER(8, 8, 10) + HIGHBD_SDSFP_WRAPPER(4, 8, 10) + +#if !CONFIG_REALTIME_ONLY + HIGHBD_SDSFP_WRAPPER(64, 16, 10) + HIGHBD_SDSFP_WRAPPER(32, 8, 10) + HIGHBD_SDSFP_WRAPPER(16, 64, 10) + HIGHBD_SDSFP_WRAPPER(8, 32, 10) + HIGHBD_SDSFP_WRAPPER(4, 16, 10) +#endif + break; + + case AOM_BITS_12: +#if !CONFIG_REALTIME_ONLY + HIGHBD_BFP_WRAPPER(64, 16, 12) + HIGHBD_BFP_WRAPPER(16, 64, 12) + HIGHBD_BFP_WRAPPER(32, 8, 12) + HIGHBD_BFP_WRAPPER(8, 32, 12) + HIGHBD_BFP_WRAPPER(16, 4, 12) + HIGHBD_BFP_WRAPPER(4, 16, 12) +#endif + HIGHBD_BFP_WRAPPER(32, 16, 12) + HIGHBD_BFP_WRAPPER(16, 32, 12) + HIGHBD_BFP_WRAPPER(64, 32, 12) + HIGHBD_BFP_WRAPPER(32, 64, 12) + HIGHBD_BFP_WRAPPER(32, 32, 12) + HIGHBD_BFP_WRAPPER(64, 64, 12) + HIGHBD_BFP_WRAPPER(16, 16, 12) + HIGHBD_BFP_WRAPPER(16, 8, 12) + HIGHBD_BFP_WRAPPER(8, 16, 12) + HIGHBD_BFP_WRAPPER(8, 8, 12) + HIGHBD_BFP_WRAPPER(8, 4, 12) + HIGHBD_BFP_WRAPPER(4, 8, 12) + HIGHBD_BFP_WRAPPER(4, 4, 12) + HIGHBD_BFP_WRAPPER(128, 128, 12) + HIGHBD_BFP_WRAPPER(128, 64, 12) + HIGHBD_BFP_WRAPPER(64, 128, 12) + + HIGHBD_MBFP_WRAPPER(128, 128, 12) + HIGHBD_MBFP_WRAPPER(128, 64, 12) + HIGHBD_MBFP_WRAPPER(64, 128, 12) + HIGHBD_MBFP_WRAPPER(64, 64, 12) + HIGHBD_MBFP_WRAPPER(64, 32, 12) + HIGHBD_MBFP_WRAPPER(32, 64, 12) + HIGHBD_MBFP_WRAPPER(32, 32, 12) + HIGHBD_MBFP_WRAPPER(32, 16, 12) + HIGHBD_MBFP_WRAPPER(16, 32, 12) + HIGHBD_MBFP_WRAPPER(16, 16, 12) + HIGHBD_MBFP_WRAPPER(8, 16, 12) + HIGHBD_MBFP_WRAPPER(16, 8, 12) + HIGHBD_MBFP_WRAPPER(8, 8, 12) + HIGHBD_MBFP_WRAPPER(4, 8, 12) + HIGHBD_MBFP_WRAPPER(8, 4, 12) + HIGHBD_MBFP_WRAPPER(4, 4, 12) +#if !CONFIG_REALTIME_ONLY + HIGHBD_MBFP_WRAPPER(64, 16, 12) + HIGHBD_MBFP_WRAPPER(16, 64, 12) + HIGHBD_MBFP_WRAPPER(32, 8, 12) + HIGHBD_MBFP_WRAPPER(8, 32, 12) + HIGHBD_MBFP_WRAPPER(16, 4, 12) + HIGHBD_MBFP_WRAPPER(4, 16, 12) +#endif + +// OBMC excluded from realtime only build. 
+#if !CONFIG_REALTIME_ONLY + HIGHBD_OBFP_WRAPPER(128, 128, 12) + HIGHBD_OBFP_WRAPPER(128, 64, 12) + HIGHBD_OBFP_WRAPPER(64, 128, 12) + HIGHBD_OBFP_WRAPPER(64, 64, 12) + HIGHBD_OBFP_WRAPPER(64, 32, 12) + HIGHBD_OBFP_WRAPPER(32, 64, 12) + HIGHBD_OBFP_WRAPPER(32, 32, 12) + HIGHBD_OBFP_WRAPPER(32, 16, 12) + HIGHBD_OBFP_WRAPPER(16, 32, 12) + HIGHBD_OBFP_WRAPPER(16, 16, 12) + HIGHBD_OBFP_WRAPPER(8, 16, 12) + HIGHBD_OBFP_WRAPPER(16, 8, 12) + HIGHBD_OBFP_WRAPPER(8, 8, 12) + HIGHBD_OBFP_WRAPPER(4, 8, 12) + HIGHBD_OBFP_WRAPPER(8, 4, 12) + HIGHBD_OBFP_WRAPPER(4, 4, 12) + HIGHBD_OBFP_WRAPPER(64, 16, 12) + HIGHBD_OBFP_WRAPPER(16, 64, 12) + HIGHBD_OBFP_WRAPPER(32, 8, 12) + HIGHBD_OBFP_WRAPPER(8, 32, 12) + HIGHBD_OBFP_WRAPPER(16, 4, 12) + HIGHBD_OBFP_WRAPPER(4, 16, 12) +#endif + + HIGHBD_SDSFP_WRAPPER(128, 128, 12) + HIGHBD_SDSFP_WRAPPER(128, 64, 12) + HIGHBD_SDSFP_WRAPPER(64, 128, 12) + HIGHBD_SDSFP_WRAPPER(64, 64, 12) + HIGHBD_SDSFP_WRAPPER(64, 32, 12) + HIGHBD_SDSFP_WRAPPER(32, 64, 12) + HIGHBD_SDSFP_WRAPPER(32, 32, 12) + HIGHBD_SDSFP_WRAPPER(32, 16, 12) + HIGHBD_SDSFP_WRAPPER(16, 32, 12) + HIGHBD_SDSFP_WRAPPER(16, 16, 12) + HIGHBD_SDSFP_WRAPPER(16, 8, 12) + HIGHBD_SDSFP_WRAPPER(8, 16, 12) + HIGHBD_SDSFP_WRAPPER(8, 8, 12) + HIGHBD_SDSFP_WRAPPER(4, 8, 12) + +#if !CONFIG_REALTIME_ONLY + HIGHBD_SDSFP_WRAPPER(64, 16, 12) + HIGHBD_SDSFP_WRAPPER(32, 8, 12) + HIGHBD_SDSFP_WRAPPER(16, 64, 12) + HIGHBD_SDSFP_WRAPPER(8, 32, 12) + HIGHBD_SDSFP_WRAPPER(4, 16, 12) +#endif + break; + + default: + assert(0 && + "cm->seq_params->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) { + FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs; + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(frame_probs->tx_type_probs, default_tx_type_probs); + } + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + av1_copy(frame_probs->obmc_probs, default_obmc_probs); + } + if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + av1_copy(frame_probs->warped_probs, default_warped_probs); + } + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + av1_copy(frame_probs->switchable_interp_probs, + default_switchable_interp_probs); + } + +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs; + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(temp_frame_probs->tx_type_probs, default_tx_type_probs); + } + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + av1_copy(temp_frame_probs->obmc_probs, default_obmc_probs); + } + if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + av1_copy(temp_frame_probs->warped_probs, default_warped_probs); + } + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + av1_copy(temp_frame_probs->switchable_interp_probs, + default_switchable_interp_probs); + } + + FrameProbInfo *const temp_frame_probs_simulation = + &cpi->ppi->temp_frame_probs_simulation; + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(temp_frame_probs_simulation->tx_type_probs, + default_tx_type_probs); + } + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + av1_copy(temp_frame_probs_simulation->obmc_probs, default_obmc_probs); + } + if 
(cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + av1_copy(temp_frame_probs_simulation->warped_probs, default_warped_probs); + } + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + av1_copy(temp_frame_probs_simulation->switchable_interp_probs, + default_switchable_interp_probs); + } + } +#endif +} + +static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst, + const CdefInfo *const src) { + dst->cdef_bits = src->cdef_bits; + dst->cdef_damping = src->cdef_damping; + av1_copy(dst->cdef_strengths, src->cdef_strengths); + av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths); + dst->nb_cdef_strengths = src->nb_cdef_strengths; +} + +// Coding context that only needs to be restored when recode loop includes +// filtering (deblocking, CDEF, superres post-encode upscale and/or loop +// restoraton). +static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + AV1_COMMON *cm = &cpi->common; + cm->lf = cc->lf; + restore_cdef_coding_context(&cm->cdef_info, &cc->cdef_info); + cpi->rc = cc->rc; + cpi->ppi->mv_stats = cc->mv_stats; +} + +static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width && + a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && + a->border == b->border && + (a->flags & YV12_FLAG_HIGHBITDEPTH) == + (b->flags & YV12_FLAG_HIGHBITDEPTH); +} + +static AOM_INLINE int update_entropy(bool *ext_refresh_frame_context, + bool *ext_refresh_frame_context_pending, + bool update) { + *ext_refresh_frame_context = update; + *ext_refresh_frame_context_pending = 1; + return 0; +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor, + double max_factor, + int prior_boost, + int tpl_boost, + int frames_to_key) { + double factor = sqrt((double)frames_to_key); + double range = max_factor - min_factor; + factor = AOMMIN(factor, max_factor); + factor = AOMMAX(factor, min_factor); + factor -= min_factor; + int boost = + (int)((factor * prior_boost + (range - factor) * tpl_boost) / range); + return boost; +} +#endif + +static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) { + int i; + AV1_COMMON *const cm = &cpi->common; + FeatureFlags *const features = &cm->features; + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + cm->global_motion[i] = default_warp_params; + } + cpi->gm_info.search_done = 0; + + av1_set_speed_features_framesize_independent(cpi, cpi->speed); + av1_set_rd_speed_thresholds(cpi); + features->interp_filter = SWITCHABLE; + features->switchable_motion_mode = is_switchable_motion_mode_allowed( + features->allow_warped_motion, cpi->oxcf.motion_mode_cfg.enable_obmc); +} + +static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) { + // Scaled references should only need to be released under certain conditions: + // if the reference will be updated, or if the scaled reference has same + // resolution. For now only apply this to Golden for non-svc RTC mode. + AV1_COMMON *const cm = &cpi->common; + const bool refresh_golden = (cpi->refresh_frame.golden_frame) ? 
1 : 0; + bool release_golden = true; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + RefCntBuffer *const buf = cpi->scaled_ref_buf[i]; + const int golden_ref = (i == GOLDEN_FRAME - 1); + if (golden_ref && is_one_pass_rt_params(cpi) && !cpi->ppi->use_svc && + buf != NULL) { + const RefCntBuffer *const ref = get_ref_frame_buf(cm, GOLDEN_FRAME); + const bool same_resoln = buf->buf.y_crop_width == ref->buf.y_crop_width && + buf->buf.y_crop_height == ref->buf.y_crop_height; + release_golden = refresh_golden || same_resoln; + } + if (buf != NULL && (!golden_ref || (golden_ref && release_golden))) { + --buf->ref_count; + cpi->scaled_ref_buf[i] = NULL; + } + } +} + +static AOM_INLINE void restore_all_coding_context(AV1_COMP *cpi) { + restore_extra_coding_context(cpi); + if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi); +} + +static AOM_INLINE int reduce_num_ref_buffers(const AV1_COMP *cpi) { + const SequenceHeader *const seq_params = cpi->common.seq_params; + return is_one_pass_rt_params(cpi) && + use_rtc_reference_structure_one_layer(cpi) && + (seq_params->order_hint_info.enable_order_hint == 0) && + cpi->rt_reduce_num_ref_buffers; +} + +// Refresh reference frame buffers according to refresh_frame_flags. +static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // All buffers are refreshed for shown keyframes and S-frames. + // In case of RT, golden frame refreshes the 6th slot and other reference + // frames refresh slots 0 to 5. Slot 7 is not refreshed by any reference + // frame. Thus, only 7 buffers are refreshed for keyframes and S-frames + // instead of 8. + int num_ref_buffers = REF_FRAMES; + if (reduce_num_ref_buffers(cpi)) { + const int refresh_all_bufs = + (cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET || + frame_is_sframe(cm)); + assert(IMPLIES(((cm->current_frame.refresh_frame_flags >> 7) & 1) == 1, + refresh_all_bufs)); + (void)refresh_all_bufs; + num_ref_buffers--; + } + + for (int ref_frame = 0; ref_frame < num_ref_buffers; ref_frame++) { + if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) { + assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame); + } + } +} + +void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf); +void av1_update_film_grain_parameters(struct AV1_COMP *cpi, + const AV1EncoderConfig *oxcf); + +void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, + const int phase, const int use_optimized_scaler); + +void av1_setup_frame(AV1_COMP *cpi); + +BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, + int height, int number_spatial_layers); + +void av1_apply_active_map(AV1_COMP *cpi); + +#if !CONFIG_REALTIME_ONLY +uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi); + +void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig); +#endif + +void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, + int *top_index); + +void av1_finalize_encoded_frame(AV1_COMP *const cpi); + +int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture, + const YV12_BUFFER_CONFIG *last_picture, + ForceIntegerMVInfo *const force_intpel_info); + +void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi); + +void av1_save_all_coding_context(AV1_COMP *cpi); + +#if DUMP_RECON_FRAMES == 1 +void av1_dump_filtered_recon_frames(AV1_COMP *cpi); +#endif + +static AOM_INLINE int av1_get_enc_border_size(bool resize, bool all_intra, + BLOCK_SIZE sb_size) { + // 
For allintra encoding mode, inter-frame motion search is not applicable and + // the intraBC motion vectors are restricted within the tile boundaries. Hence + // a smaller frame border size (AOM_ENC_ALLINTRA_BORDER) is used in this case. + if (resize) { + return AOM_BORDER_IN_PIXELS; + } + if (all_intra) { + return AOM_ENC_ALLINTRA_BORDER; + } + return block_size_wide[sb_size] + 32; +} + +static AOM_INLINE bool av1_is_resize_needed(const AV1EncoderConfig *oxcf) { + const ResizeCfg *resize_cfg = &oxcf->resize_cfg; + const SuperResCfg *superres_cfg = &oxcf->superres_cfg; + return resize_cfg->resize_mode || superres_cfg->superres_mode; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODER_UTILS_H_ diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c new file mode 100644 index 0000000000..5fe2a497c7 --- /dev/null +++ b/third_party/aom/av1/encoder/encodetxb.c @@ -0,0 +1,886 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/encodetxb.h" + +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/common/idct.h" +#include "av1/common/pred_common.h" +#include "av1/common/scan.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/hash.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/tokenize.h" + +void av1_alloc_txb_buf(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool; + const int num_sb_rows = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + const int num_sb_cols = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); + const int size = num_sb_rows * num_sb_cols; + const int num_planes = av1_num_planes(cm); + const int subsampling_x = cm->seq_params->subsampling_x; + const int subsampling_y = cm->seq_params->subsampling_y; + const int luma_max_sb_square = + 1 << num_pels_log2_lookup[cm->seq_params->sb_size]; + const int chroma_max_sb_square = + luma_max_sb_square >> (subsampling_x + subsampling_y); + const int num_tcoeffs = + size * (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square); + const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN; + + av1_free_txb_buf(cpi); + // TODO(jingning): This should be further reduced. 
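+  // For example, with 128x128 superblocks, three planes and 4:2:0
+  // subsampling:
+  //   luma_max_sb_square   = 128 * 128 = 16384
+  //   chroma_max_sb_square = 16384 >> (1 + 1) = 4096
+  //   num_tcoeffs          = size * (16384 + 2 * 4096) = size * 24576
+  // with one eobs/entropy_ctx entry kept per txb_unit_size coefficients.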
+ CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base, + aom_malloc(sizeof(*cpi->coeff_buffer_base) * size)); + CHECK_MEM_ERROR( + cm, coeff_buf_pool->tcoeff, + aom_memalign(32, sizeof(*coeff_buf_pool->tcoeff) * num_tcoeffs)); + CHECK_MEM_ERROR( + cm, coeff_buf_pool->eobs, + aom_malloc(sizeof(*coeff_buf_pool->eobs) * num_tcoeffs / txb_unit_size)); + CHECK_MEM_ERROR(cm, coeff_buf_pool->entropy_ctx, + aom_malloc(sizeof(*coeff_buf_pool->entropy_ctx) * + num_tcoeffs / txb_unit_size)); + + tran_low_t *tcoeff_ptr = coeff_buf_pool->tcoeff; + uint16_t *eob_ptr = coeff_buf_pool->eobs; + uint8_t *entropy_ctx_ptr = coeff_buf_pool->entropy_ctx; + for (int i = 0; i < size; i++) { + for (int plane = 0; plane < num_planes; plane++) { + const int max_sb_square = + (plane == AOM_PLANE_Y) ? luma_max_sb_square : chroma_max_sb_square; + cpi->coeff_buffer_base[i].tcoeff[plane] = tcoeff_ptr; + cpi->coeff_buffer_base[i].eobs[plane] = eob_ptr; + cpi->coeff_buffer_base[i].entropy_ctx[plane] = entropy_ctx_ptr; + tcoeff_ptr += max_sb_square; + eob_ptr += max_sb_square / txb_unit_size; + entropy_ctx_ptr += max_sb_square / txb_unit_size; + } + } +} + +void av1_free_txb_buf(AV1_COMP *cpi) { + CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool; + aom_free(cpi->coeff_buffer_base); + cpi->coeff_buffer_base = NULL; + aom_free(coeff_buf_pool->tcoeff); + coeff_buf_pool->tcoeff = NULL; + aom_free(coeff_buf_pool->eobs); + coeff_buf_pool->eobs = NULL; + aom_free(coeff_buf_pool->entropy_ctx); + coeff_buf_pool->entropy_ctx = NULL; +} + +static void write_golomb(aom_writer *w, int level) { + int x = level + 1; + int i = x; + int length = 0; + + while (i) { + i >>= 1; + ++length; + } + assert(length > 0); + + for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0); + + for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01); +} + +static const int8_t eob_to_pos_small[33] = { + 0, 1, 2, // 0-2 + 3, 3, // 3-4 + 4, 4, 4, 4, // 5-8 + 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32 +}; + +static const int8_t eob_to_pos_large[17] = { + 6, // place holder + 7, // 33-64 + 8, 8, // 65-128 + 9, 9, 9, 9, // 129-256 + 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 + 11 // 513- +}; + +int av1_get_eob_pos_token(const int eob, int *const extra) { + int t; + + if (eob < 33) { + t = eob_to_pos_small[eob]; + } else { + const int e = AOMMIN((eob - 1) >> 5, 16); + t = eob_to_pos_large[e]; + } + + *extra = eob - av1_eob_group_start[t]; + + return t; +} + +#if CONFIG_ENTROPY_STATS +void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, PLANE_TYPE plane, + FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { +#else +void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, + PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, + uint8_t allow_update_cdf) { +#endif + int eob_extra; + const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); + TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; + + switch (eob_multi_size) { + case 0: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5); + break; + case 1: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6); + break; + case 2: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7); + break; + case 3: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1, + 8); + } + break; + case 4: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1, + 9); + } + break; + case 5: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1, + 10); + } + break; + case 6: + default: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1, + 11); + } + break; + } + + if (av1_eob_offset_bits[eob_pt] > 0) { + int eob_ctx = eob_pt - 3; + int eob_shift = av1_eob_offset_bits[eob_pt] - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 
1 : 0; +#if CONFIG_ENTROPY_STATS + counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2); + } +} + +static INLINE int get_nz_map_ctx(const uint8_t *const levels, + const int coeff_idx, const int bhl, + const int width, const int scan_idx, + const int is_eob, const TX_SIZE tx_size, + const TX_CLASS tx_class) { + if (is_eob) { + if (scan_idx == 0) return 0; + if (scan_idx <= (width << bhl) / 8) return 1; + if (scan_idx <= (width << bhl) / 4) return 2; + return 3; + } + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class); +} + +void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = height + TX_PAD_HOR; + uint8_t *ls = levels; + + memset(levels + stride * width, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); + + for (int i = 0; i < width; i++) { + for (int j = 0; j < height; j++) { + *ls++ = (uint8_t)clamp(abs(coeff[i * height + j]), 0, INT8_MAX); + } + for (int j = 0; j < TX_PAD_HOR; j++) { + *ls++ = 0; + } + } +} + +void av1_get_nz_map_contexts_c(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int bhl = get_txb_bhl(tx_size); + const int width = get_txb_wide(tx_size); + for (int i = 0; i < eob; ++i) { + const int pos = scan[i]; + coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bhl, width, i, + i == eob - 1, tx_size, tx_class); + } +} + +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x, + aom_writer *w, int blk_row, int blk_col, int plane, + int block, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const PLANE_TYPE plane_type = get_plane_type(plane); + const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] / + (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + const uint16_t eob = eob_txb[block]; + const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2); + if (eob == 0) return; + + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + // Only y plane's tx_type is transmitted + if (plane == 0) { + av1_write_tx_type(cm, xd, tx_type, tx_size, w); + } + + int eob_extra; + const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; + switch (eob_multi_size) { + case 0: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5); + break; + case 1: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6); + break; + case 2: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7); + break; + case 3: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8); + break; + case 4: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9); + break; + case 5: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10); + break; + default: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11); + break; + } + + const int eob_offset_bits = av1_eob_offset_bits[eob_pt]; + if (eob_offset_bits > 0) { + const int eob_ctx = eob_pt - 3; + int eob_shift = eob_offset_bits - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_symbol(w, bit, + ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2); + for (int i = 1; i < eob_offset_bits; i++) { + eob_shift = eob_offset_bits - 1 - i; + bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_bit(w, bit); + } + } + + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + const tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type]; + const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block); + av1_txb_init_levels(tcoeff, width, height, levels); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int bhl = get_txb_bhl(tx_size); + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = tcoeff[pos]; + const tran_low_t level = abs(v); + + if (c == eob - 1) { + aom_write_symbol( + w, AOMMIN(level, 3) - 1, + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3); + } else { + aom_write_symbol(w, AOMMIN(level, 3), + ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx], + 4); + } + if (level > NUM_BASE_LEVELS) { + // level is above 1. + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); + aom_cdf_prob *cdf = + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + aom_write_symbol(w, k, cdf, BR_CDF_SIZE); + if (k < BR_CDF_SIZE - 1) break; + } + } + } + + // Loop to code all signs in the transform block, + // starting with the sign of DC (if applicable) + for (int c = 0; c < eob; ++c) { + const tran_low_t v = tcoeff[scan[c]]; + const tran_low_t level = abs(v); + const int sign = (v < 0) ? 
1 : 0; + if (level) { + if (c == 0) { + const int dc_sign_ctx = + (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK; + aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], + 2); + } else { + aom_write_bit(w, sign); + } + if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS) + write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); + } + } +} + +void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, + aom_writer *w, BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + const int num_planes = av1_num_planes(cm); + int block[MAX_MB_PLANE] = { 0 }; + int row, col; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int step = stepr * stepc; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); + for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += stepr) { + for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += stepc) { + av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, + block[plane], tx_size); + block[plane] += step; + } + } + } + } + } +} + +uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff, + const SCAN_ORDER *scan_order, int eob) { + const int16_t *const scan = scan_order->scan; + int cul_level = 0; + int c; + + if (eob == 0) return 0; + for (c = 0; c < eob; ++c) { + cul_level += abs(qcoeff[scan[c]]); + if (cul_level > COEFF_CONTEXT_MASK) break; + } + + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + set_dc_sign(&cul_level, qcoeff[0]); + + return (uint8_t)cul_level; +} + +static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm, + MACROBLOCKD *xd, int blk_row, int blk_col, + int plane, TX_SIZE tx_size, + FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { + MB_MODE_INFO *mbmi = xd->mi[0]; + int is_inter = is_inter_block(mbmi); + const int reduced_tx_set_used = cm->features.reduced_tx_set_used; + FRAME_CONTEXT *fc = xd->tile_ctx; +#if !CONFIG_ENTROPY_STATS + (void)counts; +#endif // !CONFIG_ENTROPY_STATS + + // Only y plane's tx_type is updated + if (plane > 0) return; + const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col, + tx_size, reduced_tx_set_used); + if (is_inter) { + if (cpi->oxcf.txfm_cfg.use_inter_dct_only) { + assert(tx_type == DCT_DCT); + } + } else { + if (cpi->oxcf.txfm_cfg.use_intra_dct_only) { + assert(tx_type == DCT_DCT); + } else if (cpi->oxcf.txfm_cfg.use_intra_default_tx_only) { + const TX_TYPE default_type = get_default_tx_type( + PLANE_TYPE_Y, xd, 
tx_size, cpi->use_screen_content_tools); + (void)default_type; + // TODO(kyslov): We don't always respect use_intra_default_tx_only flag in + // NonRD and REALTIME case. Specifically we ignore it in hybrid inta mode + // search, when picking up intra mode in nonRD inter mode search and in RD + // REALTIME mode when we limit TX type usage. + // We need to fix txfm cfg for these cases. Meanwhile relieving the + // assert. + assert(tx_type == default_type || cpi->sf.rt_sf.use_nonrd_pick_mode || + cpi->oxcf.mode == REALTIME); + } + } + + if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 && + cm->quant_params.base_qindex > 0 && !mbmi->skip_txfm && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used); + if (eset > 0) { + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used); + if (is_inter) { + if (allow_update_cdf) { + update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); + } +#if CONFIG_ENTROPY_STATS + ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; +#if CONFIG_ENTROPY_STATS + ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf( + fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); + } + } + } + } +} + +void av1_update_and_record_txb_context(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct tokenize_b_args *const args = arg; + const AV1_COMP *cpi = args->cpi; + const AV1_COMMON *cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int eob = p->eobs[block]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + const PLANE_TYPE plane_type = pd->plane_type; + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + tran_low_t *tcoeff; + assert(args->dry_run != DRY_RUN_COSTCOEFFS); + if (args->dry_run == OUTPUT_ENABLED) { + MB_MODE_INFO *mbmi = xd->mi[0]; + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, + pd->above_entropy_context + blk_col, + pd->left_entropy_context + blk_row, &txb_ctx); + const int bhl = get_txb_bhl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const uint8_t allow_update_cdf = args->allow_update_cdf; + const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#if CONFIG_ENTROPY_STATS + int cdf_idx = cm->coef_cdf_category; + ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], + eob == 
0, 2); + } + + CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] / + (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + entropy_ctx[block] = txb_ctx.txb_skip_ctx; + eob_txb[block] = eob; + + if (eob == 0) { + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, + blk_row); + return; + } + const int segment_id = mbmi->segment_id; + const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); + tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type]; + tcoeff = tcoeff_txb + block_offset; + memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); + + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + av1_txb_init_levels(tcoeff, width, height, levels); + update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size, + td->counts, allow_update_cdf); + + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int16_t *const scan = scan_order->scan; + + // record tx type usage + td->rd_counts.tx_type_used[tx_size][tx_type]++; + +#if CONFIG_ENTROPY_STATS + av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, allow_update_cdf); +#else + av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, + allow_update_cdf); +#endif + + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, + coeff_contexts); + + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const tran_low_t level = abs(v); + /* abs_sum_level is needed to decide the job scheduling order of + * pack bitstream multi-threading. This data is not needed if + * multi-threading is disabled. 
*/ + if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level; + + if (allow_update_cdf) { + if (c == eob - 1) { + assert(coeff_ctx < 4); + update_cdf( + ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3) - 1, 3); + } else { + update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3), 4); + } + } + if (c == eob - 1) { + assert(coeff_ctx < 4); +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3) - 1]; + } else { + ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3)]; +#endif + } + if (level > NUM_BASE_LEVELS) { + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + if (allow_update_cdf) { + update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx], + k, BR_CDF_SIZE); + } + for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type] + [lps][br_ctx][lps == k]; +#endif // CONFIG_ENTROPY_STATS + if (lps == k) break; + } +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx][k]; +#endif + if (k < BR_CDF_SIZE - 1) break; + } + } + } + // Update the context needed to code the DC sign (if applicable) + if (tcoeff[0] != 0) { + const int dc_sign = (tcoeff[0] < 0) ? 1 : 0; + const int dc_sign_ctx = txb_ctx.dc_sign_ctx; +#if CONFIG_ENTROPY_STATS + ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2); + entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT; + } + } else { + tcoeff = qcoeff; + } + const uint8_t cul_level = + av1_get_txb_entropy_context(tcoeff, scan_order, eob); + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, + blk_col, blk_row); +} + +void av1_record_txb_context(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct tokenize_b_args *const args = arg; + const AV1_COMP *cpi = args->cpi; + const AV1_COMMON *cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int eob = p->eobs[block]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + const PLANE_TYPE plane_type = pd->plane_type; + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + tran_low_t *tcoeff; + assert(args->dry_run != DRY_RUN_COSTCOEFFS); + if (args->dry_run == OUTPUT_ENABLED) { + MB_MODE_INFO *mbmi = xd->mi[0]; + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, + pd->above_entropy_context + blk_col, + pd->left_entropy_context + blk_row, &txb_ctx); +#if CONFIG_ENTROPY_STATS + const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size); + const int bhl = get_txb_bhl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + int cdf_idx = cm->coef_cdf_category; + 
++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; +#endif // CONFIG_ENTROPY_STATS + + CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] / + (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + entropy_ctx[block] = txb_ctx.txb_skip_ctx; + eob_txb[block] = eob; + + if (eob == 0) { + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, + blk_row); + return; + } + const int segment_id = mbmi->segment_id; + const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); + tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type]; + tcoeff = tcoeff_txb + block_offset; + memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); + +#if CONFIG_ENTROPY_STATS + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + av1_txb_init_levels(tcoeff, width, height, levels); + update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size, + td->counts, 0 /*allow_update_cdf*/); + + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const bool do_coeff_scan = true; +#else + const bool do_coeff_scan = cpi->mt_info.pack_bs_mt_enabled; +#endif + const int16_t *const scan = scan_order->scan; + + // record tx type usage + td->rd_counts.tx_type_used[tx_size][tx_type]++; + +#if CONFIG_ENTROPY_STATS + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, 0 /*allow_update_cdf*/); + + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, + coeff_contexts); +#endif + + for (int c = eob - 1; (c >= 0) && do_coeff_scan; --c) { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const tran_low_t level = abs(v); + /* abs_sum_level is needed to decide the job scheduling order of + * pack bitstream multi-threading. This data is not needed if + * multi-threading is disabled. */ + if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level; + +#if CONFIG_ENTROPY_STATS + const int coeff_ctx = coeff_contexts[pos]; + if (c == eob - 1) { + assert(coeff_ctx < 4); + ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3) - 1]; + } else { + ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3)]; + } + if (level > NUM_BASE_LEVELS) { + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { + ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type] + [lps][br_ctx][lps == k]; + if (lps == k) break; + } + ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx][k]; + if (k < BR_CDF_SIZE - 1) break; + } + } +#endif + } + // Update the context needed to code the DC sign (if applicable) + if (tcoeff[0] != 0) { + const int dc_sign_ctx = txb_ctx.dc_sign_ctx; +#if CONFIG_ENTROPY_STATS + const int dc_sign = (tcoeff[0] < 0) ? 
1 : 0; + ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; +#endif // CONFIG_ENTROPY_STATS + entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT; + } + } else { + tcoeff = qcoeff; + } + const uint8_t cul_level = + av1_get_txb_entropy_context(tcoeff, scan_order, eob); + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, + blk_col, blk_row); +} + +void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + uint8_t allow_update_cdf) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run }; + if (mbmi->skip_txfm) { + av1_reset_entropy_context(xd, bsize, num_planes); + return; + } + const foreach_transformed_block_visitor visit = + allow_update_cdf ? av1_update_and_record_txb_context + : av1_record_txb_context; + + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, visit, &arg); + } +} + +CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, + int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + const int mib_size_log2 = cm->seq_params->mib_size_log2; + const int stride = + CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); + const int offset = + (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); + return cpi->coeff_buffer_base + offset; +} diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h new file mode 100644 index 0000000000..67b94046b4 --- /dev/null +++ b/third_party/aom/av1/encoder/encodetxb.h @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODETXB_H_ +#define AOM_AV1_ENCODER_ENCODETXB_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "aom_dsp/bitwriter.h" +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ +#define TXB_SKIP_CTX_MASK 15 +#define DC_SIGN_CTX_SHIFT 4 +#define DC_SIGN_CTX_MASK 3 + +int av1_get_eob_pos_token(const int eob, int *const extra); + +/*!\endcond */ +/*!\brief Allocate the memory resources for all the macro blocks in the current + * coding frame. + * \ingroup coefficient_coding + * + * Each macro block will need a \ref CB_COEFF_BUFFER to store information for + * rate-distortion optimization and entropy coding of transform coefficients. 
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_alloc_txb_buf(AV1_COMP *cpi);
+/*!\brief Free the memory resources for all the macro blocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * See \ref av1_alloc_txb_buf and \ref CB_COEFF_BUFFER for more details.
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_free_txb_buf(AV1_COMP *cpi);
+
+/*!\brief Write quantized coefficients in a transform block into the bitstream
+ * using entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will write the quantized coefficients in a transform block
+ * into the bitstream using entropy coding.
+ *
+ * The coding steps are as follows.
+ *
+ * 1) Code the end of block position "eob", which is the scan index of the
+ * last non-zero coefficient plus one.
+ *
+ * 2) Code the lower magnitude level (<= COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ * for each coefficient in reversed scan order.
+ *
+ * 3) Code the sign and higher magnitude level
+ * (> COEFF_BASE_RANGE + NUM_BASE_LEVELS) in forward scan order.
+ *
+ * \param[in] cm Top-level structure shared by encoder and
+ * decoder
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] w Entropy coding write pointer
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in] tx_size The given transform size
+ */
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ int block, TX_SIZE tx_size);
+
+/*!\brief Write quantized coefficients of all transform blocks in an intra
+ * macroblock into the bitstream using entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * All transform blocks in the intra macroblock share the same transform size.
+ *
+ * This function uses \ref av1_write_coeffs_txb() to code each transform block
+ * in raster order.
+ *
+ * \param[in] cm Top-level structure shared by encoder and
+ * decoder
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] w Entropy coding write pointer
+ * \param[in] bsize Block size of the current macroblock
+ */
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, BLOCK_SIZE bsize);
+
+/*!\brief Pack the context info of the current transform block into a uint8_t.
+ * \ingroup coefficient_coding
+ *
+ * This context info will be collected and consolidated by its neighbor
+ * transform blocks for coding transform block skip flag (tx_skip) and
+ * the sign of DC coefficient (dc_sign).
+ *
+ * \param[in] qcoeff Buffer of quantized coefficients
+ * \param[in] scan_order Coding order of coefficients in the transform
+ * block
+ * \param[in] eob The scan index of last non-zero coefficient plus
+ * one
+ */
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for all transform blocks in the intra macroblock.
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for all transform blocks in the intra macroblock.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will go through each transform block in the intra macroblock
+ * and call \ref av1_update_and_record_txb_context to update the probability
+ * model and entropy context properly.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] td Top-level multithreading structure
+ * \param[in] dry_run Whether this is a dry run.
+ * \param[in] bsize Block size of the current macroblock
+ * \param[in] allow_update_cdf Allowed to update probability model (cdf) or
+ * not.
+ */
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ uint8_t allow_update_cdf);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function has a regular mode and a dry run.
+ *
+ * Regular mode:
+ *
+ * The probability model (cdf) for each coding symbol in the
+ * transform block will be updated.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane.
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane.
+ * \param[in] plane_bsize Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of the UV planes will be smaller
+ * than the block size of the Y plane.
+ * \param[in] tx_size The given transform size.
+ * \param[in] arg This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
+/*!\brief Update the entropy context related to coefficient coding for a
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function has a regular mode and a dry run.
+ *
+ * Regular mode:
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane.
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in the y plane.
+ * \param[in] plane_bsize Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of the UV planes will be smaller
+ * than the block size of the Y plane.
+ * \param[in] tx_size The given transform size.
+ * \param[in] arg This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */ +void av1_record_txb_context(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); + +/*!\brief Get the corresponding \ref CB_COEFF_BUFFER of the current macro block. + * + * \ingroup coefficient_coding + * + * The macroblock's location is described by mi_row and mi_col, row and column + * mi indexes in the coding frame. + * + * Each mi unit is a 4x4 pixel block. + * + * \param[in] cpi Top-level encoder structure. + * \param[in] mi_row Row mi index of the current transform block + * in the frame. + * \param[in] mi_col Column mi index of the current transform + * block in the frame. + * \return CB_COEFF_BUFFER* Pointer of \ref CB_COEFF_BUFFER associated + * to this macroblock. + */ +CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, + int mi_col); + +/*!\brief Returns the entropy cost associated with skipping the current + * transform block. + * + * \ingroup coefficient_coding + * + * \param[in] coeff_costs Table of entropy cost for coefficient coding. + * \param[in] txb_ctx Context info for entropy coding transform block + * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). + * \param[in] plane The index of the current plane + * \param[in] tx_size The transform size + */ +static INLINE int av1_cost_skip_txb(const CoeffCosts *coeff_costs, + const TXB_CTX *const txb_ctx, int plane, + TX_SIZE tx_size) { + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs_ = + &coeff_costs->coeff_costs[txs_ctx][plane_type]; + return coeff_costs_->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; +} + +/*!\cond */ +// These numbers are empirically obtained. +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { + { 17, 13 }, + { 16, 10 }, +}; +/*!\endcond */ + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_ENCODETXB_H_ diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c new file mode 100644 index 0000000000..d6a806d504 --- /dev/null +++ b/third_party/aom/av1/encoder/ethread.c @@ -0,0 +1,3469 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "av1/common/warped_motion.h" +#include "av1/common/thread_common.h" + +#include "av1/encoder/allintra_vis.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/ethread.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/firstpass.h" +#endif +#include "av1/encoder/global_motion.h" +#include "av1/encoder/global_motion_facade.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/rdopt.h" +#include "aom_dsp/aom_dsp_common.h" +#include "av1/encoder/temporal_filter.h" +#include "av1/encoder/tpl_model.h" + +static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { + td->rd_counts.compound_ref_used_flag |= + td_t->rd_counts.compound_ref_used_flag; + td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; + + for (int i = 0; i < TX_SIZES_ALL; i++) { + for (int j = 0; j < TX_TYPES; j++) + td->rd_counts.tx_type_used[i][j] += td_t->rd_counts.tx_type_used[i][j]; + } + + for (int i = 0; i < BLOCK_SIZES_ALL; i++) { + for (int j = 0; j < 2; j++) { + td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j]; + } + } + + for (int i = 0; i < 2; i++) { + td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i]; + } + + td->rd_counts.seg_tmp_pred_cost[0] += td_t->rd_counts.seg_tmp_pred_cost[0]; + td->rd_counts.seg_tmp_pred_cost[1] += td_t->rd_counts.seg_tmp_pred_cost[1]; + + td->rd_counts.newmv_or_intra_blocks += td_t->rd_counts.newmv_or_intra_blocks; +} + +static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + const int mib_size = cm->seq_params->mib_size; + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int row = 0; row < cm->tiles.rows; row++) { + for (int col = 0; col < cm->tiles.cols; col++) { + TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col]; + const TileInfo *const tile_info = &tile_data->tile_info; + for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += mib_size) { + if (mi_row == tile_info->mi_row_start) + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + for (int mi_col = tile_info->mi_col_start; + mi_col < tile_info->mi_col_end; mi_col += mib_size) { + const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str; + MB_MODE_INFO *mbmi = mi[0]; + if (mbmi->skip_txfm == 1 && + (mbmi->bsize == cm->seq_params->sb_size)) { + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; + } else { + if (cm->delta_q_info.delta_lf_multi) { + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } else { + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + } + } + } +} + +void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, + int c) { + (void)row_mt_sync; + (void)r; + (void)c; +} + +void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, + int c, int cols) { + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +} + +void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + + if (r) { + pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > row_mt_sync->num_finished_cols[r - 1] - nsync - + row_mt_sync->intrabc_extra_top_right_sb_delay) { + pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c, + int cols) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. + int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = cols + nsync + row_mt_sync->intrabc_extra_top_right_sb_delay; + } + + if (sig) { + pthread_mutex_lock(&row_mt_sync->mutex_[r]); + + // When a thread encounters an error, num_finished_cols[r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // num_finished_cols[r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. 
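The predicate that av1_row_mt_sync_read() spins on, and the clamped update just described, reduce to simple arithmetic. The sketch below is a single-threaded, self-contained illustration (hypothetical values, no mutexes) of when a worker on row r may start a given superblock column and why raising num_finished_cols releases every stalled worker.

```c
#include <stdbool.h>
#include <stdio.h>

/* Mirrors the wait predicate in av1_row_mt_sync_read(): the worker on row r
 * may process column c once row r - 1 has signalled progress at least
 * c + nsync + extra_delay columns along. */
static bool may_start(int c, int finished_cols_above, int nsync,
                      int extra_delay) {
  return c <= finished_cols_above - nsync - extra_delay;
}

int main(void) {
  const int nsync = 1;        // sync_range, as set by row_mt_sync_mem_alloc()
  const int extra_delay = 0;  // no intra block copy delay in this example

  /* The row above has signalled completion of its column index 4. */
  const int finished_above = 4;
  for (int c = 0; c <= 5; c++) {
    printf("column %d: %s\n", c,
           may_start(c, finished_above, nsync, extra_delay) ? "start" : "wait");
  }
  /* If a thread fails, the error path raises num_finished_cols for the
   * affected rows to its maximum, so every may_start() check that was false
   * becomes true and the stalled workers drain out instead of waiting
   * forever. */
  return 0;
}
```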
+ row_mt_sync->num_finished_cols[r] = + AOMMAX(row_mt_sync->num_finished_cols[r], cur); + + pthread_cond_signal(&row_mt_sync->cond_[r]); + pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +// Allocate memory for row synchronization +static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync, + AV1_COMMON *cm, int rows) { +#if CONFIG_MULTITHREAD + int i; + + CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, + aom_malloc(sizeof(*row_mt_sync->mutex_) * rows)); + if (row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, row_mt_sync->cond_, + aom_malloc(sizeof(*row_mt_sync->cond_) * rows)); + if (row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&row_mt_sync->cond_[i], NULL); + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols, + aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows)); + + row_mt_sync->rows = rows; + // Set up nsync. + row_mt_sync->sync_range = 1; +} + +// Deallocate row based multi-threading synchronization related mutex and data +void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) { + if (row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (row_mt_sync->mutex_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + } + aom_free(row_mt_sync->mutex_); + } + if (row_mt_sync->cond_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_cond_destroy(&row_mt_sync->cond_[i]); + } + aom_free(row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + aom_free(row_mt_sync->num_finished_cols); + + // clear the structure as the source of this call may be dynamic change + // in tiles in which case this call will be followed by an _alloc() + // which may fail. 
+ av1_zero(*row_mt_sync); + } +} + +static AOM_INLINE int get_sb_rows_in_frame(AV1_COMMON *cm) { + return CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, + cm->seq_params->mib_size_log2); +} + +static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols, + int alloc_row_ctx) { + struct AV1Common *cm = &cpi->common; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + + av1_row_mt_mem_dealloc(cpi); + + // Allocate memory for row based multi-threading + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + + row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows); + + if (alloc_row_ctx) { + assert(max_cols > 0); + const int num_row_ctx = AOMMAX(1, (max_cols - 1)); + CHECK_MEM_ERROR(cm, this_tile->row_ctx, + (FRAME_CONTEXT *)aom_memalign( + 16, num_row_ctx * sizeof(*this_tile->row_ctx))); + } + } + } + const int sb_rows = get_sb_rows_in_frame(cm); + CHECK_MEM_ERROR( + cm, enc_row_mt->num_tile_cols_done, + aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows)); + + enc_row_mt->allocated_rows = max_rows; + enc_row_mt->allocated_cols = max_cols - 1; + enc_row_mt->allocated_sb_rows = sb_rows; +} + +void av1_row_mt_mem_dealloc(AV1_COMP *cpi) { + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int tile_cols = enc_row_mt->allocated_tile_cols; + const int tile_rows = enc_row_mt->allocated_tile_rows; + int tile_col, tile_row; + + // Free row based multi-threading sync memory + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + + av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + + if (cpi->oxcf.algo_cfg.cdf_update_mode) { + aom_free(this_tile->row_ctx); + this_tile->row_ctx = NULL; + } + } + } + aom_free(enc_row_mt->num_tile_cols_done); + enc_row_mt->num_tile_cols_done = NULL; + enc_row_mt->allocated_rows = 0; + enc_row_mt->allocated_cols = 0; + enc_row_mt->allocated_sb_rows = 0; +} + +static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id, + int num_tiles, int num_workers) { + int tile_id = 0; + int i; + + for (i = 0; i < num_workers; i++) { + thread_id_to_tile_id[i] = tile_id++; + if (tile_id == num_tiles) tile_id = 0; + } +} + +static AOM_INLINE int get_next_job(TileDataEnc *const tile_data, + int *current_mi_row, int mib_size) { + AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; + const int mi_row_end = tile_data->tile_info.mi_row_end; + + if (row_mt_sync->next_mi_row < mi_row_end) { + *current_mi_row = row_mt_sync->next_mi_row; + row_mt_sync->num_threads_working++; + row_mt_sync->next_mi_row += mib_size; + return 1; + } + return 0; +} + +static AOM_INLINE void switch_tile_and_get_next_job( + AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id, + int *current_mi_row, int *end_of_frame, int is_firstpass, + const BLOCK_SIZE fp_block_size) { + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + int tile_id = -1; // Stores the tile ID with minimum proc done + int max_mis_to_encode = 0; + int min_num_threads_working = INT_MAX; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for 
(int tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &tile_data[tile_index]; + AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; + +#if CONFIG_REALTIME_ONLY + int num_b_rows_in_tile = + av1_get_sb_rows_in_tile(cm, &this_tile->tile_info); + int num_b_cols_in_tile = + av1_get_sb_cols_in_tile(cm, &this_tile->tile_info); +#else + int num_b_rows_in_tile = + is_firstpass + ? av1_get_unit_rows_in_tile(&this_tile->tile_info, fp_block_size) + : av1_get_sb_rows_in_tile(cm, &this_tile->tile_info); + int num_b_cols_in_tile = + is_firstpass + ? av1_get_unit_cols_in_tile(&this_tile->tile_info, fp_block_size) + : av1_get_sb_cols_in_tile(cm, &this_tile->tile_info); +#endif + int theoretical_limit_on_threads = + AOMMIN((num_b_cols_in_tile + 1) >> 1, num_b_rows_in_tile); + int num_threads_working = row_mt_sync->num_threads_working; + + if (num_threads_working < theoretical_limit_on_threads) { + int num_mis_to_encode = + this_tile->tile_info.mi_row_end - row_mt_sync->next_mi_row; + + // Tile to be processed by this thread is selected on the basis of + // availability of jobs: + // 1) If jobs are available, tile to be processed is chosen on the + // basis of minimum number of threads working for that tile. If two or + // more tiles have same number of threads working for them, then the + // tile with maximum number of jobs available will be chosen. + // 2) If no jobs are available, then end_of_frame is reached. + if (num_mis_to_encode > 0) { + if (num_threads_working < min_num_threads_working) { + min_num_threads_working = num_threads_working; + max_mis_to_encode = 0; + } + if (num_threads_working == min_num_threads_working && + num_mis_to_encode > max_mis_to_encode) { + tile_id = tile_index; + max_mis_to_encode = num_mis_to_encode; + } + } + } + } + } + if (tile_id == -1) { + *end_of_frame = 1; + } else { + // Update the current tile id to the tile id that will be processed next, + // which will be the least processed tile. + *cur_tile_id = tile_id; + const int unit_height = mi_size_high[fp_block_size]; + get_next_job(&tile_data[tile_id], current_mi_row, + is_firstpass ? unit_height : cm->seq_params->mib_size); + } +} + +#if !CONFIG_REALTIME_ONLY +static void set_firstpass_encode_done(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const BLOCK_SIZE fp_block_size = cpi->fp_block_size; + const int unit_height = mi_size_high[fp_block_size]; + + // In case of multithreading of firstpass encode, due to top-right + // dependency, the worker on a firstpass row waits for the completion of the + // firstpass processing of the top and top-right fp_blocks. Hence, in case a + // thread (main/worker) encounters an error, update the firstpass processing + // of every row in the frame to indicate that it is complete in order to avoid + // dependent workers waiting indefinitely. 
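The tile selection rule implemented in switch_tile_and_get_next_job() above can be restated compactly: skip tiles that are already saturated by the top-right dependency limit or out of jobs, prefer the tile with the fewest active threads, and break ties by the largest amount of remaining work. The sketch below applies that rule to a toy set of tiles; the struct and field names are simplified stand-ins, not the encoder's types.

```c
#include <limits.h>
#include <stdio.h>

/* Simplified stand-in for the per-tile state consulted during selection. */
typedef struct {
  int sb_rows, sb_cols;   // tile size in superblocks
  int threads_working;    // threads currently assigned to this tile
  int mis_to_encode;      // remaining work, in mi rows
} ToyTile;

static int pick_tile(const ToyTile *tiles, int n) {
  int best = -1, best_threads = INT_MAX, best_remaining = 0;
  for (int i = 0; i < n; i++) {
    // Top-right dependency caps useful threads at min(ceil(cols / 2), rows).
    const int half_cols = (tiles[i].sb_cols + 1) >> 1;
    const int limit =
        half_cols < tiles[i].sb_rows ? half_cols : tiles[i].sb_rows;
    if (tiles[i].threads_working >= limit) continue;
    if (tiles[i].mis_to_encode <= 0) continue;  // no jobs left in this tile
    // Prefer the fewest active threads; break ties by the most remaining work.
    if (tiles[i].threads_working < best_threads) {
      best_threads = tiles[i].threads_working;
      best_remaining = 0;
    }
    if (tiles[i].threads_working == best_threads &&
        tiles[i].mis_to_encode > best_remaining) {
      best = i;
      best_remaining = tiles[i].mis_to_encode;
    }
  }
  return best;  // -1 signals end_of_frame
}

int main(void) {
  const ToyTile tiles[3] = { { 8, 16, 2, 64 }, { 8, 16, 1, 32 },
                             { 8, 16, 1, 96 } };
  // Tile 2 wins: it ties tile 1 on thread count but has more work remaining.
  printf("next tile: %d\n", pick_tile(tiles, 3));
  return 0;
}
```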
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *tile = &tile_data->tile_info; + AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; + const int unit_cols_in_tile = + av1_get_unit_cols_in_tile(tile, fp_block_size); + for (int mi_row = tile->mi_row_start, unit_row_in_tile = 0; + mi_row < tile->mi_row_end; + mi_row += unit_height, unit_row_in_tile++) { + enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile, + unit_cols_in_tile - 1, unit_cols_in_tile); + } + } + } +} + +static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + AV1_COMP *const cpi = thread_data->cpi; + int thread_id = thread_data->thread_id; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_; +#endif + (void)unused; + struct aom_internal_error_info *const error_info = &thread_data->error_info; + MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex_); + enc_row_mt->firstpass_mt_exit = true; + pthread_mutex_unlock(enc_row_mt_mutex_); +#endif + set_firstpass_encode_done(cpi); + return 0; + } + error_info->setjmp = 1; + + AV1_COMMON *const cm = &cpi->common; + int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id]; + assert(cur_tile_id != -1); + + const BLOCK_SIZE fp_block_size = cpi->fp_block_size; + const int unit_height = mi_size_high[fp_block_size]; + int end_of_frame = 0; + while (1) { + int current_mi_row = -1; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex_); +#endif + bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit; + if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id], + ¤t_mi_row, unit_height)) { + // No jobs are available for the current tile. Query for the status of + // other tiles and get the next job if available + switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id, + ¤t_mi_row, &end_of_frame, 1, + fp_block_size); + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(enc_row_mt_mutex_); +#endif + // When firstpass_mt_exit is set to true, other workers need not pursue any + // further jobs. 
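The error handling in these worker hooks follows one pattern throughout this file: each worker arms a per-thread jmp_buf, a failure anywhere in the call chain longjmps back into the hook, and the hook raises a shared exit flag so the remaining workers stop picking up jobs. A stripped-down, single-threaded sketch of that control flow is shown below; the type and function names are hypothetical, and the shared flag would be mutex-protected in real multithreaded code.

```c
#include <setjmp.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the per-thread error info and shared MT state. */
typedef struct {
  jmp_buf jmp;
  int setjmp_armed;
} ErrorInfo;

static bool shared_exit_flag = false;

/* Deep inside the row encode, a failure longjmps back to the worker hook. */
static void encode_row(ErrorInfo *error, int row) {
  if (row == 2) longjmp(error->jmp, 1);  // simulate an allocation failure
  printf("row %d encoded\n", row);
}

static int worker_hook(ErrorInfo *error) {
  /* The jmp_buf is only valid while this function is on the stack, so the
   * armed flag must be cleared on every exit path. */
  if (setjmp(error->jmp)) {
    error->setjmp_armed = 0;
    shared_exit_flag = true;  // tell the other workers to stop waiting
    return 0;
  }
  error->setjmp_armed = 1;
  for (int row = 0; row < 4 && !shared_exit_flag; row++) encode_row(error, row);
  error->setjmp_armed = 0;
  return 1;
}

int main(void) {
  ErrorInfo error = { .setjmp_armed = 0 };
  printf("hook returned %d, exit flag %d\n", worker_hook(&error),
         (int)shared_exit_flag);
  return 0;
}
```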
+ if (firstpass_mt_exit || end_of_frame) break; + + TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id]; + AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; + ThreadData *td = thread_data->td; + + assert(current_mi_row != -1 && + current_mi_row < this_tile->tile_info.mi_row_end); + + const int unit_height_log2 = mi_size_high_log2[fp_block_size]; + av1_first_pass_row(cpi, td, this_tile, current_mi_row >> unit_height_log2, + fp_block_size); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex_); +#endif + row_mt_sync->num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(enc_row_mt_mutex_); +#endif + } + error_info->setjmp = 0; + return 1; +} +#endif + +static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data, + AV1EncRowMultiThreadInfo *enc_row_mt, + int mib_size_log2) { + AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync; + const int sb_rows = get_sb_rows_in_frame(cm); + AV1LfMTInfo *cur_job_info; + bool row_mt_exit = false; + (void)enc_row_mt; +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_; +#endif + + while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) { + LFWorkerData *const lf_data = (LFWorkerData *)thread_data->lf_data; + const int lpf_opt_level = cur_job_info->lpf_opt_level; + (void)sb_rows; +#if CONFIG_MULTITHREAD + const int cur_sb_row = cur_job_info->mi_row >> mib_size_log2; + const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1); + // Wait for current and next superblock row to finish encoding. + pthread_mutex_lock(enc_row_mt_mutex_); + while (!enc_row_mt->row_mt_exit && + (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols || + enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols)) { + pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_); + } + row_mt_exit = enc_row_mt->row_mt_exit; + pthread_mutex_unlock(enc_row_mt_mutex_); +#endif + if (row_mt_exit) return; + + av1_thread_loop_filter_rows( + lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd, + cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir, + lpf_opt_level, lf_sync, &thread_data->error_info, lf_data->params_buf, + lf_data->tx_buf, mib_size_log2); + } +} + +static void set_encoding_done(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + const int mib_size = cm->seq_params->mib_size; + + // In case of row-multithreading, due to top-right dependency, the worker on + // an SB row waits for the completion of the encode of the top and top-right + // SBs. Hence, in case a thread (main/worker) encounters an error, update that + // encoding of every SB row in the frame is complete in order to avoid the + // dependent workers of every tile from waiting indefinitely. 
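The wait in launch_loop_filter_rows() above boils down to a small predicate: a superblock row may be deblocked only once that row and the row below it have been encoded across every tile column. The sketch below evaluates that predicate on a toy progress array; it illustrates the condition only, not the loop filter itself.

```c
#include <stdbool.h>
#include <stdio.h>

/* Superblock row r may be filtered once rows r and min(r + 1, last) have had
 * every tile column encoded. */
static bool can_filter_row(const int *num_tile_cols_done, int sb_rows,
                           int tile_cols, int r) {
  const int next = r + 1 < sb_rows ? r + 1 : sb_rows - 1;
  return num_tile_cols_done[r] >= tile_cols &&
         num_tile_cols_done[next] >= tile_cols;
}

int main(void) {
  const int tile_cols = 2, sb_rows = 4;
  /* Encode progress: rows 0 and 1 fully encoded, row 2 half done, row 3 not
   * started. */
  const int num_tile_cols_done[4] = { 2, 2, 1, 0 };
  for (int r = 0; r < sb_rows; r++)
    printf("sb row %d: %s\n", r,
           can_filter_row(num_tile_cols_done, sb_rows, tile_cols, r)
               ? "filter now"
               : "wait for encode");
  return 0;
}
```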
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; + const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); + for (int mi_row = tile_info->mi_row_start, sb_row_in_tile = 0; + mi_row < tile_info->mi_row_end; + mi_row += mib_size, sb_row_in_tile++) { + enc_row_mt->sync_write_ptr(row_mt_sync, sb_row_in_tile, + sb_cols_in_tile - 1, sb_cols_in_tile); + } + } + } +} + +static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc, + const int filter_level[2]) { + return pipeline_lpf_mt_with_enc && (filter_level[0] || filter_level[1]); +} + +static int enc_row_mt_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + AV1_COMP *const cpi = thread_data->cpi; + int thread_id = thread_data->thread_id; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_; +#endif + (void)unused; + + struct aom_internal_error_info *const error_info = &thread_data->error_info; + AV1LfSync *const lf_sync = thread_data->lf_sync; + MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; + xd->error_info = error_info; + AV1_COMMON *volatile const cm = &cpi->common; + volatile const bool do_pipelined_lpf_mt_with_enc = lpf_mt_with_enc_enabled( + cpi->mt_info.pipeline_lpf_mt_with_enc, cm->lf.filter_level); + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex_); + enc_row_mt->row_mt_exit = true; + // Wake up all the workers waiting in launch_loop_filter_rows() to exit in + // case of an error. + pthread_cond_broadcast(enc_row_mt->cond_); + pthread_mutex_unlock(enc_row_mt_mutex_); +#endif + set_encoding_done(cpi); + + if (do_pipelined_lpf_mt_with_enc) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lf_sync->job_mutex); + lf_sync->lf_mt_exit = true; + pthread_mutex_unlock(lf_sync->job_mutex); +#endif + av1_set_vert_loop_filter_done(&cpi->common, lf_sync, + cpi->common.seq_params->mib_size_log2); + } + return 0; + } + error_info->setjmp = 1; + + const int mib_size_log2 = cm->seq_params->mib_size_log2; + int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id]; + + // Preallocate the pc_tree for realtime coding to reduce the cost of memory + // allocation. + if (cpi->sf.rt_sf.use_nonrd_pick_mode) { + thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size); + if (!thread_data->td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + } else { + thread_data->td->pc_root = NULL; + } + + assert(cur_tile_id != -1); + + const BLOCK_SIZE fp_block_size = cpi->fp_block_size; + int end_of_frame = 0; + bool row_mt_exit = false; + + // When master thread does not have a valid job to process, xd->tile_ctx + // is not set and it contains NULL pointer. This can result in NULL pointer + // access violation if accessed beyond the encode stage. Hence, updating + // thread_data->td->mb.e_mbd.tile_ctx is initialized with common frame + // context to avoid NULL pointer access in subsequent stages. 
+ thread_data->td->mb.e_mbd.tile_ctx = cm->fc; + while (1) { + int current_mi_row = -1; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex_); +#endif + row_mt_exit = enc_row_mt->row_mt_exit; + // row_mt_exit check here can be avoided as it is checked after + // sync_read_ptr() in encode_sb_row(). However, checking row_mt_exit here, + // tries to return before calling the function get_next_job(). + if (!row_mt_exit && + !get_next_job(&cpi->tile_data[cur_tile_id], ¤t_mi_row, + cm->seq_params->mib_size)) { + // No jobs are available for the current tile. Query for the status of + // other tiles and get the next job if available + switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id, + ¤t_mi_row, &end_of_frame, 0, + fp_block_size); + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(enc_row_mt_mutex_); +#endif + // When row_mt_exit is set to true, other workers need not pursue any + // further jobs. + if (row_mt_exit) { + error_info->setjmp = 0; + return 1; + } + + if (end_of_frame) break; + + TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id]; + AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; + const TileInfo *const tile_info = &this_tile->tile_info; + const int tile_row = tile_info->tile_row; + const int tile_col = tile_info->tile_col; + ThreadData *td = thread_data->td; + const int sb_row = current_mi_row >> mib_size_log2; + + assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end); + + td->mb.e_mbd.tile_ctx = td->tctx; + td->mb.tile_pb_ctx = &this_tile->tctx; + td->abs_sum_level = 0; + + if (this_tile->allow_update_cdf) { + td->mb.row_ctx = this_tile->row_ctx; + if (current_mi_row == tile_info->mi_row_start) + memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); + } else { + memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); + } + + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, + &td->mb.e_mbd); + + cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); + if (td->mb.txfm_search_info.mb_rd_record != NULL) { + av1_crc32c_calculator_init( + &td->mb.txfm_search_info.mb_rd_record->crc_calculator); + } + + av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex_); +#endif + this_tile->abs_sum_level += td->abs_sum_level; + row_mt_sync->num_threads_working--; + enc_row_mt->num_tile_cols_done[sb_row]++; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(enc_row_mt->cond_); + pthread_mutex_unlock(enc_row_mt_mutex_); +#endif + } + if (do_pipelined_lpf_mt_with_enc) { + // Loop-filter a superblock row if encoding of the current and next + // superblock row is complete. + // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving + // encoding and loop filter stage. 
+ launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2); + } + av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + thread_data->td->pc_root = NULL; + error_info->setjmp = 0; + return 1; +} + +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + AV1_COMP *const cpi = thread_data->cpi; + MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; + struct aom_internal_error_info *const error_info = &thread_data->error_info; + const AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int t; + + (void)unused; + + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; + return 0; + } + error_info->setjmp = 1; + + // Preallocate the pc_tree for realtime coding to reduce the cost of memory + // allocation. + if (cpi->sf.rt_sf.use_nonrd_pick_mode) { + thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size); + if (!thread_data->td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + } else { + thread_data->td->pc_root = NULL; + } + + for (t = thread_data->start; t < tile_rows * tile_cols; + t += cpi->mt_info.num_workers) { + int tile_row = t / tile_cols; + int tile_col = t % tile_cols; + + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; + thread_data->td->mb.tile_pb_ctx = &this_tile->tctx; + av1_encode_tile(cpi, thread_data->td, tile_row, tile_col); + } + + av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + thread_data->td->pc_root = NULL; + error_info->setjmp = 0; + return 1; +} + +void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) { + cpi->mt_info.workers = ppi->p_mt_info.workers; + cpi->mt_info.num_workers = ppi->p_mt_info.num_workers; + cpi->mt_info.tile_thr_data = ppi->p_mt_info.tile_thr_data; + int i; + for (i = MOD_FP; i < NUM_MT_MODULES; i++) { + cpi->mt_info.num_mod_workers[i] = + AOMMIN(cpi->mt_info.num_workers, ppi->p_mt_info.num_mod_workers[i]); + } +} + +void av1_init_cdef_worker(AV1_COMP *cpi) { + // The allocation is done only for level 0 parallel frames. No change + // in config is supported in the middle of a parallel encode set, since the + // rest of the MT modules also do not support dynamic change of config. 
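In the tile-based path, enc_worker_hook() above hands out whole tiles rather than superblock rows: each worker starts at its own tile index and strides by the worker count, matching the initial assignment made in assign_tile_to_thread(). The toy program below prints the resulting partition for a small tile grid; the numbers are arbitrary.

```c
#include <stdio.h>

int main(void) {
  const int tile_rows = 2, tile_cols = 3;  // 6 tiles in total
  const int num_workers = 4;
  const int num_tiles = tile_rows * tile_cols;

  /* Each worker starts at its own index and strides by the worker count,
   * mirroring the tile loop in enc_worker_hook(). */
  for (int worker = 0; worker < num_workers; worker++) {
    printf("worker %d encodes tiles:", worker);
    for (int t = worker; t < num_tiles; t += num_workers) {
      printf(" (%d,%d)", t / tile_cols, t % tile_cols);
    }
    printf("\n");
  }
  return 0;
}
```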
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return; + PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info; + int num_cdef_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_CDEF); + + av1_alloc_cdef_buffers(&cpi->common, &p_mt_info->cdef_worker, + &cpi->mt_info.cdef_sync, num_cdef_workers, 1); + cpi->mt_info.cdef_worker = p_mt_info->cdef_worker; +} + +#if !CONFIG_REALTIME_ONLY +void av1_init_lr_mt_buffers(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync; + if (lr_sync->sync_range) { + if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + return; + int num_lr_workers = + av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR); + assert(num_lr_workers <= lr_sync->num_workers); + lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf; + lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs; + } +} +#endif + +#if CONFIG_MULTITHREAD +void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + + if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + aom_internal_error_copy(&cpi->ppi->error, cm->error); + } + cm->error->setjmp = 1; + // Initialize enc row MT object. + if (is_first_pass || cpi->oxcf.row_mt == 1) { + AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt; + if (enc_row_mt->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, enc_row_mt->mutex_, + aom_malloc(sizeof(*(enc_row_mt->mutex_)))); + if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL); + } + if (enc_row_mt->cond_ == NULL) { + CHECK_MEM_ERROR(cm, enc_row_mt->cond_, + aom_malloc(sizeof(*(enc_row_mt->cond_)))); + if (enc_row_mt->cond_) pthread_cond_init(enc_row_mt->cond_, NULL); + } + } + + if (!is_first_pass) { + // Initialize global motion MT object. + AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync; + if (gm_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, gm_sync->mutex_, + aom_malloc(sizeof(*(gm_sync->mutex_)))); + if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL); + } +#if !CONFIG_REALTIME_ONLY + // Initialize temporal filtering MT object. + AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync; + if (tf_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, tf_sync->mutex_, + aom_malloc(sizeof(*tf_sync->mutex_))); + if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL); + } +#endif // !CONFIG_REALTIME_ONLY + // Initialize CDEF MT object. + AV1CdefSync *cdef_sync = &mt_info->cdef_sync; + if (cdef_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cdef_sync->mutex_, + aom_malloc(sizeof(*(cdef_sync->mutex_)))); + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + } + + // Initialize loop filter MT object. + AV1LfSync *lf_sync = &mt_info->lf_row_sync; + // Number of superblock rows + const int sb_rows = + CEIL_POWER_OF_TWO(cm->height >> MI_SIZE_LOG2, MAX_MIB_SIZE_LOG2); + PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info; + int num_lf_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LPF); + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_lf_workers > lf_sync->num_workers) { + av1_loop_filter_dealloc(lf_sync); + av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers); + } + + // Initialize tpl MT object. 
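The superblock-row count used for the loop filter allocation above is a ceiling division in power-of-two units. A worked instance, assuming the usual 4-pixel mi unit and 128-pixel maximum superblock (MI_SIZE_LOG2 of 2 and MAX_MIB_SIZE_LOG2 of 5), with a local mirror of the CEIL_POWER_OF_TWO() intent:

```c
#include <stdio.h>

/* Local mirror of the CEIL_POWER_OF_TWO() intent: divide by 2^n, rounding up. */
static int ceil_power_of_two(int value, int n) {
  return (value + (1 << n) - 1) >> n;
}

int main(void) {
  const int mi_size_log2 = 2;       // 4x4-pixel mi units (assumed)
  const int max_mib_size_log2 = 5;  // 128x128 max superblock = 32 mi (assumed)
  const int height = 1080;

  const int mi_rows = height >> mi_size_log2;                         // 270
  const int sb_rows = ceil_power_of_two(mi_rows, max_mib_size_log2);  // 9
  printf("height %d -> %d mi rows -> %d loop-filter sync rows\n", height,
         mi_rows, sb_rows);
  return 0;
}
```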
+ AV1TplRowMultiThreadInfo *tpl_row_mt = &mt_info->tpl_row_mt; + if (tpl_row_mt->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, tpl_row_mt->mutex_, + aom_malloc(sizeof(*(tpl_row_mt->mutex_)))); + if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL); + } + +#if !CONFIG_REALTIME_ONLY + if (is_restoration_used(cm)) { + // Initialize loop restoration MT object. + AV1LrSync *lr_sync = &mt_info->lr_row_sync; + int rst_unit_size = cpi->sf.lpf_sf.min_lr_unit_size; + int num_rows_lr = av1_lr_count_units(rst_unit_size, cm->height); + int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR); + if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows || + num_lr_workers > lr_sync->num_workers || + MAX_MB_PLANE > lr_sync->num_planes) { + av1_loop_restoration_dealloc(lr_sync); + av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr, + MAX_MB_PLANE, cm->width); + } + } +#endif + + // Initialization of pack bitstream MT object. + AV1EncPackBSSync *pack_bs_sync = &mt_info->pack_bs_sync; + if (pack_bs_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_, + aom_malloc(sizeof(*pack_bs_sync->mutex_))); + if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL); + } + } + cm->error->setjmp = 0; +} +#endif // CONFIG_MULTITHREAD + +// Computes the number of workers to be considered while allocating memory for a +// multi-threaded module under FPMT. +int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info, + MULTI_THREADED_MODULES mod_name) { + int num_mod_workers = p_mt_info->num_mod_workers[mod_name]; + if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) { + // TODO(anyone): Change num_mod_workers to num_mod_workers[MOD_FRAME_ENC]. + // As frame parallel jobs will only perform multi-threading for the encode + // stage, we can limit the allocations according to num_enc_workers per + // frame parallel encode(a.k.a num_mod_workers[MOD_FRAME_ENC]). + num_mod_workers = p_mt_info->num_workers; + } + return num_mod_workers; +} + +void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) { + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + + assert(p_mt_info->workers != NULL); + assert(p_mt_info->tile_thr_data != NULL); + + int num_workers = p_mt_info->num_workers; + int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC); + assert(num_enc_workers <= num_workers); + for (int i = num_workers - 1; i >= 0; i--) { + EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i]; + + if (i > 0) { + // Allocate thread data. + ThreadData *td; + AOM_CHECK_MEM_ERROR(&ppi->error, td, aom_memalign(32, sizeof(*td))); + av1_zero(*td); + thread_data->original_td = thread_data->td = td; + + // Set up shared coeff buffers. + av1_setup_shared_coeff_buffer(&ppi->seq_params, &td->shared_coeff_buf, + &ppi->error); + AOM_CHECK_MEM_ERROR(&ppi->error, td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*td->tmp_conv_dst))); + + if (i < p_mt_info->num_mod_workers[MOD_FP]) { + // Set up firstpass PICK_MODE_CONTEXT. + td->firstpass_ctx = + av1_alloc_pmc(ppi->cpi, BLOCK_16X16, &td->shared_coeff_buf); + if (!td->firstpass_ctx) + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + + if (!is_first_pass && i < num_enc_workers) { + // Set up sms_tree. 
+ if (av1_setup_sms_tree(ppi->cpi, td)) { + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate SMS tree"); + } + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + AOM_CHECK_MEM_ERROR( + &ppi->error, td->hash_value_buffer[x][y], + (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*td->hash_value_buffer[0][0]))); + + // Allocate frame counters in thread data. + AOM_CHECK_MEM_ERROR(&ppi->error, td->counts, + aom_calloc(1, sizeof(*td->counts))); + + // Allocate buffers used by palette coding mode. + AOM_CHECK_MEM_ERROR(&ppi->error, td->palette_buffer, + aom_memalign(16, sizeof(*td->palette_buffer))); + + // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer' are + // used in inter frames to store intermediate inter mode prediction + // results and are not required for allintra encoding mode. Hence, the + // memory allocations for these buffers are avoided for allintra + // encoding mode. + if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) { + alloc_obmc_buffers(&td->obmc_buffer, &ppi->error); + + alloc_compound_type_rd_buffers(&ppi->error, &td->comp_rd_buffer); + + for (int j = 0; j < 2; ++j) { + AOM_CHECK_MEM_ERROR( + &ppi->error, td->tmp_pred_bufs[j], + aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*td->tmp_pred_bufs[j]))); + } + } + + if (is_gradient_caching_for_hog_enabled(ppi->cpi)) { + const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome; + AOM_CHECK_MEM_ERROR(&ppi->error, td->pixel_gradient_info, + aom_malloc(sizeof(*td->pixel_gradient_info) * + plane_types * MAX_SB_SQUARE)); + } + + if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) { + const BLOCK_SIZE sb_size = ppi->cpi->common.seq_params->sb_size; + const int mi_count_in_sb = + mi_size_wide[sb_size] * mi_size_high[sb_size]; + + AOM_CHECK_MEM_ERROR( + &ppi->error, td->src_var_info_of_4x4_sub_blocks, + aom_malloc(sizeof(*td->src_var_info_of_4x4_sub_blocks) * + mi_count_in_sb)); + } + + if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { + const int num_64x64_blocks = + (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + AOM_CHECK_MEM_ERROR( + &ppi->error, td->vt64x64, + aom_malloc(sizeof(*td->vt64x64) * num_64x64_blocks)); + } + } + } + + if (!is_first_pass && ppi->cpi->oxcf.row_mt == 1 && i < num_enc_workers) { + if (i == 0) { + for (int j = 0; j < ppi->num_fp_contexts; j++) { + AOM_CHECK_MEM_ERROR(&ppi->error, ppi->parallel_cpi[j]->td.tctx, + (FRAME_CONTEXT *)aom_memalign( + 16, sizeof(*ppi->parallel_cpi[j]->td.tctx))); + } + } else { + AOM_CHECK_MEM_ERROR( + &ppi->error, thread_data->td->tctx, + (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx))); + } + } + } + + // Record the number of workers in encode stage multi-threading for which + // allocation is done. 
+ p_mt_info->prev_num_enc_workers = num_enc_workers; +} + +void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) { + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + assert(p_mt_info->num_workers == 0); + + AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers, + aom_malloc(num_workers * sizeof(*p_mt_info->workers))); + + AOM_CHECK_MEM_ERROR( + &ppi->error, p_mt_info->tile_thr_data, + aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data))); + + for (int i = 0; i < num_workers; ++i) { + AVxWorker *const worker = &p_mt_info->workers[i]; + EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i]; + + winterface->init(worker); + worker->thread_name = "aom enc worker"; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + if (i > 0) { + // Create threads + if (!winterface->reset(worker)) + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, + "Tile encoder thread creation failed"); + } + winterface->sync(worker); + + ++p_mt_info->num_workers; + } +} + +// This function will change the state and free the mutex of corresponding +// workers and terminate the object. The object can not be re-used unless a call +// to reset() is made. +void av1_terminate_workers(AV1_PRIMARY *ppi) { + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + for (int t = 0; t < p_mt_info->num_workers; ++t) { + AVxWorker *const worker = &p_mt_info->workers[t]; + aom_get_worker_interface()->end(worker); + } +} + +// This function returns 1 if frame parallel encode is supported for +// the current configuration. Returns 0 otherwise. +static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) { + // FPMT is enabled for AOM_Q and AOM_VBR. + // TODO(Tarun): Test and enable resize config. 
+ if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) { + return 0; + } + if (ppi->use_svc) { + return 0; + } + if (oxcf->tile_cfg.enable_large_scale_tile) { + return 0; + } + if (oxcf->dec_model_cfg.timing_info_present) { + return 0; + } + if (oxcf->mode != GOOD) { + return 0; + } + if (oxcf->tool_cfg.error_resilient_mode) { + return 0; + } + if (oxcf->resize_cfg.resize_mode) { + return 0; + } + if (oxcf->pass != AOM_RC_SECOND_PASS) { + return 0; + } + if (oxcf->max_threads < 2) { + return 0; + } + if (!oxcf->fp_mt) { + return 0; + } + + return 1; +} + +int av1_check_fpmt_config(AV1_PRIMARY *const ppi, + AV1EncoderConfig *const oxcf) { + if (is_fpmt_config(ppi, oxcf)) return 1; + // Reset frame parallel configuration for unsupported config + if (ppi->num_fp_contexts > 1) { + for (int i = 1; i < ppi->num_fp_contexts; i++) { + // Release the previously-used frame-buffer + if (ppi->parallel_cpi[i]->common.cur_frame != NULL) { + --ppi->parallel_cpi[i]->common.cur_frame->ref_count; + ppi->parallel_cpi[i]->common.cur_frame = NULL; + } + } + + int cur_gf_index = ppi->cpi->gf_frame_index; + int reset_size = AOMMAX(0, ppi->gf_group.size - cur_gf_index); + av1_zero_array(&ppi->gf_group.frame_parallel_level[cur_gf_index], + reset_size); + av1_zero_array(&ppi->gf_group.is_frame_non_ref[cur_gf_index], reset_size); + av1_zero_array(&ppi->gf_group.src_offset[cur_gf_index], reset_size); + memset(&ppi->gf_group.skip_frame_refresh[cur_gf_index][0], INVALID_IDX, + sizeof(ppi->gf_group.skip_frame_refresh[cur_gf_index][0]) * + reset_size * REF_FRAMES); + memset(&ppi->gf_group.skip_frame_as_ref[cur_gf_index], INVALID_IDX, + sizeof(ppi->gf_group.skip_frame_as_ref[cur_gf_index]) * reset_size); + ppi->num_fp_contexts = 1; + } + return 0; +} + +// A large value for threads used to compute the max num_enc_workers +// possible for each resolution. +#define MAX_THREADS 100 + +// Computes the max number of enc workers possible for each resolution. +static AOM_INLINE int compute_max_num_enc_workers( + CommonModeInfoParams *const mi_params, int mib_size_log2) { + int num_sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2); + int num_sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2); + + return AOMMIN((num_sb_cols + 1) >> 1, num_sb_rows); +} + +// Computes the number of frame parallel(fp) contexts to be created +// based on the number of max_enc_workers. +int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) { + ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0; + if (!av1_check_fpmt_config(ppi, oxcf)) { + return 1; + } + int max_num_enc_workers = compute_max_num_enc_workers( + &ppi->cpi->common.mi_params, ppi->cpi->common.seq_params->mib_size_log2); + // Scaling factors and rounding factors used to tune worker_per_frame + // computation. + int rounding_factor[2] = { 2, 4 }; + int scaling_factor[2] = { 4, 8 }; + int is_480p_or_lesser = + AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) <= 480; + int is_sb_64 = 0; + if (ppi->cpi != NULL) + is_sb_64 = ppi->cpi->common.seq_params->sb_size == BLOCK_64X64; + // A parallel frame encode has at least 1/4th the + // theoretical limit of max enc workers in default case. For resolutions + // larger than 480p, if SB size is 64x64, optimal performance is obtained with + // limit of 1/8. + int index = (!is_480p_or_lesser && is_sb_64) ? 
1 : 0; + int workers_per_frame = + AOMMAX(1, (max_num_enc_workers + rounding_factor[index]) / + scaling_factor[index]); + int max_threads = oxcf->max_threads; + int num_fp_contexts = max_threads / workers_per_frame; + // Based on empirical results, FPMT gains with multi-tile are significant when + // more parallel frames are available. Use FPMT with multi-tile encode only + // when sufficient threads are available for parallel encode of + // MAX_PARALLEL_FRAMES frames. + if (oxcf->tile_cfg.tile_columns > 0 || oxcf->tile_cfg.tile_rows > 0) { + if (num_fp_contexts < MAX_PARALLEL_FRAMES) num_fp_contexts = 1; + } + + num_fp_contexts = AOMMAX(1, AOMMIN(num_fp_contexts, MAX_PARALLEL_FRAMES)); + // Limit recalculated num_fp_contexts to ppi->num_fp_contexts. + num_fp_contexts = (ppi->num_fp_contexts == 1) + ? num_fp_contexts + : AOMMIN(num_fp_contexts, ppi->num_fp_contexts); + if (num_fp_contexts > 1) { + ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = + AOMMIN(max_num_enc_workers * num_fp_contexts, oxcf->max_threads); + } + return num_fp_contexts; +} + +// Computes the number of workers to process each of the parallel frames. +static AOM_INLINE int compute_num_workers_per_frame( + const int num_workers, const int parallel_frame_count) { + // Number of level 2 workers per frame context (floor division). + int workers_per_frame = (num_workers / parallel_frame_count); + return workers_per_frame; +} + +static AOM_INLINE void restore_workers_after_fpmt( + AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared); + +// Prepare level 1 workers. This function is only called for +// parallel_frame_count > 1. This function populates the mt_info structure of +// frame level contexts appropriately by dividing the total number of available +// workers amongst the frames as level 2 workers. It also populates the hook and +// data members of level 1 workers. +static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi, + AV1_COMP_DATA *first_cpi_data, + AVxWorkerHook hook, + int parallel_frame_count) { + assert(parallel_frame_count <= ppi->num_fp_contexts && + parallel_frame_count > 1); + + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + int num_workers = p_mt_info->num_workers; + + volatile int frame_idx = 0; + volatile int i = 0; + while (i < num_workers) { + // Assign level 1 worker + AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] = + &p_mt_info->workers[i]; + AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; + MultiThreadInfo *mt_info = &cur_cpi->mt_info; + // This 'aom_internal_error_info' pointer is not derived from the local + // pointer ('AV1_COMMON *const cm') to silence the compiler warning + // "variable 'cm' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered]". + struct aom_internal_error_info *const error = cur_cpi->common.error; + + // The jmp_buf is valid only within the scope of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error->jmp)) { + error->setjmp = 0; + restore_workers_after_fpmt(ppi, parallel_frame_count, i); + aom_internal_error_copy(&ppi->error, error); + } + error->setjmp = 1; + + AV1_COMMON *const cm = &cur_cpi->common; + // Assign start of level 2 worker pool + mt_info->workers = &p_mt_info->workers[i]; + mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i]; + // Assign number of workers for each frame in the parallel encode set. 
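Worked through on small numbers, the floor division used by compute_num_workers_per_frame() spreads the level 2 workers as evenly as possible, with any remainder going to the frames assigned last. The values below are arbitrary and only illustrate the arithmetic of the assignment loop.

```c
#include <stdio.h>

/* Same floor division as compute_num_workers_per_frame(). */
static int workers_per_frame(int remaining_workers, int remaining_frames) {
  return remaining_workers / remaining_frames;
}

int main(void) {
  const int num_workers = 7;           // total level 2 workers
  const int parallel_frame_count = 3;  // frames encoded in parallel

  int i = 0;  // index of the first worker handed to the current frame
  for (int frame_idx = 0; frame_idx < parallel_frame_count; frame_idx++) {
    const int n =
        workers_per_frame(num_workers - i, parallel_frame_count - frame_idx);
    printf("frame %d gets workers [%d..%d]\n", frame_idx, i, i + n - 1);
    i += n;  // mirrors `i += mt_info->num_workers` in the loop above
  }
  return 0;  // 7 workers split as 2, 2 and 3
}
```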
+ mt_info->num_workers = compute_num_workers_per_frame( + num_workers - i, parallel_frame_count - frame_idx); + for (int j = MOD_FP; j < NUM_MT_MODULES; j++) { + mt_info->num_mod_workers[j] = + AOMMIN(mt_info->num_workers, p_mt_info->num_mod_workers[j]); + } + if (p_mt_info->cdef_worker != NULL) { + mt_info->cdef_worker = &p_mt_info->cdef_worker[i]; + + // Back up the original cdef_worker pointers. + mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf; + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; plane++) + mt_info->restore_state_buf.cdef_colbuf[plane] = + mt_info->cdef_worker->colbuf[plane]; + } +#if !CONFIG_REALTIME_ONLY + if (is_restoration_used(cm)) { + // Back up the original LR buffers before update. + int idx = i + mt_info->num_workers - 1; + assert(idx < mt_info->lr_row_sync.num_workers); + mt_info->restore_state_buf.rst_tmpbuf = + mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf; + mt_info->restore_state_buf.rlbs = + mt_info->lr_row_sync.lrworkerdata[idx].rlbs; + + // Update LR buffers. + mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = cm->rst_tmpbuf; + mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cm->rlbs; + } +#endif + + i += mt_info->num_workers; + + // At this stage, the thread specific CDEF buffers for the current frame's + // 'common' and 'cdef_sync' only need to be allocated. 'cdef_worker' has + // already been allocated across parallel frames. + av1_alloc_cdef_buffers(cm, &p_mt_info->cdef_worker, &mt_info->cdef_sync, + p_mt_info->num_workers, 0); + + frame_worker->hook = hook; + frame_worker->data1 = cur_cpi; + frame_worker->data2 = (frame_idx == 0) + ? first_cpi_data + : &ppi->parallel_frames_data[frame_idx - 1]; + frame_idx++; + error->setjmp = 0; + } + p_mt_info->p_num_workers = parallel_frame_count; +} + +// Launch level 1 workers to perform frame parallel encode. +static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int num_workers = ppi->p_mt_info.p_num_workers; + + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = ppi->p_mt_info.p_workers[i]; + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +// Restore worker states after parallel encode. +static AOM_INLINE void restore_workers_after_fpmt( + AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared) { + assert(parallel_frame_count <= ppi->num_fp_contexts && + parallel_frame_count > 1); + (void)parallel_frame_count; + + PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; + + int frame_idx = 0; + int i = 0; + while (i < num_fpmt_workers_prepared) { + AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; + MultiThreadInfo *mt_info = &cur_cpi->mt_info; + const AV1_COMMON *const cm = &cur_cpi->common; + const int num_planes = av1_num_planes(cm); + + // Restore the original cdef_worker pointers. + if (p_mt_info->cdef_worker != NULL) { + mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf; + for (int plane = 0; plane < num_planes; plane++) + mt_info->cdef_worker->colbuf[plane] = + mt_info->restore_state_buf.cdef_colbuf[plane]; + } +#if !CONFIG_REALTIME_ONLY + if (is_restoration_used(cm)) { + // Restore the original LR buffers. 
+ int idx = i + mt_info->num_workers - 1; + assert(idx < mt_info->lr_row_sync.num_workers); + mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = + mt_info->restore_state_buf.rst_tmpbuf; + mt_info->lr_row_sync.lrworkerdata[idx].rlbs = + mt_info->restore_state_buf.rlbs; + } +#endif + + frame_idx++; + i += mt_info->num_workers; + } +} + +// Synchronize level 1 workers. +static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi, + int frames_in_parallel_set) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int num_workers = ppi->p_mt_info.p_num_workers; + int had_error = 0; + // Points to error in the earliest display order frame in the parallel set. + const struct aom_internal_error_info *error; + + // Encoding ends. + for (int i = num_workers - 1; i >= 0; --i) { + AVxWorker *const worker = ppi->p_mt_info.p_workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error = ppi->parallel_cpi[i]->common.error; + } + } + + restore_workers_after_fpmt(ppi, frames_in_parallel_set, + ppi->p_mt_info.num_workers); + + if (had_error) aom_internal_error_copy(&ppi->error, error); +} + +static int get_compressed_data_hook(void *arg1, void *arg2) { + AV1_COMP *cpi = (AV1_COMP *)arg1; + AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2; + int status = av1_get_compressed_data(cpi, cpi_data); + + // AOM_CODEC_OK(0) means no error. + return !status; +} + +// This function encodes the raw frame data for each frame in parallel encode +// set, and outputs the frame bit stream to the designated buffers. +void av1_compress_parallel_frames(AV1_PRIMARY *const ppi, + AV1_COMP_DATA *const first_cpi_data) { + // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf + // corresponding to frames in the current parallel encode set. + int ref_buffers_used_map = 0; + int frames_in_parallel_set = av1_init_parallel_frame_context( + first_cpi_data, ppi, &ref_buffers_used_map); + prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook, + frames_in_parallel_set); + launch_fpmt_workers(ppi); + sync_fpmt_workers(ppi, frames_in_parallel_set); + + // Release cpi->scaled_ref_buf corresponding to frames in the current parallel + // encode set. + for (int i = 0; i < frames_in_parallel_set; ++i) { + av1_release_scaled_references_fpmt(ppi->parallel_cpi[i]); + } + av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool, + ref_buffers_used_map); +} + +static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + worker->had_error = 0; + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info, + AV1_COMMON *const cm, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + const AVxWorker *const worker_main = &mt_info->workers[0]; + int had_error = worker_main->had_error; + struct aom_internal_error_info error_info; + + // Read the error_info of main thread. + if (had_error) { + error_info = ((EncWorkerData *)worker_main->data1)->error_info; + } + + // Encoding ends. 
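The launch/sync pairing used for both frame-parallel and within-frame encoding runs worker 0 synchronously on the calling thread and spawns real threads only for the rest, then walks the pool in reverse to join them and fold any failure into a single status. The sketch below reproduces that shape with plain pthreads and hypothetical worker types; it is a shape-only illustration, not the aom worker interface.

```c
#include <pthread.h>
#include <stdio.h>

typedef struct {
  int id;
  int result;
} ToyWorker;

static void *toy_hook(void *arg) {
  ToyWorker *w = (ToyWorker *)arg;
  w->result = 1;  // pretend the tile/row work succeeded
  printf("worker %d done\n", w->id);
  return NULL;
}

int main(void) {
  enum { kNumWorkers = 4 };
  ToyWorker workers[kNumWorkers];
  pthread_t threads[kNumWorkers];

  /* "Launch": spawn threads for workers 1..n-1 and run worker 0 in place,
   * mirroring winterface->launch() versus winterface->execute(). */
  for (int i = kNumWorkers - 1; i >= 0; i--) {
    workers[i].id = i;
    workers[i].result = 0;
    if (i == 0) {
      toy_hook(&workers[0]);
    } else if (pthread_create(&threads[i], NULL, toy_hook, &workers[i]) != 0) {
      return 1;
    }
  }

  /* "Sync": join the spawned workers and collapse their status into one flag,
   * the way the sync loops above collect the first error they see. */
  int had_error = !workers[0].result;
  for (int i = kNumWorkers - 1; i > 0; i--) {
    pthread_join(threads[i], NULL);
    if (!workers[i].result) had_error = 1;
  }
  printf("had_error = %d\n", had_error);
  return 0;
}
```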
+ for (int i = num_workers - 1; i > 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + if (!winterface->sync(worker)) { + had_error = 1; + error_info = ((EncWorkerData *)worker->data1)->error_info; + } + } + + if (had_error) aom_internal_error_copy(cm->error, &error_info); + + // Restore xd->error_info of the main thread back to cm->error so that the + // multithreaded code, when executed using a single thread, has a valid + // xd->error_info. + MACROBLOCKD *const xd = &((EncWorkerData *)worker_main->data1)->td->mb.e_mbd; + xd->error_info = cm->error; +} + +static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi, + int num_workers) { + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->mt_info.workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + cpi->intrabc_used |= thread_data->td->intrabc_used; + cpi->deltaq_used |= thread_data->td->deltaq_used; + // Accumulate rtc counters. + if (!frame_is_intra_only(&cpi->common)) + av1_accumulate_rtc_counters(cpi, &thread_data->td->mb); + cpi->palette_pixel_num += thread_data->td->mb.palette_pixels; + if (thread_data->td != &cpi->td) { + // Keep these conditional expressions in sync with the corresponding ones + // in prepare_enc_workers(). + if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) { + aom_free(thread_data->td->mv_costs_alloc); + thread_data->td->mv_costs_alloc = NULL; + } + if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) { + aom_free(thread_data->td->dv_costs_alloc); + thread_data->td->dv_costs_alloc = NULL; + } + } + av1_dealloc_mb_data(&thread_data->td->mb, av1_num_planes(&cpi->common)); + + // Accumulate counters. + if (i > 0) { + av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); + accumulate_rd_opt(&cpi->td, thread_data->td); + cpi->td.mb.txfm_search_info.txb_split_count += + thread_data->td->mb.txfm_search_info.txb_split_count; +#if CONFIG_SPEED_STATS + cpi->td.mb.txfm_search_info.tx_search_count += + thread_data->td->mb.txfm_search_info.tx_search_count; +#endif // CONFIG_SPEED_STATS + } + } +} + +static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1_COMMON *const cm = &cpi->common; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + } + + thread_data->td->intrabc_used = 0; + thread_data->td->deltaq_used = 0; + thread_data->td->abs_sum_level = 0; + thread_data->td->rd_counts.seg_tmp_pred_cost[0] = 0; + thread_data->td->rd_counts.seg_tmp_pred_cost[1] = 0; + + // Before encoding a frame, copy the thread data from cpi. 
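+ // Worker 0 shares cpi->td directly, so only auxiliary workers (td != &cpi->td) need a private copy of the macroblock context, hash buffers and cost tables.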
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer; + + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + memcpy(thread_data->td->hash_value_buffer[x][y], + cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0])); + thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x][y] = + thread_data->td->hash_value_buffer[x][y]; + } + } + // Keep these conditional expressions in sync with the corresponding ones + // in accumulate_counters_enc_workers(). + if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) { + CHECK_MEM_ERROR( + cm, thread_data->td->mv_costs_alloc, + (MvCosts *)aom_malloc(sizeof(*thread_data->td->mv_costs_alloc))); + thread_data->td->mb.mv_costs = thread_data->td->mv_costs_alloc; + memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs, + sizeof(MvCosts)); + } + if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) { + // Reset dv_costs to NULL for worker threads when dv cost update is + // enabled so that only dv_cost_upd_level needs to be checked before the + // aom_free() call for the same. + thread_data->td->mb.dv_costs = NULL; + if (av1_need_dv_costs(cpi)) { + CHECK_MEM_ERROR(cm, thread_data->td->dv_costs_alloc, + (IntraBCMVCosts *)aom_malloc( + sizeof(*thread_data->td->dv_costs_alloc))); + thread_data->td->mb.dv_costs = thread_data->td->dv_costs_alloc; + memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs, + sizeof(IntraBCMVCosts)); + } + } + } + av1_alloc_mb_data(cpi, &thread_data->td->mb); + + // Reset rtc counters. + av1_init_rtc_counters(&thread_data->td->mb); + + thread_data->td->mb.palette_pixels = 0; + + if (thread_data->td->counts != &cpi->counts) { + memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); + } + + if (i > 0) { + thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; + thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer; + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.tmp_pred_bufs[j] = + thread_data->td->tmp_pred_bufs[j]; + } + thread_data->td->mb.pixel_gradient_info = + thread_data->td->pixel_gradient_info; + + thread_data->td->mb.src_var_info_of_4x4_sub_blocks = + thread_data->td->src_var_info_of_4x4_sub_blocks; + + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] = + thread_data->td->mb.tmp_pred_bufs[j]; + } + } + } +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + // Before encoding a frame, copy the thread data from cpi. 
+ thread_data->td->mb = cpi->td.mb; + } + av1_alloc_src_diff_buf(cm, &thread_data->td->mb); + } +} +#endif + +// Computes the number of workers for row multi-threading of encoding stage +static AOM_INLINE int compute_num_enc_row_mt_workers(const AV1_COMMON *cm, + int max_threads) { + TileInfo tile_info; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int total_num_threads_row_mt = 0; + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + av1_tile_init(&tile_info, cm, row, col); + const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, &tile_info); + const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, &tile_info); + total_num_threads_row_mt += + AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); + } + } + return AOMMIN(max_threads, total_num_threads_row_mt); +} + +// Computes the number of workers for tile multi-threading of encoding stage +static AOM_INLINE int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm, + int max_threads) { + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + return AOMMIN(max_threads, tile_cols * tile_rows); +} + +// Find max worker of all MT stages +int av1_get_max_num_workers(const AV1_COMP *cpi) { + int max_num_workers = 0; + for (int i = MOD_FP; i < NUM_MT_MODULES; i++) + max_num_workers = + AOMMAX(cpi->ppi->p_mt_info.num_mod_workers[i], max_num_workers); + assert(max_num_workers >= 1); + return AOMMIN(max_num_workers, cpi->oxcf.max_threads); +} + +// Computes the number of workers for encoding stage (row/tile multi-threading) +int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) { + if (max_workers <= 1) return 1; + if (cpi->oxcf.row_mt) + return compute_num_enc_row_mt_workers(&cpi->common, max_workers); + else + return compute_num_enc_tile_mt_workers(&cpi->common, max_workers); +} + +void av1_encode_tiles_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int num_workers = mt_info->num_mod_workers[MOD_ENC]; + + assert(IMPLIES(cpi->tile_data == NULL, + cpi->allocated_tiles < tile_cols * tile_rows)); + if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + num_workers = AOMMIN(num_workers, mt_info->num_workers); + + prepare_enc_workers(cpi, enc_worker_hook, num_workers); + launch_workers(&cpi->mt_info, num_workers); + sync_enc_workers(&cpi->mt_info, cm, num_workers); + accumulate_counters_enc_workers(cpi, num_workers); +} + +// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' +// members, so we treat it as an array, and sum over the whole length. +void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, + const FRAME_COUNTS *counts) { + unsigned int *const acc = (unsigned int *)acc_counts; + const unsigned int *const cnt = (const unsigned int *)counts; + + const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); + + for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i]; +} + +// Computes the maximum number of sb rows and sb_cols across tiles which are +// used to allocate memory for multi-threaded encoding with row-mt=1. 
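+// Per tile, the SB count is CEIL_POWER_OF_TWO(mi_end - mi_start, mib_size_log2); the maxima over all tile rows and columns bound the per-tile row-MT synchronization arrays.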
+static AOM_INLINE void compute_max_sb_rows_cols(const AV1_COMMON *cm, + int *max_sb_rows_in_tile, + int *max_sb_cols_in_tile) { + const int tile_rows = cm->tiles.rows; + const int mib_size_log2 = cm->seq_params->mib_size_log2; + const int num_mi_rows = cm->mi_params.mi_rows; + const int *const row_start_sb = cm->tiles.row_start_sb; + for (int row = 0; row < tile_rows; row++) { + const int mi_row_start = row_start_sb[row] << mib_size_log2; + const int mi_row_end = + AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows); + const int num_sb_rows_in_tile = + CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, mib_size_log2); + *max_sb_rows_in_tile = AOMMAX(*max_sb_rows_in_tile, num_sb_rows_in_tile); + } + + const int tile_cols = cm->tiles.cols; + const int num_mi_cols = cm->mi_params.mi_cols; + const int *const col_start_sb = cm->tiles.col_start_sb; + for (int col = 0; col < tile_cols; col++) { + const int mi_col_start = col_start_sb[col] << mib_size_log2; + const int mi_col_end = + AOMMIN(col_start_sb[col + 1] << mib_size_log2, num_mi_cols); + const int num_sb_cols_in_tile = + CEIL_POWER_OF_TWO(mi_col_end - mi_col_start, mib_size_log2); + *max_sb_cols_in_tile = AOMMAX(*max_sb_cols_in_tile, num_sb_cols_in_tile); + } +} + +#if !CONFIG_REALTIME_ONLY +// Computes the number of workers for firstpass stage (row/tile multi-threading) +int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int total_num_threads_row_mt = 0; + TileInfo tile_info; + + if (cpi->oxcf.max_threads <= 1) return 1; + + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + av1_tile_init(&tile_info, cm, row, col); + const int num_mb_rows_in_tile = + av1_get_unit_rows_in_tile(&tile_info, cpi->fp_block_size); + const int num_mb_cols_in_tile = + av1_get_unit_cols_in_tile(&tile_info, cpi->fp_block_size); + total_num_threads_row_mt += + AOMMIN((num_mb_cols_in_tile + 1) >> 1, num_mb_rows_in_tile); + } + } + return AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt); +} + +// Computes the maximum number of mb_rows for row multi-threading of firstpass +// stage +static AOM_INLINE int fp_compute_max_mb_rows(const AV1_COMMON *cm, + BLOCK_SIZE fp_block_size) { + const int tile_rows = cm->tiles.rows; + const int unit_height_log2 = mi_size_high_log2[fp_block_size]; + const int mib_size_log2 = cm->seq_params->mib_size_log2; + const int num_mi_rows = cm->mi_params.mi_rows; + const int *const row_start_sb = cm->tiles.row_start_sb; + int max_mb_rows = 0; + + for (int row = 0; row < tile_rows; row++) { + const int mi_row_start = row_start_sb[row] << mib_size_log2; + const int mi_row_end = + AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows); + const int num_mb_rows_in_tile = + CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, unit_height_log2); + max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile); + } + return max_mb_rows; +} +#endif + +static void lpf_pipeline_mt_init(AV1_COMP *cpi, int num_workers) { + // Pipelining of loop-filtering after encoding is enabled when loop-filter + // level is chosen based on quantizer and frame type. It is disabled in case + // of 'LOOPFILTER_SELECTIVELY' as the stats collected during encoding stage + // decides the filter level. Loop-filtering is disabled in case + // of non-reference frames and for frames with intra block copy tool enabled. 
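+ // When pipelining is enabled, the filter level is picked here, before encoding starts, and the per-superblock-row filtering is later interleaved with encoding by the same workers; otherwise loop filtering runs as a separate pass after encoding.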
+ AV1_COMMON *cm = &cpi->common; + const int use_loopfilter = is_loopfilter_used(cm); + const int use_superres = av1_superres_scaled(cm); + const int use_cdef = is_cdef_used(cm); + const int use_restoration = is_restoration_used(cm); + MultiThreadInfo *const mt_info = &cpi->mt_info; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + + const unsigned int skip_apply_postproc_filters = + derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef, + use_superres, use_restoration); + mt_info->pipeline_lpf_mt_with_enc = + (cpi->oxcf.mode == REALTIME) && (cpi->oxcf.speed >= 5) && + (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) && + (cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY) && + !cpi->ppi->rtc_ref.non_reference_frame && !cm->features.allow_intrabc && + ((skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0); + + if (!mt_info->pipeline_lpf_mt_with_enc) return; + + set_postproc_filter_default_params(cm); + + if (!use_loopfilter) return; + + const LPF_PICK_METHOD method = cpi->sf.lpf_sf.lpf_pick; + assert(method == LPF_PICK_FROM_Q); + assert(cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY); + + av1_pick_filter_level(cpi->source, cpi, method); + + struct loopfilter *lf = &cm->lf; + const int plane_start = 0; + const int plane_end = av1_num_planes(cm); + int planes_to_lf[MAX_MB_PLANE]; + if (lpf_mt_with_enc_enabled(cpi->mt_info.pipeline_lpf_mt_with_enc, + lf->filter_level)) { + set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); + int lpf_opt_level = get_lpf_opt_level(&cpi->sf); + assert(lpf_opt_level == 2); + + const int start_mi_row = 0; + const int end_mi_row = start_mi_row + cm->mi_params.mi_rows; + + av1_loop_filter_frame_init(cm, plane_start, plane_end); + + assert(mt_info->num_mod_workers[MOD_ENC] == + mt_info->num_mod_workers[MOD_LPF]); + loop_filter_frame_mt_init(cm, start_mi_row, end_mi_row, planes_to_lf, + mt_info->num_mod_workers[MOD_LPF], + &mt_info->lf_row_sync, lpf_opt_level, + cm->seq_params->mib_size_log2); + + for (int i = num_workers - 1; i >= 0; i--) { + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + // Initialize loopfilter data + thread_data->lf_sync = &mt_info->lf_row_sync; + thread_data->lf_data = &thread_data->lf_sync->lfdata[i]; + loop_filter_data_reset(thread_data->lf_data, &cm->cur_frame->buf, cm, xd); + } + } +} + +void av1_encode_tiles_row_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const int sb_rows_in_frame = get_sb_rows_in_frame(cm); + int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id; + int max_sb_rows_in_tile = 0, max_sb_cols_in_tile = 0; + int num_workers = mt_info->num_mod_workers[MOD_ENC]; + + compute_max_sb_rows_cols(cm, &max_sb_rows_in_tile, &max_sb_cols_in_tile); + const bool alloc_row_mt_mem = + (enc_row_mt->allocated_tile_cols != tile_cols || + enc_row_mt->allocated_tile_rows != tile_rows || + enc_row_mt->allocated_rows != max_sb_rows_in_tile || + enc_row_mt->allocated_cols != (max_sb_cols_in_tile - 1) || + enc_row_mt->allocated_sb_rows != sb_rows_in_frame); + const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows; + + assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data)); + if (alloc_tile_data) { + av1_alloc_tile_data(cpi); + } + + assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem)); + if (alloc_row_mt_mem) { + row_mt_mem_alloc(cpi, 
max_sb_rows_in_tile, max_sb_cols_in_tile, + cpi->oxcf.algo_cfg.cdf_update_mode); + } + + num_workers = AOMMIN(num_workers, mt_info->num_workers); + lpf_pipeline_mt_init(cpi, num_workers); + + av1_init_tile_data(cpi); + + memset(thread_id_to_tile_id, -1, + sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); + memset(enc_row_mt->num_tile_cols_done, 0, + sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame); + enc_row_mt->row_mt_exit = false; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; + + // Initialize num_finished_cols to -1 for all rows. + memset(row_mt_sync->num_finished_cols, -1, + sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows_in_tile); + row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start; + row_mt_sync->num_threads_working = 0; + row_mt_sync->intrabc_extra_top_right_sb_delay = + av1_get_intrabc_extra_top_right_sb_delay(cm); + + av1_inter_mode_data_init(this_tile); + av1_zero_above_context(cm, &cpi->td.mb.e_mbd, + this_tile->tile_info.mi_col_start, + this_tile->tile_info.mi_col_end, tile_row); + } + } + + assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows, + num_workers); + prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers); + launch_workers(&cpi->mt_info, num_workers); + sync_enc_workers(&cpi->mt_info, cm, num_workers); + if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi); + accumulate_counters_enc_workers(cpi, num_workers); +} + +#if !CONFIG_REALTIME_ONLY +static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) { + for (int i = num_workers - 1; i >= 0; --i) { + EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i]; + if (thread_data->td != &cpi->td) + av1_dealloc_src_diff_buf(&thread_data->td->mb, + av1_num_planes(&cpi->common)); + } +} + +void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id; + int num_workers = 0; + int max_mb_rows = 0; + + max_mb_rows = fp_compute_max_mb_rows(cm, cpi->fp_block_size); + const bool alloc_row_mt_mem = enc_row_mt->allocated_tile_cols != tile_cols || + enc_row_mt->allocated_tile_rows != tile_rows || + enc_row_mt->allocated_rows != max_mb_rows; + const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows; + + assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data)); + if (alloc_tile_data) { + av1_alloc_tile_data(cpi); + } + + assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem)); + if (alloc_row_mt_mem) { + row_mt_mem_alloc(cpi, max_mb_rows, -1, 0); + } + + av1_init_tile_data(cpi); + + // For pass = 1, compute the no. of workers needed. For single-pass encode + // (pass = 0), no. of workers are already computed. 
+ if (mt_info->num_mod_workers[MOD_FP] == 0) + num_workers = av1_fp_compute_num_enc_workers(cpi); + else + num_workers = mt_info->num_mod_workers[MOD_FP]; + + memset(thread_id_to_tile_id, -1, + sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); + enc_row_mt->firstpass_mt_exit = false; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync; + + // Initialize num_finished_cols to -1 for all rows. + memset(row_mt_sync->num_finished_cols, -1, + sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows); + row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start; + row_mt_sync->num_threads_working = 0; + + // intraBC mode is not evaluated during first-pass encoding. Hence, no + // additional top-right delay is required. + row_mt_sync->intrabc_extra_top_right_sb_delay = 0; + } + } + + num_workers = AOMMIN(num_workers, mt_info->num_workers); + assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows, + num_workers); + fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers); + launch_workers(&cpi->mt_info, num_workers); + sync_enc_workers(&cpi->mt_info, cm, num_workers); + dealloc_thread_data_src_diff_buf(cpi, num_workers); +} + +void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, + int r, int c) { + (void)tpl_mt_sync; + (void)r; + (void)c; +} + +void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, + int r, int c, int cols) { + (void)tpl_mt_sync; + (void)r; + (void)c; + (void)cols; +} + +void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r, + int c) { +#if CONFIG_MULTITHREAD + int nsync = tpl_row_mt_sync->sync_range; + + if (r) { + pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync) + pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex); + pthread_mutex_unlock(mutex); + } +#else + (void)tpl_row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r, + int c, int cols) { +#if CONFIG_MULTITHREAD + int nsync = tpl_row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. + int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]); + + // When a thread encounters an error, num_finished_cols[r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // num_finished_cols[r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. 
+ tpl_row_mt_sync->num_finished_cols[r] = + AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur); + + pthread_cond_signal(&tpl_row_mt_sync->cond_[r]); + pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]); + } +#else + (void)tpl_row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +static AOM_INLINE void set_mode_estimation_done(AV1_COMP *cpi) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + const BLOCK_SIZE bsize = + convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); + const int mi_height = mi_size_high[bsize]; + AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt; + const int tplb_cols_in_tile = + ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]); + // In case of tpl row-multithreading, due to top-right dependency, the worker + // on an mb_row waits for the completion of the tpl processing of the top and + // top-right blocks. Hence, in case a thread (main/worker) encounters an + // error, update that the tpl processing of every mb_row in the frame is + // complete in order to avoid dependent workers waiting indefinitely. + for (int mi_row = 0, tplb_row = 0; mi_row < mi_params->mi_rows; + mi_row += mi_height, tplb_row++) { + (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row, + tplb_cols_in_tile - 1, tplb_cols_in_tile); + } +} + +// Each worker calls tpl_worker_hook() and computes the tpl data. +static int tpl_worker_hook(void *arg1, void *unused) { + (void)unused; + EncWorkerData *thread_data = (EncWorkerData *)arg1; + AV1_COMP *cpi = thread_data->cpi; + AV1_COMMON *cm = &cpi->common; + MACROBLOCK *x = &thread_data->td->mb; + MACROBLOCKD *xd = &x->e_mbd; + TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats; + TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers; + CommonModeInfoParams *mi_params = &cm->mi_params; + int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working; + + struct aom_internal_error_info *const error_info = &thread_data->error_info; + xd->error_info = error_info; + AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt; + (void)tpl_row_mt; +#if CONFIG_MULTITHREAD + pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_; +#endif + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. 
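+ // On error, execution longjmps back here: mark tpl_mt_exit under the mutex and flag every row as finished (set_mode_estimation_done) so that peers blocked in sync_read() are released.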
+ if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(tpl_error_mutex_); + tpl_row_mt->tpl_mt_exit = true; + pthread_mutex_unlock(tpl_error_mutex_); +#endif + set_mode_estimation_done(cpi); + return 0; + } + error_info->setjmp = 1; + + BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); + TX_SIZE tx_size = max_txsize_lookup[bsize]; + int mi_height = mi_size_high[bsize]; + + av1_init_tpl_txfm_stats(tpl_txfm_stats); + + for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows; + mi_row += num_active_workers * mi_height) { + // Motion estimation row boundary + av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, + cpi->oxcf.border_in_pixels); + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = + GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); + av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, + bsize, tx_size); + } + error_info->setjmp = 0; + return 1; +} + +// Deallocate tpl synchronization related mutex and data. +void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) { + assert(tpl_sync != NULL); + +#if CONFIG_MULTITHREAD + if (tpl_sync->mutex_ != NULL) { + for (int i = 0; i < tpl_sync->rows; ++i) + pthread_mutex_destroy(&tpl_sync->mutex_[i]); + aom_free(tpl_sync->mutex_); + } + if (tpl_sync->cond_ != NULL) { + for (int i = 0; i < tpl_sync->rows; ++i) + pthread_cond_destroy(&tpl_sync->cond_[i]); + aom_free(tpl_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + + aom_free(tpl_sync->num_finished_cols); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*tpl_sync); +} + +// Allocate memory for tpl row synchronization. +void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm, + int mb_rows) { + tpl_sync->rows = mb_rows; +#if CONFIG_MULTITHREAD + { + CHECK_MEM_ERROR(cm, tpl_sync->mutex_, + aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows)); + if (tpl_sync->mutex_) { + for (int i = 0; i < mb_rows; ++i) + pthread_mutex_init(&tpl_sync->mutex_[i], NULL); + } + + CHECK_MEM_ERROR(cm, tpl_sync->cond_, + aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows)); + if (tpl_sync->cond_) { + for (int i = 0; i < mb_rows; ++i) + pthread_cond_init(&tpl_sync->cond_[i], NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols, + aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows)); + + // Set up nsync. + tpl_sync->sync_range = 1; +} + +// Each worker is prepared by assigning the hook function and individual thread +// data. +static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + MultiThreadInfo *mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + } + + // Before encoding a frame, copy the thread data from cpi. 
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + // OBMC buffers are used only to init MS params and remain unused when + // called from tpl, hence set the buffers to defaults. + av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer); + if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers, + cpi->ppi->tpl_data.tpl_bsize_1d)) { + aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, + "Error allocating tpl data"); + } + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + } + } +} + +#if CONFIG_BITRATE_ACCURACY +// Accumulate transform stats after tpl. +static void tpl_accumulate_txfm_stats(ThreadData *main_td, + const MultiThreadInfo *mt_info, + int num_workers) { + TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + ThreadData *td = thread_data->td; + if (td != main_td) { + const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; + av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats); + } + } +} +#endif // CONFIG_BITRATE_ACCURACY + +// Implements multi-threading for tpl. +void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + CommonModeInfoParams *mi_params = &cm->mi_params; + MultiThreadInfo *mt_info = &cpi->mt_info; + TplParams *tpl_data = &cpi->ppi->tpl_data; + AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync; + int mb_rows = mi_params->mb_rows; + int num_workers = + AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers); + + if (mb_rows != tpl_sync->rows) { + av1_tpl_dealloc(tpl_sync); + av1_tpl_alloc(tpl_sync, cm, mb_rows); + } + tpl_sync->num_threads_working = num_workers; + mt_info->tpl_row_mt.tpl_mt_exit = false; + + // Initialize cur_mb_col to -1 for all MB rows. + memset(tpl_sync->num_finished_cols, -1, + sizeof(*tpl_sync->num_finished_cols) * mb_rows); + + prepare_tpl_workers(cpi, tpl_worker_hook, num_workers); + launch_workers(&cpi->mt_info, num_workers); + sync_enc_workers(&cpi->mt_info, cm, num_workers); +#if CONFIG_BITRATE_ACCURACY + tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers); +#endif // CONFIG_BITRATE_ACCURACY + for (int i = num_workers - 1; i >= 0; i--) { + EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; + ThreadData *td = thread_data->td; + if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers); + } +} + +// Deallocate memory for temporal filter multi-thread synchronization. +void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) { + assert(tf_sync != NULL); +#if CONFIG_MULTITHREAD + if (tf_sync->mutex_ != NULL) { + pthread_mutex_destroy(tf_sync->mutex_); + aom_free(tf_sync->mutex_); + } +#endif // CONFIG_MULTITHREAD + tf_sync->next_tf_row = 0; +} + +// Checks if a job is available. If job is available, +// populates next_tf_row and returns 1, else returns 0. 
+static AOM_INLINE int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync, + int *current_mb_row, int mb_rows) { + int do_next_row = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_; + pthread_mutex_lock(tf_mutex_); +#endif + if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) { + *current_mb_row = tf_mt_sync->next_tf_row; + tf_mt_sync->next_tf_row++; + do_next_row = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(tf_mutex_); +#endif + return do_next_row; +} + +// Hook function for each thread in temporal filter multi-threading. +static int tf_worker_hook(void *arg1, void *unused) { + (void)unused; + EncWorkerData *thread_data = (EncWorkerData *)arg1; + AV1_COMP *cpi = thread_data->cpi; + ThreadData *td = thread_data->td; + TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; + AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync; + const struct scale_factors *scale = &cpi->tf_ctx.sf; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *tf_mutex_ = tf_sync->mutex_; +#endif + MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; + struct aom_internal_error_info *const error_info = &thread_data->error_info; + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(tf_mutex_); + tf_sync->tf_mt_exit = true; + pthread_mutex_unlock(tf_mutex_); +#endif + return 0; + } + error_info->setjmp = 1; + + const int num_planes = av1_num_planes(&cpi->common); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + + MACROBLOCKD *mbd = &td->mb.e_mbd; + uint8_t *input_buffer[MAX_MB_PLANE]; + MB_MODE_INFO **input_mb_mode_info; + tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes); + tf_setup_macroblockd(mbd, &td->tf_data, scale); + + int current_mb_row = -1; + + while (tf_get_next_job(tf_sync, &current_mb_row, tf_ctx->mb_rows)) + av1_tf_do_filtering_row(cpi, td, current_mb_row); + + tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes); + + error_info->setjmp = 0; + return 1; +} + +// Assigns temporal filter hook function and thread data to each worker. +static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers, int is_highbitdepth) { + MultiThreadInfo *mt_info = &cpi->mt_info; + mt_info->tf_sync.next_tf_row = 0; + mt_info->tf_sync.tf_mt_exit = false; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + } + + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + // OBMC buffers are used only to init MS params and remain unused when + // called from tf, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer); + if (!tf_alloc_and_reset_data(&thread_data->td->tf_data, + cpi->tf_ctx.num_pels, is_highbitdepth)) { + aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, + "Error allocating temporal filter data"); + } + } + } +} + +// Deallocate thread specific data for temporal filter. +static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers, + int is_highbitdepth) { + MultiThreadInfo *mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; + ThreadData *td = thread_data->td; + if (td != &cpi->td) tf_dealloc_data(&td->tf_data, is_highbitdepth); + } +} + +// Accumulate sse and sum after temporal filtering. +static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) { + FRAME_DIFF *total_diff = &cpi->td.tf_data.diff; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->mt_info.workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + ThreadData *td = thread_data->td; + FRAME_DIFF *diff = &td->tf_data.diff; + if (td != &cpi->td) { + total_diff->sse += diff->sse; + total_diff->sum += diff->sum; + } + } +} + +// Implements multi-threading for temporal filter. +void av1_tf_do_filtering_mt(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + MultiThreadInfo *mt_info = &cpi->mt_info; + const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth; + + int num_workers = + AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers); + + prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth); + launch_workers(mt_info, num_workers); + sync_enc_workers(mt_info, cm, num_workers); + tf_accumulate_frame_diff(cpi, num_workers); + tf_dealloc_thread_data(cpi, num_workers, is_highbitdepth); +} + +// Checks if a job is available in the current direction. If a job is available, +// frame_idx will be populated and returns 1, else returns 0. +static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, + int cur_dir) { + GlobalMotionInfo *gm_info = &cpi->gm_info; + JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + + int total_refs = gm_info->num_ref_frames[cur_dir]; + int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir]; + + if (cur_frame_to_process < total_refs && !job_info->early_exit[cur_dir]) { + *frame_idx = gm_info->reference_frames[cur_dir][cur_frame_to_process].frame; + job_info->next_frame_to_process[cur_dir] += 1; + return 1; + } + return 0; +} + +// Switches the current direction and calls the function get_next_gm_job() if +// the speed feature 'prune_ref_frame_for_gm_search' is not set. +static AOM_INLINE void switch_direction(AV1_COMP *cpi, int *frame_idx, + int *cur_dir) { + if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return; + // Switch the direction and get next job + *cur_dir = !(*cur_dir); + get_next_gm_job(cpi, frame_idx, *(cur_dir)); +} + +// Hook function for each thread in global motion multi-threading.
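+// Each worker repeatedly pulls the next reference frame for its assigned direction under gm_sync->mutex_, runs global motion estimation for it, and, when 'prune_ref_frame_for_gm_search' is set, marks the whole direction for early exit once the result is no better than TRANSLATION.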
+static int gm_mt_worker_hook(void *arg1, void *unused) { + (void)unused; + + EncWorkerData *thread_data = (EncWorkerData *)arg1; + AV1_COMP *cpi = thread_data->cpi; + GlobalMotionInfo *gm_info = &cpi->gm_info; + AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync; + JobInfo *job_info = &gm_sync->job_info; + int thread_id = thread_data->thread_id; + GlobalMotionData *gm_thread_data = &thread_data->td->gm_data; +#if CONFIG_MULTITHREAD + pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_; +#endif + + MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; + struct aom_internal_error_info *const error_info = &thread_data->error_info; + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(gm_mt_mutex_); + gm_sync->gm_mt_exit = true; + pthread_mutex_unlock(gm_mt_mutex_); +#endif + return 0; + } + error_info->setjmp = 1; + + int cur_dir = job_info->thread_id_to_dir[thread_id]; + bool gm_mt_exit = false; + while (1) { + int ref_buf_idx = -1; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(gm_mt_mutex_); +#endif + + gm_mt_exit = gm_sync->gm_mt_exit; + // Populates ref_buf_idx(the reference frame type) for which global motion + // estimation will be done. + if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) { + // No jobs are available for the current direction. Switch + // to other direction and get the next job, if available. + switch_direction(cpi, &ref_buf_idx, &cur_dir); + } + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(gm_mt_mutex_); +#endif + + // When gm_mt_exit is set to true, other workers need not pursue any + // further jobs. + if (gm_mt_exit || ref_buf_idx == -1) break; + + // Compute global motion for the given ref_buf_idx. + av1_compute_gm_for_valid_ref_frames( + cpi, error_info, gm_info->ref_buf, ref_buf_idx, + gm_thread_data->motion_models, gm_thread_data->segment_map, + gm_info->segment_map_w, gm_info->segment_map_h); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(gm_mt_mutex_); +#endif + // If global motion w.r.t. current ref frame is + // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t + // the remaining ref frames in that direction. + if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search && + cpi->common.global_motion[ref_buf_idx].wmtype <= TRANSLATION) + job_info->early_exit[cur_dir] = 1; + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(gm_mt_mutex_); +#endif + } + error_info->setjmp = 0; + return 1; +} + +// Assigns global motion hook function and thread data to each worker. +static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + MultiThreadInfo *mt_info = &cpi->mt_info; + mt_info->gm_sync.gm_mt_exit = false; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread. + thread_data->start = i; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + } + + if (thread_data->td != &cpi->td) + gm_alloc_data(cpi, &thread_data->td->gm_data); + } +} + +// Assigns available threads to past/future direction. 
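+// Directions are handed out round-robin across workers, wrapping back to 0 after MAX_DIRECTIONS - 1.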
+static AOM_INLINE void assign_thread_to_dir(int8_t *thread_id_to_dir, + int num_workers) { + int8_t frame_dir_idx = 0; + + for (int i = 0; i < num_workers; i++) { + thread_id_to_dir[i] = frame_dir_idx++; + if (frame_dir_idx == MAX_DIRECTIONS) frame_dir_idx = 0; + } +} + +// Computes number of workers for global motion multi-threading. +static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) { + int total_refs = + cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1]; + int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search + ? AOMMIN(MAX_DIRECTIONS, total_refs) + : total_refs; + num_gm_workers = AOMMIN(num_gm_workers, cpi->mt_info.num_workers); + return (num_gm_workers); +} + +// Frees the memory allocated for each worker in global motion multi-threading. +static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) { + MultiThreadInfo *mt_info = &cpi->mt_info; + for (int j = 0; j < num_workers; j++) { + EncWorkerData *thread_data = &mt_info->tile_thr_data[j]; + ThreadData *td = thread_data->td; + if (td != &cpi->td) gm_dealloc_data(&td->gm_data); + } +} + +// Implements multi-threading for global motion. +void av1_global_motion_estimation_mt(AV1_COMP *cpi) { + JobInfo *job_info = &cpi->mt_info.gm_sync.job_info; + + av1_zero(*job_info); + + int num_workers = compute_gm_workers(cpi); + + assign_thread_to_dir(job_info->thread_id_to_dir, num_workers); + prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers); + launch_workers(&cpi->mt_info, num_workers); + sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers); + gm_dealloc_thread_data(cpi, num_workers); +} +#endif // !CONFIG_REALTIME_ONLY + +static AOM_INLINE int get_next_job_allintra( + AV1EncRowMultiThreadSync *const row_mt_sync, const int mi_row_end, + int *current_mi_row, int mib_size) { + if (row_mt_sync->next_mi_row < mi_row_end) { + *current_mi_row = row_mt_sync->next_mi_row; + row_mt_sync->num_threads_working++; + row_mt_sync->next_mi_row += mib_size; + return 1; + } + return 0; +} + +static AOM_INLINE void prepare_wiener_var_workers(AV1_COMP *const cpi, + AVxWorkerHook hook, + const int num_workers) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->thread_id = i; + // Set the starting tile for each thread, in this case the preprocessing + // stage does not need tiles. So we set it to 0. 
+ thread_data->start = 0; + + thread_data->cpi = cpi; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + } + + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + av1_alloc_mb_wiener_var_pred_buf(&cpi->common, thread_data->td); + } + } +} + +static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const int mb_step = mi_size_wide[bsize]; + assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL); + const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; + const int mt_unit_cols = + (mi_params->mi_cols + (mt_unit_step >> 1)) / mt_unit_step; + const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt; + AV1EncRowMultiThreadSync *const intra_row_mt_sync = + &cpi->ppi->intra_row_mt_sync; + + // Update the wiener variance computation of every row in the frame to + // indicate that it is complete in order to avoid dependent workers waiting + // indefinitely. + for (int mi_row = 0, mt_thread_id = 0; mi_row < mi_params->mi_rows; + mi_row += mb_step, ++mt_thread_id) { + intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id, + mt_unit_cols - 1, mt_unit_cols); + } +} + +static int cal_mb_wiener_var_hook(void *arg1, void *unused) { + (void)unused; + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + AV1_COMP *const cpi = thread_data->cpi; + MACROBLOCK *x = &thread_data->td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const int mb_step = mi_size_wide[bsize]; + AV1EncRowMultiThreadSync *const intra_row_mt_sync = + &cpi->ppi->intra_row_mt_sync; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; + (void)enc_row_mt; +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_; +#endif + + struct aom_internal_error_info *const error_info = &thread_data->error_info; + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex); + enc_row_mt->mb_wiener_mt_exit = true; + pthread_mutex_unlock(enc_row_mt_mutex); +#endif + set_mb_wiener_var_calc_done(cpi); + return 0; + } + error_info->setjmp = 1; + DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); + double sum_rec_distortion = 0; + double sum_est_rate = 0; + while (1) { + int current_mi_row = -1; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex); +#endif + int has_jobs = enc_row_mt->mb_wiener_mt_exit + ? 0 + : get_next_job_allintra(intra_row_mt_sync, + cpi->common.mi_params.mi_rows, + &current_mi_row, mb_step); +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(enc_row_mt_mutex); +#endif + if (!has_jobs) break; + // TODO(chengchen): properly accumulate the distortion and rate.
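+ // Each job covers one row of cpi->weber_bsize blocks; after the row is done, num_threads_working is decremented under the mutex before the next job is fetched.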
+ av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff, + qcoeff, dqcoeff, &sum_rec_distortion, + &sum_est_rate, + thread_data->td->wiener_tmp_pred_buf); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex); +#endif + intra_row_mt_sync->num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(enc_row_mt_mutex); +#endif + } + error_info->setjmp = 0; + return 1; +} + +static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) { + av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync); + + MultiThreadInfo *mt_info = &cpi->mt_info; + for (int j = 0; j < num_workers; ++j) { + EncWorkerData *thread_data = &mt_info->tile_thr_data[j]; + ThreadData *td = thread_data->td; + if (td != &cpi->td) av1_dealloc_mb_wiener_var_pred_buf(td); + } +} + +// This function is the multi-threading version of computing the wiener +// variance. +// Note that the wiener variance is used for allintra mode (1 pass) and its +// computation is before the frame encoding, so we don't need to consider +// the number of tiles, instead we allocate all available threads to +// the computation. +void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers, + double *sum_rec_distortion, + double *sum_est_rate) { + (void)sum_rec_distortion; + (void)sum_est_rate; + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1EncRowMultiThreadSync *const intra_row_mt_sync = + &cpi->ppi->intra_row_mt_sync; + + // TODO(chengchen): the memory usage could be improved. + const int mi_rows = cm->mi_params.mi_rows; + row_mt_sync_mem_alloc(intra_row_mt_sync, cm, mi_rows); + + intra_row_mt_sync->intrabc_extra_top_right_sb_delay = 0; + intra_row_mt_sync->num_threads_working = num_workers; + intra_row_mt_sync->next_mi_row = 0; + memset(intra_row_mt_sync->num_finished_cols, -1, + sizeof(*intra_row_mt_sync->num_finished_cols) * mi_rows); + mt_info->enc_row_mt.mb_wiener_mt_exit = false; + + prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers); + launch_workers(mt_info, num_workers); + sync_enc_workers(mt_info, cm, num_workers); + dealloc_mb_wiener_var_mt_data(cpi, num_workers); +} + +// Compare and order tiles based on absolute sum of tx coeffs. +static int compare_tile_order(const void *a, const void *b) { + const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a; + const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b; + + if (tile_a->abs_sum_level > tile_b->abs_sum_level) + return -1; + else if (tile_a->abs_sum_level == tile_b->abs_sum_level) + return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1); + else + return 1; +} + +// Get next tile index to be processed for pack bitstream +static AOM_INLINE int get_next_pack_bs_tile_idx( + AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) { + assert(pack_bs_sync->next_job_idx <= num_tiles); + if (pack_bs_sync->next_job_idx == num_tiles) return -1; + + return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++] + .tile_idx; +} + +// Calculates bitstream chunk size based on total buffer size and tile or tile +// group size. 
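+// The chunk is prorated as max_buf_size * tg_or_tile_size / frame_or_tg_size (sizes in mi units), and the last chunk takes whatever buffer space remains; e.g. a tile group covering a quarter of the frame gets roughly a quarter of the available bitstream buffer.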
+static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size, + const int frame_or_tg_size, + size_t *remain_buf_size, + size_t max_buf_size, + int is_last_chunk) { + size_t this_chunk_size; + assert(*remain_buf_size > 0); + if (is_last_chunk) { + this_chunk_size = *remain_buf_size; + *remain_buf_size = 0; + } else { + const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size; + this_chunk_size = (size_t)(size_scale / frame_or_tg_size); + *remain_buf_size -= this_chunk_size; + assert(*remain_buf_size > 0); + } + assert(this_chunk_size > 0); + return this_chunk_size; +} + +// Initializes params required for pack bitstream tile. +static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + PackBSParams *const pack_bs_params_arr, + uint8_t obu_extn_header) { + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + const int num_tiles = tiles->cols * tiles->rows; + // Fixed size tile groups for the moment + const int num_tg_hdrs = cpi->num_tg; + // Tile group size in terms of number of tiles. + const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs; + uint8_t *tile_dst = dst; + uint8_t *tile_data_curr = dst; + // Max tile group count can not be more than MAX_TILES. + int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units + int tile_idx; + int tg_idx = 0; + int tile_count_in_tg = 0; + int new_tg = 1; + + // Populate pack bitstream params of all tiles. + for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info; + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + // Calculate tile size in mi units. + const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) * + (tile_info->mi_row_end - tile_info->mi_row_start); + int is_last_tile_in_tg = 0; + tile_count_in_tg++; + if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1)) + is_last_tile_in_tg = 1; + + // Populate pack bitstream params of this tile. + pack_bs_params->curr_tg_hdr_size = 0; + pack_bs_params->obu_extn_header = obu_extn_header; + pack_bs_params->saved_wb = saved_wb; + pack_bs_params->obu_header_size = 0; + pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg; + pack_bs_params->new_tg = new_tg; + pack_bs_params->tile_col = tile_info->tile_col; + pack_bs_params->tile_row = tile_info->tile_row; + pack_bs_params->tile_size_mi = tile_size_mi; + tg_size_mi[tg_idx] += tile_size_mi; + + if (new_tg) new_tg = 0; + if (is_last_tile_in_tg) { + tile_count_in_tg = 0; + new_tg = 1; + tg_idx++; + } + } + + assert(cpi->available_bs_size > 0); + size_t tg_buf_size[MAX_TILES] = { 0 }; + size_t max_buf_size = cpi->available_bs_size; + size_t remain_buf_size = max_buf_size; + const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols; + + tile_idx = 0; + // Prepare obu, tile group and frame header of each tile group. + for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) { + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + int is_last_tg = tg_idx == cpi->num_tg - 1; + // Prorate bitstream buffer size based on tile group size and available + // buffer size. This buffer will be used to store headers and tile data. 
+ tg_buf_size[tg_idx] = + get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size, + max_buf_size, is_last_tg); + + pack_bs_params->dst = tile_dst; + pack_bs_params->tile_data_curr = tile_dst; + + // Write obu, tile group and frame header at first tile in the tile + // group. + av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx); + tile_dst += tg_buf_size[tg_idx]; + + // Exclude headers from tile group buffer size. + tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size; + tile_idx += tg_size_in_tiles; + } + + tg_idx = 0; + // Calculate bitstream buffer size of each tile in the tile group. + for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + + if (pack_bs_params->new_tg) { + max_buf_size = tg_buf_size[tg_idx]; + remain_buf_size = max_buf_size; + } + + // Prorate bitstream buffer size of this tile based on tile size and + // available buffer size. For this proration, header size is not accounted. + const size_t tile_buf_size = get_bs_chunk_size( + pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size, + max_buf_size, pack_bs_params->is_last_tile_in_tg); + pack_bs_params->tile_buf_size = tile_buf_size; + + // Update base address of bitstream buffer for tile and tile group. + if (pack_bs_params->new_tg) { + tile_dst = pack_bs_params->dst; + tile_data_curr = pack_bs_params->tile_data_curr; + // Account header size in first tile of a tile group. + pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size; + } else { + pack_bs_params->dst = tile_dst; + pack_bs_params->tile_data_curr = tile_data_curr; + } + + if (pack_bs_params->is_last_tile_in_tg) tg_idx++; + tile_dst += pack_bs_params->tile_buf_size; + } +} + +// Worker hook function of pack bitstream multithreading. +static int pack_bs_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + PackBSParams *const pack_bs_params = (PackBSParams *)arg2; + AV1_COMP *const cpi = thread_data->cpi; + AV1_COMMON *const cm = &cpi->common; + AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync; + const CommonTileParams *const tiles = &cm->tiles; + const int num_tiles = tiles->cols * tiles->rows; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *const pack_bs_mutex = pack_bs_sync->mutex_; +#endif + MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; + struct aom_internal_error_info *const error_info = &thread_data->error_info; + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pack_bs_mutex); + pack_bs_sync->pack_bs_mt_exit = true; + pthread_mutex_unlock(pack_bs_mutex); +#endif + return 0; + } + error_info->setjmp = 1; + + while (1) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pack_bs_mutex); +#endif + const int tile_idx = + pack_bs_sync->pack_bs_mt_exit + ? -1 + : get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles); +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pack_bs_mutex); +#endif + // When pack_bs_mt_exit is set to true, other workers need not pursue any + // further jobs.
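+ // tile_idx is also -1 in the normal case where every tile has already been handed out (next_job_idx == num_tiles).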
+ if (tile_idx == -1) break; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; + + av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]); + } + + error_info->setjmp = 0; + return 1; +} + +// Prepares thread data and workers of pack bitsteam multithreading. +static void prepare_pack_bs_workers(AV1_COMP *const cpi, + PackBSParams *const pack_bs_params, + AVxWorkerHook hook, const int num_workers) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + if (i == 0) { + thread_data->td = &cpi->td; + } else { + thread_data->td = thread_data->original_td; + } + + if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb; + + thread_data->cpi = cpi; + thread_data->start = i; + thread_data->thread_id = i; + av1_reset_pack_bs_thread_data(thread_data->td); + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = pack_bs_params; + } + + AV1_COMMON *const cm = &cpi->common; + AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync; + const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols; + pack_bs_sync->next_job_idx = 0; + pack_bs_sync->pack_bs_mt_exit = false; + + PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order; + // Reset tile order data of pack bitstream + av1_zero_array(pack_bs_tile_order, num_tiles); + + // Populate pack bitstream tile order structure + for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + pack_bs_tile_order[tile_idx].abs_sum_level = + cpi->tile_data[tile_idx].abs_sum_level; + pack_bs_tile_order[tile_idx].tile_idx = tile_idx; + } + + // Sort tiles in descending order based on tile area. + qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order), + compare_tile_order); +} + +// Accumulates data after pack bitsteam processing. +static void accumulate_pack_bs_data( + AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr, + uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info, + int *const largest_tile_id, unsigned int *max_tile_size, + uint32_t *const obu_header_size, uint8_t **tile_data_start, + const int num_workers) { + const AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + const int tile_count = tiles->cols * tiles->rows; + // Fixed size tile groups for the moment + size_t curr_tg_data_size = 0; + int is_first_tg = 1; + uint8_t *curr_tg_start = dst; + size_t src_offset = 0; + size_t dst_offset = 0; + + for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) { + // PackBSParams stores all parameters required to pack tile and header + // info. + const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + uint32_t tile_size = 0; + + if (pack_bs_params->new_tg) { + curr_tg_start = dst + *total_size; + curr_tg_data_size = pack_bs_params->curr_tg_hdr_size; + *tile_data_start += pack_bs_params->curr_tg_hdr_size; + *obu_header_size = pack_bs_params->obu_header_size; + } + curr_tg_data_size += + pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 
0 : 4); + + if (pack_bs_params->buf.size > *max_tile_size) { + *largest_tile_id = tile_idx; + *max_tile_size = (unsigned int)pack_bs_params->buf.size; + } + tile_size += + (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size; + + // Pack all the chunks of tile bitstreams together + if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size); + + if (pack_bs_params->is_last_tile_in_tg) + av1_write_last_tile_info( + cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size, + curr_tg_start, &tile_size, tile_data_start, largest_tile_id, + &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header); + src_offset += pack_bs_params->tile_buf_size; + dst_offset += tile_size; + *total_size += tile_size; + } + + // Accumulate thread data + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int idx = num_workers - 1; idx >= 0; idx--) { + ThreadData const *td = mt_info->tile_thr_data[idx].td; + av1_accumulate_pack_bs_thread_data(cpi, td); + } +} + +void av1_write_tile_obu_mt( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start, const int num_workers) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + + PackBSParams pack_bs_params[MAX_TILES]; + uint32_t tile_size[MAX_TILES] = { 0 }; + + for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++) + pack_bs_params[tile_idx].total_size = &tile_size[tile_idx]; + + init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header); + prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook, + num_workers); + launch_workers(mt_info, num_workers); + sync_enc_workers(mt_info, &cpi->common, num_workers); + accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info, + largest_tile_id, max_tile_size, obu_header_size, + tile_data_start, num_workers); +} + +// Deallocate memory for CDEF search multi-thread synchronization. +void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) { + (void)cdef_sync; + assert(cdef_sync != NULL); +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ != NULL) { + pthread_mutex_destroy(cdef_sync->mutex_); + aom_free(cdef_sync->mutex_); + } +#endif // CONFIG_MULTITHREAD +} + +// Updates the row and column indices of the next job to be processed. +// Also updates end_of_frame flag when the processing of all blocks is complete. +static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) { + cdef_sync->fbc++; + if (cdef_sync->fbc == nhfb) { + cdef_sync->fbr++; + if (cdef_sync->fbr == nvfb) { + cdef_sync->end_of_frame = 1; + } else { + cdef_sync->fbc = 0; + } + } +} + +// Initializes cdef_sync parameters. +static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) { +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); +#endif // CONFIG_MULTITHREAD + cdef_sync->end_of_frame = 0; + cdef_sync->fbr = 0; + cdef_sync->fbc = 0; + cdef_sync->cdef_mt_exit = false; +} + +// Checks if a job is available. If job is available, +// populates next job information and returns 1, else returns 0. 
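+// The cdef_sync mutex serializes this function, so all CDEF search workers
+// can poll it concurrently.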
+static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync, + CdefSearchCtx *cdef_search_ctx, + volatile int *cur_fbr, + volatile int *cur_fbc, + volatile int *sb_count) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + int do_next_block = 0; + const int nvfb = cdef_search_ctx->nvfb; + const int nhfb = cdef_search_ctx->nhfb; + + // If a block is skip, do not process the block and + // check the skip condition for the next block. + while (!cdef_sync->cdef_mt_exit && !cdef_sync->end_of_frame && + cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr, + cdef_sync->fbc)) { + update_next_job_info(cdef_sync, nvfb, nhfb); + } + + // Populates information needed for current job and update the row, + // column indices of the next block to be processed. + if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) { + do_next_block = 1; + *cur_fbr = cdef_sync->fbr; + *cur_fbc = cdef_sync->fbc; + *sb_count = cdef_search_ctx->sb_count; + cdef_search_ctx->sb_count++; + update_next_job_info(cdef_sync, nvfb, nhfb); + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + return do_next_block; +} + +// Hook function for each thread in CDEF search multi-threading. +static int cdef_filter_block_worker_hook(void *arg1, void *arg2) { + EncWorkerData *thread_data = (EncWorkerData *)arg1; + AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg2; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *cdef_mutex_ = cdef_sync->mutex_; +#endif + struct aom_internal_error_info *const error_info = &thread_data->error_info; + CdefSearchCtx *cdef_search_ctx = thread_data->cpi->cdef_search_ctx; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cdef_mutex_); + cdef_sync->cdef_mt_exit = true; + pthread_mutex_unlock(cdef_mutex_); +#endif + return 0; + } + error_info->setjmp = 1; + + volatile int cur_fbr, cur_fbc, sb_count; + while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc, + &sb_count)) { + av1_cdef_mse_calc_block(cdef_search_ctx, error_info, cur_fbr, cur_fbc, + sb_count); + } + error_info->setjmp = 0; + return 1; +} + +// Assigns CDEF search hook function and thread data to each worker. +static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + MultiThreadInfo *mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *thread_data = &mt_info->tile_thr_data[i]; + + thread_data->cpi = cpi; + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = &mt_info->cdef_sync; + } +} + +// Implements multi-threading for CDEF search. +void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) { + MultiThreadInfo *mt_info = &cpi->mt_info; + AV1CdefSync *cdef_sync = &mt_info->cdef_sync; + const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH]; + + cdef_reset_job_info(cdef_sync); + prepare_cdef_workers(cpi, cdef_filter_block_worker_hook, num_workers); + launch_workers(mt_info, num_workers); + sync_enc_workers(mt_info, &cpi->common, num_workers); +} + +// Computes num_workers for temporal filter multi-threading. +static AOM_INLINE int compute_num_tf_workers(const AV1_COMP *cpi) { + // For single-pass encode, using no. 
of workers as per tf block size was not + // found to improve speed. Hence the thread assignment for single-pass encode + // is kept based on compute_num_enc_workers(). + if (cpi->oxcf.pass < AOM_RC_SECOND_PASS) + return (av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads)); + + if (cpi->oxcf.max_threads <= 1) return 1; + + const int frame_height = cpi->common.height; + const BLOCK_SIZE block_size = TF_BLOCK_SIZE; + const int mb_height = block_size_high[block_size]; + const int mb_rows = get_num_blocks(frame_height, mb_height); + return AOMMIN(cpi->oxcf.max_threads, mb_rows); +} + +// Computes num_workers for tpl multi-threading. +static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) { + return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); +} + +// Computes num_workers for loop filter multi-threading. +static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) { + return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); +} + +// Computes num_workers for cdef multi-threading. +static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) { + return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); +} + +// Computes num_workers for loop-restoration multi-threading. +static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) { + return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); +} + +// Computes num_workers for pack bitstream multi-threading. +static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) { + if (cpi->oxcf.max_threads <= 1) return 1; + return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads); +} + +// Computes num_workers for all intra multi-threading. +static AOM_INLINE int compute_num_ai_workers(AV1_COMP *cpi) { + if (cpi->oxcf.max_threads <= 1) return 1; + // The multi-threading implementation of deltaq-mode = 3 in allintra + // mode is based on row multi threading. 
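+  // Hence a single worker is used when row_mt is disabled; otherwise the
+  // worker count is capped by the number of 8x8 (weber block) rows below.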
+ if (!cpi->oxcf.row_mt) return 1; + cpi->weber_bsize = BLOCK_8X8; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const int mb_step = mi_size_wide[bsize]; + const int num_mb_rows = cpi->common.mi_params.mi_rows / mb_step; + return AOMMIN(num_mb_rows, cpi->oxcf.max_threads); +} + +static int compute_num_mod_workers(AV1_COMP *cpi, + MULTI_THREADED_MODULES mod_name) { + int num_mod_workers = 0; + switch (mod_name) { + case MOD_FP: + if (cpi->oxcf.pass >= AOM_RC_SECOND_PASS) + num_mod_workers = 0; + else + num_mod_workers = + av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + break; + case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break; + case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break; + case MOD_GME: num_mod_workers = 1; break; + case MOD_ENC: + num_mod_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads); + break; + case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break; + case MOD_CDEF_SEARCH: + num_mod_workers = compute_num_cdef_workers(cpi); + break; + case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break; + case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break; + case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break; + case MOD_FRAME_ENC: + num_mod_workers = cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC]; + break; + case MOD_AI: + if (cpi->oxcf.pass == AOM_RC_ONE_PASS) { + num_mod_workers = compute_num_ai_workers(cpi); + } else { + num_mod_workers = 0; + } + break; + default: assert(0); break; + } + return (num_mod_workers); +} +// Computes the number of workers for each MT modules in the encoder +void av1_compute_num_workers_for_mt(AV1_COMP *cpi) { + for (int i = MOD_FP; i < NUM_MT_MODULES; i++) { + cpi->ppi->p_mt_info.num_mod_workers[i] = + compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i); + } +} diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h new file mode 100644 index 0000000000..468e120776 --- /dev/null +++ b/third_party/aom/av1/encoder/ethread.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_ETHREAD_H_ +#define AOM_AV1_ENCODER_ETHREAD_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct ThreadData; + +typedef struct EncWorkerData { + struct AV1_COMP *cpi; + struct ThreadData *td; + struct ThreadData *original_td; + struct aom_internal_error_info error_info; + AV1LfSync *lf_sync; + LFWorkerData *lf_data; + int start; + int thread_id; +} EncWorkerData; + +void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c); +void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c, + int cols); + +void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, + int c); +void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r, + int c, int cols); + +void av1_encode_tiles_mt(struct AV1_COMP *cpi); +void av1_encode_tiles_row_mt(struct AV1_COMP *cpi); + +#if !CONFIG_REALTIME_ONLY +void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi); + +int av1_fp_compute_num_enc_workers(AV1_COMP *cpi); +#endif + +void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, + const struct FRAME_COUNTS *counts); + +void av1_row_mt_mem_dealloc(AV1_COMP *cpi); + +void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync); + +void av1_global_motion_estimation_mt(AV1_COMP *cpi); + +#if !CONFIG_REALTIME_ONLY +void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, + int r, int c); +void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync, + int r, int c, int cols); + +void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, + int c); +void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, + int c, int cols); + +void av1_mc_flow_dispenser_mt(AV1_COMP *cpi); + +void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync); + +#endif // !CONFIG_REALTIME_ONLY + +void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers, + double *sum_rec_distortion, + double *sum_est_rate); + +void av1_tf_do_filtering_mt(AV1_COMP *cpi); + +void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync); + +void av1_compute_num_workers_for_mt(AV1_COMP *cpi); + +int av1_get_max_num_workers(const AV1_COMP *cpi); + +void av1_create_workers(AV1_PRIMARY *ppi, int num_workers); + +void av1_terminate_workers(AV1_PRIMARY *ppi); + +void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi); + +void av1_init_cdef_worker(AV1_COMP *cpi); + +#if !CONFIG_REALTIME_ONLY +void av1_init_lr_mt_buffers(AV1_COMP *cpi); +#endif + +#if CONFIG_MULTITHREAD +void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass); +#endif // CONFIG_MULTITHREAD + +int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info, + MULTI_THREADED_MODULES mod_name); + +void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass); + +void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi); + +void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync); + +void av1_write_tile_obu_mt( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start, const int num_workers); + +int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers); + +int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf); + +int av1_check_fpmt_config(AV1_PRIMARY *const ppi, AV1EncoderConfig *const oxcf); + +void av1_compress_parallel_frames(AV1_PRIMARY *const 
ppi, + AV1_COMP_DATA *const first_cpi_data); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ETHREAD_H_ diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c new file mode 100644 index 0000000000..e1b1e69ca7 --- /dev/null +++ b/third_party/aom/av1/encoder/extend.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/common.h" +#include "av1/encoder/extend.h" + +static void copy_and_extend_plane(const uint8_t *src, int src_pitch, + uint8_t *dst, int dst_pitch, int w, int h, + int extend_top, int extend_left, + int extend_bottom, int extend_right, + int chroma_step) { + int i, linesize; + // copy the left and right most columns out + const uint8_t *src_ptr1 = src; + const uint8_t *src_ptr2 = src + (w - 1) * chroma_step; + uint8_t *dst_ptr1 = dst - extend_left; + uint8_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + memset(dst_ptr1, src_ptr1[0], extend_left); + if (chroma_step == 1) { + memcpy(dst_ptr1 + extend_left, src_ptr1, w); + } else { + for (int j = 0; j < w; j++) { + dst_ptr1[extend_left + j] = src_ptr1[chroma_step * j]; + } + } + memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * (h)-extend_left; + linesize = extend_left + extend_right + w; + assert(linesize <= dst_pitch); + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += dst_pitch; + } +} + +static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, + uint8_t *dst8, int dst_pitch, int w, + int h, int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + // copy the left and right most columns out + const uint16_t *src_ptr1 = src; + const uint16_t *src_ptr2 = src + w - 1; + uint16_t *dst_ptr1 = dst - extend_left; + uint16_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + aom_memset16(dst_ptr1, src_ptr1[0], extend_left); + memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0])); + aom_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst 
+ dst_pitch * (h)-extend_left; + linesize = extend_left + extend_right + w; + assert(linesize <= dst_pitch); + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0])); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0])); + dst_ptr2 += dst_pitch; + } +} + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + // Extend src frame in buffer + const int et_y = dst->border; + const int el_y = dst->border; + const int er_y = + AOMMAX(src->y_width + dst->border, ALIGN_POWER_OF_TWO(src->y_width, 6)) - + src->y_crop_width; + const int eb_y = AOMMAX(src->y_height + dst->border, + ALIGN_POWER_OF_TWO(src->y_height, 6)) - + src->y_crop_height; + const int uv_width_subsampling = src->subsampling_x; + const int uv_height_subsampling = src->subsampling_y; + const int et_uv = et_y >> uv_height_subsampling; + const int el_uv = el_y >> uv_width_subsampling; + const int eb_uv = eb_y >> uv_height_subsampling; + const int er_uv = er_y >> uv_width_subsampling; + + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, + src->y_crop_height, et_y, el_y, eb_y, er_y); + if (!src->monochrome) { + highbd_copy_and_extend_plane( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + highbd_copy_and_extend_plane( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } + return; + } + + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, src->y_crop_height, + et_y, el_y, eb_y, er_y, 1); + if (!src->monochrome) { + // detect nv12 format + const int chroma_step = src->v_buffer ? 1 : 2; + const uint8_t *src_v_buffer = + src->v_buffer ? src->v_buffer : src->u_buffer + 1; + copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src->uv_crop_width, + src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv, + chroma_step); + copy_and_extend_plane(src_v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src->uv_crop_width, + src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv, + chroma_step); + } +} diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h new file mode 100644 index 0000000000..b8cc5b9d28 --- /dev/null +++ b/third_party/aom/av1/encoder/extend.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_EXTEND_H_ +#define AOM_AV1_ENCODER_EXTEND_H_ + +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_EXTEND_H_ diff --git a/third_party/aom/av1/encoder/external_partition.c b/third_party/aom/av1/encoder/external_partition.c new file mode 100644 index 0000000000..79f8b4c8a4 --- /dev/null +++ b/third_party/aom/av1/encoder/external_partition.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common.h" +#include "av1/encoder/external_partition.h" + +aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, + aom_ext_part_config_t config, + ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + ext_part_controller->funcs = funcs; + ext_part_controller->config = config; + const aom_ext_part_status_t status = ext_part_controller->funcs.create_model( + ext_part_controller->funcs.priv, &ext_part_controller->config, + &ext_part_controller->model); + if (status == AOM_EXT_PART_ERROR) { + return AOM_CODEC_ERROR; + } else if (status == AOM_EXT_PART_TEST) { + ext_part_controller->test_mode = 1; + ext_part_controller->ready = 0; + return AOM_CODEC_OK; + } + assert(status == AOM_EXT_PART_OK); + ext_part_controller->ready = 1; + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + av1_zero(ext_part_controller); + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + if (ext_part_controller->ready) { + const aom_ext_part_status_t status = + ext_part_controller->funcs.delete_model(ext_part_controller->model); + if (status != AOM_EXT_PART_OK) { + return AOM_CODEC_ERROR; + } + } + return av1_ext_part_init(ext_part_controller); +} + +bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, + aom_partition_decision_t *decision) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(decision != NULL); + const aom_ext_part_status_t status = + ext_part_controller->funcs.get_partition_decision( + ext_part_controller->model, decision); + if (status != AOM_EXT_PART_OK) return false; + return true; +} + +bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, + const aom_partition_stats_t *stats) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(stats != NULL); + const aom_ext_part_status_t status = + ext_part_controller->funcs.send_partition_stats( + ext_part_controller->model, stats); + if (status != AOM_EXT_PART_OK) return false; + return true; +} + +bool av1_ext_part_send_features(ExtPartController 
*ext_part_controller, + const aom_partition_features_t *features) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(features != NULL); + const aom_ext_part_status_t status = ext_part_controller->funcs.send_features( + ext_part_controller->model, features); + if (status != AOM_EXT_PART_OK) return false; + return true; +} + +aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode( + const ExtPartController *ext_part_controller) { + return ext_part_controller->funcs.decision_mode; +} diff --git a/third_party/aom/av1/encoder/external_partition.h b/third_party/aom/av1/encoder/external_partition.h new file mode 100644 index 0000000000..f74973e9eb --- /dev/null +++ b/third_party/aom/av1/encoder/external_partition.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ +#define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ + +#include + +#include "aom/aom_codec.h" +#include "aom/aom_external_partition.h" + +#ifdef __cplusplus +extern "C" { +#endif +/*!\cond */ + +typedef struct ExtPartController { + int ready; + int test_mode; + aom_ext_part_config_t config; + aom_ext_part_model_t model; + aom_ext_part_funcs_t funcs; +} ExtPartController; + +aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, + aom_ext_part_config_t config, + ExtPartController *ext_part_controller); + +aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller); + +aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller); + +bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, + aom_partition_decision_t *decision); + +bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, + const aom_partition_stats_t *stats); + +bool av1_ext_part_send_features(ExtPartController *ext_part_controller, + const aom_partition_features_t *features); + +aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode( + const ExtPartController *ext_part_controller); + +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c new file mode 100644 index 0000000000..e20b6c177e --- /dev/null +++ b/third_party/aom/av1/encoder/firstpass.c @@ -0,0 +1,1600 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/variance.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_scale/aom_scale.h" +#include "aom_scale/yv12config.h" + +#include "av1/common/entropymv.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" // av1_setup_dst_planes() +#include "av1/common/reconintra.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/block.h" +#include "av1/encoder/dwt.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encoder_utils.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/reconinter_enc.h" + +#define OUTPUT_FPF 0 + +#define FIRST_PASS_Q 10.0 +#define INTRA_MODE_PENALTY 1024 +#define NEW_MV_MODE_PENALTY 32 +#define DARK_THRESH 64 + +#define NCOUNT_INTRA_THRESH 8192 +#define NCOUNT_INTRA_FACTOR 3 + +#define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1 + +static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats, + struct aom_codec_pkt_list *pktlist) { + struct aom_codec_cx_pkt pkt; + pkt.kind = AOM_CODEC_STATS_PKT; + pkt.data.twopass_stats.buf = stats; + pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); + if (pktlist != NULL) aom_codec_pkt_list_add(pktlist, &pkt); + +// TEMP debug code +#if OUTPUT_FPF + { + FILE *fpfile; + fpfile = fopen("firstpass.stt", "a"); + + fprintf(fpfile, + "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n", + stats->frame, stats->weight, stats->intra_error, stats->coded_error, + stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion, + stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct, + stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, + stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, + stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, + stats->count, stats->duration); + fclose(fpfile); + } +#endif +} + +void av1_twopass_zero_stats(FIRSTPASS_STATS *section) { + section->frame = 0.0; + section->weight = 0.0; + section->intra_error = 0.0; + section->frame_avg_wavelet_energy = 0.0; + section->coded_error = 0.0; + section->log_intra_error = 0.0; + section->log_coded_error = 0.0; + section->sr_coded_error = 0.0; + section->pcnt_inter = 0.0; + section->pcnt_motion = 0.0; + section->pcnt_second_ref = 0.0; + section->pcnt_neutral = 0.0; + section->intra_skip_pct = 0.0; + section->inactive_zone_rows = 0.0; + section->inactive_zone_cols = 0.0; + section->MVr = 0.0; + section->mvr_abs = 0.0; + section->MVc = 0.0; + section->mvc_abs = 0.0; + section->MVrv = 0.0; + section->MVcv = 0.0; + section->mv_in_out_count = 0.0; + section->new_mv_count = 0.0; + section->count = 0.0; + section->duration = 1.0; + section->is_flash = 0; + section->noise_var = 0; + section->cor_coeff = 1.0; +} + +void av1_accumulate_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame += frame->frame; + section->weight += frame->weight; + section->intra_error += 
frame->intra_error; + section->log_intra_error += log1p(frame->intra_error); + section->log_coded_error += log1p(frame->coded_error); + section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy; + section->coded_error += frame->coded_error; + section->sr_coded_error += frame->sr_coded_error; + section->pcnt_inter += frame->pcnt_inter; + section->pcnt_motion += frame->pcnt_motion; + section->pcnt_second_ref += frame->pcnt_second_ref; + section->pcnt_neutral += frame->pcnt_neutral; + section->intra_skip_pct += frame->intra_skip_pct; + section->inactive_zone_rows += frame->inactive_zone_rows; + section->inactive_zone_cols += frame->inactive_zone_cols; + section->MVr += frame->MVr; + section->mvr_abs += frame->mvr_abs; + section->MVc += frame->MVc; + section->mvc_abs += frame->mvc_abs; + section->MVrv += frame->MVrv; + section->MVcv += frame->MVcv; + section->mv_in_out_count += frame->mv_in_out_count; + section->new_mv_count += frame->new_mv_count; + section->count += frame->count; + section->duration += frame->duration; +} + +static int get_unit_rows(const BLOCK_SIZE fp_block_size, const int mb_rows) { + const int height_mi_log2 = mi_size_high_log2[fp_block_size]; + const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16]; + if (height_mi_log2 > mb_height_mi_log2) { + return mb_rows >> (height_mi_log2 - mb_height_mi_log2); + } + + return mb_rows << (mb_height_mi_log2 - height_mi_log2); +} + +static int get_unit_cols(const BLOCK_SIZE fp_block_size, const int mb_cols) { + const int width_mi_log2 = mi_size_wide_log2[fp_block_size]; + const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16]; + if (width_mi_log2 > mb_width_mi_log2) { + return mb_cols >> (width_mi_log2 - mb_width_mi_log2); + } + + return mb_cols << (mb_width_mi_log2 - width_mi_log2); +} + +// TODO(chengchen): can we simplify it even if resize has to be considered? +static int get_num_mbs(const BLOCK_SIZE fp_block_size, + const int num_mbs_16X16) { + const int width_mi_log2 = mi_size_wide_log2[fp_block_size]; + const int height_mi_log2 = mi_size_high_log2[fp_block_size]; + const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16]; + const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16]; + // TODO(chengchen): Now this function assumes a square block is used. + // It does not support rectangular block sizes. 
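+  // For example, a hypothetical BLOCK_32X32 first pass unit covers 2x2
+  // 16X16 macroblocks, so num_mbs_16X16 is scaled down by a factor of 4
+  // (a right shift by 2).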
+ assert(width_mi_log2 == height_mi_log2); + if (width_mi_log2 > mb_width_mi_log2) { + return num_mbs_16X16 >> ((width_mi_log2 - mb_width_mi_log2) + + (height_mi_log2 - mb_height_mi_log2)); + } + + return num_mbs_16X16 << ((mb_width_mi_log2 - width_mi_log2) + + (mb_height_mi_log2 - height_mi_log2)); +} + +void av1_end_first_pass(AV1_COMP *cpi) { + if (cpi->ppi->twopass.stats_buf_ctx->total_stats && !cpi->ppi->lap_enabled) + output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, + cpi->ppi->output_pkt_list); +} + +static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_8X8: return aom_mse8x8; + case BLOCK_16X8: return aom_mse16x8; + case BLOCK_8X16: return aom_mse8x16; + default: return aom_mse16x16; + } +} + +static unsigned int get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref) { + unsigned int sse; + const aom_variance_fn_t fn = get_block_variance_fn(bsize); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, + int bd) { + switch (bd) { + default: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_8_mse8x8; + case BLOCK_16X8: return aom_highbd_8_mse16x8; + case BLOCK_8X16: return aom_highbd_8_mse8x16; + default: return aom_highbd_8_mse16x16; + } + case 10: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_10_mse8x8; + case BLOCK_16X8: return aom_highbd_10_mse16x8; + case BLOCK_8X16: return aom_highbd_10_mse8x16; + default: return aom_highbd_10_mse16x16; + } + case 12: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_12_mse8x8; + case BLOCK_16X8: return aom_highbd_12_mse16x8; + case BLOCK_8X16: return aom_highbd_12_mse8x16; + default: return aom_highbd_12_mse16x16; + } + } +} + +static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref, + int bd) { + unsigned int sse; + const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Refine the motion search range according to the frame dimension +// for first pass test. +static int get_search_range(int width, int height) { + int sr = 0; + const int dim = AOMMIN(width, height); + + while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr; + return sr; +} + +static AOM_INLINE const search_site_config * +av1_get_first_pass_search_site_config(const AV1_COMP *cpi, MACROBLOCK *x, + SEARCH_METHODS search_method) { + const int ref_stride = x->e_mbd.plane[0].pre[0].stride; + + // For AVIF applications, even the source frames can have changing resolution, + // so we need to manually check for the strides :( + // AV1_COMP::mv_search_params.search_site_config is a compressor level cache + // that's shared by multiple threads. In most cases where all frames have the + // same resolution, the cache contains the search site config that we need. + const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; + if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_FPF]->stride) { + return mv_search_params->search_site_cfg[SS_CFG_FPF]; + } + + // If the cache does not contain the correct stride, then we will need to rely + // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the + // thread level config doesn't match, then we need to update it. 
+ search_method = search_method_lookup[search_method]; + assert(search_method_lookup[search_method] == search_method && + "The search_method_lookup table should be idempotent."); + if (ref_stride != x->search_site_cfg_buf[search_method].stride) { + av1_refresh_search_site_config(x->search_site_cfg_buf, search_method, + ref_stride); + } + + return x->search_site_cfg_buf; +} + +static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, + const MV *ref_mv, + FULLPEL_MV *best_mv, + int *best_motion_err) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); + int tmp_err; + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + const int sr = get_search_range(cm->width, cm->height); + const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr; + + const search_site_config *first_pass_search_sites = + av1_get_first_pass_search_site_config(cpi, x, NSTEP); + const int fine_search_interval = + cpi->is_screen_content_type && cm->features.allow_intrabc; + FULLPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv, + start_mv, first_pass_search_sites, NSTEP, + fine_search_interval); + + FULLPEL_MV this_best_mv; + FULLPEL_MV_STATS best_mv_stats; + tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL, + &this_best_mv, &best_mv_stats, NULL); + + if (tmp_err < INT_MAX) { + aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize]; + const MSBuffers *ms_buffers = &ms_params.ms_buffers; + tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv, + &v_fn_ptr, ms_buffers->src, ms_buffers->ref) + + new_mv_mode_penalty; + } + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + *best_mv = this_best_mv; + } +} + +static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params, + const BLOCK_SIZE fp_block_size, const int unit_row, + const int unit_col) { + const int unit_width = mi_size_wide[fp_block_size]; + const int unit_height = mi_size_high[fp_block_size]; + const int is_half_width = + unit_width * unit_col + unit_width / 2 >= mi_params->mi_cols; + const int is_half_height = + unit_height * unit_row + unit_height / 2 >= mi_params->mi_rows; + const int max_dimension = + AOMMAX(block_size_wide[fp_block_size], block_size_high[fp_block_size]); + int square_block_size = 0; + // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 + switch (max_dimension) { + case 4: square_block_size = 0; break; + case 8: square_block_size = 1; break; + case 16: square_block_size = 2; break; + case 32: square_block_size = 3; break; + case 64: square_block_size = 4; break; + case 128: square_block_size = 5; break; + default: assert(0 && "First pass block size is not supported!"); break; + } + if (is_half_width && is_half_height) { + return subsize_lookup[PARTITION_SPLIT][square_block_size]; + } else if (is_half_width) { + return subsize_lookup[PARTITION_VERT][square_block_size]; + } else if (is_half_height) { + return subsize_lookup[PARTITION_HORZ][square_block_size]; + } else { + return fp_block_size; + } +} + +static int find_fp_qindex(aom_bit_depth_t bit_depth) { + return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1); +} + +static double raw_motion_error_stdev(int *raw_motion_err_list, + int raw_motion_err_counts) { + int64_t sum_raw_err = 0; + double raw_err_avg = 0; + double raw_err_stdev = 0; + if (raw_motion_err_counts == 0) return 0; + + int i; + for (i = 0; i < 
raw_motion_err_counts; i++) { + sum_raw_err += raw_motion_err_list[i]; + } + raw_err_avg = (double)sum_raw_err / raw_motion_err_counts; + for (i = 0; i < raw_motion_err_counts; i++) { + raw_err_stdev += (raw_motion_err_list[i] - raw_err_avg) * + (raw_motion_err_list[i] - raw_err_avg); + } + // Calculate the standard deviation for the motion error of all the inter + // blocks of the 0,0 motion using the last source + // frame as the reference. + raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts); + return raw_err_stdev; +} + +static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) { + return oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL; +} +typedef struct intra_pred_block_pass1_args { + const SequenceHeader *seq_params; + MACROBLOCK *x; +} intra_pred_block_pass1_args; + +static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src, + int sstride, int width, int height, int use_hbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride, + CONVERT_TO_SHORTPTR(dst), dstride, width, height); + } else { + aom_convolve_copy(src, sstride, dst, dstride, width, height); + } +#else + (void)use_hbd; + aom_convolve_copy(src, sstride, dst, dstride, width, height); +#endif +} + +static void first_pass_intra_pred_and_calc_diff(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + (void)block; + struct intra_pred_block_pass1_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + MACROBLOCK_PLANE *const p = &x->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const SequenceHeader *seq_params = args->seq_params; + const int src_stride = p->src.stride; + uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src, + src_stride, dst, dst_stride, blk_col, blk_row, plane); + + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); +} + +static void first_pass_predict_intra_block_for_luma_plane( + const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = AOM_PLANE_Y; + const MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + const int dst_stride = pd->dst.stride; + uint8_t *dst = pd->dst.buf; + const MACROBLOCK_PLANE *const p = &x->plane[plane]; + const int src_stride = p->src.stride; + const uint8_t *src = p->src.buf; + + intra_pred_block_pass1_args args = { seq_params, x }; + av1_foreach_transformed_block_in_plane( + xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args); + + // copy source data to recon buffer, as the recon buffer will be used as a + // reference frame subsequently. + copy_rect(dst, dst_stride, src, src_stride, block_size_wide[bsize], + block_size_high[bsize], seq_params->use_highbitdepth); +} + +#define UL_INTRA_THRESH 50 +#define INVALID_ROW -1 +// Computes and returns the intra pred error of a block. 
+// intra pred error: sum of squared error of the intra predicted residual. +// Inputs: +// cpi: the encoder setting. Only a few params in it will be used. +// this_frame: the current frame buffer. +// tile: tile information (not used in first pass, already init to zero) +// unit_row: row index in the unit of first pass block size. +// unit_col: column index in the unit of first pass block size. +// y_offset: the offset of y frame buffer, indicating the starting point of +// the current block. +// uv_offset: the offset of u and v frame buffer, indicating the starting +// point of the current block. +// fp_block_size: first pass block size. +// qindex: quantization step size to encode the frame. +// stats: frame encoding stats. +// Modifies: +// stats->intra_skip_count +// stats->image_data_start_row +// stats->intra_factor +// stats->brightness_factor +// stats->intra_error +// stats->frame_avg_wavelet_energy +// Returns: +// this_intra_error. +static int firstpass_intra_prediction( + AV1_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const this_frame, + const TileInfo *const tile, const int unit_row, const int unit_col, + const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size, + const int qindex, FRAME_STATS *const stats) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = cm->seq_params; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int unit_scale = mi_size_wide[fp_block_size]; + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE bsize = + get_bsize(mi_params, fp_block_size, unit_row, unit_col); + + set_mi_offsets(mi_params, xd, unit_row * unit_scale, unit_col * unit_scale); + xd->plane[0].dst.buf = this_frame->y_buffer + y_offset; + if (num_planes > 1) { + xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset; + xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset; + } + xd->left_available = (unit_col != 0); + xd->mi[0]->bsize = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, tile, unit_row * unit_scale, mi_size_high[bsize], + unit_col * unit_scale, mi_size_wide[bsize], mi_params->mi_rows, + mi_params->mi_cols); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes); + xd->mi[0]->segment_id = 0; + xd->lossless[xd->mi[0]->segment_id] = (qindex == 0); + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = TX_4X4; + + if (cpi->sf.fp_sf.disable_recon) + first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize); + else + av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0); + int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff); + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: this_intra_error >>= 4; break; + case AOM_BITS_12: this_intra_error >>= 8; break; + default: + assert(0 && + "seq_params->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + } + + if (this_intra_error < UL_INTRA_THRESH) { + ++stats->intra_skip_count; + } else if ((unit_col > 0) && (stats->image_data_start_row == INVALID_ROW)) { + stats->image_data_start_row = unit_row; + } + + double log_intra = log1p(this_intra_error); + if (log_intra < 10.0) { + stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); + } else { + stats->intra_factor += 1.0; + } + + int level_sample; + if (seq_params->use_highbitdepth) { + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + } else { + 
level_sample = x->plane[0].src.buf[0]; + } + + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: level_sample >>= 2; break; + case AOM_BITS_12: level_sample >>= 4; break; + default: + assert(0 && + "seq_params->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + } + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { + stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); + } else { + stats->brightness_factor += 1.0; + } + + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_intra_error += INTRA_MODE_PENALTY; + + // Accumulate the intra error. + stats->intra_error += (int64_t)this_intra_error; + + // Stats based on wavelet energy is used in the following cases : + // 1. ML model which predicts if a flat structure (golden-frame only structure + // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in + // constant quality mode under certain conditions. + // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL. + // Thus, wavelet energy calculation is enabled for the above cases. + if (calc_wavelet_energy(&cpi->oxcf)) { + const int hbd = is_cur_buf_hbd(xd); + const int stride = x->plane[0].src.stride; + const int num_8x8_rows = block_size_high[fp_block_size] / 8; + const int num_8x8_cols = block_size_wide[fp_block_size] / 8; + const uint8_t *buf = x->plane[0].src.buf; + stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input( + buf, stride, hbd, num_8x8_rows, num_8x8_cols); + } else { + stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP; + } + + return this_intra_error; +} + +// Returns the sum of square error between source and reference blocks. +static int get_prediction_error_bitdepth(const int is_high_bitdepth, + const int bitdepth, + const BLOCK_SIZE block_size, + const struct buf_2d *src, + const struct buf_2d *ref) { + (void)is_high_bitdepth; + (void)bitdepth; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_high_bitdepth) { + return highbd_get_prediction_error(block_size, src, ref, bitdepth); + } +#endif // CONFIG_AV1_HIGHBITDEPTH + return get_prediction_error(block_size, src, ref); +} + +// Accumulates motion vector stats. +// Modifies member variables of "stats". +static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, + const int mb_row, const int mb_col, + const int mb_rows, const int mb_cols, + MV *last_non_zero_mv, FRAME_STATS *stats) { + if (is_zero_mv(&best_mv)) return; + + ++stats->mv_count; + // Non-zero vector, was it different from the last non zero vector? + if (!is_equal_mv(&best_mv, last_non_zero_mv)) ++stats->new_mv_count; + *last_non_zero_mv = best_mv; + + // Does the row vector point inwards or outwards? + if (mb_row < mb_rows / 2) { + if (mv.row > 0) { + --stats->sum_in_vectors; + } else if (mv.row < 0) { + ++stats->sum_in_vectors; + } + } else if (mb_row > mb_rows / 2) { + if (mv.row > 0) { + ++stats->sum_in_vectors; + } else if (mv.row < 0) { + --stats->sum_in_vectors; + } + } + + // Does the col vector point inwards or outwards? 
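+  // The column handling below mirrors the row handling above.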
+ if (mb_col < mb_cols / 2) { + if (mv.col > 0) { + --stats->sum_in_vectors; + } else if (mv.col < 0) { + ++stats->sum_in_vectors; + } + } else if (mb_col > mb_cols / 2) { + if (mv.col > 0) { + ++stats->sum_in_vectors; + } else if (mv.col < 0) { + --stats->sum_in_vectors; + } + } +} + +// Computes and returns the inter prediction error from the last frame. +// Computes inter prediction errors from the golden and alt ref frams and +// Updates stats accordingly. +// Inputs: +// cpi: the encoder setting. Only a few params in it will be used. +// last_frame: the frame buffer of the last frame. +// golden_frame: the frame buffer of the golden frame. +// unit_row: row index in the unit of first pass block size. +// unit_col: column index in the unit of first pass block size. +// recon_yoffset: the y offset of the reconstructed frame buffer, +// indicating the starting point of the current block. +// recont_uvoffset: the u/v offset of the reconstructed frame buffer, +// indicating the starting point of the current block. +// src_yoffset: the y offset of the source frame buffer. +// fp_block_size: first pass block size. +// this_intra_error: the intra prediction error of this block. +// raw_motion_err_counts: the count of raw motion vectors. +// raw_motion_err_list: the array that records the raw motion error. +// ref_mv: the reference used to start the motion search +// best_mv: the best mv found +// last_non_zero_mv: the last non zero mv found in this tile row. +// stats: frame encoding stats. +// Modifies: +// raw_motion_err_list +// best_ref_mv +// last_mv +// stats: many member params in it. +// Returns: +// this_inter_error +static int firstpass_inter_prediction( + AV1_COMP *cpi, ThreadData *td, const YV12_BUFFER_CONFIG *const last_frame, + const YV12_BUFFER_CONFIG *const golden_frame, const int unit_row, + const int unit_col, const int recon_yoffset, const int recon_uvoffset, + const int src_yoffset, const BLOCK_SIZE fp_block_size, + const int this_intra_error, const int raw_motion_err_counts, + int *raw_motion_err_list, const MV ref_mv, MV *best_mv, + MV *last_non_zero_mv, FRAME_STATS *stats) { + int this_inter_error = this_intra_error; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CurrentFrame *const current_frame = &cm->current_frame; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int is_high_bitdepth = is_cur_buf_hbd(xd); + const int bitdepth = xd->bd; + const int unit_scale = mi_size_wide[fp_block_size]; + const BLOCK_SIZE bsize = + get_bsize(mi_params, fp_block_size, unit_row, unit_col); + const int fp_block_size_height = block_size_wide[fp_block_size]; + const int unit_width = mi_size_wide[fp_block_size]; + const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows); + const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols); + // Assume 0,0 motion with no mv overhead. + FULLPEL_MV mv = kZeroFullMv; + xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + av1_set_mv_col_limits(mi_params, &x->mv_limits, unit_col * unit_width, + fp_block_size_height >> MI_SIZE_LOG2, + cpi->oxcf.border_in_pixels); + + int motion_error = + get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, + &x->plane[0].src, &xd->plane[0].pre[0]); + + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. 
Skip the further motion search on + // reconstructed frame if this error is small. + // TODO(chiyotsai): The unscaled last source might be different dimension + // as the current source. See BUG=aomedia:3413 + struct buf_2d unscaled_last_source_buf_2d; + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + src_yoffset; + unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; + const int raw_motion_error = get_prediction_error_bitdepth( + is_high_bitdepth, bitdepth, bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + raw_motion_err_list[raw_motion_err_counts] = raw_motion_error; + const FIRST_PASS_SPEED_FEATURES *const fp_sf = &cpi->sf.fp_sf; + + if (raw_motion_error > fp_sf->skip_motion_search_threshold) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if ((fp_sf->skip_zeromv_motion_search == 0) && !is_zero_mv(&ref_mv)) { + FULLPEL_MV tmp_mv = kZeroFullMv; + int tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + } + } + } + + // Motion search in 2nd reference frame. + int gf_motion_error = motion_error; + if ((current_frame->frame_number > 1) && golden_frame != NULL) { + FULLPEL_MV tmp_mv = kZeroFullMv; + // Assume 0,0 motion with no mv overhead. + av1_setup_pre_planes(xd, 0, golden_frame, 0, 0, NULL, 1); + xd->plane[0].pre[0].buf += recon_yoffset; + gf_motion_error = + get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, + &x->plane[0].src, &xd->plane[0].pre[0]); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error); + } + if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) { + ++stats->second_ref_count; + } + // In accumulating a score for the 2nd reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if ((current_frame->frame_number > 1) && golden_frame != NULL) { + stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error); + } else { + // TODO(chengchen): I believe logically this should also be changed to + // stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error). + stats->sr_coded_error += motion_error; + } + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; + if (av1_num_planes(&cpi->common) > 1) { + xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset; + } + + // Start by assuming that intra mode is best. + *best_mv = kZeroMv; + + if (motion_error <= this_intra_error) { + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_intra_error - INTRA_MODE_PENALTY) * 9 <= motion_error * 10) && + (this_intra_error < (2 * INTRA_MODE_PENALTY))) { + stats->neutral_count += 1.0; + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length. 
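+    // Note on the weighted branch below: rather than adding a full count it
+    // adds motion_error / intra_error, so an inter error of roughly half the
+    // intra error contributes about 0.5 to neutral_count (figures
+    // illustrative).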
+ } else if ((this_intra_error > NCOUNT_INTRA_THRESH) && + (this_intra_error < (NCOUNT_INTRA_FACTOR * motion_error))) { + stats->neutral_count += + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error); + } + + *best_mv = get_mv_from_fullmv(&mv); + this_inter_error = motion_error; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = *best_mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NONE_FRAME; + + if (fp_sf->disable_recon == 0) { + av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale, + unit_col * unit_scale, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + av1_encode_sby_pass1(cpi, x, bsize); + } + stats->sum_mvr += best_mv->row; + stats->sum_mvr_abs += abs(best_mv->row); + stats->sum_mvc += best_mv->col; + stats->sum_mvc_abs += abs(best_mv->col); + stats->sum_mvrs += best_mv->row * best_mv->row; + stats->sum_mvcs += best_mv->col * best_mv->col; + ++stats->inter_count; + + accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols, + last_non_zero_mv, stats); + } + + return this_inter_error; +} + +// Normalize the first pass stats. +// Errors / counters are normalized to each MB. +// MVs are normalized to the width/height of the frame. +static void normalize_firstpass_stats(FIRSTPASS_STATS *fps, + double num_mbs_16x16, double f_w, + double f_h) { + fps->coded_error /= num_mbs_16x16; + fps->sr_coded_error /= num_mbs_16x16; + fps->intra_error /= num_mbs_16x16; + fps->frame_avg_wavelet_energy /= num_mbs_16x16; + fps->log_coded_error = log1p(fps->coded_error); + fps->log_intra_error = log1p(fps->intra_error); + fps->MVr /= f_h; + fps->mvr_abs /= f_h; + fps->MVc /= f_w; + fps->mvc_abs /= f_w; + fps->MVrv /= (f_h * f_h); + fps->MVcv /= (f_w * f_w); + fps->new_mv_count /= num_mbs_16x16; +} + +// Updates the first pass stats of this frame. +// Input: +// cpi: the encoder setting. Only a few params in it will be used. +// stats: stats accumulated for this frame. +// raw_err_stdev: the standard deviation for the motion error of all the +// inter blocks of the (0,0) motion using the last source +// frame as the reference. +// frame_number: current frame number. +// ts_duration: Duration of the frame / collection of frames. +// Updates: +// twopass->total_stats: the accumulated stats. +// twopass->stats_buf_ctx->stats_in_end: the pointer to the current stats, +// update its value and its position +// in the buffer. +static void update_firstpass_stats(AV1_COMP *cpi, + const FRAME_STATS *const stats, + const double raw_err_stdev, + const int frame_number, + const int64_t ts_duration, + const BLOCK_SIZE fp_block_size) { + TWO_PASS *twopass = &cpi->ppi->twopass; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; + FIRSTPASS_STATS fps; + // The minimum error here ensures some bit allocation to frames even + // in static regions. The allocation per MB declines for larger formats + // where the typical "real" energy per MB also falls. + // Initial estimate here uses sqrt(mbs) to define the min_err, where the + // number of mbs is proportional to the image area. + const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : mi_params->MBs; + // Number of actual units used in the first pass, which can be a square + // block size other than 16X16.
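+  // Rough illustration of the floor computed below (assuming the default
+  // 16x16 unit and 1080p input): 1920x1080 rounds up to 120 x 68 = 8160 units,
+  // so min_err = 200 * sqrt(8160), roughly 18,000. This keeps some bits
+  // allocated even to completely static frames.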
+ const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16); + const double min_err = 200 * sqrt(num_mbs); + + fps.weight = stats->intra_factor * stats->brightness_factor; + fps.frame = frame_number; + fps.coded_error = (double)(stats->coded_error >> 8) + min_err; + fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err; + fps.intra_error = (double)(stats->intra_error >> 8) + min_err; + fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy; + fps.count = 1.0; + fps.pcnt_inter = (double)stats->inter_count / num_mbs; + fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs; + fps.pcnt_neutral = (double)stats->neutral_count / num_mbs; + fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs; + fps.inactive_zone_rows = (double)stats->image_data_start_row; + fps.inactive_zone_cols = 0.0; // Placeholder: not currently supported. + fps.raw_error_stdev = raw_err_stdev; + fps.is_flash = 0; + fps.noise_var = 0.0; + fps.cor_coeff = 1.0; + fps.log_coded_error = 0.0; + fps.log_intra_error = 0.0; + + if (stats->mv_count > 0) { + fps.MVr = (double)stats->sum_mvr / stats->mv_count; + fps.mvr_abs = (double)stats->sum_mvr_abs / stats->mv_count; + fps.MVc = (double)stats->sum_mvc / stats->mv_count; + fps.mvc_abs = (double)stats->sum_mvc_abs / stats->mv_count; + fps.MVrv = ((double)stats->sum_mvrs - + ((double)stats->sum_mvr * stats->sum_mvr / stats->mv_count)) / + stats->mv_count; + fps.MVcv = ((double)stats->sum_mvcs - + ((double)stats->sum_mvc * stats->sum_mvc / stats->mv_count)) / + stats->mv_count; + fps.mv_in_out_count = (double)stats->sum_in_vectors / (stats->mv_count * 2); + fps.new_mv_count = stats->new_mv_count; + fps.pcnt_motion = (double)stats->mv_count / num_mbs; + } else { + fps.MVr = 0.0; + fps.mvr_abs = 0.0; + fps.MVc = 0.0; + fps.mvc_abs = 0.0; + fps.MVrv = 0.0; + fps.MVcv = 0.0; + fps.mv_in_out_count = 0.0; + fps.new_mv_count = 0.0; + fps.pcnt_motion = 0.0; + } + + // TODO(paulwilkins): Handle the case when duration is set to 0, or + // something less than the full time between subsequent values of + // cpi->source_time_stamp. + fps.duration = (double)ts_duration; + + normalize_firstpass_stats(&fps, num_mbs_16X16, cm->width, cm->height); + + // We will store the stats inside the persistent twopass struct (and NOT the + // local variable 'fps'), and then cpi->output_pkt_list will point to it. + *this_frame_stats = fps; + if (!cpi->ppi->lap_enabled) { + output_stats(this_frame_stats, cpi->ppi->output_pkt_list); + } else { + av1_firstpass_info_push(&twopass->firstpass_info, this_frame_stats); + } + if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) { + av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps); + } + twopass->stats_buf_ctx->stats_in_end++; + // When ducky encode is on, we always use linear buffer for stats_buf_ctx. + if (cpi->use_ducky_encode == 0) { + // TODO(angiebird): Figure out why first pass uses circular buffer. 
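+    // Sketch of the wrap handled below: stats_in_end has already been advanced
+    // past the slot just written, so once it reaches stats_in_buf_end it is
+    // pointed back at stats_in_start and the oldest slot is overwritten on a
+    // later frame (acceptable in first pass because each frame's stats were
+    // already emitted as a packet).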
+ /* In the case of two pass, first pass uses it as a circular buffer, + * when LAP is enabled it is used as a linear buffer*/ + if ((cpi->oxcf.pass == AOM_RC_FIRST_PASS) && + (twopass->stats_buf_ctx->stats_in_end >= + twopass->stats_buf_ctx->stats_in_buf_end)) { + twopass->stats_buf_ctx->stats_in_end = + twopass->stats_buf_ctx->stats_in_start; + } + } +} + +static void print_reconstruction_frame( + const YV12_BUFFER_CONFIG *const last_frame, int frame_number, + int do_print) { + if (!do_print) return; + + char filename[512]; + FILE *recon_file; + snprintf(filename, sizeof(filename), "enc%04d.yuv", frame_number); + + if (frame_number == 0) { + recon_file = fopen(filename, "wb"); + } else { + recon_file = fopen(filename, "ab"); + } + + fwrite(last_frame->buffer_alloc, last_frame->frame_size, 1, recon_file); + fclose(recon_file); +} + +static FRAME_STATS accumulate_frame_stats(FRAME_STATS *mb_stats, int mb_rows, + int mb_cols) { + FRAME_STATS stats = { 0 }; + int i, j; + + stats.image_data_start_row = INVALID_ROW; + for (j = 0; j < mb_rows; j++) { + for (i = 0; i < mb_cols; i++) { + FRAME_STATS mb_stat = mb_stats[j * mb_cols + i]; + stats.brightness_factor += mb_stat.brightness_factor; + stats.coded_error += mb_stat.coded_error; + stats.frame_avg_wavelet_energy += mb_stat.frame_avg_wavelet_energy; + if (stats.image_data_start_row == INVALID_ROW && + mb_stat.image_data_start_row != INVALID_ROW) { + stats.image_data_start_row = mb_stat.image_data_start_row; + } + stats.inter_count += mb_stat.inter_count; + stats.intra_error += mb_stat.intra_error; + stats.intra_factor += mb_stat.intra_factor; + stats.intra_skip_count += mb_stat.intra_skip_count; + stats.mv_count += mb_stat.mv_count; + stats.neutral_count += mb_stat.neutral_count; + stats.new_mv_count += mb_stat.new_mv_count; + stats.second_ref_count += mb_stat.second_ref_count; + stats.sr_coded_error += mb_stat.sr_coded_error; + stats.sum_in_vectors += mb_stat.sum_in_vectors; + stats.sum_mvc += mb_stat.sum_mvc; + stats.sum_mvc_abs += mb_stat.sum_mvc_abs; + stats.sum_mvcs += mb_stat.sum_mvcs; + stats.sum_mvr += mb_stat.sum_mvr; + stats.sum_mvr_abs += mb_stat.sum_mvr_abs; + stats.sum_mvrs += mb_stat.sum_mvrs; + } + } + return stats; +} + +static void setup_firstpass_data(AV1_COMMON *const cm, + FirstPassData *firstpass_data, + const int unit_rows, const int unit_cols) { + CHECK_MEM_ERROR(cm, firstpass_data->raw_motion_err_list, + aom_calloc(unit_rows * unit_cols, + sizeof(*firstpass_data->raw_motion_err_list))); + CHECK_MEM_ERROR( + cm, firstpass_data->mb_stats, + aom_calloc(unit_rows * unit_cols, sizeof(*firstpass_data->mb_stats))); + for (int j = 0; j < unit_rows; j++) { + for (int i = 0; i < unit_cols; i++) { + firstpass_data->mb_stats[j * unit_cols + i].image_data_start_row = + INVALID_ROW; + } + } +} + +void av1_free_firstpass_data(FirstPassData *firstpass_data) { + aom_free(firstpass_data->raw_motion_err_list); + firstpass_data->raw_motion_err_list = NULL; + aom_free(firstpass_data->mb_stats); + firstpass_data->mb_stats = NULL; +} + +int av1_get_unit_rows_in_tile(const TileInfo *tile, + const BLOCK_SIZE fp_block_size) { + const int unit_height_log2 = mi_size_high_log2[fp_block_size]; + const int mi_rows = tile->mi_row_end - tile->mi_row_start; + const int unit_rows = CEIL_POWER_OF_TWO(mi_rows, unit_height_log2); + + return unit_rows; +} + +int av1_get_unit_cols_in_tile(const TileInfo *tile, + const BLOCK_SIZE fp_block_size) { + const int unit_width_log2 = mi_size_wide_log2[fp_block_size]; + const int mi_cols = tile->mi_col_end - 
tile->mi_col_start; + const int unit_cols = CEIL_POWER_OF_TWO(mi_cols, unit_width_log2); + + return unit_cols; +} + +#define FIRST_PASS_ALT_REF_DISTANCE 16 +static void first_pass_tile(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, + const BLOCK_SIZE fp_block_size) { + TileInfo *tile = &tile_data->tile_info; + const int unit_height = mi_size_high[fp_block_size]; + const int unit_height_log2 = mi_size_high_log2[fp_block_size]; + for (int mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; + mi_row += unit_height) { + av1_first_pass_row(cpi, td, tile_data, mi_row >> unit_height_log2, + fp_block_size); + } +} + +static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + av1_alloc_src_diff_buf(cm, &cpi->td.mb); + for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size); + } + } +} + +void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, + const int unit_row, const BLOCK_SIZE fp_block_size) { + MACROBLOCK *const x = &td->mb; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = cm->seq_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + TileInfo *tile = &tile_data->tile_info; + const int qindex = find_fp_qindex(seq_params->bit_depth); + const int fp_block_size_width = block_size_high[fp_block_size]; + const int fp_block_size_height = block_size_wide[fp_block_size]; + const int unit_width = mi_size_wide[fp_block_size]; + const int unit_width_log2 = mi_size_wide_log2[fp_block_size]; + const int unit_height_log2 = mi_size_high_log2[fp_block_size]; + const int unit_cols = mi_params->mb_cols * 4 / unit_width; + int raw_motion_err_counts = 0; + int unit_row_in_tile = unit_row - (tile->mi_row_start >> unit_height_log2); + int unit_col_start = tile->mi_col_start >> unit_width_log2; + int unit_cols_in_tile = av1_get_unit_cols_in_tile(tile, fp_block_size); + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync; + + const YV12_BUFFER_CONFIG *last_frame = + av1_get_scaled_ref_frame(cpi, LAST_FRAME); + if (!last_frame) { + last_frame = get_ref_frame_yv12_buf(cm, LAST_FRAME); + } + const YV12_BUFFER_CONFIG *golden_frame = + av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + if (!golden_frame) { + golden_frame = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + } + YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf; + + PICK_MODE_CONTEXT *ctx = td->firstpass_ctx; + FRAME_STATS *mb_stats = + cpi->firstpass_data.mb_stats + unit_row * unit_cols + unit_col_start; + int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list + + unit_row * unit_cols + unit_col_start; + MV *first_top_mv = &tile_data->firstpass_top_mv; + + for (int i = 0; i < num_planes; ++i) { + x->plane[i].coeff = ctx->coeff[i]; + x->plane[i].qcoeff = ctx->qcoeff[i]; + x->plane[i].eobs = ctx->eobs[i]; + x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + x->plane[i].dqcoeff = ctx->dqcoeff[i]; + } + + const int src_y_stride = cpi->source->y_stride; + const int recon_y_stride = 
this_frame->y_stride; + const int recon_uv_stride = this_frame->uv_stride; + const int uv_mb_height = + fp_block_size_height >> (this_frame->y_height > this_frame->uv_height); + + MV best_ref_mv = kZeroMv; + MV last_mv; + + // Reset above block coeffs. + xd->up_available = (unit_row_in_tile != 0); + int recon_yoffset = (unit_row * recon_y_stride * fp_block_size_height) + + (unit_col_start * fp_block_size_width); + int src_yoffset = (unit_row * src_y_stride * fp_block_size_height) + + (unit_col_start * fp_block_size_width); + int recon_uvoffset = (unit_row * recon_uv_stride * uv_mb_height) + + (unit_col_start * uv_mb_height); + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + av1_set_mv_row_limits( + mi_params, &x->mv_limits, (unit_row << unit_height_log2), + (fp_block_size_height >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels); + + av1_setup_src_planes(x, cpi->source, unit_row << unit_height_log2, + tile->mi_col_start, num_planes, fp_block_size); + + // Fix - zero the 16x16 block first. This ensures correct this_intra_error for + // block sizes smaller than 16x16. + av1_zero_array(x->plane[0].src_diff, 256); + + for (int unit_col_in_tile = 0; unit_col_in_tile < unit_cols_in_tile; + unit_col_in_tile++) { + const int unit_col = unit_col_start + unit_col_in_tile; + + enc_row_mt->sync_read_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile); + +#if CONFIG_MULTITHREAD + if (cpi->ppi->p_mt_info.num_workers > 1) { + pthread_mutex_lock(enc_row_mt->mutex_); + bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit; + pthread_mutex_unlock(enc_row_mt->mutex_); + // Exit in case any worker has encountered an error. + if (firstpass_mt_exit) return; + } +#endif + + if (unit_col_in_tile == 0) { + last_mv = *first_top_mv; + } + int this_intra_error = firstpass_intra_prediction( + cpi, td, this_frame, tile, unit_row, unit_col, recon_yoffset, + recon_uvoffset, fp_block_size, qindex, mb_stats); + + if (!frame_is_intra_only(cm)) { + const int this_inter_error = firstpass_inter_prediction( + cpi, td, last_frame, golden_frame, unit_row, unit_col, recon_yoffset, + recon_uvoffset, src_yoffset, fp_block_size, this_intra_error, + raw_motion_err_counts, raw_motion_err_list, best_ref_mv, &best_ref_mv, + &last_mv, mb_stats); + if (unit_col_in_tile == 0) { + *first_top_mv = last_mv; + } + mb_stats->coded_error += this_inter_error; + ++raw_motion_err_counts; + } else { + mb_stats->sr_coded_error += this_intra_error; + mb_stats->coded_error += this_intra_error; + } + + // Adjust to the next column of MBs. 
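+    // Per-column advance below (illustrative for 16x16 units and 4:2:0 input):
+    // the luma source pointer and the recon/src y offsets step by 16 each
+    // iteration, while the chroma pointers and recon_uvoffset step by
+    // uv_mb_height = 8; the per-row stride terms were already folded into the
+    // starting offsets above.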
+ x->plane[0].src.buf += fp_block_size_width; + if (num_planes > 1) { + x->plane[1].src.buf += uv_mb_height; + x->plane[2].src.buf += uv_mb_height; + } + + recon_yoffset += fp_block_size_width; + src_yoffset += fp_block_size_width; + recon_uvoffset += uv_mb_height; + mb_stats++; + + enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile, + unit_cols_in_tile); + } +} + +void av1_noop_first_pass_frame(AV1_COMP *cpi, const int64_t ts_duration) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + int max_mb_rows = mi_params->mb_rows; + int max_mb_cols = mi_params->mb_cols; + if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) { + int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width); + max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2); + } + if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) { + int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height); + max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2); + } + const int unit_rows = get_unit_rows(BLOCK_16X16, max_mb_rows); + const int unit_cols = get_unit_cols(BLOCK_16X16, max_mb_cols); + setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols); + FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats; + FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols); + av1_free_firstpass_data(&cpi->firstpass_data); + update_firstpass_stats(cpi, &stats, 1.0, current_frame->frame_number, + ts_duration, BLOCK_16X16); +} + +void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { + MACROBLOCK *const x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CurrentFrame *const current_frame = &cm->current_frame; + const SequenceHeader *const seq_params = cm->seq_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const int qindex = find_fp_qindex(seq_params->bit_depth); + const int ref_frame_flags_backup = cpi->ref_frame_flags; + cpi->ref_frame_flags = av1_ref_frame_flag_list[LAST_FRAME] | + av1_ref_frame_flag_list[GOLDEN_FRAME]; + + // Detect if the key frame is screen content type. + if (frame_is_intra_only(cm)) { + FeatureFlags *const features = &cm->features; + assert(cpi->source != NULL); + xd->cur_buf = cpi->source; + av1_set_screen_content_options(cpi, features); + } + + // Prepare the speed features + av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); + + // Unit size for the first pass encoding. + const BLOCK_SIZE fp_block_size = + get_fp_block_size(cpi->is_screen_content_type); + + int max_mb_rows = mi_params->mb_rows; + int max_mb_cols = mi_params->mb_cols; + if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) { + int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width); + max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2); + } + if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) { + int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height); + max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2); + } + + // Number of rows in the unit size. + // Note max_mb_rows and max_mb_cols are in the unit of 16x16. + const int unit_rows = get_unit_rows(fp_block_size, max_mb_rows); + const int unit_cols = get_unit_cols(fp_block_size, max_mb_cols); + + // Set fp_block_size, for the convenience of multi-thread usage. 
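+  // Example of the unit accounting above (assuming a 1920-pixel forced maximum
+  // width): size_in_mi(1920) = 480 4x4 mode-info units, and
+  // ROUND_POWER_OF_TWO(480, 2) = 120 16x16 macroblocks; get_unit_rows/cols
+  // then convert those counts to the chosen first-pass unit, which is
+  // BLOCK_8X8 for screen content and BLOCK_16X16 otherwise.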
+ cpi->fp_block_size = fp_block_size; + + setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols); + int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list; + FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats; + + // multi threading info + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + if (cpi->allocated_tiles < tile_cols * tile_rows) { + av1_alloc_tile_data(cpi); + } + + av1_init_tile_data(cpi); + + const YV12_BUFFER_CONFIG *last_frame = NULL; + const YV12_BUFFER_CONFIG *golden_frame = NULL; + if (!frame_is_intra_only(cm)) { + av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0); + last_frame = av1_is_scaled(get_ref_scale_factors_const(cm, LAST_FRAME)) + ? av1_get_scaled_ref_frame(cpi, LAST_FRAME) + : get_ref_frame_yv12_buf(cm, LAST_FRAME); + golden_frame = av1_is_scaled(get_ref_scale_factors_const(cm, GOLDEN_FRAME)) + ? av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME) + : get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + } + + YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf; + // First pass code requires valid last and new frame buffers. + assert(this_frame != NULL); + assert(frame_is_intra_only(cm) || (last_frame != NULL)); + + av1_setup_frame_size(cpi); + av1_set_mv_search_params(cpi); + + set_mi_offsets(mi_params, xd, 0, 0); + xd->mi[0]->bsize = fp_block_size; + + // Do not use periodic key frames. + cpi->rc.frames_to_key = INT_MAX; + + av1_set_quantizer( + cm, cpi->oxcf.q_cfg.qm_minlevel, cpi->oxcf.q_cfg.qm_maxlevel, qindex, + cpi->oxcf.q_cfg.enable_chroma_deltaq, cpi->oxcf.q_cfg.enable_hdr_deltaq); + + av1_setup_block_planes(xd, seq_params->subsampling_x, + seq_params->subsampling_y, num_planes); + + av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size); + av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0, + num_planes); + + if (!frame_is_intra_only(cm)) { + av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes); + } + + set_mi_offsets(mi_params, xd, 0, 0); + + // Don't store luma on the fist pass since chroma is not computed + xd->cfl.store_y = 0; + av1_frame_init_quantizer(cpi); + + av1_default_coef_probs(cm); + av1_init_mode_probs(cm->fc); + av1_init_mv_probs(cm); + av1_initialize_rd_consts(cpi); + + enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy; + enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy; + + if (mt_info->num_workers > 1) { + enc_row_mt->sync_read_ptr = av1_row_mt_sync_read; + enc_row_mt->sync_write_ptr = av1_row_mt_sync_write; + av1_fp_encode_tiles_row_mt(cpi); + } else { + first_pass_tiles(cpi, fp_block_size); + } + + FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols); + int total_raw_motion_err_count = + frame_is_intra_only(cm) ? 0 : unit_rows * unit_cols; + const double raw_err_stdev = + raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count); + av1_free_firstpass_data(&cpi->firstpass_data); + av1_dealloc_src_diff_buf(&cpi->td.mb, av1_num_planes(cm)); + + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. 
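+  // Illustrative case for the clamp and correction below (numbers assumed):
+  // if the top 6 unit rows of a letterboxed clip are pure black bars,
+  // image_data_start_row ends up as 6 and 6 * unit_cols * 2 is removed from
+  // intra_skip_count, so the bars do not inflate the flat-content statistics;
+  // a frame with no detected image data clamps to unit_rows / 2.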
+ if ((stats.image_data_start_row > unit_rows / 2) || + (stats.image_data_start_row == INVALID_ROW)) { + stats.image_data_start_row = unit_rows / 2; + } + // Exclude any image dead zone + if (stats.image_data_start_row > 0) { + stats.intra_skip_count = + AOMMAX(0, stats.intra_skip_count - + (stats.image_data_start_row * unit_cols * 2)); + } + + TWO_PASS *twopass = &cpi->ppi->twopass; + const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : mi_params->MBs; + // Number of actual units used in the first pass, it can be other square + // block sizes than 16X16. + const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16); + stats.intra_factor = stats.intra_factor / (double)num_mbs; + stats.brightness_factor = stats.brightness_factor / (double)num_mbs; + FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; + update_firstpass_stats(cpi, &stats, raw_err_stdev, + current_frame->frame_number, ts_duration, + fp_block_size); + + // Copy the previous Last Frame back into gf buffer if the prediction is good + // enough... but also don't allow it to lag too far. + if ((twopass->sr_update_lag > 3) || + ((current_frame->frame_number > 0) && + (this_frame_stats->pcnt_inter > 0.20) && + ((this_frame_stats->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame_stats->coded_error)) > 2.0))) { + if (golden_frame != NULL) { + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); + } + twopass->sr_update_lag = 1; + } else { + ++twopass->sr_update_lag; + } + + aom_extend_frame_borders(this_frame, num_planes); + + // The frame we just compressed now becomes the last frame. + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame); + + // Special case for the first frame. Copy into the GF buffer as a second + // reference. 
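+  // Reading of the golden-buffer rule just above (thresholds as in the code):
+  // the last frame is copied into the golden slot either when the update has
+  // lagged more than 3 frames, or when inter prediction is clearly working
+  // (over 20% inter blocks and intra error more than twice the coded error);
+  // otherwise sr_update_lag keeps growing.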
+ if (current_frame->frame_number == 0 && + get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) { + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); + } + + print_reconstruction_frame(last_frame, current_frame->frame_number, + /*do_print=*/0); + + ++current_frame->frame_number; + cpi->ref_frame_flags = ref_frame_flags_backup; + if (!frame_is_intra_only(cm)) { + release_scaled_references(cpi); + } +} + +aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info, + FIRSTPASS_STATS *ext_stats_buf, + int ext_stats_buf_size) { + assert(IMPLIES(ext_stats_buf == NULL, ext_stats_buf_size == 0)); + if (ext_stats_buf == NULL) { + firstpass_info->stats_buf = firstpass_info->static_stats_buf; + firstpass_info->stats_buf_size = + sizeof(firstpass_info->static_stats_buf) / + sizeof(firstpass_info->static_stats_buf[0]); + firstpass_info->start_index = 0; + firstpass_info->cur_index = 0; + firstpass_info->stats_count = 0; + firstpass_info->future_stats_count = 0; + firstpass_info->past_stats_count = 0; + av1_zero(firstpass_info->total_stats); + if (ext_stats_buf_size == 0) { + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } else { + firstpass_info->stats_buf = ext_stats_buf; + firstpass_info->stats_buf_size = ext_stats_buf_size; + firstpass_info->start_index = 0; + firstpass_info->cur_index = 0; + firstpass_info->stats_count = firstpass_info->stats_buf_size; + firstpass_info->future_stats_count = firstpass_info->stats_count; + firstpass_info->past_stats_count = 0; + av1_zero(firstpass_info->total_stats); + for (int i = 0; i < firstpass_info->stats_count; ++i) { + av1_accumulate_stats(&firstpass_info->total_stats, + &firstpass_info->stats_buf[i]); + } + } + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_firstpass_info_move_cur_index( + FIRSTPASS_INFO *firstpass_info) { + assert(firstpass_info->future_stats_count + + firstpass_info->past_stats_count == + firstpass_info->stats_count); + if (firstpass_info->future_stats_count > 1) { + firstpass_info->cur_index = + (firstpass_info->cur_index + 1) % firstpass_info->stats_buf_size; + --firstpass_info->future_stats_count; + ++firstpass_info->past_stats_count; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } +} + +aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info) { + if (firstpass_info->stats_count > 0 && firstpass_info->past_stats_count > 0) { + const int next_start = + (firstpass_info->start_index + 1) % firstpass_info->stats_buf_size; + firstpass_info->start_index = next_start; + --firstpass_info->stats_count; + --firstpass_info->past_stats_count; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } +} + +aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop( + FIRSTPASS_INFO *firstpass_info) { + aom_codec_err_t ret = av1_firstpass_info_move_cur_index(firstpass_info); + if (ret != AOM_CODEC_OK) return ret; + ret = av1_firstpass_info_pop(firstpass_info); + return ret; +} + +aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info, + const FIRSTPASS_STATS *input_stats) { + if (firstpass_info->stats_count < firstpass_info->stats_buf_size) { + const int next_index = + (firstpass_info->start_index + firstpass_info->stats_count) % + firstpass_info->stats_buf_size; + firstpass_info->stats_buf[next_index] = *input_stats; + ++firstpass_info->stats_count; + ++firstpass_info->future_stats_count; + av1_accumulate_stats(&firstpass_info->total_stats, input_stats); + return 
AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } +} + +const FIRSTPASS_STATS *av1_firstpass_info_peek( + const FIRSTPASS_INFO *firstpass_info, int offset_from_cur) { + if (offset_from_cur >= -firstpass_info->past_stats_count && + offset_from_cur < firstpass_info->future_stats_count) { + const int index = (firstpass_info->cur_index + offset_from_cur) % + firstpass_info->stats_buf_size; + return &firstpass_info->stats_buf[index]; + } else { + return NULL; + } +} + +int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info, + int offset_from_cur) { + if (offset_from_cur < firstpass_info->future_stats_count) { + return firstpass_info->future_stats_count - offset_from_cur; + } + return 0; +} + +int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info, + int offset_from_cur) { + if (offset_from_cur >= -firstpass_info->past_stats_count) { + return offset_from_cur + firstpass_info->past_stats_count; + } + return 0; +} diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h new file mode 100644 index 0000000000..d01363a80e --- /dev/null +++ b/third_party/aom/av1/encoder/firstpass.h @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_ +#define AOM_AV1_ENCODER_FIRSTPASS_H_ + +#include + +#include "av1/common/av1_common_int.h" +#include "av1/common/enums.h" +#include "av1/encoder/lookahead.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) + +#define MIN_ZERO_MOTION 0.95 +#define MAX_SR_CODED_ERROR 40 +#define MAX_RAW_ERR_VAR 2000 +#define MIN_MV_IN_OUT 0.4 + +#define VLOW_MOTION_THRESHOLD 950 +struct ThreadData; + +/*! + * \brief The stucture of acummulated frame stats in the first pass. + * + * Errors (coded_error, intra_error, etc.) and counters (new_mv_count) are + * normalized to each MB. MV related stats (MVc, MVr, etc.) are normalized to + * the frame width and height. See function normalize_firstpass_stats. + */ +typedef struct FIRSTPASS_STATS { + /*! + * Frame number in display order, if stats are for a single frame. + * No real meaning for a collection of frames. + */ + double frame; + /*! + * Weight assigned to this frame (or total weight for the collection of + * frames) currently based on intra factor and brightness factor. This is used + * to distribute bits betweeen easier and harder frames. + */ + double weight; + /*! + * Intra prediction error. + */ + double intra_error; + /*! + * Average wavelet energy computed using Discrete Wavelet Transform (DWT). + */ + double frame_avg_wavelet_energy; + /*! + * Best of intra pred error and inter pred error using last frame as ref. + */ + double coded_error; + /*! + * Best of intra pred error and inter pred error using golden frame as ref. + */ + double sr_coded_error; + /*! + * Percentage of blocks with inter pred error < intra pred error. + */ + double pcnt_inter; + /*! 
+ * Percentage of blocks using (inter prediction and) non-zero motion vectors. + */ + double pcnt_motion; + /*! + * Percentage of blocks where golden frame was better than last or intra: + * inter pred error using golden frame < inter pred error using last frame and + * inter pred error using golden frame < intra pred error + */ + double pcnt_second_ref; + /*! + * Percentage of blocks where intra and inter prediction errors were very + * close. Note that this is a 'weighted count', that is, the so blocks may be + * weighted by how close the two errors were. + */ + double pcnt_neutral; + /*! + * Percentage of blocks that have almost no intra error residual + * (i.e. are in effect completely flat and untextured in the intra + * domain). In natural videos this is uncommon, but it is much more + * common in animations, graphics and screen content, so may be used + * as a signal to detect these types of content. + */ + double intra_skip_pct; + /*! + * Image mask rows top and bottom. + */ + double inactive_zone_rows; + /*! + * Image mask columns at left and right edges. + */ + double inactive_zone_cols; + /*! + * Average of row motion vectors. + */ + double MVr; + /*! + * Mean of absolute value of row motion vectors. + */ + double mvr_abs; + /*! + * Mean of column motion vectors. + */ + double MVc; + /*! + * Mean of absolute value of column motion vectors. + */ + double mvc_abs; + /*! + * Variance of row motion vectors. + */ + double MVrv; + /*! + * Variance of column motion vectors. + */ + double MVcv; + /*! + * Value in range [-1,1] indicating fraction of row and column motion vectors + * that point inwards (negative MV value) or outwards (positive MV value). + * For example, value of 1 indicates, all row/column MVs are inwards. + */ + double mv_in_out_count; + /*! + * Count of unique non-zero motion vectors. + */ + double new_mv_count; + /*! + * Duration of the frame / collection of frames. + */ + double duration; + /*! + * 1.0 if stats are for a single frame, OR + * Number of frames in this collection for which the stats are accumulated. + */ + double count; + /*! + * standard deviation for (0, 0) motion prediction error + */ + double raw_error_stdev; + /*! + * Whether the frame contains a flash + */ + int64_t is_flash; + /*! + * Estimated noise variance + */ + double noise_var; + /*! + * Correlation coefficient with the previous frame + */ + double cor_coeff; + /*! + * log of intra_error + */ + double log_intra_error; + /*! + * log of coded_error + */ + double log_coded_error; +} FIRSTPASS_STATS; + +// We want to keep one past stats for key frame detection +// in test_candidate_kf() +#define FIRSTPASS_INFO_STATS_PAST_MIN 1 + +// The size of static buffer used in FIRSTPASS_INFO. +#define FIRSTPASS_INFO_STATIC_BUF_SIZE \ + (MAX_LAP_BUFFERS + FIRSTPASS_INFO_STATS_PAST_MIN) + +/*! + * \brief Data structure used for managing first pass stats + */ +typedef struct { + /*! + * A static buffer that will be used when no ext_stats_buf is assigned. The + * ext_stats_buf is assigned through av1_firstpass_info_init() when the user + * already has a pre-existing firstpass stats that is stored in an external + * buffer. The ext_stats_buf is usually used in two pass mode. When using one + * pass mode, we generate "firstpass" stats and encode the video in the same + * pass. In this scenario, the stats will be pushed and popped from + * static_stats_buf. + */ + FIRSTPASS_STATS static_stats_buf[FIRSTPASS_INFO_STATIC_BUF_SIZE]; + /*! + * A pointer to first pass stats. 
+ * Note that this buffer will be used as ring buffer. + */ + FIRSTPASS_STATS *stats_buf; + /*! + * size of stats_buf + */ + int stats_buf_size; + /*! + * start index of the available frame stats + * Note that start_index doesn't always point to + * current frame's stats because we need to + * keep past stats as well. To access current + * frame's stats, please use cur_index. + */ + int start_index; + + /*! + * count available stats stored in stats_buf + * the following condition should stay true + * stats_count = future_stats_count + past_stats_count + */ + int stats_count; + + /*! + * index of the current frame's stats + */ + int cur_index; + + /*! + * count available future stats including current stats + */ + int future_stats_count; + + /*! + * count available past stats EXCLUDING current stats + */ + int past_stats_count; + + /*! + * Accumulation of the stats being pushed into firstpass_info + */ + FIRSTPASS_STATS total_stats; +} FIRSTPASS_INFO; + +/*!\brief Init firstpass_info + * + * If using ext_stats_buf, the buffer needs to stay available during encoding + * process. + * + * \ingroup rate_control + * \param[out] firstpass_info struct of firstpass_info. + * \param[in] ext_stats_buf external stats buffer. Pass in NULL if + * choose to use internal static_stats_buf. + * \param[in] ext_stats_buf_size external stats buffer size. Pass in 0 if + * choose to use internal static_stats_buf. \return status + */ +aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info, + FIRSTPASS_STATS *ext_stats_buf, + int ext_stats_buf_size); + +/*!\brief Move cur_index by 1 + * + * \ingroup rate_control + * \param[out] firstpass_info struct of firstpass_info. + * \return status + */ +aom_codec_err_t av1_firstpass_info_move_cur_index( + FIRSTPASS_INFO *firstpass_info); + +/*!\brief Pop a stats from firstpass_info + * + * \ingroup rate_control + * \param[out] firstpass_info struct of firstpass_info. + * \return status + */ +aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info); + +/*!\brief Move cur_index by 1 and pop a stats from firstpass_info + * + * \ingroup rate_control + * \param[out] firstpass_info struct of firstpass_info. + * \return status + */ +aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop( + FIRSTPASS_INFO *firstpass_info); + +/*!\brief Push a stats into firstpass_info + * + * Note that the input stats will be copied into firstpass_info. + * \ingroup rate_control + * \param[out] firstpass_info struct of firstpass_info. + * \param[in] input_stats input stats + * \return status + */ +aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info, + const FIRSTPASS_STATS *input_stats); + +/*!\brief Peek at a stats from firstpass_info + * + * The target index is as follows. + * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size + * + * \ingroup rate_control + * \param[in] firstpass_info struct of firstpass_info. + * \param[in] offset_from_cur index offset from cur_index. + * \return pointer to the stats. The pointer will be NULL if + * stats_index_offset is invalid. + */ +const FIRSTPASS_STATS *av1_firstpass_info_peek( + const FIRSTPASS_INFO *firstpass_info, int offset_from_cur); + +/*!\brief Count the future stats from the target in firstpass_info + * Note that the target stats will be counted as well. + * The target index is as follows. + * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size + * + * \ingroup rate_control + * \param[in] firstpass_info struct of firstpass_info. 
+ * \param[in] offset_from_cur target stats's inffset + * from cur_index. + * \return Number of stats in the future after the target stats + * including itself. + */ +int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info, + int offset_from_cur); + +/*!\brief Count the past stats before the target in firstpass_info + * Note that the target stats will NOT be counted. + * The target index is as follows. + * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size + * + * \ingroup rate_control + * \param[in] firstpass_info struct of firstpass_info. + * \param[in] offset_from_cur target stats's index offset + * from cur_index. + * \return Number of stats in the past before the target stats + * excluding itself. + */ +int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info, + int offset_from_cur); + +/*!\cond */ +#define FC_ANIMATION_THRESH 0.15 +enum { + FC_NORMAL = 0, + FC_GRAPHICS_ANIMATION = 1, + FRAME_CONTENT_TYPES = 2 +} UENUM1BYTE(FRAME_CONTENT_TYPE); +/*!\endcond */ + +/*! + * \brief Data related to the current GF/ARF group and the + * individual frames within the group + */ +typedef struct GF_GROUP { + /*!\cond */ + // Frame update type, e.g. ARF/GF/LF/Overlay + FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH]; + unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH]; + // The number of frames displayed so far within the GOP at a given coding + // frame. + unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH]; + int layer_depth[MAX_STATIC_GF_GROUP_LENGTH]; + int arf_boost[MAX_STATIC_GF_GROUP_LENGTH]; + int max_layer_depth; + int max_layer_depth_allowed; + // This is currently only populated for AOM_Q mode + int q_val[MAX_STATIC_GF_GROUP_LENGTH]; + int rdmult_val[MAX_STATIC_GF_GROUP_LENGTH]; + int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH]; + // The frame coding type - inter/intra frame + FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH]; + // The reference frame buffer control - update or reset + REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH]; + int arf_index; // the index in the gf group of ARF, if no arf, then -1 + int size; // The total length of a GOP + + // The offset into lookahead_ctx for choosing + // source of frame parallel encodes. + int src_offset[MAX_STATIC_GF_GROUP_LENGTH]; + // Stores the display order hint of each frame in the current GF_GROUP. + int display_idx[MAX_STATIC_GF_GROUP_LENGTH]; + + // The reference frame list maps the reference frame indexes to its + // buffer index in the decoded buffer. A value of -1 means the + // corresponding reference frame index doesn't point towards any + // previously decoded frame. + int8_t ref_frame_list[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES]; + // Update frame index + int update_ref_idx[MAX_STATIC_GF_GROUP_LENGTH]; + // The map_idx of primary reference + int primary_ref_idx[MAX_STATIC_GF_GROUP_LENGTH]; + + // Indicates the level of parallelism in frame parallel encodes. + // 0 : frame is independently encoded (not part of parallel encodes). + // 1 : frame is the first in encode order in a given parallel encode set. + // 2 : frame occurs later in encode order in a given parallel encode set. + int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH]; + // Indicates whether a frame should act as non-reference frame. + bool is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH]; + // Indicates whether a frame is dropped. + bool is_frame_dropped[MAX_STATIC_GF_GROUP_LENGTH]; + + // Stores the display order hint of the frames not to be + // refreshed by the current frame. 
+ int skip_frame_refresh[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES]; + // Stores the display order hint of the frame to be excluded during reference + // assignment. + int skip_frame_as_ref[MAX_STATIC_GF_GROUP_LENGTH]; + /*!\endcond */ +} GF_GROUP; +/*!\cond */ + +typedef struct { + // Track if the last frame in a GOP has higher quality. + int arf_gf_boost_lst; +} GF_STATE; + +typedef struct { + FIRSTPASS_STATS *stats_in_start; + FIRSTPASS_STATS *stats_in_end; + FIRSTPASS_STATS *stats_in_buf_end; + FIRSTPASS_STATS *total_stats; + FIRSTPASS_STATS *total_left_stats; +} STATS_BUFFER_CTX; + +/*!\endcond */ + +/*! + * \brief Two pass status and control data. + */ +typedef struct { + /*!\cond */ + unsigned int section_intra_rating; + // Circular queue of first pass stats stored for most recent frames. + // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored + // here. + FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1]; + int frame_stats_next_idx; // Index to next unused element in frame_stats_arr. + STATS_BUFFER_CTX *stats_buf_ctx; + FIRSTPASS_INFO firstpass_info; // This is the first pass data structure + // intended to replace stats_in + int first_pass_done; + int64_t bits_left; + double modified_error_min; + double modified_error_max; + double modified_error_left; + + // Projected total bits available for a key frame group of frames + int64_t kf_group_bits; + + // Error score of frames still to be coded in kf group + double kf_group_error_left; + + // Over time correction for bits per macro block estimation + double bpm_factor; + + // Record of target and actual bits spent in current ARF group + int rolling_arf_group_target_bits; + int rolling_arf_group_actual_bits; + + int sr_update_lag; + + int kf_zeromotion_pct; + int last_kfgroup_zeromotion_pct; + int extend_minq; + int extend_maxq; + /*!\endcond */ +} TWO_PASS; + +/*! + * \brief Frame level Two pass status and control data. + */ +typedef struct { + /*!\cond */ + const FIRSTPASS_STATS *stats_in; + // Pointer to the stats of the current frame. + const FIRSTPASS_STATS *this_frame; + double mb_av_energy; + // An indication of the content type of the current frame + FRAME_CONTENT_TYPE fr_content_type; + double frame_avg_haar_energy; + /*!\endcond */ +} TWO_PASS_FRAME; + +/*!\cond */ + +// This structure contains several key parameters to be accumulated for this +// frame. +typedef struct { + // Intra prediction error. + int64_t intra_error; + // Average wavelet energy computed using Discrete Wavelet Transform (DWT). + int64_t frame_avg_wavelet_energy; + // Best of intra pred error and inter pred error using last frame as ref. + int64_t coded_error; + // Best of intra pred error and inter pred error using golden frame as ref. + int64_t sr_coded_error; + // Count of motion vector. + int mv_count; + // Count of blocks that pick inter prediction (inter pred error is smaller + // than intra pred error). + int inter_count; + // Count of blocks that pick second ref (golden frame). + int second_ref_count; + // Count of blocks where the inter and intra are very close and very low. + double neutral_count; + // Count of blocks where intra error is very small. + int intra_skip_count; + // Start row. + int image_data_start_row; + // Count of unique non-zero motion vectors. + int new_mv_count; + // Sum of inward motion vectors. + int sum_in_vectors; + // Sum of motion vector row. + int sum_mvr; + // Sum of motion vector column. + int sum_mvc; + // Sum of absolute value of motion vector row. 
+ int sum_mvr_abs; + // Sum of absolute value of motion vector column. + int sum_mvc_abs; + // Sum of the square of motion vector row. + int64_t sum_mvrs; + // Sum of the square of motion vector column. + int64_t sum_mvcs; + // A factor calculated using intra pred error. + double intra_factor; + // A factor that measures brightness. + double brightness_factor; +} FRAME_STATS; + +// This structure contains first pass data. +typedef struct { + // Buffer holding frame stats for all MACROBLOCKs. + // mb_stats[i] stores the FRAME_STATS of the ith + // MB in raster scan order. + FRAME_STATS *mb_stats; + // Buffer to store the prediction error of the (0,0) motion + // vector using the last source frame as the reference. + // raw_motion_err_list[i] stores the raw_motion_err of + // the ith MB in raster scan order. + int *raw_motion_err_list; +} FirstPassData; + +struct AV1_COMP; +struct EncodeFrameParams; +struct AV1EncoderConfig; +struct TileDataEnc; + +static INLINE int is_fp_wavelet_energy_invalid( + const FIRSTPASS_STATS *fp_stats) { + assert(fp_stats != NULL); + return (fp_stats->frame_avg_wavelet_energy < 0); +} + +static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) { + return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16); +} + +int av1_get_unit_rows_in_tile(const TileInfo *tile, + const BLOCK_SIZE fp_block_size); +int av1_get_unit_cols_in_tile(const TileInfo *tile, + const BLOCK_SIZE fp_block_size); + +void av1_first_pass_row(struct AV1_COMP *cpi, struct ThreadData *td, + struct TileDataEnc *tile_data, const int mb_row, + const BLOCK_SIZE fp_block_size); +void av1_end_first_pass(struct AV1_COMP *cpi); + +void av1_free_firstpass_data(FirstPassData *firstpass_data); + +void av1_twopass_zero_stats(FIRSTPASS_STATS *section); +void av1_accumulate_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame); +/*!\endcond */ + +/*!\brief AV1 first pass encoding. + * + * \ingroup rate_control + * This function is the first encoding pass for the two pass encoding mode. + * It encodes the whole video and collect essential information. + * Two pass encoding is an encoding mode in the reference software (libaom) + * of AV1 for high performance encoding. The first pass is a fast encoding + * process to collect essential information to help the second pass make + * encoding decisions and improve coding quality. The collected stats is used + * in rate control, for example, to determine frame cut, the position of + * alternative reference frame (ARF), etc. + * + * \param[in] cpi Top-level encoder structure + * \param[in] ts_duration Duration of the frame / collection of frames + * + * \remark Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi" + * is modified to store information computed in this function. + */ +void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration); + +void av1_noop_first_pass_frame(struct AV1_COMP *cpi, const int64_t ts_duration); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_FIRSTPASS_H_ diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c new file mode 100644 index 0000000000..73910de121 --- /dev/null +++ b/third_party/aom/av1/encoder/global_motion.c @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "av1/encoder/global_motion.h" + +#include "av1/common/convolve.h" +#include "av1/common/warped_motion.h" + +#include "av1/encoder/segmentation.h" + +#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR) + +// Border over which to compute the global motion +#define ERRORADV_BORDER 0 + +/* clang-format off */ +// Error metric used for global motion evaluation. +// For 8-bit input, the pixel error used to index this table will always +// be between -255 and +255. But for 10- and 12-bit input, we use interpolation +// which means that we need to support indices of -256 and +256 as well. +// Therefore, the table is offset so that logical index 0 corresponds to +// error_measure_lut[256]. +const int error_measure_lut[513] = { + // pow 0.7 + 16384, 16384, 16339, 16294, 16249, 16204, 16158, 16113, + 16068, 16022, 15977, 15932, 15886, 15840, 15795, 15749, + 15703, 15657, 15612, 15566, 15520, 15474, 15427, 15381, + 15335, 15289, 15242, 15196, 15149, 15103, 15056, 15010, + 14963, 14916, 14869, 14822, 14775, 14728, 14681, 14634, + 14587, 14539, 14492, 14445, 14397, 14350, 14302, 14254, + 14206, 14159, 14111, 14063, 14015, 13967, 13918, 13870, + 13822, 13773, 13725, 13676, 13628, 13579, 13530, 13481, + 13432, 13383, 13334, 13285, 13236, 13187, 13137, 13088, + 13038, 12988, 12939, 12889, 12839, 12789, 12739, 12689, + 12639, 12588, 12538, 12487, 12437, 12386, 12335, 12285, + 12234, 12183, 12132, 12080, 12029, 11978, 11926, 11875, + 11823, 11771, 11719, 11667, 11615, 11563, 11511, 11458, + 11406, 11353, 11301, 11248, 11195, 11142, 11089, 11036, + 10982, 10929, 10875, 10822, 10768, 10714, 10660, 10606, + 10552, 10497, 10443, 10388, 10333, 10279, 10224, 10168, + 10113, 10058, 10002, 9947, 9891, 9835, 9779, 9723, + 9666, 9610, 9553, 9497, 9440, 9383, 9326, 9268, + 9211, 9153, 9095, 9037, 8979, 8921, 8862, 8804, + 8745, 8686, 8627, 8568, 8508, 8449, 8389, 8329, + 8269, 8208, 8148, 8087, 8026, 7965, 7903, 7842, + 7780, 7718, 7656, 7593, 7531, 7468, 7405, 7341, + 7278, 7214, 7150, 7086, 7021, 6956, 6891, 6826, + 6760, 6695, 6628, 6562, 6495, 6428, 6361, 6293, + 6225, 6157, 6089, 6020, 5950, 5881, 5811, 5741, + 5670, 5599, 5527, 5456, 5383, 5311, 5237, 5164, + 5090, 5015, 4941, 4865, 4789, 4713, 4636, 4558, + 4480, 4401, 4322, 4242, 4162, 4080, 3998, 3916, + 3832, 3748, 3663, 3577, 3490, 3402, 3314, 3224, + 3133, 3041, 2948, 2854, 2758, 2661, 2562, 2461, + 2359, 2255, 2148, 2040, 1929, 1815, 1698, 1577, + 1452, 1323, 1187, 1045, 894, 731, 550, 339, + 0, 339, 550, 731, 894, 1045, 1187, 1323, + 1452, 1577, 1698, 1815, 1929, 2040, 2148, 2255, + 2359, 2461, 2562, 2661, 2758, 2854, 2948, 3041, + 3133, 3224, 3314, 3402, 3490, 3577, 3663, 3748, + 3832, 3916, 3998, 4080, 4162, 4242, 4322, 4401, + 4480, 4558, 4636, 4713, 4789, 4865, 4941, 5015, + 5090, 5164, 5237, 5311, 5383, 5456, 5527, 5599, + 5670, 5741, 5811, 5881, 5950, 6020, 6089, 6157, + 6225, 6293, 6361, 6428, 6495, 6562, 6628, 6695, + 6760, 6826, 6891, 6956, 7021, 7086, 7150, 7214, + 7278, 7341, 7405, 7468, 7531, 7593, 7656, 7718, + 7780, 7842, 7903, 7965, 8026, 8087, 8148, 8208, + 8269, 8329, 8389, 8449, 8508, 8568, 8627, 
8686, + 8745, 8804, 8862, 8921, 8979, 9037, 9095, 9153, + 9211, 9268, 9326, 9383, 9440, 9497, 9553, 9610, + 9666, 9723, 9779, 9835, 9891, 9947, 10002, 10058, + 10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497, + 10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929, + 10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353, + 11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771, + 11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183, + 12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588, + 12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988, + 13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383, + 13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773, + 13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159, + 14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539, + 14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916, + 14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289, + 15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657, + 15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022, + 16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384, + 16384, +}; +/* clang-format on */ + +int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) { + return best_erroradvantage < erroradv_tr && + best_erroradvantage * params_cost < erroradv_prod_tr; +} + +static void convert_to_params(const double *params, int32_t *model) { + int i; + model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + + for (i = 2; i < 6; ++i) { + const int diag_value = ((i == 2 || i == 5) ? (1 << GM_ALPHA_PREC_BITS) : 0); + model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5); + model[i] = + (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX); + model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR; + } +} + +void av1_convert_model_to_params(const double *params, + WarpedMotionParams *model) { + convert_to_params(params, model->wmmat); + model->wmtype = get_wmtype(model); + model->invalid = 0; +} + +// Adds some offset to a global motion parameter and handles +// all of the necessary precision shifts, clamping, and +// zero-centering. +static int32_t add_param_offset(int param_index, int32_t param_value, + int32_t offset) { + const int scale_vals[2] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF }; + const int clamp_vals[2] = { GM_TRANS_MAX, GM_ALPHA_MAX }; + // type of param: 0 - translation, 1 - affine + const int param_type = (param_index < 2 ? 
0 : 1); + const int is_one_centered = (param_index == 2 || param_index == 5); + + // Make parameter zero-centered and offset the shift that was done to make + // it compatible with the warped model + param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >> + scale_vals[param_type]; + // Add desired offset to the rescaled/zero-centered parameter + param_value += offset; + // Clamp the parameter so it does not overflow the number of bits allotted + // to it in the bitstream + param_value = (int32_t)clamp(param_value, -clamp_vals[param_type], + clamp_vals[param_type]); + // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible + // with the warped motion library + param_value *= (1 << scale_vals[param_type]); + + // Undo the zero-centering step if necessary + return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS); +} + +static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { + switch (wmtype) { + case IDENTITY: + wm->wmmat[0] = 0; + wm->wmmat[1] = 0; + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS; + wm->wmmat[3] = 0; + AOM_FALLTHROUGH_INTENDED; + case ROTZOOM: + wm->wmmat[4] = -wm->wmmat[3]; + wm->wmmat[5] = wm->wmmat[2]; + AOM_FALLTHROUGH_INTENDED; + case AFFINE: break; + default: assert(0); + } + wm->wmtype = wmtype; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int generic_sad_highbd(const uint16_t *const ref, int ref_stride, + const uint16_t *const dst, int dst_stride, + int p_width, int p_height) { + // This function should only be called for patches smaller than + // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels + // small enough that we don't need a 64-bit accumulator + assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK); + + int sad = 0; + for (int i = 0; i < p_height; ++i) { + for (int j = 0; j < p_width; ++j) { + sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]); + } + } + return sad; +} + +#if WARP_ERROR_BLOCK != 32 +#error "Need to change SAD call size in highbd_segmented_frame_error" +#endif // WARP_ERROR_BLOCK != 32 +static int64_t highbd_segmented_frame_error( + const uint16_t *const ref, int ref_stride, const uint16_t *const dst, + int dst_stride, int p_width, int p_height, int bd, uint8_t *segment_map, + int segment_map_stride) { + (void)bd; + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + + if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) { + sum_error += aom_highbd_sad32x32( + CONVERT_TO_BYTEPTR(ref + j + i * ref_stride), ref_stride, + CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride); + } else { + sum_error += generic_sad_highbd(ref + j + i * ref_stride, ref_stride, + dst + j + i * dst_stride, dst_stride, + patch_w, patch_h); + } + } + } + return sum_error; +} + +#if WARP_ERROR_BLOCK != 32 +#error "Need to change SAD call size in highbd_warp_error" +#endif // 
WARP_ERROR_BLOCK != 32 +static int64_t highbd_warp_error(WarpedMotionParams *wm, + const uint16_t *const ref, int ref_width, + int ref_height, int ref_stride, + const uint16_t *const dst, int dst_stride, + int p_col, int p_row, int p_width, + int p_height, int subsampling_x, + int subsampling_y, int bd, int64_t best_error, + uint8_t *segment_map, int segment_map_stride) { + int64_t gm_sumerr = 0; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + DECLARE_ALIGNED(32, uint16_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); + + ConvolveParams conv_params = get_conv_params(0, 0, bd); + conv_params.use_dist_wtd_comp_avg = 0; + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + const int warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j); + const int warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i); + highbd_warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, + warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x, + subsampling_y, bd, &conv_params); + + if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) { + gm_sumerr += aom_highbd_sad32x32( + CONVERT_TO_BYTEPTR(tmp), WARP_ERROR_BLOCK, + CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride); + } else { + gm_sumerr += + generic_sad_highbd(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride, + dst_stride, warp_w, warp_h); + } + + if (gm_sumerr > best_error) return INT64_MAX; + } + } + return gm_sumerr; +} +#endif + +static INLINE int generic_sad(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int dst_stride, + int p_width, int p_height) { + // This function should only be called for patches smaller than + // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. 
This keeps the number of pixels + // small enough that we don't need a 64-bit accumulator + assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK); + + int sad = 0; + for (int i = 0; i < p_height; ++i) { + for (int j = 0; j < p_width; ++j) { + sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]); + } + } + return sad; +} + +#if WARP_ERROR_BLOCK != 32 +#error "Need to change SAD call size in segmented_warp_error" +#endif // WARP_ERROR_BLOCK != 32 +static int64_t segmented_frame_error(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int dst_stride, + int p_width, int p_height, + uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + + if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) { + sum_error += aom_sad32x32(ref + j + i * ref_stride, ref_stride, + dst + j + i * dst_stride, dst_stride); + } else { + sum_error += + generic_sad(ref + j + i * ref_stride, ref_stride, + dst + j + i * dst_stride, dst_stride, patch_w, patch_h); + } + } + } + return sum_error; +} + +#if WARP_ERROR_BLOCK != 32 +#error "Need to change SAD call size in warp_error" +#endif // WARP_ERROR_BLOCK != 32 +static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, + int ref_width, int ref_height, int ref_stride, + const uint8_t *const dst, int dst_stride, int p_col, + int p_row, int p_width, int p_height, + int subsampling_x, int subsampling_y, + int64_t best_error, uint8_t *segment_map, + int segment_map_stride) { + int64_t gm_sumerr = 0; + int warp_w, warp_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); + ConvolveParams conv_params = get_conv_params(0, 0, 8); + conv_params.use_dist_wtd_comp_avg = 0; + + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j); + warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i); + warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, warp_w, + warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y, + &conv_params); + + if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) { + gm_sumerr += aom_sad32x32(tmp, WARP_ERROR_BLOCK, + dst + j + i * dst_stride, dst_stride); + } else { + gm_sumerr += + generic_sad(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride, + dst_stride, warp_w, warp_h); + 
} + + if (gm_sumerr > best_error) return INT64_MAX; + } + } + return gm_sumerr; +} + +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + int p_width, int p_height, + uint8_t *segment_map, + int segment_map_stride) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + return highbd_segmented_frame_error( + CONVERT_TO_SHORTPTR(ref), ref_stride, CONVERT_TO_SHORTPTR(dst), + dst_stride, p_width, p_height, bd, segment_map, segment_map_stride); + } +#endif + (void)use_hbd; + (void)bd; + return segmented_frame_error(ref, ref_stride, dst, dst_stride, p_width, + p_height, segment_map, segment_map_stride); +} + +int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int ref_width, int ref_height, + int ref_stride, uint8_t *dst, int dst_stride, int p_col, + int p_row, int p_width, int p_height, int subsampling_x, + int subsampling_y, int64_t best_error, + uint8_t *segment_map, int segment_map_stride) { + if (!av1_get_shear_params(wm)) return INT64_MAX; +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) + return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), ref_width, + ref_height, ref_stride, CONVERT_TO_SHORTPTR(dst), + dst_stride, p_col, p_row, p_width, p_height, + subsampling_x, subsampling_y, bd, best_error, + segment_map, segment_map_stride); +#endif + (void)use_hbd; + (void)bd; + return warp_error(wm, ref, ref_width, ref_height, ref_stride, dst, dst_stride, + p_col, p_row, p_width, p_height, subsampling_x, + subsampling_y, best_error, segment_map, segment_map_stride); +} + +int64_t av1_refine_integerized_param( + WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd, + uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, + int d_width, int d_height, int d_stride, int n_refinements, + int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride) { + static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; + const int border = ERRORADV_BORDER; + int i = 0, p; + int n_params = max_trans_model_params[wmtype]; + int32_t *param_mat = wm->wmmat; + int64_t step_error, best_error; + int32_t step; + int32_t *param; + int32_t curr_param; + int32_t best_param; + + force_wmtype(wm, wmtype); + wm->wmtype = get_wmtype(wm); + + if (n_refinements == 0) { + // Compute the maximum error value that will be accepted, so that + // av1_warp_error can terminate early if it proves the model will not + // be accepted. + int64_t selection_threshold = (int64_t)lrint(ref_frame_error * erroradv_tr); + return av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, + border, d_width - 2 * border, d_height - 2 * border, + 0, 0, selection_threshold, segment_map, + segment_map_stride); + } + + // When refining, use a slightly higher threshold for the initial error + // calculation - see comment above erroradv_early_tr for why. 
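+  // With the thresholds declared in global_motion.h (erroradv_tr = 0.65,
+  // erroradv_early_tr = 0.70), a model whose initial error lies between 65%
+  // and 70% of ref_frame_error is kept alive at this point so that the
+  // refinement passes below have a chance to pull it under the final
+  // acceptance threshold.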
+ int64_t selection_threshold = + (int64_t)lrint(ref_frame_error * erroradv_early_tr); + best_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, border, + d_width - 2 * border, d_height - 2 * border, 0, 0, + selection_threshold, segment_map, segment_map_stride); + + if (best_error > selection_threshold) { + return INT64_MAX; + } + + step = 1 << (n_refinements - 1); + for (i = 0; i < n_refinements; i++, step >>= 1) { + for (p = 0; p < n_params; ++p) { + int step_dir = 0; + param = param_mat + p; + curr_param = *param; + best_param = curr_param; + // look to the left + // Note: We have to use force_wmtype() to keep the proper symmetry for + // ROTZOOM type models + *param = add_param_offset(p, curr_param, -step); + force_wmtype(wm, wmtype); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, + border, d_width - 2 * border, d_height - 2 * border, 0, + 0, best_error, segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = -1; + } + + // look to the right + *param = add_param_offset(p, curr_param, step); + force_wmtype(wm, wmtype); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, + border, d_width - 2 * border, d_height - 2 * border, 0, + 0, best_error, segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = 1; + } + + // look to the direction chosen above repeatedly until error increases + // for the biggest step size + while (step_dir) { + *param = add_param_offset(p, best_param, step * step_dir); + force_wmtype(wm, wmtype); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, d_stride, border, + border, d_width - 2 * border, d_height - 2 * border, + 0, 0, best_error, segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + } else { + step_dir = 0; + } + } + + // Restore best parameter value so far + *param = best_param; + force_wmtype(wm, wmtype); + } + } + + wm->wmtype = get_wmtype(wm); + return best_error; +} + +#define FEAT_COUNT_TR 3 +#define SEG_COUNT_TR 48 +void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width, + int height, int *inliers, + int num_inliers) { + int seg_count = 0; + memset(segment_map, 0, sizeof(*segment_map) * width * height); + + for (int i = 0; i < num_inliers; i++) { + int x = inliers[i * 2]; + int y = inliers[i * 2 + 1]; + int seg_x = x >> WARP_ERROR_BLOCK_LOG; + int seg_y = y >> WARP_ERROR_BLOCK_LOG; + segment_map[seg_y * width + seg_x] += 1; + } + + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + uint8_t feat_count = segment_map[i * width + j]; + segment_map[i * width + j] = (feat_count >= FEAT_COUNT_TR); + seg_count += (segment_map[i * width + j]); + } + } + + // If this motion does not make up a large enough portion of the frame, + // use the unsegmented version of the error metric + if (seg_count < SEG_COUNT_TR) + memset(segment_map, 1, width * height * sizeof(*segment_map)); +} diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h new file mode 100644 index 0000000000..8c9c60f0f5 --- /dev/null +++ b/third_party/aom/av1/encoder/global_motion.h @@ 
-0,0 +1,157 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_ +#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_ + +#include "aom/aom_integer.h" +#include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RANSAC_NUM_MOTIONS 1 +#define GM_MAX_REFINEMENT_STEPS 5 +#define MAX_DIRECTIONS 2 + +// The structure holds a valid reference frame type and its temporal distance +// from the source frame. +typedef struct { + int distance; + MV_REFERENCE_FRAME frame; +} FrameDistPair; + +typedef struct { + // Array of structure which holds the global motion parameters for a given + // motion model. motion_models[i] holds the parameters for a given motion + // model for the ith ransac motion. + MotionModel motion_models[RANSAC_NUM_MOTIONS]; + + // Pointer to hold inliers from motion model. + uint8_t *segment_map; +} GlobalMotionData; + +typedef struct { + // Holds the mapping of each thread to past/future direction. + // thread_id_to_dir[i] indicates the direction id (past - 0/future - 1) + // assigned to the ith thread. + int8_t thread_id_to_dir[MAX_NUM_THREADS]; + + // A flag which holds the early exit status based on the speed feature + // 'prune_ref_frame_for_gm_search'. early_exit[i] will be set if the speed + // feature based early exit happens in the direction 'i'. + int8_t early_exit[MAX_DIRECTIONS]; + + // Counter for the next reference frame to be processed. + // next_frame_to_process[i] will hold the count of next reference frame to be + // processed in the direction 'i'. + int8_t next_frame_to_process[MAX_DIRECTIONS]; +} JobInfo; + +typedef struct { + // Data related to assigning jobs for global motion multi-threading. + JobInfo job_info; + +#if CONFIG_MULTITHREAD + // Mutex lock used while dispatching jobs. + pthread_mutex_t *mutex_; +#endif + + // Initialized to false, set to true by the worker thread that encounters an + // error in order to abort the processing of other worker threads. 
+ bool gm_mt_exit; +} AV1GlobalMotionSync; + +void av1_convert_model_to_params(const double *params, + WarpedMotionParams *model); + +// Criteria for accepting a global motion model +static const double erroradv_tr = 0.65; +static const double erroradv_prod_tr = 20000; + +// Early exit threshold for global motion refinement +// This is set slightly higher than erroradv_tr, as a compromise between +// two factors: +// +// 1) By rejecting un-promising models early, we can reduce the encode time +// spent trying to refine them +// +// 2) When we refine a model, its error may decrease to below the acceptance +// threshold even if the model is initially above the threshold +static const double erroradv_early_tr = 0.70; + +int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost); + +void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width, + int height, int *inliers, + int num_inliers); + +extern const int error_measure_lut[513]; + +static INLINE int error_measure(int err) { + return error_measure_lut[256 + err]; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int highbd_error_measure(int err, int bd) { + const int b = bd - 8; + const int bmask = (1 << b) - 1; + const int v = (1 << b); + + // Split error into two parts and do an interpolated table lookup + // To compute the table index and interpolation value, we want to calculate + // the quotient and remainder of err / 2^b. But it is very important that + // the division must round down, and the remainder must be positive, + // ie. in the range [0, 2^b). + // + // In C, the >> and & operators do what we want, but the / and % operators + // give the wrong results for negative inputs. So we must use >> and & here. + // + // For example, if bd == 10 and err == -5, compare the results: + // (-5) >> 2 = -2, (-5) & 3 = 3 + // vs. (-5) / 4 = -1, (-5) % 4 = -1 + const int e1 = err >> b; + const int e2 = err & bmask; + return error_measure_lut[256 + e1] * (v - e2) + + error_measure_lut[257 + e1] * e2; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + int p_width, int p_height, + uint8_t *segment_map, int segment_map_stride); + +// Returns the error between the result of applying motion 'wm' to the frame +// described by 'ref' and the frame described by 'dst'. +int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int ref_width, int ref_height, + int ref_stride, uint8_t *dst, int dst_stride, int p_col, + int p_row, int p_width, int p_height, int subsampling_x, + int subsampling_y, int64_t best_error, + uint8_t *segment_map, int segment_map_stride); + +// Returns the av1_warp_error between "dst" and the result of applying the +// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is +// modified in place. 
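+// The refinement is a per-parameter hill climb: the step size starts at
+// 1 << (n_refinements - 1) and is halved on each pass; each parameter is
+// probed one step in both directions and the search keeps stepping in
+// whichever direction reduces av1_warp_error until the error stops improving.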
+int64_t av1_refine_integerized_param( + WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd, + uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, + int d_width, int d_height, int d_stride, int n_refinements, + int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_ diff --git a/third_party/aom/av1/encoder/global_motion_facade.c b/third_party/aom/av1/encoder/global_motion_facade.c new file mode 100644 index 0000000000..02a4e70ed3 --- /dev/null +++ b/third_party/aom/av1/encoder/global_motion_facade.c @@ -0,0 +1,450 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/binary_codes_writer.h" + +#include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_dsp/pyramid.h" +#include "av1/common/warped_motion.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/global_motion_facade.h" + +// Range of model types to search +#define FIRST_GLOBAL_TRANS_TYPE ROTZOOM +#define LAST_GLOBAL_TRANS_TYPE ROTZOOM + +// Computes the cost for the warp parameters. +static int gm_get_params_cost(const WarpedMotionParams *gm, + const WarpedMotionParams *ref_gm, int allow_hp) { + int params_cost = 0; + int trans_bits, trans_prec_diff; + switch (gm->wmtype) { + case AFFINE: + case ROTZOOM: + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + if (gm->wmtype >= AFFINE) { + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + trans_bits = (gm->wmtype == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + trans_prec_diff = (gm->wmtype == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[0] >> trans_prec_diff), + (gm->wmmat[0] >> trans_prec_diff)); + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[1] >> trans_prec_diff), + (gm->wmmat[1] >> trans_prec_diff)); + AOM_FALLTHROUGH_INTENDED; + case IDENTITY: break; + default: assert(0); + } + return (params_cost << AV1_PROB_COST_SHIFT); +} + +// For the given reference frame, computes the global motion parameters for +// different motion models and finds the best. +static AOM_INLINE void compute_global_motion_for_ref_frame( + AV1_COMP *cpi, struct aom_internal_error_info *error_info, + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w, + const int segment_map_h, const WarpedMotionParams *ref_params) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int src_width = cpi->source->y_crop_width; + int src_height = cpi->source->y_crop_height; + int src_stride = cpi->source->y_stride; + assert(ref_buf[frame] != NULL); + int bit_depth = cpi->common.seq_params->bit_depth; + GlobalMotionMethod global_motion_method = default_global_motion_method; + int num_refinements = cpi->sf.gm_sf.num_refinement_steps; + bool mem_alloc_failed = false; + + // Select the best model based on fractional error reduction. + // By initializing this to erroradv_tr, the same logic which is used to + // select the best model will automatically filter out any model which + // doesn't meet the required quality threshold + double best_erroradv = erroradv_tr; + for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE; + model <= LAST_GLOBAL_TRANS_TYPE; ++model) { + if (!aom_compute_global_motion( + model, cpi->source, ref_buf[frame], bit_depth, global_motion_method, + motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) { + if (mem_alloc_failed) { + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate global motion buffers"); + } + continue; + } + + for (int i = 0; i < RANSAC_NUM_MOTIONS; ++i) { + if (motion_models[i].num_inliers == 0) continue; + + WarpedMotionParams tmp_wm_params; + av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params); + + // Skip models that we won't use (IDENTITY or TRANSLATION) + // + // For IDENTITY type models, we don't need to evaluate anything because + // all the following logic is effectively comparing the estimated model + // to an identity model. + // + // For TRANSLATION type global motion models, gm_get_motion_vector() gives + // the wrong motion vector (see comments in that function for details). 
+ // As translation-type models do not give much gain, we can avoid this bug + // by never choosing a TRANSLATION type model + if (tmp_wm_params.wmtype <= TRANSLATION) continue; + + av1_compute_feature_segmentation_map( + segment_map, segment_map_w, segment_map_h, motion_models[i].inliers, + motion_models[i].num_inliers); + + int64_t ref_frame_error = av1_segmented_frame_error( + is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer, + ref_buf[frame]->y_stride, cpi->source->y_buffer, src_stride, + src_width, src_height, segment_map, segment_map_w); + + if (ref_frame_error == 0) continue; + + const int64_t warp_error = av1_refine_integerized_param( + &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd, + ref_buf[frame]->y_buffer, ref_buf[frame]->y_crop_width, + ref_buf[frame]->y_crop_height, ref_buf[frame]->y_stride, + cpi->source->y_buffer, src_width, src_height, src_stride, + num_refinements, ref_frame_error, segment_map, segment_map_w); + + // av1_refine_integerized_param() can return a simpler model type than + // its input, so re-check model type here + if (tmp_wm_params.wmtype <= TRANSLATION) continue; + + double erroradvantage = (double)warp_error / ref_frame_error; + + if (erroradvantage < best_erroradv) { + best_erroradv = erroradvantage; + // Save the wm_params modified by + // av1_refine_integerized_param() rather than motion index to + // avoid rerunning refine() below. + memcpy(&(cm->global_motion[frame]), &tmp_wm_params, + sizeof(WarpedMotionParams)); + } + } + } + + if (!av1_get_shear_params(&cm->global_motion[frame])) + cm->global_motion[frame] = default_warp_params; + +#if 0 + // We never choose translational models, so this code is disabled + if (cm->global_motion[frame].wmtype == TRANSLATION) { + cm->global_motion[frame].wmmat[0] = + convert_to_trans_prec(cm->features.allow_high_precision_mv, + cm->global_motion[frame].wmmat[0]) * + GM_TRANS_ONLY_DECODE_FACTOR; + cm->global_motion[frame].wmmat[1] = + convert_to_trans_prec(cm->features.allow_high_precision_mv, + cm->global_motion[frame].wmmat[1]) * + GM_TRANS_ONLY_DECODE_FACTOR; + } +#endif + + if (cm->global_motion[frame].wmtype == IDENTITY) return; + + // If the best error advantage found doesn't meet the threshold for + // this motion type, revert to IDENTITY. + if (!av1_is_enough_erroradvantage( + best_erroradv, + gm_get_params_cost(&cm->global_motion[frame], ref_params, + cm->features.allow_high_precision_mv))) { + cm->global_motion[frame] = default_warp_params; + } +} + +// Computes global motion for the given reference frame. +void av1_compute_gm_for_valid_ref_frames( + AV1_COMP *cpi, struct aom_internal_error_info *error_info, + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + MotionModel *motion_models, uint8_t *segment_map, int segment_map_w, + int segment_map_h) { + AV1_COMMON *const cm = &cpi->common; + const WarpedMotionParams *ref_params = + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; + + compute_global_motion_for_ref_frame(cpi, error_info, ref_buf, frame, + motion_models, segment_map, segment_map_w, + segment_map_h, ref_params); +} + +// Loops over valid reference frames and computes global motion estimation. 
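+// References are visited in order of increasing temporal distance. If the
+// speed feature 'prune_ref_frame_for_gm_search' is enabled and the model found
+// for a reference is INVALID/TRANSLATION/IDENTITY, the remaining (more
+// distant) references in that direction are skipped.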
+static AOM_INLINE void compute_global_motion_for_references( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], + FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames, + MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w, + const int segment_map_h) { + AV1_COMMON *const cm = &cpi->common; + struct aom_internal_error_info *const error_info = + cpi->td.mb.e_mbd.error_info; + // Compute global motion w.r.t. reference frames starting from the nearest ref + // frame in a given direction. + for (int frame = 0; frame < num_ref_frames; frame++) { + int ref_frame = reference_frame[frame].frame; + av1_compute_gm_for_valid_ref_frames(cpi, error_info, ref_buf, ref_frame, + motion_models, segment_map, + segment_map_w, segment_map_h); + // If global motion w.r.t. current ref frame is + // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t + // the remaining ref frames in that direction. + if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search && + cm->global_motion[ref_frame].wmtype <= TRANSLATION) + break; + } +} + +// Compares the distance in 'a' and 'b'. Returns 1 if the frame corresponding to +// 'a' is farther, -1 if the frame corresponding to 'b' is farther, 0 otherwise. +static int compare_distance(const void *a, const void *b) { + const int diff = + ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance; + if (diff > 0) + return 1; + else if (diff < 0) + return -1; + return 0; +} + +static int disable_gm_search_based_on_stats(const AV1_COMP *const cpi) { + int is_gm_present = 1; + + // Check number of GM models only in GF groups with ARF frames. GM param + // estimation is always done in the case of GF groups with no ARF frames (flat + // gops) + if (cpi->ppi->gf_group.arf_index > -1) { + // valid_gm_model_found is initialized to INT32_MAX in the beginning of + // every GF group. + // Therefore, GM param estimation is always done for all frames until + // at least 1 frame each of ARF_UPDATE, INTNL_ARF_UPDATE and LF_UPDATE are + // encoded in a GF group For subsequent frames, GM param estimation is + // disabled, if no valid models have been found in all the three update + // types. + is_gm_present = (cpi->ppi->valid_gm_model_found[ARF_UPDATE] != 0) || + (cpi->ppi->valid_gm_model_found[INTNL_ARF_UPDATE] != 0) || + (cpi->ppi->valid_gm_model_found[LF_UPDATE] != 0); + } + return !is_gm_present; +} + +// Prunes reference frames for global motion estimation based on the speed +// feature 'gm_search_type'. +static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) { + (void)frame; + switch (sf->gm_sf.gm_search_type) { + case GM_FULL_SEARCH: return 1; + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3: + return !(frame == LAST2_FRAME || frame == LAST3_FRAME); + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2: + return !(frame == LAST2_FRAME || frame == LAST3_FRAME || + (frame == ALTREF2_FRAME)); + case GM_SEARCH_CLOSEST_REFS_ONLY: return 1; + case GM_DISABLE_SEARCH: return 0; + default: assert(0); + } + return 1; +} + +// Populates valid reference frames in past/future directions in +// 'reference_frames' and their count in 'num_ref_frames'. 
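+// A reference is only considered if its buffer exists, it is not disabled via
+// cpi->ref_frame_flags (modulo the recode_loop setting), its crop dimensions
+// match the source frame, it passes the gm_search_type and selective
+// ref-frame pruning checks, its pyramid level does not exceed that of the
+// current frame, and the stats-based early exit has not disabled the search.
+// References at the same display order hint as the current frame are also
+// skipped, since the optimal model for them is IDENTITY.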
+static AOM_INLINE void update_valid_ref_frames_for_gm( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], + FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1], + int *num_ref_frames) { + AV1_COMMON *const cm = &cpi->common; + int *num_past_ref_frames = &num_ref_frames[0]; + int *num_future_ref_frames = &num_ref_frames[1]; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( + gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index); + int cur_frame_gm_disabled = 0; + int pyr_lvl = cm->cur_frame->pyramid_level; + + if (cpi->sf.gm_sf.disable_gm_search_based_on_stats) { + cur_frame_gm_disabled = disable_gm_search_based_on_stats(cpi); + } + + for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { + const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME }; + RefCntBuffer *buf = get_ref_frame_buf(cm, frame); + const int ref_disabled = + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]); + ref_buf[frame] = NULL; + cm->global_motion[frame] = default_warp_params; + // Skip global motion estimation for invalid ref frames + if (buf == NULL || + (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) { + continue; + } else { + ref_buf[frame] = &buf->buf; + } + + int prune_ref_frames = + ref_pruning_enabled && + prune_ref_by_selective_ref_frame(cpi, NULL, ref_frame, + cm->cur_frame->ref_display_order_hint); + int ref_pyr_lvl = buf->pyramid_level; + + if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width && + ref_buf[frame]->y_crop_height == cpi->source->y_crop_height && + do_gm_search_logic(&cpi->sf, frame) && !prune_ref_frames && + ref_pyr_lvl <= pyr_lvl && !cur_frame_gm_disabled) { + assert(ref_buf[frame] != NULL); + const int relative_frame_dist = av1_encoder_get_relative_dist( + buf->display_order_hint, cm->cur_frame->display_order_hint); + // Populate past and future ref frames. + // reference_frames[0][] indicates past direction and + // reference_frames[1][] indicates future direction. + if (relative_frame_dist == 0) { + // Skip global motion estimation for frames at the same nominal instant. + // This will generally be either a "real" frame coded against a + // temporal filtered version, or a higher spatial layer coded against + // a lower spatial layer. In either case, the optimal motion model will + // be IDENTITY, so we don't need to search explicitly. + } else if (relative_frame_dist < 0) { + reference_frames[0][*num_past_ref_frames].distance = + abs(relative_frame_dist); + reference_frames[0][*num_past_ref_frames].frame = frame; + (*num_past_ref_frames)++; + } else { + reference_frames[1][*num_future_ref_frames].distance = + abs(relative_frame_dist); + reference_frames[1][*num_future_ref_frames].frame = frame; + (*num_future_ref_frames)++; + } + } + } +} + +// Initializes parameters used for computing global motion. 
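+// The segmentation map is sized in units of WARP_ERROR_BLOCK (32x32), and the
+// collected past/future reference lists are sorted by temporal distance before
+// the search starts. For GM_SEARCH_CLOSEST_REFS_ONLY, the lists are then
+// trimmed to the nearest references, preferring one past and one future
+// reference over two past references.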
+static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) { + GlobalMotionInfo *const gm_info = &cpi->gm_info; + YV12_BUFFER_CONFIG *source = cpi->source; + + gm_info->segment_map_w = + (source->y_crop_width + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG; + gm_info->segment_map_h = + (source->y_crop_height + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG; + + memset(gm_info->reference_frames, -1, + sizeof(gm_info->reference_frames[0][0]) * MAX_DIRECTIONS * + (REF_FRAMES - 1)); + av1_zero(gm_info->num_ref_frames); + + // Populate ref_buf for valid ref frames in global motion + update_valid_ref_frames_for_gm(cpi, gm_info->ref_buf, + gm_info->reference_frames, + gm_info->num_ref_frames); + + // Sort the past and future ref frames in the ascending order of their + // distance from the current frame. reference_frames[0] => past direction + // and reference_frames[1] => future direction. + qsort(gm_info->reference_frames[0], gm_info->num_ref_frames[0], + sizeof(gm_info->reference_frames[0][0]), compare_distance); + qsort(gm_info->reference_frames[1], gm_info->num_ref_frames[1], + sizeof(gm_info->reference_frames[1][0]), compare_distance); + + if (cpi->sf.gm_sf.gm_search_type == GM_SEARCH_CLOSEST_REFS_ONLY) { + // Filter down to the nearest two ref frames. + // Prefer one past and one future ref over two past refs, even if + // the second past ref is closer + if (gm_info->num_ref_frames[1] > 0) { + gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 1); + gm_info->num_ref_frames[1] = AOMMIN(gm_info->num_ref_frames[1], 1); + } else { + gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 2); + } + } +} + +// Computes global motion w.r.t. valid reference frames. +static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) { + GlobalMotionInfo *const gm_info = &cpi->gm_info; + GlobalMotionData *gm_data = &cpi->td.gm_data; + + // Compute global motion w.r.t. past reference frames and future reference + // frames + for (int dir = 0; dir < MAX_DIRECTIONS; dir++) { + if (gm_info->num_ref_frames[dir] > 0) + compute_global_motion_for_references( + cpi, gm_info->ref_buf, gm_info->reference_frames[dir], + gm_info->num_ref_frames[dir], gm_data->motion_models, + gm_data->segment_map, gm_info->segment_map_w, gm_info->segment_map_h); + } +} + +// Global motion estimation for the current frame is computed.This computation +// happens once per frame and the winner motion model parameters are stored in +// cm->cur_frame->global_motion. +void av1_compute_global_motion_facade(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + GlobalMotionInfo *const gm_info = &cpi->gm_info; + + if (cpi->oxcf.tool_cfg.enable_global_motion) { + if (cpi->gf_frame_index == 0) { + for (int i = 0; i < FRAME_UPDATE_TYPES; i++) { + cpi->ppi->valid_gm_model_found[i] = INT32_MAX; +#if CONFIG_FPMT_TEST + if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) + cpi->ppi->temp_valid_gm_model_found[i] = INT32_MAX; +#endif + } + } + } + + if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source && + cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done && + cpi->sf.gm_sf.gm_search_type != GM_DISABLE_SEARCH) { + setup_global_motion_info_params(cpi); + // Terminate early if the total number of reference frames is zero. 
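+    // (The enclosing check on gm_info->search_done ensures this search runs at
+    // most once per frame; the winning parameters are copied into
+    // cm->cur_frame->global_motion at the end of this function.)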
+ if (cpi->gm_info.num_ref_frames[0] || cpi->gm_info.num_ref_frames[1]) { + gm_alloc_data(cpi, &cpi->td.gm_data); + if (cpi->mt_info.num_workers > 1) + av1_global_motion_estimation_mt(cpi); + else + global_motion_estimation(cpi); + gm_dealloc_data(&cpi->td.gm_data); + gm_info->search_done = 1; + } + } + memcpy(cm->cur_frame->global_motion, cm->global_motion, + sizeof(cm->cur_frame->global_motion)); +} diff --git a/third_party/aom/av1/encoder/global_motion_facade.h b/third_party/aom/av1/encoder/global_motion_facade.h new file mode 100644 index 0000000000..f13989aa25 --- /dev/null +++ b/third_party/aom/av1/encoder/global_motion_facade.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_ +#define AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_ + +#ifdef __cplusplus +extern "C" { +#endif +struct yv12_buffer_config; +struct AV1_COMP; + +// Allocates memory for members of GlobalMotionData. +static AOM_INLINE void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) { + AV1_COMMON *cm = &cpi->common; + GlobalMotionInfo *gm_info = &cpi->gm_info; + + CHECK_MEM_ERROR(cm, gm_data->segment_map, + aom_malloc(sizeof(*gm_data->segment_map) * + gm_info->segment_map_w * gm_info->segment_map_h)); + + av1_zero_array(gm_data->motion_models, RANSAC_NUM_MOTIONS); + for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) { + CHECK_MEM_ERROR(cm, gm_data->motion_models[m].inliers, + aom_malloc(sizeof(*gm_data->motion_models[m].inliers) * 2 * + MAX_CORNERS)); + } +} + +// Deallocates the memory allocated for members of GlobalMotionData. +static AOM_INLINE void gm_dealloc_data(GlobalMotionData *gm_data) { + aom_free(gm_data->segment_map); + gm_data->segment_map = NULL; + for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) { + aom_free(gm_data->motion_models[m].inliers); + gm_data->motion_models[m].inliers = NULL; + } +} + +void av1_compute_gm_for_valid_ref_frames( + AV1_COMP *cpi, struct aom_internal_error_info *error_info, + YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + MotionModel *motion_models, uint8_t *segment_map, int segment_map_w, + int segment_map_h); +void av1_compute_global_motion_facade(struct AV1_COMP *cpi); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_ diff --git a/third_party/aom/av1/encoder/gop_structure.c b/third_party/aom/av1/encoder/gop_structure.c new file mode 100644 index 0000000000..5078098450 --- /dev/null +++ b/third_party/aom/av1/encoder/gop_structure.c @@ -0,0 +1,867 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "av1/common/blockd.h" +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom/aom_encoder.h" + +#include "av1/common/av1_common_int.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/gop_structure.h" +#include "av1/encoder/pass2_strategy.h" + +// This function sets gf_group->frame_parallel_level for LF_UPDATE frames based +// on the value of parallel_frame_count. +static void set_frame_parallel_level(int *frame_parallel_level, + int *parallel_frame_count, + int max_parallel_frames) { + assert(*parallel_frame_count > 0); + // parallel_frame_count > 1 indicates subsequent frame(s) in the current + // parallel encode set. + *frame_parallel_level = 1 + (*parallel_frame_count > 1); + // Update the count of no. of parallel frames. + (*parallel_frame_count)++; + if (*parallel_frame_count > max_parallel_frames) *parallel_frame_count = 1; +} + +// This function sets gf_group->src_offset based on frame_parallel_level. +// Outputs are gf_group->src_offset and first_frame_index +static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index, + int cur_frame_idx, int frame_ind) { + if (gf_group->frame_parallel_level[frame_ind] > 0) { + if (gf_group->frame_parallel_level[frame_ind] == 1) { + *first_frame_index = cur_frame_idx; + } + + // Obtain the offset of the frame at frame_ind in the lookahead queue by + // subtracting the display order hints of the current frame from the display + // order hint of the first frame in parallel encoding set (at + // first_frame_index). + gf_group->src_offset[frame_ind] = + (cur_frame_idx + gf_group->arf_src_offset[frame_ind]) - + *first_frame_index; + } +} + +// Sets the GF_GROUP params for LF_UPDATE frames. +static AOM_INLINE void set_params_for_leaf_frames( + const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, + const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, + GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, + int *parallel_frame_count, int max_parallel_frames, + int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_index, + int layer_depth, int start, int end) { + gf_group->update_type[*frame_ind] = LF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth); + gf_group->display_idx[*frame_ind] = (*cur_disp_index); + gf_group->arf_boost[*frame_ind] = + av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start, + end - start, 0, NULL, NULL, 0); + ++(*cur_disp_index); + + // Set the level of parallelism for the LF_UPDATE frame. + if (do_frame_parallel_encode) { + set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind], + parallel_frame_count, max_parallel_frames); + // Set LF_UPDATE frames as non-reference frames. + gf_group->is_frame_non_ref[*frame_ind] = true; + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + + ++(*frame_ind); + ++(*cur_frame_idx); +} + +// Sets the GF_GROUP params for INTNL_OVERLAY_UPDATE frames. 
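+// Overlay frames carry no ARF source offset of their own; they take the layer
+// depth of the internal ARF they overlay and simply advance the coding and
+// display indices.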
+static AOM_INLINE void set_params_for_intnl_overlay_frames( + GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, + int *first_frame_index, int *cur_disp_index, int layer_depth) { + gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->layer_depth[*frame_ind] = layer_depth; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + gf_group->display_idx[*frame_ind] = (*cur_disp_index); + ++(*cur_disp_index); + + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + ++(*frame_ind); + ++(*cur_frame_idx); +} + +// Sets the GF_GROUP params for INTNL_ARF_UPDATE frames. +static AOM_INLINE void set_params_for_internal_arfs( + const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, + const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, + GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind, + int *parallel_frame_count, int max_parallel_frames, + int do_frame_parallel_encode, int *first_frame_index, int depth_thr, + int *cur_disp_idx, int layer_depth, int arf_src_offset, int offset, + int f_frames, int b_frames) { + gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = arf_src_offset; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->layer_depth[*frame_ind] = layer_depth; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + gf_group->display_idx[*frame_ind] = + (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind]; + gf_group->arf_boost[*frame_ind] = + av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, offset, + f_frames, b_frames, NULL, NULL, 0); + + if (do_frame_parallel_encode) { + if (depth_thr != INT_MAX) { + assert(depth_thr == 3 || depth_thr == 4); + assert(IMPLIES(depth_thr == 3, layer_depth == 4)); + assert(IMPLIES(depth_thr == 4, layer_depth == 5)); + // Set frame_parallel_level of the first frame in the given layer to 1. + if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) { + gf_group->frame_parallel_level[*frame_ind] = 1; + } else { + // Set frame_parallel_level of the consecutive frame in the same given + // layer to 2. + assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1); + gf_group->frame_parallel_level[*frame_ind] = 2; + // Store the display order hints of the past 2 INTNL_ARF_UPDATE + // frames which would not have been displayed at the time of the encode + // of current frame. + gf_group->skip_frame_refresh[*frame_ind][0] = + gf_group->display_idx[(*frame_ind) - 1]; + gf_group->skip_frame_refresh[*frame_ind][1] = + gf_group->display_idx[(*frame_ind) - 2]; + // Set the display_idx of frame_parallel_level 1 frame in + // gf_group->skip_frame_as_ref. + gf_group->skip_frame_as_ref[*frame_ind] = + gf_group->display_idx[(*frame_ind) - 1]; + } + } + // If max_parallel_frames is not exceeded and if the frame will not be + // temporally filtered, encode the next internal ARF frame in parallel. + if (*parallel_frame_count > 1 && + *parallel_frame_count <= max_parallel_frames) { + if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR) + gf_group->frame_parallel_level[*frame_ind] = 2; + *parallel_frame_count = 1; + } + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + ++(*frame_ind); +} + +// Set parameters for frames between 'start' and 'end' (excluding both). 
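+// The interval is split recursively: the middle frame of (start, end) becomes
+// an INTNL_ARF_UPDATE at the current layer depth and the sub-intervals on
+// either side are processed at greater depth, until max_layer_depth_allowed is
+// exceeded or fewer than three frames remain, at which point the remaining
+// frames are coded as LF_UPDATE leaves.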
+static void set_multi_layer_params_for_fp( + const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, + GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc, + RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end, + int *cur_frame_idx, int *frame_ind, int *parallel_frame_count, + int max_parallel_frames, int do_frame_parallel_encode, + int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth) { + const int num_frames_to_process = end - start; + + // Either we are at the last level of the pyramid, or we don't have enough + // frames between 'l' and 'r' to create one more level. + if (layer_depth > gf_group->max_layer_depth_allowed || + num_frames_to_process < 3) { + // Leaf nodes. + while (start < end) { + set_params_for_leaf_frames(twopass, twopass_frame, p_rc, frame_info, + gf_group, cur_frame_idx, frame_ind, + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, + cur_disp_idx, layer_depth, start, end); + ++start; + } + } else { + const int m = (start + end - 1) / 2; + + // Internal ARF. + int arf_src_offset = m - start; + set_params_for_internal_arfs( + twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx, + frame_ind, parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, INT_MAX, cur_disp_idx, + layer_depth, arf_src_offset, m, end - m, m - start); + + // If encode reordering is enabled, configure the multi-layers accordingly + // and return. For e.g., the encode order for gf-interval 16 after + // reordering would be 0-> 16-> 8-> 4-> 2-> 6-> 1-> 3-> 5-> 7-> 12-> 10-> + // 14-> 9-> 11-> 13-> 15. + if (layer_depth >= depth_thr) { + int m1 = (m + start - 1) / 2; + int m2 = (m + 1 + end) / 2; + int arf_src_offsets[2] = { m1 - start, m2 - start }; + // Parameters to compute arf_boost. + int offset[2] = { m1, m2 }; + int f_frames[2] = { m - m1, end - m2 }; + int b_frames[2] = { m1 - start, m2 - (m + 1) }; + + // Set GF_GROUP params for INTNL_ARF_UPDATE frames which are reordered. + for (int i = 0; i < 2; i++) { + set_params_for_internal_arfs( + twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx, + frame_ind, parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, depth_thr, + cur_disp_idx, layer_depth + 1, arf_src_offsets[i], offset[i], + f_frames[i], b_frames[i]); + } + + // Initialize the start and end indices to configure LF_UPDATE frames. + int start_idx[4] = { start, m1 + 1, m + 1, end - 1 }; + int end_idx[4] = { m1, m, m2, end }; + int layer_depth_for_intnl_overlay[4] = { layer_depth + 1, layer_depth, + layer_depth + 1, INVALID_IDX }; + + // Set GF_GROUP params for the rest of LF_UPDATE and INTNL_OVERLAY_UPDATE + // frames after reordering. + for (int i = 0; i < 4; i++) { + set_multi_layer_params_for_fp( + twopass, twopass_frame, gf_group, p_rc, rc, frame_info, + start_idx[i], end_idx[i], cur_frame_idx, frame_ind, + parallel_frame_count, max_parallel_frames, do_frame_parallel_encode, + first_frame_index, depth_thr, cur_disp_idx, layer_depth + 2); + if (layer_depth_for_intnl_overlay[i] != INVALID_IDX) + set_params_for_intnl_overlay_frames( + gf_group, cur_frame_idx, frame_ind, first_frame_index, + cur_disp_idx, layer_depth_for_intnl_overlay[i]); + } + return; + } + + // Frames displayed before this internal ARF. 
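+    // (This is the non-reordered path: the left half, the overlay for this
+    // internal ARF, and then the right half are emitted in that order.)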
+ set_multi_layer_params_for_fp( + twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m, + cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx, + layer_depth + 1); + + // Overlay for internal ARF. + set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind, + first_frame_index, cur_disp_idx, + layer_depth); + + // Frames displayed after this internal ARF. + set_multi_layer_params_for_fp( + twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end, + cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx, + layer_depth + 1); + } +} + +// Structure for bookkeeping start, end and display indices to configure +// INTNL_ARF_UPDATE frames. +typedef struct { + int start; + int end; + int display_index; +} FRAME_REORDER_INFO; + +// Updates the stats required to configure the GF_GROUP. +static AOM_INLINE void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats, + int arf_frame_index, + int display_idx, int start, + int end) { + arf_frame_stats[arf_frame_index].start = start; + arf_frame_stats[arf_frame_index].end = end; + arf_frame_stats[arf_frame_index].display_index = display_idx; +} + +// Sets GF_GROUP params for INTNL_ARF_UPDATE frames. Also populates +// doh_gf_index_map and arf_frame_stats. +static AOM_INLINE void set_params_for_internal_arfs_in_gf14( + GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, + int *cur_frame_idx, int *cur_disp_idx, int *frame_ind, + int *count_arf_frames, int *doh_gf_index_map, int start, int end, + int layer_depth, int layer_with_parallel_encodes) { + int index = (start + end - 1) / 2; + gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = index - 1; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->layer_depth[*frame_ind] = layer_depth; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + gf_group->display_idx[*frame_ind] = + (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind]; + + // Update the display index of the current frame with its gf index. + doh_gf_index_map[index] = *frame_ind; + if (layer_with_parallel_encodes) { + assert(layer_depth == 4); + // Set frame_parallel_level of the first frame in the given layer depth + // to 1. + if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) { + gf_group->frame_parallel_level[*frame_ind] = 1; + } else { + // Set frame_parallel_level of the consecutive frame in the same given + // layer depth to 2. + assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1); + gf_group->frame_parallel_level[*frame_ind] = 2; + // Set the display_idx of frame_parallel_level 1 frame in + // gf_group->skip_frame_as_ref. + gf_group->skip_frame_as_ref[*frame_ind] = + gf_group->display_idx[(*frame_ind) - 1]; + } + } + ++(*frame_ind); + + // Update arf_frame_stats. + fill_arf_frame_stats(arf_frame_stats, *count_arf_frames, index, start, end); + ++(*count_arf_frames); +} + +// Sets GF_GROUP params for all INTNL_ARF_UPDATE frames in the given layer +// dpeth. 
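+// For each node placed in the previous layer, the interval to its left (and,
+// for layers deeper than 2, also the interval to its right) is examined; a new
+// INTNL_ARF_UPDATE frame is placed there whenever at least three frames remain
+// in that interval.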
+static AOM_INLINE void set_params_for_cur_layer_frames( + GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, + int *cur_frame_idx, int *cur_disp_idx, int *frame_ind, + int *count_arf_frames, int *doh_gf_index_map, int num_dir, int node_start, + int node_end, int layer_depth) { + assert(num_dir < 3); + int start, end; + // Iterate through the nodes in the previous layer depth. + for (int i = node_start; i < node_end; i++) { + // For each node, check if a frame can be coded as INTNL_ARF_UPDATE frame on + // either direction. + for (int dir = 0; dir < num_dir; dir++) { + // Checks for a frame to the left of current node. + if (dir == 0) { + start = arf_frame_stats[i].start; + end = arf_frame_stats[i].display_index; + } else { + // Checks for a frame to the right of current node. + start = arf_frame_stats[i].display_index + 1; + end = arf_frame_stats[i].end; + } + const int num_frames_to_process = end - start; + // Checks if a frame can be coded as INTNL_ARF_UPDATE frame. If + // num_frames_to_process is less than 3, then there are not enough frames + // between 'start' and 'end' to create another level. + if (num_frames_to_process >= 3) { + // Flag to indicate the lower layer depths for which parallel encoding + // is enabled. Currently enabled for layer 4 frames. + int layer_with_parallel_encodes = layer_depth == 4; + set_params_for_internal_arfs_in_gf14( + gf_group, arf_frame_stats, cur_frame_idx, cur_disp_idx, frame_ind, + count_arf_frames, doh_gf_index_map, start, end, layer_depth, + layer_with_parallel_encodes); + } + } + } +} + +// Configures multi-layers of the GF_GROUP when consecutive encode of frames in +// the same layer depth is enbaled. +static AOM_INLINE void set_multi_layer_params_for_gf14( + const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame, + const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, + GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats, + int *cur_frame_idx, int *frame_ind, int *count_arf_frames, + int *doh_gf_index_map, int *parallel_frame_count, int *first_frame_index, + int *cur_disp_index, int gf_interval, int layer_depth, + int max_parallel_frames) { + assert(layer_depth == 2); + assert(gf_group->max_layer_depth_allowed >= 4); + int layer, node_start, node_end = 0; + // Maximum layer depth excluding LF_UPDATE frames is 4 since applicable only + // for gf-interval 14. + const int max_layer_depth = 4; + // Iterate through each layer depth starting from 2 till 'max_layer_depth'. + for (layer = layer_depth; layer <= max_layer_depth; layer++) { + // 'node_start' and 'node_end' indicate the number of nodes from the + // previous layer depth to be considered. It also corresponds to the indices + // of arf_frame_stats. + node_start = node_end; + node_end = (*count_arf_frames); + // 'num_dir' indicates the number of directions to traverse w.r.t. a given + // node in order to choose an INTNL_ARF_UPDATE frame. Layer depth 2 would + // have only one frame and hence needs to traverse only in the left + // direction w.r.t the node in the previous layer. + int num_dir = layer == 2 ? 1 : 2; + set_params_for_cur_layer_frames(gf_group, arf_frame_stats, cur_frame_idx, + cur_disp_index, frame_ind, count_arf_frames, + doh_gf_index_map, num_dir, node_start, + node_end, layer); + } + + for (int i = 1; i < gf_interval; i++) { + // Since doh_gf_index_map is already populated for all INTNL_ARF_UPDATE + // frames in the GF_GROUP, any frame with INVALID_IDX would correspond to an + // LF_UPDATE frame. 
+    if (doh_gf_index_map[i] == INVALID_IDX) {
+      // LF_UPDATE frames.
+      // TODO(Remya): Correct start and end parameters passed to
+      // set_params_for_leaf_frames() once encode reordering for gf-interval 14
+      // is enabled for parallel encode of lower layer frames.
+      set_params_for_leaf_frames(
+          twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+          frame_ind, parallel_frame_count, max_parallel_frames, 1,
+          first_frame_index, cur_disp_index, layer, 0, 0);
+    } else {
+      // In order to obtain the layer depths of INTNL_OVERLAY_UPDATE frames, get
+      // the gf index of corresponding INTNL_ARF_UPDATE frames.
+      int intnl_arf_index = doh_gf_index_map[i];
+      int ld = gf_group->layer_depth[intnl_arf_index];
+      set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+                                          first_frame_index, cur_disp_index,
+                                          ld);
+    }
+  }
+}
+
+// Set parameters for frames between 'start' and 'end' (excluding both).
+static void set_multi_layer_params(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+    RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+    int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+    int max_parallel_frames, int do_frame_parallel_encode,
+    int *first_frame_index, int *cur_disp_idx, int layer_depth) {
+  const int num_frames_to_process = end - start;
+
+  // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'l' and 'r' to create one more level.
+  if (layer_depth > gf_group->max_layer_depth_allowed ||
+      num_frames_to_process < 3) {
+    // Leaf nodes.
+    while (start < end) {
+      gf_group->update_type[*frame_ind] = LF_UPDATE;
+      gf_group->arf_src_offset[*frame_ind] = 0;
+      gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+      gf_group->display_idx[*frame_ind] = *cur_disp_idx;
+      gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+      gf_group->arf_boost[*frame_ind] =
+          av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+                             end - start, 0, NULL, NULL, 0);
+      gf_group->frame_type[*frame_ind] = INTER_FRAME;
+      gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+      gf_group->max_layer_depth =
+          AOMMAX(gf_group->max_layer_depth, layer_depth);
+      // Set the level of parallelism for the LF_UPDATE frame.
+      if (do_frame_parallel_encode) {
+        set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+                                 parallel_frame_count, max_parallel_frames);
+        // Set LF_UPDATE frames as non-reference frames.
+        gf_group->is_frame_non_ref[*frame_ind] = true;
+      }
+      set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+      ++(*frame_ind);
+      ++(*cur_frame_idx);
+      ++(*cur_disp_idx);
+      ++start;
+    }
+  } else {
+    const int m = (start + end - 1) / 2;
+
+    // Internal ARF.
+    gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+    gf_group->arf_src_offset[*frame_ind] = m - start;
+    gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+    gf_group->display_idx[*frame_ind] =
+        *cur_disp_idx + gf_group->arf_src_offset[*frame_ind];
+    gf_group->layer_depth[*frame_ind] = layer_depth;
+    gf_group->frame_type[*frame_ind] = INTER_FRAME;
+    gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+    if (do_frame_parallel_encode) {
+      // If max_parallel_frames is not exceeded and if the frame will not be
+      // temporally filtered, encode the next internal ARF frame in parallel.
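+      // An internal ARF with arf_src_offset >= TF_LOOKAHEAD_IDX_THR may be
+      // temporally filtered (cf. av1_gop_is_second_arf()), so only closer
+      // internal ARFs are marked with frame_parallel_level 2 here.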
+ if (*parallel_frame_count > 1 && + *parallel_frame_count <= max_parallel_frames) { + if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR) + gf_group->frame_parallel_level[*frame_ind] = 2; + *parallel_frame_count = 1; + } + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + + // Get the boost factor for intermediate ARF frames. + gf_group->arf_boost[*frame_ind] = + av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, m, end - m, + m - start, NULL, NULL, 0); + ++(*frame_ind); + + // Frames displayed before this internal ARF. + set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc, + frame_info, start, m, cur_frame_idx, frame_ind, + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, + cur_disp_idx, layer_depth + 1); + + // Overlay for internal ARF. + gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->display_idx[*frame_ind] = *cur_disp_idx; + gf_group->arf_boost[*frame_ind] = 0; + gf_group->layer_depth[*frame_ind] = layer_depth; + gf_group->frame_type[*frame_ind] = INTER_FRAME; + gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); + ++(*frame_ind); + ++(*cur_frame_idx); + ++(*cur_disp_idx); + + // Frames displayed after this internal ARF. + set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc, + frame_info, m + 1, end, cur_frame_idx, frame_ind, + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, + cur_disp_idx, layer_depth + 1); + } +} + +static int construct_multi_layer_gf_structure( + AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group, + RATE_CONTROL *rc, FRAME_INFO *const frame_info, int baseline_gf_interval, + FRAME_UPDATE_TYPE first_frame_update_type) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + // TODO(angiebird): Why do we need "-1" here? + const int gf_interval = baseline_gf_interval - 1; + int frame_index = 0; + int cur_frame_index = 0; + + // Set the display order hint for the first frame in the GF_GROUP. + int cur_disp_index = (first_frame_update_type == KF_UPDATE) + ? 0 + : cpi->common.current_frame.frame_number; + + // Initialize gf_group->frame_parallel_level, gf_group->is_frame_non_ref, + // gf_group->src_offset and gf_group->is_frame_dropped with 0. + memset(gf_group->frame_parallel_level, 0, + sizeof(gf_group->frame_parallel_level)); + memset(gf_group->is_frame_non_ref, 0, sizeof(gf_group->is_frame_non_ref)); + memset(gf_group->src_offset, 0, sizeof(gf_group->src_offset)); + memset(gf_group->is_frame_dropped, 0, sizeof(gf_group->is_frame_dropped)); + // Initialize gf_group->skip_frame_refresh and gf_group->skip_frame_as_ref + // with INVALID_IDX. + memset(gf_group->skip_frame_refresh, INVALID_IDX, + sizeof(gf_group->skip_frame_refresh)); + memset(gf_group->skip_frame_as_ref, INVALID_IDX, + sizeof(gf_group->skip_frame_as_ref)); + + int kf_decomp = cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1; + // This is a patch that fixes https://crbug.com/aomedia/3163 + // enable_keyframe_filtering > 1 will introduce an extra overlay frame at + // key frame location. However when + // baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH, we can't + // afford to have an extra overlay frame. Otherwise, the gf_group->size will + // become MAX_STATIC_GF_GROUP_LENGTH + 1, which causes memory error. 
+  // A cheap solution is to turn off kf_decomp here.
+  // TODO(angiebird): Find a systematic way to solve this issue.
+  if (baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH) {
+    kf_decomp = 0;
+  }
+  if (first_frame_update_type == KF_UPDATE) {
+    gf_group->update_type[frame_index] = kf_decomp ? ARF_UPDATE : KF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 0;
+    gf_group->frame_type[frame_index] = KEY_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_RESET;
+    gf_group->max_layer_depth = 0;
+    gf_group->display_idx[frame_index] = cur_disp_index;
+    if (!kf_decomp) cur_disp_index++;
+    ++frame_index;
+
+    if (kf_decomp) {
+      gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      gf_group->arf_src_offset[frame_index] = 0;
+      gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+      gf_group->layer_depth[frame_index] = 0;
+      gf_group->frame_type[frame_index] = INTER_FRAME;
+      gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+      gf_group->max_layer_depth = 0;
+      gf_group->display_idx[frame_index] = cur_disp_index;
+      cur_disp_index++;
+      ++frame_index;
+    }
+    cur_frame_index++;
+  }
+
+  if (first_frame_update_type == GF_UPDATE) {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 0;
+    gf_group->frame_type[frame_index] = INTER_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = 0;
+    gf_group->display_idx[frame_index] = cur_disp_index;
+    cur_disp_index++;
+    ++frame_index;
+    ++cur_frame_index;
+  }
+
+  // ALTREF.
+  const int use_altref = gf_group->max_layer_depth_allowed > 0;
+  int is_fwd_kf = rc->frames_to_fwd_kf == gf_interval;
+
+  if (use_altref) {
+    gf_group->update_type[frame_index] = ARF_UPDATE;
+    gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = 1;
+    gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost;
+    gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
+    gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = 1;
+    gf_group->arf_index = frame_index;
+    gf_group->display_idx[frame_index] =
+        cur_disp_index + gf_group->arf_src_offset[frame_index];
+    ++frame_index;
+  } else {
+    gf_group->arf_index = -1;
+  }
+
+  // Flag to indicate if multi-layer configuration is complete.
+  int is_multi_layer_configured = 0;
+
+  // Running count of no. of frames that are part of a given parallel
+  // encode set in a gf_group. Value of 1 indicates no parallel encode.
+  int parallel_frame_count = 1;
+  // Enable parallel encode of frames if gf_group has a multi-layer pyramid
+  // structure with minimum 4 layers.
+  int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref &&
+                                  gf_group->max_layer_depth_allowed >= 4);
+
+  int first_frame_index = cur_frame_index;
+  if (do_frame_parallel_encode) {
+    // construct_multi_layer_gf_structure() takes the input parameter
+    // 'gf_interval' as p_rc->baseline_gf_interval - 1. Below code computes the
+    // actual GF_GROUP length by compensating for this offset.
+    int actual_gf_length = ((first_frame_update_type == KF_UPDATE) ||
+                            (first_frame_update_type == GF_UPDATE))
+                               ? gf_interval
+                               : gf_interval + 1;
+
+    // In order to facilitate parallel encoding of frames in lower layer depths,
+    // encode reordering is done. Currently encode reordering is enabled only
+    // for gf-intervals 16 and 32. NOTE: Since the buffer holding the
+    // reference frames is of size 8 (ref_frame_map[REF_FRAMES]), there is a
+    // limitation on the number of hidden frames possible at any given point and
+    // hence the reordering is enabled only for gf-intervals 16 and 32.
+    // Disabling encode reordering for gf-interval 14 since some cross-frame
+    // dependencies related to temporal filtering for FPMT are currently not
+    // handled.
+    int disable_gf14_reorder = 1;
+    if (actual_gf_length == 14 && !disable_gf14_reorder) {
+      // This array holds the gf index of INTNL_ARF_UPDATE frames in the slot
+      // corresponding to their display order hint. This is used while
+      // configuring the LF_UPDATE frames and INTNL_OVERLAY_UPDATE frames.
+      int doh_gf_index_map[FIXED_GF_INTERVAL];
+      // Initialize doh_gf_index_map with INVALID_IDX.
+      memset(&doh_gf_index_map[0], INVALID_IDX,
+             (sizeof(doh_gf_index_map[0]) * FIXED_GF_INTERVAL));
+
+      FRAME_REORDER_INFO arf_frame_stats[REF_FRAMES - 1];
+      // Store the stats corresponding to layer 1 frame.
+      fill_arf_frame_stats(arf_frame_stats, 0, actual_gf_length, 1,
+                           actual_gf_length);
+      int count_arf_frames = 1;
+
+      // Sets multi-layer params for gf-interval 14 to consecutively encode
+      // frames in the same layer depth, i.e., encode order would be 0-> 14->
+      // 7-> 3-> 10-> 5-> 12-> 1-> 2-> 4-> 6-> 8-> 9-> 11-> 13.
+      // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames.
+      set_multi_layer_params_for_gf14(
+          twopass, &cpi->twopass_frame, p_rc, frame_info, gf_group,
+          arf_frame_stats, &cur_frame_index, &frame_index, &count_arf_frames,
+          doh_gf_index_map, &parallel_frame_count, &first_frame_index,
+          &cur_disp_index, actual_gf_length, use_altref + 1,
+          cpi->ppi->num_fp_contexts);
+
+      // Set gf_group->skip_frame_refresh.
+      for (int i = 0; i < actual_gf_length; i++) {
+        int count = 0;
+        if (gf_group->update_type[i] == INTNL_ARF_UPDATE) {
+          for (int j = 0; j < i; j++) {
+            // Store the display order hint of the frames which would not
+            // have been displayed at the encode call of frame 'i'.
+            if ((gf_group->display_idx[j] < gf_group->display_idx[i]) &&
+                gf_group->update_type[j] == INTNL_ARF_UPDATE) {
+              gf_group->skip_frame_refresh[i][count++] =
+                  gf_group->display_idx[j];
+            }
+          }
+        }
+      }
+    } else {
+      // Set layer depth threshold for reordering as per the gf length.
+      int depth_thr = (actual_gf_length == 16)   ? 3
+                      : (actual_gf_length == 32) ? 4
+                                                 : INT_MAX;
+
+      set_multi_layer_params_for_fp(
+          twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info,
+          cur_frame_index, gf_interval, &cur_frame_index, &frame_index,
+          &parallel_frame_count, cpi->ppi->num_fp_contexts,
+          do_frame_parallel_encode, &first_frame_index, depth_thr,
+          &cur_disp_index, use_altref + 1);
+    }
+    is_multi_layer_configured = 1;
+  }
+
+  // Rest of the frames.
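+  // When frame-parallel encode is not in use, the generic recursive
+  // set_multi_layer_params() below builds the pyramid instead.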
+  if (!is_multi_layer_configured)
+    set_multi_layer_params(twopass, &cpi->twopass_frame, gf_group, p_rc, rc,
+                           frame_info, cur_frame_index, gf_interval,
+                           &cur_frame_index, &frame_index,
+                           &parallel_frame_count, cpi->ppi->num_fp_contexts,
+                           do_frame_parallel_encode, &first_frame_index,
+                           &cur_disp_index, use_altref + 1);
+
+  if (use_altref) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->arf_src_offset[frame_index] = 0;
+    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+    gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+    gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+    gf_group->frame_type[frame_index] = INTER_FRAME;
+    gf_group->refbuf_state[frame_index] =
+        is_fwd_kf ? REFBUF_RESET : REFBUF_UPDATE;
+    gf_group->display_idx[frame_index] = cur_disp_index;
+    ++frame_index;
+  } else {
+    for (; cur_frame_index <= gf_interval; ++cur_frame_index) {
+      gf_group->update_type[frame_index] = LF_UPDATE;
+      gf_group->arf_src_offset[frame_index] = 0;
+      gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+      gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+      gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+      gf_group->frame_type[frame_index] = INTER_FRAME;
+      gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+      gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+      set_src_offset(gf_group, &first_frame_index, cur_frame_index,
+                     frame_index);
+      gf_group->display_idx[frame_index] = cur_disp_index;
+      cur_disp_index++;
+      ++frame_index;
+    }
+  }
+  if (do_frame_parallel_encode) {
+    // Iterate through the gf_group and reset frame_parallel_level to 0 in case
+    // a frame is marked as frame_parallel_level 1 with no subsequent
+    // frame_parallel_level 2 frame(s).
+    int level1_frame_idx = INT_MAX;
+    int level2_frame_count = 0;
+    for (int frame_idx = 0; frame_idx < frame_index; frame_idx++) {
+      if (gf_group->frame_parallel_level[frame_idx] == 1) {
+        // Set frame_parallel_level to 0 if only one frame is present in a
+        // parallel encode set.
+        if (level1_frame_idx != INT_MAX && !level2_frame_count)
+          gf_group->frame_parallel_level[level1_frame_idx] = 0;
+        // Book-keep frame_idx of frame_parallel_level 1 frame and reset the
+        // count of frame_parallel_level 2 frames in the corresponding parallel
+        // encode set.
+        level1_frame_idx = frame_idx;
+        level2_frame_count = 0;
+      }
+      if (gf_group->frame_parallel_level[frame_idx] == 2) level2_frame_count++;
+    }
+    // If frame_parallel_level is set to 1 for the last LF_UPDATE
+    // frame in the gf_group, reset it to zero since there are no subsequent
+    // frames in the gf_group.
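+    // Note: do_frame_parallel_encode implies use_altref, so the entry at
+    // frame_index - 1 is the trailing OVERLAY_UPDATE; the frame just before
+    // it (frame_index - 2) is therefore the last candidate checked here.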
+    if (gf_group->frame_parallel_level[frame_index - 2] == 1) {
+      assert(gf_group->update_type[frame_index - 2] == LF_UPDATE);
+      gf_group->frame_parallel_level[frame_index - 2] = 0;
+    }
+  }
+
+  for (int gf_idx = frame_index; gf_idx < MAX_STATIC_GF_GROUP_LENGTH;
+       ++gf_idx) {
+    gf_group->update_type[gf_idx] = LF_UPDATE;
+    gf_group->arf_src_offset[gf_idx] = 0;
+    gf_group->cur_frame_idx[gf_idx] = gf_idx;
+    gf_group->layer_depth[gf_idx] = MAX_ARF_LAYERS;
+    gf_group->arf_boost[gf_idx] = NORMAL_BOOST;
+    gf_group->frame_type[gf_idx] = INTER_FRAME;
+    gf_group->refbuf_state[gf_idx] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+  }
+
+  return frame_index;
+}
+
+static void set_ld_layer_depth(GF_GROUP *gf_group, int gop_length) {
+  int log_gop_length = 0;
+  while ((1 << log_gop_length) < gop_length) {
+    ++log_gop_length;
+  }
+
+  for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+    int count = 0;
+    // Find the trailing zeros
+    for (; count < MAX_ARF_LAYERS; ++count) {
+      if ((gf_index >> count) & 0x01) break;
+    }
+    gf_group->layer_depth[gf_index] = AOMMAX(log_gop_length - count, 0);
+  }
+  gf_group->max_layer_depth = AOMMIN(log_gop_length, MAX_ARF_LAYERS);
+}
+
+void av1_gop_setup_structure(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  const int key_frame = rc->frames_since_key == 0;
+  FRAME_UPDATE_TYPE first_frame_update_type = ARF_UPDATE;
+
+  if (key_frame) {
+    first_frame_update_type = KF_UPDATE;
+    if (cpi->oxcf.kf_max_pyr_height != -1) {
+      gf_group->max_layer_depth_allowed = AOMMIN(
+          cpi->oxcf.kf_max_pyr_height, gf_group->max_layer_depth_allowed);
+    }
+  } else if (!cpi->ppi->gf_state.arf_gf_boost_lst) {
+    first_frame_update_type = GF_UPDATE;
+  }
+
+  gf_group->size = construct_multi_layer_gf_structure(
+      cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval,
+      first_frame_update_type);
+
+  if (gf_group->max_layer_depth_allowed == 0)
+    set_ld_layer_depth(gf_group, p_rc->baseline_gf_interval);
+}
+
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+                                   int gf_frame_index) {
+  return gf_group->frame_type[gf_frame_index] == KEY_FRAME &&
+         gf_group->refbuf_state[gf_frame_index] == REFBUF_UPDATE;
+}
+
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index) {
+  const int arf_src_offset = gf_group->arf_src_offset[gf_frame_index];
+  // TODO(angiebird): when gf_group->size == 32, it's possible to
+  // have "two" second arf. Check if this is acceptable.
+  if (gf_group->update_type[gf_frame_index] == INTNL_ARF_UPDATE &&
+      arf_src_offset >= TF_LOOKAHEAD_IDX_THR) {
+    return 1;
+  }
+  return 0;
+}
diff --git a/third_party/aom/av1/encoder/gop_structure.h b/third_party/aom/av1/encoder/gop_structure.h
new file mode 100644
index 0000000000..ff22f54136
--- /dev/null
+++ b/third_party/aom/av1/encoder/gop_structure.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+#define MIN_ARF_GF_BOOST 240
+#define NORMAL_BOOST 100
+
+/*!\endcond */
+
+/*!\brief Set up the Group-Of-Pictures structure for this GF_GROUP.
+ *
+ *\ingroup rate_control
+ *
+ * This function defines the Group-Of-Pictures structure for this GF_GROUP.
+ * This involves deciding where to place the various FRAME_UPDATE_TYPEs in
+ * the group. It does this primarily by updating entries in
+ * cpi->twopass.gf_group.update_type[].
+ *
+ * \param[in]    cpi          Top - level encoder instance structure
+ *
+ * \remark No return value but this function updates group data structures.
+ */
+void av1_gop_setup_structure(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in]   cpi           Top - level encoder instance structure
+ * \param[in]   rc            Rate control data
+ * \param[in]   gf_group      GF/ARF group data structure
+ * \param[in]   is_key_frame  Indicates if the first frame in the group is
+ *                            also a key frame.
+ * \param[in]   use_arf       Are ARF frames enabled or is this a GF only
+ *                            uni-directional group.
+ * \param[in]   gf_group_bits Bits available to be allocated.
+ *
+ * \remark No return but updates the rate control and group data structures
+ *         to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits);
+
+/*!\brief Check whether a frame in the GOP is a forward key frame
+ *
+ *\ingroup rate_control
+ *
+ * \param[in]   gf_group       GF/ARF group data structure
+ * \param[in]   gf_frame_index GOP index
+ *
+ * \return Return 1 if it is a forward key frame, otherwise return 0
+ */
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+                                   int gf_frame_index);
+
+/*!\brief Check whether a frame in the GOP is the second arf
+ *
+ *\ingroup rate_control
+ *
+ * \param[in]   gf_group       GF/ARF group data structure
+ * \param[in]   gf_frame_index GOP index
+ *
+ * \return Return 1 if it is the second arf
+ */
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_GOP_STRUCTURE_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
new file mode 100644
index 0000000000..945dc37331
--- /dev/null
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ +#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ +#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ + +/* Test vectors for emulation of different film grain types. + * Note that bit depth would be derived from the bitstream and + * not signaled in film grain metadata. The parameters are valid + * for any bit depth. + */ +static aom_film_grain_t film_grain_test_vectors[16] = { + /* Test 1 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 16, 0 }, + { 25, 136 }, + { 33, 144 }, + { 41, 160 }, + { 48, 168 }, + { 56, 136 }, + { 67, 128 }, + { 82, 144 }, + { 97, 152 }, + { 113, 144 }, + { 128, 176 }, + { 143, 168 }, + { 158, 176 }, + { 178, 184 } }, + 14 /* num_points_y */, + { { 16, 0 }, + { 20, 64 }, + { 28, 88 }, + { 60, 104 }, + { 90, 136 }, + { 105, 160 }, + { 134, 168 }, + { 168, 208 } }, + 8 /* num_cb_points */, + { { 16, 0 }, + { 28, 96 }, + { 56, 80 }, + { 66, 96 }, + { 80, 104 }, + { 108, 96 }, + { 122, 112 }, + { 137, 112 }, + { 169, 176 } }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 247 /* cb_mult */, + 192 /* cb_luma_mult */, + 18 /* cb_offset */, + 229 /* cr_mult */, + 192 /* cr_luma_mult */, + 54 /* cr_offset */, + 0 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /* chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 2 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cb_points */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 3 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 192 }, { 255, 192 } }, + 2 /* num_points_y */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cb_points */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 1 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 4 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 137 }, + { 53, 146 }, + { 63, 155 }, + 
{ 78, 155 }, + { 107, 150 }, + { 122, 147 }, + { 136, 147 }, + { 166, 153 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 72 }, + { 27, 82 }, + { 33, 91 }, + { 69, 121 }, + { 95, 143 }, + { 108, 154 }, + { 134, 169 }, + { 147, 177 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 24, 95 }, + { 54, 93 }, + { 65, 94 }, + { 79, 98 }, + { 109, 107 }, + { 124, 119 }, + { 139, 136 }, + { 169, 170 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 5 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_points_y */, + { + { 0, 96 }, + { 32, 90 }, + { 64, 83 }, + { 96, 76 }, + { 128, 68 }, + { 159, 59 }, + { 191, 48 }, + { 223, 34 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 34 }, + { 64, 48 }, + { 96, 59 }, + { 128, 68 }, + { 159, 76 }, + { 191, 83 }, + { 223, 90 }, + { 255, 96 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2, + -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0, + }, + { + -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2, + 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1063 /* random_seed */ + }, + /* Test 6 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 96 }, + { 20, 92 }, + { 39, 88 }, + { 59, 84 }, + { 78, 80 }, + { 98, 75 }, + { 118, 70 }, + { 137, 65 }, + { 157, 60 }, + { 177, 53 }, + { 196, 46 }, + { 216, 38 }, + { 235, 27 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 2754 /* random_seed */ + }, + /* Test 7 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 27 }, + { 39, 38 }, + { 59, 46 }, + { 78, 53 }, + { 98, 60 }, + { 118, 65 }, + { 
137, 70 }, + { 157, 75 }, + { 177, 80 }, + { 196, 84 }, + { 216, 88 }, + { 235, 92 }, + { 255, 96 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 8 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cb_points */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 9 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 10 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 11 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 32 }, { 255, 32 } }, + 2 /* 
num_points_y */, + { + { 0, 48 }, + { 32, 45 }, + { 64, 42 }, + { 96, 38 }, + { 128, 34 }, + { 159, 29 }, + { 191, 24 }, + { 223, 17 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 17 }, + { 64, 24 }, + { 96, 29 }, + { 128, 34 }, + { 159, 38 }, + { 191, 42 }, + { 223, 45 }, + { 255, 48 }, + }, + 9 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1357 /* random_seed */ + }, + /* Test 12 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 49 }, + { 39, 69 }, + { 46, 84 }, + { 53, 91 }, + { 63, 100 }, + { 78, 114 }, + { 92, 134 }, + { 164, 139 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 31 }, + { 26, 42 }, + { 33, 54 }, + { 40, 65 }, + { 47, 72 }, + { 56, 85 }, + { 84, 123 }, + { 152, 157 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 25, 14 }, + { 39, 33 }, + { 47, 40 }, + { 54, 47 }, + { 64, 62 }, + { 79, 76 }, + { 94, 83 }, + { 167, 101 }, + }, + 9 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 0 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 13 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 48 }, + { 20, 46 }, + { 39, 44 }, + { 59, 42 }, + { 78, 40 }, + { 98, 38 }, + { 118, 35 }, + { 137, 33 }, + { 157, 30 }, + { 177, 27 }, + { 196, 23 }, + { 216, 19 }, + { 235, 13 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 14 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 13 }, + { 39, 19 }, + { 59, 23 }, + { 78, 27 }, + { 98, 30 }, + { 118, 33 }, + { 137, 35 }, + { 157, 38 }, + { 177, 40 }, + { 196, 42 }, + { 216, 44 }, + { 235, 46 }, + { 255, 48 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 
10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 15 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 1 /* num_points_y */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cb_points */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 }, + { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 }, + { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 1 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 16 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 58, 126 }, + { 87, 120 }, + { 97, 122 }, + { 112, 125 }, + { 126, 131 }, + { 141, 139 }, + { 199, 153 }, + }, + 8 /* num_points_y */, + { + { 16, 0 }, + { 59, 68 }, + { 66, 76 }, + { 73, 82 }, + { 79, 85 }, + { 86, 86 }, + { 151, 95 }, + { 192, 101 }, + }, + 8 /* num_cb_points */, + { + { 16, 0 }, + { 59, 64 }, + { 89, 80 }, + { 99, 86 }, + { 114, 90 }, + { 129, 93 }, + { 144, 97 }, + { 203, 85 }, + }, + 8 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 2 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, +}; +#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c new file mode 100644 index 0000000000..8037b59bef --- /dev/null +++ b/third_party/aom/av1/encoder/hash.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/hash.h" +#include "config/av1_rtcd.h" + +static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, + uint8_t *pData, uint32_t dataLength) { + for (uint32_t i = 0; i < dataLength; i++) { + const uint8_t index = (uint8_t)( + (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^ + pData[i]); + p_crc_calculator->remainder <<= 8; + p_crc_calculator->remainder ^= p_crc_calculator->table[index]; + } +} + +static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { + p_crc_calculator->remainder = 0; +} + +static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) { + return p_crc_calculator->remainder & p_crc_calculator->final_result_mask; +} + +static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) { + const uint32_t high_bit = 1 << (p_crc_calculator->bits - 1); + const uint32_t byte_high_bit = 1 << (8 - 1); + + for (uint32_t value = 0; value < 256; value++) { + uint32_t remainder = 0; + for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) { + if (value & mask) { + remainder ^= high_bit; + } + + if (remainder & high_bit) { + remainder <<= 1; + remainder ^= p_crc_calculator->trunc_poly; + } else { + remainder <<= 1; + } + } + p_crc_calculator->table[value] = remainder; + } +} + +void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, + uint32_t truncPoly) { + p_crc_calculator->remainder = 0; + p_crc_calculator->bits = bits; + p_crc_calculator->trunc_poly = truncPoly; + p_crc_calculator->final_result_mask = (1 << bits) - 1; + crc_calculator_init_table(p_crc_calculator); +} + +uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, + int length) { + crc_calculator_reset(p_crc_calculator); + crc_calculator_process_data(p_crc_calculator, p, length); + return crc_calculator_get_crc(p_crc_calculator); +} + +/* CRC-32C (iSCSI) polynomial in reversed bit order. */ +#define POLY 0x82f63b78 + +/* Construct table for software CRC-32C calculation. */ +void av1_crc32c_calculator_init(CRC32C *p_crc32c) { + uint32_t crc; + + for (int n = 0; n < 256; n++) { + crc = n; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + p_crc32c->table[0][n] = crc; + } + for (int n = 0; n < 256; n++) { + crc = p_crc32c->table[0][n]; + for (int k = 1; k < 8; k++) { + crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8); + p_crc32c->table[k][n] = crc; + } + } +} + +/* Table-driven software version as a fall-back. This is about 15 times slower + than using the hardware instructions. This assumes little-endian integers, + as is the case on Intel processors that the assembler code here is for. 
*/ +uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len) { + const uint8_t *next = (const uint8_t *)(buf); + uint64_t crc; + CRC32C *p = (CRC32C *)c; + crc = 0 ^ 0xffffffff; + while (len && ((uintptr_t)next & 7) != 0) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + while (len >= 8) { + crc ^= *(uint64_t *)next; + crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^ + p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^ + p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^ + p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56]; + next += 8; + len -= 8; + } + while (len) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + return (uint32_t)crc ^ 0xffffffff; +} diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h new file mode 100644 index 0000000000..d8e8cc3a0b --- /dev/null +++ b/third_party/aom/av1/encoder/hash.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_HASH_H_ +#define AOM_AV1_ENCODER_HASH_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _crc_calculator { + uint32_t remainder; + uint32_t trunc_poly; + uint32_t bits; + uint32_t table[256]; + uint32_t final_result_mask; +} CRC_CALCULATOR; + +// Initialize the crc calculator. It must be executed at least once before +// calling av1_get_crc_value(). +void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, + uint32_t truncPoly); +uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, + int length); + +// CRC32C: POLY = 0x82f63b78; +typedef struct _CRC32C { + /* Table for a quadword-at-a-time software crc. */ + uint32_t table[8][256]; +} CRC32C; + +// init table for software version crc32c +void av1_crc32c_calculator_init(CRC32C *p_crc32c); + +#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HASH_H_ diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c new file mode 100644 index 0000000000..8b04e22d6c --- /dev/null +++ b/third_party/aom/av1/encoder/hash_motion.c @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
+
+#define kSrcBits 16
+#define kBlockSizeBits 3
+#define kMaxAddr (1 << (kSrcBits + kBlockSizeBits))
+
+// TODO(youzhou@microsoft.com): is higher than 8 bits screen content supported?
+// If yes, fix this function
+static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src,
+                                                     int stride,
+                                                     uint8_t *p_pixels_in1D) {
+  const uint8_t *p_pel = y_src;
+  int index = 0;
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 2; j++) {
+      p_pixels_in1D[index++] = p_pel[j];
+    }
+    p_pel += stride;
+  }
+}
+
+static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src,
+                                                      int stride,
+                                                      uint16_t *p_pixels_in1D) {
+  const uint16_t *p_pel = y_src;
+  int index = 0;
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 2; j++) {
+      p_pixels_in1D[index++] = p_pel[j];
+    }
+    p_pel += stride;
+  }
+}
+
+static int is_block_2x2_row_same_value(const uint8_t *p) {
+  if (p[0] != p[1] || p[2] != p[3]) {
+    return 0;
+  }
+  return 1;
+}
+
+static int is_block16_2x2_row_same_value(const uint16_t *p) {
+  if (p[0] != p[1] || p[2] != p[3]) {
+    return 0;
+  }
+  return 1;
+}
+
+static int is_block_2x2_col_same_value(const uint8_t *p) {
+  if ((p[0] != p[2]) || (p[1] != p[3])) {
+    return 0;
+  }
+  return 1;
+}
+
+static int is_block16_2x2_col_same_value(const uint16_t *p) {
+  if ((p[0] != p[2]) || (p[1] != p[3])) {
+    return 0;
+  }
+  return 1;
+}
+
+// The hash value (hash_value1) consists of two parts: the first 3 bits relate
+// to the block size and the remaining 16 bits are the crc values. This
+// function is used to get the first 3 bits.
+static int hash_block_size_to_index(int block_size) {
+  switch (block_size) {
+    case 4: return 0;
+    case 8: return 1;
+    case 16: return 2;
+    case 32: return 3;
+    case 64: return 4;
+    case 128: return 5;
+    default: return -1;
+  }
+}
+
+void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) {
+  if (!intrabc_hash_info->g_crc_initialized) {
+    av1_crc_calculator_init(&intrabc_hash_info->crc_calculator1, 24, 0x5D6DCB);
+    av1_crc_calculator_init(&intrabc_hash_info->crc_calculator2, 24, 0x864CFB);
+    intrabc_hash_info->g_crc_initialized = 1;
+  }
+  intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL;
+}
+
+void av1_hash_table_clear_all(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table == NULL) {
+    return;
+  }
+  for (int i = 0; i < kMaxAddr; i++) {
+    if (p_hash_table->p_lookup_table[i] != NULL) {
+      aom_vector_destroy(p_hash_table->p_lookup_table[i]);
+      aom_free(p_hash_table->p_lookup_table[i]);
+      p_hash_table->p_lookup_table[i] = NULL;
+    }
+  }
+}
+
+void av1_hash_table_destroy(hash_table *p_hash_table) {
+  av1_hash_table_clear_all(p_hash_table);
+  aom_free(p_hash_table->p_lookup_table);
+  p_hash_table->p_lookup_table = NULL;
+}
+
+bool av1_hash_table_create(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table != NULL) {
+    av1_hash_table_clear_all(p_hash_table);
+    return true;
+  }
+  p_hash_table->p_lookup_table =
+      (Vector **)aom_calloc(kMaxAddr, sizeof(p_hash_table->p_lookup_table[0]));
+  if (!p_hash_table->p_lookup_table) return false;
+  return true;
+}
+
+static bool hash_table_add_to_table(hash_table *p_hash_table,
+                                    uint32_t hash_value,
+                                    block_hash *curr_block_hash) {
+  if (p_hash_table->p_lookup_table[hash_value] ==
NULL) { + return false; + } + if (aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10, + sizeof(curr_block_hash[0])) == VECTOR_ERROR) + return false; + if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash) == VECTOR_ERROR) + return false; + } else { + if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash) == VECTOR_ERROR) + return false; + } + return true; +} + +int32_t av1_hash_table_count(const hash_table *p_hash_table, + uint32_t hash_value) { + if (p_hash_table->p_lookup_table[hash_value] == NULL) { + return 0; + } else { + return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size); + } +} + +Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, + uint32_t hash_value) { + assert(av1_hash_table_count(p_hash_table, hash_value) > 0); + return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]); +} + +int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, + uint32_t hash_value2) { + if (p_hash_table->p_lookup_table[hash_value1] == NULL) { + return 0; + } + Iterator iterator = + aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]); + Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]); + for (; !aom_iterator_equals(&iterator, &last); + aom_iterator_increment(&iterator)) { + if ((*(block_hash *)aom_iterator_get(&iterator)).hash_value2 == + hash_value2) { + return 1; + } + } + return 0; +} + +void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intrabc_hash_info, + const YV12_BUFFER_CONFIG *picture, + uint32_t *pic_block_hash[2], + int8_t *pic_block_same_info[3]) { + const int width = 2; + const int height = 2; + const int x_end = picture->y_crop_width - width + 1; + const int y_end = picture->y_crop_height - height + 1; + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + + const int length = width * 2; + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_short_array_by_block_2x2( + CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride + + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } else { + uint8_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_char_array_by_block_2x2( + picture->y_buffer + y_pos * picture->y_stride + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } +} + +void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const YV12_BUFFER_CONFIG *picture, + int block_size, + uint32_t *src_pic_block_hash[2], + uint32_t *dst_pic_block_hash[2], + int8_t *src_pic_block_same_info[3], + int8_t *dst_pic_block_same_info[3]) { + CRC_CALCULATOR *calc_1 = 
&intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + + const int pic_width = picture->y_crop_width; + const int x_end = picture->y_crop_width - block_size + 1; + const int y_end = picture->y_crop_height - block_size + 1; + + const int src_size = block_size >> 1; + const int quad_size = block_size >> 2; + + uint32_t p[4]; + const int length = sizeof(p); + + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + p[0] = src_pic_block_hash[0][pos]; + p[1] = src_pic_block_hash[0][pos + src_size]; + p[2] = src_pic_block_hash[0][pos + src_size * pic_width]; + p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size]; + dst_pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, (uint8_t *)p, length); + + p[0] = src_pic_block_hash[1][pos]; + p[1] = src_pic_block_hash[1][pos + src_size]; + p[2] = src_pic_block_hash[1][pos + src_size * pic_width]; + p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size]; + dst_pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, (uint8_t *)p, length); + + dst_pic_block_same_info[0][pos] = + src_pic_block_same_info[0][pos] && + src_pic_block_same_info[0][pos + quad_size] && + src_pic_block_same_info[0][pos + src_size] && + src_pic_block_same_info[0][pos + src_size * pic_width] && + src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] && + src_pic_block_same_info[0][pos + src_size * pic_width + src_size]; + + dst_pic_block_same_info[1][pos] = + src_pic_block_same_info[1][pos] && + src_pic_block_same_info[1][pos + src_size] && + src_pic_block_same_info[1][pos + quad_size * pic_width] && + src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] && + src_pic_block_same_info[1][pos + src_size * pic_width] && + src_pic_block_same_info[1][pos + src_size * pic_width + src_size]; + pos++; + } + pos += block_size - 1; + } + + if (block_size >= 4) { + const int size_minus_1 = block_size - 1; + pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + dst_pic_block_same_info[2][pos] = + (!dst_pic_block_same_info[0][pos] && + !dst_pic_block_same_info[1][pos]) || + (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0)); + pos++; + } + pos += block_size - 1; + } + } +} + +bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, + uint32_t *pic_hash[2], + int8_t *pic_is_same, + int pic_width, int pic_height, + int block_size) { + const int x_end = pic_width - block_size + 1; + const int y_end = pic_height - block_size + 1; + + const int8_t *src_is_added = pic_is_same; + const uint32_t *src_hash[2] = { pic_hash[0], pic_hash[1] }; + + int add_value = hash_block_size_to_index(block_size); + assert(add_value >= 0); + add_value <<= kSrcBits; + const int crc_mask = (1 << kSrcBits) - 1; + + for (int x_pos = 0; x_pos < x_end; x_pos++) { + for (int y_pos = 0; y_pos < y_end; y_pos++) { + const int pos = y_pos * pic_width + x_pos; + // valid data + if (src_is_added[pos]) { + block_hash curr_block_hash; + curr_block_hash.x = x_pos; + curr_block_hash.y = y_pos; + + const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value; + curr_block_hash.hash_value2 = src_hash[1][pos]; + + if (!hash_table_add_to_table(p_hash_table, hash_value1, + &curr_block_hash)) { + return false; + } + } + } + } + return true; +} + +int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start) { + const int stride = picture->y_stride; 
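+  // A block is "horizontally perfect" when every row holds a single repeated
+  // sample value; scan each row and bail out on the first mismatch.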
+ const uint8_t *p = picture->y_buffer + y_start * stride + x_start; + + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j] != p16[0]) { + return 0; + } + } + p16 += stride; + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j] != p[0]) { + return 0; + } + } + p += stride; + } + } + + return 1; +} + +int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start) { + const int stride = picture->y_stride; + const uint8_t *p = picture->y_buffer + y_start * stride + x_start; + + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j * stride + i] != p16[i]) { + return 0; + } + } + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j * stride + i] != p[i]) { + return 0; + } + } + } + } + return 1; +} + +void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const uint8_t *y_src, int stride, int block_size, + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth) { + int add_value = hash_block_size_to_index(block_size); + assert(add_value >= 0); + add_value <<= kSrcBits; + const int crc_mask = (1 << kSrcBits) - 1; + + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + uint32_t **buf_1 = intrabc_hash_info->hash_value_buffer[0]; + uint32_t **buf_2 = intrabc_hash_info->hash_value_buffer[1]; + + // 2x2 subblock hash values in current CU + int sub_block_in_width = (block_size >> 1); + if (use_highbitdepth) { + uint16_t pixel_to_hash[4]; + uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src); + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_short_array_by_block_2x2( + y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + } + } + } else { + uint8_t pixel_to_hash[4]; + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, + stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + buf_1[0][pos] = + av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash)); + buf_2[0][pos] = + av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash)); + } + } + } + + int src_sub_block_in_width = sub_block_in_width; + sub_block_in_width >>= 1; + + int src_idx = 1; + int dst_idx = 0; + + // 4x4 subblock hash values to current block hash values + uint32_t to_hash[4]; + for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) { + src_idx = 1 - src_idx; + dst_idx = 1 - dst_idx; + + int dst_pos = 0; + for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) { + for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) { + int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1); + + 
assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(srcPos + src_sub_block_in_width + 1 < + AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + to_hash[0] = buf_1[src_idx][srcPos]; + to_hash[1] = buf_1[src_idx][srcPos + 1]; + to_hash[2] = buf_1[src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = buf_1[src_idx][srcPos + src_sub_block_in_width + 1]; + + buf_1[dst_idx][dst_pos] = + av1_get_crc_value(calc_1, (uint8_t *)to_hash, sizeof(to_hash)); + + to_hash[0] = buf_2[src_idx][srcPos]; + to_hash[1] = buf_2[src_idx][srcPos + 1]; + to_hash[2] = buf_2[src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = buf_2[src_idx][srcPos + src_sub_block_in_width + 1]; + buf_2[dst_idx][dst_pos] = + av1_get_crc_value(calc_2, (uint8_t *)to_hash, sizeof(to_hash)); + dst_pos++; + } + } + + src_sub_block_in_width = sub_block_in_width; + sub_block_in_width >>= 1; + } + + *hash_value1 = (buf_1[dst_idx][0] & crc_mask) + add_value; + *hash_value2 = buf_2[dst_idx][0]; +} diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h new file mode 100644 index 0000000000..8974ba27cb --- /dev/null +++ b/third_party/aom/av1/encoder/hash_motion.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_ +#define AOM_AV1_ENCODER_HASH_MOTION_H_ + +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_scale/yv12config.h" +#include "av1/encoder/hash.h" +#include "third_party/vector/vector.h" +#ifdef __cplusplus +extern "C" { +#endif + +// Block size used for force_integer_mv decisions +#define FORCE_INT_MV_DECISION_BLOCK_SIZE 8 + +// store a block's hash info. 
+// x and y are the position from the top left of the picture +// hash_value2 is used to store the second hash value +typedef struct _block_hash { + int16_t x; + int16_t y; + uint32_t hash_value2; +} block_hash; + +typedef struct _hash_table { + Vector **p_lookup_table; +} hash_table; + +struct intrabc_hash_info; + +typedef struct intrabc_hash_info { + // buffer for hash value calculation of a block + // used only in av1_get_block_hash_value() + // [first hash/second hash] + // [two buffers used ping-pong] + uint32_t *hash_value_buffer[2][2]; + hash_table intrabc_hash_table; + + CRC_CALCULATOR crc_calculator1; + CRC_CALCULATOR crc_calculator2; + int g_crc_initialized; +} IntraBCHashInfo; + +void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info); +void av1_hash_table_clear_all(hash_table *p_hash_table); +void av1_hash_table_destroy(hash_table *p_hash_table); +bool av1_hash_table_create(hash_table *p_hash_table); +int32_t av1_hash_table_count(const hash_table *p_hash_table, + uint32_t hash_value); +Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, + uint32_t hash_value); +int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, + uint32_t hash_value2); +void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intra_bc_hash_info, + const YV12_BUFFER_CONFIG *picture, + uint32_t *pic_block_hash[2], + int8_t *pic_block_same_info[3]); +void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info, + const YV12_BUFFER_CONFIG *picture, + int block_size, + uint32_t *src_pic_block_hash[2], + uint32_t *dst_pic_block_hash[2], + int8_t *src_pic_block_same_info[3], + int8_t *dst_pic_block_same_info[3]); +bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, + uint32_t *pic_hash[2], + int8_t *pic_is_same, + int pic_width, int pic_height, + int block_size); + +// check whether the block starts from (x_start, y_start) with the size of +// block_size x block_size has the same color in all rows +int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start); +// check whether the block starts from (x_start, y_start) with the size of +// block_size x block_size has the same color in all columns +int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start); + +void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const uint8_t *y_src, int stride, int block_size, + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HASH_MOTION_H_ diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c new file mode 100644 index 0000000000..a108e8148c --- /dev/null +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/idct.h" +#include "av1/common/blockd.h" +#include "av1/encoder/hybrid_fwd_txfm.h" + +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per + pixel. + Shared for both high and low bit depth. + */ +void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { + int i; + tran_high_t a1, b1, c1, d1, e1; + const int16_t *ip_pass0 = input; + const tran_low_t *ip = NULL; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip_pass0[0 * stride]; + b1 = ip_pass0[1 * stride]; + c1 = ip_pass0[2 * stride]; + d1 = ip_pass0[3 * stride]; + + a1 += b1; + d1 = d1 - c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)a1; + op[1] = (tran_low_t)c1; + op[2] = (tran_low_t)d1; + op[3] = (tran_low_t)b1; + + ip_pass0++; + op += 4; + } + ip = output; + op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + b1 = ip[4 * 1]; + c1 = ip[4 * 2]; + d1 = ip[4 * 3]; + + a1 += b1; + d1 -= c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[4 * 0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); + op[4 * 1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); + op[4 * 2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); + op[4 * 3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); + + ip++; + op++; + } +} + +static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + if (txfm_param->lossless) { + assert(tx_type == DCT_DCT); + av1_fwht4x4(src_diff, coeff, diff_stride); + return; + } + av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +#if 
!CONFIG_REALTIME_ONLY +static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} +#endif + +static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_32x64(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + bd); +} + +static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + bd); +} + +#if !CONFIG_REALTIME_ONLY +static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x16(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} +#endif + +static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = 
(int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + TxfmParam *txfm_param) { + if (txfm_param->bd == 8) + av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); + else + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_64X64: + highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X64: + highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X32: + highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); + break; + + case TX_32X32: + highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X16: + highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X8: + highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X8: + highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X4: + highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X16: + highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X8: + highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X32: + highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X16: + highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X4: + highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); + break; +#if !CONFIG_REALTIME_ONLY + case TX_4X16: + highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X4: + highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X32: + highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X8: + highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X64: + highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X16: + highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param); + break; +#endif + default: assert(0); break; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, + ptrdiff_t src_stride, + tran_low_t *coeff) { + switch (tx_size) { + // As the output transform co-efficients of 4x4 Hadamard transform can be + // represented using 15 bits (for 12-bit clip) use lowbd variant of + // hadamard_4x4. 
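+    // (Illustrative bound, assuming the usual >>1 per-pass normalization of
+    // the Hadamard helpers: a 12-bit residual has magnitude at most 4095 and
+    // the normalized 4x4 transform grows it by at most 4x, so |coeff| stays
+    // below 2^15 and int16_t intermediates cannot overflow.)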
+ case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break; + case TX_8X8: aom_highbd_hadamard_8x8(src_diff, src_stride, coeff); break; + case TX_16X16: + aom_highbd_hadamard_16x16(src_diff, src_stride, coeff); + break; + case TX_32X32: + aom_highbd_hadamard_32x32(src_diff, src_stride, coeff); + break; + default: assert(0); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + switch (tx_size) { + case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break; + case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break; + case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break; + case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break; + default: assert(0); + } +} + +void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, + const int16_t *src_diff, int src_stride, + tran_low_t *coeff) { + if (use_hadamard) { +#if CONFIG_AV1_HIGHBITDEPTH + if (bd_info.use_highbitdepth_buf) { + highbd_wht_fwd_txfm(tx_size, src_diff, src_stride, coeff); + } else { + wht_fwd_txfm(tx_size, src_diff, src_stride, coeff); + } +#else + wht_fwd_txfm(tx_size, src_diff, src_stride, coeff); +#endif // CONFIG_AV1_HIGHBITDEPTH + } else { + TxfmParam txfm_param; + txfm_param.tx_type = DCT_DCT; + txfm_param.tx_size = tx_size; + txfm_param.lossless = 0; + txfm_param.bd = bd_info.bit_depth; + txfm_param.is_hbd = bd_info.use_highbitdepth_buf; + txfm_param.tx_set_type = EXT_TX_SET_ALL16; + av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param); + } +} diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h new file mode 100644 index 0000000000..30f8a2258b --- /dev/null +++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + TxfmParam *txfm_param); + +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param); + +/*!\brief Apply Hadamard or DCT transform + * + * \callergraph + * DCT and Hadamard transforms are commonly used for quick RD score estimation. + * The coeff buffer's size should be equal to the number of pixels + * corresponding to tx_size. 
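+ * When use_hadamard is 0, a DCT_DCT forward transform of the same tx_size is
+ * applied instead, with lossless coding disabled.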
+ */ +void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, + const int16_t *src_diff, int src_stride, tran_low_t *coeff); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ diff --git a/third_party/aom/av1/encoder/interp_search.c b/third_party/aom/av1/encoder/interp_search.c new file mode 100644 index 0000000000..27235303c0 --- /dev/null +++ b/third_party/aom/av1/encoder/interp_search.c @@ -0,0 +1,801 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/pred_common.h" +#include "av1/encoder/interp_search.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/reconinter_enc.h" + +// return mv_diff +static INLINE int is_interp_filter_good_match( + const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi, + int skip_level) { + const int is_comp = has_second_ref(mi); + int i; + + for (i = 0; i < 1 + is_comp; ++i) { + if (st->ref_frames[i] != mi->ref_frame[i]) return INT_MAX; + } + + if (skip_level == 1 && is_comp) { + if (st->comp_type != mi->interinter_comp.type) return INT_MAX; + if (st->compound_idx != mi->compound_idx) return INT_MAX; + } + + int mv_diff = 0; + for (i = 0; i < 1 + is_comp; ++i) { + mv_diff += abs(st->mv[i].as_mv.row - mi->mv[i].as_mv.row) + + abs(st->mv[i].as_mv.col - mi->mv[i].as_mv.col); + } + return mv_diff; +} + +static INLINE int save_interp_filter_search_stat( + MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx) { + if (interp_filter_stats_idx < MAX_INTERP_FILTER_STATS) { + INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters, + { mbmi->mv[0], mbmi->mv[1] }, + { mbmi->ref_frame[0], + mbmi->ref_frame[1] }, + mbmi->interinter_comp.type, + mbmi->compound_idx, + rd, + pred_sse }; + interp_filter_stats[interp_filter_stats_idx] = stat; + interp_filter_stats_idx++; + } + return interp_filter_stats_idx; +} + +static INLINE int find_interp_filter_in_stats( + MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx, int skip_level) { + // [skip_levels][single or comp] + const int thr[2][2] = { { 0, 0 }, { 3, 7 } }; + const int is_comp = has_second_ref(mbmi); + + // Find good enough match. + // TODO(yunqing): Separate single-ref mode and comp mode stats for fast + // search. + int best = INT_MAX; + int match = -1; + for (int j = 0; j < interp_filter_stats_idx; ++j) { + const INTERPOLATION_FILTER_STATS *st = &interp_filter_stats[j]; + const int mv_diff = is_interp_filter_good_match(st, mbmi, skip_level); + // Exact match is found. 
+ if (mv_diff == 0) { + match = j; + break; + } else if (mv_diff < best && mv_diff <= thr[skip_level - 1][is_comp]) { + best = mv_diff; + match = j; + } + } + + if (match != -1) { + mbmi->interp_filters = interp_filter_stats[match].filters; + return match; + } + return -1; // no match result found +} + +int av1_find_interp_filter_match( + MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, + const InterpFilter assign_filter, const int need_search, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx) { + int match_found_idx = -1; + if (cpi->sf.interp_sf.use_interp_filter && need_search) + match_found_idx = find_interp_filter_in_stats( + mbmi, interp_filter_stats, interp_filter_stats_idx, + cpi->sf.interp_sf.use_interp_filter); + + if (!need_search || match_found_idx == -1) + set_default_interp_filters(mbmi, assign_filter); + return match_found_idx; +} + +static INLINE int get_switchable_rate(MACROBLOCK *const x, + const int_interpfilters filters, + const int ctx[2], int dual_filter) { + const InterpFilter filter0 = filters.as_filters.y_filter; + int inter_filter_cost = + x->mode_costs.switchable_interp_costs[ctx[0]][filter0]; + if (dual_filter) { + const InterpFilter filter1 = filters.as_filters.x_filter; + inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx[1]][filter1]; + } + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; +} + +// Build inter predictor and calculate model rd +// for a given plane. +static INLINE void interp_model_rd_eval( + MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int plane_from, int plane_to, + RD_STATS *rd_stats, int is_skip_build_pred) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + RD_STATS tmp_rd_stats; + av1_init_rd_stats(&tmp_rd_stats); + + // Skip inter predictor if the predictor is already available. + if (!is_skip_build_pred) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane_from, plane_to); + } + + model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model + ? MODELRD_LEGACY + : MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate, + &tmp_rd_stats.dist, &tmp_rd_stats.skip_txfm, &tmp_rd_stats.sse, NULL, + NULL, NULL); + + av1_merge_rd_stats(rd_stats, &tmp_rd_stats); +} + +// calculate the rdcost of given interpolation_filter +static INLINE int64_t interpolation_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, + RD_STATS *rd_stats_luma, RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2], + const int skip_pred) { + const AV1_COMMON *cm = &cpi->common; + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + RD_STATS this_rd_stats_luma, this_rd_stats; + + // Initialize rd_stats structures to default values. 
+ av1_init_rd_stats(&this_rd_stats_luma); + this_rd_stats = *rd_stats_luma; + const int_interpfilters last_best = mbmi->interp_filters; + mbmi->interp_filters = filter_sets[filter_idx]; + const int tmp_rs = + get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, + cm->seq_params->enable_dual_filter); + + int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0); + if (min_rd > *rd) { + mbmi->interp_filters = last_best; + return 0; + } + + (void)tile_data; + + assert(skip_pred != 2); + assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0)); + assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0)); + assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0)); + assert((rd_stats_luma->skip_txfm == 0) || (rd_stats_luma->skip_txfm == 1)); + assert((rd_stats->skip_txfm == 0) || (rd_stats->skip_txfm == 1)); + assert((skip_pred >= 0) && + (skip_pred <= interp_search_flags->default_interp_skip_flags)); + + // When skip_txfm pred is equal to default_interp_skip_flags, + // skip both luma and chroma MC. + // For mono-chrome images: + // num_planes = 1 and cpi->default_interp_skip_flags = 1, + // skip_pred = 1: skip both luma and chroma + // skip_pred = 0: Evaluate luma and as num_planes=1, + // skip chroma evaluation + int tmp_skip_pred = + (skip_pred == interp_search_flags->default_interp_skip_flags) + ? INTERP_SKIP_LUMA_SKIP_CHROMA + : skip_pred; + + switch (tmp_skip_pred) { + case INTERP_EVAL_LUMA_EVAL_CHROMA: + // skip_pred = 0: Evaluate both luma and chroma. + // Luma MC + interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y, + &this_rd_stats_luma, 0); + this_rd_stats = this_rd_stats_luma; +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + AOM_FALLTHROUGH_INTENDED; + case INTERP_SKIP_LUMA_EVAL_CHROMA: + // skip_pred = 1: skip luma evaluation (retain previous best luma stats) + // and do chroma evaluation. 
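+      // Chroma planes are evaluated one at a time so that the accumulated
+      // cost can be checked against the current best rd before each
+      // additional plane is predicted.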
+ for (int plane = 1; plane < num_planes; ++plane) { + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); + if (tmp_rd >= *rd) { + mbmi->interp_filters = last_best; + return 0; + } + interp_model_rd_eval(x, cpi, bsize, orig_dst, plane, plane, + &this_rd_stats, 0); + } + break; + case INTERP_SKIP_LUMA_SKIP_CHROMA: + // both luma and chroma evaluation is skipped + this_rd_stats = *rd_stats; + break; + case INTERP_EVAL_INVALID: + default: assert(0); return 0; + } + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); + + if (tmp_rd < *rd) { + *rd = tmp_rd; + *switchable_rate = tmp_rs; + if (skip_pred != interp_search_flags->default_interp_skip_flags) { + if (skip_pred == INTERP_EVAL_LUMA_EVAL_CHROMA) { + // Overwrite the data as current filter is the best one + *rd_stats_luma = this_rd_stats_luma; + *rd_stats = this_rd_stats; + // As luma MC data is computed, no need to recompute after the search + x->recalc_luma_mc_data = 0; + } else if (skip_pred == INTERP_SKIP_LUMA_EVAL_CHROMA) { + // As luma MC data is not computed, update of luma data can be skipped + *rd_stats = this_rd_stats; + // As luma MC data is not recomputed and current filter is the best, + // indicate the possibility of recomputing MC data + // If current buffer contains valid MC data, toggle to indicate that + // luma MC data needs to be recomputed + x->recalc_luma_mc_data ^= 1; + } + swap_dst_buf(xd, dst_bufs, num_planes); + } + return 1; + } + mbmi->interp_filters = last_best; + return 0; +} + +static INLINE INTERP_PRED_TYPE is_pred_filter_search_allowed( + const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize, + int_interpfilters *af, int_interpfilters *lf) { + const AV1_COMMON *cm = &cpi->common; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int bsl = mi_size_wide_log2[bsize]; + int is_horiz_eq = 0, is_vert_eq = 0; + + if (above_mbmi && is_inter_block(above_mbmi)) + *af = above_mbmi->interp_filters; + + if (left_mbmi && is_inter_block(left_mbmi)) *lf = left_mbmi->interp_filters; + + if (af->as_filters.x_filter != INTERP_INVALID) + is_horiz_eq = af->as_filters.x_filter == lf->as_filters.x_filter; + if (af->as_filters.y_filter != INTERP_INVALID) + is_vert_eq = af->as_filters.y_filter == lf->as_filters.y_filter; + + INTERP_PRED_TYPE pred_filter_type = (is_vert_eq << 1) + is_horiz_eq; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int pred_filter_enable = + cpi->sf.interp_sf.cb_pred_filter_search + ? 
(((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1 + : 0; + pred_filter_enable &= is_horiz_eq || is_vert_eq; + // pred_filter_search = 0: pred_filter is disabled + // pred_filter_search = 1: pred_filter is enabled and only horz pred matching + // pred_filter_search = 2: pred_filter is enabled and only vert pred matching + // pred_filter_search = 3: pred_filter is enabled and + // both vert, horz pred matching + return pred_filter_enable * pred_filter_type; +} + +static DUAL_FILTER_TYPE find_best_interp_rd_facade( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_pred, uint16_t allow_interp_mask, int is_w4_or_h4) { + int tmp_skip_pred = skip_pred; + DUAL_FILTER_TYPE best_filt_type = REG_REG; + + // If no filter are set to be evaluated, return from function + if (allow_interp_mask == 0x0) return best_filt_type; + // For block width or height is 4, skip the pred evaluation of SHARP_SHARP + tmp_skip_pred = is_w4_or_h4 + ? cpi->interp_search_flags.default_interp_skip_flags + : skip_pred; + + // Loop over the all filter types and evaluate for only allowed filter types + for (int filt_type = SHARP_SHARP; filt_type >= REG_REG; --filt_type) { + const int is_filter_allowed = + get_interp_filter_allowed_mask(allow_interp_mask, filt_type); + if (is_filter_allowed) + if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, + dst_bufs, filt_type, switchable_ctx, + tmp_skip_pred)) + best_filt_type = filt_type; + tmp_skip_pred = skip_pred; + } + return best_filt_type; +} + +static INLINE void pred_dual_interp_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_pred, INTERP_PRED_TYPE pred_filt_type, int_interpfilters *af, + int_interpfilters *lf) { + (void)lf; + assert(pred_filt_type > INTERP_HORZ_NEQ_VERT_NEQ); + assert(pred_filt_type < INTERP_PRED_TYPE_ALL); + uint16_t allowed_interp_mask = 0; + + if (pred_filt_type == INTERP_HORZ_EQ_VERT_NEQ) { + // pred_filter_search = 1: Only horizontal filter is matching + allowed_interp_mask = + av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.x_filter]; + } else if (pred_filt_type == INTERP_HORZ_NEQ_VERT_EQ) { + // pred_filter_search = 2: Only vertical filter is matching + allowed_interp_mask = + av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.y_filter]; + } else { + // pred_filter_search = 3: Both horizontal and vertical filter are matching + int filt_type = + af->as_filters.x_filter + af->as_filters.y_filter * SWITCHABLE_FILTERS; + set_interp_filter_allowed_mask(&allowed_interp_mask, filt_type); + } + // REG_REG is already been evaluated in the beginning + reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG); + find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, + rd_stats, switchable_rate, dst_bufs, + switchable_ctx, skip_pred, allowed_interp_mask, 0); +} +// Evaluate dual filter type +// a) Using above, left block interp filter +// b) Find the best horizontal filter and +// then evaluate 
corresponding vertical filters. +static INLINE void fast_dual_interp_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_hor, const int skip_ver) { + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ; + int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID); + int_interpfilters lf = af; + + if (!have_newmv_in_inter_mode(mbmi->mode)) { + pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf); + } + + if (pred_filter_type) { + pred_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + switchable_ctx, (skip_hor & skip_ver), + pred_filter_type, &af, &lf); + } else { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int best_dual_mode = 0; + int skip_pred = + bw <= 4 ? interp_search_flags->default_interp_skip_flags : skip_hor; + // TODO(any): Make use of find_best_interp_rd_facade() + // if speed impact is negligible + for (int i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) { + if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, + dst_bufs, i, switchable_ctx, skip_pred)) { + best_dual_mode = i; + } + skip_pred = skip_hor; + } + // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes + skip_pred = + bh <= 4 ? interp_search_flags->default_interp_skip_flags : skip_ver; + for (int i = (best_dual_mode + (SWITCHABLE_FILTERS * 2)); + i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) { + interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + i, switchable_ctx, skip_pred); + skip_pred = skip_ver; + } + } +} + +// Find the best interp filter if dual_interp_filter = 0 +static INLINE void find_best_non_dual_interp_filter( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_ver, const int skip_hor) { + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + int8_t i; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + uint16_t interp_filter_search_mask = + interp_search_flags->interp_filter_search_mask; + + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0); + const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1); + int use_actual_frame_probs = 1; + const int *switchable_interp_p0; + const int *switchable_interp_p1; +#if CONFIG_FPMT_TEST + use_actual_frame_probs = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 
0 : 1; + if (!use_actual_frame_probs) { + switchable_interp_p0 = (int *)cpi->ppi->temp_frame_probs + .switchable_interp_probs[update_type][ctx0]; + switchable_interp_p1 = (int *)cpi->ppi->temp_frame_probs + .switchable_interp_probs[update_type][ctx1]; + } +#endif + if (use_actual_frame_probs) { + switchable_interp_p0 = + cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx0]; + switchable_interp_p1 = + cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx1]; + } + static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 }; + const int thresh = thr[update_type]; + for (i = 0; i < SWITCHABLE_FILTERS; i++) { + // For non-dual case, the 2 dir's prob should be identical. + assert(switchable_interp_p0[i] == switchable_interp_p1[i]); + if (switchable_interp_p0[i] < thresh && + switchable_interp_p1[i] < thresh) { + DUAL_FILTER_TYPE filt_type = i + SWITCHABLE_FILTERS * i; + reset_interp_filter_allowed_mask(&interp_filter_search_mask, filt_type); + } + } + } + + // Regular filter evaluation should have been done and hence the same should + // be the winner + assert(x->e_mbd.mi[0]->interp_filters.as_int == filter_sets[0].as_int); + if ((skip_hor & skip_ver) != interp_search_flags->default_interp_skip_flags) { + INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ; + int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID); + int_interpfilters lf = af; + + pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf); + if (pred_filter_type) { + assert(af.as_filters.x_filter != INTERP_INVALID); + int filter_idx = SWITCHABLE * af.as_filters.x_filter; + // This assert tells that (filter_x == filter_y) for non-dual filter case + assert(filter_sets[filter_idx].as_filters.x_filter == + filter_sets[filter_idx].as_filters.y_filter); + if (cpi->sf.interp_sf.adaptive_interp_filter_search && + !(get_interp_filter_allowed_mask(interp_filter_search_mask, + filter_idx))) { + return; + } + if (filter_idx) { + interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + filter_idx, switchable_ctx, + (skip_hor & skip_ver)); + } + return; + } + } + // Reuse regular filter's modeled rd data for sharp filter for following + // cases + // 1) When bsize is 4x4 + // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and MV in vertical + // direction is full-pel + // 3) When block height is 4 (i.e. 
8x4/16x4 blocks) and MV in horizontal + // direction is full-pel + // TODO(any): Optimize cases 2 and 3 further if luma MV in relavant direction + // alone is full-pel + + if ((bsize == BLOCK_4X4) || + (block_size_wide[bsize] == 4 && + skip_ver == interp_search_flags->default_interp_skip_flags) || + (block_size_high[bsize] == 4 && + skip_hor == interp_search_flags->default_interp_skip_flags)) { + int skip_pred = skip_hor & skip_ver; + uint16_t allowed_interp_mask = 0; + + // REG_REG filter type is evaluated beforehand, hence skip it + set_interp_filter_allowed_mask(&allowed_interp_mask, SHARP_SHARP); + set_interp_filter_allowed_mask(&allowed_interp_mask, SMOOTH_SMOOTH); + if (cpi->sf.interp_sf.adaptive_interp_filter_search) + allowed_interp_mask &= interp_filter_search_mask; + + find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + switchable_ctx, skip_pred, allowed_interp_mask, + 1); + } else { + int skip_pred = (skip_hor & skip_ver); + for (i = (SWITCHABLE_FILTERS + 1); i < DUAL_FILTER_SET_SIZE; + i += (SWITCHABLE_FILTERS + 1)) { + // This assert tells that (filter_x == filter_y) for non-dual filter case + assert(filter_sets[i].as_filters.x_filter == + filter_sets[i].as_filters.y_filter); + if (cpi->sf.interp_sf.adaptive_interp_filter_search && + !(get_interp_filter_allowed_mask(interp_filter_search_mask, i))) { + continue; + } + interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, dst_bufs, + i, switchable_ctx, skip_pred); + // In first iteration, smooth filter is evaluated. If smooth filter + // (which is less sharper) is the winner among regular and smooth filters, + // sharp filter evaluation is skipped + // TODO(any): Refine this gating based on modelled rd only (i.e., by not + // accounting switchable filter rate) + if (cpi->sf.interp_sf.skip_sharp_interp_filter_search && + skip_pred != interp_search_flags->default_interp_skip_flags) { + if (mbmi->interp_filters.as_int == filter_sets[SMOOTH_SMOOTH].as_int) + break; + } + } + } +} + +static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x, + const AV1_COMP *const cpi, + int *skip_hor, int *skip_ver) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + const int is_compound = has_second_ref(mbmi); + assert(is_intrabc_block(mbmi) == 0); + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, mbmi->ref_frame[ref]); + // TODO(any): Refine skip flag calculation considering scaling + if (av1_is_scaled(sf)) { + *skip_hor = 0; + *skip_ver = 0; + break; + } + const MV mv = mbmi->mv[ref].as_mv; + int skip_hor_plane = 0; + int skip_ver_plane = 0; + for (int plane_idx = 0; plane_idx < AOMMAX(1, (num_planes - 1)); + ++plane_idx) { + struct macroblockd_plane *const pd = &xd->plane[plane_idx]; + const int bw = pd->width; + const int bh = pd->height; + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y); + const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + skip_hor_plane |= ((sub_x == 0) << plane_idx); + skip_ver_plane |= ((sub_y == 0) << plane_idx); + } + *skip_hor &= skip_hor_plane; + *skip_ver &= skip_ver_plane; + // It is not valid that "luma MV is sub-pel, whereas chroma MV is not" + assert(*skip_hor != 2); + 
assert(*skip_ver != 2); + } + // When compond prediction type is compound segment wedge, luma MC and chroma + // MC need to go hand in hand as mask generated during luma MC is reuired for + // chroma MC. If skip_hor = 0 and skip_ver = 1, mask used for chroma MC during + // vertical filter decision may be incorrect as temporary MC evaluation + // overwrites the mask. Make skip_ver as 0 for this case so that mask is + // populated during luma MC + if (is_compound && mbmi->compound_idx == 1 && + mbmi->interinter_comp.type == COMPOUND_DIFFWTD) { + assert(mbmi->comp_group_idx == 1); + if (*skip_hor == 0 && *skip_ver == 1) *skip_ver = 0; + } +} + +/*!\brief AV1 interpolation filter search + * + * \ingroup inter_mode_search + * + * \param[in] cpi Top-level encoder structure. + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during + * encoding. + * \param[in] x Pointer to struc holding all the data for + * the current macroblock. + * \param[in] bsize Current block size. + * \param[in] tmp_dst A temporary prediction buffer to hold a + * computed prediction. + * \param[in,out] orig_dst A prediction buffer to hold a computed + * prediction. This will eventually hold the + * final prediction, and the tmp_dst info will + * be copied here. + * \param[in,out] rd The RD cost associated with the selected + * interpolation filter parameters. + * \param[in,out] switchable_rate The rate associated with using a SWITCHABLE + * filter mode. + * \param[in,out] skip_build_pred Indicates whether or not to build the inter + * predictor. If this is 0, the inter predictor + * has already been built and thus we can avoid + * repeating computation. + * \param[in] args HandleInterModeArgs struct holding + * miscellaneous arguments for inter mode + * search. See the documentation for this + * struct for a description of each member. + * \param[in] ref_best_rd Best RD found so far for this block. + * It is used for early termination of this + * search if the RD exceeds this value. + * + * \return Returns INT64_MAX if the filter parameters are invalid and the + * current motion mode being tested should be skipped. It returns 0 if the + * parameter search is a success. 
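+ * A return value of 0 also covers the case where a previously saved result
+ * for the same reference frames and a sufficiently close mv is found; the
+ * cached rd and pred_sse are then reused and no filter search is performed.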
+ */ +int64_t av1_interpolation_filter_search( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst, + int64_t *const rd, int *const switchable_rate, int *skip_build_pred, + HandleInterModeArgs *args, int64_t ref_best_rd) { + const AV1_COMMON *cm = &cpi->common; + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int need_search = av1_is_interp_needed(xd); + const int ref_frame = xd->mi[0]->ref_frame[0]; + RD_STATS rd_stats_luma, rd_stats; + + // Initialization of rd_stats structures with default values + av1_init_rd_stats(&rd_stats_luma); + av1_init_rd_stats(&rd_stats); + + int match_found_idx = -1; + const InterpFilter assign_filter = cm->features.interp_filter; + + match_found_idx = av1_find_interp_filter_match( + mbmi, cpi, assign_filter, need_search, args->interp_filter_stats, + args->interp_filter_stats_idx); + + if (match_found_idx != -1) { + *rd = args->interp_filter_stats[match_found_idx].rd; + x->pred_sse[ref_frame] = + args->interp_filter_stats[match_found_idx].pred_sse; + *skip_build_pred = 0; + return 0; + } + + int switchable_ctx[2]; + switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0); + switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); + *switchable_rate = + get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, + cm->seq_params->enable_dual_filter); + + // Do MC evaluation for default filter_type. + // Luma MC + interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y, + &rd_stats_luma, *skip_build_pred); + +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); + PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + // Chroma MC + if (num_planes > 1) { + interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_U, AOM_PLANE_V, + &rd_stats, *skip_build_pred); + } + *skip_build_pred = 1; + + av1_merge_rd_stats(&rd_stats, &rd_stats_luma); + + assert(rd_stats.rate >= 0); + + *rd = RDCOST(x->rdmult, *switchable_rate + rd_stats.rate, rd_stats.dist); + x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4); + + if (assign_filter != SWITCHABLE || match_found_idx != -1) { + return 0; + } + if (!need_search) { + int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + return 0; + } + if (args->modelled_rd != NULL) { + if (has_second_ref(mbmi)) { + const int ref_mv_idx = mbmi->ref_mv_idx; + MV_REFERENCE_FRAME *refs = mbmi->ref_frame; + const int mode0 = compound_ref0_mode(mbmi->mode); + const int mode1 = compound_ref1_mode(mbmi->mode); + const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) { + return INT64_MAX; + } + } + } + + x->recalc_luma_mc_data = 0; + // skip_flag=xx (in binary form) + // Setting 0th flag corresonds to skipping luma MC and setting 1st bt + // corresponds to skipping chroma MC skip_flag=0 corresponds to "Don't skip + // luma and chroma MC" Skip flag=1 corresponds to "Skip Luma MC only" + // Skip_flag=2 is not a valid case + // skip_flag=3 corresponds to "Skip both luma and chroma MC" + int skip_hor 
= interp_search_flags->default_interp_skip_flags; + int skip_ver = interp_search_flags->default_interp_skip_flags; + calc_interp_skip_pred_flag(x, cpi, &skip_hor, &skip_ver); + + // do interp_filter search + restore_dst_buf(xd, *tmp_dst, num_planes); + const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst }; + // Evaluate dual interp filters + if (cm->seq_params->enable_dual_filter) { + if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) { + fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + &rd_stats_luma, &rd_stats, switchable_rate, + dst_bufs, switchable_ctx, skip_hor, skip_ver); + } else { + // Use full interpolation filter search + uint16_t allowed_interp_mask = ALLOW_ALL_INTERP_FILT_MASK; + // REG_REG filter type is evaluated beforehand, so loop is repeated over + // REG_SMOOTH to SHARP_SHARP for full interpolation filter search + reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG); + find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, + &rd_stats_luma, &rd_stats, switchable_rate, + dst_bufs, switchable_ctx, + (skip_hor & skip_ver), allowed_interp_mask, 0); + } + } else { + // Evaluate non-dual interp filters + find_best_non_dual_interp_filter( + x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats, + switchable_rate, dst_bufs, switchable_ctx, skip_ver, skip_hor); + } + swap_dst_buf(xd, dst_bufs, num_planes); + // Recompute final MC data if required + if (x->recalc_luma_mc_data == 1) { + // Recomputing final luma MC data is required only if the same was skipped + // in either of the directions Condition below is necessary, but not + // sufficient + assert((skip_hor == 1) || (skip_ver == 1)); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4); + + // save search results + if (cpi->sf.interp_sf.use_interp_filter) { + assert(match_found_idx == -1); + args->interp_filter_stats_idx = save_interp_filter_search_stat( + mbmi, *rd, x->pred_sse[ref_frame], args->interp_filter_stats, + args->interp_filter_stats_idx); + } + return 0; +} diff --git a/third_party/aom/av1/encoder/interp_search.h b/third_party/aom/av1/encoder/interp_search.h new file mode 100644 index 0000000000..9815e0bcfb --- /dev/null +++ b/third_party/aom/av1/encoder/interp_search.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ +#define AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ + +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rdopt_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ +#define MAX_INTERP_FILTER_STATS 128 +#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) + +typedef struct { + int_interpfilters filters; + int_mv mv[2]; + int8_t ref_frames[2]; + COMPOUND_TYPE comp_type; + int compound_idx; + int64_t rd; + unsigned int pred_sse; +} INTERPOLATION_FILTER_STATS; +/*!\endcond */ + +/*!\brief Miscellaneous arguments for inter mode search. + */ +typedef struct HandleInterModeArgs { + /*! + * Buffer for the above predictor in OBMC + */ + uint8_t *above_pred_buf[MAX_MB_PLANE]; + /*! + * Stride for the above predictor in OBMC + */ + int above_pred_stride[MAX_MB_PLANE]; + /*! + * Buffer for the left predictor in OBMC + */ + uint8_t *left_pred_buf[MAX_MB_PLANE]; + /*! + * Stride for the left predictor in OBMC + */ + int left_pred_stride[MAX_MB_PLANE]; + /*! + * Pointer to the first member in a 2D array which holds + * single reference mode motion vectors to be used as a starting + * point in the mv search for compound modes. Each array is length REF_FRAMES, + * meaning there is a slot for a single reference motion vector for + * each possible reference frame. The 2D array consists of N of these arrays, + * where N is the length of the reference mv stack computed for the single + * reference case for that particular reference frame. + */ + int_mv (*single_newmv)[REF_FRAMES]; + /*! + * Pointer to the first array of a 2D array with the same setup as + * single_newmv array above. This is a 2D array to hold the rate + * corresponding to each of the single reference mode motion vectors + * held in single_newmv. + */ + int (*single_newmv_rate)[REF_FRAMES]; + /*! + * Pointer to the first array of a 2D array with the same setup as + * single_newmv array above. This is a 2D array to hold a 0 or 1 + * validity value corresponding to each of the single reference mode motion + * vectors held in single_newmv. + */ + int (*single_newmv_valid)[REF_FRAMES]; + /*! + * Pointer to the first array in a 3D array of predicted rate-distortion. + * The dimensions of this structure are: + * (number of possible inter modes) X + * (number of reference MVs) X + * (number of reference frames). + */ + int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; + /*! + * Holds an estimated entropy cost for picking the current reference frame. + * This is used to compute an rd estimate. + */ + int ref_frame_cost; + /*! + * Holds an estimated entropy cost for picking single or compound + * reference. This is used to compute an rd estimate. + */ + int single_comp_cost; + /*! + * Pointer to the first element in a 3D array holding rd's of + * SIMPLE_TRANSLATION used to prune out the motion mode search in single ref + * modes used to determine compound ref modes. The full structure is: + * (number of inter modes) X (length of refmv list) X (number of ref frames) + */ + int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; + /*! + * An integer value 0 or 1 which indicates whether or not to skip the motion + * mode search and default to SIMPLE_TRANSLATION as a speed feature. + */ + int skip_motion_mode; + /*! + * Initialized to false. If true, skips interpolation filter search and uses + * the default EIGHTTAP_REGULAR. + */ + bool skip_ifs; + /*! + * A pointer to the first element in an array of INTERINTRA_MODE types. 
This + * contains the best inter_intra mode for each reference frame. + */ + INTERINTRA_MODE *inter_intra_mode; + /*! + * Array of saved interpolation filter stats collected to avoid repeating + * an interpolation filter search when the mv and ref_frame are the same + * as a previous search. + */ + INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS]; + + /*! + * Stack to store full pixel search start mv of NEWMV mode. + */ + FULLPEL_MV start_mv_stack[(MAX_REF_MV_SEARCH - 1) * 2]; + + /*! + * Stack to store ref_mv_idx of NEWMV mode. + */ + uint8_t ref_mv_idx_stack[(MAX_REF_MV_SEARCH - 1) * 2]; + + /*! + * Count of mvs in start mv stack. + */ + int start_mv_cnt; + + /*! + * Index of the last set of saved stats in the interp_filter_stats array. + */ + int interp_filter_stats_idx; + /*! + * Estimated wedge index. + */ + int wedge_index; + /*! + * Estimated wedge sign. + */ + int wedge_sign; + /*! + * Estimated diff wtd index. + */ + int diffwtd_index; + /*! + * Estimated cmp mode. + */ + int cmp_mode[MODE_CTX_REF_FRAMES]; + /*! + * The best sse during single new_mv search. Note that the sse here comes from + * single_motion_search, and not from interpolation_filter_search. This has + * two implications: + * 1. The mv used to calculate the sse here does not have to be the best sse + * found in handle_inter_mode. + * 2. Even if the mvs agree, the sse here can differ from the sse in \ref + * MACROBLOCK::pred_sse due to different interpolation filter used. + */ + unsigned int best_single_sse_in_refs[REF_FRAMES]; + /*! + * Holds the sse of best mode so far in the mode evaluation process. This is + * used in intermediate termination of NEWMV mode evaluation. + */ + unsigned int best_pred_sse; +} HandleInterModeArgs; + +/*!\cond */ +static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = { + { 0x00000000 }, { 0x00010000 }, { 0x00020000 }, // y = 0 + { 0x00000001 }, { 0x00010001 }, { 0x00020001 }, // y = 1 + { 0x00000002 }, { 0x00010002 }, { 0x00020002 }, // y = 2 +}; + +int av1_find_interp_filter_match( + MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, + const InterpFilter assign_filter, const int need_search, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx); + +int64_t av1_interpolation_filter_search( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst, + int64_t *const rd, int *const switchable_rate, int *skip_build_pred, + HandleInterModeArgs *args, int64_t ref_best_rd); + +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ diff --git a/third_party/aom/av1/encoder/intra_mode_search.c b/third_party/aom/av1/encoder/intra_mode_search.c new file mode 100644 index 0000000000..99b0af2f8e --- /dev/null +++ b/third_party/aom/av1/encoder/intra_mode_search.c @@ -0,0 +1,1739 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/intra_mode_search.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/palette.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/tx_search.h" + +// Even though there are 7 delta angles, this macro is set to 9 to facilitate +// the rd threshold check to prune -3 and 3 delta angles. +#define SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY (2 * MAX_ANGLE_DELTA + 3) + +// The order for evaluating delta angles while processing the luma directional +// intra modes. Currently, this order of evaluation is applicable only when +// speed feature prune_luma_odd_delta_angles_in_intra is enabled. In this case, +// even angles are evaluated first in order to facilitate the pruning of odd +// delta angles based on the rd costs of the neighboring delta angles. +static const int8_t luma_delta_angles_order[2 * MAX_ANGLE_DELTA] = { + -2, 2, -3, -1, 1, 3, +}; + +/*!\cond */ +static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { + DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED, + SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED, + D67_PRED, D113_PRED, D45_PRED, +}; + +static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { + UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED, + UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, + UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED, + UV_D113_PRED, UV_D45_PRED, +}; + +// The bitmask corresponds to the filter intra modes as defined in enums.h +// FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to +// disable the evaluation of corresponding filter intra mode. The table +// av1_derived_filter_intra_mode_used_flag is used when speed feature +// prune_filter_intra_level is 1. The evaluated filter intra modes are union +// of the following: +// 1) FILTER_DC_PRED +// 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED, +// D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED). +static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = { + 0x01, // DC_PRED: 0000 0001 + 0x03, // V_PRED: 0000 0011 + 0x05, // H_PRED: 0000 0101 + 0x01, // D45_PRED: 0000 0001 + 0x01, // D135_PRED: 0000 0001 + 0x01, // D113_PRED: 0000 0001 + 0x09, // D157_PRED: 0000 1001 + 0x01, // D203_PRED: 0000 0001 + 0x01, // D67_PRED: 0000 0001 + 0x01, // SMOOTH_PRED: 0000 0001 + 0x01, // SMOOTH_V_PRED: 0000 0001 + 0x01, // SMOOTH_H_PRED: 0000 0001 + 0x11 // PAETH_PRED: 0001 0001 +}; + +// The bitmask corresponds to the chroma intra modes as defined in enums.h +// UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to +// disable the evaluation of corresponding chroma intra mode. The table +// av1_derived_chroma_intra_mode_used_flag is used when speed feature +// prune_chroma_modes_using_luma_winner is enabled. The evaluated chroma +// intra modes are union of the following: +// 1) UV_DC_PRED +// 2) UV_SMOOTH_PRED +// 3) UV_CFL_PRED +// 4) mode that corresponds to luma intra mode winner (Eg : UV_V_PRED if luma +// intra mode winner is V_PRED). 
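+// Worked illustration (per the UV_PREDICTION_MODE ordering in enums.h, where
+// UV_DC_PRED is bit 0, UV_SMOOTH_PRED bit 9 and UV_CFL_PRED bit 13): the
+// V_PRED entry 0x2203 = 0010 0010 0000 0011 leaves exactly UV_DC_PRED,
+// UV_V_PRED, UV_SMOOTH_PRED and UV_CFL_PRED enabled, i.e. the base set plus
+// the mode matching the luma winner. At the use site the gate is simply
+//   av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode),
+// so a cleared bit skips that chroma mode outright.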
+static const uint16_t av1_derived_chroma_intra_mode_used_flag[INTRA_MODES] = { + 0x2201, // DC_PRED: 0010 0010 0000 0001 + 0x2203, // V_PRED: 0010 0010 0000 0011 + 0x2205, // H_PRED: 0010 0010 0000 0101 + 0x2209, // D45_PRED: 0010 0010 0000 1001 + 0x2211, // D135_PRED: 0010 0010 0001 0001 + 0x2221, // D113_PRED: 0010 0010 0010 0001 + 0x2241, // D157_PRED: 0010 0010 0100 0001 + 0x2281, // D203_PRED: 0010 0010 1000 0001 + 0x2301, // D67_PRED: 0010 0011 0000 0001 + 0x2201, // SMOOTH_PRED: 0010 0010 0000 0001 + 0x2601, // SMOOTH_V_PRED: 0010 0110 0000 0001 + 0x2a01, // SMOOTH_H_PRED: 0010 1010 0000 0001 + 0x3201 // PAETH_PRED: 0011 0010 0000 0001 +}; + +DECLARE_ALIGNED(16, static const uint8_t, all_zeros[MAX_SB_SIZE]) = { 0 }; +DECLARE_ALIGNED(16, static const uint16_t, + highbd_all_zeros[MAX_SB_SIZE]) = { 0 }; + +int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf, + const int stride, const int is_hbd) { + unsigned int sse; + + if (is_hbd) + return vf(buf, stride, CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse); + else + return vf(buf, stride, all_zeros, 0, &sse); +} + +// Computes average of log(1 + variance) across 4x4 sub-blocks for source and +// reconstructed blocks. +static void compute_avg_log_variance(const AV1_COMP *const cpi, MACROBLOCK *x, + const BLOCK_SIZE bs, + double *avg_log_src_variance, + double *avg_log_recon_variance) { + const MACROBLOCKD *const xd = &x->e_mbd; + const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; + const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1); + const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1); + const int right_overflow = + (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0; + const int bottom_overflow = + (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0; + const int bw = (MI_SIZE * mi_size_wide[bs] - right_overflow); + const int bh = (MI_SIZE * mi_size_high[bs] - bottom_overflow); + const int is_hbd = is_cur_buf_hbd(xd); + + for (int i = 0; i < bh; i += MI_SIZE) { + const int r = mi_row_in_sb + (i >> MI_SIZE_LOG2); + for (int j = 0; j < bw; j += MI_SIZE) { + const int c = mi_col_in_sb + (j >> MI_SIZE_LOG2); + const int mi_offset = r * mi_size_wide[sb_size] + c; + Block4x4VarInfo *block_4x4_var_info = + &x->src_var_info_of_4x4_sub_blocks[mi_offset]; + int src_var = block_4x4_var_info->var; + double log_src_var = block_4x4_var_info->log_var; + // Compute average of log(1 + variance) for the source block from 4x4 + // sub-block variance values. Calculate and store 4x4 sub-block variance + // and log(1 + variance), if the values present in + // src_var_of_4x4_sub_blocks are invalid. Reuse the same if it is readily + // available with valid values. + if (src_var < 0) { + src_var = av1_calc_normalized_variance( + cpi->ppi->fn_ptr[BLOCK_4X4].vf, + x->plane[0].src.buf + i * x->plane[0].src.stride + j, + x->plane[0].src.stride, is_hbd); + block_4x4_var_info->var = src_var; + log_src_var = log1p(src_var / 16.0); + block_4x4_var_info->log_var = log_src_var; + } else { + // When source variance is already calculated and available for + // retrieval, check if log(1 + variance) is also available. If it is + // available, then retrieve from buffer. Else, calculate the same and + // store to the buffer. 
+ if (log_src_var < 0) { + log_src_var = log1p(src_var / 16.0); + block_4x4_var_info->log_var = log_src_var; + } + } + *avg_log_src_variance += log_src_var; + + const int recon_var = av1_calc_normalized_variance( + cpi->ppi->fn_ptr[BLOCK_4X4].vf, + xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j, + xd->plane[0].dst.stride, is_hbd); + *avg_log_recon_variance += log1p(recon_var / 16.0); + } + } + + const int blocks = (bw * bh) / 16; + *avg_log_src_variance /= (double)blocks; + *avg_log_recon_variance /= (double)blocks; +} + +// Returns a factor to be applied to the RD value based on how well the +// reconstructed block variance matches the source variance. +static double intra_rd_variance_factor(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { + double threshold = INTRA_RD_VAR_THRESH(cpi->oxcf.speed); + // For non-positive threshold values, the comparison of source and + // reconstructed variances with threshold evaluates to false + // (src_var < threshold/rec_var < threshold) as these metrics are greater than + // than 0. Hence further calculations are skipped. + if (threshold <= 0) return 1.0; + + double variance_rd_factor = 1.0; + double avg_log_src_variance = 0.0; + double avg_log_recon_variance = 0.0; + double var_diff = 0.0; + + compute_avg_log_variance(cpi, x, bs, &avg_log_src_variance, + &avg_log_recon_variance); + + // Dont allow 0 to prevent / 0 below. + avg_log_src_variance += 0.000001; + avg_log_recon_variance += 0.000001; + + if (avg_log_src_variance >= avg_log_recon_variance) { + var_diff = (avg_log_src_variance - avg_log_recon_variance); + if ((var_diff > 0.5) && (avg_log_recon_variance < threshold)) { + variance_rd_factor = 1.0 + ((var_diff * 2) / avg_log_src_variance); + } + } else { + var_diff = (avg_log_recon_variance - avg_log_src_variance); + if ((var_diff > 0.5) && (avg_log_src_variance < threshold)) { + variance_rd_factor = 1.0 + (var_diff / (2 * avg_log_src_variance)); + } + } + + // Limit adjustment; + variance_rd_factor = AOMMIN(3.0, variance_rd_factor); + + return variance_rd_factor; +} +/*!\endcond */ + +/*!\brief Search for the best filter_intra mode when coding intra frame. + * + * \ingroup intra_mode_search + * \callergraph + * This function loops through all filter_intra modes to find the best one. + * + * \return Returns 1 if a new filter_intra mode is selected; 0 otherwise. + */ +static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, uint8_t *skippable, + BLOCK_SIZE bsize, int mode_cost, + PREDICTION_MODE best_mode_so_far, + int64_t *best_rd, int64_t *best_model_rd, + PICK_MODE_CONTEXT *ctx) { + // Skip the evaluation of filter intra modes. + if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0; + + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + int filter_intra_selected_flag = 0; + FILTER_INTRA_MODE mode; + TX_SIZE best_tx_size = TX_8X8; + FILTER_INTRA_MODE_INFO filter_intra_mode_info; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + av1_zero(filter_intra_mode_info); + mbmi->filter_intra_mode_info.use_filter_intra = 1; + mbmi->mode = DC_PRED; + mbmi->palette_mode_info.palette_size[0] = 0; + + // Skip the evaluation of filter-intra if cached MB_MODE_INFO does not have + // filter-intra as winner. 
+ if (x->use_mb_mode_cache && + !x->mb_mode_cache->filter_intra_mode_info.use_filter_intra) + return 0; + + for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { + int64_t this_rd; + RD_STATS tokenonly_rd_stats; + mbmi->filter_intra_mode_info.filter_intra_mode = mode; + + if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) && + !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] & + (1 << mode))) + continue; + + // Skip the evaluation of modes that do not match with the winner mode in + // x->mb_mode_cache. + if (x->use_mb_mode_cache && + mode != x->mb_mode_cache->filter_intra_mode_info.filter_intra_mode) + continue; + + if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) { + continue; + } + av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, + *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) continue; + const int this_rate = + tokenonly_rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + + // Visual quality adjustment based on recon vs source variance. + if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) { + this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize)); + } + + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd, + cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); + if (this_rd < *best_rd) { + *best_rd = this_rd; + best_tx_size = mbmi->tx_size; + filter_intra_mode_info = mbmi->filter_intra_mode_info; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + *rate = this_rate; + *rate_tokenonly = tokenonly_rd_stats.rate; + *distortion = tokenonly_rd_stats.dist; + *skippable = tokenonly_rd_stats.skip_txfm; + filter_intra_selected_flag = 1; + } + } + + if (filter_intra_selected_flag) { + mbmi->mode = DC_PRED; + mbmi->tx_size = best_tx_size; + mbmi->filter_intra_mode_info = filter_intra_mode_info; + av1_copy_array(ctx->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + return 1; + } else { + return 0; + } +} + +void av1_count_colors(const uint8_t *src, int stride, int rows, int cols, + int *val_count, int *num_colors) { + const int max_pix_val = 1 << 8; + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + ++val_count[this_val]; + } + } + int n = 0; + for (int i = 0; i < max_pix_val; ++i) { + if (val_count[i]) ++n; + } + *num_colors = n; +} + +void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, + int cols, int bit_depth, int *val_count, + int *bin_val_count, int *num_color_bins, + int *num_colors) { + assert(bit_depth <= 12); + const int max_bin_val = 1 << 8; + const int max_pix_val = 1 << bit_depth; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + memset(bin_val_count, 0, max_bin_val * sizeof(val_count[0])); + if (val_count != NULL) + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + /* + * Down-convert the pixels to 8-bit domain before counting. + * This provides consistency of behavior for palette search + * between lbd and hbd encodes. 
This down-converted pixels + * are only used for calculating the threshold (n). + */ + const int this_val = ((src[r * stride + c]) >> (bit_depth - 8)); + assert(this_val < max_bin_val); + if (this_val >= max_bin_val) continue; + ++bin_val_count[this_val]; + if (val_count != NULL) ++val_count[(src[r * stride + c])]; + } + } + int n = 0; + // Count the colors based on 8-bit domain used to gate the palette path + for (int i = 0; i < max_bin_val; ++i) { + if (bin_val_count[i]) ++n; + } + *num_color_bins = n; + + // Count the actual hbd colors used to create top_colors + n = 0; + if (val_count != NULL) { + for (int i = 0; i < max_pix_val; ++i) { + if (val_count[i]) ++n; + } + *num_colors = n; + } +} + +void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi, + int reorder_delta_angle_eval) { + if (mode_idx < INTRA_MODE_END) { + mbmi->mode = intra_rd_search_mode_order[mode_idx]; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + } else { + mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED; + int delta_angle_eval_idx = + (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2); + if (reorder_delta_angle_eval) { + mbmi->angle_delta[PLANE_TYPE_Y] = + luma_delta_angles_order[delta_angle_eval_idx]; + } else { + mbmi->angle_delta[PLANE_TYPE_Y] = + (delta_angle_eval_idx < 3 ? (delta_angle_eval_idx - 3) + : (delta_angle_eval_idx - 2)); + } + } +} + +static AOM_INLINE int get_model_rd_index_for_pruning( + const MACROBLOCK *const x, + const INTRA_MODE_SPEED_FEATURES *const intra_sf) { + const int top_intra_model_count_allowed = + intra_sf->top_intra_model_count_allowed; + if (!intra_sf->adapt_top_model_rd_count_using_neighbors) + return top_intra_model_count_allowed - 1; + + const MACROBLOCKD *const xd = &x->e_mbd; + const PREDICTION_MODE mode = xd->mi[0]->mode; + int model_rd_index_for_pruning = top_intra_model_count_allowed - 1; + int is_left_mode_neq_cur_mode = 0, is_above_mode_neq_cur_mode = 0; + if (xd->left_available) + is_left_mode_neq_cur_mode = xd->left_mbmi->mode != mode; + if (xd->up_available) + is_above_mode_neq_cur_mode = xd->above_mbmi->mode != mode; + // The pruning of luma intra modes is made more aggressive at lower quantizers + // and vice versa. The value for model_rd_index_for_pruning is derived as + // follows. + // qidx 0 to 127: Reduce the index of a candidate used for comparison only if + // the current mode does not match either of the available neighboring modes. + // qidx 128 to 255: Reduce the index of a candidate used for comparison only + // if the current mode does not match both the available neighboring modes. 
+ if (x->qindex <= 127) { + if (is_left_mode_neq_cur_mode || is_above_mode_neq_cur_mode) + model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0); + } else { + if (is_left_mode_neq_cur_mode && is_above_mode_neq_cur_mode) + model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0); + } + return model_rd_index_for_pruning; +} + +int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, + int64_t top_intra_model_rd[], int max_model_cnt_allowed, + int model_rd_index_for_pruning) { + const double thresh_best = 1.50; + const double thresh_top = 1.00; + for (int i = 0; i < max_model_cnt_allowed; i++) { + if (this_model_rd < top_intra_model_rd[i]) { + for (int j = max_model_cnt_allowed - 1; j > i; j--) { + top_intra_model_rd[j] = top_intra_model_rd[j - 1]; + } + top_intra_model_rd[i] = this_model_rd; + break; + } + } + if (top_intra_model_rd[model_rd_index_for_pruning] != INT64_MAX && + this_model_rd > + thresh_top * top_intra_model_rd[model_rd_index_for_pruning]) + return 1; + + if (this_model_rd != INT64_MAX && + this_model_rd > thresh_best * (*best_model_rd)) + return 1; + if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; + return 0; +} + +// Run RD calculation with given chroma intra prediction angle., and return +// the RD cost. Update the best mode info. if the RD cost is the best so far. +static int64_t pick_intra_angle_routine_sbuv( + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats, + int *best_angle_delta, int64_t *best_rd) { + MB_MODE_INFO *mbmi = x->e_mbd.mi[0]; + assert(!is_inter_block(mbmi)); + int this_rate; + int64_t this_rd; + RD_STATS tokenonly_rd_stats; + + if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in)) + return INT64_MAX; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + if (this_rd < *best_rd) { + *best_rd = this_rd; + *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; + *rate = this_rate; + rd_stats->rate = tokenonly_rd_stats.rate; + rd_stats->dist = tokenonly_rd_stats.dist; + rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm; + } + return this_rd; +} + +/*!\brief Search for the best angle delta for chroma prediction + * + * \ingroup intra_mode_search + * \callergraph + * Given a chroma directional intra prediction mode, this function will try to + * estimate the best delta_angle. + * + * \returns Return if there is a new mode with smaller rdcost than best_rd. + */ +static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int rate_overhead, + int64_t best_rd, int *rate, + RD_STATS *rd_stats) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + int i, angle_delta, best_angle_delta = 0; + int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; + + rd_stats->rate = INT_MAX; + rd_stats->skip_txfm = 0; + rd_stats->dist = INT64_MAX; + for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; + + for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (i = 0; i < 2; ++i) { + best_rd_in = (best_rd == INT64_MAX) + ? INT64_MAX + : (best_rd + (best_rd >> ((angle_delta == 0) ? 
3 : 5))); + mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta; + this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, + best_rd_in, rate, rd_stats, + &best_angle_delta, &best_rd); + rd_cost[2 * angle_delta + i] = this_rd; + if (angle_delta == 0) { + if (this_rd == INT64_MAX) return 0; + rd_cost[1] = this_rd; + break; + } + } + } + + assert(best_rd != INT64_MAX); + for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + int64_t rd_thresh; + for (i = 0; i < 2; ++i) { + int skip_search = 0; + rd_thresh = best_rd + (best_rd >> 5); + if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && + rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) + skip_search = 1; + if (!skip_search) { + mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta; + pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd, + rate, rd_stats, &best_angle_delta, + &best_rd); + } + } + } + + mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta; + return rd_stats->rate != INT_MAX; +} + +#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \ + (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1) + +static void cfl_idx_to_sign_and_alpha(int cfl_idx, CFL_SIGN_TYPE *cfl_sign, + int *cfl_alpha) { + int cfl_linear_idx = cfl_idx - CFL_INDEX_ZERO; + if (cfl_linear_idx == 0) { + *cfl_sign = CFL_SIGN_ZERO; + *cfl_alpha = 0; + } else { + *cfl_sign = cfl_linear_idx > 0 ? CFL_SIGN_POS : CFL_SIGN_NEG; + *cfl_alpha = abs(cfl_linear_idx) - 1; + } +} + +static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + int plane, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, int cfl_idx, + int fast_mode, RD_STATS *rd_stats) { + assert(IMPLIES(fast_mode, rd_stats == NULL)); + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int cfl_plane = get_cfl_pred_type(plane); + CFL_SIGN_TYPE cfl_sign; + int cfl_alpha; + cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha); + // We conly build CFL for a given plane, the other plane's sign is dummy + int dummy_sign = CFL_SIGN_NEG; + const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs; + const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx; + mbmi->cfl_alpha_signs = + PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign); + mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha; + int64_t cfl_cost; + if (fast_mode) { + cfl_cost = + intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0); + } else { + av1_init_rd_stats(rd_stats); + av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize, + tx_size, FTXS_NONE, 0); + av1_rd_cost_update(x->rdmult, rd_stats); + cfl_cost = rd_stats->rdcost; + } + mbmi->cfl_alpha_signs = orig_cfl_alpha_signs; + mbmi->cfl_alpha_idx = orig_cfl_alpha_idx; + return cfl_cost; +} + +static const int cfl_dir_ls[2] = { 1, -1 }; + +// If cfl_search_range is CFL_MAGS_SIZE, return zero. Otherwise return the index +// of the best alpha found using intra_model_rd(). 
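+// The estimate below is a greedy walk: starting from CFL_INDEX_ZERO it steps
+// outwards in each sign direction (cfl_dir_ls) and stops at the first index
+// whose fast SATD-based model cost does not improve on the best so far; the
+// full-RD refinement around this estimate happens later in
+// cfl_pick_plane_rd(). As an illustration of the index encoding used by
+// cfl_idx_to_sign_and_alpha(), CFL_INDEX_ZERO + 3 decodes to
+// (CFL_SIGN_POS, alpha 2) and CFL_INDEX_ZERO - 1 to (CFL_SIGN_NEG, alpha 0).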
+static int cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x, + int plane, TX_SIZE tx_size, + int cfl_search_range) { + assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); + + if (cfl_search_range == CFL_MAGS_SIZE) return CFL_INDEX_ZERO; + + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->uv_mode == UV_CFL_PRED); + const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + + int est_best_cfl_idx = CFL_INDEX_ZERO; + int fast_mode = 1; + int start_cfl_idx = CFL_INDEX_ZERO; + int64_t best_cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, + start_cfl_idx, fast_mode, NULL); + for (int si = 0; si < 2; ++si) { + const int dir = cfl_dir_ls[si]; + for (int i = 1; i < CFL_MAGS_SIZE; ++i) { + int cfl_idx = start_cfl_idx + dir * i; + if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break; + int64_t cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, + cfl_idx, fast_mode, NULL); + if (cfl_cost < best_cfl_cost) { + best_cfl_cost = cfl_cost; + est_best_cfl_idx = cfl_idx; + } else { + break; + } + } + } + return est_best_cfl_idx; +} + +static AOM_INLINE void set_invalid_cfl_parameters( + uint8_t *best_cfl_alpha_idx, int8_t *best_cfl_alpha_signs) { + *best_cfl_alpha_idx = 0; + *best_cfl_alpha_signs = 0; +} + +static void cfl_pick_plane_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + int plane, TX_SIZE tx_size, int cfl_search_range, + RD_STATS cfl_rd_arr[CFL_MAGS_SIZE], + int est_best_cfl_idx) { + assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->uv_mode == UV_CFL_PRED); + const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + + for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) { + av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]); + } + + int fast_mode = 0; + int start_cfl_idx = est_best_cfl_idx; + cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode, + &cfl_rd_arr[start_cfl_idx]); + + if (cfl_search_range == 1) return; + + for (int si = 0; si < 2; ++si) { + const int dir = cfl_dir_ls[si]; + for (int i = 1; i < cfl_search_range; ++i) { + int cfl_idx = start_cfl_idx + dir * i; + if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break; + cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode, + &cfl_rd_arr[cfl_idx]); + } + } +} + +/*!\brief Pick the optimal parameters for Chroma to Luma (CFL) component + * + * \ingroup intra_mode_search + * \callergraph + * + * This function will use DCT_DCT followed by computing SATD (sum of absolute + * transformed differences) to estimate the RD score and find the best possible + * CFL parameter. + * + * Then the function will apply a full RD search near the best possible CFL + * parameter to find the best actual CFL parameter. + * + * Side effect: + * We use ths buffers in x->plane[] and xd->plane[] as throw-away buffers for RD + * search. + * + * \param[in] x Encoder prediction block structure. + * \param[in] cpi Top-level encoder instance structure. + * \param[in] tx_size Transform size. + * \param[in] ref_best_rd Reference best RD. + * \param[in] cfl_search_range The search range of full RD search near the + * estimated best CFL parameter. 
+ * + * \param[out] best_rd_stats RD stats of the best CFL parameter + * \param[out] best_cfl_alpha_idx Best CFL alpha index + * \param[out] best_cfl_alpha_signs Best CFL joint signs + * + */ +static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, + TX_SIZE tx_size, int64_t ref_best_rd, + int cfl_search_range, RD_STATS *best_rd_stats, + uint8_t *best_cfl_alpha_idx, + int8_t *best_cfl_alpha_signs) { + assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); + const ModeCosts *mode_costs = &x->mode_costs; + RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE]; + RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE]; + MACROBLOCKD *const xd = &x->e_mbd; + int est_best_cfl_idx_u, est_best_cfl_idx_v; + + av1_invalid_rd_stats(best_rd_stats); + + // As the dc pred data is same for different values of alpha, enable the + // caching of dc pred data. Call clear_cfl_dc_pred_cache_flags() before + // returning to avoid the unintentional usage of cached dc pred data. + xd->cfl.use_dc_pred_cache = true; + // Evaluate alpha parameter of each chroma plane. + est_best_cfl_idx_u = + cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range); + est_best_cfl_idx_v = + cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range); + + if (cfl_search_range == 1) { + // For cfl_search_range=1, further refinement of alpha is not enabled. Hence + // CfL index=0 for both the chroma planes implies invalid CfL mode. + if (est_best_cfl_idx_u == CFL_INDEX_ZERO && + est_best_cfl_idx_v == CFL_INDEX_ZERO) { + set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs); + clear_cfl_dc_pred_cache_flags(&xd->cfl); + return 0; + } + + int cfl_alpha_u, cfl_alpha_v; + CFL_SIGN_TYPE cfl_sign_u, cfl_sign_v; + const MB_MODE_INFO *mbmi = xd->mi[0]; + cfl_idx_to_sign_and_alpha(est_best_cfl_idx_u, &cfl_sign_u, &cfl_alpha_u); + cfl_idx_to_sign_and_alpha(est_best_cfl_idx_v, &cfl_sign_v, &cfl_alpha_v); + const int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1; + // Compute alpha and mode signaling rate. + const int rate_overhead = + mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u] + + mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v] + + mode_costs + ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_CFL_PRED]; + // Skip the CfL mode evaluation if the RD cost derived using the rate needed + // to signal the CfL mode and alpha parameter exceeds the ref_best_rd. + if (RDCOST(x->rdmult, rate_overhead, 0) > ref_best_rd) { + set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs); + clear_cfl_dc_pred_cache_flags(&xd->cfl); + return 0; + } + } + + // Compute the rd cost of each chroma plane using the alpha parameters which + // were already evaluated. 
+ cfl_pick_plane_rd(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u, + est_best_cfl_idx_u); + cfl_pick_plane_rd(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v, + est_best_cfl_idx_v); + + clear_cfl_dc_pred_cache_flags(&xd->cfl); + + for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) { + if (cfl_rd_arr_u[ui].rate == INT_MAX) continue; + int cfl_alpha_u; + CFL_SIGN_TYPE cfl_sign_u; + cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u); + for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) { + if (cfl_rd_arr_v[vi].rate == INT_MAX) continue; + int cfl_alpha_v; + CFL_SIGN_TYPE cfl_sign_v; + cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v); + // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a + // valid parameter for CFL + if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue; + int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1; + RD_STATS rd_stats = cfl_rd_arr_u[ui]; + av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]); + if (rd_stats.rate != INT_MAX) { + rd_stats.rate += + mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u]; + rd_stats.rate += + mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v]; + } + av1_rd_cost_update(x->rdmult, &rd_stats); + if (rd_stats.rdcost < best_rd_stats->rdcost) { + *best_rd_stats = rd_stats; + *best_cfl_alpha_idx = + (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v; + *best_cfl_alpha_signs = joint_sign; + } + } + } + if (best_rd_stats->rdcost >= ref_best_rd) { + av1_invalid_rd_stats(best_rd_stats); + // Set invalid CFL parameters here since the rdcost is not better than + // ref_best_rd. + set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs); + return 0; + } + return 1; +} + +static bool should_prune_chroma_smooth_pred_based_on_source_variance( + const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize) { + if (!cpi->sf.intra_sf.prune_smooth_intra_mode_for_chroma) return false; + + // If the source variance of both chroma planes is less than 20 (empirically + // derived), prune UV_SMOOTH_PRED. + for (int i = AOM_PLANE_U; i < av1_num_planes(&cpi->common); i++) { + const unsigned int variance = av1_get_perpixel_variance_facade( + cpi, &x->e_mbd, &x->plane[i].src, bsize, i); + if (variance >= 20) return false; + } + return true; +} + +int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, uint8_t *skippable, + BLOCK_SIZE bsize, TX_SIZE max_tx_size) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + MB_MODE_INFO best_mbmi = *mbmi; + int64_t best_rd = INT64_MAX, this_rd; + const ModeCosts *mode_costs = &x->mode_costs; + const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg; + + init_sbuv_mode(mbmi); + + // Return if the current block does not correspond to a chroma block. + if (!xd->is_chroma_ref) { + *rate = 0; + *rate_tokenonly = 0; + *distortion = 0; + *skippable = 1; + return INT64_MAX; + } + + // Only store reconstructed luma when there's chroma RDO. When there's no + // chroma RDO, the reconstructed luma will be stored in encode_superblock(). + xd->cfl.store_y = store_cfl_required_rdo(cm, x); + if (xd->cfl.store_y) { + // Restore reconstructed luma values. + // TODO(chiyotsai@google.com): right now we are re-computing the txfm in + // this function everytime we search through uv modes. There is some + // potential speed up here if we cache the result to avoid redundant + // computation. 
+ av1_encode_intra_block_plane(cpi, x, mbmi->bsize, AOM_PLANE_Y, + DRY_RUN_NORMAL, + cpi->optimize_seg_arr[mbmi->segment_id]); + xd->cfl.store_y = 0; + } + IntraModeSearchState intra_search_state; + init_intra_mode_search_state(&intra_search_state); + const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); + + // Search through all non-palette modes. + for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) { + int this_rate; + RD_STATS tokenonly_rd_stats; + UV_PREDICTION_MODE uv_mode = uv_rd_search_mode_order[mode_idx]; + + // Skip the current mode evaluation if the RD cost derived using the mode + // signaling rate exceeds the best_rd so far. + const int mode_rate = + mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode]; + if (RDCOST(x->rdmult, mode_rate, 0) > best_rd) continue; + + PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); + const int is_diagonal_mode = av1_is_diagonal_mode(intra_mode); + const int is_directional_mode = av1_is_directional_mode(intra_mode); + + if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra) + continue; + if (is_directional_mode && + !cpi->oxcf.intra_mode_cfg.enable_directional_intra) + continue; + + if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] & + (1 << uv_mode))) + continue; + if (!intra_mode_cfg->enable_smooth_intra && uv_mode >= UV_SMOOTH_PRED && + uv_mode <= UV_SMOOTH_H_PRED) + continue; + + if (!intra_mode_cfg->enable_paeth_intra && uv_mode == UV_PAETH_PRED) + continue; + + assert(mbmi->mode < INTRA_MODES); + if (cpi->sf.intra_sf.prune_chroma_modes_using_luma_winner && + !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode))) + continue; + + mbmi->uv_mode = uv_mode; + + // Init variables for cfl and angle delta + const SPEED_FEATURES *sf = &cpi->sf; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + if (uv_mode == UV_CFL_PRED) { + if (!cfl_allowed || !intra_mode_cfg->enable_cfl_intra) continue; + assert(!is_directional_mode); + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd, + sf->intra_sf.cfl_search_range, &tokenonly_rd_stats, + &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) { + continue; + } + } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) && + intra_mode_cfg->enable_angle_delta) { + if (sf->intra_sf.chroma_intra_pruning_with_hog && + !intra_search_state.dir_mode_skip_mask_ready) { + static const float thresh[2][4] = { + { -1.2f, 0.0f, 0.0f, 1.2f }, // Interframe + { -1.2f, -1.2f, -0.6f, 0.4f }, // Intraframe + }; + const int is_chroma = 1; + const int is_intra_frame = frame_is_intra_only(cm); + prune_intra_mode_with_hog( + x, bsize, cm->seq_params->sb_size, + thresh[is_intra_frame] + [sf->intra_sf.chroma_intra_pruning_with_hog - 1], + intra_search_state.directional_mode_skip_mask, is_chroma); + intra_search_state.dir_mode_skip_mask_ready = 1; + } + if (intra_search_state.directional_mode_skip_mask[uv_mode]) { + continue; + } + + // Search through angle delta + const int rate_overhead = + mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode]; + if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, + &this_rate, &tokenonly_rd_stats)) + continue; + } else { + if (uv_mode == UV_SMOOTH_PRED && + should_prune_chroma_smooth_pred_based_on_source_variance(cpi, x, + bsize)) + continue; + + // Predict directly if we don't need to search for angle delta. 
+ if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) { + continue; + } + } + const int mode_cost = + mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode]; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + + if (this_rd < best_rd) { + best_mbmi = *mbmi; + best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = tokenonly_rd_stats.rate; + *distortion = tokenonly_rd_stats.dist; + *skippable = tokenonly_rd_stats.skip_txfm; + } + } + + // Search palette mode + const int try_palette = + cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->bsize); + if (try_palette) { + uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; + av1_rd_pick_palette_intra_sbuv( + cpi, x, + mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][UV_DC_PRED], + best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly, + distortion, skippable); + } + + *mbmi = best_mbmi; + // Make sure we actually chose a mode + assert(best_rd < INT64_MAX); + return best_rd; +} + +// Searches palette mode for luma channel in inter frame. +int av1_search_palette_mode(IntraModeSearchState *intra_search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int ref_frame_cost, + PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + MB_MODE_INFO *const mbmi = x->e_mbd.mi[0]; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + int rate2 = 0; + int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd; + int skippable = 0; + uint8_t *const best_palette_color_map = + x->palette_buffer->best_palette_color_map; + uint8_t *const color_map = xd->plane[0].color_index_map; + MB_MODE_INFO best_mbmi_palette = *mbmi; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const ModeCosts *mode_costs = &x->mode_costs; + const int *const intra_mode_cost = + mode_costs->mbmode_cost[size_group_lookup[bsize]]; + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + av1_zero(pmi->palette_size); + + RD_STATS rd_stats_y; + av1_invalid_rd_stats(&rd_stats_y); + av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, + &best_rd_palette, &rd_stats_y.rate, NULL, + &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, + ctx, best_blk_skip, best_tx_type_map); + if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { + this_rd_cost->rdcost = INT64_MAX; + return skippable; + } + + memcpy(x->txfm_search_info.blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + + skippable = rd_stats_y.skip_txfm; + distortion2 = rd_stats_y.dist; + rate2 = rd_stats_y.rate + ref_frame_cost; + if (num_planes > 1) { + if (intra_search_state->rate_uv_intra == INT_MAX) { + // We have not found any good uv mode yet, so we need to search for it. 
+ TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra, + &intra_search_state->rate_uv_tokenonly, + &intra_search_state->dist_uvs, + &intra_search_state->skip_uvs, bsize, uv_tx); + intra_search_state->mode_uv = mbmi->uv_mode; + intra_search_state->pmi_uv = *pmi; + intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; + } + + // We have found at least one good uv mode before, so copy and paste it + // over. + mbmi->uv_mode = intra_search_state->mode_uv; + pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; + if (pmi->palette_size[1] > 0) { + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; + skippable = skippable && intra_search_state->skip_uvs; + distortion2 += intra_search_state->dist_uvs; + rate2 += intra_search_state->rate_uv_intra; + } + + if (skippable) { + rate2 -= rd_stats_y.rate; + if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly; + rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1]; + } else { + rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0]; + } + this_rd = RDCOST(x->rdmult, rate2, distortion2); + this_rd_cost->rate = rate2; + this_rd_cost->dist = distortion2; + this_rd_cost->rdcost = this_rd; + return skippable; +} + +void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int ref_frame_cost, + PICK_MODE_CONTEXT *ctx, + RD_STATS *this_rd_cost, int64_t best_rd) { + MB_MODE_INFO *const mbmi = x->e_mbd.mi[0]; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + MACROBLOCKD *const xd = &x->e_mbd; + int64_t best_rd_palette = best_rd, this_rd; + uint8_t *const best_palette_color_map = + x->palette_buffer->best_palette_color_map; + uint8_t *const color_map = xd->plane[0].color_index_map; + MB_MODE_INFO best_mbmi_palette = *mbmi; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const ModeCosts *mode_costs = &x->mode_costs; + const int *const intra_mode_cost = + mode_costs->mbmode_cost[size_group_lookup[bsize]]; + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + av1_zero(pmi->palette_size); + + RD_STATS rd_stats_y; + av1_invalid_rd_stats(&rd_stats_y); + av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, + &best_rd_palette, &rd_stats_y.rate, NULL, + &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, + ctx, best_blk_skip, best_tx_type_map); + if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { + this_rd_cost->rdcost = INT64_MAX; + return; + } + + memcpy(x->txfm_search_info.blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + + rd_stats_y.rate += ref_frame_cost; + + if (rd_stats_y.skip_txfm) { + rd_stats_y.rate = + ref_frame_cost + + mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1]; + } else { + rd_stats_y.rate += + mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0]; + } + 
this_rd = RDCOST(x->rdmult, rd_stats_y.rate, rd_stats_y.dist); + this_rd_cost->rate = rd_stats_y.rate; + this_rd_cost->dist = rd_stats_y.dist; + this_rd_cost->rdcost = this_rd; + this_rd_cost->skip_txfm = rd_stats_y.skip_txfm; +} + +/*!\brief Get the intra prediction by searching through tx_type and tx_size. + * + * \ingroup intra_mode_search + * \callergraph + * Currently this function is only used in the intra frame code path for + * winner-mode processing. + * + * \return Returns whether the current mode is an improvement over best_rd. + */ +static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, const int *bmode_costs, + int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, + uint8_t *skippable, + MB_MODE_INFO *best_mbmi, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + RD_STATS rd_stats; + // In order to improve txfm search, avoid rd based breakouts during winner + // mode evaluation. Hence passing ref_best_rd as INT64_MAX by default when the + // speed feature use_rd_based_breakout_for_intra_tx_search is disabled. + int64_t ref_best_rd = cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search + ? *best_rd + : INT64_MAX; + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, ref_best_rd); + if (rd_stats.rate == INT_MAX) return 0; + int this_rate_tokenonly = rd_stats.rate; + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) { + // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size + // in the tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size); + } + const int this_rate = + rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0); + const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist); + if (this_rd < *best_rd) { + *best_mbmi = *mbmi; + *best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = rd_stats.dist; + *skippable = rd_stats.skip_txfm; + av1_copy_array(ctx->blk_skip, x->txfm_search_info.blk_skip, + ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + return 1; + } + return 0; +} + +/*!\brief Search for the best filter_intra mode when coding inter frame. + * + * \ingroup intra_mode_search + * \callergraph + * This function loops through all filter_intra modes to find the best one. + * + * \remark Returns nothing, but updates the mbmi and rd_stats. 
+ */ +static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, + const PICK_MODE_CONTEXT *ctx, + RD_STATS *rd_stats_y, int mode_cost, + int64_t best_rd, + int64_t best_rd_so_far) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->mode == DC_PRED && + av1_filter_intra_allowed_bsize(&cpi->common, bsize)); + + RD_STATS rd_stats_y_fi; + int filter_intra_selected_flag = 0; + TX_SIZE best_tx_size = mbmi->tx_size; + FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + memcpy(best_blk_skip, x->txfm_search_info.blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + mbmi->filter_intra_mode_info.use_filter_intra = 1; + for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES; + ++fi_mode) { + mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, best_rd); + if (rd_stats_y_fi.rate == INT_MAX) continue; + const int this_rate_tmp = + rd_stats_y_fi.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); + const int64_t this_rd_tmp = + RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); + + if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) { + break; + } + if (this_rd_tmp < best_rd_so_far) { + best_tx_size = mbmi->tx_size; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + memcpy(best_blk_skip, x->txfm_search_info.blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + best_fi_mode = fi_mode; + *rd_stats_y = rd_stats_y_fi; + filter_intra_selected_flag = 1; + best_rd_so_far = this_rd_tmp; + } + } + + mbmi->tx_size = best_tx_size; + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(x->txfm_search_info.blk_skip, best_blk_skip, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + + if (filter_intra_selected_flag) { + mbmi->filter_intra_mode_info.use_filter_intra = 1; + mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode; + } else { + mbmi->filter_intra_mode_info.use_filter_intra = 0; + } +} + +// Evaluate a given luma intra-mode in inter frames. 
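+// The early exit at the top of this function compares best_rd against a lower
+// bound: known_rate counts only signalling that is unavoidable for this mode
+// (mode plus ref-frame cost, the intra cost penalty for modes other than
+// DC_PRED and PAETH_PRED, and the cheaper of the two skip_txfm flags), so
+// RDCOST(x->rdmult, known_rate, 0) can never exceed the final rd cost. If even
+// this bound is worse than best_rd, skip_intra_modes is set so the remaining
+// intra modes are not evaluated either.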
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, + int64_t best_rd, int *mode_cost_y, int64_t *rd_y, + int64_t *best_model_rd, + int64_t top_intra_model_rd[]) { + const AV1_COMMON *cm = &cpi->common; + const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->ref_frame[0] == INTRA_FRAME); + const PREDICTION_MODE mode = mbmi->mode; + const ModeCosts *mode_costs = &x->mode_costs; + const int mode_cost = + mode_costs->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost; + const int skip_ctx = av1_get_skip_txfm_context(xd); + + int known_rate = mode_cost; + const int intra_cost_penalty = av1_get_intra_cost_penalty( + cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q, + cm->seq_params->bit_depth); + + if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty; + known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0], + mode_costs->skip_txfm_cost[skip_ctx][1]); + const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0); + if (known_rd > best_rd) { + intra_search_state->skip_intra_modes = 1; + return 0; + } + + const int is_directional_mode = av1_is_directional_mode(mode); + if (is_directional_mode && av1_use_angle_delta(bsize) && + cpi->oxcf.intra_mode_cfg.enable_angle_delta) { + if (intra_sf->intra_pruning_with_hog && + !intra_search_state->dir_mode_skip_mask_ready) { + const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f }; + const int is_chroma = 0; + prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size, + thresh[intra_sf->intra_pruning_with_hog - 1], + intra_search_state->directional_mode_skip_mask, + is_chroma); + intra_search_state->dir_mode_skip_mask_ready = 1; + } + if (intra_search_state->directional_mode_skip_mask[mode]) return 0; + } + const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); + const int64_t this_model_rd = + intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); + + const int model_rd_index_for_pruning = + get_model_rd_index_for_pruning(x, intra_sf); + + if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd, + intra_sf->top_intra_model_count_allowed, + model_rd_index_for_pruning)) + return 0; + av1_init_rd_stats(rd_stats_y); + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd); + + // Pick filter intra modes. + if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + int try_filter_intra = 1; + int64_t best_rd_so_far = INT64_MAX; + if (rd_stats_y->rate != INT_MAX) { + // best_rd_so_far is the rdcost of DC_PRED without using filter_intra. + // Later, in filter intra search, best_rd_so_far is used for comparison. + mbmi->filter_intra_mode_info.use_filter_intra = 0; + const int tmp_rate = + rd_stats_y->rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); + best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); + try_filter_intra = (best_rd_so_far / 2) <= best_rd; + } else if (intra_sf->skip_filter_intra_in_inter_frames >= 1) { + // As rd cost of luma intra dc mode is more than best_rd (i.e., + // rd_stats_y->rate = INT_MAX), skip the evaluation of filter intra modes. 
+ try_filter_intra = 0; + } + + if (try_filter_intra) { + handle_filter_intra_mode(cpi, x, bsize, ctx, rd_stats_y, mode_cost, + best_rd, best_rd_so_far); + } + } + + if (rd_stats_y->rate == INT_MAX) return 0; + + *mode_cost_y = intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0); + const int rate_y = rd_stats_y->skip_txfm + ? mode_costs->skip_txfm_cost[skip_ctx][1] + : rd_stats_y->rate; + *rd_y = RDCOST(x->rdmult, rate_y + *mode_cost_y, rd_stats_y->dist); + if (best_rd < (INT64_MAX / 2) && *rd_y > (best_rd + (best_rd >> 2))) { + intra_search_state->skip_intra_modes = 1; + return 0; + } + + return 1; +} + +int av1_search_intra_uv_modes_in_interframe( + IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, + const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->ref_frame[0] == INTRA_FRAME); + + // TODO(chiyotsai@google.com): Consolidate the chroma search code here with + // the one in av1_search_palette_mode. + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int try_palette = + cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(cm->features.allow_screen_content_tools, mbmi->bsize); + + assert(intra_search_state->rate_uv_intra == INT_MAX); + if (intra_search_state->rate_uv_intra == INT_MAX) { + // If no good uv-predictor had been found, search for it. + const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra, + &intra_search_state->rate_uv_tokenonly, + &intra_search_state->dist_uvs, + &intra_search_state->skip_uvs, bsize, uv_tx); + intra_search_state->mode_uv = mbmi->uv_mode; + if (try_palette) intra_search_state->pmi_uv = *pmi; + intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; + + const int uv_rate = intra_search_state->rate_uv_tokenonly; + const int64_t uv_dist = intra_search_state->dist_uvs; + const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist); + if (uv_rd > best_rd) { + // If there is no good intra uv-mode available, we can skip all intra + // modes. + intra_search_state->skip_intra_modes = 1; + return 0; + } + } + + // If we are here, then the encoder has found at least one good intra uv + // predictor, so we can directly copy its statistics over. + // TODO(any): the stats here is not right if the best uv mode is CFL but the + // best y mode is palette. + rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly; + rd_stats_uv->dist = intra_search_state->dist_uvs; + rd_stats_uv->skip_txfm = intra_search_state->skip_uvs; + rd_stats->skip_txfm = rd_stats_y->skip_txfm && rd_stats_uv->skip_txfm; + mbmi->uv_mode = intra_search_state->mode_uv; + if (try_palette) { + pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; + + return 1; +} + +// Checks if odd delta angles can be pruned based on rdcosts of even delta +// angles of the corresponding directional mode. 
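+// The rd cost of delta angle d is stored at index d + MAX_ANGLE_DELTA + 1
+// (see av1_rd_pick_intra_sby_mode), so the two reads below at indices
+// luma_delta_angle + MAX_ANGLE_DELTA and luma_delta_angle + MAX_ANGLE_DELTA +
+// 2 fetch the costs of the even neighbours d - 1 and d + 1. Worked example
+// with MAX_ANGLE_DELTA == 3: luma_delta_angle == -1 reads indices 2 and 4,
+// i.e. the costs of delta angles -2 and 0. For the outermost angles -3 and 3
+// one of the two reads hits the INT64_MAX sentinel at index 0 or 8, so
+// pruning there depends on the single available even neighbour only.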
+static AOM_INLINE int prune_luma_odd_delta_angles_using_rd_cost( + const MB_MODE_INFO *const mbmi, const int64_t *const intra_modes_rd_cost, + int64_t best_rd, int prune_luma_odd_delta_angles_in_intra) { + const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y]; + if (!prune_luma_odd_delta_angles_in_intra || + !av1_is_directional_mode(mbmi->mode) || !(abs(luma_delta_angle) & 1) || + best_rd == INT64_MAX) + return 0; + + const int64_t rd_thresh = best_rd + (best_rd >> 3); + + // Neighbour rdcosts are considered for pruning of odd delta angles as + // mentioned below: + // Delta angle Delta angle rdcost + // to be pruned to be considered + // -3 -2 + // -1 -2, 0 + // 1 0, 2 + // 3 2 + return intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA] > rd_thresh && + intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA + 2] > + rd_thresh; +} + +// Finds the best non-intrabc mode on an intra frame. +int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, uint8_t *skippable, + BLOCK_SIZE bsize, int64_t best_rd, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + int64_t best_model_rd = INT64_MAX; + int is_directional_mode; + uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 }; + // Flag to check rd of any intra mode is better than best_rd passed to this + // function + int beat_best_rd = 0; + const int *bmode_costs; + const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int try_palette = + cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->bsize); + uint8_t *best_palette_color_map = + try_palette ? x->palette_buffer->best_palette_color_map : NULL; + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const PREDICTION_MODE A = av1_above_block_mode(above_mi); + const PREDICTION_MODE L = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[A]; + const int left_ctx = intra_mode_context[L]; + bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx]; + + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf; + if (intra_sf->intra_pruning_with_hog) { + // Less aggressive thresholds are used here than those used in inter frame + // encoding in av1_handle_intra_y_mode() because we want key frames/intra + // frames to have higher quality. + const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f }; + const int is_chroma = 0; + prune_intra_mode_with_hog(x, bsize, cpi->common.seq_params->sb_size, + thresh[intra_sf->intra_pruning_with_hog - 1], + directional_mode_skip_mask, is_chroma); + } + mbmi->filter_intra_mode_info.use_filter_intra = 0; + pmi->palette_size[0] = 0; + + // Set params for mode evaluation + set_mode_eval_params(cpi, x, MODE_EVAL); + + MB_MODE_INFO best_mbmi = *mbmi; + const int max_winner_mode_count = + winner_mode_count_allowed[cpi->sf.winner_mode_sf.multi_winner_mode_type]; + zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats); + x->winner_mode_count = 0; + + // Searches the intra-modes except for intrabc, palette, and filter_intra. 
+ int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; + for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { + top_intra_model_rd[i] = INT64_MAX; + } + + // Initialize the rdcost corresponding to all the directional and + // non-directional intra modes. + // 1. For directional modes, it stores the rdcost values for delta angles -4, + // -3, ..., 3, 4. + // 2. The rdcost value for luma_delta_angle is stored at index + // luma_delta_angle + MAX_ANGLE_DELTA + 1. + // 3. The rdcost values for fictitious/nonexistent luma_delta_angle -4 and 4 + // (array indices 0 and 8) are always set to INT64_MAX (the initial value). + int64_t intra_modes_rd_cost[INTRA_MODE_END] + [SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY]; + for (int i = 0; i < INTRA_MODE_END; i++) { + for (int j = 0; j < SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY; j++) { + intra_modes_rd_cost[i][j] = INT64_MAX; + } + } + + for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT; + ++mode_idx) { + set_y_mode_and_delta_angle(mode_idx, mbmi, + intra_sf->prune_luma_odd_delta_angles_in_intra); + RD_STATS this_rd_stats; + int this_rate, this_rate_tokenonly, s; + int is_diagonal_mode; + int64_t this_distortion, this_rd; + const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y]; + + is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode); + if (is_diagonal_mode && !intra_mode_cfg->enable_diagonal_intra) continue; + if (av1_is_directional_mode(mbmi->mode) && + !intra_mode_cfg->enable_directional_intra) + continue; + + // The smooth prediction mode appears to be more frequently picked + // than horizontal / vertical smooth prediction modes. Hence treat + // them differently in speed features. + if ((!intra_mode_cfg->enable_smooth_intra || + intra_sf->disable_smooth_intra) && + (mbmi->mode == SMOOTH_H_PRED || mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!intra_mode_cfg->enable_smooth_intra && mbmi->mode == SMOOTH_PRED) + continue; + + // The functionality of filter intra modes and smooth prediction + // overlap. Hence smooth prediction is pruned only if all the + // filter intra modes are enabled. + if (intra_sf->disable_smooth_intra && + intra_sf->prune_filter_intra_level == 0 && mbmi->mode == SMOOTH_PRED) + continue; + if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED) + continue; + + // Skip the evaluation of modes that do not match with the winner mode in + // x->mb_mode_cache. + if (x->use_mb_mode_cache && mbmi->mode != x->mb_mode_cache->mode) continue; + + is_directional_mode = av1_is_directional_mode(mbmi->mode); + if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; + if (is_directional_mode && + !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) && + luma_delta_angle != 0) + continue; + + // Use intra_y_mode_mask speed feature to skip intra mode evaluation. + if (!(intra_sf->intra_y_mode_mask[max_txsize_lookup[bsize]] & + (1 << mbmi->mode))) + continue; + + if (prune_luma_odd_delta_angles_using_rd_cost( + mbmi, intra_modes_rd_cost[mbmi->mode], best_rd, + intra_sf->prune_luma_odd_delta_angles_in_intra)) + continue; + + const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); + const int64_t this_model_rd = + intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); + + const int model_rd_index_for_pruning = + get_model_rd_index_for_pruning(x, intra_sf); + + if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd, + intra_sf->top_intra_model_count_allowed, + model_rd_index_for_pruning)) + continue; + + // Builds the actual prediction. 
The prediction from + // model_intra_yrd_and_prune was just an estimation that did not take into + // account the effect of txfm pipeline, so we need to redo it for real + // here. + av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd); + this_rate_tokenonly = this_rd_stats.rate; + this_distortion = this_rd_stats.dist; + s = this_rd_stats.skip_txfm; + + if (this_rate_tokenonly == INT_MAX) continue; + + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) { + // av1_pick_uniform_tx_size_type_yrd above includes the cost of the + // tx_size in the tokenonly rate, but for intra blocks, tx_size is always + // coded (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size); + } + this_rate = + this_rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0); + this_rd = RDCOST(x->rdmult, this_rate, this_distortion); + + // Visual quality adjustment based on recon vs source variance. + if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) { + this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize)); + } + + intra_modes_rd_cost[mbmi->mode][luma_delta_angle + MAX_ANGLE_DELTA + 1] = + this_rd; + + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd, + cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); + if (this_rd < best_rd) { + best_mbmi = *mbmi; + best_rd = this_rd; + // Setting beat_best_rd flag because current mode rd is better than + // best_rd passed to this function + beat_best_rd = 1; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = this_distortion; + *skippable = s; + memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + } + } + + // Searches palette + if (try_palette) { + av1_rd_pick_palette_intra_sby( + cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map, + &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd, + ctx, ctx->blk_skip, ctx->tx_type_map); + } + + // Searches filter_intra + if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) { + if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, + skippable, bsize, bmode_costs[DC_PRED], + best_mbmi.mode, &best_rd, &best_model_rd, + ctx)) { + best_mbmi = *mbmi; + } + } + + // No mode is identified with less rd value than best_rd passed to this + // function. In such cases winner mode processing is not necessary and return + // best_rd as INT64_MAX to indicate best mode is not identified + if (!beat_best_rd) return INT64_MAX; + + // In multi-winner mode processing, perform tx search for few best modes + // identified during mode evaluation. Winner mode processing uses best tx + // configuration for tx search. 
+ if (cpi->sf.winner_mode_sf.multi_winner_mode_type) { + int best_mode_idx = 0; + int block_width, block_height; + uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map; + av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width, + &block_height, NULL, NULL); + + for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) { + *mbmi = x->winner_mode_stats[mode_idx].mbmi; + if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) { + // Restore color_map of palette mode before winner mode processing + if (mbmi->palette_mode_info.palette_size[0] > 0) { + uint8_t *color_map_src = + x->winner_mode_stats[mode_idx].color_index_map; + memcpy(color_map_dst, color_map_src, + block_width * block_height * sizeof(*color_map_src)); + } + // Set params for winner mode evaluation + set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); + + // Winner mode processing + // If previous searches use only the default tx type/no R-D optimization + // of quantized coeffs, do an extra search for the best tx type/better + // R-D optimization of quantized coeffs + if (intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, + rate_tokenonly, distortion, skippable, &best_mbmi, + ctx)) + best_mode_idx = mode_idx; + } + } + // Copy color_map of palette mode for final winner mode + if (best_mbmi.palette_mode_info.palette_size[0] > 0) { + uint8_t *color_map_src = + x->winner_mode_stats[best_mode_idx].color_index_map; + memcpy(color_map_dst, color_map_src, + block_width * block_height * sizeof(*color_map_src)); + } + } else { + // If previous searches use only the default tx type/no R-D optimization of + // quantized coeffs, do an extra search for the best tx type/better R-D + // optimization of quantized coeffs + if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) { + // Set params for winner mode evaluation + set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); + *mbmi = best_mbmi; + intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, + rate_tokenonly, distortion, skippable, &best_mbmi, ctx); + } + } + *mbmi = best_mbmi; + av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk); + return best_rd; +} diff --git a/third_party/aom/av1/encoder/intra_mode_search.h b/third_party/aom/av1/encoder/intra_mode_search.h new file mode 100644 index 0000000000..75289c4e3c --- /dev/null +++ b/third_party/aom/av1/encoder/intra_mode_search.h @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Declares high level functions to search through intra modes. + */ +#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_ +#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! \brief Variables related to intra-mode search during inter frame coding. + * + * \ingroup intra_mode_search + * This is a set of variables used during intra-mode search for inter frames. + * This includes an histogram of gradient speed features and a cache of uv + * prediction to avoid repeated search of chroma prediction. 
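+ *
+ * The struct is expected to be reset with init_intra_mode_search_state()
+ * (defined later in this header), which zeroes it and sets rate_uv_intra to
+ * INT_MAX so that the cached chroma result starts out invalid; the cache is
+ * then filled on the first call to av1_search_intra_uv_modes_in_interframe().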
+ */ +typedef struct IntraModeSearchState { + /*! + * \brief The best luma intra-mode found so far + */ + PREDICTION_MODE best_intra_mode; + + /** \name Speed feature variables + * Variables to help with pruning some luma intra-modes during inter frame + * coding process. + */ + /**@{*/ + /*! + * \brief Whether to terminate all intra mode search. + */ + int skip_intra_modes; + /*! + * \brief Whether a directional mode is pruned. + */ + uint8_t directional_mode_skip_mask[INTRA_MODES]; + /*! + * \brief Whether \ref directional_mode_skip_mask is valid for pruning. + */ + int dir_mode_skip_mask_ready; + /**@}*/ + + /** \name Chroma mode search cache + * A cache of the best chroma prediction mode to avoid having to search for + * chroma predictions repeatedly in \ref + * av1_search_intra_uv_modes_in_interframe() + */ + /**@{*/ + int rate_uv_intra; /*!< \brief Total rate to transmit uv_mode */ + int rate_uv_tokenonly; /*!< \brief Rate transmit txfm tokens */ + int64_t dist_uvs; /*!< \brief Distortion of the uv_mode's recon */ + uint8_t skip_uvs; /*!< \brief Whether the uv txfm is skippable */ + UV_PREDICTION_MODE mode_uv; /*!< \brief The best uv mode */ + PALETTE_MODE_INFO pmi_uv; /*!< \brief Color map if mode_uv is palette */ + int8_t uv_angle_delta; /*!< \brief Angle delta if mode_uv directional */ + /**@}*/ +} IntraModeSearchState; + +/*!\brief Evaluate a given luma intra-mode for inter frames. + * + * \ingroup intra_mode_search + * \callgraph + * \callergraph + * This function handles an intra-mode luma prediction when the current frame + * is an inter frame. This is the intra-mode counterpart of handle_inter_mode. + * This function performs an intra luma prediction using the mode specified by + * x->e_mbd.mi[0]->mode. This function does *not* support palette mode + * prediction in the luma channel. + * + * \param[in,out] intra_search_state Structure to intra search state. + * \param[in] cpi Top-level encoder structure. + * \param[in,out] x Pointer to structure holding all the + * data for the current macroblock. + * \param[in] bsize Current partition block size. + * \param[in] ref_frame_cost The entropy cost for signaling that the + * current ref frame is an intra frame. + * \param[in] ctx Structure to hold the number of 4x4 blks + * to copy tx_type and txfm_skip arrays. + * \param[out] rd_stats_y Struct to keep track of the current + * intra-mode's rd_stats (luma only). + * \param[in] best_rd Best RD seen for this block so far. + * \param[out] mode_cost_y The cost needed to signal the current + * intra mode. + * \param[out] rd_y The rdcost of the chosen mode. + * \param[in] best_model_rd Best model RD seen for this block so far + * \param[in] top_intra_model_rd Top intra model RD seen for this + * block so far. + * + * \return Returns 1 if a valid intra mode is found, 0 otherwise. + * The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and + * rd_y are also updated. Moreover, in the first evaluation with directional + * mode, a prune_mask computed with histogram of gradient is also stored in + * intra_search_state. + */ +int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, + int64_t best_rd, int *mode_cost_y, int64_t *rd_y, + int64_t *best_model_rd, + int64_t top_intra_model_rd[]); + +/*!\brief Search through all chroma intra-modes for inter frames. 
+ * + * \ingroup intra_mode_search + * \callgraph + * \callergraph + * This function handles intra-mode chroma prediction when the current frame + * is an inter frame. This is done by calling \ref av1_rd_pick_intra_sbuv_mode + * with some additional book-keeping. + * + * \param[in,out] intra_search_state Structure to intra search state. + * \param[in] cpi Top-level encoder structure. + * \param[in,out] x Pointer to structure holding all the + * data for the current macroblock. + * \param[in] bsize Current partition block size. + * \param[out] rd_stats Struct to keep track of the current + * intra-mode's rd_stats (all planes). + * \param[out] rd_stats_y Struct to keep track of the current + * intra-mode's rd_stats (luma only). + * \param[out] rd_stats_uv Struct to keep track of the current + * intra-mode's rd_stats (chroma only). + * \param[in] best_rd Best RD seen for this block so far. + * + * \return Returns 1 if a valid intra mode is found, 0 otherwise. + * The corresponding values in x->e_mbd.mi[0], rd_stats(_y|_uv) are also + * updated. Moreover, in the first evocation of the function, the chroma intra + * mode result is cached in intra_search_state to be used in subsequent calls. + */ +int av1_search_intra_uv_modes_in_interframe( + IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats, + const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd); + +/*!\brief Evaluate luma palette mode for inter frames. + * + * \ingroup intra_mode_search + * \callergraph + * \callgraph + * This function handles luma palette mode when the current frame is an + * inter frame. + * + * \param[in] intra_search_state Structure to hold the best luma intra mode + * and cache chroma prediction for speed up. + * \param[in] cpi Top-level encoder structure. + * \param[in] x Pointer to structure holding all the data + * for the current macroblock. + * \param[in] bsize Current partition block size. + * \param[in] ref_frame_cost The entropy cost for signaling that the + * current ref frame is an intra frame. + * \param[in] ctx Structure to hold the number of 4x4 blks to + * copy the tx_type and txfm_skip arrays. + * \param[in] this_rd_cost Struct to keep track of palette mode's + * rd_stats. + * \param[in] best_rd Best RD seen for this block so far. + * + * \return Returns whether luma palette mode can skip the txfm. The + * corresponding mbmi, this_rd_costs, intra_search_state, and tx_type arrays in + * ctx are also updated. + */ +int av1_search_palette_mode(IntraModeSearchState *intra_search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int ref_frame_cost, + PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost, + int64_t best_rd); + +/*!\brief Evaluate luma palette mode for inter frames. + * + * \ingroup intra_mode_search + * \callergraph + * \callgraph + * This function handles luma palette mode when the current frame is an + * inter frame. + * + * \param[in] cpi Top-level encoder structure. + * \param[in] x Pointer to structure holding all the data + * for the current macroblock. + * \param[in] bsize Current partition block size. + * \param[in] ref_frame_cost The entropy cost for signaling that the + * current ref frame is an intra frame. + * \param[in] ctx Structure to hold the number of 4x4 blks to + * copy the tx_type and txfm_skip arrays. + * \param[in] this_rd_cost Struct to keep track of palette mode's + * rd_stats. + * \param[in] best_rd Best RD seen for this block so far. 
+ */
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  RD_STATS *this_rd_cost, int64_t best_rd);
+
+/*!\brief Perform intra-mode search on luma channels for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function performs intra-mode search on the luma channel when the
+ * current frame is intra-only. This function does not search intrabc mode,
+ * but it does search palette and filter_intra.
+ *
+ * \param[in]    cpi            Top-level encoder structure.
+ * \param[in]    x              Pointer to structure holding all the data
+ *                              for the current macroblock.
+ * \param[in]    rate           The total rate needed to predict the current
+ *                              luma block.
+ * \param[in]    rate_tokenonly The rate without the cost of sending the
+ *                              prediction modes.
+ * \param[in]    distortion     The luma distortion of the best prediction
+ *                              after the reconstruction.
+ * \param[in]    skippable      Whether we can skip txfm process.
+ * \param[in]    bsize          Current partition block size.
+ * \param[in]    best_rd        Best RD seen for this block so far.
+ * \param[in]    ctx            Structure to hold the number of 4x4 blks to
+ *                              copy the tx_type and txfm_skip arrays.
+ *
+ * \return Returns the rd_cost if this function finds a mode better than
+ * best_rd, otherwise returns INT64_MAX. This also updates the mbmi, the rate
+ * and distortion, and the tx_type arrays in ctx.
+ */
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                   int *rate, int *rate_tokenonly,
+                                   int64_t *distortion, uint8_t *skippable,
+                                   BLOCK_SIZE bsize, int64_t best_rd,
+                                   PICK_MODE_CONTEXT *ctx);
+
+/*!\brief Perform intra-mode search on chroma channels.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function performs intra-mode search on the chroma channels. Just like
+ * \ref av1_rd_pick_intra_sby_mode(), this function searches over palette mode
+ * (filter_intra is not available on chroma planes). Unlike \ref
+ * av1_rd_pick_intra_sby_mode() this function is used by both inter and intra
+ * frames.
+ *
+ * \param[in]    cpi            Top-level encoder structure.
+ * \param[in]    x              Pointer to structure holding all the data
+ *                              for the current macroblock.
+ * \param[in]    rate           The total rate needed to predict the current
+ *                              chroma block.
+ * \param[in]    rate_tokenonly The rate without the cost of sending the
+ *                              prediction modes.
+ * \param[in]    distortion     The chroma distortion of the best prediction
+ *                              after the reconstruction.
+ * \param[in]    skippable      Whether we can skip txfm process.
+ * \param[in]    bsize          Current partition block size.
+ * \param[in]    max_tx_size    The maximum tx_size available.
+ *
+ * \return Returns the rd_cost of the best uv mode found. This also updates the
+ * mbmi, the rate, and the distortion.
+ */
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, uint8_t *skippable,
+                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+/*! \brief Return the number of colors in src. Used by palette mode.
+ */
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                      int *val_count, int *num_colors);
+
+/*! \brief See \ref av1_count_colors(), but for highbd.
+ */ +void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, + int cols, int bit_depth, int *val_count, + int *val_count_8bit, int *num_color_bins, + int *num_colors); + +/*! \brief Initializes the \ref IntraModeSearchState struct. + */ +static AOM_INLINE void init_intra_mode_search_state( + IntraModeSearchState *intra_search_state) { + memset(intra_search_state, 0, sizeof(*intra_search_state)); + intra_search_state->rate_uv_intra = INT_MAX; +} + +/*! \brief set the luma intra mode and delta angles for a given mode index. + * The total number of luma intra mode is LUMA_MODE_COUNT = 61. + * The first 13 modes are from DC_PRED to PAETH_PRED, followed by directional + * modes. Each of the main 8 directional modes have 6 = MAX_ANGLE_DELTA * 2 + * delta angles. + * \param[in] mode_idx mode index in intra mode decision + * process. + * \param[in] mbmi Pointer to structure holding the mode + * info for the current macroblock. + * \param[in] reorder_delta_angle_eval Indicates whether to reorder the + * evaluation of delta angle modes. + */ +void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi, + int reorder_delta_angle_eval); + +/*! \brief prune luma intra mode based on the model rd. + * \param[in] this_model_rd model rd for current mode. + * \param[in] best_model_rd Best model RD seen for this block so + * far. + * \param[in] top_intra_model_rd Top intra model RD seen for this + * block so far. + * \param[in] max_model_cnt_allowed The maximum number of top intra + * model RD allowed. + * \param[in] model_rd_index_for_pruning Index of the candidate used for + * pruning based on model rd. + */ +int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, + int64_t top_intra_model_rd[], int max_model_cnt_allowed, + int model_rd_index_for_pruning); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_ diff --git a/third_party/aom/av1/encoder/intra_mode_search_utils.h b/third_party/aom/av1/encoder/intra_mode_search_utils.h new file mode 100644 index 0000000000..107c2236f8 --- /dev/null +++ b/third_party/aom/av1/encoder/intra_mode_search_utils.h @@ -0,0 +1,690 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Defines utility functions used in intra mode search. + * + * This includes rdcost estimations, histogram based pruning, etc. + */ +#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_ +#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_ + +#include "av1/common/enums.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/palette.h" +#include "av1/encoder/hybrid_fwd_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ +// Macro for computing the speed-preset dependent threshold which is used for +// deciding whether to enable/disable variance calculations in +// intra_rd_variance_factor(). 
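+// For example, speed 0 gives a threshold of 1.0, speed 2 gives 0.5 and speed 4
+// gives 0.0; presets above speed 4 give a negative threshold. A non-positive
+// threshold also disables caching of 4x4 source variances (see
+// is_src_var_for_4x4_sub_blocks_caching_enabled() below).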
+#define INTRA_RD_VAR_THRESH(X) (1.0 - (0.25 * (X))) + +#define BINS 32 +static const float av1_intra_hog_model_bias[DIRECTIONAL_MODES] = { + 0.450578f, 0.695518f, -0.717944f, -0.639894f, + -0.602019f, -0.453454f, 0.055857f, -0.465480f, +}; + +static const float av1_intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = { + -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f, + -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f, + -0.434156f, 0.322868f, 2.260546f, 3.368715f, 3.989290f, 3.308487f, + 2.277893f, 0.923793f, 0.026412f, -0.385174f, -0.718622f, -1.408867f, + -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f, + -2.985709f, -3.447155f, 3.758139f, 3.204353f, 2.170998f, 0.826587f, + -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f, + -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f, + -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f, + -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f, + -0.088058f, 0.753494f, 2.092413f, 3.215266f, -3.300277f, -2.748658f, + -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f, + -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f, + -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f, + 0.813112f, 1.702213f, 2.653045f, 3.351749f, 3.243554f, 3.199409f, + 2.437856f, 1.468854f, 0.533039f, -0.099065f, -0.622643f, -2.200732f, + -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f, 1.975043f, + 3.179528f, 3.939064f, 3.454379f, 3.689386f, 3.116411f, 1.970991f, + 0.798406f, -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f, + -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f, + -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f, + -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f, + -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f, 1.416882f, + 2.572884f, 3.607755f, 3.974820f, 3.997783f, 2.970459f, 0.791687f, + -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f, + -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f, + -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f, + 2.794130f, 3.685984f, 3.745195f, 3.252444f, 2.316108f, 1.399146f, + -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f, + -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f, + -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f, + -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f, + -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f, + -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f, + -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f, + -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f, + 0.716997f, 1.481393f, 2.216702f, 2.737986f, 3.109809f, 3.226084f, + 2.490098f, -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f, + -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f, + -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f, + -1.430687f, 0.872896f, 2.766550f, 3.610080f, 3.578041f, 3.334928f, + 2.586680f, 1.895721f, 1.122195f, 0.488519f, -0.140689f, -0.799076f, + -1.222860f, -1.502437f, -1.900969f, -3.206816f, +}; + +static const NN_CONFIG av1_intra_hog_model_nnconfig = { + BINS, // num_inputs + DIRECTIONAL_MODES, // num_outputs + 0, // 
num_hidden_layers + { 0 }, + { + av1_intra_hog_model_weights, + }, + { + av1_intra_hog_model_bias, + }, +}; + +#define FIX_PREC_BITS (16) +static AOM_INLINE int get_hist_bin_idx(int dx, int dy) { + const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx; + + // Find index by bisection + static const int thresholds[BINS] = { + -1334015, -441798, -261605, -183158, -138560, -109331, -88359, -72303, + -59392, -48579, -39272, -30982, -23445, -16400, -9715, -3194, + 3227, 9748, 16433, 23478, 31015, 39305, 48611, 59425, + 72336, 88392, 109364, 138593, 183191, 261638, 441831, INT32_MAX + }; + + int lo_idx = 0, hi_idx = BINS - 1; + // Divide into segments of size 8 gives better performance than binary search + // here. + if (ratio <= thresholds[7]) { + lo_idx = 0; + hi_idx = 7; + } else if (ratio <= thresholds[15]) { + lo_idx = 8; + hi_idx = 15; + } else if (ratio <= thresholds[23]) { + lo_idx = 16; + hi_idx = 23; + } else { + lo_idx = 24; + hi_idx = 31; + } + + for (int idx = lo_idx; idx <= hi_idx; idx++) { + if (ratio <= thresholds[idx]) { + return idx; + } + } + assert(0 && "No valid histogram bin found!"); + return BINS - 1; +} +#undef FIX_PREC_BITS + +// Normalizes the hog data. +static AOM_INLINE void normalize_hog(float total, float *hist) { + for (int i = 0; i < BINS; ++i) hist[i] /= total; +} + +static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride, + int rows, int cols, float *hist) { + float total = 0.1f; + src += stride; + for (int r = 1; r < rows - 1; ++r) { + for (int c = 1; c < cols - 1; ++c) { + const uint8_t *above = &src[c - stride]; + const uint8_t *below = &src[c + stride]; + const uint8_t *left = &src[c - 1]; + const uint8_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + if (dx == 0 && dy == 0) continue; + const int temp = abs(dx) + abs(dy); + if (!temp) continue; + total += temp; + if (dx == 0) { + hist[0] += temp / 2; + hist[BINS - 1] += temp / 2; + } else { + const int idx = get_hist_bin_idx(dx, dy); + assert(idx >= 0 && idx < BINS); + hist[idx] += temp; + } + } + src += stride; + } + + normalize_hog(total, hist); +} + +// Computes and stores pixel level gradient information of a given superblock +// for LBD encode. +static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { + PixelLevelGradientInfo *const grad_info_sb = + x->pixel_gradient_info + plane * MAX_SB_SQUARE; + const uint8_t *src = x->plane[plane].src.buf; + const int stride = x->plane[plane].src.stride; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_height = block_size_high[sb_size] >> ss_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + src += stride; + for (int r = 1; r < sb_height - 1; ++r) { + for (int c = 1; c < sb_width - 1; ++c) { + const uint8_t *above = &src[c - stride]; + const uint8_t *below = &src[c + stride]; + const uint8_t *left = &src[c - 1]; + const uint8_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. 
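+      // dx applies the horizontal kernel [-1 0 1; -2 0 2; -1 0 1] and dy the
+      // vertical kernel [-1 -2 -1; 0 0 0; 1 2 1], both centered on (r, c).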
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); + grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = + (uint16_t)(abs(dx) + abs(dy)); + grad_info_sb[r * sb_width + c].hist_bin_idx = + (dx != 0) ? get_hist_bin_idx(dx, dy) : -1; + } + src += stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride, + int rows, int cols, float *hist) { + float total = 0.1f; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + src += stride; + for (int r = 1; r < rows - 1; ++r) { + for (int c = 1; c < cols - 1; ++c) { + const uint16_t *above = &src[c - stride]; + const uint16_t *below = &src[c + stride]; + const uint16_t *left = &src[c - 1]; + const uint16_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + if (dx == 0 && dy == 0) continue; + const int temp = abs(dx) + abs(dy); + if (!temp) continue; + total += temp; + if (dx == 0) { + hist[0] += temp / 2; + hist[BINS - 1] += temp / 2; + } else { + const int idx = get_hist_bin_idx(dx, dy); + assert(idx >= 0 && idx < BINS); + hist[idx] += temp; + } + } + src += stride; + } + + normalize_hog(total, hist); +} + +// Computes and stores pixel level gradient information of a given superblock +// for HBD encode. +static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { + PixelLevelGradientInfo *const grad_info_sb = + x->pixel_gradient_info + plane * MAX_SB_SQUARE; + const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf); + const int stride = x->plane[plane].src.stride; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_height = block_size_high[sb_size] >> ss_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + src += stride; + for (int r = 1; r < sb_height - 1; ++r) { + for (int c = 1; c < sb_width - 1; ++c) { + const uint16_t *above = &src[c - stride]; + const uint16_t *below = &src[c + stride]; + const uint16_t *left = &src[c - 1]; + const uint16_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); + grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = + (uint16_t)(abs(dx) + abs(dy)); + grad_info_sb[r * sb_width + c].hist_bin_idx = + (dx != 0) ? 
get_hist_bin_idx(dx, dy) : -1; + } + src += stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows, + int cols, float *hist, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + highbd_generate_hog(src8, stride, rows, cols, hist); + return; + } +#else + (void)highbd; +#endif // CONFIG_AV1_HIGHBITDEPTH + lowbd_generate_hog(src8, stride, rows, cols, hist); +} + +static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(&x->e_mbd)) { + highbd_compute_gradient_info_sb(x, sb_size, plane); + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + lowbd_compute_gradient_info_sb(x, sb_size, plane); +} + +// Gradient caching at superblock level is allowed only if all of the following +// conditions are satisfied: +// (1) The current frame is an intra only frame +// (2) Non-RD mode decisions are not enabled +// (3) The sf partition_search_type is set to SEARCH_PARTITION +// (4) Either intra_pruning_with_hog or chroma_intra_pruning_with_hog is enabled +// +// SB level caching of gradient data may not help in speedup for the following +// cases: +// (1) Inter frames (due to early intra gating) +// (2) When partition_search_type is not SEARCH_PARTITION +// Hence, gradient data is computed at block level in such cases. +static AOM_INLINE bool is_gradient_caching_for_hog_enabled( + const AV1_COMP *const cpi) { + const SPEED_FEATURES *const sf = &cpi->sf; + return frame_is_intra_only(&cpi->common) && !sf->rt_sf.use_nonrd_pick_mode && + (sf->part_sf.partition_search_type == SEARCH_PARTITION) && + (sf->intra_sf.intra_pruning_with_hog || + sf->intra_sf.chroma_intra_pruning_with_hog); +} + +// Function to generate pixel level gradient information for a given superblock. +// Sets the flags 'is_sb_gradient_cached' for the specific plane-type if +// gradient info is generated for the same. +static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE sb_size, int mi_row, + int mi_col) { + // Initialise flags related to hog data caching. + x->is_sb_gradient_cached[PLANE_TYPE_Y] = false; + x->is_sb_gradient_cached[PLANE_TYPE_UV] = false; + if (!is_gradient_caching_for_hog_enabled(cpi)) return; + + const SPEED_FEATURES *sf = &cpi->sf; + const int num_planes = av1_num_planes(&cpi->common); + + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); + + if (sf->intra_sf.intra_pruning_with_hog) { + compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y); + x->is_sb_gradient_cached[PLANE_TYPE_Y] = true; + } + if (sf->intra_sf.chroma_intra_pruning_with_hog && num_planes > 1) { + compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV); + x->is_sb_gradient_cached[PLANE_TYPE_UV] = true; + } +} + +// Reuses the pixel level gradient data generated at superblock level for block +// level histogram computation. +static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x, + int rows, int cols, + BLOCK_SIZE sb_size, + PLANE_TYPE plane, + float *hist) { + float total = 0.1f; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + + // Derive the offset from the starting of the superblock in order to locate + // the block level gradient data in the cache. 
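+  // For example, for the luma plane (ss_x = ss_y = 0) of a 128x128 superblock
+  // (sb_width = 128, MI_SIZE_LOG2 = 2), a block whose top-left mi unit is at
+  // (mi_row_in_sb, mi_col_in_sb) = (4, 8) starts at pixel (16, 32), giving
+  // block_offset_in_grad_cache = 128 * 16 + 32 = 2080.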
+ const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1); + const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1); + const int block_offset_in_grad_cache = + sb_width * (mi_row_in_sb << (MI_SIZE_LOG2 - ss_y)) + + (mi_col_in_sb << (MI_SIZE_LOG2 - ss_x)); + const PixelLevelGradientInfo *grad_info_blk = x->pixel_gradient_info + + plane * MAX_SB_SQUARE + + block_offset_in_grad_cache; + + // Retrieve the cached gradient information and generate the histogram. + for (int r = 1; r < rows - 1; ++r) { + for (int c = 1; c < cols - 1; ++c) { + const uint16_t abs_dx_abs_dy_sum = + grad_info_blk[r * sb_width + c].abs_dx_abs_dy_sum; + if (!abs_dx_abs_dy_sum) continue; + total += abs_dx_abs_dy_sum; + const bool is_dx_zero = grad_info_blk[r * sb_width + c].is_dx_zero; + if (is_dx_zero) { + hist[0] += abs_dx_abs_dy_sum >> 1; + hist[BINS - 1] += abs_dx_abs_dy_sum >> 1; + } else { + const int8_t idx = grad_info_blk[r * sb_width + c].hist_bin_idx; + assert(idx >= 0 && idx < BINS); + hist[idx] += abs_dx_abs_dy_sum; + } + } + } + normalize_hog(total, hist); +} + +static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, + BLOCK_SIZE sb_size, int plane, float *hog) { + const MACROBLOCKD *xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int bh = block_size_high[bsize]; + const int bw = block_size_wide[bsize]; + const int rows = + ((xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh) >> + ss_y; + const int cols = + ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >> + ss_x; + + // If gradient data is already generated at SB level, reuse the cached data. + // Otherwise, compute the data. + if (x->is_sb_gradient_cached[plane]) { + generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog); + } else { + const uint8_t *src = x->plane[plane].src.buf; + const int src_stride = x->plane[plane].src.stride; + generate_hog(src, src_stride, rows, cols, hog, is_cur_buf_hbd(xd)); + } + + // Scale the hog so the luma and chroma are on the same scale + for (int b = 0; b < BINS; ++b) { + hog[b] *= (1 + ss_x) * (1 + ss_y); + } +} + +static AOM_INLINE void prune_intra_mode_with_hog( + const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th, + uint8_t *directional_mode_skip_mask, int is_chroma) { + const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y; + float hist[BINS] = { 0.0f }; + collect_hog_data(x, bsize, sb_size, plane, hist); + + // Make prediction for each of the mode + float scores[DIRECTIONAL_MODES] = { 0.0f }; + av1_nn_predict(hist, &av1_intra_hog_model_nnconfig, 1, scores); + for (UV_PREDICTION_MODE uv_mode = UV_V_PRED; uv_mode <= UV_D67_PRED; + uv_mode++) { + if (scores[uv_mode - UV_V_PRED] <= th) { + directional_mode_skip_mask[uv_mode] = 1; + } + } +} +#undef BINS + +int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf, + const int stride, const int is_hbd); + +// Returns whether caching of source variance for 4x4 sub-blocks is allowed. 
+static AOM_INLINE bool is_src_var_for_4x4_sub_blocks_caching_enabled( + const AV1_COMP *const cpi) { + const SPEED_FEATURES *const sf = &cpi->sf; + if (cpi->oxcf.mode != ALLINTRA) return false; + + if (sf->part_sf.partition_search_type == SEARCH_PARTITION) return true; + + if (INTRA_RD_VAR_THRESH(cpi->oxcf.speed) <= 0 || + (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode)) + return false; + + return true; +} + +// Initialize the members of Block4x4VarInfo structure to -1 at the start +// of every superblock. +static AOM_INLINE void init_src_var_info_of_4x4_sub_blocks( + const AV1_COMP *const cpi, Block4x4VarInfo *src_var_info_of_4x4_sub_blocks, + const BLOCK_SIZE sb_size) { + if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return; + + const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size]; + for (int i = 0; i < mi_count_in_sb; i++) { + src_var_info_of_4x4_sub_blocks[i].var = -1; + src_var_info_of_4x4_sub_blocks[i].log_var = -1.0; + } +} + +// Returns the cost needed to send a uniformly distributed r.v. +static AOM_INLINE int write_uniform_cost(int n, int v) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (l == 0) return 0; + if (v < m) + return av1_cost_literal(l - 1); + else + return av1_cost_literal(l); +} +/*!\endcond */ + +/*!\brief Returns the rate cost for luma prediction mode info of intra blocks. + * + * \callergraph + */ +static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi, + const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int mode_cost, + int discount_color_cost) { + int total_rate = mode_cost; + const ModeCosts *mode_costs = &x->mode_costs; + const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0; + const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra; + const int use_intrabc = mbmi->use_intrabc; + // Can only activate one mode. 
+ assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc + + use_filter_intra) <= 1); + const int try_palette = av1_allow_palette( + cpi->common.features.allow_screen_content_tools, mbmi->bsize); + if (try_palette && mbmi->mode == DC_PRED) { + const MACROBLOCKD *xd = &x->e_mbd; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int mode_ctx = av1_get_palette_mode_ctx(xd); + total_rate += + mode_costs->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette]; + if (use_palette) { + const uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + const int plt_size = mbmi->palette_mode_info.palette_size[0]; + int palette_mode_cost = + mode_costs + ->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + palette_mode_cost += + av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, + n_cache, cpi->common.seq_params->bit_depth); + if (!discount_color_cost) + palette_mode_cost += + av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); + + total_rate += palette_mode_cost; + } + } + if (av1_filter_intra_allowed(&cpi->common, mbmi)) { + total_rate += mode_costs->filter_intra_cost[mbmi->bsize][use_filter_intra]; + if (use_filter_intra) { + total_rate += + mode_costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info + .filter_intra_mode]; + } + } + if (av1_is_directional_mode(mbmi->mode)) { + if (av1_use_angle_delta(bsize)) { + total_rate += + mode_costs->angle_delta_cost[mbmi->mode - V_PRED] + [MAX_ANGLE_DELTA + + mbmi->angle_delta[PLANE_TYPE_Y]]; + } + } + if (av1_allow_intrabc(&cpi->common)) + total_rate += mode_costs->intrabc_cost[use_intrabc]; + return total_rate; +} + +/*!\brief Return the rate cost for chroma prediction mode info of intra blocks. + * + * \callergraph + */ +static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi, + const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int mode_cost) { + int total_rate = mode_cost; + const ModeCosts *mode_costs = &x->mode_costs; + const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0; + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + // Can only activate one mode. 
+ assert(((uv_mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1); + + const int try_palette = av1_allow_palette( + cpi->common.features.allow_screen_content_tools, mbmi->bsize); + if (try_palette && uv_mode == UV_DC_PRED) { + const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info; + total_rate += + mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette]; + if (use_palette) { + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int plt_size = pmi->palette_size[1]; + const MACROBLOCKD *xd = &x->e_mbd; + const uint8_t *const color_map = xd->plane[1].color_index_map; + int palette_mode_cost = + mode_costs + ->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + palette_mode_cost += av1_palette_color_cost_uv( + pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth); + palette_mode_cost += + av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP); + total_rate += palette_mode_cost; + } + } + const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode); + if (av1_is_directional_mode(intra_mode)) { + if (av1_use_angle_delta(bsize)) { + total_rate += + mode_costs->angle_delta_cost[intra_mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_UV] + + MAX_ANGLE_DELTA]; + } + } + return total_rate; +} + +/*!\cond */ +// Makes a quick intra prediction and estimate the rdcost with a model without +// going through the whole txfm/quantize/itxfm process. +static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x, + int plane, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int use_hadamard) { + MACROBLOCKD *const xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + int row, col; + assert(!is_inter_block(xd->mi[0])); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int txbw = tx_size_wide[tx_size]; + const int txbh = tx_size_high[tx_size]; + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + int64_t satd_cost = 0; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + // Prediction. + for (row = 0; row < max_blocks_high; row += stepr) { + for (col = 0; col < max_blocks_wide; col += stepc) { + av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + // Here we use p->src_diff and p->coeff as temporary buffers for + // prediction residue and transform coefficients. The buffers are only + // used in this for loop, therefore we don't need to properly add offset + // to the buffers. + av1_subtract_block( + bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize], + p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride, + pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride); + av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff, + block_size_wide[plane_bsize], p->coeff); + satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]); + } + } + return satd_cost; +} +/*!\endcond */ + +/*!\brief Estimate the luma rdcost of a given intra mode and try to prune it. + * + * \ingroup intra_mode_search + * \callergraph + * This function first makes a quick luma prediction and estimates the rdcost + * with a model without going through the txfm, then try to prune the current + * mode if the new estimate y_rd > 1.25 * best_model_rd. 
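+ * (In the code below the 1.25 factor is realized in integer arithmetic as
+ * best_model_rd + (best_model_rd >> 2).)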
+ *
+ * \return Returns 1 if the given mode is pruned; 0 otherwise.
+ */
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                int64_t *best_model_rd) {
+  const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+  const int plane = 0;
+  const AV1_COMMON *cm = &cpi->common;
+  const int64_t this_model_rd =
+      intra_model_rd(cm, x, plane, bsize, tx_size, /*use_hadamard=*/1);
+  if (*best_model_rd != INT64_MAX &&
+      this_model_rd > *best_model_rd + (*best_model_rd >> 2)) {
+    return 1;
+  } else if (this_model_rd < *best_model_rd) {
+    *best_model_rd = this_model_rd;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h
new file mode 100644
index 0000000000..4be2038a6f
--- /dev/null
+++ b/third_party/aom/av1/encoder/k_means_template.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+// Though we want to compute the smallest L2 norm, in 1 dimension,
+// it is equivalent to finding the smallest L1 norm and then squaring it.
+// This is preferable for speed, especially on the SIMD side.
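+// The squared value is what actually gets accumulated: in the 1-D case
+// RENAME(av1_calc_indices) below adds min_dist * min_dist to *dist, so the
+// total distortion reported to the caller is still a sum of squared errors.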
+static int RENAME(calc_dist)(const int16_t *p1, const int16_t *p2) { +#if AV1_K_MEANS_DIM == 1 + return abs(p1[0] - p2[0]); +#else + int dist = 0; + for (int i = 0; i < AV1_K_MEANS_DIM; ++i) { + const int diff = p1[i] - p2[i]; + dist += diff * diff; + } + return dist; +#endif +} + +void RENAME(av1_calc_indices)(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *dist, int n, int k) { + if (dist) { + *dist = 0; + } + for (int i = 0; i < n; ++i) { + int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids); + indices[i] = 0; + for (int j = 1; j < k; ++j) { + const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, + centroids + j * AV1_K_MEANS_DIM); + if (this_dist < min_dist) { + min_dist = this_dist; + indices[i] = j; + } + } + if (dist) { +#if AV1_K_MEANS_DIM == 1 + *dist += min_dist * min_dist; +#else + *dist += min_dist; +#endif + } + } +} + +static void RENAME(calc_centroids)(const int16_t *data, int16_t *centroids, + const uint8_t *indices, int n, int k) { + int i, j; + int count[PALETTE_MAX_SIZE] = { 0 }; + int centroids_sum[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE]; + unsigned int rand_state = (unsigned int)data[0]; + assert(n <= 32768); + memset(centroids_sum, 0, sizeof(centroids_sum[0]) * k * AV1_K_MEANS_DIM); + + for (i = 0; i < n; ++i) { + const int index = indices[i]; + assert(index < k); + ++count[index]; + for (j = 0; j < AV1_K_MEANS_DIM; ++j) { + centroids_sum[index * AV1_K_MEANS_DIM + j] += + data[i * AV1_K_MEANS_DIM + j]; + } + } + + for (i = 0; i < k; ++i) { + if (count[i] == 0) { + memcpy(centroids + i * AV1_K_MEANS_DIM, + data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM, + sizeof(centroids[0]) * AV1_K_MEANS_DIM); + } else { + for (j = 0; j < AV1_K_MEANS_DIM; ++j) { + centroids[i * AV1_K_MEANS_DIM + j] = + DIVIDE_AND_ROUND(centroids_sum[i * AV1_K_MEANS_DIM + j], count[i]); + } + } + } +} + +void RENAME(av1_k_means)(const int16_t *data, int16_t *centroids, + uint8_t *indices, int n, int k, int max_itr) { + int16_t centroids_tmp[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE]; + uint8_t indices_tmp[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT]; + int16_t *meta_centroids[2] = { centroids, centroids_tmp }; + uint8_t *meta_indices[2] = { indices, indices_tmp }; + int i, l = 0, prev_l, best_l = 0; + int64_t this_dist; + + assert(n <= MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT); + +#if AV1_K_MEANS_DIM == 1 + av1_calc_indices_dim1(data, centroids, indices, &this_dist, n, k); +#else + av1_calc_indices_dim2(data, centroids, indices, &this_dist, n, k); +#endif + + for (i = 0; i < max_itr; ++i) { + const int64_t prev_dist = this_dist; + prev_l = l; + l = (l == 1) ? 
0 : 1; + + RENAME(calc_centroids)(data, meta_centroids[l], meta_indices[prev_l], n, k); + if (!memcmp(meta_centroids[l], meta_centroids[prev_l], + sizeof(centroids[0]) * k * AV1_K_MEANS_DIM)) { + break; + } +#if AV1_K_MEANS_DIM == 1 + av1_calc_indices_dim1(data, meta_centroids[l], meta_indices[l], &this_dist, + n, k); +#else + av1_calc_indices_dim2(data, meta_centroids[l], meta_indices[l], &this_dist, + n, k); +#endif + + if (this_dist > prev_dist) { + best_l = prev_l; + break; + } + } + if (i == max_itr) best_l = l; + if (best_l != 0) { + memcpy(centroids, meta_centroids[1], + sizeof(centroids[0]) * k * AV1_K_MEANS_DIM); + memcpy(indices, meta_indices[1], sizeof(indices[0]) * n); + } +} +#undef RENAME_ +#undef RENAME diff --git a/third_party/aom/av1/encoder/level.c b/third_party/aom/av1/encoder/level.c new file mode 100644 index 0000000000..5d5fe9ce96 --- /dev/null +++ b/third_party/aom/av1/encoder/level.c @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/level.h" + +#define UNDEFINED_LEVEL \ + { \ + .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \ + .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \ + .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \ + .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \ + } + +static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = { + { .level = SEQ_LEVEL_2_0, + .max_picture_size = 147456, + .max_h_size = 2048, + .max_v_size = 1152, + .max_display_rate = 4423680L, + .max_decode_rate = 5529600L, + .max_header_rate = 150, + .main_mbps = 1.5, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + { .level = SEQ_LEVEL_2_1, + .max_picture_size = 278784, + .max_h_size = 2816, + .max_v_size = 1584, + .max_display_rate = 8363520L, + .max_decode_rate = 10454400L, + .max_header_rate = 150, + .main_mbps = 3.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_3_0, + .max_picture_size = 665856, + .max_h_size = 4352, + .max_v_size = 2448, + .max_display_rate = 19975680L, + .max_decode_rate = 24969600L, + .max_header_rate = 150, + .main_mbps = 6.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + { .level = SEQ_LEVEL_3_1, + .max_picture_size = 1065024, + .max_h_size = 5504, + .max_v_size = 3096, + .max_display_rate = 31950720L, + .max_decode_rate = 39938400L, + .max_header_rate = 150, + .main_mbps = 10.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_4_0, + .max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 70778880L, + .max_decode_rate = 77856768L, + .max_header_rate = 300, + .main_mbps = 12.0, + .high_mbps = 30.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_4_1, + 
.max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 141557760L, + .max_decode_rate = 155713536L, + .max_header_rate = 300, + .main_mbps = 20.0, + .high_mbps = 50.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_5_0, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 267386880L, + .max_decode_rate = 273715200L, + .max_header_rate = 300, + .main_mbps = 30.0, + .high_mbps = 100.0, + .main_cr = 6.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_1, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 534773760L, + .max_decode_rate = 547430400L, + .max_header_rate = 300, + .main_mbps = 40.0, + .high_mbps = 160.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_2, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1094860800L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_3, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_6_0, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_1, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 2139095040L, + .max_decode_rate = 2189721600L, + .max_header_rate = 300, + .main_mbps = 100.0, + .high_mbps = 480.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_2, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4379443200L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_3, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4706009088L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, +#if CONFIG_CWG_C013 + { .level = SEQ_LEVEL_7_0, + .max_picture_size = 142606336, + .max_h_size = 32768, + .max_v_size = 17408, + .max_display_rate = 4278190080L, + .max_decode_rate = 4706009088L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 256, + .max_tile_cols = 32 }, + { .level = SEQ_LEVEL_7_1, + .max_picture_size = 142606336, + .max_h_size = 32768, + .max_v_size = 17408, + .max_display_rate = 8556380160L, + .max_decode_rate = 8758886400L, + .max_header_rate = 300, + .main_mbps = 200.0, + .high_mbps = 960.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 256, + 
.max_tile_cols = 32 }, + { .level = SEQ_LEVEL_7_2, + .max_picture_size = 142606336, + .max_h_size = 32768, + .max_v_size = 17408, + .max_display_rate = 17112760320L, + .max_decode_rate = 17517772800L, + .max_header_rate = 300, + .main_mbps = 320.0, + .high_mbps = 1600.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 256, + .max_tile_cols = 32 }, + { .level = SEQ_LEVEL_7_3, + .max_picture_size = 142606336, + .max_h_size = 32768, + .max_v_size = 17408, + .max_display_rate = 17112760320L, + .max_decode_rate = 18824036352L, + .max_header_rate = 300, + .main_mbps = 320.0, + .high_mbps = 1600.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 256, + .max_tile_cols = 32 }, + { .level = SEQ_LEVEL_8_0, + .max_picture_size = 530841600, + .max_h_size = 65536, + .max_v_size = 34816, + .max_display_rate = 17112760320L, + .max_decode_rate = 18824036352L, + .max_header_rate = 300, + .main_mbps = 320.0, + .high_mbps = 1600.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 512, + .max_tile_cols = 64 }, + { .level = SEQ_LEVEL_8_1, + .max_picture_size = 530841600, + .max_h_size = 65536, + .max_v_size = 34816, + .max_display_rate = 34225520640L, + .max_decode_rate = 34910031052L, + .max_header_rate = 300, + .main_mbps = 400.0, + .high_mbps = 1920.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 512, + .max_tile_cols = 64 }, + { .level = SEQ_LEVEL_8_2, + .max_picture_size = 530841600, + .max_h_size = 65536, + .max_v_size = 34816, + .max_display_rate = 68451041280L, + .max_decode_rate = 69820062105L, + .max_header_rate = 300, + .main_mbps = 640.0, + .high_mbps = 3200.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 512, + .max_tile_cols = 64 }, + { .level = SEQ_LEVEL_8_3, + .max_picture_size = 530841600, + .max_h_size = 65536, + .max_v_size = 34816, + .max_display_rate = 68451041280L, + .max_decode_rate = 75296145408L, + .max_header_rate = 300, + .main_mbps = 640.0, + .high_mbps = 3200.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 512, + .max_tile_cols = 64 }, +#else // !CONFIG_CWG_C013 + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, +#endif // CONFIG_CWG_C013 +}; + +typedef enum { + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_LARGE, + LUMA_PIC_V_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_SMALL, + LUMA_PIC_V_SIZE_TOO_SMALL, + TOO_MANY_TILE_COLUMNS, + TOO_MANY_TILES, + TILE_RATE_TOO_HIGH, + TILE_TOO_LARGE, + SUPERRES_TILE_WIDTH_TOO_LARGE, + CROPPED_TILE_WIDTH_TOO_SMALL, + CROPPED_TILE_HEIGHT_TOO_SMALL, + TILE_WIDTH_INVALID, + FRAME_HEADER_RATE_TOO_HIGH, + DISPLAY_RATE_TOO_HIGH, + DECODE_RATE_TOO_HIGH, + CR_TOO_SMALL, + TILE_SIZE_HEADER_RATE_TOO_HIGH, + BITRATE_TOO_HIGH, + DECODER_MODEL_FAIL, + + TARGET_LEVEL_FAIL_IDS, + TARGET_LEVEL_OK, +} TARGET_LEVEL_FAIL_ID; + +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The picture size is too large.", + "The picture width is too large.", + "The picture height is too large.", + "The picture width is too small.", + "The picture height is too small.", + "Too many tile columns are used.", + "Too many tiles are used.", + "The tile rate is too high.", + "The tile size is too large.", + "The superres tile width is too large.", + "The cropped tile width is less than 8.", + "The cropped tile height is less than 8.", + "The tile width is invalid.", + "The frame header rate is too high.", + "The display luma sample rate is too high.", + "The decoded luma sample rate is too high.", + "The compression ratio is too small.", + "The 
product of max tile size and header rate is too high.", + "The bitrate is too high.", + "The decoder model fails.", +}; + +static double get_max_bitrate(const AV1LevelSpec *const level_spec, int tier, + BITSTREAM_PROFILE profile) { + if (level_spec->level < SEQ_LEVEL_4_0) tier = 0; + const double bitrate_basis = + (tier ? level_spec->high_mbps : level_spec->main_mbps) * 1e6; + const double bitrate_profile_factor = + profile == PROFILE_0 ? 1.0 : (profile == PROFILE_1 ? 2.0 : 3.0); + return bitrate_basis * bitrate_profile_factor; +} + +double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier, + BITSTREAM_PROFILE profile) { + assert(is_valid_seq_level_idx(level_index)); + return get_max_bitrate(&av1_level_defs[level_index], tier, profile); +} + +void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles, + int *const max_tile_cols) { + assert(is_valid_seq_level_idx(level_index)); + const AV1LevelSpec *const level_spec = &av1_level_defs[level_index]; + *max_tiles = level_spec->max_tiles; + *max_tile_cols = level_spec->max_tile_cols; +} + +// We assume time t to be valid if and only if t >= 0.0. +// So INVALID_TIME can be defined as anything less than 0. +#define INVALID_TIME (-1.0) + +// This corresponds to "free_buffer" in the spec. +static void release_buffer(DECODER_MODEL *const decoder_model, int idx) { + assert(idx >= 0 && idx < BUFFER_POOL_MAX_SIZE); + FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx]; + this_buffer->decoder_ref_count = 0; + this_buffer->player_ref_count = 0; + this_buffer->display_index = -1; + this_buffer->presentation_time = INVALID_TIME; +} + +static void initialize_buffer_pool(DECODER_MODEL *const decoder_model) { + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + release_buffer(decoder_model, i); + } + for (int i = 0; i < REF_FRAMES; ++i) { + decoder_model->vbi[i] = -1; + } +} + +static int get_free_buffer(DECODER_MODEL *const decoder_model) { + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + const FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[i]; + if (this_buffer->decoder_ref_count == 0 && + this_buffer->player_ref_count == 0) + return i; + } + return -1; +} + +static void update_ref_buffers(DECODER_MODEL *const decoder_model, int idx, + int refresh_frame_flags) { + FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx]; + for (int i = 0; i < REF_FRAMES; ++i) { + if (refresh_frame_flags & (1 << i)) { + const int pre_idx = decoder_model->vbi[i]; + if (pre_idx != -1) { + --decoder_model->frame_buffer_pool[pre_idx].decoder_ref_count; + } + decoder_model->vbi[i] = idx; + ++this_buffer->decoder_ref_count; + } + } +} + +// The time (in seconds) required to decode a frame. 
+static double time_to_decode_frame(const AV1_COMMON *const cm, + int64_t max_decode_rate) { + if (cm->show_existing_frame) return 0.0; + + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + int luma_samples = 0; + if (frame_type == KEY_FRAME || frame_type == INTRA_ONLY_FRAME) { + luma_samples = cm->superres_upscaled_width * cm->height; + } else { + const int spatial_layer_dimensions_present_flag = 0; + if (spatial_layer_dimensions_present_flag) { + assert(0 && "Spatial layer dimensions not supported yet."); + } else { + const SequenceHeader *const seq_params = cm->seq_params; + const int max_frame_width = seq_params->max_frame_width; + const int max_frame_height = seq_params->max_frame_height; + luma_samples = max_frame_width * max_frame_height; + } + } + + return luma_samples / (double)max_decode_rate; +} + +// Release frame buffers that are no longer needed for decode or display. +// It corresponds to "start_decode_at_removal_time" in the spec. +static void release_processed_frames(DECODER_MODEL *const decoder_model, + double removal_time) { + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i]; + if (this_buffer->player_ref_count > 0) { + if (this_buffer->presentation_time >= 0.0 && + this_buffer->presentation_time <= removal_time) { + this_buffer->player_ref_count = 0; + if (this_buffer->decoder_ref_count == 0) { + release_buffer(decoder_model, i); + } + } + } + } +} + +static int frames_in_buffer_pool(const DECODER_MODEL *const decoder_model) { + int frames_in_pool = 0; + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + const FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[i]; + if (this_buffer->decoder_ref_count > 0 || + this_buffer->player_ref_count > 0) { + ++frames_in_pool; + } + } + return frames_in_pool; +} + +static double get_presentation_time(const DECODER_MODEL *const decoder_model, + int display_index) { + if (decoder_model->mode == SCHEDULE_MODE) { + assert(0 && "SCHEDULE_MODE NOT SUPPORTED"); + return INVALID_TIME; + } else { + const double initial_presentation_delay = + decoder_model->initial_presentation_delay; + // Can't decide presentation time until the initial presentation delay is + // known. + if (initial_presentation_delay < 0.0) return INVALID_TIME; + + return initial_presentation_delay + + display_index * decoder_model->num_ticks_per_picture * + decoder_model->display_clock_tick; + } +} + +#define MAX_TIME 1e16 +static double time_next_buffer_is_free(int num_decoded_frame, + int decoder_buffer_delay, + const FRAME_BUFFER *frame_buffer_pool, + double current_time) { + if (num_decoded_frame == 0) { + return (double)decoder_buffer_delay / 90000.0; + } + + double buf_free_time = MAX_TIME; + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + const FRAME_BUFFER *const this_buffer = &frame_buffer_pool[i]; + if (this_buffer->decoder_ref_count == 0) { + if (this_buffer->player_ref_count == 0) { + return current_time; + } + const double presentation_time = this_buffer->presentation_time; + if (presentation_time >= 0.0 && presentation_time < buf_free_time) { + buf_free_time = presentation_time; + } + } + } + return buf_free_time < MAX_TIME ? 
buf_free_time : INVALID_TIME; +} +#undef MAX_TIME + +static double get_removal_time(int mode, int num_decoded_frame, + int decoder_buffer_delay, + const FRAME_BUFFER *frame_buffer_pool, + double current_time) { + if (mode == SCHEDULE_MODE) { + assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET"); + return INVALID_TIME; + } else { + return time_next_buffer_is_free(num_decoded_frame, decoder_buffer_delay, + frame_buffer_pool, current_time); + } +} + +void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) { + printf( + "\n status %d, num_frame %3d, num_decoded_frame %3d, " + "num_shown_frame %3d, current time %6.2f, frames in buffer %2d, " + "presentation delay %6.2f, total interval %6.2f\n", + decoder_model->status, decoder_model->num_frame, + decoder_model->num_decoded_frame, decoder_model->num_shown_frame, + decoder_model->current_time, frames_in_buffer_pool(decoder_model), + decoder_model->initial_presentation_delay, + decoder_model->dfg_interval_queue.total_interval); + for (int i = 0; i < 10; ++i) { + const FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[i]; + printf("buffer %d, decode count %d, display count %d, present time %6.4f\n", + i, this_buffer->decoder_ref_count, this_buffer->player_ref_count, + this_buffer->presentation_time); + } +} + +// op_index is the operating point index. +void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level, + int op_index, DECODER_MODEL *const decoder_model) { + decoder_model->status = DECODER_MODEL_OK; + decoder_model->level = level; + + const AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + decoder_model->bit_rate = get_max_bitrate( + av1_level_defs + level, seq_params->tier[op_index], seq_params->profile); + + // TODO(huisu or anyone): implement SCHEDULE_MODE. 
+ decoder_model->mode = RESOURCE_MODE; + decoder_model->encoder_buffer_delay = 20000; + decoder_model->decoder_buffer_delay = 70000; + decoder_model->is_low_delay_mode = false; + + decoder_model->first_bit_arrival_time = 0.0; + decoder_model->last_bit_arrival_time = 0.0; + decoder_model->coded_bits = 0; + + decoder_model->removal_time = INVALID_TIME; + decoder_model->presentation_time = INVALID_TIME; + decoder_model->decode_samples = 0; + decoder_model->display_samples = 0; + decoder_model->max_decode_rate = 0.0; + decoder_model->max_display_rate = 0.0; + + decoder_model->num_frame = -1; + decoder_model->num_decoded_frame = -1; + decoder_model->num_shown_frame = -1; + decoder_model->current_time = 0.0; + + initialize_buffer_pool(decoder_model); + + DFG_INTERVAL_QUEUE *const dfg_interval_queue = + &decoder_model->dfg_interval_queue; + dfg_interval_queue->total_interval = 0.0; + dfg_interval_queue->head = 0; + dfg_interval_queue->size = 0; + + if (seq_params->timing_info_present) { + decoder_model->num_ticks_per_picture = + seq_params->timing_info.num_ticks_per_picture; + decoder_model->display_clock_tick = + seq_params->timing_info.num_units_in_display_tick / + seq_params->timing_info.time_scale; + } else { + decoder_model->num_ticks_per_picture = 1; + decoder_model->display_clock_tick = 1.0 / cpi->framerate; + } + + decoder_model->initial_display_delay = + seq_params->op_params[op_index].initial_display_delay; + decoder_model->initial_presentation_delay = INVALID_TIME; + decoder_model->decode_rate = av1_level_defs[level].max_decode_rate; +} + +DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf( + const AV1_COMP *const cpi, size_t coded_bits, + const DECODER_MODEL *const decoder_model) { + DECODER_MODEL_STATUS status = DECODER_MODEL_OK; + + if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) { + return status; + } + + const AV1_COMMON *const cm = &cpi->common; + const int show_existing_frame = cm->show_existing_frame; + + size_t cur_coded_bits = decoder_model->coded_bits + coded_bits; + int num_decoded_frame = decoder_model->num_decoded_frame; + if (!show_existing_frame) ++num_decoded_frame; + + if (show_existing_frame) { + return status; + } else { + const double removal_time = get_removal_time( + decoder_model->mode, num_decoded_frame, + decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool, + decoder_model->current_time); + if (removal_time < 0.0) { + status = DECODE_FRAME_BUF_UNAVAILABLE; + return status; + } + + // A frame with show_existing_frame being false indicates the end of a DFG. + // Update the bits arrival time of this DFG. + const double buffer_delay = (decoder_model->encoder_buffer_delay + + decoder_model->decoder_buffer_delay) / + 90000.0; + const double latest_arrival_time = removal_time - buffer_delay; + const double first_bit_arrival_time = + AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time); + const double last_bit_arrival_time = + first_bit_arrival_time + + (double)cur_coded_bits / decoder_model->bit_rate; + // Smoothing buffer underflows if the last bit arrives after the removal + // time. + if (last_bit_arrival_time > removal_time && + !decoder_model->is_low_delay_mode) { + status = SMOOTHING_BUFFER_UNDERFLOW; + return status; + } + + // Check if the smoothing buffer overflows. 
+ const DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue; + if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) { + assert(0); + } + + double total_interval = queue->total_interval; + int qhead = queue->head; + int qsize = queue->size; + // Remove the DFGs with removal time earlier than last_bit_arrival_time. + while (queue->buf[qhead].removal_time <= last_bit_arrival_time && + qsize > 0) { + if (queue->buf[qhead].removal_time - first_bit_arrival_time + + total_interval > + 1.0) { + status = SMOOTHING_BUFFER_OVERFLOW; + return status; + } + total_interval -= queue->buf[qhead].last_bit_arrival_time - + queue->buf[qhead].first_bit_arrival_time; + qhead = (qhead + 1) % DFG_INTERVAL_QUEUE_SIZE; + --qsize; + } + total_interval += last_bit_arrival_time - first_bit_arrival_time; + // The smoothing buffer can hold at most "bit_rate" bits, which is + // equivalent to 1 second of total interval. + if (total_interval > 1.0) { + status = SMOOTHING_BUFFER_OVERFLOW; + return status; + } + + return status; + } +} + +void av1_decoder_model_process_frame(const AV1_COMP *const cpi, + size_t coded_bits, + DECODER_MODEL *const decoder_model) { + if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return; + + const AV1_COMMON *const cm = &cpi->common; + const int luma_pic_size = cm->superres_upscaled_width * cm->height; + const int show_existing_frame = cm->show_existing_frame; + const int show_frame = cm->show_frame || show_existing_frame; + ++decoder_model->num_frame; + if (!show_existing_frame) ++decoder_model->num_decoded_frame; + if (show_frame) ++decoder_model->num_shown_frame; + decoder_model->coded_bits += coded_bits; + + int display_idx = -1; + if (show_existing_frame) { + display_idx = decoder_model->vbi[cpi->existing_fb_idx_to_show]; + if (display_idx < 0) { + decoder_model->status = DECODE_EXISTING_FRAME_BUF_EMPTY; + return; + } + if (decoder_model->frame_buffer_pool[display_idx].frame_type == KEY_FRAME) { + update_ref_buffers(decoder_model, display_idx, 0xFF); + } + } else { + const double removal_time = get_removal_time( + decoder_model->mode, decoder_model->num_decoded_frame, + decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool, + decoder_model->current_time); + if (removal_time < 0.0) { + decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE; + return; + } + + const int previous_decode_samples = decoder_model->decode_samples; + const double previous_removal_time = decoder_model->removal_time; + assert(previous_removal_time < removal_time); + decoder_model->removal_time = removal_time; + decoder_model->decode_samples = luma_pic_size; + const double this_decode_rate = + previous_decode_samples / (removal_time - previous_removal_time); + decoder_model->max_decode_rate = + AOMMAX(decoder_model->max_decode_rate, this_decode_rate); + + // A frame with show_existing_frame being false indicates the end of a DFG. + // Update the bits arrival time of this DFG. + const double buffer_delay = (decoder_model->encoder_buffer_delay + + decoder_model->decoder_buffer_delay) / + 90000.0; + const double latest_arrival_time = removal_time - buffer_delay; + decoder_model->first_bit_arrival_time = + AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time); + decoder_model->last_bit_arrival_time = + decoder_model->first_bit_arrival_time + + (double)decoder_model->coded_bits / decoder_model->bit_rate; + // Smoothing buffer underflows if the last bit arrives after the removal + // time. 
+ if (decoder_model->last_bit_arrival_time > removal_time && + !decoder_model->is_low_delay_mode) { + decoder_model->status = SMOOTHING_BUFFER_UNDERFLOW; + return; + } + // Reset the coded bits for the next DFG. + decoder_model->coded_bits = 0; + + // Check if the smoothing buffer overflows. + DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue; + if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) { + assert(0); + } + const double first_bit_arrival_time = decoder_model->first_bit_arrival_time; + const double last_bit_arrival_time = decoder_model->last_bit_arrival_time; + // Remove the DFGs with removal time earlier than last_bit_arrival_time. + while (queue->buf[queue->head].removal_time <= last_bit_arrival_time && + queue->size > 0) { + if (queue->buf[queue->head].removal_time - first_bit_arrival_time + + queue->total_interval > + 1.0) { + decoder_model->status = SMOOTHING_BUFFER_OVERFLOW; + return; + } + queue->total_interval -= queue->buf[queue->head].last_bit_arrival_time - + queue->buf[queue->head].first_bit_arrival_time; + queue->head = (queue->head + 1) % DFG_INTERVAL_QUEUE_SIZE; + --queue->size; + } + // Push current DFG into the queue. + const int queue_index = + (queue->head + queue->size++) % DFG_INTERVAL_QUEUE_SIZE; + queue->buf[queue_index].first_bit_arrival_time = first_bit_arrival_time; + queue->buf[queue_index].last_bit_arrival_time = last_bit_arrival_time; + queue->buf[queue_index].removal_time = removal_time; + queue->total_interval += last_bit_arrival_time - first_bit_arrival_time; + // The smoothing buffer can hold at most "bit_rate" bits, which is + // equivalent to 1 second of total interval. + if (queue->total_interval > 1.0) { + decoder_model->status = SMOOTHING_BUFFER_OVERFLOW; + return; + } + + release_processed_frames(decoder_model, removal_time); + decoder_model->current_time = + removal_time + time_to_decode_frame(cm, decoder_model->decode_rate); + + const int cfbi = get_free_buffer(decoder_model); + if (cfbi < 0) { + decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE; + return; + } + const CurrentFrame *const current_frame = &cm->current_frame; + decoder_model->frame_buffer_pool[cfbi].frame_type = + cm->current_frame.frame_type; + display_idx = cfbi; + update_ref_buffers(decoder_model, cfbi, current_frame->refresh_frame_flags); + + if (decoder_model->initial_presentation_delay < 0.0) { + // Display can begin after required number of frames have been buffered. + if (frames_in_buffer_pool(decoder_model) >= + decoder_model->initial_display_delay - 1) { + decoder_model->initial_presentation_delay = decoder_model->current_time; + // Update presentation time for each shown frame in the frame buffer. + for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) { + FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[i]; + if (this_buffer->player_ref_count == 0) continue; + assert(this_buffer->display_index >= 0); + this_buffer->presentation_time = + get_presentation_time(decoder_model, this_buffer->display_index); + } + } + } + } + + // Display. 
+ if (show_frame) { + assert(display_idx >= 0 && display_idx < BUFFER_POOL_MAX_SIZE); + FRAME_BUFFER *const this_buffer = + &decoder_model->frame_buffer_pool[display_idx]; + ++this_buffer->player_ref_count; + this_buffer->display_index = decoder_model->num_shown_frame; + const double presentation_time = + get_presentation_time(decoder_model, this_buffer->display_index); + this_buffer->presentation_time = presentation_time; + if (presentation_time >= 0.0 && + decoder_model->current_time > presentation_time) { + decoder_model->status = DISPLAY_FRAME_LATE; + return; + } + + const int previous_display_samples = decoder_model->display_samples; + const double previous_presentation_time = decoder_model->presentation_time; + decoder_model->display_samples = luma_pic_size; + decoder_model->presentation_time = presentation_time; + if (presentation_time >= 0.0 && previous_presentation_time >= 0.0) { + assert(previous_presentation_time < presentation_time); + const double this_display_rate = + previous_display_samples / + (presentation_time - previous_presentation_time); + decoder_model->max_display_rate = + AOMMAX(decoder_model->max_display_rate, this_display_rate); + } + } +} + +void av1_init_level_info(AV1_COMP *cpi) { + for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) { + AV1LevelInfo *const this_level_info = + cpi->ppi->level_params.level_info[op_index]; + if (!this_level_info) continue; + memset(this_level_info, 0, sizeof(*this_level_info)); + AV1LevelSpec *const level_spec = &this_level_info->level_spec; + level_spec->level = SEQ_LEVEL_MAX; + AV1LevelStats *const level_stats = &this_level_info->level_stats; + level_stats->min_cropped_tile_width = INT_MAX; + level_stats->min_cropped_tile_height = INT_MAX; + level_stats->min_frame_width = INT_MAX; + level_stats->min_frame_height = INT_MAX; + level_stats->tile_width_is_valid = 1; + level_stats->min_cr = 1e8; + + FrameWindowBuffer *const frame_window_buffer = + &this_level_info->frame_window_buffer; + frame_window_buffer->num = 0; + frame_window_buffer->start = 0; + + const AV1_COMMON *const cm = &cpi->common; + const int upscaled_width = cm->superres_upscaled_width; + const int height = cm->height; + const int pic_size = upscaled_width * height; + for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) { + DECODER_MODEL *const this_model = &this_level_info->decoder_models[level]; + const AV1LevelSpec *const spec = &av1_level_defs[level]; + if (upscaled_width > spec->max_h_size || height > spec->max_v_size || + pic_size > spec->max_picture_size) { + // Turn off decoder model for this level as the frame size already + // exceeds level constraints. + this_model->status = DECODER_MODEL_DISABLED; + } else { + av1_decoder_model_init(cpi, level, op_index, this_model); + } + } + } +} + +static double get_min_cr(const AV1LevelSpec *const level_spec, int tier, + int is_still_picture, int64_t decoded_sample_rate) { + if (is_still_picture) return 0.8; + if (level_spec->level < SEQ_LEVEL_4_0) tier = 0; + const double min_cr_basis = tier ? 
level_spec->high_cr : level_spec->main_cr; + const double speed_adj = + (double)decoded_sample_rate / level_spec->max_display_rate; + return AOMMAX(min_cr_basis * speed_adj, 0.8); +} + +double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier, + int is_still_picture) { + assert(is_valid_seq_level_idx(level_index)); + const AV1LevelSpec *const level_spec = &av1_level_defs[level_index]; + return get_min_cr(level_spec, tier, is_still_picture, + level_spec->max_decode_rate); +} + +static void get_temporal_parallel_params(int scalability_mode_idc, + int *temporal_parallel_num, + int *temporal_parallel_denom) { + if (scalability_mode_idc < 0) { + *temporal_parallel_num = 1; + *temporal_parallel_denom = 1; + return; + } + + // TODO(huisu@): handle scalability cases. + if (scalability_mode_idc == SCALABILITY_SS) { + (void)scalability_mode_idc; + } else { + (void)scalability_mode_idc; + } +} + +#define MIN_CROPPED_TILE_WIDTH 8 +#define MIN_CROPPED_TILE_HEIGHT 8 +#define MIN_FRAME_WIDTH 16 +#define MIN_FRAME_HEIGHT 16 +#define MAX_TILE_SIZE_HEADER_RATE_PRODUCT 588251136 + +static TARGET_LEVEL_FAIL_ID check_level_constraints( + const AV1LevelInfo *const level_info, AV1_LEVEL level, int tier, + int is_still_picture, BITSTREAM_PROFILE profile, int check_bitrate) { + const DECODER_MODEL *const decoder_model = &level_info->decoder_models[level]; + const DECODER_MODEL_STATUS decoder_model_status = decoder_model->status; + if (decoder_model_status != DECODER_MODEL_OK && + decoder_model_status != DECODER_MODEL_DISABLED) { + return DECODER_MODEL_FAIL; + } + + const AV1LevelSpec *const level_spec = &level_info->level_spec; + const AV1LevelSpec *const target_level_spec = &av1_level_defs[level]; + const AV1LevelStats *const level_stats = &level_info->level_stats; + TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK; + do { + if (level_spec->max_picture_size > target_level_spec->max_picture_size) { + fail_id = LUMA_PIC_SIZE_TOO_LARGE; + break; + } + + if (level_spec->max_h_size > target_level_spec->max_h_size) { + fail_id = LUMA_PIC_H_SIZE_TOO_LARGE; + break; + } + + if (level_spec->max_v_size > target_level_spec->max_v_size) { + fail_id = LUMA_PIC_V_SIZE_TOO_LARGE; + break; + } + + if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) { + fail_id = TOO_MANY_TILE_COLUMNS; + break; + } + + if (level_spec->max_tiles > target_level_spec->max_tiles) { + fail_id = TOO_MANY_TILES; + break; + } + + if (level_spec->max_header_rate > target_level_spec->max_header_rate) { + fail_id = FRAME_HEADER_RATE_TOO_HIGH; + break; + } + + if (decoder_model->max_display_rate > + (double)target_level_spec->max_display_rate) { + fail_id = DISPLAY_RATE_TOO_HIGH; + break; + } + + // TODO(huisu): we are not using max decode rate calculated by the decoder + // model because the model in resource availability mode always returns + // MaxDecodeRate(as in the level definitions) as the max decode rate. + if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) { + fail_id = DECODE_RATE_TOO_HIGH; + break; + } + + if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) { + fail_id = TILE_RATE_TOO_HIGH; + break; + } + +#if CONFIG_CWG_C013 + const int max_tile_size = (level >= SEQ_LEVEL_7_0 && level <= SEQ_LEVEL_8_3) + ? 
MAX_TILE_AREA_LEVEL_7_AND_ABOVE + : MAX_TILE_AREA; +#else + const int max_tile_size = MAX_TILE_AREA; +#endif + if (level_stats->max_tile_size > max_tile_size) { + fail_id = TILE_TOO_LARGE; + break; + } + + if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) { + fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE; + break; + } + + if (level_stats->min_cropped_tile_width < MIN_CROPPED_TILE_WIDTH) { + fail_id = CROPPED_TILE_WIDTH_TOO_SMALL; + break; + } + + if (level_stats->min_cropped_tile_height < MIN_CROPPED_TILE_HEIGHT) { + fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL; + break; + } + + if (level_stats->min_frame_width < MIN_FRAME_WIDTH) { + fail_id = LUMA_PIC_H_SIZE_TOO_SMALL; + break; + } + + if (level_stats->min_frame_height < MIN_FRAME_HEIGHT) { + fail_id = LUMA_PIC_V_SIZE_TOO_SMALL; + break; + } + + if (!level_stats->tile_width_is_valid) { + fail_id = TILE_WIDTH_INVALID; + break; + } + + const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture, + level_spec->max_decode_rate); + if (level_stats->min_cr < min_cr) { + fail_id = CR_TOO_SMALL; + break; + } + + if (check_bitrate) { + // Check average bitrate instead of max_bitrate. + const double bitrate_limit = + get_max_bitrate(target_level_spec, tier, profile); + const double avg_bitrate = level_stats->total_compressed_size * 8.0 / + level_stats->total_time_encoded; + if (avg_bitrate > bitrate_limit) { + fail_id = BITRATE_TOO_HIGH; + break; + } + } + + if (target_level_spec->level > SEQ_LEVEL_5_1) { + int temporal_parallel_num; + int temporal_parallel_denom; + const int scalability_mode_idc = -1; + get_temporal_parallel_params(scalability_mode_idc, &temporal_parallel_num, + &temporal_parallel_denom); + const int val = level_stats->max_tile_size * level_spec->max_header_rate * + temporal_parallel_denom / temporal_parallel_num; + if (val > MAX_TILE_SIZE_HEADER_RATE_PRODUCT) { + fail_id = TILE_SIZE_HEADER_RATE_TOO_HIGH; + break; + } + } + } while (0); + + return fail_id; +} + +static void get_tile_stats(const AV1_COMMON *const cm, + const TileDataEnc *const tile_data, + int *max_tile_size, int *max_superres_tile_width, + int *min_cropped_tile_width, + int *min_cropped_tile_height, + int *tile_width_valid) { + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const int superres_scale_denominator = cm->superres_scale_denominator; + + *max_tile_size = 0; + *max_superres_tile_width = 0; + *min_cropped_tile_width = INT_MAX; + *min_cropped_tile_height = INT_MAX; + *tile_width_valid = 1; + + for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { + const TileInfo *const tile_info = + &tile_data[tile_row * cm->tiles.cols + tile_col].tile_info; + const int tile_width = + (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE; + const int tile_height = + (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE; + const int tile_size = tile_width * tile_height; + *max_tile_size = AOMMAX(*max_tile_size, tile_size); + + const int supperres_tile_width = + tile_width * superres_scale_denominator / SCALE_NUMERATOR; + *max_superres_tile_width = + AOMMAX(*max_superres_tile_width, supperres_tile_width); + + const int cropped_tile_width = + cm->width - tile_info->mi_col_start * MI_SIZE; + const int cropped_tile_height = + cm->height - tile_info->mi_row_start * MI_SIZE; + *min_cropped_tile_width = + AOMMIN(*min_cropped_tile_width, cropped_tile_width); + *min_cropped_tile_height = + AOMMIN(*min_cropped_tile_height, cropped_tile_height); + + const int 
is_right_most_tile = + tile_info->mi_col_end == cm->mi_params.mi_cols; + if (!is_right_most_tile) { + if (av1_superres_scaled(cm)) + *tile_width_valid &= tile_width >= 128; + else + *tile_width_valid &= tile_width >= 64; + } + } + } +} + +static int store_frame_record(int64_t ts_start, int64_t ts_end, + size_t encoded_size, int pic_size, + int frame_header_count, int tiles, int show_frame, + int show_existing_frame, + FrameWindowBuffer *const buffer) { + if (buffer->num < FRAME_WINDOW_SIZE) { + ++buffer->num; + } else { + buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE; + } + const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE; + FrameRecord *const record = &buffer->buf[new_idx]; + record->ts_start = ts_start; + record->ts_end = ts_end; + record->encoded_size_in_bytes = encoded_size; + record->pic_size = pic_size; + record->frame_header_count = frame_header_count; + record->tiles = tiles; + record->show_frame = show_frame; + record->show_existing_frame = show_existing_frame; + + return new_idx; +} + +// Count the number of frames encoded in the last "duration" ticks, in display +// time. +static int count_frames(const FrameWindowBuffer *const buffer, + int64_t duration) { + const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE; + // Assume current frame is shown frame. + assert(buffer->buf[current_idx].show_frame); + + const int64_t current_time = buffer->buf[current_idx].ts_end; + const int64_t time_limit = AOMMAX(current_time - duration, 0); + int num_frames = 1; + int index = current_idx - 1; + for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) { + if (index < 0) index = FRAME_WINDOW_SIZE - 1; + const FrameRecord *const record = &buffer->buf[index]; + if (!record->show_frame) continue; + const int64_t ts_start = record->ts_start; + if (ts_start < time_limit) break; + } + + return num_frames; +} + +// Scan previously encoded frames and update level metrics accordingly. +static void scan_past_frames(const FrameWindowBuffer *const buffer, + int num_frames_to_scan, + AV1LevelSpec *const level_spec, + AV1LevelStats *const level_stats) { + const int num_frames_in_buffer = buffer->num; + int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE; + int frame_headers = 0; + int tiles = 0; + int64_t display_samples = 0; + int64_t decoded_samples = 0; + size_t encoded_size_in_bytes = 0; + for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) { + const FrameRecord *const record = &buffer->buf[index]; + if (!record->show_existing_frame) { + frame_headers += record->frame_header_count; + decoded_samples += record->pic_size; + } + if (record->show_frame) { + display_samples += record->pic_size; + } + tiles += record->tiles; + encoded_size_in_bytes += record->encoded_size_in_bytes; + --index; + if (index < 0) index = FRAME_WINDOW_SIZE - 1; + } + level_spec->max_header_rate = + AOMMAX(level_spec->max_header_rate, frame_headers); + // TODO(huisu): we can now compute max display rate with the decoder model, so + // these couple of lines can be removed. Keep them here for a while for + // debugging purpose. 
+ level_spec->max_display_rate = + AOMMAX(level_spec->max_display_rate, display_samples); + level_spec->max_decode_rate = + AOMMAX(level_spec->max_decode_rate, decoded_samples); + level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles); + level_stats->max_bitrate = + AOMMAX(level_stats->max_bitrate, + (int)AOMMIN(encoded_size_in_bytes * 8, (size_t)INT_MAX)); +} + +void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, + int64_t ts_end) { + AV1_COMMON *const cm = &cpi->common; + const AV1LevelParams *const level_params = &cpi->ppi->level_params; + + const int upscaled_width = cm->superres_upscaled_width; + const int width = cm->width; + const int height = cm->height; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const int tiles = tile_cols * tile_rows; + const int luma_pic_size = upscaled_width * height; + const int frame_header_count = cpi->frame_header_count; + const int show_frame = cm->show_frame; + const int show_existing_frame = cm->show_existing_frame; + + int max_tile_size; + int min_cropped_tile_width; + int min_cropped_tile_height; + int max_superres_tile_width; + int tile_width_is_valid; + get_tile_stats(cm, cpi->tile_data, &max_tile_size, &max_superres_tile_width, + &min_cropped_tile_width, &min_cropped_tile_height, + &tile_width_is_valid); + + const double compression_ratio = av1_get_compression_ratio(cm, size); + + const int temporal_layer_id = cm->temporal_layer_id; + const int spatial_layer_id = cm->spatial_layer_id; + const SequenceHeader *const seq_params = cm->seq_params; + const BITSTREAM_PROFILE profile = seq_params->profile; + const int is_still_picture = seq_params->still_picture; + // update level_stats + // TODO(kyslov@) fix the implementation according to buffer model + for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) { + if (!is_in_operating_point(seq_params->operating_point_idc[i], + temporal_layer_id, spatial_layer_id) || + !((level_params->keep_level_stats >> i) & 1)) { + continue; + } + + AV1LevelInfo *const level_info = level_params->level_info[i]; + assert(level_info != NULL); + AV1LevelStats *const level_stats = &level_info->level_stats; + + level_stats->max_tile_size = + AOMMAX(level_stats->max_tile_size, max_tile_size); + level_stats->max_superres_tile_width = + AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width); + level_stats->min_cropped_tile_width = + AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width); + level_stats->min_cropped_tile_height = + AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height); + level_stats->tile_width_is_valid &= tile_width_is_valid; + level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width); + level_stats->min_frame_height = + AOMMIN(level_stats->min_frame_height, height); + level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio); + level_stats->total_compressed_size += (double)size; + + // update level_spec + // TODO(kyslov@) update all spec fields + AV1LevelSpec *const level_spec = &level_info->level_spec; + level_spec->max_picture_size = + AOMMAX(level_spec->max_picture_size, luma_pic_size); + level_spec->max_h_size = + AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width); + level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height); + level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols); + level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles); + + // Store info. of current frame into FrameWindowBuffer. 
+ FrameWindowBuffer *const buffer = &level_info->frame_window_buffer; + store_frame_record(ts_start, ts_end, size, luma_pic_size, + frame_header_count, tiles, show_frame, + show_existing_frame, buffer); + if (show_frame) { + // Count the number of frames encoded in the past 1 second. + const int encoded_frames_in_last_second = + show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0; + scan_past_frames(buffer, encoded_frames_in_last_second, level_spec, + level_stats); + level_stats->total_time_encoded += + (cpi->time_stamps.prev_ts_end - cpi->time_stamps.prev_ts_start) / + (double)TICKS_PER_SEC; + } + + DECODER_MODEL *const decoder_models = level_info->decoder_models; + for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) { + av1_decoder_model_process_frame(cpi, size << 3, &decoder_models[level]); + } + + // Check whether target level is met. + const AV1_LEVEL target_level = level_params->target_seq_level_idx[i]; + if (target_level < SEQ_LEVELS && cpi->oxcf.strict_level_conformance) { + assert(is_valid_seq_level_idx(target_level)); + const int tier = seq_params->tier[i]; + const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints( + level_info, target_level, tier, is_still_picture, profile, 0); + if (fail_id != TARGET_LEVEL_OK) { + const int target_level_major = 2 + (target_level >> 2); + const int target_level_minor = target_level & 3; + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Failed to encode to the target level %d_%d. %s", + target_level_major, target_level_minor, + level_fail_messages[fail_id]); + } + } + } +} + +aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params, + const AV1LevelParams *level_params, + int *seq_level_idx) { + const int is_still_picture = seq_params->still_picture; + const BITSTREAM_PROFILE profile = seq_params->profile; + for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) { + seq_level_idx[op] = (int)SEQ_LEVEL_MAX; + if (!((level_params->keep_level_stats >> op) & 1)) continue; + const int tier = seq_params->tier[op]; + const AV1LevelInfo *const level_info = level_params->level_info[op]; + assert(level_info != NULL); + for (int level = 0; level < SEQ_LEVELS; ++level) { + if (!is_valid_seq_level_idx(level)) continue; + const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints( + level_info, level, tier, is_still_picture, profile, 1); + if (fail_id == TARGET_LEVEL_OK) { + seq_level_idx[op] = level; + break; + } + } + } + + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params, + const AV1LevelParams *level_params, + int *target_seq_level_idx) { + for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) { + target_seq_level_idx[op] = (int)SEQ_LEVEL_MAX; + if (!((level_params->keep_level_stats >> op) & 1)) continue; + target_seq_level_idx[op] = level_params->target_seq_level_idx[op]; + } + + return AOM_CODEC_OK; +} diff --git a/third_party/aom/av1/encoder/level.h b/third_party/aom/av1/encoder/level.h new file mode 100644 index 0000000000..ebf2a1c19d --- /dev/null +++ b/third_party/aom/av1/encoder/level.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_LEVEL_H_ +#define AOM_AV1_ENCODER_LEVEL_H_ + +#include "av1/common/enums.h" + +struct AV1_COMP; + +// AV1 Level Specifications +typedef struct { + AV1_LEVEL level; + int max_picture_size; + int max_h_size; + int max_v_size; + int max_header_rate; + int max_tile_rate; + int max_tiles; + int max_tile_cols; + int64_t max_display_rate; + int64_t max_decode_rate; + double main_mbps; + double high_mbps; + double main_cr; + double high_cr; +} AV1LevelSpec; + +typedef struct { + int64_t ts_start; + int64_t ts_end; + size_t encoded_size_in_bytes; + int pic_size; + int frame_header_count; + int tiles; + int show_frame; + int show_existing_frame; +} FrameRecord; + +// Record frame info. in a rolling window. +#define FRAME_WINDOW_SIZE 256 +typedef struct { + FrameRecord buf[FRAME_WINDOW_SIZE]; + int num; // Number of FrameRecord stored in the buffer. + int start; // Buffer index of the first FrameRecord. +} FrameWindowBuffer; + +typedef struct { + int max_bitrate; // Max bitrate in any 1-second window, in bps. + int max_tile_size; + int max_superres_tile_width; + int min_cropped_tile_width; + int min_cropped_tile_height; + int tile_width_is_valid; + int min_frame_width; + int min_frame_height; + double total_compressed_size; // In bytes. + double total_time_encoded; // In seconds. + double min_cr; +} AV1LevelStats; + +// The following data structures are for the decoder model. +typedef struct { + int decoder_ref_count; + int player_ref_count; + int display_index; + FRAME_TYPE frame_type; + double presentation_time; +} FRAME_BUFFER; + +// Interval of bits transmission for a DFG(Decodable Frame Group). +typedef struct { + double first_bit_arrival_time; // Time when the first bit arrives. + double last_bit_arrival_time; // Time when the last bit arrives. + // Removal time means the time when the bits to be decoded are removed from + // the smoothing buffer. Removal time is essentially the time when the + // decoding of the frame starts. + double removal_time; +} DFG_INTERVAL; + +#define DFG_INTERVAL_QUEUE_SIZE 64 +typedef struct { + int head; + int size; + double total_interval; + DFG_INTERVAL buf[DFG_INTERVAL_QUEUE_SIZE]; +} DFG_INTERVAL_QUEUE; + +enum { + RESOURCE_MODE = 0, // Resource availability mode. + SCHEDULE_MODE // Decoding schedule mode. +} UENUM1BYTE(DECODER_MODEL_MODE); + +enum { + DECODER_MODEL_OK = 0, + DECODE_BUFFER_AVAILABLE_LATE, + DECODE_FRAME_BUF_UNAVAILABLE, + DECODE_EXISTING_FRAME_BUF_EMPTY, + DISPLAY_FRAME_LATE, + SMOOTHING_BUFFER_UNDERFLOW, + SMOOTHING_BUFFER_OVERFLOW, + DECODER_MODEL_DISABLED +} UENUM1BYTE(DECODER_MODEL_STATUS); + +#define BUFFER_POOL_MAX_SIZE 10 +typedef struct { + DECODER_MODEL_STATUS status; + DECODER_MODEL_MODE mode; + bool is_low_delay_mode; + AV1_LEVEL level; + int encoder_buffer_delay; // In units of 1/90000 seconds. + int decoder_buffer_delay; // In units of 1/90000 seconds. + int num_ticks_per_picture; + int initial_display_delay; // In units of frames. + int64_t decode_rate; + double display_clock_tick; // In units of seconds. + double current_time; // In units of seconds. + double initial_presentation_delay; // In units of seconds. + double bit_rate; // Bits per second. + + int num_frame; + int num_decoded_frame; + int num_shown_frame; + int vbi[REF_FRAMES]; // Virtual buffer index. 
+ FRAME_BUFFER frame_buffer_pool[BUFFER_POOL_MAX_SIZE]; + DFG_INTERVAL_QUEUE dfg_interval_queue; + + // Information for the DFG(Decodable Frame Group) being processed. + double first_bit_arrival_time; + double last_bit_arrival_time; + size_t coded_bits; + + // Information for the frame being processed. + double removal_time; + double presentation_time; + int decode_samples; + int display_samples; + + double max_display_rate; + double max_decode_rate; +} DECODER_MODEL; + +typedef struct { + AV1LevelStats level_stats; + AV1LevelSpec level_spec; + FrameWindowBuffer frame_window_buffer; + DECODER_MODEL decoder_models[SEQ_LEVELS]; +} AV1LevelInfo; + +typedef struct AV1LevelParams { + // Specifies the level that the coded video sequence conforms to for each + // operating point. + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + // Bit mask to indicate whether to keep level stats for corresponding + // operating points. + uint32_t keep_level_stats; + // Level information for each operating point. + AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS]; +} AV1LevelParams; + +static INLINE int is_in_operating_point(int operating_point, + int temporal_layer_id, + int spatial_layer_id) { + if (!operating_point) return 1; + + return ((operating_point >> temporal_layer_id) & 1) && + ((operating_point >> (spatial_layer_id + 8)) & 1); +} + +void av1_init_level_info(struct AV1_COMP *cpi); + +void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start, + int64_t ts_end); + +// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS]. +aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params, + const AV1LevelParams *level_params, + int *seq_level_idx); + +aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params, + const AV1LevelParams *level_params, + int *target_seq_level_idx); + +// Print the status of the decoder model(for debugging). +void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model); + +void av1_decoder_model_init(const struct AV1_COMP *const cpi, AV1_LEVEL level, + int op_index, DECODER_MODEL *const decoder_model); + +void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi, + size_t coded_bits, + DECODER_MODEL *const decoder_model); + +// This function uses the decoder model to check whether there could be +// SMOOTHING_BUFFER_UNDERFLOW or SMOOTHING_BUFFER_OVERFLOW. It does not +// update the content of decoder_model, and can be used to target certain +// encoding level in the recode loop. +DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf( + const struct AV1_COMP *const cpi, size_t coded_bits, + const DECODER_MODEL *const decoder_model); + +// Return max bitrate(bps) for given level. +double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier, + BITSTREAM_PROFILE profile); + +// Get max number of tiles and tile columns for given level. +void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles, + int *const max_tile_cols); + +// Return minimum compression ratio for given level. +double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier, + int is_still_picture); +#endif // AOM_AV1_ENCODER_LEVEL_H_ diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c new file mode 100644 index 0000000000..9ef9b88675 --- /dev/null +++ b/third_party/aom/av1/encoder/lookahead.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <assert.h> +#include <stdlib.h> + +#include "config/aom_config.h" + +#include "aom_scale/yv12config.h" +#include "av1/common/common.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/lookahead.h" + +/* Return the buffer at the given absolute index and increment the index */ +static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) { + int index = *idx; + struct lookahead_entry *buf = ctx->buf + index; + + assert(index < ctx->max_sz); + if (++index >= ctx->max_sz) index -= ctx->max_sz; + *idx = index; + return buf; +} + +void av1_lookahead_destroy(struct lookahead_ctx *ctx) { + if (ctx) { + if (ctx->buf) { + int i; + + for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img); + free(ctx->buf); + } + free(ctx); + } +} + +struct lookahead_ctx *av1_lookahead_init( + unsigned int width, unsigned int height, unsigned int subsampling_x, + unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, + const int border_in_pixels, int byte_alignment, int num_lap_buffers, + bool is_all_intra, int num_pyramid_levels) { + int lag_in_frames = AOMMAX(1, depth); + + // For all-intra frame encoding, previous source frames are not required. + // Hence max_pre_frames is set to 0 in this case. As previous source frames + // are accessed using a negative index to av1_lookahead_peek(), setting + // max_pre_frames to 0 will cause av1_lookahead_peek() to return NULL for a + // negative index. + const uint8_t max_pre_frames = is_all_intra ? 0 : MAX_PRE_FRAMES; + + // Add the lags to depth and clamp + depth += num_lap_buffers; + depth = clamp(depth, 1, MAX_TOTAL_BUFFERS); + + // Allocate memory to keep previous source frames available. + depth += max_pre_frames; + + // Allocate the lookahead structures + struct lookahead_ctx *ctx = calloc(1, sizeof(*ctx)); + if (ctx) { + unsigned int i; + ctx->max_sz = depth; + ctx->push_frame_count = 0; + ctx->max_pre_frames = max_pre_frames; + ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - ctx->max_pre_frames; + ctx->read_ctxs[ENCODE_STAGE].valid = 1; + if (num_lap_buffers) { + ctx->read_ctxs[LAP_STAGE].pop_sz = lag_in_frames; + ctx->read_ctxs[LAP_STAGE].valid = 1; + } + ctx->buf = calloc(depth, sizeof(*ctx->buf)); + if (!ctx->buf) goto fail; + for (i = 0; i < depth; i++) { + if (aom_realloc_frame_buffer( + &ctx->buf[i].img, width, height, subsampling_x, subsampling_y, + use_highbitdepth, border_in_pixels, byte_alignment, NULL, NULL, + NULL, num_pyramid_levels, 0)) { + goto fail; + } + } + } + return ctx; +fail: + av1_lookahead_destroy(ctx); + return NULL; +} + +int av1_lookahead_full(const struct lookahead_ctx *ctx) { + // TODO(angiebird): Test this function.
+ return ctx->read_ctxs[ENCODE_STAGE].sz >= ctx->read_ctxs[ENCODE_STAGE].pop_sz; +} + +int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, int use_highbitdepth, + int num_pyramid_levels, aom_enc_frame_flags_t flags) { + int width = src->y_crop_width; + int height = src->y_crop_height; + int uv_width = src->uv_crop_width; + int uv_height = src->uv_crop_height; + int subsampling_x = src->subsampling_x; + int subsampling_y = src->subsampling_y; + int larger_dimensions, new_dimensions; + + assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1); + if (ctx->read_ctxs[ENCODE_STAGE].sz + ctx->max_pre_frames > ctx->max_sz) + return 1; + + ctx->read_ctxs[ENCODE_STAGE].sz++; + if (ctx->read_ctxs[LAP_STAGE].valid) { + ctx->read_ctxs[LAP_STAGE].sz++; + } + + struct lookahead_entry *buf = pop(ctx, &ctx->write_idx); + + new_dimensions = width != buf->img.y_crop_width || + height != buf->img.y_crop_height || + uv_width != buf->img.uv_crop_width || + uv_height != buf->img.uv_crop_height; + larger_dimensions = width > buf->img.y_width || height > buf->img.y_height || + uv_width > buf->img.uv_width || + uv_height > buf->img.uv_height; + assert(!larger_dimensions || new_dimensions); + + if (larger_dimensions) { + YV12_BUFFER_CONFIG new_img; + memset(&new_img, 0, sizeof(new_img)); + if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x, + subsampling_y, use_highbitdepth, + AOM_BORDER_IN_PIXELS, 0, num_pyramid_levels, 0)) + return 1; + aom_free_frame_buffer(&buf->img); + buf->img = new_img; + } else if (new_dimensions) { + buf->img.y_crop_width = src->y_crop_width; + buf->img.y_crop_height = src->y_crop_height; + buf->img.uv_crop_width = src->uv_crop_width; + buf->img.uv_crop_height = src->uv_crop_height; + buf->img.subsampling_x = src->subsampling_x; + buf->img.subsampling_y = src->subsampling_y; + } + // Partial copy not implemented yet + av1_copy_and_extend_frame(src, &buf->img); + + buf->ts_start = ts_start; + buf->ts_end = ts_end; + buf->display_idx = ctx->push_frame_count; + buf->flags = flags; + ++ctx->push_frame_count; + aom_remove_metadata_from_frame_buffer(&buf->img); + if (src->metadata && + aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata)) { + return 1; + } + return 0; +} + +struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain, + COMPRESSOR_STAGE stage) { + struct lookahead_entry *buf = NULL; + if (ctx) { + struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; + assert(read_ctx->valid == 1); + if (read_ctx->sz && (drain || read_ctx->sz == read_ctx->pop_sz)) { + buf = pop(ctx, &read_ctx->read_idx); + read_ctx->sz--; + } + } + return buf; +} + +struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index, + COMPRESSOR_STAGE stage) { + struct lookahead_entry *buf = NULL; + if (ctx == NULL) { + return buf; + } + + struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; + assert(read_ctx->valid == 1); + if (index >= 0) { + // Forward peek + if (index < read_ctx->sz) { + index += read_ctx->read_idx; + if (index >= ctx->max_sz) index -= ctx->max_sz; + buf = ctx->buf + index; + } + } else if (index < 0) { + // Backward peek + if (-index <= ctx->max_pre_frames) { + index += (int)(read_ctx->read_idx); + if (index < 0) index += (int)(ctx->max_sz); + buf = ctx->buf + index; + } + } + + return buf; +} + +unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx, + COMPRESSOR_STAGE stage) { + assert(ctx != NULL); + + struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; + assert(read_ctx->valid 
== 1); + return read_ctx->sz; +} + +int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) { + assert(ctx != NULL); + + struct read_ctx *read_ctx = &ctx->read_ctxs[stage]; + assert(read_ctx->valid == 1); + return read_ctx->pop_sz; +} diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h new file mode 100644 index 0000000000..c0e6d222f5 --- /dev/null +++ b/third_party/aom/av1/encoder/lookahead.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes look ahead buffer operations. + */ +#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_ +#define AOM_AV1_ENCODER_LOOKAHEAD_H_ + +#include + +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ +#define MAX_LAG_BUFFERS 48 +#define MAX_LAP_BUFFERS 48 +#define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS) +#define LAP_LAG_IN_FRAMES 17 + +struct lookahead_entry { + YV12_BUFFER_CONFIG img; + int64_t ts_start; + int64_t ts_end; + int display_idx; + aom_enc_frame_flags_t flags; +}; + +// The max of past frames we want to keep in the queue. +#define MAX_PRE_FRAMES 1 + +enum { ENCODE_STAGE, LAP_STAGE, MAX_STAGES } UENUM1BYTE(COMPRESSOR_STAGE); + +struct read_ctx { + int sz; /* Number of buffers currently in the queue */ + int read_idx; /* Read index */ + int pop_sz; /* Size to check for pop condition */ + int valid; /* Is this ctx valid? */ +}; + +struct lookahead_ctx { + int max_sz; /* Absolute size of the queue */ + int write_idx; /* Write index */ + struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */ + struct lookahead_entry *buf; /* Buffer list */ + int push_frame_count; /* Number of frames that have been pushed in the queue*/ + uint8_t + max_pre_frames; /* Maximum number of past frames allowed in the queue */ +}; +/*!\endcond */ + +/**\brief Initializes the lookahead stage + * + * The lookahead stage is a queue of frame buffers on which some analysis + * may be done when buffers are enqueued. + */ +struct lookahead_ctx *av1_lookahead_init( + unsigned int width, unsigned int height, unsigned int subsampling_x, + unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, + const int border_in_pixels, int byte_alignment, int num_lap_buffers, + bool is_all_intra, int num_pyramid_levels); + +/**\brief Destroys the lookahead stage + */ +void av1_lookahead_destroy(struct lookahead_ctx *ctx); + +/**\brief Check if lookahead buffer is full + */ +int av1_lookahead_full(const struct lookahead_ctx *ctx); + +/**\brief Enqueue a source buffer + * + * This function will copy the source image into a new framebuffer with + * the expected stride/border. 
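+ *
+ * Minimal usage sketch of the queue around this call (hypothetical caller-side
+ * variables, error handling omitted):
+ *
+ *   struct lookahead_ctx *la = av1_lookahead_init(
+ *       width, height, ss_x, ss_y, use_hbd, depth, border, byte_alignment,
+ *       num_lap_buffers, is_all_intra, num_pyramid_levels);
+ *   if (!av1_lookahead_full(la))
+ *     av1_lookahead_push(la, src, ts_start, ts_end, use_hbd,
+ *                        num_pyramid_levels, flags);
+ *   struct lookahead_entry *e = av1_lookahead_pop(la, 0, ENCODE_STAGE);
+ *   av1_lookahead_destroy(la);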
+ * + * \param[in] ctx Pointer to the lookahead context + * \param[in] src Pointer to the image to enqueue + * \param[in] ts_start Timestamp for the start of this frame + * \param[in] ts_end Timestamp for the end of this frame + * \param[in] use_highbitdepth Tell if HBD is used + * \param[in] num_pyramid_levels Number of pyramid levels to allocate + for each frame buffer + * \param[in] flags Flags set on this frame + */ +int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, int use_highbitdepth, + int num_pyramid_levels, aom_enc_frame_flags_t flags); + +/**\brief Get the next source buffer to encode + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] drain Flag indicating the buffer should be drained + * (return a buffer regardless of the current queue depth) + * \param[in] stage Encoder stage + * + * \retval Return NULL, if drain set and queue is empty, or if drain not set and + * queue not of the configured depth. + */ +struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain, + COMPRESSOR_STAGE stage); + +/**\brief Get a future source buffer to encode + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] index Index of the frame to be returned, 0 == next frame + * \param[in] stage Encoder stage + * + * \retval Return NULL, if no buffer exists at the specified index + */ +struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index, + COMPRESSOR_STAGE stage); + +/**\brief Get the number of frames currently in the lookahead queue + */ +unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx, + COMPRESSOR_STAGE stage); + +/**\brief Get pop_sz value + */ +int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_ diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c new file mode 100644 index 0000000000..4e53447379 --- /dev/null +++ b/third_party/aom/av1/encoder/mcomp.c @@ -0,0 +1,3998 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/common.h" +#include "av1/common/filter.h" +#include "av1/common/mvref_common.h" +#include "av1/common/reconinter.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" + +static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params, + const MvCosts *mv_costs, + const MV *ref_mv, int errorperbit, + int sadperbit) { + mv_cost_params->ref_mv = ref_mv; + mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv); + mv_cost_params->mv_cost_type = MV_COST_ENTROPY; + mv_cost_params->error_per_bit = errorperbit; + mv_cost_params->sad_per_bit = sadperbit; + // For allintra encoding mode, 'mv_costs' is not allocated. Hence, the + // population of mvjcost and mvcost are avoided. In case of IntraBC, these + // values are populated from 'dv_costs' in av1_set_ms_to_intra_mode(). + if (mv_costs != NULL) { + mv_cost_params->mvjcost = mv_costs->nmv_joint_cost; + mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0]; + mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1]; + } +} + +static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) { + ms_buffers->ref = &x->e_mbd.plane[0].pre[0]; + ms_buffers->src = &x->plane[0].src; + + av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0); + + ms_buffers->wsrc = x->obmc_buffer.wsrc; + ms_buffers->obmc_mask = x->obmc_buffer.mask; +} + +void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) { + obmc_buffer->wsrc = NULL; + obmc_buffer->mask = NULL; + obmc_buffer->above_pred = NULL; + obmc_buffer->left_pred = NULL; +} + +void av1_make_default_fullpel_ms_params( + FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv, + const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS], + SEARCH_METHODS search_method, int fine_search_interval) { + const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; + const int is_key_frame = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE; + + // High level params + ms_params->bsize = bsize; + ms_params->vfp = &cpi->ppi->fn_ptr[bsize]; + + init_ms_buffers(&ms_params->ms_buffers, x); + + av1_set_mv_search_method(ms_params, search_sites, search_method); + + ms_params->mesh_patterns[0] = mv_sf->mesh_patterns; + ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns; + ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh; + ms_params->prune_mesh_search = + (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_2) ? 
1 : 0; + ms_params->mesh_search_mv_diff_threshold = 4; + ms_params->run_mesh_search = 0; + ms_params->fine_search_interval = fine_search_interval; + + ms_params->is_intra_mode = 0; + + ms_params->fast_obmc_search = mv_sf->obmc_full_pixel_search_level; + + ms_params->mv_limits = x->mv_limits; + av1_set_mv_search_range(&ms_params->mv_limits, ref_mv); + + // Mvcost params + init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv, + x->errorperbit, x->sadperbit); + + ms_params->sdf = ms_params->vfp->sdf; + ms_params->sdx4df = ms_params->vfp->sdx4df; + ms_params->sdx3df = ms_params->vfp->sdx3df; + + if (mv_sf->use_downsampled_sad == 2 && block_size_high[bsize] >= 16) { + ms_params->sdf = ms_params->vfp->sdsf; + ms_params->sdx4df = ms_params->vfp->sdsx4df; + // Skip version of sadx3 is not available yet + ms_params->sdx3df = ms_params->vfp->sdsx4df; + } else if (mv_sf->use_downsampled_sad == 1 && block_size_high[bsize] >= 16 && + !is_key_frame) { + FULLPEL_MV start_mv_clamped = start_mv; + // adjust start_mv to make sure it is within MV range + clamp_fullmv(&start_mv_clamped, &ms_params->mv_limits); + + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const int ref_stride = ref->stride; + const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv_clamped); + const struct buf_2d *const src = ms_params->ms_buffers.src; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + unsigned int start_mv_sad_even_rows, start_mv_sad_odd_rows; + start_mv_sad_even_rows = + ms_params->vfp->sdsf(src_buf, src_stride, best_address, ref_stride); + start_mv_sad_odd_rows = + ms_params->vfp->sdsf(src_buf + src_stride, src_stride, + best_address + ref_stride, ref_stride); + + // If the absolute SAD difference computed between the pred-to-src of even + // and odd rows is small, skip every other row in sad computation. 
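+    // For example (hypothetical SAD values): start_mv_sad_even_rows = 4000 and
+    // start_mv_sad_odd_rows = 3800 give an absolute difference of 200, and
+    // 200 * 4 < 4000, so the skip-row (sds*) variants are selected below.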
+ const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 4; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + ms_params->sdf = ms_params->vfp->sdsf; + ms_params->sdx4df = ms_params->vfp->sdsx4df; + ms_params->sdx3df = ms_params->vfp->sdsx4df; + } + } +} + +void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const IntraBCMVCosts *dv_costs) { + ms_params->is_intra_mode = 1; + + MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + + mv_cost_params->mvjcost = dv_costs->joint_mv; + mv_cost_params->mvcost[0] = dv_costs->dv_costs[0]; + mv_cost_params->mvcost[1] = dv_costs->dv_costs[1]; +} + +void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *ref_mv, const int *cost_list) { + const AV1_COMMON *cm = &cpi->common; + // High level params + ms_params->allow_hp = cm->features.allow_high_precision_mv; + ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop; + ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step; + ms_params->cost_list = cond_cost_list_const(cpi, cost_list); + + av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv); + + // Mvcost params + init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv, + x->errorperbit, x->sadperbit); + + // Subpel variance params + ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize]; + ms_params->var_params.subpel_search_type = + cpi->sf.mv_sf.use_accurate_subpel_search; + ms_params->var_params.w = block_size_wide[bsize]; + ms_params->var_params.h = block_size_high[bsize]; + + // Ref and src buffers + MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers; + init_ms_buffers(ms_buffers, x); +} + +void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) { + // Calculate the outermost full-pixel MVs which are inside the limits set by + // av1_set_subpel_mv_search_range(). + // + // The subpel limits are simply mv->col +/- 8*MAX_FULL_PEL_VAL, and similar + // for mv->row. We can then divide by 8 to find the fullpel MV limits. But + // we have to be careful about the rounding. We want these bounds to be + // at least as tight as the subpel limits, which means that we must round + // the minimum values up and the maximum values down when dividing. + int col_min = ((mv->col + 7) >> 3) - MAX_FULL_PEL_VAL; + int row_min = ((mv->row + 7) >> 3) - MAX_FULL_PEL_VAL; + int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; + + col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1); + row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1); + col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1); + row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1); + + // Get intersection of UMV window and valid MV window to reduce # of checks + // in diamond search. + if (mv_limits->col_min < col_min) mv_limits->col_min = col_min; + if (mv_limits->col_max > col_max) mv_limits->col_max = col_max; + if (mv_limits->row_min < row_min) mv_limits->row_min = row_min; + if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; + + mv_limits->col_max = AOMMAX(mv_limits->col_min, mv_limits->col_max); + mv_limits->row_max = AOMMAX(mv_limits->row_min, mv_limits->row_max); +} + +int av1_init_search_range(int size) { + int sr = 0; + // Minimum search size no matter what the passed in value. 
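+  // Worked example: a passed-in size of 64 already satisfies the minimum, and
+  // assuming MAX_FULL_PEL_VAL is 1023 (as defined in mcomp.h) the loop below
+  // stops at sr = 4 since 64 << 4 = 1024 is no longer below it; sr is then
+  // capped at MAX_MVSEARCH_STEPS - 2.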
+ size = AOMMAX(16, size); + + while ((size << sr) < MAX_FULL_PEL_VAL) sr++; + + sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2); + return sr; +} + +// ============================================================================ +// Cost of motion vectors +// ============================================================================ +// TODO(any): Adaptively adjust the regularization strength based on image size +// and motion activity instead of using hard-coded values. It seems like we +// roughly half the lambda for each increase in resolution +// These are multiplier used to perform regularization in motion compensation +// when x->mv_cost_type is set to MV_COST_L1. +// LOWRES +#define SSE_LAMBDA_LOWRES 2 // Used by mv_cost_err_fn +#define SAD_LAMBDA_LOWRES 32 // Used by mvsad_err_cost during full pixel search +// MIDRES +#define SSE_LAMBDA_MIDRES 0 // Used by mv_cost_err_fn +#define SAD_LAMBDA_MIDRES 15 // Used by mvsad_err_cost during full pixel search +// HDRES +#define SSE_LAMBDA_HDRES 1 // Used by mv_cost_err_fn +#define SAD_LAMBDA_HDRES 8 // Used by mvsad_err_cost during full pixel search + +// Returns the rate of encoding the current motion vector based on the +// joint_cost and comp_cost. joint_costs covers the cost of transmitting +// JOINT_MV, and comp_cost covers the cost of transmitting the actual motion +// vector. +static INLINE int mv_cost(const MV *mv, const int *joint_cost, + const int *const comp_cost[2]) { + return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] + + comp_cost[1][mv->col]; +} + +#define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr)) +// Returns the cost of encoding the motion vector diff := *mv - *ref. The cost +// is defined as the rate required to encode diff * weight, rounded to the +// nearest 2 ** 7. +// This is NOT used during motion compensation. +int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, + int *const mvcost[2], int weight) { + const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; + return ROUND_POWER_OF_TWO( + mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7); +} + +// Returns the cost of using the current mv during the motion search. This is +// used when var is used as the error metric. 
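+// For instance, an MV diff of (4, -6) in 1/8-pel units costs
+// (LAMBDA * (4 + 6)) >> 3 under the MV_COST_L1 variants below: 2 with
+// SSE_LAMBDA_LOWRES, 0 with SSE_LAMBDA_MIDRES and 1 with SSE_LAMBDA_HDRES,
+// showing how the regularization weakens as resolution increases.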
+#define PIXEL_TRANSFORM_ERROR_SCALE 4 +static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv, + const int *mvjcost, const int *const mvcost[2], + int error_per_bit, MV_COST_TYPE mv_cost_type) { + const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; + const MV abs_diff = { abs(diff.row), abs(diff.col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + if (mvcost) { + return (int)ROUND_POWER_OF_TWO_64( + (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, + RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT + + PIXEL_TRANSFORM_ERROR_SCALE); + } + return 0; + case MV_COST_L1_LOWRES: + return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_L1_MIDRES: + return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_L1_HDRES: + return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_NONE: return 0; + default: assert(0 && "Invalid rd_cost_type"); return 0; + } +} + +static INLINE int mv_err_cost_(const MV *mv, + const MV_COST_PARAMS *mv_cost_params) { + if (mv_cost_params->mv_cost_type == MV_COST_NONE) { + return 0; + } + return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost, + mv_cost_params->mvcost, mv_cost_params->error_per_bit, + mv_cost_params->mv_cost_type); +} + +// Returns the cost of using the current mv during the motion search. This is +// only used during full pixel motion search when sad is used as the error +// metric +static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv, + const int *mvjcost, const int *const mvcost[2], + int sad_per_bit, MV_COST_TYPE mv_cost_type) { + const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row), + GET_MV_SUBPEL(mv->col - ref_mv->col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * + sad_per_bit, + AV1_PROB_COST_SHIFT); + case MV_COST_L1_LOWRES: + return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_L1_MIDRES: + return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_L1_HDRES: + return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_NONE: return 0; + default: assert(0 && "Invalid rd_cost_type"); return 0; + } +} + +static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv, + const MV_COST_PARAMS *mv_cost_params) { + return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv, + mv_cost_params->mvjcost, mv_cost_params->mvcost, + mv_cost_params->sad_per_bit, + mv_cost_params->mv_cost_type); +} + +// ============================================================================= +// Fullpixel Motion Search: Translational +// ============================================================================= +#define MAX_PATTERN_SCALES 11 +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale +#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates + +// Search site initialization for DIAMOND / CLAMPED_DIAMOND search methods. +// level = 0: DIAMOND, level = 1: CLAMPED_DIAMOND. +void av1_init_dsmotion_compensation(search_site_config *cfg, int stride, + int level) { + int num_search_steps = 0; + int stage_index = MAX_MVSEARCH_STEPS - 1; + + cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0; + cfg->site[stage_index][0].offset = 0; + cfg->stride = stride; + + // Choose the initial step size depending on level. + const int first_step = (level > 0) ? 
(MAX_FIRST_STEP / 4) : MAX_FIRST_STEP; + + for (int radius = first_step; radius > 0;) { + int num_search_pts = 8; + + const FULLPEL_MV search_site_mvs[13] = { + { 0, 0 }, { -radius, 0 }, { radius, 0 }, + { 0, -radius }, { 0, radius }, { -radius, -radius }, + { radius, radius }, { -radius, radius }, { radius, -radius }, + }; + + int i; + for (i = 0; i <= num_search_pts; ++i) { + search_site *const site = &cfg->site[stage_index][i]; + site->mv = search_site_mvs[i]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + // Update the search radius based on level. + if (!level || ((stage_index < 9) && level)) radius /= 2; + --stage_index; + ++num_search_steps; + } + cfg->num_search_steps = num_search_steps; +} + +void av1_init_motion_fpf(search_site_config *cfg, int stride) { + int num_search_steps = 0; + int stage_index = MAX_MVSEARCH_STEPS - 1; + + cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0; + cfg->site[stage_index][0].offset = 0; + cfg->stride = stride; + + for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) { + // Generate offsets for 8 search sites per step. + int tan_radius = AOMMAX((int)(0.41 * radius), 1); + int num_search_pts = 12; + if (radius == 1) num_search_pts = 8; + + const FULLPEL_MV search_site_mvs[13] = { + { 0, 0 }, + { -radius, 0 }, + { radius, 0 }, + { 0, -radius }, + { 0, radius }, + { -radius, -tan_radius }, + { radius, tan_radius }, + { -tan_radius, radius }, + { tan_radius, -radius }, + { -radius, tan_radius }, + { radius, -tan_radius }, + { tan_radius, radius }, + { -tan_radius, -radius }, + }; + + int i; + for (i = 0; i <= num_search_pts; ++i) { + search_site *const site = &cfg->site[stage_index][i]; + site->mv = search_site_mvs[i]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + --stage_index; + ++num_search_steps; + } + cfg->num_search_steps = num_search_steps; +} + +// Search site initialization for NSTEP / NSTEP_8PT search methods. +// level = 0: NSTEP, level = 1: NSTEP_8PT. +void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride, + int level) { + int num_search_steps = 0; + int stage_index = 0; + cfg->stride = stride; + int radius = 1; + const int num_stages = (level > 0) ? 16 : 15; + for (stage_index = 0; stage_index < num_stages; ++stage_index) { + int tan_radius = AOMMAX((int)(0.41 * radius), 1); + int num_search_pts = 12; + if ((radius <= 5) || (level > 0)) { + tan_radius = radius; + num_search_pts = 8; + } + const FULLPEL_MV search_site_mvs[13] = { + { 0, 0 }, + { -radius, 0 }, + { radius, 0 }, + { 0, -radius }, + { 0, radius }, + { -radius, -tan_radius }, + { radius, tan_radius }, + { -tan_radius, radius }, + { tan_radius, -radius }, + { -radius, tan_radius }, + { radius, -tan_radius }, + { tan_radius, radius }, + { -tan_radius, -radius }, + }; + + for (int i = 0; i <= num_search_pts; ++i) { + search_site *const site = &cfg->site[stage_index][i]; + site->mv = search_site_mvs[i]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + ++num_search_steps; + if (stage_index < 12) + radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1); + } + cfg->num_search_steps = num_search_steps; +} + +// Search site initialization for BIGDIA / FAST_BIGDIA / FAST_DIAMOND +// search methods. 
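+// Each configured site stores its MV together with a precomputed buffer
+// offset; assuming get_offset_from_fullmv() returns mv.row * stride + mv.col
+// (as in mcomp.h), the bigdia candidate { -2, 2 } with a stride of 80 maps to
+// an offset of -158, letting the search loops step between candidate
+// addresses by simple pointer arithmetic.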
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride, + int level) { + (void)level; + cfg->stride = stride; + // First scale has 4-closest points, the rest have 8 points in diamond + // shape at increasing scales + static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { + 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + + // BIGDIA search method candidates. + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const FULLPEL_MV + site_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, 0 }, { 0, 0 }, + { 0, 0 }, { 0, 0 } }, + { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 }, + { -1, 1 }, { -2, 0 } }, + { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 }, + { -2, 2 }, { -4, 0 } }, + { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 }, + { -4, 4 }, { -8, 0 } }, + { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 }, + { -8, 8 }, { -16, 0 } }, + { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 }, + { 0, 32 }, { -16, 16 }, { -32, 0 } }, + { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 }, + { 0, 64 }, { -32, 32 }, { -64, 0 } }, + { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 }, + { 0, 128 }, { -64, 64 }, { -128, 0 } }, + { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, + { 128, 128 }, { 0, 256 }, { -128, 128 }, { -256, 0 } }, + { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, + { 256, 256 }, { 0, 512 }, { -256, 256 }, { -512, 0 } }, + { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 }, + { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } }, + }; + + /* clang-format on */ + int radius = 1; + for (int i = 0; i < MAX_PATTERN_SCALES; ++i) { + cfg->searches_per_step[i] = bigdia_num_candidates[i]; + cfg->radius[i] = radius; + for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) { + search_site *const site = &cfg->site[i][j]; + site->mv = site_candidates[i][j]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + radius *= 2; + } + cfg->num_search_steps = MAX_PATTERN_SCALES; +} + +// Search site initialization for SQUARE search method. +void av1_init_motion_compensation_square(search_site_config *cfg, int stride, + int level) { + (void)level; + cfg->stride = stride; + // All scales have 8 closest points in square shape. + static const int square_num_candidates[MAX_PATTERN_SCALES] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + + // Square search method candidates. + // Note that the largest candidate step at each scale is 2^scale. 
+ /* clang-format off */ + static const FULLPEL_MV + square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, + { -1, 1 }, { -1, 0 } }, + { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 }, + { -2, 2 }, { -2, 0 } }, + { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 }, + { -4, 4 }, { -4, 0 } }, + { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 }, + { -8, 8 }, { -8, 0 } }, + { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 }, + { 0, 16 }, { -16, 16 }, { -16, 0 } }, + { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 }, + { 0, 32 }, { -32, 32 }, { -32, 0 } }, + { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 }, + { 0, 64 }, { -64, 64 }, { -64, 0 } }, + { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, + { 128, 128 }, { 0, 128 }, { -128, 128 }, { -128, 0 } }, + { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, + { 256, 256 }, { 0, 256 }, { -256, 256 }, { -256, 0 } }, + { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, + { 512, 512 }, { 0, 512 }, { -512, 512 }, { -512, 0 } }, + { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 }, + { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } }, + }; + + /* clang-format on */ + int radius = 1; + for (int i = 0; i < MAX_PATTERN_SCALES; ++i) { + cfg->searches_per_step[i] = square_num_candidates[i]; + cfg->radius[i] = radius; + for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) { + search_site *const site = &cfg->site[i][j]; + site->mv = square_candidates[i][j]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + radius *= 2; + } + cfg->num_search_steps = MAX_PATTERN_SCALES; +} + +// Search site initialization for HEX / FAST_HEX search methods. +void av1_init_motion_compensation_hex(search_site_config *cfg, int stride, + int level) { + (void)level; + cfg->stride = stride; + // First scale has 8-closest points, the rest have 6 points in hex shape + // at increasing scales. + static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6 }; + // Note that the largest candidate step at each scale is 2^scale. 
+ /* clang-format off */ + static const FULLPEL_MV + hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, + { -1, 1 }, { -1, 0 } }, + { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } }, + { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } }, + { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } }, + { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, + { -8, 16 }, { -16, 0 } }, + { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 }, + { -32, 0 } }, + { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 }, + { -64, 0 } }, + { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, + { -64, 128 }, { -128, 0 } }, + { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, + { -128, 256 }, { -256, 0 } }, + { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, + { -256, 512 }, { -512, 0 } }, + { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 }, + { -512, 1024 }, { -1024, 0 } }, + }; + + /* clang-format on */ + int radius = 1; + for (int i = 0; i < MAX_PATTERN_SCALES; ++i) { + cfg->searches_per_step[i] = hex_num_candidates[i]; + cfg->radius[i] = radius; + for (int j = 0; j < hex_num_candidates[i]; ++j) { + search_site *const site = &cfg->site[i][j]; + site->mv = hex_candidates[i][j]; + site->offset = get_offset_from_fullmv(&site->mv, stride); + } + radius *= 2; + } + cfg->num_search_steps = MAX_PATTERN_SCALES; +} + +const av1_init_search_site_config + av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS] = { + av1_init_dsmotion_compensation, av1_init_motion_compensation_nstep, + av1_init_motion_compensation_nstep, av1_init_dsmotion_compensation, + av1_init_motion_compensation_hex, av1_init_motion_compensation_bigdia, + av1_init_motion_compensation_square + }; + +// Checks whether the mv is within range of the mv_limits +static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col, + int range) { + return ((row - range) >= mv_limits->row_min) & + ((row + range) <= mv_limits->row_max) & + ((col - range) >= mv_limits->col_min) & + ((col + range) <= mv_limits->col_max); +} + +static INLINE int get_mvpred_var_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv, + FULLPEL_MV_STATS *mv_stats) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV sub_this_mv = get_mv_from_fullmv(this_mv); + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + int bestsme; + + bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), + ref_stride, &mv_stats->sse); + mv_stats->distortion = bestsme; + + mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); + bestsme += mv_stats->err_cost; + + return bestsme; +} + +static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct buf_2d *const src, + const uint8_t *const ref_address, + const int ref_stride) { + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride); +} + +static INLINE int get_mvpred_compound_var_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv, + FULLPEL_MV_STATS *mv_stats) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const struct buf_2d *const src = 
ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const int mask_stride = ms_params->ms_buffers.mask_stride; + const int invert_mask = ms_params->ms_buffers.inv_mask; + int bestsme; + + if (mask) { + bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0, + src_buf, src_stride, second_pred, mask, mask_stride, + invert_mask, &mv_stats->sse); + } else if (second_pred) { + bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0, + src_buf, src_stride, &mv_stats->sse, second_pred); + } else { + bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), + ref_stride, &mv_stats->sse); + } + mv_stats->distortion = bestsme; + + const MV sub_this_mv = get_mv_from_fullmv(this_mv); + mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); + bestsme += mv_stats->err_cost; + + return bestsme; +} + +static INLINE int get_mvpred_compound_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct buf_2d *const src, const uint8_t *const ref_address, + const int ref_stride) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const int mask_stride = ms_params->ms_buffers.mask_stride; + const int invert_mask = ms_params->ms_buffers.inv_mask; + + if (mask) { + return vfp->msdf(src_buf, src_stride, ref_address, ref_stride, second_pred, + mask, mask_stride, invert_mask); + } else if (second_pred) { + return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred); + } else { + return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride); + } +} + +// Calculates and returns a sad+mvcost list around an integer best pel during +// fullpixel motion search. The resulting list can be used to speed up subpel +// motion search later. +#define USE_SAD_COSTLIST 1 + +// calc_int_cost_list uses var to populate the costlist, which is more accurate +// than sad but slightly slower. +static AOM_FORCE_INLINE void calc_int_cost_list( + const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + int *cost_list) { + static const FULLPEL_MV neighbors[4] = { + { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } + }; + const int br = best_mv.row; + const int bc = best_mv.col; + + FULLPEL_MV_STATS mv_stats; + cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv, &mv_stats); + + if (check_bounds(&ms_params->mv_limits, br, bc, 1)) { + for (int i = 0; i < 4; i++) { + const FULLPEL_MV neighbor_mv = { br + neighbors[i].row, + bc + neighbors[i].col }; + cost_list[i + 1] = + get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats); + } + } else { + for (int i = 0; i < 4; i++) { + const FULLPEL_MV neighbor_mv = { br + neighbors[i].row, + bc + neighbors[i].col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) { + cost_list[i + 1] = INT_MAX; + } else { + cost_list[i + 1] = + get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats); + } + } + } +} + +// calc_int_sad_list uses sad to populate the costlist, which is less accurate +// than var but faster. 
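+// Both helpers fill the same five-entry layout: cost_list[0] is the cost at
+// the best MV and cost_list[1..4] follow the neighbors array, e.g. for
+// best_mv = { 10, 7 } the evaluated points are { 10, 6 } (left), { 11, 7 }
+// (bottom), { 10, 8 } (right) and { 9, 7 } (top), with INT_MAX marking
+// neighbors outside the MV limits.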
+static AOM_FORCE_INLINE void calc_int_sad_list( + const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + int *cost_list, int costlist_has_sad) { + static const FULLPEL_MV neighbors[4] = { + { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } + }; + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const int ref_stride = ref->stride; + const int br = best_mv.row; + const int bc = best_mv.col; + + assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv)); + + // Refresh the costlist it does not contain valid sad + if (!costlist_has_sad) { + cost_list[0] = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride); + + if (check_bounds(&ms_params->mv_limits, br, bc, 1)) { + for (int i = 0; i < 4; i++) { + const FULLPEL_MV this_mv = { br + neighbors[i].row, + bc + neighbors[i].col }; + cost_list[i + 1] = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + } + } else { + for (int i = 0; i < 4; i++) { + const FULLPEL_MV this_mv = { br + neighbors[i].row, + bc + neighbors[i].col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { + cost_list[i + 1] = INT_MAX; + } else { + cost_list[i + 1] = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + } + } + } + } + + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params); + + for (int idx = 0; idx < 4; idx++) { + if (cost_list[idx + 1] != INT_MAX) { + const FULLPEL_MV this_mv = { br + neighbors[idx].row, + bc + neighbors[idx].col }; + cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params); + } + } +} + +// Computes motion vector cost and adds to the sad cost. +// Then updates the best sad and motion vectors. +// Inputs: +// this_sad: the sad to be evaluated. +// mv: the current motion vector. +// mv_cost_params: a structure containing information to compute mv cost. +// best_sad: the current best sad. +// raw_best_sad (optional): the current best sad without calculating mv cost. +// best_mv: the current best motion vector. +// second_best_mv (optional): the second best motion vector up to now. +// Modifies: +// best_sad, raw_best_sad, best_mv, second_best_mv +// If the current sad is lower than the current best sad. +// Returns: +// Whether the input sad (mv) is better than the current best. +static AOM_INLINE int update_mvs_and_sad(const unsigned int this_sad, + const FULLPEL_MV *mv, + const MV_COST_PARAMS *mv_cost_params, + unsigned int *best_sad, + unsigned int *raw_best_sad, + FULLPEL_MV *best_mv, + FULLPEL_MV *second_best_mv) { + if (this_sad >= *best_sad) return 0; + + // Add the motion vector cost. + const unsigned int sad = this_sad + mvsad_err_cost_(mv, mv_cost_params); + if (sad < *best_sad) { + if (raw_best_sad) *raw_best_sad = this_sad; + *best_sad = sad; + if (second_best_mv) *second_best_mv = *best_mv; + *best_mv = *mv; + return 1; + } + return 0; +} + +// Calculate sad4 and update the bestmv information +// in FAST_DIAMOND search method. 
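+// update_mvs_and_sad() above adds the MV-rate penalty lazily: for instance,
+// with this_sad = 900 against *best_sad = 1000 the candidate passes the
+// raw-SAD check, but a hypothetical mvsad_err_cost_ of 150 lifts it to 1050,
+// so nothing is updated and 0 is returned.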
+static AOM_INLINE void calc_sad4_update_bestmv( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, + const FULLPEL_MV center_mv, const uint8_t *center_address, + unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, + int *best_site, int cand_start, int *cost_list) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site *site = ms_params->search_sites->site[search_step]; + + unsigned char const *block_offset[4]; + unsigned int sads_buf[4]; + unsigned int *sads; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + if (cost_list) { + sads = (unsigned int *)(cost_list + 1); + } else { + sads = sads_buf; + } + // Loop over number of candidates. + for (int j = 0; j < 4; j++) + block_offset[j] = site[cand_start + j].offset + center_address; + + // 4-point sad calculation. + ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads); + + for (int j = 0; j < 4; j++) { + const FULLPEL_MV this_mv = { center_mv.row + site[cand_start + j].mv.row, + center_mv.col + site[cand_start + j].mv.col }; + const int found_better_mv = update_mvs_and_sad( + sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, + /*second_best_mv=*/NULL); + if (found_better_mv) *best_site = cand_start + j; + } +} + +static AOM_INLINE void calc_sad3_update_bestmv( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, + FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad, + unsigned int *raw_bestsad, int search_step, int *best_site, + const int *chkpts_indices, int *cost_list) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site *site = ms_params->search_sites->site[search_step]; + unsigned char const *block_offset[4] = { + center_address + site[chkpts_indices[0]].offset, + center_address + site[chkpts_indices[1]].offset, + center_address + site[chkpts_indices[2]].offset, + center_address, + }; + unsigned int sads[4]; + ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads); + for (int j = 0; j < 3; j++) { + const int index = chkpts_indices[j]; + const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row, + center_mv.col + site[index].mv.col }; + const int found_better_mv = update_mvs_and_sad( + sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, + /*second_best_mv=*/NULL); + if (found_better_mv) *best_site = j; + } + if (cost_list) { + for (int j = 0; j < 3; j++) { + int index = chkpts_indices[j]; + cost_list[index + 1] = sads[j]; + } + } +} + +// Calculate sad and update the bestmv information +// in FAST_DIAMOND search method. +static AOM_INLINE void calc_sad_update_bestmv( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, + const FULLPEL_MV center_mv, const uint8_t *center_address, + unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, + int *best_site, const int num_candidates, int cand_start, int *cost_list) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site *site = ms_params->search_sites->site[search_step]; + // Loop over number of candidates. 
+ for (int i = cand_start; i < num_candidates; i++) { + const FULLPEL_MV this_mv = { center_mv.row + site[i].mv.row, + center_mv.col + site[i].mv.col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue; + int thissad = get_mvpred_sad(ms_params, src, + center_address + site[i].offset, ref->stride); + if (cost_list) { + cost_list[i + 1] = thissad; + } + const int found_better_mv = update_mvs_and_sad( + thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, + /*second_best_mv=*/NULL); + if (found_better_mv) *best_site = i; + } +} + +static AOM_INLINE void calc_sad_update_bestmv_with_indices( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv, + const FULLPEL_MV center_mv, const uint8_t *center_address, + unsigned int *bestsad, unsigned int *raw_bestsad, int search_step, + int *best_site, const int num_candidates, const int *chkpts_indices, + int *cost_list) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site *site = ms_params->search_sites->site[search_step]; + // Loop over number of candidates. + for (int i = 0; i < num_candidates; i++) { + int index = chkpts_indices[i]; + const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row, + center_mv.col + site[index].mv.col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { + if (cost_list) { + cost_list[index + 1] = INT_MAX; + } + continue; + } + const int thissad = get_mvpred_sad( + ms_params, src, center_address + site[index].offset, ref->stride); + if (cost_list) { + cost_list[index + 1] = thissad; + } + const int found_better_mv = update_mvs_and_sad( + thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv, + /*second_best_mv=*/NULL); + if (found_better_mv) *best_site = i; + } +} + +// Generic pattern search function that searches over multiple scales. 
+// Each scale can have a different number of candidates and shape of +// candidates as indicated in the num_candidates and candidates arrays +// passed into this function +static int pattern_search(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + static const int search_steps[MAX_MVSEARCH_STEPS] = { + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + }; + int i, s, t; + + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const search_site_config *search_sites = ms_params->search_sites; + const int *num_candidates = search_sites->searches_per_step; + const int ref_stride = ref->stride; + const int last_is_4 = num_candidates[0] == 4; + int br, bc; + unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX; + int k = -1; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1); + assert(search_step >= 0); + int best_init_s = search_steps[search_step]; + // adjust ref_mv to make sure it is within MV range + clamp_fullmv(&start_mv, &ms_params->mv_limits); + br = start_mv.row; + bc = start_mv.col; + if (cost_list != NULL) { + cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = + INT_MAX; + } + int costlist_has_sad = 0; + + // Work out the start point for the search + raw_bestsad = get_mvpred_sad(ms_params, src, + get_buf_from_fullmv(ref, &start_mv), ref_stride); + bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params); + + // Search all possible scales up to the search param around the center point + // pick the scale of the point that is best as the starting scale of + // further steps around it. + const uint8_t *center_address = get_buf_from_fullmv(ref, &start_mv); + if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + int best_site = -1; + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) { + // Call 4-point sad for multiples of 4 candidates. + const int no_of_4_cand_loops = num_candidates[t] >> 2; + for (i = 0; i < no_of_4_cand_loops; i++) { + calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, t, + &best_site, i * 4, /*cost_list=*/NULL); + } + // Rest of the candidates + const int remaining_cand = num_candidates[t] % 4; + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, t, + &best_site, remaining_cand, + no_of_4_cand_loops * 4, NULL); + } else { + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, t, + &best_site, num_candidates[t], 0, NULL); + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } + } + if (best_init_s != -1) { + br += search_sites->site[best_init_s][k].mv.row; + bc += search_sites->site[best_init_s][k].mv.col; + center_address += search_sites->site[best_init_s][k].offset; + } + } + + // If the center point is still the best, just skip this and move to + // the refinement step. 
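+  // As an illustration: with search_step = 3, best_init_s starts as
+  // search_steps[3] = 7, so when do_init_search is enabled the sweep above
+  // covers scales 0..7 and best_init_s is left at the last scale that improved
+  // on the centre, or at -1 if none did, in which case the refinement below is
+  // skipped entirely.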
+ if (best_init_s != -1) { + const int last_s = (last_is_4 && cost_list != NULL); + int best_site = -1; + s = best_init_s; + + for (; s >= last_s; s--) { + // No need to search all points the 1st time if initial search was used + if (!do_init_search || s != best_init_s) { + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + // Call 4-point sad for multiples of 4 candidates. + const int no_of_4_cand_loops = num_candidates[s] >> 2; + for (i = 0; i < no_of_4_cand_loops; i++) { + calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, + center_mv, center_address, &bestsad, + &raw_bestsad, s, &best_site, i * 4, + /*cost_list=*/NULL); + } + // Rest of the candidates + const int remaining_cand = num_candidates[s] % 4; + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, remaining_cand, + no_of_4_cand_loops * 4, NULL); + } else { + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, num_candidates[s], 0, NULL); + } + + if (best_site == -1) { + continue; + } else { + br += search_sites->site[s][best_site].mv.row; + bc += search_sites->site[s][best_site].mv.col; + center_address += search_sites->site[s][best_site].offset; + k = best_site; + } + } + + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, next_chkpts_indices, NULL); + } else { + calc_sad_update_bestmv_with_indices( + ms_params, mv_cost_params, best_mv, center_mv, center_address, + &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF, + next_chkpts_indices, NULL); + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += search_sites->site[s][k].mv.row; + bc += search_sites->site[s][k].mv.col; + center_address += search_sites->site[s][k].offset; + } + } while (best_site != -1); + } + // Note: If we enter the if below, then cost_list must be non-NULL. + if (s == 0) { + cost_list[0] = raw_bestsad; + costlist_has_sad = 1; + assert(num_candidates[s] == 4); + if (!do_init_search || s != best_init_s) { + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, 0, cost_list); + } else { + calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, /*num_candidates=*/4, + /*cand_start=*/0, cost_list); + } + + if (best_site != -1) { + br += search_sites->site[s][best_site].mv.row; + bc += search_sites->site[s][best_site].mv.col; + center_address += search_sites->site[s][best_site].offset; + k = best_site; + } + } + while (best_site != -1) { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 
0 : k + 1; + cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX; + cost_list[((k + 2) % 4) + 1] = cost_list[0]; + cost_list[0] = raw_bestsad; + + FULLPEL_MV center_mv = { br, bc }; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + assert(PATTERN_CANDIDATES_REF == 3); + calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv, + center_address, &bestsad, &raw_bestsad, s, + &best_site, next_chkpts_indices, cost_list); + } else { + calc_sad_update_bestmv_with_indices( + ms_params, mv_cost_params, best_mv, center_mv, center_address, + &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF, + next_chkpts_indices, cost_list); + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += search_sites->site[s][k].mv.row; + bc += search_sites->site[s][k].mv.col; + center_address += search_sites->site[s][k].offset; + } + } + } + } + best_mv->row = br; + best_mv->col = bc; + + assert(center_address == get_buf_from_fullmv(ref, best_mv) && + "center address is out of sync with best_mv!\n"); + + // Returns the one-away integer pel cost/sad around the best as follows: + // cost_list[0]: cost/sad at the best integer pel + // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel + // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel + // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel + // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel + if (cost_list) { + if (USE_SAD_COSTLIST) { + calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); + } else { + calc_int_cost_list(*best_mv, ms_params, cost_list); + } + } + + const int var_cost = get_mvpred_var_cost(ms_params, best_mv, best_mv_stats); + return var_cost; +} + +// For the following foo_search, the input arguments are: +// start_mv: where we are starting our motion search +// ms_params: a collection of motion search parameters +// search_step: how many steps to skip in our motion search. For example, +// a value 3 suggests that 3 search steps have already taken place prior to +// this function call, so we jump directly to step 4 of the search process +// do_init_search: if on, do an initial search of all possible scales around the +// start_mv, and then pick the best scale. +// cond_list: used to hold the cost around the best full mv so we can use it to +// speed up subpel search later. 
+// best_mv: the best mv found in the motion search +static int hex_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return pattern_search(start_mv, ms_params, search_step, do_init_search, + cost_list, best_mv, best_mv_stats); +} + +static int bigdia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return pattern_search(start_mv, ms_params, search_step, do_init_search, + cost_list, best_mv, best_mv_stats); +} + +static int square_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return pattern_search(start_mv, ms_params, search_step, do_init_search, + cost_list, best_mv, best_mv_stats); +} + +static int fast_hex_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return hex_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search, + cost_list, best_mv, best_mv_stats); +} + +static int vfast_dia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return bigdia_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step), + do_init_search, cost_list, best_mv, best_mv_stats); +} + +static int fast_dia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return bigdia_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), + do_init_search, cost_list, best_mv, best_mv_stats); +} + +static int fast_bigdia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats) { + return bigdia_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step), + do_init_search, cost_list, best_mv, best_mv_stats); +} + +static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_step, int *num00, + FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { +#define UPDATE_SEARCH_STEP \ + do { \ + if (best_site != 0) { \ + tmp_second_best_mv = *best_mv; \ + best_mv->row += site[best_site].mv.row; \ + best_mv->col += site[best_site].mv.col; \ + best_address += site[best_site].offset; \ + is_off_center = 1; \ + } \ + \ + if (is_off_center == 0) num_center_steps++; \ + \ + if (best_site == 0 && step > 2) { \ + int next_step_size = cfg->radius[step - 1]; \ + while (next_step_size == cfg->radius[step] && step > 2) { \ + num_center_steps++; \ + --step; \ + next_step_size = cfg->radius[step - 1]; \ + } \ + } \ + } while (0) + + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + + const uint8_t *src_buf 
= src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + + const search_site_config *cfg = ms_params->search_sites; + + int is_off_center = 0; + // Number of times that we have stayed in the middle. This is used to skip + // search steps in the future if diamond_search_sad is called again. + int num_center_steps = 0; + + // search_step determines the length of the initial step and hence the number + // of iterations. + const int tot_steps = cfg->num_search_steps - search_step; + FULLPEL_MV tmp_second_best_mv; + if (second_best_mv) { + tmp_second_best_mv = *second_best_mv; + } + + *best_mv = start_mv; + + // Check the starting position + const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv); + unsigned int bestsad = start_mv_sad; + + // TODO(chiyotsai@google.com): Implement 4 points search for msdf&sdaf + if (ms_params->ms_buffers.second_pred) { + for (int step = tot_steps - 1; step >= 0; --step) { + const search_site *site = cfg->site[step]; + const int num_searches = cfg->searches_per_step[step]; + int best_site = 0; + + for (int idx = 1; idx <= num_searches; idx++) { + const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row, + best_mv->col + site[idx].mv.col }; + + if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { + const uint8_t *const check_here = site[idx].offset + best_address; + unsigned int thissad = + get_mvpred_compound_sad(ms_params, src, check_here, ref_stride); + + if (thissad < bestsad) { + thissad += mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx; + } + } + } + } + UPDATE_SEARCH_STEP; + } + } else { + for (int step = tot_steps - 1; step >= 0; --step) { + const search_site *site = cfg->site[step]; + const int num_searches = cfg->searches_per_step[step]; + int best_site = 0; + + int all_in = 1; + // Trap illegal vectors + all_in &= best_mv->row + site[1].mv.row >= ms_params->mv_limits.row_min; + all_in &= best_mv->row + site[2].mv.row <= ms_params->mv_limits.row_max; + all_in &= best_mv->col + site[3].mv.col >= ms_params->mv_limits.col_min; + all_in &= best_mv->col + site[4].mv.col <= ms_params->mv_limits.col_max; + + if (all_in) { + for (int idx = 1; idx <= num_searches; idx += 4) { + unsigned char const *block_offset[4]; + unsigned int sads[4]; + + for (int j = 0; j < 4; j++) + block_offset[j] = site[idx + j].offset + best_address; + + ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride, + sads); + for (int j = 0; j < 4; j++) { + if (sads[j] < bestsad) { + const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row, + best_mv->col + + site[idx + j].mv.col }; + unsigned int thissad = + sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx + j; + } + } + } + } + } else { + for (int idx = 1; idx <= num_searches; idx++) { + const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row, + best_mv->col + site[idx].mv.col }; + + if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { + const uint8_t *const check_here = site[idx].offset + best_address; + unsigned int thissad = + get_mvpred_sad(ms_params, src, check_here, ref_stride); + + if (thissad < bestsad) { + thissad += mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx; + } + } + } + } + } + UPDATE_SEARCH_STEP; + } + } + + *num00 = num_center_steps; + if (second_best_mv) { + *second_best_mv = 
tmp_second_best_mv; + } + + return bestsad; + +#undef UPDATE_SEARCH_STEP +} + +static INLINE unsigned int get_start_mvpred_sad_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv); + + unsigned int start_mv_sad = + mvsad_err_cost_(&start_mv, &ms_params->mv_cost_params); + + if (ms_params->ms_buffers.second_pred) + start_mv_sad += + get_mvpred_compound_sad(ms_params, src, best_address, ref->stride); + else + start_mv_sad += get_mvpred_sad(ms_params, src, best_address, ref->stride); + + return start_mv_sad; +} + +static int full_pixel_diamond(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, int *cost_list, + FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *best_mv_stats, + FULLPEL_MV *second_best_mv) { + const search_site_config *cfg = ms_params->search_sites; + int thissme, n, num00 = 0; + + // Clamp start mv and calculate the cost + clamp_fullmv(&start_mv, &ms_params->mv_limits); + unsigned int start_mv_sad = get_start_mvpred_sad_cost(ms_params, start_mv); + + diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param, &n, best_mv, + second_best_mv); + + int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv, best_mv_stats); + + // If there won't be more n-step search, check to see if refining search is + // needed. + const int further_steps = cfg->num_search_steps - 1 - step_param; + while (n < further_steps) { + ++n; + + // TODO(chiyotsai@google.com): There is another bug here where the second + // best mv gets incorrectly overwritten. Fix it later. + FULLPEL_MV tmp_best_mv; + FULLPEL_MV_STATS tmp_best_mv_stats; + diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param + n, + &num00, &tmp_best_mv, second_best_mv); + + thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv, + &tmp_best_mv_stats); + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = tmp_best_mv; + *best_mv_stats = tmp_best_mv_stats; + } + + if (num00) { + // Advance the loop by num00 steps + n += num00; + num00 = 0; + } + } + + // Return cost list. + if (cost_list) { + if (USE_SAD_COSTLIST) { + const int costlist_has_sad = 0; + calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); + } else { + calc_int_cost_list(*best_mv, ms_params, cost_list); + } + } + return bestsme; +} + +// Exhaustive motion search around a given centre position with a given +// step size. +static int exhaustive_mesh_search(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int range, const int step, + FULLPEL_MV *best_mv, + FULLPEL_MV *second_best_mv) { + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const int ref_stride = ref->stride; + unsigned int best_sad = INT_MAX; + int r, c, i; + int start_col, end_col, start_row, end_row; + const int col_step = (step > 1) ? 
step : 4; + + assert(step >= 1); + + clamp_fullmv(&start_mv, &ms_params->mv_limits); + *best_mv = start_mv; + best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv), + ref_stride); + best_sad += mvsad_err_cost_(&start_mv, mv_cost_params); + start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row); + start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col); + end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row); + end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col); + + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += col_step) { + // Step > 1 means we are not checking every location in this pass. + if (step > 1) { + const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c }; + unsigned int sad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride); + update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad, + /*raw_best_sad=*/NULL, best_mv, second_best_mv); + } else { + // 4 sads in a single call if we are checking every location + if (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i }; + addrs[i] = get_buf_from_fullmv(ref, &mv); + } + + ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads); + + for (i = 0; i < 4; ++i) { + if (sads[i] < best_sad) { + const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i }; + update_mvs_and_sad(sads[i], &mv, mv_cost_params, &best_sad, + /*raw_best_sad=*/NULL, best_mv, + second_best_mv); + } + } + } else { + for (i = 0; i < end_col - c; ++i) { + const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i }; + unsigned int sad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride); + update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad, + /*raw_best_sad=*/NULL, best_mv, second_best_mv); + } + } + } + } + } + + return best_sad; +} + +// Runs an limited range exhaustive mesh search using a pattern set +// according to the encode speed profile. +static int full_pixel_exhaustive(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct MESH_PATTERN *const mesh_patterns, + int *cost_list, FULLPEL_MV *best_mv, + FULLPEL_MV_STATS *mv_stats, + FULLPEL_MV *second_best_mv) { + const int kMinRange = 7; + const int kMaxRange = 256; + const int kMinInterval = 1; + + int bestsme; + int i; + int interval = mesh_patterns[0].interval; + int range = mesh_patterns[0].range; + int baseline_interval_divisor; + + // TODO(chiyotsai@google.com): Currently exhaustive search calls single ref + // version of sad and variance function. We still need to check the + // performance when compound ref exhaustive search is enabled. + assert(!ms_params->ms_buffers.second_pred && + "Mesh search does not support compound mode!"); + + *best_mv = start_mv; + + // Trap illegal values for interval and range for this function. + if ((range < kMinRange) || (range > kMaxRange) || (interval < kMinInterval) || + (interval > range)) + return INT_MAX; + + baseline_interval_divisor = range / interval; + + // Check size of proposed first range against magnitude of the centre + // value used as a starting point. + range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4); + range = AOMMIN(range, kMaxRange); + interval = AOMMAX(interval, range / baseline_interval_divisor); + // Use a small search step/interval for certain kind of clips. 
+ // For example, screen content clips with a lot of texts. + // Large interval could lead to a false matching position, and it can't find + // the best global candidate in following iterations due to reduced search + // range. The solution here is to use a small search iterval in the beginning + // and thus reduces the chance of missing the best candidate. + if (ms_params->fine_search_interval) { + interval = AOMMIN(interval, 4); + } + + // initial search + bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval, + best_mv, second_best_mv); + + if ((interval > kMinInterval) && (range > kMinRange)) { + // Progressive searches with range and step size decreasing each time + // till we reach a step size of 1. Then break out. + for (i = 1; i < MAX_MESH_STEP; ++i) { + // First pass with coarser step and longer range + bestsme = exhaustive_mesh_search( + *best_mv, ms_params, mesh_patterns[i].range, + mesh_patterns[i].interval, best_mv, second_best_mv); + + if (mesh_patterns[i].interval == 1) break; + } + } + + if (bestsme < INT_MAX) { + bestsme = get_mvpred_var_cost(ms_params, best_mv, mv_stats); + } + + // Return cost list. + if (cost_list) { + if (USE_SAD_COSTLIST) { + const int costlist_has_sad = 0; + calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); + } else { + calc_int_cost_list(*best_mv, ms_params, cost_list); + } + } + return bestsme; +} + +// This function is called when we do joint motion search in comp_inter_inter +// mode, or when searching for one component of an ext-inter compound mode. +int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) { + static const search_neighbors neighbors[8] = { + { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 } + }; + + uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * + SEARCH_GRID_STRIDE_8P] = { 0 }; + int grid_center = SEARCH_GRID_CENTER_8P; + int grid_coord = grid_center; + + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const FullMvLimits *mv_limits = &ms_params->mv_limits; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const struct buf_2d *src = ms_buffers->src; + const struct buf_2d *ref = ms_buffers->ref; + const int ref_stride = ref->stride; + + *best_mv = start_mv; + clamp_fullmv(best_mv, mv_limits); + + unsigned int best_sad = get_mvpred_compound_sad( + ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride); + best_sad += mvsad_err_cost_(best_mv, mv_cost_params); + + do_refine_search_grid[grid_coord] = 1; + + for (int i = 0; i < SEARCH_RANGE_8P; ++i) { + int best_site = -1; + + for (int j = 0; j < 8; ++j) { + grid_coord = grid_center + neighbors[j].coord_offset; + if (do_refine_search_grid[grid_coord] == 1) { + continue; + } + const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row, + best_mv->col + neighbors[j].coord.col }; + + do_refine_search_grid[grid_coord] = 1; + if (av1_is_fullmv_in_range(mv_limits, mv)) { + unsigned int sad; + sad = get_mvpred_compound_sad( + ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } 
+ } + } + } + + if (best_site == -1) { + break; + } else { + best_mv->row += neighbors[best_site].coord.row; + best_mv->col += neighbors[best_site].coord.col; + grid_center += neighbors[best_site].coord_offset; + } + } + return best_sad; +} + +int av1_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, int *cost_list, + FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats, + FULLPEL_MV *second_best_mv) { + const BLOCK_SIZE bsize = ms_params->bsize; + const SEARCH_METHODS search_method = ms_params->search_method; + + const int is_intra_mode = ms_params->is_intra_mode; + int run_mesh_search = ms_params->run_mesh_search; + + int var = 0; + MARK_MV_INVALID(best_mv); + if (second_best_mv) { + MARK_MV_INVALID(second_best_mv); + } + + if (cost_list) { + cost_list[0] = INT_MAX; + cost_list[1] = INT_MAX; + cost_list[2] = INT_MAX; + cost_list[3] = INT_MAX; + cost_list[4] = INT_MAX; + } + + assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride); + assert(ms_params->ms_buffers.ref->width == ms_params->ms_buffers.src->width); + + switch (search_method) { + case FAST_BIGDIA: + var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list, + best_mv, best_mv_stats); + break; + case VFAST_DIAMOND: + var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list, + best_mv, best_mv_stats); + break; + case FAST_DIAMOND: + var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list, + best_mv, best_mv_stats); + break; + case FAST_HEX: + var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list, + best_mv, best_mv_stats); + break; + case HEX: + var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv, + best_mv_stats); + break; + case SQUARE: + var = square_search(start_mv, ms_params, step_param, 1, cost_list, + best_mv, best_mv_stats); + break; + case BIGDIA: + var = bigdia_search(start_mv, ms_params, step_param, 1, cost_list, + best_mv, best_mv_stats); + break; + case NSTEP: + case NSTEP_8PT: + case DIAMOND: + case CLAMPED_DIAMOND: + var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list, + best_mv, best_mv_stats, second_best_mv); + break; + default: assert(0 && "Invalid search method."); + } + + // Should we allow a follow on exhaustive search? + if (!run_mesh_search && + ((search_method == NSTEP) || (search_method == NSTEP_8PT)) && + !ms_params->ms_buffers.second_pred) { + int exhaustive_thr = ms_params->force_mesh_thresh; + exhaustive_thr >>= + 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); + // Threshold variance for an exhaustive full search. + if (var > exhaustive_thr) run_mesh_search = 1; + } + + // TODO(yunqing): the following is used to reduce mesh search in temporal + // filtering. Can extend it to intrabc. + if (!is_intra_mode && ms_params->prune_mesh_search) { + const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row), + abs(start_mv.col - best_mv->col)); + if (full_pel_mv_diff <= ms_params->mesh_search_mv_diff_threshold) { + run_mesh_search = 0; + } + } + + if (ms_params->sdf != ms_params->vfp->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run mesh search with + // skip row features off. + // TODO(chiyotsai@google.com): Handle the case where we have a vertical + // offset of 1 before we hit this statement to avoid having to redo + // motion search. 
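As a reading aid for the acceptance test that follows (the numbers here are made up and the snippet is not part of the change): the row-skipped result is kept unless the full SAD exceeds a small per-block threshold and the skipped-row SAD differs from it by 90% or more, in which case the search is redone without row skipping.

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
      /* Hypothetical SAD values, purely to illustrate the redo condition
       * below: redo when |skip_sad - sad| is at least 90% of sad. */
      const int sad = 1000, skip_sad = 50;
      const int sad_floor = sad > 1 ? sad : 1; /* stands in for AOMMAX(sad, 1) */
      const int redo = abs(skip_sad - sad) * 10 >= sad_floor * 9;
      printf("redo full-precision search: %d\n", redo); /* 9500 >= 9000 -> 1 */
      return 0;
    }

When the condition fires, the code below swaps the skipping SAD functions back to the full-precision ones and reruns av1_full_pixel_search.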
+ const struct buf_2d *src = ms_params->ms_buffers.src; + const struct buf_2d *ref = ms_params->ms_buffers.ref; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + const uint8_t *src_address = src->buf; + const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv); + const int sad = + ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride); + const int skip_sad = + ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride); + // We will keep the result of skipping rows if it's good enough. Here, good + // enough means the error is less than 1 per pixel. + const int kSADThresh = + 1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params; + new_ms_params.sdf = new_ms_params.vfp->sdf; + new_ms_params.sdx4df = new_ms_params.vfp->sdx4df; + new_ms_params.sdx3df = new_ms_params.vfp->sdx3df; + + return av1_full_pixel_search(start_mv, &new_ms_params, step_param, + cost_list, best_mv, best_mv_stats, + second_best_mv); + } + } + + if (run_mesh_search) { + int var_ex; + FULLPEL_MV tmp_mv_ex; + FULLPEL_MV_STATS tmp_mv_stats; + // Pick the mesh pattern for exhaustive search based on the toolset (intraBC + // or non-intraBC) + // TODO(chiyotsai@google.com): There is a bug here where the second best mv + // gets overwritten without actually comparing the rdcost. + const MESH_PATTERN *const mesh_patterns = + ms_params->mesh_patterns[is_intra_mode]; + // TODO(chiyotsai@google.com): the second best mv is not set correctly by + // full_pixel_exhaustive, which can incorrectly override it. 
+ var_ex = + full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, cost_list, + &tmp_mv_ex, &tmp_mv_stats, second_best_mv); + if (var_ex < var) { + var = var_ex; + *best_mv_stats = tmp_mv_stats; + *best_mv = tmp_mv_ex; + } + } + + return var; +} + +int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + IntraBCHashInfo *intrabc_hash_info, + FULLPEL_MV *best_mv) { + if (!av1_use_hash_me(cpi)) return INT_MAX; + + const BLOCK_SIZE bsize = ms_params->bsize; + const int block_width = block_size_wide[bsize]; + const int block_height = block_size_high[bsize]; + + if (block_width != block_height) return INT_MAX; + + const FullMvLimits *mv_limits = &ms_params->mv_limits; + const MSBuffers *ms_buffer = &ms_params->ms_buffers; + + const uint8_t *src = ms_buffer->src->buf; + const int src_stride = ms_buffer->src->stride; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int x_pos = mi_col * MI_SIZE; + const int y_pos = mi_row * MI_SIZE; + + uint32_t hash_value1, hash_value2; + int best_hash_cost = INT_MAX; + + // for the hashMap + hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table; + + av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width, + &hash_value1, &hash_value2, is_cur_buf_hbd(xd)); + + const int count = av1_hash_table_count(ref_frame_hash, hash_value1); + if (count <= 1) { + return INT_MAX; + } + + Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1); + for (int i = 0; i < count; i++, aom_iterator_increment(&iterator)) { + block_hash ref_block_hash = *(block_hash *)(aom_iterator_get(&iterator)); + if (hash_value2 == ref_block_hash.hash_value2) { + // Make sure the prediction is from valid area. + const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos), + GET_MV_SUBPEL(ref_block_hash.x - x_pos) }; + if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize, + cpi->common.seq_params->mib_size_log2)) + continue; + + FULLPEL_MV hash_mv; + hash_mv.col = ref_block_hash.x - x_pos; + hash_mv.row = ref_block_hash.y - y_pos; + if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue; + FULLPEL_MV_STATS mv_stats; + const int refCost = get_mvpred_var_cost(ms_params, &hash_mv, &mv_stats); + if (refCost < best_hash_cost) { + best_hash_cost = refCost; + *best_mv = hash_mv; + } + } + } + + return best_hash_cost; +} + +static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size, + int full_search, int *sad) { + int best_sad = INT_MAX; + int this_sad; + int d; + int center, offset = 0; + int bw = search_size << 1; + + if (full_search) { + for (d = 0; d <= bw; d++) { + this_sad = aom_vector_var(&ref[d], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + offset = d; + } + } + center = offset; + *sad = best_sad; + return (center - (bw >> 1)); + } + + for (d = 0; d <= bw; d += 16) { + this_sad = aom_vector_var(&ref[d], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + offset = d; + } + } + center = offset; + + for (d = -8; d <= 8; d += 16) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -4; d <= 4; d += 8) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = 
this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -2; d <= 2; d += 4) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -1; d <= 1; d += 2) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + *sad = best_sad; + return (center - (bw >> 1)); +} + +// A special fast version of motion search used in rt mode. +// The search window along columns and row is given by: +// +/- me_search_size_col/row. +unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col, const MV *ref_mv, + unsigned int *y_sad_zero, + int me_search_size_col, + int me_search_size_row) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + int idx; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; + const int full_search = is_screen; + const bool screen_scroll_superblock = + is_screen && bsize == cm->seq_params->sb_size; + // Keep border a multiple of 16. + const int border = (cpi->oxcf.border_in_pixels >> 4) << 4; + int search_size_width = me_search_size_col; + int search_size_height = me_search_size_row; + // Adjust based on boundary. + if (((mi_col << 2) - search_size_width < -border) || + ((mi_col << 2) + search_size_width > cm->width + border)) + search_size_width = border; + if (((mi_row << 2) - search_size_height < -border) || + ((mi_row << 2) + search_size_height > cm->height + border)) + search_size_height = border; + const int src_stride = x->plane[0].src.stride; + const int ref_stride = xd->plane[0].pre[0].stride; + uint8_t const *ref_buf, *src_buf; + int_mv *best_int_mv = &xd->mi[0]->mv[0]; + unsigned int best_sad, tmp_sad, this_sad[4]; + int best_sad_col, best_sad_row; + const int row_norm_factor = mi_size_high_log2[bsize] + 1; + const int col_norm_factor = 3 + (bw >> 5); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + static const MV search_pos[4] = { + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, + }; + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, + MAX_MB_PLANE); + } + + if (xd->bd != 8) { + best_int_mv->as_fullmv = kZeroFullMv; + best_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return best_sad; + } + const int width_ref_buf = (search_size_width << 1) + bw; + const int height_ref_buf = (search_size_height << 1) + bh; + int16_t *hbuf = (int16_t *)aom_malloc(width_ref_buf * sizeof(*hbuf)); + int16_t *vbuf = (int16_t *)aom_malloc(height_ref_buf * sizeof(*vbuf)); + int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf)); + int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf)); + if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) { + aom_free(hbuf); + aom_free(vbuf); + aom_free(src_hbuf); + aom_free(src_vbuf); + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf"); + } + + // Set up prediction 1-D reference set for rows. + ref_buf = xd->plane[0].pre[0].buf - search_size_width; + aom_int_pro_row(hbuf, ref_buf, ref_stride, width_ref_buf, bh, + row_norm_factor); + + // Set up prediction 1-D reference set for cols + ref_buf = xd->plane[0].pre[0].buf - search_size_height * ref_stride; + aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, height_ref_buf, + col_norm_factor); + + // Set up src 1-D reference set + src_buf = x->plane[0].src.buf; + aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor); + aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor); + + // Find the best match per 1-D search + best_int_mv->as_fullmv.col = + vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width, + full_search, &best_sad_col); + best_int_mv->as_fullmv.row = + vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height, + full_search, &best_sad_row); + + // For screen: select between horiz or vert motion. + if (is_screen) { + if (best_sad_col < best_sad_row) + best_int_mv->as_fullmv.row = 0; + else + best_int_mv->as_fullmv.col = 0; + } + + FULLPEL_MV this_mv = best_int_mv->as_fullmv; + src_buf = x->plane[0].src.buf; + ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); + best_sad = + cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + + // Evaluate zero MV if found MV is non-zero. 
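To summarize the stage above (an explanatory aside with made-up data, not part of the change): aom_int_pro_row() and aom_int_pro_col() collapse the reference window and the source block into 1-D row and column projections, and vector_match() then estimates the row and column offsets independently with a 1-D SAD search over each projection. A self-contained toy version of that 1-D matching step:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
      /* Made-up projections: find where the 4-entry source projection best
       * lines up inside the 8-entry reference projection (1-D SAD search). */
      const int ref[8] = { 9, 7, 3, 5, 8, 2, 6, 4 };
      const int src[4] = { 5, 8, 2, 6 };
      int best_d = 0, best_sad = 1 << 30;
      for (int d = 0; d + 4 <= 8; d++) {
        int sad = 0;
        for (int i = 0; i < 4; i++) sad += abs(ref[d + i] - src[i]);
        if (sad < best_sad) { best_sad = sad; best_d = d; }
      }
      printf("best offset = %d, sad = %d\n", best_d, best_sad); /* 3 and 0 */
      return 0;
    }

The code below then re-checks the zero MV and, except in the screen-scroll superblock case, refines the winner with a small four-point neighbourhood before clamping it to the MV limits.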
+ if (best_int_mv->as_int != 0) { + tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + *y_sad_zero = tmp_sad; + if (tmp_sad < best_sad) { + best_int_mv->as_fullmv = kZeroFullMv; + this_mv = best_int_mv->as_fullmv; + ref_buf = xd->plane[0].pre[0].buf; + best_sad = tmp_sad; + } + } else { + *y_sad_zero = best_sad; + } + + if (!screen_scroll_superblock) { + const uint8_t *const pos[4] = { + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, + }; + + cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, + this_sad); + + for (idx = 0; idx < 4; ++idx) { + if (this_sad[idx] < best_sad) { + best_sad = this_sad[idx]; + best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row; + best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col; + } + } + + if (this_sad[0] < this_sad[3]) + this_mv.row -= 1; + else + this_mv.row += 1; + + if (this_sad[1] < this_sad[2]) + this_mv.col -= 1; + else + this_mv.col += 1; + + ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); + + tmp_sad = + cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + if (best_sad > tmp_sad) { + best_int_mv->as_fullmv = this_mv; + best_sad = tmp_sad; + } + } + + FullMvLimits mv_limits = x->mv_limits; + av1_set_mv_search_range(&mv_limits, ref_mv); + clamp_fullmv(&best_int_mv->as_fullmv, &mv_limits); + + convert_fullmv_to_mv(best_int_mv); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + + aom_free(hbuf); + aom_free(vbuf); + aom_free(src_hbuf); + aom_free(src_vbuf); + return best_sad; +} + +// ============================================================================= +// Fullpixel Motion Search: OBMC +// ============================================================================= +static INLINE int get_obmc_mvpred_var( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *ref_buf = ms_buffers->ref; + + const MV mv = get_mv_from_fullmv(this_mv); + unsigned int unused; + + return vfp->ovf(get_buf_from_fullmv(ref_buf, this_mv), ref_buf->stride, wsrc, + mask, &unused) + + mv_err_cost_(&mv, mv_cost_params); +} + +static int obmc_refining_search_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) { + const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *ref_buf = ms_buffers->ref; + const FULLPEL_MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const int kSearchRange = 8; + + unsigned int best_sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, best_mv), + ref_buf->stride, wsrc, mask) + + mvsad_err_cost_(best_mv, mv_cost_params); + + for (int i = 0; i < kSearchRange; i++) { + int best_site = -1; + + for (int j = 0; j < 4; j++) { + const FULLPEL_MV mv = { best_mv->row + neighbors[j].row, + best_mv->col + neighbors[j].col }; + if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { + unsigned int sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, &mv), + 
ref_buf->stride, wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_mv->row += neighbors[best_site].row; + best_mv->col += neighbors[best_site].col; + } + } + return best_sad; +} + +static int obmc_diamond_search_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv, + FULLPEL_MV *best_mv, int search_step, int *num00) { + const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; + const search_site_config *cfg = ms_params->search_sites; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *const ref_buf = ms_buffers->ref; + + // search_step determines the length of the initial step and hence the number + // of iterations. + const int tot_steps = cfg->num_search_steps - search_step; + const uint8_t *best_address, *init_ref; + int best_sad = INT_MAX; + int best_site = 0; + + clamp_fullmv(&start_mv, &ms_params->mv_limits); + best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv); + *num00 = 0; + *best_mv = start_mv; + + // Check the starting position + best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) + + mvsad_err_cost_(best_mv, mv_cost_params); + + for (int step = tot_steps - 1; step >= 0; --step) { + const search_site *const site = cfg->site[step]; + best_site = 0; + for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) { + const FULLPEL_MV mv = { best_mv->row + site[idx].mv.row, + best_mv->col + site[idx].mv.col }; + if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { + int sad = fn_ptr->osdf(best_address + site[idx].offset, ref_buf->stride, + wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = idx; + } + } + } + } + + if (best_site != 0) { + best_mv->row += site[best_site].mv.row; + best_mv->col += site[best_site].mv.col; + best_address += site[best_site].offset; + } else if (best_address == init_ref) { + (*num00)++; + } + } + return best_sad; +} + +static int obmc_full_pixel_diamond( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv, + int step_param, FULLPEL_MV *best_mv) { + const search_site_config *cfg = ms_params->search_sites; + FULLPEL_MV tmp_mv; + int thissme, n, num00 = 0; + int bestsme = + obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param, &n); + if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(ms_params, &tmp_mv); + *best_mv = tmp_mv; + + // If there won't be more n-step search, check to see if refining search is + // needed. 
+ const int further_steps = cfg->num_search_steps - 1 - step_param; + + while (n < further_steps) { + ++n; + + if (num00) { + num00--; + } else { + thissme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, + step_param + n, &num00); + if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv); + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = tmp_mv; + } + } + } + + return bestsme; +} + +int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, FULLPEL_MV *best_mv) { + if (!ms_params->fast_obmc_search) { + const int bestsme = + obmc_full_pixel_diamond(ms_params, start_mv, step_param, best_mv); + return bestsme; + } else { + *best_mv = start_mv; + clamp_fullmv(best_mv, &ms_params->mv_limits); + int thissme = obmc_refining_search_sad(ms_params, best_mv); + if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, best_mv); + return thissme; + } +} + +// ============================================================================= +// Subpixel Motion Search: Translational +// ============================================================================= +#define INIT_SUBPEL_STEP_SIZE (4) +/* + * To avoid the penalty for crossing cache-line read, preload the reference + * area in a small buffer, which is aligned to make sure there won't be crossing + * cache-line read while reading from this buffer. This reduced the cpu + * cycles spent on reading ref data in sub-pixel filter functions. + * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x + * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we + * could reduce the area. + */ + +// Returns the subpel offset used by various subpel variance functions [m]sv[a]f +static INLINE int get_subpel_part(int x) { return x & 7; } + +// Gets the address of the ref buffer at subpel location (r, c), rounded to the +// nearest fullpel precision toward - \infty +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV mv) { + const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3); + return &buf->buf[offset]; +} + +// Estimates the variance of prediction residue using bilinear filter for fast +// search. +static INLINE int estimated_pref_error( + const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int src_stride = ms_buffers->src->stride; + const int ref_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + if (second_pred == NULL) { + return vfp->svf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse); + } else if (mask) { + return vfp->msvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + second_pred, mask, mask_stride, invert_mask, sse); + } else { + return vfp->svaf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse, second_pred); + } +} + +// Calculates the variance of prediction residue. 
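For orientation (an illustrative aside, not upstream code): a subpel MV is stored in 1/8-pel units, so get_buf_from_mv() above extracts the fullpel part with an arithmetic shift right by 3, rounding toward minus infinity, while get_subpel_part() keeps the low three bits as the q3 phase handed to the sv[a]f/msvf functions. A minimal standalone example, assuming the usual arithmetic right shift for negative values:

    #include <stdio.h>

    int main(void) {
      /* Made-up MV components in 1/8-pel units: 2.625 px and -1.625 px. */
      const int mv_vals[2] = { 21, -13 };
      for (int i = 0; i < 2; i++) {
        const int v = mv_vals[i];
        printf("mv=%3d -> fullpel=%2d, subpel_q3=%d\n", v, v >> 3, v & 7);
      }
      return 0; /* prints fullpel 2 / subpel 5, then fullpel -2 / subpel 3 */
    }

The function below computes the same prediction error exactly, by building the actual subpel prediction with the configured interpolation filter rather than the bilinear estimate used by estimated_pref_error().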
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, + const MV *this_mv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int src_stride = ms_buffers->src->stride; + const int ref_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + const int w = var_params->w; + const int h = var_params->h; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + unsigned int besterr; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16); + if (second_pred != NULL) { + if (mask) { + aom_highbd_comp_mask_upsampled_pred( + xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride, + invert_mask, xd->bd, subpel_search_type); + } else { + aom_highbd_comp_avg_upsampled_pred( + xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd, + subpel_search_type); + } + } else { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + xd->bd, subpel_search_type); + } + besterr = vfp->vf(pred8, w, src, src_stride, sse); + } else { + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); + if (second_pred != NULL) { + if (mask) { + aom_comp_mask_upsampled_pred( + xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride, + invert_mask, subpel_search_type); + } else { + aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, + second_pred, w, h, subpel_x_q3, subpel_y_q3, + ref, ref_stride, subpel_search_type); + } + } else { + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search_type); + } + + besterr = vfp->vf(pred, w, src, src_stride, sse); + } +#else + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); + if (second_pred != NULL) { + if (mask) { + aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, + second_pred, w, h, subpel_x_q3, subpel_y_q3, + ref, ref_stride, mask, mask_stride, + invert_mask, subpel_search_type); + } else { + aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, + second_pred, w, h, subpel_x_q3, subpel_y_q3, + ref, ref_stride, subpel_search_type); + } + } else { + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, + subpel_y_q3, ref, ref_stride, subpel_search_type); + } + + besterr = vfp->vf(pred, w, src, src_stride, sse); +#endif + return besterr; +} + +// Estimates whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. It is suffixed "fast" because +// it uses bilinear filter to estimate the prediction. 
+static INLINE unsigned int check_better_fast( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, + const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *has_better_mv, int is_scaled) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + int thismse; + if (is_scaled) { + thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse); + } else { + thismse = estimated_pref_error(this_mv, var_params, &sse); + } + cost = mv_err_cost_(this_mv, mv_cost_params); + cost += thismse; + + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *has_better_mv |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +// Checks whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. +static AOM_FORCE_INLINE unsigned int check_better( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, + const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *is_better) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + int thismse; + thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse); + cost = mv_err_cost_(this_mv, mv_cost_params); + cost += thismse; + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *is_better |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost, + unsigned int right_cost, + unsigned int up_cost, + unsigned int down_cost) { + const MV diag_step = { up_cost <= down_cost ? -step_size : step_size, + left_cost <= right_cost ? -step_size : step_size }; + + return diag_step; +} + +// Searches the four cardinal direction for a better mv, then follows up with a +// search in the best quadrant. This uses bilinear filter to speed up the +// calculation. 
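As an illustration of the quadrant choice (made-up costs, not from a real search): get_best_diag_step() above simply pairs the cheaper of left/right with the cheaper of up/down, and that one diagonal is the only extra point probed after the four cardinal ones.

    /* Hypothetical cardinal costs around the current centre:
     *   left = 210, right = 190, up = 205, down = 230
     * get_best_diag_step(hstep, 210, 190, 205, 230) returns { -hstep, +hstep },
     * i.e. the up-right diagonal, because up <= down and right < left. */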
+static AOM_FORCE_INLINE MV first_level_check_fast( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv, + int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int is_scaled) { + // Check the four cardinal directions + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + int dummy = 0; + const unsigned int left = check_better_fast( + xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, &dummy, is_scaled); + + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const unsigned int right = check_better_fast( + xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy, is_scaled); + + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const unsigned int up = check_better_fast( + xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, &dummy, is_scaled); + + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + const unsigned int down = check_better_fast( + xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy, is_scaled); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + check_better_fast(xd, cm, &diag_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy, + is_scaled); + + return diag_step; +} + +// Performs a following up search after first_level_check_fast is called. This +// performs two extra chess pattern searches in the best quadrant. 
+static AOM_FORCE_INLINE void second_level_check_fast( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, const MV diag_step, + MV *best_mv, int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int is_scaled) { + assert(diag_step.row == hstep || diag_step.row == -hstep); + assert(diag_step.col == hstep || diag_step.col == -hstep); + const int tr = this_mv.row; + const int tc = this_mv.col; + const int br = best_mv->row; + const int bc = best_mv->col; + int dummy = 0; + if (tr != br && tc != bc) { + assert(diag_step.col == bc - tc); + assert(diag_step.row == br - tr); + const MV chess_mv_1 = { br, bc + diag_step.col }; + const MV chess_mv_2 = { br + diag_step.row, bc }; + check_better_fast(xd, cm, &chess_mv_1, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy, + is_scaled); + + check_better_fast(xd, cm, &chess_mv_2, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy, + is_scaled); + } else if (tr == br && tc != bc) { + assert(diag_step.col == bc - tc); + // Continue searching in the best direction + const MV bottom_long_mv = { br + hstep, bc + diag_step.col }; + const MV top_long_mv = { br - hstep, bc + diag_step.col }; + check_better_fast(xd, cm, &bottom_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &top_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy, + is_scaled); + + // Search in the direction opposite of the best quadrant + const MV rev_mv = { br - diag_step.row, bc }; + check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy, + is_scaled); + } else if (tr != br && tc == bc) { + assert(diag_step.row == br - tr); + // Continue searching in the best direction + const MV right_long_mv = { br + diag_step.row, bc + hstep }; + const MV left_long_mv = { br + diag_step.row, bc - hstep }; + check_better_fast(xd, cm, &right_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &left_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy, + is_scaled); + + // Search in the direction opposite of the best quadrant + const MV rev_mv = { br, bc - diag_step.col }; + check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy, + is_scaled); + } +} + +// Combines first level check and second level check when applicable. This first +// searches the four cardinal directions, and perform several +// diagonal/chess-pattern searches in the best quadrant. 
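The routine introduced by the comment above is the per-precision-level workhorse of the pruned subpel searches further below. A note on the step sizes it is typically called with (a reading aid, not upstream code):

    /* MVs are in 1/8-pel units, so with INIT_SUBPEL_STEP_SIZE = 4:
     *   level 0: hstep = 4 -> probes at +/- 1/2 pel
     *   level 1: hstep = 2 -> probes at +/- 1/4 pel
     *   level 2: hstep = 1 -> probes at +/- 1/8 pel
     * forced_stop and allow_hp decide how many of these levels actually run. */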
+static AOM_FORCE_INLINE void two_level_checks_fast( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv, + int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int iters, int is_scaled) { + const MV diag_step = first_level_check_fast( + xd, cm, this_mv, best_mv, hstep, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, is_scaled); + if (iters > 1) { + second_level_check_fast(xd, cm, this_mv, diag_step, best_mv, hstep, + mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, is_scaled); + } +} + +static AOM_FORCE_INLINE MV +first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, + MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + int dummy = 0; + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + + const unsigned int left = + check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int right = + check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int up = + check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int down = + check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + + return diag_step; +} + +// A newer version of second level check that gives better quality. 
+// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different +// from av1_find_best_sub_pixel_tree +static AOM_FORCE_INLINE void second_level_check_v2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, + MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int is_scaled) { + assert(best_mv->row == this_mv.row + diag_step.row || + best_mv->col == this_mv.col + diag_step.col); + if (CHECK_MV_EQUAL(this_mv, *best_mv)) { + return; + } else if (this_mv.row == best_mv->row) { + // Search away from diagonal step since diagonal search did not provide any + // improvement + diag_step.row *= -1; + } else if (this_mv.col == best_mv->col) { + diag_step.col *= -1; + } + + const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; + const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; + const MV diag_bias_mv = { best_mv->row + diag_step.row, + best_mv->col + diag_step.col }; + int has_better_mv = 0; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + } + } else { + check_better_fast(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv, + is_scaled); + check_better_fast(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv, + is_scaled); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + check_better_fast(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv, is_scaled); + } + } +} + +// Gets the error at the beginning when the mv has fullpel precision +static unsigned int setup_center_error( + const MACROBLOCKD *xd, const MV *bestmv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const int w = var_params->w; + const int h = var_params->h; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv); + const int src_stride = ms_buffers->src->stride; + const int y_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + + unsigned int besterr; + + if (second_pred != NULL) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); + uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16); + if (mask) { + aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, + mask, mask_stride, invert_mask); + } else { + aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = 
vfp->vf(comp_pred, w, src, src_stride, sse1); + } else { + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); + if (mask) { + aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, + mask_stride, invert_mask); + } else { + aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } +#else + (void)xd; + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); + if (mask) { + aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, + mask_stride, invert_mask); + } else { + aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); +#endif + } else { + besterr = vfp->vf(y, y_stride, src, src_stride, sse1); + } + *distortion = besterr; + besterr += mv_err_cost_(bestmv, mv_cost_params); + return besterr; +} + +// Gets the error at the beginning when the mv has fullpel precision +static unsigned int upsampled_setup_center_error( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1); + *distortion = besterr; + besterr += mv_err_cost_(bestmv, mv_cost_params); + return besterr; +} + +static INLINE int divide_and_round(int n, int d) { + return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); +} + +static INLINE int is_cost_list_wellbehaved(const int *cost_list) { + return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] && + cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4]; +} + +// Returns surface minima estimate at given precision in 1/2^n bits. +// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C +// For a given set of costs S0, S1, S2, S3, S4 at points +// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively, +// the solution for the location of the minima (x0, y0) is given by: +// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0), +// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). +// The code below is an integerized version of that. +static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic, + int bits) { + *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), + (cost_list[1] - 2 * cost_list[0] + cost_list[3])); + *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), + (cost_list[4] - 2 * cost_list[0] + cost_list[2])); +} + +// Checks the list of mvs searched in the last iteration and see if we are +// repeating it. If so, return 1. Otherwise we update the last_mv_search_list +// with current_mv and return 0. 
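A worked example of the surface fit above, with made-up costs (this illustrates get_cost_surf_min() and divide_and_round() only; the values are not encoder output):

    #include <stdio.h>

    /* Local restatement of divide_and_round() above so the example compiles
     * on its own. */
    static int divide_and_round(int n, int d) {
      return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
    }

    int main(void) {
      /* Hypothetical costs at (y, x) = (0,0), (0,-1), (1,0), (0,1), (-1,0). */
      const int c[5] = { 100, 130, 120, 110, 160 };
      const int bits = 1; /* answer in units of 1/2^bits, i.e. half a pel */
      const int ic = divide_and_round((c[1] - c[3]) * (1 << (bits - 1)),
                                      c[1] - 2 * c[0] + c[3]);
      const int ir = divide_and_round((c[4] - c[2]) * (1 << (bits - 1)),
                                      c[4] - 2 * c[0] + c[2]);
      /* ic = round(20/40) = 1 and ir = round(40/80) = 1: the fitted minimum
       * sits about half a pel toward the cheaper right and bottom neighbours. */
      printf("ir=%d ic=%d\n", ir, ic);
      return 0;
    }

check_repeated_mv_and_update() below is the bookkeeping helper that the preceding comment describes.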
+static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list, + const MV current_mv, int iter) { + if (last_mv_search_list) { + if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) { + return 1; + } + + last_mv_search_list[iter].as_mv = current_mv; + } + return 0; +} + +static AOM_INLINE int setup_center_error_facade( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion, + int is_scaled) { + if (is_scaled) { + return upsampled_setup_center_error(xd, cm, bestmv, var_params, + mv_cost_params, sse1, distortion); + } else { + return setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1, + distortion); + } +} + +int av1_find_best_sub_pixel_tree_pruned_more( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, + unsigned int *sse1, int_mv *last_mv_search_list) { + (void)cm; + const int allow_hp = ms_params->allow_hp; + const int forced_stop = ms_params->forced_stop; + const int iters_per_step = ms_params->iters_per_step; + const int *cost_list = ms_params->cost_list; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + + // The iteration we are current searching for. Iter 0 corresponds to fullpel + // mv, iter 1 to half pel, and so on + int iter = 0; + int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel + unsigned int besterr = INT_MAX; + *bestmv = start_mv; + + const struct scale_factors *const sf = is_intrabc_block(xd->mi[0]) + ? &cm->sf_identity + : xd->block_ref_scale_factors[0]; + const int is_scaled = av1_is_scaled(sf); + + if (start_mv_stats != NULL && !is_scaled) { + besterr = start_mv_stats->distortion + start_mv_stats->err_cost; + *distortion = start_mv_stats->distortion; + *sse1 = start_mv_stats->sse; + } else { + besterr = + setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params, + sse1, distortion, is_scaled); + } + + // If forced_stop is FULL_PEL, return. 
+ if (forced_stop == FULL_PEL) return besterr; + + if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { + return INT_MAX; + } + iter++; + + if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && + cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && + cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { + int ir, ic; + get_cost_surf_min(cost_list, &ir, &ic, 1); + if (ir != 0 || ic != 0) { + const MV this_mv = { start_mv.row + ir * hstep, + start_mv.col + ic * hstep }; + int dummy = 0; + check_better_fast(xd, cm, &this_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + } + } else { + two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, + var_params, mv_cost_params, &besterr, sse1, + distortion, iters_per_step, is_scaled); + } + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + if (forced_stop < HALF_PEL) { + if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { + return INT_MAX; + } + iter++; + + hstep >>= 1; + start_mv = *bestmv; + two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, + var_params, mv_cost_params, &besterr, sse1, + distortion, iters_per_step, is_scaled); + } + + if (allow_hp && forced_stop == EIGHTH_PEL) { + if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { + return INT_MAX; + } + iter++; + + hstep >>= 1; + start_mv = *bestmv; + two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, + var_params, mv_cost_params, &besterr, sse1, + distortion, iters_per_step, is_scaled); + } + + return besterr; +} + +int av1_find_best_sub_pixel_tree_pruned( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, + unsigned int *sse1, int_mv *last_mv_search_list) { + (void)cm; + (void)start_mv_stats; + const int allow_hp = ms_params->allow_hp; + const int forced_stop = ms_params->forced_stop; + const int iters_per_step = ms_params->iters_per_step; + const int *cost_list = ms_params->cost_list; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + + // The iteration we are current searching for. Iter 0 corresponds to fullpel + // mv, iter 1 to half pel, and so on + int iter = 0; + int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel + unsigned int besterr = INT_MAX; + *bestmv = start_mv; + + const struct scale_factors *const sf = is_intrabc_block(xd->mi[0]) + ? &cm->sf_identity + : xd->block_ref_scale_factors[0]; + const int is_scaled = av1_is_scaled(sf); + + if (start_mv_stats != NULL && !is_scaled) { + besterr = start_mv_stats->distortion + start_mv_stats->err_cost; + *distortion = start_mv_stats->distortion; + *sse1 = start_mv_stats->sse; + } else { + besterr = + setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params, + sse1, distortion, is_scaled); + } + + // If forced_stop is FULL_PEL, return. 
+ if (forced_stop == FULL_PEL) return besterr; + + if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { + return INT_MAX; + } + iter++; + + if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && + cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && + cost_list[4] != INT_MAX) { + const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) + + (cost_list[2] < cost_list[4] ? 0 : 2); + + const MV left_mv = { start_mv.row, start_mv.col - hstep }; + const MV right_mv = { start_mv.row, start_mv.col + hstep }; + const MV bottom_mv = { start_mv.row + hstep, start_mv.col }; + const MV top_mv = { start_mv.row - hstep, start_mv.col }; + + const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep }; + const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep }; + const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep }; + const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep }; + + int dummy = 0; + + switch (whichdir) { + case 0: // bottom left quadrant + check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &bottom_left_mv, bestmv, mv_limits, + var_params, mv_cost_params, &besterr, sse1, + distortion, &dummy, is_scaled); + break; + case 1: // bottom right quadrant + check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &bottom_right_mv, bestmv, mv_limits, + var_params, mv_cost_params, &besterr, sse1, + distortion, &dummy, is_scaled); + break; + case 2: // top left quadrant + check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &top_left_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + break; + case 3: // top right quadrant + check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + check_better_fast(xd, cm, &top_right_mv, bestmv, mv_limits, var_params, + mv_cost_params, &besterr, sse1, distortion, &dummy, + is_scaled); + break; + } + } else { + two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, + var_params, mv_cost_params, &besterr, sse1, + distortion, iters_per_step, is_scaled); + } + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + if (forced_stop < HALF_PEL) { + if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { + return INT_MAX; + } + iter++; + + hstep >>= 1; + start_mv = *bestmv; + two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, + var_params, mv_cost_params, &besterr, sse1, + distortion, iters_per_step, is_scaled); + } + + if (allow_hp && forced_stop == 
EIGHTH_PEL) { + if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) { + return INT_MAX; + } + iter++; + + hstep >>= 1; + start_mv = *bestmv; + two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits, + var_params, mv_cost_params, &besterr, sse1, + distortion, iters_per_step, is_scaled); + } + + return besterr; +} + +int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, + MV *bestmv, int *distortion, + unsigned int *sse1, + int_mv *last_mv_search_list) { + (void)start_mv_stats; + const int allow_hp = ms_params->allow_hp; + const int forced_stop = ms_params->forced_stop; + const int iters_per_step = ms_params->iters_per_step; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const SUBPEL_SEARCH_TYPE subpel_search_type = + ms_params->var_params.subpel_search_type; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + // How many steps to take. A round of 0 means fullpel search only, 1 means + // half-pel, and so on. + const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp); + int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel + + unsigned int besterr = INT_MAX; + + *bestmv = start_mv; + + const struct scale_factors *const sf = is_intrabc_block(xd->mi[0]) + ? &cm->sf_identity + : xd->block_ref_scale_factors[0]; + const int is_scaled = av1_is_scaled(sf); + + if (start_mv_stats != NULL && !is_scaled) { + besterr = start_mv_stats->distortion + start_mv_stats->err_cost; + *distortion = start_mv_stats->distortion; + *sse1 = start_mv_stats->sse; + } else { + if (subpel_search_type != USE_2_TAPS_ORIG) { + besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params, + mv_cost_params, sse1, distortion); + } else { + besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1, + distortion); + } + } + + // If forced_stop is FULL_PEL, return. + if (!round) return besterr; + + for (int iter = 0; iter < round; ++iter) { + MV iter_center_mv = *bestmv; + if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv, + iter)) { + return INT_MAX; + } + + MV diag_step; + if (subpel_search_type != USE_2_TAPS_ORIG) { + diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + } else { + diag_step = first_level_check_fast(xd, cm, iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion, is_scaled); + } + + // Check diagonal sub-pixel position + if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) { + second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv, + mv_limits, var_params, mv_cost_params, &besterr, + sse1, distortion, is_scaled); + } + + hstep >>= 1; + } + + return besterr; +} + +// Note(yunqingwang): The following 2 functions are only used in the motion +// vector unit test, which return extreme motion vectors allowed by the MV +// limits. +// Returns the maximum MV. 
+int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, + MV *bestmv, int *distortion, unsigned int *sse1, + int_mv *last_mv_search_list) { + (void)xd; + (void)cm; + (void)start_mv; + (void)start_mv_stats; + (void)sse1; + (void)distortion; + (void)last_mv_search_list; + + const int allow_hp = ms_params->allow_hp; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + bestmv->row = mv_limits->row_max; + bestmv->col = mv_limits->col_max; + + unsigned int besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp, 0); + return besterr; +} + +// Returns the minimum MV. +int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, + MV *bestmv, int *distortion, unsigned int *sse1, + int_mv *last_mv_search_list) { + (void)xd; + (void)cm; + (void)start_mv; + (void)start_mv_stats; + (void)sse1; + (void)distortion; + (void)last_mv_search_list; + + const int allow_hp = ms_params->allow_hp; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + bestmv->row = mv_limits->row_min; + bestmv->col = mv_limits->col_min; + + unsigned int besterr = 0; + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp, 0); + return besterr; +} + +#if !CONFIG_REALTIME_ONLY +// Computes the cost of the current predictor by going through the whole +// av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv +// during motion_mode_rd. We are going through the whole +// av1_enc_build_inter_predictor because we might have changed the interpolation +// filter, etc before motion_mode_rd is called. +static INLINE unsigned int compute_motion_cost( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize, + const MV *this_mv) { + unsigned int mse; + unsigned int sse; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const MSBuffers *ms_buffers = &var_params->ms_buffers; + + const uint8_t *const src = ms_buffers->src->buf; + const int src_stride = ms_buffers->src->stride; + const uint8_t *const dst = xd->plane[0].dst.buf; + const int dst_stride = xd->plane[0].dst.stride; + const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp; + + mse = vfp->vf(dst, dst_stride, src, src_stride, &sse); + mse += mv_err_cost_(this_mv, &ms_params->mv_cost_params); + return mse; +} + +// Refines MV in a small range + +// Macros to build bitmasks which help us avoid redundant computations +// +// To explain the idea here, imagine that on the first iteration of the +// loop below, we step rightwards. Then, on the second iteration, the neighbors +// to consider are: +// . . . +// 0 1 . +// . . . +// Where 0 is the initial search point, 1 is the best candidate found in the +// first iteration, and the dots are the other neighbors of point 1. +// +// Naively, we would now need to scan all 8 neighbors of point 1 (point 0 and +// the seven points marked with dots), and compare them to see where to move +// next. 
However, we already evaluated 5 of those 8 neighbors in the last +// iteration, and decided that they are worse than point 1. So we don't need +// to re-consider these points. We only really need to consider the three +// points which are adjacent to point 1 but *not* to point 0. +// +// As the algorithm goes on, there are other ways that redundant evaluations +// can happen, if the search path curls back around on itself. +// +// To avoid all possible redundancies, we'd have to build a set containing +// every point we have already checked, and this would be quite expensive. +// +// So instead, we apply a 95%-effective solution with a much lower overhead: +// we prune out the points which were considered during the previous +// iteration, but we don't worry about any prior iteration. This can be done +// as follows: +// +// We build a static table, called neighbor_mask, which answers the question +// "if we moved in direction X last time, which neighbors are new, and which +// were scanned last iteration?" +// Then we can query this table to quickly determine which points we need to +// evaluate, and which we can skip. +// +// To query the table, the logic is simply: +// neighbor_mask[i] & (1 << j) == "if we moved in direction i last iteration, +// do we need to scan neighbor j this iteration?" +#define NEIGHBOR_MASK_DIA(left, down, right, up) \ + (left | (down << 1) | (right << 2) | (up << 3)) + +#define NEIGHBOR_MASK_SQR(left, down, right, up, down_left, down_right, \ + up_left, up_right) \ + (left | (down << 1) | (right << 2) | (up << 3) | (down_left << 4) | \ + (down_right << 5) | (up_left << 6) | (up_right << 7)) + +static const warp_search_config warp_search_info[WARP_SEARCH_METHODS] = { + // WARP_SEARCH_DIAMOND + { + .num_neighbors = 4, + .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }, + .neighbor_mask = { + // If we stepped left last time, consider all points except right + NEIGHBOR_MASK_DIA(1, 1, 0, 1), + // If we stepped down last time, consider all points except up + NEIGHBOR_MASK_DIA(1, 1, 1, 0), + // Stepped right last time + NEIGHBOR_MASK_DIA(0, 1, 1, 1), + // Stepped up last time + NEIGHBOR_MASK_DIA(1, 0, 1, 1), + }, + }, + // WARP_SEARCH_SQUARE + { + .num_neighbors = 8, + .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, + { 1, -1 }, { 1, 1 }, { -1, -1 }, { -1, 1 } }, + .neighbor_mask = { + // If we stepped left last time, then we only need to consider 3 points: + // left, down+left, up+left + NEIGHBOR_MASK_SQR(1, 0, 0, 0, 1, 0, 1, 0), + // If we stepped down last time, then we only need to consider 3 points: + // down, down+left, down+right + NEIGHBOR_MASK_SQR(0, 1, 0, 0, 1, 1, 0, 0), + // Stepped right last time + NEIGHBOR_MASK_SQR(0, 0, 1, 0, 0, 1, 0, 1), + // Stepped up last time + NEIGHBOR_MASK_SQR(0, 0, 0, 1, 0, 0, 1, 1), + + // If we stepped down+left last time, then we need to consider 5 points: + // left, down, down+left, down+right, up+left + NEIGHBOR_MASK_SQR(1, 1, 0, 0, 1, 1, 1, 0), + // Stepped down+right last time + NEIGHBOR_MASK_SQR(0, 1, 1, 0, 1, 1, 0, 1), + // Stepped up+left last time + NEIGHBOR_MASK_SQR(1, 0, 0, 1, 1, 0, 1, 1), + // Stepped up+right last time + NEIGHBOR_MASK_SQR(0, 0, 1, 1, 0, 1, 1, 1), + }, + }, +}; + +unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + BLOCK_SIZE bsize, const int *pts0, + const int *pts_inref0, int total_samples, + WARP_SEARCH_METHOD search_method, + int num_iterations) { + MB_MODE_INFO *mbmi = xd->mi[0]; + + const MV 
*neighbors = warp_search_info[search_method].neighbors; + const int num_neighbors = warp_search_info[search_method].num_neighbors; + const uint8_t *neighbor_mask = warp_search_info[search_method].neighbor_mask; + + MV *best_mv = &mbmi->mv[0].as_mv; + + WarpedMotionParams best_wm_params = mbmi->wm_params; + int best_num_proj_ref = mbmi->num_proj_ref; + unsigned int bestmse; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + const int mv_shift = ms_params->allow_hp ? 0 : 1; + + // Calculate the center position's error + assert(av1_is_subpelmv_in_range(mv_limits, *best_mv)); + bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv); + + // MV search + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // First step always scans all neighbors + uint8_t valid_neighbors = UINT8_MAX; + + for (int ite = 0; ite < num_iterations; ++ite) { + int best_idx = -1; + + for (int idx = 0; idx < num_neighbors; ++idx) { + if ((valid_neighbors & (1 << idx)) == 0) { + continue; + } + + unsigned int thismse; + + MV this_mv = { best_mv->row + neighbors[idx].row * (1 << mv_shift), + best_mv->col + neighbors[idx].col * (1 << mv_shift) }; + if (av1_is_subpelmv_in_range(mv_limits, this_mv)) { + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + if (total_samples > 1) { + mbmi->num_proj_ref = + av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize); + } + + if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + this_mv.row, this_mv.col, &mbmi->wm_params, + mi_row, mi_col)) { + thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv); + + if (thismse < bestmse) { + best_idx = idx; + best_wm_params = mbmi->wm_params; + best_num_proj_ref = mbmi->num_proj_ref; + bestmse = thismse; + } + } + } + } + + if (best_idx == -1) break; + + if (best_idx >= 0) { + best_mv->row += neighbors[best_idx].row * (1 << mv_shift); + best_mv->col += neighbors[best_idx].col * (1 << mv_shift); + valid_neighbors = neighbor_mask[best_idx]; + } + } + + mbmi->wm_params = best_wm_params; + mbmi->num_proj_ref = best_num_proj_ref; + return bestmse; +} + +#endif // !CONFIG_REALTIME_ONLY +// ============================================================================= +// Subpixel Motion Search: OBMC +// ============================================================================= +// Estimates the variance of prediction residue +static INLINE int estimate_obmc_pref_error( + const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const int32_t *src = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int ref_stride = ms_buffers->ref->stride; + + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + return vfp->osvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, mask, sse); +} + +// Calculates the variance of prediction residue +static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, + const MV *this_mv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type; + const int w = var_params->w; 
+  const int h = var_params->h;
+
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const int32_t *wsrc = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+  const int ref_stride = ms_buffers->ref->stride;
+
+  const int subpel_x_q3 = get_subpel_part(this_mv->col);
+  const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  unsigned int besterr;
+  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(xd)) {
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+                              subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+                              subpel_search_type);
+    besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
+  } else {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+                       subpel_y_q3, ref, ref_stride, subpel_search_type);
+
+    besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+  }
+#else
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+                     subpel_y_q3, ref, ref_stride, subpel_search_type);
+
+  besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+#endif
+  return besterr;
+}
+
+static unsigned int setup_obmc_center_error(
+    const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  // TODO(chiyotsai@google.com): There might be a bug here where we didn't use
+  // get_buf_from_mv(ref, *this_mv).
+  const MSBuffers *ms_buffers = &var_params->ms_buffers;
+  const int32_t *wsrc = ms_buffers->wsrc;
+  const int32_t *mask = ms_buffers->obmc_mask;
+  const uint8_t *ref = ms_buffers->ref->buf;
+  const int ref_stride = ms_buffers->ref->stride;
+  unsigned int besterr =
+      var_params->vfp->ovf(ref, ref_stride, wsrc, mask, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost_(this_mv, mv_cost_params);
+  return besterr;
+}
+
+static unsigned int upsampled_setup_obmc_center_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv,
+    const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+    const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+  unsigned int besterr =
+      upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost_(this_mv, mv_cost_params);
+  return besterr;
+}
+
+// Estimates the mv cost of this_mv
+// TODO(chiyotsai@google.com): the cost does not match the cost in
+// mv_cost_. Investigate this later.
+static INLINE int estimate_obmc_mvcost(const MV *this_mv,
+                                       const MV_COST_PARAMS *mv_cost_params) {
+  const MV *ref_mv = mv_cost_params->ref_mv;
+  const int *mvjcost = mv_cost_params->mvjcost;
+  const int *const *mvcost = mv_cost_params->mvcost;
+  const int error_per_bit = mv_cost_params->error_per_bit;
+  const MV_COST_TYPE mv_cost_type = mv_cost_params->mv_cost_type;
+  const MV diff_mv = { GET_MV_SUBPEL(this_mv->row - ref_mv->row),
+                       GET_MV_SUBPEL(this_mv->col - ref_mv->col) };
+
+  switch (mv_cost_type) {
+    case MV_COST_ENTROPY:
+      return (unsigned)((mv_cost(&diff_mv, mvjcost,
+                                 CONVERT_TO_CONST_MVCOST(mvcost)) *
+                             error_per_bit +
+                         4096) >>
+                        13);
+    case MV_COST_NONE: return 0;
+    default:
+      assert(0 && "L1 norm is not tuned for estimated obmc mvcost");
+      return 0;
+  }
+}
+
+// Estimates whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account.
+static INLINE unsigned int obmc_check_better_fast( + const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *has_better_mv) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + const int thismse = estimate_obmc_pref_error(this_mv, var_params, &sse); + + cost = estimate_obmc_mvcost(this_mv, mv_cost_params); + cost += thismse; + + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *has_better_mv |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +// Estimates whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. +static INLINE unsigned int obmc_check_better( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, + const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *has_better_mv) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + const int thismse = + upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &sse); + cost = mv_err_cost_(this_mv, mv_cost_params); + + cost += thismse; + + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *has_better_mv |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +static AOM_FORCE_INLINE MV obmc_first_level_check( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv, + const int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + int dummy = 0; + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + const unsigned int left = + obmc_check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int right = + obmc_check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int up = + obmc_check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int down = + obmc_check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + return diag_step; + } else { + const unsigned int left = obmc_check_better_fast( + &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, + distortion, &dummy); + const unsigned int right = obmc_check_better_fast( + &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, &dummy); + + 
const unsigned int up = obmc_check_better_fast( + &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, + distortion, &dummy); + + const unsigned int down = obmc_check_better_fast( + &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + return diag_step; + } +} + +// A newer version of second level check for obmc that gives better quality. +static AOM_FORCE_INLINE void obmc_second_level_check_v2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, + MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + assert(best_mv->row == this_mv.row + diag_step.row || + best_mv->col == this_mv.col + diag_step.col); + if (CHECK_MV_EQUAL(this_mv, *best_mv)) { + return; + } else if (this_mv.row == best_mv->row) { + // Search away from diagonal step since diagonal search did not provide any + // improvement + diag_step.row *= -1; + } else if (this_mv.col == best_mv->col) { + diag_step.col *= -1; + } + + const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; + const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; + const MV diag_bias_mv = { best_mv->row + diag_step.row, + best_mv->col + diag_step.col }; + int has_better_mv = 0; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + obmc_check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + obmc_check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + obmc_check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } else { + obmc_check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + obmc_check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + obmc_check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } +} + +int av1_find_best_obmc_sub_pixel_tree_up( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion, + unsigned int *sse1, int_mv *last_mv_search_list) { + (void)last_mv_search_list; + (void)start_mv_stats; + const int allow_hp = ms_params->allow_hp; + const int forced_stop = ms_params->forced_stop; + const int iters_per_step = ms_params->iters_per_step; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const SUBPEL_SEARCH_TYPE subpel_search_type = + ms_params->var_params.subpel_search_type; + const 
SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + int hstep = INIT_SUBPEL_STEP_SIZE; + const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp); + + unsigned int besterr = INT_MAX; + *bestmv = start_mv; + + if (subpel_search_type != USE_2_TAPS_ORIG) + besterr = upsampled_setup_obmc_center_error( + xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion); + else + besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1, + distortion); + + for (int iter = 0; iter < round; ++iter) { + MV iter_center_mv = *bestmv; + MV diag_step = obmc_first_level_check(xd, cm, iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + + if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) { + obmc_second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + } + hstep >>= 1; + } + + return besterr; +} + +// ============================================================================= +// Public cost function: mv_cost + pred error +// ============================================================================= +int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, const struct buf_2d *pre) { + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int sse, var; + + var = vfp->vf(src->buf, src->stride, get_buf_from_fullmv(pre, &best_mv), + pre->stride, &sse); + (void)var; + + return sse + mv_err_cost_(&mv, mv_cost_params); +} + +static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre) { + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int unused; + + return vfp->svaf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0, + src->buf, src->stride, &unused, second_pred) + + mv_err_cost_(&mv, mv_cost_params); +} + +static INLINE int get_mvpred_mask_var( + const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, int mask_stride, + int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src, + const struct buf_2d *pre) { + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int unused; + + return vfp->msvf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0, + src->buf, src->stride, second_pred, mask, mask_stride, + invert_mask, &unused) + + mv_err_cost_(&mv, mv_cost_params); +} + +int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre) { + if (mask) { + return get_mvpred_mask_var(mv_cost_params, best_mv, second_pred, mask, + mask_stride, invert_mask, vfp, src, pre); + } else { + return get_mvpred_av_var(mv_cost_params, best_mv, second_pred, vfp, src, + pre); + } +} diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h new file mode 100644 index 0000000000..87b9309b61 --- /dev/null +++ b/third_party/aom/av1/encoder/mcomp.h @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MCOMP_H_ +#define AOM_AV1_ENCODER_MCOMP_H_ + +#include "av1/common/mv.h" +#include "av1/encoder/block.h" +#include "av1/encoder/rd.h" + +#include "aom_dsp/variance.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct SPEED_FEATURES; + +// ============================================================================= +// Cost functions +// ============================================================================= + +enum { + MV_COST_ENTROPY, // Use the entropy rate of the mv as the cost + MV_COST_L1_LOWRES, // Use the l1 norm of the mv as the cost (<480p) + MV_COST_L1_MIDRES, // Use the l1 norm of the mv as the cost (>=480p) + MV_COST_L1_HDRES, // Use the l1 norm of the mv as the cost (>=720p) + MV_COST_NONE // Use 0 as as cost irrespective of the current mv +} UENUM1BYTE(MV_COST_TYPE); + +typedef struct { + // The reference mv used to compute the mv cost + const MV *ref_mv; + FULLPEL_MV full_ref_mv; + MV_COST_TYPE mv_cost_type; + const int *mvjcost; + const int *mvcost[2]; + int error_per_bit; + // A multiplier used to convert rate to sad cost + int sad_per_bit; +} MV_COST_PARAMS; + +int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, + int *const mvcost[2], int weight); + +int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, const struct buf_2d *pre); +int av1_get_mvpred_compound_var(const MV_COST_PARAMS *ms_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre); + +// ============================================================================= +// Motion Search +// ============================================================================= +typedef struct { + // The reference buffer + const struct buf_2d *ref; + + // The source and predictors/mask used by translational search + const struct buf_2d *src; + const uint8_t *second_pred; + const uint8_t *mask; + int mask_stride; + int inv_mask; + + // The weighted source and mask used by OBMC + const int32_t *wsrc; + const int32_t *obmc_mask; +} MSBuffers; + +static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers, + const uint8_t *second_pred, + const uint8_t *mask, + int mask_stride, int invert_mask) { + ms_buffers->second_pred = second_pred; + ms_buffers->mask = mask; + ms_buffers->mask_stride = mask_stride; + ms_buffers->inv_mask = invert_mask; +} + +// ============================================================================= +// Fullpixel Motion Search +// ============================================================================= +// This struct holds fullpixel motion search parameters that should be constant +// during the search +typedef struct { + BLOCK_SIZE bsize; + // A function pointer to the simd function for fast computation + const aom_variance_fn_ptr_t *vfp; + + MSBuffers ms_buffers; + + // WARNING: search_method should be regarded as a private variable and should + // not be modified directly so it is in sync with search_sites. 
To modify it,
+  // use av1_set_mv_search_method.
+  SEARCH_METHODS search_method;
+  const search_site_config *search_sites;
+  FullMvLimits mv_limits;
+
+  int run_mesh_search;    // Sets mesh search unless it got pruned by
+                          // prune_mesh_search.
+  int prune_mesh_search;  // Disables mesh search if the best_mv after a normal
+                          // search is close to the start_mv.
+  int mesh_search_mv_diff_threshold;  // mv diff threshold to enable
+                                      // prune_mesh_search
+  int force_mesh_thresh;  // Forces mesh search if the residue variance is
+                          // higher than the threshold.
+  const struct MESH_PATTERN *mesh_patterns[2];
+
+  // Use a maximum search interval of 4 if true. This helps motion search to
+  // find the best motion vector for screen content types.
+  int fine_search_interval;
+
+  int is_intra_mode;
+
+  int fast_obmc_search;
+
+  // For calculating mv cost
+  MV_COST_PARAMS mv_cost_params;
+
+  // Stores the function used to compute the sad. This can be different from
+  // the sdf in vfp (e.g. a downsampled sad instead of the regular sad) to
+  // allow a speed-up.
+  aom_sad_fn_t sdf;
+  aom_sad_multi_d_fn_t sdx4df;
+  aom_sad_multi_d_fn_t sdx3df;
+} FULLPEL_MOTION_SEARCH_PARAMS;
+
+typedef struct {
+  int err_cost;
+  unsigned int distortion;
+  unsigned int sse;
+} FULLPEL_MV_STATS;
+
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
+
+void av1_make_default_fullpel_ms_params(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+    MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
+    const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+    SEARCH_METHODS search_method, int fine_search_interval);
+
+/*! Sets the \ref FULLPEL_MOTION_SEARCH_PARAMS to intra mode. */
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                              const IntraBCMVCosts *dv_costs);
+
+// Sets up configs for fullpixel DIAMOND / CLAMPED_DIAMOND search method.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+                                    int level);
+// Sets up configs for firstpass motion search.
+void av1_init_motion_fpf(search_site_config *cfg, int stride);
+// Sets up configs for NSTEP / NSTEP_8PT motion search method.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+                                        int level);
+// Sets up configs for BIGDIA / FAST_DIAMOND / FAST_BIGDIA
+// motion search method.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+                                         int level);
+// Sets up configs for HEX or FAST_HEX motion search method.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+                                      int level);
+// Sets up configs for SQUARE motion search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+                                         int level);
+
+/*! Function pointer to the search site config initialization of a given
+ * search method. */
+typedef void (*av1_init_search_site_config)(search_site_config *cfg, int stride,
+                                            int level);
+
+/*! Array of function pointers used to set the motion search config. */
+extern const av1_init_search_site_config
+    av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS];
+
+// Array mapping each search method to a base method that uses the same
+// candidates but differs in the number of search steps.
+static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = { + DIAMOND, // DIAMOND + NSTEP, // NSTEP + NSTEP_8PT, // NSTEP_8PT + CLAMPED_DIAMOND, // CLAMPED_DIAMOND + HEX, // HEX + BIGDIA, // BIGDIA + SQUARE, // SQUARE + HEX, // FAST_HEX + BIGDIA, // FAST_DIAMOND + BIGDIA, // FAST_BIGDIA + BIGDIA // VFAST_DIAMOND +}; + +// Reinitialize the search site config. +static AOM_INLINE void av1_refresh_search_site_config( + search_site_config *ss_cfg_buf, SEARCH_METHODS search_method, + const int ref_stride) { + const int level = + search_method == NSTEP_8PT || search_method == CLAMPED_DIAMOND; + search_method = search_method_lookup[search_method]; + av1_init_motion_compensation[search_method](&ss_cfg_buf[search_method], + ref_stride, level); +} + +// Mv beyond the range do not produce new/different prediction block. +static INLINE void av1_set_mv_search_method( + FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS], + SEARCH_METHODS search_method) { + ms_params->search_method = search_method; + ms_params->search_sites = + &search_sites[search_method_lookup[ms_params->search_method]]; +} + +// Set up limit values for MV components. +// Mv beyond the range do not produce new/different prediction block. +static INLINE void av1_set_mv_row_limits( + const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits, + int mi_row, int mi_height, int border) { + const int min1 = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND); + const int min2 = -(((mi_row + mi_height) * MI_SIZE) + 2 * AOM_INTERP_EXTEND); + mv_limits->row_min = AOMMAX(min1, min2); + const int max1 = (mi_params->mi_rows - mi_row - mi_height) * MI_SIZE + + border - 2 * AOM_INTERP_EXTEND; + const int max2 = + (mi_params->mi_rows - mi_row) * MI_SIZE + 2 * AOM_INTERP_EXTEND; + mv_limits->row_max = AOMMIN(max1, max2); +} + +static INLINE void av1_set_mv_col_limits( + const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits, + int mi_col, int mi_width, int border) { + const int min1 = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND); + const int min2 = -(((mi_col + mi_width) * MI_SIZE) + 2 * AOM_INTERP_EXTEND); + mv_limits->col_min = AOMMAX(min1, min2); + const int max1 = (mi_params->mi_cols - mi_col - mi_width) * MI_SIZE + border - + 2 * AOM_INTERP_EXTEND; + const int max2 = + (mi_params->mi_cols - mi_col) * MI_SIZE + 2 * AOM_INTERP_EXTEND; + mv_limits->col_max = AOMMIN(max1, max2); +} + +static INLINE void av1_set_mv_limits( + const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits, + int mi_row, int mi_col, int mi_height, int mi_width, int border) { + av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border); + av1_set_mv_col_limits(mi_params, mv_limits, mi_col, mi_width, border); +} + +void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv); + +int av1_init_search_range(int size); + +unsigned int av1_int_pro_motion_estimation( + const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, const MV *ref_mv, unsigned int *y_sad_zero, + int me_search_size_col, int me_search_size_row); + +int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const FULLPEL_MV start_mv, FULLPEL_MV *best_mv); + +int av1_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, int *cost_list, + FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats, + FULLPEL_MV *second_best_mv); + +int av1_intrabc_hash_search(const struct 
AV1_COMP *cpi, const MACROBLOCKD *xd, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + IntraBCHashInfo *intrabc_hash_info, + FULLPEL_MV *best_mv); + +int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, FULLPEL_MV *best_mv); + +static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits, + FULLPEL_MV mv) { + return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && + (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); +} +// ============================================================================= +// Subpixel Motion Search +// ============================================================================= +enum { + EIGHTH_PEL, + QUARTER_PEL, + HALF_PEL, + FULL_PEL +} UENUM1BYTE(SUBPEL_FORCE_STOP); + +typedef struct { + const aom_variance_fn_ptr_t *vfp; + SUBPEL_SEARCH_TYPE subpel_search_type; + // Source and reference buffers + MSBuffers ms_buffers; + int w, h; +} SUBPEL_SEARCH_VAR_PARAMS; + +// This struct holds subpixel motion search parameters that should be constant +// during the search +typedef struct { + // High level motion search settings + int allow_hp; + const int *cost_list; + SUBPEL_FORCE_STOP forced_stop; + int iters_per_step; + SubpelMvLimits mv_limits; + + // For calculating mv cost + MV_COST_PARAMS mv_cost_params; + + // Distortion calculation params + SUBPEL_SEARCH_VAR_PARAMS var_params; +} SUBPEL_MOTION_SEARCH_PARAMS; + +void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *ref_mv, const int *cost_list); + +typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, + const FULLPEL_MV_STATS *start_mv_stats, + MV *bestmv, int *distortion, + unsigned int *sse1, + int_mv *last_mv_search_list); + +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more; +extern fractional_mv_step_fp av1_return_max_sub_pixel_mv; +extern fractional_mv_step_fp av1_return_min_sub_pixel_mv; +extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up; + +unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + BLOCK_SIZE bsize, const int *pts0, + const int *pts_inref0, int total_samples, + WARP_SEARCH_METHOD search_method, + int num_iterations); + +static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) { + for (int z = 0; z < 3; z++) { + fractional_best_mv[z].as_int = INVALID_MV; + } +} + +static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits, + const FullMvLimits *mv_limits, + const MV *ref_mv) { + const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL); + int minc = AOMMAX(GET_MV_SUBPEL(mv_limits->col_min), ref_mv->col - max_mv); + int maxc = AOMMIN(GET_MV_SUBPEL(mv_limits->col_max), ref_mv->col + max_mv); + int minr = AOMMAX(GET_MV_SUBPEL(mv_limits->row_min), ref_mv->row - max_mv); + int maxr = AOMMIN(GET_MV_SUBPEL(mv_limits->row_max), ref_mv->row + max_mv); + + maxc = AOMMAX(minc, maxc); + maxr = AOMMAX(minr, maxr); + + subpel_limits->col_min = AOMMAX(MV_LOW + 1, minc); + subpel_limits->col_max = AOMMIN(MV_UPP - 1, maxc); + subpel_limits->row_min = AOMMAX(MV_LOW + 1, minr); + subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr); +} + 
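+// Worked example (an illustrative sketch, assuming GET_MV_SUBPEL(x) == (x) * 8
+// and MAX_FULL_PEL_VAL == 1023): for a zero ref_mv and fullpel limits of
+// +/-64 pels, max_mv = 8184, so the window computed above is clamped to
+// [-512, 512] in each dimension, i.e. +/-64 pels expressed in 1/8-pel units,
+// comfortably inside (MV_LOW, MV_UPP).
+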
+static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits, + MV mv) { + return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && + (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); +} + +static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) { + return mv->row * stride + mv->col; +} + +static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf, + const FULLPEL_MV *mv) { + return &buf->buf[get_offset_from_fullmv(mv, buf->stride)]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MCOMP_H_ diff --git a/third_party/aom/av1/encoder/mcomp_structs.h b/third_party/aom/av1/encoder/mcomp_structs.h new file mode 100644 index 0000000000..06660cf4a6 --- /dev/null +++ b/third_party/aom/av1/encoder/mcomp_structs.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MCOMP_STRUCTS_H_ +#define AOM_AV1_ENCODER_MCOMP_STRUCTS_H_ + +#include "av1/common/mv.h" + +// The maximum number of steps in a step search given the largest +// allowed initial step +#define MAX_MVSEARCH_STEPS 11 +// Max full pel mv specified in the unit of full pixel +// Enable the use of motion vector in range [-1023, 1023]. +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1) +// Maximum size of the first step in full pel units +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) +// Maximum number of neighbors to scan per iteration during +// WARPED_CAUSAL refinement +// Note: The elements of warp_search_config.neighbor_mask must be at least +// MAX_WARP_SEARCH_NEIGHBORS many bits wide. So the type may need to be +// widened if this value is increased. +#define MAX_WARP_SEARCH_NEIGHBORS 8 + +#define SEARCH_RANGE_8P 3 +#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1) +#define SEARCH_GRID_CENTER_8P \ + (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P) + +typedef struct { + FULLPEL_MV coord; + int coord_offset; +} search_neighbors; +// motion search site +typedef struct search_site { + FULLPEL_MV mv; + int offset; +} search_site; + +typedef struct search_site_config { + search_site site[MAX_MVSEARCH_STEPS * 2][16 + 1]; + // Number of search steps. + int num_search_steps; + int searches_per_step[MAX_MVSEARCH_STEPS * 2]; + int radius[MAX_MVSEARCH_STEPS * 2]; + int stride; +} search_site_config; + +enum { + // Search 8-points in the radius grid around center, up to 11 search stages. + DIAMOND = 0, + // Search 12-points in the radius/tan_radius grid around center, + // up to 15 search stages. + NSTEP = 1, + // Search 8-points in the radius grid around center, up to 16 search stages. + NSTEP_8PT = 2, + // Search 8-points in the radius grid around center, upto 11 search stages + // with clamping of search radius. + CLAMPED_DIAMOND = 3, + // Search maximum 8-points in the radius grid around center, + // up to 11 search stages. First stage consists of 8 search points + // and the rest with 6 search points each in hex shape. 
+ HEX = 4, + // Search maximum 8-points in the radius grid around center, + // up to 11 search stages. First stage consists of 4 search + // points and the rest with 8 search points each. + BIGDIA = 5, + // Search 8-points in the square grid around center, up to 11 search stages. + SQUARE = 6, + // HEX search with up to 2 stages. + FAST_HEX = 7, + // BIGDIA search with up to 2 stages. + FAST_DIAMOND = 8, + // BIGDIA search with up to 3 stages. + FAST_BIGDIA = 9, + // BIGDIA search with up to 1 stage. + VFAST_DIAMOND = 10, + // Total number of search methods. + NUM_SEARCH_METHODS, + // Number of distinct search methods. + NUM_DISTINCT_SEARCH_METHODS = SQUARE + 1, +} UENUM1BYTE(SEARCH_METHODS); + +typedef struct warp_search_config { + int num_neighbors; + MV neighbors[MAX_WARP_SEARCH_NEIGHBORS]; + // Bitmask which is used to prune the search neighbors at one iteration + // based on which direction we chose in the previous iteration. + // See comments in av1_refine_warped_mv for details. + uint8_t neighbor_mask[MAX_WARP_SEARCH_NEIGHBORS]; +} warp_search_config; + +// Methods for refining WARPED_CAUSAL motion vectors +enum { + // Search 4 adjacent points in a diamond shape at each iteration + WARP_SEARCH_DIAMOND, + // Search 8 adjacent points in a square at each iteration + WARP_SEARCH_SQUARE, + WARP_SEARCH_METHODS +} UENUM1BYTE(WARP_SEARCH_METHOD); + +#endif // AOM_AV1_ENCODER_MCOMP_STRUCTS_H_ diff --git a/third_party/aom/av1/encoder/misc_model_weights.h b/third_party/aom/av1/encoder/misc_model_weights.h new file mode 100644 index 0000000000..f00aeabcf6 --- /dev/null +++ b/third_party/aom/av1/encoder/misc_model_weights.h @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define MV_PREC_FEATURE_SIZE 18 + +#define NUM_DNN_LAYERS 1 +#define NUM_DNN_FEATURES MV_PREC_FEATURE_SIZE +#define MV_PREC_LAYER_SIZE_0 32 +#define NUM_LOGITS 1 + +const float av1_mv_prec_mean[MV_PREC_FEATURE_SIZE] = { 143.67358891063745f, + 141.6251917346238f, + 0.36313633945679064f, + 0.0028162791958822085f, + 0.000484820537626698f, + 0.002769969388939025f, + 0.0f, + 0.00031274626720947577f, + 0.00020578555375160075f, + 0.0007075246732697733f, + 0.000539641029909925f, + 0.0013939401375906984f, + 4.985394760423499f, + 4.985394760423499f, + 4.9992148717283085f, + 5.143739822380163f, + 5.518483124004564f, + 87.63597847427077f }; + +const float av1_mv_prec_std[MV_PREC_FEATURE_SIZE] = { 66.86256140247244f, + 68.04472572607503f, + 13.23247674430399f, + 0.0029123438396921955f, + 0.0015331406169374737f, + 0.0029149813096313775f, + 1.0f, + 0.00047501102871357813f, + 0.00030025962993117947f, + 0.0009861163580391207f, + 0.0012157593528004055f, + 0.002004954948490521f, + 6.539447500484038f, + 6.539447500484038f, + 6.396589058279465f, + 3.4870155874262516f, + 3.8911353973740535f, + 112.07985259573601f }; + +const float av1_mv_prec_nn_weights_layer_0[] = { -0.13008492159557145f, + -0.1483527373474774f, + 0.08112076098858864f, + -0.9582568679627453f, + -0.34794757171071206f, + 0.6465225723304947f, + 0.0f, + 0.06754171885839604f, + 0.27156803620541214f, + 0.10635231245664407f, + -0.031183926995968583f, + 0.048122572260291f, + -0.19498534230045128f, + -0.2614116319273316f, + -0.3223762845136331f, + -1.2063368350609205f, + -0.523333556911706f, + 1.075632260890728f, + 0.48989726814387946f, + -0.34816466111070477f, + 0.41668357610256473f, + -1.0973562848791671f, + 0.04183921854389494f, + -0.9123815389260476f, + 0.0f, + 0.859965047744027f, + 0.1962095804679813f, + 0.2606564339077058f, + 0.26695868715184895f, + 0.5319308568326692f, + -0.23717505799723165f, + -0.43127224481782567f, + -0.3214545776203726f, + 0.5850852241402176f, + -0.26705531612587813f, + -0.5786016766610093f, + 0.9360519909983003f, + 0.20771329289016555f, + -0.027614159544811823f, + -1.175022807046164f, + -0.07578967497693835f, + 0.6890172485324256f, + 0.0f, + -0.008008338164988263f, + -0.08064800010158935f, + -0.22606910981666667f, + 0.4541586669210879f, + 0.07731527661370792f, + -0.6744475941247964f, + -0.2625842448396184f, + 1.7018613444303785f, + -0.08622229073162656f, + 0.041858142814941275f, + -0.24575964090386415f, + -0.046626044730994964f, + 0.7608713064175202f, + -0.23330119070907146f, + -0.10115510984500826f, + 0.9722537349192069f, + 0.11718554254290829f, + 0.0f, + 0.2075123446014759f, + 0.09465167310768637f, + 0.7609896851963016f, + 0.4441038581385328f, + 0.26064144727430955f, + -0.14678625366485035f, + -0.03597014452200524f, + 0.3128680867196166f, + 1.102496797385966f, + 0.06642253233084111f, + -1.2665494483407629f, + 0.09049412632000911f, + -1.1160621999565095f, + 0.043420275255913035f, + -0.8811412259978966f, + 0.21076234632287777f, + 0.16571534463543866f, + 0.0f, + -0.7324075176473275f, + -0.3677622514459495f, + 0.3273532243056415f, + 0.22922161936797775f, + 0.8204766691058087f, + 0.02982161033720488f, + 0.5266419954188112f, + -1.0032154963302191f, + 0.7007602969763729f, + 0.37196355167990885f, + -0.7608579453228548f, + 0.08568111584781847f, + 0.07011061059123677f, + 0.3233263598082507f, + -0.08249928295410253f, + 0.08220165761319252f, + 
0.22148722752246794f, + 0.0f, + 0.6122392701743506f, + -0.26429838296378333f, + 0.31958081620005463f, + -0.006027177397853826f, + -0.3088310785887994f, + -0.5436192046707807f, + -0.011080356757423306f, + 0.12632650770008413f, + -0.45097913215234525f, + 1.8008072867127298f, + -0.7630029654575501f, + -0.4054774329826579f, + 0.40386074452544535f, + -0.18541426257453025f, + 0.2444879765079863f, + -0.6216724756115081f, + 0.27030299321302f, + 0.0f, + -0.6835848952967989f, + -0.7914184320964815f, + -0.6761595019582928f, + -1.009565565604081f, + -0.1904242439353305f, + 0.4463417126318631f, + 0.6025503823452971f, + 0.5149990860115566f, + 1.0242970663937634f, + 0.037947306826401385f, + 0.07039339786212848f, + 0.14273796789711987f, + 0.168103961425691f, + 1.6596066376811978f, + 0.19321092229384657f, + -0.3710750388148514f, + -0.01717015559410288f, + 0.0f, + 0.3005688477942597f, + 0.23877080653829577f, + 0.2718594552971173f, + 0.3885402571589898f, + 0.32999531945669247f, + -0.6134460954213243f, + -0.13972265462799183f, + -0.07180089575716991f, + -1.014572598188105f, + 0.0717207322809836f, + 0.34896157745155615f, + -0.27127687591403f, + -0.5058651212773623f, + -1.5442435628306925f, + -0.6399784724734707f, + 0.6274301429074947f, + -0.4645750072767051f, + 0.0f, + -0.2406726815244178f, + -0.06321214115916597f, + 0.312856714253404f, + 0.16459514124116134f, + 0.3993579604809623f, + -0.15232044351561913f, + -0.5613743948568469f, + 0.7219801372223262f, + 0.2936857469624009f, + 0.7823466656034087f, + -0.12416947814098349f, + -0.36413756654028345f, + -0.07992098796866462f, + -0.7395722879842416f, + 0.8639913543220514f, + -0.311931773757945f, + -1.7308240470400613f, + 0.0f, + 0.394499716712104f, + 0.6511462819539963f, + -0.0722425275974144f, + 0.13490818194661386f, + 0.055319135836378035f, + 0.15389577508097013f, + 0.28958598328870605f, + -0.14608429470539772f, + 0.09488817462478298f, + -0.17231294096622088f, + 0.6721115415911466f, + -0.05664621150536103f, + 0.03291799673669331f, + 0.02845382711057482f, + -0.9953563446999164f, + -0.17994298220605923f, + 0.6560824519337476f, + 0.0f, + -0.30990646375917935f, + 0.17215517202874f, + 0.2026816225170481f, + 0.22011958747715601f, + 0.3562520768889686f, + -0.18436559057189175f, + 0.1733377147302066f, + 0.02818276995640877f, + -0.29703005574859076f, + -0.3310652639215064f, + -1.6091173258529277f, + 0.45461585790028003f, + -0.5078643334592593f, + -0.338997374732338f, + 0.4688619590359733f, + 0.627099126828289f, + -0.5249801376494249f, + 0.0f, + 0.34465498218272883f, + 0.009891680630908135f, + -0.27244020967349f, + 0.05404589867626979f, + -0.06220329325739666f, + -0.13365376464759104f, + -0.13098573553512366f, + 0.11434198976289106f, + 0.6740951247574676f, + 1.3381727185724581f, + -1.4865773213251936f, + 0.05809898701966341f, + 0.25380780261023456f, + 1.2716367496512722f, + 0.1768290070780598f, + -0.07554828135356352f, + 0.8180570085344856f, + 0.0f, + 1.0788448980077463f, + 0.0651938742459459f, + 0.3807672030015587f, + 0.6144792680268445f, + 0.011660612214908059f, + -0.018306023765580288f, + 0.44140813809926516f, + -0.13411994195502386f, + 0.15920368955127778f, + -0.19382358417849888f, + -0.08802147969690055f, + -0.019731052733814477f, + 0.1104744229169665f, + -0.195834419735958f, + -0.5005295046454347f, + -0.17041241868229032f, + -0.471942117351489f, + 0.0f, + -0.3599073304761372f, + -0.2745532782968519f, + -0.8323064841106417f, + -0.88355885384943f, + -0.02826466859020679f, + 0.06977870308805256f, + 0.11926112095374196f, + 1.367382707959643f, + -0.06119843162964051f, + 
-0.5331395268889569f, + -1.2155531584240624f, + -0.01896651779524327f, + 0.10591845408571081f, + -0.010632842156504733f, + 0.6150787968629282f, + -0.4191690185896091f, + -0.9961718918346271f, + 0.0f, + 0.23370364516013867f, + 0.4156033072362998f, + 0.1261005546633433f, + 0.0812413884532226f, + -0.008894337353937203f, + 0.07984447025056046f, + -0.1258098052766725f, + -0.40245475467767916f, + 1.78188906675019f, + -1.1544387954232302f, + -0.41768781481273387f, + 0.6791211165341995f, + -0.4175127856183446f, + -0.07353219159767788f, + -0.2888813577574072f, + -0.7107767892597061f, + -1.0450031091195449f, + 0.0f, + -0.9221599545079143f, + -0.6747876356740621f, + 0.30241454354872105f, + 0.4924965303373908f, + -0.14042722740054084f, + 0.27744210409350445f, + -0.14788270997426836f, + -0.9081467469237995f, + -0.04513115674995093f, + -0.5254168669125793f, + -0.6999012037974789f, + 0.434661246306547f, + -0.7193303957246092f, + -0.9117952623409744f, + -1.5097267865916142f, + -0.20779888103770922f, + 0.4935562480901218f, + 0.0f, + 0.18303393908923593f, + 0.34753722677570037f, + 0.29291001533177663f, + 0.3832351878354224f, + 0.3295194956120599f, + -0.32398033003617527f, + -0.31570906736433746f, + 0.23657779050372962f, + 0.9510794465234161f, + -0.5122243902568278f, + 0.08652112725315658f, + 0.2246634353717998f, + -0.9032595595582497f, + -0.8936484034533545f, + 0.6012969720865752f, + -0.6454216646117924f, + -1.1753786049658332f, + 0.0f, + -0.4360545677728656f, + -0.6586237455328507f, + -0.34347301697886656f, + -0.8909724651992144f, + -0.24378721818350263f, + 0.6179733359297576f, + 0.0661661181742234f, + -0.14120142044993794f, + -0.07732699885498932f, + 1.0221355882357506f, + 0.44514798994115284f, + -0.7371569579959046f, + -0.7212499572378936f, + 0.7453626921081045f, + 0.5478757761345768f, + -0.39411232789985384f, + 0.7200542656743857f, + 0.0f, + -0.11790869453118827f, + -0.12317030713581928f, + -0.4207902738133338f, + 0.15895105878327986f, + 0.304261777102111f, + 0.11450744587017621f, + -0.11470709991317944f, + 0.5949222371739038f, + 0.6549518619412444f, + -0.24390606570422838f, + -0.4212796009440803f, + -0.6269666206320964f, + -0.5421193969807078f, + -0.12297772128652287f, + 0.021517257619930424f, + 0.25462855095544523f, + -0.22107798187348246f, + 0.0f, + 0.5204516300095662f, + 0.2837402841862462f, + 0.11310823283285916f, + 0.8944351685018025f, + 0.17487203235834015f, + -0.5271221928634433f, + -0.19516594503423199f, + 0.452456617580365f, + 1.2456272242706414f, + 0.24166615894862817f, + 0.09411429305204502f, + -0.2730072283327243f, + -0.8129383770918172f, + -0.24093254193486136f, + 0.5696499174142177f, + -0.11110805836073044f, + -0.3968204166235694f, + 0.0f, + -0.04388165369378549f, + -0.005631266017272595f, + -0.02574211858479705f, + 0.06230399626660669f, + 0.17677671232932785f, + 0.5172871274400965f, + 0.4919150085620063f, + -1.597656637582941f, + 0.02415185715719143f, + -0.17945446376668306f, + -0.39340600199798886f, + 0.25013205256886845f, + 0.05972330340308685f, + 0.1359911505596489f, + -0.02341033271820833f, + 0.15726074644063684f, + 0.47512625913020357f, + 0.0f, + 0.7327341664835779f, + -0.3689092312320013f, + 0.4571824787436036f, + 0.6215465537945456f, + 0.0944111296842023f, + -0.12571956176607574f, + -0.2507235674395462f, + -0.09579602654351593f, + 1.4463357293728496f, + 0.749153535856049f, + -0.5553955120807588f, + -0.09622771929369946f, + -0.2598697420394813f, + -0.964691815299676f, + -0.8289963178173902f, + 0.7112949291983329f, + -0.8667009730492162f, + 0.0f, + -0.48698304169042794f, + 
-0.18786095669893707f, + -0.11425249263203247f, + -0.3693391011684809f, + 0.09933145842585253f, + 0.2568559685298844f, + 0.7048512233651738f, + 0.6056238412407038f, + -0.4355558119826642f, + 0.17318931883915484f, + 0.6481333496429564f, + -0.45728823054344486f, + -0.006325004538589701f, + 0.45609864075494927f, + -0.6199385981116988f, + 0.035105808783046165f, + 0.1203147963894839f, + 0.0f, + 0.383402190836527f, + 0.048429009055370106f, + 0.5887186439275204f, + -0.20538767641607814f, + -0.031237879611002117f, + 0.3140759860883231f, + 0.24447070584999556f, + 0.7271263905705878f, + 0.8432799162434237f, + -0.11530577554199217f, + -0.7781023892314718f, + 0.05359488822710336f, + 0.5624870388700809f, + 0.5134656523208906f, + 0.18304041423438375f, + -0.04237421156328257f, + -0.20759809886942207f, + 0.0f, + -0.06249337454975615f, + 0.10081284533873777f, + 0.3894374350259183f, + 1.518217777528342f, + -0.9100037950171563f, + 0.17796906121831477f, + -0.2892167255357892f, + 0.6117902467884032f, + 0.13332120964959573f, + -0.3487155932849374f, + -0.32920583745734694f, + 0.08242631209809854f, + -0.24920225708110588f, + 0.8401757259392635f, + 0.11729108681358365f, + 0.11222925752499184f, + -0.027078490721459958f, + 0.0f, + 0.726132375517389f, + 0.72220359881096f, + 0.5721582611845177f, + 0.15139162075524315f, + 0.6676549461551197f, + -0.321449586554697f, + -0.10141104515219895f, + -0.09711123988777906f, + 0.9623356184776928f, + -0.7941822373167173f, + -0.9373923554119346f, + 0.4573241832354059f, + -0.42029139056126147f, + 0.2675223459380999f, + -0.5487300191551386f, + 0.2236621891916084f, + 0.11692039230044018f, + 0.0f, + 0.1758399202780961f, + 0.676447587678781f, + 0.5945412815881029f, + 0.5669863357359594f, + 0.8433565415303922f, + -0.30300550790708036f, + -0.43332881999693673f, + -0.4996522695731392f, + -0.2084930815451962f, + 0.27765278702463786f, + 1.0886848763946915f, + -0.0739433655813831f, + -0.4762801579229192f, + -0.2490825339320731f, + -1.8820479350439439f, + -0.4251592225775914f, + -0.3992922365484464f, + 0.0f, + 0.19598917760218867f, + 0.4860238022746914f, + 0.3364528828641281f, + 0.3350950865226741f, + 0.2773654548632006f, + -0.30547262140782566f, + 0.028649620490728344f, + -0.11763407628280315f, + 0.6237318502627169f, + -0.3958952632477945f, + 0.14797171297835243f, + 0.45821729624747465f, + -0.8687137170773626f, + 0.06989667196937126f, + -0.5752606929478727f, + 0.16986945686358412f, + 0.6925071596817824f, + 0.0f, + 0.4991250796183003f, + 0.03424654896322111f, + 0.6153698611882319f, + 0.5070872444849457f, + 0.43615747516328135f, + -0.7870352838659244f, + -0.6424101231965247f, + -0.7005774876651399f, + 0.79983115431488f, + 0.15720357955596242f, + -1.408372612176309f, + -0.039294695217213765f, + 0.6979415372962309f, + 0.27403316751965656f, + 1.2844596102619275f, + -0.2781534150257364f, + 0.3248437714908865f, + 0.0f, + 0.4364362371752831f, + -0.2548580911485434f, + -0.19578001373349452f, + -0.04597194387828005f, + -0.010035156855533233f, + 0.0415941475251266f, + 0.07929549739797387f, + -0.060629652912508866f, + 0.5977303008711333f, + -1.4404008068066554f, + 0.8555694790197376f, + -0.03693438534401856f, + 0.17761411164512408f, + -0.11858304304109235f, + -1.4241324353471327f, + 0.1533849765389186f, + 0.7650643783126995f, + 0.0f, + -0.0639949379280401f, + 0.4288617817939563f, + 0.4235508646885404f, + 0.3419843254383798f, + -0.015992360660098768f, + -0.773247697505441f, + -0.4908452922015917f, + 0.9868134897291486f, + -0.5078689994742608f, + 1.05632043744864f, + -0.38867419409275117f, + 
-0.0065547696858664194f, + -0.3056003173415037f, + -0.333762331930102f, + 0.4459671174011671f, + 0.08219092584580244f, + -0.08099158579518179f, + 0.0f, + -0.1568180656346373f, + -0.061962372393910135f, + 0.14065868174859464f, + -0.055925712798972765f, + 0.05136117465820622f, + 0.0907831030477633f, + 0.19518110495319604f, + -0.7470794578145956f, + 1.5945999734733545f, + -0.4351697502345834f, + -0.33253649399571805f }; + +const float av1_mv_prec_nn_bias_layer_0[] = { + -0.651213833993862f, -1.1243309933417809f, -0.2123880023097051f, + 0.23095477452877616f, -0.6668057665893545f, 0.3082268148379634f, + -0.3344916753975844f, -0.20920185606857844f, 0.6057933917964854f, + 0.5031857662559803f, -1.5380096313468152f, -0.4457245344804041f, + 1.82368055812373f, 0.7973912064077963f, 0.25706500555622913f, + 0.1394695119825382f, 0.4508811973450553f, -0.5408959545111782f, + 1.064829233697863f, 0.3733268644246235f, 1.1173169029905483f, + -0.2012817466400134f, -0.16628447748302294f, 1.3086000088940826f, + 0.7267092979664235f, -0.9097857006590555f, -0.7564259343863077f, + -0.49844128036716173f, -0.4675729246975423f, -0.03626154526362181f, + -0.41957330902404616f, -0.9658160514319954f +}; + +const float av1_mv_prec_nn_weights_layer_1[] = { + 1.5017296484510276f, 1.044216918060133f, -1.066541411740906f, + -0.7762965171172661f, -0.9814396609661653f, 0.9334065847340715f, + 0.7117244268817873f, -0.7695942296628597f, 0.7892157680137047f, + -0.5786309358654476f, -2.4444494892027264f, 1.1666759262637185f, + -0.9699580532370483f, 0.5849682956422552f, -1.0372272986941953f, + -0.5005014627824439f, 1.1816204711740521f, -1.2204867615892114f, + 0.4510263977504913f, 0.35567865078585165f, -0.7811389330738839f, + -0.6643977800301099f, -0.6283287371705794f, 0.790873821018048f, + 0.8861643352684585f, 0.6438840651522237f, 0.6677191546466089f, + 0.9703715021995785f, 1.250893534236489f, 0.7733742028067933f, + -1.249673977776904f, -1.2890127265725608f +}; + +const float av1_mv_prec_nn_bias_layer_1[] = { -0.341771735378258f }; + +static const NN_CONFIG av1_mv_prec_dnn_config = { + NUM_DNN_FEATURES, + NUM_LOGITS, + NUM_DNN_LAYERS, + { MV_PREC_LAYER_SIZE_0 }, + { + av1_mv_prec_nn_weights_layer_0, + av1_mv_prec_nn_weights_layer_1, + }, + { + av1_mv_prec_nn_bias_layer_0, + av1_mv_prec_nn_bias_layer_1, + }, +}; +#undef NUM_DNN_LAYERS +#undef NUM_DNN_FEATURES +#undef NUM_LAYER_0_UNITS +#undef NUM_LOGITS + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c new file mode 100644 index 0000000000..94cd56c5d1 --- /dev/null +++ b/third_party/aom/av1/encoder/ml.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <math.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/mathutils.h" +#include "av1/encoder/ml.h" + +void av1_nn_output_prec_reduce(float *const output, int num_output) { + const int prec_bits = 9; + const int prec = 1 << prec_bits; + const float inv_prec = (float)(1.0 / prec); + for (int i = 0; i < num_output; i++) { + output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec; + } +} + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_c(const float *input_nodes, + const NN_CONFIG *const nn_config, int reduce_prec, + float *const output) { + int num_input_nodes = nn_config->num_inputs; + int buf_index = 0; + float buf[2][NN_MAX_NODES_PER_LAYER]; + + // Propagate hidden layers. + const int num_layers = nn_config->num_hidden_layers; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (int layer = 0; layer < num_layers; ++layer) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + float *output_nodes = buf[buf_index]; + const int num_output_nodes = nn_config->num_hidden_nodes[layer]; + assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); + for (int node = 0; node < num_output_nodes; ++node) { + float val = layer_bias[node]; + for (int i = 0; i < num_input_nodes; ++i) + val += layer_weights[node * num_input_nodes + i] * input_nodes[i]; + // ReLU as activation function. + val = val > 0.0f ? val : 0.0f; // Could use AOMMAX(). + output_nodes[node] = val; + } + num_input_nodes = num_output_nodes; + input_nodes = output_nodes; + buf_index = 1 - buf_index; + } + + // Final output layer. + const float *layer_weights = nn_config->weights[num_layers]; + const float *layer_bias = nn_config->bias[num_layers]; + for (int node = 0; node < nn_config->num_outputs; ++node) { + float val = layer_bias[node]; + for (int i = 0; i < num_input_nodes; ++i) + val += layer_weights[node * num_input_nodes + i] * input_nodes[i]; + output[node] = val; + } + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); +} + +#if CONFIG_NN_V2 +// Applies the ReLu activation to one fc layer +// output[i] = Max(input[i],0.0f) +static float *nn_relu(const float *input, FC_LAYER *layer) { + for (int i = 0; i < layer->num_outputs; ++i) { + layer->output[i] = AOMMAX(input[i], 0.0f); + } + + return layer->output; +} + +// Applies the Sigmoid activation to one fc layer +// output[i] = 1/(1+exp(input[i])) +static float *nn_sigmoid(const float *input, FC_LAYER *layer) { + for (int i = 0; i < layer->num_outputs; ++i) { + const float tmp = AOMMIN(AOMMAX(input[i], -10.0f), 10.0f); + layer->output[i] = 1.0f / (1.0f + expf(-tmp)); + } + + return layer->output; +} + +// Forward prediction in one fc layer, used in function av1_nn_predict_V2 +static float *nn_fc_forward(const float *input, FC_LAYER *layer) { + const float *weights = layer->weights; + const float *bias = layer->bias; + assert(layer->num_outputs < NN_MAX_NODES_PER_LAYER); + // fc + for (int node = 0; node < layer->num_outputs; ++node) { + float val = bias[node]; + for (int i = 0; i < layer->num_inputs; ++i) val += weights[i] * input[i]; + layer->output[node] = val; + weights += layer->num_inputs; + } + + // activation + switch (layer->activation) { + case NONE: return layer->output; + case RELU: return nn_relu(layer->output, layer); + case SIGMOID: return nn_sigmoid(layer->output, layer); + case SOFTSIGN: + assert(0 && "Softsign has not
been supported in NN."); // TO DO + return NULL; + default: + assert(0 && "Unknown activation"); // Unknown activation + return NULL; + } +} + +void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config, + int reduce_prec, float *output) { + const float *input_nodes = feature; + + // Propagate the layers. + const int num_layers = nn_config->num_hidden_layers; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (int i = 0; i < num_layers; ++i) { + input_nodes = nn_fc_forward(input_nodes, nn_config->layer + i); + assert(nn_config->layer[i + 1].num_inputs == + nn_config->layer[i].num_outputs); + } + + // Final layer + input_nodes = nn_fc_forward(input_nodes, nn_config->layer + num_layers); + assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits); + // Copy the final layer output + memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits); + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits); +} +#endif // CONFIG_NN_V2 + +void av1_nn_softmax(const float *input, float *output, int n) { + // Softmax function is invariant to adding the same constant + // to all input values, so we subtract the maximum input to avoid + // possible overflow. + float max_input = input[0]; + for (int i = 1; i < n; i++) max_input = AOMMAX(max_input, input[i]); + float sum_out = 0.0f; + for (int i = 0; i < n; i++) { + // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors. + const float normalized_input = AOMMAX(input[i] - max_input, -10.0f); + output[i] = expf(normalized_input); + sum_out += output[i]; + } + for (int i = 0; i < n; i++) output[i] /= sum_out; +} + +void av1_nn_fast_softmax_16_c(const float *input, float *output) { + const int kNumClasses = 16; + float max_input = input[0]; + for (int i = 1; i < kNumClasses; i++) max_input = AOMMAX(max_input, input[i]); + float sum_out = 0.0f; + for (int i = 0; i < kNumClasses; i++) { + // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors. + const float normalized_input = AOMMAX(input[i] - max_input, -10.0f); + output[i] = approx_exp(normalized_input); + sum_out += output[i]; + } + for (int i = 0; i < kNumClasses; i++) output[i] /= sum_out; +} diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h new file mode 100644 index 0000000000..566f9271dd --- /dev/null +++ b/third_party/aom/av1/encoder/ml.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ML_H_ +#define AOM_AV1_ENCODER_ML_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "config/av1_rtcd.h" + +#define NN_MAX_HIDDEN_LAYERS 10 +#define NN_MAX_NODES_PER_LAYER 128 + +struct NN_CONFIG { + int num_inputs; // Number of input nodes, i.e. features. + int num_outputs; // Number of output nodes. + int num_hidden_layers; // Number of hidden layers, maximum 10. + // Number of nodes for each hidden layer. + int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS]; + // Weight parameters, indexed by layer. + const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; + // Bias parameters, indexed by layer. 
+ const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; +}; +// Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs + +#if CONFIG_NN_V2 +// Fully-connectedly layer configuration +struct FC_LAYER { + const int num_inputs; // Number of input nodes, i.e. features. + const int num_outputs; // Number of output nodes. + + float *weights; // Weight parameters. + float *bias; // Bias parameters. + const ACTIVATION activation; // Activation function. + + float *output; // The output array. + float *dY; // Gradient of outputs + float *dW; // Gradient of weights. + float *db; // Gradient of bias +}; + +// NN configure structure V2 +struct NN_CONFIG_V2 { + const int num_hidden_layers; // Number of hidden layers, max = 10. + FC_LAYER layer[NN_MAX_HIDDEN_LAYERS + 1]; // The layer array + const int num_logits; // Number of output nodes. + float *logits; // Raw prediction (same as output of final layer) + const LOSS loss; // Loss function +}; + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config, + int reduce_prec, float *output); +#endif // CONFIG_NN_V2 + +// Applies the softmax normalization function to the input +// to get a valid probability distribution in the output: +// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k])) +void av1_nn_softmax(const float *input, float *output, int n); + +// A faster but less accurate version of av1_nn_softmax(input, output, 16) +void av1_nn_fast_softmax_16_c(const float *input, float *output); + +// Applies a precision reduction to output of av1_nn_predict to prevent +// mismatches between C and SIMD implementations. +void av1_nn_output_prec_reduce(float *const output, int num_output); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ML_H_ diff --git a/third_party/aom/av1/encoder/mode_prune_model_weights.h b/third_party/aom/av1/encoder/mode_prune_model_weights.h new file mode 100644 index 0000000000..98ec36808a --- /dev/null +++ b/third_party/aom/av1/encoder/mode_prune_model_weights.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NUM_HIDDEN_LAYERS_12 1 +#define NUM_FEATURES_12 6 +#define NUM_LAYER_0_UNITS_12 24 +#define NUM_LOGITS_12 2 + +static const float av1_intrap_hiddenlayer_0_kernel_12[] = { + 7.28372f, -1.3333898f, -1.3180022f, -0.007156151f, -0.40799126f, + -0.57538104f, -31.81647f, 6.7057495f, 6.351472f, -0.029544508f, + 0.026801195f, 1.12863f, -0.70769817f, -0.24183524f, 0.0649113f, + -0.7189517f, 0.21791299f, 0.12840256f, -0.56424767f, 0.16924907f, + 0.4605501f, -0.170895f, -0.60358995f, -0.15383226f, -4.0523643f, + 0.6961917f, 1.3100256f, -0.4189354f, 0.37264112f, -0.14555685f, + 10.628014f, 8.184437f, 8.941916f, -0.011731001f, -0.45127156f, + 0.42704004f, 36.84277f, 8.988796f, 8.844238f, 0.00030091056f, + -0.022038324f, 1.3566176f, -8.863219f, -0.84811693f, -1.0908632f, + 0.00023130262f, -1.0698471f, -6.755927f, 7.1711984f, 4.7216063f, + 3.5099216f, -0.6650184f, 0.5935173f, -0.6696286f, 11.8595295f, + 0.3001874f, 0.29822728f, 0.04319222f, -1.203178f, 1.1210147f, + 0.035045594f, -0.20559944f, -0.015388541f, -0.7857941f, -0.94100875f, + -0.1278549f, -19.22603f, 7.9466896f, 6.5048656f, -0.22195444f, + 0.19061874f, 1.3927288f, -8.896529f, -0.48146892f, -1.6098932f, + -0.0030235797f, -0.6533787f, -2.1333003f, -22.256454f, -4.934058f, + -4.4707212f, -0.015831878f, -0.4243649f, -2.776269f, -0.23762038f, + 0.1820098f, -0.51865315f, -1.1893421f, 0.34969202f, 0.10636194f, + 14.545696f, 1.3849198f, 2.6815193f, -0.5145498f, 0.45948258f, + -0.8842355f, -0.9111363f, -0.39652422f, 0.077266276f, -0.68084997f, + 0.4593515f, -0.28872707f, -6.936231f, 1.12253f, 1.7616503f, + -0.014069137f, -0.0052156276f, -4.5095444f, 6.2076726f, -0.058755957f, + -0.4675936f, -0.13039507f, 0.12094394f, -0.07285393f, 68.26125f, + 7.4893136f, 8.770954f, 0.020274093f, -0.027877754f, 1.6579602f, + -0.1825479f, 0.34832543f, 0.07472531f, -0.44812247f, -1.0941806f, + -0.16749863f, 1.1394324f, 0.47983396f, -0.99983627f, -0.00064249727f, + -1.3345739f, -0.057157427f, -18.14875f, 16.506035f, 15.539248f, + 0.013191509f, -0.021674965f, -25.006235f, 0.51220596f, 0.7334426f, + 0.81836903f, -1.0443225f, 0.4459505f, -1.2045046f +}; + +static const float av1_intrap_hiddenlayer_0_bias_12[] = { + -4.154915f, 14.33833f, 0.0f, 0.0f, 2.0440118f, 12.40922f, + -16.77514f, 0.5879813f, 3.2305415f, 0.8303539f, 0.0f, 14.488708f, + 2.94393f, 1.874383f, 0.0f, -0.53140444f, 0.0f, 1.8456234f, + -0.55427986f, -19.856262f, 0.0f, 0.17281002f, 48.31631f, 0.0f +}; + +static const float av1_intrap_logits_kernel_12[] = { + 0.26843873f, -0.09576241f, 0.34427166f, 0.09914787f, -0.10275399f, + 0.02999484f, -0.1467772f, 0.11594324f, 0.29200763f, 0.0067976206f, + 0.050393578f, -0.018694371f, 0.3333476f, 0.2127221f, 0.35128218f, + 0.19968672f, 0.08099991f, 0.084850654f, -0.16045967f, 0.30286232f, + 0.6164765f, -0.27140254f, 0.08210814f, 0.34852806f, 0.25028184f, + -0.12188078f, 0.16310331f, 0.31253803f, -0.10792341f, 0.065858394f, + -0.1349708f, 0.08948815f, 0.31905392f, 0.03680656f, -0.05040944f, + -0.051539157f, 0.3211852f, 0.2137136f, 0.45037416f, 0.22748767f, + -0.10978614f, 0.06475646f, -0.16954158f, 0.32831904f, 0.16479677f, + -0.30020145f, 0.066221856f, 0.37213042f +}; + +static const float av1_intrap_logits_bias_12[] = { 0.95783f, -0.95823103f }; + +static const NN_CONFIG av1_intrap_nn_config = { + NUM_FEATURES_12, + NUM_LOGITS_12, + NUM_HIDDEN_LAYERS_12, + { + NUM_LAYER_0_UNITS_12, + }, + { + 
av1_intrap_hiddenlayer_0_kernel_12, + av1_intrap_logits_kernel_12, + }, + { + av1_intrap_hiddenlayer_0_bias_12, + av1_intrap_logits_bias_12, + }, +}; + +#undef NUM_HIDDEN_LAYERS_12 +#undef NUM_FEATURES_12 +#undef NUM_LAYER_0_UNITS_12 +#undef NUM_LOGITS_12 + +#define NUM_HIDDEN_LAYERS_15 1 +#define NUM_FEATURES_15 6 +#define NUM_LAYER_0_UNITS_15 24 +#define NUM_LOGITS_15 2 + +static const float av1_intraph_hiddenlayer_0_kernel_15[] = { + -0.77480125f, 0.3219551f, -0.015702145f, -0.5310235f, 0.5254026f, + -1.1522819f, 2.682016f, 0.08001052f, -0.2539285f, 0.04711023f, + -0.81296307f, 0.2675382f, 0.1952474f, -0.0664705f, 1.2989824f, + -0.3150117f, -0.8022715f, 0.045423955f, -27.584324f, -2.5608704f, + -3.2280366f, 0.05272543f, -0.47141576f, -0.07644298f, -53.77942f, + -22.393923f, -23.027853f, -0.00015186476f, -0.010696465f, 2.7064638f, + -22.776028f, 11.514891f, 11.138167f, -0.001243723f, -0.4802433f, + -8.758646f, 0.26398206f, -0.23485385f, 0.27586034f, -0.004954741f, + -0.4935232f, -0.017607696f, 69.56049f, -1.1756641f, -0.052366666f, + -0.38052833f, 0.32474658f, 0.04634263f, 0.8583235f, -0.528438f, + -0.7868907f, -0.4757781f, 0.4620985f, -0.70621157f, 231.40195f, + 6.805205f, 9.420295f, 0.02585775f, -0.03480937f, 1.3577378f, + 0.1758226f, 15.056758f, 14.437874f, -0.1305005f, 0.115103304f, + 0.21297209f, 55.821743f, -6.611156f, -6.8552365f, -0.011928095f, + -0.2042175f, 1.2557873f, -1.0722278f, -0.2683614f, 0.48318478f, + -0.73739994f, 0.54055226f, -0.03224738f, -0.06767959f, -0.21015017f, + 0.29171246f, -0.6937296f, -1.2342545f, -0.41278538f, -37.9365f, + 17.68424f, 16.263042f, -0.074828684f, 0.06607806f, -0.16763286f, + 13.594707f, 0.6152676f, -0.4371223f, -0.8365592f, 0.8273623f, + -1.2126317f, 0.1216157f, -1.3002136f, -0.18856938f, -0.2589358f, + -0.76897144f, 0.21777137f, -122.25033f, -0.23490006f, -3.1238277f, + -0.13916978f, 0.08576391f, -1.7391548f, -116.24812f, 14.906071f, + 13.468357f, 0.02332889f, -0.034617376f, -18.506111f, 0.7500542f, + -1.1882535f, 0.40848416f, -0.28434393f, -0.71471655f, -0.29188696f, + -0.46588746f, -0.17324813f, -0.62460244f, -1.1801276f, 0.28993344f, + -0.22072886f, 129.2688f, -0.33782578f, -0.34836572f, -0.034112718f, + -0.023666814f, -0.5865087f, -33.484146f, 1.1431375f, 0.56056374f, + -0.0049730353f, -0.24347587f, -1.3003352f, 0.88973033f, 0.8499571f, + -0.5678484f, -0.39009875f, -0.062105156f, -0.13965102f +}; + +static const float av1_intraph_hiddenlayer_0_bias_15[] = { + 0.0f, -0.2926711f, 0.0f, -1.0303509f, -27.459345f, 12.412848f, + 0.0f, -2.5971522f, -0.02733541f, -19.881912f, 14.391992f, -8.249469f, + 0.0f, 0.0f, 13.676118f, -0.6472994f, -0.07189449f, 1.1986839f, + 52.479107f, 0.0f, 0.0f, -3.0187025f, 1.4435643f, 0.0f +}; + +static const float av1_intraph_logits_kernel_15[] = { + 0.05390722f, -0.06859513f, 0.036842898f, 0.190772f, 0.13623567f, + 0.09321194f, 0.2314745f, -0.13958375f, -0.3058229f, -0.0104543045f, + 0.11336068f, -0.276115f, 0.00470723f, -0.49123898f, -0.15988174f, + 0.087681435f, 0.022517204f, 0.073877744f, 0.2968856f, -0.1401399f, + -0.38788354f, -0.26005393f, -0.39564916f, -0.16195515f, 0.2680102f, + -0.032179773f, -0.35758728f, 0.25819537f, 0.11468631f, 0.13573235f, + -0.2672175f, 0.016490124f, 0.048118807f, 0.020319486f, 0.07892215f, + -0.21821865f, 0.08434734f, 0.3129456f, -0.18215221f, 0.08884877f, + -0.35621428f, 0.11405768f, 0.27370325f, 0.14956686f, 0.01604587f, + -0.18334487f, -0.42385718f, -0.08033409f +}; + +static const float av1_intraph_logits_bias_15[] = { 0.83619016f, -0.8340626f }; + +static const NN_CONFIG 
av1_intrap_hd_nn_config = { + NUM_FEATURES_15, + NUM_LOGITS_15, + NUM_HIDDEN_LAYERS_15, + { + NUM_LAYER_0_UNITS_15, + }, + { + av1_intraph_hiddenlayer_0_kernel_15, + av1_intraph_logits_kernel_15, + }, + { + av1_intraph_hiddenlayer_0_bias_15, + av1_intraph_logits_bias_15, + }, +}; + +#undef NUM_HIDDEN_LAYERS_15 +#undef NUM_FEATURES_15 +#undef NUM_LAYER_0_UNITS_15 +#undef NUM_LOGITS_15 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/model_rd.h b/third_party/aom/av1/encoder/model_rd.h new file mode 100644 index 0000000000..f7e8b96b5b --- /dev/null +++ b/third_party/aom/av1/encoder/model_rd.h @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MODEL_RD_H_ +#define AOM_AV1_ENCODER_MODEL_RD_H_ + +#include "aom/aom_integer.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/pustats.h" +#include "av1/encoder/rdopt_utils.h" +#include "config/aom_dsp_rtcd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// 0: Legacy model +// 1: Curve fit model +// 2: Surface fit model +// 3: DNN regression model +// 4: Full rd model +#define MODELRD_TYPE_INTERP_FILTER 1 +#define MODELRD_TYPE_TX_SEARCH_PRUNE 1 +#define MODELRD_TYPE_MASKED_COMPOUND 1 +#define MODELRD_TYPE_INTERINTRA 1 +#define MODELRD_TYPE_INTRA 1 +#define MODELRD_TYPE_MOTION_MODE_RD 1 + +typedef void (*model_rd_for_sb_type)( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, + uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist); +typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); + +static int64_t calculate_sse(MACROBLOCKD *const xd, + const struct macroblock_plane *p, + struct macroblockd_plane *pd, const int bw, + const int bh) { + int64_t sse = 0; + const int shift = xd->bd - 8; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } else { + sse = + aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); + } +#else + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); +#endif + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + return sse; +} + +static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd, + int plane, const BLOCK_SIZE bsize) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + + int64_t sse = calculate_sse(xd, p, pd, bw, bh); + + return sse; +} + +static AOM_INLINE 
void model_rd_from_sse(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, + int *rate, int64_t *dist) { + (void)num_samples; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; + + // Fast approximate the modelling function. + if (cpi->sf.rd_sf.simple_model_rd_from_var) { + const int64_t square_error = sse; + int quantizer = p->dequant_QTX[1] >> dequant_shift; + if (quantizer < 120) + *rate = (int)AOMMIN( + (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT), + INT_MAX); + else + *rate = 0; + assert(*rate >= 0); + *dist = (square_error * quantizer) >> 8; + } else { + av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize], + p->dequant_QTX[1] >> dequant_shift, rate, + dist); + } + *dist <<= 4; +} + +// Fits a curve for rate and distortion using as feature: +// log2(sse_norm/qstep^2) +static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, + int *rate, int64_t *dist) { + (void)cpi; + (void)plane_bsize; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; + const int qstep = AOMMAX(p->dequant_QTX[1] >> dequant_shift, 1); + + if (sse == 0) { + if (rate) *rate = 0; + if (dist) *dist = 0; + return; + } + const double sse_norm = (double)sse / num_samples; + const double qstepsqr = (double)qstep * qstep; + const double xqr = log2(sse_norm / qstepsqr); + double rate_f, dist_by_sse_norm_f; + av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f, + &dist_by_sse_norm_f); + + const double dist_f = dist_by_sse_norm_f * sse_norm; + int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); + int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5); + + // Check if skip is better + if (rate_i == 0) { + dist_i = sse << 4; + } else if (RDCOST(x->rdmult, rate_i, dist_i) >= + RDCOST(x->rdmult, 0, sse << 4)) { + rate_i = 0; + dist_i = sse << 4; + } + + if (rate) *rate = rate_i; + if (dist) *dist = dist_i; +} + +static AOM_INLINE void model_rd_for_sb( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, + uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
+ int plane; + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + assert(bsize < BLOCK_SIZES_ALL); + + for (plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + int64_t sse; + int rate; + int64_t dist; + + sse = calculate_sse(xd, p, pd, bw, bh); + + model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + assert(rate_sum >= 0); + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + rate_sum = AOMMIN(rate_sum, INT_MAX); + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +static AOM_INLINE void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, + uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
+ const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + + sse = calculate_sse(xd, p, pd, bw, bh); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = rate_sum == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES } UENUM1BYTE(ModelRdType); + +static const model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = { + model_rd_for_sb, model_rd_for_sb_with_curvfit +}; + +static const model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = { + model_rd_from_sse, model_rd_with_curvfit +}; + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_ENCODER_MODEL_RD_H_ diff --git a/third_party/aom/av1/encoder/motion_search_facade.c b/third_party/aom/av1/encoder/motion_search_facade.c new file mode 100644 index 0000000000..e7eec29dc3 --- /dev/null +++ b/third_party/aom/av1/encoder/motion_search_facade.c @@ -0,0 +1,1071 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/reconinter.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/interp_search.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/tx_search.h" + +#define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3) + +typedef struct { + int_mv fmv; + int weight; +} cand_mv_t; + +static int compare_weight(const void *a, const void *b) { + const int diff = ((cand_mv_t *)a)->weight - ((cand_mv_t *)b)->weight; + if (diff < 0) + return 1; + else if (diff > 0) + return -1; + return 0; +} + +// Allow more mesh searches for screen content type on the ARF. 
+static int use_fine_search_interval(const AV1_COMP *const cpi) { + return cpi->is_screen_content_type && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE && + cpi->oxcf.speed <= 2; +} + +// Iterate through the tpl and collect the mvs to be used as candidates +static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi, + const MACROBLOCK *x, + BLOCK_SIZE bsize, int ref, + cand_mv_t *cand, int *cand_count, + int *total_cand_weight) { + const SuperBlockEnc *sb_enc = &x->sb_enc; + if (!sb_enc->tpl_data_count) { + return; + } + + const AV1_COMMON *cm = &cpi->common; + const MACROBLOCKD *xd = &x->e_mbd; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + const BLOCK_SIZE tpl_bsize = + convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); + const int tplw = mi_size_wide[tpl_bsize]; + const int tplh = mi_size_high[tpl_bsize]; + const int nw = mi_size_wide[bsize] / tplw; + const int nh = mi_size_high[bsize] / tplh; + + if (nw >= 1 && nh >= 1) { + const int of_h = mi_row % mi_size_high[cm->seq_params->sb_size]; + const int of_w = mi_col % mi_size_wide[cm->seq_params->sb_size]; + const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw; + int valid = 1; + + // Assign large weight to start_mv, so it is always tested. + cand[0].weight = nw * nh; + + for (int k = 0; k < nh; k++) { + for (int l = 0; l < nw; l++) { + const int_mv mv = + sb_enc + ->tpl_mv[start + k * sb_enc->tpl_stride + l][ref - LAST_FRAME]; + if (mv.as_int == INVALID_MV) { + valid = 0; + break; + } + + const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row), + GET_MV_RAWPEL(mv.as_mv.col) }; + int unique = 1; + for (int m = 0; m < *cand_count; m++) { + if (RIGHT_SHIFT_MV(fmv.row) == + RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.row) && + RIGHT_SHIFT_MV(fmv.col) == + RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.col)) { + unique = 0; + cand[m].weight++; + break; + } + } + + if (unique) { + cand[*cand_count].fmv.as_fullmv = fmv; + cand[*cand_count].weight = 1; + (*cand_count)++; + } + } + if (!valid) break; + } + + if (valid) { + *total_cand_weight = 2 * nh * nw; + if (*cand_count > 2) + qsort(cand, *cand_count, sizeof(cand[0]), &compare_weight); + } + } +} + +void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_idx, int *rate_mv, + int search_range, inter_mode_info *mode_info, + int_mv *best_mv, + struct HandleInterModeArgs *const args) { + MACROBLOCKD *xd = &x->e_mbd; + const AV1_COMMON *cm = &cpi->common; + const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + int bestsme = INT_MAX; + const int ref = mbmi->ref_frame[ref_idx]; + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const MvCosts *mv_costs = x->mv_costs; + + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // full-pixel motion search code to be used without additional + // modifications. + for (int i = 0; i < num_planes; i++) { + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + } + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc. 
+ int step_param; + if (cpi->sf.mv_sf.auto_mv_step_size && cm->show_frame) { + // Take the weighted average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (av1_init_search_range(x->max_mv_context[ref]) + + mv_search_params->mv_step_param) / + 2; + } else { + step_param = mv_search_params->mv_step_param; + } + + const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv; + FULLPEL_MV start_mv; + if (mbmi->motion_mode != SIMPLE_TRANSLATION) + start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv); + else + start_mv = get_fullmv_from_mv(&ref_mv); + + // cand stores start_mv and all possible MVs in a SB. + cand_mv_t cand[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB + 1]; + av1_zero(cand); + cand[0].fmv.as_fullmv = start_mv; + int cnt = 1; + int total_weight = 0; + + if (!cpi->sf.mv_sf.full_pixel_search_level && + mbmi->motion_mode == SIMPLE_TRANSLATION) { + get_mv_candidate_from_tpl(cpi, x, bsize, ref, cand, &cnt, &total_weight); + } + + const int cand_cnt = AOMMIN(2, cnt); + // TODO(any): Test the speed feature for OBMC_CAUSAL mode. + if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv && + mbmi->motion_mode == SIMPLE_TRANSLATION) { + const int stack_size = args->start_mv_cnt; + for (int cand_idx = 0; cand_idx < cand_cnt; cand_idx++) { + int_mv *fmv_cand = &cand[cand_idx].fmv; + int skip_cand_mv = 0; + + // Check difference between mvs in the stack and candidate mv. + for (int stack_idx = 0; stack_idx < stack_size; stack_idx++) { + const uint8_t this_ref_mv_idx = args->ref_mv_idx_stack[stack_idx]; + const FULLPEL_MV *fmv_stack = &args->start_mv_stack[stack_idx]; + const int this_newmv_valid = + args->single_newmv_valid[this_ref_mv_idx][ref]; + const int row_diff = abs(fmv_stack->row - fmv_cand->as_fullmv.row); + const int col_diff = abs(fmv_stack->col - fmv_cand->as_fullmv.col); + + if (!this_newmv_valid) continue; + + if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 2) { + // Prunes the current start_mv candidate, if the absolute mv + // difference of both row and column are <= 1. + if (row_diff <= 1 && col_diff <= 1) { + skip_cand_mv = 1; + break; + } + } else if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 1) { + // Prunes the current start_mv candidate, if the sum of the absolute + // mv difference of row and column is <= 1. + if (row_diff + col_diff <= 1) { + skip_cand_mv = 1; + break; + } + } + } + if (skip_cand_mv) { + // Ensure atleast one full-pel motion search is not pruned. + assert(mbmi->ref_mv_idx != 0); + // Mark the candidate mv as invalid so that motion search gets skipped. + cand[cand_idx].fmv.as_int = INVALID_MV; + } else { + // Store start_mv candidate and corresponding ref_mv_idx of full-pel + // search in the mv stack (except last ref_mv_idx). + if (mbmi->ref_mv_idx != MAX_REF_MV_SEARCH - 1) { + assert(args->start_mv_cnt < (MAX_REF_MV_SEARCH - 1) * 2); + args->start_mv_stack[args->start_mv_cnt] = fmv_cand->as_fullmv; + args->ref_mv_idx_stack[args->start_mv_cnt] = mbmi->ref_mv_idx; + args->start_mv_cnt++; + } + } + } + } + + // Hot fix for asan complaints when resize mode is on. When resize mode is on, + // the stride of the reference frame can be different from indicated by + // MotionVectorSearchParams::search_site_cfg. When this happens, we need to + // readjust the stride. 
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, mv_sf, bsize); + const search_site_config *src_search_site_cfg = + av1_get_search_site_config(cpi, x, search_method); + + // Further reduce the search range. + if (search_range < INT_MAX) { + const search_site_config *search_site_cfg = + &src_search_site_cfg[search_method_lookup[search_method]]; + // Max step_param is search_site_cfg->num_search_steps. + if (search_range < 1) { + step_param = search_site_cfg->num_search_steps; + } else { + while (search_site_cfg->radius[search_site_cfg->num_search_steps - + step_param - 1] > (search_range << 1) && + search_site_cfg->num_search_steps - step_param - 1 > 0) + step_param++; + } + } + + int cost_list[5]; + FULLPEL_MV_STATS best_mv_stats; + int_mv second_best_mv; + best_mv->as_int = second_best_mv.as_int = INVALID_MV; + + // Allow more mesh searches for screen content type on the ARF. + const int fine_search_interval = use_fine_search_interval(cpi); + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + + switch (mbmi->motion_mode) { + case SIMPLE_TRANSLATION: { + // Perform a search with the top 2 candidates + int sum_weight = 0; + for (int m = 0; m < cand_cnt; m++) { + int_mv smv = cand[m].fmv; + FULLPEL_MV this_best_mv, this_second_best_mv; + FULLPEL_MV_STATS this_mv_stats; + + if (smv.as_int == INVALID_MV) continue; + + av1_make_default_fullpel_ms_params( + &full_ms_params, cpi, x, bsize, &ref_mv, smv.as_fullmv, + src_search_site_cfg, search_method, fine_search_interval); + + const int thissme = + av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), &this_best_mv, + &this_mv_stats, &this_second_best_mv); + + if (thissme < bestsme) { + bestsme = thissme; + best_mv->as_fullmv = this_best_mv; + best_mv_stats = this_mv_stats; + second_best_mv.as_fullmv = this_second_best_mv; + } + + sum_weight += cand[m].weight; + if (4 * sum_weight > 3 * total_weight) break; + } + } break; + case OBMC_CAUSAL: + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv, start_mv, src_search_site_cfg, + search_method, fine_search_interval); + + bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params, + step_param, &best_mv->as_fullmv); + break; + default: assert(0 && "Invalid motion mode!\n"); + } + if (best_mv->as_int == INVALID_MV) return; + + if (scaled_ref_frame) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref_idx] = backup_yv12[i]; + } + } + + // Terminate search with the current ref_idx based on fullpel mv, rate cost, + // and other know cost. 
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 && + mbmi->motion_mode == SIMPLE_TRANSLATION && + best_mv->as_int != INVALID_MV) { + int_mv this_mv; + this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + const int ref_mv_idx = mbmi->ref_mv_idx; + const int this_mv_rate = + av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int; + mode_info[ref_mv_idx].full_mv_rate = this_mv_rate; + mode_info[ref_mv_idx].full_mv_bestsme = bestsme; + + for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { + // Check if the motion search result same as previous results + if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) { + // Compare the rate cost + const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate + + mode_info[prev_ref_idx].drl_cost; + const int this_rate_cost = + this_mv_rate + mode_info[ref_mv_idx].drl_cost; + + if (prev_rate_cost <= this_rate_cost) { + // If the current rate_cost is worse than the previous rate_cost, then + // we terminate the search. Since av1_single_motion_search is only + // called by handle_new_mv in SIMPLE_TRANSLATION mode, we set the + // best_mv to INVALID mv to signal that we wish to terminate search + // for the current mode. + best_mv->as_int = INVALID_MV; + return; + } + } + + // Terminate the evaluation of current ref_mv_idx based on bestsme and + // drl_cost. + const int psme = mode_info[prev_ref_idx].full_mv_bestsme; + if (psme == INT_MAX) continue; + const int thr = + cpi->sf.inter_sf.skip_newmv_in_drl == 3 ? (psme + (psme >> 2)) : psme; + if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 && + mode_info[ref_mv_idx].full_mv_bestsme > thr && + mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) { + best_mv->as_int = INVALID_MV; + return; + } + } + } + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(best_mv); + } + + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + int best_mv_rate = 0; + int mv_rate_calculated = 0; + if (use_fractional_mv) { + int_mv fractional_ms_list[3]; + av1_set_fractional_mv(fractional_ms_list); + int dis; /* TODO: use dis in distortion calculation later. */ + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + + switch (mbmi->motion_mode) { + case SIMPLE_TRANSLATION: + if (mv_sf->use_accurate_subpel_search) { + const int try_second = second_best_mv.as_int != INVALID_MV && + second_best_mv.as_int != best_mv->as_int && + (mv_sf->disable_second_mv <= 1); + const int best_mv_var = mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, + &best_mv->as_mv, &dis, &x->pred_sse[ref], fractional_ms_list); + + if (try_second) { + struct macroblockd_plane *p = xd->plane; + const BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + int64_t rd = INT64_MAX; + if (!mv_sf->disable_second_mv) { + // Calculate actual rd cost. 
+ mbmi->mv[0].as_mv = best_mv->as_mv; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, + bsize, 0, 0); + av1_subtract_plane(x, bsize, 0); + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); + av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize, + max_txsize_rect_lookup[bsize]); + int this_mv_rate = av1_mv_bit_cost( + &best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + rd = RDCOST(x->rdmult, this_mv_rate + this_rd_stats.rate, + this_rd_stats.dist); + } + + MV this_best_mv; + subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); + if (av1_is_subpelmv_in_range(&ms_params.mv_limits, + subpel_start_mv)) { + unsigned int sse; + const int this_var = mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, + &dis, &sse, fractional_ms_list); + + if (!mv_sf->disable_second_mv) { + // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost + // to choose the better MV. + mbmi->mv[0].as_mv = this_best_mv; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, + bsize, 0, 0); + av1_subtract_plane(x, bsize, 0); + RD_STATS tmp_rd_stats; + av1_init_rd_stats(&tmp_rd_stats); + av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize, + max_txsize_rect_lookup[bsize]); + int tmp_mv_rate = av1_mv_bit_cost( + &this_best_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate, + tmp_rd_stats.dist); + if (tmp_rd < rd) { + best_mv->as_mv = this_best_mv; + x->pred_sse[ref] = sse; + } + } else { + // If cpi->sf.mv_sf.disable_second_mv = 1, use var to decide the + // best MV. + if (this_var < best_mv_var) { + best_mv->as_mv = this_best_mv; + x->pred_sse[ref] = sse; + } + } + } + } + } else { + mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, + &best_mv->as_mv, &dis, &x->pred_sse[ref], NULL); + } + break; + case OBMC_CAUSAL: + av1_find_best_obmc_sub_pixel_tree_up( + xd, cm, &ms_params, subpel_start_mv, NULL, &best_mv->as_mv, &dis, + &x->pred_sse[ref], NULL); + break; + default: assert(0 && "Invalid motion mode!\n"); + } + + // Terminate search with the current ref_idx based on subpel mv and rate + // cost. + if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL && + mbmi->motion_mode == SIMPLE_TRANSLATION && + best_mv->as_int != INVALID_MV) { + const int ref_mv_idx = mbmi->ref_mv_idx; + best_mv_rate = + av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + mv_rate_calculated = 1; + + for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { + if (!args->single_newmv_valid[prev_ref_idx][ref]) continue; + // Check if the motion vectors are the same. + if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) { + // Skip this evaluation if the previous one is skipped. + if (mode_info[prev_ref_idx].skip) { + mode_info[ref_mv_idx].skip = 1; + break; + } + // Compare the rate cost that we current know. + const int prev_rate_cost = + args->single_newmv_rate[prev_ref_idx][ref] + + mode_info[prev_ref_idx].drl_cost; + const int this_rate_cost = + best_mv_rate + mode_info[ref_mv_idx].drl_cost; + + if (prev_rate_cost <= this_rate_cost) { + // If the current rate_cost is worse than the previous rate_cost, + // then we terminate the search for this ref_mv_idx. 
+ mode_info[ref_mv_idx].skip = 1; + break; + } + } + } + } + } + + if (mv_rate_calculated) { + *rate_mv = best_mv_rate; + } else { + *rate_mv = + av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } +} + +int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, + int allow_second_mv, int joint_me_num_refine_iter) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + const int plane = 0; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] }; + const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; + const MvCosts *mv_costs = x->mv_costs; + int_mv ref_mv[2]; + int ite, ref; + + // Get the prediction block from the 'other' reference frame. + const int_interpfilters interp_filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + InterPredParams inter_pred_params; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + int last_besterr[2] = { INT_MAX, INT_MAX }; + const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + av1_get_scaled_ref_frame(cpi, refs[0]), + av1_get_scaled_ref_frame(cpi, refs[1]) + }; + + // Prediction buffer from second frame. + DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]); + uint8_t *second_pred = get_buf_by_bd(xd, second_pred16); + + int_mv best_mv, second_best_mv; + + // Allow joint search multiple times iteratively for each reference frame + // and break out of the search loop if it couldn't find a better mv. + for (ite = 0; ite < (2 * joint_me_num_refine_iter); ite++) { + struct buf_2d ref_yv12[2]; + int bestsme = INT_MAX; + int id = ite % 2; // Even iterations search in the first reference frame, + // odd iterations search in the second. The predictor + // found for the 'other' reference frame is factored in. + if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) { + if (cur_mv[id].as_int == init_mv[id].as_int) { + break; + } else { + int_mv cur_int_mv, init_int_mv; + cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3; + cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3; + if (cur_int_mv.as_int == init_int_mv.as_int) { + break; + } + } + } + for (ref = 0; ref < 2; ++ref) { + ref_mv[ref] = av1_get_ref_mv(x, ref); + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ if (scaled_ref_frame[ref]) { + int i; + for (i = 0; i < num_planes; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, + NULL, num_planes); + } + } + + assert(IMPLIES(scaled_ref_frame[0] != NULL, + cm->width == scaled_ref_frame[0]->y_crop_width && + cm->height == scaled_ref_frame[0]->y_crop_height)); + assert(IMPLIES(scaled_ref_frame[1] != NULL, + cm->width == scaled_ref_frame[1]->y_crop_width && + cm->height == scaled_ref_frame[1]->y_crop_height)); + + // Initialize based on (possibly scaled) prediction buffers. + ref_yv12[0] = xd->plane[plane].pre[0]; + ref_yv12[1] = xd->plane[plane].pre[1]; + + av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + &cm->sf_identity, &ref_yv12[!id], interp_filters); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + // Since we have scaled the reference frames to match the size of the + // current frame we must use a unit scaling factor during mode selection. + av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv, + &inter_pred_params); + + // Do full-pixel compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + + // Make motion search params + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + FULLPEL_MV_STATS best_mv_stats; + const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, mv_sf, bsize); + const search_site_config *src_search_sites = + av1_get_search_site_config(cpi, x, search_method); + // Use the mv result from the single mode as mv predictor. + const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv); + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv[id].as_mv, start_fullmv, + src_search_sites, search_method, + /*fine_search_interval=*/0); + + av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, + mask_stride, id); + + // Small-range full-pixel motion search. + if (!mv_sf->disable_extensive_joint_motion_search && + mbmi->interinter_comp.type != COMPOUND_WEDGE) { + bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL, + &best_mv.as_fullmv, &best_mv_stats, + &second_best_mv.as_fullmv); + } else { + bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv, + &best_mv.as_fullmv); + second_best_mv = best_mv; + } + + const int try_second = second_best_mv.as_int != INVALID_MV && + second_best_mv.as_int != best_mv.as_int && + allow_second_mv; + + // Restore the pointer to the first (possibly scaled) prediction buffer. + if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + // Re-initialize based on unscaled prediction buffers. + ref_yv12[ref] = xd->plane[plane].pre[ref]; + } + } + + // Do sub-pixel compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(&best_mv); + } + if (bestsme < INT_MAX && + cpi->common.features.cur_frame_force_integer_mv == 0) { + int dis; /* TODO: use dis in distortion calculation later. 
*/ + unsigned int sse; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv[id].as_mv, NULL); + av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, + mask, mask_stride, id); + ms_params.forced_stop = EIGHTH_PEL; + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv)); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, &sse, NULL); + + if (try_second) { + MV this_best_mv; + MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); + if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) { + const int thissme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, &dis, + &sse, NULL); + if (thissme < bestsme) { + best_mv.as_mv = this_best_mv; + bestsme = thissme; + } + } + } + } + + // Restore the pointer to the first prediction buffer. + if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + if (bestsme < last_besterr[id]) { + cur_mv[id] = best_mv; + last_besterr[id] = bestsme; + } else { + break; + } + } + + *rate_mv = 0; + + for (ref = 0; ref < 2; ++ref) { + const int_mv curr_ref_mv = av1_get_ref_mv(x, ref); + *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, + mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } + + return AOMMIN(last_besterr[0], last_besterr[1]); +} + +// Search for the best mv for one component of a compound, +// given that the other component is fixed. +int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int ref = mbmi->ref_frame[ref_idx]; + const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); + struct macroblockd_plane *const pd = &xd->plane[0]; + const MvCosts *mv_costs = x->mv_costs; + + struct buf_2d backup_yv12[MAX_MB_PLANE]; + const YV12_BUFFER_CONFIG *const scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + + // Check that this is either an interinter or an interintra block + assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi))); + + // Store the first prediction buffer. + struct buf_2d orig_yv12; + if (ref_idx) { + orig_yv12 = pd->pre[0]; + pd->pre[0] = pd->pre[ref_idx]; + } + + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // full-pixel motion search code to be used without additional + // modifications. + for (int i = 0; i < num_planes; i++) { + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + } + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + // The index below needs to be 0 instead of ref_idx since we assume the + // 0th slot to be used for subsequent searches. Note that the ref_idx + // reference buffer has been copied to the 0th slot in the code above. + // Now we need to swap the reference frame for the 0th slot. 
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + int bestsme = INT_MAX; + int_mv best_mv; + + // Make motion search params + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + FULLPEL_MV_STATS best_mv_stats; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize); + const search_site_config *src_search_sites = + av1_get_search_site_config(cpi, x, search_method); + // Use the mv result from the single mode as mv predictor. + const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv); + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv.as_mv, start_fullmv, + src_search_sites, search_method, + /*fine_search_interval=*/0); + + av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, + mask_stride, ref_idx); + + // Small-range full-pixel motion search. + bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL, + &best_mv.as_fullmv, &best_mv_stats, NULL); + + if (scaled_ref_frame) { + // Swap back the original buffers for subpel motion search for the 0th slot. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = backup_yv12[i]; + } + } + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(&best_mv); + } + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + if (use_fractional_mv) { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv, + NULL); + av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, + mask, mask_stride, ref_idx); + ms_params.forced_stop = EIGHTH_PEL; + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv)); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, &best_mv_stats, &best_mv.as_mv, &dis, + &sse, NULL); + } + + // Restore the pointer to the first unscaled prediction buffer. 
+ if (ref_idx) pd->pre[0] = orig_yv12; + + if (bestsme < INT_MAX) *this_mv = best_mv.as_mv; + + *rate_mv = 0; + + *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + return bestsme; +} + +static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *other_mv, int ref_idx, + uint8_t *second_pred) { + const AV1_COMMON *const cm = &cpi->common; + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x); + const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y); + + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + + const int plane = 0; + struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx]; + + struct scale_factors sf; + av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height, + cm->width, cm->height); + + InterPredParams inter_pred_params; + + av1_init_inter_params(&inter_pred_params, pw, ph, p_row, p_col, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), 0, &sf, &ref_yv12, + mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + + // Get the prediction block from the 'other' reference frame. + av1_enc_build_one_inter_predictor(second_pred, pw, other_mv, + &inter_pred_params); +} + +// Wrapper for av1_compound_single_motion_search, for the common case +// where the second prediction is also an inter mode. +int av1_compound_single_motion_search_interinter( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) { + MACROBLOCKD *xd = &x->e_mbd; + // This function should only ever be called for compound modes + assert(has_second_ref(xd->mi[0])); + + // Prediction buffer from second frame. + DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); + uint8_t *second_pred; + if (is_cur_buf_hbd(xd)) + second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); + else + second_pred = (uint8_t *)second_pred_alloc_16; + + MV *this_mv = &cur_mv[ref_idx].as_mv; + const MV *other_mv = &cur_mv[!ref_idx].as_mv; + build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred); + return av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred, + mask, mask_stride, rate_mv, ref_idx); +} + +static AOM_INLINE void do_masked_motion_search_indexed( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize, + int_mv *tmp_mv, int *rate_mv, int which) { + // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + BLOCK_SIZE sb_type = mbmi->bsize; + const uint8_t *mask; + const int mask_stride = block_size_wide[bsize]; + + mask = av1_get_compound_type_mask(comp_data, sb_type); + + tmp_mv[0].as_int = cur_mv[0].as_int; + tmp_mv[1].as_int = cur_mv[1].as_int; + if (which == 0 || which == 1) { + av1_compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask, + mask_stride, rate_mv, which); + } else if (which == 2) { + const int joint_me_num_refine_iter = + cpi->sf.inter_sf.enable_fast_compound_mode_search == 2 + ? 
REDUCED_JOINT_ME_REFINE_ITER + : NUM_JOINT_ME_REFINE_ITER; + av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv, + !cpi->sf.mv_sf.disable_second_mv, + joint_me_num_refine_iter); + } +} + +int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int_mv tmp_mv[2]; + int tmp_rate_mv = 0; + // TODO(jingning): The average compound mode has proper SAD and variance + // functions implemented, and is triggerd by setting the mask pointer as + // Null. Need to further implement those for frame distance weighted mode. + mbmi->interinter_comp.seg_mask = + mbmi->interinter_comp.type == COMPOUND_AVERAGE ? NULL : xd->seg_mask; + const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp; + + if (this_mode == NEW_NEWMV) { + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, + tmp_mv, &tmp_rate_mv, 2); + mbmi->mv[0].as_int = tmp_mv[0].as_int; + mbmi->mv[1].as_int = tmp_mv[1].as_int; + } else if (this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV) { + // which = 1 if this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV + // which = 0 if this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV + int which = (NEWMV == compound_ref1_mode(this_mode)); + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, + tmp_mv, &tmp_rate_mv, which); + mbmi->mv[which].as_int = tmp_mv[which].as_int; + } + return tmp_rate_mv; +} + +int_mv av1_simple_motion_search_sse_var(AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, + BLOCK_SIZE bsize, int ref, + FULLPEL_MV start_mv, int num_planes, + int use_subpixel, unsigned int *sse, + unsigned int *var) { + assert(num_planes == 1 && + "Currently simple_motion_search only supports luma plane"); + assert(!frame_is_intra_only(&cpi->common) && + "Simple motion search only enabled for non-key frames"); + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); + + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->bsize = bsize; + mbmi->ref_frame[0] = ref; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + struct buf_2d backup_yv12; + // ref_mv is used to calculate the cost of the motion vector + const MV ref_mv = kZeroMv; + const int step_param = + AOMMIN(cpi->mv_search_params.mv_step_param + + cpi->sf.part_sf.simple_motion_search_reduce_search_steps, + MAX_MVSEARCH_STEPS - 2); + int cost_list[5]; + const int ref_idx = 0; + int bestsme; + int_mv best_mv; + FULLPEL_MV_STATS best_mv_stats; + + av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col, + get_ref_scale_factors(cm, ref), num_planes); + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + if (scaled_ref_frame) { + backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx]; + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + // Allow more mesh searches for screen content type on the ARF. 
+ const int fine_search_interval = use_fine_search_interval(cpi); + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, mv_sf, bsize); + const search_site_config *src_search_sites = + av1_get_search_site_config(cpi, x, search_method); + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv, + start_mv, src_search_sites, search_method, + fine_search_interval); + + bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv.as_fullmv, &best_mv_stats, NULL); + + const int use_subpel_search = + bestsme < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv && + use_subpixel && + (cpi->sf.mv_sf.simple_motion_subpel_force_stop != FULL_PEL); + if (scaled_ref_frame) { + xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; + } + if (use_subpel_search) { + int not_used = 0; + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params(). + ms_params.forced_stop = mv_sf->simple_motion_subpel_force_stop; + + MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv, + ¬_used, &x->pred_sse[ref], NULL); + + mbmi->mv[0] = best_mv; + + // Get a copy of the prediction output + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + *var = cpi->ppi->fn_ptr[bsize].vf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, sse); + } else { + // Manually convert from units of pixel to 1/8-pixels if we are not doing + // subpel search + convert_fullmv_to_mv(&best_mv); + *var = best_mv_stats.distortion; + *sse = best_mv_stats.sse; + } + + return best_mv; +} diff --git a/third_party/aom/av1/encoder/motion_search_facade.h b/third_party/aom/av1/encoder/motion_search_facade.h new file mode 100644 index 0000000000..d1fa915bca --- /dev/null +++ b/third_party/aom/av1/encoder/motion_search_facade.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MOTION_SEARCH_H_ +#define AOM_AV1_ENCODER_MOTION_SEARCH_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define NUM_JOINT_ME_REFINE_ITER 2 +#define REDUCED_JOINT_ME_REFINE_ITER 1 +// TODO(any): rename this struct to something else. There is already another +// struct called inter_modes_info, which makes this terribly confusing. 
+typedef struct { + int drl_cost; + int_mv full_search_mv; + int full_mv_rate; + int full_mv_bestsme; + int skip; +} inter_mode_info; + +struct HandleInterModeArgs; +void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_idx, int *rate_mv, + int search_range, inter_mode_info *mode_info, + int_mv *best_mv, + struct HandleInterModeArgs *const args); + +int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, + int allow_second_mv, int joint_me_num_refine_iter); + +int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode); + +int av1_compound_single_motion_search_interinter( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx); + +int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx); + +// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame +// ref and calculates the sse and var of the residue. Note that this sets the +// offset of mbmi, so we will need to reset it after calling this function. +int_mv av1_simple_motion_search_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col, + BLOCK_SIZE bsize, int ref, + const FULLPEL_MV start_mv, + int num_planes, int use_subpixel, + unsigned int *sse, unsigned int *var); + +static AOM_INLINE const search_site_config *av1_get_search_site_config( + const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) { + const int ref_stride = x->e_mbd.plane[0].pre[0].stride; + + // AV1_COMP::mv_search_params.search_site_config is a compressor level cache + // that's shared by multiple threads. In most cases where all frames have the + // same resolution, the cache contains the search site config that we need. + const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; + if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_SRC]->stride) { + return mv_search_params->search_site_cfg[SS_CFG_SRC]; + } else if (ref_stride == + mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD]->stride) { + return mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD]; + } + + // If the cache does not contain the correct stride, then we will need to rely + // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the + // thread level config doesn't match, then we need to update it. + search_method = search_method_lookup[search_method]; + assert(search_method_lookup[search_method] == search_method && + "The search_method_lookup table should be idempotent."); + if (ref_stride != x->search_site_cfg_buf[search_method].stride) { + av1_refresh_search_site_config(x->search_site_cfg_buf, search_method, + ref_stride); + } + + return x->search_site_cfg_buf; +} + +static AOM_INLINE SEARCH_METHODS +av1_get_faster_search_method(SEARCH_METHODS search_method) { + // Note on search method's accuracy: + // 1. NSTEP + // 2. DIAMOND + // 3. BIGDIA \approx SQUARE + // 4. HEX. + // 5. 
FAST_HEX \approx FAST_DIAMOND + switch (search_method) { + case NSTEP: return DIAMOND; + case NSTEP_8PT: return DIAMOND; + case DIAMOND: return BIGDIA; + case CLAMPED_DIAMOND: return BIGDIA; + case BIGDIA: return HEX; + case SQUARE: return HEX; + case HEX: return FAST_HEX; + case FAST_HEX: return FAST_HEX; + case FAST_DIAMOND: return VFAST_DIAMOND; + case FAST_BIGDIA: return FAST_BIGDIA; + case VFAST_DIAMOND: return VFAST_DIAMOND; + default: assert(0 && "Invalid search method!"); return DIAMOND; + } +} + +static AOM_INLINE SEARCH_METHODS av1_get_default_mv_search_method( + const MACROBLOCK *x, const MV_SPEED_FEATURES *mv_sf, BLOCK_SIZE bsize) { + SEARCH_METHODS search_method = mv_sf->search_method; + const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method; + const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]); + const int qband = x->qindex >> (QINDEX_BITS - 2); + const bool use_faster_search_method = + (sf_blk_search_method == 1 && min_dim >= 32) || + (sf_blk_search_method >= 2 && min_dim >= 16 && + x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3); + + if (use_faster_search_method) { + search_method = av1_get_faster_search_method(search_method); + } + return search_method; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MOTION_SEARCH_H_ diff --git a/third_party/aom/av1/encoder/mv_prec.c b/third_party/aom/av1/encoder/mv_prec.c new file mode 100644 index 0000000000..b64f4dcd0e --- /dev/null +++ b/third_party/aom/av1/encoder/mv_prec.c @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "av1/encoder/encodemv.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/misc_model_weights.h" +#endif // !CONFIG_REALTIME_ONLY +#include "av1/encoder/mv_prec.h" + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE int_mv get_ref_mv_for_mv_stats( + const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, + int ref_idx) { + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + + const MV_REFERENCE_FRAME *ref_frames = mbmi->ref_frame; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; + + if (ref_frames[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext_frame->ref_mv_count + ? curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext_frame->global_mvs[ref_frame_type]; +} + +static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) { + const aom_cdf_prob cur_cdf = AOM_ICDF(cdf[symbol]); + const aom_cdf_prob prev_cdf = symbol ? 
AOM_ICDF(cdf[symbol - 1]) : 0; + const aom_cdf_prob p15 = AOMMAX(cur_cdf - prev_cdf, EC_MIN_PROB); + + return av1_cost_symbol(p15); +} + +static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val, + int comp_idx, const AV1_COMP *cpi, + int *rates) { + assert(comp_val != 0 && "mv component should not have zero value!"); + const int sign = comp_val < 0; + const int mag = sign ? -comp_val : comp_val; + const int mag_minus_1 = mag - 1; + int offset; + const int mv_class = av1_get_mv_class(mag_minus_1, &offset); + const int int_part = offset >> 3; // int mv data + const int frac_part = (offset >> 1) & 3; // fractional mv data + const int high_part = offset & 1; // high precision mv data + const int use_hp = cpi->common.features.allow_high_precision_mv; + int r_idx = 0; + + const MACROBLOCK *const x = &cpi->td.mb; + const MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + nmv_context *nmvc = &ec_ctx->nmvc; + nmv_component *mvcomp_ctx = nmvc->comps; + nmv_component *cur_mvcomp_ctx = &mvcomp_ctx[comp_idx]; + aom_cdf_prob *sign_cdf = cur_mvcomp_ctx->sign_cdf; + aom_cdf_prob *class_cdf = cur_mvcomp_ctx->classes_cdf; + aom_cdf_prob *class0_cdf = cur_mvcomp_ctx->class0_cdf; + aom_cdf_prob(*bits_cdf)[3] = cur_mvcomp_ctx->bits_cdf; + aom_cdf_prob *frac_part_cdf = mv_class + ? (cur_mvcomp_ctx->fp_cdf) + : (cur_mvcomp_ctx->class0_fp_cdf[int_part]); + aom_cdf_prob *high_part_cdf = + mv_class ? (cur_mvcomp_ctx->hp_cdf) : (cur_mvcomp_ctx->class0_hp_cdf); + + const int sign_rate = get_symbol_cost(sign_cdf, sign); + rates[r_idx++] = sign_rate; + update_cdf(sign_cdf, sign, 2); + + const int class_rate = get_symbol_cost(class_cdf, mv_class); + rates[r_idx++] = class_rate; + update_cdf(class_cdf, mv_class, MV_CLASSES); + + int int_bit_rate = 0; + if (mv_class == MV_CLASS_0) { + int_bit_rate = get_symbol_cost(class0_cdf, int_part); + update_cdf(class0_cdf, int_part, CLASS0_SIZE); + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (int i = 0; i < n; ++i) { + int_bit_rate += get_symbol_cost(bits_cdf[i], (int_part >> i) & 1); + update_cdf(bits_cdf[i], (int_part >> i) & 1, 2); + } + } + rates[r_idx++] = int_bit_rate; + const int frac_part_rate = get_symbol_cost(frac_part_cdf, frac_part); + rates[r_idx++] = frac_part_rate; + update_cdf(frac_part_cdf, frac_part, MV_FP_SIZE); + const int high_part_rate = + use_hp ? get_symbol_cost(high_part_cdf, high_part) : 0; + if (use_hp) { + update_cdf(high_part_cdf, high_part, 2); + } + rates[r_idx++] = high_part_rate; + + mv_stats->last_bit_zero += !high_part; + mv_stats->last_bit_nonzero += high_part; + const int total_rate = + (sign_rate + class_rate + int_bit_rate + frac_part_rate + high_part_rate); + return total_rate; +} + +static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv, + const MV *cur_mv, const AV1_COMP *cpi) { + const MACROBLOCK *const x = &cpi->td.mb; + const MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + nmv_context *nmvc = &ec_ctx->nmvc; + aom_cdf_prob *joint_cdf = nmvc->joints_cdf; + const int use_hp = cpi->common.features.allow_high_precision_mv; + + const MV diff = { cur_mv->row - ref_mv->row, cur_mv->col - ref_mv->col }; + const int mv_joint = av1_get_mv_joint(&diff); + // TODO(chiyotsai@google.com): Estimate hp_diff when we are using lp + const MV hp_diff = diff; + const int hp_mv_joint = av1_get_mv_joint(&hp_diff); + const MV truncated_diff = { (diff.row / 2) * 2, (diff.col / 2) * 2 }; + const MV lp_diff = use_hp ? 
truncated_diff : diff; + const int lp_mv_joint = av1_get_mv_joint(&lp_diff); + + const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint); + const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint); + const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint); + + update_cdf(joint_cdf, mv_joint, MV_JOINTS); + + mv_stats->total_mv_rate += mv_joint_rate; + mv_stats->hp_total_mv_rate += hp_mv_joint_rate; + mv_stats->lp_total_mv_rate += lp_mv_joint_rate; + mv_stats->mv_joint_count[mv_joint]++; + + for (int comp_idx = 0; comp_idx < 2; comp_idx++) { + const int comp_val = comp_idx ? diff.col : diff.row; + const int hp_comp_val = comp_idx ? hp_diff.col : hp_diff.row; + const int lp_comp_val = comp_idx ? lp_diff.col : lp_diff.row; + int rates[5]; + av1_zero_array(rates, 5); + + const int comp_rate = + comp_val ? keep_one_comp_stat(mv_stats, comp_val, comp_idx, cpi, rates) + : 0; + // TODO(chiyotsai@google.com): Properly get hp rate when use_hp is false + const int hp_rate = + hp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] + rates[4] : 0; + const int lp_rate = + lp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] : 0; + + mv_stats->total_mv_rate += comp_rate; + mv_stats->hp_total_mv_rate += hp_rate; + mv_stats->lp_total_mv_rate += lp_rate; + } +} + +static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats, + const AV1_COMP *cpi, int mi_row, + int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) { + return; + } + + const MB_MODE_INFO *mbmi = + mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col]; + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = + cpi->mbmi_ext_info.frame_base + + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, + cpi->mbmi_ext_info.stride); + + if (!is_inter_block(mbmi)) { + mv_stats->intra_count++; + return; + } + mv_stats->inter_count++; + + const PREDICTION_MODE mode = mbmi->mode; + const int is_compound = has_second_ref(mbmi); + + if (mode == NEWMV || mode == NEW_NEWMV) { + // All mvs are new + for (int ref_idx = 0; ref_idx < 1 + is_compound; ++ref_idx) { + const MV ref_mv = + get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; + const MV cur_mv = mbmi->mv[ref_idx].as_mv; + keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV || + mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + // has exactly one new_mv + mv_stats->default_mvs += 1; + + const int ref_idx = (mode == NEAREST_NEWMV || mode == NEAR_NEWMV); + const MV ref_mv = + get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; + const MV cur_mv = mbmi->mv[ref_idx].as_mv; + + keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); + } else { + // No new_mv + mv_stats->default_mvs += 1 + is_compound; + } + + // Add texture information + const BLOCK_SIZE bsize = mbmi->bsize; + const int num_rows = block_size_high[bsize]; + const int num_cols = block_size_wide[bsize]; + const int y_stride = cpi->source->y_stride; + const int px_row = 4 * mi_row, px_col = 4 * mi_col; + const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; + const int bd = cm->seq_params->bit_depth; + if (buf_is_hbd) { + uint16_t *source_buf = + CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col; + for (int row = 0; row < num_rows - 1; row++) { + for (int col = 0; col < num_cols - 1; col++) { + const int offset = row * y_stride + col; + const int horz_diff = + 
abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8); + const int vert_diff = + abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8); + mv_stats->horz_text += horz_diff; + mv_stats->vert_text += vert_diff; + mv_stats->diag_text += horz_diff * vert_diff; + } + } + } else { + uint8_t *source_buf = cpi->source->y_buffer + px_row * y_stride + px_col; + for (int row = 0; row < num_rows - 1; row++) { + for (int col = 0; col < num_cols - 1; col++) { + const int offset = row * y_stride + col; + const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]); + const int vert_diff = + abs(source_buf[offset + y_stride] - source_buf[offset]); + mv_stats->horz_text += horz_diff; + mv_stats->vert_text += vert_diff; + mv_stats->diag_text += horz_diff * vert_diff; + } + } + } +} + +// Split block +static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats, + const AV1_COMP *cpi, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *cm = &cpi->common; + + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + const int hbs = mi_size_wide[bsize] / 2; + const int qbs = mi_size_wide[bsize] / 4; + switch (partition) { + case PARTITION_NONE: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + break; + case PARTITION_HORZ: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col + hbs, subsize); + break; + case PARTITION_HORZ_A: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (int i = 0; i < 4; ++i) { + const int this_mi_row = mi_row + i * qbs; + collect_mv_stats_b(mv_stats, cpi, this_mi_row, mi_col); + } + break; + case PARTITION_VERT_4: + for (int i = 0; i < 4; ++i) { + const int this_mi_col = mi_col + i * qbs; + collect_mv_stats_b(mv_stats, cpi, mi_row, this_mi_col); + } + break; + default: assert(0); + } +} + +static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats, + const AV1_COMP *cpi, + const TileInfo *tile_info) { + const AV1_COMMON *cm = &cpi->common; + const int mi_row_start = tile_info->mi_row_start; + const int mi_row_end = tile_info->mi_row_end; + const int mi_col_start = 
tile_info->mi_col_start; + const int mi_col_end = tile_info->mi_col_end; + const int sb_size_mi = cm->seq_params->mib_size; + BLOCK_SIZE sb_size = cm->seq_params->sb_size; + for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) { + for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) { + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size); + } + } +} + +void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) { + MV_STATS *mv_stats = &cpi->mv_stats; + const AV1_COMMON *cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + av1_tile_set_col(&tile_info, cm, tile_col); + cpi->tile_data[tile_idx].tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &cpi->tile_data[tile_idx].tctx; + collect_mv_stats_tile(mv_stats, cpi, &tile_info); + } + } + + mv_stats->q = current_q; + mv_stats->order = cpi->common.current_frame.order_hint; + mv_stats->valid = 1; +} + +static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats, + int current_q) { + const AV1_COMMON *cm = &cpi->common; + const int order_hint = cpi->common.current_frame.order_hint; + const int order_diff = order_hint - mv_stats->order; + const float area = (float)(cm->width * cm->height); + float features[MV_PREC_FEATURE_SIZE] = { + (float)current_q, + (float)mv_stats->q, + (float)order_diff, + mv_stats->inter_count / area, + mv_stats->intra_count / area, + mv_stats->default_mvs / area, + mv_stats->mv_joint_count[0] / area, + mv_stats->mv_joint_count[1] / area, + mv_stats->mv_joint_count[2] / area, + mv_stats->mv_joint_count[3] / area, + mv_stats->last_bit_zero / area, + mv_stats->last_bit_nonzero / area, + mv_stats->total_mv_rate / area, + mv_stats->hp_total_mv_rate / area, + mv_stats->lp_total_mv_rate / area, + mv_stats->horz_text / area, + mv_stats->vert_text / area, + mv_stats->diag_text / area, + }; + + for (int f_idx = 0; f_idx < MV_PREC_FEATURE_SIZE; f_idx++) { + features[f_idx] = + (features[f_idx] - av1_mv_prec_mean[f_idx]) / av1_mv_prec_std[f_idx]; + } + float score = 0.0f; + + av1_nn_predict(features, &av1_mv_prec_dnn_config, 1, &score); + + const int use_high_hp = score >= 0.0f; + return use_high_hp; +} +#endif // !CONFIG_REALTIME_ONLY + +void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) { + int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH; +#if !CONFIG_REALTIME_ONLY + MV_STATS *mv_stats = &cpi->mv_stats; +#endif // !CONFIG_REALTIME_ONLY + + if (cpi->sf.hl_sf.high_precision_mv_usage == QTR_ONLY) { + use_hp = 0; + } +#if !CONFIG_REALTIME_ONLY + else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && + av1_frame_allows_smart_mv(cpi) && mv_stats->valid) { + use_hp = get_smart_mv_prec(cpi, mv_stats, qindex); + } +#endif // !CONFIG_REALTIME_ONLY + + av1_set_high_precision_mv(cpi, use_hp, + cpi->common.features.cur_frame_force_integer_mv); +} diff --git a/third_party/aom/av1/encoder/mv_prec.h b/third_party/aom/av1/encoder/mv_prec.h new file mode 100644 index 0000000000..55108b6cdb --- /dev/null +++ b/third_party/aom/av1/encoder/mv_prec.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MV_PREC_H_ +#define AOM_AV1_ENCODER_MV_PREC_H_ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/speed_features.h" + +// Q threshold for high precision mv. +#define HIGH_PRECISION_MV_QTHRESH 128 +#if !CONFIG_REALTIME_ONLY +void av1_collect_mv_stats(AV1_COMP *cpi, int current_q); + +static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { + const int gf_group_index = cpi->gf_frame_index; + const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index]; + return !frame_is_intra_only(&cpi->common) && + !(gf_update_type == INTNL_OVERLAY_UPDATE || + gf_update_type == OVERLAY_UPDATE); +} +#endif // !CONFIG_REALTIME_ONLY + +static AOM_INLINE void av1_set_high_precision_mv( + AV1_COMP *cpi, int allow_high_precision_mv, + int cur_frame_force_integer_mv) { + MvCosts *const mv_costs = cpi->td.mb.mv_costs; + // Avoid accessing 'mv_costs' when it is not allocated. + if (mv_costs == NULL) return; + + const int copy_hp = cpi->common.features.allow_high_precision_mv = + allow_high_precision_mv && !cur_frame_force_integer_mv; + + mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX]; + mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX]; + mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX]; + mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX]; + mv_costs->mv_cost_stack = + copy_hp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost; +} + +void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex); + +#endif // AOM_AV1_ENCODER_MV_PREC_H_ diff --git a/third_party/aom/av1/encoder/nonrd_opt.c b/third_party/aom/av1/encoder/nonrd_opt.c new file mode 100644 index 0000000000..651ca43a2e --- /dev/null +++ b/third_party/aom/av1/encoder/nonrd_opt.c @@ -0,0 +1,933 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/reconinter.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/nonrd_opt.h" +#include "av1/encoder/rdopt.h" + +static const SCAN_ORDER av1_fast_idtx_scan_order_16x16 = { + av1_fast_idtx_scan_16x16, av1_fast_idtx_iscan_16x16 +}; + +#define DECLARE_BLOCK_YRD_BUFFERS() \ + DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \ + DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]); \ + DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]); \ + uint16_t eob[1]; + +#define DECLARE_BLOCK_YRD_VARS() \ + /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the \ + * entire bsize and write macroblock_plane::coeff. So low_coeff is kept \ + * as a non-const so we can reassign it to macroblock_plane::coeff. 
*/ \ + int16_t *low_coeff = (int16_t *)coeff_buf; \ + int16_t *const low_qcoeff = (int16_t *)qcoeff_buf; \ + int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf; \ + const int diff_stride = bw; + +#define DECLARE_LOOP_VARS_BLOCK_YRD() \ + const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2]; + +static AOM_FORCE_INLINE void update_yrd_loop_vars( + MACROBLOCK *x, int *skippable, int step, int ncoeffs, + int16_t *const low_coeff, int16_t *const low_qcoeff, + int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost, + int tx_blk_id) { + const int is_txfm_skip = (ncoeffs == 0); + *skippable &= is_txfm_skip; + x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip; + *eob_cost += get_msb(ncoeffs + 1); + if (ncoeffs == 1) + this_rdc->rate += (int)abs(low_qcoeff[0]); + else if (ncoeffs > 1) + this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4); + + this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2; +} + +static INLINE void aom_process_hadamard_lp_8x16(MACROBLOCK *x, + int max_blocks_high, + int max_blocks_wide, + int num_4x4_w, int step, + int block_step) { + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + const int bw = 4 * num_4x4_w; + const int num_4x4 = AOMMIN(num_4x4_w, max_blocks_wide); + int block = 0; + + for (int r = 0; r < max_blocks_high; r += block_step) { + for (int c = 0; c < num_4x4; c += 2 * block_step) { + const int16_t *src_diff = &p->src_diff[(r * bw + c) << 2]; + int16_t *low_coeff = (int16_t *)p->coeff + BLOCK_OFFSET(block); + aom_hadamard_lp_8x8_dual(src_diff, (ptrdiff_t)bw, low_coeff); + block += 2 * step; + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +#define DECLARE_BLOCK_YRD_HBD_VARS() \ + tran_low_t *const coeff = coeff_buf; \ + tran_low_t *const qcoeff = qcoeff_buf; \ + tran_low_t *const dqcoeff = dqcoeff_buf; + +static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd( + MACROBLOCK *x, int *skippable, int step, int ncoeffs, + tran_low_t *const coeff, tran_low_t *const qcoeff, + tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost, + int tx_blk_id) { + const MACROBLOCKD *xd = &x->e_mbd; + const int is_txfm_skip = (ncoeffs == 0); + *skippable &= is_txfm_skip; + x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip; + *eob_cost += get_msb(ncoeffs + 1); + + int64_t dummy; + if (ncoeffs == 1) + this_rdc->rate += (int)abs(qcoeff[0]); + else if (ncoeffs > 1) + this_rdc->rate += aom_satd(qcoeff, step << 4); + this_rdc->dist += + av1_highbd_block_error(coeff, dqcoeff, step << 4, &dummy, xd->bd) >> 2; +} +#endif + +/*!\brief Calculates RD Cost using Hadamard transform. + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Calculates RD Cost using Hadamard transform. For low bit depth this function + * uses low-precision set of functions (16-bit) and 32 bit for high bit depth + * \param[in] x Pointer to structure holding all the data for + the current macroblock + * \param[in] this_rdc Pointer to calculated RD Cost + * \param[in] skippable Pointer to a flag indicating possible tx skip + * \param[in] bsize Current block size + * \param[in] tx_size Transform size + * \param[in] is_inter_mode Flag to indicate inter mode + * + * \remark Nothing is returned. Instead, calculated RD cost is placed to + * \c this_rdc. 
\c skippable flag is set if there is no non-zero quantized + * coefficients for Hadamard transform + */ +void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + const struct macroblockd_plane *pd = &xd->plane[AOM_PLANE_Y]; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + assert(bsize < BLOCK_SIZES_ALL); + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + const int step = 1 << (tx_size << 1); + const int block_step = (1 << tx_size); + const int row_step = step * num_4x4_w >> tx_size; + int block = 0; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5); + int eob_cost = 0; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + const int use_hbd = is_cur_buf_hbd(xd); + int num_blk_skip_w = num_4x4_w; + +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride); + } else { + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + } +#else + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); +#endif + + // Keep the intermediate value on the stack here. Writing directly to + // skippable causes speed regression due to load-and-store issues in + // update_yrd_loop_vars. + int temp_skippable = 1; + this_rdc->dist = 0; + this_rdc->rate = 0; + // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks + // can be done per function call. Hence the call of Hadamard txfm is + // abstracted here for the specified cases. + int is_tx_8x8_dual_applicable = + (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 && + block_size_high[bsize] >= 8); + +#if CONFIG_AV1_HIGHBITDEPTH + // As of now, dual implementation of hadamard txfm is available for low + // bitdepth. + if (use_hbd) is_tx_8x8_dual_applicable = 0; +#endif + + if (is_tx_8x8_dual_applicable) { + aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w, + step, block_step); + } + + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + DECLARE_BLOCK_YRD_BUFFERS() + DECLARE_BLOCK_YRD_VARS() +#if CONFIG_AV1_HIGHBITDEPTH + DECLARE_BLOCK_YRD_HBD_VARS() +#else + (void)use_hbd; +#endif + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (int r = 0; r < max_blocks_high; r += block_step) { + for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) { + DECLARE_LOOP_VARS_BLOCK_YRD() + + switch (tx_size) { +#if CONFIG_AV1_HIGHBITDEPTH + case TX_16X16: + if (use_hbd) { + aom_hadamard_16x16(src_diff, diff_stride, coeff); + av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, + dqcoeff, p->dequant_QTX, eob, + // default_scan_fp_16x16_transpose and + // av1_default_iscan_fp_16x16_transpose have to be + // used together. 
+ default_scan_fp_16x16_transpose, + av1_default_iscan_fp_16x16_transpose); + } else { + aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff); + av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, + p->quant_fp_QTX, low_qcoeff, low_dqcoeff, + p->dequant_QTX, eob, + // default_scan_lp_16x16_transpose and + // av1_default_iscan_lp_16x16_transpose have to be + // used together. + default_scan_lp_16x16_transpose, + av1_default_iscan_lp_16x16_transpose); + } + break; + case TX_8X8: + if (use_hbd) { + aom_hadamard_8x8(src_diff, diff_stride, coeff); + av1_quantize_fp( + coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob, + default_scan_8x8_transpose, av1_default_iscan_8x8_transpose); + } else { + if (is_tx_8x8_dual_applicable) { + // The coeffs are pre-computed for the whole block, so re-assign + // low_coeff to the appropriate location. + const int block_offset = BLOCK_OFFSET(block + s); + low_coeff = (int16_t *)p->coeff + block_offset; + } else { + aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff); + } + av1_quantize_lp( + low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, + low_dqcoeff, p->dequant_QTX, eob, + // default_scan_8x8_transpose and + // av1_default_iscan_8x8_transpose have to be used together. + default_scan_8x8_transpose, av1_default_iscan_8x8_transpose); + } + break; + default: + assert(tx_size == TX_4X4); + // In tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate + // normal coefficients order, so we don't need to change the scan + // order here. + if (use_hbd) { + aom_fdct4x4(src_diff, coeff, diff_stride); + av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, + dqcoeff, p->dequant_QTX, eob, scan_order->scan, + scan_order->iscan); + } else { + aom_fdct4x4_lp(src_diff, low_coeff, diff_stride); + av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + scan_order->scan, scan_order->iscan); + } + break; +#else + case TX_16X16: + aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff); + av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + default_scan_lp_16x16_transpose, + av1_default_iscan_lp_16x16_transpose); + break; + case TX_8X8: + if (is_tx_8x8_dual_applicable) { + // The coeffs are pre-computed for the whole block, so re-assign + // low_coeff to the appropriate location. 
+ const int block_offset = BLOCK_OFFSET(block + s); + low_coeff = (int16_t *)p->coeff + block_offset; + } else { + aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff); + } + av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + default_scan_8x8_transpose, + av1_default_iscan_8x8_transpose); + break; + default: + aom_fdct4x4_lp(src_diff, low_coeff, diff_stride); + av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + scan_order->scan, scan_order->iscan); + break; +#endif + } + assert(*eob <= 1024); +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) + update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff, + dqcoeff, this_rdc, &eob_cost, + r * num_blk_skip_w + c); + else +#endif + update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff, + low_qcoeff, low_dqcoeff, this_rdc, &eob_cost, + r * num_blk_skip_w + c); + } + block += row_step; + } + + this_rdc->skip_txfm = *skippable = temp_skippable; + if (this_rdc->sse < INT64_MAX) { + this_rdc->sse = (this_rdc->sse << 6) >> 2; + if (temp_skippable) { + this_rdc->dist = 0; + this_rdc->dist = this_rdc->sse; + return; + } + } + + // If skippable is set, rate gets clobbered later. + this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT); + this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT); +} + +// Explicitly enumerate the cases so the compiler can generate SIMD for the +// function. According to the disassembler, gcc generates SSE codes for each of +// the possible block sizes. The hottest case is tx_width 16, which takes up +// about 8% of the self cycle of av1_nonrd_pick_inter_mode_sb. Since +// av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the +// potential room of improvement for writing AVX2 optimization is only 3% * 8% = +// 0.24% of total encoding time. +static AOM_INLINE void scale_square_buf_vals(int16_t *dst, int tx_width, + const int16_t *src, + int src_stride) { +#define DO_SCALING \ + do { \ + for (int idy = 0; idy < tx_width; ++idy) { \ + for (int idx = 0; idx < tx_width; ++idx) { \ + dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \ + } \ + } \ + } while (0) + + if (tx_width == 4) { + DO_SCALING; + } else if (tx_width == 8) { + DO_SCALING; + } else if (tx_width == 16) { + DO_SCALING; + } else { + assert(0); + } + +#undef DO_SCALING +} + +/*!\brief Calculates RD Cost when the block uses Identity transform. + * Note that this function is only for low bit depth encoding, since it + * is called in real-time mode for now, which sets high bit depth to 0: + * -DCONFIG_AV1_HIGHBITDEPTH=0 + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Calculates RD Cost. For low bit depth this function + * uses low-precision set of functions (16-bit) and 32 bit for high bit depth + * \param[in] x Pointer to structure holding all the data for + the current macroblock + * \param[in] pred_buf Pointer to the prediction buffer + * \param[in] pred_stride Stride for the prediction buffer + * \param[in] this_rdc Pointer to calculated RD Cost + * \param[in] skippable Pointer to a flag indicating possible tx skip + * \param[in] bsize Current block size + * \param[in] tx_size Transform size + * + * \remark Nothing is returned. Instead, calculated RD cost is placed to + * \c this_rdc. \c skippable flag is set if all coefficients are zero. 
+ */ +void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf, + int pred_stride, RD_STATS *this_rdc, int *skippable, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + assert(bsize < BLOCK_SIZES_ALL); + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + const int step = 1 << (tx_size << 1); + const int block_step = (1 << tx_size); + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5); + int eob_cost = 0; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + const int num_blk_skip_w = num_4x4_w; + // Keep the intermediate value on the stack here. Writing directly to + // skippable causes speed regression due to load-and-store issues in + // update_yrd_loop_vars. + int temp_skippable = 1; + int tx_wd = 0; + const SCAN_ORDER *scan_order = NULL; + switch (tx_size) { + case TX_64X64: + assert(0); // Not implemented + break; + case TX_32X32: + assert(0); // Not used + break; + case TX_16X16: + scan_order = &av1_fast_idtx_scan_order_16x16; + tx_wd = 16; + break; + case TX_8X8: + scan_order = &av1_fast_idtx_scan_order_8x8; + tx_wd = 8; + break; + default: + assert(tx_size == TX_4X4); + scan_order = &av1_fast_idtx_scan_order_4x4; + tx_wd = 4; + break; + } + assert(scan_order != NULL); + + this_rdc->dist = 0; + this_rdc->rate = 0; + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pred_buf, pred_stride); + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + DECLARE_BLOCK_YRD_BUFFERS() + DECLARE_BLOCK_YRD_VARS() + for (int r = 0; r < max_blocks_high; r += block_step) { + for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) { + DECLARE_LOOP_VARS_BLOCK_YRD() + scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride); + av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX, + p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, + eob, scan_order->scan, scan_order->iscan); + assert(*eob <= 1024); + update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff, + low_qcoeff, low_dqcoeff, this_rdc, &eob_cost, + r * num_blk_skip_w + c); + } + } + this_rdc->skip_txfm = *skippable = temp_skippable; + if (this_rdc->sse < INT64_MAX) { + this_rdc->sse = (this_rdc->sse << 6) >> 2; + if (temp_skippable) { + this_rdc->dist = 0; + this_rdc->dist = this_rdc->sse; + return; + } + } + // If skippable is set, rate gets clobbered later. + this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT); + this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT); +} + +int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + RD_STATS *this_rdc, int start_plane, + int stop_plane) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
+ unsigned int sse; + int rate; + int64_t dist; + int plane; + int64_t tot_sse = 0; + + this_rdc->rate = 0; + this_rdc->dist = 0; + this_rdc->skip_txfm = 0; + + for (plane = start_plane; plane <= stop_plane; ++plane) { + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const uint32_t dc_quant = p->dequant_QTX[0]; + const uint32_t ac_quant = p->dequant_QTX[1]; + const BLOCK_SIZE bs = plane_bsize; + unsigned int var; + if (!x->color_sensitivity[COLOR_SENS_IDX(plane)]) continue; + + var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); + assert(sse >= var); + tot_sse += sse; + + av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs], + dc_quant >> 3, &rate, &dist); + + this_rdc->rate += rate >> 1; + this_rdc->dist += dist << 3; + + av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3, + &rate, &dist); + + this_rdc->rate += rate; + this_rdc->dist += dist << 4; + } + + if (this_rdc->rate == 0) { + this_rdc->skip_txfm = 1; + } + + if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >= + RDCOST(x->rdmult, 0, tot_sse << 4)) { + this_rdc->rate = 0; + this_rdc->dist = tot_sse << 4; + this_rdc->skip_txfm = 1; + } + + return tot_sse; +} + +static void compute_intra_yprediction(const AV1_COMMON *cm, + PREDICTION_MODE mode, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd) { + const SequenceHeader *seq_params = cm->seq_params; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + int plane = 0; + int row, col; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (row = 0; row < max_blocks_high; row += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) { + p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0, + FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride, + 0, 0, plane); + } + } + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; +} + +// Checks whether Intra mode needs to be pruned based on +// 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_blksad' +// speed features. 
+static INLINE bool is_prune_intra_mode( + AV1_COMP *cpi, int mode_index, int force_intra_check, BLOCK_SIZE bsize, + uint8_t segment_id, SOURCE_SAD source_sad_nonrd, + uint8_t color_sensitivity[MAX_MB_PLANE - 1]) { + const PREDICTION_MODE this_mode = intra_mode_list[mode_index]; + if (mode_index > 2 || force_intra_check == 0) { + if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize])) + return true; + + if (this_mode == DC_PRED) return false; + + if (!cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad) return false; + + const bool has_color_sensitivity = + color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] && + color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]; + if (has_color_sensitivity && + (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad || + cyclic_refresh_segment_id_boosted(segment_id) || + source_sad_nonrd > kMedSad)) + return false; + + return true; + } + return false; +} + +/*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case. + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Calculates RD Cost for an intra mode for a single TX block using Hadamard + * transform. + * \param[in] plane Color plane + * \param[in] block Index of a TX block in a prediction block + * \param[in] row Row of a current TX block + * \param[in] col Column of a current TX block + * \param[in] plane_bsize Block size of a current prediction block + * \param[in] tx_size Transform size + * \param[in] arg Pointer to a structure that holds parameters + * for intra mode search + * + * \remark Nothing is returned. Instead, best mode and RD Cost of the best mode + * are set in \c args->rdc and \c args->mode + */ +void av1_estimate_block_intra(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct estimate_block_intra_args *const args = arg; + AV1_COMP *const cpi = args->cpi; + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int64_t src_stride = p->src.stride; + const int64_t dst_stride = pd->dst.stride; + + (void)block; + + av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + + if (args->prune_mode_based_on_sad) { + unsigned int this_sad = cpi->ppi->fn_ptr[plane_bsize].sdf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); + const unsigned int sad_threshold = + args->best_sad != UINT_MAX ? args->best_sad + (args->best_sad >> 4) + : UINT_MAX; + // Skip the evaluation of current mode if its SAD is more than a threshold. + if (this_sad > sad_threshold) { + // For the current mode, set rate and distortion to maximum possible + // values and return. + // Note: args->rdc->rate is checked in av1_nonrd_pick_intra_mode() to skip + // the evaluation of the current mode. 
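+      // INT_MAX / INT64_MAX act as sentinel values for that check.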
+ args->rdc->rate = INT_MAX; + args->rdc->dist = INT64_MAX; + return; + } + if (this_sad < args->best_sad) { + args->best_sad = this_sad; + } + } + + RD_STATS this_rdc; + av1_invalid_rd_stats(&this_rdc); + + p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; + + if (plane == 0) { + av1_block_yrd(x, &this_rdc, &args->skippable, bsize_tx, + AOMMIN(tx_size, TX_16X16)); + } else { + av1_model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, plane, plane); + } + + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; + assert(args->rdc->rate != INT_MAX && args->rdc->dist != INT64_MAX); + args->rdc->rate += this_rdc.rate; + args->rdc->dist += this_rdc.dist; +} + +/*!\brief Estimates best intra mode for inter mode search + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * + * Using heuristics based on the best inter mode, block size, and other factors, + * decides whether to check intra modes. If so, estimates and selects the best + * intra mode from the reduced set of intra modes (max 4 intra modes checked) + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the + * data for the current macroblock + * \param[in] bsize Current block size + * \param[in] best_early_term Flag, indicating that TX for the + * best inter mode was skipped + * \param[in] ref_cost_intra Cost of signalling intra mode + * \param[in] reuse_prediction Flag, indicating prediction re-use + * \param[in] orig_dst Original destination buffer + * \param[in] tmp_buffers Pointer to temporary buffers for + * prediction re-use + * \param[out] this_mode_pred Pointer to store prediction buffer + * for prediction re-use + * \param[in] best_rdc Pointer to RD cost for the best + * selected intra mode + * \param[in] best_pickmode Pointer to a structure containing + * best mode picked so far + * \param[in] ctx Pointer to structure holding coding + * contexts and modes for the block + * + * \remark Nothing is returned.
Instead, calculated RD cost is placed to + * \c best_rdc and best selected mode is placed to \c best_pickmode + * + */ +void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int best_early_term, unsigned int ref_cost_intra, + int reuse_prediction, struct buf_2d *orig_dst, + PRED_BUFFER *tmp_buffers, + PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc, + BEST_PICKMODE *best_pickmode, + PICK_MODE_CONTEXT *ctx) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + const unsigned char segment_id = mi->segment_id; + const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize]; + const bool is_screen_content = + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; + + const CommonQuantParams *quant_params = &cm->quant_params; + + RD_STATS this_rdc; + + int intra_cost_penalty = av1_get_intra_cost_penalty( + quant_params->base_qindex, quant_params->y_dc_delta_q, + cm->seq_params->bit_depth); + int64_t inter_mode_thresh = + RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0); + int perform_intra_pred = rt_sf->check_intra_pred_nonrd; + int force_intra_check = 0; + // For spatial enhancement layer: turn off intra prediction if the + // previous spatial layer as golden ref is not chosen as best reference. + // only do this for temporal enhancement layer and on non-key frames. + if (cpi->svc.spatial_layer_id > 0 && + best_pickmode->best_ref_frame != GOLDEN_FRAME && + cpi->svc.temporal_layer_id > 0 && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) + perform_intra_pred = 0; + + int do_early_exit_rdthresh = 1; + + uint32_t spatial_var_thresh = 50; + int motion_thresh = 32; + // Adjust thresholds to make intra mode likely tested if the other + // references (golden, alt) are skipped/not checked. For now always + // adjust for svc mode. + if (cpi->ppi->use_svc || (rt_sf->use_nonrd_altref_frame == 0 && + rt_sf->nonrd_prune_ref_frame_search > 0)) { + spatial_var_thresh = 150; + motion_thresh = 0; + } + + // Some adjustments to checking intra mode based on source variance. + if (x->source_variance < spatial_var_thresh) { + // If the best inter mode is large motion or non-LAST ref reduce intra cost + // penalty, so intra mode is more likely tested. + if (best_rdc->rdcost != INT64_MAX && + (best_pickmode->best_ref_frame != LAST_FRAME || + abs(mi->mv[0].as_mv.row) >= motion_thresh || + abs(mi->mv[0].as_mv.col) >= motion_thresh)) { + intra_cost_penalty = intra_cost_penalty >> 2; + inter_mode_thresh = + RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0); + do_early_exit_rdthresh = 0; + } + if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) && + x->content_state_sb.source_sad_nonrd >= kHighSad) || + (is_screen_content && x->source_variance < 50 && + ((bsize >= BLOCK_32X32 && + x->content_state_sb.source_sad_nonrd != kZeroSad) || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1))) + force_intra_check = 1; + // For big blocks worth checking intra (since only DC will be checked), + // even if best_early_term is set. 
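+    // Clearing best_early_term below keeps the intra check alive for blocks
+    // of BLOCK_32X32 and larger (see the perform_intra_pred /
+    // best_early_term gate further down).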
+ if (bsize >= BLOCK_32X32) best_early_term = 0; + } else if (rt_sf->source_metrics_sb_nonrd && + x->content_state_sb.source_sad_nonrd <= kLowSad) { + perform_intra_pred = 0; + } + + if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) { + if (rt_sf->skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV) + perform_intra_pred = 0; + else if (rt_sf->skip_intra_pred == 2) + perform_intra_pred = 0; + } + + if (!(best_rdc->rdcost == INT64_MAX || force_intra_check || + (perform_intra_pred && !best_early_term && + bsize <= cpi->sf.part_sf.max_intra_bsize))) { + return; + } + + // Early exit based on RD cost calculated using known rate. When + // is_screen_content is true, more bias is given to intra modes. Hence, + // considered conservative threshold in early exit for the same. + const int64_t known_rd = is_screen_content + ? CALC_BIASED_RDCOST(inter_mode_thresh) + : inter_mode_thresh; + if (known_rd > best_rdc->rdcost) return; + + struct estimate_block_intra_args args; + init_estimate_block_intra_args(&args, cpi, x); + TX_SIZE intra_tx_size = AOMMIN( + AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]), + TX_16X16); + if (is_screen_content && cpi->rc.high_source_sad && + x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16) + intra_tx_size = TX_4X4; + + PRED_BUFFER *const best_pred = best_pickmode->best_pred; + if (reuse_prediction && best_pred != NULL) { + const int bh = block_size_high[bsize]; + const int bw = block_size_wide[bsize]; + if (best_pred->data == orig_dst->buf) { + *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)]; + aom_convolve_copy(best_pred->data, best_pred->stride, + (*this_mode_pred)->data, (*this_mode_pred)->stride, bw, + bh); + best_pickmode->best_pred = *this_mode_pred; + } + } + pd->dst = *orig_dst; + + for (int midx = 0; midx < RTC_INTRA_MODES; ++midx) { + const PREDICTION_MODE this_mode = intra_mode_list[midx]; + const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)]; + const int64_t mode_rd_thresh = rd_threshes[mode_index]; + + if (is_prune_intra_mode(cpi, midx, force_intra_check, bsize, segment_id, + x->content_state_sb.source_sad_nonrd, + x->color_sensitivity)) + continue; + + if (is_screen_content && rt_sf->source_metrics_sb_nonrd) { + // For spatially flat blocks with zero motion only check + // DC mode. + if (x->content_state_sb.source_sad_nonrd == kZeroSad && + x->source_variance == 0 && this_mode != DC_PRED) + continue; + // Only test Intra for big blocks if spatial_variance is small. + else if (bsize > BLOCK_32X32 && x->source_variance > 50) + continue; + } + + if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh, + rd_thresh_freq_fact[mode_index]) && + (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) { + continue; + } + const BLOCK_SIZE uv_bsize = + get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); + + mi->mode = this_mode; + mi->ref_frame[0] = INTRA_FRAME; + mi->ref_frame[1] = NONE_FRAME; + + av1_invalid_rd_stats(&this_rdc); + args.mode = this_mode; + args.skippable = 1; + args.rdc = &this_rdc; + mi->tx_size = intra_tx_size; + compute_intra_yprediction(cm, this_mode, bsize, x, xd); + // Look into selecting tx_size here, based on prediction residual. 
+ av1_block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size); + // TODO(kyslov@) Need to account for skippable + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) { + av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_U, + av1_estimate_block_intra, &args); + } + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) { + av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_V, + av1_estimate_block_intra, &args); + } + + int mode_cost = 0; + if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) { + mode_cost += + x->mode_costs.angle_delta_cost[this_mode - V_PRED] + [MAX_ANGLE_DELTA + + mi->angle_delta[PLANE_TYPE_Y]]; + } + if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + mode_cost += x->mode_costs.filter_intra_cost[bsize][0]; + } + this_rdc.rate += ref_cost_intra; + this_rdc.rate += intra_cost_penalty; + this_rdc.rate += mode_cost; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + + if (is_screen_content && rt_sf->source_metrics_sb_nonrd) { + // For blocks with low spatial variance and color sad, + // favor the intra-modes, only on scene/slide change. + if (cpi->rc.high_source_sad && x->source_variance < 800 && + (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) + this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost); + // Otherwise bias against intra for blocks with zero + // motion and no color, on non-scene/slide changes. + else if (!cpi->rc.high_source_sad && x->source_variance > 0 && + x->content_state_sb.source_sad_nonrd == kZeroSad && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) + this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1; + } + + if (this_rdc.rdcost < best_rdc->rdcost) { + *best_rdc = this_rdc; + best_pickmode->best_mode = this_mode; + best_pickmode->best_tx_size = mi->tx_size; + best_pickmode->best_ref_frame = INTRA_FRAME; + best_pickmode->best_second_ref_frame = NONE; + best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm; + mi->uv_mode = this_mode; + mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; + if (!this_rdc.skip_txfm) + memset(ctx->blk_skip, 0, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + } + } + if (best_pickmode->best_ref_frame == INTRA_FRAME) + memset(ctx->blk_skip, 0, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + mi->tx_size = best_pickmode->best_tx_size; +} diff --git a/third_party/aom/av1/encoder/nonrd_opt.h b/third_party/aom/av1/encoder/nonrd_opt.h new file mode 100644 index 0000000000..a53578ebad --- /dev/null +++ b/third_party/aom/av1/encoder/nonrd_opt.h @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_NONRD_OPT_H_ +#define AOM_AV1_ENCODER_NONRD_OPT_H_ + +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/rdopt.h" + +#define RTC_INTER_MODES (4) +#define RTC_INTRA_MODES (4) +#define RTC_MODES (AOMMAX(RTC_INTER_MODES, RTC_INTRA_MODES)) +#define CALC_BIASED_RDCOST(rdcost) (7 * (rdcost) >> 3) +#define NUM_COMP_INTER_MODES_RT (6) +#define NUM_INTER_MODES 12 +#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \ + (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false) +#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16) +#define FILTER_SEARCH_SIZE 2 +#if !CONFIG_REALTIME_ONLY +#define MOTION_MODE_SEARCH_SIZE 2 +#endif + +extern int g_pick_inter_mode_cnt; +/*!\cond */ +typedef struct { + uint8_t *data; + int stride; + int in_use; +} PRED_BUFFER; + +typedef struct { + PRED_BUFFER *best_pred; + PREDICTION_MODE best_mode; + TX_SIZE best_tx_size; + TX_TYPE tx_type; + MV_REFERENCE_FRAME best_ref_frame; + MV_REFERENCE_FRAME best_second_ref_frame; + uint8_t best_mode_skip_txfm; + uint8_t best_mode_initial_skip_flag; + int_interpfilters best_pred_filter; + MOTION_MODE best_motion_mode; + WarpedMotionParams wm_params; + int num_proj_ref; + PALETTE_MODE_INFO pmi; + int64_t best_sse; +} BEST_PICKMODE; + +typedef struct { + MV_REFERENCE_FRAME ref_frame; + PREDICTION_MODE pred_mode; +} REF_MODE; + +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; + PREDICTION_MODE pred_mode; +} COMP_REF_MODE; + +struct estimate_block_intra_args { + AV1_COMP *cpi; + MACROBLOCK *x; + PREDICTION_MODE mode; + int skippable; + RD_STATS *rdc; + unsigned int best_sad; + bool prune_mode_based_on_sad; +}; +/*!\endcond */ + +/*!\brief Structure to store parameters and statistics used in non-rd inter mode + * evaluation. + */ +typedef struct { + //! Structure to hold best inter mode data + BEST_PICKMODE best_pickmode; + //! Structure to RD cost of current mode + RD_STATS this_rdc; + //! Pointer to the RD Cost for the best mode found so far + RD_STATS best_rdc; + //! Distortion of chroma planes for all modes and reference frames + int64_t uv_dist[RTC_INTER_MODES][REF_FRAMES]; + //! Buffer to hold predicted block for all reference frames and planes + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; + //! Array to hold variance of all modes and reference frames + unsigned int vars[RTC_INTER_MODES][REF_FRAMES]; + //! Array to hold ref cost of single reference mode for all ref frames + unsigned int ref_costs_single[REF_FRAMES]; + //! Array to hold motion vector for all modes and reference frames + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES]; + //! Array to hold best mv for all modes and reference frames + int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES]; + //! Array to hold inter mode cost of single ref mode for all ref frames + int single_inter_mode_costs[RTC_INTER_MODES][REF_FRAMES]; + //! Array to hold use reference frame mask for each reference frame + int use_ref_frame_mask[REF_FRAMES]; + //! Array to hold flags of evaluated modes for each reference frame + uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES]; + //! Array to hold flag indicating if scaled reference frame is used. 
+ bool use_scaled_ref_frame[REF_FRAMES]; +} InterModeSearchStateNonrd; + +static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2, + 2, 2, 3, 3, 3, 4, + 4, 4, 5, 5 }; +static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1, + 2, 3, 2, 3, 4, 3, + 4, 5, 4, 5 }; + +static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED, + SMOOTH_PRED }; + +static const PREDICTION_MODE inter_mode_list[] = { NEARESTMV, NEARMV, GLOBALMV, + NEWMV }; + +static const THR_MODES mode_idx[REF_FRAMES][RTC_MODES] = { + { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH }, + { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV }, + { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 }, + { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 }, + { THR_NEARESTG, THR_NEARG, THR_GLOBALG, THR_NEWG }, + { THR_NEARESTB, THR_NEARB, THR_GLOBALB, THR_NEWB }, + { THR_NEARESTA2, THR_NEARA2, THR_GLOBALA2, THR_NEWA2 }, + { THR_NEARESTA, THR_NEARA, THR_GLOBALA, THR_NEWA }, +}; + +// GLOBALMV in the set below is in fact ZEROMV as we don't do global ME in RT +// mode +static const REF_MODE ref_mode_set[NUM_INTER_MODES] = { + { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV }, + { LAST_FRAME, GLOBALMV }, { LAST_FRAME, NEWMV }, + { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, + { GOLDEN_FRAME, GLOBALMV }, { GOLDEN_FRAME, NEWMV }, + { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV }, + { ALTREF_FRAME, GLOBALMV }, { ALTREF_FRAME, NEWMV }, +}; + +static const COMP_REF_MODE comp_ref_mode_set[NUM_COMP_INTER_MODES_RT] = { + { { LAST_FRAME, GOLDEN_FRAME }, GLOBAL_GLOBALMV }, + { { LAST_FRAME, GOLDEN_FRAME }, NEAREST_NEARESTMV }, + { { LAST_FRAME, LAST2_FRAME }, GLOBAL_GLOBALMV }, + { { LAST_FRAME, LAST2_FRAME }, NEAREST_NEARESTMV }, + { { LAST_FRAME, ALTREF_FRAME }, GLOBAL_GLOBALMV }, + { { LAST_FRAME, ALTREF_FRAME }, NEAREST_NEARESTMV }, +}; + +static const int_interpfilters filters_ref_set[9] = { + [0].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }, + [1].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH }, + [2].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH }, + [3].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR }, + [4].as_filters = { MULTITAP_SHARP, MULTITAP_SHARP }, + [5].as_filters = { EIGHTTAP_REGULAR, MULTITAP_SHARP }, + [6].as_filters = { MULTITAP_SHARP, EIGHTTAP_REGULAR }, + [7].as_filters = { EIGHTTAP_SMOOTH, MULTITAP_SHARP }, + [8].as_filters = { MULTITAP_SHARP, EIGHTTAP_SMOOTH } +}; + +enum { + // INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV), + INTER_NEAREST = (1 << NEARESTMV), + INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV), + INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV), + INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV), +}; + +// The original scan order (default_scan_8x8) is modified according to the extra +// transpose in hadamard c implementation, i.e., aom_hadamard_lp_8x8_c and +// aom_hadamard_8x8_c. +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8_transpose[64]) = { + 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, + 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, + 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, + 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63 +}; + +// The original scan order (av1_default_iscan_8x8) is modified to match +// hadamard AVX2 implementation, i.e., aom_hadamard_lp_8x8_avx2 and +// aom_hadamard_8x8_avx2. 
Since hadamard AVX2 implementation will modify the +// order of coefficients, such that the normal scan order is no longer +// guaranteed to scan low coefficients first, therefore we modify the scan order +// accordingly. +// Note that this one has to be used together with default_scan_8x8_transpose. +DECLARE_ALIGNED(16, static const int16_t, + av1_default_iscan_8x8_transpose[64]) = { + 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36, + 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49, + 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58, + 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63 +}; + +// The original scan order (default_scan_16x16) is modified according to the +// extra transpose in hadamard c implementation in lp case, i.e., +// aom_hadamard_lp_16x16_c. +DECLARE_ALIGNED(16, static const int16_t, + default_scan_lp_16x16_transpose[256]) = { + 0, 8, 2, 4, 10, 16, 24, 18, 12, 6, 64, 14, 20, 26, 32, + 40, 34, 28, 22, 72, 66, 68, 74, 80, 30, 36, 42, 48, 56, 50, + 44, 38, 88, 82, 76, 70, 128, 78, 84, 90, 96, 46, 52, 58, 1, + 9, 3, 60, 54, 104, 98, 92, 86, 136, 130, 132, 138, 144, 94, 100, + 106, 112, 62, 5, 11, 17, 25, 19, 13, 7, 120, 114, 108, 102, 152, + 146, 140, 134, 192, 142, 148, 154, 160, 110, 116, 122, 65, 15, 21, 27, + 33, 41, 35, 29, 23, 73, 67, 124, 118, 168, 162, 156, 150, 200, 194, + 196, 202, 208, 158, 164, 170, 176, 126, 69, 75, 81, 31, 37, 43, 49, + 57, 51, 45, 39, 89, 83, 77, 71, 184, 178, 172, 166, 216, 210, 204, + 198, 206, 212, 218, 224, 174, 180, 186, 129, 79, 85, 91, 97, 47, 53, + 59, 61, 55, 105, 99, 93, 87, 137, 131, 188, 182, 232, 226, 220, 214, + 222, 228, 234, 240, 190, 133, 139, 145, 95, 101, 107, 113, 63, 121, 115, + 109, 103, 153, 147, 141, 135, 248, 242, 236, 230, 238, 244, 250, 193, 143, + 149, 155, 161, 111, 117, 123, 125, 119, 169, 163, 157, 151, 201, 195, 252, + 246, 254, 197, 203, 209, 159, 165, 171, 177, 127, 185, 179, 173, 167, 217, + 211, 205, 199, 207, 213, 219, 225, 175, 181, 187, 189, 183, 233, 227, 221, + 215, 223, 229, 235, 241, 191, 249, 243, 237, 231, 239, 245, 251, 253, 247, + 255 +}; + +#if CONFIG_AV1_HIGHBITDEPTH +// The original scan order (default_scan_16x16) is modified according to the +// extra shift in hadamard c implementation in fp case, i.e., +// aom_hadamard_16x16_c. Note that 16x16 lp and fp hadamard generate different +// outputs, so we handle them separately. 
+DECLARE_ALIGNED(16, static const int16_t, + default_scan_fp_16x16_transpose[256]) = { + 0, 4, 2, 8, 6, 16, 20, 18, 12, 10, 64, 14, 24, 22, 32, + 36, 34, 28, 26, 68, 66, 72, 70, 80, 30, 40, 38, 48, 52, 50, + 44, 42, 84, 82, 76, 74, 128, 78, 88, 86, 96, 46, 56, 54, 1, + 5, 3, 60, 58, 100, 98, 92, 90, 132, 130, 136, 134, 144, 94, 104, + 102, 112, 62, 9, 7, 17, 21, 19, 13, 11, 116, 114, 108, 106, 148, + 146, 140, 138, 192, 142, 152, 150, 160, 110, 120, 118, 65, 15, 25, 23, + 33, 37, 35, 29, 27, 69, 67, 124, 122, 164, 162, 156, 154, 196, 194, + 200, 198, 208, 158, 168, 166, 176, 126, 73, 71, 81, 31, 41, 39, 49, + 53, 51, 45, 43, 85, 83, 77, 75, 180, 178, 172, 170, 212, 210, 204, + 202, 206, 216, 214, 224, 174, 184, 182, 129, 79, 89, 87, 97, 47, 57, + 55, 61, 59, 101, 99, 93, 91, 133, 131, 188, 186, 228, 226, 220, 218, + 222, 232, 230, 240, 190, 137, 135, 145, 95, 105, 103, 113, 63, 117, 115, + 109, 107, 149, 147, 141, 139, 244, 242, 236, 234, 238, 248, 246, 193, 143, + 153, 151, 161, 111, 121, 119, 125, 123, 165, 163, 157, 155, 197, 195, 252, + 250, 254, 201, 199, 209, 159, 169, 167, 177, 127, 181, 179, 173, 171, 213, + 211, 205, 203, 207, 217, 215, 225, 175, 185, 183, 189, 187, 229, 227, 221, + 219, 223, 233, 231, 241, 191, 245, 243, 237, 235, 239, 249, 247, 253, 251, + 255 +}; +#endif + +// The original scan order (av1_default_iscan_16x16) is modified to match +// hadamard AVX2 implementation, i.e., aom_hadamard_lp_16x16_avx2. +// Since hadamard AVX2 implementation will modify the order of coefficients, +// such that the normal scan order is no longer guaranteed to scan low +// coefficients first, therefore we modify the scan order accordingly. Note that +// this one has to be used together with default_scan_lp_16x16_transpose. +DECLARE_ALIGNED(16, static const int16_t, + av1_default_iscan_lp_16x16_transpose[256]) = { + 0, 44, 2, 46, 3, 63, 9, 69, 1, 45, 4, 64, 8, 68, 11, + 87, 5, 65, 7, 67, 12, 88, 18, 94, 6, 66, 13, 89, 17, 93, + 24, 116, 14, 90, 16, 92, 25, 117, 31, 123, 15, 91, 26, 118, 30, + 122, 41, 148, 27, 119, 29, 121, 42, 149, 48, 152, 28, 120, 43, 150, + 47, 151, 62, 177, 10, 86, 20, 96, 21, 113, 35, 127, 19, 95, 22, + 114, 34, 126, 37, 144, 23, 115, 33, 125, 38, 145, 52, 156, 32, 124, + 39, 146, 51, 155, 58, 173, 40, 147, 50, 154, 59, 174, 73, 181, 49, + 153, 60, 175, 72, 180, 83, 198, 61, 176, 71, 179, 84, 199, 98, 202, + 70, 178, 85, 200, 97, 201, 112, 219, 36, 143, 54, 158, 55, 170, 77, + 185, 53, 157, 56, 171, 76, 184, 79, 194, 57, 172, 75, 183, 80, 195, + 102, 206, 74, 182, 81, 196, 101, 205, 108, 215, 82, 197, 100, 204, 109, + 216, 131, 223, 99, 203, 110, 217, 130, 222, 140, 232, 111, 218, 129, 221, + 141, 233, 160, 236, 128, 220, 142, 234, 159, 235, 169, 245, 78, 193, 104, + 208, 105, 212, 135, 227, 103, 207, 106, 213, 134, 226, 136, 228, 107, 214, + 133, 225, 137, 229, 164, 240, 132, 224, 138, 230, 163, 239, 165, 241, 139, + 231, 162, 238, 166, 242, 189, 249, 161, 237, 167, 243, 188, 248, 190, 250, + 168, 244, 187, 247, 191, 251, 210, 254, 186, 246, 192, 252, 209, 253, 211, + 255 +}; + +#if CONFIG_AV1_HIGHBITDEPTH +// The original scan order (av1_default_iscan_16x16) is modified to match +// hadamard AVX2 implementation, i.e., aom_hadamard_16x16_avx2. +// Since hadamard AVX2 implementation will modify the order of coefficients, +// such that the normal scan order is no longer guaranteed to scan low +// coefficients first, therefore we modify the scan order accordingly. Note that +// this one has to be used together with default_scan_fp_16x16_transpose. 
+DECLARE_ALIGNED(16, static const int16_t, + av1_default_iscan_fp_16x16_transpose[256]) = { + 0, 44, 2, 46, 1, 45, 4, 64, 3, 63, 9, 69, 8, 68, 11, + 87, 5, 65, 7, 67, 6, 66, 13, 89, 12, 88, 18, 94, 17, 93, + 24, 116, 14, 90, 16, 92, 15, 91, 26, 118, 25, 117, 31, 123, 30, + 122, 41, 148, 27, 119, 29, 121, 28, 120, 43, 150, 42, 149, 48, 152, + 47, 151, 62, 177, 10, 86, 20, 96, 19, 95, 22, 114, 21, 113, 35, + 127, 34, 126, 37, 144, 23, 115, 33, 125, 32, 124, 39, 146, 38, 145, + 52, 156, 51, 155, 58, 173, 40, 147, 50, 154, 49, 153, 60, 175, 59, + 174, 73, 181, 72, 180, 83, 198, 61, 176, 71, 179, 70, 178, 85, 200, + 84, 199, 98, 202, 97, 201, 112, 219, 36, 143, 54, 158, 53, 157, 56, + 171, 55, 170, 77, 185, 76, 184, 79, 194, 57, 172, 75, 183, 74, 182, + 81, 196, 80, 195, 102, 206, 101, 205, 108, 215, 82, 197, 100, 204, 99, + 203, 110, 217, 109, 216, 131, 223, 130, 222, 140, 232, 111, 218, 129, 221, + 128, 220, 142, 234, 141, 233, 160, 236, 159, 235, 169, 245, 78, 193, 104, + 208, 103, 207, 106, 213, 105, 212, 135, 227, 134, 226, 136, 228, 107, 214, + 133, 225, 132, 224, 138, 230, 137, 229, 164, 240, 163, 239, 165, 241, 139, + 231, 162, 238, 161, 237, 167, 243, 166, 242, 189, 249, 188, 248, 190, 250, + 168, 244, 187, 247, 186, 246, 192, 252, 191, 251, 210, 254, 209, 253, 211, + 255 +}; +#endif + +// For entropy coding, IDTX shares the scan orders of the other 2D-transforms, +// but the fastest way to calculate the IDTX transform (i.e. no transposes) +// results in coefficients that are a transposition of the entropy coding +// versions. These tables are used as substitute for the scan order for the +// faster version of IDTX. + +// Must be used together with av1_fast_idtx_iscan_4x4 +DECLARE_ALIGNED(16, static const int16_t, + av1_fast_idtx_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + +// Must be used together with av1_fast_idtx_scan_4x4 +DECLARE_ALIGNED(16, static const int16_t, + av1_fast_idtx_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12, + 3, 8, 11, 13, 9, 10, 14, 15 }; + +static const SCAN_ORDER av1_fast_idtx_scan_order_4x4 = { + av1_fast_idtx_scan_4x4, av1_fast_idtx_iscan_4x4 +}; + +// Must be used together with av1_fast_idtx_iscan_8x8 +DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_8x8[64]) = { + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 +}; + +// Must be used together with av1_fast_idtx_scan_8x8 +DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_8x8[64]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63 +}; + +static const SCAN_ORDER av1_fast_idtx_scan_order_8x8 = { + av1_fast_idtx_scan_8x8, av1_fast_idtx_iscan_8x8 +}; + +// Must be used together with av1_fast_idtx_iscan_16x16 +DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_16x16[256]) = { + 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, + 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, + 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, + 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 
42, 27, + 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, + 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, + 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94, + 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203, + 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, + 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, + 255 +}; + +// Must be used together with av1_fast_idtx_scan_16x16 +DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_16x16[256]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119, + 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118, + 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, + 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116, + 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115, + 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114, + 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113, + 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112, + 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111, + 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110, + 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109, + 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108, + 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107, + 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106, + 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105, + 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253, + 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254, + 255 +}; + +// Indicates the blocks for which RD model should be based on special logic +static INLINE int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int large_block = bsize >= BLOCK_32X32; + // Only enable for low bitdepth to mitigate issue: b/303023614. + return cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block && + !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && + cm->quant_params.base_qindex && !cpi->oxcf.use_highbitdepth; +} +/*!\brief Finds predicted motion vectors for a block. + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Finds predicted motion vectors for a block from a certain reference frame. + * First, it fills reference MV stack, then picks the test from the stack and + * predicts the final MV for a block for each mode. 
+ * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the + * data for the current macroblock + * \param[in] ref_frame Reference frame for which to find + * ref MVs + * \param[out] frame_mv Predicted MVs for a block + * \param[in] yv12_mb Buffer to hold predicted block + * \param[in] bsize Current block size + * \param[in] force_skip_low_temp_var Flag indicating possible mode search + * prune for low temporal variance block + * \param[in] skip_pred_mv Flag indicating to skip av1_mv_pred + * \param[out] use_scaled_ref_frame Flag to indicate if scaled reference + * frame is used. + * + * \remark Nothing is returned. Instead, predicted MVs are placed into + * \c frame_mv array, and use_scaled_ref_frame is set. + */ +static INLINE void find_predictors( + AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], + struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize, + int force_skip_low_temp_var, int skip_pred_mv, bool *use_scaled_ref_frame) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, ref_frame); + const bool ref_is_scaled = + ref->y_crop_height != cm->height || ref->y_crop_width != cm->width; + const YV12_BUFFER_CONFIG *scaled_ref = + av1_get_scaled_ref_frame(cpi, ref_frame); + const YV12_BUFFER_CONFIG *yv12 = + ref_is_scaled && scaled_ref ? scaled_ref : ref; + const int num_planes = av1_num_planes(cm); + x->pred_mv_sad[ref_frame] = INT_MAX; + x->pred_mv0_sad[ref_frame] = INT_MAX; + x->pred_mv1_sad[ref_frame] = INT_MAX; + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + // TODO(kyslov) this needs various further optimizations. to be continued.. + assert(yv12 != NULL); + if (yv12 != NULL) { + struct scale_factors *const sf = + scaled_ref ? NULL : get_ref_scale_factors(cm, ref_frame); + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + av1_find_best_ref_mvs_from_stack( + cm->features.allow_high_precision_mv, mbmi_ext, ref_frame, + &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0); + frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame]; + // Early exit for non-LAST frame if force_skip_low_temp_var is set. 
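+    // In other words, av1_mv_pred() below is only invoked for blocks of at
+    // least BLOCK_8X8 with an unscaled reference, when neither skip_pred_mv
+    // nor the force_skip_low_temp_var / non-LAST combination applies.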
+ if (!ref_is_scaled && bsize >= BLOCK_8X8 && !skip_pred_mv && + !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) { + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, + bsize); + } + } + if (cm->features.switchable_motion_mode) { + av1_count_overlappable_neighbors(cm, xd); + } + mbmi->num_proj_ref = 1; + *use_scaled_ref_frame = ref_is_scaled && scaled_ref; +} + +static INLINE void init_mbmi_nonrd(MB_MODE_INFO *mbmi, + PREDICTION_MODE pred_mode, + MV_REFERENCE_FRAME ref_frame0, + MV_REFERENCE_FRAME ref_frame1, + const AV1_COMMON *cm) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + mbmi->ref_mv_idx = 0; + mbmi->mode = pred_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frame0; + mbmi->ref_frame[1] = ref_frame1; + pmi->palette_size[PLANE_TYPE_Y] = 0; + pmi->palette_size[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->num_proj_ref = 1; + mbmi->interintra_mode = 0; + set_default_interp_filters(mbmi, cm->features.interp_filter); +} + +static INLINE void init_estimate_block_intra_args( + struct estimate_block_intra_args *args, AV1_COMP *cpi, MACROBLOCK *x) { + args->cpi = cpi; + args->x = x; + args->mode = DC_PRED; + args->skippable = 1; + args->rdc = 0; + args->best_sad = UINT_MAX; + args->prune_mode_based_on_sad = false; +} + +static INLINE int get_pred_buffer(PRED_BUFFER *p, int len) { + for (int buf_idx = 0; buf_idx < len; buf_idx++) { + if (!p[buf_idx].in_use) { + p[buf_idx].in_use = 1; + return buf_idx; + } + } + return -1; +} + +static INLINE void free_pred_buffer(PRED_BUFFER *p) { + if (p != NULL) p->in_use = 0; +} + +#if CONFIG_INTERNAL_STATS +static INLINE void store_coding_context_nonrd(MACROBLOCK *x, + PICK_MODE_CONTEXT *ctx, + int mode_index) { +#else +static INLINE void store_coding_context_nonrd(MACROBLOCK *x, + PICK_MODE_CONTEXT *ctx) { +#endif // CONFIG_INTERNAL_STATS + MACROBLOCKD *const xd = &x->e_mbd; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + + // Take a snapshot of the coding context so it can be + // restored if we decide to encode this way + ctx->rd_stats.skip_txfm = txfm_info->skip_txfm; + + ctx->skippable = txfm_info->skip_txfm; +#if CONFIG_INTERNAL_STATS + ctx->best_mode_index = mode_index; +#endif // CONFIG_INTERNAL_STATS + ctx->mic = *xd->mi[0]; + ctx->skippable = txfm_info->skip_txfm; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); +} + +void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable, + BLOCK_SIZE bsize, TX_SIZE tx_size); + +void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf, + int pred_stride, RD_STATS *this_rdc, int *skippable, + BLOCK_SIZE bsize, TX_SIZE tx_size); + +int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + RD_STATS *this_rdc, int start_plane, + int stop_plane); + +void av1_estimate_block_intra(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg); + +void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int best_early_term, unsigned int ref_cost_intra, + int reuse_prediction, struct buf_2d *orig_dst, + PRED_BUFFER *tmp_buffers, + PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc, + BEST_PICKMODE *best_pickmode, + PICK_MODE_CONTEXT *ctx); + +#endif // AOM_AV1_ENCODER_NONRD_OPT_H_ diff --git a/third_party/aom/av1/encoder/nonrd_pickmode.c 
b/third_party/aom/av1/encoder/nonrd_pickmode.c new file mode 100644 index 0000000000..f939b6d1fa --- /dev/null +++ b/third_party/aom/av1/encoder/nonrd_pickmode.c @@ -0,0 +1,3537 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + + */ + +#include +#include +#include +#include + +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/intra_mode_search.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/nonrd_opt.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/var_based_part.h" + +static INLINE int early_term_inter_search_with_sse(int early_term_idx, + BLOCK_SIZE bsize, + int64_t this_sse, + int64_t best_sse, + PREDICTION_MODE this_mode) { + // Aggressiveness to terminate inter mode search early is adjusted based on + // speed and block size. + static const double early_term_thresh[4][4] = { { 0.65, 0.65, 0.65, 0.7 }, + { 0.6, 0.65, 0.85, 0.9 }, + { 0.5, 0.5, 0.55, 0.6 }, + { 0.6, 0.75, 0.85, 0.85 } }; + static const double early_term_thresh_newmv_nearestmv[4] = { 0.3, 0.3, 0.3, + 0.3 }; + + const int size_group = size_group_lookup[bsize]; + assert(size_group < 4); + assert((early_term_idx > 0) && (early_term_idx < EARLY_TERM_INDICES)); + const double threshold = + ((early_term_idx == EARLY_TERM_IDX_4) && + (this_mode == NEWMV || this_mode == NEARESTMV)) + ? early_term_thresh_newmv_nearestmv[size_group] + : early_term_thresh[early_term_idx - 1][size_group]; + + // Terminate inter mode search early based on best sse so far. 
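+  // For example, with threshold == 0.65 the check below terminates once
+  // this_sse exceeds best_sse / 0.65 (roughly 1.54x the best SSE seen so
+  // far); smaller thresholds therefore terminate less often.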
+ if ((early_term_idx > 0) && (threshold * this_sse > best_sse)) { + return 1; + } + return 0; +} + +static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { + bp->best_sse = INT64_MAX; + bp->best_mode = NEARESTMV; + bp->best_ref_frame = LAST_FRAME; + bp->best_second_ref_frame = NONE_FRAME; + bp->best_tx_size = TX_8X8; + bp->tx_type = DCT_DCT; + bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + bp->best_mode_skip_txfm = 0; + bp->best_mode_initial_skip_flag = 0; + bp->best_pred = NULL; + bp->best_motion_mode = SIMPLE_TRANSLATION; + bp->num_proj_ref = 0; + av1_zero(bp->wm_params); + av1_zero(bp->pmi); +} + +// Copy best inter mode parameters to best_pickmode +static INLINE void update_search_state_nonrd( + InterModeSearchStateNonrd *search_state, MB_MODE_INFO *const mi, + TxfmSearchInfo *txfm_info, RD_STATS *nonskip_rdc, PICK_MODE_CONTEXT *ctx, + PREDICTION_MODE this_best_mode, const int64_t sse_y) { + BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode; + + best_pickmode->best_sse = sse_y; + best_pickmode->best_mode = this_best_mode; + best_pickmode->best_motion_mode = mi->motion_mode; + best_pickmode->wm_params = mi->wm_params; + best_pickmode->num_proj_ref = mi->num_proj_ref; + best_pickmode->best_pred_filter = mi->interp_filters; + best_pickmode->best_tx_size = mi->tx_size; + best_pickmode->best_ref_frame = mi->ref_frame[0]; + best_pickmode->best_second_ref_frame = mi->ref_frame[1]; + best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm; + best_pickmode->best_mode_initial_skip_flag = + (nonskip_rdc->rate == INT_MAX && search_state->this_rdc.skip_txfm); + if (!best_pickmode->best_mode_skip_txfm) { + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + } +} + +static INLINE int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int_mv *mv, MV ref_mv, FULLPEL_MV start_mv, + bool fullpel_performed_well) { + const int frame_lowmotion = cpi->rc.avg_frame_low_motion; + const int reduce_mv_pel_precision_highmotion = + cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion; + + // Reduce MV precision for higher int MV value & frame-level motion + if (reduce_mv_pel_precision_highmotion >= 3) { + int mv_thresh = 4; + const int is_low_resoln = + (cpi->common.width * cpi->common.height <= 320 * 240); + mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6; + if (frame_lowmotion > 0 && frame_lowmotion < 40) mv_thresh = 12; + mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh; + if (abs(mv->as_fullmv.row) >= mv_thresh || + abs(mv->as_fullmv.col) >= mv_thresh) + return HALF_PEL; + } else if (reduce_mv_pel_precision_highmotion >= 1) { + int mv_thresh; + const int th_vals[2][3] = { { 4, 8, 10 }, { 4, 6, 8 } }; + const int th_idx = reduce_mv_pel_precision_highmotion - 1; + assert(th_idx >= 0 && th_idx < 2); + if (frame_lowmotion > 0 && frame_lowmotion < 40) + mv_thresh = 12; + else + mv_thresh = (bsize >= BLOCK_32X32) ? th_vals[th_idx][0] + : (bsize >= BLOCK_16X16) ? th_vals[th_idx][1] + : th_vals[th_idx][2]; + if (abs(mv->as_fullmv.row) >= (mv_thresh << 1) || + abs(mv->as_fullmv.col) >= (mv_thresh << 1)) + return FULL_PEL; + else if (abs(mv->as_fullmv.row) >= mv_thresh || + abs(mv->as_fullmv.col) >= mv_thresh) + return HALF_PEL; + } + // Reduce MV precision for relatively static (e.g. 
background), low-complex + // large areas + if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 2) { + const int qband = x->qindex >> (QINDEX_BITS - 2); + assert(qband < 4); + if (x->content_state_sb.source_sad_nonrd <= kVeryLowSad && + bsize > BLOCK_16X16 && qband != 0) { + if (x->source_variance < 500) + return FULL_PEL; + else if (x->source_variance < 5000) + return HALF_PEL; + } + } else if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 1) { + if (fullpel_performed_well && ref_mv.row == 0 && ref_mv.col == 0 && + start_mv.row == 0 && start_mv.col == 0) + return HALF_PEL; + } + return cpi->sf.mv_sf.subpel_force_stop; +} + +static bool use_aggressive_subpel_search_method(MACROBLOCK *x, + bool use_adaptive_subpel_search, + bool fullpel_performed_well) { + if (!use_adaptive_subpel_search) return false; + const int qband = x->qindex >> (QINDEX_BITS - 2); + assert(qband < 4); + if ((qband > 0) && (fullpel_performed_well || + (x->content_state_sb.source_sad_nonrd <= kLowSad) || + (x->source_variance < 100))) + return true; + return false; +} + +/*!\brief Runs Motion Estimation for a specific block and specific ref frame. + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Finds the best Motion Vector by running Motion Estimation for a specific + * block and a specific reference frame. Exits early if RDCost of Full Pel part + * exceeds best RD Cost fund so far + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the + * data for the current macroblock + * \param[in] bsize Current block size + * \param[in] tmp_mv Pointer to best found New MV + * \param[in] rate_mv Pointer to Rate of the best new MV + * \param[in] best_rd_sofar RD Cost of the best mode found so far + * \param[in] use_base_mv Flag, indicating that tmp_mv holds + * specific MV to start the search with + * + * \return Returns 0 if ME was terminated after Full Pel Search because too + * high RD Cost. Otherwise returns 1. Best New MV is placed into \c tmp_mv. + * Rate estimation for this vector is placed to \c rate_mv + */ +static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *tmp_mv, + int *rate_mv, int64_t best_rd_sofar, + int use_base_mv) { + MACROBLOCKD *xd = &x->e_mbd; + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *sf = &cpi->sf; + MB_MODE_INFO *mi = xd->mi[0]; + int step_param = (sf->rt_sf.fullpel_search_step_param) + ? 
sf->rt_sf.fullpel_search_step_param + : cpi->mv_search_params.mv_step_param; + FULLPEL_MV start_mv; + const int ref = mi->ref_frame[0]; + const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv; + MV center_mv; + int dis; + int rv = 0; + int cost_list[5]; + int search_subpel = 1; + + start_mv = get_fullmv_from_mv(&ref_mv); + + if (!use_base_mv) + center_mv = ref_mv; + else + center_mv = tmp_mv->as_mv; + + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize); + const search_site_config *src_search_sites = + av1_get_search_site_config(cpi, x, search_method); + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + FULLPEL_MV_STATS best_mv_stats; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, ¢er_mv, + start_mv, src_search_sites, search_method, + /*fine_search_interval=*/0); + + const unsigned int full_var_rd = av1_full_pixel_search( + start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list), + &tmp_mv->as_fullmv, &best_mv_stats, NULL); + + // calculate the bit cost on motion vector + MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv); + + *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + + // TODO(kyslov) Account for Rate Mode! + rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar); + + if (rv && search_subpel) { + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + const bool fullpel_performed_well = + (bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) || + (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) || + (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127); + if (sf->rt_sf.reduce_mv_pel_precision_highmotion || + sf->rt_sf.reduce_mv_pel_precision_lowcomplex) + ms_params.forced_stop = subpel_select(cpi, x, bsize, tmp_mv, ref_mv, + start_mv, fullpel_performed_well); + + MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + // adaptively downgrade subpel search method based on block properties + if (use_aggressive_subpel_search_method( + x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well)) + av1_find_best_sub_pixel_tree_pruned_more( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv, + &dis, &x->pred_sse[ref], NULL); + else + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv, + &dis, &x->pred_sse[ref], NULL); + *rate_mv = + av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } + // The final MV can not be equal to the reference MV as this will trigger an + // assert later. This can happen if both NEAREST and NEAR modes were skipped. + rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row); + return rv; +} + +/*!\brief Searches for the best New Motion Vector. + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Finds the best Motion Vector by doing Motion Estimation. 
Uses reduced + * complexity ME for non-LAST frames or calls \c combined_motion_search + * for LAST reference frame + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the + * data for the current macroblock + * \param[in] frame_mv Array that holds MVs for all modes + * and ref frames + * \param[in] ref_frame Reference frame for which to find + * the best New MVs + * \param[in] gf_temporal_ref Flag, indicating temporal reference + * for GOLDEN frame + * \param[in] bsize Current block size + * \param[in] mi_row Row index in 4x4 units + * \param[in] mi_col Column index in 4x4 units + * \param[in] rate_mv Pointer to Rate of the best new MV + * \param[in] best_rdc Pointer to the RD Cost for the best + * mode found so far + * + * \return Returns -1 if the search was not done, otherwise returns 0. + * Best New MV is placed into \c frame_mv array, Rate estimation for this + * vector is placed to \c rate_mv + */ +static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x, + int_mv frame_mv[][REF_FRAMES], + MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, + BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_mv, + RD_STATS *best_rdc) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + AV1_COMMON *cm = &cpi->common; + int_mv *this_ref_frm_newmv = &frame_mv[NEWMV][ref_frame]; + unsigned int y_sad_zero; + if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR && + gf_temporal_ref) { + int tmp_sad; + int dis; + + if (bsize < BLOCK_16X16) return -1; + + int me_search_size_col = block_size_wide[bsize] >> 1; + int me_search_size_row = block_size_high[bsize] >> 1; + tmp_sad = av1_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv, &y_sad_zero, + me_search_size_col, me_search_size_row); + + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; + + this_ref_frm_newmv->as_int = mi->mv[0].as_int; + int_mv best_mv = mi->mv[0]; + best_mv.as_mv.row >>= 3; + best_mv.as_mv.col >>= 3; + MV ref_mv = av1_get_ref_mv(x, 0).as_mv; + this_ref_frm_newmv->as_mv.row >>= 3; + this_ref_frm_newmv->as_mv.col >>= 3; + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL); + if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion || + cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex) { + FULLPEL_MV start_mv = { .row = 0, .col = 0 }; + ms_params.forced_stop = + subpel_select(cpi, x, bsize, &best_mv, ref_mv, start_mv, false); + } + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv)); + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, + &x->pred_sse[ref_frame], NULL); + this_ref_frm_newmv->as_int = best_mv.as_int; + + // When NEWMV is same as ref_mv from the drl, it is preferred to code the + // MV as NEARESTMV or NEARMV. In this case, NEWMV needs to be skipped to + // avoid an assert failure at a later stage. The scenario can occur if + // NEARESTMV was not evaluated for ALTREF. 
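+    // Returning -1 below follows the "-1 if the search was not done"
+    // convention from the function comment above, so NEWMV is effectively
+    // skipped for this reference frame.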
+ if (this_ref_frm_newmv->as_mv.col == ref_mv.col && + this_ref_frm_newmv->as_mv.row == ref_mv.row) + return -1; + + *rate_mv = av1_mv_bit_cost(&this_ref_frm_newmv->as_mv, &ref_mv, + x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } else if (!combined_motion_search(cpi, x, bsize, &frame_mv[NEWMV][ref_frame], + rate_mv, best_rdc->rdcost, 0)) { + return -1; + } + + return 0; +} + +static void estimate_single_ref_frame_costs(const AV1_COMMON *cm, + const MACROBLOCKD *xd, + const ModeCosts *mode_costs, + int segment_id, BLOCK_SIZE bsize, + unsigned int *ref_costs_single) { + int seg_ref_active = + segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (seg_ref_active) { + memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); + } else { + int intra_inter_ctx = av1_get_intra_inter_context(xd); + ref_costs_single[INTRA_FRAME] = + mode_costs->intra_inter_cost[intra_inter_ctx][0]; + unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1]; + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT && + is_comp_ref_allowed(bsize)) { + const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); + base_cost += mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1]; + } + ref_costs_single[LAST_FRAME] = base_cost; + ref_costs_single[GOLDEN_FRAME] = base_cost; + ref_costs_single[ALTREF_FRAME] = base_cost; + // add cost for last, golden, altref + ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[0][0][0]; + ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][0][1]; + ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][1][0]; + ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][0][1]; + ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][2][0]; + } +} + +static INLINE void set_force_skip_flag(const AV1_COMP *const cpi, + MACROBLOCK *const x, unsigned int sse, + int *force_skip) { + if (x->txfm_search_params.tx_mode_search_type == TX_MODE_SELECT && + cpi->sf.rt_sf.tx_size_level_based_on_qstep && + cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) { + const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (x->e_mbd.bd - 5); + const unsigned int qstep_sq = qstep * qstep; + // If the sse is low for low source variance blocks, mark those as + // transform skip. + // Note: Though qstep_sq is based on ac qstep, the threshold is kept + // low so that reliable early estimate of tx skip can be obtained + // through its comparison with sse. + if (sse < qstep_sq && x->source_variance < qstep_sq && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) + *force_skip = 1; + } +} + +#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \ + (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false) +#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16) + +static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *const x, unsigned int var, + unsigned int sse, int *force_skip) { + MACROBLOCKD *const xd = &x->e_mbd; + TX_SIZE tx_size; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) { + int multiplier = 8; + unsigned int var_thresh = 0; + unsigned int is_high_var = 1; + // Use quantizer based thresholds to determine transform size. 
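+ // Illustrative numbers only (not taken from the source): at 8-bit depth the
+ // shift below is bd - 5 = 3, so an AC dequant value of 80 gives qstep = 10,
+ // qstep_sq = 100 and var_thresh = 200. Blocks whose residual variance stays
+ // below that threshold, or whose sse is clearly dominated by the DC term,
+ // are given the larger transform size further down.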
+ if (cpi->sf.rt_sf.tx_size_level_based_on_qstep) { + const int qband = x->qindex >> (QINDEX_BITS - 2); + const int mult[4] = { 8, 7, 6, 5 }; + assert(qband < 4); + multiplier = mult[qband]; + const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (xd->bd - 5); + const unsigned int qstep_sq = qstep * qstep; + var_thresh = qstep_sq * 2; + if (cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) { + // If the sse is low for low source variance blocks, mark those as + // transform skip. + // Note: Though qstep_sq is based on ac qstep, the threshold is kept + // low so that reliable early estimate of tx skip can be obtained + // through its comparison with sse. + if (sse < qstep_sq && x->source_variance < qstep_sq && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) + *force_skip = 1; + // Further lower transform size based on aq mode only if residual + // variance is high. + is_high_var = (var >= var_thresh); + } + } + // Choose larger transform size for blocks where dc component is dominant or + // the ac component is low. + if (sse > ((var * multiplier) >> 2) || (var < var_thresh)) + tx_size = + AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]); + else + tx_size = TX_8X8; + + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && is_high_var) + tx_size = TX_8X8; + else if (tx_size > TX_16X16) + tx_size = TX_16X16; + } else { + tx_size = + AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]); + } + + if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) + tx_size = TX_SIZE_FOR_BSIZE_GT32; + + return AOMMIN(tx_size, TX_16X16); +} + +static void block_variance(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, int h, + unsigned int *sse, int *sum, int block_size, + uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) { + int k = 0; + *sse = 0; + *sum = 0; + + // This function is called for block sizes >= BLOCK_32x32. As per the design + // the aom_get_var_sse_sum_8x8_quad() processes four 8x8 blocks (in a 8x32) + // per call. Hence the width and height of the block need to be at least 8 and + // 32 samples respectively. + assert(w >= 32); + assert(h >= 8); + for (int row = 0; row < h; row += block_size) { + for (int col = 0; col < w; col += 32) { + aom_get_var_sse_sum_8x8_quad(src + src_stride * row + col, src_stride, + ref + ref_stride * row + col, ref_stride, + &sse8x8[k], &sum8x8[k], sse, sum, + &var8x8[k]); + k += 4; + } + } +} + +static void block_variance_16x16_dual(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, + int h, unsigned int *sse, int *sum, + int block_size, uint32_t *sse16x16, + uint32_t *var16x16) { + int k = 0; + *sse = 0; + *sum = 0; + // This function is called for block sizes >= BLOCK_32x32. As per the design + // the aom_get_var_sse_sum_16x16_dual() processes four 16x16 blocks (in a + // 16x32) per call. Hence the width and height of the block need to be at + // least 16 and 32 samples respectively. 
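+ // For example, a 64x64 partition is covered by 4 row steps of 16 and 2
+ // column steps of 32, i.e. 8 calls of the dual kernel below.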
+ assert(w >= 32); + assert(h >= 16); + for (int row = 0; row < h; row += block_size) { + for (int col = 0; col < w; col += 32) { + aom_get_var_sse_sum_16x16_dual(src + src_stride * row + col, src_stride, + ref + ref_stride * row + col, ref_stride, + &sse16x16[k], sse, sum, &var16x16[k]); + k += 2; + } + } +} + +static void calculate_variance(int bw, int bh, TX_SIZE tx_size, + unsigned int *sse_i, int *sum_i, + unsigned int *var_o, unsigned int *sse_o, + int *sum_o) { + const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size]; + const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); + const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); + int row, col, k = 0; + + for (row = 0; row < nh; row += 2) { + for (col = 0; col < nw; col += 2) { + sse_o[k] = sse_i[row * nw + col] + sse_i[row * nw + col + 1] + + sse_i[(row + 1) * nw + col] + sse_i[(row + 1) * nw + col + 1]; + sum_o[k] = sum_i[row * nw + col] + sum_i[row * nw + col + 1] + + sum_i[(row + 1) * nw + col] + sum_i[(row + 1) * nw + col + 1]; + var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> + (b_width_log2_lookup[unit_size] + + b_height_log2_lookup[unit_size] + 6)); + k++; + } + } +} + +// Adjust the ac_thr according to speed, width, height and normalized sum +static int ac_thr_factor(int speed, int width, int height, int norm_sum) { + if (speed >= 8 && norm_sum < 5) { + if (width <= 640 && height <= 480) + return 4; + else + return 2; + } + return 1; +} + +// Sets early_term flag based on chroma planes prediction +static INLINE void set_early_term_based_on_uv_plane( + AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MACROBLOCKD *xd, int mi_row, + int mi_col, int *early_term, int num_blk, const unsigned int *sse_tx, + const unsigned int *var_tx, int sum, unsigned int var, unsigned int sse) { + AV1_COMMON *const cm = &cpi->common; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + const uint32_t dc_quant = p->dequant_QTX[0]; + const uint32_t ac_quant = p->dequant_QTX[1]; + int64_t dc_thr = dc_quant * dc_quant >> 6; + int64_t ac_thr = ac_quant * ac_quant >> 6; + const int bw = b_width_log2_lookup[bsize]; + const int bh = b_height_log2_lookup[bsize]; + int ac_test = 1; + int dc_test = 1; + const int norm_sum = abs(sum) >> (bw + bh); + +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5) + ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level, + norm_sum, cpi->svc.temporal_layer_id); + else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum); +#else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum); + +#endif + + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + dc_thr = dc_thr << 1; + ac_thr = ac_thr << 2; + } + + for (int k = 0; k < num_blk; k++) { + // Check if all ac coefficients can be quantized to zero. + if (!(var_tx[k] < ac_thr || var == 0)) { + ac_test = 0; + break; + } + // Check if dc coefficient can be quantized to zero. + if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) { + dc_test = 0; + break; + } + } + + // Check if chroma can be skipped based on ac and dc test flags. + if (ac_test && dc_test) { + int skip_uv[2] = { 0 }; + unsigned int var_uv[2]; + unsigned int sse_uv[2]; + // Transform skipping test in UV planes. 
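+ // Both chroma planes must pass the same ac/dc zero-quantization checks used
+ // for luma above (planes with color_sensitivity 0 pass trivially); the
+ // thresholds come from the chroma dequant values and are tightened when
+ // increase_source_sad_thresh is set.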
+ for (int plane = AOM_PLANE_U; plane <= AOM_PLANE_V; plane++) { + int j = plane - 1; + skip_uv[j] = 1; + if (x->color_sensitivity[COLOR_SENS_IDX(plane)]) { + skip_uv[j] = 0; + struct macroblock_plane *const puv = &x->plane[plane]; + struct macroblockd_plane *const puvd = &xd->plane[plane]; + const BLOCK_SIZE uv_bsize = get_plane_block_size( + bsize, puvd->subsampling_x, puvd->subsampling_y); + // Adjust these thresholds for UV. + const int shift_ac = cpi->sf.rt_sf.increase_source_sad_thresh ? 5 : 3; + const int shift_dc = cpi->sf.rt_sf.increase_source_sad_thresh ? 4 : 3; + const int64_t uv_dc_thr = + (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> shift_dc; + const int64_t uv_ac_thr = + (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> shift_ac; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + plane, plane); + var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride, + puvd->dst.buf, + puvd->dst.stride, &sse_uv[j]); + if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && + (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) + skip_uv[j] = 1; + else + break; + } + } + if (skip_uv[0] & skip_uv[1]) { + *early_term = 1; + } + } +} + +static INLINE void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, + int calculate_rd, int *early_term, + BLOCK_SIZE bsize, + unsigned int sse) { + if (calculate_rd) { + if (!*early_term) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + + model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, rd_stats->sse, bw * bh, + &rd_stats->rate, &rd_stats->dist); + } + + if (*early_term) { + rd_stats->rate = 0; + rd_stats->dist = sse << 4; + } + } +} + +static void model_skip_for_sb_y_large_64(AV1_COMP *cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col, MACROBLOCK *x, + MACROBLOCKD *xd, RD_STATS *rd_stats, + int *early_term, int calculate_rd, + int64_t best_sse, + unsigned int *var_output, + unsigned int var_prune_threshold) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + unsigned int sse; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + int test_skip = 1; + unsigned int var; + int sum; + const int bw = b_width_log2_lookup[bsize]; + const int bh = b_height_log2_lookup[bsize]; + unsigned int sse16x16[64] = { 0 }; + unsigned int var16x16[64] = { 0 }; + assert(xd->mi[0]->tx_size == TX_16X16); + assert(bsize > BLOCK_32X32); + + // Calculate variance for whole partition, and also save 16x16 blocks' + // variance to be used in following transform skipping test. + block_variance_16x16_dual(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, 4 << bw, 4 << bh, &sse, &sum, 16, + sse16x16, var16x16); + + var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4)); + if (var_output) { + *var_output = var; + if (*var_output > var_prune_threshold) { + return; + } + } + + rd_stats->sse = sse; + // Skipping test + *early_term = 0; + set_force_skip_flag(cpi, x, sse, early_term); + // The code below for setting skip flag assumes transform size of at least + // 8x8, so force this lower limit on transform. 
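+ // In this path (blocks larger than 32x32) the transform size has already
+ // been fixed to TX_16X16 by the caller (see the assert above), so no
+ // additional clamping is needed here.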
+ MB_MODE_INFO *const mi = xd->mi[0]; + if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search && + early_term_inter_search_with_sse( + cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse, + mi->mode)) + test_skip = 0; + + if (*early_term) test_skip = 0; + + // Evaluate if the partition block is a skippable block in Y plane. + if (test_skip) { + const unsigned int *sse_tx = sse16x16; + const unsigned int *var_tx = var16x16; + const unsigned int num_block = (1 << (bw + bh - 2)) >> 2; + set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col, + early_term, num_block, sse_tx, var_tx, sum, + var, sse); + } + calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize, + sse); +} + +static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col, MACROBLOCK *x, + MACROBLOCKD *xd, RD_STATS *rd_stats, + int *early_term, int calculate_rd, + int64_t best_sse, + unsigned int *var_output, + unsigned int var_prune_threshold) { + if (x->force_zeromv_skip_for_blk) { + *early_term = 1; + rd_stats->rate = 0; + rd_stats->dist = 0; + rd_stats->sse = 0; + return; + } + + // For block sizes greater than 32x32, the transform size is always 16x16. + // This function avoids calling calculate_variance() for tx_size 16x16 cases + // by directly populating variance at tx_size level from + // block_variance_16x16_dual() function. + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) { + xd->mi[0]->tx_size = TX_SIZE_FOR_BSIZE_GT32; + model_skip_for_sb_y_large_64(cpi, bsize, mi_row, mi_col, x, xd, rd_stats, + early_term, calculate_rd, best_sse, var_output, + var_prune_threshold); + return; + } + + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + unsigned int sse; + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + int test_skip = 1; + unsigned int var; + int sum; + + const int bw = b_width_log2_lookup[bsize]; + const int bh = b_height_log2_lookup[bsize]; + unsigned int sse8x8[256] = { 0 }; + int sum8x8[256] = { 0 }; + unsigned int var8x8[256] = { 0 }; + TX_SIZE tx_size; + + // Calculate variance for whole partition, and also save 8x8 blocks' variance + // to be used in following transform skipping test. + block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8); + var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4)); + if (var_output) { + *var_output = var; + if (*var_output > var_prune_threshold) { + return; + } + } + + rd_stats->sse = sse; + // Skipping test + *early_term = 0; + tx_size = calculate_tx_size(cpi, bsize, x, var, sse, early_term); + assert(tx_size <= TX_16X16); + // The code below for setting skip flag assumes transform size of at least + // 8x8, so force this lower limit on transform. + if (tx_size < TX_8X8) tx_size = TX_8X8; + xd->mi[0]->tx_size = tx_size; + + MB_MODE_INFO *const mi = xd->mi[0]; + if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search && + early_term_inter_search_with_sse( + cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse, + mi->mode)) + test_skip = 0; + + if (*early_term) test_skip = 0; + + // Evaluate if the partition block is a skippable block in Y plane. 
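+ // For example, a 32x32 partition has bw = bh = 3, so 1 << (3 + 3 - 2) = 16
+ // 8x8 sub-blocks; when a 16x16 transform is chosen these are merged below
+ // into 4 16x16 sums before the skip tests run.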
+ if (test_skip) { + unsigned int sse16x16[64] = { 0 }; + int sum16x16[64] = { 0 }; + unsigned int var16x16[64] = { 0 }; + const unsigned int *sse_tx = sse8x8; + const unsigned int *var_tx = var8x8; + unsigned int num_blks = 1 << (bw + bh - 2); + + if (tx_size >= TX_16X16) { + calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16, + sum16x16); + sse_tx = sse16x16; + var_tx = var16x16; + num_blks = num_blks >> 2; + } + set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col, + early_term, num_blks, sse_tx, var_tx, sum, + var, sse); + } + calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize, + sse); +} + +static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + RD_STATS *rd_stats, unsigned int *var_out, + int calculate_rd, int *early_term) { + if (x->force_zeromv_skip_for_blk && early_term != NULL) { + *early_term = 1; + rd_stats->rate = 0; + rd_stats->dist = 0; + rd_stats->sse = 0; + } + + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + const int ref = xd->mi[0]->ref_frame[0]; + + assert(bsize < BLOCK_SIZES_ALL); + + struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + unsigned int sse; + int rate; + int64_t dist; + + unsigned int var = cpi->ppi->fn_ptr[bsize].vf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); + int force_skip = 0; + xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip); + if (var_out) { + *var_out = var; + } + + if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) { + const int bwide = block_size_wide[bsize]; + const int bhigh = block_size_high[bsize]; + model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate, + &dist); + } else { + rate = INT_MAX; // this will be overwritten later with av1_block_yrd + dist = INT_MAX; + } + rd_stats->sse = sse; + x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + if (force_skip && ref > INTRA_FRAME) { + rate = 0; + dist = (int64_t)sse << 4; + } + + assert(rate >= 0); + + rd_stats->skip_txfm = (rate == 0); + rate = AOMMIN(rate, INT_MAX); + rd_stats->rate = rate; + rd_stats->dist = dist; +} + +static INLINE int get_drl_cost(PREDICTION_MODE this_mode, int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext, + const int (*const drl_mode_cost0)[2], + int8_t ref_frame_type) { + int cost = 0; + if (this_mode == NEWMV || this_mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][ref_mv_idx != idx]; + if (ref_mv_idx == idx) return cost; + } + } + return cost; + } + + if (have_nearmv_in_inter_mode(this_mode)) { + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][ref_mv_idx != (idx - 1)]; + if (ref_mv_idx == (idx - 1)) return cost; + } + } + return cost; + } + return cost; +} + +static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode, + int16_t mode_context) { + if (is_inter_compound_mode(mode)) { + return mode_costs + ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; + } + + int mode_cost = 0; + 
int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + + assert(is_inter_mode(mode)); + + if (mode == NEWMV) { + mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + + if (mode == GLOBALMV) { + mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + return mode_cost; + } + } +} + +static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode, + RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row, + int mv_col, int speed, uint32_t spatial_variance, + CONTENT_STATE_SB content_state_sb) { + // Bias against MVs associated with NEWMV mode that are very different from + // top/left neighbors. + if (this_mode == NEWMV) { + int al_mv_average_row; + int al_mv_average_col; + int row_diff, col_diff; + int above_mv_valid = 0; + int left_mv_valid = 0; + int above_row = INVALID_MV_ROW_COL, above_col = INVALID_MV_ROW_COL; + int left_row = INVALID_MV_ROW_COL, left_col = INVALID_MV_ROW_COL; + if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad && + spatial_variance < 300 && + (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) { + this_rdc->rdcost = this_rdc->rdcost << 2; + return; + } + if (xd->above_mbmi) { + above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV; + above_row = xd->above_mbmi->mv[0].as_mv.row; + above_col = xd->above_mbmi->mv[0].as_mv.col; + } + if (xd->left_mbmi) { + left_mv_valid = xd->left_mbmi->mv[0].as_int != INVALID_MV; + left_row = xd->left_mbmi->mv[0].as_mv.row; + left_col = xd->left_mbmi->mv[0].as_mv.col; + } + if (above_mv_valid && left_mv_valid) { + al_mv_average_row = (above_row + left_row + 1) >> 1; + al_mv_average_col = (above_col + left_col + 1) >> 1; + } else if (above_mv_valid) { + al_mv_average_row = above_row; + al_mv_average_col = above_col; + } else if (left_mv_valid) { + al_mv_average_row = left_row; + al_mv_average_col = left_col; + } else { + al_mv_average_row = al_mv_average_col = 0; + } + row_diff = al_mv_average_row - mv_row; + col_diff = al_mv_average_col - mv_col; + if (row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80) { + if (bsize >= BLOCK_32X32) + this_rdc->rdcost = this_rdc->rdcost << 1; + else + this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; + } + } else { + // Bias for speed >= 8 for low spatial variance. 
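+ // Motion vector components here are in 1/8-pel units, so the |mv| > 64
+ // check below corresponds to a displacement of more than 8 full pels.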
+ if (speed >= 8 && spatial_variance < 150 && + (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64)) + this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; + } +} + +static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, + MV_REFERENCE_FRAME ref_frame, + THR_MODES best_mode_idx, + PREDICTION_MODE mode) { + const THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; + const BLOCK_SIZE min_size = AOMMAX(bsize - 3, BLOCK_4X4); + const BLOCK_SIZE max_size = AOMMIN(bsize + 6, BLOCK_128X128); + for (BLOCK_SIZE bs = min_size; bs <= max_size; bs += 3) { + int *freq_fact = &x->thresh_freq_fact[bs][thr_mode_idx]; + if (thr_mode_idx == best_mode_idx) { + *freq_fact -= (*freq_fact >> 4); + } else { + *freq_fact = + AOMMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } + } +} + +#if CONFIG_AV1_TEMPORAL_DENOISING +static void av1_pickmode_ctx_den_update( + AV1_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig, + unsigned int ref_frame_cost[REF_FRAMES], + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int reuse_inter_pred, + BEST_PICKMODE *bp) { + ctx_den->zero_last_cost_orig = zero_last_cost_orig; + ctx_den->ref_frame_cost = ref_frame_cost; + ctx_den->frame_mv = frame_mv; + ctx_den->reuse_inter_pred = reuse_inter_pred; + ctx_den->best_tx_size = bp->best_tx_size; + ctx_den->best_mode = bp->best_mode; + ctx_den->best_ref_frame = bp->best_ref_frame; + ctx_den->best_pred_filter = bp->best_pred_filter; + ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm; +} + +static void recheck_zeromv_after_denoising( + AV1_COMP *cpi, MB_MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd, + AV1_DENOISER_DECISION decision, AV1_PICKMODE_CTX_DEN *ctx_den, + struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_STATS *best_rdc, + BEST_PICKMODE *best_pickmode, BLOCK_SIZE bsize, int mi_row, int mi_col) { + // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on + // denoised result. Only do this under noise conditions, and if rdcost of + // ZEROMV on original source is not significantly higher than rdcost of best + // mode. + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow && + ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) && + ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) || + (ctx_den->best_ref_frame == GOLDEN_FRAME && + cpi->svc.number_spatial_layers == 1 && + decision == FILTER_ZEROMV_BLOCK))) { + // Check if we should pick ZEROMV on denoised signal. 
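+ // The block is re-coded below as GLOBALMV on LAST_FRAME against the
+ // denoised reconstruction; the previous best mode is restored unless the
+ // zero-MV cost on the denoised data is no worse.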
+ AV1_COMMON *const cm = &cpi->common; + RD_STATS this_rdc; + const ModeCosts *mode_costs = &x->mode_costs; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + + mi->mode = GLOBALMV; + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NONE_FRAME; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME); + mi->mv[0].as_int = 0; + mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[LAST_FRAME][AOM_PLANE_Y]; + av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); + unsigned int var; + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1, NULL); + + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame); + this_rdc.rate += cost_mv_ref(mode_costs, GLOBALMV, mode_ctx); + + this_rdc.rate += ctx_den->ref_frame_cost[LAST_FRAME]; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + txfm_info->skip_txfm = this_rdc.skip_txfm; + // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source + // is higher than best_ref mode (on original source). + if (this_rdc.rdcost > best_rdc->rdcost) { + this_rdc = *best_rdc; + mi->mode = best_pickmode->best_mode; + mi->ref_frame[0] = best_pickmode->best_ref_frame; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME); + mi->interp_filters = best_pickmode->best_pred_filter; + if (best_pickmode->best_ref_frame == INTRA_FRAME) { + mi->mv[0].as_int = INVALID_MV; + } else { + mi->mv[0].as_int = ctx_den + ->frame_mv[best_pickmode->best_mode] + [best_pickmode->best_ref_frame] + .as_int; + if (ctx_den->reuse_inter_pred) { + xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[GOLDEN_FRAME][AOM_PLANE_Y]; + av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); + } + } + mi->tx_size = best_pickmode->best_tx_size; + txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm; + } else { + ctx_den->best_ref_frame = LAST_FRAME; + *best_rdc = this_rdc; + } + } +} +#endif // CONFIG_AV1_TEMPORAL_DENOISING + +/*!\brief Searches for the best interpolation filter + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Iterates through subset of possible interpolation filters (EIGHTTAP_REGULAR, + * EIGTHTAP_SMOOTH, MULTITAP_SHARP, depending on FILTER_SEARCH_SIZE) and selects + * the one that gives lowest RD cost. RD cost is calculated using curvfit model. + * Support for dual filters (different filters in the x & y directions) is + * allowed if sf.interp_sf.disable_dual_filter = 0. + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the + * data for the current macroblock + * \param[in] this_rdc Pointer to calculated RD Cost + * \param[in] inter_pred_params_sr Pointer to structure holding parameters of + inter prediction for single reference + * \param[in] mi_row Row index in 4x4 units + * \param[in] mi_col Column index in 4x4 units + * \param[in] tmp_buffer Pointer to a temporary buffer for + * prediction re-use + * \param[in] bsize Current block size + * \param[in] reuse_inter_pred Flag, indicating prediction re-use + * \param[out] this_mode_pred Pointer to store prediction buffer + * for prediction re-use + * \param[out] this_early_term Flag, indicating that transform can be + * skipped + * \param[out] var The residue variance of the current + * predictor. + * \param[in] use_model_yrd_large Flag, indicating special logic to handle + * large blocks + * \param[in] best_sse Best sse so far. + * \param[in] is_single_pred Flag, indicating single mode. 
+ * + * \remark Nothing is returned. Instead, calculated RD cost is placed to + * \c this_rdc and best filter is placed to \c mi->interp_filters. In case + * \c reuse_inter_pred flag is set, this function also outputs + * \c this_mode_pred. Also \c this_early_temp is set if transform can be + * skipped + */ +static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc, + InterPredParams *inter_pred_params_sr, int mi_row, + int mi_col, PRED_BUFFER *tmp_buffer, + BLOCK_SIZE bsize, int reuse_inter_pred, + PRED_BUFFER **this_mode_pred, + int *this_early_term, unsigned int *var, + int use_model_yrd_large, int64_t best_sse, + int is_single_pred) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + MB_MODE_INFO *const mi = xd->mi[0]; + const int bw = block_size_wide[bsize]; + int dim_factor = + (cpi->sf.interp_sf.disable_dual_filter == 0) ? FILTER_SEARCH_SIZE : 1; + RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 }; + TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 }; + PRED_BUFFER *current_pred = *this_mode_pred; + int best_skip = 0; + int best_early_term = 0; + int64_t best_cost = INT64_MAX; + int best_filter_index = -1; + + SubpelParams subpel_params; + // Initialize inter prediction params at mode level for single reference + // mode. + if (is_single_pred) + init_inter_mode_params(&mi->mv[0].as_mv, inter_pred_params_sr, + &subpel_params, xd->block_ref_scale_factors[0], + pd->pre->width, pd->pre->height); + for (int filter_idx = 0; filter_idx < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE; + ++filter_idx) { + int64_t cost; + if (cpi->sf.interp_sf.disable_dual_filter && + filters_ref_set[filter_idx].as_filters.x_filter != + filters_ref_set[filter_idx].as_filters.y_filter) + continue; + + mi->interp_filters.as_int = filters_ref_set[filter_idx].as_int; + if (is_single_pred) + av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr, + &subpel_params); + else + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + unsigned int curr_var = UINT_MAX; + if (use_model_yrd_large) + model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, + &pf_rd_stats[filter_idx], this_early_term, 1, + best_sse, &curr_var, UINT_MAX); + else + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[filter_idx], &curr_var, + 1, NULL); + pf_rd_stats[filter_idx].rate += av1_get_switchable_rate( + x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter); + cost = RDCOST(x->rdmult, pf_rd_stats[filter_idx].rate, + pf_rd_stats[filter_idx].dist); + pf_tx_size[filter_idx] = mi->tx_size; + if (cost < best_cost) { + *var = curr_var; + best_filter_index = filter_idx; + best_cost = cost; + best_skip = pf_rd_stats[filter_idx].skip_txfm; + best_early_term = *this_early_term; + if (reuse_inter_pred) { + if (*this_mode_pred != current_pred) { + free_pred_buffer(*this_mode_pred); + *this_mode_pred = current_pred; + } + current_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; + pd->dst.buf = current_pred->data; + pd->dst.stride = bw; + } + } + } + assert(best_filter_index >= 0 && + best_filter_index < dim_factor * FILTER_SEARCH_SIZE); + if (reuse_inter_pred && *this_mode_pred != current_pred) + free_pred_buffer(current_pred); + + mi->interp_filters.as_int = filters_ref_set[best_filter_index].as_int; + mi->tx_size = pf_tx_size[best_filter_index]; + this_rdc->rate = pf_rd_stats[best_filter_index].rate; + this_rdc->dist = 
pf_rd_stats[best_filter_index].dist; + this_rdc->sse = pf_rd_stats[best_filter_index].sse; + this_rdc->skip_txfm = (best_skip || best_early_term); + *this_early_term = best_early_term; + if (reuse_inter_pred) { + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = (*this_mode_pred)->stride; + } else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) { + if (is_single_pred) + av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr, + &subpel_params); + else + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } +} +#if !CONFIG_REALTIME_ONLY + +static AOM_INLINE int is_warped_mode_allowed(const AV1_COMP *cpi, + MACROBLOCK *const x, + const MB_MODE_INFO *mbmi) { + const FeatureFlags *const features = &cpi->common.features; + const MACROBLOCKD *xd = &x->e_mbd; + + if (cpi->sf.inter_sf.extra_prune_warped) return 0; + if (has_second_ref(mbmi)) return 0; + MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; + + if (features->switchable_motion_mode) { + // Determine which motion modes to search if more than SIMPLE_TRANSLATION + // is allowed. + last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mbmi, features->allow_warped_motion); + } + + if (last_motion_mode_allowed == WARPED_CAUSAL) { + return 1; + } + + return 0; +} + +static void calc_num_proj_ref(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const FeatureFlags *const features = &cm->features; + + mi->num_proj_ref = 1; + WARP_SAMPLE_INFO *const warp_sample_info = + &x->warp_sample_info[mi->ref_frame[0]]; + int *pts0 = warp_sample_info->pts; + int *pts_inref0 = warp_sample_info->pts_inref; + MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; + + if (features->switchable_motion_mode) { + // Determine which motion modes to search if more than SIMPLE_TRANSLATION + // is allowed. + last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mi, features->allow_warped_motion); + } + + if (last_motion_mode_allowed == WARPED_CAUSAL) { + if (warp_sample_info->num < 0) { + warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0); + } + mi->num_proj_ref = warp_sample_info->num; + } +} + +static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *this_early_term, int use_model_yrd_large, + int *rate_mv, int64_t best_sse) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const FeatureFlags *const features = &cm->features; + MB_MODE_INFO *const mi = xd->mi[0]; + RD_STATS pf_rd_stats[MOTION_MODE_SEARCH_SIZE] = { 0 }; + int best_skip = 0; + int best_early_term = 0; + int64_t best_cost = INT64_MAX; + int best_mode_index = -1; + const int interp_filter = features->interp_filter; + + const MOTION_MODE motion_modes[MOTION_MODE_SEARCH_SIZE] = { + SIMPLE_TRANSLATION, WARPED_CAUSAL + }; + int mode_search_size = is_warped_mode_allowed(cpi, x, mi) ? 2 : 1; + + WARP_SAMPLE_INFO *const warp_sample_info = + &x->warp_sample_info[mi->ref_frame[0]]; + int *pts0 = warp_sample_info->pts; + int *pts_inref0 = warp_sample_info->pts_inref; + + const int total_samples = mi->num_proj_ref; + if (total_samples == 0) { + // Do not search WARPED_CAUSAL if there are no samples to use to determine + // warped parameters. 
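+ // motion_modes[] lists SIMPLE_TRANSLATION first, so a search size of 1
+ // keeps only the translational candidate.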
+ mode_search_size = 1; + } + + const MB_MODE_INFO base_mbmi = *mi; + MB_MODE_INFO best_mbmi; + + for (int mode_index = 0; mode_index < mode_search_size; ++mode_index) { + int64_t cost = INT64_MAX; + MOTION_MODE motion_mode = motion_modes[mode_index]; + *mi = base_mbmi; + mi->motion_mode = motion_mode; + if (motion_mode == SIMPLE_TRANSLATION) { + mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + if (use_model_yrd_large) + model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, + &pf_rd_stats[mode_index], this_early_term, 1, + best_sse, NULL, UINT_MAX); + else + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL, 1, + NULL); + pf_rd_stats[mode_index].rate += + av1_get_switchable_rate(x, xd, cm->features.interp_filter, + cm->seq_params->enable_dual_filter); + cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate, + pf_rd_stats[mode_index].dist); + } else if (motion_mode == WARPED_CAUSAL) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + const ModeCosts *mode_costs = &x->mode_costs; + mi->wm_params.wmtype = DEFAULT_WMTYPE; + mi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + // Select the samples according to motion vector difference + if (mi->num_proj_ref > 1) { + mi->num_proj_ref = av1_selectSamples(&mi->mv[0].as_mv, pts, pts_inref, + mi->num_proj_ref, bsize); + } + + // Compute the warped motion parameters with a least squares fit + // using the collected samples + if (!av1_find_projection(mi->num_proj_ref, pts, pts_inref, bsize, + mi->mv[0].as_mv.row, mi->mv[0].as_mv.col, + &mi->wm_params, mi_row, mi_col)) { + if (mi->mode == NEWMV) { + const int_mv mv0 = mi->mv[0]; + const WarpedMotionParams wm_params0 = mi->wm_params; + const int num_proj_ref0 = mi->num_proj_ref; + + const int_mv ref_mv = av1_get_ref_mv(x, 0); + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv.as_mv, NULL); + + // Refine MV in a small range. + av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0, + total_samples, cpi->sf.mv_sf.warp_search_method, + cpi->sf.mv_sf.warp_search_iters); + if (mi->mv[0].as_int == ref_mv.as_int) { + continue; + } + + if (mv0.as_int != mi->mv[0].as_int) { + // Keep the refined MV and WM parameters. + int tmp_rate_mv = av1_mv_bit_cost( + &mi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + *rate_mv = tmp_rate_mv; + } else { + // Restore the old MV and WM parameters. 
+ mi->mv[0] = mv0; + mi->wm_params = wm_params0; + mi->num_proj_ref = num_proj_ref0; + } + } + // Build the warped predictor + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, av1_num_planes(cm) - 1); + if (use_model_yrd_large) + model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, + &pf_rd_stats[mode_index], this_early_term, + 1, best_sse, NULL, UINT_MAX); + else + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL, + 1, NULL); + + pf_rd_stats[mode_index].rate += + mode_costs->motion_mode_cost[bsize][mi->motion_mode]; + cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate, + pf_rd_stats[mode_index].dist); + } else { + cost = INT64_MAX; + } + } + if (cost < best_cost) { + best_mode_index = mode_index; + best_cost = cost; + best_skip = pf_rd_stats[mode_index].skip_txfm; + best_early_term = *this_early_term; + best_mbmi = *mi; + } + } + assert(best_mode_index >= 0 && best_mode_index < FILTER_SEARCH_SIZE); + + *mi = best_mbmi; + this_rdc->rate = pf_rd_stats[best_mode_index].rate; + this_rdc->dist = pf_rd_stats[best_mode_index].dist; + this_rdc->sse = pf_rd_stats[best_mode_index].sse; + this_rdc->skip_txfm = (best_skip || best_early_term); + *this_early_term = best_early_term; + if (best_mode_index < FILTER_SEARCH_SIZE - 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } +} +#endif // !CONFIG_REALTIME_ONLY + +#define COLLECT_NON_SQR_STAT 0 + +#if COLLECT_NONRD_PICK_MODE_STAT + +static AOM_INLINE void print_stage_time(const char *stage_name, + int64_t stage_time, + int64_t total_time) { + printf(" %s: %ld (%f%%)\n", stage_name, stage_time, + 100 * stage_time / (float)total_time); +} + +static void print_time(const mode_search_stat_nonrd *const ms_stat, + BLOCK_SIZE bsize, int mi_rows, int mi_cols, int mi_row, + int mi_col) { + if ((mi_row + mi_size_high[bsize] >= mi_rows) && + (mi_col + mi_size_wide[bsize] >= mi_cols)) { + int64_t total_time = 0l; + int32_t total_blocks = 0; + for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) { + total_time += ms_stat->total_block_times[bs]; + total_blocks += ms_stat->num_blocks[bs]; + } + + printf("\n"); + for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) { + if (ms_stat->num_blocks[bs] == 0) { + continue; + } + if (!COLLECT_NON_SQR_STAT && block_size_wide[bs] != block_size_high[bs]) { + continue; + } + + printf("BLOCK_%dX%d Num %d, Time: %ld (%f%%), Avg_time %f:\n", + block_size_wide[bs], block_size_high[bs], ms_stat->num_blocks[bs], + ms_stat->total_block_times[bs], + 100 * ms_stat->total_block_times[bs] / (float)total_time, + (float)ms_stat->total_block_times[bs] / ms_stat->num_blocks[bs]); + for (int j = 0; j < MB_MODE_COUNT; j++) { + if (ms_stat->nonskipped_search_times[bs][j] == 0) { + continue; + } + + int64_t total_mode_time = ms_stat->nonskipped_search_times[bs][j]; + printf(" Mode %d, %d/%d tps %f\n", j, + ms_stat->num_nonskipped_searches[bs][j], + ms_stat->num_searches[bs][j], + ms_stat->num_nonskipped_searches[bs][j] > 0 + ? 
(float)ms_stat->nonskipped_search_times[bs][j] / + ms_stat->num_nonskipped_searches[bs][j] + : 0l); + if (j >= INTER_MODE_START) { + total_mode_time = ms_stat->ms_time[bs][j] + ms_stat->ifs_time[bs][j] + + ms_stat->model_rd_time[bs][j] + + ms_stat->txfm_time[bs][j]; + print_stage_time("Motion Search Time", ms_stat->ms_time[bs][j], + total_time); + print_stage_time("Filter Search Time", ms_stat->ifs_time[bs][j], + total_time); + print_stage_time("Model RD Time", ms_stat->model_rd_time[bs][j], + total_time); + print_stage_time("Tranfm Search Time", ms_stat->txfm_time[bs][j], + total_time); + } + print_stage_time("Total Mode Time", total_mode_time, total_time); + } + printf("\n"); + } + printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks); + } +} +#endif // COLLECT_NONRD_PICK_MODE_STAT + +static bool should_prune_intra_modes_using_neighbors( + const MACROBLOCKD *xd, bool enable_intra_mode_pruning_using_neighbors, + PREDICTION_MODE this_mode, PREDICTION_MODE above_mode, + PREDICTION_MODE left_mode) { + if (!enable_intra_mode_pruning_using_neighbors) return false; + + // Avoid pruning of DC_PRED as it is the most probable mode to win as per the + // statistics generated for nonrd intra mode evaluations. + if (this_mode == DC_PRED) return false; + + // Enable the pruning for current mode only if it is not the winner mode of + // both the neighboring blocks (left/top). + return xd->up_available && this_mode != above_mode && xd->left_available && + this_mode != left_mode; +} + +void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + RD_STATS this_rdc, best_rdc; + struct estimate_block_intra_args args; + init_estimate_block_intra_args(&args, cpi, x); + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + mi->tx_size = + AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]); + assert(IMPLIES(xd->lossless[mi->segment_id], mi->tx_size == TX_4X4)); + const BLOCK_SIZE tx_bsize = txsize_to_bsize[mi->tx_size]; + + // If the current block size is the same as the transform block size, enable + // mode pruning based on the best SAD so far. + if (cpi->sf.rt_sf.prune_intra_mode_using_best_sad_so_far && bsize == tx_bsize) + args.prune_mode_based_on_sad = true; + + int *bmode_costs; + PREDICTION_MODE best_mode = DC_PRED; + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const PREDICTION_MODE A = av1_above_block_mode(above_mi); + const PREDICTION_MODE L = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[A]; + const int left_ctx = intra_mode_context[L]; + const unsigned int source_variance = x->source_variance; + bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx]; + + av1_invalid_rd_stats(&best_rdc); + av1_invalid_rd_stats(&this_rdc); + + init_mbmi_nonrd(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm); + mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV; + + // Change the limit of this loop to add other intra prediction + // mode tests. 
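+ // Only the reduced intra set in intra_mode_list[] is evaluated here; the
+ // checks inside the loop may further prune V_PRED, H_PRED or SMOOTH_PRED
+ // based on the neighboring blocks' modes, the best mode so far and the
+ // source variance.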
+ for (int mode_index = 0; mode_index < RTC_INTRA_MODES; ++mode_index) { + PREDICTION_MODE this_mode = intra_mode_list[mode_index]; + + // As per the statistics generated for intra mode evaluation in the nonrd + // path, it is found that the probability of H_PRED mode being the winner is + // very low when the best mode so far is V_PRED (out of DC_PRED and V_PRED). + // If V_PRED is the winner mode out of DC_PRED and V_PRED, it could imply + // the presence of a vertically dominant pattern. Hence, H_PRED mode is not + // evaluated. + if (cpi->sf.rt_sf.prune_h_pred_using_best_mode_so_far && + this_mode == H_PRED && best_mode == V_PRED) + continue; + + if (should_prune_intra_modes_using_neighbors( + xd, cpi->sf.rt_sf.enable_intra_mode_pruning_using_neighbors, + this_mode, A, L)) { + // Prune V_PRED and H_PRED if source variance of the block is less than + // or equal to 50. The source variance threshold is obtained empirically. + if ((this_mode == V_PRED || this_mode == H_PRED) && source_variance <= 50) + continue; + + // As per the statistics, probability of SMOOTH_PRED being the winner is + // low when best mode so far is DC_PRED (out of DC_PRED, V_PRED and + // H_PRED). Hence, SMOOTH_PRED mode is not evaluated. + if (best_mode == DC_PRED && this_mode == SMOOTH_PRED) continue; + } + + this_rdc.dist = this_rdc.rate = 0; + args.mode = this_mode; + args.skippable = 1; + args.rdc = &this_rdc; + mi->mode = this_mode; + av1_foreach_transformed_block_in_plane(xd, bsize, AOM_PLANE_Y, + av1_estimate_block_intra, &args); + + if (this_rdc.rate == INT_MAX) continue; + + const int skip_ctx = av1_get_skip_txfm_context(xd); + if (args.skippable) { + this_rdc.rate = x->mode_costs.skip_txfm_cost[skip_ctx][1]; + } else { + this_rdc.rate += x->mode_costs.skip_txfm_cost[skip_ctx][0]; + } + this_rdc.rate += bmode_costs[this_mode]; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + best_mode = this_mode; + if (!this_rdc.skip_txfm) { + memset(ctx->blk_skip, 0, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + } + } + } + + mi->mode = best_mode; + // Keep DC for UV since mode test is based on Y channel only. + mi->uv_mode = UV_DC_PRED; + *rd_cost = best_rdc; + + // For lossless: always force the skip flags off. + // Even though the blk_skip is set to 0 above in the rdcost comparison, + // do it here again in case the above logic changes. 
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { + x->txfm_search_info.skip_txfm = 0; + memset(ctx->blk_skip, 0, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + } + +#if CONFIG_INTERNAL_STATS + store_coding_context_nonrd(x, ctx, mi->mode); +#else + store_coding_context_nonrd(x, ctx); +#endif // CONFIG_INTERNAL_STATS +} + +static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) { + struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME); + struct scale_factors *const sf_golden = + get_ref_scale_factors(cm, GOLDEN_FRAME); + return ((sf_last->x_scale_fp == sf_golden->x_scale_fp) && + (sf_last->y_scale_fp == sf_golden->y_scale_fp)); +} + +static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x, + MB_MODE_INFO *mi, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int gf_temporal_ref, + int use_ref_frame[], + int *force_skip_low_temp_var) { + AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); + + // When the ref_frame_config is used to set the reference frame structure + // then the usage of alt_ref is determined by the ref_frame_flags + // (and not the speed feature use_nonrd_altref_frame). + int use_alt_ref_frame = cpi->ppi->rtc_ref.set_ref_frame_config || + cpi->sf.rt_sf.use_nonrd_altref_frame; + + int use_golden_ref_frame = 1; + int use_last_ref_frame = 1; + + // When the ref_frame_config is used to set the reference frame structure: + // check if LAST is used as a reference. And only remove golden and altref + // references below if last is used as a reference. + if (cpi->ppi->rtc_ref.set_ref_frame_config) + use_last_ref_frame = + cpi->ref_frame_flags & AOM_LAST_FLAG ? use_last_ref_frame : 0; + + // frame_since_golden is not used when user sets the referene structure. + if (!cpi->ppi->rtc_ref.set_ref_frame_config && use_last_ref_frame && + cpi->rc.frames_since_golden == 0 && gf_temporal_ref) { + use_golden_ref_frame = 0; + } + + if (use_last_ref_frame && cpi->sf.rt_sf.short_circuit_low_temp_var && + x->nonrd_prune_ref_frame_search) { + if (is_small_sb) + *force_skip_low_temp_var = av1_get_force_skip_low_temp_var_small_sb( + &x->part_search_info.variance_low[0], mi_row, mi_col, bsize); + else + *force_skip_low_temp_var = av1_get_force_skip_low_temp_var( + &x->part_search_info.variance_low[0], mi_row, mi_col, bsize); + // If force_skip_low_temp_var is set, skip golden reference. + if (*force_skip_low_temp_var) { + use_golden_ref_frame = 0; + use_alt_ref_frame = 0; + } + } + + if (use_last_ref_frame && + (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip_for_blk || + (x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) { + use_golden_ref_frame = 0; + use_alt_ref_frame = 0; + } + + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) { + use_golden_ref_frame = 1; + use_alt_ref_frame = 0; + } + + // Skip golden/altref reference if color is set, on flat blocks with motion. + // For screen: always skip golden/alt (if color_sensitivity_sb_g/alt is set) + // except when x->nonrd_prune_ref_frame_search = 0. This latter flag + // may be set in the variance partition when golden is a much better + // reference than last, in which case it may not be worth skipping + // golden/altref completely. + // Condition on use_last_ref to make sure there remains at least one + // reference. 
+ if (use_last_ref_frame &&
+ ((cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ x->nonrd_prune_ref_frame_search != 0) ||
+ (x->source_variance < 200 &&
+ x->content_state_sb.source_sad_nonrd >= kLowSad))) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ use_golden_ref_frame = 0;
+ if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ use_alt_ref_frame = 0;
+ }
+
+ // For non-screen: if golden and altref are not being selected as references
+ // (use_golden_ref_frame/use_alt_ref_frame = 0), check to allow golden back
+ // based on the sad of nearest/nearmv of LAST ref. If this block sad is large,
+ // keep golden as reference. Only do this for the aggressive pruning mode and
+ // avoid it when color is set for golden reference.
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ (cpi->ref_frame_flags & AOM_LAST_FLAG) && !use_golden_ref_frame &&
+ !use_alt_ref_frame && x->pred_mv_sad[LAST_FRAME] != INT_MAX &&
+ x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) {
+ int thr = (cm->width * cm->height > RESOLUTION_288P) ? 100 : 150;
+ int pred = x->pred_mv_sad[LAST_FRAME] >>
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (pred > thr) use_golden_ref_frame = 1;
+ }
+
+ use_alt_ref_frame =
+ cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0;
+ use_golden_ref_frame =
+ cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0;
+
+ // For spatial layers: enable golden ref if it is set by user and
+ // corresponds to the lower spatial layer.
+ if (cpi->svc.spatial_layer_id > 0 && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
+ x->content_state_sb.source_sad_nonrd < kHighSad) {
+ const int buffslot_golden =
+ cpi->ppi->rtc_ref.ref_idx[GOLDEN_FRAME - LAST_FRAME];
+ if (cpi->ppi->rtc_ref.buffer_time_index[buffslot_golden] ==
+ cpi->svc.current_superframe)
+ use_golden_ref_frame = 1;
+ }
+
+ use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
+ use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
+ use_ref_frame[LAST_FRAME] = use_last_ref_frame;
+ // Keep this assert on, as only 3 references are used in nonrd_pickmode
+ // (LAST, GOLDEN, ALTREF), and if all 3 are not set by user then this
+ // frame must be an intra-only frame and hence should never enter the
+ // pickmode here for inter frames.
+ assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame); +} + +static AOM_INLINE int is_filter_search_enabled_blk( + AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize, + int segment_id, int cb_pred_filter_search, InterpFilter *filt_select) { + const AV1_COMMON *const cm = &cpi->common; + // filt search disabled + if (!cpi->sf.rt_sf.use_nonrd_filter_search) return 0; + // filt search purely based on mode properties + if (!cb_pred_filter_search) return 1; + MACROBLOCKD *const xd = &x->e_mbd; + int enable_interp_search = 0; + if (!(xd->left_mbmi && xd->above_mbmi)) { + // neighbors info unavailable + enable_interp_search = 2; + } else if (!(is_inter_block(xd->left_mbmi) && + is_inter_block(xd->above_mbmi))) { + // neighbor is INTRA + enable_interp_search = 2; + } else if (xd->left_mbmi->interp_filters.as_int != + xd->above_mbmi->interp_filters.as_int) { + // filters are different + enable_interp_search = 2; + } else if ((cb_pred_filter_search == 1) && + (xd->left_mbmi->interp_filters.as_filters.x_filter != + EIGHTTAP_REGULAR)) { + // not regular + enable_interp_search = 2; + } else { + // enable prediction based on chessboard pattern + if (xd->left_mbmi->interp_filters.as_filters.x_filter == EIGHTTAP_SMOOTH) + *filt_select = EIGHTTAP_SMOOTH; + const int bsl = mi_size_wide_log2[bsize]; + enable_interp_search = + (bool)((((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1); + if (cyclic_refresh_segment_id_boosted(segment_id)) enable_interp_search = 1; + } + return enable_interp_search; +} + +static AOM_INLINE int skip_mode_by_threshold( + PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv, + int frames_since_golden, const int *const rd_threshes, + const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip, + int extra_shift) { + int skip_this_mode = 0; + const THR_MODES mode_index = mode_idx[ref_frame][INTER_OFFSET(mode)]; + int64_t mode_rd_thresh = + best_skip ? ((int64_t)rd_threshes[mode_index]) << (extra_shift + 1) + : ((int64_t)rd_threshes[mode_index]) << extra_shift; + + // Increase mode_rd_thresh value for non-LAST for improved encoding + // speed + if (ref_frame != LAST_FRAME) { + mode_rd_thresh = mode_rd_thresh << 1; + if (ref_frame == GOLDEN_FRAME && frames_since_golden > 4) + mode_rd_thresh = mode_rd_thresh << (extra_shift + 1); + } + + if (rd_less_than_thresh(best_cost, mode_rd_thresh, + rd_thresh_freq_fact[mode_index])) + if (mv.as_int != 0) skip_this_mode = 1; + + return skip_this_mode; +} + +static AOM_INLINE int skip_mode_by_low_temp( + PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, + CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) { + // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var + // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped + // later. 
+ if (force_skip_low_temp_var && ref_frame != LAST_FRAME && mv.as_int != 0) { + return 1; + } + + if (content_state_sb.source_sad_nonrd != kHighSad && bsize >= BLOCK_64X64 && + force_skip_low_temp_var && mode == NEWMV) { + return 1; + } + return 0; +} + +static AOM_INLINE int skip_mode_by_bsize_and_ref_frame( + PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize, + int extra_prune, unsigned int sse_zeromv_norm, int more_prune) { + const unsigned int thresh_skip_golden = 500; + + if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden && + mode == NEWMV) + return 1; + + if (bsize == BLOCK_128X128 && mode == NEWMV) return 1; + + // Skip testing non-LAST if this flag is set. + if (extra_prune) { + if (extra_prune > 1 && ref_frame != LAST_FRAME && + (bsize > BLOCK_16X16 && mode == NEWMV)) + return 1; + + if (ref_frame != LAST_FRAME && mode == NEARMV) return 1; + + if (more_prune && bsize >= BLOCK_32X32 && mode == NEARMV) return 1; + } + return 0; +} + +static void set_block_source_sad(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + struct buf_2d *yv12_mb) { + struct macroblock_plane *const p = &x->plane[0]; + const int y_sad = cpi->ppi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride, + yv12_mb->buf, yv12_mb->stride); + if (y_sad == 0) x->block_is_zero_sad = 1; +} + +static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int y_sad, + unsigned int source_variance, + struct buf_2d yv12_mb[MAX_MB_PLANE]) { + const int subsampling_x = cpi->common.seq_params->subsampling_x; + const int subsampling_y = cpi->common.seq_params->subsampling_y; + const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; + const int high_res = cpi->common.width * cpi->common.height >= 640 * 360; + if (bsize == cpi->common.seq_params->sb_size) { + // At superblock level color_sensitivity is already set to 0, 1, or 2. + // 2 is middle/uncertain level. To avoid additional sad + // computations when bsize = sb_size force level 2 to 1 (certain color) + // for motion areas. + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 2) { + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = + source_sad_nonrd >= kMedSad ? 1 : 0; + } + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 2) { + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = + source_sad_nonrd >= kMedSad ? 1 : 0; + } + return; + } + int shift = 3; + unsigned int source_var_thr = 50; + int uv_sad_thr = 100; + if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res) + shift = 4; + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + if (cpi->rc.high_source_sad) shift = 6; + if (source_sad_nonrd > kMedSad) { + source_var_thr = 1200; + uv_sad_thr = 10; + } + } + NOISE_LEVEL noise_level = kLow; + int norm_sad = + y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + unsigned int thresh_spatial = (cpi->common.width > 1920) ? 5000 : 1000; + // If the spatial source variance is high and the normalized y_sad + // is low, then y-channel is likely good for mode estimation, so keep + // color_sensitivity off. For low noise content for now, since there is + // some bdrate regression for noisy color clip. 
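+ // norm_sad is y_sad per 4x4 unit: a 32x32 block has bw = bh = 3, so the
+ // shift of 6 divides by its 64 4x4 sub-blocks, and the "< 50" check below
+ // is roughly an average error of 3 per pixel.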
+ if (cpi->noise_estimate.enabled) + noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate); + if (noise_level == kLow && source_variance > thresh_spatial && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && norm_sad < 50) { + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 0; + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 0; + return; + } + const int num_planes = av1_num_planes(&cpi->common); + + for (int plane = AOM_PLANE_U; plane < num_planes; ++plane) { + // Always check if level = 2. If level = 0 check again for + // motion areas for higher resolns, where color artifacts + // are more noticeable. + if (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 2 || + (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 0 && + source_sad_nonrd >= kMedSad && high_res)) { + struct macroblock_plane *const p = &x->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + + const int uv_sad = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride); + + const int norm_uv_sad = + uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]); + x->color_sensitivity[COLOR_SENS_IDX(plane)] = + uv_sad > (y_sad >> shift) && norm_uv_sad > 40; + if (source_variance < source_var_thr && norm_uv_sad > uv_sad_thr) + x->color_sensitivity[COLOR_SENS_IDX(plane)] = 1; + } + } +} + +static void setup_compound_prediction(const AV1_COMMON *cm, MACROBLOCK *x, + struct buf_2d yv12_mb[8][MAX_MB_PLANE], + const int *use_ref_frame_mask, + const MV_REFERENCE_FRAME *rf, + int *ref_mv_idx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + MV_REFERENCE_FRAME ref_frame_comp; + if (!use_ref_frame_mask[rf[1]]) { + // Need to setup pred_block, if it hasn't been done in find_predictors. 
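+ // find_predictors() is only called for references enabled in
+ // use_ref_frame_mask, so the second reference of a compound pair may not
+ // have had its prediction buffers set up yet.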
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, rf[1]); + const int num_planes = av1_num_planes(cm); + if (yv12 != NULL) { + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, rf[1]); + av1_setup_pred_block(xd, yv12_mb[rf[1]], yv12, sf, sf, num_planes); + } + } + ref_frame_comp = av1_ref_frame_type(rf); + mbmi_ext->mode_context[ref_frame_comp] = 0; + mbmi_ext->ref_mv_count[ref_frame_comp] = UINT8_MAX; + av1_find_mv_refs(cm, xd, mbmi, ref_frame_comp, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_comp); + *ref_mv_idx = mbmi->ref_mv_idx + 1; +} + +static void set_compound_mode(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME ref_frame2, int ref_mv_idx, + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], + PREDICTION_MODE this_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + mi->ref_frame[0] = ref_frame; + mi->ref_frame[1] = ref_frame2; + mi->compound_idx = 1; + mi->comp_group_idx = 0; + mi->interinter_comp.type = COMPOUND_AVERAGE; + MV_REFERENCE_FRAME ref_frame_comp = av1_ref_frame_type(mi->ref_frame); + if (this_mode == GLOBAL_GLOBALMV) { + frame_mv[this_mode][ref_frame].as_int = 0; + frame_mv[this_mode][ref_frame2].as_int = 0; + } else if (this_mode == NEAREST_NEARESTMV) { + frame_mv[this_mode][ref_frame].as_int = + xd->ref_mv_stack[ref_frame_comp][0].this_mv.as_int; + frame_mv[this_mode][ref_frame2].as_int = + xd->ref_mv_stack[ref_frame_comp][0].comp_mv.as_int; + } else if (this_mode == NEAR_NEARMV) { + frame_mv[this_mode][ref_frame].as_int = + xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].this_mv.as_int; + frame_mv[this_mode][ref_frame2].as_int = + xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].comp_mv.as_int; + } +} + +// Prune compound mode if the single mode variance is lower than a fixed +// percentage of the median value. +static bool skip_comp_based_on_var( + const unsigned int (*single_vars)[REF_FRAMES], BLOCK_SIZE bsize) { + unsigned int best_var = UINT_MAX; + for (int cur_mode_idx = 0; cur_mode_idx < RTC_INTER_MODES; cur_mode_idx++) { + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + best_var = AOMMIN(best_var, single_vars[cur_mode_idx][ref_idx]); + } + } + const unsigned int thresh_64 = (unsigned int)(0.57356805f * 8659); + const unsigned int thresh_32 = (unsigned int)(0.23964763f * 4281); + + // Currently, the thresh for 128 and 16 are not well-tuned. We are using the + // results from 64 and 32 as an heuristic. 
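+ // Below, the 128X128 threshold reuses the tuned 64X64 value scaled by 4x,
+ // and the 16X16 threshold reuses the 32X32 value scaled by 1/4x.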
+ switch (bsize) { + case BLOCK_128X128: return best_var < 4 * thresh_64; + case BLOCK_64X64: return best_var < thresh_64; + case BLOCK_32X32: return best_var < thresh_32; + case BLOCK_16X16: return best_var < thresh_32 / 4; + default: return false; + } +} + +static AOM_FORCE_INLINE void fill_single_inter_mode_costs( + int (*single_inter_mode_costs)[REF_FRAMES], int num_inter_modes, + const REF_MODE *reference_mode_set, const ModeCosts *mode_costs, + const int16_t *mode_context) { + bool ref_frame_used[REF_FRAMES] = { false }; + for (int idx = 0; idx < num_inter_modes; idx++) { + ref_frame_used[reference_mode_set[idx].ref_frame] = true; + } + + for (int this_ref_frame = LAST_FRAME; this_ref_frame < REF_FRAMES; + this_ref_frame++) { + if (!ref_frame_used[this_ref_frame]) { + continue; + } + + const MV_REFERENCE_FRAME rf[2] = { this_ref_frame, NONE_FRAME }; + const int16_t mode_ctx = av1_mode_context_analyzer(mode_context, rf); + for (PREDICTION_MODE this_mode = NEARESTMV; this_mode <= NEWMV; + this_mode++) { + single_inter_mode_costs[INTER_OFFSET(this_mode)][this_ref_frame] = + cost_mv_ref(mode_costs, this_mode, mode_ctx); + } + } +} + +static AOM_INLINE bool is_globalmv_better( + PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, int rate_mv, + const ModeCosts *mode_costs, + const int (*single_inter_mode_costs)[REF_FRAMES], + const MB_MODE_INFO_EXT *mbmi_ext) { + const int globalmv_mode_cost = + single_inter_mode_costs[INTER_OFFSET(GLOBALMV)][ref_frame]; + int this_mode_cost = + rate_mv + single_inter_mode_costs[INTER_OFFSET(this_mode)][ref_frame]; + if (this_mode == NEWMV || this_mode == NEARMV) { + const MV_REFERENCE_FRAME rf[2] = { ref_frame, NONE_FRAME }; + this_mode_cost += get_drl_cost( + NEWMV, 0, mbmi_ext, mode_costs->drl_mode_cost0, av1_ref_frame_type(rf)); + } + return this_mode_cost > globalmv_mode_cost; +} + +// Set up the mv/ref_frames etc based on the comp_index. Returns 1 if it +// succeeds, 0 if it fails. 
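+// In the non-rd path the compound candidates always use LAST_FRAME as the
+// first reference (see the asserts below), paired with GOLDEN, LAST2 or
+// ALTREF, and only GLOBAL_GLOBALMV and NEAREST_NEARESTMV are considered.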
+static AOM_INLINE int setup_compound_params_from_comp_idx( + const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[8][MAX_MB_PLANE], + PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *ref_frame, + MV_REFERENCE_FRAME *ref_frame2, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], + const int *use_ref_frame_mask, int comp_index, + bool comp_use_zero_zeromv_only, MV_REFERENCE_FRAME *last_comp_ref_frame, + BLOCK_SIZE bsize) { + const MV_REFERENCE_FRAME *rf = comp_ref_mode_set[comp_index].ref_frame; + int skip_gf = 0; + int skip_alt = 0; + *this_mode = comp_ref_mode_set[comp_index].pred_mode; + *ref_frame = rf[0]; + *ref_frame2 = rf[1]; + assert(*ref_frame == LAST_FRAME); + assert(*this_mode == GLOBAL_GLOBALMV || *this_mode == NEAREST_NEARESTMV); + if (x->source_variance < 50 && bsize > BLOCK_16X16) { + if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || + x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) + skip_gf = 1; + if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 || + x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) + skip_alt = 1; + } + if (comp_use_zero_zeromv_only && *this_mode != GLOBAL_GLOBALMV) { + return 0; + } + if (*ref_frame2 == GOLDEN_FRAME && + (cpi->sf.rt_sf.ref_frame_comp_nonrd[0] == 0 || skip_gf || + !(cpi->ref_frame_flags & AOM_GOLD_FLAG))) { + return 0; + } else if (*ref_frame2 == LAST2_FRAME && + (cpi->sf.rt_sf.ref_frame_comp_nonrd[1] == 0 || + !(cpi->ref_frame_flags & AOM_LAST2_FLAG))) { + return 0; + } else if (*ref_frame2 == ALTREF_FRAME && + (cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 0 || skip_alt || + !(cpi->ref_frame_flags & AOM_ALT_FLAG))) { + return 0; + } + int ref_mv_idx = 0; + if (*last_comp_ref_frame != rf[1]) { + // Only needs to be done once per reference pair. + setup_compound_prediction(&cpi->common, x, yv12_mb, use_ref_frame_mask, rf, + &ref_mv_idx); + *last_comp_ref_frame = rf[1]; + } + set_compound_mode(x, *ref_frame, *ref_frame2, ref_mv_idx, frame_mv, + *this_mode); + if (*this_mode != GLOBAL_GLOBALMV && + frame_mv[*this_mode][*ref_frame].as_int == 0 && + frame_mv[*this_mode][*ref_frame2].as_int == 0) { + return 0; + } + + return 1; +} + +static AOM_INLINE bool previous_mode_performed_poorly( + PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, + const unsigned int (*vars)[REF_FRAMES], + const int64_t (*uv_dist)[REF_FRAMES]) { + unsigned int best_var = UINT_MAX; + int64_t best_uv_dist = INT64_MAX; + for (int midx = 0; midx < RTC_INTER_MODES; midx++) { + best_var = AOMMIN(best_var, vars[midx][ref_frame]); + best_uv_dist = AOMMIN(best_uv_dist, uv_dist[midx][ref_frame]); + } + assert(best_var != UINT_MAX && "Invalid variance data."); + const float mult = 1.125f; + bool var_bad = mult * best_var < vars[INTER_OFFSET(mode)][ref_frame]; + if (uv_dist[INTER_OFFSET(mode)][ref_frame] < INT64_MAX && + best_uv_dist != uv_dist[INTER_OFFSET(mode)][ref_frame]) { + // If we have chroma info, then take it into account + var_bad &= mult * best_uv_dist < uv_dist[INTER_OFFSET(mode)][ref_frame]; + } + return var_bad; +} + +static AOM_INLINE bool prune_compoundmode_with_singlemode_var( + PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES], + const uint8_t (*mode_checked)[REF_FRAMES], + const unsigned int (*vars)[REF_FRAMES], + const int64_t (*uv_dist)[REF_FRAMES]) { + const PREDICTION_MODE single_mode0 = compound_ref0_mode(compound_mode); + const PREDICTION_MODE single_mode1 = compound_ref1_mode(compound_mode); + + bool first_ref_valid = 
false, second_ref_valid = false; + bool first_ref_bad = false, second_ref_bad = false; + if (mode_checked[single_mode0][ref_frame] && + frame_mv[single_mode0][ref_frame].as_int == + frame_mv[compound_mode][ref_frame].as_int && + vars[INTER_OFFSET(single_mode0)][ref_frame] < UINT_MAX) { + first_ref_valid = true; + first_ref_bad = + previous_mode_performed_poorly(single_mode0, ref_frame, vars, uv_dist); + } + if (mode_checked[single_mode1][ref_frame2] && + frame_mv[single_mode1][ref_frame2].as_int == + frame_mv[compound_mode][ref_frame2].as_int && + vars[INTER_OFFSET(single_mode1)][ref_frame2] < UINT_MAX) { + second_ref_valid = true; + second_ref_bad = + previous_mode_performed_poorly(single_mode1, ref_frame2, vars, uv_dist); + } + if (first_ref_valid && second_ref_valid) { + return first_ref_bad && second_ref_bad; + } else if (first_ref_valid || second_ref_valid) { + return first_ref_bad || second_ref_bad; + } + return false; +} + +// Function to setup parameters used for inter mode evaluation in non-rd. +static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode( + AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state, + RD_STATS *rd_cost, int *force_skip_low_temp_var, int mi_row, int mi_col, + int gf_temporal_ref, unsigned char segment_id, BLOCK_SIZE bsize +#if CONFIG_AV1_TEMPORAL_DENOISING + , + PICK_MODE_CONTEXT *ctx, int denoise_svc_pickmode +#endif +) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + MB_MODE_INFO *const mi = xd->mi[0]; + const ModeCosts *mode_costs = &x->mode_costs; + int skip_pred_mv = 0; + + // Initialize variance and distortion (chroma) for all modes and reference + // frames + for (int idx = 0; idx < RTC_INTER_MODES; idx++) { + for (int ref = 0; ref < REF_FRAMES; ref++) { + search_state->vars[idx][ref] = UINT_MAX; + search_state->uv_dist[idx][ref] = INT64_MAX; + } + } + + // Initialize values of color sensitivity with sb level color sensitivity + av1_copy(x->color_sensitivity, x->color_sensitivity_sb); + + init_best_pickmode(&search_state->best_pickmode); + + // Estimate cost for single reference frames + estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id, bsize, + search_state->ref_costs_single); + + // Reset flag to indicate modes evaluated + av1_zero(search_state->mode_checked); + + txfm_info->skip_txfm = 0; + + // Initialize mode decisions + av1_invalid_rd_stats(&search_state->best_rdc); + av1_invalid_rd_stats(&search_state->this_rdc); + av1_invalid_rd_stats(rd_cost); + for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) { + x->warp_sample_info[ref_idx].num = -1; + } + + mi->bsize = bsize; + mi->ref_frame[0] = NONE_FRAME; + mi->ref_frame[1] = NONE_FRAME; + +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + // if (cpi->ppi->use_svc) denoise_svc_pickmode = + // av1_denoise_svc_non_key(cpi); + if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) + av1_denoiser_reset_frame_stats(ctx); + } +#endif + + // Populate predicated motion vectors for LAST_FRAME + if (cpi->ref_frame_flags & AOM_LAST_FLAG) { + find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv, + search_state->yv12_mb, bsize, *force_skip_low_temp_var, + x->force_zeromv_skip_for_blk, + &search_state->use_scaled_ref_frame[LAST_FRAME]); + } + // Update mask to use all reference frame + get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref, + search_state->use_ref_frame_mask, + force_skip_low_temp_var); + + skip_pred_mv = 
x->force_zeromv_skip_for_blk || + (x->nonrd_prune_ref_frame_search > 2 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2); + + // Populate predicated motion vectors for other single reference frame + // Start at LAST_FRAME + 1. + for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME + 1; + ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) { + if (search_state->use_ref_frame_mask[ref_frame_iter]) { + find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv, + search_state->yv12_mb, bsize, *force_skip_low_temp_var, + skip_pred_mv, + &search_state->use_scaled_ref_frame[ref_frame_iter]); + } + } +} + +// Function to check the inter mode can be skipped based on mode statistics and +// speed features settings. +static AOM_FORCE_INLINE bool skip_inter_mode_nonrd( + AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state, + int64_t *thresh_sad_pred, int *force_mv_inter_layer, int *is_single_pred, + PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *last_comp_ref_frame, + MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int idx, + int_mv svc_mv, int force_skip_low_temp_var, unsigned int sse_zeromv_norm, + int num_inter_modes, unsigned char segment_id, BLOCK_SIZE bsize, + bool comp_use_zero_zeromv_only, bool check_globalmv) { + AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + const SVC *const svc = &cpi->svc; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; + + // Skip compound mode based on reference frame mask and type of the mode and + // for allowed compound modes, setup ref mv stack and reference frame. + if (idx >= num_inter_modes) { + const int comp_index = idx - num_inter_modes; + if (!setup_compound_params_from_comp_idx( + cpi, x, search_state->yv12_mb, this_mode, ref_frame, ref_frame2, + search_state->frame_mv, search_state->use_ref_frame_mask, + comp_index, comp_use_zero_zeromv_only, last_comp_ref_frame, + bsize)) { + return true; + } + *is_single_pred = 0; + } else { + *this_mode = ref_mode_set[idx].pred_mode; + *ref_frame = ref_mode_set[idx].ref_frame; + *ref_frame2 = NONE_FRAME; + } + + if (x->sb_me_block && *ref_frame == LAST_FRAME) { + // We want to make sure to test the superblock MV: + // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they + // have this sb MV. And don't skip NEWMV_LAST: this will be set to + // sb MV in handle_inter_mode_nonrd(), in case NEAREST or NEAR don't + // have it. + if (*this_mode == NEARESTMV && + search_state->frame_mv[NEARESTMV][LAST_FRAME].as_int == + x->sb_me_mv.as_int) { + return false; + } + if (*this_mode == NEARMV && + search_state->frame_mv[NEARMV][LAST_FRAME].as_int == + x->sb_me_mv.as_int) { + return false; + } + if (*this_mode == NEWMV) { + return false; + } + } + + // Skip the single reference mode for which mode check flag is set. + if (*is_single_pred && search_state->mode_checked[*this_mode][*ref_frame]) { + return true; + } + + // Skip GLOBALMV mode if check_globalmv flag is not enabled. + if (!check_globalmv && *this_mode == GLOBALMV) { + return true; + } + +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_start(&x->ms_stat_nonrd.timer1); + x->ms_stat_nonrd.num_searches[bsize][*this_mode]++; +#endif + mi->mode = *this_mode; + mi->ref_frame[0] = *ref_frame; + mi->ref_frame[1] = *ref_frame2; + + // Skip the mode if use reference frame mask flag is not set. 
+ if (!search_state->use_ref_frame_mask[*ref_frame]) return true;
+
+ // Skip mode for some modes and reference frames when
+ // force_zeromv_skip_for_blk flag is true.
+ if (x->force_zeromv_skip_for_blk &&
+ ((!(*this_mode == NEARESTMV &&
+ search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) &&
+ *this_mode != GLOBALMV) ||
+ *ref_frame != LAST_FRAME))
+ return true;
+
+ // Skip compound mode based on variance of previously evaluated single
+ // reference modes.
+ if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred &&
+ prune_compoundmode_with_singlemode_var(
+ *this_mode, *ref_frame, *ref_frame2, search_state->frame_mv,
+ search_state->mode_checked, search_state->vars,
+ search_state->uv_dist)) {
+ return true;
+ }
+
+ *force_mv_inter_layer = 0;
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ ((*ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
+ (*ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf) ||
+ (*ref_frame == ALTREF_FRAME && svc->skip_mvsearch_altref))) {
+ // Only test mode if NEARESTMV/NEARMV is (svc_mv.mv.col, svc_mv.mv.row),
+ // otherwise set NEWMV to (svc_mv.mv.col, svc_mv.mv.row).
+ // Skip newmv and filter search.
+ *force_mv_inter_layer = 1;
+ if (*this_mode == NEWMV) {
+ search_state->frame_mv[*this_mode][*ref_frame] = svc_mv;
+ } else if (search_state->frame_mv[*this_mode][*ref_frame].as_int !=
+ svc_mv.as_int) {
+ return true;
+ }
+ }
+
+ // If the segment reference frame feature is enabled then do nothing if the
+ // current ref frame is not allowed.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame))
+ return true;
+
+ // For screen content: skip mode testing based on source_sad.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ // If source_sad is computed: skip non-zero motion
+ // check for stationary (super)blocks. Otherwise if superblock
+ // has motion skip the modes with zero motion on last reference
+ // for flat blocks, and color is not set.
+ // For the latter condition: the same condition should apply
+ // to newmv if (0, 0), so this latter condition is repeated
+ // below after search_new_mv.
+ if (rt_sf->source_metrics_sb_nonrd) {
+ if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 &&
+ x->content_state_sb.source_sad_nonrd == kZeroSad) ||
+ (search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 &&
+ x->block_is_zero_sad == 0 && *ref_frame == LAST_FRAME &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0))
+ return true;
+ }
+ // Skip NEWMV search for flat blocks.
+ if (*this_mode == NEWMV && x->source_variance < 100) return true;
+ // Skip non-LAST for color on flat blocks.
+ if (*ref_frame > LAST_FRAME && x->source_variance == 0 &&
+ (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1))
+ return true;
+ }
+
+ // Skip mode based on block size, reference frame mode and other block
+ // properties.
+ if (skip_mode_by_bsize_and_ref_frame(
+ *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search,
+ sse_zeromv_norm, rt_sf->nonrd_aggressive_skip))
+ return true;
+
+ // Skip mode based on low temporal variance and source sad.
+ if (skip_mode_by_low_temp(*this_mode, *ref_frame, bsize, x->content_state_sb, + search_state->frame_mv[*this_mode][*ref_frame], + force_skip_low_temp_var)) + return true; + + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we + // end up unable to pick any mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Check for skipping GOLDEN and ALTREF based pred_mv_sad. + if (rt_sf->nonrd_prune_ref_frame_search > 0 && + x->pred_mv_sad[*ref_frame] != INT_MAX && *ref_frame != LAST_FRAME) { + if ((int64_t)(x->pred_mv_sad[*ref_frame]) > *thresh_sad_pred) return true; + } + } + + // Check for skipping NEARMV based on pred_mv_sad. + if (*this_mode == NEARMV && x->pred_mv1_sad[*ref_frame] != INT_MAX && + x->pred_mv1_sad[*ref_frame] > (x->pred_mv0_sad[*ref_frame] << 1)) + return true; + + // Skip single reference mode based on rd threshold. + if (*is_single_pred) { + if (skip_mode_by_threshold( + *this_mode, *ref_frame, + search_state->frame_mv[*this_mode][*ref_frame], + cpi->rc.frames_since_golden, cpi->rd.threshes[segment_id][bsize], + x->thresh_freq_fact[bsize], search_state->best_rdc.rdcost, + search_state->best_pickmode.best_mode_skip_txfm, + (rt_sf->nonrd_aggressive_skip ? 1 : 0))) + return true; + } + return false; +} + +// Function to perform inter mode evaluation for non-rd +static AOM_FORCE_INLINE bool handle_inter_mode_nonrd( + AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state, + PICK_MODE_CONTEXT *ctx, PRED_BUFFER **this_mode_pred, + PRED_BUFFER *tmp_buffer, InterPredParams inter_pred_params_sr, + int *best_early_term, unsigned int *sse_zeromv_norm, bool *check_globalmv, +#if CONFIG_AV1_TEMPORAL_DENOISING + int64_t *zero_last_cost_orig, int denoise_svc_pickmode, +#endif + int idx, int force_mv_inter_layer, int is_single_pred, int gf_temporal_ref, + int use_model_yrd_large, int filter_search_enabled_blk, BLOCK_SIZE bsize, + PREDICTION_MODE this_mode, InterpFilter filt_select, + int cb_pred_filter_search, int reuse_inter_pred, + int *sb_me_has_been_tested) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + const int bw = block_size_wide[bsize]; + const InterpFilter filter_ref = cm->features.interp_filter; + const InterpFilter default_interp_filter = EIGHTTAP_REGULAR; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + const ModeCosts *mode_costs = &x->mode_costs; + const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; + BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode; + + MV_REFERENCE_FRAME ref_frame = mi->ref_frame[0]; + MV_REFERENCE_FRAME ref_frame2 = mi->ref_frame[1]; + int_mv *const this_mv = &search_state->frame_mv[this_mode][ref_frame]; + unsigned int var = UINT_MAX; + int this_early_term = 0; + int rate_mv = 0; + int is_skippable; + int skip_this_mv = 0; + unsigned int var_threshold = UINT_MAX; + PREDICTION_MODE this_best_mode; + RD_STATS nonskip_rdc; + av1_invalid_rd_stats(&nonskip_rdc); + + if (x->sb_me_block && this_mode == NEWMV && ref_frame == LAST_FRAME) { + // Set the NEWMV_LAST to the sb MV. 
+ search_state->frame_mv[NEWMV][LAST_FRAME].as_int = x->sb_me_mv.as_int; + } else if (this_mode == NEWMV && !force_mv_inter_layer) { +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_start(&x->ms_stat_nonrd.timer2); +#endif + // Find the best motion vector for single/compound mode. + const bool skip_newmv = search_new_mv( + cpi, x, search_state->frame_mv, ref_frame, gf_temporal_ref, bsize, + mi_row, mi_col, &rate_mv, &search_state->best_rdc); +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); + x->ms_stat_nonrd.ms_time[bsize][this_mode] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); +#endif + // Skip NEWMV mode, + // (i). For bsize smaller than 16X16 + // (ii). Based on sad of the predicted mv w.r.t LAST_FRAME + // (iii). When motion vector is same as that of reference mv + if (skip_newmv) { + return true; + } + } + + // Check the current motion vector is same as that of previously evaluated + // motion vectors. + for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; + inter_mv_mode++) { + if (inter_mv_mode == this_mode) continue; + if (is_single_pred && + search_state->mode_checked[inter_mv_mode][ref_frame] && + this_mv->as_int == + search_state->frame_mv[inter_mv_mode][ref_frame].as_int) { + skip_this_mv = 1; + break; + } + } + + // Skip single mode if current motion vector is same that of previously + // evaluated motion vectors. + if (skip_this_mv && is_single_pred) return true; + + // For screen: for spatially flat blocks with non-zero motion, + // skip newmv if the motion vector is (0, 0)-LAST, and color is not set. + if (this_mode == NEWMV && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + cpi->svc.spatial_layer_id == 0 && rt_sf->source_metrics_sb_nonrd) { + if (this_mv->as_int == 0 && ref_frame == LAST_FRAME && + x->block_is_zero_sad == 0 && + ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && + x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) || + cpi->rc.high_source_sad) && + x->source_variance == 0) + return true; + } + + mi->mode = this_mode; + mi->mv[0].as_int = this_mv->as_int; + mi->mv[1].as_int = 0; + if (!is_single_pred) + mi->mv[1].as_int = search_state->frame_mv[this_mode][ref_frame2].as_int; + + // Set buffers to store predicted samples for reuse + if (reuse_inter_pred) { + if (!*this_mode_pred) { + *this_mode_pred = &tmp_buffer[3]; + } else { + *this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = bw; + } + } + + mi->motion_mode = SIMPLE_TRANSLATION; +#if !CONFIG_REALTIME_ONLY + if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) { + calc_num_proj_ref(cpi, x, mi); + } +#endif + // set variance threshold for compound mode pruning + if (rt_sf->prune_compoundmode_with_singlecompound_var && !is_single_pred && + use_model_yrd_large) { + const PREDICTION_MODE single_mode0 = compound_ref0_mode(this_mode); + const PREDICTION_MODE single_mode1 = compound_ref1_mode(this_mode); + var_threshold = + AOMMIN(var_threshold, + search_state->vars[INTER_OFFSET(single_mode0)][ref_frame]); + var_threshold = + AOMMIN(var_threshold, + search_state->vars[INTER_OFFSET(single_mode1)][ref_frame2]); + } + + // decide interpolation filter, build prediction signal, get sse + const bool is_mv_subpel = + (mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07); + const bool enable_filt_search_this_mode = + (filter_search_enabled_blk == 2) + ? 
true + : (filter_search_enabled_blk && !force_mv_inter_layer && + is_single_pred && + (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)); + if (is_mv_subpel && enable_filt_search_this_mode) { +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_start(&x->ms_stat_nonrd.timer2); +#endif + search_filter_ref( + cpi, x, &search_state->this_rdc, &inter_pred_params_sr, mi_row, mi_col, + tmp_buffer, bsize, reuse_inter_pred, this_mode_pred, &this_early_term, + &var, use_model_yrd_large, best_pickmode->best_sse, is_single_pred); +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); + x->ms_stat_nonrd.ifs_time[bsize][this_mode] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); +#endif +#if !CONFIG_REALTIME_ONLY + } else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion && + this_mode == NEWMV) { + // Find the best motion mode when current mode is NEWMV + search_motion_mode(cpi, x, &search_state->this_rdc, mi_row, mi_col, bsize, + &this_early_term, use_model_yrd_large, &rate_mv, + best_pickmode->best_sse); + if (this_mode == NEWMV) { + this_mv[0] = mi->mv[0]; + } +#endif + } else { + mi->interp_filters = + (filter_ref == SWITCHABLE) + ? av1_broadcast_interp_filter(default_interp_filter) + : av1_broadcast_interp_filter(filter_ref); + if (force_mv_inter_layer) + mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + // If it is sub-pel motion and cb_pred_filter_search is enabled, select + // the pre-decided filter + if (is_mv_subpel && cb_pred_filter_search) + mi->interp_filters = av1_broadcast_interp_filter(filt_select); + +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_start(&x->ms_stat_nonrd.timer2); +#endif + if (is_single_pred) { + SubpelParams subpel_params; + // Initialize inter mode level params for single reference mode. + init_inter_mode_params(&mi->mv[0].as_mv, &inter_pred_params_sr, + &subpel_params, xd->block_ref_scale_factors[0], + pd->pre->width, pd->pre->height); + av1_enc_build_inter_predictor_y_nonrd(xd, &inter_pred_params_sr, + &subpel_params); + } else { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + + if (use_model_yrd_large) { + model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, + &search_state->this_rdc, &this_early_term, 0, + best_pickmode->best_sse, &var, var_threshold); + } else { + model_rd_for_sb_y(cpi, bsize, x, xd, &search_state->this_rdc, &var, 0, + &this_early_term); + } +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); + x->ms_stat_nonrd.model_rd_time[bsize][this_mode] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); +#endif + } + + // update variance for single mode + if (is_single_pred) { + search_state->vars[INTER_OFFSET(this_mode)][ref_frame] = var; + if (this_mv->as_int == 0) { + search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var; + } + } + // prune compound mode based on single mode var threshold + if (!is_single_pred && var > var_threshold) { + if (reuse_inter_pred) free_pred_buffer(*this_mode_pred); + return true; + } + + if (ref_frame == LAST_FRAME && this_mv->as_int == 0) { + *sse_zeromv_norm = (unsigned int)(search_state->this_rdc.sse >> + (b_width_log2_lookup[bsize] + + b_height_log2_lookup[bsize])); + } + + // Perform early termination based on sse. 
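+ // The model-estimated SSE of this mode is compared against the best SSE
+ // seen so far; a mode that is already clearly worse is dropped here, before
+ // any transform/RD computation is spent on it.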
+ if (rt_sf->sse_early_term_inter_search && + early_term_inter_search_with_sse(rt_sf->sse_early_term_inter_search, + bsize, search_state->this_rdc.sse, + best_pickmode->best_sse, this_mode)) { + if (reuse_inter_pred) free_pred_buffer(*this_mode_pred); + return true; + } + +#if COLLECT_NONRD_PICK_MODE_STAT + x->ms_stat_nonrd.num_nonskipped_searches[bsize][this_mode]++; +#endif + + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1]; + const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0]; + const int64_t sse_y = search_state->this_rdc.sse; + + if (this_early_term) { + search_state->this_rdc.skip_txfm = 1; + search_state->this_rdc.rate = skip_txfm_cost; + search_state->this_rdc.dist = search_state->this_rdc.sse << 4; + } else { +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_start(&x->ms_stat_nonrd.timer2); +#endif + // Calculates RD Cost using Hadamard transform. + av1_block_yrd(x, &search_state->this_rdc, &is_skippable, bsize, + mi->tx_size); + if (search_state->this_rdc.skip_txfm || + RDCOST(x->rdmult, search_state->this_rdc.rate, + search_state->this_rdc.dist) >= + RDCOST(x->rdmult, 0, search_state->this_rdc.sse)) { + if (!search_state->this_rdc.skip_txfm) { + // Need to store "real" rdc for possible future use if UV rdc + // disallows tx skip + nonskip_rdc = search_state->this_rdc; + nonskip_rdc.rate += no_skip_txfm_cost; + } + search_state->this_rdc.rate = skip_txfm_cost; + search_state->this_rdc.skip_txfm = 1; + search_state->this_rdc.dist = search_state->this_rdc.sse; + } else { + search_state->this_rdc.rate += no_skip_txfm_cost; + } + + // Populate predicted sample for chroma planes based on color sensitivity. + if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) { + RD_STATS rdc_uv; + const BLOCK_SIZE uv_bsize = + get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_U, AOM_PLANE_U); + } + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_V, AOM_PLANE_V); + } + // Compute sse for chroma planes. 
+ const int64_t sse_uv = av1_model_rd_for_sb_uv( + cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, AOM_PLANE_V); + if (rdc_uv.dist < x->min_dist_inter_uv) + x->min_dist_inter_uv = rdc_uv.dist; + search_state->this_rdc.sse += sse_uv; + // Restore Y rdc if UV rdc disallows txfm skip + if (search_state->this_rdc.skip_txfm && !rdc_uv.skip_txfm && + nonskip_rdc.rate != INT_MAX) + search_state->this_rdc = nonskip_rdc; + if (is_single_pred) { + search_state->uv_dist[INTER_OFFSET(this_mode)][ref_frame] = rdc_uv.dist; + } + search_state->this_rdc.rate += rdc_uv.rate; + search_state->this_rdc.dist += rdc_uv.dist; + search_state->this_rdc.skip_txfm = + search_state->this_rdc.skip_txfm && rdc_uv.skip_txfm; + } +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer2); + x->ms_stat_nonrd.txfm_time[bsize][this_mode] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2); +#endif + } + + this_best_mode = this_mode; + // TODO(kyslov) account for UV prediction cost + search_state->this_rdc.rate += rate_mv; + if (!is_single_pred) { + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame); + search_state->this_rdc.rate += cost_mv_ref(mode_costs, this_mode, mode_ctx); + } else { + // If the current mode has zeromv but is not GLOBALMV, compare the rate + // cost. If GLOBALMV is cheaper, use GLOBALMV instead. + if (this_mode != GLOBALMV && + this_mv->as_int == search_state->frame_mv[GLOBALMV][ref_frame].as_int) { + if (is_globalmv_better(this_mode, ref_frame, rate_mv, mode_costs, + search_state->single_inter_mode_costs, mbmi_ext)) { + this_best_mode = GLOBALMV; + } + } + + search_state->this_rdc.rate += + search_state + ->single_inter_mode_costs[INTER_OFFSET(this_best_mode)][ref_frame]; + } + + if (is_single_pred && this_mv->as_int == 0 && var < UINT_MAX) { + search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var; + } + + search_state->this_rdc.rate += search_state->ref_costs_single[ref_frame]; + + search_state->this_rdc.rdcost = RDCOST(x->rdmult, search_state->this_rdc.rate, + search_state->this_rdc.dist); + if (cpi->oxcf.rc_cfg.mode == AOM_CBR && is_single_pred) { + newmv_diff_bias(xd, this_best_mode, &search_state->this_rdc, bsize, + search_state->frame_mv[this_best_mode][ref_frame].as_mv.row, + search_state->frame_mv[this_best_mode][ref_frame].as_mv.col, + cpi->speed, x->source_variance, x->content_state_sb); + } + +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode && + cpi->denoiser.denoising_level > kDenLowLow) { + av1_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx); + // Keep track of zero_last cost. + if (ref_frame == LAST_FRAME && this_mv->as_int == 0) + *zero_last_cost_orig = search_state->this_rdc.rdcost; + } +#else + (void)(sse_y); +#endif + + search_state->mode_checked[this_mode][ref_frame] = 1; + search_state->mode_checked[this_best_mode][ref_frame] = 1; + + if (*check_globalmv) { + int32_t abs_mv = + abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.row) + + abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.col); + // Early exit check: if the magnitude of this_best_mode's mv is small + // enough, we skip GLOBALMV check in the next loop iteration. 
+ if (abs_mv < 2) { + *check_globalmv = false; + } + } +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer1); + x->ms_stat_nonrd.nonskipped_search_times[bsize][this_mode] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1); +#endif + + if (x->sb_me_block && ref_frame == LAST_FRAME && + search_state->frame_mv[this_best_mode][ref_frame].as_int == + x->sb_me_mv.as_int) + *sb_me_has_been_tested = 1; + + // Copy best mode params to search state + if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) { + search_state->best_rdc = search_state->this_rdc; + *best_early_term = this_early_term; + update_search_state_nonrd(search_state, mi, txfm_info, &nonskip_rdc, ctx, + this_best_mode, sse_y); + + // This is needed for the compound modes. + search_state->frame_mv_best[this_best_mode][ref_frame].as_int = + search_state->frame_mv[this_best_mode][ref_frame].as_int; + if (ref_frame2 > NONE_FRAME) { + search_state->frame_mv_best[this_best_mode][ref_frame2].as_int = + search_state->frame_mv[this_best_mode][ref_frame2].as_int; + } + + if (reuse_inter_pred) { + free_pred_buffer(best_pickmode->best_pred); + best_pickmode->best_pred = *this_mode_pred; + } + } else { + if (reuse_inter_pred) free_pred_buffer(*this_mode_pred); + } + + if (*best_early_term && (idx > 0 || rt_sf->nonrd_aggressive_skip)) { + txfm_info->skip_txfm = 1; + if (!x->sb_me_block || *sb_me_has_been_tested) return false; + } + return true; +} + +// Function to perform screen content mode evaluation for non-rd +static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd( + AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state, + PRED_BUFFER *this_mode_pred, PICK_MODE_CONTEXT *ctx, + PRED_BUFFER *tmp_buffer, struct buf_2d *orig_dst, int skip_idtx_palette, + int try_palette, BLOCK_SIZE bsize, int reuse_inter_pred, int mi_col, + int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode; + + // TODO(marpan): Only allow for 8 bit-depth for now, re-enable for 10/12 bit + // when issue 3359 is fixed. + if (cm->seq_params->bit_depth == 8 && + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette && + !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk && + is_inter_mode(best_pickmode->best_mode) && + best_pickmode->best_pred != NULL && + (!rt_sf->prune_idtx_nonrd || + (rt_sf->prune_idtx_nonrd && bsize <= BLOCK_32X32 && + best_pickmode->best_mode_skip_txfm != 1 && x->source_variance > 200))) { + RD_STATS idtx_rdc; + av1_init_rd_stats(&idtx_rdc); + int is_skippable; + this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; + pd->dst.buf = this_mode_pred->data; + pd->dst.stride = bw; + const PRED_BUFFER *const best_pred = best_pickmode->best_pred; + av1_block_yrd_idtx(x, best_pred->data, best_pred->stride, &idtx_rdc, + &is_skippable, bsize, mi->tx_size); + int64_t idx_rdcost_y = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist); + int allow_idtx = 1; + // Incorporate color into rd cost. 
+ if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) { + RD_STATS rdc_uv; + const BLOCK_SIZE uv_bsize = + get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_U, AOM_PLANE_U); + } + if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_V, AOM_PLANE_V); + } + av1_model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, + AOM_PLANE_V); + if (rdc_uv.dist < x->min_dist_inter_uv) + x->min_dist_inter_uv = rdc_uv.dist; + idtx_rdc.rate += rdc_uv.rate; + idtx_rdc.dist += rdc_uv.dist; + idtx_rdc.skip_txfm = idtx_rdc.skip_txfm && rdc_uv.skip_txfm; + if (idx_rdcost_y == 0 && rdc_uv.dist > 0 && x->source_variance < 3000 && + x->content_state_sb.source_sad_nonrd > kMedSad) + allow_idtx = 0; + } + int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist); + if (allow_idtx && idx_rdcost < search_state->best_rdc.rdcost) { + best_pickmode->tx_type = IDTX; + search_state->best_rdc.rdcost = idx_rdcost; + best_pickmode->best_mode_skip_txfm = idtx_rdc.skip_txfm; + if (!idtx_rdc.skip_txfm) { + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + } + xd->tx_type_map[0] = best_pickmode->tx_type; + memset(ctx->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk); + memset(xd->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk); + } + pd->dst = *orig_dst; + } + + if (!try_palette) return; + const unsigned int intra_ref_frame_cost = + search_state->ref_costs_single[INTRA_FRAME]; + + if (!is_mode_intra(best_pickmode->best_mode)) { + PRED_BUFFER *const best_pred = best_pickmode->best_pred; + if (reuse_inter_pred && best_pred != NULL) { + if (best_pred->data == orig_dst->buf) { + this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)]; + aom_convolve_copy(best_pred->data, best_pred->stride, + this_mode_pred->data, this_mode_pred->stride, bw, bh); + best_pickmode->best_pred = this_mode_pred; + } + } + pd->dst = *orig_dst; + } + // Search palette mode for Luma plane in inter frame. + av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx, + &search_state->this_rdc, + search_state->best_rdc.rdcost); + // Update best mode data in search_state + if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) { + best_pickmode->pmi = mi->palette_mode_info; + best_pickmode->best_mode = DC_PRED; + mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; + best_pickmode->best_ref_frame = INTRA_FRAME; + best_pickmode->best_second_ref_frame = NONE; + search_state->best_rdc.rate = search_state->this_rdc.rate; + search_state->best_rdc.dist = search_state->this_rdc.dist; + search_state->best_rdc.rdcost = search_state->this_rdc.rdcost; + best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm; + // Keep the skip_txfm off if the color_sensitivity is set. 
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])
+ search_state->this_rdc.skip_txfm = 0;
+ if (!search_state->this_rdc.skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ if (xd->tx_type_map[0] != DCT_DCT)
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+}
+
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This function will loop over a subset of inter modes and select the best
+ * one based on the calculated modelled RD cost. While deciding which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Single reference frame modes are checked first;
+ * a small set of compound modes may also be evaluated for larger blocks.
+ * Additional heuristics are applied to decide if intra modes
+ * need to be checked.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding
+ * \param[in] x Pointer to structure holding all the data for
+ * the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ * during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */ +void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { + AV1_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + MV_REFERENCE_FRAME ref_frame, ref_frame2; + const unsigned char segment_id = mi->segment_id; + int best_early_term = 0; + int force_skip_low_temp_var = 0; + unsigned int sse_zeromv_norm = UINT_MAX; + const int num_inter_modes = NUM_INTER_MODES; + const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; + bool check_globalmv = rt_sf->check_globalmv_on_single_ref; + PRED_BUFFER tmp_buffer[4]; + DECLARE_ALIGNED(16, uint8_t, pred_buf[MAX_MB_PLANE * MAX_SB_SQUARE]); + PRED_BUFFER *this_mode_pred = NULL; + const int reuse_inter_pred = + rt_sf->reuse_inter_pred_nonrd && cm->seq_params->bit_depth == AOM_BITS_8; + InterModeSearchStateNonrd search_state; + av1_zero(search_state.use_ref_frame_mask); + av1_zero(search_state.use_scaled_ref_frame); + BEST_PICKMODE *const best_pickmode = &search_state.best_pickmode; + (void)tile_data; + + const int bh = block_size_high[bsize]; + const int bw = block_size_wide[bsize]; + const int pixels_in_block = bh * bw; + struct buf_2d orig_dst = pd->dst; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; +#if COLLECT_NONRD_PICK_MODE_STAT + // Mode statistics can be collected only when num_workers is 1 + assert(cpi->mt_info.num_workers <= 1); + aom_usec_timer_start(&x->ms_stat_nonrd.bsize_timer); +#endif + int64_t thresh_sad_pred = INT64_MAX; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int_mv svc_mv = { .as_int = 0 }; + int force_mv_inter_layer = 0; + bool comp_use_zero_zeromv_only = 0; + int tot_num_comp_modes = NUM_COMP_INTER_MODES_RT; +#if CONFIG_AV1_TEMPORAL_DENOISING + const int denoise_recheck_zeromv = 1; + AV1_PICKMODE_CTX_DEN ctx_den; + int64_t zero_last_cost_orig = INT64_MAX; + int denoise_svc_pickmode = 1; + const int resize_pending = is_frame_resize_pending(cpi); +#endif + const ModeCosts *mode_costs = &x->mode_costs; + struct scale_factors sf_no_scale; + av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height, + cm->width, cm->height); + if (reuse_inter_pred) { + for (int buf_idx = 0; buf_idx < 3; buf_idx++) { + tmp_buffer[buf_idx].data = &pred_buf[pixels_in_block * buf_idx]; + tmp_buffer[buf_idx].stride = bw; + tmp_buffer[buf_idx].in_use = 0; + } + tmp_buffer[3].data = pd->dst.buf; + tmp_buffer[3].stride = pd->dst.stride; + tmp_buffer[3].in_use = 0; + } + + const int gf_temporal_ref = is_same_gf_and_last_scale(cm); + + // If the lower spatial layer uses an averaging filter for downsampling + // (phase = 8), the target decimated pixel is shifted by (1/2, 1/2) relative + // to source, so use subpel motion vector to compensate. The nonzero motion + // is half pixel shifted to left and top, so (-4, -4). This has more effect + // on higher resolutions, so condition it on that for now. + // Exclude quality layers, which have the same resolution and hence no shift. 
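+ // Motion vectors are in 1/8-pel units, so (-4, -4) is a half-pel shift up
+ // and to the left.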
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && + !svc->has_lower_quality_layer && + svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 && + cm->width * cm->height > 640 * 480) { + svc_mv.as_mv.row = -4; + svc_mv.as_mv.col = -4; + } + + // Setup parameters used for inter mode evaluation. + set_params_nonrd_pick_inter_mode(cpi, x, &search_state, rd_cost, + &force_skip_low_temp_var, mi_row, mi_col, + gf_temporal_ref, segment_id, bsize +#if CONFIG_AV1_TEMPORAL_DENOISING + , + ctx, denoise_svc_pickmode +#endif + ); + + if (rt_sf->use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) { + // Only search compound if bsize \gt BLOCK_16X16. + if (bsize > BLOCK_16X16) { + comp_use_zero_zeromv_only = rt_sf->check_only_zero_zeromv_on_large_blocks; + } else { + tot_num_comp_modes = 0; + } + } else { + tot_num_comp_modes = 0; + } + + if (x->pred_mv_sad[LAST_FRAME] != INT_MAX) { + thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1; + // Increase threshold for less aggressive pruning. + if (rt_sf->nonrd_prune_ref_frame_search == 1) + thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2); + } + + const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize); + + // decide block-level interp filter search flags: + // filter_search_enabled_blk: + // 0: disabled + // 1: filter search depends on mode properties + // 2: filter search forced since prediction is unreliable + // cb_pred_filter_search 0: disabled cb prediction + InterpFilter filt_select = EIGHTTAP_REGULAR; + const int cb_pred_filter_search = + x->content_state_sb.source_sad_nonrd > kVeryLowSad + ? cpi->sf.interp_sf.cb_pred_filter_search + : 0; + const int filter_search_enabled_blk = + is_filter_search_enabled_blk(cpi, x, mi_row, mi_col, bsize, segment_id, + cb_pred_filter_search, &filt_select); + +#if COLLECT_NONRD_PICK_MODE_STAT + x->ms_stat_nonrd.num_blocks[bsize]++; +#endif + init_mbmi_nonrd(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm); + mi->tx_size = AOMMIN( + AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]), + TX_16X16); + + fill_single_inter_mode_costs(search_state.single_inter_mode_costs, + num_inter_modes, ref_mode_set, mode_costs, + mbmi_ext->mode_context); + + MV_REFERENCE_FRAME last_comp_ref_frame = NONE_FRAME; + + // Initialize inter prediction params at block level for single reference + // mode. + InterPredParams inter_pred_params_sr; + init_inter_block_params(&inter_pred_params_sr, pd->width, pd->height, + mi_row * MI_SIZE, mi_col * MI_SIZE, pd->subsampling_x, + pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), + /*is_intrabc=*/0); + inter_pred_params_sr.conv_params = + get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd); + + x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad; + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + !x->force_zeromv_skip_for_blk && + x->content_state_sb.source_sad_nonrd != kZeroSad && + x->source_variance == 0 && bsize < cm->seq_params->sb_size && + search_state.yv12_mb[LAST_FRAME][0].width == cm->width && + search_state.yv12_mb[LAST_FRAME][0].height == cm->height) { + set_block_source_sad(cpi, x, bsize, &search_state.yv12_mb[LAST_FRAME][0]); + } + + int sb_me_has_been_tested = 0; + x->sb_me_block = x->sb_me_partition; + // Only use this feature (force testing of superblock motion) if coding + // block size is large. 
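+ // "Large" here means at least a quarter of the superblock area: 64X64 for
+ // 128X128 superblocks and 32X32 for 64X64 superblocks (see the checks
+ // below).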
+ if (x->sb_me_block) { + if (cm->seq_params->sb_size == BLOCK_128X128 && bsize < BLOCK_64X64) + x->sb_me_block = 0; + else if (cm->seq_params->sb_size == BLOCK_64X64 && bsize < BLOCK_32X32) + x->sb_me_block = 0; + } + + x->min_dist_inter_uv = INT64_MAX; + for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) { + // If we are at the first compound mode, and the single modes already + // perform well, then end the search. + if (rt_sf->skip_compound_based_on_var && idx == num_inter_modes && + skip_comp_based_on_var(search_state.vars, bsize)) { + break; + } + + int is_single_pred = 1; + PREDICTION_MODE this_mode; + + if (idx == 0 && !x->force_zeromv_skip_for_blk) { + // Set color sensitivity on first tested mode only. + // Use y-sad already computed in find_predictors: take the sad with motion + // vector closest to 0; the uv-sad computed below in set_color_sensitivity + // is for zeromv. + // For screen: first check if golden reference is being used, if so, + // force color_sensitivity on (=1) if the color sensitivity for sb_g is 1. + // The check in set_color_sensitivity() will then follow and check for + // setting the flag if the level is still 2 or 0. + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + search_state.use_ref_frame_mask[GOLDEN_FRAME]) { + if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1) + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1; + if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1) + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1; + } + if (search_state.use_ref_frame_mask[LAST_FRAME] && + x->pred_mv0_sad[LAST_FRAME] != INT_MAX) { + int y_sad = x->pred_mv0_sad[LAST_FRAME]; + if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX && + (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) + + abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) < + (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) + + abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row))) + y_sad = x->pred_mv1_sad[LAST_FRAME]; + set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance, + search_state.yv12_mb[LAST_FRAME]); + } + } + + // Check the inter mode can be skipped based on mode statistics and speed + // features settings. + if (skip_inter_mode_nonrd(cpi, x, &search_state, &thresh_sad_pred, + &force_mv_inter_layer, &is_single_pred, + &this_mode, &last_comp_ref_frame, &ref_frame, + &ref_frame2, idx, svc_mv, force_skip_low_temp_var, + sse_zeromv_norm, num_inter_modes, segment_id, + bsize, comp_use_zero_zeromv_only, check_globalmv)) + continue; + + // Select prediction reference frames. + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + xd->plane[plane].pre[0] = search_state.yv12_mb[ref_frame][plane]; + if (!is_single_pred) + xd->plane[plane].pre[1] = search_state.yv12_mb[ref_frame2][plane]; + } + + mi->ref_frame[0] = ref_frame; + mi->ref_frame[1] = ref_frame2; + set_ref_ptrs(cm, xd, ref_frame, ref_frame2); + + // Check if the scaled reference frame should be used. This is set in the + // find_predictors() for each usable reference. If so, set the + // block_ref_scale_factors[] to no reference scaling. 
+ if (search_state.use_scaled_ref_frame[ref_frame]) { + xd->block_ref_scale_factors[0] = &sf_no_scale; + } + if (!is_single_pred && search_state.use_scaled_ref_frame[ref_frame2]) { + xd->block_ref_scale_factors[1] = &sf_no_scale; + } + + // Perform inter mode evaluation for non-rd + if (!handle_inter_mode_nonrd( + cpi, x, &search_state, ctx, &this_mode_pred, tmp_buffer, + inter_pred_params_sr, &best_early_term, &sse_zeromv_norm, + &check_globalmv, +#if CONFIG_AV1_TEMPORAL_DENOISING + &zero_last_cost_orig, denoise_svc_pickmode, +#endif + idx, force_mv_inter_layer, is_single_pred, gf_temporal_ref, + use_model_yrd_large, filter_search_enabled_blk, bsize, this_mode, + filt_select, cb_pred_filter_search, reuse_inter_pred, + &sb_me_has_been_tested)) { + break; + } + } + + // Restore mode data of best inter mode + mi->mode = best_pickmode->best_mode; + mi->motion_mode = best_pickmode->best_motion_mode; + mi->wm_params = best_pickmode->wm_params; + mi->num_proj_ref = best_pickmode->num_proj_ref; + mi->interp_filters = best_pickmode->best_pred_filter; + mi->tx_size = best_pickmode->best_tx_size; + memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size)); + mi->ref_frame[0] = best_pickmode->best_ref_frame; + mi->mv[0].as_int = search_state + .frame_mv_best[best_pickmode->best_mode] + [best_pickmode->best_ref_frame] + .as_int; + mi->mv[1].as_int = 0; + if (best_pickmode->best_second_ref_frame > INTRA_FRAME) { + mi->ref_frame[1] = best_pickmode->best_second_ref_frame; + mi->mv[1].as_int = search_state + .frame_mv_best[best_pickmode->best_mode] + [best_pickmode->best_second_ref_frame] + .as_int; + } + // Perform intra prediction search, if the best SAD is above a certain + // threshold. + mi->angle_delta[PLANE_TYPE_Y] = 0; + mi->angle_delta[PLANE_TYPE_UV] = 0; + mi->filter_intra_mode_info.use_filter_intra = 0; + +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_start(&x->ms_stat_nonrd.timer1); + x->ms_stat_nonrd.num_searches[bsize][DC_PRED]++; + x->ms_stat_nonrd.num_nonskipped_searches[bsize][DC_PRED]++; +#endif + + int force_palette_test = 0; + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + x->content_state_sb.source_sad_nonrd != kZeroSad && + bsize <= BLOCK_16X16) { + unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 200000; + unsigned int thresh_source_var = cpi->rc.high_source_sad ? 
50 : 200; + unsigned int best_sse_inter_motion = + (unsigned int)(search_state.best_rdc.sse >> + (b_width_log2_lookup[bsize] + + b_height_log2_lookup[bsize])); + if (best_sse_inter_motion > thresh_sse && + x->source_variance > thresh_source_var) + force_palette_test = 1; + } + + // Evaluate Intra modes in inter frame + if (!x->force_zeromv_skip_for_blk) + av1_estimate_intra_mode(cpi, x, bsize, best_early_term, + search_state.ref_costs_single[INTRA_FRAME], + reuse_inter_pred, &orig_dst, tmp_buffer, + &this_mode_pred, &search_state.best_rdc, + best_pickmode, ctx); + + int skip_idtx_palette = (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) && + x->content_state_sb.source_sad_nonrd != kZeroSad && + !cpi->rc.high_source_sad; + + int try_palette = + !skip_idtx_palette && cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mi->bsize); + try_palette = + try_palette && + (is_mode_intra(best_pickmode->best_mode) || force_palette_test) && + x->source_variance > 0 && !x->force_zeromv_skip_for_blk && + (cpi->rc.high_source_sad || x->source_variance > 300); + + if (rt_sf->prune_palette_nonrd && bsize > BLOCK_16X16) try_palette = 0; + + // Perform screen content mode evaluation for non-rd + handle_screen_content_mode_nonrd( + cpi, x, &search_state, this_mode_pred, ctx, tmp_buffer, &orig_dst, + skip_idtx_palette, try_palette, bsize, reuse_inter_pred, mi_col, mi_row); + +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.timer1); + x->ms_stat_nonrd.nonskipped_search_times[bsize][DC_PRED] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1); +#endif + + pd->dst = orig_dst; + // Best mode is finalized. Restore the mode data to mbmi + if (try_palette) mi->palette_mode_info = best_pickmode->pmi; + mi->mode = best_pickmode->best_mode; + mi->ref_frame[0] = best_pickmode->best_ref_frame; + mi->ref_frame[1] = best_pickmode->best_second_ref_frame; + // For lossless: always force the skip flags off. + if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { + txfm_info->skip_txfm = 0; + memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk); + } else { + txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm; + } + if (has_second_ref(mi)) { + mi->comp_group_idx = 0; + mi->compound_idx = 1; + mi->interinter_comp.type = COMPOUND_AVERAGE; + } + + if (!is_inter_block(mi)) { + mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS); + } else { + // If inter mode is selected and ref_frame was one that uses the + // scaled reference frame, then we can't use reuse_inter_pred. 
+ if (search_state.use_scaled_ref_frame[best_pickmode->best_ref_frame] || + (has_second_ref(mi) && + search_state + .use_scaled_ref_frame[best_pickmode->best_second_ref_frame])) + x->reuse_inter_pred = 0; + } + + // Restore the predicted samples of best mode to final buffer + if (reuse_inter_pred && best_pickmode->best_pred != NULL) { + PRED_BUFFER *const best_pred = best_pickmode->best_pred; + if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { + aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } + } + +#if CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && resize_pending == 0 && + denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow && + cpi->denoiser.reset == 0) { + AV1_DENOISER_DECISION decision = COPY_BLOCK; + ctx->sb_skip_denoising = 0; + av1_pickmode_ctx_den_update( + &ctx_den, zero_last_cost_orig, search_state.ref_costs_single, + search_state.frame_mv, reuse_inter_pred, best_pickmode); + av1_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision, + gf_temporal_ref); + if (denoise_recheck_zeromv) + recheck_zeromv_after_denoising( + cpi, mi, x, xd, decision, &ctx_den, search_state.yv12_mb, + &search_state.best_rdc, best_pickmode, bsize, mi_row, mi_col); + best_pickmode->best_ref_frame = ctx_den.best_ref_frame; + } +#endif + + // Update the factors used for RD thresholding for all modes. + if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) { + THR_MODES best_mode_idx = + mode_idx[best_pickmode->best_ref_frame][mode_offset(mi->mode)]; + if (best_pickmode->best_ref_frame == INTRA_FRAME) { + // Only consider the modes that are included in the intra_mode_list. + int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE); + for (int mode_index = 0; mode_index < intra_modes; mode_index++) { + update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx, + intra_mode_list[mode_index]); + } + } else { + PREDICTION_MODE this_mode; + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { + update_thresh_freq_fact(cpi, x, bsize, best_pickmode->best_ref_frame, + best_mode_idx, this_mode); + } + } + } + +#if CONFIG_INTERNAL_STATS + store_coding_context_nonrd(x, ctx, mi->mode); +#else + store_coding_context_nonrd(x, ctx); +#endif // CONFIG_INTERNAL_STATS + +#if COLLECT_NONRD_PICK_MODE_STAT + aom_usec_timer_mark(&x->ms_stat_nonrd.bsize_timer); + x->ms_stat_nonrd.total_block_times[bsize] += + aom_usec_timer_elapsed(&x->ms_stat_nonrd.bsize_timer); + print_time(&x->ms_stat_nonrd, bsize, cm->mi_params.mi_rows, + cm->mi_params.mi_cols, mi_row, mi_col); +#endif // COLLECT_NONRD_PICK_MODE_STAT + + *rd_cost = search_state.best_rdc; + + // Reset the xd->block_ref_scale_factors[i], as they may have + // been set to pointer &sf_no_scale, which becomes invalid afer + // this function. + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); +} diff --git a/third_party/aom/av1/encoder/optical_flow.c b/third_party/aom/av1/encoder/optical_flow.c new file mode 100644 index 0000000000..dc168e7aee --- /dev/null +++ b/third_party/aom/av1/encoder/optical_flow.c @@ -0,0 +1,1113 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/mathutils.h" +#include "aom_mem/aom_mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/optical_flow.h" +#include "av1/encoder/sparse_linear_solver.h" +#include "av1/encoder/reconinter_enc.h" + +#if CONFIG_OPTICAL_FLOW_API + +void av1_init_opfl_params(OPFL_PARAMS *opfl_params) { + opfl_params->pyramid_levels = OPFL_PYRAMID_LEVELS; + opfl_params->warping_steps = OPFL_WARPING_STEPS; + opfl_params->lk_params = NULL; +} + +void av1_init_lk_params(LK_PARAMS *lk_params) { + lk_params->window_size = OPFL_WINDOW_SIZE; +} + +// Helper function to determine whether a frame is encoded with high bit-depth. +static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) { + return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; +} + +// Helper function to determine whether optical flow method is sparse. +static INLINE int is_sparse(const OPFL_PARAMS *opfl_params) { + return (opfl_params->flags & OPFL_FLAG_SPARSE) ? 1 : 0; +} + +static void gradients_over_window(const YV12_BUFFER_CONFIG *frame, + const YV12_BUFFER_CONFIG *ref_frame, + const double x_coord, const double y_coord, + const int window_size, const int bit_depth, + double *ix, double *iy, double *it, + LOCALMV *mv); + +// coefficients for bilinear interpolation on unit square +static int pixel_interp(const double x, const double y, const double b00, + const double b01, const double b10, const double b11) { + const int xint = (int)x; + const int yint = (int)y; + const double xdec = x - xint; + const double ydec = y - yint; + const double a = (1 - xdec) * (1 - ydec); + const double b = xdec * (1 - ydec); + const double c = (1 - xdec) * ydec; + const double d = xdec * ydec; + // if x, y are already integers, this results to b00 + int interp = (int)round(a * b00 + b * b01 + c * b10 + d * b11); + return interp; +} + +// Scharr filter to compute spatial gradient +static void spatial_gradient(const YV12_BUFFER_CONFIG *frame, const int x_coord, + const int y_coord, const int direction, + double *derivative) { + double *filter; + // Scharr filters + double gx[9] = { -3, 0, 3, -10, 0, 10, -3, 0, 3 }; + double gy[9] = { -3, -10, -3, 0, 0, 0, 3, 10, 3 }; + if (direction == 0) { // x direction + filter = gx; + } else { // y direction + filter = gy; + } + int idx = 0; + double d = 0; + for (int yy = -1; yy <= 1; yy++) { + for (int xx = -1; xx <= 1; xx++) { + d += filter[idx] * + frame->y_buffer[(y_coord + yy) * frame->y_stride + (x_coord + xx)]; + idx++; + } + } + // normalization scaling factor for scharr + *derivative = d / 32.0; +} + +// Determine the spatial gradient at subpixel locations +// For example, when reducing images for pyramidal LK, +// corners found in original image may be at subpixel locations. 
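Editor's aside on the bilinear weights used by pixel_interp() above: for fractional offsets xdec, ydec in [0, 1) the four corner weights a, b, c, d always sum to 1, and at integer coordinates the result collapses to the top-left sample b00. A minimal standalone sketch (the helper name bilinear() and the sample values are illustrative, not part of the patch):

#include <stdio.h>

/* Same weighting as pixel_interp(), kept in double for clarity. */
static double bilinear(double xdec, double ydec, double b00, double b01,
                       double b10, double b11) {
  const double a = (1 - xdec) * (1 - ydec);
  const double b = xdec * (1 - ydec);
  const double c = (1 - xdec) * ydec;
  const double d = xdec * ydec;
  return a * b00 + b * b01 + c * b10 + d * b11;  // a + b + c + d == 1
}

int main(void) {
  printf("%f\n", bilinear(0.0, 0.0, 10, 20, 30, 40));  // 10: integer location
  printf("%f\n", bilinear(0.5, 0.5, 10, 20, 30, 40));  // 25: average of corners
  return 0;
}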
+static void gradient_interp(double *fullpel_deriv, const double x_coord, + const double y_coord, const int w, const int h, + double *derivative) { + const int xint = (int)x_coord; + const int yint = (int)y_coord; + double interp; + if (xint + 1 > w - 1 || yint + 1 > h - 1) { + interp = fullpel_deriv[yint * w + xint]; + } else { + interp = pixel_interp(x_coord, y_coord, fullpel_deriv[yint * w + xint], + fullpel_deriv[yint * w + (xint + 1)], + fullpel_deriv[(yint + 1) * w + xint], + fullpel_deriv[(yint + 1) * w + (xint + 1)]); + } + + *derivative = interp; +} + +static void temporal_gradient(const YV12_BUFFER_CONFIG *frame, + const YV12_BUFFER_CONFIG *frame2, + const double x_coord, const double y_coord, + const int bit_depth, double *derivative, + LOCALMV *mv) { + const int w = 2; + const int h = 2; + uint8_t pred1[4]; + uint8_t pred2[4]; + + const int y = (int)y_coord; + const int x = (int)x_coord; + const double ydec = y_coord - y; + const double xdec = x_coord - x; + const int is_intrabc = 0; // Is intra-copied? + const int is_high_bitdepth = is_frame_high_bitdepth(frame2); + const int subsampling_x = 0, subsampling_y = 0; // for y-buffer + const int_interpfilters interp_filters = + av1_broadcast_interp_filter(MULTITAP_SHARP); + const int plane = 0; // y-plane + const struct buf_2d ref_buf2 = { NULL, frame2->y_buffer, frame2->y_crop_width, + frame2->y_crop_height, frame2->y_stride }; + struct scale_factors scale; + av1_setup_scale_factors_for_frame(&scale, frame->y_crop_width, + frame->y_crop_height, frame->y_crop_width, + frame->y_crop_height); + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x, + subsampling_y, bit_depth, is_high_bitdepth, is_intrabc, + &scale, &ref_buf2, interp_filters); + inter_pred_params.interp_filter_params[0] = + &av1_interp_filter_params_list[interp_filters.as_filters.x_filter]; + inter_pred_params.interp_filter_params[1] = + &av1_interp_filter_params_list[interp_filters.as_filters.y_filter]; + inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); + MV newmv = { .row = (int16_t)round((mv->row + xdec) * 8), + .col = (int16_t)round((mv->col + ydec) * 8) }; + av1_enc_build_one_inter_predictor(pred2, w, &newmv, &inter_pred_params); + const struct buf_2d ref_buf1 = { NULL, frame->y_buffer, frame->y_crop_width, + frame->y_crop_height, frame->y_stride }; + av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x, + subsampling_y, bit_depth, is_high_bitdepth, is_intrabc, + &scale, &ref_buf1, interp_filters); + inter_pred_params.interp_filter_params[0] = + &av1_interp_filter_params_list[interp_filters.as_filters.x_filter]; + inter_pred_params.interp_filter_params[1] = + &av1_interp_filter_params_list[interp_filters.as_filters.y_filter]; + inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); + MV zeroMV = { .row = (int16_t)round(xdec * 8), + .col = (int16_t)round(ydec * 8) }; + av1_enc_build_one_inter_predictor(pred1, w, &zeroMV, &inter_pred_params); + + *derivative = pred2[0] - pred1[0]; +} + +// Numerical differentiate over window_size x window_size surrounding (x,y) +// location. 
Alters ix, iy, it to contain numerical partial derivatives +static void gradients_over_window(const YV12_BUFFER_CONFIG *frame, + const YV12_BUFFER_CONFIG *ref_frame, + const double x_coord, const double y_coord, + const int window_size, const int bit_depth, + double *ix, double *iy, double *it, + LOCALMV *mv) { + const double left = x_coord - window_size / 2.0; + const double top = y_coord - window_size / 2.0; + // gradient operators need pixel before and after (start at 1) + const double x_start = AOMMAX(1, left); + const double y_start = AOMMAX(1, top); + const int frame_height = frame->y_crop_height; + const int frame_width = frame->y_crop_width; + double deriv_x; + double deriv_y; + double deriv_t; + + const double x_end = AOMMIN(x_coord + window_size / 2.0, frame_width - 2); + const double y_end = AOMMIN(y_coord + window_size / 2.0, frame_height - 2); + const int xs = (int)AOMMAX(1, x_start - 1); + const int ys = (int)AOMMAX(1, y_start - 1); + const int xe = (int)AOMMIN(x_end + 2, frame_width - 2); + const int ye = (int)AOMMIN(y_end + 2, frame_height - 2); + // with normalization, gradients may be double values + double *fullpel_dx = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_x)); + double *fullpel_dy = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_y)); + if (!fullpel_dx || !fullpel_dy) { + aom_free(fullpel_dx); + aom_free(fullpel_dy); + return; + } + + // TODO(any): This could be more efficient in the case that x_coord + // and y_coord are integers.. but it may look more messy. + + // calculate spatial gradients at full pixel locations + for (int j = ys; j < ye; j++) { + for (int i = xs; i < xe; i++) { + spatial_gradient(frame, i, j, 0, &deriv_x); + spatial_gradient(frame, i, j, 1, &deriv_y); + int idx = (j - ys) * (xe - xs) + (i - xs); + fullpel_dx[idx] = deriv_x; + fullpel_dy[idx] = deriv_y; + } + } + // compute numerical differentiation for every pixel in window + // (this potentially includes subpixels) + for (double j = y_start; j < y_end; j++) { + for (double i = x_start; i < x_end; i++) { + temporal_gradient(frame, ref_frame, i, j, bit_depth, &deriv_t, mv); + gradient_interp(fullpel_dx, i - xs, j - ys, xe - xs, ye - ys, &deriv_x); + gradient_interp(fullpel_dy, i - xs, j - ys, xe - xs, ye - ys, &deriv_y); + int idx = (int)(j - top) * window_size + (int)(i - left); + ix[idx] = deriv_x; + iy[idx] = deriv_y; + it[idx] = deriv_t; + } + } + // TODO(any): to avoid setting deriv arrays to zero for every iteration, + // could instead pass these two values back through function call + // int first_idx = (int)(y_start - top) * window_size + (int)(x_start - left); + // int width = window_size - ((int)(x_start - left) + (int)(left + window_size + // - x_end)); + + aom_free(fullpel_dx); + aom_free(fullpel_dy); +} + +// To compute eigenvalues of 2x2 matrix: Solve for lambda where +// Determinant(matrix - lambda*identity) == 0 +static void eigenvalues_2x2(const double *matrix, double *eig) { + const double a = 1; + const double b = -1 * matrix[0] - matrix[3]; + const double c = -1 * matrix[1] * matrix[2] + matrix[0] * matrix[3]; + // quadratic formula + const double discriminant = b * b - 4 * a * c; + eig[0] = (-b - sqrt(discriminant)) / (2.0 * a); + eig[1] = (-b + sqrt(discriminant)) / (2.0 * a); + // double check that eigenvalues are ordered by magnitude + if (fabs(eig[0]) > fabs(eig[1])) { + double tmp = eig[0]; + eig[0] = eig[1]; + eig[1] = tmp; + } +} + +// Shi-Tomasi corner detection criteria +static double corner_score(const YV12_BUFFER_CONFIG *frame_to_filter, + const 
YV12_BUFFER_CONFIG *ref_frame, const int x, + const int y, double *i_x, double *i_y, double *i_t, + const int n, const int bit_depth) { + double eig[2]; + LOCALMV mv = { .row = 0, .col = 0 }; + // TODO(any): technically, ref_frame and i_t are not used by corner score + // so these could be replaced by dummy variables, + // or change this to spatial gradient function over window only + gradients_over_window(frame_to_filter, ref_frame, x, y, n, bit_depth, i_x, + i_y, i_t, &mv); + double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 }; + multiply_mat(i_x, i_x, Mres1, 1, n * n, 1); + multiply_mat(i_x, i_y, Mres2, 1, n * n, 1); + multiply_mat(i_y, i_y, Mres3, 1, n * n, 1); + double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] }; + eigenvalues_2x2(M, eig); + return fabs(eig[0]); +} + +// Finds corners in frame_to_filter +// For less strict requirements (i.e. more corners), decrease threshold +static int detect_corners(const YV12_BUFFER_CONFIG *frame_to_filter, + const YV12_BUFFER_CONFIG *ref_frame, + const int maxcorners, int *ref_corners, + const int bit_depth) { + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + // TODO(any): currently if maxcorners is decreased, then it only means + // corners will be omited from bottom-right of image. if maxcorners + // is actually used, then this algorithm would need to re-iterate + // and choose threshold based on that + assert(maxcorners == frame_height * frame_width); + int countcorners = 0; + const double threshold = 0.1; + double score; + const int n = 3; + double i_x[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + double i_y[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + double i_t[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + const int fromedge = n; + double max_score = corner_score(frame_to_filter, ref_frame, fromedge, + fromedge, i_x, i_y, i_t, n, bit_depth); + // rough estimate of max corner score in image + for (int x = fromedge; x < frame_width - fromedge; x += 1) { + for (int y = fromedge; y < frame_height - fromedge; y += frame_height / 5) { + for (int i = 0; i < n * n; i++) { + i_x[i] = 0; + i_y[i] = 0; + i_t[i] = 0; + } + score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n, + bit_depth); + if (score > max_score) { + max_score = score; + } + } + } + // score all the points and choose corners over threshold + for (int x = fromedge; x < frame_width - fromedge; x += 1) { + for (int y = fromedge; + (y < frame_height - fromedge) && countcorners < maxcorners; y += 1) { + for (int i = 0; i < n * n; i++) { + i_x[i] = 0; + i_y[i] = 0; + i_t[i] = 0; + } + score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n, + bit_depth); + if (score > threshold * max_score) { + ref_corners[countcorners * 2] = x; + ref_corners[countcorners * 2 + 1] = y; + countcorners++; + } + } + } + return countcorners; +} + +// weights is an nxn matrix. weights is filled with a gaussian function, +// with independent variable: distance from the center point. 
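corner_score() above builds the 2x2 structure tensor M = [sum(ix*ix), sum(ix*iy); sum(ix*iy), sum(iy*iy)], hands it to eigenvalues_2x2(), and keeps the smaller-magnitude eigenvalue, which is the Shi-Tomasi criterion: a point counts as a corner only when both eigenvalues are large. A standalone editor's sketch (eig_2x2() restates eigenvalues_2x2(); the two matrices are made-up examples):

#include <math.h>
#include <stdio.h>

/* eig[0] receives the smaller-magnitude eigenvalue, as in eigenvalues_2x2(). */
static void eig_2x2(const double m[4], double eig[2]) {
  const double b = -(m[0] + m[3]);             // -trace
  const double c = m[0] * m[3] - m[1] * m[2];  // determinant
  const double disc = sqrt(b * b - 4 * c);
  eig[0] = (-b - disc) / 2.0;
  eig[1] = (-b + disc) / 2.0;
  if (fabs(eig[0]) > fabs(eig[1])) {
    const double t = eig[0];
    eig[0] = eig[1];
    eig[1] = t;
  }
}

int main(void) {
  const double corner[4] = { 9, 2, 2, 8 };  // strong gradients in x and y
  const double edge[4] = { 9, 3, 3, 1 };    // gradients along one line only
  double e[2];
  eig_2x2(corner, e);
  printf("corner: %.2f %.2f\n", e[0], e[1]);  // ~6.44 ~10.56: both large
  eig_2x2(edge, e);
  printf("edge:   %.2f %.2f\n", e[0], e[1]);  // ~0.00 ~10.00: rejected
  return 0;
}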
+static void gaussian(const double sigma, const int n, const int normalize, + double *weights) { + double total_weight = 0; + for (int j = 0; j < n; j++) { + for (int i = 0; i < n; i++) { + double distance = sqrt(pow(n / 2 - i, 2) + pow(n / 2 - j, 2)); + double weight = exp(-0.5 * pow(distance / sigma, 2)); + weights[j * n + i] = weight; + total_weight += weight; + } + } + if (normalize == 1) { + for (int j = 0; j < n; j++) { + weights[j] = weights[j] / total_weight; + } + } +} + +static double convolve(const double *filter, const int *img, const int size) { + double result = 0; + for (int i = 0; i < size; i++) { + result += filter[i] * img[i]; + } + return result; +} + +// Applies a Gaussian low-pass smoothing filter to produce +// a corresponding lower resolution image with halved dimensions +static void reduce(uint8_t *img, int height, int width, int stride, + uint8_t *reduced_img) { + const int new_width = width / 2; + const int window_size = 5; + const double gaussian_filter[25] = { + 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16, + 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32, + 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256, + 1. / 64, 3. / 128, 1. / 64, 1. / 256 + }; + // filter is 5x5 so need prev and forward 2 pixels + int img_section[25]; + for (int y = 0; y < height - 1; y += 2) { + for (int x = 0; x < width - 1; x += 2) { + int i = 0; + for (int yy = y - window_size / 2; yy <= y + window_size / 2; yy++) { + for (int xx = x - window_size / 2; xx <= x + window_size / 2; xx++) { + int yvalue = yy; + int xvalue = xx; + // copied pixels outside the boundary + if (yvalue < 0) yvalue = 0; + if (xvalue < 0) xvalue = 0; + if (yvalue >= height) yvalue = height - 1; + if (xvalue >= width) xvalue = width - 1; + img_section[i++] = img[yvalue * stride + xvalue]; + } + } + reduced_img[(y / 2) * new_width + (x / 2)] = (uint8_t)convolve( + gaussian_filter, img_section, window_size * window_size); + } + } +} + +static int cmpfunc(const void *a, const void *b) { + return (*(int *)a - *(int *)b); +} +static void filter_mvs(const MV_FILTER_TYPE mv_filter, const int frame_height, + const int frame_width, LOCALMV *localmvs, MV *mvs) { + const int n = 5; // window size + // for smoothing filter + const double gaussian_filter[25] = { + 1. / 256, 1. / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16, + 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32, + 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256, + 1. / 64, 3. / 128, 1. / 64, 1. 
/ 256 + }; + // for median filter + int mvrows[25]; + int mvcols[25]; + if (mv_filter != MV_FILTER_NONE) { + for (int y = 0; y < frame_height; y++) { + for (int x = 0; x < frame_width; x++) { + int center_idx = y * frame_width + x; + int i = 0; + double filtered_row = 0; + double filtered_col = 0; + for (int yy = y - n / 2; yy <= y + n / 2; yy++) { + for (int xx = x - n / 2; xx <= x + n / 2; xx++) { + int yvalue = yy; + int xvalue = xx; + // copied pixels outside the boundary + if (yvalue < 0) yvalue = 0; + if (xvalue < 0) xvalue = 0; + if (yvalue >= frame_height) yvalue = frame_height - 1; + if (xvalue >= frame_width) xvalue = frame_width - 1; + int index = yvalue * frame_width + xvalue; + if (mv_filter == MV_FILTER_SMOOTH) { + filtered_row += mvs[index].row * gaussian_filter[i]; + filtered_col += mvs[index].col * gaussian_filter[i]; + } else if (mv_filter == MV_FILTER_MEDIAN) { + mvrows[i] = mvs[index].row; + mvcols[i] = mvs[index].col; + } + i++; + } + } + + MV mv = mvs[center_idx]; + if (mv_filter == MV_FILTER_SMOOTH) { + mv.row = (int16_t)filtered_row; + mv.col = (int16_t)filtered_col; + } else if (mv_filter == MV_FILTER_MEDIAN) { + qsort(mvrows, 25, sizeof(mv.row), cmpfunc); + qsort(mvcols, 25, sizeof(mv.col), cmpfunc); + mv.row = mvrows[25 / 2]; + mv.col = mvcols[25 / 2]; + } + LOCALMV localmv = { .row = ((double)mv.row) / 8, + .col = ((double)mv.row) / 8 }; + localmvs[y * frame_width + x] = localmv; + // if mvs array is immediately updated here, then the result may + // propagate to other pixels. + } + } + for (int i = 0; i < frame_height * frame_width; i++) { + MV mv = { .row = (int16_t)round(8 * localmvs[i].row), + .col = (int16_t)round(8 * localmvs[i].col) }; + mvs[i] = mv; + } + } +} + +// Computes optical flow at a single pyramid level, +// using Lucas-Kanade algorithm. +// Modifies mvs array. 
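Before the lucas_kanade() implementation that follows, a compact editor's sketch of the 2x2 system it solves per corner: M u = b, where M accumulates weighted products of the spatial gradients and b accumulates weighted products with the temporal gradient. Upstream solves it with linsolve(); this sketch uses Cramer's rule instead, and the helper name solve_2x2() and the toy sums are illustrative only.

#include <stdio.h>

/* Solve [m00 m01; m01 m11] * [u; v] = [b0; b1] for a symmetric 2x2 M. */
static int solve_2x2(double m00, double m01, double m11, double b0, double b1,
                     double *u, double *v) {
  const double det = m00 * m11 - m01 * m01;
  if (det == 0.0) return 0;  // singular M: leave the displacement at zero
  *u = (b0 * m11 - b1 * m01) / det;
  *v = (m00 * b1 - m01 * b0) / det;
  return 1;
}

int main(void) {
  // Toy window sums: M = [4 1; 1 3], b = [-2; -1].
  double u = 0, v = 0;
  if (solve_2x2(4, 1, 3, -2, -1, &u, &v))
    printf("flow update = (%.3f, %.3f)\n", u, v);  // (-0.455, -0.182)
  return 0;
}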
+static void lucas_kanade(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, const int level, + const LK_PARAMS *lk_params, const int num_ref_corners, + int *ref_corners, const int mv_stride, + const int bit_depth, LOCALMV *mvs) { + assert(lk_params->window_size > 0 && lk_params->window_size % 2 == 0); + const int n = lk_params->window_size; + // algorithm is sensitive to window size + double *i_x = (double *)aom_malloc(n * n * sizeof(*i_x)); + double *i_y = (double *)aom_malloc(n * n * sizeof(*i_y)); + double *i_t = (double *)aom_malloc(n * n * sizeof(*i_t)); + double *weights = (double *)aom_malloc(n * n * sizeof(*weights)); + if (!i_x || !i_y || !i_t || !weights) goto free_lk_buf; + + const int expand_multiplier = (int)pow(2, level); + double sigma = 0.2 * n; + // normalizing doesn't really affect anything since it's applied + // to every component of M and b + gaussian(sigma, n, 0, weights); + for (int i = 0; i < num_ref_corners; i++) { + const double x_coord = 1.0 * ref_corners[i * 2] / expand_multiplier; + const double y_coord = 1.0 * ref_corners[i * 2 + 1] / expand_multiplier; + int highres_x = ref_corners[i * 2]; + int highres_y = ref_corners[i * 2 + 1]; + int mv_idx = highres_y * (mv_stride) + highres_x; + LOCALMV mv_old = mvs[mv_idx]; + mv_old.row = mv_old.row / expand_multiplier; + mv_old.col = mv_old.col / expand_multiplier; + // using this instead of memset, since it's not completely + // clear if zero memset works on double arrays + for (int j = 0; j < n * n; j++) { + i_x[j] = 0; + i_y[j] = 0; + i_t[j] = 0; + } + gradients_over_window(from_frame, to_frame, x_coord, y_coord, n, bit_depth, + i_x, i_y, i_t, &mv_old); + double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 }; + double bres1[1] = { 0 }, bres2[1] = { 0 }; + for (int j = 0; j < n * n; j++) { + Mres1[0] += weights[j] * i_x[j] * i_x[j]; + Mres2[0] += weights[j] * i_x[j] * i_y[j]; + Mres3[0] += weights[j] * i_y[j] * i_y[j]; + bres1[0] += weights[j] * i_x[j] * i_t[j]; + bres2[0] += weights[j] * i_y[j] * i_t[j]; + } + double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] }; + double b[2] = { -1 * bres1[0], -1 * bres2[0] }; + double eig[2] = { 1, 1 }; + eigenvalues_2x2(M, eig); + double threshold = 0.1; + if (fabs(eig[0]) > threshold) { + // if M is not invertible, then displacement + // will default to zeros + double u[2] = { 0, 0 }; + linsolve(2, M, 2, b, u); + int mult = 1; + if (level != 0) + mult = expand_multiplier; // mv doubles when resolution doubles + LOCALMV mv = { .row = (mult * (u[0] + mv_old.row)), + .col = (mult * (u[1] + mv_old.col)) }; + mvs[mv_idx] = mv; + mvs[mv_idx] = mv; + } + } +free_lk_buf: + aom_free(weights); + aom_free(i_t); + aom_free(i_x); + aom_free(i_y); +} + +// Warp the src_frame to warper_frame according to mvs. 
+// mvs point to src_frame +static void warp_back_frame(YV12_BUFFER_CONFIG *warped_frame, + const YV12_BUFFER_CONFIG *src_frame, + const LOCALMV *mvs, int mv_stride) { + int w, h; + const int fw = src_frame->y_crop_width; + const int fh = src_frame->y_crop_height; + const int src_fs = src_frame->y_stride, warped_fs = warped_frame->y_stride; + const uint8_t *src_buf = src_frame->y_buffer; + uint8_t *warped_buf = warped_frame->y_buffer; + double temp; + for (h = 0; h < fh; h++) { + for (w = 0; w < fw; w++) { + double cord_x = (double)w + mvs[h * mv_stride + w].col; + double cord_y = (double)h + mvs[h * mv_stride + w].row; + cord_x = fclamp(cord_x, 0, (double)(fw - 1)); + cord_y = fclamp(cord_y, 0, (double)(fh - 1)); + const int floorx = (int)floor(cord_x); + const int floory = (int)floor(cord_y); + const double fracx = cord_x - (double)floorx; + const double fracy = cord_y - (double)floory; + + temp = 0; + for (int hh = 0; hh < 2; hh++) { + const double weighth = hh ? (fracy) : (1 - fracy); + for (int ww = 0; ww < 2; ww++) { + const double weightw = ww ? (fracx) : (1 - fracx); + int y = floory + hh; + int x = floorx + ww; + y = clamp(y, 0, fh - 1); + x = clamp(x, 0, fw - 1); + temp += (double)src_buf[y * src_fs + x] * weightw * weighth; + } + } + warped_buf[h * warped_fs + w] = (uint8_t)round(temp); + } + } +} + +// Same as warp_back_frame, but using a better interpolation filter. +static void warp_back_frame_intp(YV12_BUFFER_CONFIG *warped_frame, + const YV12_BUFFER_CONFIG *src_frame, + const LOCALMV *mvs, int mv_stride) { + int w, h; + const int fw = src_frame->y_crop_width; + const int fh = src_frame->y_crop_height; + const int warped_fs = warped_frame->y_stride; + uint8_t *warped_buf = warped_frame->y_buffer; + const int blk = 2; + uint8_t temp_blk[4]; + + const int is_intrabc = 0; // Is intra-copied? 
+ const int is_high_bitdepth = is_frame_high_bitdepth(src_frame); + const int subsampling_x = 0, subsampling_y = 0; // for y-buffer + const int_interpfilters interp_filters = + av1_broadcast_interp_filter(MULTITAP_SHARP2); + const int plane = 0; // y-plane + const struct buf_2d ref_buf2 = { NULL, src_frame->y_buffer, + src_frame->y_crop_width, + src_frame->y_crop_height, + src_frame->y_stride }; + const int bit_depth = src_frame->bit_depth; + struct scale_factors scale; + av1_setup_scale_factors_for_frame( + &scale, src_frame->y_crop_width, src_frame->y_crop_height, + src_frame->y_crop_width, src_frame->y_crop_height); + + for (h = 0; h < fh; h++) { + for (w = 0; w < fw; w++) { + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, blk, blk, h, w, subsampling_x, + subsampling_y, bit_depth, is_high_bitdepth, + is_intrabc, &scale, &ref_buf2, interp_filters); + inter_pred_params.interp_filter_params[0] = + &av1_interp_filter_params_list[interp_filters.as_filters.x_filter]; + inter_pred_params.interp_filter_params[1] = + &av1_interp_filter_params_list[interp_filters.as_filters.y_filter]; + inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); + MV newmv = { .row = (int16_t)round((mvs[h * mv_stride + w].row) * 8), + .col = (int16_t)round((mvs[h * mv_stride + w].col) * 8) }; + av1_enc_build_one_inter_predictor(temp_blk, blk, &newmv, + &inter_pred_params); + warped_buf[h * warped_fs + w] = temp_blk[0]; + } + } +} + +#define DERIVATIVE_FILTER_LENGTH 7 +double filter[DERIVATIVE_FILTER_LENGTH] = { -1.0 / 60, 9.0 / 60, -45.0 / 60, 0, + 45.0 / 60, -9.0 / 60, 1.0 / 60 }; + +// Get gradient of the whole frame +static void get_frame_gradients(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, double *ix, + double *iy, double *it, int grad_stride) { + int w, h, k, idx; + const int fw = from_frame->y_crop_width; + const int fh = from_frame->y_crop_height; + const int from_fs = from_frame->y_stride, to_fs = to_frame->y_stride; + const uint8_t *from_buf = from_frame->y_buffer; + const uint8_t *to_buf = to_frame->y_buffer; + + const int lh = DERIVATIVE_FILTER_LENGTH; + const int hleft = (lh - 1) / 2; + + for (h = 0; h < fh; h++) { + for (w = 0; w < fw; w++) { + // x + ix[h * grad_stride + w] = 0; + for (k = 0; k < lh; k++) { + // if we want to make this block dependent, need to extend the + // boundaries using other initializations. + idx = w + k - hleft; + idx = clamp(idx, 0, fw - 1); + ix[h * grad_stride + w] += filter[k] * 0.5 * + ((double)from_buf[h * from_fs + idx] + + (double)to_buf[h * to_fs + idx]); + } + // y + iy[h * grad_stride + w] = 0; + for (k = 0; k < lh; k++) { + // if we want to make this block dependent, need to extend the + // boundaries using other initializations. 
+ idx = h + k - hleft; + idx = clamp(idx, 0, fh - 1); + iy[h * grad_stride + w] += filter[k] * 0.5 * + ((double)from_buf[idx * from_fs + w] + + (double)to_buf[idx * to_fs + w]); + } + // t + it[h * grad_stride + w] = + (double)to_buf[h * to_fs + w] - (double)from_buf[h * from_fs + w]; + } + } +} + +// Solve for linear equations given by the H-S method +static void solve_horn_schunck(const double *ix, const double *iy, + const double *it, int grad_stride, int width, + int height, const LOCALMV *init_mvs, + int init_mv_stride, LOCALMV *mvs, + int mv_stride) { + // TODO(bohanli): May just need to allocate the buffers once per optical flow + // calculation + int *row_pos = aom_calloc(width * height * 28, sizeof(*row_pos)); + int *col_pos = aom_calloc(width * height * 28, sizeof(*col_pos)); + double *values = aom_calloc(width * height * 28, sizeof(*values)); + double *mv_vec = aom_calloc(width * height * 2, sizeof(*mv_vec)); + double *mv_init_vec = aom_calloc(width * height * 2, sizeof(*mv_init_vec)); + double *temp_b = aom_calloc(width * height * 2, sizeof(*temp_b)); + double *b = aom_calloc(width * height * 2, sizeof(*b)); + if (!row_pos || !col_pos || !values || !mv_vec || !mv_init_vec || !temp_b || + !b) { + goto free_hs_solver_buf; + } + + // the location idx for neighboring pixels, k < 4 are the 4 direct neighbors + const int check_locs_y[12] = { 0, 0, -1, 1, -1, -1, 1, 1, 0, 0, -2, 2 }; + const int check_locs_x[12] = { -1, 1, 0, 0, -1, 1, -1, 1, -2, 2, 0, 0 }; + + int h, w, checkh, checkw, k, ret; + const int offset = height * width; + SPARSE_MTX A; + int c = 0; + const double lambda = 100; + + for (w = 0; w < width; w++) { + for (h = 0; h < height; h++) { + mv_init_vec[w * height + h] = init_mvs[h * init_mv_stride + w].col; + mv_init_vec[w * height + h + offset] = + init_mvs[h * init_mv_stride + w].row; + } + } + + // get matrix A + for (w = 0; w < width; w++) { + for (h = 0; h < height; h++) { + int center_num_direct = 4; + const int center_idx = w * height + h; + if (w == 0 || w == width - 1) center_num_direct--; + if (h == 0 || h == height - 1) center_num_direct--; + // diagonal entry for this row from the center pixel + double cor_w = center_num_direct * center_num_direct + center_num_direct; + row_pos[c] = center_idx; + col_pos[c] = center_idx; + values[c] = lambda * cor_w; + c++; + row_pos[c] = center_idx + offset; + col_pos[c] = center_idx + offset; + values[c] = lambda * cor_w; + c++; + // other entries from direct neighbors + for (k = 0; k < 4; k++) { + checkh = h + check_locs_y[k]; + checkw = w + check_locs_x[k]; + if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) { + continue; + } + int this_idx = checkw * height + checkh; + int this_num_direct = 4; + if (checkw == 0 || checkw == width - 1) this_num_direct--; + if (checkh == 0 || checkh == height - 1) this_num_direct--; + cor_w = -center_num_direct - this_num_direct; + row_pos[c] = center_idx; + col_pos[c] = this_idx; + values[c] = lambda * cor_w; + c++; + row_pos[c] = center_idx + offset; + col_pos[c] = this_idx + offset; + values[c] = lambda * cor_w; + c++; + } + // entries from neighbors on the diagonal corners + for (k = 4; k < 8; k++) { + checkh = h + check_locs_y[k]; + checkw = w + check_locs_x[k]; + if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) { + continue; + } + int this_idx = checkw * height + checkh; + cor_w = 2; + row_pos[c] = center_idx; + col_pos[c] = this_idx; + values[c] = lambda * cor_w; + c++; + row_pos[c] = center_idx + offset; + col_pos[c] = this_idx + offset; + 
values[c] = lambda * cor_w; + c++; + } + // entries from neighbors with dist of 2 + for (k = 8; k < 12; k++) { + checkh = h + check_locs_y[k]; + checkw = w + check_locs_x[k]; + if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) { + continue; + } + int this_idx = checkw * height + checkh; + cor_w = 1; + row_pos[c] = center_idx; + col_pos[c] = this_idx; + values[c] = lambda * cor_w; + c++; + row_pos[c] = center_idx + offset; + col_pos[c] = this_idx + offset; + values[c] = lambda * cor_w; + c++; + } + } + } + ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height, + 2 * width * height, &A); + if (ret < 0) goto free_hs_solver_buf; + // subtract init mv part from b + av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height); + for (int i = 0; i < 2 * width * height; i++) { + b[i] = -temp_b[i]; + } + av1_free_sparse_mtx_elems(&A); + + // add cross terms to A and modify b with ExEt / EyEt + for (w = 0; w < width; w++) { + for (h = 0; h < height; h++) { + int curidx = w * height + h; + // modify b + b[curidx] += -ix[h * grad_stride + w] * it[h * grad_stride + w]; + b[curidx + offset] += -iy[h * grad_stride + w] * it[h * grad_stride + w]; + // add cross terms to A + row_pos[c] = curidx; + col_pos[c] = curidx + offset; + values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w]; + c++; + row_pos[c] = curidx + offset; + col_pos[c] = curidx; + values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w]; + c++; + } + } + // Add diagonal terms to A + for (int i = 0; i < c; i++) { + if (row_pos[i] == col_pos[i]) { + if (row_pos[i] < offset) { + w = row_pos[i] / height; + h = row_pos[i] % height; + values[i] += pow(ix[h * grad_stride + w], 2); + } else { + w = (row_pos[i] - offset) / height; + h = (row_pos[i] - offset) % height; + values[i] += pow(iy[h * grad_stride + w], 2); + } + } + } + + ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height, + 2 * width * height, &A); + if (ret < 0) goto free_hs_solver_buf; + + // solve for the mvs + ret = av1_conjugate_gradient_sparse(&A, b, 2 * width * height, mv_vec); + if (ret < 0) goto free_hs_solver_buf; + + // copy mvs + for (w = 0; w < width; w++) { + for (h = 0; h < height; h++) { + mvs[h * mv_stride + w].col = mv_vec[w * height + h]; + mvs[h * mv_stride + w].row = mv_vec[w * height + h + offset]; + } + } +free_hs_solver_buf: + aom_free(row_pos); + aom_free(col_pos); + aom_free(values); + aom_free(mv_vec); + aom_free(mv_init_vec); + aom_free(b); + aom_free(temp_b); + av1_free_sparse_mtx_elems(&A); +} + +// Calculate optical flow from from_frame to to_frame using the H-S method. +static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, const int level, + const int mv_stride, const int mv_height, + const int mv_width, const OPFL_PARAMS *opfl_params, + LOCALMV *mvs) { + // mvs are always on level 0, here we define two new mv arrays that is of size + // of this level. 
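The patch does not state the objective behind the sparse system assembled in solve_horn_schunck() above, so the comment block below is an editor's summary of the standard formulation it appears to implement; treat it as an interpretation rather than upstream documentation.

  /*
   * Editor's note (interpretation): solve_horn_schunck() corresponds to
   * minimizing, over the flow increment (u, v),
   *
   *   E(u, v) = sum_p (Ix(p) u(p) + Iy(p) v(p) + It(p))^2
   *           + lambda * sum_p (Lap(u)(p)^2 + Lap(v)(p)^2)
   *
   * i.e. a Horn-Schunck style data term with a Laplacian (second-order)
   * smoothness penalty. Setting the derivatives to zero gives two linear
   * equations per pixel; the lambda-weighted stencil entries assembled above
   * (center_num_direct^2 + center_num_direct, -center_num_direct -
   * this_num_direct, 2, 1) match the rows of L^T L for the 4-neighbor
   * Laplacian L, and the Ix*Iy cross terms plus the Ix^2 / Iy^2 diagonal
   * terms complete A in the system A x = b handed to
   * av1_conjugate_gradient_sparse().
   */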
+ const int fw = from_frame->y_crop_width; + const int fh = from_frame->y_crop_height; + const int factor = (int)pow(2, level); + int w, h, k, init_mv_stride; + LOCALMV *init_mvs = NULL, *refine_mvs = NULL; + double *ix = NULL, *iy = NULL, *it = NULL; + YV12_BUFFER_CONFIG temp_frame; + temp_frame.y_buffer = NULL; + if (level == 0) { + init_mvs = mvs; + init_mv_stride = mv_stride; + } else { + init_mvs = aom_calloc(fw * fh, sizeof(*mvs)); + if (!init_mvs) goto free_hs_buf; + init_mv_stride = fw; + for (h = 0; h < fh; h++) { + for (w = 0; w < fw; w++) { + init_mvs[h * init_mv_stride + w].row = + mvs[h * factor * mv_stride + w * factor].row / (double)factor; + init_mvs[h * init_mv_stride + w].col = + mvs[h * factor * mv_stride + w * factor].col / (double)factor; + } + } + } + refine_mvs = aom_calloc(fw * fh, sizeof(*mvs)); + if (!refine_mvs) goto free_hs_buf; + // temp frame for warping + temp_frame.y_buffer = + (uint8_t *)aom_calloc(fh * fw, sizeof(*temp_frame.y_buffer)); + if (!temp_frame.y_buffer) goto free_hs_buf; + temp_frame.y_crop_height = fh; + temp_frame.y_crop_width = fw; + temp_frame.y_stride = fw; + // gradient buffers + ix = aom_calloc(fw * fh, sizeof(*ix)); + iy = aom_calloc(fw * fh, sizeof(*iy)); + it = aom_calloc(fw * fh, sizeof(*it)); + if (!ix || !iy || !it) goto free_hs_buf; + // For each warping step + for (k = 0; k < opfl_params->warping_steps; k++) { + // warp from_frame with init_mv + if (level == 0) { + warp_back_frame_intp(&temp_frame, to_frame, init_mvs, init_mv_stride); + } else { + warp_back_frame(&temp_frame, to_frame, init_mvs, init_mv_stride); + } + // calculate frame gradients + get_frame_gradients(from_frame, &temp_frame, ix, iy, it, fw); + // form linear equations and solve mvs + solve_horn_schunck(ix, iy, it, fw, fw, fh, init_mvs, init_mv_stride, + refine_mvs, fw); + // update init_mvs + for (h = 0; h < fh; h++) { + for (w = 0; w < fw; w++) { + init_mvs[h * init_mv_stride + w].col += refine_mvs[h * fw + w].col; + init_mvs[h * init_mv_stride + w].row += refine_mvs[h * fw + w].row; + } + } + } + // copy back the mvs if needed + if (level != 0) { + for (h = 0; h < mv_height; h++) { + for (w = 0; w < mv_width; w++) { + mvs[h * mv_stride + w].row = + init_mvs[h / factor * init_mv_stride + w / factor].row * + (double)factor; + mvs[h * mv_stride + w].col = + init_mvs[h / factor * init_mv_stride + w / factor].col * + (double)factor; + } + } + } +free_hs_buf: + if (level != 0) aom_free(init_mvs); + aom_free(refine_mvs); + aom_free(temp_frame.y_buffer); + aom_free(ix); + aom_free(iy); + aom_free(it); +} + +// Apply optical flow iteratively at each pyramid level +static void pyramid_optical_flow(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, + const int bit_depth, + const OPFL_PARAMS *opfl_params, + const OPTFLOW_METHOD method, LOCALMV *mvs) { + assert(opfl_params->pyramid_levels > 0 && + opfl_params->pyramid_levels <= MAX_PYRAMID_LEVELS); + int levels = opfl_params->pyramid_levels; + const int frame_height = from_frame->y_crop_height; + const int frame_width = from_frame->y_crop_width; + if ((frame_height / pow(2.0, levels - 1) < 50 || + frame_height / pow(2.0, levels - 1) < 50) && + levels > 1) + levels = levels - 1; + uint8_t *images1[MAX_PYRAMID_LEVELS] = { NULL }; + uint8_t *images2[MAX_PYRAMID_LEVELS] = { NULL }; + int *ref_corners = NULL; + + images1[0] = from_frame->y_buffer; + images2[0] = to_frame->y_buffer; + YV12_BUFFER_CONFIG *buffers1 = aom_malloc(levels * sizeof(*buffers1)); + YV12_BUFFER_CONFIG *buffers2 = aom_malloc(levels 
* sizeof(*buffers2)); + if (!buffers1 || !buffers2) goto free_pyramid_buf; + buffers1[0] = *from_frame; + buffers2[0] = *to_frame; + int fw = frame_width; + int fh = frame_height; + for (int i = 1; i < levels; i++) { + // TODO(bohanli): may need to extend buffers for better interpolation SIMD + images1[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images1[i])); + images2[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images2[i])); + if (!images1[i] || !images2[i]) goto free_pyramid_buf; + int stride; + if (i == 1) + stride = from_frame->y_stride; + else + stride = fw; + reduce(images1[i - 1], fh, fw, stride, images1[i]); + reduce(images2[i - 1], fh, fw, stride, images2[i]); + fh /= 2; + fw /= 2; + YV12_BUFFER_CONFIG a = { .y_buffer = images1[i], + .y_crop_width = fw, + .y_crop_height = fh, + .y_stride = fw }; + YV12_BUFFER_CONFIG b = { .y_buffer = images2[i], + .y_crop_width = fw, + .y_crop_height = fh, + .y_stride = fw }; + buffers1[i] = a; + buffers2[i] = b; + } + // Compute corners for specific frame + int num_ref_corners = 0; + if (is_sparse(opfl_params)) { + int maxcorners = from_frame->y_crop_width * from_frame->y_crop_height; + ref_corners = aom_malloc(maxcorners * 2 * sizeof(*ref_corners)); + if (!ref_corners) goto free_pyramid_buf; + num_ref_corners = detect_corners(from_frame, to_frame, maxcorners, + ref_corners, bit_depth); + } + const int stop_level = 0; + for (int i = levels - 1; i >= stop_level; i--) { + if (method == LUCAS_KANADE) { + assert(is_sparse(opfl_params)); + lucas_kanade(&buffers1[i], &buffers2[i], i, opfl_params->lk_params, + num_ref_corners, ref_corners, buffers1[0].y_crop_width, + bit_depth, mvs); + } else if (method == HORN_SCHUNCK) { + assert(!is_sparse(opfl_params)); + horn_schunck(&buffers1[i], &buffers2[i], i, buffers1[0].y_crop_width, + buffers1[0].y_crop_height, buffers1[0].y_crop_width, + opfl_params, mvs); + } + } +free_pyramid_buf: + for (int i = 1; i < levels; i++) { + aom_free(images1[i]); + aom_free(images2[i]); + } + aom_free(ref_corners); + aom_free(buffers1); + aom_free(buffers2); +} +// Computes optical flow by applying algorithm at +// multiple pyramid levels of images (lower-resolution, smoothed images) +// This accounts for larger motions. +// Inputs: +// from_frame Frame buffer. +// to_frame: Frame buffer. MVs point from_frame -> to_frame. +// from_frame_idx: Index of from_frame. +// to_frame_idx: Index of to_frame. Return all zero MVs when idx are equal. +// bit_depth: +// opfl_params: contains algorithm-specific parameters. +// mv_filter: MV_FILTER_NONE, MV_FILTER_SMOOTH, or MV_FILTER_MEDIAN. +// method: LUCAS_KANADE, HORN_SCHUNCK +// mvs: pointer to MVs. Contains initialization, and modified +// based on optical flow. 
Must have +// dimensions = from_frame->y_crop_width * from_frame->y_crop_height +void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, + const int from_frame_idx, const int to_frame_idx, + const int bit_depth, const OPFL_PARAMS *opfl_params, + const MV_FILTER_TYPE mv_filter, + const OPTFLOW_METHOD method, MV *mvs) { + const int frame_height = from_frame->y_crop_height; + const int frame_width = from_frame->y_crop_width; + // TODO(any): deal with the case where frames are not of the same dimensions + assert(frame_height == to_frame->y_crop_height && + frame_width == to_frame->y_crop_width); + if (from_frame_idx == to_frame_idx) { + // immediately return all zero mvs when frame indices are equal + for (int yy = 0; yy < frame_height; yy++) { + for (int xx = 0; xx < frame_width; xx++) { + MV mv = { .row = 0, .col = 0 }; + mvs[yy * frame_width + xx] = mv; + } + } + return; + } + + // Initialize double mvs based on input parameter mvs array + LOCALMV *localmvs = + aom_malloc(frame_height * frame_width * sizeof(*localmvs)); + if (!localmvs) return; + + filter_mvs(MV_FILTER_SMOOTH, frame_height, frame_width, localmvs, mvs); + + for (int i = 0; i < frame_width * frame_height; i++) { + MV mv = mvs[i]; + LOCALMV localmv = { .row = ((double)mv.row) / 8, + .col = ((double)mv.col) / 8 }; + localmvs[i] = localmv; + } + // Apply optical flow algorithm + pyramid_optical_flow(from_frame, to_frame, bit_depth, opfl_params, method, + localmvs); + + // Update original mvs array + for (int j = 0; j < frame_height; j++) { + for (int i = 0; i < frame_width; i++) { + int idx = j * frame_width + i; + if (j + localmvs[idx].row < 0 || j + localmvs[idx].row >= frame_height || + i + localmvs[idx].col < 0 || i + localmvs[idx].col >= frame_width) { + continue; + } + MV mv = { .row = (int16_t)round(8 * localmvs[idx].row), + .col = (int16_t)round(8 * localmvs[idx].col) }; + mvs[idx] = mv; + } + } + + filter_mvs(mv_filter, frame_height, frame_width, localmvs, mvs); + + aom_free(localmvs); +} +#endif diff --git a/third_party/aom/av1/encoder/optical_flow.h b/third_party/aom/av1/encoder/optical_flow.h new file mode 100644 index 0000000000..2fbe474d77 --- /dev/null +++ b/third_party/aom/av1/encoder/optical_flow.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_OPTICAL_FLOW_H_ +#define AOM_AV1_ENCODER_OPTICAL_FLOW_H_ + +#include "aom_scale/yv12config.h" +#include "av1/common/mv.h" +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_OPTICAL_FLOW_API + +typedef enum { LUCAS_KANADE, HORN_SCHUNCK } OPTFLOW_METHOD; + +typedef enum { + MV_FILTER_NONE, + MV_FILTER_SMOOTH, + MV_FILTER_MEDIAN +} MV_FILTER_TYPE; + +typedef struct LOCALMV { + double row; + double col; +} LOCALMV; + +#define MAX_PYRAMID_LEVELS 5 +// default options for optical flow +#define OPFL_WINDOW_SIZE 15 +#define OPFL_PYRAMID_LEVELS 3 // total levels +#define OPFL_WARPING_STEPS 3 + +// parameters specific to Lucas-Kanade +typedef struct lk_params { + int window_size; +} LK_PARAMS; + +// generic structure to contain parameters for all +// optical flow algorithms +typedef struct opfl_params { + int pyramid_levels; + int warping_steps; + LK_PARAMS *lk_params; + int flags; +} OPFL_PARAMS; + +#define OPFL_FLAG_SPARSE 1 + +void av1_init_opfl_params(OPFL_PARAMS *opfl_params); + +void av1_init_lk_params(LK_PARAMS *lk_params); + +void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, + const int from_frame_idx, const int to_frame_idx, + const int bit_depth, const OPFL_PARAMS *opfl_params, + const MV_FILTER_TYPE mv_filter, + const OPTFLOW_METHOD method, MV *mvs); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_OPTICAL_FLOW_H_ diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c new file mode 100644 index 0000000000..7f79e9596e --- /dev/null +++ b/third_party/aom/av1/encoder/palette.c @@ -0,0 +1,975 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "av1/common/pred_common.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/intra_mode_search.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/palette.h" +#include "av1/encoder/random.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/tx_search.h" + +#define AV1_K_MEANS_DIM 1 +#include "av1/encoder/k_means_template.h" +#undef AV1_K_MEANS_DIM +#define AV1_K_MEANS_DIM 2 +#include "av1/encoder/k_means_template.h" +#undef AV1_K_MEANS_DIM + +static int int16_comparer(const void *a, const void *b) { + return (*(int16_t *)a - *(int16_t *)b); +} + +int av1_remove_duplicates(int16_t *centroids, int num_centroids) { + int num_unique; // number of unique centroids + int i; + qsort(centroids, num_centroids, sizeof(*centroids), int16_comparer); + // Remove duplicates. 
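A short worked example of the pass that follows (added by the editor; the values are made up): qsort() has already placed duplicates next to each other, so one scan that keeps each element differing from its predecessor is enough.

  // Example: sorted centroids {3, 3, 7, 9, 9, 9}, num_centroids = 6.
  //   i = 1: 3 == 3 -> skipped
  //   i = 2: 7 != 3 -> kept, num_unique = 2
  //   i = 3: 9 != 7 -> kept, num_unique = 3
  //   i = 4: 9 == 9 -> skipped
  //   i = 5: 9 == 9 -> skipped
  // Leaves {3, 7, 9} at the front of the array and returns 3.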
+ num_unique = 1; + for (i = 1; i < num_centroids; ++i) { + if (centroids[i] != centroids[i - 1]) { // found a new unique centroid + centroids[num_unique++] = centroids[i]; + } + } + return num_unique; +} + +static int delta_encode_cost(const int *colors, int num, int bit_depth, + int min_val) { + if (num <= 0) return 0; + int bits_cost = bit_depth; + if (num == 1) return bits_cost; + bits_cost += 2; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + const int min_bits = bit_depth - 3; + for (int i = 1; i < num; ++i) { + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + int bits_per_delta = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits_per_delta <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; + for (int i = 0; i < num - 1; ++i) { + bits_cost += bits_per_delta; + range -= deltas[i]; + bits_per_delta = AOMMIN(bits_per_delta, av1_ceil_log2(range)); + } + return bits_cost; +} + +int av1_index_color_cache(const uint16_t *color_cache, int n_cache, + const uint16_t *colors, int n_colors, + uint8_t *cache_color_found, int *out_cache_colors) { + if (n_cache <= 0) { + for (int i = 0; i < n_colors; ++i) out_cache_colors[i] = colors[i]; + return n_colors; + } + memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found)); + int n_in_cache = 0; + int in_cache_flags[PALETTE_MAX_SIZE]; + memset(in_cache_flags, 0, sizeof(in_cache_flags)); + for (int i = 0; i < n_cache && n_in_cache < n_colors; ++i) { + for (int j = 0; j < n_colors; ++j) { + if (colors[j] == color_cache[i]) { + in_cache_flags[j] = 1; + cache_color_found[i] = 1; + ++n_in_cache; + break; + } + } + } + int j = 0; + for (int i = 0; i < n_colors; ++i) + if (!in_cache_flags[i]) out_cache_colors[j++] = colors[i]; + assert(j == n_colors - n_in_cache); + return j; +} + +int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *zero_count, + int *min_bits) { + const int n = pmi->palette_size[1]; + const int max_val = 1 << bit_depth; + int max_d = 0; + *min_bits = bit_depth - 4; + *zero_count = 0; + for (int i = 1; i < n; ++i) { + const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] - + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1]; + const int v = abs(delta); + const int d = AOMMIN(v, max_val - v); + if (d > max_d) max_d = d; + if (d == 0) ++(*zero_count); + } + return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits); +} + +int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, + const uint16_t *color_cache, int n_cache, + int bit_depth) { + const int n = pmi->palette_size[0]; + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + const int total_bits = + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1); + return av1_cost_literal(total_bits); +} + +int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, + const uint16_t *color_cache, int n_cache, + int bit_depth) { + const int n = pmi->palette_size[1]; + int total_bits = 0; + // U channel palette color cost. 
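To make the cache-indexing step concrete before it is applied to the U channel below, an editor's worked example of av1_index_color_cache() with made-up values: colors already present in the reference cache are only flagged as found, and just the remaining colors go through delta coding.

  // Example: color_cache = {10, 50, 90}, n_cache = 3, colors = {10, 60, 90}.
  //   10 and 90 match cache entries -> cache_color_found = {1, 0, 1}
  //   60 has no match               -> out_cache_colors  = {60}
  // The call returns 1, so only {60} is passed to delta_encode_cost().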
+ int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n, + cache_color_found, out_cache_colors); + total_bits += + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 0); + + // V channel palette color cost. + int zero_count = 0, min_bits_v = 0; + const int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int bits_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int bits_using_raw = bit_depth * n; + total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw); + return av1_cost_literal(total_bits); +} + +// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x +// new_height'. Extra rows and columns are filled in by copying last valid +// row/column. +static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map, + int orig_width, int orig_height, + int new_width, int new_height) { + int j; + assert(new_width >= orig_width); + assert(new_height >= orig_height); + if (new_width == orig_width && new_height == orig_height) return; + + for (j = orig_height - 1; j >= 0; --j) { + memmove(color_map + j * new_width, color_map + j * orig_width, orig_width); + // Copy last column to extra columns. + memset(color_map + j * new_width + orig_width, + color_map[j * new_width + orig_width - 1], new_width - orig_width); + } + // Copy last row to extra rows. + for (j = orig_height; j < new_height; ++j) { + memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width, + new_width); + } +} + +// Bias toward using colors in the cache. +// TODO(huisu): Try other schemes to improve compression. +static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache, + int n_cache, int n_colors, + int stride, int16_t *centroids, + int bit_depth) { + if (n_cache <= 0) return; + for (int i = 0; i < n_colors * stride; i += stride) { + int min_diff = abs((int)centroids[i] - (int)color_cache[0]); + int idx = 0; + for (int j = 1; j < n_cache; ++j) { + const int this_diff = abs((int)centroids[i] - (int)color_cache[j]); + if (this_diff < min_diff) { + min_diff = this_diff; + idx = j; + } + } + const int min_threshold = 4 << (bit_depth - 8); + if (min_diff <= min_threshold) centroids[i] = color_cache[idx]; + } +} + +/*!\brief Calculate the luma palette cost from a given color palette + * + * \ingroup palette_mode_search + * \callergraph + * Given the base colors as specified in centroids[], calculate the RD cost + * of palette mode. 
+ */ +static AOM_INLINE void palette_rd_y( + const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *centroids, + int n, uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating, + MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, + int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip, + uint8_t *tx_type_map, int *beat_best_palette_rd, + bool *do_header_rd_based_breakout, int discount_color_cost) { + if (do_header_rd_based_breakout != NULL) *do_header_rd_based_breakout = false; + optimize_palette_colors(color_cache, n_cache, n, 1, centroids, + cpi->common.seq_params->bit_depth); + const int num_unique_colors = av1_remove_duplicates(centroids, n); + if (num_unique_colors < PALETTE_MIN_SIZE) { + // Too few unique colors to create a palette. And DC_PRED will work + // well for that case anyway. So skip. + return; + } + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + if (cpi->common.seq_params->use_highbitdepth) { + for (int i = 0; i < num_unique_colors; ++i) { + pmi->palette_colors[i] = clip_pixel_highbd( + (int)centroids[i], cpi->common.seq_params->bit_depth); + } + } else { + for (int i = 0; i < num_unique_colors; ++i) { + pmi->palette_colors[i] = clip_pixel(centroids[i]); + } + } + pmi->palette_size[0] = num_unique_colors; + MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors, + 1); + extend_palette_color_map(color_map, cols, rows, block_width, block_height); + + RD_STATS tokenonly_rd_stats; + int this_rate; + + if (do_header_rd_based_gating) { + assert(do_header_rd_based_breakout != NULL); + const int palette_mode_rate = intra_mode_info_cost_y( + cpi, x, mbmi, bsize, dc_mode_cost, discount_color_cost); + const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0); + // Less aggressive pruning when prune_luma_palette_size_search_level == 1. + const int header_rd_shift = + (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0; + // Terminate further palette_size search, if the header cost corresponding + // to lower palette_size is more than *best_rd << header_rd_shift. This + // logic is implemented with a right shift in the LHS to prevent a possible + // overflow with the left shift in RHS. 
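An editor's note making the overflow remark above concrete: the comparison below is the right-shifted form of header_rd > (*best_rd << header_rd_shift).

  // Illustrative: with header_rd_shift == 1 and *best_rd already close to
  // INT64_MAX, evaluating (*best_rd << 1) would overflow, while
  // (header_rd >> 1) > *best_rd compares the same quantities (up to the
  // truncated low bit of header_rd) without widening *best_rd.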
+ if ((header_rd >> header_rd_shift) > *best_rd) { + *do_header_rd_based_breakout = true; + return; + } + av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, + *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) return; + this_rate = tokenonly_rd_stats.rate + palette_mode_rate; + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, + *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) return; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost, + discount_color_cost); + } + + int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) { + tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size); + } + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize, + this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); + if (this_rd < *best_rd) { + *best_rd = this_rd; + // Setting beat_best_rd flag because current mode rd is better than best_rd. + // This flag need to be updated only for palette evaluation in key frames + if (beat_best_rd) *beat_best_rd = 1; + memcpy(best_palette_color_map, color_map, + block_width * block_height * sizeof(color_map[0])); + *best_mbmi = *mbmi; + memcpy(blk_skip, x->txfm_search_info.blk_skip, + sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + if (rate) *rate = this_rate; + if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate; + if (distortion) *distortion = tokenonly_rd_stats.dist; + if (skippable) *skippable = tokenonly_rd_stats.skip_txfm; + if (beat_best_palette_rd) *beat_best_palette_rd = 1; + } +} + +static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) { + assert(step_size != 0); + return (step_size > 0) ? curr_idx >= end_idx : curr_idx <= end_idx; +} + +// Performs count-based palette search with number of colors in interval +// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can +// be less than start_n. Saves the last numbers searched in last_n_searched and +// returns the best number of colors found. 
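A standalone editor's sketch of the interval convention described above: the same while-loop shape walks palette sizes upward or downward depending on the sign of step_size (the driver values in main() are made up).

#include <stdio.h>

static int is_iter_over(int curr, int end, int step) {
  return (step > 0) ? curr >= end : curr <= end;
}

int main(void) {
  // start_n = 2, end_n = 9, step = 2: visits 2 4 6 8.
  for (int n = 2; !is_iter_over(n, 9, 2); n += 2) printf("%d ", n);
  printf("\n");
  // start_n = 8, end_n = 1, step = -2: visits 8 6 4 2.
  for (int n = 8; !is_iter_over(n, 1, -2); n += -2) printf("%d ", n);
  printf("\n");
  return 0;
}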
+static AOM_INLINE int perform_top_color_palette_search( + const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, + int16_t *top_colors, int start_n, int end_n, int step_size, + bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache, + int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, + int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, + uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx, + uint8_t *best_blk_skip, uint8_t *tx_type_map, int discount_color_cost) { + int16_t centroids[PALETTE_MAX_SIZE]; + int n = start_n; + int top_color_winner = end_n; + /* clang-format off */ + assert(IMPLIES(step_size < 0, start_n > end_n)); + /* clang-format on */ + assert(IMPLIES(step_size > 0, start_n < end_n)); + while (!is_iter_over(n, end_n, step_size)) { + int beat_best_palette_rd = 0; + bool do_header_rd_based_breakout = false; + memcpy(centroids, top_colors, n * sizeof(top_colors[0])); + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, + color_cache, n_cache, do_header_rd_based_gating, best_mbmi, + best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map, &beat_best_palette_rd, + &do_header_rd_based_breakout, discount_color_cost); + *last_n_searched = n; + if (do_header_rd_based_breakout) { + // Terminate palette_size search by setting last_n_searched to end_n. + *last_n_searched = end_n; + break; + } + if (beat_best_palette_rd) { + top_color_winner = n; + } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) { + // At search level 2, we return immediately if we don't see an improvement + return top_color_winner; + } + n += step_size; + } + return top_color_winner; +} + +// Performs k-means based palette search with number of colors in interval +// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can +// be less than start_n. Saves the last numbers searched in last_n_searched and +// returns the best number of colors found. 
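+// For each candidate size n, the centroids are seeded approximately at the
+// midpoints of n equal sub-intervals of [lower_bound, upper_bound] and then
+// refined with up to 50 iterations of av1_k_means() before the RD evaluation.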
+static AOM_INLINE int perform_k_means_palette_search( + const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int lower_bound, + int upper_bound, int start_n, int end_n, int step_size, + bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache, + int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, + int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, + uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx, + uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map, + int data_points, int discount_color_cost) { + int16_t centroids[PALETTE_MAX_SIZE]; + const int max_itr = 50; + int n = start_n; + int top_color_winner = end_n; + /* clang-format off */ + assert(IMPLIES(step_size < 0, start_n > end_n)); + /* clang-format on */ + assert(IMPLIES(step_size > 0, start_n < end_n)); + while (!is_iter_over(n, end_n, step_size)) { + int beat_best_palette_rd = 0; + bool do_header_rd_based_breakout = false; + for (int i = 0; i < n; ++i) { + centroids[i] = + lower_bound + (2 * i + 1) * (upper_bound - lower_bound) / n / 2; + } + av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr); + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, + color_cache, n_cache, do_header_rd_based_gating, best_mbmi, + best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map, &beat_best_palette_rd, + &do_header_rd_based_breakout, discount_color_cost); + *last_n_searched = n; + if (do_header_rd_based_breakout) { + // Terminate palette_size search by setting last_n_searched to end_n. + *last_n_searched = end_n; + break; + } + if (beat_best_palette_rd) { + top_color_winner = n; + } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) { + // At search level 2, we return immediately if we don't see an improvement + return top_color_winner; + } + n += step_size; + } + return top_color_winner; +} + +// Sets the parameters to search the current number of colors +- 1 +static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size, + int winner, int end_n) { + // Set min to winner - 1 unless we are already at the border, then we set it + // to winner + 1 + *min_n = (winner == PALETTE_MIN_SIZE) ? (PALETTE_MIN_SIZE + 1) + : AOMMAX(winner - 1, PALETTE_MIN_SIZE); + // Set max to winner + 1 unless we are already at the border, then we set it + // to winner - 1 + *max_n = + (winner == end_n) ? (winner - 1) : AOMMIN(winner + 1, PALETTE_MAX_SIZE); + + // Set the step size to max_n - min_n so we only search those two values. + // If max_n == min_n, then set step_size to 1 to avoid infinite loop later. 
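+  // For example, winner = 5 with end_n = 8 gives min_n = 4, max_n = 6 and
+  // step_size = 2, so only the two neighbors 4 and 6 are revisited.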
+ *step_size = AOMMAX(1, *max_n - *min_n); +} + +static AOM_INLINE void fill_data_and_get_bounds(const uint8_t *src, + const int src_stride, + const int rows, const int cols, + const int is_high_bitdepth, + int16_t *data, int *lower_bound, + int *upper_bound) { + if (is_high_bitdepth) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); + *lower_bound = *upper_bound = src_ptr[0]; + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int val = src_ptr[c]; + data[c] = (int16_t)val; + *lower_bound = AOMMIN(*lower_bound, val); + *upper_bound = AOMMAX(*upper_bound, val); + } + src_ptr += src_stride; + data += cols; + } + return; + } + + // low bit depth + *lower_bound = *upper_bound = src[0]; + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int val = src[c]; + data[c] = (int16_t)val; + *lower_bound = AOMMIN(*lower_bound, val); + *upper_bound = AOMMAX(*upper_bound, val); + } + src += src_stride; + data += cols; + } +} + +/*! \brief Colors are sorted by their count: the higher the better. + */ +struct ColorCount { + //! Color index in the histogram. + int index; + //! Histogram count. + int count; +}; + +int color_count_comp(const void *c1, const void *c2) { + const struct ColorCount *color_count1 = (const struct ColorCount *)c1; + const struct ColorCount *color_count2 = (const struct ColorCount *)c2; + if (color_count1->count > color_count2->count) return -1; + if (color_count1->count < color_count2->count) return 1; + if (color_count1->index < color_count2->index) return -1; + return 1; +} + +static void find_top_colors(const int *const count_buf, int bit_depth, + int n_colors, int16_t *top_colors) { + // Top color array, serving as a priority queue if more than n_colors are + // found. + struct ColorCount top_color_counts[PALETTE_MAX_SIZE] = { { 0 } }; + int n_color_count = 0; + for (int i = 0; i < (1 << bit_depth); ++i) { + if (count_buf[i] > 0) { + if (n_color_count < n_colors) { + // Keep adding to the top colors. + top_color_counts[n_color_count].index = i; + top_color_counts[n_color_count].count = count_buf[i]; + ++n_color_count; + if (n_color_count == n_colors) { + qsort(top_color_counts, n_colors, sizeof(top_color_counts[0]), + color_count_comp); + } + } else { + // Check the worst in the sorted top. + if (count_buf[i] > top_color_counts[n_colors - 1].count) { + int j = n_colors - 1; + // Move up to the best one. 
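+          // (Insertion into the count-sorted array: entries with smaller
+          // counts shift down one slot and the new color takes slot j.)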
+ while (j >= 1 && count_buf[i] > top_color_counts[j - 1].count) --j; + memmove(top_color_counts + j + 1, top_color_counts + j, + (n_colors - j - 1) * sizeof(top_color_counts[0])); + top_color_counts[j].index = i; + top_color_counts[j].count = count_buf[i]; + } + } + } + } + assert(n_color_count == n_colors); + + for (int i = 0; i < n_colors; ++i) { + top_colors[i] = top_color_counts[i].index; + } +} + +void av1_rd_pick_palette_intra_sby( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost, + MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, + int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, + uint8_t *tx_type_map) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools, + bsize)); + assert(PALETTE_MAX_SIZE == 8); + assert(PALETTE_MIN_SIZE == 2); + + const int src_stride = x->plane[0].src.stride; + const uint8_t *const src = x->plane[0].src.buf; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + const SequenceHeader *const seq_params = cpi->common.seq_params; + const int is_hbd = seq_params->use_highbitdepth; + const int bit_depth = seq_params->bit_depth; + const int discount_color_cost = cpi->sf.rt_sf.use_nonrd_pick_mode; + int unused; + + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. + int colors, colors_threshold = 0; + if (is_hbd) { + int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path. + av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf, + count_buf_8bit, &colors_threshold, &colors); + } else { + av1_count_colors(src, src_stride, rows, cols, count_buf, &colors); + colors_threshold = colors; + } + + uint8_t *const color_map = xd->plane[0].color_index_map; + int color_thresh_palette = 64; + // Allow for larger color_threshold for palette search, based on color, + // scene_change, and block source variance. + // Since palette is Y based, only allow larger threshold if block + // color_dist is below threshold. + if (cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->sf.rt_sf.increase_color_thresh_palette && cpi->rc.high_source_sad && + x->source_variance > 50) { + int64_t norm_color_dist = 0; + if (x->color_sensitivity[0] || x->color_sensitivity[1]) { + norm_color_dist = x->min_dist_inter_uv >> + (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]); + if (x->color_sensitivity[0] && x->color_sensitivity[1]) + norm_color_dist = norm_color_dist >> 1; + } + if (norm_color_dist < 8000) color_thresh_palette += 20; + } + if (colors_threshold > 1 && colors_threshold <= color_thresh_palette) { + int16_t *const data = x->palette_buffer->kmeans_data_buf; + int16_t centroids[PALETTE_MAX_SIZE]; + int lower_bound, upper_bound; + fill_data_and_get_bounds(src, src_stride, rows, cols, is_hbd, data, + &lower_bound, &upper_bound); + + mbmi->mode = DC_PRED; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + + // Find the dominant colors, stored in top_colors[]. 
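+    // At most AOMMIN(colors, PALETTE_MAX_SIZE) of the most frequent sample
+    // values are kept, ordered by decreasing histogram count.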
+ int16_t top_colors[PALETTE_MAX_SIZE] = { 0 }; + find_top_colors(count_buf, bit_depth, AOMMIN(colors, PALETTE_MAX_SIZE), + top_colors); + + // The following are the approaches used for header rdcost based gating + // for early termination for different values of prune_palette_search_level. + // 0: Pruning based on header rdcost for ascending order palette_size + // search. + // 1: When colors > PALETTE_MIN_SIZE, enabled only for coarse palette_size + // search and for finer search do_header_rd_based_gating parameter is + // explicitly passed as 'false'. + // 2: Enabled only for ascending order palette_size search and for + // descending order search do_header_rd_based_gating parameter is explicitly + // passed as 'false'. + const bool do_header_rd_based_gating = + cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0; + + // TODO(huisu@google.com): Try to avoid duplicate computation in cases + // where the dominant colors and the k-means results are similar. + if ((cpi->sf.intra_sf.prune_palette_search_level == 1) && + (colors > PALETTE_MIN_SIZE)) { + // Start index and step size below are chosen to evaluate unique + // candidates in neighbor search, in case a winner candidate is found in + // coarse search. Example, + // 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step + // size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8. + // If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2 + // (3) and 8 (7). + // 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same + // as for 8 colors) then step size should also be 2, to cover all + // candidates. Coarse search will evaluate 2, 4 and 6. If winner is either + // 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3, + // coarse search will evaluate 3 and 6. For the winner, unique neighbors + // (3: 2,4 or 6: 5,7) would be evaluated. 
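+      // The lookup tables below encode these choices: indexed by max_n (the
+      // color count capped at PALETTE_MAX_SIZE), they give the start index
+      // and step size used for the coarse search.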
+ + // Start index for coarse palette search for dominant colors and k-means + const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0, + 3, 3, 2, + 3, 3, 2 }; + // Step size for coarse palette search for dominant colors and k-means + const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0, + 3, 3, 3, + 3, 3, 3 }; + + // Choose the start index and step size for coarse search based on number + // of colors + const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE); + const int min_n = start_n_lookup_table[max_n]; + const int step_size = step_size_lookup_table[max_n]; + assert(min_n >= PALETTE_MIN_SIZE); + // Perform top color coarse palette search to find the winner candidate + const int top_color_winner = perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1, + step_size, do_header_rd_based_gating, &unused, color_cache, n_cache, + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + discount_color_cost); + // Evaluate neighbors for the winner color (if winner is found) in the + // above coarse search for dominant colors + if (top_color_winner <= max_n) { + int stage2_min_n, stage2_max_n, stage2_step_size; + set_stage2_params(&stage2_min_n, &stage2_max_n, &stage2_step_size, + top_color_winner, max_n); + // perform finer search for the winner candidate + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n, + stage2_max_n + 1, stage2_step_size, + /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache, + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map, discount_color_cost); + } + // K-means clustering. 
+ // Perform k-means coarse palette search to find the winner candidate + const int k_means_winner = perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, + min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused, + color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols, + discount_color_cost); + // Evaluate neighbors for the winner color (if winner is found) in the + // above coarse search for k-means + if (k_means_winner <= max_n) { + int start_n_stage2, end_n_stage2, step_size_stage2; + set_stage2_params(&start_n_stage2, &end_n_stage2, &step_size_stage2, + k_means_winner, max_n); + // perform finer search for the winner candidate + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, + start_n_stage2, end_n_stage2 + 1, step_size_stage2, + /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache, + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map, color_map, rows * cols, discount_color_cost); + } + } else { + const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE), + min_n = PALETTE_MIN_SIZE; + // Perform top color palette search in ascending order + int last_n_searched = min_n; + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1, + 1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache, + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + discount_color_cost); + if (last_n_searched < max_n) { + // Search in descending order until we get to the previous best + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n, + last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused, + color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, discount_color_cost); + } + // K-means clustering. + if (colors == PALETTE_MIN_SIZE) { + // Special case: These colors automatically become the centroids. 
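+        // (With only two distinct sample values in the block, the minimum
+        // and maximum are used directly as the two palette entries.)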
+ assert(colors == 2); + centroids[0] = lower_bound; + centroids[1] = upper_bound; + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors, + color_cache, n_cache, /*do_header_rd_based_gating=*/false, + best_mbmi, best_palette_color_map, best_rd, rate, + rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, NULL, NULL, + discount_color_cost); + } else { + // Perform k-means palette search in ascending order + last_n_searched = min_n; + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, + min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched, + color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols, + discount_color_cost); + if (last_n_searched < max_n) { + // Search in descending order until we get to the previous best + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, + max_n, last_n_searched, -1, /*do_header_rd_based_gating=*/false, + &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, + rows * cols, discount_color_cost); + } + } + } + } + + if (best_mbmi->palette_mode_info.palette_size[0] > 0) { + memcpy(color_map, best_palette_color_map, + block_width * block_height * sizeof(best_palette_color_map[0])); + // Gather the stats to determine whether to use screen content tools in + // function av1_determine_sc_tools_with_encoding(). + x->palette_pixels += (block_width * block_height); + } + *mbmi = *best_mbmi; +} + +void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x, + int dc_mode_cost, + uint8_t *best_palette_color_map, + MB_MODE_INFO *const best_mbmi, + int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, + uint8_t *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->bsize)); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const BLOCK_SIZE bsize = mbmi->bsize; + const SequenceHeader *const seq_params = cpi->common.seq_params; + int this_rate; + int64_t this_rd; + int colors_u, colors_v; + int colors_threshold_u = 0, colors_threshold_v = 0, colors_threshold = 0; + const int src_stride = x->plane[1].src.stride; + const uint8_t *const src_u = x->plane[1].src.buf; + const uint8_t *const src_v = x->plane[2].src.buf; + uint8_t *const color_map = xd->plane[1].color_index_map; + RD_STATS tokenonly_rd_stats; + int plane_block_width, plane_block_height, rows, cols; + av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, + &plane_block_height, &rows, &cols); + + mbmi->uv_mode = UV_DC_PRED; + if (seq_params->use_highbitdepth) { + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. + int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path. 
+ av1_count_colors_highbd(src_u, src_stride, rows, cols, + seq_params->bit_depth, count_buf, count_buf_8bit, + &colors_threshold_u, &colors_u); + av1_count_colors_highbd(src_v, src_stride, rows, cols, + seq_params->bit_depth, count_buf, count_buf_8bit, + &colors_threshold_v, &colors_v); + } else { + int count_buf[1 << 8]; + av1_count_colors(src_u, src_stride, rows, cols, count_buf, &colors_u); + av1_count_colors(src_v, src_stride, rows, cols, count_buf, &colors_v); + colors_threshold_u = colors_u; + colors_threshold_v = colors_v; + } + + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + + colors_threshold = colors_threshold_u > colors_threshold_v + ? colors_threshold_u + : colors_threshold_v; + if (colors_threshold > 1 && colors_threshold <= 64) { + int r, c, n, i, j; + const int max_itr = 50; + int lb_u, ub_u, val_u; + int lb_v, ub_v, val_v; + int16_t *const data = x->palette_buffer->kmeans_data_buf; + int16_t centroids[2 * PALETTE_MAX_SIZE]; + + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u); + uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v); + if (seq_params->use_highbitdepth) { + lb_u = src_u16[0]; + ub_u = src_u16[0]; + lb_v = src_v16[0]; + ub_v = src_v16[0]; + } else { + lb_u = src_u[0]; + ub_u = src_u[0]; + lb_v = src_v[0]; + ub_v = src_v[0]; + } + + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + if (seq_params->use_highbitdepth) { + val_u = src_u16[r * src_stride + c]; + val_v = src_v16[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; + } else { + val_u = src_u[r * src_stride + c]; + val_v = src_v[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; + } + if (val_u < lb_u) + lb_u = val_u; + else if (val_u > ub_u) + ub_u = val_u; + if (val_v < lb_v) + lb_v = val_v; + else if (val_v > ub_v) + ub_v = val_v; + } + } + + const int colors = colors_u > colors_v ? colors_u : colors_v; + const int max_colors = + colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; + for (n = PALETTE_MIN_SIZE; n <= max_colors; ++n) { + for (i = 0; i < n; ++i) { + centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2; + centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2; + } + av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); + optimize_palette_colors(color_cache, n_cache, n, 2, centroids, + cpi->common.seq_params->bit_depth); + // Sort the U channel colors in ascending order. 
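+      // (A simple selection sort over the interleaved (U, V) centroid pairs,
+      // keyed on the U component; each V value moves with its paired U.)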
+ for (i = 0; i < 2 * (n - 1); i += 2) { + int min_idx = i; + int min_val = centroids[i]; + for (j = i + 2; j < 2 * n; j += 2) + if (centroids[j] < min_val) min_val = centroids[j], min_idx = j; + if (min_idx != i) { + int temp_u = centroids[i], temp_v = centroids[i + 1]; + centroids[i] = centroids[min_idx]; + centroids[i + 1] = centroids[min_idx + 1]; + centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v; + } + } + av1_calc_indices(data, centroids, color_map, rows * cols, n, 2); + extend_palette_color_map(color_map, cols, rows, plane_block_width, + plane_block_height); + pmi->palette_size[1] = n; + for (i = 1; i < 3; ++i) { + for (j = 0; j < n; ++j) { + if (seq_params->use_highbitdepth) + pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd( + (int)centroids[j * 2 + i - 1], seq_params->bit_depth); + else + pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = + clip_pixel((int)centroids[j * 2 + i - 1]); + } + } + + if (cpi->sf.intra_sf.early_term_chroma_palette_size_search) { + const int palette_mode_rate = + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost); + const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0); + // Terminate further palette_size search, if header cost corresponding + // to lower palette_size is more than the best_rd. + if (header_rd >= *best_rd) break; + av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) continue; + this_rate = tokenonly_rd_stats.rate + palette_mode_rate; + } else { + av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) continue; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost); + } + + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + if (this_rd < *best_rd) { + *best_rd = this_rd; + *best_mbmi = *mbmi; + memcpy(best_palette_color_map, color_map, + plane_block_width * plane_block_height * + sizeof(best_palette_color_map[0])); + *rate = this_rate; + *distortion = tokenonly_rd_stats.dist; + *rate_tokenonly = tokenonly_rd_stats.rate; + *skippable = tokenonly_rd_stats.skip_txfm; + } + } + } + if (best_mbmi->palette_mode_info.palette_size[1] > 0) { + memcpy(color_map, best_palette_color_map, + plane_block_width * plane_block_height * + sizeof(best_palette_color_map[0])); + } +} + +void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const BLOCK_SIZE bsize = mbmi->bsize; + int src_stride = x->plane[1].src.stride; + const uint8_t *const src_u = x->plane[1].src.buf; + const uint8_t *const src_v = x->plane[2].src.buf; + int16_t *const data = x->palette_buffer->kmeans_data_buf; + int16_t centroids[2 * PALETTE_MAX_SIZE]; + uint8_t *const color_map = xd->plane[1].color_index_map; + int r, c; + const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u); + const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v); + int plane_block_width, plane_block_height, rows, cols; + av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, + &plane_block_height, &rows, &cols); + + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + if (cpi->common.seq_params->use_highbitdepth) { + data[(r * cols + c) * 2] = src_u16[r * src_stride + c]; + data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c]; + } else { + data[(r * cols + c) * 2] = src_u[r * src_stride + c]; + data[(r * cols + c) * 2 + 1] = src_v[r * 
src_stride + c]; + } + } + } + + for (r = 1; r < 3; ++r) { + for (c = 0; c < pmi->palette_size[1]; ++c) { + centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c]; + } + } + + av1_calc_indices(data, centroids, color_map, rows * cols, + pmi->palette_size[1], 2); + extend_palette_color_map(color_map, cols, rows, plane_block_width, + plane_block_height); +} diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h new file mode 100644 index 0000000000..7da863a0cc --- /dev/null +++ b/third_party/aom/av1/encoder/palette.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Declares functions used in palette search. + */ +#ifndef AOM_AV1_ENCODER_PALETTE_H_ +#define AOM_AV1_ENCODER_PALETTE_H_ + +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct PICK_MODE_CONTEXT; +struct macroblock; + +/*!\cond */ +#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c + +void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int16_t *data, int16_t *centroids, + uint8_t *indices, int n, int k, + int max_itr); +void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int16_t *data, int16_t *centroids, + uint8_t *indices, int n, int k, + int max_itr); +/*!\endcond */ + +/*!\brief Calculates the cluster to which each data point belong. + * + * \ingroup palette_mode_search + * \param[in] data The data points whose cluster indices are + * to be computed. The data layout is + * NUM_DATA_POINTS X DATA_DIM. + * \param[in] centroids Pointer to the centroids. The data layout + * is NUM_CENTROIDS X DATA_DIM. + * \param[in] indices Pointer to store the computed indices. + * \param[in] n Number of data points. + * \param[in] k Number of clusters. + * \param[in] dim Data dimension. + * + * \remark Returns nothing, but saves each data's cluster index in \a indices. + */ +static INLINE void av1_calc_indices(const int16_t *data, + const int16_t *centroids, uint8_t *indices, + int n, int k, int dim) { + assert(n > 0); + assert(k > 0); + if (dim == 1) { + av1_calc_indices_dim1(data, centroids, indices, /*total_dist=*/NULL, n, k); + } else if (dim == 2) { + av1_calc_indices_dim2(data, centroids, indices, /*total_dist=*/NULL, n, k); + } else { + assert(0 && "Untemplated k means dimension"); + } +} + +/*!\brief Performs k-means cluster on the data. + * + * \ingroup palette_mode_search + * \param[in] data The data points to be clustered. The data + * layout is NUM_DATA_POINTS X DATA_DIM. + * \param[in] centroids Pointer to store the computed centroids. + * The data layout is + * NUM_CENTROIDS X DATA_DIM. + * \param[in] indices Pointer to store the computed indices. For + * each training data. + * \param[in] n Number of data points. + * \param[in] k Number of clusters. + * \param[in] dim Data dimension. + * \param[in] max_itr Maximum number of iterations to run. + * + * \remark Returns nothing, but saves each cluster's centroid in centroids and + * each data's cluster index in \a indices. 
+ * + * \attention The output centroids are rounded off to nearest integers. + */ +static INLINE void av1_k_means(const int16_t *data, int16_t *centroids, + uint8_t *indices, int n, int k, int dim, + int max_itr) { + assert(n > 0); + assert(k > 0); + if (dim == 1) { + AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr); + } else if (dim == 2) { + AV1_K_MEANS_RENAME(av1_k_means, 2)(data, centroids, indices, n, k, max_itr); + } else { + assert(0 && "Untemplated k means dimension"); + } +} + +/*!\brief Removes duplicated centroid indices. + * + * \ingroup palette_mode_search + * \param[in] centroids A list of centroids index. + * \param[in] num_centroids Number of centroids. + * + * \return Returns the number of unique centroids and saves the unique centroids + * in beginning of the centroids array. + * + * \attention The centroids should be rounded to integers before calling this + * method. + */ +int av1_remove_duplicates(int16_t *centroids, int num_centroids); + +/*!\brief Checks what colors are in the color cache. + * + * \ingroup palette_mode_search + * \param[in] color_cache A cache of colors. + * \param[in] n_cache Number of colors in the cache. + * \param[in] colors New base colors. + * \param[in] n_colors Number of new colors. + * \param[in] cache_color_found Stores what cached colors are presented in + * colors. + * \param[in] out_cache_colors Stores what colors are not in the cache. + * + * \return Returns the number of colors that are not in cache. In addition, + * records whether each cache color is presented in colors in cache_color_found, + * and stores and stores the out of cache colors in out_cache_colors. + */ +int av1_index_color_cache(const uint16_t *color_cache, int n_cache, + const uint16_t *colors, int n_colors, + uint8_t *cache_color_found, int *out_cache_colors); + +/*!\brief Gets the rate cost for each delta-encoding v palette. + * + * \ingroup palette_mode_search + * \param[in] pmi Struct that stores the palette mode info. + * \param[in] bit_depth Pixel bitdepth of the sequence. + * \param[in] zero_count Stores the number of zero deltas. + * \param[in] min_bits Minimum bits for the deltas. Sets to + * bit_depth - 4. + * + * \return Returns the number of bits used to transmit each v palette color + * delta and assigns zero_count with the number of deltas being 0. + */ +int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *zero_count, int *min_bits); + +/*!\brief Gets the rate cost for transmitting luma palette color values. + * + * \ingroup palette_mode_search + * \param[in] pmi Struct that stores the palette mode info. + * \param[in] color_cache Color cache presented at the decoder. + * \param[in] n_cache Number of colors in the cache. + * \param[in] bit_depth Pixel bitdepth of the sequence. + * + * \return Returns the rate needed to transmit the palette. Note that this does + * not include the cost of transmitted the color map. + */ +int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, + const uint16_t *color_cache, int n_cache, + int bit_depth); + +/*!\brief Gets the rate cost for transmitting luma palette chroma values. + * + * \ingroup palette_mode_search + * \param[in] pmi Struct that stores the palette mode info. + * \param[in] color_cache Color cache presented at the decoder. + * \param[in] n_cache Number of colors in the cache. + * \param[in] bit_depth Pixel bitdepth of the sequence. + * + * \return Returns the rate needed to transmit the palette. 
Note that this does + * not include the cost of transmitted the color map. + */ +int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, + const uint16_t *color_cache, int n_cache, + int bit_depth); + +/*!\brief Search for the best palette in the luma plane. + * + * \ingroup palette_mode_search + * \callergraph + * This function is used in both inter and intra frame coding. + */ +void av1_rd_pick_palette_intra_sby( + const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize, + int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, + int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, + uint8_t *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx, + uint8_t *best_blk_skip, uint8_t *tx_type_map); + +/*!\brief Search for the best palette in the chroma plane. + * + * \ingroup palette_mode_search + * \callergraph + * This function is used in both inter and intra frame coding. + */ +void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi, + struct macroblock *x, int dc_mode_cost, + uint8_t *best_palette_color_map, + MB_MODE_INFO *const best_mbmi, + int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, + uint8_t *skippable); + +/*!\brief Resets palette color map for chroma channels. + */ +void av1_restore_uv_color_map(const struct AV1_COMP *cpi, struct macroblock *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PALETTE_H_ diff --git a/third_party/aom/av1/encoder/partition_cnn_weights.h b/third_party/aom/av1/encoder/partition_cnn_weights.h new file mode 100644 index 0000000000..504038c63a --- /dev/null +++ b/third_party/aom/av1/encoder/partition_cnn_weights.h @@ -0,0 +1,2139 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ +#define AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/cnn.h" +#include "av1/encoder/ml.h" + +#define CNN_BRANCH_0_OUT_CH 20 +#define CNN_BRANCH_1_OUT_CH 4 +#define CNN_BRANCH_2_OUT_CH 20 +#define CNN_BRANCH_3_OUT_CH 20 +#define CNN_TOT_OUT_CH \ + (((CNN_BRANCH_0_OUT_CH) + (CNN_BRANCH_1_OUT_CH) + (CNN_BRANCH_2_OUT_CH) + \ + (CNN_BRANCH_3_OUT_CH))) +#define CNN_BRANCH_0_OUT_SIZE (CNN_BRANCH_0_OUT_CH) +#define CNN_BRANCH_1_OUT_SIZE ((CNN_BRANCH_1_OUT_CH)*2 * 2) +#define CNN_BRANCH_2_OUT_SIZE ((CNN_BRANCH_2_OUT_CH)*4 * 4) +#define CNN_BRANCH_3_OUT_SIZE ((CNN_BRANCH_3_OUT_CH)*8 * 8) +#define CNN_OUT_BUF_SIZE \ + (((CNN_BRANCH_0_OUT_SIZE) + (CNN_BRANCH_1_OUT_SIZE) + \ + (CNN_BRANCH_2_OUT_SIZE) + (CNN_BRANCH_3_OUT_SIZE))) + +#define NUM_DNN_BRANCHES 4 +#define NUM_CNN_LAYERS 5 +#define BRANCH_0_NUM_DNN_LAYERS 2 +#define BRANCH_1_NUM_DNN_LAYERS 2 +#define BRANCH_2_NUM_DNN_LAYERS 2 +#define BRANCH_3_NUM_DNN_LAYERS 2 +#define CNN_LAYER_0_HEIGHT 5 +#define CNN_LAYER_0_WIDTH 5 +#define CNN_LAYER_0_IN_CH 1 +#define CNN_LAYER_0_OUT_CH 20 +#define CNN_LAYER_0_HORZ_STRIDE 4 +#define CNN_LAYER_0_VERT_STRIDE 4 +#define CNN_LAYER_1_HEIGHT 2 +#define CNN_LAYER_1_WIDTH 2 +#define CNN_LAYER_1_IN_CH 20 +#define CNN_LAYER_1_OUT_CH 20 +#define CNN_LAYER_1_HORZ_STRIDE 2 +#define CNN_LAYER_1_VERT_STRIDE 2 +#define CNN_LAYER_2_HEIGHT 2 +#define CNN_LAYER_2_WIDTH 2 +#define CNN_LAYER_2_IN_CH 20 +#define CNN_LAYER_2_OUT_CH 20 +#define CNN_LAYER_2_HORZ_STRIDE 2 +#define CNN_LAYER_2_VERT_STRIDE 2 +#define CNN_LAYER_3_HEIGHT 2 +#define CNN_LAYER_3_WIDTH 2 +#define CNN_LAYER_3_IN_CH 20 +#define CNN_LAYER_3_OUT_CH 4 +#define CNN_LAYER_3_HORZ_STRIDE 2 +#define CNN_LAYER_3_VERT_STRIDE 2 +#define CNN_LAYER_4_HEIGHT 2 +#define CNN_LAYER_4_WIDTH 2 +#define CNN_LAYER_4_IN_CH 4 +#define CNN_LAYER_4_OUT_CH 20 +#define CNN_LAYER_4_HORZ_STRIDE 2 +#define CNN_LAYER_4_VERT_STRIDE 2 +#define BRANCH_0_NUM_DNN_FEATURES 37 +#define BRANCH_0_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_0_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_0_NUM_LOGITS 1 +#define BRANCH_1_NUM_DNN_FEATURES 25 +#define BRANCH_1_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_1_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_1_NUM_LOGITS 1 +#define BRANCH_2_NUM_DNN_FEATURES 25 +#define BRANCH_2_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_2_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_2_NUM_LOGITS 1 +#define BRANCH_3_NUM_DNN_FEATURES 41 +#define BRANCH_3_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_3_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_3_NUM_LOGITS 1 + +static const float av1_intra_mode_cnn_partition_cnn_layer_0_kernel[] = { + 0.131894f, -0.593536f, -0.212935f, -0.00220011f, -0.396949f, + 0.287753f, -0.91875f, -0.0095057f, 0.804197f, -0.395239f, + 0.516604f, 1.16439f, 0.445784f, -0.163349f, 0.746488f, + -0.33891f, -0.562652f, 0.481403f, 0.755378f, -0.200753f, + 0.0784307f, 0.105657f, 0.0205673f, -0.524089f, -0.476146f, + -0.161206f, -0.65079f, 0.137474f, 0.28584f, 0.508768f, + -0.643386f, 0.227068f, -0.899507f, -0.413382f, 0.631466f, + 0.398203f, -0.544392f, 0.825155f, 0.671847f, -0.249779f, + 0.323121f, 0.125357f, -0.719564f, -0.0714854f, -0.168472f, + -0.213246f, -0.674525f, 0.330148f, -0.138414f, 0.20462f, + -0.518571f, -0.15091f, -0.605116f, -0.448732f, -0.475599f, + 0.738f, -0.328526f, 0.755035f, 0.969414f, -0.321039f, + -0.23068f, 0.408567f, -0.377813f, -0.273974f, 1.0684f, + 0.373968f, -0.450305f, 0.439258f, -0.381846f, -0.267331f, + 0.30613f, -0.39369f, 0.622438f, 
-0.52877f, -0.334991f, + 0.263193f, -0.402121f, 0.64142f, 0.793048f, -0.0231174f, + -0.68474f, -0.293338f, -0.737511f, -0.462654f, 0.474629f, + 0.141397f, -0.152529f, 0.345879f, -0.499991f, 0.00174024f, + 0.337387f, -0.131151f, 0.427385f, -0.457449f, -0.879614f, + -0.425908f, -0.263172f, 0.0344974f, 1.07861f, -0.00416662f, + 0.0208952f, 0.233905f, 0.765965f, 0.0423685f, -0.117554f, + -0.248237f, 0.49848f, -0.845131f, 0.223648f, -0.838709f, + 0.5834f, 0.309956f, -0.0625093f, -0.619619f, 0.918957f, + 0.358271f, -0.668459f, 0.518783f, -0.418963f, -0.206788f, + 0.364983f, -0.0396087f, 0.624309f, -0.138679f, -0.142453f, + 0.28309f, 0.895092f, -0.215713f, 0.439025f, 0.659333f, + -0.366025f, -0.413518f, 0.66657f, -0.265919f, 0.473471f, + -1.0729f, -0.526702f, 0.2838f, 0.367648f, -0.61242f, + 0.121656f, 0.547727f, -0.0636793f, -0.33006f, -0.306604f, + -0.00897731f, 0.688242f, 0.0944626f, 0.321508f, 0.0437392f, + -0.560035f, -0.768334f, 0.0571051f, -0.0427601f, -0.0437806f, + -0.816209f, -0.395829f, 0.293733f, 0.217645f, -0.646428f, + 0.132448f, -0.435806f, -0.0556814f, 0.0218857f, 0.348525f, + -0.17296f, 0.669057f, 0.638604f, -0.0995596f, -0.024099f, + -0.262332f, -0.548975f, 0.357894f, 0.43873f, -0.688234f, + -0.425519f, 0.190986f, -0.074778f, 0.294232f, -0.548969f, + -0.731198f, 0.03616f, -0.475969f, -0.306075f, -0.111929f, + -0.234146f, 0.612669f, 0.882254f, -0.622893f, 0.262431f, + 0.465242f, 0.245384f, -0.811016f, 0.501798f, -0.925875f, + 0.264373f, 0.307766f, -0.26872f, 0.113027f, -0.158875f, + 0.0711483f, 0.220275f, -0.0699022f, -0.0111303f, -0.435384f, + -0.720014f, 0.593484f, -0.964082f, 0.750925f, 0.252433f, + 0.964332f, -0.256904f, -0.421715f, -0.403851f, -0.188081f, + 0.694014f, -1.00183f, 0.798921f, 0.0603123f, 0.213814f, + 0.739642f, -0.0203375f, 0.72569f, -0.260224f, 0.0199516f, + -0.322451f, 0.318204f, -0.38392f, 0.740994f, -0.265215f, + -0.54541f, -0.51479f, -0.458397f, 0.519564f, 0.0509182f, + 0.0363331f, -0.293051f, 0.317714f, -0.327488f, -0.0840401f, + 0.318437f, -0.619403f, 0.641094f, -0.288435f, -0.260185f, + 0.181083f, -0.169294f, 0.292645f, 0.140405f, 0.0572885f, + -0.637428f, -0.102616f, 0.288955f, 0.817314f, 0.116855f, + 0.635532f, 0.283334f, -0.236391f, -0.305035f, -0.217365f, + -0.033021f, -0.455858f, 0.439922f, -0.104039f, 0.373376f, + 0.310659f, 0.388789f, 0.266341f, 0.0746306f, -0.428192f, + -0.202695f, -0.347625f, 0.00585741f, 0.366203f, 0.221413f, + 0.518856f, 0.57245f, -0.375071f, -0.2436f, -0.511895f, + -1.03708f, 0.681455f, -0.111544f, -0.183563f, 0.109729f, + -0.422646f, -0.529777f, 0.747473f, -0.270223f, -0.11435f, + 0.378931f, 0.420456f, 0.236331f, 0.49261f, -0.0666801f, + 0.0475846f, 0.906095f, -0.4146f, -0.020588f, -0.653285f, + 0.135335f, 0.543846f, -0.309061f, 0.11899f, -0.639168f, + -0.719994f, -0.219706f, -0.645631f, -0.829049f, -0.0114746f, + 0.834604f, 0.0378035f, 0.107957f, 0.546929f, -0.674395f, + -0.854817f, -1.1443f, 0.223413f, -0.326324f, 0.440971f, + 0.383582f, -0.495084f, 0.280091f, -0.53116f, 0.0333923f, + -0.354339f, -0.0449156f, -0.538896f, -0.753355f, 0.463995f, + 0.000969967f, -0.2832f, 0.587276f, 0.853094f, -0.481985f, + -0.138202f, 0.180989f, -0.349044f, -0.417534f, 0.455591f, + 0.287332f, 0.251496f, 0.381416f, 0.339632f, -0.0825727f, + 0.352739f, 0.161697f, -0.319764f, -0.258015f, 0.668833f, + -0.553303f, -0.578815f, -0.3758f, 0.289f, 0.247368f, + 0.00681103f, 0.421092f, -0.191033f, -0.425868f, -0.1239f, + 0.0540422f, -0.0856856f, 0.481168f, -0.0283741f, -0.196018f, + 0.230923f, -0.145288f, 0.52188f, 0.00628462f, -0.604556f, + 
-0.562879f, 0.319282f, 0.323799f, 0.453941f, 0.271129f, + -0.0520196f, 0.684571f, -0.391779f, -0.404614f, 0.134097f, + -0.825482f, 0.0913949f, 0.483543f, 0.159084f, 0.301637f, + 0.427013f, 0.196153f, 0.460091f, -0.730573f, -0.12278f, + 0.221665f, 0.674622f, -0.623363f, -0.0761517f, 0.637979f, + -0.468498f, 0.527276f, -0.596894f, -0.34675f, -0.251241f, + 0.418533f, -0.476696f, -0.901267f, -0.0088241f, -0.12421f, + -0.660316f, -0.0222117f, -0.470898f, -1.10739f, -0.441645f, + 0.39516f, -0.0117906f, 0.254122f, 0.00722599f, -1.00697f, + 0.48908f, -0.122287f, -0.378608f, -0.339145f, 0.682463f, + 0.305606f, 0.453628f, -0.49923f, -0.791388f, -0.202515f, + 0.23214f, -0.434209f, -0.778283f, -0.538015f, 0.145769f, + 0.446281f, -0.339329f, -0.198478f, -0.183717f, -0.855441f, + -0.105778f, 0.575067f, -0.18592f, -0.348094f, 0.740614f, + 0.041549f, -0.109663f, 0.0434492f, 0.245242f, -1.22192f, + 0.685896f, -0.208115f, -0.0616216f, -1.00552f, 0.31045f, + -0.184394f, 0.466705f, -0.0984364f, -0.506252f, 0.144874f, + 0.357038f, 0.675221f, -0.822171f, -0.52729f, 0.991212f, + 0.432422f, 0.383493f, -0.372395f, 0.35651f, -0.25369f, + 0.660208f, -0.117745f, -0.142433f, -0.724115f, -1.0035f, + -0.59178f, 0.563444f, -0.282531f, -0.599989f, 0.507424f, + -0.782875f, 0.755029f, -0.754962f, -0.617825f, 0.565984f, + -0.826878f, -0.456563f, 0.0212161f, 0.469867f, -0.144864f, + 0.225748f, -0.279029f, 0.21052f, -0.440183f, 0.936069f, + 0.170595f, 0.40966f, 0.452453f, -0.576006f, 1.50696f, + 0.649049f, 0.094957f, -0.167706f, -0.258342f, 0.59269f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_0_bias[] = { + 0.00475215f, -0.00362332f, -0.00317542f, 0.190083f, 0.0488147f, + -0.0268093f, -0.00432231f, 0.0112229f, 0.0626653f, -0.0025698f, + 0.0018675f, -0.00368139f, -0.00159125f, -0.00034354f, 0.311437f, + 0.000136436f, 0.0667295f, 0.0251274f, 0.00226553f, -0.000638344f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_1_kernel[] = { + 0.228403f, 0.241933f, 0.181079f, 0.101728f, 0.278455f, + -0.222078f, 0.387578f, 0.0847356f, -0.0737012f, 0.26518f, + -1.0817f, 0.0404161f, -0.805199f, 0.336576f, -0.541494f, + 0.246264f, 0.116597f, -0.756804f, -0.914136f, 0.410265f, + 0.413294f, 0.07873f, 0.450017f, -0.264346f, 0.549095f, + 1.03755f, -0.203542f, 1.61018f, 0.374131f, 0.402515f, + -2.36115f, 0.116427f, -0.172157f, -0.231482f, -0.905736f, + -0.0183059f, -0.575746f, 0.110348f, -0.268018f, 0.140399f, + 0.427196f, 0.0718528f, 0.247936f, -0.326661f, 0.150404f, + -0.659979f, -0.157148f, 0.00826241f, -0.679275f, -0.131564f, + -1.04822f, 1.06039f, -0.207898f, 0.510167f, 0.484233f, + 0.138972f, -0.0801639f, -0.184416f, 0.0741107f, -0.0299281f, + 0.112263f, 0.380071f, -0.0185269f, -0.0821188f, 0.918796f, + -0.576106f, 0.593007f, 0.479446f, 0.0440703f, 0.322379f, + 0.176783f, -0.147111f, 0.0953247f, -0.636377f, 0.0702104f, + 0.130979f, 0.293892f, -0.0112124f, -0.040347f, -0.16034f, + 0.3252f, -0.586802f, 0.601786f, -0.487148f, -0.458777f, + 0.463835f, 0.144942f, 0.00339965f, -0.779966f, 0.0585298f, + -1.20758f, -0.275614f, 0.292346f, -0.132781f, 0.337892f, + -0.357677f, 1.48511f, 0.172907f, -0.148668f, 0.243184f, + -0.503392f, -0.0791543f, 0.0265389f, -0.102267f, 0.213294f, + 0.0657801f, 0.156996f, 0.0891168f, 0.120805f, 0.261285f, + -0.343025f, -0.0792235f, -0.106415f, 0.133878f, -0.112981f, + -0.00151126f, -0.0643829f, 0.0458938f, -0.0452731f, -0.00147422f, + 0.1871f, -0.0208793f, 0.0752037f, 0.0794674f, 0.167666f, + 0.198028f, -0.361015f, -0.0661721f, -0.10672f, -0.0773641f, + -1.15856f, -0.516443f, 
-0.322702f, 0.15668f, 0.0075841f, + -0.157731f, 0.270926f, -0.241551f, 0.0169097f, -0.0263953f, + -0.303556f, -0.239237f, 0.117792f, -0.137871f, 0.122054f, + -0.587381f, 0.112938f, 0.0867262f, -0.27909f, -0.203622f, + -0.622195f, 0.42623f, 0.670704f, 0.190826f, -0.304979f, + -0.570075f, -0.240699f, 0.43744f, 0.632896f, -0.563846f, + -0.0160434f, -0.0709745f, 0.816662f, 0.269999f, -0.358734f, + 0.193644f, 1.19339f, -0.118223f, -0.363291f, -0.723616f, + -1.58825f, 0.0222856f, 0.769852f, 0.322713f, 0.0857619f, + -0.669756f, -1.08414f, 1.18593f, 0.486166f, -0.520646f, + 0.0861854f, -0.134197f, 0.258337f, 0.223345f, 0.697639f, + -0.57261f, 0.54031f, 0.892644f, 0.497572f, -0.287076f, + -1.95928f, -0.0568128f, -0.253335f, 0.00233392f, -0.192787f, + -0.115203f, -0.0975649f, 0.277954f, 0.000704534f, -0.315884f, + 0.309583f, 0.357458f, 0.0939298f, -0.072701f, 0.433045f, + -0.536938f, 0.534523f, 0.184585f, -0.0415175f, -0.120909f, + -1.2622f, 0.412449f, -0.114741f, 0.290453f, -0.441671f, + -0.0242497f, -0.20746f, 0.139019f, -0.422668f, -0.146732f, + -0.688828f, -0.00339426f, 0.04166f, 0.41755f, 0.405675f, + 0.562564f, 0.0216812f, 0.0271391f, 0.215227f, 0.328183f, + -1.6442f, -0.827838f, 0.115491f, 0.0951442f, -0.133779f, + -0.0482928f, 0.203177f, 0.322953f, -0.513259f, 0.0676788f, + -0.0877928f, 0.224448f, 0.451957f, 0.314243f, 0.307403f, + 0.35653f, 0.0286278f, 2.27554f, 0.569313f, -0.0488753f, + -2.48809f, 0.274555f, -0.248375f, -0.635634f, -0.187663f, + 0.1827f, -0.409634f, -0.0280568f, -0.207119f, -0.208192f, + -0.410268f, -0.017669f, 0.134856f, 0.434551f, 0.165201f, + 0.584608f, -0.389997f, -0.088713f, 0.118087f, 0.00210905f, + -1.07698f, -0.520967f, -0.198742f, 0.190255f, -0.162639f, + 0.0122759f, 0.460774f, -0.684633f, -0.149512f, 0.167556f, + -0.295034f, -0.0650964f, 0.0868653f, -0.691352f, 0.089795f, + 0.0620608f, 0.0531289f, 0.0124286f, 0.151921f, 1.51067f, + -0.10586f, -0.0311871f, 0.114706f, 0.0565205f, -0.159634f, + -0.423987f, -0.226896f, 0.0605352f, -0.36324f, -0.142205f, + -0.252249f, 0.0666312f, 0.316655f, 0.00687196f, 0.131079f, + -0.128281f, -0.293468f, 1.3327f, 0.542277f, -0.060088f, + -1.73475f, 0.0542297f, -0.227522f, -0.376004f, -0.147028f, + 0.0228252f, 0.0569538f, -0.0796497f, 0.0937596f, -0.0660153f, + -0.979219f, -0.377322f, 0.0523787f, 0.467299f, 0.0824278f, + 0.437147f, 0.263637f, 0.0325681f, 0.303581f, 0.353479f, + -0.142369f, -0.394797f, 0.597185f, 0.116482f, -0.0782593f, + 0.364539f, -0.30396f, 0.119016f, -0.0022429f, -0.044292f, + -0.0110531f, 0.233571f, 0.000975879f, 0.447332f, -0.0320396f, + 0.541609f, 0.14232f, 0.163905f, 0.848609f, 0.19954f, + -0.186591f, -0.44465f, -0.431672f, 0.159037f, -0.129977f, + -0.141778f, 0.246818f, -0.197539f, -0.70115f, 0.185449f, + 0.400274f, -0.0350744f, 0.239727f, -0.290504f, 0.0698443f, + -0.180374f, -0.759591f, -0.0569088f, -0.50246f, -0.0986616f, + -0.892114f, 0.306737f, -0.133937f, 0.285625f, 0.495471f, + -0.686222f, -0.168647f, -0.0926158f, 0.351772f, -0.0215394f, + 0.361223f, 0.0657142f, 0.268229f, -0.616299f, 0.0564718f, + -0.294013f, -0.588019f, 0.0234195f, -0.426863f, -0.511253f, + -0.72177f, 0.420903f, 0.0987506f, 0.309368f, 0.523532f, + 1.06073f, -0.33028f, 0.0818142f, 0.0130354f, 0.0180882f, + 0.0316898f, -0.416614f, -0.566344f, -0.163083f, 0.285085f, + -0.0534352f, 0.385496f, 0.151068f, -0.208295f, -0.175648f, + 0.0476705f, 0.190428f, -0.643391f, 0.484004f, -0.421836f, + -0.19829f, -0.227574f, -0.0869152f, 1.09881f, 0.345129f, + -0.236732f, -0.381935f, -1.46271f, 0.465914f, 0.610375f, + 0.689968f, -0.688546f, 1.95033f, 
0.420946f, 0.0282428f, + 0.147823f, 0.669393f, 0.429085f, -0.328385f, -0.150439f, + -0.419097f, -0.828102f, 0.248743f, 0.24644f, 0.0186131f, + -0.384319f, -0.126294f, -0.417067f, 0.271483f, -0.0128456f, + -0.881351f, 0.152581f, 0.185584f, -0.745827f, 0.0551359f, + 0.127083f, 0.936983f, -0.0225341f, 0.575861f, 0.767417f, + -0.140867f, -0.762518f, 0.422446f, -0.0611973f, 0.0515641f, + -0.144168f, -0.298882f, 0.308461f, 0.0208704f, 0.213872f, + -0.258708f, 1.13186f, 0.314083f, -0.347536f, -0.137768f, + 0.653953f, -0.217883f, -0.56112f, -0.864661f, 0.488836f, + 0.268133f, -0.548664f, -0.765226f, 0.117082f, 0.326798f, + -0.678246f, 0.477785f, -1.27584f, 0.198912f, -0.710395f, + 1.39096f, -0.411577f, -0.55119f, 0.51092f, -0.295023f, + 0.245983f, -0.0957192f, -0.312001f, 0.0175991f, 0.524423f, + -0.126379f, 0.124687f, -1.53945f, -0.342856f, 0.514072f, + 0.400884f, -0.00581101f, -0.219327f, 0.0977873f, 0.337551f, + -0.058603f, 0.20034f, 0.0429945f, 0.676803f, -0.273585f, + -0.173435f, -0.581596f, 0.226263f, -0.0946223f, -0.060088f, + -0.0100809f, -0.022242f, -0.22218f, -0.030463f, -0.141389f, + -0.190757f, -0.00526518f, -0.77519f, -0.0825695f, 0.308403f, + 0.262792f, -0.601842f, 0.0783697f, 0.197527f, 0.0714048f, + 0.0392629f, -0.388628f, 0.172541f, -0.0222009f, 0.252096f, + 0.0728652f, 0.173632f, 0.192914f, -0.00969965f, 0.0530136f, + -0.00765759f, 0.440234f, -0.0943323f, 0.112319f, 0.0878737f, + -0.739021f, 0.385305f, 0.133334f, -0.396697f, 0.177818f, + -0.0712558f, 0.516923f, 0.102174f, 0.17158f, -0.211068f, + 0.295795f, -0.36198f, 0.179087f, -0.845744f, -0.242514f, + -1.49073f, 0.272702f, 0.59011f, -0.408184f, -0.0731313f, + 0.234643f, 0.589642f, -0.100778f, 0.516921f, -0.700154f, + 0.316432f, 0.36117f, 0.0380282f, 0.480101f, -0.0975487f, + 0.941452f, 0.231705f, -0.151182f, -1.20305f, 0.28255f, + -0.0427662f, -0.00717175f, -0.842085f, -0.357376f, 0.545581f, + -0.290714f, 0.741498f, 1.00377f, 0.483864f, 0.150405f, + 0.0834512f, -0.10031f, 0.424054f, -0.0223491f, -0.0696701f, + -0.134479f, -0.747227f, 0.422208f, 0.123858f, -0.392624f, + -0.0299847f, -0.0376142f, -0.392536f, -0.0343114f, 0.298224f, + -0.375899f, 0.693119f, 0.27909f, -0.53463f, 0.105459f, + -0.0267383f, 0.5094f, -0.411557f, 0.451749f, -0.348479f, + -0.0497316f, -0.353913f, -0.14858f, 0.241838f, 0.331039f, + 0.756607f, -0.0701661f, -0.827264f, -0.367772f, 0.447201f, + 0.834616f, -0.00497265f, -0.0557285f, 0.055088f, -0.300115f, + -0.143833f, -1.07838f, -0.106896f, 0.16945f, 0.0170324f, + 0.108754f, 0.335893f, -0.0923708f, 0.450209f, -0.0713308f, + -0.0233037f, -0.0129902f, -1.40664f, -0.0996218f, 0.711236f, + 0.400716f, 0.227871f, 2.01499f, 0.572926f, 0.135673f, + -0.0340458f, -0.316736f, 0.24257f, -0.700768f, -0.194985f, + 0.312011f, -0.179599f, 0.128114f, 0.0725977f, -0.193816f, + 0.352143f, 0.070641f, -0.467808f, -0.399047f, 0.10136f, + 0.671574f, -0.553965f, 0.105729f, 0.210383f, 0.065048f, + 0.248198f, -0.731674f, 0.588725f, -0.308237f, 0.24511f, + 0.00608906f, 0.170906f, 0.246175f, 0.149521f, 0.106071f, + 0.160246f, 0.118487f, -0.104102f, 0.872823f, 0.227478f, + 0.0182631f, -0.115083f, 0.0142445f, 0.307947f, -0.884925f, + 0.0767105f, 0.0414042f, -0.448021f, -0.0400193f, -0.0765448f, + -0.411931f, -0.199624f, 0.333371f, 0.17267f, -0.0431816f, + 0.190826f, -0.0758961f, -1.02831f, -0.0414525f, 0.605374f, + -0.0188181f, -0.2207f, 1.30004f, -0.207005f, -0.0333617f, + 0.227145f, 0.105059f, -0.0473393f, -0.448752f, -0.0342152f, + -0.0244812f, 0.220329f, 0.0313591f, -0.0902074f, -0.0731945f, + 0.88488f, 0.306306f, -0.275613f, 
-0.476372f, 0.00678104f, + 0.442029f, 0.122049f, 0.118042f, 0.270527f, -0.462538f, + 0.0665021f, -0.260255f, 0.209182f, 0.162321f, 0.0629934f, + -0.244896f, -0.078863f, 0.655585f, -0.0506617f, -0.487128f, + 0.118765f, -0.34408f, 0.0930615f, -0.365632f, -0.0670776f, + 0.44428f, 0.286734f, 0.146608f, 0.686757f, -0.0738428f, + -0.10034f, -0.928438f, -0.172601f, -0.0959575f, -0.010532f, + 0.277549f, 0.28773f, -0.318883f, 0.71254f, 0.273593f, + -0.382845f, -0.0104587f, -0.647769f, 0.25541f, 0.194625f, + 0.265197f, -0.750938f, -0.0650515f, -0.567092f, 0.070613f, + 0.209531f, 0.429699f, 0.130676f, 0.514914f, 0.615778f, + 0.594535f, -0.0878778f, 0.40593f, -0.303383f, 0.0907863f, + -0.320068f, 0.0137162f, -0.303424f, 0.594207f, -0.236524f, + -0.692627f, -0.990063f, -0.0262934f, 0.222375f, 0.503412f, + 0.220224f, 0.676871f, -0.150996f, 0.379777f, 0.841339f, + -1.05981f, 0.259943f, -0.781745f, 0.0346478f, 0.115791f, + -0.25171f, -0.00872158f, 0.395561f, -0.0849893f, -1.20134f, + -0.313938f, 0.789542f, 0.159606f, -0.782095f, -0.229754f, + 0.266687f, -0.0354282f, -0.3041f, 0.0338618f, -0.390001f, + -0.28362f, -0.436144f, 0.777351f, 0.855321f, 0.653338f, + -0.0382912f, -0.204577f, 1.13828f, 0.220395f, -4.60853f, + 0.575694f, 0.0453189f, 1.76567f, 0.466151f, -0.366109f, + 0.594717f, 0.278891f, -0.750676f, -0.332739f, -0.942304f, + 0.280363f, 0.284561f, 0.209326f, 0.238347f, -0.0124311f, + -0.439463f, -0.036186f, 0.165997f, 0.374717f, -0.481148f, + -0.626417f, 0.0223598f, 0.039337f, -0.379918f, 0.211046f, + 0.0795812f, 0.863355f, -0.341448f, 0.421494f, 0.410477f, + -0.117025f, -0.511108f, 0.565193f, -0.063582f, -0.031349f, + -0.0750174f, 0.387941f, 0.541266f, 0.0919753f, 1.05041f, + 0.263004f, 0.289006f, 0.0439694f, -1.22439f, -0.247832f, + 0.260967f, 0.355794f, 0.599694f, -0.69418f, 0.372805f, + -0.161731f, 0.0720574f, 0.0394657f, 0.122772f, -0.458067f, + -0.370826f, -1.34495e-05f, -0.373404f, 0.0245539f, -2.3472f, + -2.61448f, 0.264794f, 0.0601582f, -0.968597f, -0.196022f, + -0.727067f, 0.167346f, 0.517478f, 0.0035377f, 0.777219f, + 0.553128f, 0.727211f, 0.606202f, -0.495604f, 2.41445f, + 0.465214f, -0.0443004f, 0.142972f, 0.141459f, -0.17771f, + 0.0156117f, 0.169264f, 0.0428022f, -0.164827f, -0.240632f, + 0.215289f, -0.213134f, -0.184163f, 0.0161321f, -0.20025f, + -0.0311616f, 0.00292108f, -0.0131921f, 0.0437664f, -0.104817f, + -0.131906f, 0.0822771f, 0.237307f, -0.347567f, -1.2485f, + 0.253616f, -0.442217f, 0.0514077f, 0.337561f, -0.0147658f, + -0.132888f, -0.643821f, 0.445573f, -0.0146213f, 0.235511f, + 0.53583f, -0.640644f, 0.0280044f, 0.00628834f, 0.143885f, + 0.380077f, -0.542342f, 0.363101f, 0.0647334f, -0.476556f, + -0.822676f, 0.482454f, -0.0467326f, -0.253083f, 0.116726f, + 0.317333f, 0.548131f, -0.234667f, 0.579923f, -0.420683f, + 0.595613f, -0.279864f, -0.753204f, -0.516844f, -0.436574f, + -0.120682f, -0.278939f, 0.752202f, -0.183443f, -0.14632f, + -0.0344068f, 0.127638f, -0.225245f, 0.489391f, 0.145082f, + -0.73672f, 0.980065f, -0.0367412f, 0.40632f, -0.802509f, + 0.356897f, 0.366172f, 1.23858f, -0.978381f, -0.684924f, + -0.0870693f, -0.353628f, 0.695788f, -0.244593f, -1.8897f, + -0.257803f, 0.686937f, 0.405155f, -0.125696f, 0.258075f, + 0.570584f, -0.439481f, -0.59798f, 0.0745711f, -0.235162f, + 0.133048f, -0.243033f, 0.0415527f, -0.00118735f, 0.00980514f, + -0.297429f, -0.144983f, 0.463093f, 0.0965441f, -0.338508f, + -0.651077f, 0.817577f, -0.0364773f, -0.388465f, 0.113288f, + 0.231198f, 0.316208f, -0.592201f, 0.530376f, -0.431434f, + 0.0200985f, 0.104303f, -0.130705f, 0.4374f, 0.362342f, 
+ 0.70641f, 0.20037f, 0.309128f, -0.484535f, -1.18469f, + 0.513893f, 0.201236f, -0.022396f, 0.179638f, -0.361289f, + -0.0794946f, -1.04704f, -0.0281103f, 0.0494822f, 0.00196415f, + 0.0625478f, -0.229033f, 0.12018f, 0.542629f, -0.222423f, + -0.0123321f, -0.0988525f, 0.773192f, -0.192218f, -3.19156f, + 0.300606f, 0.462751f, 2.2968f, 0.137182f, 0.132539f, + 0.165884f, 0.128818f, -0.155856f, -0.558538f, -0.231742f, + -0.244377f, -0.442397f, 0.250947f, 0.0850658f, -0.00820139f, + 0.391284f, 0.17453f, 0.306003f, -0.531499f, -0.624451f, + 0.564584f, -0.343953f, -0.0278713f, 0.212664f, -0.135969f, + -0.0179867f, -0.687887f, 0.371065f, -0.0537029f, 0.0499509f, + 0.0980684f, -0.0438569f, 0.186731f, 0.182105f, 0.172254f, + -0.149446f, -0.0247637f, 0.148098f, 1.20772f, -0.136664f, + 0.00983112f, 0.0181381f, -0.0147549f, -0.0846561f, -0.827022f, + 0.00207177f, 0.0478215f, 0.0652549f, 0.0898219f, -0.0224959f, + -0.0274246f, 0.0166498f, -0.0211715f, -0.502932f, 0.0961452f, + 0.251206f, -0.0623632f, 0.741566f, 0.0078449f, -2.99162f, + -0.187244f, 0.0743479f, 1.46425f, 0.0737923f, 0.0133544f, + 0.20922f, -0.178671f, -0.0528492f, -0.526717f, 0.0282125f, + -0.0363201f, 0.37406f, -0.303658f, -0.066803f, 0.132237f, + 0.962057f, -0.399733f, 0.191765f, -0.452606f, -0.348732f, + 0.444939f, 0.153025f, 0.0796317f, 0.265985f, -0.319638f, + 0.0278161f, -0.333734f, 0.226108f, 0.147895f, -0.124066f, + -0.37306f, 0.19541f, 0.200175f, -0.0593244f, 0.0333887f, + -0.0284278f, 0.462491f, 0.0686487f, -0.332435f, -0.437166f, + 0.302795f, 0.100542f, 0.0265019f, 0.767212f, -0.140621f, + 0.11558f, -0.70584f, -0.00017415f, 0.00793092f, -0.0490901f, + 0.0598338f, 0.484876f, -0.13025f, 0.660349f, 0.147503f, + -0.462766f, 0.0843824f, 0.218493f, 0.310921f, -0.162284f, + 0.210404f, -0.788799f, 0.0698512f, -0.484799f, 0.0311505f, + -0.308243f, 0.417298f, 0.0593723f, 0.208908f, 0.451437f, + 0.354546f, -0.0700888f, -0.281678f, -0.311177f, 0.00914652f, + -0.372084f, 0.135036f, 0.185393f, 0.461347f, -0.114241f, + -0.402347f, -0.692327f, 0.0376155f, -0.200267f, 0.565963f, + -0.0627442f, 0.429677f, 0.170514f, 0.350565f, 0.699528f, + -0.948126f, -0.364205f, 0.348878f, -0.137832f, -0.0791649f, + -0.0462295f, -0.255078f, -0.398509f, 0.136783f, -0.0164628f, + -0.555472f, 0.690396f, 0.147715f, 0.000523095f, 0.14874f, + 0.524804f, 0.162974f, 0.797599f, 0.277473f, -0.500696f, + 0.189917f, -0.333309f, 0.00613646f, -1.07817f, 0.0470502f, + 0.210766f, 0.159768f, -0.447774f, -0.252968f, -1.72739f, + 0.0658259f, -0.448747f, 2.26511f, 0.349651f, 0.157232f, + 0.956842f, 0.856676f, 0.149227f, -0.626957f, -0.566771f, + -0.0980846f, 0.351668f, -0.362741f, -0.0272282f, -0.113632f, + 0.366015f, -0.00790003f, -0.458632f, -0.31157f, -0.182257f, + -0.953975f, 0.0583582f, 0.164721f, -0.900107f, -0.115542f, + 0.0654192f, 0.99056f, -0.247976f, 0.48254f, 0.670196f, + 0.098585f, -0.212855f, 0.310072f, 0.0894616f, 0.151944f, + 0.119629f, -0.26735f, 0.162257f, -0.0305818f, 0.681526f, + -0.229847f, 1.01556f, 0.29132f, 0.740113f, 0.0703937f, + 0.537892f, -0.18653f, -0.0252359f, -0.420014f, 0.197631f, + -0.176629f, 0.00674754f, 0.301288f, -0.162816f, 0.636235f, + -0.341362f, 0.197296f, -0.589747f, -0.749363f, -0.277197f, + -1.27291f, -0.0857908f, -0.147591f, -0.0956297f, -0.109097f, + 0.0717554f, 0.359078f, 0.301457f, 0.486934f, -0.260955f, + -0.126821f, 1.55756f, 0.477469f, -1.45363f, 1.42198f, + -0.360847f, -0.0211924f, -0.0184957f, -0.110706f, -0.152136f, + 0.104703f, 0.267615f, 0.127392f, 0.172996f, 0.258326f, + 0.268578f, -0.431123f, -0.114419f, 0.0101172f, 
-0.195671f, + 0.0792025f, -0.151505f, -0.064077f, 0.0479777f, -0.141882f, + 0.121492f, -0.139132f, -0.348252f, 0.341043f, -0.565367f, + -0.0791259f, -0.781086f, 0.0140045f, 0.571094f, -0.00875077f, + 0.217132f, -0.202345f, 0.157213f, 0.228445f, 0.366612f, + -0.529989f, 0.42241f, -0.540538f, -0.0425556f, -0.207774f, + -0.0663941f, 0.37836f, -0.0650245f, -0.0828694f, -0.0835478f, + -0.795512f, 0.470268f, 0.1551f, -0.69017f, -0.116735f, + 0.157614f, 0.555973f, -0.293311f, 0.245428f, -0.0853701f, + -0.449278f, -0.0551647f, -0.00137429f, 0.709439f, -0.456796f, + 0.132062f, -0.0449484f, -0.308599f, 0.180608f, -2.24196f, + 0.421478f, -0.640946f, -0.460397f, -0.920628f, -0.184949f, + -0.0416982f, 0.6484f, -0.22806f, 0.412229f, -0.468079f, + -0.72372f, -0.347698f, -1.3899f, 0.631876f, 0.0611046f, + 0.0294258f, -0.128091f, -0.205615f, 0.355348f, -0.267725f, + -0.644835f, 0.435879f, 0.517477f, -0.338123f, -0.157764f, + 0.32762f, -0.166454f, 0.221007f, -0.0438278f, -0.0777725f, + 0.10986f, 0.941545f, -0.542284f, -0.172312f, -0.256597f, + -0.0181391f, 0.220623f, -0.432456f, 0.0164074f, 0.250226f, + -0.522576f, 0.783109f, 0.198703f, -0.784554f, -0.0929628f, + 0.326861f, 0.470293f, 0.442684f, 0.271879f, -0.108256f, + 0.0483558f, -0.403151f, 0.36183f, -0.268186f, 0.270851f, + -0.696826f, -0.166037f, -0.354658f, 0.405977f, -0.473447f, + 0.649689f, -0.0863114f, -0.147319f, 0.0869966f, 0.319792f, + 0.493026f, -1.07456f, 0.354751f, 0.114605f, -0.120647f, + -0.238315f, 0.0290955f, -0.355299f, -0.45381f, 0.0812865f, + -0.0180434f, 0.00861318f, -0.892943f, -0.0127801f, -1.66398f, + 0.290505f, 0.126832f, 2.08173f, -0.0454847f, -0.162481f, + 1.07426f, 0.228566f, 0.280528f, -0.537625f, -0.175288f, + -0.118012f, 0.649114f, -0.349926f, -0.0189864f, -0.30934f, + -0.363178f, -0.119822f, -0.22656f, 0.484513f, -0.173269f, + 0.41987f, -0.448517f, -0.0950466f, 0.482443f, 0.061558f, + 0.4219f, -0.536388f, 0.0781972f, 0.212489f, 0.104229f, + -0.0792804f, 0.402066f, -0.676313f, -0.2272f, -0.16379f, + 0.260145f, -0.0504658f, -0.0826579f, -1.37749f, 0.00790747f, + 0.0841031f, -0.0671308f, -0.00301736f, -0.386206f, 0.190311f, + 0.0702639f, 0.0643968f, 0.133741f, -0.0141555f, -0.0365324f, + 0.87028f, 0.207894f, -0.421266f, 0.689256f, 0.145037f, + -0.270796f, 0.212604f, -0.345326f, 0.0074631f, -1.72379f, + 0.0672097f, -0.273153f, 1.30503f, -1.01324f, 0.00284696f, + 0.851459f, 0.176847f, 0.30948f, -0.57144f, -0.0596695f, + -0.111189f, 0.130361f, -0.298286f, 0.0567591f, -0.0885215f, + -0.847601f, 0.238624f, -0.162391f, 0.452357f, -0.0192713f, + 0.226661f, 0.0762922f, -0.0894055f, 0.332702f, 0.424484f, + 0.0443207f, -0.162345f, -0.601036f, 0.280527f, -0.137362f, + 0.266345f, 0.729438f, -0.887182f, 0.152943f, -0.573548f, + -0.0201383f, -0.56521f, 0.033582f, 0.300284f, -0.144472f, + 0.633026f, 0.30866f, 0.0653073f, 0.316901f, 0.0721326f, + 0.192252f, -0.833162f, 0.194292f, -0.08663f, -0.189401f, + -0.178242f, 0.111488f, 0.522487f, -0.65497f, 0.457049f, + 0.390654f, 0.0522936f, -0.39712f, -0.293717f, -0.374656f, + -0.118916f, -0.853076f, -0.0829578f, -0.17335f, -0.0218694f, + 0.367968f, 0.478469f, 0.0913813f, 0.519251f, 0.803526f, + -0.272516f, -0.341329f, 0.0897285f, 0.247653f, 0.000898686f, + 0.313196f, 0.000587979f, -0.314189f, -0.449439f, -0.0291611f, + -0.356287f, -0.722904f, -0.0480958f, -0.523758f, -0.576146f, + 0.133754f, 0.616921f, -0.085494f, 0.487487f, 0.745129f, + 0.993267f, 0.256555f, 0.0822743f, 0.0411971f, 0.139388f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_1_bias[] = { + 0.00447951f, 0.0202534f, 
0.00970833f, -0.00460874f, 0.0942288f, + -0.0534704f, 0.00829869f, -0.0255174f, -0.0809143f, 0.00169117f, + 0.0177427f, 0.0259387f, 0.0291077f, -0.0267599f, 0.100275f, + -0.00389366f, 0.0315499f, 0.0265846f, -0.000206604f, 0.0302221f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_2_kernel[] = { + 0.153048f, 0.0725422f, 0.068901f, -0.475608f, 0.0736706f, + -0.134076f, 0.229289f, 0.0217921f, 0.0449205f, -1.00002f, + 0.149133f, 0.0497258f, 0.118988f, 0.0741764f, 0.0385486f, + 0.225181f, 0.012966f, 0.155593f, -3.07175f, -0.0641051f, + 0.09161f, 0.0259005f, -0.209998f, -0.420298f, 0.0587126f, + 0.00352744f, 0.0451313f, -0.049384f, 0.11516f, 0.083135f, + 0.103675f, -0.0185604f, 0.0623248f, -0.0993726f, 0.0448522f, + 0.0134017f, -0.294776f, -0.251924f, 0.0712635f, -0.0764298f, + -0.463766f, -0.0295011f, -0.579168f, 0.573853f, -0.00596607f, + 0.0237762f, -0.0500104f, -0.0969275f, 0.155573f, 0.0515382f, + -0.178454f, -0.154008f, -0.278299f, -0.166421f, 0.0149533f, + -0.0700236f, 0.239287f, -1.19545f, -0.0744625f, 0.143037f, + 0.141874f, 0.086302f, 0.0838633f, -0.454179f, 0.120308f, + -0.0896718f, 0.254909f, 0.0714462f, 0.00471098f, -0.869494f, + 0.209407f, 0.138285f, 0.0816641f, 0.0666266f, 0.0848555f, + 0.173313f, 0.0695633f, 0.285667f, -3.15384f, 0.00140275f, + -0.969824f, -0.0318689f, -0.00487396f, 0.412541f, 0.0263593f, + -0.249824f, 0.0897776f, 0.0208836f, -0.0982745f, -0.16049f, + -0.12719f, -0.186166f, 0.102338f, 0.273931f, -0.0886306f, + -0.19513f, -0.0135712f, -0.194127f, -0.0834291f, 0.426623f, + -0.0705446f, 0.0327476f, 0.0800862f, 0.478757f, -0.00849111f, + -0.554911f, -0.0489312f, -0.184029f, -0.227428f, 0.159989f, + -0.0677731f, -0.0901436f, 0.00308696f, -0.352243f, 0.278715f, + 0.306374f, -0.0772054f, -0.0122733f, -0.0693457f, 0.074365f, + -0.267458f, -0.123612f, -0.495954f, 0.552604f, -0.103951f, + -0.121771f, 0.179966f, -0.377947f, -1.35472f, 0.153294f, + -0.445284f, -0.089813f, -0.00529807f, 0.254047f, -0.0378426f, + 0.114597f, -0.143052f, 0.0815258f, -0.10528f, 0.00833533f, + -0.117508f, 0.129052f, 0.0706719f, -1.39506f, 0.0124731f, + 0.109831f, -0.0744156f, 0.181612f, 0.0787894f, 0.0293352f, + 0.494929f, 0.00997207f, -0.585882f, -0.0844138f, -0.00864134f, + -0.109943f, 0.0713114f, 0.14883f, 0.0610554f, 0.204145f, + -0.00390313f, 0.0184763f, -0.111387f, 0.175442f, -0.0840215f, + -0.178785f, -0.0693612f, -0.254507f, -0.191549f, 0.501561f, + -0.0858995f, -0.164921f, 0.0250706f, -0.0916282f, 0.247085f, + 0.13877f, -0.419487f, -0.295065f, -0.213812f, -0.10362f, + 0.138243f, 0.086985f, 0.113633f, -0.459273f, 0.12388f, + -0.139296f, 0.253792f, 0.0421624f, 0.0665065f, -0.977282f, + 0.199927f, 0.115194f, 0.099045f, 0.0534806f, 0.089283f, + 0.0815367f, 0.150901f, 0.253458f, -3.24825f, -0.0118163f, + -0.544565f, 0.0201825f, -0.0682201f, 0.759028f, 0.00479696f, + -0.00625607f, 0.058007f, -0.0811189f, -0.114617f, -0.0998578f, + 0.133312f, 0.0246256f, -0.0167416f, 0.196118f, 0.109823f, + 0.109489f, 0.474682f, -0.763475f, 0.0818745f, 0.0798777f, + -0.0994905f, -0.00138143f, -0.108563f, 0.697289f, -0.103702f, + -0.306085f, -0.0996705f, -0.142618f, -0.130989f, 0.0813303f, + -0.0909275f, -0.10786f, -0.0280431f, 0.206877f, -1.70798f, + 0.525568f, 0.559891f, -0.166132f, -0.227574f, -0.150955f, + 0.0849226f, 0.00497342f, -0.168667f, -0.282575f, 0.00537805f, + -0.0185572f, 0.0607167f, -0.0534948f, -0.0215776f, -0.14825f, + -0.0164577f, -0.0611978f, 0.0347562f, 0.286917f, 0.226598f, + 0.149497f, -0.478101f, -0.246006f, 0.0663239f, -0.121728f, + 0.267087f, 0.0802681f, -0.184741f, 
-0.558267f, 0.0437066f, + 0.13816f, -0.0710939f, 0.0725697f, 0.339857f, 0.161069f, + 0.304871f, 0.108138f, 0.193396f, 0.0891607f, -0.0701939f, + -0.182038f, -0.451873f, -0.233883f, 0.0444747f, 0.0436545f, + -0.245894f, -0.0721136f, 0.309013f, 0.278996f, 0.0259377f, + 0.0278116f, 0.0686773f, -0.271237f, 0.235082f, -0.0778285f, + -0.456541f, -0.109303f, -0.074565f, -0.407301f, -0.162191f, + -0.801819f, 0.372435f, -0.559083f, -0.039189f, 0.0477762f, + 0.0875363f, 0.0699926f, 0.116552f, -0.308217f, 0.0341607f, + -0.14202f, 0.135517f, 0.0316971f, 0.153297f, -0.759722f, + 0.12849f, 0.114229f, 0.0814893f, 0.275402f, 0.0403976f, + 0.0357503f, 0.212295f, 0.0673998f, -2.59822f, -0.0475021f, + -0.0594725f, 0.0659163f, 0.0469717f, -0.0370461f, -0.12863f, + -0.381743f, -0.0445055f, -0.106843f, -0.0880648f, 0.00591106f, + 0.235514f, -0.165162f, -0.0696645f, 0.115374f, 0.245558f, + 0.192049f, -0.388628f, -0.48291f, 0.154313f, -0.160207f, + 0.125928f, 0.122039f, 0.0713794f, -0.161244f, 0.128082f, + -0.234659f, 0.0680219f, 0.0597933f, 0.208421f, -0.163623f, + 0.196873f, 0.156603f, 0.184179f, -0.278331f, -0.0481286f, + 0.0828152f, 0.247004f, 0.0915582f, -0.0906229f, -0.20376f, + 0.136593f, 0.0740336f, -0.0134935f, -0.355048f, 0.0898485f, + -0.0962068f, 0.185804f, -0.0145596f, 0.0966589f, -0.515784f, + 0.121602f, 0.0320428f, 0.11093f, -0.0559421f, 0.0355484f, + 0.192128f, 0.0500888f, 0.133641f, -1.73282f, -0.0624599f, + 0.122524f, 0.0757292f, -0.0974648f, -0.193649f, 0.0561096f, + 0.0159959f, 0.0334472f, -0.0168832f, -0.12386f, -0.112419f, + 0.19552f, 0.0308502f, 0.0537643f, -0.0181012f, 0.0392183f, + 0.0461833f, -0.52623f, -0.238252f, 0.0821762f, -0.212384f, + 0.112901f, 0.096063f, 0.0540225f, 0.0773583f, 0.143045f, + -0.101551f, 0.282418f, 0.0176749f, -0.00244542f, -0.780154f, + -0.254428f, -5.82215f, 0.106638f, 0.11746f, 0.0486823f, + 0.164562f, 0.0303006f, 0.229614f, -2.41845f, -0.117122f, + 0.0451654f, 0.0237383f, -0.208731f, 0.0721137f, 0.0761163f, + -0.0569416f, -0.00830511f, -0.045256f, 0.14535f, -0.0189222f, + -0.283363f, -3.15502f, 0.0971161f, -0.035913f, 0.00813281f, + 0.0187974f, -0.361573f, -0.302067f, 0.118014f, -0.0956148f, + -0.596567f, 0.0105443f, -0.49019f, -0.0801959f, 0.0322344f, + -0.0280032f, 0.0555038f, -0.111495f, -0.0994456f, 0.0178021f, + 0.0358362f, 1.07063f, -0.0833138f, 0.0621246f, 0.0637157f, + 0.0999207f, 0.191975f, -1.2811f, 0.0341681f, 0.14818f, + 0.0957259f, 0.109909f, 0.0566115f, 0.0585633f, 0.179939f, + -0.104372f, 0.309091f, 0.0172941f, 0.0243182f, -0.935252f, + -0.296257f, -5.83634f, 0.0899249f, 0.455347f, 0.129505f, + 0.220212f, 0.0214801f, 0.284802f, -2.94585f, -0.0805413f, + -1.01819f, 0.00534034f, -0.057203f, 0.0869331f, 0.0207575f, + -0.124479f, -0.0465806f, 0.0894252f, 0.32203f, 0.0858497f, + 0.25178f, 0.0932205f, 0.0888455f, 0.233153f, -0.446398f, + -0.00791233f, 0.0909603f, -0.0904397f, 0.131835f, 0.475597f, + -0.1236f, 0.0231622f, 0.138602f, -0.097731f, -0.0282484f, + -0.549095f, -0.0457428f, -0.0895407f, -0.293965f, 0.166872f, + 0.46719f, 0.236254f, 0.0615991f, 0.499236f, 0.540366f, + 0.402035f, 0.0606324f, -0.0499928f, -0.0155198f, 0.0994403f, + -0.14773f, -0.183433f, -0.612093f, -0.334201f, -0.110877f, + -0.143441f, 0.05815f, -0.318586f, -0.344235f, 0.199593f, + 0.51109f, -0.252281f, -0.028834f, 0.0615421f, 0.0623699f, + 0.210745f, -0.236448f, 0.166279f, 0.127516f, -0.0971157f, + -0.204389f, 0.208112f, 0.0377023f, 0.271837f, -0.00859528f, + 0.0797081f, -0.00582115f, 0.140018f, -0.384865f, -0.0853243f, + -0.586727f, -0.0664489f, -0.631436f, -0.245828f, 
-0.0647894f, + -0.171912f, -0.0801706f, 0.0731614f, -0.11725f, 0.281478f, + -0.03047f, 0.0363488f, -0.0481651f, -0.326329f, -0.0155898f, + -0.428316f, -0.0989367f, -0.271902f, -0.00263837f, 0.366168f, + 0.325989f, 0.165463f, 0.0668512f, -0.142202f, 0.419992f, + 0.164971f, -0.515479f, -0.187585f, -0.151783f, -0.0682468f, + 0.0910191f, 0.117086f, 0.106579f, 0.0961825f, 0.162148f, + -0.129645f, 0.301039f, 0.000320343f, -0.0558097f, -0.844295f, + -0.218919f, -5.7571f, 0.0982612f, 0.238955f, 0.0703565f, + 0.0969388f, 0.107202f, 0.321585f, -3.00594f, -0.058755f, + -0.620004f, 0.052114f, 0.128423f, -0.177673f, -0.00341509f, + -0.146756f, -0.0414309f, -0.0893262f, -0.0584779f, -0.129552f, + 0.127629f, 0.13275f, -0.0973342f, -0.215617f, 0.0724309f, + 0.0102229f, 0.178137f, -0.943374f, -0.171465f, 0.304949f, + -0.0963836f, -0.0346437f, -0.138667f, -0.234184f, 0.0344159f, + -0.319592f, -0.0990766f, -0.16065f, 0.369432f, 0.194911f, + 0.363348f, -0.356009f, -0.00736217f, 0.241788f, -2.21311f, + 0.704816f, 0.697019f, 0.129186f, -0.132799f, -0.11861f, + 0.0383451f, 0.0247782f, -0.12687f, 0.0256552f, 0.048413f, + 0.00660549f, 0.0457962f, -0.012819f, 0.115991f, -0.1117f, + -0.291045f, -0.646138f, 0.0813613f, 0.112063f, 0.191675f, + 0.120835f, -0.444267f, -0.340385f, 0.0391936f, -0.151132f, + 0.184419f, 0.124998f, -0.14089f, 0.214087f, 0.00108535f, + 0.119611f, 0.0236965f, 0.0715074f, -0.225997f, -0.0126552f, + -0.459214f, -0.490444f, 0.173716f, 0.355811f, -0.13607f, + -0.191091f, -0.530085f, -0.400666f, 0.011221f, 0.10527f, + -0.11498f, -0.011864f, 0.364376f, 0.0319587f, -0.0528563f, + 0.0353899f, 0.0393453f, -0.289211f, -0.347785f, -0.0417157f, + 0.545848f, 0.741785f, -0.0732565f, -1.29687f, -0.0433128f, + -1.44162f, 0.318894f, -0.377784f, 0.123751f, -0.00444347f, + 0.0957118f, 0.0893616f, 0.0911595f, 0.092917f, 0.127681f, + -0.159929f, 0.190417f, -0.0297948f, -0.00132599f, -0.742756f, + -0.0364169f, -4.00108f, 0.0784767f, 0.223048f, 0.0430138f, + 0.0180493f, 0.212842f, 0.122987f, -2.83267f, -0.0641464f, + -0.173247f, 0.100946f, 0.0804885f, 0.0172631f, 0.0877408f, + -0.353222f, 0.0108262f, -0.0452121f, -0.116127f, 0.268154f, + -0.132587f, -0.27481f, -0.0316914f, 0.0610525f, 0.439691f, + 0.00966415f, -0.78962f, -0.424823f, -0.0214365f, -0.113846f, + 0.100793f, 0.126482f, 0.0415354f, 0.0427995f, 0.14273f, + -0.315674f, 0.110095f, 0.0061568f, 0.0320474f, -0.3596f, + -0.12533f, -1.28837f, 0.174673f, -0.235912f, 0.00495439f, + 0.0695473f, 0.266489f, 0.049248f, 0.0868526f, -0.0685969f, + 0.102984f, 0.0924639f, -0.027535f, 0.0709277f, 0.155776f, + -0.190944f, 0.188273f, -0.00897471f, 0.0964232f, -0.475822f, + -0.209374f, -5.00252f, 0.103495f, 0.110698f, 0.00682092f, + 0.208586f, 0.0489575f, 0.0966254f, -1.42973f, -0.0645128f, + 0.0515961f, 0.0571281f, -0.0992321f, 0.00791648f, 0.0087609f, + 0.0607367f, 0.0315705f, 0.0183317f, 0.0756087f, -0.0292847f, + -0.212932f, -0.782259f, 0.0899944f, 0.102677f, 0.0681135f, + 0.0447764f, -0.481969f, -0.221459f, 0.0794475f, -0.229157f, + 0.136781f, 0.0832359f, 0.0297807f, -0.00287225f, -5.97897f, + -0.0960581f, 0.250945f, -0.00133314f, -0.112396f, -0.856922f, + 0.115776f, 0.124536f, 0.0914194f, -0.160775f, 0.128684f, + 0.106718f, 0.100665f, 0.139579f, -0.86141f, -0.190323f, + 0.0884896f, 0.0363845f, -0.19831f, 0.121601f, 0.0264453f, + -0.00557822f, 0.0720238f, -0.0140132f, -0.166814f, -0.266214f, + 0.00500545f, 0.0146905f, 0.126035f, 0.0812372f, 0.0615973f, + 0.0766063f, -0.420156f, -0.126157f, -0.0284299f, -0.112513f, + -0.567008f, -0.0100263f, -0.607567f, 0.193053f, 
0.0067527f, + -0.0753897f, 0.00134269f, -0.0512249f, -0.161661f, 0.0667741f, + -0.113702f, -0.071606f, -0.300563f, 0.276479f, -0.155318f, + -0.0512306f, 0.0896443f, -0.987911f, 0.0440889f, 0.430958f, + 0.175427f, 0.101385f, 0.0303662f, 0.0672653f, -6.62463f, + -0.10475f, 0.228249f, -0.00482173f, -0.0608713f, -0.895836f, + 0.187976f, 0.162173f, 0.0747544f, 0.219953f, 0.0682489f, + 0.142665f, 0.100287f, 0.301887f, -1.97736f, -0.295001f, + -1.0733f, -0.0562668f, -0.0604295f, 0.0304073f, 0.194274f, + -0.243593f, 0.0727137f, 0.0610967f, -0.0692415f, -0.02967f, + 0.055633f, 0.0192402f, 0.105841f, 0.102236f, -0.0757102f, + -0.0067639f, 0.0102317f, -0.257959f, -0.0638652f, 0.45521f, + -0.114967f, 0.0921177f, 0.223796f, 0.277072f, -0.0613282f, + -0.564693f, -0.151333f, -0.158035f, 0.228491f, 0.12997f, + -0.192625f, -0.125344f, 0.0983258f, -0.931206f, 0.618715f, + 0.273759f, -0.145527f, -0.099431f, -0.119551f, 0.0663484f, + -0.161419f, -0.202377f, -0.545393f, 0.0917645f, 0.042263f, + -0.17117f, -0.178622f, -0.336977f, 0.866715f, 0.0376922f, + -0.319728f, -0.127406f, 0.0599384f, 0.268804f, -0.0331844f, + 0.355326f, -0.103902f, 0.0425935f, 0.00525512f, -0.133687f, + -0.122695f, 0.145582f, 0.139013f, -0.0053352f, 0.0313566f, + 0.327295f, -0.0117993f, 0.233524f, 0.162388f, -0.0793262f, + 0.454543f, 0.0442224f, -0.742673f, -0.144882f, 0.0874983f, + -0.0707259f, 0.0219869f, 0.201728f, 0.0204537f, 0.0788857f, + -0.0374329f, 0.0724169f, 0.0743593f, -0.0193526f, -0.313546f, + -0.418882f, -0.0815754f, -0.197144f, 0.305053f, 0.330196f, + -0.131006f, -0.00113249f, 0.0750458f, -0.541764f, 0.299935f, + 0.308516f, -0.20547f, -0.333066f, 0.0285833f, 0.191147f, + 0.160372f, 0.0724649f, 0.0426326f, 0.153046f, -6.59656f, + -0.081237f, 0.219163f, 0.0147081f, -0.0109837f, -1.01487f, + 0.170055f, 0.163386f, 0.106413f, 0.150188f, 0.0688875f, + 0.0541359f, 0.156307f, 0.178844f, -1.51054f, -0.149477f, + -0.504503f, 0.017878f, -0.181821f, -0.0999659f, 0.0484548f, + -0.32211f, 0.0406744f, 0.0017627f, 0.0220593f, 0.0900512f, + -0.561625f, 0.107279f, -0.0861521f, -0.0862376f, 0.0816765f, + 0.168072f, 0.150063f, -0.816825f, -0.13569f, 0.557555f, + -0.155265f, 0.025135f, -0.109304f, -0.0487062f, -0.00347487f, + -0.454803f, -0.0394371f, -0.214597f, -0.248898f, 0.286501f, + -0.249246f, -0.138935f, 0.00391409f, -0.122544f, -2.14993f, + 0.588942f, 0.541231f, 0.0154047f, -0.359742f, 0.0520729f, + 0.0667058f, 0.0418163f, -0.132533f, -0.184759f, 0.0546118f, + -0.131198f, 0.109664f, -0.0714679f, -0.114163f, -0.243081f, + -0.0405089f, 0.0342795f, 0.0801825f, -0.268408f, 0.192207f, + 0.0800494f, -0.586539f, -0.118155f, -0.0508569f, -0.193987f, + 0.261478f, 0.105719f, -0.125361f, -0.0956201f, 0.0233802f, + 0.271098f, 0.0113352f, 0.0910447f, 0.00628244f, -0.071722f, + 0.21439f, 0.0747191f, 0.207765f, -0.0782454f, -0.0151716f, + -0.196505f, -0.44798f, -0.228597f, 0.0549039f, -0.120715f, + -0.19388f, -0.0768461f, 0.361102f, 0.122936f, -0.0334211f, + -0.202503f, -0.0450776f, -0.272345f, 0.662321f, 0.109247f, + -0.218026f, -0.0669386f, -0.0864701f, -0.633421f, -0.158007f, + -1.10778f, 0.351211f, -0.541458f, -0.0171707f, 0.149606f, + 0.106105f, 0.0880349f, 0.0968455f, 0.113269f, -5.01949f, + -0.106404f, 0.175578f, -0.030045f, -0.0267249f, -0.563713f, + 0.173885f, 0.130772f, 0.0334519f, 0.0770157f, 0.0394389f, + -0.0290326f, 0.220003f, 0.180901f, -1.62203f, -0.151858f, + -0.202386f, -0.0067836f, 0.0287665f, -0.194183f, -0.239834f, + -0.484159f, 0.00671722f, -0.122459f, 0.0808959f, -0.263769f, + -0.015066f, -0.0429868f, -0.111255f, -0.231872f, 
0.219659f, + -0.0437412f, -0.536618f, -0.477831f, 0.0421895f, -0.0815851f, + 0.119638f, 0.0786293f, -0.000668378f, 0.0305567f, -0.0868189f, + -0.178327f, 0.0799657f, 0.0280923f, -0.211395f, -0.464577f, + 0.216912f, 0.0761976f, 0.160288f, -0.416372f, -0.10286f, + -0.0733786f, 0.261033f, 0.0493698f, 0.143137f, -0.179979f, + 0.15655f, 0.0897976f, -0.0258041f, -0.152852f, -6.15512f, + -0.118917f, 0.227283f, -0.0514043f, -0.0786432f, -0.523485f, + 0.1644f, 0.0869001f, 0.0984082f, -0.428288f, 0.0791992f, + 0.141904f, 0.0652073f, 0.104429f, -0.775125f, -0.121479f, + 0.0841637f, 0.0135705f, -0.208863f, -0.0629523f, 0.0455794f, + 0.0513898f, -0.0147657f, 0.0401145f, 0.0660079f, 0.0210609f, + -0.0151801f, 0.0562111f, 0.140308f, -0.0196394f, 0.0230753f, + -0.0336115f, -0.422411f, -0.196974f, -0.0405748f, -0.283428f, + 0.15458f, 0.0876296f, 0.0314038f, 0.16389f, -7.01385f, + -0.117146f, 0.197273f, -0.0400688f, 0.0143951f, -0.964007f, + -0.0618919f, 0.0406891f, 0.07992f, -0.144132f, 0.116416f, + 0.0326838f, 0.103641f, 0.171805f, -1.05158f, -0.182589f, + 0.116991f, 0.0530774f, -0.212454f, -0.016727f, -0.0565992f, + 0.0712873f, 0.0445466f, -0.000107032f, -0.121449f, -0.15148f, + 0.0220338f, 0.0762024f, 0.12253f, 0.0622466f, 0.0835822f, + 0.0465119f, -0.388743f, -0.34665f, -0.0720734f, -0.101581f, + -0.630565f, -0.0512685f, -0.520541f, 0.0530119f, -0.0245276f, + -0.19116f, -0.0144446f, -0.0604486f, 0.187251f, -0.021341f, + -0.217823f, 0.0510256f, -0.197946f, 0.060955f, -0.0617316f, + 0.0741673f, 0.117591f, -1.47844f, -0.0911093f, 0.359225f, + 0.145027f, 0.127513f, 0.0617905f, 0.141154f, -7.63868f, + -0.0808127f, 0.274843f, 0.00693195f, -0.0283113f, -0.853871f, + -0.15737f, 0.0858904f, 0.0746279f, 0.109912f, 0.193775f, + 0.0698094f, 0.174159f, 0.259556f, -1.49885f, -0.156706f, + -1.04113f, -0.0329546f, -0.0491449f, -0.0304125f, 0.0514892f, + -0.244284f, 0.126814f, -0.0387081f, -0.153173f, -0.0566748f, + 0.294111f, -0.0170534f, 0.102381f, 0.447606f, -0.0613267f, + -0.0636869f, -0.0347599f, -0.259572f, -0.0657846f, 0.454352f, + -0.169453f, -0.00177987f, 0.133279f, -0.0863932f, -0.134423f, + -0.475107f, -0.00448962f, -0.214607f, 0.111413f, 0.194377f, + -0.0710837f, 0.0562353f, 0.0401193f, 0.248595f, 0.538374f, + 0.449469f, -0.39111f, 0.0125057f, 0.0448811f, -0.00707751f, + -0.164894f, -0.317516f, -0.56231f, -0.270262f, 0.127016f, + -0.12092f, -0.0881587f, -0.323908f, 0.872344f, 0.103391f, + 0.267971f, -0.155088f, -0.0136683f, 0.309517f, 0.119901f, + 0.271307f, -0.188463f, 0.185121f, -0.142777f, -0.110535f, + -0.163107f, 0.175502f, 0.0801924f, 0.240499f, 0.0874759f, + 0.308907f, -0.00222504f, 0.193366f, 0.109018f, -0.0772158f, + -0.520675f, 0.0259432f, -0.736666f, -0.296579f, 0.043486f, + -0.128932f, 0.0417669f, 0.125747f, 0.157879f, 0.112857f, + -0.0595681f, 0.0611936f, -0.042125f, -0.270338f, 0.120072f, + -0.36675f, -0.0347962f, -0.119539f, 0.0873369f, 0.296432f, + -0.069501f, -0.0383859f, 0.0913597f, -0.40747f, 0.234276f, + 0.332536f, -0.732132f, -0.312291f, 0.137759f, 0.227593f, + 0.14165f, 0.129068f, 0.102734f, 0.135818f, -7.35883f, + -0.101533f, 0.256027f, -0.0142278f, -0.0561601f, -1.09899f, + -0.106538f, 0.0612256f, 0.099487f, -0.0605983f, 0.134311f, + 0.052226f, 0.143672f, 0.219944f, -1.47539f, -0.101828f, + -0.429979f, 0.010478f, -0.0132605f, 0.103363f, 0.0267373f, + -0.338865f, 0.0090188f, 0.0810085f, -0.124368f, -0.0133776f, + 0.595666f, -0.00162201f, -0.212444f, -0.26342f, 0.0913656f, + -0.106279f, 0.414515f, -0.709901f, -0.00198859f, 0.305288f, + -0.188536f, -0.0377482f, -0.131909f, -0.116099f, 
-0.236827f, + -0.36356f, 0.0179455f, -0.202143f, -0.00395508f, 0.177363f, + 0.0630679f, -0.145173f, -0.0558639f, -0.44879f, -1.55687f, + 0.473398f, 0.50531f, -0.0656231f, -0.137197f, 0.064707f, + 0.122083f, 0.0321111f, -0.167096f, 0.0406581f, -0.0793592f, + -0.0777081f, 0.0321379f, -0.0108834f, -0.0652323f, -0.102918f, + 0.0178664f, 0.0781873f, 0.0613189f, -0.04177f, 0.159566f, + 0.15134f, -0.445996f, -0.384905f, 0.0951659f, -0.175046f, + 0.255746f, 0.177047f, -0.150632f, 0.200522f, 0.00778549f, + 0.232168f, -0.0304652f, 0.083155f, -0.125395f, -0.0203289f, + -0.23874f, 0.0349836f, 0.231701f, -0.14849f, -0.204272f, + -0.198309f, -0.364955f, -0.228428f, 0.0614142f, -0.040976f, + -0.227785f, -0.0898404f, 0.271566f, -0.209196f, 0.0226431f, + -0.0911715f, 0.0840369f, -0.299411f, -0.529182f, 0.0622292f, + 0.202475f, 0.0155583f, -0.083114f, 0.124253f, -0.22721f, + -1.02565f, 0.193961f, -0.54287f, -0.00849364f, 0.11124f, + 0.0993531f, 0.120621f, 0.0959537f, 0.136274f, -5.23358f, + -0.107433f, 0.155286f, -0.0136043f, -0.0246768f, -0.631187f, + -0.0493852f, 0.0446751f, 0.0588353f, 0.160766f, -0.0354385f, + -0.0672548f, 0.243743f, 0.186004f, -1.20199f, -0.151872f, + -0.0760096f, -0.00775123f, -0.0122227f, 0.0891327f, -0.377876f, + -0.469926f, -0.134715f, -0.0969362f, 0.212542f, 0.0871489f, + 0.164638f, -0.0485785f, -0.167754f, -0.515052f, 0.13821f, + 0.0515572f, -0.430691f, -0.394719f, 0.143947f, -0.00670816f, + 0.129623f, 0.140299f, 0.0336978f, 0.153545f, -0.350927f, + -0.213485f, 0.0344809f, 0.0405889f, 0.0749967f, -0.369352f, + -0.109398f, 0.0350649f, 0.190893f, -0.284106f, -0.185376f, + 0.0105842f, 0.263692f, 0.160429f, 0.0998209f, -0.127779f, + 0.140558f, 0.108968f, -0.0122672f, 0.102875f, -5.72172f, + -0.161288f, 0.135935f, -0.0143087f, 0.106556f, -0.649813f, + -0.123049f, -0.0108861f, 0.102918f, -0.298137f, 0.0329013f, + 0.100763f, 0.12018f, 0.100782f, -0.648036f, -0.111122f, + 0.12363f, 0.0211952f, -0.225201f, 0.0506021f, 0.0167621f, + 0.0608759f, -0.0245646f, 0.0503477f, -0.0972749f, -0.0415155f, + -0.00578366f, -0.0977591f, 0.124867f, 0.0134788f, -0.0375816f, + -0.00581233f, -0.272292f, -0.250393f, 0.024511f, -0.184891f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_2_bias[] = { + 0.182474f, 0.0223202f, 0.204111f, 0.0573683f, 0.111143f, + 0.0800926f, -0.0364215f, 0.192371f, 0.00498262f, 0.302543f, + 0.0133081f, 0.119719f, 0.237522f, -0.266705f, 0.129427f, + 0.0695857f, 0.22068f, 0.231667f, 0.405829f, -0.0972567f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_3_kernel[] = { + -0.0393876f, -0.269924f, -0.0703231f, -0.0236484f, 0.170478f, + 0.245566f, 0.175963f, 0.104194f, -0.0490501f, -0.157605f, + -0.0275165f, -0.0169499f, -0.250725f, 0.215203f, -0.00733655f, + 0.0111298f, 0.205606f, 0.928046f, 0.15139f, 0.0955483f, + -0.015115f, -0.126643f, 0.0957605f, -0.140178f, -0.0246866f, + 0.097097f, 0.116287f, 0.177746f, 0.0570021f, -0.0518686f, + -0.0446482f, -0.0125318f, 0.0116092f, 0.102431f, 0.0898519f, + 0.0870372f, -0.843274f, 0.383311f, -0.102761f, -0.0246494f, + 0.0312555f, 0.19472f, 0.111573f, 0.0920392f, -0.0555618f, + 0.326461f, 0.219357f, -0.133727f, -0.118399f, -0.0611432f, + -0.169931f, 0.123733f, -0.204607f, 0.082592f, 0.0323181f, + 0.201618f, -0.00388867f, -0.053583f, 0.0266333f, -0.0951787f, + -0.0358283f, -0.0649549f, 0.0119263f, -0.11812f, 0.209851f, + -0.036616f, -0.014911f, -0.138096f, -0.139664f, -0.207395f, + 0.0128848f, -0.201816f, 0.0899419f, 0.343308f, -0.0096243f, + -0.212605f, -0.0905284f, -0.0597114f, -0.055261f, -0.0653405f, + 
0.0330484f, -0.27681f, -0.0994095f, -0.0468272f, 0.145713f, + 0.267216f, 0.185335f, 0.1798f, -0.0437882f, -0.200401f, + -0.0398117f, -0.0736501f, -0.166349f, 0.203316f, 0.0710647f, + 0.061825f, 0.281131f, 0.733323f, 0.215488f, 0.00145659f, + -0.138995f, -0.0833713f, 0.107809f, -0.105343f, -0.0672139f, + 0.101852f, 0.135455f, 0.132903f, 0.0312017f, -0.0643586f, + -0.0274546f, -0.0687466f, -0.020233f, 0.109444f, 0.0774587f, + 0.139497f, -0.800587f, 0.325783f, -0.0546695f, -0.092003f, + -0.0773301f, 0.189672f, 0.0604666f, 0.0939425f, 0.679495f, + 0.114789f, -0.161153f, 0.12843f, -0.0345385f, -0.134641f, + -0.153995f, 0.0823055f, -0.0349296f, 0.0299183f, -0.0606872f, + 0.137588f, 0.0449805f, -0.0555399f, -0.00553351f, -0.120719f, + -0.204701f, -0.0739813f, 0.0584115f, -0.104833f, -0.110989f, + 0.00845446f, 0.0630702f, -0.147861f, 0.0268545f, -0.216419f, + 0.00531986f, -0.206641f, 0.253082f, 0.413215f, -0.05909f, + -0.0939983f, -0.116818f, -0.0450892f, -0.0551134f, -0.00696931f, + -0.113003f, -0.289192f, -0.00884866f, -0.0365724f, 0.0401887f, + 0.238622f, 0.149151f, 0.175751f, -0.157425f, -0.138924f, + -0.0277598f, -0.0285915f, 0.10165f, 0.209532f, 0.0862249f, + 0.0256428f, 0.623204f, -0.0941196f, 0.20345f, -0.132869f, + 0.00947298f, -0.14753f, 0.103918f, -0.161799f, 0.125566f, + 0.10916f, 0.115446f, 0.135627f, -0.0181667f, -0.0734694f, + -0.0154729f, -0.085849f, -0.000427605f, 0.113614f, 0.0776308f, + 0.111899f, -0.214917f, 0.393234f, -0.132223f, 0.020783f, + -0.074902f, 0.217477f, 0.107883f, 0.109466f, 0.146609f, + 0.317061f, 0.074379f, -0.0505457f, -0.0503772f, -0.0678954f, + -0.220003f, 0.114878f, 0.176014f, -0.00657996f, -0.0875497f, + 0.065582f, 0.00238612f, -0.063395f, 0.0295323f, -0.127126f, + 0.099813f, -0.115452f, 0.0106309f, -0.179632f, -0.0436553f, + 0.0120295f, 0.0652713f, -0.131512f, -0.081714f, -0.205363f, + -0.0374944f, -0.196707f, 0.680568f, -0.00991824f, -0.0212223f, + -0.186258f, -0.432361f, -0.0291303f, -0.0475983f, -0.071383f, + -0.0116416f, -0.28257f, -0.0635272f, -0.0576546f, -0.280129f, + 0.286528f, 0.199997f, 0.192851f, 0.323829f, -0.185006f, + -0.04791f, -0.0882187f, -0.0496895f, 0.293135f, 0.125539f, + 0.0341828f, 0.993452f, 0.0369177f, 0.0453796f, 0.0329807f, + 0.157673f, -0.153195f, 0.122383f, -0.161983f, -0.317619f, + 0.105129f, 0.155673f, 0.152489f, 0.0685417f, -0.0595907f, + -0.026657f, -0.0954336f, -0.0359557f, 0.105617f, 0.0825066f, + 0.100189f, -0.22125f, 0.382508f, -0.0247677f, -0.115807f, + -0.0639787f, 0.177786f, 0.0566206f, 0.0496389f, 1.31533f, + 0.0482907f, -0.118743f, 0.190632f, 0.172867f, -0.108446f, + -0.200186f, 0.122572f, 0.0897468f, 0.0155328f, -0.0380217f, + 0.125161f, -0.141723f, -0.023157f, 0.0270805f, -0.101961f, + 0.12358f, -0.0866255f, 0.00306761f, -0.131764f, -0.461118f, + -0.00803936f, 0.0895496f, -0.153905f, 0.207623f, -0.249099f, + -0.0198487f, -0.160013f, 0.81136f, -0.109978f, -0.0880332f, + -0.0761368f, -0.0755881f, -0.0384827f, -0.0554777f, -0.0750048f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_3_bias[] = { + 0.0106809f, 0.136699f, 0.285316f, 0.395746f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_4_kernel[] = { + -0.0161019f, -0.088871f, 0.0463358f, -0.198037f, 0.038122f, + 0.0135483f, -0.196641f, -0.433531f, 0.527972f, -0.143716f, + 0.558627f, 0.459889f, 0.322864f, -0.491514f, -0.190915f, + -0.0765601f, 0.210329f, 0.689389f, -0.100415f, -1.8788f, + 0.2228f, 0.292781f, -0.954838f, -0.0788763f, -0.131402f, + -0.17154f, 0.049934f, -0.0541183f, -0.530529f, -0.666165f, + 0.195492f, 0.218548f, 
-0.314895f, 0.0749444f, -0.191344f, + 0.349469f, 0.00811248f, -0.760157f, 0.0707434f, -0.0719285f, + -0.264495f, -0.432009f, -0.432686f, 0.155738f, -0.020197f, + 0.19278f, -0.658335f, -0.273143f, -0.286079f, 0.243402f, + 0.497701f, 0.0121003f, -0.666308f, 0.028172f, -0.547901f, + -0.11755f, 0.322028f, 0.0878274f, -0.0328334f, 0.311816f, + 0.0951026f, -1.11429f, -0.0417486f, 0.123467f, -0.0910681f, + -0.0154255f, 0.311201f, -0.0156158f, -0.600437f, 0.0274156f, + -0.174907f, -1.29313f, -0.178656f, 0.596556f, -0.421725f, + -0.289137f, 0.529297f, 0.114833f, -0.0155887f, -0.308232f, + -0.0228361f, 0.184017f, 0.138232f, 0.146347f, -0.117867f, + 0.248351f, -0.282846f, -0.18058f, 0.348355f, -0.415754f, + 0.0657168f, 0.431728f, -0.231043f, -0.186745f, 0.137401f, + -0.282329f, -0.159678f, 0.754262f, 0.037824f, -1.68521f, + -0.290175f, 0.289588f, -0.18683f, -0.300385f, 0.285449f, + -0.00386456f, 0.0563485f, -0.376541f, 0.159899f, -0.697312f, + 0.0284389f, 0.437307f, 0.3968f, -0.372082f, -0.232535f, + 0.394629f, 0.00315248f, -0.38374f, 0.0311291f, -0.624353f, + 0.498083f, -0.342663f, -0.125978f, 0.186797f, 0.187723f, + 0.149335f, -0.82727f, -0.0740974f, -0.659039f, 0.42671f, + -0.448835f, 0.150677f, 0.830742f, -0.233148f, -0.65308f, + -0.0878935f, -0.407797f, -0.511826f, -0.0739023f, 0.506305f, + -0.187451f, 0.0284968f, -0.822238f, 0.362523f, -0.270865f, + 0.032335f, 0.560413f, -0.00388247f, -0.446333f, 0.163147f, + -0.409633f, -0.372575f, 0.306993f, 0.55953f, -0.24362f, + -0.0929369f, -0.520298f, -0.444022f, 0.186077f, -0.0942208f, + 0.624049f, -0.429625f, -0.869528f, 0.405257f, -0.120445f, + 0.537685f, -0.3911f, 0.142142f, 0.0913808f, -0.00375967f, + 0.382781f, 0.60505f, -0.271608f, -0.0630436f, -0.150625f, + -0.0124598f, 0.0132878f, 0.138475f, -0.106264f, -0.416581f, + -0.518415f, 0.185127f, -0.464622f, -0.0102925f, 0.0389567f, + 0.406439f, -0.0414264f, -0.366185f, -0.511867f, -0.650255f, + 0.278252f, 0.0270234f, 0.262788f, -0.0294793f, 0.12651f, + 0.421537f, 0.0300837f, 0.0742187f, 0.281954f, -0.122069f, + -0.450145f, -0.312206f, -0.402633f, -0.0868137f, 0.190433f, + -0.149602f, -0.175029f, 0.00900023f, -0.266596f, 0.21721f, + -0.245079f, -1.09798f, 0.319409f, -0.337938f, 0.358514f, + 0.0771549f, 0.447087f, -0.305507f, -0.285492f, 0.383896f, + 0.145933f, -0.264944f, -0.118486f, 0.068805f, -0.194231f, + -1.79133f, 0.363408f, -0.17434f, -0.229629f, 0.132188f, + 0.207548f, -0.876264f, 0.265634f, 0.139332f, 0.236206f, + -0.0145184f, 0.562865f, 0.526612f, -0.0333508f, -0.421885f, + 0.273485f, -0.110882f, 0.425557f, 0.513303f, -0.422322f, + 0.0563155f, -0.0409693f, 0.194768f, -0.419828f, -0.107195f, + -1.19224f, 0.48552f, 0.132782f, -0.00932096f, -0.225484f, + -0.428484f, -0.0392684f, 0.750697f, 0.337615f, 0.158476f, + 0.413484f, 0.326017f, -0.757107f, -0.183962f, 0.00884361f, + 0.126507f, -0.0751588f, -0.308782f, -0.104237f, -0.703877f, + -0.491806f, -0.204251f, -0.317212f, 0.0815479f, 0.296323f, + 0.219632f, -0.039859f, 0.556257f, 0.176144f, -0.0750654f, + -0.106419f, 0.00400385f, -0.172266f, 0.000178763f, 0.146532f, + 0.255202f, -0.427235f, -0.182198f, -0.256557f, 0.260255f, + -0.0143364f, 0.0868664f, -0.564373f, -0.0876947f, 0.726289f, + 0.0160001f, -0.381562f, -0.638214f, -0.803803f, 0.25945f, + -0.371542f, -0.419611f, 0.238617f, 0.371834f, -0.226777f, + -0.894602f, 0.37458f, -0.354866f, 0.0249312f, 0.142374f, + 0.433813f, -0.0218183f, -0.33248f, 0.107223f, 0.390823f, + -0.0271108f, -0.616878f, -0.604984f, 0.517269f, -0.293573f +}; + +static const float 
av1_intra_mode_cnn_partition_cnn_layer_4_bias[] = { + -0.290371f, -0.0560272f, -0.118144f, -0.270583f, 0.401388f, + -0.308677f, 0.150729f, -0.0324442f, -0.135937f, 0.0875581f, + 0.0206493f, -0.212682f, -0.0266535f, -0.326656f, 0.0185105f, + -1.01429f, -0.00315052f, -0.0273938f, -0.0263379f, -0.171702f +}; + +static const CNN_CONFIG av1_intra_mode_cnn_partition_cnn_config = { + NUM_CNN_LAYERS, // num_layers + 0, // is_residue + 0, // ext_width + 0, // ext_height + 0, // strict_bounds + { + { + CNN_LAYER_0_IN_CH, // in_channels + CNN_LAYER_0_WIDTH, // filter_width + CNN_LAYER_0_WIDTH, // filter_height + CNN_LAYER_0_OUT_CH, // out_channels + CNN_LAYER_0_HORZ_STRIDE, // skip_width + CNN_LAYER_0_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_0_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_0_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + -1, // output_num + }, + { + CNN_LAYER_1_IN_CH, // in_channels + CNN_LAYER_1_WIDTH, // filter_width + CNN_LAYER_1_WIDTH, // filter_height + CNN_LAYER_1_OUT_CH, // out_channels + CNN_LAYER_1_HORZ_STRIDE, // skip_width + CNN_LAYER_1_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_1_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_1_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 3, // output_num + }, + { + CNN_LAYER_2_IN_CH, // in_channels + CNN_LAYER_2_WIDTH, // filter_width + CNN_LAYER_2_WIDTH, // filter_height + CNN_LAYER_2_OUT_CH, // out_channels + CNN_LAYER_2_HORZ_STRIDE, // skip_width + CNN_LAYER_2_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_2_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_2_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 2, // output_num + }, + { + CNN_LAYER_3_IN_CH, // in_channels + CNN_LAYER_3_WIDTH, // filter_width + CNN_LAYER_3_WIDTH, // filter_height + CNN_LAYER_3_OUT_CH, // out_channels + CNN_LAYER_3_HORZ_STRIDE, // skip_width + CNN_LAYER_3_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_3_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_3_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 1, // output_num + }, + { + CNN_LAYER_4_IN_CH, // in_channels + CNN_LAYER_4_WIDTH, // filter_width + CNN_LAYER_4_WIDTH, // filter_height + CNN_LAYER_4_OUT_CH, // out_channels + CNN_LAYER_4_HORZ_STRIDE, // skip_width + CNN_LAYER_4_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_4_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_4_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 0, 
// output_num + }, + }, +}; + +static const float + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel[] = { + 0.604356f, -0.236007f, 0.342172f, 0.531397f, -0.635698f, + -0.591573f, 0.833872f, 0.492814f, -0.100308f, 0.186385f, + 0.202779f, 0.263578f, 0.330001f, -0.15531f, 0.879584f, + -0.0048796f, 0.490796f, 0.242254f, -0.292211f, -0.696912f, + 0.746664f, 0.129371f, -0.0122443f, 0.196234f, -0.251605f, + -0.385617f, 0.157707f, 0.699963f, 0.0432536f, -0.11141f, + -0.0353473f, -0.0364045f, -0.113556f, -0.520842f, 0.231248f, + 0.230638f, -0.323852f, -1.08633f, -0.0469168f, -0.481821f, + 0.366838f, 0.189627f, -0.0637262f, -0.484917f, -0.109874f, + 0.292237f, 0.368702f, -0.183896f, -0.109038f, -1.22613f, + -0.880355f, -1.63768f, 0.337426f, -0.940994f, 0.413097f, + -0.37879f, -0.480525f, -0.594819f, -0.0172653f, -0.499436f, + -0.298395f, -0.840181f, -0.0758645f, -0.772089f, -0.232727f, + -0.815968f, 0.160785f, -0.0767165f, 0.0064244f, -0.540491f, + 0.417776f, -0.384337f, -0.497377f, 0.68414f, 0.00797514f, + 0.262626f, 0.203732f, 0.702047f, 0.0617544f, 0.0878249f, + -0.315032f, -0.0169776f, 0.403986f, 0.815872f, 0.135388f, + 0.0858594f, 0.169172f, -0.638227f, -1.65268f, -0.0476042f, + -0.982685f, 0.45707f, -0.0577537f, 0.367329f, 0.176513f, + -0.356454f, 0.0979095f, -0.277476f, 0.257271f, -0.333451f, + 0.0241497f, 0.0671127f, 0.221216f, 0.106065f, 0.537151f, + 0.0257329f, 0.265559f, -0.348353f, 0.285569f, -0.0610511f, + -1.59334f, -1.63826f, -0.164898f, -0.36605f, -0.489304f, + 0.729241f, 0.0197627f, 0.200291f, -0.231506f, -0.255715f, + -0.0932264f, -0.728793f, 0.468297f, -1.09592f, -0.079791f, + -1.76531f, -0.182904f, -2.05897f, -0.371894f, 0.207124f, + 0.255029f, 0.186501f, -0.005805f, 0.00160733f, -0.178206f, + -0.352757f, -0.164741f, -0.557583f, -0.559692f, -0.00731467f, + 0.149326f, 0.409735f, 0.22083f, -0.332572f, -0.1741f, + -0.0519008f, -0.266402f, 0.294031f, -2.4453f, 0.339851f, + -0.573747f, -5.97783f, -0.084142f, 0.20286f, -0.576038f, + -0.111081f, 0.101238f, -5.83427f, -1.98537f, 0.322796f, + -0.60171f, 0.212412f, 0.247176f, 0.603694f, -0.54357f, + -0.693439f, 0.250725f, -4.31988f, 0.0935924f, 0.43669f, + -0.139706f, -0.158391f, 0.244309f, 0.619213f, -0.309154f, + -0.135341f, 0.475815f, -0.290804f, -0.109038f, -0.0937104f, + 0.0385907f, -0.29105f, -0.0597651f, -0.451187f, -1.51821f, + 0.141772f, 0.822204f, -0.729661f, -0.109908f, 0.178217f, + -0.750278f, 0.113762f, -0.0959985f, 0.066579f, -0.104209f, + -0.951378f, 1.4087f, -1.13175f, -1.09103f, -1.50416f, + -0.182273f, -1.80129f, -0.152135f, 0.356931f, 0.205591f, + 0.183148f, -0.498671f, -0.183034f, -0.176428f, 0.395706f, + -0.589908f, -0.318276f, -0.421162f, 0.658766f, -0.186752f, + 0.0656253f, 0.248002f, 0.289618f, -0.458111f, -0.130789f, + -0.542988f, 0.405804f, -0.35364f, -0.311927f, 0.218339f, + 0.309215f, -0.130347f, -0.0257543f, 0.0413234f, -0.190205f, + -0.242382f, 0.819886f, -0.255157f, -0.181219f, -0.290903f, + -0.301995f, -0.0469988f, 0.702936f, 0.209122f, 0.0234243f, + 0.598637f, 0.0305196f, 0.0423457f, -0.618799f, 0.0190867f, + 0.420584f, -0.224752f, -0.410077f, 0.127854f, 0.395261f, + -0.393685f, -0.282822f, 0.0289504f, 0.0406515f, -0.511531f, + -0.497611f, 0.0252715f, 0.0812549f, 0.80205f, 1.29084f, + 0.764972f, 0.561258f, -0.23499f, 0.217594f, -0.690935f, + -0.26607f, 0.357955f, 0.391608f, 0.448352f, 0.458586f, + -0.790071f, 0.719959f, -0.468052f, 1.24579f, 0.220705f, + 0.284044f, 0.141346f, 0.246687f, 0.147826f, -0.403557f, + -0.00648195f, 0.398034f, -0.100464f, -0.77107f, -0.188274f, + -0.219245f, -0.0330375f, 
0.367585f, -0.220391f, 0.308736f, + 0.221399f, 0.340292f, 0.037597f, 0.606083f, 0.665634f, + -0.755529f, -0.95989f, -0.243673f, 0.233709f, -0.454628f, + -0.110952f, 0.776062f, 0.731136f, -0.140422f, 0.19261f, + 0.355086f, 0.975026f, 0.190936f, 0.776205f, 0.982781f, + 0.555569f, 0.42382f, -0.409721f, 0.25053f, -0.271328f, + 0.859941f, -0.0210901f, 0.0176916f, -0.562895f, -0.0787431f, + -0.861032f, -0.34022f, -0.571995f, 0.205436f, 0.346968f, + 0.377033f, -1.08484f, 0.297007f, -1.01693f, 0.189463f, + -0.483242f, 0.147058f, 0.0159503f, 0.0908779f, -0.46962f, + 0.174024f, -0.490704f, -0.383501f, -0.0507626f, 0.00902188f, + -0.202495f, 0.205047f, 0.0562261f, -0.143371f, 0.219524f, + -0.317294f, -0.0575756f, -0.0595825f, -0.000625279f, -0.278864f, + -0.0516874f, -0.225259f, 0.429046f, -0.0952421f, 0.0799135f, + -0.122883f, -0.262308f, -0.481006f, -0.0466122f, -0.402822f, + 0.150595f, -0.0919558f, -0.356765f, -0.199222f, 0.219389f, + -0.214452f, -0.196361f, -0.095758f, -0.115891f, -0.143777f, + 0.549843f, -0.113036f, 0.764895f, -0.0114812f, -0.0684054f, + -0.98045f, -0.0170634f, 0.247719f, -0.18718f, -0.381566f, + 0.150758f, -0.526257f, 1.00851f, 0.776634f, 1.69728f, + -0.303058f, 0.228967f, -0.414134f, 0.0858226f, -0.285472f, + 0.431459f, 0.315318f, 0.587835f, 0.335737f, -0.0222039f, + 0.18945f, 0.274008f, 0.609263f, 0.320232f, -0.214137f, + -0.0297668f, 0.0439046f, -0.52821f, -0.0127375f, 0.431885f, + 0.508846f, -0.329189f, -0.166778f, -0.94338f, -0.358807f, + 0.208641f, -0.517986f, -0.128278f, 0.693464f, -0.24408f, + -0.0669412f, -0.410287f, 0.0444145f, -0.264179f, 0.143884f, + 0.276842f, 0.498934f, -0.682557f, -0.217198f, -0.8249f, + -0.40446f, -0.115376f, 0.417934f, 0.65605f, -0.00570035f, + -0.365742f, -0.367625f, 0.526824f, -0.0164913f, -0.255998f, + 0.247292f, 0.0846536f, 0.109302f, -0.302996f, 0.160564f, + 0.0228132f, 0.035211f, -0.236951f, 0.493801f, 1.37315f, + -0.182348f, 0.234437f, -0.256906f, 0.12523f, 0.667113f, + -0.437981f, -0.0721831f, 0.303976f, -0.041336f, -0.145894f, + -0.733741f, 0.436056f, 0.368542f, -0.149072f, -0.290281f, + 0.0946743f, -0.0579292f, 0.264539f, 0.170048f, 0.262411f, + 0.049679f, 0.371369f, 0.760675f, 0.482157f, -0.0196783f, + 0.260888f, 0.948856f, 0.170228f, -0.134432f, -0.942235f, + -1.23226f, -0.373963f, -0.0381773f, -0.17947f, 0.00947998f, + 0.01086f, 0.389578f, -0.380389f, -0.0865851f, -0.220328f, + -0.171901f, -0.384325f, -0.0787615f, 0.392678f, 0.123392f, + -0.0895824f, 0.00480886f, -0.162918f, 0.214336f, -0.00147339f, + 0.203899f, -0.00292344f, -0.148594f, 0.0425697f, -0.306896f, + -0.342225f, -0.45088f, -0.184454f, -0.00923638f, -0.521993f, + -0.334464f, 0.156497f, -0.0856832f, -0.277661f, -0.0721105f, + -0.488781f, -0.509543f, -0.012664f, 0.0940558f, -0.29869f, + 0.0434843f, -0.0178945f, -0.0525666f, -0.303178f, 0.713507f, + -0.137413f, -0.170289f, -0.142942f, -0.316002f, 0.229125f, + -0.277585f, 0.0125026f, 0.508316f, -1.20614f, -0.915129f, + -1.63389f, -0.454604f, -0.893951f, -0.447403f, -0.751423f, + 1.3886f, 0.617818f, 0.611458f, -0.884173f, -0.7779f, + -0.608639f, -0.164759f, -0.631846f, -0.176894f, -0.459361f, + -0.187119f, 0.173283f, -0.477191f, -0.156736f, 0.182675f, + 0.598854f, -0.489941f, -0.420493f, -0.162002f, 0.344418f, + 0.33832f, -0.187463f, -0.388721f, -0.0733151f, -0.138835f, + 0.313699f, 0.0625967f, -0.291488f, 0.114088f, -0.356843f, + 0.197506f, 0.0320749f, 1.16745f, -0.36081f, 1.63416f, + 0.198392f, 1.13928f, -0.317971f, 0.531019f, 0.526518f, + 0.185814f, 0.0923607f, 0.192858f, -0.234378f, 0.18091f, + -0.228837f, 0.397216f, 
0.581501f, 0.284376f, -0.130434f, + 0.20076f, 0.242662f, -0.0480872f, 0.131746f, 0.362712f, + 0.0146821f, 0.475679f + }; + +static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias[] = { + 0.477356f, 0.385222f, 0.389122f, 0.539506f, -0.0272558f, 0.581605f, + -0.800961f, 0.142229f, 0.117549f, -0.0724944f, 0.102095f, -0.71319f, + -0.0162434f, -0.132858f, 0.543411f, -0.626599f +}; + +static const float + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel[] = { + 0.195436f, -0.623354f, 1.27907f, 0.270071f, -0.677612f, + 0.0266141f, 0.272991f, -0.425446f, 0.891889f, -0.299836f, + -0.611825f, -0.0322273f, 0.185276f, 0.238639f, -0.150954f, + 0.083495f, -0.472106f, 0.573506f, 1.16465f, -0.154947f, + 0.640631f, -1.59467f, -9.8166f, -0.22889f, -0.189912f, + 0.227052f, -0.540787f, 0.0840873f, -3.04293f, -0.0209975f, + -6.10979f, -5.92801f, 0.288467f, -0.169476f, 0.0527948f, + -1.21202f, -0.280915f, 0.290863f, -0.601877f, 0.0598784f, + -0.592136f, -0.535588f, -0.0434018f, -0.653223f, 0.00339129f, + -0.133273f, 0.279463f, 0.483879f, 0.463664f, -0.14174f, + -1.56354f, 0.560043f, -1.44639f, 0.673528f, -0.108418f, + -0.707313f, 0.49633f, -0.0321971f, 0.411475f, -0.382184f, + -0.965501f, -0.0507655f, 0.540415f, -0.977297f, 0.370382f, + -0.375683f, 0.0844529f, -2.0002f, -0.346289f, 0.621251f, + -0.489855f, 0.191252f, -0.576629f, -0.35773f, 0.023167f, + 0.180793f, -0.417864f, 0.0587254f, 0.167824f, 0.0612058f, + -0.712108f, 0.155614f, 0.900036f, -0.480124f, 0.146117f, + 0.467011f, 0.412525f, 0.312724f, 0.551826f, -0.179601f, + 0.706261f, 0.00674965f, -0.495221f, 0.140829f, -0.0619195f, + -0.0697912f, 0.511967f, -0.0318237f, -0.285946f, -0.28608f, + 0.0894142f, 0.234351f, -0.272328f, -0.350369f, -0.392605f, + 0.287318f, 0.310426f, 0.293524f, 0.357681f, -0.157868f, + 0.149652f, -0.259363f, 0.192941f, -0.850096f, 0.456507f, + 0.387857f, -0.491187f, -0.0541993f, -0.28118f, 0.193991f, + -0.0956664f, 0.0679829f, 0.0341118f, 0.141826f, 0.271538f, + -0.285295f, -0.68666f, 0.306414f, 0.600678f, 0.494801f, + -1.11907f, 0.524849f, 0.151169f, 0.474068f, -0.43441f, + -0.229138f, 0.0345483f, 0.682888f, -0.471534f, -0.0457066f, + -2.36721f, 0.446407f, 0.20396f, -1.17868f, 0.815363f, + -1.13897f, 0.397217f, -0.593796f, -6.95512f, 0.650695f, + 0.771657f, 0.15227f, -0.824519f, 0.617854f, -0.295353f, + -0.101207f, 0.600989f, -0.550653f, -0.722371f, 0.292006f, + -0.451891f, 0.54544f, 0.354278f, 0.0136258f, 0.192003f, + 0.258275f, -0.0443647f, 0.0928186f, 0.667775f, 0.239558f, + 0.0523887f, 0.71586f, 0.292563f, 0.362479f, 0.373453f, + 0.250638f, -0.423037f, -0.486574f, -0.619397f, 0.343888f, + 0.974971f, 0.574218f, 0.273989f, -0.209956f, -0.274333f, + 0.0553766f, 0.263918f, 0.733824f, 0.038713f, -0.0788992f, + 0.292014f, 0.111808f, -0.197507f, 0.593668f, -0.0245337f, + 0.0873662f, 0.530997f, 0.620717f, 0.310697f, -1.54861f, + 1.12915f, 0.0991346f, -0.59214f, 0.422325f, -0.0157936f, + 0.380975f, 0.626403f, 0.268064f, -0.615231f, -1.43172f, + 0.0928048f, 0.0949026f, -0.470912f, -0.0867527f, -0.0381206f, + 0.178393f, -1.13737f, 0.12798f, 0.258214f, -0.803364f, + 0.177506f, 0.542718f, 0.660656f, 0.145091f, 0.183056f, + -0.47338f, 0.469287f, 0.10832f, 0.0994899f, -0.402719f, + 0.157287f, 0.523071f, -0.324493f, 0.343599f, 0.664839f, + -0.0375519f, -0.279238f, -0.0722333f, 0.395344f, -0.289316f, + 0.0259298f, -0.843245f, -0.160021f, 0.741429f, -1.38726f, + -0.2969f, -0.240443f, 0.247731f, -1.04088f, -0.280454f, + -0.237054f, -0.759227f, 0.0456369f, -0.647453f, -1.02372f, + -0.200395f, -0.546839f, -0.104226f, 
-0.152727f, -0.56685f, + -0.0559663f, -0.425494f, -0.610679f, -0.987096f, -0.575138f, + -0.0887979f, 0.463646f, -1.041f, -0.49412f, -0.175298f, + -0.463296f, -0.955177f, 0.17852f, -1.10694f, 0.181991f, + -0.18998f, 0.227818f, 0.688237f, -1.10444f, 0.549108f, + -0.171849f, -0.245614f, 0.120624f, 1.29571f, 0.607116f, + 0.00809927f, 0.1041f, -1.22918f, -0.212948f, 0.430239f, + -1.57341f, 0.482054f, 0.275905f, 0.939785f, -1.0209f, + -0.355534f, 0.397337f, -0.0593077f, -0.239603f, 0.475483f, + -0.999101f, -0.140578f, 1.04787f, -0.591981f, -0.306989f, + -0.879012f, -0.994715f, 0.0343158f, 0.218509f, 0.34704f, + 0.0672934f, -0.178941f, 0.20509f, -0.360031f, 0.161241f, + -0.324775f, -0.359531f, -0.0657085f, -0.864422f, -0.444865f, + 0.597095f, -0.948691f, 0.240001f, -0.783159f, -0.569422f, + 0.974205f, -1.04539f, 0.345915f, -0.681558f, -0.246047f, + 0.256174f, 0.493667f, 0.681324f, 0.155613f, 0.773309f, + -0.647027f, -0.214744f, -0.474202f, -0.661092f, -1.02316f, + 0.0572593f, -0.437082f, -0.119874f, -0.464877f, -0.58067f, + -0.218029f, 0.319516f, -0.378983f, -0.0698695f, 0.554693f, + -0.537875f, 0.126429f, -0.145113f, -0.594312f, -0.218021f, + -0.703569f, 0.0720548f, 0.261054f, -0.81438f, 0.249921f, + 0.165296f, -0.079028f, -0.322647f, 0.134458f, 0.0975046f, + 0.538594f, -0.250126f, 0.142309f, 0.526486f, 0.0532615f, + -0.383332f, -0.38143f, -0.101611f, 0.519776f, -0.278364f, + -0.23287f, -0.29139f, 0.22353f, 0.472085f, 0.366264f, + 0.741187f, 0.42019f, 0.0676459f, -0.230008f + }; + +static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias[] = { + -0.48603f, -0.578556f, 0.257639f, 0.459915f, 0.178156f, -1.16663f, + 0.828891f, 0.620291f, 0.413257f, -1.00508f, -0.574179f, -1.20623f, + -0.377837f, -0.0360333f, 0.681536f, 0.137189f, -0.458718f, 0.387131f, + 0.0233112f, 0.126045f, 0.361304f, 0.655317f, 0.413134f, 0.769947f +}; + +static const float av1_intra_mode_cnn_partition_branch_0_logits_kernel[] = { + 0.67244f, -2.59179f, 0.50425f, -1.86481f, 1.15891f, -1.26447f, + 0.761081f, 0.645117f, -1.78594f, -0.872703f, -0.192054f, -1.82359f, + -0.560935f, 0.838959f, 0.502264f, -1.28958f, -0.205551f, 0.635671f, + -1.12619f, -1.68277f, 0.83361f, 1.57235f, 1.15839f, 0.35345f +}; + +static const float av1_intra_mode_cnn_partition_branch_0_logits_bias[] = { + 1.14463f +}; + +static const float + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel[] = { + 0.364612f, 0.237868f, -0.192821f, 0.12364f, 0.522205f, + -0.205785f, -0.503288f, -0.426503f, -0.083073f, 0.0164429f, + 0.184278f, -0.426055f, 0.0717997f, -0.261968f, 0.176412f, + -0.101226f, 0.0400285f, -0.332051f, 0.344385f, 0.189565f, + 0.441162f, 0.330462f, -0.719857f, -1.14209f, 0.557831f, + 0.104756f, 0.0562001f, -0.465923f, -0.344592f, -0.191554f, + -0.0656866f, -0.640162f, 0.419388f, 0.409308f, -1.68632f, + -1.10829f, 0.105485f, -0.14561f, -0.944738f, 0.104629f, + -0.146837f, 0.538823f, -0.153157f, 0.321081f, -1.77714f, + -0.0559296f, 0.324136f, -0.497023f, -1.15793f, -0.740144f, + -0.0888472f, 0.010059f, -0.18394f, -0.234405f, -0.10586f, + 0.130958f, -0.101944f, -0.186483f, -0.447049f, -0.900026f, + 0.128444f, 0.401696f, 0.128509f, 0.123778f, 0.062168f, + -0.321755f, -0.0691584f, 0.254468f, -0.115212f, -0.848885f, + 0.817005f, 0.0615853f, 0.153363f, 0.513855f, 0.789225f, + 0.356168f, 0.371613f, 0.269541f, 0.268173f, 0.220481f, + -0.109063f, -0.00620798f, -0.0334622f, 0.236267f, -0.0235294f, + -0.0800253f, 0.0294184f, 0.047131f, -0.224047f, 0.0890737f, + -0.356293f, 0.0989534f, 0.16799f, 0.498266f, 0.612581f, + -0.372897f, 
-0.75125f, 0.77698f, 1.1032f, -0.0764679f, + 0.0266299f, 0.309532f, 0.461305f, 0.0193521f, -0.0939161f, + -0.276156f, -0.102714f, -0.0828328f, 0.40003f, 0.122542f, + 0.0867203f, -0.170738f, 0.0850642f, -0.130762f, 0.082324f, + -0.115218f, -0.0244491f, 0.0434331f, 0.216453f, 0.443733f, + -0.173679f, -0.161617f, 0.316209f, -0.689656f, -1.52007f, + -0.421018f, 0.430833f, -0.00734122f, 0.284499f, -0.0207885f, + 0.0572024f, -0.878942f, 0.388264f, 0.0191589f, -0.123415f, + -0.0461196f, -0.0444461f, -0.00383171f, 0.0945655f, -0.0597219f, + -0.374918f, 0.0182124f, 0.523083f, 0.00519547f, 0.80513f, + -0.221433f, -1.30591f, -0.416917f, -0.718173f, 0.622999f, + 0.941798f, 0.0477536f, 0.0303772f, 0.268078f, 0.414778f, + 0.394325f, 0.299733f, -0.583208f, 0.309379f, 0.416581f, + 0.0299948f, -0.409145f, -0.161557f, -0.214082f, -0.0098119f, + 0.221912f, 0.107135f, 0.0692518f, 0.00490957f, 0.107613f, + -0.368404f, -0.548006f, 0.208274f, 0.550475f, 0.643678f, + -1.65859f, 0.095938f, -0.0434245f, -0.0792685f, 0.838109f, + -0.0138653f, -0.527573f, -0.123472f, -0.235618f, -0.677401f, + -0.125877f, -0.175604f, -0.203196f, 0.113478f, -0.228323f, + -0.53539f, 0.134458f, 0.0534899f, -0.213006f, -0.138679f, + -2.15023f, 0.186303f, 0.48566f, -1.22301f, -0.240982f, + -0.486836f, -0.121181f, -0.131382f, -0.0320283f, 0.278828f, + 0.342581f, -0.182257f, -0.365193f, -0.226351f, 0.108928f, + -0.100159f, 0.448355f, -0.0768947f, 0.0633719f, -0.104786f, + 0.0456653f, 0.0965752f, 0.156403f, -0.157337f, 0.212259f, + 0.317939f, 0.124193f, -0.329475f, 0.206868f, -2.15986f, + -0.108385f, -0.396769f, -0.0317231f, -0.271524f, -0.184697f, + 0.662615f, 0.412926f, -0.0217462f, -0.0285475f, -0.118826f, + 0.0252706f, -0.137091f, 0.198973f, 0.329509f, -0.0831966f, + -0.621237f, 0.0896179f, 0.805261f, -0.019675f, 0.962452f, + 0.307433f, 0.892168f, -0.537587f, -2.46145f, 0.125606f, + 0.920491f, 0.219462f, 0.292765f, -0.748238f, -0.0537239f, + -0.224326f, 0.505492f, 0.176426f, 0.0343168f, 0.16708f, + -0.581393f, 0.951726f, -1.1777f, -0.561914f, -1.53288f, + 0.864567f, -1.19648f, -1.24141f, -0.334688f, -0.622026f, + 0.666876f, -0.197005f, -0.600507f, -0.851924f, 0.492299f, + 0.31078f, -0.0736115f, 0.030999f, -6.02463e-05f, -0.0604341f, + -0.0254238f, 0.139222f, 0.333235f, 0.366534f, -0.191982f, + -0.0156092f, 0.44234f, -0.0193213f, 0.0938745f, -0.015709f, + -0.12043f, 0.00895591f, 0.0464401f, 0.0530699f, -0.623018f, + -1.23372f, -0.538647f, -1.12389f, 0.26742f, 0.548694f, + 0.00540655f, -0.219703f, 0.314894f, -0.573463f, -0.241555f, + 0.441851f, 0.422491f, 0.253785f, -0.384683f, 0.0370165f, + 0.226669f, 0.245587f, 0.215265f, -0.122272f, 0.0492235f, + 0.000658591f, -0.312877f, 0.436487f, -0.229199f, -0.174373f, + 0.904268f, -0.855845f, -0.877293f, -0.65409f, 0.313795f, + 0.461748f, -0.737766f, -0.228523f, 0.182181f, 0.334522f, + 0.0629676f, -0.151087f, 0.178798f, -0.325809f, -0.331672f, + 0.0865837f, -0.0684225f, 0.0252008f, -0.0820631f, 0.0481863f, + 0.209473f, -0.0242151f, -0.0898919f, -0.163828f, -0.164282f, + 0.581888f, 0.816896f, 0.0607674f, 0.364855f, -0.346512f, + -0.764174f, 0.595561f, 0.302872f, 0.206361f, 0.106917f, + -0.972338f, 0.176948f, 0.6415f, -0.131897f, -0.155802f, + 0.216337f, -0.342511f, 0.123743f, -0.123014f, 0.0205439f, + 0.15173f, -0.23801f, -1.00387f, 0.651328f, 0.237439f, + -0.542952f, 1.066f, -0.161107f, -0.593545f, 0.219343f, + -0.178094f, 0.0789992f, 0.428332f, 0.23827f, -0.327421f, + 0.416144f, 0.00394653f, 0.052046f, -0.238289f, 0.405942f, + 0.00141984f, 0.161017f, 0.077111f, 0.0823985f, 0.0981208f, + 0.109949f, 
-0.0428502f, 0.343629f, -0.722978f, -0.375269f, + -0.111634f, -0.271523f, 0.712093f, 0.684904f, -0.572331f + }; + +static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias[] = { + 0.583367f, -0.202004f, -0.207626f, 0.412451f, -0.258311f, 0.0304954f, + -0.102458f, 0.450087f, -0.376851f, -0.338702f, 0.335226f, 0.889072f, + 0.502411f, 0.649282f, 0.15345f, -0.0109896f +}; + +static const float + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel[] = { + 0.0214882f, -0.934339f, -0.173335f, 0.8362f, -0.764234f, + 0.525163f, 0.409749f, 0.821539f, -0.784157f, -0.455593f, + 0.446099f, 0.406756f, 0.479242f, -0.814038f, -0.419332f, + 0.328869f, -0.340707f, 0.133219f, 0.0320347f, 0.25089f, + -0.324917f, -0.0684265f, 0.0377777f, -0.262556f, 0.673458f, + -0.0291454f, -0.417957f, -1.0075f, -0.481537f, 0.922105f, + -0.000516239f, -0.40034f, 0.242067f, -0.43178f, 0.32001f, + 0.143599f, -0.345172f, 0.126093f, 0.148518f, -1.12151f, + -1.03435f, 0.551691f, -0.310001f, -0.323194f, -0.595128f, + -0.395689f, 0.737268f, -0.729227f, 0.590804f, -0.590022f, + -1.01427f, -0.521159f, -0.617579f, 1.07292f, -0.613047f, + -0.619093f, 0.335268f, 0.473753f, -0.795027f, 1.24635f, + -0.556193f, 0.241046f, -0.0354181f, -0.354215f, 0.716752f, + -0.00200745f, -1.25171f, -0.440731f, -0.763918f, -0.588614f, + -0.183901f, -0.396056f, 0.226903f, 0.921471f, 1.10465f, + 0.207053f, 0.57681f, -0.555699f, 0.235469f, -0.92149f, + 0.625808f, 0.29653f, -0.81775f, -0.307889f, -1.41384f, + -0.136205f, -0.365314f, -0.516741f, 0.748052f, 0.617947f, + 0.0973239f, 0.839607f, 0.530668f, -0.227032f, -0.449044f, + -1.04725f, -0.244363f, -0.396888f, -0.146161f, 0.359789f, + 0.0436599f, 1.21645f, -0.336069f, 0.0534646f, -0.00200328f, + 0.658551f, -0.156142f, -1.0728f, 0.0951015f, 0.234837f, + -0.380525f, 0.041783f, -0.269273f, 0.0386013f, -0.455589f, + -0.174338f, 0.0345251f, 0.17116f, -0.507642f, 0.210453f, + 0.739987f, -0.0438776f, 0.570145f, -0.118811f, 0.0548662f, + 0.153458f, -0.89887f, 0.493704f, 0.283351f, 0.785441f, + -0.586002f, -0.0616167f, -0.714328f, -0.145941f, -0.449656f, + 0.850117f, 0.279997f, 0.204143f, -0.31356f, 0.947057f, + -0.135787f, 0.747071f, 0.0145968f, -0.81414f, 0.431009f, + -0.275824f, -0.342928f, -0.0528272f, -0.592183f, 0.433915f, + -0.251752f, -0.311815f, -1.47533f, -1.43677f, 0.0698436f, + 1.01341f, 0.305063f, -0.252003f, -0.428915f, -0.00104153f, + -0.368267f, -0.354523f, -0.27956f, -0.771664f, 0.232092f, + -0.428495f, 0.424952f, -0.343229f, 0.196899f, -0.761084f, + -0.0110293f, -0.335361f, 0.571637f, -0.423489f, -0.52773f, + 0.0108043f, -0.504715f, -1.1419f, -0.402904f, -0.160747f, + -0.329184f, 0.375374f, -1.02604f, -0.601371f, 0.631652f, + 0.0742486f, -0.464765f, 0.467445f, 0.240562f, -0.38211f, + -0.459004f, 0.704196f, 0.021357f, 0.860785f, -1.16731f, + -0.479029f, -0.139644f, -0.444087f, 0.322326f, -0.25455f, + 0.874399f, 0.477696f, 0.0464487f, 1.20658f, 0.0993356f, + 0.00682712f, -0.10163f, -0.371765f, -0.629513f, -0.679196f, + -0.193935f, 0.47405f, -0.18238f, 0.254918f, -0.35306f, + -0.375611f, 0.119771f, -0.257282f, -0.565124f, 0.162667f, + -0.356128f, 0.870351f, 0.241847f, -0.264712f, -0.384322f, + 0.31807f, 0.211621f, -0.180767f, 0.764944f, 0.368646f, + 0.186111f, 1.02458f, -0.494252f, -0.483375f, -0.699664f, + 0.00415657f, -0.189376f, -0.677103f, -0.030319f, 0.667087f, + 0.810951f, -0.488237f, -0.387355f, -0.726579f, -0.304763f, + 1.10392f, -0.775977f, -0.247731f, 0.532396f, 1.24089f, + 0.206621f, -0.670568f, -1.08142f, -0.342503f, 0.189854f, + -0.200846f, 0.784204f, 0.641112f, 
-0.509346f, 0.0805264f, + -1.40006f, 0.322084f, -0.823739f, -1.12965f, -0.215668f, + 0.099673f, 0.425966f, 0.771697f, 0.338834f, 0.345364f, + -0.297826f, -0.176746f, -0.297299f, -1.80029f, -0.178348f, + 0.421194f, -0.19155f, 0.417653f, 0.374441f, -0.135654f, + -0.895843f, 0.220647f, 0.368264f, 0.369233f, 0.382707f, + 0.0800511f, 0.542053f, 0.318896f, -0.385539f, 0.313305f, + -1.01166f, -0.222379f, -1.53708f, 1.32407f, -0.665444f, + -0.102348f, 0.0410504f, -0.616825f, 1.3108f, 0.405902f, + 1.27777f, 0.0630558f, -0.172696f, 0.16224f, -1.10111f, + -3.31326f, -0.242566f, 0.831422f, 0.917397f, 0.311749f, + -0.238613f, 0.438007f, -0.407089f, -0.0202555f, -1.82502f, + -0.907965f, -0.300031f, -0.616669f, -0.767921f, 0.285919f, + -0.112019f, 0.252677f, 0.350892f, 0.000214244f, 0.315915f, + 0.260344f, 0.327362f, -0.0211213f, -0.41241f, 0.0418355f, + 0.103328f, -0.0158439f, -0.230505f, -0.0215114f, 0.266739f, + -0.234376f, -0.352583f, 0.0709437f, -0.90649f, -0.535843f, + 1.21322f, -1.05144f, -0.983682f, -0.189956f, 1.14208f, + -0.0188492f, -0.254821f, -0.463214f, -0.708714f, 0.0447348f, + -0.220831f, 0.476299f, 0.102544f, 1.1173f, -0.36981f, + -0.814102f, 0.103604f, -0.247871f, 0.0610701f, -0.356616f, + -0.144093f, 1.66496f, 0.180206f, -1.04384f, -0.65883f, + 0.0290771f, -0.622728f, 0.761523f, -0.909091f, -0.0340348f, + 0.666895f, -0.0232575f, 0.962643f, -2.50103f, -1.69745f, + -0.0482305f, 0.771811f, -1.32233f, -0.778722f, -0.203309f, + 0.395875f, -0.171812f, 0.253794f, 0.432799f + }; + +static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias[] = { + -0.152159f, 0.552347f, -0.806068f, 0.227901f, 0.335896f, 0.180785f, + 0.75277f, 0.982208f, 0.409823f, -0.17755f, -0.125365f, 0.738114f, + 0.202331f, 0.751737f, -0.360511f, 0.149254f, 0.085073f, -0.214542f, + 0.529727f, -0.0348777f, -2.13162f, -0.893332f, -0.136952f, -0.71258f +}; + +static const float av1_intra_mode_cnn_partition_branch_1_logits_kernel[] = { + -0.632145f, 0.738727f, -0.750737f, -0.931571f, -1.79763f, -2.31153f, + 0.912733f, 0.879995f, -1.00602f, -1.02467f, 0.0536835f, 1.76011f, + -0.898546f, 1.06959f, 1.60471f, -1.7312f, -0.877168f, -0.681185f, + -1.57286f, -1.16038f, -4.11303f, -3.06351f, -3.02536f, -2.92186f +}; + +static const float av1_intra_mode_cnn_partition_branch_1_logits_bias[] = { + 1.33207f +}; + +static const float + av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel[] = { + 0.0419551f, 0.0924078f, -0.153084f, 0.191642f, 0.069586f, + -0.530661f, 0.431968f, 0.000453838f, 0.793047f, 0.0161817f, + -0.476075f, -0.156638f, -0.219066f, 0.372716f, -0.0642299f, + 0.156813f, -0.105819f, -0.0519422f, 0.149935f, 0.295544f, + 0.192037f, -0.0450383f, 0.828794f, -0.0510661f, -1.22549f, + -0.100293f, -0.178274f, 0.0304427f, -0.0664097f, -0.0438936f, + 0.948248f, 0.425486f, -0.238206f, 1.3744f, 0.336897f, + 0.0760769f, -0.583508f, 0.0735519f, -0.117024f, 0.0501598f, + 0.332212f, 0.199531f, 0.424764f, 0.206712f, 0.342868f, + 0.592673f, -0.0961148f, -0.190113f, -0.155027f, 0.00789871f, + -0.0514839f, -0.416154f, -0.290309f, 0.407541f, 0.48534f, + 0.126564f, 0.0709566f, -0.0469664f, 0.735403f, -0.365963f, + 0.150295f, -0.50147f, 0.021383f, 0.76514f, 0.0085721f, + -0.416384f, 1.22268f, 0.0832438f, 0.367813f, -0.12012f, + 0.823183f, -0.0525972f, -0.325526f, -0.0983032f, 0.370128f, + 0.368778f, 0.138971f, -0.0397997f, 0.411058f, -0.0400404f, + 0.588437f, -0.29963f, -0.107992f, -1.75238f, -0.274387f, + 0.430418f, 0.495152f, 0.283172f, -0.441166f, 0.195339f, + -0.436182f, -0.252613f, 0.176204f, -0.126541f, -0.474833f, + 
-0.0721603f, -0.496599f, -0.0608464f, 0.0333451f, -0.0621485f, + 0.0843859f, 0.0637854f, -0.145291f, 0.14876f, 0.181665f, + -0.675805f, 0.294903f, 0.301118f, -0.225957f, 0.0105897f, + -0.136427f, -0.555925f, -0.158853f, -0.216779f, 0.0612481f, + -0.107158f, 0.352451f, 0.140536f, -0.0148237f, 0.189371f, + -0.091046f, -0.0476226f, 0.366054f, -0.0723413f, 0.389883f, + -0.0213411f, 0.0279539f, 0.194827f, -0.271502f, -0.166474f, + 0.0690549f, 0.0584665f, 0.0198415f, -0.442348f, 0.1571f, + -0.113463f, -0.16822f, -0.0580659f, -0.13441f, -0.0022386f, + 0.251521f, -0.160494f, -0.0753547f, 0.0897289f, 0.137917f, + 0.129836f, 0.0816833f, -0.626288f, 0.0643293f, -1.20001f, + 0.085631f, -0.195602f, 0.251244f, 0.0321744f, 0.0493178f, + -0.220616f, 0.724075f, -0.00831514f, 2.00319f, 0.407932f, + 0.0710799f, -0.166128f, 0.0126611f, -0.229644f, -0.0984299f, + 0.632041f, -0.0946141f, 0.295315f, 0.100934f, 0.184883f, + -0.236173f, 0.158081f, 0.195775f, 0.413542f, 0.789801f, + 0.767741f, 0.166275f, -0.348271f, -0.384074f, -0.291648f, + -0.119899f, 0.0368354f, 0.0751987f, 1.04217f, -0.159002f, + -2.71592f, -0.788502f, -1.06268f, 0.536057f, 0.0575876f, + 1.06811f, 0.12033f, 0.198578f, -0.0419196f, 0.0631388f, + 0.623138f, -0.142226f, 1.33129f, 0.0868059f, -0.0287825f, + 0.139378f, -0.143037f, 0.307452f, 0.0363987f, -0.0976368f, + 0.040544f, 0.0269327f, -0.0845524f, 0.0674699f, 0.104501f, + -0.0351155f, 0.167071f, 0.00986971f, 0.10284f, 0.0300016f, + 0.192601f, 0.0397177f, 0.0251346f, -0.00912908f, -0.0452825f, + 0.0164356f, -0.0275149f, 0.194846f, 0.0943608f, 1.61674f, + 0.0124345f, 0.523787f, 0.0397258f, -0.17208f, -0.147808f, + -1.23583f, 0.676385f, 0.551994f, 0.0233041f, 0.0116391f, + -0.466706f, 0.154725f, -0.207371f, 0.606662f, 0.247286f, + 0.31216f, 0.173765f, -0.268033f, 0.224422f, 0.314649f, + 0.481922f, -0.190604f, -0.0129162f, 0.270552f, 0.135195f, + 0.0927735f, -0.226099f, 0.53897f, 0.103309f, -0.0257271f, + -0.0246776f, 0.442013f, -0.179246f, -1.02581f, 0.206176f, + -0.326365f, 0.391623f, -0.103549f, 0.115645f, 0.0269328f, + -0.584517f, -0.237502f, 0.157996f, 0.0447407f, -0.161f, + -0.126072f, -0.148967f, -0.416347f, 0.0236496f, -1.12612f, + 0.0120709f, -0.00979376f, 0.0507126f, -0.172262f, 0.0697059f, + -0.212334f, 0.335731f, -0.0301362f, -0.839583f, -0.238539f, + 0.0636752f, -0.0467217f, -0.0372118f, -0.144615f, -0.161773f, + -0.648242f, 0.158197f, -0.051471f, -0.0615805f, -0.0426936f, + -0.0745554f, 0.358975f, 0.358297f, 0.0568553f, -1.14383f, + -0.103955f, 0.728194f, -0.224945f, -0.31659f, -0.204458f, + 0.171763f, -0.465666f, 0.899234f, -0.37042f, -0.0894774f, + 0.11478f, -0.334957f, 0.0896514f, 0.413251f, 0.359471f, + 1.41597f, 0.558082f, 0.153486f, 0.0270558f, -0.0178797f, + 0.124983f, -0.12273f, -1.04516f, -0.125375f, 0.370336f, + -0.209423f, -0.36816f, -0.66077f, -0.0180773f, -0.628921f, + -0.178542f, 0.0346841f, 0.0319309f, -0.470138f, 0.172763f, + 0.0798846f, -0.259737f, -0.652461f, -0.386283f, -0.474447f, + -0.924054f, -0.0154613f, -0.613712f, -0.138068f, -0.337842f, + 0.217921f, -0.0711405f, 0.000404091f, -0.703766f, 0.0364683f, + 0.150173f, 0.0126249f, 0.170594f, 0.0371879f, -0.0862515f, + -0.23454f, -0.0144143f, 0.164947f, 0.45591f, 0.115703f, + 0.069752f, -0.011993f, 0.0402097f, 0.00697581f, 0.0811613f, + 0.384752f, 0.341977f, 0.06087f, 0.0590107f, 0.00812679f, + 0.121211f, -0.0612108f, 0.167851f, 0.195781f, -1.62162f, + 0.336292f, -0.0772523f, -0.310786f, 0.188257f, -0.0325804f, + -0.240098f, 0.158748f, -0.265264f, 3.19593f, -0.449251f, + -1.33102f, -0.482856f, -0.435731f, 0.300808f, 
0.346503f, + 2.67378f, -0.152379f, 0.219322f, -0.146119f, -0.0584806f, + -0.0276895f, -0.21955f, -0.479179f, -0.689545f, 0.152799f + }; + +static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias[] = { + -0.296575f, 0.101072f, -0.208429f, 0.111585f, 0.699552f, -0.379484f, + 0.313244f, -0.746369f, 0.867757f, 0.457318f, -0.0190943f, -0.290745f, + 0.45592f, -0.160465f, -0.634243f, 0.0829737f +}; + +static const float + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel[] = { + 0.27511f, -2.14172f, 1.25755f, -0.554772f, 0.589508f, + 0.228307f, 0.0754914f, 1.07061f, 0.293323f, 0.65162f, + -0.272016f, -1.33519f, -0.606759f, -0.57827f, 0.368807f, + -1.48668f, 0.162439f, 0.0821667f, 0.225535f, -0.795996f, + 0.0328293f, 0.975476f, -0.187514f, 2.47069f, -1.5638f, + -0.461524f, 0.00310062f, 1.1556f, -0.286206f, 0.00426021f, + 0.585836f, 0.900007f, 0.384055f, 0.189435f, -0.157291f, + -0.0710573f, -0.0663986f, -0.710772f, -0.669136f, -0.379493f, + -1.2634f, -0.377524f, 0.824094f, 0.312308f, 0.125368f, + -0.382737f, 0.637109f, 0.61907f, -0.741184f, 0.00257198f, + -0.0151343f, -0.669826f, -0.439855f, 0.564852f, -0.0588036f, + -1.38123f, -1.1126f, 0.701831f, 0.198686f, 0.266866f, + 0.270172f, -0.692401f, 0.272533f, -1.70914f, 0.66064f, + 0.0886659f, -0.132233f, 0.270531f, -0.479581f, 0.704338f, + -0.307039f, -0.111792f, -2.05753f, -0.231749f, 0.300528f, + 0.383266f, -0.130857f, -0.373944f, 1.21025f, 0.704655f, + -0.589422f, 0.267185f, -0.109065f, -0.195991f, 0.20209f, + -0.0676526f, -0.183926f, 0.164894f, 0.0877923f, 0.565943f, + -0.0610466f, -0.86354f, -0.80853f, -0.176111f, -1.45016f, + -2.29078f, -0.124524f, -0.139305f, -0.187858f, -0.0250151f, + -0.572544f, 0.185336f, -0.69275f, -0.430354f, -0.30861f, + -0.754258f, -0.468221f, -0.160487f, -0.766692f, -0.636418f, + -0.71016f, 0.576125f, -0.240476f, -0.954556f, -0.104693f, + 0.155557f, -0.840224f, -0.685457f, -0.0346927f, -0.644882f, + -1.92475f, -0.314544f, 0.463569f, 0.323569f, -0.990124f, + -0.213658f, 0.407183f, 1.19797f, -4.77004f, -0.0613379f, + -2.40345f, -0.0591791f, -0.477622f, -0.303556f, 0.104077f, + -0.974128f, -0.035172f, 1.47064f, 0.233727f, -0.0754056f, + 0.158553f, 0.0614361f, -1.38865f, 0.690729f, 0.568455f, + 0.205866f, -0.0236852f, -0.0921077f, -0.538954f, 0.336613f, + -0.427115f, 0.791754f, -1.819f, -0.404432f, 0.670242f, + -0.0343869f, -0.37191f, 0.0271262f, 0.988161f, -0.547343f, + 0.925304f, 0.548079f, -0.430343f, -0.214109f, 0.242013f, + 1.39027f, 0.37648f, -1.63524f, -0.158864f, -0.572779f, + -0.766801f, -2.62032f, 0.47799f, -1.12025f, -0.115283f, + 1.22349f, -0.262132f, -0.151274f, 0.390483f, -0.496482f, + 1.06166f, -0.183052f, 0.54647f, 0.847486f, 0.0229506f, + 0.653309f, -0.020736f, -1.27453f, 0.48386f, -0.366625f, + -0.515725f, -1.31196f, 0.140701f, -0.183636f, 0.000413912f, + 0.300993f, -0.849529f, -0.59764f, -0.212992f, -0.933365f, + -1.4054f, -0.091982f, 0.41695f, 0.264004f, -0.26379f, + -0.0738219f, 0.434052f, 1.16617f, -0.639624f, -0.146465f, + 0.0409936f, -0.900182f, 0.73517f, 0.805746f, -0.208088f, + 1.74459f, -0.0592751f, 0.624865f, -0.62325f, -0.446315f, + 0.150526f, 0.0526697f, 0.374254f, -0.658043f, 1.02623f, + -0.941758f, 0.381217f, -0.359448f, 0.160051f, 0.556455f, + 0.239382f, 0.75851f, 0.437583f, -0.122221f, 0.746136f, + 0.218286f, -0.426729f, 0.0353903f, -0.830513f, -0.877586f, + 0.488077f, -0.132354f, -0.180756f, 0.736163f, -0.202934f, + -0.882534f, 0.166305f, 0.183122f, 0.0599858f, 0.442687f, + 0.0522908f, -1.17755f, -1.03733f, 0.392363f, 0.672718f, + -1.44704f, 0.360623f, 0.390298f, 
-0.213968f, 0.169783f, + -0.717536f, -0.830984f, -0.445049f, 0.196772f, -0.730634f, + -1.09497f, 0.344012f, -0.292802f, -0.67966f, 0.138515f, + -0.361803f, 0.936778f, -0.189802f, 0.197777f, -0.367507f, + -0.293653f, 0.447759f, -0.409245f, -0.687568f, -0.431301f, + -0.271234f, -0.585413f, -0.936414f, -0.396049f, -0.29388f, + -0.0930843f, 0.0179339f, 0.262463f, -0.166598f, 0.0171466f, + -0.329641f, 0.39343f, 0.657445f, -0.579052f, -0.312444f, + -0.0915881f, -0.432622f, -0.247645f, 0.485749f, -0.602508f, + -0.347936f, 0.287353f, 0.288705f, 0.168397f, 0.568228f, + -0.493586f, 1.04155f, -0.097956f, 0.658928f, -0.561007f, + 0.0457783f, 2.12744f, 0.182683f, -0.690282f, 0.183302f, + 0.0309499f, -0.722251f, 0.0660448f, -0.333277f, 0.198929f, + -0.724102f, -0.405597f, 0.614868f, -0.292862f, 0.886513f, + 0.142353f, -1.48934f, -0.97273f, 0.199683f, 0.522121f, + 0.0877478f, -0.172593f, -1.58858f, 0.113191f, -0.436178f, + 0.640895f, -0.504676f, 0.0658654f, -0.361301f, 0.604323f, + 0.315196f, -0.423021f, -0.323484f, -0.563163f, 0.118989f, + -0.404508f, -0.0550995f, -0.0359236f, -0.126574f, -0.357288f, + -0.0494502f, 1.04959f, -0.31646f, -0.0376684f, -0.300744f, + -0.135016f, 0.102696f, -0.392333f, -1.17502f, 0.505227f, + 0.337608f, -0.348831f, -0.420815f, 0.202791f, -0.154264f, + -0.563686f, 0.0942187f, 0.353862f, 0.0303509f, -0.132794f, + 0.420746f, 0.143529f, 0.455822f, -1.28348f, -1.35662f, + -0.850688f, -1.76361f, -0.717546f, 0.443111f, 0.227155f, + -0.863307f, -0.452033f, -0.278151f, 1.86233f + }; + +static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias[] = { + -0.103218f, -0.359587f, 0.619666f, -0.473497f, -0.649803f, 0.86992f, + -0.115561f, 0.335114f, -0.285044f, -0.59295f, 0.24497f, 0.611583f, + 0.38568f, 0.137913f, -0.281191f, -0.0107777f, 0.487236f, -0.262363f, + 0.696962f, 0.121565f, 0.312511f, 0.430916f, 0.694134f, 0.393632f +}; + +static const float av1_intra_mode_cnn_partition_branch_2_logits_kernel[] = { + -2.42496f, -1.239f, 0.832673f, 1.56923f, -2.6175f, -1.42492f, + -0.311387f, -1.94237f, 0.54071f, -2.50391f, 0.352205f, -0.96572f, + 1.47144f, -2.04702f, -1.12372f, -0.709186f, 0.812238f, 0.310389f, + 0.789163f, -0.65236f, 1.77018f, 0.273867f, 1.19506f, 1.07022f +}; + +static const float av1_intra_mode_cnn_partition_branch_2_logits_bias[] = { + 0.953424f +}; + +static const float + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel[] = { + 0.0485154f, 0.0496279f, 0.0268229f, -0.0584843f, -0.166928f, + 0.0316731f, -0.0895094f, -0.0433243f, -0.00893639f, -0.0886265f, + -0.0345622f, -0.235395f, -0.213754f, -0.00212398f, 0.0218857f, + -0.0054983f, -0.0248236f, 0.081822f, -0.0355708f, -0.0795593f, + -0.106995f, -0.0596378f, 0.0350686f, -0.133863f, -0.00582928f, + 0.114963f, 0.193906f, -0.00419085f, 0.0430529f, -0.128318f, + 0.0614715f, -0.000952935f, -0.0345722f, -0.109459f, 0.074204f, + -0.0865131f, 0.0649158f, -0.0942417f, -0.10122f, -0.047551f, + -1.27825f, -0.0125456f, -0.019722f, -0.152058f, 0.280306f, + -0.121231f, -0.0565484f, 0.0959188f, 0.0603919f, 0.0457468f, + 0.967589f, 0.105892f, -0.118326f, 0.198933f, 0.163437f, + -0.056824f, -0.0302956f, -0.07366f, -0.681407f, -0.0781575f, + 0.255732f, -0.0712105f, 0.177882f, 0.709206f, -0.232457f, + 1.33809f, -0.0328557f, 0.0572231f, -1.01361f, 0.130676f, + -0.205159f, 0.975398f, 0.356293f, 0.0766364f, -0.297397f, + -0.0261066f, -0.0933549f, 0.0568851f, -0.0123034f, -0.0433538f, + 0.131003f, 0.890705f, 0.0084565f, 0.00547395f, 0.00157634f, + 0.0047937f, -0.0511092f, 0.0300034f, -0.00604993f, -0.0133502f, + 
-0.000274302f, 0.129728f, -0.00532916f, 0.0855351f, 0.136885f, + 0.0175562f, -0.0123633f, -0.000512229f, -0.019924f, -0.0316328f, + 0.422972f, 0.0460336f, 0.0170841f, -0.00086795f, -0.0655137f, + 0.0287308f, -0.0375644f, -0.0329215f, -0.0273072f, 0.0241426f, + -0.0429052f, 0.0221593f, -0.063881f, -0.0347391f, -6.44339e-07f, + 0.0476934f, -0.0150068f, 0.0146403f, -0.0653099f, 0.0107635f, + 0.012407f, 0.0048935f, 1.50975f, 0.322256f, 0.17881f, + 0.0943775f, -0.100583f, -0.367022f, -0.156525f, -0.0397161f, + 0.0752784f, -0.00219022f, -0.887456f, 0.0153415f, -0.0148185f, + -0.56435f, 0.163996f, -0.0221024f, -0.0115872f, -0.0529284f, + 0.156838f, -1.13813f, -0.207863f, -0.00484959f, 0.135719f, + 0.131004f, 0.0417939f, 0.31453f, 0.121719f, -0.101515f, + 0.267951f, 0.219727f, 0.0398821f, 0.0713504f, 3.65918e-06f, + -0.00659998f, 0.477343f, -0.128426f, 0.0648877f, 0.111884f, + 0.224552f, 0.0617426f, 0.117742f, 0.031377f, 0.0586865f, + -0.459293f, 0.100211f, -0.14127f, 0.624412f, 0.014659f, + -1.41807f, -0.382452f, -0.695931f, -0.103153f, 0.145808f, + 0.333526f, -0.256367f, 0.096842f, 0.102458f, -0.181224f, + 0.729272f, 0.151177f, 1.46729f, 0.111044f, -4.28813f, + 0.0178379f, 0.47641f, -6.57533f, 0.0633335f, 0.496934f, + -0.154657f, -9.07298e-05f, 0.848937f, -5.40143f, 0.375685f, + 0.23586f, -0.166591f, -0.0191648f, -0.039862f, -3.25093f, + 0.168472f, -0.260317f, -5.51548f, 0.0575334f, 0.328979f, + 0.112644f, 0.231339f, -0.122641f, 0.0567331f, 1.19541f, + -0.038735f, 0.0630576f, 0.176668f, 0.0757184f, -0.833104f, + 0.133669f, 0.982669f, 0.0311783f, 0.0908558f, -0.10065f, + -0.0386599f, -0.231587f, -0.83876f, -0.347148f, 0.225529f, + -1.29625f, 0.0806834f, 0.369648f, -1.63367f, 0.118057f, + -0.311948f, 0.95022f, -0.354807f, -0.648657f, -1.72048f, + 0.260397f, 0.915555f, 0.057737f, -0.162019f, -0.453543f, + -1.70388f, -0.311632f, -0.731593f, -0.678089f, 0.10438f, + -0.293911f, 0.144864f, 0.039212f, 0.0289241f, -0.0685266f, + 0.634592f, -0.0798614f, -0.119197f, -0.00517433f, -0.04653f, + -0.127568f, -0.0582645f, 0.0735302f, -0.0946823f, 0.00865585f, + 0.0115748f, 0.0194847f, 0.0455664f, 0.181006f, -0.0824601f, + 0.0869093f, 0.264767f, -0.0750432f, 0.135136f, 0.316511f, + 0.399015f, 0.0994808f, -0.166944f, -0.102126f, 0.457858f, + 0.300488f, 0.467582f, 0.830244f, -0.0511439f, -0.522892f, + -0.183049f, 0.2626f, 0.118382f, 0.241674f, 0.250399f, + -0.0963507f, -0.83231f, -0.227699f, -0.133314f, 0.231718f, + -0.0700274f, 0.891311f, 0.224742f, -0.572836f, 0.402798f, + -0.191576f, 0.740922f, -0.00374073f, 0.658178f, -0.209364f, + -0.416259f, 0.166297f, 0.0095577f, -0.0876076f, 0.424954f, + 0.265226f, -0.129343f, -0.203146f, -0.194637f, -0.818142f, + -0.164152f, -0.368962f, 0.273373f, 0.599927f, -0.19859f, + 0.0939651f, -0.12458f, -0.751816f, -0.302997f, -0.139176f, + -0.372737f, 0.332704f, -0.206045f, -0.00593763f, -0.452363f, + -0.2704f, -0.198846f, 0.0976308f, -0.216124f, 0.110122f, + -0.220342f, 0.00763426f, -0.0272775f, -0.190395f, -0.0359411f, + -0.0395759f, 0.000941162f, -1.49959f, 0.0914233f, 0.448346f, + -0.420435f, -0.0102102f, -0.0757978f, -0.0177687f, -0.0231492f, + -0.142125f, 1.31774f, 0.0269368f, 0.134566f, 0.152079f, + -0.139933f, 0.139226f, -0.214467f, -0.194446f, -0.555893f, + 0.271197f, -0.111047f, 0.0888069f, -0.198121f, 0.0871713f, + 0.100612f, 0.429782f, -0.3787f, 0.123147f, -0.12538f, + 0.235678f, 0.139237f, 0.223326f, 0.85806f, -0.00554756f, + 0.285095f, 0.0954683f, 0.0464989f, 0.100806f, -0.0211297f, + 0.121672f, 0.242473f, 0.0810475f, -0.834356f, 0.119629f, + 0.111338f, -0.227126f, 
0.159296f, -0.0584685f, -0.108265f, + -0.0909221f, -0.21749f, 0.0929309f, -0.176815f, 0.178067f, + -0.0025905f, 0.317883f, 0.313045f, 0.26774f, -0.589329f, + -1.19882f, -0.285513f, -0.109478f, 0.309441f, -0.0604479f, + 0.947461f, -0.142342f, -0.9086f, -0.814788f, 0.184588f, + -0.0736317f, 0.276237f, 0.13132f, -0.3931f, -0.381744f, + -0.0122719f, 0.0246101f, -0.0920412f, 0.11331f, -0.110355f, + 0.00848064f, 0.0931248f, -0.0638655f, -4.30869e-05f, -0.300367f, + 0.0489508f, 0.464441f, -0.0466243f, -0.0137732f, 0.0099241f, + -0.223972f, 0.188966f, -0.653173f, -0.354322f, 0.189237f, + -0.624276f, -1.46218f, -0.075161f, -0.516172f, 0.40993f, + 0.291178f, -1.95088f, -0.0352157f, 0.196354f, -0.335897f, + 0.0857039f, 0.605319f, -1.12923f, -0.638387f, 1.41868f, + 0.0955757f, -0.00913477f, 0.315935f, -0.671223f, -0.851436f, + -0.157464f, -0.296763f, 0.182277f, -0.139309f, 0.232789f, + 0.869562f, 0.248894f, 0.242709f, 0.195479f, 0.106153f, + 0.358881f, 0.167443f, 0.982987f, 0.104767f, -0.033925f, + -0.0263185f, 0.0045304f, 0.0722479f, -0.111307f, 0.00128896f, + 0.406128f, -0.00944947f, 0.121592f, 0.546284f, -0.00175696f, + 0.776588f, 0.238846f, 0.064469f, 0.27082f, 0.269187f, + 0.0294455f, 0.62364f, -0.27872f, -0.0488013f, 0.229024f, + 0.154457f, 0.0445898f, 0.349943f, 0.0710998f, 0.0820674f, + 0.0279449f, 0.172826f, -0.122156f, -0.164688f, 0.0292124f, + 0.0496112f, -0.741762f, 0.0673926f, 0.108159f, -0.0942327f, + -0.0562883f, 0.558231f, 0.0552399f, 0.211393f, 0.0376817f, + -0.275788f, 0.0548436f, 0.212732f, 0.163603f, 0.0663363f, + -0.0252315f, 0.164533f, 0.0826088f, 0.0301389f, 0.345705f, + -0.0378046f, -0.139581f, 1.30162f, 1.23551f, -0.446693f, + 0.682534f, -0.0831157f, -0.0121595f, 1.50505f, 0.0839017f, + -0.953413f, 0.0820985f, -0.125556f, 0.699796f, -0.140453f, + 0.168438f, -0.110966f, 0.173806f, 0.114683f, 0.132502f, + -0.0453539f, -0.133096f, 0.511947f, -0.180657f, -0.0298605f, + 0.291437f, -0.0275017f, -0.229703f, -0.0504205f, 0.559622f, + 0.384601f, 0.111024f, -0.0773559f, -0.0591752f, -0.0866182f, + -0.189437f, -0.262345f, -0.0372182f, 0.149925f, 0.154644f, + -0.188298f, 0.236949f, -0.199328f, -0.378909f, -0.680128f, + 0.277184f, -0.172784f, 0.184717f, -0.23899f, 0.0712069f, + 0.0235425f, 0.4225f, -0.441487f, 0.177434f, -0.298303f, + 0.295696f, 0.17346f, 0.220542f, -0.680116f, 0.00266223f, + -0.0408459f, -0.15486f, 0.24335f, 0.237258f, -0.0283245f, + 0.19703f, -0.100027f, 0.0554843f, -1.03081f, 0.151745f, + 0.538582f, 0.370368f, 0.196683f, 0.0222123f, -0.0831401f, + -0.0832803f, -0.286743f, -0.686003f, 0.0995004f, 0.148901f, + -0.0436037f, -0.316508f, 0.00391835f, -0.228452f, 0.940058f, + 0.520047f, -0.334211f, 0.652142f, -0.0755971f, 0.0965123f, + -0.98191f, 0.394096f, -0.420466f, 0.327284f, -0.134651f, + 0.849297f, -0.523372f, 0.010327f, 0.133636f, 0.298119f, + -0.257389f, 0.0376153f, -0.198298f, 0.0736235f, 0.608809f, + 0.0291836f, -0.290005f, -0.141316f, 0.0184599f, 0.0554437f, + 0.0621519f, 0.485276f, 0.617062f, -0.0924811f, -0.0120834f, + 0.0817611f, 0.100421f, -0.0153553f, -0.135958f, -0.0185322f, + -0.395803f, -0.204862f, 0.547916f, -0.438117f, 0.0229788f, + 0.406981f, 0.795584f, -2.02756f, -0.8355f, -0.386789f, + 0.00968368f, 1.2147f, -0.740869f, -1.18415f, -0.954918f, + -0.541142f, 0.0596003f, 0.107189f, -0.411708f, -0.964593f, + 0.511906f + }; + +static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias[] = { + -0.485545f, 0.131552f, 0.796833f, -0.157582f, -0.0948124f, 0.00818613f, + -0.485562f, 0.3826f, -0.0839326f, 0.170998f, 0.279545f, -0.287143f, + 
0.184986f, -0.0719864f, 0.19748f, 0.404145f +}; + +static const float + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel[] = { + 1.30172f, 0.720189f, 0.261675f, -0.466201f, 1.21773f, + 0.495525f, 0.62398f, 0.44567f, -0.330993f, -0.269798f, + 0.835161f, -0.294874f, 0.186981f, 0.0162467f, 0.367654f, + 0.658468f, 1.08325f, 1.01558f, 0.12783f, -0.280581f, + 2.2204f, 0.0337286f, -0.403649f, -0.230908f, -0.35188f, + 0.437712f, -0.103634f, -0.645929f, 1.17407f, 0.157385f, + 0.212438f, 1.41874f, 0.284242f, -0.493105f, 1.0703f, + 0.00632116f, 1.18222f, -0.26003f, 0.276795f, -0.823156f, + 0.29577f, -0.157467f, -0.18092f, 0.0237336f, 0.205715f, + -0.295679f, 0.165443f, -0.628279f, 1.00804f, 0.361232f, + 0.646155f, -0.028651f, 1.64317f, 0.334251f, -1.50713f, + -1.51685f, -0.488522f, 0.169694f, -0.593176f, -0.372682f, + -1.50223f, 0.35076f, -0.24641f, -0.237189f, 0.190502f, + -0.948191f, -0.303346f, 0.45108f, -0.794368f, -2.3116f, + 0.404008f, -2.67269f, -0.941992f, -0.45336f, 0.0655987f, + -0.288432f, 0.106068f, 0.286978f, 0.121403f, 0.462739f, + 0.0130292f, 0.240597f, -2.30983f, -0.453309f, -0.149335f, + 0.856424f, -0.186576f, 0.769961f, -0.0657097f, -0.976188f, + 0.972971f, -0.532728f, -0.699334f, -0.168803f, 0.361945f, + 0.950769f, 1.5368f, -0.223899f, 1.17547f, -0.281483f, + 0.533619f, 0.315344f, 0.0854543f, 0.464701f, 0.346828f, + 0.271794f, -0.0185388f, 0.109517f, 0.371662f, -0.10852f, + 0.244092f, 0.491959f, -0.750281f, 1.41865f, -3.51221f, + 0.298194f, -0.0790832f, -0.134158f, -0.424084f, 0.189593f, + -0.238361f, -0.407872f, -0.366222f, -0.606813f, -0.230498f, + 0.387248f, -0.102734f, -0.190544f, -1.43649f, 0.141338f, + -0.0438917f, 0.204628f, 1.57033f, 0.0366937f, -0.14733f, + 0.048198f, -0.122631f, 0.183354f, 0.0658753f, -0.243381f, + 0.0246889f, -0.768798f, -0.0644054f, 0.775073f, 1.63419f, + 0.491624f, 0.21898f, -0.358944f, 3.31304f, 0.0195916f, + 0.236174f, 0.530704f, 0.140124f, 0.0736778f, -0.27361f, + -0.598836f, -1.01659f, 0.361765f, 0.00455986f, -0.345222f, + 1.68731f, 0.764082f, 0.193555f, 0.322782f, 1.19801f, + 0.538935f, -0.0393231f, -0.0248292f, -0.151168f, 0.479879f, + -0.208582f, 0.22798f, 0.335473f, -0.00295455f, 0.139539f, + 0.400814f, 0.478307f, -0.189376f, 0.540084f, 0.466072f, + 0.920231f, 0.398774f, -0.472403f, -0.0431972f, -0.581665f, + -0.990058f, 0.258995f, -0.0148889f, 0.27105f, 0.340334f, + 0.223576f, -0.0405193f, -1.23888f, -1.45229f, -1.44543f, + -0.376146f, 0.132601f, -0.4064f, -0.583611f, -0.374588f, + 0.0659428f, 0.325652f, -0.338456f, 0.253767f, -0.0181164f, + 0.681732f, 0.222041f, 0.837496f, 1.09735f, 0.156328f, + 0.177236f, -0.702702f, 0.473689f, 0.322118f, 0.43343f, + 0.315441f, -0.40798f, 0.0811291f, 0.631431f, 0.361929f, + 0.0723276f, 0.0164498f, 0.0293847f, 0.156406f, -1.10453f, + 0.837977f, -1.03449f, -0.348408f, 1.71953f, -0.401765f, + 0.64272f, -0.182438f, -0.233954f, 0.364597f, 0.269177f, + -0.578512f, 0.397216f, 0.0425122f, -0.258728f, 1.41621f, + -0.688768f, 0.0944726f, 0.253163f, -0.989037f, 1.72726f, + 1.15976f, -0.0460612f, 0.534186f, -0.136814f, 0.49327f, + 0.115744f, -0.633052f, -0.433855f, -1.01874f, -0.324035f, + 0.489487f, 1.08696f, 0.836376f, -0.423477f, -0.421309f, + 1.07348f, 0.323266f, 0.717604f, 0.366422f, 0.32983f, + 0.336583f, 0.749292f, -0.210666f, 0.387101f, -0.583376f, + 0.0391101f, -1.07537f, 0.914591f, -0.51303f, 1.15023f, + -0.0378782f, 0.262889f, -0.841128f, 0.41619f, -0.669704f, + -0.109995f, 1.01825f, -0.194853f, 0.120739f, 0.627889f, + -0.00269221f, 0.751152f, -0.529865f, -1.50238f, 0.184521f, + 0.795464f, 0.106099f, 
1.83117f, 0.0883305f, 0.306844f, + -0.0671504f, -0.169306f, -0.214575f, -0.121606f, -0.234965f, + 0.109752f, -0.35831f, -0.07894f, 0.497203f, -2.63013f, + 0.815608f, -0.193593f, -0.62292f, 0.338941f, 0.0970922f, + -0.531178f, 0.723346f, 0.35063f, 0.182647f, -0.257013f, + 0.784924f, -0.217915f, -0.0797363f, -0.399706f, -0.485602f, + 1.23155f, 0.345998f, 0.322949f, -0.168196f, -0.173313f, + 0.282205f, 0.45117f, 0.918706f, -0.046172f, -0.0873883f, + 0.56103f, -0.485768f, 0.546199f, 0.254997f, 0.394296f, + 0.607178f, 0.667532f, -0.343883f, 0.374402f, -0.531439f, + 2.27782f, -1.13255f, 0.505867f, -0.514742f, 0.998571f, + -1.60984f, -0.172873f, -0.0604094f, 0.719791f, -0.733982f, + 0.348905f, 1.39008f, -0.895343f, -0.677064f, -1.84221f, + 0.0434018f, -0.534794f, 0.0434753f, -0.266576f, 0.268099f, + -0.242935f, 0.00166289f, 0.0263789f, -0.224794f, -0.113493f, + -0.236397f, 0.0879936f, 0.510895f, -0.511789f, -1.48962f, + -2.78268f, -0.0495784f, -0.0343907f, 0.440459f, -0.364209f, + 0.833223f, -0.0589337f, 0.00181418f, 0.455499f, 0.101762f, + -1.16424f, 0.270405f, 0.219033f, -4.91105f + }; + +static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias[] = { + -0.40114f, -0.372342f, -0.216186f, -0.240014f, -0.341773f, -0.344489f, + -0.113037f, 0.198479f, 0.482958f, -0.630072f, -0.728704f, -0.171963f, + 0.519883f, 0.253003f, -0.121618f, -0.0569875f, -0.485568f, -0.147577f, + 0.533305f, -0.587251f, -0.120837f, -0.483953f, 0.445641f, -0.125136f +}; + +static const float av1_intra_mode_cnn_partition_branch_3_logits_kernel[] = { + -1.57431f, -1.09069f, 1.67996f, -0.669702f, 0.499807f, -3.03145f, + -0.878135f, 0.637818f, -1.58419f, -3.79756f, 0.62755f, -0.446646f, + 0.653269f, -0.667854f, -2.19774f, -3.53349f, 2.6107f, -0.685892f, + -1.2603f, -0.89707f, -0.715551f, 0.382202f, 2.09574f, 0.469386f +}; + +static const float av1_intra_mode_cnn_partition_branch_3_logits_bias[] = { + -0.022787f +}; + +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_0_dnn_config = { + BRANCH_0_NUM_DNN_FEATURES, + BRANCH_0_NUM_LOGITS, + BRANCH_0_NUM_DNN_LAYERS, + { + BRANCH_0_NUM_DNN_LAYER_0_UNITS, + BRANCH_0_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_0_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_0_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_1_dnn_config = { + BRANCH_1_NUM_DNN_FEATURES, + BRANCH_1_NUM_LOGITS, + BRANCH_1_NUM_DNN_LAYERS, + { + BRANCH_1_NUM_DNN_LAYER_0_UNITS, + BRANCH_1_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_1_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_1_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_2_dnn_config = { + BRANCH_2_NUM_DNN_FEATURES, + BRANCH_2_NUM_LOGITS, + BRANCH_2_NUM_DNN_LAYERS, + { + BRANCH_2_NUM_DNN_LAYER_0_UNITS, + BRANCH_2_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_2_logits_kernel, + }, + { + 
av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_2_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_3_dnn_config = { + BRANCH_3_NUM_DNN_FEATURES, + BRANCH_3_NUM_LOGITS, + BRANCH_3_NUM_DNN_LAYERS, + { + BRANCH_3_NUM_DNN_LAYER_0_UNITS, + BRANCH_3_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_3_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_3_logits_bias, + }, +}; + +#undef NUM_DNN_BRANCHES +#undef NUM_CNN_LAYERS +#undef BRANCH_0_NUM_DNN_LAYERS +#undef BRANCH_1_NUM_DNN_LAYERS +#undef BRANCH_2_NUM_DNN_LAYERS +#undef BRANCH_3_NUM_DNN_LAYERS +#undef CNN_LAYER_0_HEIGHT +#undef CNN_LAYER_0_WIDTH +#undef CNN_LAYER_0_IN_CH +#undef CNN_LAYER_0_OUT_CH +#undef CNN_LAYER_0_HORZ_STRIDE +#undef CNN_LAYER_0_VERT_STRIDE +#undef CNN_LAYER_1_HEIGHT +#undef CNN_LAYER_1_WIDTH +#undef CNN_LAYER_1_IN_CH +#undef CNN_LAYER_1_OUT_CH +#undef CNN_LAYER_1_HORZ_STRIDE +#undef CNN_LAYER_1_VERT_STRIDE +#undef CNN_LAYER_2_HEIGHT +#undef CNN_LAYER_2_WIDTH +#undef CNN_LAYER_2_IN_CH +#undef CNN_LAYER_2_OUT_CH +#undef CNN_LAYER_2_HORZ_STRIDE +#undef CNN_LAYER_2_VERT_STRIDE +#undef CNN_LAYER_3_HEIGHT +#undef CNN_LAYER_3_WIDTH +#undef CNN_LAYER_3_IN_CH +#undef CNN_LAYER_3_OUT_CH +#undef CNN_LAYER_3_HORZ_STRIDE +#undef CNN_LAYER_3_VERT_STRIDE +#undef CNN_LAYER_4_HEIGHT +#undef CNN_LAYER_4_WIDTH +#undef CNN_LAYER_4_IN_CH +#undef CNN_LAYER_4_OUT_CH +#undef CNN_LAYER_4_HORZ_STRIDE +#undef CNN_LAYER_4_VERT_STRIDE +#undef BRANCH_0_NUM_DNN_FEATURES +#undef BRANCH_0_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_0_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_0_NUM_LOGITS +#undef BRANCH_1_NUM_DNN_FEATURES +#undef BRANCH_1_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_1_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_1_NUM_LOGITS +#undef BRANCH_2_NUM_DNN_FEATURES +#undef BRANCH_2_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_2_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_2_NUM_LOGITS +#undef BRANCH_3_NUM_DNN_FEATURES +#undef BRANCH_3_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_3_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_3_NUM_LOGITS + +static const float av1_intra_mode_cnn_partition_split_thresh_hdres[5] = { + 100.000000f, 4.750139f, 1.655964f, 3.711212f, 0.963839f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_hdres[5] = { + -100.000000f, -2.404842f, -3.858223f, -2.041206f, -1.573735f, +}; + +static const float av1_intra_mode_cnn_partition_split_thresh_midres[5] = { + 100.000000f, 3.218737f, 2.657764f, 0.868458f, 2.454447f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_midres[5] = { + -100.000000f, -3.842426f, -4.005076f, -3.642994f, -2.467197f, +}; + +static const float av1_intra_mode_cnn_partition_split_thresh_lowres[5] = { + 100.000000f, 1.890757f, 2.658417f, 1.450626f, 1.833180f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_lowres[5] = { + -100.000000f, -4.100921f, -4.564202f, -5.695176f, -1.483546f, +}; + +static const float av1_intra_mode_cnn_partition_mean[1] = { + 1.191922f, +}; + +static const float av1_intra_mode_cnn_partition_std[1] = { + 1.730044f, +}; + +static const int quad_to_linear_0[1] = { 0 }; +static const int quad_to_linear_1[4] = { 0, 1, 2, 3 }; +static const int quad_to_linear_2[16] = { 0, 1, 4, 5, 2, 
3, 6, 7, + 8, 9, 12, 13, 10, 11, 14, 15 }; +static const int quad_to_linear_3[64] = { + 0, 1, 8, 9, 2, 3, 10, 11, 16, 17, 24, 25, 18, 19, 26, 27, + 4, 5, 12, 13, 6, 7, 14, 15, 20, 21, 28, 29, 22, 23, 30, 31, + 32, 33, 40, 41, 34, 35, 42, 43, 48, 49, 56, 57, 50, 51, 58, 59, + 36, 37, 44, 45, 38, 39, 46, 47, 52, 53, 60, 61, 54, 55, 62, 63 +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h new file mode 100644 index 0000000000..71c1ace782 --- /dev/null +++ b/third_party/aom/av1/encoder/partition_model_weights.h @@ -0,0 +1,5646 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// TODO(chiyotsai@google.com): The performance of these models are getting worse +// due the changes in the encoder. We should retrain the models here to get +// better performance once we have the time. + +#define FEATURE_SIZE 10 +#define LABEL_SIZE 16 +// nn model for ab partition pruning, 128x128. +static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = { + -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f, + 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f, + 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f, + -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f, + -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f, + -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f, + 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f, + 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f, + 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f, + 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f, + -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f, + -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f, + 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f, + 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f, + 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f, + 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f, + 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f, + -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f, + 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f, + -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f, + -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f, + -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f, + -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f, + -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f, + 0.033223f, 0.977341f, -0.372982f, 
-0.052337f, 0.154124f, 0.396787f, + 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f, + 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f, + 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f, + -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f, + -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f, + -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f, + -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f, + 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f, + 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f, + 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f, + 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f, + -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f, + 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f, + -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f, + 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f, + 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f, + 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f, + 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f, + -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f, + -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f, + 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f, + -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f, + 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f, + -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f, + -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f, + -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f, + -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f, + -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f, + -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f, + 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f, + 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f, + 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f, + -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f, + -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f, + 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f, + -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f, + 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f, + 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f, + -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f, + -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f, + 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f, + -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f, + 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f, + -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f, + -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f, + 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f, + -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f, + 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f, + 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f, + -0.121620f, -0.205517f, -0.180144f, 
-0.264208f, 0.151500f, -0.159378f, + 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f, + 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f, + -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f, + 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f, + -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f, + 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f, + -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f, + 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f, + -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f, + 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f, + -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f, + -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f, + -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f, + 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f, + -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f, + -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f, + -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f, + 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f, + 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f, + -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f, + 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f, + -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f, + 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f, + 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f, + -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f, + 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f, + -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f, + 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f, + 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f, + -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f, + 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f, + 0.420104f, -0.072042f, -0.006404f, 0.171680f, +}; + +static const float av1_ab_partition_nn_bias_128_layer0[64] = { + 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f, + -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f, + 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f, + -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f, + 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f, + 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f, + 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f, + 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f, + 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f, + 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f, + 0.853918f, 0.002504f, -0.190403f, 0.452050f, +}; + +static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = { + 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f, + -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f, + 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f, + 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f, + 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f, 
+ -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f, + -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f, + -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f, + -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f, + 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f, + -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f, + 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f, + 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f, + 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f, + 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f, + 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f, + 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f, + 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f, + -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f, + 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f, + 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f, + -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f, + 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f, + 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f, + -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f, + -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f, + -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f, + 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f, + 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f, + 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f, + -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f, + -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f, + 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f, + 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f, + 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f, + 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f, + -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f, + -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f, + 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f, + 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f, + 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f, + 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f, + 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f, + -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f, + -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f, + -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f, + 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f, + -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f, + -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f, + 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f, + -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f, + -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f, + 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f, + -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f, + -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f, + 
-0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f, + 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f, + 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f, + 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f, + -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f, + -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f, + -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f, + 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f, + -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f, + -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f, + 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f, + -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f, + -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f, + -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f, + -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f, + -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f, + 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f, + 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f, + -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f, + -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f, + 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f, + -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f, + -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f, + 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f, + 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f, + 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f, + -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f, + 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f, + -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f, + -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f, + -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f, + 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f, + 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f, + -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f, + 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f, + 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f, + -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f, + -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f, + -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f, + -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f, + 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f, + -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f, + 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f, + -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f, + 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f, + 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f, + 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f, + -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f, + 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f, + -0.521336f, 0.176206f, 0.142733f, 0.139248f, 
0.411682f, 0.181544f, + 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f, + -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f, + 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f, + -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f, + -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f, + -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f, + -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f, + -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f, + 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f, + -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f, + -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f, + 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f, + -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f, + -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f, + -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f, + -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f, + -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f, + -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f, + -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f, + -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f, + -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f, + -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f, + -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f, + -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f, + -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f, + -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f, + -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f, + -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f, + -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f, + -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f, + 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f, + 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f, + -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f, + 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f, + -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f, + -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f, + -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f, + 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f, + -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f, + -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f, + -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f, + 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f, + -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f, + -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f, + -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f, + 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f, + -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f, + -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f, + -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f, + 0.533229f, 
-0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f, + 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f, + -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f, + 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f, + 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f, + 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f, + 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f, + -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f, + -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f, + -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f, + 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f, + -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f, + 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f, + 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f, + 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f, + -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f, + 0.823643f, -0.119781f, -0.098359f, 0.093119f, +}; + +static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = { + -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f, + -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f, + 0.325655f, -0.107123f, 0.591049f, 0.358744f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_128_layer0, + av1_ab_partition_nn_weights_128_layer1, + }, + { + av1_ab_partition_nn_bias_128_layer0, + av1_ab_partition_nn_bias_128_layer1, + }, +}; + +// nn model for ab partition pruning, 64x64. 
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = { + -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f, + -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f, + -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f, + -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f, + 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f, + -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f, + -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f, + 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f, + 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f, + -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f, + 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f, + 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f, + -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f, + 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f, + -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f, + 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f, + 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f, + -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f, + -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f, + -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f, + 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f, + 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f, + -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f, + -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f, + -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f, + -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f, + 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f, + 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f, + 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f, + -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f, + -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f, + -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f, + -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f, + -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f, + -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f, + -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f, + -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f, + -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f, + 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f, + -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f, + -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f, + -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f, + -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f, + -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f, + -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f, + 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f, + -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f, + -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f, + 
0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f, + -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f, + -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f, + -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f, + -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f, + -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f, + -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f, + -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f, + -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f, + 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f, + 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f, + -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f, + -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f, + -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f, + -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f, + -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f, + 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f, + -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f, + -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f, + 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f, + 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f, + 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f, + -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f, + -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f, + -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f, + -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f, + -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f, + -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f, + -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f, + -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f, + 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f, + -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f, + -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f, + -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f, + 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f, + -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f, + -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f, + -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f, + -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f, + -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f, + 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f, + -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f, + 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f, + -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f, + -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f, + 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f, + 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f, + -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f, + -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f, + 
-1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f, + 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f, + 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f, + -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f, + -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f, + -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f, + -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f, + -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f, + -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f, + 0.230343f, -0.034318f, -0.022687f, -0.047090f, +}; + +static const float av1_ab_partition_nn_bias_64_layer0[64] = { + -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f, + -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f, + -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f, + -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f, + 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f, + -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f, + -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f, + 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f, + -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f, + -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f, + -0.125366f, -0.063576f, -0.212169f, -0.047463f, +}; + +static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = { + -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f, + -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f, + 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f, + -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f, + 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f, + 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f, + 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f, + -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f, + 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f, + 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f, + 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f, + -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f, + 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f, + -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f, + -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f, + -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f, + 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f, + 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f, + 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f, + -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f, + 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f, + 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f, + 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f, + 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f, + 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f, + -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f, + -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f, + 
-0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f, + 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f, + -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f, + -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f, + -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f, + -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f, + 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f, + -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f, + 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f, + -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f, + -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f, + 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f, + 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f, + -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f, + -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f, + 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f, + 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f, + 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f, + 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f, + -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f, + -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f, + 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f, + 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f, + -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f, + 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f, + -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f, + -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f, + -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f, + -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f, + -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f, + -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f, + -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f, + -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f, + -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f, + -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f, + -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f, + -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f, + -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f, + 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f, + 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f, + -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f, + 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f, + 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f, + -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f, + 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f, + 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f, + -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f, + -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f, + -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f, + 3.613420f, -1.525951f, 
-0.026738f, 0.209150f, -2.103534f, 2.019689f, + -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f, + -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f, + -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f, + 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f, + 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f, + -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f, + 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f, + 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f, + 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f, + -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f, + -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f, + -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f, + -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f, + -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f, + -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f, + -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f, + -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f, + -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f, + 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f, + -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f, + -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f, + -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f, + -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f, + -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f, + -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f, + -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f, + 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f, + 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f, + -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f, + 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f, + -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f, + 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f, + -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f, + -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f, + 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f, + -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f, + -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f, + 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f, + 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f, + -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f, + -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f, + 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f, + -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f, + -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f, + -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f, + -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f, + -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f, + -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f, + 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 
0.135496f, + -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f, + 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f, + -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f, + 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f, + 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f, + 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f, + -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f, + 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f, + -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f, + 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f, + 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f, + -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f, + -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f, + 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f, + 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f, + -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f, + -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f, + 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f, + 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f, + -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f, + -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f, + -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f, + -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f, + -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f, + -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f, + -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f, + 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f, + -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f, + -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f, + -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f, + -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f, + -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f, + 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f, + 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f, + -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f, + -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f, + -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f, + -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f, + -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f, + -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f, + -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f, + 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f, + 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f, + -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f, + -0.114126f, -0.193834f, -0.025759f, 0.263183f, +}; + +static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f, + -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f, + -0.872737f, 0.718723f, 0.703398f, 2.560015f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_64 = { + 
FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_64_layer0, + av1_ab_partition_nn_weights_64_layer1, + }, + { + av1_ab_partition_nn_bias_64_layer0, + av1_ab_partition_nn_bias_64_layer1, + }, +}; + +// nn model for ab partition pruning, 32x32. +static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = { + -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f, + -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f, + 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f, + 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f, + -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f, + 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f, + -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f, + 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f, + 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f, + 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f, + -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f, + -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f, + -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f, + -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f, + 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f, + -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f, + -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f, + 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f, + -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f, + -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f, + -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f, + -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f, + 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f, + -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f, + -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f, + -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f, + -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f, + 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f, + 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f, + -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f, + -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f, + -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f, + -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f, + -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f, + 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f, + -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f, + -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f, + -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f, + -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f, + -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f, + -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f, + 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f, + -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f, + -0.064569f, 
-0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f, + 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f, + -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f, + -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f, + 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f, + 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f, + -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f, + -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f, + -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f, + -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f, + -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f, + -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f, + 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f, + -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f, + -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f, + -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f, + -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f, + -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f, + -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f, + -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f, + -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f, + -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f, + -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f, + -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f, + 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f, + 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f, + -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f, + -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f, + -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f, + -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f, + 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f, + -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f, + -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f, + -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f, + -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f, + -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f, + -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f, + -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f, + -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f, + -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f, + -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f, + 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f, + -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f, + -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f, + -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f, + -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f, + -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f, + 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f, + -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f, + 
-0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f, + -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f, + 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f, + -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f, + -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f, + -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f, + -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f, + -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f, + 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f, + 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f, + -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f, + -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f, + -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f, + -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f, + -0.827145f, -0.225277f, 0.275800f, 1.696635f, +}; + +static const float av1_ab_partition_nn_bias_32_layer0[64] = { + -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f, + 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f, + -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f, + 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f, + 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f, + -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f, + 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f, + -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f, + -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f, + 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f, + -1.384604f, -0.201713f, -0.271948f, 0.372351f, +}; + +static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = { + -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f, + -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f, + -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f, + 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f, + 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f, + -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f, + 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f, + -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f, + 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f, + -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f, + 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f, + -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f, + -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f, + 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f, + -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f, + 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f, + 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f, + -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f, + 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f, + 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f, + -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f, + 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f, + 0.195889f, 
0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f, + 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f, + -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f, + 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f, + -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f, + 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f, + 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f, + 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f, + -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f, + -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f, + -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f, + 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f, + -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f, + -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f, + -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f, + -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f, + -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f, + -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f, + -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f, + 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f, + 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f, + 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f, + 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f, + 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f, + -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f, + 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f, + -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f, + -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f, + -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f, + -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f, + 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f, + -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f, + 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f, + -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f, + 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f, + -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f, + -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f, + 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f, + 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f, + -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f, + -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f, + -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f, + -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f, + 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f, + -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f, + -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f, + -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f, + -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f, + -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f, + -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 
0.140993f, + -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f, + 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f, + -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f, + -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f, + 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f, + -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f, + -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f, + -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f, + 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f, + -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f, + -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f, + 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f, + -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f, + -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f, + -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f, + 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f, + -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f, + -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f, + 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f, + 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f, + -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f, + -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f, + -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f, + -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f, + -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f, + 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f, + -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f, + -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f, + 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f, + -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f, + -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f, + -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f, + -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f, + 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f, + 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f, + -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f, + -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f, + -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f, + 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f, + -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f, + -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f, + -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f, + 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f, + -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f, + 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f, + 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f, + -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f, + -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f, + 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f, + 0.340622f, 
0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f, + 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f, + -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f, + 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f, + -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f, + -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f, + -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f, + -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f, + -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f, + 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f, + -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f, + -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f, + -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f, + -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f, + -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f, + -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f, + -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f, + -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f, + -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f, + -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f, + 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f, + -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f, + 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f, + -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f, + -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f, + -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f, + 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f, + -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f, + -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f, + -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f, + -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f, + 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f, + -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f, + -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f, + -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f, + 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f, + -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f, + -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f, + -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f, + -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f, + 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f, + 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f, + -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f, + 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f, + -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f, + -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f, + -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f, + -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f, + 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f, + -1.251640f, -0.353531f, 
-0.304009f, -0.153376f, +}; + +static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f, + 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f, + -0.010198f, 0.130597f, 1.276752f, 2.028188f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_32_layer0, + av1_ab_partition_nn_weights_32_layer1, + }, + { + av1_ab_partition_nn_bias_32_layer0, + av1_ab_partition_nn_bias_32_layer1, + }, +}; + +// nn model for ab partition pruning, 16x16. +static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = { + 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f, + 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f, + 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f, + 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f, + -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f, + 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f, + -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f, + 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f, + -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f, + 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f, + 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f, + 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f, + -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f, + 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f, + -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f, + -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f, + 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f, + -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f, + 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f, + -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f, + 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f, + -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f, + -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f, + -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f, + -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f, + -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f, + -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f, + -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f, + -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f, + -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f, + -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f, + -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f, + 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f, + -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f, + -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f, + 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f, + -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f, + -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f, + 0.146090f, -0.357530f, 0.097644f, -0.000932f, 
0.446603f, -0.066793f, + 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f, + -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f, + 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f, + -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f, + 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f, + 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f, + 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f, + -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f, + -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f, + -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f, + -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f, + 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f, + -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f, + 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f, + 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f, + -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f, + -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f, + -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f, + 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f, + 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f, + -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f, + -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f, + -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f, + -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f, + -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f, + -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f, + -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f, + -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f, + 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f, + -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f, + 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f, + -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f, + -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f, + -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f, + -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f, + -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f, + -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f, + -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f, + -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f, + -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f, + -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f, + 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f, + -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f, + 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f, + 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f, + 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f, + -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f, + -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f, + -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f, + -0.090675f, 
-0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f, + -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f, + -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f, + -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f, + 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f, + -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f, + -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f, + 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f, + -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f, + 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f, + 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f, + -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f, + -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f, + -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f, + -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f, + -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f, + -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f, + -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f, + -0.021087f, 0.110220f, -0.128671f, 0.044219f, +}; + +static const float av1_ab_partition_nn_bias_16_layer0[64] = { + 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f, + -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f, + 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f, + 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f, + -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f, + -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f, + -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f, + 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f, + -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f, + -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f, + 0.123809f, -0.109797f, 0.200510f, -0.147542f, +}; + +static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = { + -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f, + -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f, + -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f, + -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f, + 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f, + 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f, + 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f, + -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f, + 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f, + 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f, + -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f, + 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f, + -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f, + 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f, + 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f, + 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f, + -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f, + 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f, + -0.145549f, 0.075592f, 
0.172336f, 0.108175f, 0.333751f, 1.090501f, + 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f, + 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f, + -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f, + 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f, + 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f, + 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f, + 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f, + -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f, + -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f, + 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f, + 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f, + 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f, + 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f, + 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f, + -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f, + 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f, + 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f, + 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f, + -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f, + 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f, + 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f, + 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f, + -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f, + -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f, + -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f, + -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f, + -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f, + -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f, + 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f, + 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f, + -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f, + -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f, + 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f, + -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f, + -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f, + -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f, + -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f, + -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f, + -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f, + 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f, + 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f, + -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f, + -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f, + -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f, + -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f, + 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f, + 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f, + -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f, + 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f, + 0.133889f, -0.238673f, 
-0.094726f, 1.683835f, -0.215629f, -0.198890f, + -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f, + -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f, + -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f, + 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f, + 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f, + 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f, + 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f, + -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f, + 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f, + -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f, + -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f, + -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f, + -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f, + -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f, + 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f, + -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f, + 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f, + -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f, + 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f, + 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f, + -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f, + -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f, + 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f, + -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f, + -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f, + 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f, + -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f, + 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f, + -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f, + -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f, + -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f, + 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f, + 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f, + 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f, + 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f, + -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f, + -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f, + -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f, + 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f, + -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f, + 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f, + -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f, + -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f, + -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f, + 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f, + 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f, + -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f, + -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f, + -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f, + -0.295260f, 
0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f, + -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f, + -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f, + -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f, + -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f, + -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f, + -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f, + -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f, + 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f, + 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f, + 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f, + 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f, + -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f, + -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f, + 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f, + 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f, + 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f, + 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f, + -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f, + -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f, + -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f, + -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f, + -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f, + -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f, + -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f, + 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f, + -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f, + -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f, + 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f, + 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f, + -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f, + -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f, + -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f, + 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f, + 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f, + 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f, + 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f, + 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f, + -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f, + -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f, + -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f, + -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f, + 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f, + -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f, + -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f, + -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f, + -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f, + 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f, + -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f, + 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, 
-0.075932f, + -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f, + -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f, + 0.172790f, -0.172982f, 0.041258f, -0.299379f, +}; + +static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f, + 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f, + -0.658522f, 0.723479f, 0.544264f, 1.035225f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_16_layer0, + av1_ab_partition_nn_weights_16_layer1, + }, + { + av1_ab_partition_nn_bias_16_layer0, + av1_ab_partition_nn_bias_16_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#define FEATURE_SIZE 18 +#define LABEL_SIZE 4 + +static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = { + -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f, + 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f, + 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f, + 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f, + -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f, + -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f, + 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f, + 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f, + -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f, + -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f, + -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f, + -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f, + 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f, + 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f, + -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f, + -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f, + -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f, + -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f, + 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f, + -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f, + -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f, + -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f, + -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f, + -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f, + -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f, + 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f, + 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f, + -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f, + 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f, + -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f, + 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f, + 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f, + -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f, + -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f, + 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f, + -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 
0.064151f, + 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f, + -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f, + 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f, + -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f, + 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f, + 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f, + -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f, + 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f, + 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f, + 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f, + 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f, + -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f, + -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f, + -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f, + 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f, + 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f, + -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f, + -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f, + 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f, + -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f, + -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f, + -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f, + 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f, + -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f, + -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f, + 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f, + 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f, + 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f, + 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f, + -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f, + 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f, + -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f, + -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f, + -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f, + 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f, + 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f, +}; + +static const float av1_4_partition_nn_bias_16_layer0[24] = { + 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f, + -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f, + 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f, + -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f, +}; + +static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = { + -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f, + 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f, + -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f, + -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f, + 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f, + -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f, + -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f, + 0.120245f, 0.182920f, -1.117797f, -0.239594f, 
-0.296296f, -0.718093f, + 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f, + -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f, + 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f, + -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f, + 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f, + -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f, + -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f, + -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f, +}; + +static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.462133f, + 0.465060f, + 0.062211f, + 0.401786f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 24, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_16_layer0, + av1_4_partition_nn_weights_16_layer1, + }, + { + av1_4_partition_nn_bias_16_layer0, + av1_4_partition_nn_bias_16_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = { + -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f, + 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f, + -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f, + 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f, + -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f, + -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f, + -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f, + -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f, + -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f, + -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f, + 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f, + -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f, + -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f, + 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f, + -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f, + -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f, + -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f, + -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f, + -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f, + -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f, + 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f, + -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f, + -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f, + 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f, + 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f, + -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f, + 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f, + 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f, + -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f, + -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f, + -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f, + 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f, + -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f, + 0.731897f, 
-0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f, + -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f, + -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f, + 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f, + -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f, + 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f, + -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f, + -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f, + -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f, + -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f, + 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f, + 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f, + -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f, + -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f, + 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f, + 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f, + 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f, + 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f, + -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f, + 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f, + 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f, + -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f, + -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f, + -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f, + -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f, + -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f, + -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f, + -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f, + -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f, + 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f, + -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f, + 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f, + -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f, + 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f, + 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f, + 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f, + -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f, + 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f, + 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f, + 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f, + -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f, + 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f, + 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f, + -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f, + 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f, + -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f, + -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f, + -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f, + -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f, + -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 
0.181750f, + 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f, + -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f, + 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f, + -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f, + -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f, + 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f, + -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f, + -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f, + 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f, + -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f, + -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f, + -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f, + 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f, +}; + +static const float av1_4_partition_nn_bias_32_layer0[32] = { + 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f, + -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f, + -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f, + -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f, + -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f, + 0.109579f, -0.082685f, +}; + +static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = { + 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f, + 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f, + 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f, + -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f, + 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f, + 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f, + -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f, + 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f, + 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f, + 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f, + -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f, + 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f, + -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f, + -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f, + 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f, + -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f, + 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f, + 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f, + 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f, + 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f, + -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f, + -0.800926f, -0.134132f, +}; + +static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.019518f, + 0.198546f, + 0.339015f, + -0.261961f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_32_layer0, + av1_4_partition_nn_weights_32_layer1, + }, + { + av1_4_partition_nn_bias_32_layer0, + av1_4_partition_nn_bias_32_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = { + 
-0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f, + -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f, + 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f, + -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f, + -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f, + 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f, + 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f, + 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f, + 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f, + -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f, + -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f, + 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f, + -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f, + 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f, + -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f, + -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f, + 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f, + -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f, + 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f, + -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f, + -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f, + -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f, + -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f, + -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f, + -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f, + -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f, + -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f, + 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f, + 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f, + -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f, + -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f, + 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f, + -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f, + 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f, + -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f, + 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f, + 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f, + -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f, + -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f, + 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f, + 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f, + 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f, + 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f, + -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f, + -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f, + 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f, + -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f, + 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f, + -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f, + 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f, 
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f, + -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f, + 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f, + -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f, + -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f, + -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f, + -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f, + -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f, + 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f, + 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f, + -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f, + -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f, + -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f, + 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f, + 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f, + -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f, + -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f, + 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f, + 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f, + 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f, + -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f, + 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f, +}; + +static const float av1_4_partition_nn_bias_64_layer0[24] = { + 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f, + -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f, + -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f, + -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f, +}; + +static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = { + -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f, + 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f, + 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f, + -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f, + -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f, + 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f, + -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f, + 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f, + 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f, + -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f, + -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f, + -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f, + 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f, + -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f, + -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f, + -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f, +}; + +static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.478735f, + 0.292948f, + 0.293172f, + 0.040013f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 24, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_64_layer0, + 
av1_4_partition_nn_weights_64_layer1, + }, + { + av1_4_partition_nn_bias_64_layer0, + av1_4_partition_nn_bias_64_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#define FEATURE_SIZE 4 +static const float + av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = { + -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f, + -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f, + 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f, + -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f, + -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f, + -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f, + -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f, + -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f, + 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f, + 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f, + -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f, + -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f, + 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f, + -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f, + -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f, + -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f, + 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f, + -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f, + -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f, + -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f, + 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f, + -0.007193f, -0.257836f, + }; + +static const float av1_partition_breakout_nn_bias_128_layer0[32] = { + 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f, + -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f, + 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f, + 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f, + -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f, + 0.429660f, -8.439470f, +}; + +static const float av1_partition_breakout_nn_weights_128_layer1[32] = { + -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f, + 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f, + 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f, + -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f, + -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f, + -0.039662f, 0.131499f, +}; + +static const float av1_partition_breakout_nn_bias_128_layer1[1] = { + 0.86678213f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_128_layer0, + av1_partition_breakout_nn_weights_128_layer1, + }, + { + av1_partition_breakout_nn_bias_128_layer0, + av1_partition_breakout_nn_bias_128_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = { + 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f, + -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f, + 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f, + 0.003208f, 
-0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f, + -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f, + 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f, + 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f, + -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f, + 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f, + -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f, + -2.407131f, -0.062304f, 0.000874f, 0.108786f, + }; + +static const float av1_partition_breakout_nn_bias_64_layer0[16] = { + 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f, + -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f, + -0.337413f, 4.492778f, 0.000000f, 17.043072f, +}; + +static const float av1_partition_breakout_nn_weights_64_layer1[16] = { + -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f, + 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f, + -0.038572f, 0.307899f, -0.294283f, 0.118323f, +}; + +static const float av1_partition_breakout_nn_bias_64_layer1[1] = { + -1.33438122f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_64_layer0, + av1_partition_breakout_nn_weights_64_layer1, + }, + { + av1_partition_breakout_nn_bias_64_layer0, + av1_partition_breakout_nn_bias_64_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = { + -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f, + 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f, + -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f, + -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f, + -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f, + 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f, + 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f, + -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f, + -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f, + -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f, + -0.520814f, -0.045386f, -0.443123f, -0.484209f, + }; + +static const float av1_partition_breakout_nn_bias_32_layer0[16] = { + 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f, + 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f, + -0.423808f, 0.000000f, 6.352258f, -0.155787f, +}; + +static const float av1_partition_breakout_nn_weights_32_layer1[16] = { + 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f, + 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f, + -0.004171f, 0.157694f, 0.117845f, 0.272115f, +}; + +static const float av1_partition_breakout_nn_bias_32_layer1[1] = { + 0.09049262f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_32_layer0, + av1_partition_breakout_nn_weights_32_layer1, + }, + { + av1_partition_breakout_nn_bias_32_layer0, + av1_partition_breakout_nn_bias_32_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = { + 0.209371f, 0.028758f, 0.005764f, 
-0.384401f, -0.625777f, -0.005647f, + -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f, + -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f, + -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f, + -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f, + -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f, + -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f, + -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f, + -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f, + -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f, + -0.509287f, -0.048877f, -0.001512f, 0.077086f, + }; + +static const float av1_partition_breakout_nn_bias_16_layer0[16] = { + 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f, + 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f, + 5.625762f, 0.615822f, 0.040057f, 16.668884f, +}; + +static const float av1_partition_breakout_nn_weights_16_layer1[16] = { + -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f, + 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f, + 0.269773f, -0.021105f, -0.146698f, 0.188764f, +}; + +static const float av1_partition_breakout_nn_bias_16_layer1[1] = { + 1.60751927f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_16_layer0, + av1_partition_breakout_nn_weights_16_layer1, + }, + { + av1_partition_breakout_nn_bias_16_layer0, + av1_partition_breakout_nn_bias_16_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = { + -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f, + 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f, + -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f, + -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f, + 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f, + -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f, + -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f, + -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f, + -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f, + -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f, + -0.596269f, 0.098494f, -0.005765f, 0.173652f, + }; + +static const float av1_partition_breakout_nn_bias_8_layer0[16] = { + 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f, + 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f, + 2.336705f, -0.278834f, 0.231905f, 7.954366f, +}; + +static const float av1_partition_breakout_nn_weights_8_layer1[16] = { + -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f, + -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f, + 0.055858f, 0.230970f, -0.056466f, 0.119780f, +}; + +static const float av1_partition_breakout_nn_bias_8_layer1[1] = { + 1.27784479f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_8_layer0, + av1_partition_breakout_nn_weights_8_layer1, + }, + { + av1_partition_breakout_nn_bias_8_layer0, + 
av1_partition_breakout_nn_bias_8_layer1, + }, +}; +#undef FEATURE_SIZE + +#define FEATURE_SIZE 9 // Input layer size +#define NUM_NODES 32 // Hidden layer size +#define LABEL_SIZE 3 // Output layer size + +static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f, + -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f, + 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f, + -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f, + 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f, + 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f, + 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f, + -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f, + 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f, + 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f, + -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f, + -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f, + 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f, + 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f, + -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f, + 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f, + -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f, + 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f, + 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f, + -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f, + -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f, + -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f, + 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f, + 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f, + -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f, + 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f, + -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f, + -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f, + -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f, + 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f, + -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f, + -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f, + -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f, + 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f, + 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f, + -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f, + 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f, + 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f, + 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f, + 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f, + -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f, + -1.08228f, +}; + +static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = { + 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, 
-0.12017f, 0.35362f, + -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f, + 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f, + -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f, + -0.22638f, 1.40940f, -0.09309f, 0.05828f, +}; + +static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f, + -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f, + -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f, + -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f, + -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f, + 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f, + -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f, + 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f, + 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f, + -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f, + -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f, + 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f, + -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f, + -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f, +}; + +static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = { + 1.70665f, + -0.77954f, + -0.92709f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_8_layer0, + av1_rect_partition_nn_weights_8_layer1 }, + { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 } +}; + +static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f, + -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f, + 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f, + -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f, + 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f, + -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f, + 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f, + 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f, + 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f, + -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f, + 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f, + 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f, + 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f, + 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f, + 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f, + -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f, + -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f, + 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f, + -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f, + -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f, + -0.22639f, -0.14739f, -0.20201f, 
-0.37507f, -1.30653f, 0.49570f, 1.03673f, + 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f, + 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f, + -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f, + -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f, + -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f, + 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f, + 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f, + -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f, + -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f, + -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f, + -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f, + -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f, + 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f, + 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f, + 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f, + -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f, + -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f, + 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f, + -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f, + -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f, + -0.05573f, +}; + +static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = { + -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f, + 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f, + 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f, + -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f, + -0.12044f, 1.65478f, -0.75153f, 1.18441f, +}; + +static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES * + LABEL_SIZE] = { + -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f, + 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f, + 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f, + 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f, + -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f, + 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f, + 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f, + 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f, + 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f, + -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f, + -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f, + -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f, + 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f, + -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f, +}; + +static const float av1_rect_partition_nn_bias_16_layer1[3] = { + 2.68750f, + -1.31894f, + -1.36768f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_16_layer0, + av1_rect_partition_nn_weights_16_layer1 }, + { 
av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 } +}; + +static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f, + -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f, + -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f, + -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f, + -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f, + -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f, + -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f, + -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f, + -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f, + 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f, + -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f, + -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f, + 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f, + 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f, + -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f, + -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f, + 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f, + 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f, + 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f, + 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f, + 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f, + -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f, + 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f, + 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f, + -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f, + -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f, + -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f, + -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f, + -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f, + -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f, + 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f, + -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f, + -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f, + 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f, + 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f, + -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f, + 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f, + 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f, + 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f, + -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f, + -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f, + 0.33984f, +}; + +static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = { + -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f, + 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f, + 0.05660f, -0.32013f, 
0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f, + -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f, + -0.27602f, -1.98063f, 0.20816f, -0.01315f, +}; + +static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f, + -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f, + 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f, + -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f, + 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f, + 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f, + 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f, + 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f, + 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f, + -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f, + 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f, + -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f, + -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f, + -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f, +}; + +static const float av1_rect_partition_nn_bias_32_layer1[3] = { + 2.47332f, + -1.65756f, + -0.81573f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_32_layer0, + av1_rect_partition_nn_weights_32_layer1 }, + { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 } +}; + +static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f, + 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f, + 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f, + 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f, + 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f, + 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f, + 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f, + -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f, + 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f, + 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f, + -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f, + -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f, + -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f, + -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f, + 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f, + 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f, + 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f, + -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f, + -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f, + -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f, + -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f, + -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f, + 1.49530f, 1.05069f, 
-2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f, + 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f, + 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f, + -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f, + -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f, + 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f, + 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f, + 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f, + -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f, + -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f, + -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f, + 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f, + -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f, + -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f, + 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f, + -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f, + -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f, + 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f, + -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f, + 0.09101f, +}; + +static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = { + 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f, + -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f, + -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f, + -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f, + 0.59835f, -0.31269f, -0.30585f, -1.66212f, +}; + +static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f, + -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f, + 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f, + 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f, + 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f, + -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f, + -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f, + 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f, + -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f, + 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f, + -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f, + -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f, + -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f, + 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f, +}; + +static const float av1_rect_partition_nn_bias_64_layer1[3] = { + 0.32215f, + -0.57522f, + 0.25314f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_64_layer0, + av1_rect_partition_nn_weights_64_layer1 }, + { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 } +}; + +static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE * 
+ NUM_NODES] = { + -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f, + 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f, + 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f, + 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f, + -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f, + 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f, + 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f, + 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f, + 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f, + 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f, + -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f, + 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f, + -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f, + -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f, + 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f, + -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f, + -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f, + -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f, + -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f, + -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f, + -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f, + -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f, + -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f, + 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f, + 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f, + -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f, + -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f, + 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f, + 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f, + 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f, + -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f, + 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f, + -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f, + 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f, + -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f, + -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f, + 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f, + -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f, + -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f, + -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f, + 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f, + 2.02519f, +}; + +static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = { + 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f, + 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f, + -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f, + -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f, + 0.66120f, 0.61119f, -1.42293f, 
0.32676f, +}; + +static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES * + LABEL_SIZE] = { + 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f, + 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f, + -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f, + 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f, + 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f, + 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f, + 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f, + 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f, + -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f, + -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f, + 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f, + 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f, + 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f, + 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f, +}; + +static const float av1_rect_partition_nn_bias_128_layer1[3] = { + 1.09014f, + -0.53317f, + -0.55668f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_128_layer0, + av1_rect_partition_nn_weights_128_layer1 }, + { av1_rect_partition_nn_bias_128_layer0, + av1_rect_partition_nn_bias_128_layer1 } +}; +#undef FEATURE_SIZE +#undef NUM_NODES +#undef LABEL_SIZE + +// Below are the models used for simple_motion_search_based_split +// Thresholds +// The first index level is for aggresiveness, and the second is frame +// resolution, third is bsize +static const float av1_simple_motion_search_split_thresh[4][3][5] = { + // Aggressiveness = 0 + { + // lowres + { + 1.40402595879f, // p = 0.8028197 + 4.72845183649f, // p = 0.99123732 + 1.86517797783f, // p = 0.86589934 + 1.58715223005f, // p = 0.83021506 + 7.22695596987f, // p = 0.9992738 + }, + // midres + { + 5.839480f, // p = 0.997098 + 1.877167f, // p = 0.867285 + 3.073499f, // p = 0.955783 + 1.405601f, // p = 0.803071 + 2.555636f, // p = 0.927951 + }, + // hdres + { + 5.839480f, // p = 0.997098 + 1.877167f, // p = 0.867285 + 3.073499f, // p = 0.955783 + 1.405601f, // p = 0.803071 + 2.555636f, // p = 0.927951 + }, + }, + // Aggressiveness = 1 + { + // Lowres + { + 100.0000f, // p = 1.000000 + 4.952535f, // p = 0.992984 + 1.720880f, // p = 0.848242 + 1.426233f, // p = 0.806314 + 1.491905f, // p = 0.816364 + }, + // Midres + { + 100.0000f, // p = 100.0000 + 3.137263f, // p = 0.958404 + 2.703262f, // p = 0.937219 + 1.877166f, // p = 0.867285 + 2.221149f, // p = 0.902133 + }, + // Hdres + { + 4.417680f, // p = 0.988082 + 3.086898f, // p = 0.956349 + 3.966704f, // p = 0.981416 + 1.532565f, // p = 0.822381 + 3.449975f, // p = 0.969230 + }, + }, + // Aggressiveness = 2 + { + // lowres + { + 100.000000f, // p = 0.998048 + 1.484020f, // p = 0.815179 + 1.866781f, // p = 0.866085 + 1.706711f, // p = 0.846409 + 2.080369f, // p = 0.888980 + }, + // midres + { + 100.000000f, // p = 0.0 + 3.265763f, // p = 0.963235428881 + 2.024598f, // p = 0.883355591569 + 1.846446f, // p = 0.863709256976 + 2.240962f, // p = 0.903868036126 + }, + // hdres + { + 3.133026f, // p = 0.958234684141 + 2.940954f, // p = 0.949834204693 + 2.484544f, // p = 0.923051170045 + 1.702972f, // 
p = 0.845922460525 + 1.655562f, // p = 0.839641385729 + }, + }, + // Aggressiveness = 3 + { + // lowres + { 100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f, + 0.762099214988f }, + // midres + { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f, + 0.557298794638f }, + // hdres + { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f, + 1.86572095242f }, + }, +}; + +static const float av1_simple_motion_search_no_split_thresh[4][3][5] = { + // Aggressiveness = 0 + { + // lowres + { + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + }, + // midres + { + -3.38168078f, // p = 0.032872917 + -4.08610739f, // p = 0.016526795 + -1.78302370f, // p = 0.15270848 + -100.000000f, // p = 0.0 + -100.000000f, // p = 0.0 + }, + // hdres + { + -100.000000f, // p = 0.0 + -100.000000f, // p = 0.0 + -2.98718897f, // p = 0.048008 + -100.000000f, // p = 0.0 + -3.33229488f, // p = 0.03447975 + }, + }, + // Aggressiveness = 1 + { + // Lowres + { + -100.0000f, // p = 0.0 + -4.893793f, // p = 0.007437 + -3.387766f, // p = 0.032680 + -2.982806f, // p = 0.048209 + -2.330372f, // p = 0.088639 + }, + // Midres + { + -100.0000f, // p = 0.000000 + -6.131853f, // p = 0.002168 + -2.346579f, // p = 0.087338 + -2.712849f, // p = 0.062219 + -3.195430f, // p = 0.039338 + }, + // Hdres + { + -3.491416f, // p = 0.029557 + -2.192853f, // p = 0.100394 + -3.620180f, // p = 0.026079 + -2.030855f, // p = 0.116001 + -2.797586f, // p = 0.057455 + }, + }, + // Aggressiveness = 2 + { + // lowres + { + -100.0000f, // p = 0.0 + -3.617350f, // p = 0.026151 + -5.902503f, // p = 0.002725 + -4.677840f, // p = 0.009213 + -2.168378f, // p = 0.102626 + }, + // midres + { + -100.0000f, // p = 0.0 + -3.204195f, // p = 0.0390081679555 + -2.354128f, // p = 0.0867382128969 + -2.523326f, // p = 0.0742390077132 + -3.112328f, // p = 0.0426016085803 + }, + // hdres + { + -5.047760f, // p = 0.00638270448225 + -3.414994f, // p = 0.0318301469487 + -5.628090f, // p = 0.00358255438917 + -2.122691f, // p = 0.10691083145 + -1.972387f, // p = 0.122132728355 + }, + }, + // Aggressiveness = 3 + { + // lowres + { -100.000000f, -2.04766486133f, -1.00442099188f, -1.15077982642f, + -1.0830321897f }, + // midres + { -100.000000f, -0.985686808303f, -0.757739584866f, -0.890120107569f, + -0.228236297886f }, + // hdres + { -1.03535679263f, -1.57431743203f, -0.564851540156f, -0.35442301663f, + -1.36741555171f }, + }, +}; + +static const float av1_simple_motion_search_split_mean_128[17] = { + 14.119120f, 14.087010f, 12.016185f, 11.966075f, 12.042454f, 11.994805f, + 12.152105f, 12.100394f, 12.178377f, 12.128937f, 4.779944f, 0.714786f, + 3.535450f, 3.566207f, 0.835913f, 3.315452f, 3.302908f, +}; + +static const float av1_simple_motion_search_split_std_128[17] = { + 1.832420f, 1.835338f, 2.019207f, 2.020793f, 2.008731f, 2.008403f, + 1.900999f, 1.907081f, 1.908915f, 1.913122f, 2.109345f, 0.451517f, + 1.407097f, 1.372501f, 0.370355f, 1.321495f, 1.319665f, +}; + +static const float av1_simple_motion_search_split_mean_64[17] = { + 12.363721f, 12.314348f, 10.404341f, 10.333541f, 10.405775f, 10.336996f, + 10.402246f, 10.330084f, 10.405584f, 10.334330f, 4.554232f, 0.896393f, + 2.819613f, 2.855845f, 0.926296f, 2.808782f, 2.798229f, +}; + +static const float av1_simple_motion_search_split_std_64[17] = { + 1.878920f, 1.882255f, 1.950167f, 1.953289f, 1.913869f, 1.914781f, + 1.920096f, 1.924454f, 1.880200f, 1.882499f, 2.050922f, 0.304750f, + 1.144391f, 1.125088f, 0.261289f, 1.145059f, 
1.131215f, +}; + +static const float av1_simple_motion_search_split_mean_32[17] = { + 10.750278f, 10.679627f, 8.745625f, 8.644149f, 8.757436f, 8.656657f, + 8.759780f, 8.656299f, 8.772563f, 8.669839f, 4.208026f, 0.958573f, + 2.308769f, 2.347375f, 0.961685f, 2.323464f, 2.296322f, +}; + +static const float av1_simple_motion_search_split_std_32[17] = { + 1.879269f, 1.883531f, 1.935828f, 1.935677f, 1.915823f, 1.914773f, + 1.909733f, 1.910315f, 1.890451f, 1.890032f, 1.913318f, 0.199276f, + 0.988825f, 0.972115f, 0.191956f, 0.977131f, 0.951418f, +}; + +static const float av1_simple_motion_search_split_mean_16[17] = { + 9.076768f, 8.974986f, 7.078364f, 6.926072f, 7.088739f, 6.936111f, + 7.096697f, 6.942841f, 7.114978f, 6.961046f, 3.865480f, 0.982632f, + 1.886023f, 1.912892f, 0.981492f, 1.926059f, 1.891233f, +}; + +static const float av1_simple_motion_search_split_std_16[17] = { + 1.922965f, 1.925609f, 1.851980f, 1.847558f, 1.848410f, 1.843990f, + 1.843931f, 1.839582f, 1.840304f, 1.836144f, 1.760042f, 0.130639f, + 0.841086f, 0.833523f, 0.134780f, 0.840790f, 0.831309f, +}; + +static const float av1_simple_motion_search_split_mean_8[17] = { + 7.120238f, 6.957731f, 5.176309f, 4.889594f, 5.178396f, 4.886607f, + 5.195322f, 4.905566f, 5.198845f, 4.904745f, 3.648933f, 0.993198f, + 1.496831f, 1.520804f, 0.991864f, 1.489763f, 1.460761f, +}; + +static const float av1_simple_motion_search_split_std_8[17] = { + 1.698498f, 1.696000f, 1.629605f, 1.614641f, 1.632476f, 1.618831f, + 1.618352f, 1.603742f, 1.623089f, 1.609674f, 1.668587f, 0.082193f, + 0.759407f, 0.759684f, 0.089830f, 0.742797f, 0.730632f, +}; + +static const float *const av1_simple_motion_search_split_mean[5] = { + av1_simple_motion_search_split_mean_128, + av1_simple_motion_search_split_mean_64, + av1_simple_motion_search_split_mean_32, + av1_simple_motion_search_split_mean_16, + av1_simple_motion_search_split_mean_8, +}; + +static const float *const av1_simple_motion_search_split_std[5] = { + av1_simple_motion_search_split_std_128, av1_simple_motion_search_split_std_64, + av1_simple_motion_search_split_std_32, av1_simple_motion_search_split_std_16, + av1_simple_motion_search_split_std_8, +}; + +#define NUM_HIDDEN_LAYERS_128 1 +#define NUM_FEATURES_128 17 +#define NUM_LAYER_0_UNITS_128 20 +#define NUM_LOGITS_128 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_128[] = { + 0.24095f, -0.397761f, -0.388619f, -0.0629548f, -0.44577f, 0.688212f, + -0.20889f, -1.08227f, -0.0313894f, -0.615505f, -0.401839f, 0.40233f, + -0.171305f, 0.439803f, 1.58527f, -0.968535f, -1.29255f, 1.14846f, + 0.885777f, 0.116412f, -0.225704f, 0.316506f, 0.793951f, -0.63591f, + 0.097789f, -0.327027f, -0.778396f, -0.231667f, -0.9622f, 1.0044f, + 0.32594f, 0.179768f, -0.115529f, -0.499395f, -1.14727f, -1.26111f, + 0.269818f, -0.0882028f, -0.349107f, 0.100901f, 0.0249506f, 0.528929f, + 0.113961f, 0.929794f, 0.242494f, -0.122828f, -0.0477379f, 0.170659f, + 0.0500187f, 0.28859f, 0.78783f, 0.482412f, 0.795298f, 0.179517f, + 0.453911f, -0.298029f, -0.903332f, 0.510615f, 0.691994f, 0.433383f, + -0.140802f, -1.11635f, -0.547326f, 1.11318f, 0.71905f, 0.978538f, + 0.097444f, -0.0386012f, 0.713599f, 0.465164f, 0.391278f, -0.472864f, + 0.230224f, -0.279508f, 0.558192f, -0.468625f, 0.55995f, -0.57507f, + -1.39947f, -0.755819f, -1.04512f, -0.411552f, -0.830444f, -0.106571f, + -0.0972184f, 0.251842f, 0.269955f, 0.230492f, -0.290581f, -0.484799f, + 0.0151041f, 0.171047f, 0.829999f, -0.384581f, 0.220301f, -0.121687f, + 1.88848f, -0.482809f, -0.48185f, 1.34482f, -0.716438f, 
-0.284482f, + -1.78592f, -1.29333f, 0.886867f, 0.80106f, 0.456415f, 0.649095f, + 0.231093f, 0.361562f, 0.290018f, 0.128009f, -0.196343f, 0.0607802f, + 0.576761f, -0.0413836f, 0.0300984f, -0.318998f, 0.204434f, -0.712524f, + 0.833394f, -0.81168f, 0.765488f, -0.720973f, 1.12866f, -0.838694f, + 1.295f, -0.159127f, 1.05404f, 0.736519f, 0.248662f, 0.229233f, + 0.0434302f, 0.0551856f, 0.197862f, 0.354823f, -0.32429f, -0.227353f, + -0.132198f, -0.438118f, -0.210401f, -0.81046f, 0.653555f, 0.826737f, + 0.154235f, 0.228945f, 0.123089f, 0.614964f, -0.0940471f, -0.00676807f, + 0.24996f, 0.949233f, 0.746526f, -0.044474f, 0.386414f, 0.503221f, + 0.155133f, -0.698848f, -0.735356f, -0.255091f, 0.413235f, -0.335295f, + -0.145757f, 0.326299f, -0.602629f, -0.844474f, -0.346722f, -0.42598f, + -0.491016f, -0.447732f, -0.965366f, -0.0242841f, 0.836606f, -0.104877f, + 1.23236f, 0.683986f, 0.787005f, -0.0253437f, 1.2145f, 1.29554f, + -1.24302f, -0.229495f, 0.439415f, 0.885087f, -0.408704f, -0.119299f, + -0.0960972f, 0.60148f, 0.683271f, -0.057129f, -0.180295f, -0.264815f, + -0.363184f, 0.638271f, 0.631083f, -0.252899f, -0.164364f, -1.31274f, + 0.354408f, 0.0429172f, 0.371154f, -1.0978f, 0.0433642f, -0.467394f, + -0.706572f, 1.57198f, -0.0701271f, 1.93149f, -0.446267f, 1.4519f, + -1.29567f, 0.309978f, -0.878062f, 0.891494f, 0.364005f, -0.209611f, + -0.125927f, 0.184097f, 0.0629695f, -0.43375f, -0.0980562f, 1.08547f, + 0.578312f, 0.16566f, -0.198852f, -0.241854f, -0.523934f, -0.206037f, + -0.867721f, 1.00041f, 1.09848f, -2.12562f, -0.19992f, -0.186128f, + -0.03507f, 0.0484884f, 0.160856f, 0.10802f, -0.805141f, -1.06902f, + 0.290363f, 0.0222096f, -0.849266f, 0.112932f, 0.148682f, -0.0457585f, + 1.139f, 1.79141f, 0.194122f, -0.342508f, -0.403572f, 0.133678f, + 0.217553f, -0.263759f, 0.18441f, 0.254529f, 0.0471115f, 0.733178f, + -0.416205f, 0.441447f, -0.443335f, 0.725005f, -0.78946f, 0.71301f, + -0.644969f, 1.5445f, 0.365277f, -0.455775f, -0.365066f, 0.4742f, + -0.381714f, -0.545794f, -0.0464861f, -0.222768f, -0.0106466f, -0.069743f, + 0.0335566f, 0.378348f, -0.249663f, 0.922286f, 0.125711f, -0.894619f, + 0.444682f, 0.447893f, -1.98936f, -1.41978f, 0.0406667f, -0.199928f, + -0.199786f, 0.463481f, 0.334931f, -0.396222f, -0.0732259f, 0.796684f, + -0.140817f, -0.26878f, 0.194642f, 0.895784f, -0.369976f, -2.26981f, + -0.0791776f, -0.0492268f, 0.6715f, 0.281805f, 0.0156664f, -0.779785f, + 0.17743f, 0.188786f, -0.588077f, -0.359153f, 0.258319f, 0.881688f, + 0.846894f, 1.00292f, 0.838134f, 0.680632f, 0.273098f, -0.329261f, + 0.217757f, -0.506726f, -0.336523f, -0.695875f, -0.252006f, 0.751216f, + 0.334409f, -0.0151467f, 0.0885474f, 0.0973114f, -0.248754f, -0.263716f, + 0.369906f, -0.213749f, -0.0355395f, -0.137799f, 2.43233f, -0.944233f, + -0.745167f, 0.318558f, 0.316608f, 0.568678f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_128[] = { + 0.821344f, 1.11542f, -1.24172f, 1.03642f, 1.13511f, + 1.16414f, -0.278655f, -1.35558f, -1.26788f, -1.63189f, + -0.323271f, 1.21319f, -0.888415f, 0.987145f, -1.16767f, + 0.255833f, -0.1392f, 1.43265f, -1.54952f, 1.65159f +}; + +static const float av1_simple_motion_search_split_logits_kernel_128[] = { + 0.3565753f, 0.5490161f, -1.015597f, 0.565366f, 0.751604f, + 0.922747f, -1.931846f, 1.759353f, -0.7362949f, 0.5707034f, + -1.092127f, 0.936767f, 2.034499f, 2.08148f, 0.9509507f, + -1.342504f, -0.834566f, 0.618184f, 0.844113f, 1.182693f +}; + +static const float av1_simple_motion_search_split_logits_bias_128[] = { + 1.819351f +}; + +static const NN_CONFIG 
av1_simple_motion_search_split_nn_config_128 = { + NUM_FEATURES_128, + NUM_LOGITS_128, + NUM_HIDDEN_LAYERS_128, + { + NUM_LAYER_0_UNITS_128, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_128, + av1_simple_motion_search_split_logits_kernel_128, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_128, + av1_simple_motion_search_split_logits_bias_128, + }, +}; + +#undef NUM_HIDDEN_LAYERS_128 +#undef NUM_FEATURES_128 +#undef NUM_LAYER_0_UNITS_128 +#undef NUM_LOGITS_128 + +#define NUM_HIDDEN_LAYERS_64 1 +#define NUM_FEATURES_64 17 +#define NUM_LAYER_0_UNITS_64 24 +#define NUM_LOGITS_64 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_64[] = { + -1.40663f, -0.851503f, -0.0613111f, 0.741591f, 0.302754f, + 0.184001f, 0.0474853f, 0.371096f, 0.0541624f, 0.381508f, + 0.355427f, 0.0428822f, 0.154916f, -0.00490099f, 0.025484f, + 0.0208921f, 0.140596f, -0.292525f, -0.459067f, -0.081393f, + 0.109824f, -0.290183f, 0.720236f, 0.385835f, -0.150643f, + -0.078518f, 0.0979819f, -0.102135f, 0.137152f, -0.0786457f, + 0.0171441f, 0.991338f, -0.546583f, -1.0714f, -0.0842851f, + 0.244072f, 0.427379f, 0.146775f, -0.921613f, -0.912093f, + 0.393566f, -0.232375f, 0.19963f, 0.312355f, 0.55659f, + -0.104714f, -0.137563f, 0.0985237f, 0.0788307f, -0.225514f, + 0.0228832f, -0.288733f, -0.00737685f, -0.711657f, -0.256796f, + 0.0869605f, 0.583977f, 0.384306f, 1.46692f, -0.741126f, + -0.21105f, -0.276604f, -0.0151463f, -0.0227997f, -0.0403232f, + 0.044122f, 0.0185784f, -0.0451951f, 0.00489513f, -0.387131f, + 0.0966724f, -0.599174f, -0.00243351f, -0.21439f, 0.302043f, + 0.130334f, -0.191251f, 0.863261f, -1.50112f, 0.00901057f, + 0.000324294f, -0.0572545f, 0.0117685f, -0.0734682f, -0.0570435f, + -0.126253f, 1.2313f, -0.328267f, 0.211788f, -0.175438f, + -0.0419298f, 0.166447f, -0.178739f, -0.326221f, -0.0439188f, + 1.01182f, -0.390678f, -0.426343f, 0.0944665f, -0.225042f, + -0.183344f, 0.0500763f, -0.377393f, -0.673401f, -0.436907f, + -0.00366876f, -0.363412f, 0.195194f, 0.250248f, -0.397193f, + -0.0917222f, -0.0221579f, 1.7693f, -0.0694484f, -0.0410764f, + -0.134571f, -0.159992f, -0.170359f, -0.249333f, -0.128056f, + -0.617054f, -0.808701f, -0.540642f, 0.396391f, 0.147787f, + 0.346916f, 0.709852f, 0.116064f, 0.0509731f, 0.073713f, + -0.365082f, -1.09287f, -0.618214f, 0.20545f, 0.126161f, + -0.140012f, 0.62592f, 0.316326f, -0.392765f, -0.15934f, + 0.337617f, -0.41669f, -0.295225f, 0.0602025f, -0.0150657f, + -0.319629f, 0.783729f, -0.0661199f, -0.362657f, 0.390042f, + -0.043614f, -0.0414596f, 0.121155f, -0.309775f, -0.284761f, + -0.243932f, 0.279855f, -0.266823f, 0.734824f, -0.164028f, + 0.261776f, -0.105585f, 0.10733f, -0.180469f, 1.18875f, + -1.12836f, -0.173008f, 0.150221f, 0.111598f, 0.148306f, + -1.2833f, -1.06346f, 0.233546f, 0.16432f, 0.00142378f, + 0.340574f, -0.0140885f, 0.634761f, -0.122096f, 0.821487f, + 0.421424f, -0.0256687f, -0.035503f, -0.0453547f, -0.0215179f, + -0.0671277f, -0.0486862f, -0.962761f, -0.208383f, 0.109573f, + -0.210668f, -0.176485f, 0.421279f, 0.41605f, 0.342084f, + 0.619364f, 0.103718f, -0.00341643f, 0.00266677f, 0.249089f, + -0.22848f, -0.0368968f, 1.12092f, -0.64912f, -0.456579f, + 0.477823f, 0.418345f, 1.41515f, 0.0936279f, 0.886155f, + -0.785656f, -0.217109f, -0.561829f, -0.286435f, -0.884068f, + -0.148839f, -0.282848f, 0.0683745f, 0.0962815f, -0.111975f, + 0.0509158f, -0.211274f, 0.744909f, -0.8982f, 0.315232f, + -0.78624f, 0.598387f, -0.530952f, 0.677357f, 0.0371339f, + 0.99209f, -0.681899f, -0.291416f, -0.224822f, -0.26049f, + -0.0436525f, 
-0.380004f, -0.27187f, 0.534779f, 0.717939f, + 0.418197f, -0.152539f, -0.0684039f, -0.186308f, -0.0653121f, + 0.194145f, -0.196367f, 0.256997f, -0.726269f, -0.307672f, + -0.153362f, 0.450827f, 0.708842f, -0.0667079f, 0.555564f, + 0.0486892f, 0.0715072f, -0.7211f, -0.849797f, 0.0650271f, + 1.2747f, -0.646738f, -0.53042f, 0.182197f, 0.928203f, + 0.180621f, -0.00640791f, -0.171416f, 0.092688f, -0.391275f, + -0.0650657f, 0.0843773f, 0.170824f, 0.378085f, 0.0596657f, + 0.844398f, -1.3083f, -1.27828f, -0.199179f, 0.557855f, + 0.241479f, 0.385804f, 0.169533f, -0.0028072f, 0.0538041f, + 0.00136234f, 0.0130481f, 0.0349449f, -0.0366494f, -0.000474055f, + 0.437956f, 0.286724f, -0.298187f, 0.461967f, 0.43065f, + -0.0877194f, -0.19133f, 0.379121f, -0.687751f, -1.64077f, + -0.375191f, -0.336836f, -0.323904f, -0.101859f, 0.0126672f, + -0.346332f, 0.112303f, -0.863336f, 0.155538f, 0.366509f, + -0.0976829f, 0.635278f, -0.681967f, -0.527729f, 0.591839f, + 0.366678f, 0.189981f, 0.0208007f, -0.565809f, 0.70183f, + -0.282844f, -0.327485f, 0.347243f, -1.13014f, -0.373378f, + -0.514978f, 0.662994f, -0.144931f, 0.1402f, -0.820049f, + 0.711498f, 0.681156f, 1.06515f, -0.423409f, -0.0392664f, + 0.0675396f, -0.0508602f, 0.0431443f, 0.0212639f, -0.0279887f, + -0.62611f, -0.202064f, 0.701934f, 1.28452f, -0.00858481f, + -0.517249f, 0.0615832f, -0.260215f, 0.0949119f, -0.28423f, + -0.39573f, -0.0574246f, -0.318658f, 0.0601775f, -0.0629386f, + -0.134208f, 0.111686f, -0.23355f, 0.078667f, 0.741023f, + 0.828523f, -0.345067f, -0.315135f, -0.0957154f, 0.522825f, + -0.190057f, -0.473789f, -0.390489f, 0.200677f, -0.0271802f, + 0.110336f, 0.493302f, 0.663126f, 0.570148f, -0.380042f, + -0.437349f, -0.660884f, 0.301908f, 0.0644179f, 0.172494f, + 0.461917f, 0.330938f, -0.140041f, -0.0430205f, -1.51003f, + -0.410984f, -0.182161f, 0.0235313f, -0.364849f, 0.154183f, + -0.592465f, 0.272701f, 0.192389f, -0.0497777f, -0.924467f, + -0.179513f, -0.592217f, 0.436363f, -0.0716164f, 0.189094f, + -0.574697f, -0.304303f, 0.326441f, -0.0865553f, 0.735948f, + 0.266912f, 0.435824f, -0.123322f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_64[] = { + -1.19333f, 1.01834f, -1.10844f, 0.0454873f, -1.45506f, 0.580864f, + -0.040979f, -0.505681f, -1.15072f, 0.692697f, -0.520812f, -0.479384f, + 0.529652f, 0.507252f, -1.08619f, 0.0586375f, 0.0929614f, -0.46753f, + -0.701857f, -0.362933f, -0.291983f, -0.133933f, -0.0131351f, -0.267582f +}; + +static const float av1_simple_motion_search_split_logits_kernel_64[] = { + -3.32501f, 0.43082f, -1.060692f, 1.328908f, 0.8892894f, 0.6488833f, + -1.096516f, -0.664786f, -1.301339f, 0.508805f, -2.128406f, -0.757304f, + 0.383839f, 0.694763f, -0.591725f, 0.770385f, 1.021594f, 0.589181f, + -0.76238f, 1.488826f, 0.709135f, -0.575738f, 0.26421759f, -0.2484219f +}; + +static const float av1_simple_motion_search_split_logits_bias_64[] = { + 0.699037f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_64 = { + NUM_FEATURES_64, + NUM_LOGITS_64, + NUM_HIDDEN_LAYERS_64, + { + NUM_LAYER_0_UNITS_64, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_64, + av1_simple_motion_search_split_logits_kernel_64, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_64, + av1_simple_motion_search_split_logits_bias_64, + }, +}; + +#undef NUM_HIDDEN_LAYERS_64 +#undef NUM_FEATURES_64 +#undef NUM_LAYER_0_UNITS_64 +#undef NUM_LOGITS_64 + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 17 +#define NUM_LAYER_0_UNITS_32 20 +#define NUM_LOGITS_32 1 + +static const float 
av1_simple_motion_search_split_hiddenlayer_0_kernel_32[] = { + -0.980626f, -0.946611f, 0.103761f, 0.408899f, 0.498149f, + 0.0490161f, 0.253279f, 0.332029f, 0.00367441f, 0.364401f, + -0.236433f, 0.0592119f, -0.0978848f, 0.159733f, -0.018052f, + -1.10726f, 1.16167f, -0.244982f, -0.147819f, -0.147095f, + 0.111404f, -0.349502f, 0.441178f, 0.0984191f, -0.135537f, + -0.0423312f, 0.0123079f, 0.358012f, -0.266796f, 0.0125811f, + 0.196563f, 0.337093f, -1.07266f, -1.25134f, 0.57337f, + -0.521717f, 0.259824f, 0.537383f, -0.463688f, -0.336128f, + 0.373385f, 0.483443f, -0.229293f, -0.33373f, -0.656021f, + 0.768647f, 0.179279f, 0.315415f, 0.187749f, 1.07839f, + 0.0626629f, -0.230299f, 0.662606f, -0.414154f, 0.459334f, + -0.6312f, 0.427704f, -0.249849f, 0.701056f, -0.707969f, + 0.057401f, 0.620434f, 0.665748f, -0.501356f, -0.230685f, + 0.0722371f, -0.0988625f, -0.114035f, -0.653799f, 0.571353f, + 0.268276f, 1.13251f, -1.0695f, -0.225607f, -0.984355f, + -0.42213f, 0.300422f, 1.21492f, -0.139931f, -0.000726004f, + 0.045964f, -0.0817352f, -0.0278813f, -0.0102341f, -0.0144087f, + -0.475882f, 1.20682f, -0.359919f, 0.277189f, -0.166401f, + 0.599211f, -0.129872f, 0.574211f, -0.247573f, 0.824405f, + -1.53329f, -0.202151f, -0.328698f, -0.516322f, -0.281416f, + -0.383651f, -0.252862f, -0.43185f, 0.456802f, -0.430055f, + -0.55245f, -0.6884f, -0.541456f, -0.281376f, 1.10425f, + -0.140706f, 1.59816f, -0.0343895f, -0.00920039f, -0.0307667f, + 0.0560132f, -0.0340302f, -0.10848f, 0.0593314f, -0.951795f, + 0.876831f, -1.00548f, -0.566244f, 0.430061f, 1.10109f, + -0.634212f, -0.0755369f, -0.108953f, 1.03191f, 0.109036f, + -0.0415309f, 0.0681162f, -0.0611775f, -0.0231938f, 0.0973158f, + -0.0558169f, -0.823484f, -0.918509f, 0.16756f, 0.27087f, + 0.286074f, 0.174069f, 0.1304f, 0.386074f, 0.433953f, + 0.0291467f, -1.74087f, 0.0296094f, -0.00793714f, -0.13041f, + 0.00990992f, -0.0137848f, -0.0742606f, -0.251029f, -0.645316f, + 0.640029f, 0.550607f, 0.470097f, 0.549451f, -0.285723f, + -0.164759f, -0.128166f, -0.391496f, -0.80287f, 0.0769472f, + 1.34391f, 0.0215005f, 0.0669497f, 0.131919f, 0.291674f, + 0.0952889f, -0.677953f, -0.364054f, 0.144823f, 0.246198f, + -0.12393f, 0.363661f, 0.215091f, -0.239658f, 0.18491f, + 0.118703f, 0.0064156f, 1.38619f, -1.3845f, 0.0567323f, + 1.20812f, -0.720374f, -1.92158f, -1.48657f, 0.335601f, + 0.409379f, 0.373618f, 0.231274f, 0.292194f, 0.368619f, + 0.2398f, 0.473579f, 0.83402f, -0.0133751f, -0.00344358f, + 2.20688e-05f, 0.00836757f, 0.00405377f, 0.0110539f, -0.260154f, + 0.192112f, -0.666986f, 0.302875f, -0.113302f, 0.17882f, + -0.221493f, 0.146161f, -0.448697f, 0.584187f, 0.122109f, + 0.989981f, -1.14706f, -0.734042f, 0.0638213f, 0.213357f, + 0.068543f, -0.808558f, 0.404741f, 0.808313f, 1.57523f, + -0.113448f, 0.254102f, -0.350065f, -0.615f, 0.0753549f, + -0.540936f, -0.0250732f, -0.225681f, -0.161384f, 0.0128342f, + -0.0933368f, -0.286904f, 0.130133f, -0.874747f, 0.392585f, + -0.493135f, 0.169708f, 0.0909804f, 1.89921f, -0.469954f, + 0.65165f, -0.953401f, -0.21595f, -0.37479f, 0.0451146f, + 0.0234621f, -0.0596903f, -0.0682308f, -0.0830426f, 0.130011f, + -0.409141f, 0.0627038f, -0.581148f, -0.513922f, 0.631676f, + 0.0637034f, 0.0539081f, 0.0638872f, 0.515863f, -0.0123463f, + 0.177238f, 0.279506f, -0.930345f, 1.23726f, 0.202851f, + 0.708792f, -0.445086f, -0.0267075f, -0.913822f, -0.0714978f, + -0.281107f, -0.0770565f, -0.23086f, -0.165893f, -0.319683f, + 0.216235f, -0.490999f, 2.04841f, -0.0524071f, -0.239043f, + -0.0526375f, 0.023002f, -0.132685f, -0.155354f, -0.186503f, + -0.904296f, 
0.166478f, 0.063268f, -0.302842f, -0.27179f, + -0.428299f, 0.50193f, 0.480717f, -0.864275f, 0.317096f, + 0.40698f, 0.0286107f, 0.189432f, -0.0374374f, 0.0671728f, + 0.203681f, -0.457959f, -0.155776f, 0.340948f, 0.542841f, + 0.342675f, -0.000952399f, 0.470957f, 0.744418f, -1.11763f, + -0.658812f, -0.044832f, 0.0688237f, -0.357766f, 0.428662f, + -0.087152f, -0.291903f, 0.373244f, -0.587853f, 0.415895f, + -0.535694f, 0.621785f, -0.143648f, 0.0451373f, 0.00068827f, + 1.84432f, -1.26239f, -0.432087f, -0.152307f, 0.0293551f, + 0.184744f, -0.0173156f, -0.00572154f, -0.0305062f, -0.0900071f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_32[] = { + 0.160011f, 0.903856f, -0.13738f, 0.358221f, -0.0906044f, + -0.606558f, -0.0215651f, -0.03377f, -1.67017f, -0.144554f, + -0.201482f, -0.87719f, 0.639815f, -0.51976f, -0.309922f, + -1.33421f, 0.721328f, -0.889354f, -1.7158f, -0.285963f +}; + +static const float av1_simple_motion_search_split_logits_kernel_32[] = { + -0.2745374f, 0.333548f, -0.2437388f, 0.288009f, 0.55635f, + 0.4560176f, 0.2970518f, 0.391192f, 1.311854f, -0.231219f, + -0.2968651f, -1.819984f, 0.2775824f, 0.28929857f, 0.419126f, + -0.32868411f, -0.916399f, -0.1921077f, -0.617489f, 0.637953f +}; + +static const float av1_simple_motion_search_split_logits_bias_32[] = { + 0.208473f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_32, + av1_simple_motion_search_split_logits_kernel_32, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_32, + av1_simple_motion_search_split_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 17 +#define NUM_LAYER_0_UNITS_16 20 +#define NUM_LOGITS_16 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_16[] = { + 0.0136957f, 0.182135f, -0.583394f, 0.0556956f, 0.211152f, + 0.168234f, -0.694203f, -0.678216f, 0.289943f, 1.00014f, + -0.0427784f, -0.0427538f, -0.0276009f, -0.00133608f, 0.0901944f, + 0.0674892f, 0.104068f, -0.308582f, -0.43596f, 0.855997f, + -0.223414f, 0.0390026f, 0.366492f, 0.216065f, -0.386863f, + -0.148823f, -0.297022f, 0.0529546f, -0.202885f, 1.26471f, + -0.861163f, -0.0949431f, 0.573627f, -0.00277083f, -0.616063f, + -0.626927f, 0.371583f, -0.411743f, 0.173387f, -0.209734f, + 0.293697f, -0.260714f, 0.442728f, -0.594486f, 1.38987f, + 0.208025f, -0.0433776f, 0.01173f, 0.921766f, -0.168379f, + 0.000697326f, 0.209967f, -0.304577f, 0.149551f, -0.196658f, + 0.389251f, -0.449106f, -0.456329f, 0.669073f, -0.163806f, + 0.083348f, -0.0783998f, 0.0678355f, 0.0510435f, 0.103964f, + 0.104537f, -0.778093f, -1.0641f, -0.626102f, -2.02131f, + 0.159591f, 0.254161f, -0.000362642f, 0.289859f, 0.192713f, + 0.139801f, -0.0251327f, 0.164002f, 1.22892f, -0.0852193f, + 0.0769487f, 0.0296408f, -0.0418688f, 0.0936023f, 0.0448523f, + 0.674015f, -0.0732944f, 0.313575f, -0.593432f, 0.642067f, + -1.06063f, 0.468223f, -0.769085f, -0.173798f, -0.175663f, + 0.692808f, 0.00753295f, -0.123327f, -0.0234937f, -0.0923153f, + 0.0216917f, -0.0690157f, -0.397488f, 0.426628f, 0.264475f, + 0.342074f, -0.139817f, 0.215915f, 0.422544f, -0.321102f, + 0.0355587f, 0.460193f, 0.0315326f, 0.080556f, -0.0256533f, + -0.0857874f, -0.488283f, -0.299653f, -0.245987f, 0.104383f, + 0.203731f, 0.328734f, 0.668104f, -0.586909f, 
-0.501335f, + -0.661292f, -0.359811f, 0.00951363f, 0.816315f, -0.0124104f, + 0.0545827f, 0.089863f, 0.0125486f, 0.043609f, -0.0259544f, + 0.0123911f, 0.12557f, -0.539875f, -0.0556721f, 0.16532f, + 0.265834f, -0.384171f, 0.646496f, 0.366147f, -0.111272f, + 0.262096f, -0.0845724f, 0.382724f, 0.165783f, 0.1025f, + 0.392988f, 0.290525f, 0.038659f, 0.540269f, -0.485586f, + -0.273065f, -0.154052f, -0.0896895f, -0.35394f, 0.193214f, + -0.423728f, 0.654576f, -0.373321f, 0.814914f, 0.026278f, + -0.0328304f, -0.220913f, -0.0442121f, 0.487545f, -0.509537f, + -0.777581f, -1.23886f, 0.223482f, 0.206009f, 0.20391f, + 0.194628f, 0.226762f, 0.171609f, -0.219037f, 0.557892f, + -0.312011f, 1.27709f, 0.064013f, 0.105384f, 0.0493933f, + 0.074059f, -0.0100078f, -0.0176888f, -0.440005f, 0.302922f, + -0.197456f, 0.296128f, -0.326647f, 0.305323f, -0.30696f, + 0.201951f, -0.15874f, -0.793042f, 0.0197254f, 0.0569867f, + -0.0295468f, -0.0215012f, 0.025855f, -0.0196102f, 0.215558f, + -0.253069f, 0.298469f, 0.261269f, 0.435305f, 0.0120354f, + -0.384789f, -0.2772f, 0.0366613f, -0.494994f, 0.149072f, + 1.32981f, -0.427717f, 0.43938f, -0.16375f, -0.444342f, + 0.548214f, 0.127955f, -1.24387f, 0.0863676f, 0.175071f, + 0.172673f, -0.0906204f, 0.444454f, -0.546669f, 0.215857f, + -0.100621f, 0.200699f, -0.0985915f, 0.134706f, -0.256396f, + 0.393427f, 0.119606f, -0.214278f, -0.0183637f, 0.194266f, + -0.238025f, 0.182203f, 0.599718f, 0.846933f, 0.0607852f, + -0.183434f, -0.723743f, -0.72414f, -0.124701f, 0.0227527f, + -0.0664636f, -0.0385867f, -0.0257377f, -0.149054f, 0.12077f, + 0.678029f, -0.624456f, 0.189644f, -0.518604f, 0.134397f, + -0.189777f, -0.309376f, -0.00377086f, 0.701132f, -0.170915f, + 0.00736111f, -0.121906f, 0.329136f, 0.165514f, 0.0328356f, + 0.171275f, 0.248619f, 0.247704f, -0.449933f, 0.0841684f, + 0.136982f, 0.122703f, -0.0169439f, -0.0726496f, 0.302648f, + -0.128556f, 0.0667425f, -0.289717f, -0.207532f, -1.20269f, + -0.68892f, 0.045259f, 0.0973945f, 0.0988314f, -0.944748f, + -0.180401f, 0.134331f, 0.033834f, 0.109023f, 0.265723f, + 0.38063f, -0.106518f, -0.0686953f, 0.3744f, -1.0957f, + 0.0302782f, 0.0515164f, 0.00188222f, 0.0014413f, -0.0404425f, + 0.0124618f, -0.0828645f, 0.506166f, -0.776352f, -0.405138f, + -0.123887f, 0.0732116f, 0.379928f, 0.604524f, -0.492317f, + 0.439191f, 0.0744193f, 0.389101f, 0.0604518f, 0.0943165f, + 0.0339942f, 0.0917975f, 0.0161988f, 0.512227f, 0.538021f, + -0.411495f, 0.307281f, 0.33746f, -0.218639f, 0.265742f, + 0.39738f, -0.12442f, 0.125236f, -0.0845223f, -0.150396f, + 0.0334878f, -0.00391915f, 0.0406864f, -0.0487059f, 0.0377073f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_16[] = { + 0.0535976f, -0.0130279f, 0.150146f, -0.511132f, -0.357698f, + 0.6719f, -1.27877f, -0.0208048f, 0.0961914f, 0.263603f, + 0.704574f, -1.48998f, 0.728063f, 0.941829f, -0.199981f, + 0.797802f, -0.29816f, -0.60894f, -0.116624f, -1.16723f +}; + +static const float av1_simple_motion_search_split_logits_kernel_16[] = { + 0.343153f, -0.2110482f, -0.487199f, 0.3274144f, -2.1975f, + -0.6051438f, 0.1901127f, 0.4741924f, -0.24029f, -0.185018f, + -0.652635f, 2.57714f, -0.31033031f, -0.307222f, 0.329035f, + -0.430181f, 0.3429f, 0.742292f, 0.3269808f, 0.4142165f +}; + +static const float av1_simple_motion_search_split_logits_bias_16[] = { + -0.783658f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_16, 
+ av1_simple_motion_search_split_logits_kernel_16, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_16, + av1_simple_motion_search_split_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 17 +#define NUM_LAYER_0_UNITS_8 20 +#define NUM_LOGITS_8 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_8[] = { + 0.079443f, -1.04068f, 0.336819f, -0.20901f, 0.796251f, + 0.181066f, 0.0118876f, -0.207145f, 0.250671f, -0.402119f, + -0.0847227f, 1.88683f, 0.303469f, 0.0718458f, 0.0338589f, + 0.158896f, 0.0540238f, -0.385426f, 0.955925f, 0.424506f, + 0.492584f, -0.795058f, -0.248667f, -0.905349f, -0.316989f, + 0.545471f, 0.63762f, -0.232613f, -0.238947f, -0.395338f, + -0.322673f, -0.0761563f, -0.125357f, 0.0694415f, -0.371599f, + 0.358387f, -0.486841f, 0.403863f, -0.0295666f, 0.283074f, + -0.424396f, 0.156318f, -0.685355f, 0.6663f, 0.337949f, + 0.273198f, 0.517448f, 0.458911f, 0.157252f, 0.692096f, + 0.64965f, -0.23987f, -1.08431f, -0.252475f, -0.332614f, + -0.712291f, -0.380973f, 0.460545f, 0.48936f, 0.337601f, + 0.489223f, 1.65336f, -0.223585f, 0.17367f, -0.235057f, + -0.456773f, 0.327877f, -0.221192f, -0.940151f, -1.06616f, + 0.687084f, -0.109973f, 0.106636f, 0.445895f, 0.163432f, + 0.378306f, 0.201902f, 0.176811f, 0.693082f, 1.62156f, + -0.178346f, 0.455175f, 1.61943f, 0.231376f, 0.0890932f, + -0.889693f, -1.03298f, 0.778196f, -0.0289539f, 0.137848f, + 0.18707f, 0.171889f, 0.119157f, 0.24893f, -0.313628f, + 0.00250735f, -0.0758209f, 0.272974f, -0.229825f, 2.47926f, + -0.0354665f, 0.175366f, 0.0411555f, -1.52149f, -0.0258663f, + 0.253027f, -0.0520839f, -0.0189782f, 0.362387f, -0.371154f, + 0.622929f, 0.0447056f, 0.242529f, -0.168391f, 0.308935f, + -0.117294f, 2.16307f, 0.0673638f, 0.080771f, -0.460779f, + -0.940176f, 0.473266f, -0.0125302f, 0.475145f, -0.218187f, + 0.43258f, -0.0380196f, 0.413607f, -0.110856f, -1.52076f, + 0.0896812f, 0.246636f, -0.0612008f, 0.189583f, 0.0106902f, + -0.158403f, -0.629377f, -0.0634279f, -0.0864584f, -0.226568f, + -0.286234f, -0.0721132f, -0.43702f, 0.113702f, 0.433372f, + 0.743396f, 0.14312f, 0.29914f, 0.801188f, 0.7609f, + 0.385046f, 0.480314f, 0.171119f, -1.59058f, -1.18853f, + 0.150676f, 0.408123f, -0.00677924f, 0.398145f, 0.0914611f, + 0.176945f, 0.0677457f, 0.316478f, 0.998219f, -0.22618f, + 0.0756793f, -0.0156674f, 0.105716f, 0.0496245f, -0.0827133f, + -0.423119f, -0.161033f, 0.212962f, -0.234453f, 0.743366f, + 1.04108f, 0.0597604f, -0.285993f, -0.114829f, -0.557364f, + -0.840051f, 0.326509f, -0.192508f, -0.141769f, 0.370626f, + -0.126353f, 0.00672923f, 0.493623f, -0.852076f, 0.466798f, + -0.226436f, 0.259268f, -0.452662f, 0.0721126f, 0.0198245f, + 0.2048f, 0.02506f, 0.316194f, 0.814651f, 1.01288f, + -0.569607f, -0.0838994f, 1.37146f, -0.613135f, 0.441761f, + -0.643901f, 0.364269f, -0.147177f, 0.338001f, -0.332376f, + 0.518875f, -0.628964f, -0.291889f, -0.050736f, 0.108047f, + 1.05673f, 0.0479492f, 0.466756f, -0.0867334f, -0.0355575f, + 0.57626f, -0.227583f, -0.146421f, 0.0990489f, 0.117351f, + -0.103858f, -0.0336936f, 0.0201903f, -0.0766383f, -0.010211f, + 0.0400779f, 0.0725462f, 0.137142f, 0.478261f, 0.287869f, + 0.0882359f, -0.739754f, -0.853521f, -0.43703f, 0.316856f, + 0.27593f, 0.312149f, 0.175575f, 0.441839f, 0.264325f, + 0.0148051f, -0.005559f, 0.373176f, 0.933701f, -0.0197615f, + 0.0219723f, -0.0559883f, -0.103456f, -0.0323009f, 0.0773202f, + -0.390838f, 0.855488f, -0.596525f, 
-0.249093f, 0.124262f, + 0.220172f, 0.0552478f, 1.04041f, -0.960992f, -0.495255f, + -0.211612f, 0.350007f, -0.238998f, -0.0265068f, 0.384686f, + -0.0815808f, -0.0570019f, 0.123903f, -0.485114f, -0.00282573f, + -0.0649603f, 0.163719f, -0.469479f, -0.439713f, 0.0602562f, + -0.527993f, -0.111458f, 2.48686f, -0.180723f, 0.0553895f, + 0.0560679f, -0.0978928f, -0.216063f, 0.089457f, -1.5602f, + -1.62332f, -0.147388f, 0.736155f, 0.440409f, 0.243519f, + 0.0622638f, 0.522932f, 0.109686f, 0.422849f, 0.510589f, + 1.01116f, 0.174019f, 0.0191171f, -0.0717751f, -0.0068308f, + 0.172932f, -0.834888f, -0.635788f, 0.32012f, 0.298656f, + 0.274309f, -0.155456f, 0.1755f, -0.175171f, 0.343498f, + -0.122832f, -0.107696f, 0.279924f, -0.797633f, -0.344658f, + 0.162669f, 0.389092f, 0.644479f, -0.635216f, -0.181868f, + 0.0579244f, -0.0568976f, 0.433003f, -0.591067f, 0.71013f, + -0.165515f, 0.225725f, -0.358156f, 0.0541944f, 1.95485f, + -0.315223f, 0.61537f, -0.0401568f, 0.22811f, 0.271147f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_8[] = { + 1.63441f, -0.616459f, -0.437775f, -0.71669f, 1.56616f, 2.28109f, 1.64054f, + -1.51476f, 0.0274108f, 0.935156f, -0.966329f, 0.906069f, 1.19954f, -1.25867f, + -1.7376f, -0.594211f, 0.322242f, 0.438631f, -1.01682f, 1.30032f +}; + +static const float av1_simple_motion_search_split_logits_kernel_8[] = { + -0.463187f, 0.2936127f, 0.16762f, -0.1663271f, -0.292418f, + -0.421457f, -0.378265f, 1.053049f, 0.32432879f, -0.49775575f, + 0.427357f, -0.239251f, -0.1631546f, 0.335468f, 0.255371f, + 0.276901f, -0.665683f, -0.7021493f, 0.381513f, -0.1339761f +}; + +static const float av1_simple_motion_search_split_logits_bias_8[] = { + -1.739754f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_8 = { + NUM_FEATURES_8, + NUM_LOGITS_8, + NUM_HIDDEN_LAYERS_8, + { + NUM_LAYER_0_UNITS_8, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_8, + av1_simple_motion_search_split_logits_kernel_8, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_8, + av1_simple_motion_search_split_logits_bias_8, + }, +}; + +#undef NUM_HIDDEN_LAYERS_8 +#undef NUM_FEATURES_8 +#undef NUM_LAYER_0_UNITS_8 +#undef NUM_LOGITS_8 + +static const NN_CONFIG *const av1_simple_motion_search_split_nn_config[5] = { + &av1_simple_motion_search_split_nn_config_128, + &av1_simple_motion_search_split_nn_config_64, + &av1_simple_motion_search_split_nn_config_32, + &av1_simple_motion_search_split_nn_config_16, + &av1_simple_motion_search_split_nn_config_8, +}; + +// Model based on simple_motion_search for pruning rect +// Thresholds. 
The first idx level is aggresiveness, second is frame resolution, +// third is bsize +static const float av1_simple_motion_search_prune_rect_thresh[4][3][5] = { + // Aggressivness = 0 + { + // Lowres + { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f, + 0.000961189195907f, 0.0f }, + // Midres + { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f, + 0.000961189195907f, 0.0f }, + // Hdres + { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f, + 0.000961189195907f, 0.0f }, + }, + // Aggressivness = 1 + { + // Lowres + { + 0.000000f, + 0.116076f, + 0.049759f, + 0.057747f, + 0.006001f, + }, + // Midres + { + 0.000000f, + 0.017380f, + 0.026077f, + 0.078111f, + 0.064477f, + }, + // Hdres + { + 0.002994f, + 0.103093f, + 0.076408f, + 0.010456f, + 0.187211f, + }, + }, + // Aggressiveness = 2 + { + // Lowres + { + 0.000000f, + 0.003111f, + 0.144294f, + 0.144884f, + 0.069924f, + }, + // Midres + { + 0.000000f, + 0.013696f, + 0.055203f, + 0.152271f, + 0.078886f, + }, + // Hdres + { + 0.030577f, + 0.082486f, + 0.040690f, + 0.140924f, + 0.067608f, + }, + }, + // Aggressiveness = 3 + { + // Lowres + { 0.0f, 0.352338114654f, 0.171190796972f, 0.322629318068f, + 0.287219697095f }, + // Midres + { 0.0f, 0.30938393361f, 0.271772875141f, 0.240627957104f, + 0.178833795641f }, + // Hdres + { 0.285731215187f, 0.37521798723f, 0.142380566244f, 0.338288917819f, + 0.21329309279f }, + }, +}; + +// Mean and std +static const float av1_simple_motion_search_prune_rect_mean_128[25] = { + 13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f, + 10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f, + 12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f, + 12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f, + 4.012611f, 4.052191f, 0.853365f, 3.954503f, 3.944135f, +}; + +static const float av1_simple_motion_search_prune_rect_std_128[25] = { + 2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f, + 3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f, + 2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f, + 1.208679f, 0.353742f, 1.228122f, 1.211777f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_64[25] = { + 11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f, + 9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f, + 10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f, + 10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f, + 3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f, +}; + +static const float av1_simple_motion_search_prune_rect_std_64[25] = { + 2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f, + 3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f, + 2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f, + 1.081292f, 0.257521f, 1.112510f, 1.089404f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_32[25] = { + 9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f, + 7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f, + 8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f, + 2.751266f, 0.963302f, 2.716584f, 2.709725f, +}; + +static const float av1_simple_motion_search_prune_rect_std_32[25] = { + 1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f, + 1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f, + 1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f, 
0.973824f, + 0.952221f, 0.188018f, 0.985295f, 0.946228f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_16[25] = { + 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f, + 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f, + 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f, + 2.131698f, 0.981005f, 2.110868f, 2.106539f, +}; + +static const float av1_simple_motion_search_prune_rect_std_16[25] = { + 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f, + 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f, + 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f, + 0.829935f, 0.136507f, 0.828972f, 0.808563f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_8[25] = { + 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f, + 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f, + 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f, + 1.531762f, 0.989606f, 1.496581f, 1.484139f, +}; + +static const float av1_simple_motion_search_prune_rect_std_8[25] = { + 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f, + 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f, + 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f, + 0.754040f, 0.101419f, 0.738239f, 0.729455f, +}; + +static const float *const av1_simple_motion_search_prune_rect_mean[5] = { + av1_simple_motion_search_prune_rect_mean_128, + av1_simple_motion_search_prune_rect_mean_64, + av1_simple_motion_search_prune_rect_mean_32, + av1_simple_motion_search_prune_rect_mean_16, + av1_simple_motion_search_prune_rect_mean_8, +}; + +static const float *const av1_simple_motion_search_prune_rect_std[5] = { + av1_simple_motion_search_prune_rect_std_128, + av1_simple_motion_search_prune_rect_std_64, + av1_simple_motion_search_prune_rect_std_32, + av1_simple_motion_search_prune_rect_std_16, + av1_simple_motion_search_prune_rect_std_8, +}; + +#define NUM_HIDDEN_LAYERS_128 1 +#define NUM_FEATURES_128 25 +#define NUM_LAYER_0_UNITS_128 8 +#define NUM_LOGITS_128 4 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_128[] = { + -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f, + -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f, + 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f, + -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f, + 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f, + 0.398452f, 0.696949f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_128[] = { + 1.22789f, -1.34527f, 0.759048f, 0.315086f, + 1.0834f, -1.58019f, -0.465158f, 1.20716f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_128[] = { + -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f, + 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f, + -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f, + 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f, + -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f, + -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f, + -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f, + 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f, + 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f, + 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f, 
+ 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f, + -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f, + 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f, + -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f, + -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f, + 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f, + -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f, + 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f, + 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f, + -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f, + 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f, + -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f, + -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f, + -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f, + 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f, + -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f, + 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f, + -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f, + 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f, + 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f, + -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f, + -0.0413654f, -0.0400194f, 0.615981f, -0.452094f, 0.644555f, + 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f, + -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f, + 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f, + 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f, + -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f, + 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f, + 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f, + -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_128[] = { + 1.58571f, -4.6314f, -2.00273f, 0.543699f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_128 = { + NUM_FEATURES_128, + NUM_LOGITS_128, + NUM_HIDDEN_LAYERS_128, + { + NUM_LAYER_0_UNITS_128, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_128, + av1_simple_motion_search_prune_rect_logits_kernel_128, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_128, + av1_simple_motion_search_prune_rect_logits_bias_128, + }, +}; + +#undef NUM_HIDDEN_LAYERS_128 +#undef NUM_FEATURES_128 +#undef NUM_LAYER_0_UNITS_128 +#undef NUM_LOGITS_128 + +#define NUM_HIDDEN_LAYERS_64 1 +#define NUM_FEATURES_64 25 +#define NUM_LAYER_0_UNITS_64 32 +#define NUM_LOGITS_64 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_64[] = { + 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f, + -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f, + 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f, + -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f, + 0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f, + 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f, + 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f, + -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f, + -0.0199961f, -0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f, + -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f, + 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f, + -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f, + -0.0566277f, 0.364831f, 
0.611298f, -0.495253f, -0.0193132f, 0.617978f, + 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f, + 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f, + -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f, + -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f, + 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f, + 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f, + 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f, + -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f, + 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f, + -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f, + -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f, + -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f, + -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f, + 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f, + 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f, + 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f, + -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f, + -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f, + -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f, + -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f, + -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f, + -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f, + -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f, + -0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f, + -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f, + -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f, + -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f, + -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f, + -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f, + 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f, + 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f, + -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f, + 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f, + -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f, + -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f, + -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f, + -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f, + -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f, + -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f, + -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f, + -0.359633f, 0.668108f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_64[] = { + 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, -0.683467f, 0.155765f, + -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f, + 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f, + -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f, + 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f, + 0.656818f, 0.0169274f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_64[] = { + -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f, + 0.300429f, 
0.215072f, -0.454074f, 0.187565f, 0.282742f, + 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f, + -0.388722f, -0.146866f, -0.275946f, 0.202361f, 0.225847f, + 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f, + 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f, + -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f, + 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f, + -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f, + 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f, + -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f, + -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f, + -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f, + 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f, + 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f, + 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f, + -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f, + -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f, + 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f, + 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f, + -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f, + 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f, + -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f, + 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f, + 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f, + -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f, + 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f, + -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f, + -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f, + 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f, + -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f, + 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f, + -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f, + -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f, + 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f, + -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f, + -0.0625902f, 0.29394f, 0.302315f, 0.0892226f, -0.209504f, + -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f, + -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f, + -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f, + -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f, + -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f, + -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f, + 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f, + 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f, + 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f, + -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f, + 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f, + -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f, + -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f, + 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f, + 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f, + 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f, + -0.260993f, -0.947743f, -1.0789f, -0.0391231f, 0.612407f, + -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f, + 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f, + -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f, + 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f, + -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f, + -0.0423487f, 0.0328702f, -0.0154263f, 
0.0349021f, -0.00315595f, + 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f, + 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f, + -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f, + -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f, + 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f, + -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f, + 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f, + -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f, + -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f, + -0.0293145f, -0.0405071f, -0.035662f, -0.012871f, -0.0516409f, + -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f, + 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f, + -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f, + 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f, + 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f, + -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f, + -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f, + -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f, + -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f, + 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f, + 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f, + 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f, + -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f, + 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f, + 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f, + -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f, + -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f, + 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f, + 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f, + -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f, + -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f, + -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f, + -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f, + 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f, + -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f, + -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f, + -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f, + -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f, + 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f, + -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f, + -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f, + 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f, + 0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f, + -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f, + 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f, + -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f, + -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f, + -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f, + 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f, + -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f, + -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f, + 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f, + -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f, + 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f, + -0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f, + -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f, + 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f, + -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f, + 
0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f, + 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f, + -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f, + -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f, + -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f, + -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f, + -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f, + 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f, + -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f, + 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f, + 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f, + 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f, + 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f, + -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f, + 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f, + 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f, + -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, -0.0660535f, + -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f, + -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f, + -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f, + 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f, + 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f, + 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f, + 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f, + -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f, + 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f, + -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f, + -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f, + 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f, + 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f, + -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f, + 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f, + 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f, + 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f, + 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f, + 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f, + -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f, + -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f, + 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f, + -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f, + -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f, + -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_64[] = { + 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f, + -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_64 = { + NUM_FEATURES_64, + NUM_LOGITS_64, + NUM_HIDDEN_LAYERS_64, + { + NUM_LAYER_0_UNITS_64, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_64, + av1_simple_motion_search_prune_rect_logits_kernel_64, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_64, + av1_simple_motion_search_prune_rect_logits_bias_64, + }, +}; + +#undef NUM_HIDDEN_LAYERS_64 +#undef NUM_FEATURES_64 +#undef NUM_LAYER_0_UNITS_64 +#undef NUM_LOGITS_64 + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 25 +#define NUM_LAYER_0_UNITS_32 28 +#define NUM_LOGITS_32 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_32[] = { + 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f, + 0.0943619f, 
-0.429442f, -0.207442f, 0.959963f, 0.618666f, + -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f, + 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f, + -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f, + -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f, + -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f, + 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f, + 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f, + 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f, + -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f, + 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f, + -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f, + 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f, + -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f, + 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f, + -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f, + 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f, + 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f, + -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f, + 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f, + -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f, + 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f, + 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f, + 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f, + -0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f, + -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f, + -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f, + 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f, + -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f, + -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f, + -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f, + -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f, + 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f, + 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f, + 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f, + -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f, + -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f, + 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f, + 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f, + -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f, + 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f, + -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f, + -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f, + 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f, + 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f, + -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f, + -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f, + -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f, + -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f, + 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f, + -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f, + -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f, + -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f, + -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f, + -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_32[] = { + 0.940498f, 0.15602f, -0.234831f, 0.0268585f, 0.144769f, 0.243081f, + 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f, + 0.0909503f, 0.710595f, 0.032786f, 
0.525891f, -1.0232f, 0.732557f, + -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f, + 0.59681f, -0.472405f, 0.0969218f, -0.250624f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_32[] = { + 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f, + -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f, + -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f, + 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f, + 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f, + -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f, + 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f, + -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f, + -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f, + -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f, + 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f, + -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f, + 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f, + 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f, + -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f, + 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f, + -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f, + 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f, + 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f, + 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f, + -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f, + 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f, + -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f, + 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f, + -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f, + -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f, + -0.0611238f, 0.358499f, 0.0807514f, 0.208254f, 0.214499f, + 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f, + -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f, + 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f, + -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f, + 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f, + 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f, + -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f, + 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f, + -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f, + -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f, + -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f, + 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f, + 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f, + -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f, + 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f, + -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f, + -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f, + 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f, + 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f, + -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f, + 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f, + -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f, + -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f, + 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f, + 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f, + -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f, + 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f, + -0.0552129f, -0.126362f, 
-0.176945f, 0.0653115f, 0.0989368f, + -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f, + -0.00300509f, 0.317105f, 0.216852f, 0.479718f, 0.0485808f, + -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f, + -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f, + -0.381171f, 0.467251f, -0.122872f, -0.167441f, 0.017253f, + -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f, + 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f, + -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f, + -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f, + 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f, + -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f, + 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f, + 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f, + -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f, + 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f, + -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f, + 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f, + -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f, + 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f, + 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f, + -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f, + 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f, + 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f, + -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f, + 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f, + -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f, + 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f, + -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f, + -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f, + -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f, + -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f, + 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f, + 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f, + 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f, + 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f, + -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f, + -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f, + 0.0298033f, -0.130515f, -0.121799f, -0.104915f, 0.208822f, + -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f, + 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f, + -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f, + 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f, + -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f, + -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f, + -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f, + -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f, + -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f, + -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f, + 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f, + 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f, + -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f, + 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f, + 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f, + -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f, + 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f, + 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f, + -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f, + -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, 
-0.0417497f, + -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f, + 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f, + -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f, + 0.0340241f, -0.0844545f, 0.61729f, -0.17596f, 0.241149f, + -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f, + 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f, + -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f, + 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f, + 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f, + -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f, + -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f, + -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 0.0536673f, + -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f, + -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f, + -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f, + -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f, + -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f, + -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f, + 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f, + -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f, + 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f, + 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f, + -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f, + 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f, + -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f, + 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f, + -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_32[] = { + 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f, + -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_32, + av1_simple_motion_search_prune_rect_logits_kernel_32, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_32, + av1_simple_motion_search_prune_rect_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 25 +#define NUM_LAYER_0_UNITS_16 32 +#define NUM_LOGITS_16 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_16[] = { + -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f, + 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f, + -0.281406f, 0.3413f, 0.456255f, 0.33307f, 0.2942f, + 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f, + -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f, + 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f, + 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f, + -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f, + 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f, + 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f, + -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f, + 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f, + -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f, + -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f, + -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f, + -0.0923519f, 0.544509f, -0.280991f, -0.017437f, 
-0.202721f, + -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f, + 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f, + -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f, + -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f, + 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f, + -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f, + 0.238191f, -1.19083f, -0.30667f, -2.4324f, 0.235311f, + 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f, + 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f, + -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f, + -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f, + -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f, + -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f, + -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f, + 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f, + -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f, + -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f, + 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f, + -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f, + -0.187237f, 0.113163f, -1.86337f, -0.367544f, -0.547048f, + -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f, + -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f, + -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f, + 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f, + -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f, + -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f, + -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f, + -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f, + -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f, + -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f, + 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f, + -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f, + -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f, + 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f, + 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f, + -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f, + -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f, + -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f, + 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f, + -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f, + 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f, + 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f, + -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f, + 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f, + -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f, + -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f, + -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f, + -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_16[] = { + -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f, + -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f, + 1.13719f, 0.606545f, -0.32193f, -0.150788f, 0.158487f, -0.224005f, + 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f, + -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f, + 0.661496f, 0.95533f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_16[] = { + -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f, + 0.512219f, 0.164205f, 
0.00326062f, -0.41914f, -0.400334f, + 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f, + -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f, + -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f, + -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f, + -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f, + -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f, + 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f, + 0.048868f, -0.0097675f, 0.0708324f, 0.0456103f, 0.0149062f, + -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f, + -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f, + -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f, + 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f, + -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f, + -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f, + 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f, + 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f, + 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f, + -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f, + 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f, + -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f, + -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f, + 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f, + 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f, + 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f, + -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f, + 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f, + 0.0675856f, 0.120627f, 0.391408f, -0.135249f, -0.357024f, + 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f, + -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f, + -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f, + -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f, + -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f, + -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f, + -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f, + -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f, + 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f, + -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f, + -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f, + -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f, + 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f, + -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f, + 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f, + 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f, + -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f, + 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f, + 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f, + -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f, + 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f, + -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f, + 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f, + -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f, + 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f, + -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f, + 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f, + 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f, + -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f, + 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f, + 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f, + 
0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f, + 0.233812f, -0.0180273f, 0.121082f, -0.209096f, 0.151437f, + 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f, + -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f, + 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f, + -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f, + -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f, + -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f, + 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f, + 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f, + 0.310292f, -0.297675f, -0.359935f, 0.521021f, -0.10082f, + -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f, + 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f, + -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f, + -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f, + -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f, + -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f, + -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f, + -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f, + -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f, + 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f, + -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f, + 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f, + 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f, + 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f, + -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f, + 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f, + 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f, + -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f, + 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f, + -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f, + 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f, + -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f, + -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f, + 0.151792f, -0.075579f, 0.443519f, 0.0311335f, -0.0328222f, + -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f, + -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f, + -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f, + -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f, + -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f, + -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f, + 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f, + -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f, + 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f, + 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f, + -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f, + -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f, + -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f, + 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f, + -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f, + -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f, + -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f, + -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f, + 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f, + 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f, + -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f, + -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f, + -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f, + 0.0354961f, 
0.103915f, 0.508571f, 0.329911f, -0.0425999f, + -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f, + 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f, + 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f, + -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f, + 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f, + -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f, + -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f, + 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 0.139289f, + 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f, + -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f, + -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f, + -0.109533f, -0.709307f, 0.386424f, 0.40201f, 0.262211f, + -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f, + 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f, + -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f, + -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f, + -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f, + -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f, + 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f, + 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f, + -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f, + -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f, + 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f, + 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f, + -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f, + 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f, + -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f, + 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f, + -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f, + 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f, + 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f, + 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f, + 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f, + 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f, + -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f, + 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f, + 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f, + -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f, + -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f, + 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f, + -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_16[] = { + 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f, + -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_16, + av1_simple_motion_search_prune_rect_logits_kernel_16, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_16, + av1_simple_motion_search_prune_rect_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 25 +#define NUM_LAYER_0_UNITS_8 32 +#define NUM_LOGITS_8 4 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_8[] = { + -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f, + 0.0243477f, -0.356748f, 
0.0143051f, -0.16403f, -0.139013f, 0.175003f, + -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f, + 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f, + -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f, + -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f, + 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f, + -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f, + -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f, + 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f, + -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f, + 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f, + -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f, + 0.223768f, -0.0710733f, -0.346679f, -0.0745909f, 0.171032f, 0.215701f, + 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, -0.401578f, + -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f, + -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f, + -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f, + 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f, + -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f, + -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f, + -0.112242f, 0.295184f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_8[] = { + -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f, + -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f, + -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f, + 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f, + -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f, + -0.490783f, -0.415782f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_8[] = { + -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f, + 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f, + 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f, + -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f, + -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f, + -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f, + -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f, + 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f, + 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f, + 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f, + -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f, + -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f, + 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f, + 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f, + 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f, + 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f, + -0.0824632f, -0.128561f, -0.327603f, 0.105624f, 0.567581f, + -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f, + 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f, + -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f, + -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f, + -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f, + 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f, + -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f, + 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f, + -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f, + 
0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f, + -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f, + -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f, + 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f, + -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f, + 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f, + 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f, + 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f, + 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f, + 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f, + 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f, + -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f, + 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f, + -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f, + -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f, + 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f, + -0.529777f, 0.682967f, -0.412052f, 0.611947f, -0.83676f, + 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f, + -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f, + -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f, + 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f, + 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f, + 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f, + 0.392771f, -0.389961f, -0.261585f, -0.127124f, -0.202945f, + -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f, + 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f, + -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f, + -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f, + -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f, + -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f, + 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f, + 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f, + -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f, + 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f, + -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f, + 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f, + 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f, + 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f, + -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f, + -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f, + 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f, + -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f, + -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f, + -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f, + 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f, + -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f, + -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f, + 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f, + -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f, + -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f, + -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f, + 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f, + 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f, + 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f, + -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f, + 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f, + -0.002757f, -0.0421354f, -0.247857f, 0.140827f, 0.383576f, + 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f, + 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f, + 
-0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f, + 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f, + 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f, + -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f, + -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f, + -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f, + 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f, + 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f, + -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f, + -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f, + -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f, + 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f, + -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f, + 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f, + 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f, + -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f, + 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f, + -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f, + 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f, + 0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f, + -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f, + 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f, + -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f, + 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f, + 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f, + -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f, + 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f, + -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f, + 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f, + 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f, + 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f, + 0.0714626f, -0.716477f, -0.441865f, -0.717028f, -0.149176f, + 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f, + 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f, + -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f, + -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f, + -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f, + 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f, + 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f, + 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f, + -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f, + 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f, + -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f, + -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f, + -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f, + -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f, + 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f, + -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f, + 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f, + 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f, + 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f, + -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f, + 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f, + -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f, + -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f, + -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f, + -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f, + 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f, + 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f, + -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 
0.44781f, + -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f, + -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f, + 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f, + -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f, + 0.411766f, 0.391987f, 0.34283f, -0.114077f, 0.258462f, + -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f, + 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f, + -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f, + -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f, + 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f, + 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f, + -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f, + -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f, + -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f, + -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_8[] = { + 1.63404f, -0.715866f, -1.0132f, -2.08745f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_8 = { + NUM_FEATURES_8, + NUM_LOGITS_8, + NUM_HIDDEN_LAYERS_8, + { + NUM_LAYER_0_UNITS_8, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_8, + av1_simple_motion_search_prune_rect_logits_kernel_8, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_8, + av1_simple_motion_search_prune_rect_logits_bias_8, + }, +}; + +#undef NUM_HIDDEN_LAYERS_8 +#undef NUM_FEATURES_8 +#undef NUM_LAYER_0_UNITS_8 +#undef NUM_LOGITS_8 + +static const NN_CONFIG + *const av1_simple_motion_search_prune_rect_nn_config[5] = { + &av1_simple_motion_search_prune_rect_nn_config_128, + &av1_simple_motion_search_prune_rect_nn_config_64, + &av1_simple_motion_search_prune_rect_nn_config_32, + &av1_simple_motion_search_prune_rect_nn_config_16, + &av1_simple_motion_search_prune_rect_nn_config_8, + }; + +// nn model for predicting max square partition level of a superblock +#define NUM_HIDDEN_LAYERS 1 +#define NUM_FEATURES 13 +#define NUM_LAYER_0_UNITS 48 +#define NUM_LOGITS 4 + +static const float av1_max_part_pred_logits_kernel[] = { + -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f, + 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f, + 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f, + 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f, + 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f, + 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f, + -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f, + 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f, + -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f, + -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f, + 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f, + 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f, + -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f, + 0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f, + -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f, + -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f, + 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f, + 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f, + 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f, + 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f, + -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f, + 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f, + 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f, + 0.0410244f, 0.131529f, 
0.0239622f, -0.0749436f, -0.0224914f, + 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f, + 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f, + 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f, + 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f, + -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f, + -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f, + -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f, + 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f, + -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f, + 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f, + 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f, + -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f, + 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f, + 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f, + 0.208747f, 0.448697f +}; + +static const float av1_max_part_pred_layer_0_bias[] = { + -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f, + 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f, + -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f, + -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f, + -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f, + -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, -6.32309f, + -0.332426f, -0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f, + 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f +}; + +static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f, + 1.96217f, 0.728905f }; + +static const float av1_max_part_pred_layer_0_kernel[] = { + 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f, + -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f, + -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f, + 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f, + -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f, + -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f, + -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f, + -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f, + 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f, + -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f, + -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f, + -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f, + -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f, + 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f, + -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f, + -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f, + 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f, + -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f, + -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f, + 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f, + -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f, + -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f, + 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f, + -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f, + -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f, + -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f, + -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f, + -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f, + -0.00773651f, -0.0265721f, -0.906346f, 1.68504f, 0.084257f, + -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f, + 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f, + -0.0910426f, -0.666567f, 
-0.315339f, 0.123124f, -2.66375f, + -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f, + -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f, + 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f, + -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f, + -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f, + 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f, + 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f, + -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f, + -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f, + -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f, + 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f, + -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f, + -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f, + -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f, + -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f, + -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f, + 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f, + 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f, + 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f, + -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f, + -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f, + -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f, + -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f, + -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f, + 0.0710224f, -0.16548f, -0.100993f, 0.931481f, -3.20738f, + -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f, + 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f, + -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f, + 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f, + -0.802839f, 0.599977f, 0.64552f, -2.08103f, -0.503401f, + -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f, + 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f, + 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f, + -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f, + 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f, + 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f, + -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f, + 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f, + -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f, + -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f, + 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f, + 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f, + 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f, + -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f, + -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f, + -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f, + -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f, + -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f, + 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f, + -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f, + 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f, + -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f, + -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f, + -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f, + -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f, + -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f, + -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f, + 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f, + -2.53833f, -2.72203f, 0.672846f, -0.503094f, 
-1.1374f, + 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f, + 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f, + -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f, + -1.18306f, 0.626845f, -0.426925f, -0.688371f, 0.415062f, + 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f, + -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f, + -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f, + 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f, + 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f, + -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f, + -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f, + 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f, + -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f, + 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f, + 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f, + 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f, + 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f, + -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f, + -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f, + -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f, + 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f, + 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f, + -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f, + -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f, + -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f, + 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f, + -0.265823f, 1.15284f, 0.307927f, -0.695308f, 0.13725f, + -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f, + -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f, + 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f, + 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f, + -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f, + -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f, + 1.36966f, 0.869475f, -0.0302774f, -0.0537556f +}; + +static const NN_CONFIG av1_max_part_pred_nn_config = { + NUM_FEATURES, + NUM_LOGITS, + NUM_HIDDEN_LAYERS, + { + NUM_LAYER_0_UNITS, + }, + { + av1_max_part_pred_layer_0_kernel, + av1_max_part_pred_logits_kernel, + }, + { + av1_max_part_pred_layer_0_bias, + av1_max_part_pred_logits_bias, + }, +}; + +#undef NUM_HIDDEN_LAYERS +#undef NUM_FEATURES +#undef NUM_LAYER_0_UNITS +#undef NUM_LOGITS + +// Early termination in second pass +static const float av1_simple_motion_search_term_none_mean_128[28] = { + 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f, + 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f, + 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f, + 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f, + 4.298179f, 8.514713f, 14.911736f, 19.825352f, +}; + +static const float av1_simple_motion_search_term_none_std_128[28] = { + 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f, + 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f, + 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f, + 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f, +}; + +static const float av1_simple_motion_search_term_none_mean_64[28] = { + 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f, + 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f, + 10.100875f, 10.045429f, 10.069688f, 10.013173f, 
10.082980f, 10.024640f, + 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f, + 3.573322f, 8.807137f, 13.348477f, 18.269117f, +}; + +static const float av1_simple_motion_search_term_none_std_64[28] = { + 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f, + 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f, + 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f, + 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f, +}; + +static const float av1_simple_motion_search_term_none_mean_32[28] = { + 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 8.012570f, + 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f, + 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f, + 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f, +}; + +static const float av1_simple_motion_search_term_none_std_32[28] = { + 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f, + 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f, + 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f, + 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f, +}; + +static const float av1_simple_motion_search_term_none_mean_16[28] = { + 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f, + 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f, + 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f, + 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f, +}; + +static const float av1_simple_motion_search_term_none_std_16[28] = { + 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f, + 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f, + 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f, + 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f, +}; + +static const float av1_simple_motion_search_term_none_model_128[] = { + -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f, + 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f, + 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f, + 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f, + -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f, + 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f, + 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f, + -0.5493146094f, +}; + +static const float av1_simple_motion_search_term_none_model_64[] = { + -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f, + 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f, + 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f, + -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f, + -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f, + 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f, + 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f, + -0.4337360901f, +}; + +static const float av1_simple_motion_search_term_none_model_32[] = { + -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f, + 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f, + 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f, + -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f, + -0.0643197251f, 0.0279496470f, 
0.9904395769f, -0.0095178685f, + 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f, + 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f, + -0.6609679881f, +}; + +static const float av1_simple_motion_search_term_none_model_16[] = { + -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f, + 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f, + 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f, + -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f, + 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f, + 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f, + 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f, + -0.5396254205f, +}; + +#define FEATURES 31 +#define HIDDEN_NODES 32 +static const float av1_early_term_after_split_nn_weights_64_layer0[] = { + -0.306296f, -0.691664f, 0.335148f, -0.298465f, -0.509241f, -0.632796f, + -0.527979f, -0.009904f, -0.503646f, -0.494002f, -0.575101f, 0.239911f, + -0.413312f, -0.622825f, -0.405448f, -0.419103f, -0.505903f, -0.392550f, + -0.240293f, 0.121749f, -0.489777f, -0.756647f, 0.001047f, -0.016528f, + 0.145714f, 0.172910f, 0.086197f, 0.162882f, -0.070588f, -0.077104f, + 0.502730f, -0.244954f, 0.265605f, -0.323994f, 0.223397f, -1.086453f, + 0.391886f, 0.200343f, 0.253878f, 0.018925f, 0.201819f, -0.205136f, + 0.427314f, 0.041155f, 0.070484f, 0.159925f, -0.057095f, -0.146544f, + -0.073792f, 0.152628f, 0.003986f, -0.515965f, -0.209754f, 0.037457f, + 0.070622f, -0.143571f, -0.059602f, 0.111734f, 0.319674f, 0.149894f, + -0.219883f, 0.206678f, 0.015809f, -0.210549f, 0.130156f, -0.189502f, + -0.850392f, -0.156363f, -0.060354f, 0.189044f, 0.266495f, 0.151305f, + -0.563677f, -0.354896f, 0.300637f, 0.257568f, -0.008359f, -0.535497f, + -0.003127f, 0.293054f, -0.020212f, -0.157278f, 0.229972f, -0.309799f, + -0.329927f, -0.077140f, 0.001177f, -0.024415f, 0.134044f, -0.181587f, + -0.135380f, 0.230989f, -0.281451f, 0.912282f, 0.511562f, -3.900779f, + -0.039917f, 1.956406f, -0.357589f, 0.292998f, -0.950158f, 0.422041f, + 0.526572f, 0.605746f, -0.147110f, 0.256576f, 0.090010f, 0.221641f, + 0.029763f, 0.351592f, 0.458324f, -0.005888f, 0.010521f, -0.389326f, + -0.094006f, -0.171489f, -0.013153f, 0.026333f, -0.454571f, -1.932891f, + -0.168211f, 0.051298f, -0.258061f, -0.028936f, -0.555937f, -0.475566f, + -0.304046f, -0.318113f, 0.099697f, -0.217145f, 0.139433f, -0.203986f, + -0.164012f, 0.051527f, 0.138603f, -0.085100f, -0.082887f, -0.242955f, + -0.663410f, -0.535772f, -0.181665f, -0.197883f, 0.071319f, 0.135086f, + 0.146200f, 0.184827f, -0.199041f, 0.162570f, -0.300167f, 0.017748f, + -0.140111f, 0.103553f, 0.206929f, 0.193446f, 0.123141f, -1.201898f, + -0.052254f, -0.750121f, 0.111741f, 0.204092f, -0.166266f, 0.124008f, + -0.455496f, 0.306035f, 0.275903f, 0.193599f, -0.730011f, 0.126808f, + 0.051059f, 0.103634f, -0.044334f, 0.048889f, 0.405228f, 0.574099f, + 0.061167f, 0.260576f, 0.070032f, -0.038040f, 0.229183f, -0.243269f, + -0.130116f, -0.538563f, -0.070199f, -0.129249f, -0.205153f, -0.268530f, + -0.290828f, -0.233006f, 0.068712f, 0.618085f, -0.407008f, 0.686868f, + 0.172247f, 0.826287f, -0.002672f, 0.239825f, -0.051548f, 0.420773f, + 0.218747f, 0.041057f, -0.071189f, 0.286987f, -0.113915f, 0.122561f, + 0.013979f, -0.049046f, 0.148175f, 0.031313f, -0.248601f, 0.209488f, + 0.069008f, 0.072763f, 0.332475f, 0.079986f, -0.151042f, -0.205110f, + -0.155550f, -0.510408f, 0.330429f, 0.577729f, 0.266524f, -0.378489f, + 0.228204f, 0.055318f, 0.117583f, -0.588557f, -0.778201f, 0.434622f, + -0.227820f, 
0.611642f, 0.170548f, 0.817761f, 0.006642f, -1.005794f, + -0.911490f, 1.633684f, -0.290664f, 0.308128f, 0.295986f, 0.243377f, + -0.001275f, -0.131156f, 0.275205f, -0.041865f, -0.201951f, -0.016380f, + 0.336604f, -0.258118f, 0.890810f, 0.441065f, -0.968006f, 0.135989f, + -1.447191f, 0.353426f, -0.343235f, 0.376837f, -0.071602f, -0.319639f, + -0.072347f, 0.547450f, -0.215380f, 0.182141f, -0.066186f, 0.033787f, + 0.257482f, 0.217428f, -0.130249f, 0.057525f, 0.263991f, 0.230664f, + -0.245113f, 0.048610f, -0.079955f, 0.251737f, -0.070368f, -0.017968f, + -0.151815f, 0.025945f, -0.257769f, 0.299735f, 0.077263f, -0.565526f, + 0.326263f, 0.096429f, 0.113414f, 0.092754f, -0.141908f, 0.172060f, + 0.393117f, -0.216755f, 0.331051f, -0.363369f, -0.113363f, -0.095164f, + -0.072784f, 0.214572f, 0.010993f, 0.209456f, 0.260381f, -0.314747f, + -0.422173f, -0.189963f, -0.225130f, 0.339448f, 0.153814f, 0.265616f, + -0.103575f, -0.123841f, -0.106236f, 0.155894f, -0.156264f, -1.361406f, + -0.040736f, -0.614998f, -0.468200f, -0.266505f, -0.342786f, -0.908088f, + 0.105758f, 0.040788f, -0.313589f, -1.359318f, 0.071329f, 0.176404f, + -0.476141f, 0.010108f, -0.201440f, -0.221167f, -0.197448f, -0.013927f, + -0.610270f, -0.607285f, 0.178070f, 0.174320f, 0.313115f, 0.026191f, + -0.112330f, 0.122338f, -0.367751f, 0.196794f, 0.153709f, -0.205454f, + -0.397471f, -1.879336f, -0.030129f, 0.143429f, -0.079832f, 0.435259f, + -1.729539f, 0.518301f, -0.141393f, 0.199399f, -1.914601f, 0.142865f, + -0.219899f, 0.508458f, 0.086365f, -0.220740f, -0.012507f, 1.263320f, + 0.042136f, 0.050922f, -0.329644f, -0.188198f, 0.251522f, 0.394731f, + -0.047866f, -0.260853f, -0.267207f, -0.248489f, 0.146474f, 0.359257f, + -0.427732f, -0.100652f, 0.192129f, 0.075572f, 0.916708f, 0.255747f, + 0.486384f, 0.127989f, -0.556449f, -0.484913f, 0.392298f, 0.045401f, + -0.839551f, -0.703619f, 0.069263f, -0.040720f, 0.542265f, 0.443739f, + 0.862552f, -0.021726f, 0.230858f, -0.261004f, -0.125697f, -0.106435f, + 0.002341f, 0.013904f, 0.011034f, 0.542296f, -0.284325f, 0.135736f, + 0.113882f, 0.040610f, -0.255485f, 0.224061f, -0.087140f, 0.127872f, + -0.002638f, 0.164889f, -0.335958f, -0.031166f, -0.393581f, 0.075455f, + 0.055995f, 0.087934f, -0.133859f, -0.342187f, 0.002492f, -0.340722f, + 0.058304f, 0.104165f, -0.142136f, -0.351111f, -0.158037f, -0.079924f, + -0.253209f, -0.092840f, -0.174646f, -0.202772f, -0.353438f, -0.031111f, + 0.076088f, -0.232091f, -0.070052f, 0.097595f, 0.063173f, -0.211195f, + 0.126478f, -0.178828f, 0.278723f, -0.070807f, -0.179783f, 0.034123f, + 0.035721f, -0.200431f, 0.170640f, 0.107933f, 0.226594f, -0.301499f, + -0.291096f, 0.228076f, -0.272951f, 0.002490f, -0.210707f, -0.128033f, + -0.194009f, -0.011347f, -0.256694f, -0.011841f, -0.005167f, -0.163203f, + -0.253796f, -0.198877f, -0.055827f, -0.882685f, -0.443471f, 0.349601f, + 0.749334f, -1.161845f, 0.505480f, 0.221733f, 0.210490f, -0.234984f, + 0.014183f, -0.510401f, 0.238692f, -0.134111f, 0.083844f, -0.478751f, + -0.088434f, 0.304063f, 0.150336f, -0.749682f, -0.081999f, 0.729739f, + 0.412508f, 0.132571f, 0.058306f, -0.047451f, -0.117435f, -0.445395f, + -0.005182f, -0.025757f, 0.175051f, -0.258194f, -0.150311f, -0.196533f, + -1.314316f, -0.428627f, 0.512451f, 0.045138f, -0.200925f, 0.081538f, + -0.346151f, -0.358197f, -0.422258f, -0.028542f, -0.383534f, -0.026163f, + -0.419858f, -0.154321f, 0.376970f, 0.094017f, 0.783520f, 0.110641f, + 0.077966f, -0.093064f, 0.160522f, -0.863041f, 0.086210f, 0.560764f, + 0.057032f, 0.159224f, 0.323068f, -0.173109f, 0.014042f, -0.126856f, + 
-0.128237f, -0.245273f, -0.317312f, -0.257597f, -0.181977f, 0.259485f, + -0.215834f, 0.062076f, -0.270596f, 0.271581f, -0.153486f, -0.247165f, + 0.079737f, -0.157049f, -0.027459f, -0.299397f, 0.136729f, -0.334192f, + -0.191722f, 0.145865f, -0.031324f, -0.307165f, -0.244923f, -0.228027f, + 0.063807f, 0.054965f, -0.005709f, -0.041977f, -0.276245f, 0.020003f, + 0.133323f, -0.145992f, -0.951030f, 0.414083f, -1.063323f, 0.137872f, + 0.104732f, -0.123728f, 0.542532f, 0.213654f, 0.542954f, 0.155619f, + 0.543072f, 0.399067f, 0.191402f, -0.102552f, -0.176734f, -0.136776f, + -0.012814f, -0.021298f, -0.802467f, -0.957481f, -0.238787f, -0.138482f, + 0.058331f, 0.126601f, 0.104420f, -0.148684f, 0.343218f, 0.093604f, + -0.055642f, -0.383918f, -0.045250f, -0.090480f, -0.155464f, 0.278299f, + 0.042791f, -0.029084f, -0.373861f, -0.073233f, -0.085172f, 0.186841f, + -0.070898f, -0.156415f, 0.112831f, -0.065931f, -0.353007f, 0.058453f, + -0.136982f, 0.233393f, 0.017240f, -0.018428f, 0.229104f, -0.371440f, + -0.262212f, 0.203075f, -0.263293f, 0.034413f, -0.299354f, 0.227269f, + 0.204977f, -0.118107f, -0.359832f, -0.068252f, 0.480105f, -0.214711f, + -0.614381f, 0.209048f, -0.456014f, -0.188819f, -0.220995f, -0.322104f, + -0.191457f, 0.420874f, -0.454919f, 0.023119f, 0.291700f, -0.532885f, + -0.032642f, 0.043271f, 0.133974f, 0.002399f, -0.179899f, -0.044158f, + -0.027078f, -0.350075f, 0.236766f, 0.346771f, -0.118534f, -0.421221f, + 0.019544f, 0.109349f, 0.141517f, 0.403561f, 0.409102f, 0.054555f, + -0.561751f, 0.577183f, -0.705156f, -0.231188f, -1.969772f, 0.172289f, + -0.048122f, 0.205671f, -0.667130f, -0.066870f, 0.202838f, -0.095538f, + -0.842651f, 0.254170f, 0.046256f, -0.271891f, -0.369254f, 0.492101f, + 0.001189f, -0.186525f, 0.188470f, -0.207072f, 0.030086f, -0.132904f, + 0.127001f, 0.116662f, -0.079246f, 0.227241f, -0.462178f, 0.446304f, + -1.660753f, 0.241832f, -0.288040f, 0.054663f, -0.435804f, 0.296782f, + -0.026421f, -0.115618f, 0.163416f, 0.834001f, 0.008019f, -0.014243f, + 0.524658f, 0.067894f, -0.253936f, -0.100657f, 1.285389f, -0.005952f, + 0.087134f, -0.088375f, -0.121866f, -0.171172f, 0.279463f, -0.598593f, + -0.727761f, 0.189831f, -0.822575f, -0.291141f, -0.012410f, -0.069999f, + 0.098842f, -0.218513f, 0.009494f, 0.100106f, -0.402884f, -0.299236f, + -0.345668f, -0.057739f, -0.213248f, -0.426661f, -0.360268f, -0.349860f, + -0.382177f, -0.357802f, -0.032030f, -0.110597f, -0.155442f, -0.418794f, + -0.012113f, -0.032962f, -0.450648f, 0.129060f, -0.135227f, -0.298593f, + 0.001435f, 0.278790f, -0.272945f, 0.162759f, -0.290208f, 0.058481f, + -0.490971f, 0.019630f, -0.210347f, 0.000520f, -0.340413f, 0.641562f, + 0.023104f, 0.194832f, -0.441894f, -0.253538f, -0.228332f, 0.423264f, + -1.094073f, -0.475657f, -0.238752f, 0.033910f, 0.440425f, 0.036320f, + 0.566989f, -0.065326f, -0.297939f, 0.406098f, 0.529561f, -0.113084f, + 0.141472f, -0.024462f, -0.179212f, 0.187801f, -0.235787f, -0.229624f, + 0.357791f, 0.061110f, -0.607788f, -1.713694f, -0.651041f, 1.734283f, + -0.334701f, 0.161687f, 0.010215f, 0.320708f, 0.169447f, 0.513558f, + 0.488340f, -0.619036f, -0.525441f, -1.144352f, -0.546154f, 0.669973f, + 0.327028f, -0.100539f, 0.012048f, -0.223013f, -0.239680f, 0.323035f, + 0.165950f, -0.155110f, 0.128664f, -0.157378f, -0.124490f, 0.291553f, + 0.055849f, -0.221664f, 0.077770f, -0.350658f, -0.181939f, 0.110230f, + -0.078219f, 0.007472f, -0.031620f, 0.007708f, -0.201794f, 0.017594f, + -0.027480f, 0.058884f, -0.369166f, -0.369770f, 0.181635f, -0.183318f, + -0.389184f, -0.256661f, 0.160107f, 0.037127f, 
-0.082573f, -0.095815f, + -0.322782f, 0.072528f, -0.348875f, 0.216247f, -0.161757f, -0.385502f, + -0.315738f, 0.020123f, -0.155609f, 0.114403f, -0.383232f, 0.629529f, + 0.066142f, 0.448392f, -0.389557f, -0.083315f, 0.829535f, -0.015531f, + -0.050728f, -0.325127f, 0.812992f, -0.196780f, 0.021060f, -0.952647f, + 0.006687f, -0.512715f, -0.066778f, 0.410067f, -0.116945f, -0.288283f, + 0.189334f, -0.083153f, 0.159980f, -0.068208f, 0.107358f, -0.154411f, + -0.068914f, 0.186816f, 0.032251f, 0.109242f, 0.134825f, 0.035101f, + -0.253175f, 0.157309f, -0.363597f, -0.138176f, -0.334141f, -0.172697f, + 0.045800f, -0.286057f, 0.173403f, -0.172444f, -0.117996f, -0.383848f, + -0.173303f, -0.258482f, -0.021404f, -0.017898f, -0.001970f, 0.003273f, + 0.056121f, 0.155046f, 0.044708f, -0.295609f, -0.211688f, -0.233229f, + -0.264980f, 0.145549f, 0.045323f, -0.027112f, 0.175638f, -0.207251f, + -0.055274f, 0.092706f, 0.086200f, -0.241340f, -0.147416f, 0.024510f, + -0.357194f, -0.181944f, -0.050104f, -0.079024f, -0.290473f, -0.169790f, + -0.277982f, -0.017781f, -0.004854f, -0.094132f, -0.348555f, 0.199291f, + -0.343989f, -0.319299f, -0.268935f, -0.021208f, 0.020938f, -0.090609f, + 0.006595f, -0.200790f, 0.171856f, -0.027766f, -0.032017f, -0.006745f, + 0.566426f, -0.096850f, 0.727633f, -0.408065f, -0.012436f, 0.005646f, + -0.305148f, -0.095075f, -0.391549f, -0.020378f, -0.236498f, -0.252773f, + -0.231385f, -0.203175f, 0.041903f, -0.373694f, 0.058239f, -0.101116f, + 0.183772f, 0.164523f, -0.099046f, -0.201272f, -0.394523f, -0.157517f, + 0.032079f, -0.381173f, -0.238496f, -0.037990f, -0.294553f, 0.141473f, + 0.100268f, -0.023806f, 0.004978f, 0.184916f, 0.142699f, -0.113240f, + -0.213364f, -0.160059f, -0.216263f, -0.406387f, -0.301140f, -0.406355f, + -0.113085f, -0.279699f, -0.267434f, 0.126263f, -0.260527f, -0.153904f, + -0.494653f, -0.355144f, 0.030549f, -0.216400f, -0.123363f, 0.189090f, + 0.219122f, 0.096677f, -0.202037f, -0.014489f, -0.137859f, -0.114184f, + -0.279423f, -0.270683f, +}; + +static const float av1_early_term_after_split_nn_bias_64_layer0[] = { + -0.491455f, 0.464538f, -0.005742f, -0.219951f, -0.073682f, 0.102027f, + 0.567071f, 0.441402f, 0.277521f, 0.314498f, -0.448199f, -0.065032f, + 0.488139f, -0.079632f, 0.000000f, 0.521555f, -0.151950f, -0.034616f, + 0.393438f, -0.072242f, -0.087343f, -0.571308f, 0.017372f, -0.126144f, + 0.372261f, -0.451537f, -0.140238f, -0.092377f, -0.074475f, -0.068879f, + -0.109614f, -0.164492f, +}; + +static const float av1_early_term_after_split_nn_weights_64_layer1[] = { + -0.373195f, -0.283141f, 0.416113f, 0.483659f, 0.230583f, 0.349197f, + -0.168582f, -0.813338f, -0.472369f, -0.173872f, 1.297845f, 0.339355f, + -0.828033f, 0.019617f, 0.118757f, -0.619360f, 0.282295f, -0.054116f, + -0.730596f, 0.068567f, -0.248707f, 0.461225f, 0.330224f, -0.287080f, + -0.458103f, 0.591852f, -0.008491f, 0.632119f, -0.007872f, 0.007869f, + -0.230698f, -0.011437f, +}; + +static const float av1_early_term_after_split_nn_bias_64_layer1[] = { + -0.55403697f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_64 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_64_layer0, + av1_early_term_after_split_nn_weights_64_layer1, + }, + { + av1_early_term_after_split_nn_bias_64_layer0, + av1_early_term_after_split_nn_bias_64_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_32_layer0[] = { + 0.026050f, -0.226531f, 0.308107f, -0.083744f, 0.201785f, 0.098562f, + 0.147595f, -0.495771f, -0.245741f, 0.201616f, -0.272070f, 
-0.579545f, + -0.127261f, -0.229588f, 0.250831f, -0.176929f, -0.031689f, 0.284718f, + 0.085845f, -0.285027f, 0.012304f, 0.382402f, -0.204591f, 0.272514f, + -0.065854f, -0.054228f, -0.231174f, -0.174504f, 0.258287f, 0.195689f, + 0.242530f, 0.023528f, -0.294242f, -0.272132f, 0.460180f, -0.731281f, + -0.208103f, 0.208204f, 0.348250f, 0.016328f, 0.043707f, -0.169551f, + 0.108521f, 0.226895f, -0.020471f, 0.102443f, 0.429640f, -0.252555f, + -0.218434f, -0.163665f, 0.175531f, 0.101588f, -0.135798f, -0.158102f, + 0.142565f, 0.128277f, 0.174985f, -0.100073f, 0.113967f, 0.223682f, + -0.145576f, -0.008443f, 0.112748f, -0.037845f, 0.076954f, -0.287137f, + -0.518185f, -0.106833f, 0.175359f, 0.031408f, 0.219069f, -0.294440f, + 0.007766f, 0.067754f, -0.049168f, -0.212368f, -0.261708f, 0.309252f, + 0.220859f, -0.274852f, -0.653157f, 0.083438f, -0.265386f, 0.174429f, + -0.116931f, -0.091594f, -0.244897f, -0.089015f, 0.274453f, 0.212890f, + 0.272053f, -0.425315f, -0.107726f, 0.294444f, -0.354629f, 0.104402f, + -0.307663f, 0.558430f, 0.140334f, -0.054831f, -0.449456f, 0.058274f, + -0.033768f, -0.354117f, -0.331618f, -0.411772f, 0.232064f, -0.079297f, + -0.638571f, 0.181823f, -0.039611f, 0.206310f, -0.659157f, -0.102930f, + -0.067303f, -0.176881f, -0.001038f, 0.091835f, 0.079739f, -0.121923f, + 0.211070f, 0.362719f, -0.154915f, -0.151876f, -0.165460f, 0.023469f, + -0.251036f, 0.210014f, -0.537125f, 0.156832f, -0.216987f, 0.062975f, + -0.198462f, 0.329123f, 0.125870f, 0.225830f, 0.086377f, -0.128773f, + -0.179673f, -0.074612f, 0.456645f, 0.021905f, -0.243140f, 0.059145f, + -0.273942f, -0.277822f, 0.154556f, -0.025459f, 0.227614f, -0.313076f, + 0.044705f, -0.019017f, 0.108999f, -0.020243f, -0.016373f, 0.560270f, + -0.064818f, 0.050880f, -0.218458f, 0.825699f, -0.534056f, -0.258253f, + 0.222073f, 0.013295f, 0.477870f, -0.386727f, 0.388509f, 0.004128f, + 0.451388f, -0.175788f, 0.264093f, -0.109812f, 0.358132f, 0.500992f, + -0.446933f, -0.222397f, 0.345834f, 0.370943f, -0.233115f, -0.047005f, + -0.111335f, -0.111586f, 0.026975f, -0.052191f, -0.111800f, -0.129782f, + 0.225132f, 0.102524f, 0.544557f, -0.111674f, -0.857884f, 0.133258f, + 0.310001f, 0.043829f, 0.104143f, 0.256493f, 0.242520f, -0.342082f, + 0.421447f, 0.124227f, 0.061542f, -0.090206f, 0.316681f, 0.353452f, + -0.918408f, -0.001903f, -0.052303f, -0.004816f, -0.446393f, -0.053038f, + 0.255725f, -0.126346f, 0.034095f, -0.240276f, -0.135918f, 0.095682f, + -0.147457f, -0.338216f, -0.200426f, 0.010265f, -0.243915f, -0.231375f, + -0.323924f, -0.014353f, 0.150252f, -0.264346f, 0.205303f, -0.194610f, + -0.282527f, 0.180555f, -0.000087f, 0.027240f, -0.000903f, -0.345877f, + -0.353274f, -0.311829f, 0.172985f, -0.111748f, -0.309380f, 0.108110f, + -0.260914f, -0.164990f, 0.183625f, -0.319692f, -0.096988f, 0.094147f, + -0.047062f, -0.080978f, 0.227387f, -0.000450f, -0.220159f, -0.211448f, + -0.020885f, -0.139646f, -0.086721f, 0.067928f, -0.033084f, -0.251996f, + 0.090317f, 0.086313f, -0.228420f, -0.111356f, -0.314304f, -0.223664f, + 0.188176f, -0.002360f, -0.029491f, -0.006000f, -0.075343f, 0.173699f, + -0.272800f, -0.238507f, -0.272071f, -0.015000f, -0.215305f, -0.192943f, + -0.038595f, 0.119537f, 0.260477f, -0.168014f, -0.172751f, 0.532861f, + -0.753250f, -0.017485f, -0.115541f, -0.109291f, -1.098943f, 0.418559f, + -0.532110f, 0.359323f, -0.254786f, 0.471316f, -0.545024f, 0.291912f, + -0.836939f, 0.443427f, -0.441709f, 0.168866f, -0.140372f, 0.546607f, + -0.315465f, 0.023328f, 0.137709f, -0.083492f, -0.049986f, -0.071302f, + -0.293680f, -0.105049f, 0.315317f, 
0.279569f, 0.220762f, 0.088161f, + -0.756456f, -0.074512f, 0.958318f, -0.332924f, -0.004906f, -0.629271f, + 0.212050f, 0.279123f, 0.311523f, -0.599580f, 0.516150f, 0.456952f, + 0.020255f, 0.247290f, -0.182670f, -0.335554f, 0.021203f, 0.131081f, + -0.208584f, 0.112530f, -0.198980f, 0.211583f, -0.101271f, -0.206453f, + -0.502688f, -0.294976f, -0.187019f, -0.114473f, 0.282050f, -0.165483f, + 0.094953f, -0.182578f, 0.055068f, 0.135605f, -0.266941f, -0.297556f, + 0.199181f, 0.015979f, -0.158659f, -0.226841f, 0.171306f, 0.013438f, + -0.286309f, -0.071753f, -0.170300f, -0.238188f, 0.093572f, -0.026230f, + -0.254502f, -0.297786f, -0.063480f, -0.300799f, -0.065644f, 0.074710f, + 0.248576f, -0.144425f, -0.113948f, -0.247297f, 0.276682f, 0.010963f, + -0.737786f, 0.026347f, 0.007830f, 0.753543f, 0.371904f, 0.305614f, + 0.105028f, 0.073530f, -0.119137f, 0.102352f, -0.080523f, 0.176366f, + -0.159457f, -0.339948f, 0.360131f, -0.007051f, -0.388378f, -0.101695f, + 0.663041f, -0.234486f, -0.142536f, -0.099931f, 0.041478f, 0.230425f, + 0.005743f, 0.154060f, 0.056233f, -0.080668f, -0.009754f, -0.194356f, + 0.185474f, -0.296474f, 0.192700f, 0.257767f, 0.348529f, 0.458265f, + 0.060276f, -0.130473f, 0.139889f, 0.310073f, -0.306869f, -0.272922f, + -0.259862f, 0.409207f, 0.431991f, -0.100357f, -0.050415f, -0.071830f, + -0.239665f, 0.153399f, 0.177192f, -0.611644f, -0.176114f, -0.022694f, + -0.033701f, -0.345842f, 0.015660f, 0.158931f, -0.097586f, 0.222001f, + 0.257887f, -0.171307f, -0.222607f, -0.245508f, -0.145742f, -0.096461f, + -0.010895f, 0.052815f, -0.265306f, -0.081059f, 0.219162f, -0.256084f, + -0.372676f, 0.148977f, 0.174831f, 0.086980f, 0.108518f, 0.074011f, + 0.038032f, -0.070856f, -0.109407f, 0.126174f, 0.022341f, -0.249786f, + -0.356164f, -0.202841f, -0.087437f, -0.133740f, 0.090956f, -0.017953f, + -0.028353f, 0.233621f, 0.109426f, 0.232798f, -0.104950f, -0.241798f, + -0.018995f, -0.167954f, 0.002473f, 0.060418f, -0.232717f, -0.195980f, + -0.283971f, -0.371881f, 0.219728f, 0.018072f, -0.166694f, -0.083301f, + -0.000616f, -0.212641f, -0.173158f, 0.222739f, -0.235302f, 0.237624f, + 0.222232f, -0.041235f, -0.342411f, 0.121194f, 0.211291f, -0.032237f, + -0.249401f, -0.291668f, 0.206055f, -0.148200f, 0.011824f, -0.272728f, + -0.194854f, 0.367175f, -0.257243f, 0.103433f, -0.231077f, 0.236734f, + 0.135733f, -0.362845f, 0.197147f, 0.242782f, -0.135289f, 0.123311f, + 0.259420f, -0.116278f, 0.127287f, 0.236789f, -0.097438f, 0.118073f, + 0.112796f, -0.035949f, 0.184408f, 0.200948f, -0.008859f, 0.195989f, + 0.161970f, -0.295320f, -0.330389f, 0.141034f, 0.066081f, -0.707857f, + 0.357037f, 0.149633f, 0.679877f, 0.548674f, 0.469076f, 0.194123f, + -0.209872f, -0.071764f, -0.126960f, 0.199420f, 0.327116f, -0.169053f, + -0.429156f, 0.443429f, -0.225530f, -0.130738f, -0.028351f, 0.644393f, + 0.049606f, -0.243602f, -0.409920f, 0.117028f, -0.258557f, 0.073865f, + -0.200454f, -0.139957f, -0.031314f, 0.162325f, 0.247221f, 0.071909f, + -0.336276f, 0.079922f, 0.192780f, -0.148882f, 0.133192f, -0.143177f, + -0.121327f, 0.126221f, -0.089521f, -0.181826f, 0.149923f, -0.280682f, + 0.391572f, 0.108990f, -0.445494f, -0.170787f, 0.225182f, 0.223313f, + -0.234828f, -0.071072f, -0.072673f, -0.093686f, 0.223892f, -0.049377f, + 0.057976f, 0.033558f, 0.068733f, -0.283353f, 0.217877f, 0.158093f, + -0.276761f, -0.097049f, -0.351913f, -0.383604f, 0.002863f, -0.474510f, + -0.096738f, 0.256940f, 0.234203f, -0.226667f, -0.260576f, -0.183403f, + -0.035578f, 0.141570f, 0.078764f, -0.028086f, 0.155800f, -0.251115f, + -0.286703f, -0.014739f, 
-0.072621f, -0.311506f, -0.048639f, 0.081621f, + 0.043057f, 0.068136f, -0.179903f, 0.143699f, -0.002571f, 0.239012f, + 0.197456f, 0.035745f, -0.311927f, 0.220320f, 0.102687f, -0.294105f, + 0.426740f, 0.209050f, 0.211907f, 0.083453f, 0.006578f, -0.143338f, + 0.003157f, 0.040295f, 0.234497f, 0.035344f, -0.163909f, 0.411115f, + 0.289453f, -0.075357f, -0.008884f, 0.469798f, -0.033304f, -0.153293f, + -0.229322f, -0.004162f, 0.113363f, 0.395381f, 0.067414f, -0.188966f, + -0.117424f, -0.166423f, 0.066839f, 0.595641f, -0.204782f, -0.451727f, + 0.198509f, -0.921583f, -0.246765f, -0.153411f, 0.046491f, 0.365906f, + 0.376710f, -0.017355f, -0.035232f, 0.138785f, -0.163918f, -0.283449f, + -0.094340f, 0.192127f, 0.154815f, 0.035787f, -0.029087f, 0.115649f, + -0.220133f, -0.452741f, 0.311667f, 0.157666f, 0.091401f, 0.236040f, + -0.168523f, 0.122176f, -0.219016f, -0.214856f, 0.172824f, -0.091810f, + 0.031520f, -0.857420f, 0.643446f, -0.017471f, 0.206082f, -0.933517f, + -0.020070f, -0.065091f, -0.117680f, -1.271870f, -0.069177f, -0.149409f, + 0.289970f, -0.889775f, -0.044741f, 0.232647f, -0.319416f, 0.073030f, + 0.278549f, 0.238782f, -0.202206f, 0.272540f, 0.201412f, 0.175574f, + -0.127971f, -0.253164f, -0.086352f, -0.005381f, 0.114714f, 0.505169f, + -0.175049f, -1.534280f, -0.320666f, -2.119298f, -0.023075f, -0.021259f, + -0.161019f, 0.344837f, 0.361958f, -0.097050f, 0.014375f, 0.267110f, + 0.341442f, -0.016688f, 0.073393f, 0.131500f, 0.246331f, 0.011059f, + 0.033597f, 0.014779f, -0.269366f, -0.504788f, 0.048651f, 0.295682f, + 0.237363f, 0.227484f, -0.235814f, -0.160530f, 0.182682f, -0.172999f, + -0.126630f, 0.168357f, -0.078729f, 0.052805f, 0.377021f, -0.004727f, + 0.230415f, -0.876673f, 0.458457f, 0.099401f, -0.019616f, 0.611982f, + -0.231508f, -0.070894f, -0.056142f, 0.548969f, -0.376599f, -0.600428f, + 0.241930f, -0.592893f, 0.189371f, 0.488651f, -0.092446f, -0.272569f, + 0.251643f, 0.315945f, -0.301468f, 0.112961f, 0.052119f, -0.066076f, + -0.082249f, 0.252805f, -0.195539f, 0.150386f, -0.865534f, 0.673447f, + 0.030177f, -0.438528f, -1.006174f, 0.575176f, -0.271656f, 0.035835f, + -1.056916f, 0.495267f, -0.092428f, -0.109511f, -0.192359f, 0.166669f, + -0.624326f, -0.000354f, -0.089075f, 0.176279f, -0.289347f, 0.021346f, + 0.020375f, 0.255282f, -0.045588f, 0.173675f, 0.100957f, -0.294373f, + 0.049303f, -0.134132f, -0.255731f, -0.025559f, -0.307463f, -0.205100f, + 0.079024f, 0.101113f, 0.135742f, -0.348869f, -0.026759f, -0.134155f, + -0.179275f, -0.054297f, -0.054948f, 0.029351f, 0.190560f, 0.102476f, + -0.025785f, 0.169442f, -0.271303f, 0.200667f, 0.099063f, 0.074767f, + -0.326533f, 0.044426f, -0.290251f, -0.082443f, -0.164482f, -0.349412f, + 0.045109f, -0.157330f, 0.165935f, 0.012672f, -0.059818f, 0.399140f, + -0.316620f, 0.386638f, -0.285399f, -0.296777f, -0.200473f, -0.144232f, + 0.251851f, -0.203768f, 0.001071f, -0.179063f, 0.248952f, -0.143029f, + 0.010423f, -0.030293f, -0.046786f, -0.196195f, -0.016845f, 0.295023f, + 0.322825f, 0.133683f, 0.017388f, 0.142467f, 0.221320f, 0.004059f, + -0.115770f, 0.143363f, 0.137972f, -0.272584f, 0.489366f, -0.091828f, + -0.014703f, 0.082332f, -0.476226f, -0.202859f, 0.356094f, -0.283049f, + 0.218086f, 0.202015f, 0.201724f, 0.012617f, 0.050720f, 0.255695f, + 0.244653f, 0.111296f, -0.151450f, -0.056210f, -0.757348f, 0.441724f, + -0.022455f, -0.244662f, 0.296205f, -0.421883f, -0.217386f, -0.254301f, + 0.409105f, -0.031309f, 0.050147f, -0.337170f, -0.106620f, -0.606455f, + 0.308024f, 0.298144f, 0.363993f, 0.704870f, -0.047292f, 0.166901f, + 0.105991f, -0.536757f, 
-0.424031f, -0.226034f, 0.213635f, -0.526754f, + 0.310990f, -0.116038f, 0.007775f, 0.538330f, -0.177912f, 0.445357f, + -0.290365f, 0.451169f, 0.030931f, 0.033388f, 0.209905f, -0.244492f, + -0.097792f, -0.246042f, 0.132047f, 0.032576f, 0.115516f, 0.022890f, + 0.093508f, -0.071840f, 0.362948f, -0.135245f, 0.659911f, -0.321413f, + 0.193118f, -0.795001f, -0.218311f, 0.024862f, 0.206172f, -0.832878f, + -0.255670f, 0.343402f, -0.275211f, -0.898363f, -0.025172f, 0.158565f, + 0.171347f, -0.127518f, -0.215156f, -0.159198f, 0.250355f, -0.132452f, + 0.061254f, -0.097544f, -0.223246f, 0.013183f, 0.239468f, 0.259017f, + -0.217739f, -0.032263f, 0.123755f, -0.701777f, 0.150049f, -0.555293f, + 0.062430f, -0.260304f, 0.494894f, -0.168702f, -0.134829f, -0.113989f, + 0.150092f, -0.060248f, 0.115711f, -0.277202f, 0.499811f, 0.417116f, + 0.191081f, -0.376432f, -0.321092f, 0.033992f, 0.057193f, 0.127077f, + -0.009042f, 0.014443f, 0.142808f, -0.124349f, 0.213087f, -0.381686f, + 0.129726f, -0.038396f, +}; + +static const float av1_early_term_after_split_nn_bias_32_layer0[] = { + -0.107171f, 0.060848f, -0.069480f, -0.121982f, 0.037637f, -0.291839f, + 0.102257f, -0.065889f, -0.032452f, 0.034171f, -0.073984f, -0.005236f, + 0.218820f, 0.132123f, -0.089621f, -0.067679f, 0.049368f, 0.329444f, + -0.184729f, 0.031702f, 0.009735f, -0.039964f, -0.018024f, -0.073031f, + -0.030166f, -0.191037f, -0.074862f, -0.076548f, 0.076537f, 0.216609f, + -0.078358f, -0.007740f, +}; + +static const float av1_early_term_after_split_nn_weights_32_layer1[] = { + 0.047869f, -0.231773f, -0.185663f, 0.460676f, -0.208182f, 0.590555f, + -0.622627f, 0.279377f, 0.351681f, 0.633504f, 1.069884f, 0.332449f, + -0.457703f, -0.435817f, -0.028853f, 0.327490f, -0.282469f, -0.975792f, + -0.062975f, -0.147187f, 0.348340f, -1.207116f, 0.516159f, -1.509626f, + -0.805072f, 0.522999f, 0.143671f, 0.304246f, -0.360720f, -0.612472f, + 0.260045f, -0.223243f, +}; + +static const float av1_early_term_after_split_nn_bias_32_layer1[] = { + -0.07571174f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_32 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_32_layer0, + av1_early_term_after_split_nn_weights_32_layer1, + }, + { + av1_early_term_after_split_nn_bias_32_layer0, + av1_early_term_after_split_nn_bias_32_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_16_layer0[] = { + -0.113798f, 0.053357f, -0.037947f, -0.477171f, 0.276517f, -0.349252f, + -0.177284f, 0.189597f, 0.141744f, 0.230207f, -0.328104f, 0.074328f, + 0.247717f, 0.233533f, 0.145167f, 0.018029f, -0.398725f, -0.226199f, + -0.309724f, 0.125279f, 0.194759f, 0.025531f, 0.349714f, -0.273944f, + 0.186871f, 0.181735f, -0.520614f, -0.264076f, 0.308207f, 0.157438f, + -0.137791f, -0.054582f, 0.125879f, 0.796218f, -0.897562f, 0.885439f, + 0.381640f, 0.106625f, -2.027456f, 0.000874f, 0.179581f, 0.013287f, + -2.329439f, -0.163169f, -0.136191f, 0.320108f, -2.318779f, -0.196722f, + -0.295721f, 0.203658f, -0.182275f, 0.615941f, 0.015762f, 0.257181f, + -0.115297f, 0.295774f, -0.026144f, -0.022686f, -0.219423f, -0.042861f, + 0.207647f, -0.057791f, 0.201671f, -0.169569f, 0.291492f, -0.994991f, + 0.137473f, 0.230948f, 0.505626f, -1.065860f, 0.275225f, -0.250861f, + 0.519466f, -1.217242f, -0.087384f, 0.053441f, 0.030729f, -1.702304f, + -0.034635f, 0.010177f, -0.035422f, -0.749979f, 0.355499f, 0.408166f, + -0.086883f, 0.017203f, 0.195706f, -0.218056f, -0.029153f, 0.367335f, + -0.061732f, -0.241068f, 0.078496f, -0.370346f, -0.124223f, 
-0.172708f, + 0.037971f, 0.038875f, -0.282489f, -0.266323f, -0.210864f, 0.214714f, + 0.234695f, -0.045625f, 0.015357f, -0.007464f, -0.362003f, -0.113465f, + 0.145141f, 0.238470f, -0.202664f, -0.286587f, -0.347112f, 0.054501f, + -0.190290f, -0.283256f, 0.062179f, 0.041165f, -0.006935f, -0.220351f, + -0.088800f, 0.220924f, -0.200982f, 0.058493f, -0.225175f, 0.057175f, + -0.618187f, 0.761023f, -0.743774f, -0.500599f, -0.584999f, 1.545211f, + 0.123055f, -0.106848f, -0.353057f, 1.552187f, 0.174104f, 0.068060f, + -0.449859f, 1.254299f, -0.161716f, -0.060630f, -0.230721f, 0.165976f, + -0.101582f, -0.422415f, 0.110384f, -0.130098f, 0.104428f, 0.083518f, + 0.031626f, 0.083048f, 0.158877f, 0.173340f, 0.063962f, 0.427845f, + 0.663268f, 0.376996f, 0.146435f, -0.091329f, 0.443447f, 0.518432f, + -0.182777f, -0.091313f, 0.331229f, 0.532604f, -0.187001f, 0.054774f, + 0.298068f, 0.502295f, -0.362378f, 0.054283f, 0.292806f, 0.168901f, + -0.214787f, 0.025637f, 0.458009f, -0.322714f, -0.264059f, 0.140313f, + -0.102696f, -0.431208f, -0.134450f, -0.545415f, 0.253851f, -0.009061f, + -0.050681f, 0.108681f, 0.043272f, -1.073133f, 0.206410f, 0.469576f, + 0.291494f, -2.021244f, -0.001183f, -0.067542f, 0.364907f, -2.470543f, + 0.049147f, -0.018868f, 0.658500f, -2.531048f, 0.275433f, -0.034224f, + -0.171386f, 0.096369f, 0.728069f, 0.272332f, 0.222255f, -0.030426f, + 0.026994f, 0.208928f, -0.173943f, -0.227581f, -0.214798f, 0.079341f, + 0.032344f, -0.253575f, -0.044353f, -0.239265f, -0.055852f, -0.162582f, + -0.086592f, 0.066487f, 0.337353f, -0.168704f, 0.015702f, 0.022607f, + 0.286647f, 0.218106f, 0.193319f, -0.358714f, 0.030796f, 0.007646f, + -0.045617f, 0.165007f, -0.284641f, -0.291812f, 0.207544f, 0.082823f, + -0.141907f, -0.331336f, -0.052908f, 0.120716f, 0.202521f, 0.232782f, + -0.348141f, -0.017332f, 1.191126f, -0.391987f, -0.154537f, -0.206551f, + -2.378690f, 0.057918f, -0.328183f, 2.151556f, 0.238803f, 0.164880f, + -0.480039f, 1.616200f, 0.260243f, 0.083704f, -0.174461f, 1.804634f, + 0.194810f, 0.223837f, 0.550107f, -0.068171f, -0.293435f, -0.186770f, + -0.364846f, 0.127181f, 0.105556f, -0.016202f, 0.278403f, -0.344995f, + -0.009761f, -0.082555f, 0.046731f, -0.301452f, 0.604259f, 0.055895f, + 0.049862f, 0.314249f, -0.305811f, -0.112937f, 0.658787f, -0.549288f, + -0.307567f, -0.460650f, -0.840643f, 0.082576f, 0.373711f, 0.138318f, + 0.336901f, 0.284984f, -0.281400f, 0.408210f, -0.449858f, 0.461054f, + 0.227629f, -0.131705f, 0.301769f, -0.278540f, 0.189290f, -0.269041f, + 0.111350f, -0.300257f, 0.436858f, -0.265920f, -0.211938f, 0.272631f, + 0.206291f, 0.253273f, -0.229776f, -0.031112f, -0.171183f, -0.109676f, + -0.202390f, -0.068857f, 0.182125f, -0.140523f, -0.308742f, -0.045840f, + 0.256545f, -0.262405f, 0.225951f, -0.287463f, -0.189203f, -0.055552f, + -0.052448f, -0.242839f, -0.278877f, 0.140920f, -0.175755f, 0.215402f, + -0.248841f, -0.264080f, -0.178303f, 0.147777f, 0.049460f, -0.279877f, + -0.539725f, -0.004622f, 0.182874f, 0.338814f, 0.265974f, 0.249851f, + -0.141154f, 0.157228f, -0.090972f, 0.179444f, 0.305255f, 0.127788f, + 0.123270f, 0.355320f, 0.076797f, 0.263495f, 0.235965f, -0.133816f, + 0.243624f, 0.227062f, -0.213629f, 0.002075f, 0.061203f, -0.077820f, + -0.008807f, -0.247324f, -0.051464f, -0.191894f, -0.238713f, -0.389526f, + -0.274248f, 0.053950f, -0.225750f, -0.367097f, -0.122391f, 0.181212f, + -0.411824f, -0.084241f, -0.302288f, 0.077860f, -0.187443f, -0.300262f, + 0.083156f, -0.392461f, -0.332320f, -0.346474f, 0.140658f, -0.283656f, + 0.120714f, -0.056577f, -0.280968f, 0.017795f, -0.024686f, 
0.073113f, + -0.346637f, 0.082567f, -0.036556f, -0.369730f, 0.081225f, -0.005211f, + 0.144886f, -0.003544f, 0.178307f, -0.366035f, -0.063887f, -0.191767f, + 0.105835f, -0.273978f, -0.266532f, -0.023984f, 0.039166f, 0.065848f, + -0.026802f, -0.268923f, 0.189659f, 0.086300f, 0.030718f, 0.216565f, + -0.130025f, -0.215687f, 0.146341f, -0.286438f, -0.394226f, -0.181509f, + -0.005612f, 0.186040f, 0.133491f, 0.032096f, -0.261609f, 0.074007f, + -0.042929f, -0.234479f, 0.189704f, 0.088395f, -0.003671f, -0.125055f, + -0.252418f, -0.086387f, 0.111197f, -0.297071f, -0.018793f, -0.031902f, + -0.333191f, -0.186279f, 0.039868f, 0.091419f, -0.264438f, -0.216150f, + -0.212550f, 0.203412f, -0.113028f, -0.197169f, -0.346771f, 0.086066f, + 0.091443f, -0.128507f, -0.007281f, -0.118389f, 0.003370f, -0.338661f, + 0.026739f, -0.063571f, -0.281567f, -0.166824f, 0.167455f, 0.216173f, + 0.199163f, 0.256314f, -0.222679f, 0.040282f, -0.154808f, -0.133943f, + -0.270163f, -0.357398f, 0.260373f, 0.176950f, -0.125162f, -0.085050f, + 0.226376f, -0.124585f, -0.324804f, 0.035536f, -0.133600f, 0.173450f, + 0.068107f, -0.337442f, 0.169629f, 0.047223f, 0.057878f, 0.055555f, + -0.317449f, -0.103768f, 0.080899f, -0.194759f, -1.137593f, 0.508999f, + 0.045372f, 1.746454f, 1.250347f, -0.342930f, -0.127821f, -0.220175f, + -0.417649f, -0.480595f, 0.071902f, 0.050231f, -0.562554f, -0.677866f, + -0.121416f, -0.247558f, -0.483876f, -0.504157f, 1.731953f, 0.572936f, + 0.047325f, 0.050619f, 0.112611f, -0.035393f, 0.052585f, -0.071076f, + -0.015798f, -0.050228f, -0.142875f, 0.189329f, 0.048833f, 0.503633f, + 0.249588f, 0.175492f, -0.137664f, -0.018533f, 0.288453f, -0.025644f, + 0.079131f, 0.195096f, -0.154039f, -0.104220f, -0.224072f, 0.095946f, + -0.208424f, 0.214745f, 0.056468f, 0.182603f, 0.341784f, -0.134664f, + -0.194050f, 0.058532f, -0.107336f, -0.087783f, -0.238795f, -0.387212f, + 0.049055f, -0.127417f, -0.299919f, -0.094371f, -0.011735f, -0.264753f, + 0.407375f, -0.462654f, -0.609488f, 0.027742f, -0.985512f, -0.109154f, + -0.423276f, 2.347960f, 0.129240f, 0.187610f, -0.057081f, 2.424892f, + 0.087666f, 0.106716f, -0.039379f, 2.764866f, 0.113309f, 0.028196f, + -0.582789f, 0.335385f, -0.538029f, -0.477337f, -0.114207f, 0.178829f, + 0.006276f, 0.123179f, 0.095101f, 0.139898f, -0.372074f, -0.111010f, + 0.136330f, 0.272900f, 0.126737f, -0.097808f, -0.363697f, 0.108665f, + -0.227749f, -0.083421f, 1.714677f, 0.451943f, 0.107931f, -0.392281f, + 1.615846f, 0.022307f, -0.247011f, 0.257703f, 1.039134f, 0.537789f, + 0.022177f, -0.271532f, 0.351350f, -0.399205f, -0.240534f, -0.315399f, + 0.026928f, -0.005618f, 0.053179f, -0.010277f, 0.000501f, 0.040896f, + -0.109160f, 0.018282f, 0.003887f, 0.199599f, 0.095349f, -0.337284f, + 0.169929f, -0.109409f, -0.166983f, 0.059908f, -0.226574f, -0.120114f, + 0.077329f, -0.333133f, -0.220936f, 0.114309f, -0.233965f, -0.281551f, + 0.042948f, 0.100940f, 0.116037f, -0.313122f, 0.215149f, -0.309057f, + -0.341052f, -0.294417f, -0.179722f, 0.010795f, 0.192053f, -0.275261f, + -0.033077f, 0.117348f, 0.090206f, 0.781573f, 0.602456f, -0.220296f, + 0.172159f, 0.758513f, 0.157910f, -0.217897f, -0.372659f, 0.031935f, + 0.791463f, 0.267195f, 0.931593f, -0.057349f, 0.405512f, -0.058512f, + -0.641663f, -0.076592f, 0.550227f, -0.024094f, 0.048218f, -0.289971f, + 0.180940f, 0.167533f, 0.052711f, -0.360726f, 0.019210f, -0.488879f, + 0.380498f, 0.151608f, -0.276895f, -0.596554f, 0.106076f, -0.245833f, + -0.048783f, 0.073823f, 0.098780f, 0.000211f, 0.113958f, -0.068964f, + -0.265533f, -0.185457f, 0.175586f, -0.163621f, -0.204919f, 
0.145802f, + -0.163421f, 0.129576f, -0.153486f, -0.105573f, 0.067289f, -0.213120f, + -0.286103f, 0.249543f, -0.044970f, -0.170464f, -0.105501f, -0.094765f, + -0.050734f, -0.369468f, 0.180020f, -0.363328f, -0.151654f, -0.262550f, + -0.424503f, 0.829032f, -0.559452f, 0.506837f, 0.143823f, 0.276660f, + -1.808608f, -0.259517f, -0.053945f, 0.035676f, -1.842195f, -0.065960f, + -0.069285f, 0.462022f, -2.319453f, -0.370299f, 0.183329f, -0.146412f, + -0.563875f, 0.305068f, 0.480904f, 0.044319f, -0.016098f, 0.168516f, + 0.114874f, -0.097621f, -0.030373f, 0.177700f, 0.181591f, -0.146003f, + -0.330853f, -0.259200f, 0.779319f, -1.517524f, 0.178781f, 0.135451f, + 0.088784f, -2.076089f, 0.628717f, -0.048685f, 0.281327f, -2.341596f, + 0.422171f, 0.006135f, 0.367096f, -1.663118f, 0.365253f, -0.072884f, + -0.197620f, -0.688634f, 0.477354f, 0.395841f, -0.098505f, 0.208709f, + -0.027523f, 0.127119f, 0.106274f, 0.114424f, -0.122877f, -0.087245f, + 0.086923f, -0.527398f, -0.342062f, -0.764662f, 0.713094f, -0.626453f, + -0.081454f, -0.087683f, 0.885047f, 0.323440f, -0.018579f, -0.217166f, + 1.617984f, -0.159038f, 0.265991f, -0.390313f, 1.933182f, -0.032431f, + -0.057513f, -0.300841f, 0.461248f, -0.072147f, -0.287052f, -0.078056f, + 0.011734f, 0.044013f, 0.177174f, 0.093400f, 0.028819f, 0.193686f, + -0.224853f, 0.268321f, -0.075059f, 0.074526f, -0.015618f, 0.165615f, + -0.276780f, -0.063908f, -0.369264f, -0.171497f, -0.173624f, -0.130743f, + -0.224625f, -0.124980f, -0.104482f, 0.076864f, -0.009631f, -0.164682f, + 0.150480f, -0.111880f, -0.260425f, 0.086234f, -0.176936f, -0.136771f, + -0.168867f, -0.405626f, -0.288716f, -0.128950f, -0.207327f, 0.015581f, + -0.109061f, -0.098970f, 0.090792f, -0.109623f, 0.349851f, 0.266341f, + -0.088602f, -0.108071f, 0.082519f, 0.472650f, -1.838758f, 0.456694f, + 0.119927f, 0.461077f, -2.860022f, 0.231495f, 0.235771f, 0.256424f, + -1.938516f, -0.188202f, -0.000832f, -0.518206f, 0.194644f, 0.505510f, + 0.615657f, 0.193760f, 0.224600f, 0.265732f, -0.121553f, -0.354597f, + -0.242414f, -0.276639f, -0.057591f, 0.026369f, -0.261148f, -0.356155f, + -0.149178f, -0.353566f, -0.340835f, -0.141776f, 0.076535f, 0.221299f, + -0.108857f, -0.156514f, 0.050901f, 0.058541f, -0.077141f, 0.071515f, + -0.333283f, -0.181489f, -0.212900f, -0.224698f, -0.174693f, -0.178665f, + -0.143374f, -0.091811f, 0.165161f, 0.060156f, -0.086103f, -0.039031f, + -0.377759f, -0.370533f, 0.074431f, 0.064192f, 0.186576f, 0.447858f, + -0.082260f, -0.020268f, -0.123089f, -0.402017f, 0.080500f, 0.176286f, + 2.850013f, 0.019385f, -0.225361f, -0.235315f, 1.654694f, -0.073978f, + -0.341412f, -1.187575f, 2.815900f, -0.228063f, -0.174547f, 0.623825f, + -0.010676f, 0.157189f, 0.111879f, -0.198965f, 0.051851f, 0.158396f, + 0.045194f, 0.293531f, -0.246714f, -0.351493f, 0.026954f, 0.076233f, + 0.420367f, 0.168154f, -0.131450f, 0.134487f, -0.288851f, -0.134553f, + 0.014902f, 0.756381f, 0.277713f, 0.190080f, -0.020869f, 1.446672f, + 0.029792f, -0.025927f, 0.060640f, 0.559864f, 0.422229f, 0.198459f, + 0.036167f, 0.029432f, 0.001882f, 0.038480f, -0.160528f, -0.288855f, + -0.310886f, 0.291296f, 0.190558f, -0.182816f, -0.002252f, 0.073101f, + -0.172245f, -0.305980f, 0.112492f, -0.422839f, -0.295999f, -0.078160f, + -0.173405f, -0.032819f, 0.373774f, -0.715223f, 0.018911f, 0.131753f, + -0.237364f, -0.128499f, -0.228406f, 0.341619f, 0.343552f, -0.521581f, + -0.263790f, 0.362502f, -0.018450f, 0.054233f, 0.183068f, 0.382772f, + 0.188811f, -0.627287f, 0.040399f, -0.487338f, -0.192591f, 0.247426f, + 0.154372f, -0.483994f, +}; + +static const 
float av1_early_term_after_split_nn_bias_16_layer0[] = { + -0.173976f, 0.305495f, 0.250981f, -0.067127f, -0.313100f, 0.242464f, + 0.315196f, -0.056052f, -0.241227f, -0.253308f, -0.002697f, 0.003687f, + -0.124421f, -0.090383f, -0.070366f, -0.064074f, -0.056115f, 0.123313f, + -0.239698f, -0.182082f, -0.065296f, 0.021503f, -0.036787f, 0.311861f, + 0.118135f, -0.320456f, -0.110719f, 0.220692f, -0.071727f, -0.088226f, + -0.110874f, -0.111671f, +}; + +static const float av1_early_term_after_split_nn_weights_16_layer1[] = { + -0.338573f, 0.398159f, 0.314774f, -0.037448f, -0.271950f, -0.774991f, + 0.950901f, -0.225380f, -1.841906f, -0.350379f, -0.079350f, 0.383148f, + -0.183676f, -0.313132f, -0.340820f, -0.309401f, -1.050540f, -0.432267f, + -0.657195f, 0.927632f, -0.040150f, 0.578920f, 0.212301f, 0.292495f, + 0.563590f, -0.205735f, 0.195877f, 0.582122f, -0.217860f, 1.613379f, + 0.313278f, -0.555802f, +}; + +static const float av1_early_term_after_split_nn_bias_16_layer1[] = { + 0.16553f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_16 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_16_layer0, + av1_early_term_after_split_nn_weights_16_layer1, + }, + { + av1_early_term_after_split_nn_bias_16_layer0, + av1_early_term_after_split_nn_bias_16_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_8_layer0[] = { + -0.719472f, 0.305806f, 0.855829f, 0.100094f, 0.412517f, 1.254673f, + 1.552105f, -5.890773f, -0.089957f, -0.016736f, 1.418074f, -5.393506f, + -0.028214f, 0.117758f, 1.479209f, -5.299794f, 0.171585f, -0.084182f, + -0.162105f, 0.388577f, -0.044319f, -0.025861f, 0.251782f, -0.181462f, + -0.101545f, -0.079999f, -0.033014f, -0.191627f, -0.032802f, -0.053404f, + 0.038038f, -0.119492f, 0.049104f, -0.344384f, -0.354513f, 0.036977f, + 0.017513f, -0.004025f, -0.163212f, -0.261999f, 0.146575f, 0.207541f, + 0.130365f, -0.252127f, 0.097419f, -0.231057f, -0.309421f, 0.347866f, + -0.064670f, -0.283171f, -0.244193f, -0.193323f, -0.226954f, -0.276194f, + -0.233553f, 0.156354f, -0.184009f, 0.344289f, -0.308058f, -0.205202f, + -0.325068f, 0.183820f, -0.361667f, -0.069559f, -0.121834f, -0.038357f, + -0.210043f, -0.266129f, 0.003188f, 0.074902f, -0.328843f, 0.293679f, + -0.234698f, -0.428268f, -0.308772f, -0.136538f, -0.008384f, -0.078227f, + 0.166074f, -0.262899f, 0.102114f, -0.323420f, 0.057064f, -0.203318f, + -0.397413f, -0.317324f, -0.307093f, 0.020574f, -0.188627f, 0.132529f, + 0.118992f, -0.487387f, -0.282975f, 0.573231f, -0.266071f, 0.125140f, + -0.970034f, 1.424008f, -0.487366f, -0.196415f, 3.680273f, -0.008407f, + 0.081109f, -0.187479f, 3.876021f, 0.159168f, 0.111721f, -0.337423f, + 3.901760f, 0.261268f, -0.245555f, -0.187632f, -0.324298f, 0.167234f, + 0.170986f, -0.473055f, 0.087016f, -0.003469f, 0.051035f, 0.251794f, + 0.153549f, 0.217609f, -0.326870f, -0.175511f, 0.637341f, -0.694837f, + -0.873487f, -0.186614f, -1.089884f, -0.607316f, -0.523519f, 5.256331f, + 0.071414f, 0.215265f, -0.835999f, 5.735746f, 0.300101f, 0.089626f, + -0.450261f, 5.608051f, 0.190491f, 0.110220f, -0.595360f, -0.446324f, + 0.311380f, 0.268812f, -0.339656f, -0.008708f, 0.011111f, -0.027557f, + 0.171534f, 0.000676f, 0.227232f, 0.033993f, 0.146684f, 0.094817f, + -0.175381f, -0.211927f, -0.362471f, 0.168834f, 0.264149f, -0.350538f, + -0.463249f, -0.288105f, 0.347155f, 0.183231f, -0.229732f, -0.252202f, + -0.218074f, -0.008769f, -0.156103f, 0.181233f, -0.354736f, 0.263270f, + -0.106636f, 0.081057f, 0.060634f, -0.046887f, 0.050468f, 0.071259f, + 
0.221287f, 0.199071f, -0.180185f, -0.406902f, -0.239351f, -0.034957f, + 0.369140f, 0.864600f, 0.233798f, 0.423612f, -0.468918f, 0.976987f, + 0.691198f, -1.597908f, 0.102926f, 0.305546f, 0.391196f, -3.909059f, + 0.333635f, 0.311561f, 0.738886f, -4.002001f, 0.236394f, -0.233141f, + 0.263342f, 0.679898f, 0.136233f, 0.254743f, -0.367571f, 0.066412f, + 0.001606f, -0.059542f, 0.051726f, -0.347145f, -0.045501f, -0.313847f, + -0.021952f, 1.386316f, -0.579139f, -1.275844f, -0.003493f, -1.716577f, + 0.250209f, 0.192086f, 4.177055f, 0.351835f, 0.338177f, 0.140163f, + 4.099592f, 0.321866f, -0.128153f, -0.360414f, 4.350767f, 0.025943f, + -0.116740f, -0.664107f, -0.064558f, -0.039553f, -0.208186f, -0.678774f, + 0.149441f, -0.019823f, 0.012759f, 0.404442f, -0.108881f, 0.067974f, + -0.188278f, 0.136327f, 0.109927f, -0.179270f, -0.272342f, 0.018064f, + -0.304216f, -0.469470f, 0.109310f, -0.326214f, 0.061909f, -0.278997f, + -0.352329f, -0.333770f, -0.186522f, -0.328567f, -0.206211f, -0.008804f, + 0.042441f, -0.126699f, -0.420399f, -0.033842f, 0.016773f, -0.273789f, + 0.081928f, -0.191552f, -0.179533f, -0.263070f, -0.471807f, 0.062601f, + -0.232576f, 0.082955f, -0.490080f, 0.073820f, -0.090384f, 0.035781f, + -0.158880f, -0.506793f, -0.069132f, 0.047602f, -0.349640f, -0.058389f, + -0.017387f, -0.194636f, -0.457227f, -0.143105f, 0.222045f, -0.548909f, + -0.131561f, 0.247196f, -0.207923f, 0.133056f, -0.509854f, -0.193685f, + -0.181327f, -0.242442f, 0.091821f, 0.114430f, -0.375233f, -0.015254f, + -0.336632f, -0.060279f, -0.169169f, -0.429914f, -0.036563f, -0.400560f, + -0.076332f, -0.186232f, -0.268491f, 0.075561f, -0.389082f, -0.077435f, + 0.352562f, -0.020086f, -0.338181f, -0.404629f, 0.254983f, 0.150477f, + -0.265903f, 0.003341f, 0.099969f, -0.211964f, -0.129372f, -0.166366f, + 0.327712f, -0.276234f, 0.140675f, -0.433677f, -0.163050f, -0.143578f, + -0.397840f, -0.422130f, -0.293835f, -0.075362f, -0.468375f, 1.021238f, + 1.394155f, -0.922486f, -1.350222f, 2.030201f, 0.057717f, 0.227650f, + -0.193179f, 0.037224f, 0.065555f, 0.020558f, -0.059205f, -0.023690f, + -0.008718f, 0.095976f, -0.549587f, -0.321164f, -0.243728f, 1.344381f, + -1.254107f, 0.294244f, -0.154737f, -0.152597f, 0.342419f, 0.301883f, + 0.069866f, -0.327766f, 0.209323f, -0.364913f, -0.005530f, -0.558972f, + 0.057684f, -0.309357f, -0.283325f, -0.278445f, -0.420115f, -0.418457f, + -0.391481f, -0.418460f, -0.003897f, -0.023744f, -0.312330f, -0.366213f, + 0.269628f, -0.274877f, -0.189988f, -0.419555f, -0.034033f, 0.192874f, + -0.135487f, -0.326108f, -0.039019f, 0.185029f, -0.264883f, -0.563447f, + -0.163532f, -0.447652f, -0.141851f, 0.001714f, -0.193184f, 0.032609f, + -0.112883f, 0.074599f, 0.490665f, 0.434764f, 0.021652f, -0.219618f, + 0.743267f, 0.147195f, -0.303479f, -0.097674f, 0.195813f, 0.704007f, + -1.290851f, 0.119701f, 0.224065f, 0.260246f, -0.580657f, -0.096201f, + -0.333214f, -0.586689f, 0.567178f, 0.157340f, -0.043184f, 0.194358f, + -0.026506f, -0.339894f, -0.571803f, -0.234828f, 0.147054f, -0.564178f, + -0.156933f, -0.366055f, -0.691687f, -0.187501f, 0.215834f, -0.346106f, + -0.256892f, 0.110915f, -0.337464f, -0.341474f, -0.216113f, 0.249445f, + -0.070175f, -0.412141f, 0.153458f, -0.081280f, 0.164669f, -0.356396f, + -0.294971f, -0.165121f, -0.133585f, -0.071467f, 0.295147f, -0.253233f, + -0.213833f, -0.343416f, -0.474344f, -0.304000f, -0.341379f, -0.331456f, + -0.393952f, -0.508004f, -0.569518f, -0.509864f, 0.121961f, 0.011957f, + 0.000498f, -0.201969f, -0.407195f, -0.414375f, -0.295846f, 0.247492f, + 0.124249f, -0.550804f, -0.420397f, 
-0.123462f, 0.333292f, -0.240230f, + -0.025604f, 0.337536f, -0.295006f, -0.272614f, -0.496850f, -0.278521f, + 0.234591f, -0.052775f, -0.014052f, -0.260078f, -0.279128f, -0.036385f, + 0.008714f, -0.064018f, -0.124873f, -0.334014f, +}; + +static const float av1_early_term_after_split_nn_bias_8_layer0[] = { + 1.202379f, -0.117005f, -0.135527f, -0.262255f, -0.443658f, -0.078981f, + 0.615653f, -0.124482f, -0.227768f, -0.227014f, -0.135898f, 0.143216f, + -0.225995f, 0.370877f, -0.214821f, -0.227752f, +}; + +static const float av1_early_term_after_split_nn_weights_8_layer1[] = { + 0.376594f, 0.266703f, -0.039847f, 1.680142f, -0.879939f, 0.286806f, + -0.378223f, -0.405295f, -0.021107f, 0.039188f, 0.259308f, 0.193091f, + 0.077994f, -0.269141f, 0.011180f, -0.019262f, +}; + +static const float av1_early_term_after_split_nn_bias_8_layer1[] = { + -1.29585564f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_8 = { + FEATURES, + 1, + 1, + { + 16, + }, + { + av1_early_term_after_split_nn_weights_8_layer0, + av1_early_term_after_split_nn_weights_8_layer1, + }, + { + av1_early_term_after_split_nn_bias_8_layer0, + av1_early_term_after_split_nn_bias_8_layer1, + }, +}; +#undef FEATURES +#undef HIDDEN_NODES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/partition_search.c b/third_party/aom/av1/encoder/partition_search.c new file mode 100644 index 0000000000..1c17b09ee1 --- /dev/null +++ b/third_party/aom/av1/encoder/partition_search.c @@ -0,0 +1,6263 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "aom_dsp/txfm_common.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/nonrd_opt.h" +#include "av1/encoder/partition_search.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tokenize.h" +#include "av1/encoder/var_based_part.h" +#include "av1/encoder/av1_ml_partition_models.h" + +#if CONFIG_TUNE_VMAF +#include "av1/encoder/tune_vmaf.h" +#endif + +#define COLLECT_MOTION_SEARCH_FEATURE_SB 0 + +void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) { + part_sf->partition_search_type = SEARCH_PARTITION; + part_sf->less_rectangular_check_level = 0; + part_sf->use_square_partition_only_threshold = BLOCK_128X128; + part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; + part_sf->default_max_partition_size = BLOCK_LARGEST; + part_sf->default_min_partition_size = BLOCK_4X4; + part_sf->adjust_var_based_rd_partitioning = 0; + part_sf->max_intra_bsize = BLOCK_LARGEST; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + part_sf->fixed_partition_size = BLOCK_16X16; + // Recode loop tolerance %. + part_sf->partition_search_breakout_dist_thr = 0; + part_sf->partition_search_breakout_rate_thr = 0; + part_sf->prune_ext_partition_types_search_level = 0; + part_sf->prune_part4_search = 0; + part_sf->ml_prune_partition = 0; + part_sf->ml_early_term_after_part_split_level = 0; + for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) { + part_sf->ml_partition_search_breakout_thresh[i] = + -1; // -1 means not enabled. + } + part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0; + part_sf->simple_motion_search_split = 0; + part_sf->simple_motion_search_prune_rect = 0; + part_sf->simple_motion_search_early_term_none = 0; + part_sf->simple_motion_search_reduce_search_steps = 0; + part_sf->intra_cnn_based_part_prune_level = 0; + part_sf->ext_partition_eval_thresh = BLOCK_8X8; + part_sf->rect_partition_eval_thresh = BLOCK_128X128; + part_sf->ext_part_eval_based_on_cur_best = 0; + part_sf->prune_ext_part_using_split_info = 0; + part_sf->prune_rectangular_split_based_on_qidx = 0; + part_sf->early_term_after_none_split = 0; + part_sf->ml_predict_breakout_level = 0; + part_sf->prune_sub_8x8_partition_level = 0; + part_sf->simple_motion_search_rect_split = 0; + part_sf->reuse_prev_rd_results_for_part_ab = 0; + part_sf->reuse_best_prediction_for_part_ab = 0; + part_sf->use_best_rd_for_pruning = 0; + part_sf->skip_non_sq_part_based_on_none = 0; +} + +// Reset speed features that works for the baseline encoding, but +// blocks the external partition search. +void av1_reset_sf_for_ext_part(AV1_COMP *const cpi) { + cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions = 0; +} + +#if !CONFIG_REALTIME_ONLY +// If input |features| is NULL, write tpl stats to file for each super block. +// Otherwise, store tpl stats to |features|. +// The tpl stats is computed in the unit of tpl_bsize_1d (16x16). 
+// When writing to text file: +// The first row contains super block position, super block size, +// tpl unit length, number of units in the super block. +// The second row contains the intra prediction cost for each unit. +// The third row contains the inter prediction cost for each unit. +// The forth row contains the motion compensated dependency cost for each unit. +static void collect_tpl_stats_sb(const AV1_COMP *const cpi, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, + aom_partition_features_t *features) { + const AV1_COMMON *const cm = &cpi->common; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) { + return; + } + + TplParams *const tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + // If tpl stats is not established, early return + if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) { + if (features != NULL) features->sb_features.tpl_features.available = 0; + return; + } + + const int tpl_stride = tpl_frame->stride; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int mi_width = + AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col); + const int mi_height = + AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row); + const int col_steps = (mi_width / step) + ((mi_width % step) > 0); + const int row_steps = (mi_height / step) + ((mi_height % step) > 0); + const int num_blocks = col_steps * row_steps; + + if (features == NULL) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s/tpl_feature_sb%d", + cpi->oxcf.partition_info_path, cpi->sb_counter); + FILE *pfile = fopen(filename, "w"); + fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize, + tpl_data->tpl_bsize_1d, num_blocks); + int count = 0; + for (int row = 0; row < mi_height; row += step) { + for (int col = 0; col < mi_width; col += step) { + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, + tpl_data->tpl_stats_block_mis_log2)]; + fprintf(pfile, "%.0f", (double)this_stats->intra_cost); + if (count < num_blocks - 1) fprintf(pfile, ","); + ++count; + } + } + fprintf(pfile, "\n"); + count = 0; + for (int row = 0; row < mi_height; row += step) { + for (int col = 0; col < mi_width; col += step) { + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, + tpl_data->tpl_stats_block_mis_log2)]; + fprintf(pfile, "%.0f", (double)this_stats->inter_cost); + if (count < num_blocks - 1) fprintf(pfile, ","); + ++count; + } + } + fprintf(pfile, "\n"); + count = 0; + for (int row = 0; row < mi_height; row += step) { + for (int col = 0; col < mi_width; col += step) { + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, + tpl_data->tpl_stats_block_mis_log2)]; + const int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + fprintf(pfile, "%.0f", (double)mc_dep_delta); + if (count < num_blocks - 1) fprintf(pfile, ","); + ++count; + } + } + fclose(pfile); + } else { + features->sb_features.tpl_features.available = 1; + features->sb_features.tpl_features.tpl_unit_length = tpl_data->tpl_bsize_1d; + features->sb_features.tpl_features.num_units = num_blocks; + int count = 0; + for (int row = 0; row < mi_height; row += step) { + for (int col = 0; col < 
mi_width; col += step) { + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, + tpl_data->tpl_stats_block_mis_log2)]; + const int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + features->sb_features.tpl_features.intra_cost[count] = + this_stats->intra_cost; + features->sb_features.tpl_features.inter_cost[count] = + this_stats->inter_cost; + features->sb_features.tpl_features.mc_dep_cost[count] = mc_dep_delta; + ++count; + } + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, + FRAME_COUNTS *counts, TX_SIZE tx_size, int depth, + int blk_row, int blk_col, + uint8_t allow_update_cdf) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, mbmi->bsize, + tx_size); + const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col); + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index]; + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + assert(tx_size > TX_4X4); + + if (depth == MAX_VARTX_DEPTH) { + // Don't add to counts in this case + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + return; + } + + if (tx_size == plane_tx_size) { +#if CONFIG_ENTROPY_STATS + ++counts->txfm_partition[ctx][0]; +#endif + if (allow_update_cdf) + update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2); + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + +#if CONFIG_ENTROPY_STATS + ++counts->txfm_partition[ctx][1]; +#endif + if (allow_update_cdf) + update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2); + ++x->txfm_search_info.txb_split_count; + + if (sub_txs == TX_4X4) { + mbmi->inter_tx_size[txb_size_index] = TX_4X4; + mbmi->tx_size = TX_4X4; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, TX_4X4, tx_size); + return; + } + + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = row; + int offsetc = col; + + update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr, + blk_col + offsetc, allow_update_cdf); + } + } + } +} + +static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x, + BLOCK_SIZE plane_bsize, + FRAME_COUNTS *td_counts, + uint8_t allow_update_cdf) { + MACROBLOCKD *xd = &x->e_mbd; + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + + xd->above_txfm_context = + cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK); + + for (int idy = 0; idy < mi_height; idy += bh) { + for (int idx = 0; idx < mi_width; idx += bw) { + update_txfm_count(x, xd, td_counts, 
max_tx_size, 0, idy, idx, + allow_update_cdf); + } + } +} + +static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, + int blk_col) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->bsize; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col); + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index]; + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (tx_size == plane_tx_size) { + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + + } else { + if (tx_size == TX_8X8) { + mbmi->inter_tx_size[txb_size_index] = TX_4X4; + mbmi->tx_size = TX_4X4; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, TX_4X4, tx_size); + return; + } + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { + const int offsetc = blk_col + col; + set_txfm_context(xd, sub_txs, offsetr, offsetc); + } + } + } +} + +static void tx_partition_set_contexts(const AV1_COMMON *const cm, + MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) { + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0); + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + + xd->above_txfm_context = + cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK); + + for (int idy = 0; idy < mi_height; idy += bh) { + for (int idx = 0; idx < mi_width; idx += bw) { + set_txfm_context(xd, max_tx_size, idy, idx); + } + } +} + +static void update_zeromv_cnt(const AV1_COMP *const cpi, + const MB_MODE_INFO *const mi, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + if (mi->ref_frame[0] != LAST_FRAME || !is_inter_block(mi) || + mi->segment_id > CR_SEGMENT_ID_BOOST2) { + return; + } + const AV1_COMMON *const cm = &cpi->common; + const MV mv = mi->mv[0].as_mv; + const int bw = mi_size_wide[bsize] >> 1; + const int bh = mi_size_high[bsize] >> 1; + const int xmis = AOMMIN((cm->mi_params.mi_cols - mi_col) >> 1, bw); + const int ymis = AOMMIN((cm->mi_params.mi_rows - mi_row) >> 1, bh); + const int block_index = + (mi_row >> 1) * (cm->mi_params.mi_cols >> 1) + (mi_col >> 1); + for (int y = 0; y < ymis; y++) { + for (int x = 0; x < xmis; x++) { + // consec_zero_mv is in the scale of 8x8 blocks + const int map_offset = block_index + y * (cm->mi_params.mi_cols >> 1) + x; + if (abs(mv.row) < 10 && abs(mv.col) < 10) { + if (cpi->consec_zero_mv[map_offset] < 255) + cpi->consec_zero_mv[map_offset]++; + } else { + cpi->consec_zero_mv[map_offset] = 0; + } + } + } +} + +static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, TokenExtra **t, RUN_TYPE dry_run, + BLOCK_SIZE bsize, int *rate) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = 
av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO **mi_4x4 = xd->mi; + MB_MODE_INFO *mbmi = mi_4x4[0]; + const int seg_skip = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); + const int mis = cm->mi_params.mi_stride; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + const int is_inter = is_inter_block(mbmi); + + // Initialize tx_mode and tx_size_search_method + TxfmSearchParams *txfm_params = &x->txfm_search_params; + set_tx_size_search_method( + cm, &cpi->winner_mode_params, txfm_params, + cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + if (!is_inter) { + xd->cfl.store_y = store_cfl_required(cm, xd); + mbmi->skip_txfm = 1; + for (int plane = 0; plane < num_planes; ++plane) { + av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run, + cpi->optimize_seg_arr[mbmi->segment_id]); + } + + // If there is at least one lossless segment, force the skip for the intra + // block to be 0, in order to avoid the segment_id being changed in + // write_segment_id(). + if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map && + cpi->enc_seg.has_lossless_segment) + mbmi->skip_txfm = 0; + + xd->cfl.store_y = 0; + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) { + if (mbmi->palette_mode_info.palette_size[plane] > 0) { + if (!dry_run) { + av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size, + PALETTE_MAP, tile_data->allow_update_cdf, + td->counts); + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + *rate += + av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP); + } + } + } + } + + av1_update_intra_mb_txb_context(cpi, td, dry_run, bsize, + tile_data->allow_update_cdf); + } else { + int ref; + const int is_compound = has_second_ref(mbmi); + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (ref = 0; ref < 1 + is_compound; ++ref) { + const YV12_BUFFER_CONFIG *cfg = + get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]); + assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); + av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + xd->block_ref_scale_factors[ref], num_planes); + } + // The predicted sample of inter mode (for the luma plane) cannot be reused + // if the nonrd_check_partition_split speed feature is enabled, since in + // such cases the buffer may not contain the predicted sample of the best + // mode. + const int start_plane = + (x->reuse_inter_pred && (!cpi->sf.rt_sf.nonrd_check_partition_split) && + cm->seq_params->bit_depth == AOM_BITS_8) + ?
1 + : 0; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + start_plane, av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) { + assert(cpi->oxcf.motion_mode_cfg.enable_obmc); + av1_build_obmc_inter_predictors_sb(cm, xd); + } + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, + cm->current_frame.order_hint, plane, pixel_c, + pixel_r, pd->width, pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#else + (void)num_planes; +#endif + + av1_encode_sb(cpi, x, bsize, dry_run); + av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate, + tile_data->allow_update_cdf); + } + + if (!dry_run) { + if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1; + if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id] && mbmi->bsize > BLOCK_4X4 && + !(is_inter && (mbmi->skip_txfm || seg_skip))) { + if (is_inter) { + tx_partition_count_update(cm, x, bsize, td->counts, + tile_data->allow_update_cdf); + } else { + if (mbmi->tx_size != max_txsize_rect_lookup[bsize]) + ++x->txfm_search_info.txb_split_count; + if (block_signals_txsize(bsize)) { + const int tx_size_ctx = get_tx_size_context(xd); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(mbmi->tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + + if (tile_data->allow_update_cdf) + update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + depth, max_depths + 1); +#if CONFIG_ENTROPY_STATS + ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth]; +#endif + } + } + assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi))); + } else { + int i, j; + TX_SIZE intra_tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) { + if (xd->lossless[mbmi->segment_id]) { + intra_tx_size = TX_4X4; + } else { + intra_tx_size = + tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type); + } + } else { + intra_tx_size = mbmi->tx_size; + } + + const int cols = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_width); + const int rows = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_height); + for (j = 0; j < rows; j++) { + for (i = 0; i < cols; i++) mi_4x4[mis * j + i]->tx_size = intra_tx_size; + } + + if (intra_tx_size != max_txsize_rect_lookup[bsize]) + ++x->txfm_search_info.txb_split_count; + } + } + + if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && + block_signals_txsize(mbmi->bsize) && is_inter && + !(mbmi->skip_txfm || seg_skip) && !xd->lossless[mbmi->segment_id]) { + if (dry_run) tx_partition_set_contexts(cm, xd, bsize); + } else { + TX_SIZE tx_size = mbmi->tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) { + if (xd->lossless[mbmi->segment_id]) { + tx_size = TX_4X4; + } else { + tx_size = tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type); + } + } else { + tx_size = (bsize > BLOCK_4X4) ? 
tx_size : TX_4X4; + } + mbmi->tx_size = tx_size; + set_txfm_ctxs(tx_size, xd->width, xd->height, + (mbmi->skip_txfm || seg_skip) && is_inter_block(mbmi), xd); + } + + if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) { + cfl_store_block(xd, mbmi->bsize, mbmi->tx_size); + } + if (!dry_run) { + if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->svc.temporal_layer_id == 0 && + cpi->sf.rt_sf.use_temporal_noise_estimate && + (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) + update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize); + } +} + +static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + AQ_MODE aq_mode, MB_MODE_INFO *mbmi) { + x->rdmult = cpi->rd.RDMULT; + + if (aq_mode != NO_AQ) { + assert(mbmi != NULL); + if (aq_mode == VARIANCE_AQ) { + if (cpi->vaq_refresh) { + const int energy = bsize <= BLOCK_16X16 + ? x->mb_energy + : av1_log_block_var(cpi, x, bsize); + mbmi->segment_id = energy; + } + x->rdmult = set_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == COMPLEXITY_AQ) { + x->rdmult = set_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) + x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + } + } + +#if !CONFIG_REALTIME_ONLY + if (cpi->common.delta_q_info.delta_q_present_flag && + !cpi->sf.rt_sf.use_nonrd_pick_mode) { + x->rdmult = av1_get_cb_rdmult(cpi, x, bsize, mi_row, mi_col); + } +#endif // !CONFIG_REALTIME_ONLY + + if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM) { + av1_set_ssim_rdmult(cpi, &x->errorperbit, bsize, mi_row, mi_col, + &x->rdmult); + } +#if CONFIG_SALIENCY_MAP + else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP) { + av1_set_saliency_map_vmaf_rdmult(cpi, &x->errorperbit, + cpi->common.seq_params->sb_size, mi_row, + mi_col, &x->rdmult); + } +#endif +#if CONFIG_TUNE_VMAF + else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN || + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { + av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } +#endif +#if CONFIG_TUNE_BUTTERAUGLI + else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { + av1_set_butteraugli_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } +#endif + if (cpi->oxcf.mode == ALLINTRA) { + x->rdmult = (int)(((int64_t)x->rdmult * x->intra_sb_rdmult_modifier) >> 7); + } + + // Check to make sure that the adjustments above have not caused the + // rd multiplier to be truncated to 0. + x->rdmult = (x->rdmult > 0) ? 
x->rdmult : 1; +} + +void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi, + const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + + set_entropy_context(xd, mi_row, mi_col, num_planes); + xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + // Set up destination pointers. + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height, + mi_width, cpi->oxcf.border_in_pixels); + + set_plane_n4(xd, mi_width, mi_height, num_planes); + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_params.mi_rows, cm->mi_params.mi_cols); + + // Set up source buffers. + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + + // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs() + xd->tile = *tile; +} + +void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + + av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + + // Setup segment ID. + mbmi = xd->mi[0]; + mbmi->segment_id = 0; + if (seg->enabled) { + if (seg->enabled && !cpi->vaq_refresh) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + mbmi->segment_id = + map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0; + } + av1_init_plane_quantizers(cpi, x, mbmi->segment_id, 0); + } +#ifndef NDEBUG + x->last_set_offsets_loc.mi_row = mi_row; + x->last_set_offsets_loc.mi_col = mi_col; + x->last_set_offsets_loc.bsize = bsize; +#endif // NDEBUG +} + +/*!\brief Hybrid intra mode search. + * + * \ingroup intra_mode_search + * \callgraph + * \callergraph + * This is top level function for mode search for intra frames in non-RD + * optimized case. Depending on speed feature and block size it calls + * either non-RD or RD optimized intra mode search. + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the data for + the current macroblock + * \param[in] rd_cost Struct to keep track of the RD information + * \param[in] bsize Current block size + * \param[in] ctx Structure to hold snapshot of coding context + during the mode picking process + * + * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x + * is modified to store information about the best mode computed + * in this function. The rd_cost struct is also updated with the RD stats + * corresponding to the best mode found. 
+ */ + +static AOM_INLINE void hybrid_intra_mode_search(AV1_COMP *cpi, + MACROBLOCK *const x, + RD_STATS *rd_cost, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + int use_rdopt = 0; + const int hybrid_intra_pickmode = cpi->sf.rt_sf.hybrid_intra_pickmode; + // Use rd pick for intra mode search based on block size and variance. + if (hybrid_intra_pickmode && bsize < BLOCK_16X16) { + unsigned int var_thresh[3] = { 0, 101, 201 }; + assert(hybrid_intra_pickmode <= 3); + if (x->source_variance >= var_thresh[hybrid_intra_pickmode - 1]) + use_rdopt = 1; + } + + if (use_rdopt) + av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + else + av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); +} + +// For real time/allintra row-mt enabled multi-threaded encoding with cost +// update frequency set to COST_UPD_TILE/COST_UPD_OFF, tile ctxt is not updated +// at superblock level. Thus, the encoding of the top-right superblock need not +// be complete for updating tile ctxt. However, when encoding a block whose +// right edge is also the superblock edge, intra and inter mode evaluation +// (ref mv list population) requires the encoding of the top-right superblock +// to be complete. So, here, we delay the waiting of threads until the data +// from the top-right superblock region is actually needed. +static AOM_INLINE void wait_for_top_right_sb( + AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync, + TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int sb_size_in_mi = mi_size_wide[sb_size]; + const int bw_in_mi = mi_size_wide[bsize]; + const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1); + const int blk_col_in_sb = mi_col & (sb_size_in_mi - 1); + const int top_right_block_in_sb = + (blk_row_in_sb == 0) && (blk_col_in_sb + bw_in_mi >= sb_size_in_mi); + + // Don't wait if the block is not the top-right block in the superblock. + if (!top_right_block_in_sb) return; + + // Wait for the top-right superblock to finish encoding. + const int sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2; + const int sb_col_in_tile = + (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2; + + enc_row_mt->sync_read_ptr(row_mt_sync, sb_row_in_tile, sb_col_in_tile); +} + +/*!\brief Interface for AV1 mode search for an individual coding block + * + * \ingroup partition_search + * \callgraph + * \callergraph + * Searches prediction modes, transform, and coefficient coding modes for an + * individual coding block. This function is the top-level interface that + * directs the encoder to the proper mode search function, among these + * implemented for inter/intra + rd/non-rd + non-skip segment/skip segment.
+ * + * \param[in] cpi Top-level encoder structure + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during + * encoding + * \param[in] x Pointer to structure holding all the data for + * the current macroblock + * \param[in] mi_row Row coordinate of the block in a step size of + * MI_SIZE + * \param[in] mi_col Column coordinate of the block in a step size of + * MI_SIZE + * \param[in] rd_cost Pointer to structure holding rate and distortion + * stats for the current block + * \param[in] partition Partition mode of the parent block + * \param[in] bsize Current block size + * \param[in] ctx Pointer to structure holding coding contexts and + * chosen modes for the current block + * \param[in] best_rd Upper bound of rd cost of a valid partition + * + * \remark Nothing is returned. Instead, the chosen modes and contexts necessary + * for reconstruction are stored in ctx, the rate-distortion stats are stored in + * rd_cost. If no valid mode leading to rd_cost <= best_rd, the status will be + * signalled by an INT64_MAX rd_cost->rdcost. + */ +static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, + MACROBLOCK *const x, int mi_row, int mi_col, + RD_STATS *rd_cost, PARTITION_TYPE partition, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + RD_STATS best_rd) { + if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) { + ctx->rd_stats.rdcost = INT64_MAX; + ctx->rd_stats.skip_txfm = 0; + av1_invalid_rd_stats(rd_cost); + return; + } + + av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize); + + if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab && + ctx->rd_mode_is_ready) { + assert(ctx->mic.bsize == bsize); + assert(ctx->mic.partition == partition); + rd_cost->rate = ctx->rd_stats.rate; + rd_cost->dist = ctx->rd_stats.dist; + rd_cost->rdcost = ctx->rd_stats.rdcost; + return; + } + + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + + int i; + + // This is only needed for real time/allintra row-mt enabled multi-threaded + // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF. + wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync, + &tile_data->tile_info, cm->seq_params->sb_size, + cm->seq_params->mib_size_log2, bsize, mi_row, mi_col); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif + + mbmi = xd->mi[0]; + mbmi->bsize = bsize; + mbmi->partition = partition; + +#if CONFIG_RD_DEBUG + mbmi->mi_row = mi_row; + mbmi->mi_col = mi_col; +#endif + + // Sets up the tx_type_map buffer in MACROBLOCKD. + xd->tx_type_map = txfm_info->tx_type_map_; + xd->tx_type_map_stride = mi_size_wide[bsize]; + + for (i = 0; i < num_planes; ++i) { + p[i].coeff = ctx->coeff[i]; + p[i].qcoeff = ctx->qcoeff[i]; + p[i].dqcoeff = ctx->dqcoeff[i]; + p[i].eobs = ctx->eobs[i]; + p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + } + + for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; + + ctx->skippable = 0; + // Set to zero to make sure we do not use the previous encoded frame stats + mbmi->skip_txfm = 0; + // Reset skip mode flag. 
+ mbmi->skip_mode = 0; + + x->source_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + // Save rdmult before it might be changed, so it can be restored later. + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi); + // Set error per bit for current rdmult + av1_set_error_per_bit(&x->errorperbit, x->rdmult); + av1_rd_cost_update(x->rdmult, &best_rd); + + // If set best_rd.rdcost to INT64_MAX, the encoder will not use any previous + // rdcost information for the following mode search. + // Disabling the feature could get some coding gain, with encoder slowdown. + if (!cpi->sf.part_sf.use_best_rd_for_pruning) { + av1_invalid_rd_stats(&best_rd); + } + + // Find best coding mode & reconstruct the MB so it is available + // as a predictor for MBs that follow in the SB + if (frame_is_intra_only(cm)) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif + av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif + } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, + rd_cost, bsize, ctx, best_rd.rdcost); + } else { + av1_rd_pick_inter_mode(cpi, tile_data, x, rd_cost, bsize, ctx, + best_rd.rdcost); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif + } + + // Examine the resulting rate and for AQ mode 2 make a segment choice. + if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ && + bsize >= BLOCK_16X16) { + av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); + } + + x->rdmult = orig_rdmult; + + // TODO(jingning) The rate-distortion optimization flow needs to be + // refactored to provide proper exit/return handle. 
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; + + ctx->rd_stats.rate = rd_cost->rate; + ctx->rd_stats.dist = rd_cost->dist; + ctx->rd_stats.rdcost = rd_cost->rdcost; + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif +} + +static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { + MACROBLOCK *x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const CurrentFrame *const current_frame = &cm->current_frame; + const BLOCK_SIZE bsize = mbmi->bsize; + FRAME_CONTEXT *fc = xd->tile_ctx; + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + + if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active && + is_comp_ref_allowed(bsize)) { + const int skip_mode_ctx = av1_get_skip_mode_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++; +#endif + update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2); + } + + if (!mbmi->skip_mode && !seg_ref_active) { + const int skip_ctx = av1_get_skip_txfm_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip_txfm[skip_ctx][mbmi->skip_txfm]++; +#endif + update_cdf(fc->skip_txfm_cdfs[skip_ctx], mbmi->skip_txfm, 2); + } + +#if CONFIG_ENTROPY_STATS + // delta quant applies to both intra and inter + const int super_block_upper_left = + ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag && + (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && + super_block_upper_left) { + const int dq = (mbmi->current_qindex - xd->current_base_qindex) / + delta_q_info->delta_q_res; + const int absdq = abs(dq); + for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { + td->counts->delta_q[i][1]++; + } + if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / + delta_q_info->delta_lf_res; + const int abs_delta_lf = abs(delta_lf); + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + td->counts->delta_lf_multi[lf_id][i][1]++; + } + if (abs_delta_lf < DELTA_LF_SMALL) + td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++; + } + } else { + const int delta_lf = + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / + delta_q_info->delta_lf_res; + const int abs_delta_lf = abs(delta_lf); + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + td->counts->delta_lf[i][1]++; + } + if (abs_delta_lf < DELTA_LF_SMALL) + td->counts->delta_lf[abs_delta_lf][0]++; + } + } + } +#endif + + if (!is_inter_block(mbmi)) { + av1_sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi, + frame_is_intra_only(cm)); + } + + if (av1_allow_intrabc(cm)) { + const int is_intrabc = is_intrabc_block(mbmi); + update_cdf(fc->intrabc_cdf, is_intrabc, 2); +#if CONFIG_ENTROPY_STATS + ++td->counts->intrabc[is_intrabc]; +#endif // CONFIG_ENTROPY_STATS + if (is_intrabc) { + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + const int_mv dv_ref = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv; + av1_update_mv_stats(&mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc, + MV_SUBPEL_NONE); + } + } + + if (frame_is_intra_only(cm) || mbmi->skip_mode) return; + + FRAME_COUNTS *const counts = td->counts; + const int inter_block = is_inter_block(mbmi); + + if (!seg_ref_active) { +#if CONFIG_ENTROPY_STATS + counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; +#endif + update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)], + inter_block, 2); + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (inter_block) { + const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; + if (current_frame->reference_mode == REFERENCE_MODE_SELECT) { + if (is_comp_ref_allowed(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->comp_inter[av1_get_reference_mode_context(xd)] + [has_second_ref(mbmi)]++; +#endif // CONFIG_ENTROPY_STATS + update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2); + } + } + + if (has_second_ref(mbmi)) { + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? 
UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; + update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type, + COMP_REFERENCE_TYPES); +#if CONFIG_ENTROPY_STATS + counts->comp_ref_type[av1_get_comp_reference_type_context(xd)] + [comp_ref_type]++; +#endif // CONFIG_ENTROPY_STATS + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = (ref0 == BWDREF_FRAME); + update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts + ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit) { + const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME); + update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1] + [bit1]++; +#endif // CONFIG_ENTROPY_STATS + if (bit1) { + update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd), + ref1 == GOLDEN_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2] + [ref1 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } else { + const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); + update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit) { + update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1] + [ref0 == LAST2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } else { + update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2] + [ref0 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0] + [ref1 == ALTREF_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + if (ref1 != ALTREF_FRAME) { + update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd), + ref1 == ALTREF2_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1] + [ref1 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } else { + const int bit = (ref0 >= BWDREF_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (bit) { + assert(ref0 <= ALTREF_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] + [ref0 == ALTREF_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + if (ref0 != ALTREF_FRAME) { + update_cdf(av1_get_pred_cdf_single_ref_p6(xd), + ref0 == ALTREF2_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5] + [ref0 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } else { + const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit1) { + update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3] + 
[ref0 != LAST_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } else { + update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4] + [ref0 != LAST3_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } + + if (cm->seq_params->enable_interintra_compound && + is_interintra_allowed(mbmi)) { + const int bsize_group = size_group_lookup[bsize]; + if (mbmi->ref_frame[1] == INTRA_FRAME) { +#if CONFIG_ENTROPY_STATS + counts->interintra[bsize_group][1]++; +#endif + update_cdf(fc->interintra_cdf[bsize_group], 1, 2); +#if CONFIG_ENTROPY_STATS + counts->interintra_mode[bsize_group][mbmi->interintra_mode]++; +#endif + update_cdf(fc->interintra_mode_cdf[bsize_group], + mbmi->interintra_mode, INTERINTRA_MODES); + if (av1_is_wedge_used(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++; +#endif + update_cdf(fc->wedge_interintra_cdf[bsize], + mbmi->use_wedge_interintra, 2); + if (mbmi->use_wedge_interintra) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++; +#endif + update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index, + 16); + } + } + } else { +#if CONFIG_ENTROPY_STATS + counts->interintra[bsize_group][0]++; +#endif + update_cdf(fc->interintra_cdf[bsize_group], 0, 2); + } + } + + const MOTION_MODE motion_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed == WARPED_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->motion_mode[bsize][mbmi->motion_mode]++; +#endif + update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode, + MOTION_MODES); + } else if (motion_allowed == OBMC_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; +#endif + update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2); + } + } + + if (has_second_ref(mbmi)) { + assert(current_frame->reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params->enable_masked_compound; + if (masked_compound_used) { + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); +#if CONFIG_ENTROPY_STATS + ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx]; +#endif + update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx], + mbmi->comp_group_idx, 2); + } + + if (mbmi->comp_group_idx == 0) { + const int comp_index_ctx = get_comp_index_context(cm, xd); +#if CONFIG_ENTROPY_STATS + ++counts->compound_index[comp_index_ctx][mbmi->compound_idx]; +#endif + update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx, + 2); + } else { + assert(masked_compound_used); + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->compound_type[bsize][mbmi->interinter_comp.type - + COMPOUND_WEDGE]; +#endif + update_cdf(fc->compound_type_cdf[bsize], + mbmi->interinter_comp.type - COMPOUND_WEDGE, + MASKED_COMPOUND_TYPES); + } + } + } + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++; +#endif + update_cdf(fc->wedge_idx_cdf[bsize], + mbmi->interinter_comp.wedge_index, 16); + } + } + } + } + + 
if (inter_block && cm->features.interp_filter == SWITCHABLE && + av1_is_interp_needed(xd)) { + update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter); + } + if (inter_block && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const PREDICTION_MODE mode = mbmi->mode; + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + if (has_second_ref(mbmi)) { +#if CONFIG_ENTROPY_STATS + ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; +#endif + update_cdf(fc->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES); + } else { + av1_update_inter_mode_stats(fc, counts, mode, mode_ctx); + } + + const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; + if (new_mv) { + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + const uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2); +#if CONFIG_ENTROPY_STATS + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; +#endif + if (mbmi->ref_mv_idx == idx) break; + } + } + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + const uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2); +#if CONFIG_ENTROPY_STATS + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; +#endif + if (mbmi->ref_mv_idx == idx - 1) break; + } + } + } + if (have_newmv_in_inter_mode(mbmi->mode)) { + const int allow_hp = cm->features.cur_frame_force_integer_mv + ? MV_SUBPEL_NONE + : cm->features.allow_high_precision_mv; + if (new_mv) { + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } + } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) { + const int ref = 1; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) { + const int ref = 0; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } + } + } +} + +/*!\brief Reconstructs an individual coding block + * + * \ingroup partition_search + * Reconstructs an individual coding block by applying the chosen modes stored + * in ctx, also updates mode counts and entropy models. 
+ * + * \param[in] cpi Top-level encoder structure + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during encoding + * \param[in] td Pointer to thread data + * \param[in] tp Pointer to the starting token + * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE + * \param[in] mi_col Column coordinate of the block in a step size of + * MI_SIZE + * \param[in] dry_run A code indicating whether it is part of the final + * pass for reconstructing the superblock + * \param[in] bsize Current block size + * \param[in] partition Partition mode of the parent block + * \param[in] ctx Pointer to structure holding coding contexts and the + * chosen modes for the current block + * \param[in] rate Pointer to the total rate for the current block + * + * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters) + * will be updated in the pixel buffers in td->mb.e_mbd. Also, the chosen modes + * will be stored in the MB_MODE_INFO buffer td->mb.e_mbd.mi[0]. + */ +static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, TokenExtra **tp, int mi_row, int mi_col, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx, + int *rate) { + const AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const int subsampling_x = cm->seq_params->subsampling_x; + const int subsampling_y = cm->seq_params->subsampling_y; + + av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + const int origin_mult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->partition = partition; + av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); + + if (!dry_run) { + set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y], + x->cb_offset[PLANE_TYPE_UV]); + assert(x->cb_offset[PLANE_TYPE_Y] < + (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size])); + assert(x->cb_offset[PLANE_TYPE_UV] < + ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >> + (subsampling_x + subsampling_y))); + } + + encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate); + + if (!dry_run) { + update_cb_offsets(x, bsize, subsampling_x, subsampling_y); + if (bsize == cpi->common.seq_params->sb_size && mbmi->skip_txfm == 1 && + cm->delta_q_info.delta_lf_present_flag) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; + } + if (has_second_ref(mbmi)) { + if (mbmi->compound_idx == 0 || + mbmi->interinter_comp.type == COMPOUND_AVERAGE) + mbmi->comp_group_idx = 0; + else + mbmi->comp_group_idx = 1; + } + + // delta quant applies to both intra and inter + const int super_block_upper_left = + ((mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params->mib_size - 1)) == 0); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag && + (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && + super_block_upper_left) { + xd->current_base_qindex = mbmi->current_qindex; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } + } else { + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + + RD_COUNTS *rdc = &td->rd_counts; + if (mbmi->skip_mode) { + assert(!frame_is_intra_only(cm)); + rdc->skip_mode_used_flag = 1; + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + assert(has_second_ref(mbmi)); + rdc->compound_ref_used_flag = 1; + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } else { + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (is_inter_block(mbmi)) { + av1_collect_neighbors_ref_counts(xd); + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + if (has_second_ref(mbmi)) { + // This flag is also updated for 4x4 blocks + rdc->compound_ref_used_flag = 1; + } + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } + } + } + + if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); + + // Gather obmc and warped motion count to update the probability. + if ((cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) || + (cm->features.allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) { + const int inter_block = is_inter_block(mbmi); + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active && inter_block) { + const MOTION_MODE motion_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed >= OBMC_CAUSAL) { + td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; + } + if (motion_allowed == WARPED_CAUSAL) { + td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++; + } + } + } + } + } + // TODO(Ravi/Remya): Move this copy function to a better logical place + // This function will copy the best mode information from block + // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This + // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during + // bitstream preparation. + av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + x->rdmult = origin_mult; +} + +/*!\brief Reconstructs a partition (may contain multiple coding blocks) + * + * \ingroup partition_search + * Reconstructs a sub-partition of the superblock by applying the chosen modes + * and partition trees stored in pc_tree. 
+ * + * \param[in] cpi Top-level encoder structure + * \param[in] td Pointer to thread data + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during encoding + * \param[in] tp Pointer to the starting token + * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE + * \param[in] mi_col Column coordinate of the block in a step size of + * MI_SIZE + * \param[in] dry_run A code indicating whether it is part of the final + * pass for reconstructing the superblock + * \param[in] bsize Current block size + * \param[in] pc_tree Pointer to the PC_TREE node storing the picked + * partitions and mode info for the current block + * \param[in] rate Pointer to the total rate for the current block + * + * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters) + * will be updated in the pixel buffers in td->mb.e_mbd. + */ +static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, int mi_row, + int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, + PC_TREE *pc_tree, int *rate) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int hbs = mi_size_wide[bsize] / 2; + const int is_partition_root = bsize >= BLOCK_8X8; + const int ctx = is_partition_root + ? partition_plane_context(xd, mi_row, mi_col, bsize) + : -1; + const PARTITION_TYPE partition = pc_tree->partitioning; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); +#if !CONFIG_REALTIME_ONLY + int quarter_step = mi_size_wide[bsize] / 4; + int i; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); +#endif + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + if (subsize == BLOCK_INVALID) return; + + if (!dry_run && ctx >= 0) { + const int has_rows = (mi_row + hbs) < mi_params->mi_rows; + const int has_cols = (mi_col + hbs) < mi_params->mi_cols; + + if (has_rows && has_cols) { +#if CONFIG_ENTROPY_STATS + td->counts->partition[ctx][partition]++; +#endif + + if (tile_data->allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + update_cdf(fc->partition_cdf[ctx], partition, + partition_cdf_length(bsize)); + } + } + } + + switch (partition) { + case PARTITION_NONE: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->none, rate); + break; + case PARTITION_VERT: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->vertical[0], rate); + if (mi_col + hbs < mi_params->mi_cols) { + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, pc_tree->vertical[1], rate); + } + break; + case PARTITION_HORZ: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->horizontal[0], rate); + if (mi_row + hbs < mi_params->mi_rows) { + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, pc_tree->horizontal[1], rate); + } + break; + case PARTITION_SPLIT: + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize, + pc_tree->split[0], rate); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize, + pc_tree->split[1], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize, + pc_tree->split[2], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + 
hbs, dry_run, + subsize, pc_tree->split[3], rate); + break; + +#if !CONFIG_REALTIME_ONLY + case PARTITION_HORZ_A: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, pc_tree->horizontala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, pc_tree->horizontala[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, pc_tree->horizontala[2], rate); + break; + case PARTITION_HORZ_B: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->horizontalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, pc_tree->horizontalb[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, pc_tree->horizontalb[2], rate); + break; + case PARTITION_VERT_A: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, pc_tree->verticala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, pc_tree->verticala[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, pc_tree->verticala[2], rate); + + break; + case PARTITION_VERT_B: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, pc_tree->verticalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, pc_tree->verticalb[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, pc_tree->verticalb[2], rate); + break; + case PARTITION_HORZ_4: + for (i = 0; i < SUB_PARTITIONS_PART4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= mi_params->mi_rows) break; + + encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize, + partition, pc_tree->horizontal4[i], rate); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < SUB_PARTITIONS_PART4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= mi_params->mi_cols) break; + encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize, + partition, pc_tree->vertical4[i], rate); + } + break; +#endif + default: assert(0 && "Invalid partition type."); break; + } + + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +static AOM_INLINE int is_adjust_var_based_part_enabled( + AV1_COMMON *const cm, const PARTITION_SPEED_FEATURES *const part_sf, + BLOCK_SIZE bsize) { + if (part_sf->partition_search_type != VAR_BASED_PARTITION) return 0; + if (part_sf->adjust_var_based_rd_partitioning == 0 || + part_sf->adjust_var_based_rd_partitioning > 2) + return 0; + + if (bsize <= BLOCK_32X32) return 1; + if (part_sf->adjust_var_based_rd_partitioning == 2) { + const int is_larger_qindex = cm->quant_params.base_qindex > 190; + const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360; + return is_360p_or_larger && is_larger_qindex && bsize == BLOCK_64X64; + } + return 0; +} + +/*!\brief AV1 block partition search (partition estimation and partial search). +* +* \ingroup partition_search +* Encode the block by applying pre-calculated partition patterns that are +* represented by coding block sizes stored in the mbmi array. Minor partition +* adjustments are tested and applied if they lead to lower rd costs. The +* partition types are limited to a basic set: none, horz, vert, and split. 
+* +* \param[in] cpi Top-level encoder structure +* \param[in] td Pointer to thread data +* \param[in] tile_data Pointer to struct holding adaptive +data/contexts/models for the tile during encoding +* \param[in] mib Array representing MB_MODE_INFO pointers for mi +blocks starting from the first pixel of the current +block +* \param[in] tp Pointer to the starting token +* \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE +* \param[in] mi_col Column coordinate of the block in a step size of +MI_SIZE +* \param[in] bsize Current block size +* \param[in] rate Pointer to the final rate for encoding the current +block +* \param[in] dist Pointer to the final distortion of the current block +* \param[in] do_recon Whether the reconstruction function needs to be run, +either for finalizing a superblock or providing +reference for future sub-partitions +* \param[in] pc_tree Pointer to the PC_TREE node holding the picked +partitions and mode info for the current block +* +* \remark Nothing is returned. The pc_tree struct is modified to store the +* picked partition and modes. The rate and dist are also updated with those +* corresponding to the best partition found. +*/ +void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, + MB_MODE_INFO **mib, TokenExtra **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *rate, + int64_t *dist, int do_recon, PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + const int pl = (bsize >= BLOCK_8X8) + ? partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + const PARTITION_TYPE partition = + (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize) + : PARTITION_NONE; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc; + BLOCK_SIZE bs_type = mib[0]->bsize; + int use_partition_none = 0; + x->try_merge_partition = 0; + + if (pc_tree->none == NULL) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + PICK_MODE_CONTEXT *ctx_none = pc_tree->none; + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + // In rt mode, currently the min partition size is BLOCK_8X8. + assert(bsize >= cpi->sf.part_sf.default_min_partition_size); + + av1_invalid_rd_stats(&last_part_rdc); + av1_invalid_rd_stats(&none_rdc); + av1_invalid_rd_stats(&chosen_rdc); + av1_invalid_rd_stats(&invalid_rdc); + + pc_tree->partitioning = partition; + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + if (bsize == BLOCK_16X16 && cpi->vaq_refresh) { + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + x->mb_energy = av1_log_block_var(cpi, x, bsize); + } + + // Save rdmult before it might be changed, so it can be restored later. 
+ const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + + if (partition != PARTITION_NONE && + is_adjust_var_based_part_enabled(cm, &cpi->sf.part_sf, bsize) && + (mi_row + hbs < mi_params->mi_rows && + mi_col + hbs < mi_params->mi_cols)) { + assert(bsize > cpi->sf.part_sf.default_min_partition_size); + mib[0]->bsize = bsize; + pc_tree->partitioning = PARTITION_NONE; + x->try_merge_partition = 1; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, PARTITION_NONE, + bsize, ctx_none, invalid_rdc); + + if (none_rdc.rate < INT_MAX) { + none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + } + + // Try to skip split partition evaluation based on none partition + // characteristics. + if (none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) { + use_partition_none = 1; + } + + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + mib[0]->bsize = bs_type; + pc_tree->partitioning = partition; + } + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + pc_tree->split[i]->index = i; + } + switch (partition) { + case PARTITION_NONE: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_NONE, bsize, ctx_none, invalid_rdc); + break; + case PARTITION_HORZ: + if (use_partition_none) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + pc_tree->horizontal[i] = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->horizontal[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_HORZ, subsize, pc_tree->horizontal[0], + invalid_rdc); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_row + hbs < mi_params->mi_rows) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0]; + av1_init_rd_stats(&tmp_rdc); + av1_update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, + NULL); + pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, + PARTITION_HORZ, subsize, pc_tree->horizontal[1], + invalid_rdc); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_VERT: + if (use_partition_none) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + pc_tree->vertical[i] = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->vertical[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_col + hbs < mi_params->mi_cols) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0]; + av1_init_rd_stats(&tmp_rdc); + av1_update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, 
DRY_RUN_NORMAL, subsize, + NULL); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, + PARTITION_VERT, subsize, + pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_SPLIT: + if (use_partition_none) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + + last_part_rdc.rate = 0; + last_part_rdc.dist = 0; + last_part_rdc.rdcost = 0; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + int jj = i >> 1, ii = i & 0x01; + RD_STATS tmp_rdc; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + av1_init_rd_stats(&tmp_rdc); + av1_rd_use_partition( + cpi, td, tile_data, + mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp, + mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, + &tmp_rdc.dist, i != (SUB_PARTITIONS_SPLIT - 1), pc_tree->split[i]); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + } + break; + case PARTITION_VERT_A: + case PARTITION_VERT_B: + case PARTITION_HORZ_A: + case PARTITION_HORZ_B: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + assert(0 && "Cannot handle extended partition types"); + default: assert(0); break; + } + + if (last_part_rdc.rate < INT_MAX) { + last_part_rdc.rate += mode_costs->partition_cost[pl][partition]; + last_part_rdc.rdcost = + RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist); + } + + if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION && + cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) && + partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && + (mi_row + bs < mi_params->mi_rows || + mi_row + hbs == mi_params->mi_rows) && + (mi_col + bs < mi_params->mi_cols || + mi_col + hbs == mi_params->mi_cols)) { + BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + chosen_rdc.rate = 0; + chosen_rdc.dist = 0; + + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + pc_tree->partitioning = PARTITION_SPLIT; + + // Split partition. 
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + RD_STATS tmp_rdc; + + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + pc_tree->split[i]->partitioning = PARTITION_NONE; + if (pc_tree->split[i]->none == NULL) + pc_tree->split[i]->none = + av1_alloc_pmc(cpi, split_subsize, &td->shared_coeff_buf); + if (!pc_tree->split[i]->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc, + PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none, + invalid_rdc); + + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&chosen_rdc); + break; + } + + chosen_rdc.rate += tmp_rdc.rate; + chosen_rdc.dist += tmp_rdc.dist; + + if (i != SUB_PARTITIONS_SPLIT - 1) + encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, + OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL); + + chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; + } + if (chosen_rdc.rate < INT_MAX) { + chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; + chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist); + } + } + + // If last_part is better set the partitioning to that. + if (last_part_rdc.rdcost < chosen_rdc.rdcost) { + mib[0]->bsize = bs_type; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition; + + chosen_rdc = last_part_rdc; + } + // If none was better set the partitioning to that. + if (none_rdc.rdcost < INT64_MAX && + none_rdc.rdcost - (none_rdc.rdcost >> 9) < chosen_rdc.rdcost) { + mib[0]->bsize = bsize; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; + chosen_rdc = none_rdc; + } + + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + // We must have chosen a partitioning and encoding or we'll fail later on. + // No other opportunities for success. 
+ if (bsize == cm->seq_params->sb_size) + assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif + if (do_recon) { + if (bsize == cm->seq_params->sb_size) { + // NOTE: To get estimate for rate due to the tokens, use: + // int rate_coeffs = 0; + // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, + // bsize, pc_tree, &rate_coeffs); + set_cb_offsets(x->cb_offset, 0, 0); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif + + *rate = chosen_rdc.rate; + *dist = chosen_rdc.dist; + x->rdmult = orig_rdmult; +} + +static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data, + ThreadData *td, TokenExtra **tp, int mi_row, + int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize, + PARTITION_TYPE partition, + PICK_MODE_CONTEXT *const ctx, int *rate) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing((AV1_COMP *)cpi, encode_b_nonrd_time); +#endif + const AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + const int origin_mult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->partition = partition; + av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); + const int subsampling_x = cpi->common.seq_params->subsampling_x; + const int subsampling_y = cpi->common.seq_params->subsampling_y; + if (!dry_run) { + set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y], + x->cb_offset[PLANE_TYPE_UV]); + assert(x->cb_offset[PLANE_TYPE_Y] < + (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size])); + assert(x->cb_offset[PLANE_TYPE_UV] < + ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >> + (subsampling_x + subsampling_y))); + } + + encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate); + if (!dry_run) { + update_cb_offsets(x, bsize, subsampling_x, subsampling_y); + if (has_second_ref(mbmi)) { + if (mbmi->compound_idx == 0 || + mbmi->interinter_comp.type == COMPOUND_AVERAGE) + mbmi->comp_group_idx = 0; + else + mbmi->comp_group_idx = 1; + mbmi->compound_idx = 1; + } + RD_COUNTS *const rdc = &td->rd_counts; + if (mbmi->skip_mode) { + assert(!frame_is_intra_only(cm)); + rdc->skip_mode_used_flag = 1; + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT && + has_second_ref(mbmi)) { + rdc->compound_ref_used_flag = 1; + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } else { + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. 
+ if (is_inter_block(mbmi)) { + av1_collect_neighbors_ref_counts(xd); + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT && + has_second_ref(mbmi)) { + // This flag is also updated for 4x4 blocks + rdc->compound_ref_used_flag = 1; + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } + } + } + if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY && + (mbmi->mode == NEWMV || mbmi->mode < INTRA_MODE_END)) { + int32_t blocks = mi_size_high[bsize] * mi_size_wide[bsize]; + rdc->newmv_or_intra_blocks += blocks; + } + if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); + } + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm && + !cpi->rc.rtc_external_ratectrl && cm->seg.enabled) + av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize, dry_run); + // TODO(Ravi/Remya): Move this copy function to a better logical place + // This function will copy the best mode information from block + // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This + // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during + // bitstream preparation. + av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + x->rdmult = origin_mult; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing((AV1_COMP *)cpi, encode_b_nonrd_time); +#endif +} + +static int get_force_zeromv_skip_flag_for_blk(const AV1_COMP *cpi, + const MACROBLOCK *x, + BLOCK_SIZE bsize) { + // Force zero MV skip based on SB level decision + if (x->force_zeromv_skip_for_sb < 2) return x->force_zeromv_skip_for_sb; + + // For blocks of size equal to superblock size, the decision would have been + // already done at superblock level. Hence zeromv-skip decision is skipped. + const AV1_COMMON *const cm = &cpi->common; + if (bsize == cm->seq_params->sb_size) return 0; + + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *const xd = &x->e_mbd; + const unsigned int thresh_exit_part_y = + cpi->zeromv_skip_thresh_exit_part[bsize]; + const unsigned int thresh_exit_part_uv = + CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y); + const unsigned int thresh_exit_part[MAX_MB_PLANE] = { thresh_exit_part_y, + thresh_exit_part_uv, + thresh_exit_part_uv }; + const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, LAST_FRAME); + + struct buf_2d yv12_mb[MAX_MB_PLANE]; + av1_setup_pred_block(xd, yv12_mb, yv12, sf, sf, num_planes); + + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride); + assert(plane < MAX_MB_PLANE); + if (plane_sad >= thresh_exit_part[plane]) return 0; + } + return 1; +} + +/*!\brief Top level function to pick block mode for non-RD optimized case + * + * \ingroup partition_search + * \callgraph + * \callergraph + * Searches prediction modes, transform, and coefficient coding modes for an + * individual coding block. This function is the top-level function that is + * used for non-RD optimized mode search (controlled by + * \c cpi->sf.rt_sf.use_nonrd_pick_mode). 
Depending on frame type it calls
+ * inter/skip/hybrid-intra mode search functions.
+ *
+ * \param[in]    cpi            Top-level encoder structure
+ * \param[in]    tile_data      Pointer to struct holding adaptive
+ *                              data/contexts/models for the tile during
+ *                              encoding
+ * \param[in]    x              Pointer to structure holding all the data for
+ *                              the current macroblock
+ * \param[in]    mi_row         Row coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    mi_col         Column coordinate of the block in a step size of
+ *                              MI_SIZE
+ * \param[in]    rd_cost        Pointer to structure holding rate and distortion
+ *                              stats for the current block
+ * \param[in]    bsize          Current block size
+ * \param[in]    ctx            Pointer to structure holding coding contexts and
+ *                              chosen modes for the current block
+ *
+ * \remark Nothing is returned. Instead, the chosen modes and contexts necessary
+ * for reconstruction are stored in ctx, and the rate-distortion stats are
+ * stored in rd_cost. If no valid mode leading to rd_cost <= best_rd is found,
+ * this is signalled by an INT64_MAX rd_cost->rdcost.
+ */
+static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
+                                MACROBLOCK *const x, int mi_row, int mi_col,
+                                RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                PICK_MODE_CONTEXT *ctx) {
+  // For nonrd mode, av1_set_offsets is already called at the superblock level
+  // in encode_nonrd_sb when we determine the partitioning.
+  if (bsize != cpi->common.seq_params->sb_size ||
+      cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
+    av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+  }
+  assert(x->last_set_offsets_loc.mi_row == mi_row &&
+         x->last_set_offsets_loc.mi_col == mi_col &&
+         x->last_set_offsets_loc.bsize == bsize);
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+  int i;
+
+  // This is only needed for real time/allintra row-mt enabled multi-threaded
+  // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+  wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+                        &tile_data->tile_info, cm->seq_params->sb_size,
+                        cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+  // Sets up the tx_type_map buffer in MACROBLOCKD.
+  xd->tx_type_map = txfm_info->tx_type_map_;
+  xd->tx_type_map_stride = mi_size_wide[bsize];
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    p[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+  x->force_zeromv_skip_for_blk =
+      get_force_zeromv_skip_flag_for_blk(cpi, x, bsize);
+
+  // Source variance may already have been computed at the superblock level, so
+  // there is no need to recompute it unless bsize < sb_size or source_variance
+  // is not yet set.
+  if (!x->force_zeromv_skip_for_blk &&
+      (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size))
+    x->source_variance = av1_get_perpixel_variance_facade(
+        cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+
+  // Save rdmult before it might be changed, so it can be restored later.
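+  // setup_block_rdmult() below adjusts x->rdmult for this block (the active
+  // AQ mode is passed in) and av1_set_error_per_bit() refreshes errorperbit
+  // to match; the saved value is restored at the end of this function.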
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+  // Set error per bit for current rdmult
+  av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+  // Find best coding mode & reconstruct the MB so it is available
+  // as a predictor for MBs that follow in the SB
+  if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+    hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+  } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      RD_STATS invalid_rd;
+      av1_invalid_rd_stats(&invalid_rd);
+      // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip
+      av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+                                         rd_cost, bsize, ctx,
+                                         invalid_rd.rdcost);
+    } else {
+      av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx);
+    }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+  }
+  if (cpi->sf.rt_sf.skip_cdef_sb) {
+    // cdef_strength is initialized to 1 which means skip_cdef, and is updated
+    // here. Check to see if skipping cdef is allowed.
+    const int allow_cdef_skipping =
+        cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
+        !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+          x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]);
+
+    // Find the corresponding 64x64 block. It'll be the 128x128 block if that's
+    // the block size.
+    const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64;
+    const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64;
+    MB_MODE_INFO **mi_sb =
+        cm->mi_params.mi_grid_base +
+        get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
+    // Do not skip if intra or new mv is picked, or color sensitivity is set.
+    // Never skip on slide/scene change.
+    if (cpi->sf.rt_sf.skip_cdef_sb >= 2) {
+      mi_sb[0]->cdef_strength =
+          mi_sb[0]->cdef_strength &&
+          (allow_cdef_skipping || x->source_variance == 0);
+    } else {
+      mi_sb[0]->cdef_strength =
+          mi_sb[0]->cdef_strength && allow_cdef_skipping &&
+          !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV);
+    }
+    // Store in the pickmode context.
+    ctx->mic.cdef_strength = mi_sb[0]->cdef_strength;
+  }
+  x->rdmult = orig_rdmult;
+  ctx->rd_stats.rate = rd_cost->rate;
+  ctx->rd_stats.dist = rd_cost->dist;
+  ctx->rd_stats.rdcost = rd_cost->rdcost;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+}
+
+static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td,
+                               TileDataEnc *const tile_data,
+                               TileInfo *const tile_info, TokenExtra **tp,
+                               MACROBLOCK *const x, MACROBLOCKD *const xd,
+                               const CommonModeInfoParams *const mi_params,
+                               const int mi_row, const int mi_col,
+                               const BLOCK_SIZE bsize, const int pl,
+                               PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const ModeCosts *mode_costs = &x->mode_costs;
+  const int hbs = mi_size_wide[bsize] / 2;
+  if (mi_row + mi_size_high[bsize] >= mi_params->mi_rows ||
+      mi_col + mi_size_wide[bsize] >= mi_params->mi_cols)
+    return 0;
+  if (bsize <= BLOCK_8X8 || frame_is_intra_only(cm)) return 0;
+  if (x->content_state_sb.source_sad_nonrd <= kLowSad) return 0;
+
+  // Do not try split partition when the source sad is small, or
+  // the prediction residual is small.
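+  // The residual check below sums the SAD between the source block and the
+  // co-located LAST_FRAME reference over all planes and divides by the
+  // number of luma pixels; split is only attempted when this average reaches
+  // the per-pixel threshold used below (currently 25). For example, a 64x64
+  // block has 4096 luma pixels, so the combined SAD must be at least
+  // 25 * 4096 = 102400.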
+ const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, LAST_FRAME); + const int num_planes = av1_num_planes(cm); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf, num_planes); + int block_sad = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride); + block_sad += plane_sad; + } + const int blk_pix = block_size_wide[bsize] * block_size_high[bsize]; + const int block_avg_sad = block_sad / blk_pix; + // TODO(chengchen): find a proper threshold. It might change according to + // q as well. + const int threshold = 25; + if (block_avg_sad < threshold) return 0; + + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS split_rdc, none_rdc; + av1_invalid_rd_stats(&split_rdc); + av1_invalid_rd_stats(&none_rdc); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + // Calculate rdcost for none partition + pc_tree->partitioning = PARTITION_NONE; + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + if (!pc_tree->none) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->none); + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, + pc_tree->none); + none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + + // Calculate rdcost for split partition + pc_tree->partitioning = PARTITION_SPLIT; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + av1_init_rd_stats(&split_rdc); + split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; + if (subsize >= BLOCK_8X8) { + split_rdc.rate += (mode_costs->partition_cost[pl][PARTITION_NONE] * 4); + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (!pc_tree->split[i]) { + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + } + pc_tree->split[i]->index = i; + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + RD_STATS block_rdc; + av1_invalid_rd_stats(&block_rdc); + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx; + xd->left_txfm_context = + xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK); + if (!pc_tree->split[i]->none) { + pc_tree->split[i]->none = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->split[i]->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + 
av1_reset_pmc(pc_tree->split[i]->none); + } + pc_tree->split[i]->partitioning = PARTITION_NONE; + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, + &block_rdc, subsize, pc_tree->split[i]->none); + split_rdc.rate += block_rdc.rate; + split_rdc.dist += block_rdc.dist; + av1_rd_cost_update(x->rdmult, &split_rdc); + if (none_rdc.rdcost < split_rdc.rdcost) break; + if (i != SUB_PARTITIONS_SPLIT - 1) + encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1, + subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL); + } + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist); + const int split = split_rdc.rdcost < none_rdc.rdcost; + + return split; +} + +// Returns if SPLIT partitions should be evaluated +static bool calc_do_split_flag(const AV1_COMP *cpi, const MACROBLOCK *x, + const PC_TREE *pc_tree, const RD_STATS *none_rdc, + const CommonModeInfoParams *mi_params, + int mi_row, int mi_col, int hbs, + BLOCK_SIZE bsize, PARTITION_TYPE partition) { + const AV1_COMMON *const cm = &cpi->common; + const int is_larger_qindex = cm->quant_params.base_qindex > 100; + const MACROBLOCKD *const xd = &x->e_mbd; + bool do_split = + (cpi->sf.rt_sf.nonrd_check_partition_merge_mode == 3) + ? (bsize <= BLOCK_32X32 || (is_larger_qindex && bsize <= BLOCK_64X64)) + : true; + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN || + cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 || + cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) || + !none_rdc->skip_txfm) + return do_split; + + const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize); + + // When model based skip is not used (i.e.,use_model_yrd_large = 0), skip_txfm + // would have been populated based on Hadamard transform and skip_txfm flag is + // more reliable. Hence SPLIT evaluation is disabled at all quantizers for 8x8 + // and 16x16 blocks. + // When model based skip is used (i.e.,use_model_yrd_large = 1), skip_txfm may + // not be reliable. Hence SPLIT evaluation is disabled only at lower + // quantizers for blocks >= 32x32. + if ((!use_model_yrd_large) || (!is_larger_qindex)) return false; + + // Use residual statistics to decide if SPLIT partition should be evaluated + // for 32x32 blocks. The pruning logic is avoided for larger block size to + // avoid the visual artifacts + if (pc_tree->none->mic.mode == NEWMV && bsize == BLOCK_32X32 && do_split) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + assert(subsize < BLOCK_SIZES_ALL); + double min_per_pixel_error = DBL_MAX; + double max_per_pixel_error = 0.; + int i; + for (i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + const int x_idx = (i & 1) * hbs; + const int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) { + break; + } + + // Populate the appropriate buffer pointers. + // Pass scale factors as NULL as the base pointer of the block would have + // been calculated appropriately. 
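+      // Each quadrant's source and its portion of the PARTITION_NONE
+      // prediction are windowed with setup_pred_plane() at (y_idx, x_idx),
+      // and a per-pixel residual error is derived from their variance. If
+      // all four quadrants lie inside the frame and the spread between the
+      // largest and smallest per-pixel error is at most 1.5, the quadrants
+      // are considered equally well predicted and the SPLIT evaluation is
+      // pruned.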
+ struct buf_2d src_split_buf_2d, pred_split_buf_2d; + const struct buf_2d *src_none_buf_2d = &x->plane[AOM_PLANE_Y].src; + setup_pred_plane(&src_split_buf_2d, subsize, src_none_buf_2d->buf, + src_none_buf_2d->width, src_none_buf_2d->height, + src_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0); + const struct buf_2d *pred_none_buf_2d = &xd->plane[AOM_PLANE_Y].dst; + setup_pred_plane(&pred_split_buf_2d, subsize, pred_none_buf_2d->buf, + pred_none_buf_2d->width, pred_none_buf_2d->height, + pred_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0); + + unsigned int curr_uint_mse; + const unsigned int curr_uint_var = cpi->ppi->fn_ptr[subsize].vf( + src_split_buf_2d.buf, src_split_buf_2d.stride, pred_split_buf_2d.buf, + pred_split_buf_2d.stride, &curr_uint_mse); + const double curr_per_pixel_error = + sqrt((double)curr_uint_var / block_size_wide[subsize] / + block_size_high[subsize]); + if (curr_per_pixel_error < min_per_pixel_error) + min_per_pixel_error = curr_per_pixel_error; + if (curr_per_pixel_error > max_per_pixel_error) + max_per_pixel_error = curr_per_pixel_error; + } + + // Prune based on residual statistics only if all the sub-partitions are + // valid. + if (i == SUB_PARTITIONS_SPLIT) { + if (max_per_pixel_error - min_per_pixel_error <= 1.5) do_split = false; + } + } + + return do_split; +} + +static void try_merge(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, MB_MODE_INFO **mib, + TokenExtra **tp, const int mi_row, const int mi_col, + const BLOCK_SIZE bsize, PC_TREE *const pc_tree, + const PARTITION_TYPE partition, const BLOCK_SIZE subsize, + const int pl) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + const int num_planes = av1_num_planes(cm); + // Only square blocks from 8x8 to 128x128 are supported + assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128); + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + bool do_split = false; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS split_rdc, none_rdc; + av1_invalid_rd_stats(&split_rdc); + av1_invalid_rd_stats(&none_rdc); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + pc_tree->partitioning = PARTITION_NONE; + if (!pc_tree->none) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->none); + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, + pc_tree->none); + none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 || + none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) { + do_split = calc_do_split_flag(cpi, x, pc_tree, &none_rdc, mi_params, mi_row, + mi_col, hbs, bsize, partition); + if (do_split) { + av1_init_rd_stats(&split_rdc); + split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + RD_STATS block_rdc; + 
av1_invalid_rd_stats(&block_rdc); + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx; + xd->left_txfm_context = + xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK); + if (!pc_tree->split[i]->none) { + pc_tree->split[i]->none = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->split[i]->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->split[i]->none); + } + pc_tree->split[i]->partitioning = PARTITION_NONE; + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, + &block_rdc, subsize, pc_tree->split[i]->none); + // TODO(yunqingwang): The rate here did not include the cost of + // signaling PARTITION_NONE token in the sub-blocks. + split_rdc.rate += block_rdc.rate; + split_rdc.dist += block_rdc.dist; + + av1_rd_cost_update(x->rdmult, &split_rdc); + + if (none_rdc.rdcost < split_rdc.rdcost) { + break; + } + + if (i != SUB_PARTITIONS_SPLIT - 1) + encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, + 1, subsize, PARTITION_NONE, pc_tree->split[i]->none, + NULL); + } + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist); + } + } + + if (none_rdc.rdcost < split_rdc.rdcost) { + /* Predicted samples can not be reused for PARTITION_NONE since same + * buffer is being used to store the reconstructed samples of + * PARTITION_SPLIT block. */ + if (do_split) x->reuse_inter_pred = false; + + mib[0]->bsize = bsize; + pc_tree->partitioning = PARTITION_NONE; + encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition, + pc_tree->none, NULL); + } else { + mib[0]->bsize = subsize; + pc_tree->partitioning = PARTITION_SPLIT; + /* Predicted samples can not be reused for PARTITION_SPLIT since same + * buffer is being used to write the reconstructed samples. */ + // TODO(Cherma): Store and reuse predicted samples generated by + // encode_b_nonrd() in DRY_RUN_NORMAL mode. + x->reuse_inter_pred = false; + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + // Note: We don't reset pc_tree->split[i]->none here because it + // could contain results from the additional check. Instead, it is + // reset before we enter the nonrd_check_partition_merge_mode + // condition. + if (!pc_tree->split[i]->none) { + pc_tree->split[i]->none = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->split[i]->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0, + subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL); + } + } +} + +// Evaluate if the sub-partitions can be merged directly into a large partition +// without calculating the RD cost. 
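+// For example, a 64x64 block whose four 32x32 sub-blocks were all coded with
+// skip_txfm, the same single inter reference (no compound prediction), the
+// same NEARESTMV or GLOBALMV mode with identical motion vectors, simple
+// translation, the same interpolation filters and the same segment id is a
+// merge candidate: if NEARESTMV at the 64x64 level also resolves to that
+// shared motion vector, the mi grid is rewritten so the whole area is coded
+// as a single PARTITION_NONE block.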
+static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
+                                     TileDataEnc *tile_data, MB_MODE_INFO **mib,
+                                     int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+  MB_MODE_INFO **b0 = mib;
+  MB_MODE_INFO **b1 = mib + hbs;
+  MB_MODE_INFO **b2 = mib + hbs * mi_params->mi_stride;
+  MB_MODE_INFO **b3 = mib + hbs * mi_params->mi_stride + hbs;
+
+  // Check if the following conditions are met. This can be updated
+  // later with more support added.
+  const int further_split = b0[0]->bsize < subsize || b1[0]->bsize < subsize ||
+                            b2[0]->bsize < subsize || b3[0]->bsize < subsize;
+  if (further_split) return;
+
+  const int no_skip = !b0[0]->skip_txfm || !b1[0]->skip_txfm ||
+                      !b2[0]->skip_txfm || !b3[0]->skip_txfm;
+  if (no_skip) return;
+
+  const int compound = (b0[0]->ref_frame[1] != b1[0]->ref_frame[1] ||
+                        b0[0]->ref_frame[1] != b2[0]->ref_frame[1] ||
+                        b0[0]->ref_frame[1] != b3[0]->ref_frame[1] ||
+                        b0[0]->ref_frame[1] > NONE_FRAME);
+  if (compound) return;
+
+  // Intra modes aren't considered here.
+  const int different_ref = (b0[0]->ref_frame[0] != b1[0]->ref_frame[0] ||
+                             b0[0]->ref_frame[0] != b2[0]->ref_frame[0] ||
+                             b0[0]->ref_frame[0] != b3[0]->ref_frame[0] ||
+                             b0[0]->ref_frame[0] <= INTRA_FRAME);
+  if (different_ref) return;
+
+  const int different_mode =
+      (b0[0]->mode != b1[0]->mode || b0[0]->mode != b2[0]->mode ||
+       b0[0]->mode != b3[0]->mode);
+  if (different_mode) return;
+
+  const int unsupported_mode =
+      (b0[0]->mode != NEARESTMV && b0[0]->mode != GLOBALMV);
+  if (unsupported_mode) return;
+
+  const int different_mv = (b0[0]->mv[0].as_int != b1[0]->mv[0].as_int ||
+                            b0[0]->mv[0].as_int != b2[0]->mv[0].as_int ||
+                            b0[0]->mv[0].as_int != b3[0]->mv[0].as_int);
+  if (different_mv) return;
+
+  const int unsupported_motion_mode =
+      (b0[0]->motion_mode != b1[0]->motion_mode ||
+       b0[0]->motion_mode != b2[0]->motion_mode ||
+       b0[0]->motion_mode != b3[0]->motion_mode ||
+       b0[0]->motion_mode != SIMPLE_TRANSLATION);
+  if (unsupported_motion_mode) return;
+
+  const int different_filter =
+      (b0[0]->interp_filters.as_int != b1[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b2[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b3[0]->interp_filters.as_int);
+  if (different_filter) return;
+
+  const int different_seg = (b0[0]->segment_id != b1[0]->segment_id ||
+                             b0[0]->segment_id != b2[0]->segment_id ||
+                             b0[0]->segment_id != b3[0]->segment_id);
+  if (different_seg) return;
+
+  // Evaluate the ref_mv.
+  MB_MODE_INFO **this_mi = mib;
+  BLOCK_SIZE orig_bsize = this_mi[0]->bsize;
+  const PARTITION_TYPE orig_partition = this_mi[0]->partition;
+
+  this_mi[0]->bsize = bsize;
+  this_mi[0]->partition = PARTITION_NONE;
+  this_mi[0]->skip_txfm = 1;
+
+  // TODO(yunqing): functions called below can be optimized by
+  // removing unrelated operations.
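+  // The code below re-runs find_predictors() at the merged block size to
+  // check that NEARESTMV for the shared reference still resolves to the
+  // sub-blocks' motion vector. If it does not, the original bsize/partition
+  // are restored and the predictors are recomputed for the small block;
+  // otherwise the inter predictor is rebuilt where needed (SVC, scaled
+  // reference or sub-pel motion) and the mode info, partition contexts and
+  // transform-size contexts are updated for the merged block.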
+ av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row, + mi_col, bsize); + + const MV_REFERENCE_FRAME ref_frame = this_mi[0]->ref_frame[0]; + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES]; + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; + int force_skip_low_temp_var = 0; + int skip_pred_mv = 0; + bool use_scaled_ref; + + for (int i = 0; i < MB_MODE_COUNT; ++i) { + for (int j = 0; j < REF_FRAMES; ++j) { + frame_mv[i][j].as_int = INVALID_MV; + } + } + av1_copy(x->color_sensitivity, x->color_sensitivity_sb); + skip_pred_mv = (x->nonrd_prune_ref_frame_search > 2 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 && + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2); + + find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, bsize, + force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref); + + int continue_merging = 1; + if (frame_mv[NEARESTMV][ref_frame].as_mv.row != b0[0]->mv[0].as_mv.row || + frame_mv[NEARESTMV][ref_frame].as_mv.col != b0[0]->mv[0].as_mv.col) + continue_merging = 0; + + if (!continue_merging) { + this_mi[0]->bsize = orig_bsize; + this_mi[0]->partition = orig_partition; + + // TODO(yunqing): Store the results and restore here instead of + // calling find_predictors() again. + av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row, + mi_col, this_mi[0]->bsize); + find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, this_mi[0]->bsize, + force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref); + } else { + struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame); + const int is_scaled = av1_is_scaled(sf); + const int is_y_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 8) || + (abs(this_mi[0]->mv[0].as_mv.col) % 8); + const int is_uv_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 16) || + (abs(this_mi[0]->mv[0].as_mv.col) % 16); + + if (cpi->ppi->use_svc || is_scaled || is_y_subpel_mv || is_uv_subpel_mv) { + const int num_planes = av1_num_planes(cm); + set_ref_ptrs(cm, xd, ref_frame, this_mi[0]->ref_frame[1]); + const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame); + av1_setup_pre_planes(xd, 0, cfg, mi_row, mi_col, + xd->block_ref_scale_factors[0], num_planes); + + if (!cpi->ppi->use_svc && !is_scaled && !is_y_subpel_mv) { + assert(is_uv_subpel_mv == 1); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 1, + num_planes - 1); + } else { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + num_planes - 1); + } + } + + // Copy out mbmi_ext information. + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = x->mbmi_ext_frame; + av1_copy_mbmi_ext_to_mbmi_ext_frame( + mbmi_ext_frame, mbmi_ext, av1_ref_frame_type(this_mi[0]->ref_frame)); + + const BLOCK_SIZE this_subsize = + get_partition_subsize(bsize, this_mi[0]->partition); + // Update partition contexts. + update_ext_partition_context(xd, mi_row, mi_col, this_subsize, bsize, + this_mi[0]->partition); + + const int num_planes = av1_num_planes(cm); + av1_reset_entropy_context(xd, bsize, num_planes); + + // Note: use x->txfm_search_params.tx_mode_search_type instead of + // cm->features.tx_mode here. + TX_SIZE tx_size = + tx_size_from_tx_mode(bsize, x->txfm_search_params.tx_mode_search_type); + if (xd->lossless[this_mi[0]->segment_id]) tx_size = TX_4X4; + this_mi[0]->tx_size = tx_size; + memset(this_mi[0]->inter_tx_size, this_mi[0]->tx_size, + sizeof(this_mi[0]->inter_tx_size)); + + // Update txfm contexts. 
+ xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + set_txfm_ctxs(this_mi[0]->tx_size, xd->width, xd->height, + this_mi[0]->skip_txfm && is_inter_block(this_mi[0]), xd); + + // Update mi for this partition block. + for (int y = 0; y < bs; y++) { + for (int x_idx = 0; x_idx < bs; x_idx++) { + this_mi[x_idx + y * mi_params->mi_stride] = this_mi[0]; + } + } + } +} + +/*!\brief AV1 block partition application (minimal RD search). +* +* \ingroup partition_search +* \callgraph +* \callergraph +* Encode the block by applying pre-calculated partition patterns that are +* represented by coding block sizes stored in the mbmi array. The only +* partition adjustment allowed is merging leaf split nodes if it leads to a +* lower rd cost. The partition types are limited to a basic set: none, horz, +* vert, and split. This function is only used in the real-time mode. +* +* \param[in] cpi Top-level encoder structure +* \param[in] td Pointer to thread data +* \param[in] tile_data Pointer to struct holding adaptive +data/contexts/models for the tile during encoding +* \param[in] mib Array representing MB_MODE_INFO pointers for mi +blocks starting from the first pixel of the current +block +* \param[in] tp Pointer to the starting token +* \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE +* \param[in] mi_col Column coordinate of the block in a step size of +MI_SIZE +* \param[in] bsize Current block size +* \param[in] pc_tree Pointer to the PC_TREE node holding the picked +partitions and mode info for the current block +* +* \remark Nothing is returned. The pc_tree struct is modified to store the +* picked partition and modes. +*/ +void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, MB_MODE_INFO **mib, + TokenExtra **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const ModeCosts *mode_costs = &x->mode_costs; + // Only square blocks from 8x8 to 128x128 are supported + assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128); + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + PARTITION_TYPE partition = (bsize >= BLOCK_8X8) + ? get_partition(cm, mi_row, mi_col, bsize) + : PARTITION_NONE; + BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + assert(subsize <= BLOCK_LARGEST); + const int pl = (bsize >= BLOCK_8X8) + ? 
partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + + RD_STATS dummy_cost; + av1_invalid_rd_stats(&dummy_cost); + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + x->reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd; + + int change_none_to_split = 0; + if (partition == PARTITION_NONE && + cpi->sf.rt_sf.nonrd_check_partition_split == 1) { + change_none_to_split = + try_split_partition(cpi, td, tile_data, tile_info, tp, x, xd, mi_params, + mi_row, mi_col, bsize, pl, pc_tree); + if (change_none_to_split) { + partition = PARTITION_SPLIT; + subsize = get_partition_subsize(bsize, partition); + assert(subsize <= BLOCK_LARGEST); + } + } + + pc_tree->partitioning = partition; + + switch (partition) { + case PARTITION_NONE: + if (!pc_tree->none) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->none); + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, bsize, + pc_tree->none); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, + partition, pc_tree->none, NULL); + break; + case PARTITION_VERT: + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (!pc_tree->vertical[i]) { + pc_tree->vertical[i] = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->vertical[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->vertical[i]); + } + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, + subsize, pc_tree->vertical[0]); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize, + PARTITION_VERT, pc_tree->vertical[0], NULL); + if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) { + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col + hbs, + &dummy_cost, subsize, pc_tree->vertical[1]); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize, + PARTITION_VERT, pc_tree->vertical[1], NULL); + } + break; + case PARTITION_HORZ: + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (!pc_tree->horizontal[i]) { + pc_tree->horizontal[i] = + av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!pc_tree->horizontal[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } else { + av1_reset_pmc(pc_tree->horizontal[i]); + } + } + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, + subsize, pc_tree->horizontal[0]); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize, + PARTITION_HORZ, pc_tree->horizontal[0], NULL); + + if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) { + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + hbs, mi_col, + &dummy_cost, subsize, pc_tree->horizontal[1]); + encode_b_nonrd(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize, + PARTITION_HORZ, pc_tree->horizontal[1], NULL); + } + break; + case PARTITION_SPLIT: + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (!pc_tree->split[i]) { + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if 
(!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + } + pc_tree->split[i]->index = i; + } + if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode && + av1_is_leaf_split_partition(cm, mi_row, mi_col, bsize) && + !frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + try_merge(cpi, td, tile_data, mib, tp, mi_row, mi_col, bsize, pc_tree, + partition, subsize, pl); + } else { + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + int jj = i >> 1, ii = i & 0x01; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + av1_nonrd_use_partition( + cpi, td, tile_data, + mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp, + mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]); + } + + if (!change_none_to_split) { + // Note: Palette, cfl are not supported. + if (!frame_is_intra_only(cm) && !tile_data->allow_update_cdf && + cpi->sf.rt_sf.partition_direct_merging && + mode_costs->partition_cost[pl][PARTITION_NONE] < + mode_costs->partition_cost[pl][PARTITION_SPLIT] && + (mi_row + bs <= mi_params->mi_rows) && + (mi_col + bs <= mi_params->mi_cols)) { + direct_partition_merging(cpi, td, tile_data, mib, mi_row, mi_col, + bsize); + } + } + } + break; + case PARTITION_VERT_A: + case PARTITION_VERT_B: + case PARTITION_HORZ_A: + case PARTITION_HORZ_B: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + assert(0 && "Cannot handle extended partition types"); + default: assert(0); break; + } +} + +#if !CONFIG_REALTIME_ONLY +// Try searching for an encoding for the given subblock. Returns zero if the +// rdcost is already too high (to tell the caller not to bother searching for +// encodings of further subblocks). +static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, int is_last, + int mi_row, int mi_col, BLOCK_SIZE subsize, + RD_STATS best_rdcost, RD_STATS *sum_rdc, + PARTITION_TYPE partition, + PICK_MODE_CONTEXT *this_ctx) { + MACROBLOCK *const x = &td->mb; + const int orig_mult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL); + + av1_rd_cost_update(x->rdmult, &best_rdcost); + + RD_STATS rdcost_remaining; + av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining); + RD_STATS this_rdc; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition, + subsize, this_ctx, rdcost_remaining); + + if (this_rdc.rate == INT_MAX) { + sum_rdc->rdcost = INT64_MAX; + } else { + sum_rdc->rate += this_rdc.rate; + sum_rdc->dist += this_rdc.dist; + av1_rd_cost_update(x->rdmult, sum_rdc); + } + + if (sum_rdc->rdcost >= best_rdcost.rdcost) { + x->rdmult = orig_mult; + return 0; + } + + if (!is_last) { + av1_update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL); + } + + x->rdmult = orig_mult; + return 1; +} + +// Tests an AB partition, and updates the encoder status, the pick mode +// contexts, the best rdcost, and the best partition. 
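+// An AB partition codes the block as SUB_PARTITIONS_AB (three) sub-blocks,
+// e.g. HORZ_A places two square sub-blocks above one full-width rectangle
+// and HORZ_B mirrors that; ab_subsize[] and ab_mi_pos[] carry each
+// sub-block's size and (row, col) mi position. When mode_cache is provided,
+// each sub-block's mode search is seeded through x->mb_mode_cache.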
+static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + PC_TREE *pc_tree, RD_STATS *best_rdc, + int64_t *this_rdcost, + PICK_MODE_CONTEXT *ctxs[SUB_PARTITIONS_AB], + int mi_row, int mi_col, BLOCK_SIZE bsize, + PARTITION_TYPE partition, + const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB], + const int ab_mi_pos[SUB_PARTITIONS_AB][2], + const MB_MODE_INFO **mode_cache) { + MACROBLOCK *const x = &td->mb; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + RD_STATS sum_rdc; + av1_init_rd_stats(&sum_rdc); + sum_rdc.rate = x->mode_costs.partition_cost[pl][partition]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + // Loop over sub-partitions in AB partition type. + for (int i = 0; i < SUB_PARTITIONS_AB; i++) { + if (mode_cache && mode_cache[i]) { + x->use_mb_mode_cache = 1; + x->mb_mode_cache = mode_cache[i]; + } + const int mode_search_success = + rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1, + ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i], + *best_rdc, &sum_rdc, partition, ctxs[i]); + x->use_mb_mode_cache = 0; + x->mb_mode_cache = NULL; + if (!mode_search_success) { + return false; + } + } + + av1_rd_cost_update(x->rdmult, &sum_rdc); + *this_rdcost = sum_rdc.rdcost; + if (sum_rdc.rdcost >= best_rdc->rdcost) return false; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + *this_rdcost = sum_rdc.rdcost; + if (sum_rdc.rdcost >= best_rdc->rdcost) return false; + + *best_rdc = sum_rdc; + pc_tree->partitioning = partition; + return true; +} + +#if CONFIG_COLLECT_PARTITION_STATS +static void init_partition_block_timing_stats( + PartitionTimingStats *part_timing_stats) { + av1_zero(*part_timing_stats); +} + +static INLINE void start_partition_block_timer( + PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type) { + assert(!part_timing_stats->timer_is_on); + part_timing_stats->partition_attempts[partition_type] += 1; + aom_usec_timer_start(&part_timing_stats->timer); + part_timing_stats->timer_is_on = 1; +} + +static INLINE void end_partition_block_timer( + PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type, + int64_t rdcost) { + if (part_timing_stats->timer_is_on) { + aom_usec_timer_mark(&part_timing_stats->timer); + const int64_t time = aom_usec_timer_elapsed(&part_timing_stats->timer); + part_timing_stats->partition_times[partition_type] += time; + part_timing_stats->partition_rdcost[partition_type] = rdcost; + part_timing_stats->timer_is_on = 0; + } +} +static INLINE void print_partition_timing_stats_with_rdcost( + const PartitionTimingStats *part_timing_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, FRAME_UPDATE_TYPE frame_update_type, int frame_number, + const RD_STATS *best_rdc, const char *filename) { + FILE *f = fopen(filename, "a"); + fprintf(f, "%d,%d,%d,%d,%d,%d,%" PRId64 ",%" PRId64 ",", bsize, frame_number, + frame_update_type, mi_row, mi_col, best_rdc->rate, best_rdc->dist, + best_rdc->rdcost); + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + if (part_timing_stats->partition_rdcost[idx] == INT64_MAX) 
{ + fprintf(f, "%d,", -1); + } else { + fprintf(f, "%" PRId64 ",", part_timing_stats->partition_rdcost[idx]); + } + } + fprintf(f, "\n"); + fclose(f); +} + +static INLINE void print_partition_timing_stats( + const PartitionTimingStats *part_timing_stats, int intra_only, + int show_frame, const BLOCK_SIZE bsize, const char *filename) { + FILE *f = fopen(filename, "a"); + fprintf(f, "%d,%d,%d,", bsize, show_frame, intra_only); + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]); + } + fprintf(f, "\n"); + fclose(f); +} + +static INLINE void accumulate_partition_timing_stats( + FramePartitionTimingStats *fr_part_timing_stats, + const PartitionTimingStats *part_timing_stats, BLOCK_SIZE bsize) { + const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize); + int *agg_attempts = fr_part_timing_stats->partition_attempts[bsize_idx]; + int *agg_decisions = fr_part_timing_stats->partition_decisions[bsize_idx]; + int64_t *agg_times = fr_part_timing_stats->partition_times[bsize_idx]; + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + agg_attempts[idx] += part_timing_stats->partition_attempts[idx]; + agg_decisions[idx] += part_timing_stats->partition_decisions[idx]; + agg_times[idx] += part_timing_stats->partition_times[idx]; + } +} +#endif // CONFIG_COLLECT_PARTITION_STATS + +// Initialize state variables of partition search used in +// av1_rd_pick_partition(). +static void init_partition_search_state_params( + MACROBLOCK *x, AV1_COMP *const cpi, PartitionSearchState *part_search_state, + int mi_row, int mi_col, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams *blk_params = &part_search_state->part_blk_params; + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + + // Initialization of block size related parameters. + blk_params->mi_step = mi_size_wide[bsize] / 2; + blk_params->mi_row = mi_row; + blk_params->mi_col = mi_col; + blk_params->mi_row_edge = mi_row + blk_params->mi_step; + blk_params->mi_col_edge = mi_col + blk_params->mi_step; + blk_params->width = block_size_wide[bsize]; + blk_params->min_partition_size_1d = + block_size_wide[x->sb_enc.min_partition_size]; + blk_params->subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + blk_params->split_bsize2 = blk_params->subsize; + blk_params->bsize_at_least_8x8 = (bsize >= BLOCK_8X8); + blk_params->bsize = bsize; + + // Check if the partition corresponds to edge block. + blk_params->has_rows = (blk_params->mi_row_edge < mi_params->mi_rows); + blk_params->has_cols = (blk_params->mi_col_edge < mi_params->mi_cols); + + // Update intra partitioning related info. + part_search_state->intra_part_info = &x->part_search_info; + // Prepare for segmentation CNN-based partitioning for intra-frame. + if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) { + part_search_state->intra_part_info->quad_tree_idx = 0; + part_search_state->intra_part_info->cnn_output_valid = 0; + } + + // Set partition plane context index. + part_search_state->pl_ctx_idx = + blk_params->bsize_at_least_8x8 + ? 
partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + + // Partition cost buffer update + ModeCosts *mode_costs = &x->mode_costs; + part_search_state->partition_cost = + mode_costs->partition_cost[part_search_state->pl_ctx_idx]; + + // Initialize HORZ and VERT win flags as true for all split partitions. + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + part_search_state->split_part_rect_win[i].rect_part_win[HORZ] = true; + part_search_state->split_part_rect_win[i].rect_part_win[VERT] = true; + } + + // Initialize the rd cost. + av1_init_rd_stats(&part_search_state->this_rdc); + + // Initialize RD costs for partition types to 0. + part_search_state->none_rd = 0; + av1_zero(part_search_state->split_rd); + av1_zero(part_search_state->rect_part_rd); + + // Initialize SPLIT partition to be not ready. + av1_zero(part_search_state->is_split_ctx_is_ready); + // Initialize HORZ and VERT partitions to be not ready. + av1_zero(part_search_state->is_rect_ctx_is_ready); + + // Chroma subsampling. + part_search_state->ss_x = x->e_mbd.plane[1].subsampling_x; + part_search_state->ss_y = x->e_mbd.plane[1].subsampling_y; + + // Initialize partition search flags to defaults. + part_search_state->terminate_partition_search = 0; + part_search_state->do_square_split = blk_params->bsize_at_least_8x8; + part_search_state->do_rectangular_split = + cpi->oxcf.part_cfg.enable_rect_partitions && + blk_params->bsize_at_least_8x8; + av1_zero(part_search_state->prune_rect_part); + + // Initialize allowed partition types for the partition block. + part_search_state->partition_none_allowed = + av1_blk_has_rows_and_cols(blk_params); + part_search_state->partition_rect_allowed[HORZ] = + part_search_state->do_rectangular_split && blk_params->has_cols && + get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), + part_search_state->ss_x, + part_search_state->ss_y) != BLOCK_INVALID; + part_search_state->partition_rect_allowed[VERT] = + part_search_state->do_rectangular_split && blk_params->has_rows && + get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), + part_search_state->ss_x, + part_search_state->ss_y) != BLOCK_INVALID; + + // Reset the flag indicating whether a partition leading to a rdcost lower + // than the bound best_rdc has been found. + part_search_state->found_best_partition = false; + +#if CONFIG_COLLECT_PARTITION_STATS + init_partition_block_timing_stats(&part_search_state->part_timing_stats); +#endif // CONFIG_COLLECT_PARTITION_STATS +} + +// Override partition cost buffer for the edge blocks. +static void set_partition_cost_for_edge_blk( + AV1_COMMON const *cm, PartitionSearchState *part_search_state) { + PartitionBlkParams blk_params = part_search_state->part_blk_params; + assert(blk_params.bsize_at_least_8x8 && part_search_state->pl_ctx_idx >= 0); + const aom_cdf_prob *partition_cdf = + cm->fc->partition_cdf[part_search_state->pl_ctx_idx]; + const int max_cost = av1_cost_symbol(0); + for (PARTITION_TYPE i = 0; i < PARTITION_TYPES; ++i) + part_search_state->tmp_partition_cost[i] = max_cost; + if (blk_params.has_cols) { + // At the bottom, the two possibilities are HORZ and SPLIT. + aom_cdf_prob bot_cdf[2]; + partition_gather_vert_alike(bot_cdf, partition_cdf, blk_params.bsize); + static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, bot_cdf, + bot_inv_map); + } else if (blk_params.has_rows) { + // At the right, the two possibilities are VERT and SPLIT. 
+ aom_cdf_prob rhs_cdf[2]; + partition_gather_horz_alike(rhs_cdf, partition_cdf, blk_params.bsize); + static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, rhs_cdf, + rhs_inv_map); + } else { + // At the bottom right, we always split. + part_search_state->tmp_partition_cost[PARTITION_SPLIT] = 0; + } + // Override the partition cost buffer. + part_search_state->partition_cost = part_search_state->tmp_partition_cost; +} + +// Reset the partition search state flags when +// must_find_valid_partition is equal to 1. +static AOM_INLINE void reset_part_limitations( + AV1_COMP *const cpi, PartitionSearchState *part_search_state) { + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const int is_rect_part_allowed = + blk_params.bsize_at_least_8x8 && + cpi->oxcf.part_cfg.enable_rect_partitions && + (blk_params.width > blk_params.min_partition_size_1d); + part_search_state->do_square_split = + blk_params.bsize_at_least_8x8 && + (blk_params.width > blk_params.min_partition_size_1d); + part_search_state->partition_none_allowed = + av1_blk_has_rows_and_cols(&blk_params) && + (blk_params.width >= blk_params.min_partition_size_1d); + part_search_state->partition_rect_allowed[HORZ] = + blk_params.has_cols && is_rect_part_allowed && + get_plane_block_size( + get_partition_subsize(blk_params.bsize, PARTITION_HORZ), + part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; + part_search_state->partition_rect_allowed[VERT] = + blk_params.has_rows && is_rect_part_allowed && + get_plane_block_size( + get_partition_subsize(blk_params.bsize, PARTITION_VERT), + part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID; + part_search_state->terminate_partition_search = 0; +} + +// Rectangular partitions evaluation at sub-block level. +static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data, + MACROBLOCK *x, + PICK_MODE_CONTEXT *cur_partition_ctx, + PartitionSearchState *part_search_state, + RD_STATS *best_rdc, const int idx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + PARTITION_TYPE partition_type) { + // Obtain the remainder from the best rd cost + // for further processing of partition. + RD_STATS best_remain_rdcost; + av1_rd_stats_subtraction(x->rdmult, best_rdc, &part_search_state->sum_rdc, + &best_remain_rdcost); + + // Obtain the best mode for the partition sub-block. + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &part_search_state->this_rdc, + partition_type, bsize, cur_partition_ctx, best_remain_rdcost); + av1_rd_cost_update(x->rdmult, &part_search_state->this_rdc); + + // Update the partition rd cost with the current sub-block rd. + if (part_search_state->this_rdc.rate == INT_MAX) { + part_search_state->sum_rdc.rdcost = INT64_MAX; + } else { + part_search_state->sum_rdc.rate += part_search_state->this_rdc.rate; + part_search_state->sum_rdc.dist += part_search_state->this_rdc.dist; + av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc); + } + const RECT_PART_TYPE rect_part = + partition_type == PARTITION_HORZ ? HORZ : VERT; + part_search_state->rect_part_rd[rect_part][idx] = + part_search_state->this_rdc.rdcost; +} + +typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step); + +// Checks if HORZ / VERT partition search is allowed. 
+static AOM_INLINE int is_rect_part_allowed(
+    const AV1_COMP *cpi, const PartitionSearchState *part_search_state,
+    const active_edge_info *active_edge, RECT_PART_TYPE rect_part,
+    const int mi_pos) {
+  const PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+  const int is_part_allowed =
+      (!part_search_state->terminate_partition_search &&
+       part_search_state->partition_rect_allowed[rect_part] &&
+       !part_search_state->prune_rect_part[rect_part] &&
+       (part_search_state->do_rectangular_split ||
+        active_edge[rect_part](cpi, mi_pos, blk_params->mi_step)));
+  return is_part_allowed;
+}
+
+static void rectangular_partition_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+    TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+    RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    RD_RECT_PART_WIN_INFO *rect_part_win_info, const RECT_PART_TYPE start_type,
+    const RECT_PART_TYPE end_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  RD_STATS *sum_rdc = &part_search_state->sum_rdc;
+  const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ,
+                                                    PARTITION_VERT };
+
+  // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][0]: mi_row position of
+  // HORZ and VERT partition types.
+  // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][1]: mi_col position of
+  // HORZ and VERT partition types.
+  const int mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][2] = {
+    { { blk_params.mi_row, blk_params.mi_col },
+      { blk_params.mi_row_edge, blk_params.mi_col } },
+    { { blk_params.mi_row, blk_params.mi_col },
+      { blk_params.mi_row, blk_params.mi_col_edge } }
+  };
+
+  // Initialize active edge_type function pointer
+  // for HORZ and VERT partition types.
+  active_edge_info active_edge_type[NUM_RECT_PARTS] = { av1_active_h_edge,
+                                                        av1_active_v_edge };
+
+  // Indicates edge blocks for HORZ and VERT partition types.
+  const int is_not_edge_block[NUM_RECT_PARTS] = { blk_params.has_rows,
+                                                  blk_params.has_cols };
+
+  // Initialize pc tree context for HORZ and VERT partition types.
+  PICK_MODE_CONTEXT **cur_ctx[NUM_RECT_PARTS][SUB_PARTITIONS_RECT] = {
+    { &pc_tree->horizontal[0], &pc_tree->horizontal[1] },
+    { &pc_tree->vertical[0], &pc_tree->vertical[1] }
+  };
+
+  // Loop over rectangular partition types.
+  for (RECT_PART_TYPE i = start_type; i <= end_type; i++) {
+    assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+                   !part_search_state->partition_rect_allowed[i]));
+
+    // Check if the HORZ / VERT partition search is to be performed.
+    if (!is_rect_part_allowed(cpi, part_search_state, active_edge_type, i,
+                              mi_pos_rect[i][0][i]))
+      continue;
+
+    // Sub-partition idx.
+ int sub_part_idx = 0; + PARTITION_TYPE partition_type = rect_partition_type[i]; + blk_params.subsize = + get_partition_subsize(blk_params.bsize, partition_type); + assert(blk_params.subsize <= BLOCK_LARGEST); + av1_init_rd_stats(sum_rdc); + for (int j = 0; j < SUB_PARTITIONS_RECT; j++) { + if (cur_ctx[i][j][0] == NULL) { + cur_ctx[i][j][0] = + av1_alloc_pmc(cpi, blk_params.subsize, &td->shared_coeff_buf); + if (!cur_ctx[i][j][0]) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } + } + sum_rdc->rate = part_search_state->partition_cost[partition_type]; + sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, 0); +#if CONFIG_COLLECT_PARTITION_STATS + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + if (best_rdc->rdcost - sum_rdc->rdcost >= 0) { + start_partition_block_timer(part_timing_stats, partition_type); + } +#endif + + // First sub-partition evaluation in HORZ / VERT partition type. + rd_pick_rect_partition( + cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state, + best_rdc, 0, mi_pos_rect[i][sub_part_idx][0], + mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type); + + // Start of second sub-partition evaluation. + // Evaluate second sub-partition if the first sub-partition cost + // is less than the best cost and if it is not an edge block. + if (sum_rdc->rdcost < best_rdc->rdcost && is_not_edge_block[i]) { + const MB_MODE_INFO *const mbmi = &cur_ctx[i][sub_part_idx][0]->mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted. + if (pmi->palette_size[PLANE_TYPE_Y] == 0 && + pmi->palette_size[PLANE_TYPE_UV] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) + part_search_state->is_rect_ctx_is_ready[i] = 1; + } + av1_update_state(cpi, td, cur_ctx[i][sub_part_idx][0], blk_params.mi_row, + blk_params.mi_col, blk_params.subsize, DRY_RUN_NORMAL); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, + blk_params.subsize, NULL); + + // Second sub-partition evaluation in HORZ / VERT partition type. + sub_part_idx = 1; + rd_pick_rect_partition( + cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state, + best_rdc, 1, mi_pos_rect[i][sub_part_idx][0], + mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type); + } + // Update HORZ / VERT best partition. + if (sum_rdc->rdcost < best_rdc->rdcost) { + sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, sum_rdc->dist); + if (sum_rdc->rdcost < best_rdc->rdcost) { + *best_rdc = *sum_rdc; + part_search_state->found_best_partition = true; + pc_tree->partitioning = partition_type; + } + } else { + // Update HORZ / VERT win flag. + if (rect_part_win_info != NULL) + rect_part_win_info->rect_part_win[i] = false; + } +#if CONFIG_COLLECT_PARTITION_STATS + if (part_timing_stats->timer_is_on) { + end_partition_block_timer(part_timing_stats, partition_type, + sum_rdc->rdcost); + } +#endif + av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col, + blk_params.bsize, av1_num_planes(cm)); + } +} + +// AB partition type evaluation. 
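+// An AB partition codes the block as three sub-blocks: two square quarters
+// plus one rectangular half. For example:
+//   PARTITION_HORZ_A        PARTITION_HORZ_B
+//   +----+----+             +---------+
+//   | sq | sq |             |  half   |
+//   +----+----+             +----+----+
+//   |  half   |             | sq | sq |
+//   +---------+             +----+----+
+// PARTITION_VERT_A / PARTITION_VERT_B are the transposed layouts.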
+static void rd_pick_ab_part( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PC_TREE *pc_tree, PICK_MODE_CONTEXT *dst_ctxs[SUB_PARTITIONS_AB], + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB], + const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type, + const MB_MODE_INFO **mode_cache) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + int64_t this_rdcost = 0; + +#if CONFIG_COLLECT_PARTITION_STATS + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = part_search_state->partition_cost[part_type]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc->rdcost - tmp_sum_rdc.rdcost >= 0) { + start_partition_block_timer(part_timing_stats, part_type); + } + } +#endif + + // Test this partition and update the best partition. + const bool find_best_ab_part = rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, best_rdc, &this_rdcost, dst_ctxs, mi_row, + mi_col, bsize, part_type, ab_subsize, ab_mi_pos, mode_cache); + part_search_state->found_best_partition |= find_best_ab_part; + +#if CONFIG_COLLECT_PARTITION_STATS + if (part_timing_stats->timer_is_on) { + if (!find_best_ab_part) this_rdcost = INT64_MAX; + end_partition_block_timer(part_timing_stats, part_type, this_rdcost); + } +#endif + av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); +} + +// Set mode search context. 
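+// The mode search context points at the PICK_MODE_CONTEXTs of sub-blocks
+// that have already been searched (the PARTITION_SPLIT squares and the first
+// PARTITION_HORZ / PARTITION_VERT rectangle), so that AB partition search can
+// reuse those results where the block geometry matches.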
+static AOM_INLINE void set_mode_search_ctx( + PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2], + PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]) { + mode_srch_ctx[HORZ_B][0] = &pc_tree->horizontal[0]; + mode_srch_ctx[VERT_B][0] = &pc_tree->vertical[0]; + + if (is_ctx_ready[HORZ_A][0]) + mode_srch_ctx[HORZ_A][0] = &pc_tree->split[0]->none; + + if (is_ctx_ready[VERT_A][0]) + mode_srch_ctx[VERT_A][0] = &pc_tree->split[0]->none; + + if (is_ctx_ready[HORZ_A][1]) + mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none; +} + +static AOM_INLINE void copy_partition_mode_from_mode_context( + const MB_MODE_INFO **dst_mode, const PICK_MODE_CONTEXT *ctx) { + if (ctx && ctx->rd_stats.rate < INT_MAX) { + *dst_mode = &ctx->mic; + } else { + *dst_mode = NULL; + } +} + +static AOM_INLINE void copy_partition_mode_from_pc_tree( + const MB_MODE_INFO **dst_mode, const PC_TREE *pc_tree) { + if (pc_tree) { + copy_partition_mode_from_mode_context(dst_mode, pc_tree->none); + } else { + *dst_mode = NULL; + } +} + +static AOM_INLINE void set_mode_cache_for_partition_ab( + const MB_MODE_INFO **mode_cache, const PC_TREE *pc_tree, + AB_PART_TYPE ab_part_type) { + switch (ab_part_type) { + case HORZ_A: + copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]); + copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]); + copy_partition_mode_from_mode_context(&mode_cache[2], + pc_tree->horizontal[1]); + break; + case HORZ_B: + copy_partition_mode_from_mode_context(&mode_cache[0], + pc_tree->horizontal[0]); + copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]); + copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]); + break; + case VERT_A: + copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]); + copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]); + copy_partition_mode_from_mode_context(&mode_cache[2], + pc_tree->vertical[1]); + break; + case VERT_B: + copy_partition_mode_from_mode_context(&mode_cache[0], + pc_tree->vertical[0]); + copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]); + copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]); + break; + default: assert(0 && "Invalid ab partition type!\n"); + } +} + +// AB Partitions type search. +static void ab_partitions_search( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PC_TREE *pc_tree, PartitionSearchState *part_search_state, + RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info, + int pb_source_variance, int ext_partition_allowed, + const AB_PART_TYPE start_type, const AB_PART_TYPE end_type) { + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + + if (part_search_state->terminate_partition_search) { + return; + } + + int ab_partitions_allowed[NUM_AB_PARTS]; + // Prune AB partitions + av1_prune_ab_partitions(cpi, x, pc_tree, pb_source_variance, best_rdc->rdcost, + rect_part_win_info, ext_partition_allowed, + part_search_state, ab_partitions_allowed); + + // Flags to indicate whether the mode search is done. 
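+  // Entries are indexed by { HORZ_A, HORZ_B, VERT_A, VERT_B }. The first one
+  // or two sub-blocks of an AB partition coincide with already-searched
+  // sub-blocks: the PARTITION_SPLIT squares for HORZ_A / VERT_A and the first
+  // PARTITION_HORZ / PARTITION_VERT rectangle for HORZ_B / VERT_B.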
+  const int is_ctx_ready[NUM_AB_PARTS][2] = {
+    { part_search_state->is_split_ctx_is_ready[0],
+      part_search_state->is_split_ctx_is_ready[1] },
+    { part_search_state->is_rect_ctx_is_ready[HORZ], 0 },
+    { part_search_state->is_split_ctx_is_ready[0], 0 },
+    { part_search_state->is_rect_ctx_is_ready[VERT], 0 }
+  };
+
+  // Current partition context.
+  PICK_MODE_CONTEXT **cur_part_ctxs[NUM_AB_PARTS] = { pc_tree->horizontala,
+                                                      pc_tree->horizontalb,
+                                                      pc_tree->verticala,
+                                                      pc_tree->verticalb };
+
+  // Context of already evaluated partition types.
+  PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2];
+  // Set context of already evaluated partition types.
+  set_mode_search_ctx(pc_tree, is_ctx_ready, mode_srch_ctx);
+
+  // Array of sub-partition size of AB partition types.
+  const BLOCK_SIZE ab_subsize[NUM_AB_PARTS][SUB_PARTITIONS_AB] = {
+    { blk_params.split_bsize2, blk_params.split_bsize2,
+      get_partition_subsize(bsize, PARTITION_HORZ_A) },
+    { get_partition_subsize(bsize, PARTITION_HORZ_B), blk_params.split_bsize2,
+      blk_params.split_bsize2 },
+    { blk_params.split_bsize2, blk_params.split_bsize2,
+      get_partition_subsize(bsize, PARTITION_VERT_A) },
+    { get_partition_subsize(bsize, PARTITION_VERT_B), blk_params.split_bsize2,
+      blk_params.split_bsize2 }
+  };
+
+  // Array of mi_row, mi_col positions corresponding to each sub-partition
+  // in AB partition types.
+  const int ab_mi_pos[NUM_AB_PARTS][SUB_PARTITIONS_AB][2] = {
+    { { mi_row, mi_col },
+      { mi_row, blk_params.mi_col_edge },
+      { blk_params.mi_row_edge, mi_col } },
+    { { mi_row, mi_col },
+      { blk_params.mi_row_edge, mi_col },
+      { blk_params.mi_row_edge, blk_params.mi_col_edge } },
+    { { mi_row, mi_col },
+      { blk_params.mi_row_edge, mi_col },
+      { mi_row, blk_params.mi_col_edge } },
+    { { mi_row, mi_col },
+      { mi_row, blk_params.mi_col_edge },
+      { blk_params.mi_row_edge, blk_params.mi_col_edge } }
+  };
+
+  // Loop over AB partition types.
+  for (AB_PART_TYPE ab_part_type = start_type; ab_part_type <= end_type;
+       ab_part_type++) {
+    const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A;
+
+    // Check if the AB partition search is to be performed.
+    if (!ab_partitions_allowed[ab_part_type]) {
+      continue;
+    }
+
+    blk_params.subsize = get_partition_subsize(bsize, part_type);
+    for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+      // Set AB partition context.
+      cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc(
+          cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+      if (!cur_part_ctxs[ab_part_type][i])
+        aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate PICK_MODE_CONTEXT");
+      // Set mode as not ready.
+      cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
+    }
+
+    if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) {
+      // We can copy directly the mode search results if we have already
+      // searched the current block and the contexts match.
+      if (is_ctx_ready[ab_part_type][0]) {
+        av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
+                              mode_srch_ctx[ab_part_type][0][0]);
+        cur_part_ctxs[ab_part_type][0]->mic.partition = part_type;
+        cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1;
+        if (is_ctx_ready[ab_part_type][1]) {
+          av1_copy_tree_context(cur_part_ctxs[ab_part_type][1],
+                                mode_srch_ctx[ab_part_type][1][0]);
+          cur_part_ctxs[ab_part_type][1]->mic.partition = part_type;
+          cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1;
+        }
+      }
+    }
+
+    // Even if the contexts don't match, we can still speed up by reusing the
+    // previous prediction mode.
+ const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL }; + if (cpi->sf.part_sf.reuse_best_prediction_for_part_ab) { + set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type); + } + + // Evaluation of AB partition type. + rd_pick_ab_part(cpi, td, tile_data, tp, x, x_ctx, pc_tree, + cur_part_ctxs[ab_part_type], part_search_state, best_rdc, + ab_subsize[ab_part_type], ab_mi_pos[ab_part_type], + part_type, mode_cache); + } +} + +// Set mi positions for HORZ4 / VERT4 sub-block partitions. +static void set_mi_pos_partition4(const int inc_step[NUM_PART4_TYPES], + int mi_pos[SUB_PARTITIONS_PART4][2], + const int mi_row, const int mi_col) { + for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; i++) { + mi_pos[i][0] = mi_row + i * inc_step[HORZ4]; + mi_pos[i][1] = mi_col + i * inc_step[VERT4]; + } +} + +// Set context and RD cost for HORZ4 / VERT4 partition types. +static void set_4_part_ctx_and_rdcost( + MACROBLOCK *x, const AV1_COMP *const cpi, ThreadData *td, + PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4], + PartitionSearchState *part_search_state, PARTITION_TYPE partition_type, + BLOCK_SIZE bsize) { + // Initialize sum_rdc RD cost structure. + av1_init_rd_stats(&part_search_state->sum_rdc); + const int subsize = get_partition_subsize(bsize, partition_type); + part_search_state->sum_rdc.rate = + part_search_state->partition_cost[partition_type]; + part_search_state->sum_rdc.rdcost = + RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0); + for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) { + cur_part_ctx[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf); + if (!cur_part_ctx[i]) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + } +} + +// Partition search of HORZ4 / VERT4 partition types. +static void rd_pick_4partition( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PC_TREE *pc_tree, PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4], + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + const int inc_step[NUM_PART4_TYPES], PARTITION_TYPE partition_type) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + // mi positions needed for HORZ4 and VERT4 partition types. + int mi_pos_check[NUM_PART4_TYPES] = { cm->mi_params.mi_rows, + cm->mi_params.mi_cols }; + const PART4_TYPES part4_idx = (partition_type != PARTITION_HORZ_4); + int mi_pos[SUB_PARTITIONS_PART4][2]; + + blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type); + // Set partition context and RD cost. + set_4_part_ctx_and_rdcost(x, cpi, td, cur_part_ctx, part_search_state, + partition_type, blk_params.bsize); + // Set mi positions for sub-block sizes. + set_mi_pos_partition4(inc_step, mi_pos, blk_params.mi_row, blk_params.mi_col); +#if CONFIG_COLLECT_PARTITION_STATS + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + if (best_rdc->rdcost - part_search_state->sum_rdc.rdcost >= 0) { + start_partition_block_timer(part_timing_stats, partition_type); + } +#endif + // Loop over sub-block partitions. + for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) { + if (i > 0 && mi_pos[i][part4_idx] >= mi_pos_check[part4_idx]) break; + + // Sub-block evaluation of Horz4 / Vert4 partition type. 
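+    // If rd_try_subblock() fails (i.e. the accumulated cost can no longer
+    // beat best_rdc), sum_rdc is invalidated and the remaining sub-blocks
+    // are skipped.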
+    cur_part_ctx[i]->rd_mode_is_ready = 0;
+    if (!rd_try_subblock(
+            cpi, td, tile_data, tp, (i == SUB_PARTITIONS_PART4 - 1),
+            mi_pos[i][0], mi_pos[i][1], blk_params.subsize, *best_rdc,
+            &part_search_state->sum_rdc, partition_type, cur_part_ctx[i])) {
+      av1_invalid_rd_stats(&part_search_state->sum_rdc);
+      break;
+    }
+  }
+
+  // Calculate the total cost and update the best partition.
+  av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+  if (part_search_state->sum_rdc.rdcost < best_rdc->rdcost) {
+    *best_rdc = part_search_state->sum_rdc;
+    part_search_state->found_best_partition = true;
+    pc_tree->partitioning = partition_type;
+  }
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (part_timing_stats->timer_is_on) {
+    end_partition_block_timer(part_timing_stats, partition_type,
+                              part_search_state->sum_rdc.rdcost);
+  }
+#endif
+  av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+                      blk_params.bsize, av1_num_planes(cm));
+}
+
+// Do not evaluate extended partitions if NONE partition is skippable.
+static INLINE int prune_ext_part_none_skippable(
+    PICK_MODE_CONTEXT *part_none, int must_find_valid_partition,
+    int skip_non_sq_part_based_on_none, BLOCK_SIZE bsize) {
+  if ((skip_non_sq_part_based_on_none >= 1) && (part_none != NULL)) {
+    if (part_none->skippable && !must_find_valid_partition &&
+        bsize >= BLOCK_16X16) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Allow ab partition search
+static int allow_ab_partition_search(PartitionSearchState *part_search_state,
+                                     PARTITION_SPEED_FEATURES *part_sf,
+                                     PARTITION_TYPE curr_best_part,
+                                     int must_find_valid_partition,
+                                     int prune_ext_part_state,
+                                     int64_t best_rdcost) {
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+
+  // Do not prune if there is no valid partition
+  if (best_rdcost == INT64_MAX) return 1;
+
+  // Determine bsize threshold to evaluate ab partitions
+  BLOCK_SIZE ab_bsize_thresh = part_sf->ext_partition_eval_thresh;
+  if (part_sf->ext_part_eval_based_on_cur_best && !must_find_valid_partition &&
+      !(curr_best_part == PARTITION_HORZ || curr_best_part == PARTITION_VERT))
+    ab_bsize_thresh = BLOCK_128X128;
+
+  // ab partitions are only allowed for square block sizes BLOCK_16X16 or
+  // higher, so ab_bsize_thresh must be large enough to exclude BLOCK_4X4 and
+  // BLOCK_8X8.
+  assert(ab_bsize_thresh >= BLOCK_8X8);
+
+  int ab_partition_allowed =
+      part_search_state->do_rectangular_split && bsize > ab_bsize_thresh &&
+      av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+  return ab_partition_allowed;
+}
+
+// Prune 4-way partitions based on the number of horz/vert wins
+// in the current block and sub-blocks in PARTITION_SPLIT.
+static void prune_4_partition_using_split_info(
+    AV1_COMP *const cpi, MACROBLOCK *x, PartitionSearchState *part_search_state,
+    int part4_search_allowed[NUM_PART4_TYPES]) {
+  PART4_TYPES cur_part[NUM_PART4_TYPES] = { HORZ4, VERT4 };
+  // Count of child blocks in which HORZ or VERT partition has won
+  int num_child_rect_win[NUM_RECT_PARTS] = { 0, 0 };
+  // Prune HORZ4/VERT4 partitions based on number of HORZ/VERT winners of
+  // split partitions.
+  // Conservative pruning for high quantizers.
+  const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
+
+  for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) {
+    if (!(cpi->sf.part_sf.prune_ext_part_using_split_info &&
+          part4_search_allowed[cur_part[i]]))
+      continue;
+    // Loop over split partitions.
+    // Get rectangular partitions winner info of split partitions.
+    for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; idx++)
+      num_child_rect_win[i] +=
+          (part_search_state->split_part_rect_win[idx].rect_part_win[i]) ? 1
+                                                                         : 0;
+    if (num_child_rect_win[i] < num_win_thresh) {
+      part4_search_allowed[cur_part[i]] = 0;
+    }
+  }
+}
+
+// Prune 4-way partition search.
+static void prune_4_way_partition_search(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
+    PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+    int pb_source_variance, int prune_ext_part_state,
+    int part4_search_allowed[NUM_PART4_TYPES]) {
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params.bsize;
+
+  // Do not prune if there is no valid partition
+  if (best_rdc->rdcost == INT64_MAX) return;
+
+  // Determine bsize threshold to evaluate 4-way partitions
+  BLOCK_SIZE part4_bsize_thresh = cpi->sf.part_sf.ext_partition_eval_thresh;
+  if (cpi->sf.part_sf.ext_part_eval_based_on_cur_best &&
+      !x->must_find_valid_partition && pc_tree->partitioning == PARTITION_NONE)
+    part4_bsize_thresh = BLOCK_128X128;
+
+  // 4-way partitions are only allowed for BLOCK_16X16, BLOCK_32X32, and
+  // BLOCK_64X64, so part4_bsize_thresh must be large enough to exclude
+  // BLOCK_4X4 and BLOCK_8X8.
+  assert(part4_bsize_thresh >= BLOCK_8X8);
+
+  bool partition4_allowed =
+      part_search_state->do_rectangular_split && bsize > part4_bsize_thresh &&
+      av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+  // Disable 4-way partition search flags for width less than a multiple of the
+  // minimum partition width.
+  if (blk_params.width < (blk_params.min_partition_size_1d
+                          << cpi->sf.part_sf.prune_part4_search)) {
+    part4_search_allowed[HORZ4] = 0;
+    part4_search_allowed[VERT4] = 0;
+    return;
+  }
+
+  PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
+                                               PARTITION_VERT_4 };
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  // Refine partition4_allowed: PARTITION_HORZ_4 / PARTITION_VERT_4 can be
+  // used only if 1:4 partitions are enabled, and 128x32 or 32x128 blocks are
+  // never allowed, so bsize must not be BLOCK_128X128.
+  partition4_allowed &=
+      part_cfg->enable_1to4_partitions && bsize != BLOCK_128X128;
+
+  for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) {
+    part4_search_allowed[i] =
+        partition4_allowed && part_search_state->partition_rect_allowed[i] &&
+        get_plane_block_size(get_partition_subsize(bsize, cur_part[i]),
+                             part_search_state->ss_x,
+                             part_search_state->ss_y) != BLOCK_INVALID;
+  }
+  // Pruning: pruning out 4-way partitions based on the current best partition.
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
+    part4_search_allowed[HORZ4] &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                    pc_tree->partitioning == PARTITION_HORZ_A ||
+                                    pc_tree->partitioning == PARTITION_HORZ_B ||
+                                    pc_tree->partitioning == PARTITION_SPLIT ||
+                                    pc_tree->partitioning == PARTITION_NONE);
+    part4_search_allowed[VERT4] &= (pc_tree->partitioning == PARTITION_VERT ||
+                                    pc_tree->partitioning == PARTITION_VERT_A ||
+                                    pc_tree->partitioning == PARTITION_VERT_B ||
+                                    pc_tree->partitioning == PARTITION_SPLIT ||
+                                    pc_tree->partitioning == PARTITION_NONE);
+  }
+
+  // Pruning: pruning out some 4-way partitions using a DNN taking rd costs of
+  // sub-blocks from basic partition types.
+ if (cpi->sf.part_sf.ml_prune_partition && partition4_allowed && + part_search_state->partition_rect_allowed[HORZ] && + part_search_state->partition_rect_allowed[VERT]) { + av1_ml_prune_4_partition(cpi, x, pc_tree->partitioning, best_rdc->rdcost, + part_search_state, part4_search_allowed, + pb_source_variance); + } + + // Pruning: pruning out 4-way partitions based on the number of horz/vert wins + // in the current block and sub-blocks in PARTITION_SPLIT. + prune_4_partition_using_split_info(cpi, x, part_search_state, + part4_search_allowed); +} + +// Set params needed for PARTITION_NONE search. +static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td, + MACROBLOCK *x, PC_TREE *pc_tree, + PartitionSearchState *part_search_state, + RD_STATS *best_remain_rdcost, + RD_STATS *best_rdc, int *pt_cost) { + PartitionBlkParams blk_params = part_search_state->part_blk_params; + RD_STATS partition_rdcost; + // Set PARTITION_NONE context. + if (pc_tree->none == NULL) + pc_tree->none = av1_alloc_pmc(cpi, blk_params.bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + + // Set PARTITION_NONE type cost. + if (part_search_state->partition_none_allowed) { + if (blk_params.bsize_at_least_8x8) { + *pt_cost = part_search_state->partition_cost[PARTITION_NONE] < INT_MAX + ? part_search_state->partition_cost[PARTITION_NONE] + : 0; + } + + // Initialize the RD stats structure. + av1_init_rd_stats(&partition_rdcost); + partition_rdcost.rate = *pt_cost; + av1_rd_cost_update(x->rdmult, &partition_rdcost); + av1_rd_stats_subtraction(x->rdmult, best_rdc, &partition_rdcost, + best_remain_rdcost); + } +} + +// Skip other partitions based on PARTITION_NONE rd cost. +static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_tree, + PICK_MODE_CONTEXT *ctx_none, + PartitionSearchState *part_search_state, + RD_STATS *best_rdc, + unsigned int *pb_source_variance) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + const PartitionBlkParams blk_params = part_search_state->part_blk_params; + RD_STATS *this_rdc = &part_search_state->this_rdc; + const BLOCK_SIZE bsize = blk_params.bsize; + assert(bsize < BLOCK_SIZES_ALL); + + if (!frame_is_intra_only(cm) && + (part_search_state->do_square_split || + part_search_state->do_rectangular_split) && + !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) { + const int use_ml_based_breakout = + bsize <= cpi->sf.part_sf.use_square_partition_only_threshold && + bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1; + if (use_ml_based_breakout) { + av1_ml_predict_breakout(cpi, x, this_rdc, *pb_source_variance, xd->bd, + part_search_state); + } + + // Adjust dist breakout threshold according to the partition size. + const int64_t dist_breakout_thr = + cpi->sf.part_sf.partition_search_breakout_dist_thr >> + ((2 * (MAX_SB_SIZE_LOG2 - 2)) - + (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize])); + const int rate_breakout_thr = + cpi->sf.part_sf.partition_search_breakout_rate_thr * + num_pels_log2_lookup[bsize]; + // If all y, u, v transform blocks in this partition are skippable, + // and the dist & rate are within the thresholds, the partition + // search is terminated for current branch of the partition search + // tree. The dist & rate thresholds are set to 0 at speed 0 to + // disable the early termination at that speed. 
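+    // For example, since MAX_SB_SIZE_LOG2 is 7 (128x128), the distortion
+    // threshold is shifted right by 2 for a 64x64 block and by 8 for an 8x8
+    // block, so smaller blocks need a proportionally smaller distortion to
+    // trigger the breakout.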
+ if (best_rdc->dist < dist_breakout_thr && + best_rdc->rate < rate_breakout_thr) { + part_search_state->do_square_split = 0; + part_search_state->do_rectangular_split = 0; + } + } + + // Early termination: using simple_motion_search features and the + // rate, distortion, and rdcost of PARTITION_NONE, a DNN will make a + // decision on early terminating at PARTITION_NONE. + if (cpi->sf.part_sf.simple_motion_search_early_term_none && cm->show_frame && + !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 && + av1_blk_has_rows_and_cols(&blk_params) && this_rdc->rdcost < INT64_MAX && + this_rdc->rdcost >= 0 && this_rdc->rate < INT_MAX && + this_rdc->rate >= 0 && + (part_search_state->do_square_split || + part_search_state->do_rectangular_split)) { + av1_simple_motion_search_early_term_none(cpi, x, sms_tree, this_rdc, + part_search_state); + } +} + +// Decide early termination and rectangular partition pruning +// based on PARTITION_NONE and PARTITION_SPLIT costs. +static void prune_partitions_after_split( + AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + int64_t part_none_rd, int64_t part_split_rd) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + assert(bsize < BLOCK_SIZES_ALL); + + // Early termination: using the rd costs of PARTITION_NONE and subblocks + // from PARTITION_SPLIT to determine an early breakout. + if (cpi->sf.part_sf.ml_early_term_after_part_split_level && + !frame_is_intra_only(cm) && + !part_search_state->terminate_partition_search && + part_search_state->do_rectangular_split && + (part_search_state->partition_rect_allowed[HORZ] || + part_search_state->partition_rect_allowed[VERT])) { + av1_ml_early_term_after_split( + cpi, x, sms_tree, best_rdc->rdcost, part_none_rd, part_split_rd, + part_search_state->split_rd, part_search_state); + } + + // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT + // to prune out rectangular partitions in some directions. + if (!cpi->sf.part_sf.ml_early_term_after_part_split_level && + cpi->sf.part_sf.ml_prune_partition && !frame_is_intra_only(cm) && + (part_search_state->partition_rect_allowed[HORZ] || + part_search_state->partition_rect_allowed[VERT]) && + !(part_search_state->prune_rect_part[HORZ] || + part_search_state->prune_rect_part[VERT]) && + !part_search_state->terminate_partition_search) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm), + bsize); + av1_ml_prune_rect_partition(cpi, x, best_rdc->rdcost, + part_search_state->none_rd, + part_search_state->split_rd, part_search_state); + } +} + +// Returns true if either of the left and top neighbor blocks is larger than +// the current block; false otherwise. 
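+// Block sizes are compared by pixel area (width * height) rather than by
+// individual dimensions.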
+static AOM_INLINE bool is_neighbor_blk_larger_than_cur_blk( + const MACROBLOCKD *xd, BLOCK_SIZE bsize) { + const int cur_blk_area = (block_size_high[bsize] * block_size_wide[bsize]); + if (xd->left_available) { + const BLOCK_SIZE left_bsize = xd->left_mbmi->bsize; + if (block_size_high[left_bsize] * block_size_wide[left_bsize] > + cur_blk_area) + return true; + } + + if (xd->up_available) { + const BLOCK_SIZE above_bsize = xd->above_mbmi->bsize; + if (block_size_high[above_bsize] * block_size_wide[above_bsize] > + cur_blk_area) + return true; + } + return false; +} + +static AOM_INLINE void prune_rect_part_using_none_pred_mode( + const MACROBLOCKD *xd, PartitionSearchState *part_state, + PREDICTION_MODE mode, BLOCK_SIZE bsize) { + if (mode == DC_PRED || mode == SMOOTH_PRED) { + // If the prediction mode of NONE partition is either DC_PRED or + // SMOOTH_PRED, it indicates that the current block has less variation. In + // this case, HORZ and VERT partitions are pruned if at least one of left + // and top neighbor blocks is larger than the current block. + if (is_neighbor_blk_larger_than_cur_blk(xd, bsize)) { + part_state->prune_rect_part[HORZ] = 1; + part_state->prune_rect_part[VERT] = 1; + } + } else if (mode == D67_PRED || mode == V_PRED || mode == D113_PRED) { + // If the prediction mode chosen by NONE partition is close to 90 degrees, + // it implies a dominant vertical pattern, and the chance of choosing a + // vertical rectangular partition is high. Hence, horizontal partition is + // pruned in these cases. + part_state->prune_rect_part[HORZ] = 1; + } else if (mode == D157_PRED || mode == H_PRED || mode == D203_PRED) { + // If the prediction mode chosen by NONE partition is close to 180 degrees, + // it implies a dominant horizontal pattern, and the chance of choosing a + // horizontal rectangular partition is high. Hence, vertical partition is + // pruned in these cases. + part_state->prune_rect_part[VERT] = 1; + } +} + +// PARTITION_NONE search. +static void none_partition_search( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x, + PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree, + RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + unsigned int *pb_source_variance, int64_t *none_rd, int64_t *part_none_rd) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + RD_STATS *this_rdc = &part_search_state->this_rdc; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + assert(bsize < BLOCK_SIZES_ALL); + + if (part_search_state->terminate_partition_search || + !part_search_state->partition_none_allowed) + return; + + int pt_cost = 0; + RD_STATS best_remain_rdcost; + av1_invalid_rd_stats(&best_remain_rdcost); + + // Set PARTITION_NONE context and cost. + set_none_partition_params(cpi, td, x, pc_tree, part_search_state, + &best_remain_rdcost, best_rdc, &pt_cost); + +#if CONFIG_COLLECT_PARTITION_STATS + // Timer start for partition None. + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + if (best_remain_rdcost.rdcost >= 0) { + start_partition_block_timer(part_timing_stats, PARTITION_NONE); + } +#endif + // PARTITION_NONE evaluation and cost update. 
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE, + bsize, pc_tree->none, best_remain_rdcost); + + av1_rd_cost_update(x->rdmult, this_rdc); + +#if CONFIG_COLLECT_PARTITION_STATS + // Timer end for partition None. + if (part_timing_stats->timer_is_on) { + RD_STATS tmp_rdc; + av1_init_rd_stats(&tmp_rdc); + if (this_rdc->rate != INT_MAX) { + tmp_rdc.rate = this_rdc->rate; + tmp_rdc.dist = this_rdc->dist; + tmp_rdc.rdcost = this_rdc->rdcost; + if (blk_params.bsize_at_least_8x8) { + tmp_rdc.rate += pt_cost; + tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist); + } + } + end_partition_block_timer(part_timing_stats, PARTITION_NONE, + tmp_rdc.rdcost); + } +#endif + *pb_source_variance = x->source_variance; + if (none_rd) *none_rd = this_rdc->rdcost; + part_search_state->none_rd = this_rdc->rdcost; + if (this_rdc->rate != INT_MAX) { + // Record picked ref frame to prune ref frames for other partition types. + if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) { + const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame); + av1_update_picked_ref_frames_mask( + x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col); + } + + // Calculate the total cost and update the best partition. + if (blk_params.bsize_at_least_8x8) { + this_rdc->rate += pt_cost; + this_rdc->rdcost = RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist); + } + *part_none_rd = this_rdc->rdcost; + if (this_rdc->rdcost < best_rdc->rdcost) { + *best_rdc = *this_rdc; + part_search_state->found_best_partition = true; + if (blk_params.bsize_at_least_8x8) { + pc_tree->partitioning = PARTITION_NONE; + } + + // Disable split and rectangular partition search + // based on PARTITION_NONE cost. + prune_partitions_after_none(cpi, x, sms_tree, pc_tree->none, + part_search_state, best_rdc, + pb_source_variance); + } + + if (cpi->sf.part_sf.prune_rect_part_using_none_pred_mode) + prune_rect_part_using_none_pred_mode(&x->e_mbd, part_search_state, + pc_tree->none->mic.mode, bsize); + } + av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); +} + +// PARTITION_SPLIT search. +static void split_partition_search( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree, + SIMPLE_MOTION_DATA_TREE *sms_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, + PartitionSearchState *part_search_state, RD_STATS *best_rdc, + SB_MULTI_PASS_MODE multi_pass_mode, int64_t *part_split_rd) { + const AV1_COMMON *const cm = &cpi->common; + PartitionBlkParams blk_params = part_search_state->part_blk_params; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + assert(bsize < BLOCK_SIZES_ALL); + RD_STATS sum_rdc = part_search_state->sum_rdc; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + + // Check if partition split is allowed. + if (part_search_state->terminate_partition_search || + !part_search_state->do_square_split) + return; + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (pc_tree->split[i] == NULL) + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + pc_tree->split[i]->index = i; + } + + // Initialization of this partition RD stats. 
+ av1_init_rd_stats(&sum_rdc); + sum_rdc.rate = part_search_state->partition_cost[PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + + int idx; +#if CONFIG_COLLECT_PARTITION_STATS + PartitionTimingStats *part_timing_stats = + &part_search_state->part_timing_stats; + if (best_rdc->rdcost - sum_rdc.rdcost >= 0) { + start_partition_block_timer(part_timing_stats, PARTITION_SPLIT); + } +#endif + // Recursive partition search on 4 sub-blocks. + for (idx = 0; idx < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc->rdcost; + ++idx) { + const int x_idx = (idx & 1) * blk_params.mi_step; + const int y_idx = (idx >> 1) * blk_params.mi_step; + + if (mi_row + y_idx >= mi_params->mi_rows || + mi_col + x_idx >= mi_params->mi_cols) + continue; + + pc_tree->split[idx]->index = idx; + int64_t *p_split_rd = &part_search_state->split_rd[idx]; + RD_STATS best_remain_rdcost; + av1_rd_stats_subtraction(x->rdmult, best_rdc, &sum_rdc, + &best_remain_rdcost); + + int curr_quad_tree_idx = 0; + if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + curr_quad_tree_idx = part_search_state->intra_part_info->quad_tree_idx; + part_search_state->intra_part_info->quad_tree_idx = + 4 * curr_quad_tree_idx + idx + 1; + } + // Split partition evaluation of corresponding idx. + // If the RD cost exceeds the best cost then do not + // evaluate other split sub-partitions. + SIMPLE_MOTION_DATA_TREE *const sms_tree_split = + (sms_tree == NULL) ? NULL : sms_tree->split[idx]; + if (!av1_rd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &part_search_state->this_rdc, best_remain_rdcost, + pc_tree->split[idx], sms_tree_split, p_split_rd, multi_pass_mode, + &part_search_state->split_part_rect_win[idx])) { + av1_invalid_rd_stats(&sum_rdc); + break; + } + if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + part_search_state->intra_part_info->quad_tree_idx = curr_quad_tree_idx; + } + + sum_rdc.rate += part_search_state->this_rdc.rate; + sum_rdc.dist += part_search_state->this_rdc.dist; + av1_rd_cost_update(x->rdmult, &sum_rdc); + + // Set split ctx as ready for use. + if (idx <= 1 && (bsize <= BLOCK_8X8 || + pc_tree->split[idx]->partitioning == PARTITION_NONE)) { + const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none->mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted. + if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) + part_search_state->is_split_ctx_is_ready[idx] = 1; + } + } + } +#if CONFIG_COLLECT_PARTITION_STATS + if (part_timing_stats->timer_is_on) { + end_partition_block_timer(part_timing_stats, PARTITION_SPLIT, + sum_rdc.rdcost); + } +#endif + const int reached_last_index = (idx == SUB_PARTITIONS_SPLIT); + + // Calculate the total cost and update the best partition. + *part_split_rd = sum_rdc.rdcost; + if (reached_last_index && sum_rdc.rdcost < best_rdc->rdcost) { + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + if (sum_rdc.rdcost < best_rdc->rdcost) { + *best_rdc = sum_rdc; + part_search_state->found_best_partition = true; + pc_tree->partitioning = PARTITION_SPLIT; + } + } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) { + // Skip rectangular partition test when partition type none gives better + // rd than partition type split. 
+    if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+      const int partition_none_valid = part_search_state->none_rd > 0;
+      const int partition_none_better =
+          part_search_state->none_rd < sum_rdc.rdcost;
+      part_search_state->do_rectangular_split &=
+          !(partition_none_valid && partition_none_better);
+    }
+  }
+  // Restore the context for the following cases:
+  // 1) Current block size not more than maximum partition size as dry run
+  //    encode happens for these cases
+  // 2) Current block size same as superblock size as the final encode
+  //    happens for this case
+  if (bsize <= x->sb_enc.max_partition_size || bsize == cm->seq_params->sb_size)
+    av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// The max number of nodes in the partition tree.
+// The number of leaf nodes is (128x128) / (4x4) = 1024.
+// The number of all possible parent nodes in the 4-way split tree is
+// 1 + 4 + 16 + 64 + 256 = 341, so 2048 is a safe upper bound.
+#define NUM_NODES 2048
+
+static void write_partition_tree(AV1_COMP *const cpi,
+                                 const PC_TREE *const pc_tree,
+                                 const BLOCK_SIZE bsize, const int mi_row,
+                                 const int mi_col) {
+  (void)mi_row;
+  (void)mi_col;
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+           cpi->sb_counter, 0);
+  FILE *pfile = fopen(filename, "w");
+  fprintf(pfile, "%d", bsize);
+
+  // Write partition type with BFS order.
+  const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+  int q_idx = 0;
+  int last_idx = 1;
+  int num_nodes = 1;
+
+  // First traversal to get number of leaf nodes.
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const PC_TREE *node = tree_node_queue[q_idx];
+    if (node->partitioning == PARTITION_SPLIT) {
+      for (int i = 0; i < 4; ++i) {
+        tree_node_queue[last_idx] = node->split[i];
+        ++last_idx;
+      }
+      num_nodes += 4;
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+  const int num_leafs = last_idx;
+  fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+  // Write partitions for each node.
+  q_idx = 0;
+  last_idx = 1;
+  num_nodes = 1;
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const PC_TREE *node = tree_node_queue[q_idx];
+    fprintf(pfile, ",%d", node->partitioning);
+    if (node->partitioning == PARTITION_SPLIT) {
+      for (int i = 0; i < 4; ++i) {
+        tree_node_queue[last_idx] = node->split[i];
+        ++last_idx;
+      }
+      num_nodes += 4;
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+  fprintf(pfile, "\n");
+
+  fclose(pfile);
+}
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+static void verify_write_partition_tree(const AV1_COMP *const cpi,
+                                        const PC_TREE *const pc_tree,
+                                        const BLOCK_SIZE bsize,
+                                        const int config_id, const int mi_row,
+                                        const int mi_col) {
+  (void)mi_row;
+  (void)mi_col;
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d",
+           path, cpi->sb_counter, config_id);
+  FILE *pfile = fopen(filename, "w");
+  fprintf(pfile, "%d", bsize);
+
+  // Write partition type with BFS order.
+  const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+  int q_idx = 0;
+  int last_idx = 1;
+  int num_nodes = 1;
+
+  // First traversal to get number of leaf nodes.
+ tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL && node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + num_nodes += 4; + } + --num_nodes; + ++q_idx; + } + const int num_leafs = last_idx; + fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1); + + // Write partitions for each node. + q_idx = 0; + last_idx = 1; + num_nodes = 1; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) { // suppress warning + fprintf(pfile, ",%d", node->partitioning); + if (node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + num_nodes += 4; + } + } + --num_nodes; + ++q_idx; + } + fprintf(pfile, "\n"); + + fclose(pfile); +} + +static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree, + struct aom_internal_error_info *error_info, + const int config_id) { + const AV1_COMMON *const cm = &cpi->common; + const char *path = cpi->oxcf.partition_info_path; + char filename[256]; + snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path, + cpi->sb_counter, config_id); + FILE *pfile = fopen(filename, "r"); + if (pfile == NULL) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Can't find input file: %s.", + filename); + } + + int read_bsize; + int num_nodes; + int num_configs; + fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs); + assert(read_bsize == cpi->common.seq_params->sb_size); + BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize; + assert(bsize == pc_tree->block_size); + + PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int last_idx = 1; + int q_idx = 0; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + int partitioning; + fscanf(pfile, ",%d", &partitioning); + assert(partitioning >= PARTITION_NONE && + partitioning < EXT_PARTITION_TYPES); + PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) { + node->partitioning = partitioning; + bsize = node->block_size; + } + if (partitioning == PARTITION_SPLIT) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int i = 0; i < 4; ++i) { + if (node != NULL) { // Suppress warning + node->split[i] = av1_alloc_pc_tree_node(subsize); + if (!node->split[i]) + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + node->split[i]->index = i; + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + } + } + --num_nodes; + ++q_idx; + } + fclose(pfile); + + return num_configs; +} + +static RD_STATS rd_search_for_fixed_partition( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, + const BLOCK_SIZE bsize, PC_TREE *pc_tree) { + const PARTITION_TYPE partition = pc_tree->partitioning; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + TileInfo *const tile_info = &tile_data->tile_info; + RD_STATS best_rdc; + av1_invalid_rd_stats(&best_rdc); + int sum_subblock_rate = 0; + int64_t sum_subblock_dist = 0; + PartitionSearchState part_search_state; + init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col, + bsize); + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see 
decodeframe.c). + PartitionBlkParams blk_params = part_search_state.part_blk_params; + if (!av1_blk_has_rows_and_cols(&blk_params)) + set_partition_cost_for_edge_blk(cm, &part_search_state); + + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + // Save rdmult before it might be changed, so it can be restored later. + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + (void)orig_rdmult; + + // Set the context. + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + assert(bsize < BLOCK_SIZES_ALL); + unsigned int pb_source_variance = UINT_MAX; + int64_t part_none_rd = INT64_MAX; + int64_t none_rd = INT64_MAX; + int inc_step[NUM_PART4_TYPES] = { 0 }; + if (partition == PARTITION_HORZ_4) inc_step[HORZ4] = mi_size_high[bsize] / 4; + if (partition == PARTITION_VERT_4) inc_step[VERT4] = mi_size_wide[bsize] / 4; + + switch (partition) { + case PARTITION_NONE: + none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx, + &part_search_state, &best_rdc, &pb_source_variance, + &none_rd, &part_none_rd); + break; + case PARTITION_HORZ: + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, NULL, HORZ, + HORZ); + break; + case PARTITION_VERT: + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, NULL, VERT, + VERT); + break; + case PARTITION_HORZ_A: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, HORZ_A, HORZ_A); + break; + case PARTITION_HORZ_B: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, HORZ_B, HORZ_B); + break; + case PARTITION_VERT_A: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, VERT_A, VERT_A); + break; + case PARTITION_VERT_B: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, VERT_B, VERT_B); + break; + case PARTITION_HORZ_4: + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->horizontal4, &part_search_state, &best_rdc, + inc_step, PARTITION_HORZ_4); + break; + case PARTITION_VERT_4: + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->vertical4, &part_search_state, &best_rdc, + inc_step, PARTITION_VERT_4); + break; + case PARTITION_SPLIT: + for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; ++idx) { + const BLOCK_SIZE subsize = + get_partition_subsize(bsize, PARTITION_SPLIT); + assert(subsize < BLOCK_SIZES_ALL); + const int next_mi_row = + idx < 2 ? mi_row : mi_row + mi_size_high[subsize]; + const int next_mi_col = + idx % 2 == 0 ? 
mi_col : mi_col + mi_size_wide[subsize]; + if (next_mi_row >= cm->mi_params.mi_rows || + next_mi_col >= cm->mi_params.mi_cols) { + continue; + } + const RD_STATS subblock_rdc = rd_search_for_fixed_partition( + cpi, td, tile_data, tp, sms_tree->split[idx], next_mi_row, + next_mi_col, subsize, pc_tree->split[idx]); + sum_subblock_rate += subblock_rdc.rate; + sum_subblock_dist += subblock_rdc.dist; + } + best_rdc.rate = sum_subblock_rate; + best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT]; + best_rdc.dist = sum_subblock_dist; + best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist); + break; + default: + assert(0 && "invalid partition type."); + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid partition type."); + } + // Note: it is necessary to restore context information. + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + if (bsize != cm->seq_params->sb_size) { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + x->rdmult = orig_rdmult; + + return best_rdc; +} + +static void prepare_sb_features_before_search( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, + int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) { + av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col, + bsize, features); + collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, features); +} + +static void update_partition_stats(const RD_STATS *const this_rdcost, + aom_partition_stats_t *stats) { + stats->rate = this_rdcost->rate; + stats->dist = this_rdcost->dist; + stats->rdcost = this_rdcost->rdcost; +} + +static void build_pc_tree_from_part_decision( + const aom_partition_decision_t *partition_decision, + const BLOCK_SIZE this_bsize, PC_TREE *pc_tree, + struct aom_internal_error_info *error_info) { + BLOCK_SIZE bsize = this_bsize; + int num_nodes = partition_decision->num_nodes; + PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int last_idx = 1; + int q_idx = 0; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const int partitioning = partition_decision->partition_decision[q_idx]; + assert(partitioning >= PARTITION_NONE && + partitioning < EXT_PARTITION_TYPES); + PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) { + node->partitioning = partitioning; + bsize = node->block_size; + } + if (partitioning == PARTITION_SPLIT) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int i = 0; i < 4; ++i) { + if (node != NULL) { // Suppress warning + node->split[i] = av1_alloc_pc_tree_node(subsize); + if (!node->split[i]) + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + node->split[i]->index = i; + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + } + } + --num_nodes; + ++q_idx; + } +} + +// The ML model needs to provide the whole decision tree for the superblock. 
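+// The flow is: send the superblock features to the external model, receive a
+// complete partition tree, evaluate it with rd_search_for_fixed_partition()
+// as a dry run and report the resulting RD stats back, repeating until the
+// model signals is_final_decision; the final tree is then encoded for real.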
+static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           TokenExtra **tp,
+                                           SIMPLE_MOTION_DATA_TREE *sms_root,
+                                           int mi_row, int mi_col,
+                                           const BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  struct aom_internal_error_info *error_info = x->e_mbd.error_info;
+  aom_partition_features_t features;
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &features);
+  features.mi_row = mi_row;
+  features.mi_col = mi_col;
+  features.frame_width = cpi->frame_info.frame_width;
+  features.frame_height = cpi->frame_info.frame_height;
+  features.block_size = bsize;
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // rd mode search (dry run) for a valid partition decision from the ml model.
+  aom_partition_decision_t partition_decision;
+  do {
+    const bool valid_decision = av1_ext_part_get_partition_decision(
+        ext_part_controller, &partition_decision);
+    if (!valid_decision) return false;
+
+    // First, let's take the easy approach.
+    // We require that the ml model has to provide partition decisions for the
+    // whole superblock.
+    td->pc_root = av1_alloc_pc_tree_node(bsize);
+    if (!td->pc_root)
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate PC_TREE");
+    build_pc_tree_from_part_decision(&partition_decision, bsize, td->pc_root,
+                                     error_info);
+
+    const RD_STATS this_rdcost = rd_search_for_fixed_partition(
+        cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
+    aom_partition_stats_t stats;
+    update_partition_stats(&this_rdcost, &stats);
+    av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+    if (!partition_decision.is_final_decision) {
+      av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                                 cpi->sf.part_sf.partition_search_type);
+      td->pc_root = NULL;
+    }
+  } while (!partition_decision.is_final_decision);
+
+  // Encode with the selected mode and partition.
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            td->pc_root, NULL);
+  av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  td->pc_root = NULL;
+
+  return true;
+}
+
+// Use a bitmask to represent the valid partition types for the current
+// block. A "1" bit means the corresponding partition type is valid.
+// The least significant bit represents "PARTITION_NONE", the most
+// significant bit represents "PARTITION_VERT_4", following the enum order
+// for PARTITION_TYPE in "enums.h"
+static int get_valid_partition_types(
+    const AV1_COMP *const cpi,
+    const PartitionSearchState *const part_search_state,
+    const BLOCK_SIZE bsize) {
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  int valid_types = 0;
+  // PARTITION_NONE
+  valid_types |= (part_search_state->partition_none_allowed << 0);
+  // PARTITION_HORZ
+  valid_types |= (part_search_state->partition_rect_allowed[HORZ] << 1);
+  // PARTITION_VERT
+  valid_types |= (part_search_state->partition_rect_allowed[VERT] << 2);
+  // PARTITION_SPLIT
+  valid_types |= (part_search_state->do_square_split << 3);
+  // PARTITION_HORZ_A
+  const int ext_partition_allowed = part_search_state->do_rectangular_split &&
+                                    av1_blk_has_rows_and_cols(&blk_params);
+  const int horzab_partition_allowed =
+      ext_partition_allowed && part_cfg->enable_ab_partitions &&
+      part_search_state->partition_rect_allowed[HORZ];
+  valid_types |= (horzab_partition_allowed << 4);
+  // PARTITION_HORZ_B
+  valid_types |= (horzab_partition_allowed << 5);
+  // PARTITION_VERT_A
+  const int vertab_partition_allowed =
+      ext_partition_allowed && part_cfg->enable_ab_partitions &&
+      part_search_state->partition_rect_allowed[VERT];
+  valid_types |= (vertab_partition_allowed << 6);
+  // PARTITION_VERT_B
+  valid_types |= (vertab_partition_allowed << 7);
+  // PARTITION_HORZ_4
+  const int partition4_allowed = part_cfg->enable_1to4_partitions &&
+                                 ext_partition_allowed &&
+                                 bsize != BLOCK_128X128;
+  const int horz4_allowed =
+      partition4_allowed && part_search_state->partition_rect_allowed[HORZ] &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  valid_types |= (horz4_allowed << 8);
+  // PARTITION_VERT_4
+  const int vert4_allowed =
+      partition4_allowed && part_search_state->partition_rect_allowed[VERT] &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  valid_types |= (vert4_allowed << 9);
+
+  return valid_types;
+}
+
+static void prepare_tpl_stats_block(const AV1_COMP *const cpi,
+                                    const BLOCK_SIZE bsize, const int mi_row,
+                                    const int mi_col, int64_t *intra_cost,
+                                    int64_t *inter_cost, int64_t *mc_dep_cost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+      gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+    return;
+  }
+
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  // If tpl stats is not established, early return
+  if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+    return;
+  }
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+
+  int64_t sum_intra_cost = 0;
+  int64_t sum_inter_cost = 0;
+  int64_t sum_mc_dep_cost = 0;
+  for (int row = 0; row < mi_height; row += step) {
+    for (int col = 0; col < mi_width; col += step) {
+
TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride, + tpl_data->tpl_stats_block_mis_log2)]; + sum_intra_cost += this_stats->intra_cost; + sum_inter_cost += this_stats->inter_cost; + const int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + sum_mc_dep_cost += mc_dep_delta; + } + } + + *intra_cost = sum_intra_cost; + *inter_cost = sum_inter_cost; + *mc_dep_cost = sum_mc_dep_cost; +} + +static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, + PC_TREE *pc_tree, int mi_row, int mi_col, + const BLOCK_SIZE bsize, RD_STATS *this_rdcost) { + const AV1_COMMON *const cm = &cpi->common; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) { + return false; + } + aom_partition_decision_t partition_decision; + do { + PartitionSearchState part_search_state; + // Initialization of state variables used in partition search. + // TODO(chengchen): check if there is hidden conditions that don't allow + // all possible partition types. + init_partition_search_state_params(x, cpi, &part_search_state, mi_row, + mi_col, bsize); + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c). + PartitionBlkParams blk_params = part_search_state.part_blk_params; + if (!av1_blk_has_rows_and_cols(&blk_params)) + set_partition_cost_for_edge_blk(cm, &part_search_state); + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + const int valid_partition_types = + get_valid_partition_types(cpi, &part_search_state, bsize); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + const int qindex = av1_get_qindex(&cm->seg, xd->mi[0]->segment_id, + cm->quant_params.base_qindex); + // RD multiplier + const int rdmult = x->rdmult; + // pyramid level + const int pyramid_level = + cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]; + x->rdmult = orig_rdmult; + // Neighbor information + const int has_above = !!xd->above_mbmi; + const int has_left = !!xd->left_mbmi; + const BLOCK_SIZE above_bsize = + has_above ? xd->above_mbmi->bsize : BLOCK_INVALID; + const BLOCK_SIZE left_bsize = + has_left ? xd->left_mbmi->bsize : BLOCK_INVALID; + const int above_block_width = + above_bsize == BLOCK_INVALID ? -1 : block_size_wide[above_bsize]; + const int above_block_height = + above_bsize == BLOCK_INVALID ? -1 : block_size_high[above_bsize]; + const int left_block_width = + left_bsize == BLOCK_INVALID ? -1 : block_size_wide[left_bsize]; + const int left_block_height = + left_bsize == BLOCK_INVALID ? 
-1 : block_size_high[left_bsize]; + // Prepare simple motion search stats as features + unsigned int block_sse = -1; + unsigned int block_var = -1; + unsigned int sub_block_sse[4] = { -1, -1, -1, -1 }; + unsigned int sub_block_var[4] = { -1, -1, -1, -1 }; + unsigned int horz_block_sse[2] = { -1, -1 }; + unsigned int horz_block_var[2] = { -1, -1 }; + unsigned int vert_block_sse[2] = { -1, -1 }; + unsigned int vert_block_var[2] = { -1, -1 }; + av1_prepare_motion_search_features_block( + cpi, td, tile_data, mi_row, mi_col, bsize, valid_partition_types, + &block_sse, &block_var, sub_block_sse, sub_block_var, horz_block_sse, + horz_block_var, vert_block_sse, vert_block_var); + // Prepare tpl stats for the current block as features + int64_t tpl_intra_cost = -1; + int64_t tpl_inter_cost = -1; + int64_t tpl_mc_dep_cost = -1; + prepare_tpl_stats_block(cpi, bsize, mi_row, mi_col, &tpl_intra_cost, + &tpl_inter_cost, &tpl_mc_dep_cost); + + aom_partition_features_t features; + features.mi_row = mi_row; + features.mi_col = mi_col; + features.frame_width = cpi->frame_info.frame_width; + features.frame_height = cpi->frame_info.frame_height; + features.block_size = bsize; + features.valid_partition_types = valid_partition_types; + features.update_type = update_type; + features.qindex = qindex; + features.rdmult = rdmult; + features.pyramid_level = pyramid_level; + features.has_above_block = has_above; + features.above_block_width = above_block_width; + features.above_block_height = above_block_height; + features.has_left_block = has_left; + features.left_block_width = left_block_width; + features.left_block_height = left_block_height; + features.block_sse = block_sse; + features.block_var = block_var; + for (int i = 0; i < 4; ++i) { + features.sub_block_sse[i] = sub_block_sse[i]; + features.sub_block_var[i] = sub_block_var[i]; + } + for (int i = 0; i < 2; ++i) { + features.horz_block_sse[i] = horz_block_sse[i]; + features.horz_block_var[i] = horz_block_var[i]; + features.vert_block_sse[i] = vert_block_sse[i]; + features.vert_block_var[i] = vert_block_var[i]; + } + features.tpl_intra_cost = tpl_intra_cost; + features.tpl_inter_cost = tpl_inter_cost; + features.tpl_mc_dep_cost = tpl_mc_dep_cost; + av1_ext_part_send_features(ext_part_controller, &features); + const bool valid_decision = av1_ext_part_get_partition_decision( + ext_part_controller, &partition_decision); + if (!valid_decision) return false; + pc_tree->partitioning = partition_decision.current_decision; + + av1_init_rd_stats(this_rdcost); + if (partition_decision.current_decision == PARTITION_SPLIT) { + assert(block_size_wide[bsize] >= 8 && block_size_high[bsize] >= 8); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + RD_STATS split_rdc[SUB_PARTITIONS_SPLIT]; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + av1_init_rd_stats(&split_rdc[i]); + if (pc_tree->split[i] == NULL) + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + pc_tree->split[i]->index = i; + } + const int orig_rdmult_tmp = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + // TODO(chengchen): check boundary conditions + // top-left + recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[0], + mi_row, mi_col, subsize, &split_rdc[0]); + // top-right + recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[1], + mi_row, mi_col + mi_size_wide[subsize], subsize, + &split_rdc[1]); 
+ // bottom-left + recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[2], + mi_row + mi_size_high[subsize], mi_col, subsize, + &split_rdc[2]); + // bottom_right + recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[3], + mi_row + mi_size_high[subsize], + mi_col + mi_size_wide[subsize], subsize, + &split_rdc[3]); + this_rdcost->rate += part_search_state.partition_cost[PARTITION_SPLIT]; + // problem is here, the rdmult is different from the rdmult in sub block. + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + this_rdcost->rate += split_rdc[i].rate; + this_rdcost->dist += split_rdc[i].dist; + av1_rd_cost_update(x->rdmult, this_rdcost); + } + x->rdmult = orig_rdmult_tmp; + } else { + *this_rdcost = rd_search_for_fixed_partition( + cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree); + } + + aom_partition_stats_t stats; + update_partition_stats(this_rdcost, &stats); + av1_ext_part_send_partition_stats(ext_part_controller, &stats); + if (!partition_decision.is_final_decision) { + if (partition_decision.current_decision == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + if (pc_tree->split[i] != NULL) { + av1_free_pc_tree_recursive(pc_tree->split[i], av1_num_planes(cm), 0, + 0, + cpi->sf.part_sf.partition_search_type); + pc_tree->split[i] = NULL; + } + } + } + } + } while (!partition_decision.is_final_decision); + + return true; +} + +// The ML model only needs to make decisions for the current block each time. +static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, + int mi_row, int mi_col, + const BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + aom_partition_features_t features; + prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize, + &features); + features.mi_row = mi_row; + features.mi_col = mi_col; + features.frame_width = cpi->frame_info.frame_width; + features.frame_height = cpi->frame_info.frame_height; + features.block_size = bsize; + av1_ext_part_send_features(ext_part_controller, &features); + td->pc_root = av1_alloc_pc_tree_node(bsize); + if (!td->pc_root) + aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + + RD_STATS rdcost; + const bool valid_partition = + recursive_partition(cpi, td, tile_data, tp, sms_root, td->pc_root, mi_row, + mi_col, bsize, &rdcost); + if (!valid_partition) { + return false; + } + + // Encode with the selected mode and partition. 
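+  // set_cb_offsets() resets the luma and chroma coefficient buffer offsets
+  // for this superblock before the final (OUTPUT_ENABLED) encode pass.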
+ set_cb_offsets(x->cb_offset, 0, 0); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + td->pc_root, NULL); + av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + td->pc_root = NULL; + + return true; +} + +bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + RD_STATS *best_rd_cost) { + AV1_COMMON *const cm = &cpi->common; + if (cpi->ext_part_controller.ready) { + bool valid_search = true; + const aom_ext_part_decision_mode_t decision_mode = + av1_get_ext_part_decision_mode(&cpi->ext_part_controller); + if (decision_mode == AOM_EXT_PART_WHOLE_TREE) { + valid_search = ml_partition_search_whole_tree( + cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize); + } else if (decision_mode == AOM_EXT_PART_RECURSIVE) { + valid_search = ml_partition_search_partial( + cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize); + } else { + assert(0 && "Unknown decision mode."); + return false; + } + if (!valid_search) { + aom_internal_error( + cm->error, AOM_CODEC_ERROR, + "Invalid search from ML model, partition search failed"); + } + return true; + } + + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + int best_idx = 0; + int64_t min_rdcost = INT64_MAX; + int num_configs; + int i = 0; + do { + td->pc_root = av1_alloc_pc_tree_node(bsize); + if (!td->pc_root) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + num_configs = read_partition_tree(cpi, td->pc_root, xd->error_info, i); + if (num_configs <= 0) { + av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + td->pc_root = NULL; + aom_internal_error(xd->error_info, AOM_CODEC_ERROR, "Invalid configs."); + } + verify_write_partition_tree(cpi, td->pc_root, bsize, i, mi_row, mi_col); + if (i == 0) { + AOM_CHECK_MEM_ERROR(xd->error_info, x->rdcost, + aom_calloc(num_configs, sizeof(*x->rdcost))); + } + // Encode the block with the given partition tree. Get rdcost and encoding + // time. + x->rdcost[i] = rd_search_for_fixed_partition( + cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root); + + if (x->rdcost[i].rdcost < min_rdcost) { + min_rdcost = x->rdcost[i].rdcost; + best_idx = i; + *best_rd_cost = x->rdcost[i]; + } + av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0, + cpi->sf.part_sf.partition_search_type); + td->pc_root = NULL; + ++i; + } while (i < num_configs); + + aom_free(x->rdcost); + x->rdcost = NULL; + // Encode with the partition configuration with the smallest rdcost. 
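+  // The partition tree for best_idx is read back and searched once more with
+  // a fixed partition, since the trees tried in the loop above have already
+  // been freed.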
+  td->pc_root = av1_alloc_pc_tree_node(bsize);
+  if (!td->pc_root)
+    aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate PC_TREE");
+  read_partition_tree(cpi, td->pc_root, xd->error_info, best_idx);
+  rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row,
+                                mi_col, bsize, td->pc_root);
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            td->pc_root, NULL);
+  av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+                             cpi->sf.part_sf.partition_search_type);
+  td->pc_root = NULL;
+  ++cpi->sb_counter;
+
+  return true;
+}
+#endif  // CONFIG_PARTITION_SEARCH_ORDER
+
+static AOM_INLINE bool should_do_dry_run_encode_for_current_block(
+    BLOCK_SIZE sb_size, BLOCK_SIZE max_partition_size, int curr_block_index,
+    BLOCK_SIZE bsize) {
+  if (bsize > max_partition_size) return false;
+
+  // Enable the reconstruction with dry-run for the 4th sub-block only if its
+  // parent block's reconstruction with dry-run is skipped. If
+  // max_partition_size is the same as the immediate split of the superblock,
+  // then avoid reconstruction of the 4th sub-block, as this data is not
+  // consumed.
+  if (curr_block_index != 3) return true;
+
+  const BLOCK_SIZE sub_sb_size =
+      get_partition_subsize(sb_size, PARTITION_SPLIT);
+  return bsize == max_partition_size && sub_sb_size != max_partition_size;
+}
+
+static void log_sub_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                              double *var_min, double *var_max) {
+  // This function returns the minimum and maximum log variances for 4x4
+  // sub blocks in the current block.
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int is_hbd = is_cur_buf_hbd(xd);
+  const int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  const int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+  const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+  // Initialize minimum variance to a large value and maximum variance to 0.
+  double min_var_4x4 = (double)INT_MAX;
+  double max_var_4x4 = 0.0;
+
+  for (int i = 0; i < bh; i += MI_SIZE) {
+    for (int j = 0; j < bw; j += MI_SIZE) {
+      int var;
+      // Calculate the 4x4 sub-block variance.
+      var = av1_calc_normalized_variance(
+          cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+          x->plane[0].src.buf + (i * x->plane[0].src.stride) + j,
+          x->plane[0].src.stride, is_hbd);
+
+      // Record min and max for the overarching block.
+      min_var_4x4 = AOMMIN(min_var_4x4, var);
+      max_var_4x4 = AOMMAX(max_var_4x4, var);
+    }
+  }
+  *var_min = log1p(min_var_4x4 / 16.0);
+  *var_max = log1p(max_var_4x4 / 16.0);
+}
+
+static AOM_INLINE void set_sms_tree_partitioning(
+    SIMPLE_MOTION_DATA_TREE *sms_tree, PARTITION_TYPE partition) {
+  if (sms_tree == NULL) return;
+  sms_tree->partitioning = partition;
+}
+
+/*!\brief AV1 block partition search (full search).
+*
+* \ingroup partition_search
+* \callgraph
+* Searches for the best partition pattern for a block based on the
+* rate-distortion cost, and returns a bool value to indicate whether a valid
+* partition pattern is found. The partition can recursively go down to the
+* smallest block size.
+* +* \param[in] cpi Top-level encoder structure +* \param[in] td Pointer to thread data +* \param[in] tile_data Pointer to struct holding adaptive +data/contexts/models for the tile during +encoding +* \param[in] tp Pointer to the starting token +* \param[in] mi_row Row coordinate of the block in a step size +of MI_SIZE +* \param[in] mi_col Column coordinate of the block in a step +size of MI_SIZE +* \param[in] bsize Current block size +* \param[in] rd_cost Pointer to the final rd cost of the block +* \param[in] best_rdc Upper bound of rd cost of a valid partition +* \param[in] pc_tree Pointer to the PC_TREE node storing the +picked partitions and mode info for the +current block +* \param[in] sms_tree Pointer to struct holding simple motion +search data for the current block +* \param[in] none_rd Pointer to the rd cost in the case of not +splitting the current block +* \param[in] multi_pass_mode SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS +* \param[in] rect_part_win_info Pointer to struct storing whether horz/vert +partition outperforms previously tested +partitions +* +* \return A bool value is returned indicating if a valid partition is found. +* The pc_tree struct is modified to store the picked partition and modes. +* The rd_cost struct is also updated with the RD stats corresponding to the +* best partition found. +*/ +bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, + RD_STATS best_rdc, PC_TREE *pc_tree, + SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd, + SB_MULTI_PASS_MODE multi_pass_mode, + RD_RECT_PART_WIN_INFO *rect_part_win_info) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + const TokenExtra *const tp_orig = *tp; + PartitionSearchState part_search_state; + + // Initialization of state variables used in partition search. + init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col, + bsize); + PartitionBlkParams blk_params = part_search_state.part_blk_params; + + set_sms_tree_partitioning(sms_tree, PARTITION_NONE); + if (best_rdc.rdcost < 0) { + av1_invalid_rd_stats(rd_cost); + return part_search_state.found_best_partition; + } + if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0; + + // Override skipping rectangular partition operations for edge blocks. + if (none_rd) *none_rd = 0; + (void)*tp_orig; + +#if CONFIG_COLLECT_PARTITION_STATS + // Stats at the current quad tree + PartitionTimingStats *part_timing_stats = + &part_search_state.part_timing_stats; + // Stats aggregated at frame level + FramePartitionTimingStats *fr_part_timing_stats = &cpi->partition_stats; +#endif // CONFIG_COLLECT_PARTITION_STATS + + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c). + if (!av1_blk_has_rows_and_cols(&blk_params)) + set_partition_cost_for_edge_blk(cm, &part_search_state); + + // Disable rectangular partitions for inner blocks when the current block is + // forced to only use square partitions. 
+ if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) { + part_search_state.partition_rect_allowed[HORZ] &= !blk_params.has_rows; + part_search_state.partition_rect_allowed[VERT] &= !blk_params.has_cols; + } + +#ifndef NDEBUG + // Nothing should rely on the default value of this array (which is just + // leftover from encoding the previous block. Setting it to fixed pattern + // when debugging. + // bit 0, 1, 2 are blk_skip of each plane + // bit 4, 5, 6 are initialization checking of each plane + memset(x->txfm_search_info.blk_skip, 0x77, + sizeof(x->txfm_search_info.blk_skip)); +#endif // NDEBUG + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + + // Set buffers and offsets. + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + if (cpi->oxcf.mode == ALLINTRA) { + if (bsize == cm->seq_params->sb_size) { + double var_min, var_max; + log_sub_block_var(cpi, x, bsize, &var_min, &var_max); + + x->intra_sb_rdmult_modifier = 128; + if ((var_min < 2.0) && (var_max > 4.0)) { + if ((var_max - var_min) > 8.0) { + x->intra_sb_rdmult_modifier -= 48; + } else { + x->intra_sb_rdmult_modifier -= (int)((var_max - var_min) * 6); + } + } + } + } + + // Save rdmult before it might be changed, so it can be restored later. + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + + // Apply simple motion search for the entire super block with fixed block + // size, e.g., 16x16, to collect features and write to files for the + // external ML model. + // TODO(chengchen): reduce motion search. This function is similar to + // av1_get_max_min_partition_features(). + if (COLLECT_MOTION_SEARCH_FEATURE_SB && !frame_is_intra_only(cm) && + bsize == cm->seq_params->sb_size) { + av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col, + bsize, /*features=*/NULL); + collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, /*features=*/NULL); + } + + // Update rd cost of the bound using the current multiplier. + av1_rd_cost_update(x->rdmult, &best_rdc); + + if (bsize == BLOCK_16X16 && cpi->vaq_refresh) + x->mb_energy = av1_log_block_var(cpi, x, bsize); + + // Set the context. + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_prune_partitions_time); +#endif + // Pruning: before searching any partition type, using source and simple + // motion search results to prune out unlikely partitions. + av1_prune_partitions_before_search(cpi, x, sms_tree, &part_search_state); + + // Pruning: eliminating partition types leading to coding block sizes outside + // the min and max bsize limitations set from the encoder. + av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_prune_partitions_time); +#endif + + // Partition search +BEGIN_PARTITION_SEARCH: + // If a valid partition is required, usually when the first round cannot find + // a valid one under the cost limit after pruning, reset the limitations on + // partition types and intra cnn output. + if (x->must_find_valid_partition) { + reset_part_limitations(cpi, &part_search_state); + av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state); + // Invalidate intra cnn output for key frames. 
+ if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) { + part_search_state.intra_part_info->quad_tree_idx = 0; + part_search_state.intra_part_info->cnn_output_valid = 0; + } + } + // Partition block source pixel variance. + unsigned int pb_source_variance = UINT_MAX; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, none_partition_search_time); +#endif + + if (cpi->oxcf.mode == ALLINTRA) { + const bool bsize_at_least_16x16 = (bsize >= BLOCK_16X16); + const bool prune_rect_part_using_4x4_var_deviation = + (cpi->sf.part_sf.prune_rect_part_using_4x4_var_deviation && + !x->must_find_valid_partition); + + if (bsize_at_least_16x16 || prune_rect_part_using_4x4_var_deviation) { + double var_min, var_max; + log_sub_block_var(cpi, x, bsize, &var_min, &var_max); + + // Further pruning or in some cases reverse pruning when allintra is set. + // This code helps visual and in some cases metrics quality where the + // current block comprises at least one very low variance sub-block and at + // least one where the variance is much higher. + // + // The idea is that in such cases there is danger of ringing and other + // visual artifacts from a high variance feature such as an edge into a + // very low variance region. + // + // The approach taken is to force break down / split to a smaller block + // size to try and separate out the low variance and well predicted blocks + // from the more complex ones and to prevent propagation of ringing over a + // large region. + if (bsize_at_least_16x16 && (var_min < 0.272) && + ((var_max - var_min) > 3.0)) { + part_search_state.partition_none_allowed = 0; + part_search_state.terminate_partition_search = 0; + part_search_state.do_square_split = 1; + } else if (prune_rect_part_using_4x4_var_deviation && + (var_max - var_min < 3.0)) { + // Prune rectangular partitions if the variance deviation of 4x4 + // sub-blocks within the block is less than a threshold (derived + // empirically). + part_search_state.do_rectangular_split = 0; + } + } + } + + // PARTITION_NONE search stage. + int64_t part_none_rd = INT64_MAX; + none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx, + &part_search_state, &best_rdc, &pb_source_variance, + none_rd, &part_none_rd); + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, none_partition_search_time); +#endif +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, split_partition_search_time); +#endif + // PARTITION_SPLIT search stage. + int64_t part_split_rd = INT64_MAX; + split_partition_search(cpi, td, tile_data, tp, x, pc_tree, sms_tree, &x_ctx, + &part_search_state, &best_rdc, multi_pass_mode, + &part_split_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, split_partition_search_time); +#endif + // Terminate partition search for child partition, + // when NONE and SPLIT partition rd_costs are INT64_MAX. + if (cpi->sf.part_sf.early_term_after_none_split && + part_none_rd == INT64_MAX && part_split_rd == INT64_MAX && + !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) { + part_search_state.terminate_partition_search = 1; + } + + // Do not evaluate non-square partitions if NONE partition did not choose a + // newmv mode and is skippable. 
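+  // (This pruning is additionally gated on a low qindex, a block size of at
+  // least 16x16, and not being forced to find a valid partition; see the
+  // condition below.)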
+ if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 2) && + (pc_tree->none != NULL)) { + if (x->qindex <= 200 && is_inter_mode(pc_tree->none->mic.mode) && + !have_newmv_in_inter_mode(pc_tree->none->mic.mode) && + pc_tree->none->skippable && !x->must_find_valid_partition && + bsize >= BLOCK_16X16) + part_search_state.do_rectangular_split = 0; + } + + // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT. + prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc, + part_none_rd, part_split_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rectangular_partition_search_time); +#endif + // Rectangular partitions search stage. + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, + rect_part_win_info, HORZ, VERT); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rectangular_partition_search_time); +#endif + + if (pb_source_variance == UINT_MAX) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + pb_source_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + } + + assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, + !part_search_state.do_rectangular_split)); + + const int prune_ext_part_state = prune_ext_part_none_skippable( + pc_tree->none, x->must_find_valid_partition, + cpi->sf.part_sf.skip_non_sq_part_based_on_none, bsize); + + const int ab_partition_allowed = allow_ab_partition_search( + &part_search_state, &cpi->sf.part_sf, pc_tree->partitioning, + x->must_find_valid_partition, prune_ext_part_state, best_rdc.rdcost); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, ab_partitions_search_time); +#endif + // AB partitions search stage. + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, rect_part_win_info, + pb_source_variance, ab_partition_allowed, HORZ_A, + VERT_B); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, ab_partitions_search_time); +#endif + + // 4-way partitions search stage. + int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 }; + // Prune 4-way partition search. + prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc, + pb_source_variance, prune_ext_part_state, + part4_search_allowed); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_4partition_time); +#endif + // PARTITION_HORZ_4 + assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, + !part4_search_allowed[HORZ4])); + if (!part_search_state.terminate_partition_search && + part4_search_allowed[HORZ4]) { + const int inc_step[NUM_PART4_TYPES] = { mi_size_high[blk_params.bsize] / 4, + 0 }; + // Evaluation of Horz4 partition type. + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->horizontal4, &part_search_state, &best_rdc, + inc_step, PARTITION_HORZ_4); + } + + // PARTITION_VERT_4 + assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, + !part4_search_allowed[VERT4])); + if (!part_search_state.terminate_partition_search && + part4_search_allowed[VERT4] && blk_params.has_cols) { + const int inc_step[NUM_PART4_TYPES] = { 0, mi_size_wide[blk_params.bsize] / + 4 }; + // Evaluation of Vert4 partition type. 
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->vertical4, &part_search_state, &best_rdc, + inc_step, PARTITION_VERT_4); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_4partition_time); +#endif + + if (bsize == cm->seq_params->sb_size && + !part_search_state.found_best_partition) { + // Did not find a valid partition, go back and search again, with less + // constraint on which partition types to search. + x->must_find_valid_partition = 1; +#if CONFIG_COLLECT_PARTITION_STATS + fr_part_timing_stats->partition_redo += 1; +#endif // CONFIG_COLLECT_PARTITION_STATS + goto BEGIN_PARTITION_SEARCH; + } + + // Store the final rd cost + *rd_cost = best_rdc; + + // Also record the best partition in simple motion data tree because it is + // necessary for the related speed features. + set_sms_tree_partitioning(sms_tree, pc_tree->partitioning); + +#if CONFIG_COLLECT_PARTITION_STATS + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) { + part_timing_stats->partition_decisions[pc_tree->partitioning] += 1; + } + + // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each + // prediction block. + print_partition_timing_stats_with_rdcost( + part_timing_stats, mi_row, mi_col, bsize, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], + cm->current_frame.frame_number, &best_rdc, "part_timing.csv"); + const bool print_timing_stats = false; + if (print_timing_stats) { + print_partition_timing_stats(part_timing_stats, cm->show_frame, + frame_is_intra_only(cm), bsize, + "part_timing_data.csv"); + } + // If CONFIG_COLLECTION_PARTITION_STATS is 2, then we print out the stats for + // the whole clip. So we need to pass the information upstream to the encoder. + accumulate_partition_timing_stats(fr_part_timing_stats, part_timing_stats, + bsize); +#endif // CONFIG_COLLECT_PARTITION_STATS + + // Reset the PC_TREE deallocation flag. + int pc_tree_dealloc = 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif + if (part_search_state.found_best_partition) { + if (bsize == cm->seq_params->sb_size) { + // Encode the superblock. + const int emit_output = multi_pass_mode != SB_DRY_PASS; + const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL; + + // Write partition tree to file. Not used by default. + if (COLLECT_MOTION_SEARCH_FEATURE_SB) { + write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col); + ++cpi->sb_counter; + } + + set_cb_offsets(x->cb_offset, 0, 0); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize, + pc_tree, NULL); + assert(pc_tree == td->pc_root); + // Dealloc the whole PC_TREE after a superblock is done. + av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0, + cpi->sf.part_sf.partition_search_type); + pc_tree = NULL; + td->pc_root = NULL; + pc_tree_dealloc = 1; + } else if (should_do_dry_run_encode_for_current_block( + cm->seq_params->sb_size, x->sb_enc.max_partition_size, + pc_tree->index, bsize)) { + // Encode the smaller blocks in DRY_RUN mode. + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif + + // If the tree still exists (non-superblock), dealloc most nodes, only keep + // nodes for the best partition and PARTITION_NONE. 
+ if (pc_tree_dealloc == 0) + av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1, + cpi->sf.part_sf.partition_search_type); + + if (bsize == cm->seq_params->sb_size) { + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } + + // Restore the rd multiplier. + x->rdmult = orig_rdmult; + return part_search_state.found_best_partition; +} +#endif // !CONFIG_REALTIME_ONLY + +#undef COLLECT_MOTION_SEARCH_FEATURE_SB + +#if CONFIG_RT_ML_PARTITIONING +#define FEATURES 6 +#define LABELS 2 +static int ml_predict_var_partitioning(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + const float *means = NULL; + const float *vars = NULL; + switch (bsize) { + case BLOCK_64X64: + nn_config = &av1_var_part_nnconfig_64; + means = av1_var_part_means_64; + vars = av1_var_part_vars_64; + break; + case BLOCK_32X32: + nn_config = &av1_var_part_nnconfig_32; + means = av1_var_part_means_32; + vars = av1_var_part_vars_32; + break; + case BLOCK_16X16: + nn_config = &av1_var_part_nnconfig_16; + means = av1_var_part_means_16; + vars = av1_var_part_vars_16; + break; + case BLOCK_8X8: + default: assert(0 && "Unexpected block size."); return -1; + } + + if (!nn_config) return -1; + + { + const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f; + float features[FEATURES] = { 0.0f }; + const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, + cm->seq_params->bit_depth); + int feature_idx = 0; + float score[LABELS]; + + features[feature_idx] = + (log1pf((float)(dc_q * dc_q) / 256.0f) - means[feature_idx]) / + sqrtf(vars[feature_idx]); + feature_idx++; + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize); + { + const int bs = block_size_wide[bsize]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + const int sb_offset_row = 4 * (mi_row & 15); + const int sb_offset_col = 4 * (mi_col & 15); + const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + int i; + // Variance of whole block. + const unsigned int var = + cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + + features[feature_idx] = + (log1pf((float)var) - means[feature_idx]) / sqrtf(vars[feature_idx]); + feature_idx++; + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->ppi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 
1.0f : factor * (float)sub_var;
+        features[feature_idx] =
+            (var_ratio - means[feature_idx]) / sqrtf(vars[feature_idx]);
+        feature_idx++;
+      }
+    }
+    // for (int i = 0; i < FEATURES; ++i) printf("%f ", features[i]);
+    av1_nn_predict(features, nn_config, 1, score);
+
+    if (score[0] > thresh) return PARTITION_SPLIT;
+    if (score[0] < -thresh) return PARTITION_NONE;
+    return -1;
+  }
+}
+#undef FEATURES
+#undef LABELS
+
+// Uncomment for collecting data for ML-based partitioning
+// #define _COLLECT_GROUND_TRUTH_
+
+#ifdef _COLLECT_GROUND_TRUTH_
+static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                                int mi_row, int mi_col, PARTITION_TYPE part) {
+  AV1_COMMON *const cm = &cpi->common;
+  char fname[128];
+  switch (bsize) {
+    case BLOCK_64X64: sprintf(fname, "data_64x64.txt"); break;
+    case BLOCK_32X32: sprintf(fname, "data_32x32.txt"); break;
+    case BLOCK_16X16: sprintf(fname, "data_16x16.txt"); break;
+    case BLOCK_8X8: sprintf(fname, "data_8x8.txt"); break;
+    default: assert(0 && "Unexpected block size."); return -1;
+  }
+
+  float features[6];  // DC_Q, VAR, VAR_RATIO-0..3
+
+  FILE *f = fopen(fname, "a");
+
+  {
+    const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+                                      cm->seq_params->bit_depth);
+    int feature_idx = 0;
+
+    features[feature_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+    {
+      const int bs = block_size_wide[bsize];
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      const int sb_offset_row = 4 * (mi_row & 15);
+      const int sb_offset_col = 4 * (mi_col & 15);
+      const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+      const uint8_t *src = x->plane[0].src.buf;
+      const int src_stride = x->plane[0].src.stride;
+      const int pred_stride = 64;
+      unsigned int sse;
+      int i;
+      // Variance of whole block.
+      /*
+      if (bs == 8)
+      {
+        int r, c;
+        printf("%d %d\n", mi_row, mi_col);
+        for (r = 0; r < bs; ++r) {
+          for (c = 0; c < bs; ++c) {
+            printf("%3d ",
+                   src[r * src_stride + c] - pred[64 * r + c]);
+          }
+          printf("\n");
+        }
+        printf("\n");
+      }
+      */
+      const unsigned int var =
+          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+      const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+      features[feature_idx++] = log1pf((float)var);
+
+      fprintf(f, "%f,%f,", features[0], features[1]);
+      for (i = 0; i < 4; ++i) {
+        const int x_idx = (i & 1) * bs / 2;
+        const int y_idx = (i >> 1) * bs / 2;
+        const int src_offset = y_idx * src_stride + x_idx;
+        const int pred_offset = y_idx * pred_stride + x_idx;
+        // Variance of quarter block.
+        const unsigned int sub_var =
+            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                    pred + pred_offset, pred_stride, &sse);
+        const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+        features[feature_idx++] = var_ratio;
+        fprintf(f, "%f,", var_ratio);
+      }
+
+      fprintf(f, "%d\n", part == PARTITION_NONE ?
0 : 1); + } + + fclose(f); + return -1; + } +} +#endif + +static void duplicate_mode_info_in_sb(AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int block_width = + AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col); + const int block_height = + AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row); + const int mi_stride = xd->mi_stride; + MB_MODE_INFO *const src_mi = xd->mi[0]; + int i, j; + + for (j = 0; j < block_height; ++j) + for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi; +} + +static INLINE void copy_mbmi_ext_frame_to_mbmi_ext( + MB_MODE_INFO_EXT *const mbmi_ext, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, uint8_t ref_frame_type) { + memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight, + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context; + mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count; + memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +static void fill_mode_info_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + int hbs = mi_size_wide[bsize] >> 1; + PARTITION_TYPE partition = pc_tree->partitioning; + BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + assert(bsize >= BLOCK_8X8); + + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + switch (partition) { + case PARTITION_NONE: + set_mode_info_offsets(&cm->mi_params, &cpi->mbmi_ext_info, x, xd, mi_row, + mi_col); + *(xd->mi[0]) = pc_tree->none->mic; + copy_mbmi_ext_frame_to_mbmi_ext( + &x->mbmi_ext, &pc_tree->none->mbmi_ext_best, LAST_FRAME); + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); + break; + case PARTITION_SPLIT: { + fill_mode_info_sb(cpi, x, mi_row, mi_col, subsize, pc_tree->split[0]); + fill_mode_info_sb(cpi, x, mi_row, mi_col + hbs, subsize, + pc_tree->split[1]); + fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col, subsize, + pc_tree->split[2]); + fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col + hbs, subsize, + pc_tree->split[3]); + break; + } + default: break; + } +} + +void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_STATS *rd_cost, int do_recon, int64_t best_rd, + PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int hbs = mi_size_wide[bsize] >> 1; + TokenExtra *tp_orig = *tp; + const ModeCosts *mode_costs = &x->mode_costs; + RD_STATS this_rdc, best_rdc; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + int do_split = bsize > BLOCK_8X8; + // Override skipping rectangular partition operations for edge blocks + const int force_horz_split = (mi_row + 2 * hbs > cm->mi_params.mi_rows); + const int force_vert_split = (mi_col + 2 * hbs > cm->mi_params.mi_cols); + + int partition_none_allowed = !force_horz_split && !force_vert_split; + + assert(mi_size_wide[bsize] == mi_size_high[bsize]); // Square partition only + assert(cm->seq_params->sb_size == BLOCK_64X64); // Small SB so far + + (void)*tp_orig; + + av1_invalid_rd_stats(&best_rdc); + best_rdc.rdcost = 
best_rd; +#ifndef _COLLECT_GROUND_TRUTH_ + if (partition_none_allowed && do_split) { + const int ml_predicted_partition = + ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col); + if (ml_predicted_partition == PARTITION_NONE) do_split = 0; + if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; + } +#endif + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + + // PARTITION_NONE + if (partition_none_allowed) { + pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf); + if (!pc_tree->none) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PICK_MODE_CONTEXT"); + PICK_MODE_CONTEXT *ctx = pc_tree->none; + +// Flip for RDO based pick mode +#if 0 + RD_STATS dummy; + av1_invalid_rd_stats(&dummy); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, + PARTITION_NONE, bsize, ctx, dummy); +#else + pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, + ctx); +#endif + if (this_rdc.rate != INT_MAX) { + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + + this_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE]; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; + } + } + } + + // PARTITION_SPLIT + if (do_split) { + RD_STATS sum_rdc; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + + av1_init_rd_stats(&sum_rdc); + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + pc_tree->split[i] = av1_alloc_pc_tree_node(subsize); + if (!pc_tree->split[i]) + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate PC_TREE"); + pc_tree->split[i]->index = i; + } + + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + sum_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + for (int i = 0; + i < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc.rdcost; ++i) { + const int x_idx = (i & 1) * hbs; + const int y_idx = (i >> 1) * hbs; + + if (mi_row + y_idx >= cm->mi_params.mi_rows || + mi_col + x_idx >= cm->mi_params.mi_cols) + continue; + av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, + mi_col + x_idx, subsize, &this_rdc, i < 3, + best_rdc.rdcost - sum_rdc.rdcost, + pc_tree->split[i]); + + if (this_rdc.rate == INT_MAX) { + av1_invalid_rd_stats(&sum_rdc); + } else { + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; + } + } + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_SPLIT; + } + } + +#ifdef _COLLECT_GROUND_TRUTH_ + store_partition_data(cpi, x, bsize, mi_row, mi_col, pc_tree->partitioning); +#endif + + *rd_cost = best_rdc; + + av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + + if (best_rdc.rate == INT_MAX) { + av1_invalid_rd_stats(rd_cost); + return; + } + + // update mode info array + fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree); + + if (do_recon) { + if (bsize == cm->seq_params->sb_size) { + // NOTE: To get estimate for rate due to the tokens, use: + // int rate_coeffs = 0; + // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, + // bsize, pc_tree, &rate_coeffs); + 
set_cb_offsets(x->cb_offset, 0, 0); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } + + if (bsize == BLOCK_64X64 && do_recon) { + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } +} +#endif // CONFIG_RT_ML_PARTITIONING diff --git a/third_party/aom/av1/encoder/partition_search.h b/third_party/aom/av1/encoder/partition_search.h new file mode 100644 index 0000000000..1b5d71b7da --- /dev/null +++ b/third_party/aom/av1/encoder/partition_search.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PARTITION_SEARCH_H_ +#define AOM_AV1_ENCODER_PARTITION_SEARCH_H_ + +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/tokenize.h" + +void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi, + const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize); +void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize); +void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, + MB_MODE_INFO **mib, TokenExtra **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *rate, + int64_t *dist, int do_recon, PC_TREE *pc_tree); +void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, MB_MODE_INFO **mib, + TokenExtra **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, PC_TREE *pc_tree); +#if CONFIG_RT_ML_PARTITIONING +void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_STATS *rd_cost, int do_recon, int64_t best_rd, + PC_TREE *pc_tree); +#endif +void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf); +void av1_reset_sf_for_ext_part(AV1_COMP *const cpi); + +#if CONFIG_PARTITION_SEARCH_ORDER +bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, + int mi_col, BLOCK_SIZE bsize, + RD_STATS *best_rd_cost); +#endif + +bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, + RD_STATS best_rdc, PC_TREE *pc_tree, + SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd, + SB_MULTI_PASS_MODE multi_pass_mode, + RD_RECT_PART_WIN_INFO *rect_part_win_info); + +static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset, + const uint16_t cb_offset_y, + const uint16_t cb_offset_uv) { + cb_offset[PLANE_TYPE_Y] = cb_offset_y; + cb_offset[PLANE_TYPE_UV] = cb_offset_uv; +} + +static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize, + const int subsampling_x, + const int subsampling_y) { + x->cb_offset[PLANE_TYPE_Y] += 
block_size_wide[bsize] * block_size_high[bsize]; + if (x->e_mbd.is_chroma_ref) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize != BLOCK_INVALID); + x->cb_offset[PLANE_TYPE_UV] += + block_size_wide[plane_bsize] * block_size_high[plane_bsize]; + } +} + +#endif // AOM_AV1_ENCODER_PARTITION_SEARCH_H_ diff --git a/third_party/aom/av1/encoder/partition_strategy.c b/third_party/aom/av1/encoder/partition_strategy.c new file mode 100644 index 0000000000..ce06313579 --- /dev/null +++ b/third_party/aom/av1/encoder/partition_strategy.c @@ -0,0 +1,2573 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/thirdpass.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/reconinter.h" + +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/cnn.h" +#include "av1/encoder/partition_model_weights.h" +#include "av1/encoder/partition_cnn_weights.h" +#endif +#include "av1/encoder/encoder.h" + +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/partition_search.h" +#include "av1/encoder/rdopt.h" + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void simple_motion_search_prune_part_features( + AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, + int mi_row, int mi_col, BLOCK_SIZE bsize, float *features, + int features_to_get); + +static bool ext_ml_model_decision_before_none( + AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], + int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split); + +static bool ext_ml_model_decision_before_none_part2( + AV1_COMP *cpi, + const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], + int *prune_horz, int *prune_vert); + +static bool ext_ml_model_decision_after_none( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_none, int *do_square_split, + int *do_rectangular_split); + +static bool ext_ml_model_decision_after_none_part2( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search); + +static bool ext_ml_model_decision_after_split( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search); + +static bool ext_ml_model_decision_after_split_part2( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_prune, int *prune_rect_part_horz, + int *prune_rect_part_vert); + +static bool ext_ml_model_decision_after_rect( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_rect, int *horza_partition_allowed, + int *horzb_partition_allowed, int *verta_partition_allowed, + int *vertb_partition_allowed); + +static bool ext_ml_model_decision_after_part_ab( + AV1_COMP *const cpi, MACROBLOCK *const x, 
BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, + int *const partition_vert4_allowed, unsigned int pb_source_variance, + int mi_row, int mi_col); + +static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_128X128: return 0; + case BLOCK_64X64: return 1; + case BLOCK_32X32: return 2; + case BLOCK_16X16: return 3; + case BLOCK_8X8: return 4; + default: assert(0 && "Invalid bsize"); return -1; + } +} + +static char *get_feature_file_name(int id) { + static char *feature_file_names[] = { + "feature_before_partition_none", + "feature_before_partition_none_prune_rect", + "feature_after_partition_none_prune", + "feature_after_partition_none_terminate", + "feature_after_partition_split_terminate", + "feature_after_partition_split_prune_rect", + "feature_after_partition_rect", + "feature_after_partition_ab", + }; + + return feature_file_names[id]; +} + +static void write_features_to_file(const char *const path, + const bool is_test_mode, + const float *features, + const int feature_size, const int id, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col) { + if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return; + + char filename[256]; + snprintf(filename, sizeof(filename), "%s/%s", path, + get_feature_file_name(id)); + FILE *pfile = fopen(filename, "a"); + if (pfile == NULL) return; + if (!is_test_mode) { + fprintf(pfile, "%d,%d,%d,%d,%d\n", id, (int)bsize, mi_row, mi_col, + feature_size); + } + for (int i = 0; i < feature_size; ++i) { + fprintf(pfile, "%.6f", features[i]); + if (i < feature_size - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + fclose(pfile); +} + +// TODO(chiyotsai@google.com): This is very much a work in progress. 
We still +// need to the following: +// -- add support for hdres +// -- add support for pruning rectangular partitions +// -- use reconstructed pixels instead of source pixels for padding +// -- use chroma pixels in addition to luma pixels +void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x, + int quad_tree_idx, + int intra_cnn_based_part_prune_level, + PartitionSearchState *part_state) { + assert(cm->seq_params->sb_size >= BLOCK_64X64 && + "Invalid sb_size for intra_cnn!"); + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const BLOCK_SIZE bsize = blk_params->bsize; + + const int bsize_idx = convert_bsize_to_idx(bsize); + + if (bsize == BLOCK_128X128) { + return; + } + + PartitionSearchInfo *part_info = &x->part_search_info; + + // Precompute the CNN part and cache the result in MACROBLOCK + if (bsize == BLOCK_64X64 && !part_info->cnn_output_valid) { + const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config; + + // Prepare the output + const CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL }; + const int num_outputs = 4; + const int output_dims[4] = { 1, 2, 4, 8 }; + const int out_chs[4] = { CNN_BRANCH_0_OUT_CH, CNN_BRANCH_1_OUT_CH, + CNN_BRANCH_2_OUT_CH, CNN_BRANCH_3_OUT_CH }; + float *output_buffer[CNN_TOT_OUT_CH]; + + float **cur_output_buf = output_buffer; + float *curr_buf_ptr = part_info->cnn_buffer; + for (int output_idx = 0; output_idx < num_outputs; output_idx++) { + const int num_chs = out_chs[output_idx]; + const int ch_size = output_dims[output_idx] * output_dims[output_idx]; + for (int ch = 0; ch < num_chs; ch++) { + cur_output_buf[ch] = curr_buf_ptr; + curr_buf_ptr += ch_size; + } + cur_output_buf += num_chs; + } + + CNN_MULTI_OUT output = { + .num_outputs = 4, + .output_channels = out_chs, + .output_strides = output_dims, + .output_buffer = output_buffer, + }; + + // Prepare the input + const MACROBLOCKD *xd = &x->e_mbd; + const int bit_depth = xd->bd; + const int dc_q = + av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8); + part_info->log_q = log1pf((float)(dc_q * dc_q) / 256.0f); + part_info->log_q = + (part_info->log_q - av1_intra_mode_cnn_partition_mean[0]) / + av1_intra_mode_cnn_partition_std[0]; + + const int width = 65, height = 65, + stride = x->plane[AOM_PLANE_Y].src.stride; + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *image[1] = { + CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1 + }; + + if (!av1_cnn_predict_img_multi_out_highbd(image, width, height, stride, + cnn_config, &thread_data, + bit_depth, &output)) { + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Error allocating CNN data"); + return; + } + } else { + uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 }; + + if (!av1_cnn_predict_img_multi_out(image, width, height, stride, + cnn_config, &thread_data, &output)) { + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, + "Error allocating CNN data"); + return; + } + } + + part_info->cnn_output_valid = 1; + } + + if (!part_info->cnn_output_valid) { + return; + } + + const NN_CONFIG *dnn_configs[5] = { + NULL, + &av1_intra_mode_cnn_partition_branch_0_dnn_config, + &av1_intra_mode_cnn_partition_branch_1_dnn_config, + &av1_intra_mode_cnn_partition_branch_2_dnn_config, + &av1_intra_mode_cnn_partition_branch_3_dnn_config, + }; + + const NN_CONFIG *dnn_config = dnn_configs[bsize_idx]; + + float dnn_features[100]; + float logits[4] = { 0.0f }; + + const float *branch_0 = part_info->cnn_buffer; + const float 
*branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE; + const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE; + const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE; + + if (bsize == BLOCK_64X64) { + int f_idx = 0; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_0_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_0[ch_idx]; + } + + const int spa_stride = 2 * 2; + for (int lin_idx = 0; lin_idx < spa_stride; lin_idx++) { + for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride]; + } + } + dnn_features[f_idx++] = part_info->log_q; + } else if (bsize == BLOCK_32X32) { + int f_idx = 0; + for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) { + dnn_features[f_idx++] = branch_0[idx]; + } + + const int curr_lin_idx = quad_to_linear_1[quad_tree_idx - 1]; + const int spa_stride = 2 * 2; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride]; + } + dnn_features[f_idx++] = part_info->log_q; + } else if (bsize == BLOCK_16X16) { + int f_idx = 0; + const int prev_quad_idx = (quad_tree_idx - 1) / 4; + const int prev_lin_idx = quad_to_linear_1[prev_quad_idx - 1]; + const int prev_spa_stride = 2 * 2; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_1[prev_lin_idx + ch_idx * prev_spa_stride]; + } + + const int curr_lin_idx = quad_to_linear_2[quad_tree_idx - 5]; + const int spa_stride = 4 * 4; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride]; + } + dnn_features[f_idx++] = part_info->log_q; + } else if (bsize == BLOCK_8X8) { + int f_idx = 0; + const int prev_quad_idx = (quad_tree_idx - 1) / 4; + const int prev_lin_idx = quad_to_linear_2[prev_quad_idx - 5]; + const int prev_spa_stride = 4 * 4; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_2[prev_lin_idx + ch_idx * prev_spa_stride]; + } + + const int curr_lin_idx = quad_to_linear_3[quad_tree_idx - 21]; + const int spa_stride = 8 * 8; + for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) { + dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride]; + } + dnn_features[f_idx++] = part_info->log_q; + } else { + assert(0 && "Invalid bsize in intra_cnn partition"); + } + + // Make decision + av1_nn_predict(dnn_features, dnn_config, 1, logits); + + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + float split_only_thresh = 100.0f, no_split_thresh = -100.0f; + if (is_720p_or_larger) { + split_only_thresh = + av1_intra_mode_cnn_partition_split_thresh_hdres[bsize_idx]; + no_split_thresh = + av1_intra_mode_cnn_partition_no_split_thresh_hdres[bsize_idx]; + } else if (is_480p_or_larger) { + split_only_thresh = + av1_intra_mode_cnn_partition_split_thresh_midres[bsize_idx]; + no_split_thresh = + av1_intra_mode_cnn_partition_no_split_thresh_midres[bsize_idx]; + } else { + split_only_thresh = + av1_intra_mode_cnn_partition_split_thresh_lowres[bsize_idx]; + no_split_thresh = + av1_intra_mode_cnn_partition_no_split_thresh_lowres[bsize_idx]; + } + + if (logits[0] > split_only_thresh) { + // As screen contents tend to choose larger partitions, do not prune + // PARTITION_NONE when intra_cnn_based_part_prune_level=1. 
+ if (intra_cnn_based_part_prune_level != 1) { + part_state->partition_none_allowed = 0; + } + part_state->do_square_split = 1; + av1_disable_rect_partitions(part_state); + } + + if (logits[0] < no_split_thresh) { + av1_disable_square_split_partition(part_state); + } +} + +static INLINE int get_simple_motion_search_prune_agg(int qindex, + int prune_level, + int is_rect_part) { + assert(prune_level < TOTAL_AGG_LVLS); + if (prune_level == NO_PRUNING) { + return -1; + } + + // Aggressiveness value for SIMPLE_MOTION_SEARCH_PRUNE_LEVEL except + // QIDX_BASED_AGG_LVL + const int sms_prune_agg_levels[TOTAL_SIMPLE_AGG_LVLS] = { 0, 1, 2, 3 }; + if (prune_level < TOTAL_SIMPLE_AGG_LVLS) { + return sms_prune_agg_levels[prune_level]; + } + + // Map the QIDX_BASED_AGG_LVL to corresponding aggressiveness value. + // Aggressive pruning for lower quantizers in non-boosted frames to prune + // rectangular partitions. + const int qband = is_rect_part ? (qindex <= 90 ? 1 : 0) : 0; + const int sms_prune_agg_qindex_based[2] = { 1, 2 }; + return sms_prune_agg_qindex_based[qband]; +} + +void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_tree, + PartitionSearchState *part_state) { + const AV1_COMMON *const cm = &cpi->common; + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + const int bsize_idx = convert_bsize_to_idx(bsize); + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + // res_idx is 0 for res < 480p, 1 for 480p, 2 for 720p+ + const int res_idx = is_480p_or_larger + is_720p_or_larger; + + assert(bsize_idx >= 0 && bsize_idx <= 4 && + "Invalid bsize in simple_motion_search_based_split"); + + const float *ml_mean = av1_simple_motion_search_split_mean[bsize_idx]; + const float *ml_std = av1_simple_motion_search_split_std[bsize_idx]; + const NN_CONFIG *nn_config = + av1_simple_motion_search_split_nn_config[bsize_idx]; + + const int agg = get_simple_motion_search_prune_agg( + x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 0); + if (agg < 0) { + return; + } + + const float split_only_thresh = + av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx]; + const float no_split_thresh = + av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx]; + + float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f }; + simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, + bsize, features, + FEATURE_SMS_SPLIT_MODEL_FLAG); + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col); + + // Note: it is intended to not normalize the features here, to keep it + // consistent for all features collected and passed to the external model. 
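+  // If the external partition model is in use, let it set the NONE/SPLIT/RECT
+  // decisions here and return early, skipping the built-in
+  // simple-motion-search split model below.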
+ if (ext_ml_model_decision_before_none( + cpi, features, &part_state->partition_none_allowed, + &part_state->partition_rect_allowed[HORZ], + &part_state->partition_rect_allowed[VERT], + &part_state->do_rectangular_split, &part_state->do_square_split)) { + return; + } + + for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) { + features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx]; + } + + float score = 0.0f; + + av1_nn_predict(features, nn_config, 1, &score); + + if (score > split_only_thresh) { + av1_set_square_split_only(part_state); + } + + if (cpi->sf.part_sf.simple_motion_search_split >= 2 && + score < no_split_thresh) { + av1_disable_square_split_partition(part_state); + } + + // If the score is very low, prune rectangular split since it is unlikely to + // occur. + if (cpi->sf.part_sf.simple_motion_search_rect_split) { + const float scale = res_idx >= 2 ? 3.0f : 2.0f; + const float rect_split_thresh = + scale * av1_simple_motion_search_no_split_thresh + [cpi->sf.part_sf.simple_motion_search_rect_split][res_idx] + [bsize_idx]; + if (score < rect_split_thresh) { + part_state->do_rectangular_split = 0; + } + } +} + +// Given a list of ref frames in refs, performs simple_motion_search on each of +// the refs and returns the ref with the smallest sse. Returns -1 if none of the +// ref in the list is available. Also stores the best sse and var in best_sse, +// best_var, respectively. If save_mv is 0, don't update mv_ref_fulls in +// sms_tree. If save_mv is 1, update mv_ref_fulls under sms_tree and the +// subtrees. +static int simple_motion_search_get_best_ref( + AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, + int mi_row, int mi_col, BLOCK_SIZE bsize, const int *const refs, + int num_refs, int use_subpixel, int save_mv, unsigned int *best_sse, + unsigned int *best_var) { + const AV1_COMMON *const cm = &cpi->common; + int best_ref = -1; + + if (mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows) { + // If the whole block is outside of the image, set the var and sse to 0. + *best_var = 0; + *best_sse = 0; + + return best_ref; + } + + // Otherwise do loop through the reference frames and find the one with the + // minimum SSE + const int num_planes = 1; + + *best_sse = INT_MAX; + + for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) { + const int ref = refs[ref_idx]; + + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) { + const FULLPEL_MV *start_mvs = sms_tree->start_mvs; + unsigned int curr_sse = 0, curr_var = 0; + const int_mv best_mv = av1_simple_motion_search_sse_var( + cpi, x, mi_row, mi_col, bsize, ref, start_mvs[ref], num_planes, + use_subpixel, &curr_sse, &curr_var); + if (curr_sse < *best_sse) { + *best_sse = curr_sse; + *best_var = curr_var; + best_ref = ref; + } + + if (save_mv) { + sms_tree->start_mvs[ref].row = best_mv.as_mv.row / 8; + sms_tree->start_mvs[ref].col = best_mv.as_mv.col / 8; + + if (bsize >= BLOCK_8X8) { + for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) { + // Propagate the new motion vectors to a lower level + SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx]; + sub_tree->start_mvs[ref] = sms_tree->start_mvs[ref]; + } + } + } + } + } + + return best_ref; +} + +// Collects features using simple_motion_search and store them in features. The +// features are also cached in SIMPLE_MOTION_DATA_TREE. By default, the features +// collected are the sse and var from the subblocks flagged by features_to_get. 
+// Furthermore, if features is not NULL, then 7 more features are appended to +// the end of features: +// - log(1.0 + dc_q ** 2) +// - whether an above macroblock exists +// - width of above macroblock +// - height of above macroblock +// - whether a left marcoblock exists +// - width of left macroblock +// - height of left macroblock +static AOM_INLINE void simple_motion_search_prune_part_features( + AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, + int mi_row, int mi_col, BLOCK_SIZE bsize, float *features, + int features_to_get) { + const int w_mi = mi_size_wide[bsize]; + const int h_mi = mi_size_high[bsize]; + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + assert(bsize >= BLOCK_8X8); + assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] || + cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]); + + // Setting up motion search + const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME + : LAST_FRAME }; + const int num_refs = 1; + const int use_subpixel = 1; + + // Doing whole block first to update the mv + if (!sms_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) { + simple_motion_search_get_best_ref(cpi, x, sms_tree, mi_row, mi_col, bsize, + ref_list, num_refs, use_subpixel, 1, + &sms_tree->sms_none_feat[0], + &sms_tree->sms_none_feat[1]); + sms_tree->sms_none_valid = 1; + } + + // Split subblocks + if (features_to_get & FEATURE_SMS_SPLIT_FLAG) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) { + const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2; + const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2; + SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx]; + + if (!sub_tree->sms_none_valid) { + simple_motion_search_get_best_ref( + cpi, x, sub_tree, sub_mi_row, sub_mi_col, subsize, ref_list, + num_refs, use_subpixel, 1, &sub_tree->sms_none_feat[0], + &sub_tree->sms_none_feat[1]); + sub_tree->sms_none_valid = 1; + } + } + } + + // Rectangular subblocks + if (!sms_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) { + // Horz subblock + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) { + const int sub_mi_col = mi_col + 0; + const int sub_mi_row = mi_row + r_idx * h_mi / 2; + + simple_motion_search_get_best_ref( + cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, 0, &sms_tree->sms_rect_feat[2 * r_idx], + &sms_tree->sms_rect_feat[2 * r_idx + 1]); + } + + // Vert subblock + subsize = get_partition_subsize(bsize, PARTITION_VERT); + for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) { + const int sub_mi_col = mi_col + r_idx * w_mi / 2; + const int sub_mi_row = mi_row + 0; + + simple_motion_search_get_best_ref( + cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, 0, &sms_tree->sms_rect_feat[4 + 2 * r_idx], + &sms_tree->sms_rect_feat[4 + 2 * r_idx + 1]); + } + sms_tree->sms_rect_valid = 1; + } + + if (!features) return; + + int f_idx = 0; + if (features_to_get & FEATURE_SMS_NONE_FLAG) { + for (int sub_idx = 0; sub_idx < 2; sub_idx++) { + features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[sub_idx]); + } + } + + if (features_to_get & FEATURE_SMS_SPLIT_FLAG) { + for (int sub_idx = 0; sub_idx < SUB_PARTITIONS_SPLIT; sub_idx++) { + SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[sub_idx]; + features[f_idx++] = 
log1pf((float)sub_tree->sms_none_feat[0]); + features[f_idx++] = log1pf((float)sub_tree->sms_none_feat[1]); + } + } + + if (features_to_get & FEATURE_SMS_RECT_FLAG) { + for (int sub_idx = 0; sub_idx < 8; sub_idx++) { + features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[sub_idx]); + } + } + + const MACROBLOCKD *xd = &x->e_mbd; + set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); + + // Q_INDEX + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + features[f_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f); + + // Neighbor stuff + const int has_above = !!xd->above_mbmi; + const int has_left = !!xd->left_mbmi; + const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->bsize : bsize; + const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->bsize : bsize; + features[f_idx++] = (float)has_above; + features[f_idx++] = (float)mi_size_wide_log2[above_bsize]; + features[f_idx++] = (float)mi_size_high_log2[above_bsize]; + features[f_idx++] = (float)has_left; + features[f_idx++] = (float)mi_size_wide_log2[left_bsize]; + features[f_idx++] = (float)mi_size_high_log2[left_bsize]; +} + +void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_tree, + PartitionSearchState *part_state) { + const AV1_COMMON *const cm = &cpi->common; + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + const int bsize_idx = convert_bsize_to_idx(bsize); + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + // res_idx is 0 for lowres, 1 for 48p, 2 for 720p+ + const int res_idx = is_480p_or_larger + is_720p_or_larger; + + // Get model parameters + const NN_CONFIG *nn_config = + av1_simple_motion_search_prune_rect_nn_config[bsize_idx]; + const float *ml_mean = av1_simple_motion_search_prune_rect_mean[bsize_idx], + *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx]; + + const int agg = get_simple_motion_search_prune_agg( + x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 1); + if (agg < 0) { + return; + } + + const float prune_thresh = + av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx]; + + // If there is no valid threshold, return immediately. + if (!nn_config || prune_thresh == 0.0f) { + return; + } + + // Get features + float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f }; + simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, + bsize, features, + FEATURE_SMS_PRUNE_PART_FLAG); + + // Note: it is intended to not normalize the features here, to keep it + // consistent for all features collected and passed to the external model. 
+ if (cpi->sf.part_sf.simple_motion_search_prune_rect && + !frame_is_intra_only(cm) && + (part_state->partition_rect_allowed[HORZ] || + part_state->partition_rect_allowed[VERT]) && + bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) { + // Write features to file + write_features_to_file( + cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, + features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_before_none_part2( + cpi, features, &part_state->prune_rect_part[HORZ], + &part_state->prune_rect_part[VERT])) { + return; + } + } + + for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) { + features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; + } + + // Get probabilities + float scores[EXT_PARTITION_TYPES] = { 0.0f }, + probs[EXT_PARTITION_TYPES] = { 0.0f }; + const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8) + ? PARTITION_TYPES + : EXT_PARTITION_TYPES; + + av1_nn_predict(features, nn_config, 1, scores); + + av1_nn_softmax(scores, probs, num_classes); + + // Determine if we should prune rectangular partitions. + if (probs[PARTITION_HORZ] <= prune_thresh) { + part_state->prune_rect_part[HORZ] = 1; + } + if (probs[PARTITION_VERT] <= prune_thresh) { + part_state->prune_rect_part[VERT] = 1; + } +} + +// Early terminates PARTITION_NONE using simple_motion_search features and the +// rate, distortion, and rdcost of PARTITION_NONE. This is only called when: +// - The frame is a show frame +// - The frame is not intra only +// - The current bsize is > BLOCK_8X8 +// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols +void av1_simple_motion_search_early_term_none( + AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree, + const RD_STATS *none_rdc, PartitionSearchState *part_state) { + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f }; + simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, + bsize, features, + FEATURE_SMS_PRUNE_PART_FLAG); + int f_idx = FEATURE_SIZE_SMS_PRUNE_PART; + + features[f_idx++] = log1pf((float)none_rdc->rate); + features[f_idx++] = log1pf((float)none_rdc->dist); + features[f_idx++] = log1pf((float)none_rdc->rdcost); + + assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE); + + const float *ml_mean = NULL; + const float *ml_std = NULL; + const float *ml_model = NULL; + + if (bsize == BLOCK_128X128) { + ml_mean = av1_simple_motion_search_term_none_mean_128; + ml_std = av1_simple_motion_search_term_none_std_128; + ml_model = av1_simple_motion_search_term_none_model_128; + } else if (bsize == BLOCK_64X64) { + ml_mean = av1_simple_motion_search_term_none_mean_64; + ml_std = av1_simple_motion_search_term_none_std_64; + ml_model = av1_simple_motion_search_term_none_model_64; + } else if (bsize == BLOCK_32X32) { + ml_mean = av1_simple_motion_search_term_none_mean_32; + ml_std = av1_simple_motion_search_term_none_std_32; + ml_model = av1_simple_motion_search_term_none_model_32; + } else if (bsize == BLOCK_16X16) { + ml_mean = av1_simple_motion_search_term_none_mean_16; + ml_std = av1_simple_motion_search_term_none_std_16; + ml_model = av1_simple_motion_search_term_none_model_16; + } else { + assert(0 && "Unexpected block size in simple_motion_term_none"); + } + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, 
+ cpi->ext_part_controller.test_mode, features, + FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_none_part2( + cpi, features, &part_state->terminate_partition_search)) { + return; + } + + if (ml_model) { + float score = 0.0f; + for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) { + score += + ml_model[f_idx] * (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; + } + score += ml_model[FEATURE_SIZE_SMS_TERM_NONE]; + + if (score >= 0.0f) { + part_state->terminate_partition_search = 1; + } + } +} + +void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, + float *features) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + + // Currently this only allows 128X128 SB size. May extend it to 64X64 SB size. + assert(sb_size == BLOCK_128X128); + + int f_idx = 0; + + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + const float log_q_sq = log1pf((float)(dc_q * dc_q) / 256.0f); + + // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb + float sum_mv_row_sq = 0; + float sum_mv_row = 0; + float min_abs_mv_row = FLT_MAX; + float max_abs_mv_row = 0; + + float sum_mv_col_sq = 0; + float sum_mv_col = 0; + float min_abs_mv_col = FLT_MAX; + float max_abs_mv_col = 0; + + float sum_log_sse_sq = 0; + float sum_log_sse = 0; + float min_log_sse = FLT_MAX; + float max_log_sse = 0; + + const BLOCK_SIZE mb_size = BLOCK_16X16; + const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size]; + const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size]; + const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size]; + const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size]; + + for (int mb_row = 0; mb_row < mb_rows; mb_row++) + for (int mb_col = 0; mb_col < mb_cols; mb_col++) { + const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2); + const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2); + unsigned int sse = 0; + unsigned int var = 0; + const FULLPEL_MV start_mv = kZeroFullMv; + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? 
ALTREF_FRAME : LAST_FRAME; + const int_mv best_mv = av1_simple_motion_search_sse_var( + cpi, x, this_mi_row, this_mi_col, mb_size, ref, start_mv, 1, 0, &sse, + &var); + + const float mv_row = (float)(best_mv.as_mv.row / 8); + const float mv_col = (float)(best_mv.as_mv.col / 8); + const float log_sse = log1pf((float)sse); + const float abs_mv_row = fabsf(mv_row); + const float abs_mv_col = fabsf(mv_col); + + sum_mv_row_sq += mv_row * mv_row; + sum_mv_row += mv_row; + sum_mv_col_sq += mv_col * mv_col; + sum_mv_col += mv_col; + + if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row; + if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row; + if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col; + if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col; + + sum_log_sse_sq += log_sse * log_sse; + sum_log_sse += log_sse; + if (log_sse < min_log_sse) min_log_sse = log_sse; + if (log_sse > max_log_sse) max_log_sse = log_sse; + } + const int blks = mb_rows * mb_cols; + const float avg_mv_row = sum_mv_row / (float)blks; + const float var_mv_row = + sum_mv_row_sq / (float)blks - avg_mv_row * avg_mv_row; + + const float avg_mv_col = sum_mv_col / (float)blks; + const float var_mv_col = + sum_mv_col_sq / (float)blks - avg_mv_col * avg_mv_col; + + const float avg_log_sse = sum_log_sse / (float)blks; + const float var_log_sse = + sum_log_sse_sq / (float)blks - avg_log_sse * avg_log_sse; + + features[f_idx++] = avg_log_sse; + features[f_idx++] = avg_mv_col; + features[f_idx++] = avg_mv_row; + features[f_idx++] = log_q_sq; + features[f_idx++] = max_abs_mv_col; + features[f_idx++] = max_abs_mv_row; + features[f_idx++] = max_log_sse; + features[f_idx++] = min_abs_mv_col; + features[f_idx++] = min_abs_mv_row; + features[f_idx++] = min_log_sse; + features[f_idx++] = var_log_sse; + features[f_idx++] = var_mv_col; + features[f_idx++] = var_mv_row; + + assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED); +} + +// Convert result index to block size. 
+// result idx block size +// 0 BLOCK_16X16 +// 1 BLOCK_32X32 +// 2 BLOCK_64X64 +// 3 BLOCK_128X128 +static BLOCK_SIZE get_block_size(int idx) { + return (BLOCK_SIZE)((idx + 2) * 3); +} + +BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const float *features) { + float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config; + + assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != + NOT_IN_USE); + + av1_nn_predict(features, nn_config, 1, scores); + + int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; + if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + DIRECT_PRED) { + result = 0; + float max_score = scores[0]; + for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) { + if (scores[i] > max_score) { + max_score = scores[i]; + result = i; + } + } + return get_block_size(result); + } + + float probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); + + if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + RELAXED_PRED) { + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > 0.2) break; + } + } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + ADAPT_PRED) { + const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; + // TODO(debargha): x->source_variance is unavailable at this point, + // so compute. The redundant recomputation later can be removed. + const unsigned int source_variance = av1_get_perpixel_variance_facade( + cpi, &x->e_mbd, &x->plane[0].src, sb_size, AOM_PLANE_Y); + if (source_variance > 16) { + const double thresh = source_variance < 128 ? 0.05 : 0.1; + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > thresh) break; + } + } + } + + return get_block_size(result); +} + +// Get the minimum partition block width and height(in log scale) under a +// SIMPLE_MOTION_DATA_TREE. +static AOM_INLINE void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree, + int *min_bw, int *min_bh) { + if (!sms_tree) return; + + const BLOCK_SIZE bsize = sms_tree->block_size; + if (bsize == BLOCK_4X4) { + *min_bw = 0; + *min_bh = 0; + return; + } + + PARTITION_TYPE part_type = sms_tree->partitioning; + if (part_type == PARTITION_INVALID) return; + + if (part_type == PARTITION_SPLIT) { + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + get_min_bsize(sms_tree->split[i], min_bw, min_bh); + } + } else { + if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B || + part_type == PARTITION_VERT_A || part_type == PARTITION_VERT_B) + part_type = PARTITION_SPLIT; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, part_type); + if (subsize != BLOCK_INVALID) { + *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]); + *min_bh = AOMMIN(*min_bh, mi_size_high_log2[subsize]); + } + } +} + +static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features, + int *feature_idx) { + const int rd_valid = rd > 0 && rd < INT64_MAX; + const float rd_ratio = rd_valid ? 
(float)rd / best_rd : 1.0f; + features[(*feature_idx)++] = (float)rd_valid; + features[(*feature_idx)++] = rd_ratio; +} + +#define FEATURES 31 +void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, + SIMPLE_MOTION_DATA_TREE *const sms_tree, + int64_t best_rd, int64_t part_none_rd, + int64_t part_split_rd, + int64_t *split_block_rd, + PartitionSearchState *part_state) { + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + if (best_rd <= 0 || best_rd == INT64_MAX || + part_state->terminate_partition_search) + return; + + const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const NN_CONFIG *nn_config = NULL; + float thresh = -1e6; + switch (bsize) { + case BLOCK_128X128: break; + case BLOCK_64X64: + nn_config = &av1_early_term_after_split_nnconfig_64; + thresh = is_480p_or_larger ? -2.0f : -1.2f; + break; + case BLOCK_32X32: + nn_config = &av1_early_term_after_split_nnconfig_32; + thresh = is_480p_or_larger ? -2.6f : -2.3f; + break; + case BLOCK_16X16: + nn_config = &av1_early_term_after_split_nnconfig_16; + thresh = is_480p_or_larger ? -2.0f : -2.4f; + break; + case BLOCK_8X8: + nn_config = &av1_early_term_after_split_nnconfig_8; + thresh = is_480p_or_larger ? -1.0f : -1.4f; + break; + case BLOCK_4X4: break; + default: + assert(0 && "Invalid block size in av1_ml_early_term_after_split()."); + break; + } + if (!nn_config) return; + + // Use more conservative threshold for level 1. + if (cpi->sf.part_sf.ml_early_term_after_part_split_level < 2) thresh -= 0.3f; + + const MACROBLOCKD *const xd = &x->e_mbd; + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + const int bs = block_size_wide[bsize]; + int f_idx = 0; + float features[FEATURES] = { 0.0f }; + + features[f_idx++] = log1pf((float)dc_q / 4.0f); + features[f_idx++] = log1pf((float)best_rd / bs / bs / 1024.0f); + + add_rd_feature(part_none_rd, best_rd, features, &f_idx); + add_rd_feature(part_split_rd, best_rd, features, &f_idx); + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + add_rd_feature(split_block_rd[i], best_rd, features, &f_idx); + int min_bw = MAX_SB_SIZE_LOG2; + int min_bh = MAX_SB_SIZE_LOG2; + get_min_bsize(sms_tree->split[i], &min_bw, &min_bh); + features[f_idx++] = (float)min_bw; + features[f_idx++] = (float)min_bh; + } + + simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, + bsize, NULL, + FEATURE_SMS_PRUNE_PART_FLAG); + + features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[1]); + + features[f_idx++] = log1pf((float)sms_tree->split[0]->sms_none_feat[1]); + features[f_idx++] = log1pf((float)sms_tree->split[1]->sms_none_feat[1]); + features[f_idx++] = log1pf((float)sms_tree->split[2]->sms_none_feat[1]); + features[f_idx++] = log1pf((float)sms_tree->split[3]->sms_none_feat[1]); + + features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[1]); + features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[3]); + features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[5]); + features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[7]); + + assert(f_idx == FEATURES); + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, FEATURES, + 4, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_split( + cpi, features, &part_state->terminate_partition_search)) { + return; + } 
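+  // No external decision was taken: fall back to the built-in early-termination
+  // network and compare its confidence score against the threshold chosen above.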
+ + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + // Score is indicator of confidence that we should NOT terminate. + if (score < thresh) { + part_state->terminate_partition_search = 1; + } +} +#undef FEATURES + +void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x, + int64_t best_rd, int64_t none_rd, + const int64_t *split_rd, + PartitionSearchState *part_state) { + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + best_rd = AOMMAX(best_rd, 1); + const NN_CONFIG *nn_config = NULL; + const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f }; + float cur_thresh = 0.0f; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_rect_partition_nnconfig_8; + cur_thresh = prob_thresholds[0]; + break; + case BLOCK_16X16: + nn_config = &av1_rect_partition_nnconfig_16; + cur_thresh = prob_thresholds[1]; + break; + case BLOCK_32X32: + nn_config = &av1_rect_partition_nnconfig_32; + cur_thresh = prob_thresholds[2]; + break; + case BLOCK_64X64: + nn_config = &av1_rect_partition_nnconfig_64; + cur_thresh = prob_thresholds[3]; + break; + case BLOCK_128X128: + nn_config = &av1_rect_partition_nnconfig_128; + cur_thresh = prob_thresholds[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + // 1. Compute input features + float features[9]; + + // RD cost ratios + for (int i = 0; i < 5; i++) features[i] = 1.0f; + if (none_rd > 0 && none_rd < 1000000000) + features[0] = (float)none_rd / (float)best_rd; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + features[1 + i] = (float)split_rd[i] / (float)best_rd; + } + + // Variance ratios + const MACROBLOCKD *const xd = &x->e_mbd; + int whole_block_variance; + whole_block_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + whole_block_variance = AOMMAX(whole_block_variance, 1); + + int split_variance[SUB_PARTITIONS_SPLIT]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + struct buf_2d buf; + buf.stride = x->plane[0].src.stride; + const int bw = block_size_wide[bsize]; + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + const int x_idx = (i & 1) * bw / 2; + const int y_idx = (i >> 1) * bw / 2; + buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; + split_variance[i] = + av1_get_perpixel_variance_facade(cpi, xd, &buf, subsize, AOM_PLANE_Y); + } + + for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) + features[5 + i] = (float)split_variance[i] / (float)whole_block_variance; + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + /*feature_size=*/9, 5, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_split_part2( + &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), + features, &part_state->prune_rect_part[HORZ], + &part_state->prune_rect_part[VERT])) { + return; + } + + // 2. 
Do the prediction and prune 0-2 partitions based on their probabilities + float raw_scores[3] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, raw_scores); + float probs[3] = { 0.0f }; + av1_nn_softmax(raw_scores, probs, 3); + + // probs[0] is the probability of the fact that both rectangular partitions + // are worse than current best_rd + if (probs[1] <= cur_thresh) part_state->prune_rect_part[HORZ] = 1; + if (probs[2] <= cur_thresh) part_state->prune_rect_part[VERT] = 1; +} + +// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be +// considered. +void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx, + int64_t best_rd, + PartitionSearchState *part_state, + int *ab_partitions_allowed) { + const PartitionBlkParams blk_params = part_state->part_blk_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + const NN_CONFIG *nn_config = NULL; + switch (bsize) { + case BLOCK_8X8: nn_config = NULL; break; + case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break; + case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + // Generate features. + float features[10]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)var_ctx; + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + const int64_t *horz_rd = part_state->rect_part_rd[HORZ]; + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + const int64_t *vert_rd = part_state->rect_part_rd[VERT]; + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + const int64_t *split_rd = part_state->split_rd; + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + assert(feature_index == 10); + + // Write features to file + if (!frame_is_intra_only(&cpi->common)) { + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + /*feature_size=*/10, 6, bsize, mi_row, mi_col); + } + + if (ext_ml_model_decision_after_rect( + &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), + features, &ab_partitions_allowed[HORZ_A], + &ab_partitions_allowed[HORZ_B], &ab_partitions_allowed[VERT_A], + &ab_partitions_allowed[VERT_B])) { + return; + } + + // Calculate scores using the NN model. + float score[16] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, score); + int int_score[16]; + int max_score = -1000; + for (int i = 0; i < 16; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. 
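+  // Each of the 16 output labels encodes a 4-bit mask over
+  // {HORZ_A, HORZ_B, VERT_A, VERT_B}; an AB partition stays allowed if any
+  // label scoring within the block-size dependent margin of the best score
+  // has the corresponding bit set.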
+ int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 150; break; + case BLOCK_32X32: thresh -= 100; break; + default: break; + } + av1_zero_array(ab_partitions_allowed, NUM_AB_PARTS); + for (int i = 0; i < 16; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) ab_partitions_allowed[HORZ_A] = 1; + if ((i >> 1) & 1) ab_partitions_allowed[HORZ_B] = 1; + if ((i >> 2) & 1) ab_partitions_allowed[VERT_A] = 1; + if ((i >> 3) & 1) ab_partitions_allowed[VERT_B] = 1; + } + } +} + +#define FEATURES 18 +#define LABELS 4 +// Use a ML model to predict if horz4 and vert4 should be considered. +void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x, + int part_ctx, int64_t best_rd, + PartitionSearchState *part_state, + int *part4_allowed, + unsigned int pb_source_variance) { + const PartitionBlkParams blk_params = part_state->part_blk_params; + const int mi_row = blk_params.mi_row; + const int mi_col = blk_params.mi_col; + const BLOCK_SIZE bsize = blk_params.bsize; + + int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd; + int64_t *split_rd = part_state->split_rd; + if (ext_ml_model_decision_after_part_ab( + cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd, + &part4_allowed[HORZ4], &part4_allowed[VERT4], pb_source_variance, + mi_row, mi_col)) + return; + + if (best_rd >= 1000000000) return; + int64_t *horz_rd = rect_part_rd[HORZ4]; + int64_t *vert_rd = rect_part_rd[VERT4]; + const NN_CONFIG *nn_config = NULL; + // 4-way partitions are only allowed for these three square block sizes. + switch (bsize) { + case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + // Generate features. + float features[FEATURES]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)get_unsigned_bits(pb_source_variance); + + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + + // Get variance of the 1:4 and 4:1 sub-blocks. 
+ unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + { + BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); + BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + + assert(horz_4_bs != BLOCK_INVALID); + assert(vert_4_bs != BLOCK_INVALID); + + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, + av1_num_planes(&cpi->common), bsize); + const int src_stride = x->plane[0].src.stride; + uint8_t *src = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + + struct buf_2d horz_4_src, vert_4_src; + horz_4_src.stride = src_stride; + vert_4_src.stride = src_stride; + + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride; + vert_4_src.buf = src + i * block_size_wide[vert_4_bs]; + + horz_4_source_var[i] = av1_get_perpixel_variance_facade( + cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y); + vert_4_source_var[i] = av1_get_perpixel_variance_facade( + cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y); + } + } + + const float denom = (float)(pb_source_variance + 1); + const float low_b = 0.1f; + const float high_b = 10.0f; + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 4:1 sub-block variance and the whole-block variance. + float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 1:4 sub-block RD and the whole-block RD. + float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + assert(feature_index == FEATURES); + + // Write features to file + if (!frame_is_intra_only(&cpi->common)) { + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURES, 7, bsize, mi_row, mi_col); + } + + // Calculate scores using the NN model. + float score[LABELS] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, score); + int int_score[LABELS]; + int max_score = -1000; + for (int i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. 
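+  // The 4 output labels encode a 2-bit mask over {HORZ4, VERT4}; a 4-way
+  // partition is kept only if some label scoring within the margin of the best
+  // score sets the corresponding bit.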
+ int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 500; break; + case BLOCK_32X32: thresh -= 500; break; + case BLOCK_64X64: thresh -= 200; break; + default: break; + } + av1_zero_array(part4_allowed, NUM_PART4_TYPES); + for (int i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) part4_allowed[HORZ4] = 1; + if ((i >> 1) & 1) part4_allowed[VERT4] = 1; + } + } +} +#undef FEATURES +#undef LABELS + +#define FEATURES 4 +void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + unsigned int pb_source_variance, int bit_depth, + PartitionSearchState *part_state) { + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col; + const BLOCK_SIZE bsize = blk_params->bsize; + + const NN_CONFIG *nn_config = NULL; + int thresh = 0; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_partition_breakout_nnconfig_8; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[0]; + break; + case BLOCK_16X16: + nn_config = &av1_partition_breakout_nnconfig_16; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &av1_partition_breakout_nnconfig_32; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &av1_partition_breakout_nnconfig_64; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[3]; + break; + case BLOCK_128X128: + nn_config = &av1_partition_breakout_nnconfig_128; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config || thresh < 0) return; + + const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f }; + thresh = (int)((float)thresh * + ml_predict_breakout_thresh_scale + [cpi->sf.part_sf.ml_predict_breakout_level - 1]); + + // Generate feature values. + float features[FEATURES]; + int feature_index = 0; + + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX); + rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + features[feature_index++] = rate_f; + + const float dist_f = + (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2); + features[feature_index++] = dist_f; + + features[feature_index++] = (float)pb_source_variance; + + const int dc_q = (int)x->plane[0].dequant_QTX[0] >> (bit_depth - 8); + features[feature_index++] = (float)(dc_q * dc_q) / 256.0f; + assert(feature_index == FEATURES); + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, FEATURES, + 2, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_none(&cpi->ext_part_controller, + frame_is_intra_only(&cpi->common), + features, &part_state->do_square_split, + &part_state->do_rectangular_split)) { + return; + } + + // Calculate score using the NN model. + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + + // Make decision. 
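+  // A high breakout score indicates the PARTITION_NONE result is already good
+  // enough, so both the square-split and rectangular-split searches are skipped.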
+ if ((int)(score * 100) >= thresh) { + part_state->do_square_split = 0; + part_state->do_rectangular_split = 0; + } +} +#undef FEATURES + +void av1_prune_partitions_before_search(AV1_COMP *const cpi, + MACROBLOCK *const x, + SIMPLE_MOTION_DATA_TREE *const sms_tree, + PartitionSearchState *part_state) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const BLOCK_SIZE bsize = blk_params->bsize; + + if (cpi->third_pass_ctx) { + int mi_row = blk_params->mi_row; + int mi_col = blk_params->mi_col; + double ratio_h, ratio_w; + av1_get_third_pass_ratio(cpi->third_pass_ctx, 0, cm->height, cm->width, + &ratio_h, &ratio_w); + THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( + cpi->third_pass_ctx, 0, mi_row, mi_col, ratio_h, ratio_w); + BLOCK_SIZE third_pass_bsize = + av1_get_third_pass_adjusted_blk_size(this_mi, ratio_h, ratio_w); + // check the actual partition of this block in the second pass + PARTITION_TYPE third_pass_part = + av1_third_pass_get_sb_part_type(cpi->third_pass_ctx, this_mi); + + int is_edge = (mi_row + mi_size_high[bsize] >= cm->mi_params.mi_rows) || + (mi_col + mi_size_wide[bsize] >= cm->mi_params.mi_cols); + + if (!is_edge && block_size_wide[bsize] >= 16) { + // If in second pass we used rectangular partition, then do not search for + // rectangular partition in the different direction. + if (third_pass_part != PARTITION_NONE) { + if (third_pass_part == PARTITION_HORZ || + third_pass_part == PARTITION_HORZ_4 || + third_pass_part == PARTITION_HORZ_A || + third_pass_part == PARTITION_HORZ_B) { + part_state->partition_rect_allowed[VERT] = 0; + } else if (third_pass_part == PARTITION_VERT || + third_pass_part == PARTITION_VERT_4 || + third_pass_part == PARTITION_VERT_A || + third_pass_part == PARTITION_VERT_B) { + part_state->partition_rect_allowed[HORZ] = 0; + } + } + + int minSize = AOMMIN(block_size_wide[third_pass_bsize], + block_size_high[third_pass_bsize]); + int maxSize = AOMMAX(block_size_wide[third_pass_bsize], + block_size_high[third_pass_bsize]); + if (block_size_wide[bsize] < minSize / 4) { + // Current partition is too small, just terminate + part_state->terminate_partition_search = 1; + return; + } else if (block_size_wide[bsize] < minSize / 2) { + if (third_pass_part != PARTITION_NONE) { + // Current partition is very small, and in second pass we used + // rectangular partition. Terminate the search here then. + part_state->terminate_partition_search = 1; + return; + } else { + // Partition is small, but we still check this partition, only disable + // further splits. + // TODO(any): check why this is not covered by the termination for < + // minSize/4. + av1_disable_square_split_partition(part_state); + av1_disable_rect_partitions(part_state); + return; + } + } else if (block_size_wide[bsize] > maxSize) { + // Partition is larger than in the second pass. Only allow split. + av1_set_square_split_only(part_state); + return; + } else if (block_size_wide[bsize] >= minSize && + block_size_wide[bsize] <= maxSize) { + // Partition is within a range where it is very likely to find a good + // choice, so do not prune anything. + return; + } + } + } + + // Prune rectangular partitions for larger blocks. 
+ if (bsize > cpi->sf.part_sf.rect_partition_eval_thresh) { + part_state->do_rectangular_split = 0; + part_state->partition_rect_allowed[HORZ] = 0; + part_state->partition_rect_allowed[VERT] = 0; + } + + // Prune rectangular, AB and 4-way partition based on q index and block size + if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 1) { + if (bsize == BLOCK_8X8 && x->qindex < 35) + av1_disable_rect_partitions(part_state); + + } else if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 2) { + // Enumeration difference between two square partitions + const int sqr_bsize_step = BLOCK_32X32 - BLOCK_16X16; + int max_bsize = + BLOCK_32X32 - (x->qindex * 3 / QINDEX_RANGE) * sqr_bsize_step; + max_bsize = AOMMAX(max_bsize, BLOCK_4X4); + const BLOCK_SIZE max_prune_bsize = + (BLOCK_SIZE)AOMMIN(max_bsize, BLOCK_32X32); + + // Prune partition + // qidx 0 to 85: prune bsize below BLOCK_32X32 + // qidx 86 to 170: prune bsize below BLOCK_16X16 + // qidx 171 to 255: prune bsize below BLOCK_8X8 + if (bsize < max_prune_bsize) { + av1_disable_rect_partitions(part_state); + } + } + + if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) { + const MACROBLOCKD *const xd = &x->e_mbd; + int prune_sub_8x8; + if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 2) { + prune_sub_8x8 = 1; + } else { + assert(cpi->sf.part_sf.prune_sub_8x8_partition_level == 1); + // Prune if both neighbors are available and either is > BLOCK_8X8 + prune_sub_8x8 = xd->left_available && xd->up_available && + (xd->left_mbmi->bsize > BLOCK_8X8 || + xd->above_mbmi->bsize > BLOCK_8X8); + } + if (prune_sub_8x8) { + av1_disable_all_splits(part_state); + } + } + + // A CNN-based speed feature pruning out either split or all non-split + // partition in INTRA frame coding. + const int try_intra_cnn_based_part_prune = + frame_is_intra_only(cm) && + cpi->sf.part_sf.intra_cnn_based_part_prune_level && + cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 && + blk_params->bsize_at_least_8x8 && + av1_is_whole_blk_in_frame(blk_params, mi_params); + + if (try_intra_cnn_based_part_prune) { + av1_intra_mode_cnn_partition( + &cpi->common, x, x->part_search_info.quad_tree_idx, + cpi->sf.part_sf.intra_cnn_based_part_prune_level, part_state); + } + + // Use simple motion search to prune out split or non-split partitions. This + // must be done prior to PARTITION_SPLIT to propagate the initial mvs to a + // smaller blocksize. + const int try_split_only = + cpi->sf.part_sf.simple_motion_search_split && + part_state->do_square_split && blk_params->bsize_at_least_8x8 && + av1_is_whole_blk_in_frame(blk_params, mi_params) && + !frame_is_intra_only(cm) && !av1_superres_scaled(cm); + + if (try_split_only) { + av1_simple_motion_search_based_split(cpi, x, sms_tree, part_state); + } + + // Use simple motion search to prune out rectangular partition in some + // direction. The results are stored in prune_horz and prune_vert in order to + // bypass future related pruning checks if a pruning decision has been made. + + // We want to search at least one partition mode, so don't prune if NONE and + // SPLIT are disabled. + const int non_rect_part_allowed = + part_state->do_square_split || part_state->partition_none_allowed; + // Only run the model if the partitions are not already pruned. 
+ const int rect_part_allowed = part_state->do_rectangular_split && + ((part_state->partition_rect_allowed[HORZ] && + !part_state->prune_rect_part[HORZ]) || + (part_state->partition_rect_allowed[VERT] && + !part_state->prune_rect_part[VERT])); + + const int try_prune_rect = cpi->sf.part_sf.simple_motion_search_prune_rect && + !frame_is_intra_only(cm) && + non_rect_part_allowed && rect_part_allowed && + !av1_superres_scaled(cm); + + if (try_prune_rect) { + av1_simple_motion_search_prune_rect(cpi, x, sms_tree, part_state); + } +} + +#ifndef NDEBUG +static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) { + return block_size_wide[bsize] == block_size_high[bsize]; +} +#endif // NDEBUG + +void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc, + PartitionSearchState *part_state) { + assert(is_bsize_square(sb_enc->max_partition_size)); + assert(is_bsize_square(sb_enc->min_partition_size)); + assert(sb_enc->min_partition_size <= sb_enc->max_partition_size); + const PartitionBlkParams *blk_params = &part_state->part_blk_params; + const BLOCK_SIZE bsize = blk_params->bsize; + assert(is_bsize_square(bsize)); + const int max_partition_size_1d = block_size_wide[sb_enc->max_partition_size]; + const int min_partition_size_1d = block_size_wide[sb_enc->min_partition_size]; + const int bsize_1d = block_size_wide[bsize]; + assert(min_partition_size_1d <= max_partition_size_1d); + const int is_le_min_sq_part = bsize_1d <= min_partition_size_1d; + const int is_gt_max_sq_part = bsize_1d > max_partition_size_1d; + if (is_gt_max_sq_part) { + // If current block size is larger than max, only allow split. + av1_set_square_split_only(part_state); + } else if (is_le_min_sq_part) { + // If current block size is less or equal to min, only allow none if valid + // block large enough; only allow split otherwise. + av1_disable_rect_partitions(part_state); + + // only disable square split when current block is not at the picture + // boundary. otherwise, inherit the square split flag from previous logic + if (av1_blk_has_rows_and_cols(blk_params)) { + part_state->do_square_split = 0; + } + part_state->partition_none_allowed = !(part_state->do_square_split); + } +} + +// Decide whether to evaluate the AB partition specified by part_type based on +// split and HORZ/VERT info +int evaluate_ab_partition_based_on_split( + const PC_TREE *pc_tree, PARTITION_TYPE rect_part, + const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1, + int split_idx2) { + int num_win = 0; + // Threshold for number of winners + // Conservative pruning for high quantizers + const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3); + int sub_part_win = + (rect_part_win_info == NULL) ? (pc_tree->partitioning == rect_part) + : (rect_part == PARTITION_HORZ) ? rect_part_win_info->rect_part_win[HORZ] + : rect_part_win_info->rect_part_win[VERT]; + num_win += (sub_part_win) ? 1 : 0; + if (pc_tree->split[split_idx1]) { + num_win += + (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0; + } else { + num_win += 1; + } + if (pc_tree->split[split_idx2]) { + num_win += + (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 
1 : 0; + } else { + num_win += 1; + } + if (num_win < num_win_thresh) { + return 0; + } + return 1; +} + +void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x, + const PC_TREE *pc_tree, int pb_source_variance, + int64_t best_rdcost, + const RD_RECT_PART_WIN_INFO *rect_part_win_info, + bool ext_partition_allowed, + PartitionSearchState *part_state, + int *ab_partitions_allowed) { + int64_t *horz_rd = part_state->rect_part_rd[HORZ]; + int64_t *vert_rd = part_state->rect_part_rd[VERT]; + int64_t *split_rd = part_state->split_rd; + const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg; + // The standard AB partitions are allowed initially if ext-partition-types are + // allowed. + int horzab_partition_allowed = ext_partition_allowed && + part_cfg->enable_ab_partitions && + part_state->partition_rect_allowed[HORZ]; + int vertab_partition_allowed = ext_partition_allowed && + part_cfg->enable_ab_partitions && + part_state->partition_rect_allowed[VERT]; + + // Pruning: pruning out AB partitions on one main direction based on the + // current best partition and source variance. + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) { + // TODO(debargha,huisu@google.com): may need to tune the threshold for + // pb_source_variance. + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + (pc_tree->partitioning == PARTITION_NONE && + pb_source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + (pc_tree->partitioning == PARTITION_NONE && + pb_source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + } else { + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + pc_tree->partitioning == PARTITION_SPLIT); + } + horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0); + horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0); + vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0); + vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0); + split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0); + split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0); + split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0); + split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0); + } + + // Pruning: pruning out horz_a or horz_b if the combined rdcost of its + // subblocks estimated from previous partitions is much higher than the best + // rd so far. + ab_partitions_allowed[HORZ_A] = horzab_partition_allowed; + ab_partitions_allowed[HORZ_B] = horzab_partition_allowed; + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1]; + const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3]; + switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + case 1: + ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 14 < best_rdcost); + ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 14 < best_rdcost); + break; + case 2: + default: + ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 15 < best_rdcost); + ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 15 < best_rdcost); + break; + } + } + + // Pruning: pruning out vert_a or vert_b if the combined rdcost of its + // subblocks estimated from previous partitions is much higher than the best + // rd so far. 
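+  // As with HORZ_A/HORZ_B above, the 14/16 (level 1) and 15/16 (level >= 2)
+  // factors allow the estimated cost to exceed the current best rd by a small
+  // margin before VERT_A/VERT_B are pruned.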
+ ab_partitions_allowed[VERT_A] = vertab_partition_allowed; + ab_partitions_allowed[VERT_B] = vertab_partition_allowed; + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2]; + const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3]; + switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + case 1: + ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 14 < best_rdcost); + ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 14 < best_rdcost); + break; + case 2: + default: + ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 15 < best_rdcost); + ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 15 < best_rdcost); + break; + } + } + + // Pruning: pruning out some ab partitions using a DNN taking rd costs of + // sub-blocks from previous basic partition types. + if (cpi->sf.part_sf.ml_prune_partition && ext_partition_allowed && + part_state->partition_rect_allowed[HORZ] && + part_state->partition_rect_allowed[VERT]) { + // TODO(huisu@google.com): x->source_variance may not be the current + // block's variance. The correct one to use is pb_source_variance. Need to + // re-train the model to fix it. + av1_ml_prune_ab_partition(cpi, pc_tree->partitioning, + get_unsigned_bits(x->source_variance), + best_rdcost, part_state, ab_partitions_allowed); + } + + // Pruning: pruning AB partitions based on the number of horz/vert wins + // in the current block and sub-blocks in PARTITION_SPLIT. + if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && + ab_partitions_allowed[HORZ_A]) { + ab_partitions_allowed[HORZ_A] &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1); + } + if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && + ab_partitions_allowed[HORZ_B]) { + ab_partitions_allowed[HORZ_B] &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3); + } + if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && + ab_partitions_allowed[VERT_A]) { + ab_partitions_allowed[VERT_A] &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2); + } + if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 && + ab_partitions_allowed[VERT_B]) { + ab_partitions_allowed[VERT_B] &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3); + } +} + +// Prepare features for the external model. Specifically, features after +// ab partition is searched. +static void prepare_features_after_part_ab( + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + int part_ctx, int64_t best_rd, + int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance, + int mi_row, int mi_col, aom_partition_features_t *const features) { + int64_t *horz_rd = rect_part_rd[HORZ]; + int64_t *vert_rd = rect_part_rd[VERT]; + + // Generate features. 
+ int feature_index = 0; + features->after_part_ab.f[feature_index++] = (float)part_ctx; + features->after_part_ab.f[feature_index++] = + (float)get_unsigned_bits(pb_source_variance); + + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features->after_part_ab.f[feature_index++] = rd_ratio; + } + + // 4-way partitions are only allowed for these three square block sizes. + assert(bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64); + + // Get variance of the 1:4 and 4:1 sub-blocks. + unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + { + BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); + BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + + assert(horz_4_bs != BLOCK_INVALID); + assert(vert_4_bs != BLOCK_INVALID); + + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, + av1_num_planes(&cpi->common), bsize); + const int src_stride = x->plane[0].src.stride; + uint8_t *src = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + + struct buf_2d horz_4_src, vert_4_src; + horz_4_src.stride = src_stride; + vert_4_src.stride = src_stride; + + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride; + vert_4_src.buf = src + i * block_size_wide[vert_4_bs]; + + horz_4_source_var[i] = av1_get_perpixel_variance_facade( + cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y); + vert_4_source_var[i] = av1_get_perpixel_variance_facade( + cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y); + } + } + + const float denom = (float)(pb_source_variance + 1); + const float low_b = 0.1f; + const float high_b = 10.0f; + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 4:1 sub-block variance and the whole-block variance. + float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features->after_part_ab.f[feature_index++] = var_ratio; + } + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 1:4 sub-block RD and the whole-block RD. + float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features->after_part_ab.f[feature_index++] = var_ratio; + } + assert(feature_index == 18); +} + +// If the external partition model is used, we let it determine partition +// decisions before partition none. 
Specifically, these parameters: +// partition_none_allowed +// partition_horz_allowed +// partition_vert_allowed +// do_rectangular_split +// do_square_split +static bool ext_ml_model_decision_before_none( + AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], + int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split) { + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE; + for (int i = 0; i < FEATURE_SIZE_SMS_SPLIT; ++i) { + features.before_part_none.f[i] = features_from_motion[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *partition_none_allowed = decision.partition_none_allowed; + *partition_horz_allowed = decision.partition_rect_allowed[HORZ]; + *partition_vert_allowed = decision.partition_rect_allowed[VERT]; + *do_rectangular_split = decision.do_rectangular_split; + *do_square_split = decision.do_square_split; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions before partition none. Specifically, these parameters: +// prune_horz +// prune_vert +static bool ext_ml_model_decision_before_none_part2( + AV1_COMP *cpi, + const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], + int *prune_horz, int *prune_vert) { + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2; + for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) { + features.before_part_none.f_part2[i] = features_from_motion[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *prune_horz = decision.prune_rect_part[HORZ]; + *prune_vert = decision.prune_rect_part[VERT]; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. Specifically, these parameters: +// do_square_split +// do_rectangular_split +bool ext_ml_model_decision_after_none( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_none, int *do_square_split, + int *do_rectangular_split) { + if (!ext_part_controller->ready || is_intra_frame) return false; + + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_AFTER_NONE; + for (int i = 0; i < 4; ++i) { + features.after_part_none.f[i] = features_after_none[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *do_square_split = decision.do_square_split;
+  *do_rectangular_split = decision.do_rectangular_split;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_none_part2(
+    AV1_COMP *const cpi, const float *const features_terminate,
+    int *terminate_partition_search) {
+  AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (!ext_part_controller->ready || frame_is_intra_only(cm)) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_NONE_PART2;
+  for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) {
+    features.after_part_none.f_terminate[i] = features_terminate[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *terminate_partition_search = decision.terminate_partition_search;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_split(AV1_COMP *const cpi,
+                                       const float *const features_terminate,
+                                       int *terminate_partition_search) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (frame_is_intra_only(cm) || !cpi->ext_part_controller.ready) {
+    return false;
+  }
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT;
+  for (int i = 0; i < 31; ++i) {
+    features.after_part_split.f_terminate[i] = features_terminate[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *terminate_partition_search = decision.terminate_partition_search;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, these parameters:
+// prune_rect_part[HORZ]
+// prune_rect_part[VERT]
+bool ext_ml_model_decision_after_split_part2(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_prune, int *prune_rect_part_horz,
+    int *prune_rect_part_vert) {
+  if (is_intra_frame || !ext_part_controller->ready) {
+    return false;
+  }
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2;
+  for (int i = 0; i < 9; ++i) {
+    features.after_part_split.f_prune_rect[i] = features_prune[i];
+  }
+
+  // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *prune_rect_part_horz = decision.prune_rect_part[0]; + *prune_rect_part_vert = decision.prune_rect_part[1]; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after rectangular partition. Specifically, these parameters: +// horza_partition_allowed +// horzb_partition_allowed +// verta_partition_allowed +// vertb_partition_allowed +static bool ext_ml_model_decision_after_rect( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_rect, int *horza_partition_allowed, + int *horzb_partition_allowed, int *verta_partition_allowed, + int *vertb_partition_allowed) { + if (is_intra_frame || !ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_AFTER_RECT; + for (int i = 0; i < 10; ++i) { + features.after_part_rect.f[i] = features_after_rect[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *horza_partition_allowed = decision.horza_partition_allowed; + *horzb_partition_allowed = decision.horzb_partition_allowed; + *verta_partition_allowed = decision.verta_partition_allowed; + *vertb_partition_allowed = decision.vertb_partition_allowed; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after AB partition. Specifically, these parameters: +// partition_vert4_allowed +// partition_horz4_allowed +static bool ext_ml_model_decision_after_part_ab( + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, + int *const partition_vert4_allowed, unsigned int pb_source_variance, + int mi_row, int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + + if (!frame_is_intra_only(cm) && ext_part_controller->ready) { + // Setup features. + aom_partition_features_t features; + features.id = AOM_EXT_PART_FEATURE_AFTER_AB; + prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd, + rect_part_rd, split_rd, pb_source_variance, + mi_row, mi_col, &features); + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
+ aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *partition_horz4_allowed = decision.partition_horz4_allowed; + *partition_vert4_allowed = decision.partition_vert4_allowed; + + return true; + } + + return false; +} + +// This function resembles "av1_setup_sms_tree()" in context_tree.c +// with function signature change. +static SIMPLE_MOTION_DATA_TREE *setup_sms_tree( + AV1_COMP *const cpi, SIMPLE_MOTION_DATA_TREE *sms_tree) { + AV1_COMMON *const cm = &cpi->common; + const int stat_generation_stage = is_stat_generation_stage(cpi); + const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; + const int tree_nodes = + av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); + int sms_tree_index = 0; + SIMPLE_MOTION_DATA_TREE *this_sms; + int square_index = 1; + int nodes; + this_sms = &sms_tree[0]; + + if (!stat_generation_stage) { + const int leaf_factor = is_sb_size_128 ? 4 : 1; + const int leaf_nodes = 256 * leaf_factor; + + // Sets up all the leaf nodes in the tree. + for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) { + SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index]; + tree->block_size = square[0]; + } + + // Each node has 4 leaf nodes, fill each block_size level of the tree + // from leafs to the root. + for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { + for (int i = 0; i < nodes; ++i) { + SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index]; + tree->block_size = square[square_index]; + for (int j = 0; j < 4; j++) tree->split[j] = this_sms++; + ++sms_tree_index; + } + ++square_index; + } + } else { + // Allocation for firstpass/LAP stage + // TODO(Mufaddal): refactor square_index to use a common block_size macro + // from firstpass.c + SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index]; + square_index = 2; + tree->block_size = square[square_index]; + } + + // Set up the root node for the largest superblock size + return &sms_tree[tree_nodes - 1]; +} + +static void write_motion_feature_to_file( + const char *const path, const int sb_counter, const unsigned int *block_sse, + const unsigned int *block_var, const int num_blocks, const BLOCK_SIZE bsize, + const BLOCK_SIZE fixed_block_size, const int mi_row, const int mi_col) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s/motion_search_feature_sb%d", path, + sb_counter); + FILE *pfile = fopen(filename, "w"); + fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize, + block_size_wide[fixed_block_size], num_blocks); + for (int i = 0; i < num_blocks; ++i) { + fprintf(pfile, "%d", block_sse[i]); + if (i < num_blocks - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + for (int i = 0; i < num_blocks; ++i) { + fprintf(pfile, "%d", block_var[i]); + if (i < num_blocks - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + fclose(pfile); +} + +void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, + const int mi_row, const int mi_col, + const BLOCK_SIZE bsize, + aom_partition_features_t *features) { + const AV1_COMMON *const cm = &cpi->common; + if (frame_is_intra_only(cm)) return; + + MACROBLOCK *const x = &td->mb; + const BLOCK_SIZE fixed_block_size = BLOCK_16X16; + const int col_step = mi_size_wide[fixed_block_size]; + const int row_step = mi_size_high[fixed_block_size]; + SIMPLE_MOTION_DATA_TREE *sms_tree = NULL; + const int 
stat_generation_stage = is_stat_generation_stage(cpi); + const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; + const int tree_nodes = + av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); + CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree))); + SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree); + TileInfo *const tile_info = &tile_data->tile_info; + av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize); + av1_init_simple_motion_search_mvs_for_sb(cpi, NULL, x, sms_root, mi_row, + mi_col); + av1_reset_simple_motion_tree_partition(sms_root, bsize); + const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME + : LAST_FRAME }; + const int mi_width = + AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col); + const int mi_height = + AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row); + const int col_steps = (mi_width / col_step) + ((mi_width % col_step) > 0); + const int row_steps = (mi_height / row_step) + ((mi_height % row_step) > 0); + const int num_blocks = col_steps * row_steps; + unsigned int *block_sse = aom_calloc(num_blocks, sizeof(*block_sse)); + unsigned int *block_var = aom_calloc(num_blocks, sizeof(*block_var)); + if (!(block_sse && block_var)) { + aom_free(sms_tree); + aom_free(block_sse); + aom_free(block_var); + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating block_sse & block_var"); + } + int idx = 0; + + for (int row = mi_row; + row < AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows); + row += row_step) { + for (int col = mi_col; + col < AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols); + col += col_step) { + simple_motion_search_get_best_ref( + cpi, x, sms_root, row, col, fixed_block_size, ref_list, + /*num_refs=*/1, /*use_subpixel=*/1, + /*save_mv=*/1, &block_sse[idx], &block_var[idx]); + ++idx; + } + } + if (features == NULL) { + write_motion_feature_to_file(cpi->oxcf.partition_info_path, cpi->sb_counter, + block_sse, block_var, idx, bsize, + fixed_block_size, mi_row, mi_col); + } else { + features->sb_features.motion_features.unit_length = + block_size_wide[fixed_block_size]; + features->sb_features.motion_features.num_units = idx; + for (int i = 0; i < idx; ++i) { + features->sb_features.motion_features.block_sse[i] = block_sse[i]; + features->sb_features.motion_features.block_var[i] = block_var[i]; + } + } + + aom_free(block_sse); + aom_free(block_var); + aom_free(sms_tree); +} + +void av1_prepare_motion_search_features_block( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + const int mi_row, const int mi_col, const BLOCK_SIZE bsize, + const int valid_partition_types, unsigned int *block_sse, + unsigned int *block_var, unsigned int sub_block_sse[4], + unsigned int sub_block_var[4], unsigned int horz_block_sse[2], + unsigned int horz_block_var[2], unsigned int vert_block_sse[2], + unsigned int vert_block_var[2]) { + const AV1_COMMON *const cm = &cpi->common; + if (frame_is_intra_only(cm)) return; + MACROBLOCK *const x = &td->mb; + SIMPLE_MOTION_DATA_TREE *sms_tree = NULL; + const int stat_generation_stage = is_stat_generation_stage(cpi); + const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; + const int tree_nodes = + av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); + CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree))); + SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree); + TileInfo *const tile_info = &tile_data->tile_info; + 
av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize); + av1_reset_simple_motion_tree_partition(sms_root, bsize); + const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME + : LAST_FRAME }; + const int sub_mi_width = mi_size_wide[bsize] / 2; + const int sub_mi_height = sub_mi_width; + simple_motion_search_get_best_ref( + cpi, x, sms_root, mi_row, mi_col, bsize, ref_list, /*num_refs=*/1, + /*use_subpixel=*/1, /*save_mv=*/1, block_sse, block_var); + // Split to 4 sub blocks. + if (valid_partition_types & (1 << PARTITION_SPLIT)) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int i = 0; i < 4; ++i) { + const int row = mi_row + (i >> 1) * sub_mi_height; + const int col = mi_col + (i & 1) * sub_mi_width; + simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize, + ref_list, /*num_refs=*/1, + /*use_subpixel=*/1, /*save_mv=*/1, + &sub_block_sse[i], &sub_block_var[i]); + } + } + // Horizontal split + if (valid_partition_types & (1 << PARTITION_HORZ)) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + for (int i = 0; i < 2; ++i) { + const int row = mi_row + (i & 1) * sub_mi_height; + const int col = mi_col; + simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize, + ref_list, /*num_refs=*/1, + /*use_subpixel=*/1, /*save_mv=*/1, + &horz_block_sse[i], &horz_block_var[i]); + } + } + // Vertical split + if (valid_partition_types & (1 << PARTITION_VERT)) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT); + for (int i = 0; i < 2; ++i) { + const int row = mi_row; + const int col = mi_col + (i & 1) * sub_mi_width; + simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize, + ref_list, /*num_refs=*/1, + /*use_subpixel=*/1, /*save_mv=*/1, + &vert_block_sse[i], &vert_block_var[i]); + } + } + + aom_free(sms_tree); +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE void init_simple_motion_search_mvs( + SIMPLE_MOTION_DATA_TREE *sms_tree, const FULLPEL_MV *start_mvs) { + memcpy(sms_tree->start_mvs, start_mvs, sizeof(sms_tree->start_mvs)); + av1_zero(sms_tree->sms_none_feat); + av1_zero(sms_tree->sms_rect_feat); + av1_zero(sms_tree->sms_none_valid); + av1_zero(sms_tree->sms_rect_valid); + + if (sms_tree->block_size >= BLOCK_8X8) { + init_simple_motion_search_mvs(sms_tree->split[0], start_mvs); + init_simple_motion_search_mvs(sms_tree->split[1], start_mvs); + init_simple_motion_search_mvs(sms_tree->split[2], start_mvs); + init_simple_motion_search_mvs(sms_tree->split[3], start_mvs); + } +} + +void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi, + const TileInfo *tile_info, + MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_root, + int mi_row, int mi_col) { + // Use the NEARESTMV of the sb as the start mv + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + FULLPEL_MV ref_mvs[REF_FRAMES]; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + av1_zero(ref_mvs); + // If tile_info is NULL, assume that the offsets have already been set. + if (tile_info) { + av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, + sb_size); + } + + MB_MODE_INFO_EXT mbmi_ext; + const int ref_frame = + cpi->rc.is_src_frame_alt_ref ? 
ALTREF_FRAME : LAST_FRAME; + av1_find_mv_refs(cm, xd, xd->mi[0], ref_frame, mbmi_ext.ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext.global_mvs, + mbmi_ext.mode_context); + if (mbmi_ext.ref_mv_count[ref_frame] > 0) { + ref_mvs[ref_frame] = + get_fullmv_from_mv(&xd->ref_mv_stack[ref_frame][0].this_mv.as_mv); + } else { + ref_mvs[ref_frame] = + get_fullmv_from_mv(&mbmi_ext.global_mvs[ref_frame].as_mv); + } + + init_simple_motion_search_mvs(sms_root, ref_mvs); +} diff --git a/third_party/aom/av1/encoder/partition_strategy.h b/third_party/aom/av1/encoder/partition_strategy.h new file mode 100644 index 0000000000..84683f5fd4 --- /dev/null +++ b/third_party/aom/av1/encoder/partition_strategy.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ +#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ + +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encoder.h" + +void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x, + int label_idx, + int intra_cnn_based_part_prune_level, + PartitionSearchState *part_state); + +// Performs a simple_motion_search with a single reference frame and extract +// the variance of residues. Then use the features to determine whether we want +// to go straight to splitting without trying PARTITION_NONE +void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_tree, + PartitionSearchState *part_state); + +// Performs a simple_motion_search with two reference frames and extract +// the variance of residues. Then use the features to determine whether we want +// to prune some partitions. +void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_tree, + PartitionSearchState *part_state); + +#if !CONFIG_REALTIME_ONLY +// Early terminates PARTITION_NONE using simple_motion_search features and the +// rate, distortion, and rdcost of PARTITION_NONE. This is only called when: +// - The frame is a show frame +// - The frame is not intra only +// - The current bsize is > BLOCK_8X8 +// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols +void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi, + MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_tree, + const RD_STATS *none_rdc, + PartitionSearchState *part_state); + +// Get the features for selecting the max and min partition size. Currently this +// performs simple_motion_search on 16X16 subblocks of the current superblock, +// and then extract the statistics of sse and motion vectors as features. +void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, + float *features); + +// Predict the maximum BLOCK_SIZE to be used to encoder the current superblock. 
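+// The prediction is combined with the encoder's configured limits in
+// set_max_min_partition_size() later in this header, where it is clamped
+// between min_partition_size and max_partition_size.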
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const float *features); + +// Attempts an early termination after PARTITION_SPLIT. +void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, + SIMPLE_MOTION_DATA_TREE *const sms_tree, + int64_t best_rd, int64_t part_none_rd, + int64_t part_split_rd, + int64_t *split_block_rd, + PartitionSearchState *part_state); + +// Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and +// PARTITION_VERT. +// TODO(chiyotsai@google.com): Currently this model does not use q value and has +// no information about rectangular partitions. Preliminary experiments suggest +// that we can get better performance by adding in q_index and rectangular +// sse/var from SMS. We should retrain and tune this model later. +void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x, + int64_t best_rd, int64_t none_rd, + const int64_t *split_rd, + PartitionSearchState *part_state); + +// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be +// considered. +void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx, + int64_t best_rd, + PartitionSearchState *part_state, + int *ab_partitions_allowed); + +// Use a ML model to predict if horz4 and vert4 should be considered. +void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x, + int part_ctx, int64_t best_rd, + PartitionSearchState *part_state, + int *part4_allowed, + unsigned int pb_source_variance); + +// ML-based partition search breakout after PARTITION_NONE. +void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + unsigned int pb_source_variance, int bit_depth, + PartitionSearchState *part_state); + +// The first round of partition pruning determined before any partition +// has been tested. The decisions will be updated and passed back +// to the partition search function. +void av1_prune_partitions_before_search(AV1_COMP *const cpi, + MACROBLOCK *const x, + SIMPLE_MOTION_DATA_TREE *const sms_tree, + PartitionSearchState *part_state); + +// Prune out partitions that lead to coding block sizes outside the min and max +// bsizes set by the encoder. Max and min square partition levels are defined as +// the partition nodes that the recursive function rd_pick_partition() can +// reach. To implement this: only PARTITION_NONE is allowed if the current node +// equals max_partition_size, only PARTITION_SPLIT is allowed if the current +// node exceeds max_partition_size. +void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc, + PartitionSearchState *part_state); + +// Prune out AB partitions based on rd decisions made from testing the +// basic partitions. 
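+// The candidates considered are HORZ_A, HORZ_B, VERT_A and VERT_B; the
+// decision for each is written into the ab_partitions_allowed array.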
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x, + const PC_TREE *pc_tree, int pb_source_variance, + int64_t best_rdcost, + const RD_RECT_PART_WIN_INFO *rect_part_win_info, + bool ext_partition_allowed, + PartitionSearchState *part_state, + int *ab_partitions_allowed); + +void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, + const int mi_row, const int mi_col, + const BLOCK_SIZE bsize, + aom_partition_features_t *features); +void av1_prepare_motion_search_features_block( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + const int mi_row, const int mi_col, const BLOCK_SIZE bsize, + const int valid_partition_types, unsigned int *block_sse, + unsigned int *block_var, unsigned int sub_block_sse[4], + unsigned int sub_block_var[4], unsigned int horz_block_sse[2], + unsigned int horz_block_var[2], unsigned int vert_block_sse[2], + unsigned int vert_block_var[2]); +#endif // !CONFIG_REALTIME_ONLY + +// A simplified version of set_offsets meant to be used for +// simple_motion_search. +static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *const x, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + + // Set up destination pointers. + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, mi_height, + mi_width, cpi->oxcf.border_in_pixels); + + set_plane_n4(xd, mi_width, mi_height, num_planes); + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = + GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); + xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); + xd->mb_to_right_edge = + GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE); + + // Set up source buffers. + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); +} + +void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi, + const TileInfo *tile_info, + MACROBLOCK *x, + SIMPLE_MOTION_DATA_TREE *sms_root, + int mi_row, int mi_col); + +static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, BLOCK_SIZE sb_size) { + const int sb_mi_wide = mi_size_wide[sb_size]; + const int sb_mi_high = mi_size_high[sb_size]; + + return (mi_row + sb_mi_high) <= mi_params->mi_rows && + (mi_col + sb_mi_wide) <= mi_params->mi_cols; +} + +#if !CONFIG_REALTIME_ONLY +// Do not use this criteria for screen content videos. +// Since screen content videos could often find good predictors and the largest +// block size is likely to be used. 
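+// Auto max partition is therefore only used for inter frames without screen
+// content tools, when the corresponding speed feature is enabled, for a full
+// 128X128 superblock that lies entirely inside the frame, and when the GF
+// update type is not an overlay or internal overlay update.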
+static INLINE int use_auto_max_partition(const AV1_COMP *const cpi, + BLOCK_SIZE sb_size, int mi_row, + int mi_col) { + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const AV1_COMMON *const cm = &cpi->common; + return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools && + cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != + NOT_IN_USE && + sb_size == BLOCK_128X128 && + is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != + OVERLAY_UPDATE && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != + INTNL_OVERLAY_UPDATE; +} + +static BLOCK_SIZE dim_to_size(int dim) { + switch (dim) { + case 4: return BLOCK_4X4; + case 8: return BLOCK_8X8; + case 16: return BLOCK_16X16; + case 32: return BLOCK_32X32; + case 64: return BLOCK_64X64; + case 128: return BLOCK_128X128; + default: assert(0); return 0; + } +} + +static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc, + AV1_COMP *cpi, MACROBLOCK *x, + const SPEED_FEATURES *sf, + BLOCK_SIZE sb_size, + int mi_row, int mi_col) { + const AV1_COMMON *cm = &cpi->common; + + sb_enc->max_partition_size = + AOMMIN(sf->part_sf.default_max_partition_size, + dim_to_size(cpi->oxcf.part_cfg.max_partition_size)); + sb_enc->min_partition_size = + AOMMAX(sf->part_sf.default_min_partition_size, + dim_to_size(cpi->oxcf.part_cfg.min_partition_size)); + sb_enc->max_partition_size = + AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size); + sb_enc->min_partition_size = + AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size); + + if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { + float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; + + av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); + sb_enc->max_partition_size = + AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features), + sb_enc->max_partition_size), + sb_enc->min_partition_size); + } +} +#endif // !CONFIG_REALTIME_ONLY +#endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ diff --git a/third_party/aom/av1/encoder/pass2_strategy.c b/third_party/aom/av1/encoder/pass2_strategy.c new file mode 100644 index 0000000000..a9442ffc1a --- /dev/null +++ b/third_party/aom/av1/encoder/pass2_strategy.c @@ -0,0 +1,4488 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\defgroup gf_group_algo Golden Frame Group + * \ingroup high_level_algo + * Algorithms regarding determining the length of GF groups and defining GF + * group structures. + * @{ + */ +/*! 
@} - end defgroup gf_group_algo */ + +#include +#include + +#include "aom_mem/aom_mem.h" +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom/aom_encoder.h" + +#include "av1/common/av1_common_int.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/gop_structure.h" +#include "av1/encoder/pass2_strategy.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rc_utils.h" +#include "av1/encoder/temporal_filter.h" +#include "av1/encoder/thirdpass.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/encode_strategy.h" + +#define DEFAULT_KF_BOOST 2300 +#define DEFAULT_GF_BOOST 2000 +#define GROUP_ADAPTIVE_MAXQ 1 + +static void init_gf_stats(GF_GROUP_STATS *gf_stats); +static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params, + int is_final_pass); + +// Calculate an active area of the image that discounts formatting +// bars and partially discounts other 0 energy areas. +#define MIN_ACTIVE_AREA 0.5 +#define MAX_ACTIVE_AREA 1.0 +static double calculate_active_area(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame) { + const double active_pct = + 1.0 - + ((this_frame->intra_skip_pct / 2) + + ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows)); + return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); +} + +// Calculate a modified Error used in distributing bits between easier and +// harder frames. +#define ACT_AREA_CORRECTION 0.5 +static double calculate_modified_err_new(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *total_stats, + const FIRSTPASS_STATS *this_stats, + int vbrbias, double modified_error_min, + double modified_error_max) { + if (total_stats == NULL) { + return 0; + } + const double av_weight = total_stats->weight / total_stats->count; + const double av_err = + (total_stats->coded_error * av_weight) / total_stats->count; + double modified_error = + av_err * pow(this_stats->coded_error * this_stats->weight / + DOUBLE_DIVIDE_CHECK(av_err), + vbrbias / 100.0); + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. + modified_error *= + pow(calculate_active_area(frame_info, this_stats), ACT_AREA_CORRECTION); + + return fclamp(modified_error, modified_error_min, modified_error_max); +} + +static double calculate_modified_err(const FRAME_INFO *frame_info, + const TWO_PASS *twopass, + const AV1EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame) { + const FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats; + return calculate_modified_err_new( + frame_info, total_stats, this_frame, oxcf->rc_cfg.vbrbias, + twopass->modified_error_min, twopass->modified_error_max); +} + +// Resets the first pass file to the given position using a relative seek from +// the current position. 
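+// (Here the "file" is the in-memory first pass stats buffer; the helper simply
+// points stats_in at the supplied position rather than seeking a real file.)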
+static void reset_fpf_position(TWO_PASS_FRAME *p_frame, + const FIRSTPASS_STATS *position) { + p_frame->stats_in = position; +} + +static int input_stats(TWO_PASS *p, TWO_PASS_FRAME *p_frame, + FIRSTPASS_STATS *fps) { + if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; + + *fps = *p_frame->stats_in; + ++p_frame->stats_in; + return 1; +} + +static int input_stats_lap(TWO_PASS *p, TWO_PASS_FRAME *p_frame, + FIRSTPASS_STATS *fps) { + if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; + + *fps = *p_frame->stats_in; + /* Move old stats[0] out to accommodate for next frame stats */ + memmove(p->frame_stats_arr[0], p->frame_stats_arr[1], + (p->stats_buf_ctx->stats_in_end - p_frame->stats_in - 1) * + sizeof(FIRSTPASS_STATS)); + p->stats_buf_ctx->stats_in_end--; + return 1; +} + +// Read frame stats at an offset from the current position. +static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, + const TWO_PASS_FRAME *p_frame, + int offset) { + if ((offset >= 0 && + p_frame->stats_in + offset >= p->stats_buf_ctx->stats_in_end) || + (offset < 0 && + p_frame->stats_in + offset < p->stats_buf_ctx->stats_in_start)) { + return NULL; + } + + return &p_frame->stats_in[offset]; +} + +// This function returns the maximum target rate per frame. +static int frame_max_bits(const RATE_CONTROL *rc, + const AV1EncoderConfig *oxcf) { + int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * + (int64_t)oxcf->rc_cfg.vbrmax_section) / + 100; + if (max_bits < 0) + max_bits = 0; + else if (max_bits > rc->max_frame_bandwidth) + max_bits = rc->max_frame_bandwidth; + + return (int)max_bits; +} + +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, + 0.80, 0.85, 0.90, + 0.95, 0.95, 0.95 }; +#define ERR_DIVISOR 96.0 +static double calc_correction_factor(double err_per_mb, int q) { + const double error_term = err_per_mb / ERR_DIVISOR; + const int index = q >> 5; + // Adjustment to power term based on qindex + const double power_term = + q_pow_term[index] + + (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); + assert(error_term >= 0.0); + return fclamp(pow(error_term, power_term), 0.05, 5.0); +} + +// Based on history adjust expectations of bits per macroblock. +static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { + TWO_PASS *const twopass = &cpi->ppi->twopass; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + + // Based on recent history adjust expectations of bits per macroblock. 
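+  // The size of any correction is bounded: adj_limit is
+  // max(0.2, (100 - rate_err_tol) / 200), and rate_err_factor is clamped to
+  // [1 - adj_limit, 1 + adj_limit] before it is applied to bpm_factor.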
+ double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0); + double rate_err_factor = 1.0; + const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0); + const double min_fac = 1.0 - adj_limit; + const double max_fac = 1.0 + adj_limit; + + if (cpi->third_pass_ctx && cpi->third_pass_ctx->frame_info_count > 0) { + int64_t actual_bits = 0; + int64_t target_bits = 0; + double factor = 0.0; + int count = 0; + for (int i = 0; i < cpi->third_pass_ctx->frame_info_count; i++) { + actual_bits += cpi->third_pass_ctx->frame_info[i].actual_bits; + target_bits += cpi->third_pass_ctx->frame_info[i].bits_allocated; + factor += cpi->third_pass_ctx->frame_info[i].bpm_factor; + count++; + } + + if (count == 0) { + factor = 1.0; + } else { + factor /= (double)count; + } + + factor *= (double)actual_bits / DOUBLE_DIVIDE_CHECK((double)target_bits); + + if ((twopass->bpm_factor <= 1 && factor < twopass->bpm_factor) || + (twopass->bpm_factor >= 1 && factor > twopass->bpm_factor)) { + twopass->bpm_factor = factor; + twopass->bpm_factor = + AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor)); + } + } + + int err_estimate = p_rc->rate_error_estimate; + int64_t bits_left = twopass->bits_left; + int64_t total_actual_bits = p_rc->total_actual_bits; + int64_t bits_off_target = p_rc->vbr_bits_off_target; + double rolling_arf_group_actual_bits = + (double)twopass->rolling_arf_group_actual_bits; + double rolling_arf_group_target_bits = + (double)twopass->rolling_arf_group_target_bits; + +#if CONFIG_FPMT_TEST + const int is_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 ? 1 : 0; + const int simulate_parallel_frame = + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE + ? is_parallel_frame + : 0; + total_actual_bits = simulate_parallel_frame ? p_rc->temp_total_actual_bits + : p_rc->total_actual_bits; + bits_off_target = simulate_parallel_frame ? p_rc->temp_vbr_bits_off_target + : p_rc->vbr_bits_off_target; + bits_left = + simulate_parallel_frame ? p_rc->temp_bits_left : twopass->bits_left; + rolling_arf_group_target_bits = + (double)(simulate_parallel_frame + ? p_rc->temp_rolling_arf_group_target_bits + : twopass->rolling_arf_group_target_bits); + rolling_arf_group_actual_bits = + (double)(simulate_parallel_frame + ? p_rc->temp_rolling_arf_group_actual_bits + : twopass->rolling_arf_group_actual_bits); + err_estimate = simulate_parallel_frame ? p_rc->temp_rate_error_estimate + : p_rc->rate_error_estimate; +#endif + + if (p_rc->bits_off_target && total_actual_bits > 0) { + if (cpi->ppi->lap_enabled) { + rate_err_factor = rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK(rolling_arf_group_target_bits); + } else { + rate_err_factor = 1.0 - ((double)(bits_off_target) / + AOMMAX(total_actual_bits, bits_left)); + } + + // Adjustment is damped if this is 1 pass with look ahead processing + // (as there are only ever a few frames of data) and for all but the first + // GOP in normal two pass. + if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) { + rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac); + } + rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor)); + } + + // Is the rate control trending in the right direction. Only make + // an adjustment if things are getting worse. 
+ if ((rate_err_factor < 1.0 && err_estimate >= 0) || + (rate_err_factor > 1.0 && err_estimate <= 0)) { + twopass->bpm_factor *= rate_err_factor; + if (rate_err_tol >= 100) { + twopass->bpm_factor = + AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor)); + } else { + twopass->bpm_factor = AOMMAX(0.1, AOMMIN(10.0, twopass->bpm_factor)); + } + } +} + +static int qbpm_enumerator(int rate_err_tol) { + return 1200000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75); +} + +// Similar to find_qindex_by_rate() function in ratectrl.c, but includes +// calculation of a correction_factor. +static int find_qindex_by_rate_with_correction( + int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb, + double group_weight_factor, int rate_err_tol, int best_qindex, + int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + + while (low < high) { + const int mid = (low + high) >> 1; + const double mid_factor = calc_correction_factor(error_per_mb, mid); + const double q = av1_convert_qindex_to_q(mid, bit_depth); + const int enumerator = qbpm_enumerator(rate_err_tol); + const int mid_bits_per_mb = + (int)((enumerator * mid_factor * group_weight_factor) / q); + + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + return low; +} + +/*!\brief Choose a target maximum Q for a group of frames + * + * \ingroup rate_control + * + * This function is used to estimate a suitable maximum Q for a + * group of frames. Inititally it is called to get a crude estimate + * for the whole clip. It is then called for each ARF/GF group to get + * a revised estimate for that group. + * + * \param[in] cpi Top-level encoder structure + * \param[in] av_frame_err The average per frame coded error score + * for frames making up this section/group. + * \param[in] inactive_zone Used to mask off /ignore part of the + * frame. The most common use case is where + * a wide format video (e.g. 16:9) is + * letter-boxed into a more square format. + * Here we want to ignore the bands at the + * top and bottom. + * \param[in] av_target_bandwidth The target bits per frame + * + * \return The maximum Q for frames in the group. + */ +static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err, + double inactive_zone, + int av_target_bandwidth) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + inactive_zone = fclamp(inactive_zone, 0.0, 0.9999); + + if (av_target_bandwidth <= 0) { + return rc->worst_quality; // Highest value allowed + } else { + const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.mi_params.MBs; + const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); + const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone); + const int target_norm_bits_per_mb = + (int)((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs; + int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct); + + // Update bpm correction factor based on previous GOP rate error. + twopass_update_bpm_factor(cpi, rate_err_tol); + + // Try and pick a max Q that will be high enough to encode the + // content at the given rate. 
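+    // find_qindex_by_rate_with_correction() binary-searches the range
+    // [best_quality, worst_quality] for the lowest qindex whose corrected
+    // bits-per-mb estimate no longer exceeds target_norm_bits_per_mb.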
+ int q = find_qindex_by_rate_with_correction( + target_norm_bits_per_mb, cpi->common.seq_params->bit_depth, + av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol, + rc->best_quality, rc->worst_quality); + + // Restriction on active max q for constrained quality mode. + if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level); + return q; + } +} + +#define INTRA_PART 0.005 +#define DEFAULT_DECAY_LIMIT 0.75 +#define LOW_SR_DIFF_TRHESH 0.01 +#define NCOUNT_FRAME_II_THRESH 5.0 +#define LOW_CODED_ERR_PER_MB 0.01 + +/* This function considers how the quality of prediction may be deteriorating + * with distance. It comapres the coded error for the last frame and the + * second reference frame (usually two frames old) and also applies a factor + * based on the extent of INTRA coding. + * + * The decay factor is then used to reduce the contribution of frames further + * from the alt-ref or golden frame, to the bitframe boost calculation for that + * alt-ref or golden frame. + */ +static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) { + double sr_diff = (frame->sr_coded_error - frame->coded_error); + double sr_decay = 1.0; + double modified_pct_inter; + double modified_pcnt_intra; + + modified_pct_inter = frame->pcnt_inter; + if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && + ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH)) { + modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; + } + modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + + if ((sr_diff > LOW_SR_DIFF_TRHESH)) { + double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error); + sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra); + } + return AOMMAX(sr_decay, DEFAULT_DECAY_LIMIT); +} + +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. +static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) { + const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; + double sr_decay = get_sr_decay_rate(frame); + return AOMMIN(sr_decay, zero_motion_pct); +} + +#define DEFAULT_ZM_FACTOR 0.5 +static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) { + const double sr_decay_rate = get_sr_decay_rate(frame_stats); + double zero_motion_factor = + DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); + + // Clamp value to range 0.0 to 1.0 + // This should happen anyway if input values are sensibly clamped but checked + // here just in case. + if (zero_motion_factor > 1.0) + zero_motion_factor = 1.0; + else if (zero_motion_factor < 0.0) + zero_motion_factor = 0.0; + + return AOMMAX(zero_motion_factor, + (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); +} + +// Function to test for a condition where a complex transition is followed +// by a static section. For example in slide shows where there is a fade +// between slides. This is to help with more optimal kf and gf positioning. +static int detect_transition_to_still(const FIRSTPASS_INFO *firstpass_info, + int next_stats_index, + const int min_gf_interval, + const int frame_interval, + const int still_interval, + const double loop_decay_rate, + const double last_decay_rate) { + // Break clause to detect very still sections after motion + // For example a static image after a fade or other transition + // instead of a clean scene cut. 
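+  // Conditions: the interval exceeds min_gf_interval, the current decay rate
+  // is essentially 1.0 (>= 0.999) while the previous one was noticeably lower
+  // (< 0.9), and each of the next still_interval frames is almost entirely
+  // static (pcnt_inter - pcnt_motion >= 0.999).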
+ if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 && + last_decay_rate < 0.9) { + int stats_left = + av1_firstpass_info_future_count(firstpass_info, next_stats_index); + if (stats_left >= still_interval) { + int j; + // Look ahead a few frames to see if static condition persists... + for (j = 0; j < still_interval; ++j) { + const FIRSTPASS_STATS *stats = + av1_firstpass_info_peek(firstpass_info, next_stats_index + j); + if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; + } + // Only if it does do we signal a transition to still. + return j == still_interval; + } + } + return 0; +} + +// This function detects a flash through the high relative pcnt_second_ref +// score in the frame following a flash frame. The offset passed in should +// reflect this. +static int detect_flash(const TWO_PASS *twopass, + const TWO_PASS_FRAME *twopass_frame, const int offset) { + const FIRSTPASS_STATS *const next_frame = + read_frame_stats(twopass, twopass_frame, offset); + + // What we are looking for here is a situation where there is a + // brief break in prediction (such as a flash) but subsequent frames + // are reasonably well predicted by an earlier (pre flash) frame. + // The recovery after a flash is indicated by a high pcnt_second_ref + // compared to pcnt_inter. + return next_frame != NULL && + next_frame->pcnt_second_ref > next_frame->pcnt_inter && + next_frame->pcnt_second_ref >= 0.5; +} + +// Update the motion related elements to the GF arf boost calculation. +static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, + GF_GROUP_STATS *gf_stats, double f_w, + double f_h) { + const double pct = stats->pcnt_motion; + + // Accumulate Motion In/Out of frame stats. + gf_stats->this_frame_mv_in_out = stats->mv_in_out_count * pct; + gf_stats->mv_in_out_accumulator += gf_stats->this_frame_mv_in_out; + gf_stats->abs_mv_in_out_accumulator += fabs(gf_stats->this_frame_mv_in_out); + + // Accumulate a measure of how uniform (or conversely how random) the motion + // field is (a ratio of abs(mv) / mv). + if (pct > 0.05) { + const double mvr_ratio = + fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); + const double mvc_ratio = + fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); + + gf_stats->mv_ratio_accumulator += + pct * + (mvr_ratio < stats->mvr_abs * f_h ? mvr_ratio : stats->mvr_abs * f_h); + gf_stats->mv_ratio_accumulator += + pct * + (mvc_ratio < stats->mvc_abs * f_w ? 
mvc_ratio : stats->mvc_abs * f_w); + } +} + +static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats, + const double mod_frame_err, + GF_GROUP_STATS *gf_stats) { + gf_stats->gf_group_err += mod_frame_err; +#if GROUP_ADAPTIVE_MAXQ + gf_stats->gf_group_raw_error += stats->coded_error; +#endif + gf_stats->gf_group_skip_pct += stats->intra_skip_pct; + gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows; +} + +static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats, + const int flash_detected, + const int frames_since_key, + const int cur_idx, + GF_GROUP_STATS *gf_stats, int f_w, + int f_h) { + accumulate_frame_motion_stats(stats, gf_stats, f_w, f_h); + // sum up the metric values of current gf group + gf_stats->avg_sr_coded_error += stats->sr_coded_error; + gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref; + gf_stats->avg_new_mv_count += stats->new_mv_count; + gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy; + if (fabs(stats->raw_error_stdev) > 0.000001) { + gf_stats->non_zero_stdev_count++; + gf_stats->avg_raw_err_stdev += stats->raw_error_stdev; + } + + // Accumulate the effect of prediction quality decay + if (!flash_detected) { + gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate; + gf_stats->loop_decay_rate = get_prediction_decay_rate(stats); + + gf_stats->decay_accumulator = + gf_stats->decay_accumulator * gf_stats->loop_decay_rate; + + // Monitor for static sections. + if ((frames_since_key + cur_idx - 1) > 1) { + gf_stats->zero_motion_accumulator = AOMMIN( + gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats)); + } + } +} + +static void average_gf_stats(const int total_frame, GF_GROUP_STATS *gf_stats) { + if (total_frame) { + gf_stats->avg_sr_coded_error /= total_frame; + gf_stats->avg_pcnt_second_ref /= total_frame; + gf_stats->avg_new_mv_count /= total_frame; + gf_stats->avg_wavelet_energy /= total_frame; + } + + if (gf_stats->non_zero_stdev_count) + gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count; +} + +#define BOOST_FACTOR 12.5 +static double baseline_err_per_mb(const FRAME_INFO *frame_info) { + unsigned int screen_area = frame_info->frame_height * frame_info->frame_width; + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area <= 640 * 360) { + return 500.0; + } else { + return 1000.0; + } +} + +static double calc_frame_boost(const PRIMARY_RATE_CONTROL *p_rc, + const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame, + double this_frame_mv_in_out, double max_boost) { + double frame_boost; + const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME], + frame_info->bit_depth); + const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5); + const double active_area = calculate_active_area(frame_info, this_frame); + + // Underlying boost factor is based on inter error ratio. + frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area, + this_frame->intra_error * active_area) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; + + // Increase boost for frames where new data coming into frame (e.g. zoom out). + // Slightly reduce boost if there is a net balance of motion out of the frame + // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0. + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + // In the extreme case the boost is halved. 
+ else + frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); + + return AOMMIN(frame_boost, max_boost * boost_q_correction); +} + +static double calc_kf_frame_boost(const PRIMARY_RATE_CONTROL *p_rc, + const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, double max_boost) { + double frame_boost; + const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME], + frame_info->bit_depth); + const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00); + const double active_area = calculate_active_area(frame_info, this_frame); + + // Underlying boost factor is based on inter error ratio. + frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area, + this_frame->intra_error * active_area) / + DOUBLE_DIVIDE_CHECK( + (this_frame->coded_error + *sr_accumulator) * active_area); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); + *sr_accumulator = AOMMAX(0.0, *sr_accumulator); + + // Q correction and scaling + // The 40.0 value here is an experimentally derived baseline minimum. + // This value is in line with the minimum per frame boost in the alt_ref + // boost calculation. + frame_boost = ((frame_boost + 40.0) * boost_q_correction); + + return AOMMIN(frame_boost, max_boost * boost_q_correction); +} + +static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc, + int gfu_boost, int frames_to_project, + int num_stats_used_for_gfu_boost) { + /* + * If frames_to_project is equal to num_stats_used_for_gfu_boost, + * it means that gfu_boost was calculated over frames_to_project to + * begin with(ie; all stats required were available), hence return + * the original boost. + */ + if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost; + + double min_boost_factor = sqrt(p_rc->baseline_gf_interval); + // Get the current tpl factor (number of frames = frames_to_project). + double tpl_factor = av1_get_gfu_boost_projection_factor( + min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project); + // Get the tpl factor when number of frames = num_stats_used_for_prior_boost. + double tpl_factor_num_stats = av1_get_gfu_boost_projection_factor( + min_boost_factor, MAX_GFUBOOST_FACTOR, num_stats_used_for_gfu_boost); + int projected_gfu_boost = + (int)rint((tpl_factor * gfu_boost) / tpl_factor_num_stats); + return projected_gfu_boost; +} + +#define GF_MAX_BOOST 90.0 +#define GF_MIN_BOOST 50 +#define MIN_DECAY_FACTOR 0.01 +int av1_calc_arf_boost(const TWO_PASS *twopass, + const TWO_PASS_FRAME *twopass_frame, + const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, + int offset, int f_frames, int b_frames, + int *num_fpstats_used, int *num_fpstats_required, + int project_gfu_boost) { + int i; + GF_GROUP_STATS gf_stats; + init_gf_stats(&gf_stats); + double boost_score = (double)NORMAL_BOOST; + int arf_boost; + int flash_detected = 0; + if (num_fpstats_used) *num_fpstats_used = 0; + + // Search forward from the proposed arf/next gf position. + for (i = 0; i < f_frames; ++i) { + const FIRSTPASS_STATS *this_frame = + read_frame_stats(twopass, twopass_frame, i + offset); + if (this_frame == NULL) break; + + // Update the motion related elements to the boost calculation. 
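// For reference, the shape of calc_frame_boost() above reduced to scalars.
// q_correction stands for the AOMMIN(0.5 + lq * 0.015, 1.5) term; err_per_mb
// and intra_error are assumed to be pre-scaled by the active area and
// coded_error to be non-zero. The function name and parameters are
// illustrative only, not libaom API.
#if 0
static double frame_boost_sketch(double err_per_mb, double intra_error,
                                 double coded_error, double mv_in_out,
                                 double q_correction, double max_boost) {
  double boost =
      (err_per_mb > intra_error ? err_per_mb : intra_error) / coded_error;
  boost *= 12.5 * q_correction;  // BOOST_FACTOR
  // Net motion into the frame (zoom out) raises the boost; net motion out of
  // the frame (zoom in) reduces it. mv_in_out lies in [-1.0, 1.0].
  boost += boost * (mv_in_out > 0.0 ? mv_in_out * 2.0 : mv_in_out / 2.0);
  const double cap = max_boost * q_correction;
  return boost < cap ? boost : cap;
}
#endif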
+ accumulate_frame_motion_stats(this_frame, &gf_stats, + frame_info->frame_width, + frame_info->frame_height); + + // We want to discount the flash frame itself and the recovery + // frame that follows as both will have poor scores. + flash_detected = detect_flash(twopass, twopass_frame, i + offset) || + detect_flash(twopass, twopass_frame, i + offset + 1); + + // Accumulate the effect of prediction quality decay. + if (!flash_detected) { + gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame); + gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR + ? MIN_DECAY_FACTOR + : gf_stats.decay_accumulator; + } + + boost_score += + gf_stats.decay_accumulator * + calc_frame_boost(p_rc, frame_info, this_frame, + gf_stats.this_frame_mv_in_out, GF_MAX_BOOST); + if (num_fpstats_used) (*num_fpstats_used)++; + } + + arf_boost = (int)boost_score; + + // Reset for backward looking loop. + boost_score = 0.0; + init_gf_stats(&gf_stats); + // Search backward towards last gf position. + for (i = -1; i >= -b_frames; --i) { + const FIRSTPASS_STATS *this_frame = + read_frame_stats(twopass, twopass_frame, i + offset); + if (this_frame == NULL) break; + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats(this_frame, &gf_stats, + frame_info->frame_width, + frame_info->frame_height); + + // We want to discount the the flash frame itself and the recovery + // frame that follows as both will have poor scores. + flash_detected = detect_flash(twopass, twopass_frame, i + offset) || + detect_flash(twopass, twopass_frame, i + offset + 1); + + // Cumulative effect of prediction quality decay. + if (!flash_detected) { + gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame); + gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR + ? MIN_DECAY_FACTOR + : gf_stats.decay_accumulator; + } + + boost_score += + gf_stats.decay_accumulator * + calc_frame_boost(p_rc, frame_info, this_frame, + gf_stats.this_frame_mv_in_out, GF_MAX_BOOST); + if (num_fpstats_used) (*num_fpstats_used)++; + } + arf_boost += (int)boost_score; + + if (project_gfu_boost) { + assert(num_fpstats_required != NULL); + assert(num_fpstats_used != NULL); + *num_fpstats_required = f_frames + b_frames; + arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required, + *num_fpstats_used); + } + + if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST)) + arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST); + + return arf_boost; +} + +// Calculate a section intra ratio used in setting max loop filter. +static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, + const FIRSTPASS_STATS *end, + int section_length) { + const FIRSTPASS_STATS *s = begin; + double intra_error = 0.0; + double coded_error = 0.0; + int i = 0; + + while (s < end && i < section_length) { + intra_error += s->intra_error; + coded_error += s->coded_error; + ++s; + ++i; + } + + return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); +} + +/*!\brief Calculates the bit target for this GF/ARF group + * + * \ingroup rate_control + * + * Calculates the total bits to allocate in this GF/ARF group. + * + * \param[in] cpi Top-level encoder structure + * \param[in] gf_group_err Cumulative coded error score for the + * frames making up this group. + * + * \return The target total number of bits for this GF/ARF group. 
+ */ +static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, + double gf_group_err) { + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const TWO_PASS *const twopass = &cpi->ppi->twopass; + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + int64_t total_group_bits; + + // Calculate the bits to be allocated to the group as a whole. + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + total_group_bits = (int64_t)(twopass->kf_group_bits * + (gf_group_err / twopass->kf_group_error_left)); + } else { + total_group_bits = 0; + } + + // Clamp odd edge cases. + total_group_bits = (total_group_bits < 0) ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; + + // Clip based on user supplied data rate variability limit. + if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * p_rc->baseline_gf_interval; + + return total_group_bits; +} + +// Calculate the number of bits to assign to boosted frames in a group. +static int calculate_boost_bits(int frame_count, int boost, + int64_t total_group_bits) { + int allocation_chunks; + + // return 0 for invalid inputs (could arise e.g. through rounding errors) + if (!boost || (total_group_bits <= 0)) return 0; + + if (frame_count <= 0) return (int)(AOMMIN(total_group_bits, INT_MAX)); + + allocation_chunks = (frame_count * 100) + boost; + + // Prevent overflow. + if (boost > 1023) { + int divisor = boost >> 10; + boost /= divisor; + allocation_chunks /= divisor; + } + + // Calculate the number of extra bits for use in the boosted frame or frames. + return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), + 0); +} + +// Calculate the boost factor based on the number of bits assigned, i.e. the +// inverse of calculate_boost_bits(). +static int calculate_boost_factor(int frame_count, int bits, + int64_t total_group_bits) { + return (int)(100.0 * frame_count * bits / (total_group_bits - bits)); +} + +// Reduce the number of bits assigned to keyframe or arf if necessary, to +// prevent bitrate spikes that may break level constraints. +// frame_type: 0: keyframe; 1: arf. +static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, + RATE_CONTROL *const rc, + int bits_assigned, + int64_t group_bits, + int frame_type) { + const AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const int temporal_layer_id = cm->temporal_layer_id; + const int spatial_layer_id = cm->spatial_layer_id; + for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1; + ++index) { + if (!is_in_operating_point(seq_params->operating_point_idc[index], + temporal_layer_id, spatial_layer_id)) { + continue; + } + + const AV1_LEVEL target_level = + cpi->ppi->level_params.target_seq_level_idx[index]; + if (target_level >= SEQ_LEVELS) continue; + + assert(is_valid_seq_level_idx(target_level)); + + const double level_bitrate_limit = av1_get_max_bitrate_for_level( + target_level, seq_params->tier[0], seq_params->profile); + const int target_bits_per_frame = + (int)(level_bitrate_limit / cpi->framerate); + if (frame_type == 0) { + // Maximum bits for keyframe is 8 times the target_bits_per_frame. 
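// A worked example of the boost-bit split above (numbers are arbitrary). With
// frame_count = 16, boost = 200 and total_group_bits = 1,000,000 the boosted
// frame receives boost / (frame_count * 100 + boost) of the budget, and
// calculate_boost_factor() recovers the original boost from that allocation.
#if 0
static void boost_bits_round_trip(void) {
  const int frame_count = 16, boost = 200;
  const int64_t total_group_bits = 1000000;
  const int allocation_chunks = frame_count * 100 + boost;  // 1800
  const int boost_bits =
      (int)(((int64_t)boost * total_group_bits) / allocation_chunks);  // ~111111
  const int recovered_boost = (int)(100.0 * frame_count * boost_bits /
                                    (double)(total_group_bits - boost_bits));
  (void)recovered_boost;  // ~200, matching calculate_boost_factor()
}
#endif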
+ const int level_enforced_max_kf_bits = target_bits_per_frame * 8; + if (bits_assigned > level_enforced_max_kf_bits) { + const int frames = rc->frames_to_key - 1; + p_rc->kf_boost = calculate_boost_factor( + frames, level_enforced_max_kf_bits, group_bits); + bits_assigned = + calculate_boost_bits(frames, p_rc->kf_boost, group_bits); + } + } else if (frame_type == 1) { + // Maximum bits for arf is 4 times the target_bits_per_frame. + const int level_enforced_max_arf_bits = target_bits_per_frame * 4; + if (bits_assigned > level_enforced_max_arf_bits) { + p_rc->gfu_boost = + calculate_boost_factor(p_rc->baseline_gf_interval, + level_enforced_max_arf_bits, group_bits); + bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval, + p_rc->gfu_boost, group_bits); + } + } else { + assert(0); + } + } + + return bits_assigned; +} + +// Allocate bits to each frame in a GF / ARF group +double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60, + 0.60, 1.0, 1.0 }; +static void allocate_gf_group_bits(GF_GROUP *gf_group, + PRIMARY_RATE_CONTROL *const p_rc, + RATE_CONTROL *const rc, + int64_t gf_group_bits, int gf_arf_bits, + int key_frame, int use_arf) { + int64_t total_group_bits = gf_group_bits; + int base_frame_bits; + const int gf_group_size = gf_group->size; + int layer_frames[MAX_ARF_LAYERS + 1] = { 0 }; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + int frame_index = !!key_frame; + + // Subtract the extra bits set aside for ARF frames from the Group Total + if (use_arf) total_group_bits -= gf_arf_bits; + + int num_frames = + AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0)); + base_frame_bits = (int)(total_group_bits / num_frames); + + // Check the number of frames in each layer in case we have a + // non standard group length. + int max_arf_layer = gf_group->max_layer_depth - 1; + for (int idx = frame_index; idx < gf_group_size; ++idx) { + if ((gf_group->update_type[idx] == ARF_UPDATE) || + (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) { + layer_frames[gf_group->layer_depth[idx]]++; + } + } + + // Allocate extra bits to each ARF layer + int i; + int layer_extra_bits[MAX_ARF_LAYERS + 1] = { 0 }; + assert(max_arf_layer <= MAX_ARF_LAYERS); + for (i = 1; i <= max_arf_layer; ++i) { + double fraction = (i == max_arf_layer) ? 1.0 : layer_fraction[i]; + layer_extra_bits[i] = + (int)((gf_arf_bits * fraction) / AOMMAX(1, layer_frames[i])); + gf_arf_bits -= (int)(gf_arf_bits * fraction); + } + + // Now combine ARF layer and baseline bits to give total bits for each frame. + int arf_extra_bits; + for (int idx = frame_index; idx < gf_group_size; ++idx) { + switch (gf_group->update_type[idx]) { + case ARF_UPDATE: + case INTNL_ARF_UPDATE: + arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]]; + gf_group->bit_allocation[idx] = base_frame_bits + arf_extra_bits; + break; + case INTNL_OVERLAY_UPDATE: + case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break; + default: gf_group->bit_allocation[idx] = base_frame_bits; break; + } + } + + // Set the frame following the current GOP to 0 bit allocation. For ARF + // groups, this next frame will be overlay frame, which is the first frame + // in the next GOP. For GF group, next GOP will overwrite the rate allocation. + // Setting this frame to use 0 bit (of out the current GOP budget) will + // simplify logics in reference frame management. 
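// A worked example of the per-layer ARF bit split above (numbers arbitrary):
// gf_arf_bits = 100000 with one ARF in layer 1, one in layer 2 and two
// internal ARFs in the deepest layer 3. The deepest layer always takes the
// remainder because its fraction is forced to 1.0.
#if 0
static void layer_extra_bits_example(void) {
  const double fraction[] = { 0.0, 0.70, 0.55, 1.0 };  // last layer -> 1.0
  const int layer_frames[] = { 0, 1, 1, 2 };
  int gf_arf_bits = 100000;
  int extra[4] = { 0 };
  for (int i = 1; i <= 3; ++i) {
    extra[i] = (int)((gf_arf_bits * fraction[i]) / layer_frames[i]);
    gf_arf_bits -= (int)(gf_arf_bits * fraction[i]);
  }
  // extra == { 0, 70000, 16500, 6750 }; each ARF is then assigned
  // base_frame_bits + extra[layer_depth] in the switch above.
}
#endif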
+ if (gf_group_size < MAX_STATIC_GF_GROUP_LENGTH) + gf_group->bit_allocation[gf_group_size] = 0; +} + +// Returns true if KF group and GF group both are almost completely static. +static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion, + int is_lap_enabled) { + if (is_lap_enabled) { + /* + * when LAP enabled kf_zero_motion is not reliable, so use strict + * constraint on gf_zero_motion. + */ + return (gf_zero_motion >= 0.999); + } else { + return (gf_zero_motion >= 0.995) && + (kf_zero_motion >= STATIC_KF_GROUP_THRESH); + } +} + +#define ARF_ABS_ZOOM_THRESH 4.4 +static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start, + int flash_detected, int active_max_gf_interval, + int active_min_gf_interval, + GF_GROUP_STATS *gf_stats) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + AV1_COMMON *const cm = &cpi->common; + // Motion breakout threshold for loop below depends on image size. + const double mv_ratio_accumulator_thresh = (cm->height + cm->width) / 4.0; + + if (!flash_detected) { + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. + + // TODO(angiebird): This is a temporary change, we will avoid using + // twopass_frame.stats_in in the follow-up CL + int index = (int)(cpi->twopass_frame.stats_in - + twopass->stats_buf_ctx->stats_in_start); + if (detect_transition_to_still(&twopass->firstpass_info, index, + rc->min_gf_interval, frame_index - cur_start, + 5, gf_stats->loop_decay_rate, + gf_stats->last_loop_decay_rate)) { + return 1; + } + } + + // Some conditions to breakout after min interval. + if (frame_index - cur_start >= active_min_gf_interval && + // If possible don't break very close to a kf + (rc->frames_to_key - frame_index >= rc->min_gf_interval) && + ((frame_index - cur_start) & 0x01) && !flash_detected && + (gf_stats->mv_ratio_accumulator > mv_ratio_accumulator_thresh || + gf_stats->abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) { + return 1; + } + + // If almost totally static, we will not use the the max GF length later, + // so we can continue for more frames. + if (((frame_index - cur_start) >= active_max_gf_interval + 1) && + !is_almost_static(gf_stats->zero_motion_accumulator, + twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) { + return 1; + } + return 0; +} + +static int is_shorter_gf_interval_better( + AV1_COMP *cpi, const EncodeFrameParams *frame_params) { + const RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method; + int shorten_gf_interval; + + av1_tpl_preload_rc_estimate(cpi, frame_params); + + if (gop_length_decision_method == 2) { + // GF group length is decided based on GF boost and tpl stats of ARFs from + // base layer, (base+1) layer. + shorten_gf_interval = + (p_rc->gfu_boost < + p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) && + !av1_tpl_setup_stats(cpi, 3, frame_params); + } else { + int do_complete_tpl = 1; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + int is_temporal_filter_enabled = + (rc->frames_since_key > 0 && gf_group->arf_index > -1); + + if (gop_length_decision_method == 1) { + // Check if tpl stats of ARFs from base layer, (base+1) layer, + // (base+2) layer can decide the GF group length. 
+ int gop_length_eval = av1_tpl_setup_stats(cpi, 2, frame_params); + + if (gop_length_eval != 2) { + do_complete_tpl = 0; + shorten_gf_interval = !gop_length_eval; + } + } + + if (do_complete_tpl) { + // Decide GF group length based on complete tpl stats. + shorten_gf_interval = !av1_tpl_setup_stats(cpi, 1, frame_params); + // Tpl stats is reused when the ARF is temporally filtered and GF + // interval is not shortened. + if (is_temporal_filter_enabled && !shorten_gf_interval) { + cpi->skip_tpl_setup_stats = 1; +#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS + assert(cpi->gf_frame_index == 0); + av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data, + gf_group, + cpi->common.seq_params->bit_depth); +#endif // CONFIG_BITRATE_ACCURACY + } + } + } + return shorten_gf_interval; +} + +#define MIN_SHRINK_LEN 6 // the minimum length of gf if we are shrinking +#define SMOOTH_FILT_LEN 7 +#define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2) +#define WINDOW_SIZE 7 +#define HALF_WIN (WINDOW_SIZE / 2) +// A 7-tap gaussian smooth filter +const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383, + 0.242, 0.061, 0.006 }; + +// Smooth filter intra_error and coded_error in firstpass stats. +// If stats[i].is_flash==1, the ith element should not be used in the filtering. +static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx, + int last_idx, double *filt_intra_err, + double *filt_coded_err) { + int i, j; + for (i = start_idx; i <= last_idx; i++) { + double total_wt = 0; + for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { + int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx); + if (stats[idx].is_flash) continue; + + filt_intra_err[i] += + smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error; + total_wt += smooth_filt[j + HALF_FILT_LEN]; + } + if (total_wt > 0.01) { + filt_intra_err[i] /= total_wt; + } else { + filt_intra_err[i] = stats[i].intra_error; + } + } + for (i = start_idx; i <= last_idx; i++) { + double total_wt = 0; + for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { + int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx); + // Coded error involves idx and idx - 1. + if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue; + + filt_coded_err[i] += + smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error; + total_wt += smooth_filt[j + HALF_FILT_LEN]; + } + if (total_wt > 0.01) { + filt_coded_err[i] /= total_wt; + } else { + filt_coded_err[i] = stats[i].coded_error; + } + } +} + +// Calculate gradient +static void get_gradient(const double *values, int start, int last, + double *grad) { + if (start == last) { + grad[start] = 0; + return; + } + for (int i = start; i <= last; i++) { + int prev = AOMMAX(i - 1, start); + int next = AOMMIN(i + 1, last); + grad[i] = (values[next] - values[prev]) / (next - prev); + } +} + +static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, + int first, int last) { + // Identify unstable areas caused by scenecuts. + // Find the max and 2nd max coded error, and the average of the rest frames. + // If there is only one frame that yields a huge coded error, it is likely a + // scenecut. 
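// For reference, the clamped central-difference gradient computed by
// get_gradient() above, shown on a short array. Interior samples use
// (v[i+1] - v[i-1]) / 2; the ends fall back to one-sided differences because
// the neighbour indices are clamped to the [start, last] range.
#if 0
static void gradient_example(void) {
  const double v[5] = { 1.0, 2.0, 4.0, 8.0, 16.0 };
  double g[5];
  for (int i = 0; i < 5; ++i) {
    const int prev = (i - 1 < 0) ? 0 : i - 1;
    const int next = (i + 1 > 4) ? 4 : i + 1;
    g[i] = (v[next] - v[prev]) / (next - prev);
  }
  // g == { 1.0, 1.5, 3.0, 6.0, 8.0 }
}
#endif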
+ double this_ratio, max_prev_ratio, max_next_ratio, max_prev_coded, + max_next_coded; + + if (last - first == 0) return -1; + + for (int i = first; i <= last; i++) { + if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash)) + continue; + double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01); + this_ratio = stats_start[i].coded_error / temp_intra; + // find the avg ratio in the preceding neighborhood + max_prev_ratio = 0; + max_prev_coded = 0; + for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) { + if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash)) + continue; + temp_intra = AOMMAX(stats_start[j].intra_error, 0.01); + double temp_ratio = stats_start[j].coded_error / temp_intra; + if (temp_ratio > max_prev_ratio) { + max_prev_ratio = temp_ratio; + } + if (stats_start[j].coded_error > max_prev_coded) { + max_prev_coded = stats_start[j].coded_error; + } + } + // find the avg ratio in the following neighborhood + max_next_ratio = 0; + max_next_coded = 0; + for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) { + if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash)) + continue; + temp_intra = AOMMAX(stats_start[j].intra_error, 0.01); + double temp_ratio = stats_start[j].coded_error / temp_intra; + if (temp_ratio > max_next_ratio) { + max_next_ratio = temp_ratio; + } + if (stats_start[j].coded_error > max_next_coded) { + max_next_coded = stats_start[j].coded_error; + } + } + + if (max_prev_ratio < 0.001 && max_next_ratio < 0.001) { + // the ratios are very small, only check a small fixed threshold + if (this_ratio < 0.02) continue; + } else { + // check if this frame has a larger ratio than the neighborhood + double max_sr = stats_start[i].sr_coded_error; + if (i < last) max_sr = AOMMAX(max_sr, stats_start[i + 1].sr_coded_error); + double max_sr_fr_ratio = + max_sr / AOMMAX(stats_start[i].coded_error, 0.01); + + if (max_sr_fr_ratio > 1.2) continue; + if (this_ratio < 2 * AOMMAX(max_prev_ratio, max_next_ratio) && + stats_start[i].coded_error < + 2 * AOMMAX(max_prev_coded, max_next_coded)) { + continue; + } + } + return i; + } + return -1; +} + +// Remove the region with index next_region. +// parameter merge: 0: merge with previous; 1: merge with next; 2: +// merge with both, take type from previous if possible +// After removing, next_region will be the index of the next region. +static void remove_region(int merge, REGIONS *regions, int *num_regions, + int *next_region) { + int k = *next_region; + assert(k < *num_regions); + if (*num_regions == 1) { + *num_regions = 0; + return; + } + if (k == 0) { + merge = 1; + } else if (k == *num_regions - 1) { + merge = 0; + } + int num_merge = (merge == 2) ? 2 : 1; + switch (merge) { + case 0: + regions[k - 1].last = regions[k].last; + *next_region = k; + break; + case 1: + regions[k + 1].start = regions[k].start; + *next_region = k + 1; + break; + case 2: + regions[k - 1].last = regions[k + 1].last; + *next_region = k; + break; + default: assert(0); + } + *num_regions -= num_merge; + for (k = *next_region - (merge == 1); k < *num_regions; k++) { + regions[k] = regions[k + num_merge]; + } +} + +// Insert a region in the cur_region_idx. The start and last should both be in +// the current region. After insertion, the cur_region_idx will point to the +// last region that was splitted from the original region. 
+static void insert_region(int start, int last, REGION_TYPES type, + REGIONS *regions, int *num_regions, + int *cur_region_idx) { + int k = *cur_region_idx; + REGION_TYPES this_region_type = regions[k].type; + int this_region_last = regions[k].last; + int num_add = (start != regions[k].start) + (last != regions[k].last); + // move the following regions further to the back + for (int r = *num_regions - 1; r > k; r--) { + regions[r + num_add] = regions[r]; + } + *num_regions += num_add; + if (start > regions[k].start) { + regions[k].last = start - 1; + k++; + regions[k].start = start; + } + regions[k].type = type; + if (last < this_region_last) { + regions[k].last = last; + k++; + regions[k].start = last + 1; + regions[k].last = this_region_last; + regions[k].type = this_region_type; + } else { + regions[k].last = this_region_last; + } + *cur_region_idx = k; +} + +// Get the average of stats inside a region. +static void analyze_region(const FIRSTPASS_STATS *stats, int k, + REGIONS *regions) { + int i; + regions[k].avg_cor_coeff = 0; + regions[k].avg_sr_fr_ratio = 0; + regions[k].avg_intra_err = 0; + regions[k].avg_coded_err = 0; + + int check_first_sr = (k != 0); + + for (i = regions[k].start; i <= regions[k].last; i++) { + if (i > regions[k].start || check_first_sr) { + double num_frames = + (double)(regions[k].last - regions[k].start + check_first_sr); + double max_coded_error = + AOMMAX(stats[i].coded_error, stats[i - 1].coded_error); + double this_ratio = + stats[i].sr_coded_error / AOMMAX(max_coded_error, 0.001); + regions[k].avg_sr_fr_ratio += this_ratio / num_frames; + } + + regions[k].avg_intra_err += + stats[i].intra_error / (double)(regions[k].last - regions[k].start + 1); + regions[k].avg_coded_err += + stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1); + + regions[k].avg_cor_coeff += + AOMMAX(stats[i].cor_coeff, 0.001) / + (double)(regions[k].last - regions[k].start + 1); + regions[k].avg_noise_var += + AOMMAX(stats[i].noise_var, 0.001) / + (double)(regions[k].last - regions[k].start + 1); + } +} + +// Calculate the regions stats of every region. 
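// Usage sketch for insert_region() above: splitting frames [3, 5] out of a
// single STABLE_REGION covering [0, 9]. The array size and values here are
// arbitrary; only the split behaviour is being illustrated.
#if 0
static void insert_region_example(void) {
  REGIONS regions[8];
  int num_regions = 1, cur = 0;
  regions[0].start = 0;
  regions[0].last = 9;
  regions[0].type = STABLE_REGION;
  insert_region(3, 5, HIGH_VAR_REGION, regions, &num_regions, &cur);
  // Result: [0,2] STABLE, [3,5] HIGH_VAR, [6,9] STABLE; num_regions == 3 and
  // cur points at the trailing [6,9] piece split from the original region.
}
#endif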
+static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions, + int num_regions) { + for (int k = 0; k < num_regions; k++) { + analyze_region(stats, k, regions); + } +} + +// Find tentative stable regions +static int find_stable_regions(const FIRSTPASS_STATS *stats, + const double *grad_coded, int this_start, + int this_last, REGIONS *regions) { + int i, j, k = 0; + regions[k].start = this_start; + for (i = this_start; i <= this_last; i++) { + // Check mean and variance of stats in a window + double mean_intra = 0.001, var_intra = 0.001; + double mean_coded = 0.001, var_coded = 0.001; + int count = 0; + for (j = -HALF_WIN; j <= HALF_WIN; j++) { + int idx = AOMMIN(AOMMAX(i + j, this_start), this_last); + if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue; + mean_intra += stats[idx].intra_error; + var_intra += stats[idx].intra_error * stats[idx].intra_error; + mean_coded += stats[idx].coded_error; + var_coded += stats[idx].coded_error * stats[idx].coded_error; + count++; + } + + REGION_TYPES cur_type; + if (count > 0) { + mean_intra /= (double)count; + var_intra /= (double)count; + mean_coded /= (double)count; + var_coded /= (double)count; + int is_intra_stable = (var_intra / (mean_intra * mean_intra) < 1.03); + int is_coded_stable = (var_coded / (mean_coded * mean_coded) < 1.04 && + fabs(grad_coded[i]) / mean_coded < 0.05) || + mean_coded / mean_intra < 0.05; + int is_coded_small = mean_coded < 0.5 * mean_intra; + cur_type = (is_intra_stable && is_coded_stable && is_coded_small) + ? STABLE_REGION + : HIGH_VAR_REGION; + } else { + cur_type = HIGH_VAR_REGION; + } + + // mark a new region if type changes + if (i == regions[k].start) { + // first frame in the region + regions[k].type = cur_type; + } else if (cur_type != regions[k].type) { + // Append a new region + regions[k].last = i - 1; + regions[k + 1].start = i; + regions[k + 1].type = cur_type; + k++; + } + } + regions[k].last = this_last; + return k + 1; +} + +// Clean up regions that should be removed or merged. +static void cleanup_regions(REGIONS *regions, int *num_regions) { + int k = 0; + while (k < *num_regions) { + if ((k > 0 && regions[k - 1].type == regions[k].type && + regions[k].type != SCENECUT_REGION) || + regions[k].last < regions[k].start) { + remove_region(0, regions, num_regions, &k); + } else { + k++; + } + } +} + +// Remove regions that are of type and shorter than length. +// Merge it with its neighboring regions. +static void remove_short_regions(REGIONS *regions, int *num_regions, + REGION_TYPES type, int length) { + int k = 0; + while (k < *num_regions && (*num_regions) > 1) { + if ((regions[k].last - regions[k].start + 1 < length && + regions[k].type == type)) { + // merge current region with the previous and next regions + remove_region(2, regions, num_regions, &k); + } else { + k++; + } + } + cleanup_regions(regions, num_regions); +} + +static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, + REGIONS *regions, int *num_regions) { + int i, j, k; + // Remove regions that are too short. Likely noise. + remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN); + remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); + + get_region_stats(stats, regions, *num_regions); + + // Adjust region boundaries. The thresholds are empirically obtained, but + // overall the performance is not very sensitive to small changes to them. 
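// For intuition, the per-window stability test applied by find_stable_regions()
// above, with the window statistics passed in directly (the real code derives
// them from a +/-HALF_WIN window of smoothed stats and seeds the sums with
// 0.001). grad is the smoothed coded-error gradient at the window centre.
#if 0
static int window_is_stable(double mean_intra, double var_intra,
                            double mean_coded, double var_coded, double grad) {
  const int intra_stable = var_intra / (mean_intra * mean_intra) < 1.03;
  const int coded_stable = (var_coded / (mean_coded * mean_coded) < 1.04 &&
                            fabs(grad) / mean_coded < 0.05) ||
                           mean_coded / mean_intra < 0.05;
  const int coded_small = mean_coded < 0.5 * mean_intra;
  // STABLE_REGION requires all three; otherwise the frame is HIGH_VAR_REGION.
  return intra_stable && coded_stable && coded_small;
}
#endif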
+ for (k = 0; k < *num_regions; k++) { + if (regions[k].type == STABLE_REGION) continue; + if (k > 0) { + // Adjust previous boundary. + // First find the average intra/coded error in the previous + // neighborhood. + double avg_intra_err = 0; + const int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1, + regions[k - 1].start + 1); + const int lasti = regions[k - 1].last; + int counti = 0; + for (i = starti; i <= lasti; i++) { + avg_intra_err += stats[i].intra_error; + counti++; + } + if (counti > 0) { + avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001); + int count_coded = 0, count_grad = 0; + for (j = lasti + 1; j <= regions[k].last; j++) { + const int intra_close = + fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1; + const int coded_small = stats[j].coded_error / avg_intra_err < 0.1; + const int coeff_close = stats[j].cor_coeff > 0.995; + if (!coeff_close || !coded_small) count_coded--; + if (intra_close && count_coded >= 0 && count_grad >= 0) { + // this frame probably belongs to the previous stable region + regions[k - 1].last = j; + regions[k].start = j + 1; + } else { + break; + } + } + } + } // if k > 0 + if (k < *num_regions - 1) { + // Adjust next boundary. + // First find the average intra/coded error in the next neighborhood. + double avg_intra_err = 0; + const int starti = regions[k + 1].start; + const int lasti = AOMMIN(regions[k + 1].last - 1, + regions[k + 1].start + WINDOW_SIZE - 1); + int counti = 0; + for (i = starti; i <= lasti; i++) { + avg_intra_err += stats[i].intra_error; + counti++; + } + if (counti > 0) { + avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001); + // At the boundary, coded error is large, but still the frame is stable + int count_coded = 1, count_grad = 1; + for (j = starti - 1; j >= regions[k].start; j--) { + const int intra_close = + fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1; + const int coded_small = + stats[j + 1].coded_error / avg_intra_err < 0.1; + const int coeff_close = stats[j].cor_coeff > 0.995; + if (!coeff_close || !coded_small) count_coded--; + if (intra_close && count_coded >= 0 && count_grad >= 0) { + // this frame probably belongs to the next stable region + regions[k + 1].start = j; + regions[k].last = j - 1; + } else { + break; + } + } + } + } // if k < *num_regions - 1 + } // end of loop over all regions + + cleanup_regions(regions, num_regions); + remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); + get_region_stats(stats, regions, *num_regions); + + // If a stable regions has higher error than neighboring high var regions, + // or if the stable region has a lower average correlation, + // then it should be merged with them + k = 0; + while (k < *num_regions && (*num_regions) > 1) { + if (regions[k].type == STABLE_REGION && + (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE && + ((k > 0 && // previous regions + (regions[k].avg_coded_err > regions[k - 1].avg_coded_err * 1.01 || + regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff * 0.999)) && + (k < *num_regions - 1 && // next region + (regions[k].avg_coded_err > regions[k + 1].avg_coded_err * 1.01 || + regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) { + // merge current region with the previous and next regions + remove_region(2, regions, num_regions, &k); + analyze_region(stats, k - 1, regions); + } else if (regions[k].type == HIGH_VAR_REGION && + (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE && + ((k > 0 && // previous regions + 
(regions[k].avg_coded_err < + regions[k - 1].avg_coded_err * 0.99 || + regions[k].avg_cor_coeff > + regions[k - 1].avg_cor_coeff * 1.001)) && + (k < *num_regions - 1 && // next region + (regions[k].avg_coded_err < + regions[k + 1].avg_coded_err * 0.99 || + regions[k].avg_cor_coeff > + regions[k + 1].avg_cor_coeff * 1.001)))) { + // merge current region with the previous and next regions + remove_region(2, regions, num_regions, &k); + analyze_region(stats, k - 1, regions); + } else { + k++; + } + } + + remove_short_regions(regions, num_regions, STABLE_REGION, WINDOW_SIZE); + remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); +} + +// Identify blending regions. +static void find_blending_regions(const FIRSTPASS_STATS *stats, + REGIONS *regions, int *num_regions) { + int i, k = 0; + // Blending regions will have large content change, therefore will have a + // large consistent change in intra error. + int count_stable = 0; + while (k < *num_regions) { + if (regions[k].type == STABLE_REGION) { + k++; + count_stable++; + continue; + } + int dir = 0; + int start = 0, last; + for (i = regions[k].start; i <= regions[k].last; i++) { + // First mark the regions that has consistent large change of intra error. + if (k == 0 && i == regions[k].start) continue; + if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue; + double grad = stats[i].intra_error - stats[i - 1].intra_error; + int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05; + int this_dir = 0; + if (large_change) { + this_dir = (grad > 0) ? 1 : -1; + } + // the current trend continues + if (dir == this_dir) continue; + if (dir != 0) { + // Mark the end of a new large change group and add it + last = i - 1; + insert_region(start, last, BLENDING_REGION, regions, num_regions, &k); + } + dir = this_dir; + if (k == 0 && i == regions[k].start + 1) { + start = i - 1; + } else { + start = i; + } + } + if (dir != 0) { + last = regions[k].last; + insert_region(start, last, BLENDING_REGION, regions, num_regions, &k); + } + k++; + } + + // If the blending region has very low correlation, mark it as high variance + // since we probably cannot benefit from it anyways. + get_region_stats(stats, regions, *num_regions); + for (k = 0; k < *num_regions; k++) { + if (regions[k].type != BLENDING_REGION) continue; + if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 || + count_stable == 0) + regions[k].type = HIGH_VAR_REGION; + } + get_region_stats(stats, regions, *num_regions); + + // It is possible for blending to result in a "dip" in intra error (first + // decrease then increase). Therefore we need to find the dip and combine the + // two regions. + k = 1; + while (k < *num_regions) { + if (k < *num_regions - 1 && regions[k].type == HIGH_VAR_REGION) { + // Check if this short high variance regions is actually in the middle of + // a blending region. + if (regions[k - 1].type == BLENDING_REGION && + regions[k + 1].type == BLENDING_REGION && + regions[k].last - regions[k].start < 3) { + int prev_dir = (stats[regions[k - 1].last].intra_error - + stats[regions[k - 1].last - 1].intra_error) > 0 + ? 1 + : -1; + int next_dir = (stats[regions[k + 1].last].intra_error - + stats[regions[k + 1].last - 1].intra_error) > 0 + ? 1 + : -1; + if (prev_dir < 0 && next_dir > 0) { + // This is possibly a mid region of blending. 
Check the ratios + double ratio_thres = AOMMIN(regions[k - 1].avg_sr_fr_ratio, + regions[k + 1].avg_sr_fr_ratio) * + 0.95; + if (regions[k].avg_sr_fr_ratio > ratio_thres) { + regions[k].type = BLENDING_REGION; + remove_region(2, regions, num_regions, &k); + analyze_region(stats, k - 1, regions); + continue; + } + } + } + } + // Check if we have a pair of consecutive blending regions. + if (regions[k - 1].type == BLENDING_REGION && + regions[k].type == BLENDING_REGION) { + int prev_dir = (stats[regions[k - 1].last].intra_error - + stats[regions[k - 1].last - 1].intra_error) > 0 + ? 1 + : -1; + int next_dir = (stats[regions[k].last].intra_error - + stats[regions[k].last - 1].intra_error) > 0 + ? 1 + : -1; + + // if both are too short, no need to check + int total_length = regions[k].last - regions[k - 1].start + 1; + if (total_length < 4) { + regions[k - 1].type = HIGH_VAR_REGION; + k++; + continue; + } + + int to_merge = 0; + if (prev_dir < 0 && next_dir > 0) { + // In this case we check the last frame in the previous region. + double prev_length = + (double)(regions[k - 1].last - regions[k - 1].start + 1); + double last_ratio, ratio_thres; + if (prev_length < 2.01) { + // if the previous region is very short + double max_coded_error = + AOMMAX(stats[regions[k - 1].last].coded_error, + stats[regions[k - 1].last - 1].coded_error); + last_ratio = stats[regions[k - 1].last].sr_coded_error / + AOMMAX(max_coded_error, 0.001); + ratio_thres = regions[k].avg_sr_fr_ratio * 0.95; + } else { + double max_coded_error = + AOMMAX(stats[regions[k - 1].last].coded_error, + stats[regions[k - 1].last - 1].coded_error); + last_ratio = stats[regions[k - 1].last].sr_coded_error / + AOMMAX(max_coded_error, 0.001); + double prev_ratio = + (regions[k - 1].avg_sr_fr_ratio * prev_length - last_ratio) / + (prev_length - 1.0); + ratio_thres = AOMMIN(prev_ratio, regions[k].avg_sr_fr_ratio) * 0.95; + } + if (last_ratio > ratio_thres) { + to_merge = 1; + } + } + + if (to_merge) { + remove_region(0, regions, num_regions, &k); + analyze_region(stats, k - 1, regions); + continue; + } else { + // These are possibly two separate blending regions. Mark the boundary + // frame as HIGH_VAR_REGION to separate the two. + int prev_k = k - 1; + insert_region(regions[prev_k].last, regions[prev_k].last, + HIGH_VAR_REGION, regions, num_regions, &prev_k); + analyze_region(stats, prev_k, regions); + k = prev_k + 1; + analyze_region(stats, k, regions); + } + } + k++; + } + cleanup_regions(regions, num_regions); +} + +// Clean up decision for blendings. Remove blending regions that are too short. +// Also if a very short high var region is between a blending and a stable +// region, just merge it with one of them. 
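// For reference, the per-frame test that find_blending_regions() above uses
// to grow a blending run: a fade shows up as a large intra-error change with
// a consistent sign from frame to frame. Scalar inputs, illustrative name.
#if 0
static int blend_step_direction(double intra_err, double prev_intra_err) {
  const double grad = intra_err - prev_intra_err;
  const double denom = intra_err > 0.01 ? intra_err : 0.01;
  if (fabs(grad) / denom <= 0.05) return 0;  // not a large change
  return grad > 0.0 ? 1 : -1;                // fade direction
}
#endif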
+static void cleanup_blendings(REGIONS *regions, int *num_regions) { + int k = 0; + while (k<*num_regions && * num_regions> 1) { + int is_short_blending = regions[k].type == BLENDING_REGION && + regions[k].last - regions[k].start + 1 < 5; + int is_short_hv = regions[k].type == HIGH_VAR_REGION && + regions[k].last - regions[k].start + 1 < 5; + int has_stable_neighbor = + ((k > 0 && regions[k - 1].type == STABLE_REGION) || + (k < *num_regions - 1 && regions[k + 1].type == STABLE_REGION)); + int has_blend_neighbor = + ((k > 0 && regions[k - 1].type == BLENDING_REGION) || + (k < *num_regions - 1 && regions[k + 1].type == BLENDING_REGION)); + int total_neighbors = (k > 0) + (k < *num_regions - 1); + + if (is_short_blending || + (is_short_hv && + has_stable_neighbor + has_blend_neighbor >= total_neighbors)) { + // Remove this region.Try to determine whether to combine it with the + // previous or next region. + int merge; + double prev_diff = + (k > 0) + ? fabs(regions[k].avg_cor_coeff - regions[k - 1].avg_cor_coeff) + : 1; + double next_diff = + (k < *num_regions - 1) + ? fabs(regions[k].avg_cor_coeff - regions[k + 1].avg_cor_coeff) + : 1; + // merge == 0 means to merge with previous, 1 means to merge with next + merge = prev_diff > next_diff; + remove_region(merge, regions, num_regions, &k); + } else { + k++; + } + } + cleanup_regions(regions, num_regions); +} + +static void free_firstpass_stats_buffers(REGIONS *temp_regions, + double *filt_intra_err, + double *filt_coded_err, + double *grad_coded) { + aom_free(temp_regions); + aom_free(filt_intra_err); + aom_free(filt_coded_err); + aom_free(grad_coded); +} + +// Identify stable and unstable regions from first pass stats. +// stats_start points to the first frame to analyze. +// |offset| is the offset from the current frame to the frame stats_start is +// pointing to. +// Returns 0 on success, -1 on memory allocation failure. +static int identify_regions(const FIRSTPASS_STATS *const stats_start, + int total_frames, int offset, REGIONS *regions, + int *total_regions) { + int k; + if (total_frames <= 1) return 0; + + // store the initial decisions + REGIONS *temp_regions = + (REGIONS *)aom_malloc(total_frames * sizeof(temp_regions[0])); + // buffers for filtered stats + double *filt_intra_err = + (double *)aom_calloc(total_frames, sizeof(*filt_intra_err)); + double *filt_coded_err = + (double *)aom_calloc(total_frames, sizeof(*filt_coded_err)); + double *grad_coded = (double *)aom_calloc(total_frames, sizeof(*grad_coded)); + if (!(temp_regions && filt_intra_err && filt_coded_err && grad_coded)) { + free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err, + grad_coded); + return -1; + } + av1_zero_array(temp_regions, total_frames); + + int cur_region = 0, this_start = 0, this_last; + + int next_scenecut = -1; + do { + // first get the obvious scenecuts + next_scenecut = + find_next_scenecut(stats_start, this_start, total_frames - 1); + this_last = (next_scenecut >= 0) ? 
(next_scenecut - 1) : total_frames - 1; + + // low-pass filter the needed stats + smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err, + filt_coded_err); + get_gradient(filt_coded_err, this_start, this_last, grad_coded); + + // find tentative stable regions and unstable regions + int num_regions = find_stable_regions(stats_start, grad_coded, this_start, + this_last, temp_regions); + + adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions); + + get_region_stats(stats_start, temp_regions, num_regions); + + // Try to identify blending regions in the unstable regions + find_blending_regions(stats_start, temp_regions, &num_regions); + cleanup_blendings(temp_regions, &num_regions); + + // The flash points should all be considered high variance points + k = 0; + while (k < num_regions) { + if (temp_regions[k].type != STABLE_REGION) { + k++; + continue; + } + int start = temp_regions[k].start; + int last = temp_regions[k].last; + for (int i = start; i <= last; i++) { + if (stats_start[i].is_flash) { + insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k); + } + } + k++; + } + cleanup_regions(temp_regions, &num_regions); + + // copy the regions in the scenecut group + for (k = 0; k < num_regions; k++) { + if (temp_regions[k].last < temp_regions[k].start && + k == num_regions - 1) { + num_regions--; + break; + } + regions[k + cur_region] = temp_regions[k]; + } + cur_region += num_regions; + + // add the scenecut region + if (next_scenecut > -1) { + // add the scenecut region, and find the next scenecut + regions[cur_region].type = SCENECUT_REGION; + regions[cur_region].start = next_scenecut; + regions[cur_region].last = next_scenecut; + cur_region++; + this_start = next_scenecut + 1; + } + } while (next_scenecut >= 0); + + *total_regions = cur_region; + get_region_stats(stats_start, regions, *total_regions); + + for (k = 0; k < *total_regions; k++) { + // If scenecuts are very minor, mark them as high variance. + if (regions[k].type != SCENECUT_REGION || + regions[k].avg_cor_coeff * + (1 - stats_start[regions[k].start].noise_var / + regions[k].avg_intra_err) < + 0.8) { + continue; + } + regions[k].type = HIGH_VAR_REGION; + } + cleanup_regions(regions, total_regions); + get_region_stats(stats_start, regions, *total_regions); + + for (k = 0; k < *total_regions; k++) { + regions[k].start += offset; + regions[k].last += offset; + } + + free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err, + grad_coded); + return 0; +} + +static int find_regions_index(const REGIONS *regions, int num_regions, + int frame_idx) { + for (int k = 0; k < num_regions; k++) { + if (regions[k].start <= frame_idx && regions[k].last >= frame_idx) { + return k; + } + } + return -1; +} + +/*!\brief Determine the length of future GF groups. + * + * \ingroup gf_group_algo + * This function decides the gf group length of future frames in batch + * + * \param[in] cpi Top-level encoder structure + * \param[in] max_gop_length Maximum length of the GF group + * \param[in] max_intervals Maximum number of intervals to decide + * + * \remark Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is + * changed to store the decided GF group lengths. 
+ */ +static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, + int max_intervals) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; + const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0); + + const int f_w = cpi->common.width; + const int f_h = cpi->common.height; + int i; + + int flash_detected; + + av1_zero(next_frame); + + if (has_no_stats_stage(cpi)) { + for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) { + p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length); + } + p_rc->cur_gf_index = 0; + rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS; + return; + } + + // TODO(urvang): Try logic to vary min and max interval based on q. + const int active_min_gf_interval = rc->min_gf_interval; + const int active_max_gf_interval = + AOMMIN(rc->max_gf_interval, max_gop_length); + const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval); + + i = (rc->frames_since_key == 0); + max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals; + int count_cuts = 1; + // If cpi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or GF. + int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last; + int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 }; + int cut_here; + GF_GROUP_STATS gf_stats; + init_gf_stats(&gf_stats); + while (count_cuts < max_intervals + 1) { + // reaches next key frame, break here + if (i >= rc->frames_to_key) { + cut_here = 2; + } else if (i - cur_start >= rc->static_scene_max_gf_interval) { + // reached maximum len, but nothing special yet (almost static) + // let's look at the next interval + cut_here = 1; + } else if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) { + // reaches last frame, break + cut_here = 2; + } else { + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0); + // TODO(bohanli): remove redundant accumulations here, or unify + // this and the ones in define_gf_group + accumulate_next_frame_stats(&next_frame, flash_detected, + rc->frames_since_key, i, &gf_stats, f_w, f_h); + + cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected, + active_max_gf_interval, active_min_gf_interval, + &gf_stats); + } + if (cut_here) { + cur_last = i - 1; // the current last frame in the gf group + int ori_last = cur_last; + // The region frame idx does not start from the same frame as cur_start + // and cur_last. Need to offset them. + int offset = rc->frames_since_key - p_rc->regions_offset; + REGIONS *regions = p_rc->regions; + int num_regions = p_rc->num_regions; + + int scenecut_idx = -1; + // only try shrinking if interval smaller than active_max_gf_interval + if (cur_last - cur_start <= active_max_gf_interval && + cur_last > cur_start) { + // find the region indices of where the first and last frame belong. 
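// Hedged usage sketch of the region helpers defined earlier: identify_regions()
// fills a typed region list from a window of first-pass stats, and
// find_regions_index() maps a frame index back to its region. The array size,
// stats source and error handling are simplified here and do not reflect the
// encoder's actual call site.
#if 0
static void regions_usage_sketch(const FIRSTPASS_STATS *stats, int n_frames) {
  REGIONS regions[256];
  int num_regions = 0;
  if (identify_regions(stats, n_frames, /*offset=*/0, regions, &num_regions))
    return;  // allocation failure
  const int k = find_regions_index(regions, num_regions, /*frame_idx=*/0);
  (void)k;  // region containing frame 0, or -1 if none
}
#endif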
+ int k_start = + find_regions_index(regions, num_regions, cur_start + offset); + int k_last = + find_regions_index(regions, num_regions, cur_last + offset); + if (cur_start + offset == 0) k_start = 0; + + // See if we have a scenecut in between + for (int r = k_start + 1; r <= k_last; r++) { + if (regions[r].type == SCENECUT_REGION && + regions[r].last - offset - cur_start > active_min_gf_interval) { + scenecut_idx = r; + break; + } + } + + // if the found scenecut is very close to the end, ignore it. + if (regions[num_regions - 1].last - regions[scenecut_idx].last < 4) { + scenecut_idx = -1; + } + + if (scenecut_idx != -1) { + // If we have a scenecut, then stop at it. + // TODO(bohanli): add logic here to stop before the scenecut and for + // the next gop start from the scenecut with GF + int is_minor_sc = + (regions[scenecut_idx].avg_cor_coeff * + (1 - stats[regions[scenecut_idx].start - offset].noise_var / + regions[scenecut_idx].avg_intra_err) > + 0.6); + cur_last = regions[scenecut_idx].last - offset - !is_minor_sc; + } else { + int is_last_analysed = (k_last == num_regions - 1) && + (cur_last + offset == regions[k_last].last); + int not_enough_regions = + k_last - k_start <= + 1 + (regions[k_start].type == SCENECUT_REGION); + // if we are very close to the end, then do not shrink since it may + // introduce intervals that are too short + if (!(is_last_analysed && not_enough_regions)) { + const double arf_length_factor = 0.1; + double best_score = 0; + int best_j = -1; + const int first_frame = regions[0].start - offset; + const int last_frame = regions[num_regions - 1].last - offset; + // score of how much the arf helps the whole GOP + double base_score = 0.0; + // Accumulate base_score in + for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) { + if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break; + base_score = (base_score + 1.0) * stats[j].cor_coeff; + } + int met_blending = 0; // Whether we have met blending areas before + int last_blending = 0; // Whether the previous frame if blending + for (int j = cur_start + min_shrink_int; j <= cur_last; j++) { + if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break; + base_score = (base_score + 1.0) * stats[j].cor_coeff; + int this_reg = + find_regions_index(regions, num_regions, j + offset); + if (this_reg < 0) continue; + // A GOP should include at most 1 blending region. + if (regions[this_reg].type == BLENDING_REGION) { + last_blending = 1; + if (met_blending) { + break; + } else { + base_score = 0; + continue; + } + } else { + if (last_blending) met_blending = 1; + last_blending = 0; + } + + // Add the factor of how good the neighborhood is for this + // candidate arf. 
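// For intuition, the forward half of the candidate-ARF score assembled just
// below: base_score is a correlation-weighted run length, and each of up to
// three following frames adds a term that decays with accumulated correlation
// and is discounted by its noise-to-intra-error ratio. The backward half over
// preceding frames mirrors this. Scalar arrays and the name are illustrative.
#if 0
static double arf_candidate_score_sketch(const double *cor_coeff,
                                         const double *noise_var,
                                         const double *intra_error, int j,
                                         int last_frame, double base_score) {
  double score = 0.1 * base_score;  // arf_length_factor
  double accu = 1.0;
  for (int n = j + 1; n <= j + 3 && n <= last_frame; ++n) {
    accu *= cor_coeff[n];
    const double denom = intra_error[n] > 0.001 ? intra_error[n] : 0.001;
    const double clean = 1.0 - noise_var[n] / denom;
    score += accu * sqrt(clean > 0.5 ? clean : 0.5);
  }
  return score;
}
#endif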
+ double this_score = arf_length_factor * base_score; + double temp_accu_coeff = 1.0; + // following frames + int count_f = 0; + for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) { + if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break; + temp_accu_coeff *= stats[n].cor_coeff; + this_score += + temp_accu_coeff * + sqrt(AOMMAX(0.5, + 1 - stats[n].noise_var / + AOMMAX(stats[n].intra_error, 0.001))); + count_f++; + } + // preceding frames + temp_accu_coeff = 1.0; + for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) { + if (stats + n < twopass->stats_buf_ctx->stats_in_start) break; + temp_accu_coeff *= stats[n].cor_coeff; + this_score += + temp_accu_coeff * + sqrt(AOMMAX(0.5, + 1 - stats[n].noise_var / + AOMMAX(stats[n].intra_error, 0.001))); + } + + if (this_score > best_score) { + best_score = this_score; + best_j = j; + } + } + + // For blending areas, move one more frame in case we missed the + // first blending frame. + int best_reg = + find_regions_index(regions, num_regions, best_j + offset); + if (best_reg < num_regions - 1 && best_reg > 0) { + if (regions[best_reg - 1].type == BLENDING_REGION && + regions[best_reg + 1].type == BLENDING_REGION) { + if (best_j + offset == regions[best_reg].start && + best_j + offset < regions[best_reg].last) { + best_j += 1; + } else if (best_j + offset == regions[best_reg].last && + best_j + offset > regions[best_reg].start) { + best_j -= 1; + } + } + } + + if (cur_last - best_j < 2) best_j = cur_last; + if (best_j > 0 && best_score > 0.1) cur_last = best_j; + // if cannot find anything, just cut at the original place. + } + } + } + cut_pos[count_cuts] = cur_last; + count_cuts++; + + // reset pointers to the shrunken location + cpi->twopass_frame.stats_in = start_pos + cur_last; + cur_start = cur_last; + int cur_region_idx = + find_regions_index(regions, num_regions, cur_start + 1 + offset); + if (cur_region_idx >= 0) + if (regions[cur_region_idx].type == SCENECUT_REGION) cur_start++; + + i = cur_last; + + if (cut_here > 1 && cur_last == ori_last) break; + + // reset accumulators + init_gf_stats(&gf_stats); + } + ++i; + } + + // save intervals + rc->intervals_till_gf_calculate_due = count_cuts - 1; + for (int n = 1; n < count_cuts; n++) { + p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1]; + } + p_rc->cur_gf_index = 0; + cpi->twopass_frame.stats_in = start_pos; +} + +static void correct_frames_to_key(AV1_COMP *cpi) { + int lookahead_size = + (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); + if (lookahead_size < + av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) { + assert( + IMPLIES(cpi->oxcf.pass != AOM_RC_ONE_PASS && cpi->ppi->frames_left > 0, + lookahead_size == cpi->ppi->frames_left)); + cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size); + } else if (cpi->ppi->frames_left > 0) { + // Correct frames to key based on limit + cpi->rc.frames_to_key = + AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left); + } +} + +/*!\brief Define a GF group in one pass mode when no look ahead stats are + * available. + * + * \ingroup gf_group_algo + * This function defines the structure of a GF group, along with various + * parameters regarding bit-allocation and quality setup in the special + * case of one pass encoding where no lookahead stats are avialable. + * + * \param[in] cpi Top-level encoder structure + * + * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed. 
+ */ +static void define_gf_group_pass0(AV1_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const GFConfig *const gf_cfg = &oxcf->gf_cfg; + int target; + + if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_set_golden_update(cpi); + } else { + p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index]; + rc->intervals_till_gf_calculate_due--; + p_rc->cur_gf_index++; + } + + // correct frames_to_key when lookahead queue is flushing + correct_frames_to_key(cpi); + + if (p_rc->baseline_gf_interval > rc->frames_to_key) + p_rc->baseline_gf_interval = rc->frames_to_key; + + p_rc->gfu_boost = DEFAULT_GF_BOOST; + p_rc->constrained_gf_group = + (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; + + gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height; + + // Rare case when the look-ahead is less than the target GOP length, can't + // generate ARF frame. + if (p_rc->baseline_gf_interval > gf_cfg->lag_in_frames || + !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) || + p_rc->baseline_gf_interval < rc->min_gf_interval) + gf_group->max_layer_depth_allowed = 0; + + // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) + av1_gop_setup_structure(cpi); + + // Allocate bits to each of the frames in the GF group. + // TODO(sarahparker) Extend this to work with pyramid structure. + for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) { + const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index]; + if (oxcf->rc_cfg.mode == AOM_CBR) { + if (cur_update_type == KF_UPDATE) { + target = av1_calc_iframe_target_size_one_pass_cbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type); + } + } else { + if (cur_update_type == KF_UPDATE) { + target = av1_calc_iframe_target_size_one_pass_vbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type); + } + } + gf_group->bit_allocation[cur_index] = target; + } +} + +static INLINE void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc, + int arf_position) { + p_rc->baseline_gf_interval = arf_position; +} + +// initialize GF_GROUP_STATS +static void init_gf_stats(GF_GROUP_STATS *gf_stats) { + gf_stats->gf_group_err = 0.0; + gf_stats->gf_group_raw_error = 0.0; + gf_stats->gf_group_skip_pct = 0.0; + gf_stats->gf_group_inactive_zone_rows = 0.0; + + gf_stats->mv_ratio_accumulator = 0.0; + gf_stats->decay_accumulator = 1.0; + gf_stats->zero_motion_accumulator = 1.0; + gf_stats->loop_decay_rate = 1.0; + gf_stats->last_loop_decay_rate = 1.0; + gf_stats->this_frame_mv_in_out = 0.0; + gf_stats->mv_in_out_accumulator = 0.0; + gf_stats->abs_mv_in_out_accumulator = 0.0; + + gf_stats->avg_sr_coded_error = 0.0; + gf_stats->avg_pcnt_second_ref = 0.0; + gf_stats->avg_new_mv_count = 0.0; + gf_stats->avg_wavelet_energy = 0.0; + gf_stats->avg_raw_err_stdev = 0.0; + gf_stats->non_zero_stdev_count = 0; +} + +static void accumulate_gop_stats(AV1_COMP *cpi, int is_intra_only, int f_w, + int f_h, FIRSTPASS_STATS *next_frame, + const FIRSTPASS_STATS *start_pos, + GF_GROUP_STATS *gf_stats, int *idx) { + int i, flash_detected; + TWO_PASS *const twopass = &cpi->ppi->twopass; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + RATE_CONTROL *const rc = &cpi->rc; + FRAME_INFO *frame_info = &cpi->frame_info; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + 
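+ // Note: this helper scans the first-pass stats twice over the tentative
+ // GOP length stored in p_rc->gf_intervals[p_rc->cur_gf_index].  The first
+ // scan accumulates each frame's modified error; the second scan (after
+ // rewinding to start_pos) accumulates the motion and prediction statistics
+ // that later drive the ARF decisions.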
init_gf_stats(gf_stats); + av1_zero(*next_frame); + + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + i = is_intra_only; + // get the determined gf group length from p_rc->gf_intervals + while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { + // read in the next frame + if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break; + // Accumulate error score of frames in this gf group. + double mod_frame_err = + calculate_modified_err(frame_info, twopass, oxcf, next_frame); + // accumulate stats for this frame + accumulate_this_frame_stats(next_frame, mod_frame_err, gf_stats); + ++i; + } + + reset_fpf_position(&cpi->twopass_frame, start_pos); + + i = is_intra_only; + input_stats(twopass, &cpi->twopass_frame, next_frame); + while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { + // read in the next frame + if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break; + + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0); + + // accumulate stats for next frame + accumulate_next_frame_stats(next_frame, flash_detected, + rc->frames_since_key, i, gf_stats, f_w, f_h); + + ++i; + } + + i = p_rc->gf_intervals[p_rc->cur_gf_index]; + average_gf_stats(i, gf_stats); + + *idx = i; +} + +static void update_gop_length(RATE_CONTROL *rc, PRIMARY_RATE_CONTROL *p_rc, + int idx, int is_final_pass) { + if (is_final_pass) { + rc->intervals_till_gf_calculate_due--; + p_rc->cur_gf_index++; + } + + // Was the group length constrained by the requirement for a new KF? + p_rc->constrained_gf_group = (idx >= rc->frames_to_key) ? 1 : 0; + + set_baseline_gf_interval(p_rc, idx); + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; +} + +#define MAX_GF_BOOST 5400 +#define REDUCE_GF_LENGTH_THRESH 4 +#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9 +#define REDUCE_GF_LENGTH_BY 1 +static void set_gop_bits_boost(AV1_COMP *cpi, int i, int is_intra_only, + int is_final_pass, int use_alt_ref, + int alt_offset, const FIRSTPASS_STATS *start_pos, + GF_GROUP_STATS *gf_stats) { + // Should we use the alternate reference frame. + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + FRAME_INFO *frame_info = &cpi->frame_info; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + int ext_len = i - is_intra_only; + if (use_alt_ref) { + const int forward_frames = (rc->frames_to_key - i >= ext_len) + ? ext_len + : AOMMAX(0, rc->frames_to_key - i); + + // Calculate the boost for alt ref. 
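+ // For example (hypothetical numbers): with ext_len = 16 but only
+ // rc->frames_to_key - i = 10 frames left before the next key frame,
+ // forward_frames above is clamped to 10, so av1_calc_arf_boost() below
+ // never looks past the key frame when estimating the ARF boost.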
+ p_rc->gfu_boost = av1_calc_arf_boost( + twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, + forward_frames, ext_len, &p_rc->num_stats_used_for_gfu_boost, + &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled); + } else { + reset_fpf_position(&cpi->twopass_frame, start_pos); + p_rc->gfu_boost = AOMMIN( + MAX_GF_BOOST, + av1_calc_arf_boost( + twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, ext_len, + 0, &p_rc->num_stats_used_for_gfu_boost, + &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled)); + } + +#define LAST_ALR_BOOST_FACTOR 0.2f + p_rc->arf_boost_factor = 1.0; + if (use_alt_ref && !is_lossless_requested(rc_cfg)) { + // Reduce the boost of altref in the last gf group + if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY || + rc->frames_to_key - ext_len == 0) { + p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; + } + } + + // Reset the file position. + reset_fpf_position(&cpi->twopass_frame, start_pos); + if (cpi->ppi->lap_enabled) { + // Since we don't have enough stats to know the actual error of the + // gf group, we assume error of each frame to be equal to 1 and set + // the error of the group as baseline_gf_interval. + gf_stats->gf_group_err = p_rc->baseline_gf_interval; + } + // Calculate the bits to be allocated to the gf/arf group as a whole + p_rc->gf_group_bits = + calculate_total_gf_group_bits(cpi, gf_stats->gf_group_err); + +#if GROUP_ADAPTIVE_MAXQ + // Calculate an estimate of the maxq needed for the group. + // We are more aggressive about correcting for sections + // where there could be significant overshoot than for easier + // sections where we do not wish to risk creating an overshoot + // of the allocated bit budget. + if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) && + is_final_pass) { + const int vbr_group_bits_per_frame = + (int)(p_rc->gf_group_bits / p_rc->baseline_gf_interval); + const double group_av_err = + gf_stats->gf_group_raw_error / p_rc->baseline_gf_interval; + const double group_av_skip_pct = + gf_stats->gf_group_skip_pct / p_rc->baseline_gf_interval; + const double group_av_inactive_zone = + ((gf_stats->gf_group_inactive_zone_rows * 2) / + (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows)); + + int tmp_q; + tmp_q = get_twopass_worst_quality( + cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), + vbr_group_bits_per_frame); + rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1); + } +#endif + + // Adjust KF group bits and error remaining. + if (is_final_pass) twopass->kf_group_error_left -= gf_stats->gf_group_err; + + // Reset the file position. + reset_fpf_position(&cpi->twopass_frame, start_pos); + + // Calculate a section intra ratio used in setting max loop filter. + if (rc->frames_since_key != 0) { + twopass->section_intra_rating = calculate_section_intra_ratio( + start_pos, twopass->stats_buf_ctx->stats_in_end, + p_rc->baseline_gf_interval); + } + + av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0, + use_alt_ref, p_rc->gf_group_bits); + + // TODO(jingning): Generalize this condition. + if (is_final_pass) { + cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref; + + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; + } +#if CONFIG_BITRATE_ACCURACY + if (is_final_pass) { + av1_vbr_rc_set_gop_bit_budget(&cpi->vbr_rc_info, + p_rc->baseline_gf_interval); + } +#endif +} + +/*!\brief Define a GF group. 
+ * + * \ingroup gf_group_algo + * This function defines the structure of a GF group, along with various + * parameters regarding bit-allocation and quality setup. + * + * \param[in] cpi Top-level encoder structure + * \param[in] frame_params Structure with frame parameters + * \param[in] is_final_pass Whether this is the final pass for the + * GF group, or a trial (non-zero) + * + * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed. + */ +static void define_gf_group(AV1_COMP *cpi, EncodeFrameParams *frame_params, + int is_final_pass) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + const GFConfig *const gf_cfg = &oxcf->gf_cfg; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + const int f_w = cm->width; + const int f_h = cm->height; + int i; + const int is_intra_only = rc->frames_since_key == 0; + + cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1); + + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. + if (!is_intra_only) { + av1_zero(cpi->ppi->gf_group); + cpi->gf_frame_index = 0; + } + + if (has_no_stats_stage(cpi)) { + define_gf_group_pass0(cpi); + return; + } + + if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) { + int ret = define_gf_group_pass3(cpi, frame_params, is_final_pass); + if (ret == 0) return; + + av1_free_thirdpass_ctx(cpi->third_pass_ctx); + cpi->third_pass_ctx = NULL; + } + + // correct frames_to_key when lookahead queue is emptying + if (cpi->ppi->lap_enabled) { + correct_frames_to_key(cpi); + } + + GF_GROUP_STATS gf_stats; + accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos, + &gf_stats, &i); + + const int can_disable_arf = !gf_cfg->gf_min_pyr_height; + + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + const int active_min_gf_interval = rc->min_gf_interval; + + // Disable internal ARFs for "still" gf groups. + // zero_motion_accumulator: minimum percentage of (0,0) motion; + // avg_sr_coded_error: average of the SSE per pixel of each frame; + // avg_raw_err_stdev: average of the standard deviation of (0,0) + // motion error per block of each frame. + const int can_disable_internal_arfs = gf_cfg->gf_min_pyr_height <= 1; + if (can_disable_internal_arfs && + gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION && + gf_stats.avg_sr_coded_error < MAX_SR_CODED_ERROR && + gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) { + cpi->ppi->internal_altref_allowed = 0; + } + + int use_alt_ref; + if (can_disable_arf) { + use_alt_ref = + !is_almost_static(gf_stats.zero_motion_accumulator, + twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) && + p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && + (i >= MIN_GF_INTERVAL); + } else { + use_alt_ref = p_rc->use_arf_in_this_kf_group && + (i < gf_cfg->lag_in_frames) && (i > 2); + } + if (use_alt_ref) { + gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height; + } else { + gf_group->max_layer_depth_allowed = 0; + } + + int alt_offset = 0; + // The length reduction strategy is tweaked for certain cases, and doesn't + // work well for certain other cases. 
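+ // Concretely, the reduction below is only attempted for constant-quality
+ // runs at moderate cq_level (<= 128) or when internal ARFs are disabled,
+ // and never when lossless coding is requested.  As a hypothetical example,
+ // with i = 16 and rc->frames_to_key = 16 (only the overlay frame left),
+ // the group is rolled back by REDUCE_GF_LENGTH_BY to 15 frames, provided
+ // the shortened length still respects active_min_gf_interval.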
+ const int allow_gf_length_reduction = + ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) || + !cpi->ppi->internal_altref_allowed) && + !is_lossless_requested(rc_cfg); + + if (allow_gf_length_reduction && use_alt_ref) { + // adjust length of this gf group if one of the following condition met + // 1: only one overlay frame left and this gf is too long + // 2: next gf group is too short to have arf compared to the current gf + + // maximum length of next gf group + const int next_gf_len = rc->frames_to_key - i; + const int single_overlay_left = + next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH; + // the next gf is probably going to have a ARF but it will be shorter than + // this gf + const int unbalanced_gf = + i > REDUCE_GF_LENGTH_TO_KEY_THRESH && + next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH && + next_gf_len + 1 >= rc->min_gf_interval; + + if (single_overlay_left || unbalanced_gf) { + const int roll_back = REDUCE_GF_LENGTH_BY; + // Reduce length only if active_min_gf_interval will be respected later. + if (i - roll_back >= active_min_gf_interval + 1) { + alt_offset = -roll_back; + i -= roll_back; + if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; + p_rc->gf_intervals[p_rc->cur_gf_index] -= roll_back; + reset_fpf_position(&cpi->twopass_frame, start_pos); + accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, + start_pos, &gf_stats, &i); + } + } + } + + update_gop_length(rc, p_rc, i, is_final_pass); + + // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) + av1_gop_setup_structure(cpi); + + set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref, + alt_offset, start_pos, &gf_stats); + + frame_params->frame_type = + rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME; + frame_params->show_frame = + !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE); +} + +/*!\brief Define a GF group for the third apss. + * + * \ingroup gf_group_algo + * This function defines the structure of a GF group for the third pass, along + * with various parameters regarding bit-allocation and quality setup based on + * the two-pass bitstream. + * Much of the function still uses the strategies used for the second pass and + * relies on first pass statistics. It is expected that over time these portions + * would be replaced with strategies specific to the third pass. + * + * \param[in] cpi Top-level encoder structure + * \param[in] frame_params Structure with frame parameters + * \param[in] is_final_pass Whether this is the final pass for the + * GF group, or a trial (non-zero) + * + * \return 0: Success; + * -1: There are conflicts between the bitstream and current config + * The values in cpi->ppi->gf_group are also changed. 
+ */ +static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params, + int is_final_pass) { + if (!cpi->third_pass_ctx) return -1; + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + const GFConfig *const gf_cfg = &oxcf->gf_cfg; + const int f_w = cm->width; + const int f_h = cm->height; + int i; + const int is_intra_only = rc->frames_since_key == 0; + + cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1); + + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. + if (!is_intra_only) { + av1_zero(cpi->ppi->gf_group); + cpi->gf_frame_index = 0; + } + + GF_GROUP_STATS gf_stats; + accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos, + &gf_stats, &i); + + const int can_disable_arf = !gf_cfg->gf_min_pyr_height; + + // TODO(any): set cpi->ppi->internal_altref_allowed accordingly; + + int use_alt_ref = av1_check_use_arf(cpi->third_pass_ctx); + if (use_alt_ref == 0 && !can_disable_arf) return -1; + if (use_alt_ref) { + gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height; + } else { + gf_group->max_layer_depth_allowed = 0; + } + + update_gop_length(rc, p_rc, i, is_final_pass); + + // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) + av1_gop_setup_structure(cpi); + + set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref, 0, + start_pos, &gf_stats); + + frame_params->frame_type = cpi->third_pass_ctx->frame_info[0].frame_type; + frame_params->show_frame = cpi->third_pass_ctx->frame_info[0].is_show_frame; + return 0; +} + +// #define FIXED_ARF_BITS +#ifdef FIXED_ARF_BITS +#define ARF_BITS_FRACTION 0.75 +#endif +void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, + GF_GROUP *gf_group, int is_key_frame, int use_arf, + int64_t gf_group_bits) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + // Calculate the extra bits to be used for boosted frame(s) +#ifdef FIXED_ARF_BITS + int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits); +#else + int gf_arf_bits = calculate_boost_bits( + p_rc->baseline_gf_interval - (rc->frames_since_key == 0), p_rc->gfu_boost, + gf_group_bits); +#endif + + gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits, + gf_group_bits, 1); + + // Allocate bits to each of the frames in the GF group. + allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, gf_arf_bits, + is_key_frame, use_arf); +} + +// Minimum % intra coding observed in first pass (1.0 = 100%) +#define MIN_INTRA_LEVEL 0.25 +// Minimum ratio between the % of intra coding and inter coding in the first +// pass after discounting neutral blocks (discounting neutral blocks in this +// way helps catch scene cuts in clips with very flat areas or letter box +// format clips with image padding. +#define INTRA_VS_INTER_THRESH 2.0 +// Hard threshold where the first pass chooses intra for almost all blocks. +// In such a case even if the frame is not a scene cut coding a key frame +// may be a good option. +#define VERY_LOW_INTER_THRESH 0.05 +// Maximum threshold for the relative ratio of intra error score vs best +// inter error score. 
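+// For example, a candidate frame whose intra error is 1.5x its best inter
+// (coded) error can still pass this check, while a ratio of 2.5 fails it;
+// see the "< KF_II_ERR_THRESHOLD" test in test_candidate_kf() below.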
+#define KF_II_ERR_THRESHOLD 1.9 +// In real scene cuts there is almost always a sharp change in the intra +// or inter error score. +#define ERR_CHANGE_THRESHOLD 0.4 +// For real scene cuts we expect an improvment in the intra inter error +// ratio in the next frame. +#define II_IMPROVEMENT_THRESHOLD 3.5 +#define KF_II_MAX 128.0 +// Intra / Inter threshold very low +#define VERY_LOW_II 1.5 +// Clean slide transitions we expect a sharp single frame spike in error. +#define ERROR_SPIKE 5.0 + +// Slide show transition detection. +// Tests for case where there is very low error either side of the current frame +// but much higher just for this frame. This can help detect key frames in +// slide shows even where the slides are pictures of different sizes. +// Also requires that intra and inter errors are very similar to help eliminate +// harmful false positives. +// It will not help if the transition is a fade or other multi-frame effect. +static int slide_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) && + (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) && + (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE)); +} + +// Threshold for use of the lagging second reference frame. High second ref +// usage may point to a transient event like a flash or occlusion rather than +// a real scene cut. +// We adapt the threshold based on number of frames in this key-frame group so +// far. +static double get_second_ref_usage_thresh(int frame_count_so_far) { + const int adapt_upto = 32; + const double min_second_ref_usage_thresh = 0.085; + const double second_ref_usage_thresh_max_delta = 0.035; + if (frame_count_so_far >= adapt_upto) { + return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta; + } + return min_second_ref_usage_thresh + + ((double)frame_count_so_far / (adapt_upto - 1)) * + second_ref_usage_thresh_max_delta; +} + +static int test_candidate_kf(const FIRSTPASS_INFO *firstpass_info, + int this_stats_index, int frame_count_so_far, + enum aom_rc_mode rc_mode, int scenecut_mode, + int num_mbs) { + const FIRSTPASS_STATS *last_stats = + av1_firstpass_info_peek(firstpass_info, this_stats_index - 1); + const FIRSTPASS_STATS *this_stats = + av1_firstpass_info_peek(firstpass_info, this_stats_index); + const FIRSTPASS_STATS *next_stats = + av1_firstpass_info_peek(firstpass_info, this_stats_index + 1); + if (last_stats == NULL || this_stats == NULL || next_stats == NULL) { + return 0; + } + + int is_viable_kf = 0; + double pcnt_intra = 1.0 - this_stats->pcnt_inter; + double modified_pcnt_inter = + this_stats->pcnt_inter - this_stats->pcnt_neutral; + const double second_ref_usage_thresh = + get_second_ref_usage_thresh(frame_count_so_far); + int frames_to_test_after_candidate_key = SCENE_CUT_KEY_TEST_INTERVAL; + int count_for_tolerable_prediction = 3; + + // We do "-1" because the candidate key is not counted. + int stats_after_this_stats = + av1_firstpass_info_future_count(firstpass_info, this_stats_index) - 1; + + if (scenecut_mode == ENABLE_SCENECUT_MODE_1) { + if (stats_after_this_stats < 3) { + return 0; + } else { + frames_to_test_after_candidate_key = 3; + count_for_tolerable_prediction = 1; + } + } + // Make sure we have enough stats after the candidate key. 
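+ // Note on get_second_ref_usage_thresh() above: the threshold ramps
+ // linearly from 0.085 towards 0.12 as the key-frame group grows.  For
+ // example, at frame_count_so_far = 16 it is roughly
+ // 0.085 + (16 / 31) * 0.035 ~= 0.103, and it saturates at 0.12 once 32
+ // frames have been reached.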
+ frames_to_test_after_candidate_key = + AOMMIN(frames_to_test_after_candidate_key, stats_after_this_stats); + + // Does the frame satisfy the primary criteria of a key frame? + // See above for an explanation of the test criteria. + // If so, then examine how well it predicts subsequent frames. + if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) && + (this_stats->pcnt_second_ref < second_ref_usage_thresh) && + (next_stats->pcnt_second_ref < second_ref_usage_thresh) && + ((this_stats->pcnt_inter < VERY_LOW_INTER_THRESH) || + slide_transition(this_stats, last_stats, next_stats) || + ((pcnt_intra > MIN_INTRA_LEVEL) && + (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && + ((this_stats->intra_error / + DOUBLE_DIVIDE_CHECK(this_stats->coded_error)) < + KF_II_ERR_THRESHOLD) && + ((fabs(last_stats->coded_error - this_stats->coded_error) / + DOUBLE_DIVIDE_CHECK(this_stats->coded_error) > + ERR_CHANGE_THRESHOLD) || + (fabs(last_stats->intra_error - this_stats->intra_error) / + DOUBLE_DIVIDE_CHECK(this_stats->intra_error) > + ERR_CHANGE_THRESHOLD) || + ((next_stats->intra_error / + DOUBLE_DIVIDE_CHECK(next_stats->coded_error)) > + II_IMPROVEMENT_THRESHOLD))))) { + int i; + double boost_score = 0.0; + double old_boost_score = 0.0; + double decay_accumulator = 1.0; + + // Examine how well the key frame predicts subsequent frames. + for (i = 1; i <= frames_to_test_after_candidate_key; ++i) { + // Get the next frame details + const FIRSTPASS_STATS *local_next_frame = + av1_firstpass_info_peek(firstpass_info, this_stats_index + i); + double next_iiratio = + (BOOST_FACTOR * local_next_frame->intra_error / + DOUBLE_DIVIDE_CHECK(local_next_frame->coded_error)); + + if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; + + // Cumulative effect of decay in prediction quality. + if (local_next_frame->pcnt_inter > 0.85) + decay_accumulator *= local_next_frame->pcnt_inter; + else + decay_accumulator *= (0.85 + local_next_frame->pcnt_inter) / 2.0; + + // Keep a running total. + boost_score += (decay_accumulator * next_iiratio); + + // Test various breakout clauses. + // TODO(any): Test of intra error should be normalized to an MB. + if ((local_next_frame->pcnt_inter < 0.05) || (next_iiratio < 1.5) || + (((local_next_frame->pcnt_inter - local_next_frame->pcnt_neutral) < + 0.20) && + (next_iiratio < 3.0)) || + ((boost_score - old_boost_score) < 3.0) || + (local_next_frame->intra_error < (200.0 / (double)num_mbs))) { + break; + } + + old_boost_score = boost_score; + } + + // If there is tolerable prediction for at least the next 3 frames then + // break out else discard this potential key frame and move on + if (boost_score > 30.0 && (i > count_for_tolerable_prediction)) { + is_viable_kf = 1; + } else { + is_viable_kf = 0; + } + } + return is_viable_kf; +} + +#define FRAMES_TO_CHECK_DECAY 8 +#define KF_MIN_FRAME_BOOST 80.0 +#define KF_MAX_FRAME_BOOST 128.0 +#define MIN_KF_BOOST 600 // Minimum boost for non-static KF interval +#define MAX_KF_BOOST 3200 +#define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval + +static int detect_app_forced_key(AV1_COMP *cpi) { + int num_frames_to_app_forced_key = is_forced_keyframe_pending( + cpi->ppi->lookahead, cpi->ppi->lookahead->max_sz, cpi->compressor_stage); + return num_frames_to_app_forced_key; +} + +static int get_projected_kf_boost(AV1_COMP *cpi) { + /* + * If num_stats_used_for_kf_boost >= frames_to_key, then + * all stats needed for prior boost calculation are available. 
+ * Hence projecting the prior boost is not needed in this cases. + */ + if (cpi->ppi->p_rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key) + return cpi->ppi->p_rc.kf_boost; + + // Get the current tpl factor (number of frames = frames_to_key). + double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key); + // Get the tpl factor when number of frames = num_stats_used_for_kf_boost. + double tpl_factor_num_stats = av1_get_kf_boost_projection_factor( + cpi->ppi->p_rc.num_stats_used_for_kf_boost); + int projected_kf_boost = + (int)rint((tpl_factor * cpi->ppi->p_rc.kf_boost) / tpl_factor_num_stats); + return projected_kf_boost; +} + +/*!\brief Determine the location of the next key frame + * + * \ingroup gf_group_algo + * This function decides the placement of the next key frame when a + * scenecut is detected or the maximum key frame distance is reached. + * + * \param[in] cpi Top-level encoder structure + * \param[in] firstpass_info struct for firstpass info + * \param[in] num_frames_to_detect_scenecut Maximum lookahead frames. + * \param[in] search_start_idx the start index for searching key frame. + * Set it to one if we already know the + * current frame is key frame. Otherwise, + * set it to zero. + * + * \return Number of frames to the next key including the current frame. + */ +static int define_kf_interval(AV1_COMP *cpi, + const FIRSTPASS_INFO *firstpass_info, + int num_frames_to_detect_scenecut, + int search_start_idx) { + const TWO_PASS *const twopass = &cpi->ppi->twopass; + const RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; + double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + double decay_accumulator = 1.0; + int i = 0, j; + int frames_to_key = search_start_idx; + int frames_since_key = rc->frames_since_key + 1; + int scenecut_detected = 0; + + int num_frames_to_next_key = detect_app_forced_key(cpi); + + if (num_frames_to_detect_scenecut == 0) { + if (num_frames_to_next_key != -1) + return num_frames_to_next_key; + else + return rc->frames_to_key; + } + + if (num_frames_to_next_key != -1) + num_frames_to_detect_scenecut = + AOMMIN(num_frames_to_detect_scenecut, num_frames_to_next_key); + + // Initialize the decay rates for the recent frames to check + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; + + i = 0; + const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.mi_params.MBs; + const int future_stats_count = + av1_firstpass_info_future_count(firstpass_info, 0); + while (frames_to_key < future_stats_count && + frames_to_key < num_frames_to_detect_scenecut) { + // Provided that we are not at the end of the file... + if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key && + frames_to_key + 1 < future_stats_count) { + double loop_decay_rate; + + // Check for a scene cut. + if (frames_since_key >= kf_cfg->key_freq_min) { + scenecut_detected = test_candidate_kf( + &twopass->firstpass_info, frames_to_key, frames_since_key, + oxcf->rc_cfg.mode, cpi->ppi->p_rc.enable_scenecut_detection, + num_mbs); + if (scenecut_detected) { + break; + } + } + + // How fast is the prediction quality decaying? + const FIRSTPASS_STATS *next_stats = + av1_firstpass_info_peek(firstpass_info, frames_to_key + 1); + loop_decay_rate = get_prediction_decay_rate(next_stats); + + // We want to know something about the recent past... 
rather than + // as used elsewhere where we are concerned with decay in prediction + // quality since the last GF or KF. + recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) + decay_accumulator *= recent_loop_decay[j]; + + // Special check for transition or high motion followed by a + // static scene. + if (frames_since_key >= kf_cfg->key_freq_min) { + scenecut_detected = detect_transition_to_still( + firstpass_info, frames_to_key + 1, rc->min_gf_interval, i, + kf_cfg->key_freq_max - i, loop_decay_rate, decay_accumulator); + if (scenecut_detected) { + // In the case of transition followed by a static scene, the key frame + // could be a good predictor for the following frames, therefore we + // do not use an arf. + p_rc->use_arf_in_this_kf_group = 0; + break; + } + } + + // Step on to the next frame. + ++frames_to_key; + ++frames_since_key; + + // If we don't have a real key frame within the next two + // key_freq_max intervals then break out of the loop. + if (frames_to_key >= 2 * kf_cfg->key_freq_max) { + break; + } + } else { + ++frames_to_key; + ++frames_since_key; + } + ++i; + } + if (cpi->ppi->lap_enabled && !scenecut_detected) + frames_to_key = num_frames_to_next_key; + + return frames_to_key; +} + +static double get_kf_group_avg_error(TWO_PASS *twopass, + TWO_PASS_FRAME *twopass_frame, + const FIRSTPASS_STATS *first_frame, + const FIRSTPASS_STATS *start_position, + int frames_to_key) { + FIRSTPASS_STATS cur_frame = *first_frame; + int num_frames, i; + double kf_group_avg_error = 0.0; + + reset_fpf_position(twopass_frame, start_position); + + for (i = 0; i < frames_to_key; ++i) { + kf_group_avg_error += cur_frame.coded_error; + if (EOF == input_stats(twopass, twopass_frame, &cur_frame)) break; + } + num_frames = i + 1; + num_frames = AOMMIN(num_frames, frames_to_key); + kf_group_avg_error = kf_group_avg_error / num_frames; + + return (kf_group_avg_error); +} + +static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err, + double kf_group_avg_error) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + int64_t kf_group_bits; + if (cpi->ppi->lap_enabled) { + kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth; + if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) { + double vbr_corpus_complexity_lap = + cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0; + /* Get the average corpus complexity of the frame */ + kf_group_bits = (int64_t)( + kf_group_bits * (kf_group_avg_error / vbr_corpus_complexity_lap)); + } + } else { + kf_group_bits = (int64_t)(twopass->bits_left * + (kf_group_err / twopass->modified_error_left)); + } + + return kf_group_bits; +} + +static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS cur_frame; + av1_zero(cur_frame); + int num_frames = 0; + // Accumulate total stat using available number of stats. 
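+ // For example, with rc->frames_to_key = 9 the loop below reads at most 8
+ // stats packets (stopping early at EOF); each accumulated field is then
+ // divided by the number of frames actually read, provided at least two
+ // frames were available.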
+ for (num_frames = 0; num_frames < (rc->frames_to_key - 1); ++num_frames) { + if (EOF == input_stats(twopass, &cpi->twopass_frame, &cur_frame)) break; + av1_accumulate_stats(avg_frame_stat, &cur_frame); + } + + if (num_frames < 2) { + return num_frames; + } + // Average the total stat + avg_frame_stat->weight = avg_frame_stat->weight / num_frames; + avg_frame_stat->intra_error = avg_frame_stat->intra_error / num_frames; + avg_frame_stat->frame_avg_wavelet_energy = + avg_frame_stat->frame_avg_wavelet_energy / num_frames; + avg_frame_stat->coded_error = avg_frame_stat->coded_error / num_frames; + avg_frame_stat->sr_coded_error = avg_frame_stat->sr_coded_error / num_frames; + avg_frame_stat->pcnt_inter = avg_frame_stat->pcnt_inter / num_frames; + avg_frame_stat->pcnt_motion = avg_frame_stat->pcnt_motion / num_frames; + avg_frame_stat->pcnt_second_ref = + avg_frame_stat->pcnt_second_ref / num_frames; + avg_frame_stat->pcnt_neutral = avg_frame_stat->pcnt_neutral / num_frames; + avg_frame_stat->intra_skip_pct = avg_frame_stat->intra_skip_pct / num_frames; + avg_frame_stat->inactive_zone_rows = + avg_frame_stat->inactive_zone_rows / num_frames; + avg_frame_stat->inactive_zone_cols = + avg_frame_stat->inactive_zone_cols / num_frames; + avg_frame_stat->MVr = avg_frame_stat->MVr / num_frames; + avg_frame_stat->mvr_abs = avg_frame_stat->mvr_abs / num_frames; + avg_frame_stat->MVc = avg_frame_stat->MVc / num_frames; + avg_frame_stat->mvc_abs = avg_frame_stat->mvc_abs / num_frames; + avg_frame_stat->MVrv = avg_frame_stat->MVrv / num_frames; + avg_frame_stat->MVcv = avg_frame_stat->MVcv / num_frames; + avg_frame_stat->mv_in_out_count = + avg_frame_stat->mv_in_out_count / num_frames; + avg_frame_stat->new_mv_count = avg_frame_stat->new_mv_count / num_frames; + avg_frame_stat->count = avg_frame_stat->count / num_frames; + avg_frame_stat->duration = avg_frame_stat->duration / num_frames; + + return num_frames; +} + +static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err, + double *zero_motion_accumulator, + double *sr_accumulator, int use_avg_stat) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FRAME_INFO *const frame_info = &cpi->frame_info; + FIRSTPASS_STATS frame_stat; + av1_zero(frame_stat); + int i = 0, num_stat_used = 0; + double boost_score = 0.0; + const double kf_max_boost = + cpi->oxcf.rc_cfg.mode == AOM_Q + ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST), + KF_MAX_FRAME_BOOST) + : KF_MAX_FRAME_BOOST; + + // Calculate the average using available number of stats. + if (use_avg_stat) num_stat_used = calc_avg_stats(cpi, &frame_stat); + + for (i = num_stat_used; i < (rc->frames_to_key - 1); ++i) { + if (!use_avg_stat && + EOF == input_stats(twopass, &cpi->twopass_frame, &frame_stat)) + break; + + // Monitor for static sections. + // For the first frame in kf group, the second ref indicator is invalid. + if (i > 0) { + *zero_motion_accumulator = + AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat)); + } else { + *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion; + } + + // Not all frames in the group are necessarily used in calculating boost. + if ((*sr_accumulator < (kf_raw_err * 1.50)) && + (i <= rc->max_gf_interval * 2)) { + double frame_boost; + double zm_factor; + + // Factor 0.75-1.25 based on how much of frame is static. 
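+ // For example, a fully static section (*zero_motion_accumulator == 1.0)
+ // gives zm_factor = 0.75 + 0.5 = 1.25, while a section with no zero-motion
+ // blocks at all gives the minimum factor of 0.75.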
+ zm_factor = (0.75 + (*zero_motion_accumulator / 2.0)); + + if (i < 2) *sr_accumulator = 0.0; + frame_boost = + calc_kf_frame_boost(&cpi->ppi->p_rc, frame_info, &frame_stat, + sr_accumulator, kf_max_boost); + boost_score += frame_boost * zm_factor; + } + } + return boost_score; +} + +/*!\brief Interval(in seconds) to clip key-frame distance to in LAP. + */ +#define MAX_KF_BITS_INTERVAL_SINGLE_PASS 5 + +/*!\brief Determine the next key frame group + * + * \ingroup gf_group_algo + * This function decides the placement of the next key frame, and + * calculates the bit allocation of the KF group and the keyframe itself. + * + * \param[in] cpi Top-level encoder structure + * \param[in] this_frame Pointer to first pass stats + */ +static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + FRAME_INFO *const frame_info = &cpi->frame_info; + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; + const FIRSTPASS_STATS first_frame = *this_frame; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_INFO *firstpass_info = &twopass->firstpass_info; + av1_zero(next_frame); + + rc->frames_since_key = 0; + // Use arfs if possible. + p_rc->use_arf_in_this_kf_group = is_altref_enabled( + oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf); + + // Reset the GF group data structures. + av1_zero(*gf_group); + cpi->gf_frame_index = 0; + + // KF is always a GF so clear frames till next gf counter. + rc->frames_till_gf_update_due = 0; + + if (has_no_stats_stage(cpi)) { + int num_frames_to_app_forced_key = detect_app_forced_key(cpi); + p_rc->this_key_frame_forced = + current_frame->frame_number != 0 && rc->frames_to_key == 0; + if (num_frames_to_app_forced_key != -1) + rc->frames_to_key = num_frames_to_app_forced_key; + else + rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max); + correct_frames_to_key(cpi); + p_rc->kf_boost = DEFAULT_KF_BOOST; + gf_group->update_type[0] = KF_UPDATE; + return; + } + int i; + const FIRSTPASS_STATS *const start_position = cpi->twopass_frame.stats_in; + int kf_bits = 0; + double zero_motion_accumulator = 1.0; + double boost_score = 0.0; + double kf_raw_err = 0.0; + double kf_mod_err = 0.0; + double sr_accumulator = 0.0; + double kf_group_avg_error = 0.0; + int frames_to_key, frames_to_key_clipped = INT_MAX; + int64_t kf_group_bits_clipped = INT64_MAX; + + // Is this a forced key frame by interval. + p_rc->this_key_frame_forced = p_rc->next_key_frame_forced; + + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0; // Group modified error score. + + kf_raw_err = this_frame->intra_error; + kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame); + + // We assume the current frame is a key frame and we are looking for the next + // key frame. Therefore search_start_idx = 1 + frames_to_key = define_kf_interval(cpi, firstpass_info, kf_cfg->key_freq_max, + /*search_start_idx=*/1); + + if (frames_to_key != -1) { + rc->frames_to_key = AOMMIN(kf_cfg->key_freq_max, frames_to_key); + } else { + rc->frames_to_key = kf_cfg->key_freq_max; + } + + if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi); + + // If there is a max kf interval set by the user we must obey it. 
+ // We already breakout of the loop above at 2x max. + // This code centers the extra kf if the actual natural interval + // is between 1x and 2x. + if (kf_cfg->auto_key && rc->frames_to_key > kf_cfg->key_freq_max) { + FIRSTPASS_STATS tmp_frame = first_frame; + + rc->frames_to_key /= 2; + + // Reset to the start of the group. + reset_fpf_position(&cpi->twopass_frame, start_position); + // Rescan to get the correct error data for the forced kf group. + for (i = 0; i < rc->frames_to_key; ++i) { + if (EOF == input_stats(twopass, &cpi->twopass_frame, &tmp_frame)) break; + } + p_rc->next_key_frame_forced = 1; + } else if ((cpi->twopass_frame.stats_in == + twopass->stats_buf_ctx->stats_in_end && + is_stat_consumption_stage_twopass(cpi)) || + rc->frames_to_key >= kf_cfg->key_freq_max) { + p_rc->next_key_frame_forced = 1; + } else { + p_rc->next_key_frame_forced = 0; + } + + double kf_group_err = 0; + for (i = 0; i < rc->frames_to_key; ++i) { + const FIRSTPASS_STATS *this_stats = + av1_firstpass_info_peek(&twopass->firstpass_info, i); + if (this_stats != NULL) { + // Accumulate kf group error. + kf_group_err += calculate_modified_err_new( + frame_info, &firstpass_info->total_stats, this_stats, + oxcf->rc_cfg.vbrbias, twopass->modified_error_min, + twopass->modified_error_max); + ++p_rc->num_stats_used_for_kf_boost; + } + } + + // Calculate the number of bits that should be assigned to the kf group. + if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) || + (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) { + // Maximum number of bits for a single normal frame (not key frame). + const int max_bits = frame_max_bits(rc, oxcf); + + // Maximum number of bits allocated to the key frame group. + int64_t max_grp_bits; + + if (oxcf->rc_cfg.vbr_corpus_complexity_lap) { + kf_group_avg_error = + get_kf_group_avg_error(twopass, &cpi->twopass_frame, &first_frame, + start_position, rc->frames_to_key); + } + + // Default allocation based on bits left and relative + // complexity of the section. + twopass->kf_group_bits = + get_kf_group_bits(cpi, kf_group_err, kf_group_avg_error); + // Clip based on maximum per frame rate defined by the user. + max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; + if (twopass->kf_group_bits > max_grp_bits) + twopass->kf_group_bits = max_grp_bits; + } else { + twopass->kf_group_bits = 0; + } + twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); + + if (cpi->ppi->lap_enabled) { + // In the case of single pass based on LAP, frames to key may have an + // inaccurate value, and hence should be clipped to an appropriate + // interval. + frames_to_key_clipped = + (int)(MAX_KF_BITS_INTERVAL_SINGLE_PASS * cpi->framerate); + + // This variable calculates the bits allocated to kf_group with a clipped + // frames_to_key. + if (rc->frames_to_key > frames_to_key_clipped) { + kf_group_bits_clipped = + (int64_t)((double)twopass->kf_group_bits * frames_to_key_clipped / + rc->frames_to_key); + } + } + + // Reset the first pass file position. + reset_fpf_position(&cpi->twopass_frame, start_position); + + // Scan through the kf group collating various stats used to determine + // how many bits to spend on it. + boost_score = get_kf_boost_score(cpi, kf_raw_err, &zero_motion_accumulator, + &sr_accumulator, 0); + reset_fpf_position(&cpi->twopass_frame, start_position); + // Store the zero motion percentage + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); + + // Calculate a section intra ratio used in setting max loop filter. 
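+ // Note on the LAP clipping above (hypothetical numbers): with
+ // MAX_KF_BITS_INTERVAL_SINGLE_PASS at 5 seconds and a 30 fps source,
+ // frames_to_key_clipped is 150, so if rc->frames_to_key were 300 the
+ // clipped budget kf_group_bits_clipped would be half of
+ // twopass->kf_group_bits.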
+ twopass->section_intra_rating = calculate_section_intra_ratio( + start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key); + + p_rc->kf_boost = (int)boost_score; + + if (cpi->ppi->lap_enabled) { + if (oxcf->rc_cfg.mode == AOM_Q) { + p_rc->kf_boost = get_projected_kf_boost(cpi); + } else { + // TODO(any): Explore using average frame stats for AOM_Q as well. + boost_score = get_kf_boost_score( + cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1); + reset_fpf_position(&cpi->twopass_frame, start_position); + p_rc->kf_boost += (int)boost_score; + } + } + + // Special case for static / slide show content but don't apply + // if the kf group is very short. + if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) && + (rc->frames_to_key > 8)) { + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST); + } else { + // Apply various clamps for min and max boost + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3)); + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST); +#ifdef STRICT_RC + p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST); +#endif + } + + // Work out how many bits to allocate for the key frame itself. + // In case of LAP enabled for VBR, if the frames_to_key value is + // very high, we calculate the bits based on a clipped value of + // frames_to_key. + kf_bits = calculate_boost_bits( + AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost, + AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped)); + // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", + // p_rc->kf_boost, + // kf_bits, twopass->kf_zeromotion_pct); + kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits, + twopass->kf_group_bits, 0); + + twopass->kf_group_bits -= kf_bits; + + // Save the bits to spend on the key frame. + gf_group->bit_allocation[0] = kf_bits; + gf_group->update_type[0] = KF_UPDATE; + + // Note the total error score of the kf group minus the key frame itself. + if (cpi->ppi->lap_enabled) + // As we don't have enough stats to know the actual error of the group, + // we assume the complexity of each frame to be equal to 1, and set the + // error as the number of frames in the group(minus the keyframe). + twopass->kf_group_error_left = (double)(rc->frames_to_key - 1); + else + twopass->kf_group_error_left = kf_group_err - kf_mod_err; + + // Adjust the count of total modified error left. + // The count of bits left is adjusted elsewhere based on real coded frame + // sizes. 
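+ // For example (hypothetical numbers): if the remaining clip had a modified
+ // error budget of 1000.0 and this KF group accounts for 250.0 of it, the
+ // subtraction below leaves 750.0, which in turn scales the share of
+ // twopass->bits_left that later groups receive in get_kf_group_bits().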
+ twopass->modified_error_left -= kf_group_err; +} + +#define ARF_STATS_OUTPUT 0 +#if ARF_STATS_OUTPUT +unsigned int arf_count = 0; +#endif + +static int get_section_target_bandwidth(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + int section_target_bandwidth; + const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count - + current_frame->frame_number); + if (cpi->ppi->lap_enabled) + section_target_bandwidth = (int)rc->avg_frame_bandwidth; + else + section_target_bandwidth = (int)(twopass->bits_left / frames_left); + return section_target_bandwidth; +} + +static INLINE void set_twopass_params_based_on_fp_stats( + AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) { + if (this_frame_ptr == NULL) return; + + TWO_PASS_FRAME *twopass_frame = &cpi->twopass_frame; + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass_frame->mb_av_energy = log1p(this_frame_ptr->intra_error); + + const FIRSTPASS_STATS *const total_stats = + cpi->ppi->twopass.stats_buf_ctx->total_stats; + if (is_fp_wavelet_energy_invalid(total_stats) == 0) { + twopass_frame->frame_avg_haar_energy = + log1p(this_frame_ptr->frame_avg_wavelet_energy); + } + + // Set the frame content type flag. + if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH) + twopass_frame->fr_content_type = FC_GRAPHICS_ANIMATION; + else + twopass_frame->fr_content_type = FC_NORMAL; +} + +static void process_first_pass_stats(AV1_COMP *cpi, + FIRSTPASS_STATS *this_frame) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats; + + if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 && + cpi->gf_frame_index == 0 && total_stats && + twopass->stats_buf_ctx->total_left_stats) { + if (cpi->ppi->lap_enabled) { + /* + * Accumulate total_stats using available limited number of stats, + * and assign it to total_left_stats. + */ + *twopass->stats_buf_ctx->total_left_stats = *total_stats; + } + // Special case code for first frame. 
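+ // The block below seeds the rate control state before any frame has been
+ // coded: it derives per-frame averages of the coded error, intra-skip and
+ // inactive-zone statistics for the remaining section, asks
+ // get_twopass_worst_quality() for a starting worst-case Q, and copies that
+ // Q into the last_q / avg_frame_qindex history.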
+ const int section_target_bandwidth = get_section_target_bandwidth(cpi); + const double section_length = + twopass->stats_buf_ctx->total_left_stats->count; + const double section_error = + twopass->stats_buf_ctx->total_left_stats->coded_error / section_length; + const double section_intra_skip = + twopass->stats_buf_ctx->total_left_stats->intra_skip_pct / + section_length; + const double section_inactive_zone = + (twopass->stats_buf_ctx->total_left_stats->inactive_zone_rows * 2) / + ((double)cm->mi_params.mb_rows * section_length); + const int tmp_q = get_twopass_worst_quality( + cpi, section_error, section_intra_skip + section_inactive_zone, + section_target_bandwidth); + + rc->active_worst_quality = tmp_q; + rc->ni_av_qi = tmp_q; + p_rc->last_q[INTER_FRAME] = tmp_q; + p_rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth); + p_rc->avg_frame_qindex[INTER_FRAME] = tmp_q; + p_rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2; + p_rc->avg_frame_qindex[KEY_FRAME] = p_rc->last_q[KEY_FRAME]; + } + + if (cpi->twopass_frame.stats_in < twopass->stats_buf_ctx->stats_in_end) { + *this_frame = *cpi->twopass_frame.stats_in; + ++cpi->twopass_frame.stats_in; + } + set_twopass_params_based_on_fp_stats(cpi, this_frame); +} + +static void setup_target_rate(AV1_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + + int target_rate = gf_group->bit_allocation[cpi->gf_frame_index]; + + if (has_no_stats_stage(cpi)) { + av1_rc_set_frame_target(cpi, target_rate, cpi->common.width, + cpi->common.height); + } + + rc->base_frame_target = target_rate; +} + +void av1_mark_flashes(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *this_stats = first_stats, *next_stats; + while (this_stats < last_stats - 1) { + next_stats = this_stats + 1; + if (next_stats->pcnt_second_ref > next_stats->pcnt_inter && + next_stats->pcnt_second_ref >= 0.5) { + this_stats->is_flash = 1; + } else { + this_stats->is_flash = 0; + } + this_stats = next_stats; + } + // We always treat the last one as none flash. + if (last_stats - 1 >= first_stats) { + (last_stats - 1)->is_flash = 0; + } +} + +// Smooth-out the noise variance so it is more stable +// Returns 0 on success, -1 on memory allocation failure. 
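+// For illustration: each frame's noise_var is replaced by the mean of the
+// noise_var values within HALF_FILT_LEN frames on either side, skipping
+// frames marked as flashes; if every frame in the window is a flash, the
+// original value is kept.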
+// TODO(bohanli): Use a better low-pass filter than averaging +static int smooth_filter_noise(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + int len = (int)(last_stats - first_stats); + double *smooth_noise = aom_malloc(len * sizeof(*smooth_noise)); + if (!smooth_noise) return -1; + + for (int i = 0; i < len; i++) { + double total_noise = 0; + double total_wt = 0; + for (int j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { + int idx = AOMMIN(AOMMAX(i + j, 0), len - 1); + if (first_stats[idx].is_flash) continue; + + total_noise += first_stats[idx].noise_var; + total_wt += 1.0; + } + if (total_wt > 0.01) { + total_noise /= total_wt; + } else { + total_noise = first_stats[i].noise_var; + } + smooth_noise[i] = total_noise; + } + + for (int i = 0; i < len; i++) { + first_stats[i].noise_var = smooth_noise[i]; + } + + aom_free(smooth_noise); + return 0; +} + +// Estimate the noise variance of each frame from the first pass stats +void av1_estimate_noise(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats, + struct aom_internal_error_info *error_info) { + FIRSTPASS_STATS *this_stats, *next_stats; + double C1, C2, C3, noise; + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + this_stats->noise_var = 0.0; + // flashes tend to have high correlation of innovations, so ignore them. + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) + continue; + + C1 = (this_stats - 1)->intra_error * + (this_stats->intra_error - this_stats->coded_error); + C2 = (this_stats - 2)->intra_error * + ((this_stats - 1)->intra_error - (this_stats - 1)->coded_error); + C3 = (this_stats - 2)->intra_error * + (this_stats->intra_error - this_stats->sr_coded_error); + if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue; + C1 = sqrt(C1); + C2 = sqrt(C2); + C3 = sqrt(C3); + + noise = (this_stats - 1)->intra_error - C1 * C2 / C3; + noise = AOMMAX(noise, 0.01); + this_stats->noise_var = noise; + } + + // Copy noise from the neighbor if the noise value is not trustworthy + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) + continue; + if (this_stats->noise_var < 1.0) { + int found = 0; + // TODO(bohanli): consider expanding to two directions at the same time + for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) + continue; + found = 1; + this_stats->noise_var = next_stats->noise_var; + break; + } + if (found) continue; + for (next_stats = this_stats - 1; next_stats >= first_stats + 2; + next_stats--) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) + continue; + this_stats->noise_var = next_stats->noise_var; + break; + } + } + } + + // copy the noise if this is a flash + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) { + int found = 0; + for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash) + continue; + found = 1; + this_stats->noise_var = next_stats->noise_var; + break; + } + if (found) continue; + for (next_stats = this_stats - 1; next_stats >= first_stats + 2; + next_stats--) { + if 
(next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash) + continue; + this_stats->noise_var = next_stats->noise_var; + break; + } + } + } + + // if we are at the first 2 frames, copy the noise + for (this_stats = first_stats; + this_stats < first_stats + 2 && (first_stats + 2) < last_stats; + this_stats++) { + this_stats->noise_var = (first_stats + 2)->noise_var; + } + + if (smooth_filter_noise(first_stats, last_stats) == -1) { + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Error allocating buffers in smooth_filter_noise()"); + } +} + +// Estimate correlation coefficient of each frame with its previous frame. +void av1_estimate_coeff(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *this_stats; + for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) { + const double C = + sqrt(AOMMAX((this_stats - 1)->intra_error * + (this_stats->intra_error - this_stats->coded_error), + 0.001)); + const double cor_coeff = + C / + AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001); + + this_stats->cor_coeff = + cor_coeff * + sqrt(AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, + 0.001) / + AOMMAX(this_stats->intra_error - this_stats->noise_var, 0.001)); + // clip correlation coefficient. + this_stats->cor_coeff = AOMMIN(AOMMAX(this_stats->cor_coeff, 0), 1); + } + first_stats->cor_coeff = 1.0; +} + +void av1_get_second_pass_params(AV1_COMP *cpi, + EncodeFrameParams *const frame_params, + unsigned int frame_flags) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + if (cpi->use_ducky_encode && + cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) { + frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; + frame_params->show_frame = + !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE); + if (cpi->gf_frame_index == 0) { + av1_tf_info_reset(&cpi->ppi->tf_info); + av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group); + } + return; + } + + const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in; + int update_total_stats = 0; + + if (is_stat_consumption_stage(cpi) && !cpi->twopass_frame.stats_in) return; + + // Check forced key frames. + const int frames_to_next_forced_key = detect_app_forced_key(cpi); + if (frames_to_next_forced_key == 0) { + rc->frames_to_key = 0; + frame_flags &= FRAMEFLAGS_KEY; + } else if (frames_to_next_forced_key > 0 && + frames_to_next_forced_key < rc->frames_to_key) { + rc->frames_to_key = frames_to_next_forced_key; + } + + assert(cpi->twopass_frame.stats_in != NULL); + const int update_type = gf_group->update_type[cpi->gf_frame_index]; + frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; + + if (cpi->gf_frame_index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) { + assert(cpi->gf_frame_index < gf_group->size); + + setup_target_rate(cpi); + + // If this is an arf frame then we dont want to read the stats file or + // advance the input pointer as we already have what we need. 
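+ // Consistent with the comment above, for an ARF the stats entry of its
+ // distant source frame (arf_src_offset positions ahead) is looked up via
+ // read_frame_stats() below without advancing the stats input pointer.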
+ if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) { + const FIRSTPASS_STATS *const this_frame_ptr = + read_frame_stats(twopass, &cpi->twopass_frame, + gf_group->arf_src_offset[cpi->gf_frame_index]); + set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr); + return; + } + } + + if (oxcf->rc_cfg.mode == AOM_Q) + rc->active_worst_quality = oxcf->rc_cfg.cq_level; + + if (cpi->gf_frame_index == gf_group->size) { + if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) { + const int num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1; + const int frames_to_key = define_kf_interval( + cpi, &twopass->firstpass_info, num_frames_to_detect_scenecut, + /*search_start_idx=*/0); + if (frames_to_key != -1) + rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key); + } + } + + FIRSTPASS_STATS this_frame; + av1_zero(this_frame); + // call above fn + if (is_stat_consumption_stage(cpi)) { + if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) { + process_first_pass_stats(cpi, &this_frame); + update_total_stats = 1; + } + } else { + rc->active_worst_quality = oxcf->rc_cfg.cq_level; + } + + // Keyframe and section processing. + FIRSTPASS_STATS this_frame_copy; + this_frame_copy = this_frame; + if (rc->frames_to_key <= 0) { + assert(rc->frames_to_key == 0); + // Define next KF group and assign bits to it. + frame_params->frame_type = KEY_FRAME; + find_next_key_frame(cpi, &this_frame); + this_frame = this_frame_copy; + } + + if (rc->frames_to_fwd_kf <= 0) + rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist; + + // Define a new GF/ARF group. (Should always enter here for key frames). + if (cpi->gf_frame_index == gf_group->size) { + av1_tf_info_reset(&cpi->ppi->tf_info); +#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS + vbr_rc_reset_gop_data(&cpi->vbr_rc_info); +#endif // CONFIG_BITRATE_ACCURACY + int max_gop_length = + (oxcf->gf_cfg.lag_in_frames >= 32) + ? AOMMIN(MAX_GF_INTERVAL, oxcf->gf_cfg.lag_in_frames - + oxcf->algo_cfg.arnr_max_frames / 2) + : MAX_GF_LENGTH_LAP; + + // Handle forward key frame when enabled. + if (oxcf->kf_cfg.fwd_kf_dist > 0) + max_gop_length = AOMMIN(rc->frames_to_fwd_kf + 1, max_gop_length); + + // Use the provided gop size in low delay setting + if (oxcf->gf_cfg.lag_in_frames == 0) max_gop_length = rc->max_gf_interval; + + // Limit the max gop length for the last gop in 1 pass setting. + max_gop_length = AOMMIN(max_gop_length, rc->frames_to_key); + + // Identify regions if needed. + // TODO(bohanli): identify regions for all stats available. 
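+    // The block below decides how many upcoming first-pass stats frames to
+    // analyze (bounded by rc->frames_to_key, MAX_FIRSTPASS_ANALYSIS_FRAMES
+    // and the stats remaining in the buffer), refreshes the flash / noise /
+    // correlation estimates when lookahead processing is enabled, and then
+    // calls identify_regions() so that the GF group length decisions further
+    // down can take scene cuts into account.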
+ if (rc->frames_since_key == 0 || rc->frames_since_key == 1 || + (p_rc->frames_till_regions_update - rc->frames_since_key < + rc->frames_to_key && + p_rc->frames_till_regions_update - rc->frames_since_key < + max_gop_length + 1)) { + // how many frames we can analyze from this frame + int rest_frames = + AOMMIN(rc->frames_to_key, MAX_FIRSTPASS_ANALYSIS_FRAMES); + rest_frames = + AOMMIN(rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end - + cpi->twopass_frame.stats_in + + (rc->frames_since_key == 0))); + p_rc->frames_till_regions_update = rest_frames; + + int ret; + if (cpi->ppi->lap_enabled) { + av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end, + cpi->common.error); + av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames, + (rc->frames_since_key == 0), p_rc->regions, + &p_rc->num_regions); + } else { + ret = identify_regions( + cpi->twopass_frame.stats_in - (rc->frames_since_key == 0), + rest_frames, 0, p_rc->regions, &p_rc->num_regions); + } + if (ret == -1) { + aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, + "Error allocating buffers in identify_regions"); + } + } + + int cur_region_idx = + find_regions_index(p_rc->regions, p_rc->num_regions, + rc->frames_since_key - p_rc->regions_offset); + if ((cur_region_idx >= 0 && + p_rc->regions[cur_region_idx].type == SCENECUT_REGION) || + rc->frames_since_key == 0) { + // If we start from a scenecut, then the last GOP's arf boost is not + // needed for this GOP. + cpi->ppi->gf_state.arf_gf_boost_lst = 0; + } + + int need_gf_len = 1; + if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) { + // set up bitstream to read + if (!cpi->third_pass_ctx->input_file_name && oxcf->two_pass_output) { + cpi->third_pass_ctx->input_file_name = oxcf->two_pass_output; + } + av1_open_second_pass_log(cpi, 1); + THIRD_PASS_GOP_INFO *gop_info = &cpi->third_pass_ctx->gop_info; + // Read in GOP information from the second pass file. + av1_read_second_pass_gop_info(cpi->second_pass_log_stream, gop_info, + cpi->common.error); +#if CONFIG_BITRATE_ACCURACY + TPL_INFO *tpl_info; + AOM_CHECK_MEM_ERROR(cpi->common.error, tpl_info, + aom_malloc(sizeof(*tpl_info))); + av1_read_tpl_info(tpl_info, cpi->second_pass_log_stream, + cpi->common.error); + aom_free(tpl_info); +#if CONFIG_THREE_PASS + // TODO(angiebird): Put this part into a func + cpi->vbr_rc_info.cur_gop_idx++; +#endif // CONFIG_THREE_PASS +#endif // CONFIG_BITRATE_ACCURACY + // Read in third_pass_info from the bitstream. + av1_set_gop_third_pass(cpi->third_pass_ctx); + // Read in per-frame info from second-pass encoding + av1_read_second_pass_per_frame_info( + cpi->second_pass_log_stream, cpi->third_pass_ctx->frame_info, + gop_info->num_frames, cpi->common.error); + + p_rc->cur_gf_index = 0; + p_rc->gf_intervals[0] = cpi->third_pass_ctx->gop_info.gf_length; + need_gf_len = 0; + } + + if (need_gf_len) { + // If we cannot obtain GF group length from second_pass_file + // TODO(jingning): Resolve the redundant calls here. 
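+      // The fallback below first computes candidate GF interval lengths with
+      // calculate_gf_length() using the larger max_gop_length. For long-lag,
+      // TPL-enabled configurations it then trial-defines the GF group and
+      // runs temporal filtering; if a shorter interval turns out better, the
+      // lengths are recomputed with max_gop_length capped at 16.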
+ if (rc->intervals_till_gf_calculate_due == 0 || 1) { + calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS); + } + + if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model && + oxcf->gf_cfg.lag_in_frames >= 32 && + cpi->sf.tpl_sf.gop_length_decision_method != 3) { + int this_idx = rc->frames_since_key + + p_rc->gf_intervals[p_rc->cur_gf_index] - + p_rc->regions_offset - 1; + int this_region = + find_regions_index(p_rc->regions, p_rc->num_regions, this_idx); + int next_region = + find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1); + // TODO(angiebird): Figure out why this_region and next_region are -1 in + // unit test like AltRefFramePresenceTestLarge (aomedia:3134) + int is_last_scenecut = + p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key || + (this_region != -1 && + p_rc->regions[this_region].type == SCENECUT_REGION) || + (next_region != -1 && + p_rc->regions[next_region].type == SCENECUT_REGION); + + int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index]; + + if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 && + rc->min_gf_interval <= 16) { + // The calculate_gf_length function is previously used with + // max_gop_length = 32 with look-ahead gf intervals. + define_gf_group(cpi, frame_params, 0); + av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group); + this_frame = this_frame_copy; + + if (is_shorter_gf_interval_better(cpi, frame_params)) { + // A shorter gf interval is better. + // TODO(jingning): Remove redundant computations here. + max_gop_length = 16; + calculate_gf_length(cpi, max_gop_length, 1); + if (is_last_scenecut && + (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) { + p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int; + } + } + } + } + } + + define_gf_group(cpi, frame_params, 0); + + if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE && + rc->frames_since_key > 0) + process_first_pass_stats(cpi, &this_frame); + + define_gf_group(cpi, frame_params, 1); + + // write gop info if needed for third pass. Per-frame info is written after + // each frame is encoded. + av1_write_second_pass_gop_info(cpi); + + av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group); + + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; + assert(cpi->gf_frame_index == 0); +#if ARF_STATS_OUTPUT + { + FILE *fpfile; + fpfile = fopen("arf.stt", "a"); + ++arf_count; + fprintf(fpfile, "%10d %10d %10d %10d %10d\n", + cpi->common.current_frame.frame_number, + rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count, + p_rc->gfu_boost); + + fclose(fpfile); + } +#endif + } + assert(cpi->gf_frame_index < gf_group->size); + + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) { + reset_fpf_position(&cpi->twopass_frame, start_pos); + + const FIRSTPASS_STATS *const this_frame_ptr = + read_frame_stats(twopass, &cpi->twopass_frame, + gf_group->arf_src_offset[cpi->gf_frame_index]); + set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr); + } else { + // Back up this frame's stats for updating total stats during post encode. + cpi->twopass_frame.this_frame = update_total_stats ? 
start_pos : NULL; + } + + frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; + setup_target_rate(cpi); +} + +void av1_init_second_pass(AV1_COMP *cpi) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FRAME_INFO *const frame_info = &cpi->frame_info; + double frame_rate; + FIRSTPASS_STATS *stats; + + if (!twopass->stats_buf_ctx->stats_in_end) return; + + av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end, cpi->common.error); + av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + + stats = twopass->stats_buf_ctx->total_stats; + + *stats = *twopass->stats_buf_ctx->stats_in_end; + *twopass->stats_buf_ctx->total_left_stats = *stats; + + frame_rate = 10000000.0 * stats->count / stats->duration; + // Each frame can have a different duration, as the frame rate in the source + // isn't guaranteed to be constant. The frame rate prior to the first frame + // encoded in the second pass is a guess. However, the sum duration is not. + // It is calculated based on the actual durations of all frames from the + // first pass. + av1_new_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0); + +#if CONFIG_BITRATE_ACCURACY + av1_vbr_rc_init(&cpi->vbr_rc_info, twopass->bits_left, + (int)round(stats->count)); +#endif + +#if CONFIG_RATECTRL_LOG + rc_log_init(&cpi->rc_log); +#endif + + // This variable monitors how far behind the second ref update is lagging. + twopass->sr_update_lag = 1; + + // Scan the first pass file and calculate a modified total error based upon + // the bias/power function used to allocate bits. + { + const double avg_error = + stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); + const FIRSTPASS_STATS *s = cpi->twopass_frame.stats_in; + double modified_error_total = 0.0; + twopass->modified_error_min = + (avg_error * oxcf->rc_cfg.vbrmin_section) / 100; + twopass->modified_error_max = + (avg_error * oxcf->rc_cfg.vbrmax_section) / 100; + while (s < twopass->stats_buf_ctx->stats_in_end) { + modified_error_total += + calculate_modified_err(frame_info, twopass, oxcf, s); + ++s; + } + twopass->modified_error_left = modified_error_total; + } + + // Reset the vbr bits off target counters + cpi->ppi->p_rc.vbr_bits_off_target = 0; + cpi->ppi->p_rc.vbr_bits_off_target_fast = 0; + + cpi->ppi->p_rc.rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initialize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; +} + +void av1_init_single_pass_lap(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->ppi->twopass; + + if (!twopass->stats_buf_ctx->stats_in_end) return; + + // This variable monitors how far behind the second ref update is lagging. 
+ twopass->sr_update_lag = 1; + + twopass->bits_left = 0; + twopass->modified_error_min = 0.0; + twopass->modified_error_max = 0.0; + twopass->modified_error_left = 0.0; + + // Reset the vbr bits off target counters + cpi->ppi->p_rc.vbr_bits_off_target = 0; + cpi->ppi->p_rc.vbr_bits_off_target_fast = 0; + + cpi->ppi->p_rc.rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initialize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; +} + +#define MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +void av1_twopass_postencode_update(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->ppi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + + // Increment the stats_in pointer. + if (is_stat_consumption_stage(cpi) && + !(cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode == + DUCKY_ENCODE_GOP_MODE_RCL) && + (cpi->gf_frame_index < cpi->ppi->gf_group.size || + rc->frames_to_key == 0)) { + const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; + if (update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE) { + FIRSTPASS_STATS this_frame; + assert(cpi->twopass_frame.stats_in > + twopass->stats_buf_ctx->stats_in_start); + --cpi->twopass_frame.stats_in; + if (cpi->ppi->lap_enabled) { + input_stats_lap(twopass, &cpi->twopass_frame, &this_frame); + } else { + input_stats(twopass, &cpi->twopass_frame, &this_frame); + } + } else if (cpi->ppi->lap_enabled) { + cpi->twopass_frame.stats_in = twopass->stats_buf_ctx->stats_in_start; + } + } + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. + p_rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; + twopass->bits_left = AOMMAX(twopass->bits_left - rc->base_frame_target, 0); + + if (cpi->do_update_vbr_bits_off_target_fast) { + // Subtract current frame's fast_extra_bits. + p_rc->vbr_bits_off_target_fast -= rc->frame_level_fast_extra_bits; + rc->frame_level_fast_extra_bits = 0; + } + + // Target vs actual bits for this arf group. + twopass->rolling_arf_group_target_bits += rc->base_frame_target; + twopass->rolling_arf_group_actual_bits += rc->projected_frame_size; + + // Calculate the pct rc error. + if (p_rc->total_actual_bits) { + p_rc->rate_error_estimate = + (int)((p_rc->vbr_bits_off_target * 100) / p_rc->total_actual_bits); + p_rc->rate_error_estimate = clamp(p_rc->rate_error_estimate, -100, 100); + } else { + p_rc->rate_error_estimate = 0; + } + +#if CONFIG_FPMT_TEST + /* The variables temp_vbr_bits_off_target, temp_bits_left, + * temp_rolling_arf_group_target_bits, temp_rolling_arf_group_actual_bits + * temp_rate_error_estimate are introduced for quality simulation purpose, + * it retains the value previous to the parallel encode frames. The + * variables are updated based on the update flag. 
+ * + * If there exist show_existing_frames between parallel frames, then to + * retain the temp state do not update it. */ + const int simulate_parallel_frame = + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + int show_existing_between_parallel_frames = + (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == + INTNL_OVERLAY_UPDATE && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); + + if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && + simulate_parallel_frame) { + cpi->ppi->p_rc.temp_vbr_bits_off_target = p_rc->vbr_bits_off_target; + cpi->ppi->p_rc.temp_bits_left = twopass->bits_left; + cpi->ppi->p_rc.temp_rolling_arf_group_target_bits = + twopass->rolling_arf_group_target_bits; + cpi->ppi->p_rc.temp_rolling_arf_group_actual_bits = + twopass->rolling_arf_group_actual_bits; + cpi->ppi->p_rc.temp_rate_error_estimate = p_rc->rate_error_estimate; + } +#endif + // Update the active best quality pyramid. + if (!rc->is_src_frame_alt_ref) { + const int pyramid_level = + cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]; + int i; + for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) { + p_rc->active_best_quality[i] = cpi->common.quant_params.base_qindex; +#if CONFIG_TUNE_VMAF + if (cpi->vmaf_info.original_qindex != -1 && + (cpi->oxcf.tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + cpi->oxcf.tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) { + p_rc->active_best_quality[i] = cpi->vmaf_info.original_qindex; + } +#endif + } + } + +#if 0 + { + AV1_COMMON *cm = &cpi->common; + FILE *fpfile; + fpfile = fopen("details.stt", "a"); + fprintf(fpfile, + "%10d %10d %10d %10" PRId64 " %10" PRId64 + " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n", + cm->current_frame.frame_number, rc->base_frame_target, + rc->projected_frame_size, rc->total_actual_bits, + rc->vbr_bits_off_target, p_rc->rate_error_estimate, + twopass->rolling_arf_group_target_bits, + twopass->rolling_arf_group_actual_bits, + (double)twopass->rolling_arf_group_actual_bits / + (double)twopass->rolling_arf_group_target_bits, + twopass->bpm_factor, + av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex, + cm->seq_params->bit_depth), + av1_convert_qindex_to_q(rc->active_worst_quality, + cm->seq_params->bit_depth)); + fclose(fpfile); + } +#endif + + if (cpi->common.current_frame.frame_type != KEY_FRAME) { + twopass->kf_group_bits -= rc->base_frame_target; + twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; + } + twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0); + + // If the rate control is drifting consider adjustment to min or maxq. + if ((rc_cfg->mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) { + int minq_adj_limit; + int maxq_adj_limit; + minq_adj_limit = + (rc_cfg->mode == AOM_CQ ? 
MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT); + maxq_adj_limit = rc->worst_quality - rc->active_worst_quality; + + // Undershoot + if ((rc_cfg->under_shoot_pct < 100) && + (p_rc->rolling_actual_bits < p_rc->rolling_target_bits)) { + int pct_error = + ((p_rc->rolling_target_bits - p_rc->rolling_actual_bits) * 100) / + p_rc->rolling_target_bits; + + if ((pct_error >= rc_cfg->under_shoot_pct) && + (p_rc->rate_error_estimate > 0)) { + twopass->extend_minq += 1; + } + twopass->extend_maxq -= 1; + // Overshoot + } else if ((rc_cfg->over_shoot_pct < 100) && + (p_rc->rolling_actual_bits > p_rc->rolling_target_bits)) { + int pct_error = + ((p_rc->rolling_actual_bits - p_rc->rolling_target_bits) * 100) / + p_rc->rolling_target_bits; + + pct_error = clamp(pct_error, 0, 100); + if ((pct_error >= rc_cfg->over_shoot_pct) && + (p_rc->rate_error_estimate < 0)) { + twopass->extend_maxq += 1; + } + twopass->extend_minq -= 1; + } else { + // Adjustment for extreme local overshoot. + // Only applies when normal adjustment above is not used (e.g. + // when threshold is set to 100). + if (rc->projected_frame_size > (2 * rc->base_frame_target) && + rc->projected_frame_size > (2 * rc->avg_frame_bandwidth)) + ++twopass->extend_maxq; + // Unwind extreme overshoot adjustment. + else if (p_rc->rolling_target_bits > p_rc->rolling_actual_bits) + --twopass->extend_maxq; + } + twopass->extend_minq = + clamp(twopass->extend_minq, -minq_adj_limit, minq_adj_limit); + twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit); + + // If there is a big and unexpected undershoot then feed the extra + // bits back in quickly. One situation where this may happen is if a + // frame is unexpectedly almost perfectly predicted by the ARF or GF + // but not very well predicted by the previous frame. + if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) { + int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO; + if (rc->projected_frame_size < fast_extra_thresh) { + p_rc->vbr_bits_off_target_fast += + fast_extra_thresh - rc->projected_frame_size; + p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast, + (4 * rc->avg_frame_bandwidth)); + } + } + +#if CONFIG_FPMT_TEST + if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && + simulate_parallel_frame) { + cpi->ppi->p_rc.temp_vbr_bits_off_target_fast = + p_rc->vbr_bits_off_target_fast; + cpi->ppi->p_rc.temp_extend_minq = twopass->extend_minq; + cpi->ppi->p_rc.temp_extend_maxq = twopass->extend_maxq; + } +#endif + } + + // Update the frame probabilities obtained from parallel encode frames + FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs; +#if CONFIG_FPMT_TEST + /* The variable temp_active_best_quality is introduced only for quality + * simulation purpose, it retains the value previous to the parallel + * encode frames. The variable is updated based on the update flag. + * + * If there exist show_existing_frames between parallel frames, then to + * retain the temp state do not update it. */ + if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && + simulate_parallel_frame) { + int i; + const int pyramid_level = + cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]; + if (!rc->is_src_frame_alt_ref) { + for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) + cpi->ppi->p_rc.temp_active_best_quality[i] = + p_rc->active_best_quality[i]; + } + } + + // Update the frame probabilities obtained from parallel encode frames + FrameProbInfo *const temp_frame_probs_simulation = + simulate_parallel_frame ?
&cpi->ppi->temp_frame_probs_simulation + : frame_probs; + FrameProbInfo *const temp_frame_probs = + simulate_parallel_frame ? &cpi->ppi->temp_frame_probs : NULL; +#endif + int i, j, loop; + // Sequentially do average on temp_frame_probs_simulation which holds + // probabilities of last frame before parallel encode + for (loop = 0; loop <= cpi->num_frame_recode; loop++) { + // Sequentially update tx_type_probs + if (cpi->do_update_frame_probs_txtype[loop] && + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + for (i = 0; i < TX_SIZES_ALL; i++) { + int left = 1024; + + for (j = TX_TYPES - 1; j >= 0; j--) { + const int new_prob = + cpi->frame_new_probs[loop].tx_type_probs[update_type][i][j]; +#if CONFIG_FPMT_TEST + int prob = + (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = prob; +#else + int prob = + (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->tx_type_probs[update_type][i][j] = prob; +#endif + } + } + } + + // Sequentially update obmc_probs + if (cpi->do_update_frame_probs_obmc[loop] && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + const int new_prob = + cpi->frame_new_probs[loop].obmc_probs[update_type][i]; +#if CONFIG_FPMT_TEST + temp_frame_probs_simulation->obmc_probs[update_type][i] = + (temp_frame_probs_simulation->obmc_probs[update_type][i] + + new_prob) >> + 1; +#else + frame_probs->obmc_probs[update_type][i] = + (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; +#endif + } + } + + // Sequentially update warped_probs + if (cpi->do_update_frame_probs_warp[loop] && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + const int new_prob = cpi->frame_new_probs[loop].warped_probs[update_type]; +#if CONFIG_FPMT_TEST + temp_frame_probs_simulation->warped_probs[update_type] = + (temp_frame_probs_simulation->warped_probs[update_type] + new_prob) >> + 1; +#else + frame_probs->warped_probs[update_type] = + (frame_probs->warped_probs[update_type] + new_prob) >> 1; +#endif + } + + // Sequentially update switchable_interp_probs + if (cpi->do_update_frame_probs_interpfilter[loop] && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + int left = 1536; + + for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) { + const int new_prob = cpi->frame_new_probs[loop] + .switchable_interp_probs[update_type][i][j]; +#if CONFIG_FPMT_TEST + int prob = (temp_frame_probs_simulation + ->switchable_interp_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + + temp_frame_probs_simulation + ->switchable_interp_probs[update_type][i][j] = prob; +#else + int prob = (frame_probs->switchable_interp_probs[update_type][i][j] + + new_prob) >> + 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->switchable_interp_probs[update_type][i][j] = prob; 
+#endif + } + } + } + } + +#if CONFIG_FPMT_TEST + // Copying temp_frame_probs_simulation to temp_frame_probs based on + // the flag + if (cpi->do_frame_data_update && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + simulate_parallel_frame) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + temp_frame_probs->obmc_probs[update_type_idx][i] = + temp_frame_probs_simulation->obmc_probs[update_type_idx][i]; + } + temp_frame_probs->warped_probs[update_type_idx] = + temp_frame_probs_simulation->warped_probs[update_type_idx]; + for (i = 0; i < TX_SIZES_ALL; i++) { + for (j = 0; j < TX_TYPES; j++) { + temp_frame_probs->tx_type_probs[update_type_idx][i][j] = + temp_frame_probs_simulation->tx_type_probs[update_type_idx][i][j]; + } + } + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + for (j = 0; j < SWITCHABLE_FILTERS; j++) { + temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] = + temp_frame_probs_simulation + ->switchable_interp_probs[update_type_idx][i][j]; + } + } + } + } +#endif + // Update framerate obtained from parallel encode frames + if (cpi->common.show_frame && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + cpi->framerate = cpi->new_framerate; +#if CONFIG_FPMT_TEST + // SIMULATION PURPOSE + int show_existing_between_parallel_frames_cndn = + (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == + INTNL_OVERLAY_UPDATE && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); + if (cpi->common.show_frame && !show_existing_between_parallel_frames_cndn && + cpi->do_frame_data_update && simulate_parallel_frame) + cpi->temp_framerate = cpi->framerate; +#endif +} diff --git a/third_party/aom/av1/encoder/pass2_strategy.h b/third_party/aom/av1/encoder/pass2_strategy.h new file mode 100644 index 0000000000..5987a78a23 --- /dev/null +++ b/third_party/aom/av1/encoder/pass2_strategy.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_ +#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct EncodeFrameParams; + +#include "av1/encoder/encoder.h" + +/*! + * \brief accumulated stats and features in a gf group + */ +typedef struct { + /*!\cond */ + double gf_group_err; + double gf_group_raw_error; + double gf_group_skip_pct; + double gf_group_inactive_zone_rows; + + double mv_ratio_accumulator; + double decay_accumulator; + double zero_motion_accumulator; + double loop_decay_rate; + double last_loop_decay_rate; + double this_frame_mv_in_out; + double mv_in_out_accumulator; + double abs_mv_in_out_accumulator; + + double avg_sr_coded_error; + double avg_pcnt_second_ref; + double avg_new_mv_count; + double avg_wavelet_energy; + double avg_raw_err_stdev; + int non_zero_stdev_count; + /*!\endcond */ +} GF_GROUP_STATS; + +/*! 
+ * \brief accumulated stats and features for a frame + */ +typedef struct { + /*!\cond */ + double frame_err; + double frame_coded_error; + double frame_sr_coded_error; + /*!\endcond */ +} GF_FRAME_STATS; +/*!\cond */ + +void av1_init_second_pass(struct AV1_COMP *cpi); + +void av1_init_single_pass_lap(AV1_COMP *cpi); + +/*!\endcond */ +/*!\brief Main per frame entry point for second pass of two pass encode + * + *\ingroup rate_control + * + * This function is called for each frame in the second pass of a two pass + * encode. It checks the frame type and if a new KF or GF/ARF is due. + * When a KF is due it calls find_next_key_frame() to work out how long + * this key frame group will be and assign bits to the key frame. + * At the start of a new GF/ARF group it calls calculate_gf_length() + * and define_gf_group() which are the main functions responsible for + * defining the size and structure of the new GF/ARF group. + * + * \param[in] cpi Top - level encoder instance structure + * \param[in] frame_params Per frame encoding parameters + * \param[in] frame_flags Frame type and coding flags + * + * \remark No return but analyses first pass stats and assigns a target + * number of bits to the current frame and a target Q range. + */ +void av1_get_second_pass_params(struct AV1_COMP *cpi, + struct EncodeFrameParams *const frame_params, + unsigned int frame_flags); + +/*!\brief Adjustments to two pass and rate control after each frame. + * + *\ingroup rate_control + * + * This function is called after each frame to make adjustments to + * heuristics and data structures that relate to rate control. + * + * \param[in] cpi Top - level encoder instance structure + * + * \remark No return value but this function updates various rate control + * related data structures that for example track overshoot and + * undershoot. + */ +void av1_twopass_postencode_update(struct AV1_COMP *cpi); + +/*!\brief Distributes bits to frames in a group + * + *\ingroup rate_control + * + * This function decides on the allocation of bits between the different + * frames and types of frame in a GF/ARF group. + * + * \param[in] cpi Top - level encoder instance structure + * \param[in] rc Rate control data + * \param[in] gf_group GF/ARF group data structure + * \param[in] is_key_frame Indicates if the first frame in the group is + * also a key frame. + * \param[in] use_arf Are ARF frames enabled or is this a GF only + * uni-directional group. + * \param[in] gf_group_bits Bits available to be allocated. + * + * \remark No return but updates the rate control and group data structures + * to reflect the allocation of bits. 
+ */ +void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, + GF_GROUP *gf_group, int is_key_frame, int use_arf, + int64_t gf_group_bits); + +int av1_calc_arf_boost(const TWO_PASS *twopass, + const TWO_PASS_FRAME *twopass_frame, + const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info, + int offset, int f_frames, int b_frames, + int *num_fpstats_used, int *num_fpstats_required, + int project_gfu_boost); + +void av1_mark_flashes(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats); +void av1_estimate_noise(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats, + struct aom_internal_error_info *error_info); +void av1_estimate_coeff(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PASS2_STRATEGY_H_ diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c new file mode 100644 index 0000000000..232a2f9edb --- /dev/null +++ b/third_party/aom/av1/encoder/pickcdef.c @@ -0,0 +1,958 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <limits.h> +#include <math.h> +#include <string.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/reconinter.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/pickcdef.h" +#include "av1/encoder/mcomp.h" + +// Get primary and secondary filter strength for the given strength index and +// search method +static INLINE void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method, + int *pri_strength, + int *sec_strength, + int strength_idx) { + const int tot_sec_filter = + (pick_method == CDEF_FAST_SEARCH_LVL5) + ? REDUCED_SEC_STRENGTHS_LVL5 + : ((pick_method >= CDEF_FAST_SEARCH_LVL3) ?
REDUCED_SEC_STRENGTHS_LVL3 + : CDEF_SEC_STRENGTHS); + const int pri_idx = strength_idx / tot_sec_filter; + const int sec_idx = strength_idx % tot_sec_filter; + *pri_strength = pri_idx; + *sec_strength = sec_idx; + if (pick_method == CDEF_FULL_SEARCH) return; + + switch (pick_method) { + case CDEF_FAST_SEARCH_LVL1: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL1); + *pri_strength = priconv_lvl1[pri_idx]; + break; + case CDEF_FAST_SEARCH_LVL2: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2); + *pri_strength = priconv_lvl2[pri_idx]; + break; + case CDEF_FAST_SEARCH_LVL3: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2); + assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3); + *pri_strength = priconv_lvl2[pri_idx]; + *sec_strength = secconv_lvl3[sec_idx]; + break; + case CDEF_FAST_SEARCH_LVL4: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4); + assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3); + *pri_strength = priconv_lvl4[pri_idx]; + *sec_strength = secconv_lvl3[sec_idx]; + break; + case CDEF_FAST_SEARCH_LVL5: + assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4); + assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL5); + *pri_strength = priconv_lvl5[pri_idx]; + *sec_strength = secconv_lvl5[sec_idx]; + break; + default: assert(0 && "Invalid CDEF search method"); + } +} + +// Store CDEF filter strength calculated from strength index for given search +// method +#define STORE_CDEF_FILTER_STRENGTH(cdef_strength, pick_method, strength_idx) \ + do { \ + get_cdef_filter_strengths((pick_method), &pri_strength, &sec_strength, \ + (strength_idx)); \ + cdef_strength = pri_strength * CDEF_SEC_STRENGTHS + sec_strength; \ + } while (0) + +/* Search for the best strength to add as an option, knowing we + already selected nb_strengths options. */ +static uint64_t search_one(int *lev, int nb_strengths, + uint64_t mse[][TOTAL_STRENGTHS], int sb_count, + CDEF_PICK_METHOD pick_method) { + uint64_t tot_mse[TOTAL_STRENGTHS]; + const int total_strengths = nb_cdef_strengths[pick_method]; + int i, j; + uint64_t best_tot_mse = (uint64_t)1 << 63; + int best_id = 0; + memset(tot_mse, 0, sizeof(tot_mse)); + for (i = 0; i < sb_count; i++) { + int gi; + uint64_t best_mse = (uint64_t)1 << 63; + /* Find best mse among already selected options. */ + for (gi = 0; gi < nb_strengths; gi++) { + if (mse[i][lev[gi]] < best_mse) { + best_mse = mse[i][lev[gi]]; + } + } + /* Find best mse when adding each possible new option. */ + for (j = 0; j < total_strengths; j++) { + uint64_t best = best_mse; + if (mse[i][j] < best) best = mse[i][j]; + tot_mse[j] += best; + } + } + for (j = 0; j < total_strengths; j++) { + if (tot_mse[j] < best_tot_mse) { + best_tot_mse = tot_mse[j]; + best_id = j; + } + } + lev[nb_strengths] = best_id; + return best_tot_mse; +} + +/* Search for the best luma+chroma strength to add as an option, knowing we + already selected nb_strengths options. */ +static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths, + uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count, + CDEF_PICK_METHOD pick_method) { + uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS]; + int i, j; + uint64_t best_tot_mse = (uint64_t)1 << 63; + int best_id0 = 0; + int best_id1 = 0; + const int total_strengths = nb_cdef_strengths[pick_method]; + memset(tot_mse, 0, sizeof(tot_mse)); + for (i = 0; i < sb_count; i++) { + int gi; + uint64_t best_mse = (uint64_t)1 << 63; + /* Find best mse among already selected options. 
*/ + for (gi = 0; gi < nb_strengths; gi++) { + uint64_t curr = mse[0][i][lev0[gi]]; + curr += mse[1][i][lev1[gi]]; + if (curr < best_mse) { + best_mse = curr; + } + } + /* Find best mse when adding each possible new option. */ + for (j = 0; j < total_strengths; j++) { + int k; + for (k = 0; k < total_strengths; k++) { + uint64_t best = best_mse; + uint64_t curr = mse[0][i][j]; + curr += mse[1][i][k]; + if (curr < best) best = curr; + tot_mse[j][k] += best; + } + } + } + for (j = 0; j < total_strengths; j++) { + int k; + for (k = 0; k < total_strengths; k++) { + if (tot_mse[j][k] < best_tot_mse) { + best_tot_mse = tot_mse[j][k]; + best_id0 = j; + best_id1 = k; + } + } + } + lev0[nb_strengths] = best_id0; + lev1[nb_strengths] = best_id1; + return best_tot_mse; +} + +/* Search for the set of strengths that minimizes mse. */ +static uint64_t joint_strength_search(int *best_lev, int nb_strengths, + uint64_t mse[][TOTAL_STRENGTHS], + int sb_count, + CDEF_PICK_METHOD pick_method) { + uint64_t best_tot_mse; + int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 && + pick_method <= CDEF_FAST_SEARCH_LVL5); + int i; + best_tot_mse = (uint64_t)1 << 63; + /* Greedy search: add one strength options at a time. */ + for (i = 0; i < nb_strengths; i++) { + best_tot_mse = search_one(best_lev, i, mse, sb_count, pick_method); + } + /* Trying to refine the greedy search by reconsidering each + already-selected option. */ + if (!fast) { + for (i = 0; i < 4 * nb_strengths; i++) { + int j; + for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1]; + best_tot_mse = + search_one(best_lev, nb_strengths - 1, mse, sb_count, pick_method); + } + } + return best_tot_mse; +} + +/* Search for the set of luma+chroma strengths that minimizes mse. */ +static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1, + int nb_strengths, + uint64_t (**mse)[TOTAL_STRENGTHS], + int sb_count, + CDEF_PICK_METHOD pick_method) { + uint64_t best_tot_mse; + int i; + best_tot_mse = (uint64_t)1 << 63; + /* Greedy search: add one strength options at a time. */ + for (i = 0; i < nb_strengths; i++) { + best_tot_mse = + search_one_dual(best_lev0, best_lev1, i, mse, sb_count, pick_method); + } + /* Trying to refine the greedy search by reconsidering each + already-selected option. */ + for (i = 0; i < 4 * nb_strengths; i++) { + int j; + for (j = 0; j < nb_strengths - 1; j++) { + best_lev0[j] = best_lev0[j + 1]; + best_lev1[j] = best_lev1[j + 1]; + } + best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse, + sb_count, pick_method); + } + return best_tot_mse; +} + +static INLINE void init_src_params(int *src_stride, int *width, int *height, + int *width_log2, int *height_log2, + BLOCK_SIZE bsize) { + *src_stride = block_size_wide[bsize]; + *width = block_size_wide[bsize]; + *height = block_size_high[bsize]; + *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize]; + *height_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize]; +} +#if CONFIG_AV1_HIGHBITDEPTH +/* Compute MSE only on the blocks we filtered. 
*/ +static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, + int row, int col) { + assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_8X8); + uint64_t sum = 0; + int bi, bx, by; + uint16_t *dst16 = CONVERT_TO_SHORTPTR((uint8_t *)dst); + uint16_t *dst_buff = &dst16[row * dstride + col]; + int src_stride, width, height, width_log2, height_log2; + init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, + bsize); + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + sum += aom_mse_wxh_16bit_highbd( + &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride, + &src[bi << (height_log2 + width_log2)], src_stride, width, height); + } + return sum >> 2 * coeff_shift; +} +#endif + +// Checks dual and quad block processing is applicable for block widths 8 and 4 +// respectively. +static INLINE int is_dual_or_quad_applicable(cdef_list *dlist, int width, + int cdef_count, int bi, int iter) { + assert(width == 8 || width == 4); + const int blk_offset = (width == 8) ? 1 : 3; + if ((iter + blk_offset) >= cdef_count) return 0; + + if (dlist[bi].by == dlist[bi + blk_offset].by && + dlist[bi].bx + blk_offset == dlist[bi + blk_offset].bx) + return 1; + + return 0; +} + +static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, int row, + int col) { + assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_8X8); + uint64_t sum = 0; + int bi, bx, by; + int iter = 0; + int inc = 1; + uint8_t *dst8 = (uint8_t *)dst; + uint8_t *dst_buff = &dst8[row * dstride + col]; + int src_stride, width, height, width_log2, height_log2; + init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, + bsize); + + const int num_blks = 16 / width; + for (bi = 0; bi < cdef_count; bi += inc) { + by = dlist[bi].by; + bx = dlist[bi].bx; + uint16_t *src_tmp = &src[bi << (height_log2 + width_log2)]; + uint8_t *dst_tmp = + &dst_buff[(by << height_log2) * dstride + (bx << width_log2)]; + + if (is_dual_or_quad_applicable(dlist, width, cdef_count, bi, iter)) { + sum += aom_mse_16xh_16bit(dst_tmp, dstride, src_tmp, width, height); + iter += num_blks; + inc = num_blks; + } else { + sum += aom_mse_wxh_16bit(dst_tmp, dstride, src_tmp, src_stride, width, + height); + iter += 1; + inc = 1; + } + } + + return sum >> 2 * coeff_shift; +} + +// Fill the boundary regions of the block with CDEF_VERY_LARGE, only if the +// region is outside frame boundary +static INLINE void fill_borders_for_fbs_on_frame_boundary( + uint16_t *inbuf, int hfilt_size, int vfilt_size, + bool is_fb_on_frm_left_boundary, bool is_fb_on_frm_right_boundary, + bool is_fb_on_frm_top_boundary, bool is_fb_on_frm_bottom_boundary) { + if (!is_fb_on_frm_left_boundary && !is_fb_on_frm_right_boundary && + !is_fb_on_frm_top_boundary && !is_fb_on_frm_bottom_boundary) + return; + if (is_fb_on_frm_bottom_boundary) { + // Fill bottom region of the block + const int buf_offset = + (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + CDEF_HBORDER; + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_left_boundary) { + const int buf_offset = (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE; + // Fill bottom-left region of the block + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, 
CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_right_boundary) { + const int buf_offset = + (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + hfilt_size + CDEF_HBORDER; + // Fill bottom-right region of the block + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_top_boundary) { + // Fill top region of the block + fill_rect(&inbuf[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_top_boundary || is_fb_on_frm_left_boundary) { + // Fill top-left region of the block + fill_rect(inbuf, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (is_fb_on_frm_top_boundary || is_fb_on_frm_right_boundary) { + const int buf_offset = hfilt_size + CDEF_HBORDER; + // Fill top-right region of the block + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_left_boundary) { + const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE; + // Fill left region of the block + fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, vfilt_size, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (is_fb_on_frm_right_boundary) { + const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE; + // Fill right region of the block + fill_rect(&inbuf[buf_offset + hfilt_size + CDEF_HBORDER], CDEF_BSTRIDE, + vfilt_size, CDEF_HBORDER, CDEF_VERY_LARGE); + } +} + +// Calculate the number of 8x8/4x4 filter units for which SSE can be calculated +// after CDEF filtering in single function call +static AOM_FORCE_INLINE int get_error_calc_width_in_filt_units( + cdef_list *dlist, int cdef_count, int bi, int subsampling_x, + int subsampling_y) { + // TODO(Ranjit): Extend the optimization for 422 + if (subsampling_x != subsampling_y) return 1; + + // Combining more blocks seems to increase encode time due to increase in + // control code + if (bi + 3 < cdef_count && dlist[bi].by == dlist[bi + 3].by && + dlist[bi].bx + 3 == dlist[bi + 3].bx) { + /* Calculate error for four 8x8/4x4 blocks using 32x8/16x4 block specific + * logic if y co-ordinates match and x co-ordinates are + * separated by 3 for first and fourth 8x8/4x4 blocks in dlist[]. */ + return 4; + } + if (bi + 1 < cdef_count && dlist[bi].by == dlist[bi + 1].by && + dlist[bi].bx + 1 == dlist[bi + 1].bx) { + /* Calculate error for two 8x8/4x4 blocks using 16x8/8x4 block specific + * logic if their y co-ordinates match and x co-ordinates are + * separated by 1 for first and second 8x8/4x4 blocks in dlist[]. 
*/ + return 2; + } + return 1; +} + +// Returns the block error after CDEF filtering for a given strength +static INLINE uint64_t get_filt_error( + const CdefSearchCtx *cdef_search_ctx, const struct macroblockd_plane *pd, + cdef_list *dlist, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], uint16_t *in, uint8_t *ref_buffer, + int ref_stride, int row, int col, int pri_strength, int sec_strength, + int cdef_count, int pli, int coeff_shift, BLOCK_SIZE bs) { + uint64_t curr_sse = 0; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bs, pd->subsampling_x, pd->subsampling_y); + const int bw_log2 = 3 - pd->subsampling_x; + const int bh_log2 = 3 - pd->subsampling_y; + + // TODO(Ranjit): Extend this optimization for HBD + if (!cdef_search_ctx->use_highbitdepth) { + // If all 8x8/4x4 blocks in CDEF block need to be filtered, calculate the + // error at CDEF block level + const int tot_blk_count = + (block_size_wide[plane_bsize] * block_size_high[plane_bsize]) >> + (bw_log2 + bh_log2); + if (cdef_count == tot_blk_count) { + // Calculate the offset in the buffer based on block position + const FULLPEL_MV this_mv = { row, col }; + const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride); + if (pri_strength == 0 && sec_strength == 0) { + // When CDEF strength is zero, filtering is not applied. Hence + // error is calculated between source and unfiltered pixels + curr_sse = + aom_sse(&ref_buffer[buf_offset], ref_stride, + get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride, + block_size_wide[plane_bsize], block_size_high[plane_bsize]); + } else { + DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]); + + av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in, + cdef_search_ctx->xdec[pli], + cdef_search_ctx->ydec[pli], dir, dirinit, var, pli, + dlist, cdef_count, pri_strength, + sec_strength + (sec_strength == 3), + cdef_search_ctx->damping, coeff_shift); + curr_sse = + aom_sse(&ref_buffer[buf_offset], ref_stride, tmp_dst8, + (1 << MAX_SB_SIZE_LOG2), block_size_wide[plane_bsize], + block_size_high[plane_bsize]); + } + } else { + // If few 8x8/4x4 blocks in CDEF block need to be filtered, filtering + // functions produce 8-bit output and the error is calculated in 8-bit + // domain + if (pri_strength == 0 && sec_strength == 0) { + int num_error_calc_filt_units = 1; + for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) { + const uint8_t by = dlist[bi].by; + const uint8_t bx = dlist[bi].bx; + const int16_t by_pos = (by << bh_log2); + const int16_t bx_pos = (bx << bw_log2); + // Calculate the offset in the buffer based on block position + const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos }; + const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride); + num_error_calc_filt_units = get_error_calc_width_in_filt_units( + dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y); + curr_sse += aom_sse( + &ref_buffer[buf_offset], ref_stride, + get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride, + num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2)); + } + } else { + DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]); + av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in, + cdef_search_ctx->xdec[pli], + cdef_search_ctx->ydec[pli], dir, dirinit, var, pli, + dlist, cdef_count, pri_strength, + sec_strength + (sec_strength == 3), + cdef_search_ctx->damping, coeff_shift); + int num_error_calc_filt_units = 1; + for (int bi = 0; bi < cdef_count; bi = bi + 
num_error_calc_filt_units) { + const uint8_t by = dlist[bi].by; + const uint8_t bx = dlist[bi].bx; + const int16_t by_pos = (by << bh_log2); + const int16_t bx_pos = (bx << bw_log2); + // Calculate the offset in the buffer based on block position + const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos }; + const FULLPEL_MV tmp_buf_pos = { by_pos, bx_pos }; + const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride); + const int tmp_buf_offset = + get_offset_from_fullmv(&tmp_buf_pos, (1 << MAX_SB_SIZE_LOG2)); + num_error_calc_filt_units = get_error_calc_width_in_filt_units( + dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y); + curr_sse += aom_sse( + &ref_buffer[buf_offset], ref_stride, &tmp_dst8[tmp_buf_offset], + (1 << MAX_SB_SIZE_LOG2), + num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2)); + } + } + } + } else { + DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]); + + av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, + cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli], + dir, dirinit, var, pli, dlist, cdef_count, pri_strength, + sec_strength + (sec_strength == 3), + cdef_search_ctx->damping, coeff_shift); + curr_sse = cdef_search_ctx->compute_cdef_dist_fn( + ref_buffer, ref_stride, tmp_dst, dlist, cdef_count, + cdef_search_ctx->bsize[pli], coeff_shift, row, col); + } + return curr_sse; +} + +// Calculates MSE at block level. +// Inputs: +// cdef_search_ctx: Pointer to the structure containing parameters related to +// CDEF search context. +// fbr: Row index in units of 64x64 block +// fbc: Column index in units of 64x64 block +// Returns: +// Nothing will be returned. Contents of cdef_search_ctx will be modified. +void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, + struct aom_internal_error_info *error_info, + int fbr, int fbc, int sb_count) { + // TODO(aomedia:3276): Pass error_info to the low-level functions as required + // in future to handle error propagation. + (void)error_info; + const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params; + const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref; + const int coeff_shift = cdef_search_ctx->coeff_shift; + const int *mi_wide_l2 = cdef_search_ctx->mi_wide_l2; + const int *mi_high_l2 = cdef_search_ctx->mi_high_l2; + + // Declare and initialize the temporary buffers. 
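+  // inbuf holds a padded 16-bit working copy of the superblock: 'in' points
+  // past CDEF_VBORDER rows and CDEF_HBORDER columns of border, and border
+  // areas that fall outside the frame are later filled with CDEF_VERY_LARGE
+  // (see fill_borders_for_fbs_on_frame_boundary()) so that the filter can
+  // recognise those samples as unavailable.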
+ DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]); + cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128]; + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; + int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; + uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER; + int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + int hb_step = 1, vb_step = 1; + BLOCK_SIZE bs; + + const MB_MODE_INFO *const mbmi = + mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc]; + + uint8_t *ref_buffer[MAX_MB_PLANE] = { ref->y_buffer, ref->u_buffer, + ref->v_buffer }; + int ref_stride[MAX_MB_PLANE] = { ref->y_stride, ref->uv_stride, + ref->uv_stride }; + + if (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64 || + mbmi->bsize == BLOCK_64X128) { + bs = mbmi->bsize; + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) { + nhb = AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + hb_step = 2; + } + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) { + nvb = AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + vb_step = 2; + } + } else { + bs = BLOCK_64X64; + } + // Get number of 8x8 blocks which are not skip. Cdef processing happens for + // 8x8 blocks which are not skip. + const int cdef_count = av1_cdef_compute_sb_list( + mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs); + const bool is_fb_on_frm_left_boundary = (fbc == 0); + const bool is_fb_on_frm_right_boundary = + (fbc + hb_step == cdef_search_ctx->nhfb); + const bool is_fb_on_frm_top_boundary = (fbr == 0); + const bool is_fb_on_frm_bottom_boundary = + (fbr + vb_step == cdef_search_ctx->nvfb); + const int yoff = CDEF_VBORDER * (!is_fb_on_frm_top_boundary); + const int xoff = CDEF_HBORDER * (!is_fb_on_frm_left_boundary); + int dirinit = 0; + for (int pli = 0; pli < cdef_search_ctx->num_planes; pli++) { + /* We avoid filtering the pixels for which some of the pixels to + average are outside the frame. We could change the filter instead, + but it would add special cases for any future vectorization. 
*/ + const int hfilt_size = (nhb << mi_wide_l2[pli]); + const int vfilt_size = (nvb << mi_high_l2[pli]); + const int ysize = + vfilt_size + CDEF_VBORDER * (!is_fb_on_frm_bottom_boundary) + yoff; + const int xsize = + hfilt_size + CDEF_HBORDER * (!is_fb_on_frm_right_boundary) + xoff; + const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli]; + const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli]; + struct macroblockd_plane pd = cdef_search_ctx->plane[pli]; + cdef_search_ctx->copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE, + pd.dst.buf, row - yoff, col - xoff, pd.dst.stride, + ysize, xsize); + fill_borders_for_fbs_on_frame_boundary( + inbuf, hfilt_size, vfilt_size, is_fb_on_frm_left_boundary, + is_fb_on_frm_right_boundary, is_fb_on_frm_top_boundary, + is_fb_on_frm_bottom_boundary); + for (int gi = 0; gi < cdef_search_ctx->total_strengths; gi++) { + int pri_strength, sec_strength; + get_cdef_filter_strengths(cdef_search_ctx->pick_method, &pri_strength, + &sec_strength, gi); + const uint64_t curr_mse = get_filt_error( + cdef_search_ctx, &pd, dlist, dir, &dirinit, var, in, ref_buffer[pli], + ref_stride[pli], row, col, pri_strength, sec_strength, cdef_count, + pli, coeff_shift, bs); + if (pli < 2) + cdef_search_ctx->mse[pli][sb_count][gi] = curr_mse; + else + cdef_search_ctx->mse[1][sb_count][gi] += curr_mse; + } + } + cdef_search_ctx->sb_index[sb_count] = + MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc; +} + +// MSE calculation at frame level. +// Inputs: +// cdef_search_ctx: Pointer to the structure containing parameters related to +// CDEF search context. +// Returns: +// Nothing will be returned. Contents of cdef_search_ctx will be modified. +static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx, + struct aom_internal_error_info *error_info) { + // Loop over each sb. + for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) { + for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) { + // Checks if cdef processing can be skipped for particular sb. + if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue; + // Calculate mse for each sb and store the relevant sb index. + av1_cdef_mse_calc_block(cdef_search_ctx, error_info, fbr, fbc, + cdef_search_ctx->sb_count); + cdef_search_ctx->sb_count++; + } + } +} + +// Allocates memory for members of CdefSearchCtx. +// Inputs: +// cdef_search_ctx: Pointer to the structure containing parameters +// related to CDEF search context. +// Returns: +// Nothing will be returned. Contents of cdef_search_ctx will be modified. +static void cdef_alloc_data(AV1_COMMON *cm, CdefSearchCtx *cdef_search_ctx) { + const int nvfb = cdef_search_ctx->nvfb; + const int nhfb = cdef_search_ctx->nhfb; + CHECK_MEM_ERROR( + cm, cdef_search_ctx->sb_index, + aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0]))); + cdef_search_ctx->sb_count = 0; + CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[0], + aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb)); + CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[1], + aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb)); +} + +// Deallocates the memory allocated for members of CdefSearchCtx. +// Inputs: +// cdef_search_ctx: Pointer to the structure containing parameters +// related to CDEF search context. +// Returns: +// Nothing will be returned. 
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) { + if (cdef_search_ctx) { + aom_free(cdef_search_ctx->mse[0]); + cdef_search_ctx->mse[0] = NULL; + aom_free(cdef_search_ctx->mse[1]); + cdef_search_ctx->mse[1] = NULL; + aom_free(cdef_search_ctx->sb_index); + cdef_search_ctx->sb_index = NULL; + } +} + +// Initialize the parameters related to CDEF search context. +// Inputs: +// frame: Pointer to compressed frame buffer +// ref: Pointer to the frame buffer holding the source frame +// cm: Pointer to top level common structure +// xd: Pointer to common current coding block structure +// cdef_search_ctx: Pointer to the structure containing parameters related to +// CDEF search context. +// pick_method: Search method used to select CDEF parameters +// Returns: +// Nothing will be returned. Contents of cdef_search_ctx will be modified. +static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame, + const YV12_BUFFER_CONFIG *ref, + AV1_COMMON *cm, MACROBLOCKD *xd, + CdefSearchCtx *cdef_search_ctx, + CDEF_PICK_METHOD pick_method) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + cdef_search_ctx->mi_params = &cm->mi_params; + cdef_search_ctx->ref = ref; + cdef_search_ctx->nvfb = + (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + cdef_search_ctx->nhfb = + (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); + cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6); + cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method]; + cdef_search_ctx->num_planes = num_planes; + cdef_search_ctx->pick_method = pick_method; + cdef_search_ctx->sb_count = 0; + cdef_search_ctx->use_highbitdepth = cm->seq_params->use_highbitdepth; + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, + num_planes); + // Initialize plane wise information. + for (int pli = 0; pli < num_planes; pli++) { + cdef_search_ctx->xdec[pli] = xd->plane[pli].subsampling_x; + cdef_search_ctx->ydec[pli] = xd->plane[pli].subsampling_y; + cdef_search_ctx->bsize[pli] = + cdef_search_ctx->ydec[pli] + ? (cdef_search_ctx->xdec[pli] ? BLOCK_4X4 : BLOCK_8X4) + : (cdef_search_ctx->xdec[pli] ? BLOCK_4X8 : BLOCK_8X8); + cdef_search_ctx->mi_wide_l2[pli] = + MI_SIZE_LOG2 - xd->plane[pli].subsampling_x; + cdef_search_ctx->mi_high_l2[pli] = + MI_SIZE_LOG2 - xd->plane[pli].subsampling_y; + cdef_search_ctx->plane[pli] = xd->plane[pli]; + } + // Function pointer initialization. +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params->use_highbitdepth) { + cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_highbd; + cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd; + } else { + cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd; + cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist; + } +#else + cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd; + cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist; +#endif +} + +void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef, + int is_screen_content) { + const int bd = cm->seq_params->bit_depth; + const int q = + av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8); + CdefInfo *const cdef_info = &cm->cdef_info; + // Check the speed feature to avoid extra signaling. 
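cdef_params_init() above derives each plane's CDEF unit block size from the chroma subsampling flags with a nested ternary. A minimal sketch of that mapping, using a local enum rather than libaom's BLOCK_SIZE values:

#include <stdio.h>

typedef enum { BS_4X4, BS_4X8, BS_8X4, BS_8X8 } ToyBlockSize;

static ToyBlockSize cdef_unit_bsize(int xdec, int ydec) {
  /* Chroma subsampling halves the CDEF unit in the subsampled direction. */
  return ydec ? (xdec ? BS_4X4 : BS_8X4) : (xdec ? BS_4X8 : BS_8X8);
}

int main(void) {
  /* Luma (no subsampling) uses 8x8 units; 4:2:0 chroma uses 4x4 units. */
  printf("luma=%d chroma420=%d\n", cdef_unit_bsize(0, 0), cdef_unit_bsize(1, 1));
  return 0;
}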
+ if (skip_cdef) { + cdef_info->cdef_bits = 1; + cdef_info->nb_cdef_strengths = 2; + } else { + cdef_info->cdef_bits = 0; + cdef_info->nb_cdef_strengths = 1; + } + cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6); + + int predicted_y_f1 = 0; + int predicted_y_f2 = 0; + int predicted_uv_f1 = 0; + int predicted_uv_f2 = 0; + if (is_screen_content) { + predicted_y_f1 = + (int)(5.88217781e-06 * q * q + 6.10391455e-03 * q + 9.95043102e-02); + predicted_y_f2 = + (int)(-7.79934857e-06 * q * q + 6.58957830e-03 * q + 8.81045025e-01); + predicted_uv_f1 = + (int)(-6.79500136e-06 * q * q + 1.02695586e-02 * q + 1.36126802e-01); + predicted_uv_f2 = + (int)(-9.99613695e-08 * q * q - 1.79361339e-05 * q + 1.17022324e+0); + predicted_y_f1 = clamp(predicted_y_f1, 0, 15); + predicted_y_f2 = clamp(predicted_y_f2, 0, 3); + predicted_uv_f1 = clamp(predicted_uv_f1, 0, 15); + predicted_uv_f2 = clamp(predicted_uv_f2, 0, 3); + } else { + if (!frame_is_intra_only(cm)) { + predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f + + q * 0.0068615186f + 0.02709886f), + 0, 15); + predicted_y_f2 = clamp((int)roundf(q * q * -0.00000057629734f + + q * 0.0013993345f + 0.03831067f), + 0, 3); + predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000007095069f + + q * 0.0034628846f + 0.00887099f), + 0, 15); + predicted_uv_f2 = clamp((int)roundf(q * q * 0.00000023874085f + + q * 0.00028223585f + 0.05576307f), + 0, 3); + } else { + predicted_y_f1 = clamp( + (int)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f), + 0, 15); + predicted_y_f2 = clamp((int)roundf(q * q * 0.0000029167343f + + q * 0.0027798624f + 0.0079405f), + 0, 3); + predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000130790995f + + q * 0.012892405f - 0.00748388f), + 0, 15); + predicted_uv_f2 = clamp((int)roundf(q * q * 0.0000032651783f + + q * 0.00035520183f + 0.00228092f), + 0, 3); + } + } + cdef_info->cdef_strengths[0] = + predicted_y_f1 * CDEF_SEC_STRENGTHS + predicted_y_f2; + cdef_info->cdef_uv_strengths[0] = + predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2; + + // mbmi->cdef_strength is already set in the encoding stage. We don't need to + // set it again here. + if (skip_cdef) { + cdef_info->cdef_strengths[1] = 0; + cdef_info->cdef_uv_strengths[1] = 0; + return; + } + + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + MB_MODE_INFO **mbmi = mi_params->mi_grid_base; + // mbmi is NULL when real-time rate control library is used. 
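The predicted strengths computed above are packed into a single index as pri * CDEF_SEC_STRENGTHS + sec before being stored in cdef_strengths[0]. A small standalone check of that packing and its inverse; CDEF_SEC_STRENGTHS is defined locally here and assumed to be 4 (two secondary-strength bits):

#include <assert.h>
#include <stdio.h>

#define CDEF_SEC_STRENGTHS 4  /* assumed: secondary strengths per primary strength */

int main(void) {
  const int pri = 9, sec = 2;  /* e.g. clamped predictions from the fits above */
  const int packed = pri * CDEF_SEC_STRENGTHS + sec;
  /* Dividing and taking the remainder recovers the original pair. */
  assert(packed / CDEF_SEC_STRENGTHS == pri);
  assert(packed % CDEF_SEC_STRENGTHS == sec);
  printf("packed strength index = %d\n", packed);
  return 0;
}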
+ if (!mbmi) return; + for (int r = 0; r < nvfb; ++r) { + for (int c = 0; c < nhfb; ++c) { + MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c]; + current_mbmi->cdef_strength = 0; + } + mbmi += MI_SIZE_64X64 * mi_params->mi_stride; + } +} + +void av1_cdef_search(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + CDEF_CONTROL cdef_control = cpi->oxcf.tool_cfg.cdef_control; + + assert(cdef_control != CDEF_NONE); + if (cdef_control == CDEF_REFERENCE && cpi->ppi->rtc_ref.non_reference_frame) { + CdefInfo *const cdef_info = &cm->cdef_info; + cdef_info->nb_cdef_strengths = 1; + cdef_info->cdef_bits = 0; + cdef_info->cdef_strengths[0] = 0; + cdef_info->cdef_uv_strengths[0] = 0; + return; + } + + // Indicate if external RC is used for testing + const int rtc_ext_rc = cpi->rc.rtc_external_ratectrl; + if (rtc_ext_rc) { + av1_pick_cdef_from_qp(cm, 0, 0); + return; + } + CDEF_PICK_METHOD pick_method = cpi->sf.lpf_sf.cdef_pick_method; + if (pick_method == CDEF_PICK_FROM_Q) { + const int use_screen_content_model = + cm->quant_params.base_qindex > + AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh, + cpi->rc.best_quality + 5) && + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; + av1_pick_cdef_from_qp(cm, cpi->sf.rt_sf.skip_cdef_sb, + use_screen_content_model); + return; + } + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int damping = 3 + (cm->quant_params.base_qindex >> 6); + const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 && + pick_method <= CDEF_FAST_SEARCH_LVL5); + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + + if (!cpi->cdef_search_ctx) + CHECK_MEM_ERROR(cm, cpi->cdef_search_ctx, + aom_malloc(sizeof(*cpi->cdef_search_ctx))); + CdefSearchCtx *cdef_search_ctx = cpi->cdef_search_ctx; + + // Initialize parameters related to CDEF search context. + cdef_params_init(&cm->cur_frame->buf, cpi->source, cm, xd, cdef_search_ctx, + pick_method); + // Allocate CDEF search context buffers. + cdef_alloc_data(cm, cdef_search_ctx); + // Frame level mse calculation. + if (cpi->mt_info.num_workers > 1) { + av1_cdef_mse_calc_frame_mt(cpi); + } else { + cdef_mse_calc_frame(cdef_search_ctx, cm->error); + } + + /* Search for different number of signaling bits. */ + int nb_strength_bits = 0; + uint64_t best_rd = UINT64_MAX; + CdefInfo *const cdef_info = &cm->cdef_info; + int sb_count = cdef_search_ctx->sb_count; + uint64_t(*mse[2])[TOTAL_STRENGTHS]; + mse[0] = cdef_search_ctx->mse[0]; + mse[1] = cdef_search_ctx->mse[1]; + /* Calculate the maximum number of bits required to signal CDEF strengths at + * block level */ + const int total_strengths = nb_cdef_strengths[pick_method]; + const int joint_strengths = + num_planes > 1 ? total_strengths * total_strengths : total_strengths; + const int max_signaling_bits = + joint_strengths == 1 ? 0 : get_msb(joint_strengths - 1) + 1; + int rdmult = cpi->td.mb.rdmult; + for (int i = 0; i <= 3; i++) { + if (i > max_signaling_bits) break; + int best_lev0[CDEF_MAX_STRENGTHS]; + int best_lev1[CDEF_MAX_STRENGTHS] = { 0 }; + const int nb_strengths = 1 << i; + uint64_t tot_mse; + if (num_planes > 1) { + tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, + mse, sb_count, pick_method); + } else { + tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count, + pick_method); + } + + const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS * + (num_planes > 1 ? 
2 : 1); + const int rate_cost = av1_cost_literal(total_bits); + const uint64_t dist = tot_mse * 16; + const uint64_t rd = RDCOST(rdmult, rate_cost, dist); + if (rd < best_rd) { + best_rd = rd; + nb_strength_bits = i; + memcpy(cdef_info->cdef_strengths, best_lev0, + nb_strengths * sizeof(best_lev0[0])); + if (num_planes > 1) { + memcpy(cdef_info->cdef_uv_strengths, best_lev1, + nb_strengths * sizeof(best_lev1[0])); + } + } + } + + cdef_info->cdef_bits = nb_strength_bits; + cdef_info->nb_cdef_strengths = 1 << nb_strength_bits; + for (int i = 0; i < sb_count; i++) { + uint64_t best_mse = UINT64_MAX; + int best_gi = 0; + for (int gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) { + uint64_t curr = mse[0][i][cdef_info->cdef_strengths[gi]]; + if (num_planes > 1) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]]; + if (curr < best_mse) { + best_gi = gi; + best_mse = curr; + } + } + mi_params->mi_grid_base[cdef_search_ctx->sb_index[i]]->cdef_strength = + best_gi; + } + if (fast) { + for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) { + const int luma_strength = cdef_info->cdef_strengths[j]; + const int chroma_strength = cdef_info->cdef_uv_strengths[j]; + int pri_strength, sec_strength; + + STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_strengths[j], pick_method, + luma_strength); + STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_uv_strengths[j], pick_method, + chroma_strength); + } + } + + cdef_info->cdef_damping = damping; + // Deallocate CDEF search context buffers. + av1_cdef_dealloc_data(cdef_search_ctx); +} diff --git a/third_party/aom/av1/encoder/pickcdef.h b/third_party/aom/av1/encoder/pickcdef.h new file mode 100644 index 0000000000..192e734fb0 --- /dev/null +++ b/third_party/aom/av1/encoder/pickcdef.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_ENCODER_PICKCDEF_H_ +#define AOM_AV1_ENCODER_PICKCDEF_H_ + +#include "av1/common/cdef.h" +#include "av1/encoder/speed_features.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\enum CDEF_CONTROL + * \brief This enum controls to which frames CDEF is applied. + */ +typedef enum { + CDEF_NONE = 0, /*!< Disable CDEF on all frames. */ + CDEF_ALL = 1, /*!< Enable CDEF for all frames. */ + CDEF_REFERENCE = 2, /*!< Disable CDEF on non reference frames. 
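The loop above trades the signalling cost (i bits per 64x64 block plus the bits needed to code the 1 << i strength presets) against the total filtered MSE. The toy program below uses plain arithmetic, a single plane, and made-up numbers; it does not use the RDCOST macro or av1_cost_literal(), and the per-preset bit count is an assumption. It only illustrates why a mid-sized preset count can win the trade-off:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int sb_count = 100;       /* 64x64 filter blocks in the frame */
  const int strength_bits = 6;    /* assumed bits to code one strength preset */
  const double lambda = 40.0;     /* toy rate-distortion multiplier */
  /* Hypothetical total MSE when 1, 2, 4 or 8 presets are available. */
  const uint64_t mse_for_nb[4] = { 90000, 78000, 74000, 73500 };
  double best_rd = -1.0;
  int best_bits = 0;
  for (int i = 0; i <= 3; ++i) {
    const int nb_strengths = 1 << i;
    const int rate = sb_count * i + nb_strengths * strength_bits;
    const double rd = lambda * rate + (double)mse_for_nb[i];
    if (best_rd < 0 || rd < best_rd) { best_rd = rd; best_bits = i; }
  }
  printf("chosen cdef_bits = %d\n", best_bits);  /* 1 with these toy numbers */
  return 0;
}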
*/ +} CDEF_CONTROL; + +/*!\cond */ +struct MultiThreadInfo; + +#define REDUCED_PRI_STRENGTHS_LVL1 8 +#define REDUCED_PRI_STRENGTHS_LVL2 5 +#define REDUCED_SEC_STRENGTHS_LVL3 2 +#define REDUCED_SEC_STRENGTHS_LVL5 1 +#define REDUCED_PRI_STRENGTHS_LVL4 2 + +#define REDUCED_TOTAL_STRENGTHS_LVL1 \ + (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS) +#define REDUCED_TOTAL_STRENGTHS_LVL2 \ + (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS) +#define REDUCED_TOTAL_STRENGTHS_LVL3 \ + (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3) +#define REDUCED_TOTAL_STRENGTHS_LVL4 \ + (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3) +#define REDUCED_TOTAL_STRENGTHS_LVL5 \ + (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL5) +#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS) + +static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2, 3, + 5, 7, 10, 13 }; +static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 }; +static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 }; +static const int priconv_lvl5[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 5 }; +static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 }; +static const int secconv_lvl5[REDUCED_SEC_STRENGTHS_LVL5] = { 0 }; +static const int nb_cdef_strengths[CDEF_PICK_METHODS] = { + TOTAL_STRENGTHS, + REDUCED_TOTAL_STRENGTHS_LVL1, + REDUCED_TOTAL_STRENGTHS_LVL2, + REDUCED_TOTAL_STRENGTHS_LVL3, + REDUCED_TOTAL_STRENGTHS_LVL4, + REDUCED_TOTAL_STRENGTHS_LVL5, + TOTAL_STRENGTHS +}; + +typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const uint8_t *src, + int src_voffset, int src_hoffset, int sstride, + int vsize, int hsize); +typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, + int row, int col); + +/*! \brief CDEF search context. + */ +typedef struct { + /*! + * Pointer to the frame buffer holding the source frame + */ + const YV12_BUFFER_CONFIG *ref; + /*! + * Pointer to params related to MB_MODE_INFO arrays and related info + */ + CommonModeInfoParams *mi_params; + /*! + * Info specific to each plane + */ + struct macroblockd_plane plane[MAX_MB_PLANE]; + /*! + * Function pointer of copy_fn + */ + copy_fn_t copy_fn; + /*! + * Function pointer of compute_cdef_dist_fn + */ + compute_cdef_dist_t compute_cdef_dist_fn; + /*! + * Number of strenghts evaluated in CDEF filter search + */ + int total_strengths; + /*! + * Bit-depth dependent shift + */ + int coeff_shift; + /*! + * CDEF damping factor + */ + int damping; + /*! + * Search method used to select CDEF parameters + */ + int pick_method; + /*! + * Number of planes + */ + int num_planes; + /*! + * Log2 of width of the MI unit in pixels. mi_wide_l2[i] + * indicates the width of the MI unit in pixels for the ith plane + */ + int mi_wide_l2[MAX_MB_PLANE]; + /*! + * Log2 of height of the MI unit in pixels. mi_high_l2[i] + * indicates the height of the MI unit in pixels for the ith plane + */ + int mi_high_l2[MAX_MB_PLANE]; + /*! + * Subsampling in x direction. xdec[i] indicates the subsampling + * for the ith plane + */ + int xdec[MAX_MB_PLANE]; + /*! + * Subsampling in y direction. ydec[i] indicates the subsampling + * for the ith plane + */ + int ydec[MAX_MB_PLANE]; + /*! + * bsize[i] indicates the block size of ith plane + */ + int bsize[MAX_MB_PLANE]; + /*! + * Number of 64x64 blocks in vertical direction of a frame + */ + int nvfb; + /*! 
+ * Number of 64x64 blocks in horizontal direction of a frame + */ + int nhfb; + /*! + * Pointer to the mean squared error between the CDEF filtered block and the + * source block. mse[i][j][k] stores the MSE of the ith plane (i=0 corresponds + * to Y-plane, i=1 corresponds to U and V planes), jth block and kth strength + * index + */ + uint64_t (*mse[2])[TOTAL_STRENGTHS]; + /*! + * Holds the position (in units of mi's) of the cdef filtered + * block in raster scan order + */ + int *sb_index; + /*! + * Holds the count of cdef filtered blocks + */ + int sb_count; + /*! + * Indicates if 16bit frame buffers are to be used i.e., the content bit-depth + * is > 8-bit + */ + bool use_highbitdepth; +} CdefSearchCtx; + +static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col) { + const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64); + const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64); + const int stride = mi_params->mi_stride; + MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col; + for (int r = 0; r < maxr; ++r, mbmi += stride) { + for (int c = 0; c < maxc; ++c) { + if (!mbmi[c]->skip_txfm) return 0; + } + } + return 1; +} + +// Checks if cdef processing can be skipped for particular sb. +// Inputs: +// cdef_search_ctx: Pointer to the structure containing parameters related to +// CDEF search context. +// fbr: Row index in units of 64x64 block +// fbc: Column index in units of 64x64 block +// Returns: +// 1/0 will be returned to indicate skip/don't skip cdef processing of sb +// respectively. +static INLINE int cdef_sb_skip(const CommonModeInfoParams *const mi_params, + int fbr, int fbc) { + const MB_MODE_INFO *const mbmi = + mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc]; + // No filtering if the entire filter block is skipped. + if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) + return 1; + // Skip odd numbered 64x64 block rows(cols) when bsize is BLOCK_128X128, + // BLOCK_64X128(BLOCK_128X128, BLOCK_128X64) as for such blocks CDEF filtering + // is done at the corresponding block sizes. + if (((fbc & 1) && + (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) || + ((fbr & 1) && + (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128))) + return 1; + return 0; +} + +void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx); + +void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, + struct aom_internal_error_info *error_info, + int fbr, int fbc, int sb_count); +/*!\endcond */ + +/*!\brief AV1 CDEF parameter search + * + * \ingroup in_loop_cdef + * + * Searches for optimal CDEF parameters for frame + * + * \param[in,out] cpi Top level encoder structure + * + * \remark Nothing is returned. Instead, optimal CDEF parameters are stored + * in the \c cdef_info structure of type \ref CdefInfo inside \c cm: + * \arg \c cdef_bits: Bits of strength parameters + * \arg \c nb_cdef_strengths: Number of strength parameters + * \arg \c cdef_strengths: list of \c nb_cdef_strengths strength parameters + * for the luma plane. + * \arg \c uv_cdef_strengths: list of \c nb_cdef_strengths strength parameters + * for the chroma planes. + * \arg \c damping_factor: CDEF damping factor. + * + */ +void av1_cdef_search(struct AV1_COMP *cpi); + +/*!\brief AV1 CDEF level from QP + * + * \ingroup in_loop_cdef + * + * Calculates CDEF levels from frame QP. Only used for speed 7+ with RT mode. 
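cdef_sb_skip() above drops odd-indexed 64x64 filter blocks that are covered by a 128-wide or 128-tall coded block, since av1_cdef_mse_calc_block() already widens nhb/nvb and handles them from the even-indexed block. A toy restatement of that parity rule, using local types rather than libaom's BLOCK_SIZE enum:

#include <stdbool.h>
#include <stdio.h>

typedef enum { B64X64, B128X64, B64X128, B128X128 } ToyBsize;

static bool skip_64x64(ToyBsize bs, int fbr, int fbc) {
  if ((fbc & 1) && (bs == B128X128 || bs == B128X64)) return true;  /* wide block */
  if ((fbr & 1) && (bs == B128X128 || bs == B64X128)) return true;  /* tall block */
  return false;
}

int main(void) {
  /* For a 128x128 coded block only the (even row, even col) 64x64 unit is kept. */
  printf("%d %d %d %d\n", skip_64x64(B128X128, 0, 0), skip_64x64(B128X128, 0, 1),
         skip_64x64(B128X128, 1, 0), skip_64x64(B128X128, 1, 1));
  return 0;
}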
+ * + * \param[in,out] cm Pointer to top level common structure + * \param[in] skip_cdef Flag to skip CDEF filtering + * \param[in] is_screen_content Flag indicating screen content + * + */ +void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef, + int is_screen_content); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_ENCODER_PICKCDEF_H_ diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c new file mode 100644 index 0000000000..9084d3f13a --- /dev/null +++ b/third_party/aom/av1/encoder/picklpf.c @@ -0,0 +1,339 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/psnr.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/quant_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/picklpf.h" + +static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, int plane) { + switch (plane) { + case 0: aom_yv12_copy_y(src_bc, dst_bc); break; + case 1: aom_yv12_copy_u(src_bc, dst_bc); break; + case 2: aom_yv12_copy_v(src_bc, dst_bc); break; + default: assert(plane >= 0 && plane <= 2); break; + } +} + +int av1_get_max_filter_level(const AV1_COMP *cpi) { + if (is_stat_consumption_stage_twopass(cpi)) { + return cpi->ppi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; + } else { + return MAX_LOOP_FILTER; + } +} + +static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, + AV1_COMP *const cpi, int filt_level, + int partial_frame, int plane, int dir) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + int num_workers = mt_info->num_mod_workers[MOD_LPF]; + AV1_COMMON *const cm = &cpi->common; + int64_t filt_err; + + assert(plane >= 0 && plane <= 2); + int filter_level[2] = { filt_level, filt_level }; + if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1]; + if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0]; + + // set base filters for use of av1_get_filter_level when in DELTA_LF mode + switch (plane) { + case 0: + cm->lf.filter_level[0] = filter_level[0]; + cm->lf.filter_level[1] = filter_level[1]; + break; + case 1: cm->lf.filter_level_u = filter_level[0]; break; + case 2: cm->lf.filter_level_v = filter_level[0]; break; + } + + // lpf_opt_level = 1 : Enables dual/quad loop-filtering. 
+ int lpf_opt_level = is_inter_tx_size_search_level_one(&cpi->sf.tx_sf); + + av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame, mt_info->workers, + num_workers, &mt_info->lf_row_sync, lpf_opt_level); + + filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane, + cm->seq_params->use_highbitdepth); + + // Re-instate the unfiltered frame + yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane); + + return filt_err; +} + +static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + int partial_frame, + const int *last_frame_filter_level, int plane, + int dir) { + const AV1_COMMON *const cm = &cpi->common; + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + int filt_direction = 0; + int64_t best_err; + int filt_best; + + // Start the search at the previous frame filter level unless it is now out of + // range. + int lvl; + switch (plane) { + case 0: + switch (dir) { + case 2: + lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >> + 1; + break; + case 0: + case 1: lvl = last_frame_filter_level[dir]; break; + default: assert(dir >= 0 && dir <= 2); return 0; + } + break; + case 1: lvl = last_frame_filter_level[2]; break; + case 2: lvl = last_frame_filter_level[3]; break; + default: assert(plane >= 0 && plane <= 2); return 0; + } + int filt_mid = clamp(lvl, min_filter_level, max_filter_level); + int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; + // Sum squared error at each filter level + int64_t ss_err[MAX_LOOP_FILTER + 1]; + + const int use_coarse_search = cpi->sf.lpf_sf.use_coarse_filter_level_search; + assert(use_coarse_search <= 1); + static const int min_filter_step_lookup[2] = { 0, 2 }; + // min_filter_step_thesh determines the stopping criteria for the search. + // The search is terminated when filter_step equals min_filter_step_thesh. + const int min_filter_step_thesh = min_filter_step_lookup[use_coarse_search]; + + // Set each entry to -1 + memset(ss_err, 0xFF, sizeof(ss_err)); + yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane); + best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir); + filt_best = filt_mid; + ss_err[filt_mid] = best_err; + + while (filter_step > min_filter_step_thesh) { + const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level); + const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level); + + // Bias against raising loop filter in favor of lowering it. + int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + + if ((is_stat_consumption_stage_twopass(cpi)) && + (cpi->ppi->twopass.section_intra_rating < 20)) + bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20; + + // yx, bias less for large block size + if (cm->features.tx_mode != ONLY_4X4) bias >>= 1; + + if (filt_direction <= 0 && filt_low != filt_mid) { + // Get Low filter error score + if (ss_err[filt_low] < 0) { + ss_err[filt_low] = + try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir); + } + // If value is close to the best so far then bias towards a lower loop + // filter value. + if (ss_err[filt_low] < (best_err + bias)) { + // Was it actually better than the previous best? 
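search_filter_level() above runs a coarse-to-fine search around the previous frame's level: it probes one step below and above the current centre, recentres on the better candidate, and halves the step once the centre stays best, memoising per-level errors in ss_err (pre-filled with -1 by setting every byte to 0xFF). The standalone sketch below reproduces that pattern on a toy quadratic cost; it omits the bias term and the filt_direction shortcut of the real search, and try_filter_frame() is replaced by a local stand-in:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define MAX_LEVEL 63

static int64_t toy_cost(int level) {  /* stand-in for the filtered-frame SSE */
  const int64_t d = level - 23;
  return d * d + 1000;
}

int main(void) {
  int64_t cache[MAX_LEVEL + 1];
  memset(cache, 0xFF, sizeof(cache));  /* all bytes 0xFF reads back as -1 (int64_t) */
  int mid = 32, best = mid;
  int step = mid < 16 ? 4 : mid / 4;
  int64_t best_err = cache[mid] = toy_cost(mid);
  while (step > 0) {
    const int cand[2] = { mid - step < 0 ? 0 : mid - step,
                          mid + step > MAX_LEVEL ? MAX_LEVEL : mid + step };
    for (int i = 0; i < 2; ++i) {
      if (cache[cand[i]] < 0) cache[cand[i]] = toy_cost(cand[i]);  /* memoise */
      if (cache[cand[i]] < best_err) { best_err = cache[cand[i]]; best = cand[i]; }
    }
    if (best == mid) step /= 2;  /* centre still best: refine the step */
    else mid = best;             /* otherwise recentre and probe again */
  }
  printf("best level = %d\n", best);  /* converges to 23 for this toy cost */
  return 0;
}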
+ if (ss_err[filt_low] < best_err) { + best_err = ss_err[filt_low]; + } + filt_best = filt_low; + } + } + + // Now look at filt_high + if (filt_direction >= 0 && filt_high != filt_mid) { + if (ss_err[filt_high] < 0) { + ss_err[filt_high] = + try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir); + } + // If value is significantly better than previous best, bias added against + // raising filter value + if (ss_err[filt_high] < (best_err - bias)) { + best_err = ss_err[filt_high]; + filt_best = filt_high; + } + } + + // Half the step distance if the best filter value was the same as last time + if (filt_best == filt_mid) { + filter_step /= 2; + filt_direction = 0; + } else { + filt_direction = (filt_best < filt_mid) ? -1 : 1; + filt_mid = filt_best; + } + } + + return filt_best; +} + +void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + LPF_PICK_METHOD method) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + const int num_planes = av1_num_planes(cm); + struct loopfilter *const lf = &cm->lf; + int disable_filter_rt_screen = 0; + (void)sd; + + lf->sharpness_level = 0; + + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + cpi->sf.rt_sf.skip_lf_screen) + disable_filter_rt_screen = av1_cyclic_refresh_disable_lf_cdef(cpi); + + if (disable_filter_rt_screen || + cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_NONE || + (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_REFERENCE && + cpi->ppi->rtc_ref.non_reference_frame)) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + return; + } + + if (method == LPF_PICK_MINIMAL_LPF) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + } else if (method >= LPF_PICK_FROM_Q) { + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, + seq_params->bit_depth); + // based on tests result for rtc test set + // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point + const int strength_boost_q_treshold = 0; + int inter_frame_multiplier = + (q > strength_boost_q_treshold || + (cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->common.width * cpi->common.height > 352 * 288)) + ? 12034 + : 6017; + // Increase strength on base TL0 for temporal layers, for low-resoln, + // based on frame source_sad. + if (cpi->svc.number_temporal_layers > 1 && + cpi->svc.temporal_layer_id == 0 && + cpi->common.width * cpi->common.height <= 352 * 288 && + cpi->sf.rt_sf.use_nonrd_pick_mode) { + if (cpi->rc.frame_source_sad > 100000) + inter_frame_multiplier = inter_frame_multiplier << 1; + else if (cpi->rc.frame_source_sad > 50000) + inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1); + } + // These values were determined by linear fitting the result of the + // searched level for 8 bit depth: + // Keyframes: filt_guess = q * 0.06699 - 1.60817 + // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 + // + // And high bit depth separately: + // filt_guess = q * 0.316206 + 3.87252 + int filt_guess; + switch (seq_params->bit_depth) { + case AOM_BITS_8: + filt_guess = + (cm->current_frame.frame_type == KEY_FRAME) + ? 
ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) + : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18); + break; + case AOM_BITS_10: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); + break; + case AOM_BITS_12: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); + break; + default: + assert(0 && + "bit_depth should be AOM_BITS_8, AOM_BITS_10 " + "or AOM_BITS_12"); + return; + } + if (seq_params->bit_depth != AOM_BITS_8 && + cm->current_frame.frame_type == KEY_FRAME) + filt_guess -= 4; + // TODO(chengchen): retrain the model for Y, U, V filter levels + lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level); + if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY && + !frame_is_intra_only(cm) && !cpi->rc.high_source_sad) { + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + } else { + const int num4x4 = (cm->width >> 2) * (cm->height >> 2); + const int newmv_thresh = 7; + const int distance_since_key_thresh = 5; + if ((cpi->td.rd_counts.newmv_or_intra_blocks * 100 / num4x4) < + newmv_thresh && + cpi->rc.frames_since_key > distance_since_key_thresh) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + } + } + } + } else { + int last_frame_filter_level[4] = { 0 }; + if (!frame_is_intra_only(cm)) { + last_frame_filter_level[0] = cpi->ppi->filter_level[0]; + last_frame_filter_level[1] = cpi->ppi->filter_level[1]; + last_frame_filter_level[2] = cpi->ppi->filter_level_u; + last_frame_filter_level[3] = cpi->ppi->filter_level_v; + } + // The frame buffer last_frame_uf is used to store the non-loop filtered + // reconstructed frame in search_filter_level(). + if (aom_realloc_frame_buffer( + &cpi->last_frame_uf, cm->width, cm->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate last frame buffer"); + + lf->filter_level[0] = lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 0, 2); + if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) { + lf->filter_level[0] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 0, 0); + lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 0, 1); + } + + if (num_planes > 1) { + lf->filter_level_u = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 1, 0); + lf->filter_level_v = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, 2, 0); + } + } +} diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h new file mode 100644 index 0000000000..f567937c32 --- /dev/null +++ b/third_party/aom/av1/encoder/picklpf.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PICKLPF_H_ +#define AOM_AV1_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +struct yv12_buffer_config; +struct AV1_COMP; +int av1_get_max_filter_level(const AV1_COMP *cpi); + +/*!\brief Algorithm for AV1 loop filter level selection. + * + * \ingroup in_loop_filter + * This function determines proper filter levels used for in-loop filter + * (deblock filter). + * + * \param[in] sd The pointer of frame buffer + * \param[in] cpi Top-level encoder structure + * \param[in] method The method used to select filter levels + * + * \par + * method includes: + * \arg \c LPF_PICK_FROM_FULL_IMAGE: Try the full image with different values. + * \arg \c LPF_PICK_FROM_FULL_IMAGE_NON_DUAL: Try the full image filter search + * with non-dual filter only. + * \arg \c LPF_PICK_FROM_SUBIMAGE: Try a small portion of the image with + * different values. + * \arg \c LPF_PICK_FROM_Q: Estimate the level based on quantizer and frame type + * \arg \c LPF_PICK_MINIMAL_LPF: Pick 0 to disable LPF if LPF was enabled last + * frame + * + * \remark Nothing is returned. Instead, filter levels below are stored in the + * "loopfilter" structure inside "cpi": + * \arg \c filter_level[0]: the vertical filter level for Y plane + * \arg \c filter_level[1]: the horizontal filter level for Y plane + * \arg \c filter_level_u: the filter level for U plane + * \arg \c filter_level_v: the filter level for V plane + * + * \n + * \b Overview + * \par + * The workflow of deblock filter is shown in Fig.1. \n + * Boundary pixels pass through a non-flatness check, followed by a step that + * determines smoothness and selects proper types of filters + * (4-, 6-, 8-, 14-tap filter). \n + * If non-flatness criteria is not satisfied, the encoder will not apply + * deblock filtering on these boundary pixels. + * \image html filter_flow.png "Fig.1. The workflow of deblock filter" width=70% + * + * \par + * The non-flatness is determined by the boundary pixels and thresholds as shown + * in Fig.2. \n + * Filtering is applied when \n + * \f$|p_0-p_1| 700 ? 0.04590 : 0.02295 \n + * For 10 bit and 12 bit: \n + * filt_guess = q * 0.316206 + 3.87252 \n + * Then filter_level[0] = filter_level[1] = filter_level_u = filter_level_v = + * clamp(filt_guess, min_filter_level, max_filter_level) \n + * Where min_filter_level = 0, max_filter_level = 64 \n + * The equations were determined by linear fitting using filter levels + * generated by "LPF_PICK_FROM_FULL_IMAGE" method. + * + */ +void av1_pick_filter_level(const struct yv12_buffer_config *sd, + struct AV1_COMP *cpi, LPF_PICK_METHOD method); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PICKLPF_H_ diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c new file mode 100644 index 0000000000..6429064175 --- /dev/null +++ b/third_party/aom/av1/encoder/pickrst.c @@ -0,0 +1,2217 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_dsp/mathutils.h" +#include "aom_dsp/psnr.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/pickrst.h" + +// Number of Wiener iterations +#define NUM_WIENER_ITERS 5 + +// Penalty factor for use of dual sgr +#define DUAL_SGR_PENALTY_MULT 0.01 + +// Working precision for Wiener filter coefficients +#define WIENER_TAP_SCALE_FACTOR ((int64_t)1 << 16) + +#define SGRPROJ_EP_GRP1_START_IDX 0 +#define SGRPROJ_EP_GRP1_END_IDX 9 +#define SGRPROJ_EP_GRP1_SEARCH_COUNT 4 +#define SGRPROJ_EP_GRP2_3_SEARCH_COUNT 2 +static const int sgproj_ep_grp1_seed[SGRPROJ_EP_GRP1_SEARCH_COUNT] = { 0, 3, 6, + 9 }; +static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = { + { 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, -1, -1, -1, -1 }, + { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 } +}; + +#if DEBUG_LR_COSTING +RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE] + [MAX_LR_UNITS_W * MAX_LR_UNITS_H]; +#endif // DEBUG_LR_COSTING + +typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, + int hstart, int width, int vstart, + int height); +typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + int hstart, int width, int vstart, + int height); + +#if CONFIG_AV1_HIGHBITDEPTH +#define NUM_EXTRACTORS (3 * (1 + 1)) +#else +#define NUM_EXTRACTORS 3 +#endif +static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_sse_part, aom_get_u_sse_part, + aom_get_v_sse_part, +#if CONFIG_AV1_HIGHBITDEPTH + aom_highbd_get_y_sse_part, aom_highbd_get_u_sse_part, + aom_highbd_get_v_sse_part, +#endif +}; +static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_var, aom_get_u_var, aom_get_v_var, +#if CONFIG_AV1_HIGHBITDEPTH + aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var, +#endif +}; + +static int64_t sse_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *dst, int plane, + int highbd) { + return sse_part_extractors[3 * highbd + plane]( + src, dst, limits->h_start, limits->h_end - limits->h_start, + limits->v_start, limits->v_end - limits->v_start); +} + +static uint64_t var_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, int plane, + int highbd) { + return var_part_extractors[3 * highbd + plane]( + src, limits->h_start, limits->h_end - limits->h_start, limits->v_start, + limits->v_end - limits->v_start); +} + +typedef struct { + const YV12_BUFFER_CONFIG *src; + YV12_BUFFER_CONFIG *dst; + + const AV1_COMMON *cm; + const MACROBLOCK *x; + int plane; + int plane_w; + int plane_h; + 
RestUnitSearchInfo *rusi; + + // Speed features + const LOOP_FILTER_SPEED_FEATURES *lpf_sf; + + uint8_t *dgd_buffer; + int dgd_stride; + const uint8_t *src_buffer; + int src_stride; + + // SSE values for each restoration mode for the current RU + // These are saved by each search function for use in search_switchable() + int64_t sse[RESTORE_SWITCHABLE_TYPES]; + + // This flag will be set based on the speed feature + // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning. + uint8_t skip_sgr_eval; + + // Total rate and distortion so far for each restoration type + // These are initialised by reset_rsc in search_rest_type + int64_t total_sse[RESTORE_TYPES]; + int64_t total_bits[RESTORE_TYPES]; + + // Reference parameters for delta-coding + // + // For each restoration type, we need to store the latest parameter set which + // has been used, so that we can properly cost up the next parameter set. + // Note that we have two sets of these - one for the single-restoration-mode + // search (ie, frame_restoration_type = RESTORE_WIENER or RESTORE_SGRPROJ) + // and one for the switchable mode. This is because these two cases can lead + // to different sets of parameters being signaled, but we don't know which + // we will pick for sure until the end of the search process. + WienerInfo ref_wiener; + SgrprojInfo ref_sgrproj; + WienerInfo switchable_ref_wiener; + SgrprojInfo switchable_ref_sgrproj; + + // Buffers used to hold dgd-avg and src-avg data respectively during SIMD + // call of Wiener filter. + int16_t *dgd_avg; + int16_t *src_avg; +} RestSearchCtxt; + +static AOM_INLINE void rsc_on_tile(void *priv) { + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + set_default_wiener(&rsc->ref_wiener); + set_default_sgrproj(&rsc->ref_sgrproj); + set_default_wiener(&rsc->switchable_ref_wiener); + set_default_sgrproj(&rsc->switchable_ref_sgrproj); +} + +static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) { + memset(rsc->total_sse, 0, sizeof(rsc->total_sse)); + memset(rsc->total_bits, 0, sizeof(rsc->total_bits)); +} + +static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src, + const AV1_COMMON *cm, const MACROBLOCK *x, + const LOOP_FILTER_SPEED_FEATURES *lpf_sf, + int plane, RestUnitSearchInfo *rusi, + YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) { + rsc->src = src; + rsc->dst = dst; + rsc->cm = cm; + rsc->x = x; + rsc->plane = plane; + rsc->rusi = rusi; + rsc->lpf_sf = lpf_sf; + + const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf; + const int is_uv = plane != AOM_PLANE_Y; + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + assert(plane_w == src->crop_widths[is_uv]); + assert(plane_h == src->crop_heights[is_uv]); + assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]); + assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]); + + rsc->plane_w = plane_w; + rsc->plane_h = plane_h; + rsc->src_buffer = src->buffers[plane]; + rsc->src_stride = src->strides[is_uv]; + rsc->dgd_buffer = dgd->buffers[plane]; + rsc->dgd_stride = dgd->strides[is_uv]; +} + +static int64_t try_restoration_unit(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + const RestorationUnitInfo *rui) { + const AV1_COMMON *const cm = rsc->cm; + const int plane = rsc->plane; + const int is_uv = plane > 0; + const RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationLineBuffers rlbs; + const int bit_depth = cm->seq_params->bit_depth; + const int highbd = cm->seq_params->use_highbitdepth; + + const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf; + // 
TODO(yunqing): For now, only use optimized LR filter in decoder. Can be + // also used in encoder. + const int optimized_lr = 0; + + av1_loop_restoration_filter_unit( + limits, rui, &rsi->boundaries, &rlbs, rsc->plane_w, rsc->plane_h, + is_uv && cm->seq_params->subsampling_x, + is_uv && cm->seq_params->subsampling_y, highbd, bit_depth, + fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], + rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr, cm->error); + + return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd); +} + +int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int xq[2], + const sgr_params_type *params) { + int i, j; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + } + } else if (params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt1 += flt1_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t e = (int32_t)(dat[j]) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + + return err; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], + const sgr_params_type *params) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int i, j; + int64_t err = 0; + const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + if (params->r[0] > 0 && params->r[1] > 0) { + int xq0 = xq[0]; + int xq1 = xq[1]; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v0 = flt0[j] - u; + int32_t v1 = flt1[j] - u; + int32_t v = half; + v += xq0 * v0; + v += xq1 * v1; + const int32_t e = (v >> (SGRPROJ_RST_BITS + 
SGRPROJ_PRJ_BITS)) + d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + src += src_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + int exq; + int32_t *flt; + int flt_stride; + if (params->r[0] > 0) { + exq = xq[0]; + flt = flt0; + flt_stride = flt0_stride; + } else { + exq = xq[1]; + flt = flt1; + flt_stride = flt1_stride; + } + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v = half; + v += exq * (flt[j] - u); + const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + flt += flt_stride; + src += src_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t e = d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + return err; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int use_highbitdepth, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int *xqd, + const sgr_params_type *params) { + int xq[2]; + av1_decode_xq(xqd, xq, params); + +#if CONFIG_AV1_HIGHBITDEPTH + if (use_highbitdepth) { + return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); + + } else { + return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); + } +#else + (void)use_highbitdepth; + return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); +#endif +} + +#define USE_SGRPROJ_REFINEMENT_SEARCH 1 +static int64_t finer_search_pixel_proj_error( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd, + const sgr_params_type *params) { + int64_t err = get_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + (void)start_step; +#if USE_SGRPROJ_REFINEMENT_SEARCH + int64_t err2; + int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 }; + int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 }; + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = 0; p < 2; ++p) { + if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) { + continue; + } + int skip = 0; + do { + if (xqd[p] - s >= tap_min[p]) { + xqd[p] -= s; + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + if (err2 > err) { + xqd[p] += s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (xqd[p] + s <= tap_max[p]) { + xqd[p] += s; + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + if (err2 > err) { + xqd[p] -= s; + } else { + err = err2; + // At the 
highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + } +#endif // USE_SGRPROJ_REFINEMENT_SEARCH + return err; +} + +static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) { + if (dividend < 0) + return (dividend - divisor / 2) / divisor; + else + return (dividend + divisor / 2) / divisor; +} + +static AOM_INLINE void calc_proj_params_r0_r1_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + H[1][1] += (int64_t)f2 * f2; + H[0][1] += (int64_t)f1 * f2; + C[0] += (int64_t)f1 * s; + C[1] += (int64_t)f2 * s; + } + } + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + H[1][1] += (int64_t)f2 * f2; + H[0][1] += (int64_t)f1 * f2; + C[0] += (int64_t)f1 * s; + C[1] += (int64_t)f2 * s; + } + } + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int64_t H[2][2], + int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + C[0] += (int64_t)f1 * s; + } + } + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * 
dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + C[0] += (int64_t)f1 * s; + } + } + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], + int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[1][1] += (int64_t)f2 * f2; + C[1] += (int64_t)f2 * s; + } + } + H[1][1] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[1][1] += (int64_t)f2 * f2; + C[1] += (int64_t)f2 * s; + } + } + H[1][1] /= size; + C[1] /= size; +} + +// The function calls 3 subfunctions for the following cases : +// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements +// of C and H need to be computed. +// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
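The H and C accumulated by the helpers above are the normal equations of the least-squares fit s ~= x0*f1 + x1*f2, with all quantities taken as offsets from the unfiltered value u. The standalone example below solves the full two-radius case with Cramer's rule in double precision; it is only an illustration, while the fixed-point solve with SGRPROJ_PRJ_BITS scaling and overflow guards appears in get_proj_subspace() further below. The toy samples are chosen so that s equals f1 + f2 exactly, making both weights come out as 1.

#include <stdio.h>

int main(void) {
  /* Toy per-pixel samples of (f1, f2, s), already offset by u. */
  const double f1[4] = { 2, -1, 3, 0 };
  const double f2[4] = { 1, 1, -2, 4 };
  const double s[4] = { 3, 0, 1, 4 };
  double H00 = 0, H01 = 0, H11 = 0, C0 = 0, C1 = 0;
  for (int i = 0; i < 4; ++i) {
    H00 += f1[i] * f1[i];
    H11 += f2[i] * f2[i];
    H01 += f1[i] * f2[i];
    C0 += f1[i] * s[i];
    C1 += f2[i] * s[i];
  }
  const double det = H00 * H11 - H01 * H01;
  if (det != 0) {  /* otherwise the system is ill-posed, as handled above */
    const double x0 = (H11 * C0 - H01 * C1) / det;  /* weight on first filter */
    const double x1 = (H00 * C1 - H01 * C0) / det;  /* weight on second filter */
    printf("x0=%f x1=%f\n", x0, x1);  /* both 1.0 for these samples */
  }
  return 0;
}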
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride, + flt1, flt1_stride, H, C); + } +} + +void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], + int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int *xq, + const sgr_params_type *params) { + int64_t H[2][2] = { { 0, 0 }, { 0, 0 } }; + int64_t C[2] = { 0, 0 }; + + // Default values to be returned if the problem becomes ill-posed + xq[0] = 0; + xq[1] = 0; + + if (!use_highbitdepth) { + if ((width & 0x7) == 0) { + av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C, params); + } else { + av1_calc_proj_params_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C, + params); + } + } +#if CONFIG_AV1_HIGHBITDEPTH + else { // NOLINT + if ((width & 0x7) == 0) { + av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C, params); + } else { + av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C, params); + } + } +#endif + + if (params->r[0] == 0) { + // H matrix is now only the scalar H[1][1] + // C vector is now only the scalar C[1] + const int64_t Det = H[1][1]; + if (Det == 0) return; // ill-posed, return default values + xq[0] = 0; + xq[1] = (int)signed_rounded_divide(C[1] * (1 << SGRPROJ_PRJ_BITS), Det); + } else if (params->r[1] == 0) { + // H matrix is now only the scalar H[0][0] + // C vector is now only the scalar C[0] + const int64_t Det = H[0][0]; + if (Det == 0) return; // ill-posed, return default values + xq[0] = (int)signed_rounded_divide(C[0] * (1 << SGRPROJ_PRJ_BITS), Det); + xq[1] = 0; + } else { + const int64_t Det = H[0][0] * H[1][1] - H[0][1] * H[1][0]; + if (Det == 0) return; // ill-posed, return default values + + // If scaling up dividend would overflow, instead scale down the divisor + const int64_t div1 = H[1][1] * C[0] - H[0][1] * C[1]; + if ((div1 > 0 && INT64_MAX / (1 << 
SGRPROJ_PRJ_BITS) < div1) || + (div1 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div1)) + xq[0] = (int)signed_rounded_divide(div1, Det / (1 << SGRPROJ_PRJ_BITS)); + else + xq[0] = (int)signed_rounded_divide(div1 * (1 << SGRPROJ_PRJ_BITS), Det); + + const int64_t div2 = H[0][0] * C[1] - H[1][0] * C[0]; + if ((div2 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div2) || + (div2 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div2)) + xq[1] = (int)signed_rounded_divide(div2, Det / (1 << SGRPROJ_PRJ_BITS)); + else + xq[1] = (int)signed_rounded_divide(div2 * (1 << SGRPROJ_PRJ_BITS), Det); + } +} + +static AOM_INLINE void encode_xq(int *xq, int *xqd, + const sgr_params_type *params) { + if (params->r[0] == 0) { + xqd[0] = 0; + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else if (params->r[1] == 0) { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } +} + +// Apply the self-guided filter across an entire restoration unit. +static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8, + int width, int height, int dat_stride, + int use_highbd, int bit_depth, int pu_width, + int pu_height, int32_t *flt0, int32_t *flt1, + int flt_stride, + struct aom_internal_error_info *error_info) { + for (int i = 0; i < height; i += pu_height) { + const int h = AOMMIN(pu_height, height - i); + int32_t *flt0_row = flt0 + i * flt_stride; + int32_t *flt1_row = flt1 + i * flt_stride; + const uint8_t *dat8_row = dat8 + i * dat_stride; + + // Iterate over the stripe in blocks of width pu_width + for (int j = 0; j < width; j += pu_width) { + const int w = AOMMIN(pu_width, width - j); + if (av1_selfguided_restoration( + dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j, + flt_stride, sgr_params_idx, bit_depth, use_highbd) != 0) { + aom_internal_error( + error_info, AOM_CODEC_MEM_ERROR, + "Error allocating buffer in av1_selfguided_restoration"); + } + } + } +} + +static AOM_INLINE void compute_sgrproj_err( + const uint8_t *dat8, const int width, const int height, + const int dat_stride, const uint8_t *src8, const int src_stride, + const int use_highbitdepth, const int bit_depth, const int pu_width, + const int pu_height, const int ep, int32_t *flt0, int32_t *flt1, + const int flt_stride, int *exqd, int64_t *err, + struct aom_internal_error_info *error_info) { + int exq[2]; + apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth, + pu_width, pu_height, flt0, flt1, flt_stride, error_info); + const sgr_params_type *const params = &av1_sgr_params[ep]; + get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride, + use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq, + params); + encode_xq(exq, exqd, params); + *err = finer_search_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt_stride, flt1, flt_stride, 2, exqd, params); +} + +static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err, + const int *exqd, int *bestxqd, + int *bestep, const int ep) { + if (*besterr == -1 || err < *besterr) { + *bestep = ep; + *besterr = err; + bestxqd[0] = exqd[0]; + bestxqd[1] = exqd[1]; + } +} + +static SgrprojInfo search_selfguided_restoration( + const uint8_t *dat8, int width, int height, int dat_stride, + 
const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth, + int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning, + struct aom_internal_error_info *error_info) { + int32_t *flt0 = rstbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + int ep, idx, bestep = 0; + int64_t besterr = -1; + int exqd[2], bestxqd[2] = { 0, 0 }; + int flt_stride = ((width + 7) & ~7) + 8; + assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) || + pu_width == RESTORATION_PROC_UNIT_SIZE); + assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) || + pu_height == RESTORATION_PROC_UNIT_SIZE); + if (!enable_sgr_ep_pruning) { + for (ep = 0; ep < SGRPROJ_PARAMS; ep++) { + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err, error_info); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + } else { + // evaluate first four seed ep in first group + for (idx = 0; idx < SGRPROJ_EP_GRP1_SEARCH_COUNT; idx++) { + ep = sgproj_ep_grp1_seed[idx]; + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err, error_info); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + // evaluate left and right ep of winner in seed ep + int bestep_ref = bestep; + for (ep = bestep_ref - 1; ep < bestep_ref + 2; ep += 2) { + if (ep < SGRPROJ_EP_GRP1_START_IDX || ep > SGRPROJ_EP_GRP1_END_IDX) + continue; + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err, error_info); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + // evaluate last two group + for (idx = 0; idx < SGRPROJ_EP_GRP2_3_SEARCH_COUNT; idx++) { + ep = sgproj_ep_grp2_3[idx][bestep]; + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err, error_info); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + } + + SgrprojInfo ret; + ret.ep = bestep; + ret.xqd[0] = bestxqd[0]; + ret.xqd[1] = bestxqd[1]; + return ret; +} + +static int count_sgrproj_bits(SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info) { + int bits = SGRPROJ_PARAMS_BITS; + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + if (params->r[0] > 0) + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + if (params->r[1] > 0) + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + return bits; +} + +static AOM_INLINE void search_sgrproj( + const RestorationTileLimits *limits, int rest_unit_idx, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info) { + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + const AV1_COMMON *const cm = rsc->cm; + const int highbd = cm->seq_params->use_highbitdepth; + const int bit_depth = cm->seq_params->bit_depth; + + const int64_t bits_none = 
x->mode_costs.sgrproj_restore_cost[0]; + // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set + if (rsc->skip_sgr_eval) { + rsc->total_bits[RESTORE_SGRPROJ] += bits_none; + rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE; + rsc->sse[RESTORE_SGRPROJ] = INT64_MAX; + return; + } + + uint8_t *dgd_start = + rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start; + const uint8_t *src_start = + rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start; + + const int is_uv = rsc->plane > 0; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; + const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + + rusi->sgrproj = search_selfguided_restoration( + dgd_start, limits->h_end - limits->h_start, + limits->v_end - limits->v_start, rsc->dgd_stride, src_start, + rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height, + tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning, error_info); + + RestorationUnitInfo rui; + rui.restoration_type = RESTORE_SGRPROJ; + rui.sgrproj_info = rusi->sgrproj; + + rsc->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, &rui); + + const int64_t bits_sgr = + x->mode_costs.sgrproj_restore_cost[1] + + (count_sgrproj_bits(&rusi->sgrproj, &rsc->ref_sgrproj) + << AV1_PROB_COST_SHIFT); + double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], bit_depth); + double cost_sgr = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits_sgr >> 4, rsc->sse[RESTORE_SGRPROJ], bit_depth); + if (rusi->sgrproj.ep < 10) + cost_sgr *= + (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level); + + RestorationType rtype = + (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype; + +#if DEBUG_LR_COSTING + // Store ref params for later checking + lr_ref_params[RESTORE_SGRPROJ][rsc->plane][rest_unit_idx].sgrproj_info = + rsc->ref_sgrproj; +#endif // DEBUG_LR_COSTING + + rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[rtype]; + rsc->total_bits[RESTORE_SGRPROJ] += + (cost_sgr < cost_none) ? bits_sgr : bits_none; + if (cost_sgr < cost_none) rsc->ref_sgrproj = rusi->sgrproj; +} + +static void acc_stat_one_line(const uint8_t *dgd, const uint8_t *src, + int dgd_stride, int h_start, int h_end, + uint8_t avg, const int wiener_halfwin, + const int wiener_win2, int32_t *M_int32, + int32_t *H_int32, int count) { + int j, k, l; + int16_t Y[WIENER_WIN2]; + + for (j = h_start; j < h_end; j++) { + const int16_t X = (int16_t)src[j] - (int16_t)avg; + int idx = 0; + for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { + for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { + Y[idx] = + (int16_t)dgd[(count + l) * dgd_stride + (j + k)] - (int16_t)avg; + idx++; + } + } + assert(idx == wiener_win2); + for (k = 0; k < wiener_win2; ++k) { + M_int32[k] += (int32_t)Y[k] * X; + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. 
+ H_int32[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l]; + } + } + } +} + +void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, + int16_t *dgd_avg, int16_t *src_avg, int h_start, + int h_end, int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + (void)dgd_avg; + (void)src_avg; + int i, k, l; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + int32_t M_row[WIENER_WIN2] = { 0 }; + int32_t H_row[WIENER_WIN2 * WIENER_WIN2] = { 0 }; + int downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + + memset(M, 0, sizeof(*M) * wiener_win2); + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + + for (i = v_start; i < v_end; i = i + downsample_factor) { + if (use_downsampled_wiener_stats && + (v_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) { + downsample_factor = v_end - i; + } + + memset(M_row, 0, sizeof(int32_t) * WIENER_WIN2); + memset(H_row, 0, sizeof(int32_t) * WIENER_WIN2 * WIENER_WIN2); + acc_stat_one_line(dgd, src + i * src_stride, dgd_stride, h_start, h_end, + avg, wiener_halfwin, wiener_win2, M_row, H_row, i); + + for (k = 0; k < wiener_win2; ++k) { + // Scale M matrix based on the downsampling factor + M[k] += ((int64_t)M_row[k] * downsample_factor); + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. + // Scale H Matrix based on the downsampling factor + H[k * wiener_win2 + l] += + ((int64_t)H_row[k * wiener_win2 + l] * downsample_factor); + } + } + } + + for (k = 0; k < wiener_win2; ++k) { + for (l = k + 1; l < wiener_win2; ++l) { + H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + int i, j, k, l; + int32_t Y[WIENER_WIN2]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + memset(M, 0, sizeof(*M) * wiener_win2); + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + for (i = v_start; i < v_end; i++) { + for (j = h_start; j < h_end; j++) { + const int32_t X = (int32_t)src[i * src_stride + j] - (int32_t)avg; + int idx = 0; + for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { + for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { + Y[idx] = (int32_t)dgd[(i + l) * dgd_stride + (j + k)] - (int32_t)avg; + idx++; + } + } + assert(idx == wiener_win2); + for (k = 0; k < wiener_win2; ++k) { + M[k] += (int64_t)Y[k] * X; + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. 
+ H[k * wiener_win2 + l] += (int64_t)Y[k] * Y[l]; + } + } + } + } + for (k = 0; k < wiener_win2; ++k) { + M[k] /= bit_depth_divider; + H[k * wiener_win2 + k] /= bit_depth_divider; + for (l = k + 1; l < wiener_win2; ++l) { + H[k * wiener_win2 + l] /= bit_depth_divider; + H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE int wrap_index(int i, int wiener_win) { + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i); +} + +// Solve linear equations to find Wiener filter tap values +// Taps are output scaled by WIENER_FILT_STEP +static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b, + int64_t *x) { + for (int k = 0; k < n - 1; k++) { + // Partial pivoting: bring the row with the largest pivot to the top + for (int i = n - 1; i > k; i--) { + // If row i has a better (bigger) pivot than row (i-1), swap them + if (llabs(A[(i - 1) * stride + k]) < llabs(A[i * stride + k])) { + for (int j = 0; j < n; j++) { + const int64_t c = A[i * stride + j]; + A[i * stride + j] = A[(i - 1) * stride + j]; + A[(i - 1) * stride + j] = c; + } + const int64_t c = b[i]; + b[i] = b[i - 1]; + b[i - 1] = c; + } + } + + // b/278065963: The multiplies + // c / 256 * A[k * stride + j] / cd * 256 + // and + // c / 256 * b[k] / cd * 256 + // within Gaussian elimination can cause a signed integer overflow. Rework + // the multiplies so that larger scaling is used without significantly + // impacting the overall precision. + // + // Precision guidance: + // scale_threshold: Pick as high as possible. + // For max_abs_akj >= scale_threshold scenario: + // scaler_A: Pick as low as possible. Needed for A[(i + 1) * stride + j]. + // scaler_c: Pick as low as possible while maintaining scaler_c >= + // (1 << 7). Needed for A[(i + 1) * stride + j] and b[i + 1]. + int64_t max_abs_akj = 0; + for (int j = 0; j < n; j++) { + const int64_t abs_akj = llabs(A[k * stride + j]); + if (abs_akj > max_abs_akj) max_abs_akj = abs_akj; + } + const int scale_threshold = 1 << 22; + const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 5); + const int scaler_c = max_abs_akj < scale_threshold ? 1 : (1 << 7); + const int scaler = scaler_c * scaler_A; + + // Forward elimination (convert A to row-echelon form) + for (int i = k; i < n - 1; i++) { + if (A[k * stride + k] == 0) return 0; + const int64_t c = A[(i + 1) * stride + k] / scaler_c; + const int64_t cd = A[k * stride + k]; + for (int j = 0; j < n; j++) { + A[(i + 1) * stride + j] -= + A[k * stride + j] / scaler_A * c / cd * scaler; + } + b[i + 1] -= c * b[k] / cd * scaler_c; + } + } + // Back-substitution + for (int i = n - 1; i >= 0; i--) { + if (A[i * stride + i] == 0) return 0; + int64_t c = 0; + for (int j = i + 1; j <= n - 1; j++) { + c += A[i * stride + j] * x[j] / WIENER_TAP_SCALE_FACTOR; + } + // Store filter taps x in scaled form. 
+ x[i] = WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i]; + } + + return 1; +} + +// Fix vector b, update vector a +static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc, + int64_t **Hc, int32_t *a, int32_t *b) { + int i, j; + int64_t S[WIENER_WIN]; + int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; ++j) { + const int jj = wrap_index(j, wiener_win); + A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR; + } + } + + // b/274668506: This is the dual branch for the issue in b/272139363. The fix + // is similar. See comments in update_b_sep_sym() below. + int32_t max_b_l = 0; + for (int l = 0; l < wiener_win; ++l) { + const int32_t abs_b_l = abs(b[l]); + if (abs_b_l > max_b_l) max_b_l = abs_b_l; + } + const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR; + const int scaler = max_b_l < scale_threshold ? 1 : 4; + + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; j++) { + int k, l; + for (k = 0; k < wiener_win; ++k) { + const int kk = wrap_index(k, wiener_win); + for (l = 0; l < wiener_win; ++l) { + const int ll = wrap_index(l, wiener_win); + B[ll * wiener_halfwin1 + kk] += + Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] / + (scaler * WIENER_TAP_SCALE_FACTOR) * b[j] / + (WIENER_TAP_SCALE_FACTOR / scaler); + } + } + } + } + // Normalization enforcement in the system of equations itself + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + A[i] -= + A[wiener_halfwin1 - 1] * 2 + + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; + } + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + for (j = 0; j < wiener_halfwin1 - 1; ++j) { + B[i * wiener_halfwin1 + j] -= + 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + + (wiener_halfwin1 - 1)]); + } + } + if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { + S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; + for (i = wiener_halfwin1; i < wiener_win; ++i) { + S[i] = S[wiener_win - 1 - i]; + S[wiener_halfwin1 - 1] -= 2 * S[i]; + } + for (i = 0; i < wiener_win; ++i) { + a[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)), + (1 << (WIENER_FILT_BITS - 1)) - 1); + } + } +} + +// Fix vector a, update vector b +static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc, + int64_t **Hc, int32_t *a, int32_t *b) { + int i, j; + int64_t S[WIENER_WIN]; + int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < wiener_win; i++) { + const int ii = wrap_index(i, wiener_win); + for (j = 0; j < wiener_win; j++) { + A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR; + } + } + + // b/272139363: The computation, + // Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] / + // WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR; + // may generate a signed-integer-overflow. Conditionally scale the terms to + // avoid a potential overflow. + // + // Hc contains accumulated correlation statistics and it is desired to leave + // as much room as possible for Hc. 
It was experimentally observed that the + // primary issue manifests itself with the second, a[l], multiply. For + // max_a_l < WIENER_TAP_SCALE_FACTOR the first multiply with a[k] should not + // increase dynamic range and the second multiply should hence be safe. + // Thereafter a safe scale_threshold depends on the actual operational range + // of Hc. The largest scale_threshold is expected to depend on bit-depth + // (av1_compute_stats_highbd_c() scales highbd to 8-bit) and maximum + // restoration-unit size (256), leading up to 32-bit positive numbers in Hc. + // Noting that the caller, wiener_decompose_sep_sym(), initializes a[...] + // to a range smaller than 16 bits, the scale_threshold is set as below for + // convenience. + int32_t max_a_l = 0; + for (int l = 0; l < wiener_win; ++l) { + const int32_t abs_a_l = abs(a[l]); + if (abs_a_l > max_a_l) max_a_l = abs_a_l; + } + const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR; + const int scaler = max_a_l < scale_threshold ? 1 : 4; + + for (i = 0; i < wiener_win; i++) { + const int ii = wrap_index(i, wiener_win); + for (j = 0; j < wiener_win; j++) { + const int jj = wrap_index(j, wiener_win); + int k, l; + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + B[jj * wiener_halfwin1 + ii] += + Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] / + (scaler * WIENER_TAP_SCALE_FACTOR) * a[l] / + (WIENER_TAP_SCALE_FACTOR / scaler); + } + } + } + } + // Normalization enforcement in the system of equations itself + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + A[i] -= + A[wiener_halfwin1 - 1] * 2 + + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; + } + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + for (j = 0; j < wiener_halfwin1 - 1; ++j) { + B[i * wiener_halfwin1 + j] -= + 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + + (wiener_halfwin1 - 1)]); + } + } + if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { + S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; + for (i = wiener_halfwin1; i < wiener_win; ++i) { + S[i] = S[wiener_win - 1 - i]; + S[wiener_halfwin1 - 1] -= 2 * S[i]; + } + for (i = 0; i < wiener_win; ++i) { + b[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)), + (1 << (WIENER_FILT_BITS - 1)) - 1); + } + } +} + +static void wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H, + int32_t *a, int32_t *b) { + static const int32_t init_filt[WIENER_WIN] = { + WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV, + WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV, + WIENER_FILT_TAP0_MIDV, + }; + int64_t *Hc[WIENER_WIN2]; + int64_t *Mc[WIENER_WIN]; + int i, j, iter; + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + const int wiener_win2 = wiener_win * wiener_win; + for (i = 0; i < wiener_win; i++) { + a[i] = b[i] = + WIENER_TAP_SCALE_FACTOR / WIENER_FILT_STEP * init_filt[i + plane_off]; + } + for (i = 0; i < wiener_win; i++) { + Mc[i] = M + i * wiener_win; + for (j = 0; j < wiener_win; j++) { + Hc[i * wiener_win + j] = + H + i * wiener_win * wiener_win2 + j * wiener_win; + } + } + + iter = 1; + while (iter < NUM_WIENER_ITERS) { + update_a_sep_sym(wiener_win, Mc, Hc, a, b); + update_b_sep_sym(wiener_win, Mc, Hc, a, b); + iter++; + } +} + +// Computes the function x'*H*x - x'*M for the learned 2D filter x, and compares +// against identity filters; Final score 
is defined as the difference between +// the function values +static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H, + InterpKernel vfilt, InterpKernel hfilt) { + int32_t ab[WIENER_WIN * WIENER_WIN]; + int16_t a[WIENER_WIN], b[WIENER_WIN]; + int64_t P = 0, Q = 0; + int64_t iP = 0, iQ = 0; + int64_t Score, iScore; + int i, k, l; + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + const int wiener_win2 = wiener_win * wiener_win; + + a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP; + for (i = 0; i < WIENER_HALFWIN; ++i) { + a[i] = a[WIENER_WIN - i - 1] = vfilt[i]; + b[i] = b[WIENER_WIN - i - 1] = hfilt[i]; + a[WIENER_HALFWIN] -= 2 * a[i]; + b[WIENER_HALFWIN] -= 2 * b[i]; + } + memset(ab, 0, sizeof(ab)); + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) + ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off]; + } + for (k = 0; k < wiener_win2; ++k) { + P += ab[k] * M[k] / WIENER_FILT_STEP / WIENER_FILT_STEP; + for (l = 0; l < wiener_win2; ++l) { + Q += ab[k] * H[k * wiener_win2 + l] * ab[l] / WIENER_FILT_STEP / + WIENER_FILT_STEP / WIENER_FILT_STEP / WIENER_FILT_STEP; + } + } + Score = Q - 2 * P; + + iP = M[wiener_win2 >> 1]; + iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)]; + iScore = iQ - 2 * iP; + + return Score - iScore; +} + +static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f, + InterpKernel fi) { + int i; + const int wiener_halfwin = (wiener_win >> 1); + + for (i = 0; i < wiener_halfwin; ++i) { + const int64_t dividend = (int64_t)f[i] * WIENER_FILT_STEP; + const int64_t divisor = WIENER_TAP_SCALE_FACTOR; + // Perform this division with proper rounding rather than truncation + if (dividend < 0) { + fi[i] = (int16_t)((dividend - (divisor / 2)) / divisor); + } else { + fi[i] = (int16_t)((dividend + (divisor / 2)) / divisor); + } + } + // Specialize for 7-tap filter + if (wiener_win == WIENER_WIN) { + fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV); + fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); + fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); + } else { + fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); + fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); + fi[0] = 0; + } + // Satisfy filter constraints + fi[WIENER_WIN - 1] = fi[0]; + fi[WIENER_WIN - 2] = fi[1]; + fi[WIENER_WIN - 3] = fi[2]; + // The central element has an implicit +WIENER_FILT_STEP + fi[3] = -2 * (fi[0] + fi[1] + fi[2]); +} + +static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info, + WienerInfo *ref_wiener_info) { + int bits = 0; + if (wiener_win == WIENER_WIN) + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + if (wiener_win == WIENER_WIN) + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + 
ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + return bits; +} + +static int64_t finer_search_wiener(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + RestorationUnitInfo *rui, int wiener_win) { + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + int64_t err = try_restoration_unit(rsc, limits, rui); + + if (rsc->lpf_sf->disable_wiener_coeff_refine_search) return err; + + // Refinement search around the wiener filter coefficients. + int64_t err2; + int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV, + WIENER_FILT_TAP2_MINV }; + int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV, + WIENER_FILT_TAP2_MAXV }; + + WienerInfo *plane_wiener = &rui->wiener_info; + + // printf("err pre = %"PRId64"\n", err); + const int start_step = 4; + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = plane_off; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (plane_wiener->hfilter[p] - s >= tap_min[p]) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, rui); + if (err2 > err) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (plane_wiener->hfilter[p] + s <= tap_max[p]) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, rui); + if (err2 > err) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + for (int p = plane_off; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (plane_wiener->vfilter[p] - s >= tap_min[p]) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, rui); + if (err2 > err) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (plane_wiener->vfilter[p] + s <= tap_max[p]) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, rui); + if (err2 > err) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] 
-= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + } + // printf("err post = %"PRId64"\n", err); + return err; +} + +static AOM_INLINE void search_wiener( + const RestorationTileLimits *limits, int rest_unit_idx, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info) { + (void)tmpbuf; + (void)rlbs; + (void)error_info; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + const int64_t bits_none = x->mode_costs.wiener_restore_cost[0]; + + // Skip Wiener search for low variance contents + if (rsc->lpf_sf->prune_wiener_based_on_src_var) { + const int scale[3] = { 0, 1, 2 }; + // Obtain the normalized Qscale + const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0, + rsc->cm->seq_params->bit_depth) >> + 3; + // Derive threshold as sqr(normalized Qscale) * scale / 16, + const uint64_t thresh = + (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4; + const int highbd = rsc->cm->seq_params->use_highbitdepth; + const uint64_t src_var = + var_restoration_unit(limits, rsc->src, rsc->plane, highbd); + // Do not perform Wiener search if source variance is lower than threshold + // or if the reconstruction error is zero + int prune_wiener = (src_var < thresh) || (rsc->sse[RESTORE_NONE] == 0); + if (prune_wiener) { + rsc->total_bits[RESTORE_WIENER] += bits_none; + rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rsc->sse[RESTORE_WIENER] = INT64_MAX; + if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1; + return; + } + } + + const int wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + + int reduced_wiener_win = wiener_win; + if (rsc->lpf_sf->reduce_wiener_window_size) { + reduced_wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA; + } + + int64_t M[WIENER_WIN2]; + int64_t H[WIENER_WIN2 * WIENER_WIN2]; + int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN]; + +#if CONFIG_AV1_HIGHBITDEPTH + const AV1_COMMON *const cm = rsc->cm; + if (cm->seq_params->use_highbitdepth) { + // TODO(any) : Add support for use_downsampled_wiener_stats SF in HBD + // functions. Optimize intrinsics of HBD design similar to LBD (i.e., + // pre-calculate d and s buffers and avoid most of the C operations). 
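+ // Whichever variant is used, M and H are (roughly) the Wiener normal + // equations for this unit: H accumulates the autocorrelation of the degraded + // pixel neighbourhoods and M their cross-correlation with the source, so an + // unconstrained filter would satisfy H * w = M. wiener_decompose_sep_sym() + // then approximates this solution with alternating separable updates.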
+ av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, + rsc->src_buffer, limits->h_start, limits->h_end, + limits->v_start, limits->v_end, rsc->dgd_stride, + rsc->src_stride, M, H, cm->seq_params->bit_depth); + } else { + av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, + rsc->dgd_avg, rsc->src_avg, limits->h_start, + limits->h_end, limits->v_start, limits->v_end, + rsc->dgd_stride, rsc->src_stride, M, H, + rsc->lpf_sf->use_downsampled_wiener_stats); + } +#else + av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, + rsc->dgd_avg, rsc->src_avg, limits->h_start, limits->h_end, + limits->v_start, limits->v_end, rsc->dgd_stride, + rsc->src_stride, M, H, + rsc->lpf_sf->use_downsampled_wiener_stats); +#endif + + wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter); + + RestorationUnitInfo rui; + memset(&rui, 0, sizeof(rui)); + rui.restoration_type = RESTORE_WIENER; + finalize_sym_filter(reduced_wiener_win, vfilter, rui.wiener_info.vfilter); + finalize_sym_filter(reduced_wiener_win, hfilter, rui.wiener_info.hfilter); + + // Filter score computes the value of the function x'*H*x - x'*M for the + // learned filter and compares it against the identity filter. If there is no + // reduction in the function, the filter is reverted to the identity filter. + if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter, + rui.wiener_info.hfilter) > 0) { + rsc->total_bits[RESTORE_WIENER] += bits_none; + rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rsc->sse[RESTORE_WIENER] = INT64_MAX; + if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1; + return; + } + + rsc->sse[RESTORE_WIENER] = + finer_search_wiener(rsc, limits, &rui, reduced_wiener_win); + rusi->wiener = rui.wiener_info; + + if (reduced_wiener_win != WIENER_WIN) { + assert(rui.wiener_info.vfilter[0] == 0 && + rui.wiener_info.vfilter[WIENER_WIN - 1] == 0); + assert(rui.wiener_info.hfilter[0] == 0 && + rui.wiener_info.hfilter[WIENER_WIN - 1] == 0); + } + + const int64_t bits_wiener = + x->mode_costs.wiener_restore_cost[1] + + (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->ref_wiener) + << AV1_PROB_COST_SHIFT); + + double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], + rsc->cm->seq_params->bit_depth); + double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits_wiener >> 4, rsc->sse[RESTORE_WIENER], + rsc->cm->seq_params->bit_depth); + + RestorationType rtype = + (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE; + rusi->best_rtype[RESTORE_WIENER - 1] = rtype; + + // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and + // RESTORE_NONE or based on best_rtype + if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) { + rsc->skip_sgr_eval = cost_wiener > (1.01 * cost_none); + } else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) { + rsc->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE; + } + +#if DEBUG_LR_COSTING + // Store ref params for later checking + lr_ref_params[RESTORE_WIENER][rsc->plane][rest_unit_idx].wiener_info = + rsc->ref_wiener; +#endif // DEBUG_LR_COSTING + + rsc->total_sse[RESTORE_WIENER] += rsc->sse[rtype]; + rsc->total_bits[RESTORE_WIENER] += + (cost_wiener < cost_none) ?
bits_wiener : bits_none; + if (cost_wiener < cost_none) rsc->ref_wiener = rusi->wiener; +} + +static AOM_INLINE void search_norestore( + const RestorationTileLimits *limits, int rest_unit_idx, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info) { + (void)rest_unit_idx; + (void)tmpbuf; + (void)rlbs; + (void)error_info; + + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + + const int highbd = rsc->cm->seq_params->use_highbitdepth; + rsc->sse[RESTORE_NONE] = sse_restoration_unit( + limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd); + + rsc->total_sse[RESTORE_NONE] += rsc->sse[RESTORE_NONE]; +} + +static AOM_INLINE void search_switchable( + const RestorationTileLimits *limits, int rest_unit_idx, void *priv, + int32_t *tmpbuf, RestorationLineBuffers *rlbs, + struct aom_internal_error_info *error_info) { + (void)limits; + (void)tmpbuf; + (void)rlbs; + (void)error_info; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + + const int wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + + double best_cost = 0; + int64_t best_bits = 0; + RestorationType best_rtype = RESTORE_NONE; + + for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { + // If this restoration mode was skipped, or could not find a solution + // that was better than RESTORE_NONE, then we can't select it here either. + // + // Note: It is possible for the restoration search functions to find a + // filter which is better than RESTORE_NONE when looking purely at SSE, but + // for it to be rejected overall due to its rate cost. In this case, there + // is a chance that it may have a lower rate cost when looking at + // RESTORE_SWITCHABLE, and so it might be acceptable here. + // + // Therefore we prune based on SSE, rather than on whether or not the + // previous search function selected this mode.
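+ // + // For example (illustrative numbers): a Wiener filter with SSE 9000 against + // 10000 for RESTORE_NONE may have lost its own search purely on coefficient + // rate, yet still come out cheapest under the switchable signalling costed + // below, so it is kept as a candidate; only filters whose SSE is worse than + // RESTORE_NONE are skipped.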
+ if (r > RESTORE_NONE) { + if (rsc->sse[r] > rsc->sse[RESTORE_NONE]) continue; + } + + const int64_t sse = rsc->sse[r]; + int64_t coeff_pcost = 0; + switch (r) { + case RESTORE_NONE: coeff_pcost = 0; break; + case RESTORE_WIENER: + coeff_pcost = count_wiener_bits(wiener_win, &rusi->wiener, + &rsc->switchable_ref_wiener); + break; + case RESTORE_SGRPROJ: + coeff_pcost = + count_sgrproj_bits(&rusi->sgrproj, &rsc->switchable_ref_sgrproj); + break; + default: assert(0); break; + } + const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT; + const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits; + double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits >> 4, sse, rsc->cm->seq_params->bit_depth); + if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10) + cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level); + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_bits = bits; + best_rtype = r; + } + } + + rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype; + +#if DEBUG_LR_COSTING + // Store ref params for later checking + lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].wiener_info = + rsc->switchable_ref_wiener; + lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].sgrproj_info = + rsc->switchable_ref_sgrproj; +#endif // DEBUG_LR_COSTING + + rsc->total_sse[RESTORE_SWITCHABLE] += rsc->sse[best_rtype]; + rsc->total_bits[RESTORE_SWITCHABLE] += best_bits; + if (best_rtype == RESTORE_WIENER) rsc->switchable_ref_wiener = rusi->wiener; + if (best_rtype == RESTORE_SGRPROJ) + rsc->switchable_ref_sgrproj = rusi->sgrproj; +} + +static AOM_INLINE void copy_unit_info(RestorationType frame_rtype, + const RestUnitSearchInfo *rusi, + RestorationUnitInfo *rui) { + assert(frame_rtype > 0); + rui->restoration_type = rusi->best_rtype[frame_rtype - 1]; + if (rui->restoration_type == RESTORE_WIENER) + rui->wiener_info = rusi->wiener; + else + rui->sgrproj_info = rusi->sgrproj; +} + +static void restoration_search(AV1_COMMON *cm, int plane, RestSearchCtxt *rsc, + bool *disable_lr_filter) { + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int mib_size_log2 = cm->seq_params->mib_size_log2; + const CommonTileParams *tiles = &cm->tiles; + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params->subsampling_y; + RestorationInfo *rsi = &cm->rst_info[plane]; + const int ru_size = rsi->restoration_unit_size; + const int ext_size = ru_size * 3 / 2; + + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + static const rest_unit_visitor_t funs[RESTORE_TYPES] = { + search_norestore, search_wiener, search_sgrproj, search_switchable + }; + + const int plane_num_units = rsi->num_rest_units; + const RestorationType num_rtypes = + (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; + + reset_rsc(rsc); + + // Iterate over restoration units in encoding order, so that each RU gets + // the correct reference parameters when we cost it up. 
This is effectively + // a nested iteration over: + // * Each tile, order does not matter + // * Each superblock within that tile, in raster order + // * Each LR unit which is coded within that superblock, in raster order + for (int tile_row = 0; tile_row < tiles->rows; tile_row++) { + int sb_row_start = tiles->row_start_sb[tile_row]; + int sb_row_end = tiles->row_start_sb[tile_row + 1]; + for (int tile_col = 0; tile_col < tiles->cols; tile_col++) { + int sb_col_start = tiles->col_start_sb[tile_col]; + int sb_col_end = tiles->col_start_sb[tile_col + 1]; + + // Reset reference parameters for delta-coding at the start of each tile + rsc_on_tile(rsc); + + for (int sb_row = sb_row_start; sb_row < sb_row_end; sb_row++) { + int mi_row = sb_row << mib_size_log2; + for (int sb_col = sb_col_start; sb_col < sb_col_end; sb_col++) { + int mi_col = sb_col << mib_size_log2; + + int rcol0, rcol1, rrow0, rrow1; + int has_lr_info = av1_loop_restoration_corners_in_sb( + cm, plane, mi_row, mi_col, sb_size, &rcol0, &rcol1, &rrow0, + &rrow1); + + if (!has_lr_info) continue; + + RestorationTileLimits limits; + for (int rrow = rrow0; rrow < rrow1; rrow++) { + int y0 = rrow * ru_size; + int remaining_h = plane_h - y0; + int h = (remaining_h < ext_size) ? remaining_h : ru_size; + + limits.v_start = y0; + limits.v_end = y0 + h; + assert(limits.v_end <= plane_h); + // Offset upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(0, limits.v_start - voffset); + if (limits.v_end < plane_h) limits.v_end -= voffset; + + for (int rcol = rcol0; rcol < rcol1; rcol++) { + int x0 = rcol * ru_size; + int remaining_w = plane_w - x0; + int w = (remaining_w < ext_size) ? remaining_w : ru_size; + + limits.h_start = x0; + limits.h_end = x0 + w; + assert(limits.h_end <= plane_w); + + const int unit_idx = rrow * rsi->horz_units + rcol; + + rsc->skip_sgr_eval = 0; + for (RestorationType r = RESTORE_NONE; r < num_rtypes; r++) { + if (disable_lr_filter[r]) continue; + + funs[r](&limits, unit_idx, rsc, rsc->cm->rst_tmpbuf, NULL, + cm->error); + } + } + } + } + } + } + } +} + +static INLINE void av1_derive_flags_for_lr_processing( + const LOOP_FILTER_SPEED_FEATURES *lpf_sf, bool *disable_lr_filter) { + const bool is_wiener_disabled = lpf_sf->disable_wiener_filter; + const bool is_sgr_disabled = lpf_sf->disable_sgr_filter; + + // Enable the None loop restoration filter if either Wiener or Self-guided + // is enabled. + disable_lr_filter[RESTORE_NONE] = (is_wiener_disabled && is_sgr_disabled); + + disable_lr_filter[RESTORE_WIENER] = is_wiener_disabled; + disable_lr_filter[RESTORE_SGRPROJ] = is_sgr_disabled; + + // Enable the Switchable loop restoration filter only if both Wiener and + // Self-guided are enabled. + disable_lr_filter[RESTORE_SWITCHABLE] = + (is_wiener_disabled || is_sgr_disabled); +} + +#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0 +// Allocate both decoder-side and encoder-side info structs for a single plane. +// The unit size passed in should be the minimum size which we are going to +// search; before each search, set_restoration_unit_size() must be called to +// configure the actual size. +static RestUnitSearchInfo *allocate_search_structs(AV1_COMMON *cm, + RestorationInfo *rsi, + int is_uv, + int min_luma_unit_size) { +#if COUPLED_CHROMA_FROM_LUMA_RESTORATION + int sx = cm->seq_params.subsampling_x; + int sy = cm->seq_params.subsampling_y; + int s = (p > 0) ?
AOMMIN(sx, sy) : 0; +#else + int s = 0; +#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION + int min_unit_size = min_luma_unit_size >> s; + + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + const int max_horz_units = av1_lr_count_units(min_unit_size, plane_w); + const int max_vert_units = av1_lr_count_units(min_unit_size, plane_h); + const int max_num_units = max_horz_units * max_vert_units; + + aom_free(rsi->unit_info); + CHECK_MEM_ERROR(cm, rsi->unit_info, + (RestorationUnitInfo *)aom_memalign( + 16, sizeof(*rsi->unit_info) * max_num_units)); + + RestUnitSearchInfo *rusi; + CHECK_MEM_ERROR( + cm, rusi, + (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * max_num_units)); + + // If the restoration unit dimensions are not multiples of + // rsi->restoration_unit_size then some elements of the rusi array may be + // left uninitialised when we reach copy_unit_info(...). This is not a + // problem, as these elements are ignored later, but in order to quiet + // Valgrind's warnings we initialise the array below. + memset(rusi, 0, sizeof(*rusi) * max_num_units); + + return rusi; +} + +static void set_restoration_unit_size(AV1_COMMON *cm, RestorationInfo *rsi, + int is_uv, int luma_unit_size) { +#if COUPLED_CHROMA_FROM_LUMA_RESTORATION + int sx = cm->seq_params.subsampling_x; + int sy = cm->seq_params.subsampling_y; + int s = (p > 0) ? AOMMIN(sx, sy) : 0; +#else + int s = 0; +#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION + int unit_size = luma_unit_size >> s; + + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + + const int horz_units = av1_lr_count_units(unit_size, plane_w); + const int vert_units = av1_lr_count_units(unit_size, plane_h); + + rsi->restoration_unit_size = unit_size; + rsi->num_rest_units = horz_units * vert_units; + rsi->horz_units = horz_units; + rsi->vert_units = vert_units; +} + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + const SequenceHeader *const seq_params = cm->seq_params; + const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf; + const int num_planes = av1_num_planes(cm); + const int highbd = cm->seq_params->use_highbitdepth; + assert(!cm->features.all_lossless); + + av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx); + + // Select unit size based on speed feature settings, and allocate + // rui structs based on this size + int min_lr_unit_size = cpi->sf.lpf_sf.min_lr_unit_size; + int max_lr_unit_size = cpi->sf.lpf_sf.max_lr_unit_size; + + // The minimum allowed unit size at a syntax level is 1 superblock. + // Apply this constraint here so that the speed features code which sets + // cpi->sf.lpf_sf.min_lr_unit_size does not need to know the superblock size + min_lr_unit_size = + AOMMAX(min_lr_unit_size, block_size_wide[cm->seq_params->sb_size]); + + for (int plane = 0; plane < num_planes; ++plane) { + cpi->pick_lr_ctxt.rusi[plane] = allocate_search_structs( + cm, &cm->rst_info[plane], plane > 0, min_lr_unit_size); + } + + x->rdmult = cpi->rd.RDMULT; + + // Allocate the frame buffer trial_frame_rst, which is used to temporarily + // store the loop restored frame. 
+ if (aom_realloc_frame_buffer( + &cpi->trial_frame_rst, cm->superres_upscaled_width, + cm->superres_upscaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, + cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate trial restored frame buffer"); + + RestSearchCtxt rsc; + + // The buffers 'src_avg' and 'dgd_avg' are used to compute the H and M + // buffers. These buffers are only required for the AVX2 and NEON + // implementations of av1_compute_stats. The required buffer size is derived + // from the maximum LRU width and height allowed for Wiener filtering, i.e. + // 1.5 times RESTORATION_UNITSIZE_MAX (see foreach_rest_unit_in_plane()), with + // the width and height rounded up to a multiple of 16 for the intrinsics. + rsc.dgd_avg = NULL; + rsc.src_avg = NULL; +#if HAVE_AVX2 || HAVE_NEON + // The buffers allocated below are used during Wiener filter processing in the + // low-bitdepth path, so they are only allocated when the Wiener filter is + // enabled for that path. + if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { + const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 * + RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; + CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg, + (int16_t *)aom_memalign(32, buf_size)); + + rsc.dgd_avg = cpi->pick_lr_ctxt.dgd_avg; + // When the LRU width isn't a multiple of 16, the 256-bit loads used by the + // AVX2 intrinsics can read past the valid LRU data. The buffer is therefore + // zero-initialized to silence Valgrind warnings; the overhead is negligible + // since the initialization is done once per frame. + memset(rsc.dgd_avg, 0, buf_size); + rsc.src_avg = + rsc.dgd_avg + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX; + // Assert that the starting address of src_avg is always 32-byte aligned. + assert(!((intptr_t)rsc.src_avg % 32)); + } +#endif + + // Initialize all planes, so that any planes we skip searching will still have + // valid data + for (int plane = 0; plane < num_planes; plane++) { + cm->rst_info[plane].frame_restoration_type = RESTORE_NONE; + } + + // Decide which planes to search + int plane_start, plane_end; + + if (lpf_sf->disable_loop_restoration_luma) { + plane_start = AOM_PLANE_U; + } else { + plane_start = AOM_PLANE_Y; + } + + if (num_planes == 1 || lpf_sf->disable_loop_restoration_chroma) { + plane_end = AOM_PLANE_Y; + } else { + plane_end = AOM_PLANE_V; + } + + // Derive the flags to enable/disable Loop restoration filters based on the + // speed features 'disable_wiener_filter' and 'disable_sgr_filter'.
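+ // For example, if only 'disable_sgr_filter' is set, the flags derived below + // become { RESTORE_NONE: false, RESTORE_WIENER: false, RESTORE_SGRPROJ: true, + // RESTORE_SWITCHABLE: true }, so only the None and Wiener options are + // searched; RESTORE_SWITCHABLE is searched only when both base filters are + // available.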
+ bool disable_lr_filter[RESTORE_TYPES] = { false }; + av1_derive_flags_for_lr_processing(lpf_sf, disable_lr_filter); + + for (int plane = plane_start; plane <= plane_end; plane++) { + const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf; + const int is_uv = plane != AOM_PLANE_Y; + int plane_w, plane_h; + av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h); + av1_extend_frame(dgd->buffers[plane], plane_w, plane_h, dgd->strides[is_uv], + RESTORATION_BORDER, RESTORATION_BORDER, highbd); + } + + double best_cost = DBL_MAX; + int best_luma_unit_size = max_lr_unit_size; + for (int luma_unit_size = max_lr_unit_size; + luma_unit_size >= min_lr_unit_size; luma_unit_size >>= 1) { + int64_t bits_this_size = 0; + int64_t sse_this_size = 0; + RestorationType best_rtype[MAX_MB_PLANE] = { RESTORE_NONE, RESTORE_NONE, + RESTORE_NONE }; + for (int plane = plane_start; plane <= plane_end; ++plane) { + set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0, + luma_unit_size); + init_rsc(src, &cpi->common, x, lpf_sf, plane, + cpi->pick_lr_ctxt.rusi[plane], &cpi->trial_frame_rst, &rsc); + + restoration_search(cm, plane, &rsc, disable_lr_filter); + + const int plane_num_units = cm->rst_info[plane].num_rest_units; + const RestorationType num_rtypes = + (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; + double best_cost_this_plane = DBL_MAX; + for (RestorationType r = 0; r < num_rtypes; ++r) { + // Disable Loop restoration filter based on the flags set using speed + // feature 'disable_wiener_filter' and 'disable_sgr_filter'. + if (disable_lr_filter[r]) continue; + + double cost_this_plane = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, rsc.total_bits[r] >> 4, rsc.total_sse[r], + cm->seq_params->bit_depth); + + if (cost_this_plane < best_cost_this_plane) { + best_cost_this_plane = cost_this_plane; + best_rtype[plane] = r; + } + } + + bits_this_size += rsc.total_bits[best_rtype[plane]]; + sse_this_size += rsc.total_sse[best_rtype[plane]]; + } + + double cost_this_size = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits_this_size >> 4, sse_this_size, + cm->seq_params->bit_depth); + + if (cost_this_size < best_cost) { + best_cost = cost_this_size; + best_luma_unit_size = luma_unit_size; + // Copy parameters out of rusi struct, before we overwrite it at + // the start of the next iteration + bool all_none = true; + for (int plane = plane_start; plane <= plane_end; ++plane) { + cm->rst_info[plane].frame_restoration_type = best_rtype[plane]; + if (best_rtype[plane] != RESTORE_NONE) { + all_none = false; + const int plane_num_units = cm->rst_info[plane].num_rest_units; + for (int u = 0; u < plane_num_units; ++u) { + copy_unit_info(best_rtype[plane], &cpi->pick_lr_ctxt.rusi[plane][u], + &cm->rst_info[plane].unit_info[u]); + } + } + } + // Heuristic: If all best_rtype entries are RESTORE_NONE, this means we + // couldn't find any good filters at this size. 
So we likely won't find + // any good filters at a smaller size either, so skip + if (all_none) { + break; + } + } else { + // Heuristic: If this size is worse than the previous (larger) size, then + // the next size down will likely be even worse, so skip + break; + } + } + + // Final fixup to set the correct unit size + // We set this for all planes, even ones we have skipped searching, + // so that other code does not need to care which planes were and weren't + // searched + for (int plane = 0; plane < num_planes; ++plane) { + set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0, + best_luma_unit_size); + } + +#if HAVE_AVX2 || HAVE_NEON + if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) { + aom_free(cpi->pick_lr_ctxt.dgd_avg); + cpi->pick_lr_ctxt.dgd_avg = NULL; + } +#endif + for (int plane = 0; plane < num_planes; plane++) { + aom_free(cpi->pick_lr_ctxt.rusi[plane]); + cpi->pick_lr_ctxt.rusi[plane] = NULL; + } +} diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h new file mode 100644 index 0000000000..d1d0b0cec6 --- /dev/null +++ b/third_party/aom/av1/encoder/pickrst.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_ENCODER_PICKRST_H_ +#define AOM_AV1_ENCODER_PICKRST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +struct yv12_buffer_config; +struct AV1_COMP; + +// Enable extra debugging for loop restoration costing? +// +// If this is set to 1, then we record not just the selected LR parameters, but +// also the values which the search process thinks they should be delta-coded +// against. Then, when writing out the bitstream, we verify this information, +// to help ensure that the search code is costing things properly +#define DEBUG_LR_COSTING 0 + +#if DEBUG_LR_COSTING +#define MAX_LR_UNITS_W 64 +#define MAX_LR_UNITS_H 64 + +// Storage for reference parameters. +// +// The storage size is determined by: +// * This is always written and then checked within the same frame encode pass, +// so we do not need to buffer multiple frames of data +// * The parameters can be different per plane within one frame +// * The relevant set of ref parameters can differ between the search where +// we set the frame restoration mode to RESTORE_WIENER, and the search where +// we set it to RESTORE_SWITCHABLE.
+// So we need to store at least two sets of Wiener params and two sets of +// SGR params, and the easiest way to do this is to index by +// frame_restoration_type +extern RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE] + [MAX_LR_UNITS_W * MAX_LR_UNITS_H]; +#endif // DEBUG_LR_COSTING + +static const uint8_t g_shuffle_stats_data[16] = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, +}; + +static const uint8_t g_shuffle_stats_highbd_data[32] = { + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, +}; + +static INLINE uint8_t find_average(const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int stride) { + uint64_t sum = 0; + for (int i = v_start; i < v_end; i++) { + for (int j = h_start; j < h_end; j++) { + sum += src[i * stride + j]; + } + } + uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); + return (uint8_t)avg; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start, + int h_end, int v_start, int v_end, + int stride) { + uint64_t sum = 0; + for (int i = v_start; i < v_end; i++) { + for (int j = h_start; j < h_end; j++) { + sum += src[i * stride + j]; + } + } + uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); + return (uint16_t)avg; +} +#endif + +/*!\brief Algorithm for AV1 loop restoration search and estimation. + * + * \ingroup in_loop_restoration + * This function determines proper restoration filter types and + * associated parameters for each restoration unit in a frame. + * + * \param[in] sd Source frame buffer + * \param[in,out] cpi Top-level encoder structure + * + * \remark Nothing is returned. Instead, chosen restoration filter + * types and parameters are stored per plane in the \c rst_info structure + * of type \ref RestorationInfo inside \c cpi->common: + * \arg \c rst_info[ \c 0 ]: Chosen parameters for Y plane + * \arg \c rst_info[ \c 1 ]: Chosen parameters for U plane if it exists + * \arg \c rst_info[ \c 2 ]: Chosen parameters for V plane if it exists + * \par + * The following fields in each \c rst_info[ \c p], \c p = 0, 1, 2 + * are populated: + * \arg \c rst_info[ \c p ].\c frame_restoration_type + * \arg \c rst_info[ \c p ].\c unit_info[ \c u ], + * for each \c u in 0, 1, ..., \c n( \c p ) - 1, + * where \c n( \c p ) is the number of restoration units in plane \c p. + * \par + * The following fields in each \c rst_info[ \c p ].\c unit_info[ \c u ], + * \c p = 0, 1, 2 and \c u = 0, 1, ..., \c n( \c p ) - 1, of type + * \ref RestorationUnitInfo are populated: + * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type + * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c wiener_info OR + * \c rst_info[ \c p ].\c unit_info[ \c u ].\c sgrproj_info OR + * neither, depending on + * \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type + * + */ +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PICKRST_H_ diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h new file mode 100644 index 0000000000..2e8710108b --- /dev/null +++ b/third_party/aom/av1/encoder/pustats.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PUSTATS_H_ +#define AOM_AV1_ENCODER_PUSTATS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define NUM_FEATURES_PUSTATS 8 +#define NUM_HIDDEN_LAYERS 2 +#define HIDDEN_LAYERS_0_NODES 12 +#define HIDDEN_LAYERS_1_NODES 10 +#define LOGITS_NODES 1 + +static const float + av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * + HIDDEN_LAYERS_0_NODES] = { + -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f, + -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f, + 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f, + 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f, + -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f, + -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f, + -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f, + -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f, + 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f, + -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f, + -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f, + -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f, + 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f, + -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f, + }; + +static const float + av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { + 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f, + 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f, + 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f, + -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f, + 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f, + 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f, + -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f, + -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f, + -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f, + 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f, + 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f, + -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f, + -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f, + -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f, + 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f, + -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f, + -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f, + 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f, + -2.7566f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { + 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f, + 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f, + }; + +static const float + av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + 4.3078f, -17.3497f, 0.0195f, 
34.6032f, -5.0127f, + 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f, + }; + +static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { + 4.5103f, +}; + +static const NN_CONFIG av1_pustats_rate_nnconfig = { + NUM_FEATURES_PUSTATS, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_rate_hiddenlayer_0_kernel, + av1_pustats_rate_hiddenlayer_1_kernel, + av1_pustats_rate_logits_kernel, + }, + { + av1_pustats_rate_hiddenlayer_0_bias, + av1_pustats_rate_hiddenlayer_1_bias, + av1_pustats_rate_logits_bias, + }, +}; + +static const float + av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * + HIDDEN_LAYERS_0_NODES] = { + -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f, + 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f, + 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f, + 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f, + 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f, + -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f, + -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f, + -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f, + 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f, + 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f, + -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f, + -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f, + 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f, + -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f, + }; + +static const float + av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { + 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f, + 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f, + }; + +static const float + av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f, + -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f, + 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f, + 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f, + -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f, + -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f, + -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f, + 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f, + -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f, + 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f, + 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f, + -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f, + 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f, + 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f, + 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f, + -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f, + -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f, + -0.4164f, + }; + +static const float + av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { + -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f, + 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f, + }; + +static const float + av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f, + 0.9894f, 
-0.4342f, 0.7002f, 1.4363f, 0.6951f, + }; + +static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = { + 2.3371f, +}; + +static const NN_CONFIG av1_pustats_dist_nnconfig = { + NUM_FEATURES_PUSTATS, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_dist_hiddenlayer_0_kernel, + av1_pustats_dist_hiddenlayer_1_kernel, + av1_pustats_dist_logits_kernel, + }, + { + av1_pustats_dist_hiddenlayer_0_bias, + av1_pustats_dist_hiddenlayer_1_bias, + av1_pustats_dist_logits_bias, + }, +}; + +#undef NUM_HIDDEN_LAYERS +#undef HIDDEN_LAYERS_0_NODES +#undef HIDDEN_LAYERS_1_NODES +#undef LOGITS_NODES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PUSTATS_H_ diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h new file mode 100644 index 0000000000..efe909b6db --- /dev/null +++ b/third_party/aom/av1/encoder/random.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RANDOM_H_ +#define AOM_AV1_ENCODER_RANDOM_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Advance the generator to its next state, and generate the next 32-bit output. +// Note that the low bits of this output are comparatively low-quality, so users +// of this function should ensure that the high bits factor through to their +// outputs. +static INLINE uint32_t lcg_next(uint32_t *state) { + *state = (uint32_t)(*state * 1103515245ULL + 12345); + return *state; +} + +// Generate a random number in the range [0, 32768). +static INLINE uint32_t lcg_rand16(uint32_t *state) { + return (lcg_next(state) / 65536) % 32768; +} + +// Generate a random number in the range [0, n) +// This is implemented as (rand() * n) / rather than +// rand() % n, for a few reasons: This implementation is faster and less biased, +// and if is a power of 2, this uses the higher-quality top bits from the RNG +// output rather than the lower-quality bottom bits. +static INLINE uint32_t lcg_randint(uint32_t *state, uint32_t n) { + uint64_t v = ((uint64_t)lcg_next(state) * n) >> 32; + return (uint32_t)v; +} + +// Generate a random number in the range [lo, hi) +static INLINE uint32_t lcg_randrange(uint32_t *state, uint32_t lo, + uint32_t hi) { + assert(lo < hi); + return lo + lcg_randint(state, hi - lo); +} + +// Pick k distinct numbers from the set {0, ..., n-1} +// All possible sets of k numbers, and all possible orderings of those numbers, +// are equally likely. +// +// Note: The algorithm used here uses resampling to avoid choosing repeated +// values. This works well as long as n >> k, but can potentially lead to many +// resampling attempts if n is equal to or only slightly larger than k. 
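A minimal, self-contained sketch of how the lcg_* helpers above compose, assuming only the C standard library; the generator update is restated inline so the snippet builds on its own, and the printed values are purely illustrative.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Same LCG update as lcg_next() above, restated so this sketch is
// self-contained.
static uint32_t next32(uint32_t *state) {
  *state = (uint32_t)(*state * 1103515245ULL + 12345);
  return *state;
}

// Range reduction as in lcg_randint(): take the product in 64 bits and keep
// the top 32, so the result is driven by the generator's higher-quality
// high bits rather than its low bits.
static uint32_t randint(uint32_t *state, uint32_t n) {
  return (uint32_t)(((uint64_t)next32(state) * n) >> 32);
}

int main(void) {
  uint32_t state = 1;  // any seed works for the demonstration
  for (int i = 0; i < 5; i++) {
    uint32_t v = randint(&state, 10);  // value in [0, 10)
    assert(v < 10);
    printf("%u\n", (unsigned)v);
  }
  return 0;
}
```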
+static INLINE void lcg_pick(int n, int k, int *out, unsigned int *seed) { + assert(0 <= k && k <= n); + for (int i = 0; i < k; i++) { + int v; + + // Inner resampling loop + // We have to use a goto here because C does not have a multi-level continue + // statement + resample: + v = (int)lcg_randint(seed, n); + for (int j = 0; j < i; j++) { + if (v == out[j]) { + // Repeated v, resample + goto resample; + } + } + + // New v, accept + out[i] = v; + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RANDOM_H_ diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c new file mode 100644 index 0000000000..df86380272 --- /dev/null +++ b/third_party/aom/av1/encoder/ratectrl.c @@ -0,0 +1,3587 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/aom_once.h" + +#include "av1/common/alloccommon.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/common/common.h" +#include "av1/common/entropymode.h" +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/gop_structure.h" +#include "av1/encoder/random.h" +#include "av1/encoder/ratectrl.h" + +#include "config/aom_dsp_rtcd.h" + +#define USE_UNRESTRICTED_Q_IN_CQ_MODE 0 + +// Max rate target for 1080P and below encodes under normal circumstances +// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +#define MAX_MB_RATE 250 +#define MAXRATE_1080P 2025000 + +#define MIN_BPB_FACTOR 0.005 +#define MAX_BPB_FACTOR 50 + +#define SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO 0 +#define SUPERRES_QADJ_PER_DENOM_KEYFRAME 2 +#define SUPERRES_QADJ_PER_DENOM_ARFFRAME 0 + +#define FRAME_OVERHEAD_BITS 200 +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case AOM_BITS_8: name = name##_8; break; \ + case AOM_BITS_10: name = name##_10; break; \ + case AOM_BITS_12: name = name##_12; break; \ + default: \ + assert(0 && \ + "bit_depth should be AOM_BITS_8, AOM_BITS_10" \ + " or AOM_BITS_12"); \ + name = NULL; \ + } \ + } while (0) + +// Tables relating active max Q to active min Q +static int kf_low_motion_minq_8[QINDEX_RANGE]; +static int kf_high_motion_minq_8[QINDEX_RANGE]; +static int arfgf_low_motion_minq_8[QINDEX_RANGE]; +static int arfgf_high_motion_minq_8[QINDEX_RANGE]; +static int inter_minq_8[QINDEX_RANGE]; +static int rtc_minq_8[QINDEX_RANGE]; + +static int kf_low_motion_minq_10[QINDEX_RANGE]; +static int kf_high_motion_minq_10[QINDEX_RANGE]; +static int arfgf_low_motion_minq_10[QINDEX_RANGE]; +static int arfgf_high_motion_minq_10[QINDEX_RANGE]; +static int inter_minq_10[QINDEX_RANGE]; +static int rtc_minq_10[QINDEX_RANGE]; +static int kf_low_motion_minq_12[QINDEX_RANGE]; +static int kf_high_motion_minq_12[QINDEX_RANGE]; +static int arfgf_low_motion_minq_12[QINDEX_RANGE]; +static 
int arfgf_high_motion_minq_12[QINDEX_RANGE]; +static int inter_minq_12[QINDEX_RANGE]; +static int rtc_minq_12[QINDEX_RANGE]; + +static int gf_high = 2400; +static int gf_low = 300; +#ifdef STRICT_RC +static int kf_high = 3200; +#else +static int kf_high = 5000; +#endif +static int kf_low = 400; + +// How many times less pixels there are to encode given the current scaling. +// Temporary replacement for rcf_mult and rate_thresh_mult. +static double resize_rate_factor(const FrameDimensionCfg *const frm_dim_cfg, + int width, int height) { + return (double)(frm_dim_cfg->width * frm_dim_cfg->height) / (width * height); +} + +// Functions to compute the active minq lookup table entries based on a +// formulaic approach to facilitate easier adjustment of the Q tables. +// The formulae were derived from computing a 3rd order polynomial best +// fit to the original data (after plotting real maxq vs minq (not q index)) +static int get_minq_index(double maxq, double x3, double x2, double x1, + aom_bit_depth_t bit_depth) { + const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq); + + // Special case handling to deal with the step from q2.0 + // down to lossless mode represented by q 1.0. + if (minqtarget <= 2.0) return 0; + + return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1); +} + +static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, + int *arfgf_high, int *inter, int *rtc, + aom_bit_depth_t bit_depth) { + int i; + for (i = 0; i < QINDEX_RANGE; i++) { + const double maxq = av1_convert_qindex_to_q(i, bit_depth); + kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); + arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth); + rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); + } +} + +static void rc_init_minq_luts(void) { + init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, + arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, + inter_minq_8, rtc_minq_8, AOM_BITS_8); + init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10, + arfgf_low_motion_minq_10, arfgf_high_motion_minq_10, + inter_minq_10, rtc_minq_10, AOM_BITS_10); + init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12, + arfgf_low_motion_minq_12, arfgf_high_motion_minq_12, + inter_minq_12, rtc_minq_12, AOM_BITS_12); +} + +void av1_rc_init_minq_luts(void) { aom_once(rc_init_minq_luts); } + +// These functions use formulaic calculations to make playing with the +// quantizer tables easier. If necessary they can be replaced by lookup +// tables if and when things settle down in the experimental bitstream +double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) { + // Convert the index to a real Q value (scaled down to match old Q values) + switch (bit_depth) { + case AOM_BITS_8: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 4.0; + case AOM_BITS_10: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 16.0; + case AOM_BITS_12: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 64.0; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1.0; + } +} + +int av1_get_bpmb_enumerator(FRAME_TYPE frame_type, + const int is_screen_content_type) { + int enumerator; + + if (is_screen_content_type) { + enumerator = (frame_type == KEY_FRAME) ? 
1000000 : 750000; + } else { + enumerator = (frame_type == KEY_FRAME) ? 2000000 : 1500000; + } + + return enumerator; +} + +static int get_init_ratio(double sse) { return (int)(300000 / sse); } + +int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex, + double correction_factor, int accurate_estimate) { + const AV1_COMMON *const cm = &cpi->common; + const int is_screen_content_type = cpi->is_screen_content_type; + const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; + const double q = av1_convert_qindex_to_q(qindex, bit_depth); + int enumerator = av1_get_bpmb_enumerator(frame_type, is_screen_content_type); + + assert(correction_factor <= MAX_BPB_FACTOR && + correction_factor >= MIN_BPB_FACTOR); + + if (cpi->oxcf.rc_cfg.mode == AOM_CBR && frame_type != KEY_FRAME && + accurate_estimate && cpi->rec_sse != UINT64_MAX) { + const int mbs = cm->mi_params.MBs; + const double sse_sqrt = + (double)((int)sqrt((double)(cpi->rec_sse)) << BPER_MB_NORMBITS) / + (double)mbs; + const int ratio = (cpi->rc.bit_est_ratio == 0) ? get_init_ratio(sse_sqrt) + : cpi->rc.bit_est_ratio; + // Clamp the enumerator to lower the q fluctuations. + enumerator = AOMMIN(AOMMAX((int)(ratio * sse_sqrt), 20000), 170000); + } + + // q based adjustment to baseline enumerator + return (int)(enumerator * correction_factor / q); +} + +int av1_estimate_bits_at_q(const AV1_COMP *cpi, int q, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + const int mbs = cm->mi_params.MBs; + const int bpm = + (int)(av1_rc_bits_per_mb(cpi, frame_type, q, correction_factor, + cpi->sf.hl_sf.accurate_bit_estimate)); + return AOMMAX(FRAME_OVERHEAD_BITS, + (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); +} + +int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target, + FRAME_UPDATE_TYPE frame_update_type) { + const RATE_CONTROL *rc = &cpi->rc; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + const int min_frame_target = + AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + // Clip the frame target to the minimum setup value. + if (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; + } else if (target < min_frame_target) { + target = min_frame_target; + } + + // Clip the frame target to the maximum allowed value. + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_cfg.max_inter_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * oxcf->rc_cfg.max_inter_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + + return target; +} + +int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int64_t target) { + const RATE_CONTROL *rc = &cpi->rc; + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + if (rc_cfg->max_intra_bitrate_pct) { + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_intra_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + return (int)target; +} + +// Update the buffer level for higher temporal layers, given the encoded current +// temporal layer. 
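The two buffer-level helpers that follow implement a leaky-bucket model: each shown frame credits the per-frame bit budget, debits the bits actually produced, and clamps the result at the buffer capacity. A simplified standalone trace of that model, with illustrative numbers and without the SVC, non-shown-frame, and screen-content special cases:

```c
#include <stdint.h>
#include <stdio.h>

// Simplified bucket state; the encoder keeps the corresponding fields
// (bits_off_target, buffer_level, maximum_buffer_size) in
// PRIMARY_RATE_CONTROL.
typedef struct {
  int64_t bits_off_target;
  int64_t maximum_buffer_size;
} Bucket;

// Per-frame update for a shown frame, mirroring update_buffer_level():
// credit the average per-frame budget, debit the encoded size, clamp at
// capacity.
static int64_t bucket_update(Bucket *b, int avg_frame_bandwidth,
                             int encoded_frame_size) {
  b->bits_off_target += avg_frame_bandwidth - encoded_frame_size;
  if (b->bits_off_target > b->maximum_buffer_size)
    b->bits_off_target = b->maximum_buffer_size;
  return b->bits_off_target;  // mirrored into buffer_level
}

int main(void) {
  Bucket b = { 500000, 1000000 };               // illustrative numbers only
  const int sizes[] = { 30000, 80000, 20000 };  // encoded sizes in bits
  for (int i = 0; i < 3; i++)
    printf("buffer_level = %lld\n",
           (long long)bucket_update(&b, 40000, sizes[i]));
  return 0;
}
```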
+static void update_layer_buffer_level(SVC *svc, int encoded_frame_size, + bool is_screen) { + const int current_temporal_layer = svc->temporal_layer_id; + for (int i = current_temporal_layer + 1; i < svc->number_temporal_layers; + ++i) { + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc; + lp_rc->bits_off_target += + (int)round(lc->target_bandwidth / lc->framerate) - encoded_frame_size; + // Clip buffer level to maximum buffer size for the layer. + lp_rc->bits_off_target = + AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size); + lp_rc->buffer_level = lp_rc->bits_off_target; + + // For screen-content mode: don't let buffer level go below threshold, + // given here as -rc->maximum_buffer_size, to allow buffer to come back + // up sooner after slide change with big overshoot. + if (is_screen) { + lp_rc->bits_off_target = + AOMMAX(lp_rc->bits_off_target, -lp_rc->maximum_buffer_size); + lp_rc->buffer_level = lp_rc->bits_off_target; + } + } +} +// Update the buffer level: leaky bucket model. +static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { + const AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + + // Non-viewable frames are a special case and are treated as pure overhead. + if (!cm->show_frame) + p_rc->bits_off_target -= encoded_frame_size; + else + p_rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; + + // Clip the buffer level to the maximum specified buffer size. + p_rc->bits_off_target = + AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size); + // For screen-content mode: don't let buffer level go below threshold, + // given here as -rc->maximum_buffer_size, to allow buffer to come back + // up sooner after slide change with big overshoot. + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) + p_rc->bits_off_target = + AOMMAX(p_rc->bits_off_target, -p_rc->maximum_buffer_size); + p_rc->buffer_level = p_rc->bits_off_target; + + if (cpi->ppi->use_svc) + update_layer_buffer_level(&cpi->svc, encoded_frame_size, + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN); + +#if CONFIG_FPMT_TEST + /* The variable temp_buffer_level is introduced for quality + * simulation purposes; it retains the value previous to the parallel + * encode frames. The variable is updated based on the update flag. + * + * If there exist show_existing_frames between parallel frames, then, to + * retain the temp state, do not update it.
*/ + int show_existing_between_parallel_frames = + (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == + INTNL_OVERLAY_UPDATE && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); + + if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + p_rc->temp_buffer_level = p_rc->buffer_level; + } +#endif +} + +int av1_rc_get_default_min_gf_interval(int width, int height, + double framerate) { + // Assume we do not need any constraint lower than 4K 20 fps + static const double factor_safe = 3840 * 2160 * 20.0; + const double factor = (double)width * height * framerate; + const int default_interval = + clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL); + + if (factor <= factor_safe) + return default_interval; + else + return AOMMAX(default_interval, + (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5)); + // Note this logic makes: + // 4K24: 5 + // 4K30: 6 + // 4K60: 12 +} + +int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { + int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); + interval += (interval & 0x01); // Round to even value + interval = AOMMAX(MAX_GF_INTERVAL, interval); + return AOMMAX(interval, min_gf_interval); +} + +void av1_primary_rc_init(const AV1EncoderConfig *oxcf, + PRIMARY_RATE_CONTROL *p_rc) { + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + int worst_allowed_q = rc_cfg->worst_allowed_q; + + int min_gf_interval = oxcf->gf_cfg.min_gf_interval; + int max_gf_interval = oxcf->gf_cfg.max_gf_interval; + if (min_gf_interval == 0) + min_gf_interval = av1_rc_get_default_min_gf_interval( + oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, + oxcf->input_cfg.init_framerate); + if (max_gf_interval == 0) + max_gf_interval = av1_rc_get_default_max_gf_interval( + oxcf->input_cfg.init_framerate, min_gf_interval); + p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2; + p_rc->this_key_frame_forced = 0; + p_rc->next_key_frame_forced = 0; + p_rc->ni_frames = 0; + + p_rc->tot_q = 0.0; + p_rc->total_actual_bits = 0; + p_rc->total_target_bits = 0; + p_rc->buffer_level = p_rc->starting_buffer_level; + + if (oxcf->target_seq_level_idx[0] < SEQ_LEVELS) { + worst_allowed_q = 255; + } + if (oxcf->pass == AOM_RC_ONE_PASS && rc_cfg->mode == AOM_CBR) { + p_rc->avg_frame_qindex[KEY_FRAME] = worst_allowed_q; + p_rc->avg_frame_qindex[INTER_FRAME] = worst_allowed_q; + } else { + p_rc->avg_frame_qindex[KEY_FRAME] = + (worst_allowed_q + rc_cfg->best_allowed_q) / 2; + p_rc->avg_frame_qindex[INTER_FRAME] = + (worst_allowed_q + rc_cfg->best_allowed_q) / 2; + } + p_rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q, + oxcf->tool_cfg.bit_depth); + p_rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q; + p_rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q; + + for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) { + p_rc->rate_correction_factors[i] = 0.7; + } + p_rc->rate_correction_factors[KF_STD] = 1.0; + p_rc->bits_off_target = p_rc->starting_buffer_level; + + p_rc->rolling_target_bits = + (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate); + p_rc->rolling_actual_bits = + (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate); +} + +void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) { + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + rc->frames_since_key = 8; // Sensible default for first frame. 
+ rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist; + + rc->frames_till_gf_update_due = 0; + rc->ni_av_qi = rc_cfg->worst_allowed_q; + rc->ni_tot_qi = 0; + + rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval; + rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval; + if (rc->min_gf_interval == 0) + rc->min_gf_interval = av1_rc_get_default_min_gf_interval( + oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, + oxcf->input_cfg.init_framerate); + if (rc->max_gf_interval == 0) + rc->max_gf_interval = av1_rc_get_default_max_gf_interval( + oxcf->input_cfg.init_framerate, rc->min_gf_interval); + rc->avg_frame_low_motion = 0; + + rc->resize_state = ORIG; + rc->resize_avg_qp = 0; + rc->resize_buffer_underflow = 0; + rc->resize_count = 0; + rc->rtc_external_ratectrl = 0; + rc->frame_level_fast_extra_bits = 0; + rc->use_external_qp_one_pass = 0; +} + +static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level, + int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->ppi->use_svc || cpi->svc.number_spatial_layers == 1 || + cpi->svc.framedrop_mode == AOM_LAYER_DROP) { + return (buffer_level <= drop_mark); + } else { + // For SVC in the AOM_FULL_SUPERFRAME_DROP): the condition on + // buffer is checked on current and upper spatial layers. + for (int i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + PRIMARY_RATE_CONTROL *lrc = &lc->p_rc; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_thresh = cpi->oxcf.rc_cfg.drop_frames_water_mark; + const int drop_mark_layer = + (int)(drop_thresh * lrc->optimal_buffer_level / 100); + if (lrc->buffer_level <= drop_mark_layer) return true; + } + } + return false; + } +} + +int av1_rc_drop_frame(AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + int64_t buffer_level = + simulate_parallel_frame ? p_rc->temp_buffer_level : p_rc->buffer_level; +#else + int64_t buffer_level = p_rc->buffer_level; +#endif + // Never drop on key frame, or for frame whose base layer is key. + // If drop_count_consec hits or exceeds max_consec_drop then don't drop. + if (cpi->common.current_frame.frame_type == KEY_FRAME || + (cpi->ppi->use_svc && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || + !oxcf->rc_cfg.drop_frames_water_mark || + (rc->max_consec_drop > 0 && + rc->drop_count_consec >= rc->max_consec_drop)) { + return 0; + } else { + SVC *svc = &cpi->svc; + // In the full_superframe framedrop mode for svc, if the previous spatial + // layer was dropped, drop the current spatial layer. + if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1] && + svc->framedrop_mode == AOM_FULL_SUPERFRAME_DROP) + return 1; + // -1 is passed here for drop_mark since we are checking if + // buffer goes below 0 (<= -1). + if (check_buffer_below_thresh(cpi, buffer_level, -1)) { + // Always drop if buffer is below 0. + rc->drop_count_consec++; + return 1; + } else { + // If buffer is below drop_mark, for now just drop every other frame + // (starting with the next frame) until it increases back over drop_mark. 
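Before the drop_mark computation below, a compilable sketch of the every-other-frame decimation this comment describes; the field names mirror RATE_CONTROL, but the struct and trace are illustrative only.

```c
#include <stdbool.h>
#include <stdio.h>

// Minimal model of the decimation state machine in av1_rc_drop_frame():
// while the buffer stays below drop_mark, every other frame is dropped;
// once it recovers, dropping stops.
typedef struct {
  int decimation_factor;
  int decimation_count;
} DropState;

static bool maybe_drop(DropState *s, bool buffer_below_thresh) {
  if (!buffer_below_thresh && s->decimation_factor > 0) {
    --s->decimation_factor;
  } else if (buffer_below_thresh && s->decimation_factor == 0) {
    s->decimation_factor = 1;
  }
  if (s->decimation_factor > 0) {
    if (s->decimation_count > 0) {
      --s->decimation_count;
      return true;  // drop this frame
    }
    s->decimation_count = s->decimation_factor;
    return false;  // encode this frame
  }
  s->decimation_count = 0;
  return false;
}

int main(void) {
  DropState s = { 0, 0 };
  // Buffer below the threshold for five frames, then recovered for two.
  const bool below[] = { true, true, true, true, true, false, false };
  for (int i = 0; i < 7; i++)
    printf("frame %d: %s\n", i, maybe_drop(&s, below[i]) ? "drop" : "encode");
  return 0;
}
```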
+ const int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark * + p_rc->optimal_buffer_level / 100); + const bool buffer_below_thresh = + check_buffer_below_thresh(cpi, buffer_level, drop_mark); + if (!buffer_below_thresh && rc->decimation_factor > 0) { + --rc->decimation_factor; + } else if (buffer_below_thresh && rc->decimation_factor == 0) { + rc->decimation_factor = 1; + } + if (rc->decimation_factor > 0) { + if (rc->decimation_count > 0) { + --rc->decimation_count; + rc->drop_count_consec++; + return 1; + } else { + rc->decimation_count = rc->decimation_factor; + return 0; + } + } else { + rc->decimation_count = 0; + return 0; + } + } + } +} + +static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality, + int width, int height) { + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1_COMMON *const cm = &cpi->common; + const SVC *const svc = &cpi->svc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + // Flag to indicate previous frame has overshoot, and buffer level + // for current frame is low (less than ~half of optimal). For such + // (inter) frames, if the source_sad is non-zero, relax the max_delta_up + // and clamp applied below. + const bool overshoot_buffer_low = + cpi->rc.rc_1_frame == -1 && rc->frame_source_sad > 1000 && + p_rc->buffer_level < (p_rc->optimal_buffer_level >> 1) && + rc->frames_since_key > 4; + int max_delta_down; + int max_delta_up = overshoot_buffer_low ? 60 : 20; + const int change_avg_frame_bandwidth = + abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) > + 0.1 * (rc->avg_frame_bandwidth); + + // Set the maximum adjustment down for Q for this frame. + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->apply_cyclic_refresh) { + // For static screen type content limit the Q drop till the start of the + // next refresh cycle. + if (cpi->is_screen_content_type && + (cpi->cyclic_refresh->sb_index > cpi->cyclic_refresh->last_sb_index)) { + max_delta_down = AOMMIN(8, AOMMAX(1, rc->q_1_frame / 32)); + } else { + max_delta_down = AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8)); + } + if (!cpi->ppi->use_svc && cpi->is_screen_content_type) { + // Link max_delta_up to max_delta_down and buffer status. + if (p_rc->buffer_level > p_rc->optimal_buffer_level) { + max_delta_up = AOMMAX(4, max_delta_down); + } else { + max_delta_up = AOMMAX(8, max_delta_down); + } + } + } else { + max_delta_down = (cpi->is_screen_content_type) + ? AOMMIN(8, AOMMAX(1, rc->q_1_frame / 16)) + : AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8)); + } + // If resolution changes or avg_frame_bandwidth significantly changed, + // then set this flag to indicate change in target bits per macroblock. + const int change_target_bits_mb = + cm->prev_frame && + (width != cm->prev_frame->width || height != cm->prev_frame->height || + change_avg_frame_bandwidth); + // Apply some control/clamp to QP under certain conditions. + // Delay the use of the clamping for svc until after num_temporal_layers, + // to make they have been set for each temporal layer. + if (!frame_is_intra_only(cm) && rc->frames_since_key > 1 && + (!cpi->ppi->use_svc || + svc->current_superframe > (unsigned int)svc->number_temporal_layers) && + !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl && + (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct || + !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) { + // If in the previous two frames we have seen both overshoot and undershoot + // clamp Q between the two. 
Check for rc->q_1/2_frame > 0 in case they have + // not been set due to dropped frames. + if (rc->rc_1_frame * rc->rc_2_frame == -1 && + rc->q_1_frame != rc->q_2_frame && rc->q_1_frame > 0 && + rc->q_2_frame > 0 && !overshoot_buffer_low) { + int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame), + AOMMAX(rc->q_1_frame, rc->q_2_frame)); + // If the previous frame had overshoot and the current q needs to + // increase above the clamped value, reduce the clamp for faster reaction + // to overshoot. + if (cpi->rc.rc_1_frame == -1 && q > qclamp && rc->frames_since_key > 10) + q = (q + qclamp) >> 1; + else + q = qclamp; + } + // Adjust Q base on source content change from scene detection. + if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 && + rc->frames_since_key > 10 && rc->frame_source_sad > 0 && + !cpi->rc.rtc_external_ratectrl) { + const int bit_depth = cm->seq_params->bit_depth; + double delta = + (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0; + // Push Q downwards if content change is decreasing and buffer level + // is stable (at least 1/4-optimal level), so not overshooting. Do so + // only for high Q to avoid excess overshoot. + // Else reduce decrease in Q from previous frame if content change is + // increasing and buffer is below max (so not undershooting). + if (delta < 0.0 && + p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) && + q > (rc->worst_quality >> 1)) { + double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta); + double q_val = av1_convert_qindex_to_q(q, bit_depth); + q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); + } else if (rc->q_1_frame - q > 0 && delta > 0.1 && + p_rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size, + p_rc->optimal_buffer_level << 1)) { + q = (3 * q + rc->q_1_frame) >> 2; + } + } + // Limit the decrease in Q from previous frame. + if (rc->q_1_frame - q > max_delta_down) q = rc->q_1_frame - max_delta_down; + // Limit the increase in Q from previous frame. + else if (q - rc->q_1_frame > max_delta_up) + q = rc->q_1_frame + max_delta_up; + } + // Adjustment for temporal layers. + if (svc->number_temporal_layers > 1 && svc->spatial_layer_id == 0 && + !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl && + cpi->oxcf.resize_cfg.resize_mode != RESIZE_DYNAMIC) { + if (svc->temporal_layer_id > 0) { + // Constrain enhancement relative to the previous base TL0. + // Get base temporal layer TL0. + const int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + // lc->rc.avg_frame_bandwidth and lc->p_rc.last_q correspond to the + // last TL0 frame. + if (rc->avg_frame_bandwidth < lc->rc.avg_frame_bandwidth && + q < lc->p_rc.last_q[INTER_FRAME] - 4) + q = lc->p_rc.last_q[INTER_FRAME] - 4; + } else if (cpi->svc.temporal_layer_id == 0 && + p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) && + rc->frame_source_sad < 100000) { + // Push base TL0 Q down if buffer is stable and frame_source_sad + // is below threshold. + int delta = (svc->number_temporal_layers == 2) ? 4 : 10; + q = q - delta; + } + } + // For non-svc (single layer): if resolution has increased push q closer + // to the active_worst to avoid excess overshoot. + if (!cpi->ppi->use_svc && cm->prev_frame && + (width * height > 1.5 * cm->prev_frame->width * cm->prev_frame->height)) + q = (q + active_worst_quality) >> 1; + // For single layer RPS: Bias Q based on distance of closest reference. 
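The scene-detection branch a few lines up scales the working Q value by a tanh-shaped factor of the relative change in source SAD before mapping it back to a qindex step with av1_compute_qdelta(). A small standalone illustration of that factor (the qindex mapping itself is not reproduced here):

```c
#include <math.h>
#include <stdio.h>

// Factor applied to the current Q value when the content change is
// decreasing (delta < 0), as in adjust_q_cbr():
// 1.0 + 0.5 * tanh(4.0 * delta).
static double q_adj_factor(double delta) {
  return 1.0 + 0.5 * tanh(4.0 * delta);
}

int main(void) {
  // Illustrative values of avg_source_sad / prev_avg_source_sad - 1.0.
  const double deltas[] = { -0.1, -0.5, -1.0 };
  for (int i = 0; i < 3; i++)
    printf("delta %+.2f -> Q scaled by %.3f\n", deltas[i],
           q_adj_factor(deltas[i]));
  return 0;
}
```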
+ if (cpi->ppi->rtc_ref.bias_recovery_frame) { + const int min_dist = av1_svc_get_min_ref_dist(cpi); + q = q - AOMMIN(min_dist, 20); + } + return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); +} + +static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = { + KF_STD, // KF_UPDATE + INTER_NORMAL, // LF_UPDATE + GF_ARF_STD, // GF_UPDATE + GF_ARF_STD, // ARF_UPDATE + INTER_NORMAL, // OVERLAY_UPDATE + INTER_NORMAL, // INTNL_OVERLAY_UPDATE + GF_ARF_LOW, // INTNL_ARF_UPDATE +}; + +static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group, + int gf_frame_index) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index]; + assert(update_type < FRAME_UPDATE_TYPES); + return rate_factor_levels[update_type]; +} + +/*!\brief Gets a rate vs Q correction factor + * + * This function returns the current value of a correction factor used to + * dynamilcally adjust the relationship between Q and the expected number + * of bits for the frame. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder instance structure + * \param[in] width Frame width + * \param[in] height Frame height + * + * \return Returns a correction factor for the current frame + */ +static double get_rate_correction_factor(const AV1_COMP *cpi, int width, + int height) { + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + double rcf; + double rate_correction_factors_kfstd; + double rate_correction_factors_gfarfstd; + double rate_correction_factors_internormal; + + rate_correction_factors_kfstd = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? rc->frame_level_rate_correction_factors[KF_STD] + : p_rc->rate_correction_factors[KF_STD]; + rate_correction_factors_gfarfstd = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? rc->frame_level_rate_correction_factors[GF_ARF_STD] + : p_rc->rate_correction_factors[GF_ARF_STD]; + rate_correction_factors_internormal = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? rc->frame_level_rate_correction_factors[INTER_NORMAL] + : p_rc->rate_correction_factors[INTER_NORMAL]; + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + rcf = rate_correction_factors_kfstd; + } else if (is_stat_consumption_stage(cpi)) { + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index); + double rate_correction_factors_rflvl = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? rc->frame_level_rate_correction_factors[rf_lvl] + : p_rc->rate_correction_factors[rf_lvl]; + rcf = rate_correction_factors_rflvl; + } else { + if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) && + !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && + (cpi->oxcf.rc_cfg.mode != AOM_CBR || + cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) + rcf = rate_correction_factors_gfarfstd; + else + rcf = rate_correction_factors_internormal; + } + rcf *= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height); + return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR); +} + +/*!\brief Sets a rate vs Q correction factor + * + * This function updates the current value of a correction factor used to + * dynamilcally adjust the relationship between Q and the expected number + * of bits for the frame. 
+ * + * \ingroup rate_control + * \param[in] cpi Top level encoder instance structure + * \param[in] is_encode_stage Indicates if recode loop or post-encode + * \param[in] factor New correction factor + * \param[in] width Frame width + * \param[in] height Frame height + * + * \remark Updates the rate correction factor for the + * current frame type in cpi->rc. + */ +static void set_rate_correction_factor(AV1_COMP *cpi, int is_encode_stage, + double factor, int width, int height) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + int update_default_rcf = 1; + // Normalize RCF to account for the size-dependent scaling factor. + factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height); + + factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + p_rc->rate_correction_factors[KF_STD] = factor; + } else if (is_stat_consumption_stage(cpi)) { + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index); + if (is_encode_stage && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + rc->frame_level_rate_correction_factors[rf_lvl] = factor; + update_default_rcf = 0; + } + if (update_default_rcf) p_rc->rate_correction_factors[rf_lvl] = factor; + } else { + if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) && + !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && + (cpi->oxcf.rc_cfg.mode != AOM_CBR || + cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) { + p_rc->rate_correction_factors[GF_ARF_STD] = factor; + } else { + if (is_encode_stage && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) { + rc->frame_level_rate_correction_factors[INTER_NORMAL] = factor; + update_default_rcf = 0; + } + if (update_default_rcf) + p_rc->rate_correction_factors[INTER_NORMAL] = factor; + } + } +} + +void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage, + int width, int height) { + const AV1_COMMON *const cm = &cpi->common; + double correction_factor = 1.0; + double rate_correction_factor = + get_rate_correction_factor(cpi, width, height); + double adjustment_limit; + int projected_size_based_on_q = 0; + int cyclic_refresh_active = + cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled; + + // Do not update the rate factors for arf overlay frames. + if (cpi->rc.is_src_frame_alt_ref) return; + + // Don't update rate correction factors here on scene changes as + // it is already reset in av1_encodedframe_overshoot_cbr(), + // but reset variables related to previous frame q and size. + // Note that the counter of frames since the last scene change + // is only valid when cyclic refresh mode is enabled and that + // this break out only applies to scene changes that are not + // recorded as INTRA only key frames. + if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) && + (cpi->cyclic_refresh->counter_encode_maxq_scene_change == 0) && + !frame_is_intra_only(cm) && !cpi->ppi->use_svc) { + cpi->rc.q_2_frame = cm->quant_params.base_qindex; + cpi->rc.q_1_frame = cm->quant_params.base_qindex; + cpi->rc.rc_2_frame = 0; + cpi->rc.rc_1_frame = 0; + return; + } + + // Clear down mmx registers to allow floating point in what follows + + // Work out how big we would have expected the frame to be at this Q given + // the current correction factor. 
+ // Stay in double to avoid int overflow when values are large + if (cyclic_refresh_active) { + projected_size_based_on_q = + av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); + } else { + projected_size_based_on_q = av1_estimate_bits_at_q( + cpi, cm->quant_params.base_qindex, rate_correction_factor); + } + // Work out a size correction factor. + if (projected_size_based_on_q > FRAME_OVERHEAD_BITS) + correction_factor = (double)cpi->rc.projected_frame_size / + (double)projected_size_based_on_q; + + // Clamp correction factor to prevent anything too extreme + correction_factor = AOMMAX(correction_factor, 0.25); + + cpi->rc.q_2_frame = cpi->rc.q_1_frame; + cpi->rc.q_1_frame = cm->quant_params.base_qindex; + cpi->rc.rc_2_frame = cpi->rc.rc_1_frame; + if (correction_factor > 1.1) + cpi->rc.rc_1_frame = -1; + else if (correction_factor < 0.9) + cpi->rc.rc_1_frame = 1; + else + cpi->rc.rc_1_frame = 0; + + // Decide how heavily to dampen the adjustment + if (correction_factor > 0.0) { + if (cpi->is_screen_content_type) { + adjustment_limit = + 0.25 + 0.5 * AOMMIN(0.5, fabs(log10(correction_factor))); + } else { + adjustment_limit = + 0.25 + 0.75 * AOMMIN(0.5, fabs(log10(correction_factor))); + } + } else { + adjustment_limit = 0.75; + } + + // Adjustment to delta Q and number of blocks updated in cyclic refressh + // based on over or under shoot of target in current frame. + if (cyclic_refresh_active && cpi->rc.this_frame_target > 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + if (correction_factor > 1.25) { + cr->percent_refresh_adjustment = + AOMMAX(cr->percent_refresh_adjustment - 1, -5); + cr->rate_ratio_qdelta_adjustment = + AOMMAX(cr->rate_ratio_qdelta_adjustment - 0.05, -0.0); + } else if (correction_factor < 0.5) { + cr->percent_refresh_adjustment = + AOMMIN(cr->percent_refresh_adjustment + 1, 5); + cr->rate_ratio_qdelta_adjustment = + AOMMIN(cr->rate_ratio_qdelta_adjustment + 0.05, 0.25); + } + } + + if (correction_factor > 1.01) { + // We are not already at the worst allowable quality + correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit)); + rate_correction_factor = rate_correction_factor * correction_factor; + // Keep rate_correction_factor within limits + if (rate_correction_factor > MAX_BPB_FACTOR) + rate_correction_factor = MAX_BPB_FACTOR; + } else if (correction_factor < 0.99) { + // We are not already at the best allowable quality + correction_factor = 1.0 / correction_factor; + correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit)); + correction_factor = 1.0 / correction_factor; + + rate_correction_factor = rate_correction_factor * correction_factor; + + // Keep rate_correction_factor within limits + if (rate_correction_factor < MIN_BPB_FACTOR) + rate_correction_factor = MIN_BPB_FACTOR; + } + + set_rate_correction_factor(cpi, is_encode_stage, rate_correction_factor, + width, height); +} + +// Calculate rate for the given 'q'. +static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh, + double correction_factor, int q) { + const AV1_COMMON *const cm = &cpi->common; + return use_cyclic_refresh + ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor) + : av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, q, + correction_factor, + cpi->sf.hl_sf.accurate_bit_estimate); +} + +/*!\brief Searches for a Q index value predicted to give an average macro + * block rate closest to the target value. 
+ * + * Similar to find_qindex_by_rate() function, but returns a q index with a + * rate just above or below the desired rate, depending on which of the two + * rates is closer to the desired rate. + * Also, respects the selected aq_mode when computing the rate. + * + * \ingroup rate_control + * \param[in] desired_bits_per_mb Target bits per mb + * \param[in] cpi Top level encoder instance structure + * \param[in] correction_factor Current Q to rate correction factor + * \param[in] best_qindex Min allowed Q value. + * \param[in] worst_qindex Max allowed Q value. + * + * \return Returns a correction factor for the current frame + */ +static int find_closest_qindex_by_rate(int desired_bits_per_mb, + const AV1_COMP *cpi, + double correction_factor, + int best_qindex, int worst_qindex) { + const int use_cyclic_refresh = cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->apply_cyclic_refresh; + + // Find 'qindex' based on 'desired_bits_per_mb'. + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const int mid_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + + // Calculate rate difference of this q index from the desired rate. + const int curr_q = low; + const int curr_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q); + const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb) + ? desired_bits_per_mb - curr_bits_per_mb + : INT_MAX; + assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) || + curr_q == worst_qindex); + + // Calculate rate difference for previous q index too. + const int prev_q = curr_q - 1; + int prev_bit_diff; + if (curr_bit_diff == INT_MAX || curr_q == best_qindex) { + prev_bit_diff = INT_MAX; + } else { + const int prev_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q); + assert(prev_bits_per_mb > desired_bits_per_mb); + prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb; + } + + // Pick one of the two q indices, depending on which one has rate closer to + // the desired rate. + return (curr_bit_diff <= prev_bit_diff) ? 
curr_q : prev_q; +} + +int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality, + int width, int height) { + const int MBs = av1_get_MBs(width, height); + const double correction_factor = + get_rate_correction_factor(cpi, width, height); + const int target_bits_per_mb = + (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / MBs); + + int q = + find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor, + active_best_quality, active_worst_quality); + if (cpi->oxcf.rc_cfg.mode == AOM_CBR && has_no_stats_stage(cpi)) + return adjust_q_cbr(cpi, q, active_worst_quality, width, height); + + return q; +} + +static int get_active_quality(int q, int gfu_boost, int low, int high, + int *low_motion_minq, int *high_motion_minq) { + if (gfu_boost > high) { + return low_motion_minq[q]; + } else if (gfu_boost < low) { + return high_motion_minq[q]; + } else { + const int gap = high - low; + const int offset = high - gfu_boost; + const int qdiff = high_motion_minq[q] - low_motion_minq[q]; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + return low_motion_minq[q] + adjustment; + } +} + +static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, + aom_bit_depth_t bit_depth) { + int *kf_low_motion_minq; + int *kf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); + return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high, + kf_low_motion_minq, kf_high_motion_minq); +} + +static int get_gf_active_quality_no_rc(int gfu_boost, int q, + aom_bit_depth_t bit_depth) { + int *arfgf_low_motion_minq; + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return get_active_quality(q, gfu_boost, gf_low, gf_high, + arfgf_low_motion_minq, arfgf_high_motion_minq); +} + +static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, + aom_bit_depth_t bit_depth) { + return get_gf_active_quality_no_rc(p_rc->gfu_boost, q, bit_depth); +} + +static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return arfgf_high_motion_minq[q]; +} + +static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + const unsigned int curr_frame = cpi->common.current_frame.frame_number; + int active_worst_quality; + int last_q_key_frame; + int last_q_inter_frame; +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + last_q_key_frame = simulate_parallel_frame ? p_rc->temp_last_q[KEY_FRAME] + : p_rc->last_q[KEY_FRAME]; + last_q_inter_frame = simulate_parallel_frame ? p_rc->temp_last_q[INTER_FRAME] + : p_rc->last_q[INTER_FRAME]; +#else + last_q_key_frame = p_rc->last_q[KEY_FRAME]; + last_q_inter_frame = p_rc->last_q[INTER_FRAME]; +#endif + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + active_worst_quality = + curr_frame == 0 ? 
rc->worst_quality : last_q_key_frame * 2; + } else { + if (!rc->is_src_frame_alt_ref && + (refresh_frame->golden_frame || refresh_frame->bwd_ref_frame || + refresh_frame->alt_ref_frame)) { + active_worst_quality = + curr_frame == 1 ? last_q_key_frame * 5 / 4 : last_q_inter_frame; + } else { + active_worst_quality = + curr_frame == 1 ? last_q_key_frame * 2 : last_q_inter_frame * 2; + } + } + return AOMMIN(active_worst_quality, rc->worst_quality); +} + +// Adjust active_worst_quality level based on buffer level. +static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) { + // Adjust active_worst_quality: If buffer is above the optimal/target level, + // bring active_worst_quality down depending on fullness of buffer. + // If buffer is below the optimal level, let the active_worst_quality go from + // ambient Q (at buffer = optimal level) to worst_quality level + // (at buffer = critical level). + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; + const SVC *const svc = &cpi->svc; + unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; + // Buffer level below which we push active_worst to worst_quality. + int64_t critical_level = p_rc->optimal_buffer_level >> 3; + int64_t buff_lvl_step = 0; + int adjustment = 0; + int active_worst_quality; + int ambient_qp; + if (cm->current_frame.frame_type == KEY_FRAME) return rc->worst_quality; + // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] + // for the first few frames following key frame. These are both initialized + // to worst_quality and updated with (3/4, 1/4) average in postencode_update. + // So for first few frames following key, the qp of that key frame is weighted + // into the active_worst_quality setting. For SVC the key frame should + // correspond to layer (0, 0), so use that for layer context. + int avg_qindex_key = p_rc->avg_frame_qindex[KEY_FRAME]; + if (svc->number_temporal_layers > 1) { + int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers); + const LAYER_CONTEXT *lc = &svc->layer_context[layer]; + const PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; + avg_qindex_key = + AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]); + } + ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key) + ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key) + : p_rc->avg_frame_qindex[INTER_FRAME]; + ambient_qp = AOMMIN(rc->worst_quality, ambient_qp); + + if (p_rc->buffer_level > p_rc->optimal_buffer_level) { + // Adjust down. + int max_adjustment_down; // Maximum adjustment down for Q + + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && !cpi->ppi->use_svc && + (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)) { + active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp); + max_adjustment_down = AOMMIN(4, active_worst_quality / 16); + } else { + active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4); + max_adjustment_down = active_worst_quality / 3; + } + + if (max_adjustment_down) { + buff_lvl_step = + ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) / + max_adjustment_down); + if (buff_lvl_step) + adjustment = (int)((p_rc->buffer_level - p_rc->optimal_buffer_level) / + buff_lvl_step); + active_worst_quality -= adjustment; + } + } else if (p_rc->buffer_level > critical_level) { + // Adjust up from ambient Q. 
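+    // Illustrative walk-through of the branch below, with hypothetical
+    // numbers (not taken from any real configuration): if
+    // optimal_buffer_level = 8000 then critical_level = 1000 and
+    // buff_lvl_step = 7000; with buffer_level = 4500, ambient_qp = 40 and
+    // worst_quality = 63 this gives
+    // adjustment = (63 - 40) * (8000 - 4500) / 7000 = 11, so
+    // active_worst_quality = 40 + 11 = 51, i.e. roughly half way from
+    // ambient towards worst as the buffer drains from the optimal towards
+    // the critical level.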
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp); + if (critical_level) { + buff_lvl_step = (p_rc->optimal_buffer_level - critical_level); + if (buff_lvl_step) { + adjustment = (int)((rc->worst_quality - ambient_qp) * + (p_rc->optimal_buffer_level - p_rc->buffer_level) / + buff_lvl_step); + } + active_worst_quality += adjustment; + } + } else { + // Set to worst_quality if buffer is below critical level. + active_worst_quality = rc->worst_quality; + } + return active_worst_quality; +} + +// Calculate the active_best_quality level. +static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, + int active_worst_quality, + int width, int height) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + const CurrentFrame *const current_frame = &cm->current_frame; + int *rtc_minq; + const int bit_depth = cm->seq_params->bit_depth; + int active_best_quality = rc->best_quality; + ASSIGN_MINQ_TABLE(bit_depth, rtc_minq); + + if (frame_is_intra_only(cm)) { + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + if (p_rc->this_key_frame_forced) { + int qindex = p_rc->last_boosted_qindex; + double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + int delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + (last_boosted_q * 0.75), bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else if (current_frame->frame_number > 0) { + // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + double q_val; + active_best_quality = get_kf_active_quality( + p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth); + // Allow somewhat lower kf minq with small image formats. + if ((width * height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); + } + } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && + cpi->oxcf.rc_cfg.gf_cbr_boost_pct && + (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + int q = active_worst_quality; + if (rc->frames_since_key > 1 && + p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = p_rc->avg_frame_qindex[INTER_FRAME]; + } + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); + } else { + // Use the lower of active_worst_quality and recent/average Q. + FRAME_TYPE frame_type = + (current_frame->frame_number > 1) ? 
INTER_FRAME : KEY_FRAME; + if (p_rc->avg_frame_qindex[frame_type] < active_worst_quality) + active_best_quality = rtc_minq[p_rc->avg_frame_qindex[frame_type]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } + return active_best_quality; +} + +#if RT_PASSIVE_STRATEGY +static int get_q_passive_strategy(const AV1_COMP *const cpi, + const int q_candidate, const int threshold) { + const AV1_COMMON *const cm = &cpi->common; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const CurrentFrame *const current_frame = &cm->current_frame; + int sum = 0; + int count = 0; + int i = 1; + while (i < MAX_Q_HISTORY) { + int frame_id = current_frame->frame_number - i; + if (frame_id <= 0) break; + sum += p_rc->q_history[frame_id % MAX_Q_HISTORY]; + ++count; + ++i; + } + if (count > 0) { + const int avg_q = sum / count; + if (abs(avg_q - q_candidate) <= threshold) return avg_q; + } + return q_candidate; +} +#endif // RT_PASSIVE_STRATEGY + +/*!\brief Picks q and q bounds given CBR rate control parameters in \c cpi->rc. + * + * Handles the special case when using: + * - Constant bit-rate mode: \c cpi->oxcf.rc_cfg.mode == \ref AOM_CBR, and + * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are + * NOT available. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] width Coded frame width + * \param[in] height Coded frame height + * \param[out] bottom_index Bottom bound for q index (best quality) + * \param[out] top_index Top bound for q index (worst quality) + * \return Returns selected q index to be used for encoding this frame. + */ +static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width, + int height, int *bottom_index, + int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const CurrentFrame *const current_frame = &cm->current_frame; + int q; + int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi); + int active_best_quality = calc_active_best_quality_no_stats_cbr( + cpi, active_worst_quality, width, height); + assert(has_no_stats_stage(cpi)); + assert(cpi->oxcf.rc_cfg.mode == AOM_CBR); + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. 
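+  // Note: the rate_target_ratio of 2.0 passed below asks
+  // av1_compute_qdelta_by_rate() for the q index whose estimated bits/mb is
+  // double that of active_worst_quality. Since a higher target rate maps to
+  // a lower (or equal) q index under the rate model, the returned qdelta is
+  // <= 0, so *top_index is pulled down for such key frames, i.e. their worst
+  // allowed quality is made better than for inter frames.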
+ if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced && + current_frame->frame_number != 0) { + int qdelta = 0; + qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type, + active_worst_quality, 2.0); + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality, width, height); +#if RT_PASSIVE_STRATEGY + if (current_frame->frame_type != KEY_FRAME && + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + q = get_q_passive_strategy(cpi, q, 50); + } +#endif // RT_PASSIVE_STRATEGY + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) { + return gf_group->layer_depth[gf_index]; +} + +static int get_active_cq_level(const RATE_CONTROL *rc, + const PRIMARY_RATE_CONTROL *p_rc, + const AV1EncoderConfig *const oxcf, + int intra_only, aom_superres_mode superres_mode, + int superres_denom) { + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + static const double cq_adjust_threshold = 0.1; + int active_cq_level = rc_cfg->cq_level; + if (rc_cfg->mode == AOM_CQ || rc_cfg->mode == AOM_Q) { + // printf("Superres %d %d %d = %d\n", superres_denom, intra_only, + // rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1)); + if ((superres_mode == AOM_SUPERRES_QTHRESH || + superres_mode == AOM_SUPERRES_AUTO) && + superres_denom != SCALE_NUMERATOR) { + int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO; + if (intra_only && rc->frames_to_key <= 1) { + mult = 0; + } else if (intra_only) { + mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME; + } else { + mult = SUPERRES_QADJ_PER_DENOM_ARFFRAME; + } + active_cq_level = AOMMAX( + active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0); + } + } + if (rc_cfg->mode == AOM_CQ && p_rc->total_target_bits > 0) { + const double x = (double)p_rc->total_actual_bits / p_rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + return active_cq_level; +} + +/*!\brief Picks q and q bounds given non-CBR rate control params in \c cpi->rc. + * + * Handles the special case when using: + * - Any rate control other than constant bit-rate mode: + * \c cpi->oxcf.rc_cfg.mode != \ref AOM_CBR, and + * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are + * NOT available. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] width Coded frame width + * \param[in] height Coded frame height + * \param[out] bottom_index Bottom bound for q index (best quality) + * \param[out] top_index Top bound for q index (worst quality) + * \return Returns selected q index to be used for encoding this frame. 
+ */ +static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, + int height, int *bottom_index, + int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const CurrentFrame *const current_frame = &cm->current_frame; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode; + + assert(has_no_stats_stage(cpi)); + assert(rc_mode == AOM_VBR || + (!USE_UNRESTRICTED_Q_IN_CQ_MODE && rc_mode == AOM_CQ) || + rc_mode == AOM_Q); + + const int cq_level = + get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm), + cpi->superres_mode, cm->superres_scale_denominator); + const int bit_depth = cm->seq_params->bit_depth; + + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_no_stats_vbr(cpi); + int q; + int *inter_minq; + ASSIGN_MINQ_TABLE(bit_depth, inter_minq); + + if (frame_is_intra_only(cm)) { + if (rc_mode == AOM_Q) { + const int qindex = cq_level; + const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); + const int delta_qindex = + av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else if (p_rc->this_key_frame_forced) { +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + int qindex = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex + : p_rc->last_boosted_qindex; +#else + int qindex = p_rc->last_boosted_qindex; +#endif + const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + const int delta_qindex = av1_compute_qdelta( + rc, last_boosted_q, last_boosted_q * 0.75, bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + + active_best_quality = get_kf_active_quality( + p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth); + + // Allow somewhat lower kf minq with small image formats. + if ((width * height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta on active_best_quality. + { + const double q_val = + av1_convert_qindex_to_q(active_best_quality, bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); + } + } + } else if (!rc->is_src_frame_alt_ref && + (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + q = (rc->frames_since_key > 1 && + p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) + ? p_rc->avg_frame_qindex[INTER_FRAME] + : p_rc->avg_frame_qindex[KEY_FRAME]; + // For constrained quality dont allow Q less than the cq level + if (rc_mode == AOM_CQ) { + if (q < cq_level) q = cq_level; + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); + // Constrained quality use slightly lower active best. + active_best_quality = active_best_quality * 15 / 16; + } else if (rc_mode == AOM_Q) { + const int qindex = cq_level; + const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); + const int delta_qindex = + (refresh_frame->alt_ref_frame) + ? 
av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth) + : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); + } + } else { + if (rc_mode == AOM_Q) { + const int qindex = cq_level; + const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); + const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, + 0.70, 1.0, 0.85, 1.0 }; + const int delta_qindex = av1_compute_qdelta( + rc, q_val, + q_val * delta_rate[current_frame->frame_number % FIXED_GF_INTERVAL], + bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + // Use the lower of active_worst_quality and recent/average Q. + active_best_quality = + (current_frame->frame_number > 1) + ? inter_minq[p_rc->avg_frame_qindex[INTER_FRAME]] + : inter_minq[p_rc->avg_frame_qindex[KEY_FRAME]]; + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. + { + int qdelta = 0; + if (current_frame->frame_type == KEY_FRAME && + !p_rc->this_key_frame_forced && current_frame->frame_number != 0) { + qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type, + active_worst_quality, 2.0); + } else if (!rc->is_src_frame_alt_ref && + (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) { + qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type, + active_worst_quality, 1.75); + } + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + if (rc_mode == AOM_Q) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames + } else if ((current_frame->frame_type == KEY_FRAME) && + p_rc->this_key_frame_forced) { +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + q = simulate_parallel_frame ? 
p_rc->temp_last_boosted_qindex + : p_rc->last_boosted_qindex; +#else + q = p_rc->last_boosted_qindex; +#endif + } else { + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality, width, height); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75, + 1.50, 1.25, 1.15, + 1.0 }; +int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) { + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(gf_group, cpi->gf_frame_index); + const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; + const int arf_layer = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const double rate_factor = + (rf_lvl == INTER_NORMAL) ? 1.0 : arf_layer_deltas[arf_layer]; + + return av1_compute_qdelta_by_rate(cpi, frame_type, q, rate_factor); +} + +// This unrestricted Q selection on CQ mode is useful when testing new features, +// but may lead to Q being out of range on current RC restrictions +#if USE_UNRESTRICTED_Q_IN_CQ_MODE +static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width, + int height, int *bottom_index, + int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = + get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, + cm->superres_scale_denominator); + const int bit_depth = cm->seq_params->bit_depth; + const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth); + (void)width; + (void)height; + assert(has_no_stats_stage(cpi)); + assert(cpi->oxcf.rc_cfg.mode == AOM_CQ); + + *top_index = q; + *bottom_index = q; + + return q; +} +#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE + +#define STATIC_MOTION_THRESH 95 +static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, + int *active_best, int *active_worst, + int cq_level) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + int active_best_quality; + int active_worst_quality = *active_worst; + const int bit_depth = cm->seq_params->bit_depth; + + if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) { + // If the next frame is also a key frame or the current frame is the + // only frame in the sequence in AOM_Q mode, just use the cq_level + // as q. + active_best_quality = cq_level; + active_worst_quality = cq_level; + } else if (p_rc->this_key_frame_forced) { + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + double last_boosted_q; + int delta_qindex; + int qindex; +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + int last_boosted_qindex = simulate_parallel_frame + ? 
p_rc->temp_last_boosted_qindex + : p_rc->last_boosted_qindex; +#else + int last_boosted_qindex = p_rc->last_boosted_qindex; +#endif + if (is_stat_consumption_stage_twopass(cpi) && + cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + qindex = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex); + active_best_quality = qindex; + last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 1.25, bit_depth); + active_worst_quality = + AOMMIN(qindex + delta_qindex, active_worst_quality); + } else { + qindex = last_boosted_qindex; + last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 0.50, bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } + } else { + // Not forced keyframe. + double q_adj_factor = 1.0; + double q_val; + + // Baseline value derived from active_worst_quality and kf boost. + active_best_quality = + get_kf_active_quality(p_rc, active_worst_quality, bit_depth); + if (cpi->is_screen_content_type) { + active_best_quality /= 2; + } + + if (is_stat_consumption_stage_twopass(cpi) && + cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { + active_best_quality /= 3; + } + + // Allow somewhat lower kf minq with small image formats. + if ((width * height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Make a further adjustment based on the kf zero motion measure. + if (is_stat_consumption_stage_twopass(cpi)) + q_adj_factor += + 0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct); + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); + + // Tweak active_best_quality for AOM_Q mode when superres is on, as this + // will be used directly as 'q' later. + if (oxcf->rc_cfg.mode == AOM_Q && + (cpi->superres_mode == AOM_SUPERRES_QTHRESH || + cpi->superres_mode == AOM_SUPERRES_AUTO) && + cm->superres_scale_denominator != SCALE_NUMERATOR) { + active_best_quality = + AOMMAX(active_best_quality - + ((cm->superres_scale_denominator - SCALE_NUMERATOR) * + SUPERRES_QADJ_PER_DENOM_KEYFRAME), + 0); + } + } + *active_best = active_best_quality; + *active_worst = active_worst_quality; +} + +static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi, + const int is_intrl_arf_boost, + int *active_worst, + int *active_best) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + int active_best_quality = *active_best; + int active_worst_quality = *active_worst; +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + int extend_minq = simulate_parallel_frame ? p_rc->temp_extend_minq + : cpi->ppi->twopass.extend_minq; + int extend_maxq = simulate_parallel_frame ? p_rc->temp_extend_maxq + : cpi->ppi->twopass.extend_maxq; +#endif + // Extension to max or min Q if undershoot or overshoot is outside + // the permitted range. 
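+  // Broadly, extend_minq and extend_maxq are accumulated by the two-pass code
+  // from recent under- and over-shoot respectively, so the clause below lets
+  // a frame use a better minimum Q after sustained undershoot and a worse
+  // maximum Q after sustained overshoot.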
+ if (cpi->oxcf.rc_cfg.mode != AOM_Q) { + if (frame_is_intra_only(cm) || + (!rc->is_src_frame_alt_ref && + (refresh_frame->golden_frame || is_intrl_arf_boost || + refresh_frame->alt_ref_frame))) { +#if CONFIG_FPMT_TEST + active_best_quality -= extend_minq; + active_worst_quality += (extend_maxq / 2); +#else + active_best_quality -= cpi->ppi->twopass.extend_minq / 4; + active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2); +#endif + } else { +#if CONFIG_FPMT_TEST + active_best_quality -= extend_minq / 2; + active_worst_quality += extend_maxq; +#else + active_best_quality -= cpi->ppi->twopass.extend_minq / 4; + active_worst_quality += cpi->ppi->twopass.extend_maxq; +#endif + } + } + +#ifndef STRICT_RC + // Static forced key frames Q restrictions dealt with elsewhere. + if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced || + (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { + const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality); + active_worst_quality = + AOMMAX(active_worst_quality + qdelta, active_best_quality); + } +#endif + + // Modify active_best_quality for downscaled normal frames. + if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) { + int qdelta = av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, + active_best_quality, 2.0); + active_best_quality = + AOMMAX(active_best_quality + qdelta, rc->best_quality); + } + + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *active_best = active_best_quality; + *active_worst = active_worst_quality; +} + +/*!\brief Gets a Q value to use for the current frame + * + * + * Selects a Q value from a permitted range that we estimate + * will result in approximately the target number of bits. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder instance structure + * \param[in] width Width of frame + * \param[in] height Height of frame + * \param[in] active_worst_quality Max Q allowed + * \param[in] active_best_quality Min Q allowed + * + * \return The suggested Q for this frame. + */ +static int get_q(const AV1_COMP *cpi, const int width, const int height, + const int active_worst_quality, + const int active_best_quality) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int q; +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg; + int last_boosted_qindex = simulate_parallel_frame + ? p_rc->temp_last_boosted_qindex + : p_rc->last_boosted_qindex; +#else + int last_boosted_qindex = p_rc->last_boosted_qindex; +#endif + + if (cpi->oxcf.rc_cfg.mode == AOM_Q || + (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced && + cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH && + rc->frames_to_key > 1)) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames. + } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) { + // If static since last kf use better of last boosted and last kf q. 
+ if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + q = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex); + } else { + q = AOMMIN(last_boosted_qindex, + (active_best_quality + active_worst_quality) / 2); + } + q = clamp(q, active_best_quality, active_worst_quality); + } else { + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality, width, height); + if (q > active_worst_quality) { + // Special case when we are targeting the max allowed rate. + if (rc->this_frame_target < rc->max_frame_bandwidth) { + q = active_worst_quality; + } + } + q = AOMMAX(q, active_best_quality); + } + return q; +} + +// Returns |active_best_quality| for an inter frame. +// The |active_best_quality| depends on different rate control modes: +// VBR, Q, CQ, CBR. +// The returning active_best_quality could further be adjusted in +// adjust_active_best_and_worst_quality(). +static int get_active_best_quality(const AV1_COMP *const cpi, + const int active_worst_quality, + const int cq_level, const int gf_index) { + const AV1_COMMON *const cm = &cpi->common; + const int bit_depth = cm->seq_params->bit_depth; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode; + int *inter_minq; + ASSIGN_MINQ_TABLE(bit_depth, inter_minq); + int active_best_quality = 0; + const int is_intrl_arf_boost = + gf_group->update_type[gf_index] == INTNL_ARF_UPDATE; + int is_leaf_frame = + !(gf_group->update_type[gf_index] == ARF_UPDATE || + gf_group->update_type[gf_index] == GF_UPDATE || is_intrl_arf_boost); + + // TODO(jingning): Consider to rework this hack that covers issues incurred + // in lightfield setting. + if (cm->tiles.large_scale) { + is_leaf_frame = !(refresh_frame->golden_frame || + refresh_frame->alt_ref_frame || is_intrl_arf_boost); + } + const int is_overlay_frame = rc->is_src_frame_alt_ref; + + if (is_leaf_frame || is_overlay_frame) { + if (rc_mode == AOM_Q) return cq_level; + + active_best_quality = inter_minq[active_worst_quality]; + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + return active_best_quality; + } + + // Determine active_best_quality for frames that are not leaf or overlay. + int q = active_worst_quality; + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = p_rc->avg_frame_qindex[INTER_FRAME]; + } + if (rc_mode == AOM_CQ && q < cq_level) q = cq_level; + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); + // Constrained quality use slightly lower active best. 
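+  // Illustrative example with hypothetical numbers for the arf_boost_factor
+  // blend a few lines below: if active_best_quality is 20 at that point and
+  // get_gf_high_motion_quality() returns min_boost = 32, then boost = 12 and
+  // the blend gives 32 - (int)(12 * 0.5) = 26 for arf_boost_factor = 0.5,
+  // i.e. half way between the low/average-motion and high-motion bounds.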
+ if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16; + const int min_boost = get_gf_high_motion_quality(q, bit_depth); + const int boost = min_boost - active_best_quality; + active_best_quality = min_boost - (int)(boost * p_rc->arf_boost_factor); + if (!is_intrl_arf_boost) return active_best_quality; + + if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = p_rc->arf_q; + int this_height = gf_group_pyramid_level(gf_group, gf_index); + while (this_height > 1) { + active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; + --this_height; + } + return active_best_quality; +} + +// Returns the q_index for a single frame in the GOP. +// This function assumes that rc_mode == AOM_Q mode. +int av1_q_mode_get_q_index(int base_q_index, int gf_update_type, + int gf_pyramid_level, int arf_q) { + const int is_intrl_arf_boost = gf_update_type == INTNL_ARF_UPDATE; + int is_leaf_or_overlay_frame = gf_update_type == LF_UPDATE || + gf_update_type == OVERLAY_UPDATE || + gf_update_type == INTNL_OVERLAY_UPDATE; + + if (is_leaf_or_overlay_frame) return base_q_index; + + if (!is_intrl_arf_boost) return arf_q; + + int active_best_quality = arf_q; + int active_worst_quality = base_q_index; + + while (gf_pyramid_level > 1) { + active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; + --gf_pyramid_level; + } + return active_best_quality; +} + +// Returns the q_index for the ARF in the GOP. +int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth, + double arf_boost_factor) { + int active_best_quality = + get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth); + const int min_boost = get_gf_high_motion_quality(base_q_index, bit_depth); + const int boost = min_boost - active_best_quality; + return min_boost - (int)(boost * arf_boost_factor); +} + +static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width, + int height, int gf_index, + int *bottom_index, int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = + get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm), + cpi->superres_mode, cm->superres_scale_denominator); + int active_best_quality = 0; + int active_worst_quality = rc->active_worst_quality; + int q; + + if (frame_is_intra_only(cm)) { + get_intra_q_and_bounds(cpi, width, height, &active_best_quality, + &active_worst_quality, cq_level); + } else { + // Active best quality limited by previous layer. + active_best_quality = + get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index); + } + + if (cq_level > 0) active_best_quality = AOMMAX(1, active_best_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + *top_index = AOMMAX(*top_index, rc->best_quality); + *top_index = AOMMIN(*top_index, rc->worst_quality); + + *bottom_index = AOMMAX(*bottom_index, rc->best_quality); + *bottom_index = AOMMIN(*bottom_index, rc->worst_quality); + + q = active_best_quality; + + q = AOMMAX(q, rc->best_quality); + q = AOMMIN(q, rc->worst_quality); + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + + return q; +} + +/*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc. 
+ *
+ * Handles the general cases not covered by
+ * \ref rc_pick_q_and_bounds_no_stats_cbr() and
+ * \ref rc_pick_q_and_bounds_no_stats()
+ *
+ * \ingroup rate_control
+ * \param[in]   cpi           Top level encoder structure
+ * \param[in]   width         Coded frame width
+ * \param[in]   height        Coded frame height
+ * \param[in]   gf_index      Index of this frame in the golden frame group
+ * \param[out]  bottom_index  Bottom bound for q index (best quality)
+ * \param[out]  top_index     Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+                                int gf_index, int *bottom_index,
+                                int *top_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(has_no_stats_stage(cpi),
+                 cpi->oxcf.rc_cfg.mode == AOM_Q &&
+                     gf_group->update_type[gf_index] != ARF_UPDATE));
+  const int cq_level =
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+
+  if (oxcf->rc_cfg.mode == AOM_Q) {
+    return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index,
+                                       bottom_index, top_index);
+  }
+
+  int active_best_quality = 0;
+  int active_worst_quality = rc->active_worst_quality;
+  int q;
+
+  const int is_intrl_arf_boost =
+      gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+  if (frame_is_intra_only(cm)) {
+    get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+                           &active_worst_quality, cq_level);
+#ifdef STRICT_RC
+    active_best_quality = 0;
+#endif
+  } else {
+    // Active best quality limited by previous layer.
+    const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
+
+    if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) {
+      active_best_quality = get_active_best_quality(cpi, active_worst_quality,
+                                                    cq_level, gf_index);
+    } else {
+#if CONFIG_FPMT_TEST
+      const int simulate_parallel_frame =
+          cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+          cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+      int local_active_best_quality =
+          simulate_parallel_frame
+              ? p_rc->temp_active_best_quality[pyramid_level - 1]
+              : p_rc->active_best_quality[pyramid_level - 1];
+      active_best_quality = local_active_best_quality + 1;
+#else
+      active_best_quality = p_rc->active_best_quality[pyramid_level - 1] + 1;
+#endif
+
+      active_best_quality = AOMMIN(active_best_quality, active_worst_quality);
+#ifdef STRICT_RC
+      active_best_quality += (active_worst_quality - active_best_quality) / 16;
+#else
+      active_best_quality += (active_worst_quality - active_best_quality) / 2;
+#endif
+    }
+
+    // For alt_ref and GF frames (including internal arf frames) adjust the
+    // worst allowed quality as well. This ensures that even on hard
+    // sections we don't clamp the Q at the same value for arf frames and
+    // leaf (non-arf) frames. This is important to the TPL model which assumes
+    // Q drops with each arf level.
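+    // Illustrative example (hypothetical numbers): with active_best_quality
+    // = 20 and active_worst_quality = 48, the blend below gives
+    // (20 + 3 * 48 + 2) / 4 = 41, i.e. the worst allowed quality for arf/GF
+    // frames is pulled a quarter of the way towards the best quality.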
+ if (!(rc->is_src_frame_alt_ref) && + (refresh_frame->golden_frame || refresh_frame->alt_ref_frame || + is_intrl_arf_boost)) { + active_worst_quality = + (active_best_quality + (3 * active_worst_quality) + 2) / 4; + } + } + + adjust_active_best_and_worst_quality( + cpi, is_intrl_arf_boost, &active_worst_quality, &active_best_quality); + q = get_q(cpi, width, height, active_worst_quality, active_best_quality); + + // Special case when we are targeting the max allowed rate. + if (rc->this_frame_target >= rc->max_frame_bandwidth && + q > active_worst_quality) { + active_worst_quality = q; + } + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + + return q; +} + +static void rc_compute_variance_onepass_rt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + YV12_BUFFER_CONFIG const *const unscaled_src = cpi->unscaled_source; + if (unscaled_src == NULL) return; + + const uint8_t *src_y = unscaled_src->y_buffer; + const int src_ystride = unscaled_src->y_stride; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + const uint8_t *pre_y = yv12->buffers[0]; + const int pre_ystride = yv12->strides[0]; + + // TODO(yunqing): support scaled reference frames. + if (cpi->scaled_ref_buf[LAST_FRAME - 1]) return; + + for (int i = 0; i < 2; ++i) { + if (unscaled_src->widths[i] != yv12->widths[i] || + unscaled_src->heights[i] != yv12->heights[i]) { + return; + } + } + + const int num_mi_cols = cm->mi_params.mi_cols; + const int num_mi_rows = cm->mi_params.mi_rows; + const BLOCK_SIZE bsize = BLOCK_64X64; + int num_samples = 0; + // sse is computed on 64x64 blocks + const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) + ? (cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; + const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; + const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb; + + uint64_t fsse = 0; + cpi->rec_sse = 0; + + for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { + for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { + unsigned int sse; + uint8_t src[64 * 64] = { 0 }; + // Apply 4x4 block averaging/denoising on source frame. + for (int i = 0; i < 64; i += 4) { + for (int j = 0; j < 64; j += 4) { + const unsigned int avg = + aom_avg_4x4(src_y + i * src_ystride + j, src_ystride); + + for (int m = 0; m < 4; ++m) { + for (int n = 0; n < 4; ++n) src[i * 64 + j + m * 64 + n] = avg; + } + } + } + + cpi->ppi->fn_ptr[bsize].vf(src, 64, pre_y, pre_ystride, &sse); + fsse += sse; + num_samples++; + src_y += 64; + pre_y += 64; + } + src_y += (src_ystride << 6) - (sb_cols << 6); + pre_y += (pre_ystride << 6) - (sb_cols << 6); + } + assert(num_samples > 0); + // Ensure rec_sse > 0 + if (num_samples > 0) cpi->rec_sse = fsse > 0 ? fsse : 1; +} + +int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height, int gf_index, + int *bottom_index, int *top_index) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int q; + // TODO(sarahparker) merge no-stats vbr and altref q computation + // with rc_pick_q_and_bounds(). 
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if ((cpi->oxcf.rc_cfg.mode != AOM_Q ||
+       gf_group->update_type[gf_index] == ARF_UPDATE) &&
+      has_no_stats_stage(cpi)) {
+    if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+      // TODO(yunqing): the results could be used for encoder optimization.
+      cpi->rec_sse = UINT64_MAX;
+      if (cpi->sf.hl_sf.accurate_bit_estimate &&
+          cpi->common.current_frame.frame_type != KEY_FRAME)
+        rc_compute_variance_onepass_rt(cpi);
+
+      q = rc_pick_q_and_bounds_no_stats_cbr(cpi, width, height, bottom_index,
+                                            top_index);
+      // Preserve a copy of the active worst quality selected.
+      cpi->rc.active_worst_quality = *top_index;
+
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+    } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+      q = rc_pick_q_and_bounds_no_stats_cq(cpi, width, height, bottom_index,
+                                           top_index);
+#endif  // USE_UNRESTRICTED_Q_IN_CQ_MODE
+    } else {
+      q = rc_pick_q_and_bounds_no_stats(cpi, width, height, bottom_index,
+                                        top_index);
+    }
+  } else {
+    q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index,
+                             top_index);
+  }
+  if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q;
+
+  return q;
+}
+
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+    *frame_under_shoot_limit = 0;
+    *frame_over_shoot_limit = INT_MAX;
+  } else {
+    // For very small rate targets where the fractional adjustment
+    // may be tiny, make sure there is at least a minimum range.
+    assert(cpi->sf.hl_sf.recode_tolerance <= 100);
+    const int tolerance = (int)AOMMAX(
+        100, ((int64_t)cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+    *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
+    *frame_over_shoot_limit =
+        AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth);
+  }
+}
+
+void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  rc->this_frame_target = target;
+
+  // Modify frame size target when down-scaled.
+  if (av1_frame_scaled(cm) && cpi->oxcf.rc_cfg.mode != AOM_CBR) {
+    rc->this_frame_target =
+        (int)(rc->this_frame_target *
+              resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height));
+  }
+
+  // Target rate per SB64 (including partial SB64s): the << 12 below is the
+  // 64 * 64 pixels covered by one SB64.
+  rc->sb64_target_rate =
+      (int)(((int64_t)rc->this_frame_target << 12) / (width * height));
+}
+
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+  // This frame refreshes, meaning the following frames don't, unless
+  // specified by the user.
+  RATE_CONTROL *const rc = &cpi->rc;
+  rc->frames_since_golden = 0;
+}
+
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // Update the Golden frame usage counts.
+ if (cpi->refresh_frame.golden_frame || rc->is_src_frame_alt_ref) { + rc->frames_since_golden = 0; + } else if (cpi->common.show_frame) { + rc->frames_since_golden++; + } +} + +void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { + const AV1_COMMON *const cm = &cpi->common; + const CurrentFrame *const current_frame = &cm->current_frame; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; + + const int is_intrnl_arf = + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; + + const int qindex = cm->quant_params.base_qindex; + +#if RT_PASSIVE_STRATEGY + const int frame_number = current_frame->frame_number % MAX_Q_HISTORY; + p_rc->q_history[frame_number] = qindex; +#endif // RT_PASSIVE_STRATEGY + + // Update rate control heuristics + rc->projected_frame_size = (int)(bytes_used << 3); + + // Post encode loop adjustment of Q prediction. + av1_rc_update_rate_correction_factors(cpi, 0, cm->width, cm->height); + + // Update bit estimation ratio. + if (cpi->oxcf.rc_cfg.mode == AOM_CBR && + cm->current_frame.frame_type != KEY_FRAME && + cpi->sf.hl_sf.accurate_bit_estimate) { + const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex, + cm->seq_params->bit_depth); + const int this_bit_est_ratio = + (int)(rc->projected_frame_size * q / sqrt((double)cpi->rec_sse)); + cpi->rc.bit_est_ratio = + cpi->rc.bit_est_ratio == 0 + ? this_bit_est_ratio + : (7 * cpi->rc.bit_est_ratio + this_bit_est_ratio) / 8; + } + + // Keep a record of last Q and ambient average Q. + if (current_frame->frame_type == KEY_FRAME) { + p_rc->last_q[KEY_FRAME] = qindex; + p_rc->avg_frame_qindex[KEY_FRAME] = + ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); + } else { + if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) || + cpi->rc.rtc_external_ratectrl || + (!rc->is_src_frame_alt_ref && + !(refresh_frame->golden_frame || is_intrnl_arf || + refresh_frame->alt_ref_frame))) { + p_rc->last_q[INTER_FRAME] = qindex; + p_rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO( + 3 * p_rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); + p_rc->ni_frames++; + p_rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth); + p_rc->avg_q = p_rc->tot_q / p_rc->ni_frames; + // Calculate the average Q for normal inter frames (not key or GFU + // frames). + rc->ni_tot_qi += qindex; + rc->ni_av_qi = rc->ni_tot_qi / p_rc->ni_frames; + } + } + // Keep record of last boosted (KF/GF/ARF) Q value. + // If the current frame is coded at a lower Q then we also update it. + // If all mbs in this group are skipped only update if the Q value is + // better than that already stored. + // This is used to help set quality in forced key frames to reduce popping + if ((qindex < p_rc->last_boosted_qindex) || + (current_frame->frame_type == KEY_FRAME) || + (!p_rc->constrained_gf_group && + (refresh_frame->alt_ref_frame || is_intrnl_arf || + (refresh_frame->golden_frame && !rc->is_src_frame_alt_ref)))) { + p_rc->last_boosted_qindex = qindex; + } + if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex; + + update_buffer_level(cpi, rc->projected_frame_size); + rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth; + + // Rolling monitors of whether we are over or underspending used to help + // regulate min and Max Q in two pass. 
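+  // The rolling_target_bits / rolling_actual_bits updates a few lines below
+  // are 3/4-old, 1/4-new moving averages. Worked example with hypothetical
+  // numbers: rolling_target_bits = 12000 and this_frame_target = 8000 give
+  // ROUND_POWER_OF_TWO_64(3 * 12000 + 8000, 2) = 11000.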
+ if (av1_frame_scaled(cm)) + rc->this_frame_target = (int)(rc->this_frame_target / + resize_rate_factor(&cpi->oxcf.frm_dim_cfg, + cm->width, cm->height)); + if (current_frame->frame_type != KEY_FRAME) { + p_rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64( + p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2); + p_rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64( + p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2); + } + + // Actual bits spent + p_rc->total_actual_bits += rc->projected_frame_size; + p_rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0; + + if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames, + cpi->oxcf.gf_cfg.enable_auto_arf) && + refresh_frame->alt_ref_frame && + (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm))) + // Update the alternate reference frame stats as appropriate. + update_alt_ref_frame_stats(cpi); + else + // Update the Golden frame stats as appropriate. + update_golden_frame_stats(cpi); + +#if CONFIG_FPMT_TEST + /*The variables temp_avg_frame_qindex, temp_last_q, temp_avg_q, + * temp_last_boosted_qindex are introduced only for quality simulation + * purpose, it retains the value previous to the parallel encode frames. The + * variables are updated based on the update flag. + * + * If there exist show_existing_frames between parallel frames, then to + * retain the temp state do not update it. */ + int show_existing_between_parallel_frames = + (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == + INTNL_OVERLAY_UPDATE && + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2); + + if (cpi->do_frame_data_update && !show_existing_between_parallel_frames && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) { + for (int i = 0; i < FRAME_TYPES; i++) { + p_rc->temp_last_q[i] = p_rc->last_q[i]; + } + p_rc->temp_avg_q = p_rc->avg_q; + p_rc->temp_last_boosted_qindex = p_rc->last_boosted_qindex; + p_rc->temp_total_actual_bits = p_rc->total_actual_bits; + p_rc->temp_projected_frame_size = rc->projected_frame_size; + for (int i = 0; i < RATE_FACTOR_LEVELS; i++) + p_rc->temp_rate_correction_factors[i] = p_rc->rate_correction_factors[i]; + } +#endif + if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0; + if (cpi->refresh_frame.golden_frame) + rc->frame_num_last_gf_refresh = current_frame->frame_number; + rc->prev_coded_width = cm->width; + rc->prev_coded_height = cm->height; + rc->frame_number_encoded++; + rc->prev_frame_is_dropped = 0; + rc->drop_count_consec = 0; + // if (current_frame->frame_number == 1 && cm->show_frame) + /* + rc->this_frame_target = + (int)(rc->this_frame_target / resize_rate_factor(&cpi->oxcf.frm_dim_cfg, + cm->width, cm->height)); + */ +} + +void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { + // Update buffer level with zero size, update frame counters, and return. + update_buffer_level(cpi, 0); + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { + cpi->rc.frames_since_key++; + cpi->rc.frames_to_key--; + } + cpi->rc.rc_2_frame = 0; + cpi->rc.rc_1_frame = 0; + cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; + cpi->rc.prev_coded_width = cpi->common.width; + cpi->rc.prev_coded_height = cpi->common.height; + cpi->rc.prev_frame_is_dropped = 1; + // On a scene/slide change for dropped frame: reset the avg_source_sad to 0, + // otherwise the avg_source_sad can get too large and subsequent frames + // may miss the scene/slide detection. 
+ if (cpi->rc.high_source_sad) cpi->rc.avg_source_sad = 0; + if (cpi->ppi->use_svc && cpi->svc.number_spatial_layers > 1) { + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = true; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = true; + } +} + +int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const double mid_q = av1_convert_qindex_to_q(mid, bit_depth); + if (mid_q < desired_q) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q || + low == worst_qindex); + return low; +} + +int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + aom_bit_depth_t bit_depth) { + const int start_index = + av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality); + const int target_index = + av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality); + return target_index - start_index; +} + +// Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex], +// assuming 'correction_factor' is 1.0. +// To be precise, 'q_index' is the smallest integer, for which the corresponding +// bits per mb <= desired_bits_per_mb. +// If no such q index is found, returns 'worst_qindex'. +static int find_qindex_by_rate(const AV1_COMP *const cpi, + int desired_bits_per_mb, FRAME_TYPE frame_type, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const int mid_bits_per_mb = + av1_rc_bits_per_mb(cpi, frame_type, mid, 1.0, 0); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + assert(av1_rc_bits_per_mb(cpi, frame_type, low, 1.0, 0) <= + desired_bits_per_mb || + low == worst_qindex); + return low; +} + +int av1_compute_qdelta_by_rate(const AV1_COMP *cpi, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio) { + const RATE_CONTROL *rc = &cpi->rc; + + // Look up the current projected bits per block for the base index + const int base_bits_per_mb = + av1_rc_bits_per_mb(cpi, frame_type, qindex, 1.0, 0); + + // Find the target bits per mb based on the base value and given ratio. 
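+  // Illustrative example (hypothetical numbers): if base_bits_per_mb is 400
+  // at qindex 60 and rate_target_ratio is 2.0, the search below looks for the
+  // smallest q index whose estimated rate is <= 800 bits/mb. Because the
+  // estimated bits/mb is non-increasing in q, that index is <= 60, so the
+  // returned qdelta is negative or zero for ratios above 1.0 (and positive or
+  // zero for ratios below 1.0).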
+  const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+  const int target_index = find_qindex_by_rate(
+      cpi, target_bits_per_mb, frame_type, rc->best_quality, rc->worst_quality);
+  return target_index - qindex;
+}
+
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+                                  RATE_CONTROL *const rc) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // Special case code for 1 pass fixed Q mode tests
+  if ((has_no_stats_stage(cpi)) && (oxcf->rc_cfg.mode == AOM_Q)) {
+    rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+    rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+    rc->static_scene_max_gf_interval = rc->min_gf_interval + 1;
+  } else {
+    // Set Maximum gf/arf interval
+    rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+    rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+    if (rc->min_gf_interval == 0)
+      rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+          oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, cpi->framerate);
+    if (rc->max_gf_interval == 0)
+      rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+          cpi->framerate, rc->min_gf_interval);
+    /*
+     * Extended max interval for genuinely static scenes like slide shows.
+     * The number of stats available in the case of LAP is limited,
+     * hence setting to max_gf_interval.
+     */
+    if (cpi->ppi->lap_enabled)
+      rc->static_scene_max_gf_interval = rc->max_gf_interval + 1;
+    else
+      rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+
+    if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+      rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+    // Clamp min to max
+    rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+  }
+}
+
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int vbr_max_bits;
+  const int MBs = av1_get_MBs(width, height);
+
+  rc->avg_frame_bandwidth =
+      (int)round(oxcf->rc_cfg.target_bandwidth / cpi->framerate);
+  rc->min_frame_bandwidth =
+      (int)(rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100);
+
+  rc->min_frame_bandwidth =
+      AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+  // A maximum bitrate for a frame is defined.
+  // The baseline for this aligns with HW implementations that
+  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user specified max q (e.g. when the user
+  // specifies lossless encode).
+  vbr_max_bits =
+      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) /
+            100);
+  rc->max_frame_bandwidth =
+      AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+  av1_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR...adjustment to the frame target based on error from previous frames
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FPMT_TEST
+  const int simulate_parallel_frame =
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+      cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+  int64_t vbr_bits_off_target = simulate_parallel_frame
+                                    ? cpi->ppi->p_rc.temp_vbr_bits_off_target
+                                    : p_rc->vbr_bits_off_target;
+#else
+  int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target;
+#endif
+  const int stats_count =
+      cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL
+          ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count
+          : 0;
+  const int frame_window = AOMMIN(
+      16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
+  assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100);
+  if (frame_window > 0) {
+    const int max_delta = (int)AOMMIN(
+        abs((int)(vbr_bits_off_target / frame_window)),
+        ((int64_t)(*this_frame_target) * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
+
+    // vbr_bits_off_target > 0 means we have extra bits to spend
+    // vbr_bits_off_target < 0 means we are currently overshooting
+    *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta;
+  }
+
+#if CONFIG_FPMT_TEST
+  int64_t vbr_bits_off_target_fast =
+      simulate_parallel_frame ? cpi->ppi->p_rc.temp_vbr_bits_off_target_fast
+                              : p_rc->vbr_bits_off_target_fast;
+#endif
+  // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+  if (!frame_is_kf_gf_arf(cpi) &&
+#if CONFIG_FPMT_TEST
+      vbr_bits_off_target_fast &&
+#else
+      p_rc->vbr_bits_off_target_fast &&
+#endif
+      !rc->is_src_frame_alt_ref) {
+    int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+    int fast_extra_bits;
+#if CONFIG_FPMT_TEST
+    fast_extra_bits = (int)AOMMIN(vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits =
+        (int)AOMMIN(fast_extra_bits,
+                    AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8));
+#else
+    fast_extra_bits =
+        (int)AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits = (int)AOMMIN(
+        fast_extra_bits,
+        AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8));
+#endif
+    if (fast_extra_bits > 0) {
+      // Update this_frame_target only if additional bits are available from
+      // local undershoot.
+      *this_frame_target += (int)fast_extra_bits;
+    }
+    // Store the fast_extra_bits of the frame and reduce it from
+    // vbr_bits_off_target_fast during postencode stage.
+    rc->frame_level_fast_extra_bits = fast_extra_bits;
+    // Retaining the condition to update during postencode stage since
+    // fast_extra_bits are calculated based on vbr_bits_off_target_fast.
+    cpi->do_update_vbr_bits_off_target_fast = 1;
+  }
+}
+
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int target_rate = rc->base_frame_target;
+
+  // Correction to rate target based on prior over or under shoot.
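+  // Illustration (hypothetical numbers): with a 100000 bit base target, a
+  // 16 frame correction window and 40000 bits of accumulated undershoot,
+  // vbr_rate_correction() adds min(40000 / 16, 50% of the target) = 2500
+  // bits; accumulated overshoot subtracts the analogous amount.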
+ if (cpi->oxcf.rc_cfg.mode == AOM_VBR || cpi->oxcf.rc_cfg.mode == AOM_CQ) + vbr_rate_correction(cpi, &target_rate); + av1_rc_set_frame_target(cpi, target_rate, width, height); +} + +int av1_calc_pframe_target_size_one_pass_vbr( + const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) { + static const int af_ratio = 10; + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int64_t target; +#if USE_ALTREF_FOR_ONE_PASS + if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE || + frame_update_type == ARF_UPDATE) { + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * + af_ratio) / + (p_rc->baseline_gf_interval + af_ratio - 1); + } else { + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) / + (p_rc->baseline_gf_interval + af_ratio - 1); + } + if (target > INT_MAX) target = INT_MAX; +#else + target = rc->avg_frame_bandwidth; +#endif + return av1_rc_clamp_pframe_target_size(cpi, (int)target, frame_update_type); +} + +int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) { + static const int kf_ratio = 25; + const RATE_CONTROL *rc = &cpi->rc; + const int64_t target = (int64_t)rc->avg_frame_bandwidth * kf_ratio; + return av1_rc_clamp_iframe_target_size(cpi, target); +} + +int av1_calc_pframe_target_size_one_pass_cbr( + const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; + const RateControlCfg *rc_cfg = &oxcf->rc_cfg; + const int64_t diff = p_rc->optimal_buffer_level - p_rc->buffer_level; + const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100; + int min_frame_target = + AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); + int target; + + if (rc_cfg->gf_cbr_boost_pct) { + const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100; + if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) { + target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * + af_ratio_pct) / + (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + } else { + target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) / + (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + } + } else { + target = rc->avg_frame_bandwidth; + } + if (cpi->ppi->use_svc) { + // Note that for layers, avg_frame_bandwidth is the cumulative + // per-frame-bandwidth. For the target size of this frame, use the + // layer average frame size (i.e., non-cumulative per-frame-bw). + int layer = + LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + target = lc->avg_frame_size; + min_frame_target = AOMMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS); + } + if (diff > 0) { + // Lower the target bandwidth for this frame. + const int pct_low = + (int)AOMMIN(diff / one_pct_bits, rc_cfg->under_shoot_pct); + target -= (target * pct_low) / 200; + } else if (diff < 0) { + // Increase the target bandwidth for this frame. 
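+    // Illustration (hypothetical numbers): with optimal_buffer_level = 600000
+    // and buffer_level = 900000, diff = -300000 and one_pct_bits = 6001, so
+    // pct_high = min(49, over_shoot_pct) and the target grows by
+    // target * 49 / 200, roughly 24%.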
+ const int pct_high = + (int)AOMMIN(-diff / one_pct_bits, rc_cfg->over_shoot_pct); + target += (target * pct_high) / 200; + } + if (rc_cfg->max_inter_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + return AOMMAX(min_frame_target, target); +} + +int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { + const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; + int64_t target; + if (cpi->common.current_frame.frame_number == 0) { + target = ((p_rc->starting_buffer_level / 2) > INT_MAX) + ? INT_MAX + : (int)(p_rc->starting_buffer_level / 2); + if (cpi->svc.number_temporal_layers > 1 && target < (INT_MAX >> 2)) { + target = target << AOMMIN(2, (cpi->svc.number_temporal_layers - 1)); + } + } else { + int kf_boost = 32; + int framerate = (int)round(cpi->framerate); + + kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16)); + if (rc->frames_since_key < framerate / 2) { + kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2)); + } + target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4; + } + return av1_rc_clamp_iframe_target_size(cpi, target); +} + +static void set_golden_update(AV1_COMP *const cpi) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int divisor = 10; + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) + divisor = cpi->cyclic_refresh->percent_refresh; + + // Set minimum gf_interval for GF update to a multiple of the refresh period, + // with some max limit. Depending on past encoding stats, GF flag may be + // reset and update may not occur until next baseline_gf_interval. + const int gf_length_mult[2] = { 8, 4 }; + if (divisor > 0) + p_rc->baseline_gf_interval = + AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * (100 / divisor), + MAX_GF_INTERVAL_RT); + else + p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT; + if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40) + p_rc->baseline_gf_interval = 16; +} + +static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + + set_golden_update(cpi); + + if (p_rc->baseline_gf_interval > rc->frames_to_key && + cpi->oxcf.kf_cfg.auto_key) + p_rc->baseline_gf_interval = rc->frames_to_key; + p_rc->gfu_boost = DEFAULT_GF_BOOST_RT; + p_rc->constrained_gf_group = + (p_rc->baseline_gf_interval >= rc->frames_to_key && + cpi->oxcf.kf_cfg.auto_key) + ? 1 + : 0; + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; + cpi->gf_frame_index = 0; + // SVC does not use GF as periodic boost. + // TODO(marpan): Find better way to disable this for SVC. 
+ if (cpi->ppi->use_svc) { + SVC *const svc = &cpi->svc; + p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1; + p_rc->gfu_boost = 1; + p_rc->constrained_gf_group = 0; + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; + for (int layer = 0; + layer < svc->number_spatial_layers * svc->number_temporal_layers; + ++layer) { + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval; + lc->p_rc.gfu_boost = p_rc->gfu_boost; + lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group; + lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due; + lc->group_index = 0; + } + } + gf_group->size = p_rc->baseline_gf_interval; + gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE; + gf_group->refbuf_state[cpi->gf_frame_index] = + (frame_type == KEY_FRAME) ? REFBUF_RESET : REFBUF_UPDATE; +} + +void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + const int resize_pending = is_frame_resize_pending(cpi); + if (!resize_pending && !rc->high_source_sad) { + // Check if we should disable GF refresh (if period is up), + // or force a GF refresh update (if we are at least halfway through + // period) based on QP. Look into add info on segment deltaq. + PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; + const int avg_qp = p_rc->avg_frame_qindex[INTER_FRAME]; + const int allow_gf_update = + rc->frames_till_gf_update_due <= (p_rc->baseline_gf_interval - 10); + int gf_update_changed = 0; + int thresh = 87; + if ((cm->current_frame.frame_number - cpi->rc.frame_num_last_gf_refresh) < + FIXED_GF_INTERVAL_RT && + rc->frames_till_gf_update_due == 1 && + cm->quant_params.base_qindex > avg_qp) { + // Disable GF refresh since QP is above the running average QP. + rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 0; + gf_update_changed = 1; + cpi->refresh_frame.golden_frame = 0; + } else if (allow_gf_update && + ((cm->quant_params.base_qindex < thresh * avg_qp / 100) || + (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 20))) { + // Force refresh since QP is well below average QP or this is a high + // motion frame. + rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 1; + gf_update_changed = 1; + cpi->refresh_frame.golden_frame = 1; + } + if (gf_update_changed) { + set_baseline_gf_interval(cpi, INTER_FRAME); + int refresh_mask = 0; + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + int ref_frame_map_idx = rtc_ref->ref_idx[i]; + refresh_mask |= rtc_ref->refresh[ref_frame_map_idx] + << ref_frame_map_idx; + } + cm->current_frame.refresh_frame_flags = refresh_mask; + } + } +} + +/*!\brief Setup the reference prediction structure for 1 pass real-time + * + * Set the reference prediction structure for 1 layer. + * Current structue is to use 3 references (LAST, GOLDEN, ALTREF), + * where ALT_REF always behind current by lag_alt frames, and GOLDEN is + * either updated on LAST with period baseline_gf_interval (fixed slot) + * or always behind current by lag_gld (gld_fixed_slot = 0, lag_gld <= 7). + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] gf_update Flag to indicate if GF is updated + * + * \remark Nothing is returned. 
Instead the settings for the prediction + * structure are set in \c cpi-ext_flags; and the buffer slot index + * (for each of 7 references) and refresh flags (for each of the 8 slots) + * are set in \c cpi->svc.ref_idx[] and \c cpi->svc.refresh[]. + */ +void av1_set_rtc_reference_structure_one_layer(AV1_COMP *cpi, int gf_update) { + AV1_COMMON *const cm = &cpi->common; + ExternalFlags *const ext_flags = &cpi->ext_flags; + RATE_CONTROL *const rc = &cpi->rc; + ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = + &ext_flags->refresh_frame; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + unsigned int frame_number = (cpi->oxcf.rc_cfg.drop_frames_water_mark) + ? rc->frame_number_encoded + : cm->current_frame.frame_number; + unsigned int lag_alt = 4; + int last_idx = 0; + int last_idx_refresh = 0; + int gld_idx = 0; + int alt_ref_idx = 0; + int last2_idx = 0; + ext_refresh_frame_flags->update_pending = 1; + ext_flags->ref_frame_flags = 0; + ext_refresh_frame_flags->last_frame = 1; + ext_refresh_frame_flags->golden_frame = 0; + ext_refresh_frame_flags->alt_ref_frame = 0; + // Decide altref lag adaptively for rt + if (cpi->sf.rt_sf.sad_based_adp_altref_lag) { + lag_alt = 6; + const uint64_t th_frame_sad[4][3] = { + { 18000, 18000, 18000 }, // HDRES CPU 9 + { 25000, 25000, 25000 }, // MIDRES CPU 9 + { 40000, 30000, 20000 }, // HDRES CPU10 + { 30000, 25000, 20000 } // MIDRES CPU 10 + }; + int th_idx = cpi->sf.rt_sf.sad_based_adp_altref_lag - 1; + assert(th_idx < 4); + if (rc->avg_source_sad > th_frame_sad[th_idx][0]) + lag_alt = 3; + else if (rc->avg_source_sad > th_frame_sad[th_idx][1]) + lag_alt = 4; + else if (rc->avg_source_sad > th_frame_sad[th_idx][2]) + lag_alt = 5; + } + // This defines the reference structure for 1 layer (non-svc) RTC encoding. + // To avoid the internal/default reference structure for non-realtime + // overwriting this behavior, we use the "svc" ref parameters from the + // external control SET_SVC_REF_FRAME_CONFIG. + // TODO(marpan): rename that control and the related internal parameters + // to rtc_ref. + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) rtc_ref->ref_idx[i] = 7; + for (int i = 0; i < REF_FRAMES; ++i) rtc_ref->refresh[i] = 0; + // Set the reference frame flags. + ext_flags->ref_frame_flags ^= AOM_LAST_FLAG; + if (!cpi->sf.rt_sf.force_only_last_ref) { + ext_flags->ref_frame_flags ^= AOM_ALT_FLAG; + ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; + if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) + ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG; + } + const int sh = 6; + // Moving index slot for last: 0 - (sh - 1). + if (frame_number > 1) last_idx = ((frame_number - 1) % sh); + // Moving index for refresh of last: one ahead for next frame. + last_idx_refresh = (frame_number % sh); + gld_idx = 6; + + // Moving index for alt_ref, lag behind LAST by lag_alt frames. + if (frame_number > lag_alt) alt_ref_idx = ((frame_number - lag_alt) % sh); + if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) { + // Moving index for LAST2, lag behind LAST by 2 frames. + if (frame_number > 2) last2_idx = ((frame_number - 2) % sh); + } + rtc_ref->ref_idx[0] = last_idx; // LAST + rtc_ref->ref_idx[1] = last_idx_refresh; // LAST2 (for refresh of last). + if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) { + rtc_ref->ref_idx[1] = last2_idx; // LAST2 + rtc_ref->ref_idx[2] = last_idx_refresh; // LAST3 (for refresh of last). + } + rtc_ref->ref_idx[3] = gld_idx; // GOLDEN + rtc_ref->ref_idx[6] = alt_ref_idx; // ALT_REF + // Refresh this slot, which will become LAST on next frame. 
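+  // Illustration (hypothetical frame number): at frame_number = 10 with
+  // sh = 6 and lag_alt = 4, last_idx = 9 % 6 = 3, last_idx_refresh =
+  // 10 % 6 = 4 and alt_ref_idx = 6 % 6 = 0, while GOLDEN stays pinned in
+  // slot 6.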
+ rtc_ref->refresh[last_idx_refresh] = 1; + // Update GOLDEN on period for fixed slot case. + if (gf_update && cm->current_frame.frame_type != KEY_FRAME) { + ext_refresh_frame_flags->golden_frame = 1; + rtc_ref->refresh[gld_idx] = 1; + } + rtc_ref->gld_idx_1layer = gld_idx; + // Set the flag to reduce the number of reference frame buffers used. + // This assumes that slot 7 is never used. + cpi->rt_reduce_num_ref_buffers = 1; + cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[0] < 7); + cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[1] < 7); + cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[3] < 7); + cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[6] < 7); + if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) + cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7); +} + +/*!\brief Check for scene detection, for 1 pass real-time mode. + * + * Compute average source sad (temporal sad: between current source and + * previous source) over a subset of superblocks. Use this is detect big changes + * in content and set the \c cpi->rc.high_source_sad flag. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] frame_input Current and last input source frames + * + * \remark Nothing is returned. Instead the flag \c cpi->rc.high_source_sad + * is set if scene change is detected, and \c cpi->rc.avg_source_sad is updated. + */ +static void rc_scene_detection_onepass_rt(AV1_COMP *cpi, + const EncodeFrameInput *frame_input) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + YV12_BUFFER_CONFIG const *const unscaled_src = frame_input->source; + YV12_BUFFER_CONFIG const *const unscaled_last_src = frame_input->last_source; + uint8_t *src_y; + int src_ystride; + int src_width; + int src_height; + uint8_t *last_src_y; + int last_src_ystride; + int last_src_width; + int last_src_height; + int width = cm->width; + int height = cm->height; + if (cpi->svc.number_spatial_layers > 1) { + width = cpi->oxcf.frm_dim_cfg.width; + height = cpi->oxcf.frm_dim_cfg.height; + } + if (width != cm->render_width || height != cm->render_height || + unscaled_src == NULL || unscaled_last_src == NULL) { + aom_free(cpi->src_sad_blk_64x64); + cpi->src_sad_blk_64x64 = NULL; + } + if (unscaled_src == NULL || unscaled_last_src == NULL) return; + src_y = unscaled_src->y_buffer; + src_ystride = unscaled_src->y_stride; + src_width = unscaled_src->y_width; + src_height = unscaled_src->y_height; + last_src_y = unscaled_last_src->y_buffer; + last_src_ystride = unscaled_last_src->y_stride; + last_src_width = unscaled_last_src->y_width; + last_src_height = unscaled_last_src->y_height; + if (src_width != last_src_width || src_height != last_src_height) { + aom_free(cpi->src_sad_blk_64x64); + cpi->src_sad_blk_64x64 = NULL; + return; + } + rc->high_source_sad = 0; + rc->percent_blocks_with_motion = 0; + rc->max_block_source_sad = 0; + rc->prev_avg_source_sad = rc->avg_source_sad; + int num_mi_cols = cm->mi_params.mi_cols; + int num_mi_rows = cm->mi_params.mi_rows; + if (cpi->svc.number_spatial_layers > 1) { + num_mi_cols = cpi->svc.mi_cols_full_resoln; + num_mi_rows = cpi->svc.mi_rows_full_resoln; + } + int num_zero_temp_sad = 0; + uint32_t min_thresh = 10000; + if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { + min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 + ? 50000 + : 100000; + } + const BLOCK_SIZE bsize = BLOCK_64X64; + // Loop over sub-sample of frame, compute average sad over 64x64 blocks. 
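+  // Illustration (hypothetical clip): for non-screen 1080p content
+  // min_thresh is 100000 and thresh is 6, so a scene change is flagged
+  // further below only when the new per-block average SAD exceeds
+  // max(100000, 6 * the running avg_source_sad).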
+ uint64_t avg_sad = 0; + uint64_t tmp_sad = 0; + int num_samples = 0; + const int thresh = + cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 5 : 6; + // SAD is computed on 64x64 blocks + const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) + ? (cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; + const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; + const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb; + uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5 + int num_low_var_high_sumdiff = 0; + int light_change = 0; + // Flag to check light change or not. + const int check_light_change = 0; + // TODO(marpan): There seems some difference along the bottom border when + // using the source_last_tl0 for last_source (used for temporal layers or + // when previous frame is dropped). + // Remove this bord parameter when issue is resolved: difference is that + // non-zero sad exists along bottom border even though source is static. + const int border = + rc->prev_frame_is_dropped || cpi->svc.number_temporal_layers > 1; + // Store blkwise SAD for later use + if (width == cm->render_width && height == cm->render_height) { + if (cpi->src_sad_blk_64x64 == NULL) { + CHECK_MEM_ERROR(cm, cpi->src_sad_blk_64x64, + (uint64_t *)aom_calloc(sb_cols * sb_rows, + sizeof(*cpi->src_sad_blk_64x64))); + } + } + // Avoid bottom and right border. + for (int sbi_row = 0; sbi_row < sb_rows - border; ++sbi_row) { + for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { + tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); + if (cpi->src_sad_blk_64x64 != NULL) + cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad; + if (check_light_change) { + unsigned int sse, variance; + variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, + last_src_ystride, &sse); + // Note: sse - variance = ((sum * sum) >> 12) + // Detect large lighting change. + if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) { + num_low_var_high_sumdiff++; + } + } + avg_sad += tmp_sad; + num_samples++; + if (tmp_sad == 0) num_zero_temp_sad++; + if (tmp_sad > rc->max_block_source_sad) + rc->max_block_source_sad = tmp_sad; + + src_y += 64; + last_src_y += 64; + } + src_y += (src_ystride << 6) - (sb_cols << 6); + last_src_y += (last_src_ystride << 6) - (sb_cols << 6); + } + if (check_light_change && num_samples > 0 && + num_low_var_high_sumdiff > (num_samples >> 1)) + light_change = 1; + if (num_samples > 0) avg_sad = avg_sad / num_samples; + // Set high_source_sad flag if we detect very high increase in avg_sad + // between current and previous frame value(s). Use minimum threshold + // for cases where there is small change from content that is completely + // static. + if (!light_change && + avg_sad > + AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) && + rc->frames_since_key > 1 + cpi->svc.number_spatial_layers && + num_zero_temp_sad < 3 * (num_samples >> 2)) + rc->high_source_sad = 1; + else + rc->high_source_sad = 0; + rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2; + rc->frame_source_sad = avg_sad; + if (num_samples > 0) + rc->percent_blocks_with_motion = + ((num_samples - num_zero_temp_sad) * 100) / num_samples; + // Scene detection is only on base SLO, and using full/orignal resolution. + // Pass the state to the upper spatial layers. 
+  if (cpi->svc.number_spatial_layers > 1) {
+    SVC *svc = &cpi->svc;
+    for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+      int tl = svc->temporal_layer_id;
+      const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      RATE_CONTROL *lrc = &lc->rc;
+      lrc->high_source_sad = rc->high_source_sad;
+      lrc->frame_source_sad = rc->frame_source_sad;
+      lrc->avg_source_sad = rc->avg_source_sad;
+      lrc->percent_blocks_with_motion = rc->percent_blocks_with_motion;
+      lrc->max_block_source_sad = rc->max_block_source_sad;
+    }
+  }
+}
+
+/*!\brief Set the GF baseline interval for 1 pass real-time mode.
+ *
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       frame_type   frame type
+ *
+ * \return Return GF update flag, and update the \c cpi->rc with
+ * the next GF interval settings.
+ */
+static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi,
+                                             FRAME_TYPE frame_type) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  int gf_update = 0;
+  const int resize_pending = is_frame_resize_pending(cpi);
+  // GF update based on frames_till_gf_update_due, also
+  // force update on resize pending frame or for scene change.
+  if ((resize_pending || rc->high_source_sad ||
+       rc->frames_till_gf_update_due == 0) &&
+      cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
+    set_baseline_gf_interval(cpi, frame_type);
+    gf_update = 1;
+  }
+  return gf_update;
+}
+
+static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
+                            int prev_width, int prev_height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  SVC *const svc = &cpi->svc;
+  int target_bits_per_frame;
+  int active_worst_quality;
+  int qindex;
+  double tot_scale_change = (double)(resize_width * resize_height) /
+                            (double)(prev_width * prev_height);
+  // Disable the skip mv search for svc on resize frame.
+  svc->skip_mvsearch_last = 0;
+  svc->skip_mvsearch_gf = 0;
+  svc->skip_mvsearch_altref = 0;
+  // Reset buffer level to optimal, update target size.
+  p_rc->buffer_level = p_rc->optimal_buffer_level;
+  p_rc->bits_off_target = p_rc->optimal_buffer_level;
+  rc->this_frame_target =
+      av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME);
+  target_bits_per_frame = rc->this_frame_target;
+  if (tot_scale_change > 4.0)
+    p_rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+  else if (tot_scale_change > 1.0)
+    p_rc->avg_frame_qindex[INTER_FRAME] =
+        (p_rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1;
+  active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+  qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+                             active_worst_quality, resize_width, resize_height);
+  // If resize is down, check if projected q index is close to worst_quality,
+  // and if so, reduce the rate correction factor (since likely can afford
+  // lower q for resized frame).
+  if (tot_scale_change < 1.0 && qindex > 90 * rc->worst_quality / 100)
+    p_rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+  // If resize is back up: check if projected q index is too much above the
+  // previous index, and if so, reduce the rate correction factor
+  // (since we prefer to keep q for the resized frame close to the previous q).
+  // Also check if projected qindex is close to previous qindex, if so
+  // increase correction factor (to push qindex higher and avoid overshoot).
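+  // Illustration (hypothetical numbers): scaling back up with
+  // last_q[INTER_FRAME] = 100, a projected qindex above 130 drops the
+  // correction factor by 20%, while a projected qindex of 120 or below
+  // raises it by 50%.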
+  if (tot_scale_change >= 1.0) {
+    if (tot_scale_change < 4.0 &&
+        qindex > 130 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
+    if (qindex <= 120 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 1.5;
+  }
+  if (svc->number_temporal_layers > 1) {
+    // Apply the same rate control reset to all temporal layers.
+    for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
+      LAYER_CONTEXT *lc = NULL;
+      lc = &svc->layer_context[svc->spatial_layer_id *
+                                   svc->number_temporal_layers +
+                               tl];
+      lc->rc.resize_state = rc->resize_state;
+      lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
+      lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
+      lc->p_rc.rate_correction_factors[INTER_NORMAL] =
+          p_rc->rate_correction_factors[INTER_NORMAL];
+      lc->p_rc.avg_frame_qindex[INTER_FRAME] =
+          p_rc->avg_frame_qindex[INTER_FRAME];
+    }
+  }
+}
+
+/*!\brief Check for resize based on Q, for 1 pass real-time mode.
+ *
+ * Check if we should resize, based on average QP from past x frames.
+ * Only allow for resize at most 1/2 scale down for now. Scaling factor
+ * for each step may be 3/4 or 1/2.
+ *
+ * \ingroup rate_control
+ * \param[in]       cpi            Top level encoder structure
+ *
+ * \remark Return resized width/height in \c cpi->resize_pending_params,
+ * and update some resize counters in \c rc.
+ */
+static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  RESIZE_ACTION resize_action = NO_RESIZE;
+  const int avg_qp_thr1 = 70;
+  const int avg_qp_thr2 = 50;
+  // Don't allow for resized frame to go below 160x90, resize in steps of 3/4.
+  const int min_width = (160 * 4) / 3;
+  const int min_height = (90 * 4) / 3;
+  int down_size_on = 1;
+  // Don't resize on key frame; reset the counters on key frame.
+  if (cm->current_frame.frame_type == KEY_FRAME) {
+    rc->resize_avg_qp = 0;
+    rc->resize_count = 0;
+    rc->resize_buffer_underflow = 0;
+    return;
+  }
+  // No resizing down if frame size is below some limit.
+  if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0;
+
+  // Resize based on average buffer underflow and QP over some window.
+  // Ignore samples close to key frame, since QP is usually high after key.
+  if (cpi->rc.frames_since_key > cpi->framerate) {
+    const int window = AOMMIN(30, (int)(2 * cpi->framerate));
+    rc->resize_avg_qp += p_rc->last_q[INTER_FRAME];
+    if (cpi->ppi->p_rc.buffer_level <
+        (int)(30 * p_rc->optimal_buffer_level / 100))
+      ++rc->resize_buffer_underflow;
+    ++rc->resize_count;
+    // Check for resize action every "window" frames.
+    if (rc->resize_count >= window) {
+      int avg_qp = rc->resize_avg_qp / rc->resize_count;
+      // Resize down if buffer level has underflowed sufficient amount in past
+      // window, and we are at original or 3/4 of original resolution.
+      // Resize back up if average QP is low, and we are currently in a resized
+      // down state, i.e. 1/2 or 3/4 of original resolution.
+      // Currently, use a flag to turn 3/4 resizing feature on/off.
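+      // Illustration (hypothetical window of 30 frames): more than 7 frames
+      // of buffer underflow trigger the next downscale step, while an
+      // average QP below 70% of worst_quality allows scaling back up (below
+      // 50% jumps straight back to the original size from the 1/2 state).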
+      if (rc->resize_buffer_underflow > (rc->resize_count >> 2) &&
+          down_size_on) {
+        if (rc->resize_state == THREE_QUARTER) {
+          resize_action = DOWN_ONEHALF;
+          rc->resize_state = ONE_HALF;
+        } else if (rc->resize_state == ORIG) {
+          resize_action = DOWN_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      } else if (rc->resize_state != ORIG &&
+                 avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) {
+        if (rc->resize_state == THREE_QUARTER ||
+            avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100) {
+          resize_action = UP_ORIG;
+          rc->resize_state = ORIG;
+        } else if (rc->resize_state == ONE_HALF) {
+          resize_action = UP_THREEFOUR;
+          rc->resize_state = THREE_QUARTER;
+        }
+      }
+      // Reset for next window measurement.
+      rc->resize_avg_qp = 0;
+      rc->resize_count = 0;
+      rc->resize_buffer_underflow = 0;
+    }
+  }
+  // If the decision is to resize, reset some quantities, and check if we
+  // should reduce the rate correction factor.
+  if (resize_action != NO_RESIZE) {
+    int resize_width = cpi->oxcf.frm_dim_cfg.width;
+    int resize_height = cpi->oxcf.frm_dim_cfg.height;
+    int resize_scale_num = 1;
+    int resize_scale_den = 1;
+    if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+      resize_scale_num = 3;
+      resize_scale_den = 4;
+    } else if (resize_action == DOWN_ONEHALF) {
+      resize_scale_num = 1;
+      resize_scale_den = 2;
+    }
+    resize_width = resize_width * resize_scale_num / resize_scale_den;
+    resize_height = resize_height * resize_scale_num / resize_scale_den;
+    resize_reset_rc(cpi, resize_width, resize_height, cm->width, cm->height);
+  }
+  return;
+}
+
+static INLINE int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  AV1_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+
+  // Very first frame has to be key frame.
+  if (cm->current_frame.frame_number == 0) return 1;
+  // Set key frame if forced by frame flags.
+  if (frame_flags & FRAMEFLAGS_KEY) return 1;
+  if (!cpi->ppi->use_svc) {
+    // Non-SVC
+    if (cpi->oxcf.kf_cfg.auto_key && rc->frames_to_key == 0) return 1;
+  } else {
+    // SVC
+    if (svc->spatial_layer_id == 0 &&
+        (cpi->oxcf.kf_cfg.auto_key &&
+         (cpi->oxcf.kf_cfg.key_freq_max == 0 ||
+          svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)))
+      return 1;
+  }
+
+  return 0;
+}
+
+// Set to true if this frame is a recovery frame, for 1 layer RPS,
+// and whether we should apply some boost (QP, adjust speed features, etc).
+// Recovery frame here means frame whose closest reference suddenly
+// switched from previous frame to one much further away.
+// TODO(marpan): Consider adding on/off flag to SVC_REF_FRAME_CONFIG to
+// allow more control for applications.
+static bool set_flag_rps_bias_recovery_frame(const AV1_COMP *const cpi) {
+  if (cpi->ppi->rtc_ref.set_ref_frame_config &&
+      cpi->svc.number_temporal_layers == 1 &&
+      cpi->svc.number_spatial_layers == 1 &&
+      cpi->ppi->rtc_ref.reference_was_previous_frame) {
+    int min_dist = av1_svc_get_min_ref_dist(cpi);
+    // Only consider boost for this frame if its closest reference is further
+    // than x frames away, using x = 4 for now.
+ if (min_dist != INT_MAX && min_dist > 4) return true; + } + return false; +} + +void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type, + const EncodeFrameInput *frame_input, + unsigned int frame_flags) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + AV1_COMMON *const cm = &cpi->common; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + SVC *const svc = &cpi->svc; + ResizePendingParams *const resize_pending_params = + &cpi->resize_pending_params; + int target; + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + if (cpi->ppi->use_svc) { + av1_update_temporal_layer_framerate(cpi); + av1_restore_layer_context(cpi); + } + cpi->ppi->rtc_ref.bias_recovery_frame = set_flag_rps_bias_recovery_frame(cpi); + // Set frame type. + if (set_key_frame(cpi, frame_flags)) { + *frame_type = KEY_FRAME; + p_rc->this_key_frame_forced = + cm->current_frame.frame_number != 0 && rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max; + p_rc->kf_boost = DEFAULT_KF_BOOST_RT; + gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE; + gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME; + gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET; + if (cpi->ppi->use_svc) { + if (cm->current_frame.frame_number > 0) + av1_svc_reset_temporal_layers(cpi, 1); + svc->layer_context[layer].is_key_frame = 1; + } + rc->frame_number_encoded = 0; + cpi->ppi->rtc_ref.non_reference_frame = 0; + } else { + *frame_type = INTER_FRAME; + gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE; + gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME; + gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE; + if (cpi->ppi->use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + lc->is_key_frame = + svc->spatial_layer_id == 0 + ? 0 + : svc->layer_context[svc->temporal_layer_id].is_key_frame; + // If the user is setting the reference structure with + // set_ref_frame_config and did not set any references, set the + // frame type to Intra-only. + if (cpi->ppi->rtc_ref.set_ref_frame_config) { + int no_references_set = 1; + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + if (cpi->ppi->rtc_ref.reference[i]) { + no_references_set = 0; + break; + } + } + // Set to intra_only_frame if no references are set. + // The stream can start decoding on INTRA_ONLY_FRAME so long as the + // layer with the intra_only_frame doesn't signal a reference to a slot + // that hasn't been set yet. + if (no_references_set) *frame_type = INTRA_ONLY_FRAME; + } + } + } + // Check for scene change: for SVC check on base spatial layer only. + if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0) { + if (rc->prev_coded_width == cm->width && + rc->prev_coded_height == cm->height) { + rc_scene_detection_onepass_rt(cpi, frame_input); + } else { + aom_free(cpi->src_sad_blk_64x64); + cpi->src_sad_blk_64x64 = NULL; + } + } + // Check for dynamic resize, for single spatial layer for now. + // For temporal layers only check on base temporal layer. 
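+  // Illustration (hypothetical dimensions): the pending size below rounds up,
+  // so a 1280x720 source gives 960x540 in the 3/4 state and 640x360 in the
+  // 1/2 state; an odd width such as 1279 still maps to 960 and 640.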
+ if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) { + if (svc->number_spatial_layers == 1 && svc->temporal_layer_id == 0) + dynamic_resize_one_pass_cbr(cpi); + if (rc->resize_state == THREE_QUARTER) { + resize_pending_params->width = (3 + cpi->oxcf.frm_dim_cfg.width * 3) >> 2; + resize_pending_params->height = + (3 + cpi->oxcf.frm_dim_cfg.height * 3) >> 2; + } else if (rc->resize_state == ONE_HALF) { + resize_pending_params->width = (1 + cpi->oxcf.frm_dim_cfg.width) >> 1; + resize_pending_params->height = (1 + cpi->oxcf.frm_dim_cfg.height) >> 1; + } else { + resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width; + resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height; + } + } else if (is_frame_resize_pending(cpi)) { + resize_reset_rc(cpi, resize_pending_params->width, + resize_pending_params->height, cm->width, cm->height); + } + // Set the GF interval and update flag. + if (!rc->rtc_external_ratectrl) + set_gf_interval_update_onepass_rt(cpi, *frame_type); + // Set target size. + if (cpi->oxcf.rc_cfg.mode == AOM_CBR) { + if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_cbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_cbr( + cpi, gf_group->update_type[cpi->gf_frame_index]); + } + } else { + if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_vbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_vbr( + cpi, gf_group->update_type[cpi->gf_frame_index]); + } + } + if (cpi->oxcf.rc_cfg.mode == AOM_Q) + rc->active_worst_quality = cpi->oxcf.rc_cfg.cq_level; + + av1_rc_set_frame_target(cpi, target, cm->width, cm->height); + rc->base_frame_target = target; + cm->current_frame.frame_type = *frame_type; + // For fixed mode SVC: if KSVC is enabled remove inter layer + // prediction on spatial enhancement layer frames for frames + // whose base is not KEY frame. + if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode && + svc->number_spatial_layers > 1 && + !svc->layer_context[layer].is_key_frame) { + ExternalFlags *const ext_flags = &cpi->ext_flags; + ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; + } +} + +int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { + AV1_COMMON *const cm = &cpi->common; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + double rate_correction_factor = + cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL]; + const int target_size = cpi->rc.avg_frame_bandwidth; + double new_correction_factor; + int target_bits_per_mb; + double q2; + int enumerator; + int is_screen_content = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN); + *q = (3 * cpi->rc.worst_quality + *q) >> 2; + // For screen content use the max-q set by the user to allow for less + // overshoot on slide changes. + if (is_screen_content) *q = cpi->rc.worst_quality; + cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0; + // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as + // these parameters will affect QP selection for subsequent frames. If they + // have settled down to a very different (low QP) state, then not adjusting + // them may cause next frame to select low QP and overshoot again. + p_rc->avg_frame_qindex[INTER_FRAME] = *q; + p_rc->buffer_level = p_rc->optimal_buffer_level; + p_rc->bits_off_target = p_rc->optimal_buffer_level; + // Reset rate under/over-shoot flags. + cpi->rc.rc_1_frame = 0; + cpi->rc.rc_2_frame = 0; + // Adjust rate correction factor. 
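+  // Rough sketch of the relationship being inverted below: bits per mb is
+  // approximately correction_factor * enumerator / q, so the factor that is
+  // consistent with hitting avg_frame_bandwidth at the raised q is
+  // target_bits_per_mb * q / enumerator (the exact model lives in
+  // av1_rc_bits_per_mb()).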
+ target_bits_per_mb = + (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs); + // Reset rate correction factor: for now base it on target_bits_per_mb + // and qp (==max_QP). This comes from the inverse computation of + // av1_rc_bits_per_mb(). + q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth); + enumerator = av1_get_bpmb_enumerator(INTER_NORMAL, is_screen_content); + new_correction_factor = (double)target_bits_per_mb * q2 / enumerator; + if (new_correction_factor > rate_correction_factor) { + rate_correction_factor = + (new_correction_factor + rate_correction_factor) / 2.0; + if (rate_correction_factor > MAX_BPB_FACTOR) + rate_correction_factor = MAX_BPB_FACTOR; + cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL] = + rate_correction_factor; + } + // For temporal layers: reset the rate control parameters across all + // temporal layers. + if (cpi->svc.number_temporal_layers > 1) { + SVC *svc = &cpi->svc; + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int sl = svc->spatial_layer_id; + const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc; + lp_rc->avg_frame_qindex[INTER_FRAME] = *q; + lp_rc->buffer_level = lp_rc->optimal_buffer_level; + lp_rc->bits_off_target = lp_rc->optimal_buffer_level; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lp_rc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + } + } + return 1; +} diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h new file mode 100644 index 0000000000..6802ad42d0 --- /dev/null +++ b/third_party/aom/av1/encoder/ratectrl.h @@ -0,0 +1,864 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RATECTRL_H_ +#define AOM_AV1_ENCODER_RATECTRL_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" + +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ + +// Bits Per MB at different Q (Multiplied by 512) +#define BPER_MB_NORMBITS 9 + +// Use this macro to turn on/off use of alt-refs in one-pass mode. +#define USE_ALTREF_FOR_ONE_PASS 1 + +// Threshold used to define if a KF group is static (e.g. a slide show). +// Essentially, this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. +#define STATIC_KF_GROUP_THRESH 99 +#define STATIC_KF_GROUP_FLOAT_THRESH 0.99 + +// The maximum duration of a GF group that is static (e.g. a slide show). 
+#define MAX_STATIC_GF_GROUP_LENGTH 250 + +#define MIN_GF_INTERVAL 4 +#define MAX_GF_INTERVAL 32 +#define FIXED_GF_INTERVAL 16 +#define MAX_GF_LENGTH_LAP 16 + +#define FIXED_GF_INTERVAL_RT 80 +#define MAX_GF_INTERVAL_RT 160 + +#define MAX_NUM_GF_INTERVALS 15 + +#define MAX_ARF_LAYERS 6 +// #define STRICT_RC + +#define DEFAULT_KF_BOOST_RT 2300 +#define DEFAULT_GF_BOOST_RT 2000 + +// A passive rate control strategy for screen content type in real-time mode. +// When it is turned on, the compression performance is improved by +// 7.8% (overall_psnr), 5.0% (VMAF) on average. Some clips see gains +// over 20% on metric. +// The downside is that it does not guarantee frame size. +// Since RT mode has a tight restriction on buffer overflow control, we +// turn it off by default. +#define RT_PASSIVE_STRATEGY 0 +#define MAX_Q_HISTORY 1000 + +typedef struct { + int resize_width; + int resize_height; + uint8_t superres_denom; +} size_params_type; + +enum { + INTER_NORMAL, + GF_ARF_LOW, + GF_ARF_STD, + KF_STD, + RATE_FACTOR_LEVELS +} UENUM1BYTE(RATE_FACTOR_LEVEL); + +enum { + KF_UPDATE, + LF_UPDATE, + GF_UPDATE, + ARF_UPDATE, + OVERLAY_UPDATE, + INTNL_OVERLAY_UPDATE, // Internal Overlay Frame + INTNL_ARF_UPDATE, // Internal Altref Frame + FRAME_UPDATE_TYPES +} UENUM1BYTE(FRAME_UPDATE_TYPE); + +enum { + REFBUF_RESET, // Clear reference frame buffer + REFBUF_UPDATE, // Refresh reference frame buffer + REFBUF_STATES +} UENUM1BYTE(REFBUF_STATE); + +typedef enum { + NO_RESIZE = 0, + DOWN_THREEFOUR = 1, // From orig to 3/4. + DOWN_ONEHALF = 2, // From orig or 3/4 to 1/2. + UP_THREEFOUR = -1, // From 1/2 to 3/4. + UP_ORIG = -2, // From 1/2 or 3/4 to orig. +} RESIZE_ACTION; + +typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE; + +#define MAX_FIRSTPASS_ANALYSIS_FRAMES 150 +typedef enum region_types { + STABLE_REGION = 0, + HIGH_VAR_REGION = 1, + SCENECUT_REGION = 2, + BLENDING_REGION = 3, +} REGION_TYPES; + +typedef struct regions { + int start; + int last; + double avg_noise_var; + double avg_cor_coeff; + double avg_sr_fr_ratio; + double avg_intra_err; + double avg_coded_err; + REGION_TYPES type; +} REGIONS; + +/*!\endcond */ +/*! + * \brief Rate Control parameters and status + */ +typedef struct { + // Rate targetting variables + + /*! + * Baseline target rate for frame before adjustment for previous under or + * over shoot. + */ + int base_frame_target; + /*! + * Target rate for frame after adjustment for previous under or over shoot. + */ + int this_frame_target; // Actual frame target after rc adjustment. + + /*! + * Projected size for current frame + */ + int projected_frame_size; + + /*! + * Bit size of transform coefficient for current frame. + */ + int coefficient_size; + + /*! + * Super block rate target used with some adaptive quantization strategies. + */ + int sb64_target_rate; + + /*! + * Number of frames since the last ARF / GF. + */ + int frames_since_golden; + + /*! + * Number of frames till the next ARF / GF is due. + */ + int frames_till_gf_update_due; + + /*! + * Number of determined gf groups left + */ + int intervals_till_gf_calculate_due; + + /*!\cond */ + int min_gf_interval; + int max_gf_interval; + int static_scene_max_gf_interval; + /*!\endcond */ + /*! 
+ * Frames before the next key frame + */ + int frames_to_key; + /*!\cond */ + int frames_since_key; + int frames_to_fwd_kf; + int is_src_frame_alt_ref; + int sframe_due; + + int high_source_sad; + uint64_t avg_source_sad; + uint64_t prev_avg_source_sad; + uint64_t frame_source_sad; + + int avg_frame_bandwidth; // Average frame size target for clip + int min_frame_bandwidth; // Minimum allocation used for any frame + int max_frame_bandwidth; // Maximum burst rate allowed for a frame. + int prev_avg_frame_bandwidth; + + int ni_av_qi; + int ni_tot_qi; + + int decimation_factor; + int decimation_count; + int prev_frame_is_dropped; + int drop_count_consec; + int max_consec_drop; + + /*! + * Frame number for encoded frames (non-dropped). + * Use for setting the rtc reference structure. + */ + unsigned int frame_number_encoded; + + /*!\endcond */ + /*! + * User specified maximum Q allowed for current frame + */ + int worst_quality; + /*! + * User specified minimum Q allowed for current frame + */ + int best_quality; + + /*!\cond */ + + // rate control history for last frame(1) and the frame before(2). + // -1: overshoot + // 1: undershoot + // 0: not initialized. + int rc_1_frame; + int rc_2_frame; + int q_1_frame; + int q_2_frame; + + /*!\endcond */ + /*! + * Proposed maximum allowed Q for current frame + */ + int active_worst_quality; + + /*!\cond */ + // Track amount of low motion in scene + int avg_frame_low_motion; + int cnt_zeromv; + + // signals if number of blocks with motion is high + int percent_blocks_with_motion; + + // Maximum value of source sad across all blocks of frame. + uint64_t max_block_source_sad; + + // For dynamic resize, 1 pass cbr. + RESIZE_STATE resize_state; + int resize_avg_qp; + int resize_buffer_underflow; + int resize_count; + + // Flag to disable content related qp adjustment. + int rtc_external_ratectrl; + + // Stores fast_extra_bits of the current frame. + int frame_level_fast_extra_bits; + + double frame_level_rate_correction_factors[RATE_FACTOR_LEVELS]; + + int frame_num_last_gf_refresh; + + int prev_coded_width; + int prev_coded_height; + + // The ratio used for inter frames in bit estimation. + // TODO(yunqing): if golden frame is treated differently (e.g. gf_cbr_boost_ + // pct > THR), consider to add bit_est_ratio_g for golden frames. + int bit_est_ratio; + + // Whether to use a fixed qp for the frame, bypassing internal rate control. + // This flag will reset to 0 after every frame. + int use_external_qp_one_pass; + /*!\endcond */ +} RATE_CONTROL; + +/*! + * \brief Primary Rate Control parameters and status + */ +typedef struct { + // Sub-gop level Rate targetting variables + + /*! + * Target bit budget for the current GF / ARF group of frame. + */ + int64_t gf_group_bits; + + /*! + * Boost factor used to calculate the extra bits allocated to the key frame + */ + int kf_boost; + + /*! + * Boost factor used to calculate the extra bits allocated to ARFs and GFs + */ + int gfu_boost; + + /*! + * Stores the determined gf group lengths for a set of gf groups + */ + int gf_intervals[MAX_NUM_GF_INTERVALS]; + + /*! + * The current group's index into gf_intervals[] + */ + int cur_gf_index; + + /*!\cond */ + int num_regions; + + REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES]; + int regions_offset; // offset of regions from the last keyframe + int frames_till_regions_update; + + int baseline_gf_interval; + + int constrained_gf_group; + + int this_key_frame_forced; + + int next_key_frame_forced; + /*!\endcond */ + + /*! 
+ * Initial buffuer level in ms for CBR / low delay encoding + */ + int64_t starting_buffer_level; + + /*! + * Optimum / target buffuer level in ms for CBR / low delay encoding + */ + int64_t optimal_buffer_level; + + /*! + * Maximum target buffuer level in ms for CBR / low delay encoding + */ + int64_t maximum_buffer_size; + + /*! + * Q index used for ALT frame + */ + int arf_q; + + /*!\cond */ + float_t arf_boost_factor; + + int base_layer_qp; + + // Total number of stats used only for kf_boost calculation. + int num_stats_used_for_kf_boost; + + // Total number of stats used only for gfu_boost calculation. + int num_stats_used_for_gfu_boost; + + // Total number of stats required by gfu_boost calculation. + int num_stats_required_for_gfu_boost; + + int enable_scenecut_detection; + + int use_arf_in_this_kf_group; + + int ni_frames; + + double tot_q; + /*!\endcond */ + + /*! + * Q used for last boosted (non leaf) frame + */ + int last_kf_qindex; + + /*! + * Average of q index of previous encoded frames in a sequence. + */ + int avg_frame_qindex[FRAME_TYPES]; + +#if CONFIG_FPMT_TEST + /*! + * Temporary variable used in simulating the delayed update of + * active_best_quality. + */ + int temp_active_best_quality[MAX_ARF_LAYERS + 1]; + + /*! + * Temporary variable used in simulating the delayed update of + * last_boosted_qindex. + */ + int temp_last_boosted_qindex; + + /*! + * Temporary variable used in simulating the delayed update of + * avg_q. + */ + double temp_avg_q; + + /*! + * Temporary variable used in simulating the delayed update of + * last_q. + */ + int temp_last_q[FRAME_TYPES]; + + /*! + * Temporary variable used in simulating the delayed update of + * projected_frame_size. + */ + int temp_projected_frame_size; + + /*! + * Temporary variable used in simulating the delayed update of + * total_actual_bits. + */ + int64_t temp_total_actual_bits; + + /*! + * Temporary variable used in simulating the delayed update of + * buffer_level. + */ + int64_t temp_buffer_level; + + /*! + * Temporary variable used in simulating the delayed update of + * vbr_bits_off_target. + */ + int64_t temp_vbr_bits_off_target; + + /*! + * Temporary variable used in simulating the delayed update of + * vbr_bits_off_target_fast. + */ + int64_t temp_vbr_bits_off_target_fast; + + /*! + * Temporary variable used in simulating the delayed update of + * rate_correction_factors. + */ + double temp_rate_correction_factors[RATE_FACTOR_LEVELS]; + + /*! + * Temporary variable used in simulating the delayed update of + * rate_error_estimate. + */ + int temp_rate_error_estimate; + + /*! + * Temporary variable used in simulating the delayed update of + * rolling_arf_group_target_bits. + */ + int temp_rolling_arf_group_target_bits; + + /*! + * Temporary variable used in simulating the delayed update of + * rolling_arf_group_actual_bits;. + */ + int temp_rolling_arf_group_actual_bits; + + /*! + * Temporary variable used in simulating the delayed update of + * bits_left;. + */ + int64_t temp_bits_left; + + /*! + * Temporary variable used in simulating the delayed update of + * extend_minq. + */ + int temp_extend_minq; + + /*! + * Temporary variable used in simulating the delayed update of + * extend_maxq. + */ + int temp_extend_maxq; + +#endif + /*! + * Proposed minimum allowed Q different layers in a coding pyramid + */ + int active_best_quality[MAX_ARF_LAYERS + 1]; + + /*! + * Q used for last boosted (non leaf) frame (GF/KF/ARF) + */ + int last_boosted_qindex; + + /*! 
+ * Average Q value of previous inter frames + */ + double avg_q; + + /*! + * Q used on last encoded frame of the given type. + */ + int last_q[FRAME_TYPES]; + + /*! + * Correction factors used to adjust the q estimate for a given target rate + * in the encode loop. + */ + double rate_correction_factors[RATE_FACTOR_LEVELS]; + + /*! + * Current total consumed bits. + */ + int64_t total_actual_bits; + + /*! + * Current total target bits. + */ + int64_t total_target_bits; + + /*! + * Current buffer level. + */ + int64_t buffer_level; + + /*! + * PCT rc error. + */ + int rate_error_estimate; + + /*! + * Error bits available from previously encoded frames. + */ + int64_t vbr_bits_off_target; + + /*! + * Error bits available from previously encoded frames undershoot. + */ + int64_t vbr_bits_off_target_fast; + + /*! + * Total bits deviated from the average frame target, from previously + * encoded frames. + */ + int64_t bits_off_target; + + /*! + * Rolling monitor target bits updated based on current frame target size. + */ + int rolling_target_bits; + + /*! + * Rolling monitor actual bits updated based on current frame final projected + * size. + */ + int rolling_actual_bits; + + /*! + * The history of qindex for each frame. + * Only used when RT_PASSIVE_STRATEGY = 1. + */ + int q_history[MAX_Q_HISTORY]; +} PRIMARY_RATE_CONTROL; + +/*!\cond */ + +struct AV1_COMP; +struct AV1EncoderConfig; +struct GF_GROUP; + +void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf, + PRIMARY_RATE_CONTROL *p_rc); + +void av1_rc_init(const struct AV1EncoderConfig *oxcf, RATE_CONTROL *rc); + +int av1_estimate_bits_at_q(const struct AV1_COMP *cpi, int q, + double correction_factor); + +double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth); + +void av1_rc_init_minq_luts(void); + +int av1_rc_get_default_min_gf_interval(int width, int height, double framerate); +// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to +// be passed in to ensure that the max_gf_interval returned is at least as bis +// as that. +int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval); + +// Generally at the high level, the following flow is expected +// to be enforced for rate control: +// First call per frame, one of: +// av1_get_one_pass_rt_params() +// av1_get_second_pass_params() +// depending on the usage to set the rate control encode parameters desired. +// +// Then, call encode_frame_to_data_rate() to perform the +// actual encode. This function will in turn call encode_frame() +// one or more times, followed by: +// av1_rc_postencode_update_drop_frame() +// +// The majority of rate control parameters are only expected +// to be set in the av1_get_..._params() functions and +// updated during the av1_rc_postencode_update...() functions. +// The only exceptions are av1_rc_drop_frame() and +// av1_rc_update_rate_correction_factors() functions. + +// Functions to set parameters for encoding before the actual +// encode_frame_to_data_rate() function. 
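+//
+// A per-frame sketch of that flow for the 1-pass real-time path (illustrative
+// pseudo-code only; the callers live in the encoder, not in this header):
+//
+//   av1_get_one_pass_rt_params(cpi, &frame_type, frame_input, frame_flags);
+//   encode_frame_to_data_rate(...);   // calls encode_frame() internally
+//   if (frame_was_dropped)
+//     av1_rc_postencode_update_drop_frame(cpi);
+//   else
+//     av1_rc_postencode_update(cpi, bytes_used);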
+struct EncodeFrameInput; + +// Post encode update of the rate control parameters based +// on bytes used +void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used); +// Post encode update of the rate control parameters for dropped frames +void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi); + +/*!\endcond */ +/*!\brief Updates the rate correction factor linking Q to output bits + * + * This function updates the Q rate correction factor after an encode + * cycle depending on whether we overshot or undershot the target rate. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder instance structure + * \param[in] is_encode_stage Indicates if recode loop or post-encode + * \param[in] width Frame width + * \param[in] height Frame height + * + * \remark Updates the relevant rate correction factor in cpi->rc + */ +void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, + int is_encode_stage, int width, + int height); +/*!\cond */ + +// Decide if we should drop this frame: For 1-pass CBR. +// Changes only the decimation count in the rate control structure +int av1_rc_drop_frame(struct AV1_COMP *cpi); + +// Computes frame size bounds. +void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi, + int this_frame_target, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit); + +/*!\endcond */ + +/*!\brief Picks q and q bounds given the rate control parameters in \c cpi->rc. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] width Coded frame width + * \param[in] height Coded frame height + * \param[in] gf_index Index of this frame in the golden frame group + * \param[out] bottom_index Bottom bound for q index (best quality) + * \param[out] top_index Top bound for q index (worst quality) + * \return Returns selected q index to be used for encoding this frame. + * Also, updates \c rc->arf_q. + */ +int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height, + int gf_index, int *bottom_index, int *top_index); + +/*!\brief Estimates q to achieve a target bits per frame + * + * \ingroup rate_control + * \param[in] cpi Top level encoder instance structure + * \param[in] target_bits_per_frame Frame rate target + * \param[in] active_worst_quality Max Q allowed + * \param[in] active_best_quality Min Q allowed + * \param[in] width Frame width + * \param[in] height Frame height + * + * \return Returns a q index value + */ +int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality, + int width, int height); + +/*!\cond */ +// Gets the appropriate bpmb ennumerator based on the frame and content type +int av1_get_bpmb_enumerator(FRAME_TYPE frame_type, + const int is_screen_content_type); + +// Estimates bits per mb for a given qindex and correction factor. +int av1_rc_bits_per_mb(const struct AV1_COMP *cpi, FRAME_TYPE frame_type, + int qindex, double correction_factor, + int accurate_estimate); + +// Clamping utilities for bitrate targets for iframes and pframes. +int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi, + int64_t target); +int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi, + int target, uint8_t frame_update_type); + +// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex]. +// To be precise, 'q_index' is the smallest integer, for which the corresponding +// q >= desired_q. +// If no such q index is found, returns 'worst_qindex'. 
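+// Usage sketch (illustrative values): av1_find_qindex(8.0, AOM_BITS_8,
+// rc->best_quality, rc->worst_quality) returns the smallest q index whose
+// real q value is at least 8.0, assuming rc holds the clamped quality range.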
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, + int best_qindex, int worst_qindex); + +// Computes a q delta (in "q index" terms) to get from a starting q value +// to a target q value +int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + aom_bit_depth_t bit_depth); + +// Computes a q delta (in "q index" terms) to get from a starting q value +// to a value that should equate to the given rate ratio. +int av1_compute_qdelta_by_rate(const struct AV1_COMP *cpi, + FRAME_TYPE frame_type, int qindex, + double rate_target_ratio); + +int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q); + +void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height); + +void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi, + RATE_CONTROL *const rc); + +void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height); + +int av1_resize_one_pass_cbr(struct AV1_COMP *cpi); + +void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width, + int height); + +void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi); + +void av1_set_rtc_reference_structure_one_layer(struct AV1_COMP *cpi, + int gf_update); + +/*!\endcond */ +/*!\brief Calculates how many bits to use for a P frame in one pass vbr + * + * \ingroup rate_control + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] frame_update_type Type of frame + * + * \return Returns the target number of bits for this frame. + */ +int av1_calc_pframe_target_size_one_pass_vbr( + const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type); + +/*!\brief Calculates how many bits to use for an i frame in one pass vbr + * + * \ingroup rate_control + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \return Returns the target number of bits for this frame. + */ +int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi); + +/*!\brief Calculates how many bits to use for a P frame in one pass cbr + * + * \ingroup rate_control + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] frame_update_type Type of frame + * + * \return Returns the target number of bits for this frame. + */ +int av1_calc_pframe_target_size_one_pass_cbr( + const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type); + +/*!\brief Calculates how many bits to use for an i frame in one pass cbr + * + * \ingroup rate_control + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \return Returns the target number of bits for this frame. + */ +int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi); + +/*!\brief Setup the rate control parameters for 1 pass real-time mode. + * + * - Sets the frame type and target frame size. + * - Sets the GF update. + * - Checks for scene change. + * - Sets the reference prediction structure for 1 layers (non-SVC). + * - Resets and updates are done for SVC. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] frame_type Encoder frame type + * \param[in] frame_input Current and last input source frames + * \param[in] frame_flags Encoder frame flags + * + * \remark Nothing is returned. Instead the settings computed in this + * function are set in: \c frame_params, \c cpi->common, \c cpi->rc, + * \c cpi->svc. 
+ */ +void av1_get_one_pass_rt_params(struct AV1_COMP *cpi, + FRAME_TYPE *const frame_type, + const struct EncodeFrameInput *frame_input, + unsigned int frame_flags); + +/*!\brief Increase q on expected encoder overshoot, for CBR mode. + * + * Handles the case when encoder is expected to create a large frame: + * - q is increased to value closer to \c cpi->rc.worst_quality + * - avg_frame_qindex is reset + * - buffer levels are reset + * - rate correction factor is adjusted + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * \param[in] q Current q index + * + * \return q is returned, and updates are done to \c cpi->rc. + */ +int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q); + +/*!\brief Compute the q_indices for a single frame. + * + * Intended to be used with AOM_Q mode. + * + * \param[in] base_q_index Base q index + * \param[in] gf_update_type GOP update type + * \param[in] gf_pyramid_level GOP level of the current frame + * \param[in] arf_q ARF q_index + * + * \return Returns the q_index for the current frame. + */ +int av1_q_mode_get_q_index(int base_q_index, int gf_update_type, + int gf_pyramid_level, int arf_q); + +/*!\brief Compute the q_indices for the ARF of a GOP. + * + * \param[in] base_q_index Base q index + * \param[in] gfu_boost GFU boost + * \param[in] bit_depth Bit depth + * \param[in] arf_boost_factor ARF boost factor + * + * \return Returns the q_index for the ARF frame. + */ +int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth, + double arf_boost_factor); + +#if !CONFIG_REALTIME_ONLY +struct TplDepFrame; +/*!\brief Compute the q_indices for the ARF of a GOP in Q mode. + * + * \param[in] cpi Top level encoder structure + * \param[in] tpl_frame Tpl Frame stats + * + * \return Returns the q_index for the ARF frame. + */ +int av1_get_arf_q_index_q_mode(struct AV1_COMP *cpi, + struct TplDepFrame *tpl_frame); +#endif +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RATECTRL_H_ diff --git a/third_party/aom/av1/encoder/rc_utils.h b/third_party/aom/av1/encoder/rc_utils.h new file mode 100644 index 0000000000..fe22ee5afb --- /dev/null +++ b/third_party/aom/av1/encoder/rc_utils.h @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_RC_UTILS_H_ +#define AOM_AV1_ENCODER_RC_UTILS_H_ + +#include "av1/encoder/encoder.h" +#include "aom_dsp/psnr.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + if (cpi->common.current_frame.frame_number > + (unsigned int)cpi->svc.number_spatial_layers) { + if (cpi->ppi->use_svc) { + av1_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth > (3 * rc->prev_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->prev_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + p_rc->bits_off_target = p_rc->optimal_buffer_level; + p_rc->buffer_level = p_rc->optimal_buffer_level; + } + } + } +} + +static AOM_INLINE void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf, + AV1_PRIMARY *ppi) { + PRIMARY_RATE_CONTROL *p_rc = &ppi->p_rc; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + const int64_t bandwidth = rc_cfg->target_bandwidth; + const int64_t starting = rc_cfg->starting_buffer_level_ms; + const int64_t optimal = rc_cfg->optimal_buffer_level_ms; + const int64_t maximum = rc_cfg->maximum_buffer_size_ms; + + p_rc->starting_buffer_level = starting * bandwidth / 1000; + p_rc->optimal_buffer_level = + (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; + p_rc->maximum_buffer_size = + (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + p_rc->bits_off_target = + AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size); + p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size); +} + +static AOM_INLINE void config_target_level(AV1_COMP *const cpi, + AV1_LEVEL target_level, int tier) { + AV1EncoderConfig *const oxcf = &cpi->oxcf; + SequenceHeader *const seq_params = cpi->common.seq_params; + TileConfig *const tile_cfg = &oxcf->tile_cfg; + RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + // Adjust target bitrate to be no larger than 70% of level limit. + const BITSTREAM_PROFILE profile = seq_params->profile; + const double level_bitrate_limit = + av1_get_max_bitrate_for_level(target_level, tier, profile); + const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70); + rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate); + // Also need to update cpi->ppi->twopass.bits_left. + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats; + if (stats != NULL) + cpi->ppi->twopass.bits_left = + (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0); + + // Adjust max over-shoot percentage. + rc_cfg->over_shoot_pct = 0; + + // Adjust max quantizer. + rc_cfg->worst_allowed_q = 255; + + // Adjust number of tiles and tile columns to be under level limit. + int max_tiles, max_tile_cols; + av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols); + while (tile_cfg->tile_columns > 0 && + (1 << tile_cfg->tile_columns) > max_tile_cols) { + --tile_cfg->tile_columns; + } + const int tile_cols = (1 << tile_cfg->tile_columns); + while (tile_cfg->tile_rows > 0 && + tile_cols * (1 << tile_cfg->tile_rows) > max_tiles) { + --tile_cfg->tile_rows; + } + + // Adjust min compression ratio. 
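  // (Note: min_cr below is the level's minimum compression ratio as a double;
  //  rc_cfg->min_cr stores it scaled by 100, e.g. 4.0 becomes 400, matching
  //  the min_cr / 100.0 conversion applied later in the recode loop.)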
+ const int still_picture = seq_params->still_picture; + const double min_cr = + av1_get_min_cr_for_level(target_level, tier, still_picture); + rc_cfg->min_cr = AOMMAX(rc_cfg->min_cr, (unsigned int)(min_cr * 100)); +} + +#if !CONFIG_REALTIME_ONLY + +/*!\brief Function to test for conditions that indicate we should loop + * back and recode a frame. + * + * \ingroup rate_control + * + * \param[in] cpi Top-level encoder structure + * \param[in] high_limit Upper rate threshold + * \param[in] low_limit Lower rate threshold + * \param[in] q Current q index + * \param[in] maxq Maximum allowed q index + * \param[in] minq Minimum allowed q index + * + * \return Indicates if a recode is required. + * \retval 1 Recode Required + * \retval 0 No Recode required + */ +static AOM_INLINE int recode_loop_test(AV1_COMP *cpi, int high_limit, + int low_limit, int q, int maxq, + int minq) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi); + int force_recode = 0; + + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) || + (frame_is_kfgfarf && + (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) { + // TODO(agrange) high_limit could be greater than the scale-down threshold. + if ((rc->projected_frame_size > high_limit && q < maxq) || + (rc->projected_frame_size < low_limit && q > minq)) { + force_recode = 1; + } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) { + // Deal with frame undershoot and whether or not we are + // below the automatically set cq level. + if (q > oxcf->rc_cfg.cq_level && + rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { + force_recode = 1; + } + } + } + return force_recode; +} + +static AOM_INLINE double av1_get_gfu_boost_projection_factor(double min_factor, + double max_factor, + int frame_count) { + double factor = sqrt((double)frame_count); + factor = AOMMIN(factor, max_factor); + factor = AOMMAX(factor, min_factor); + factor = (200.0 + 10.0 * factor); + return factor; +} + +static AOM_INLINE int get_gfu_boost_from_r0_lap(double min_factor, + double max_factor, double r0, + int frames_to_key) { + double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor, + frames_to_key); + const int boost = (int)rint(factor / r0); + return boost; +} + +static AOM_INLINE double av1_get_kf_boost_projection_factor(int frame_count) { + double factor = sqrt((double)frame_count); + factor = AOMMIN(factor, 10.0); + factor = AOMMAX(factor, 4.0); + factor = (75.0 + 14.0 * factor); + return factor; +} + +static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi, + int is_encode_stage, int q_low, + int q_high, int top_index, + int bottom_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, + cm->height); + + int q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); + + int retries = 0; + while (q_regulated < q_low && retries < 10) { + av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, + cm->height); + q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); + retries++; + } + return q_regulated; +} + +static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi, + int is_encode_stage, + int q_high, int top_index, + int 
bottom_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, + cm->height); + int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index, cm->width, cm->height); + + int retries = 0; + while (q_regulated > q_high && retries < 10) { + av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width, + cm->height); + q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index, cm->width, cm->height); + retries++; + } + return q_regulated; +} + +/*!\brief Called after encode_with_recode_loop() has just encoded a frame. + * This function works out whether we undershot or overshot our bitrate + * target and adjusts q as appropriate. It also decides whether or not + * we need to recode the frame to get closer to the target rate. + * + * \ingroup rate_control + * + * \param[in] cpi Top-level encoder structure + * \param[out] loop Should we go around the recode loop again + * \param[in,out] q New q index value + * \param[in,out] q_low Low q index limit for this loop itteration + * \param[in,out] q_high High q index limit for this loop itteration + * \param[in] top_index Max permited new value for q index + * \param[in] bottom_index Min permited new value for q index + * \param[in,out] undershoot_seen Have we seen undershoot on this frame + * \param[in,out] overshoot_seen Have we seen overshoot on this frame + * \param[in,out] low_cr_seen Have we previously trriggered recode + * because the compression ration was less + * than a given minimum threshold. + * \param[in] loop_count Loop itterations so far. + * + */ +static AOM_INLINE void recode_loop_update_q( + AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low, + int *const q_high, const int top_index, const int bottom_index, + int *const undershoot_seen, int *const overshoot_seen, + int *const low_cr_seen, const int loop_count) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + *loop = 0; + + // Special case for overlay frame. + if (rc->is_src_frame_alt_ref && + rc->projected_frame_size < rc->max_frame_bandwidth) + return; + + const int min_cr = rc_cfg->min_cr; + if (min_cr > 0) { + const double compression_ratio = + av1_get_compression_ratio(cm, rc->projected_frame_size >> 3); + const double target_cr = min_cr / 100.0; + if (compression_ratio < target_cr) { + *low_cr_seen = 1; + if (*q < rc->worst_quality) { + const double cr_ratio = target_cr / compression_ratio; + const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio)); + *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality); + *q_low = AOMMAX(*q, *q_low); + *q_high = AOMMAX(*q, *q_high); + *loop = 1; + } + } + if (*low_cr_seen) return; + } + + if (cpi->ppi->level_params.keep_level_stats && + !is_stat_generation_stage(cpi)) { + // Initialize level info. at the beginning of each sequence. 
+ if (cm->current_frame.frame_type == KEY_FRAME && + cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) { + av1_init_level_info(cpi); + } + const AV1LevelParams *const level_params = &cpi->ppi->level_params; + // TODO(any): currently only checking operating point 0 + const AV1LevelInfo *const level_info = level_params->level_info[0]; + const DECODER_MODEL *const decoder_models = level_info->decoder_models; + const AV1_LEVEL target_level = level_params->target_seq_level_idx[0]; + + if (target_level < SEQ_LEVELS && + decoder_models[target_level].status == DECODER_MODEL_OK) { + DECODER_MODEL_STATUS status = av1_decoder_model_try_smooth_buf( + cpi, rc->projected_frame_size, &decoder_models[target_level]); + + if ((status == SMOOTHING_BUFFER_UNDERFLOW || + status == SMOOTHING_BUFFER_OVERFLOW) && + *q < rc->worst_quality) { + *q = AOMMIN(*q + 10, rc->worst_quality); + *q_low = AOMMAX(*q, *q_low); + *q_high = AOMMAX(*q, *q_high); + *loop = 1; + return; + } + } + } + + if (rc_cfg->mode == AOM_Q) return; + + const int last_q = *q; + int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0; + av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); + if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; + + if (cm->current_frame.frame_type == KEY_FRAME && + p_rc->this_key_frame_forced && + rc->projected_frame_size < rc->max_frame_bandwidth) { + int64_t kf_err; + const int64_t high_err_target = cpi->ambient_err; + const int64_t low_err_target = cpi->ambient_err >> 1; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params->use_highbitdepth) { + kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); + } else { + kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); + } +#else + kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); +#endif + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + rc->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + rc->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + *q_high = AOMMAX(*q - 1, *q_low); + + // Adjust Q + *q = (int)((*q * high_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low) >> 1); + } else if (kf_err < low_err_target && + rc->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + *q_low = AOMMIN(*q + 1, *q_high); + + // Adjust Q + *q = (int)((*q * low_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1); + } + + // Clamp Q to upper and lower limits: + *q = clamp(*q, *q_low, *q_high); + *loop = (*q != last_q); + return; + } + + if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q, + AOMMAX(*q_high, top_index), bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + // Frame is too large + if (rc->projected_frame_size > rc->this_frame_target) { + // Special case if the projected size is > the max allowed. 
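      // (The upper bound is scaled in quantizer-step space in proportion to
      //  the overshoot, new_step = old_step * projected_size / max_bandwidth,
      //  and then mapped back to a q index with av1_find_qindex().)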
+ if (*q == *q_high && + rc->projected_frame_size >= rc->max_frame_bandwidth) { + const double q_val_high_current = + av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth); + const double q_val_high_new = + q_val_high_current * + ((double)rc->projected_frame_size / rc->max_frame_bandwidth); + *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth, + rc->best_quality, rc->worst_quality); + } + + // Raise Qlow as to at least the current value + *q_low = AOMMIN(*q + 1, *q_high); + + if (*undershoot_seen || loop_count > 2 || + (loop_count == 2 && !frame_is_intra_only(cm))) { + av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height); + + *q = (*q_high + *q_low + 1) / 2; + } else if (loop_count == 2 && frame_is_intra_only(cm)) { + const int q_mid = (*q_high + *q_low + 1) / 2; + const int q_regulated = get_regulated_q_overshoot( + cpi, 1, *q_low, *q_high, top_index, bottom_index); + // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth + // transition between loop_count < 2 and loop_count > 2. + *q = (q_mid + q_regulated + 1) / 2; + } else { + *q = get_regulated_q_overshoot(cpi, 1, *q_low, *q_high, top_index, + bottom_index); + } + + *overshoot_seen = 1; + } else { + // Frame is too small + *q_high = AOMMAX(*q - 1, *q_low); + + if (*overshoot_seen || loop_count > 2 || + (loop_count == 2 && !frame_is_intra_only(cm))) { + av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height); + *q = (*q_high + *q_low) / 2; + } else if (loop_count == 2 && frame_is_intra_only(cm)) { + const int q_mid = (*q_high + *q_low) / 2; + const int q_regulated = get_regulated_q_undershoot( + cpi, 1, *q_high, top_index, bottom_index); + // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth + // transition between loop_count < 2 and loop_count > 2. + *q = (q_mid + q_regulated) / 2; + + // Special case reset for qlow for constrained quality. + // This should only trigger where there is very substantial + // undershoot on a frame and the auto cq level is above + // the user passsed in value. + if (rc_cfg->mode == AOM_CQ && q_regulated < *q_low) { + *q_low = *q; + } + } else { + *q = get_regulated_q_undershoot(cpi, 1, *q_high, top_index, + bottom_index); + + // Special case reset for qlow for constrained quality. + // This should only trigger where there is very substantial + // undershoot on a frame and the auto cq level is above + // the user passsed in value. + if (rc_cfg->mode == AOM_CQ && *q < *q_low) { + *q_low = *q; + } + } + + *undershoot_seen = 1; + } + + // Clamp Q to upper and lower limits: + *q = clamp(*q, *q_low, *q_high); + } + + *loop = (*q != last_q); +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RC_UTILS_H_ diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c new file mode 100644 index 0000000000..c2d76e7a9a --- /dev/null +++ b/third_party/aom/av1/encoder/rd.c @@ -0,0 +1,1580 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" +#include "aom_ports/mem.h" +#include "aom_ports/aom_once.h" + +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/nonrd_opt.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" + +#define RD_THRESH_POW 1.25 + +// The baseline rd thresholds for breaking out of the rd loop for +// certain modes are assumed to be based on 8x8 blocks. +// This table is used to correct for block size. +// The factors here are << 2 (2 = x0.5, 32 = x8 etc). +static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = { + 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16 +}; + +static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA] + [EXT_TX_SIZES] = { + { 1, 1, 1, 1 }, // unused + { 1, 1, 0, 0 }, + { 0, 0, 1, 0 }, + }; + +static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER] + [EXT_TX_SIZES] = { + { 1, 1, 1, 1 }, // unused + { 1, 1, 0, 0 }, + { 0, 0, 1, 0 }, + { 0, 1, 1, 1 }, + }; + +static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA, + EXT_TX_SETS_INTER)] = { + { + // Intra + EXT_TX_SET_DCTONLY, + EXT_TX_SET_DTT4_IDTX_1DDCT, + EXT_TX_SET_DTT4_IDTX, + }, + { + // Inter + EXT_TX_SET_DCTONLY, + EXT_TX_SET_ALL16, + EXT_TX_SET_DTT9_IDTX_1DDCT, + EXT_TX_SET_DCT_IDTX, + }, +}; + +void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs, + FRAME_CONTEXT *fc) { + int i, j; + + for (i = 0; i < PARTITION_CONTEXTS; ++i) + av1_cost_tokens_from_cdf(mode_costs->partition_cost[i], + fc->partition_cdf[i], NULL); + + if (cm->current_frame.skip_mode_info.skip_mode_flag) { + for (i = 0; i < SKIP_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->skip_mode_cost[i], + fc->skip_mode_cdfs[i], NULL); + } + } + + for (i = 0; i < SKIP_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->skip_txfm_cost[i], + fc->skip_txfm_cdfs[i], NULL); + } + + for (i = 0; i < KF_MODE_CONTEXTS; ++i) + for (j = 0; j < KF_MODE_CONTEXTS; ++j) + av1_cost_tokens_from_cdf(mode_costs->y_mode_costs[i][j], + fc->kf_y_cdf[i][j], NULL); + + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) + av1_cost_tokens_from_cdf(mode_costs->mbmode_cost[i], fc->y_mode_cdf[i], + NULL); + for (i = 0; i < CFL_ALLOWED_TYPES; ++i) + for (j = 0; j < INTRA_MODES; ++j) + av1_cost_tokens_from_cdf(mode_costs->intra_uv_mode_cost[i][j], + fc->uv_mode_cdf[i][j], NULL); + + av1_cost_tokens_from_cdf(mode_costs->filter_intra_mode_cost, + fc->filter_intra_mode_cdf, NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + if (av1_filter_intra_allowed_bsize(cm, i)) + av1_cost_tokens_from_cdf(mode_costs->filter_intra_cost[i], + fc->filter_intra_cdfs[i], NULL); + } + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + av1_cost_tokens_from_cdf(mode_costs->switchable_interp_costs[i], + fc->switchable_interp_cdf[i], NULL); + + for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->palette_y_size_cost[i], + fc->palette_y_size_cdf[i], NULL); + av1_cost_tokens_from_cdf(mode_costs->palette_uv_size_cost[i], + fc->palette_uv_size_cdf[i], NULL); + for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; 
++j) { + av1_cost_tokens_from_cdf(mode_costs->palette_y_mode_cost[i][j], + fc->palette_y_mode_cdf[i][j], NULL); + } + } + + for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->palette_uv_mode_cost[i], + fc->palette_uv_mode_cdf[i], NULL); + } + + for (i = 0; i < PALETTE_SIZES; ++i) { + for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) { + av1_cost_tokens_from_cdf(mode_costs->palette_y_color_cost[i][j], + fc->palette_y_color_index_cdf[i][j], NULL); + av1_cost_tokens_from_cdf(mode_costs->palette_uv_color_cost[i][j], + fc->palette_uv_color_index_cdf[i][j], NULL); + } + } + + int sign_cost[CFL_JOINT_SIGNS]; + av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL); + for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { + int *cost_u = mode_costs->cfl_cost[joint_sign][CFL_PRED_U]; + int *cost_v = mode_costs->cfl_cost[joint_sign][CFL_PRED_V]; + if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) { + memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u)); + } else { + const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL); + } + if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) { + memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v)); + } else { + const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL); + } + for (int u = 0; u < CFL_ALPHABET_SIZE; u++) + cost_u[u] += sign_cost[joint_sign]; + } + + for (i = 0; i < MAX_TX_CATS; ++i) + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) + av1_cost_tokens_from_cdf(mode_costs->tx_size_cost[i][j], + fc->tx_size_cdf[i][j], NULL); + + for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->txfm_partition_cost[i], + fc->txfm_partition_cdf[i], NULL); + } + + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + int s; + for (s = 1; s < EXT_TX_SETS_INTER; ++s) { + if (use_inter_ext_tx_for_txsize[s][i]) { + av1_cost_tokens_from_cdf( + mode_costs->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i], + av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]); + } + } + for (s = 1; s < EXT_TX_SETS_INTRA; ++s) { + if (use_intra_ext_tx_for_txsize[s][i]) { + for (j = 0; j < INTRA_MODES; ++j) { + av1_cost_tokens_from_cdf( + mode_costs->intra_tx_type_costs[s][i][j], + fc->intra_ext_tx_cdf[s][i][j], + av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]); + } + } + } + } + for (i = 0; i < DIRECTIONAL_MODES; ++i) { + av1_cost_tokens_from_cdf(mode_costs->angle_delta_cost[i], + fc->angle_delta_cdf[i], NULL); + } + av1_cost_tokens_from_cdf(mode_costs->intrabc_cost, fc->intrabc_cdf, NULL); + + for (i = 0; i < SPATIAL_PREDICTION_PROBS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->spatial_pred_cost[i], + fc->seg.spatial_pred_seg_cdf[i], NULL); + } + + for (i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->tmp_pred_cost[i], fc->seg.pred_cdf[i], + NULL); + } + + if (!frame_is_intra_only(cm)) { + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->comp_inter_cost[i], + fc->comp_inter_cdf[i], NULL); + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < SINGLE_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(mode_costs->single_ref_cost[i][j], + fc->single_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->comp_ref_type_cost[i], + fc->comp_ref_type_cdf[i], NULL); + } + + for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) { + for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) { + 
av1_cost_tokens_from_cdf(mode_costs->uni_comp_ref_cost[i][j], + fc->uni_comp_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < FWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(mode_costs->comp_ref_cost[i][j], + fc->comp_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < BWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(mode_costs->comp_bwdref_cost[i][j], + fc->comp_bwdref_cdf[i][j], NULL); + } + } + + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->intra_inter_cost[i], + fc->intra_inter_cdf[i], NULL); + } + + for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->newmv_mode_cost[i], fc->newmv_cdf[i], + NULL); + } + + for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->zeromv_mode_cost[i], + fc->zeromv_cdf[i], NULL); + } + + for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->refmv_mode_cost[i], fc->refmv_cdf[i], + NULL); + } + + for (i = 0; i < DRL_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->drl_mode_cost0[i], fc->drl_cdf[i], + NULL); + } + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + av1_cost_tokens_from_cdf(mode_costs->inter_compound_mode_cost[i], + fc->inter_compound_mode_cdf[i], NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) + av1_cost_tokens_from_cdf(mode_costs->compound_type_cost[i], + fc->compound_type_cdf[i], NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + if (av1_is_wedge_used(i)) { + av1_cost_tokens_from_cdf(mode_costs->wedge_idx_cost[i], + fc->wedge_idx_cdf[i], NULL); + } + } + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->interintra_cost[i], + fc->interintra_cdf[i], NULL); + av1_cost_tokens_from_cdf(mode_costs->interintra_mode_cost[i], + fc->interintra_mode_cdf[i], NULL); + } + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + av1_cost_tokens_from_cdf(mode_costs->wedge_interintra_cost[i], + fc->wedge_interintra_cdf[i], NULL); + } + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { + av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost[i], + fc->motion_mode_cdf[i], NULL); + } + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { + av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost1[i], + fc->obmc_cdf[i], NULL); + } + for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->comp_idx_cost[i], + fc->compound_index_cdf[i], NULL); + } + for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(mode_costs->comp_group_idx_cost[i], + fc->comp_group_idx_cdf[i], NULL); + } + } +} + +void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc) { + av1_cost_tokens_from_cdf(mode_costs->switchable_restore_cost, + fc->switchable_restore_cdf, NULL); + av1_cost_tokens_from_cdf(mode_costs->wiener_restore_cost, + fc->wiener_restore_cdf, NULL); + av1_cost_tokens_from_cdf(mode_costs->sgrproj_restore_cost, + fc->sgrproj_restore_cdf, NULL); +} + +// Values are now correlated to quantizer. +static int sad_per_bit_lut_8[QINDEX_RANGE]; +static int sad_per_bit_lut_10[QINDEX_RANGE]; +static int sad_per_bit_lut_12[QINDEX_RANGE]; + +static void init_me_luts_bd(int *bit16lut, int range, + aom_bit_depth_t bit_depth) { + int i; + // Initialize the sad lut tables using a formulaic calculation for now. + // This is to make it easier to resolve the impact of experimental changes + // to the quantizer tables. 
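  // (Illustrative interpretation: the LUT value for a qindex is, roughly, the
  //  number of SAD units one bit of rate is treated as costing during motion
  //  search, and it grows linearly with the dequantizer step q computed below.)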
+ for (i = 0; i < range; i++) { + const double q = av1_convert_qindex_to_q(i, bit_depth); + bit16lut[i] = (int)(0.0418 * q + 2.4107); + } +} + +static void init_me_luts(void) { + init_me_luts_bd(sad_per_bit_lut_8, QINDEX_RANGE, AOM_BITS_8); + init_me_luts_bd(sad_per_bit_lut_10, QINDEX_RANGE, AOM_BITS_10); + init_me_luts_bd(sad_per_bit_lut_12, QINDEX_RANGE, AOM_BITS_12); +} + +void av1_init_me_luts(void) { aom_once(init_me_luts); } + +static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, + 8, 8, 4, 4, 2, 2, 1, 0 }; + +static const int rd_layer_depth_factor[7] = { + 160, 160, 160, 160, 192, 208, 224 +}; + +// Returns the default rd multiplier for inter frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_inter_rd_multiplier(int qindex) { + return 3.2 + (0.0015 * (double)qindex); +} + +// Returns the default rd multiplier for ARF/Golden Frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_arf_rd_multiplier(int qindex) { + return 3.25 + (0.0015 * (double)qindex); +} + +// Returns the default rd multiplier for key frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_kf_rd_multiplier(int qindex) { + return 3.3 + (0.0015 * (double)qindex); +} + +int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth, + FRAME_UPDATE_TYPE update_type, + int qindex) { + const int q = av1_dc_quant_QTX(qindex, 0, bit_depth); + int64_t rdmult = q * q; + if (update_type == KF_UPDATE) { + double def_rd_q_mult = def_kf_rd_multiplier(q); + rdmult = (int64_t)((double)rdmult * def_rd_q_mult); + } else if ((update_type == GF_UPDATE) || (update_type == ARF_UPDATE)) { + double def_rd_q_mult = def_arf_rd_multiplier(q); + rdmult = (int64_t)((double)rdmult * def_rd_q_mult); + } else { + double def_rd_q_mult = def_inter_rd_multiplier(q); + rdmult = (int64_t)((double)rdmult * def_rd_q_mult); + } + + switch (bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; + case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + return rdmult > 0 ? 
(int)AOMMIN(rdmult, INT_MAX) : 1; +} + +int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth, + const FRAME_UPDATE_TYPE update_type, + const int layer_depth, const int boost_index, + const FRAME_TYPE frame_type, + const int use_fixed_qp_offsets, + const int is_stat_consumption_stage) { + int64_t rdmult = + av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex); + if (is_stat_consumption_stage && !use_fixed_qp_offsets && + (frame_type != KEY_FRAME)) { + // Layer depth adjustment + rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7; + // ARF boost adjustment + rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + } + return (int)rdmult; +} + +int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) { + assert(beta > 0.0); + int q = av1_dc_quant_QTX(qindex, 0, bit_depth); + int newq = (int)rint(q / sqrt(beta)); + int orig_qindex = qindex; + if (newq == q) { + return 0; + } + if (newq < q) { + while (qindex > 0) { + qindex--; + q = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (newq >= q) { + break; + } + } + } else { + while (qindex < MAXQ) { + qindex++; + q = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (newq <= q) { + break; + } + } + } + return qindex - orig_qindex; +} + +int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex, + int curr_qindex) { + curr_qindex = clamp(curr_qindex, delta_q_res, 256 - delta_q_res); + const int sign_deltaq_index = curr_qindex - prev_qindex >= 0 ? 1 : -1; + const int deltaq_deadzone = delta_q_res / 4; + const int qmask = ~(delta_q_res - 1); + int abs_deltaq_index = abs(curr_qindex - prev_qindex); + abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask; + int adjust_qindex = prev_qindex + sign_deltaq_index * abs_deltaq_index; + adjust_qindex = AOMMAX(adjust_qindex, MINQ + 1); + return adjust_qindex; +} + +int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) { + assert(beta > 0.0); + const AV1_COMMON *cm = &cpi->common; + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + + const int qindex_rdmult = cm->quant_params.base_qindex; + return (int)(av1_compute_rd_mult( + qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], + layer_depth, boost_index, frame_type, + cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)) / + beta); +} + +static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { + double q; + switch (bit_depth) { + case AOM_BITS_8: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_8) / 4.0; break; + case AOM_BITS_10: + q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_10) / 16.0; + break; + case AOM_BITS_12: + q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_12) / 64.0; + break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + // TODO(debargha): Adjust the function below. 
+ return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); +} + +void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) { + switch (cpi->common.seq_params->bit_depth) { + case AOM_BITS_8: *sadperbit = sad_per_bit_lut_8[qindex]; break; + case AOM_BITS_10: *sadperbit = sad_per_bit_lut_10[qindex]; break; + case AOM_BITS_12: *sadperbit = sad_per_bit_lut_12[qindex]; break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + } +} + +static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd, + int use_nonrd_pick_mode) { + int i, bsize, segment_id; + THR_MODES mode_indices[RTC_REFS * RTC_MODES] = { 0 }; + int num_modes_count = use_nonrd_pick_mode ? 0 : MAX_MODES; + + if (use_nonrd_pick_mode) { + for (int r_idx = 0; r_idx < RTC_REFS; r_idx++) { + const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0]; + if (ref != INTRA_FRAME) { + for (i = 0; i < RTC_INTER_MODES; i++) + mode_indices[num_modes_count++] = + mode_idx[ref][mode_offset(inter_mode_list[i])]; + } else { + for (i = 0; i < RTC_INTRA_MODES; i++) + mode_indices[num_modes_count++] = + mode_idx[ref][mode_offset(intra_mode_list[i])]; + } + } + } + + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + const int qindex = clamp( + av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) + + cm->quant_params.y_dc_delta_q, + 0, MAXQ); + const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth); + + for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + // Threshold here seems unnecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[]. + const int t = q * rd_thresh_block_size_factor[bsize]; + const int thresh_max = INT_MAX / t; + + for (i = 0; i < num_modes_count; ++i) { + const int mode_index = use_nonrd_pick_mode ? mode_indices[i] : i; + rd->threshes[segment_id][bsize][mode_index] = + rd->thresh_mult[mode_index] < thresh_max + ? 
rd->thresh_mult[mode_index] * t / 4 + : INT_MAX; + } + } + } +} + +void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc, + const int num_planes) { + const int nplanes = AOMMIN(num_planes, PLANE_TYPES); + for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_EOB_COST *pcost = &coeff_costs->eob_costs[eob_multi_size][plane]; + + for (int ctx = 0; ctx < 2; ++ctx) { + aom_cdf_prob *pcdf; + switch (eob_multi_size) { + case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break; + case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break; + case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break; + case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break; + case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break; + case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break; + case 6: + default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break; + } + av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL); + } + } + } + for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_COEFF_COST *pcost = &coeff_costs->coeff_costs[tx_size][plane]; + + for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx], + fc->txb_skip_cdf[tx_size][ctx], NULL); + + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx) + av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx], + fc->coeff_base_eob_cdf[tx_size][plane][ctx], + NULL); + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->base_cost[ctx], + fc->coeff_base_cdf[tx_size][plane][ctx], NULL); + + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { + pcost->base_cost[ctx][4] = 0; + pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] + + av1_cost_literal(1) - + pcost->base_cost[ctx][0]; + pcost->base_cost[ctx][6] = + pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1]; + pcost->base_cost[ctx][7] = + pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2]; + } + + for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx], + fc->eob_extra_cdf[tx_size][plane][ctx], NULL); + + for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx], + fc->dc_sign_cdf[plane][ctx], NULL); + + for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + int br_rate[BR_CDF_SIZE]; + int prev_cost = 0; + int i, j; + av1_cost_tokens_from_cdf( + br_rate, fc->coeff_br_cdf[AOMMIN(tx_size, TX_32X32)][plane][ctx], + NULL); + // printf("br_rate: "); + // for(j = 0; j < BR_CDF_SIZE; j++) + // printf("%4d ", br_rate[j]); + // printf("\n"); + for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) { + for (j = 0; j < BR_CDF_SIZE - 1; j++) { + pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j]; + } + prev_cost += br_rate[j]; + } + pcost->lps_cost[ctx][i] = prev_cost; + // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx); + // for (i = 0; i <= COEFF_BASE_RANGE; i++) + // printf("%5d ", pcost->lps_cost[ctx][i]); + // printf("\n"); + } + for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][0]; + for (int i = 1; i <= COEFF_BASE_RANGE; ++i) { + pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1]; + } + } + } + } +} + +void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, + MvCosts *mv_costs) { + // Avoid accessing 'mv_costs' when it is not allocated. 
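  // (The cost table pointers set up below are biased by MV_MAX so the tables
  //  can be indexed directly with signed motion vector components in
  //  [-MV_MAX, MV_MAX].)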
+ if (mv_costs == NULL) return; + + mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX]; + mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX]; + mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX]; + mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX]; + if (integer_mv) { + mv_costs->mv_cost_stack = (int **)&mv_costs->nmv_cost; + av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, + nmvc, MV_SUBPEL_NONE); + } else { + mv_costs->mv_cost_stack = + usehp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost; + av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack, + nmvc, usehp); + } +} + +void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) { + dv_costs->dv_costs[0] = &dv_costs->dv_costs_alloc[0][MV_MAX]; + dv_costs->dv_costs[1] = &dv_costs->dv_costs_alloc[1][MV_MAX]; + av1_build_nmv_cost_table(dv_costs->joint_mv, dv_costs->dv_costs, ndvc, + MV_SUBPEL_NONE); +} + +// Populates speed features based on codec control settings (of type +// COST_UPDATE_TYPE) and expected speed feature settings (of type +// INTERNAL_COST_UPDATE_TYPE) by considering the least frequent cost update. +// The populated/updated speed features are used for cost updates in the +// encoder. +// WARNING: Population of unified cost update frequency needs to be taken care +// accordingly, in case of any modifications/additions to the enum +// COST_UPDATE_TYPE/INTERNAL_COST_UPDATE_TYPE. +static INLINE void populate_unified_cost_update_freq( + const CostUpdateFreq cost_upd_freq, SPEED_FEATURES *const sf) { + INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf; + // Mapping of entropy cost update frequency from the encoder's codec control + // settings of type COST_UPDATE_TYPE to speed features of type + // INTERNAL_COST_UPDATE_TYPE. + static const INTERNAL_COST_UPDATE_TYPE + map_cost_upd_to_internal_cost_upd[NUM_COST_UPDATE_TYPES] = { + INTERNAL_COST_UPD_SB, INTERNAL_COST_UPD_SBROW, INTERNAL_COST_UPD_TILE, + INTERNAL_COST_UPD_OFF + }; + + inter_sf->mv_cost_upd_level = + AOMMIN(inter_sf->mv_cost_upd_level, + map_cost_upd_to_internal_cost_upd[cost_upd_freq.mv]); + inter_sf->coeff_cost_upd_level = + AOMMIN(inter_sf->coeff_cost_upd_level, + map_cost_upd_to_internal_cost_upd[cost_upd_freq.coeff]); + inter_sf->mode_cost_upd_level = + AOMMIN(inter_sf->mode_cost_upd_level, + map_cost_upd_to_internal_cost_upd[cost_upd_freq.mode]); + sf->intra_sf.dv_cost_upd_level = + AOMMIN(sf->intra_sf.dv_cost_upd_level, + map_cost_upd_to_internal_cost_upd[cost_upd_freq.dv]); +} + +// Checks if entropy costs should be initialized/updated at frame level or not. +static INLINE int is_frame_level_cost_upd_freq_set( + const AV1_COMMON *const cm, const INTERNAL_COST_UPDATE_TYPE cost_upd_level, + const int use_nonrd_pick_mode, const int frames_since_key) { + const int fill_costs = + frame_is_intra_only(cm) || + (use_nonrd_pick_mode ? frames_since_key < 2 + : (cm->current_frame.frame_number & 0x07) == 1); + return ((!use_nonrd_pick_mode && cost_upd_level != INTERNAL_COST_UPD_OFF) || + cost_upd_level == INTERNAL_COST_UPD_TILE || fill_costs); +} + +// Decide whether we want to update the mode entropy cost for the current frame. +// The logit is currently inherited from selective_disable_cdf_rtc. 
+static AOM_INLINE int should_force_mode_cost_update(const AV1_COMP *cpi) { + const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf; + if (!rt_sf->frame_level_mode_cost_update) { + return false; + } + + if (cpi->oxcf.algo_cfg.cdf_update_mode == 2) { + return cpi->frames_since_last_update == 1; + } else if (cpi->oxcf.algo_cfg.cdf_update_mode == 1) { + if (cpi->svc.number_spatial_layers == 1 && + cpi->svc.number_temporal_layers == 1) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + return frame_is_intra_only(cm) || is_frame_resize_pending(cpi) || + rc->high_source_sad || rc->frames_since_key < 10 || + cpi->cyclic_refresh->counter_encode_maxq_scene_change < 10 || + cm->current_frame.frame_number % 8 == 0; + } else if (cpi->svc.number_temporal_layers > 1) { + return cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1; + } + } + + return false; +} + +void av1_initialize_rd_consts(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + SPEED_FEATURES *const sf = &cpi->sf; + RD_OPT *const rd = &cpi->rd; + int use_nonrd_pick_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; + int frames_since_key = cpi->rc.frames_since_key; + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + + const int qindex_rdmult = + cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q; + rd->RDMULT = av1_compute_rd_mult( + qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); +#if CONFIG_RD_COMMAND + if (cpi->oxcf.pass == 2) { + const RD_COMMAND *rd_command = &cpi->rd_command; + if (rd_command->option_ls[rd_command->frame_index] == + RD_OPTION_SET_Q_RDMULT) { + rd->RDMULT = rd_command->rdmult_ls[rd_command->frame_index]; + } + } +#endif // CONFIG_RD_COMMAND + + av1_set_error_per_bit(&x->errorperbit, rd->RDMULT); + + set_block_thresholds(cm, rd, cpi->sf.rt_sf.use_nonrd_pick_mode); + + populate_unified_cost_update_freq(cpi->oxcf.cost_upd_freq, sf); + const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf; + // Frame level mv cost update + if (is_frame_level_cost_upd_freq_set(cm, inter_sf->mv_cost_upd_level, + use_nonrd_pick_mode, frames_since_key)) + av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, x->mv_costs); + + // Frame level coefficient cost update + if (is_frame_level_cost_upd_freq_set(cm, inter_sf->coeff_cost_upd_level, + use_nonrd_pick_mode, frames_since_key)) + av1_fill_coeff_costs(&x->coeff_costs, cm->fc, av1_num_planes(cm)); + + // Frame level mode cost update + if (should_force_mode_cost_update(cpi) || + is_frame_level_cost_upd_freq_set(cm, inter_sf->mode_cost_upd_level, + use_nonrd_pick_mode, frames_since_key)) + av1_fill_mode_rates(cm, &x->mode_costs, cm->fc); + + // Frame level dv cost update + if (av1_need_dv_costs(cpi)) { + if (cpi->td.dv_costs_alloc == NULL) { + CHECK_MEM_ERROR( + cm, cpi->td.dv_costs_alloc, + (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.dv_costs_alloc))); + cpi->td.mb.dv_costs = cpi->td.dv_costs_alloc; + } + av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs); + } +} + +static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { + // NOTE: 
The tables below must be of the same size. + + // The functions described below are sampled at the four most significant + // bits of x^2 + 8 / 256. + + // Normalized rate: + // This table models the rate for a Laplacian source with given variance + // when quantized with a uniform quantizer with given stepsize. The + // closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. + static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, + 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, + 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, + 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, + 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, + 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424, + 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87, + 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6, + 5, 3, 2, 1, 1, 1, 0, 0, + }; + // Normalized distortion: + // This table models the normalized distortion for a Laplacian source + // with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance). + // Note the actual distortion is Dn * variance. + static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, + 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, + 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, + 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142, + 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351, + 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659, + 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936, + 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, + 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, + }; + static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 72, 80, 88, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 256, 288, + 320, 352, 384, 416, 448, 480, 544, 608, 672, + 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, + 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, + 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, + 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, + 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, + 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, + 180192, 196576, 212960, 229344, 245728, + }; + const int tmp = (xsq_q10 >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int one_q10 = 1 << 10; + const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; +} + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, + unsigned int qstep, int *rate, + int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. 
The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. + if (var == 0) { + *rate = 0; + *dist = 0; + } else { + int d_q10, r_q10; + static const uint32_t MAX_XSQ_Q10 = 245727; + const uint64_t xsq_q10_64 = + (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; + const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10); + model_rd_norm(xsq_q10, &r_q10, &d_q10); + *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT); + *dist = (var * (int64_t)d_q10 + 512) >> 10; + } +} + +static double interp_cubic(const double *p, double x) { + return p[1] + 0.5 * x * + (p[2] - p[0] + + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); +} + +/* +static double interp_bicubic(const double *p, int p_stride, double x, + double y) { + double q[4]; + q[0] = interp_cubic(p, x); + q[1] = interp_cubic(p + p_stride, x); + q[2] = interp_cubic(p + 2 * p_stride, x); + q[3] = interp_cubic(p + 3 * p_stride, x); + return interp_cubic(q, y); +} +*/ + +static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 3 +}; + +static int sse_norm_curvfit_model_cat_lookup(double sse_norm) { + return (sse_norm > 16.0); +} + +// Models distortion by sse using a logistic function on +// l = log2(sse / q^2) as: +// dbysse = 16 / (1 + k exp(l + c)) +static double get_dbysse_logistic(double l, double c, double k) { + const double A = 16.0; + const double dbysse = A / (1 + k * exp(l + c)); + return dbysse; +} + +// Models rate using a clamped linear function on +// l = log2(sse / q^2) as: +// rate = max(0, a + b * l) +static double get_rate_clamplinear(double l, double a, double b) { + const double rate = a + b * l; + return (rate < 0 ? 
0 : rate); +} + +static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4 +}; + +static const double surffit_rate_params[9][4] = { + { + 638.390212, + 2.253108, + 166.585650, + -3.939401, + }, + { + 5.256905, + 81.997240, + -1.321771, + 17.694216, + }, + { + -74.193045, + 72.431868, + -19.033152, + 15.407276, + }, + { + 416.770113, + 14.794188, + 167.686830, + -6.997756, + }, + { + 378.511276, + 9.558376, + 154.658843, + -6.635663, + }, + { + 277.818787, + 4.413180, + 150.317637, + -9.893038, + }, + { + 142.212132, + 11.542038, + 94.393964, + -5.518517, + }, + { + 219.100256, + 4.007421, + 108.932852, + -6.981310, + }, + { + 222.261971, + 3.251049, + 95.972916, + -5.609789, + }, +}; + +static const double surffit_dist_params[7] = { 1.475844, 4.328362, -5.680233, + -0.500994, 0.554585, 4.839478, + -0.695837 }; + +static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *rpar) { + const int cat = bsize_surffit_model_cat_lookup[bsize]; + rpar[0] = surffit_rate_params[cat][0] + surffit_rate_params[cat][1] * xm; + rpar[1] = surffit_rate_params[cat][2] + surffit_rate_params[cat][3] * xm; +} + +static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *dpar) { + (void)bsize; + const double *params = surffit_dist_params; + dpar[0] = params[0] + params[1] / (1 + exp((xm + params[2]) * params[3])); + dpar[1] = params[4] + params[5] * exp(params[6] * xm); +} + +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f) { + (void)sse_norm; + double rpar[2], dpar[2]; + rate_surffit_model_params_lookup(bsize, xm, rpar); + dist_surffit_model_params_lookup(bsize, xm, dpar); + + *rate_f = get_rate_clamplinear(yl, rpar[0], rpar[1]); + *distbysse_f = get_dbysse_logistic(yl, dpar[0], dpar[1]); +} + +static const double interp_rgrid_curv[4][65] = { + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 118.257702, 120.210658, 121.434853, 122.100487, + 122.377758, 122.436865, 72.290102, 96.974289, 101.652727, + 126.830141, 140.417377, 157.644879, 184.315291, 215.823873, + 262.300169, 335.919859, 420.624173, 519.185032, 619.854243, + 726.053595, 827.663369, 933.127475, 1037.988755, 1138.839609, + 1233.342933, 1333.508064, 1428.760126, 1533.396364, 1616.952052, + 1744.539319, 1803.413586, 1951.466618, 1994.227838, 2086.031680, + 2148.635443, 2239.068450, 2222.590637, 2338.859809, 2402.929011, + 2418.727875, 2435.342670, 2471.159469, 2523.187446, 2591.183827, + 2674.905840, 2774.110714, 2888.555675, 3017.997952, 3162.194773, + 3320.903365, 3493.880956, 3680.884773, 3881.672045, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 13.087244, 15.919735, 25.930313, 24.412411, + 28.567417, 29.924194, 30.857010, 32.742979, 36.382570, + 39.210386, 42.265690, 47.378572, 57.014850, 82.740067, + 137.346562, 219.968084, 316.781856, 415.643773, 516.706538, + 614.914364, 714.303763, 815.512135, 911.210485, 1008.501528, + 1109.787854, 1213.772279, 1322.922561, 1414.752579, 1510.505641, + 1615.741888, 1697.989032, 1780.123933, 1847.453790, 1913.742309, + 1960.828122, 2047.500168, 2085.454095, 2129.230668, 2158.171824, + 2182.231724, 2217.684864, 2269.589211, 2337.264824, 2420.618694, + 2519.557814, 2633.989178, 2763.819779, 2908.956609, 3069.306660, + 3244.776927, 3435.274401, 
3640.706076, 3860.978945, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 4.656893, 5.123633, 5.594132, 6.162376, + 6.918433, 7.768444, 8.739415, 10.105862, 11.477328, + 13.236604, 15.421030, 19.093623, 25.801871, 46.724612, + 98.841054, 181.113466, 272.586364, 359.499769, 445.546343, + 525.944439, 605.188743, 681.793483, 756.668359, 838.486885, + 926.950356, 1015.482542, 1113.353926, 1204.897193, 1288.871992, + 1373.464145, 1455.746628, 1527.796460, 1588.475066, 1658.144771, + 1710.302500, 1807.563351, 1863.197608, 1927.281616, 1964.450872, + 2022.719898, 2100.041145, 2185.205712, 2280.993936, 2387.616216, + 2505.282950, 2634.204540, 2774.591385, 2926.653884, 3090.602436, + 3266.647443, 3454.999303, 3655.868416, 3869.465182, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.337370, 0.391916, 0.468839, 0.566334, + 0.762564, 1.069225, 1.384361, 1.787581, 2.293948, + 3.251909, 4.412991, 8.050068, 11.606073, 27.668092, + 65.227758, 128.463938, 202.097653, 262.715851, 312.464873, + 355.601398, 400.609054, 447.201352, 495.761568, 552.871938, + 619.067625, 691.984883, 773.753288, 860.628503, 946.262808, + 1019.805896, 1106.061360, 1178.422145, 1244.852258, 1302.173987, + 1399.650266, 1548.092912, 1545.928652, 1670.817500, 1694.523823, + 1779.195362, 1882.155494, 1990.662097, 2108.325181, 2235.456119, + 2372.366287, 2519.367059, 2676.769812, 2844.885918, 3024.026754, + 3214.503695, 3416.628115, 3630.711389, 3857.064892, 4096.000000, + }, +}; + +static const double interp_dgrid_curv[3][65] = { + { + 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770, + 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870, + 15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387, + 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790, + 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064, + 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123, + 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, 0.000000, + }, + { + 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501, + 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967, + 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212, + 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519, + 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412, + 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825, + 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, -0.000000, + }, +}; + +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f) { + const double x_start = -15.5; + const double x_end = 16.5; + const double x_step = 0.5; + const double epsilon = 1e-6; + const int rcat = bsize_curvfit_model_cat_lookup[bsize]; + const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm); + (void)x_end; + + xqr = AOMMAX(xqr, x_start + x_step + epsilon); + xqr = AOMMIN(xqr, x_end 
- x_step - epsilon); + const double x = (xqr - x_start) / x_step; + const int xi = (int)floor(x); + const double xo = x - xi; + + assert(xi > 0); + + const double *prate = &interp_rgrid_curv[rcat][(xi - 1)]; + *rate_f = interp_cubic(prate, xo); + const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)]; + *distbysse_f = interp_cubic(pdist, xo); +} + +static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { + const int num_4x4_w = mi_size_wide[plane_bsize]; + const int num_4x4_h = mi_size_high[plane_bsize]; + const ENTROPY_CONTEXT *const above = pd->above_entropy_context; + const ENTROPY_CONTEXT *const left = pd->left_entropy_context; + + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); +} + +void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { + assert(plane_bsize < BLOCK_SIZES_ALL); + get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left); +} + +// Special clamping used in the encoder when calculating a prediction +// +// Logically, all pixel fetches used for prediction are clamped against the +// edges of the frame. But doing this directly is slow, so instead we allocate +// a finite border around the frame and fill it with copies of the outermost +// pixels. +// +// Since this border is finite, we need to clamp the motion vector before +// prediction in order to avoid out-of-bounds reads. At the same time, this +// clamp must not change the prediction result. +// +// We can balance both of these concerns by calculating how far we would have +// to go in each direction before the extended prediction region (the current +// block + AOM_INTERP_EXTEND many pixels around the block) would be mapped +// so that it touches the frame only at one row or column. This is a special +// point because any more extreme MV will always lead to the same prediction. +// So it is safe to clamp at that point. +// +// In the worst case, this requires a border of +// max_block_width + 2*AOM_INTERP_EXTEND = 128 + 2*4 = 136 pixels +// around the frame edges. 
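To make the bound concrete, here is a small illustrative sketch of the same arithmetic for a hypothetical block; it assumes GET_MV_SUBPEL() converts whole pixels to 1/8-pel units (i.e. multiplies by 8) and uses AOM_INTERP_EXTEND == 4 as stated above. enc_clamp_mv() below is the actual implementation.

/* A 128-wide block whose left edge sits 64 px inside the frame: after moving
 * 64 + 128 + 4 = 196 px to the left, the extended prediction region lies
 * entirely in the replicated left border, so any more-negative column MV
 * yields an identical prediction and can be clamped away losslessly. */
#include <stdio.h>

int main(void) {
  const int bw = 128;              // block width in pixels
  const int px_to_left_edge = 64;  // block's distance from the left frame edge
  const int interp_extend = 4;     // AOM_INTERP_EXTEND (assumed value)
  const int col_min_px = -(px_to_left_edge + bw + interp_extend);
  printf("col_min = %d px = %d (1/8-pel units)\n", col_min_px, col_min_px * 8);
  return 0;
}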
+static INLINE void enc_clamp_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, + MV *mv) { + int bw = xd->width << MI_SIZE_LOG2; + int bh = xd->height << MI_SIZE_LOG2; + + int px_to_left_edge = xd->mi_col << MI_SIZE_LOG2; + int px_to_right_edge = (cm->mi_params.mi_cols - xd->mi_col) << MI_SIZE_LOG2; + int px_to_top_edge = xd->mi_row << MI_SIZE_LOG2; + int px_to_bottom_edge = (cm->mi_params.mi_rows - xd->mi_row) << MI_SIZE_LOG2; + + const SubpelMvLimits mv_limits = { + .col_min = -GET_MV_SUBPEL(px_to_left_edge + bw + AOM_INTERP_EXTEND), + .col_max = GET_MV_SUBPEL(px_to_right_edge + AOM_INTERP_EXTEND), + .row_min = -GET_MV_SUBPEL(px_to_top_edge + bh + AOM_INTERP_EXTEND), + .row_max = GET_MV_SUBPEL(px_to_bottom_edge + AOM_INTERP_EXTEND) + }; + clamp_mv(mv, &mv_limits); +} + +void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, + int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { + const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + const int_mv ref_mv = + av1_get_ref_mv_from_stack(0, ref_frames, 0, &x->mbmi_ext); + const int_mv ref_mv1 = + av1_get_ref_mv_from_stack(0, ref_frames, 1, &x->mbmi_ext); + MV pred_mv[MAX_MV_REF_CANDIDATES + 1]; + int num_mv_refs = 0; + pred_mv[num_mv_refs++] = ref_mv.as_mv; + if (ref_mv.as_int != ref_mv1.as_int) { + pred_mv[num_mv_refs++] = ref_mv1.as_mv; + } + + assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0]))); + + const uint8_t *const src_y_ptr = x->plane[0].src.buf; + int zero_seen = 0; + int best_sad = INT_MAX; + int max_mv = 0; + // Get the sad for each candidate reference mv. + for (int i = 0; i < num_mv_refs; ++i) { + MV *this_mv = &pred_mv[i]; + enc_clamp_mv(&cpi->common, &x->e_mbd, this_mv); + + const int fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; + const int fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; + max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3); + + if (fp_row == 0 && fp_col == 0 && zero_seen) continue; + zero_seen |= (fp_row == 0 && fp_col == 0); + + const uint8_t *const ref_y_ptr = + &ref_y_buffer[ref_y_stride * fp_row + fp_col]; + // Find sad for current vector. + const int this_sad = cpi->ppi->fn_ptr[block_size].sdf( + src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride); + // Note if it is the best so far. + if (this_sad < best_sad) { + best_sad = this_sad; + } + if (i == 0) + x->pred_mv0_sad[ref_frame] = this_sad; + else if (i == 1) + x->pred_mv1_sad[ref_frame] = this_sad; + } + + // Note the index of the mv that worked best in the reference list. + x->max_mv_context[ref_frame] = max_mv; + x->pred_mv_sad[ref_frame] = best_sad; +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + const struct scale_factors *scale, + const struct scale_factors *scale_uv, + const int num_planes) { + dst[0].buf = src->y_buffer; + dst[0].stride = src->y_stride; + dst[1].buf = src->u_buffer; + dst[2].buf = src->v_buffer; + dst[1].stride = dst[2].stride = src->uv_stride; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + for (int i = 0; i < num_planes; ++i) { + setup_pred_plane(dst + i, xd->mi[0]->bsize, dst[i].buf, + i ? src->uv_crop_width : src->y_crop_width, + i ? src->uv_crop_height : src->y_crop_height, + dst[i].stride, mi_row, mi_col, i ? 
scale_uv : scale, + xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); + } +} + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi, + int ref_frame) { + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1]; + const RefCntBuffer *const ref_buf = + get_ref_frame_buf(&cpi->common, ref_frame); + return (scaled_buf != ref_buf && scaled_buf != NULL) ? &scaled_buf->buf + : NULL; +} + +int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, + InterpFilter interp_filter, int dual_filter) { + if (interp_filter == SWITCHABLE) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int inter_filter_cost = 0; + for (int dir = 0; dir < 2; ++dir) { + if (dir && !dual_filter) break; + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + const InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx][filter]; + } + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; + } else { + return 0; + } +} + +void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { + RD_OPT *const rd = &cpi->rd; + + // Set baseline threshold values. + av1_zero(rd->thresh_mult); + + rd->thresh_mult[THR_NEARESTMV] = 300; + rd->thresh_mult[THR_NEARESTL2] = 300; + rd->thresh_mult[THR_NEARESTL3] = 300; + rd->thresh_mult[THR_NEARESTB] = 300; + rd->thresh_mult[THR_NEARESTA2] = 300; + rd->thresh_mult[THR_NEARESTA] = 300; + rd->thresh_mult[THR_NEARESTG] = 300; + + rd->thresh_mult[THR_NEWMV] = 1000; + rd->thresh_mult[THR_NEWL2] = 1000; + rd->thresh_mult[THR_NEWL3] = 1000; + rd->thresh_mult[THR_NEWB] = 1000; + rd->thresh_mult[THR_NEWA2] = 1100; + rd->thresh_mult[THR_NEWA] = 1000; + rd->thresh_mult[THR_NEWG] = 1000; + + rd->thresh_mult[THR_NEARMV] = 1000; + rd->thresh_mult[THR_NEARL2] = 1000; + rd->thresh_mult[THR_NEARL3] = 1000; + rd->thresh_mult[THR_NEARB] = 1000; + rd->thresh_mult[THR_NEARA2] = 1000; + rd->thresh_mult[THR_NEARA] = 1000; + rd->thresh_mult[THR_NEARG] = 1000; + + rd->thresh_mult[THR_GLOBALMV] = 2200; + rd->thresh_mult[THR_GLOBALL2] = 2000; + rd->thresh_mult[THR_GLOBALL3] = 2000; + rd->thresh_mult[THR_GLOBALB] = 2400; + rd->thresh_mult[THR_GLOBALA2] = 2000; + rd->thresh_mult[THR_GLOBALG] = 2000; + rd->thresh_mult[THR_GLOBALA] = 2400; + + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] = 1100; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] = 800; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] = 900; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] = 1000; + + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] = 2000; + + rd->thresh_mult[THR_COMP_NEAR_NEARLA] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLA] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA] = 1530; + rd->thresh_mult[THR_COMP_NEW_NEARLA] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEWLA] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] = 
2750; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2A] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEARL2A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2A] = 1800; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3A] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3A] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] = 3000; + + rd->thresh_mult[THR_COMP_NEAR_NEARGA] = 1320; + rd->thresh_mult[THR_COMP_NEAREST_NEWGA] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGA] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGA] = 2040; + rd->thresh_mult[THR_COMP_NEW_NEARGA] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGA] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] = 2250; + + rd->thresh_mult[THR_COMP_NEAR_NEARLB] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLB] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLB] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLB] = 1360; + rd->thresh_mult[THR_COMP_NEW_NEARLB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLB] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] = 2250; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2B] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2B] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3B] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3B] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEARL3B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3B] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARGB] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWGB] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGB] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGB] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARLA2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] = 1800; + rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARLA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLA2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2A2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] = 1440; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3A2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARGA2] = 1200; + 
rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGA2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] = 2750; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL2] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLL2] = 2640; + rd->thresh_mult[THR_COMP_NEW_NEARLL2] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL2] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL3] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] = 1800; + rd->thresh_mult[THR_COMP_NEAR_NEWLL3] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLL3] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL3] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLG] = 1760; + rd->thresh_mult[THR_COMP_NEAREST_NEWLG] = 2400; + rd->thresh_mult[THR_COMP_NEW_NEARESTLG] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLG] = 1760; + rd->thresh_mult[THR_COMP_NEW_NEARLG] = 2640; + rd->thresh_mult[THR_COMP_NEW_NEWLG] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARBA] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWBA] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTBA] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWBA] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEARBA] = 1980; + rd->thresh_mult[THR_COMP_NEW_NEWBA] = 2640; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] = 3200; + + rd->thresh_mult[THR_DC] = 1000; + rd->thresh_mult[THR_PAETH] = 1000; + rd->thresh_mult[THR_SMOOTH] = 2200; + rd->thresh_mult[THR_SMOOTH_V] = 2000; + rd->thresh_mult[THR_SMOOTH_H] = 2000; + rd->thresh_mult[THR_H_PRED] = 2000; + rd->thresh_mult[THR_V_PRED] = 1800; + rd->thresh_mult[THR_D135_PRED] = 2500; + rd->thresh_mult[THR_D203_PRED] = 2000; + rd->thresh_mult[THR_D157_PRED] = 2500; + rd->thresh_mult[THR_D67_PRED] = 2000; + rd->thresh_mult[THR_D113_PRED] = 2500; + rd->thresh_mult[THR_D45_PRED] = 2500; +} + +static INLINE void update_thr_fact(int (*factor_buf)[MAX_MODES], + THR_MODES best_mode_index, + THR_MODES mode_start, THR_MODES mode_end, + BLOCK_SIZE min_size, BLOCK_SIZE max_size, + int max_rd_thresh_factor) { + for (THR_MODES mode = mode_start; mode < mode_end; ++mode) { + for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) { + int *const fact = &factor_buf[bs][mode]; + if (mode == best_mode_index) { + *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR); + } else { + *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor); + } + } + } +} + +void av1_update_rd_thresh_fact( + const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES], + int use_adaptive_rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index, + THR_MODES inter_mode_start, THR_MODES inter_mode_end, + THR_MODES intra_mode_start, THR_MODES intra_mode_end) { + assert(use_adaptive_rd_thresh > 0); + const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT; + + const int bsize_is_1_to_4 = bsize > cm->seq_params->sb_size; + BLOCK_SIZE min_size, max_size; + if (bsize_is_1_to_4) { + // This part handles block sizes with 1:4 and 4:1 aspect ratios + // TODO(any): Experiment with threshold update for parent/child blocks + min_size = bsize; + max_size = bsize; + } else { + min_size = AOMMAX(bsize - 2, BLOCK_4X4); + max_size = AOMMIN(bsize + 2, 
(int)cm->seq_params->sb_size); + } + + update_thr_fact(factor_buf, best_mode_index, inter_mode_start, inter_mode_end, + min_size, max_size, max_rd_thresh_factor); + update_thr_fact(factor_buf, best_mode_index, intra_mode_start, intra_mode_end, + min_size, max_size, max_rd_thresh_factor); +} + +int av1_get_intra_cost_penalty(int qindex, int qdelta, + aom_bit_depth_t bit_depth) { + const int q = av1_dc_quant_QTX(qindex, qdelta, bit_depth); + switch (bit_depth) { + case AOM_BITS_8: return 20 * q; + case AOM_BITS_10: return 5 * q; + case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h new file mode 100644 index 0000000000..b38d9ca542 --- /dev/null +++ b/third_party/aom/av1/encoder/rd.h @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RD_H_ +#define AOM_AV1_ENCODER_RD_H_ + +#include + +#include "av1/common/blockd.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RDDIV_BITS 7 +#define RD_EPB_SHIFT 6 + +#define RDCOST(RM, R, D) \ + (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \ + ((D) * (1 << RDDIV_BITS))) + +#define RDCOST_NEG_R(RM, R, D) \ + (((D) * (1 << RDDIV_BITS)) - \ + ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT)) + +#define RDCOST_DBL_WITH_NATIVE_BD_DIST(RM, R, D, BD) \ + (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \ + ((double)((D) >> (2 * (BD - 8))) * (1 << RDDIV_BITS))) + +#define QIDX_SKIP_THRESH 115 + +#define MV_COST_WEIGHT 108 +#define MV_COST_WEIGHT_SUB 120 + +// The fractional part of rd_thresh factor is stored with 5 bits. The maximum +// factor that we allow is two, which is stored as 2 ** (5+1) = 64 +#define RD_THRESH_FAC_FRAC_BITS (5) +#define RD_THRESH_FAC_FRAC_VAL (1 << (RD_THRESH_FAC_FRAC_BITS)) +#define RD_THRESH_MAX_FACT ((RD_THRESH_FAC_FRAC_VAL) << 1) +#define RD_THRESH_LOG_DEC_FACTOR (4) +#define RD_THRESH_INC (1) + +// Factor to weigh the rate for switchable interp filters. +#define SWITCHABLE_INTERP_RATE_FACTOR 1 + +// Macros for common video resolutions: width x height +// For example, 720p represents video resolution of 1280x720 pixels. 
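As a quick worked example of the RDCOST macro defined above (a sketch only: it assumes the usual AV1 cost convention of AV1_PROB_COST_SHIFT == 9, i.e. 512 rate units per bit, copies RDDIV_BITS == 7 from this header, and the sketch_ names are purely illustrative):

#include <stdint.h>
#include <stdio.h>

#define SKETCH_PROB_COST_SHIFT 9  // assumed value of AV1_PROB_COST_SHIFT
#define SKETCH_RDDIV_BITS 7       // RDDIV_BITS, as defined above

static int64_t sketch_rdcost(int64_t rdmult, int64_t rate, int64_t dist) {
  // Mirrors RDCOST(): rounded (rate * rdmult) >> 9 plus distortion << 7.
  const int64_t rate_term =
      (rate * rdmult + (1 << (SKETCH_PROB_COST_SHIFT - 1))) >>
      SKETCH_PROB_COST_SHIFT;
  return rate_term + (dist << SKETCH_RDDIV_BITS);
}

int main(void) {
  // 1024 cost units = 2 bits of rate; distortion 100 (e.g. sum of squared
  // error): rate term = (1024 * 512 + 256) >> 9 = 1024, dist term = 12800.
  printf("%lld\n", (long long)sketch_rdcost(512, 1024, 100));  // prints 13824
  return 0;
}

A larger rdmult therefore weights the rate term more heavily relative to distortion, which is how the encoder shifts the rate/quality trade-off per frame.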
+#define RESOLUTION_288P 352 * 288 +#define RESOLUTION_360P 640 * 360 +#define RESOLUTION_480P 640 * 480 +#define RESOLUTION_720P 1280 * 720 +#define RESOLUTION_1080P 1920 * 1080 +#define RESOLUTION_1440P 2560 * 1440 +#define RESOLUTION_4K 3840 * 2160 + +#define RTC_REFS 4 +static const MV_REFERENCE_FRAME real_time_ref_combos[RTC_REFS][2] = { + { LAST_FRAME, NONE_FRAME }, + { ALTREF_FRAME, NONE_FRAME }, + { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME } +}; + +static INLINE int mode_offset(const PREDICTION_MODE mode) { + if (mode >= NEARESTMV) { + return INTER_OFFSET(mode); + } else { + switch (mode) { + case DC_PRED: return 0; + case V_PRED: return 1; + case H_PRED: return 2; + case SMOOTH_PRED: return 3; + default: assert(0); return -1; + } + } +} + +enum { + // Default initialization when we are not using winner mode framework. e.g. + // intrabc + DEFAULT_EVAL = 0, + // Initialization for selecting winner mode + MODE_EVAL, + // Initialization for winner mode evaluation + WINNER_MODE_EVAL, + // All mode evaluation types + MODE_EVAL_TYPES, +} UENUM1BYTE(MODE_EVAL_TYPE); + +typedef struct RD_OPT { + // Thresh_mult is used to set a threshold for the rd score. A higher value + // means that we will accept the best mode so far more often. This number + // is used in combination with the current block size, and thresh_freq_fact + // to pick a threshold. + int thresh_mult[MAX_MODES]; + + int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES]; + + int RDMULT; + + double r0; +} RD_OPT; + +static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = 0; + rd_stats->dist = 0; + rd_stats->rdcost = 0; + rd_stats->sse = 0; + rd_stats->skip_txfm = 1; + rd_stats->zero_rate = 0; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = 0; + } +#endif +} + +static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = INT_MAX; + rd_stats->dist = INT64_MAX; + rd_stats->rdcost = INT64_MAX; + rd_stats->sse = INT64_MAX; + rd_stats->skip_txfm = 0; + rd_stats->zero_rate = 0; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = INT_MAX; + } +#endif +} + +static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, + const RD_STATS *rd_stats_src) { + if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) { + // If rd_stats_dst or rd_stats_src has invalid rate, we will make + // rd_stats_dst invalid. 
+ av1_invalid_rd_stats(rd_stats_dst); + return; + } + rd_stats_dst->rate = (int)AOMMIN( + ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX); + if (!rd_stats_dst->zero_rate) + rd_stats_dst->zero_rate = rd_stats_src->zero_rate; + rd_stats_dst->dist += rd_stats_src->dist; + if (rd_stats_dst->sse < INT64_MAX && rd_stats_src->sse < INT64_MAX) { + rd_stats_dst->sse += rd_stats_src->sse; + } + rd_stats_dst->skip_txfm &= rd_stats_src->skip_txfm; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; + } +#endif +} + +static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist, + int rate, int skip_txfm, int64_t sse, + int zero_rate) { + assert(rd_stats->rate != INT_MAX && rate != INT_MAX); + rd_stats->rate += rate; + if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate; + rd_stats->dist += dist; + rd_stats->skip_txfm &= skip_txfm; + rd_stats->sse += sse; +} + +static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) { + assert(mult >= 0); + if (rate >= 0) { + return RDCOST(mult, rate, dist); + } + return RDCOST_NEG_R(mult, -rate, dist); +} + +static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) { + if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX && + rd_cost->rdcost < INT64_MAX) { + rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist); + } else { + av1_invalid_rd_stats(rd_cost); + } +} + +static INLINE void av1_rd_stats_subtraction(int mult, + const RD_STATS *const left, + const RD_STATS *const right, + RD_STATS *result) { + if (left->rate == INT_MAX || right->rate == INT_MAX || + left->dist == INT64_MAX || right->dist == INT64_MAX || + left->rdcost == INT64_MAX || right->rdcost == INT64_MAX) { + av1_invalid_rd_stats(result); + } else { + result->rate = left->rate - right->rate; + result->dist = left->dist - right->dist; + result->rdcost = av1_calculate_rd_cost(mult, result->rate, result->dist); + } +} + +struct TileInfo; +struct TileDataEnc; +struct AV1_COMP; +struct macroblock; + +/*!\brief Compute rdmult based on q index and frame update type + * + * \param[in] bit_depth bit depth + * \param[in] update_type frame update type + * \param[in] qindex q index + * + * \return rdmult + */ +int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth, + FRAME_UPDATE_TYPE update_type, + int qindex); + +int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth, + const FRAME_UPDATE_TYPE update_type, + const int layer_depth, const int boost_index, + const FRAME_TYPE frame_type, + const int use_fixed_qp_offsets, + const int is_stat_consumption_stage); + +void av1_initialize_rd_consts(struct AV1_COMP *cpi); + +// Sets the multiplier to convert mv cost to l1 error during motion search. 
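A minimal usage sketch of the RD_STATS helpers above (hypothetical values; this only compiles inside the encoder tree):

#include "av1/encoder/rd.h"

static int64_t sketch_block_rdcost(int rdmult) {
  RD_STATS stats;
  av1_init_rd_stats(&stats);  // rate = 0, dist = 0, skip_txfm = 1
  // Pretend two transform blocks contributed rate and distortion:
  av1_accumulate_rd_stats(&stats, /*dist=*/400, /*rate=*/120,
                          /*skip_txfm=*/0, /*sse=*/500, /*zero_rate=*/0);
  av1_accumulate_rd_stats(&stats, /*dist=*/250, /*rate=*/80,
                          /*skip_txfm=*/0, /*sse=*/300, /*zero_rate=*/0);
  // Fills stats.rdcost, i.e. RDCOST(rdmult, 200, 650) here.
  av1_rd_cost_update(rdmult, &stats);
  return stats.rdcost;
}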
+void av1_set_sad_per_bit(const struct AV1_COMP *cpi, int *sadperbit, + int qindex); + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n, + unsigned int qstep, int *rate, int64_t *dist); + +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f); +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f); + +int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, + InterpFilter interp_filter, int dual_filter); + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi, + int ref_frame); + +void av1_init_me_luts(void); + +void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx); + +void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]); + +void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi); + +void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, + int (*fact)[MAX_MODES], int rd_thresh, + BLOCK_SIZE bsize, THR_MODES best_mode_index, + THR_MODES inter_mode_start, + THR_MODES inter_mode_end, + THR_MODES intra_mode_start, + THR_MODES intra_mode_end); + +static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + for (int j = 0; j < MAX_MODES; ++j) { + x->thresh_freq_fact[i][j] = RD_THRESH_FAC_FRAC_VAL; + } + } +} + +static INLINE int rd_less_than_thresh(int64_t best_rd, int64_t thresh, + int thresh_fact) { + return best_rd < (thresh * thresh_fact >> 5) || thresh == INT_MAX; +} + +void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x, + uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, + BLOCK_SIZE block_size); + +// Sets the multiplier to convert mv cost to l2 error during motion search. 
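To illustrate the adaptive-threshold arithmetic behind rd_less_than_thresh() and av1_update_rd_thresh_fact(), here is a standalone sketch that re-uses the constants defined earlier in this header (the numeric values are made up):

#include <stdio.h>

#define FRAC_BITS 5       // RD_THRESH_FAC_FRAC_BITS: factor is Q5, 32 == 1.0x
#define MAX_FACT 64       // RD_THRESH_MAX_FACT (use_adaptive_rd_thresh == 1)
#define LOG_DEC_FACTOR 4  // RD_THRESH_LOG_DEC_FACTOR

int main(void) {
  int fact = 1 << FRAC_BITS;      // 32: neutral factor of 1.0
  const long long thresh = 1000;  // some thresh_mult-derived per-mode threshold

  // The mode keeps losing: the factor creeps up by RD_THRESH_INC per decision,
  // capped at MAX_FACT, so the pruning bar rises toward 2x the base threshold.
  for (int i = 0; i < 8; ++i) fact = (fact + 1 > MAX_FACT) ? MAX_FACT : fact + 1;
  printf("bar after 8 losses: %lld\n", (thresh * fact) >> FRAC_BITS);  // 1250

  // The mode wins once: the factor is knocked back down by fact >> 4.
  fact -= fact >> LOG_DEC_FACTOR;  // 40 - 2 = 38
  printf("bar after a win:    %lld\n", (thresh * fact) >> FRAC_BITS);  // 1187
  return 0;
}

A mode is pruned when the best RD cost found so far is already below this scaled threshold, so modes that keep losing become progressively easier to skip.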
+static INLINE void av1_set_error_per_bit(int *errorperbit, int rdmult) { + *errorperbit = AOMMAX(rdmult >> RD_EPB_SHIFT, 1); +} + +// Get the threshold for R-D optimization of coefficients depending upon mode +// decision/winner mode processing +static INLINE void get_rd_opt_coeff_thresh( + const uint32_t (*const coeff_opt_threshold)[2], + TxfmSearchParams *txfm_params, int enable_winner_mode_for_coeff_opt, + int is_winner_mode) { + if (!enable_winner_mode_for_coeff_opt) { + // Default initialization of threshold + txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[DEFAULT_EVAL][0]; + txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[DEFAULT_EVAL][1]; + return; + } + // TODO(any): Experiment with coeff_opt_dist_threshold values when + // enable_winner_mode_for_coeff_opt is ON + // TODO(any): Skip the winner mode processing for blocks with lower residual + // energy as R-D optimization of coefficients would have been enabled during + // mode decision + + // Use conservative threshold during mode decision and perform R-D + // optimization of coeffs always for winner modes + if (is_winner_mode) { + txfm_params->coeff_opt_thresholds[0] = + coeff_opt_threshold[WINNER_MODE_EVAL][0]; + txfm_params->coeff_opt_thresholds[1] = + coeff_opt_threshold[WINNER_MODE_EVAL][1]; + } else { + txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[MODE_EVAL][0]; + txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[MODE_EVAL][1]; + } +} + +// Used to reset the state of mb rd hash information +static INLINE void reset_mb_rd_record(MB_RD_RECORD *const mb_rd_record) { + if (!mb_rd_record) return; + + // Reset the state for use_mb_rd_hash + mb_rd_record->num = mb_rd_record->index_start = 0; +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + const struct scale_factors *scale, + const struct scale_factors *scale_uv, + const int num_planes); + +int av1_get_intra_cost_penalty(int qindex, int qdelta, + aom_bit_depth_t bit_depth); + +void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs, + FRAME_CONTEXT *fc); + +void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc); + +void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc, + const int num_planes); + +void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, + MvCosts *mv_costs); + +void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs); + +int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta); + +int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta); + +/*!\brief Adjust current superblock's q_index based on delta q resolution + * + * \param[in] delta_q_res delta q resolution + * \param[in] prev_qindex previous superblock's q index + * \param[in] curr_qindex current superblock's q index + * + * \return the current superblock's adjusted q_index + */ +int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex, + int curr_qindex); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RD_H_ diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c new file mode 100644 index 0000000000..c17fbccf8c --- /dev/null +++ b/third_party/aom/av1/encoder/rdopt.c @@ -0,0 +1,6598 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/blockd.h" +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/idct.h" +#include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" +#include "av1/common/txb_common.h" +#include "av1/common/warped_motion.h" + +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/compound_type.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/interp_search.h" +#include "av1/encoder/intra_mode_search.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/ml.h" +#include "av1/encoder/mode_prune_model_weights.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/palette.h" +#include "av1/encoder/pustats.h" +#include "av1/encoder/random.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tokenize.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/tx_search.h" +#include "av1/encoder/var_based_part.h" + +#define LAST_NEW_MV_INDEX 6 + +// Mode_threshold multiplication factor table for prune_inter_modes_if_skippable +// The values are kept in Q12 format and equation used to derive is +// (2.5 - ((float)x->qindex / MAXQ) * 1.5) +#define MODE_THRESH_QBITS 12 +static const int mode_threshold_mul_factor[QINDEX_RANGE] = { + 10240, 10216, 10192, 10168, 10144, 10120, 10095, 10071, 10047, 10023, 9999, + 9975, 9951, 9927, 9903, 9879, 9854, 9830, 9806, 9782, 9758, 9734, + 9710, 9686, 9662, 9638, 9614, 9589, 9565, 9541, 9517, 9493, 9469, + 9445, 9421, 9397, 9373, 9349, 9324, 9300, 9276, 9252, 9228, 9204, + 9180, 9156, 9132, 9108, 9083, 9059, 9035, 9011, 8987, 8963, 8939, + 8915, 8891, 8867, 8843, 8818, 8794, 8770, 8746, 8722, 8698, 8674, + 8650, 8626, 8602, 8578, 8553, 8529, 8505, 8481, 8457, 8433, 8409, + 8385, 8361, 8337, 8312, 8288, 8264, 8240, 8216, 8192, 8168, 8144, + 8120, 8096, 8072, 8047, 8023, 7999, 7975, 7951, 7927, 7903, 7879, + 7855, 7831, 7806, 7782, 7758, 7734, 7710, 7686, 7662, 7638, 7614, + 7590, 7566, 7541, 7517, 7493, 7469, 7445, 7421, 7397, 7373, 7349, + 7325, 7301, 7276, 7252, 7228, 7204, 7180, 7156, 7132, 7108, 7084, + 7060, 7035, 7011, 6987, 6963, 6939, 6915, 6891, 6867, 6843, 6819, + 6795, 6770, 6746, 6722, 6698, 6674, 6650, 
6626, 6602, 6578, 6554, + 6530, 6505, 6481, 6457, 6433, 6409, 6385, 6361, 6337, 6313, 6289, + 6264, 6240, 6216, 6192, 6168, 6144, 6120, 6096, 6072, 6048, 6024, + 5999, 5975, 5951, 5927, 5903, 5879, 5855, 5831, 5807, 5783, 5758, + 5734, 5710, 5686, 5662, 5638, 5614, 5590, 5566, 5542, 5518, 5493, + 5469, 5445, 5421, 5397, 5373, 5349, 5325, 5301, 5277, 5253, 5228, + 5204, 5180, 5156, 5132, 5108, 5084, 5060, 5036, 5012, 4987, 4963, + 4939, 4915, 4891, 4867, 4843, 4819, 4795, 4771, 4747, 4722, 4698, + 4674, 4650, 4626, 4602, 4578, 4554, 4530, 4506, 4482, 4457, 4433, + 4409, 4385, 4361, 4337, 4313, 4289, 4265, 4241, 4216, 4192, 4168, + 4144, 4120, 4096 +}; + +static const THR_MODES av1_default_mode_order[MAX_MODES] = { + THR_NEARESTMV, + THR_NEARESTL2, + THR_NEARESTL3, + THR_NEARESTB, + THR_NEARESTA2, + THR_NEARESTA, + THR_NEARESTG, + + THR_NEWMV, + THR_NEWL2, + THR_NEWL3, + THR_NEWB, + THR_NEWA2, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARL2, + THR_NEARL3, + THR_NEARB, + THR_NEARA2, + THR_NEARA, + THR_NEARG, + + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, + + THR_COMP_NEAREST_NEARESTLA, + THR_COMP_NEAREST_NEARESTL2A, + THR_COMP_NEAREST_NEARESTL3A, + THR_COMP_NEAREST_NEARESTGA, + THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTLA2, + THR_COMP_NEAREST_NEARESTL2A2, + THR_COMP_NEAREST_NEARESTL3A2, + THR_COMP_NEAREST_NEARESTGA2, + THR_COMP_NEAREST_NEARESTLL2, + THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, + THR_COMP_NEAREST_NEARESTBA, + + THR_COMP_NEAR_NEARLB, + THR_COMP_NEW_NEWLB, + THR_COMP_NEW_NEARESTLB, + THR_COMP_NEAREST_NEWLB, + THR_COMP_NEW_NEARLB, + THR_COMP_NEAR_NEWLB, + THR_COMP_GLOBAL_GLOBALLB, + + THR_COMP_NEAR_NEARLA, + THR_COMP_NEW_NEWLA, + THR_COMP_NEW_NEARESTLA, + THR_COMP_NEAREST_NEWLA, + THR_COMP_NEW_NEARLA, + THR_COMP_NEAR_NEWLA, + THR_COMP_GLOBAL_GLOBALLA, + + THR_COMP_NEAR_NEARL2A, + THR_COMP_NEW_NEWL2A, + THR_COMP_NEW_NEARESTL2A, + THR_COMP_NEAREST_NEWL2A, + THR_COMP_NEW_NEARL2A, + THR_COMP_NEAR_NEWL2A, + THR_COMP_GLOBAL_GLOBALL2A, + + THR_COMP_NEAR_NEARL3A, + THR_COMP_NEW_NEWL3A, + THR_COMP_NEW_NEARESTL3A, + THR_COMP_NEAREST_NEWL3A, + THR_COMP_NEW_NEARL3A, + THR_COMP_NEAR_NEWL3A, + THR_COMP_GLOBAL_GLOBALL3A, + + THR_COMP_NEAR_NEARGA, + THR_COMP_NEW_NEWGA, + THR_COMP_NEW_NEARESTGA, + THR_COMP_NEAREST_NEWGA, + THR_COMP_NEW_NEARGA, + THR_COMP_NEAR_NEWGA, + THR_COMP_GLOBAL_GLOBALGA, + + THR_COMP_NEAR_NEARL2B, + THR_COMP_NEW_NEWL2B, + THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEW_NEARL2B, + THR_COMP_NEAR_NEWL2B, + THR_COMP_GLOBAL_GLOBALL2B, + + THR_COMP_NEAR_NEARL3B, + THR_COMP_NEW_NEWL3B, + THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEW_NEARL3B, + THR_COMP_NEAR_NEWL3B, + THR_COMP_GLOBAL_GLOBALL3B, + + THR_COMP_NEAR_NEARGB, + THR_COMP_NEW_NEWGB, + THR_COMP_NEW_NEARESTGB, + THR_COMP_NEAREST_NEWGB, + THR_COMP_NEW_NEARGB, + THR_COMP_NEAR_NEWGB, + THR_COMP_GLOBAL_GLOBALGB, + + THR_COMP_NEAR_NEARLA2, + THR_COMP_NEW_NEWLA2, + THR_COMP_NEW_NEARESTLA2, + THR_COMP_NEAREST_NEWLA2, + THR_COMP_NEW_NEARLA2, + THR_COMP_NEAR_NEWLA2, + THR_COMP_GLOBAL_GLOBALLA2, + + THR_COMP_NEAR_NEARL2A2, + THR_COMP_NEW_NEWL2A2, + THR_COMP_NEW_NEARESTL2A2, + THR_COMP_NEAREST_NEWL2A2, + THR_COMP_NEW_NEARL2A2, + THR_COMP_NEAR_NEWL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, + + THR_COMP_NEAR_NEARL3A2, + THR_COMP_NEW_NEWL3A2, + THR_COMP_NEW_NEARESTL3A2, + THR_COMP_NEAREST_NEWL3A2, + THR_COMP_NEW_NEARL3A2, + 
THR_COMP_NEAR_NEWL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, + + THR_COMP_NEAR_NEARGA2, + THR_COMP_NEW_NEWGA2, + THR_COMP_NEW_NEARESTGA2, + THR_COMP_NEAREST_NEWGA2, + THR_COMP_NEW_NEARGA2, + THR_COMP_NEAR_NEWGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_COMP_NEAR_NEARLL2, + THR_COMP_NEW_NEWLL2, + THR_COMP_NEW_NEARESTLL2, + THR_COMP_NEAREST_NEWLL2, + THR_COMP_NEW_NEARLL2, + THR_COMP_NEAR_NEWLL2, + THR_COMP_GLOBAL_GLOBALLL2, + + THR_COMP_NEAR_NEARLL3, + THR_COMP_NEW_NEWLL3, + THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEW_NEARLL3, + THR_COMP_NEAR_NEWLL3, + THR_COMP_GLOBAL_GLOBALLL3, + + THR_COMP_NEAR_NEARLG, + THR_COMP_NEW_NEWLG, + THR_COMP_NEW_NEARESTLG, + THR_COMP_NEAREST_NEWLG, + THR_COMP_NEW_NEARLG, + THR_COMP_NEAR_NEWLG, + THR_COMP_GLOBAL_GLOBALLG, + + THR_COMP_NEAR_NEARBA, + THR_COMP_NEW_NEWBA, + THR_COMP_NEW_NEARESTBA, + THR_COMP_NEAREST_NEWBA, + THR_COMP_NEW_NEARBA, + THR_COMP_NEAR_NEWBA, + THR_COMP_GLOBAL_GLOBALBA, + + THR_DC, + THR_PAETH, + THR_SMOOTH, + THR_SMOOTH_V, + THR_SMOOTH_H, + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D203_PRED, + THR_D157_PRED, + THR_D67_PRED, + THR_D113_PRED, + THR_D45_PRED, +}; + +/*!\cond */ +typedef struct SingleInterModeState { + int64_t rd; + MV_REFERENCE_FRAME ref_frame; + int valid; +} SingleInterModeState; + +typedef struct InterModeSearchState { + int64_t best_rd; + int64_t best_skip_rd[2]; + MB_MODE_INFO best_mbmode; + int best_rate_y; + int best_rate_uv; + int best_mode_skippable; + int best_skip2; + THR_MODES best_mode_index; + int num_available_refs; + int64_t dist_refs[REF_FRAMES]; + int dist_order_refs[REF_FRAMES]; + int64_t mode_threshold[MAX_MODES]; + int64_t best_intra_rd; + unsigned int best_pred_sse; + + /*! + * \brief Keep track of best intra rd for use in compound mode. + */ + int64_t best_pred_rd[REFERENCE_MODES]; + // Save a set of single_newmv for each checked ref_mv. 
+ int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES]; + int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES]; + int single_newmv_valid[MAX_REF_MV_SEARCH][REF_FRAMES]; + int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; + // The rd of simple translation in single inter modes + int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; + int64_t best_single_rd[REF_FRAMES]; + PREDICTION_MODE best_single_mode[REF_FRAMES]; + + // Single search results by [directions][modes][reference frames] + SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + int single_state_cnt[2][SINGLE_INTER_MODE_NUM]; + SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM] + [FWD_REFS]; + int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM]; + MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + IntraModeSearchState intra_search_state; + RD_STATS best_y_rdcost; +} InterModeSearchState; +/*!\endcond */ + +void av1_inter_mode_data_init(TileDataEnc *tile_data) { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + InterModeRdModel *md = &tile_data->inter_mode_rd_models[i]; + md->ready = 0; + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; + } +} + +static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t *est_dist) { + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_dist = (int64_t)round(md->dist_mean); + const double est_ld = md->a * sse + md->b; + // Clamp estimated rate cost by INT_MAX / 2. + // TODO(angiebird@google.com): find better solution than clamping. 
+ if (fabs(est_ld) < 1e-2) { + *est_residue_cost = INT_MAX / 2; + } else { + double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); + if (est_residue_cost_dbl < 0) { + *est_residue_cost = 0; + } else { + *est_residue_cost = + (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); + } + } + if (*est_residue_cost <= 0) { + *est_residue_cost = 0; + *est_dist = sse; + } + } + return 1; + } + return 0; +} + +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + const int block_idx = inter_mode_data_block_idx(bsize); + InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (block_idx == -1) continue; + if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) { + continue; + } else { + if (md->ready == 0) { + md->dist_mean = md->dist_sum / md->num; + md->ld_mean = md->ld_sum / md->num; + md->sse_mean = md->sse_sum / md->num; + md->sse_sse_mean = md->sse_sse_sum / md->num; + md->sse_ld_mean = md->sse_ld_sum / md->num; + } else { + const double factor = 3; + md->dist_mean = + (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1); + md->ld_mean = + (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1); + md->sse_mean = + (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1); + md->sse_sse_mean = + (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) / + (factor + 1); + md->sse_ld_mean = + (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) / + (factor + 1); + } + + const double my = md->ld_mean; + const double mx = md->sse_mean; + const double dx = sqrt(md->sse_sse_mean); + const double dxy = md->sse_ld_mean; + + md->a = (dxy - mx * my) / (dx * dx - mx * mx); + md->b = my - md->a * mx; + md->ready = 1; + + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; + } + (void)rdmult; + } +} + +static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data, + BLOCK_SIZE bsize, int64_t sse, + int64_t dist, int residue_cost) { + if (residue_cost == 0 || sse == dist) return; + const int block_idx = inter_mode_data_block_idx(bsize); + if (block_idx == -1) return; + InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize]; + if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) { + const double ld = (sse - dist) * 1. / residue_cost; + ++rd_model->num; + rd_model->dist_sum += dist; + rd_model->ld_sum += ld; + rd_model->sse_sum += sse; + rd_model->sse_sse_sum += (double)sse * (double)sse; + rd_model->sse_ld_sum += sse * ld; + } +} + +static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info, + int mode_rate, int64_t sse, + int64_t rd, RD_STATS *rd_cost, + RD_STATS *rd_cost_y, + RD_STATS *rd_cost_uv, + const MB_MODE_INFO *mbmi) { + const int num = inter_modes_info->num; + assert(num < MAX_INTER_MODES); + inter_modes_info->mbmi_arr[num] = *mbmi; + inter_modes_info->mode_rate_arr[num] = mode_rate; + inter_modes_info->sse_arr[num] = sse; + inter_modes_info->est_rd_arr[num] = rd; + inter_modes_info->rd_cost_arr[num] = *rd_cost; + inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y; + inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv; + ++inter_modes_info->num; +} + +static int compare_rd_idx_pair(const void *a, const void *b) { + if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) { + // To avoid inconsistency in qsort() ordering when two elements are equal, + // using idx as tie breaker. 
Refer aomedia:2928 + if (((RdIdxPair *)a)->idx == ((RdIdxPair *)b)->idx) + return 0; + else if (((RdIdxPair *)a)->idx > ((RdIdxPair *)b)->idx) + return 1; + else + return -1; + } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) { + return 1; + } else { + return -1; + } +} + +static AOM_INLINE void inter_modes_info_sort( + const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) { + if (inter_modes_info->num == 0) { + return; + } + for (int i = 0; i < inter_modes_info->num; ++i) { + rd_idx_pair_arr[i].idx = i; + rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i]; + } + qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), + compare_rd_idx_pair); +} + +// Similar to get_horver_correlation, but also takes into account first +// row/column, when computing horizontal/vertical correlation. +void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - left neighbor pixel + // z - top neighbor pixel + int64_t x_sum = 0, x2_sum = 0, xy_sum = 0, xz_sum = 0; + int64_t x_firstrow = 0, x_finalrow = 0, x_firstcol = 0, x_finalcol = 0; + int64_t x2_firstrow = 0, x2_finalrow = 0, x2_firstcol = 0, x2_finalcol = 0; + + // First, process horizontal correlation on just the first row + x_sum += diff[0]; + x2_sum += diff[0] * diff[0]; + x_firstrow += diff[0]; + x2_firstrow += diff[0] * diff[0]; + for (int j = 1; j < width; ++j) { + const int16_t x = diff[j]; + const int16_t y = diff[j - 1]; + x_sum += x; + x_firstrow += x; + x2_sum += x * x; + x2_firstrow += x * x; + xy_sum += x * y; + } + + // Process vertical correlation in the first column + x_firstcol += diff[0]; + x2_firstcol += diff[0] * diff[0]; + for (int i = 1; i < height; ++i) { + const int16_t x = diff[i * stride]; + const int16_t z = diff[(i - 1) * stride]; + x_sum += x; + x_firstcol += x; + x2_sum += x * x; + x2_firstcol += x * x; + xz_sum += x * z; + } + + // Now process horiz and vert correlation through the rest unit + for (int i = 1; i < height; ++i) { + for (int j = 1; j < width; ++j) { + const int16_t x = diff[i * stride + j]; + const int16_t y = diff[i * stride + j - 1]; + const int16_t z = diff[(i - 1) * stride + j]; + x_sum += x; + x2_sum += x * x; + xy_sum += x * y; + xz_sum += x * z; + } + } + + for (int j = 0; j < width; ++j) { + x_finalrow += diff[(height - 1) * stride + j]; + x2_finalrow += + diff[(height - 1) * stride + j] * diff[(height - 1) * stride + j]; + } + for (int i = 0; i < height; ++i) { + x_finalcol += diff[i * stride + width - 1]; + x2_finalcol += diff[i * stride + width - 1] * diff[i * stride + width - 1]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + 
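+  // xy_var_n / xz_var_n are unnormalized covariances between each pixel and
+  // its left / top neighbour; dividing by the matching standard deviations
+  // below yields Pearson correlation coefficients, floored at 0 and
+  // defaulting to 1.0 when a variance is degenerate.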
+ if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} + +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x, + int64_t *sse_y) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + unsigned int sse; + + cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); + total_sse += sse; + if (!plane && sse_y) *sse_y = sse; + } + total_sse <<= 4; + return total_sse; +} + +int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; + + for (i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += coeff[i] * coeff[i]; + } + + *ssz = sqcoeff; + return error; +} + +int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t block_size) { + int64_t error = 0; + + for (int i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + } + + return error; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_block_error_c(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bd) { + int i; + int64_t error = 0, sqcoeff = 0; + int shift = 2 * (bd - 8); + int rounding = shift > 0 ? 
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i++) { + const int64_t diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} +#endif + +static int conditional_skipintra(PREDICTION_MODE mode, + PREDICTION_MODE best_intra_mode) { + if (mode == D113_PRED && best_intra_mode != V_PRED && + best_intra_mode != D135_PRED) + return 1; + if (mode == D67_PRED && best_intra_mode != V_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D203_PRED && best_intra_mode != H_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D157_PRED && best_intra_mode != H_PRED && + best_intra_mode != D135_PRED) + return 1; + return 0; +} + +static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode, + int16_t mode_context) { + if (is_inter_compound_mode(mode)) { + return mode_costs + ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; + } + + int mode_cost = 0; + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + + assert(is_inter_mode(mode)); + + if (mode == NEWMV) { + mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + + if (mode == GLOBALMV) { + mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + return mode_cost; + } + } +} + +static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, + int ref_idx) { + return ref_idx ? 
compound_ref1_mode(this_mode) + : compound_ref0_mode(this_mode); +} + +static AOM_INLINE void estimate_ref_frame_costs( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs, + int segment_id, unsigned int *ref_costs_single, + unsigned int (*ref_costs_comp)[REF_FRAMES]) { + int seg_ref_active = + segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (seg_ref_active) { + memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); + int ref_frame; + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + memset(ref_costs_comp[ref_frame], 0, + REF_FRAMES * sizeof((*ref_costs_comp)[0])); + } else { + int intra_inter_ctx = av1_get_intra_inter_context(xd); + ref_costs_single[INTRA_FRAME] = + mode_costs->intra_inter_cost[intra_inter_ctx][0]; + unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1]; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + ref_costs_single[i] = base_cost; + + const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd); + const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd); + const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd); + const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd); + const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd); + const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd); + + // Determine cost of a single ref frame, where frame types are represented + // by a tree: + // Level 0: add cost whether this ref is a forward or backward ref + ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF2_FRAME] += + mode_costs->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1]; + + // Level 1: if this ref is forward ref, + // add cost whether it is last/last2 or last3/golden + ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1]; + ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1]; + + // Level 1: if this ref is backward ref + // then add cost whether this ref is altref or backward ref + ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF2_FRAME] += + mode_costs->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][1]; + + // Level 2: further add cost whether this ref is last or last2 + ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][0]; + ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][1]; + + // Level 2: last3 or golden + ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][0]; + ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][1]; + + // Level 2: bwdref or altref2 + ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p6][5][0]; + ref_costs_single[ALTREF2_FRAME] += + mode_costs->single_ref_cost[ctx_p6][5][1]; + + if (cm->current_frame.reference_mode != SINGLE_REFERENCE) { + // Similar to single ref, determine cost of compound ref frames. 
+ // cost_compound_refs = cost_first_ref + cost_second_ref + const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd); + const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd); + const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd); + const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd); + const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd); + + const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); + unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 }; + + ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = + ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1]; + ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0; + ref_bicomp_costs[ALTREF_FRAME] = 0; + + // cost of first ref frame + ref_bicomp_costs[LAST_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST2_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST3_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1]; + ref_bicomp_costs[GOLDEN_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1]; + + ref_bicomp_costs[LAST_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0]; + ref_bicomp_costs[LAST2_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1]; + + ref_bicomp_costs[LAST3_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0]; + ref_bicomp_costs[GOLDEN_FRAME] += + mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1]; + + // cost of second ref frame + ref_bicomp_costs[BWDREF_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1]; + + ref_bicomp_costs[BWDREF_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1]; + + // cost: if one ref frame is forward ref, the other ref is backward ref + int ref0, ref1; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) { + ref_costs_comp[ref0][ref1] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1]; + } + } + + // cost: if both ref frames are the same side. 
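The bidirectional pair table above is assembled by summing one independent cost per reference: ref_bicomp_costs[] accumulates the bits for choosing each forward or backward reference, and the double loop then adds one forward entry to one backward entry. The sketch below shows the same construction in isolation, with made-up bit costs standing in for the CDF-derived comp_ref_cost / comp_bwdref_cost sums; none of these numbers come from an actual encode.

#include <stdio.h>

/* Hypothetical per-reference bit costs, stand-ins for the summed
 * comp_ref_cost / comp_bwdref_cost terms accumulated above. */
enum { kLast, kLast2, kLast3, kGolden, kBwdref, kAltref2, kAltref, kNumRefs };

int main(void) {
  const int single_cost[kNumRefs] = { 100, 140, 150, 160, 90, 120, 80 };
  int pair_cost[kNumRefs][kNumRefs] = { { 0 } };

  /* A bidirectional pair is one forward plus one backward reference, and its
   * cost is simply the sum of the two independent choices. */
  for (int fwd = kLast; fwd <= kGolden; ++fwd) {
    for (int bwd = kBwdref; bwd <= kAltref; ++bwd) {
      pair_cost[fwd][bwd] = single_cost[fwd] + single_cost[bwd];
    }
  }
  printf("cost(LAST, ALTREF) = %d\n", pair_cost[kLast][kAltref]);
  return 0;
}

The same-side (unidirectional) pairs are not separable this way, which is why the code below costs each allowed combination individually through the uni_comp_ref tree.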
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd); + const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd); + const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd); + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0]; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0]; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1]; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = + base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] + + mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1]; + } else { + int ref0, ref1; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) + ref_costs_comp[ref0][ref1] = 512; + } + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; + } + } +} + +static AOM_INLINE void store_coding_context( +#if CONFIG_INTERNAL_STATS + MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, +#else + MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, +#endif // CONFIG_INTERNAL_STATS + int skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + // Take a snapshot of the coding context so it can be + // restored if we decide to encode this way + ctx->rd_stats.skip_txfm = x->txfm_search_info.skip_txfm; + ctx->skippable = skippable; +#if CONFIG_INTERNAL_STATS + ctx->best_mode_index = mode_index; +#endif // CONFIG_INTERNAL_STATS + ctx->mic = *xd->mi[0]; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); +} + +static AOM_INLINE void setup_buffer_ref_mvs_inter( + const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref_frame); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, ref_frame); + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame); + assert(yv12 != NULL); + + if (scaled_ref_frame) { + // Setup pred block based on scaled reference, because av1_mv_pred() doesn't + // support scaling. 
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, NULL, NULL, + num_planes); + } else { + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + } + + // Gets an initial list of candidate vectors from neighbours and orders them + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + // Further refinement that is encode side only to test the top few candidates + // in full and choose the best as the center point for subsequent searches. + // The current implementation doesn't support scaling. + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride, + ref_frame, block_size); + + // Go back to unscaled reference. + if (scaled_ref_frame) { + // We had temporarily setup pred block based on scaled reference above. Go + // back to unscaled reference now, for subsequent use. + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + } +} + +#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) + +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + + RIGHT_BOTTOM_MARGIN }; + clamp_mv(mv, &mv_limits); +} + +/* If the current mode shares the same mv with other modes with higher cost, + * skip this mode. 
*/ +static int skip_repeated_mv(const AV1_COMMON *const cm, + const MACROBLOCK *const x, + PREDICTION_MODE this_mode, + const MV_REFERENCE_FRAME ref_frames[2], + InterModeSearchState *search_state) { + const int is_comp_pred = ref_frames[1] > INTRA_FRAME; + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + PREDICTION_MODE compare_mode = MB_MODE_COUNT; + if (!is_comp_pred) { + if (this_mode == NEARMV) { + if (ref_mv_count == 0) { + // NEARMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // NEARMV has the same motion vector as GLOBALMV + compare_mode = GLOBALMV; + } + } + if (this_mode == GLOBALMV) { + if (ref_mv_count == 0 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // GLOBALMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1) { + // GLOBALMV has the same motion vector as NEARMV + compare_mode = NEARMV; + } + } + + if (compare_mode != MB_MODE_COUNT) { + // Use modelled_rd to check whether compare mode was searched + if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] != + INT64_MAX) { + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames); + const int compare_cost = + cost_mv_ref(&x->mode_costs, compare_mode, mode_ctx); + const int this_cost = cost_mv_ref(&x->mode_costs, this_mode, mode_ctx); + + // Only skip if the mode cost is larger than compare mode cost + if (this_cost > compare_cost) { + search_state->modelled_rd[this_mode][0][ref_frames[0]] = + search_state->modelled_rd[compare_mode][0][ref_frames[0]]; + return 1; + } + } + } + } + return 0; +} + +static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, + const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *const xd = &x->e_mbd; + *out_mv = in_mv; + lower_mv_precision(&out_mv->as_mv, cm->features.allow_high_precision_mv, + cm->features.cur_frame_force_integer_mv); + clamp_mv2(&out_mv->as_mv, xd); + return av1_is_fullmv_in_range(&x->mv_limits, + get_fullmv_from_mv(&out_mv->as_mv)); +} + +// To use single newmv directly for compound modes, need to clamp the mv to the +// valid mv range. Without this, encoder would generate out of range mv, and +// this is seen in 8k encoding. +static INLINE void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv, + int ref_idx) { + const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); + SubpelMvLimits mv_limits; + + av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv); + clamp_mv(&mv->as_mv, &mv_limits); +} + +static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, int_mv *cur_mv, + int *const rate_mv, HandleInterModeArgs *const args, + inter_mode_info *mode_info) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + const int refs[2] = { mbmi->ref_frame[0], + mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1] }; + const int ref_mv_idx = mbmi->ref_mv_idx; + + if (is_comp_pred) { + const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]]; + const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]]; + if (this_mode == NEW_NEWMV) { + if (valid_mv0) { + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; + clamp_mv_in_range(x, &cur_mv[0], 0); + } + if (valid_mv1) { + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; + clamp_mv_in_range(x, &cur_mv[1], 1); + } + *rate_mv = 0; + for (int i = 0; i < 2; ++i) { + const int_mv ref_mv = av1_get_ref_mv(x, i); + *rate_mv += av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, + x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + if (valid_mv1) { + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; + clamp_mv_in_range(x, &cur_mv[1], 1); + } + const int_mv ref_mv = av1_get_ref_mv(x, 1); + *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv, + x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } else { + assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); + if (valid_mv0) { + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; + clamp_mv_in_range(x, &cur_mv[0], 0); + } + const int_mv ref_mv = av1_get_ref_mv(x, 0); + *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv, + x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + } + } else { + // Single ref case. + const int ref_idx = 0; + int search_range = INT_MAX; + + if (cpi->sf.mv_sf.reduce_search_range && mbmi->ref_mv_idx > 0) { + const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv; + int min_mv_diff = INT_MAX; + int best_match = -1; + MV prev_ref_mv[2] = { { 0 } }; + for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) { + prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, + idx, &x->mbmi_ext) + .as_mv; + const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row), + abs(ref_mv.col - prev_ref_mv[idx].col)); + + if (min_mv_diff > ref_mv_diff) { + min_mv_diff = ref_mv_diff; + best_match = idx; + } + } + + if (min_mv_diff < (16 << 3)) { + if (args->single_newmv_valid[best_match][refs[0]]) { + search_range = min_mv_diff; + search_range += + AOMMAX(abs(args->single_newmv[best_match][refs[0]].as_mv.row - + prev_ref_mv[best_match].row), + abs(args->single_newmv[best_match][refs[0]].as_mv.col - + prev_ref_mv[best_match].col)); + // Get full pixel search range. + search_range = (search_range + 4) >> 3; + } + } + } + + int_mv best_mv; + av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range, + mode_info, &best_mv, args); + if (best_mv.as_int == INVALID_MV) return INT64_MAX; + + args->single_newmv[ref_mv_idx][refs[0]] = best_mv; + args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv; + args->single_newmv_valid[ref_mv_idx][refs[0]] = 1; + cur_mv[0].as_int = best_mv.as_int; + + // Return after single_newmv is set. 
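The search-range reduction above works entirely in 1/8-pel units: 16 << 3 is a 16-full-pixel threshold on the distance between the current reference MV and the closest earlier one, and (search_range + 4) >> 3 rounds the accumulated 1/8-pel distance to full pixels for the restricted motion search. A small standalone sketch of that arithmetic, with made-up motion vectors rather than values from a real encode:

#include <stdio.h>
#include <stdlib.h>

/* Chebyshev (max of row/col) distance between two MVs, in 1/8-pel units. */
static int mv_max_abs_diff(int row_a, int col_a, int row_b, int col_b) {
  const int dr = abs(row_a - row_b);
  const int dc = abs(col_a - col_b);
  return dr > dc ? dr : dc;
}

int main(void) {
  /* Hypothetical MVs, all in 1/8-pel units. */
  const int ref_row = 24, ref_col = -40;      /* current reference MV         */
  const int prev_row = 16, prev_col = -36;    /* closest earlier reference MV */
  const int newmv_row = 20, newmv_col = -30;  /* best MV found for that entry */

  int range = mv_max_abs_diff(ref_row, ref_col, prev_row, prev_col);
  if (range < (16 << 3)) {      /* within 16 full pixels of a previous ref MV */
    range += mv_max_abs_diff(newmv_row, newmv_col, prev_row, prev_col);
    range = (range + 4) >> 3;   /* 1/8-pel -> rounded full-pel search range   */
    printf("restricted full-pel search range: %d\n", range);  /* prints 2 */
  }
  return 0;
}

The early return just below then abandons this ref_mv_idx altogether when its mode_info entry has been flagged to skip.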
+ if (mode_info[mbmi->ref_mv_idx].skip) return INT64_MAX; + } + + return 0; +} + +static INLINE void update_mode_start_end_index( + const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi, + int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed, + int interintra_allowed, int eval_motion_mode) { + *mode_index_start = (int)SIMPLE_TRANSLATION; + *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed; + if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) { + if (!eval_motion_mode) { + *mode_index_end = (int)SIMPLE_TRANSLATION; + } else { + // Set the start index appropriately to process motion modes other than + // simple translation + *mode_index_start = 1; + } + } + if (cpi->sf.inter_sf.extra_prune_warped && mbmi->bsize > BLOCK_16X16) + *mode_index_end = SIMPLE_TRANSLATION; +} + +/*!\brief AV1 motion mode search + * + * \ingroup inter_mode_search + * Function to search over and determine the motion mode. It will update + * mbmi->motion_mode to one of SIMPLE_TRANSLATION, OBMC_CAUSAL, or + * WARPED_CAUSAL and determine any necessary side information for the selected + * motion mode. It will also perform the full transform search, unless the + * input parameter do_tx_search indicates to do an estimation of the RD rather + * than an RD corresponding to a full transform search. It will return the + * RD for the final motion_mode. + * Do the RD search for a given inter mode and compute all information relevant + * to the input mode. It will compute the best MV, + * compound parameters (if the mode is a compound mode) and interpolation filter + * parameters. + * + * \param[in] cpi Top-level encoder structure. + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during + * encoding. + * \param[in] x Pointer to struct holding all the data for + * the current macroblock. + * \param[in] bsize Current block size. + * \param[in,out] rd_stats Struct to keep track of the overall RD + * information. + * \param[in,out] rd_stats_y Struct to keep track of the RD information + * for only the Y plane. + * \param[in,out] rd_stats_uv Struct to keep track of the RD information + * for only the UV planes. + * \param[in] args HandleInterModeArgs struct holding + * miscellaneous arguments for inter mode + * search. See the documentation for this + * struct for a description of each member. + * \param[in] ref_best_rd Best RD found so far for this block. + * It is used for early termination of this + * search if the RD exceeds this value. + * \param[in,out] ref_skip_rd A length 2 array, where skip_rd[0] is the + * best total RD for a skip mode so far, and + * skip_rd[1] is the best RD for a skip mode so + * far in luma. This is used as a speed feature + * to skip the transform search if the computed + * skip RD for the current mode is not better + * than the best skip_rd so far. + * \param[in,out] rate_mv The rate associated with the motion vectors. + * This will be modified if a motion search is + * done in the motion mode search. + * \param[in,out] orig_dst A prediction buffer to hold a computed + * prediction. This will eventually hold the + * final prediction, and the tmp_dst info will + * be copied here. + * \param[in,out] best_est_rd Estimated RD for motion mode search if + * do_tx_search (see below) is 0. + * \param[in] do_tx_search Parameter to indicate whether or not to do + * a full transform search. 
This will compute + * an estimated RD for the modes without the + * transform search and later perform the full + * transform search on the best candidates. + * \param[in] inter_modes_info InterModesInfo struct to hold inter mode + * information to perform a full transform + * search only on winning candidates searched + * with an estimate for transform coding RD. + * \param[in] eval_motion_mode Boolean whether or not to evaluate motion + * motion modes other than SIMPLE_TRANSLATION. + * \param[out] yrd Stores the rdcost corresponding to encoding + * the luma plane. + * \return Returns INT64_MAX if the determined motion mode is invalid and the + * current motion mode being tested should be skipped. It returns 0 if the + * motion mode search is a success. + */ +static int64_t motion_mode_rd( + const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, HandleInterModeArgs *const args, int64_t ref_best_rd, + int64_t *ref_skip_rd, int *rate_mv, const BUFFER_SET *orig_dst, + int64_t *best_est_rd, int do_tx_search, InterModesInfo *inter_modes_info, + int eval_motion_mode, int64_t *yrd) { + const AV1_COMMON *const cm = &cpi->common; + const FeatureFlags *const features = &cm->features; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + const int rate2_nocoeff = rd_stats->rate; + int best_xskip_txfm = 0; + RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int rate_mv0 = *rate_mv; + const int interintra_allowed = cm->seq_params->enable_interintra_compound && + is_interintra_allowed(mbmi) && + mbmi->compound_idx; + WARP_SAMPLE_INFO *const warp_sample_info = + &x->warp_sample_info[mbmi->ref_frame[0]]; + int *pts0 = warp_sample_info->pts; + int *pts_inref0 = warp_sample_info->pts_inref; + + assert(mbmi->ref_frame[1] != INTRA_FRAME); + const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; + av1_invalid_rd_stats(&best_rd_stats); + mbmi->num_proj_ref = 1; // assume num_proj_ref >=1 + MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION; + *yrd = INT64_MAX; + if (features->switchable_motion_mode) { + // Determine which motion modes to search if more than SIMPLE_TRANSLATION + // is allowed. + last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mbmi, features->allow_warped_motion); + } + + if (last_motion_mode_allowed == WARPED_CAUSAL) { + // Collect projection samples used in least squares approximation of + // the warped motion parameters if WARPED_CAUSAL is going to be searched. + if (warp_sample_info->num < 0) { + warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0); + } + mbmi->num_proj_ref = warp_sample_info->num; + } + const int total_samples = mbmi->num_proj_ref; + if (total_samples == 0) { + // Do not search WARPED_CAUSAL if there are no samples to use to determine + // warped parameters. + last_motion_mode_allowed = OBMC_CAUSAL; + } + + const MB_MODE_INFO base_mbmi = *mbmi; + MB_MODE_INFO best_mbmi; + const int interp_filter = features->interp_filter; + const int switchable_rate = + av1_is_interp_needed(xd) + ? 
av1_get_switchable_rate(x, xd, interp_filter, + cm->seq_params->enable_dual_filter) + : 0; + int64_t best_rd = INT64_MAX; + int best_rate_mv = rate_mv0; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int mode_index_start, mode_index_end; + const int txfm_rd_gate_level = + get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound, + cpi->sf.inter_sf.txfm_rd_gate_level, bsize, + TX_SEARCH_MOTION_MODE, eval_motion_mode); + + // Modify the start and end index according to speed features. For example, + // if SIMPLE_TRANSLATION has already been searched according to + // the motion_mode_for_winner_cand speed feature, update the mode_index_start + // to avoid searching it again. + update_mode_start_end_index(cpi, mbmi, &mode_index_start, &mode_index_end, + last_motion_mode_allowed, interintra_allowed, + eval_motion_mode); + // Main function loop. This loops over all of the possible motion modes and + // computes RD to determine the best one. This process includes computing + // any necessary side information for the motion mode and performing the + // transform search. + for (int mode_index = mode_index_start; mode_index <= mode_index_end; + mode_index++) { + if (args->skip_motion_mode && mode_index) continue; + int tmp_rate2 = rate2_nocoeff; + const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; + int tmp_rate_mv = rate_mv0; + + *mbmi = base_mbmi; + if (is_interintra_mode) { + // Only use SIMPLE_TRANSLATION for interintra + mbmi->motion_mode = SIMPLE_TRANSLATION; + } else { + mbmi->motion_mode = (MOTION_MODE)mode_index; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + } + + // Do not search OBMC if the probability of selecting it is below a + // predetermined threshold for this update_type and block size. + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int use_actual_frame_probs = 1; + int prune_obmc; +#if CONFIG_FPMT_TEST + use_actual_frame_probs = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1; + if (!use_actual_frame_probs) { + prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + } +#endif + if (use_actual_frame_probs) { + prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + } + if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) && + mbmi->motion_mode == OBMC_CAUSAL) + continue; + + if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) { + // SIMPLE_TRANSLATION mode: no need to recalculate. 
+ // The prediction is calculated before motion_mode_rd() is called in + // handle_inter_mode() + } else if (mbmi->motion_mode == OBMC_CAUSAL) { + const uint32_t cur_mv = mbmi->mv[0].as_int; + // OBMC_CAUSAL not allowed for compound prediction + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { + av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL, + &mbmi->mv[0], NULL); + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } + if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) { + // Build the predictor according to the current motion vector if it has + // not already been built + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + 0, av1_num_planes(cm) - 1); + } + // Build the inter predictor by blending the predictor corresponding to + // this MV, and the neighboring blocks using the OBMC model + av1_build_obmc_inter_prediction( + cm, xd, args->above_pred_buf, args->above_pred_stride, + args->left_pred_buf, args->left_pred_stride); +#if !CONFIG_REALTIME_ONLY + } else if (mbmi->motion_mode == WARPED_CAUSAL) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + mbmi->motion_mode = WARPED_CAUSAL; + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + // Select the samples according to motion vector difference + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = av1_selectSamples( + &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize); + } + + // Compute the warped motion parameters with a least squares fit + // using the collected samples + if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params, mi_row, mi_col)) { + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { + // Refine MV for NEWMV mode + const int_mv mv0 = mbmi->mv[0]; + const WarpedMotionParams wm_params0 = mbmi->wm_params; + const int num_proj_ref0 = mbmi->num_proj_ref; + + const int_mv ref_mv = av1_get_ref_mv(x, 0); + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv.as_mv, NULL); + + // Refine MV in a small range. + av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0, + total_samples, cpi->sf.mv_sf.warp_search_method, + cpi->sf.mv_sf.warp_search_iters); + + if (mv0.as_int != mbmi->mv[0].as_int) { + // Keep the refined MV and WM parameters. + tmp_rate_mv = av1_mv_bit_cost( + &mbmi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, + x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } else { + // Restore the old MV and WM parameters. 
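Both the OBMC and the warped-motion branches above refresh the rate the same way after refining the motion vector: the MV bits already counted in rate2_nocoeff are subtracted and the refined MV's bits are added, so no other component of the mode rate has to be recomputed. A trivial numeric sketch (all values made up):

#include <stdio.h>

int main(void) {
  /* Hypothetical rates in the encoder's internal bit-cost units. */
  const int rate2_nocoeff = 5000; /* mode rate including the original MV rate */
  const int rate_mv0 = 700;       /* MV rate already counted in the above     */
  const int tmp_rate_mv = 520;    /* MV rate after the refinement             */

  /* Swap out only the MV component of the total rate. */
  const int tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
  printf("updated mode rate: %d\n", tmp_rate2); /* 4820 */
  return 0;
}

When the refinement does not change the MV, the branch below simply restores the original MV and warp parameters and keeps the original rate.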
+ mbmi->mv[0] = mv0; + mbmi->wm_params = wm_params0; + mbmi->num_proj_ref = num_proj_ref0; + } + } + + // Build the warped predictor + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + } else { + continue; + } +#endif // !CONFIG_REALTIME_ONLY + } else if (is_interintra_mode) { + const int ret = + av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd, + &tmp_rate_mv, &tmp_rate2, orig_dst); + if (ret < 0) continue; + } + + // If we are searching newmv and the mv is the same as refmv, skip the + // current mode + if (!av1_check_newmv_joint_nonzero(cm, x)) continue; + + // Update rd_stats for the current motion mode + txfm_info->skip_txfm = 0; + rd_stats->dist = 0; + rd_stats->sse = 0; + rd_stats->skip_txfm = 1; + rd_stats->rate = tmp_rate2; + const ModeCosts *mode_costs = &x->mode_costs; + if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate; + if (interintra_allowed) { + rd_stats->rate += + mode_costs->interintra_cost[size_group_lookup[bsize]] + [mbmi->ref_frame[1] == INTRA_FRAME]; + } + if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) && + (mbmi->ref_frame[1] != INTRA_FRAME)) { + if (last_motion_mode_allowed == WARPED_CAUSAL) { + rd_stats->rate += + mode_costs->motion_mode_cost[bsize][mbmi->motion_mode]; + } else { + rd_stats->rate += + mode_costs->motion_mode_cost1[bsize][mbmi->motion_mode]; + } + } + + int64_t this_yrd = INT64_MAX; + + if (!do_tx_search) { + // Avoid doing a transform search here to speed up the overall mode + // search. It will be done later in the mode search if the current + // motion mode seems promising. + int64_t curr_sse = -1; + int64_t sse_y = -1; + int est_residue_cost = 0; + int64_t est_dist = 0; + int64_t est_rd = 0; + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + curr_sse = get_sse(cpi, x, &sse_y); + const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, + &est_residue_cost, &est_dist); + (void)has_est_rd; + assert(has_est_rd); + } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 || + cpi->sf.rt_sf.use_nonrd_pick_mode) { + model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD]( + cpi, bsize, x, xd, 0, num_planes - 1, &est_residue_cost, &est_dist, + NULL, &curr_sse, NULL, NULL, NULL); + sse_y = x->pred_sse[xd->mi[0]->ref_frame[0]]; + } + est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist); + if (est_rd * 0.80 > *best_est_rd) { + mbmi->ref_frame[1] = ref_frame_1; + continue; + } + const int mode_rate = rd_stats->rate; + rd_stats->rate += est_residue_cost; + rd_stats->dist = est_dist; + rd_stats->rdcost = est_rd; + if (rd_stats->rdcost < *best_est_rd) { + *best_est_rd = rd_stats->rdcost; + assert(sse_y >= 0); + ref_skip_rd[1] = txfm_rd_gate_level + ? 
RDCOST(x->rdmult, mode_rate, (sse_y << 4)) + : INT64_MAX; + } + if (cm->current_frame.reference_mode == SINGLE_REFERENCE) { + if (!is_comp_pred) { + assert(curr_sse >= 0); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, rd_stats, rd_stats_y, + rd_stats_uv, mbmi); + } + } else { + assert(curr_sse >= 0); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, rd_stats, rd_stats_y, + rd_stats_uv, mbmi); + } + mbmi->skip_txfm = 0; + } else { + // Perform full transform search + int64_t skip_rd = INT64_MAX; + int64_t skip_rdy = INT64_MAX; + if (txfm_rd_gate_level) { + // Check if the mode is good enough based on skip RD + int64_t sse_y = INT64_MAX; + int64_t curr_sse = get_sse(cpi, x, &sse_y); + skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse); + skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4)); + int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd, + txfm_rd_gate_level, 0); + if (!eval_txfm) continue; + } + + // Do transform search + const int mode_rate = rd_stats->rate; + if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + rd_stats->rate, ref_best_rd)) { + if (rd_stats_y->rate == INT_MAX && mode_index == 0) { + return INT64_MAX; + } + continue; + } + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int y_rate = + rd_stats->skip_txfm + ? x->mode_costs.skip_txfm_cost[skip_ctx][1] + : (rd_stats_y->rate + x->mode_costs.skip_txfm_cost[skip_ctx][0]); + this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y->dist); + + const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (curr_rd < ref_best_rd) { + ref_best_rd = curr_rd; + ref_skip_rd[0] = skip_rd; + ref_skip_rd[1] = skip_rdy; + } + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + inter_mode_data_push( + tile_data, mbmi->bsize, rd_stats->sse, rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]); + } + } + + if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { + if (is_nontrans_global_motion(xd, xd->mi[0])) { + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + } + } + + const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (mode_index == 0) { + args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd; + } + if (mode_index == 0 || tmp_rd < best_rd) { + // Update best_rd data if this is the best motion mode so far + best_mbmi = *mbmi; + best_rd = tmp_rd; + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rate_mv = tmp_rate_mv; + *yrd = this_yrd; + if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv; + memcpy(best_blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); + best_xskip_txfm = mbmi->skip_txfm; + } + } + // Update RD and mbmi stats for selected motion mode + mbmi->ref_frame[1] = ref_frame_1; + *rate_mv = best_rate_mv; + if (best_rd == INT64_MAX || !av1_check_newmv_joint_nonzero(cm, x)) { + av1_invalid_rd_stats(rd_stats); + restore_dst_buf(xd, *orig_dst, num_planes); + return INT64_MAX; + } + *mbmi = best_mbmi; + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; + memcpy(txfm_info->blk_skip, best_blk_skip, + sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * 
xd->width); + txfm_info->skip_txfm = best_xskip_txfm; + + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t best_rd) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int64_t total_sse = 0; + int64_t this_rd = INT64_MAX; + const int skip_mode_ctx = av1_get_skip_mode_context(xd); + rd_stats->rate = x->mode_costs.skip_mode_cost[skip_mode_ctx][1]; + + for (int plane = 0; plane < num_planes; ++plane) { + // Call av1_enc_build_inter_predictor() for one plane at a time. + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane, plane); + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + av1_subtract_plane(x, plane_bsize, plane); + + int64_t sse = + av1_pixel_diff_dist(x, plane, 0, 0, plane_bsize, plane_bsize, NULL); + if (is_cur_buf_hbd(xd)) sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + sse <<= 4; + total_sse += sse; + // When current rd cost is more than the best rd, skip evaluation of + // remaining planes. + this_rd = RDCOST(x->rdmult, rd_stats->rate, total_sse); + if (this_rd > best_rd) break; + } + + rd_stats->dist = rd_stats->sse = total_sse; + rd_stats->rdcost = this_rd; + + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +// Check NEARESTMV, NEARMV, GLOBALMV ref mvs for duplicate and skip the relevant +// mode +// Note(rachelbarker): This speed feature currently does not interact correctly +// with global motion. The issue is that, when global motion is used, GLOBALMV +// produces a different prediction to NEARESTMV/NEARMV even if the motion +// vectors are the same. Thus GLOBALMV should not be pruned in this case. 
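In skip_mode_rd() above, each plane's SSE is normalized before it enters the rate-distortion cost: at bit depth bd, pixel differences are up to 2^(bd-8) times their 8-bit equivalents, so squared errors grow by 2^((bd-8)*2), and the rounded right shift removes exactly that factor; the following sse <<= 4 then applies the same fixed scaling to every plane's contribution before the running total is passed to RDCOST. A small worked example of the normalization, using a made-up block and a simplified rounding helper rather than the library macro:

#include <stdint.h>
#include <stdio.h>

/* Simplified rounding right-shift, a stand-in for ROUND_POWER_OF_TWO. */
static int64_t round_shift(int64_t v, int n) { return (v + (1LL << (n - 1))) >> n; }

int main(void) {
  const int bd = 10; /* hypothetical 10-bit coding path */
  /* A 16x16 block where every pixel differs by 3 at 8-bit precision:
   *   8-bit SSE  = 3^2  * 256 = 2304
   *   10-bit SSE = 12^2 * 256 = 36864, i.e. 2^((10-8)*2) = 16x larger. */
  int64_t sse = 36864;
  sse = round_shift(sse, (bd - 8) * 2); /* back to the 8-bit scale: 2304 */
  sse <<= 4;                            /* fixed scaling applied before RDCOST */
  printf("normalized plane distortion: %lld\n", (long long)sse);
  return 0;
}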
+static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext, + int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE single_mode) { + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + assert(single_mode != NEWMV); + if (single_mode == NEARESTMV) { + return 0; + } else if (single_mode == NEARMV) { + // when ref_mv_count = 0, NEARESTMV and NEARMV are same as GLOBALMV + // when ref_mv_count = 1, NEARMV is same as GLOBALMV + if (ref_mv_count < 2) return 1; + } else if (single_mode == GLOBALMV) { + // when ref_mv_count == 0, GLOBALMV is same as NEARESTMV + if (ref_mv_count == 0) return 1; + // when ref_mv_count == 1, NEARMV is same as GLOBALMV + else if (ref_mv_count == 1) + return 0; + + int stack_size = AOMMIN(USABLE_REF_MV_STACK_SIZE, ref_mv_count); + // Check GLOBALMV is matching with any mv in ref_mv_stack + for (int ref_mv_idx = 0; ref_mv_idx < stack_size; ref_mv_idx++) { + int_mv this_mv; + + if (ref_idx == 0) + this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + else + this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + + if (this_mv.as_int == mbmi_ext->global_mvs[ref_frame[ref_idx]].as_int) + return 1; + } + } + return 0; +} + +static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, + int ref_idx, int ref_mv_idx, + int skip_repeated_ref_mv, + const MV_REFERENCE_FRAME *ref_frame, + const MB_MODE_INFO_EXT *mbmi_ext) { + const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx); + assert(is_inter_singleref_mode(single_mode)); + if (single_mode == NEWMV) { + this_mv->as_int = INVALID_MV; + } else if (single_mode == GLOBALMV) { + if (skip_repeated_ref_mv && + check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode)) + return 0; + *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]]; + } else { + assert(single_mode == NEARMV || single_mode == NEARESTMV); + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int ref_mv_offset = single_mode == NEARESTMV ? 0 : ref_mv_idx + 1; + if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) { + assert(ref_mv_offset >= 0); + if (ref_idx == 0) { + *this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv; + } else { + *this_mv = + mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv; + } + } else { + if (skip_repeated_ref_mv && + check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode)) + return 0; + *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]]; + } + } + return 1; +} + +// Skip NEARESTMV and NEARMV modes based on refmv weight computed in ref mv list +// population +static INLINE int skip_nearest_near_mv_using_refmv_weight( + const MACROBLOCK *const x, const PREDICTION_MODE this_mode, + const int8_t ref_frame_type, PREDICTION_MODE best_mode) { + if (this_mode != NEARESTMV && this_mode != NEARMV) return 0; + // Do not skip the mode if the current block has not yet obtained a valid + // inter mode. + if (!is_inter_mode(best_mode)) return 0; + + const MACROBLOCKD *xd = &x->e_mbd; + // Do not skip the mode if both the top and left neighboring blocks are not + // available. 
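get_this_mv() above maps NEARESTMV to entry 0 of the reference MV stack and NEARMV with index ref_mv_idx to entry ref_mv_idx + 1, falling back to the global MV when the stack does not reach that far. A tiny sketch of that indexing convention with a hypothetical three-entry stack:

#include <stdio.h>

int main(void) {
  /* Hypothetical ref MV stack for one reference frame, as (row, col) pairs. */
  const int stack[4][2] = { { 0, 8 }, { -8, 8 }, { 16, 0 }, { 0, 0 } };
  const int ref_mv_count = 3;

  /* NEARESTMV always reads entry 0. */
  printf("NEARESTMV    -> (%d, %d)\n", stack[0][0], stack[0][1]);

  /* NEARMV with ref_mv_idx = 0..2 reads entries 1..3 when they exist. */
  for (int ref_mv_idx = 0; ref_mv_idx < 3; ++ref_mv_idx) {
    const int offset = ref_mv_idx + 1;
    if (offset < ref_mv_count)
      printf("NEARMV idx %d -> (%d, %d)\n", ref_mv_idx, stack[offset][0],
             stack[offset][1]);
    else
      printf("NEARMV idx %d -> global MV fallback\n", ref_mv_idx);
  }
  return 0;
}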
+ if (!xd->left_available || !xd->up_available) return 0; + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const uint16_t *const ref_mv_weight = mbmi_ext->weight[ref_frame_type]; + const int ref_mv_count = + AOMMIN(MAX_REF_MV_SEARCH, mbmi_ext->ref_mv_count[ref_frame_type]); + + if (ref_mv_count == 0) return 0; + // If ref mv list has at least one nearest candidate do not prune NEARESTMV + if (this_mode == NEARESTMV && ref_mv_weight[0] >= REF_CAT_LEVEL) return 0; + + // Count number of ref mvs populated from nearest candidates + int nearest_refmv_count = 0; + for (int ref_mv_idx = 0; ref_mv_idx < ref_mv_count; ref_mv_idx++) { + if (ref_mv_weight[ref_mv_idx] >= REF_CAT_LEVEL) nearest_refmv_count++; + } + + // nearest_refmv_count indicates the closeness of block motion characteristics + // with respect to its spatial neighbor. Smaller value of nearest_refmv_count + // w.r.t to ref_mv_count means less correlation with its spatial neighbors. + // Hence less possibility for NEARESTMV and NEARMV modes becoming the best + // mode since these modes work well for blocks that shares similar motion + // characteristics with its neighbor. Thus, NEARMV mode is pruned when + // nearest_refmv_count is relatively smaller than ref_mv_count and NEARESTMV + // mode is pruned if none of the ref mvs are populated from nearest candidate. + const int prune_thresh = 1 + (ref_mv_count >= 2); + if (nearest_refmv_count < prune_thresh) return 1; + return 0; +} + +// This function update the non-new mv for the current prediction mode +static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode, + const AV1_COMMON *cm, const MACROBLOCK *x, + int skip_repeated_ref_mv) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_comp_pred = has_second_ref(mbmi); + + int ret = 1; + for (int i = 0; i < is_comp_pred + 1; ++i) { + int_mv this_mv; + this_mv.as_int = INVALID_MV; + ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, + skip_repeated_ref_mv, mbmi->ref_frame, &x->mbmi_ext); + if (!ret) return 0; + const PREDICTION_MODE single_mode = get_single_mode(this_mode, i); + if (single_mode == NEWMV) { + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + cur_mv[i] = + (i == 0) ? 
x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .this_mv + : x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .comp_mv; + } else { + ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x); + } + } + return ret; +} + +static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, + const MB_MODE_INFO_EXT *mbmi_ext, + const int (*const drl_mode_cost0)[2], + int8_t ref_frame_type) { + int cost = 0; + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx]; + if (mbmi->ref_mv_idx == idx) return cost; + } + } + return cost; + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)]; + if (mbmi->ref_mv_idx == (idx - 1)) return cost; + } + } + return cost; + } + return cost; +} + +static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args, + const MB_MODE_INFO *const mbmi, + PREDICTION_MODE this_mode) { + for (int ref_idx = 0; ref_idx < 2; ++ref_idx) { + const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx); + const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx]; + if (single_mode == NEWMV && + args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) { + return 0; + } + } + return 1; +} + +static int get_drl_refmv_count(const MACROBLOCK *const x, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE mode) { + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV); + const int has_drl = + (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1); + const int ref_set = + has_drl ? AOMMIN(MAX_REF_MV_SEARCH, ref_mv_count - has_nearmv) : 1; + + return ref_set; +} + +// Checks if particular ref_mv_idx should be pruned. +static int prune_ref_mv_idx_using_qindex(const int reduce_inter_modes, + const int qindex, + const int ref_mv_idx) { + if (reduce_inter_modes >= 3) return 1; + // Q-index logic based pruning is enabled only for + // reduce_inter_modes = 2. + assert(reduce_inter_modes == 2); + // When reduce_inter_modes=2, pruning happens as below based on q index. + // For q index range between 0 and 85: prune if ref_mv_idx >= 1. + // For q index range between 86 and 170: prune if ref_mv_idx == 2. + // For q index range between 171 and 255: no pruning. + const int min_prune_ref_mv_idx = (qindex * 3 / QINDEX_RANGE) + 1; + return (ref_mv_idx >= min_prune_ref_mv_idx); +} + +// Whether this reference motion vector can be skipped, based on initial +// heuristics. 
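The q-index based pruning above derives its cut-off from (qindex * 3 / QINDEX_RANGE) + 1; taking QINDEX_RANGE as 256, this reproduces the three bands listed in the comment. A short standalone check of the boundary values:

#include <stdio.h>

int main(void) {
  const int kQindexRange = 256; /* value of QINDEX_RANGE assumed here */
  const int test_qindex[] = { 0, 85, 86, 170, 171, 255 };
  for (int i = 0; i < 6; ++i) {
    const int q = test_qindex[i];
    const int min_prune_ref_mv_idx = (q * 3 / kQindexRange) + 1;
    /* ref_mv_idx values of min_prune_ref_mv_idx and above are pruned; a value
     * of 3 or more means no pruning, since ref_mv_idx never exceeds 2. */
    printf("qindex %3d -> prune ref_mv_idx >= %d\n", q, min_prune_ref_mv_idx);
  }
  return 0;
}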
+static bool ref_mv_idx_early_breakout( + const SPEED_FEATURES *const sf, + const RefFrameDistanceInfo *const ref_frame_dist_info, MACROBLOCK *x, + const HandleInterModeArgs *const args, int64_t ref_best_rd, + int ref_mv_idx) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + const int is_comp_pred = has_second_ref(mbmi); + if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) { + if (mbmi->ref_frame[0] == LAST2_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[1] == LAST2_FRAME || + mbmi->ref_frame[1] == LAST3_FRAME) { + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; + if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < + REF_CAT_LEVEL) { + return true; + } + } + // TODO(any): Experiment with reduce_inter_modes for compound prediction + if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred && + have_newmv_in_inter_mode(mbmi->mode)) { + if (mbmi->ref_frame[0] != ref_frame_dist_info->nearest_past_ref && + mbmi->ref_frame[0] != ref_frame_dist_info->nearest_future_ref) { + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; + const int do_prune = prune_ref_mv_idx_using_qindex( + sf->inter_sf.reduce_inter_modes, x->qindex, ref_mv_idx); + if (do_prune && + (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < + REF_CAT_LEVEL)) { + return true; + } + } + } + } + + mbmi->ref_mv_idx = ref_mv_idx; + if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) { + return true; + } + size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost; + const int drl_cost = get_drl_cost( + mbmi, mbmi_ext, x->mode_costs.drl_mode_cost0, ref_frame_type); + est_rd_rate += drl_cost; + if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + return true; + } + return false; +} + +// Compute the estimated RD cost for the motion vector with simple translation. 
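The rate-only early exit at the end of ref_mv_idx_early_breakout() above works because distortion is never negative: RDCOST evaluated with a distortion of zero is a lower bound on any cost the candidate could eventually achieve, so exceeding ref_best_rd on rate alone (reference-frame cost plus DRL signalling) already rules it out. A minimal sketch of that bound, with a simplified cost formula standing in for the RDCOST macro and made-up numbers:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for RDCOST: lambda * rate + distortion. */
static int64_t rd_cost(int64_t lambda, int64_t rate, int64_t dist) {
  return lambda * rate + dist;
}

int main(void) {
  const int64_t lambda = 128;         /* hypothetical rdmult       */
  const int64_t ref_best_rd = 900000; /* best RD cost found so far */
  const int64_t est_rate = 8000;      /* ref-frame cost + DRL cost */

  /* Distortion >= 0, so rd_cost(lambda, est_rate, 0) is a lower bound. */
  if (rd_cost(lambda, est_rate, 0) > ref_best_rd)
    printf("prune: even with zero distortion this candidate cannot win\n");
  else
    printf("keep: rate alone does not rule the candidate out\n");
  return 0;
}

The real check additionally exempts NEARESTMV and NEAREST_NEARESTMV, which are always evaluated.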
+static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, + HandleInterModeArgs *args, + int ref_mv_idx, int64_t ref_best_rd, + BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + const AV1_COMMON *cm = &cpi->common; + const int is_comp_pred = has_second_ref(mbmi); + const ModeCosts *mode_costs = &x->mode_costs; + + struct macroblockd_plane *p = xd->plane; + const BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + av1_init_rd_stats(rd_stats); + + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) { + mbmi->ref_frame[1] = NONE_FRAME; + } + int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = ref_mv_idx; + + rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; + const int drl_cost = + get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); + rd_stats->rate += drl_cost; + + int_mv cur_mv[2]; + if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) { + return INT64_MAX; + } + assert(have_nearmv_in_inter_mode(mbmi->mode)); + for (int i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + const int ref_mv_cost = cost_mv_ref(mode_costs, mbmi->mode, mode_ctx); + rd_stats->rate += ref_mv_cost; + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) { + return INT64_MAX; + } + + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->num_proj_ref = 0; + if (is_comp_pred) { + // Only compound_average + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + } + set_default_interp_filters(mbmi, cm->features.interp_filter); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + int est_rate; + int64_t est_dist; + model_rd_sb_fn[MODELRD_CURVFIT](cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist, + NULL, NULL, NULL, NULL, NULL); + return RDCOST(x->rdmult, rd_stats->rate + est_rate, est_dist); +} + +// Represents a set of integers, from 0 to sizeof(int) * 8, as bits in +// an integer. 0 for the i-th bit means that integer is excluded, 1 means +// it is included. +static INLINE void mask_set_bit(int *mask, int index) { *mask |= (1 << index); } + +static INLINE bool mask_check_bit(int mask, int index) { + return (mask >> index) & 0x1; +} + +// Before performing the full MV search in handle_inter_mode, do a simple +// translation search and see if we can eliminate any motion vectors. +// Returns an integer where, if the i-th bit is set, it means that the i-th +// motion vector should be searched. This is only set for NEAR_MV. +static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, + HandleInterModeArgs *const args, + int64_t ref_best_rd, BLOCK_SIZE bsize, + const int ref_set) { + // If the number of ref mv count is equal to 1, do not prune the same. It + // is better to evaluate the same than to prune it. 
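ref_mv_idx_to_search() below, together with the two helpers above, treats a plain int as a small set of candidate indices: bit i set means ref_mv_idx i should still be searched. A self-contained usage sketch of that representation (the helpers are repeated in simplified form; which indices get set is hypothetical):

#include <stdbool.h>
#include <stdio.h>

static void mask_set_bit(int *mask, int index) { *mask |= (1 << index); }
static bool mask_check_bit(int mask, int index) { return (mask >> index) & 0x1; }

int main(void) {
  int good_indices = 0;
  /* Suppose indices 0 and 2 survived the early-breakout checks. */
  mask_set_bit(&good_indices, 0);
  mask_set_bit(&good_indices, 2);

  for (int ref_mv_idx = 0; ref_mv_idx < 3; ++ref_mv_idx) {
    printf("ref_mv_idx %d: %s\n", ref_mv_idx,
           mask_check_bit(good_indices, ref_mv_idx) ? "search" : "skip");
  }
  return 0;
}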
+ if (ref_set == 1) return 1; + AV1_COMMON *const cm = &cpi->common; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + + // Only search indices if they have some chance of being good. + int good_indices = 0; + for (int i = 0; i < ref_set; ++i) { + if (ref_mv_idx_early_breakout(&cpi->sf, &cpi->ref_frame_dist_info, x, args, + ref_best_rd, i)) { + continue; + } + mask_set_bit(&good_indices, i); + } + + // Only prune in NEARMV mode, if the speed feature is set, and the block size + // is large enough. If these conditions are not met, return all good indices + // found so far. + if (!cpi->sf.inter_sf.prune_mode_search_simple_translation) + return good_indices; + if (!have_nearmv_in_inter_mode(this_mode)) return good_indices; + if (num_pels_log2_lookup[bsize] <= 6) return good_indices; + // Do not prune when there is internal resizing. TODO(elliottk) fix this + // so b/2384 can be resolved. + if (av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[0])) || + (mbmi->ref_frame[1] > 0 && + av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[1])))) { + return good_indices; + } + + // Calculate the RD cost for the motion vectors using simple translation. + int64_t idx_rdcost[] = { INT64_MAX, INT64_MAX, INT64_MAX }; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + // If this index is bad, ignore it. + if (!mask_check_bit(good_indices, ref_mv_idx)) { + continue; + } + idx_rdcost[ref_mv_idx] = simple_translation_pred_rd( + cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize); + } + // Find the index with the best RD cost. + int best_idx = 0; + for (int i = 1; i < MAX_REF_MV_SEARCH; ++i) { + if (idx_rdcost[i] < idx_rdcost[best_idx]) { + best_idx = i; + } + } + // Only include indices that are good and within a % of the best. + const double dth = has_second_ref(mbmi) ? 1.05 : 1.001; + // If the simple translation cost is not within this multiple of the + // best RD, skip it. Note that the cutoff is derived experimentally. + const double ref_dth = 5; + int result = 0; + for (int i = 0; i < ref_set; ++i) { + if (mask_check_bit(good_indices, i) && + (1.0 * idx_rdcost[i]) / idx_rdcost[best_idx] < dth && + (1.0 * idx_rdcost[i]) / ref_best_rd < ref_dth) { + mask_set_bit(&result, i); + } + } + return result; +} + +/*!\brief Motion mode information for inter mode search speedup. + * + * Used in a speed feature to search motion modes other than + * SIMPLE_TRANSLATION only on winning candidates. + */ +typedef struct motion_mode_candidate { + /*! + * Mode info for the motion mode candidate. + */ + MB_MODE_INFO mbmi; + /*! + * Rate describing the cost of the motion vectors for this candidate. + */ + int rate_mv; + /*! + * Rate before motion mode search and transform coding is applied. + */ + int rate2_nocoeff; + /*! + * An integer value 0 or 1 which indicates whether or not to skip the motion + * mode search and default to SIMPLE_TRANSLATION as a speed feature for this + * candidate. + */ + int skip_motion_mode; + /*! + * Total RD cost for this candidate. 
+ */ + int64_t rd_cost; +} motion_mode_candidate; + +/*!\cond */ +typedef struct motion_mode_best_st_candidate { + motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES]; + int num_motion_mode_cand; +} motion_mode_best_st_candidate; + +// Checks if the current reference frame matches with neighbouring block's +// (top/left) reference frames +static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi, + MB_MODE_INFO *nb_mbmi) { + MV_REFERENCE_FRAME nb_ref_frames[2] = { nb_mbmi->ref_frame[0], + nb_mbmi->ref_frame[1] }; + MV_REFERENCE_FRAME cur_ref_frames[2] = { cur_mbmi->ref_frame[0], + cur_mbmi->ref_frame[1] }; + const int is_cur_comp_pred = has_second_ref(cur_mbmi); + int match_found = 0; + + for (int i = 0; i < (is_cur_comp_pred + 1); i++) { + if ((cur_ref_frames[i] == nb_ref_frames[0]) || + (cur_ref_frames[i] == nb_ref_frames[1])) + match_found = 1; + } + return match_found; +} + +static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols, + MACROBLOCKD *xd) { + if (!xd->up_available) return 1; + const int mi_col = xd->mi_col; + MB_MODE_INFO **cur_mbmi = xd->mi; + // prev_row_mi points into the mi array, starting at the beginning of the + // previous row. + MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; + const int end_col = AOMMIN(mi_col + xd->width, total_mi_cols); + uint8_t mi_step; + for (int above_mi_col = mi_col; above_mi_col < end_col; + above_mi_col += mi_step) { + MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; + mi_step = mi_size_wide[above_mi[0]->bsize]; + int match_found = 0; + if (is_inter_block(*above_mi)) + match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi); + if (match_found) return 1; + } + return 0; +} + +static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows, + MACROBLOCKD *xd) { + if (!xd->left_available) return 1; + const int mi_row = xd->mi_row; + MB_MODE_INFO **cur_mbmi = xd->mi; + // prev_col_mi points into the mi array, starting at the top of the + // previous column + MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; + const int end_row = AOMMIN(mi_row + xd->height, total_mi_rows); + uint8_t mi_step; + for (int left_mi_row = mi_row; left_mi_row < end_row; + left_mi_row += mi_step) { + MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; + mi_step = mi_size_high[left_mi[0]->bsize]; + int match_found = 0; + if (is_inter_block(*left_mi)) + match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi); + if (match_found) return 1; + } + return 0; +} +/*!\endcond */ + +/*! \brief Struct used to hold TPL data to + * narrow down parts of the inter mode search. + */ +typedef struct { + /*! + * The best inter cost out of all of the reference frames. + */ + int64_t best_inter_cost; + /*! + * The inter cost for each reference frame. 
+ */ + int64_t ref_inter_cost[INTER_REFS_PER_FRAME]; +} PruneInfoFromTpl; + +#if !CONFIG_REALTIME_ONLY +// TODO(Remya): Check if get_tpl_stats_b() can be reused +static AOM_INLINE void get_block_level_tpl_stats( + AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs, + PruneInfoFromTpl *inter_cost_info_from_tpl) { + AV1_COMMON *const cm = &cpi->common; + + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + if (!av1_tpl_stats_ready(tpl_data, tpl_idx)) return; + const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int tpl_stride = tpl_frame->stride; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + const int row_step = step; + const int col_step_sr = + coded_to_superres_mi(step, cm->superres_scale_denominator); + for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows); + row += row_step) { + for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr); + col += col_step_sr) { + const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + + // Sums up the inter cost of corresponding ref frames + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { + inter_cost_info_from_tpl->ref_inter_cost[ref_idx] += + this_stats->pred_error[ref_idx]; + } + } + } + + // Computes the best inter cost (minimum inter_cost) + int64_t best_inter_cost = INT64_MAX; + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { + const int64_t cur_inter_cost = + inter_cost_info_from_tpl->ref_inter_cost[ref_idx]; + // For invalid ref frames, cur_inter_cost = 0 and has to be handled while + // calculating the minimum inter_cost + if (cur_inter_cost != 0 && (cur_inter_cost < best_inter_cost) && + valid_refs[ref_idx]) + best_inter_cost = cur_inter_cost; + } + inter_cost_info_from_tpl->best_inter_cost = best_inter_cost; +} +#endif + +static AOM_INLINE int prune_modes_based_on_tpl_stats( + PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx, + const PREDICTION_MODE this_mode, int prune_mode_level) { + const int have_newmv = have_newmv_in_inter_mode(this_mode); + if ((prune_mode_level < 2) && have_newmv) return 0; + + const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost; + if (best_inter_cost == INT64_MAX) return 0; + + const int prune_level = prune_mode_level - 1; + int64_t cur_inter_cost; + + const int is_globalmv = + (this_mode == GLOBALMV) || (this_mode == GLOBAL_GLOBALMV); + const int prune_index = is_globalmv ? MAX_REF_MV_SEARCH : ref_mv_idx; + + // Thresholds used for pruning: + // Lower value indicates aggressive pruning and higher value indicates + // conservative pruning which is set based on ref_mv_idx and speed feature. + // 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1 and 2. 
prune_index + // 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV + static const int tpl_inter_mode_prune_mul_factor[3][MAX_REF_MV_SEARCH + 1] = { + { 6, 6, 6, 4 }, { 6, 4, 4, 4 }, { 5, 4, 4, 4 } + }; + + const int is_comp_pred = (refs[1] > INTRA_FRAME); + if (!is_comp_pred) { + cur_inter_cost = inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1]; + } else { + const int64_t inter_cost_ref0 = + inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1]; + const int64_t inter_cost_ref1 = + inter_cost_info_from_tpl->ref_inter_cost[refs[1] - 1]; + // Choose maximum inter_cost among inter_cost_ref0 and inter_cost_ref1 for + // more aggressive pruning + cur_inter_cost = AOMMAX(inter_cost_ref0, inter_cost_ref1); + } + + // Prune the mode if cur_inter_cost is greater than threshold times + // best_inter_cost + if (cur_inter_cost > + ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] * + best_inter_cost) >> + 2)) + return 1; + return 0; +} + +/*!\brief High level function to select parameters for compound mode. + * + * \ingroup inter_mode_search + * The main search functionality is done in the call to av1_compound_type_rd(). + * + * \param[in] cpi Top-level encoder structure. + * \param[in] x Pointer to struct holding all the data for + * the current macroblock. + * \param[in] args HandleInterModeArgs struct holding + * miscellaneous arguments for inter mode + * search. See the documentation for this + * struct for a description of each member. + * \param[in] ref_best_rd Best RD found so far for this block. + * It is used for early termination of this + * search if the RD exceeds this value. + * \param[in,out] cur_mv Current motion vector. + * \param[in] bsize Current block size. + * \param[in,out] compmode_interinter_cost RD of the selected interinter + compound mode. + * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all + * allocated buffers for the compound + * predictors and masks in the compound type + * search. + * \param[in,out] orig_dst A prediction buffer to hold a computed + * prediction. This will eventually hold the + * final prediction, and the tmp_dst info will + * be copied here. + * \param[in] tmp_dst A temporary prediction buffer to hold a + * computed prediction. + * \param[in,out] rate_mv The rate associated with the motion vectors. + * This will be modified if a motion search is + * done in the motion mode search. + * \param[in,out] rd_stats Struct to keep track of the overall RD + * information. + * \param[in,out] skip_rd An array of length 2 where skip_rd[0] is the + * best total RD for a skip mode so far, and + * skip_rd[1] is the best RD for a skip mode so + * far in luma. This is used as a speed feature + * to skip the transform search if the computed + * skip RD for the current mode is not better + * than the best skip_rd so far. + * \param[in,out] skip_build_pred Indicates whether or not to build the inter + * predictor. If this is 0, the inter predictor + * has already been built and thus we can avoid + * repeating computation. + * \return Returns 1 if this mode is worse than one already seen and 0 if it is + * a viable candidate. 
+ */ +static int process_compound_inter_mode( + AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args, + int64_t ref_best_rd, int_mv *cur_mv, BLOCK_SIZE bsize, + int *compmode_interinter_cost, const CompoundTypeRdBuffers *rd_buffers, + const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, int *rate_mv, + RD_STATS *rd_stats, int64_t *skip_rd, int *skip_build_pred) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const AV1_COMMON *cm = &cpi->common; + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params->enable_masked_compound; + int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) | + (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD); + + const int num_planes = av1_num_planes(cm); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int is_luma_interp_done = 0; + set_default_interp_filters(mbmi, cm->features.interp_filter); + + int64_t best_rd_compound; + int64_t rd_thresh; + const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT; + const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE; + rd_thresh = get_rd_thresh_from_best_rd(ref_best_rd, (1 << comp_type_rd_shift), + comp_type_rd_scale); + // Select compound type and any parameters related to that type + // (for example, the mask parameters if it is a masked mode) and compute + // the RD + *compmode_interinter_cost = av1_compound_type_rd( + cpi, x, args, bsize, cur_mv, mode_search_mask, masked_compound_used, + orig_dst, tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats, + ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh); + if (ref_best_rd < INT64_MAX && + (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale > + ref_best_rd) { + restore_dst_buf(xd, *orig_dst, num_planes); + return 1; + } + + // Build only uv predictor for COMPOUND_AVERAGE. + // Note there is no need to call av1_enc_build_inter_predictor + // for luma if COMPOUND_AVERAGE is selected because it is the first + // candidate in av1_compound_type_rd, which means it used the dst_buf + // rather than the tmp_buf. + if (mbmi->interinter_comp.type == COMPOUND_AVERAGE && is_luma_interp_done) { + if (num_planes > 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_U, num_planes - 1); + } + *skip_build_pred = 1; + } + return 0; +} + +// Speed feature to prune out MVs that are similar to previous MVs if they +// don't achieve the best RD advantage. +static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx, + int_mv save_mv[MAX_REF_MV_SEARCH - 1][2], + MB_MODE_INFO *mbmi, int pruning_factor) { + int i; + const int is_comp_pred = has_second_ref(mbmi); + const int thr = (1 + is_comp_pred) << (pruning_factor + 1); + + // Skip the evaluation if an MV match is found. + if (ref_mv_idx > 0) { + for (int idx = 0; idx < ref_mv_idx; ++idx) { + if (save_mv[idx][0].as_int == INVALID_MV) continue; + + int mv_diff = 0; + for (i = 0; i < 1 + is_comp_pred; ++i) { + mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) + + abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col); + } + + // If this mode is not the best one, and current MV is similar to + // previous stored MV, terminate this ref_mv_idx evaluation. 
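+ // For example, a single-reference mode (is_comp_pred == 0) searched with + // pruning_factor 1 gives thr = 1 << 2 = 4, so this ref_mv_idx is dropped + // when the summed |row| + |col| difference from a stored MV is at most 4, + // measured in the same sub-pel units as mbmi->mv.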
+ if (best_ref_mv_idx == -1 && mv_diff <= thr) return 1; + } + } + + if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) { + for (i = 0; i < is_comp_pred + 1; ++i) + save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int; + } + + return 0; +} + +/*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE + * + * \ingroup inter_mode_search + * + * Compares the sse of zero mv and the best sse found in single new_mv. If the + * sse of the zero_mv is higher, returns 1 to signal zero_mv can be skipped. + * Else returns 0. + * + * Note that the sse here comes from single_motion_search. So it is + * interpolated with the filter in motion search, not the actual interpolation + * filter used in encoding. + * + * \param[in] fn_ptr A table of function pointers to compute SSE. + * \param[in] x Pointer to struct holding all the data for + * the current macroblock. + * \param[in] bsize The current block_size. + * \param[in] args The args to handle_inter_mode, used to track + * the best SSE. + * \param[in] prune_zero_mv_with_sse The argument holds speed feature + * prune_zero_mv_with_sse value + * \return Returns 1 if zero_mv is pruned, 0 otherwise. + */ +static AOM_INLINE int prune_zero_mv_with_sse( + const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize, + const HandleInterModeArgs *args, int prune_zero_mv_with_sse) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + + const int is_comp_pred = has_second_ref(mbmi); + const MV_REFERENCE_FRAME *refs = mbmi->ref_frame; + + for (int idx = 0; idx < 1 + is_comp_pred; idx++) { + if (xd->global_motion[refs[idx]].wmtype != IDENTITY) { + // Pruning logic only works for IDENTITY type models + // Note: In theory we could apply similar logic for TRANSLATION + // type models, but we do not code these due to a spec bug + // (see comments in gm_get_motion_vector() in av1/common/mv.h) + assert(xd->global_motion[refs[idx]].wmtype != TRANSLATION); + return 0; + } + + // Don't prune if we have invalid data + assert(mbmi->mv[idx].as_int == 0); + if (args->best_single_sse_in_refs[refs[idx]] == INT32_MAX) { + return 0; + } + } + + // Sum up the sse of ZEROMV and best NEWMV + unsigned int this_sse_sum = 0; + unsigned int best_sse_sum = 0; + for (int idx = 0; idx < 1 + is_comp_pred; idx++) { + const struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + const struct macroblockd_plane *pd = xd->plane; + const struct buf_2d *src_buf = &p->src; + const struct buf_2d *ref_buf = &pd->pre[idx]; + const uint8_t *src = src_buf->buf; + const uint8_t *ref = ref_buf->buf; + const int src_stride = src_buf->stride; + const int ref_stride = ref_buf->stride; + + unsigned int this_sse; + fn_ptr[bsize].vf(ref, ref_stride, src, src_stride, &this_sse); + this_sse_sum += this_sse; + + const unsigned int best_sse = args->best_single_sse_in_refs[refs[idx]]; + best_sse_sum += best_sse; + } + + const double mul = prune_zero_mv_with_sse > 1 ? 1.00 : 1.25; + if ((double)this_sse_sum > (mul * (double)best_sse_sum)) { + return 1; + } + + return 0; +} + +/*!\brief Searches for interpolation filter in realtime mode during winner eval + * + * \ingroup inter_mode_search + * + * Does a simple interpolation filter search during winner mode evaluation. This + * is currently only used by realtime mode as \ref + * av1_interpolation_filter_search is not called during realtime encoding. + * + * This function only searches over two possible filters. EIGHTTAP_REGULAR is + * always searched. For lowres clips (<= 240p), MULTITAP_SHARP is also searched.
For + * higher res clips (>240p), EIGHTTAP_SMOOTH is also searched. + * + * \param[in] cpi Pointer to the compressor. Used for feature + * flags. + * \param[in,out] x Pointer to macroblock. This is primarily + * used to access the buffers. + * \param[in] mi_row The current row in mi unit (4X4 pixels). + * \param[in] mi_col The current col in mi unit (4X4 pixels). + * \param[in] bsize The current block_size. + * \return Returns true if a predictor is built in xd->dst, false otherwise. + */ +static AOM_INLINE bool fast_interp_search(const AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + static const InterpFilters filters_ref_set[3] = { + { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }, + { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH }, + { MULTITAP_SHARP, MULTITAP_SHARP } + }; + + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + int64_t best_cost = INT64_MAX; + int best_filter_index = -1; + // dst_bufs[0] stores the new predictor, and dst_bufs[1] stores the best + const int num_planes = av1_num_planes(cm); + const int is_240p_or_lesser = AOMMIN(cm->width, cm->height) <= 240; + assert(is_inter_mode(mi->mode)); + assert(mi->motion_mode == SIMPLE_TRANSLATION); + assert(!is_inter_compound_mode(mi->mode)); + + if (!av1_is_interp_needed(xd)) { + return false; + } + + struct macroblockd_plane *pd = xd->plane; + const BUFFER_SET orig_dst = { + { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, + { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, + }; + uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]); + const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } }; + const BUFFER_SET *dst_bufs[2] = { &orig_dst, &tmp_dst }; + + for (int i = 0; i < 3; ++i) { + if (is_240p_or_lesser) { + if (filters_ref_set[i].x_filter == EIGHTTAP_SMOOTH) { + continue; + } + } else { + if (filters_ref_set[i].x_filter == MULTITAP_SHARP) { + continue; + } + } + int64_t cost; + RD_STATS tmp_rd = { 0 }; + + mi->interp_filters.as_filters = filters_ref_set[i]; + av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); + + model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model + ? MODELRD_LEGACY + : MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, AOM_PLANE_Y, AOM_PLANE_Y, &tmp_rd.rate, &tmp_rd.dist, + &tmp_rd.skip_txfm, &tmp_rd.sse, NULL, NULL, NULL); + + tmp_rd.rate += av1_get_switchable_rate(x, xd, cm->features.interp_filter, + cm->seq_params->enable_dual_filter); + cost = RDCOST(x->rdmult, tmp_rd.rate, tmp_rd.dist); + if (cost < best_cost) { + best_filter_index = i; + best_cost = cost; + swap_dst_buf(xd, dst_bufs, num_planes); + } + } + assert(best_filter_index >= 0); + + mi->interp_filters.as_filters = filters_ref_set[best_filter_index]; + + const bool is_best_pred_in_orig = &orig_dst == dst_bufs[1]; + + if (is_best_pred_in_orig) { + swap_dst_buf(xd, dst_bufs, num_planes); + } else { + // Note that xd->pd's buffers are kept in sync with dst_bufs[0]. So if + // is_best_pred_in_orig is false, that means the current buffer is the + // original one.
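+ // swap_dst_buf() swaps dst_bufs[0] and dst_bufs[1] and repoints the + // xd->plane[] dst buffers at the new dst_bufs[0], so dst_bufs[1] always + // tracks the best predictor found so far. In this branch that best + // predictor lives in tmp_dst, so it is copied back into orig_dst below.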
+ assert(&orig_dst == dst_bufs[0]); + assert(xd->plane[AOM_PLANE_Y].dst.buf == orig_dst.plane[AOM_PLANE_Y]); + const int width = block_size_wide[bsize]; + const int height = block_size_high[bsize]; +#if CONFIG_AV1_HIGHBITDEPTH + const bool is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(tmp_dst.plane[AOM_PLANE_Y]), + tmp_dst.stride[AOM_PLANE_Y], + CONVERT_TO_SHORTPTR(orig_dst.plane[AOM_PLANE_Y]), + orig_dst.stride[AOM_PLANE_Y], width, height); + } else { + aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y], + orig_dst.plane[AOM_PLANE_Y], + orig_dst.stride[AOM_PLANE_Y], width, height); + } +#else + aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y], + orig_dst.plane[AOM_PLANE_Y], orig_dst.stride[AOM_PLANE_Y], + width, height); +#endif + } + + // Build the YUV predictor. + if (num_planes > 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_U, AOM_PLANE_V); + } + + return true; +} + +/*!\brief AV1 inter mode RD computation + * + * \ingroup inter_mode_search + * Do the RD search for a given inter mode and compute all information relevant + * to the input mode. It will compute the best MV, + * compound parameters (if the mode is a compound mode) and interpolation filter + * parameters. + * + * \param[in] cpi Top-level encoder structure. + * \param[in] tile_data Pointer to struct holding adaptive + * data/contexts/models for the tile during + * encoding. + * \param[in] x Pointer to structure holding all the data + * for the current macroblock. + * \param[in] bsize Current block size. + * \param[in,out] rd_stats Struct to keep track of the overall RD + * information. + * \param[in,out] rd_stats_y Struct to keep track of the RD information + * for only the Y plane. + * \param[in,out] rd_stats_uv Struct to keep track of the RD information + * for only the UV planes. + * \param[in] args HandleInterModeArgs struct holding + * miscellaneous arguments for inter mode + * search. See the documentation for this + * struct for a description of each member. + * \param[in] ref_best_rd Best RD found so far for this block. + * It is used for early termination of this + * search if the RD exceeds this value. + * \param[in] tmp_buf Temporary buffer used to hold predictors + * built in this search. + * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all + * allocated buffers for the compound + * predictors and masks in the compound type + * search. + * \param[in,out] best_est_rd Estimated RD for motion mode search if + * do_tx_search (see below) is 0. + * \param[in] do_tx_search Parameter to indicate whether or not to do + * a full transform search. This will compute + * an estimated RD for the modes without the + * transform search and later perform the full + * transform search on the best candidates. + * \param[in,out] inter_modes_info InterModesInfo struct to hold inter mode + * information to perform a full transform + * search only on winning candidates searched + * with an estimate for transform coding RD. + * \param[in,out] motion_mode_cand A motion_mode_candidate struct to store + * motion mode information used in a speed + * feature to search motion modes other than + * SIMPLE_TRANSLATION only on winning + * candidates. + * \param[in,out] skip_rd A length 2 array, where skip_rd[0] is the + * best total RD for a skip mode so far, and + * skip_rd[1] is the best RD for a skip mode so + * far in luma. 
This is used as a speed feature + * to skip the transform search if the computed + * skip RD for the current mode is not better + * than the best skip_rd so far. + * \param[in] inter_cost_info_from_tpl A PruneInfoFromTpl struct used to + * narrow down the search based on data + * collected in the TPL model. + * \param[out] yrd Stores the rdcost corresponding to encoding + * the luma plane. + * + * \return The RD cost for the mode being searched. + */ +static int64_t handle_inter_mode( + AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, HandleInterModeArgs *args, int64_t ref_best_rd, + uint8_t *const tmp_buf, const CompoundTypeRdBuffers *rd_buffers, + int64_t *best_est_rd, const int do_tx_search, + InterModesInfo *inter_modes_info, motion_mode_candidate *motion_mode_cand, + int64_t *skip_rd, PruneInfoFromTpl *inter_cost_info_from_tpl, + int64_t *yrd) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + +#if CONFIG_REALTIME_ONLY + const int prune_modes_based_on_tpl = 0; +#else // CONFIG_REALTIME_ONLY + const TplParams *const tpl_data = &cpi->ppi->tpl_data; + const int prune_modes_based_on_tpl = + cpi->sf.inter_sf.prune_inter_modes_based_on_tpl && + av1_tpl_stats_ready(tpl_data, cpi->gf_frame_index); +#endif // CONFIG_REALTIME_ONLY + int i; + // Reference frames for this mode + const int refs[2] = { mbmi->ref_frame[0], + (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; + int rate_mv = 0; + int64_t rd = INT64_MAX; + // Do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + struct macroblockd_plane *pd = xd->plane; + const BUFFER_SET orig_dst = { + { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, + { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, + }; + const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } }; + + int64_t ret_val = INT64_MAX; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; + int64_t best_rd = INT64_MAX; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + int64_t best_yrd = INT64_MAX; + MB_MODE_INFO best_mbmi = *mbmi; + int best_xskip_txfm = 0; + int64_t newmv_ret_val = INT64_MAX; + inter_mode_info mode_info[MAX_REF_MV_SEARCH]; + + // Do not prune the mode based on inter cost from tpl if the current ref frame + // is the winner ref in neighbouring blocks. + int ref_match_found_in_above_nb = 0; + int ref_match_found_in_left_nb = 0; + if (prune_modes_based_on_tpl) { + ref_match_found_in_above_nb = + find_ref_match_in_above_nbs(cm->mi_params.mi_cols, xd); + ref_match_found_in_left_nb = + find_ref_match_in_left_nbs(cm->mi_params.mi_rows, xd); + } + + // First, perform a simple translation search for each of the indices. If + // an index performs well, it will be fully searched in the main loop + // of this function. 
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + // Save MV results from first 2 ref_mv_idx. + int_mv save_mv[MAX_REF_MV_SEARCH - 1][2]; + int best_ref_mv_idx = -1; + const int idx_mask = + ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set); + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + const ModeCosts *mode_costs = &x->mode_costs; + const int ref_mv_cost = cost_mv_ref(mode_costs, this_mode, mode_ctx); + const int base_rate = + args->ref_frame_cost + args->single_comp_cost + ref_mv_cost; + + for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) { + save_mv[i][0].as_int = INVALID_MV; + save_mv[i][1].as_int = INVALID_MV; + } + args->start_mv_cnt = 0; + + // Main loop of this function. This will iterate over all of the ref mvs + // in the dynamic reference list and do the following: + // 1.) Get the current MV. Create newmv MV if necessary + // 2.) Search compound type and parameters if applicable + // 3.) Do interpolation filter search + // 4.) Build the inter predictor + // 5.) Pick the motion mode (SIMPLE_TRANSLATION, OBMC_CAUSAL, + // WARPED_CAUSAL) + // 6.) Update stats if best so far + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + mbmi->ref_mv_idx = ref_mv_idx; + + mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV; + mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX; + const int drl_cost = get_drl_cost( + mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); + mode_info[ref_mv_idx].drl_cost = drl_cost; + mode_info[ref_mv_idx].skip = 0; + + if (!mask_check_bit(idx_mask, ref_mv_idx)) { + // MV did not perform well in simple translation search. Skip it. + continue; + } + if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb && + !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) { + // Skip mode if TPL model indicates it will not be beneficial. + if (prune_modes_based_on_tpl_stats( + inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode, + cpi->sf.inter_sf.prune_inter_modes_based_on_tpl)) + continue; + } + av1_init_rd_stats(rd_stats); + + // Initialize compound mode data + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; + + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + + // Compute cost for signalling this DRL index + rd_stats->rate = base_rate; + rd_stats->rate += drl_cost; + + int rs = 0; + int compmode_interinter_cost = 0; + + int_mv cur_mv[2]; + + // TODO(Cherma): Extend this speed feature to support compound mode + int skip_repeated_ref_mv = + is_comp_pred ? 0 : cpi->sf.inter_sf.skip_repeated_ref_mv; + // Generate the current mv according to the prediction mode + if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) { + continue; + } + + // The above call to build_cur_mv does not handle NEWMV modes. Build + // the mv here if we have NEWMV for any predictors. 
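+ // When a NEWMV component is present, handle_newmv() below runs the motion + // search and fills in cur_mv and rate_mv. As a worked example of the + // skip_newmv_mode_based_on_sse check inside that block: for a 16x16 block, + // num_pels_log2_lookup[bsize] is 8, so pix_idx is 4, and with + // skip_newmv_mode_based_on_sse == 1 (th_idx 0) the NEWMV candidate is + // skipped when args->best_pred_sse < 0.7 * this_sse.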
+ if (have_newmv_in_inter_mode(this_mode)) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_newmv_time); +#endif + newmv_ret_val = + handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_newmv_time); +#endif + + if (newmv_ret_val != 0) continue; + + if (is_inter_singleref_mode(this_mode) && + cur_mv[0].as_int != INVALID_MV) { + const MV_REFERENCE_FRAME ref = refs[0]; + const unsigned int this_sse = x->pred_sse[ref]; + if (this_sse < args->best_single_sse_in_refs[ref]) { + args->best_single_sse_in_refs[ref] = this_sse; + } + + if (cpi->sf.rt_sf.skip_newmv_mode_based_on_sse) { + const int th_idx = cpi->sf.rt_sf.skip_newmv_mode_based_on_sse - 1; + const int pix_idx = num_pels_log2_lookup[bsize] - 4; + const double scale_factor[3][11] = { + { 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9 }, + { 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 1, 1, 1, 1, 1 }, + { 0.7, 0.7, 0.7, 0.7, 1, 1, 1, 1, 1, 1, 1 } + }; + assert(pix_idx >= 0); + assert(th_idx <= 2); + if (args->best_pred_sse < scale_factor[th_idx][pix_idx] * this_sse) + continue; + } + } + + rd_stats->rate += rate_mv; + } + // Copy the motion vector for this mode into mbmi struct + for (i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + continue; + } + + // Skip the rest of the search if prune_ref_mv_idx_search speed feature + // is enabled, and the current MV is similar to a previous one. + if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred && + prune_ref_mv_idx_search(ref_mv_idx, best_ref_mv_idx, save_mv, mbmi, + cpi->sf.inter_sf.prune_ref_mv_idx_search)) + continue; + + if (cpi->sf.gm_sf.prune_zero_mv_with_sse && + (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) { + if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args, + cpi->sf.gm_sf.prune_zero_mv_with_sse)) { + continue; + } + } + + int skip_build_pred = 0; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Handle a compound predictor, continue if it is determined this + // cannot be the best compound mode + if (is_comp_pred) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, compound_type_rd_time); +#endif + const int not_best_mode = process_compound_inter_mode( + cpi, x, args, ref_best_rd, cur_mv, bsize, &compmode_interinter_cost, + rd_buffers, &orig_dst, &tmp_dst, &rate_mv, rd_stats, skip_rd, + &skip_build_pred); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, compound_type_rd_time); +#endif + if (not_best_mode) continue; + } + + if (!args->skip_ifs) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interpolation_filter_search_time); +#endif + // Determine the interpolation filter for this mode + ret_val = av1_interpolation_filter_search( + x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs, + &skip_build_pred, args, ref_best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interpolation_filter_search_time); +#endif + if (args->modelled_rd != NULL && !is_comp_pred) { + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd; + } + if (ret_val != 0) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout && + ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + + // Compute modelled RD if enabled + if (args->modelled_rd != 
NULL) { + if (is_comp_pred) { + const int mode0 = compound_ref0_mode(this_mode); + const int mode1 = compound_ref1_mode(this_mode); + const int64_t mrd = + AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]], + args->modelled_rd[mode1][ref_mv_idx][refs[1]]); + if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } + } + } + + rd_stats->rate += compmode_interinter_cost; + if (skip_build_pred != 1) { + // Build this inter predictor if it has not been previously built + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0, + av1_num_planes(cm) - 1); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, motion_mode_rd_time); +#endif + int rate2_nocoeff = rd_stats->rate; + // Determine the motion mode. This will be one of SIMPLE_TRANSLATION, + // OBMC_CAUSAL or WARPED_CAUSAL + int64_t this_yrd; + ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y, + rd_stats_uv, args, ref_best_rd, skip_rd, &rate_mv, + &orig_dst, best_est_rd, do_tx_search, + inter_modes_info, 0, &this_yrd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, motion_mode_rd_time); +#endif + assert( + IMPLIES(!av1_check_newmv_joint_nonzero(cm, x), ret_val == INT64_MAX)); + + if (ret_val != INT64_MAX) { + int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + const THR_MODES mode_enum = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + // Collect mode stats for multiwinner mode processing + store_winner_mode_stats(&cpi->common, x, mbmi, rd_stats, rd_stats_y, + rd_stats_uv, mode_enum, NULL, bsize, tmp_rd, + cpi->sf.winner_mode_sf.multi_winner_mode_type, + do_tx_search); + if (tmp_rd < best_rd) { + best_yrd = this_yrd; + // Update the best rd stats if we found the best mode so far + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rd_stats_uv = *rd_stats_uv; + best_rd = tmp_rd; + best_mbmi = *mbmi; + best_xskip_txfm = txfm_info->skip_txfm; + memcpy(best_blk_skip, txfm_info->blk_skip, + sizeof(best_blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, + xd->height * xd->width); + motion_mode_cand->rate_mv = rate_mv; + motion_mode_cand->rate2_nocoeff = rate2_nocoeff; + } + + if (tmp_rd < ref_best_rd) { + ref_best_rd = tmp_rd; + best_ref_mv_idx = ref_mv_idx; + } + } + restore_dst_buf(xd, orig_dst, num_planes); + } + + if (best_rd == INT64_MAX) return INT64_MAX; + + // re-instate status of the best choice + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + *rd_stats_uv = best_rd_stats_uv; + *yrd = best_yrd; + *mbmi = best_mbmi; + txfm_info->skip_txfm = best_xskip_txfm; + assert(IMPLIES(mbmi->comp_group_idx == 1, + mbmi->interinter_comp.type != COMPOUND_AVERAGE)); + memcpy(txfm_info->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width); + + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + + return rd_stats->rdcost; +} + +/*!\brief Search for the best intrabc predictor + * + * \ingroup intra_mode_search + * \callergraph + * This function performs a motion search to find the best intrabc predictor. + * + * \returns Returns the best overall rdcost (including the non-intrabc modes + * search before this function). 
+ */ +static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, + PICK_MODE_CONTEXT *ctx, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc || + !cpi->sf.mv_sf.use_intrabc || cpi->sf.rt_sf.use_nonrd_pick_mode) + return INT64_MAX; + const int num_planes = av1_num_planes(cm); + + MACROBLOCKD *const xd = &x->e_mbd; + const TileInfo *tile = &xd->tile; + MB_MODE_INFO *mbmi = xd->mi[0]; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int sb_row = mi_row >> cm->seq_params->mib_size_log2; + const int sb_col = mi_col >> cm->seq_params->mib_size_log2; + + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + int_mv nearestmv, nearmv; + av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv, + 0); + + if (nearestmv.as_int == INVALID_MV) { + nearestmv.as_int = 0; + } + if (nearmv.as_int == INVALID_MV) { + nearmv.as_int = 0; + } + + int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; + if (dv_ref.as_int == 0) { + av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row); + } + // Ref DV should not have sub-pel. + assert((dv_ref.as_mv.col & 7) == 0); + assert((dv_ref.as_mv.row & 7) == 0); + mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref; + + struct buf_2d yv12_mb[MAX_MB_PLANE]; + av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, NULL, NULL, num_planes); + for (int i = 0; i < num_planes; ++i) { + xd->plane[i].pre[0] = yv12_mb[i]; + } + + enum IntrabcMotionDirection { + IBC_MOTION_ABOVE, + IBC_MOTION_LEFT, + IBC_MOTION_DIRECTIONS + }; + + MB_MODE_INFO best_mbmi = *mbmi; + RD_STATS best_rdstats = *rd_stats; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + + FULLPEL_MOTION_SEARCH_PARAMS fullms_params; + const SEARCH_METHODS search_method = + av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize); + const search_site_config *lookahead_search_sites = + cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD]; + const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv); + av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize, + &dv_ref.as_mv, start_mv, + lookahead_search_sites, search_method, + /*fine_search_interval=*/0); + const IntraBCMVCosts *const dv_costs = x->dv_costs; + av1_set_ms_to_intra_mode(&fullms_params, dv_costs); + + for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; + dir < IBC_MOTION_DIRECTIONS; ++dir) { + switch (dir) { + case IBC_MOTION_ABOVE: + fullms_params.mv_limits.col_min = + (tile->mi_col_start - mi_col) * MI_SIZE; + fullms_params.mv_limits.col_max = + (tile->mi_col_end - mi_col) * MI_SIZE - w; + fullms_params.mv_limits.row_min = + (tile->mi_row_start - mi_row) * MI_SIZE; + fullms_params.mv_limits.row_max = + (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h; + break; + case 
IBC_MOTION_LEFT: + fullms_params.mv_limits.col_min = + (tile->mi_col_start - mi_col) * MI_SIZE; + fullms_params.mv_limits.col_max = + (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w; + // TODO(aconverse@google.com): Minimize the overlap between above and + // left areas. + fullms_params.mv_limits.row_min = + (tile->mi_row_start - mi_row) * MI_SIZE; + int bottom_coded_mi_edge = + AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end); + fullms_params.mv_limits.row_max = + (bottom_coded_mi_edge - mi_row) * MI_SIZE - h; + break; + default: assert(0); + } + assert(fullms_params.mv_limits.col_min >= fullms_params.mv_limits.col_min); + assert(fullms_params.mv_limits.col_max <= fullms_params.mv_limits.col_max); + assert(fullms_params.mv_limits.row_min >= fullms_params.mv_limits.row_min); + assert(fullms_params.mv_limits.row_max <= fullms_params.mv_limits.row_max); + + av1_set_mv_search_range(&fullms_params.mv_limits, &dv_ref.as_mv); + + if (fullms_params.mv_limits.col_max < fullms_params.mv_limits.col_min || + fullms_params.mv_limits.row_max < fullms_params.mv_limits.row_min) { + continue; + } + + const int step_param = cpi->mv_search_params.mv_step_param; + IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info; + int_mv best_mv, best_hash_mv; + FULLPEL_MV_STATS best_mv_stats; + + int bestsme = + av1_full_pixel_search(start_mv, &fullms_params, step_param, NULL, + &best_mv.as_fullmv, &best_mv_stats, NULL); + const int hashsme = av1_intrabc_hash_search( + cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv); + if (hashsme < bestsme) { + best_mv = best_hash_mv; + bestsme = hashsme; + } + + if (bestsme == INT_MAX) continue; + const MV dv = get_mv_from_fullmv(&best_mv.as_fullmv); + if (!av1_is_fullmv_in_range(&fullms_params.mv_limits, + get_fullmv_from_mv(&dv))) + continue; + if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize, + cm->seq_params->mib_size_log2)) + continue; + + // DV should not have sub-pel. + assert((dv.col & 7) == 0); + assert((dv.row & 7) == 0); + memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info)); + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->use_intrabc = 1; + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->mv[0].as_mv = dv; + mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + mbmi->skip_txfm = 0; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + + // TODO(aconverse@google.com): The full motion field defining discount + // in MV_COST_WEIGHT is too large. Explore other values. 
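+ // av1_mv_bit_cost() charges the DV like a regular motion vector: the cost + // of coding (dv - dv_ref) from the joint_mv and dv_costs tables, scaled by + // MV_COST_WEIGHT_SUB. intrabc_cost[1] below is the additional cost of + // signalling that this block uses IntraBC.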
+ const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv, + dv_costs->dv_costs, MV_COST_WEIGHT_SUB); + const int rate_mode = x->mode_costs.intrabc_cost[1]; + RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv; + if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y, + &rd_stats_uv, rate_mode + rate_mv, INT64_MAX)) + continue; + rd_stats_yuv.rdcost = + RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist); + if (rd_stats_yuv.rdcost < best_rd) { + best_rd = rd_stats_yuv.rdcost; + best_mbmi = *mbmi; + best_rdstats = rd_stats_yuv; + memcpy(best_blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); + } + } + *mbmi = best_mbmi; + *rd_stats = best_rdstats; + memcpy(txfm_info->blk_skip, best_blk_skip, + sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); +#if CONFIG_RD_DEBUG + mbmi->rd_stats = *rd_stats; +#endif + return best_rd; +} + +// TODO(chiyotsai@google.com): We are using struct $struct_name instead of their +// typedef here because Doxygen doesn't know about the typedefs yet. So using +// the typedef will prevent doxygen from finding this function and generating +// the callgraph. Once documents for AV1_COMP and MACROBLOCK are added to +// doxygen, we can revert back to using the typedefs. +void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x, + struct RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; + uint8_t y_skip_txfm = 0, uv_skip_txfm = 0; + int64_t dist_y = 0, dist_uv = 0; + + ctx->rd_stats.skip_txfm = 0; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->use_intrabc = 0; + mbmi->mv[0].as_int = 0; + mbmi->skip_mode = 0; + + const int64_t intra_yrd = + av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, + &y_skip_txfm, bsize, best_rd, ctx); + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + if (intra_yrd < best_rd) { + // Search intra modes for uv planes if needed + if (num_planes > 1) { + // Set up the tx variables for reproducing the y predictions in case we + // need it for chroma-from-luma. 
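+ // Chroma-from-luma derives its chroma prediction from the reconstructed + // luma pixels, so the luma block-skip and tx-type decisions from the Y + // search must be restored before the UV mode search below.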
+ if (xd->is_chroma_ref && store_cfl_required_rdo(cm, x)) { + memcpy(txfm_info->blk_skip, ctx->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk); + } + const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + &dist_uv, &uv_skip_txfm, bsize, + max_uv_tx_size); + } + + // Intra block is always coded as non-skip + rd_cost->rate = + rate_y + rate_uv + + x->mode_costs.skip_txfm_cost[av1_get_skip_txfm_context(xd)][0]; + rd_cost->dist = dist_y + dist_uv; + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + rd_cost->skip_txfm = 0; + } else { + rd_cost->rate = INT_MAX; + } + + if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) + best_rd = rd_cost->rdcost; + if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) { + ctx->rd_stats.skip_txfm = mbmi->skip_txfm; + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + assert(rd_cost->rate != INT_MAX); + } + if (rd_cost->rate == INT_MAX) return; + + ctx->mic = *xd->mi[0]; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); +} + +static AOM_INLINE void calc_target_weighted_pred( + const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd, + const uint8_t *above, int above_stride, const uint8_t *left, + int left_stride); + +static AOM_INLINE void rd_pick_skip_mode( + RD_STATS *rd_cost, InterModeSearchState *search_state, + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + x->compound_idx = 1; // COMPOUND_AVERAGE + RD_STATS skip_mode_rd_stats; + av1_invalid_rd_stats(&skip_mode_rd_stats); + + if (skip_mode_info->ref_frame_idx_0 == INVALID_IDX || + skip_mode_info->ref_frame_idx_1 == INVALID_IDX) { + return; + } + + const MV_REFERENCE_FRAME ref_frame = + LAST_FRAME + skip_mode_info->ref_frame_idx_0; + const MV_REFERENCE_FRAME second_ref_frame = + LAST_FRAME + skip_mode_info->ref_frame_idx_1; + const PREDICTION_MODE this_mode = NEAREST_NEARESTMV; + const THR_MODES mode_index = + get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame); + + if (mode_index == THR_INVALID) { + return; + } + + if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp || + cpi->sf.inter_sf.disable_onesided_comp) && + cpi->all_one_sided_refs) { + return; + } + + mbmi->mode = this_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + if (x->mbmi_ext.ref_mv_count[ref_frame_type] == UINT8_MAX) { + MB_MODE_INFO_EXT *mbmi_ext = &x->mbmi_ext; + if (mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX || + mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) { + return; + } + av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. 
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_type); + } + + assert(this_mode == NEAREST_NEARESTMV); + if (!build_cur_mv(mbmi->mv, this_mode, cm, x, 0)) { + return; + } + + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + mbmi->comp_group_idx = 0; + mbmi->compound_idx = x->compound_idx; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = 0; + mbmi->skip_mode = mbmi->skip_txfm = 1; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + + set_default_interp_filters(mbmi, cm->features.interp_filter); + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + BUFFER_SET orig_dst; + for (int i = 0; i < num_planes; i++) { + orig_dst.plane[i] = xd->plane[i].dst.buf; + orig_dst.stride[i] = xd->plane[i].dst.stride; + } + + // Compare the use of skip_mode with the best intra/inter mode obtained. + const int skip_mode_ctx = av1_get_skip_mode_context(xd); + int64_t best_intra_inter_mode_cost = INT64_MAX; + if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) { + const ModeCosts *mode_costs = &x->mode_costs; + best_intra_inter_mode_cost = RDCOST( + x->rdmult, rd_cost->rate + mode_costs->skip_mode_cost[skip_mode_ctx][0], + rd_cost->dist); + // Account for non-skip mode rate in total rd stats + rd_cost->rate += mode_costs->skip_mode_cost[skip_mode_ctx][0]; + av1_rd_cost_update(x->rdmult, rd_cost); + } + + // Obtain the rdcost for skip_mode. + skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, &orig_dst, + best_intra_inter_mode_cost); + + if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost && + (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) { + assert(mode_index != THR_INVALID); + search_state->best_mbmode.skip_mode = 1; + search_state->best_mbmode = *mbmi; + memset(search_state->best_mbmode.inter_tx_size, + search_state->best_mbmode.tx_size, + sizeof(search_state->best_mbmode.inter_tx_size)); + set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height, + search_state->best_mbmode.skip_txfm && is_inter_block(mbmi), + xd); + search_state->best_mode_index = mode_index; + + // Update rd_cost + rd_cost->rate = skip_mode_rd_stats.rate; + rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist; + rd_cost->rdcost = skip_mode_rd_stats.rdcost; + + search_state->best_rd = rd_cost->rdcost; + search_state->best_skip2 = 1; + search_state->best_mode_skippable = 1; + + x->txfm_search_info.skip_txfm = 1; + } +} + +// Get winner mode stats of given mode index +static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats( + MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost, + int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index, + RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv, + THR_MODES *winner_mode_index, MULTI_WINNER_MODE_TYPE multi_winner_mode_type, + int mode_idx) { + MB_MODE_INFO *winner_mbmi; + if (multi_winner_mode_type) { + assert(mode_idx >= 0 && mode_idx < x->winner_mode_count); + WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx]; + winner_mbmi = &winner_mode_stat->mbmi; + + *winner_rd_cost = &winner_mode_stat->rd_cost; + *winner_rate_y = winner_mode_stat->rate_y; + *winner_rate_uv = winner_mode_stat->rate_uv; + *winner_mode_index = winner_mode_stat->mode_index; + 
} else { + winner_mbmi = best_mbmode; + *winner_rd_cost = best_rd_cost; + *winner_rate_y = best_rate_y; + *winner_rate_uv = best_rate_uv; + *winner_mode_index = *best_mode_index; + } + return winner_mbmi; +} + +// speed feature: fast intra/inter transform type search +// Used for speed >= 2 +// When this speed feature is on, in rd mode search, only DCT is used. +// After the mode is determined, this function is called, to select +// transform types and get accurate rdcost. +static AOM_INLINE void refine_winner_mode_tx( + const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index, + MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], + int best_rate_y, int best_rate_uv, int *best_skip2, int winner_mode_count) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + TxfmSearchParams *txfm_params = &x->txfm_search_params; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + int64_t best_rd; + const int num_planes = av1_num_planes(cm); + + if (!is_winner_mode_processing_enabled(cpi, x, best_mbmode, + rd_cost->skip_txfm)) + return; + + // Set params for winner mode evaluation + set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); + + // No best mode identified so far + if (*best_mode_index == THR_INVALID) return; + + best_rd = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + for (int mode_idx = 0; mode_idx < winner_mode_count; mode_idx++) { + RD_STATS *winner_rd_stats = NULL; + int winner_rate_y = 0, winner_rate_uv = 0; + THR_MODES winner_mode_index = 0; + + // TODO(any): Combine best mode and multi-winner mode processing paths + // Get winner mode stats for current mode index + MB_MODE_INFO *winner_mbmi = get_winner_mode_stats( + x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index, + &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index, + cpi->sf.winner_mode_sf.multi_winner_mode_type, mode_idx); + + if (xd->lossless[winner_mbmi->segment_id] == 0 && + winner_mode_index != THR_INVALID && + is_winner_mode_processing_enabled(cpi, x, winner_mbmi, + rd_cost->skip_txfm)) { + RD_STATS rd_stats = *winner_rd_stats; + int skip_blk = 0; + RD_STATS rd_stats_y, rd_stats_uv; + const int skip_ctx = av1_get_skip_txfm_context(xd); + + *mbmi = *winner_mbmi; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (has_second_ref(mbmi)) + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + if (is_inter_mode(mbmi->mode)) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + bool is_predictor_built = false; + const PREDICTION_MODE prediction_mode = mbmi->mode; + // Do interpolation filter search for realtime mode if applicable. 
+ if (cpi->sf.winner_mode_sf.winner_mode_ifs && + cpi->oxcf.mode == REALTIME && + cm->current_frame.reference_mode == SINGLE_REFERENCE && + is_inter_mode(prediction_mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION && + !is_inter_compound_mode(prediction_mode)) { + is_predictor_built = + fast_interp_search(cpi, x, mi_row, mi_col, bsize); + } + if (!is_predictor_built) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + } + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd); + + av1_subtract_plane(x, bsize, 0); + if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id]) { + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + assert(rd_stats_y.rate != INT_MAX); + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + memset(mbmi->inter_tx_size, mbmi->tx_size, + sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->height * xd->width; ++i) + set_blk_skip(txfm_info->blk_skip, 0, i, rd_stats_y.skip_txfm); + } + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + } + + if (num_planes > 1) { + av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + } else { + av1_init_rd_stats(&rd_stats_uv); + } + + const ModeCosts *mode_costs = &x->mode_costs; + if (is_inter_mode(mbmi->mode) && + RDCOST(x->rdmult, + mode_costs->skip_txfm_cost[skip_ctx][0] + rd_stats_y.rate + + rd_stats_uv.rate, + (rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, mode_costs->skip_txfm_cost[skip_ctx][1], + (rd_stats_y.sse + rd_stats_uv.sse))) { + skip_blk = 1; + rd_stats_y.rate = mode_costs->skip_txfm_cost[skip_ctx][1]; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + } else { + skip_blk = 0; + rd_stats_y.rate += mode_costs->skip_txfm_cost[skip_ctx][0]; + } + int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate - + winner_rate_y - winner_rate_uv; + int64_t this_rd = + RDCOST(x->rdmult, this_rate, (rd_stats_y.dist + rd_stats_uv.dist)); + if (best_rd > this_rd) { + *best_mbmode = *mbmi; + *best_mode_index = winner_mode_index; + av1_copy_array(ctx->blk_skip, txfm_info->blk_skip, ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + rd_cost->rate = this_rate; + rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; + rd_cost->sse = rd_stats_y.sse + rd_stats_uv.sse; + rd_cost->rdcost = this_rd; + best_rd = this_rd; + *best_skip2 = skip_blk; + } + } + } +} + +/*!\cond */ +typedef struct { + // Mask for each reference frame, specifying which prediction modes to NOT try + // during search. + uint32_t pred_modes[REF_FRAMES]; + // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of + // reference frames (i, j). + // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1 + // (NONE_FRAME). + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]; +} mode_skip_mask_t; +/*!\endcond */ + +// Update 'ref_combo' mask to disable given 'ref' in single and compound modes. +static AOM_INLINE void disable_reference( + MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + ref_combo[ref][ref2 + 1] = true; + } +} + +// Update 'ref_combo' mask to disable all inter references except ALTREF. 
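+// After this call only combinations involving ALTREF_FRAME (and INTRA_FRAME) +// remain enabled; LAST, LAST2, LAST3, GOLDEN, BWDREF and ALTREF2 are masked +// off in both single and compound prediction.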
+static AOM_INLINE void disable_inter_references_except_altref( + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + disable_reference(LAST_FRAME, ref_combo); + disable_reference(LAST2_FRAME, ref_combo); + disable_reference(LAST3_FRAME, ref_combo); + disable_reference(GOLDEN_FRAME, ref_combo); + disable_reference(BWDREF_FRAME, ref_combo); + disable_reference(ALTREF2_FRAME, ref_combo); +} + +static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = { + { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME }, + { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, + { LAST_FRAME, GOLDEN_FRAME }, { LAST_FRAME, INTRA_FRAME }, + { LAST_FRAME, BWDREF_FRAME }, { LAST_FRAME, LAST3_FRAME }, + { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME }, + { BWDREF_FRAME, NONE_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, + { ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME }, +}; + +typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET; + +static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask, + REF_SET ref_set) { + if (ref_set == REF_SET_FULL) { + // Everything available by default. + memset(mask, 0, sizeof(*mask)); + } else { + // All modes available by default. + memset(mask->pred_modes, 0, sizeof(mask->pred_modes)); + // All references disabled first. + for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + mask->ref_combo[ref1][ref2 + 1] = true; + } + } + const MV_REFERENCE_FRAME(*ref_set_combos)[2]; + int num_ref_combos; + + // Then enable reduced set of references explicitly. + switch (ref_set) { + case REF_SET_REDUCED: + ref_set_combos = reduced_ref_combos; + num_ref_combos = + (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]); + break; + case REF_SET_REALTIME: + ref_set_combos = real_time_ref_combos; + num_ref_combos = + (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]); + break; + default: assert(0); num_ref_combos = 0; + } + + for (int i = 0; i < num_ref_combos; ++i) { + const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i]; + mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false; + } + } +} + +static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + unsigned char segment_id = mbmi->segment_id; + const SPEED_FEATURES *const sf = &cpi->sf; + const INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf; + REF_SET ref_set = REF_SET_FULL; + + if (sf->rt_sf.use_real_time_ref_set) + ref_set = REF_SET_REALTIME; + else if (cpi->oxcf.ref_frm_cfg.enable_reduced_reference_set) + ref_set = REF_SET_REDUCED; + + default_skip_mask(mask, ref_set); + + int min_pred_mv_sad = INT_MAX; + MV_REFERENCE_FRAME ref_frame; + if (ref_set == REF_SET_REALTIME) { + // For real-time encoding, we only look at a subset of ref frames. So the + // threshold for pruning should be computed from this subset as well. 
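+ // min_pred_mv_sad is the baseline used below to flag poor references: a + // reference whose pred_mv_sad is more than roughly 4x this minimum (the + // '>> 2' check further down) has its fixed-MV inter modes masked out.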
+ const int num_rt_refs = + sizeof(real_time_ref_combos) / sizeof(*real_time_ref_combos); + for (int r_idx = 0; r_idx < num_rt_refs; r_idx++) { + const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0]; + if (ref != INTRA_FRAME) { + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref]); + } + } + } else { + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]); + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) { + // Skip checking missing reference in both single and compound reference + // modes. + disable_reference(ref_frame, mask->ref_combo); + } else { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) { + mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + } + } + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + // Reference not used for the segment. + disable_reference(ref_frame, mask->ref_combo); + } + } + // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature + // is disabled for this segment. This is to prevent the possibility that we + // end up unable to pick any mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && + (cpi->oxcf.algo_cfg.arnr_max_frames == 0)) { + disable_inter_references_except_altref(mask->ref_combo); + + mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; + const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME }; + int_mv near_mv, nearest_mv, global_mv; + get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames, + &x->mbmi_ext); + get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext); + get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext); + + if (near_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV); + if (nearest_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV); + } + } + + if (cpi->rc.is_src_frame_alt_ref) { + if (inter_sf->alt_ref_search_fp && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) { + mask->pred_modes[ALTREF_FRAME] = 0; + disable_inter_references_except_altref(mask->ref_combo); + disable_reference(INTRA_FRAME, mask->ref_combo); + } + } + + if (inter_sf->alt_ref_search_fp) { + if (!cm->show_frame && x->best_pred_mv_sad[0] < INT_MAX) { + int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 3); + // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if + // those are past frames + MV_REFERENCE_FRAME start_frame = + inter_sf->alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME; + for (ref_frame = start_frame; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] < + 0) { + // Prune inter modes when relative dist of ALTREF2 and ALTREF is close + // to the relative dist of LAST_FRAME. 
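+ // For example, with alt_ref_search_fp == 1: if LAST_FRAME is 2 frames away + // and ALTREF2_FRAME is 3 frames away, |3| <= 1.5 * |2| holds, so ALTREF2 + // modes are pruned once its pred_mv_sad exceeds sad_thresh; at 4 frames + // away the check below hits 'continue' and ALTREF2 is left untouched.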
+ if (inter_sf->alt_ref_search_fp == 1 && + (abs(cpi->ref_frame_dist_info + .ref_relative_dist[ref_frame - LAST_FRAME]) > + 1.5 * abs(cpi->ref_frame_dist_info + .ref_relative_dist[LAST_FRAME - LAST_FRAME]))) { + continue; + } + if (x->pred_mv_sad[ref_frame] > sad_thresh) + mask->pred_modes[ref_frame] |= INTER_ALL; + } + } + } + } + + if (sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) { + if (x->best_pred_mv_sad[0] < INT_MAX) { + int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 1); + const int prune_ref_list[2] = { GOLDEN_FRAME, ALTREF_FRAME }; + + // Conservatively skip the modes w.r.t. GOLDEN and ALTREF references + for (int ref_idx = 0; ref_idx < 2; ref_idx++) { + ref_frame = prune_ref_list[ref_idx]; + if (x->pred_mv_sad[ref_frame] > sad_thresh) + mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + } + } + } + + if (bsize > sf->part_sf.max_intra_bsize) { + disable_reference(INTRA_FRAME, mask->ref_combo); + } + + if (!cpi->oxcf.tool_cfg.enable_global_motion) { + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + mask->pred_modes[ref_frame] |= (1 << GLOBALMV); + mask->pred_modes[ref_frame] |= (1 << GLOBAL_GLOBALMV); + } + } + + mask->pred_modes[INTRA_FRAME] |= + ~(uint32_t)sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]; + + // Prune reference frames which are not the closest to the current + // frame and with large pred_mv_sad. + if (inter_sf->prune_single_ref) { + assert(inter_sf->prune_single_ref > 0 && inter_sf->prune_single_ref < 3); + const double prune_threshes[2] = { 1.20, 1.05 }; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefFrameDistanceInfo *const ref_frame_dist_info = + &cpi->ref_frame_dist_info; + const int is_closest_ref = + (ref_frame == ref_frame_dist_info->nearest_past_ref) || + (ref_frame == ref_frame_dist_info->nearest_future_ref); + + if (!is_closest_ref) { + const int dir = + (ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] < 0) + ? 
0 + : 1; + if (x->best_pred_mv_sad[dir] < INT_MAX && + x->pred_mv_sad[ref_frame] > + prune_threshes[inter_sf->prune_single_ref - 1] * + x->best_pred_mv_sad[dir]) + mask->pred_modes[ref_frame] |= INTER_SINGLE_ALL; + } + } + } +} + +static AOM_INLINE void init_neighbor_pred_buf( + const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args, + int is_hbd) { + if (is_hbd) { + const int len = sizeof(uint16_t); + args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred); + args->above_pred_buf[1] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + + (MAX_SB_SQUARE >> 1) * len); + args->above_pred_buf[2] = + CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + MAX_SB_SQUARE * len); + args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred); + args->left_pred_buf[1] = + CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1) * len); + args->left_pred_buf[2] = + CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + MAX_SB_SQUARE * len); + } else { + args->above_pred_buf[0] = obmc_buffer->above_pred; + args->above_pred_buf[1] = obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1); + args->above_pred_buf[2] = obmc_buffer->above_pred + MAX_SB_SQUARE; + args->left_pred_buf[0] = obmc_buffer->left_pred; + args->left_pred_buf[1] = obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1); + args->left_pred_buf[2] = obmc_buffer->left_pred + MAX_SB_SQUARE; + } +} + +static AOM_INLINE int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x, + MV_REFERENCE_FRAME ref_frame) { + const AV1_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + + if ((cpi->prune_ref_frame_mask >> ref_frame) & 1) return 1; + + if (prune_ref_by_selective_ref_frame(cpi, x, rf, + cm->cur_frame->ref_display_order_hint)) { + return 1; + } + + return 0; +} + +static AOM_INLINE int is_ref_frame_used_by_compound_ref( + int ref_frame, int skip_ref_frame_mask) { + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_frame || rf[1] == ref_frame) { + return 1; + } + } + } + return 0; +} + +static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame, + const MB_MODE_INFO *mi_cache) { + if (!mi_cache) { + return 0; + } + + if (ref_frame < REF_FRAMES) { + return (ref_frame == mi_cache->ref_frame[0] || + ref_frame == mi_cache->ref_frame[1]); + } + + // if we are here, then the current mode is compound. + MV_REFERENCE_FRAME cached_ref_type = av1_ref_frame_type(mi_cache->ref_frame); + return ref_frame == cached_ref_type; +} + +// Please add/modify parameter setting in this function, making it consistent +// and easy to read and maintain. 
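+// Gathers the per-block context needed before the mode search loop: reference
+// frame costs, MV candidate lists for the allowed single and compound
+// references, OBMC neighbour predictions (when enabled), and the mode /
+// reference skip mask.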
+static AOM_INLINE void set_params_rd_pick_inter_mode( + const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, + BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask, + unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES], + struct buf_2d (*yv12_mb)[MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; + unsigned char segment_id = mbmi->segment_id; + + init_neighbor_pred_buf(&x->obmc_buffer, args, is_cur_buf_hbd(&x->e_mbd)); + av1_collect_neighbors_ref_counts(xd); + estimate_ref_frame_costs(cm, xd, &x->mode_costs, segment_id, ref_costs_single, + ref_costs_comp); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + x->best_pred_mv_sad[0] = INT_MAX; + x->best_pred_mv_sad[1] = INT_MAX; + + for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; + ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + // Skip the ref frame if the mask says skip and the ref is not used by + // compound ref. + if (skip_ref_frame_mask & (1 << ref_frame) && + !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) && + !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { + continue; + } + assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL); + setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb); + } + if (cpi->sf.inter_sf.alt_ref_search_fp || + cpi->sf.inter_sf.prune_single_ref || + cpi->sf.rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) { + // Store the best pred_mv_sad across all past frames + if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] < + 0) + x->best_pred_mv_sad[0] = + AOMMIN(x->best_pred_mv_sad[0], x->pred_mv_sad[ref_frame]); + else + // Store the best pred_mv_sad across all future frames + x->best_pred_mv_sad[1] = + AOMMIN(x->best_pred_mv_sad[1], x->pred_mv_sad[ref_frame]); + } + } + + if (!cpi->sf.rt_sf.use_real_time_ref_set && is_comp_ref_allowed(bsize)) { + // No second reference on RT ref set, so no need to initialize + for (MV_REFERENCE_FRAME ref_frame = EXTREF_FRAME; + ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { + mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES]; + if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) { + continue; + } + + if (skip_ref_frame_mask & (1 << ref_frame) && + !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { + continue; + } + // Ref mv list population is not required, when compound references are + // pruned. + if (prune_ref_frame(cpi, x, ref_frame)) continue; + + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. 
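+      // Copy the gathered candidate MVs and weights into the compact
+      // per-block mbmi_ext arrays used by the rest of the mode search.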
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + } + } + + av1_count_overlappable_neighbors(cm, xd); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int use_actual_frame_probs = 1; + int prune_obmc; +#if CONFIG_FPMT_TEST + use_actual_frame_probs = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1; + if (!use_actual_frame_probs) { + prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + } +#endif + if (use_actual_frame_probs) { + prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + } + if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) { + if (check_num_overlappable_neighbors(mbmi) && + is_motion_variation_allowed_bsize(bsize)) { + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + av1_build_prediction_by_above_preds(cm, xd, args->above_pred_buf, + dst_width1, dst_height1, + args->above_pred_stride); + av1_build_prediction_by_left_preds(cm, xd, args->left_pred_buf, + dst_width2, dst_height2, + args->left_pred_stride); + const int num_planes = av1_num_planes(cm); + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, + mi_col, 0, num_planes); + calc_target_weighted_pred( + cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0], + args->left_pred_buf[0], args->left_pred_stride[0]); + } + } + + init_mode_skip_mask(mode_skip_mask, cpi, x, bsize); + + // Set params for mode evaluation + set_mode_eval_params(cpi, x, MODE_EVAL); + + x->comp_rd_stats_idx = 0; + + for (int idx = 0; idx < REF_FRAMES; idx++) { + args->best_single_sse_in_refs[idx] = INT32_MAX; + } +} + +static AOM_INLINE void init_single_inter_mode_search_state( + InterModeSearchState *search_state) { + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + SingleInterModeState *state; + + state = &search_state->single_state[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + + state = &search_state->single_state_modelled[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + + search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME; + } + } + } + + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->best_single_rd[ref_frame] = INT64_MAX; + search_state->best_single_mode[ref_frame] = PRED_MODE_INVALID; + } + av1_zero(search_state->single_state_cnt); + av1_zero(search_state->single_state_modelled_cnt); +} + +static AOM_INLINE void init_inter_mode_search_state( + InterModeSearchState *search_state, const AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) { + init_intra_mode_search_state(&search_state->intra_search_state); + av1_invalid_rd_stats(&search_state->best_y_rdcost); + + search_state->best_rd = best_rd_so_far; + search_state->best_skip_rd[0] = INT64_MAX; + search_state->best_skip_rd[1] = INT64_MAX; + + av1_zero(search_state->best_mbmode); + + search_state->best_rate_y = INT_MAX; + + search_state->best_rate_uv = INT_MAX; + + search_state->best_mode_skippable = 0; + + 
search_state->best_skip2 = 0; + + search_state->best_mode_index = THR_INVALID; + + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const unsigned char segment_id = mbmi->segment_id; + + search_state->num_available_refs = 0; + memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs)); + memset(search_state->dist_order_refs, -1, + sizeof(search_state->dist_order_refs)); + + for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i) + search_state->mode_threshold[i] = 0; + const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + for (int i = LAST_NEW_MV_INDEX + 1; i < SINGLE_REF_MODE_END; ++i) + search_state->mode_threshold[i] = + ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >> + RD_THRESH_FAC_FRAC_BITS; + + search_state->best_intra_rd = INT64_MAX; + + search_state->best_pred_sse = UINT_MAX; + + av1_zero(search_state->single_newmv); + av1_zero(search_state->single_newmv_rate); + av1_zero(search_state->single_newmv_valid); + for (int i = SINGLE_INTER_MODE_START; i < SINGLE_INTER_MODE_END; ++i) { + for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; + search_state->simple_rd[i][j][ref_frame] = INT64_MAX; + } + } + } + + for (int i = 0; i < REFERENCE_MODES; ++i) { + search_state->best_pred_rd[i] = INT64_MAX; + } + + if (cpi->common.current_frame.reference_mode != SINGLE_REFERENCE) { + for (int i = SINGLE_REF_MODE_END; i < THR_INTER_MODE_END; ++i) + search_state->mode_threshold[i] = + ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >> + RD_THRESH_FAC_FRAC_BITS; + + for (int i = COMP_INTER_MODE_START; i < COMP_INTER_MODE_END; ++i) { + for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; + search_state->simple_rd[i][j][ref_frame] = INT64_MAX; + } + } + } + + init_single_inter_mode_search_state(search_state); + } +} + +static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask, + const MV_REFERENCE_FRAME *ref_frame, + const PREDICTION_MODE this_mode) { + if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) { + return true; + } + + return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1]; +} + +static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x, + BLOCK_SIZE bsize, + PREDICTION_MODE curr_mode, + const MV_REFERENCE_FRAME *ref_frames) { + const int comp_pred = ref_frames[1] > INTRA_FRAME; + if (comp_pred) { + if (!is_comp_ref_allowed(bsize)) return 1; + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frames[1]])) { + return 1; + } + + const AV1_COMMON *const cm = &cpi->common; + if (frame_is_intra_only(cm)) return 1; + + const CurrentFrame *const current_frame = &cm->current_frame; + if (current_frame->reference_mode == SINGLE_REFERENCE) return 1; + + const struct segmentation *const seg = &cm->seg; + const unsigned char segment_id = x->e_mbd.mi[0]->segment_id; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. 
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+  }
+
+  if (ref_frames[0] > INTRA_FRAME && ref_frames[1] == INTRA_FRAME) {
+    // The mode and block size must allow inter-intra prediction.
+    if (!is_interintra_allowed_bsize(bsize)) return 1;
+    if (!is_interintra_allowed_mode(curr_mode)) return 1;
+  }
+
+  return 0;
+}
+
+static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
+                                        BLOCK_SIZE bsize, int mib_size) {
+  const int sb_size_mask = mib_size - 1;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_w = mi_size_wide[bsize];
+  const int mi_h = mi_size_high[bsize];
+  int picked_ref_frames_mask = 0;
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) {
+      picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j];
+    }
+  }
+  return picked_ref_frames_mask;
+}
+
+// Check if the reference frame pair of the current block matches that of the
+// given block.
+static INLINE int match_ref_frame_pair(const MB_MODE_INFO *mbmi,
+                                       const MV_REFERENCE_FRAME *ref_frames) {
+  return ((ref_frames[0] == mbmi->ref_frame[0]) &&
+          (ref_frames[1] == mbmi->ref_frame[1]));
+}
+
+// Case 1: return 0, means don't skip this mode
+// Case 2: return 1, means skip this mode completely
+// Case 3: return 2, means skip the motion mode search for this mode, but
+//         still evaluate its simple translation version
+static int inter_mode_search_order_independent_skip(
+    const AV1_COMP *cpi, const MACROBLOCK *x, mode_skip_mask_t *mode_skip_mask,
+    InterModeSearchState *search_state, int skip_ref_frame_mask,
+    PREDICTION_MODE mode, const MV_REFERENCE_FRAME *ref_frame) {
+  if (mask_says_skip(mode_skip_mask, ref_frame, mode)) {
+    return 1;
+  }
+
+  const int ref_type = av1_ref_frame_type(ref_frame);
+  if (!cpi->sf.rt_sf.use_real_time_ref_set)
+    if (prune_ref_frame(cpi, x, ref_type)) return 1;
+
+  // This is only used in the motion vector unit test.
+  if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test &&
+      ref_frame[0] == INTRA_FRAME)
+    return 1;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) {
+    return 1;
+  }
+
+  // Reuse the prediction mode from the cache.
+  if (x->use_mb_mode_cache) {
+    const MB_MODE_INFO *cached_mi = x->mb_mode_cache;
+    const PREDICTION_MODE cached_mode = cached_mi->mode;
+    const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame;
+    const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME;
+
+    // If the cached mode is intra, then we just need to match the mode.
+    if (is_mode_intra(cached_mode) && mode != cached_mode) {
+      return 1;
+    }
+
+    // If the cached mode is a single inter mode, then both the mode and the
+    // reference frame must match.
+    if (cached_mode_is_single) {
+      if (mode != cached_mode || ref_frame[0] != cached_frame[0]) {
+        return 1;
+      }
+    } else {
+      // If the cached mode is compound, then we need to consider several
+      // cases.
+      const int mode_is_single = ref_frame[1] <= INTRA_FRAME;
+      if (mode_is_single) {
+        // If the mode is single, we know the modes can't match. But we might
+        // still want to search it if the cached compound mode depends on the
+        // current mode's motion.
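+        // Example: with NEW_NEARMV over { LAST_FRAME, ALTREF_FRAME } in the
+        // cache, single modes on LAST_FRAME return 2 (only their motion mode
+        // search is skipped), while single modes on any other reference are
+        // dropped entirely (return 1).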
+        int skip_motion_mode_only = 0;
+        if (cached_mode == NEW_NEARMV || cached_mode == NEW_NEARESTMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[0]);
+        } else if (cached_mode == NEAR_NEWMV || cached_mode == NEAREST_NEWMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[1]);
+        } else if (cached_mode == NEW_NEWMV) {
+          skip_motion_mode_only = (ref_frame[0] == cached_frame[0] ||
+                                   ref_frame[0] == cached_frame[1]);
+        }
+
+        return 1 + skip_motion_mode_only;
+      } else {
+        // If both modes are compound, then everything must match.
+        if (mode != cached_mode || ref_frame[0] != cached_frame[0] ||
+            ref_frame[1] != cached_frame[1]) {
+          return 1;
+        }
+      }
+    }
+  }
+
+  const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  // If no valid mode has been found so far in PARTITION_NONE when finding a
+  // valid partition is required, do not skip this mode.
+  if (search_state->best_rd == INT64_MAX &&
+      mbmi->partition == PARTITION_NONE && x->must_find_valid_partition)
+    return 0;
+
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  // Prune NEARMV and NEAR_NEARMV based on the q index and the neighbors'
+  // reference frames.
+  if (sf->inter_sf.prune_nearmv_using_neighbors &&
+      (mode == NEAR_NEARMV || mode == NEARMV)) {
+    const MACROBLOCKD *const xd = &x->e_mbd;
+    if (search_state->best_rd != INT64_MAX && xd->left_available &&
+        xd->up_available) {
+      const int thresholds[PRUNE_NEARMV_MAX][3] = { { 1, 0, 0 },
+                                                    { 1, 1, 0 },
+                                                    { 2, 1, 0 } };
+      const int qindex_sub_range = x->qindex * 3 / QINDEX_RANGE;
+
+      assert(sf->inter_sf.prune_nearmv_using_neighbors <= PRUNE_NEARMV_MAX &&
+             qindex_sub_range < 3);
+      const int num_ref_frame_pair_match_thresh =
+          thresholds[sf->inter_sf.prune_nearmv_using_neighbors - 1]
+                    [qindex_sub_range];
+
+      assert(num_ref_frame_pair_match_thresh <= 2 &&
+             num_ref_frame_pair_match_thresh >= 0);
+      int num_ref_frame_pair_match = 0;
+
+      num_ref_frame_pair_match =
+          match_ref_frame_pair(xd->left_mbmi, ref_frame);
+      num_ref_frame_pair_match +=
+          match_ref_frame_pair(xd->above_mbmi, ref_frame);
+
+      // Pruning based on ref frame pair match with neighbors.
+      if (num_ref_frame_pair_match < num_ref_frame_pair_match_thresh) return 1;
+    }
+  }
+
+  int skip_motion_mode = 0;
+  if (mbmi->partition != PARTITION_NONE) {
+    int skip_ref = skip_ref_frame_mask & (1 << ref_type);
+    if (ref_type <= ALTREF_FRAME && skip_ref) {
+      // Since the compound ref modes depend on the motion estimation results
+      // of two single ref modes (the best MVs of the single ref modes are the
+      // start points), if the current single ref mode is marked skip, check
+      // whether it is still needed by a compound ref mode.
+      if (is_ref_frame_used_by_compound_ref(ref_type, skip_ref_frame_mask)) {
+        // A non-skipped compound ref mode uses the current single ref, so it
+        // cannot be skipped completely. Just skip its motion mode search and
+        // still try its simple translation mode.
+        skip_motion_mode = 1;
+        skip_ref = 0;
+      }
+    }
+    // If we are reusing the prediction from the cache and the current
+    // reference is required by the cache, then we cannot prune it.
+    if (is_ref_frame_used_in_cache(ref_type, x->mb_mode_cache)) {
+      skip_ref = 0;
+      // If the cache only needs the current reference type for compound
+      // prediction, then we can skip the motion mode search.
+ skip_motion_mode = (ref_type <= ALTREF_FRAME && + x->mb_mode_cache->ref_frame[1] > INTRA_FRAME); + } + if (skip_ref) return 1; + } + + if (ref_frame[0] == INTRA_FRAME) { + if (mode != DC_PRED) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + const unsigned int skip_intra_var_thresh = 64; + if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + x->source_variance < skip_intra_var_thresh) + return 1; + } + } + + if (skip_motion_mode) return 2; + + return 0; +} + +static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode, + const MV_REFERENCE_FRAME *ref_frames, + const AV1_COMMON *cm) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + mbmi->ref_mv_idx = 0; + mbmi->mode = curr_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frames[0]; + mbmi->ref_frame[1] = ref_frames[1]; + pmi->palette_size[0] = 0; + pmi->palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + set_default_interp_filters(mbmi, cm->features.interp_filter); +} + +static AOM_INLINE void collect_single_states(MACROBLOCK *x, + InterModeSearchState *search_state, + const MB_MODE_INFO *const mbmi) { + int i, j; + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1; + const int mode_offset = INTER_OFFSET(this_mode); + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + + // Simple rd + int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + const int64_t rd = + search_state->simple_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < simple_rd) simple_rd = rd; + } + + // Insertion sort of single_state + const SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 }; + SingleInterModeState *state_s = search_state->single_state[dir][mode_offset]; + i = search_state->single_state_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j) + state_s[j] = state_s[j - 1]; + state_s[j] = this_state_s; + search_state->single_state_cnt[dir][mode_offset]++; + + // Modelled rd + int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + const int64_t rd = + search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < modelled_rd) modelled_rd = rd; + } + + // Insertion sort of single_state_modelled + const SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 }; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode_offset]; + i = search_state->single_state_modelled_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j) + state_m[j] = state_m[j - 1]; + state_m[j] = this_state_m; + search_state->single_state_modelled_cnt[dir][mode_offset]++; +} + +static AOM_INLINE void analyze_single_states( + const AV1_COMP *cpi, InterModeSearchState *search_state) { + const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result; + assert(prune_level >= 1); + int i, j, dir, mode; + + for (dir = 0; dir < 2; ++dir) { + int64_t best_rd; + SingleInterModeState(*state)[FWD_REFS]; 
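+    // With the (rd >> 3) scaling used below, a candidate is invalidated once
+    // its rd exceeds roughly 8/6 of the best NEWMV/GLOBALMV rd at prune
+    // level >= 2, or 8/5 of it otherwise.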
+ const int prune_factor = prune_level >= 2 ? 6 : 5; + + // Use the best rd of GLOBALMV or NEWMV to prune the unlikely + // reference frames for all the modes (NEARESTMV and NEARMV may not + // have same motion vectors). Always keep the best of each mode + // because it might form the best possible combination with other mode. + state = search_state->single_state[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 3) * prune_factor > best_rd) { + state[mode][i].valid = 0; + } + } + } + + state = search_state->single_state_modelled[dir]; + best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd, + state[INTER_OFFSET(GLOBALMV)][0].rd); + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; ++i) { + if (state[mode][i].rd != INT64_MAX && + (state[mode][i].rd >> 3) * prune_factor > best_rd) { + state[mode][i].valid = 0; + } + } + } + } + + // Ordering by simple rd first, then by modelled rd + for (dir = 0; dir < 2; ++dir) { + for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + const int state_cnt_s = search_state->single_state_cnt[dir][mode]; + const int state_cnt_m = + search_state->single_state_modelled_cnt[dir][mode]; + SingleInterModeState *state_s = search_state->single_state[dir][mode]; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode]; + int count = 0; + const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m); + for (i = 0; i < state_cnt_s; ++i) { + if (state_s[i].rd == INT64_MAX) break; + if (state_s[i].valid) { + search_state->single_rd_order[dir][mode][count++] = + state_s[i].ref_frame; + } + } + if (count >= max_candidates) continue; + + for (i = 0; i < state_cnt_m && count < max_candidates; ++i) { + if (state_m[i].rd == INT64_MAX) break; + if (!state_m[i].valid) continue; + const int ref_frame = state_m[i].ref_frame; + int match = 0; + // Check if existing already + for (j = 0; j < count; ++j) { + if (search_state->single_rd_order[dir][mode][j] == ref_frame) { + match = 1; + break; + } + } + if (match) continue; + // Check if this ref_frame is removed in simple rd + int valid = 1; + for (j = 0; j < state_cnt_s; ++j) { + if (ref_frame == state_s[j].ref_frame) { + valid = state_s[j].valid; + break; + } + } + if (valid) { + search_state->single_rd_order[dir][mode][count++] = ref_frame; + } + } + } + } +} + +static int compound_skip_get_candidates( + const AV1_COMP *cpi, const InterModeSearchState *search_state, + const int dir, const PREDICTION_MODE mode) { + const int mode_offset = INTER_OFFSET(mode); + const SingleInterModeState *state = + search_state->single_state[dir][mode_offset]; + const SingleInterModeState *state_modelled = + search_state->single_state_modelled[dir][mode_offset]; + + int max_candidates = 0; + for (int i = 0; i < FWD_REFS; ++i) { + if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break; + max_candidates++; + } + + int candidates = max_candidates; + if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 2) { + candidates = AOMMIN(2, max_candidates); + } + if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 3) { + if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX && + state[0].ref_frame == state_modelled[0].ref_frame) + candidates = 1; + if (mode == NEARMV || mode == GLOBALMV) 
candidates = 1; + } + + if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 4) { + // Limit the number of candidates to 1 in each direction for compound + // prediction + candidates = AOMMIN(1, candidates); + } + return candidates; +} + +static int compound_skip_by_single_states( + const AV1_COMP *cpi, const InterModeSearchState *search_state, + const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame, + const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) { + const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame }; + const int mode[2] = { compound_ref0_mode(this_mode), + compound_ref1_mode(this_mode) }; + const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) }; + const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1, + refs[1] <= GOLDEN_FRAME ? 0 : 1 }; + int ref_searched[2] = { 0, 0 }; + int ref_mv_match[2] = { 1, 1 }; + int i, j; + + for (i = 0; i < 2; ++i) { + const SingleInterModeState *state = + search_state->single_state[mode_dir[i]][mode_offset[i]]; + const int state_cnt = + search_state->single_state_cnt[mode_dir[i]][mode_offset[i]]; + for (j = 0; j < state_cnt; ++j) { + if (state[j].ref_frame == refs[i]) { + ref_searched[i] = 1; + break; + } + } + } + + const int ref_set = get_drl_refmv_count(x, refs, this_mode); + for (i = 0; i < 2; ++i) { + if (!ref_searched[i] || (mode[i] != NEARESTMV && mode[i] != NEARMV)) { + continue; + } + const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME }; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) { + int_mv single_mv; + int_mv comp_mv; + get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs, + &x->mbmi_ext); + get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, &x->mbmi_ext); + if (single_mv.as_int != comp_mv.as_int) { + ref_mv_match[i] = 0; + break; + } + } + } + + for (i = 0; i < 2; ++i) { + if (!ref_searched[i] || !ref_mv_match[i]) continue; + const int candidates = + compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]); + const MV_REFERENCE_FRAME *ref_order = + search_state->single_rd_order[mode_dir[i]][mode_offset[i]]; + int match = 0; + for (j = 0; j < candidates; ++j) { + if (refs[i] == ref_order[j]) { + match = 1; + break; + } + } + if (!match) return 1; + } + + return 0; +} + +// Check if ref frames of current block matches with given block. +static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi, + const MV_REFERENCE_FRAME *ref_frames, + int *const is_ref_match) { + if (is_inter_block(mbmi)) { + is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[0]; + is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[0]; + if (has_second_ref(mbmi)) { + is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[1]; + is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[1]; + } + } +} + +// Prune compound mode using ref frames of neighbor blocks. +static INLINE int compound_skip_using_neighbor_refs( + MACROBLOCKD *const xd, const PREDICTION_MODE this_mode, + const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) { + // Exclude non-extended compound modes from pruning + if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || + this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV) + return 0; + + if (prune_ext_comp_using_neighbors >= 3) return 1; + + int is_ref_match[2] = { 0 }; // 0 - match for forward refs + // 1 - match for backward refs + // Check if ref frames of this block matches with left neighbor. 
+ if (xd->left_available) + match_ref_frame(xd->left_mbmi, ref_frames, is_ref_match); + + // Check if ref frames of this block matches with above neighbor. + if (xd->up_available) + match_ref_frame(xd->above_mbmi, ref_frames, is_ref_match); + + // Combine ref frame match with neighbors in forward and backward refs. + const int track_ref_match = is_ref_match[0] + is_ref_match[1]; + + // Pruning based on ref frame match with neighbors. + if (track_ref_match >= prune_ext_comp_using_neighbors) return 0; + return 1; +} + +// Update best single mode for the given reference frame based on simple rd. +static INLINE void update_best_single_mode(InterModeSearchState *search_state, + const PREDICTION_MODE this_mode, + const MV_REFERENCE_FRAME ref_frame, + int64_t this_rd) { + if (this_rd < search_state->best_single_rd[ref_frame]) { + search_state->best_single_rd[ref_frame] = this_rd; + search_state->best_single_mode[ref_frame] = this_mode; + } +} + +// Prune compound mode using best single mode for the same reference. +static INLINE int skip_compound_using_best_single_mode_ref( + const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames, + const PREDICTION_MODE *best_single_mode, + int prune_comp_using_best_single_mode_ref) { + // Exclude non-extended compound modes from pruning + if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || + this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV) + return 0; + + assert(this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV); + const PREDICTION_MODE comp_mode_ref0 = compound_ref0_mode(this_mode); + // Get ref frame direction corresponding to NEWMV + // 0 - NEWMV corresponding to forward direction + // 1 - NEWMV corresponding to backward direction + const int newmv_dir = comp_mode_ref0 != NEWMV; + + // Avoid pruning the compound mode when ref frame corresponding to NEWMV + // have NEWMV as single mode winner. 
+ // Example: For an extended-compound mode, + // {mode, {fwd_frame, bwd_frame}} = {NEAR_NEWMV, {LAST_FRAME, ALTREF_FRAME}} + // - Ref frame corresponding to NEWMV is ALTREF_FRAME + // - Avoid pruning this mode, if best single mode corresponding to ref frame + // ALTREF_FRAME is NEWMV + const PREDICTION_MODE single_mode = best_single_mode[ref_frames[newmv_dir]]; + if (single_mode == NEWMV) return 0; + + // Avoid pruning the compound mode when best single mode is not available + if (prune_comp_using_best_single_mode_ref == 1) + if (single_mode == MB_MODE_COUNT) return 0; + return 1; +} + +static int compare_int64(const void *a, const void *b) { + int64_t a64 = *((int64_t *)a); + int64_t b64 = *((int64_t *)b); + if (a64 < b64) { + return -1; + } else if (a64 == b64) { + return 0; + } else { + return 1; + } +} + +static INLINE void update_search_state( + InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst, + PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats, + const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv, + THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int skip_txfm = + mbmi->skip_txfm && !is_mode_intra(av1_mode_defs[new_best_mode].mode); + const TxfmSearchInfo *txfm_info = &x->txfm_search_info; + + search_state->best_rd = new_best_rd_stats->rdcost; + search_state->best_mode_index = new_best_mode; + *best_rd_stats_dst = *new_best_rd_stats; + search_state->best_mbmode = *mbmi; + search_state->best_skip2 = skip_txfm; + search_state->best_mode_skippable = new_best_rd_stats->skip_txfm; + // When !txfm_search_done, new_best_rd_stats won't provide correct rate_y and + // rate_uv because av1_txfm_search process is replaced by rd estimation. + // Therefore, we should avoid updating best_rate_y and best_rate_uv here. + // These two values will be updated when av1_txfm_search is called. + if (txfm_search_done) { + search_state->best_rate_y = + new_best_rd_stats_y->rate + + x->mode_costs.skip_txfm_cost[skip_ctx] + [new_best_rd_stats->skip_txfm || skip_txfm]; + search_state->best_rate_uv = new_best_rd_stats_uv->rate; + } + search_state->best_y_rdcost = *new_best_rd_stats_y; + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); +} + +// Find the best RD for a reference frame (among single reference modes) +// and store +10% of it in the 0-th element in ref_frame_rd. +static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) { + assert(ref_frame_rd[0] == INT64_MAX); + int64_t ref_copy[REF_FRAMES - 1]; + memcpy(ref_copy, ref_frame_rd + 1, + sizeof(ref_frame_rd[0]) * (REF_FRAMES - 1)); + qsort(ref_copy, REF_FRAMES - 1, sizeof(int64_t), compare_int64); + + int64_t cutoff = ref_copy[0]; + // The cut-off is within 10% of the best. + if (cutoff != INT64_MAX) { + assert(cutoff < INT64_MAX / 200); + cutoff = (110 * cutoff) / 100; + } + ref_frame_rd[0] = cutoff; +} + +// Check if either frame is within the cutoff. 
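+// Either reference qualifying keeps the compound pair in the search; the
+// cutoff in element 0 is the best single-reference RD plus 10% (set by
+// find_top_ref above).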
+static INLINE bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES], + MV_REFERENCE_FRAME frame1, + MV_REFERENCE_FRAME frame2) { + assert(frame2 > 0); + return ref_frame_rd[frame1] <= ref_frame_rd[0] || + ref_frame_rd[frame2] <= ref_frame_rd[0]; +} + +static AOM_INLINE void evaluate_motion_mode_for_winner_candidates( + const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost, + HandleInterModeArgs *const args, TileDataEnc *const tile_data, + PICK_MODE_CONTEXT *const ctx, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], + const motion_mode_best_st_candidate *const best_motion_mode_cands, + int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd, + InterModeSearchState *const search_state, int64_t *yrd) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + InterModesInfo *const inter_modes_info = x->inter_modes_info; + const int num_best_cand = best_motion_mode_cands->num_motion_mode_cand; + + for (int cand = 0; cand < num_best_cand; cand++) { + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + av1_init_rd_stats(&rd_stats); + av1_init_rd_stats(&rd_stats_y); + av1_init_rd_stats(&rd_stats_uv); + int rate_mv; + + rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv; + args->skip_motion_mode = + best_motion_mode_cands->motion_mode_cand[cand].skip_motion_mode; + *mbmi = best_motion_mode_cands->motion_mode_cand[cand].mbmi; + rd_stats.rate = + best_motion_mode_cands->motion_mode_cand[cand].rate2_nocoeff; + + // Continue if the best candidate is compound. + if (!is_inter_singleref_mode(mbmi->mode)) continue; + + x->txfm_search_info.skip_txfm = 0; + struct macroblockd_plane *pd = xd->plane; + const BUFFER_SET orig_dst = { + { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, + { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, + }; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + // Initialize motion mode to simple translation + // Calculation of switchable rate depends on it. 
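+    // (0 is SIMPLE_TRANSLATION in the MOTION_MODE enum.)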
+ mbmi->motion_mode = 0; + const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + int64_t skip_rd[2] = { search_state->best_skip_rd[0], + search_state->best_skip_rd[1] }; + int64_t this_yrd = INT64_MAX; + int64_t ret_value = motion_mode_rd( + cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, args, + search_state->best_rd, skip_rd, &rate_mv, &orig_dst, best_est_rd, + do_tx_search, inter_modes_info, 1, &this_yrd); + + if (ret_value != INT64_MAX) { + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); + const THR_MODES mode_enum = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + // Collect mode stats for multiwinner mode processing + store_winner_mode_stats( + &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, + mode_enum, NULL, bsize, rd_stats.rdcost, + cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search); + if (rd_stats.rdcost < search_state->best_rd) { + *yrd = this_yrd; + update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, do_tx_search); + if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0]; + } + } + } +} + +/*!\cond */ +// Arguments for speed feature pruning of inter mode search +typedef struct { + int *skip_motion_mode; + mode_skip_mask_t *mode_skip_mask; + InterModeSearchState *search_state; + int skip_ref_frame_mask; + int reach_first_comp_mode; + int mode_thresh_mul_fact; + int num_single_modes_processed; + int prune_cpd_using_sr_stats_ready; +} InterModeSFArgs; +/*!\endcond */ + +static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, + int64_t *ref_frame_rd, int midx, + InterModeSFArgs *args, int is_low_temp_var) { + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + // Get the actual prediction mode we are trying in this iteration + const THR_MODES mode_enum = av1_default_mode_order[midx]; + const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; + const PREDICTION_MODE this_mode = mode_def->mode; + const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame; + const MV_REFERENCE_FRAME ref_frame = ref_frames[0]; + const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; + const int comp_pred = second_ref_frame > INTRA_FRAME; + + if (ref_frame == INTRA_FRAME) return 1; + + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + if (sf->inter_sf.skip_arf_compound && update_type == ARF_UPDATE && + comp_pred) { + return 1; + } + + // This is for real time encoding. + if (is_low_temp_var && !comp_pred && ref_frame != LAST_FRAME && + this_mode != NEARESTMV) + return 1; + + // Check if this mode should be skipped because it is incompatible with the + // current frame + if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames)) + return 1; + const int ret = inter_mode_search_order_independent_skip( + cpi, x, args->mode_skip_mask, args->search_state, + args->skip_ref_frame_mask, this_mode, mode_def->ref_frame); + if (ret == 1) return 1; + *(args->skip_motion_mode) = (ret == 2); + + // We've reached the first compound prediction mode, get stats from the + // single reference predictors to help with pruning. 
+ // Disable this pruning logic if interpolation filter search was skipped for + // single prediction modes as it can result in aggressive pruning of compound + // prediction modes due to the absence of modelled_rd populated by + // av1_interpolation_filter_search(). + // TODO(Remya): Check the impact of the sf + // 'prune_comp_search_by_single_result' if compound prediction modes are + // enabled in future for REALTIME encode. + if (!sf->interp_sf.skip_interp_filter_search && + sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred && + args->reach_first_comp_mode == 0) { + analyze_single_states(cpi, args->search_state); + args->reach_first_comp_mode = 1; + } + + // Prune aggressively when best mode is skippable. + int mul_fact = args->search_state->best_mode_skippable + ? args->mode_thresh_mul_fact + : (1 << MODE_THRESH_QBITS); + int64_t mode_threshold = + (args->search_state->mode_threshold[mode_enum] * mul_fact) >> + MODE_THRESH_QBITS; + + if (args->search_state->best_rd < mode_threshold) return 1; + + // Skip this compound mode based on the RD results from the single prediction + // modes + if (!sf->interp_sf.skip_interp_filter_search && + sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) { + if (compound_skip_by_single_states(cpi, args->search_state, this_mode, + ref_frame, second_ref_frame, x)) + return 1; + } + + if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) { + // After we done with single reference modes, find the 2nd best RD + // for a reference frame. Only search compound modes that have a reference + // frame at least as good as the 2nd best. + if (!args->prune_cpd_using_sr_stats_ready && + args->num_single_modes_processed == NUM_SINGLE_REF_MODES) { + find_top_ref(ref_frame_rd); + args->prune_cpd_using_sr_stats_ready = 1; + } + if (args->prune_cpd_using_sr_stats_ready && + !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame)) + return 1; + } + + // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes + if (sf->inter_sf.skip_ext_comp_nearmv_mode && + (this_mode == NEW_NEARMV || this_mode == NEAR_NEWMV)) { + return 1; + } + + if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) { + if (compound_skip_using_neighbor_refs( + xd, this_mode, ref_frames, + sf->inter_sf.prune_ext_comp_using_neighbors)) + return 1; + } + + if (sf->inter_sf.prune_comp_using_best_single_mode_ref && comp_pred) { + if (skip_compound_using_best_single_mode_ref( + this_mode, ref_frames, args->search_state->best_single_mode, + sf->inter_sf.prune_comp_using_best_single_mode_ref)) + return 1; + } + + if (sf->inter_sf.prune_nearest_near_mv_using_refmv_weight && !comp_pred) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frames); + if (skip_nearest_near_mv_using_refmv_weight( + x, this_mode, ref_frame_type, + args->search_state->best_mbmode.mode)) { + // Ensure the mode is pruned only when the current block has obtained a + // valid inter mode. 
+ assert(is_inter_mode(args->search_state->best_mbmode.mode)); + return 1; + } + } + + if (sf->rt_sf.prune_inter_modes_with_golden_ref && + ref_frame == GOLDEN_FRAME && !comp_pred) { + const int subgop_size = AOMMIN(cpi->ppi->gf_group.size, FIXED_GF_INTERVAL); + if (cpi->rc.frames_since_golden > (subgop_size >> 2) && + args->search_state->best_mbmode.ref_frame[0] != GOLDEN_FRAME) { + if ((bsize > BLOCK_16X16 && this_mode == NEWMV) || this_mode == NEARMV) + return 1; + } + } + + return 0; +} + +static void record_best_compound(REFERENCE_MODE reference_mode, + RD_STATS *rd_stats, int comp_pred, int rdmult, + InterModeSearchState *search_state, + int compmode_cost) { + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (reference_mode == REFERENCE_MODE_SELECT) { + single_rate = rd_stats->rate - compmode_cost; + hybrid_rate = rd_stats->rate; + } else { + single_rate = rd_stats->rate; + hybrid_rate = rd_stats->rate + compmode_cost; + } + + single_rd = RDCOST(rdmult, single_rate, rd_stats->dist); + hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist); + + if (!comp_pred) { + if (single_rd < search_state->best_pred_rd[SINGLE_REFERENCE]) + search_state->best_pred_rd[SINGLE_REFERENCE] = single_rd; + } else { + if (single_rd < search_state->best_pred_rd[COMPOUND_REFERENCE]) + search_state->best_pred_rd[COMPOUND_REFERENCE] = single_rd; + } + if (hybrid_rd < search_state->best_pred_rd[REFERENCE_MODE_SELECT]) + search_state->best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; +} + +// Does a transform search over a list of the best inter mode candidates. +// This is called if the original mode search computed an RD estimate +// for the transform search rather than doing a full search. +static void tx_search_best_inter_candidates( + AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, + int64_t best_rd_so_far, BLOCK_SIZE bsize, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int mi_row, int mi_col, + InterModeSearchState *search_state, RD_STATS *rd_cost, + PICK_MODE_CONTEXT *ctx, int64_t *yrd) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + const ModeCosts *mode_costs = &x->mode_costs; + const int num_planes = av1_num_planes(cm); + const int skip_ctx = av1_get_skip_txfm_context(xd); + MB_MODE_INFO *const mbmi = xd->mi[0]; + InterModesInfo *inter_modes_info = x->inter_modes_info; + inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr); + search_state->best_rd = best_rd_so_far; + search_state->best_mode_index = THR_INVALID; + // Initialize best mode stats for winner mode processing + x->winner_mode_count = 0; + store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, + NULL, bsize, best_rd_so_far, + cpi->sf.winner_mode_sf.multi_winner_mode_type, 0); + inter_modes_info->num = + inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search + ? inter_modes_info->num + : cpi->sf.rt_sf.num_inter_modes_for_tx_search; + const int64_t top_est_rd = + inter_modes_info->num > 0 + ? inter_modes_info + ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx] + : INT64_MAX; + *yrd = INT64_MAX; + int64_t best_rd_in_this_partition = INT64_MAX; + int num_inter_mode_cands = inter_modes_info->num; + int newmv_mode_evaled = 0; + int max_allowed_cands = INT_MAX; + if (cpi->sf.inter_sf.limit_inter_mode_cands) { + // The bound on the no. of inter mode candidates, beyond which the + // candidates are limited if a newmv mode got evaluated, is set as + // max_allowed_cands + 1. 
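+    // Entry 0 corresponds to the feature being off (no limit); higher
+    // limit_inter_mode_cands levels allow progressively fewer candidates.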
+ const int num_allowed_cands[5] = { INT_MAX, 10, 9, 6, 2 }; + assert(cpi->sf.inter_sf.limit_inter_mode_cands <= 4); + max_allowed_cands = + num_allowed_cands[cpi->sf.inter_sf.limit_inter_mode_cands]; + } + + int num_mode_thresh = INT_MAX; + if (cpi->sf.inter_sf.limit_txfm_eval_per_mode) { + // Bound the no. of transform searches per prediction mode beyond a + // threshold. + const int num_mode_thresh_ary[4] = { INT_MAX, 4, 3, 0 }; + assert(cpi->sf.inter_sf.limit_txfm_eval_per_mode <= 3); + num_mode_thresh = + num_mode_thresh_ary[cpi->sf.inter_sf.limit_txfm_eval_per_mode]; + } + + int num_tx_cands = 0; + int num_tx_search_modes[INTER_MODE_END - INTER_MODE_START] = { 0 }; + // Iterate over best inter mode candidates and perform tx search + for (int j = 0; j < num_inter_mode_cands; ++j) { + const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; + *mbmi = inter_modes_info->mbmi_arr[data_idx]; + const PREDICTION_MODE prediction_mode = mbmi->mode; + int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; + if (curr_est_rd * 0.80 > top_est_rd) break; + + if (num_tx_cands > num_mode_thresh) { + if ((prediction_mode != NEARESTMV && + num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 1) || + (prediction_mode == NEARESTMV && + num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 2)) + continue; + } + + txfm_info->skip_txfm = 0; + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + bool is_predictor_built = false; + + // Initialize RD stats + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; + int64_t skip_rd = INT64_MAX; + const int txfm_rd_gate_level = get_txfm_rd_gate_level( + cm->seq_params->enable_masked_compound, + cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_DEFAULT, + /*eval_motion_mode=*/0); + if (txfm_rd_gate_level) { + // Check if the mode is good enough based on skip RD + int64_t curr_sse = inter_modes_info->sse_arr[data_idx]; + skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse); + int eval_txfm = check_txfm_eval(x, bsize, search_state->best_skip_rd[0], + skip_rd, txfm_rd_gate_level, 0); + if (!eval_txfm) continue; + } + + // Build the prediction for this mode + if (!is_predictor_built) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + } + if (mbmi->motion_mode == OBMC_CAUSAL) { + av1_build_obmc_inter_predictors_sb(cm, xd); + } + + num_tx_cands++; + if (have_newmv_in_inter_mode(prediction_mode)) newmv_mode_evaled = 1; + num_tx_search_modes[prediction_mode - INTER_MODE_START]++; + int64_t this_yrd = INT64_MAX; + // Do the transform search + if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, + mode_rate, search_state->best_rd)) { + continue; + } else { + const int y_rate = + rd_stats.skip_txfm + ? 
mode_costs->skip_txfm_cost[skip_ctx][1] + : (rd_stats_y.rate + mode_costs->skip_txfm_cost[skip_ctx][0]); + this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y.dist); + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + inter_mode_data_push( + tile_data, mbmi->bsize, rd_stats.sse, rd_stats.dist, + rd_stats_y.rate + rd_stats_uv.rate + + mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]); + } + } + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); + if (rd_stats.rdcost < best_rd_in_this_partition) { + best_rd_in_this_partition = rd_stats.rdcost; + *yrd = this_yrd; + } + + const THR_MODES mode_enum = get_prediction_mode_idx( + prediction_mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum, + NULL, bsize, rd_stats.rdcost, + cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); + + if (rd_stats.rdcost < search_state->best_rd) { + update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, txfm_search_done); + search_state->best_skip_rd[0] = skip_rd; + // Limit the total number of modes to be evaluated if the first is valid + // and transform skip or compound + if (cpi->sf.inter_sf.inter_mode_txfm_breakout) { + if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) { + // Evaluate more candidates at high quantizers where occurrence of + // transform skip is high. + const int max_cands_cap[5] = { 2, 3, 5, 7, 9 }; + const int qindex_band = (5 * x->qindex) >> QINDEX_BITS; + num_inter_mode_cands = + AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num); + } else if (!j && has_second_ref(&search_state->best_mbmode)) { + const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1; + // Evaluate more candidates at low quantizers where occurrence of + // single reference mode is high. + const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 }, + { 10, 7, 5, 3 } }; + const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS; + num_inter_mode_cands = AOMMIN( + max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num); + } + } + } + // If the number of candidates evaluated exceeds max_allowed_cands, break if + // a newmv mode was evaluated already. + if ((num_tx_cands > max_allowed_cands) && newmv_mode_evaled) break; + } +} + +// Indicates number of winner simple translation modes to be used +static const unsigned int num_winner_motion_modes[3] = { 0, 10, 3 }; + +// Adds a motion mode to the candidate list for motion_mode_for_winner_cand +// speed feature. This list consists of modes that have only searched +// SIMPLE_TRANSLATION. The final list will be used to search other motion +// modes after the initial RD search. 
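+// The list is kept sorted by rd_cost via an insertion step and capped at
+// max_winner_motion_mode_cand entries (the caller typically picks this cap
+// from num_winner_motion_modes above).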
+static void handle_winner_cand( + MB_MODE_INFO *const mbmi, + motion_mode_best_st_candidate *best_motion_mode_cands, + int max_winner_motion_mode_cand, int64_t this_rd, + motion_mode_candidate *motion_mode_cand, int skip_motion_mode) { + // Number of current motion mode candidates in list + const int num_motion_mode_cand = best_motion_mode_cands->num_motion_mode_cand; + int valid_motion_mode_cand_loc = num_motion_mode_cand; + + // find the best location to insert new motion mode candidate + for (int j = 0; j < num_motion_mode_cand; j++) { + if (this_rd < best_motion_mode_cands->motion_mode_cand[j].rd_cost) { + valid_motion_mode_cand_loc = j; + break; + } + } + + // Insert motion mode if location is found + if (valid_motion_mode_cand_loc < max_winner_motion_mode_cand) { + if (num_motion_mode_cand > 0 && + valid_motion_mode_cand_loc < max_winner_motion_mode_cand - 1) + memmove( + &best_motion_mode_cands + ->motion_mode_cand[valid_motion_mode_cand_loc + 1], + &best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc], + (AOMMIN(num_motion_mode_cand, max_winner_motion_mode_cand - 1) - + valid_motion_mode_cand_loc) * + sizeof(best_motion_mode_cands->motion_mode_cand[0])); + motion_mode_cand->mbmi = *mbmi; + motion_mode_cand->rd_cost = this_rd; + motion_mode_cand->skip_motion_mode = skip_motion_mode; + best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc] = + *motion_mode_cand; + best_motion_mode_cands->num_motion_mode_cand = + AOMMIN(max_winner_motion_mode_cand, + best_motion_mode_cands->num_motion_mode_cand + 1); + } +} + +/*!\brief Search intra modes in interframes + * + * \ingroup intra_mode_search + * + * This function searches for the best intra mode when the current frame is an + * interframe. This function however does *not* handle luma palette mode. + * Palette mode is currently handled by \ref av1_search_palette_mode. + * + * This function will first iterate through the luma mode candidates to find the + * best luma intra mode. Once the best luma mode it's found, it will then search + * for the best chroma mode. Because palette mode is currently not handled by + * here, a cache of uv mode is stored in + * InterModeSearchState::intra_search_state so it can be reused later by \ref + * av1_search_palette_mode. + * + * \param[in,out] search_state Struct keep track of the prediction mode + * search state in interframe. + * + * \param[in] cpi Top-level encoder structure. + * \param[in,out] x Pointer to struct holding all the data for + * the current prediction block. + * \param[out] rd_cost Stores the best rd_cost among all the + * prediction modes searched. + * \param[in] bsize Current block size. + * \param[in,out] ctx Structure to hold the number of 4x4 blks to + * copy the tx_type and txfm_skip arrays. + * for only the Y plane. + * \param[in] sf_args Stores the list of intra mode candidates + * to be searched. + * \param[in] intra_ref_frame_cost The entropy cost for signaling that the + * current ref frame is an intra frame. + * \param[in] yrd_threshold The rdcost threshold for luma intra mode to + * terminate chroma intra mode search. + * + * \remark If a new best mode is found, search_state and rd_costs are updated + * correspondingly. While x is also modified, it is only used as a temporary + * buffer, and the final decisions are stored in search_state. 
+ */ +static AOM_INLINE void search_intra_modes_in_interframe( + InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + const InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost, + int64_t yrd_threshold) { + const AV1_COMMON *const cm = &cpi->common; + const SPEED_FEATURES *const sf = &cpi->sf; + const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + IntraModeSearchState *intra_search_state = &search_state->intra_search_state; + + int is_best_y_mode_intra = 0; + RD_STATS best_intra_rd_stats_y; + int64_t best_rd_y = INT64_MAX; + int best_mode_cost_y = -1; + MB_MODE_INFO best_mbmi = *xd->mi[0]; + THR_MODES best_mode_enum = THR_INVALID; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int num_4x4 = bsize_to_num_blk(bsize); + + // Performs luma search + int64_t best_model_rd = INT64_MAX; + int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; + for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { + top_intra_model_rd[i] = INT64_MAX; + } + for (int mode_idx = 0; mode_idx < LUMA_MODE_COUNT; ++mode_idx) { + if (sf->intra_sf.skip_intra_in_interframe && + search_state->intra_search_state.skip_intra_modes) + break; + set_y_mode_and_delta_angle( + mode_idx, mbmi, sf->intra_sf.prune_luma_odd_delta_angles_in_intra); + assert(mbmi->mode < INTRA_MODE_END); + + // Use intra_y_mode_mask speed feature to skip intra mode evaluation. + if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode)) + continue; + + const THR_MODES mode_enum = + get_prediction_mode_idx(mbmi->mode, INTRA_FRAME, NONE_FRAME); + if ((!intra_mode_cfg->enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || + mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED) + continue; + if (av1_is_directional_mode(mbmi->mode) && + !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) && + mbmi->angle_delta[PLANE_TYPE_Y] != 0) + continue; + const PREDICTION_MODE this_mode = mbmi->mode; + + assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME); + assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME); + init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm); + x->txfm_search_info.skip_txfm = 0; + + if (this_mode != DC_PRED) { + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) { + if (search_state->best_mode_index != THR_INVALID && + search_state->best_mbmode.ref_frame[0] > INTRA_FRAME) + continue; + } + if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra( + this_mode, search_state->intra_search_state.best_intra_mode)) + continue; + } + } + + RD_STATS intra_rd_stats_y; + int mode_cost_y; + int64_t intra_rd_y = INT64_MAX; + const int is_luma_result_valid = av1_handle_intra_y_mode( + intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx, + &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y, + &best_model_rd, top_intra_model_rd); + if (is_luma_result_valid && intra_rd_y < yrd_threshold) { + is_best_y_mode_intra = 1; + if (intra_rd_y < best_rd_y) { + best_intra_rd_stats_y = 
intra_rd_stats_y; + best_mode_cost_y = mode_cost_y; + best_rd_y = intra_rd_y; + best_mbmi = *mbmi; + best_mode_enum = mode_enum; + memcpy(best_blk_skip, x->txfm_search_info.blk_skip, + sizeof(best_blk_skip[0]) * num_4x4); + av1_copy_array(best_tx_type_map, xd->tx_type_map, num_4x4); + } + } + } + + if (!is_best_y_mode_intra) { + return; + } + + assert(best_rd_y < INT64_MAX); + + // Restores the best luma mode + *mbmi = best_mbmi; + memcpy(x->txfm_search_info.blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * num_4x4); + av1_copy_array(xd->tx_type_map, best_tx_type_map, num_4x4); + + // Performs chroma search + RD_STATS intra_rd_stats, intra_rd_stats_uv; + av1_init_rd_stats(&intra_rd_stats); + av1_init_rd_stats(&intra_rd_stats_uv); + const int num_planes = av1_num_planes(cm); + if (num_planes > 1) { + const int intra_uv_mode_valid = av1_search_intra_uv_modes_in_interframe( + intra_search_state, cpi, x, bsize, &intra_rd_stats, + &best_intra_rd_stats_y, &intra_rd_stats_uv, search_state->best_rd); + + if (!intra_uv_mode_valid) { + return; + } + } + + // Merge the luma and chroma rd stats + assert(best_mode_cost_y >= 0); + intra_rd_stats.rate = best_intra_rd_stats_y.rate + best_mode_cost_y; + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { + // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size + // in the tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + best_intra_rd_stats_y.rate -= tx_size_cost(x, bsize, mbmi->tx_size); + } + + const ModeCosts *mode_costs = &x->mode_costs; + const PREDICTION_MODE mode = mbmi->mode; + if (num_planes > 1 && xd->is_chroma_ref) { + const int uv_mode_cost = + mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode]; + intra_rd_stats.rate += + intra_rd_stats_uv.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost); + } + + // Intra block is always coded as non-skip + intra_rd_stats.skip_txfm = 0; + intra_rd_stats.dist = best_intra_rd_stats_y.dist + intra_rd_stats_uv.dist; + // Add in the cost of the no skip flag. + const int skip_ctx = av1_get_skip_txfm_context(xd); + intra_rd_stats.rate += mode_costs->skip_txfm_cost[skip_ctx][0]; + // Calculate the final RD estimate for this mode. + const int64_t this_rd = + RDCOST(x->rdmult, intra_rd_stats.rate, intra_rd_stats.dist); + // Keep record of best intra rd + if (this_rd < search_state->best_intra_rd) { + search_state->best_intra_rd = this_rd; + intra_search_state->best_intra_mode = mode; + } + + for (int i = 0; i < REFERENCE_MODES; ++i) { + search_state->best_pred_rd[i] = + AOMMIN(search_state->best_pred_rd[i], this_rd); + } + + intra_rd_stats.rdcost = this_rd; + + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, &intra_rd_stats, &best_intra_rd_stats_y, + &intra_rd_stats_uv, best_mode_enum, NULL, bsize, intra_rd_stats.rdcost, + cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done); + if (intra_rd_stats.rdcost < search_state->best_rd) { + update_search_state(search_state, rd_cost, ctx, &intra_rd_stats, + &best_intra_rd_stats_y, &intra_rd_stats_uv, + best_mode_enum, x, txfm_search_done); + } +} + +#if !CONFIG_REALTIME_ONLY +// Prepare inter_cost and intra_cost from TPL stats, which are used as ML +// features in intra mode pruning. 
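For reference, the aggregation performed by calculate_cost_from_tpl_data below boils down to summing the per-unit TPL costs covered by the current block and averaging them. A minimal standalone version of that accumulation follows; the grid layout and names here are hypothetical, while the real code walks the SuperBlockEnc tpl_inter_cost / tpl_intra_cost arrays with the stride and unit size recorded for the superblock.

#include <stdint.h>

// Average the costs of an nh-by-nw window of a row-major cost grid.
// `start` indexes the window's top-left unit and `stride` is the grid width
// in units; callers are expected to ensure nw and nh are at least 1.
static int64_t example_average_tpl_cost(const int64_t *cost_grid, int stride,
                                        int start, int nw, int nh) {
  int64_t sum = 0;
  for (int k = 0; k < nh; k++)
    for (int l = 0; l < nw; l++) sum += cost_grid[start + k * stride + l];
  return sum / (nw * nh);
}

For example, if the TPL unit size is 16x16 and the current block is 32x32, nw and nh are both 2 and the result is simply the mean of the four covered unit costs.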
+static AOM_INLINE void calculate_cost_from_tpl_data( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, int64_t *inter_cost, int64_t *intra_cost) { + const AV1_COMMON *const cm = &cpi->common; + // Only consider full SB. + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d; + const int len = (block_size_wide[sb_size] / tpl_bsize_1d) * + (block_size_high[sb_size] / tpl_bsize_1d); + SuperBlockEnc *sb_enc = &x->sb_enc; + if (sb_enc->tpl_data_count == len) { + const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d); + const int tpl_stride = sb_enc->tpl_stride; + const int tplw = mi_size_wide[tpl_bsize]; + const int tplh = mi_size_high[tpl_bsize]; + const int nw = mi_size_wide[bsize] / tplw; + const int nh = mi_size_high[bsize] / tplh; + if (nw >= 1 && nh >= 1) { + const int of_h = mi_row % mi_size_high[sb_size]; + const int of_w = mi_col % mi_size_wide[sb_size]; + const int start = of_h / tplh * tpl_stride + of_w / tplw; + + for (int k = 0; k < nh; k++) { + for (int l = 0; l < nw; l++) { + *inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l]; + *intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l]; + } + } + *inter_cost /= nw * nh; + *intra_cost /= nw * nh; + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +// When the speed feature skip_intra_in_interframe > 0, enable ML model to prune +// intra mode search. +static AOM_INLINE void skip_intra_modes_in_interframe( + AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize, + InterModeSearchState *search_state, const SPEED_FEATURES *const sf, + int64_t inter_cost, int64_t intra_cost) { + MACROBLOCKD *const xd = &x->e_mbd; + const int comp_pred = search_state->best_mbmode.ref_frame[1] > INTRA_FRAME; + if (sf->rt_sf.prune_intra_mode_based_on_mv_range && + bsize > sf->part_sf.max_intra_bsize && !comp_pred) { + const MV best_mv = search_state->best_mbmode.mv[0].as_mv; + const int mv_thresh = 16 << sf->rt_sf.prune_intra_mode_based_on_mv_range; + if (abs(best_mv.row) < mv_thresh && abs(best_mv.col) < mv_thresh && + x->source_variance > 128) { + search_state->intra_search_state.skip_intra_modes = 1; + return; + } + } + + const unsigned int src_var_thresh_intra_skip = 1; + const int skip_intra_in_interframe = sf->intra_sf.skip_intra_in_interframe; + if (!(skip_intra_in_interframe && + (x->source_variance > src_var_thresh_intra_skip))) + return; + + // Prune intra search based on best inter mode being transfrom skip. + if ((skip_intra_in_interframe >= 2) && search_state->best_mbmode.skip_txfm) { + const int qindex_thresh[2] = { 200, MAXQ }; + const int ind = (skip_intra_in_interframe >= 3) ? 1 : 0; + if (!have_newmv_in_inter_mode(search_state->best_mbmode.mode) && + (x->qindex <= qindex_thresh[ind])) { + search_state->intra_search_state.skip_intra_modes = 1; + return; + } else if ((skip_intra_in_interframe >= 4) && + (inter_cost < 0 || intra_cost < 0)) { + search_state->intra_search_state.skip_intra_modes = 1; + return; + } + } + // Use ML model to prune intra search. + if (inter_cost >= 0 && intra_cost >= 0) { + const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480) + ? 
&av1_intrap_nn_config + : &av1_intrap_hd_nn_config; + float nn_features[6]; + float scores[2] = { 0.0f }; + + nn_features[0] = (float)search_state->best_mbmode.skip_txfm; + nn_features[1] = (float)mi_size_wide_log2[bsize]; + nn_features[2] = (float)mi_size_high_log2[bsize]; + nn_features[3] = (float)intra_cost; + nn_features[4] = (float)inter_cost; + const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd); + nn_features[5] = (float)(ac_q_max / ac_q); + + av1_nn_predict(nn_features, nn_config, 1, scores); + + // For two parameters, the max prob returned from av1_nn_softmax equals + // 1.0 / (1.0 + e^(-|diff_score|)). Here use scores directly to avoid the + // calling of av1_nn_softmax. + const float thresh[5] = { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f }; + assert(skip_intra_in_interframe <= 5); + if (scores[1] > scores[0] + thresh[skip_intra_in_interframe - 1]) { + search_state->intra_search_state.skip_intra_modes = 1; + } + } +} + +static AOM_INLINE bool skip_interp_filter_search(const AV1_COMP *cpi, + int is_single_pred) { + const MODE encoding_mode = cpi->oxcf.mode; + if (encoding_mode == REALTIME) { + return (cpi->common.current_frame.reference_mode == SINGLE_REFERENCE && + (cpi->sf.interp_sf.skip_interp_filter_search || + cpi->sf.winner_mode_sf.winner_mode_ifs)); + } else if (encoding_mode == GOOD) { + // Skip interpolation filter search for single prediction modes. + return (cpi->sf.interp_sf.skip_interp_filter_search && is_single_pred); + } + return false; +} + +static AOM_INLINE int get_block_temp_var(const AV1_COMP *cpi, + const MACROBLOCK *x, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const SPEED_FEATURES *const sf = &cpi->sf; + + if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION || + !sf->rt_sf.short_circuit_low_temp_var || + !sf->rt_sf.prune_inter_modes_using_temp_var) { + return 0; + } + + const int mi_row = x->e_mbd.mi_row; + const int mi_col = x->e_mbd.mi_col; + int is_low_temp_var = 0; + + if (cm->seq_params->sb_size == BLOCK_64X64) + is_low_temp_var = av1_get_force_skip_low_temp_var_small_sb( + &x->part_search_info.variance_low[0], mi_row, mi_col, bsize); + else + is_low_temp_var = av1_get_force_skip_low_temp_var( + &x->part_search_info.variance_low[0], mi_row, mi_col, bsize); + + return is_low_temp_var; +} + +// TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb. 
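One detail in skip_intra_modes_in_interframe above is worth spelling out: for a two-class softmax the winning probability depends only on the difference of the raw scores, which is why the code thresholds scores[1] - scores[0] directly instead of calling av1_nn_softmax. A small self-contained check of that identity, with illustrative values only:

#include <math.h>
#include <stdio.h>

int main(void) {
  const float s0 = 0.3f, s1 = 2.1f;  // example raw network scores
  // Two-class softmax probability of class 1.
  const float p1 = expf(s1) / (expf(s0) + expf(s1));
  // The same value written purely in terms of the score difference.
  const float p1_from_diff = 1.0f / (1.0f + expf(-(s1 - s0)));
  printf("p1 = %f, p1 from score diff = %f\n", p1, p1_from_diff);
  // A gate such as scores[1] > scores[0] + 1.4f is therefore equivalent to
  // requiring p1 > 1 / (1 + e^(-1.4)) ~= 0.80 without evaluating the softmax.
  return 0;
}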
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, + struct macroblock *x, struct RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + AV1_COMMON *const cm = &cpi->common; + const FeatureFlags *const features = &cm->features; + const int num_planes = av1_num_planes(cm); + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + int i; + const ModeCosts *mode_costs = &x->mode_costs; + const int *comp_inter_cost = + mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)]; + + InterModeSearchState search_state; + init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far); + INTERINTRA_MODE interintra_modes[REF_FRAMES] = { + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES + }; + HandleInterModeArgs args = { { NULL }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, + { NULL }, + { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }, + NULL, + NULL, + NULL, + search_state.modelled_rd, + INT_MAX, + INT_MAX, + search_state.simple_rd, + 0, + false, + interintra_modes, + { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } }, + { { 0, 0 } }, + { 0 }, + 0, + 0, + -1, + -1, + -1, + { 0 }, + { 0 }, + UINT_MAX }; + // Currently, is_low_temp_var is used in real time encoding. + const int is_low_temp_var = get_block_temp_var(cpi, x, bsize); + + for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1; + // Indicates the appropriate number of simple translation winner modes for + // exhaustive motion mode evaluation + const int max_winner_motion_mode_cand = + num_winner_motion_modes[sf->winner_mode_sf.motion_mode_for_winner_cand]; + assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES); + motion_mode_candidate motion_mode_cand; + motion_mode_best_st_candidate best_motion_mode_cands; + // Initializing the number of motion mode candidates to zero. + best_motion_mode_cands.num_motion_mode_cand = 0; + for (i = 0; i < MAX_WINNER_MOTION_MODES; ++i) + best_motion_mode_cands.motion_mode_cand[i].rd_cost = INT64_MAX; + + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + + av1_invalid_rd_stats(rd_cost); + + for (i = 0; i < REF_FRAMES; ++i) { + x->warp_sample_info[i].num = -1; + } + + // Ref frames that are selected by square partition blocks. + int picked_ref_frames_mask = 0; + if (sf->inter_sf.prune_ref_frame_for_rect_partitions && + mbmi->partition != PARTITION_NONE) { + // prune_ref_frame_for_rect_partitions = 1 implies prune only extended + // partition blocks. prune_ref_frame_for_rect_partitions >=2 + // implies prune for vert, horiz and extended partition blocks. + if ((mbmi->partition != PARTITION_VERT && + mbmi->partition != PARTITION_HORZ) || + sf->inter_sf.prune_ref_frame_for_rect_partitions >= 2) { + picked_ref_frames_mask = + fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size); + } + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, set_params_rd_pick_inter_mode_time); +#endif + // Skip ref frames that never selected by square blocks. + const int skip_ref_frame_mask = + picked_ref_frames_mask ? 
~picked_ref_frames_mask : 0; + mode_skip_mask_t mode_skip_mask; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; + // init params, set frame modes, speed features + set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask, + skip_ref_frame_mask, ref_costs_single, + ref_costs_comp, yv12_mb); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, set_params_rd_pick_inter_mode_time); +#endif + + int64_t best_est_rd = INT64_MAX; + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + // If do_tx_search is 0, only estimated RD should be computed. + // If do_tx_search is 1, all modes have TX search performed. + const int do_tx_search = + !((sf->inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) || + (sf->inter_sf.inter_mode_rd_model_estimation == 2 && + num_pels_log2_lookup[bsize] > 8)); + InterModesInfo *inter_modes_info = x->inter_modes_info; + inter_modes_info->num = 0; + + // Temporary buffers used by handle_inter_mode(). + uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]); + + // The best RD found for the reference frame, among single reference modes. + // Note that the 0-th element will contain a cut-off that is later used + // to determine if we should skip a compound mode. + int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX, INT64_MAX }; + + // Prepared stats used later to check if we could skip intra mode eval. + int64_t inter_cost = -1; + int64_t intra_cost = -1; + // Need to tweak the threshold for hdres speed 0 & 1. + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Obtain the relevant tpl stats for pruning inter modes + PruneInfoFromTpl inter_cost_info_from_tpl; +#if !CONFIG_REALTIME_ONLY + if (sf->inter_sf.prune_inter_modes_based_on_tpl) { + // x->tpl_keep_ref_frame[id] = 1 => no pruning in + // prune_ref_by_selective_ref_frame() + // x->tpl_keep_ref_frame[id] = 0 => ref frame can be pruned in + // prune_ref_by_selective_ref_frame() + // Populating valid_refs[idx] = 1 ensures that + // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a + // pruned ref frame. + int valid_refs[INTER_REFS_PER_FRAME]; + for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) { + const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME }; + valid_refs[frame - 1] = + x->tpl_keep_ref_frame[frame] || + !prune_ref_by_selective_ref_frame( + cpi, x, refs, cm->cur_frame->ref_display_order_hint); + } + av1_zero(inter_cost_info_from_tpl); + get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs, + &inter_cost_info_from_tpl); + } + + const int do_pruning = + (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1; + if (do_pruning && sf->intra_sf.skip_intra_in_interframe && + cpi->oxcf.algo_cfg.enable_tpl_model) + calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost, + &intra_cost); +#endif // !CONFIG_REALTIME_ONLY + + // Initialize best mode stats for winner mode processing. 
+ const int max_winner_mode_count = + winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type]; + zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats); + x->winner_mode_count = 0; + store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, + NULL, bsize, best_rd_so_far, + sf->winner_mode_sf.multi_winner_mode_type, 0); + + int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS); + if (sf->inter_sf.prune_inter_modes_if_skippable) { + // Higher multiplication factor values for lower quantizers. + mode_thresh_mul_fact = mode_threshold_mul_factor[x->qindex]; + } + + // Initialize arguments for mode loop speed features + InterModeSFArgs sf_args = { &args.skip_motion_mode, + &mode_skip_mask, + &search_state, + skip_ref_frame_mask, + 0, + mode_thresh_mul_fact, + 0, + 0 }; + int64_t best_inter_yrd = INT64_MAX; + + // This is the main loop of this function. It loops over all possible inter + // modes and calls handle_inter_mode() to compute the RD for each. + // Here midx is just an iterator index that should not be used by itself + // except to keep track of the number of modes searched. It should be used + // with av1_default_mode_order to get the enum that defines the mode, which + // can be used with av1_mode_defs to get the prediction mode and the ref + // frames. + // TODO(yunqing, any): Setting mode_start and mode_end outside for-loop brings + // good speedup for real time case. If we decide to use compound mode in real + // time, maybe we can modify av1_default_mode_order table. + THR_MODES mode_start = THR_INTER_MODE_START; + THR_MODES mode_end = THR_INTER_MODE_END; + const CurrentFrame *const current_frame = &cm->current_frame; + if (current_frame->reference_mode == SINGLE_REFERENCE) { + mode_start = SINGLE_REF_MODE_START; + mode_end = SINGLE_REF_MODE_END; + } + + for (THR_MODES midx = mode_start; midx < mode_end; ++midx) { + // Get the actual prediction mode we are trying in this iteration + const THR_MODES mode_enum = av1_default_mode_order[midx]; + const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; + const PREDICTION_MODE this_mode = mode_def->mode; + const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame; + + const MV_REFERENCE_FRAME ref_frame = ref_frames[0]; + const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; + const int is_single_pred = + ref_frame > INTRA_FRAME && second_ref_frame == NONE_FRAME; + const int comp_pred = second_ref_frame > INTRA_FRAME; + + init_mbmi(mbmi, this_mode, ref_frames, cm); + + txfm_info->skip_txfm = 0; + sf_args.num_single_modes_processed += is_single_pred; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, skip_inter_mode_time); +#endif + // Apply speed features to decide if this inter mode can be skipped + const int is_skip_inter_mode = skip_inter_mode( + cpi, x, bsize, ref_frame_rd, midx, &sf_args, is_low_temp_var); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, skip_inter_mode_time); +#endif + if (is_skip_inter_mode) continue; + + // Select prediction reference frames. 
+ for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->ref_mv_idx = 0; + + const int64_t ref_best_rd = search_state.best_rd; + RD_STATS rd_stats, rd_stats_y, rd_stats_uv; + av1_init_rd_stats(&rd_stats); + + const int ref_frame_cost = comp_pred + ? ref_costs_comp[ref_frame][second_ref_frame] + : ref_costs_single[ref_frame]; + const int compmode_cost = + is_comp_ref_allowed(mbmi->bsize) ? comp_inter_cost[comp_pred] : 0; + const int real_compmode_cost = + cm->current_frame.reference_mode == REFERENCE_MODE_SELECT + ? compmode_cost + : 0; + // Point to variables that are maintained between loop iterations + args.single_newmv = search_state.single_newmv; + args.single_newmv_rate = search_state.single_newmv_rate; + args.single_newmv_valid = search_state.single_newmv_valid; + args.single_comp_cost = real_compmode_cost; + args.ref_frame_cost = ref_frame_cost; + args.best_pred_sse = search_state.best_pred_sse; + args.skip_ifs = skip_interp_filter_search(cpi, is_single_pred); + + int64_t skip_rd[2] = { search_state.best_skip_rd[0], + search_state.best_skip_rd[1] }; + int64_t this_yrd = INT64_MAX; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif + int64_t this_rd = handle_inter_mode( + cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &args, + ref_best_rd, tmp_buf, &x->comp_rd_buffer, &best_est_rd, do_tx_search, + inter_modes_info, &motion_mode_cand, skip_rd, &inter_cost_info_from_tpl, + &this_yrd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); +#endif + if (current_frame->reference_mode != SINGLE_REFERENCE) { + if (!args.skip_ifs && + sf->inter_sf.prune_comp_search_by_single_result > 0 && + is_inter_singleref_mode(this_mode)) { + collect_single_states(x, &search_state, mbmi); + } + + if (sf->inter_sf.prune_comp_using_best_single_mode_ref > 0 && + is_inter_singleref_mode(this_mode)) + update_best_single_mode(&search_state, this_mode, ref_frame, this_rd); + } + + if (this_rd == INT64_MAX) continue; + + if (mbmi->skip_txfm) { + rd_stats_y.rate = 0; + rd_stats_uv.rate = 0; + } + + if (sf->inter_sf.prune_compound_using_single_ref && is_single_pred && + this_rd < ref_frame_rd[ref_frame]) { + ref_frame_rd[ref_frame] = this_rd; + } + + // Did this mode help, i.e., is it the new best mode + if (this_rd < search_state.best_rd) { + assert(IMPLIES(comp_pred, + cm->current_frame.reference_mode != SINGLE_REFERENCE)); + search_state.best_pred_sse = x->pred_sse[ref_frame]; + best_inter_yrd = this_yrd; + update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, do_tx_search); + if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0]; + // skip_rd[0] is the best total rd for a skip mode so far. + // skip_rd[1] is the best total rd for a skip mode so far in luma. + // When do_tx_search = 1, both skip_rd[0] and skip_rd[1] are updated. + // When do_tx_search = 0, skip_rd[1] is updated. 
+ search_state.best_skip_rd[1] = skip_rd[1]; + } + if (sf->winner_mode_sf.motion_mode_for_winner_cand) { + // Add this mode to motion mode candidate list for motion mode search + // if using motion_mode_for_winner_cand speed feature + handle_winner_cand(mbmi, &best_motion_mode_cands, + max_winner_motion_mode_cand, this_rd, + &motion_mode_cand, args.skip_motion_mode); + } + + /* keep record of best compound/single-only prediction */ + record_best_compound(cm->current_frame.reference_mode, &rd_stats, comp_pred, + x->rdmult, &search_state, compmode_cost); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, evaluate_motion_mode_for_winner_candidates_time); +#endif + if (sf->winner_mode_sf.motion_mode_for_winner_cand) { + // For the single ref winner candidates, evaluate other motion modes (non + // simple translation). + evaluate_motion_mode_for_winner_candidates( + cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb, + &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd, + &search_state, &best_inter_yrd); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, evaluate_motion_mode_for_winner_candidates_time); +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, do_tx_search_time); +#endif + if (do_tx_search != 1) { + // A full tx search has not yet been done, do tx search for + // top mode candidates + tx_search_best_inter_candidates(cpi, tile_data, x, best_rd_so_far, bsize, + yv12_mb, mi_row, mi_col, &search_state, + rd_cost, ctx, &best_inter_yrd); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, do_tx_search_time); +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_intra_mode_time); +#endif + // Gate intra mode evaluation if best of inter is skip except when source + // variance is extremely low and also based on max intra bsize. + skip_intra_modes_in_interframe(cm, x, bsize, &search_state, sf, inter_cost, + intra_cost); + + const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME]; + search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx, + &sf_args, intra_ref_frame_cost, + best_inter_yrd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_intra_mode_time); +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, refine_winner_mode_tx_time); +#endif + int winner_mode_count = + sf->winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1; + // In effect only when fast tx search speed features are enabled. + refine_winner_mode_tx( + cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index, + &search_state.best_mbmode, yv12_mb, search_state.best_rate_y, + search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, refine_winner_mode_tx_time); +#endif + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + // Only try palette mode when the best mode so far is an intra mode. 
+ const int try_palette = + cpi->oxcf.tool_cfg.enable_palette && + av1_allow_palette(features->allow_screen_content_tools, mbmi->bsize) && + !is_inter_mode(search_state.best_mbmode.mode) && rd_cost->rate != INT_MAX; + RD_STATS this_rd_cost; + int this_skippable = 0; + if (try_palette) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_search_palette_mode_time); +#endif + this_skippable = av1_search_palette_mode( + &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost, + ctx, &this_rd_cost, search_state.best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_search_palette_mode_time); +#endif + if (this_rd_cost.rdcost < search_state.best_rd) { + search_state.best_mode_index = THR_DC; + mbmi->mv[0].as_int = 0; + rd_cost->rate = this_rd_cost.rate; + rd_cost->dist = this_rd_cost.dist; + rd_cost->rdcost = this_rd_cost.rdcost; + search_state.best_rd = rd_cost->rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = 0; + search_state.best_mode_skippable = this_skippable; + memcpy(ctx->blk_skip, txfm_info->blk_skip, + sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + } + } + + search_state.best_mbmode.skip_mode = 0; + if (cm->current_frame.skip_mode_info.skip_mode_flag && + is_comp_ref_allowed(bsize)) { + const struct segmentation *const seg = &cm->seg; + unsigned char segment_id = mbmi->segment_id; + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, yv12_mb); + } + } + + // Make sure that the ref_mv_idx is only nonzero when we're + // using a mode which can support ref_mv_idx + if (search_state.best_mbmode.ref_mv_idx != 0 && + !(search_state.best_mbmode.mode == NEWMV || + search_state.best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) { + search_state.best_mbmode.ref_mv_idx = 0; + } + + if (search_state.best_mode_index == THR_INVALID || + search_state.best_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + const InterpFilter interp_filter = features->interp_filter; + assert((interp_filter == SWITCHABLE) || + (interp_filter == + search_state.best_mbmode.interp_filters.as_filters.y_filter) || + !is_inter_block(&search_state.best_mbmode)); + assert((interp_filter == SWITCHABLE) || + (interp_filter == + search_state.best_mbmode.interp_filters.as_filters.x_filter) || + !is_inter_block(&search_state.best_mbmode)); + + if (!cpi->rc.is_src_frame_alt_ref && sf->inter_sf.adaptive_rd_thresh) { + av1_update_rd_thresh_fact( + cm, x->thresh_freq_fact, sf->inter_sf.adaptive_rd_thresh, bsize, + search_state.best_mode_index, mode_start, mode_end, THR_DC, MAX_MODES); + } + + // macroblock modes + *mbmi = search_state.best_mbmode; + txfm_info->skip_txfm |= search_state.best_skip2; + + // Note: this section is needed since the mode may have been forced to + // GLOBALMV by the all-zero mode handling of ref-mv. 
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) { + // Correct the interp filters for GLOBALMV + if (is_nontrans_global_motion(xd, xd->mi[0])) { + int_interpfilters filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + } + } + + txfm_info->skip_txfm |= search_state.best_mode_skippable; + + assert(search_state.best_mode_index != THR_INVALID); + +#if CONFIG_INTERNAL_STATS + store_coding_context(x, ctx, search_state.best_mode_index, + search_state.best_mode_skippable); +#else + store_coding_context(x, ctx, search_state.best_mode_skippable); +#endif // CONFIG_INTERNAL_STATS + + if (mbmi->palette_mode_info.palette_size[1] > 0) { + assert(try_palette); + av1_restore_uv_color_map(cpi, x); + } +} + +void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, + TileDataEnc *tile_data, MACROBLOCK *x, + int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + const AV1_COMMON *const cm = &cpi->common; + const FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + unsigned char segment_id = mbmi->segment_id; + const int comp_pred = 0; + int i; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + const ModeCosts *mode_costs = &x->mode_costs; + const int *comp_inter_cost = + mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)]; + InterpFilter best_filter = SWITCHABLE; + int64_t this_rd = INT64_MAX; + int rate2 = 0; + const int64_t distortion2 = 0; + (void)mi_row; + (void)mi_col; + (void)tile_data; + + av1_collect_neighbors_ref_counts(xd); + + estimate_ref_frame_costs(cm, xd, mode_costs, segment_id, ref_costs_single, + ref_costs_comp); + + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX; + + rd_cost->rate = INT_MAX; + + assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mode = GLOBALMV; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->uv_mode = UV_DC_PRED; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) + mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + else + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->mv[0].as_int = + gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]], + features->allow_high_precision_mv, bsize, mi_col, + mi_row, features->cur_frame_force_integer_mv) + .as_int; + mbmi->tx_size = max_txsize_lookup[bsize]; + x->txfm_search_info.skip_txfm = 1; + + mbmi->ref_mv_idx = 0; + + mbmi->motion_mode = SIMPLE_TRANSLATION; + av1_count_overlappable_neighbors(cm, xd); + if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref); + // Select the samples according to motion vector difference + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); + } + } + + const InterpFilter interp_filter = features->interp_filter; + set_default_interp_filters(mbmi, interp_filter); + + if (interp_filter != SWITCHABLE) { + best_filter = interp_filter; + } else { 
+ best_filter = EIGHTTAP_REGULAR; + if (av1_is_interp_needed(xd)) { + int rs; + int best_rs = INT_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + mbmi->interp_filters = av1_broadcast_interp_filter(i); + rs = av1_get_switchable_rate(x, xd, interp_filter, + cm->seq_params->enable_dual_filter); + if (rs < best_rs) { + best_rs = rs; + best_filter = mbmi->interp_filters.as_filters.y_filter; + } + } + } + } + // Set the appropriate filter + mbmi->interp_filters = av1_broadcast_interp_filter(best_filter); + rate2 += av1_get_switchable_rate(x, xd, interp_filter, + cm->seq_params->enable_dual_filter); + + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) + rate2 += comp_inter_cost[comp_pred]; + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rate2 += ref_costs_single[LAST_FRAME]; + this_rd = RDCOST(x->rdmult, rate2, distortion2); + + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + + if (this_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + assert((interp_filter == SWITCHABLE) || + (interp_filter == mbmi->interp_filters.as_filters.y_filter)); + + if (cpi->sf.inter_sf.adaptive_rd_thresh) { + av1_update_rd_thresh_fact(cm, x->thresh_freq_fact, + cpi->sf.inter_sf.adaptive_rd_thresh, bsize, + THR_GLOBALMV, THR_INTER_MODE_START, + THR_INTER_MODE_END, THR_DC, MAX_MODES); + } + +#if CONFIG_INTERNAL_STATS + store_coding_context(x, ctx, THR_GLOBALMV, 0); +#else + store_coding_context(x, ctx, 0); +#endif // CONFIG_INTERNAL_STATS +} + +/*!\cond */ +struct calc_target_weighted_pred_ctxt { + const OBMCBuffer *obmc_buffer; + const uint8_t *tmp; + int tmp_stride; + int overlap; +}; +/*!\endcond */ + +static INLINE void calc_target_weighted_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { + (void)nb_mi; + (void)num_planes; + (void)rel_mi_row; + (void)dir; + + struct calc_target_weighted_pred_ctxt *ctxt = + (struct calc_target_weighted_pred_ctxt *)fun_ctxt; + + const int bw = xd->width << MI_SIZE_LOG2; + const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); + + int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_col * MI_SIZE); + int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_col * MI_SIZE); + const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE; + const int is_hbd = is_cur_buf_hbd(xd); + + if (!is_hbd) { + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp += ctxt->tmp_stride; + } + } else { + const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); + + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp16[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp16 += ctxt->tmp_stride; + } + } +} + +static INLINE void calc_target_weighted_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { + (void)nb_mi; + (void)num_planes; + (void)rel_mi_col; + (void)dir; + + struct calc_target_weighted_pred_ctxt *ctxt = + (struct calc_target_weighted_pred_ctxt *)fun_ctxt; + + const int bw 
= xd->width << MI_SIZE_LOG2; + const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); + + int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_row * MI_SIZE * bw); + int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_row * MI_SIZE * bw); + const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride); + const int is_hbd = is_cur_buf_hbd(xd); + + if (!is_hbd) { + for (int row = 0; row < op_mi_size * MI_SIZE; ++row) { + for (int col = 0; col < ctxt->overlap; ++col) { + const uint8_t m0 = mask1d[col]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + + (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1; + mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; + } + wsrc += bw; + mask += bw; + tmp += ctxt->tmp_stride; + } + } else { + const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); + + for (int row = 0; row < op_mi_size * MI_SIZE; ++row) { + for (int col = 0; col < ctxt->overlap; ++col) { + const uint8_t m0 = mask1d[col]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 + + (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1; + mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0; + } + wsrc += bw; + mask += bw; + tmp16 += ctxt->tmp_stride; + } + } +} + +// This function has a structure similar to av1_build_obmc_inter_prediction +// +// The OBMC predictor is computed as: +// +// PObmc(x,y) = +// AOM_BLEND_A64(Mh(x), +// AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)), +// PLeft(x, y)) +// +// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate +// rounding, this can be written as: +// +// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) = +// Mh(x) * Mv(y) * P(x,y) + +// Mh(x) * Cv(y) * Pabove(x,y) + +// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y) +// +// Where : +// +// Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y) +// Ch(y) = AOM_BLEND_A64_MAX_ALPHA - Mh(y) +// +// This function computes 'wsrc' and 'mask' as: +// +// wsrc(x, y) = +// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) - +// Mh(x) * Cv(y) * Pabove(x,y) + +// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y) +// +// mask(x, y) = Mh(x) * Mv(y) +// +// These can then be used to efficiently approximate the error for any +// predictor P in the context of the provided neighbouring predictors by +// computing: +// +// error(x, y) = +// wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2) +// +static AOM_INLINE void calc_target_weighted_pred( + const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd, + const uint8_t *above, int above_stride, const uint8_t *left, + int left_stride) { + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + const int bw = xd->width << MI_SIZE_LOG2; + const int bh = xd->height << MI_SIZE_LOG2; + const OBMCBuffer *obmc_buffer = &x->obmc_buffer; + int32_t *mask_buf = obmc_buffer->mask; + int32_t *wsrc_buf = obmc_buffer->wsrc; + + const int is_hbd = is_cur_buf_hbd(xd); + const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA; + + // plane 0 should not be sub-sampled + assert(xd->plane[0].subsampling_x == 0); + assert(xd->plane[0].subsampling_y == 0); + + av1_zero_array(wsrc_buf, bw * bh); + for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA; + + // handle above row + if (xd->up_available) { + const int overlap = + AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; + struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, above, + above_stride, overlap }; + 
foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + calc_target_weighted_pred_above, &ctxt); + } + + for (int i = 0; i < bw * bh; ++i) { + wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA; + mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA; + } + + // handle left column + if (xd->left_available) { + const int overlap = + AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; + struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, left, + left_stride, overlap }; + foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + calc_target_weighted_pred_left, &ctxt); + } + + if (!is_hbd) { + const uint8_t *src = x->plane[0].src.buf; + + for (int row = 0; row < bh; ++row) { + for (int col = 0; col < bw; ++col) { + wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col]; + } + wsrc_buf += bw; + src += x->plane[0].src.stride; + } + } else { + const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf); + + for (int row = 0; row < bh; ++row) { + for (int col = 0; col < bw; ++col) { + wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col]; + } + wsrc_buf += bw; + src += x->plane[0].src.stride; + } + } +} diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h new file mode 100644 index 0000000000..efb797e5b5 --- /dev/null +++ b/third_party/aom/av1/encoder/rdopt.h @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RDOPT_H_ +#define AOM_AV1_ENCODER_RDOPT_H_ + +#include + +#include "av1/common/blockd.h" +#include "av1/common/txb_common.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/rdopt_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define COMP_TYPE_RD_THRESH_SCALE 11 +#define COMP_TYPE_RD_THRESH_SHIFT 4 +#define MAX_WINNER_MOTION_MODES 10 + +struct TileInfo; +struct macroblock; +struct RD_STATS; + +/*!\brief AV1 intra mode selection for intra frames. + * + * \ingroup intra_mode_search + * \callgraph + * Top level function for rd-based intra mode selection during intra frame + * encoding. This function will first search for the best luma prediction by + * calling av1_rd_pick_intra_sby_mode, then it searches for chroma prediction + * with av1_rd_pick_intra_sbuv_mode. If applicable, this function ends the + * search with an evaluation for intrabc. + * + * \param[in] cpi Top-level encoder structure. + * \param[in] x Pointer to structure holding all the data for + the current macroblock. + * \param[in] rd_cost Struct to keep track of the RD information. + * \param[in] bsize Current block size. + * \param[in] ctx Structure to hold snapshot of coding context + during the mode picking process. + * \param[in] best_rd Best RD seen for this block so far. + * + * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x + * is modified to store information about the best mode computed + * in this function. 
The rd_cost struct is also updated with the RD stats + * corresponding to the best mode found. + */ +void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x, + struct RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd); + +/*!\brief AV1 inter mode selection. + * + * \ingroup inter_mode_search + * \callgraph + * Top level function for inter mode selection. This function will loop over + * all possible inter modes and select the best one for the current block by + * computing the RD cost. The mode search and RD are computed in + * handle_inter_mode(), which is called from this function within the main + * loop. + * + * \param[in] cpi Top-level encoder structure + * \param[in] tile_data Pointer to struct holding adaptive + data/contexts/models for the tile during + encoding + * \param[in] x Pointer to structure holding all the data for + the current macroblock + * \param[in] rd_cost Struct to keep track of the RD information + * \param[in] bsize Current block size + * \param[in] ctx Structure to hold snapshot of coding context + during the mode picking process + * \param[in] best_rd_so_far Best RD seen for this block so far + * + * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x + * is modified to store information about the best mode computed + * in this function. The rd_cost struct is also updated with the RD stats + * corresponding to the best mode found. + */ +void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, + struct macroblock *x, struct RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); + +/*!\brief AV1 intra mode selection based on Non-RD optimized model. + * + * \ingroup nonrd_mode_search + * \callgraph + * \callergraph + * Top level function for Non-RD optimized intra mode selection. + * This function will loop over a subset of intra modes and select the best one + * based on calculated modelled RD cost. Only 4 intra modes are checked as + * specified in \c intra_mode_list. When calculating the RD cost, the Hadamard + * transform of the residual is used to calculate the rate. Estimation of the RD cost is performed + * in \c av1_estimate_block_intra, which is called from this function. + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding all the data for + the current macroblock + * \param[in] rd_cost Struct to keep track of the RD information + * \param[in] bsize Current block size + * \param[in] ctx Structure to hold snapshot of coding context + during the mode picking process + * + * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x + * is modified to store information about the best mode computed + * in this function. The rd_cost struct is also updated with the RD stats + * corresponding to the best mode found. + */ +void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); + +/*!\brief AV1 inter mode selection based on Non-RD optimized model. + * + * \ingroup nonrd_mode_search + * \callgraph + * Top level function for Non-RD optimized inter mode selection. + * This function will loop over a subset of inter modes and select the best one + * based on calculated modelled RD cost. While deciding which modes to + * check, this function applies heuristics based on previously checked modes, + * block residual variance, block size, and other factors to prune certain + * modes and reference frames.
Currently only single reference frame modes + * are checked. Additional heuristics are applied to decide if intra modes + * need to be checked. + * + * \param[in] cpi Top-level encoder structure + * \param[in] tile_data Pointer to struct holding adaptive + data/contexts/models for the tile during + encoding + * \param[in] x Pointer to structure holding all the data for + the current macroblock + * \param[in] rd_cost Struct to keep track of the RD information + * \param[in] bsize Current block size + * \param[in] ctx Structure to hold snapshot of coding context + during the mode picking process + * + * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x + * is modified to store information about the best mode computed + * in this function. The rd_cost struct is also updated with the RD stats + * corresponding to the best mode found. + */ +void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi, + struct TileDataEnc *tile_data, + struct macroblock *x, + struct RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx); + +void av1_rd_pick_inter_mode_sb_seg_skip( + const struct AV1_COMP *cpi, struct TileDataEnc *tile_data, + struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); + +void av1_inter_mode_data_init(struct TileDataEnc *tile_data); +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult); + +static INLINE int coded_to_superres_mi(int mi_col, int denom) { + return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR; +} + +static INLINE int av1_encoder_get_relative_dist(int a, int b) { + assert(a >= 0 && b >= 0); + return (a - b); +} + +// This function will return the number of mi's in a superblock. +static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) { + const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize]; + int sb_mi_rows = + (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) / + mi_alloc_size_1d; + assert(mi_size_wide[cm->seq_params->sb_size] == + mi_size_high[cm->seq_params->sb_size]); + int sb_mi_size = sb_mi_rows * sb_mi_rows; + + return sb_mi_size; +} + +// This function prunes the mode if either of the reference frames falls in the +// pruning list. +static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame, + const unsigned int *const ref_display_order_hint, + const unsigned int frame_display_order_hint, + const int *ref_frame_list) { + for (int i = 0; i < 2; i++) { + if (ref_frame_list[i] == NONE_FRAME) continue; + + if (ref_frame[0] == ref_frame_list[i] || + ref_frame[1] == ref_frame_list[i]) { + if (av1_encoder_get_relative_dist( + ref_display_order_hint[ref_frame_list[i] - LAST_FRAME], + frame_display_order_hint) < 0) + return 1; + } + } + return 0; +} + +static INLINE int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame, + int8_t closest_past_ref, + int8_t closest_future_ref) { + int has_closest_past_ref = + (ref_frame[0] == closest_past_ref) || (ref_frame[1] == closest_past_ref); + int has_closest_future_ref = (ref_frame[0] == closest_future_ref) || + (ref_frame[1] == closest_future_ref); + return (has_closest_past_ref && has_closest_future_ref); +} + +static INLINE int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame, + const MACROBLOCK *const x) { + int has_best_past_pred_mv_sad = 0; + int has_best_future_pred_mv_sad = 0; + if (x->best_pred_mv_sad[0] < INT_MAX && x->best_pred_mv_sad[1] < INT_MAX) { + has_best_past_pred_mv_sad = + (x->pred_mv_sad[ref_frame[0]] ==
x->best_pred_mv_sad[0]) || + (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[0]); + has_best_future_pred_mv_sad = + (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[1]) || + (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[1]); + } + return (has_best_past_pred_mv_sad && has_best_future_pred_mv_sad); +} + +static INLINE int prune_ref_by_selective_ref_frame( + const AV1_COMP *const cpi, const MACROBLOCK *const x, + const MV_REFERENCE_FRAME *const ref_frame, + const unsigned int *const ref_display_order_hint) { + const SPEED_FEATURES *const sf = &cpi->sf; + if (!sf->inter_sf.selective_ref_frame) return 0; + + const int comp_pred = ref_frame[1] > INTRA_FRAME; + + if (sf->inter_sf.selective_ref_frame >= 2 || + (sf->inter_sf.selective_ref_frame == 1 && comp_pred)) { + int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME }; + + if (x != NULL) { + // Disable pruning if either tpl suggests that we keep the frame or + // the pred_mv gives us the best sad + if (x->tpl_keep_ref_frame[LAST3_FRAME] || + x->pred_mv_sad[LAST3_FRAME] == x->best_pred_mv_sad[0]) { + ref_frame_list[0] = NONE_FRAME; + } + if (x->tpl_keep_ref_frame[LAST2_FRAME] || + x->pred_mv_sad[LAST2_FRAME] == x->best_pred_mv_sad[0]) { + ref_frame_list[1] = NONE_FRAME; + } + } + + if (prune_ref(ref_frame, ref_display_order_hint, + ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME], + ref_frame_list)) + return 1; + } + + if (sf->inter_sf.selective_ref_frame >= 3) { + int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME }; + + if (x != NULL) { + // Disable pruning if either tpl suggests that we keep the frame or + // the pred_mv gives us the best sad + if (x->tpl_keep_ref_frame[ALTREF2_FRAME] || + x->pred_mv_sad[ALTREF2_FRAME] == x->best_pred_mv_sad[0]) { + ref_frame_list[0] = NONE_FRAME; + } + if (x->tpl_keep_ref_frame[BWDREF_FRAME] || + x->pred_mv_sad[BWDREF_FRAME] == x->best_pred_mv_sad[0]) { + ref_frame_list[1] = NONE_FRAME; + } + } + + if (prune_ref(ref_frame, ref_display_order_hint, + ref_display_order_hint[LAST_FRAME - LAST_FRAME], + ref_frame_list)) + return 1; + } + + if (x != NULL && sf->inter_sf.prune_comp_ref_frames && comp_pred) { + int closest_ref_frames = has_closest_ref_frames( + ref_frame, cpi->ref_frame_dist_info.nearest_past_ref, + cpi->ref_frame_dist_info.nearest_future_ref); + if (closest_ref_frames == 0) { + // Prune reference frames which are not the closest to the current frame. + if (sf->inter_sf.prune_comp_ref_frames >= 2) { + return 1; + } else if (sf->inter_sf.prune_comp_ref_frames == 1) { + // Prune reference frames with non minimum pred_mv_sad. + if (has_best_pred_mv_sad(ref_frame, x) == 0) return 1; + } + } + } + + return 0; +} + +// This function will copy the best reference mode information from +// MB_MODE_INFO_EXT to MB_MODE_INFO_EXT_FRAME. 
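To make the display-order pruning in prune_ref above concrete: a candidate reference in the pruning list is dropped when its order hint places it further in the past than the pivot frame, i.e. when the relative distance is below zero. A tiny worked example with made-up order hints, using GOLDEN_FRAME as the pivot as in the first call from prune_ref_by_selective_ref_frame:

#include <stdio.h>

int main(void) {
  const int golden_hint = 7;                 // pivot frame display order
  const int last2_hint = 8, last3_hint = 6;  // hypothetical candidate hints
  printf("LAST2: dist = %d -> %s\n", last2_hint - golden_hint,
         (last2_hint - golden_hint) < 0 ? "prune" : "keep");
  printf("LAST3: dist = %d -> %s\n", last3_hint - golden_hint,
         (last3_hint - golden_hint) < 0 ? "prune" : "keep");
  return 0;
}

Here LAST3 (hint 6) precedes GOLDEN (hint 7) in display order, so modes referencing it are pruned, while LAST2 (hint 8) is kept; the tpl_keep_ref_frame and best_pred_mv_sad checks above can veto this pruning on a per-frame basis.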
+static INLINE void av1_copy_mbmi_ext_to_mbmi_ext_frame( + MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, + const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) { + memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type], + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext_best->weight, mbmi_ext->weight[ref_frame_type], + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext_best->mode_context = mbmi_ext->mode_context[ref_frame_type]; + mbmi_ext_best->ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + memcpy(mbmi_ext_best->global_mvs, mbmi_ext->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_H_ diff --git a/third_party/aom/av1/encoder/rdopt_data_defs.h b/third_party/aom/av1/encoder/rdopt_data_defs.h new file mode 100644 index 0000000000..ca7ef810f3 --- /dev/null +++ b/third_party/aom/av1/encoder/rdopt_data_defs.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ +#define AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static const THR_MODES intra_to_mode_idx[INTRA_MODE_NUM] = { + THR_DC, // DC_PRED, + THR_V_PRED, // V_PRED, + THR_H_PRED, // H_PRED, + THR_D45_PRED, // D45_PRED, + THR_D135_PRED, // D135_PRED, + THR_D113_PRED, // D113_PRED, + THR_D157_PRED, // D157_PRED, + THR_D203_PRED, // D203_PRED, + THR_D67_PRED, // D67_PRED, + THR_SMOOTH, // SMOOTH_PRED, + THR_SMOOTH_V, // SMOOTH_V_PRED, + THR_SMOOTH_H, // SMOOTH_H_PRED, + THR_PAETH, // PAETH_PRED, +}; + +/* clang-format off */ +static const THR_MODES single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM] + [REF_FRAMES] = { + // NEARESTMV, + { THR_INVALID, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3, + THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, }, + // NEARMV, + { THR_INVALID, THR_NEARMV, THR_NEARL2, THR_NEARL3, + THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, }, + // GLOBALMV, + { THR_INVALID, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3, + THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, }, + // NEWMV, + { THR_INVALID, THR_NEWMV, THR_NEWL2, THR_NEWL3, + THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, }, +}; +/* clang-format on */ + +/* clang-format off */ +static const THR_MODES comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES] + [REF_FRAMES] = { + // NEAREST_NEARESTMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTL3A2, 
THR_COMP_NEAREST_NEARESTL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAR_NEARMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3, + THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB, + THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARL2B, + THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARL3B, + THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARGB, + THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAREST_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB, + THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWGB, + THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEARESTMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB, + THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTGB, + 
THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAR_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3, + THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB, + THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWL2B, + THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWL3B, + THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWGB, + THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEARMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3, + THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB, + THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARL2B, + THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARL3B, + THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARGB, + THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // GLOBAL_GLOBALMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3, + THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB, + THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALL2B, + THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALL3B, + THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALGB, + THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALBA, }, + { THR_INVALID, 
THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3, + THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB, + THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWL2B, + THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWL3B, + THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWGB, + THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ diff --git a/third_party/aom/av1/encoder/rdopt_utils.h b/third_party/aom/av1/encoder/rdopt_utils.h new file mode 100644 index 0000000000..b6bc4927e3 --- /dev/null +++ b/third_party/aom/av1/encoder/rdopt_utils.h @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_RDOPT_UTILS_H_ +#define AOM_AV1_ENCODER_RDOPT_UTILS_H_ + +#include "aom/aom_integer.h" +#include "av1/encoder/block.h" +#include "av1/common/cfl.h" +#include "av1/common/pred_common.h" +#include "av1/encoder/rdopt_data_defs.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_REF_MV_SEARCH 3 +#define MAX_TX_RD_GATE_LEVEL 5 +#define INTER_INTRA_RD_THRESH_SCALE 9 +#define INTER_INTRA_RD_THRESH_SHIFT 4 + +typedef struct { + PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame[2]; +} MODE_DEFINITION; + +// This array defines the mapping from the enums in THR_MODES to the actual +// prediction modes and refrence frames +static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = { + { NEARESTMV, { LAST_FRAME, NONE_FRAME } }, + { NEARESTMV, { LAST2_FRAME, NONE_FRAME } }, + { NEARESTMV, { LAST3_FRAME, NONE_FRAME } }, + { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } }, + { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } }, + { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } }, + + { NEWMV, { LAST_FRAME, NONE_FRAME } }, + { NEWMV, { LAST2_FRAME, NONE_FRAME } }, + { NEWMV, { LAST3_FRAME, NONE_FRAME } }, + { NEWMV, { BWDREF_FRAME, NONE_FRAME } }, + { NEWMV, { ALTREF2_FRAME, NONE_FRAME } }, + { NEWMV, { ALTREF_FRAME, NONE_FRAME } }, + { NEWMV, { GOLDEN_FRAME, NONE_FRAME } }, + + { NEARMV, { LAST_FRAME, NONE_FRAME } }, + { NEARMV, { LAST2_FRAME, NONE_FRAME } }, + { NEARMV, { LAST3_FRAME, NONE_FRAME } }, + { NEARMV, { BWDREF_FRAME, NONE_FRAME } }, + { NEARMV, { ALTREF2_FRAME, NONE_FRAME } }, + { NEARMV, { ALTREF_FRAME, NONE_FRAME } }, + { NEARMV, { GOLDEN_FRAME, NONE_FRAME } }, + + { GLOBALMV, { LAST_FRAME, NONE_FRAME } }, + { GLOBALMV, { LAST2_FRAME, NONE_FRAME } }, + { GLOBALMV, { LAST3_FRAME, NONE_FRAME } }, + { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } }, + { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } }, + { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } }, + { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } }, + + // TODO(zoeliu): May need to reconsider the order on the modes to check + + { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + + { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, + { 
NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { 
NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } }, + + { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, + + // intra modes + { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, + { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, +}; + +// Number of winner modes allowed for different values of the speed feature +// multi_winner_mode_type. +static const int winner_mode_count_allowed[MULTI_WINNER_MODE_LEVELS] = { + 1, // MULTI_WINNER_MODE_OFF + 2, // MULTI_WINNER_MODE_FAST + 3 // MULTI_WINNER_MODE_DEFAULT +}; + +static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst, + const int num_planes) { + for (int i = 0; i < num_planes; i++) { + xd->plane[i].dst.buf = dst.plane[i]; + xd->plane[i].dst.stride = dst.stride[i]; + } +} + +static AOM_INLINE void swap_dst_buf(MACROBLOCKD *xd, + const BUFFER_SET *dst_bufs[2], + int num_planes) { + const BUFFER_SET *buf0 = dst_bufs[0]; + dst_bufs[0] = dst_bufs[1]; + dst_bufs[1] = buf0; + restore_dst_buf(xd, *dst_bufs[0], num_planes); +} + +/* clang-format on */ +// Calculate rd threshold based on ref best rd and relevant scaling factors +static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd, + int mul_factor, + int div_factor) { + int64_t rd_thresh = ref_best_rd; + if (div_factor != 0) { + rd_thresh = ref_best_rd < (div_factor * (INT64_MAX / mul_factor)) + ? 
((ref_best_rd / div_factor) * mul_factor) + : INT64_MAX; + } + return rd_thresh; +} + +static AOM_INLINE THR_MODES +get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME second_ref_frame) { + if (this_mode < INTRA_MODE_END) { + assert(ref_frame == INTRA_FRAME); + assert(second_ref_frame == NONE_FRAME); + return intra_to_mode_idx[this_mode - INTRA_MODE_START]; + } + if (this_mode >= SINGLE_INTER_MODE_START && + this_mode < SINGLE_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] + [ref_frame]; + } + if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END && + second_ref_frame != NONE_FRAME) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + assert((second_ref_frame > INTRA_FRAME) && + (second_ref_frame <= ALTREF_FRAME)); + return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame] + [second_ref_frame]; + } + assert(0); + return THR_INVALID; +} + +static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_4X16 || bsize == BLOCK_16X4) { + return -1; + } + return 1; +} + +// Get transform block visible dimensions cropped to the MI units. +static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane, + BLOCK_SIZE plane_bsize, int blk_row, + int blk_col, BLOCK_SIZE tx_bsize, + int *width, int *height, + int *visible_width, + int *visible_height) { + assert(tx_bsize <= plane_bsize); + const int txb_height = block_size_high[tx_bsize]; + const int txb_width = block_size_wide[tx_bsize]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + + // TODO(aconverse@google.com): Investigate using crop_width/height here rather + // than the MI size + if (xd->mb_to_bottom_edge >= 0) { + *visible_height = txb_height; + } else { + const int block_height = block_size_high[plane_bsize]; + const int block_rows = + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height; + *visible_height = + clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, txb_height); + } + if (height) *height = txb_height; + + if (xd->mb_to_right_edge >= 0) { + *visible_width = txb_width; + } else { + const int block_width = block_size_wide[plane_bsize]; + const int block_cols = + (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width; + *visible_width = + clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, txb_width); + } + if (width) *width = txb_width; +} + +static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { + int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2); + return num_blk; +} + +static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize, + int64_t best_skip_rd, int64_t skip_rd, + int level, int is_luma_only) { + int eval_txfm = 1; + // Derive aggressiveness factor for gating the transform search + // Lower value indicates more aggressiveness. 
Be more conservative (high + // value) for (i) low quantizers (ii) regions where prediction is poor + const int scale[MAX_TX_RD_GATE_LEVEL + 1] = { INT_MAX, 4, 3, 2, 2, 1 }; + const int qslope = 2 * (!is_luma_only); + const int level_to_qindex_map[MAX_TX_RD_GATE_LEVEL + 1] = { 0, 0, 0, + 80, 100, 140 }; + int aggr_factor = 4; + assert(level <= MAX_TX_RD_GATE_LEVEL); + const int pred_qindex_thresh = level_to_qindex_map[level]; + if (!is_luma_only && level <= 2) { + aggr_factor = 4 * AOMMAX(1, ROUND_POWER_OF_TWO((MAXQ - x->qindex) * qslope, + QINDEX_BITS)); + } + if ((best_skip_rd > + (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS))) && + (x->qindex >= pred_qindex_thresh)) + aggr_factor *= scale[level]; + // For level setting 1, be more conservative for non-luma-only case even when + // prediction is good. + else if ((level <= 1) && !is_luma_only) + aggr_factor = (aggr_factor >> 2) * 6; + + // Be more conservative for luma only cases (called from compound type rd) + // since best_skip_rd is computed after and skip_rd is computed (with 8-bit + // prediction signals blended for WEDGE/DIFFWTD rather than 16-bit) before + // interpolation filter search + const int luma_mul[MAX_TX_RD_GATE_LEVEL + 1] = { + INT_MAX, 32, 29, 17, 17, 17 + }; + int mul_factor = is_luma_only ? luma_mul[level] : 16; + int64_t rd_thresh = + (best_skip_rd == INT64_MAX) + ? best_skip_rd + : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 6); + if (skip_rd > rd_thresh) eval_txfm = 0; + return eval_txfm; +} + +static TX_MODE select_tx_mode( + const AV1_COMMON *cm, const TX_SIZE_SEARCH_METHOD tx_size_search_method) { + if (cm->features.coded_lossless) return ONLY_4X4; + if (tx_size_search_method == USE_LARGESTALL) { + return TX_MODE_LARGEST; + } else { + assert(tx_size_search_method == USE_FULL_RD || + tx_size_search_method == USE_FAST_RD); + return TX_MODE_SELECT; + } +} + +// Checks the conditions to disable winner mode processing +static INLINE int bypass_winner_mode_processing(const MACROBLOCK *const x, + const SPEED_FEATURES *sf, + int use_txfm_skip, + int actual_txfm_skip, + PREDICTION_MODE best_mode) { + const int prune_winner_mode_eval_level = + sf->winner_mode_sf.prune_winner_mode_eval_level; + + // Disable winner mode processing for blocks with low source variance. + // The aggressiveness of this pruning logic reduces as qindex increases. + // The threshold decreases linearly from 64 as qindex varies from 0 to 255. + if (prune_winner_mode_eval_level == 1) { + const unsigned int src_var_thresh = 64 - 48 * x->qindex / (MAXQ + 1); + if (x->source_variance < src_var_thresh) return 1; + } else if (prune_winner_mode_eval_level == 2) { + // Skip winner mode processing of blocks for which transform turns out to be + // skip due to nature of eob alone except NEWMV mode. + if (!have_newmv_in_inter_mode(best_mode) && actual_txfm_skip) return 1; + } else if (prune_winner_mode_eval_level == 3) { + // Skip winner mode processing of blocks for which transform turns out to be + // skip except NEWMV mode and considered based on the quantizer. + // At high quantizers: Take conservative approach by considering transform + // skip based on eob alone. + // At low quantizers: Consider transform skip based on eob nature or RD cost + // evaluation. + const int is_txfm_skip = + x->qindex > 127 ? 
actual_txfm_skip : actual_txfm_skip || use_txfm_skip; + + if (!have_newmv_in_inter_mode(best_mode) && is_txfm_skip) return 1; + } else if (prune_winner_mode_eval_level >= 4) { + // Do not skip winner mode evaluation at low quantizers if normal mode's + // transform search was too aggressive. + if (sf->rd_sf.perform_coeff_opt >= 5 && x->qindex <= 70) return 0; + + if (use_txfm_skip || actual_txfm_skip) return 1; + } + + return 0; +} + +// Checks the conditions to enable winner mode processing +static INLINE int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi, + const MACROBLOCK *const x, + MB_MODE_INFO *const mbmi, + int actual_txfm_skip) { + const SPEED_FEATURES *sf = &cpi->sf; + const PREDICTION_MODE best_mode = mbmi->mode; + + if (bypass_winner_mode_processing(x, sf, mbmi->skip_txfm, actual_txfm_skip, + best_mode)) + return 0; + + // TODO(any): Move block independent condition checks to frame level + if (is_inter_block(mbmi)) { + if (is_inter_mode(best_mode) && + (sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != INT_MAX) && + !cpi->oxcf.txfm_cfg.use_inter_dct_only) + return 1; + } else { + if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search && + !cpi->oxcf.txfm_cfg.use_intra_default_tx_only && + !cpi->oxcf.txfm_cfg.use_intra_dct_only) + return 1; + } + + // Check speed feature related to winner mode processing + if (sf->winner_mode_sf.enable_winner_mode_for_coeff_opt && + cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT && + cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT) + return 1; + if (sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch) return 1; + + return 0; +} + +static INLINE void set_tx_size_search_method( + const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params, + TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_size_srch, + int is_winner_mode) { + // Populate transform size search method/transform mode appropriately + txfm_params->tx_size_search_method = + winner_mode_params->tx_size_search_methods[DEFAULT_EVAL]; + if (enable_winner_mode_for_tx_size_srch) { + if (is_winner_mode) + txfm_params->tx_size_search_method = + winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL]; + else + txfm_params->tx_size_search_method = + winner_mode_params->tx_size_search_methods[MODE_EVAL]; + } + txfm_params->tx_mode_search_type = + select_tx_mode(cm, txfm_params->tx_size_search_method); +} + +static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, + TxfmSearchParams *txfm_params, + int winner_mode_tx_type_pruning, + int is_winner_mode) { + // Populate prune transform mode appropriately + txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode; + if (!winner_mode_tx_type_pruning) return; + + const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 }, + { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 }, + { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 }, + { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } }; + txfm_params->prune_2d_txfm_mode = + prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode]; +} + +static INLINE void set_tx_domain_dist_params( + const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params, + int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) { + if (txfm_params->use_qm_dist_metric) { + // QM-weighted PSNR is computed in transform space, so we need to forcibly + // enable the use of tx domain distortion. 
+ txfm_params->use_transform_domain_distortion = 1; + txfm_params->tx_domain_dist_threshold = 0; + return; + } + + if (!enable_winner_mode_for_tx_domain_dist) { + txfm_params->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL]; + txfm_params->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL]; + return; + } + + if (is_winner_mode) { + txfm_params->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL]; + txfm_params->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL]; + } else { + txfm_params->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[MODE_EVAL]; + txfm_params->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[MODE_EVAL]; + } +} + +// This function sets mode parameters for different mode evaluation stages +static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi, + MACROBLOCK *x, + MODE_EVAL_TYPE mode_eval_type) { + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *sf = &cpi->sf; + const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params; + TxfmSearchParams *txfm_params = &x->txfm_search_params; + + txfm_params->use_qm_dist_metric = + cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR; + + switch (mode_eval_type) { + case DEFAULT_EVAL: + txfm_params->default_inter_tx_type_prob_thresh = INT_MAX; + txfm_params->use_default_intra_tx_type = 0; + txfm_params->skip_txfm_level = + winner_mode_params->skip_txfm_level[DEFAULT_EVAL]; + txfm_params->predict_dc_level = + winner_mode_params->predict_dc_level[DEFAULT_EVAL]; + // Set default transform domain distortion type + set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0); + + // Get default threshold for R-D optimization of coefficients + get_rd_opt_coeff_thresh(winner_mode_params->coeff_opt_thresholds, + txfm_params, 0, 0); + + // Set default transform size search method + set_tx_size_search_method(cm, winner_mode_params, txfm_params, 0, 0); + // Set default transform type prune + set_tx_type_prune(sf, txfm_params, 0, 0); + break; + case MODE_EVAL: + txfm_params->use_default_intra_tx_type = + (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search || + cpi->oxcf.txfm_cfg.use_intra_default_tx_only); + txfm_params->default_inter_tx_type_prob_thresh = + cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh; + txfm_params->skip_txfm_level = + winner_mode_params->skip_txfm_level[MODE_EVAL]; + txfm_params->predict_dc_level = + winner_mode_params->predict_dc_level[MODE_EVAL]; + // Set transform domain distortion type for mode evaluation + set_tx_domain_dist_params( + winner_mode_params, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0); + + // Get threshold for R-D optimization of coefficients during mode + // evaluation + get_rd_opt_coeff_thresh( + winner_mode_params->coeff_opt_thresholds, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0); + + // Set the transform size search method for mode evaluation + set_tx_size_search_method( + cm, winner_mode_params, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0); + // Set transform type prune for mode evaluation + set_tx_type_prune(sf, txfm_params, + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning, + 0); + break; + case WINNER_MODE_EVAL: + txfm_params->default_inter_tx_type_prob_thresh = INT_MAX; + 
txfm_params->use_default_intra_tx_type = 0; + txfm_params->skip_txfm_level = + winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL]; + txfm_params->predict_dc_level = + winner_mode_params->predict_dc_level[WINNER_MODE_EVAL]; + + // Set transform domain distortion type for winner mode evaluation + set_tx_domain_dist_params( + winner_mode_params, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1); + + // Get threshold for R-D optimization of coefficients for winner mode + // evaluation + get_rd_opt_coeff_thresh( + winner_mode_params->coeff_opt_thresholds, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1); + + // Set the transform size search method for winner mode evaluation + set_tx_size_search_method( + cm, winner_mode_params, txfm_params, + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1); + // Set default transform type prune mode for winner mode evaluation + set_tx_type_prune(sf, txfm_params, + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning, + 1); + break; + default: assert(0); + } + + // Rd record collected at a specific mode evaluation stage can not be used + // across other evaluation stages as the transform parameters are different. + // Hence, reset mb rd record whenever mode evaluation stage type changes. + if (txfm_params->mode_eval_type != mode_eval_type) + reset_mb_rd_record(x->txfm_search_info.mb_rd_record); + + txfm_params->mode_eval_type = mode_eval_type; +} + +// Similar to store_cfl_required(), but for use during the RDO process, +// where we haven't yet determined whether this block uses CfL. +static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *xd = &x->e_mbd; + + if (cm->seq_params->monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED; + + if (!xd->is_chroma_ref) { + // For non-chroma-reference blocks, we should always store the luma pixels, + // in case the corresponding chroma-reference block uses CfL. + // Note that this can only happen for block sizes which are <8 on + // their shortest side, as otherwise they would be chroma reference + // blocks. + return CFL_ALLOWED; + } + + // For chroma reference blocks, we should store data in the encoder iff we're + // allowed to try out CfL. + return is_cfl_allowed(xd); +} + +static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) { + mbmi->uv_mode = UV_DC_PRED; + mbmi->palette_mode_info.palette_size[1] = 0; +} + +// Store best mode stats for winner mode processing +static INLINE void store_winner_mode_stats( + const AV1_COMMON *const cm, MACROBLOCK *x, const MB_MODE_INFO *mbmi, + RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv, + THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd, + int multi_winner_mode_type, int txfm_search_done) { + WinnerModeStats *winner_mode_stats = x->winner_mode_stats; + int mode_idx = 0; + int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0; + // Mode stat is not required when multiwinner mode processing is disabled + if (multi_winner_mode_type == MULTI_WINNER_MODE_OFF) return; + // Ignore mode with maximum rd + if (this_rd == INT64_MAX) return; + // TODO(any): Winner mode processing is currently not applicable for palette + // mode in Inter frames. 
Clean-up the following code, once support is added + if (!frame_is_intra_only(cm) && is_palette_mode) return; + + int max_winner_mode_count = winner_mode_count_allowed[multi_winner_mode_type]; + assert(x->winner_mode_count >= 0 && + x->winner_mode_count <= max_winner_mode_count); + + if (x->winner_mode_count) { + // Find the mode which has higher rd cost than this_rd + for (mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) + if (winner_mode_stats[mode_idx].rd > this_rd) break; + + if (mode_idx == max_winner_mode_count) { + // No mode has higher rd cost than this_rd + return; + } else if (mode_idx < max_winner_mode_count - 1) { + // Create a slot for current mode and move others to the next slot + memmove( + &winner_mode_stats[mode_idx + 1], &winner_mode_stats[mode_idx], + (max_winner_mode_count - mode_idx - 1) * sizeof(*winner_mode_stats)); + } + } + // Add a mode stat for winner mode processing + winner_mode_stats[mode_idx].mbmi = *mbmi; + winner_mode_stats[mode_idx].rd = this_rd; + winner_mode_stats[mode_idx].mode_index = mode_index; + + // Update rd stats required for inter frame + if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) { + const MACROBLOCKD *xd = &x->e_mbd; + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END; + const int skip_txfm = mbmi->skip_txfm && !is_intra_mode; + + winner_mode_stats[mode_idx].rd_cost = *rd_cost; + if (txfm_search_done) { + winner_mode_stats[mode_idx].rate_y = + rd_cost_y->rate + + x->mode_costs + .skip_txfm_cost[skip_ctx][rd_cost->skip_txfm || skip_txfm]; + winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate; + } + } + + if (color_map) { + // Store color_index_map for palette mode + const MACROBLOCKD *const xd = &x->e_mbd; + int block_width, block_height; + av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width, + &block_height, NULL, NULL); + memcpy(winner_mode_stats[mode_idx].color_index_map, color_map, + block_width * block_height * sizeof(color_map[0])); + } + + x->winner_mode_count = + AOMMIN(x->winner_mode_count + 1, max_winner_mode_count); +} + +unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi, + const MACROBLOCKD *xd, + const struct buf_2d *ref, + BLOCK_SIZE bsize, int plane, + int use_hbd); + +unsigned int av1_get_perpixel_variance_facade(const struct AV1_COMP *cpi, + const MACROBLOCKD *xd, + const struct buf_2d *ref, + BLOCK_SIZE bsize, int plane); + +static INLINE int is_mode_intra(PREDICTION_MODE mode) { + return mode < INTRA_MODE_END; +} + +// This function will copy usable ref_mv_stack[ref_frame][4] and +// weight[ref_frame][4] information from ref_mv_stack[ref_frame][8] and +// weight[ref_frame][8]. +static INLINE void av1_copy_usable_ref_mv_stack_and_weight( + const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext, + MV_REFERENCE_FRAME ref_frame) { + memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame], + USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0])); + memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame], + USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0])); +} + +// Get transform rd gate level for the given transform search case. 
+static INLINE int get_txfm_rd_gate_level( + const int is_masked_compound_enabled, + const int txfm_rd_gate_level[TX_SEARCH_CASES], BLOCK_SIZE bsize, + TX_SEARCH_CASE tx_search_case, int eval_motion_mode) { + assert(tx_search_case < TX_SEARCH_CASES); + if (tx_search_case == TX_SEARCH_MOTION_MODE && !eval_motion_mode && + num_pels_log2_lookup[bsize] > 8) + return txfm_rd_gate_level[TX_SEARCH_MOTION_MODE]; + // Enable aggressive gating of transform search only when masked compound type + // is enabled. + else if (tx_search_case == TX_SEARCH_COMP_TYPE_MODE && + is_masked_compound_enabled) + return txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE]; + + return txfm_rd_gate_level[TX_SEARCH_DEFAULT]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_UTILS_H_ diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c new file mode 100644 index 0000000000..9b964113a5 --- /dev/null +++ b/third_party/aom/av1/encoder/reconinter_enc.c @@ -0,0 +1,701 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/encoder/reconinter_enc.h" + +static AOM_INLINE void enc_calc_subpel_params( + const MV *const src_mv, InterPredParams *const inter_pred_params, + uint8_t **pre, SubpelParams *subpel_params, int *src_stride) { + struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf; + init_subpel_params(src_mv, inter_pred_params, subpel_params, pre_buf->width, + pre_buf->height); + *pre = pre_buf->buf0 + + (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (subpel_params->pos_x >> SCALE_SUBPEL_BITS); + *src_stride = pre_buf->stride; +} + +#define IS_DEC 0 +#include "av1/common/reconinter_template.inc" +#undef IS_DEC + +void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride, + const MV *src_mv, + InterPredParams *inter_pred_params) { + build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params); +} + +static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int bw, int bh, int mi_x, int mi_y) { + build_inter_predictors(cm, xd, plane, mi, /*build_for_obmc=*/0, bw, bh, mi_x, + mi_y); +} + +void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) { + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + InterPredParams inter_pred_params; + + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf; + const MV mv = xd->mi[0]->mv[0].as_mv; + const struct scale_factors *const sf = xd->block_ref_scale_factors[0]; + + 
av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), false, sf, pd->pre, + xd->mi[0]->interp_filters); + + inter_pred_params.conv_params = get_conv_params_no_round( + 0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd); + + inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0; + av1_enc_build_one_inter_predictor(dst, dst_buf->stride, &mv, + &inter_pred_params); +} + +void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y]; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *pre_buf = &pd->pre[0]; + const uint8_t *src = + pre_buf->buf0 + + (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (subpel_params->pos_x >> SCALE_SUBPEL_BITS); + uint8_t *const dst = dst_buf->buf; + int src_stride = pre_buf->stride; + int dst_stride = dst_buf->stride; + inter_pred_params->ref_frame_buf = *pre_buf; + + // Initialize interp filter for single reference mode. + init_interp_filter_params(inter_pred_params->interp_filter_params, + &mbmi->interp_filters.as_filters, pd->width, + pd->height, /*is_intrabc=*/0); + + av1_make_inter_predictor(src, src_stride, dst, dst_stride, inter_pred_params, + subpel_params); +} + +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to) { + for (int plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + enc_build_inter_predictors(cm, xd, plane, xd->mi[0], xd->plane[plane].width, + xd->plane[plane].height, mi_x, mi_y); + + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET default_ctx = { + { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, + { xd->plane[0].dst.stride, xd->plane[1].dst.stride, + xd->plane[2].dst.stride } + }; + if (!ctx) { + ctx = &default_ctx; + } + av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride, ctx, plane, + bsize); + } + } +} + +static void setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->bsize); + const int ref_mi_row = xd->mi_row + mi_row_offset; + const int ref_mi_col = xd->mi_col + mi_col_offset; + + for (int plane = 0; plane < num_planes; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane], + ctxt->tmp_width[plane], ctxt->tmp_height[plane], + ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[0] = sf; + if (!av1_is_valid_scale(sf)) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + + av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf, + num_planes); +} + +static INLINE void build_obmc_prediction(MACROBLOCKD 
*xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mbmi, + void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt, + num_planes); + + const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2; + const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->bsize; + + InterPredParams inter_pred_params; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = 0, bh = 0; + + if (dir) { + // prepare left reference block size + bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y; + } else { + // prepare above reference block size + bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + } + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, dir)) continue; + + const struct buf_2d *const pre_buf = &pd->pre[0]; + const MV mv = above_mbmi->mv[0].as_mv; + + av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, + pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[0], pre_buf, + above_mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, j, xd->bd); + + av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv, + &inter_pred_params); + } +} + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->up_available) return; + struct build_prediction_ctxt ctxt = { + cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, NULL + }; + BLOCK_SIZE bsize = xd->mi[0]->bsize; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_obmc_prediction, &ctxt); +} + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->left_available) return; + struct build_prediction_ctxt ctxt = { + cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, NULL + }; + BLOCK_SIZE bsize = xd->mi[0]->bsize; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_obmc_prediction, &ctxt); +} + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) { + const int num_planes = av1_num_planes(cm); + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + + av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + 
av1_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1, + dst_stride1); + av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2, + dst_stride2); + av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row, + mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2, + dst_stride2); +} + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref, + uint8_t *ext_dst[], int ext_dst_stride[]) { + assert(bsize < BLOCK_SIZES_ALL); + const MB_MODE_INFO *mi = xd->mi[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + WarpTypesAllowed warp_types; + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + InterPredParams inter_pred_params; + + av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, + pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[ref], &pd->pre[ref], + mi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi); + + uint8_t *const dst = get_buf_by_bd(xd, ext_dst[plane]); + const MV mv = mi->mv[ref].as_mv; + + av1_enc_build_one_inter_predictor(dst, ext_dst_stride[plane], &mv, + &inter_pred_params); + } +} + +static void build_masked_compound( + uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, block_size_wide[sb_type], w, h, subw, subh); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void build_masked_compound_highbd( + uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, + const uint8_t *src1_8, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, int bd) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. 
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + // const uint8_t *mask = + // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); + aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, block_size_wide[sb_type], w, h, + subw, subh, bd); +} +#endif + +static void build_wedge_inter_predictor_from_buf( + MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, + int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; + const int is_hbd = is_cur_buf_hbd(xd); + + if (is_compound && is_masked_compound_type(comp_data->type)) { + if (!plane && comp_data->type == COMPOUND_DIFFWTD) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + av1_build_compound_diffwtd_mask_highbd( + comp_data->seg_mask, comp_data->mask_type, + CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); + } else { + av1_build_compound_diffwtd_mask( + comp_data->seg_mask, comp_data->mask_type, ext_dst0, + ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); + } +#else + (void)is_hbd; + av1_build_compound_diffwtd_mask(comp_data->seg_mask, comp_data->mask_type, + ext_dst0, ext_dst_stride0, ext_dst1, + ext_dst_stride1, h, w); +#endif // CONFIG_AV1_HIGHBITDEPTH + } +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + build_masked_compound_highbd( + dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, mbmi->bsize, + h, w, xd->bd); + } else { + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, + h, w); + } +#else + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, h, + w); +#endif + } else { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h); + } else { + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h); + } +#else + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h); +#endif + } +} + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[], + int ext_dst_stride0[], + uint8_t *ext_dst1[], + int ext_dst_stride1[]) { + int plane; + assert(bsize < BLOCK_SIZES_ALL); + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + build_wedge_inter_predictor_from_buf( + xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], + ext_dst1[plane], ext_dst_stride1[plane]); + } +} + +// Get pred block from up-sampled reference. 
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + for (int i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, + -1, width, height); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, + 16, width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, + width, height); + } +} + +void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + int i, j; + + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); + } + comp_pred += width; + pred += width; + } +} + +void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON 
*const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +void aom_dist_wtd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint8_t)tmp; + } + comp_pred += width; + pred += width; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, + const struct AV1Common *const cm, int mi_row, + int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? 
dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (int i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel, + 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert_c( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + int i, j; + + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); + } + comp_pred += width; + pred += width; + } +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + 
aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, + ref_stride, bd, subpel_search); + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint16_t)tmp; + } + comp_pred += width; + pred += width; + } +} + +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width, + mask, mask_stride, invert_mask); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h new file mode 100644 index 0000000000..16932f37a0 --- /dev/null +++ b/third_party/aom/av1/encoder/reconinter_enc.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_ +#define AOM_AV1_ENCODER_RECONINTER_ENC_H_ + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/reconinter.h" +#include "av1/common/warped_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask, + int subpel_search); + +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd, int subpel_search); + +// Build single or compound reference inter predictors for all planes. +// Can build inter-intra predictors, masked predictors etc as well. +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to); + +void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col); + +void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +// Build one inter predictor. 
It is called for building predictor for single +// reference case, or just the 1st or 2nd reference in compound reference case. +// Can build both regular and masked predictors. +void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride, + const MV *src_mv, + InterPredParams *inter_pred_params); + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd); + +// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive. +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref, + uint8_t *ext_dst[], int ext_dst_stride[]); + +// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive. +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[], + int ext_dst_stride0[], + uint8_t *ext_dst1[], + int ext_dst_stride1[]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_ diff --git a/third_party/aom/av1/encoder/saliency_map.c b/third_party/aom/av1/encoder/saliency_map.c new file mode 100644 index 0000000000..30019bbec0 --- /dev/null +++ b/third_party/aom/av1/encoder/saliency_map.c @@ -0,0 +1,1414 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include +#include +#include + +#include "av1/encoder/encoder.h" +#include "av1/encoder/encoder_utils.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/saliency_map.h" + +// The Gabor filter is generated by setting the parameters as: +// ksize = 9 +// sigma = 1 +// theta = y*np.pi/4, where y /in {0, 1, 2, 3}, i.e., 0, 45, 90, 135 degree +// lambda1 = 1 +// gamma=0.8 +// phi =0 +static const double kGaborFilter[4][9][9] = { // [angle: 0, 45, 90, 135 + // degree][ksize][ksize] + { { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03, + 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 }, + { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02, + 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 }, + { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01, + 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 }, + { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01, + 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 }, + { 3.3546262e-04, 1.1108996e-02, 1.3533528e-01, 6.0653067e-01, 1.0000000e+00, + 6.0653067e-01, 1.3533528e-01, 1.1108996e-02, 3.3546262e-04 }, + { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01, + 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 }, + { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01, + 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 }, + { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02, + 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 }, + { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03, + 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 } }, + + { { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04, + 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05, + 3.5712848e-05 }, + { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03, + 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03, + -8.1631159e-05 }, + { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02, + -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03, + -9.9486928e-04 }, + { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01, + -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02, + 1.3962291e-03 }, + { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01, + 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02, + 6.6981313e-04 }, + { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01, + -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03, + -4.4602581e-04 }, + { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02, + -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03, + 3.0079011e-06 }, + { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02, + 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06, + 3.8760313e-06 }, + { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03, + 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06, + -6.2165498e-08 } }, + + { { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04, + 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 }, + { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02, + 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 }, + { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01, + 
9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 }, + { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01, + 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 }, + { 5.9760227e-03, 5.6134764e-02, 2.7803731e-01, 7.2614902e-01, 1.0000000e+00, + 7.2614902e-01, 2.7803731e-01, 5.6134764e-02, 5.9760227e-03 }, + { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01, + 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 }, + { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01, + 9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 }, + { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02, + 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 }, + { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04, + 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 } }, + + { { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03, + 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06, + -6.2165498e-08 }, + { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02, + 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06, + 3.8760313e-06 }, + { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02, + -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03, + 3.0079011e-06 }, + { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01, + -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03, + -4.4602581e-04 }, + { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01, + 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02, + 6.6981313e-04 }, + { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01, + -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02, + 1.3962291e-03 }, + { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02, + -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03, + -9.9486928e-04 }, + { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03, + 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03, + -8.1631159e-05 }, + { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04, + 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05, + 3.5712848e-05 } } +}; + +// This function is to extract red/green/blue channels, and calculate intensity +// = (r+g+b)/3. Note that it only handles 8bits case now. +// TODO(linzhen): add high bitdepth support. 
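// Regarding the kGaborFilter table above: the listed coefficients are
// consistent with the standard real Gabor kernel
//   g(x, y) = exp(-(x'^2 + gamma^2 * y'^2) / (2 * sigma^2))
//             * cos(2 * pi * x' / lambda + phi),
//   x' =  x * cos(theta) + y * sin(theta),
//   y' = -x * sin(theta) + y * cos(theta),
// evaluated on the 9x9 grid of integer offsets from the centre tap with the
// parameters listed in the comment above. The helper below is a minimal
// sketch of how such a table could be regenerated; the function name and the
// row/column-to-(y, x) mapping are illustrative assumptions (not part of
// libaom), and <math.h> is assumed to be available.
static void gen_gabor_kernel_sketch(double theta, double out[9][9]) {
  const double sigma = 1.0, lambda = 1.0, gamma = 0.8, phi = 0.0;
  const double pi = 3.141592653589793;
  for (int row = 0; row < 9; ++row) {
    for (int col = 0; col < 9; ++col) {
      const double x = col - 4;  // horizontal offset from the centre tap
      const double y = row - 4;  // vertical offset from the centre tap
      const double xp = x * cos(theta) + y * sin(theta);
      const double yp = -x * sin(theta) + y * cos(theta);
      out[row][col] =
          exp(-(xp * xp + gamma * gamma * yp * yp) / (2 * sigma * sigma)) *
          cos(2 * pi * xp / lambda + phi);
    }
  }
}
// For example, gen_gabor_kernel_sketch(0.0, k) reproduces the 0-degree block,
// and theta = pi/4, pi/2 and 3*pi/4 give the 45/90/135-degree blocks.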
+static void get_color_intensity(const YV12_BUFFER_CONFIG *src, + int subsampling_x, int subsampling_y, + double *cr, double *cg, double *cb, + double *intensity) { + const uint8_t *y = src->buffers[0]; + const uint8_t *u = src->buffers[1]; + const uint8_t *v = src->buffers[2]; + + const int y_height = src->crop_heights[0]; + const int y_width = src->crop_widths[0]; + const int y_stride = src->strides[0]; + const int c_stride = src->strides[1]; + + for (int i = 0; i < y_height; ++i) { + for (int j = 0; j < y_width; ++j) { + cr[i * y_width + j] = + fclamp((double)y[i * y_stride + j] + + 1.370 * (double)(v[(i >> subsampling_y) * c_stride + + (j >> subsampling_x)] - + 128), + 0, 255); + cg[i * y_width + j] = + fclamp((double)y[i * y_stride + j] - + 0.698 * (double)(u[(i >> subsampling_y) * c_stride + + (j >> subsampling_x)] - + 128) - + 0.337 * (double)(v[(i >> subsampling_y) * c_stride + + (j >> subsampling_x)] - + 128), + 0, 255); + cb[i * y_width + j] = + fclamp((double)y[i * y_stride + j] + + 1.732 * (double)(u[(i >> subsampling_y) * c_stride + + (j >> subsampling_x)] - + 128), + 0, 255); + + intensity[i * y_width + j] = + (cr[i * y_width + j] + cg[i * y_width + j] + cb[i * y_width + j]) / + 3.0; + assert(intensity[i * y_width + j] >= 0 && + intensity[i * y_width + j] <= 255); + + intensity[i * y_width + j] /= 256; + cr[i * y_width + j] /= 256; + cg[i * y_width + j] /= 256; + cb[i * y_width + j] /= 256; + } + } +} + +static INLINE double convolve_map(const double *filter, const double *map, + const int size) { + double result = 0; + for (int i = 0; i < size; ++i) { + result += filter[i] * map[i]; // symmetric filter is used + } + return result; +} + +// This function is to decimate the map by half, and apply Gaussian filter on +// top of the downsampled map. +static INLINE void decimate_map(const double *map, int height, int width, + int stride, double *downsampled_map) { + const int new_width = width / 2; + const int window_size = 5; + const double gaussian_filter[25] = { + 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16, + 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32, + 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256, + 1. / 64, 3. / 128, 1. / 64, 1. / 256 + }; + + double map_region[25]; + for (int y = 0; y < height - 1; y += 2) { + for (int x = 0; x < width - 1; x += 2) { + int i = 0; + for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) { + for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) { + int yvalue = clamp(yy, 0, height - 1); + int xvalue = clamp(xx, 0, width - 1); + map_region[i++] = map[yvalue * stride + xvalue]; + } + } + downsampled_map[(y / 2) * new_width + (x / 2)] = + convolve_map(gaussian_filter, map_region, window_size * window_size); + } + } +} + +// This function is to upscale the map from in_level size to out_level size. +// Note that the map at "level-1" will upscale the map at "level" by x2. +static INLINE int upscale_map(const double *input, int in_level, int out_level, + int height[9], int width[9], double *output) { + for (int level = in_level; level > out_level; level--) { + const int cur_width = width[level]; + const int cur_height = height[level]; + const int cur_stride = width[level]; + + double *original = (level == in_level) ? 
(double *)input : output; + + assert(level > 0); + + const int h_upscale = height[level - 1]; + const int w_upscale = width[level - 1]; + const int s_upscale = width[level - 1]; + + double *upscale = aom_malloc(h_upscale * w_upscale * sizeof(*upscale)); + + if (!upscale) { + return 0; + } + + for (int i = 0; i < h_upscale; ++i) { + for (int j = 0; j < w_upscale; ++j) { + const int ii = clamp((i >> 1), 0, cur_height - 1); + const int jj = clamp((j >> 1), 0, cur_width - 1); + upscale[j + i * s_upscale] = (double)original[jj + ii * cur_stride]; + } + } + memcpy(output, upscale, h_upscale * w_upscale * sizeof(double)); + aom_free(upscale); + } + + return 1; +} + +// This function calculates the differences between a fine scale c and a +// coarser scale s yielding the feature maps. c \in {2, 3, 4}, and s = c + +// delta, where delta \in {3, 4}. +static int center_surround_diff(const double *input[9], int height[9], + int width[9], saliency_feature_map *output[6]) { + int j = 0; + for (int k = 2; k < 5; ++k) { + int cur_height = height[k]; + int cur_width = width[k]; + + if (upscale_map(input[k + 3], k + 3, k, height, width, output[j]->buf) == + 0) { + return 0; + } + + for (int r = 0; r < cur_height; ++r) { + for (int c = 0; c < cur_width; ++c) { + output[j]->buf[r * cur_width + c] = + fabs((double)(input[k][r * cur_width + c] - + output[j]->buf[r * cur_width + c])); + } + } + + if (upscale_map(input[k + 4], k + 4, k, height, width, + output[j + 1]->buf) == 0) { + return 0; + } + + for (int r = 0; r < cur_height; ++r) { + for (int c = 0; c < cur_width; ++c) { + output[j + 1]->buf[r * cur_width + c] = + fabs(input[k][r * cur_width + c] - + output[j + 1]->buf[r * cur_width + c]); + } + } + + j += 2; + } + return 1; +} + +// For color channels, the differences is calculated based on "color +// double-opponency". For example, the RG feature map is constructed between a +// fine scale c of R-G component and a coarser scale s of G-R component. +static int center_surround_diff_rgb(const double *input_1[9], + const double *input_2[9], int height[9], + int width[9], + saliency_feature_map *output[6]) { + int j = 0; + for (int k = 2; k < 5; ++k) { + int cur_height = height[k]; + int cur_width = width[k]; + + if (upscale_map(input_2[k + 3], k + 3, k, height, width, output[j]->buf) == + 0) { + return 0; + } + + for (int r = 0; r < cur_height; ++r) { + for (int c = 0; c < cur_width; ++c) { + output[j]->buf[r * cur_width + c] = + fabs((double)(input_1[k][r * cur_width + c] - + output[j]->buf[r * cur_width + c])); + } + } + + if (upscale_map(input_2[k + 4], k + 4, k, height, width, + output[j + 1]->buf) == 0) { + return 0; + } + + for (int r = 0; r < cur_height; ++r) { + for (int c = 0; c < cur_width; ++c) { + output[j + 1]->buf[r * cur_width + c] = + fabs(input_1[k][r * cur_width + c] - + output[j + 1]->buf[r * cur_width + c]); + } + } + + j += 2; + } + return 1; +} + +// This function is to generate Gaussian pyramid images with indexes from 0 to +// 8, and construct the feature maps from calculating the center-surround +// differences. 
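// Worked illustration (derived from the /2 decimation above; the 1920x1080
// frame size is only a hypothetical example): a 1920x1080 luma plane yields
// pyramid levels of
//   level 0..8: 1920x1080, 960x540, 480x270, 240x135, 120x67, 60x33, 30x16,
//               15x8, 7x4 (integer division by 2 at each level),
// and the six feature maps are the center-surround pairs
//   (c, s) = (2, 5), (2, 6), (3, 6), (3, 7), (4, 7), (4, 8),
// each produced at the resolution of its fine scale c after the coarse scale
// s has been upscaled back to scale c.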
+static int gaussian_pyramid(const double *src, int width[9], int height[9], + saliency_feature_map *dst[6]) { + double *gaussian_map[9]; // scale = 9 + gaussian_map[0] = + (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0])); + if (!gaussian_map[0]) { + return 0; + } + + memcpy(gaussian_map[0], src, width[0] * height[0] * sizeof(double)); + + for (int i = 1; i < 9; ++i) { + int stride = width[i - 1]; + int new_width = width[i]; + int new_height = height[i]; + + gaussian_map[i] = + (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i])); + + if (!gaussian_map[i]) { + for (int l = 0; l < i; ++l) { + aom_free(gaussian_map[l]); + } + return 0; + } + + memset(gaussian_map[i], 0, new_width * new_height * sizeof(double)); + + decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride, + gaussian_map[i]); + } + + if (center_surround_diff((const double **)gaussian_map, height, width, dst) == + 0) { + for (int l = 0; l < 9; ++l) { + aom_free(gaussian_map[l]); + } + return 0; + } + + for (int i = 0; i < 9; ++i) { + aom_free(gaussian_map[i]); + } + return 1; +} + +static int gaussian_pyramid_rgb(double *src_1, double *src_2, int width[9], + int height[9], saliency_feature_map *dst[6]) { + double *gaussian_map[2][9]; // scale = 9 + double *src[2]; + + src[0] = src_1; + src[1] = src_2; + + for (int k = 0; k < 2; ++k) { + gaussian_map[k][0] = (double *)aom_malloc(width[0] * height[0] * + sizeof(*gaussian_map[k][0])); + if (!gaussian_map[k][0]) { + for (int l = 0; l < k; ++l) { + aom_free(gaussian_map[l][0]); + } + return 0; + } + memcpy(gaussian_map[k][0], src[k], width[0] * height[0] * sizeof(double)); + + for (int i = 1; i < 9; ++i) { + int stride = width[i - 1]; + int new_width = width[i]; + int new_height = height[i]; + + gaussian_map[k][i] = (double *)aom_malloc(new_width * new_height * + sizeof(*gaussian_map[k][i])); + if (!gaussian_map[k][i]) { + for (int l = 0; l < k; ++l) { + aom_free(gaussian_map[l][i]); + } + return 0; + } + memset(gaussian_map[k][i], 0, new_width * new_height * sizeof(double)); + decimate_map(gaussian_map[k][i - 1], height[i - 1], width[i - 1], stride, + gaussian_map[k][i]); + } + } + + if (center_surround_diff_rgb((const double **)gaussian_map[0], + (const double **)gaussian_map[1], height, width, + dst) == 0) { + for (int l = 0; l < 2; ++l) { + for (int i = 0; i < 9; ++i) { + aom_free(gaussian_map[l][i]); + } + } + return 0; + } + + for (int l = 0; l < 2; ++l) { + for (int i = 0; i < 9; ++i) { + aom_free(gaussian_map[l][i]); + } + } + return 1; +} + +static int get_feature_map_intensity(double *intensity, int width[9], + int height[9], + saliency_feature_map *i_map[6]) { + if (gaussian_pyramid(intensity, width, height, i_map) == 0) { + return 0; + } + return 1; +} + +static int get_feature_map_rgb(double *cr, double *cg, double *cb, int width[9], + int height[9], saliency_feature_map *rg_map[6], + saliency_feature_map *by_map[6]) { + double *rg_mat = aom_malloc(height[0] * width[0] * sizeof(*rg_mat)); + double *by_mat = aom_malloc(height[0] * width[0] * sizeof(*by_mat)); + double *gr_mat = aom_malloc(height[0] * width[0] * sizeof(*gr_mat)); + double *yb_mat = aom_malloc(height[0] * width[0] * sizeof(*yb_mat)); + + if (!rg_mat || !by_mat || !gr_mat || !yb_mat) { + aom_free(rg_mat); + aom_free(by_mat); + aom_free(gr_mat); + aom_free(yb_mat); + return 0; + } + + double r, g, b, y; + for (int i = 0; i < height[0]; ++i) { + for (int j = 0; j < width[0]; ++j) { + r = AOMMAX(0, cr[i * width[0] + j] - + (cg[i * width[0] + j] + cb[i * 
width[0] + j]) / 2); + g = AOMMAX(0, cg[i * width[0] + j] - + (cr[i * width[0] + j] + cb[i * width[0] + j]) / 2); + b = AOMMAX(0, cb[i * width[0] + j] - + (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2); + y = AOMMAX(0, (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2 - + fabs(cr[i * width[0] + j] - cg[i * width[0] + j]) / 2 - + cb[i * width[0] + j]); + + rg_mat[i * width[0] + j] = r - g; + by_mat[i * width[0] + j] = b - y; + gr_mat[i * width[0] + j] = g - r; + yb_mat[i * width[0] + j] = y - b; + } + } + + if (gaussian_pyramid_rgb(rg_mat, gr_mat, width, height, rg_map) == 0 || + gaussian_pyramid_rgb(by_mat, yb_mat, width, height, by_map) == 0) { + aom_free(rg_mat); + aom_free(by_mat); + aom_free(gr_mat); + aom_free(yb_mat); + return 0; + } + + aom_free(rg_mat); + aom_free(by_mat); + aom_free(gr_mat); + aom_free(yb_mat); + return 1; +} + +static INLINE void filter2d(const double *input, const double kernel[9][9], + int width, int height, double *output) { + const int window_size = 9; + double map_section[81]; + for (int y = 0; y <= height - 1; ++y) { + for (int x = 0; x <= width - 1; ++x) { + int i = 0; + for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) { + for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) { + int yvalue = clamp(yy, 0, height - 1); + int xvalue = clamp(xx, 0, width - 1); + map_section[i++] = input[yvalue * width + xvalue]; + } + } + + output[y * width + x] = 0; + for (int k = 0; k < window_size; ++k) { + for (int l = 0; l < window_size; ++l) { + output[y * width + x] += + kernel[k][l] * map_section[k * window_size + l]; + } + } + } + } +} + +static int get_feature_map_orientation(const double *intensity, int width[9], + int height[9], + saliency_feature_map *dst[24]) { + double *gaussian_map[9]; + + gaussian_map[0] = + (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0])); + if (!gaussian_map[0]) { + return 0; + } + memcpy(gaussian_map[0], intensity, width[0] * height[0] * sizeof(double)); + + for (int i = 1; i < 9; ++i) { + int stride = width[i - 1]; + int new_width = width[i]; + int new_height = height[i]; + + gaussian_map[i] = + (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i])); + if (!gaussian_map[i]) { + for (int l = 0; l < i; ++l) { + aom_free(gaussian_map[l]); + } + return 0; + } + memset(gaussian_map[i], 0, new_width * new_height * sizeof(double)); + decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride, + gaussian_map[i]); + } + + double *tempGaborOutput[4][9]; //[angle: 0, 45, 90, 135 degree][filter_size] + + for (int i = 2; i < 9; ++i) { + const int cur_height = height[i]; + const int cur_width = width[i]; + for (int j = 0; j < 4; ++j) { + tempGaborOutput[j][i] = (double *)aom_malloc( + cur_height * cur_width * sizeof(*tempGaborOutput[j][i])); + if (!tempGaborOutput[j][i]) { + for (int l = 0; l < 9; ++l) { + aom_free(gaussian_map[l]); + } + for (int h = 0; h < 4; ++h) { + for (int g = 2; g < 9; ++g) { + aom_free(tempGaborOutput[h][g]); + } + } + return 0; + } + filter2d(gaussian_map[i], kGaborFilter[j], cur_width, cur_height, + tempGaborOutput[j][i]); + } + } + + for (int i = 0; i < 9; ++i) { + aom_free(gaussian_map[i]); + } + + saliency_feature_map + *tmp[4][6]; //[angle: 0, 45, 90, 135 degree][filter_size] + + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 4; ++j) { + tmp[j][i] = dst[j * 6 + i]; + } + } + + for (int j = 0; j < 4; ++j) { + if (center_surround_diff((const double **)tempGaborOutput[j], height, width, + tmp[j]) == 0) { + for (int h = 0; h < 4; ++h) { 
+ for (int g = 2; g < 9; ++g) { + aom_free(tempGaborOutput[h][g]); + } + } + return 0; + } + } + + for (int i = 2; i < 9; ++i) { + for (int j = 0; j < 4; ++j) { + aom_free(tempGaborOutput[j][i]); + } + } + + return 1; +} + +static INLINE void find_min_max(const saliency_feature_map *input, + double *max_value, double *min_value) { + assert(input && input->buf); + *min_value = DBL_MAX; + *max_value = 0.0; + + for (int i = 0; i < input->height; ++i) { + for (int j = 0; j < input->width; ++j) { + assert(input->buf[i * input->width + j] >= 0.0); + *min_value = fmin(input->buf[i * input->width + j], *min_value); + *max_value = fmax(input->buf[i * input->width + j], *max_value); + } + } +} + +static INLINE double average_local_max(const saliency_feature_map *input, + int stepsize) { + int numlocal = 0; + double lmaxmean = 0, lmax = 0, dummy = 0; + saliency_feature_map local_map; + local_map.height = stepsize; + local_map.width = stepsize; + local_map.buf = + (double *)aom_malloc(stepsize * stepsize * sizeof(*local_map.buf)); + + if (!local_map.buf) { + return -1; + } + + for (int y = 0; y < input->height - stepsize; y += stepsize) { + for (int x = 0; x < input->width - stepsize; x += stepsize) { + for (int i = 0; i < stepsize; ++i) { + for (int j = 0; j < stepsize; ++j) { + local_map.buf[i * stepsize + j] = + input->buf[(y + i) * input->width + x + j]; + } + } + + find_min_max(&local_map, &lmax, &dummy); + lmaxmean += lmax; + numlocal++; + } + } + + aom_free(local_map.buf); + + return lmaxmean / numlocal; +} + +// Linear normalization the values in the map to [0,1]. +static void minmax_normalize(saliency_feature_map *input) { + double max_value, min_value; + find_min_max(input, &max_value, &min_value); + + for (int i = 0; i < input->height; ++i) { + for (int j = 0; j < input->width; ++j) { + if (max_value != min_value) { + input->buf[i * input->width + j] = + input->buf[i * input->width + j] / (max_value - min_value) + + min_value / (min_value - max_value); + } else { + input->buf[i * input->width + j] -= min_value; + } + } + } +} + +// This function is to promote meaningful “activation spots” in the map and +// ignores homogeneous areas. +static int nomalization_operator(saliency_feature_map *input, int stepsize) { + minmax_normalize(input); + double lmaxmean = average_local_max(input, stepsize); + if (lmaxmean < 0) { + return 0; + } + double normCoeff = (1 - lmaxmean) * (1 - lmaxmean); + + for (int i = 0; i < input->height; ++i) { + for (int j = 0; j < input->width; ++j) { + input->buf[i * input->width + j] *= normCoeff; + } + } + + return 1; +} + +// Normalize the values in feature maps to [0,1], and then upscale all maps to +// the original frame size. +static int normalize_fm(saliency_feature_map *input[6], int width[9], + int height[9], int num_fm, + saliency_feature_map *output[6]) { + // Feature maps (FM) are generated by function "center_surround_diff()". The + // difference is between a fine scale c and a coarser scale s, where c \in {2, + // 3, 4}, and s = c + delta, where delta \in {3, 4}, and the FM size is scale + // c. Specifically, i=0: c=2 and s=5, i=1: c=2 and s=6, i=2: c=3 and s=6, i=3: + // c=3 and s=7, i=4: c=4 and s=7, i=5: c=4 and s=8. 
+ for (int i = 0; i < num_fm; ++i) { + if (nomalization_operator(input[i], 8) == 0) { + return 0; + } + + // Upscale FM to original frame size + if (upscale_map(input[i]->buf, (i / 2) + 2, 0, height, width, + output[i]->buf) == 0) { + return 0; + } + } + return 1; +} + +// Combine feature maps with the same category (intensity, color, or +// orientation) into one conspicuity map. +static int normalized_map(saliency_feature_map *input[6], int width[9], + int height[9], saliency_feature_map *output) { + int num_fm = 6; + + saliency_feature_map *n_input[6]; + for (int i = 0; i < 6; ++i) { + n_input[i] = (saliency_feature_map *)aom_malloc(sizeof(*n_input[i])); + if (!n_input[i]) { + return 0; + } + n_input[i]->buf = + (double *)aom_malloc(width[0] * height[0] * sizeof(*n_input[i]->buf)); + if (!n_input[i]->buf) { + aom_free(n_input[i]); + return 0; + } + n_input[i]->height = height[0]; + n_input[i]->width = width[0]; + } + + if (normalize_fm(input, width, height, num_fm, n_input) == 0) { + for (int i = 0; i < num_fm; ++i) { + aom_free(n_input[i]->buf); + aom_free(n_input[i]); + } + return 0; + } + + // Add up all normalized feature maps with the same category into one map. + for (int i = 0; i < num_fm; ++i) { + for (int r = 0; r < height[0]; ++r) { + for (int c = 0; c < width[0]; ++c) { + output->buf[r * width[0] + c] += n_input[i]->buf[r * width[0] + c]; + } + } + } + + for (int i = 0; i < num_fm; ++i) { + aom_free(n_input[i]->buf); + aom_free(n_input[i]); + } + + nomalization_operator(output, 8); + return 1; +} + +static int normalized_map_rgb(saliency_feature_map *rg_map[6], + saliency_feature_map *by_map[6], int width[9], + int height[9], saliency_feature_map *output) { + saliency_feature_map *color_cm[2]; // 0: color_cm_rg, 1: color_cm_by + for (int i = 0; i < 2; ++i) { + color_cm[i] = aom_malloc(sizeof(*color_cm[i])); + if (!color_cm[i]) { + return 0; + } + color_cm[i]->buf = + (double *)aom_malloc(width[0] * height[0] * sizeof(*color_cm[i]->buf)); + if (!color_cm[i]->buf) { + for (int l = 0; l < i; ++l) { + aom_free(color_cm[l]->buf); + } + aom_free(color_cm[i]); + return 0; + } + + color_cm[i]->width = width[0]; + color_cm[i]->height = height[0]; + memset(color_cm[i]->buf, 0, + width[0] * height[0] * sizeof(*color_cm[i]->buf)); + } + + if (normalized_map(rg_map, width, height, color_cm[0]) == 0 || + normalized_map(by_map, width, height, color_cm[1]) == 0) { + for (int i = 0; i < 2; ++i) { + aom_free(color_cm[i]->buf); + aom_free(color_cm[i]); + } + return 0; + } + + for (int r = 0; r < height[0]; ++r) { + for (int c = 0; c < width[0]; ++c) { + output->buf[r * width[0] + c] = color_cm[0]->buf[r * width[0] + c] + + color_cm[1]->buf[r * width[0] + c]; + } + } + + for (int i = 0; i < 2; ++i) { + aom_free(color_cm[i]->buf); + aom_free(color_cm[i]); + } + + nomalization_operator(output, 8); + return 1; +} + +static int normalized_map_orientation(saliency_feature_map *orientation_map[24], + int width[9], int height[9], + saliency_feature_map *output) { + int num_fms_per_angle = 6; + + saliency_feature_map *ofm[4][6]; + for (int i = 0; i < num_fms_per_angle; ++i) { + for (int j = 0; j < 4; ++j) { + ofm[j][i] = orientation_map[j * num_fms_per_angle + i]; + } + } + + // extract conspicuity map for each angle + saliency_feature_map *nofm = aom_malloc(sizeof(*nofm)); + if (!nofm) { + return 0; + } + nofm->buf = (double *)aom_malloc(width[0] * height[0] * sizeof(*nofm->buf)); + if (!nofm->buf) { + aom_free(nofm); + return 0; + } + nofm->height = height[0]; + nofm->width = width[0]; + + for (int i = 
0; i < 4; ++i) { + memset(nofm->buf, 0, width[0] * height[0] * sizeof(*nofm->buf)); + if (normalized_map(ofm[i], width, height, nofm) == 0) { + aom_free(nofm->buf); + aom_free(nofm); + return 0; + } + + for (int r = 0; r < height[0]; ++r) { + for (int c = 0; c < width[0]; ++c) { + output->buf[r * width[0] + c] += nofm->buf[r * width[0] + c]; + } + } + } + + aom_free(nofm->buf); + aom_free(nofm); + + nomalization_operator(output, 8); + return 1; +} + +// Set pixel level saliency mask based on Itti-Koch algorithm +int av1_set_saliency_map(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + + int frm_width = cm->width; + int frm_height = cm->height; + + int pyr_height[9]; + int pyr_width[9]; + + pyr_height[0] = frm_height; + pyr_width[0] = frm_width; + + for (int i = 1; i < 9; ++i) { + pyr_width[i] = pyr_width[i - 1] / 2; + pyr_height[i] = pyr_height[i - 1] / 2; + } + + double *cr = aom_malloc(frm_width * frm_height * sizeof(*cr)); + double *cg = aom_malloc(frm_width * frm_height * sizeof(*cg)); + double *cb = aom_malloc(frm_width * frm_height * sizeof(*cb)); + double *intensity = aom_malloc(frm_width * frm_height * sizeof(*intensity)); + + if (!cr || !cg || !cb || !intensity) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + return 0; + } + + // Extract red / green / blue channels and intensity component + get_color_intensity(cpi->source, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, cr, cg, cb, intensity); + + // Feature Map Extraction + // intensity map + saliency_feature_map *i_map[6]; + for (int i = 0; i < 6; ++i) { + int cur_height = pyr_height[(i / 2) + 2]; + int cur_width = pyr_width[(i / 2) + 2]; + + i_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*i_map[i])); + if (!i_map[i]) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < i; ++l) { + aom_free(i_map[l]); + } + return 0; + } + i_map[i]->buf = + (double *)aom_malloc(cur_height * cur_width * sizeof(*i_map[i]->buf)); + if (!i_map[i]->buf) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < i; ++l) { + aom_free(i_map[l]->buf); + aom_free(i_map[l]); + } + return 0; + } + i_map[i]->height = cur_height; + i_map[i]->width = cur_width; + } + + if (get_feature_map_intensity(intensity, pyr_width, pyr_height, i_map) == 0) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(i_map[l]); + } + return 0; + } + + // RGB map + saliency_feature_map *rg_map[6], *by_map[6]; + for (int i = 0; i < 6; ++i) { + int cur_height = pyr_height[(i / 2) + 2]; + int cur_width = pyr_width[(i / 2) + 2]; + rg_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*rg_map[i])); + by_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*by_map[i])); + if (!rg_map[i] || !by_map[i]) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(i_map[l]); + aom_free(rg_map[l]); + aom_free(by_map[l]); + } + return 0; + } + rg_map[i]->buf = + (double *)aom_malloc(cur_height * cur_width * sizeof(*rg_map[i]->buf)); + by_map[i]->buf = + (double *)aom_malloc(cur_height * cur_width * sizeof(*by_map[i]->buf)); + if (!by_map[i]->buf || !rg_map[i]->buf) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(i_map[l]); + } + for (int l = 0; l < i; ++l) { + aom_free(rg_map[l]->buf); + 
aom_free(by_map[l]->buf); + aom_free(rg_map[l]); + aom_free(by_map[l]); + } + return 0; + } + rg_map[i]->height = cur_height; + rg_map[i]->width = cur_width; + by_map[i]->height = cur_height; + by_map[i]->width = cur_width; + } + + if (get_feature_map_rgb(cr, cg, cb, pyr_width, pyr_height, rg_map, by_map) == + 0) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(rg_map[l]->buf); + aom_free(by_map[l]->buf); + aom_free(i_map[l]); + aom_free(rg_map[l]); + aom_free(by_map[l]); + } + return 0; + } + + // Orientation map + saliency_feature_map *orientation_map[24]; + for (int i = 0; i < 24; ++i) { + int cur_height = pyr_height[((i % 6) / 2) + 2]; + int cur_width = pyr_width[((i % 6) / 2) + 2]; + + orientation_map[i] = + (saliency_feature_map *)aom_malloc(sizeof(*orientation_map[i])); + if (!orientation_map[i]) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(rg_map[l]->buf); + aom_free(by_map[l]->buf); + aom_free(i_map[l]); + aom_free(rg_map[l]); + aom_free(by_map[l]); + } + for (int h = 0; h < i; ++h) { + aom_free(orientation_map[h]); + } + return 0; + } + + orientation_map[i]->buf = (double *)aom_malloc( + cur_height * cur_width * sizeof(*orientation_map[i]->buf)); + if (!orientation_map[i]->buf) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(rg_map[l]->buf); + aom_free(by_map[l]->buf); + aom_free(i_map[l]); + aom_free(rg_map[l]); + aom_free(by_map[l]); + } + + for (int h = 0; h < i; ++h) { + aom_free(orientation_map[h]->buf); + aom_free(orientation_map[h]->buf); + aom_free(orientation_map[h]); + aom_free(orientation_map[h]); + } + return 0; + } + + orientation_map[i]->height = cur_height; + orientation_map[i]->width = cur_width; + } + + if (get_feature_map_orientation(intensity, pyr_width, pyr_height, + orientation_map) == 0) { + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(rg_map[l]->buf); + aom_free(by_map[l]->buf); + aom_free(i_map[l]); + aom_free(rg_map[l]); + aom_free(by_map[l]); + } + for (int h = 0; h < 24; ++h) { + aom_free(orientation_map[h]->buf); + aom_free(orientation_map[h]); + } + return 0; + } + + aom_free(cr); + aom_free(cg); + aom_free(cb); + aom_free(intensity); + + saliency_feature_map + *normalized_maps[3]; // 0: intensity, 1: color, 2: orientation + + for (int i = 0; i < 3; ++i) { + normalized_maps[i] = aom_malloc(sizeof(*normalized_maps[i])); + if (!normalized_maps[i]) { + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(rg_map[l]->buf); + aom_free(by_map[l]->buf); + aom_free(i_map[l]); + aom_free(rg_map[l]); + aom_free(by_map[l]); + } + + for (int h = 0; h < 24; ++h) { + aom_free(orientation_map[h]->buf); + aom_free(orientation_map[h]); + } + + for (int l = 0; l < i; ++l) { + aom_free(normalized_maps[l]); + } + return 0; + } + normalized_maps[i]->buf = (double *)aom_malloc( + frm_width * frm_height * sizeof(*normalized_maps[i]->buf)); + if (!normalized_maps[i]->buf) { + for (int l = 0; l < 6; ++l) { + aom_free(i_map[l]->buf); + aom_free(rg_map[l]->buf); + aom_free(by_map[l]->buf); + aom_free(i_map[l]); + aom_free(rg_map[l]); + aom_free(by_map[l]); + } + for (int h = 0; h < 24; ++h) { + aom_free(orientation_map[h]->buf); + aom_free(orientation_map[h]); + } + for (int l = 0; l < i; 
++l) { + aom_free(normalized_maps[l]->buf); + aom_free(normalized_maps[l]); + } + return 0; + } + normalized_maps[i]->width = frm_width; + normalized_maps[i]->height = frm_height; + memset(normalized_maps[i]->buf, 0, + frm_width * frm_height * sizeof(*normalized_maps[i]->buf)); + } + + // Conspicuity map generation + if (normalized_map(i_map, pyr_width, pyr_height, normalized_maps[0]) == 0 || + normalized_map_rgb(rg_map, by_map, pyr_width, pyr_height, + normalized_maps[1]) == 0 || + normalized_map_orientation(orientation_map, pyr_width, pyr_height, + normalized_maps[2]) == 0) { + for (int i = 0; i < 6; ++i) { + aom_free(i_map[i]->buf); + aom_free(rg_map[i]->buf); + aom_free(by_map[i]->buf); + aom_free(i_map[i]); + aom_free(rg_map[i]); + aom_free(by_map[i]); + } + + for (int i = 0; i < 24; ++i) { + aom_free(orientation_map[i]->buf); + aom_free(orientation_map[i]); + } + + for (int i = 0; i < 3; ++i) { + aom_free(normalized_maps[i]->buf); + aom_free(normalized_maps[i]); + } + return 0; + } + + for (int i = 0; i < 6; ++i) { + aom_free(i_map[i]->buf); + aom_free(rg_map[i]->buf); + aom_free(by_map[i]->buf); + aom_free(i_map[i]); + aom_free(rg_map[i]); + aom_free(by_map[i]); + } + + for (int i = 0; i < 24; ++i) { + aom_free(orientation_map[i]->buf); + aom_free(orientation_map[i]); + } + + // Pixel level saliency map + saliency_feature_map *combined_saliency_map = + aom_malloc(sizeof(*combined_saliency_map)); + if (!combined_saliency_map) { + for (int i = 0; i < 3; ++i) { + aom_free(normalized_maps[i]->buf); + aom_free(normalized_maps[i]); + } + return 0; + } + + combined_saliency_map->buf = (double *)aom_malloc( + frm_width * frm_height * sizeof(*combined_saliency_map->buf)); + if (!combined_saliency_map->buf) { + for (int i = 0; i < 3; ++i) { + aom_free(normalized_maps[i]->buf); + aom_free(normalized_maps[i]); + } + + aom_free(combined_saliency_map); + return 0; + } + combined_saliency_map->height = frm_height; + combined_saliency_map->width = frm_width; + + double w_intensity, w_color, w_orient; + + w_intensity = w_color = w_orient = (double)1 / 3; + + for (int r = 0; r < frm_height; ++r) { + for (int c = 0; c < frm_width; ++c) { + combined_saliency_map->buf[r * frm_width + c] = + (w_intensity * normalized_maps[0]->buf[r * frm_width + c] + + w_color * normalized_maps[1]->buf[r * frm_width + c] + + w_orient * normalized_maps[2]->buf[r * frm_width + c]); + } + } + + for (int r = 0; r < frm_height; ++r) { + for (int c = 0; c < frm_width; ++c) { + int index = r * frm_width + c; + cpi->saliency_map[index] = + (uint8_t)(combined_saliency_map->buf[index] * 255); + } + } + + for (int i = 0; i < 3; ++i) { + aom_free(normalized_maps[i]->buf); + aom_free(normalized_maps[i]); + } + + aom_free(combined_saliency_map->buf); + aom_free(combined_saliency_map); + + return 1; +} + +// Set superblock level saliency mask for rdmult scaling +int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio) { + AV1_COMMON *cm = &cpi->common; + + saliency_feature_map *sb_saliency_map = + aom_malloc(sizeof(saliency_feature_map)); + + if (sb_saliency_map == NULL) { + return 0; + } + + const BLOCK_SIZE bsize = cm->seq_params->sb_size; + const int num_mi_w = mi_size_wide[bsize]; + const int num_mi_h = mi_size_high[bsize]; + const int block_width = block_size_wide[bsize]; + const int block_height = block_size_high[bsize]; + const int num_sb_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_sb_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + + sb_saliency_map->height = 
num_sb_rows; + sb_saliency_map->width = num_sb_cols; + sb_saliency_map->buf = (double *)aom_malloc(num_sb_rows * num_sb_cols * + sizeof(*sb_saliency_map->buf)); + + if (sb_saliency_map->buf == NULL) { + aom_free(sb_saliency_map); + return 0; + } + + for (int row = 0; row < num_sb_rows; ++row) { + for (int col = 0; col < num_sb_cols; ++col) { + const int index = row * num_sb_cols + col; + double total_pixel = 0; + double total_weight = 0; + + for (int i = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j++) { + if ((row * block_height + i) >= cpi->common.height || + (col * block_width + j) >= cpi->common.width) + continue; + total_pixel++; + total_weight += + cpi->saliency_map[(row * block_height + i) * cpi->common.width + + col * block_width + j]; + } + } + + assert(total_pixel > 0); + + // Calculate the superblock level saliency map from pixel level saliency + // map + sb_saliency_map->buf[index] = total_weight / total_pixel; + + // Further lower the superblock saliency score for boundary superblocks. + if (row < 1 || row > num_sb_rows - 2 || col < 1 || + col > num_sb_cols - 2) { + sb_saliency_map->buf[index] /= 5; + } + } + } + + // superblock level saliency map finalization + minmax_normalize(sb_saliency_map); + + double log_sum = 0.0; + double sum = 0.0; + int block_count = 0; + + // Calculate the average superblock sm_scaling_factor for a frame, to be used + // for clamping later. + for (int row = 0; row < num_sb_rows; ++row) { + for (int col = 0; col < num_sb_cols; ++col) { + const int index = row * num_sb_cols + col; + const double saliency = sb_saliency_map->buf[index]; + + cpi->sm_scaling_factor[index] = 1 - saliency; + sum += cpi->sm_scaling_factor[index]; + block_count++; + } + } + assert(block_count > 0); + sum /= block_count; + + // Calculate the geometric mean of superblock sm_scaling_factor for a frame, + // to be used for normalization. + for (int row = 0; row < num_sb_rows; ++row) { + for (int col = 0; col < num_sb_cols; ++col) { + const int index = row * num_sb_cols + col; + log_sum += log(fmax(cpi->sm_scaling_factor[index], 0.001)); + cpi->sm_scaling_factor[index] = + fmax(cpi->sm_scaling_factor[index], 0.8 * sum); + } + } + + log_sum = exp(log_sum / block_count); + + // Normalize the sm_scaling_factor by geometric mean. + for (int row = 0; row < num_sb_rows; ++row) { + for (int col = 0; col < num_sb_cols; ++col) { + const int index = row * num_sb_cols + col; + assert(log_sum > 0); + cpi->sm_scaling_factor[index] /= log_sum; + + // Modulate the sm_scaling_factor by frame basis motion factor + cpi->sm_scaling_factor[index] = + cpi->sm_scaling_factor[index] * motion_ratio; + } + } + + aom_free(sb_saliency_map->buf); + aom_free(sb_saliency_map); + return 1; +} + +// av1_setup_motion_ratio() is only enabled when CONFIG_REALTIME_ONLY is 0, +// because the computations need to access the first pass stats which are +// only available when CONFIG_REALTIME_ONLY is equal to 0. +#if !CONFIG_REALTIME_ONLY +// Set motion_ratio that reflects the motion quantities between two consecutive +// frames. Motion_ratio will be used to set up saliency_map based rdmult scaling +// factor, i.e., the less the motion quantities are, the more bits will be spent +// on this frame, and vice versa. 
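// Worked illustration (the numbers are hypothetical): with a clip-average
// inter (coded) error of 1000 and a current-frame coded error of 850,
// motion_ratio = 850 / 1000 = 0.85, which already lies inside the [0.8, 1.5]
// clamp applied below; av1_setup_sm_rdmult_scaling_factor() then multiplies
// every superblock's sm_scaling_factor by 0.85, so the rdmult scaling shrinks
// and relatively more bits are spent on this low-motion frame. The ratio is
// only allowed to deviate from 1 when the spread of first-pass intra errors
// across the clip is significant (stdev / average > 0.1).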
+double av1_setup_motion_ratio(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + int frames_since_key = + cm->current_frame.display_order_hint - cpi->rc.frames_since_key; + const FIRSTPASS_STATS *cur_stats = av1_firstpass_info_peek( + &cpi->ppi->twopass.firstpass_info, frames_since_key); + assert(cur_stats != NULL); + assert(cpi->ppi->twopass.firstpass_info.total_stats.count > 0); + + const double avg_intra_error = + exp(cpi->ppi->twopass.firstpass_info.total_stats.log_intra_error / + cpi->ppi->twopass.firstpass_info.total_stats.count); + const double avg_inter_error = + exp(cpi->ppi->twopass.firstpass_info.total_stats.log_coded_error / + cpi->ppi->twopass.firstpass_info.total_stats.count); + + double inter_error = cur_stats->coded_error; + double error_stdev = 0; + const double avg_error = + cpi->ppi->twopass.firstpass_info.total_stats.intra_error / + cpi->ppi->twopass.firstpass_info.total_stats.count; + for (int i = 0; i < cpi->ppi->twopass.firstpass_info.total_stats.count; i++) { + const FIRSTPASS_STATS *stats = + &cpi->ppi->twopass.firstpass_info.stats_buf[i]; + error_stdev += + (stats->intra_error - avg_error) * (stats->intra_error - avg_error); + } + error_stdev = + sqrt(error_stdev / cpi->ppi->twopass.firstpass_info.total_stats.count); + + double motion_ratio = 1; + if (error_stdev / fmax(avg_intra_error, 1) > 0.1) { + motion_ratio = inter_error / fmax(1, avg_inter_error); + motion_ratio = AOMMIN(motion_ratio, 1.5); + motion_ratio = AOMMAX(motion_ratio, 0.8); + } + + return motion_ratio; +} +#endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/aom/av1/encoder/saliency_map.h b/third_party/aom/av1/encoder/saliency_map.h new file mode 100644 index 0000000000..0d27f83633 --- /dev/null +++ b/third_party/aom/av1/encoder/saliency_map.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SALIENCY_MAP_H_ +#define AOM_AV1_ENCODER_SALIENCY_MAP_H_ +#include "av1/encoder/encoder.h" + +typedef struct saliency_feature_map { + double *buf; // stores values of the map in 1D array + int height; + int width; +} saliency_feature_map; + +int av1_set_saliency_map(AV1_COMP *cpi); +#if !CONFIG_REALTIME_ONLY +double av1_setup_motion_ratio(AV1_COMP *cpi); +#endif +int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio); + +#endif // AOM_AV1_ENCODER_SALIENCY_MAP_H_ diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c new file mode 100644 index 0000000000..4b4e78779c --- /dev/null +++ b/third_party/aom/av1/encoder/segmentation.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_mem/aom_mem.h" + +#include "av1/common/pred_common.h" +#include "av1/common/tile_common.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/segmentation.h" + +void av1_enable_segmentation(struct segmentation *seg) { + seg->enabled = 1; + seg->update_map = 1; + seg->update_data = 1; + seg->temporal_update = 0; +} + +void av1_disable_segmentation(struct segmentation *seg) { + seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; + seg->temporal_update = 0; +} + +void av1_disable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_mask[segment_id] &= ~(1u << feature_id); +} + +void av1_clear_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_data[segment_id][feature_id] = 0; +} + +void av1_reset_segment_features(AV1_COMMON *cm) { + struct segmentation *seg = &cm->seg; + + // Set up default state for MB feature flags + seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; + av1_clearall_segfeatures(seg); +} diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h new file mode 100644 index 0000000000..1ad13d66a9 --- /dev/null +++ b/third_party/aom/av1/encoder/segmentation.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_ +#define AOM_AV1_ENCODER_SEGMENTATION_H_ + +#include "av1/common/blockd.h" +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_enable_segmentation(struct segmentation *seg); +void av1_disable_segmentation(struct segmentation *seg); + +void av1_disable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); +void av1_clear_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); + +void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd); + +void av1_reset_segment_features(AV1_COMMON *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SEGMENTATION_H_ diff --git a/third_party/aom/av1/encoder/sorting_network.h b/third_party/aom/av1/encoder/sorting_network.h new file mode 100644 index 0000000000..54f4c19dcd --- /dev/null +++ b/third_party/aom/av1/encoder/sorting_network.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*! 
\file + * This file contains several utility functions used to sort small arrays with + * sorting networks. + * + * Sorting network is a (potentially branch-less) way to quickly sort small + * arrays with known size. For more details, consult + * (https://en.wikipedia.org/wiki/Sorting_network). + */ +#ifndef AOM_AV1_ENCODER_SORTING_NETWORK_H_ +#define AOM_AV1_ENCODER_SORTING_NETWORK_H_ + +#include "aom/aom_integer.h" + +#define SWAP(i, j) \ + do { \ + const float maxf = (k[i] >= k[j]) ? k[i] : k[j]; \ + const float minf = (k[i] >= k[j]) ? k[j] : k[i]; \ + const int maxi = (k[i] >= k[j]) ? v[i] : v[j]; \ + const int mini = (k[i] >= k[j]) ? v[j] : v[i]; \ + k[i] = maxf; \ + k[j] = minf; \ + v[i] = maxi; \ + v[j] = mini; \ + } while (0) + +/*!\brief Sorts two size-16 arrays of keys and values in descending order of + * keys. + * + * \param[in,out] k An length-16 array of float serves as the keys. + * \param[in,out] v An length-16 array of int32 serves as the + * value. + */ +static AOM_INLINE void av1_sort_fi32_16(float k[], int32_t v[]) { + SWAP(0, 1); + SWAP(2, 3); + SWAP(4, 5); + SWAP(6, 7); + SWAP(8, 9); + SWAP(10, 11); + SWAP(12, 13); + SWAP(14, 15); + SWAP(0, 2); + SWAP(1, 3); + SWAP(4, 6); + SWAP(5, 7); + SWAP(8, 10); + SWAP(9, 11); + SWAP(12, 14); + SWAP(13, 15); + SWAP(1, 2); + SWAP(5, 6); + SWAP(0, 4); + SWAP(3, 7); + SWAP(9, 10); + SWAP(13, 14); + SWAP(8, 12); + SWAP(11, 15); + SWAP(1, 5); + SWAP(2, 6); + SWAP(9, 13); + SWAP(10, 14); + SWAP(0, 8); + SWAP(7, 15); + SWAP(1, 4); + SWAP(3, 6); + SWAP(9, 12); + SWAP(11, 14); + SWAP(2, 4); + SWAP(3, 5); + SWAP(10, 12); + SWAP(11, 13); + SWAP(1, 9); + SWAP(6, 14); + SWAP(3, 4); + SWAP(11, 12); + SWAP(1, 8); + SWAP(2, 10); + SWAP(5, 13); + SWAP(7, 14); + SWAP(3, 11); + SWAP(2, 8); + SWAP(4, 12); + SWAP(7, 13); + SWAP(3, 10); + SWAP(5, 12); + SWAP(3, 9); + SWAP(6, 12); + SWAP(3, 8); + SWAP(7, 12); + SWAP(5, 9); + SWAP(6, 10); + SWAP(4, 8); + SWAP(7, 11); + SWAP(5, 8); + SWAP(7, 10); + SWAP(6, 8); + SWAP(7, 9); + SWAP(7, 8); +} + +/*!\brief Sorts two size-8 arrays of keys and values in descending order of + * keys. + * + * \param[in,out] k An length-8 array of float serves as the keys. + * \param[in,out] v An length-8 array of int32 serves as the values. + */ +static AOM_INLINE void av1_sort_fi32_8(float k[], int32_t v[]) { + SWAP(0, 1); + SWAP(2, 3); + SWAP(4, 5); + SWAP(6, 7); + SWAP(0, 2); + SWAP(1, 3); + SWAP(4, 6); + SWAP(5, 7); + SWAP(1, 2); + SWAP(5, 6); + SWAP(0, 4); + SWAP(3, 7); + SWAP(1, 5); + SWAP(2, 6); + SWAP(1, 4); + SWAP(3, 6); + SWAP(2, 4); + SWAP(3, 5); + SWAP(3, 4); +} +#undef SWAP +#endif // AOM_AV1_ENCODER_SORTING_NETWORK_H_ diff --git a/third_party/aom/av1/encoder/sparse_linear_solver.c b/third_party/aom/av1/encoder/sparse_linear_solver.c new file mode 100644 index 0000000000..e47c78e148 --- /dev/null +++ b/third_party/aom/av1/encoder/sparse_linear_solver.c @@ -0,0 +1,472 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include "av1/common/av1_common_int.h" +#include "av1/encoder/sparse_linear_solver.h" +#include "config/aom_config.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/alloccommon.h" + +#if CONFIG_OPTICAL_FLOW_API +/* + * Input: + * rows: array of row positions + * cols: array of column positions + * values: array of element values + * num_elem: total number of elements in the matrix + * num_rows: number of rows in the matrix + * num_cols: number of columns in the matrix + * + * Output: + * sm: pointer to the sparse matrix to be initialized + * + * Return: 0 - success + * -1 - failed + */ +int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values, + int num_elem, int num_rows, int num_cols, + SPARSE_MTX *sm) { + sm->n_elem = num_elem; + sm->n_rows = num_rows; + sm->n_cols = num_cols; + if (num_elem == 0) { + sm->row_pos = NULL; + sm->col_pos = NULL; + sm->value = NULL; + return 0; + } + sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos)); + sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos)); + sm->value = aom_calloc(num_elem, sizeof(*sm->value)); + + if (!sm->row_pos || !sm->col_pos || !sm->value) { + av1_free_sparse_mtx_elems(sm); + return -1; + } + + memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos)); + memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos)); + memcpy(sm->value, values, num_elem * sizeof(*sm->value)); + + return 0; +} + +/* + * Combines two sparse matrices (allocating new space). + * + * Input: + * sm1, sm2: matrices to be combined + * row_offset1, row_offset2: row offset of each matrix in the new matrix + * col_offset1, col_offset2: column offset of each matrix in the new matrix + * new_n_rows, new_n_cols: number of rows and columns in the new matrix + * + * Output: + * sm: the combined matrix + * + * Return: 0 - success + * -1 - failed + */ +int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2, + SPARSE_MTX *sm, int row_offset1, + int col_offset1, int row_offset2, + int col_offset2, int new_n_rows, + int new_n_cols) { + sm->n_elem = sm1->n_elem + sm2->n_elem; + sm->n_cols = new_n_cols; + sm->n_rows = new_n_rows; + + if (sm->n_elem == 0) { + sm->row_pos = NULL; + sm->col_pos = NULL; + sm->value = NULL; + return 0; + } + + sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos)); + sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos)); + sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value)); + + if (!sm->row_pos || !sm->col_pos || !sm->value) { + av1_free_sparse_mtx_elems(sm); + return -1; + } + + for (int i = 0; i < sm1->n_elem; i++) { + sm->row_pos[i] = sm1->row_pos[i] + row_offset1; + sm->col_pos[i] = sm1->col_pos[i] + col_offset1; + } + memcpy(sm->value, sm1->value, sm1->n_elem * sizeof(*sm1->value)); + int n_elem1 = sm1->n_elem; + for (int i = 0; i < sm2->n_elem; i++) { + sm->row_pos[n_elem1 + i] = sm2->row_pos[i] + row_offset2; + sm->col_pos[n_elem1 + i] = sm2->col_pos[i] + col_offset2; + } + memcpy(sm->value + n_elem1, sm2->value, sm2->n_elem * sizeof(*sm2->value)); + return 0; +} + +void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) { + sm->n_cols = 0; + sm->n_rows = 0; + if (sm->n_elem != 0) { + aom_free(sm->row_pos); + aom_free(sm->col_pos); + aom_free(sm->value); + } + sm->n_elem = 0; +} + +/* + * Calculate matrix and vector multiplication: A*b + * + * Input: + * sm: matrix A + * srcv: the vector b to be multiplied to + * dstl: the length of vectors + * + * Output: + * dstv: pointer to the resulting vector + */ +void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv, 
+ double *dstv, int dstl) { + memset(dstv, 0, sizeof(*dstv) * dstl); + for (int i = 0; i < sm->n_elem; i++) { + dstv[sm->row_pos[i]] += srcv[sm->col_pos[i]] * sm->value[i]; + } +} +/* + * Calculate matrix and vector multiplication: b*A + * + * Input: + * sm: matrix A + * srcv: the vector b to be multiplied to + * dstl: the length of vectors + * + * Output: + * dstv: pointer to the resulting vector + */ +void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv, + double *dstv, int dstl) { + memset(dstv, 0, sizeof(*dstv) * dstl); + for (int i = 0; i < sm->n_elem; i++) { + dstv[sm->col_pos[i]] += srcv[sm->row_pos[i]] * sm->value[i]; + } +} + +/* + * Calculate inner product of two vectors + * + * Input: + * src1, scr2: the vectors to be multiplied + * src1l: length of the vectors + * + * Output: + * the inner product + */ +double av1_vect_vect_multi(const double *src1, int src1l, const double *src2) { + double result = 0; + for (int i = 0; i < src1l; i++) { + result += src1[i] * src2[i]; + } + return result; +} + +/* + * Multiply each element in the matrix sm with a constant c + */ +void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) { + for (int i = 0; i < sm->n_elem; i++) { + sm->value[i] *= c; + } +} + +static INLINE void free_solver_local_buf(double *buf1, double *buf2, + double *buf3, double *buf4, + double *buf5, double *buf6, + double *buf7) { + aom_free(buf1); + aom_free(buf2); + aom_free(buf3); + aom_free(buf4); + aom_free(buf5); + aom_free(buf6); + aom_free(buf7); +} + +/* + * Solve for Ax = b + * no requirement on A + * + * Input: + * A: the sparse matrix + * b: the vector b + * bl: length of b + * x: the vector x + * + * Output: + * x: pointer to the solution vector + * + * Return: 0 - success + * -1 - failed + */ +int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, + int bl, double *x) { + double *r = NULL, *r_hat = NULL, *p = NULL, *p_hat = NULL, *Ap = NULL, + *p_hatA = NULL, *x_hat = NULL; + double alpha, beta, rtr, r_norm_2; + double denormtemp; + + // initialize + r = aom_calloc(bl, sizeof(*r)); + r_hat = aom_calloc(bl, sizeof(*r_hat)); + p = aom_calloc(bl, sizeof(*p)); + p_hat = aom_calloc(bl, sizeof(*p_hat)); + Ap = aom_calloc(bl, sizeof(*Ap)); + p_hatA = aom_calloc(bl, sizeof(*p_hatA)); + x_hat = aom_calloc(bl, sizeof(*x_hat)); + if (!r || !r_hat || !p || !p_hat || !Ap || !p_hatA || !x_hat) { + free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat); + return -1; + } + + int i; + for (i = 0; i < bl; i++) { + r[i] = b[i]; + r_hat[i] = b[i]; + p[i] = r[i]; + p_hat[i] = r_hat[i]; + x[i] = 0; + x_hat[i] = 0; + } + r_norm_2 = av1_vect_vect_multi(r_hat, bl, r); + for (int k = 0; k < MAX_CG_SP_ITER; k++) { + rtr = r_norm_2; + av1_mtx_vect_multi_right(A, p, Ap, bl); + av1_mtx_vect_multi_left(A, p_hat, p_hatA, bl); + + denormtemp = av1_vect_vect_multi(p_hat, bl, Ap); + if (denormtemp < 1e-10) break; + alpha = rtr / denormtemp; + r_norm_2 = 0; + for (i = 0; i < bl; i++) { + x[i] += alpha * p[i]; + x_hat[i] += alpha * p_hat[i]; + r[i] -= alpha * Ap[i]; + r_hat[i] -= alpha * p_hatA[i]; + r_norm_2 += r_hat[i] * r[i]; + } + if (sqrt(r_norm_2) < 1e-2) { + break; + } + if (rtr < 1e-10) break; + beta = r_norm_2 / rtr; + for (i = 0; i < bl; i++) { + p[i] = r[i] + beta * p[i]; + p_hat[i] = r_hat[i] + beta * p_hat[i]; + } + } + // free + free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat); + return 0; +} + +/* + * Solve for Ax = b when A is symmetric and positive definite + * + * Input: + * A: the sparse matrix + * b: the vector 
b + * bl: length of b + * x: the vector x + * + * Output: + * x: pointer to the solution vector + * + * Return: 0 - success + * -1 - failed + */ +int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl, + double *x) { + double *r = NULL, *p = NULL, *Ap = NULL; + double alpha, beta, rtr, r_norm_2; + double denormtemp; + + // initialize + r = aom_calloc(bl, sizeof(*r)); + p = aom_calloc(bl, sizeof(*p)); + Ap = aom_calloc(bl, sizeof(*Ap)); + if (!r || !p || !Ap) { + free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL); + return -1; + } + + int i; + for (i = 0; i < bl; i++) { + r[i] = b[i]; + p[i] = r[i]; + x[i] = 0; + } + r_norm_2 = av1_vect_vect_multi(r, bl, r); + int k; + for (k = 0; k < MAX_CG_SP_ITER; k++) { + rtr = r_norm_2; + av1_mtx_vect_multi_right(A, p, Ap, bl); + denormtemp = av1_vect_vect_multi(p, bl, Ap); + if (denormtemp < 1e-10) break; + alpha = rtr / denormtemp; + r_norm_2 = 0; + for (i = 0; i < bl; i++) { + x[i] += alpha * p[i]; + r[i] -= alpha * Ap[i]; + r_norm_2 += r[i] * r[i]; + } + if (r_norm_2 < 1e-8 * bl) break; + if (rtr < 1e-10) break; + beta = r_norm_2 / rtr; + for (i = 0; i < bl; i++) { + p[i] = r[i] + beta * p[i]; + } + } + // free + free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL); + + return 0; +} + +/* + * Solve for Ax = b using Jacobi method + * + * Input: + * A: the sparse matrix + * b: the vector b + * bl: length of b + * x: the vector x + * + * Output: + * x: pointer to the solution vector + * + * Return: 0 - success + * -1 - failed + */ +int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) { + double *diags = NULL, *Rx = NULL, *x_last = NULL, *x_cur = NULL, + *tempx = NULL; + double resi2; + + diags = aom_calloc(bl, sizeof(*diags)); + Rx = aom_calloc(bl, sizeof(*Rx)); + x_last = aom_calloc(bl, sizeof(*x_last)); + x_cur = aom_calloc(bl, sizeof(*x_cur)); + + if (!diags || !Rx || !x_last || !x_cur) { + free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL); + return -1; + } + + int i; + memset(x_last, 0, sizeof(*x_last) * bl); + // get the diagonals of A + memset(diags, 0, sizeof(*diags) * bl); + for (int c = 0; c < A->n_elem; c++) { + if (A->row_pos[c] != A->col_pos[c]) continue; + diags[A->row_pos[c]] = A->value[c]; + } + int k; + for (k = 0; k < MAX_CG_SP_ITER; k++) { + // R = A - diag(diags) + // get R*x_last + memset(Rx, 0, sizeof(*Rx) * bl); + for (int c = 0; c < A->n_elem; c++) { + if (A->row_pos[c] == A->col_pos[c]) continue; + Rx[A->row_pos[c]] += x_last[A->col_pos[c]] * A->value[c]; + } + resi2 = 0; + for (i = 0; i < bl; i++) { + x_cur[i] = (b[i] - Rx[i]) / diags[i]; + resi2 += (x_last[i] - x_cur[i]) * (x_last[i] - x_cur[i]); + } + if (resi2 <= 1e-10 * bl) break; + // swap last & cur buffer ptrs + tempx = x_last; + x_last = x_cur; + x_cur = tempx; + } + printf("\n numiter: %d\n", k); + for (i = 0; i < bl; i++) { + x[i] = x_cur[i]; + } + free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL); + return 0; +} + +/* + * Solve for Ax = b using Steepest descent method + * + * Input: + * A: the sparse matrix + * b: the vector b + * bl: length of b + * x: the vector x + * + * Output: + * x: pointer to the solution vector + * + * Return: 0 - success + * -1 - failed + */ +int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl, + double *x) { + double *d = NULL, *Ad = NULL, *Ax = NULL; + double resi2, resi2_last, dAd, temp; + + d = aom_calloc(bl, sizeof(*d)); + Ax = aom_calloc(bl, sizeof(*Ax)); + Ad = aom_calloc(bl, sizeof(*Ad)); + + if (!d || !Ax || !Ad) { + 
free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL); + return -1; + } + + int i; + // initialize with 0s + resi2 = 0; + for (i = 0; i < bl; i++) { + x[i] = 0; + d[i] = b[i]; + resi2 += d[i] * d[i] / bl; + } + int k; + for (k = 0; k < MAX_CG_SP_ITER; k++) { + // get A*x_last + av1_mtx_vect_multi_right(A, d, Ad, bl); + dAd = resi2 * bl / av1_vect_vect_multi(d, bl, Ad); + for (i = 0; i < bl; i++) { + temp = dAd * d[i]; + x[i] = x[i] + temp; + } + av1_mtx_vect_multi_right(A, x, Ax, bl); + resi2_last = resi2; + resi2 = 0; + for (i = 0; i < bl; i++) { + d[i] = b[i] - Ax[i]; + resi2 += d[i] * d[i] / bl; + } + if (resi2 <= 1e-8) break; + if (resi2_last - resi2 < 1e-8) { + break; + } + } + free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL); + + return 0; +} + +#endif // CONFIG_OPTICAL_FLOW_API diff --git a/third_party/aom/av1/encoder/sparse_linear_solver.h b/third_party/aom/av1/encoder/sparse_linear_solver.h new file mode 100644 index 0000000000..f30fc0f5b1 --- /dev/null +++ b/third_party/aom/av1/encoder/sparse_linear_solver.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ +#define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "config/aom_config.h" + +#if CONFIG_OPTICAL_FLOW_API + +// Number of iterations for solving linear equations. +#define MAX_CG_SP_ITER 100 + +typedef struct { + int n_elem; // number of non-zero elements + int n_rows; + int n_cols; + // using arrays to represent non-zero elements. 
+  int *col_pos;
+  int *row_pos;  // starts with 0
+  double *value;
+} SPARSE_MTX;
+
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+                        int num_elem, int num_rows, int num_cols,
+                        SPARSE_MTX *sm);
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+                                SPARSE_MTX *sm, int row_offset1,
+                                int col_offset1, int row_offset2,
+                                int col_offset2, int new_n_rows,
+                                int new_n_cols);
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm);
+
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+                              double *dstv, int dstl);
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+                             double *dstv, int dstl);
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2);
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c);
+
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                  double *x);
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+                                     int bl, double *x);
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x);
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                double *x);
+
+#endif  // CONFIG_OPTICAL_FLOW_API
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
new file mode 100644
index 0000000000..a6c0971096
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -0,0 +1,2715 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/rdopt.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+// Max speed setting for tx domain evaluation
+#define MAX_TX_DOMAIN_EVAL_SPEED 5
+static MESH_PATTERN
+    good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+      { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+      { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+      { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
+      { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+      { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+      { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+    };
+
+// TODO(huisu@google.com): These settings are pretty relaxed, tune them for
+// each speed setting
+static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+  { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+  { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+};
+
+// Threshold values to be used for pruning the txfm_domain_distortion
+// based on block MSE
+// Index 0: Default mode evaluation, Winner mode processing is not
+// applicable (Eg : IntraBc).
Index 1: Mode evaluation. +// Index 2: Winner mode evaluation. Index 1 and 2 are applicable when +// enable_winner_mode_for_use_tx_domain_dist speed feature is ON +// TODO(any): Experiment the threshold logic based on variance metric +static unsigned int tx_domain_dist_thresholds[4][MODE_EVAL_TYPES] = { + { UINT_MAX, UINT_MAX, UINT_MAX }, + { 22026, 22026, 22026 }, + { 1377, 1377, 1377 }, + { 0, 0, 0 } +}; + +// Number of different levels of aggressiveness in using transform domain +// distortion during the R-D evaluation based on the speed feature +// tx_domain_dist_level. +#define TX_DOMAIN_DIST_LEVELS 4 + +// Transform domain distortion type to be used for default, mode and winner mode +// evaluation Index 0: Default mode evaluation, Winner mode processing is not +// applicable (Eg : IntraBc). Index 1: Mode evaluation. Index 2: Winner mode +// evaluation. Index 1 and 2 are applicable when +// enable_winner_mode_for_use_tx_domain_dist speed feature is ON +static unsigned int + tx_domain_dist_types[TX_DOMAIN_DIST_LEVELS][MODE_EVAL_TYPES] = { + { 0, 2, 0 }, { 1, 2, 0 }, { 2, 2, 0 }, { 2, 2, 2 } + }; + +// Threshold values to be used for disabling coeff RD-optimization +// based on block MSE / qstep^2. +// TODO(any): Experiment the threshold logic based on variance metric. +// Table has satd and dist threshold value index 0 : dist,index 1: satd +// For each row, the indices are as follows. +// Index 0: Default mode evaluation, Winner mode processing is not applicable +// (Eg : IntraBc) +// Index 1: Mode evaluation. +// Index 2: Winner mode evaluation. +// Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed +// feature is ON +// There are 7 levels with increasing speed, mapping to vertical indices. +static unsigned int coeff_opt_thresholds[9][MODE_EVAL_TYPES][2] = { + { { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX } }, + { { 3200, UINT_MAX }, { 250, UINT_MAX }, { UINT_MAX, UINT_MAX } }, + { { 1728, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } }, + { { 864, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } }, + { { 432, UINT_MAX }, { 86, UINT_MAX }, { UINT_MAX, UINT_MAX } }, + { { 864, 97 }, { 142, 16 }, { UINT_MAX, UINT_MAX } }, + { { 432, 97 }, { 86, 16 }, { UINT_MAX, UINT_MAX } }, + { { 216, 25 }, { 86, 10 }, { UINT_MAX, UINT_MAX } }, + { { 216, 25 }, { 0, 10 }, { UINT_MAX, UINT_MAX } } +}; + +// Transform size to be used for default, mode and winner mode evaluation +// Index 0: Default mode evaluation, Winner mode processing is not applicable +// (Eg : IntraBc) Index 1: Mode evaluation. Index 2: Winner mode evaluation. +// Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed +// feature is ON +static TX_SIZE_SEARCH_METHOD tx_size_search_methods[4][MODE_EVAL_TYPES] = { + { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD }, + { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD }, + { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD }, + { USE_LARGESTALL, USE_LARGESTALL, USE_LARGESTALL } +}; + +// Predict transform skip levels to be used for default, mode and winner mode +// evaluation. Index 0: Default mode evaluation, Winner mode processing is not +// applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation +// Values indicate the aggressiveness of skip flag prediction. 
+// 0 : no early skip prediction +// 1 : conservative early skip prediction using DCT_DCT +// 2 : early skip prediction based on SSE +static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 }, + { 1, 1, 1 }, + { 1, 2, 1 } }; + +// Predict skip or DC block level used during transform type search. It is +// indexed using the following: +// First index : Speed feature 'dc_blk_pred_level' (0 to 3) +// Second index : Mode evaluation type (DEFAULT_EVAL, MODE_EVAL and +// WINNER_MODE_EVAL). +// +// The values of predict_dc_levels[][] indicate the aggressiveness of predicting +// a block as transform skip or DC only. +// Type 0 : No skip block or DC only block prediction +// Type 1 : Prediction of skip block based on residual mean and variance +// Type 2 : Prediction of skip block or DC only block based on residual mean and +// variance +static unsigned int predict_dc_levels[4][MODE_EVAL_TYPES] = { + { 0, 0, 0 }, { 1, 1, 0 }, { 2, 2, 0 }, { 2, 2, 2 } +}; + +#if !CONFIG_FPMT_TEST +// This table holds the maximum number of reference frames for global motion. +// The table is indexed as per the speed feature 'gm_search_type'. +// 0 : All reference frames are allowed. +// 1 : All reference frames except L2 and L3 are allowed. +// 2 : All reference frames except L2, L3 and ARF2 are allowed. +// 3 : No reference frame is allowed. +static int gm_available_reference_frames[GM_DISABLE_SEARCH + 1] = { + INTER_REFS_PER_FRAME, INTER_REFS_PER_FRAME - 2, INTER_REFS_PER_FRAME - 3, 0 +}; +#endif + +// Qindex threshold levels used for selecting full-pel motion search. +// ms_qthresh[i][j][k] indicates the qindex boundary value for 'k'th qindex band +// for resolution index 'j' for aggressiveness level 'i'. +// Aggressiveness increases from i = 0 to 2. +// j = 0: lower than 720p resolution, j = 1: 720p or larger resolution. +// Currently invoked only for speed 0, 1 and 2. +static int ms_qindex_thresh[3][2][2] = { { { 200, 70 }, { MAXQ, 200 } }, + { { 170, 50 }, { MAXQ, 200 } }, + { { 170, 40 }, { 200, 40 } } }; + +// Full-pel search methods for aggressive search based on qindex. +// Index 0 is for resolutions lower than 720p, index 1 for 720p or larger +// resolutions. Currently invoked only for speed 1 and 2. +static SEARCH_METHODS motion_search_method[2] = { CLAMPED_DIAMOND, DIAMOND }; + +// Intra only frames, golden frames (except alt ref overlays) and +// alt ref frames tend to be coded at a higher than ambient quality +static int frame_is_boosted(const AV1_COMP *cpi) { + return frame_is_kf_gf_arf(cpi); +} + +// Set transform rd gate level for all transform search cases. 
+static AOM_INLINE void set_txfm_rd_gate_level( + int txfm_rd_gate_level[TX_SEARCH_CASES], int level) { + assert(level <= MAX_TX_RD_GATE_LEVEL); + for (int idx = 0; idx < TX_SEARCH_CASES; idx++) + txfm_rd_gate_level[idx] = level; +} + +static void set_allintra_speed_feature_framesize_dependent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; + const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160; + const bool use_hbd = cpi->oxcf.use_highbitdepth; + + if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + if (is_720p_or_larger) + sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED; + else + sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + if (use_hbd) sf->tx_sf.prune_tx_size_level = 1; + } + + if (is_4k_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + // TODO(huisu@google.com): train models for 720P and above. + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + sf->part_sf.ml_early_term_after_part_split_level = 1; + } + + if (is_720p_or_larger) { + // TODO(chiyotsai@google.com): make this speed feature adaptive based on + // current block's vertical texture instead of hardcoded with resolution + sf->mv_sf.use_downsampled_sad = 2; + } + + if (speed >= 1) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } + sf->part_sf.ml_early_term_after_part_split_level = 2; + } + + if (speed >= 2) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 22); + sf->part_sf.partition_search_breakout_rate_thr = 100; + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; + if (use_hbd) sf->tx_sf.prune_tx_size_level = 2; + } else { + if (use_hbd) 
sf->tx_sf.prune_tx_size_level = 3; + } + } + + if (speed >= 3) { + sf->part_sf.ml_early_term_after_part_split_level = 0; + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 25); + sf->part_sf.partition_search_breakout_rate_thr = 200; + } else { + sf->part_sf.max_intra_bsize = BLOCK_32X32; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 23); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } + if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; + } + + if (speed >= 4) { + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2; + } + } + + if (speed >= 6) { + if (is_720p_or_larger) { + sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE; + } else if (is_480p_or_larger) { + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + } + + if (is_1080p_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16; + } + + if (speed >= 7) { + // TODO(kyslov): add more speed features to control speed/quality + } + + if (speed >= 8) { + if (!is_480p_or_larger) { + sf->rt_sf.nonrd_check_partition_merge_mode = 2; + } + if (is_720p_or_larger) { + sf->rt_sf.force_large_partition_blocks_intra = 1; + } + } + + if (speed >= 9) { + // TODO(kyslov): add more speed features to control speed/quality + if (!is_4k_or_larger) { + // In av1_select_sb_size(), superblock size is set to 64x64 only for + // resolutions less than 4k in speed>=9, to improve the multithread + // performance. If cost update levels are set to INTERNAL_COST_UPD_OFF + // for resolutions >= 4k, the SB size setting can be modified for these + // resolutions as well. + sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_OFF; + sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_OFF; + } + } +} + +static void set_allintra_speed_features_framesize_independent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int allow_screen_content_tools = + cm->features.allow_screen_content_tools; + const int use_hbd = cpi->oxcf.use_highbitdepth; + + sf->part_sf.less_rectangular_check_level = 1; + sf->part_sf.ml_prune_partition = 1; + sf->part_sf.prune_ext_partition_types_search_level = 1; + sf->part_sf.prune_part4_search = 2; + sf->part_sf.simple_motion_search_prune_rect = 1; + sf->part_sf.ml_predict_breakout_level = use_hbd ? 
1 : 3; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; + + sf->intra_sf.intra_pruning_with_hog = 1; + sf->intra_sf.prune_luma_palette_size_search_level = 1; + sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; + sf->intra_sf.early_term_chroma_palette_size_search = 1; + + sf->tx_sf.adaptive_txb_search_level = 1; + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + + sf->rt_sf.use_nonrd_pick_mode = 0; + sf->rt_sf.use_real_time_ref_set = 0; + + if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION || + cpi->use_screen_content_tools) { + sf->mv_sf.exhaustive_searches_thresh = (1 << 20); + } else { + sf->mv_sf.exhaustive_searches_thresh = (1 << 25); + } + + sf->rd_sf.perform_coeff_opt = 1; + sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL; + + if (speed >= 1) { + sf->part_sf.intra_cnn_based_part_prune_level = + allow_screen_content_tools ? 0 : 2; + sf->part_sf.simple_motion_search_early_term_none = 1; + // TODO(Venkat): Clean-up frame type dependency for + // simple_motion_search_split in partition search function and set the + // speed feature accordingly + sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2; + sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3; + sf->part_sf.reuse_best_prediction_for_part_ab = 1; + + sf->mv_sf.exhaustive_searches_thresh <<= 1; + + sf->intra_sf.prune_palette_search_level = 1; + sf->intra_sf.prune_luma_palette_size_search_level = 2; + sf->intra_sf.top_intra_model_count_allowed = 3; + + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + + sf->rd_sf.perform_coeff_opt = 2; + sf->rd_sf.tx_domain_dist_level = 1; + sf->rd_sf.tx_domain_dist_thres_level = 1; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; + sf->lpf_sf.dual_sgr_penalty_level = 1; + sf->lpf_sf.enable_sgr_ep_pruning = 1; + } + + if (speed >= 2) { + sf->mv_sf.auto_mv_step_size = 1; + + sf->intra_sf.disable_smooth_intra = 1; + sf->intra_sf.intra_pruning_with_hog = 2; + sf->intra_sf.prune_filter_intra_level = 1; + + sf->rd_sf.perform_coeff_opt = 3; + + sf->lpf_sf.prune_wiener_based_on_src_var = 1; + sf->lpf_sf.prune_sgr_based_on_wiener = 1; + } + + if (speed >= 3) { + sf->hl_sf.high_precision_mv_usage = CURRENT_Q; + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + + sf->part_sf.less_rectangular_check_level = 2; + sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL1; + sf->part_sf.prune_ext_part_using_split_info = 1; + + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.search_method = DIAMOND; + + // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are + // inherited directly from luma hog with some minor tweaking. Eventually we + // should run this with a bayesian optimizer to find the Pareto frontier. 
+ sf->intra_sf.chroma_intra_pruning_with_hog = 2; + sf->intra_sf.intra_pruning_with_hog = 3; + sf->intra_sf.prune_palette_search_level = 2; + + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; + sf->tx_sf.use_rd_based_breakout_for_intra_tx_search = true; + + // TODO(any): evaluate if these lpf features can be moved to speed 2. + // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality + // loss. + sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2; + sf->lpf_sf.disable_loop_restoration_chroma = 0; + sf->lpf_sf.reduce_wiener_window_size = 1; + sf->lpf_sf.prune_wiener_based_on_src_var = 2; + } + + if (speed >= 4) { + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + + sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL2; + sf->part_sf.simple_motion_search_reduce_search_steps = 4; + sf->part_sf.prune_ext_part_using_split_info = 2; + sf->part_sf.early_term_after_none_split = 1; + sf->part_sf.ml_predict_breakout_level = 3; + + sf->intra_sf.prune_chroma_modes_using_luma_winner = 1; + + sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL; + + sf->tpl_sf.prune_starting_mv = 2; + sf->tpl_sf.subpel_force_stop = HALF_PEL; + sf->tpl_sf.search_method = FAST_BIGDIA; + + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; + sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; + sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1; + + sf->rd_sf.perform_coeff_opt = 5; + sf->rd_sf.tx_domain_dist_thres_level = 3; + + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL; + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3; + + sf->mv_sf.reduce_search_range = 1; + + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1; + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1; + sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_DEFAULT; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; + } + + if (speed >= 5) { + sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL3; + sf->part_sf.ext_partition_eval_thresh = + allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16; + sf->part_sf.intra_cnn_based_part_prune_level = + allow_screen_content_tools ? 1 : 2; + + sf->intra_sf.chroma_intra_pruning_with_hog = 3; + + sf->lpf_sf.use_coarse_filter_level_search = 0; + // Disable Wiener and Self-guided Loop restoration filters. + sf->lpf_sf.disable_wiener_filter = true; + sf->lpf_sf.disable_sgr_filter = true; + + sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2; + + sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_FAST; + } + + if (speed >= 6) { + sf->intra_sf.prune_smooth_intra_mode_for_chroma = 1; + sf->intra_sf.prune_filter_intra_level = 2; + sf->intra_sf.chroma_intra_pruning_with_hog = 4; + sf->intra_sf.intra_pruning_with_hog = 4; + sf->intra_sf.cfl_search_range = 1; + sf->intra_sf.top_intra_model_count_allowed = 2; + sf->intra_sf.adapt_top_model_rd_count_using_neighbors = 1; + sf->intra_sf.prune_luma_odd_delta_angles_in_intra = 1; + + sf->part_sf.prune_rectangular_split_based_on_qidx = + allow_screen_content_tools ? 0 : 2; + sf->part_sf.prune_rect_part_using_4x4_var_deviation = true; + sf->part_sf.prune_rect_part_using_none_pred_mode = true; + sf->part_sf.prune_sub_8x8_partition_level = + allow_screen_content_tools ? 0 : 1; + sf->part_sf.prune_part4_search = 3; + // TODO(jingning): This might not be a good trade off if the + // target image quality is very low. 
+ sf->part_sf.default_max_partition_size = BLOCK_32X32; + + sf->mv_sf.use_bsize_dependent_search_method = 1; + + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3; + sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0; + sf->tx_sf.prune_intra_tx_depths_using_nn = true; + + sf->rd_sf.perform_coeff_opt = 6; + sf->rd_sf.tx_domain_dist_level = 3; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + + sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF; + sf->winner_mode_sf.prune_winner_mode_eval_level = 1; + sf->winner_mode_sf.dc_blk_pred_level = 1; + } + // The following should make all-intra mode speed 7 approximately equal + // to real-time speed 6, + // all-intra speed 8 close to real-time speed 7, and all-intra speed 9 + // close to real-time speed 8 + if (speed >= 7) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.var_part_split_threshold_shift = 7; + } + + if (speed >= 8) { + sf->rt_sf.hybrid_intra_pickmode = 1; + sf->rt_sf.use_nonrd_pick_mode = 1; + sf->rt_sf.nonrd_check_partition_merge_mode = 1; + sf->rt_sf.var_part_split_threshold_shift = 8; + // Set mask for intra modes. + for (int i = 0; i < BLOCK_SIZES; ++i) + if (i >= BLOCK_32X32) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + else + // Use DC, H, V intra mode for block sizes < 32X32. + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; + } + + if (speed >= 9) { + sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW; + + sf->rt_sf.nonrd_check_partition_merge_mode = 0; + sf->rt_sf.hybrid_intra_pickmode = 0; + sf->rt_sf.var_part_split_threshold_shift = 9; + sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true; + sf->rt_sf.prune_h_pred_using_best_mode_so_far = true; + sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true; + sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true; + } + + // As the speed feature prune_chroma_modes_using_luma_winner already + // constrains the number of chroma directional mode evaluations to a maximum + // of 1, the HOG computation and the associated pruning logic does not seem to + // help speed-up the chroma mode evaluations. Hence disable the speed feature + // chroma_intra_pruning_with_hog when prune_chroma_modes_using_luma_winner is + // enabled. + if (sf->intra_sf.prune_chroma_modes_using_luma_winner) + sf->intra_sf.chroma_intra_pruning_with_hog = 0; +} + +static void set_good_speed_feature_framesize_dependent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; + const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160; + const bool use_hbd = cpi->oxcf.use_highbitdepth; + // Speed features applicable for temporal filtering and tpl modules may be + // changed based on frame type at places where the sf is applied (Example : + // use_downsampled_sad). This is because temporal filtering and tpl modules + // are called before this function (except for the first key frame). 
+ // TODO(deepa.kg@ittiam.com): For the speed features applicable to temporal + // filtering and tpl modules, modify the sf initialization appropriately + // before calling the modules. + const int boosted = frame_is_boosted(cpi); + const int is_boosted_arf2_bwd_type = + boosted || + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; + const int is_lf_frame = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == LF_UPDATE; + const int allow_screen_content_tools = + cm->features.allow_screen_content_tools; + + if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + if (is_720p_or_larger) + sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED; + else + sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + if (use_hbd) sf->tx_sf.prune_tx_size_level = 1; + } + + if (is_4k_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + // TODO(huisu@google.com): train models for 720P and above. + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + sf->part_sf.ml_early_term_after_part_split_level = 1; + } + + if (is_720p_or_larger) { + // TODO(chiyotsai@google.com): make this speed feature adaptive based on + // current block's vertical texture instead of hardcoded with resolution + sf->mv_sf.use_downsampled_sad = 2; + } + + if (!is_720p_or_larger) { + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + const int rate_tolerance = + AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct); + sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2); + } + + if (speed >= 1) { + if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1; + + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } + sf->part_sf.ml_early_term_after_part_split_level = 2; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; + } + + if (speed >= 2) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 22); + sf->part_sf.partition_search_breakout_rate_thr = 100; + } + + if 
(is_720p_or_larger) { + sf->inter_sf.prune_obmc_prob_thresh = 16; + } else { + sf->inter_sf.prune_obmc_prob_thresh = 8; + } + + if (is_480p_or_larger) { + sf->inter_sf.disable_interintra_wedge_var_thresh = 100; + } else { + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + } + + if (is_480p_or_lesser) sf->inter_sf.skip_ext_comp_nearmv_mode = 1; + + if (is_720p_or_larger) { + sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 1 : 0; + } else { + sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 2 : 0; + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; + if (use_hbd) sf->tx_sf.prune_tx_size_level = 2; + } else { + if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = boosted ? 0 : 1; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = boosted ? 0 : 1; + } + + if (!is_720p_or_larger) { + sf->mv_sf.disable_second_mv = 1; + sf->mv_sf.auto_mv_step_size = 2; + } else { + sf->mv_sf.disable_second_mv = boosted ? 0 : 2; + sf->mv_sf.auto_mv_step_size = 1; + } + + if (!is_720p_or_larger) { + sf->hl_sf.recode_tolerance = 50; + sf->inter_sf.disable_interinter_wedge_newmv_search = + is_boosted_arf2_bwd_type ? 0 : 1; + sf->inter_sf.enable_fast_wedge_mask_search = 1; + } + } + + if (speed >= 3) { + sf->inter_sf.enable_fast_wedge_mask_search = 1; + sf->inter_sf.skip_newmv_in_drl = 2; + sf->inter_sf.skip_ext_comp_nearmv_mode = 1; + sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 3 : 0; + sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = + frame_is_intra_only(&cpi->common) ? 0 : 1; + + sf->part_sf.ml_early_term_after_part_split_level = 0; + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 25); + sf->part_sf.partition_search_breakout_rate_thr = 200; + sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 2 : 0; + } else { + sf->part_sf.max_intra_bsize = BLOCK_32X32; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 23); + sf->part_sf.partition_search_breakout_rate_thr = 120; + sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 1 : 0; + } + if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; + + if (is_480p_or_larger) { + sf->part_sf.early_term_after_none_split = 1; + } else { + sf->part_sf.early_term_after_none_split = 0; + } + if (is_720p_or_larger) { + sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 2; + } else { + sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 3; + } + + if (is_720p_or_larger) { + sf->inter_sf.disable_interinter_wedge_var_thresh = 100; + sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 1; + } else { + sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; + sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2; + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL2; + } + + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + } + + if (speed >= 4) { + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + } + sf->part_sf.early_term_after_none_split = 1; + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2; + } else { + sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 
0 : 1; + } + + sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; + sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; + sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2; + if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3; + + if (is_720p_or_larger) { + sf->inter_sf.prune_comp_ref_frames = 1; + } else if (is_480p_or_larger) { + sf->inter_sf.prune_comp_ref_frames = is_boosted_arf2_bwd_type ? 0 : 1; + } + + if (is_720p_or_larger) + sf->hl_sf.recode_tolerance = 32; + else + sf->hl_sf.recode_tolerance = 55; + + sf->intra_sf.skip_intra_in_interframe = 4; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3; + } + + if (speed >= 5) { + if (is_720p_or_larger) { + sf->inter_sf.prune_warped_prob_thresh = 16; + } else if (is_480p_or_larger) { + sf->inter_sf.prune_warped_prob_thresh = 8; + } + if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40; + + sf->inter_sf.skip_newmv_in_drl = 4; + sf->inter_sf.prune_comp_ref_frames = 1; + sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1; + + if (!is_720p_or_larger) { + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET; + sf->inter_sf.prune_nearest_near_mv_using_refmv_weight = + (boosted || allow_screen_content_tools) ? 0 : 1; + sf->mv_sf.use_downsampled_sad = 1; + } + + if (!is_480p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } + + if (is_480p_or_lesser) { + sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL1; + } else { + sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2; + } + + if (is_720p_or_larger) + sf->part_sf.ext_part_eval_based_on_cur_best = + (allow_screen_content_tools || frame_is_intra_only(cm)) ? 0 : 1; + + if (is_480p_or_larger) { + sf->tpl_sf.reduce_num_frames = 1; + } + } + + if (speed >= 6) { + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4; + sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3; + sf->inter_sf.prune_comp_ref_frames = 2; + sf->inter_sf.prune_nearest_near_mv_using_refmv_weight = + (boosted || allow_screen_content_tools) ? 0 : 1; + sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 2; + + if (is_720p_or_larger) { + sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE; + } else if (is_480p_or_larger) { + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + } + + if (is_480p_or_larger) { + sf->hl_sf.allow_sub_blk_me_in_tf = 1; + } + + if (is_1080p_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + if (is_720p_or_larger) { + sf->inter_sf.disable_masked_comp = 1; + } + + if (!is_720p_or_larger) { + sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW; + } + + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 28); + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } + + if (is_720p_or_larger) { + sf->inter_sf.prune_ref_mv_idx_search = 2; + } else { + sf->inter_sf.prune_ref_mv_idx_search = 1; + } + + if (!is_720p_or_larger) { + sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = + is_boosted_arf2_bwd_type ? 
450 : 150; + } + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; + + sf->hl_sf.recode_tolerance = 55; + } +} + +static void set_good_speed_features_framesize_independent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boosted = frame_is_boosted(cpi); + const int is_boosted_arf2_bwd_type = + boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; + const int is_inter_frame = + gf_group->frame_type[cpi->gf_frame_index] == INTER_FRAME; + const int allow_screen_content_tools = + cm->features.allow_screen_content_tools; + const int use_hbd = cpi->oxcf.use_highbitdepth; + if (!cpi->oxcf.tile_cfg.enable_large_scale_tile) { + sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA; + } + + // Speed 0 for all speed features that give neutral coding performance change. + sf->gm_sf.gm_search_type = boosted ? GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2 + : GM_SEARCH_CLOSEST_REFS_ONLY; + sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1; + sf->gm_sf.disable_gm_search_based_on_stats = 1; + + sf->part_sf.less_rectangular_check_level = 1; + sf->part_sf.ml_prune_partition = 1; + sf->part_sf.prune_ext_partition_types_search_level = 1; + sf->part_sf.prune_part4_search = 2; + sf->part_sf.simple_motion_search_prune_rect = 1; + sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; + sf->part_sf.simple_motion_search_prune_agg = + allow_screen_content_tools ? NO_PRUNING : SIMPLE_AGG_LVL0; + + // TODO(debargha): Test, tweak and turn on either 1 or 2 + sf->inter_sf.inter_mode_rd_model_estimation = 1; + sf->inter_sf.model_based_post_interp_filter_breakout = 1; + sf->inter_sf.prune_compound_using_single_ref = 1; + sf->inter_sf.prune_mode_search_simple_translation = 1; + sf->inter_sf.prune_ref_frame_for_rect_partitions = + (boosted || (allow_screen_content_tools)) + ? 0 + : (is_boosted_arf2_bwd_type ? 1 : 2); + sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2; + sf->inter_sf.selective_ref_frame = 1; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH; + + sf->interp_sf.use_fast_interpolation_filter_search = 1; + + sf->intra_sf.intra_pruning_with_hog = 1; + + sf->tx_sf.adaptive_txb_search_level = 1; + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + + sf->tpl_sf.search_method = NSTEP_8PT; + + sf->rt_sf.use_nonrd_pick_mode = 0; + sf->rt_sf.use_real_time_ref_set = 0; + + if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION || + cpi->use_screen_content_tools) { + sf->mv_sf.exhaustive_searches_thresh = (1 << 20); + } else { + sf->mv_sf.exhaustive_searches_thresh = (1 << 25); + } + + sf->rd_sf.perform_coeff_opt = 1; + sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL; + + if (speed >= 1) { + sf->hl_sf.adjust_num_frames_for_arf_filtering = + allow_screen_content_tools ? 0 : 1; + + sf->part_sf.intra_cnn_based_part_prune_level = + allow_screen_content_tools ? 0 : 2; + sf->part_sf.simple_motion_search_early_term_none = 1; + // TODO(Venkat): Clean-up frame type dependency for + // simple_motion_search_split in partition search function and set the + // speed feature accordingly + sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2; + sf->part_sf.ml_predict_breakout_level = use_hbd ? 
2 : 3; + + sf->mv_sf.exhaustive_searches_thresh <<= 1; + sf->mv_sf.obmc_full_pixel_search_level = 1; + sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS; + sf->mv_sf.disable_extensive_joint_motion_search = 1; + + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1; + sf->inter_sf.prune_comp_type_by_comp_avg = 1; + sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1; + sf->inter_sf.prune_ref_frame_for_rect_partitions = + (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools)) + ? 0 + : (boosted ? 1 : 2); + sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3; + sf->inter_sf.reuse_inter_intra_mode = 1; + sf->inter_sf.selective_ref_frame = 2; + sf->inter_sf.skip_arf_compound = 1; + + sf->interp_sf.use_interp_filter = 1; + + sf->intra_sf.prune_palette_search_level = 1; + + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + + sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3; + sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2; + sf->rd_sf.tx_domain_dist_thres_level = 1; + + sf->lpf_sf.dual_sgr_penalty_level = 1; + sf->lpf_sf.enable_sgr_ep_pruning = 1; + + // TODO(any, yunqing): move this feature to speed 0. + sf->tpl_sf.skip_alike_starting_mv = 1; + } + + if (speed >= 2) { + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + + sf->fp_sf.skip_motion_search_threshold = 25; + + sf->gm_sf.num_refinement_steps = 2; + + sf->part_sf.reuse_best_prediction_for_part_ab = + !frame_is_intra_only(&cpi->common); + + sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL; + sf->mv_sf.subpel_iters_per_step = 1; + sf->mv_sf.reduce_search_range = 1; + + // TODO(chiyotsai@google.com): We can get 10% speed up if we move + // adaptive_rd_thresh to speed 1. But currently it performs poorly on some + // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a + // bit more closely to figure out why. + sf->inter_sf.adaptive_rd_thresh = 1; + sf->inter_sf.disable_interinter_wedge_var_thresh = 100; + sf->inter_sf.fast_interintra_wedge_search = 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1; + sf->inter_sf.prune_ext_comp_using_neighbors = 1; + sf->inter_sf.prune_comp_using_best_single_mode_ref = 2; + sf->inter_sf.prune_comp_type_by_comp_avg = 2; + sf->inter_sf.selective_ref_frame = 3; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + sf->inter_sf.enable_fast_compound_mode_search = 1; + sf->inter_sf.reuse_mask_search_results = 1; + set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 1); + sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1; + sf->inter_sf.alt_ref_search_fp = 1; + + sf->interp_sf.adaptive_interp_filter_search = 1; + sf->interp_sf.disable_dual_filter = 1; + + sf->intra_sf.disable_smooth_intra = + !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key > 1); + sf->intra_sf.intra_pruning_with_hog = 2; + sf->intra_sf.skip_intra_in_interframe = is_inter_frame ? 2 : 1; + sf->intra_sf.skip_filter_intra_in_inter_frames = 1; + + sf->tpl_sf.prune_starting_mv = 1; + sf->tpl_sf.search_method = DIAMOND; + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 
3 : 4; + sf->rd_sf.use_mb_rd_hash = 1; + + sf->lpf_sf.prune_wiener_based_on_src_var = 1; + sf->lpf_sf.prune_sgr_based_on_wiener = 1; + sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1; + sf->lpf_sf.reduce_wiener_window_size = boosted ? 0 : 1; + + // TODO(any): Re-evaluate this feature set to 1 in speed 2. + sf->tpl_sf.allow_compound_pred = 0; + sf->tpl_sf.prune_ref_frames_in_tpl = 1; + } + + if (speed >= 3) { + sf->hl_sf.high_precision_mv_usage = CURRENT_Q; + + sf->gm_sf.prune_ref_frame_for_gm_search = 1; + sf->gm_sf.prune_zero_mv_with_sse = 1; + sf->gm_sf.num_refinement_steps = 0; + + sf->part_sf.less_rectangular_check_level = 2; + sf->part_sf.simple_motion_search_prune_agg = + allow_screen_content_tools + ? SIMPLE_AGG_LVL0 + : (boosted ? SIMPLE_AGG_LVL1 : QIDX_BASED_AGG_LVL1); + sf->part_sf.prune_ext_part_using_split_info = 1; + sf->part_sf.simple_motion_search_rect_split = 1; + + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + sf->mv_sf.search_method = DIAMOND; + sf->mv_sf.disable_second_mv = 2; + sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_1; + sf->mv_sf.use_intrabc = 0; + + sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.disable_onesided_comp = 1; + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2 + // and clean-up the speed feature + sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1; + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2; + sf->inter_sf.selective_ref_frame = 5; + sf->inter_sf.reuse_compound_type_decision = 1; + set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, + boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2)); + sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2; + + sf->interp_sf.adaptive_interp_filter_search = 2; + + // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are + // inherited directly from luma hog with some minor tweaking. Eventually we + // should run this with a bayesian optimizer to find the Pareto frontier. + sf->intra_sf.chroma_intra_pruning_with_hog = 2; + sf->intra_sf.intra_pruning_with_hog = 3; + sf->intra_sf.prune_palette_search_level = 2; + sf->intra_sf.top_intra_model_count_allowed = 2; + + sf->tpl_sf.prune_starting_mv = 2; + sf->tpl_sf.skip_alike_starting_mv = 2; + sf->tpl_sf.prune_intra_modes = 1; + sf->tpl_sf.reduce_first_step_size = 6; + sf->tpl_sf.subpel_force_stop = QUARTER_PEL; + sf->tpl_sf.gop_length_decision_method = 1; + + sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3; + sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; + + // TODO(any): Refactor the code related to following winner mode speed + // features + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1; + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1; + sf->winner_mode_sf.motion_mode_for_winner_cand = + boosted ? 0 + : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE ? 1 + : 2; + sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 4; + + // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality + // loss. + sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 
1 : 2; + sf->lpf_sf.prune_wiener_based_on_src_var = 2; + sf->lpf_sf.use_coarse_filter_level_search = + frame_is_intra_only(&cpi->common) ? 0 : 1; + sf->lpf_sf.use_downsampled_wiener_stats = 1; + } + + if (speed >= 4) { + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + + sf->gm_sf.prune_zero_mv_with_sse = 2; + + sf->part_sf.simple_motion_search_prune_agg = + allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL2; + sf->part_sf.simple_motion_search_reduce_search_steps = 4; + sf->part_sf.prune_ext_part_using_split_info = 2; + sf->part_sf.ml_predict_breakout_level = 3; + sf->part_sf.prune_rectangular_split_based_on_qidx = + (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0 + : 1; + + sf->inter_sf.alt_ref_search_fp = 2; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 3; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_MOTION_MODE] = boosted ? 0 : 5; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 3; + + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2; + sf->inter_sf.prune_ext_comp_using_neighbors = 2; + sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; + sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; + + sf->interp_sf.cb_pred_filter_search = 1; + sf->interp_sf.skip_sharp_interp_filter_search = 1; + sf->interp_sf.use_interp_filter = 2; + + sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; + // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4. + // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + sf->intra_sf.skip_intra_in_interframe = 4; + + sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL; + sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2; + + sf->tpl_sf.subpel_force_stop = HALF_PEL; + sf->tpl_sf.search_method = FAST_BIGDIA; + sf->tpl_sf.use_sad_for_mode_decision = 1; + + sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 5 : 7; + + // TODO(any): Extend multi-winner mode processing support for inter frames + sf->winner_mode_sf.multi_winner_mode_type = + frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_DEFAULT + : MULTI_WINNER_MODE_OFF; + sf->winner_mode_sf.dc_blk_pred_level = boosted ? 0 : 2; + + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL; + } + + if (speed >= 5) { + sf->hl_sf.weight_calc_level_in_tf = 1; + sf->hl_sf.adjust_num_frames_for_arf_filtering = + allow_screen_content_tools ? 0 : 2; + + sf->fp_sf.reduce_mv_step_param = 4; + + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + + sf->part_sf.simple_motion_search_prune_agg = + allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3; + sf->part_sf.ext_partition_eval_thresh = + allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16; + sf->part_sf.prune_sub_8x8_partition_level = + allow_screen_content_tools ? 1 : 2; + + sf->mv_sf.warp_search_method = WARP_SEARCH_DIAMOND; + + sf->inter_sf.prune_inter_modes_if_skippable = 1; + sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 1; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 4; + sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 5; + sf->inter_sf.enable_fast_compound_mode_search = 2; + + sf->interp_sf.skip_interp_filter_search = boosted ? 
0 : 1; + + sf->intra_sf.chroma_intra_pruning_with_hog = 3; + + // TODO(any): Extend multi-winner mode processing support for inter frames + sf->winner_mode_sf.multi_winner_mode_type = + frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST + : MULTI_WINNER_MODE_OFF; + + // Disable Self-guided Loop restoration filter. + sf->lpf_sf.disable_sgr_filter = true; + sf->lpf_sf.disable_wiener_coeff_refine_search = true; + + sf->tpl_sf.prune_starting_mv = 3; + sf->tpl_sf.use_y_only_rate_distortion = 1; + sf->tpl_sf.subpel_force_stop = FULL_PEL; + sf->tpl_sf.gop_length_decision_method = 2; + sf->tpl_sf.use_sad_for_mode_decision = 2; + + sf->winner_mode_sf.dc_blk_pred_level = 2; + + sf->fp_sf.disable_recon = 1; + } + + if (speed >= 6) { + sf->hl_sf.disable_extra_sc_testing = 1; + sf->hl_sf.second_alt_ref_filtering = 0; + + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3; + sf->inter_sf.selective_ref_frame = 6; + sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 2; + sf->inter_sf.prune_ext_comp_using_neighbors = 3; + + sf->intra_sf.chroma_intra_pruning_with_hog = 4; + sf->intra_sf.intra_pruning_with_hog = 4; + sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC; + sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC; + sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC; + sf->intra_sf.early_term_chroma_palette_size_search = 1; + + sf->part_sf.prune_rectangular_split_based_on_qidx = + boosted || allow_screen_content_tools ? 0 : 2; + + sf->part_sf.prune_part4_search = 3; + + sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL; + sf->mv_sf.use_bsize_dependent_search_method = 1; + + sf->tpl_sf.gop_length_decision_method = 3; + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 6 : 8; + + sf->winner_mode_sf.dc_blk_pred_level = 3; + sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF; + + sf->fp_sf.skip_zeromv_motion_search = 1; + } +} + +static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, + SPEED_FEATURES *const sf, + int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int boosted = frame_is_boosted(cpi); + const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360; + + if (!is_360p_or_larger) { + sf->rt_sf.prune_intra_mode_based_on_mv_range = 1; + sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1; + if (speed >= 6) + sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2; + if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 2; + if (speed >= 7) { + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true; + sf->rt_sf.use_rtc_tf = 2; + } + if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 1; + if (speed >= 8) { + sf->rt_sf.use_nonrd_filter_search = 1; + sf->rt_sf.tx_size_level_based_on_qstep = 1; + } + if (speed >= 9) { + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.nonrd_aggressive_skip = 1; + sf->rt_sf.skip_intra_pred = 1; + // Only turn on enable_ref_short_signaling for low resolution when only + // LAST and GOLDEN ref frames are used. 
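+      // In other words (a sketch of the condition evaluated just below): the
+      // nonrd ALTREF path must be off, and any compound references in use may
+      // not pull in LAST2 or ALTREF, since ref_frame_comp_nonrd[1] and [2]
+      // correspond to the LAST2 and ALTREF pairs configured in the SVC block
+      // further down.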
+ sf->rt_sf.enable_ref_short_signaling = + (!sf->rt_sf.use_nonrd_altref_frame && + (!sf->rt_sf.use_comp_ref_nonrd || + (!sf->rt_sf.ref_frame_comp_nonrd[1] && + !sf->rt_sf.ref_frame_comp_nonrd[2]))); + +// TODO(kyslov) Re-enable when AV1 models are trained +#if 0 +#if CONFIG_RT_ML_PARTITIONING + if (!frame_is_intra_only(cm)) { + sf->part_sf.partition_search_type = ML_BASED_PARTITION; + sf->rt_sf.reuse_inter_pred_nonrd = 0; + } +#endif +#endif + sf->rt_sf.use_adaptive_subpel_search = false; + } + if (speed >= 10) { + // TODO(yunqingwang@google.com): To be conservative, disable + // sf->rt_sf.estimate_motion_for_var_based_partition = 3 for speed 10/qvga + // for now. May enable it in the future. + sf->rt_sf.estimate_motion_for_var_based_partition = 0; + sf->rt_sf.skip_intra_pred = 2; + sf->rt_sf.hybrid_intra_pickmode = 3; + sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 2; + sf->rt_sf.use_nonrd_filter_search = 0; + } + } else { + sf->rt_sf.prune_intra_mode_based_on_mv_range = 2; + sf->intra_sf.skip_filter_intra_in_inter_frames = 1; + if (speed <= 5) { + sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = + boosted ? INT_MAX : 350; + sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2; + } + if (speed == 6) sf->part_sf.disable_8x8_part_based_on_qidx = 1; + if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 2; + if (speed == 7) { + sf->rt_sf.prefer_large_partition_blocks = 1; + // Enable this feature for [360p, 720p] resolution range initially. + // Only enable for low bitdepth to mitigate issue: b/303023614. + if (!cpi->rc.rtc_external_ratectrl && + AOMMIN(cm->width, cm->height) <= 720 && !cpi->oxcf.use_highbitdepth) + sf->hl_sf.accurate_bit_estimate = cpi->oxcf.q_cfg.aq_mode == NO_AQ; + } + if (speed >= 7) { + sf->rt_sf.use_rtc_tf = 1; + } + if (speed == 8 && !cpi->ppi->use_svc) { + sf->rt_sf.short_circuit_low_temp_var = 0; + sf->rt_sf.use_nonrd_altref_frame = 1; + } + if (speed >= 8) sf->rt_sf.tx_size_level_based_on_qstep = 2; + if (speed >= 9) { + sf->rt_sf.gf_length_lvl = 1; + sf->rt_sf.skip_cdef_sb = 1; + sf->rt_sf.sad_based_adp_altref_lag = 2; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 2; + sf->rt_sf.use_adaptive_subpel_search = true; + sf->interp_sf.cb_pred_filter_search = 1; + } + if (speed >= 10) { + sf->rt_sf.hybrid_intra_pickmode = 2; + sf->rt_sf.sad_based_adp_altref_lag = 4; + sf->rt_sf.tx_size_level_based_on_qstep = 0; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; + sf->rt_sf.use_adaptive_subpel_search = false; + sf->interp_sf.cb_pred_filter_search = 2; + } + } + if (!is_480p_or_larger) { + if (speed == 7) { + sf->rt_sf.nonrd_check_partition_merge_mode = 2; + } + } + if (!is_720p_or_larger) { + if (speed >= 9) { + sf->rt_sf.force_large_partition_blocks_intra = 1; + } + } else { + if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 3; + if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 0; + if (speed >= 7) { + sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 2; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 1; + } + if (speed >= 9) { + sf->rt_sf.sad_based_adp_altref_lag = 1; + sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 0; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 2; + } + if (speed >= 10) { + sf->rt_sf.sad_based_adp_altref_lag = 3; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; + } + } + // TODO(Any): Check/Tune settings of other sfs for 1080p. 
+ if (is_1080p_or_larger) { + if (speed >= 7) { + sf->rt_sf.reduce_mv_pel_precision_highmotion = 0; + sf->rt_sf.use_adaptive_subpel_search = 0; + } + if (speed >= 9) sf->interp_sf.cb_pred_filter_search = 0; + } else { + if (speed >= 9) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + if (speed >= 10) sf->rt_sf.nonrd_aggressive_skip = 1; + } + // TODO(marpan): Tune settings for speed 11 video mode, + // for resolutions below 720p. + if (speed >= 11 && !is_720p_or_larger && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { + sf->rt_sf.skip_cdef_sb = 2; + sf->rt_sf.force_only_last_ref = 1; + sf->rt_sf.selective_cdf_update = 1; + sf->rt_sf.use_nonrd_filter_search = 0; + if (is_360p_or_larger) { + sf->part_sf.fixed_partition_size = BLOCK_32X32; + sf->rt_sf.use_fast_fixed_part = 1; + } + sf->rt_sf.increase_source_sad_thresh = 1; + sf->rt_sf.part_early_exit_zeromv = 2; + sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2; + for (int i = 0; i < BLOCK_SIZES; ++i) { + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + } + } + // Setting for SVC, or when the ref_frame_config control is + // used to set the reference structure. + if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) { + const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + // For SVC: for greater than 2 temporal layers, use better mv search on + // base temporal layers, and only on base spatial layer if highest + // resolution is above 640x360. + if (cpi->svc.number_temporal_layers >= 2 && + cpi->svc.temporal_layer_id == 0 && + (cpi->svc.spatial_layer_id == 0 || + cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <= + 640 * 360)) { + sf->mv_sf.search_method = NSTEP; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + sf->rt_sf.fullpel_search_step_param = 10; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 0; + if (cm->width * cm->height <= 352 * 288) + sf->rt_sf.nonrd_prune_ref_frame_search = 2; + sf->rt_sf.force_large_partition_blocks_intra = 0; + } + if (speed >= 8) { + if (cpi->svc.number_temporal_layers > 2) + sf->rt_sf.disable_cdf_update_non_reference_frame = true; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; + if (rtc_ref->non_reference_frame) { + sf->rt_sf.nonrd_aggressive_skip = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + } + } + if (speed <= 9 && cpi->svc.number_temporal_layers > 2 && + cpi->svc.temporal_layer_id == 0) + sf->rt_sf.check_only_zero_zeromv_on_large_blocks = false; + else + sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true; + sf->rt_sf.frame_level_mode_cost_update = false; + + // Compound mode enabling. + if (rtc_ref->ref_frame_comp[0] || rtc_ref->ref_frame_comp[1] || + rtc_ref->ref_frame_comp[2]) { + sf->rt_sf.use_comp_ref_nonrd = 1; + sf->rt_sf.ref_frame_comp_nonrd[0] = + rtc_ref->ref_frame_comp[0] && rtc_ref->reference[GOLDEN_FRAME - 1]; + sf->rt_sf.ref_frame_comp_nonrd[1] = + rtc_ref->ref_frame_comp[1] && rtc_ref->reference[LAST2_FRAME - 1]; + sf->rt_sf.ref_frame_comp_nonrd[2] = + rtc_ref->ref_frame_comp[2] && rtc_ref->reference[ALTREF_FRAME - 1]; + } else { + sf->rt_sf.use_comp_ref_nonrd = 0; + } + + if (cpi->svc.number_spatial_layers > 1 || + cpi->svc.number_temporal_layers > 1) + sf->hl_sf.accurate_bit_estimate = 0; + + sf->rt_sf.estimate_motion_for_var_based_partition = 1; + + // For single layers RPS: bias/adjustment for recovery frame. 
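+    // When rtc_ref.bias_recovery_frame is set (e.g. for a recovery frame
+    // requested through the ref_frame_config path), the overrides below spend
+    // extra effort on that frame: NSTEP full-pel search, the unpruned subpel
+    // tree and no aggressive nonrd skipping.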
+ if (cpi->ppi->rtc_ref.bias_recovery_frame) { + sf->mv_sf.search_method = NSTEP; + sf->mv_sf.subpel_search_method = SUBPEL_TREE; + sf->rt_sf.fullpel_search_step_param = 8; + sf->rt_sf.nonrd_aggressive_skip = 0; + } + } + // Screen settings. + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + // TODO(marpan): Check settings for speed 7 and 8. + if (speed >= 7) { + sf->rt_sf.reduce_mv_pel_precision_highmotion = 1; + sf->mv_sf.use_bsize_dependent_search_method = 0; + sf->rt_sf.skip_cdef_sb = 1; + sf->rt_sf.increase_color_thresh_palette = 1; + if (!frame_is_intra_only(cm)) sf->rt_sf.dct_only_palette_nonrd = 1; + } + if (speed >= 8) { + sf->rt_sf.nonrd_check_partition_merge_mode = 3; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + sf->rt_sf.use_nonrd_filter_search = 0; + sf->rt_sf.prune_hv_pred_modes_using_src_sad = false; + } + if (speed >= 9) { + sf->rt_sf.prune_idtx_nonrd = 1; + sf->rt_sf.part_early_exit_zeromv = 2; + sf->rt_sf.skip_lf_screen = 1; + sf->rt_sf.nonrd_prune_ref_frame_search = 3; + sf->rt_sf.var_part_split_threshold_shift = 10; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 3; + sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1; + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->rt_sf.nonrd_check_partition_merge_mode = 0; + sf->interp_sf.cb_pred_filter_search = 0; + } + if (speed >= 10) { + if (cm->width * cm->height > 1920 * 1080) + sf->part_sf.disable_8x8_part_based_on_qidx = 1; + sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80; + sf->rt_sf.part_early_exit_zeromv = 1; + sf->rt_sf.nonrd_aggressive_skip = 1; + } + if (speed >= 11) { + sf->rt_sf.skip_lf_screen = 2; + sf->rt_sf.skip_cdef_sb = 2; + sf->rt_sf.part_early_exit_zeromv = 2; + sf->rt_sf.prune_palette_nonrd = 1; + sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2; + sf->rt_sf.increase_color_thresh_palette = 0; + } + sf->rt_sf.use_nonrd_altref_frame = 0; + sf->rt_sf.use_rtc_tf = 0; + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.source_metrics_sb_nonrd = 1; + if (cpi->rc.high_source_sad == 1) { + sf->rt_sf.prefer_large_partition_blocks = 0; + sf->part_sf.max_intra_bsize = BLOCK_128X128; + for (int i = 0; i < BLOCK_SIZES; ++i) { + if (i > BLOCK_32X32) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + else + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; + } + } + if (cpi->rc.max_block_source_sad > 20000 && + cpi->rc.frame_source_sad > 100 && speed >= 6 && + (cpi->rc.percent_blocks_with_motion > 1 || + cpi->svc.last_layer_dropped[0])) { + sf->mv_sf.search_method = NSTEP; + sf->rt_sf.fullpel_search_step_param = 2; + } + sf->rt_sf.partition_direct_merging = 0; + sf->hl_sf.accurate_bit_estimate = 0; + // This feature is for nonrd_pickmode. + if (sf->rt_sf.use_nonrd_pick_mode) + sf->rt_sf.estimate_motion_for_var_based_partition = 1; + else + sf->rt_sf.estimate_motion_for_var_based_partition = 0; + } + if (is_lossless_requested(&cpi->oxcf.rc_cfg)) { + sf->rt_sf.use_rtc_tf = 0; + // TODO(aomedia:3412): The setting accurate_bit_estimate = 0 + // can be removed once it's fixed for lossless mode. + sf->hl_sf.accurate_bit_estimate = 0; + } + if (cpi->oxcf.use_highbitdepth) { + // Disable for use_highbitdepth = 1 to mitigate issue: b/303023614. 
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0; + } + if (cpi->oxcf.superres_cfg.enable_superres) { + sf->rt_sf.use_rtc_tf = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + } +} + +// TODO(kyslov): now this is very similar to +// set_good_speed_features_framesize_independent +// except it sets non-rd flag on speed 8. This function will likely +// be modified in the future with RT-specific speed features. +static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, + SPEED_FEATURES *sf, + int speed) { + AV1_COMMON *const cm = &cpi->common; + const int boosted = frame_is_boosted(cpi); + + // Currently, rt speed 0, 1, 2, 3, 4, 5 are the same. + // Following set of speed features are not impacting encoder's decisions as + // the relevant tools are disabled by default. + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + sf->inter_sf.reuse_inter_intra_mode = 1; + sf->inter_sf.prune_compound_using_single_ref = 0; + sf->inter_sf.prune_comp_search_by_single_result = 2; + sf->inter_sf.prune_comp_type_by_comp_avg = 2; + sf->inter_sf.fast_wedge_sign_estimate = 1; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.disable_interinter_wedge_var_thresh = 100; + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.skip_interp_filter_search = 1; + sf->part_sf.ml_prune_partition = 1; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.prune_ext_partition_types_search_level = 2; + sf->part_sf.less_rectangular_check_level = 2; + sf->mv_sf.obmc_full_pixel_search_level = 1; + sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; + sf->tx_sf.model_based_prune_tx_search_level = 0; + sf->lpf_sf.dual_sgr_penalty_level = 1; + // Disable Wiener and Self-guided Loop restoration filters. + sf->lpf_sf.disable_wiener_filter = true; + sf->lpf_sf.disable_sgr_filter = true; + sf->intra_sf.prune_palette_search_level = 2; + sf->intra_sf.prune_luma_palette_size_search_level = 2; + sf->intra_sf.early_term_chroma_palette_size_search = 1; + + // End of set + + // TODO(any, yunqing): tune these features for real-time use cases. + sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO; + sf->hl_sf.frame_parameter_update = 0; + + sf->inter_sf.model_based_post_interp_filter_breakout = 1; + // TODO(any): As per the experiments, this speed feature is doing redundant + // computation since the model rd based pruning logic is similar to model rd + // based gating when inter_mode_rd_model_estimation = 2. Enable this SF if + // either of the condition becomes true. + // (1) inter_mode_rd_model_estimation != 2 + // (2) skip_interp_filter_search == 0 + // (3) Motion mode or compound mode is enabled */ + sf->inter_sf.prune_mode_search_simple_translation = 0; + sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted; + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + sf->inter_sf.selective_ref_frame = 4; + sf->inter_sf.alt_ref_search_fp = 2; + set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 4); + sf->inter_sf.limit_txfm_eval_per_mode = 3; + + sf->inter_sf.adaptive_rd_thresh = 4; + sf->inter_sf.inter_mode_rd_model_estimation = 2; + sf->inter_sf.prune_inter_modes_if_skippable = 1; + sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3; + sf->inter_sf.reduce_inter_modes = boosted ? 
1 : 3; + sf->inter_sf.skip_newmv_in_drl = 4; + + sf->interp_sf.use_fast_interpolation_filter_search = 1; + sf->interp_sf.use_interp_filter = 1; + sf->interp_sf.adaptive_interp_filter_search = 1; + sf->interp_sf.disable_dual_filter = 1; + + sf->part_sf.default_max_partition_size = BLOCK_128X128; + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.use_best_rd_for_pruning = 1; + sf->part_sf.early_term_after_none_split = 1; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 25); + sf->part_sf.max_intra_bsize = BLOCK_16X16; + sf->part_sf.partition_search_breakout_rate_thr = 500; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->part_sf.adjust_var_based_rd_partitioning = 2; + + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.exhaustive_searches_thresh = INT_MAX; + sf->mv_sf.auto_mv_step_size = 1; + sf->mv_sf.subpel_iters_per_step = 1; + sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS; + sf->mv_sf.search_method = FAST_DIAMOND; + sf->mv_sf.subpel_force_stop = EIGHTH_PEL; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + + for (int i = 0; i < TX_SIZES; ++i) { + sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC; + sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL; + } + sf->intra_sf.skip_intra_in_interframe = 5; + sf->intra_sf.disable_smooth_intra = 1; + sf->intra_sf.skip_filter_intra_in_inter_frames = 1; + + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.tx_size_search_lgr_block = 1; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; + sf->tx_sf.refine_fast_tx_search_results = 0; + sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4; + + sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT; + sf->rd_sf.simple_model_rd_from_var = 1; + sf->rd_sf.tx_domain_dist_level = 2; + sf->rd_sf.tx_domain_dist_thres_level = 2; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + + sf->winner_mode_sf.dc_blk_pred_level = frame_is_intra_only(cm) ? 0 : 3; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; + sf->winner_mode_sf.tx_size_search_level = 1; + sf->winner_mode_sf.winner_mode_ifs = 1; + + sf->rt_sf.check_intra_pred_nonrd = 1; + sf->rt_sf.estimate_motion_for_var_based_partition = 2; + sf->rt_sf.hybrid_intra_pickmode = 1; + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.ref_frame_comp_nonrd[0] = 0; + sf->rt_sf.ref_frame_comp_nonrd[1] = 0; + sf->rt_sf.ref_frame_comp_nonrd[2] = 0; + sf->rt_sf.use_nonrd_filter_search = 1; + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.num_inter_modes_for_tx_search = 5; + sf->rt_sf.prune_inter_modes_using_temp_var = 1; + sf->rt_sf.use_real_time_ref_set = 1; + sf->rt_sf.use_simple_rd_model = 1; + sf->rt_sf.prune_inter_modes_with_golden_ref = boosted ? 0 : 1; + // TODO(any): This sf could be removed. 
+ sf->rt_sf.short_circuit_low_temp_var = 1; + sf->rt_sf.check_scene_detection = 1; + if (cpi->rc.rtc_external_ratectrl) sf->rt_sf.check_scene_detection = 0; + if (cm->current_frame.frame_type != KEY_FRAME && + cpi->oxcf.rc_cfg.mode == AOM_CBR) + sf->rt_sf.overshoot_detection_cbr = FAST_DETECTION_MAXQ; + // Enable noise estimation only for high resolutions for now. + // + // Since use_temporal_noise_estimate has no effect for all-intra frame + // encoding, it is disabled for this case. + if (cpi->oxcf.kf_cfg.key_freq_max != 0 && cm->width * cm->height > 640 * 480) + sf->rt_sf.use_temporal_noise_estimate = 1; + sf->rt_sf.skip_tx_no_split_var_based_partition = 1; + sf->rt_sf.skip_newmv_mode_based_on_sse = 1; + sf->rt_sf.mode_search_skip_flags = + (cm->current_frame.frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; + sf->rt_sf.var_part_split_threshold_shift = 5; + if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 1; + sf->rt_sf.use_fast_fixed_part = 0; + sf->rt_sf.increase_source_sad_thresh = 0; + + if (speed >= 6) { + sf->mv_sf.use_fullpel_costlist = 1; + + sf->rd_sf.tx_domain_dist_thres_level = 3; + + sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 0; + sf->inter_sf.limit_inter_mode_cands = 4; + sf->inter_sf.prune_warped_prob_thresh = 8; + sf->inter_sf.extra_prune_warped = 1; + + sf->rt_sf.gf_refresh_based_on_qp = 1; + sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1; + sf->rt_sf.var_part_split_threshold_shift = 7; + if (!frame_is_intra_only(&cpi->common)) + sf->rt_sf.var_part_based_on_qidx = 2; + + sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 3; + } + + if (speed >= 7) { + sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_1; + sf->rt_sf.use_comp_ref_nonrd = 1; + sf->rt_sf.ref_frame_comp_nonrd[2] = 1; // LAST_ALTREF + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->part_sf.max_intra_bsize = BLOCK_32X32; + + sf->mv_sf.search_method = FAST_DIAMOND; + sf->mv_sf.subpel_force_stop = QUARTER_PEL; + + sf->inter_sf.inter_mode_rd_model_estimation = 2; + // This sf is not applicable in non-rd path. + sf->inter_sf.skip_newmv_in_drl = 0; + + sf->interp_sf.skip_interp_filter_search = 0; + + // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't + // good. May need more study. + for (int i = 0; i < TX_SIZES; ++i) { + sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL; + } + + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL5; + + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + // This is for rd path only. + sf->rt_sf.prune_inter_modes_using_temp_var = 0; + sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 0; + sf->rt_sf.prune_intra_mode_based_on_mv_range = 0; +#if !CONFIG_REALTIME_ONLY + sf->rt_sf.reuse_inter_pred_nonrd = + (cpi->oxcf.motion_mode_cfg.enable_warped_motion == 0); +#else + sf->rt_sf.reuse_inter_pred_nonrd = 1; +#endif +#if CONFIG_AV1_TEMPORAL_DENOISING + sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.noise_sensitivity == 0); +#endif + sf->rt_sf.short_circuit_low_temp_var = 0; + // For spatial layers, only LAST and GOLDEN are currently used in the SVC + // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in the + // get_ref_frame_flags() for some patterns, so disable it here for + // spatial layers. 
+ sf->rt_sf.use_nonrd_altref_frame = + (cpi->svc.number_spatial_layers > 1) ? 0 : 1; + sf->rt_sf.use_nonrd_pick_mode = 1; + sf->rt_sf.nonrd_check_partition_merge_mode = 3; + sf->rt_sf.skip_intra_pred = 1; + sf->rt_sf.source_metrics_sb_nonrd = 1; + // Set mask for intra modes. + for (int i = 0; i < BLOCK_SIZES; ++i) + if (i >= BLOCK_32X32) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + else + // Use DC, H, V intra mode for block sizes < 32X32. + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; + + sf->winner_mode_sf.dc_blk_pred_level = 0; + sf->rt_sf.var_part_based_on_qidx = 3; + sf->rt_sf.prune_compoundmode_with_singlecompound_var = true; + sf->rt_sf.prune_compoundmode_with_singlemode_var = true; + sf->rt_sf.skip_compound_based_on_var = true; + sf->rt_sf.use_adaptive_subpel_search = true; + } + + if (speed >= 8) { + sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_2; + sf->intra_sf.intra_pruning_with_hog = 1; + sf->rt_sf.short_circuit_low_temp_var = 1; + sf->rt_sf.use_nonrd_altref_frame = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 2; + sf->rt_sf.nonrd_check_partition_merge_mode = 0; + sf->rt_sf.var_part_split_threshold_shift = 8; + sf->rt_sf.var_part_based_on_qidx = 4; + sf->rt_sf.partition_direct_merging = 1; + sf->rt_sf.prune_compoundmode_with_singlemode_var = false; + sf->mv_sf.use_bsize_dependent_search_method = 2; + sf->rt_sf.prune_hv_pred_modes_using_src_sad = true; + } + if (speed >= 9) { + sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_3; + sf->rt_sf.estimate_motion_for_var_based_partition = 3; + sf->rt_sf.prefer_large_partition_blocks = 3; + sf->rt_sf.skip_intra_pred = 2; + sf->rt_sf.var_part_split_threshold_shift = 9; + for (int i = 0; i < BLOCK_SIZES; ++i) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + sf->rt_sf.var_part_based_on_qidx = 0; + sf->rt_sf.frame_level_mode_cost_update = true; + sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true; + sf->rt_sf.reduce_mv_pel_precision_highmotion = 0; + sf->rt_sf.use_adaptive_subpel_search = true; + sf->mv_sf.use_bsize_dependent_search_method = 0; + } + if (speed >= 10) { + sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4; + sf->rt_sf.nonrd_prune_ref_frame_search = 3; + sf->rt_sf.var_part_split_threshold_shift = 10; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + } + if (speed >= 11 && !frame_is_intra_only(cm) && + cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + sf->winner_mode_sf.dc_blk_pred_level = 3; + } +} + +static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { + // best quality defaults + hl_sf->frame_parameter_update = 1; + hl_sf->recode_loop = ALLOW_RECODE; + // Recode loop tolerance %. 
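+  // i.e. roughly the percentage by which a frame may miss its rate target
+  // before the recode loop re-encodes it; e.g. with a 100 kbit target, sizes
+  // within about [75, 125] kbit pass at this default of 25 (the good-quality
+  // speed presets above relax this up to 55).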
+ hl_sf->recode_tolerance = 25; + hl_sf->high_precision_mv_usage = CURRENT_Q; + hl_sf->superres_auto_search_type = SUPERRES_AUTO_ALL; + hl_sf->disable_extra_sc_testing = 0; + hl_sf->second_alt_ref_filtering = 1; + hl_sf->adjust_num_frames_for_arf_filtering = 0; + hl_sf->accurate_bit_estimate = 0; + hl_sf->weight_calc_level_in_tf = 0; + hl_sf->allow_sub_blk_me_in_tf = 0; +} + +static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) { + fp_sf->reduce_mv_step_param = 3; + fp_sf->skip_motion_search_threshold = 0; + fp_sf->disable_recon = 0; + fp_sf->skip_zeromv_motion_search = 0; +} + +static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { + tpl_sf->gop_length_decision_method = 0; + tpl_sf->prune_intra_modes = 0; + tpl_sf->prune_starting_mv = 0; + tpl_sf->reduce_first_step_size = 0; + tpl_sf->skip_alike_starting_mv = 0; + tpl_sf->subpel_force_stop = EIGHTH_PEL; + tpl_sf->search_method = NSTEP; + tpl_sf->prune_ref_frames_in_tpl = 0; + tpl_sf->allow_compound_pred = 1; + tpl_sf->use_y_only_rate_distortion = 0; + tpl_sf->use_sad_for_mode_decision = 0; + tpl_sf->reduce_num_frames = 0; +} + +static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { + gm_sf->gm_search_type = GM_FULL_SEARCH; + gm_sf->prune_ref_frame_for_gm_search = 0; + gm_sf->prune_zero_mv_with_sse = 0; + gm_sf->disable_gm_search_based_on_stats = 0; + gm_sf->num_refinement_steps = GM_MAX_REFINEMENT_STEPS; +} + +static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { + part_sf->partition_search_type = SEARCH_PARTITION; + part_sf->less_rectangular_check_level = 0; + part_sf->use_square_partition_only_threshold = BLOCK_128X128; + part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; + part_sf->default_max_partition_size = BLOCK_LARGEST; + part_sf->default_min_partition_size = BLOCK_4X4; + part_sf->adjust_var_based_rd_partitioning = 0; + part_sf->max_intra_bsize = BLOCK_LARGEST; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + part_sf->fixed_partition_size = BLOCK_16X16; + // Recode loop tolerance %. + part_sf->partition_search_breakout_dist_thr = 0; + part_sf->partition_search_breakout_rate_thr = 0; + part_sf->prune_ext_partition_types_search_level = 0; + part_sf->prune_part4_search = 0; + part_sf->ml_prune_partition = 0; + part_sf->ml_early_term_after_part_split_level = 0; + for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) { + part_sf->ml_partition_search_breakout_thresh[i] = + -1; // -1 means not enabled. 
+ } + part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0; + part_sf->simple_motion_search_split = 0; + part_sf->simple_motion_search_prune_rect = 0; + part_sf->simple_motion_search_early_term_none = 0; + part_sf->simple_motion_search_reduce_search_steps = 0; + part_sf->intra_cnn_based_part_prune_level = 0; + part_sf->ext_partition_eval_thresh = BLOCK_8X8; + part_sf->rect_partition_eval_thresh = BLOCK_128X128; + part_sf->ext_part_eval_based_on_cur_best = 0; + part_sf->prune_ext_part_using_split_info = 0; + part_sf->prune_rectangular_split_based_on_qidx = 0; + part_sf->prune_rect_part_using_4x4_var_deviation = false; + part_sf->prune_rect_part_using_none_pred_mode = false; + part_sf->early_term_after_none_split = 0; + part_sf->ml_predict_breakout_level = 0; + part_sf->prune_sub_8x8_partition_level = 0; + part_sf->simple_motion_search_rect_split = 0; + part_sf->reuse_prev_rd_results_for_part_ab = 0; + part_sf->reuse_best_prediction_for_part_ab = 0; + part_sf->use_best_rd_for_pruning = 0; + part_sf->skip_non_sq_part_based_on_none = 0; + part_sf->disable_8x8_part_based_on_qidx = 0; +} + +static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { + mv_sf->full_pixel_search_level = 0; + mv_sf->auto_mv_step_size = 0; + mv_sf->exhaustive_searches_thresh = 0; + mv_sf->obmc_full_pixel_search_level = 0; + mv_sf->prune_mesh_search = PRUNE_MESH_SEARCH_DISABLED; + mv_sf->reduce_search_range = 0; + mv_sf->search_method = NSTEP; + mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL; + mv_sf->subpel_force_stop = EIGHTH_PEL; + mv_sf->subpel_iters_per_step = 2; + mv_sf->subpel_search_method = SUBPEL_TREE; + mv_sf->use_accurate_subpel_search = USE_8_TAPS; + mv_sf->use_bsize_dependent_search_method = 0; + mv_sf->use_fullpel_costlist = 0; + mv_sf->use_downsampled_sad = 0; + mv_sf->disable_extensive_joint_motion_search = 0; + mv_sf->disable_second_mv = 0; + mv_sf->skip_fullpel_search_using_startmv = 0; + mv_sf->warp_search_method = WARP_SEARCH_SQUARE; + mv_sf->warp_search_iters = 8; + mv_sf->use_intrabc = 1; +} + +static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { + inter_sf->adaptive_rd_thresh = 0; + inter_sf->model_based_post_interp_filter_breakout = 0; + inter_sf->reduce_inter_modes = 0; + inter_sf->alt_ref_search_fp = 0; + inter_sf->prune_single_ref = 0; + inter_sf->prune_comp_ref_frames = 0; + inter_sf->selective_ref_frame = 0; + inter_sf->prune_ref_frame_for_rect_partitions = 0; + inter_sf->fast_wedge_sign_estimate = 0; + inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED; + inter_sf->reuse_inter_intra_mode = 0; + inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB; + inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB; + inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB; + inter_sf->prune_inter_modes_based_on_tpl = 0; + inter_sf->prune_nearmv_using_neighbors = PRUNE_NEARMV_OFF; + inter_sf->prune_comp_search_by_single_result = 0; + inter_sf->skip_repeated_ref_mv = 0; + inter_sf->skip_newmv_in_drl = 0; + inter_sf->inter_mode_rd_model_estimation = 0; + inter_sf->prune_compound_using_single_ref = 0; + inter_sf->prune_ext_comp_using_neighbors = 0; + inter_sf->skip_ext_comp_nearmv_mode = 0; + inter_sf->prune_comp_using_best_single_mode_ref = 0; + inter_sf->prune_nearest_near_mv_using_refmv_weight = 0; + inter_sf->disable_onesided_comp = 0; + inter_sf->prune_mode_search_simple_translation = 0; + inter_sf->prune_comp_type_by_comp_avg = 0; + inter_sf->disable_interinter_wedge_newmv_search = 0; + inter_sf->fast_interintra_wedge_search = 0; + 
inter_sf->prune_comp_type_by_model_rd = 0; + inter_sf->perform_best_rd_based_gating_for_chroma = 0; + inter_sf->prune_obmc_prob_thresh = 0; + inter_sf->disable_interinter_wedge_var_thresh = 0; + inter_sf->disable_interintra_wedge_var_thresh = 0; + inter_sf->prune_ref_mv_idx_search = 0; + inter_sf->prune_warped_prob_thresh = 0; + inter_sf->reuse_compound_type_decision = 0; + inter_sf->prune_inter_modes_if_skippable = 0; + inter_sf->disable_masked_comp = 0; + inter_sf->enable_fast_compound_mode_search = 0; + inter_sf->reuse_mask_search_results = 0; + inter_sf->enable_fast_wedge_mask_search = 0; + inter_sf->inter_mode_txfm_breakout = 0; + inter_sf->limit_inter_mode_cands = 0; + inter_sf->limit_txfm_eval_per_mode = 0; + inter_sf->skip_arf_compound = 0; + set_txfm_rd_gate_level(inter_sf->txfm_rd_gate_level, 0); +} + +static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { + interp_sf->adaptive_interp_filter_search = 0; + interp_sf->cb_pred_filter_search = 0; + interp_sf->disable_dual_filter = 0; + interp_sf->skip_sharp_interp_filter_search = 0; + interp_sf->use_fast_interpolation_filter_search = 0; + interp_sf->use_interp_filter = 0; + interp_sf->skip_interp_filter_search = 0; +} + +static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { + intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB; + intra_sf->skip_intra_in_interframe = 1; + intra_sf->intra_pruning_with_hog = 0; + intra_sf->chroma_intra_pruning_with_hog = 0; + intra_sf->prune_palette_search_level = 0; + intra_sf->prune_luma_palette_size_search_level = 0; + + for (int i = 0; i < TX_SIZES; i++) { + intra_sf->intra_y_mode_mask[i] = INTRA_ALL; + intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; + } + intra_sf->disable_smooth_intra = 0; + intra_sf->prune_smooth_intra_mode_for_chroma = 0; + intra_sf->prune_filter_intra_level = 0; + intra_sf->prune_chroma_modes_using_luma_winner = 0; + intra_sf->cfl_search_range = 3; + intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT; + intra_sf->adapt_top_model_rd_count_using_neighbors = 0; + intra_sf->early_term_chroma_palette_size_search = 0; + intra_sf->skip_filter_intra_in_inter_frames = 0; + intra_sf->prune_luma_odd_delta_angles_in_intra = 0; +} + +static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { + tx_sf->inter_tx_size_search_init_depth_sqr = 0; + tx_sf->inter_tx_size_search_init_depth_rect = 0; + tx_sf->intra_tx_size_search_init_depth_rect = 0; + tx_sf->intra_tx_size_search_init_depth_sqr = 0; + tx_sf->tx_size_search_lgr_block = 0; + tx_sf->model_based_prune_tx_search_level = 0; + tx_sf->tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_1; + tx_sf->tx_type_search.ml_tx_split_thresh = 8500; + tx_sf->tx_type_search.use_skip_flag_prediction = 1; + tx_sf->tx_type_search.use_reduced_intra_txset = 0; + tx_sf->tx_type_search.fast_intra_tx_type_search = 0; + tx_sf->tx_type_search.fast_inter_tx_type_prob_thresh = INT_MAX; + tx_sf->tx_type_search.skip_tx_search = 0; + tx_sf->tx_type_search.prune_tx_type_using_stats = 0; + tx_sf->tx_type_search.prune_tx_type_est_rd = 0; + tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0; + tx_sf->txb_split_cap = 1; + tx_sf->adaptive_txb_search_level = 0; + tx_sf->refine_fast_tx_search_results = 1; + tx_sf->prune_tx_size_level = 0; + tx_sf->prune_intra_tx_depths_using_nn = false; + tx_sf->use_rd_based_breakout_for_intra_tx_search = false; +} + +static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf, + const AV1EncoderConfig *oxcf) { + const int disable_trellis_quant = 
oxcf->algo_cfg.disable_trellis_quant; + if (disable_trellis_quant == 3) { + rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg) + ? NO_ESTIMATE_YRD_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (disable_trellis_quant == 2) { + rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg) + ? FINAL_PASS_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (disable_trellis_quant == 0) { + if (is_lossless_requested(&oxcf->rc_cfg)) { + rd_sf->optimize_coefficients = NO_TRELLIS_OPT; + } else { + rd_sf->optimize_coefficients = FULL_TRELLIS_OPT; + } + } else if (disable_trellis_quant == 1) { + rd_sf->optimize_coefficients = NO_TRELLIS_OPT; + } else { + assert(0 && "Invalid disable_trellis_quant value"); + } + rd_sf->use_mb_rd_hash = 0; + rd_sf->simple_model_rd_from_var = 0; + rd_sf->tx_domain_dist_level = 0; + rd_sf->tx_domain_dist_thres_level = 0; + rd_sf->perform_coeff_opt = 0; +} + +static AOM_INLINE void init_winner_mode_sf( + WINNER_MODE_SPEED_FEATURES *winner_mode_sf) { + winner_mode_sf->motion_mode_for_winner_cand = 0; + // Set this at the appropriate speed levels + winner_mode_sf->tx_size_search_level = 0; + winner_mode_sf->enable_winner_mode_for_coeff_opt = 0; + winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0; + winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0; + winner_mode_sf->multi_winner_mode_type = 0; + winner_mode_sf->dc_blk_pred_level = 0; + winner_mode_sf->winner_mode_ifs = 0; + winner_mode_sf->prune_winner_mode_eval_level = 0; +} + +static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) { + lpf_sf->disable_loop_restoration_chroma = 0; + lpf_sf->disable_loop_restoration_luma = 0; + lpf_sf->min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE; + lpf_sf->max_lr_unit_size = RESTORATION_UNITSIZE_MAX; + lpf_sf->prune_wiener_based_on_src_var = 0; + lpf_sf->prune_sgr_based_on_wiener = 0; + lpf_sf->enable_sgr_ep_pruning = 0; + lpf_sf->reduce_wiener_window_size = 0; + lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; + lpf_sf->use_coarse_filter_level_search = 0; + lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH; + // Set decoder side speed feature to use less dual sgr modes + lpf_sf->dual_sgr_penalty_level = 0; + // Enable Wiener and Self-guided Loop restoration filters by default. 
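+  // Note that several speed paths in this file set both disable flags back to
+  // true (e.g. the realtime defaults and the multi-threaded speed >= 5 case),
+  // and av1_set_speed_features_framesize_independent() then clears
+  // seq_params->enable_restoration, which stays set only while at least one
+  // of the two filters remains enabled.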
+ lpf_sf->disable_wiener_filter = false; + lpf_sf->disable_sgr_filter = false; + lpf_sf->disable_wiener_coeff_refine_search = false; + lpf_sf->use_downsampled_wiener_stats = 0; +} + +static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { + rt_sf->check_intra_pred_nonrd = 0; + rt_sf->skip_intra_pred = 0; + rt_sf->estimate_motion_for_var_based_partition = 0; + rt_sf->nonrd_check_partition_merge_mode = 0; + rt_sf->nonrd_check_partition_split = 0; + rt_sf->mode_search_skip_flags = 0; + rt_sf->nonrd_prune_ref_frame_search = 0; + rt_sf->use_nonrd_pick_mode = 0; + rt_sf->use_nonrd_altref_frame = 0; + rt_sf->use_comp_ref_nonrd = 0; + rt_sf->use_real_time_ref_set = 0; + rt_sf->short_circuit_low_temp_var = 0; + rt_sf->reuse_inter_pred_nonrd = 0; + rt_sf->num_inter_modes_for_tx_search = INT_MAX; + rt_sf->use_nonrd_filter_search = 0; + rt_sf->use_simple_rd_model = 0; + rt_sf->hybrid_intra_pickmode = 0; + rt_sf->source_metrics_sb_nonrd = 0; + rt_sf->overshoot_detection_cbr = NO_DETECTION; + rt_sf->check_scene_detection = 0; + rt_sf->prefer_large_partition_blocks = 0; + rt_sf->use_temporal_noise_estimate = 0; + rt_sf->fullpel_search_step_param = 0; + for (int i = 0; i < BLOCK_SIZES; ++i) + rt_sf->intra_y_mode_bsize_mask_nrd[i] = INTRA_ALL; + rt_sf->prune_hv_pred_modes_using_src_sad = false; + rt_sf->nonrd_aggressive_skip = 0; + rt_sf->skip_cdef_sb = 0; + rt_sf->force_large_partition_blocks_intra = 0; + rt_sf->skip_tx_no_split_var_based_partition = 0; + rt_sf->skip_newmv_mode_based_on_sse = 0; + rt_sf->gf_length_lvl = 0; + rt_sf->prune_inter_modes_with_golden_ref = 0; + rt_sf->prune_inter_modes_wrt_gf_arf_based_on_sad = 0; + rt_sf->prune_inter_modes_using_temp_var = 0; + rt_sf->reduce_mv_pel_precision_highmotion = 0; + rt_sf->reduce_mv_pel_precision_lowcomplex = 0; + rt_sf->prune_intra_mode_based_on_mv_range = 0; + rt_sf->var_part_split_threshold_shift = 7; + rt_sf->gf_refresh_based_on_qp = 0; + rt_sf->use_rtc_tf = 0; + rt_sf->prune_idtx_nonrd = 0; + rt_sf->prune_palette_nonrd = 0; + rt_sf->dct_only_palette_nonrd = 0; + rt_sf->part_early_exit_zeromv = 0; + rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED; + rt_sf->skip_lf_screen = 0; + rt_sf->sad_based_adp_altref_lag = 0; + rt_sf->partition_direct_merging = 0; + rt_sf->var_part_based_on_qidx = 0; + rt_sf->tx_size_level_based_on_qstep = 0; + rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false; + rt_sf->prune_compoundmode_with_singlecompound_var = false; + rt_sf->frame_level_mode_cost_update = false; + rt_sf->prune_h_pred_using_best_mode_so_far = false; + rt_sf->enable_intra_mode_pruning_using_neighbors = false; + rt_sf->prune_intra_mode_using_best_sad_so_far = false; + rt_sf->check_only_zero_zeromv_on_large_blocks = false; + rt_sf->disable_cdf_update_non_reference_frame = false; + rt_sf->prune_compoundmode_with_singlemode_var = false; + rt_sf->skip_compound_based_on_var = false; + rt_sf->set_zeromv_skip_based_on_source_sad = 1; + rt_sf->use_adaptive_subpel_search = false; + rt_sf->screen_content_cdef_filter_qindex_thresh = 0; + rt_sf->enable_ref_short_signaling = false; + rt_sf->check_globalmv_on_single_ref = true; + rt_sf->increase_color_thresh_palette = false; + rt_sf->selective_cdf_update = 0; + rt_sf->force_only_last_ref = 0; +} + +static fractional_mv_step_fp + *const fractional_mv_search[SUBPEL_SEARCH_METHODS] = { + av1_find_best_sub_pixel_tree, // SUBPEL_TREE = 0 + av1_find_best_sub_pixel_tree_pruned, // SUBPEL_TREE_PRUNED = 1 + av1_find_best_sub_pixel_tree_pruned_more // SUBPEL_TREE_PRUNED_MORE = 2 + }; + +// 
Populate appropriate sub-pel search method based on speed feature and user +// specified settings +static void set_subpel_search_method( + MotionVectorSearchParams *mv_search_params, + unsigned int motion_vector_unit_test, + SUBPEL_SEARCH_METHOD subpel_search_method) { + assert(subpel_search_method <= SUBPEL_TREE_PRUNED_MORE); + mv_search_params->find_fractional_mv_step = + fractional_mv_search[subpel_search_method]; + + // This is only used in motion vector unit test. + if (motion_vector_unit_test == 1) + mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv; + else if (motion_vector_unit_test == 2) + mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv; +} + +void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { + SPEED_FEATURES *const sf = &cpi->sf; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + switch (oxcf->mode) { + case GOOD: + set_good_speed_feature_framesize_dependent(cpi, sf, speed); + break; + case ALLINTRA: + set_allintra_speed_feature_framesize_dependent(cpi, sf, speed); + break; + case REALTIME: + set_rt_speed_feature_framesize_dependent(cpi, sf, speed); + break; + } + + if (!cpi->ppi->seq_params_locked) { + cpi->common.seq_params->enable_masked_compound &= + !sf->inter_sf.disable_masked_comp; + cpi->common.seq_params->enable_interintra_compound &= + (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); + } + + set_subpel_search_method(&cpi->mv_search_params, + cpi->oxcf.unit_test_cfg.motion_vector_unit_test, + sf->mv_sf.subpel_search_method); + + // For multi-thread use case with row_mt enabled, cost update for a set of + // SB rows is not desirable. Hence, the sf mv_cost_upd_level is set to + // INTERNAL_COST_UPD_SBROW in such cases. + if ((cpi->oxcf.row_mt == 1) && (cpi->mt_info.num_workers > 1)) { + if (sf->inter_sf.mv_cost_upd_level == INTERNAL_COST_UPD_SBROW_SET) { + // Set mv_cost_upd_level to use row level update. + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; + } + } +} + +void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { + SPEED_FEATURES *const sf = &cpi->sf; + WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + int i; + + init_hl_sf(&sf->hl_sf); + init_fp_sf(&sf->fp_sf); + init_tpl_sf(&sf->tpl_sf); + init_gm_sf(&sf->gm_sf); + init_part_sf(&sf->part_sf); + init_mv_sf(&sf->mv_sf); + init_inter_sf(&sf->inter_sf); + init_interp_sf(&sf->interp_sf); + init_intra_sf(&sf->intra_sf); + init_tx_sf(&sf->tx_sf); + init_rd_sf(&sf->rd_sf, oxcf); + init_winner_mode_sf(&sf->winner_mode_sf); + init_lpf_sf(&sf->lpf_sf); + init_rt_sf(&sf->rt_sf); + + switch (oxcf->mode) { + case GOOD: + set_good_speed_features_framesize_independent(cpi, sf, speed); + break; + case ALLINTRA: + set_allintra_speed_features_framesize_independent(cpi, sf, speed); + break; + case REALTIME: + set_rt_speed_features_framesize_independent(cpi, sf, speed); + break; + } + + // Note: when use_nonrd_pick_mode is true, the transform size is the + // minimum of 16x16 and the largest possible size of the current block, + // which conflicts with the speed feature "enable_tx_size_search". + if (!oxcf->txfm_cfg.enable_tx_size_search && + sf->rt_sf.use_nonrd_pick_mode == 0) { + sf->winner_mode_sf.tx_size_search_level = 3; + } + + if (cpi->mt_info.num_workers > 1) { + // Loop restoration stage is conditionally disabled for speed 5, 6 when + // num_workers > 1. 
Since av1_pick_filter_restoration() is not + // multi-threaded, enabling the Loop restoration stage will cause an + // increase in encode time (3% to 7% increase depends on frame + // resolution). + // TODO(aomedia:3446): Implement multi-threading of + // av1_pick_filter_restoration() and enable Wiener filter for speed 5, 6 + // similar to single thread encoding path. + if (speed >= 5) { + sf->lpf_sf.disable_sgr_filter = true; + sf->lpf_sf.disable_wiener_filter = true; + } + } + + if (!cpi->ppi->seq_params_locked) { + cpi->common.seq_params->order_hint_info.enable_dist_wtd_comp &= + (sf->inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); + cpi->common.seq_params->enable_dual_filter &= + !sf->interp_sf.disable_dual_filter; + // Set the flag 'enable_restoration', if one the Loop restoration filters + // (i.e., Wiener or Self-guided) is enabled. + cpi->common.seq_params->enable_restoration &= + (!sf->lpf_sf.disable_wiener_filter || !sf->lpf_sf.disable_sgr_filter); + + cpi->common.seq_params->enable_interintra_compound &= + (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); + } + + const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED); + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mv_sf.mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_speed][i].range; + sf->mv_sf.mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_speed][i].interval; + } + + // Update the mesh pattern of exhaustive motion search for intraBC + // Though intraBC mesh pattern is populated for all frame types, it is used + // only for intra frames of screen contents + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mv_sf.intrabc_mesh_patterns[i].range = + intrabc_mesh_patterns[mesh_speed][i].range; + sf->mv_sf.intrabc_mesh_patterns[i].interval = + intrabc_mesh_patterns[mesh_speed][i].interval; + } + + // Slow quant, dct and trellis not worthwhile for first pass + // so make sure they are always turned off. + if (is_stat_generation_stage(cpi)) + sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT; + + // No recode for 1 pass. 
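+  // Combined with the settings above this leaves three effective recode
+  // modes: ALLOW_RECODE (best-quality default), ALLOW_RECODE_KFARFGF (recode
+  // restricted to key/ARF/GF frames, used from speed 2 in the good-quality
+  // path) and DISALLOW_RECODE for single-pass encodes without first-pass
+  // stats.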
+ if (oxcf->pass == AOM_RC_ONE_PASS && has_no_stats_stage(cpi)) + sf->hl_sf.recode_loop = DISALLOW_RECODE; + + set_subpel_search_method(&cpi->mv_search_params, + cpi->oxcf.unit_test_cfg.motion_vector_unit_test, + sf->mv_sf.subpel_search_method); + + // assert ensures that tx_domain_dist_level is accessed correctly + assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 && + cpi->sf.rd_sf.tx_domain_dist_thres_level < 4); + memcpy(winner_mode_params->tx_domain_dist_threshold, + tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level], + sizeof(winner_mode_params->tx_domain_dist_threshold)); + + assert(cpi->sf.rd_sf.tx_domain_dist_level >= 0 && + cpi->sf.rd_sf.tx_domain_dist_level < TX_DOMAIN_DIST_LEVELS); + memcpy(winner_mode_params->use_transform_domain_distortion, + tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level], + sizeof(winner_mode_params->use_transform_domain_distortion)); + + // assert ensures that coeff_opt_thresholds is accessed correctly + assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 && + cpi->sf.rd_sf.perform_coeff_opt < 9); + memcpy(winner_mode_params->coeff_opt_thresholds, + &coeff_opt_thresholds[cpi->sf.rd_sf.perform_coeff_opt], + sizeof(winner_mode_params->coeff_opt_thresholds)); + + // assert ensures that predict_skip_levels is accessed correctly + assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 && + cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3); + memcpy(winner_mode_params->skip_txfm_level, + predict_skip_levels[cpi->sf.tx_sf.tx_type_search + .use_skip_flag_prediction], + sizeof(winner_mode_params->skip_txfm_level)); + + // assert ensures that tx_size_search_level is accessed correctly + assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 && + cpi->sf.winner_mode_sf.tx_size_search_level <= 3); + memcpy(winner_mode_params->tx_size_search_methods, + tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level], + sizeof(winner_mode_params->tx_size_search_methods)); + memcpy(winner_mode_params->predict_dc_level, + predict_dc_levels[cpi->sf.winner_mode_sf.dc_blk_pred_level], + sizeof(winner_mode_params->predict_dc_level)); + + if (cpi->oxcf.row_mt == 1 && (cpi->mt_info.num_workers > 1)) { + if (sf->inter_sf.inter_mode_rd_model_estimation == 1) { + // Revert to type 2 + sf->inter_sf.inter_mode_rd_model_estimation = 2; + } + +#if !CONFIG_FPMT_TEST + // Disable the speed feature 'prune_ref_frame_for_gm_search' to achieve + // better parallelism when number of threads available are greater than or + // equal to maximum number of reference frames allowed for global motion. + if (sf->gm_sf.gm_search_type != GM_DISABLE_SEARCH && + (cpi->mt_info.num_workers >= + gm_available_reference_frames[sf->gm_sf.gm_search_type])) + sf->gm_sf.prune_ref_frame_for_gm_search = 0; +#endif + } + + // This only applies to the real time mode. Adaptive gf refresh is disabled if + // gf_cbr_boost_pct that is set by the user is larger than 0. 
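+  // For example, an application that asks for a golden-frame bitrate boost in
+  // CBR mode, e.g. with
+  //   aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 50);
+  // (which populates rc_cfg.gf_cbr_boost_pct), thereby also opts out of
+  // QP-based adaptive GF refresh here.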
+ if (cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 0) + sf->rt_sf.gf_refresh_based_on_qp = 0; +} + +// Override some speed features based on qindex +void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { + AV1_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; + const int boosted = frame_is_boosted(cpi); + const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; + const int is_1440p_or_larger = AOMMIN(cm->width, cm->height) >= 1440; + const int is_arf2_bwd_type = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; + + if (cpi->oxcf.mode == REALTIME) { + if (speed >= 6) { + const int qindex_thresh = boosted ? 190 : (is_720p_or_larger ? 120 : 150); + sf->part_sf.adjust_var_based_rd_partitioning = + frame_is_intra_only(cm) + ? 0 + : cm->quant_params.base_qindex > qindex_thresh; + } + return; + } + + if (speed == 0) { + // qindex_thresh for resolution < 720p + const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140); + if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) { + sf->part_sf.simple_motion_search_split = + cm->features.allow_screen_content_tools ? 1 : 2; + sf->part_sf.simple_motion_search_early_term_none = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + } + + if (is_720p_or_larger && cm->quant_params.base_qindex <= 128) { + sf->rd_sf.perform_coeff_opt = 2 + is_1080p_or_larger; + memcpy(winner_mode_params->coeff_opt_thresholds, + &coeff_opt_thresholds[sf->rd_sf.perform_coeff_opt], + sizeof(winner_mode_params->coeff_opt_thresholds)); + sf->part_sf.simple_motion_search_split = + cm->features.allow_screen_content_tools ? 1 : 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + + if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) { + sf->inter_sf.selective_ref_frame = 2; + sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2; + sf->rd_sf.tx_domain_dist_thres_level = 1; + sf->part_sf.simple_motion_search_early_term_none = 1; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->interp_sf.cb_pred_filter_search = 0; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + } + } + } + + if (speed >= 2) { + // Disable extended partitions for lower quantizers + const int aggr = AOMMIN(4, speed - 2); + const int qindex_thresh1[4] = { 50, 50, 80, 100 }; + const int qindex_thresh2[4] = { 80, 100, 120, 160 }; + int qindex_thresh; + if (aggr <= 1) { + const int qthresh2 = + (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr]; + qindex_thresh = cm->features.allow_screen_content_tools + ? qindex_thresh1[aggr] + : qthresh2; + if (cm->quant_params.base_qindex <= qindex_thresh && !boosted) + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } else if (aggr <= 2) { + qindex_thresh = boosted ? 
qindex_thresh1[aggr] : qindex_thresh2[aggr]; + if (cm->quant_params.base_qindex <= qindex_thresh && + !frame_is_intra_only(cm)) + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } else if (aggr <= 3) { + if (!is_480p_or_larger) { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } else if (!is_720p_or_larger && !frame_is_intra_only(cm) && + !cm->features.allow_screen_content_tools) { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } else { + qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr]; + if (cm->quant_params.base_qindex <= qindex_thresh && + !frame_is_intra_only(cm)) + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } + } else { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } + } + + if (speed >= 4) { + // Disable rectangular partitions for lower quantizers + const int aggr = AOMMIN(1, speed - 4); + const int qindex_thresh[2] = { 65, 80 }; + int disable_rect_part; + disable_rect_part = !boosted; + if (cm->quant_params.base_qindex <= qindex_thresh[aggr] && + disable_rect_part && is_480p_or_larger) { + sf->part_sf.rect_partition_eval_thresh = BLOCK_8X8; + } + } + + if (speed <= 2) { + if (!is_stat_generation_stage(cpi)) { + // Use faster full-pel motion search for high quantizers. + // Also use reduced total search range for low resolutions at high + // quantizers. + const int aggr = speed; + const int qindex_thresh1 = ms_qindex_thresh[aggr][is_720p_or_larger][0]; + const int qindex_thresh2 = ms_qindex_thresh[aggr][is_720p_or_larger][1]; + const SEARCH_METHODS search_method = + motion_search_method[is_720p_or_larger]; + if (cm->quant_params.base_qindex > qindex_thresh1) { + sf->mv_sf.search_method = search_method; + sf->tpl_sf.search_method = search_method; + } else if (cm->quant_params.base_qindex > qindex_thresh2) { + sf->mv_sf.search_method = NSTEP_8PT; + } + } + } + + if (speed >= 4) { + // Disable LR search at low and high quantizers and enable only for + // mid-quantizer range. + if (!boosted && !is_arf2_bwd_type) { + const int qindex_low[2] = { 100, 60 }; + const int qindex_high[2] = { 180, 160 }; + if (cm->quant_params.base_qindex <= qindex_low[is_720p_or_larger] || + cm->quant_params.base_qindex > qindex_high[is_720p_or_larger]) { + sf->lpf_sf.disable_loop_restoration_luma = 1; + } + } + } + + if (speed == 1) { + // Reuse interinter wedge mask search from first search for non-boosted + // non-internal-arf frames, except at very high quantizers. + if (cm->quant_params.base_qindex <= 200) { + if (!boosted && !is_arf2_bwd_type) + sf->inter_sf.reuse_mask_search_results = 1; + } + } + + if (speed == 5) { + if (!(frame_is_intra_only(&cpi->common) || + cm->features.allow_screen_content_tools)) { + const int qindex[2] = { 256, 128 }; + // Set the sf value as 3 for low resolution and + // for higher resolutions with low quantizers. + if (cm->quant_params.base_qindex < qindex[is_480p_or_larger]) + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3; + } + } + + if (speed >= 5) { + // Disable the sf for low quantizers in case of low resolution screen + // contents. 
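+    // As an illustration (example values, not a tuned threshold): a 640x360
+    // screen-content clip encoded at base_qindex 100 satisfies the condition
+    // below, so sub-8x8 partition pruning is switched off for it.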
+ if (cm->features.allow_screen_content_tools && + cm->quant_params.base_qindex < 128 && is_480p_or_lesser) { + sf->part_sf.prune_sub_8x8_partition_level = 0; + } + } + + // Loop restoration size search + // At speed 0, always search all available sizes for the maximum possible gain + sf->lpf_sf.min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE; + sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX; + + if (speed >= 1) { + // For large frames, small restoration units are almost never useful, + // so prune them away + if (is_1440p_or_larger) { + sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX; + } else if (is_720p_or_larger) { + sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1; + } + } + + if (speed >= 3 || (cpi->oxcf.mode == ALLINTRA && speed >= 1)) { + // At this speed, a full search is too expensive. Instead, pick a single + // size based on size and qindex. Note that, in general, higher quantizers + // (== lower quality) and larger frames generally want to use larger + // restoration units. + int qindex_thresh = 96; + if (cm->quant_params.base_qindex <= qindex_thresh && !is_1440p_or_larger) { + sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1; + sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1; + } else { + sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX; + sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX; + } + } + + set_subpel_search_method(&cpi->mv_search_params, + cpi->oxcf.unit_test_cfg.motion_vector_unit_test, + sf->mv_sf.subpel_search_method); +} diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h new file mode 100644 index 0000000000..60c000e4f4 --- /dev/null +++ b/third_party/aom/av1/encoder/speed_features.h @@ -0,0 +1,2025 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_ +#define AOM_AV1_ENCODER_SPEED_FEATURES_H_ + +#include "av1/common/enums.h" +#include "av1/encoder/enc_enums.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/encodemb.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! 
@file */ + +/*!\cond */ +#define MAX_MESH_STEP 4 + +typedef struct MESH_PATTERN { + int range; + int interval; +} MESH_PATTERN; + +enum { + GM_FULL_SEARCH, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2, + + // Same as GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2 but with extra filtering + // to keep at most two ref frames + GM_SEARCH_CLOSEST_REFS_ONLY, + + GM_DISABLE_SEARCH +} UENUM1BYTE(GM_SEARCH_TYPE); + +enum { + DIST_WTD_COMP_ENABLED, + DIST_WTD_COMP_SKIP_MV_SEARCH, + DIST_WTD_COMP_DISABLED, +} UENUM1BYTE(DIST_WTD_COMP_FLAG); + +enum { + INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | + (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) | + (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) | + (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED), + UV_INTRA_ALL = + (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) | + (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) | + (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) | + (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC = (1 << UV_DC_PRED), + UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED), + UV_INTRA_DC_PAETH_CFL = + (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | + (1 << UV_H_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_CFL_PRED), + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED), + INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_H_V_SMOOTH = + (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << SMOOTH_PRED), + INTRA_DC_PAETH_H_V = + (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED) +}; + +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | + (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | + (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV), + INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | + (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | + (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | + (1 << NEAR_NEARMV), + INTER_SINGLE_ALL = + (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEWMV), +}; + +enum { + DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST), + + DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT, + + DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA), + + LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) +}; + +enum { + TXFM_CODING_SF = 1, + INTER_PRED_SF = 2, + INTRA_PRED_SF = 4, + PARTITION_SF = 8, + LOOP_FILTER_SF = 16, + RD_SKIP_SF = 32, + RESERVE_2_SF = 64, + RESERVE_3_SF = 128, +} UENUM1BYTE(DEV_SPEED_FEATURES); + +/* This enumeration defines when the rate control recode loop will be + * enabled. 
+ */ +enum { + /* + * No recodes allowed + */ + DISALLOW_RECODE = 0, + /* + * Allow recode only for KF/ARF/GF frames + */ + ALLOW_RECODE_KFARFGF = 1, + /* + * Allow recode for all frame types based on bitrate constraints. + */ + ALLOW_RECODE = 2, +} UENUM1BYTE(RECODE_LOOP_TYPE); + +enum { + SUBPEL_TREE = 0, + SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches + SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively + SUBPEL_SEARCH_METHODS +} UENUM1BYTE(SUBPEL_SEARCH_METHOD); + +enum { + // Try the full image with different values. + LPF_PICK_FROM_FULL_IMAGE, + // Try the full image filter search with non-dual filter only. + LPF_PICK_FROM_FULL_IMAGE_NON_DUAL, + // Try a small portion of the image with different values. + LPF_PICK_FROM_SUBIMAGE, + // Estimate the level based on quantizer and frame type + LPF_PICK_FROM_Q, + // Pick 0 to disable LPF if LPF was enabled last frame + LPF_PICK_MINIMAL_LPF +} UENUM1BYTE(LPF_PICK_METHOD); +/*!\endcond */ + +/*!\enum CDEF_PICK_METHOD + * \brief This enumeration defines a variety of CDEF pick methods + */ +typedef enum { + CDEF_FULL_SEARCH, /**< Full search */ + CDEF_FAST_SEARCH_LVL1, /**< Search among a subset of all possible filters. */ + CDEF_FAST_SEARCH_LVL2, /**< Search reduced subset of filters than Level 1. */ + CDEF_FAST_SEARCH_LVL3, /**< Search reduced subset of secondary filters than + Level 2. */ + CDEF_FAST_SEARCH_LVL4, /**< Search reduced subset of filters than Level 3. */ + CDEF_FAST_SEARCH_LVL5, /**< Search reduced subset of filters than Level 4. */ + CDEF_PICK_FROM_Q, /**< Estimate filter strength based on quantizer. */ + CDEF_PICK_METHODS +} CDEF_PICK_METHOD; + +/*!\cond */ +enum { + // Terminate search early based on distortion so far compared to + // qp step, distortion in the neighborhood of the frame, etc. + FLAG_EARLY_TERMINATE = 1 << 0, + + // Skips comp inter modes if the best so far is an intra mode. + FLAG_SKIP_COMP_BESTINTRA = 1 << 1, + + // Skips oblique intra modes if the best so far is an inter mode. + FLAG_SKIP_INTRA_BESTINTER = 1 << 3, + + // Skips oblique intra modes at angles 27, 63, 117, 153 if the best + // intra so far is not one of the neighboring directions. + FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4, + + // Skips intra modes other than DC_PRED if the source variance is small + FLAG_SKIP_INTRA_LOWVAR = 1 << 5, +} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC); + +enum { + // No tx type pruning + TX_TYPE_PRUNE_0 = 0, + // adaptively prunes the least perspective tx types out of all 16 + // (tuned to provide negligible quality loss) + TX_TYPE_PRUNE_1 = 1, + // similar, but applies much more aggressive pruning to get better speed-up + TX_TYPE_PRUNE_2 = 2, + TX_TYPE_PRUNE_3 = 3, + // More aggressive pruning based on tx type score and allowed tx count + TX_TYPE_PRUNE_4 = 4, + TX_TYPE_PRUNE_5 = 5, +} UENUM1BYTE(TX_TYPE_PRUNE_MODE); + +enum { + // No reaction to rate control on a detected slide/scene change. + NO_DETECTION = 0, + + // Set to larger Q based only on the detected slide/scene change and + // current/past Q. + FAST_DETECTION_MAXQ = 1, +} UENUM1BYTE(OVERSHOOT_DETECTION_CBR); + +enum { + // Turns off multi-winner mode. So we will do txfm search on either all modes + // if winner mode is off, or we will only on txfm search on a single winner + // mode. + MULTI_WINNER_MODE_OFF = 0, + + // Limits the number of winner modes to at most 2 + MULTI_WINNER_MODE_FAST = 1, + + // Uses the default number of winner modes, which is 3 for intra mode, and 1 + // for inter mode. 
+ MULTI_WINNER_MODE_DEFAULT = 2, + + // Maximum number of winner modes allowed. + MULTI_WINNER_MODE_LEVELS, +} UENUM1BYTE(MULTI_WINNER_MODE_TYPE); + +enum { + PRUNE_NEARMV_OFF = 0, // Turn off nearmv pruning + PRUNE_NEARMV_LEVEL1 = 1, // Prune nearmv for qindex (0-85) + PRUNE_NEARMV_LEVEL2 = 2, // Prune nearmv for qindex (0-170) + PRUNE_NEARMV_LEVEL3 = 3, // Prune nearmv more aggressively for qindex (0-170) + PRUNE_NEARMV_MAX = PRUNE_NEARMV_LEVEL3, +} UENUM1BYTE(PRUNE_NEARMV_LEVEL); + +enum { + // Default transform search used in evaluation of best inter candidates + // (MODE_EVAL stage) and motion mode winner processing (WINNER_MODE_EVAL + // stage). + TX_SEARCH_DEFAULT = 0, + // Transform search in motion mode rd during MODE_EVAL stage. + TX_SEARCH_MOTION_MODE, + // Transform search in compound type mode rd during MODE_EVAL stage. + TX_SEARCH_COMP_TYPE_MODE, + // All transform search cases + TX_SEARCH_CASES +} UENUM1BYTE(TX_SEARCH_CASE); + +typedef struct { + TX_TYPE_PRUNE_MODE prune_2d_txfm_mode; + int fast_intra_tx_type_search; + + // INT_MAX: Disable fast search. + // 1 - 1024: Probability threshold used for conditionally forcing tx type, + // during mode search. + // 0: Force tx type to be DCT_DCT unconditionally, during + // mode search. + int fast_inter_tx_type_prob_thresh; + + // Prune less likely chosen transforms for each intra mode. The speed + // feature ranges from 0 to 2, for different speed / compression trade offs. + int use_reduced_intra_txset; + + // Use a skip flag prediction model to detect blocks with skip = 1 early + // and avoid doing full TX type search for such blocks. + int use_skip_flag_prediction; + + // Threshold used by the ML based method to predict TX block split decisions. + int ml_tx_split_thresh; + + // skip remaining transform type search when we found the rdcost of skip is + // better than applying transform + int skip_tx_search; + + // Prune tx type search using previous frame stats. + int prune_tx_type_using_stats; + // Prune tx type search using estimated RDcost + int prune_tx_type_est_rd; + + // Flag used to control the winner mode processing for tx type pruning for + // inter blocks. It enables further tx type mode pruning based on ML model for + // mode evaluation and disables tx type mode pruning for winner mode + // processing. + int winner_mode_tx_type_pruning; +} TX_TYPE_SEARCH; + +enum { + // Search partitions using RD criterion + SEARCH_PARTITION, + + // Always use a fixed size partition + FIXED_PARTITION, + + // Partition using source variance + VAR_BASED_PARTITION, + +#if CONFIG_RT_ML_PARTITIONING + // Partition using ML model + ML_BASED_PARTITION +#endif +} UENUM1BYTE(PARTITION_SEARCH_TYPE); + +enum { + NOT_IN_USE, + DIRECT_PRED, + RELAXED_PRED, + ADAPT_PRED +} UENUM1BYTE(MAX_PART_PRED_MODE); + +enum { + LAST_MV_DATA, + CURRENT_Q, + QTR_ONLY, +} UENUM1BYTE(MV_PREC_LOGIC); + +enum { + SUPERRES_AUTO_ALL, // Tries all possible superres ratios + SUPERRES_AUTO_DUAL, // Tries no superres and q-based superres ratios + SUPERRES_AUTO_SOLO, // Only apply the q-based superres ratio +} UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE); +/*!\endcond */ + +/*!\enum INTERNAL_COST_UPDATE_TYPE + * \brief This enum decides internally how often to update the entropy costs + * + * INTERNAL_COST_UPD_TYPE is similar to \ref COST_UPDATE_TYPE but has slightly + * more flexibility in update frequency. 
This enum is separate from \ref + * COST_UPDATE_TYPE because although \ref COST_UPDATE_TYPE is not exposed, its + * values are public so it cannot be modified without breaking public API. + * Due to the use of AOMMIN() in populate_unified_cost_update_freq() to + * compute the unified cost update frequencies (out of COST_UPDATE_TYPE and + * INTERNAL_COST_UPDATE_TYPE), the values of this enum type must be listed in + * the order of increasing frequencies. + * + * \warning In case of any updates/modifications to the enum COST_UPDATE_TYPE, + * update the enum INTERNAL_COST_UPDATE_TYPE as well. + */ +typedef enum { + INTERNAL_COST_UPD_OFF, /*!< Turn off cost updates. */ + INTERNAL_COST_UPD_TILE, /*!< Update every tile. */ + INTERNAL_COST_UPD_SBROW_SET, /*!< Update every row_set of height 256 pixs. */ + INTERNAL_COST_UPD_SBROW, /*!< Update every sb rows inside a tile. */ + INTERNAL_COST_UPD_SB, /*!< Update every sb. */ +} INTERNAL_COST_UPDATE_TYPE; + +/*!\enum SIMPLE_MOTION_SEARCH_PRUNE_LEVEL + * \brief This enumeration defines a variety of simple motion search based + * partition prune levels + */ +typedef enum { + NO_PRUNING = -1, + SIMPLE_AGG_LVL0, /*!< Simple prune aggressiveness level 0. */ + SIMPLE_AGG_LVL1, /*!< Simple prune aggressiveness level 1. */ + SIMPLE_AGG_LVL2, /*!< Simple prune aggressiveness level 2. */ + SIMPLE_AGG_LVL3, /*!< Simple prune aggressiveness level 3. */ + QIDX_BASED_AGG_LVL1, /*!< Qindex based prune aggressiveness level, aggressive + level maps to simple agg level 1 or 2 based on qindex. + */ + TOTAL_SIMPLE_AGG_LVLS = QIDX_BASED_AGG_LVL1, /*!< Total number of simple prune + aggressiveness levels. */ + TOTAL_QINDEX_BASED_AGG_LVLS = + QIDX_BASED_AGG_LVL1 - + SIMPLE_AGG_LVL3, /*!< Total number of qindex based simple prune + aggressiveness levels. */ + TOTAL_AGG_LVLS = TOTAL_SIMPLE_AGG_LVLS + + TOTAL_QINDEX_BASED_AGG_LVLS, /*!< Total number of levels. */ +} SIMPLE_MOTION_SEARCH_PRUNE_LEVEL; + +/*!\enum PRUNE_MESH_SEARCH_LEVEL + * \brief This enumeration defines a variety of mesh search prune levels. + */ +typedef enum { + PRUNE_MESH_SEARCH_DISABLED = 0, /*!< Prune mesh search level 0. */ + PRUNE_MESH_SEARCH_LVL_1 = 1, /*!< Prune mesh search level 1. */ + PRUNE_MESH_SEARCH_LVL_2 = 2, /*!< Prune mesh search level 2. */ +} PRUNE_MESH_SEARCH_LEVEL; + +/*!\enum INTER_SEARCH_EARLY_TERM_IDX + * \brief This enumeration defines inter search early termination index in + * non-rd path based on sse value. + */ +typedef enum { + EARLY_TERM_DISABLED = + 0, /*!< Early terminate inter mode search based on sse disabled. */ + EARLY_TERM_IDX_1 = + 1, /*!< Early terminate inter mode search based on sse, index 1. */ + EARLY_TERM_IDX_2 = + 2, /*!< Early terminate inter mode search based on sse, index 2. */ + EARLY_TERM_IDX_3 = + 3, /*!< Early terminate inter mode search based on sse, index 3. */ + EARLY_TERM_IDX_4 = + 4, /*!< Early terminate inter mode search based on sse, index 4. */ + EARLY_TERM_INDICES, /*!< Total number of early terminate indices */ +} INTER_SEARCH_EARLY_TERM_IDX; + +/*! + * \brief Sequence/frame level speed vs quality features + */ +typedef struct HIGH_LEVEL_SPEED_FEATURES { + /*! Frame level coding parameter update. */ + int frame_parameter_update; + + /*! + * Cases and frame types for which the recode loop is enabled. + */ + RECODE_LOOP_TYPE recode_loop; + + /*! + * Controls the tolerance vs target rate used in deciding whether to + * recode a frame. It has no meaning if recode is disabled. + */ + int recode_tolerance; + + /*! 
+   * Determine how motion vector precision is chosen. The possibilities are:
+   * LAST_MV_DATA: use the mv data from the last coded frame
+   * CURRENT_Q: use the current q as a threshold
+   * QTR_ONLY: use quarter pel precision only.
+   */
+  MV_PREC_LOGIC high_precision_mv_usage;
+
+  /*!
+   * Always set to 0. If on it enables 0 cost background transmission
+   * (except for the initial transmission of the segmentation). The feature is
+   * disabled because the addition of very large block sizes makes the
+   * backgrounds very cheap to encode, and the segmentation we have
+   * adds overhead.
+   */
+  int static_segmentation;
+
+  /*!
+   * Superres-auto mode search type:
+   */
+  SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type;
+
+  /*!
+   * Enable/disable extra screen content test by encoding key frame twice.
+   */
+  int disable_extra_sc_testing;
+
+  /*!
+   * Enable/disable second_alt_ref temporal filtering.
+   */
+  int second_alt_ref_filtering;
+
+  /*!
+   * The number of frames to be used during temporal filtering of an ARF frame
+   * is adjusted based on noise level of the current frame. The sf has three
+   * levels to decide number of frames to be considered for filtering:
+   * 0 : Use default number of frames
+   * 1 and 2 : Reduce the number of frames based on noise level with varied
+   * aggressiveness
+   */
+  int adjust_num_frames_for_arf_filtering;
+
+  /*!
+   * Decide the bit estimation approach used in qindex decision.
+   * 0: estimate bits based on a constant value;
+   * 1: estimate bits more accurately based on the frame complexity.
+   */
+  int accurate_bit_estimate;
+
+  /*!
+   * Decide the approach for weight calculation during temporal filtering.
+   * 0: Calculate weight using exp()
+   * 1: Calculate weight using a lookup table that approximates exp().
+   */
+  int weight_calc_level_in_tf;
+
+  /*!
+   * Decide whether to perform motion estimation at split block (i.e. 16x16)
+   * level or not.
+   * 0: Always allow motion estimation.
+   * 1: Conditionally allow motion estimation based on 4x4 sub-block variance.
+   */
+  int allow_sub_blk_me_in_tf;
+} HIGH_LEVEL_SPEED_FEATURES;
+
+/*!
+ * Speed features for the first pass.
+ */
+typedef struct FIRST_PASS_SPEED_FEATURES {
+  /*!
+   * \brief Reduces the mv search window.
+   * By default, the initial search window is around
+   * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023).
+   * Each step reduction decreases the window size by about a factor of 2.
+   */
+  int reduce_mv_step_param;
+
+  /*!
+   * \brief Skips the motion search when the zero mv has small sse.
+   */
+  int skip_motion_search_threshold;
+
+  /*!
+   * \brief Skips reconstruction by using source buffers for prediction
+   */
+  int disable_recon;
+
+  /*!
+   * \brief Skips the motion search centered on 0,0 mv.
+   */
+  int skip_zeromv_motion_search;
+} FIRST_PASS_SPEED_FEATURES;
+
+/*!\cond */
+typedef struct TPL_SPEED_FEATURES {
+  // GOP length adaptive decision.
+  // If set to 0, tpl model decides whether a shorter gf interval is better.
+  // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and
+  // (base+2) layer decide whether a shorter gf interval is better.
+  // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost
+  // decide whether a shorter gf interval is better.
+  // If set to 3, gop length adaptive decision is disabled.
+  int gop_length_decision_method;
+  // Prune the intra modes search by tpl.
+  // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
+  // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
+ int prune_intra_modes; + // This parameter controls which step in the n-step process we start at. + int reduce_first_step_size; + // Skip motion estimation based on the precision of center MVs and the + // difference between center MVs. + // If set to 0, motion estimation is skipped for duplicate center MVs + // (default). If set to 1, motion estimation is skipped for duplicate + // full-pixel center MVs. If set to 2, motion estimation is skipped if the + // difference between center MVs is less than the threshold. + int skip_alike_starting_mv; + + // When to stop subpel search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // Which search method to use. + SEARCH_METHODS search_method; + + // Prune starting mvs in TPL based on sad scores. + int prune_starting_mv; + + // Prune reference frames in TPL. + int prune_ref_frames_in_tpl; + + // Support compound predictions. + int allow_compound_pred; + + // Calculate rate and distortion based on Y plane only. + int use_y_only_rate_distortion; + + // Use SAD instead of SATD during intra/inter mode search. + // If set to 0, use SATD always. + // If set to 1, use SAD during intra/inter mode search for frames in the + // higher temporal layers of the hierarchical prediction structure. + // If set to 2, use SAD during intra/inter mode search for all frames. + // This sf is disabled for the first GF group of the key-frame interval, + // i.e., SATD is used during intra/inter mode search of the first GF group. + int use_sad_for_mode_decision; + + // Skip tpl processing for frames of type LF_UPDATE. + // This sf is disabled for the first GF group of the key-frame interval. + int reduce_num_frames; +} TPL_SPEED_FEATURES; + +typedef struct GLOBAL_MOTION_SPEED_FEATURES { + GM_SEARCH_TYPE gm_search_type; + + // During global motion estimation, prune remaining reference frames in a + // given direction(past/future), if the evaluated ref_frame in that direction + // yields gm_type as INVALID/TRANSLATION/IDENTITY + int prune_ref_frame_for_gm_search; + + // When the current GM type is set to ZEROMV, prune ZEROMV if its performance + // is worse than NEWMV under SSE metric. + // 0 : no pruning + // 1 : conservative pruning + // 2 : aggressive pruning + int prune_zero_mv_with_sse; + + // Disable global motion estimation based on stats of previous frames in the + // GF group + int disable_gm_search_based_on_stats; + + // Number of refinement steps to apply after initial model generation + int num_refinement_steps; +} GLOBAL_MOTION_SPEED_FEATURES; + +typedef struct PARTITION_SPEED_FEATURES { + PARTITION_SEARCH_TYPE partition_search_type; + + // Used if partition_search_type = FIXED_PARTITION + BLOCK_SIZE fixed_partition_size; + + // Prune extended partition types search based on the current best partition + // and the combined rdcost of the subblocks estimated from previous + // partitions. Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 + // increasing aggressiveness of pruning in order. + int prune_ext_partition_types_search_level; + + // Prune part4 based on block size + int prune_part4_search; + + // Use a ML model to prune rectangular, ab and 4-way horz + // and vert partitions + int ml_prune_partition; + + // Use a ML model to adaptively terminate partition search after trying + // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and + // 1 - 2 increasing aggressiveness in order. 
+ int ml_early_term_after_part_split_level; + + // Skip rectangular partition test when partition type none gives better + // rd than partition type split. Can take values 0 - 2, 0 referring to no + // skipping, and 1 - 2 increasing aggressiveness of skipping in order. + int less_rectangular_check_level; + + // Use square partition only beyond this block size. + BLOCK_SIZE use_square_partition_only_threshold; + + // Sets max square partition levels for this superblock based on + // motion vector and prediction error distribution produced from 16x16 + // simple motion search + MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion; + + // Min and max square partition size we enable (block_size) as per auto + // min max, but also used by adjust partitioning, and pick_partitioning. + BLOCK_SIZE default_min_partition_size; + BLOCK_SIZE default_max_partition_size; + + // Sets level of adjustment of variance-based partitioning during + // rd_use_partition 0 - no partition adjustment, 1 - try to merge partitions + // for small blocks and high QP, 2 - try to merge partitions, 3 - try to merge + // and split leaf partitions and 0 - 3 decreasing aggressiveness in order. + int adjust_var_based_rd_partitioning; + + // Partition search early breakout thresholds. + int64_t partition_search_breakout_dist_thr; + int partition_search_breakout_rate_thr; + + // Thresholds for ML based partition search breakout. + int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES]; + + // Aggressiveness levels for pruning split and rectangular partitions based on + // simple_motion_search. SIMPLE_AGG_LVL0 to SIMPLE_AGG_LVL3 correspond to + // simple motion search based pruning. QIDX_BASED_AGG_LVL1 corresponds to + // qindex based and simple motion search based pruning. + int simple_motion_search_prune_agg; + + // Perform simple_motion_search on each possible subblock and use it to prune + // PARTITION_HORZ and PARTITION_VERT. + int simple_motion_search_prune_rect; + + // Perform simple motion search before none_partition to decide if we + // want to remove all partitions other than PARTITION_SPLIT. If set to 0, this + // model is disabled. If set to 1, the model attempts to perform + // PARTITION_SPLIT only. If set to 2, the model also attempts to prune + // PARTITION_SPLIT. + int simple_motion_search_split; + + // Use features from simple_motion_search to terminate prediction block + // partition after PARTITION_NONE + int simple_motion_search_early_term_none; + + // Controls whether to reduce the number of motion search steps. If this is 0, + // then simple_motion_search has the same number of steps as + // single_motion_search (assuming no other speed features). Otherwise, reduce + // the number of steps by the value contained in this variable. + int simple_motion_search_reduce_search_steps; + + // This variable controls the maximum block size where intra blocks can be + // used in inter frames. + // TODO(aconverse): Fold this into one of the other many mode skips + BLOCK_SIZE max_intra_bsize; + + // Use CNN with luma pixels on source frame on each of the 64x64 subblock to + // perform partition pruning in intra frames. + // 0: No Pruning + // 1: Prune split and rectangular partitions only + // 2: Prune none, split and rectangular partitions + int intra_cnn_based_part_prune_level; + + // Disable extended partition search if the current bsize is greater than the + // threshold. Must be a square block size BLOCK_8X8 or higher. 
+  BLOCK_SIZE ext_partition_eval_thresh;
+
+  // Use best partition decision so far to tune 'ext_partition_eval_thresh'
+  int ext_part_eval_based_on_cur_best;
+
+  // Disable rectangular partitions for larger block sizes.
+  int rect_partition_eval_thresh;
+
+  // Prune extended partition search based on whether the split/rect partitions
+  // provided an improvement in the previous search.
+  // 0 : no pruning
+  // 1 : prune 1:4 partition search using winner info from split partitions
+  // 2 : prune 1:4 and AB partition search using split and HORZ/VERT info
+  int prune_ext_part_using_split_info;
+
+  // Prune rectangular, AB and 4-way partitions based on q index and block size
+  // 0 : no pruning
+  // 1 : prune sub_8x8 at very low quantizers
+  // 2 : prune all block sizes based on qindex
+  int prune_rectangular_split_based_on_qidx;
+
+  // Prune rectangular partitions based on 4x4 sub-block variance
+  // false : no pruning
+  // true : prune rectangular partitions based on 4x4 sub-block variance
+  // deviation
+  //
+  // For allintra encode, this speed feature reduces instruction count by 6.4%
+  // for speed=6 with coding performance change less than 0.24%. For AVIF image
+  // encode, this speed feature reduces encode time by 8.14% for speed 6 on a
+  // typical image dataset with coding performance change less than 0.16%. This
+  // speed feature is not applicable to speed >= 7.
+  bool prune_rect_part_using_4x4_var_deviation;
+
+  // Prune rectangular partitions based on prediction mode chosen by NONE
+  // partition.
+  // false : no pruning
+  // true : prunes rectangular partition as described below
+  // If prediction mode chosen by NONE partition is
+  // DC_PRED or SMOOTH_PRED: Prunes both horizontal and vertical partitions if
+  // at least one of the left and top neighbor blocks is larger than the
+  // current block.
+  // Directional Mode: Prunes either of the horizontal and vertical partition
+  // based on center angle of the prediction mode chosen by NONE partition. For
+  // example, vertical partition is pruned if center angle of the prediction
+  // mode chosen by NONE partition is close to 180 degrees (i.e. horizontal
+  // direction) and vice versa.
+  // For allintra encode, this speed feature reduces instruction count by 5.1%
+  // for speed=6 with coding performance change less than 0.22%. For AVIF image
+  // encode, this speed feature reduces encode time by 4.44% for speed 6 on a
+  // typical image dataset with coding performance change less than 0.15%.
+  // For speed >= 7, variance-based logic is used to determine the partition
+  // structure instead of recursive partition search. Therefore, this speed
+  // feature is not applicable in such cases.
+  bool prune_rect_part_using_none_pred_mode;
+
+  // Terminate partition search for child partition when NONE and SPLIT
+  // partition rd_costs are INT64_MAX.
+  int early_term_after_none_split;
+
+  // Level used to adjust threshold for av1_ml_predict_breakout(). At lower
+  // levels, a more conservative threshold is used, and a value of 0 indicates
+  // av1_ml_predict_breakout() is disabled. A value of 3 corresponds to the
+  // default case with no adjustment to lbd thresholds.
+  int ml_predict_breakout_level;
+
+  // Prune sub_8x8 (BLOCK_4X4, BLOCK_4X8 and BLOCK_8X4) partitions.
+  // 0 : no pruning
+  // 1 : pruning based on neighbour block information
+  // 2 : prune always
+  int prune_sub_8x8_partition_level;
+
+  // Prune rectangular split based on simple motion search split/no_split score.
+ // 0: disable pruning, 1: enable pruning + int simple_motion_search_rect_split; + + // The current encoder adopts a DFS search for block partitions. + // Therefore the mode selection and associated rdcost is ready for smaller + // blocks before the mode selection for some partition types. + // AB partition could use previous rd information and skip mode search. + // An example is: + // + // current block + // +---+---+ + // | | + // + + + // | | + // +-------+ + // + // SPLIT partition has been searched first before trying HORZ_A + // +---+---+ + // | R | R | + // +---+---+ + // | R | R | + // +---+---+ + // + // HORZ_A + // +---+---+ + // | | | + // +---+---+ + // | | + // +-------+ + // + // With this speed feature, the top two sub blocks can directly use rdcost + // searched in split partition, and the mode info is also copied from + // saved info. Similarly, the bottom rectangular block can also use + // the available information from previous rectangular search. + int reuse_prev_rd_results_for_part_ab; + + // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT + // when encoding PARTITION_AB. + int reuse_best_prediction_for_part_ab; + + // The current partition search records the best rdcost so far and uses it + // in mode search and transform search to early skip when some criteria is + // met. For example, when the current rdcost is larger than the best rdcost, + // or the model rdcost is larger than the best rdcost times some thresholds. + // By default, this feature is turned on to speed up the encoder partition + // search. + // If disabling it, at speed 0, 30 frames, we could get + // about -0.25% quality gain (psnr, ssim, vmaf), with about 13% slowdown. + int use_best_rd_for_pruning; + + // Skip evaluation of non-square partitions based on the corresponding NONE + // partition. + // 0: no pruning + // 1: prune extended partitions if NONE is skippable + // 2: on top of 1, prune rectangular partitions if NONE is inter, not a newmv + // mode and skippable + int skip_non_sq_part_based_on_none; + + // Disables 8x8 and below partitions for low quantizers. + int disable_8x8_part_based_on_qidx; +} PARTITION_SPEED_FEATURES; + +typedef struct MV_SPEED_FEATURES { + // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). + SEARCH_METHODS search_method; + + // Enable the use of faster, less accurate mv search method + // 0: disable, 1: if bsize >= BLOCK_32X32, 2: based on bsize, SAD and qp + // TODO(chiyotsai@google.com): Take the clip's resolution and mv activity into + // account. + int use_bsize_dependent_search_method; + + // If this is set to 1, we limit the motion search range to 2 times the + // largest motion vector found in the last frame. + int auto_mv_step_size; + + // Subpel_search_method can only be subpel_tree which does a subpixel + // logarithmic search that keeps stepping at 1/2 pixel units until + // you stop getting a gain, and then goes on to 1/4 and repeats + // the same process. Along the way it skips many diagonals. + SUBPEL_SEARCH_METHOD subpel_search_method; + + // Maximum number of steps in logarithmic subpel search before giving up. + int subpel_iters_per_step; + + // When to stop subpel search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // When to stop subpel search in simple motion search. + SUBPEL_FORCE_STOP simple_motion_subpel_force_stop; + + // If true, sub-pixel search uses the exact convolve function used for final + // encoding and decoding; otherwise, it uses bilinear interpolation. 
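+  // (Bilinear interpolation is cheaper to evaluate per candidate position,
+  // but it can rank sub-pel candidates differently from the convolve filter
+  // used for the final reconstruction, which is why it is faster but slightly
+  // less accurate.)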
+ SUBPEL_SEARCH_TYPE use_accurate_subpel_search; + + // Threshold for allowing exhaustive motion search. + int exhaustive_searches_thresh; + + // Pattern to be used for any exhaustive mesh searches (except intraBC ME). + MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; + + // Pattern to be used for exhaustive mesh searches of intraBC ME. + MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_STEP]; + + // Reduce single motion search range based on MV result of prior ref_mv_idx. + int reduce_search_range; + + // Prune mesh search. + PRUNE_MESH_SEARCH_LEVEL prune_mesh_search; + + // Use the rd cost around the best FULLPEL_MV to speed up subpel search + int use_fullpel_costlist; + + // Set the full pixel search level of obmc + // 0: obmc_full_pixel_diamond + // 1: obmc_refining_search_sad (faster) + int obmc_full_pixel_search_level; + + // Accurate full pixel motion search based on TPL stats. + int full_pixel_search_level; + + // Allow intrabc motion search + int use_intrabc; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 16 rows. When this sf is + // active, if there is a large discrepancy in the SAD values for the final + // motion vector between skipping vs not skipping, motion search is redone + // with skip row features off. + // 0: Disabled (do not downsample rows) + // 1: Skip SAD calculation of odd rows if the SAD deviation of the even and + // odd rows for the starting MV is small. Redo motion search with sf off + // when SAD deviation is high for the final motion vector. + // 2: Skip SAD calculation of odd rows. SAD deviation is not tested for the + // start MV and tested only for the final MV. + int use_downsampled_sad; + + // Enable/disable extensive joint motion search. + int disable_extensive_joint_motion_search; + + // Enable second best mv check in joint mv search. + // 0: allow second MV (use rd cost as the metric) + // 1: use var as the metric + // 2: disable second MV + int disable_second_mv; + + // Skips full pixel search based on start mv of prior ref_mv_idx. + // 0: Disabled + // 1: Skips the full pixel search upto 4 neighbor full-pel MV positions. + // 2: Skips the full pixel search upto 8 neighbor full-pel MV positions. + int skip_fullpel_search_using_startmv; + + // Method to use for refining WARPED_CAUSAL motion vectors + // TODO(rachelbarker): Can this be unified with OBMC in some way? + WARP_SEARCH_METHOD warp_search_method; + + // Maximum number of iterations in WARPED_CAUSAL refinement search + int warp_search_iters; +} MV_SPEED_FEATURES; + +typedef struct INTER_MODE_SPEED_FEATURES { + // 2-pass inter mode model estimation where the preliminary pass skips + // transform search and uses a model to estimate rd, while the final pass + // computes the full transform search. Two types of models are supported: + // 0: not used + // 1: used with online dynamic rd model + // 2: used with static rd model + int inter_mode_rd_model_estimation; + + // Bypass transform search based on skip rd at following stages + // i. Compound type mode search + // ii. Motion mode search (mode evaluation and winner motion mode stage) + // iii. Transform search for best inter candidates + int txfm_rd_gate_level[TX_SEARCH_CASES]; + + // Limit the inter mode tested in the RD loop + int reduce_inter_modes; + + // This variable is used to cap the maximum number of times we skip testing a + // mode to be evaluated. A high value means we will be faster. 
+ int adaptive_rd_thresh; + + // Aggressively prune inter modes when best mode is skippable. + int prune_inter_modes_if_skippable; + + // Drop less likely to be picked reference frames in the RD search. + // Has seven levels for now: 0, 1, 2, 3, 4, 5 and 6 where higher levels prune + // more aggressively than lower ones. (0 means no pruning). + int selective_ref_frame; + + // Prune reference frames for rectangular partitions. + // 0 implies no pruning + // 1 implies prune for extended partition + // 2 implies prune horiz, vert and extended partition + int prune_ref_frame_for_rect_partitions; + + // Prune inter modes w.r.t past reference frames + // 0 no pruning + // 1 prune inter modes w.r.t ALTREF2 and ALTREF reference frames + // 2 prune inter modes w.r.t BWDREF, ALTREF2 and ALTREF reference frames + int alt_ref_search_fp; + + // Prune reference frames for single prediction modes based on temporal + // distance and pred MV SAD. Feasible values are 0, 1, 2. The feature is + // disabled for 0. An increasing value indicates more aggressive pruning + // threshold. + int prune_single_ref; + + // Prune compound reference frames + // 0 no pruning + // 1 prune compound references which do not satisfy the two conditions: + // a) The references are at a nearest distance from the current frame in + // both past and future direction. + // b) The references have minimum pred_mv_sad in both past and future + // direction. + // 2 prune compound references except the one with nearest distance from the + // current frame in both past and future direction. + int prune_comp_ref_frames; + + // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc. + // This speed feature equaling 0 means no skipping. + // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode + // if we have already encountered ref_mv in the drl such that: + // 1. The other drl has the same mv during the SIMPLE_TRANSLATION search + // process as the current mv. + // 2. The rate needed to encode the current mv is larger than that for the + // other ref_mv. + // The speed feature equaling 1 means using subpel mv in the comparison. + // The speed feature equaling 2 means using fullpel mv in the comparison. + // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on + // known full_mv bestsme and drl cost. + int skip_newmv_in_drl; + + // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV, + // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found + // TODO(any): Instead of skipping repeated ref mv, use the recalculated + // rd-cost based on mode rate and skip the mode evaluation + int skip_repeated_ref_mv; + + // Flag used to control the ref_best_rd based gating for chroma + int perform_best_rd_based_gating_for_chroma; + + // Reuse the inter_intra_mode search result from NEARESTMV mode to other + // single ref modes + int reuse_inter_intra_mode; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average modeled rd + int prune_comp_type_by_model_rd; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average rd/ref_best_rd + int prune_comp_type_by_comp_avg; + + // Skip some ref frames in compound motion search by single motion search + // result. Has three levels for now: 0 referring to no skipping, and 1 - 3 + // increasing aggressiveness of skipping in order. + // Note: The search order might affect the result. 
It assumes that the single + // reference modes are searched before compound modes. It is better to search + // same single inter mode as a group. + int prune_comp_search_by_single_result; + + // Instead of performing a full MV search, do a simple translation first + // and only perform a full MV search on the motion vectors that performed + // well. + int prune_mode_search_simple_translation; + + // Only search compound modes with at least one "good" reference frame. + // A reference frame is good if, after looking at its performance among + // the single reference modes, it is one of the two best performers. + int prune_compound_using_single_ref; + + // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV, + // NEW_NEARMV) using ref frames of above and left neighbor + // blocks. + // 0 : no pruning + // 1 : prune ext compound modes using neighbor blocks (less aggressiveness) + // 2 : prune ext compound modes using neighbor blocks (high aggressiveness) + // 3 : prune ext compound modes unconditionally (highest aggressiveness) + int prune_ext_comp_using_neighbors; + + // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes + int skip_ext_comp_nearmv_mode; + + // Skip extended compound mode when ref frame corresponding to NEWMV does not + // have NEWMV as single mode winner. + // 0 : no pruning + // 1 : prune extended compound mode (less aggressiveness) + // 2 : prune extended compound mode (high aggressiveness) + int prune_comp_using_best_single_mode_ref; + + // Skip NEARESTMV and NEARMV using weight computed in ref mv list population + // + // Pruning is enabled only when both the top and left neighbor blocks are + // available and when the current block already has a valid inter prediction. + int prune_nearest_near_mv_using_refmv_weight; + + // Based on previous ref_mv_idx search result, prune the following search. + int prune_ref_mv_idx_search; + + // Disable one sided compound modes. + int disable_onesided_comp; + + // Prune obmc search using previous frame stats. + // INT_MAX : disable obmc search + int prune_obmc_prob_thresh; + + // Prune warped motion search using previous frame stats. + int prune_warped_prob_thresh; + + // Variance threshold to enable/disable Interintra wedge search + unsigned int disable_interintra_wedge_var_thresh; + + // Variance threshold to enable/disable Interinter wedge search + unsigned int disable_interinter_wedge_var_thresh; + + // De-couple wedge and mode search during interintra RDO. + int fast_interintra_wedge_search; + + // Whether fast wedge sign estimate is used + int fast_wedge_sign_estimate; + + // Enable/disable ME for interinter wedge search. + int disable_interinter_wedge_newmv_search; + + // Decide when and how to use joint_comp. + DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag; + + // Clip the frequency of updating the mv cost. + INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level; + + // Clip the frequency of updating the coeff cost. + INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level; + + // Clip the frequency of updating the mode cost. + INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level; + + // Prune inter modes based on tpl stats + // 0 : no pruning + // 1 - 3 indicate increasing aggressiveness in order. + int prune_inter_modes_based_on_tpl; + + // Skip NEARMV and NEAR_NEARMV modes using ref frames of above and left + // neighbor blocks and qindex. 
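+  // (See the PRUNE_NEARMV_LEVEL enum above: level 1 applies the pruning only
+  // for qindex 0-85, levels 2 and 3 extend it to qindex 0-170, and level 3
+  // prunes more aggressively.)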
+ PRUNE_NEARMV_LEVEL prune_nearmv_using_neighbors; + + // Model based breakout after interpolation filter search + // 0: no breakout + // 1: use model based rd breakout + int model_based_post_interp_filter_breakout; + + // Reuse compound type rd decision when exact match is found + // 0: No reuse + // 1: Reuse the compound type decision + int reuse_compound_type_decision; + + // Enable/disable masked compound. + int disable_masked_comp; + + // Enable/disable MV refinement for compound modes corresponds to compound + // types COMPOUND_AVERAGE, COMPOUND_DISTWTD (currently, this compound type + // is disabled for speeds >= 2 using the sf 'use_dist_wtd_comp_flag') and + // COMPOUND_DIFFWTD based on the availability. Levels 0 to 3 indicate + // increasing order of aggressiveness to disable MV refinement. + // 0: MV Refinement is enabled and for NEW_NEWMV mode used two iterations of + // refinement in av1_joint_motion_search(). + // 1: MV Refinement is disabled for COMPOUND_DIFFWTD and enabled for + // COMPOUND_AVERAGE & COMPOUND_DISTWTD. + // 2: MV Refinement is enabled for COMPOUND_AVERAGE & COMPOUND_DISTWTD for + // NEW_NEWMV mode with one iteration of refinement in + // av1_joint_motion_search() and MV Refinement is disabled for other compound + // type modes. + // 3: MV Refinement is disabled. + int enable_fast_compound_mode_search; + + // Reuse masked compound type search results + int reuse_mask_search_results; + + // Enable/disable fast search for wedge masks + int enable_fast_wedge_mask_search; + + // Early breakout from transform search of inter modes + int inter_mode_txfm_breakout; + + // Limit number of inter modes for txfm search if a newmv mode gets + // evaluated among the top modes. + // 0: no pruning + // 1 to 3 indicate increasing order of aggressiveness + int limit_inter_mode_cands; + + // Cap the no. of txfm searches for a given prediction mode. + // 0: no cap, 1: cap beyond first 4 searches, 2: cap beyond first 3 searches. + int limit_txfm_eval_per_mode; + + // Prune warped motion search based on block size. + int extra_prune_warped; + + // Do not search compound modes for ARF. + // The intuition is that ARF is predicted by frames far away from it, + // whose temporal correlations with the ARF are likely low. + // It is therefore likely that compound modes do not work as well for ARF + // as other inter frames. + // Speed/quality impact: + // Speed 1: 12% faster, 0.1% psnr loss. + // Speed 2: 2% faster, 0.05% psnr loss. + // No change for speed 3 and up, because |disable_onesided_comp| is true. + int skip_arf_compound; +} INTER_MODE_SPEED_FEATURES; + +typedef struct INTERP_FILTER_SPEED_FEATURES { + // Do limited interpolation filter search for dual filters, since best choice + // usually includes EIGHTTAP_REGULAR. + int use_fast_interpolation_filter_search; + + // Disable dual filter + int disable_dual_filter; + + // Save results of av1_interpolation_filter_search for a block + // Check mv and ref_frames before search, if they are very close with previous + // saved results, filter search can be skipped. + int use_interp_filter; + + // skip sharp_filter evaluation based on regular and smooth filter rd for + // dual_filter=0 case + int skip_sharp_interp_filter_search; + + // skip interpolation filter search for a block in chessboard pattern + int cb_pred_filter_search; + + // adaptive interp_filter search to allow skip of certain filter types. 
+ int adaptive_interp_filter_search; + + // Forces interpolation filter to EIGHTTAP_REGULAR and skips interpolation + // filter search. + int skip_interp_filter_search; +} INTERP_FILTER_SPEED_FEATURES; + +typedef struct INTRA_MODE_SPEED_FEATURES { + // These bit masks allow you to enable or disable intra modes for each + // transform size separately. + int intra_y_mode_mask[TX_SIZES]; + int intra_uv_mode_mask[TX_SIZES]; + + // flag to allow skipping intra mode for inter frame prediction + int skip_intra_in_interframe; + + // Prune intra mode candidates based on source block histogram of gradient. + // Applies to luma plane only. + // Feasible values are 0..4. The feature is disabled for 0. An increasing + // value indicates more aggressive pruning threshold. + int intra_pruning_with_hog; + + // Prune intra mode candidates based on source block histogram of gradient. + // Applies to chroma plane only. + // Feasible values are 0..4. The feature is disabled for 0. An increasing + // value indicates more aggressive pruning threshold. + int chroma_intra_pruning_with_hog; + + // Enable/disable smooth intra modes. + int disable_smooth_intra; + + // Prune UV_SMOOTH_PRED mode for chroma based on chroma source variance. + // false : No pruning + // true : Prune UV_SMOOTH_PRED mode based on chroma source variance + // + // For allintra encode, this speed feature reduces instruction count + // by 1.90%, 2.21% and 1.97% for speed 6, 7 and 8 with coding performance + // change less than 0.04%. For AVIF image encode, this speed feature reduces + // encode time by 1.56%, 2.14% and 0.90% for speed 6, 7 and 8 on a typical + // image dataset with coding performance change less than 0.05%. + bool prune_smooth_intra_mode_for_chroma; + + // Prune filter intra modes in intra frames. + // 0 : No pruning + // 1 : Evaluate applicable filter intra modes based on best intra mode so far + // 2 : Do not evaluate filter intra modes + int prune_filter_intra_level; + + // prune palette search + // 0: No pruning + // 1: Perform coarse search to prune the palette colors. For winner colors, + // neighbors are also evaluated using a finer search. + // 2: Perform 2 way palette search from max colors to min colors (and min + // colors to remaining colors) and terminate the search if current number of + // palette colors is not the winner. + int prune_palette_search_level; + + // Terminate early in luma palette_size search. Speed feature values indicate + // increasing level of pruning. + // 0: No early termination + // 1: Terminate early for higher luma palette_size, if header rd cost of lower + // palette_size is more than 2 * best_rd. This level of pruning is more + // conservative when compared to sf level 2 as the cases which will get pruned + // with sf level 1 is a subset of the cases which will get pruned with sf + // level 2. + // 2: Terminate early for higher luma palette_size, if header rd cost of lower + // palette_size is more than best_rd. + // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%, + // 2.76%, 2.30%, 1.84%, 2.69%, 2.04%, 2.05% and 1.44% for speed 0, 1, 2, 3, 4, + // 5, 6, 7 and 8 on screen content set with coding performance change less + // than 0.01% for speed <= 2 and less than 0.03% for speed >= 3. For AVIF + // image encode, this sf reduces instruction count by 1.94%, 1.13%, 1.29%, + // 0.93%, 0.89%, 1.03%, 1.07%, 1.20% and 0.18% for speed 0, 1, 2, 3, 4, 5, 6, + // 7 and 8 on a typical image dataset with coding performance change less than + // 0.01%. 
+ int prune_luma_palette_size_search_level; + + // Prune chroma intra modes based on luma intra mode winner. + // 0: No pruning + // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED, + // UV_CFL_PRED and the mode that corresponds to luma intra mode winner. + int prune_chroma_modes_using_luma_winner; + + // Clip the frequency of updating the mv cost for intrabc. + INTERNAL_COST_UPDATE_TYPE dv_cost_upd_level; + + // We use DCT_DCT transform followed by computing SATD (Sum of Absolute + // Transformed Differences) as an estimation of RD score to quickly find the + // best possible Chroma from Luma (CFL) parameter. Then we do a full RD search + // near the best possible parameter. The search range is set here. + // The range of cfl_searh_range should be [1, 33], and the following are the + // recommended values. + // 1: Fastest mode. + // 3: Default mode that provides good speedup without losing compression + // performance at speed 0. + // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only + // be used for debugging purpose. + int cfl_search_range; + + // TOP_INTRA_MODEL_COUNT is 4 that is the number of top model rd to store in + // intra mode decision. Here, add a speed feature to reduce this number for + // higher speeds. + int top_intra_model_count_allowed; + + // Adapt top_intra_model_count_allowed locally to prune luma intra modes using + // neighbor block and quantizer information. + int adapt_top_model_rd_count_using_neighbors; + + // Prune the evaluation of odd delta angles of directional luma intra modes by + // using the rdcosts of neighbouring delta angles. + // For allintra encode, this speed feature reduces instruction count + // by 4.461%, 3.699% and 3.536% for speed 6, 7 and 8 on a typical video + // dataset with coding performance change less than 0.26%. For AVIF image + // encode, this speed feature reduces encode time by 2.849%, 2.471%, + // and 2.051% for speed 6, 7 and 8 on a typical image dataset with coding + // performance change less than 0.27%. + int prune_luma_odd_delta_angles_in_intra; + + // Terminate early in chroma palette_size search. + // 0: No early termination + // 1: Terminate early for higher palette_size, if header rd cost of lower + // palette_size is more than best_rd. + // For allintra encode, this sf reduces instruction count by 0.45%, + // 0.62%, 1.73%, 2.50%, 2.89%, 3.09% and 3.86% for speed 0 to 6 on screen + // content set with coding performance change less than 0.01%. + // For AVIF image encode, this sf reduces instruction count by 0.45%, 0.81%, + // 0.85%, 1.05%, 1.45%, 1.66% and 1.95% for speed 0 to 6 on a typical image + // dataset with no quality drop. + int early_term_chroma_palette_size_search; + + // Skips the evaluation of filter intra modes in inter frames if rd evaluation + // of luma intra dc mode results in invalid rd stats. + int skip_filter_intra_in_inter_frames; +} INTRA_MODE_SPEED_FEATURES; + +typedef struct TX_SPEED_FEATURES { + // Init search depth for square and rectangular transform partitions. + // Values: + // 0 - search full tree, 1: search 1 level, 2: search the highest level only + int inter_tx_size_search_init_depth_sqr; + int inter_tx_size_search_init_depth_rect; + int intra_tx_size_search_init_depth_sqr; + int intra_tx_size_search_init_depth_rect; + + // If any dimension of a coding block size above 64, always search the + // largest transform only, since the largest transform block size is 64x64. 
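+  // Example: a 128x128 or 128x64 coding block then only evaluates the 64x64
+  // transform size rather than also trying the smaller transform partitions.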
+ int tx_size_search_lgr_block; + + TX_TYPE_SEARCH tx_type_search; + + // Skip split transform block partition when the collocated bigger block + // is selected as all zero coefficients. + int txb_split_cap; + + // Shortcut the transform block partition and type search when the target + // rdcost is relatively lower. + // Values are 0 (not used) , or 1 - 2 with progressively increasing + // aggressiveness + int adaptive_txb_search_level; + + // Prune level for tx_size_type search for inter based on rd model + // 0: no pruning + // 1-2: progressively increasing aggressiveness of pruning + int model_based_prune_tx_search_level; + + // Refine TX type after fast TX search. + int refine_fast_tx_search_results; + + // Prune transform split/no_split eval based on residual properties. A value + // of 0 indicates no pruning, and the aggressiveness of pruning progressively + // increases from levels 1 to 3. + int prune_tx_size_level; + + // Prune the evaluation of transform depths as decided by the NN model. + // false: No pruning. + // true : Avoid the evaluation of specific transform depths using NN model. + // + // For allintra encode, this speed feature reduces instruction count + // by 4.76%, 8.92% and 11.28% for speed 6, 7 and 8 with coding performance + // change less than 0.32%. For AVIF image encode, this speed feature reduces + // encode time by 4.65%, 9.16% and 10.45% for speed 6, 7 and 8 on a typical + // image dataset with coding performance change less than 0.19%. + bool prune_intra_tx_depths_using_nn; + + // Enable/disable early breakout during transform search of intra modes, by + // using the minimum rd cost possible. By using this approach, the rd + // evaluation of applicable transform blocks (in the current block) can be + // avoided as + // 1) best_rd evolves during the search in choose_tx_size_type_from_rd() + // 2) appropriate ref_best_rd is passed in intra_block_yrd() + // + // For allintra encode, this speed feature reduces instruction count + // by 1.11%, 1.08%, 1.02% and 0.93% for speed 3, 6, 7 and 8 with coding + // performance change less than 0.02%. For AVIF image encode, this speed + // feature reduces encode time by 0.93%, 1.46%, 1.07%, 0.84%, 0.99% and 0.73% + // for speed 3, 4, 5, 6, 7 and 8 on a typical image dataset with coding + // performance change less than 0.004%. + bool use_rd_based_breakout_for_intra_tx_search; +} TX_SPEED_FEATURES; + +typedef struct RD_CALC_SPEED_FEATURES { + // Fast approximation of av1_model_rd_from_var_lapndz + int simple_model_rd_from_var; + + // Perform faster distortion computation during the R-D evaluation by trying + // to approximate the prediction error with transform coefficients (faster but + // less accurate) rather than computing distortion in the pixel domain (slower + // but more accurate). The following methods are used for distortion + // computation: + // Method 0: Always compute distortion in the pixel domain + // Method 1: Based on block error, try using transform domain distortion for + // tx_type search and compute distortion in pixel domain for final RD_STATS + // Method 2: Based on block error, try to compute distortion in transform + // domain + // Methods 1 and 2 may fallback to computing distortion in the pixel domain in + // case the block error is less than the threshold, which is controlled by the + // speed feature tx_domain_dist_thres_level. 
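+  // Rough shape of the two estimates (illustrative; the actual helpers and
+  // scaling differ):
+  //   pixel domain:     dist  = sum_i (src[i] - recon[i])^2
+  //   transform domain: dist ~= sum_i (coeff[i] - dqcoeff[i])^2, scaled back
+  //                     to the pixel-domain range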
+ // + // The speed feature tx_domain_dist_level decides which of the above methods + // needs to be used across different mode evaluation stages as described + // below: + // Eval type: Default Mode Winner + // Level 0 : Method 0 Method 2 Method 0 + // Level 1 : Method 1 Method 2 Method 0 + // Level 2 : Method 2 Method 2 Method 0 + // Level 3 : Method 2 Method 2 Method 2 + int tx_domain_dist_level; + + // Transform domain distortion threshold level + int tx_domain_dist_thres_level; + + // Trellis (dynamic programming) optimization of quantized values + TRELLIS_OPT_TYPE optimize_coefficients; + + // Use hash table to store macroblock RD search results + // to avoid repeated search on the same residue signal. + int use_mb_rd_hash; + + // Flag used to control the extent of coeff R-D optimization + int perform_coeff_opt; +} RD_CALC_SPEED_FEATURES; + +typedef struct WINNER_MODE_SPEED_FEATURES { + // Flag used to control the winner mode processing for better R-D optimization + // of quantized coeffs + int enable_winner_mode_for_coeff_opt; + + // Flag used to control the winner mode processing for transform size + // search method + int enable_winner_mode_for_tx_size_srch; + + // Control transform size search level + // Eval type: Default Mode Winner + // Level 0 : FULL RD LARGEST ALL FULL RD + // Level 1 : FAST RD LARGEST ALL FULL RD + // Level 2 : LARGEST ALL LARGEST ALL FULL RD + // Level 3 : LARGEST ALL LARGEST ALL LARGEST ALL + int tx_size_search_level; + + // Flag used to control the winner mode processing for use transform + // domain distortion + int enable_winner_mode_for_use_tx_domain_dist; + + // Flag used to enable processing of multiple winner modes + MULTI_WINNER_MODE_TYPE multi_winner_mode_type; + + // Motion mode for winner candidates: + // 0: speed feature OFF + // 1 / 2 : Use configured number of winner candidates + int motion_mode_for_winner_cand; + + // Controls the prediction of transform skip block or DC only block. + // + // Different speed feature values (0 to 3) decide the aggressiveness of + // prediction (refer to predict_dc_levels[][] in speed_features.c) to be used + // during different mode evaluation stages. + int dc_blk_pred_level; + + // If on, disables interpolation filter search in handle_inter_mode loop, and + // performs it during winner mode processing by \ref + // tx_search_best_inter_candidates. + int winner_mode_ifs; + + // Controls the disabling of winner mode processing. Speed feature levels + // are ordered in increasing aggressiveness of pruning. The method considered + // for disabling, depends on the sf level value and it is described as below. + // 0: Do not disable + // 1: Disable for blocks with low source variance. + // 2: Disable for blocks which turn out to be transform skip (skipped based on + // eob) during MODE_EVAL stage except NEWMV mode. + // 3: Disable for blocks which turn out to be transform skip during MODE_EVAL + // stage except NEWMV mode. For high quantizers, prune conservatively based on + // transform skip (skipped based on eob) except for NEWMV mode. + // 4: Disable for blocks which turn out to be transform skip during MODE_EVAL + // stage. + int prune_winner_mode_eval_level; +} WINNER_MODE_SPEED_FEATURES; + +typedef struct LOOP_FILTER_SPEED_FEATURES { + // This feature controls how the loop filter level is determined. + LPF_PICK_METHOD lpf_pick; + + // Skip some final iterations in the determination of the best loop filter + // level. + int use_coarse_filter_level_search; + + // Control how the CDEF strength is determined. 
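+  // The methods roughly range from a full RD search over CDEF strengths down
+  // to deriving the strength directly from the quantizer (the latter is what
+  // the fastest real-time speeds tend to use).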
+ CDEF_PICK_METHOD cdef_pick_method; + + // Decoder side speed feature to add penalty for use of dual-sgr filters. + // Takes values 0 - 10, 0 indicating no penalty and each additional level + // adding a penalty of 1% + int dual_sgr_penalty_level; + + // prune sgr ep using binary search like mechanism + int enable_sgr_ep_pruning; + + // Disable loop restoration for Chroma plane + int disable_loop_restoration_chroma; + + // Disable loop restoration for luma plane + int disable_loop_restoration_luma; + + // Range of loop restoration unit sizes to search + // The minimum size is clamped against the superblock size in + // av1_pick_filter_restoration, so that the code which sets this value does + // not need to know the superblock size ahead of time. + int min_lr_unit_size; + int max_lr_unit_size; + + // Prune RESTORE_WIENER evaluation based on source variance + // 0 : no pruning + // 1 : conservative pruning + // 2 : aggressive pruning + int prune_wiener_based_on_src_var; + + // Prune self-guided loop restoration based on wiener search results + // 0 : no pruning + // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE + // 2 : pruning based on winner restoration type among RESTORE_WIENER and + // RESTORE_NONE + int prune_sgr_based_on_wiener; + + // Reduce the wiener filter win size for luma + int reduce_wiener_window_size; + + // Flag to disable Wiener Loop restoration filter. + bool disable_wiener_filter; + + // Flag to disable Self-guided Loop restoration filter. + bool disable_sgr_filter; + + // Disable the refinement search around the wiener filter coefficients. + bool disable_wiener_coeff_refine_search; + + // Whether to downsample the rows in computation of wiener stats. + int use_downsampled_wiener_stats; +} LOOP_FILTER_SPEED_FEATURES; + +typedef struct REAL_TIME_SPEED_FEATURES { + // check intra prediction for non-RD mode. + int check_intra_pred_nonrd; + + // Skip checking intra prediction. + // 0 - don't skip + // 1 - skip if TX is skipped and best mode is not NEWMV + // 2 - skip if TX is skipped + // Skipping aggressiveness increases from level 1 to 2. + int skip_intra_pred; + + // Estimate motion before calculating variance in variance-based partition + // 0 - Only use zero MV + // 1 - perform coarse ME + // 2 - perform coarse ME, and also use neighbours' MVs + // 3 - use neighbours' MVs without performing coarse ME + int estimate_motion_for_var_based_partition; + + // For nonrd_use_partition: mode of extra check of leaf partition + // 0 - don't check merge + // 1 - always check merge + // 2 - check merge and prune checking final split + // 3 - check merge and prune checking final split based on bsize and qindex + int nonrd_check_partition_merge_mode; + + // For nonrd_use_partition: check of leaf partition extra split + int nonrd_check_partition_split; + + // Implements various heuristics to skip searching modes + // The heuristics selected are based on flags + // defined in the MODE_SEARCH_SKIP_HEURISTICS enum + unsigned int mode_search_skip_flags; + + // For nonrd: Reduces ref frame search. + // 0 - low level of search prune in non last frames + // 1 - pruned search in non last frames + // 2 - more pruned search in non last frames + int nonrd_prune_ref_frame_search; + + // This flag controls the use of non-RD mode decision. + int use_nonrd_pick_mode; + + // Use ALTREF frame in non-RD mode decision. + int use_nonrd_altref_frame; + + // Use compound reference for non-RD mode. 
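+  // When enabled, two-reference (compound) prediction can be evaluated in the
+  // non-RD pick mode path; the pairs that may be tried are listed in
+  // ref_frame_comp_nonrd[] below, e.g. index 0 enables the LAST+GOLDEN pair.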
+  int use_comp_ref_nonrd;
+
+  // Reference frames for compound prediction for nonrd pickmode:
+  // LAST_GOLDEN (0), LAST_LAST2 (1), or LAST_ALTREF (2).
+  int ref_frame_comp_nonrd[3];
+
+  // use reduced ref set for real-time mode
+  int use_real_time_ref_set;
+
+  // Skip a number of expensive mode evaluations for blocks with very low
+  // temporal variance.
+  int short_circuit_low_temp_var;
+
+  // Reuse inter prediction in fast non-rd mode.
+  int reuse_inter_pred_nonrd;
+
+  // Number of best inter modes to search transform. INT_MAX - search all.
+  int num_inter_modes_for_tx_search;
+
+  // Use interpolation filter search in non-RD mode decision.
+  int use_nonrd_filter_search;
+
+  // Use simplified RD model for interpolation search and Intra
+  int use_simple_rd_model;
+
+  // For nonrd mode: use hybrid intra mode search for intra only frames based on
+  // block properties.
+  // 0 : use nonrd pick intra for all blocks
+  // 1 : use rd for bsize < 16x16, nonrd otherwise
+  // 2 : use rd for bsize < 16x16 and src var >= 101, nonrd otherwise
+  int hybrid_intra_pickmode;
+
+  // Compute variance/sse on source difference, prior to encoding superblock.
+  int source_metrics_sb_nonrd;
+
+  // Flag to indicate process for handling overshoot on slide/scene change,
+  // for real-time CBR mode.
+  OVERSHOOT_DETECTION_CBR overshoot_detection_cbr;
+
+  // Check for scene/content change detection on every frame before encoding.
+  int check_scene_detection;
+
+  // For nonrd mode: Prefer larger partition blocks in variance based partitioning
+  // 0: disabled, 1-3: increasing aggressiveness
+  int prefer_large_partition_blocks;
+
+  // uses results of temporal noise estimate
+  int use_temporal_noise_estimate;
+
+  // Parameter indicating initial search window to be used in full-pixel search
+  // for nonrd_pickmode. Range [0, MAX_MVSEARCH_STEPS - 1]. Lower value
+  // indicates larger window. If set to 0, step_param is set based on internal
+  // logic in set_mv_search_params().
+  int fullpel_search_step_param;
+
+  // Bit mask to enable or disable intra modes for each prediction block size
+  // separately, for nonrd_pickmode. Currently, the sf is not respected when
+  // 'force_intra_check' is true in 'av1_estimate_intra_mode()' function. Also,
+  // H and V pred modes allowed through this sf can be further pruned when
+  // 'prune_hv_pred_modes_using_src_sad' sf is true.
+  int intra_y_mode_bsize_mask_nrd[BLOCK_SIZES];
+
+  // Prune H and V intra prediction modes evaluation in inter frame.
+  // The sf has no impact in the following cases:
+  // i. when frame_source_sad is 1.1 times greater than avg_source_sad
+  // ii. when cyclic_refresh_segment_id_boosted is enabled
+  // iii. when SB level source sad is greater than kMedSad
+  // iv. when color sensitivity is non zero for both the chroma channels
+  bool prune_hv_pred_modes_using_src_sad;
+
+  // Skips mode checks more aggressively in nonRD mode
+  int nonrd_aggressive_skip;
+
+  // Skip cdef on 64x64 blocks.
+  // 0: disabled
+  // 1: skip when NEWMV or INTRA is not picked or color sensitivity is off.
+  // When color sensitivity is on for a superblock, all 64x64 blocks within
+  // will not skip.
+  // 2: more aggressive mode where skip is done for all frames where
+  // rc->high_source_sad = 0 (non slide-changes), and color sensitivity off.
+  int skip_cdef_sb;
+
+  // Force selective cdf update.
+  int selective_cdf_update;
+
+  // Force only single reference (LAST) for prediction.
+  int force_only_last_ref;
+
+  // Forces larger partition blocks in variance based partitioning for intra
+  // frames
+  int force_large_partition_blocks_intra;
+
+  // Use fixed partition for superblocks based on source_sad.
+  // 0: disabled
+  // 1: enabled
+  int use_fast_fixed_part;
+
+  // Increase source_sad thresholds in nonrd pickmode.
+  int increase_source_sad_thresh;
+
+  // Skip evaluation of no split in tx size selection for merge partition
+  int skip_tx_no_split_var_based_partition;
+
+  // Intermediate termination of newMV mode evaluation based on the best mode
+  // sse so far
+  int skip_newmv_mode_based_on_sse;
+
+  // Define gf length multiplier.
+  // Level 0: use large multiplier, level 1: use medium multiplier.
+  int gf_length_lvl;
+
+  // Prune inter modes with golden frame as reference for NEARMV and NEWMV modes
+  int prune_inter_modes_with_golden_ref;
+
+  // Prune inter modes w.r.t. golden or alt-ref frame based on sad
+  int prune_inter_modes_wrt_gf_arf_based_on_sad;
+
+  // Prune inter mode search in rd path based on current block's temporal
+  // variance wrt LAST reference.
+  int prune_inter_modes_using_temp_var;
+
+  // Reduce MV precision to halfpel for higher int MV value & frame-level motion
+  // 0: disabled
+  // 1-2: Reduce precision to halfpel, fullpel based on conservative
+  // thresholds, aggressiveness increases with increase in level
+  // 3: Reduce precision to halfpel using more aggressive thresholds
+  int reduce_mv_pel_precision_highmotion;
+
+  // Reduce MV precision for low complexity blocks
+  // 0: disabled
+  // 1: Reduce the mv resolution for zero mv if the variance is low
+  // 2: Switch to halfpel, fullpel based on low block spatial-temporal
+  // complexity.
+  int reduce_mv_pel_precision_lowcomplex;
+
+  // Prune intra mode evaluation in inter frames based on mv range.
+  BLOCK_SIZE prune_intra_mode_based_on_mv_range;
+  // The number of times to left shift the splitting thresholds in variance
+  // based partitioning. The minimum value should be 7 to avoid left shifting
+  // by a negative number.
+  int var_part_split_threshold_shift;
+
+  // Qindex based variance partition threshold index, which determines
+  // the aggressiveness of partition pruning
+  // 0: disabled for speeds 9,10
+  // 1,2: (rd-path) lowers qindex thresholds conditionally (for low SAD sb)
+  // 3,4: (non-rd path) uses pre-tuned qindex thresholds
+  int var_part_based_on_qidx;
+
+  // Enable GF refresh based on Q value.
+  int gf_refresh_based_on_qp;
+
+  // Temporal filtering
+  // The value can be 1 or 2, which indicates the threshold to use.
+  // Must be off for lossless mode.
+  int use_rtc_tf;
+
+  // Prune the use of the identity transform in nonrd_pickmode,
+  // used for screen content mode: only for smaller blocks
+  // and higher spatial variance, and when skip_txfm is not
+  // already set.
+  int prune_idtx_nonrd;
+
+  // Prune the use of palette mode in nonrd pickmode.
+  int prune_palette_nonrd;
+
+  // Force to only use dct for palette search in nonrd pickmode.
+  int dct_only_palette_nonrd;
+
+  // Skip loopfilter, for static content after slide change
+  // or key frame, once quality has ramped up.
+  // 0: disabled
+  // 1: skip only after quality is ramped up.
+  // 2: aggressive mode, where skip is done for all frames
+  // where rc->high_source_sad = 0 (no slide-changes).
+  int skip_lf_screen;
+
+  // For nonrd: early exit out of variance partition that sets the
+  // block size to superblock size, and sets mode to zeromv-last skip.
+ // 0: disabled + // 1: zeromv-skip is enabled at SB level only + // 2: zeromv-skip is enabled at SB level and coding block level + int part_early_exit_zeromv; + + // Early terminate inter mode search based on sse in non-rd path. + INTER_SEARCH_EARLY_TERM_IDX sse_early_term_inter_search; + + // SAD based adaptive altref selection + int sad_based_adp_altref_lag; + + // Enable/disable partition direct merging. + int partition_direct_merging; + + // Level of aggressiveness for obtaining tx size based on qstep + int tx_size_level_based_on_qstep; + + // Avoid the partitioning of a 16x16 block in variance based partitioning + // (VBP) by making use of minimum and maximum sub-block variances. + // For allintra encode, this speed feature reduces instruction count by 5.39% + // for speed 9 on a typical video dataset with coding performance gain + // of 1.44%. + // For AVIF image encode, this speed feature reduces encode time + // by 8.44% for speed 9 on a typical image dataset with coding performance + // gain of 0.78%. + bool vbp_prune_16x16_split_using_min_max_sub_blk_var; + + // A qindex threshold that determines whether to use qindex based CDEF filter + // strength estimation for screen content types. The strength estimation model + // used for screen contents prefers to allow cdef filtering for more frames. + // This sf is used to limit the frames which go through cdef filtering and + // following explains the setting of the same. + // MAXQ (255): This disables the usage of this sf. Here, frame does not use a + // screen content model thus reduces the number of frames that go through cdef + // filtering. + // MINQ (0): Frames always use screen content model thus increasing the number + // of frames that go through cdef filtering. + // This speed feature has a substantial gain on coding metrics, with moderate + // increase encoding time. Select threshold based on speed vs quality + // trade-off. + int screen_content_cdef_filter_qindex_thresh; + + // Prune compound mode if its variance is higher than the variance of single + // modes. + bool prune_compoundmode_with_singlecompound_var; + + // Allow mode cost update at frame level every couple frames. This + // overrides the command line setting --mode-cost-upd-freq=3 (never update + // except on key frame and first delta). + bool frame_level_mode_cost_update; + + // Prune H_PRED during intra mode evaluation in the nonrd path based on best + // mode so far. + // + // For allintra encode, this speed feature reduces instruction count by 1.10% + // for speed 9 with coding performance change less than 0.04%. + // For AVIF image encode, this speed feature reduces encode time by 1.03% for + // speed 9 on a typical image dataset with coding performance change less than + // 0.08%. + bool prune_h_pred_using_best_mode_so_far; + + // Enable pruning of intra mode evaluations in nonrd path based on source + // variance and best mode so far. The pruning logic is enabled only if the + // mode is not a winner mode of both the neighboring blocks (left/top). + // + // For allintra encode, this speed feature reduces instruction count by 3.96% + // for speed 9 with coding performance change less than 0.38%. + // For AVIF image encode, this speed feature reduces encode time by 3.46% for + // speed 9 on a typical image dataset with coding performance change less than + // -0.06%. + bool enable_intra_mode_pruning_using_neighbors; + + // Prune intra mode evaluations in nonrd path based on best sad so far. 
+ // + // For allintra encode, this speed feature reduces instruction count by 3.05% + // for speed 9 with coding performance change less than 0.24%. + // For AVIF image encode, this speed feature reduces encode time by 1.87% for + // speed 9 on a typical image dataset with coding performance change less than + // 0.16%. + bool prune_intra_mode_using_best_sad_so_far; + + // If compound is enabled, and the current block size is \geq BLOCK_16X16, + // limit the compound modes to GLOBAL_GLOBALMV. This does not apply to the + // base layer of svc. + bool check_only_zero_zeromv_on_large_blocks; + + // Allow for disabling cdf update for non reference frames in svc mode. + bool disable_cdf_update_non_reference_frame; + + // Prune compound modes if the single modes variances do not perform well. + bool prune_compoundmode_with_singlemode_var; + + // Skip searching all compound mode if the variance of single_mode residue is + // sufficiently low. + bool skip_compound_based_on_var; + + // Sets force_zeromv_skip based on the source sad available. Aggressiveness + // increases with increase in the level set for speed feature. + // 0: No setting + // 1: If source sad is kZeroSad + // 2: If source sad <= kVeryLowSad + int set_zeromv_skip_based_on_source_sad; + + // Downgrades the block-level subpel motion search to + // av1_find_best_sub_pixel_tree_pruned_more for higher QP and when fullpel + // search performed well, zeromv has low sad or low source_var + bool use_adaptive_subpel_search; + + // A flag used in RTC case to control frame_refs_short_signaling. Note that + // the final decision is made in check_frame_refs_short_signaling(). The flag + // can only be turned on when res < 360p and speed >= 9, in which case only + // LAST and GOLDEN ref frames are used now. + bool enable_ref_short_signaling; + + // A flag that controls if we check or bypass GLOBALMV in rtc single ref frame + // case. + bool check_globalmv_on_single_ref; + + // Allows for increasing the color_threshold for palette prediction. + // This generally leads to better coding efficiency but with some speed loss. + // Only used for screen content and for nonrd_pickmode. + bool increase_color_thresh_palette; +} REAL_TIME_SPEED_FEATURES; + +/*!\endcond */ + +/*! + * \brief Top level speed vs quality trade off data struture. + */ +typedef struct SPEED_FEATURES { + /*! + * Sequence/frame level speed features: + */ + HIGH_LEVEL_SPEED_FEATURES hl_sf; + + /*! + * Speed features for the first pass. + */ + FIRST_PASS_SPEED_FEATURES fp_sf; + + /*! + * Speed features related to how tpl's searches are done. + */ + TPL_SPEED_FEATURES tpl_sf; + + /*! + * Global motion speed features: + */ + GLOBAL_MOTION_SPEED_FEATURES gm_sf; + + /*! + * Partition search speed features: + */ + PARTITION_SPEED_FEATURES part_sf; + + /*! + * Motion search speed features: + */ + MV_SPEED_FEATURES mv_sf; + + /*! + * Inter mode search speed features: + */ + INTER_MODE_SPEED_FEATURES inter_sf; + + /*! + * Interpolation filter search speed features: + */ + INTERP_FILTER_SPEED_FEATURES interp_sf; + + /*! + * Intra mode search speed features: + */ + INTRA_MODE_SPEED_FEATURES intra_sf; + + /*! + * Transform size/type search speed features: + */ + TX_SPEED_FEATURES tx_sf; + + /*! + * RD calculation speed features: + */ + RD_CALC_SPEED_FEATURES rd_sf; + + /*! + * Two-pass mode evaluation features: + */ + WINNER_MODE_SPEED_FEATURES winner_mode_sf; + + /*! + * In-loop filter speed features: + */ + LOOP_FILTER_SPEED_FEATURES lpf_sf; + + /*! 
+ * Real-time mode speed features: + */ + REAL_TIME_SPEED_FEATURES rt_sf; +} SPEED_FEATURES; +/*!\cond */ + +struct AV1_COMP; + +/*!\endcond */ +/*!\brief Frame size independent speed vs quality trade off flags + * + *\ingroup speed_features + * + * \param[in] cpi Top - level encoder instance structure + * \param[in] speed Speed setting passed in from the command line + * + * \remark No return value but configures the various speed trade off flags + * based on the passed in speed setting. (Higher speed gives lower + * quality) + */ +void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi, + int speed); + +/*!\brief Frame size dependent speed vs quality trade off flags + * + *\ingroup speed_features + * + * \param[in] cpi Top - level encoder instance structure + * \param[in] speed Speed setting passed in from the command line + * + * \remark No return value but configures the various speed trade off flags + * based on the passed in speed setting and frame size. (Higher speed + * corresponds to lower quality) + */ +void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi, + int speed); +/*!\brief Q index dependent speed vs quality trade off flags + * + *\ingroup speed_features + * + * \param[in] cpi Top - level encoder instance structure + * \param[in] speed Speed setting passed in from the command line + * + * \remark No return value but configures the various speed trade off flags + * based on the passed in speed setting and current frame's Q index. + * (Higher speed corresponds to lower quality) + */ +void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_ diff --git a/third_party/aom/av1/encoder/superres_scale.c b/third_party/aom/av1/encoder/superres_scale.c new file mode 100644 index 0000000000..3b47909b15 --- /dev/null +++ b/third_party/aom/av1/encoder/superres_scale.c @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/superres_scale.h" +#include "av1/encoder/random.h" + +// Compute the horizontal frequency components' energy in a frame +// by calculuating the 16x4 Horizontal DCT. This is to be used to +// decide the superresolution parameters. 
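+// Roughly: the luma plane is tiled into 16x4 blocks, a horizontal DCT is
+// applied to each block, and the squared coefficient magnitudes are
+// accumulated per horizontal frequency bin k = 1..15 (illustrative formula):
+//   energy[k] ~= (1/n) * sum_over_blocks sum_{r=0..3} coeff[r * 16 + k]^2
+// The per-bin energies are then turned into a cumulative sum from the highest
+// frequency bin downwards.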
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) { + uint64_t freq_energy[16] = { 0 }; + const YV12_BUFFER_CONFIG *buf = cpi->source; + const int bd = cpi->td.mb.e_mbd.bd; + const int width = buf->y_crop_width; + const int height = buf->y_crop_height; + DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]); + int n = 0; + memset(freq_energy, 0, sizeof(freq_energy)); + if (buf->flags & YV12_FLAG_HIGHBITDEPTH) { + const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer); + for (int i = 0; i < height - 4; i += 4) { + for (int j = 0; j < width - 16; j += 16) { + av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride, + H_DCT, bd); + for (int k = 1; k < 16; ++k) { + const uint64_t this_energy = + ((int64_t)coeff[k] * coeff[k]) + + ((int64_t)coeff[k + 16] * coeff[k + 16]) + + ((int64_t)coeff[k + 32] * coeff[k + 32]) + + ((int64_t)coeff[k + 48] * coeff[k + 48]); + freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8)); + } + n++; + } + } + } else { + assert(bd == 8); + DECLARE_ALIGNED(16, int16_t, src16[16 * 4]); + for (int i = 0; i < height - 4; i += 4) { + for (int j = 0; j < width - 16; j += 16) { + for (int ii = 0; ii < 4; ++ii) + for (int jj = 0; jj < 16; ++jj) + src16[ii * 16 + jj] = + buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)]; + av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd); + for (int k = 1; k < 16; ++k) { + const uint64_t this_energy = + ((int64_t)coeff[k] * coeff[k]) + + ((int64_t)coeff[k + 16] * coeff[k + 16]) + + ((int64_t)coeff[k + 32] * coeff[k + 32]) + + ((int64_t)coeff[k + 48] * coeff[k + 48]); + freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2); + } + n++; + } + } + } + if (n) { + for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n; + // Convert to cumulative energy + for (int k = 14; k > 0; --k) energy[k] += energy[k + 1]; + } else { + for (int k = 1; k < 16; ++k) energy[k] = 1e+20; + } +} + +static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { + // Choose an arbitrary random number + static unsigned int seed = 56789; + const ResizeCfg *resize_cfg = &cpi->oxcf.resize_cfg; + if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; + uint8_t new_denom = SCALE_NUMERATOR; + + if (cpi->common.seq_params->reduced_still_picture_hdr) return SCALE_NUMERATOR; + switch (resize_cfg->resize_mode) { + case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break; + case RESIZE_FIXED: + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = resize_cfg->resize_kf_scale_denominator; + else + new_denom = resize_cfg->resize_scale_denominator; + break; + case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; + default: assert(0); + } + return new_denom; +} + +int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + // Empirically found to not be beneficial for image coding. + return oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO && + cpi->sf.hl_sf.superres_auto_search_type != SUPERRES_AUTO_SOLO && + cpi->rc.frames_to_key > 1; +} + +#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012 +#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008 +#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008 +#define SUPERRES_ENERGY_BY_AC_THRESH 0.2 + +static double get_energy_by_q2_thresh(const GF_GROUP *gf_group, + const RATE_CONTROL *rc, + int gf_frame_index) { + // TODO(now): Return keyframe thresh * factor based on frame type / pyramid + // level. 
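+  // These thresholds are later compared against the measured high-frequency
+  // energy in get_superres_denom_from_qindex_energy(): loosely, superres
+  // denominators above SCALE_NUMERATOR are only considered when the energy
+  // stays below AOMMIN(threshold * q * q,
+  // SUPERRES_ENERGY_BY_AC_THRESH * energy[1]), where threshold is the value
+  // returned here.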
+ if (gf_group->update_type[gf_frame_index] == ARF_UPDATE) { + return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME; + } else if (gf_group->update_type[gf_frame_index] == KF_UPDATE) { + if (rc->frames_to_key <= 1) + return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO; + else + return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME; + } else { + assert(0); + } + return 0; +} + +static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy, + double threshq, + double threshp) { + const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8); + const double tq = threshq * q * q; + const double tp = threshp * energy[1]; + const double thresh = AOMMIN(tq, tp); + int k; + for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) { + if (energy[k - 1] > thresh) break; + } + return 3 * SCALE_NUMERATOR - k; +} + +static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, + int sr_kf, int sr_arf) { + // Use superres for Key-frames and Alt-ref frames only. + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && + gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE) { + return SCALE_NUMERATOR; + } + if (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE && !sr_kf) { + return SCALE_NUMERATOR; + } + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !sr_arf) { + return SCALE_NUMERATOR; + } + + double energy[16]; + analyze_hor_freq(cpi, energy); + + const double energy_by_q2_thresh = + get_energy_by_q2_thresh(gf_group, &cpi->rc, cpi->gf_frame_index); + int denom = get_superres_denom_from_qindex_energy( + qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH); + /* + printf("\nenergy = ["); + for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]); + printf("]\n"); + printf("boost = %d\n", + (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE) + ? cpi->ppi->p_rc.kf_boost + : cpi->rc.gfu_boost); + printf("denom = %d\n", denom); + */ + if (av1_superres_in_recode_allowed(cpi)) { + assert(cpi->superres_mode != AOM_SUPERRES_NONE); + // Force superres to be tried in the recode loop, as full-res is also going + // to be tried anyway. + denom = AOMMAX(denom, SCALE_NUMERATOR + 1); + } + return denom; +} + +static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { + // Choose an arbitrary random number + static unsigned int seed = 34567; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + const SuperResCfg *const superres_cfg = &oxcf->superres_cfg; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; + uint8_t new_denom = SCALE_NUMERATOR; + + // Make sure that superres mode of the frame is consistent with the + // sequence-level flag. + assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE, + cpi->common.seq_params->enable_superres)); + assert(IMPLIES(!cpi->common.seq_params->enable_superres, + superres_cfg->superres_mode == AOM_SUPERRES_NONE)); + // Make sure that superres mode for current encoding is consistent with user + // provided superres mode. + assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_AUTO, + cpi->superres_mode == superres_cfg->superres_mode)); + + // Note: we must look at the current superres_mode to be tried in 'cpi' here, + // not the user given mode in 'oxcf'. 
+ switch (cpi->superres_mode) { + case AOM_SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break; + case AOM_SUPERRES_FIXED: + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = superres_cfg->superres_kf_scale_denominator; + else + new_denom = superres_cfg->superres_scale_denominator; + break; + case AOM_SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; + case AOM_SUPERRES_QTHRESH: { + // Do not use superres when screen content tools are used. + if (cpi->common.features.allow_screen_content_tools) break; + if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ) + av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height); + + // Now decide the use of superres based on 'q'. + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, + &bottom_index, &top_index); + + const int qthresh = (frame_is_intra_only(&cpi->common)) + ? superres_cfg->superres_kf_qthresh + : superres_cfg->superres_qthresh; + if (q <= qthresh) { + new_denom = SCALE_NUMERATOR; + } else { + new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); + } + break; + } + case AOM_SUPERRES_AUTO: { + if (cpi->common.features.allow_screen_content_tools) break; + if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ) + av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height); + + // Now decide the use of superres based on 'q'. + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, + &bottom_index, &top_index); + + const SUPERRES_AUTO_SEARCH_TYPE sr_search_type = + cpi->sf.hl_sf.superres_auto_search_type; + const int qthresh = (sr_search_type == SUPERRES_AUTO_SOLO) ? 128 : 0; + if (q <= qthresh) { + new_denom = SCALE_NUMERATOR; // Don't use superres. + } else { + if (sr_search_type == SUPERRES_AUTO_ALL) { + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = superres_cfg->superres_kf_scale_denominator; + else + new_denom = superres_cfg->superres_scale_denominator; + } else { + new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); + } + } + break; + } + default: assert(0); + } + return new_denom; +} + +static int dimension_is_ok(int orig_dim, int resized_dim, int denom) { + return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2); +} + +static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) { + // Only need to check the width, as scaling is horizontal only. + (void)oheight; + return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom); +} + +static int validate_size_scales(RESIZE_MODE resize_mode, + aom_superres_mode superres_mode, int owidth, + int oheight, size_params_type *rsz) { + if (dimensions_are_ok(owidth, oheight, rsz)) { // Nothing to do. + return 1; + } + + // Calculate current resize scale. + int resize_denom = + AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width), + DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height)); + + if (resize_mode != RESIZE_RANDOM && superres_mode == AOM_SUPERRES_RANDOM) { + // Alter superres scale as needed to enforce conformity. + rsz->superres_denom = + (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom; + if (!dimensions_are_ok(owidth, oheight, rsz)) { + if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom; + } + } else if (resize_mode == RESIZE_RANDOM && + superres_mode != AOM_SUPERRES_RANDOM) { + // Alter resize scale as needed to enforce conformity. 
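+    // Conformity means dimension_is_ok() above, i.e. roughly
+    //   resize_width * SCALE_NUMERATOR >= owidth * superres_denom / 2.
+    // With resize_width ~= owidth * SCALE_NUMERATOR / resize_denom this
+    // reduces to resize_denom * superres_denom <= 2 * SCALE_NUMERATOR^2,
+    // which is the cap applied below.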
+ resize_denom = + (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + if (!dimensions_are_ok(owidth, oheight, rsz)) { + if (resize_denom > SCALE_NUMERATOR) { + --resize_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + } + } + } else if (resize_mode == RESIZE_RANDOM && + superres_mode == AOM_SUPERRES_RANDOM) { + // Alter both resize and superres scales as needed to enforce conformity. + do { + if (resize_denom > rsz->superres_denom) + --resize_denom; + else + --rsz->superres_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + } while (!dimensions_are_ok(owidth, oheight, rsz) && + (resize_denom > SCALE_NUMERATOR || + rsz->superres_denom > SCALE_NUMERATOR)); + } else { // We are allowed to alter neither resize scale nor superres + // scale. + return 0; + } + return dimensions_are_ok(owidth, oheight, rsz); +} + +// Calculates resize and superres params for next frame +static size_params_type calculate_next_size_params(AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; + size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height, + SCALE_NUMERATOR }; + int resize_denom = SCALE_NUMERATOR; + if (has_no_stats_stage(cpi) && cpi->ppi->use_svc && + (cpi->common.width != cpi->oxcf.frm_dim_cfg.width || + cpi->common.height != cpi->oxcf.frm_dim_cfg.height)) { + rsz.resize_width = cpi->common.width; + rsz.resize_height = cpi->common.height; + return rsz; + } + if (is_stat_generation_stage(cpi)) return rsz; + if (resize_pending_params->width && resize_pending_params->height) { + rsz.resize_width = resize_pending_params->width; + rsz.resize_height = resize_pending_params->height; + resize_pending_params->width = resize_pending_params->height = 0; + if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE) return rsz; + } else { + resize_denom = calculate_next_resize_scale(cpi); + rsz.resize_width = frm_dim_cfg->width; + rsz.resize_height = frm_dim_cfg->height; + av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height, + resize_denom); + } + rsz.superres_denom = calculate_next_superres_scale(cpi); + if (!validate_size_scales(oxcf->resize_cfg.resize_mode, cpi->superres_mode, + frm_dim_cfg->width, frm_dim_cfg->height, &rsz)) + assert(0 && "Invalid scale parameters"); + return rsz; +} + +static void setup_frame_size_from_params(AV1_COMP *cpi, + const size_params_type *rsz) { + int encode_width = rsz->resize_width; + int encode_height = rsz->resize_height; + + AV1_COMMON *cm = &cpi->common; + cm->superres_upscaled_width = encode_width; + cm->superres_upscaled_height = encode_height; + cm->superres_scale_denominator = rsz->superres_denom; + av1_calculate_scaled_superres_size(&encode_width, &encode_height, + rsz->superres_denom); + av1_set_frame_size(cpi, encode_width, encode_height); +} + +void av1_setup_frame_size(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + // Reset superres params from previous frame. 
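+  // (A denominator equal to SCALE_NUMERATOR, i.e. 8/8, means no superres
+  // scaling; larger denominators shrink the coded width.)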
+ cm->superres_scale_denominator = SCALE_NUMERATOR; + const size_params_type rsz = calculate_next_size_params(cpi); + setup_frame_size_from_params(cpi, &rsz); + + assert(av1_is_min_tile_width_satisfied(cm)); +} + +void av1_superres_post_encode(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + + assert(cpi->oxcf.superres_cfg.enable_superres); + assert(!is_lossless_requested(&cpi->oxcf.rc_cfg)); + assert(!cm->features.all_lossless); + + av1_superres_upscale(cm, NULL, cpi->image_pyramid_levels); + + // If regular resizing is occurring the source will need to be downscaled to + // match the upscaled superres resolution. Otherwise the original source is + // used. + if (!av1_resize_scaled(cm)) { + cpi->source = cpi->unscaled_source; + if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source; + } else { + assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width); + assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height); + // Do downscale. cm->(width|height) has been updated by + // av1_superres_upscale + cpi->source = realloc_and_scale_source(cpi, cm->superres_upscaled_width, + cm->superres_upscaled_height); + } +} diff --git a/third_party/aom/av1/encoder/superres_scale.h b/third_party/aom/av1/encoder/superres_scale.h new file mode 100644 index 0000000000..450a4ed902 --- /dev/null +++ b/third_party/aom/av1/encoder/superres_scale.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SUPERRES_SCALE_H_ +#define AOM_AV1_ENCODER_SUPERRES_SCALE_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int av1_superres_in_recode_allowed(const AV1_COMP *const cpi); +void av1_superres_post_encode(AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SUPERRES_SCALE_H_ diff --git a/third_party/aom/av1/encoder/svc_layercontext.c b/third_party/aom/av1/encoder/svc_layercontext.c new file mode 100644 index 0000000000..2c99cb89b8 --- /dev/null +++ b/third_party/aom/av1/encoder/svc_layercontext.c @@ -0,0 +1,701 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include + +#include "av1/encoder/encoder.h" +#include "av1/encoder/encoder_alloc.h" + +static void swap_ptr(void *a, void *b) { + void **a_p = (void **)a; + void **b_p = (void **)b; + void *c = *a_p; + *a_p = *b_p; + *b_p = c; +} + +void av1_init_layer_context(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + SVC *const svc = &cpi->svc; + int mi_rows = cpi->common.mi_params.mi_rows; + int mi_cols = cpi->common.mi_params.mi_cols; + svc->base_framerate = 30.0; + svc->current_superframe = 0; + svc->force_zero_mode_spatial_ref = 1; + svc->num_encoded_top_layer = 0; + svc->use_flexible_mode = 0; + svc->has_lower_quality_layer = 0; + + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; + lrc->ni_av_qi = oxcf->rc_cfg.worst_allowed_q; + lp_rc->total_actual_bits = 0; + lrc->ni_tot_qi = 0; + lp_rc->tot_q = 0.0; + lp_rc->avg_q = 0.0; + lp_rc->ni_frames = 0; + lrc->decimation_count = 0; + lrc->decimation_factor = 0; + lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q); + lrc->best_quality = av1_quantizer_to_qindex(lc->min_q); + lrc->rtc_external_ratectrl = 0; + for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) { + lp_rc->rate_correction_factors[i] = 1.0; + } + lc->target_bandwidth = lc->layer_target_bitrate; + lp_rc->last_q[INTER_FRAME] = lrc->worst_quality; + lp_rc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality; + lp_rc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality; + lp_rc->buffer_level = + oxcf->rc_cfg.starting_buffer_level_ms * lc->target_bandwidth / 1000; + lp_rc->bits_off_target = lp_rc->buffer_level; + // Initialize the cyclic refresh parameters. If spatial layers are used + // (i.e., ss_number_layers > 1), these need to be updated per spatial + // layer. Cyclic refresh is only applied on base temporal layer. + if (svc->number_spatial_layers > 1 && tl == 0) { + lc->sb_index = 0; + lc->actual_num_seg1_blocks = 0; + lc->actual_num_seg2_blocks = 0; + lc->counter_encode_maxq_scene_change = 0; + aom_free(lc->map); + CHECK_MEM_ERROR(cm, lc->map, + aom_calloc(mi_rows * mi_cols, sizeof(*lc->map))); + } + } + svc->downsample_filter_type[sl] = BILINEAR; + svc->downsample_filter_phase[sl] = 8; + svc->last_layer_dropped[sl] = false; + svc->drop_spatial_layer[sl] = false; + } + if (svc->number_spatial_layers == 3) { + svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH; + } +} + +bool av1_alloc_layer_context(AV1_COMP *cpi, int num_layers) { + SVC *const svc = &cpi->svc; + if (svc->layer_context == NULL || svc->num_allocated_layers < num_layers) { + assert(num_layers > 1); + aom_free(svc->layer_context); + svc->num_allocated_layers = 0; + svc->layer_context = + (LAYER_CONTEXT *)aom_calloc(num_layers, sizeof(*svc->layer_context)); + if (svc->layer_context == NULL) return false; + svc->num_allocated_layers = num_layers; + } + return true; +} + +// Update the layer context from a change_config() call. 
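+// Layer contexts are kept in a flat array indexed as
+//   layer = spatial_layer_id * number_temporal_layers + temporal_layer_id
+// (this is what LAYER_IDS_TO_IDX expands to), so with 3 temporal layers the
+// context for (spatial 1, temporal 0) sits at index 3.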
+void av1_update_layer_context_change_config(AV1_COMP *const cpi, + const int64_t target_bandwidth) { + const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + AV1_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + int layer = 0; + int64_t spatial_layer_target = 0; + float bitrate_alloc = 1.0; + const int mi_rows = cm->mi_params.mi_rows; + const int mi_cols = cm->mi_params.mi_cols; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc->layer_context[layer].target_bandwidth = lc->layer_target_bitrate; + } + spatial_layer_target = svc->layer_context[layer].target_bandwidth; + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + LAYER_CONTEXT *const lc = + &svc->layer_context[sl * svc->number_temporal_layers + tl]; + RATE_CONTROL *const lrc = &lc->rc; + PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; + lc->spatial_layer_target_bandwidth = spatial_layer_target; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } + lp_rc->starting_buffer_level = + (int64_t)(p_rc->starting_buffer_level * bitrate_alloc); + lp_rc->optimal_buffer_level = + (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc); + lp_rc->maximum_buffer_size = + (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc); + lp_rc->bits_off_target = + AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size); + lp_rc->buffer_level = + AOMMIN(lp_rc->buffer_level, lp_rc->maximum_buffer_size); + lc->framerate = cpi->framerate / lc->framerate_factor; + lrc->avg_frame_bandwidth = + (int)round(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + lrc->rtc_external_ratectrl = rc->rtc_external_ratectrl; + lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q); + lrc->best_quality = av1_quantizer_to_qindex(lc->min_q); + if (rc->use_external_qp_one_pass) { + lrc->worst_quality = rc->worst_quality; + lrc->best_quality = rc->best_quality; + } + // Reset the cyclic refresh parameters, if needed (map is NULL), + // or number of spatial layers has changed. + // Cyclic refresh is only applied on base temporal layer. + if (svc->number_spatial_layers > 1 && tl == 0 && + (lc->map == NULL || + svc->prev_number_spatial_layers != svc->number_spatial_layers)) { + lc->sb_index = 0; + lc->actual_num_seg1_blocks = 0; + lc->actual_num_seg2_blocks = 0; + lc->counter_encode_maxq_scene_change = 0; + aom_free(lc->map); + CHECK_MEM_ERROR(cm, lc->map, + aom_calloc(mi_rows * mi_cols, sizeof(*lc->map))); + } + } + } +} + +/*!\brief Return layer context for current layer. + * + * \ingroup rate_control + * \param[in] cpi Top level encoder structure + * + * \return LAYER_CONTEXT for current layer. 
+ */ +static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) { + return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; +} + +void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + RATE_CONTROL *const lrc = &lc->rc; + const int tl = svc->temporal_layer_id; + lc->framerate = cpi->framerate / lc->framerate_factor; + lrc->avg_frame_bandwidth = (int)round(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; + // Update the average layer frame size (non-cumulative per-frame-bw). + if (tl == 0) { + lc->avg_frame_size = lrc->avg_frame_bandwidth; + } else { + int prev_layer = svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id - 1; + LAYER_CONTEXT *const lcprev = &svc->layer_context[prev_layer]; + const double prev_layer_framerate = + cpi->framerate / lcprev->framerate_factor; + const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate; + lc->avg_frame_size = + (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } +} + +static AOM_INLINE bool check_ref_is_low_spatial_res_super_frame( + int ref_frame, const SVC *svc, const RTC_REF *rtc_ref) { + int ref_frame_idx = rtc_ref->ref_idx[ref_frame - 1]; + return rtc_ref->buffer_time_index[ref_frame_idx] == svc->current_superframe && + rtc_ref->buffer_spatial_layer[ref_frame_idx] <= + svc->spatial_layer_id - 1; +} + +void av1_restore_layer_context(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + const AV1_COMMON *const cm = &cpi->common; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + const int old_frame_since_key = cpi->rc.frames_since_key; + const int old_frame_to_key = cpi->rc.frames_to_key; + const int max_consec_drop = cpi->rc.max_consec_drop; + // Restore layer rate control. + cpi->rc = lc->rc; + cpi->ppi->p_rc = lc->p_rc; + cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth; + cpi->gf_frame_index = 0; + cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude; + if (cpi->mv_search_params.max_mv_magnitude == 0) + cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height); + // Reset the frames_since_key and frames_to_key counters to their values + // before the layer restore. Keep these defined for the stream (not layer). + cpi->rc.frames_since_key = old_frame_since_key; + cpi->rc.frames_to_key = old_frame_to_key; + // Reset to value before the layer restore. + cpi->rc.max_consec_drop = max_consec_drop; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + swap_ptr(&cr->map, &lc->map); + cr->sb_index = lc->sb_index; + cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; + cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; + cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change; + } + svc->skip_mvsearch_last = 0; + svc->skip_mvsearch_gf = 0; + svc->skip_mvsearch_altref = 0; + // For each reference (LAST/GOLDEN) set the skip_mvsearch_last/gf frame flags. 
+ // This is to skip searching mv for that reference if it was last + // refreshed (i.e., buffer slot holding that reference was refreshed) on the + // previous spatial layer(s) at the same time (current_superframe). + if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref && + cpi->sf.rt_sf.use_nonrd_pick_mode) { + if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) { + svc->skip_mvsearch_last = 1; + } + if (check_ref_is_low_spatial_res_super_frame(GOLDEN_FRAME, svc, rtc_ref)) { + svc->skip_mvsearch_gf = 1; + } + if (check_ref_is_low_spatial_res_super_frame(ALTREF_FRAME, svc, rtc_ref)) { + svc->skip_mvsearch_altref = 1; + } + } +} + +void av1_svc_update_buffer_slot_refreshed(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + const unsigned int current_frame = + cpi->ppi->use_svc ? svc->current_superframe + : cpi->common.current_frame.frame_number; + // For any buffer slot that is refreshed, update it with + // the spatial_layer_id and the current_superframe. + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + // All slots are refreshed on KEY. + for (unsigned int i = 0; i < REF_FRAMES; i++) { + rtc_ref->buffer_time_index[i] = current_frame; + rtc_ref->buffer_spatial_layer[i] = svc->spatial_layer_id; + } + } else if (rtc_ref->set_ref_frame_config) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + const int ref_frame_map_idx = rtc_ref->ref_idx[i]; + if (cpi->ppi->rtc_ref.refresh[ref_frame_map_idx]) { + rtc_ref->buffer_time_index[ref_frame_map_idx] = current_frame; + rtc_ref->buffer_spatial_layer[ref_frame_map_idx] = + svc->spatial_layer_id; + } + } + } +} + +void av1_save_layer_context(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + const AV1_COMMON *const cm = &cpi->common; + LAYER_CONTEXT *lc = get_layer_context(cpi); + lc->rc = cpi->rc; + lc->p_rc = cpi->ppi->p_rc; + lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth; + lc->group_index = cpi->gf_frame_index; + lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude; + if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + signed char *temp = lc->map; + lc->map = cr->map; + cr->map = temp; + lc->sb_index = cr->sb_index; + lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; + lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; + lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; + } + av1_svc_update_buffer_slot_refreshed(cpi); + for (unsigned int i = 0; i < REF_FRAMES; i++) { + if (frame_is_intra_only(cm) || + cm->current_frame.refresh_frame_flags & (1 << i)) { + svc->spatial_layer_fb[i] = svc->spatial_layer_id; + svc->temporal_layer_fb[i] = svc->temporal_layer_id; + } + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { + svc->current_superframe++; + // Reset drop flag to false for next superframe. 
+ for (int sl = 0; sl < svc->number_spatial_layers; sl++) + svc->drop_spatial_layer[sl] = false; + } +} + +int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) { + const SVC *const svc = &cpi->svc; + const AV1_COMMON *const cm = &cpi->common; + int fb_idx = -1; + int primary_ref_frame = PRIMARY_REF_NONE; + if (cpi->svc.number_spatial_layers > 1 || + cpi->svc.number_temporal_layers > 1) { + // Set the primary_ref_frame to LAST_FRAME if that buffer slot for LAST + // was last updated on a lower temporal layer (or base TL0) and for the + // same spatial layer. For RTC patterns this allows for continued decoding + // when set of enhancement layers are dropped (continued decoding starting + // at next base TL0), so error_resilience can be off/0 for all layers. + fb_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + if (svc->spatial_layer_fb[fb_idx] == svc->spatial_layer_id && + (svc->temporal_layer_fb[fb_idx] < svc->temporal_layer_id || + svc->temporal_layer_fb[fb_idx] == 0)) { + primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME + } + } else if (cpi->ppi->rtc_ref.set_ref_frame_config) { + const ExternalFlags *const ext_flags = &cpi->ext_flags; + int flags = ext_flags->ref_frame_flags; + if (flags & AOM_LAST_FLAG) { + primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME + } else if (flags & AOM_GOLD_FLAG) { + primary_ref_frame = GOLDEN_FRAME - LAST_FRAME; + } else if (flags & AOM_ALT_FLAG) { + primary_ref_frame = ALTREF_FRAME - LAST_FRAME; + } + } + return primary_ref_frame; +} + +void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + aom_free(lc->map); + lc->map = NULL; + } + } +} + +void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = NULL; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; + if (is_key) lc->frames_from_key_frame = 0; + } + } + av1_update_temporal_layer_framerate(cpi); + av1_restore_layer_context(cpi); +} + +void av1_get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { + int w, h; + if (width_out == NULL || height_out == NULL || den == 0) return; + if (den == 1 && num == 1) { + *width_out = width_org; + *height_out = height_org; + return; + } + w = width_org * num / den; + h = height_org * num / den; + // Make height and width even. + w += w % 2; + h += h % 2; + *width_out = w; + *height_out = h; +} + +void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + AV1_COMMON *const cm = &cpi->common; + LAYER_CONTEXT *lc = NULL; + int width = 0, height = 0; + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; + // Set the lower quality layer flag. 
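+ // (A lower quality layer here means the preceding spatial layer uses scaling
+ // factors num/den = 1/1, i.e. the same resolution encoded at a lower bitrate,
+ // so it can serve as a reference for the current layer.)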
+ svc->has_lower_quality_layer = 0; + if (cpi->svc.spatial_layer_id > 0) { + const LAYER_CONTEXT *lc_prev = + &svc->layer_context[(svc->spatial_layer_id - 1) * + svc->number_temporal_layers + + svc->temporal_layer_id]; + if (lc_prev->scaling_factor_den == 1 && lc_prev->scaling_factor_num == 1) + svc->has_lower_quality_layer = 1; + } + av1_get_layer_resolution(cpi->oxcf.frm_dim_cfg.width, + cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num, + lc->scaling_factor_den, &width, &height); + // Use Eightap_smooth for low resolutions. + if (width * height <= 320 * 240) + svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH; + + cm->width = width; + cm->height = height; + alloc_mb_mode_info_buffers(cpi); + av1_update_frame_size(cpi); + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { + svc->mi_cols_full_resoln = cm->mi_params.mi_cols; + svc->mi_rows_full_resoln = cm->mi_params.mi_rows; + } +} + +enum { + SVC_LAST_FRAME = 0, + SVC_LAST2_FRAME, + SVC_LAST3_FRAME, + SVC_GOLDEN_FRAME, + SVC_BWDREF_FRAME, + SVC_ALTREF2_FRAME, + SVC_ALTREF_FRAME +}; + +// For fixed svc mode: fixed pattern is set based on the number of +// spatial and temporal layers, and the ksvc_fixed_mode. +void av1_set_svc_fixed_mode(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + int i; + assert(svc->use_flexible_mode == 0); + // Fixed SVC mode only supports at most 3 spatial or temporal layers. + assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 && + svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3); + rtc_ref->set_ref_frame_config = 1; + int superframe_cnt = svc->current_superframe; + // Set the reference map buffer idx for the 7 references: + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = i; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->reference[i] = 0; + for (i = 0; i < REF_FRAMES; i++) rtc_ref->refresh[i] = 0; + // Always reference LAST, and reference GOLDEN on SL > 0. + // For KSVC: GOLDEN reference will be removed on INTER_FRAMES later + // when frame_type is set. + rtc_ref->reference[SVC_LAST_FRAME] = 1; + if (svc->spatial_layer_id > 0) rtc_ref->reference[SVC_GOLDEN_FRAME] = 1; + if (svc->temporal_layer_id == 0) { + // Base temporal layer. + if (svc->spatial_layer_id == 0) { + // Set all buffer_idx to 0. Update slot 0 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + rtc_ref->refresh[0] = 1; + } else if (svc->spatial_layer_id == 1) { + // Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to + // slot 0. Update slot 1 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 1; + rtc_ref->refresh[1] = 1; + } else if (svc->spatial_layer_id == 2) { + // Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to + // slot 1. Update slot 2 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 1; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 2; + rtc_ref->refresh[2] = 1; + } + } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) { + // First top temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Reference LAST (slot 0). + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to slot 0. 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3; + rtc_ref->refresh[3] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // Set LAST2 to slot 4 and Update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 3; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 1; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4; + rtc_ref->refresh[4] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 4. + // No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 4; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 2; + } + } else if (svc->temporal_layer_id == 1) { + // Middle temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1 || + svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 5; + rtc_ref->refresh[5] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST3 to slot 6 and update slot 6. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 5; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 1; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1 || + svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_LAST3_FRAME] = 6; + rtc_ref->refresh[6] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 6. + // Set LAST3 to slot 7 and update slot 7. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 6; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 2; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1) { + rtc_ref->ref_idx[SVC_LAST3_FRAME] = 7; + rtc_ref->refresh[7] = 1; + } + } + } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) { + // Second top temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 5; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3; + rtc_ref->refresh[3] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 6; + rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4; + rtc_ref->refresh[4] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, + // GOLDEN to slot 4. No update. 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0; + rtc_ref->ref_idx[SVC_LAST_FRAME] = 7; + rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 4; + } + } +} + +void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + // Check for reset based on avg_frame_bandwidth for spatial layer sl. + // If avg_frame_bandwidth for top temporal layer is not set + // (because enhancement layer was inactive), use the base TL0 + int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + int avg_frame_bandwidth = lrc->avg_frame_bandwidth; + int prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth; + if (avg_frame_bandwidth == 0 || prev_avg_frame_bandwidth == 0) { + // Use base TL0. + layer = LAYER_IDS_TO_IDX(sl, 0, svc->number_temporal_layers); + lc = &svc->layer_context[layer]; + lrc = &lc->rc; + avg_frame_bandwidth = lrc->avg_frame_bandwidth; + prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth; + } + if (avg_frame_bandwidth > (3 * prev_avg_frame_bandwidth >> 1) || + avg_frame_bandwidth < (prev_avg_frame_bandwidth >> 1)) { + // Reset for all temporal layers with spatial layer sl. + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc2 = &svc->layer_context[layer2]; + RATE_CONTROL *lrc2 = &lc2->rc; + PRIMARY_RATE_CONTROL *lp_rc2 = &lc2->p_rc; + PRIMARY_RATE_CONTROL *const lp_rc = &lc2->p_rc; + lrc2->rc_1_frame = 0; + lrc2->rc_2_frame = 0; + lp_rc2->bits_off_target = lp_rc->optimal_buffer_level; + lp_rc2->buffer_level = lp_rc->optimal_buffer_level; + } + } + } +} + +void av1_svc_set_last_source(AV1_COMP *const cpi, EncodeFrameInput *frame_input, + YV12_BUFFER_CONFIG *prev_source) { + frame_input->last_source = prev_source != NULL ? prev_source : NULL; + if (!cpi->ppi->use_svc && cpi->rc.prev_frame_is_dropped && + cpi->rc.frame_number_encoded > 0) { + frame_input->last_source = &cpi->svc.source_last_TL0; + } else { + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + if (cpi->svc.spatial_layer_id == 0) { + // For base spatial layer: if the LAST reference (index 0) is not + // the previous (super)frame set the last_source to the source + // corresponding to the last TL0, otherwise keep it at prev_source. + // Always use source_last_TL0 if previous base TL0 was dropped. + if (cpi->svc.current_superframe > 0) { + const int buffslot_last = rtc_ref->ref_idx[0]; + // Check if previous frame was dropped on base TL0 layer. + const int layer = + LAYER_IDS_TO_IDX(0, 0, cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + if (lrc->prev_frame_is_dropped || + rtc_ref->buffer_time_index[buffslot_last] < + cpi->svc.current_superframe - 1) { + frame_input->last_source = &cpi->svc.source_last_TL0; + } + } + } else if (cpi->svc.spatial_layer_id > 0) { + // For spatial enhancement layers: the previous source (prev_source) + // corresponds to the lower spatial layer (which is the same source so + // we can't use that), so always set the last_source to the source of the + // last TL0. 
+ if (cpi->svc.current_superframe > 0) + frame_input->last_source = &cpi->svc.source_last_TL0; + else + frame_input->last_source = NULL; + } + } +} + +int av1_svc_get_min_ref_dist(const AV1_COMP *cpi) { + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + int min_dist = INT_MAX; + const unsigned int current_frame_num = + cpi->ppi->use_svc ? cpi->svc.current_superframe + : cpi->common.current_frame.frame_number; + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + if (cpi->ppi->rtc_ref.reference[i]) { + const int ref_frame_map_idx = rtc_ref->ref_idx[i]; + const int dist = + current_frame_num - rtc_ref->buffer_time_index[ref_frame_map_idx]; + if (dist < min_dist) min_dist = dist; + } + } + return min_dist; +} + +void av1_svc_set_reference_was_previous(AV1_COMP *cpi) { + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; + // Check if the encoded frame had some reference that was the + // previous frame. + const unsigned int current_frame = + cpi->ppi->use_svc ? cpi->svc.current_superframe + : cpi->common.current_frame.frame_number; + rtc_ref->reference_was_previous_frame = true; + if (current_frame > 0) { + rtc_ref->reference_was_previous_frame = false; + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + if (rtc_ref->reference[i]) { + const int ref_frame_map_idx = rtc_ref->ref_idx[i]; + if (rtc_ref->buffer_time_index[ref_frame_map_idx] == current_frame - 1) + rtc_ref->reference_was_previous_frame = true; + } + } + } +} diff --git a/third_party/aom/av1/encoder/svc_layercontext.h b/third_party/aom/av1/encoder/svc_layercontext.h new file mode 100644 index 0000000000..93118be2d4 --- /dev/null +++ b/third_party/aom/av1/encoder/svc_layercontext.h @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ +#define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ + +#include "aom_scale/yv12config.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief The stucture of quantities related to each spatial and temporal layer. + * \ingroup SVC + */ +typedef struct { + /*!\cond */ + RATE_CONTROL rc; + PRIMARY_RATE_CONTROL p_rc; + int framerate_factor; + int64_t layer_target_bitrate; // In bits per second. + int scaling_factor_num; + int scaling_factor_den; + int64_t target_bandwidth; + int64_t spatial_layer_target_bandwidth; + double framerate; + int avg_frame_size; + int max_q; + int min_q; + int frames_from_key_frame; + /*!\endcond */ + + /*! + * Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame. + */ + int sb_index; + /*! + * Segmentation map + */ + int8_t *map; + /*! + * Number of blocks on segment 1 + */ + int actual_num_seg1_blocks; + + /*! + * Number of blocks on segment 2 + */ + int actual_num_seg2_blocks; + /*! + * Counter used to detect scene change. + */ + int counter_encode_maxq_scene_change; + + /*! + * Speed settings for each layer. + */ + uint8_t speed; + /*! + * GF group index. + */ + unsigned char group_index; + /*! + * If current layer is key frame. + */ + int is_key_frame; + /*! + * Maximum motion magnitude of previous encoded layer. 
+ */ + int max_mv_magnitude; +} LAYER_CONTEXT; + +/*! + * \brief The stucture of SVC. + * \ingroup SVC + */ +typedef struct SVC { + /*!\cond */ + int spatial_layer_id; + int temporal_layer_id; + int number_spatial_layers; + int number_temporal_layers; + int prev_number_spatial_layers; + int use_flexible_mode; + int ksvc_fixed_mode; + /*!\endcond */ + + /*!\cond */ + double base_framerate; + unsigned int current_superframe; + int skip_mvsearch_last; + int skip_mvsearch_gf; + int skip_mvsearch_altref; + int spatial_layer_fb[REF_FRAMES]; + int temporal_layer_fb[REF_FRAMES]; + int num_encoded_top_layer; + int first_layer_denoise; + YV12_BUFFER_CONFIG source_last_TL0; + int mi_cols_full_resoln; + int mi_rows_full_resoln; + /*!\endcond */ + + /*! + * Layer context used for rate control in CBR mode. + * An array. The index for spatial layer `sl` and temporal layer `tl` is + * sl * number_temporal_layers + tl. + */ + LAYER_CONTEXT *layer_context; + + /*! + * Number of layers allocated for layer_context. If nonzero, must be greater + * than or equal to number_spatial_layers * number_temporal_layers. + */ + int num_allocated_layers; + + /*! + * EIGHTTAP_SMOOTH or BILINEAR + */ + InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS]; + + /*! + * Downsample_filter_phase: = 0 will do sub-sampling (no weighted average), + * = 8 will center the target pixel and get a symmetric averaging filter. + */ + int downsample_filter_phase[AOM_MAX_SS_LAYERS]; + + /*! + * Force zero-mv in mode search for the spatial/inter-layer reference. + */ + int force_zero_mode_spatial_ref; + + /*! + * Flag to indicate that current spatial layer has a lower quality layer + * (at the same timestamp) that can be used as a reference. + * Lower quality layer refers to the same resolution but encoded at + * different/lower bitrate. + */ + int has_lower_quality_layer; + + /*! + * Flag to indicate the frame drop mode for SVC: one of the two settings: + * AOM_LAYER_DROP (default) or AOM_FULL_SUPERFRAME_DROP. + */ + AOM_SVC_FRAME_DROP_MODE framedrop_mode; + + /*! + * Flag to indicate if frame was dropped for a given spatial_layer_id on + * previous superframe. + */ + bool last_layer_dropped[AOM_MAX_SS_LAYERS]; + + /*! + * Flag to indicate if a previous spatial was dropped for the same superframe. + */ + bool drop_spatial_layer[AOM_MAX_SS_LAYERS]; +} SVC; + +struct AV1_COMP; +struct EncodeFrameInput; + +/*!\brief Initialize layer context data from init_config(). + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Nothing returned. Set cpi->svc. + */ +void av1_init_layer_context(struct AV1_COMP *const cpi); + +/*!\brief Allocate layer context data. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] num_layers Number of layers to be allocated + * + * \remark Allocates memory for cpi->svc.layer_context. + * \return True on success, false on allocation failure. + */ +bool av1_alloc_layer_context(struct AV1_COMP *cpi, int num_layers); + +/*!\brief Update the layer context from a change_config() call. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] target_bandwidth Total target bandwidth + * + * \remark Nothing returned. Buffer level for each layer is set. 
+ */ +void av1_update_layer_context_change_config(struct AV1_COMP *const cpi, + const int64_t target_bandwidth); + +/*!\brief Prior to encoding the frame, update framerate-related quantities + for the current temporal layer. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Nothing returned. Frame related quantities for current temporal + layer are updated. + */ +void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi); + +/*!\brief Prior to encoding the frame, set the layer context, for the current + layer to be encoded, to the cpi struct. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \remark Nothing returned. Layer context for current layer is set. + */ +void av1_restore_layer_context(struct AV1_COMP *const cpi); + +/*!\brief Save the layer context after encoding the frame. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + */ +void av1_save_layer_context(struct AV1_COMP *const cpi); + +/*!\brief Free the memory used for cyclic refresh in layer context. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + */ +void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi); + +/*!\brief Reset on key frame: reset counters, references and buffer updates. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] is_key Whether current layer is key frame + */ +void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key); + +/*!\brief Before encoding, set resolutions and allocate compressor data. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + */ +void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi); + +/*!\brief Get primary reference frame for current layer + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * + * \return The primary reference frame for current layer. + */ +int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi); + +/*!\brief Get resolution for current layer. + * + * \ingroup SVC + * \param[in] width_org Original width, unscaled + * \param[in] height_org Original height, unscaled + * \param[in] num Numerator for the scale ratio + * \param[in] den Denominator for the scale ratio + * \param[in] width_out Output width, scaled for current layer + * \param[in] height_out Output height, scaled for current layer + * + * \remark Nothing is returned. Instead the scaled width and height are set. 
+ */ +void av1_get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out); + +void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi); + +void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi); + +void av1_svc_set_last_source(struct AV1_COMP *const cpi, + struct EncodeFrameInput *frame_input, + YV12_BUFFER_CONFIG *prev_source); + +void av1_svc_update_buffer_slot_refreshed(struct AV1_COMP *const cpi); + +int av1_svc_get_min_ref_dist(const struct AV1_COMP *cpi); + +void av1_svc_set_reference_was_previous(struct AV1_COMP *cpi); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c new file mode 100644 index 0000000000..7d4d25de6a --- /dev/null +++ b/third_party/aom/av1/encoder/temporal_filter.c @@ -0,0 +1,1520 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/mathutils.h" +#include "aom_dsp/odintrin.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_scale/aom_scale.h" +#include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/gop_structure.h" +#include "av1/encoder/intra_mode_search_utils.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/pass2_strategy.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/temporal_filter.h" + +/*!\cond */ + +// NOTE: All `tf` in this file means `temporal filtering`. + +// Forward Declaration. +static void tf_determine_block_partition(const MV block_mv, const int block_mse, + MV *subblock_mvs, int *subblock_mses); + +// This function returns the minimum and maximum log variances for 4x4 sub +// blocks in the current block. +static INLINE void get_log_var_4x4sub_blk( + AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const frame_to_filter, int mb_row, + int mb_col, BLOCK_SIZE block_size, double *blk_4x4_var_min, + double *blk_4x4_var_max, int is_hbd) { + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + int var_min = INT_MAX; + int var_max = 0; + + // Derive the source buffer. 
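+ // For example, a 32x32 luma block at (mb_row, mb_col) = (2, 3) with
+ // src_stride = 1920 starts at
+ // y_offset = 2 * 32 * 1920 + 3 * 32 = 122976 samples into y_buffer.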
+ const int src_stride = frame_to_filter->y_stride; + const int y_offset = mb_row * mb_height * src_stride + mb_col * mb_width; + const uint8_t *src_buf = frame_to_filter->y_buffer + y_offset; + + for (int i = 0; i < mb_height; i += MI_SIZE) { + for (int j = 0; j < mb_width; j += MI_SIZE) { + // Calculate the 4x4 sub-block variance. + const int var = av1_calc_normalized_variance( + cpi->ppi->fn_ptr[BLOCK_4X4].vf, src_buf + (i * src_stride) + j, + src_stride, is_hbd); + + // Record min and max for over-arching block + var_min = AOMMIN(var_min, var); + var_max = AOMMAX(var_max, var); + } + } + + *blk_4x4_var_min = log1p(var_min / 16.0); + *blk_4x4_var_max = log1p(var_max / 16.0); +} + +/*!\endcond */ +/*!\brief Does motion search for blocks in temporal filtering. This is + * the first step for temporal filtering. More specifically, given a frame to + * be filtered and another frame as reference, this function searches the + * reference frame to find out the most similar block as that from the frame + * to be filtered. This found block will be further used for weighted + * averaging. + * + * NOTE: Besides doing motion search for the entire block, this function will + * also do motion search for each 1/4 sub-block to get more precise + * predictions. Then, this function will determine whether to use 4 + * sub-blocks to replace the entire block. If we do need to split the + * entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to + * the searched motion vector and search error (MSE) w.r.t. each sub-block + * respectively. Otherwise, the 4 elements will be the same, all of which + * are assigned as the searched motion vector and search error (MSE) for + * the entire block. + * + * \ingroup src_frame_proc + * \param[in] cpi Top level encoder instance structure + * \param[in] mb Pointer to macroblock + * \param[in] frame_to_filter Pointer to the frame to be filtered + * \param[in] ref_frame Pointer to the reference frame + * \param[in] block_size Block size used for motion search + * \param[in] mb_row Row index of the block in the frame + * \param[in] mb_col Column index of the block in the frame + * \param[in] ref_mv Reference motion vector, which is commonly + * inherited from the motion search result of + * previous frame. + * \param[in] allow_me_for_sub_blks Flag to indicate whether motion search at + * 16x16 sub-block level is needed or not. + * \param[out] subblock_mvs Pointer to the motion vectors for + * 4 sub-blocks + * \param[out] subblock_mses Pointer to the search errors (MSE) for + * 4 sub-blocks + * + * \remark Nothing will be returned. Results are saved in subblock_mvs and + * subblock_mses + */ +static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb, + const YV12_BUFFER_CONFIG *frame_to_filter, + const YV12_BUFFER_CONFIG *ref_frame, + const BLOCK_SIZE block_size, const int mb_row, + const int mb_col, MV *ref_mv, + bool allow_me_for_sub_blks, MV *subblock_mvs, + int *subblock_mses) { + // Frame information + const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height); + + // Block information (ONLY Y-plane is used for motion search).
+ const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int mb_pels = mb_height * mb_width; + const int y_stride = frame_to_filter->y_stride; + const int src_width = frame_to_filter->y_width; + const int ref_width = ref_frame->y_width; + assert(y_stride == ref_frame->y_stride); + assert(src_width == ref_width); + const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; + + // Save input state. + MACROBLOCKD *const mbd = &mb->e_mbd; + const struct buf_2d ori_src_buf = mb->plane[0].src; + const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0]; + + // Parameters used for motion search. + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + const int step_param = av1_init_search_range( + AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height)); + const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS; + const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv; + const MV_COST_TYPE mv_cost_type = + min_frame_size >= 720 + ? MV_COST_L1_HDRES + : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES); + + // Starting position for motion search. + FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); + // Baseline position for motion search (used for rate distortion comparison). + const MV baseline_mv = kZeroMv; + + // Setup. + mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset; + mb->plane[0].src.stride = y_stride; + mb->plane[0].src.width = src_width; + mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset; + mbd->plane[0].pre[0].stride = y_stride; + mbd->plane[0].pre[0].width = ref_width; + + const SEARCH_METHODS search_method = NSTEP; + const search_site_config *search_site_cfg = + av1_get_search_site_config(cpi, mb, search_method); + + // Unused intermediate results for motion search. + unsigned int sse, error; + int distortion; + int cost_list[5]; + + // Do motion search. + int_mv best_mv; // Searched motion vector. + FULLPEL_MV_STATS best_mv_stats; + int block_mse = INT_MAX; + MV block_mv = kZeroMv; + const int q = av1_get_q(cpi); + + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size, + &baseline_mv, start_mv, search_site_cfg, + search_method, + /*fine_search_interval=*/0); + full_ms_params.run_mesh_search = 1; + full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type; + + if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) { + // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1. + full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1; + full_ms_params.mesh_search_mv_diff_threshold = 2; + } + + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), &best_mv.as_fullmv, + &best_mv_stats, NULL); + + if (force_integer_mv == 1) { // Only do full search on the entire block. + const int mv_row = best_mv.as_mv.row; + const int mv_col = best_mv.as_mv.col; + best_mv.as_mv.row = GET_MV_SUBPEL(mv_row); + best_mv.as_mv.col = GET_MV_SUBPEL(mv_col); + const int mv_offset = mv_row * y_stride + mv_col; + error = cpi->ppi->fn_ptr[block_size].vf( + ref_frame->y_buffer + y_offset + mv_offset, y_stride, + frame_to_filter->y_buffer + y_offset, y_stride, &sse); + block_mse = DIVIDE_AND_ROUND(error, mb_pels); + block_mv = best_mv.as_mv; + } else { // Do fractional search on the entire block and all sub-blocks. 
+ av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size, + &baseline_mv, cost_list); + ms_params.forced_stop = EIGHTH_PEL; + ms_params.var_params.subpel_search_type = subpel_search_type; + // Since we are merely refining the result from full pixel search, we don't + // need regularization for subpel search + ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; + best_mv_stats.err_cost = 0; + + MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + error = cpi->mv_search_params.find_fractional_mv_step( + &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv_stats, + &best_mv.as_mv, &distortion, &sse, NULL); + block_mse = DIVIDE_AND_ROUND(error, mb_pels); + block_mv = best_mv.as_mv; + *ref_mv = best_mv.as_mv; + + if (allow_me_for_sub_blks) { + // On 4 sub-blocks. + const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1]; + const int subblock_height = block_size_high[subblock_size]; + const int subblock_width = block_size_wide[subblock_size]; + const int subblock_pels = subblock_height * subblock_width; + start_mv = get_fullmv_from_mv(ref_mv); + + int subblock_idx = 0; + for (int i = 0; i < mb_height; i += subblock_height) { + for (int j = 0; j < mb_width; j += subblock_width) { + const int offset = i * y_stride + j; + mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset; + mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset; + av1_make_default_fullpel_ms_params( + &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv, + search_site_cfg, search_method, + /*fine_search_interval=*/0); + full_ms_params.run_mesh_search = 1; + full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type; + + if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) { + // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1. + full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1; + full_ms_params.mesh_search_mv_diff_threshold = 2; + } + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv.as_fullmv, &best_mv_stats, NULL); + + av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size, + &baseline_mv, cost_list); + ms_params.forced_stop = EIGHTH_PEL; + ms_params.var_params.subpel_search_type = subpel_search_type; + // Since we are merely refining the result from full pixel search, we + // don't need regularization for subpel search + ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; + best_mv_stats.err_cost = 0; + + subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + assert( + av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + error = cpi->mv_search_params.find_fractional_mv_step( + &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, + &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL); + subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels); + subblock_mvs[subblock_idx] = best_mv.as_mv; + ++subblock_idx; + } + } + } + } + + // Restore input state. + mb->plane[0].src = ori_src_buf; + mbd->plane[0].pre[0] = ori_pre_buf; + + // Make partition decision. + if (allow_me_for_sub_blks) { + tf_determine_block_partition(block_mv, block_mse, subblock_mvs, + subblock_mses); + } else { + // Copy 32X32 block mv and mse values to sub blocks + for (int i = 0; i < 4; ++i) { + subblock_mvs[i] = block_mv; + subblock_mses[i] = block_mse; + } + } + // Do not pass down the reference motion vector if error is too large. 
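+ // For example, with min_frame_size >= 720 the 8-bit MSE threshold below is
+ // 12; for 10-bit input the (bd - 8) shift scales it to 12 << 2 = 48.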
+ const int thresh = (min_frame_size >= 720) ? 12 : 3; + if (block_mse > (thresh << (mbd->bd - 8))) { + *ref_mv = kZeroMv; + } +} +/*!\cond */ + +// Determines whether to split the entire block to 4 sub-blocks for filtering. +// In particular, this decision is made based on the comparison between the +// motion search error of the entire block and the errors of all sub-blocks. +// Inputs: +// block_mv: Motion vector for the entire block (ONLY as reference). +// block_mse: Motion search error (MSE) for the entire block (ONLY as +// reference). +// subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be +// modified based on the partition decision). +// subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will +// be modified based on the partition decision). +// Returns: +// Nothing will be returned. Results are saved in `subblock_mvs` and +// `subblock_mses`. +static void tf_determine_block_partition(const MV block_mv, const int block_mse, + MV *subblock_mvs, int *subblock_mses) { + int min_subblock_mse = INT_MAX; + int max_subblock_mse = INT_MIN; + int64_t sum_subblock_mse = 0; + for (int i = 0; i < 4; ++i) { + sum_subblock_mse += subblock_mses[i]; + min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]); + max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]); + } + + // TODO(any): The following magic numbers may be tuned to improve the + // performance OR find a way to get rid of these magic numbers. + if (((block_mse * 15 < sum_subblock_mse * 4) && + max_subblock_mse - min_subblock_mse < 48) || + ((block_mse * 14 < sum_subblock_mse * 4) && + max_subblock_mse - min_subblock_mse < 24)) { // No split. + for (int i = 0; i < 4; ++i) { + subblock_mvs[i] = block_mv; + subblock_mses[i] = block_mse; + } + } +} + +// Helper function to determine whether a frame is encoded with high bit-depth. +static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) { + return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; +} + +/*!\endcond */ +/*!\brief Builds predictor for blocks in temporal filtering. This is the + * second step for temporal filtering, which is to construct predictions from + * all reference frames INCLUDING the frame to be filtered itself. These + * predictors are built based on the motion search results (motion vector is + * set as 0 for the frame to be filtered), and will be further used for + * weighted averaging. + * + * \ingroup src_frame_proc + * \param[in] ref_frame Pointer to the reference frame (or the frame + * to be filtered) + * \param[in] mbd Pointer to the block for filtering. Besides + * containing the subsampling information of all + * planes, this field also gives the searched + * motion vector for the entire block, i.e., + * `mbd->mi[0]->mv[0]`. This vector should be 0 + * if the `ref_frame` itself is the frame to be + * filtered.
+ * \param[in] block_size Size of the block + * \param[in] mb_row Row index of the block in the frame + * \param[in] mb_col Column index of the block in the frame + * \param[in] num_planes Number of planes in the frame + * \param[in] scale Scaling factor + * \param[in] subblock_mvs The motion vectors for each sub-block (row-major + * order) + * \param[out] pred Pointer to the predictor to be built + * + * \remark Nothing returned, but the contents of `pred` will be modified + */ +static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame, + const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, + const int mb_col, const int num_planes, + const struct scale_factors *scale, + const MV *subblock_mvs, uint8_t *pred) { + // Information of the entire block. + const int mb_height = block_size_high[block_size]; // Height. + const int mb_width = block_size_wide[block_size]; // Width. + const int mb_y = mb_height * mb_row; // Y-coord (Top-left). + const int mb_x = mb_width * mb_col; // X-coord (Top-left). + const int bit_depth = mbd->bd; // Bit depth. + const int is_intrabc = 0; // Is intra-copied? + const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame); + + // Default interpolation filters. + const int_interpfilters interp_filters = + av1_broadcast_interp_filter(MULTITAP_SHARP2); + + // Handle Y-plane, U-plane and V-plane (if needed) in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const int subsampling_y = mbd->plane[plane].subsampling_y; + const int subsampling_x = mbd->plane[plane].subsampling_x; + // Information of each sub-block in current plane. + const int plane_h = mb_height >> subsampling_y; // Plane height. + const int plane_w = mb_width >> subsampling_x; // Plane width. + const int plane_y = mb_y >> subsampling_y; // Y-coord (Top-left). + const int plane_x = mb_x >> subsampling_x; // X-coord (Top-left). + const int h = plane_h >> 1; // Sub-block height. + const int w = plane_w >> 1; // Sub-block width. + const int is_y_plane = (plane == 0); // Is Y-plane? + + const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane], + ref_frame->widths[is_y_plane ? 0 : 1], + ref_frame->heights[is_y_plane ? 0 : 1], + ref_frame->strides[is_y_plane ? 0 : 1] }; + + // Handle each subblock. + int subblock_idx = 0; + for (int i = 0; i < plane_h; i += h) { + for (int j = 0; j < plane_w; j += w) { + // Choose proper motion vector. + const MV mv = subblock_mvs[subblock_idx++]; + assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX && + mv.col >= INT16_MIN && mv.col <= INT16_MAX); + + const int y = plane_y + i; + const int x = plane_x + j; + + // Build predictor for each sub-block on current plane. + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x, + subsampling_y, bit_depth, is_high_bitdepth, + is_intrabc, scale, &ref_buf, interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth); + av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j], + plane_w, &mv, &inter_pred_params); + } + } + plane_offset += plane_h * plane_w; + } +} +/*!\cond */ + +// Computes temporal filter weights and accumulators for the frame to be +// filtered. More concretely, the filter weights for all pixels are the same. +// Inputs: +// mbd: Pointer to the block for filtering, which is ONLY used to get +// subsampling information of all planes as well as the bit-depth. +// block_size: Size of the block. +// num_planes: Number of planes in the frame.
+// pred: Pointer to the well-built predictors. +// accum: Pointer to the pixel-wise accumulator for filtering. +// count: Pointer to the pixel-wise counter for filtering. +// Returns: +// Nothing will be returned. But the content to which `accum` and `pred` +// point will be modified. +void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame, + const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, + const int mb_row, const int mb_col, + const int num_planes, uint32_t *accum, + uint16_t *count) { + // Block information. + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int is_high_bitdepth = is_cur_buf_hbd(mbd); + + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const int subsampling_y = mbd->plane[plane].subsampling_y; + const int subsampling_x = mbd->plane[plane].subsampling_x; + const int h = mb_height >> subsampling_y; // Plane height. + const int w = mb_width >> subsampling_x; // Plane width. + + const int frame_stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1]; + const uint8_t *buf8 = ref_frame->buffers[plane]; + const uint16_t *buf16 = CONVERT_TO_SHORTPTR(buf8); + const int frame_offset = mb_row * h * frame_stride + mb_col * w; + + int pred_idx = 0; + int pixel_idx = 0; + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + const int idx = plane_offset + pred_idx; // Index with plane shift. + const int pred_value = is_high_bitdepth + ? buf16[frame_offset + pixel_idx] + : buf8[frame_offset + pixel_idx]; + accum[idx] += TF_WEIGHT_SCALE * pred_value; + count[idx] += TF_WEIGHT_SCALE; + ++pred_idx; + ++pixel_idx; + } + pixel_idx += (frame_stride - w); + } + plane_offset += h * w; + } +} + +// Function to compute pixel-wise squared difference between two buffers. +// Inputs: +// ref: Pointer to reference buffer. +// ref_offset: Start position of reference buffer for computation. +// ref_stride: Stride for reference buffer. +// tgt: Pointer to target buffer. +// tgt_offset: Start position of target buffer for computation. +// tgt_stride: Stride for target buffer. +// height: Height of block for computation. +// width: Width of block for computation. +// is_high_bitdepth: Whether the two buffers point to high bit-depth frames. +// square_diff: Pointer to save the squared differences. +// Returns: +// Nothing will be returned. But the content to which `square_diff` points +// will be modified. +static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset, + const int ref_stride, const uint8_t *tgt, + const int tgt_offset, + const int tgt_stride, const int height, + const int width, + const int is_high_bitdepth, + uint32_t *square_diff) { + const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt); + + int ref_idx = 0; + int tgt_idx = 0; + int idx = 0; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx] + : ref[ref_offset + ref_idx]; + const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx] + : tgt[tgt_offset + tgt_idx]; + const uint32_t diff = (ref_value > tgt_value) ?
(ref_value - tgt_value) + : (tgt_value - ref_value); + square_diff[idx] = diff * diff; + + ++ref_idx; + ++tgt_idx; + ++idx; + } + ref_idx += (ref_stride - width); + tgt_idx += (tgt_stride - width); + } +} + +// Function to accumulate pixel-wise squared difference between two luma buffers +// to be consumed while filtering the chroma planes. +// Inputs: +// square_diff: Pointer to squared differences from luma plane. +// luma_sse_sum: Pointer to save the sum of luma squared differences. +// block_height: Height of block for computation. +// block_width: Width of block for computation. +// ss_x_shift: Chroma subsampling shift in 'X' direction +// ss_y_shift: Chroma subsampling shift in 'Y' direction +// Returns: +// Nothing will be returned. But the content to which `luma_sse_sum` points +// will be modified. +void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum, + int block_height, int block_width, + int ss_x_shift, int ss_y_shift) { + for (int i = 0; i < block_height; ++i) { + for (int j = 0; j < block_width; ++j) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + const int ww = block_width << ss_x_shift; // Width of Y-plane. + luma_sse_sum[i * block_width + j] += square_diff[yy * ww + xx]; + } + } + } + } +} + +/*!\endcond */ +/*!\brief Applies temporal filtering. NOTE that there are various optimised + * versions of this function called where the appropriate instruction set is + * supported. + * + * \ingroup src_frame_proc + * \param[in] frame_to_filter Pointer to the frame to be filtered, which is + * used as reference to compute squared + * difference from the predictor. + * \param[in] mbd Pointer to the block for filtering, ONLY used + * to get subsampling information for the planes + * \param[in] block_size Size of the block + * \param[in] mb_row Row index of the block in the frame + * \param[in] mb_col Column index of the block in the frame + * \param[in] num_planes Number of planes in the frame + * \param[in] noise_levels Estimated noise levels for each plane + * in the frame (Y,U,V) + * \param[in] subblock_mvs Pointer to the motion vectors for 4 sub-blocks + * \param[in] subblock_mses Pointer to the search errors (MSE) for 4 + * sub-blocks + * \param[in] q_factor Quantization factor. This is actually the `q` + * defined in libaom, converted from `qindex` + * \param[in] filter_strength Filtering strength. This value lies in range + * [0, 6] where 6 is the maximum strength. + * \param[in] tf_wgt_calc_lvl Controls the weight calculation method during + * temporal filtering + * \param[out] pred Pointer to the well-built predictors + * \param[out] accum Pointer to the pixel-wise accumulator for + * filtering + * \param[out] count Pointer to the pixel-wise counter for + * filtering + * + * \remark Nothing returned, But the contents of `accum`, `pred` and 'count' + * will be modified + */ +void av1_apply_temporal_filter_c( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + // Block information. 
+ const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int mb_pels = mb_height * mb_width; + const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter); + const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred); + // Frame information. + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Decay factors for non-local mean approach. + double decay_factor[MAX_MB_PLANE] = { 0 }; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + for (int plane = 0; plane < num_planes; plane++) { + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + decay_factor[plane] = 1 / (n_decay * q_decay * s_decay); + } + double d_factor[4] = { 0 }; + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Allocate memory for pixel-wise squared differences. They, + // regardless of the subsampling, are assigned with memory of size `mb_pels`. + uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t)); + if (!square_diff) { + aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR, + "Error allocating temporal filter data"); + } + memset(square_diff, 0, mb_pels * sizeof(square_diff[0])); + + // Allocate memory for accumulated luma squared error. This value will be + // consumed while filtering the chroma planes. + uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t)); + if (!luma_sse_sum) { + aom_free(square_diff); + aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR, + "Error allocating temporal filter data"); + } + memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0])); + + // Get window size for pixel-wise filtering. + assert(TF_WINDOW_LENGTH % 2 == 1); + const int half_window = TF_WINDOW_LENGTH >> 1; + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + // Locate pixel on reference frame. + const int subsampling_y = mbd->plane[plane].subsampling_y; + const int subsampling_x = mbd->plane[plane].subsampling_x; + const int h = mb_height >> subsampling_y; // Plane height. + const int w = mb_width >> subsampling_x; // Plane width. 
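+ // In short, the per-pixel filter weight computed below follows a
+ // non-local-means style rule:
+ //   combined_error = (TF_WINDOW_BLOCK_BALANCE_WEIGHT * window_error + block_error)
+ //                    / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * TF_SEARCH_ERROR_NORM_WEIGHT)
+ //   weight = exp(-min(combined_error * d_factor * decay_factor, 7)) * TF_WEIGHT_SCALE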
+ const int frame_stride = + frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1]; + const int frame_offset = mb_row * h * frame_stride + mb_col * w; + const uint8_t *ref = frame_to_filter->buffers[plane]; + const int ss_y_shift = + subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int ss_x_shift = + subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane will + // be more accurate. The luma sse sum is reused in both chroma planes. + if (plane == AOM_PLANE_U) + compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift, + ss_y_shift); + compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w, + h, w, is_high_bitdepth, square_diff); + + // Perform filtering. + int pred_idx = 0; + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + // non-local mean approach + uint64_t sum_square_diff = 0; + + for (int wi = -half_window; wi <= half_window; ++wi) { + for (int wj = -half_window; wj <= half_window; ++wj) { + const int y = CLIP(i + wi, 0, h - 1); // Y-coord on current plane. + const int x = CLIP(j + wj, 0, w - 1); // X-coord on current plane. + sum_square_diff += square_diff[y * w + x]; + } + } + + sum_square_diff += luma_sse_sum[i * w + j]; + + // Scale down the difference for high bit depth input. + if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2); + + // Combine window error and block error, and normalize it. + const double window_error = sum_square_diff * inv_num_ref_pixels; + const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2); + const double block_error = (double)subblock_mses[subblock_idx]; + const double combined_error = + weight_factor * window_error + block_error * inv_factor; + + // Compute filter weight. + double scaled_error = + combined_error * d_factor[subblock_idx] * decay_factor[plane]; + scaled_error = AOMMIN(scaled_error, 7); + int weight; + if (tf_wgt_calc_lvl == 0) { + weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + } else { + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + weight = iroundpf(fweight); + } + + const int idx = plane_offset + pred_idx; // Index with plane shift. + const int pred_value = is_high_bitdepth ? 
 pred16[idx] : pred[idx]; + accum[idx] += weight * pred_value; + count[idx] += weight; + + ++pred_idx; + } + } + plane_offset += h * w; + } + + aom_free(square_diff); + aom_free(luma_sse_sum); +} +#if CONFIG_AV1_HIGHBITDEPTH +// Calls high bit-depth temporal filter +void av1_highbd_apply_temporal_filter_c( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col, + num_planes, noise_levels, subblock_mvs, + subblock_mses, q_factor, filter_strength, + tf_wgt_calc_lvl, pred, accum, count); +} +#endif // CONFIG_AV1_HIGHBITDEPTH +/*!\brief Normalizes the accumulated filtering result to produce the filtered + * frame + * + * \ingroup src_frame_proc + * \param[in] mbd Pointer to the block for filtering, which is + * ONLY used to get subsampling information for + * all the planes + * \param[in] block_size Size of the block + * \param[in] mb_row Row index of the block in the frame + * \param[in] mb_col Column index of the block in the frame + * \param[in] num_planes Number of planes in the frame + * \param[in] accum Pointer to the pre-computed accumulator + * \param[in] count Pointer to the pre-computed count + * \param[out] result_buffer Pointer to result buffer + * + * \remark Nothing returned, but the content to which `result_buffer` points + * will be modified + */ +static void tf_normalize_filtered_frame( + const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row, + const int mb_col, const int num_planes, const uint32_t *accum, + const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) { + // Block information. + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer); + + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const int plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const int plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const int frame_stride = result_buffer->strides[plane == 0 ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + uint8_t *const buf = result_buffer->buffers[plane]; + uint16_t *const buf16 = CONVERT_TO_SHORTPTR(buf); + + int plane_idx = 0; // Pixel index on current plane (block-base). + int frame_idx = frame_offset; // Pixel index on the entire frame.
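+ // Each output pixel is the weight-normalized accumulator rounded to nearest:
+ // filtered = (accum + count / 2) / count. For example, accum = 2500 and
+ // count = 19 give (2500 + 9) / 19 = 132.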
+ for (int i = 0; i < plane_h; ++i) { + for (int j = 0; j < plane_w; ++j) { + const int idx = plane_idx + plane_offset; + const uint16_t rounding = count[idx] >> 1; + if (is_high_bitdepth) { + buf16[frame_idx] = + (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]); + } else { + buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]); + } + ++plane_idx; + ++frame_idx; + } + frame_idx += (frame_stride - plane_w); + } + plane_offset += plane_h * plane_w; + } +} + +int av1_get_q(const AV1_COMP *cpi) { + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; + const int q = + (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type], + cpi->common.seq_params->bit_depth); + return q; +} + +void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) { + TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; + YV12_BUFFER_CONFIG **frames = tf_ctx->frames; + const int num_frames = tf_ctx->num_frames; + const int filter_frame_idx = tf_ctx->filter_frame_idx; + const int compute_frame_diff = tf_ctx->compute_frame_diff; + const struct scale_factors *scale = &tf_ctx->sf; + const double *noise_levels = tf_ctx->noise_levels; + const int num_pels = tf_ctx->num_pels; + const int q_factor = tf_ctx->q_factor; + const BLOCK_SIZE block_size = TF_BLOCK_SIZE; + const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx]; + MACROBLOCK *const mb = &td->mb; + MACROBLOCKD *const mbd = &mb->e_mbd; + TemporalFilterData *const tf_data = &td->tf_data; + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int mi_h = mi_size_high_log2[block_size]; + const int mi_w = mi_size_wide_log2[block_size]; + const int num_planes = av1_num_planes(&cpi->common); + const int weight_calc_level_in_tf = cpi->sf.hl_sf.weight_calc_level_in_tf; + uint32_t *accum = tf_data->accum; + uint16_t *count = tf_data->count; + uint8_t *pred = tf_data->pred; + + // Factor to control the filering strength. + const int filter_strength = cpi->oxcf.algo_cfg.arnr_strength; + + // Do filtering. + FRAME_DIFF *diff = &td->tf_data.diff; + av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits, + (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) { + av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits, + (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + memset(accum, 0, num_pels * sizeof(accum[0])); + memset(count, 0, num_pels * sizeof(count[0])); + MV ref_mv = kZeroMv; // Reference motion vector passed down along frames. + // Perform temporal filtering frame by frame. + + // Decide whether to perform motion search at 16x16 sub-block level or not + // based on 4x4 sub-blocks source variance. Allow motion search for split + // partition only if the difference between max and min source variance of + // 4x4 blocks is greater than a threshold (which is derived empirically). + bool allow_me_for_sub_blks = true; + if (cpi->sf.hl_sf.allow_sub_blk_me_in_tf) { + const int is_hbd = is_frame_high_bitdepth(frame_to_filter); + // Initialize minimum variance to a large value and maximum variance to 0. 
+ double blk_4x4_var_min = DBL_MAX; + double blk_4x4_var_max = 0; + get_log_var_4x4sub_blk(cpi, frame_to_filter, mb_row, mb_col, + TF_BLOCK_SIZE, &blk_4x4_var_min, &blk_4x4_var_max, + is_hbd); + // TODO(sanampudi.venkatarao@ittiam.com): Experiment and adjust the + // threshold for high bit depth. + if ((blk_4x4_var_max - blk_4x4_var_min) <= 4.0) + allow_me_for_sub_blks = false; + } + + for (int frame = 0; frame < num_frames; frame++) { + if (frames[frame] == NULL) continue; + + // Motion search. + MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv }; + int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + if (frame == + filter_frame_idx) { // Frame to be filtered. + // Change ref_mv sign for following frames. + ref_mv.row *= -1; + ref_mv.col *= -1; + } else { // Other reference frames. + tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size, + mb_row, mb_col, &ref_mv, allow_me_for_sub_blks, + subblock_mvs, subblock_mses); + } + + // Perform weighted averaging. + if (frame == filter_frame_idx) { // Frame to be filtered. + tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row, + mb_col, num_planes, accum, count); + } else { // Other reference frames. + tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col, + num_planes, scale, subblock_mvs, pred); + + // All variants of av1_apply_temporal_filter() contain floating point + // operations. Hence, clear the system state. + + // TODO(any): avx2/sse2 version should be changed to align with C + // function before using. In particular, current avx2/sse2 function + // only supports 32x32 block size and 5x5 filtering window. + if (is_frame_high_bitdepth(frame_to_filter)) { // for high bit-depth +#if CONFIG_AV1_HIGHBITDEPTH + if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) { + av1_highbd_apply_temporal_filter( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, weight_calc_level_in_tf, pred, accum, count); + } else { +#endif // CONFIG_AV1_HIGHBITDEPTH + av1_apply_temporal_filter_c( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, weight_calc_level_in_tf, pred, accum, count); +#if CONFIG_AV1_HIGHBITDEPTH + } +#endif // CONFIG_AV1_HIGHBITDEPTH + } else { + // for 8-bit + if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) { + av1_apply_temporal_filter( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, weight_calc_level_in_tf, pred, accum, count); + } else { + av1_apply_temporal_filter_c( + frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes, + noise_levels, subblock_mvs, subblock_mses, q_factor, + filter_strength, weight_calc_level_in_tf, pred, accum, count); + } + } + } + } + tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes, + accum, count, tf_ctx->output_frame); + + if (compute_frame_diff) { + const int y_height = mb_height >> mbd->plane[0].subsampling_y; + const int y_width = mb_width >> mbd->plane[0].subsampling_x; + const int source_y_stride = frame_to_filter->y_stride; + const int filter_y_stride = tf_ctx->output_frame->y_stride; + const int source_offset = + mb_row * y_height * source_y_stride + mb_col * y_width; + const int filter_offset = + mb_row * y_height * filter_y_stride + mb_col * y_width; + unsigned int sse = 0; + cpi->ppi->fn_ptr[block_size].vf( + 
frame_to_filter->y_buffer + source_offset, source_y_stride, + tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride, + &sse); + diff->sum += sse; + diff->sse += sse * (int64_t)sse; + } + } +} + +/*!\brief Does temporal filter for a given frame. + * + * \ingroup src_frame_proc + * \param[in] cpi Top level encoder instance structure + * + * \remark Nothing will be returned, but the contents of td->diff will be + modified. + */ +static void tf_do_filtering(AV1_COMP *cpi) { + // Basic information. + ThreadData *td = &cpi->td; + TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; + const struct scale_factors *scale = &tf_ctx->sf; + const int num_planes = av1_num_planes(&cpi->common); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + + MACROBLOCKD *mbd = &td->mb.e_mbd; + uint8_t *input_buffer[MAX_MB_PLANE]; + MB_MODE_INFO **input_mb_mode_info; + tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes); + tf_setup_macroblockd(mbd, &td->tf_data, scale); + + // Perform temporal filtering for each row. + for (int mb_row = 0; mb_row < tf_ctx->mb_rows; mb_row++) + av1_tf_do_filtering_row(cpi, td, mb_row); + + tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes); +} + +/*!\brief Sets up the frame buffer for temporal filtering. This function + * determines how many frames will be used for temporal filtering and then + * groups them into a buffer. This function will also estimate the noise level + * of the to-filter frame. + * + * \ingroup src_frame_proc + * \param[in] cpi Top level encoder instance structure + * \param[in] filter_frame_lookahead_idx The index of the to-filter frame + * in the lookahead buffer cpi->lookahead + * \param[in] gf_frame_index GOP index + * + * \remark Nothing will be returned. But the fields `frames`, `num_frames`, + * `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx. + */ +static void tf_setup_filtering_buffer(AV1_COMP *cpi, + int filter_frame_lookahead_idx, + int gf_frame_index) { + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index]; + const FRAME_TYPE frame_type = gf_group->frame_type[gf_frame_index]; + const int is_forward_keyframe = + av1_gop_check_forward_keyframe(gf_group, gf_frame_index); + + TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; + YV12_BUFFER_CONFIG **frames = tf_ctx->frames; + // Number of frames used for filtering. Set `arnr_max_frames` as 1 to disable + // temporal filtering. + int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1); + int num_before = 0; // Number of filtering frames before the to-filter frame. + int num_after = 0; // Number of filtering frames after the to-filter frame. + const int lookahead_depth = + av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); + + // Temporal filtering should not go beyond key frames. + const int key_to_curframe = + AOMMAX(cpi->rc.frames_since_key + filter_frame_lookahead_idx, 0); + const int curframe_to_key = + AOMMAX(cpi->rc.frames_to_key - filter_frame_lookahead_idx - 1, 0); + + // Number of buffered frames before the to-filter frame. + int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe); + + // Number of buffered frames after the to-filter frame. + int max_after = + AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key); + + // Estimate noise levels for each plane.
+ const struct lookahead_entry *to_filter_buf = av1_lookahead_peek( + cpi->ppi->lookahead, filter_frame_lookahead_idx, cpi->compressor_stage); + assert(to_filter_buf != NULL); + const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img; + const int num_planes = av1_num_planes(&cpi->common); + double *noise_levels = tf_ctx->noise_levels; + av1_estimate_noise_level(to_filter_frame, noise_levels, AOM_PLANE_Y, + num_planes - 1, cpi->common.seq_params->bit_depth, + NOISE_ESTIMATION_EDGE_THRESHOLD); + // Get quantization factor. + const int q = av1_get_q(cpi); + // Get correlation estimates from first-pass; + const FIRSTPASS_STATS *stats = + cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0); + double accu_coeff0 = 1.0, accu_coeff1 = 1.0; + for (int i = 1; i <= max_after; i++) { + if (stats + filter_frame_lookahead_idx + i >= + cpi->ppi->twopass.stats_buf_ctx->stats_in_end) { + max_after = i - 1; + break; + } + accu_coeff1 *= + AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001); + } + if (max_after >= 1) { + accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after); + } + for (int i = 1; i <= max_before; i++) { + if (stats + filter_frame_lookahead_idx - i + 1 <= + cpi->ppi->twopass.stats_buf_ctx->stats_in_start) { + max_before = i - 1; + break; + } + accu_coeff0 *= + AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001); + } + if (max_before >= 1) { + accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before); + } + + // Adjust number of filtering frames based on quantization factor. When the + // quantization factor is small enough (lossless compression), we will not + // change the number of frames for key frame filtering, which is to avoid + // visual quality drop. + int adjust_num = 6; + const int adjust_num_frames_for_arf_filtering = + cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering; + if (num_frames == 1) { // `arnr_max_frames = 1` is used to disable filtering. + adjust_num = 0; + } else if ((update_type == KF_UPDATE) && q <= 10) { + adjust_num = 0; + } else if (adjust_num_frames_for_arf_filtering > 0 && + update_type != KF_UPDATE && (cpi->rc.frames_since_key > 0)) { + // Since screen content detection happens after temporal filtering, + // 'frames_since_key' check is added to ensure the sf is disabled for the + // first alt-ref frame. + // Adjust number of frames to be considered for filtering based on noise + // level of the current frame. For low-noise frame, use more frames to + // filter such that the filtered frame can provide better predictions for + // subsequent frames and vice versa. + const uint8_t av1_adjust_num_using_noise_lvl[2][3] = { { 6, 4, 2 }, + { 4, 2, 0 } }; + const uint8_t *adjust_num_frames = + av1_adjust_num_using_noise_lvl[adjust_num_frames_for_arf_filtering - 1]; + + if (noise_levels[AOM_PLANE_Y] < 0.5) + adjust_num = adjust_num_frames[0]; + else if (noise_levels[AOM_PLANE_Y] < 1.0) + adjust_num = adjust_num_frames[1]; + else + adjust_num = adjust_num_frames[2]; + } + num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth); + + if (frame_type == KEY_FRAME) { + num_before = AOMMIN(is_forward_keyframe ? num_frames / 2 : 0, max_before); + num_after = AOMMIN(num_frames - 1, max_after); + } else { + int gfu_boost = av1_calc_arf_boost(&cpi->ppi->twopass, &cpi->twopass_frame, + &cpi->ppi->p_rc, &cpi->frame_info, + filter_frame_lookahead_idx, max_before, + max_after, NULL, NULL, 0); + + num_frames = AOMMIN(num_frames, gfu_boost / 150); + num_frames += !(num_frames & 1); // Make the number odd. 
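/*
 * Editor's sketch (illustrative, not part of the upstream patch): a worked
 * example of the window sizing above and the before/after split that follows.
 * Suppose the boost cap binds with gfu_boost = 600: num_frames becomes
 * 600 / 150 = 4, and the "make odd" step bumps it to 5. If both sides of the
 * lookahead then have at least num_frames / 2 frames available, the split is
 * symmetric: num_before = num_after = 5 / 2 = 2, i.e. a 5-frame window centred
 * on the frame being filtered.
 */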
+ + // Only use 2 neighbours for the second ARF. + if (update_type == INTNL_ARF_UPDATE) num_frames = AOMMIN(num_frames, 3); + if (AOMMIN(max_after, max_before) >= num_frames / 2) { + // just use half half + num_before = num_frames / 2; + num_after = num_frames / 2; + } else { + if (max_after < num_frames / 2) { + num_after = max_after; + num_before = AOMMIN(num_frames - 1 - num_after, max_before); + } else { + num_before = max_before; + num_after = AOMMIN(num_frames - 1 - num_before, max_after); + } + // Adjust insymmetry based on frame-level correlation + if (max_after > 0 && max_before > 0) { + if (num_after < num_before) { + const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01)); + num_before = AOMMIN(num_before, num_after + insym); + } else { + const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01)); + num_after = AOMMIN(num_after, num_before + insym); + } + } + } + } + num_frames = num_before + 1 + num_after; + + // Setup the frame buffer. + for (int frame = 0; frame < num_frames; ++frame) { + const int lookahead_idx = frame - num_before + filter_frame_lookahead_idx; + struct lookahead_entry *buf = av1_lookahead_peek( + cpi->ppi->lookahead, lookahead_idx, cpi->compressor_stage); + assert(buf != NULL); + frames[frame] = &buf->img; + } + tf_ctx->num_frames = num_frames; + tf_ctx->filter_frame_idx = num_before; + assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame); + + av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes, + cpi->common.seq_params->sb_size); + av1_setup_block_planes(&cpi->td.mb.e_mbd, + cpi->common.seq_params->subsampling_x, + cpi->common.seq_params->subsampling_y, num_planes); +} + +/*!\cond */ + +double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height, + int width, int stride, + int edge_thresh) { + int64_t accum = 0; + int count = 0; + + for (int i = 1; i < height - 1; ++i) { + for (int j = 1; j < width - 1; ++j) { + // Setup a small 3x3 matrix. + const int center_idx = i * stride + j; + int mat[3][3]; + for (int ii = -1; ii <= 1; ++ii) { + for (int jj = -1; jj <= 1; ++jj) { + const int idx = center_idx + ii * stride + jj; + mat[ii + 1][jj + 1] = src[idx]; + } + } + // Compute sobel gradients. + const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) + + 2 * (mat[1][0] - mat[1][2]); + const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) + + 2 * (mat[0][1] - mat[2][1]); + const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), 0); + // Accumulate Laplacian. + if (Ga < edge_thresh) { // Only count smooth pixels. + const int v = 4 * mat[1][1] - + 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) + + (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]); + accum += ROUND_POWER_OF_TWO(abs(v), 0); + ++count; + } + } + } + + // Return -1.0 (unreliable estimation) if there are too few smooth pixels. + return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2; +} + +#if CONFIG_AV1_HIGHBITDEPTH +double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16, + int height, int width, + const int stride, + int bit_depth, + int edge_thresh) { + int64_t accum = 0; + int count = 0; + for (int i = 1; i < height - 1; ++i) { + for (int j = 1; j < width - 1; ++j) { + // Setup a small 3x3 matrix. + const int center_idx = i * stride + j; + int mat[3][3]; + for (int ii = -1; ii <= 1; ++ii) { + for (int jj = -1; jj <= 1; ++jj) { + const int idx = center_idx + ii * stride + jj; + mat[ii + 1][jj + 1] = src16[idx]; + } + } + // Compute sobel gradients. 
+ const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) + + 2 * (mat[1][0] - mat[1][2]); + const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) + + 2 * (mat[0][1] - mat[2][1]); + const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8); + // Accumulate Laplacian. + if (Ga < edge_thresh) { // Only count smooth pixels. + const int v = 4 * mat[1][1] - + 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) + + (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]); + accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8); + ++count; + } + } + } + + // Return -1.0 (unreliable estimation) if there are too few smooth pixels. + return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2; +} +#endif + +void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame, + double *noise_level, int plane_from, int plane_to, + int bit_depth, int edge_thresh) { + for (int plane = plane_from; plane <= plane_to; plane++) { + const bool is_uv_plane = (plane != AOM_PLANE_Y); + const int height = frame->crop_heights[is_uv_plane]; + const int width = frame->crop_widths[is_uv_plane]; + const int stride = frame->strides[is_uv_plane]; + const uint8_t *src = frame->buffers[plane]; + +#if CONFIG_AV1_HIGHBITDEPTH + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const int is_high_bitdepth = is_frame_high_bitdepth(frame); + if (is_high_bitdepth) { + noise_level[plane] = av1_highbd_estimate_noise_from_single_plane( + src16, height, width, stride, bit_depth, edge_thresh); + } else { + noise_level[plane] = av1_estimate_noise_from_single_plane( + src, height, width, stride, edge_thresh); + } +#else + (void)bit_depth; + noise_level[plane] = av1_estimate_noise_from_single_plane( + src, height, width, stride, edge_thresh); +#endif + } +} + +// Initializes the members of TemporalFilterCtx +// Inputs: +// cpi: Top level encoder instance structure +// check_show_existing: If 1, check whether the filtered frame is similar +// to the original frame. +// filter_frame_lookahead_idx: The index of the frame to be filtered in the +// lookahead buffer cpi->lookahead. +// Returns: +// Nothing will be returned. But the contents of cpi->tf_ctx will be modified. +static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx, + int gf_frame_index, int compute_frame_diff, + YV12_BUFFER_CONFIG *output_frame) { + TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; + // Setup frame buffer for filtering. + YV12_BUFFER_CONFIG **frames = tf_ctx->frames; + tf_ctx->num_frames = 0; + tf_ctx->filter_frame_idx = -1; + tf_ctx->output_frame = output_frame; + tf_ctx->compute_frame_diff = compute_frame_diff; + tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, gf_frame_index); + assert(tf_ctx->num_frames > 0); + assert(tf_ctx->filter_frame_idx < tf_ctx->num_frames); + + // Setup scaling factors. Scaling on each of the arnr frames is not + // supported. + // ARF is produced at the native frame size and resized when coded. + struct scale_factors *sf = &tf_ctx->sf; + av1_setup_scale_factors_for_frame( + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + frames[0]->y_crop_width, frames[0]->y_crop_height); + + // Initialize temporal filter parameters. 
+ MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; + const int filter_frame_idx = tf_ctx->filter_frame_idx; + const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx]; + const BLOCK_SIZE block_size = TF_BLOCK_SIZE; + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int mb_width = block_size_wide[block_size]; + const int mb_height = block_size_high[block_size]; + const int mb_rows = get_num_blocks(frame_height, mb_height); + const int mb_cols = get_num_blocks(frame_width, mb_width); + const int mb_pels = mb_width * mb_height; + const int is_highbitdepth = is_frame_high_bitdepth(frame_to_filter); + const int num_planes = av1_num_planes(&cpi->common); + int num_pels = 0; + for (int i = 0; i < num_planes; i++) { + const int subsampling_x = mbd->plane[i].subsampling_x; + const int subsampling_y = mbd->plane[i].subsampling_y; + num_pels += mb_pels >> (subsampling_x + subsampling_y); + } + tf_ctx->num_pels = num_pels; + tf_ctx->mb_rows = mb_rows; + tf_ctx->mb_cols = mb_cols; + tf_ctx->is_highbitdepth = is_highbitdepth; + tf_ctx->q_factor = av1_get_q(cpi); +} + +int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame, + const FRAME_DIFF *frame_diff, int q_index, + aom_bit_depth_t bit_depth) { + const int frame_height = frame->y_crop_height; + const int frame_width = frame->y_crop_width; + const int block_height = block_size_high[TF_BLOCK_SIZE]; + const int block_width = block_size_wide[TF_BLOCK_SIZE]; + const int mb_rows = get_num_blocks(frame_height, block_height); + const int mb_cols = get_num_blocks(frame_width, block_width); + const int num_mbs = AOMMAX(1, mb_rows * mb_cols); + const float mean = (float)frame_diff->sum / num_mbs; + const float std = (float)sqrt((float)frame_diff->sse / num_mbs - mean * mean); + + const int ac_q_step = av1_ac_quant_QTX(q_index, 0, bit_depth); + const float threshold = 0.7f * ac_q_step * ac_q_step; + + if (mean < threshold && std < mean * 1.2) { + return 1; + } + return 0; +} + +void av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx, + int gf_frame_index, FRAME_DIFF *frame_diff, + YV12_BUFFER_CONFIG *output_frame) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + // Basic information of the current frame. + TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; + TemporalFilterData *tf_data = &cpi->td.tf_data; + const int compute_frame_diff = frame_diff != NULL; + // TODO(anyone): Currently, we enforce the filtering strength to be zero on + // internal ARFs except the second ARF. We should investigate in which case + // it is more beneficial to use non-zero strength filtering. + // Only parallel level 0 frames go through temporal filtering. + assert(cpi->ppi->gf_group.frame_parallel_level[gf_frame_index] == 0); + + // Initialize temporal filter context structure. + init_tf_ctx(cpi, filter_frame_lookahead_idx, gf_frame_index, + compute_frame_diff, output_frame); + + // Allocate and reset temporal filter buffers. + const int is_highbitdepth = tf_ctx->is_highbitdepth; + if (!tf_alloc_and_reset_data(tf_data, tf_ctx->num_pels, is_highbitdepth)) { + aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, + "Error allocating temporal filter data"); + } + + // Perform temporal filtering process. + if (mt_info->num_workers > 1) + av1_tf_do_filtering_mt(cpi); + else + tf_do_filtering(cpi); + + if (compute_frame_diff) { + *frame_diff = tf_data->diff; + } + // Deallocate temporal filter buffers.
+ tf_dealloc_data(tf_data, is_highbitdepth); +} + +int av1_is_temporal_filter_on(const AV1EncoderConfig *oxcf) { + return oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1; +} + +bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(oxcf); + if (tf_info->is_temporal_filter_on == 0) return true; + + const AV1_COMMON *cm = &cpi->common; + const SequenceHeader *const seq_params = cm->seq_params; + for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { + if (aom_realloc_frame_buffer( + &tf_info->tf_buf[i], oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->image_pyramid_levels, 0)) { + return false; + } + } + return true; +} + +void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) { + if (tf_info->is_temporal_filter_on == 0) return; + for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { + aom_free_frame_buffer(&tf_info->tf_buf[i]); + } + aom_free_frame_buffer(&tf_info->tf_buf_second_arf); +} + +void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info) { + av1_zero(tf_info->tf_buf_valid); + av1_zero(tf_info->tf_buf_gf_index); + av1_zero(tf_info->tf_buf_display_index_offset); +} + +void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, AV1_COMP *cpi, + const GF_GROUP *gf_group) { + if (tf_info->is_temporal_filter_on == 0) return; + const AV1_COMMON *const cm = &cpi->common; + for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) { + int update_type = gf_group->update_type[gf_index]; + if (update_type == KF_UPDATE || update_type == ARF_UPDATE) { + int buf_idx = gf_group->frame_type[gf_index] == INTER_FRAME; + int lookahead_idx = gf_group->arf_src_offset[gf_index] + + gf_group->cur_frame_idx[gf_index]; + // This function is designed to be called multiple times after + // av1_tf_info_reset(). It will only generate the filtered frame that does + // not exist yet. + if (tf_info->tf_buf_valid[buf_idx] == 0 || + tf_info->tf_buf_display_index_offset[buf_idx] != lookahead_idx) { + YV12_BUFFER_CONFIG *out_buf = &tf_info->tf_buf[buf_idx]; + av1_temporal_filter(cpi, lookahead_idx, gf_index, + &tf_info->frame_diff[buf_idx], out_buf); + aom_extend_frame_borders(out_buf, av1_num_planes(cm)); + tf_info->tf_buf_gf_index[buf_idx] = gf_index; + tf_info->tf_buf_display_index_offset[buf_idx] = lookahead_idx; + tf_info->tf_buf_valid[buf_idx] = 1; + } + } + } +} + +YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info, + int gf_index, + FRAME_DIFF *frame_diff) { + if (tf_info->is_temporal_filter_on == 0) return NULL; + YV12_BUFFER_CONFIG *out_buf = NULL; + for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { + if (tf_info->tf_buf_valid[i] && tf_info->tf_buf_gf_index[i] == gf_index) { + out_buf = &tf_info->tf_buf[i]; + *frame_diff = tf_info->frame_diff[i]; + } + } + return out_buf; +} +/*!\endcond */ diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h new file mode 100644 index 0000000000..6504b91b66 --- /dev/null +++ b/third_party/aom/av1/encoder/temporal_filter.h @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ +#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif +/*!\cond */ +struct AV1_COMP; +struct AV1EncoderConfig; +struct ThreadData; +// TODO(wtc): These two variables are only used in avx2, sse2, neon +// implementations, where the block size is still hard coded to TF_BLOCK_SIZE. +// This should be fixed to align with the c implementation. +#define BH 32 +#define BW 32 + +// Block size used in temporal filtering. +#define TF_BLOCK_SIZE BLOCK_32X32 + +// Window size for temporal filtering. +#define TF_WINDOW_LENGTH 5 + +// A constant number, sqrt(pi / 2), used for noise estimation. +static const double SQRT_PI_BY_2 = 1.25331413732; + +// Hyper-parameters used to compute filtering weight. These hyper-parameters can +// be tuned for a better performance. +// 0. A scale factor used in temporal filtering to raise the filter weight from +// `double` with range [0, 1] to `int` with range [0, 1000]. +#define TF_WEIGHT_SCALE 1000 +// 1. Weight factor used to balance the weighted-average between window error +// and block error. The weight is for window error while the weight for block +// error is always set as 1. +#define TF_WINDOW_BLOCK_BALANCE_WEIGHT 5 +// 2. Threshold for using q to adjust the filtering weight. Concretely, when +// using a small q (high bitrate), we would like to reduce the filtering +// strength such that more detailed information can be preserved. Hence, when +// q is smaller than this threshold, we will adjust the filtering weight +// based on the q-value. +#define TF_Q_DECAY_THRESHOLD 20 +// 3. Normalization factor used to normalize the motion search error. Since the +// motion search error can be large and uncontrollable, we will simply +// normalize it before using it to compute the filtering weight. +#define TF_SEARCH_ERROR_NORM_WEIGHT 20 +// 4. Threshold for using `arnr_strength` to adjust the filtering strength. +// Concretely, users can use `arnr_strength` arguments to control the +// strength of temporal filtering. When `arnr_strength` is small enough ( +// i.e., smaller than this threshold), we will adjust the filtering weight +// based on the strength value. +#define TF_STRENGTH_THRESHOLD 4 +// 5. Threshold for using motion search distance to adjust the filtering weight. +// Concretely, larger motion search vector leads to a higher probability of +// unreliable search. Hence, we would like to reduce the filtering strength +// when the distance is large enough. Considering that the distance actually +// relies on the frame size, this threshold is also a resolution-based +// threshold. Taking 720p videos as an instance, if this field equals to 0.1, +// then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold +// for 360p videos will be 360 * 0.1 = 36. +#define TF_SEARCH_DISTANCE_THRESHOLD 0.1 +// 6. Threshold to identify if the q is in a relative high range. +// Above this cutoff q, a stronger filtering is applied. +// For a high q, the quantization throws away more information, and thus a +// stronger filtering is less likely to distort the encoded quality, while a +// stronger filtering could reduce bit rates. 
+// For a low q, more details are expected to be retained. Filtering is thus +// more conservative. +#define TF_QINDEX_CUTOFF 128 + +#define NOISE_ESTIMATION_EDGE_THRESHOLD 50 + +// Sum and SSE source vs filtered frame difference returned by +// temporal filter. +typedef struct { + int64_t sum; + int64_t sse; +} FRAME_DIFF; + +/*!\endcond */ + +/*! + * \brief Parameters related to temporal filtering. + */ +typedef struct { + /*! + * Frame buffers used for temporal filtering. + */ + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; + /*! + * Number of frames in the frame buffer. + */ + int num_frames; + + /*! + * Output filtered frame + */ + YV12_BUFFER_CONFIG *output_frame; + + /*! + * Index of the frame to be filtered. + */ + int filter_frame_idx; + /*! + * Whether to accumulate diff for show existing condition check. + */ + int compute_frame_diff; + /*! + * Frame scaling factor. + */ + struct scale_factors sf; + /*! + * Estimated noise levels for each plane in the frame. + */ + double noise_levels[MAX_MB_PLANE]; + /*! + * Number of pixels in the temporal filtering block across all planes. + */ + int num_pels; + /*! + * Number of temporal filtering block rows. + */ + int mb_rows; + /*! + * Number of temporal filtering block columns. + */ + int mb_cols; + /*! + * Whether the frame is high-bitdepth or not. + */ + int is_highbitdepth; + /*! + * Quantization factor used in temporal filtering. + */ + int q_factor; +} TemporalFilterCtx; + +/*! + * buffer count in TEMPORAL_FILTER_INFO + * Currently we only apply filtering on KEY and ARF after + * define_gf_group(). Hence, the count is two. + */ +#define TF_INFO_BUF_COUNT 2 + +/*! + * \brief Temporal filter info for a gop + */ +typedef struct TEMPORAL_FILTER_INFO { + /*! + * A flag indicating whether the temporal filter should be applied. + * This flag stores the result of + * av1_is_temporal_filter_on() + */ + int is_temporal_filter_on; + /*! + * buffers used for temporal filtering in a GOP + * index 0 for key frame and index 1 for ARF + */ + YV12_BUFFER_CONFIG tf_buf[TF_INFO_BUF_COUNT]; + + /*! + * buffers used for temporal filtering for + * INTNL_ARF_UPDATE + * Check av1_gop_is_second_arf() for the + * definition of second_arf in detail + */ + YV12_BUFFER_CONFIG tf_buf_second_arf; + /*! + * Sum and SSE difference of each filtered buffer, used to decide + * whether to show the buffer directly or not. + */ + FRAME_DIFF frame_diff[TF_INFO_BUF_COUNT]; + /*! + * the corresponding gf_index for the buffer. + */ + int tf_buf_gf_index[TF_INFO_BUF_COUNT]; + /*! + * the display_index offset between next show frame and the frames in the GOP + */ + int tf_buf_display_index_offset[TF_INFO_BUF_COUNT]; + /*! + * whether the buf is valid or not. + */ + int tf_buf_valid[TF_INFO_BUF_COUNT]; +} TEMPORAL_FILTER_INFO; + +/*!\brief Check whether we should apply temporal filter at all. + * \param[in] oxcf AV1 encoder config + * + * \return 1: temporal filter is on; 0: temporal filter is off + */ +int av1_is_temporal_filter_on(const struct AV1EncoderConfig *oxcf); + +/*!\brief Allocate buffers for TEMPORAL_FILTER_INFO + * \param[in,out] tf_info Temporal filter info for a gop + * \param[in,out] cpi Top level encoder instance structure + * + * \return True on success, false on memory allocation failure.
+ */ +bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, + const struct AV1_COMP *cpi); + +/*!\brief Free buffers for TEMPORAL_FILTER_INFO + * \param[in,out] tf_info Temporal filter info for a gop + */ +void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info); + +/*!\brief Reset validity of tf_buf in TEMPORAL_FILTER_INFO + * \param[in,out] tf_info Temporal filter info for a gop + */ +void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info); + +/*!\brief Apply temporal filter for key frame and ARF in a gop + * \param[in,out] tf_info Temporal filter info for a gop + * \param[in,out] cpi Top level encoder instance structure + * \param[in] gf_group GF/ARF group data structure + */ +void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, struct AV1_COMP *cpi, + const GF_GROUP *gf_group); + +/*!\brief Get a filtered buffer from TEMPORAL_FILTER_INFO + * \param[in,out] tf_info Temporal filter info for a gop + * \param[in] gf_index gf_index for the target buffer + * \param[out] frame_diff Sum and SSE difference of the target buffer, used + * to decide whether it can be shown directly + */ +YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info, + int gf_index, + FRAME_DIFF *frame_diff); + +/*!\cond */ + +// Data related to temporal filtering. +typedef struct { + // Source vs filtered frame error. + FRAME_DIFF diff; + // Pointer to temporary block info used to store state in temporal filtering + // process. + MB_MODE_INFO *tmp_mbmi; + // Pointer to accumulator buffer used in temporal filtering process. + uint32_t *accum; + // Pointer to count buffer used in temporal filtering process. + uint16_t *count; + // Pointer to predictor used in temporal filtering process. + uint8_t *pred; +} TemporalFilterData; + +// Data related to temporal filter multi-thread synchronization. +typedef struct { +#if CONFIG_MULTITHREAD + // Mutex lock used for dispatching jobs. + pthread_mutex_t *mutex_; +#endif // CONFIG_MULTITHREAD + // Next temporal filter block row to be filtered. + int next_tf_row; + // Initialized to false, set to true by the worker thread that encounters an + // error in order to abort the processing of other worker threads. + bool tf_mt_exit; +} AV1TemporalFilterSync; + +// Estimates noise level from a given frame using a single plane (Y, U, or V). +// This is an adaptation of the method in the following paper: +// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise +// estimation using Laplacian operator and adaptive edge detection", +// Proc. 3rd International Symposium on Communications, Control and +// Signal Processing, 2008, St Julians, Malta. +// Inputs: +// frame: Pointer to the frame to estimate noise level from. +// noise_level: Pointer to store the estimated noise. +// plane_from: Index of the starting plane used for noise estimation. +// Commonly, 0 for Y-plane, 1 for U-plane, and 2 for V-plane. +// plane_to: Index of the end plane used for noise estimation. +// bit_depth: Actual bit-depth instead of the encoding bit-depth of the frame. +// edge_thresh: Edge threshold. +void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame, + double *noise_level, int plane_from, int plane_to, + int bit_depth, int edge_thresh); +/*!\endcond */ + +/*!\brief Does temporal filter for a given macroblock row. +* +* \ingroup src_frame_proc +* \param[in] cpi Top level encoder instance structure +* \param[in] td Pointer to thread data +* \param[in] mb_row Macroblock row to be filtered +* +* \remark Nothing will be returned, but the contents of td->diff will be +modified.
+*/ +void av1_tf_do_filtering_row(struct AV1_COMP *cpi, struct ThreadData *td, + int mb_row); + +/*!\brief Performs temporal filtering if needed on a source frame. + * For example, to create a filtered alternate reference frame (ARF). + * + * In this function, the lookahead index is different from the 0-based + * real index. For example, if we want to filter the first frame in the + * pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead + * of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the + * second frame in the pre-fetched buffer. Another example: if we want to filter + * the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16. + * Furthermore, a negative number is used for a key frame in one-pass mode, where + * the key frame is filtered with the frames before it instead of after it. For + * example, -15 means to filter the 17-th frame, which is a key frame in one-pass + * mode. + * + * \ingroup src_frame_proc + * \param[in] cpi Top level encoder instance + * structure + * \param[in] filter_frame_lookahead_idx The index of the + * to-filter frame in the lookahead + * buffer cpi->lookahead. + * \param[in] gf_frame_index Index of GOP + * \param[in,out] frame_diff structure of sse and sum of the + * filtered frame. + * \param[out] output_frame Output filtered frame. + */ +void av1_temporal_filter(struct AV1_COMP *cpi, + const int filter_frame_lookahead_idx, + int gf_frame_index, FRAME_DIFF *frame_diff, + YV12_BUFFER_CONFIG *output_frame); + +/*!\brief Check whether a filtered frame can be shown directly + * + * This function will use the filtered frame's sse and current q index + * to make the decision. + * + * \ingroup src_frame_proc + * \param[in] frame filtered frame's buffer + * \param[in] frame_diff structure of sse and sum of the + * filtered frame. + * \param[in] q_index q_index used for this frame + * \param[in] bit_depth bit depth + * \return return 1 if this frame can be shown directly, otherwise + * return 0 + */ +int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame, + const FRAME_DIFF *frame_diff, int q_index, + aom_bit_depth_t bit_depth); + +/*!\cond */ +// Helper function to get `q` used for encoding. +int av1_get_q(const struct AV1_COMP *cpi); + +// Allocates memory for members of TemporalFilterData. +// Inputs: +// tf_data: Pointer to the structure containing temporal filter related data. +// num_pels: Number of pixels in the block across all planes. +// is_high_bitdepth: Whether the frame is high-bitdepth or not. +// Returns: +// True if allocation is successful and false otherwise. +static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data, + int num_pels, + int is_high_bitdepth) { + tf_data->tmp_mbmi = (MB_MODE_INFO *)aom_calloc(1, sizeof(*tf_data->tmp_mbmi)); + tf_data->accum = + (uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum)); + tf_data->count = + (uint16_t *)aom_memalign(16, num_pels * sizeof(*tf_data->count)); + if (is_high_bitdepth) + tf_data->pred = CONVERT_TO_BYTEPTR( + aom_memalign(32, num_pels * 2 * sizeof(*tf_data->pred))); + else + tf_data->pred = + (uint8_t *)aom_memalign(32, num_pels * sizeof(*tf_data->pred)); + // In case of an allocation failure, other successfully allocated buffers will + // be freed by the tf_dealloc_data() call in encoder_destroy().
+ if (!(tf_data->tmp_mbmi && tf_data->accum && tf_data->count && tf_data->pred)) + return false; + memset(&tf_data->diff, 0, sizeof(tf_data->diff)); + return true; +} + +// Setup macroblockd params for temporal filtering process. +// Inputs: +// mbd: Pointer to the block for filtering. +// tf_data: Pointer to the structure containing temporal filter related data. +// scale: Scaling factor. +// Returns: +// Nothing will be returned. Contents of mbd will be modified. +static AOM_INLINE void tf_setup_macroblockd(MACROBLOCKD *mbd, + TemporalFilterData *tf_data, + const struct scale_factors *scale) { + mbd->block_ref_scale_factors[0] = scale; + mbd->block_ref_scale_factors[1] = scale; + mbd->mi = &tf_data->tmp_mbmi; + mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION; +} + +// Deallocates the memory allocated for members of TemporalFilterData. +// Inputs: +// tf_data: Pointer to the structure containing temporal filter related data. +// is_high_bitdepth: Whether the frame is high-bitdepth or not. +// Returns: +// Nothing will be returned. +static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data, + int is_high_bitdepth) { + if (is_high_bitdepth) + tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred); + aom_free(tf_data->tmp_mbmi); + tf_data->tmp_mbmi = NULL; + aom_free(tf_data->accum); + tf_data->accum = NULL; + aom_free(tf_data->count); + tf_data->count = NULL; + aom_free(tf_data->pred); + tf_data->pred = NULL; +} + +// Saves the state prior to temporal filter process. +// Inputs: +// mbd: Pointer to the block for filtering. +// input_mbmi: Backup block info to save input state. +// input_buffer: Backup buffer pointer to save input state. +// num_planes: Number of planes. +// Returns: +// Nothing will be returned. Contents of input_mbmi and input_buffer will be +// modified. +static INLINE void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi, + uint8_t **input_buffer, int num_planes) { + for (int i = 0; i < num_planes; i++) { + input_buffer[i] = mbd->plane[i].pre[0].buf; + } + *input_mbmi = mbd->mi; +} + +// Restores the initial state after temporal filter process. +// Inputs: +// mbd: Pointer to the block for filtering. +// input_mbmi: Backup block info from where input state is restored. +// input_buffer: Backup buffer pointer from where input state is restored. +// num_planes: Number of planes. +// Returns: +// Nothing will be returned. Contents of mbd will be modified. +static INLINE void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi, + uint8_t **input_buffer, int num_planes) { + for (int i = 0; i < num_planes; i++) { + mbd->plane[i].pre[0].buf = input_buffer[i]; + } + mbd->mi = input_mbmi; +} + +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_ diff --git a/third_party/aom/av1/encoder/thirdpass.c b/third_party/aom/av1/encoder/thirdpass.c new file mode 100644 index 0000000000..a25522fbc5 --- /dev/null +++ b/third_party/aom/av1/encoder/thirdpass.c @@ -0,0 +1,877 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include "av1/encoder/thirdpass.h" + +#if CONFIG_THREE_PASS && CONFIG_AV1_DECODER +#include "aom/aom_codec.h" +#include "aom/aomdx.h" +#include "aom_dsp/psnr.h" +#include "aom_mem/aom_mem.h" +#include "av1/av1_iface_common.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" +#include "av1/common/blockd.h" +#include "common/ivfdec.h" + +static void setup_two_pass_stream_input( + struct AvxInputContext **input_ctx_ptr, const char *input_file_name, + struct aom_internal_error_info *err_info) { + FILE *infile; + infile = fopen(input_file_name, "rb"); + if (!infile) { + aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM, + "Failed to open input file '%s'.", input_file_name); + } + struct AvxInputContext *aom_input_ctx = aom_malloc(sizeof(*aom_input_ctx)); + if (!aom_input_ctx) { + fclose(infile); + aom_internal_error(err_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate memory for third-pass context."); + } + memset(aom_input_ctx, 0, sizeof(*aom_input_ctx)); + aom_input_ctx->filename = input_file_name; + aom_input_ctx->file = infile; + + if (file_is_ivf(aom_input_ctx)) { + aom_input_ctx->file_type = FILE_TYPE_IVF; + } else { + fclose(infile); + aom_free(aom_input_ctx); + aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM, + "Unrecognized input file type."); + } + *input_ctx_ptr = aom_input_ctx; +} + +static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) { + if (!ctx->input_ctx) { + if (ctx->input_file_name == NULL) { + aom_internal_error(ctx->err_info, AOM_CODEC_INVALID_PARAM, + "No third pass input specified."); + } + setup_two_pass_stream_input(&ctx->input_ctx, ctx->input_file_name, + ctx->err_info); + } + + if (!ctx->decoder.iface) { + aom_codec_iface_t *decoder_iface = &aom_codec_av1_inspect_algo; + if (aom_codec_dec_init(&ctx->decoder, decoder_iface, NULL, 0)) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to initialize decoder."); + } + } +} + +// Return 0: success +// 1: cannot read because this is end of file +// -1: failure to read the frame +static int read_frame(THIRD_PASS_DEC_CTX *ctx) { + if (!ctx->input_ctx || !ctx->decoder.iface) { + init_third_pass(ctx); + } + if (!ctx->have_frame) { + if (ivf_read_frame(ctx->input_ctx, &ctx->buf, &ctx->bytes_in_buffer, + &ctx->buffer_size, NULL) != 0) { + if (feof(ctx->input_ctx->file)) { + return 1; + } else { + return -1; + } + } + ctx->frame = ctx->buf; + ctx->end_frame = ctx->frame + ctx->bytes_in_buffer; + ctx->have_frame = 1; + } + + Av1DecodeReturn adr; + if (aom_codec_decode(&ctx->decoder, ctx->frame, + (unsigned int)ctx->bytes_in_buffer, + &adr) != AOM_CODEC_OK) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to decode frame for third pass."); + } + ctx->this_frame_bits = (int)(adr.buf - ctx->frame) << 3; + ctx->frame = adr.buf; + ctx->bytes_in_buffer = ctx->end_frame - ctx->frame; + if (ctx->frame == ctx->end_frame) ctx->have_frame = 0; + return 0; +} + +static void free_frame_info(THIRD_PASS_FRAME_INFO *frame_info) { + if (!frame_info) return; + aom_free(frame_info->mi_info); + frame_info->mi_info = NULL; +} + +// This function gets the information needed from the recently decoded frame, +// via various decoder APIs, and saves the info into ctx->frame_info. 
+// Return 0: success +// 1: cannot read because this is end of file +// -1: failure to read the frame +static int get_frame_info(THIRD_PASS_DEC_CTX *ctx) { + int ret = read_frame(ctx); + if (ret != 0) return ret; + int cur = ctx->frame_info_count; + + ctx->frame_info[cur].actual_bits = ctx->this_frame_bits; + + if (cur >= MAX_THIRD_PASS_BUF) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Third pass frame info ran out of available slots."); + } + aom_codec_frame_flags_t frame_type_flags = 0; + if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS, + &frame_type_flags) != AOM_CODEC_OK) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read frame flags."); + } + if (frame_type_flags & AOM_FRAME_IS_KEY) { + ctx->frame_info[cur].frame_type = KEY_FRAME; + } else if (frame_type_flags & AOM_FRAME_IS_INTRAONLY) { + ctx->frame_info[cur].frame_type = INTRA_ONLY_FRAME; + } else if (frame_type_flags & AOM_FRAME_IS_SWITCH) { + ctx->frame_info[cur].frame_type = S_FRAME; + } else { + ctx->frame_info[cur].frame_type = INTER_FRAME; + } + + // Get frame width and height + int frame_size[2]; + if (aom_codec_control(&ctx->decoder, AV1D_GET_FRAME_SIZE, frame_size) != + AOM_CODEC_OK) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read frame size."); + } + + // Check if we need to re-alloc the mi fields. + const int mi_cols = (frame_size[0] + 3) >> 2; + const int mi_rows = (frame_size[1] + 3) >> 2; + ctx->frame_info[cur].mi_stride = mi_cols; + ctx->frame_info[cur].mi_rows = mi_rows; + ctx->frame_info[cur].mi_cols = mi_cols; + + if (ctx->frame_info[cur].width != frame_size[0] || + ctx->frame_info[cur].height != frame_size[1] || + !ctx->frame_info[cur].mi_info) { + free_frame_info(&ctx->frame_info[cur]); + + ctx->frame_info[cur].mi_info = + aom_malloc(mi_cols * mi_rows * sizeof(*ctx->frame_info[cur].mi_info)); + + if (!ctx->frame_info[cur].mi_info) { + aom_internal_error(ctx->err_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate mi buffer for the third pass."); + } + } + + ctx->frame_info[cur].width = frame_size[0]; + ctx->frame_info[cur].height = frame_size[1]; + + // Get frame base q idx + if (aom_codec_control(&ctx->decoder, AOMD_GET_BASE_Q_IDX, + &ctx->frame_info[cur].base_q_idx) != AOM_CODEC_OK) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read base q index."); + } + + // Get show existing frame flag + if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_EXISTING_FRAME_FLAG, + &ctx->frame_info[cur].is_show_existing_frame) != + AOM_CODEC_OK) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read show existing frame flag."); + } + + // Get show frame flag + if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_FRAME_FLAG, + &ctx->frame_info[cur].is_show_frame) != AOM_CODEC_OK) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read show frame flag."); + } + + // Get order hint + if (aom_codec_control(&ctx->decoder, AOMD_GET_ORDER_HINT, + &ctx->frame_info[cur].order_hint) != AOM_CODEC_OK) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read order hint."); + } + + // Clear MI info + for (int mi_row = 0; mi_row < mi_rows; mi_row++) { + for (int mi_col = 0; mi_col < mi_cols; mi_col++) { + ctx->frame_info[cur].mi_info[mi_row * mi_cols + mi_col].bsize = + BLOCK_INVALID; + } + } + + // Get relevant information regarding each 4x4 MI + MB_MODE_INFO cur_mi_info; + THIRD_PASS_MI_INFO *const this_mi = ctx->frame_info[cur].mi_info; + for (int mi_row = 0; mi_row < mi_rows; mi_row++) { + for (int 
mi_col = 0; mi_col < mi_cols; mi_col++) { + const int offset = mi_row * mi_cols + mi_col; + if (this_mi[offset].bsize != BLOCK_INVALID) { + continue; + } + // Get info of this MI + if (aom_codec_control(&ctx->decoder, AV1D_GET_MI_INFO, mi_row, mi_col, + &cur_mi_info) != AOM_CODEC_OK) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read mi info."); + } + const int blk_mi_rows = mi_size_high[cur_mi_info.bsize]; + const int blk_mi_cols = mi_size_wide[cur_mi_info.bsize]; + + for (int h = 0; h < blk_mi_rows; h++) { + for (int w = 0; w < blk_mi_cols; w++) { + if (h + mi_row >= mi_rows || w + mi_col >= mi_cols) { + continue; + } + const int this_offset = offset + h * mi_cols + w; + this_mi[this_offset].bsize = cur_mi_info.bsize; + this_mi[this_offset].partition = cur_mi_info.partition; + this_mi[this_offset].mi_row_start = mi_row; + this_mi[this_offset].mi_col_start = mi_col; + this_mi[this_offset].mv[0] = cur_mi_info.mv[0]; + this_mi[this_offset].mv[1] = cur_mi_info.mv[1]; + this_mi[this_offset].ref_frame[0] = cur_mi_info.ref_frame[0]; + this_mi[this_offset].ref_frame[1] = cur_mi_info.ref_frame[1]; + this_mi[this_offset].pred_mode = cur_mi_info.mode; + } + } + } + } + + ctx->frame_info_count++; + + return 0; +} + +#define USE_SECOND_PASS_FILE 1 + +#if !USE_SECOND_PASS_FILE +// Parse the frames in the GOP and determine the last frame of the current GOP. +// Decode more frames if necessary. The variable max_num is the maximum static +// GOP length if we detect an IPPP structure, and it is expected that max_num >= +// MAX_GF_INTERVAL. +static void get_current_gop_end(THIRD_PASS_DEC_CTX *ctx, int max_num, + int *last_idx) { + assert(max_num >= MAX_GF_INTERVAL); + *last_idx = 0; + int cur_idx = 0; + int arf_order_hint = -1; + int num_show_frames = 0; + while (num_show_frames < max_num) { + assert(cur_idx < MAX_THIRD_PASS_BUF); + // Read in from bitstream if needed. + if (cur_idx >= ctx->frame_info_count) { + int ret = get_frame_info(ctx); + if (ret == 1) { + // At the end of the file, GOP ends in the prev frame. + if (arf_order_hint >= 0) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to derive GOP length."); + } + *last_idx = cur_idx - 1; + return; + } + if (ret < 0) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read frame for third pass."); + } + } + + // TODO(bohanli): verify that fwd_kf works here. + if (ctx->frame_info[cur_idx].frame_type == KEY_FRAME && + ctx->frame_info[cur_idx].is_show_frame) { + if (cur_idx != 0) { + // If this is a key frame and is not the first kf in this kf group, we + // have reached the next key frame. Stop here. + *last_idx = cur_idx - 1; + return; + } + } else if (!ctx->frame_info[cur_idx].is_show_frame && + arf_order_hint == -1) { + // If this is an arf (the first no show) + if (num_show_frames <= 1) { + // This is an arf and we should end the GOP with its overlay. + arf_order_hint = ctx->frame_info[cur_idx].order_hint; + } else { + // There are multiple show frames before this arf, so we treat the + // frames previous to this arf as a GOP. + *last_idx = cur_idx - 1; + return; + } + } else if (arf_order_hint >= 0 && ctx->frame_info[cur_idx].order_hint == + (unsigned int)arf_order_hint) { + // If this is the overlay/show existing of the arf + assert(ctx->frame_info[cur_idx].is_show_frame); + *last_idx = cur_idx; + return; + } else { + // This frame is part of the GOP.
+ if (ctx->frame_info[cur_idx].is_show_frame) num_show_frames++; + } + cur_idx++; + } + // This is a long IPPP GOP and we will use a length of max_num here. + assert(arf_order_hint < 0); + *last_idx = max_num - 1; + return; +} +#endif + +static AOM_INLINE void read_gop_frames(THIRD_PASS_DEC_CTX *ctx) { + int cur_idx = 0; + while (cur_idx < ctx->gop_info.num_frames) { + assert(cur_idx < MAX_THIRD_PASS_BUF); + // Read in from bitstream if needed. + if (cur_idx >= ctx->frame_info_count) { + int ret = get_frame_info(ctx); + if (ret != 0) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Failed to read frame for third pass."); + } + } + cur_idx++; + } + return; +} + +void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { + // Read in future frames in the current GOP. + read_gop_frames(ctx); + + int gf_len = 0; + // Check the GOP length against the value read from second_pass_file + for (int i = 0; i < ctx->gop_info.num_frames; i++) { + if (ctx->frame_info[i].is_show_frame) gf_len++; + } + + if (gf_len != ctx->gop_info.gf_length) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Mismatch in third pass GOP length!"); + } +} + +void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { + if (ctx->frame_info_count == 0) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "No available frame info for third pass."); + } + ctx->frame_info_count--; + free_frame_info(&ctx->frame_info[0]); + for (int i = 0; i < ctx->frame_info_count; i++) { + ctx->frame_info[i] = ctx->frame_info[i + 1]; + } + ctx->frame_info[ctx->frame_info_count].mi_info = NULL; +} + +void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx, + const char *file) { + av1_free_thirdpass_ctx(*ctx); + CHECK_MEM_ERROR(cm, *ctx, aom_calloc(1, sizeof(**ctx))); + THIRD_PASS_DEC_CTX *ctx_ptr = *ctx; + ctx_ptr->input_file_name = file; + ctx_ptr->prev_gop_end = -1; + ctx_ptr->err_info = cm->error; +} + +void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { + if (ctx == NULL) return; + if (ctx->decoder.iface) { + aom_codec_destroy(&ctx->decoder); + } + if (ctx->input_ctx && ctx->input_ctx->file) fclose(ctx->input_ctx->file); + aom_free(ctx->input_ctx); + if (ctx->buf) free(ctx->buf); + for (int i = 0; i < MAX_THIRD_PASS_BUF; i++) { + free_frame_info(&ctx->frame_info[i]); + } + aom_free(ctx); +} + +void av1_write_second_pass_gop_info(AV1_COMP *cpi) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + + if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) { + // Write the GOP length to a log file. 
+ av1_open_second_pass_log(cpi, 0); + + THIRD_PASS_GOP_INFO gop_info; + + gop_info.num_frames = gf_group->size; + gop_info.use_arf = (gf_group->arf_index >= 0); + gop_info.gf_length = p_rc->baseline_gf_interval; + + size_t count = + fwrite(&gop_info, sizeof(gop_info), 1, cpi->second_pass_log_stream); + if (count < 1) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not write to second pass log file!"); + } + } +} + +void av1_write_second_pass_per_frame_info(AV1_COMP *cpi, int gf_index) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + + if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) { + // write target bitrate + int bits = gf_group->bit_allocation[gf_index]; + size_t count = fwrite(&bits, sizeof(bits), 1, cpi->second_pass_log_stream); + if (count < 1) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not write to second pass log file!"); + } + + // write sse + uint64_t sse = 0; + int pkt_idx = cpi->ppi->output_pkt_list->cnt - 1; + if (pkt_idx >= 0 && + cpi->ppi->output_pkt_list->pkts[pkt_idx].kind == AOM_CODEC_PSNR_PKT) { + sse = cpi->ppi->output_pkt_list->pkts[pkt_idx].data.psnr.sse[0]; +#if CONFIG_INTERNAL_STATS + } else if (cpi->ppi->b_calculate_psnr) { + sse = cpi->ppi->total_sq_error[0]; +#endif + } else { + const YV12_BUFFER_CONFIG *orig = cpi->source; + const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; + PSNR_STATS psnr; +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); +#else + aom_calc_psnr(orig, recon, &psnr); +#endif + sse = psnr.sse[0]; + } + + count = fwrite(&sse, sizeof(sse), 1, cpi->second_pass_log_stream); + if (count < 1) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not write to second pass log file!"); + } + + // write bpm_factor + double factor = cpi->ppi->twopass.bpm_factor; + count = fwrite(&factor, sizeof(factor), 1, cpi->second_pass_log_stream); + if (count < 1) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not write to second pass log file!"); + } + } +} +void av1_open_second_pass_log(AV1_COMP *cpi, int is_read) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + if (oxcf->second_pass_log == NULL) { + aom_internal_error(cpi->common.error, AOM_CODEC_INVALID_PARAM, + "No second pass log file specified for the third pass!"); + } + // Read the GOP length from a file. 
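+  // Note (descriptive): the log stream is opened once and cached on cpi;
+  // it is opened "rb" when the third pass reads the log back and "wb" when
+  // the second pass writes it.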
+ if (!cpi->second_pass_log_stream) { + if (is_read) { + cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "rb"); + } else { + cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "wb"); + } + if (!cpi->second_pass_log_stream) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not open second pass log file!"); + } + } +} + +void av1_close_second_pass_log(AV1_COMP *cpi) { + if (cpi->second_pass_log_stream) { + int ret = fclose(cpi->second_pass_log_stream); + if (ret != 0) { + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, + "Could not close second pass log file!"); + } + cpi->second_pass_log_stream = 0; + } +} + +void av1_read_second_pass_gop_info(FILE *second_pass_log_stream, + THIRD_PASS_GOP_INFO *gop_info, + struct aom_internal_error_info *error) { + size_t count = fread(gop_info, sizeof(*gop_info), 1, second_pass_log_stream); + if (count < 1) { + aom_internal_error(error, AOM_CODEC_ERROR, + "Could not read from second pass log file!"); + } +} + +void av1_read_second_pass_per_frame_info( + FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr, + int frame_info_count, struct aom_internal_error_info *error) { + for (int i = 0; i < frame_info_count; i++) { + // read target bits + int bits = 0; + size_t count = fread(&bits, sizeof(bits), 1, second_pass_log_stream); + if (count < 1) { + aom_internal_error(error, AOM_CODEC_ERROR, + "Could not read from second pass log file!"); + } + frame_info_arr[i].bits_allocated = bits; + + // read distortion + uint64_t sse; + count = fread(&sse, sizeof(sse), 1, second_pass_log_stream); + if (count < 1) { + aom_internal_error(error, AOM_CODEC_ERROR, + "Could not read from second pass log file!"); + } + frame_info_arr[i].sse = sse; + + // read bpm factor + double factor; + count = fread(&factor, sizeof(factor), 1, second_pass_log_stream); + if (count < 1) { + aom_internal_error(error, AOM_CODEC_ERROR, + "Could not read from second pass log file!"); + } + frame_info_arr[i].bpm_factor = factor; + } +} + +int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) { + if (ctx == NULL) return -1; + int use_arf = 0; + for (int i = 0; i < ctx->gop_info.gf_length; i++) { + if (ctx->frame_info[i].order_hint != 0 && + ctx->frame_info[i].is_show_frame == 0) { + use_arf = 1; + } + } + if (use_arf != ctx->gop_info.use_arf) { + aom_internal_error(ctx->err_info, AOM_CODEC_ERROR, + "Mismatch in third pass GOP length!"); + } + return use_arf; +} + +void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight, + int fwidth, double *ratio_h, double *ratio_w) { + assert(ctx); + assert(fidx < ctx->frame_info_count); + const int fheight_second_pass = ctx->frame_info[fidx].height; + const int fwidth_second_pass = ctx->frame_info[fidx].width; + assert(fheight_second_pass <= fheight && fwidth_second_pass <= fwidth); + + *ratio_h = (double)fheight / fheight_second_pass; + *ratio_w = (double)fwidth / fwidth_second_pass; +} + +THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx, + int mi_row, int mi_col, + double ratio_h, double ratio_w) { + assert(ctx); + assert(fidx < ctx->frame_info_count); + + const int mi_rows_second_pass = ctx->frame_info[fidx].mi_rows; + const int mi_cols_second_pass = ctx->frame_info[fidx].mi_cols; + + const int mi_row_second_pass = + clamp((int)round(mi_row / ratio_h), 0, mi_rows_second_pass - 1); + const int mi_col_second_pass = + clamp((int)round(mi_col / ratio_w), 0, mi_cols_second_pass - 1); + + const int mi_stride_second_pass = ctx->frame_info[fidx].mi_stride; + 
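+  // Note (descriptive): mi_info is stored row-major with the second-pass
+  // stride, so the clamped (row, col) pair computed above indexes the array
+  // directly.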
THIRD_PASS_MI_INFO *this_mi = ctx->frame_info[fidx].mi_info + + mi_row_second_pass * mi_stride_second_pass + + mi_col_second_pass; + return this_mi; +} + +void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi, + double ratio_h, double ratio_w, int *mi_row, + int *mi_col) { + *mi_row = (int)round(third_pass_mi->mi_row_start * ratio_h); + *mi_col = (int)round(third_pass_mi->mi_col_start * ratio_w); +} + +int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, double ratio_w, + MV_REFERENCE_FRAME frame) { + assert(this_mi != NULL); + int_mv cur_mv; + cur_mv.as_int = INVALID_MV; + + if (frame < LAST_FRAME || frame > ALTREF_FRAME) return cur_mv; + + for (int r = 0; r < 2; r++) { + if (this_mi->ref_frame[r] == frame) { + cur_mv.as_mv.row = (int16_t)round(this_mi->mv[r].as_mv.row * ratio_h); + cur_mv.as_mv.col = (int16_t)round(this_mi->mv[r].as_mv.col * ratio_w); + } + } + + return cur_mv; +} + +BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, + double ratio_w) { + assert(this_mi != NULL); + BLOCK_SIZE bsize = BLOCK_INVALID; + + const BLOCK_SIZE bsize_second_pass = this_mi->bsize; + assert(bsize_second_pass != BLOCK_INVALID); + + const int w_second_pass = block_size_wide[bsize_second_pass]; + const int h_second_pass = block_size_high[bsize_second_pass]; + + int part_type; + + if (w_second_pass == h_second_pass) { + part_type = PARTITION_NONE; + } else if (w_second_pass / h_second_pass == 2) { + part_type = PARTITION_HORZ; + } else if (w_second_pass / h_second_pass == 4) { + part_type = PARTITION_HORZ_4; + } else if (h_second_pass / w_second_pass == 2) { + part_type = PARTITION_VERT; + } else if (h_second_pass / w_second_pass == 4) { + part_type = PARTITION_VERT_4; + } else { + part_type = PARTITION_INVALID; + } + assert(part_type != PARTITION_INVALID); + + const int w = (int)(round(w_second_pass * ratio_w)); + const int h = (int)(round(h_second_pass * ratio_h)); + + for (int i = 0; i < SQR_BLOCK_SIZES; i++) { + const BLOCK_SIZE this_bsize = subsize_lookup[part_type][i]; + if (this_bsize == BLOCK_INVALID) continue; + + const int this_w = block_size_wide[this_bsize]; + const int this_h = block_size_high[this_bsize]; + + if (this_w >= w && this_h >= h) { + // find the smallest block size that contains the mapped block + bsize = this_bsize; + break; + } + } + if (bsize == BLOCK_INVALID) { + // could not find a proper one, just use the largest then. 
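+    // (BLOCK_128X128 is the largest block size AV1 supports, so it always
+    // covers the scaled width and height.)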
+ bsize = BLOCK_128X128; + } + + return bsize; +} + +PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx, + THIRD_PASS_MI_INFO *this_mi) { + int mi_stride = ctx->frame_info[0].mi_stride; + + int mi_row = this_mi->mi_row_start; + int mi_col = this_mi->mi_col_start; + + THIRD_PASS_MI_INFO *corner_mi = + &ctx->frame_info[0].mi_info[mi_row * mi_stride + mi_col]; + + return corner_mi->partition; +} + +#else // !(CONFIG_THREE_PASS && CONFIG_AV1_DECODER) +void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx, + const char *file) { + (void)ctx; + (void)file; + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "To utilize three-pass encoding, libaom must be built " + "with CONFIG_THREE_PASS=1 & CONFIG_AV1_DECODER=1."); +} + +void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; } + +void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; } + +void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; } + +void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read) { + (void)cpi; + (void)is_read; +} + +void av1_close_second_pass_log(struct AV1_COMP *cpi) { (void)cpi; } + +void av1_write_second_pass_gop_info(struct AV1_COMP *cpi) { (void)cpi; } + +void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index) { + (void)cpi; + (void)gf_index; +} + +void av1_read_second_pass_gop_info(FILE *second_pass_log_stream, + THIRD_PASS_GOP_INFO *gop_info, + struct aom_internal_error_info *error) { + (void)second_pass_log_stream; + (void)gop_info; + (void)error; +} + +void av1_read_second_pass_per_frame_info( + FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr, + int frame_info_count, struct aom_internal_error_info *error) { + (void)second_pass_log_stream; + (void)frame_info_arr; + (void)frame_info_count; + (void)error; +} + +int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) { + (void)ctx; + return 1; +} + +void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight, + int fwidth, double *ratio_h, double *ratio_w) { + (void)ctx; + (void)fidx; + (void)fheight; + (void)fwidth; + (void)ratio_h; + (void)ratio_w; +} + +THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx, + int mi_row, int mi_col, + double ratio_h, double ratio_w) { + (void)ctx; + (void)fidx; + (void)mi_row; + (void)mi_col; + (void)ratio_h; + (void)ratio_w; + return NULL; +} + +int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, double ratio_w, + MV_REFERENCE_FRAME frame) { + (void)this_mi; + (void)ratio_h; + (void)ratio_w; + (void)frame; + int_mv mv; + mv.as_int = INVALID_MV; + return mv; +} + +BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, + double ratio_w) { + (void)this_mi; + (void)ratio_h; + (void)ratio_w; + return BLOCK_INVALID; +} + +void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi, + double ratio_h, double ratio_w, int *mi_row, + int *mi_col) { + (void)third_pass_mi; + (void)ratio_h; + (void)ratio_w; + (void)mi_row; + (void)mi_col; +} + +PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx, + THIRD_PASS_MI_INFO *this_mi) { + (void)ctx; + (void)this_mi; + return PARTITION_INVALID; +} +#endif // CONFIG_THREE_PASS && CONFIG_AV1_DECODER + +#if CONFIG_BITRATE_ACCURACY +static void fwrite_and_check(const void *ptr, size_t size, size_t nmemb, + FILE *stream, + struct aom_internal_error_info *error) { + size_t count = fwrite(ptr, size, nmemb, stream); + if (count < nmemb) { + 
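+    // Note (descriptive): fwrite() returns the number of complete items
+    // written, so a short count indicates a failed write and is reported as a
+    // codec error.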
aom_internal_error(error, AOM_CODEC_ERROR, "fwrite_and_check failed\n"); + } +} + +static void fread_and_check(void *ptr, size_t size, size_t nmemb, FILE *stream, + struct aom_internal_error_info *error) { + size_t count = fread(ptr, size, nmemb, stream); + if (count < nmemb) { + aom_internal_error(error, AOM_CODEC_ERROR, "fread_and_check failed\n"); + } +} + +void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group, + const TplParams *tpl_data) { + tpl_info->tpl_ready = tpl_data->ready; + if (tpl_info->tpl_ready) { + tpl_info->gf_length = gf_group->size; + for (int i = 0; i < tpl_info->gf_length; ++i) { + tpl_info->txfm_stats_list[i] = tpl_data->txfm_stats_list[i]; + tpl_info->qstep_ratio_ls[i] = av1_tpl_get_qstep_ratio(tpl_data, i); + tpl_info->update_type_list[i] = gf_group->update_type[i]; + } + } +} + +void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream, + struct aom_internal_error_info *error) { + fwrite_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1, + log_stream, error); + if (tpl_info->tpl_ready) { + fwrite_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1, + log_stream, error); + assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS); + fwrite_and_check(&tpl_info->txfm_stats_list, + sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length, + log_stream, error); + fwrite_and_check(&tpl_info->qstep_ratio_ls, + sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length, + log_stream, error); + fwrite_and_check(&tpl_info->update_type_list, + sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length, + log_stream, error); + } +} + +void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream, + struct aom_internal_error_info *error) { + av1_zero(*tpl_info); + fread_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1, + log_stream, error); + if (tpl_info->tpl_ready) { + fread_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1, + log_stream, error); + assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS); + fread_and_check(&tpl_info->txfm_stats_list, + sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length, + log_stream, error); + fread_and_check(&tpl_info->qstep_ratio_ls, + sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length, + log_stream, error); + fread_and_check(&tpl_info->update_type_list, + sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length, + log_stream, error); + } +} +#endif // CONFIG_BITRATE_ACCURACY diff --git a/third_party/aom/av1/encoder/thirdpass.h b/third_party/aom/av1/encoder/thirdpass.h new file mode 100644 index 0000000000..8080c06cb6 --- /dev/null +++ b/third_party/aom/av1/encoder/thirdpass.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_THIRDPASS_H_ +#define AOM_AV1_ENCODER_THIRDPASS_H_ + +#include "av1/common/enums.h" +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/firstpass.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/tpl_model.h" + +struct AV1_COMP; + +// TODO(bohanli): optimize this number +#define MAX_THIRD_PASS_BUF \ + (AOMMAX((2 * MAX_GF_INTERVAL + 1), MAX_STATIC_GF_GROUP_LENGTH)) + +// Struct to store useful information related to a GOP, in addition to what is +// available in the bitstream +typedef struct { + int gf_length; + int num_frames; + int use_arf; +} THIRD_PASS_GOP_INFO; + +#if CONFIG_BITRATE_ACCURACY +typedef struct TPL_INFO { + int gf_length; + int tpl_ready; + TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS]; + double qstep_ratio_ls[MAX_LENGTH_TPL_FRAME_STATS]; + FRAME_UPDATE_TYPE update_type_list[MAX_LENGTH_TPL_FRAME_STATS]; +} TPL_INFO; +#endif // CONFIG_BITRATE_ACCURACY + +typedef struct { + BLOCK_SIZE bsize; + PARTITION_TYPE partition; + int mi_row_start; + int mi_col_start; + int_mv mv[2]; + MV_REFERENCE_FRAME ref_frame[2]; + PREDICTION_MODE pred_mode; +} THIRD_PASS_MI_INFO; + +// Struct to store useful information about a frame for the third pass. +// The members are extracted from the decoder by function get_frame_info. +typedef struct { + int width; + int height; + int mi_stride; + int mi_rows; + int mi_cols; + int base_q_idx; + int is_show_existing_frame; + int is_show_frame; + int bits_allocated; + int actual_bits; + uint64_t sse; + double bpm_factor; + FRAME_TYPE frame_type; + unsigned int order_hint; + THIRD_PASS_MI_INFO *mi_info; +} THIRD_PASS_FRAME_INFO; + +typedef struct { + /* --- Input and decoding related members --- */ + // the input file + const char *input_file_name; +#if CONFIG_THREE_PASS + // input context + struct AvxInputContext *input_ctx; +#endif + // decoder codec context + aom_codec_ctx_t decoder; + // start of the frame in buf + const unsigned char *frame; + // end of the frame(s) in buf + const unsigned char *end_frame; + // whether we still have following frames in buf + int have_frame; + // pointer to buffer for the read frames + uint8_t *buf; + // size of data in buffer + size_t bytes_in_buffer; + // current buffer size + size_t buffer_size; + // error info pointer + struct aom_internal_error_info *err_info; + + int this_frame_bits; + + /* --- Members for third pass encoding --- */ + // Array to store info about each frame. + // frame_info[0] should point to the current frame. + THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF]; + // number of frames available in frame_info + int frame_info_count; + // the end of the previous GOP (order hint) + int prev_gop_end; + THIRD_PASS_GOP_INFO gop_info; +} THIRD_PASS_DEC_CTX; + +void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx, + const char *file); +void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx); + +// Set the GOP structure from the twopass bitstream. +// TODO(bohanli): this is currently a skeleton and we only return the gop +// length. This function also saves all frame information in the array +// ctx->frame_info for this GOP. +void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx); + +// Pop one frame out of the array ctx->frame_info. This function is used to make +// sure that frame_info[0] always corresponds to the current frame. 
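+// After the pop, the remaining entries shift down one slot and the vacated
+// entry's mi_info pointer is cleared. A typical per-GOP flow (illustrative
+// sketch, not a mandated call sequence) would be: call
+// av1_set_gop_third_pass(ctx) at the start of the GOP, consume
+// ctx->frame_info[0] while encoding each frame, then call
+// av1_pop_third_pass_info(ctx).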
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx); + +void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read); +void av1_close_second_pass_log(struct AV1_COMP *cpi); + +// Write the current GOP information into the second pass log file. +void av1_write_second_pass_gop_info(struct AV1_COMP *cpi); +// Write the information of the frames in this GOP into the second pass log +// file. +void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index); + +// Read the next GOP information from the second pass log file. +void av1_read_second_pass_gop_info(FILE *second_pass_log_stream, + THIRD_PASS_GOP_INFO *gop_info, + struct aom_internal_error_info *error); +// read the information of the frames in next GOP from the second pass log file. +void av1_read_second_pass_per_frame_info(FILE *second_pass_log_stream, + THIRD_PASS_FRAME_INFO *frame_info_arr, + int frame_info_count, + struct aom_internal_error_info *error); + +int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx); + +// Calculate the ratio of third pass frame dimensions over second pass frame +// dimensions. Return them in ratio_h and ratio_w. +void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight, + int fwidth, double *ratio_h, double *ratio_w); + +// Get the pointer to a second pass mi info, where mi_row and mi_col are the mi +// location in the thirdpass frame. +THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx, + int mi_row, int mi_col, + double ratio_h, double ratio_w); + +// Get the adjusted MVs of this_mi, associated with the reference frame. If no +// MV is found with the reference frame, INVALID_MV is returned. +int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, double ratio_w, + MV_REFERENCE_FRAME frame); + +// Get the adjusted block size of this_mi. +BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi, + double ratio_h, double ratio_w); + +// Get the adjusted mi position in the third pass frame, of a given +// third_pass_mi. Location is returned in mi_row and mi_col. +void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi, + double ratio_h, double ratio_w, int *mi_row, + int *mi_col); + +PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx, + THIRD_PASS_MI_INFO *this_mi); + +#if CONFIG_BITRATE_ACCURACY + +void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group, + const TplParams *tpl_data); + +void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream, + struct aom_internal_error_info *error); + +void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream, + struct aom_internal_error_info *error); + +#endif // CONFIG_BITRATE_ACCURACY +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_THIRDPASS_H_ diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c new file mode 100644 index 0000000000..ffac886e32 --- /dev/null +++ b/third_party/aom/av1/encoder/tokenize.c @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "aom_mem/aom_mem.h" + +#include "av1/common/entropy.h" +#include "av1/common/pred_common.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/tokenize.h" + +static AOM_INLINE int av1_fast_palette_color_index_context_on_edge( + const uint8_t *color_map, int stride, int r, int c, int *color_idx) { + const bool has_left = (c - 1 >= 0); + const bool has_above = (r - 1 >= 0); + assert(r > 0 || c > 0); + assert(has_above ^ has_left); + assert(color_idx); + (void)has_left; + + const uint8_t color_neighbor = has_above + ? color_map[(r - 1) * stride + (c - 0)] + : color_map[(r - 0) * stride + (c - 1)]; + // If the neighbor color has higher index than current color index, then we + // move up by 1. + const uint8_t current_color = *color_idx = color_map[r * stride + c]; + if (color_neighbor > current_color) { + (*color_idx)++; + } else if (color_neighbor == current_color) { + *color_idx = 0; + } + + // Get hash value of context. + // The non-diagonal neighbors get a weight of 2. + const uint8_t color_score = 2; + const uint8_t hash_multiplier = 1; + const uint8_t color_index_ctx_hash = color_score * hash_multiplier; + + // Lookup context from hash. + const int color_index_ctx = + av1_palette_color_index_context_lookup[color_index_ctx_hash]; + assert(color_index_ctx == 0); + (void)color_index_ctx; + return 0; +} + +#define SWAP(i, j) \ + do { \ + const uint8_t tmp_score = score_rank[i]; \ + const uint8_t tmp_color = color_rank[i]; \ + score_rank[i] = score_rank[j]; \ + color_rank[i] = color_rank[j]; \ + score_rank[j] = tmp_score; \ + color_rank[j] = tmp_color; \ + } while (0) +#define INVALID_COLOR_IDX (UINT8_MAX) + +// A faster version of av1_get_palette_color_index_context used by the encoder +// exploiting the fact that the encoder does not need to maintain a color order. +static AOM_INLINE int av1_fast_palette_color_index_context( + const uint8_t *color_map, int stride, int r, int c, int *color_idx) { + assert(r > 0 || c > 0); + + const bool has_above = (r - 1 >= 0); + const bool has_left = (c - 1 >= 0); + assert(has_above || has_left); + if (has_above ^ has_left) { + return av1_fast_palette_color_index_context_on_edge(color_map, stride, r, c, + color_idx); + } + + // This goes in the order of left, top, and top-left. This has the advantage + // that unless anything here are not distinct or invalid, this will already + // be in sorted order. Furthermore, if either of the first two is + // invalid, we know the last one is also invalid. + uint8_t color_neighbors[NUM_PALETTE_NEIGHBORS]; + color_neighbors[0] = color_map[(r - 0) * stride + (c - 1)]; + color_neighbors[1] = color_map[(r - 1) * stride + (c - 0)]; + color_neighbors[2] = color_map[(r - 1) * stride + (c - 1)]; + + // Aggregate duplicated values. 
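+  // Worked example: if all three neighbors share one color, the left slot
+  // absorbs the full weight (2 + 2 + 1 = 5) and both other slots are
+  // invalidated; if only left == top, the left slot gets 2 + 2 = 4 and the
+  // top-left neighbor keeps its weight of 1.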
+ // Since our array is so small, using a couple if statements is faster + uint8_t scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 }; + uint8_t num_invalid_colors = 0; + if (color_neighbors[0] == color_neighbors[1]) { + scores[0] += scores[1]; + color_neighbors[1] = INVALID_COLOR_IDX; + num_invalid_colors += 1; + + if (color_neighbors[0] == color_neighbors[2]) { + scores[0] += scores[2]; + num_invalid_colors += 1; + } + } else if (color_neighbors[0] == color_neighbors[2]) { + scores[0] += scores[2]; + num_invalid_colors += 1; + } else if (color_neighbors[1] == color_neighbors[2]) { + scores[1] += scores[2]; + num_invalid_colors += 1; + } + + const uint8_t num_valid_colors = NUM_PALETTE_NEIGHBORS - num_invalid_colors; + + uint8_t *color_rank = color_neighbors; + uint8_t *score_rank = scores; + + // Sort everything + if (num_valid_colors > 1) { + if (color_neighbors[1] == INVALID_COLOR_IDX) { + scores[1] = scores[2]; + color_neighbors[1] = color_neighbors[2]; + } + + // We need to swap the first two elements if they have the same score but + // the color indices are not in the right order + if (score_rank[0] < score_rank[1] || + (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) { + SWAP(0, 1); + } + if (num_valid_colors > 2) { + if (score_rank[0] < score_rank[2]) { + SWAP(0, 2); + } + if (score_rank[1] < score_rank[2]) { + SWAP(1, 2); + } + } + } + + // If any of the neighbor colors has higher index than current color index, + // then we move up by 1 unless the current color is the same as one of the + // neighbors. + const uint8_t current_color = *color_idx = color_map[r * stride + c]; + for (int idx = 0; idx < num_valid_colors; idx++) { + if (color_rank[idx] > current_color) { + (*color_idx)++; + } else if (color_rank[idx] == current_color) { + *color_idx = idx; + break; + } + } + + // Get hash value of context. + uint8_t color_index_ctx_hash = 0; + static const uint8_t hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; + for (int idx = 0; idx < num_valid_colors; ++idx) { + color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx]; + } + assert(color_index_ctx_hash > 0); + assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); + + // Lookup context from hash. 
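+  // Note (descriptive): the neighbor weights always sum to 5 and the hash
+  // multipliers are {1, 2, 2}, so the reachable hashes here are 5, 6, 7 and 8,
+  // mapping to contexts 4, 3, 2 and 1 respectively -- which is why the
+  // 9 - hash shortcut below matches the table lookup asserted against it.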
+ const int color_index_ctx = 9 - color_index_ctx_hash; + assert(color_index_ctx == + av1_palette_color_index_context_lookup[color_index_ctx_hash]); + assert(color_index_ctx >= 0); + assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); + return color_index_ctx; +} +#undef INVALID_COLOR_IDX +#undef SWAP + +static int cost_and_tokenize_map(Av1ColorMapParam *param, TokenExtra **t, + int plane, int calc_rate, int allow_update_cdf, + FRAME_COUNTS *counts) { + const uint8_t *const color_map = param->color_map; + MapCdf map_cdf = param->map_cdf; + ColorCost color_cost = param->color_cost; + const int plane_block_width = param->plane_width; + const int rows = param->rows; + const int cols = param->cols; + const int n = param->n_colors; + const int palette_size_idx = n - PALETTE_MIN_SIZE; + int this_rate = 0; + + (void)plane; + (void)counts; + + for (int k = 1; k < rows + cols - 1; ++k) { + for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) { + int i = k - j; + int color_new_idx; + const int color_ctx = av1_fast_palette_color_index_context( + color_map, plane_block_width, i, j, &color_new_idx); + assert(color_new_idx >= 0 && color_new_idx < n); + if (calc_rate) { + this_rate += color_cost[palette_size_idx][color_ctx][color_new_idx]; + } else { + (*t)->token = color_new_idx; + (*t)->color_ctx = color_ctx; + ++(*t); + if (allow_update_cdf) + update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n); +#if CONFIG_ENTROPY_STATS + if (plane) { + ++counts->palette_uv_color_index[palette_size_idx][color_ctx] + [color_new_idx]; + } else { + ++counts->palette_y_color_index[palette_size_idx][color_ctx] + [color_new_idx]; + } +#endif + } + } + } + if (calc_rate) return this_rate; + return 0; +} + +static void get_palette_params(const MACROBLOCK *const x, int plane, + BLOCK_SIZE bsize, Av1ColorMapParam *params) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + params->color_map = xd->plane[plane].color_index_map; + params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf + : xd->tile_ctx->palette_y_color_index_cdf; + params->color_cost = plane ? 
x->mode_costs.palette_uv_color_cost + : x->mode_costs.palette_y_color_cost; + params->n_colors = pmi->palette_size[plane]; + av1_get_block_dimensions(bsize, plane, xd, ¶ms->plane_width, NULL, + ¶ms->rows, ¶ms->cols); +} + +// TODO(any): Remove this function +static void get_color_map_params(const MACROBLOCK *const x, int plane, + BLOCK_SIZE bsize, TX_SIZE tx_size, + COLOR_MAP_TYPE type, + Av1ColorMapParam *params) { + (void)tx_size; + memset(params, 0, sizeof(*params)); + switch (type) { + case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break; + default: assert(0 && "Invalid color map type"); return; + } +} + +int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, COLOR_MAP_TYPE type) { + assert(plane == 0 || plane == 1); + Av1ColorMapParam color_map_params; + get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); + return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL); +} + +void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, + TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size, + COLOR_MAP_TYPE type, int allow_update_cdf, + FRAME_COUNTS *counts) { + assert(plane == 0 || plane == 1); + Av1ColorMapParam color_map_params; + get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params); + // The first color index does not use context or entropy. + (*t)->token = color_map_params.color_map[0]; + (*t)->color_ctx = -1; + ++(*t); + cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf, + counts); +} + +static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, int blk_row, int blk_col, + int block, int plane, void *arg) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + + if (tx_size == plane_tx_size || plane) { + plane_bsize = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + + struct tokenize_b_args *args = arg; + if (args->allow_update_cdf) + av1_update_and_record_txb_context(plane, block, blk_row, blk_col, + plane_bsize, tx_size, arg); + else + av1_record_txb_context(plane, block, blk_row, blk_col, plane_bsize, + tx_size, arg); + + } else { + // Half the block size in transform block unit. 
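+    // e.g. a 32x32 transform recurses into four 16x16 sub-transforms, and the
+    // recursion continues until the transform size actually coded for this
+    // block (plane_tx_size) is reached.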
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsw * bsh; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + + assert(bsw > 0 && bsh > 0); + + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { + const int offsetc = blk_col + col; + + tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane, + arg); + block += step; + } + } + } +} + +void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, + uint8_t allow_update_cdf) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run }; + + if (mbmi->skip_txfm) { + av1_reset_entropy_context(xd, bsize, num_planes); + return; + } + + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; + const int bw = mi_size_wide[txb_size]; + const int bh = mi_size_high[txb_size]; + int block = 0; + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, ss_x, ss_y); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + + mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); + mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); + + for (int idy = 0; idy < mi_height; idy += mu_blocks_high) { + for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) { + const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); + const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); + for (int blk_row = idy; blk_row < unit_height; blk_row += bh) { + for (int blk_col = idx; blk_col < unit_width; blk_col += bw) { + tokenize_vartx(td, max_tx_size, plane_bsize, blk_row, blk_col, + block, plane, &arg); + block += step; + } + } + } + } + } + if (rate) *rate += arg.this_rate; +} diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h new file mode 100644 index 0000000000..f675c489ae --- /dev/null +++ b/third_party/aom/av1/encoder/tokenize.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TOKENIZE_H_ +#define AOM_AV1_ENCODER_TOKENIZE_H_ + +#include "av1/common/entropy.h" +#include "av1/encoder/block.h" +#include "aom_dsp/bitwriter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The token and color_ctx members of the TokenExtra structure are used +// to store the indices of color and color context of each pixel in +// case of palette mode. +// 1) token can take values in the range of [0, 7] as maximum number of possible +// colors is 8 (PALETTE_COLORS). Hence token requires 3 bits (unsigned). +// 2) The reserved field (1-bit) is positioned such that color_ctx occupies the +// most significant bits and token occupies the least significant bits of the +// byte. Thus accesses to token and color_ctx are optimal. If TokenExtra is +// defined as: +// typedef struct { +// int8_t color_ctx : 4; +// uint8_t token : 3; +// } TokenExtra; +// then read of color_ctx requires an extra left shift to facilitate sign +// extension and write of token requires an extra masking. +// 3) color_ctx can take 5 (PALETTE_COLOR_INDEX_CONTEXTS) valid values, i.e., +// from 0 to 4. As per the current implementation it can take values in the +// range of [-1, 4]. Here -1 corresponds to invalid color index context and is +// used for default initialization. Hence color_ctx requires 4 bits (signed). +typedef struct { + uint8_t token : 3; + uint8_t reserved : 1; + int8_t color_ctx : 4; +} TokenExtra; + +typedef struct { + TokenExtra *start; + unsigned int count; +} TokenList; + +typedef struct { + // Number of tile tokens for which memory is allocated. + unsigned int tokens_allocated; + // tile_tok[i][j] is a pointer to the buffer storing palette tokens of the ith + // tile row, jth tile column. + TokenExtra *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS]; + // tplist[i][j][k] holds the start pointer of tile_tok[i][j] and the count of + // palette tokens for the kth superblock row of the ith tile row, jth tile + // column. + TokenList *tplist[MAX_TILE_ROWS][MAX_TILE_COLS]; +} TokenInfo; + +struct AV1_COMP; +struct ThreadData; +struct FRAME_COUNTS; + +enum { + OUTPUT_ENABLED = 0, + DRY_RUN_NORMAL, + DRY_RUN_COSTCOEFFS, +} UENUM1BYTE(RUN_TYPE); + +struct tokenize_b_args { + const struct AV1_COMP *cpi; + struct ThreadData *td; + int this_rate; + uint8_t allow_update_cdf; + RUN_TYPE dry_run; +}; + +// Note in all the tokenize functions rate if non NULL is incremented +// with the coefficient token cost only if dry_run = DRY_RUN_COSTCOEFS, +// otherwise rate is not incremented. +void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate, + uint8_t allow_update_cdf); + +int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, COLOR_MAP_TYPE type); + +void av1_tokenize_color_map(const MACROBLOCK *const x, int plane, + TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size, + COLOR_MAP_TYPE type, int allow_update_cdf, + struct FRAME_COUNTS *counts); + +static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id, + TX_SIZE tx_size) { + const int eob_max = av1_get_max_eob(tx_size); + return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 
0 : eob_max; +} + +// Token buffer is only used for palette tokens. +static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols, + int sb_size_log2, + const int num_planes) { + // Calculate the maximum number of max superblocks in the image. + const int shift = sb_size_log2 - 4; + const int sb_size = 1 << sb_size_log2; + const int sb_size_square = sb_size * sb_size; + const int sb_rows = CEIL_POWER_OF_TWO(mb_rows, shift); + const int sb_cols = CEIL_POWER_OF_TWO(mb_cols, shift); + + // One palette token for each pixel. There can be palettes on two planes. + const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square; + + return sb_rows * sb_cols * sb_palette_toks; +} + +// Allocate memory for token related info. +static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info, + unsigned int tokens_required) { + int sb_rows = + CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + token_info->tokens_allocated = tokens_required; + + CHECK_MEM_ERROR(cm, token_info->tile_tok[0][0], + (TokenExtra *)aom_calloc( + tokens_required, sizeof(*token_info->tile_tok[0][0]))); + + CHECK_MEM_ERROR( + cm, token_info->tplist[0][0], + (TokenList *)aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS, + sizeof(*token_info->tplist[0][0]))); +} + +// Check if memory allocation has been done for token related info. +static AOM_INLINE bool is_token_info_allocated(const TokenInfo *token_info) { + return ((token_info->tile_tok[0][0] != NULL) && + (token_info->tplist[0][0] != NULL)); +} + +// Free memory from token related variables. +static AOM_INLINE void free_token_info(TokenInfo *token_info) { + aom_free(token_info->tile_tok[0][0]); + token_info->tile_tok[0][0] = NULL; + + aom_free(token_info->tplist[0][0]); + token_info->tplist[0][0] = NULL; + + token_info->tokens_allocated = 0; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TOKENIZE_H_ diff --git a/third_party/aom/av1/encoder/tpl_model.c b/third_party/aom/av1/encoder/tpl_model.c new file mode 100644 index 0000000000..ca60e4981e --- /dev/null +++ b/third_party/aom/av1/encoder/tpl_model.c @@ -0,0 +1,2511 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "av1/encoder/thirdpass.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/enums.h" +#include "av1/common/idct.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tpl_model.h" + +static INLINE double exp_bounded(double v) { + // When v > 700 or <-700, the exp function will be close to overflow + // For details, see the "Notes" in the following link. + // https://en.cppreference.com/w/c/numeric/math/exp + if (v > 700) { + return DBL_MAX; + } else if (v < -700) { + return 0; + } + return exp(v); +} + +void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) { + tpl_txfm_stats->ready = 0; + tpl_txfm_stats->coeff_num = 256; + tpl_txfm_stats->txfm_block_count = 0; + memset(tpl_txfm_stats->abs_coeff_sum, 0, + sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num); + memset(tpl_txfm_stats->abs_coeff_mean, 0, + sizeof(tpl_txfm_stats->abs_coeff_mean[0]) * tpl_txfm_stats->coeff_num); +} + +#if CONFIG_BITRATE_ACCURACY +void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats, + TplTxfmStats *accumulated_stats) { + accumulated_stats->txfm_block_count += sub_stats->txfm_block_count; + for (int i = 0; i < accumulated_stats->coeff_num; ++i) { + accumulated_stats->abs_coeff_sum[i] += sub_stats->abs_coeff_sum[i]; + } +} + +void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats, + const tran_low_t *coeff) { + // For transform larger than 16x16, the scale of coeff need to be adjusted. + // It's not LOSSLESS_Q_STEP. + assert(tpl_txfm_stats->coeff_num <= 256); + for (int i = 0; i < tpl_txfm_stats->coeff_num; ++i) { + tpl_txfm_stats->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP; + } + ++tpl_txfm_stats->txfm_block_count; +} + +void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats) { + if (txfm_stats->txfm_block_count > 0) { + for (int j = 0; j < txfm_stats->coeff_num; j++) { + txfm_stats->abs_coeff_mean[j] = + txfm_stats->abs_coeff_sum[j] / txfm_stats->txfm_block_count; + } + txfm_stats->ready = 1; + } else { + txfm_stats->ready = 0; + } +} + +static AOM_INLINE void av1_tpl_store_txfm_stats( + TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats, + const int frame_index) { + tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats; +} +#endif // CONFIG_BITRATE_ACCURACY + +static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane, + const tran_low_t *coeff, + tran_low_t *qcoeff, + tran_low_t *dqcoeff, TX_SIZE tx_size, + uint16_t *eob, int64_t *recon_error, + int64_t *sse) { + const struct macroblock_plane *const p = &x->plane[plane]; + const MACROBLOCKD *xd = &x->e_mbd; + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 
0 : 2; + + QUANT_PARAM quant_param; + av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param); + +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, + scan_order, &quant_param); + *recon_error = + av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift; + } else { + av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order, + &quant_param); + *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + } +#else + (void)xd; + av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order, + &quant_param); + *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift; +#endif // CONFIG_AV1_HIGHBITDEPTH + + *recon_error = AOMMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = AOMMAX(*sse, 1); +} + +static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2, + uint8_t *tpl_bsize_1d) { + // tpl stats bsize: 2 means 16x16 + *block_mis_log2 = 2; + // Block size used in tpl motion estimation + *tpl_bsize_1d = 16; + // MIN_TPL_BSIZE_1D = 16; + assert(*tpl_bsize_1d >= 16); +} + +void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi, + CommonModeInfoParams *const mi_params, int width, + int height, int byte_alignment, int lag_in_frames) { + SequenceHeader *const seq_params = &ppi->seq_params; + TplParams *const tpl_data = &ppi->tpl_data; + set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2, + &tpl_data->tpl_bsize_1d); + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + tpl_data->border_in_pixels = + ALIGN_POWER_OF_TWO(tpl_data->tpl_bsize_1d + 2 * AOM_INTERP_EXTEND, 5); + + const int alloc_y_plane_only = + ppi->cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : 0; + for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) { + const int mi_cols = + ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2); + const int mi_rows = + ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2); + TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame]; + tpl_frame->is_valid = 0; + tpl_frame->width = mi_cols >> block_mis_log2; + tpl_frame->height = mi_rows >> block_mis_log2; + tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width; + tpl_frame->mi_rows = mi_params->mi_rows; + tpl_frame->mi_cols = mi_params->mi_cols; + } + tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1]; + + // If lag_in_frames <= 1, TPL module is not invoked. Hence dynamic memory + // allocations are avoided for buffers in tpl_data. 
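+  // Note (descriptive): otherwise the transform-stats list is allocated once,
+  // and a TPL stats buffer plus a reconstruction frame buffer are allocated
+  // for each of the lag_in_frames lookahead frames below.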
+ if (lag_in_frames <= 1) return; + + AOM_CHECK_MEM_ERROR(&ppi->error, tpl_data->txfm_stats_list, + aom_calloc(MAX_LENGTH_TPL_FRAME_STATS, + sizeof(*tpl_data->txfm_stats_list))); + + for (int frame = 0; frame < lag_in_frames; ++frame) { + AOM_CHECK_MEM_ERROR( + &ppi->error, tpl_data->tpl_stats_pool[frame], + aom_calloc(tpl_data->tpl_stats_buffer[frame].width * + tpl_data->tpl_stats_buffer[frame].height, + sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr))); + + if (aom_alloc_frame_buffer( + &tpl_data->tpl_rec_pool[frame], width, height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, tpl_data->border_in_pixels, + byte_alignment, 0, alloc_y_plane_only)) + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } +} + +static AOM_INLINE int32_t tpl_get_satd_cost(BitDepthInfo bd_info, + int16_t *src_diff, int diff_stride, + const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + tran_low_t *coeff, int bw, int bh, + TX_SIZE tx_size) { + const int pix_num = bw * bh; + + av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, + dst, dst_stride); + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); + return aom_satd(coeff, pix_num); +} + +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; + + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + int rate_cost = 1; + + for (int idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << AV1_PROB_COST_SHIFT); +} + +static AOM_INLINE void txfm_quant_rdcost( + const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size, + int do_recon, int *rate_cost, int64_t *recon_error, int64_t *sse) { + const MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + uint16_t eob; + av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, + dst, dst_stride); + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); + + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error, + sse); + + *rate_cost = rate_estimator(qcoeff, eob, tx_size); + + if (do_recon) + av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst, + dst_stride, eob, 0); +} + +static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + int ref_stride, int width, int ref_width, + BLOCK_SIZE bsize, MV center_mv, + int_mv *best_mv) { + AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; + int step_param; + uint32_t bestsme = UINT_MAX; + FULLPEL_MV_STATS best_mv_stats; + int distortion; + uint32_t sse; + int cost_list[5]; + FULLPEL_MV start_mv = get_fullmv_from_mv(¢er_mv); + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + x->plane[0].src.width = width; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = ref_stride; + xd->plane[0].pre[0].width = ref_width; + + step_param = tpl_sf->reduce_first_step_size; + step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + 
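+  // Note (descriptive): choose the search-site configuration whose stride
+  // matches the reference buffer -- try the source config first and fall back
+  // to the lookahead config; the stride must match, as asserted below.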
const search_site_config *search_site_cfg = + cpi->mv_search_params.search_site_cfg[SS_CFG_SRC]; + if (search_site_cfg->stride != ref_stride) + search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD]; + assert(search_site_cfg->stride == ref_stride); + + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, ¢er_mv, + start_mv, search_site_cfg, + tpl_sf->search_method, + /*fine_search_interval=*/0); + + bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv->as_fullmv, &best_mv_stats, NULL); + + // When sub-pel motion search is skipped, populate sub-pel precision MV and + // return. + if (tpl_sf->subpel_force_stop == FULL_PEL) { + best_mv->as_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + return bestsme; + } + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, ¢er_mv, + cost_list); + ms_params.forced_stop = tpl_sf->subpel_force_stop; + ms_params.var_params.subpel_search_type = USE_2_TAPS; + ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE; + best_mv_stats.err_cost = 0; + MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv, + &distortion, &sse, NULL); + + return bestsme; +} + +typedef struct { + int_mv mv; + int sad; +} center_mv_t; + +static int compare_sad(const void *a, const void *b) { + const int diff = ((center_mv_t *)a)->sad - ((center_mv_t *)b)->sad; + if (diff < 0) + return -1; + else if (diff > 0) + return 1; + return 0; +} + +static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs, + int center_mvs_count, int skip_alike_starting_mv) { + // MV difference threshold is in 1/8 precision. + const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) }; + int thr = mv_diff_thr[skip_alike_starting_mv]; + int i; + + for (i = 0; i < center_mvs_count; i++) { + if (abs(center_mvs[i].mv.as_mv.col - candidate_mv.as_mv.col) < thr && + abs(center_mvs[i].mv.as_mv.row - candidate_mv.as_mv.row) < thr) + return 1; + } + + return 0; +} + +static void get_rate_distortion( + int *rate_cost, int64_t *recon_error, int64_t *pred_error, + int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff, + tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x, + const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3], + const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode, + int mi_row, int mi_col, int use_y_only_rate_distortion, int do_recon, + TplTxfmStats *tpl_txfm_stats) { + const SequenceHeader *seq_params = cm->seq_params; + *rate_cost = 0; + *recon_error = 1; + *pred_error = 1; + + (void)tpl_txfm_stats; + + MACROBLOCKD *xd = &x->e_mbd; + int is_compound = (best_mode == NEW_NEWMV); + int num_planes = use_y_only_rate_distortion ? 
1 : MAX_MB_PLANE; + + uint8_t *src_buffer_pool[MAX_MB_PLANE] = { + xd->cur_buf->y_buffer, + xd->cur_buf->u_buffer, + xd->cur_buf->v_buffer, + }; + const int src_stride_pool[MAX_MB_PLANE] = { + xd->cur_buf->y_stride, + xd->cur_buf->uv_stride, + xd->cur_buf->uv_stride, + }; + + const int_interpfilters kernel = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + for (int plane = 0; plane < num_planes; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + BLOCK_SIZE bsize_plane = + av1_ss_size_lookup[txsize_to_bsize[tx_size]][pd->subsampling_x] + [pd->subsampling_y]; + + int dst_buffer_stride = rec_stride_pool[plane]; + int dst_mb_offset = + ((mi_row * MI_SIZE * dst_buffer_stride) >> pd->subsampling_y) + + ((mi_col * MI_SIZE) >> pd->subsampling_x); + uint8_t *dst_buffer = rec_buffer_pool[plane] + dst_mb_offset; + for (int ref = 0; ref < 1 + is_compound; ++ref) { + if (!is_inter_mode(best_mode)) { + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + block_size_wide[bsize_plane], block_size_high[bsize_plane], + max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0, + FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer, + dst_buffer_stride, 0, 0, plane); + } else { + int_mv best_mv = xd->mi[0]->mv[ref]; + uint8_t *ref_buffer_pool[MAX_MB_PLANE] = { + ref_frame_ptr[ref]->y_buffer, + ref_frame_ptr[ref]->u_buffer, + ref_frame_ptr[ref]->v_buffer, + }; + InterPredParams inter_pred_params; + struct buf_2d ref_buf = { + NULL, ref_buffer_pool[plane], + plane ? ref_frame_ptr[ref]->uv_width : ref_frame_ptr[ref]->y_width, + plane ? ref_frame_ptr[ref]->uv_height : ref_frame_ptr[ref]->y_height, + plane ? ref_frame_ptr[ref]->uv_stride : ref_frame_ptr[ref]->y_stride + }; + av1_init_inter_params(&inter_pred_params, block_size_wide[bsize_plane], + block_size_high[bsize_plane], + (mi_row * MI_SIZE) >> pd->subsampling_y, + (mi_col * MI_SIZE) >> pd->subsampling_x, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[0], &ref_buf, kernel); + if (is_compound) av1_init_comp_mode(&inter_pred_params); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); + + av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride, + &best_mv.as_mv, &inter_pred_params); + } + } + + int src_stride = src_stride_pool[plane]; + int src_mb_offset = ((mi_row * MI_SIZE * src_stride) >> pd->subsampling_y) + + ((mi_col * MI_SIZE) >> pd->subsampling_x); + + int this_rate = 1; + int64_t this_recon_error = 1; + int64_t sse; + txfm_quant_rdcost( + x, src_diff, block_size_wide[bsize_plane], + src_buffer_pool[plane] + src_mb_offset, src_stride, dst_buffer, + dst_buffer_stride, coeff, qcoeff, dqcoeff, block_size_wide[bsize_plane], + block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane], + do_recon, &this_rate, &this_recon_error, &sse); + +#if CONFIG_BITRATE_ACCURACY + if (plane == 0 && tpl_txfm_stats) { + // We only collect Y plane's transform coefficient + av1_record_tpl_txfm_block(tpl_txfm_stats, coeff); + } +#endif // CONFIG_BITRATE_ACCURACY + + *recon_error += this_recon_error; + *pred_error += sse; + *rate_cost += this_rate; + } +} + +static AOM_INLINE int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd, + const uint8_t *src_mb_buffer, + int src_stride, + TplBuffers *tpl_tmp_buffers, + BLOCK_SIZE bsize, TX_SIZE tx_size, + int mi_row, int mi_col, int rf_idx, + MV *rfidx_mv, int use_pred_sad) { + const BitDepthInfo bd_info = 
get_bit_depth_info(xd); + TplParams *tpl_data = &cpi->ppi->tpl_data; + const YV12_BUFFER_CONFIG *const ref_frame_ptr = + tpl_data->src_ref_frame[rf_idx]; + int16_t *src_diff = tpl_tmp_buffers->src_diff; + tran_low_t *coeff = tpl_tmp_buffers->coeff; + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + int32_t inter_cost; + + if (cpi->sf.tpl_sf.subpel_force_stop != FULL_PEL) { + const int_interpfilters kernel = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + uint8_t *predictor8 = tpl_tmp_buffers->predictor8; + uint8_t *predictor = + is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8; + struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer, + ref_frame_ptr->y_width, ref_frame_ptr->y_height, + ref_frame_ptr->y_stride }; + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + &tpl_data->sf, &ref_buf, kernel); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + av1_enc_build_one_inter_predictor(predictor, bw, rfidx_mv, + &inter_pred_params); + + if (use_pred_sad) { + inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(src_mb_buffer, src_stride, + predictor, bw); + } else { + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + } + } else { + int ref_mb_offset = + mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE; + uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset; + int ref_stride = ref_frame_ptr->y_stride; + const FULLPEL_MV fullmv = get_fullmv_from_mv(rfidx_mv); + // Since sub-pel motion search is not performed, use the prediction pixels + // directly from the reference block ref_mb + if (use_pred_sad) { + inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf( + src_mb_buffer, src_stride, + &ref_mb[fullmv.row * ref_stride + fullmv.col], ref_stride); + } else { + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + &ref_mb[fullmv.row * ref_stride + fullmv.col], + ref_stride, coeff, bw, bh, tx_size); + } + } + return inter_cost; +} + +static AOM_INLINE void mode_estimation(AV1_COMP *cpi, + TplTxfmStats *tpl_txfm_stats, + TplBuffers *tpl_tmp_buffers, + MACROBLOCK *x, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + TplDepStats *tpl_stats) { + AV1_COMMON *cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; + + (void)gf_group; + + MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + TplParams *tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + + int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index; + + int32_t best_intra_cost = INT32_MAX; + int32_t intra_cost; + PREDICTION_MODE best_mode = DC_PRED; + + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset; + const int src_stride = xd->cur_buf->y_stride; + const int src_width = xd->cur_buf->y_width; + + int dst_mb_offset = + mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE; + uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset; + int dst_buffer_stride = 
tpl_frame->rec_picture->y_stride; + int use_y_only_rate_distortion = tpl_sf->use_y_only_rate_distortion; + + uint8_t *rec_buffer_pool[3] = { + tpl_frame->rec_picture->y_buffer, + tpl_frame->rec_picture->u_buffer, + tpl_frame->rec_picture->v_buffer, + }; + + const int rec_stride_pool[3] = { + tpl_frame->rec_picture->y_stride, + tpl_frame->rec_picture->uv_stride, + tpl_frame->rec_picture->uv_stride, + }; + + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + pd->subsampling_x = xd->cur_buf->subsampling_x; + pd->subsampling_y = xd->cur_buf->subsampling_y; + } + + uint8_t *predictor8 = tpl_tmp_buffers->predictor8; + int16_t *src_diff = tpl_tmp_buffers->src_diff; + tran_low_t *coeff = tpl_tmp_buffers->coeff; + tran_low_t *qcoeff = tpl_tmp_buffers->qcoeff; + tran_low_t *dqcoeff = tpl_tmp_buffers->dqcoeff; + uint8_t *predictor = + is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8; + int64_t recon_error = 1; + int64_t pred_error = 1; + + memset(tpl_stats, 0, sizeof(*tpl_stats)); + tpl_stats->ref_frame_index[0] = -1; + tpl_stats->ref_frame_index[1] = -1; + + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_params.mi_rows, cm->mi_params.mi_cols); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], + av1_num_planes(cm)); + xd->mi[0]->bsize = bsize; + xd->mi[0]->motion_mode = SIMPLE_TRANSLATION; + + // Intra prediction search + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + // Pre-load the bottom left line. + if (xd->left_available && + mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) { + if (is_cur_buf_hbd(xd)) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer); + for (int i = 0; i < bw; ++i) + dst[(bw + i) * dst_buffer_stride - 1] = + dst[(bw - 1) * dst_buffer_stride - 1]; + } else { + for (int i = 0; i < bw; ++i) + dst_buffer[(bw + i) * dst_buffer_stride - 1] = + dst_buffer[(bw - 1) * dst_buffer_stride - 1]; + } + } + + // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED, + // H_PRED, and V_PRED + const PREDICTION_MODE last_intra_mode = + tpl_sf->prune_intra_modes ? D45_PRED : INTRA_MODE_END; + const SequenceHeader *seq_params = cm->seq_params; + for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode; + ++mode) { + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, + block_size_wide[bsize], block_size_high[bsize], + tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, + dst_buffer_stride, predictor, bw, 0, 0, 0); + + if (tpl_frame->use_pred_sad) { + intra_cost = (int32_t)cpi->ppi->fn_ptr[bsize].sdf( + src_mb_buffer, src_stride, predictor, bw); + } else { + intra_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + } + + if (intra_cost < best_intra_cost) { + best_intra_cost = intra_cost; + best_mode = mode; + } + } + // Calculate SATD of the best intra mode if SAD was used for mode decision + // as best_intra_cost is used in ML model to skip intra mode evaluation. 
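+ // SAD compares raw pixels against the predictor and is the cheaper metric
+ // used to rank modes in the loop above; SATD transforms the residual first
+ // and sums the absolute coefficients, which follows the eventual coding
+ // cost more closely, so the value kept for later decisions is recomputed
+ // with SATD.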
+ if (tpl_frame->use_pred_sad) { + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + block_size_wide[bsize], block_size_high[bsize], tx_size, best_mode, 0, + 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0, + 0, 0); + best_intra_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + } + + int rate_cost = 1; + + if (cpi->use_ducky_encode) { + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, NULL, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 1 /*do_recon*/, NULL); + + tpl_stats->intra_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->intra_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->intra_rate = rate_cost; + } + + if (cpi->third_pass_ctx && + frame_offset < cpi->third_pass_ctx->frame_info_count && + tpl_data->frame_idx < gf_group->size) { + double ratio_h, ratio_w; + av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height, + cm->width, &ratio_h, &ratio_w); + THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( + cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w); + + PREDICTION_MODE third_pass_mode = this_mi->pred_mode; + + if (third_pass_mode >= last_intra_mode && + third_pass_mode < INTRA_MODE_END) { + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + block_size_wide[bsize], block_size_high[bsize], tx_size, + third_pass_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, + dst_buffer_stride, predictor, bw, 0, 0, 0); + + intra_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + + if (intra_cost < best_intra_cost) { + best_intra_cost = intra_cost; + best_mode = third_pass_mode; + } + } + } + + // Motion compensated prediction + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + xd->mi[0]->ref_frame[1] = NONE_FRAME; + xd->mi[0]->compound_idx = 1; + + int best_rf_idx = -1; + int_mv best_mv[2]; + int32_t inter_cost; + int32_t best_inter_cost = INT32_MAX; + int rf_idx; + int_mv single_mv[INTER_REFS_PER_FRAME]; + + best_mv[0].as_int = INVALID_MV; + best_mv[1].as_int = INVALID_MV; + + for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) { + single_mv[rf_idx].as_int = INVALID_MV; + if (tpl_data->ref_frame[rf_idx] == NULL || + tpl_data->src_ref_frame[rf_idx] == NULL) { + tpl_stats->mv[rf_idx].as_int = INVALID_MV; + continue; + } + + const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx]; + const int ref_mb_offset = + mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE; + uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset; + const int ref_stride = ref_frame_ptr->y_stride; + const int ref_width = ref_frame_ptr->y_width; + + int_mv best_rfidx_mv = { 0 }; + uint32_t bestsme = UINT32_MAX; + + center_mv_t center_mvs[4] = { { { 0 }, INT_MAX }, + { { 0 }, INT_MAX }, + { { 0 }, INT_MAX }, + { { 0 }, INT_MAX } }; + int refmv_count = 1; + int idx; + + if (xd->up_available) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + tpl_sf->skip_alike_starting_mv)) { + center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (xd->left_available) { + TplDepStats *ref_tpl_stats = 
&tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + tpl_sf->skip_alike_starting_mv)) { + center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (xd->up_available && mi_col + mi_width < xd->tile.mi_col_end) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row - mi_height, mi_col + mi_width, tpl_frame->stride, + block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + tpl_sf->skip_alike_starting_mv)) { + center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (cpi->third_pass_ctx && + frame_offset < cpi->third_pass_ctx->frame_info_count && + tpl_data->frame_idx < gf_group->size) { + double ratio_h, ratio_w; + av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height, + cm->width, &ratio_h, &ratio_w); + THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( + cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w); + + int_mv tp_mv = av1_get_third_pass_adjusted_mv(this_mi, ratio_h, ratio_w, + rf_idx + LAST_FRAME); + if (tp_mv.as_int != INVALID_MV && + !is_alike_mv(tp_mv, center_mvs + 1, refmv_count - 1, + tpl_sf->skip_alike_starting_mv)) { + center_mvs[0].mv = tp_mv; + } + } + + // Prune starting mvs + if (tpl_sf->prune_starting_mv && refmv_count > 1) { + // Get each center mv's sad. + for (idx = 0; idx < refmv_count; ++idx) { + FULLPEL_MV mv = get_fullmv_from_mv(&center_mvs[idx].mv.as_mv); + clamp_fullmv(&mv, &x->mv_limits); + center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf( + src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col], + ref_stride); + } + + // Rank center_mv using sad. + qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad); + + refmv_count = AOMMIN(4 - tpl_sf->prune_starting_mv, refmv_count); + // Further reduce number of refmv based on sad difference. + if (refmv_count > 1) { + int last_sad = center_mvs[refmv_count - 1].sad; + int second_to_last_sad = center_mvs[refmv_count - 2].sad; + if ((last_sad - second_to_last_sad) * 5 > second_to_last_sad) + refmv_count--; + } + } + + for (idx = 0; idx < refmv_count; ++idx) { + int_mv this_mv; + uint32_t thissme = motion_estimation( + cpi, x, src_mb_buffer, ref_mb, src_stride, ref_stride, src_width, + ref_width, bsize, center_mvs[idx].mv.as_mv, &this_mv); + + if (thissme < bestsme) { + bestsme = thissme; + best_rfidx_mv = this_mv; + } + } + + tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int; + single_mv[rf_idx] = best_rfidx_mv; + + inter_cost = get_inter_cost( + cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size, + mi_row, mi_col, rf_idx, &best_rfidx_mv.as_mv, tpl_frame->use_pred_sad); + // Store inter cost for each ref frame. This is used to prune inter modes. + tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost); + + if (inter_cost < best_inter_cost) { + best_rf_idx = rf_idx; + + best_inter_cost = inter_cost; + best_mv[0].as_int = best_rfidx_mv.as_int; + } + } + // Calculate SATD of the best inter mode if SAD was used for mode decision + // as best_inter_cost is used in ML model to skip intra mode evaluation.
+ if (best_inter_cost < INT32_MAX && tpl_frame->use_pred_sad) { + assert(best_rf_idx != -1); + best_inter_cost = get_inter_cost( + cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size, + mi_row, mi_col, best_rf_idx, &best_mv[0].as_mv, 0 /* use_pred_sad */); + } + + if (best_rf_idx != -1 && best_inter_cost < best_intra_cost) { + best_mode = NEWMV; + xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME; + xd->mi[0]->mv[0].as_int = best_mv[0].as_int; + } + + // Start compound predition search. + int comp_ref_frames[3][2] = { + { 0, 4 }, + { 0, 6 }, + { 3, 6 }, + }; + + int start_rf = 0; + int end_rf = 3; + if (!tpl_sf->allow_compound_pred) end_rf = 0; + if (cpi->third_pass_ctx && + frame_offset < cpi->third_pass_ctx->frame_info_count && + tpl_data->frame_idx < gf_group->size) { + double ratio_h, ratio_w; + av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height, + cm->width, &ratio_h, &ratio_w); + THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi( + cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w); + + if (this_mi->ref_frame[0] >= LAST_FRAME && + this_mi->ref_frame[1] >= LAST_FRAME) { + int found = 0; + for (int i = 0; i < 3; i++) { + if (comp_ref_frames[i][0] + LAST_FRAME == this_mi->ref_frame[0] && + comp_ref_frames[i][1] + LAST_FRAME == this_mi->ref_frame[1]) { + found = 1; + break; + } + } + if (!found || !tpl_sf->allow_compound_pred) { + comp_ref_frames[2][0] = this_mi->ref_frame[0] - LAST_FRAME; + comp_ref_frames[2][1] = this_mi->ref_frame[1] - LAST_FRAME; + if (!tpl_sf->allow_compound_pred) { + start_rf = 2; + end_rf = 3; + } + } + } + } + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + int best_cmp_rf_idx = -1; + const int_interpfilters kernel = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + for (int cmp_rf_idx = start_rf; cmp_rf_idx < end_rf; ++cmp_rf_idx) { + int rf_idx0 = comp_ref_frames[cmp_rf_idx][0]; + int rf_idx1 = comp_ref_frames[cmp_rf_idx][1]; + + if (tpl_data->ref_frame[rf_idx0] == NULL || + tpl_data->src_ref_frame[rf_idx0] == NULL || + tpl_data->ref_frame[rf_idx1] == NULL || + tpl_data->src_ref_frame[rf_idx1] == NULL) { + continue; + } + + const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = { + tpl_data->src_ref_frame[rf_idx0], + tpl_data->src_ref_frame[rf_idx1], + }; + + xd->mi[0]->ref_frame[0] = rf_idx0 + LAST_FRAME; + xd->mi[0]->ref_frame[1] = rf_idx1 + LAST_FRAME; + xd->mi[0]->mode = NEW_NEWMV; + const int8_t ref_frame_type = av1_ref_frame_type(xd->mi[0]->ref_frame); + // Set up ref_mv for av1_joint_motion_search(). 
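+ // The best single-reference MVs found above are used both to fill the ref
+ // MV stack and as the starting points of the pair, so the joint search
+ // below only refines the two MVs of this NEW_NEWMV combination rather than
+ // searching from scratch.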
+ CANDIDATE_MV *this_ref_mv_stack = x->mbmi_ext.ref_mv_stack[ref_frame_type]; + this_ref_mv_stack[xd->mi[0]->ref_mv_idx].this_mv = single_mv[rf_idx0]; + this_ref_mv_stack[xd->mi[0]->ref_mv_idx].comp_mv = single_mv[rf_idx1]; + + struct buf_2d yv12_mb[2][MAX_MB_PLANE]; + for (int i = 0; i < 2; ++i) { + av1_setup_pred_block(xd, yv12_mb[i], ref_frame_ptr[i], + xd->block_ref_scale_factors[i], + xd->block_ref_scale_factors[i], MAX_MB_PLANE); + for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + xd->plane[plane].pre[i] = yv12_mb[i][plane]; + } + } + + int_mv tmp_mv[2] = { single_mv[rf_idx0], single_mv[rf_idx1] }; + int rate_mv; + av1_joint_motion_search(cpi, x, bsize, tmp_mv, NULL, 0, &rate_mv, + !cpi->sf.mv_sf.disable_second_mv, + NUM_JOINT_ME_REFINE_ITER); + + for (int ref = 0; ref < 2; ++ref) { + struct buf_2d ref_buf = { NULL, ref_frame_ptr[ref]->y_buffer, + ref_frame_ptr[ref]->y_width, + ref_frame_ptr[ref]->y_height, + ref_frame_ptr[ref]->y_stride }; + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), + 0, &tpl_data->sf, &ref_buf, kernel); + av1_init_comp_mode(&inter_pred_params); + + inter_pred_params.conv_params = get_conv_params_no_round( + ref, 0, xd->tmp_conv_dst, MAX_SB_SIZE, 1, xd->bd); + + av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv, + &inter_pred_params); + } + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + if (inter_cost < best_inter_cost) { + best_cmp_rf_idx = cmp_rf_idx; + best_inter_cost = inter_cost; + best_mv[0] = tmp_mv[0]; + best_mv[1] = tmp_mv[1]; + } + } + + if (best_cmp_rf_idx != -1 && best_inter_cost < best_intra_cost) { + best_mode = NEW_NEWMV; + const int best_rf_idx0 = comp_ref_frames[best_cmp_rf_idx][0]; + const int best_rf_idx1 = comp_ref_frames[best_cmp_rf_idx][1]; + xd->mi[0]->ref_frame[0] = best_rf_idx0 + LAST_FRAME; + xd->mi[0]->ref_frame[1] = best_rf_idx1 + LAST_FRAME; + } + + if (best_inter_cost < INT32_MAX && is_inter_mode(best_mode)) { + xd->mi[0]->mv[0].as_int = best_mv[0].as_int; + xd->mi[0]->mv[1].as_int = best_mv[1].as_int; + const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = { + best_cmp_rf_idx >= 0 + ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]] + : tpl_data->src_ref_frame[best_rf_idx], + best_cmp_rf_idx >= 0 + ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]] + : NULL, + }; + rate_cost = 1; + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 0 /*do_recon*/, NULL); + tpl_stats->srcrf_rate = rate_cost; + } + + best_intra_cost = AOMMAX(best_intra_cost, 1); + best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = best_inter_cost; + tpl_stats->intra_cost = best_intra_cost; + + tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; + + // Final encode + rate_cost = 0; + const YV12_BUFFER_CONFIG *ref_frame_ptr[2]; + + ref_frame_ptr[0] = + best_mode == NEW_NEWMV + ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]] + : best_rf_idx >= 0 ? tpl_data->ref_frame[best_rf_idx] + : NULL; + ref_frame_ptr[1] = + best_mode == NEW_NEWMV + ? 
tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]] + : NULL; + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 1 /*do_recon*/, + tpl_txfm_stats); + + tpl_stats->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->recrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->recrf_rate = rate_cost; + + if (!is_inter_mode(best_mode)) { + tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->srcrf_rate = rate_cost; + tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; + } + + tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist); + tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate); + + if (best_mode == NEW_NEWMV) { + ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; + ref_frame_ptr[1] = + tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 1 /*do_recon*/, NULL); + tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->cmp_recrf_rate[0] = rate_cost; + + tpl_stats->cmp_recrf_dist[0] = + AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[0]); + tpl_stats->cmp_recrf_rate[0] = + AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[0]); + + tpl_stats->cmp_recrf_dist[0] = + AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[0]); + tpl_stats->cmp_recrf_rate[0] = + AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[0]); + + rate_cost = 0; + ref_frame_ptr[0] = + tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; + ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + rec_stride_pool, tx_size, best_mode, mi_row, mi_col, + use_y_only_rate_distortion, 1 /*do_recon*/, NULL); + tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->cmp_recrf_rate[1] = rate_cost; + + tpl_stats->cmp_recrf_dist[1] = + AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[1]); + tpl_stats->cmp_recrf_rate[1] = + AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[1]); + + tpl_stats->cmp_recrf_dist[1] = + AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[1]); + tpl_stats->cmp_recrf_rate[1] = + AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[1]); + } + + if (best_mode == NEWMV) { + tpl_stats->mv[best_rf_idx] = best_mv[0]; + tpl_stats->ref_frame_index[0] = best_rf_idx; + tpl_stats->ref_frame_index[1] = NONE_FRAME; + } else if (best_mode == NEW_NEWMV) { + tpl_stats->ref_frame_index[0] = comp_ref_frames[best_cmp_rf_idx][0]; + tpl_stats->ref_frame_index[1] = comp_ref_frames[best_cmp_rf_idx][1]; + tpl_stats->mv[tpl_stats->ref_frame_index[0]] = best_mv[0]; + tpl_stats->mv[tpl_stats->ref_frame_index[1]] = best_mv[1]; + } + + for (int idy = 0; idy < mi_height; ++idy) { + for (int idx = 0; idx < mi_width; ++idx) { + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > idx && + (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > idy) { + xd->mi[idx + idy * cm->mi_params.mi_stride] = xd->mi[0]; + } + } + } +} + +static int round_floor(int 
ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width, + int height) { + int min_row = AOMMAX(row_a, row_b); + int max_row = AOMMIN(row_a + height, row_b + height); + int min_col = AOMMAX(col_a, col_b); + int max_col = AOMMIN(col_a + width, col_b + width); + if (min_row < max_row && min_col < max_col) { + return (max_row - min_row) * (max_col - min_col); + } + return 0; +} + +int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) { + return (mi_row >> right_shift) * stride + (mi_col >> right_shift); +} + +int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, + int64_t srcrf_dist, int pix_num) { + double beta = (double)srcrf_dist / recrf_dist; + int64_t rate_cost = delta_rate; + + if (srcrf_dist <= 128) return rate_cost; + + double dr = + (double)(delta_rate >> (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT)) / + pix_num; + + double log_den = log(beta) / log(2.0) + 2.0 * dr; + + if (log_den > log(10.0) / log(2.0)) { + rate_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0); + rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); + return rate_cost; + } + + double num = pow(2.0, log_den); + double den = num * beta + (1 - beta) * beta; + + rate_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0); + + rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); + + return rate_cost; +} + +static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + int frame_idx, int ref) { + TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx]; + TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr; + TplDepFrame *tpl_frame = tpl_data->tpl_frame; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos( + mi_row, mi_col, tpl_frame->stride, block_mis_log2)]; + + int is_compound = tpl_stats_ptr->ref_frame_index[1] >= 0; + + if (tpl_stats_ptr->ref_frame_index[ref] < 0) return; + const int ref_frame_index = tpl_stats_ptr->ref_frame_index[ref]; + TplDepFrame *ref_tpl_frame = + &tpl_frame[tpl_frame[frame_idx].ref_map_index[ref_frame_index]]; + TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr; + + if (tpl_frame[frame_idx].ref_map_index[ref_frame_index] < 0) return; + + const FULLPEL_MV full_mv = + get_fullmv_from_mv(&tpl_stats_ptr->mv[ref_frame_index].as_mv); + const int ref_pos_row = mi_row * MI_SIZE + full_mv.row; + const int ref_pos_col = mi_col * MI_SIZE + full_mv.col; + + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + int64_t srcrf_dist = is_compound ? tpl_stats_ptr->cmp_recrf_dist[!ref] + : tpl_stats_ptr->srcrf_dist; + int64_t srcrf_rate = + is_compound + ? 
(tpl_stats_ptr->cmp_recrf_rate[!ref] << TPL_DEP_COST_SCALE_LOG2) + : (tpl_stats_ptr->srcrf_rate << TPL_DEP_COST_SCALE_LOG2); + + int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - srcrf_dist; + int64_t mc_dep_dist = + (int64_t)(tpl_stats_ptr->mc_dep_dist * + ((double)(tpl_stats_ptr->recrf_dist - srcrf_dist) / + tpl_stats_ptr->recrf_dist)); + int64_t delta_rate = + (tpl_stats_ptr->recrf_rate << TPL_DEP_COST_SCALE_LOG2) - srcrf_rate; + int64_t mc_dep_rate = + av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist, + srcrf_dist, pix_num); + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col, + ref_pos_row, ref_pos_col, bw, bh); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + assert((1 << block_mis_log2) == mi_height); + assert((1 << block_mis_log2) == mi_width); + TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos( + ref_mi_row, ref_mi_col, ref_tpl_frame->stride, block_mis_log2)]; + des_stats->mc_dep_dist += + ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num; + des_stats->mc_dep_rate += + ((delta_rate + mc_dep_rate) * overlap_area) / pix_num; + } + } +} + +static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row, + int mi_col, int frame_idx) { + const BLOCK_SIZE tpl_stats_block_size = + convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2); + tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx, + 0); + tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx, + 1); +} + +static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, + int mi_col, int stride, + const TplDepStats *src_stats, + uint8_t block_mis_log2) { + int index = av1_tpl_ptr_pos(mi_row, mi_col, stride, block_mis_log2); + TplDepStats *tpl_ptr = &tpl_stats_ptr[index]; + *tpl_ptr = *src_stats; + tpl_ptr->intra_cost = AOMMAX(1, tpl_ptr->intra_cost); + tpl_ptr->inter_cost = AOMMAX(1, tpl_ptr->inter_cost); + tpl_ptr->srcrf_dist = AOMMAX(1, tpl_ptr->srcrf_dist); + tpl_ptr->srcrf_sse = AOMMAX(1, tpl_ptr->srcrf_sse); + tpl_ptr->recrf_dist = AOMMAX(1, tpl_ptr->recrf_dist); + tpl_ptr->srcrf_rate = AOMMAX(1, tpl_ptr->srcrf_rate); + tpl_ptr->recrf_rate = AOMMAX(1, tpl_ptr->recrf_rate); + tpl_ptr->cmp_recrf_dist[0] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[0]); + tpl_ptr->cmp_recrf_dist[1] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[1]); + tpl_ptr->cmp_recrf_rate[0] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[0]); + tpl_ptr->cmp_recrf_rate[1] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[1]); +} + +// Reset the ref and source frame pointers of tpl_data. +static AOM_INLINE void tpl_reset_src_ref_frames(TplParams *tpl_data) { + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + tpl_data->ref_frame[i] = NULL; + tpl_data->src_ref_frame[i] = NULL; + } +} + +static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) { + int gop_length = AOMMIN(gf_group->size, MAX_TPL_FRAME_IDX - 1); + return gop_length; +} + +// Initialize the mc_flow parameters used in computing tpl data. 
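+// This covers the per-frame setup: the frame's source and reconstructed
+// reference buffers are selected (with reference pruning), and the scale
+// factors, quantizer and base rdmult used by the block-level mode estimation
+// are derived for the frame at frame_idx.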
+static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, + int pframe_qindex) { + TplParams *const tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture; + const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME]; + uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME]; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf; + int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( + gf_group, cpi->sf.inter_sf.selective_ref_frame, + tpl_sf->prune_ref_frames_in_tpl, frame_idx); + int gop_length = get_gop_length(gf_group); + int ref_frame_flags; + AV1_COMMON *cm = &cpi->common; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; + tpl_data->frame_idx = frame_idx; + tpl_reset_src_ref_frames(tpl_data); + av1_tile_init(&xd->tile, cm, 0, 0); + + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + + // Setup scaling factor + av1_setup_scale_factors_for_frame( + &tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); + + xd->cur_buf = this_frame; + + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + TplDepFrame *tpl_ref_frame = + &tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]]; + tpl_data->ref_frame[idx] = tpl_ref_frame->rec_picture; + tpl_data->src_ref_frame[idx] = tpl_ref_frame->gf_picture; + ref_frame_display_indices[idx] = tpl_ref_frame->frame_display_index; + } + + // Store the reference frames based on priority order + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + ref_frames_ordered[i] = + tpl_data->ref_frame[ref_frame_priority_order[i] - 1]; + } + + // Work out which reference frame slots may be used. + ref_frame_flags = + get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), + ref_frames_ordered, cpi->ext_flags.ref_frame_flags); + + enforce_max_ref_frames(cpi, &ref_frame_flags, ref_frame_display_indices, + tpl_frame->frame_display_index); + + // Prune reference frames + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + if ((ref_frame_flags & (1 << idx)) == 0) { + tpl_data->ref_frame[idx] = NULL; + } + } + + // Skip motion estimation w.r.t. reference frames which are not + // considered in RD search, using "selective_ref_frame" speed feature. + // The reference frame pruning is not enabled for frames beyond the gop + // length, as there are fewer reference frames and the reference frames + // differ from the frames considered during RD search. + if (ref_pruning_enabled && (frame_idx < gop_length)) { + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + const MV_REFERENCE_FRAME refs[2] = { idx + 1, NONE_FRAME }; + if (prune_ref_by_selective_ref_frame(cpi, NULL, refs, + ref_frame_display_indices)) { + tpl_data->ref_frame[idx] = NULL; + } + } + } + + // Make a temporary mbmi for tpl model + MB_MODE_INFO mbmi; + memset(&mbmi, 0, sizeof(mbmi)); + MB_MODE_INFO *mbmi_ptr = &mbmi; + xd->mi = &mbmi_ptr; + + xd->block_ref_scale_factors[0] = &tpl_data->sf; + xd->block_ref_scale_factors[1] = &tpl_data->sf; + + const int base_qindex = + cpi->use_ducky_encode ? gf_group->q_val[frame_idx] : pframe_qindex; + // Get rd multiplier set up. 
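+ // rdmult acts as the Lagrange multiplier that trades rate against
+ // distortion (cost is roughly distortion + rdmult * rate); it grows with
+ // the quantizer step, so a larger base_qindex biases the TPL decisions
+ // toward spending fewer bits.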
+ rdmult = (int)av1_compute_rd_mult( + base_qindex, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); + + if (rdmult < 1) rdmult = 1; + av1_set_error_per_bit(&x->errorperbit, rdmult); + av1_set_sad_per_bit(cpi, &x->sadperbit, base_qindex); + + tpl_frame->is_valid = 1; + + cm->quant_params.base_qindex = base_qindex; + av1_frame_init_quantizer(cpi); + + const BitDepthInfo bd_info = get_bit_depth_info(xd); + const FRAME_UPDATE_TYPE update_type = + gf_group->update_type[cpi->gf_frame_index]; + tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex( + bd_info.bit_depth, update_type, base_qindex) / + 6; + + if (cpi->use_ducky_encode) + tpl_frame->base_rdmult = gf_group->rdmult_val[frame_idx]; + + av1_init_tpl_txfm_stats(tpl_txfm_stats); + + // Initialize x->mbmi_ext when compound predictions are enabled. + if (tpl_sf->allow_compound_pred) av1_zero(x->mbmi_ext); + + // Set the pointer to null since mbmi is only allocated inside this function. + assert(xd->mi == &mbmi_ptr); + xd->mi = NULL; + + // Tpl module is called before the setting of speed features at frame level. + // Thus, turning off this speed feature for key frame is done here and not + // integrated into the speed feature setting itself. + const int layer_depth_th = (tpl_sf->use_sad_for_mode_decision == 1) ? 5 : 0; + tpl_frame->use_pred_sad = + tpl_sf->use_sad_for_mode_decision && + gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && + gf_group->layer_depth[frame_idx] >= layer_depth_th; +} + +// This function stores the motion estimation dependencies of all the blocks in +// a row +void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats, + TplBuffers *tpl_tmp_buffers, MACROBLOCK *x, + int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size) { + AV1_COMMON *const cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_width = mi_size_wide[bsize]; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; + MACROBLOCKD *xd = &x->e_mbd; + + const int tplb_cols_in_tile = + ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]); + const int tplb_row = ROUND_POWER_OF_TWO(mi_row, mi_size_high_log2[bsize]); + assert(mi_size_high[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2)); + assert(mi_size_wide[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2)); + + for (int mi_col = 0, tplb_col_in_tile = 0; mi_col < mi_params->mi_cols; + mi_col += mi_width, tplb_col_in_tile++) { + (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row, + tplb_col_in_tile); + +#if CONFIG_MULTITHREAD + if (mt_info->num_workers > 1) { + pthread_mutex_lock(tpl_row_mt->mutex_); + const bool tpl_mt_exit = tpl_row_mt->tpl_mt_exit; + pthread_mutex_unlock(tpl_row_mt->mutex_); + // Exit in case any worker has encountered an error. 
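+ // The flag is written by other workers under the same mutex, which is why
+ // the read above takes the lock; once an error is seen, this thread stops
+ // processing the row.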
+ if (tpl_mt_exit) return; + } +#endif + + TplDepStats tpl_stats; + + // Motion estimation column boundary + av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width, + tpl_data->border_in_pixels); + xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); + xd->mb_to_right_edge = + GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col); + mode_estimation(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, mi_col, + bsize, tx_size, &tpl_stats); + + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride, + &tpl_stats, tpl_data->tpl_stats_block_mis_log2); + (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row, + tplb_col_in_tile, tplb_cols_in_tile); + } +} + +static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE bsize = + convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = mi_size_high[bsize]; + for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) { + // Motion estimation row boundary + av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, + cpi->ppi->tpl_data.border_in_pixels); + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = + GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); + av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, &td->tpl_tmp_buffers, x, + mi_row, bsize, tx_size); + } +} + +static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows, + int mi_cols) { + if (!frame_idx) { + return; + } + const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d); + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + assert(mi_height == (1 << tpl_data->tpl_stats_block_mis_log2)); + assert(mi_width == (1 << tpl_data->tpl_stats_block_mis_log2)); + + for (int mi_row = 0; mi_row < mi_rows; mi_row += mi_height) { + for (int mi_col = 0; mi_col < mi_cols; mi_col += mi_width) { + tpl_model_update(tpl_data, mi_row, mi_col, frame_idx); + } + } +} + +static AOM_INLINE void init_gop_frames_for_tpl( + AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params, + GF_GROUP *gf_group, int *tpl_group_frames, int *pframe_qindex) { + AV1_COMMON *cm = &cpi->common; + assert(cpi->gf_frame_index == 0); + *pframe_qindex = 0; + + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(cpi, ref_frame_map_pairs); + + int remapped_ref_idx[REF_FRAMES]; + + EncodeFrameParams frame_params = *init_frame_params; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + + int ref_picture_map[REF_FRAMES]; + + for (int i = 0; i < REF_FRAMES; ++i) { + if (frame_params.frame_type == KEY_FRAME) { + tpl_data->tpl_frame[-i - 1].gf_picture = NULL; + tpl_data->tpl_frame[-i - 1].rec_picture = NULL; + tpl_data->tpl_frame[-i - 1].frame_display_index = 0; + } else { + tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf; + tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf; + tpl_data->tpl_frame[-i - 1].frame_display_index = + cm->ref_frame_map[i]->display_order_hint; + } + + ref_picture_map[i] = -i - 1; + } + + *tpl_group_frames = 0; + + int gf_index; + int process_frame_count = 0; + const int gop_length = get_gop_length(gf_group); + + for (gf_index = 0; gf_index < gop_length; 
++gf_index) { + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; + FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index]; + int lookahead_index = + gf_group->cur_frame_idx[gf_index] + gf_group->arf_src_offset[gf_index]; + frame_params.show_frame = frame_update_type != ARF_UPDATE && + frame_update_type != INTNL_ARF_UPDATE; + frame_params.show_existing_frame = + frame_update_type == INTNL_OVERLAY_UPDATE || + frame_update_type == OVERLAY_UPDATE; + frame_params.frame_type = gf_group->frame_type[gf_index]; + + if (frame_update_type == LF_UPDATE) + *pframe_qindex = gf_group->q_val[gf_index]; + + const struct lookahead_entry *buf = av1_lookahead_peek( + cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage); + if (buf == NULL) break; + tpl_frame->gf_picture = &buf->img; + + // Use filtered frame buffer if available. This will make tpl stats more + // precise. + FRAME_DIFF frame_diff; + const YV12_BUFFER_CONFIG *tf_buf = + av1_tf_info_get_filtered_buf(&cpi->ppi->tf_info, gf_index, &frame_diff); + if (tf_buf != NULL) { + tpl_frame->gf_picture = tf_buf; + } + + // 'cm->current_frame.frame_number' is the display number + // of the current frame. + // 'lookahead_index' is frame offset within the gf group. + // 'lookahead_index + cm->current_frame.frame_number' + // is the display index of the frame. + tpl_frame->frame_display_index = + lookahead_index + cm->current_frame.frame_number; + assert(buf->display_idx == + cpi->frame_index_set.show_frame_count + lookahead_index); + + if (frame_update_type != OVERLAY_UPDATE && + frame_update_type != INTNL_OVERLAY_UPDATE) { + tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; + tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; + ++process_frame_count; + } + const int true_disp = (int)(tpl_frame->frame_display_index); + + av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0, + remapped_ref_idx); + + int refresh_mask = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, + gf_index, true_disp, ref_frame_map_pairs); + + // Make the frames marked as is_frame_non_ref to non-reference frames. 
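+ // With refresh_mask cleared, the frame does not overwrite any slot in the
+ // reference map, so no later frame in the group can use it as a reference.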
+ if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0; + + int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); + + if (refresh_frame_map_index < REF_FRAMES && + refresh_frame_map_index != INVALID_IDX) { + ref_frame_map_pairs[refresh_frame_map_index].disp_order = + AOMMAX(0, true_disp); + ref_frame_map_pairs[refresh_frame_map_index].pyr_level = + get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, + cpi->ppi->gf_group.max_layer_depth); + } + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + tpl_frame->ref_map_index[i - LAST_FRAME] = + ref_picture_map[remapped_ref_idx[i - LAST_FRAME]]; + + if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index; + + ++*tpl_group_frames; + } + + const int tpl_extend = cpi->oxcf.gf_cfg.lag_in_frames - MAX_GF_INTERVAL; + int extend_frame_count = 0; + int extend_frame_length = AOMMIN( + tpl_extend, cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval); + + int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] + + gf_group->arf_src_offset[gop_length - 1] + 1; + + for (; + gf_index < MAX_TPL_FRAME_IDX && extend_frame_count < extend_frame_length; + ++gf_index) { + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; + FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE; + frame_params.show_frame = frame_update_type != ARF_UPDATE && + frame_update_type != INTNL_ARF_UPDATE; + frame_params.show_existing_frame = + frame_update_type == INTNL_OVERLAY_UPDATE; + frame_params.frame_type = INTER_FRAME; + + int lookahead_index = frame_display_index; + struct lookahead_entry *buf = av1_lookahead_peek( + cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage); + + if (buf == NULL) break; + + tpl_frame->gf_picture = &buf->img; + tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; + tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; + // 'cm->current_frame.frame_number' is the display number + // of the current frame. + // 'frame_display_index' is frame offset within the gf group. + // 'frame_display_index + cm->current_frame.frame_number' + // is the display index of the frame. + tpl_frame->frame_display_index = + frame_display_index + cm->current_frame.frame_number; + + ++process_frame_count; + + gf_group->update_type[gf_index] = LF_UPDATE; + +#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + if (cpi->oxcf.pass == AOM_RC_SECOND_PASS) { + if (cpi->oxcf.rc_cfg.mode == AOM_Q) { + *pframe_qindex = cpi->oxcf.rc_cfg.cq_level; + } else if (cpi->oxcf.rc_cfg.mode == AOM_VBR) { + // TODO(angiebird): Find a more adaptive method to decide pframe_qindex + // override the pframe_qindex in the second pass when bitrate accuracy + // is on. We found that setting this pframe_qindex make the tpl stats + // more stable. 
+ *pframe_qindex = 128; + } + } +#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS + gf_group->q_val[gf_index] = *pframe_qindex; + const int true_disp = (int)(tpl_frame->frame_display_index); + av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0, + remapped_ref_idx); + int refresh_mask = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, + gf_index, true_disp, ref_frame_map_pairs); + int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); + + if (refresh_frame_map_index < REF_FRAMES && + refresh_frame_map_index != INVALID_IDX) { + ref_frame_map_pairs[refresh_frame_map_index].disp_order = + AOMMAX(0, true_disp); + ref_frame_map_pairs[refresh_frame_map_index].pyr_level = + get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, + cpi->ppi->gf_group.max_layer_depth); + } + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + tpl_frame->ref_map_index[i - LAST_FRAME] = + ref_picture_map[remapped_ref_idx[i - LAST_FRAME]]; + + tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[BWDREF_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[ALTREF2_FRAME - LAST_FRAME] = -1; + + if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index; + + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_display_index; + } +} + +void av1_init_tpl_stats(TplParams *const tpl_data) { + tpl_data->ready = 0; + set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2, + &tpl_data->tpl_bsize_1d); + for (int frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) { + TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx]; + tpl_frame->is_valid = 0; + } + for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) { + TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx]; + if (tpl_data->tpl_stats_pool[frame_idx] == NULL) continue; + memset(tpl_data->tpl_stats_pool[frame_idx], 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + } +} + +int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) { + if (tpl_data->ready == 0) { + return 0; + } + if (gf_frame_index >= MAX_TPL_FRAME_IDX) { + // The sub-GOP length exceeds the TPL buffer capacity. + // Hence the TPL related functions are disabled hereafter. + return 0; + } + return tpl_data->tpl_frame[gf_frame_index].is_valid; +} + +static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) { + switch (gop_eval) { + case 1: + // Allow larger GOP size if the base layer ARF has higher dependency + // factor than the intermediate ARF and both ARFs have reasonably high + // dependency factors. + return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0; + case 2: + if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6) + return 1; // Don't shorten the gf interval + else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4) + return 0; // Shorten the gf interval + else + return 2; // Cannot decide the gf interval, so redo the + // tpl stats calculation. + case 3: return beta[0] > 1.1; + default: return 2; + } +} + +// TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down +// the scope of input arguments. 
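+// Pre-computes a q index for every remaining frame of the GF group (stored
+// in gf_group->q_val via av1_rc_pick_q_and_bounds) so that later TPL
+// processing has per-frame q estimates to work with; the ducky-encode path
+// skips this and uses its own q values.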
+void av1_tpl_preload_rc_estimate(AV1_COMP *cpi, + const EncodeFrameParams *const frame_params) { + AV1_COMMON *cm = &cpi->common; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + int bottom_index, top_index; + if (cpi->use_ducky_encode) return; + + cm->current_frame.frame_type = frame_params->frame_type; + for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; + ++gf_index) { + cm->current_frame.frame_type = gf_group->frame_type[gf_index]; + cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE && + gf_group->update_type[gf_index] != INTNL_ARF_UPDATE; + gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds( + cpi, cm->width, cm->height, gf_index, &bottom_index, &top_index); + } +} + +static AOM_INLINE int skip_tpl_for_frame(const GF_GROUP *gf_group, + int frame_idx, int gop_eval, + int approx_gop_eval, + int reduce_num_frames) { + // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base + // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3, + // tpl stats calculation is limited to ARFs from base layer and (base+1) + // layer. + const int num_arf_layers = (gop_eval == 2) ? 3 : 2; + const int gop_length = get_gop_length(gf_group); + + if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || + gf_group->update_type[frame_idx] == OVERLAY_UPDATE) + return 1; + + // When approx_gop_eval = 1, skip tpl stats calculation for higher layer + // frames and for frames beyond gop length. + if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers || + frame_idx >= gop_length)) + return 1; + + if (reduce_num_frames && gf_group->update_type[frame_idx] == LF_UPDATE && + frame_idx < gop_length) + return 1; + + return 0; +} + +int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, + const EncodeFrameParams *const frame_params) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_tpl_setup_stats_time); +#endif + assert(cpi->gf_frame_index == 0); + AV1_COMMON *cm = &cpi->common; + MultiThreadInfo *const mt_info = &cpi->mt_info; + AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + EncodeFrameParams this_frame_params = *frame_params; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + int approx_gop_eval = (gop_eval > 1); + + if (cpi->superres_mode != AOM_SUPERRES_NONE) { + assert(cpi->superres_mode != AOM_SUPERRES_AUTO); + av1_init_tpl_stats(tpl_data); + return 0; + } + + cm->current_frame.frame_type = frame_params->frame_type; + for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; + ++gf_index) { + cm->current_frame.frame_type = gf_group->frame_type[gf_index]; + av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, + gf_group->update_type[gf_index], + gf_group->refbuf_state[gf_index], 0); + + memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame, + sizeof(cpi->refresh_frame)); + } + + int pframe_qindex; + int tpl_gf_group_frames; + init_gop_frames_for_tpl(cpi, frame_params, gf_group, &tpl_gf_group_frames, + &pframe_qindex); + + cpi->ppi->p_rc.base_layer_qp = pframe_qindex; + + av1_init_tpl_stats(tpl_data); + + TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers; + if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) { + aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR, + "Error allocating tpl data"); + } + + tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy; + tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy; + + av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, 
cm->height, + cm->width, cm->height); + + if (frame_params->frame_type == KEY_FRAME) { + av1_init_mv_probs(cm); + } + av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs); + + const int num_planes = + cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : av1_num_planes(cm); + // As tpl module is called before the setting of speed features at frame + // level, turning off this speed feature for the first GF group of the + // key-frame interval is done here. + int reduce_num_frames = + cpi->sf.tpl_sf.reduce_num_frames && + gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && + gf_group->max_layer_depth > 2; + // TPL processing is skipped for frames of type LF_UPDATE when + // 'reduce_num_frames' is 1, which affects the r0 calcuation. Thus, a factor + // to adjust r0 is used. The value of 1.6 corresponds to using ~60% of the + // frames in the gf group on an average. + tpl_data->r0_adjust_factor = reduce_num_frames ? 1.6 : 1.0; + + // Backward propagation from tpl_group_frames to 1. + for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames; + ++frame_idx) { + if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval, + reduce_num_frames)) + continue; + + init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex); + if (mt_info->num_workers > 1) { + tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read; + tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write; + av1_mc_flow_dispenser_mt(cpi); + } else { + mc_flow_dispenser(cpi); + } +#if CONFIG_BITRATE_ACCURACY + av1_tpl_txfm_stats_update_abs_coeff_mean(&cpi->td.tpl_txfm_stats); + av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx); +#endif // CONFIG_BITRATE_ACCURACY +#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY + if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) { + int frame_coding_idx = + av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, frame_idx); + rc_log_frame_stats(&cpi->rc_log, frame_coding_idx, + &cpi->td.tpl_txfm_stats); + } +#endif // CONFIG_RATECTRL_LOG + + aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture, + num_planes); + } + + for (int frame_idx = tpl_gf_group_frames - 1; + frame_idx >= cpi->gf_frame_index; --frame_idx) { + if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval, + reduce_num_frames)) + continue; + + mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows, + cm->mi_params.mi_cols); + } + + av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, + gf_group->update_type[cpi->gf_frame_index], + gf_group->update_type[cpi->gf_frame_index], 0); + cm->current_frame.frame_type = frame_params->frame_type; + cm->show_frame = frame_params->show_frame; + +#if CONFIG_COLLECT_COMPONENT_TIMING + // Record the time if the function returns. 
+ if (cpi->common.tiles.large_scale || gf_group->max_layer_depth_allowed == 0 || + !gop_eval) + end_timing(cpi, av1_tpl_setup_stats_time); +#endif + + tpl_dealloc_temp_buffers(tpl_tmp_buffers); + + if (!approx_gop_eval) { + tpl_data->ready = 1; + } + if (cpi->common.tiles.large_scale) return 0; + if (gf_group->max_layer_depth_allowed == 0) return 1; + if (!gop_eval) return 0; + assert(gf_group->arf_index >= 0); + + double beta[2] = { 0.0 }; + const int frame_idx_0 = gf_group->arf_index; + const int frame_idx_1 = + AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1); + beta[0] = av1_tpl_get_frame_importance(tpl_data, frame_idx_0); + beta[1] = av1_tpl_get_frame_importance(tpl_data, frame_idx_1); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_tpl_setup_stats_time); +#endif + return eval_gop_length(beta, gop_eval); +} + +void av1_tpl_rdmult_setup(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + const int tpl_idx = cpi->gf_frame_index; + + assert( + IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size)); + + TplParams *const tpl_data = &cpi->ppi->tpl_data; + const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + + if (!tpl_frame->is_valid) return; + + const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + const int block_size = BLOCK_16X16; + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const double c = 1.2; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + + // Loop through each 'block_size' X 'block_size' block. + for (int row = 0; row < num_rows; row++) { + for (int col = 0; col < num_cols; col++) { + double intra_cost = 0.0, mc_dep_cost = 0.0; + // Loop through each mi block. 
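+ // Within the 16x16 area, intra_cost accumulates each block's own coding
+ // distortion while mc_dep_cost also adds the rate/distortion the block is
+ // expected to propagate to later frames; the ratio rk is the intrinsic
+ // share of the total, so areas whose quality propagates further tend to
+ // get a smaller scaling factor (a lower rdmult, i.e. more bits).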
+ for (int mi_row = row * num_mi_h; mi_row < (row + 1) * num_mi_h; + mi_row += step) { + for (int mi_col = col * num_mi_w; mi_col < (col + 1) * num_mi_w; + mi_col += step) { + if (mi_row >= cm->mi_params.mi_rows || mi_col >= mi_cols_sr) continue; + const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + mi_row, mi_col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS); + mc_dep_cost += + (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + } + } + const double rk = intra_cost / mc_dep_cost; + const int index = row * num_cols + col; + cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c; + } + } +} + +void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE sb_size, int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + + if (tpl_idx >= MAX_TPL_FRAME_IDX) return; + TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx]; + if (!tpl_frame->is_valid) return; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; + if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; + + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int sb_mi_width_sr = coded_to_superres_mi( + mi_size_wide[sb_size], cm->superres_scale_denominator); + + const int bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (sb_mi_width_sr + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h; + int row, col; + + double base_block_count = 0.0; + double log_sum = 0.0; + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col_sr / num_mi_h; + col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + log_sum += log(cpi->tpl_rdmult_scaling_factors[index]); + base_block_count += 1.0; + } + } + + const CommonQuantParams *quant_params = &cm->quant_params; + + const int orig_qindex_rdmult = + quant_params->base_qindex + quant_params->y_dc_delta_q; + const int orig_rdmult = av1_compute_rd_mult( + orig_qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); + + const int new_qindex_rdmult = quant_params->base_qindex + + x->rdmult_delta_qindex + + quant_params->y_dc_delta_q; + const int new_rdmult = av1_compute_rd_mult( + new_qindex_rdmult, cm->seq_params->bit_depth, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth, + boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets, + is_stat_consumption_stage(cpi)); + + const double 
scaling_factor = (double)new_rdmult / (double)orig_rdmult; + + double scale_adj = log(scaling_factor) - log_sum / base_block_count; + scale_adj = exp_bounded(scale_adj); + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col_sr / num_mi_h; + col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + cpi->ppi->tpl_sb_rdmult_scaling_factors[index] = + scale_adj * cpi->tpl_rdmult_scaling_factors[index]; + } + } +} + +double av1_exponential_entropy(double q_step, double b) { + b = AOMMAX(b, TPL_EPSILON); + double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); + return -log2(1 - z) - z * log2(z) / (1 - z); +} + +double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) { + // zero bin's size is zero_bin_ratio * q_step + // non-zero bin's size is q_step + b = AOMMAX(b, TPL_EPSILON); + double z = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); + double h = av1_exponential_entropy(q_step, b); + double r = -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1); + return r; +} + +double av1_laplace_estimate_frame_rate(int q_index, int block_count, + const double *abs_coeff_mean, + int coeff_num) { + double zero_bin_ratio = 2; + double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double est_rate = 0; + // dc coeff + est_rate += av1_laplace_entropy(dc_q_step, abs_coeff_mean[0], zero_bin_ratio); + // ac coeff + for (int i = 1; i < coeff_num; ++i) { + est_rate += + av1_laplace_entropy(ac_q_step, abs_coeff_mean[i], zero_bin_ratio); + } + est_rate *= block_count; + return est_rate; +} + +double av1_estimate_coeff_entropy(double q_step, double b, + double zero_bin_ratio, int qcoeff) { + b = AOMMAX(b, TPL_EPSILON); + int abs_qcoeff = abs(qcoeff); + double z0 = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); + if (abs_qcoeff == 0) { + double r = -log2(1 - z0); + return r; + } else { + double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); + double r = 1 - log2(z0) - log2(1 - z) - (abs_qcoeff - 1) * log2(z); + return r; + } +} + +double av1_estimate_txfm_block_entropy(int q_index, + const double *abs_coeff_mean, + int *qcoeff_arr, int coeff_num) { + double zero_bin_ratio = 2; + double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double est_rate = 0; + // dc coeff + est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0], + zero_bin_ratio, qcoeff_arr[0]); + // ac coeff + for (int i = 1; i < coeff_num; ++i) { + est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[i], + zero_bin_ratio, qcoeff_arr[i]); + } + return est_rate; +} + +#if CONFIG_RD_COMMAND +void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command) { + FILE *fptr = fopen(filepath, "r"); + fscanf(fptr, "%d", &rd_command->frame_count); + rd_command->frame_index = 0; + for (int i = 0; i < rd_command->frame_count; ++i) { + int option; + fscanf(fptr, "%d", &option); + rd_command->option_ls[i] = (RD_OPTION)option; + if (option == RD_OPTION_SET_Q) { + fscanf(fptr, "%d", &rd_command->q_index_ls[i]); + } else if (option == RD_OPTION_SET_Q_RDMULT) { + fscanf(fptr, "%d", &rd_command->q_index_ls[i]); + fscanf(fptr, "%d", &rd_command->rdmult_ls[i]); + } + } + fclose(fptr); +} +#endif // CONFIG_RD_COMMAND + +double av1_tpl_get_frame_importance(const TplParams *tpl_data, + int gf_frame_index) { + 
const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_frame_index]; + const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + + const int tpl_stride = tpl_frame->stride; + double intra_cost_base = 0; + double mc_dep_cost_base = 0; + double cbcmp_base = 1; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + + for (int row = 0; row < tpl_frame->mi_rows; row += step) { + for (int col = 0; col < tpl_frame->mi_cols; col += step) { + const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + double cbcmp = (double)this_stats->srcrf_dist; + const int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS); + dist_scaled = AOMMAX(dist_scaled, 1); + intra_cost_base += log(dist_scaled) * cbcmp; + mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp; + cbcmp_base += cbcmp; + } + } + return exp((mc_dep_cost_base - intra_cost_base) / cbcmp_base); +} + +double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index) { + if (!av1_tpl_stats_ready(tpl_data, gf_frame_index)) { + return 1; + } + const double frame_importance = + av1_tpl_get_frame_importance(tpl_data, gf_frame_index); + return sqrt(1 / frame_importance); +} + +int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio, + aom_bit_depth_t bit_depth) { + const double leaf_qstep = av1_dc_quant_QTX(leaf_qindex, 0, bit_depth); + const double target_qstep = leaf_qstep * qstep_ratio; + int qindex = leaf_qindex; + if (qstep_ratio < 1.0) { + for (qindex = leaf_qindex; qindex > 0; --qindex) { + const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (qstep <= target_qstep) break; + } + } else { + for (qindex = leaf_qindex; qindex <= MAXQ; ++qindex) { + const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (qstep >= target_qstep) break; + } + } + return qindex; +} + +int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index, + int leaf_qindex, aom_bit_depth_t bit_depth) { + const double qstep_ratio = av1_tpl_get_qstep_ratio(tpl_data, gf_frame_index); + return av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth); +} + +#if CONFIG_BITRATE_ACCURACY +void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget, + int show_frame_count) { + av1_zero(*vbr_rc_info); + vbr_rc_info->ready = 0; + vbr_rc_info->total_bit_budget = total_bit_budget; + vbr_rc_info->show_frame_count = show_frame_count; + const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.94559, 1, + 0.94559, 1, 1, + 0.94559 }; + + // TODO(angiebird): Based on the previous code, only the scale factor 0.94559 + // will be used in most of the cases with --limi=17. Figure out if the + // following scale factors works better. 
+ // const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.12040, 1, + // 1.10199, 1, 1, + // 0.16393 }; + + const double mv_scale_factors[FRAME_UPDATE_TYPES] = { 3, 3, 3, 3, 3, 3, 3 }; + memcpy(vbr_rc_info->scale_factors, scale_factors, + sizeof(scale_factors[0]) * FRAME_UPDATE_TYPES); + memcpy(vbr_rc_info->mv_scale_factors, mv_scale_factors, + sizeof(mv_scale_factors[0]) * FRAME_UPDATE_TYPES); + + vbr_rc_reset_gop_data(vbr_rc_info); +#if CONFIG_THREE_PASS + // TODO(angiebird): Explain why we use -1 here + vbr_rc_info->cur_gop_idx = -1; + vbr_rc_info->gop_count = 0; + vbr_rc_info->total_frame_count = 0; +#endif // CONFIG_THREE_PASS +} + +#if CONFIG_THREE_PASS +int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info, + int gf_frame_index) { + int gop_idx = vbr_rc_info->cur_gop_idx; + int gop_start_idx = vbr_rc_info->gop_start_idx_list[gop_idx]; + return gop_start_idx + gf_frame_index; +} + +void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info, + const TPL_INFO *tpl_info) { + int gop_start_idx = vbr_rc_info->total_frame_count; + vbr_rc_info->gop_start_idx_list[vbr_rc_info->gop_count] = gop_start_idx; + vbr_rc_info->gop_length_list[vbr_rc_info->gop_count] = tpl_info->gf_length; + assert(gop_start_idx + tpl_info->gf_length <= VBR_RC_INFO_MAX_FRAMES); + for (int i = 0; i < tpl_info->gf_length; ++i) { + vbr_rc_info->txfm_stats_list[gop_start_idx + i] = + tpl_info->txfm_stats_list[i]; + vbr_rc_info->qstep_ratio_list[gop_start_idx + i] = + tpl_info->qstep_ratio_ls[i]; + vbr_rc_info->update_type_list[gop_start_idx + i] = + tpl_info->update_type_list[i]; + } + vbr_rc_info->total_frame_count += tpl_info->gf_length; + vbr_rc_info->gop_count++; +} +#endif // CONFIG_THREE_PASS + +void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info, + int gop_showframe_count) { + vbr_rc_info->gop_showframe_count = gop_showframe_count; + vbr_rc_info->gop_bit_budget = vbr_rc_info->total_bit_budget * + gop_showframe_count / + vbr_rc_info->show_frame_count; +} + +void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count, + const double *qstep_ratio_list, + aom_bit_depth_t bit_depth, + int *q_index_list) { + for (int i = 0; i < frame_count; ++i) { + q_index_list[i] = av1_get_q_index_from_qstep_ratio( + base_q_index, qstep_ratio_list[i], bit_depth); + } +} + +double av1_vbr_rc_info_estimate_gop_bitrate( + int base_q_index, aom_bit_depth_t bit_depth, + const double *update_type_scale_factors, int frame_count, + const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list, + const TplTxfmStats *stats_list, int *q_index_list, + double *estimated_bitrate_byframe) { + av1_vbr_rc_compute_q_indices(base_q_index, frame_count, qstep_ratio_list, + bit_depth, q_index_list); + double estimated_gop_bitrate = 0; + for (int frame_index = 0; frame_index < frame_count; frame_index++) { + const TplTxfmStats *frame_stats = &stats_list[frame_index]; + double frame_bitrate = 0; + if (frame_stats->ready) { + int q_index = q_index_list[frame_index]; + + frame_bitrate = av1_laplace_estimate_frame_rate( + q_index, frame_stats->txfm_block_count, frame_stats->abs_coeff_mean, + frame_stats->coeff_num); + } + FRAME_UPDATE_TYPE update_type = update_type_list[frame_index]; + estimated_gop_bitrate += + frame_bitrate * update_type_scale_factors[update_type]; + if (estimated_bitrate_byframe != NULL) { + estimated_bitrate_byframe[frame_index] = frame_bitrate; + } + } + return estimated_gop_bitrate; +} + +int av1_vbr_rc_info_estimate_base_q( + double bit_budget, aom_bit_depth_t bit_depth, + 
const double *update_type_scale_factors, int frame_count, + const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list, + const TplTxfmStats *stats_list, int *q_index_list, + double *estimated_bitrate_byframe) { + int q_max = 255; // Maximum q value. + int q_min = 0; // Minimum q value. + int q = (q_max + q_min) / 2; + + double q_max_estimate = av1_vbr_rc_info_estimate_gop_bitrate( + q_max, bit_depth, update_type_scale_factors, frame_count, + update_type_list, qstep_ratio_list, stats_list, q_index_list, + estimated_bitrate_byframe); + + double q_min_estimate = av1_vbr_rc_info_estimate_gop_bitrate( + q_min, bit_depth, update_type_scale_factors, frame_count, + update_type_list, qstep_ratio_list, stats_list, q_index_list, + estimated_bitrate_byframe); + while (q_min + 1 < q_max) { + double estimate = av1_vbr_rc_info_estimate_gop_bitrate( + q, bit_depth, update_type_scale_factors, frame_count, update_type_list, + qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe); + if (estimate > bit_budget) { + q_min = q; + q_min_estimate = estimate; + } else { + q_max = q; + q_max_estimate = estimate; + } + q = (q_max + q_min) / 2; + } + // Pick the estimate that lands closest to the budget. + if (fabs(q_max_estimate - bit_budget) < fabs(q_min_estimate - bit_budget)) { + q = q_max; + } else { + q = q_min; + } + // Update q_index_list and vbr_rc_info. + av1_vbr_rc_info_estimate_gop_bitrate( + q, bit_depth, update_type_scale_factors, frame_count, update_type_list, + qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe); + return q; +} +void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info, + const TplParams *tpl_data, + const GF_GROUP *gf_group, + aom_bit_depth_t bit_depth) { + vbr_rc_info->q_index_list_ready = 1; + double gop_bit_budget = vbr_rc_info->gop_bit_budget; + + for (int i = 0; i < gf_group->size; i++) { + vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i); + } + + double mv_bits = 0; + for (int i = 0; i < gf_group->size; i++) { + double frame_mv_bits = 0; + if (av1_tpl_stats_ready(tpl_data, i)) { + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[i]; + frame_mv_bits = av1_tpl_compute_frame_mv_entropy( + tpl_frame, tpl_data->tpl_stats_block_mis_log2); + FRAME_UPDATE_TYPE updae_type = gf_group->update_type[i]; + mv_bits += frame_mv_bits * vbr_rc_info->mv_scale_factors[updae_type]; + } + } + + mv_bits = AOMMIN(mv_bits, 0.6 * gop_bit_budget); + gop_bit_budget -= mv_bits; + + vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q( + gop_bit_budget, bit_depth, vbr_rc_info->scale_factors, gf_group->size, + gf_group->update_type, vbr_rc_info->qstep_ratio_list, + tpl_data->txfm_stats_list, vbr_rc_info->q_index_list, NULL); +} + +#endif // CONFIG_BITRATE_ACCURACY + +// Use upper and left neighbor block as the reference MVs. +// Compute the minimum difference between current MV and reference MV. +int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col, + int step, int tpl_stride, int right_shift) { + const TplDepStats *tpl_stats = + &tpl_frame + ->tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_stride, right_shift)]; + int_mv current_mv = tpl_stats->mv[tpl_stats->ref_frame_index[0]]; + int current_mv_magnitude = + abs(current_mv.as_mv.row) + abs(current_mv.as_mv.col); + + // Retrieve the up and left neighbors. 
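+  // For each neighbor that lies inside the frame, form the difference
+  // between the current MV and the neighbor MV. Whichever candidate has the
+  // smallest L1 magnitude (the up difference, the left difference, or the
+  // unmodified current MV) is returned, which tends to concentrate the values
+  // counted by av1_tpl_compute_frame_mv_entropy() around zero and thus lowers
+  // the estimated MV rate.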
+  int up_error = INT_MAX;
+  int_mv up_mv_diff;
+  if (row - step >= 0) {
+    tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+        row - step, col, tpl_stride, right_shift)];
+    up_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+    up_mv_diff.as_mv.row = current_mv.as_mv.row - up_mv_diff.as_mv.row;
+    up_mv_diff.as_mv.col = current_mv.as_mv.col - up_mv_diff.as_mv.col;
+    up_error = abs(up_mv_diff.as_mv.row) + abs(up_mv_diff.as_mv.col);
+  }
+
+  int left_error = INT_MAX;
+  int_mv left_mv_diff;
+  if (col - step >= 0) {
+    tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+        row, col - step, tpl_stride, right_shift)];
+    left_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+    left_mv_diff.as_mv.row = current_mv.as_mv.row - left_mv_diff.as_mv.row;
+    left_mv_diff.as_mv.col = current_mv.as_mv.col - left_mv_diff.as_mv.col;
+    left_error = abs(left_mv_diff.as_mv.row) + abs(left_mv_diff.as_mv.col);
+  }
+
+  // Return the MV with the minimum distance from current.
+  if (up_error < left_error && up_error < current_mv_magnitude) {
+    return up_mv_diff;
+  } else if (left_error < up_error && left_error < current_mv_magnitude) {
+    return left_mv_diff;
+  }
+  return current_mv;
+}
+
+/* Compute the entropy of motion vectors for a single frame. */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+                                        uint8_t right_shift) {
+  if (!tpl_frame->is_valid) {
+    return 0;
+  }
+
+  int count_row[500] = { 0 };
+  int count_col[500] = { 0 };
+  int n = 0;  // number of MVs to process
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << right_shift;
+
+  for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+      int_mv mv = av1_compute_mv_difference(tpl_frame, row, col, step,
+                                            tpl_stride, right_shift);
+      count_row[clamp(mv.as_mv.row, 0, 499)] += 1;
+      count_col[clamp(mv.as_mv.col, 0, 499)] += 1;
+      n += 1;
+    }
+  }
+
+  // Estimate the bits used using the entropy formula.
+  double rate_row = 0;
+  double rate_col = 0;
+  for (int i = 0; i < 500; i++) {
+    if (count_row[i] != 0) {
+      double p = count_row[i] / (double)n;
+      rate_row += count_row[i] * -log2(p);
+    }
+    if (count_col[i] != 0) {
+      double p = count_col[i] / (double)n;
+      rate_col += count_col[i] * -log2(p);
+    }
+  }
+
+  return rate_row + rate_col;
+}
diff --git a/third_party/aom/av1/encoder/tpl_model.h b/third_party/aom/av1/encoder/tpl_model.h
new file mode 100644
index 0000000000..bcd58216c5
--- /dev/null
+++ b/third_party/aom/av1/encoder/tpl_model.h
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_ +#define AOM_AV1_ENCODER_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\cond */ + +struct AV1_PRIMARY; +struct AV1_COMP; +struct AV1_SEQ_CODING_TOOLS; +struct EncodeFrameParams; +struct EncodeFrameInput; +struct GF_GROUP; +struct ThreadData; +struct TPL_INFO; + +#include "config/aom_config.h" + +#include "aom_scale/yv12config.h" + +#include "av1/common/mv.h" +#include "av1/common/scale.h" +#include "av1/encoder/block.h" +#include "av1/encoder/lookahead.h" +#include "av1/encoder/ratectrl.h" + +static INLINE BLOCK_SIZE convert_length_to_bsize(int length) { + switch (length) { + case 64: return BLOCK_64X64; + case 32: return BLOCK_32X32; + case 16: return BLOCK_16X16; + case 8: return BLOCK_8X8; + case 4: return BLOCK_4X4; + default: + assert(0 && "Invalid block size for tpl model"); + return BLOCK_16X16; + } +} + +typedef struct AV1TplRowMultiThreadSync { +#if CONFIG_MULTITHREAD + // Synchronization objects for top-right dependency. + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + // Buffer to store the macroblock whose encoding is complete. + // num_finished_cols[i] stores the number of macroblocks which finished + // encoding in the ith macroblock row. + int *num_finished_cols; + // Number of extra macroblocks of the top row to be complete for encoding + // of the current macroblock to start. A value of 1 indicates top-right + // dependency. + int sync_range; + // Number of macroblock rows. + int rows; + // Number of threads processing the current tile. + int num_threads_working; +} AV1TplRowMultiThreadSync; + +typedef struct AV1TplRowMultiThreadInfo { + // Initialized to false, set to true by the worker thread that encounters an + // error in order to abort the processing of other worker threads. + bool tpl_mt_exit; +#if CONFIG_MULTITHREAD + // Mutex lock object used for error handling. + pthread_mutex_t *mutex_; +#endif + // Row synchronization related function pointers. + void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c); + void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c, + int cols); +} AV1TplRowMultiThreadInfo; + +// TODO(jingning): This needs to be cleaned up next. + +// TPL stats buffers are prepared for every frame in the GOP, +// including (internal) overlays and (internal) arfs. +// In addition, frames in the lookahead that are outside of the GOP +// are also used. +// Thus it should use +// (gop_length) + (# overlays) + (MAX_LAG_BUFFERS - gop_len) = +// MAX_LAG_BUFFERS + (# overlays) +// 2 * MAX_LAG_BUFFERS is therefore a safe estimate. +// TODO(bohanli): test setting it to 1.5 * MAX_LAG_BUFFER +#define MAX_TPL_FRAME_IDX (2 * MAX_LAG_BUFFERS) +// The first REF_FRAMES + 1 buffers are reserved. 
+// tpl_data->tpl_frame starts after REF_FRAMES + 1 +#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TPL_FRAME_IDX + REF_FRAMES + 1) +#define TPL_DEP_COST_SCALE_LOG2 4 + +#define TPL_EPSILON 0.0000001 + +typedef struct TplTxfmStats { + int ready; // Whether abs_coeff_mean is ready + double abs_coeff_sum[256]; // Assume we are using 16x16 transform block + double abs_coeff_mean[256]; + int txfm_block_count; + int coeff_num; +} TplTxfmStats; + +typedef struct { + uint8_t *predictor8; + int16_t *src_diff; + tran_low_t *coeff; + tran_low_t *qcoeff; + tran_low_t *dqcoeff; +} TplBuffers; + +typedef struct TplDepStats { + int64_t srcrf_sse; + int64_t srcrf_dist; + int64_t recrf_sse; + int64_t recrf_dist; + int64_t intra_sse; + int64_t intra_dist; + int64_t cmp_recrf_dist[2]; + int64_t mc_dep_rate; + int64_t mc_dep_dist; + int64_t pred_error[INTER_REFS_PER_FRAME]; + int32_t intra_cost; + int32_t inter_cost; + int32_t srcrf_rate; + int32_t recrf_rate; + int32_t intra_rate; + int32_t cmp_recrf_rate[2]; + int_mv mv[INTER_REFS_PER_FRAME]; + int8_t ref_frame_index[2]; +} TplDepStats; + +typedef struct TplDepFrame { + uint8_t is_valid; + TplDepStats *tpl_stats_ptr; + const YV12_BUFFER_CONFIG *gf_picture; + YV12_BUFFER_CONFIG *rec_picture; + int ref_map_index[REF_FRAMES]; + int stride; + int width; + int height; + int mi_rows; + int mi_cols; + int base_rdmult; + uint32_t frame_display_index; + // When set, SAD metric is used for intra and inter mode decision. + int use_pred_sad; +} TplDepFrame; + +/*!\endcond */ +/*! + * \brief Params related to temporal dependency model. + */ +typedef struct TplParams { + /*! + * Whether the tpl stats is ready. + */ + int ready; + + /*! + * Block granularity of tpl score storage. + */ + uint8_t tpl_stats_block_mis_log2; + + /*! + * Tpl motion estimation block 1d size. tpl_bsize_1d >= 16. + */ + uint8_t tpl_bsize_1d; + + /*! + * Buffer to store the frame level tpl information for each frame in a gf + * group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf + * group + */ + TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS]; + + /*! + * Buffer to store tpl stats at block granularity. + * tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf + * group. + */ + TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS]; + + /*! + * Pointer to the buffer which stores tpl transform stats per frame. + * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group. + * Memory is allocated dynamically for MAX_LENGTH_TPL_FRAME_STATS frames when + * tpl is enabled. + */ + TplTxfmStats *txfm_stats_list; + + /*! + * Buffer to store tpl reconstructed frame. + * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group. + */ + YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS]; + + /*! + * Pointer to tpl_stats_buffer. + */ + TplDepFrame *tpl_frame; + + /*! + * Scale factors for the current frame. + */ + struct scale_factors sf; + + /*! + * GF group index of the current frame. + */ + int frame_idx; + + /*! + * Array of pointers to the frame buffers holding the source frame. + * src_ref_frame[i] stores the pointer to the source frame of the ith + * reference frame type. + */ + const YV12_BUFFER_CONFIG *src_ref_frame[INTER_REFS_PER_FRAME]; + + /*! + * Array of pointers to the frame buffers holding the tpl reconstructed frame. + * ref_frame[i] stores the pointer to the tpl reconstructed frame of the ith + * reference frame type. + */ + const YV12_BUFFER_CONFIG *ref_frame[INTER_REFS_PER_FRAME]; + + /*! 
+ * Parameters related to synchronization for top-right dependency in row based + * multi-threading of tpl + */ + AV1TplRowMultiThreadSync tpl_mt_sync; + + /*! + * Frame border for tpl frame. + */ + int border_in_pixels; + + /*! + * Factor to adjust r0 if TPL uses a subset of frames in the gf group. + */ + double r0_adjust_factor; +} TplParams; + +#if CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG +#define VBR_RC_INFO_MAX_FRAMES 500 +#endif // CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG + +#if CONFIG_BITRATE_ACCURACY + +/*! + * \brief This structure stores information needed for bitrate accuracy + * experiment. + */ +typedef struct { + int ready; + double total_bit_budget; // The total bit budget of the entire video + int show_frame_count; // Number of show frames in the entire video + + int gop_showframe_count; // The number of show frames in the current gop + double gop_bit_budget; // The bitbudget for the current gop + double scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve the + // budget estimation + double mv_scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve + // MV entropy estimation + + // === Below this line are GOP related data that will be updated per GOP === + int base_q_index; // Stores the base q index. + int q_index_list_ready; + int q_index_list[VBR_RC_INFO_MAX_FRAMES]; // q indices for the current + // GOP + + // Array to store qstep_ratio for each frame in a GOP + double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES]; + +#if CONFIG_THREE_PASS + TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES]; + FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES]; + int gop_start_idx_list[VBR_RC_INFO_MAX_FRAMES]; + int gop_length_list[VBR_RC_INFO_MAX_FRAMES]; + int cur_gop_idx; + int total_frame_count; + int gop_count; +#endif // CONFIG_THREE_PASS +} VBR_RATECTRL_INFO; + +static INLINE void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) { + vbr_rc_info->q_index_list_ready = 0; + av1_zero(vbr_rc_info->q_index_list); +} + +void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget, + int show_frame_count); + +int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info, + int gf_frame_index); + +void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info, + const struct TPL_INFO *tpl_info); + +void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info, + int gop_showframe_count); + +void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count, + const double *qstep_ratio_list, + aom_bit_depth_t bit_depth, int *q_index_list); + +/*!\brief Update q_index_list in vbr_rc_info based on tpl stats + * + * \param[out] vbr_rc_info Rate control info for BITRATE_ACCURACY + * experiment + * \param[in] tpl_data TPL struct + * \param[in] gf_group GOP struct + * \param[in] bit_depth bit depth + */ +void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info, + const TplParams *tpl_data, + const struct GF_GROUP *gf_group, + aom_bit_depth_t bit_depth); +/* + *!\brief Compute the number of bits needed to encode a GOP + * + * \param[in] base_q_index base layer q_index + * \param[in] bit_depth bit depth + * \param[in] update_type_scale_factors array of scale factors for each + * update_type + * \param[in] frame_count size of update_type_list, + * qstep_ratio_list stats_list, + * q_index_list and + * estimated_bitrate_byframe + * \param[in] update_type_list array of update_type, one per frame + * \param[in] qstep_ratio_list array of qstep_ratio, one per frame + * \param[in] stats_list array of transform stats, one per + 
* frame + * \param[out] q_index_list array of q_index, one per frame + * \param[out] estimated_bitrate_byframe array to keep track of frame + * bitrate + * + * \return The estimated GOP bitrate. + * + */ +double av1_vbr_rc_info_estimate_gop_bitrate( + int base_q_index, aom_bit_depth_t bit_depth, + const double *update_type_scale_factors, int frame_count, + const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list, + const TplTxfmStats *stats_list, int *q_index_list, + double *estimated_bitrate_byframe); + +/*!\brief Estimate the optimal base q index for a GOP. + * + * This function uses a binary search to find base layer q index to + * achieve the specified bit budget. + * + * \param[in] bit_budget target bit budget + * \param[in] bit_depth bit depth + * \param[in] update_type_scale_factors array of scale factors for each + * update_type + * \param[in] frame_count size of update_type_list, qstep_ratio_list + * stats_list, q_index_list and + * estimated_bitrate_byframe + * \param[in] update_type_list array of update_type, one per frame + * \param[in] qstep_ratio_list array of qstep_ratio, one per frame + * \param[in] stats_list array of transform stats, one per frame + * \param[out] q_index_list array of q_index, one per frame + * \param[out] estimated_bitrate_byframe Array to keep track of frame + * bitrate + * + * \return Returns the optimal base q index to use. + */ +int av1_vbr_rc_info_estimate_base_q( + double bit_budget, aom_bit_depth_t bit_depth, + const double *update_type_scale_factors, int frame_count, + const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list, + const TplTxfmStats *stats_list, int *q_index_list, + double *estimated_bitrate_byframe); + +#endif // CONFIG_BITRATE_ACCURACY + +#if CONFIG_RD_COMMAND +typedef enum { + RD_OPTION_NONE, + RD_OPTION_SET_Q, + RD_OPTION_SET_Q_RDMULT +} RD_OPTION; + +typedef struct RD_COMMAND { + RD_OPTION option_ls[MAX_LENGTH_TPL_FRAME_STATS]; + int q_index_ls[MAX_LENGTH_TPL_FRAME_STATS]; + int rdmult_ls[MAX_LENGTH_TPL_FRAME_STATS]; + int frame_count; + int frame_index; +} RD_COMMAND; + +void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command); +#endif // CONFIG_RD_COMMAND + +/*!\brief Allocate buffers used by tpl model + * + * \param[in] Top-level encode/decode structure + * \param[in] lag_in_frames number of lookahead frames + * + * \param[out] tpl_data tpl data structure + */ + +void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi, + CommonModeInfoParams *const mi_params, int width, + int height, int byte_alignment, int lag_in_frames); + +static AOM_INLINE void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) { + aom_free(tpl_tmp_buffers->predictor8); + tpl_tmp_buffers->predictor8 = NULL; + aom_free(tpl_tmp_buffers->src_diff); + tpl_tmp_buffers->src_diff = NULL; + aom_free(tpl_tmp_buffers->coeff); + tpl_tmp_buffers->coeff = NULL; + aom_free(tpl_tmp_buffers->qcoeff); + tpl_tmp_buffers->qcoeff = NULL; + aom_free(tpl_tmp_buffers->dqcoeff); + tpl_tmp_buffers->dqcoeff = NULL; +} + +static AOM_INLINE bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers, + uint8_t tpl_bsize_1d) { + // Number of pixels in a tpl block + const int tpl_block_pels = tpl_bsize_1d * tpl_bsize_1d; + + // Allocate temporary buffers used in mode estimation. 
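+  // All buffers are 32-byte aligned for SIMD use and each allocation is
+  // checked below, with everything released again on partial failure.
+  // predictor8 is sized as 2 * tpl_block_pels bytes, presumably so the same
+  // allocation can also back a 16-bit prediction buffer for high bit depth.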
+ tpl_tmp_buffers->predictor8 = (uint8_t *)aom_memalign( + 32, tpl_block_pels * 2 * sizeof(*tpl_tmp_buffers->predictor8)); + tpl_tmp_buffers->src_diff = (int16_t *)aom_memalign( + 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->src_diff)); + tpl_tmp_buffers->coeff = (tran_low_t *)aom_memalign( + 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->coeff)); + tpl_tmp_buffers->qcoeff = (tran_low_t *)aom_memalign( + 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->qcoeff)); + tpl_tmp_buffers->dqcoeff = (tran_low_t *)aom_memalign( + 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->dqcoeff)); + + if (!(tpl_tmp_buffers->predictor8 && tpl_tmp_buffers->src_diff && + tpl_tmp_buffers->coeff && tpl_tmp_buffers->qcoeff && + tpl_tmp_buffers->dqcoeff)) { + tpl_dealloc_temp_buffers(tpl_tmp_buffers); + return false; + } + return true; +} + +/*!\brief Implements temporal dependency modelling for a GOP (GF/ARF + * group) and selects between 16 and 32 frame GOP structure. + * + *\ingroup tpl_modelling + * + * \param[in] cpi Top - level encoder instance structure + * \param[in] gop_eval Flag if it is in the GOP length decision stage + * \param[in] frame_params Per frame encoding parameters + * + * \return Indicates whether or not we should use a longer GOP length. + */ +int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval, + const struct EncodeFrameParams *const frame_params); + +/*!\cond */ + +void av1_tpl_preload_rc_estimate( + struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params); + +int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift); + +void av1_init_tpl_stats(TplParams *const tpl_data); + +int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index); + +void av1_tpl_rdmult_setup(struct AV1_COMP *cpi); + +void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE sb_size, int mi_row, int mi_col); + +void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi, + TplTxfmStats *tpl_txfm_stats, + TplBuffers *tpl_tmp_buffers, MACROBLOCK *x, + int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size); + +/*!\brief Compute the entropy of an exponential probability distribution + * function (pdf) subjected to uniform quantization. + * + * pdf(x) = b*exp(-b*x) + * + *\ingroup tpl_modelling + * + * \param[in] q_step quantizer step size + * \param[in] b parameter of exponential distribution + * + * \return entropy cost + */ +double av1_exponential_entropy(double q_step, double b); + +/*!\brief Compute the entropy of a Laplace probability distribution + * function (pdf) subjected to non-uniform quantization. + * + * pdf(x) = 0.5*b*exp(-0.5*b*|x|) + * + *\ingroup tpl_modelling + * + * \param[in] q_step quantizer step size for non-zero bins + * \param[in] b parameter of Laplace distribution + * \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio * q_step + * + * \return entropy cost + */ +double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio); + +/*!\brief Compute the frame rate using transform block stats + * + * Assume each position i in the transform block is of Laplace distribution + * with mean absolute deviation abs_coeff_mean[i] + * + * Then we can use av1_laplace_entropy() to compute the expected frame + * rate. 
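+ *
+ * In other words, the returned estimate is roughly
+ *   block_count * sum_i av1_laplace_entropy(q_step_i, abs_coeff_mean[i])
+ * where q_step_i is the DC quantizer step for i == 0 and the AC quantizer
+ * step otherwise (see the implementation in tpl_model.c).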
+ * + *\ingroup tpl_modelling + * + * \param[in] q_index quantizer index + * \param[in] block_count number of transform blocks + * \param[in] abs_coeff_mean array of mean absolute deviation + * \param[in] coeff_num number of coefficients per transform block + * + * \return expected frame rate + */ +double av1_laplace_estimate_frame_rate(int q_index, int block_count, + const double *abs_coeff_mean, + int coeff_num); + +/* + *!\brief Init TplTxfmStats + * + * \param[in] tpl_txfm_stats a structure for storing transform stats + * + */ +void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats); + +#if CONFIG_BITRATE_ACCURACY +/* + *!\brief Accumulate TplTxfmStats + * + * \param[in] sub_stats a structure for storing sub transform stats + * \param[out] accumulated_stats a structure for storing accumulated + *transform stats + * + */ +void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats, + TplTxfmStats *accumulated_stats); + +/* + *!\brief Record a transform block into TplTxfmStats + * + * \param[in] tpl_txfm_stats A structure for storing transform stats + * \param[out] coeff An array of transform coefficients. Its size + * should equal to tpl_txfm_stats.coeff_num. + * + */ +void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats, + const tran_low_t *coeff); + +/* + *!\brief Update abs_coeff_mean and ready of txfm_stats + * If txfm_block_count > 0, this function will use abs_coeff_sum and + * txfm_block_count to compute abs_coeff_mean. Moreover, reday flag + * will be set to one. + * + * \param[in] txfm_stats A structure for storing transform stats + */ +void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats); +#endif // CONFIG_BITRATE_ACCURACY + +/*!\brief Estimate coefficient entropy using Laplace dsitribution + * + *\ingroup tpl_modelling + * + * This function is equivalent to -log2(laplace_prob()), where laplace_prob() + *is defined in tpl_model_test.cc + * + * \param[in] q_step quantizer step size without any scaling + * \param[in] b mean absolute deviation of Laplace + *distribution \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio + ** q_step \param[in] qcoeff quantized coefficient + * + * \return estimated coefficient entropy + * + */ +double av1_estimate_coeff_entropy(double q_step, double b, + double zero_bin_ratio, int qcoeff); + +/*!\brief Estimate entropy of a transform block using Laplace dsitribution + * + *\ingroup tpl_modelling + * + * \param[in] q_index quantizer index + * \param[in] abs_coeff_mean array of mean absolute deviations + * \param[in] qcoeff_arr array of quantized coefficients + * \param[in] coeff_num number of coefficients per transform block + * + * \return estimated transform block entropy + * + */ +double av1_estimate_txfm_block_entropy(int q_index, + const double *abs_coeff_mean, + int *qcoeff_arr, int coeff_num); + +// TODO(angiebird): Add doxygen description here. +int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, + int64_t srcrf_dist, int pix_num); + +/*!\brief Compute the overlap area between two blocks with the same size + * + *\ingroup tpl_modelling + * + * If there is no overlap, this function should return zero. 
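+ *
+ * For example, two 8x8 blocks whose top-left corners are at (0, 0) and
+ * (4, 4) overlap over a 4x4 region, so the function returns 16.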
+ * + * \param[in] row_a row position of the first block + * \param[in] col_a column position of the first block + * \param[in] row_b row position of the second block + * \param[in] col_b column position of the second block + * \param[in] width width shared by the two blocks + * \param[in] height height shared by the two blocks + * + * \return overlap area of the two blocks + */ +int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width, + int height); + +/*!\brief Get current frame's q_index from tpl stats and leaf_qindex + * + * \param[in] tpl_data TPL struct + * \param[in] gf_frame_index current frame index in the GOP + * \param[in] leaf_qindex q index of leaf frame + * \param[in] bit_depth bit depth + * + * \return q_index + */ +int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index, + int leaf_qindex, aom_bit_depth_t bit_depth); + +/*!\brief Compute the frame importance from TPL stats + * + * \param[in] tpl_data TPL struct + * \param[in] gf_frame_index current frame index in the GOP + * + * \return frame_importance + */ +double av1_tpl_get_frame_importance(const TplParams *tpl_data, + int gf_frame_index); + +/*!\brief Compute the ratio between arf q step and the leaf q step based on + * TPL stats + * + * \param[in] tpl_data TPL struct + * \param[in] gf_frame_index current frame index in the GOP + * \param[in] leaf_qindex q index of leaf frame + * \param[in] bit_depth bit depth + * + * \return qstep_ratio + */ +double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index); + +/*!\brief Find a q index whose step size is near qstep_ratio * leaf_qstep + * + * \param[in] leaf_qindex q index of leaf frame + * \param[in] qstep_ratio step ratio between target q index and + * leaf q index \param[in] bit_depth bit depth + * + * \return q_index + */ +int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio, + aom_bit_depth_t bit_depth); + +/*!\brief Improve the motion vector estimation by taking neighbors into + * account. + * + * Use the upper and left neighbor block as the reference MVs. + * Compute the minimum difference between current MV and reference MV. + * + * \param[in] tpl_frame Tpl frame struct + * \param[in] row Current row + * \param[in] col Current column + * \param[in] step Step parameter for av1_tpl_ptr_pos + * \param[in] tpl_stride Stride parameter for av1_tpl_ptr_pos + * \param[in] right_shift Right shift parameter for + * av1_tpl_ptr_pos + */ +int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col, + int step, int tpl_stride, int right_shift); + +/*!\brief Compute the entropy of motion vectors for a single frame. + * + * \param[in] tpl_frame TPL frame struct + * \param[in] right_shift right shift value for step + * + * \return Bits used by the motion vectors for one frame. 
+ */ +double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame, + uint8_t right_shift); + +#if CONFIG_RATECTRL_LOG +typedef struct { + int coding_frame_count; + int base_q_index; + + // Encode decision + int q_index_list[VBR_RC_INFO_MAX_FRAMES]; + double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES]; + FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES]; + + // Frame stats + TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES]; + + // Estimated encode results + double est_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES]; + + // Actual encode results + double act_rate_list[VBR_RC_INFO_MAX_FRAMES]; + double act_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES]; +} RATECTRL_LOG; + +static INLINE void rc_log_init(RATECTRL_LOG *rc_log) { av1_zero(*rc_log); } + +static INLINE void rc_log_frame_stats(RATECTRL_LOG *rc_log, int coding_index, + const TplTxfmStats *txfm_stats) { + rc_log->txfm_stats_list[coding_index] = *txfm_stats; +} + +static INLINE void rc_log_frame_encode_param(RATECTRL_LOG *rc_log, + int coding_index, + double qstep_ratio, int q_index, + FRAME_UPDATE_TYPE update_type) { + rc_log->qstep_ratio_list[coding_index] = qstep_ratio; + rc_log->q_index_list[coding_index] = q_index; + rc_log->update_type_list[coding_index] = update_type; + const TplTxfmStats *txfm_stats = &rc_log->txfm_stats_list[coding_index]; + rc_log->est_coeff_rate_list[coding_index] = 0; + if (txfm_stats->ready) { + rc_log->est_coeff_rate_list[coding_index] = av1_laplace_estimate_frame_rate( + q_index, txfm_stats->txfm_block_count, txfm_stats->abs_coeff_mean, + txfm_stats->coeff_num); + } +} + +static INLINE void rc_log_frame_entropy(RATECTRL_LOG *rc_log, int coding_index, + double act_rate, + double act_coeff_rate) { + rc_log->act_rate_list[coding_index] = act_rate; + rc_log->act_coeff_rate_list[coding_index] = act_coeff_rate; +} + +static INLINE void rc_log_record_chunk_info(RATECTRL_LOG *rc_log, + int base_q_index, + int coding_frame_count) { + rc_log->base_q_index = base_q_index; + rc_log->coding_frame_count = coding_frame_count; +} + +static INLINE void rc_log_show(const RATECTRL_LOG *rc_log) { + printf("= chunk 1\n"); + printf("coding_frame_count %d base_q_index %d\n", rc_log->coding_frame_count, + rc_log->base_q_index); + printf("= frame %d\n", rc_log->coding_frame_count); + for (int coding_idx = 0; coding_idx < rc_log->coding_frame_count; + coding_idx++) { + printf( + "coding_idx %d update_type %d q %d qstep_ratio %f est_coeff_rate %f " + "act_coeff_rate %f act_rate %f\n", + coding_idx, rc_log->update_type_list[coding_idx], + rc_log->q_index_list[coding_idx], rc_log->qstep_ratio_list[coding_idx], + rc_log->est_coeff_rate_list[coding_idx], + rc_log->act_coeff_rate_list[coding_idx], + rc_log->act_rate_list[coding_idx]); + } +} +#endif // CONFIG_RATECTRL_LOG + +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TPL_MODEL_H_ diff --git a/third_party/aom/av1/encoder/tune_butteraugli.c b/third_party/aom/av1/encoder/tune_butteraugli.c new file mode 100644 index 0000000000..92fc4b2a92 --- /dev/null +++ b/third_party/aom/av1/encoder/tune_butteraugli.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/encoder/tune_butteraugli.h"
+
+#include "aom_dsp/butteraugli.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/var_based_part.h"
+
+static const int resize_factor = 2;
+
+static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi,
+                                              const YV12_BUFFER_CONFIG *source,
+                                              const YV12_BUFFER_CONFIG *recon,
+                                              const double K) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = cm->seq_params;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const aom_color_range_t color_range =
+      seq_params->color_range != 0 ? AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE;
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  const int width = source->y_crop_width;
+  const int height = source->y_crop_height;
+  const int ss_x = source->subsampling_x;
+  const int ss_y = source->subsampling_y;
+
+  float *diffmap;
+  CHECK_MEM_ERROR(cm, diffmap, aom_malloc(width * height * sizeof(*diffmap)));
+  if (!aom_calc_butteraugli(source, recon, bit_depth,
+                            seq_params->matrix_coefficients, color_range,
+                            diffmap)) {
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Failed to calculate Butteraugli distances.");
+  }
+
+  const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize] / resize_factor;
+  const int num_mi_h = mi_size_high[butteraugli_rdo_bsize] / resize_factor;
+  const int num_cols =
+      (mi_params->mi_cols / resize_factor + num_mi_w - 1) / num_mi_w;
+  const int num_rows =
+      (mi_params->mi_rows / resize_factor + num_mi_h - 1) / num_mi_h;
+  const int block_w = num_mi_w << 2;
+  const int block_h = num_mi_h << 2;
+  double log_sum = 0.0;
+  double blk_count = 0.0;
+
+  // Loop through each block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      const int y_start = row * block_h;
+      const int x_start = col * block_w;
+      float dbutteraugli = 0.0f;
+      float dmse = 0.0f;
+      float px_count = 0.0f;
+
+      // Loop through each pixel.
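+      // The Butteraugli diffmap is pooled over the block with a 12-norm
+      // (powf(., 12.0f) inside the loop, 12th root afterwards), which
+      // emphasizes the worst pixels, while dmse accumulates plain squared
+      // error over luma and then chroma. Their ratio later forms the
+      // per-block rdmult weight.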
+ for (int y = y_start; y < y_start + block_h && y < height; y++) { + for (int x = x_start; x < x_start + block_w && x < width; x++) { + dbutteraugli += powf(diffmap[y * width + x], 12.0f); + float px_diff = source->y_buffer[y * source->y_stride + x] - + recon->y_buffer[y * recon->y_stride + x]; + dmse += px_diff * px_diff; + px_count += 1.0f; + } + } + const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y), + (height + ss_y) >> ss_y); + for (int y = y_start >> ss_y; y < y_end; y++) { + const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x), + (width + ss_x) >> ss_x); + for (int x = x_start >> ss_x; x < x_end; x++) { + const int src_px_index = y * source->uv_stride + x; + const int recon_px_index = y * recon->uv_stride + x; + const float px_diff_u = (float)(source->u_buffer[src_px_index] - + recon->u_buffer[recon_px_index]); + const float px_diff_v = (float)(source->v_buffer[src_px_index] - + recon->v_buffer[recon_px_index]); + dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v; + px_count += 2.0f; + } + } + + dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f); + dmse = dmse / px_count; + const float eps = 0.01f; + double weight; + if (dbutteraugli < eps || dmse < eps) { + weight = -1.0; + } else { + blk_count += 1.0; + weight = dmse / dbutteraugli; + weight = AOMMIN(weight, 5.0); + weight += K; + log_sum += log(weight); + } + cpi->butteraugli_info.rdmult_scaling_factors[index] = weight; + } + } + // Geometric average of the weights. + log_sum = exp(log_sum / blk_count); + + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + double *weight = &cpi->butteraugli_info.rdmult_scaling_factors[index]; + if (*weight <= 0.0) { + *weight = 1.0; + } else { + *weight /= log_sum; + } + *weight = AOMMIN(*weight, 2.5); + *weight = AOMMAX(*weight, 0.4); + } + } + + aom_free(diffmap); +} + +void av1_set_butteraugli_rdmult(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int *rdmult) { + assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI); + if (!cpi->butteraugli_info.recon_set) { + return; + } + const AV1_COMMON *const cm = &cpi->common; + + const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize]; + const int num_mi_h = mi_size_high[butteraugli_rdo_bsize]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + for (int row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (int col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += + log(cpi->butteraugli_info.rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + av1_set_error_per_bit(&x->errorperbit, *rdmult); +} + +static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h) { + for (int row = 0; row < h; row++) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } +} + +static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + int 
width, int height) { + copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width, + height); + const int width_uv = (width + src->subsampling_x) >> src->subsampling_x; + const int height_uv = (height + src->subsampling_y) >> src->subsampling_y; + copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + width_uv, height_uv); + copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + width_uv, height_uv); +} + +static void zero_plane(uint8_t *dst, int dst_stride, int h) { + for (int row = 0; row < h; row++) { + memset(dst, 0, dst_stride); + dst += dst_stride; + } +} + +static void zero_img(YV12_BUFFER_CONFIG *dst) { + zero_plane(dst->y_buffer, dst->y_stride, dst->y_height); + zero_plane(dst->u_buffer, dst->uv_stride, dst->uv_height); + zero_plane(dst->v_buffer, dst->uv_stride, dst->uv_height); +} + +void av1_setup_butteraugli_source(AV1_COMP *cpi) { + YV12_BUFFER_CONFIG *const dst = &cpi->butteraugli_info.source; + AV1_COMMON *const cm = &cpi->common; + const int width = cpi->source->y_crop_width; + const int height = cpi->source->y_crop_height; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; + if (dst->buffer_alloc_sz == 0) { + aom_alloc_frame_buffer( + dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + } + av1_copy_and_extend_frame(cpi->source, dst); + + YV12_BUFFER_CONFIG *const resized_dst = &cpi->butteraugli_info.resized_source; + if (resized_dst->buffer_alloc_sz == 0) { + aom_alloc_frame_buffer( + resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + } + if (!av1_resize_and_extend_frame_nonnormative( + cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating buffers during resize"); + } + + zero_img(cpi->source); + copy_img(resized_dst, cpi->source, width / resize_factor, + height / resize_factor); +} + +void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) { + av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source); + AV1_COMMON *const cm = &cpi->common; + const int width = cpi->source->y_crop_width; + const int height = cpi->source->y_crop_height; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; + + YV12_BUFFER_CONFIG resized_recon; + memset(&resized_recon, 0, sizeof(resized_recon)); + aom_alloc_frame_buffer( + &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor, + height / resize_factor); + + set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source, + &resized_recon, K); + cpi->butteraugli_info.recon_set = true; + aom_free_frame_buffer(&resized_recon); +} + +void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const QuantizationCfg *const q_cfg = &oxcf->q_cfg; + const int q_index = 96; + + // Setup necessary params for encoding, including frame source, etc. 
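+  // A preliminary encode of the frame is run at the fixed quantizer above
+  // (q_index = 96) purely to obtain a reconstruction, from which the
+  // per-block Butteraugli rdmult scaling factors are derived before the
+  // original source is restored for the real encode.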
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi); + av1_set_frame_size(cpi, cm->superres_upscaled_width, + cm->superres_upscaled_height); + + cpi->source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, + 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_realloc_and_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, + cpi->image_pyramid_levels); + } + + av1_setup_butteraugli_source(cpi); + av1_setup_frame(cpi); + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + const PARTITION_SEARCH_TYPE partition_search_type = + cpi->sf.part_sf.partition_search_type; + const BLOCK_SIZE fixed_partition_size = cpi->sf.part_sf.fixed_partition_size; + // Enable a quicker pass by uncommenting the following lines: + // cpi->sf.part_sf.partition_search_type = FIXED_PARTITION; + // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32; + + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index, + q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq); + av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + + av1_set_variance_partition_thresholds(cpi, q_index, 0); + av1_encode_frame(cpi); + + av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.3); + cpi->sf.part_sf.partition_search_type = partition_search_type; + cpi->sf.part_sf.fixed_partition_size = fixed_partition_size; +} diff --git a/third_party/aom/av1/encoder/tune_butteraugli.h b/third_party/aom/av1/encoder/tune_butteraugli.h new file mode 100644 index 0000000000..bae5d2a882 --- /dev/null +++ b/third_party/aom/av1/encoder/tune_butteraugli.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ +#define AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ + +#include "aom_scale/yv12config.h" +#include "av1/common/enums.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/block.h" + +typedef struct { + // Stores the scaling factors for rdmult when tuning for Butteraugli. + // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for + // 4x4 block at (row, col). 
+ double *rdmult_scaling_factors; + YV12_BUFFER_CONFIG source, resized_source; + bool recon_set; +} TuneButteraugliInfo; + +struct AV1_COMP; +static const BLOCK_SIZE butteraugli_rdo_bsize = BLOCK_16X16; + +void av1_set_butteraugli_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int *rdmult); + +void av1_setup_butteraugli_source(struct AV1_COMP *cpi); + +// 'K' is used to balance the rate-distortion distribution between PSNR +// and Butteraugli. +void av1_setup_butteraugli_rdmult_and_restore_source(struct AV1_COMP *cpi, + double K); + +void av1_setup_butteraugli_rdmult(struct AV1_COMP *cpi); + +#endif // AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ diff --git a/third_party/aom/av1/encoder/tune_vmaf.c b/third_party/aom/av1/encoder/tune_vmaf.c new file mode 100644 index 0000000000..4e5ffa387c --- /dev/null +++ b/third_party/aom/av1/encoder/tune_vmaf.c @@ -0,0 +1,1112 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/tune_vmaf.h" + +#include "aom_dsp/psnr.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/rdopt.h" +#include "config/aom_scale_rtcd.h" + +static const double kBaselineVmaf = 97.42773; + +static double get_layer_value(const double *array, int layer) { + while (array[layer] < 0.0 && layer > 0) layer--; + return AOMMAX(array[layer], 0.0); +} + +static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *ref, + const BLOCK_SIZE block_size, const int mb_row, + const int mb_col, FULLPEL_MV *ref_mv) { + // Block information (ONLY Y-plane is used for motion search). + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int y_stride = src->y_stride; + assert(y_stride == ref->y_stride); + const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; + + // Save input state. + MACROBLOCK *const mb = &cpi->td.mb; + MACROBLOCKD *const mbd = &mb->e_mbd; + const struct buf_2d ori_src_buf = mb->plane[0].src; + const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0]; + + // Parameters used for motion search. + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + FULLPEL_MV_STATS best_mv_stats; + const SEARCH_METHODS search_method = NSTEP; + const search_site_config *search_site_cfg = + cpi->mv_search_params.search_site_cfg[SS_CFG_FPF]; + const int step_param = + av1_init_search_range(AOMMAX(src->y_crop_width, src->y_crop_height)); + + // Baseline position for motion search (used for rate distortion comparison). + const MV baseline_mv = kZeroMv; + + // Setup. + mb->plane[0].src.buf = src->y_buffer + y_offset; + mb->plane[0].src.stride = y_stride; + mbd->plane[0].pre[0].buf = ref->y_buffer + y_offset; + mbd->plane[0].pre[0].stride = y_stride; + + // Unused intermediate results for motion search. + int cost_list[5]; + + // Do motion search. + // Only do full search on the entire block. 
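+  // Only a full-pel NSTEP search against a zero baseline MV is performed; the
+  // resulting full-pel MV is written back to *ref_mv and consumed by
+  // residual_variance() via residual_frame_average_variance().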
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size, + &baseline_mv, *ref_mv, search_site_cfg, + search_method, + /*fine_search_interval=*/0); + av1_full_pixel_search(*ref_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), ref_mv, &best_mv_stats, + NULL); + + // Restore input state. + mb->plane[0].src = ori_src_buf; + mbd->plane[0].pre[0] = ori_pre_buf; +} + +static unsigned int residual_variance(const AV1_COMP *cpi, + const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *ref, + const BLOCK_SIZE block_size, + const int mb_row, const int mb_col, + FULLPEL_MV ref_mv, unsigned int *sse) { + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int y_stride = src->y_stride; + assert(y_stride == ref->y_stride); + const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; + const int mv_offset = ref_mv.row * y_stride + ref_mv.col; + const unsigned int var = cpi->ppi->fn_ptr[block_size].vf( + ref->y_buffer + y_offset + mv_offset, y_stride, src->y_buffer + y_offset, + y_stride, sse); + return var; +} + +static double frame_average_variance(const AV1_COMP *const cpi, + const YV12_BUFFER_CONFIG *const frame) { + const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const uint8_t *const y_buffer = frame->y_buffer; + const int y_stride = frame->y_stride; + const BLOCK_SIZE block_size = BLOCK_64X64; + + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + int row, col; + double var = 0.0, var_count = 0.0; + const int use_hbd = frame->flags & YV12_FLAG_HIGHBITDEPTH; + + // Loop through each block. + for (row = 0; row < frame->y_height / block_h; ++row) { + for (col = 0; col < frame->y_width / block_w; ++col) { + struct buf_2d buf; + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + + buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + var += av1_get_perpixel_variance(cpi, xd, &buf, block_size, AOM_PLANE_Y, + use_hbd); + var_count += 1.0; + } + } + var /= var_count; + return var; +} + +static double residual_frame_average_variance(AV1_COMP *cpi, + const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *ref, + FULLPEL_MV *mvs) { + if (ref == NULL) return frame_average_variance(cpi, src); + const BLOCK_SIZE block_size = BLOCK_16X16; + const int frame_height = src->y_height; + const int frame_width = src->y_width; + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int mb_rows = (frame_height + mb_height - 1) / mb_height; + const int mb_cols = (frame_width + mb_width - 1) / mb_width; + const int num_planes = av1_num_planes(&cpi->common); + const int mi_h = mi_size_high_log2[block_size]; + const int mi_w = mi_size_wide_log2[block_size]; + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + + // Save input state. + MACROBLOCK *const mb = &cpi->td.mb; + MACROBLOCKD *const mbd = &mb->e_mbd; + uint8_t *input_buffer[MAX_MB_PLANE]; + for (int i = 0; i < num_planes; i++) { + input_buffer[i] = mbd->plane[i].pre[0].buf; + } + MB_MODE_INFO **input_mb_mode_info = mbd->mi; + + bool do_motion_search = false; + if (mvs == NULL) { + do_motion_search = true; + CHECK_MEM_ERROR(&cpi->common, mvs, + (FULLPEL_MV *)aom_calloc(mb_rows * mb_cols, sizeof(*mvs))); + } + + unsigned int variance = 0; + // Perform temporal filtering block by block. 
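  // Editorial note: despite the comment above, nothing is filtered here. The
  // loop runs an optional block-wise motion search against 'ref' and then
  // accumulates the motion-compensated residual variance of each 16x16 block;
  // the return value is the average residual variance over the frame.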
+ for (int mb_row = 0; mb_row < mb_rows; mb_row++) { + av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits, + (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + for (int mb_col = 0; mb_col < mb_cols; mb_col++) { + av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits, + (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + FULLPEL_MV *ref_mv = &mvs[mb_col + mb_row * mb_cols]; + if (do_motion_search) { + motion_search(cpi, src, ref, block_size, mb_row, mb_col, ref_mv); + } + unsigned int mv_sse; + const unsigned int blk_var = residual_variance( + cpi, src, ref, block_size, mb_row, mb_col, *ref_mv, &mv_sse); + variance += blk_var; + } + } + + // Restore input state + for (int i = 0; i < num_planes; i++) { + mbd->plane[i].pre[0].buf = input_buffer[i]; + } + mbd->mi = input_mb_mode_info; + return (double)variance / (double)(mb_rows * mb_cols); +} + +// TODO(sdeng): Add the SIMD implementation. +static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source, + int source_stride, + const uint16_t *blurred, + int blurred_stride, uint16_t *dst, + int dst_stride, int w, int h, + double amount, int bit_depth) { + const int max_value = (1 << bit_depth) - 1; + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + const double val = + (double)source[j] + amount * ((double)source[j] - (double)blurred[j]); + dst[j] = (uint16_t)clamp((int)(val + 0.5), 0, max_value); + } + source += source_stride; + blurred += blurred_stride; + dst += dst_stride; + } +} + +static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride, + const uint8_t *blurred, int blurred_stride, + uint8_t *dst, int dst_stride, int w, int h, + double amount) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + const double val = + (double)source[j] + amount * ((double)source[j] - (double)blurred[j]); + dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255); + } + source += source_stride; + blurred += blurred_stride; + dst += dst_stride; + } +} + +static AOM_INLINE void unsharp(const AV1_COMP *const cpi, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *blurred, + const YV12_BUFFER_CONFIG *dst, double amount) { + const int bit_depth = cpi->td.mb.e_mbd.bd; + if (cpi->common.seq_params->use_highbitdepth) { + assert(source->flags & YV12_FLAG_HIGHBITDEPTH); + assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH); + assert(dst->flags & YV12_FLAG_HIGHBITDEPTH); + highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride, + CONVERT_TO_SHORTPTR(blurred->y_buffer), + blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer), + dst->y_stride, source->y_width, source->y_height, + amount, bit_depth); + } else { + unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer, + blurred->y_stride, dst->y_buffer, dst->y_stride, + source->y_width, source->y_height, amount); + } +} + +// 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128, +// all co-efficients must be even. 
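// Editorial check: 0 + 8 + 30 + 52 + 30 + 8 + 0 + 0 = 128 = 1 << 7, so the
// convolve kernels that apply this filter renormalize exactly and the blur
// preserves the average brightness of the frame. (Assumption: FILTER_BITS is
// 7, as defined in aom_dsp/aom_filter.h.)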
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52, + 30, 8, 0, 0 }; +static AOM_INLINE void gaussian_blur(const int bit_depth, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dst) { + const int block_size = BLOCK_128X128; + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + const int num_cols = (source->y_width + block_w - 1) / block_w; + const int num_rows = (source->y_height + block_h - 1) / block_h; + int row, col; + + ConvolveParams conv_params = get_conv_params(0, 0, bit_depth); + InterpFilterParams filter = { .filter_ptr = gauss_filter, + .taps = 8, + .interp_filter = EIGHTTAP_REGULAR }; + + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + + uint8_t *src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *dst_buf = + dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y; + + if (source->flags & YV12_FLAG_HIGHBITDEPTH) { + av1_highbd_convolve_2d_sr( + CONVERT_TO_SHORTPTR(src_buf), source->y_stride, + CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h, + &filter, &filter, 0, 0, &conv_params, bit_depth); + } else { + av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride, + block_w, block_h, &filter, &filter, 0, 0, + &conv_params); + } + } + } +} + +static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi, + double source_variance, + YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const sharpened) { + const int bit_depth = cpi->td.mb.e_mbd.bd; + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + double new_vmaf; + + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, sharpened, bit_depth, + cal_vmaf_neg, &new_vmaf); + + const double sharpened_var = frame_average_variance(cpi, sharpened); + return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf); +} + +static double find_best_frame_unsharp_amount_loop( + const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened, + double best_vmaf, const double baseline_variance, + const double unsharp_amount_start, const double step_size, + const int max_loop_count, const double max_amount) { + const double min_amount = 0.0; + int loop_count = 0; + double approx_vmaf = best_vmaf; + double unsharp_amount = unsharp_amount_start; + do { + best_vmaf = approx_vmaf; + unsharp_amount += step_size; + if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; + unsharp(cpi, source, blurred, sharpened, unsharp_amount); + approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened); + + loop_count++; + } while (approx_vmaf > best_vmaf && loop_count < max_loop_count); + unsharp_amount = + approx_vmaf > best_vmaf ? 
unsharp_amount : unsharp_amount - step_size; + return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); +} + +static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, + const double unsharp_amount_start, + const double step_size, + const int max_loop_count, + const double max_filter_amount) { + const AV1_COMMON *const cm = &cpi->common; + const int width = source->y_width; + const int height = source->y_height; + YV12_BUFFER_CONFIG sharpened; + memset(&sharpened, 0, sizeof(sharpened)); + aom_alloc_frame_buffer( + &sharpened, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + const double baseline_variance = frame_average_variance(cpi, source); + double unsharp_amount; + if (unsharp_amount_start <= step_size) { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0, + step_size, max_loop_count, max_filter_amount); + } else { + double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start; + double v0, v1; + unsharp(cpi, source, blurred, &sharpened, a0); + v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); + unsharp(cpi, source, blurred, &sharpened, a1); + v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); + if (fabs(v0 - v1) < 0.01) { + unsharp_amount = a0; + } else if (v0 > v1) { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, v0, baseline_variance, a0, + -step_size, max_loop_count, max_filter_amount); + } else { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, v1, baseline_variance, a1, + step_size, max_loop_count, max_filter_amount); + } + } + + aom_free_frame_buffer(&sharpened); + return unsharp_amount; +} + +void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + const AV1_COMMON *const cm = &cpi->common; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int width = source->y_width; + const int height = source->y_height; + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); + const double best_frame_unsharp_amount = + get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); + + if (best_frame_unsharp_amount <= 0.0) return; + + YV12_BUFFER_CONFIG blurred; + memset(&blurred, 0, sizeof(blurred)); + aom_alloc_frame_buffer( + &blurred, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + gaussian_blur(bit_depth, source, &blurred); + unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); + aom_free_frame_buffer(&blurred); +} + +void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + const AV1_COMMON *const cm = &cpi->common; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int width = source->y_width; + const int height = source->y_height; + + YV12_BUFFER_CONFIG source_extended, blurred; + memset(&source_extended, 0, sizeof(source_extended)); + memset(&blurred, 0, sizeof(blurred)); + aom_alloc_frame_buffer( + &source_extended, width, height, source->subsampling_x, + source->subsampling_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, 
cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer( + &blurred, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + av1_copy_and_extend_frame(source, &source_extended); + gaussian_blur(bit_depth, &source_extended, &blurred); + aom_free_frame_buffer(&source_extended); + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); + const double last_frame_unsharp_amount = + get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); + + const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( + cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01); + + cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = + best_frame_unsharp_amount; + + unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); + aom_free_frame_buffer(&blurred); +} + +void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + const AV1_COMMON *const cm = &cpi->common; + const int width = source->y_width; + const int height = source->y_height; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = source->subsampling_x; + const int ss_y = source->subsampling_y; + + YV12_BUFFER_CONFIG source_extended, blurred; + memset(&blurred, 0, sizeof(blurred)); + memset(&source_extended, 0, sizeof(source_extended)); + aom_alloc_frame_buffer( + &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + av1_copy_and_extend_frame(source, &source_extended); + gaussian_blur(bit_depth, &source_extended, &blurred); + aom_free_frame_buffer(&source_extended); + + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); + const double last_frame_unsharp_amount = + get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); + + const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( + cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01); + + cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = + best_frame_unsharp_amount; + + const int block_size = BLOCK_64X64; + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + const int num_cols = (source->y_width + block_w - 1) / block_w; + const int num_rows = (source->y_height + block_h - 1) / block_h; + double *best_unsharp_amounts = + aom_calloc(num_cols * num_rows, sizeof(*best_unsharp_amounts)); + if (!best_unsharp_amounts) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating vmaf data"); + } + + YV12_BUFFER_CONFIG source_block, blurred_block; + memset(&source_block, 0, sizeof(source_block)); + memset(&blurred_block, 0, sizeof(blurred_block)); + aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + for (int row = 0; row < num_rows; ++row) { + 
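    // Editorial note: each inner iteration copies one 64x64 block of the
    // source and of the blurred frame into the small scratch buffers,
    // zero-padding samples that fall outside the frame, and then searches for
    // a per-block unsharp amount starting from the frame-level estimate above.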
for (int col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + const int block_width = AOMMIN(width - col_offset_y, block_w); + const int block_height = AOMMIN(height - row_offset_y, block_h); + const int index = col + row * num_cols; + + if (cm->seq_params->use_highbitdepth) { + assert(source->flags & YV12_FLAG_HIGHBITDEPTH); + assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); + uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + + row_offset_y * source->y_stride + + col_offset_y; + uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + + row_offset_y * blurred.y_stride + + col_offset_y; + uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer); + uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer); + + // Copy block from source frame. + for (int i = 0; i < block_h; ++i) { + for (int j = 0; j < block_w; ++j) { + if (i >= block_height || j >= block_width) { + src_dst[j] = 0; + blurred_dst[j] = 0; + } else { + src_dst[j] = frame_src_buf[j]; + blurred_dst[j] = frame_blurred_buf[j]; + } + } + frame_src_buf += source->y_stride; + frame_blurred_buf += blurred.y_stride; + src_dst += source_block.y_stride; + blurred_dst += blurred_block.y_stride; + } + } else { + uint8_t *frame_src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *frame_blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + uint8_t *blurred_dst = blurred_block.y_buffer; + uint8_t *src_dst = source_block.y_buffer; + + // Copy block from source frame. + for (int i = 0; i < block_h; ++i) { + for (int j = 0; j < block_w; ++j) { + if (i >= block_height || j >= block_width) { + src_dst[j] = 0; + blurred_dst[j] = 0; + } else { + src_dst[j] = frame_src_buf[j]; + blurred_dst[j] = frame_blurred_buf[j]; + } + } + frame_src_buf += source->y_stride; + frame_blurred_buf += blurred.y_stride; + src_dst += source_block.y_stride; + blurred_dst += blurred_block.y_stride; + } + } + + best_unsharp_amounts[index] = find_best_frame_unsharp_amount( + cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3, + 1.5); + } + } + + // Apply best blur amounts + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + const int block_width = AOMMIN(source->y_width - col_offset_y, block_w); + const int block_height = AOMMIN(source->y_height - row_offset_y, block_h); + const int index = col + row * num_cols; + + if (cm->seq_params->use_highbitdepth) { + assert(source->flags & YV12_FLAG_HIGHBITDEPTH); + assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); + uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + + row_offset_y * source->y_stride + col_offset_y; + uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + + row_offset_y * blurred.y_stride + col_offset_y; + highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf, + blurred.y_stride, src_buf, source->y_stride, + block_width, block_height, + best_unsharp_amounts[index], bit_depth); + } else { + uint8_t *src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride, + src_buf, source->y_stride, block_width, block_height, + best_unsharp_amounts[index]); + } + } + } + + aom_free_frame_buffer(&source_block); + 
aom_free_frame_buffer(&blurred_block); + aom_free_frame_buffer(&blurred); + aom_free(best_unsharp_amounts); +} + +void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int y_width = cpi->source->y_width; + const int y_height = cpi->source->y_height; + const int resized_block_size = BLOCK_32X32; + const int resize_factor = 2; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; + + YV12_BUFFER_CONFIG resized_source; + memset(&resized_source, 0, sizeof(resized_source)); + aom_alloc_frame_buffer( + &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x, + ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + if (!av1_resize_and_extend_frame_nonnormative( + cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating buffers during resize"); + } + + const int resized_y_width = resized_source.y_width; + const int resized_y_height = resized_source.y_height; + const int resized_block_w = mi_size_wide[resized_block_size] * 4; + const int resized_block_h = mi_size_high[resized_block_size] * 4; + const int num_cols = + (resized_y_width + resized_block_w - 1) / resized_block_w; + const int num_rows = + (resized_y_height + resized_block_h - 1) / resized_block_h; + + YV12_BUFFER_CONFIG blurred; + memset(&blurred, 0, sizeof(blurred)); + aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x, + ss_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + gaussian_blur(bit_depth, &resized_source, &blurred); + + YV12_BUFFER_CONFIG recon; + memset(&recon, 0, sizeof(recon)); + aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_yv12_copy_frame(&resized_source, &recon, 1); + + VmafContext *vmaf_context; + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_init_vmaf_context(&vmaf_context, cpi->vmaf_info.vmaf_model, cal_vmaf_neg); + unsigned int *sses = aom_calloc(num_rows * num_cols, sizeof(*sses)); + if (!sses) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating vmaf data"); + } + + // Loop through each 'block_size' block. 
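  // Editorial note: for every 32x32 block of the 2x-downscaled source, the
  // loop below records the SSE between the block and its blurred version and
  // temporarily swaps the blurred block into 'recon', so the VMAF context
  // scores a frame in which only that one block is degraded; the block is
  // restored right afterwards. The per-block VMAF drop and SSE are combined
  // into an rdmult weight in the second loop.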
+ for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + const int row_offset_y = row * resized_block_h; + const int col_offset_y = col * resized_block_w; + + uint8_t *const orig_buf = resized_source.y_buffer + + row_offset_y * resized_source.y_stride + + col_offset_y; + uint8_t *const blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + + cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride, + blurred_buf, blurred.y_stride, + &sses[index]); + + uint8_t *const recon_buf = + recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y; + // Set recon buf + if (cpi->common.seq_params->use_highbitdepth) { + highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, + CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, + CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, + resized_block_w, resized_block_h, 0.0, bit_depth); + } else { + unsharp_rect(blurred_buf, blurred.y_stride, blurred_buf, + blurred.y_stride, recon_buf, recon.y_stride, + resized_block_w, resized_block_h, 0.0); + } + + aom_read_vmaf_image(vmaf_context, &resized_source, &recon, bit_depth, + index); + + // Restore recon buf + if (cpi->common.seq_params->use_highbitdepth) { + highbd_unsharp_rect( + CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, + CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, + CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w, + resized_block_h, 0.0, bit_depth); + } else { + unsharp_rect(orig_buf, resized_source.y_stride, orig_buf, + resized_source.y_stride, recon_buf, recon.y_stride, + resized_block_w, resized_block_h, 0.0); + } + } + } + aom_flush_vmaf_context(vmaf_context); + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + const double vmaf = aom_calc_vmaf_at_index( + vmaf_context, cpi->vmaf_info.vmaf_model, index); + const double dvmaf = kBaselineVmaf - vmaf; + + const double mse = + (double)sses[index] / (double)(resized_y_width * resized_y_height); + double weight; + const double eps = 0.01 / (num_rows * num_cols); + if (dvmaf < eps || mse < eps) { + weight = 1.0; + } else { + weight = mse / dvmaf; + } + + // Normalize it with a data fitted model. 
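      // Editorial worked example: the fit below maps mse/dvmaf into
      // [0.8, 6.8): a ratio of 0 gives 0.8, a ratio of 20 gives
      // 6 * (1 - e^-1) + 0.8 ~= 4.59, and large ratios saturate near 6.8.
      // Blocks whose distortion barely affects VMAF thus get a larger rdmult
      // (fewer bits), while VMAF-sensitive blocks get a smaller one.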
+ weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8; + cpi->vmaf_info.rdmult_scaling_factors[index] = weight; + } + } + + aom_free_frame_buffer(&resized_source); + aom_free_frame_buffer(&blurred); + aom_close_vmaf_context(vmaf_context); + aom_free(sses); +} + +void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + + const int bsize_base = BLOCK_64X64; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->vmaf_info.rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + av1_set_error_per_bit(&x->errorperbit, *rdmult); +} + +// TODO(sdeng): replace them with the SIMD versions. +static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h) { + double accum = 0.0; + int i, j; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + double img1px = src[i * src_stride + j]; + double img2px = ref[i * ref_stride + j]; + + accum += fabs(img1px - img2px); + } + } + + return accum / (double)(h * w); +} + +static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, + int h) { + double accum = 0.0; + int i, j; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + double img1px = src[i * src_stride + j]; + double img2px = ref[i * ref_stride + j]; + + accum += fabs(img1px - img2px); + } + } + + return accum / (double)(h * w); +} + +static double calc_vmaf_motion_score(const AV1_COMP *const cpi, + const AV1_COMMON *const cm, + const YV12_BUFFER_CONFIG *const cur, + const YV12_BUFFER_CONFIG *const last, + const YV12_BUFFER_CONFIG *const next) { + const int y_width = cur->y_width; + const int y_height = cur->y_height; + YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cur->subsampling_x; + const int ss_y = cur->subsampling_y; + + memset(&blurred_cur, 0, sizeof(blurred_cur)); + memset(&blurred_last, 0, sizeof(blurred_last)); + memset(&blurred_next, 0, sizeof(blurred_next)); + + aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + + 
gaussian_blur(bit_depth, cur, &blurred_cur); + gaussian_blur(bit_depth, last, &blurred_last); + if (next) gaussian_blur(bit_depth, next, &blurred_next); + + double motion1, motion2 = 65536.0; + if (cm->seq_params->use_highbitdepth) { + assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH); + assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH); + const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); + motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), + blurred_cur.y_stride, + CONVERT_TO_SHORTPTR(blurred_last.y_buffer), + blurred_last.y_stride, y_width, y_height) * + scale_factor; + if (next) { + assert(blurred_next.flags & YV12_FLAG_HIGHBITDEPTH); + motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), + blurred_cur.y_stride, + CONVERT_TO_SHORTPTR(blurred_next.y_buffer), + blurred_next.y_stride, y_width, y_height) * + scale_factor; + } + } else { + motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, + blurred_last.y_buffer, blurred_last.y_stride, y_width, + y_height); + if (next) { + motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, + blurred_next.y_buffer, blurred_next.y_stride, + y_width, y_height); + } + } + + aom_free_frame_buffer(&blurred_cur); + aom_free_frame_buffer(&blurred_last); + aom_free_frame_buffer(&blurred_next); + + return AOMMIN(motion1, motion2); +} + +static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi, + YV12_BUFFER_CONFIG **last, + YV12_BUFFER_CONFIG **next) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + const int src_index = + cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index]; + struct lookahead_entry *last_entry = av1_lookahead_peek( + cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage); + struct lookahead_entry *next_entry = av1_lookahead_peek( + cpi->ppi->lookahead, src_index + 1, cpi->compressor_stage); + *next = &next_entry->img; + *last = cm->show_frame ? cpi->last_source : &last_entry->img; +} + +// Calculates the new qindex from the VMAF motion score. This is based on the +// observation: when the motion score becomes higher, the VMAF score of the +// same source and distorted frames would become higher. 
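// Editorial worked example: a motion score of 16 makes the fitted model below
// predict a VMAF drop of about 26.11 * (1 - e^(-0.06 * 16)) ~= 16.1 points.
// That drop is converted into an equivalent SSE change, and since
// beta = approx_sse / (dsse + approx_sse) (clamped to >= 0.5) is at most 1,
// av1_get_deltaq_offset() should return a non-negative offset: frames with
// more motion are quantized slightly more coarsely, on the assumption that
// VMAF tolerates distortion better when motion is high.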
+int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { + const AV1_COMMON *const cm = &cpi->common; + if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) { + return current_qindex; + } + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); + const double last_frame_ysse = + get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth); + const double last_frame_vmaf = + get_layer_value(cpi->vmaf_info.last_frame_vmaf, layer_depth); + const int bit_depth = cpi->td.mb.e_mbd.bd; + const double approx_sse = last_frame_ysse / (double)((1 << (bit_depth - 8)) * + (1 << (bit_depth - 8))); + const double approx_dvmaf = kBaselineVmaf - last_frame_vmaf; + const double sse_threshold = + 0.01 * cpi->source->y_width * cpi->source->y_height; + const double vmaf_threshold = 0.01; + if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) { + return current_qindex; + } + YV12_BUFFER_CONFIG *cur_buf = cpi->source; + if (cm->show_frame == 0) { + const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; + struct lookahead_entry *cur_entry = av1_lookahead_peek( + cpi->ppi->lookahead, src_index, cpi->compressor_stage); + cur_buf = &cur_entry->img; + } + assert(cur_buf); + + YV12_BUFFER_CONFIG *next_buf, *last_buf; + get_neighbor_frames(cpi, &last_buf, &next_buf); + assert(last_buf); + + const double motion = + calc_vmaf_motion_score(cpi, cm, cur_buf, last_buf, next_buf); + + // Get dVMAF through a data fitted model. + const double dvmaf = 26.11 * (1.0 - exp(-0.06 * motion)); + const double dsse = dvmaf * approx_sse / approx_dvmaf; + + // Clamping beta to address VQ issue (aomedia:3170). + const double beta = AOMMAX(approx_sse / (dsse + approx_sse), 0.5); + const int offset = + av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta); + int qindex = current_qindex + offset; + + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + return qindex; +} + +static AOM_INLINE double cal_approx_score( + AV1_COMP *const cpi, double src_variance, double new_variance, + double src_score, YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const recon_sharpened) { + double score; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, src, recon_sharpened, bit_depth, + cal_vmaf_neg, &score); + return src_variance / new_variance * (score - src_score); +} + +static double find_best_frame_unsharp_amount_loop_neg( + AV1_COMP *const cpi, double src_variance, double base_score, + YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon, + YV12_BUFFER_CONFIG *const ref, YV12_BUFFER_CONFIG *const src_blurred, + YV12_BUFFER_CONFIG *const recon_blurred, + YV12_BUFFER_CONFIG *const src_sharpened, + YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs, + double best_score, const double unsharp_amount_start, + const double step_size, const int max_loop_count, const double max_amount) { + const double min_amount = 0.0; + int loop_count = 0; + double approx_score = best_score; + double unsharp_amount = unsharp_amount_start; + + do { + best_score = approx_score; + unsharp_amount += step_size; + if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; + unsharp(cpi, recon, recon_blurred, recon_sharpened, unsharp_amount); + unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount); + const double new_variance = 
+ residual_frame_average_variance(cpi, src_sharpened, ref, mvs); + approx_score = cal_approx_score(cpi, src_variance, new_variance, base_score, + src, recon_sharpened); + + loop_count++; + } while (approx_score > best_score && loop_count < max_loop_count); + unsharp_amount = + approx_score > best_score ? unsharp_amount : unsharp_amount - step_size; + + return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); +} + +static double find_best_frame_unsharp_amount_neg( + AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref, + double base_score, const double unsharp_amount_start, + const double step_size, const int max_loop_count, + const double max_filter_amount) { + FULLPEL_MV *mvs = NULL; + const double src_variance = + residual_frame_average_variance(cpi, src, ref, mvs); + + const AV1_COMMON *const cm = &cpi->common; + const int width = recon->y_width; + const int height = recon->y_height; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = recon->subsampling_x; + const int ss_y = recon->subsampling_y; + + YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened; + memset(&recon_sharpened, 0, sizeof(recon_sharpened)); + memset(&src_sharpened, 0, sizeof(src_sharpened)); + memset(&recon_blurred, 0, sizeof(recon_blurred)); + memset(&src_blurred, 0, sizeof(src_blurred)); + aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, 0, 0); + aom_alloc_frame_buffer( + &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + + gaussian_blur(bit_depth, recon, &recon_blurred); + gaussian_blur(bit_depth, src, &src_blurred); + + unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_start); + unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start); + const double variance_start = + residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); + const double score_start = cal_approx_score( + cpi, src_variance, variance_start, base_score, src, &recon_sharpened); + + const double unsharp_amount_next = unsharp_amount_start + step_size; + unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next); + unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next); + const double variance_next = + residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); + const double score_next = cal_approx_score(cpi, src_variance, variance_next, + base_score, src, &recon_sharpened); + + double unsharp_amount; + if (score_next > score_start) { + unsharp_amount = find_best_frame_unsharp_amount_loop_neg( + cpi, src_variance, base_score, src, recon, ref, &src_blurred, + &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_next, + unsharp_amount_next, step_size, max_loop_count, max_filter_amount); + } else { + unsharp_amount = find_best_frame_unsharp_amount_loop_neg( + cpi, src_variance, base_score, src, recon, ref, &src_blurred, + &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_start, + unsharp_amount_start, -step_size, 
max_loop_count, max_filter_amount); + } + + aom_free_frame_buffer(&recon_sharpened); + aom_free_frame_buffer(&src_sharpened); + aom_free_frame_buffer(&recon_blurred); + aom_free_frame_buffer(&src_blurred); + aom_free(mvs); + return unsharp_amount; +} + +void av1_update_vmaf_curve(AV1_COMP *cpi) { + YV12_BUFFER_CONFIG *source = cpi->source; + YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); + double base_score; + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth, + cal_vmaf_neg, &base_score); + cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score; + if (cpi->common.seq_params->use_highbitdepth) { + assert(source->flags & YV12_FLAG_HIGHBITDEPTH); + assert(recon->flags & YV12_FLAG_HIGHBITDEPTH); + cpi->vmaf_info.last_frame_ysse[layer_depth] = + (double)aom_highbd_get_y_sse(source, recon); + } else { + cpi->vmaf_info.last_frame_ysse[layer_depth] = + (double)aom_get_y_sse(source, recon); + } + + if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { + YV12_BUFFER_CONFIG *last, *next; + get_neighbor_frames(cpi, &last, &next); + double best_unsharp_amount_start = + get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); + const int max_loop_count = 5; + cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = + find_best_frame_unsharp_amount_neg(cpi, source, recon, last, base_score, + best_unsharp_amount_start, 0.025, + max_loop_count, 1.01); + } +} diff --git a/third_party/aom/av1/encoder/tune_vmaf.h b/third_party/aom/av1/encoder/tune_vmaf.h new file mode 100644 index 0000000000..a04a29e6fe --- /dev/null +++ b/third_party/aom/av1/encoder/tune_vmaf.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_ +#define AOM_AV1_ENCODER_TUNE_VMAF_H_ + +#include "aom_dsp/vmaf.h" +#include "aom_scale/yv12config.h" +#include "av1/common/enums.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/block.h" + +typedef struct { + // Stores the scaling factors for rdmult when tuning for VMAF. + // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for + // 64x64 block at (row, col). + double *rdmult_scaling_factors; + + // Stores the luma sse of the last frame. + double last_frame_ysse[MAX_ARF_LAYERS]; + + // Stores the VMAF of the last frame. + double last_frame_vmaf[MAX_ARF_LAYERS]; + + // Stores the filter strength of the last frame. + double last_frame_unsharp_amount[MAX_ARF_LAYERS]; + + // Stores the origial qindex before scaling. + int original_qindex; + + // VMAF model used in VMAF caculations. 
+ VmafModel *vmaf_model; +} TuneVMAFInfo; + +struct AV1_COMP; + +void av1_vmaf_blk_preprocessing(struct AV1_COMP *cpi, + YV12_BUFFER_CONFIG *source); + +void av1_vmaf_frame_preprocessing(struct AV1_COMP *cpi, + YV12_BUFFER_CONFIG *source); + +void av1_vmaf_neg_preprocessing(struct AV1_COMP *cpi, + YV12_BUFFER_CONFIG *source); + +void av1_set_mb_vmaf_rdmult_scaling(struct AV1_COMP *cpi); + +void av1_set_vmaf_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, int *rdmult); + +int av1_get_vmaf_base_qindex(const struct AV1_COMP *cpi, int current_qindex); + +void av1_update_vmaf_curve(struct AV1_COMP *cpi); + +#endif // AOM_AV1_ENCODER_TUNE_VMAF_H_ diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h new file mode 100644 index 0000000000..aab5e1398d --- /dev/null +++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h @@ -0,0 +1,3422 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*! \file + * Contains the details of the ML models used for pruning transform size. This + * file is only included by av1/encoder/tx_search.c. + */ +#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +/***************************CONFIG_NN_V2 (New)********************************/ +#if CONFIG_NN_V2 +// Tx type model for 4x4 block. 
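// Editorial sketch, not part of upstream AOM: each NN_CONFIG_V2 below wires a
// tiny fully connected network, e.g. 4 inputs -> 8 ReLU hidden units -> 4
// linear logits for the horizontal 4x4 model, with the logits feeding a
// softmax cross-entropy head. The helper here only illustrates that data
// flow; it assumes a per-output-node, row-major weight layout, and the real
// evaluator lives in av1/encoder/ml.c.
static void illustrative_fc_forward(const float *input, int num_inputs,
                                    int num_outputs, const float *weights,
                                    const float *bias, int use_relu,
                                    float *output) {
  for (int node = 0; node < num_outputs; ++node) {
    float val = bias[node];  // start from this node's bias
    for (int i = 0; i < num_inputs; ++i)
      val += weights[node * num_inputs + i] * input[i];  // assumed layout
    output[node] = (use_relu && val < 0.0f) ? 0.0f : val;  // optional ReLU
  }
}
// Illustrative use for the 4x4 horizontal model:
//   illustrative_fc_forward(features, 4, 8, layer0_weights, layer0_bias, 1, h);
//   illustrative_fc_forward(h, 8, 4, layer1_weights, layer1_bias, 0, logits);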
+static float av1_tx_type_nn_4x4_hor_layer0_weights[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, +}; + +static float av1_tx_type_nn_4x4_hor_layer0_bias[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, +}; + +static float av1_tx_type_nn_4x4_hor_layer1_weights[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, +}; + +static float av1_tx_type_nn_4x4_hor_layer1_bias[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, +}; + +static float av1_tx_type_nn_4x4_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x4_hor_layer0_weights, // weights + av1_tx_type_nn_4x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x4_hor_layer1_weights, + av1_tx_type_nn_4x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_weights[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_bias[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static float av1_tx_type_nn_4x4_ver_layer1_weights[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static float av1_tx_type_nn_4x4_ver_layer1_bias[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x4_ver_layer0_weights, // weights + av1_tx_type_nn_4x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 
4, + av1_tx_type_nn_4x4_ver_layer1_weights, + av1_tx_type_nn_4x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 4x8 block. +static float av1_tx_type_nn_4x8_hor_layer0_weights[32] = { + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, +}; + +static float av1_tx_type_nn_4x8_hor_layer0_bias[8] = { + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, +}; + +static float av1_tx_type_nn_4x8_hor_layer1_weights[32] = { + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, +}; + +static float av1_tx_type_nn_4x8_hor_layer1_bias[4] = { + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, +}; + +static float av1_tx_type_nn_4x8_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x8_hor_layer0_weights, // weights + av1_tx_type_nn_4x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x8_hor_layer1_weights, + av1_tx_type_nn_4x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_weights[128] = { + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 
1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_bias[16] = { + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, +}; + +static float av1_tx_type_nn_4x8_ver_layer1_weights[64] = { + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, +}; + +static float av1_tx_type_nn_4x8_ver_layer1_bias[4] = { + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_4x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_4x8_ver_layer0_weights, // weights + av1_tx_type_nn_4x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x8_ver_layer1_weights, + av1_tx_type_nn_4x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +/******************************************************************************/ + +// Tx type model for 8x4 block. 
+static float av1_tx_type_nn_8x4_hor_layer0_weights[128] = { + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, +}; + +static float av1_tx_type_nn_8x4_hor_layer0_bias[16] = { + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, +}; + +static float av1_tx_type_nn_8x4_hor_layer1_weights[64] = { + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, +}; + +static float av1_tx_type_nn_8x4_hor_layer1_bias[4] = { + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, +}; + +static float av1_tx_type_nn_8x4_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x4_hor_layer0_weights, // weights + av1_tx_type_nn_8x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x4_hor_layer1_weights, + av1_tx_type_nn_8x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_weights[32] = { + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, + -1.94208f, 
-2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_bias[8] = { + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, +}; + +static float av1_tx_type_nn_8x4_ver_layer1_weights[32] = { + 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, +}; + +static float av1_tx_type_nn_8x4_ver_layer1_bias[4] = { + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_8x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_8x4_ver_layer0_weights, // weights + av1_tx_type_nn_8x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x4_ver_layer1_weights, + av1_tx_type_nn_8x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. 
+static float av1_tx_type_nn_8x8_hor_layer0_weights[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, + 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static float av1_tx_type_nn_8x8_hor_layer0_bias[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static float av1_tx_type_nn_8x8_hor_layer1_weights[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static float av1_tx_type_nn_8x8_hor_layer1_bias[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static float av1_tx_type_nn_8x8_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x8_hor_layer0_weights, // weights + av1_tx_type_nn_8x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x8_hor_layer1_weights, + av1_tx_type_nn_8x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_weights[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, 
-0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_bias[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static float av1_tx_type_nn_8x8_ver_layer1_weights[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static float av1_tx_type_nn_8x8_ver_layer1_bias[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x8_ver_layer0_weights, // weights + av1_tx_type_nn_8x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x8_ver_layer1_weights, + av1_tx_type_nn_8x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. 
+static float av1_tx_type_nn_8x16_hor_layer0_weights[128] = { + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, +}; + +static float av1_tx_type_nn_8x16_hor_layer0_bias[16] = { + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, -0.81690f, 0.08302f, +}; + +static float av1_tx_type_nn_8x16_hor_layer1_weights[64] = { + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, +}; + +static float av1_tx_type_nn_8x16_hor_layer1_bias[4] = { + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, +}; + +static float av1_tx_type_nn_8x16_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x16_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x16_hor_layer0_weights, // weights + av1_tx_type_nn_8x16_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x16_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x16_hor_layer1_weights, + av1_tx_type_nn_8x16_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x16_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x16_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_weights[128] = { + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 
0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_bias[16] = { + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, +}; + +static float av1_tx_type_nn_8x16_ver_layer1_weights[64] = { + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, +}; + +static float av1_tx_type_nn_8x16_ver_layer1_bias[4] = { + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x16_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x16_ver_layer0_weights, // weights + av1_tx_type_nn_8x16_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x16_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x16_ver_layer1_weights, + av1_tx_type_nn_8x16_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x16_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x16_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. 
+static float av1_tx_type_nn_16x8_hor_layer0_weights[128] = { + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, +}; + +static float av1_tx_type_nn_16x8_hor_layer0_bias[16] = { + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, +}; + +static float av1_tx_type_nn_16x8_hor_layer1_weights[64] = { + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, +}; + +static float av1_tx_type_nn_16x8_hor_layer1_bias[4] = { + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, +}; + +static float av1_tx_type_nn_16x8_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x8_hor_layer0_weights, // weights + av1_tx_type_nn_16x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x8_hor_layer1_weights, + av1_tx_type_nn_16x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_16x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_weights[128] = { + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 
0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_bias[16] = { + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, +}; + +static float av1_tx_type_nn_16x8_ver_layer1_weights[64] = { + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, +}; + +static float av1_tx_type_nn_16x8_ver_layer1_bias[4] = { + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x8_ver_layer0_weights, // weights + av1_tx_type_nn_16x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x8_ver_layer1_weights, + av1_tx_type_nn_16x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_16x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. 
+static float av1_tx_type_nn_16x16_layer0_weights[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static float av1_tx_type_nn_16x16_layer0_bias[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static float av1_tx_type_nn_16x16_layer1_weights[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static float av1_tx_type_nn_16x16_layer1_bias[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static float av1_tx_type_nn_16x16_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x16_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x16 = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x16_layer0_weights, // weights + av1_tx_type_nn_16x16_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x16_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x16_layer1_weights, + av1_tx_type_nn_16x16_layer1_bias, + NONE, + av1_tx_type_nn_16x16_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x16_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 4x16 block. 
+static float av1_tx_type_nn_4x16_hor_layer0_weights[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static float av1_tx_type_nn_4x16_hor_layer0_bias[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static float av1_tx_type_nn_4x16_hor_layer1_weights[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static float av1_tx_type_nn_4x16_hor_layer1_bias[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static float av1_tx_type_nn_4x16_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x16_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x16_hor_layer0_weights, // weights + av1_tx_type_nn_4x16_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x16_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x16_hor_layer1_weights, + av1_tx_type_nn_4x16_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x16_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x16_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_weights[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_bias[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, 
-1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static float av1_tx_type_nn_4x16_ver_layer1_weights[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static float av1_tx_type_nn_4x16_ver_layer1_bias[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_4x16_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_4x16_ver_layer0_weights, // weights + av1_tx_type_nn_4x16_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x16_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x16_ver_layer1_weights, + av1_tx_type_nn_4x16_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x16_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x16_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x4 block. 
+static float av1_tx_type_nn_16x4_hor_layer0_weights[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static float av1_tx_type_nn_16x4_hor_layer0_bias[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static float av1_tx_type_nn_16x4_hor_layer1_weights[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static float av1_tx_type_nn_16x4_hor_layer1_bias[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static float av1_tx_type_nn_16x4_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x4_hor_layer0_weights, // weights + av1_tx_type_nn_16x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x4_hor_layer1_weights, + av1_tx_type_nn_16x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_16x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_weights[32] = { + 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 
1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_bias[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static float av1_tx_type_nn_16x4_ver_layer1_weights[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static float av1_tx_type_nn_16x4_ver_layer1_bias[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_16x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_16x4_ver_layer0_weights, // weights + av1_tx_type_nn_16x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x4_ver_layer1_weights, + av1_tx_type_nn_16x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_16x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. 
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; +#else +/******************************CONFIG_NN***************************************/ +// Tx type model for 4x4 block. +static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, +}; + +static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, +}; + +static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, +}; + +static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_hor_layer0, + av1_tx_type_nn_weights_4x4_hor_layer1 }, + { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + 
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_ver_layer0, + av1_tx_type_nn_weights_4x4_ver_layer1 }, + { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 4x8 block. +static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = { + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = { + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, +}; + +static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = { + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = { + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x8_hor_layer0, + av1_tx_type_nn_weights_4x8_hor_layer1 }, + { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = { + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 
0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = { + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = { + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = { + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x8_ver_layer0, + av1_tx_type_nn_weights_4x8_ver_layer1 }, + { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x4 block. 
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = { + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = { + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, +}; + +static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = { + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = { + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x4_hor_layer0, + av1_tx_type_nn_weights_8x4_hor_layer1 }, + { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = { + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, + -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = { + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = { + 
1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = { + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x4_ver_layer0, + av1_tx_type_nn_weights_8x4_ver_layer1 }, + { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. +static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, + 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = { + 8, // 
num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_hor_layer0, + av1_tx_type_nn_weights_8x8_hor_layer1 }, + { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_ver_layer0, + av1_tx_type_nn_weights_8x8_ver_layer1 }, + { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. 
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = { + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = { + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, -0.81690f, 0.08302f, +}; + +static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = { + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = { + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x16_hor_layer0, + av1_tx_type_nn_weights_8x16_hor_layer1 }, + { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = { + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 
0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = { + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = { + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = { + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x16_ver_layer0, + av1_tx_type_nn_weights_8x16_ver_layer1 }, + { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. 
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = { + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = { + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, +}; + +static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = { + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = { + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x8_hor_layer0, + av1_tx_type_nn_weights_16x8_hor_layer1 }, + { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = { + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 
0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = { + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = { + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = { + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x8_ver_layer0, + av1_tx_type_nn_weights_16x8_ver_layer1 }, + { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. 
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer0[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static const float av1_tx_type_nn_weights_16x16_layer1[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer1[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x16 = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x16_layer0, + av1_tx_type_nn_weights_16x16_layer1, + }, + { + av1_tx_type_nn_bias_16x16_layer0, + av1_tx_type_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 4x16 block. 
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_hor_layer0, + av1_tx_type_nn_weights_4x16_hor_layer1 }, + { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, 
-0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_ver_layer0, + av1_tx_type_nn_weights_4x16_ver_layer1 }, + { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x4 block. +static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = 
{ + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x4_hor_layer0, + av1_tx_type_nn_weights_16x4_hor_layer1 }, + { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = { + 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x4_ver_layer0, + av1_tx_type_nn_weights_16x4_ver_layer1 }, + { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 } +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. +static const NN_CONFIG *const av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +static const NN_CONFIG *const av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; +#endif // CONFIG_NN_V2 + +// Tx split model for 4x8 block. 
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = { + 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f, + -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f, + 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f, + -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f, + -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f, + 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f, + 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f, + 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f, + 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f, + 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f, + -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f, + 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f, + -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f, + -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f, + 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f, + -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f, + -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f, + 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f, + -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f, + -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f, + 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f, + -0.792429f, -0.385862f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer0[16] = { + 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f, + -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f, + 0.262171f, -1.598153f, -1.427340f, -1.602306f, +}; + +static const float av1_tx_split_nn_weights_4x8_layer1[16] = { + -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f, + -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f, + 0.085082f, 0.614986f, 0.847904f, 0.637578f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer1[1] = { + 0.20586078f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x8 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x8_layer0, + av1_tx_split_nn_weights_4x8_layer1, + }, + { + av1_tx_split_nn_bias_4x8_layer0, + av1_tx_split_nn_bias_4x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x8 block. 
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = { + 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f, + -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f, + -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f, + -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f, + 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f, + 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f, + 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f, + -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f, + 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f, + 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f, + 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f, + 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f, + -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f, + -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f, + 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f, + -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f, + 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f, + 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f, + -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f, + -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f, + 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f, + -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f, + -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f, + 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer0[12] = { + 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f, + 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f, +}; + +static const float av1_tx_split_nn_weights_8x8_layer1[12] = { + 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f, + -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer1[1] = { + -0.156294f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x8 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 12, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x8_layer0, + av1_tx_split_nn_weights_8x8_layer1, + }, + { + av1_tx_split_nn_bias_8x8_layer0, + av1_tx_split_nn_bias_8x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x16 block. 
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = { + 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f, + 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f, + -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f, + -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f, + -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f, + -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f, + 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f, + 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f, + -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f, + -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f, + -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f, + -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f, + 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f, + 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f, + -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f, + 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f, + 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f, + 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f, + 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f, + -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f, + 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f, + 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f, + 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f, + -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f, + -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f, + 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f, + -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f, + 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f, + 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f, + 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f, + 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f, + 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f, + -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f, + -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f, + 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f, + 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f, + -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f, + -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f, + 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f, + 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f, + 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f, + 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f, + -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f, + -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f, + 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f, + -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f, + 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f, + -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f, + -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, 
-0.028900f, + 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f, + 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f, + -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f, + 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f, + -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f, + -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f, + 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f, + 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f, + 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f, + -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f, + -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f, + -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f, + 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f, + -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f, + 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f, + -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f, + -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f, + -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f, + -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f, + 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f, + 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f, + 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f, + -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f, + -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f, + -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f, + 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f, + 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f, + 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f, + -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f, + 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f, + -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f, + 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f, + 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f, + 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f, + -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f, + -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f, + -0.408768f, 0.184693f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer0[64] = { + -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f, + -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f, + 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f, + 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f, + 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f, + 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f, + -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f, + -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f, + 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f, + -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f, + -0.255844f, -0.078400f, 0.476752f, 0.643001f, +}; + +static const float av1_tx_split_nn_weights_8x16_layer1[64] = { + -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, 
-0.087735f, + 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f, + 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f, + -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f, + 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f, + 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f, + 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f, + -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f, + 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f, + -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f, + -0.256734f, 0.177370f, 0.213522f, -0.530158f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer1[1] = { + 0.14910713f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 64, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x16_layer0, + av1_tx_split_nn_weights_8x16_layer1, + }, + { + av1_tx_split_nn_bias_8x16_layer0, + av1_tx_split_nn_bias_8x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x16 block. +static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = { + -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f, + 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f, + 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f, + -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f, + 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f, + -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f, + -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f, + -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f, + 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f, + -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f, + 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f, + -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f, + -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f, + 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f, + 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f, + -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f, + -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f, + -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f, + -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f, + -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f, + -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f, + -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f, + 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f, + -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f, + -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f, + -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f, + 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f, + -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f, + 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f, + -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f, + -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f, + 0.087491f, -0.186415f, 0.177785f, 
-0.195121f, -1.191148f, -0.477102f, + 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f, + 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f, + -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f, + -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f, + 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f, + 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f, + -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f, + -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f, + 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f, + 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f, + 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f, + -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f, + -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f, + -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f, + 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f, + 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer0[24] = { + -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f, + 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f, + -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f, + -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f, +}; + +static const float av1_tx_split_nn_weights_16x16_layer1[24] = { + -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f, + -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f, + -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f, + -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer1[1] = { + 0.184803f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x16 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x16_layer0, + av1_tx_split_nn_weights_16x16_layer1, + }, + { + av1_tx_split_nn_bias_16x16_layer0, + av1_tx_split_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x32 block. 
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = { + -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f, + -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f, + 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f, + -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f, + -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f, + 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f, + -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f, + -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f, + -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f, + 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f, + -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f, + 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f, + -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f, + -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f, + -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f, + 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f, + -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f, + 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f, + -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f, + -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f, + 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f, + -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f, + 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f, + -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f, + 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f, + -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f, + -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f, + -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f, + 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f, + -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f, + 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f, + 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f, + 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f, + 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f, + -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f, + 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f, + 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f, + 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f, + 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f, + 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f, + -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f, + 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f, + 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f, + -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f, + -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f, + -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f, + -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f, + -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f, + -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 
0.009070f, + 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f, + -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f, + -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f, + 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f, + 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f, + -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f, + -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f, + -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f, + 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f, + -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f, + -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f, + 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f, + 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f, + 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f, + 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer0[32] = { + 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f, + -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f, + -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f, + -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f, + 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f, + 0.254942f, -0.017796f, +}; + +static const float av1_tx_split_nn_weights_32x32_layer1[32] = { + -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f, + -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f, + 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f, + -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f, + 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f, + -0.068547f, -0.154148f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer1[1] = { + 0.316622f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x32 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x32_layer0, + av1_tx_split_nn_weights_32x32_layer1, + }, + { + av1_tx_split_nn_bias_32x32_layer0, + av1_tx_split_nn_bias_32x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 64x64 block. 
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = { + -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f, + 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f, + 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f, + 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f, + -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f, + -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f, + 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f, + -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f, + -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f, + 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f, + -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f, + 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f, + 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f, + -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f, + -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f, + 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f, + 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f, + 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f, + -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f, + -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f, + 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f, + -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f, + 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f, + 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f, + 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f, + 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f, + 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f, + -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f, + 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f, + 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f, + 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f, + 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f, + 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f, + 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f, + -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f, + -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f, + 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f, + -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f, + -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f, + -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f, + -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f, + -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f, + -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f, + -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f, + 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f, + 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f, + 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f, + -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f, + -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f, + 
0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f, + 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f, + 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f, + -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f, + -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f, + -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f, + 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f, + -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f, + -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f, + -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f, + -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f, + 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f, + -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f, + 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f, + 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer0[32] = { + 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f, + -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f, + 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f, + 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f, + -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f, + -0.365437f, 0.229255f, +}; + +static const float av1_tx_split_nn_weights_64x64_layer1[32] = { + 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f, + -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f, + -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f, + -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f, + 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f, + 0.207812f, 0.513560f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f }; + +static const NN_CONFIG av1_tx_split_nnconfig_64x64 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_64x64_layer0, + av1_tx_split_nn_weights_64x64_layer1, + }, + { + av1_tx_split_nn_bias_64x64_layer0, + av1_tx_split_nn_bias_64x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 4x16 block. 
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = { + -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f, + -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f, + -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f, + -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f, + -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f, + -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f, + -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f, + 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f, + 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f, + 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f, + 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f, + -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f, + -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f, + -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f, + -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f, + -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f, + -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f, + -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f, + 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f, + 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f, + 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f, + -1.416451f, -0.166467f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer0[16] = { + 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f, + -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f, + -1.191704f, -3.800073f, 4.121552f, -1.399397f, +}; + +static const float av1_tx_split_nn_weights_4x16_layer1[16] = { + -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f, + -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f, + 0.462109f, 0.343315f, 1.092593f, 0.483152f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer1[1] = { + 0.8205083f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x16_layer0, + av1_tx_split_nn_weights_4x16_layer1, + }, + { + av1_tx_split_nn_bias_4x16_layer0, + av1_tx_split_nn_bias_4x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x32 block. 
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = { + 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f, + 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f, + 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f, + 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f, + -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f, + 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f, + -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f, + -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f, + 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f, + -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f, + 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f, + -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f, + -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f, + 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f, + 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f, + 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f, + -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f, + 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f, + 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f, + 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f, + -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f, + -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f, + -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f, + -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f, + -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f, + -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f, + -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f, + 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f, + 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f, + -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f, + 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f, + -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f, + -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f, + 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f, + -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f, + 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f, + 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f, + -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f, + -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f, + 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f, + 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f, + 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f, + -0.129147f, 0.045916f, -0.606865f, -0.101378f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer0[32] = { + 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f, + 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f, + 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f, + 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f, + 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f, + -0.294389f, 1.456413f, +}; + +static const 
float av1_tx_split_nn_weights_16x32_layer1[32] = { + 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f, + -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f, + -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f, + -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f, + -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f, + 0.418904f, 1.792187f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer1[1] = { + -0.29233751f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x32_layer0, + av1_tx_split_nn_weights_16x32_layer1, + }, + { + av1_tx_split_nn_bias_16x32_layer0, + av1_tx_split_nn_bias_16x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x64 block. +static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = { + 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f, + -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f, + 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f, + 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f, + 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f, + 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f, + -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f, + 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f, + 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f, + -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f, + -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f, + 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f, + -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f, + -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f, + 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f, + -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f, + -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f, + -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f, + 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f, + 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f, + 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f, + 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f, + -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f, + 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f, + 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f, + -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f, + -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f, + 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f, + -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f, + -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f, + -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f, + -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f, + -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f, + -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f, + 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f, + 0.270948f, -0.187966f, 
0.342503f, 0.174420f, -0.352105f, 0.088080f, + 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f, + -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f, + 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f, + -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f, + -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f, + 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f, + 0.440626f, -0.158048f, -0.461031f, -0.146280f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer0[32] = { + 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f, + -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f, + -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f, + 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f, + 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f, + 0.552712f, 0.299410f, +}; + +static const float av1_tx_split_nn_weights_32x64_layer1[32] = { + 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f, + 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f, + -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f, + -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f, + 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f, + 0.352981f, 0.111265f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer1[1] = { + -0.18160765f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x64 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x64_layer0, + av1_tx_split_nn_weights_32x64_layer1, + }, + { + av1_tx_split_nn_bias_32x64_layer0, + av1_tx_split_nn_bias_32x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x32 block. 
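+// (Same layout as the other rectangular split models: 8 input features, one
+// fully connected hidden layer (24 nodes for this size) and a single output;
+// the sizes are recorded in the NN_CONFIG struct at the end of this section.)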
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = { + -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f, + -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f, + 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f, + 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f, + -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f, + 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f, + 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f, + 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f, + 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f, + 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f, + 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f, + 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f, + 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f, + 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f, + 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f, + 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f, + 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f, + 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f, + -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f, + 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f, + 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f, + -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f, + 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f, + -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f, + 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f, + 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f, + 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f, + 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f, + 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f, + 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f, + -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f, + 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer0[24] = { + -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f, + -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f, + -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f, + 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f, +}; + +static const float av1_tx_split_nn_weights_8x32_layer1[24] = { + 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f, + -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f, + -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f, + 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer1[1] = { + 0.13435879f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x32_layer0, + av1_tx_split_nn_weights_8x32_layer1, + }, + { + av1_tx_split_nn_bias_8x32_layer0, + av1_tx_split_nn_bias_8x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 
16x32 block. +static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = { + -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f, + -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f, + -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f, + 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f, + -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f, + -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f, + -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f, + 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f, + -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f, + -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f, + -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f, + 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f, + -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f, + -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f, + 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f, + 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f, + -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f, + -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f, + -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f, + -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f, + 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f, + 0.101996f, 0.120878f, +}; + +static const float av1_tx_split_nn_bias_16x64_layer0[16] = { + 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f, + -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f, + -0.299866f, -0.103079f, -0.190352f, -0.048121f, +}; + +static const float av1_tx_split_nn_weights_16x64_layer1[16] = { + -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f, + 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f, + 0.348337f, -0.205082f, 0.347129f, -0.322277f, +}; + +static const float av1_tx_split_nn_bias_16x64_layer1[1] = { + 0.04230947f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x64 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x64_layer0, + av1_tx_split_nn_weights_16x64_layer1, + }, + { + av1_tx_split_nn_bias_16x64_layer0, + av1_tx_split_nn_bias_16x64_layer1, + }, +}; +/******************************************************************************/ + +// Map block size to its corresponding neural net model for tx split prediction. 
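+// Transposed block sizes share a model: each rectangular entry below is also
+// used for its transpose (e.g. TX_32X16 maps to the 16x32 model and TX_64X16
+// to the 16x64 model). TX_4X4 has no entry since it cannot be split further.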
+static const NN_CONFIG *const av1_tx_split_nnconfig_map[TX_SIZES_ALL] = { + NULL, // TX_4X4, + &av1_tx_split_nnconfig_8x8, // TX_8X8, + &av1_tx_split_nnconfig_16x16, // TX_16X16, + &av1_tx_split_nnconfig_32x32, // TX_32X32, + &av1_tx_split_nnconfig_64x64, // TX_64X64, + &av1_tx_split_nnconfig_4x8, // TX_4X8, + &av1_tx_split_nnconfig_4x8, // TX_8X4, + &av1_tx_split_nnconfig_8x16, // TX_8X16, + &av1_tx_split_nnconfig_8x16, // TX_16X8, + &av1_tx_split_nnconfig_16x32, // TX_16X32, + &av1_tx_split_nnconfig_16x32, // TX_32X16, + &av1_tx_split_nnconfig_32x64, // TX_32X64, + &av1_tx_split_nnconfig_32x64, // TX_64X32, + &av1_tx_split_nnconfig_4x16, // TX_4X16, + &av1_tx_split_nnconfig_4x16, // TX_16X4, + &av1_tx_split_nnconfig_8x32, // TX_8X32, + &av1_tx_split_nnconfig_8x32, // TX_32X8, + &av1_tx_split_nnconfig_16x64, // TX_16X64, + &av1_tx_split_nnconfig_16x64, // TX_64X16, +}; + +#if !CONFIG_REALTIME_ONLY +#define NUM_INTRA_TX_SPLIT_FEATURES 14 +#define NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS 1 +#define NUM_INTRA_TX_SPLIT_HIDDEN_NODES 16 +// Model to prune intra transform depth for intra 8x8 block. +static const float av1_intra_tx_split_8x8_mean[NUM_INTRA_TX_SPLIT_FEATURES] = { + 0.110706f, 18.901518f, 0.250436f, 13.483487f, 0.118141f, + 14.318728f, 0.028409f, 14.257664f, 0.045839f, 15.143358f, + 9.702971f, 14.300809f, 6.018646f, 3.682534f, +}; + +static const float av1_intra_tx_split_8x8_std[NUM_INTRA_TX_SPLIT_FEATURES] = { + 13.750575f, 13.440116f, 14.334330f, 12.236641f, 18.415247f, + 12.733355f, 18.309339f, 12.858130f, 23.465142f, 13.447014f, + 8.625048f, 10.456774f, 1.185447f, 1.810423f, +}; + +static const float av1_intra_tx_split_nn_weights_8x8_layer0 + [NUM_INTRA_TX_SPLIT_FEATURES * NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = { + -0.156142f, -0.753623f, 0.026883f, 0.039188f, -0.035310f, 0.106140f, + 0.051622f, 0.077838f, 0.101632f, 0.107278f, 0.232200f, 0.269083f, + 0.048966f, -1.553293f, -0.113983f, -0.151248f, -0.067369f, 0.787292f, + 0.076651f, -0.802634f, 0.266414f, 1.107563f, -0.068848f, -0.956468f, + -0.074920f, -0.192258f, 0.006207f, 0.176196f, -0.493442f, 0.152290f, + -0.208874f, -0.014658f, 0.297385f, -0.351695f, 0.246295f, -0.178519f, + -0.204191f, 0.049663f, -0.330343f, -0.299754f, 0.246215f, -0.014558f, + -0.117611f, 0.206445f, 0.045840f, -0.047563f, -0.049679f, 0.406892f, + -0.052307f, -1.513404f, 0.166166f, 0.520760f, -0.143320f, -0.593928f, + -0.010533f, 0.250752f, 0.076738f, 0.537512f, -0.082619f, -1.534031f, + 0.047109f, 0.634247f, -0.089730f, 0.545534f, -0.022742f, -0.779047f, + -0.606358f, -0.199145f, -0.051269f, 0.248784f, 0.327545f, -0.851751f, + 0.071739f, 0.035975f, 0.387781f, -0.136427f, -0.284436f, 0.578449f, + -0.198276f, 0.579950f, 0.600111f, -0.370164f, -0.215297f, 0.517342f, + 0.200061f, -2.507660f, -0.030851f, 0.227315f, -0.078289f, 0.276052f, + -0.050281f, 0.251481f, -0.139318f, 0.281175f, 0.226524f, 0.058968f, + 0.197436f, 0.517294f, -0.105914f, -1.599567f, 0.064985f, 0.043209f, + -0.280038f, 0.126874f, 0.330387f, -0.014407f, 0.031241f, 0.237801f, + 0.948959f, -0.253791f, -0.022622f, -0.061430f, 0.265852f, 0.750823f, + 0.086606f, 0.853527f, -0.180971f, -1.255744f, -0.152979f, -1.022198f, + -0.044708f, 0.506424f, -0.501968f, -0.416863f, -0.012688f, 0.193523f, + -0.093698f, 0.430875f, 0.007379f, 0.019278f, 0.080890f, 0.462755f, + -0.054326f, -0.157611f, -0.004851f, -1.275676f, -0.060528f, -0.508170f, + 0.195429f, -0.023534f, 0.355211f, 0.983561f, -0.122036f, -0.911948f, + -0.172280f, -1.135245f, -0.043211f, 0.576456f, -0.075247f, 0.429734f, + -0.246309f, -0.355575f, 
-0.048809f, 0.217113f, 0.078385f, 0.720341f, + 0.007070f, 0.144617f, -0.167642f, 0.303056f, -0.031425f, 0.123448f, + -0.320530f, 0.164070f, -0.497849f, -0.233918f, -0.032123f, 0.084983f, + 0.312216f, 0.062609f, -0.389815f, 0.237593f, 0.000157f, -0.642068f, + 0.167898f, 0.495234f, -0.083493f, -0.555971f, 0.124437f, 0.381125f, + -0.459219f, 0.047924f, -0.138222f, -2.232816f, 0.127585f, -0.102420f, + 0.131598f, 0.036837f, -0.163055f, -0.067429f, -0.078521f, -0.055666f, + 1.387057f, 0.400154f, -0.003355f, -0.073627f, -0.305098f, -0.413383f, + -0.008266f, -0.038329f, 0.209808f, 0.375777f, 0.037274f, -0.050226f, + -0.100576f, 0.237441f, 0.237854f, 0.828296f, 0.001149f, -0.093964f, + 0.214051f, -0.031486f, -0.561307f, 0.014540f, 0.169357f, 0.323202f, + -0.395334f, -0.038941f, 0.476800f, -0.213122f, -0.287521f, -0.420717f, + -0.054142f, -0.102266f, + }; + +static const float + av1_intra_tx_split_nn_bias_8x8_layer0[NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = { + -1.150850f, -0.236404f, 0.184554f, -0.904162f, -0.949979f, 0.427016f, + -0.546867f, -0.611094f, -0.676570f, -0.208959f, -0.286384f, 0.562238f, + 0.434197f, -0.746518f, 0.123085f, -0.549836f, + }; + +static const float av1_intra_tx_split_nn_weights_8x8_layer1 + [NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = { + 0.749814f, 0.598172f, 0.375611f, 0.751612f, 0.947538f, -0.282228f, + -1.457522f, -1.092290f, 0.738657f, 0.575779f, 0.514823f, -0.560616f, + -0.491619f, -1.482014f, 0.524625f, -0.533590f, + }; + +static const float av1_intra_tx_split_nn_bias_8x8_layer1[1] = { + -0.488888f, +}; + +static const NN_CONFIG av1_intra_tx_split_nnconfig_8x8 = { + NUM_INTRA_TX_SPLIT_FEATURES, // num_inputs + 1, // num_outputs + NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS, // num_hidden_layers + { + NUM_INTRA_TX_SPLIT_HIDDEN_NODES, + }, // num_hidden_nodes + { + av1_intra_tx_split_nn_weights_8x8_layer0, + av1_intra_tx_split_nn_weights_8x8_layer1, + }, + { + av1_intra_tx_split_nn_bias_8x8_layer0, + av1_intra_tx_split_nn_bias_8x8_layer1, + }, +}; + +static const float av1_intra_tx_prune_nn_thresh_8x8[2] = { -0.405465f, + 0.405465f }; +#endif // !CONFIG_REALTIME_ONLY + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ diff --git a/third_party/aom/av1/encoder/tx_search.c b/third_party/aom/av1/encoder/tx_search.c new file mode 100644 index 0000000000..7292c01191 --- /dev/null +++ b/third_party/aom/av1/encoder/tx_search.c @@ -0,0 +1,3830 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/common/cfl.h" +#include "av1/common/reconintra.h" +#include "av1/encoder/block.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/common/idct.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/random.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/sorting_network.h" +#include "av1/encoder/tx_prune_model_weights.h" +#include "av1/encoder/tx_search.h" +#include "av1/encoder/txb_rdopt.h" + +#define PROB_THRESH_OFFSET_TX_TYPE 100 + +struct rdcost_block_args { + const AV1_COMP *cpi; + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE]; + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]; + RD_STATS rd_stats; + int64_t current_rd; + int64_t best_rd; + int exit_early; + int incomplete_exit; + FAST_TX_SEARCH_MODE ftxs_mode; + int skip_trellis; +}; + +typedef struct { + int64_t rd; + int txb_entropy_ctx; + TX_TYPE tx_type; +} TxCandidateInfo; + +// origin_threshold * 128 / 100 +static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { + { + 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68, + }, + { + 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68, + }, + { + 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74, + }, +}; + +// lookup table for predict_skip_txfm +// int max_tx_size = max_txsize_rect_lookup[bsize]; +// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16) +// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16); +static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { + TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4, + TX_8X8, TX_8X8, TX_16X16, TX_16X16, +}; + +// look-up table for sqrt of number of pixels in a transform block +// rounded up to the nearest integer. +static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4, 8, 16, 32, 32, 6, 6, + 12, 12, 23, 23, 32, 32, 8, + 8, 16, 16, 23, 23 }; + +static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + const int16_t *diff = x->plane[0].src_diff; + const uint32_t hash = + av1_get_crc32c_value(&x->txfm_search_info.mb_rd_record->crc_calculator, + (uint8_t *)diff, 2 * rows * cols); + return (hash << 5) + bsize; +} + +static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, + const int64_t ref_best_rd, + const uint32_t hash) { + int32_t match_index = -1; + if (ref_best_rd != INT64_MAX) { + for (int i = 0; i < mb_rd_record->num; ++i) { + const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; + // If there is a match in the mb_rd_record, fetch the RD decision and + // terminate early. 
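+      // (mb_rd_record is a small circular buffer: index_start marks the
+      // oldest entry and num the current fill level, hence the modulo walk.)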
+ if (mb_rd_record->mb_rd_info[index].hash_value == hash) { + match_index = index; + break; + } + } + } + return match_index; +} + +static AOM_INLINE void fetch_mb_rd_info(int n4, + const MB_RD_INFO *const mb_rd_info, + RD_STATS *const rd_stats, + MACROBLOCK *const x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + mbmi->tx_size = mb_rd_info->tx_size; + memcpy(x->txfm_search_info.blk_skip, mb_rd_info->blk_skip, + sizeof(mb_rd_info->blk_skip[0]) * n4); + av1_copy(mbmi->inter_tx_size, mb_rd_info->inter_tx_size); + av1_copy_array(xd->tx_type_map, mb_rd_info->tx_type_map, n4); + *rd_stats = mb_rd_info->rd_stats; +} + +int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + unsigned int *block_mse_q8) { + int visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, + NULL, &visible_cols, &visible_rows); + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + + diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2); + uint64_t sse = + aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); + if (block_mse_q8 != NULL) { + if (visible_cols > 0 && visible_rows > 0) + *block_mse_q8 = + (unsigned int)((256 * sse) / (visible_cols * visible_rows)); + else + *block_mse_q8 = UINT_MAX; + } + return sse; +} + +// Computes the residual block's SSE and mean on all visible 4x4s in the +// transform block +static INLINE int64_t pixel_diff_stats( + MACROBLOCK *x, int plane, int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize, + unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) { + int visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, + NULL, &visible_cols, &visible_rows); + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + + diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2); + uint64_t sse = 0; + int sum = 0; + sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum); + if (visible_cols > 0 && visible_rows > 0) { + double norm_factor = 1.0 / (visible_cols * visible_rows); + int sign_sum = sum > 0 ? 1 : -1; + // Conversion to transform domain + *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7; + *per_px_mean = sign_sum * (*per_px_mean); + *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse)); + *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum)); + } else { + *block_mse_q8 = UINT_MAX; + } + return sse; +} + +// Uses simple features on top of DCT coefficients to quickly predict +// whether optimal RD decision is to skip encoding the residual. +// The sse value is stored in dist. +static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, + int reduced_tx_set) { + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); + + *dist = av1_pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL); + + const int64_t mse = *dist / bw / bh; + // Normalized quantizer takes the transform upscaling factor (8 for tx size + // smaller than 32) into account. 
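+  // (For example, dc_q == 40 gives normalized_dc_q == 5 and a threshold of
+  // 5 * 5 / 8 == 3 with integer arithmetic.)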
+ const int16_t normalized_dc_q = dc_q >> 3; + const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8; + // For faster early skip decision, use dist to compare against threshold so + // that quality risk is less for the skip=1 decision. Otherwise, use mse + // since the fwd_txfm coeff checks will take care of quality + // TODO(any): Use dist to return 0 when skip_txfm_level is 1 + int64_t pred_err = (txfm_params->skip_txfm_level >= 2) ? *dist : mse; + // Predict not to skip when error is larger than threshold. + if (pred_err > mse_thresh) return 0; + // Return as skip otherwise for aggressive early skip + else if (txfm_params->skip_txfm_level >= 2) + return 1; + + const int max_tx_size = max_predict_sf_tx_size[bsize]; + const int tx_h = tx_size_high[max_tx_size]; + const int tx_w = tx_size_wide[max_tx_size]; + DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]); + TxfmParam param; + param.tx_type = DCT_DCT; + param.tx_size = max_tx_size; + param.bd = xd->bd; + param.is_hbd = is_cur_buf_hbd(xd); + param.lossless = 0; + param.tx_set_type = av1_get_ext_tx_set_type( + param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); + const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2); + const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize]; + const int16_t *src_diff = x->plane[0].src_diff; + const int n_coeff = tx_w * tx_h; + const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const uint32_t dc_thresh = max_qcoef_thresh * dc_q; + const uint32_t ac_thresh = max_qcoef_thresh * ac_q; + for (int row = 0; row < bh; row += tx_h) { + for (int col = 0; col < bw; col += tx_w) { + av1_fwd_txfm(src_diff + col, coefs, bw, ¶m); + // Operating on TX domain, not pixels; we want the QTX quantizers + const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7); + if (dc_coef >= dc_thresh) return 0; + for (int i = 1; i < n_coeff; ++i) { + const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7); + if (ac_coef >= ac_thresh) return 0; + } + } + src_diff += tx_h * bw; + } + return 1; +} + +// Used to set proper context for early termination with skip = 1. +static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t dist) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int n4 = bsize_to_num_blk(bsize); + const TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4); + memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); + mbmi->tx_size = tx_size; + for (int i = 0; i < n4; ++i) + set_blk_skip(x->txfm_search_info.blk_skip, 0, i, 1); + rd_stats->skip_txfm = 1; + if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2); + rd_stats->dist = rd_stats->sse = (dist << 4); + // Though decision is to make the block as skip based on luma stats, + // it is possible that block becomes non skip after chroma rd. In addition + // intermediate non skip costs calculated by caller function will be + // incorrect, if rate is set as zero (i.e., if zero_blk_rate is not + // accounted). Hence intermediate rate is populated to code the luma tx blks + // as skip, the caller function based on final rd decision (i.e., skip vs + // non-skip) sets the final rate accordingly. Here the rate populated + // corresponds to coding all the tx blocks with zero_blk_rate (based on max tx + // size possible) in the current block. 
Eg: For 128*128 block, rate would be + // 4 * zero_blk_rate where zero_blk_rate corresponds to coding of one 64x64 tx + // block as 'all zeros' + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl); + ENTROPY_CONTEXT *ta = ctxa; + ENTROPY_CONTEXT *tl = ctxl; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->rate = zero_blk_rate * + (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) * + (block_size_high[bsize] >> tx_size_high_log2[tx_size]); +} + +static AOM_INLINE void save_mb_rd_info(int n4, uint32_t hash, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + MB_RD_RECORD *mb_rd_record) { + int index; + if (mb_rd_record->num < RD_RECORD_BUFFER_LEN) { + index = + (mb_rd_record->index_start + mb_rd_record->num) % RD_RECORD_BUFFER_LEN; + ++mb_rd_record->num; + } else { + index = mb_rd_record->index_start; + mb_rd_record->index_start = + (mb_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN; + } + MB_RD_INFO *const mb_rd_info = &mb_rd_record->mb_rd_info[index]; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + mb_rd_info->hash_value = hash; + mb_rd_info->tx_size = mbmi->tx_size; + memcpy(mb_rd_info->blk_skip, x->txfm_search_info.blk_skip, + sizeof(mb_rd_info->blk_skip[0]) * n4); + av1_copy(mb_rd_info->inter_tx_size, mbmi->inter_tx_size); + av1_copy_array(mb_rd_info->tx_type_map, xd->tx_type_map, n4); + mb_rd_info->rd_stats = *rd_stats; +} + +static int get_search_init_depth(int mi_width, int mi_height, int is_inter, + const SPEED_FEATURES *sf, + int tx_size_search_method) { + if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH; + + if (sf->tx_sf.tx_size_search_lgr_block) { + if (mi_width > mi_size_wide[BLOCK_64X64] || + mi_height > mi_size_high[BLOCK_64X64]) + return MAX_VARTX_DEPTH; + } + + if (is_inter) { + return (mi_height != mi_width) + ? sf->tx_sf.inter_tx_size_search_init_depth_rect + : sf->tx_sf.inter_tx_size_search_init_depth_sqr; + } else { + return (mi_height != mi_width) + ? 
sf->tx_sf.intra_tx_size_search_init_depth_rect + : sf->tx_sf.intra_tx_size_search_init_depth_sqr; + } +} + +static AOM_INLINE void select_tx_block( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode); + +// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values +// 0: Do not collect any RD stats +// 1: Collect RD stats for transform units +// 2: Collect RD stats for partition units +#if CONFIG_COLLECT_RD_STATS + +static AOM_INLINE void get_energy_distribution_fine( + const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int need_4th, double *hordist, + double *verdist) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) { + // Special cases: calculate 'esq' values manually, as we don't have 'vf' + // functions for the 16 (very small) sub-blocks of this block. + const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3; + const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3; + assert(bw <= 32); + assert(bh <= 32); + assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); + if (cpi->common.seq_params->use_highbitdepth) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += + (src16[j + i * src_stride] - dst16[j + i * dst_stride]) * + (src16[j + i * src_stride] - dst16[j + i * dst_stride]); + } + } else { + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * + (src[j + i * src_stride] - dst[j + i * dst_stride]); + } + } + } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks. + const int f_index = + (bsize < BLOCK_SIZES) ? 
bsize - BLOCK_16X16 : bsize - BLOCK_8X16; + assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL); + const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; + assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); + assert(block_size_high[bsize] == 4 * block_size_high[subsize]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[1]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[2]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[3]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[5]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[6]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[7]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[9]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[10]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[11]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[13]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[14]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[15]); + } + + double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + + esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] + + esq[12] + esq[13] + esq[14] + esq[15]; + if (total > 0) { + const double e_recip = 1.0 / total; + hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; + hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; + hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; + if (need_4th) { + hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip; + } + verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; + verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip; + verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; + if (need_4th) { + verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip; + } + } else { + hordist[0] = verdist[0] = 0.25; + hordist[1] = verdist[1] = 0.25; + hordist[2] = verdist[2] = 0.25; + if (need_4th) { + hordist[3] = verdist[3] = 0.25; + } + } +} + +static double get_sse_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int err = diff[j * stride + i]; + sum += err * err; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_sad_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += abs(diff[j * stride + i]); + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static AOM_INLINE void 
get_2x2_normalized_sses_and_sads( + const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src, + int src_stride, const uint8_t *const dst, int dst_stride, + const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr, + double *const sad_norm_arr) { + const BLOCK_SIZE tx_bsize_half = + get_partition_subsize(tx_bsize, PARTITION_SPLIT); + if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats + const int half_width = block_size_wide[tx_bsize] / 2; + const int half_height = block_size_high[tx_bsize] / 2; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const int16_t *const this_src_diff = + src_diff + row * half_height * diff_stride + col * half_width; + if (sse_norm_arr) { + sse_norm_arr[row * 2 + col] = + get_sse_norm(this_src_diff, diff_stride, half_width, half_height); + } + if (sad_norm_arr) { + sad_norm_arr[row * 2 + col] = + get_sad_norm(this_src_diff, diff_stride, half_width, half_height); + } + } + } + } else { // use function pointers to calculate stats + const int half_width = block_size_wide[tx_bsize_half]; + const int half_height = block_size_high[tx_bsize_half]; + const int num_samples_half = half_width * half_height; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const uint8_t *const this_src = + src + row * half_height * src_stride + col * half_width; + const uint8_t *const this_dst = + dst + row * half_height * dst_stride + col * half_width; + + if (sse_norm_arr) { + unsigned int this_sse; + cpi->ppi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, + dst_stride, &this_sse); + sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; + } + + if (sad_norm_arr) { + const unsigned int this_sad = cpi->ppi->fn_ptr[tx_bsize_half].sdf( + this_src, src_stride, this_dst, dst_stride); + sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; + } + } + } + } +} + +#if CONFIG_COLLECT_RD_STATS == 1 +static double get_mean(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += diff[j * stride + i]; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} +static AOM_INLINE void PrintTransformUnitStats( + const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + TX_TYPE tx_type, int64_t rd) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + // Generate small sample to restrict output size. + static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 256 > 0) return; + + const char output_file[] = "tu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int txw = tx_size_wide[tx_size]; + const int txh = tx_size_high[tx_size]; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; + const int q_step = p->dequant_QTX[1] >> dequant_shift; + const int num_samples = txw * txh; + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + + fprintf(fout, "%g %g", rate_norm, dist_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = + &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + unsigned int sse; + cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = (double)sad / num_samples; + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + + fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size], + tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col); + + int model_rate; + int64_t model_dist; + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); + + const double mean = get_mean(src_diff, diff_stride, txw, txh); + float hor_corr, vert_corr; + av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride, + 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + fprintf(fout, " %d %" PRId64, x->rdmult, rd); + + fprintf(fout, "\n"); + fclose(fout); +} +#endif // CONFIG_COLLECT_RD_STATS == 1 + +#if CONFIG_COLLECT_RD_STATS >= 2 +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + unsigned int sse; + + if (plane) continue; + + cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); + total_sse += sse; + } + total_sse <<= 4; + return total_sse; +} + +static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t *est_dist) { + 
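+  // Uses the per-tile, per-block-size linear model: the estimated
+  // rate-distortion slope is a * sse + b, and the residue rate is derived as
+  // (sse - dist_mean) / slope. Returns 0 when the model is not yet ready.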
const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_dist = (int64_t)round(md->dist_mean); + const double est_ld = md->a * sse + md->b; + // Clamp estimated rate cost by INT_MAX / 2. + // TODO(angiebird@google.com): find better solution than clamping. + if (fabs(est_ld) < 1e-2) { + *est_residue_cost = INT_MAX / 2; + } else { + double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); + if (est_residue_cost_dbl < 0) { + *est_residue_cost = 0; + } else { + *est_residue_cost = + (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); + } + } + if (*est_residue_cost <= 0) { + *est_residue_cost = 0; + *est_dist = sse; + } + } + return 1; + } + return 0; +} + +static double get_highbd_diff_mean(const uint8_t *src8, int src_stride, + const uint8_t *dst8, int dst_stride, int w, + int h) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_diff_mean(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi, + const TileDataEnc *tile_data, + MACROBLOCK *x, + const RD_STATS *const rd_stats, + BLOCK_SIZE plane_bsize) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && + (tile_data == NULL || + !tile_data->inter_mode_rd_models[plane_bsize].ready)) + return; + (void)tile_data; + // Generate small sample to restrict output size. + static unsigned int seed = 95014; + + if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) != + 1) + return; + + const char output_file[] = "pu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + + MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int diff_stride = block_size_wide[plane_bsize]; + int bw, bh; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + const int num_samples = bw * bh; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; + const int q_step = p->dequant_QTX[1] >> dequant_shift; + const int shift = (xd->bd - 8); + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + const double rdcost_norm = + (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples; + + fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = p->src.buf; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = pd->dst.buf; + const int16_t *const src_diff = p->src_diff; + + int64_t sse = calculate_sse(xd, p, pd, bw, bh); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = + (double)sad / (1 << num_pels_log2_lookup[plane_bsize]); + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + if (shift) { + for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); + for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh); + + int model_rate; + int64_t model_dist; + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rdcost_norm = + (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples; + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm, + model_rdcost_norm); + + double mean; + if (is_cur_buf_hbd(xd)) { + mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } + mean /= (1 << shift); + float hor_corr, vert_corr; + av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, + dst_stride, 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + assert(tile_data->inter_mode_rd_models[plane_bsize].ready); + const int64_t overall_sse = get_sse(cpi, x); + int est_residue_cost = 0; + int64_t est_dist = 0; + get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost, + &est_dist); + const double est_residue_cost_norm = (double)est_residue_cost / num_samples; + const double est_dist_norm = (double)est_dist / num_samples; + const double est_rdcost_norm = + (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples; + fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm, + est_rdcost_norm); + } + + fprintf(fout, "\n"); + fclose(fout); +} +#endif // CONFIG_COLLECT_RD_STATS >= 2 +#endif // CONFIG_COLLECT_RD_STATS + +static AOM_INLINE void inverse_transform_block_facade(MACROBLOCK 
*const x, + int plane, int block, + int blk_row, int blk_col, + int eob, + int reduced_tx_set) { + if (!eob) return; + struct macroblock_plane *const p = &x->plane[plane]; + MACROBLOCKD *const xd = &x->e_mbd; + tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, + tx_size, reduced_tx_set); + + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + dst_stride, eob, reduced_tx_set); +} + +static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, int skip_trellis, + TX_TYPE best_tx_type, int do_quant, + int *rate_cost, uint16_t best_eob) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (!is_inter && best_eob && + (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] || + blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) { + // if the quantized coefficients are stored in the dqcoeff buffer, we don't + // need to do transform and quantization again. + if (do_quant) { + TxfmParam txfm_param_intra; + QUANT_PARAM quant_param_intra; + av1_setup_xform(cm, x, tx_size, best_tx_type, &txfm_param_intra); + av1_setup_quant(tx_size, !skip_trellis, + skip_trellis + ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B + : AV1_XFORM_QUANT_FP) + : AV1_XFORM_QUANT_FP, + cpi->oxcf.q_cfg.quant_b_adapt, &quant_param_intra); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type, + &quant_param_intra); + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, + &txfm_param_intra, &quant_param_intra); + if (quant_param_intra.use_optimize_b) { + av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, + rate_cost); + } + } + + inverse_transform_block_facade(x, plane, block, blk_row, blk_col, + x->plane[plane].eobs[block], + cm->features.reduced_tx_set_used); + + // This may happen because of hash collision. The eob stored in the hash + // table is non-zero, but the real eob is zero. We need to make sure tx_type + // is DCT_DCT in this case. 
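+    // (When eob is 0 no transform type is coded and the decoder infers
+    // DCT_DCT, so the encoder-side tx_type record must agree.)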
+ if (plane == 0 && x->plane[plane].eobs[block] == 0 && + best_tx_type != DCT_DCT) { + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + } +} + +static unsigned pixel_dist_visible_only( + const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, + const int src_stride, const uint8_t *dst, const int dst_stride, + const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows, + int visible_cols) { + unsigned sse; + + if (txb_rows == visible_rows && txb_cols == visible_cols) { + cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + return sse; + } + +#if CONFIG_AV1_HIGHBITDEPTH + const MACROBLOCKD *xd = &x->e_mbd; + if (is_cur_buf_hbd(xd)) { + uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride, + visible_cols, visible_rows); + return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); + } +#else + (void)x; +#endif + sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, + visible_rows); + return sse; +} + +// Compute the pixel domain distortion from src and dst on all visible 4x4s in +// the +// transform block. +static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, + int plane, const uint8_t *src, const int src_stride, + const uint8_t *dst, const int dst_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + int txb_rows, txb_cols, visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, + &txb_cols, &txb_rows, &visible_cols, &visible_rows); + assert(visible_rows > 0); + assert(visible_cols > 0); + + unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst, + dst_stride, tx_bsize, txb_rows, + txb_cols, visible_rows, visible_cols); + + return sse; +} + +static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, + int plane, BLOCK_SIZE plane_bsize, + int block, int blk_row, int blk_col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const uint16_t eob = p->eobs[block]; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const int bsw = block_size_wide[tx_bsize]; + const int bsh = block_size_high[tx_bsize]; + const int src_stride = x->plane[plane].src.stride; + const int dst_stride = xd->plane[plane].dst.stride; + // Scale the transform block index to pixel unit. 
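+  // (blk_row / blk_col are in 4x4 (mi) units; << MI_SIZE_LOG2 converts them
+  // to a pixel offset.)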
+ const int src_idx = (blk_row * src_stride + blk_col) << MI_SIZE_LOG2; + const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; + const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; + const tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + + assert(cpi != NULL); + assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); + + uint8_t *recon; + DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); + +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + recon = CONVERT_TO_BYTEPTR(recon16); + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, + CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, bsh); + } else { + recon = (uint8_t *)recon16; + aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh); + } +#else + recon = (uint8_t *)recon16; + aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh); +#endif + + const PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cpi->common.features.reduced_tx_set_used); + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, + MAX_TX_SIZE, eob, + cpi->common.features.reduced_tx_set_used); + + return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, tx_bsize); +} + +// pruning thresholds for prune_txk_type and prune_txk_type_separ +static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000 +static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100 + +// R-D costs are sorted in ascending order. +static INLINE void sort_rd(int64_t rds[], int txk[], int len) { + int i, j, k; + + for (i = 1; i <= len - 1; ++i) { + for (j = 0; j < i; ++j) { + if (rds[j] > rds[i]) { + int64_t temprd; + int tempi; + + temprd = rds[i]; + tempi = txk[i]; + + for (k = i; k > j; k--) { + rds[k] = rds[k - 1]; + txk[k] = txk[k - 1]; + } + + rds[j] = temprd; + txk[j] = tempi; + break; + } + } + } +} + +static INLINE int64_t av1_block_error_qm(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, + const qm_val_t *qmatrix, + const int16_t *scan, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; + + for (i = 0; i < block_size; i++) { + int64_t weight = qmatrix[scan[i]]; + int64_t dd = coeff[i] - dqcoeff[i]; + dd *= weight; + int64_t cc = coeff[i]; + cc *= weight; + // The ranges of coeff and dqcoeff are + // bd8 : 18 bits (including sign) + // bd10: 20 bits (including sign) + // bd12: 22 bits (including sign) + // As AOM_QM_BITS is 5, the intermediate quantities in the calculation + // below should fit in 54 bits, thus no overflow should happen. + error += (dd * dd + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS); + sqcoeff += (cc * cc + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS); + } + + *ssz = sqcoeff; + return error; +} + +static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, + const qm_val_t *qmatrix, + const int16_t *scan, int64_t *out_dist, + int64_t *out_sse) { + const struct macroblock_plane *const p = &x->plane[plane]; + // Transform domain distortion computation is more efficient as it does + // not involve an inverse transform, but it is less accurate. 
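+  // (For AV1's roughly orthonormal integer transforms, coefficient-domain SSE
+  // tracks pixel-domain SSE up to a fixed scale factor, which the shift
+  // computed below removes.)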
+ const int buffer_length = av1_get_max_eob(tx_size); + int64_t this_sse; + // TX-domain results need to shift down to Q2/D10 to match pixel + // domain distortion values which are in Q2^2 + int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const dqcoeff = p->dqcoeff + block_offset; +#if CONFIG_AV1_HIGHBITDEPTH + MACROBLOCKD *const xd = &x->e_mbd; + if (is_cur_buf_hbd(xd)) { + // TODO(veluca): handle use_qm_dist_metric for HBD too. + *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, + xd->bd); + } else { +#endif + if (qmatrix == NULL || !x->txfm_search_params.use_qm_dist_metric) { + *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); + } else { + *out_dist = av1_block_error_qm(coeff, dqcoeff, buffer_length, qmatrix, + scan, &this_sse); + } +#if CONFIG_AV1_HIGHBITDEPTH + } +#endif + + *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); + *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); +} + +uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, + int16_t allowed_tx_mask, int prune_factor, + const TXB_CTX *const txb_ctx, + int reduced_tx_set_used, int64_t ref_best_rd, + int num_sel) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + + int idx; + + int64_t rds_v[4]; + int64_t rds_h[4]; + int idx_v[4] = { 0, 1, 2, 3 }; + int idx_h[4] = { 0, 1, 2, 3 }; + int skip_v[4] = { 0 }; + int skip_h[4] = { 0 }; + const int idx_map[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + + const int sel_pattern_v[16] = { + 0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 3, 1, 3, 2, 3, 3 + }; + const int sel_pattern_h[16] = { + 0, 1, 0, 1, 2, 0, 2, 1, 2, 3, 0, 3, 1, 3, 2, 3 + }; + + QUANT_PARAM quant_param; + TxfmParam txfm_param; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt, + &quant_param); + int tx_type; + // to ensure we can try ones even outside of ext_tx_set of current block + // this function should only be called for size < 16 + assert(txsize_sqr_up_map[tx_size] <= TX_16X16); + txfm_param.tx_set_type = EXT_TX_SET_ALL16; + + int rate_cost = 0; + int64_t dist = 0, sse = 0; + // evaluate horizontal with vertical DCT + for (idx = 0; idx < 4; ++idx) { + tx_type = idx_map[idx]; + txfm_param.tx_type = tx_type; + + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &dist, &sse); + + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + + rds_h[idx] = RDCOST(x->rdmult, rate_cost, dist); + + if ((rds_h[idx] - (rds_h[idx] >> 2)) > ref_best_rd) { + skip_h[idx] = 1; + } + } + sort_rd(rds_h, idx_h, 4); + for (idx = 1; idx < 4; idx++) { + if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1; + } + + if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF; + + // evaluate vertical with the best horizontal chosen + rds_v[0] = rds_h[0]; + 
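+ // Added note: idx_map above is laid out as a 4x4 grid where the row selects
+ // the vertical 1-D transform and the column the horizontal one, each in the
+ // order {DCT, ADST, FLIPADST, identity}. With the best horizontal candidate
+ // idx_h[0] fixed, walk down that column (stride 4) to rank the vertical
+ // candidates.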
int start_v = 1, end_v = 4; + const int *idx_map_v = idx_map + idx_h[0]; + + for (idx = start_v; idx < end_v; ++idx) { + tx_type = idx_map_v[idx_v[idx] * 4]; + txfm_param.tx_type = tx_type; + + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &dist, &sse); + + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + + rds_v[idx] = RDCOST(x->rdmult, rate_cost, dist); + + if ((rds_v[idx] - (rds_v[idx] >> 2)) > ref_best_rd) { + skip_v[idx] = 1; + } + } + sort_rd(rds_v, idx_v, 4); + for (idx = 1; idx < 4; idx++) { + if (rds_v[idx] > rds_v[0] * 1.2) skip_v[idx_v[idx]] = 1; + } + + // combine rd_h and rd_v to prune tx candidates + int i_v, i_h; + int64_t rds[16]; + int num_cand = 0, last = TX_TYPES - 1; + + for (int i = 0; i < 16; i++) { + i_v = sel_pattern_v[i]; + i_h = sel_pattern_h[i]; + tx_type = idx_map[idx_v[i_v] * 4 + idx_h[i_h]]; + if (!(allowed_tx_mask & (1 << tx_type)) || skip_h[idx_h[i_h]] || + skip_v[idx_v[i_v]]) { + txk_map[last] = tx_type; + last--; + } else { + txk_map[num_cand] = tx_type; + rds[num_cand] = rds_v[i_v] + rds_h[i_h]; + if (rds[num_cand] == 0) rds[num_cand] = 1; + num_cand++; + } + } + sort_rd(rds, txk_map, num_cand); + + uint16_t prune = (uint16_t)(~(1 << txk_map[0])); + num_sel = AOMMIN(num_sel, num_cand); + + for (int i = 1; i < num_sel; i++) { + int64_t factor = 1800 * (rds[i] - rds[0]) / (rds[0]); + if (factor < (int64_t)prune_factor) + prune &= ~(1 << txk_map[i]); + else + break; + } + return prune; +} + +uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, int *txk_map, + uint16_t allowed_tx_mask, int prune_factor, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + int tx_type; + + int64_t rds[TX_TYPES]; + + int num_cand = 0; + int last = TX_TYPES - 1; + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt, + &quant_param); + + for (int idx = 0; idx < TX_TYPES; idx++) { + tx_type = idx; + int rate_cost = 0; + int64_t dist = 0, sse = 0; + if (!(allowed_tx_mask & (1 << tx_type))) { + txk_map[last] = tx_type; + last--; + continue; + } + txfm_param.tx_type = tx_type; + + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + + // do txfm and quantization + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + // estimate rate cost + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + // tx domain dist + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &dist, &sse); + + txk_map[num_cand] = tx_type; + rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist); + if (rds[num_cand] == 0) rds[num_cand] = 1; + num_cand++; + } + + if (num_cand == 0) return (uint16_t)0xFFFF; + + sort_rd(rds, txk_map, num_cand); + uint16_t prune = (uint16_t)(~(1 
<< txk_map[0])); + + // 0 < prune_factor <= 1000 controls aggressiveness + int64_t factor = 0; + for (int idx = 1; idx < num_cand; idx++) { + factor = 1000 * (rds[idx] - rds[0]) / rds[0]; + if (factor < (int64_t)prune_factor) + prune &= ~(1 << txk_map[idx]); + else + break; + } + return prune; +} + +// These thresholds were calibrated to provide a certain number of TX types +// pruned by the model on average, i.e. selecting a threshold with index i +// will lead to pruning i+1 TX types on average +static const float *prune_2D_adaptive_thresholds[] = { + // TX_4X4 + (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f, + 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f, + 0.09778f, 0.11780f }, + // TX_8X8 + (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f, + 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f, + 0.10803f, 0.14124f }, + // TX_16X16 + (float[]){ 0.01404f, 0.02000f, 0.04211f, 0.05164f, 0.05798f, 0.06335f, + 0.06897f, 0.07629f, 0.08875f, 0.11169f }, + // TX_32X32 + NULL, + // TX_64X64 + NULL, + // TX_4X8 + (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f, + 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f, + 0.10168f, 0.12585f }, + // TX_8X4 + (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f, + 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f, + 0.10583f, 0.13123f }, + // TX_8X16 + (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f, + 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f, + 0.10730f, 0.14221f }, + // TX_16X8 + (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f, + 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f, + 0.10339f, 0.13464f }, + // TX_16X32 + NULL, + // TX_32X16 + NULL, + // TX_32X64 + NULL, + // TX_64X32 + NULL, + // TX_4X16 + (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f, + 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f, + 0.10242f, 0.12878f }, + // TX_16X4 + (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f, + 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f, + 0.10217f, 0.12610f }, + // TX_8X32 + NULL, + // TX_32X8 + NULL, + // TX_16X64 + NULL, + // TX_64X16 + NULL, +}; + +static INLINE float get_adaptive_thresholds( + TX_SIZE tx_size, TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) { + const int prune_aggr_table[5][2] = { + { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 }, { 12, 9 } + }; + int pruning_aggressiveness = 0; + if (tx_set_type == EXT_TX_SET_ALL16) + pruning_aggressiveness = + prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][0]; + else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) + pruning_aggressiveness = + prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][1]; + + return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness]; +} + +static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff, + int stride, int bw, int bh, + float *hordist, + float *verdist) { + // First compute downscaled block energy values (esq); downscale factors + // are defined by w_shift and h_shift. + unsigned int esq[256]; + const int w_shift = bw <= 8 ? 0 : 1; + const int h_shift = bh <= 8 ? 
0 : 1; + const int esq_w = bw >> w_shift; + const int esq_h = bh >> h_shift; + const int esq_sz = esq_w * esq_h; + int i, j; + memset(esq, 0, esq_sz * sizeof(esq[0])); + if (w_shift) { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j += 2) { + cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] + + cur_diff_row[j + 1] * cur_diff_row[j + 1]); + } + } + } else { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j++) { + cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j]; + } + } + } + + uint64_t total = 0; + for (i = 0; i < esq_sz; i++) total += esq[i]; + + // Output hordist and verdist arrays are normalized 1D projections of esq + if (total == 0) { + float hor_val = 1.0f / esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val; + float ver_val = 1.0f / esq_h; + for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val; + return; + } + + const float e_recip = 1.0f / (float)total; + memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0])); + memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0])); + const unsigned int *cur_esq_row; + for (i = 0; i < esq_h - 1; i++) { + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) { + hordist[j] += (float)cur_esq_row[j]; + verdist[i] += (float)cur_esq_row[j]; + } + verdist[i] += (float)cur_esq_row[j]; + } + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j]; + + for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip; + for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip; +} + +static AOM_INLINE bool check_bit_mask(uint16_t mask, int val) { + return mask & (1 << val); +} + +static AOM_INLINE void set_bit_mask(uint16_t *mask, int val) { + *mask |= (1 << val); +} + +static AOM_INLINE void unset_bit_mask(uint16_t *mask, int val) { + *mask &= ~(1 << val); +} + +static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, + int blk_row, int blk_col, TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_2d_txfm_mode, int *txk_map, + uint16_t *allowed_tx_mask) { + // This table is used because the search order is different from the enum + // order. + static const int tx_type_table_2D[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + if (tx_set_type != EXT_TX_SET_ALL16 && + tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT) + return; +#if CONFIG_NN_V2 + NN_CONFIG_V2 *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + NN_CONFIG_V2 *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; +#else + const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; +#endif + if (!nn_config_hor || !nn_config_ver) return; // Model not established yet. + + float hfeatures[16], vfeatures[16]; + float hscores[4], vscores[4]; + float scores_2D_raw[16]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + const int hfeatures_num = bw <= 8 ? bw : bw / 2; + const int vfeatures_num = bh <= 8 ? 
bh : bh / 2; + assert(hfeatures_num <= 16); + assert(vfeatures_num <= 16); + + const struct macroblock_plane *const p = &x->plane[0]; + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures, + vfeatures); + + av1_get_horver_correlation_full(diff, diff_stride, bw, bh, + &hfeatures[hfeatures_num - 1], + &vfeatures[vfeatures_num - 1]); + +#if CONFIG_NN_V2 + av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores); + av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores); +#else + av1_nn_predict(hfeatures, nn_config_hor, 1, hscores); + av1_nn_predict(vfeatures, nn_config_ver, 1, vscores); +#endif + + for (int i = 0; i < 4; i++) { + float *cur_scores_2D = scores_2D_raw + i * 4; + cur_scores_2D[0] = vscores[i] * hscores[0]; + cur_scores_2D[1] = vscores[i] * hscores[1]; + cur_scores_2D[2] = vscores[i] * hscores[2]; + cur_scores_2D[3] = vscores[i] * hscores[3]; + } + + assert(TX_TYPES == 16); + // This version of the function only works when there are at most 16 classes. + // So we will need to change the optimization or use av1_nn_softmax instead if + // this ever gets changed. + av1_nn_fast_softmax_16(scores_2D_raw, scores_2D_raw); + + const float score_thresh = + get_adaptive_thresholds(tx_size, tx_set_type, prune_2d_txfm_mode); + + // Always keep the TX type with the highest score, prune all others with + // score below score_thresh. + int max_score_i = 0; + float max_score = 0.0f; + uint16_t allow_bitmask = 0; + float sum_score = 0.0; + // Calculate sum of allowed tx type score and Populate allow bit mask based + // on score_thresh and allowed_tx_mask + int allow_count = 0; + int tx_type_allowed[16] = { TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID, + TX_TYPE_INVALID }; + float scores_2D[16] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }; + for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) { + const int allow_tx_type = + check_bit_mask(*allowed_tx_mask, tx_type_table_2D[tx_idx]); + if (!allow_tx_type) { + continue; + } + if (scores_2D_raw[tx_idx] > max_score) { + max_score = scores_2D_raw[tx_idx]; + max_score_i = tx_idx; + } + if (scores_2D_raw[tx_idx] >= score_thresh) { + // Set allow mask based on score_thresh + set_bit_mask(&allow_bitmask, tx_type_table_2D[tx_idx]); + + // Accumulate score of allowed tx type + sum_score += scores_2D_raw[tx_idx]; + + scores_2D[allow_count] = scores_2D_raw[tx_idx]; + tx_type_allowed[allow_count] = tx_type_table_2D[tx_idx]; + allow_count += 1; + } + } + if (!check_bit_mask(allow_bitmask, tx_type_table_2D[max_score_i])) { + // If even the tx_type with max score is pruned, this means that no other + // tx_type is feasible. When this happens, we force enable max_score_i and + // end the search. 
+ set_bit_mask(&allow_bitmask, tx_type_table_2D[max_score_i]); + memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D)); + *allowed_tx_mask = allow_bitmask; + return; + } + + // Sort tx type probability of all types + if (allow_count <= 8) { + av1_sort_fi32_8(scores_2D, tx_type_allowed); + } else { + av1_sort_fi32_16(scores_2D, tx_type_allowed); + } + + // Enable more pruning based on tx type probability and number of allowed tx + // types + if (prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) { + float temp_score = 0.0; + float score_ratio = 0.0; + int tx_idx, tx_count = 0; + const float inv_sum_score = 100 / sum_score; + // Get allowed tx types based on sorted probability score and tx count + for (tx_idx = 0; tx_idx < allow_count; tx_idx++) { + // Skip the tx type which has more than 30% of cumulative + // probability and allowed tx type count is more than 2 + if (score_ratio > 30.0 && tx_count >= 2) break; + + assert(check_bit_mask(allow_bitmask, tx_type_allowed[tx_idx])); + // Calculate cumulative probability + temp_score += scores_2D[tx_idx]; + + // Calculate percentage of cumulative probability of allowed tx type + score_ratio = temp_score * inv_sum_score; + tx_count++; + } + // Set remaining tx types as pruned + for (; tx_idx < allow_count; tx_idx++) + unset_bit_mask(&allow_bitmask, tx_type_allowed[tx_idx]); + } + + memcpy(txk_map, tx_type_allowed, sizeof(tx_type_table_2D)); + *allowed_tx_mask = allow_bitmask; +} + +static float get_dev(float mean, double x2_sum, int num) { + const float e_x2 = (float)(x2_sum / num); + const float diff = e_x2 - mean * mean; + const float dev = (diff > 0) ? sqrtf(diff) : 0; + return dev; +} + +// Writes the features required by the ML model to predict tx split based on +// mean and standard deviation values of the block and sub-blocks. +// Returns the number of elements written to the output array which is at most +// 12 currently. Hence 'features' buffer should be able to accommodate at least +// 12 elements. +static AOM_INLINE int get_mean_dev_features(const int16_t *data, int stride, + int bw, int bh, float *features) { + const int16_t *const data_ptr = &data[0]; + const int subh = (bh >= bw) ? (bh >> 1) : bh; + const int subw = (bw >= bh) ? (bw >> 1) : bw; + const int num = bw * bh; + const int sub_num = subw * subh; + int feature_idx = 2; + int total_x_sum = 0; + int64_t total_x2_sum = 0; + int num_sub_blks = 0; + double mean2_sum = 0.0f; + float dev_sum = 0.0f; + + for (int row = 0; row < bh; row += subh) { + for (int col = 0; col < bw; col += subw) { + int x_sum; + int64_t x2_sum; + // TODO(any): Write a SIMD version. Clear registers. + aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh, + &x_sum, &x2_sum); + total_x_sum += x_sum; + total_x2_sum += x2_sum; + + const float mean = (float)x_sum / sub_num; + const float dev = get_dev(mean, (double)x2_sum, sub_num); + features[feature_idx++] = mean; + features[feature_idx++] = dev; + mean2_sum += (double)(mean * mean); + dev_sum += dev; + num_sub_blks++; + } + } + + const float lvl0_mean = (float)total_x_sum / num; + features[0] = lvl0_mean; + features[1] = get_dev(lvl0_mean, (double)total_x2_sum, num); + + // Deviation of means. + features[feature_idx++] = get_dev(lvl0_mean, mean2_sum, num_sub_blks); + // Mean of deviations. 
+ features[feature_idx++] = dev_sum / num_sub_blks; + + return feature_idx; +} + +static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, + int blk_col, TX_SIZE tx_size) { + const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size]; + if (!nn_config) return -1; + + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = + x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + float features[64] = { 0.0f }; + get_mean_dev_features(diff, diff_stride, bw, bh, features); + + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + + int int_score = (int)(score * 10000); + return clamp(int_score, -80000, 80000); +} + +static INLINE uint16_t +get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode, + int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + const int is_inter = is_inter_block(mbmi); + const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY; + // if txk_allowed = TX_TYPES, >1 tx types are allowed, else, if txk_allowed < + // TX_TYPES, only that specific tx type is allowed. + TX_TYPE txk_allowed = TX_TYPES; + + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int use_actual_frame_probs = 1; + const int *tx_type_probs; +#if CONFIG_FPMT_TEST + use_actual_frame_probs = + (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1; + if (!use_actual_frame_probs) { + tx_type_probs = + (int *)cpi->ppi->temp_frame_probs.tx_type_probs[update_type][tx_size]; + } +#endif + if (use_actual_frame_probs) { + tx_type_probs = cpi->ppi->frame_probs.tx_type_probs[update_type][tx_size]; + } + + if ((!is_inter && txfm_params->use_default_intra_tx_type) || + (is_inter && txfm_params->default_inter_tx_type_prob_thresh == 0)) { + txk_allowed = + get_default_tx_type(0, xd, tx_size, cpi->use_screen_content_tools); + } else if (is_inter && + txfm_params->default_inter_tx_type_prob_thresh != INT_MAX) { + if (tx_type_probs[DEFAULT_INTER_TX_TYPE] > + txfm_params->default_inter_tx_type_prob_thresh) { + txk_allowed = DEFAULT_INTER_TX_TYPE; + } else { + int force_tx_type = 0; + int max_prob = 0; + const int tx_type_prob_threshold = + txfm_params->default_inter_tx_type_prob_thresh + + PROB_THRESH_OFFSET_TX_TYPE; + for (int i = 1; i < TX_TYPES; i++) { // find maximum probability. + if (tx_type_probs[i] > max_prob) { + max_prob = tx_type_probs[i]; + force_tx_type = i; + } + } + if (max_prob > tx_type_prob_threshold) // force tx type with max prob. 
+ txk_allowed = force_tx_type; + else if (x->rd_model == LOW_TXFM_RD) { + if (plane == 0) txk_allowed = DCT_DCT; + } + } + } else if (x->rd_model == LOW_TXFM_RD) { + if (plane == 0) txk_allowed = DCT_DCT; + } + + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter, cm->features.reduced_tx_set_used); + + TX_TYPE uv_tx_type = DCT_DCT; + if (plane) { + // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y + uv_tx_type = txk_allowed = + av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + } + PREDICTION_MODE intra_dir = + mbmi->filter_intra_mode_info.use_filter_intra + ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode] + : mbmi->mode; + uint16_t ext_tx_used_flag = + cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset != 0 && + tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT + ? av1_reduced_intra_tx_used_flag[intra_dir] + : av1_ext_tx_used_flag[tx_set_type]; + + if (cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset == 2) + ext_tx_used_flag &= av1_derived_intra_tx_used_flag[intra_dir]; + + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 || + ext_tx_used_flag == 0x0001 || + (is_inter && cpi->oxcf.txfm_cfg.use_inter_dct_only) || + (!is_inter && cpi->oxcf.txfm_cfg.use_intra_dct_only)) { + txk_allowed = DCT_DCT; + } + + if (cpi->oxcf.txfm_cfg.enable_flip_idtx == 0) + ext_tx_used_flag &= DCT_ADST_TX_MASK; + + uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip. + if (txk_allowed < TX_TYPES) { + allowed_tx_mask = 1 << txk_allowed; + allowed_tx_mask &= ext_tx_used_flag; + } else if (fast_tx_search) { + allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT + allowed_tx_mask &= ext_tx_used_flag; + } else { + assert(plane == 0); + allowed_tx_mask = ext_tx_used_flag; + int num_allowed = 0; + int i; + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + static const int thresh_arr[2][7] = { { 10, 15, 15, 10, 15, 15, 15 }, + { 10, 17, 17, 10, 17, 17, 17 } }; + const int thresh = + thresh_arr[cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats - 1] + [update_type]; + uint16_t prune = 0; + int max_prob = -1; + int max_idx = 0; + for (i = 0; i < TX_TYPES; i++) { + if (tx_type_probs[i] > max_prob && (allowed_tx_mask & (1 << i))) { + max_prob = tx_type_probs[i]; + max_idx = i; + } + if (tx_type_probs[i] < thresh) prune |= (1 << i); + } + if ((prune >> max_idx) & 0x01) prune &= ~(1 << max_idx); + allowed_tx_mask &= (~prune); + } + for (i = 0; i < TX_TYPES; i++) { + if (allowed_tx_mask & (1 << i)) num_allowed++; + } + assert(num_allowed > 0); + + if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) { + int pf = prune_factors[txfm_params->prune_2d_txfm_mode]; + int mf = mul_factors[txfm_params->prune_2d_txfm_mode]; + if (num_allowed <= 7) { + const uint16_t prune = + prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col, + plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx, + cm->features.reduced_tx_set_used); + allowed_tx_mask &= (~prune); + } else { + const int num_sel = (num_allowed * mf + 50) / 100; + const uint16_t prune = prune_txk_type_separ( + cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize, + txk_map, allowed_tx_mask, pf, txb_ctx, + cm->features.reduced_tx_set_used, ref_best_rd, num_sel); + + allowed_tx_mask &= (~prune); + } + } else { + assert(num_allowed > 0); + int allowed_tx_count = + (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) ? 
1 : 5; + // !fast_tx_search && txk_end != txk_start && plane == 0 + if (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_1 && is_inter && + num_allowed > allowed_tx_count) { + prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type, + txfm_params->prune_2d_txfm_mode, txk_map, &allowed_tx_mask); + } + } + } + + // Need to have at least one transform type allowed. + if (allowed_tx_mask == 0) { + txk_allowed = (plane ? uv_tx_type : DCT_DCT); + allowed_tx_mask = (1 << txk_allowed); + } + + assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed)); + *allowed_txk_types = txk_allowed; + return allowed_tx_mask; +} + +#if CONFIG_RD_DEBUG +static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane, + int txb_coeff_cost) { + rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; +} +#endif + +static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, + int reduced_tx_set_used) { +#if TXCOEFF_COST_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used); +#if TXCOEFF_COST_TIMER + AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common; + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + tmp_cm->txcoeff_cost_timer += elapsed_time; + ++tmp_cm->txcoeff_cost_count; +#endif + return cost; +} + +static int skip_trellis_opt_based_on_satd(MACROBLOCK *x, + QUANT_PARAM *quant_param, int plane, + int block, TX_SIZE tx_size, + int quant_b_adapt, int qstep, + unsigned int coeff_opt_satd_threshold, + int skip_trellis, int dc_only_blk) { + if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX)) + return skip_trellis; + + const struct macroblock_plane *const p = &x->plane[plane]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff_ptr = p->coeff + block_offset; + const int n_coeffs = av1_get_max_eob(tx_size); + const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)); + int satd = (dc_only_blk) ? abs(coeff_ptr[0]) : aom_satd(coeff_ptr, n_coeffs); + satd = RIGHT_SIGNED_SHIFT(satd, shift); + satd >>= (x->e_mbd.bd - 8); + + const int skip_block_trellis = + ((uint64_t)satd > + (uint64_t)coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size]); + + av1_setup_quant( + tx_size, !skip_block_trellis, + skip_block_trellis + ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP) + : AV1_XFORM_QUANT_FP, + quant_b_adapt, quant_param); + + return skip_block_trellis; +} + +// Predict DC only blocks if the residual variance is below a qstep based +// threshold.For such blocks, transform type search is bypassed. +static INLINE void predict_dc_only_block( + MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int block, int blk_row, int blk_col, RD_STATS *best_rd_stats, + int64_t *block_sse, unsigned int *block_mse_q8, int64_t *per_px_mean, + int *dc_only_blk) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; + const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift; + uint64_t block_var = UINT64_MAX; + const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3; + *block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize, + txsize_to_bsize[tx_size], block_mse_q8, + per_px_mean, &block_var); + assert((*block_mse_q8) != UINT_MAX); + uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep); + if (is_cur_buf_hbd(xd)) + block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2); + + if (block_var >= var_threshold) return; + const unsigned int predict_dc_level = x->txfm_search_params.predict_dc_level; + assert(predict_dc_level != 0); + + // Prediction of skip block if residual mean and variance are less + // than qstep based threshold + if ((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) { + // If the normalized mean of residual block is less than the dc qstep and + // the normalized block variance is less than ac qstep, then the block is + // assumed to be a skip block and its rdcost is updated accordingly. + best_rd_stats->skip_txfm = 1; + + x->plane[plane].eobs[block] = 0; + + if (is_cur_buf_hbd(xd)) + *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2); + + best_rd_stats->dist = (*block_sse) << 4; + best_rd_stats->sse = best_rd_stats->dist; + + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl); + ENTROPY_CONTEXT *ta = ctxa; + ENTROPY_CONTEXT *tl = ctxl; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx_tmp; + const PLANE_TYPE plane_type = get_plane_type(plane); + get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp); + const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type] + .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1]; + best_rd_stats->rate = zero_blk_rate; + + best_rd_stats->rdcost = + RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse); + + x->plane[plane].txb_entropy_ctx[block] = 0; + } else if (predict_dc_level > 1) { + // Predict DC only blocks based on residual variance. + // For chroma plane, this prediction is disabled for intra blocks. + if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) *dc_only_blk = 1; + } +} + +// Search for the best transform type for a given transform block. +// This function can be used for both inter and intra, both luma and chroma. +static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis, + int64_t ref_best_rd, RD_STATS *best_rd_stats) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + int64_t best_rd = INT64_MAX; + uint16_t best_eob = 0; + TX_TYPE best_tx_type = DCT_DCT; + int rate_cost = 0; + struct macroblock_plane *const p = &x->plane[plane]; + tran_low_t *orig_dqcoeff = p->dqcoeff; + tran_low_t *best_dqcoeff = x->dqcoeff_buf; + const int tx_type_map_idx = + plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col; + av1_invalid_rd_stats(best_rd_stats); + + skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id], + DRY_RUN_NORMAL); + + uint8_t best_txb_ctx = 0; + // txk_allowed = TX_TYPES: >1 tx types are allowed + // txk_allowed < TX_TYPES: only that specific tx type is allowed. 
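+ // Added note: txk_map, initialized below, holds the evaluation order of the
+ // candidate transform types; entries that are TX_TYPE_INVALID or not set in
+ // the allowed mask are skipped by the search loop.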
+ TX_TYPE txk_allowed = TX_TYPES; + int txk_map[TX_TYPES] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; + const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift; + + const uint8_t txw = tx_size_wide[tx_size]; + const uint8_t txh = tx_size_high[tx_size]; + int64_t block_sse; + unsigned int block_mse_q8; + int dc_only_blk = 0; + const bool predict_dc_block = + txfm_params->predict_dc_level >= 1 && txw != 64 && txh != 64; + int64_t per_px_mean = INT64_MAX; + if (predict_dc_block) { + predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row, + blk_col, best_rd_stats, &block_sse, &block_mse_q8, + &per_px_mean, &dc_only_blk); + if (best_rd_stats->skip_txfm == 1) { + const TX_TYPE tx_type = DCT_DCT; + if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type; + return; + } + } else { + block_sse = av1_pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, + txsize_to_bsize[tx_size], &block_mse_q8); + assert(block_mse_q8 != UINT_MAX); + } + + // Bit mask to indicate which transform types are allowed in the RD search. + uint16_t tx_mask; + + // Use DCT_DCT transform for DC only block. + if (dc_only_blk || cpi->sf.rt_sf.dct_only_palette_nonrd == 1) + tx_mask = 1 << DCT_DCT; + else + tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize, + tx_size, txb_ctx, ftxs_mode, ref_best_rd, + &txk_allowed, txk_map); + const uint16_t allowed_tx_mask = tx_mask; + + if (is_cur_buf_hbd(xd)) { + block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); + block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2); + } + block_sse *= 16; + // Use mse / qstep^2 based threshold logic to take decision of R-D + // optimization of coeffs. For smaller residuals, coeff optimization + // would be helpful. For larger residuals, R-D optimization may not be + // effective. + // TODO(any): Experiment with variance and mean based thresholds + const int perform_block_coeff_opt = + ((uint64_t)block_mse_q8 <= + (uint64_t)txfm_params->coeff_opt_thresholds[0] * qstep * qstep); + skip_trellis |= !perform_block_coeff_opt; + + // Flag to indicate if distortion should be calculated in transform domain or + // not during iterating through transform type candidates. + // Transform domain distortion is accurate for higher residuals. + // TODO(any): Experiment with variance and mean based thresholds + int use_transform_domain_distortion = + (txfm_params->use_transform_domain_distortion > 0) && + (block_mse_q8 >= txfm_params->tx_domain_dist_threshold) && + // Any 64-pt transforms only preserves half the coefficients. + // Therefore transform domain distortion is not valid for these + // transform sizes. + (txsize_sqr_up_map[tx_size] != TX_64X64) && + // Use pixel domain distortion for DC only blocks + !dc_only_blk; + // Flag to indicate if an extra calculation of distortion in the pixel domain + // should be performed at the end, after the best transform type has been + // decided. 
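+ // Added note: doing this once for the winning type trades a single extra
+ // inverse transform for a distortion estimate that is more accurate than the
+ // per-candidate transform-domain value.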
+ int calc_pixel_domain_distortion_final = + txfm_params->use_transform_domain_distortion == 1 && + use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD; + if (calc_pixel_domain_distortion_final && + (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001)) + calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0; + + const uint16_t *eobs_ptr = x->plane[plane].eobs; + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + int skip_trellis_based_on_satd[TX_TYPES] = { 0 }; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, !skip_trellis, + skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B + : AV1_XFORM_QUANT_FP) + : AV1_XFORM_QUANT_FP, + cpi->oxcf.q_cfg.quant_b_adapt, &quant_param); + + // Iterate through all transform type candidates. + for (int idx = 0; idx < TX_TYPES; ++idx) { + const TX_TYPE tx_type = (TX_TYPE)txk_map[idx]; + if (tx_type == TX_TYPE_INVALID || !check_bit_mask(allowed_tx_mask, tx_type)) + continue; + txfm_param.tx_type = tx_type; + if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) { + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + } + if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type; + RD_STATS this_rd_stats; + av1_invalid_rd_stats(&this_rd_stats); + + if (!dc_only_blk) + av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param); + else + av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean); + + skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd( + x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt, + qstep, txfm_params->coeff_opt_thresholds[1], skip_trellis, dc_only_blk); + + av1_quant(x, plane, block, &txfm_param, &quant_param); + + // Calculate rate cost of quantized coefficients. + if (quant_param.use_optimize_b) { + // TODO(aomedia:3209): update Trellis quantization to take into account + // quantization matrices. + av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, + &rate_cost); + } else { + rate_cost = cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx, + cm->features.reduced_tx_set_used); + } + + // If rd cost based on coeff rate alone is already more than best_rd, + // terminate early. + if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue; + + // Calculate distortion. + if (eobs_ptr[block] == 0) { + // When eob is 0, pixel domain distortion is more efficient and accurate. + this_rd_stats.dist = this_rd_stats.sse = block_sse; + } else if (dc_only_blk) { + this_rd_stats.sse = block_sse; + this_rd_stats.dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + } else if (use_transform_domain_distortion) { + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &this_rd_stats.dist, + &this_rd_stats.sse); + } else { + int64_t sse_diff = INT64_MAX; + // high_energy threshold assumes that every pixel within a txfm block + // has a residue energy of at least 25% of the maximum, i.e. 128 * 128 + // for 8 bit. + const int64_t high_energy_thresh = + ((int64_t)128 * 128 * tx_size_2d[tx_size]); + const int is_high_energy = (block_sse >= high_energy_thresh); + if (tx_size == TX_64X64 || is_high_energy) { + // Because 3 out 4 quadrants of transform coefficients are forced to + // zero, the inverse transform has a tendency to overflow. 
sse_diff + // is effectively the energy of those 3 quadrants, here we use it + // to decide if we should do pixel domain distortion. If the energy + // is mostly in first quadrant, then it is unlikely that we have + // overflow issue in inverse transform. + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix, + scan_order->scan, &this_rd_stats.dist, + &this_rd_stats.sse); + sse_diff = block_sse - this_rd_stats.sse; + } + if (tx_size != TX_64X64 || !is_high_energy || + (sse_diff * 2) < this_rd_stats.sse) { + const int64_t tx_domain_dist = this_rd_stats.dist; + this_rd_stats.dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + // For high energy blocks, occasionally, the pixel domain distortion + // can be artificially low due to clamping at reconstruction stage + // even when inverse transform output is hugely different from the + // actual residue. + if (is_high_energy && this_rd_stats.dist < tx_domain_dist) + this_rd_stats.dist = tx_domain_dist; + } else { + assert(sse_diff < INT64_MAX); + this_rd_stats.dist += sse_diff; + } + this_rd_stats.sse = block_sse; + } + + this_rd_stats.rate = rate_cost; + + const int64_t rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + + if (rd < best_rd) { + best_rd = rd; + *best_rd_stats = this_rd_stats; + best_tx_type = tx_type; + best_txb_ctx = x->plane[plane].txb_entropy_ctx[block]; + best_eob = x->plane[plane].eobs[block]; + // Swap dqcoeff buffers + tran_low_t *const tmp_dqcoeff = best_dqcoeff; + best_dqcoeff = p->dqcoeff; + p->dqcoeff = tmp_dqcoeff; + } + +#if CONFIG_COLLECT_RD_STATS == 1 + if (plane == 0) { + PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col, + plane_bsize, tx_size, tx_type, rd); + } +#endif // CONFIG_COLLECT_RD_STATS == 1 + +#if COLLECT_TX_SIZE_DATA + // Generate small sample to restrict output size. + static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 200 == 0) { + FILE *fp = NULL; + + if (within_border) { + fp = fopen(av1_tx_size_data_output_file, "a"); + } + + if (fp) { + // Transform info and RD + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + + // Residue signal. + const int diff_stride = block_size_wide[plane_bsize]; + struct macroblock_plane *const p = &x->plane[plane]; + const int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; + + for (int r = 0; r < txb_h; ++r) { + for (int c = 0; c < txb_w; ++c) { + fprintf(fp, "%d,", src_diff[c]); + } + src_diff += diff_stride; + } + + fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd); + fprintf(fp, "\n"); + fclose(fp); + } + } +#endif // COLLECT_TX_SIZE_DATA + + // If the current best RD cost is much worse than the reference RD cost, + // terminate early. + if (cpi->sf.tx_sf.adaptive_txb_search_level) { + if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) > + ref_best_rd) { + break; + } + } + + // Terminate transform type search if the block has been quantized to + // all zero. 
+ if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break; + } + + assert(best_rd != INT64_MAX); + + best_rd_stats->skip_txfm = best_eob == 0; + if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type); + x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx; + x->plane[plane].eobs[block] = best_eob; + skip_trellis = skip_trellis_based_on_satd[best_tx_type]; + + // Point dqcoeff to the quantized coefficients corresponding to the best + // transform type, then we can skip transform and quantization, e.g. in the + // final pixel domain distortion calculation and recon_intra(). + p->dqcoeff = best_dqcoeff; + + if (calc_pixel_domain_distortion_final && best_eob) { + best_rd_stats->dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + best_rd_stats->sse = block_sse; + } + + // Intra mode needs decoded pixels such that the next transform block + // can use them for prediction. + recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob); + p->dqcoeff = orig_dqcoeff; +} + +// Pick transform type for a luma transform block of tx_size. Note this function +// is used only for inter-predicted blocks. +static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, + TX_SIZE tx_size, int blk_row, int blk_col, + int block, int plane_bsize, TXB_CTX *txb_ctx, + RD_STATS *rd_stats, + FAST_TX_SEARCH_MODE ftxs_mode, + int64_t ref_rdcost) { + assert(is_inter_block(x->e_mbd.mi[0])); + RD_STATS this_rd_stats; + const int skip_trellis = 0; + search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size, + txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats); + + av1_merge_rd_stats(rd_stats, &this_rd_stats); +} + +static AOM_INLINE void try_tx_block_no_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, + const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl, + int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, TxCandidateInfo *no_split) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[0]; + const int bw = mi_size_wide[plane_bsize]; + const ENTROPY_CONTEXT *const pta = ta + blk_col; + const ENTROPY_CONTEXT *const ptl = tl + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); + mbmi->inter_tx_size[index] = tx_size; + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, + rd_stats, ftxs_mode, ref_best_rd); + assert(rd_stats->rate < INT_MAX); + + const int pick_skip_txfm = + !xd->lossless[mbmi->segment_id] && + (rd_stats->skip_txfm == 1 || + RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse)); + if (pick_skip_txfm) { +#if CONFIG_RD_DEBUG + update_txb_coeff_cost(rd_stats, 0, zero_blk_rate - rd_stats->rate); +#endif // CONFIG_RD_DEBUG + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + p->eobs[block] = 0; + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + rd_stats->skip_txfm = pick_skip_txfm; + 
set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col, + pick_skip_txfm); + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->mode_costs.txfm_partition_cost[txfm_partition_ctx][0]; + + no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + no_split->txb_entropy_ctx = p->txb_entropy_ctx[block]; + no_split->tx_type = + xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; +} + +static AOM_INLINE void try_tx_block_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, RD_STATS *split_rd_stats) { + assert(tx_size < TX_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + const int txb_width = tx_size_wide_unit[tx_size]; + const int txb_height = tx_size_high_unit[tx_size]; + // Transform size after splitting current block. + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int sub_txb_width = tx_size_wide_unit[sub_txs]; + const int sub_txb_height = tx_size_high_unit[sub_txs]; + const int sub_step = sub_txb_width * sub_txb_height; + const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width); + assert(nblks > 0); + av1_init_rd_stats(split_rd_stats); + split_rd_stats->rate = + x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1]; + + for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) { + const int offsetr = blk_row + r; + if (offsetr >= max_blocks_high) break; + for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) { + assert(blk_idx < 4); + const int offsetc = blk_col + c; + if (offsetc >= max_blocks_wide) continue; + + RD_STATS this_rd_stats; + int this_cost_valid = 1; + select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, + plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats, + no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost, + &this_cost_valid, ftxs_mode); + if (!this_cost_valid) { + split_rd_stats->rdcost = INT64_MAX; + return; + } + av1_merge_rd_stats(split_rd_stats, &this_rd_stats); + split_rd_stats->rdcost = + RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); + if (split_rd_stats->rdcost > ref_best_rd) { + split_rd_stats->rdcost = INT64_MAX; + return; + } + block += sub_step; + } + } +} + +static float get_var(float mean, double x2_sum, int num) { + const float e_x2 = (float)(x2_sum / num); + const float diff = e_x2 - mean * mean; + return diff; +} + +static AOM_INLINE void get_blk_var_dev(const int16_t *data, int stride, int bw, + int bh, float *dev_of_mean, + float *var_of_vars) { + const int16_t *const data_ptr = &data[0]; + const int subh = (bh >= bw) ? (bh >> 1) : bh; + const int subw = (bw >= bh) ? 
(bw >> 1) : bw; + const int num = bw * bh; + const int sub_num = subw * subh; + int total_x_sum = 0; + int64_t total_x2_sum = 0; + int blk_idx = 0; + float var_sum = 0.0f; + float mean_sum = 0.0f; + double var2_sum = 0.0f; + double mean2_sum = 0.0f; + + for (int row = 0; row < bh; row += subh) { + for (int col = 0; col < bw; col += subw) { + int x_sum; + int64_t x2_sum; + aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh, + &x_sum, &x2_sum); + total_x_sum += x_sum; + total_x2_sum += x2_sum; + + const float mean = (float)x_sum / sub_num; + const float var = get_var(mean, (double)x2_sum, sub_num); + mean_sum += mean; + mean2_sum += (double)(mean * mean); + var_sum += var; + var2_sum += var * var; + blk_idx++; + } + } + + const float lvl0_mean = (float)total_x_sum / num; + const float block_var = get_var(lvl0_mean, (double)total_x2_sum, num); + mean_sum += lvl0_mean; + mean2_sum += (double)(lvl0_mean * lvl0_mean); + var_sum += block_var; + var2_sum += block_var * block_var; + const float av_mean = mean_sum / 5; + + if (blk_idx > 1) { + // Deviation of means. + *dev_of_mean = get_dev(av_mean, mean2_sum, (blk_idx + 1)); + // Variance of variances. + const float mean_var = var_sum / (blk_idx + 1); + *var_of_vars = get_var(mean_var, var2_sum, (blk_idx + 1)); + } +} + +static void prune_tx_split_no_split(MACROBLOCK *x, BLOCK_SIZE bsize, + int blk_row, int blk_col, TX_SIZE tx_size, + int *try_no_split, int *try_split, + int pruning_level) { + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = + x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + float dev_of_means = 0.0f; + float var_of_vars = 0.0f; + + // This function calculates the deviation of means, and the variance of pixel + // variances of the block as well as it's sub-blocks. + get_blk_var_dev(diff, diff_stride, bw, bh, &dev_of_means, &var_of_vars); + const int dc_q = x->plane[0].dequant_QTX[0] >> 3; + const int ac_q = x->plane[0].dequant_QTX[1] >> 3; + const int no_split_thresh_scales[4] = { 0, 24, 8, 8 }; + const int no_split_thresh_scale = no_split_thresh_scales[pruning_level]; + const int split_thresh_scales[4] = { 0, 24, 10, 8 }; + const int split_thresh_scale = split_thresh_scales[pruning_level]; + + if ((dev_of_means <= dc_q) && + (split_thresh_scale * var_of_vars <= ac_q * ac_q)) { + *try_split = 0; + } + if ((dev_of_means > no_split_thresh_scale * dc_q) && + (var_of_vars > no_split_thresh_scale * ac_q * ac_q)) { + *try_no_split = 0; + } +} + +// Search for the best transform partition(recursive)/type for a given +// inter-predicted luma block. The obtained transform selection will be saved +// in xd->mi[0], the corresponding RD stats will be saved in rd_stats. 
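+// Added note: prev_level_rd is the parent level's (per sub-block share of
+// the) no-split RD cost and is used by the adaptive txb search speed feature
+// to decide whether splitting further is worthwhile; *is_cost_valid is
+// cleared when the search is aborted because its cost already exceeds
+// ref_best_rd.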
+static AOM_INLINE void select_tx_block( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) { + assert(tx_size < TX_SIZES_ALL); + av1_init_rd_stats(rd_stats); + if (ref_best_rd < 0) { + *is_cost_valid = 0; + return; + } + + MACROBLOCKD *const xd = &x->e_mbd; + assert(blk_row < max_block_high(xd, plane_bsize, 0) && + blk_col < max_block_wide(xd, plane_bsize, 0)); + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->bsize, tx_size); + struct macroblock_plane *const p = &x->plane[0]; + + int try_no_split = (cpi->oxcf.txfm_cfg.enable_tx64 || + txsize_sqr_up_map[tx_size] != TX_64X64) && + (cpi->oxcf.txfm_cfg.enable_rect_tx || + tx_size_wide[tx_size] == tx_size_high[tx_size]); + int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; + TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES }; + + // Prune tx_split and no-split based on sub-block properties. + if (tx_size != TX_4X4 && try_split == 1 && try_no_split == 1 && + cpi->sf.tx_sf.prune_tx_size_level > 0) { + prune_tx_split_no_split(x, plane_bsize, blk_row, blk_col, tx_size, + &try_no_split, &try_split, + cpi->sf.tx_sf.prune_tx_size_level); + } + + if (cpi->sf.rt_sf.skip_tx_no_split_var_based_partition) { + if (x->try_merge_partition && try_split && p->eobs[block]) try_no_split = 0; + } + + // Try using current block as a single transform block without split. + if (try_no_split) { + try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd, + ftxs_mode, &no_split); + + // Speed features for early termination. + const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level; + if (search_level) { + if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) { + *is_cost_valid = 0; + return; + } + if (no_split.rd - (no_split.rd >> (2 + search_level)) > prev_level_rd) { + try_split = 0; + } + } + if (cpi->sf.tx_sf.txb_split_cap) { + if (p->eobs[block] == 0) try_split = 0; + } + } + + // ML based speed feature to skip searching for split transform blocks. + if (x->e_mbd.bd == 8 && try_split && + !(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX)) { + const int threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh; + if (threshold >= 0) { + const int split_score = + ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size); + if (split_score < -threshold) try_split = 0; + } + } + + RD_STATS split_rd_stats; + split_rd_stats.rdcost = INT64_MAX; + // Try splitting current block into smaller transform blocks. 
+ if (try_split) { + try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd, + AOMMIN(no_split.rd, ref_best_rd), ftxs_mode, + &split_rd_stats); + } + + if (no_split.rd < split_rd_stats.rdcost) { + ENTROPY_CONTEXT *pta = ta + blk_col; + ENTROPY_CONTEXT *ptl = tl + blk_row; + p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx; + av1_set_txb_context(x, 0, block, tx_size, pta, ptl); + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, + tx_size); + for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { + for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { + const int index = + av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx); + mbmi->inter_tx_size[index] = tx_size; + } + } + mbmi->tx_size = tx_size; + update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type); + const int bw = mi_size_wide[plane_bsize]; + set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col, + rd_stats->skip_txfm); + } else { + *rd_stats = split_rd_stats; + if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0; + } +} + +static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi, + MACROBLOCK *x, RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + mbmi->tx_size = tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type); + + // If tx64 is not enabled, we need to go down to the next available size + if (!cpi->oxcf.txfm_cfg.enable_tx64 && cpi->oxcf.txfm_cfg.enable_rect_tx) { + static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_32X32, // 64x64 transform + TX_4X8, // 4x8 transform + TX_8X4, // 8x4 transform + TX_8X16, // 8x16 transform + TX_16X8, // 16x8 transform + TX_16X32, // 16x32 transform + TX_32X16, // 32x16 transform + TX_32X32, // 32x64 transform + TX_32X32, // 64x32 transform + TX_4X16, // 4x16 transform + TX_16X4, // 16x4 transform + TX_8X32, // 8x32 transform + TX_32X8, // 32x8 transform + TX_16X32, // 16x64 transform + TX_32X16, // 64x16 transform + }; + mbmi->tx_size = tx_size_max_32[mbmi->tx_size]; + } else if (cpi->oxcf.txfm_cfg.enable_tx64 && + !cpi->oxcf.txfm_cfg.enable_rect_tx) { + static const TX_SIZE tx_size_max_square[TX_SIZES_ALL] = { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_64X64, // 64x64 transform + TX_4X4, // 4x8 transform + TX_4X4, // 8x4 transform + TX_8X8, // 8x16 transform + TX_8X8, // 16x8 transform + TX_16X16, // 16x32 transform + TX_16X16, // 32x16 transform + TX_32X32, // 32x64 transform + TX_32X32, // 64x32 transform + TX_4X4, // 4x16 transform + TX_4X4, // 16x4 transform + TX_8X8, // 8x32 transform + TX_8X8, // 32x8 transform + TX_16X16, // 16x64 transform + TX_16X16, // 64x16 transform + }; + mbmi->tx_size = tx_size_max_square[mbmi->tx_size]; + } else if (!cpi->oxcf.txfm_cfg.enable_tx64 && + !cpi->oxcf.txfm_cfg.enable_rect_tx) { + static const TX_SIZE tx_size_max_32_square[TX_SIZES_ALL] = { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_32X32, // 64x64 transform + TX_4X4, // 4x8 transform + TX_4X4, // 8x4 transform + TX_8X8, // 8x16 transform + TX_8X8, // 16x8 transform + TX_16X16, // 16x32 transform + 
TX_16X16, // 32x16 transform + TX_32X32, // 32x64 transform + TX_32X32, // 64x32 transform + TX_4X4, // 4x16 transform + TX_4X4, // 16x4 transform + TX_8X8, // 8x32 transform + TX_8X8, // 32x8 transform + TX_16X16, // 16x64 transform + TX_16X16, // 64x16 transform + }; + + mbmi->tx_size = tx_size_max_32_square[mbmi->tx_size]; + } + + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0]; + const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1]; + // Skip RDcost is used only for Inter blocks + const int64_t skip_txfm_rd = + is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX; + const int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_rate, 0); + const int skip_trellis = 0; + av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, + AOMMIN(no_skip_txfm_rd, skip_txfm_rd), AOM_PLANE_Y, bs, + mbmi->tx_size, FTXS_NONE, skip_trellis); +} + +static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi, + MACROBLOCK *x, + RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + mbmi->tx_size = TX_4X4; + // TODO(any) : Pass this_rd based on skip/non-skip cost + const int skip_trellis = 0; + av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size, + FTXS_NONE, skip_trellis); +} + +#if !CONFIG_REALTIME_ONLY +static void ml_predict_intra_tx_depth_prune(MACROBLOCK *x, int blk_row, + int blk_col, BLOCK_SIZE bsize, + TX_SIZE tx_size) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + // Disable the pruning logic using NN model for the following cases: + // 1) Lossless coding as only 4x4 transform is evaluated in this case + // 2) When transform and current block sizes do not match as the features are + // obtained over the current block + // 3) When operating bit-depth is not 8-bit as the input features are not + // scaled according to bit-depth. + if (xd->lossless[mbmi->segment_id] || txsize_to_bsize[tx_size] != bsize || + xd->bd != 8) + return; + + // Currently NN model based pruning is supported only when largest transform + // size is 8x8 + if (tx_size != TX_8X8) return; + + // Neural network model is a sequential neural net and was trained using SGD + // optimizer. The model can be further improved in terms of speed/quality by + // considering the following experiments: + // 1) Generate ML model by training with balanced data for different learning + // rates and optimizers. + // 2) Experiment with ML model by adding features related to the statistics of + // top and left pixels to capture the accuracy of reconstructed neighbouring + // pixels for 4x4 blocks numbered 1, 2, 3 in 8x8 block, source variance of 4x4 + // sub-blocks, etc. + // 3) Generate ML models for transform blocks other than 8x8. 
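+  // The NN input assembled below consists of the per-sub-block mean/deviation
+  // statistics of the prediction residual, log1pf() of the block source
+  // variance and log1pf() of the squared DC quantizer step (scaled down by
+  // 256), each normalized with the av1_intra_tx_split_8x8_mean/_std tables
+  // before the network is evaluated.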
+ const NN_CONFIG *const nn_config = &av1_intra_tx_split_nnconfig_8x8; + const float *const intra_tx_prune_thresh = av1_intra_tx_prune_nn_thresh_8x8; + + float features[NUM_INTRA_TX_SPLIT_FEATURES] = { 0.0f }; + const int diff_stride = block_size_wide[bsize]; + + const int16_t *diff = x->plane[0].src_diff + MI_SIZE * blk_row * diff_stride + + MI_SIZE * blk_col; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + int feature_idx = get_mean_dev_features(diff, diff_stride, bw, bh, features); + + features[feature_idx++] = log1pf((float)x->source_variance); + + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + const float log_dc_q_square = log1pf((float)(dc_q * dc_q) / 256.0f); + features[feature_idx++] = log_dc_q_square; + assert(feature_idx == NUM_INTRA_TX_SPLIT_FEATURES); + for (int i = 0; i < NUM_INTRA_TX_SPLIT_FEATURES; i++) { + features[i] = (features[i] - av1_intra_tx_split_8x8_mean[i]) / + av1_intra_tx_split_8x8_std[i]; + } + + float score; + av1_nn_predict(features, nn_config, 1, &score); + + TxfmSearchParams *const txfm_params = &x->txfm_search_params; + if (score <= intra_tx_prune_thresh[0]) + txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_SPLIT; + else if (score > intra_tx_prune_thresh[1]) + txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_LARGEST; +} +#endif // !CONFIG_REALTIME_ONLY + +// Search for the best uniform transform size and type for current coding block. +static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, + MACROBLOCK *x, + RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + av1_invalid_rd_stats(rd_stats); + + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + TxfmSearchParams *const txfm_params = &x->txfm_search_params; + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs]; + const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT; + int start_tx; + // The split depth can be at most MAX_TX_DEPTH, so the init_depth controls + // how many times of splitting is allowed during the RD search. 
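+  // For example, with MAX_TX_DEPTH equal to 2 (consistent with the
+  // three-entry rd[] initializer below), a 32x32 block searched from depth 0
+  // would step through TX_32X32, TX_16X16 and TX_8X8 via sub_tx_size_map[].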
+ int init_depth; + + if (tx_select) { + start_tx = max_rect_tx_size; + init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs], + is_inter_block(mbmi), &cpi->sf, + txfm_params->tx_size_search_method); + if (init_depth == MAX_TX_DEPTH && !cpi->oxcf.txfm_cfg.enable_tx64 && + txsize_sqr_up_map[start_tx] == TX_64X64) { + start_tx = sub_tx_size_map[start_tx]; + } + } else { + const TX_SIZE chosen_tx_size = + tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type); + start_tx = chosen_tx_size; + init_depth = MAX_TX_DEPTH; + } + + const int skip_trellis = 0; + uint8_t best_txk_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + TX_SIZE best_tx_size = max_rect_tx_size; + int64_t best_rd = INT64_MAX; + const int num_blks = bsize_to_num_blk(bs); + x->rd_model = FULL_TXFM_RD; + int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX }; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH; + depth++, tx_size = sub_tx_size_map[tx_size]) { + if ((!cpi->oxcf.txfm_cfg.enable_tx64 && + txsize_sqr_up_map[tx_size] == TX_64X64) || + (!cpi->oxcf.txfm_cfg.enable_rect_tx && + tx_size_wide[tx_size] != tx_size_high[tx_size])) { + continue; + } + +#if !CONFIG_REALTIME_ONLY + if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_SPLIT) break; + + // Set the flag to enable the evaluation of NN classifier to prune transform + // depths. As the features are based on intra residual information of + // largest transform, the evaluation of NN model is enabled only for this + // case. + txfm_params->enable_nn_prune_intra_tx_depths = + (cpi->sf.tx_sf.prune_intra_tx_depths_using_nn && tx_size == start_tx); +#endif + + RD_STATS this_rd_stats; + // When the speed feature use_rd_based_breakout_for_intra_tx_search is + // enabled, use the known minimum best_rd for early termination. + const int64_t rd_thresh = + cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search + ? AOMMIN(ref_best_rd, best_rd) + : ref_best_rd; + rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, rd_thresh, bs, + tx_size, FTXS_NONE, skip_trellis); + if (rd[depth] < best_rd) { + av1_copy_array(best_blk_skip, txfm_info->blk_skip, num_blks); + av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks); + best_tx_size = tx_size; + best_rd = rd[depth]; + *rd_stats = this_rd_stats; + } + if (tx_size == TX_4X4) break; + // If we are searching three depths, prune the smallest size depending + // on rd results for the first two depths for low contrast blocks. + if (depth > init_depth && depth != MAX_TX_DEPTH && + x->source_variance < 256) { + if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break; + } + } + + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks); + av1_copy_array(txfm_info->blk_skip, best_blk_skip, num_blks); + } + +#if !CONFIG_REALTIME_ONLY + // Reset the flags to avoid any unintentional evaluation of NN model and + // consumption of prune depths. + txfm_params->enable_nn_prune_intra_tx_depths = false; + txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_NONE; +#endif +} + +// Search for the best transform type for the given transform block in the +// given plane/channel, and calculate the corresponding RD cost. 
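+// block_rd_txfm() is the per-transform-block callback that
+// av1_txfm_rd_in_plane() passes to av1_foreach_transformed_block_in_plane();
+// accumulated RD stats and the early-exit state are carried through the
+// rdcost_block_args argument.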
+static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct rdcost_block_args *args = arg; + if (args->exit_early) { + args->incomplete_exit = 1; + return; + } + + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + const int is_inter = is_inter_block(xd->mi[0]); + const AV1_COMP *cpi = args->cpi; + ENTROPY_CONTEXT *a = args->t_above + blk_col; + ENTROPY_CONTEXT *l = args->t_left + blk_row; + const AV1_COMMON *cm = &cpi->common; + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); + + if (!is_inter) { + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); +#if !CONFIG_REALTIME_ONLY + const TxfmSearchParams *const txfm_params = &x->txfm_search_params; + if (txfm_params->enable_nn_prune_intra_tx_depths) { + ml_predict_intra_tx_depth_prune(x, blk_row, blk_col, plane_bsize, + tx_size); + if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_LARGEST) { + av1_invalid_rd_stats(&args->rd_stats); + args->exit_early = 1; + return; + } + } +#endif + } + + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + &txb_ctx, args->ftxs_mode, args->skip_trellis, + args->best_rd - args->current_rd, &this_rd_stats); + + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { + assert(!is_inter || plane_bsize < BLOCK_8X8); + cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); + } + +#if CONFIG_RD_DEBUG + update_txb_coeff_cost(&this_rd_stats, plane, this_rd_stats.rate); +#endif // CONFIG_RD_DEBUG + av1_set_txb_context(x, plane, block, tx_size, a, l); + + const int blk_idx = + blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col; + + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + if (plane == 0) + set_blk_skip(txfm_info->blk_skip, plane, blk_idx, + x->plane[plane].eobs[block] == 0); + else + set_blk_skip(txfm_info->blk_skip, plane, blk_idx, 0); + + int64_t rd; + if (is_inter) { + const int64_t no_skip_txfm_rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + const int64_t skip_txfm_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse); + rd = AOMMIN(no_skip_txfm_rd, skip_txfm_rd); + this_rd_stats.skip_txfm &= !x->plane[plane].eobs[block]; + } else { + // Signal non-skip_txfm for Intra blocks + rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + this_rd_stats.skip_txfm = 0; + } + + av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); + + args->current_rd += rd; + if (args->current_rd > args->best_rd) args->exit_early = 1; +} + +int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + const ModeCosts *mode_costs = &x->mode_costs; + const int is_inter = is_inter_block(mbmi); + const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT && + block_signals_txsize(mbmi->bsize); + int tx_size_rate = 0; + if (tx_select) { + const int ctx = txfm_partition_context( + xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size); + tx_size_rate = mode_costs->txfm_partition_cost[ctx][0]; + } + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0]; + const 
int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1]; + const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, 0); + const int64_t no_this_rd = + RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0); + mbmi->tx_size = tx_size; + + const uint8_t txw_unit = tx_size_wide_unit[tx_size]; + const uint8_t txh_unit = tx_size_high_unit[tx_size]; + const int step = txw_unit * txh_unit; + const int max_blocks_wide = max_block_wide(xd, bs, 0); + const int max_blocks_high = max_block_high(xd, bs, 0); + + struct rdcost_block_args args; + av1_zero(args); + args.x = x; + args.cpi = cpi; + args.best_rd = ref_best_rd; + args.current_rd = AOMMIN(no_this_rd, skip_txfm_rd); + av1_init_rd_stats(&args.rd_stats); + av1_get_entropy_contexts(bs, &xd->plane[0], args.t_above, args.t_left); + int i = 0; + for (int blk_row = 0; blk_row < max_blocks_high && !args.incomplete_exit; + blk_row += txh_unit) { + for (int blk_col = 0; blk_col < max_blocks_wide; blk_col += txw_unit) { + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); + + if (args.exit_early) { + args.incomplete_exit = 1; + break; + } + + ENTROPY_CONTEXT *a = args.t_above + blk_col; + ENTROPY_CONTEXT *l = args.t_left + blk_row; + TXB_CTX txb_ctx; + get_txb_ctx(bs, tx_size, 0, a, l, &txb_ctx); + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + av1_setup_xform(&cpi->common, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, 0, &quant_param); + + av1_xform(x, 0, i, blk_row, blk_col, bs, &txfm_param); + av1_quant(x, 0, i, &txfm_param, &quant_param); + + this_rd_stats.rate = + cost_coeffs(x, 0, i, tx_size, txfm_param.tx_type, &txb_ctx, 0); + + const SCAN_ORDER *const scan_order = + get_scan(txfm_param.tx_size, txfm_param.tx_type); + dist_block_tx_domain(x, 0, i, tx_size, quant_param.qmatrix, + scan_order->scan, &this_rd_stats.dist, + &this_rd_stats.sse); + + const int64_t no_skip_txfm_rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse); + + this_rd_stats.skip_txfm &= !x->plane[0].eobs[i]; + + av1_merge_rd_stats(&args.rd_stats, &this_rd_stats); + args.current_rd += AOMMIN(no_skip_txfm_rd, skip_rd); + + if (args.current_rd > ref_best_rd) { + args.exit_early = 1; + break; + } + + av1_set_txb_context(x, 0, i, tx_size, a, l); + i += step; + } + } + + if (args.incomplete_exit) av1_invalid_rd_stats(&args.rd_stats); + + *rd_stats = args.rd_stats; + if (rd_stats->rate == INT_MAX) return INT64_MAX; + + int64_t rd; + // rdstats->rate should include all the rate except skip/non-skip cost as the + // same is accounted in the caller functions after rd evaluation of all + // planes. However the decisions should be done after considering the + // skip/non-skip header cost + if (rd_stats->skip_txfm && is_inter) { + rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); + } else { + // Intra blocks are always signalled as non-skip + rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate, + rd_stats->dist); + rd_stats->rate += tx_size_rate; + } + // Check if forcing the block to skip transform leads to smaller RD cost. 
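+  // This check only applies to inter, non-lossless blocks; intra blocks have
+  // already been signalled as non-skip above.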
+ if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) { + int64_t temp_skip_txfm_rd = + RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); + if (temp_skip_txfm_rd <= rd) { + rd = temp_skip_txfm_rd; + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip_txfm = 1; + } + } + + return rd; +} + +int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) { + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + const ModeCosts *mode_costs = &x->mode_costs; + const int is_inter = is_inter_block(mbmi); + const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT && + block_signals_txsize(mbmi->bsize); + int tx_size_rate = 0; + if (tx_select) { + const int ctx = txfm_partition_context( + xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size); + tx_size_rate = is_inter ? mode_costs->txfm_partition_cost[ctx][0] + : tx_size_cost(x, bs, tx_size); + } + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0]; + const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1]; + const int64_t skip_txfm_rd = + is_inter ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX; + const int64_t no_this_rd = + RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0); + + mbmi->tx_size = tx_size; + av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, + AOMMIN(no_this_rd, skip_txfm_rd), AOM_PLANE_Y, bs, + tx_size, ftxs_mode, skip_trellis); + if (rd_stats->rate == INT_MAX) return INT64_MAX; + + int64_t rd; + // rdstats->rate should include all the rate except skip/non-skip cost as the + // same is accounted in the caller functions after rd evaluation of all + // planes. However the decisions should be done after considering the + // skip/non-skip header cost + if (rd_stats->skip_txfm && is_inter) { + rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); + } else { + // Intra blocks are always signalled as non-skip + rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate, + rd_stats->dist); + rd_stats->rate += tx_size_rate; + } + // Check if forcing the block to skip transform leads to smaller RD cost. + if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) { + int64_t temp_skip_txfm_rd = + RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); + if (temp_skip_txfm_rd <= rd) { + rd = temp_skip_txfm_rd; + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip_txfm = 1; + } + } + + return rd; +} + +// Search for the best transform type for a luma inter-predicted block, given +// the transform block partitions. +// This function is used only when some speed features are enabled. 
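+// The recursion follows the transform partitioning recorded in
+// mbmi->inter_tx_size[]: when the current tx_size matches the stored size the
+// block is rate-distortion searched (and may be forced to an all-zero block
+// if that is cheaper); otherwise the function descends into the sub transform
+// blocks given by sub_tx_size_map[].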
+static AOM_INLINE void tx_block_yrd( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth, + ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx, + TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd, + RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) { + assert(tx_size < TX_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(is_inter_block(mbmi)); + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( + plane_bsize, blk_row, blk_col)]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->bsize, tx_size); + + av1_init_rd_stats(rd_stats); + if (tx_size == plane_tx_size) { + ENTROPY_CONTEXT *ta = above_ctx + blk_col; + ENTROPY_CONTEXT *tl = left_ctx + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx); + + const int zero_blk_rate = + x->coeff_costs.coeff_costs[txs_ctx][get_plane_type(0)] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, + rd_stats, ftxs_mode, ref_best_rd); + const int mi_width = mi_size_wide[plane_bsize]; + TxfmSearchInfo *txfm_info = &x->txfm_search_info; + if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip_txfm == 1) { + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip_txfm = 1; + set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 1); + x->plane[0].eobs[block] = 0; + x->plane[0].txb_entropy_ctx[block] = 0; + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } else { + rd_stats->skip_txfm = 0; + set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 0); + } + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][0]; + av1_set_txb_context(x, 0, block, tx_size, ta, tl); + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, + tx_size); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int txb_width = tx_size_wide_unit[sub_txs]; + const int txb_height = tx_size_high_unit[sub_txs]; + const int step = txb_height * txb_width; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + RD_STATS pn_rd_stats; + int64_t this_rd = 0; + assert(txb_width > 0 && txb_height > 0); + + for (int row = 0; row < row_end; row += txb_height) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += txb_width) { + const int offsetc = blk_col + col; + + av1_init_rd_stats(&pn_rd_stats); + tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize, + depth + 1, above_ctx, left_ctx, tx_above, tx_left, + ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist); + block += step; + } + } + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + 
rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][1]; + } +} + +// search for tx type with tx sizes already decided for a inter-predicted luma +// partition block. It's used only when some speed features are enabled. +// Return value 0: early termination triggered, no valid rd cost available; +// 1: rd cost values are valid. +static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) { + if (ref_best_rd < 0) { + av1_invalid_rd_stats(rd_stats); + return 0; + } + + av1_init_rd_stats(rd_stats); + + MACROBLOCKD *const xd = &x->e_mbd; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + const struct macroblockd_plane *const pd = &xd->plane[0]; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0); + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int step = bw * bh; + const int init_depth = get_search_init_depth( + mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method); + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); + memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); + memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + + int64_t this_rd = 0; + for (int idy = 0, block = 0; idy < mi_height; idy += bh) { + for (int idx = 0; idx < mi_width; idx += bw) { + RD_STATS pn_rd_stats; + av1_init_rd_stats(&pn_rd_stats); + tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, bsize, init_depth, + ctxa, ctxl, tx_above, tx_left, ref_best_rd - this_rd, + &pn_rd_stats, ftxs_mode); + if (pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return 0; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + this_rd += + AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist), + RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse)); + block += step; + } + } + + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0]; + const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1]; + const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse); + this_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate, rd_stats->dist); + if (skip_txfm_rd < this_rd) { + this_rd = skip_txfm_rd; + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip_txfm = 1; + } + + const int is_cost_valid = this_rd > ref_best_rd; + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + return is_cost_valid; +} + +// Search for the best transform size and type for current inter-predicted +// luma block with recursive transform block partitioning. The obtained +// transform selection will be saved in xd->mi[0], the corresponding RD stats +// will be saved in rd_stats. The returned value is the corresponding RD cost. 
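+// Each max-sized transform unit of the block is handled by select_tx_block(),
+// which recursively decides whether to keep the unit whole or split it
+// further; the running skip/non-skip RD of the units processed so far is used
+// to tighten the threshold passed to later units.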
+static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + assert(is_inter_block(xd->mi[0])); + assert(bsize < BLOCK_SIZES_ALL); + const int fast_tx_search = txfm_params->tx_size_search_method > USE_FULL_RD; + int64_t rd_thresh = ref_best_rd; + if (rd_thresh == 0) { + av1_invalid_rd_stats(rd_stats); + return INT64_MAX; + } + if (fast_tx_search && rd_thresh < INT64_MAX) { + if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3); + } + assert(rd_thresh > 0); + const FAST_TX_SEARCH_MODE ftxs_mode = + fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE; + const struct macroblockd_plane *const pd = &xd->plane[0]; + assert(bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); + memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); + memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + const int init_depth = get_search_init_depth( + mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method); + const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int step = bw * bh; + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int no_skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][0]; + const int skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][1]; + int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0); + int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0); + int block = 0; + + av1_init_rd_stats(rd_stats); + for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) { + for (int idx = 0; idx < max_block_wide(xd, bsize, 0); idx += bw) { + const int64_t best_rd_sofar = + (rd_thresh == INT64_MAX) + ? INT64_MAX + : (rd_thresh - (AOMMIN(skip_txfm_rd, no_skip_txfm_rd))); + int is_cost_valid = 1; + RD_STATS pn_rd_stats; + // Search for the best transform block size and type for the sub-block. + select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize, + ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX, + best_rd_sofar, &is_cost_valid, ftxs_mode); + if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return INT64_MAX; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse); + no_skip_txfm_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist); + block += step; + } + } + + if (rd_stats->rate == INT_MAX) return INT64_MAX; + + rd_stats->skip_txfm = (skip_txfm_rd <= no_skip_txfm_rd); + + // If fast_tx_search is true, only DCT and 1D DCT were tested in + // select_inter_block_yrd() above. Do a better search for tx type with + // tx sizes already decided. 
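+  // (inter_block_yrd() keeps the transform sizes selected above and
+  // re-searches only the transform types, this time with FTXS_NONE.)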
+ if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) { + if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE)) + return INT64_MAX; + } + + int64_t final_rd; + if (rd_stats->skip_txfm) { + final_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse); + } else { + final_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist); + if (!xd->lossless[xd->mi[0]->segment_id]) { + final_rd = + AOMMIN(final_rd, RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse)); + } + } + + return final_rd; +} + +// Return 1 to terminate transform search early. The decision is made based on +// the comparison with the reference RD cost and the model-estimated RD cost. +static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi, + MACROBLOCK *x, + BLOCK_SIZE bsize, + int64_t ref_best_rd) { + const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level; + assert(level >= 0 && level <= 2); + int model_rate; + int64_t model_dist; + uint8_t model_skip; + MACROBLOCKD *const xd = &x->e_mbd; + model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE]( + cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL, + NULL, NULL, NULL); + if (model_skip) return 0; + const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist); + // TODO(debargha, urvang): Improve the model and make the check below + // tighter. + static const int prune_factor_by8[] = { 3, 5 }; + const int factor = prune_factor_by8[level - 1]; + return ((model_rd * factor) >> 3) > ref_best_rd; +} + +void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + const TxfmSearchParams *txfm_params = &x->txfm_search_params; + assert(is_inter_block(xd->mi[0])); + + av1_invalid_rd_stats(rd_stats); + + // If modeled RD cost is a lot worse than the best so far, terminate early. + if (cpi->sf.tx_sf.model_based_prune_tx_search_level && + ref_best_rd != INT64_MAX) { + if (model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) return; + } + + // Hashing based speed feature. If the hash of the prediction residue block is + // found in the hash table, use previous search results and terminate early. + uint32_t hash = 0; + MB_RD_RECORD *mb_rd_record = NULL; + const int mi_row = x->e_mbd.mi_row; + const int mi_col = x->e_mbd.mi_col; + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end); + const int is_mb_rd_hash_enabled = + (within_border && cpi->sf.rd_sf.use_mb_rd_hash); + const int n4 = bsize_to_num_blk(bsize); + if (is_mb_rd_hash_enabled) { + hash = get_block_residue_hash(x, bsize); + mb_rd_record = x->txfm_search_info.mb_rd_record; + const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); + if (match_index != -1) { + MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index]; + fetch_mb_rd_info(n4, mb_rd_info, rd_stats, x); + return; + } + } + + // If we predict that skip is the optimal RD decision - set the respective + // context and terminate early. + int64_t dist; + if (txfm_params->skip_txfm_level && + predict_skip_txfm(x, bsize, &dist, + cpi->common.features.reduced_tx_set_used)) { + set_skip_txfm(x, rd_stats, bsize, dist); + // Save the RD search results into mb_rd_record. 
+ if (is_mb_rd_hash_enabled) + save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record); + return; + } +#if CONFIG_SPEED_STATS + ++x->txfm_search_info.tx_search_count; +#endif // CONFIG_SPEED_STATS + + const int64_t rd = + select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd); + + if (rd == INT64_MAX) { + // We should always find at least one candidate unless ref_best_rd is less + // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type + // might have failed to find something better) + assert(ref_best_rd != INT64_MAX); + av1_invalid_rd_stats(rd_stats); + return; + } + + // Save the RD search results into mb_rd_record. + if (is_mb_rd_hash_enabled) { + assert(mb_rd_record != NULL); + save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record); + } +} + +void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bs, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const TxfmSearchParams *tx_params = &x->txfm_search_params; + assert(bs == mbmi->bsize); + const int is_inter = is_inter_block(mbmi); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + av1_init_rd_stats(rd_stats); + + // Hashing based speed feature for inter blocks. If the hash of the residue + // block is found in the table, use previously saved search results and + // terminate early. + uint32_t hash = 0; + MB_RD_RECORD *mb_rd_record = NULL; + const int num_blks = bsize_to_num_blk(bs); + if (is_inter && cpi->sf.rd_sf.use_mb_rd_hash) { + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end); + if (within_border) { + hash = get_block_residue_hash(x, bs); + mb_rd_record = x->txfm_search_info.mb_rd_record; + const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); + if (match_index != -1) { + MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index]; + fetch_mb_rd_info(num_blks, mb_rd_info, rd_stats, x); + return; + } + } + } + + // If we predict that skip is the optimal RD decision - set the respective + // context and terminate early. + int64_t dist; + if (tx_params->skip_txfm_level && is_inter && + !xd->lossless[mbmi->segment_id] && + predict_skip_txfm(x, bs, &dist, + cpi->common.features.reduced_tx_set_used)) { + // Populate rdstats as per skip decision + set_skip_txfm(x, rd_stats, bs, dist); + // Save the RD search results into mb_rd_record. + if (mb_rd_record) { + save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record); + } + return; + } + + if (xd->lossless[mbmi->segment_id]) { + // Lossless mode can only pick the smallest (4x4) transform size. + choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else if (tx_params->tx_size_search_method == USE_LARGESTALL) { + choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else { + choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs); + } + + // Save the RD search results into mb_rd_record for possible reuse in future. 
+ if (mb_rd_record) { + save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record); + } +} + +int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t ref_best_rd) { + av1_init_rd_stats(rd_stats); + if (ref_best_rd < 0) return 0; + if (!x->e_mbd.is_chroma_ref) return 1; + + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U]; + const int is_inter = is_inter_block(mbmi); + int64_t this_rd = 0, skip_txfm_rd = 0; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + if (is_inter) { + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) + av1_subtract_plane(x, plane_bsize, plane); + } + + const int skip_trellis = 0; + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + int is_cost_valid = 1; + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) { + RD_STATS this_rd_stats; + int64_t chroma_ref_best_rd = ref_best_rd; + // For inter blocks, refined ref_best_rd is used for early exit + // For intra blocks, even though current rd crosses ref_best_rd, early + // exit is not recommended as current rd is used for gating subsequent + // modes as well (say, for angular modes) + // TODO(any): Extend the early exit mechanism for intra modes as well + if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter && + chroma_ref_best_rd != INT64_MAX) + chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_txfm_rd); + av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane, + plane_bsize, uv_tx_size, FTXS_NONE, skip_trellis); + if (this_rd_stats.rate == INT_MAX) { + is_cost_valid = 0; + break; + } + av1_merge_rd_stats(rd_stats, &this_rd_stats); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + skip_txfm_rd = RDCOST(x->rdmult, 0, rd_stats->sse); + if (AOMMIN(this_rd, skip_txfm_rd) > ref_best_rd) { + is_cost_valid = 0; + break; + } + } + + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + + return is_cost_valid; +} + +void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, + RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t current_rd, int plane, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode, + int skip_trellis) { + assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size)); + + if (!cpi->oxcf.txfm_cfg.enable_tx64 && + txsize_sqr_up_map[tx_size] == TX_64X64) { + av1_invalid_rd_stats(rd_stats); + return; + } + + if (current_rd > ref_best_rd) { + av1_invalid_rd_stats(rd_stats); + return; + } + + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + struct rdcost_block_args args; + av1_zero(args); + args.x = x; + args.cpi = cpi; + args.best_rd = ref_best_rd; + args.current_rd = current_rd; + args.ftxs_mode = ftxs_mode; + args.skip_trellis = skip_trellis; + av1_init_rd_stats(&args.rd_stats); + + av1_get_entropy_contexts(plane_bsize, pd, args.t_above, args.t_left); + av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, block_rd_txfm, + &args); + + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int invalid_rd = is_inter ? 
args.incomplete_exit : args.exit_early; + + if (invalid_rd) { + av1_invalid_rd_stats(rd_stats); + } else { + *rd_stats = args.rd_stats; + } +} + +int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + TxfmSearchParams *txfm_params = &x->txfm_search_params; + const int skip_ctx = av1_get_skip_txfm_context(xd); + const int skip_txfm_cost[2] = { x->mode_costs.skip_txfm_cost[skip_ctx][0], + x->mode_costs.skip_txfm_cost[skip_ctx][1] }; + const int64_t min_header_rate = + mode_rate + AOMMIN(skip_txfm_cost[0], skip_txfm_cost[1]); + // Account for minimum skip and non_skip rd. + // Eventually either one of them will be added to mode_rate + const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0); + if (min_header_rd_possible > ref_best_rd) { + av1_invalid_rd_stats(rd_stats_y); + return 0; + } + + const AV1_COMMON *cm = &cpi->common; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0); + const int64_t rd_thresh = + ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd; + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_y); + rd_stats->rate = mode_rate; + + // cost and distortion + av1_subtract_plane(x, bsize, 0); + if (txfm_params->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id]) { + av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); +#if CONFIG_COLLECT_RD_STATS == 2 + PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 2 + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->height * xd->width; ++i) + set_blk_skip(x->txfm_search_info.blk_skip, 0, i, rd_stats_y->skip_txfm); + } + + if (rd_stats_y->rate == INT_MAX) return 0; + + av1_merge_rd_stats(rd_stats, rd_stats_y); + + const int64_t non_skip_txfm_rdcosty = + RDCOST(x->rdmult, rd_stats->rate + skip_txfm_cost[0], rd_stats->dist); + const int64_t skip_txfm_rdcosty = + RDCOST(x->rdmult, mode_rate + skip_txfm_cost[1], rd_stats->sse); + const int64_t min_rdcosty = AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty); + if (min_rdcosty > ref_best_rd) return 0; + + av1_init_rd_stats(rd_stats_uv); + const int num_planes = av1_num_planes(cm); + if (num_planes > 1) { + int64_t ref_best_chroma_rd = ref_best_rd; + // Calculate best rd cost possible for chroma + if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && + (ref_best_chroma_rd != INT64_MAX)) { + ref_best_chroma_rd = (ref_best_chroma_rd - + AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty)); + } + const int is_cost_valid_uv = + av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd); + if (!is_cost_valid_uv) return 0; + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } + + int choose_skip_txfm = rd_stats->skip_txfm; + if (!choose_skip_txfm && !xd->lossless[mbmi->segment_id]) { + const int64_t rdcost_no_skip_txfm = RDCOST( + x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_txfm_cost[0], + rd_stats->dist); + const int64_t rdcost_skip_txfm = + RDCOST(x->rdmult, skip_txfm_cost[1], rd_stats->sse); + if (rdcost_no_skip_txfm >= rdcost_skip_txfm) choose_skip_txfm = 1; + } + if (choose_skip_txfm) { + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->rate = mode_rate + skip_txfm_cost[1]; + rd_stats->dist = 
rd_stats->sse; + rd_stats_y->dist = rd_stats_y->sse; + rd_stats_uv->dist = rd_stats_uv->sse; + mbmi->skip_txfm = 1; + if (rd_stats->skip_txfm) { + const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) return 0; + } + } else { + rd_stats->rate += skip_txfm_cost[0]; + mbmi->skip_txfm = 0; + } + + return 1; +} diff --git a/third_party/aom/av1/encoder/tx_search.h b/third_party/aom/av1/encoder/tx_search.h new file mode 100644 index 0000000000..ed95c1cd98 --- /dev/null +++ b/third_party/aom/av1/encoder/tx_search.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ +#define AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ + +#include "av1/common/pred_common.h" +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Set this macro as 1 to collect data about tx size selection. +#define COLLECT_TX_SIZE_DATA 0 + +#if COLLECT_TX_SIZE_DATA +static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; +#endif + +enum { + FTXS_NONE = 0, + FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0, + FTXS_DISABLE_TRELLIS_OPT = 1 << 1, + FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 +} UENUM1BYTE(FAST_TX_SEARCH_MODE); + +static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize, + TX_SIZE tx_size) { + assert(bsize == x->e_mbd.mi[0]->bsize); + if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT || + !block_signals_txsize(bsize)) + return 0; + + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(tx_size, bsize); + const MACROBLOCKD *const xd = &x->e_mbd; + const int tx_size_ctx = get_tx_size_context(xd); + return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth]; +} + +/*!\brief Compute the pixel domain distortion. + * + * \ingroup transform_search + * Compute the pixel domain distortion from diff on all visible 4x4s in the + * transform block. + * + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] plane Plane index + * \param[in] blk_row Block row index + * \param[in] blk_col Block col index + * \param[in] plane_bsize Current plane block size + * \param[in] tx_bsize Transform size + * \param[in] block_mse_q8 Block mse + * \return An int64_t value that is the block sse. + */ +int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + unsigned int *block_mse_q8); + +int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size); + +/*!\brief Transform type search for luma macroblock with fixed transform size. + * + * \ingroup transform_search + * Search for the best transform type and return the transform coefficients RD + * cost of current luma macroblock with the given uniform transform size. 
+ * + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] cpi Top-level encoder structure + * \param[in] rd_stats Pointer to struct to keep track of the RD stats + * \param[in] ref_best_rd Best RD cost seen for this block so far + * \param[in] bs Size of the current macroblock + * \param[in] tx_size The given transform size + * \param[in] ftxs_mode Transform search mode specifying desired speed + and quality tradeoff + * \param[in] skip_trellis Binary flag indicating if trellis optimization + should be skipped + * \return An int64_t value that is the best RD cost found. + */ +int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis); + +/*!\brief Recursive transform size and type search. + * + * \ingroup transform_search + * Search for best transform size and type for luma inter blocks. The transform + * block partitioning can be recursive resulting in non-uniform transform sizes. + * The best transform size and type, if found, will be saved in the MB_MODE_INFO + * structure, and the corresponding RD stats will be saved in rd_stats. + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] rd_stats Pointer to struct to keep track of the RD stats + * \param[in] bsize Current macroblock size + * \param[in] ref_best_rd Best RD cost seen for this block so far + * \remark Nothing is returned. The selected transform size and type will + be saved in the MB_MODE_INFO structure + */ +void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd); + +/*!\brief Uniform transform size and type search. + * + * \ingroup transform_search + * Search for the best transform size and type for current macroblock block, + * with the assumption that all the transform blocks have a uniform size + * (VP9 style). The selected transform size and type will be saved in the + * MB_MODE_INFO structure; the corresponding RD stats will be saved in rd_stats. + * This function may be used for both intra and inter predicted blocks. + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] rd_stats Pointer to struct to keep track of the RD stats + * \param[in] bs Current macroblock size + * \param[in] ref_best_rd Best RD cost seen for this block so far + * \remark Nothing is returned. The selected transform size and type will + be saved in the MB_MODE_INFO structure + */ +void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bs, + int64_t ref_best_rd); + +/*!\brief Chroma block transform search. + * + * \ingroup transform_search + * Calculate the transform coefficient RD cost for the given chroma macroblock + * If the current mode is intra, then this function will compute the predictor. + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] rd_stats Pointer to struct to keep track of the RD stats + * \param[in] bsize Current macroblock size + * \param[in] ref_best_rd Best RD cost seen for this block so far + * \return An integer value is returned. 
0: early termination triggered, + no valid rd cost available; 1: rd cost values are valid. + */ +int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t ref_best_rd); + +/*!\brief Transform type search with fixed transform size. + * + * \ingroup transform_search + * Search for the best transform type and calculate the transform coefficients + * RD cost of the current transform block with the specified (uniform) transform + * size and plane. The RD results will be saved in rd_stats. + * + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] cpi Top-level encoder structure + * \param[in] rd_stats Pointer to struct to keep track of the RD stats + * \param[in] ref_best_rd Best RD cost seen for this block so far + * \param[in] current_rd Current RD cost for this block so far + * \param[in] plane Plane index + * \param[in] plane_bsize Size of the current macroblock considering + sup-sampling + * \param[in] tx_size The given transform size + * \param[in] ftxs_mode Transform search mode specifying desired speed + and quality tradeoff + * \param[in] skip_trellis Binary flag indicating if trellis optimization + should be skipped + * + * \remark Nothing is returned. The RD results will be saved in rd_stats. + */ +void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, + RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t current_rd, int plane, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode, + int skip_trellis); + +/*!\brief Recursive transform size and type search. + * + * \ingroup transform_search + * This function combines y and uv planes' transform search processes together + * for inter-predicted blocks (including IntraBC), when the prediction is + * already generated. It first does subtraction to obtain the prediction error. + * Then it calls + * av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and + * av1_txfm_uvrd sequentially and handles possible early terminations. + * The RD metrics are calculated and stored in rd_stats/_y/_uv. + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] bsize Current macroblock size + * \param[in] rd_stats Pointer to struct to keep track of the overal RD + stats + * \param[in] rd_stats_y Pointer to struct to keep track of the RD + stats for the luma plane + * \param[in] rd_stats_uv Pointer to struct to keep track of the RD + stats for the chroma planes + * \param[in] mode_rate Rate cost to encode the prediction mode info. of + the current macroblock + * \param[in] ref_best_rd Best RD cost seen for this block so far + * + * \return An integer value is returned indicating if a valid transform + candidate is found (1) or not (0). + */ +int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ diff --git a/third_party/aom/av1/encoder/txb_rdopt.c b/third_party/aom/av1/encoder/txb_rdopt.c new file mode 100644 index 0000000000..e551e8aa12 --- /dev/null +++ b/third_party/aom/av1/encoder/txb_rdopt.c @@ -0,0 +1,659 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/txb_rdopt.h" +#include "av1/encoder/txb_rdopt_utils.h" + +#include "av1/common/idct.h" + +static INLINE void update_coeff_general( + int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, int bhl, int width, int64_t rdmult, int shift, + int dc_sign_ctx, const int16_t *dequant, const int16_t *scan, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, + const qm_val_t *iqmatrix, const qm_val_t *qmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int is_last = si == (eob - 1); + const int coeff_ctx = get_lower_levels_ctx_general( + is_last, si, bhl, width, levels, ci, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const int sign = (qc < 0) ? 1 : 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci); + const int rate = + get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bhl, tx_class, levels); + const int64_t rd = RDCOST(rdmult, rate, dist); + + tran_low_t qc_low, dqc_low; + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + if (abs_qc == 1) { + abs_qc_low = qc_low = dqc_low = 0; + dist_low = dist0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci); + rate_low = + get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bhl, tx_class, levels); + } + + rd_low = RDCOST(rdmult, rate_low, dist_low); + if (rd_low < rd) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + *accu_dist += dist_low - dist0; + } else { + *accu_rate += rate; + *accu_dist += dist - dist0; + } + } +} + +static AOM_FORCE_INLINE void update_coeff_simple( + int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class, + int bhl, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs, + const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, + uint8_t *levels, const qm_val_t *iqmatrix, const qm_val_t *qmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + (void)eob; + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(si != eob - 1); + assert(si > 0); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const tran_low_t 
abs_qc = abs(qc); + const tran_low_t abs_tqc = abs(tcoeff[ci]); + const tran_low_t abs_dqc = abs(dqcoeff[ci]); + int rate_low = 0; + const int rate = get_two_coeff_cost_simple( + ci, abs_qc, coeff_ctx, txb_costs, bhl, tx_class, levels, &rate_low); + if (abs_dqc < abs_tqc) { + *accu_rate += rate; + return; + } + + const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci); + const int64_t rd = RDCOST(rdmult, rate, dist); + + const tran_low_t abs_qc_low = abs_qc - 1; + const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + const int64_t dist_low = + get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci); + const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + + if (rd_low < rd) { + const int sign = (qc < 0) ? 1 : 0; + qcoeff[ci] = (-sign ^ abs_qc_low) + sign; + dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign; + levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + } else { + *accu_rate += rate; + } + } +} + +static AOM_FORCE_INLINE void update_coeff_eob( + int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci, + int si, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width, + int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness, + const qm_val_t *iqmatrix, const qm_val_t *qmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + assert(si != *eob - 1); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + int lower_level = 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int sign = (qc < 0) ? 
1 : 0; + const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci); + int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0; + int rate = + get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx, + txb_costs, bhl, tx_class, levels); + int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist); + + tran_low_t qc_low, dqc_low; + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + + if (abs_qc == 1) { + abs_qc_low = 0; + dqc_low = qc_low = 0; + dist_low = 0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist); + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0; + rate_low = + get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bhl, tx_class, levels); + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low); + } + + int lower_level_new_eob = 0; + const int new_eob = si + 1; + const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bhl, width, si); + const int new_eob_cost = + get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class); + int rate_coeff_eob = + new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob, + dc_sign_ctx, txb_costs, bhl, + tx_class); + int64_t dist_new_eob = dist; + int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob); + + if (abs_qc_low > 0) { + const int rate_coeff_eob_low = + new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign, + coeff_ctx_new_eob, dc_sign_ctx, + txb_costs, bhl, tx_class); + const int64_t dist_new_eob_low = dist_low; + const int64_t rd_new_eob_low = + RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low); + if (rd_new_eob_low < rd_new_eob) { + lower_level_new_eob = 1; + rd_new_eob = rd_new_eob_low; + rate_coeff_eob = rate_coeff_eob_low; + dist_new_eob = dist_new_eob_low; + } + } + + if (sharpness == 0 || abs_qc > 1) { + if (rd_low < rd) { + lower_level = 1; + rd = rd_low; + rate = rate_low; + dist = dist_low; + } + } + + if (sharpness == 0 && rd_new_eob < rd) { + for (int ni = 0; ni < *nz_num; ++ni) { + int last_ci = nz_ci[ni]; + levels[get_padded_idx(last_ci, bhl)] = 0; + qcoeff[last_ci] = 0; + dqcoeff[last_ci] = 0; + } + *eob = new_eob; + *nz_num = 0; + *accu_rate = rate_coeff_eob; + *accu_dist = dist_new_eob; + lower_level = lower_level_new_eob; + } else { + *accu_rate += rate; + *accu_dist += dist; + } + + if (lower_level) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX); + } + if (qcoeff[ci]) { + nz_ci[*nz_num] = ci; + ++*nz_num; + } + } +} + +static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob, + int nz_num, int *nz_ci, int64_t rdmult, + int skip_cost, int non_skip_cost, + tran_low_t *qcoeff, tran_low_t *dqcoeff) { + const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist); + const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0); + if (rd_new_eob < rd) { + for (int i = 0; i < nz_num; ++i) { + const int ci = nz_ci[i]; + qcoeff[ci] = 0; + dqcoeff[ci] = 0; + // no need to set up levels because this is the last step + // levels[get_padded_idx(ci, bhl)] = 0; + } + *accu_rate = 0; + *eob = 0; + } +} + +// TODO(angiebird): use this function whenever it's possible +static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd, + int plane, TX_SIZE tx_size, TX_TYPE tx_type, + int reduced_tx_set_used) { + if (plane > 0) return 0; + + 
const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 && + !xd->lossless[xd->mi[0]->segment_id]) { + const int ext_tx_set = + get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used); + if (is_inter) { + if (ext_tx_set > 0) + return x->mode_costs + .inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type]; + } else { + if (ext_tx_set > 0) { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; + return x->mode_costs.intra_tx_type_costs[ext_tx_set][square_tx_size] + [intra_dir][tx_type]; + } + } + } + return 0; +} + +int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness) { + MACROBLOCKD *xd = &x->e_mbd; + const struct macroblock_plane *p = &x->plane[plane]; + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + const int shift = av1_get_tx_scale(tx_size); + int eob = p->eobs[block]; + const int16_t *dequant = p->dequant_QTX; + const qm_val_t *iqmatrix = + av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type); + const qm_val_t *qmatrix = + cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR + ? av1_get_qmatrix(&cpi->common.quant_params, xd, plane, tx_size, + tx_type) + : NULL; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + tran_low_t *dqcoeff = p->dqcoeff + block_offset; + const tran_low_t *tcoeff = p->coeff + block_offset; + const CoeffCosts *coeff_costs = &x->coeff_costs; + + // This function is not called if eob = 0. 
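  // The greedy pass below visits coefficients from the last nonzero position
  // (eob - 1) back toward DC: update_coeff_general() handles that first
  // coefficient when its magnitude exceeds one (and, at the end, the DC
  // position), update_coeff_eob() additionally evaluates moving the end of
  // block up to the current position, update_skip() checks whether zeroing
  // the whole block is cheaper, and update_coeff_simple() covers the
  // remaining interior positions.  Each decision compares
  // RDCOST(rdmult, rate, dist) for keeping a quantized level against lowering
  // it by one (or dropping it), with the squared transform-domain error from
  // get_coeff_dist() as the distortion.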
+ assert(eob > 0); + + const AV1_COMMON *cm = &cpi->common; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int bhl = get_txb_bhl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + assert(height == (1 << bhl)); + const int is_inter = is_inter_block(mbmi); + const LV_MAP_COEFF_COST *txb_costs = + &coeff_costs->coeff_costs[txs_ctx][plane_type]; + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *txb_eob_costs = + &coeff_costs->eob_costs[eob_multi_size][plane_type]; + + const int rshift = 2; + + const int64_t rdmult = + (((int64_t)x->rdmult * + (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) + + 2) >> + rshift; + + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + + if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels); + + // TODO(angirbird): check iqmatrix + + const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0]; + const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class); + int accu_rate = eob_cost; + int64_t accu_dist = 0; + int si = eob - 1; + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const tran_low_t abs_qc = abs(qc); + const int sign = qc < 0; + const int max_nz_num = 2; + int nz_num = 1; + int nz_ci[3] = { ci, 0, 0 }; + if (abs_qc >= 2) { + update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class, + bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels, iqmatrix, qmatrix); + --si; + } else { + assert(abs_qc == 1); + const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, si); + accu_rate += + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx, + txb_costs, bhl, tx_class); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci); + accu_dist += dist - dist0; + --si; + } + +#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 0 && nz_num <= max_nz_num; --si) { \ + update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \ + tx_size, tx_class_literal, bhl, width, \ + txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \ + txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \ + levels, sharpness, iqmatrix, qmatrix); \ + } \ + break + switch (tx_class) { + UPDATE_COEFF_EOB_CASE(TX_CLASS_2D); + UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT); +#undef UPDATE_COEFF_EOB_CASE + default: assert(false); + } + + if (si == -1 && nz_num <= max_nz_num && sharpness == 0) { + update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost, + non_skip_cost, qcoeff, dqcoeff); + } + +#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 1; --si) { \ + update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bhl, \ + rdmult, shift, dequant, scan, txb_costs, tcoeff, \ + qcoeff, dqcoeff, levels, iqmatrix, qmatrix); \ + } \ + break + switch (tx_class) { + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT); +#undef 
UPDATE_COEFF_SIMPLE_CASE + default: assert(false); + } + + // DC position + if (si == 0) { + // no need to update accu_dist because it's not used after this point + int64_t dummy_dist = 0; + update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class, + bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels, iqmatrix, qmatrix); + } + + const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type, + cm->features.reduced_tx_set_used); + if (eob == 0) + accu_rate += skip_cost; + else + accu_rate += non_skip_cost + tx_type_cost; + + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]); + + *rate_cost = accu_rate; + return eob; +} + +static AOM_FORCE_INLINE int warehouse_efficients_txb( + const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, + const struct macroblock_plane *p, const int eob, + const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, + const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class, + int reduced_tx_set_used) { + const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); + const int txb_skip_ctx = txb_ctx->txb_skip_ctx; + const int bhl = get_txb_bhl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, height); + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *const eob_costs = + &x->coeff_costs.eob_costs[eob_multi_size][plane_type]; + int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + + av1_txb_init_levels(qcoeff, width, height, levels); + + cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); + + cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); + + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] = + coeff_costs->lps_cost; + int c = eob - 1; + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int sign = AOMSIGN(v); + const int level = (v ^ sign) - sign; + const int coeff_ctx = coeff_contexts[pos]; + cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1]; + + if (v) { + // sign bit cost + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx_eob(pos, bhl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + if (c) { + cost += av1_cost_literal(1); + } else { + const int sign01 = (sign ^ sign) - sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + return cost; + } + } + } + const int(*base_cost)[8] = coeff_costs->base_cost; + for (c = eob - 2; c >= 1; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const int level = abs(v); + cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; + if (v) { + // sign bit cost + cost += av1_cost_literal(1); + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bhl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + } + } + // c == 0 after previous loop + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const 
int coeff_ctx = coeff_contexts[pos]; + const int sign = AOMSIGN(v); + const int level = (v ^ sign) - sign; + cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; + + if (v) { + // sign bit cost + const int sign01 = (sign ^ sign) - sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bhl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + } + } + return cost; +} + +int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type) { + assert(plane == 0); + + int cost = 0; + const struct macroblock_plane *p = &x->plane[plane]; + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block); + + int eob = p->eobs[block]; + + // coeffs + int c = eob - 1; + // eob + { + const int pos = scan[c]; + const tran_low_t v = abs(qcoeff[pos]) - 1; + cost += (v << (AV1_PROB_COST_SHIFT + 2)); + } + // other coeffs + for (c = eob - 2; c >= 0; c--) { + const int pos = scan[c]; + const tran_low_t v = abs(qcoeff[pos]); + const int idx = AOMMIN(v, 14); + + cost += costLUT[idx]; + } + + // const_term does not contain DC, and log(e) does not contain eob, so both + // (eob-1) + cost += (const_term + loge_par) * (eob - 1); + + return cost; +} + +static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian( + const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob, + const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, + const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class, + int reduced_tx_set_used) { + const int txb_skip_ctx = txb_ctx->txb_skip_ctx; + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *const eob_costs = + &x->coeff_costs.eob_costs[eob_multi_size][plane_type]; + int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + + cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); + + cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); + + cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type); + return cost; +} + +int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { + const struct macroblock_plane *p = &x->plane[plane]; + const int eob = p->eobs[block]; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs.coeff_costs[txs_ctx][plane_type]; + if (eob == 0) { + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + } + + const MACROBLOCKD *const xd = &x->e_mbd; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + + return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob, + plane_type, coeff_costs, xd, tx_type, + tx_class, reduced_tx_set_used); +} + +int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, + const int reduced_tx_set_used, + const int adjust_eob) { + const struct macroblock_plane *p = &x->plane[plane]; + int eob = p->eobs[block]; + + if (adjust_eob) { + const SCAN_ORDER *scan_order = get_scan(tx_size, 
tx_type); + const int16_t *scan = scan_order->scan; + tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block); + tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan, + tcoeff, qcoeff, dqcoeff); + p->eobs[block] = eob; + } + + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs.coeff_costs[txs_ctx][plane_type]; + if (eob == 0) { + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + } + + const MACROBLOCKD *const xd = &x->e_mbd; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + + return warehouse_efficients_txb_laplacian( + x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd, + tx_type, tx_class, reduced_tx_set_used); +} diff --git a/third_party/aom/av1/encoder/txb_rdopt.h b/third_party/aom/av1/encoder/txb_rdopt.h new file mode 100644 index 0000000000..70b322a2e1 --- /dev/null +++ b/third_party/aom/av1/encoder/txb_rdopt.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TXB_RDOPT_H_ +#define AOM_AV1_ENCODER_TXB_RDOPT_H_ + +#include "av1/common/blockd.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Adjust the magnitude of quantized coefficients to achieve better + * rate-distortion (RD) trade-off. + * + * \ingroup coefficient_coding + * + * This function goes through each coefficient and greedily choose to lower + * the coefficient magnitude by 1 or not based on the RD score. + * + * The coefficients are processing in reversed scan order. + * + * Note that, the end of block position (eob) may change if the original last + * coefficient is lowered to zero. + * + * \param[in] cpi Top-level encoder structure + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] plane The index of the current plane + * \param[in] block The index of the current transform block in the + * \param[in] tx_size The transform size + * \param[in] tx_type The transform type + * \param[in] txb_ctx Context info for entropy coding transform block + * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). + * \param[out] rate_cost The entropy cost of coding the transform block + * after adjustment of coefficients. + * \param[in] sharpness When sharpness > 0, the function will be less + * aggressive towards lowering the magnitude of coefficients. + * In this way, the transform block will contain more high-frequency + * coefficients and therefore will preserve the sharpness of the reconstructed + * block. 
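 * \return int The updated end of block position (eob) after the
 * adjustment; the same value is also written back to p->eobs[block].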
+ */ +int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness); + +/*!\brief Compute the entropy cost of coding coefficients in a transform block. + * + * \ingroup coefficient_coding + * + * \param[in] x Pointer to structure holding the data for + the current encoding macroblock. + * \param[in] plane The index of the current plane. + * \param[in] block The index of the current transform block + in the + * macroblock. It's defined by number of 4x4 units that have been coded before + * the currernt transform block. + * \param[in] tx_size The transform size. + * \param[in] tx_type The transform type. + * \param[in] txb_ctx Context info for entropy coding transform + block + * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). + * \param[in] reduced_tx_set_used Whether the transform type is chosen from + * a reduced set. + */ +int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used); + +/*!\brief Estimate the entropy cost of coding a transform block using Laplacian + * distribution. + * + * \ingroup coefficient_coding + * + * This function compute the entropy costs of the end of block position (eob) + * and the transform type (tx_type) precisely. + * + * Then using \ref av1_cost_coeffs_txb_estimate to estimate the entropy costs + * of coefficients in the transform block. + * + * In the end, the function returns the sum of entropy costs of end of block + * position (eob), transform type (tx_type) and coefficients. + * + * Compared to \ref av1_cost_coeffs_txb, this function is much faster but less + * accurate. + * + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] plane The index of the current plane + * \param[in] block The index of the current transform block in the + * macroblock. It's defined by number of 4x4 units that have been coded before + * the currernt transform block + * \param[in] tx_size The transform size + * \param[in] tx_type The transform type + * \param[in] txb_ctx Context info for entropy coding transform block + * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). + * \param[in] reduced_tx_set_used Whether the transform type is chosen from + * a reduced set. + * \param[in] adjust_eob Whether to adjust the end of block position + (eob) + * or not. + * \return int Estimated entropy cost of coding the transform + block. + */ +int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, + const int reduced_tx_set_used, + const int adjust_eob); + +/*!\brief Estimate the entropy cost of transform coefficients using Laplacian + * distribution. + * + * \ingroup coefficient_coding + * + * This function assumes each transform coefficient is of its own Laplacian + * distribution and the coefficient is the only observation of the Laplacian + * distribution. + * + * Based on that, each coefficient's coding cost can be estimated by computing + * the entropy of the corresponding Laplacian distribution. + * + * This function then return the sum of the estimated entropy cost for all + * coefficients in the transform block. + * + * Note that the entropy cost of end of block (eob) and transform type (tx_type) + * are not included. 
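Concretely, the estimate implemented by av1_cost_coeffs_txb_estimate (see its definition earlier in txb_rdopt.c) charges the last nonzero coefficient (|q| - 1) << (AV1_PROB_COST_SHIFT + 2), charges every other coefficient the table value costLUT[min(|q|, 14)], and adds the per-coefficient constant const_term + loge_par for every position except the last. A minimal standalone sketch of that computation follows, assuming AV1_PROB_COST_SHIFT is 9 so costs are in 1/512-bit units (consistent with av1_cost_literal and the golomb_bits_cost table in txb_rdopt_utils.h); the helper name and the plain-int interface (|qcoeff| values in scan order) are illustrative only, not part of the library API.

    #include <assert.h>

    // Illustrative re-implementation of the Laplacian-based estimate.
    static int sketch_laplacian_txb_cost(const int *abs_q, int eob) {
      static const int kCostLUT[15] = { -1143, 53,   545,  825,  1031,
                                        1209,  1393, 1577, 1763, 1947,
                                        2132,  2317, 2501, 2686, 2871 };
      const int kConstTerm = 1 << 9;                       // const_term
      const int kLogePar = ((14427 << 9) + 5000) / 10000;  // loge_par, ~739
      assert(eob > 0);
      // Last nonzero coefficient: (|q| - 1) << (AV1_PROB_COST_SHIFT + 2).
      int cost = (abs_q[eob - 1] - 1) << (9 + 2);
      // Remaining coefficients: table lookup by clamped magnitude.
      for (int c = eob - 2; c >= 0; --c)
        cost += kCostLUT[abs_q[c] < 14 ? abs_q[c] : 14];
      // Per-coefficient constant for everything except the last coefficient.
      cost += (kConstTerm + kLogePar) * (eob - 1);
      return cost;
    }

For example, magnitudes {5, 2, 1} in scan order with eob = 3 cost 1209 + 545 + 0 + 2 * 1251 = 4256, roughly 8.3 bits.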
+ * + * \param[in] x Pointer to structure holding the data for the + current encoding macroblock + * \param[in] plane The index of the current plane + * \param[in] block The index of the current transform block in the + * macroblock. It's defined by number of 4x4 units that have been coded before + * the currernt transform block + * \param[in] tx_size The transform size + * \param[in] tx_type The transform type + * \return int Estimated entropy cost of coefficients in the + * transform block. + */ +int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_TXB_RDOPT_H_ diff --git a/third_party/aom/av1/encoder/txb_rdopt_utils.h b/third_party/aom/av1/encoder/txb_rdopt_utils.h new file mode 100644 index 0000000000..b9f08aacf0 --- /dev/null +++ b/third_party/aom/av1/encoder/txb_rdopt_utils.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_ +#define AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_ + +#include "av1/encoder/encodetxb.h" + +static const int golomb_bits_cost[32] = { + 0, 512, 512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5, + 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, + 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, + 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9 +}; + +static const int golomb_cost_diff[32] = { + 0, 512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0, + 512 * 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +// Look up table of individual cost of coefficient by its quantization level. +// determined based on Laplacian distribution conditioned on estimated context +static const int costLUT[15] = { -1143, 53, 545, 825, 1031, + 1209, 1393, 1577, 1763, 1947, + 2132, 2317, 2501, 2686, 2871 }; + +static const int const_term = (1 << AV1_PROB_COST_SHIFT); + +static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000; + +static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, + const qm_val_t *iqmatrix) { + int dqv = dequant[!!coeff_idx]; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return dqv; +} + +static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, + int shift, const qm_val_t *qmatrix, + int coeff_idx) { + int64_t diff = (tcoeff - dqcoeff) * (1 << shift); + if (qmatrix == NULL) { + return diff * diff; + } + // When AOM_DIST_METRIC_QM_PSNR is enabled, this mirrors the rate-distortion + // computation done in av1_block_error_qm, improving visual quality. + // The maximum value of `shift` is 2, `tcoeff` and `dqcoeff` are at most 22 + // bits, and AOM_QM_BITS is 5, so `diff` should fit in 29-bits. The + // multiplication `diff * diff` then does not risk overflowing. 
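  // Worked example of the weighted error below, using AOM_QM_BITS == 5 as
  // noted above: tcoeff = 10, dqcoeff = 8 and shift = 1 give diff = 4; a flat
  // qmatrix entry of 1 << AOM_QM_BITS = 32 turns diff into 128, and
  // error = (128 * 128 + 512) >> 10 = 16 matches the unweighted 4 * 4 = 16.
  // A flat matrix therefore leaves the distortion unchanged, while other
  // entries scale it by roughly (weight / 32)^2.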
+ diff *= qmatrix[coeff_idx]; + const int64_t error = + (diff * diff + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS); + return error; +} + +static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) { + int eob_extra; + const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra); + int eob_cost = 0; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1]; + + if (av1_eob_offset_bits[eob_pt] > 0) { + const int eob_ctx = eob_pt - 3; + const int eob_shift = av1_eob_offset_bits[eob_pt] - 1; + const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit]; + const int offset_bits = av1_eob_offset_bits[eob_pt]; + if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1); + } + return eob_cost; +} + +static INLINE int get_golomb_cost(int abs_qc) { + if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + const int length = get_msb(r) + 1; + return av1_cost_literal(2 * length - 1); + } + return 0; +} + +static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + return coeff_lps[base_range] + get_golomb_cost(level); +} + +static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps, + int *diff) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + int golomb_bits = 0; + if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) + *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1]; + + if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) { + int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + if (r < 32) { + golomb_bits = golomb_bits_cost[r]; + *diff += golomb_cost_diff[r]; + } else { + golomb_bits = get_golomb_cost(level); + *diff += (r & (r - 1)) == 0 ? 
1024 : 0; + } + } + + return coeff_lps[base_range] + golomb_bits; +} + +static AOM_FORCE_INLINE int get_two_coeff_cost_simple( + int ci, tran_low_t abs_qc, int coeff_ctx, + const LV_MAP_COEFF_COST *txb_costs, int bhl, TX_CLASS tx_class, + const uint8_t *levels, int *cost_low) { + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(ci > 0); + int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + int diff = 0; + if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4]; + if (abs_qc) { + cost += av1_cost_literal(1); + if (abs_qc > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, ci, bhl, tx_class); + int brcost_diff = 0; + cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx], + &brcost_diff); + diff += brcost_diff; + } + } + *cost_low = cost - diff; + + return cost; +} + +static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign, + int coeff_ctx, int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bhl, TX_CLASS tx_class) { + int cost = 0; + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + int br_ctx; + br_ctx = get_br_ctx_eob(ci, bhl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); + } + } + return cost; +} + +static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, + int sign, int coeff_ctx, + int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bhl, TX_CLASS tx_class, + const uint8_t *levels) { + int cost = 0; + if (is_last) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + } + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + int br_ctx; + if (is_last) + br_ctx = get_br_ctx_eob(ci, bhl, tx_class); + else + br_ctx = get_br_ctx(levels, ci, bhl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); + } + } + return cost; +} + +static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, + int shift, tran_low_t *qc_low, + tran_low_t *dqc_low) { + tran_low_t abs_qc_low = abs_qc - 1; + *qc_low = (-sign ^ abs_qc_low) + sign; + assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low); + tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + *dqc_low = (-sign ^ abs_dqc_low) + sign; + assert((sign ? 
-abs_dqc_low : abs_dqc_low) == *dqc_low); +} + +static INLINE void update_coeff_eob_fast(int *eob, int shift, + const int16_t *dequant_ptr, + const int16_t *scan, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + // TODO(sarahparker) make this work for aomqm + int eob_out = *eob; + int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7), + dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) }; + + for (int i = *eob - 1; i >= 0; i--) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) { + eob_out--; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } else { + break; + } + } + + *eob = eob_out; +} +#endif // AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_ diff --git a/third_party/aom/av1/encoder/var_based_part.c b/third_party/aom/av1/encoder/var_based_part.c new file mode 100644 index 0000000000..f664795153 --- /dev/null +++ b/third_party/aom/av1/encoder/var_based_part.c @@ -0,0 +1,1914 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_ports/mem.h" +#include "aom_ports/aom_timer.h" + +#include "av1/common/reconinter.h" +#include "av1/common/blockd.h" + +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/var_based_part.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/rdopt_utils.h" + +// Possible values for the force_split variable while evaluating variance based +// partitioning. 
+enum { + // Evaluate all partition types + PART_EVAL_ALL = 0, + // Force PARTITION_SPLIT + PART_EVAL_ONLY_SPLIT = 1, + // Force PARTITION_NONE + PART_EVAL_ONLY_NONE = 2 +} UENUM1BYTE(PART_EVAL_STATUS); + +typedef struct { + VPVariance *part_variances; + VPartVar *split[4]; +} variance_node; + +static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize, + variance_node *node) { + node->part_variances = NULL; + switch (bsize) { + case BLOCK_128X128: { + VP128x128 *vt = (VP128x128 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + case BLOCK_64X64: { + VP64x64 *vt = (VP64x64 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + case BLOCK_32X32: { + VP32x32 *vt = (VP32x32 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + case BLOCK_16X16: { + VP16x16 *vt = (VP16x16 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + case BLOCK_8X8: { + VP8x8 *vt = (VP8x8 *)data; + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx].part_variances.none; + break; + } + default: { + VP4x4 *vt = (VP4x4 *)data; + assert(bsize == BLOCK_4X4); + node->part_variances = &vt->part_variances; + for (int split_idx = 0; split_idx < 4; split_idx++) + node->split[split_idx] = &vt->split[split_idx]; + break; + } + } +} + +// Set variance values given sum square error, sum error, count. 
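The two helpers below keep only a running (sum of squares, sum, log2 count) triple per tree node and recover a scaled variance on demand, as 256 times the population variance of the accumulated samples. A minimal standalone restatement of get_variance()'s arithmetic (the helper name is illustrative, not part of the encoder):

    // 256 * population variance of count = 1 << log2_count samples,
    // reconstructed from their sum and sum of squares.
    static unsigned int sketch_scaled_variance(unsigned int sum_square_error,
                                               int sum_error, int log2_count) {
      const long long mean_sq =
          ((long long)sum_error * sum_error) >> log2_count;  // count * mean^2
      return (unsigned int)((256 * (sum_square_error - mean_sq)) >> log2_count);
    }

For example, sum_square_error = 1000, sum_error = 40 and log2_count = 4 (16 samples) give 256 * (1000 - 100) / 16 = 14400.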
+static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c, + VPartVar *v) { + v->sum_square_error = s2; + v->sum_error = s; + v->log2_count = c; +} + +static AOM_INLINE void get_variance(VPartVar *v) { + v->variance = + (int)(256 * (v->sum_square_error - + (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> + v->log2_count)) >> + v->log2_count); +} + +static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b, + VPartVar *r) { + assert(a->log2_count == b->log2_count); + fill_variance(a->sum_square_error + b->sum_square_error, + a->sum_error + b->sum_error, a->log2_count + 1, r); +} + +static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) { + variance_node node; + memset(&node, 0, sizeof(node)); + tree_to_node(data, bsize, &node); + sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]); + sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]); + sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]); + sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]); + sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1], + &node.part_variances->none); +} + +static AOM_INLINE void set_block_size(AV1_COMP *const cpi, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + if (cpi->common.mi_params.mi_cols > mi_col && + cpi->common.mi_params.mi_rows > mi_row) { + CommonModeInfoParams *mi_params = &cpi->common.mi_params; + const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col); + MB_MODE_INFO *mi = mi_params->mi_grid_base[mi_grid_idx] = + &mi_params->mi_alloc[mi_alloc_idx]; + mi->bsize = bsize; + } +} + +static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, void *data, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int64_t threshold, BLOCK_SIZE bsize_min, + PART_EVAL_STATUS force_split) { + AV1_COMMON *const cm = &cpi->common; + variance_node vt; + const int block_width = mi_size_wide[bsize]; + const int block_height = mi_size_high[bsize]; + int bs_width_check = block_width; + int bs_height_check = block_height; + int bs_width_vert_check = block_width >> 1; + int bs_height_horiz_check = block_height >> 1; + // On the right and bottom boundary we only need to check + // if half the bsize fits, because boundary is extended + // up to 64. So do this check only for sb_size = 64X64. + if (cm->seq_params->sb_size == BLOCK_64X64) { + if (tile->mi_col_end == cm->mi_params.mi_cols) { + bs_width_check = (block_width >> 1) + 1; + bs_width_vert_check = (block_width >> 2) + 1; + } + if (tile->mi_row_end == cm->mi_params.mi_rows) { + bs_height_check = (block_height >> 1) + 1; + bs_height_horiz_check = (block_height >> 2) + 1; + } + } + + assert(block_height == block_width); + tree_to_node(data, bsize, &vt); + + if (mi_col + bs_width_check <= tile->mi_col_end && + mi_row + bs_height_check <= tile->mi_row_end && + force_split == PART_EVAL_ONLY_NONE) { + set_block_size(cpi, mi_row, mi_col, bsize); + return 1; + } + if (force_split == PART_EVAL_ONLY_SPLIT) return 0; + + // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if + // variance is below threshold, otherwise split will be selected. + // No check for vert/horiz split as too few samples for variance. + if (bsize == bsize_min) { + // Variance already computed to set the force_split. 
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + if (mi_col + bs_width_check <= tile->mi_col_end && + mi_row + bs_height_check <= tile->mi_row_end && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, mi_row, mi_col, bsize); + return 1; + } + return 0; + } else if (bsize > bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + // For key frame: take split for bsize above 32X32 or very high variance. + if (frame_is_intra_only(cm) && + (bsize > BLOCK_32X32 || + vt.part_variances->none.variance > (threshold << 4))) { + return 0; + } + // If variance is low, take the bsize (no split). + if (mi_col + bs_width_check <= tile->mi_col_end && + mi_row + bs_height_check <= tile->mi_row_end && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, mi_row, mi_col, bsize); + return 1; + } + // Check vertical split. + if (mi_row + bs_height_check <= tile->mi_row_end && + mi_col + bs_width_vert_check <= tile->mi_col_end) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT); + BLOCK_SIZE plane_bsize = + get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); + get_variance(&vt.part_variances->vert[0]); + get_variance(&vt.part_variances->vert[1]); + if (vt.part_variances->vert[0].variance < threshold && + vt.part_variances->vert[1].variance < threshold && + plane_bsize < BLOCK_INVALID) { + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize); + return 1; + } + } + // Check horizontal split. + if (mi_col + bs_width_check <= tile->mi_col_end && + mi_row + bs_height_horiz_check <= tile->mi_row_end) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + BLOCK_SIZE plane_bsize = + get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x, + xd->plane[AOM_PLANE_U].subsampling_y); + get_variance(&vt.part_variances->horz[0]); + get_variance(&vt.part_variances->horz[1]); + if (vt.part_variances->horz[0].variance < threshold && + vt.part_variances->horz[1].variance < threshold && + plane_bsize < BLOCK_INVALID) { + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize); + return 1; + } + } + return 0; + } + return 0; +} + +static AOM_INLINE int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide, + int pixels_high) { + int all_inside = 1; + for (int idx = 0; idx < 4; idx++) { + all_inside &= ((x16_idx + GET_BLK_IDX_X(idx, 3)) < pixels_wide); + all_inside &= ((y16_idx + GET_BLK_IDX_Y(idx, 3)) < pixels_high); + } + return all_inside; +} + +#if CONFIG_AV1_HIGHBITDEPTH +// TODO(yunqingwang): Perform average of four 8x8 blocks similar to lowbd +static AOM_INLINE void fill_variance_8x8avg_highbd( + const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, + int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide, + int pixels_high) { + for (int idx = 0; idx < 4; idx++) { + const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3); + const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3); + unsigned int sse = 0; + int sum = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + int src_avg = aom_highbd_avg_8x8(src_buf + y8_idx * src_stride + x8_idx, + src_stride); + int dst_avg = aom_highbd_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx, + dst_stride); + + sum = src_avg - dst_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, 
&vst->split[idx].part_variances.none); + } +} +#endif + +static AOM_INLINE void fill_variance_8x8avg_lowbd( + const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, + int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide, + int pixels_high) { + unsigned int sse[4] = { 0 }; + int sum[4] = { 0 }; + + if (all_blks_inside(x16_idx, y16_idx, pixels_wide, pixels_high)) { + int src_avg[4]; + int dst_avg[4]; + aom_avg_8x8_quad(src_buf, src_stride, x16_idx, y16_idx, src_avg); + aom_avg_8x8_quad(dst_buf, dst_stride, x16_idx, y16_idx, dst_avg); + for (int idx = 0; idx < 4; idx++) { + sum[idx] = src_avg[idx] - dst_avg[idx]; + sse[idx] = sum[idx] * sum[idx]; + } + } else { + for (int idx = 0; idx < 4; idx++) { + const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3); + const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3); + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + int src_avg = + aom_avg_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride); + int dst_avg = + aom_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx, dst_stride); + sum[idx] = src_avg - dst_avg; + sse[idx] = sum[idx] * sum[idx]; + } + } + } + + for (int idx = 0; idx < 4; idx++) { + fill_variance(sse[idx], sum[idx], 0, &vst->split[idx].part_variances.none); + } +} + +// Obtain parameters required to calculate variance (such as sum, sse, etc,.) +// at 8x8 sub-block level for a given 16x16 block. +// The function can be called only when is_key_frame is false since sum is +// computed between source and reference frames. +static AOM_INLINE void fill_variance_8x8avg( + const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf, + int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int highbd_flag, + int pixels_wide, int pixels_high) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag) { + fill_variance_8x8avg_highbd(src_buf, src_stride, dst_buf, dst_stride, + x16_idx, y16_idx, vst, pixels_wide, + pixels_high); + return; + } +#else + (void)highbd_flag; +#endif // CONFIG_AV1_HIGHBITDEPTH + fill_variance_8x8avg_lowbd(src_buf, src_stride, dst_buf, dst_stride, x16_idx, + y16_idx, vst, pixels_wide, pixels_high); +} + +static int compute_minmax_8x8(const uint8_t *src_buf, int src_stride, + const uint8_t *dst_buf, int dst_stride, + int x16_idx, int y16_idx, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high) { + int minmax_max = 0; + int minmax_min = 255; + // Loop over the 4 8x8 subblocks. + for (int idx = 0; idx < 4; idx++) { + const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3); + const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3); + int min = 0; + int max = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_minmax_8x8( + src_buf + y8_idx * src_stride + x8_idx, src_stride, + dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, &max); + } else { + aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride, + dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, + &max); + } +#else + aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride, + dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, + &max); +#endif + if ((max - min) > minmax_max) minmax_max = (max - min); + if ((max - min) < minmax_min) minmax_min = (max - min); + } + } + return (minmax_max - minmax_min); +} + +// Function to compute average and variance of 4x4 sub-block. +// The function can be called only when is_key_frame is true since sum is +// computed using source frame only. 
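// Because get_variance() subtracts the mean, the constant dst_avg of 128 used
// below cancels out: on key frames the tree effectively measures the spatial
// variance of 4x4 source averages, whereas the 8x8 path above measures the
// variance of the source-minus-reference difference.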
+static AOM_INLINE void fill_variance_4x4avg(const uint8_t *src_buf, + int src_stride, int x8_idx, + int y8_idx, VP8x8 *vst, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high, + int border_offset_4x4) { + for (int idx = 0; idx < 4; idx++) { + const int x4_idx = x8_idx + GET_BLK_IDX_X(idx, 2); + const int y4_idx = y8_idx + GET_BLK_IDX_Y(idx, 2); + unsigned int sse = 0; + int sum = 0; + if (x4_idx < pixels_wide - border_offset_4x4 && + y4_idx < pixels_high - border_offset_4x4) { + int src_avg; + int dst_avg = 128; +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + src_avg = aom_highbd_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, + src_stride); + } else { + src_avg = + aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride); + } +#else + src_avg = aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride); +#endif + + sum = src_avg - dst_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[idx].part_variances.none); + } +} + +// TODO(kyslov) Bring back threshold adjustment based on content state +static int64_t scale_part_thresh_content(int64_t threshold_base, int speed, + int width, int height, + int non_reference_frame) { + (void)width; + (void)height; + int64_t threshold = threshold_base; + if (non_reference_frame) threshold = (3 * threshold) >> 1; + if (speed >= 8) { + return (5 * threshold) >> 2; + } + return threshold; +} + +// Tune thresholds less or more aggressively to prefer larger partitions +static AOM_INLINE void tune_thresh_based_on_qindex( + AV1_COMP *cpi, int64_t thresholds[], uint64_t block_sad, int current_qindex, + int num_pixels, bool is_segment_id_boosted, int source_sad_nonrd, + int lighting_change) { + double weight; + if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 3) { + const int win = 20; + if (current_qindex < QINDEX_LARGE_BLOCK_THR - win) + weight = 1.0; + else if (current_qindex > QINDEX_LARGE_BLOCK_THR + win) + weight = 0.0; + else + weight = + 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + win) / (2 * win); + if (num_pixels > RESOLUTION_480P) { + for (int i = 0; i < 4; i++) { + thresholds[i] <<= 1; + } + } + if (num_pixels <= RESOLUTION_288P) { + thresholds[3] = INT64_MAX; + if (is_segment_id_boosted == false) { + thresholds[1] <<= 2; + thresholds[2] <<= (source_sad_nonrd <= kLowSad) ? 5 : 4; + } else { + thresholds[1] <<= 1; + thresholds[2] <<= 3; + } + // Allow for split to 8x8 for superblocks where part of it has + // moving boundary. So allow for sb with source_sad above threshold, + // and avoid very large source_sad or high source content, to avoid + // too many 8x8 within superblock. + uint64_t avg_source_sad_thresh = 25000; + uint64_t block_sad_low = 25000; + uint64_t block_sad_high = 50000; + if (cpi->svc.temporal_layer_id == 0 && + cpi->svc.number_temporal_layers > 1) { + // Increase the sad thresholds for base TL0, as reference/LAST is + // 2/4 frames behind (for 2/3 #TL). + avg_source_sad_thresh = 40000; + block_sad_high = 70000; + } + if (is_segment_id_boosted == false && + cpi->rc.avg_source_sad < avg_source_sad_thresh && + block_sad > block_sad_low && block_sad < block_sad_high && + !lighting_change) { + thresholds[2] = (3 * thresholds[2]) >> 2; + thresholds[3] = thresholds[2] << 3; + } + // Condition the increase of partition thresholds on the segment + // and the content. 
Avoid the increase for superblocks which have + // high source sad, unless the whole frame has very high motion + // (i.e, cpi->rc.avg_source_sad is very large, in which case all blocks + // have high source sad). + } else if (num_pixels > RESOLUTION_480P && is_segment_id_boosted == false && + (source_sad_nonrd != kHighSad || + cpi->rc.avg_source_sad > 50000)) { + thresholds[0] = (3 * thresholds[0]) >> 1; + thresholds[3] = INT64_MAX; + if (current_qindex > QINDEX_LARGE_BLOCK_THR) { + thresholds[1] = + (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]); + thresholds[2] = + (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]); + } + } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && + is_segment_id_boosted == false && + (source_sad_nonrd != kHighSad || + cpi->rc.avg_source_sad > 50000)) { + thresholds[1] = + (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]); + thresholds[2] = + (int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]); + thresholds[3] = INT64_MAX; + } + } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 2) { + thresholds[1] <<= (source_sad_nonrd <= kLowSad) ? 2 : 0; + thresholds[2] = + (source_sad_nonrd <= kLowSad) ? (3 * thresholds[2]) : thresholds[2]; + } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 1) { + const int fac = (source_sad_nonrd <= kLowSad) ? 2 : 1; + if (current_qindex < QINDEX_LARGE_BLOCK_THR - 45) + weight = 1.0; + else if (current_qindex > QINDEX_LARGE_BLOCK_THR + 45) + weight = 0.0; + else + weight = 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + 45) / (2 * 45); + thresholds[1] = + (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]); + thresholds[2] = + (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]); + thresholds[3] = + (int)((1 - weight) * (thresholds[3] << fac) + weight * thresholds[3]); + } + if (cpi->sf.part_sf.disable_8x8_part_based_on_qidx && (current_qindex < 128)) + thresholds[3] = INT64_MAX; +} + +static void set_vbp_thresholds_key_frame(AV1_COMP *cpi, int64_t thresholds[], + int64_t threshold_base, + int threshold_left_shift, + int num_pixels) { + if (cpi->sf.rt_sf.force_large_partition_blocks_intra) { + const int shift_steps = + threshold_left_shift - (cpi->oxcf.mode == ALLINTRA ? 7 : 8); + assert(shift_steps >= 0); + threshold_base <<= shift_steps; + } + thresholds[0] = threshold_base; + thresholds[1] = threshold_base; + if (num_pixels < RESOLUTION_720P) { + thresholds[2] = threshold_base / 3; + thresholds[3] = threshold_base >> 1; + } else { + int shift_val = 2; + if (cpi->sf.rt_sf.force_large_partition_blocks_intra) { + shift_val = 0; + } + + thresholds[2] = threshold_base >> shift_val; + thresholds[3] = threshold_base >> shift_val; + } + thresholds[4] = threshold_base << 2; +} + +static AOM_INLINE void tune_thresh_based_on_resolution( + AV1_COMP *cpi, int64_t thresholds[], int64_t threshold_base, + int current_qindex, int source_sad_rd, int num_pixels) { + if (num_pixels >= RESOLUTION_720P) thresholds[3] = thresholds[3] << 1; + if (num_pixels <= RESOLUTION_288P) { + const int qindex_thr[5][2] = { + { 200, 220 }, { 140, 170 }, { 120, 150 }, { 200, 210 }, { 170, 220 }, + }; + int th_idx = 0; + if (cpi->sf.rt_sf.var_part_based_on_qidx >= 1) + th_idx = + (source_sad_rd <= kLowSad) ? 
cpi->sf.rt_sf.var_part_based_on_qidx : 0; + if (cpi->sf.rt_sf.var_part_based_on_qidx >= 3) + th_idx = cpi->sf.rt_sf.var_part_based_on_qidx; + const int qindex_low_thr = qindex_thr[th_idx][0]; + const int qindex_high_thr = qindex_thr[th_idx][1]; + if (current_qindex >= qindex_high_thr) { + threshold_base = (5 * threshold_base) >> 1; + thresholds[1] = threshold_base >> 3; + thresholds[2] = threshold_base << 2; + thresholds[3] = threshold_base << 5; + } else if (current_qindex < qindex_low_thr) { + thresholds[1] = threshold_base >> 3; + thresholds[2] = threshold_base >> 1; + thresholds[3] = threshold_base << 3; + } else { + int64_t qi_diff_low = current_qindex - qindex_low_thr; + int64_t qi_diff_high = qindex_high_thr - current_qindex; + int64_t threshold_diff = qindex_high_thr - qindex_low_thr; + int64_t threshold_base_high = (5 * threshold_base) >> 1; + + threshold_diff = threshold_diff > 0 ? threshold_diff : 1; + threshold_base = + (qi_diff_low * threshold_base_high + qi_diff_high * threshold_base) / + threshold_diff; + thresholds[1] = threshold_base >> 3; + thresholds[2] = ((qi_diff_low * threshold_base) + + qi_diff_high * (threshold_base >> 1)) / + threshold_diff; + thresholds[3] = ((qi_diff_low * (threshold_base << 5)) + + qi_diff_high * (threshold_base << 3)) / + threshold_diff; + } + } else if (num_pixels < RESOLUTION_720P) { + thresholds[2] = (5 * threshold_base) >> 2; + } else if (num_pixels < RESOLUTION_1080P) { + thresholds[2] = threshold_base << 1; + } else { + // num_pixels >= RESOLUTION_1080P + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + if (num_pixels < RESOLUTION_1440P) { + thresholds[2] = (5 * threshold_base) >> 1; + } else { + thresholds[2] = (7 * threshold_base) >> 1; + } + } else { + if (cpi->oxcf.speed > 7) { + thresholds[2] = 6 * threshold_base; + } else { + thresholds[2] = 3 * threshold_base; + } + } + } +} + +// Increase partition thresholds for noisy content. Apply it only for +// superblocks where sumdiff is low, as we assume the sumdiff of superblock +// whose only change is due to noise will be low (i.e, noise will average +// out over large block). +static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi, + int64_t threshold_base, + int content_lowsumdiff, + int num_pixels) { + AV1_COMMON *const cm = &cpi->common; + int64_t updated_thresh_base = threshold_base; + if (cpi->noise_estimate.enabled && content_lowsumdiff && + num_pixels > RESOLUTION_480P && cm->current_frame.frame_number > 60) { + NOISE_LEVEL noise_level = + av1_noise_estimate_extract_level(&cpi->noise_estimate); + if (noise_level == kHigh) + updated_thresh_base = (5 * updated_thresh_base) >> 1; + else if (noise_level == kMedium && + !cpi->sf.rt_sf.prefer_large_partition_blocks) + updated_thresh_base = (5 * updated_thresh_base) >> 2; + } + // TODO(kyslov) Enable var based partition adjusment on temporal denoising +#if 0 // CONFIG_AV1_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow) + updated_thresh_base = + av1_scale_part_thresh(updated_thresh_base, cpi->denoiser.denoising_level, + content_state, cpi->svc.temporal_layer_id); + else + threshold_base = + scale_part_thresh_content(updated_thresh_base, cpi->oxcf.speed, cm->width, + cm->height, cpi->ppi->rtc_ref.non_reference_frame); +#else + // Increase base variance threshold based on content_state/sum_diff level. 
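  // scale_part_thresh_content() then applies a further multiplicative bump:
  // a non-reference frame scales the threshold by 3/2, and speed >= 8 adds
  // another 5/4 factor, e.g. a base of 1000 becomes 1500 and then 1875.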
+ updated_thresh_base = scale_part_thresh_content( + updated_thresh_base, cpi->oxcf.speed, cm->width, cm->height, + cpi->ppi->rtc_ref.non_reference_frame); +#endif + return updated_thresh_base; +} + +static AOM_INLINE void set_vbp_thresholds( + AV1_COMP *cpi, int64_t thresholds[], uint64_t blk_sad, int qindex, + int content_lowsumdiff, int source_sad_nonrd, int source_sad_rd, + bool is_segment_id_boosted, int lighting_change) { + AV1_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + const int threshold_multiplier = is_key_frame ? 120 : 1; + const int ac_q = av1_ac_quant_QTX(qindex, 0, cm->seq_params->bit_depth); + int64_t threshold_base = (int64_t)(threshold_multiplier * ac_q); + const int current_qindex = cm->quant_params.base_qindex; + const int threshold_left_shift = cpi->sf.rt_sf.var_part_split_threshold_shift; + const int num_pixels = cm->width * cm->height; + + if (is_key_frame) { + set_vbp_thresholds_key_frame(cpi, thresholds, threshold_base, + threshold_left_shift, num_pixels); + return; + } + + threshold_base = tune_thresh_noisy_content(cpi, threshold_base, + content_lowsumdiff, num_pixels); + thresholds[0] = threshold_base >> 1; + thresholds[1] = threshold_base; + thresholds[3] = threshold_base << threshold_left_shift; + + tune_thresh_based_on_resolution(cpi, thresholds, threshold_base, + current_qindex, source_sad_rd, num_pixels); + + tune_thresh_based_on_qindex(cpi, thresholds, blk_sad, current_qindex, + num_pixels, is_segment_id_boosted, + source_sad_nonrd, lighting_change); +} + +// Set temporal variance low flag for superblock 64x64. +// Only first 25 in the array are used in this case. +static AOM_INLINE void set_low_temp_var_flag_64x64( + CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info, + MACROBLOCKD *xd, VP64x64 *vt, const int64_t thresholds[], int mi_col, + int mi_row) { + if (xd->mi[0]->bsize == BLOCK_64X64) { + if ((vt->part_variances).none.variance < (thresholds[0] >> 1)) + part_info->variance_low[0] = 1; + } else if (xd->mi[0]->bsize == BLOCK_64X32) { + for (int part_idx = 0; part_idx < 2; part_idx++) { + if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2)) + part_info->variance_low[part_idx + 1] = 1; + } + } else if (xd->mi[0]->bsize == BLOCK_32X64) { + for (int part_idx = 0; part_idx < 2; part_idx++) { + if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2)) + part_info->variance_low[part_idx + 3] = 1; + } + } else { + static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; + for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { + const int idx_str = mi_params->mi_stride * (mi_row + idx[lvl1_idx][0]) + + mi_col + idx[lvl1_idx][1]; + MB_MODE_INFO **this_mi = mi_params->mi_grid_base + idx_str; + + if (mi_params->mi_cols <= mi_col + idx[lvl1_idx][1] || + mi_params->mi_rows <= mi_row + idx[lvl1_idx][0]) + continue; + + if (*this_mi == NULL) continue; + + if ((*this_mi)->bsize == BLOCK_32X32) { + int64_t threshold_32x32 = (5 * thresholds[1]) >> 3; + if (vt->split[lvl1_idx].part_variances.none.variance < threshold_32x32) + part_info->variance_low[lvl1_idx + 5] = 1; + } else { + // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block + // inside. 
+ if ((*this_mi)->bsize == BLOCK_16X16 || + (*this_mi)->bsize == BLOCK_32X16 || + (*this_mi)->bsize == BLOCK_16X32) { + for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { + if (vt->split[lvl1_idx] + .split[lvl2_idx] + .part_variances.none.variance < (thresholds[2] >> 8)) + part_info->variance_low[(lvl1_idx << 2) + lvl2_idx + 9] = 1; + } + } + } + } + } +} + +static AOM_INLINE void set_low_temp_var_flag_128x128( + CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info, + MACROBLOCKD *xd, VP128x128 *vt, const int64_t thresholds[], int mi_col, + int mi_row) { + if (xd->mi[0]->bsize == BLOCK_128X128) { + if (vt->part_variances.none.variance < (thresholds[0] >> 1)) + part_info->variance_low[0] = 1; + } else if (xd->mi[0]->bsize == BLOCK_128X64) { + for (int part_idx = 0; part_idx < 2; part_idx++) { + if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2)) + part_info->variance_low[part_idx + 1] = 1; + } + } else if (xd->mi[0]->bsize == BLOCK_64X128) { + for (int part_idx = 0; part_idx < 2; part_idx++) { + if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2)) + part_info->variance_low[part_idx + 3] = 1; + } + } else { + static const int idx64[4][2] = { + { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 } + }; + static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; + for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { + const int idx_str = mi_params->mi_stride * (mi_row + idx64[lvl1_idx][0]) + + mi_col + idx64[lvl1_idx][1]; + MB_MODE_INFO **mi_64 = mi_params->mi_grid_base + idx_str; + if (*mi_64 == NULL) continue; + if (mi_params->mi_cols <= mi_col + idx64[lvl1_idx][1] || + mi_params->mi_rows <= mi_row + idx64[lvl1_idx][0]) + continue; + const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3; + if ((*mi_64)->bsize == BLOCK_64X64) { + if (vt->split[lvl1_idx].part_variances.none.variance < threshold_64x64) + part_info->variance_low[5 + lvl1_idx] = 1; + } else if ((*mi_64)->bsize == BLOCK_64X32) { + for (int part_idx = 0; part_idx < 2; part_idx++) + if (vt->split[lvl1_idx].part_variances.horz[part_idx].variance < + (threshold_64x64 >> 1)) + part_info->variance_low[9 + (lvl1_idx << 1) + part_idx] = 1; + } else if ((*mi_64)->bsize == BLOCK_32X64) { + for (int part_idx = 0; part_idx < 2; part_idx++) + if (vt->split[lvl1_idx].part_variances.vert[part_idx].variance < + (threshold_64x64 >> 1)) + part_info->variance_low[17 + (lvl1_idx << 1) + part_idx] = 1; + } else { + for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { + const int idx_str1 = + mi_params->mi_stride * idx32[lvl2_idx][0] + idx32[lvl2_idx][1]; + MB_MODE_INFO **mi_32 = mi_params->mi_grid_base + idx_str + idx_str1; + if (*mi_32 == NULL) continue; + + if (mi_params->mi_cols <= + mi_col + idx64[lvl1_idx][1] + idx32[lvl2_idx][1] || + mi_params->mi_rows <= + mi_row + idx64[lvl1_idx][0] + idx32[lvl2_idx][0]) + continue; + const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3; + if ((*mi_32)->bsize == BLOCK_32X32) { + if (vt->split[lvl1_idx] + .split[lvl2_idx] + .part_variances.none.variance < threshold_32x32) + part_info->variance_low[25 + (lvl1_idx << 2) + lvl2_idx] = 1; + } else { + // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block + // inside. 
+ if ((*mi_32)->bsize == BLOCK_16X16 || + (*mi_32)->bsize == BLOCK_32X16 || + (*mi_32)->bsize == BLOCK_16X32) { + for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) { + VPartVar *none_var = &vt->split[lvl1_idx] + .split[lvl2_idx] + .split[lvl3_idx] + .part_variances.none; + if (none_var->variance < (thresholds[3] >> 8)) + part_info->variance_low[41 + (lvl1_idx << 4) + + (lvl2_idx << 2) + lvl3_idx] = 1; + } + } + } + } + } + } + } +} + +static AOM_INLINE void set_low_temp_var_flag( + AV1_COMP *cpi, PartitionSearchInfo *part_info, MACROBLOCKD *xd, + VP128x128 *vt, int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, + int mi_col, int mi_row, const bool is_small_sb) { + AV1_COMMON *const cm = &cpi->common; + // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected. + // If the temporal variance is small set the flag + // variance_low for the block. The variance threshold can be adjusted, the + // higher the more aggressive. + if (ref_frame_partition == LAST_FRAME) { + if (is_small_sb) + set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd, + &(vt->split[0]), thresholds, mi_col, mi_row); + else + set_low_temp_var_flag_128x128(&cm->mi_params, part_info, xd, vt, + thresholds, mi_col, mi_row); + } +} + +static const int pos_shift_16x16[4][4] = { + { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 } +}; + +int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + // Relative indices of MB inside the superblock. + const int mi_x = mi_row & 0xF; + const int mi_y = mi_col & 0xF; + // Relative indices of 16x16 block inside the superblock. + const int i = mi_x >> 2; + const int j = mi_y >> 2; + int force_skip_low_temp_var = 0; + // Set force_skip_low_temp_var based on the block size and block offset. + switch (bsize) { + case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break; + case BLOCK_64X32: + if (!mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[1]; + } else if (!mi_y && mi_x) { + force_skip_low_temp_var = variance_low[2]; + } + break; + case BLOCK_32X64: + if (!mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[3]; + } else if (mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[4]; + } + break; + case BLOCK_32X32: + if (!mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[5]; + } else if (mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[6]; + } else if (!mi_y && mi_x) { + force_skip_low_temp_var = variance_low[7]; + } else if (mi_y && mi_x) { + force_skip_low_temp_var = variance_low[8]; + } + break; + case BLOCK_32X16: + case BLOCK_16X32: + case BLOCK_16X16: + force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]]; + break; + default: break; + } + + return force_skip_low_temp_var; +} + +int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + int force_skip_low_temp_var = 0; + int x, y; + x = (mi_col & 0x1F) >> 4; + // y = (mi_row & 0x1F) >> 4; + // const int idx64 = (y << 1) + x; + y = (mi_row & 0x17) >> 3; + const int idx64 = y + x; + + x = (mi_col & 0xF) >> 3; + // y = (mi_row & 0xF) >> 3; + // const int idx32 = (y << 1) + x; + y = (mi_row & 0xB) >> 2; + const int idx32 = y + x; + + x = (mi_col & 0x7) >> 2; + // y = (mi_row & 0x7) >> 2; + // const int idx16 = (y << 1) + x; + y = (mi_row & 0x5) >> 1; + const int idx16 = y + x; + // Set force_skip_low_temp_var based on the block size and block offset. 
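+  // The variance_low[] slots read below follow the layout written by
+  // set_low_temp_var_flag_128x128():
+  //   [0] 128x128, [1..2] 128x64, [3..4] 64x128, [5..8] 64x64,
+  //   [9..16] 64x32, [17..24] 32x64, [25..40] 32x32, [41..104] 16x16.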
+ switch (bsize) { + case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break; + case BLOCK_128X64: + assert((mi_col & 0x1F) == 0); + force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)]; + break; + case BLOCK_64X128: + assert((mi_row & 0x1F) == 0); + force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)]; + break; + case BLOCK_64X64: + // Location of this 64x64 block inside the 128x128 superblock + force_skip_low_temp_var = variance_low[5 + idx64]; + break; + case BLOCK_64X32: + x = (mi_col & 0x1F) >> 4; + y = (mi_row & 0x1F) >> 3; + /* + .---------------.---------------. + | x=0,y=0,idx=0 | x=0,y=0,idx=2 | + :---------------+---------------: + | x=0,y=1,idx=1 | x=1,y=1,idx=3 | + :---------------+---------------: + | x=0,y=2,idx=4 | x=1,y=2,idx=6 | + :---------------+---------------: + | x=0,y=3,idx=5 | x=1,y=3,idx=7 | + '---------------'---------------' + */ + const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2); + force_skip_low_temp_var = variance_low[9 + idx64x32]; + break; + case BLOCK_32X64: + x = (mi_col & 0x1F) >> 3; + y = (mi_row & 0x1F) >> 4; + const int idx32x64 = (y << 2) + x; + force_skip_low_temp_var = variance_low[17 + idx32x64]; + break; + case BLOCK_32X32: + force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32]; + break; + case BLOCK_32X16: + case BLOCK_16X32: + case BLOCK_16X16: + force_skip_low_temp_var = + variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16]; + break; + default: break; + } + return force_skip_low_temp_var; +} + +void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int qindex, + int content_lowsumdiff) { + SPEED_FEATURES *const sf = &cpi->sf; + if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) { + return; + } else { + set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, 0, qindex, + content_lowsumdiff, 0, 0, 0, 0); + // The threshold below is not changed locally. + cpi->vbp_info.threshold_minmax = 15 + (qindex >> 3); + } +} + +static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int y_sad, + unsigned int y_sad_g, + unsigned int y_sad_alt, bool is_key_frame, + bool zero_motion, unsigned int *uv_sad) { + MACROBLOCKD *xd = &x->e_mbd; + const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; + int shift_upper_limit = 1; + int shift_lower_limit = 3; + int fac_uv = 6; + if (is_key_frame || cpi->oxcf.tool_cfg.enable_monochrome) return; + + // Use lower threshold (more conservative in setting color flag) for + // higher resolutions non-screen, which tend to have more camera noise. + // Since this may be used to skip compound mode in nonrd pickmode, which + // is generally more effective for higher resolutions, better to be more + // conservative. + if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { + if (cpi->common.width * cpi->common.height >= RESOLUTION_1080P) + fac_uv = 3; + else + fac_uv = 5; + } + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && + cpi->rc.high_source_sad) { + shift_lower_limit = 7; + } else if (source_sad_nonrd >= kMedSad && x->source_variance > 500 && + cpi->common.width * cpi->common.height >= 640 * 360) { + shift_upper_limit = 2; + shift_lower_limit = source_sad_nonrd > kMedSad ? 
5 : 4; + } + + MB_MODE_INFO *mi = xd->mi[0]; + const AV1_COMMON *const cm = &cpi->common; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + const YV12_BUFFER_CONFIG *yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME); + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, LAST_FRAME); + struct buf_2d dst; + unsigned int uv_sad_g = 0; + unsigned int uv_sad_alt = 0; + + for (int plane = AOM_PLANE_U; plane < MAX_MB_PLANE; ++plane) { + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + if (bs != BLOCK_INVALID) { + // For last: + if (zero_motion) { + if (mi->ref_frame[0] == LAST_FRAME) { + uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride); + } else { + uint8_t *src = (plane == 1) ? yv12->u_buffer : yv12->v_buffer; + setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12->uv_crop_width, + yv12->uv_crop_height, yv12->uv_stride, xd->mi_row, + xd->mi_col, sf, xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y); + + uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, dst.buf, dst.stride); + } + } else { + uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); + } + + // For golden: + if (y_sad_g != UINT_MAX) { + uint8_t *src = (plane == 1) ? yv12_g->u_buffer : yv12_g->v_buffer; + setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_g->uv_crop_width, + yv12_g->uv_crop_height, yv12_g->uv_stride, xd->mi_row, + xd->mi_col, sf, xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y); + uv_sad_g = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, dst.buf, + dst.stride); + } + + // For altref: + if (y_sad_alt != UINT_MAX) { + uint8_t *src = (plane == 1) ? yv12_alt->u_buffer : yv12_alt->v_buffer; + setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_alt->uv_crop_width, + yv12_alt->uv_crop_height, yv12_alt->uv_stride, + xd->mi_row, xd->mi_col, sf, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y); + uv_sad_alt = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, + dst.buf, dst.stride); + } + } + + if (uv_sad[plane - 1] > (y_sad >> shift_upper_limit)) + x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 1; + else if (uv_sad[plane - 1] < (y_sad >> shift_lower_limit)) + x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 0; + // Borderline case: to be refined at coding block level in nonrd_pickmode, + // for coding block size < sb_size. + else + x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 2; + + x->color_sensitivity_sb_g[COLOR_SENS_IDX(plane)] = + uv_sad_g > y_sad_g / fac_uv; + x->color_sensitivity_sb_alt[COLOR_SENS_IDX(plane)] = + uv_sad_alt > y_sad_alt / fac_uv; + } +} + +static void fill_variance_tree_leaves( + AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, PART_EVAL_STATUS *force_split, + int avg_16x16[][4], int maxvar_16x16[][4], int minvar_16x16[][4], + int64_t *thresholds, const uint8_t *src_buf, int src_stride, + const uint8_t *dst_buf, int dst_stride, bool is_key_frame, + const bool is_small_sb) { + MACROBLOCKD *xd = &x->e_mbd; + const int num_64x64_blocks = is_small_sb ? 
1 : 4; + // TODO(kyslov) Bring back compute_minmax_variance with content type detection + const int compute_minmax_variance = 0; + const int segment_id = xd->mi[0]->segment_id; + int pixels_wide = 128, pixels_high = 128; + int border_offset_4x4 = 0; + int temporal_denoising = cpi->sf.rt_sf.use_rtc_tf; + // dst_buf pointer is not used for is_key_frame, so it should be NULL. + assert(IMPLIES(is_key_frame, dst_buf == NULL)); + if (is_small_sb) { + pixels_wide = 64; + pixels_high = 64; + } + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); + if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); +#if CONFIG_AV1_TEMPORAL_DENOISING + temporal_denoising |= cpi->oxcf.noise_sensitivity; +#endif + // For temporal filtering or temporal denoiser enabled: since the source + // is modified we need to avoid 4x4 avg along superblock boundary, since + // simd code will load 8 pixels for 4x4 avg and so can access source + // data outside superblock (while its being modified by temporal filter). + // Temporal filtering is never done on key frames. + if (!is_key_frame && temporal_denoising) border_offset_4x4 = 4; + for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; blk64_idx++) { + const int x64_idx = GET_BLK_IDX_X(blk64_idx, 6); + const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 6); + const int blk64_scale_idx = blk64_idx << 2; + force_split[blk64_idx + 1] = PART_EVAL_ALL; + + for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { + const int x32_idx = x64_idx + GET_BLK_IDX_X(lvl1_idx, 5); + const int y32_idx = y64_idx + GET_BLK_IDX_Y(lvl1_idx, 5); + const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2; + force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ALL; + avg_16x16[blk64_idx][lvl1_idx] = 0; + maxvar_16x16[blk64_idx][lvl1_idx] = 0; + minvar_16x16[blk64_idx][lvl1_idx] = INT_MAX; + for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { + const int x16_idx = x32_idx + GET_BLK_IDX_X(lvl2_idx, 4); + const int y16_idx = y32_idx + GET_BLK_IDX_Y(lvl2_idx, 4); + const int split_index = 21 + lvl1_scale_idx + lvl2_idx; + VP16x16 *vst = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; + force_split[split_index] = PART_EVAL_ALL; + if (is_key_frame) { + // Go down to 4x4 down-sampling for variance. + for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) { + const int x8_idx = x16_idx + GET_BLK_IDX_X(lvl3_idx, 3); + const int y8_idx = y16_idx + GET_BLK_IDX_Y(lvl3_idx, 3); + VP8x8 *vst2 = &vst->split[lvl3_idx]; + fill_variance_4x4avg(src_buf, src_stride, x8_idx, y8_idx, vst2, +#if CONFIG_AV1_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high, border_offset_4x4); + } + } else { + fill_variance_8x8avg(src_buf, src_stride, dst_buf, dst_stride, + x16_idx, y16_idx, vst, is_cur_buf_hbd(xd), + pixels_wide, pixels_high); + + fill_variance_tree(vst, BLOCK_16X16); + VPartVar *none_var = &vt->split[blk64_idx] + .split[lvl1_idx] + .split[lvl2_idx] + .part_variances.none; + get_variance(none_var); + const int val_none_var = none_var->variance; + avg_16x16[blk64_idx][lvl1_idx] += val_none_var; + minvar_16x16[blk64_idx][lvl1_idx] = + AOMMIN(minvar_16x16[blk64_idx][lvl1_idx], val_none_var); + maxvar_16x16[blk64_idx][lvl1_idx] = + AOMMAX(maxvar_16x16[blk64_idx][lvl1_idx], val_none_var); + if (val_none_var > thresholds[3]) { + // 16X16 variance is above threshold for split, so force split to + // 8x8 for this 16x16 block (this also forces splits for upper + // levels). 
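+            // force_split[] slots used here: [0] whole superblock,
+            // [1..4] 64x64, [5..20] 32x32, [21..84] 16x16 (for a 64x64
+            // superblock only the blk64_idx == 0 slots are touched).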
+ force_split[split_index] = PART_EVAL_ONLY_SPLIT; + force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; + force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; + force_split[0] = PART_EVAL_ONLY_SPLIT; + } else if (!cyclic_refresh_segment_id_boosted(segment_id) && + compute_minmax_variance && val_none_var > thresholds[2]) { + // We have some nominal amount of 16x16 variance (based on average), + // compute the minmax over the 8x8 sub-blocks, and if above + // threshold, force split to 8x8 block for this 16x16 block. + int minmax = compute_minmax_8x8(src_buf, src_stride, dst_buf, + dst_stride, x16_idx, y16_idx, +#if CONFIG_AV1_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high); + const int thresh_minmax = (int)cpi->vbp_info.threshold_minmax; + if (minmax > thresh_minmax) { + force_split[split_index] = PART_EVAL_ONLY_SPLIT; + force_split[5 + blk64_scale_idx + lvl1_idx] = + PART_EVAL_ONLY_SPLIT; + force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; + force_split[0] = PART_EVAL_ONLY_SPLIT; + } + } + } + } + } + } +} + +static AOM_INLINE void set_ref_frame_for_partition( + AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + MV_REFERENCE_FRAME *ref_frame_partition, MB_MODE_INFO *mi, + unsigned int *y_sad, unsigned int *y_sad_g, unsigned int *y_sad_alt, + const YV12_BUFFER_CONFIG *yv12_g, const YV12_BUFFER_CONFIG *yv12_alt, + int mi_row, int mi_col, int num_planes) { + AV1_COMMON *const cm = &cpi->common; + const bool is_set_golden_ref_frame = + *y_sad_g < 0.9 * *y_sad && *y_sad_g < *y_sad_alt; + const bool is_set_altref_ref_frame = + *y_sad_alt < 0.9 * *y_sad && *y_sad_alt < *y_sad_g; + + if (is_set_golden_ref_frame) { + av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + *y_sad = *y_sad_g; + *ref_frame_partition = GOLDEN_FRAME; + x->nonrd_prune_ref_frame_search = 0; + x->sb_me_partition = 0; + } else if (is_set_altref_ref_frame) { + av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col, + get_ref_scale_factors(cm, ALTREF_FRAME), num_planes); + mi->ref_frame[0] = ALTREF_FRAME; + mi->mv[0].as_int = 0; + *y_sad = *y_sad_alt; + *ref_frame_partition = ALTREF_FRAME; + x->nonrd_prune_ref_frame_search = 0; + x->sb_me_partition = 0; + } else { + *ref_frame_partition = LAST_FRAME; + x->nonrd_prune_ref_frame_search = + cpi->sf.rt_sf.nonrd_prune_ref_frame_search; + } +} + +static AOM_FORCE_INLINE int mv_distance(const FULLPEL_MV *mv0, + const FULLPEL_MV *mv1) { + return abs(mv0->row - mv1->row) + abs(mv0->col - mv1->col); +} + +static AOM_INLINE void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x, + unsigned int *y_sad, + bool is_small_sb, + int est_motion) { + const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; + // TODO(yunqingwang@google.com): test if this condition works with other + // speeds. + if (est_motion > 2 && source_sad_nonrd > kMedSad) return; + + MACROBLOCKD *xd = &x->e_mbd; + BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; + MB_MODE_INFO *mi = xd->mi[0]; + + unsigned int above_y_sad = UINT_MAX; + unsigned int left_y_sad = UINT_MAX; + FULLPEL_MV above_mv = kZeroFullMv; + FULLPEL_MV left_mv = kZeroFullMv; + SubpelMvLimits subpel_mv_limits; + const MV dummy_mv = { 0, 0 }; + av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, &dummy_mv); + + // Current best MV + FULLPEL_MV best_mv = get_fullmv_from_mv(&mi->mv[0].as_mv); + const int multi = (est_motion > 2 && source_sad_nonrd > kLowSad) ? 
7 : 8; + + if (xd->up_available) { + const MB_MODE_INFO *above_mbmi = xd->above_mbmi; + if (above_mbmi->mode >= INTRA_MODE_END && + above_mbmi->ref_frame[0] == LAST_FRAME) { + MV temp = above_mbmi->mv[0].as_mv; + clamp_mv(&temp, &subpel_mv_limits); + above_mv = get_fullmv_from_mv(&temp); + + if (mv_distance(&best_mv, &above_mv) > 0) { + uint8_t const *ref_buf = + get_buf_from_fullmv(&xd->plane[0].pre[0], &above_mv); + above_y_sad = cpi->ppi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, ref_buf, + xd->plane[0].pre[0].stride); + } + } + } + if (xd->left_available) { + const MB_MODE_INFO *left_mbmi = xd->left_mbmi; + if (left_mbmi->mode >= INTRA_MODE_END && + left_mbmi->ref_frame[0] == LAST_FRAME) { + MV temp = left_mbmi->mv[0].as_mv; + clamp_mv(&temp, &subpel_mv_limits); + left_mv = get_fullmv_from_mv(&temp); + + if (mv_distance(&best_mv, &left_mv) > 0 && + mv_distance(&above_mv, &left_mv) > 0) { + uint8_t const *ref_buf = + get_buf_from_fullmv(&xd->plane[0].pre[0], &left_mv); + left_y_sad = cpi->ppi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, ref_buf, + xd->plane[0].pre[0].stride); + } + } + } + + if (above_y_sad < ((multi * *y_sad) >> 3) && above_y_sad < left_y_sad) { + *y_sad = above_y_sad; + mi->mv[0].as_mv = get_mv_from_fullmv(&above_mv); + clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits); + } + if (left_y_sad < ((multi * *y_sad) >> 3) && left_y_sad < above_y_sad) { + *y_sad = left_y_sad; + mi->mv[0].as_mv = get_mv_from_fullmv(&left_mv); + clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits); + } +} + +static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, + unsigned int *y_sad_g, unsigned int *y_sad_alt, + unsigned int *y_sad_last, + MV_REFERENCE_FRAME *ref_frame_partition, + struct scale_factors *sf_no_scale, int mi_row, + int mi_col, bool is_small_sb, bool scaled_ref_last) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int num_planes = av1_num_planes(cm); + bool scaled_ref_golden = false; + bool scaled_ref_alt = false; + BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; + MB_MODE_INFO *mi = xd->mi[0]; + const YV12_BUFFER_CONFIG *yv12 = + scaled_ref_last ? av1_get_scaled_ref_frame(cpi, LAST_FRAME) + : get_ref_frame_yv12_buf(cm, LAST_FRAME); + assert(yv12 != NULL); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + const YV12_BUFFER_CONFIG *yv12_alt = NULL; + // Check if LAST is a reference. For spatial layers always use it as + // reference scaling. + int use_last_ref = (cpi->ref_frame_flags & AOM_LAST_FLAG) || + cpi->svc.number_spatial_layers > 1; + int use_golden_ref = cpi->ref_frame_flags & AOM_GOLD_FLAG; + int use_alt_ref = cpi->ppi->rtc_ref.set_ref_frame_config || + cpi->sf.rt_sf.use_nonrd_altref_frame || + (cpi->sf.rt_sf.use_comp_ref_nonrd && + cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 1); + + // For 1 spatial layer: GOLDEN is another temporal reference. + // Check if it should be used as reference for partitioning. + if (cpi->svc.number_spatial_layers == 1 && use_golden_ref && + (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) { + yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (yv12_g && (yv12_g->y_crop_height != cm->height || + yv12_g->y_crop_width != cm->width)) { + yv12_g = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + scaled_ref_golden = true; + } + if (yv12_g && yv12_g != yv12) { + av1_setup_pre_planes( + xd, 0, yv12_g, mi_row, mi_col, + scaled_ref_golden ? 
NULL : get_ref_scale_factors(cm, GOLDEN_FRAME), + num_planes); + *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf( + x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, + xd->plane[AOM_PLANE_Y].pre[0].buf, + xd->plane[AOM_PLANE_Y].pre[0].stride); + } + } + + // For 1 spatial layer: ALTREF is another temporal reference. + // Check if it should be used as reference for partitioning. + if (cpi->svc.number_spatial_layers == 1 && use_alt_ref && + (cpi->ref_frame_flags & AOM_ALT_FLAG) && + (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) { + yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME); + if (yv12_alt && (yv12_alt->y_crop_height != cm->height || + yv12_alt->y_crop_width != cm->width)) { + yv12_alt = av1_get_scaled_ref_frame(cpi, ALTREF_FRAME); + scaled_ref_alt = true; + } + if (yv12_alt && yv12_alt != yv12) { + av1_setup_pre_planes( + xd, 0, yv12_alt, mi_row, mi_col, + scaled_ref_alt ? NULL : get_ref_scale_factors(cm, ALTREF_FRAME), + num_planes); + *y_sad_alt = cpi->ppi->fn_ptr[bsize].sdf( + x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, + xd->plane[AOM_PLANE_Y].pre[0].buf, + xd->plane[AOM_PLANE_Y].pre[0].stride); + } + } + + if (use_last_ref) { + const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd; + av1_setup_pre_planes( + xd, 0, yv12, mi_row, mi_col, + scaled_ref_last ? NULL : get_ref_scale_factors(cm, LAST_FRAME), + num_planes); + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NONE_FRAME; + mi->bsize = cm->seq_params->sb_size; + mi->mv[0].as_int = 0; + mi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + + int est_motion = cpi->sf.rt_sf.estimate_motion_for_var_based_partition; + // TODO(b/290596301): Look into adjusting this condition. + // There is regression on color content when + // estimate_motion_for_var_based_partition = 3 and high motion, + // so for now force it to 2 based on superblock sad. + if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2; + + if (est_motion == 1 || est_motion == 2) { + if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) { + // For screen only do int_pro_motion for spatial variance above + // threshold and motion level above LowSad. + if (x->source_variance > 100 && source_sad_nonrd > kLowSad) { + int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN; + int me_search_size_col = + is_screen ? 96 : block_size_wide[cm->seq_params->sb_size] >> 1; + // For screen use larger search size row motion to capture + // vertical scroll, which can be larger motion. + int me_search_size_row = + is_screen ? 192 : block_size_high[cm->seq_params->sb_size] >> 1; + unsigned int y_sad_zero; + *y_sad = av1_int_pro_motion_estimation( + cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv, + &y_sad_zero, me_search_size_col, me_search_size_row); + // The logic below selects whether the motion estimated in the + // int_pro_motion() will be used in nonrd_pickmode. Only do this + // for screen for now. + if (is_screen) { + unsigned int thresh_sad = + (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000; + if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) { + x->sb_me_partition = 1; + x->sb_me_mv.as_int = mi->mv[0].as_int; + } else { + x->sb_me_partition = 0; + // Fall back to using zero motion. 
+ *y_sad = y_sad_zero; + mi->mv[0].as_int = 0; + } + } + } + } + } + + if (*y_sad == UINT_MAX) { + *y_sad = cpi->ppi->fn_ptr[bsize].sdf( + x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, + xd->plane[AOM_PLANE_Y].pre[0].buf, + xd->plane[AOM_PLANE_Y].pre[0].stride); + } + + // Evaluate if neighbours' MVs give better predictions. Zero MV is tested + // already, so only non-zero MVs are tested here. Here the neighbour blocks + // are the first block above or left to this superblock. + if (est_motion >= 2 && (xd->up_available || xd->left_available)) + evaluate_neighbour_mvs(cpi, x, y_sad, is_small_sb, est_motion); + + *y_sad_last = *y_sad; + } + + // Pick the ref frame for partitioning, use golden or altref frame only if + // its lower sad, bias to LAST with factor 0.9. + set_ref_frame_for_partition(cpi, x, xd, ref_frame_partition, mi, y_sad, + y_sad_g, y_sad_alt, yv12_g, yv12_alt, mi_row, + mi_col, num_planes); + + // Only calculate the predictor for non-zero MV. + if (mi->mv[0].as_int != 0) { + if (!scaled_ref_last) { + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + } else { + xd->block_ref_scale_factors[0] = sf_no_scale; + xd->block_ref_scale_factors[1] = sf_no_scale; + } + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, + cm->seq_params->sb_size, AOM_PLANE_Y, + num_planes - 1); + } +} + +// Decides whether to split or merge a 16x16 partition block in variance based +// partitioning based on the 8x8 sub-block variances. +static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var( + VP16x16 *var_16x16_info, int64_t threshold16) { + int max_8x8_var = 0, min_8x8_var = INT_MAX; + for (int split_idx = 0; split_idx < 4; split_idx++) { + get_variance(&var_16x16_info->split[split_idx].part_variances.none); + int this_8x8_var = + var_16x16_info->split[split_idx].part_variances.none.variance; + max_8x8_var = AOMMAX(this_8x8_var, max_8x8_var); + min_8x8_var = AOMMIN(this_8x8_var, min_8x8_var); + } + // If the difference between maximum and minimum sub-block variances is high, + // then only evaluate PARTITION_SPLIT for the 16x16 block. Otherwise, evaluate + // only PARTITION_NONE. The shift factor for threshold16 has been derived + // empirically. + return ((max_8x8_var - min_8x8_var) > (threshold16 << 2)) + ? PART_EVAL_ONLY_SPLIT + : PART_EVAL_ONLY_NONE; +} + +static AOM_INLINE bool is_set_force_zeromv_skip_based_on_src_sad( + int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) { + if (set_zeromv_skip_based_on_source_sad == 0) return false; + + if (set_zeromv_skip_based_on_source_sad >= 3) + return source_sad_nonrd <= kLowSad; + else if (set_zeromv_skip_based_on_source_sad >= 2) + return source_sad_nonrd <= kVeryLowSad; + else if (set_zeromv_skip_based_on_source_sad >= 1) + return source_sad_nonrd == kZeroSad; + + return false; +} + +static AOM_INLINE bool set_force_zeromv_skip_for_sb( + AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP128x128 *vt, + unsigned int *uv_sad, int mi_row, int mi_col, unsigned int y_sad, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &cpi->common; + if (!is_set_force_zeromv_skip_based_on_src_sad( + cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad, + x->content_state_sb.source_sad_nonrd)) + return false; + int shift = cpi->sf.rt_sf.increase_source_sad_thresh ? 
1 : 0; + const int block_width = mi_size_wide[cm->seq_params->sb_size]; + const int block_height = mi_size_high[cm->seq_params->sb_size]; + const unsigned int thresh_exit_part_y = + cpi->zeromv_skip_thresh_exit_part[bsize] << shift; + unsigned int thresh_exit_part_uv = + CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y) << shift; + // Be more aggressive in UV threshold if source_sad >= VeryLowSad + // to suppreess visual artifact caused by the speed feature: + // set_zeromv_skip_based_on_source_sad = 2. For now only for + // part_early_exit_zeromv = 1. + if (x->content_state_sb.source_sad_nonrd >= kVeryLowSad && + cpi->sf.rt_sf.part_early_exit_zeromv == 1) + thresh_exit_part_uv = thresh_exit_part_uv >> 3; + if (mi_col + block_width <= tile->mi_col_end && + mi_row + block_height <= tile->mi_row_end && y_sad < thresh_exit_part_y && + uv_sad[0] < thresh_exit_part_uv && uv_sad[1] < thresh_exit_part_uv) { + set_block_size(cpi, mi_row, mi_col, bsize); + x->force_zeromv_skip_for_sb = 1; + aom_free(vt); + // Partition shape is set here at SB level. + // Exit needs to happen from av1_choose_var_based_partitioning(). + return true; + } else if (x->content_state_sb.source_sad_nonrd == kZeroSad && + cpi->sf.rt_sf.part_early_exit_zeromv >= 2) + x->force_zeromv_skip_for_sb = 2; + return false; +} + +int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + ThreadData *td, MACROBLOCK *x, int mi_row, + int mi_col) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, choose_var_based_partitioning_time); +#endif + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds; + PART_EVAL_STATUS force_split[85]; + int avg_64x64; + int max_var_32x32[4]; + int min_var_32x32[4]; + int var_32x32; + int var_64x64; + int min_var_64x64 = INT_MAX; + int max_var_64x64 = 0; + int avg_16x16[4][4]; + int maxvar_16x16[4][4]; + int minvar_16x16[4][4]; + const uint8_t *src_buf; + const uint8_t *dst_buf; + int dst_stride; + unsigned int uv_sad[MAX_MB_PLANE - 1]; + NOISE_LEVEL noise_level = kLow; + bool is_zero_motion = true; + bool scaled_ref_last = false; + struct scale_factors sf_no_scale; + av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height, + cm->width, cm->height); + + bool is_key_frame = + (frame_is_intra_only(cm) || + (cpi->ppi->use_svc && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); + + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const bool is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); + const int num_64x64_blocks = is_small_sb ? 1 : 4; + + unsigned int y_sad = UINT_MAX; + unsigned int y_sad_g = UINT_MAX; + unsigned int y_sad_alt = UINT_MAX; + unsigned int y_sad_last = UINT_MAX; + BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; + + // Ref frame used in partitioning. + MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; + + int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1], + vbp_thresholds[2], vbp_thresholds[3], + vbp_thresholds[4] }; + + const int segment_id = xd->mi[0]->segment_id; + uint64_t blk_sad = 0; + if (cpi->src_sad_blk_64x64 != NULL && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { + const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) + ? 
(cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; + const int sb_cols = + (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; + const int sbi_col = mi_col / sb_size_by_mb; + const int sbi_row = mi_row / sb_size_by_mb; + blk_sad = cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols]; + } + + const bool is_segment_id_boosted = + cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cyclic_refresh_segment_id_boosted(segment_id); + const int qindex = + is_segment_id_boosted + ? av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) + : cm->quant_params.base_qindex; + set_vbp_thresholds( + cpi, thresholds, blk_sad, qindex, x->content_state_sb.low_sumdiff, + x->content_state_sb.source_sad_nonrd, x->content_state_sb.source_sad_rd, + is_segment_id_boosted, x->content_state_sb.lighting_change); + + src_buf = x->plane[AOM_PLANE_Y].src.buf; + int src_stride = x->plane[AOM_PLANE_Y].src.stride; + + // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks, + // 5-20 for the 16x16 blocks. + force_split[0] = PART_EVAL_ALL; + memset(x->part_search_info.variance_low, 0, + sizeof(x->part_search_info.variance_low)); + + // Check if LAST frame is NULL, and if so, treat this frame + // as a key frame, for the purpose of the superblock partitioning. + // LAST == NULL can happen in cases where enhancement spatial layers are + // enabled dyanmically and the only reference is the spatial(GOLDEN). + // If LAST frame has a different resolution: set the scaled_ref_last flag + // and check if ref_scaled is NULL. + if (!frame_is_intra_only(cm)) { + const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, LAST_FRAME); + if (ref == NULL) { + is_key_frame = true; + } else if (ref->y_crop_height != cm->height || + ref->y_crop_width != cm->width) { + scaled_ref_last = true; + const YV12_BUFFER_CONFIG *ref_scaled = + av1_get_scaled_ref_frame(cpi, LAST_FRAME); + if (ref_scaled == NULL) is_key_frame = true; + } + } + + x->source_variance = UINT_MAX; + // For nord_pickmode: compute source_variance, only for superblocks with + // some motion for now. This input can then be used to bias the partitioning + // or the chroma_check. + if (cpi->sf.rt_sf.use_nonrd_pick_mode && + x->content_state_sb.source_sad_nonrd > kLowSad) + x->source_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, cm->seq_params->sb_size, AOM_PLANE_Y); + + if (!is_key_frame) { + setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_alt, &y_sad_last, + &ref_frame_partition, &sf_no_scale, mi_row, mi_col, + is_small_sb, scaled_ref_last); + + MB_MODE_INFO *mi = xd->mi[0]; + // Use reference SB directly for zero mv. + if (mi->mv[0].as_int != 0) { + dst_buf = xd->plane[AOM_PLANE_Y].dst.buf; + dst_stride = xd->plane[AOM_PLANE_Y].dst.stride; + is_zero_motion = false; + } else { + dst_buf = xd->plane[AOM_PLANE_Y].pre[0].buf; + dst_stride = xd->plane[AOM_PLANE_Y].pre[0].stride; + } + } else { + dst_buf = NULL; + dst_stride = 0; + } + + // check and set the color sensitivity of sb. + av1_zero(uv_sad); + chroma_check(cpi, x, bsize, y_sad_last, y_sad_g, y_sad_alt, is_key_frame, + is_zero_motion, uv_sad); + + x->force_zeromv_skip_for_sb = 0; + + VP128x128 *vt; + AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt))); + vt->split = td->vt64x64; + + // If the superblock is completely static (zero source sad) and + // the y_sad (relative to LAST ref) is very small, take the sb_size partition + // and exit, and force zeromv_last skip mode for nonrd_pickmode. 
+ // Only do this on the base segment (so the QP-boosted segment, if applied, + // can still continue cleaning/ramping up the quality). + // Condition on color uv_sad is also added. + if (!is_key_frame && cpi->sf.rt_sf.part_early_exit_zeromv && + cpi->rc.frames_since_key > 30 && segment_id == CR_SEGMENT_ID_BASE && + ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0) { + // Exit here, if zero mv skip flag is set at SB level. + if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt, uv_sad, mi_row, mi_col, + y_sad, bsize)) + return 0; + } + + if (cpi->noise_estimate.enabled) + noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate); + + // Fill in the entire tree of 8x8 (for inter frames) or 4x4 (for key frames) + // variances for splits. + fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16, + minvar_16x16, thresholds, src_buf, src_stride, + dst_buf, dst_stride, is_key_frame, is_small_sb); + + avg_64x64 = 0; + for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) { + max_var_32x32[blk64_idx] = 0; + min_var_32x32[blk64_idx] = INT_MAX; + const int blk64_scale_idx = blk64_idx << 2; + for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { + const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2; + for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { + if (!is_key_frame) continue; + VP16x16 *vtemp = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; + for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) + fill_variance_tree(&vtemp->split[lvl3_idx], BLOCK_8X8); + fill_variance_tree(vtemp, BLOCK_16X16); + // If variance of this 16x16 block is above the threshold, force block + // to split. This also forces a split on the upper levels. + get_variance(&vtemp->part_variances.none); + if (vtemp->part_variances.none.variance > thresholds[3]) { + const int split_index = 21 + lvl1_scale_idx + lvl2_idx; + force_split[split_index] = + cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var + ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3]) + : PART_EVAL_ONLY_SPLIT; + force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; + force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; + force_split[0] = PART_EVAL_ONLY_SPLIT; + } + } + fill_variance_tree(&vt->split[blk64_idx].split[lvl1_idx], BLOCK_32X32); + // If variance of this 32x32 block is above the threshold, or if its above + // (some threshold of) the average variance over the sub-16x16 blocks, + // then force this block to split. This also forces a split on the upper + // (64x64) level. 
+ uint64_t frame_sad_thresh = 20000; + const int is_360p_or_smaller = cm->width * cm->height <= RESOLUTION_360P; + if (cpi->svc.number_temporal_layers > 2 && + cpi->svc.temporal_layer_id == 0) + frame_sad_thresh = frame_sad_thresh << 1; + if (force_split[5 + blk64_scale_idx + lvl1_idx] == PART_EVAL_ALL) { + get_variance(&vt->split[blk64_idx].split[lvl1_idx].part_variances.none); + var_32x32 = + vt->split[blk64_idx].split[lvl1_idx].part_variances.none.variance; + max_var_32x32[blk64_idx] = AOMMAX(var_32x32, max_var_32x32[blk64_idx]); + min_var_32x32[blk64_idx] = AOMMIN(var_32x32, min_var_32x32[blk64_idx]); + const int max_min_var_16X16_diff = (maxvar_16x16[blk64_idx][lvl1_idx] - + minvar_16x16[blk64_idx][lvl1_idx]); + + if (var_32x32 > thresholds[2] || + (!is_key_frame && var_32x32 > (thresholds[2] >> 1) && + var_32x32 > (avg_16x16[blk64_idx][lvl1_idx] >> 1))) { + force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; + force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; + force_split[0] = PART_EVAL_ONLY_SPLIT; + } else if (!is_key_frame && is_360p_or_smaller && + ((max_min_var_16X16_diff > (thresholds[2] >> 1) && + maxvar_16x16[blk64_idx][lvl1_idx] > thresholds[2]) || + (cpi->sf.rt_sf.prefer_large_partition_blocks && + x->content_state_sb.source_sad_nonrd > kLowSad && + cpi->rc.frame_source_sad < frame_sad_thresh && + maxvar_16x16[blk64_idx][lvl1_idx] > (thresholds[2] >> 4) && + maxvar_16x16[blk64_idx][lvl1_idx] > + (minvar_16x16[blk64_idx][lvl1_idx] << 2)))) { + force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT; + force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT; + force_split[0] = PART_EVAL_ONLY_SPLIT; + } + } + } + if (force_split[1 + blk64_idx] == PART_EVAL_ALL) { + fill_variance_tree(&vt->split[blk64_idx], BLOCK_64X64); + get_variance(&vt->split[blk64_idx].part_variances.none); + var_64x64 = vt->split[blk64_idx].part_variances.none.variance; + max_var_64x64 = AOMMAX(var_64x64, max_var_64x64); + min_var_64x64 = AOMMIN(var_64x64, min_var_64x64); + // If the difference of the max-min variances of sub-blocks or max + // variance of a sub-block is above some threshold of then force this + // block to split. Only checking this for noise level >= medium, if + // encoder is in SVC or if we already forced large blocks. 
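+      // The thresholds applied below are roughly 3/8 of thresholds[1] for the
+      // max-min difference and half of thresholds[1] for the max variance.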
+ const int max_min_var_32x32_diff = + max_var_32x32[blk64_idx] - min_var_32x32[blk64_idx]; + const int check_max_var = max_var_32x32[blk64_idx] > thresholds[1] >> 1; + const bool check_noise_lvl = noise_level >= kMedium || + cpi->ppi->use_svc || + cpi->sf.rt_sf.prefer_large_partition_blocks; + const int64_t set_threshold = 3 * (thresholds[1] >> 3); + + if (!is_key_frame && max_min_var_32x32_diff > set_threshold && + check_max_var && check_noise_lvl) { + force_split[1 + blk64_idx] = PART_EVAL_ONLY_SPLIT; + force_split[0] = PART_EVAL_ONLY_SPLIT; + } + avg_64x64 += var_64x64; + } + if (is_small_sb) force_split[0] = PART_EVAL_ONLY_SPLIT; + } + + if (force_split[0] == PART_EVAL_ALL) { + fill_variance_tree(vt, BLOCK_128X128); + get_variance(&vt->part_variances.none); + const int set_avg_64x64 = (9 * avg_64x64) >> 5; + if (!is_key_frame && vt->part_variances.none.variance > set_avg_64x64) + force_split[0] = PART_EVAL_ONLY_SPLIT; + + if (!is_key_frame && + (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) && + max_var_64x64 > thresholds[0] >> 1) + force_split[0] = PART_EVAL_ONLY_SPLIT; + } + + if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end || + !set_vt_partitioning(cpi, xd, tile, vt, BLOCK_128X128, mi_row, mi_col, + thresholds[0], BLOCK_16X16, force_split[0])) { + for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) { + const int x64_idx = GET_BLK_IDX_X(blk64_idx, 4); + const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 4); + const int blk64_scale_idx = blk64_idx << 2; + + // Now go through the entire structure, splitting every block size until + // we get to one that's got a variance lower than our threshold. + if (set_vt_partitioning(cpi, xd, tile, &vt->split[blk64_idx], BLOCK_64X64, + mi_row + y64_idx, mi_col + x64_idx, thresholds[1], + BLOCK_16X16, force_split[1 + blk64_idx])) + continue; + for (int lvl1_idx = 0; lvl1_idx < 4; ++lvl1_idx) { + const int x32_idx = GET_BLK_IDX_X(lvl1_idx, 3); + const int y32_idx = GET_BLK_IDX_Y(lvl1_idx, 3); + const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2; + if (set_vt_partitioning( + cpi, xd, tile, &vt->split[blk64_idx].split[lvl1_idx], + BLOCK_32X32, (mi_row + y64_idx + y32_idx), + (mi_col + x64_idx + x32_idx), thresholds[2], BLOCK_16X16, + force_split[5 + blk64_scale_idx + lvl1_idx])) + continue; + for (int lvl2_idx = 0; lvl2_idx < 4; ++lvl2_idx) { + const int x16_idx = GET_BLK_IDX_X(lvl2_idx, 2); + const int y16_idx = GET_BLK_IDX_Y(lvl2_idx, 2); + const int split_index = 21 + lvl1_scale_idx + lvl2_idx; + VP16x16 *vtemp = + &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; + if (set_vt_partitioning(cpi, xd, tile, vtemp, BLOCK_16X16, + mi_row + y64_idx + y32_idx + y16_idx, + mi_col + x64_idx + x32_idx + x16_idx, + thresholds[3], BLOCK_8X8, + force_split[split_index])) + continue; + for (int lvl3_idx = 0; lvl3_idx < 4; ++lvl3_idx) { + const int x8_idx = GET_BLK_IDX_X(lvl3_idx, 1); + const int y8_idx = GET_BLK_IDX_Y(lvl3_idx, 1); + set_block_size(cpi, (mi_row + y64_idx + y32_idx + y16_idx + y8_idx), + (mi_col + x64_idx + x32_idx + x16_idx + x8_idx), + BLOCK_8X8); + } + } + } + } + } + + if (cpi->sf.rt_sf.short_circuit_low_temp_var) { + set_low_temp_var_flag(cpi, &x->part_search_info, xd, vt, thresholds, + ref_frame_partition, mi_col, mi_row, is_small_sb); + } + + aom_free(vt); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, choose_var_based_partitioning_time); +#endif + return 0; +} diff --git a/third_party/aom/av1/encoder/var_based_part.h b/third_party/aom/av1/encoder/var_based_part.h new file 
mode 100644 index 0000000000..f912458307 --- /dev/null +++ b/third_party/aom/av1/encoder/var_based_part.h @@ -0,0 +1,104 @@ +/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+// Calculate block index x and y from split level and index
+#define GET_BLK_IDX_X(idx, level) (((idx) & (0x01)) << (level))
+#define GET_BLK_IDX_Y(idx, level) (((idx) >> (0x01)) << (level))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QINDEX_LARGE_BLOCK_THR \
+  100  // Use increased thresholds for midres for speed 9 when qindex is above
+       // this threshold
+
+#define CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part) \
+  ((3 * (thresh_exit_part)) >> 2)
+/*!\brief Set the thresholds for variance based partition.
+ *
+ * Set the variance split thresholds for the following block sizes:
+ * 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+ * 3 - vbp_threshold_16x16, 4 - vbp_threshold_8x8 (to split to 4x4 partition;
+ * currently only used on key frames). The thresholds are based on Q,
+ * resolution, noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]      cpi                Top level encoder structure
+ * \param[in]      q                  q index
+ * \param[in]      content_lowsumdiff Low sumdiff flag for superblock
+ *
+ * \remark Returns the set of thresholds in \c cpi->vbp_info.thresholds.
+ */
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+                                           int content_lowsumdiff);
+
+/*!\brief Variance based partition selection.
+ *
+ * Select the partitioning based on the variance of the residual signal,
+ * where the residual is generated as the difference between the source and
+ * the prediction. The prediction is the reconstructed LAST or reconstructed
+ * GOLDEN, whichever has the lower y sad. For LAST, an option exists (speed
+ * feature) to use motion compensation based on superblock motion via
+ * int_pro_motion_estimation. For key frames the reference is a fixed 128
+ * level, so the variance is the source variance. The variance is computed
+ * for downsampled inputs (8x8 or 4x4 downsampled), and selection is done
+ * top-down via a set of partition thresholds, defined for each block level
+ * and set based on Q, resolution, noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]       cpi          Top level encoder structure
+ * \param[in]       tile         Pointer to TileInfo
+ * \param[in]       td           Pointer to ThreadData
+ * \param[in]       x            Pointer to MACROBLOCK
+ * \param[in]       mi_row       Row coordinate of the superblock in a step
+ *                               size of MI_SIZE
+ * \param[in]       mi_col       Column coordinate of the superblock in a step
+ *                               size of MI_SIZE
+ *
+ * \return Returns the partition in \c xd->mi[0]->sb_type. Also sets the low
+ * temporal variance flag and the color sensitivity flag (both used in
+ * nonrd_pickmode).
+ */ +int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + ThreadData *td, MACROBLOCK *x, int mi_row, + int mi_col); + +// Read out the block's temporal variance for 64x64 SB case. +int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low, + int mi_row, int mi_col, + BLOCK_SIZE bsize); +// Read out the block's temporal variance for 128x128 SB case. +int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row, + int mi_col, BLOCK_SIZE bsize); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_VAR_BASED_PART_H_ diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c new file mode 100644 index 0000000000..40670178d7 --- /dev/null +++ b/third_party/aom/av1/encoder/wedge_utils.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" + +#include "aom_ports/mem.h" + +#include "aom_dsp/aom_dsp_common.h" + +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * Computes SSE of a compound predictor constructed from 2 fundamental + * predictors p0 and p1 using blending with mask. + * + * r1: Residuals of p1. + * (source - p1) + * d: Difference of p1 and p0. + * (p1 - p0) + * m: The blending mask + * N: Number of pixels + * + * 'r1', 'd', and 'm' are contiguous. + * + * Computes: + * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to: + * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2), + * where r0 is (source - p0), and r1 is (source - p1), which is in turn + * is equivalent to: + * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2), + * which is the SSE of the residuals of the compound predictor scaled up by + * MAX_MASK_VALUE**2. + * + * Note that we clamp the partial term in the loop to 16 bits signed. This is + * to facilitate equivalent SIMD implementation. It should have no effect if + * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always + * holds for 8 bit input, and on real input, it should hold practically always, + * as residuals are expected to be small. + */ +uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + uint64_t csse = 0; + int i; + + for (i = 0; i < N; i++) { + int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i]; + t = clamp(t, INT16_MIN, INT16_MAX); + csse += t * t; + } + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +/** + * Choose the mask sign for a compound predictor. + * + * ds: Difference of the squares of the residuals. + * r0**2 - r1**2 + * m: The blending mask + * N: Number of pixels + * limit: Pre-computed threshold value. + * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) + * + * 'ds' and 'm' are contiguous. + * + * Returns true if the negated mask has lower SSE compared to the positive + * mask. 
Computation is based on: + * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2) + * > + * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2) + * + * which can be simplified to: + * + * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) + * + * The right hand side does not depend on the mask, and needs to be passed as + * the 'limit' parameter. + * + * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left + * hand side is simply a scalar product between an int16_t and uint8_t vector. + * + * Note that for efficiency, ds is stored on 16 bits. Real input residuals + * being small, this should not cause a noticeable issue. + */ +int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc = 0; + + do { + acc += *ds++ * *m++; + } while (--N); + + return acc > limit; +} + +/** + * Compute the element-wise difference of the squares of 2 arrays. + * + * d: Difference of the squares of the inputs: a**2 - b**2 + * a: First input array + * b: Second input array + * N: Number of elements + * + * 'd', 'a', and 'b' are contiguous. + * + * The result is saturated to signed 16 bits. + */ +void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + int i; + + for (i = 0; i < N; i++) + d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX); +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c new file mode 100644 index 0000000000..494b0fdf15 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -0,0 +1,1409 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/x86/av1_txfm1d_sse4.h" + +void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int stride) { + __m128i buf0[32]; + __m128i buf1[32]; + const int32_t *cospi; + + int startidx = 0 * stride; + int endidx = 31 * stride; + // stage 0 + // stage 1 + buf1[0] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[1] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[2] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[3] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[4] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[5] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[6] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[7] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[8] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[9] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[10] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[11] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[12] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[13] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[14] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[15] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); + buf0[7] = 
_mm_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); 
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + cos_bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], + cos_bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm_add_epi32(buf0[23], 
buf0[22]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], + buf0[31], cos_bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + + startidx = 0 * stride; + endidx = 31 * stride; + // stage 9 + output[startidx] = buf0[0]; + output[endidx] = buf0[31]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[16]; + output[endidx] = buf0[15]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[8]; + output[endidx] = buf0[23]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[24]; + output[endidx] = buf0[7]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[4]; + output[endidx] = buf0[27]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[20]; + output[endidx] = buf0[11]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[12]; + output[endidx] = buf0[19]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[28]; + output[endidx] = buf0[3]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[2]; + output[endidx] = buf0[29]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[18]; + output[endidx] = buf0[13]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[10]; + output[endidx] = buf0[21]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[26]; + output[endidx] = buf0[5]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[6]; + output[endidx] = buf0[25]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[22]; + output[endidx] = buf0[9]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[14]; + output[endidx] = buf0[17]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[30]; + output[endidx] = buf0[1]; +} + +void av1_fadst4_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 4; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[4]; + __m128i buf1[4]; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // 
stage 0; + int j; + for (j = 0; j < 4; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + buf1[0] = buf0[3]; + buf1[1] = buf0[0]; + buf1[2] = buf0[1]; + buf1[3] = buf0[2]; + + // stage 2 + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], + cos_bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 3 + buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); + buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); + buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 5 + buf1[0] = buf0[0]; + buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]); + buf1[2] = buf0[3]; + buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]); + + for (j = 0; j < 4; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit, + const int instride, const int outstride) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]); + __m128i cospi_p32 = _mm_set1_epi32(cospi[32]); + __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]); + __m128i cospi_p48 = _mm_set1_epi32(cospi[48]); + __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]); + __m128i cospi_p16 = _mm_set1_epi32(cospi[16]); + __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]); + __m128i cospi_p56 = _mm_set1_epi32(cospi[56]); + __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]); + __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]); + __m128i cospi_p24 = _mm_set1_epi32(cospi[24]); + __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]); + __m128i cospi_p08 = _mm_set1_epi32(cospi[8]); + __m128i cospi_p40 = _mm_set1_epi32(cospi[40]); + __m128i cospi_p60 = _mm_set1_epi32(cospi[60]); + __m128i cospi_p04 = _mm_set1_epi32(cospi[4]); + __m128i cospi_p28 = _mm_set1_epi32(cospi[28]); + __m128i cospi_p36 = _mm_set1_epi32(cospi[36]); + __m128i cospi_p44 = _mm_set1_epi32(cospi[44]); + __m128i cospi_p20 = _mm_set1_epi32(cospi[20]); + __m128i cospi_p12 = _mm_set1_epi32(cospi[12]); + __m128i cospi_p52 = _mm_set1_epi32(cospi[52]); + __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]); + __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]); + __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]); + __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]); + __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]); + __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]); + __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]); + __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]); + __m128i cospi_p62 = _mm_set1_epi32(cospi[62]); + __m128i cospi_p02 = _mm_set1_epi32(cospi[2]); + __m128i cospi_p30 = _mm_set1_epi32(cospi[30]); + __m128i cospi_p34 = _mm_set1_epi32(cospi[34]); + __m128i cospi_p46 = _mm_set1_epi32(cospi[46]); + __m128i cospi_p18 = _mm_set1_epi32(cospi[18]); + __m128i cospi_p14 = _mm_set1_epi32(cospi[14]); + __m128i cospi_p50 = _mm_set1_epi32(cospi[50]); + __m128i cospi_p54 = _mm_set1_epi32(cospi[54]); + __m128i cospi_p10 = _mm_set1_epi32(cospi[10]); + __m128i cospi_p22 = _mm_set1_epi32(cospi[22]); + __m128i cospi_p42 = _mm_set1_epi32(cospi[42]); + __m128i cospi_p38 = _mm_set1_epi32(cospi[38]); + __m128i cospi_p26 = _mm_set1_epi32(cospi[26]); + __m128i cospi_p06 = _mm_set1_epi32(cospi[6]); + __m128i cospi_p58 = _mm_set1_epi32(cospi[58]); + __m128i cospi_p63 
= _mm_set1_epi32(cospi[63]); + __m128i cospi_p01 = _mm_set1_epi32(cospi[1]); + __m128i cospi_p31 = _mm_set1_epi32(cospi[31]); + __m128i cospi_p33 = _mm_set1_epi32(cospi[33]); + __m128i cospi_p47 = _mm_set1_epi32(cospi[47]); + __m128i cospi_p17 = _mm_set1_epi32(cospi[17]); + __m128i cospi_p15 = _mm_set1_epi32(cospi[15]); + __m128i cospi_p49 = _mm_set1_epi32(cospi[49]); + __m128i cospi_p55 = _mm_set1_epi32(cospi[55]); + __m128i cospi_p09 = _mm_set1_epi32(cospi[9]); + __m128i cospi_p23 = _mm_set1_epi32(cospi[23]); + __m128i cospi_p41 = _mm_set1_epi32(cospi[41]); + __m128i cospi_p39 = _mm_set1_epi32(cospi[39]); + __m128i cospi_p25 = _mm_set1_epi32(cospi[25]); + __m128i cospi_p07 = _mm_set1_epi32(cospi[7]); + __m128i cospi_p57 = _mm_set1_epi32(cospi[57]); + __m128i cospi_p59 = _mm_set1_epi32(cospi[59]); + __m128i cospi_p05 = _mm_set1_epi32(cospi[5]); + __m128i cospi_p27 = _mm_set1_epi32(cospi[27]); + __m128i cospi_p37 = _mm_set1_epi32(cospi[37]); + __m128i cospi_p43 = _mm_set1_epi32(cospi[43]); + __m128i cospi_p21 = _mm_set1_epi32(cospi[21]); + __m128i cospi_p11 = _mm_set1_epi32(cospi[11]); + __m128i cospi_p53 = _mm_set1_epi32(cospi[53]); + __m128i cospi_p51 = _mm_set1_epi32(cospi[51]); + __m128i cospi_p13 = _mm_set1_epi32(cospi[13]); + __m128i cospi_p19 = _mm_set1_epi32(cospi[19]); + __m128i cospi_p45 = _mm_set1_epi32(cospi[45]); + __m128i cospi_p35 = _mm_set1_epi32(cospi[35]); + __m128i cospi_p29 = _mm_set1_epi32(cospi[29]); + __m128i cospi_p03 = _mm_set1_epi32(cospi[3]); + __m128i cospi_p61 = _mm_set1_epi32(cospi[61]); + + int startidx = 0 * instride; + int endidx = 63 * instride; + // stage 1 + __m128i x1[64]; + x1[0] = _mm_add_epi32(input[startidx], input[endidx]); + x1[63] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[1] = _mm_add_epi32(input[startidx], input[endidx]); + x1[62] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[2] = _mm_add_epi32(input[startidx], input[endidx]); + x1[61] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[3] = _mm_add_epi32(input[startidx], input[endidx]); + x1[60] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[4] = _mm_add_epi32(input[startidx], input[endidx]); + x1[59] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[5] = _mm_add_epi32(input[startidx], input[endidx]); + x1[58] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[6] = _mm_add_epi32(input[startidx], input[endidx]); + x1[57] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[7] = _mm_add_epi32(input[startidx], input[endidx]); + x1[56] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[8] = _mm_add_epi32(input[startidx], input[endidx]); + x1[55] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[9] = _mm_add_epi32(input[startidx], input[endidx]); + x1[54] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[10] = _mm_add_epi32(input[startidx], input[endidx]); + x1[53] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[11] = _mm_add_epi32(input[startidx], input[endidx]); + x1[52] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + 
endidx -= instride; + x1[12] = _mm_add_epi32(input[startidx], input[endidx]); + x1[51] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[13] = _mm_add_epi32(input[startidx], input[endidx]); + x1[50] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[14] = _mm_add_epi32(input[startidx], input[endidx]); + x1[49] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[15] = _mm_add_epi32(input[startidx], input[endidx]); + x1[48] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[16] = _mm_add_epi32(input[startidx], input[endidx]); + x1[47] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[17] = _mm_add_epi32(input[startidx], input[endidx]); + x1[46] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[18] = _mm_add_epi32(input[startidx], input[endidx]); + x1[45] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[19] = _mm_add_epi32(input[startidx], input[endidx]); + x1[44] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[20] = _mm_add_epi32(input[startidx], input[endidx]); + x1[43] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[21] = _mm_add_epi32(input[startidx], input[endidx]); + x1[42] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[22] = _mm_add_epi32(input[startidx], input[endidx]); + x1[41] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[23] = _mm_add_epi32(input[startidx], input[endidx]); + x1[40] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[24] = _mm_add_epi32(input[startidx], input[endidx]); + x1[39] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[25] = _mm_add_epi32(input[startidx], input[endidx]); + x1[38] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[26] = _mm_add_epi32(input[startidx], input[endidx]); + x1[37] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[27] = _mm_add_epi32(input[startidx], input[endidx]); + x1[36] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[28] = _mm_add_epi32(input[startidx], input[endidx]); + x1[35] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[29] = _mm_add_epi32(input[startidx], input[endidx]); + x1[34] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[30] = _mm_add_epi32(input[startidx], input[endidx]); + x1[33] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[31] = _mm_add_epi32(input[startidx], input[endidx]); + x1[32] = _mm_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_add_epi32(x1[0], x1[31]); + x2[31] = _mm_sub_epi32(x1[0], x1[31]); + x2[1] = _mm_add_epi32(x1[1], x1[30]); + x2[30] = _mm_sub_epi32(x1[1], x1[30]); + x2[2] = _mm_add_epi32(x1[2], x1[29]); + x2[29] = _mm_sub_epi32(x1[2], x1[29]); + x2[3] = _mm_add_epi32(x1[3], x1[28]); + x2[28] = 
_mm_sub_epi32(x1[3], x1[28]); + x2[4] = _mm_add_epi32(x1[4], x1[27]); + x2[27] = _mm_sub_epi32(x1[4], x1[27]); + x2[5] = _mm_add_epi32(x1[5], x1[26]); + x2[26] = _mm_sub_epi32(x1[5], x1[26]); + x2[6] = _mm_add_epi32(x1[6], x1[25]); + x2[25] = _mm_sub_epi32(x1[6], x1[25]); + x2[7] = _mm_add_epi32(x1[7], x1[24]); + x2[24] = _mm_sub_epi32(x1[7], x1[24]); + x2[8] = _mm_add_epi32(x1[8], x1[23]); + x2[23] = _mm_sub_epi32(x1[8], x1[23]); + x2[9] = _mm_add_epi32(x1[9], x1[22]); + x2[22] = _mm_sub_epi32(x1[9], x1[22]); + x2[10] = _mm_add_epi32(x1[10], x1[21]); + x2[21] = _mm_sub_epi32(x1[10], x1[21]); + x2[11] = _mm_add_epi32(x1[11], x1[20]); + x2[20] = _mm_sub_epi32(x1[11], x1[20]); + x2[12] = _mm_add_epi32(x1[12], x1[19]); + x2[19] = _mm_sub_epi32(x1[12], x1[19]); + x2[13] = _mm_add_epi32(x1[13], x1[18]); + x2[18] = _mm_sub_epi32(x1[13], x1[18]); + x2[14] = _mm_add_epi32(x1[14], x1[17]); + x2[17] = _mm_sub_epi32(x1[14], x1[17]); + x2[15] = _mm_add_epi32(x1[15], x1[16]); + x2[16] = _mm_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], + __rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_add_epi32(x2[0], x2[15]); + x3[15] = _mm_sub_epi32(x2[0], x2[15]); + x3[1] = _mm_add_epi32(x2[1], x2[14]); + x3[14] = _mm_sub_epi32(x2[1], x2[14]); + x3[2] = _mm_add_epi32(x2[2], x2[13]); + x3[13] = _mm_sub_epi32(x2[2], x2[13]); + x3[3] = _mm_add_epi32(x2[3], x2[12]); + x3[12] = _mm_sub_epi32(x2[3], x2[12]); + x3[4] = _mm_add_epi32(x2[4], x2[11]); + x3[11] = _mm_sub_epi32(x2[4], x2[11]); + x3[5] = _mm_add_epi32(x2[5], x2[10]); + x3[10] = _mm_sub_epi32(x2[5], x2[10]); + x3[6] = _mm_add_epi32(x2[6], x2[9]); + x3[9] = _mm_sub_epi32(x2[6], x2[9]); + x3[7] = _mm_add_epi32(x2[7], x2[8]); + x3[8] = _mm_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], + __rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_add_epi32(x2[32], x2[47]); + x3[47] = _mm_sub_epi32(x2[32], x2[47]); + x3[33] = _mm_add_epi32(x2[33], x2[46]); + x3[46] = 
_mm_sub_epi32(x2[33], x2[46]); + x3[34] = _mm_add_epi32(x2[34], x2[45]); + x3[45] = _mm_sub_epi32(x2[34], x2[45]); + x3[35] = _mm_add_epi32(x2[35], x2[44]); + x3[44] = _mm_sub_epi32(x2[35], x2[44]); + x3[36] = _mm_add_epi32(x2[36], x2[43]); + x3[43] = _mm_sub_epi32(x2[36], x2[43]); + x3[37] = _mm_add_epi32(x2[37], x2[42]); + x3[42] = _mm_sub_epi32(x2[37], x2[42]); + x3[38] = _mm_add_epi32(x2[38], x2[41]); + x3[41] = _mm_sub_epi32(x2[38], x2[41]); + x3[39] = _mm_add_epi32(x2[39], x2[40]); + x3[40] = _mm_sub_epi32(x2[39], x2[40]); + x3[48] = _mm_sub_epi32(x2[63], x2[48]); + x3[63] = _mm_add_epi32(x2[63], x2[48]); + x3[49] = _mm_sub_epi32(x2[62], x2[49]); + x3[62] = _mm_add_epi32(x2[62], x2[49]); + x3[50] = _mm_sub_epi32(x2[61], x2[50]); + x3[61] = _mm_add_epi32(x2[61], x2[50]); + x3[51] = _mm_sub_epi32(x2[60], x2[51]); + x3[60] = _mm_add_epi32(x2[60], x2[51]); + x3[52] = _mm_sub_epi32(x2[59], x2[52]); + x3[59] = _mm_add_epi32(x2[59], x2[52]); + x3[53] = _mm_sub_epi32(x2[58], x2[53]); + x3[58] = _mm_add_epi32(x2[58], x2[53]); + x3[54] = _mm_sub_epi32(x2[57], x2[54]); + x3[57] = _mm_add_epi32(x2[57], x2[54]); + x3[55] = _mm_sub_epi32(x2[56], x2[55]); + x3[56] = _mm_add_epi32(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_add_epi32(x3[0], x3[7]); + x4[7] = _mm_sub_epi32(x3[0], x3[7]); + x4[1] = _mm_add_epi32(x3[1], x3[6]); + x4[6] = _mm_sub_epi32(x3[1], x3[6]); + x4[2] = _mm_add_epi32(x3[2], x3[5]); + x4[5] = _mm_sub_epi32(x3[2], x3[5]); + x4[3] = _mm_add_epi32(x3[3], x3[4]); + x4[4] = _mm_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], + __rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_add_epi32(x3[16], x3[23]); + x4[23] = _mm_sub_epi32(x3[16], x3[23]); + x4[17] = _mm_add_epi32(x3[17], x3[22]); + x4[22] = _mm_sub_epi32(x3[17], x3[22]); + x4[18] = _mm_add_epi32(x3[18], x3[21]); + x4[21] = _mm_sub_epi32(x3[18], x3[21]); + x4[19] = _mm_add_epi32(x3[19], x3[20]); + x4[20] = _mm_sub_epi32(x3[19], x3[20]); + x4[24] = _mm_sub_epi32(x3[31], x3[24]); + x4[31] = _mm_add_epi32(x3[31], x3[24]); + x4[25] = _mm_sub_epi32(x3[30], x3[25]); + x4[30] = _mm_add_epi32(x3[30], x3[25]); + x4[26] = _mm_sub_epi32(x3[29], x3[26]); + x4[29] = _mm_add_epi32(x3[29], x3[26]); + x4[27] = _mm_sub_epi32(x3[28], x3[27]); + x4[28] = _mm_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], + __rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] 
= x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_add_epi32(x4[0], x4[3]); + x5[3] = _mm_sub_epi32(x4[0], x4[3]); + x5[1] = _mm_add_epi32(x4[1], x4[2]); + x5[2] = _mm_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], + __rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm_add_epi32(x4[8], x4[11]); + x5[11] = _mm_sub_epi32(x4[8], x4[11]); + x5[9] = _mm_add_epi32(x4[9], x4[10]); + x5[10] = _mm_sub_epi32(x4[9], x4[10]); + x5[12] = _mm_sub_epi32(x4[15], x4[12]); + x5[15] = _mm_add_epi32(x4[15], x4[12]); + x5[13] = _mm_sub_epi32(x4[14], x4[13]); + x5[14] = _mm_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], + __rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_add_epi32(x4[32], x4[39]); + x5[39] = _mm_sub_epi32(x4[32], x4[39]); + x5[33] = _mm_add_epi32(x4[33], x4[38]); + x5[38] = _mm_sub_epi32(x4[33], x4[38]); + x5[34] = _mm_add_epi32(x4[34], x4[37]); + x5[37] = _mm_sub_epi32(x4[34], x4[37]); + x5[35] = _mm_add_epi32(x4[35], x4[36]); + x5[36] = _mm_sub_epi32(x4[35], x4[36]); + x5[40] = _mm_sub_epi32(x4[47], x4[40]); + x5[47] = _mm_add_epi32(x4[47], x4[40]); + x5[41] = _mm_sub_epi32(x4[46], x4[41]); + x5[46] = _mm_add_epi32(x4[46], x4[41]); + x5[42] = _mm_sub_epi32(x4[45], x4[42]); + x5[45] = _mm_add_epi32(x4[45], x4[42]); + x5[43] = _mm_sub_epi32(x4[44], x4[43]); + x5[44] = _mm_add_epi32(x4[44], x4[43]); + x5[48] = _mm_add_epi32(x4[48], x4[55]); + x5[55] = _mm_sub_epi32(x4[48], x4[55]); + x5[49] = _mm_add_epi32(x4[49], x4[54]); + x5[54] = _mm_sub_epi32(x4[49], x4[54]); + x5[50] = _mm_add_epi32(x4[50], x4[53]); + x5[53] = _mm_sub_epi32(x4[50], x4[53]); + x5[51] = _mm_add_epi32(x4[51], x4[52]); + x5[52] = _mm_sub_epi32(x4[51], x4[52]); + x5[56] = _mm_sub_epi32(x4[63], x4[56]); + x5[63] = _mm_add_epi32(x4[63], x4[56]); + x5[57] = _mm_sub_epi32(x4[62], x4[57]); + x5[62] = _mm_add_epi32(x4[62], x4[57]); + x5[58] = _mm_sub_epi32(x4[61], x4[58]); + x5[61] = _mm_add_epi32(x4[61], x4[58]); + x5[59] = _mm_sub_epi32(x4[60], x4[59]); + x5[60] = _mm_add_epi32(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3], + __rounding, cos_bit); + x6[4] = _mm_add_epi32(x5[4], x5[5]); + x6[5] = _mm_sub_epi32(x5[4], x5[5]); + x6[6] = _mm_sub_epi32(x5[7], x5[6]); + x6[7] = _mm_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], + __rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_add_epi32(x5[16], x5[19]); + x6[19] = _mm_sub_epi32(x5[16], x5[19]); + x6[17] = _mm_add_epi32(x5[17], x5[18]); + x6[18] = _mm_sub_epi32(x5[17], x5[18]); + x6[20] = _mm_sub_epi32(x5[23], x5[20]); + x6[23] = 
_mm_add_epi32(x5[23], x5[20]); + x6[21] = _mm_sub_epi32(x5[22], x5[21]); + x6[22] = _mm_add_epi32(x5[22], x5[21]); + x6[24] = _mm_add_epi32(x5[24], x5[27]); + x6[27] = _mm_sub_epi32(x5[24], x5[27]); + x6[25] = _mm_add_epi32(x5[25], x5[26]); + x6[26] = _mm_sub_epi32(x5[25], x5[26]); + x6[28] = _mm_sub_epi32(x5[31], x5[28]); + x6[31] = _mm_add_epi32(x5[31], x5[28]); + x6[29] = _mm_sub_epi32(x5[30], x5[29]); + x6[30] = _mm_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], + __rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], + __rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6], + __rounding, cos_bit); + x7[8] = _mm_add_epi32(x6[8], x6[9]); + x7[9] = _mm_sub_epi32(x6[8], x6[9]); + x7[10] = _mm_sub_epi32(x6[11], x6[10]); + x7[11] = _mm_add_epi32(x6[11], x6[10]); + x7[12] = _mm_add_epi32(x6[12], x6[13]); + x7[13] = _mm_sub_epi32(x6[12], x6[13]); + x7[14] = _mm_sub_epi32(x6[15], x6[14]); + x7[15] = _mm_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], + __rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], + __rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_add_epi32(x6[32], x6[35]); + x7[35] = _mm_sub_epi32(x6[32], x6[35]); + x7[33] = _mm_add_epi32(x6[33], x6[34]); + x7[34] = _mm_sub_epi32(x6[33], x6[34]); + x7[36] = _mm_sub_epi32(x6[39], x6[36]); + x7[39] = _mm_add_epi32(x6[39], x6[36]); + x7[37] = _mm_sub_epi32(x6[38], x6[37]); + x7[38] = _mm_add_epi32(x6[38], x6[37]); + x7[40] = _mm_add_epi32(x6[40], x6[43]); + x7[43] = _mm_sub_epi32(x6[40], x6[43]); + x7[41] = _mm_add_epi32(x6[41], x6[42]); + x7[42] = _mm_sub_epi32(x6[41], x6[42]); + x7[44] = _mm_sub_epi32(x6[47], x6[44]); + x7[47] = _mm_add_epi32(x6[47], x6[44]); + x7[45] = _mm_sub_epi32(x6[46], x6[45]); + x7[46] = _mm_add_epi32(x6[46], x6[45]); + x7[48] = _mm_add_epi32(x6[48], x6[51]); + x7[51] = _mm_sub_epi32(x6[48], x6[51]); + x7[49] = 
_mm_add_epi32(x6[49], x6[50]); + x7[50] = _mm_sub_epi32(x6[49], x6[50]); + x7[52] = _mm_sub_epi32(x6[55], x6[52]); + x7[55] = _mm_add_epi32(x6[55], x6[52]); + x7[53] = _mm_sub_epi32(x6[54], x6[53]); + x7[54] = _mm_add_epi32(x6[54], x6[53]); + x7[56] = _mm_add_epi32(x6[56], x6[59]); + x7[59] = _mm_sub_epi32(x6[56], x6[59]); + x7[57] = _mm_add_epi32(x6[57], x6[58]); + x7[58] = _mm_sub_epi32(x6[57], x6[58]); + x7[60] = _mm_sub_epi32(x6[63], x6[60]); + x7[63] = _mm_add_epi32(x6[63], x6[60]); + x7[61] = _mm_sub_epi32(x6[62], x6[61]); + x7[62] = _mm_add_epi32(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12], + __rounding, cos_bit); + x8[16] = _mm_add_epi32(x7[16], x7[17]); + x8[17] = _mm_sub_epi32(x7[16], x7[17]); + x8[18] = _mm_sub_epi32(x7[19], x7[18]); + x8[19] = _mm_add_epi32(x7[19], x7[18]); + x8[20] = _mm_add_epi32(x7[20], x7[21]); + x8[21] = _mm_sub_epi32(x7[20], x7[21]); + x8[22] = _mm_sub_epi32(x7[23], x7[22]); + x8[23] = _mm_add_epi32(x7[23], x7[22]); + x8[24] = _mm_add_epi32(x7[24], x7[25]); + x8[25] = _mm_sub_epi32(x7[24], x7[25]); + x8[26] = _mm_sub_epi32(x7[27], x7[26]); + x8[27] = _mm_add_epi32(x7[27], x7[26]); + x8[28] = _mm_add_epi32(x7[28], x7[29]); + x8[29] = _mm_sub_epi32(x7[28], x7[29]); + x8[30] = _mm_sub_epi32(x7[31], x7[30]); + x8[31] = _mm_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + __rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + __rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + __rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + __rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], 
x9[18], x9[29], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24], + __rounding, cos_bit); + x9[32] = _mm_add_epi32(x8[32], x8[33]); + x9[33] = _mm_sub_epi32(x8[32], x8[33]); + x9[34] = _mm_sub_epi32(x8[35], x8[34]); + x9[35] = _mm_add_epi32(x8[35], x8[34]); + x9[36] = _mm_add_epi32(x8[36], x8[37]); + x9[37] = _mm_sub_epi32(x8[36], x8[37]); + x9[38] = _mm_sub_epi32(x8[39], x8[38]); + x9[39] = _mm_add_epi32(x8[39], x8[38]); + x9[40] = _mm_add_epi32(x8[40], x8[41]); + x9[41] = _mm_sub_epi32(x8[40], x8[41]); + x9[42] = _mm_sub_epi32(x8[43], x8[42]); + x9[43] = _mm_add_epi32(x8[43], x8[42]); + x9[44] = _mm_add_epi32(x8[44], x8[45]); + x9[45] = _mm_sub_epi32(x8[44], x8[45]); + x9[46] = _mm_sub_epi32(x8[47], x8[46]); + x9[47] = _mm_add_epi32(x8[47], x8[46]); + x9[48] = _mm_add_epi32(x8[48], x8[49]); + x9[49] = _mm_sub_epi32(x8[48], x8[49]); + x9[50] = _mm_sub_epi32(x8[51], x8[50]); + x9[51] = _mm_add_epi32(x8[51], x8[50]); + x9[52] = _mm_add_epi32(x8[52], x8[53]); + x9[53] = _mm_sub_epi32(x8[52], x8[53]); + x9[54] = _mm_sub_epi32(x8[55], x8[54]); + x9[55] = _mm_add_epi32(x8[55], x8[54]); + x9[56] = _mm_add_epi32(x8[56], x8[57]); + x9[57] = _mm_sub_epi32(x8[56], x8[57]); + x9[58] = _mm_sub_epi32(x8[59], x8[58]); + x9[59] = _mm_add_epi32(x8[59], x8[58]); + x9[60] = _mm_add_epi32(x8[60], x8[61]); + x9[61] = _mm_sub_epi32(x8[60], x8[61]); + x9[62] = _mm_sub_epi32(x8[63], x8[62]); + x9[63] = _mm_add_epi32(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32], + x10[63], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33], + x10[62], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34], + x10[61], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35], + x10[60], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36], + x10[59], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37], + x10[58], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38], + x10[57], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39], + x10[56], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40], + x10[55], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p27, 
cospi_p37, x9[41], x9[54], x10[41], + x10[54], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42], + x10[53], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43], + x10[52], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44], + x10[51], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45], + x10[50], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46], + x10[49], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47], + x10[48], __rounding, cos_bit); + + startidx = 0 * outstride; + endidx = 63 * outstride; + // stage 11 + output[startidx] = x10[0]; + output[endidx] = x10[63]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[32]; + output[endidx] = x10[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[16]; + output[endidx] = x10[47]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[48]; + output[endidx] = x10[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[8]; + output[endidx] = x10[55]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[40]; + output[endidx] = x10[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[24]; + output[endidx] = x10[39]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[56]; + output[endidx] = x10[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[4]; + output[endidx] = x10[59]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[36]; + output[endidx] = x10[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[20]; + output[endidx] = x10[43]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[52]; + output[endidx] = x10[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[12]; + output[endidx] = x10[51]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[44]; + output[endidx] = x10[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[28]; + output[endidx] = x10[35]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[60]; + output[endidx] = x10[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[2]; + output[endidx] = x10[61]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[34]; + output[endidx] = x10[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[18]; + output[endidx] = x10[45]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[50]; + output[endidx] = x10[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[10]; + output[endidx] = x10[53]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[42]; + output[endidx] = x10[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[26]; + output[endidx] = x10[37]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[58]; + output[endidx] = x10[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[6]; + output[endidx] = x10[57]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[38]; + output[endidx] = x10[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[22]; 
+ output[endidx] = x10[41]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[54]; + output[endidx] = x10[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[14]; + output[endidx] = x10[49]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[46]; + output[endidx] = x10[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[30]; + output[endidx] = x10[33]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[62]; + output[endidx] = x10[1]; +} + +void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int col_num) { + (void)cos_bit; + for (int i = 0; i < 32; i++) { + output[i * col_num] = _mm_slli_epi32(input[i * col_num], 2); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c new file mode 100644 index 0000000000..b143df3523 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c @@ -0,0 +1,3010 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/x86/av1_fwd_txfm_avx2.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m256i x1[16]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], 
input[14]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 6 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + + // stage 7 + output[0] = x1[0]; + output[1] = x1[8]; + output[2] = x1[4]; + output[3] = x1[12]; + output[4] = x1[2]; + output[5] = x1[10]; + output[6] = x1[6]; + output[7] = x1[14]; + output[8] = x1[1]; + output[9] = x1[9]; + output[10] = x1[5]; + output[11] = x1[13]; + output[12] = x1[3]; + output[13] = x1[11]; + output[14] = x1[7]; + output[15] = x1[15]; +} + +static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m24_m40 = 
pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m256i x1[32]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + 
btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, 
cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], 
cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]); + __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]); + __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]); + __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]); + __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]); + __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]); + __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]); + __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]); + __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]); + __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]); + __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]); + __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]); + __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]); + __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]); + __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]); + __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]); + __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]); + __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]); + __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]); + __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]); + __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]); + __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]); + __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]); + __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]); + __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]); + __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]); + __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]); + __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]); + __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]); + __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]); + __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]); + __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m256i x1[64]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_16_adds_subs_out_avx2(&x1[10], 
&x1[53], input[10], input[53]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[31]); + btf_16_adds_subs_avx2(&x1[1], &x1[30]); + btf_16_adds_subs_avx2(&x1[2], &x1[29]); + btf_16_adds_subs_avx2(&x1[3], &x1[28]); + btf_16_adds_subs_avx2(&x1[4], &x1[27]); + btf_16_adds_subs_avx2(&x1[5], &x1[26]); + btf_16_adds_subs_avx2(&x1[6], &x1[25]); + btf_16_adds_subs_avx2(&x1[7], &x1[24]); + btf_16_adds_subs_avx2(&x1[8], &x1[23]); + btf_16_adds_subs_avx2(&x1[9], &x1[22]); + btf_16_adds_subs_avx2(&x1[10], &x1[21]); + btf_16_adds_subs_avx2(&x1[11], &x1[20]); + btf_16_adds_subs_avx2(&x1[12], &x1[19]); + btf_16_adds_subs_avx2(&x1[13], &x1[18]); + btf_16_adds_subs_avx2(&x1[14], &x1[17]); + btf_16_adds_subs_avx2(&x1[15], &x1[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[47]); + 
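+  // Note: stage 3 here extends the 32-point schedule above with butterflies
+  // over the upper half (x1[32..63]) that only the 64-point transform needs.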
btf_16_adds_subs_avx2(&x1[33], &x1[46]); + btf_16_adds_subs_avx2(&x1[34], &x1[45]); + btf_16_adds_subs_avx2(&x1[35], &x1[44]); + btf_16_adds_subs_avx2(&x1[36], &x1[43]); + btf_16_adds_subs_avx2(&x1[37], &x1[42]); + btf_16_adds_subs_avx2(&x1[38], &x1[41]); + btf_16_adds_subs_avx2(&x1[39], &x1[40]); + btf_16_adds_subs_avx2(&x1[63], &x1[48]); + btf_16_adds_subs_avx2(&x1[62], &x1[49]); + btf_16_adds_subs_avx2(&x1[61], &x1[50]); + btf_16_adds_subs_avx2(&x1[60], &x1[51]); + btf_16_adds_subs_avx2(&x1[59], &x1[52]); + btf_16_adds_subs_avx2(&x1[58], &x1[53]); + btf_16_adds_subs_avx2(&x1[57], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[39]); + btf_16_adds_subs_avx2(&x1[33], &x1[38]); + btf_16_adds_subs_avx2(&x1[34], &x1[37]); + btf_16_adds_subs_avx2(&x1[35], &x1[36]); + btf_16_adds_subs_avx2(&x1[47], &x1[40]); + btf_16_adds_subs_avx2(&x1[46], &x1[41]); + btf_16_adds_subs_avx2(&x1[45], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[43]); + btf_16_adds_subs_avx2(&x1[48], &x1[55]); + btf_16_adds_subs_avx2(&x1[49], &x1[54]); + btf_16_adds_subs_avx2(&x1[50], &x1[53]); + btf_16_adds_subs_avx2(&x1[51], &x1[52]); + btf_16_adds_subs_avx2(&x1[63], &x1[56]); + btf_16_adds_subs_avx2(&x1[62], &x1[57]); + btf_16_adds_subs_avx2(&x1[61], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + 
btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[35]); + btf_16_adds_subs_avx2(&x1[33], &x1[34]); + btf_16_adds_subs_avx2(&x1[39], &x1[36]); + btf_16_adds_subs_avx2(&x1[38], &x1[37]); + btf_16_adds_subs_avx2(&x1[40], &x1[43]); + btf_16_adds_subs_avx2(&x1[41], &x1[42]); + btf_16_adds_subs_avx2(&x1[47], &x1[44]); + btf_16_adds_subs_avx2(&x1[46], &x1[45]); + btf_16_adds_subs_avx2(&x1[48], &x1[51]); + btf_16_adds_subs_avx2(&x1[49], &x1[50]); + btf_16_adds_subs_avx2(&x1[55], &x1[52]); + btf_16_adds_subs_avx2(&x1[54], &x1[53]); + btf_16_adds_subs_avx2(&x1[56], &x1[59]); + btf_16_adds_subs_avx2(&x1[57], &x1[58]); + btf_16_adds_subs_avx2(&x1[63], &x1[60]); + btf_16_adds_subs_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit); + 
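+  // Note: btf_16_w16_avx2(w0, w1, &a, &b) is the paired-cosine rotation: the
+  // interleaved a/b lanes are multiplied against w0/w1 with _mm256_madd_epi16,
+  // rounded by _r, and shifted right by cos_bit before being repacked.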
btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[33]); + btf_16_adds_subs_avx2(&x1[35], &x1[34]); + btf_16_adds_subs_avx2(&x1[36], &x1[37]); + btf_16_adds_subs_avx2(&x1[39], &x1[38]); + btf_16_adds_subs_avx2(&x1[40], &x1[41]); + btf_16_adds_subs_avx2(&x1[43], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[45]); + btf_16_adds_subs_avx2(&x1[47], &x1[46]); + btf_16_adds_subs_avx2(&x1[48], &x1[49]); + btf_16_adds_subs_avx2(&x1[51], &x1[50]); + btf_16_adds_subs_avx2(&x1[52], &x1[53]); + btf_16_adds_subs_avx2(&x1[55], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[57]); + btf_16_adds_subs_avx2(&x1[59], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[61]); + btf_16_adds_subs_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit); + btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = 
x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fdct32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + __m256i x1[32]; + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + // stage 0 + // stage 1 + btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + btf_32_add_sub_avx2(&x1[1], &x1[2]); + 
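+  // Note: fdct32_avx2 is the 32-bit-lane variant; each __m256i now carries
+  // eight int32 coefficients, so btf_32_add_sub_avx2 and btf_32_avx2_type0/1
+  // replace the 16-bit helpers used by the fdct16x* functions above.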
btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + 
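+  // (As in the 16-bit DCTs above, the outputs of stage 9 are written in
+  // bit-reversed index order.)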
output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = 
_mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + // stage 1 + __m256i x1[64]; + btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[31]); + btf_32_add_sub_avx2(&x1[1], &x1[30]); + btf_32_add_sub_avx2(&x1[2], &x1[29]); + btf_32_add_sub_avx2(&x1[3], &x1[28]); + btf_32_add_sub_avx2(&x1[4], &x1[27]); + btf_32_add_sub_avx2(&x1[5], &x1[26]); + btf_32_add_sub_avx2(&x1[6], &x1[25]); + btf_32_add_sub_avx2(&x1[7], &x1[24]); + btf_32_add_sub_avx2(&x1[8], &x1[23]); + 
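+  // Note: _r holds the rounding constant 1 << (cos_bit - 1); the rotation
+  // helpers add it before the arithmetic right shift so results round to
+  // nearest.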
btf_32_add_sub_avx2(&x1[9], &x1[22]); + btf_32_add_sub_avx2(&x1[10], &x1[21]); + btf_32_add_sub_avx2(&x1[11], &x1[20]); + btf_32_add_sub_avx2(&x1[12], &x1[19]); + btf_32_add_sub_avx2(&x1[13], &x1[18]); + btf_32_add_sub_avx2(&x1[14], &x1[17]); + btf_32_add_sub_avx2(&x1[15], &x1[16]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[47]); + btf_32_add_sub_avx2(&x1[33], &x1[46]); + btf_32_add_sub_avx2(&x1[34], &x1[45]); + btf_32_add_sub_avx2(&x1[35], &x1[44]); + btf_32_add_sub_avx2(&x1[36], &x1[43]); + btf_32_add_sub_avx2(&x1[37], &x1[42]); + btf_32_add_sub_avx2(&x1[38], &x1[41]); + btf_32_add_sub_avx2(&x1[39], &x1[40]); + btf_32_add_sub_avx2(&x1[63], &x1[48]); + btf_32_add_sub_avx2(&x1[62], &x1[49]); + btf_32_add_sub_avx2(&x1[61], &x1[50]); + btf_32_add_sub_avx2(&x1[60], &x1[51]); + btf_32_add_sub_avx2(&x1[59], &x1[52]); + btf_32_add_sub_avx2(&x1[58], &x1[53]); + btf_32_add_sub_avx2(&x1[57], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + 
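+  // Note: the *_new rotation helpers used in this function take pre-broadcast
+  // cosine vectors (the cospi_* __m256i values set up above) rather than the
+  // scalar cospi[] weights passed to btf_32_avx2_type0/type1 in fdct32_avx2.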
btf_32_add_sub_avx2(&x1[1], &x1[2]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[39]); + btf_32_add_sub_avx2(&x1[33], &x1[38]); + btf_32_add_sub_avx2(&x1[34], &x1[37]); + btf_32_add_sub_avx2(&x1[35], &x1[36]); + btf_32_add_sub_avx2(&x1[47], &x1[40]); + btf_32_add_sub_avx2(&x1[46], &x1[41]); + btf_32_add_sub_avx2(&x1[45], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[43]); + btf_32_add_sub_avx2(&x1[48], &x1[55]); + btf_32_add_sub_avx2(&x1[49], &x1[54]); + btf_32_add_sub_avx2(&x1[50], &x1[53]); + btf_32_add_sub_avx2(&x1[51], &x1[52]); + btf_32_add_sub_avx2(&x1[63], &x1[56]); + btf_32_add_sub_avx2(&x1[62], &x1[57]); + btf_32_add_sub_avx2(&x1[61], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[35]); + btf_32_add_sub_avx2(&x1[33], &x1[34]); + btf_32_add_sub_avx2(&x1[39], &x1[36]); + btf_32_add_sub_avx2(&x1[38], &x1[37]); + btf_32_add_sub_avx2(&x1[40], &x1[43]); + btf_32_add_sub_avx2(&x1[41], &x1[42]); + btf_32_add_sub_avx2(&x1[47], &x1[44]); + btf_32_add_sub_avx2(&x1[46], 
&x1[45]); + btf_32_add_sub_avx2(&x1[48], &x1[51]); + btf_32_add_sub_avx2(&x1[49], &x1[50]); + btf_32_add_sub_avx2(&x1[55], &x1[52]); + btf_32_add_sub_avx2(&x1[54], &x1[53]); + btf_32_add_sub_avx2(&x1[56], &x1[59]); + btf_32_add_sub_avx2(&x1[57], &x1[58]); + btf_32_add_sub_avx2(&x1[63], &x1[60]); + btf_32_add_sub_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[33]); + btf_32_add_sub_avx2(&x1[35], &x1[34]); + btf_32_add_sub_avx2(&x1[36], &x1[37]); + btf_32_add_sub_avx2(&x1[39], &x1[38]); + btf_32_add_sub_avx2(&x1[40], &x1[41]); + btf_32_add_sub_avx2(&x1[43], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[45]); + btf_32_add_sub_avx2(&x1[47], &x1[46]); + btf_32_add_sub_avx2(&x1[48], &x1[49]); + btf_32_add_sub_avx2(&x1[51], &x1[50]); + btf_32_add_sub_avx2(&x1[52], &x1[53]); + btf_32_add_sub_avx2(&x1[55], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[57]); + btf_32_add_sub_avx2(&x1[59], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[61]); + btf_32_add_sub_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit); + 
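+  // Note: stage 10 finishes the odd-index rotations for the x1[32..63] half;
+  // the bit-reversed write-out in stage 11 then produces the final ordering.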
btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = 
pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[15]); + x1[2] = _mm256_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm256_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm256_subs_epi16(__zero, input[11]); + x1[8] = _mm256_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm256_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm256_subs_epi16(__zero, input[13]); + x1[14] = _mm256_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[2]); + btf_16_adds_subs_avx2(&x1[1], &x1[3]); + btf_16_adds_subs_avx2(&x1[4], &x1[6]); + btf_16_adds_subs_avx2(&x1[5], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[10]); + btf_16_adds_subs_avx2(&x1[9], &x1[11]); + btf_16_adds_subs_avx2(&x1[12], &x1[14]); + btf_16_adds_subs_avx2(&x1[13], &x1[15]); + + // stage 4 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[4]); + btf_16_adds_subs_avx2(&x1[1], &x1[5]); + btf_16_adds_subs_avx2(&x1[2], &x1[6]); + btf_16_adds_subs_avx2(&x1[3], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[12]); + btf_16_adds_subs_avx2(&x1[9], &x1[13]); + btf_16_adds_subs_avx2(&x1[10], &x1[14]); + btf_16_adds_subs_avx2(&x1[11], &x1[15]); + + // stage 6 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(&x1[0], &x1[8]); + btf_16_adds_subs_avx2(&x1[1], &x1[9]); + btf_16_adds_subs_avx2(&x1[2], &x1[10]); + btf_16_adds_subs_avx2(&x1[3], &x1[11]); + btf_16_adds_subs_avx2(&x1[4], &x1[12]); + btf_16_adds_subs_avx2(&x1[5], &x1[13]); + btf_16_adds_subs_avx2(&x1[6], &x1[14]); + btf_16_adds_subs_avx2(&x1[7], &x1[15]); + + // stage 8 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + 
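+  // Note: stage 8 applies the final ADST rotations; stage 9 below is purely a
+  // permutation, interleaving the odd-indexed x1[] entries (ascending) with
+  // the even-indexed ones (descending).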
btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); + + // stage 9 + output[0] = x1[1]; + output[1] = x1[14]; + output[2] = x1[3]; + output[3] = x1[12]; + output[4] = x1[5]; + output[5] = x1[10]; + output[6] = x1[7]; + output[7] = x1[8]; + output[8] = x1[9]; + output[9] = x1[6]; + output[10] = x1[11]; + output[11] = x1[4]; + output[12] = x1[13]; + output[13] = x1[2]; + output[14] = x1[15]; + output[15] = x1[0]; +} + +static INLINE void fidentity16x16_new_avx2(const __m256i *input, + __m256i *output, int8_t cos_bit) { + (void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one); + const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + output[i] = _mm256_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm256_slli_epi16(input[i], 2); + } +} + +static INLINE void store_output_32bit_w16(int32_t *const out, + const __m256i *const in1, + const __m256i *const in2, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out + stride * i), in1[i]); + _mm256_store_si256((__m256i *)(out + stride * i + 8), in2[i]); + } +} + +// Store 8 16 bit values. Sign extend the values. 
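+// (For the w16 variant below that means 16 values per row: each __m256i is
+// split into two 8-lane halves, sign-extended to 32 bits with
+// _mm256_cvtepi16_epi32, and stored.)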
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + int32_t *out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i]))); + _mm256_store_si256( + (__m256i *)(out + 8), + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1))); + out += stride; + } +} + +static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8); + const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + _mm256_store_si256((__m256i *)b, b_lo); + _mm256_store_si256((__m256i *)(b + 8), b_hi); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_avx2(in[i], out + i * stride); + } +} + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = { + fdct16x32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_avx2, // IDTX + fdct16x32_avx2, // V_DCT + fidentity16x32_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = { + fdct16x32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_avx2, // IDTX + fidentity16x32_avx2, // V_DCT + fdct16x32_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fadst16x16_new_avx2, // ADST_DCT + fdct16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fadst16x16_new_avx2, // FLIPADST_DCT + fdct16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fdct16x16_new_avx2, // V_DCT + fidentity16x16_new_avx2, // H_DCT + fadst16x16_new_avx2, // V_ADST + fidentity16x16_new_avx2, // H_ADST + fadst16x16_new_avx2, // V_FLIPADST + fidentity16x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fdct16x16_new_avx2, // ADST_DCT + fadst16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fdct16x16_new_avx2, // FLIPADST_DCT + fadst16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fidentity16x16_new_avx2, // V_DCT + fdct16x16_new_avx2, // H_DCT + fidentity16x16_new_avx2, // V_ADST + fadst16x16_new_avx2, // H_ADST + fidentity16x16_new_avx2, // V_FLIPADST + fadst16x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_sse2 
col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fadst8x8_new_sse2, // ADST_DCT + fdct8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fadst8x8_new_sse2, // FLIPADST_DCT + fdct8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct8x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst8x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst8x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fdct8x8_new_sse2, // ADST_DCT + fadst8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fdct8x8_new_sse2, // FLIPADST_DCT + fadst8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct8x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst8x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst8x8_new_sse2 // H_FLIPADST +}; + +static INLINE void load_buffer_and_round_shift(const int16_t *in, int stride, + __m128i *out, int bit) { + out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride)); + out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride)); + out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride)); + out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride)); + out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride)); + out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride)); + out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride)); + out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride)); + out[0] = _mm_slli_epi16(out[0], bit); + out[1] = _mm_slli_epi16(out[1], bit); + out[2] = _mm_slli_epi16(out[2], bit); + out[3] = _mm_slli_epi16(out[3], bit); + out[4] = _mm_slli_epi16(out[4], bit); + out[5] = _mm_slli_epi16(out[5], bit); + out[6] = _mm_slli_epi16(out[6], bit); + out[7] = _mm_slli_epi16(out[7], bit); +} + +static INLINE void load_buffer_and_flip_round_shift(const int16_t *in, + int stride, __m128i *out, + int bit) { + out[7] = load_16bit_to_16bit(in + 0 * stride); + out[6] = load_16bit_to_16bit(in + 1 * stride); + out[5] = load_16bit_to_16bit(in + 2 * stride); + out[4] = load_16bit_to_16bit(in + 3 * stride); + out[3] = load_16bit_to_16bit(in + 4 * stride); + out[2] = load_16bit_to_16bit(in + 5 * stride); + out[1] = load_16bit_to_16bit(in + 6 * stride); + out[0] = load_16bit_to_16bit(in + 7 * stride); + out[7] = _mm_slli_epi16(out[7], bit); + out[6] = _mm_slli_epi16(out[6], bit); + out[5] = _mm_slli_epi16(out[5], bit); + out[4] = _mm_slli_epi16(out[4], bit); + out[3] = _mm_slli_epi16(out[3], bit); + out[2] = _mm_slli_epi16(out[2], bit); + out[1] = _mm_slli_epi16(out[1], bit); + out[0] = _mm_slli_epi16(out[0], bit); +} + +#define TRANSPOSE_8X8_AVX2() \ + { \ + /* aa0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53*/ \ + /* aa1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57*/ \ + /* aa2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73*/ \ + /* aa3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77*/ \ + const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1); \ + const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1); \ + const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3); \ + const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3); \ + /* Unpack 32 bit elements resulting in: */ \ + /* bb0: 
00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71*/ \ + /* bb1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73*/ \ + /* bb2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75*/ \ + /* bb2: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77*/ \ + const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2); \ + const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2); \ + const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3); \ + const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3); \ + /* bb0: 00 10 20 30 40 50 60 70| 01 11 21 31 41 51 61 71*/ \ + /* bb1: 02 12 22 32 42 52 62 72| 03 13 23 33 43 53 63 73*/ \ + /* bb2: 04 14 24 34 44 54 64 74| 05 15 25 35 45 55 65 75*/ \ + /* bb2: 06 16 26 36 46 56 66 76| 07 17 27 37 47 57 67 77*/ \ + c0 = _mm256_permute4x64_epi64(bb0, 0xd8); \ + c1 = _mm256_permute4x64_epi64(bb1, 0xd8); \ + c2 = _mm256_permute4x64_epi64(bb2, 0xd8); \ + c3 = _mm256_permute4x64_epi64(bb3, 0xd8); \ + } + +static INLINE void transpose_round_shift_flip_8x8(__m128i *const in, + __m128i *const out, int bit) { + __m256i c0, c1, c2, c3; + bit = -bit; + const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1)); + const __m256i s04 = + _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1); + const __m256i s15 = + _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1); + const __m256i s26 = + _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1); + const __m256i s37 = + _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1); + + const __m256i a0 = _mm256_adds_epi16(s04, rounding); + const __m256i a1 = _mm256_adds_epi16(s15, rounding); + const __m256i a2 = _mm256_adds_epi16(s26, rounding); + const __m256i a3 = _mm256_adds_epi16(s37, rounding); + + // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47 + // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57 + // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67 + // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77 + const __m256i b0 = _mm256_srai_epi16(a0, bit); + const __m256i b1 = _mm256_srai_epi16(a1, bit); + const __m256i b2 = _mm256_srai_epi16(a2, bit); + const __m256i b3 = _mm256_srai_epi16(a3, bit); + + TRANSPOSE_8X8_AVX2() + + // Unpack 64 bit elements resulting in: + // out[7]: 00 10 20 30 40 50 60 70 + // out[6]: 01 11 21 31 41 51 61 71 + // out[5]: 02 12 22 32 42 52 62 72 + // out[4]: 03 13 23 33 43 53 63 73 + // out[3]: 04 14 24 34 44 54 64 74 + // out[2]: 05 15 25 35 45 55 65 75 + // out[1]: 06 16 26 36 46 56 66 76 + // out[0]: 07 17 27 37 47 57 67 77 + out[7] = _mm256_castsi256_si128(c0); + out[6] = _mm256_extractf128_si256(c0, 1); + out[5] = _mm256_castsi256_si128(c1); + out[4] = _mm256_extractf128_si256(c1, 1); + out[3] = _mm256_castsi256_si128(c2); + out[2] = _mm256_extractf128_si256(c2, 1); + out[1] = _mm256_castsi256_si128(c3); + out[0] = _mm256_extractf128_si256(c3, 1); +} + +static INLINE void transpose_round_shift_8x8(__m128i *const in, + __m128i *const out, int bit) { + __m256i c0, c1, c2, c3; + bit = -bit; + const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1)); + const __m256i s04 = + _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1); + const __m256i s15 = + _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1); + const __m256i s26 = + _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1); + const __m256i s37 = + _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1); + + const __m256i a0 = _mm256_adds_epi16(s04, rounding); + const __m256i a1 = _mm256_adds_epi16(s15, rounding); + const __m256i a2 = 
_mm256_adds_epi16(s26, rounding); + const __m256i a3 = _mm256_adds_epi16(s37, rounding); + + // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47 + // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57 + // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67 + // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77 + const __m256i b0 = _mm256_srai_epi16(a0, bit); + const __m256i b1 = _mm256_srai_epi16(a1, bit); + const __m256i b2 = _mm256_srai_epi16(a2, bit); + const __m256i b3 = _mm256_srai_epi16(a3, bit); + + TRANSPOSE_8X8_AVX2() + // Unpack 64 bit elements resulting in: + // out[7]: 00 10 20 30 40 50 60 70 + // out[6]: 01 11 21 31 41 51 61 71 + // out[5]: 02 12 22 32 42 52 62 72 + // out[4]: 03 13 23 33 43 53 63 73 + // out[3]: 04 14 24 34 44 54 64 74 + // out[2]: 05 15 25 35 45 55 65 75 + // out[1]: 06 16 26 36 46 56 66 76 + // out[0]: 07 17 27 37 47 57 67 77 + out[0] = _mm256_castsi256_si128(c0); + out[1] = _mm256_extractf128_si256(c0, 1); + out[2] = _mm256_castsi256_si128(c1); + out[3] = _mm256_extractf128_si256(c1, 1); + out[4] = _mm256_castsi256_si128(c2); + out[5] = _mm256_extractf128_si256(c2, 1); + out[6] = _mm256_castsi256_si128(c3); + out[7] = _mm256_extractf128_si256(c3, 1); +} + +static INLINE void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out + i * stride), + _mm256_cvtepi16_epi32(in[i])); + } +} + +static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // Condition to check shift bit is avoided while round shifting, by assuming + // that shift[0] will always be positive. + assert(shift[0] > 0); + if (ud_flip) + load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]); + else + load_buffer_and_round_shift(input, stride, buf0, shift[0]); + + col_txfm(buf0, buf0, cos_bit_col); + // Condition to check shift bit is avoided while round shifting, by assuming + // that shift[1] will always be negative. + assert(shift[1] < 0); + + if (lr_flip) { + transpose_round_shift_flip_8x8(buf0, buf1, shift[1]); + } else { + transpose_round_shift_8x8(buf0, buf1, shift[1]); + } + + buf = buf1; + row_txfm(buf, buf, cos_bit_row); + + // Round and shift operation is avoided here as the shift bit is assumed to be + // zero always. 
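+  // The three-entry shift array is handled as follows: shift[0] > 0 is
+  // applied as a plain left shift while loading, shift[1] < 0 is applied as a
+  // rounding right shift fused into the transpose above, and shift[2] is
+  // expected to be zero here, so the store below needs no further rounding.
+  // As a scalar sketch (illustration only, not part of the optimized path),
+  // the rounding right shift used for a negative shift value s is:
+  //   x = (x + (1 << (-s - 1))) >> -s;
+  // while a positive s is simply x <<= s.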
+ assert(shift[2] == 0); + store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8); +} + +static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X16; + __m256i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int32_t i = 0; + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width); +} + +static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_32X32; + __m256i buf0[32], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i); + transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i); + } + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width); + } +} + +static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = 
get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct64_new_avx2(bufA, bufA, cos_bit_row); + fdct64_new_avx2(bufB, bufB, cos_bit_row); + round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]); + round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]); + store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X32; + __m256i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1); + transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16); + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, + width); + } +} + +static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; 
i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + } + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width); +} + +static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct64_new_avx2(bufA, bufA, cos_bit_row); + fdct64_new_avx2(bufB, bufB, cos_bit_row); + round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); + round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); + + store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + 
for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[32]; + __m256i bufB[32]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct32_avx2(bufA, bufA, cos_bit_row); + fdct32_avx2(bufB, bufB, cos_bit_row); + round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); + round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); + + store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m256i buf0[64], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x16_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width); + } +} + +static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m256i buf0[64], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x16_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < height_div16; i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32); + } + // Zero out the bottom 16x32 area. 
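+  // A 64-point forward transform in AV1 only emits its 32 lowest-frequency
+  // outputs, so the row pass above fills just half of the 64 * 16 coefficient
+  // buffer; the remaining 16 * 32 int32 entries are cleared explicitly below.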
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0, + __m256i *in1, __m128i *out0, __m128i *out1, + __m128i *out2, __m128i *out3, + const __m256i *__rounding, int8_t *cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, *w0); + __m256i u1 = _mm256_madd_epi16(t1, *w0); + __m256i v0 = _mm256_madd_epi16(t0, *w1); + __m256i v1 = _mm256_madd_epi16(t1, *w1); + + __m256i a0 = _mm256_add_epi32(u0, *__rounding); + __m256i a1 = _mm256_add_epi32(u1, *__rounding); + __m256i b0 = _mm256_add_epi32(v0, *__rounding); + __m256i b1 = _mm256_add_epi32(v1, *__rounding); + + __m256i c0 = _mm256_srai_epi32(a0, *cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, *cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, *cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, *cos_bit); + + __m256i temp0 = _mm256_packs_epi32(c0, c1); + __m256i temp1 = _mm256_packs_epi32(d0, d1); + + *out0 = _mm256_castsi256_si128(temp0); + *out1 = _mm256_castsi256_si128(temp1); + *out2 = _mm256_extracti128_si256(temp0, 0x01); + *out3 = _mm256_extracti128_si256(temp1, 0x01); +} + +static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(input[0], input[7]); + x1[7] = _mm256_subs_epi16(input[0], input[7]); + x1[1] = _mm256_adds_epi16(input[1], input[6]); + x1[6] = _mm256_subs_epi16(input[1], input[6]); + x1[2] = _mm256_adds_epi16(input[2], input[5]); + x1[5] = _mm256_subs_epi16(input[2], input[5]); + x1[3] = _mm256_adds_epi16(input[3], input[4]); + x1[4] = _mm256_subs_epi16(input[3], input[4]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[3] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); + x2[2] = _mm256_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding, + cos_bit); + x2[5] = x1[5]; + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding, + cos_bit); + x3[0] = x2[0]; + x3[1] = x2[1]; + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding, + cos_bit); + x3[2] = x2[2]; + x3[3] = x2[3]; + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_subs_epi16(x2[7], x2[6]); + x3[7] = _mm256_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[7] = x3[7]; + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], 
__rounding, + cos_bit); + x4[5] = x3[5]; + x4[6] = x3[6]; + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m256i x1[8]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[7]); + x1[2] = _mm256_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm256_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm256_subs_epi16(__zero, input[5]); + + // stage 2 + __m256i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding, + cos_bit); + x2[2] = x1[2]; + x2[3] = x1[3]; + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding, + cos_bit); + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[2]); + x3[2] = _mm256_subs_epi16(x2[0], x2[2]); + x3[1] = _mm256_adds_epi16(x2[1], x2[3]); + x3[3] = _mm256_subs_epi16(x2[1], x2[3]); + x3[4] = _mm256_adds_epi16(x2[4], x2[6]); + x3[6] = _mm256_subs_epi16(x2[4], x2[6]); + x3[5] = _mm256_adds_epi16(x2[5], x2[7]); + x3[7] = _mm256_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[5] = x3[5]; + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding, + cos_bit); + x4[6] = x3[6]; + x4[7] = x3[7]; + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[4]); + x5[4] = _mm256_subs_epi16(x4[0], x4[4]); + x5[1] = _mm256_adds_epi16(x4[1], x4[5]); + x5[5] = _mm256_subs_epi16(x4[1], x4[5]); + x5[2] = _mm256_adds_epi16(x4[2], x4[6]); + x5[6] = _mm256_subs_epi16(x4[2], x4[6]); + x5[3] = _mm256_adds_epi16(x4[3], x4[7]); + x5[7] = _mm256_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m256i x6[8]; + btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding, + cos_bit); + x6[0] = x5[0]; + x6[1] = x5[1]; + btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding, + cos_bit); + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding, + cos_bit); + x6[4] = x5[4]; + x6[5] = x5[5]; + 
btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding, + cos_bit); + x6[6] = x5[6]; + x6[7] = x5[7]; + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + + output[0] = _mm256_adds_epi16(input[0], input[0]); + output[1] = _mm256_adds_epi16(input[1], input[1]); + output[2] = _mm256_adds_epi16(input[2], input[2]); + output[3] = _mm256_adds_epi16(input[3], input[3]); + output[4] = _mm256_adds_epi16(input[4], input[4]); + output[5] = _mm256_adds_epi16(input[5], input[5]); + output[6] = _mm256_adds_epi16(input[6], input[6]); + output[7] = _mm256_adds_epi16(input[7], input[7]); +} + +static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i temp0, temp1, temp2, temp3; + __m256i in0, in1; + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + __m256i cospi_arr[12]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32), + cospi_m32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p48_p16, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_m16_p48, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48), + cospi_m48_m16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16), + cospi_m16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08), + cospi_p24_p40, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56), + cospi_m40_p24, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04), + cospi_p28_p36, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60), + cospi_m36_p28, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20), + cospi_p12_p52, 0x1); + cospi_arr[11] = 
_mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44), + cospi_m52_p12, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1); + x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14], + 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1); + x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11], + 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(x[0], x[1]); + x1[7] = _mm256_subs_epi16(x[0], x[1]); + x1[1] = _mm256_adds_epi16(x[2], x[3]); + x1[6] = _mm256_subs_epi16(x[2], x[3]); + x1[2] = _mm256_adds_epi16(x[4], x[5]); + x1[5] = _mm256_subs_epi16(x[4], x[5]); + x1[3] = _mm256_adds_epi16(x[6], x[7]); + x1[4] = _mm256_subs_epi16(x[6], x[7]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[7] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); + x2[6] = _mm256_subs_epi16(x1[1], x1[2]); + x2[2] = x1[4]; + x2[3] = x1[7]; + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 3 + __m256i x3[8]; + x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e); + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]), + _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1); + x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1); + x3[3] = _mm256_adds_epi16(x2[2], x2[4]); + x3[4] = _mm256_subs_epi16(x2[2], x2[4]); + x3[5] = _mm256_adds_epi16(x2[3], x2[5]); + x3[6] = _mm256_subs_epi16(x2[3], x2[5]); + + // stage 4 + __m256i x4[8]; + x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0); + x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21); + btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0], + &output[8], &output[4], &output[12], &__rounding_256, &cos_bit); + x4[2] = _mm256_adds_epi16(x3[2], x3[7]); + x4[3] = _mm256_subs_epi16(x3[2], x3[7]); + x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20); + x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20); + in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31); + in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 5 + __m256i x5[4]; + in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31); + in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14], + &output[10], &output[6], &__rounding_256, &cos_bit); + x5[0] = _mm256_adds_epi16(x4[4], x4[6]); + x5[1] = _mm256_subs_epi16(x4[4], x4[6]); + x5[2] = _mm256_adds_epi16(x4[5], x4[7]); + x5[3] = 
_mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20); + in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15], + &output[9], &output[7], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31); + in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20); + btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5], + &output[11], &output[13], &output[3], &__rounding_256, &cos_bit); +} + +static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i in0, in1; + __m128i temp0, temp1, temp2, temp3; + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + __m256i cospi_arr[20]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + 
cospi_p40_p24, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08), + cospi_p24_m40, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08), + cospi_m24_p40, 0x1); + cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + cospi_p40_p24, 0x1); + cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62), + cospi_p10_p54, 0x1); + cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02), + cospi_p54_m10, 0x1); + cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46), + cospi_p26_p38, 0x1); + cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18), + cospi_p38_m26, 0x1); + cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30), + cospi_p42_p22, 0x1); + cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34), + cospi_p22_m42, 0x1); + cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14), + cospi_p58_p06, 0x1); + cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50), + cospi_p06_m58, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1); + x[1] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1); + x[5] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = x[0]; + x1[1] = _mm256_subs_epi16(__zero, x[7]); + x1[2] = x[2]; + x1[3] = _mm256_subs_epi16(__zero, x[5]); + x1[4] = _mm256_subs_epi16(__zero, x[4]); + x1[5] = x[3]; + x1[6] = _mm256_subs_epi16(__zero, x[6]); + x1[7] = x[1]; + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0); + x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0); + x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0); + x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0); + in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0); + in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0); + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21); + in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21); + btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_adds_epi16(x2[3], x2[2]); + x3[3] = _mm256_subs_epi16(x2[3], x2[2]); + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_adds_epi16(x2[7], x2[6]); + x3[7] = _mm256_subs_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] 
= x3[1]; + x4[4] = x3[4]; + x4[5] = x3[5]; + in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20); + in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20); + in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[2]); + x5[1] = _mm256_subs_epi16(x4[0], x4[2]); + x5[2] = _mm256_adds_epi16(x4[1], x4[3]); + x5[3] = _mm256_subs_epi16(x4[1], x4[3]); + x5[4] = _mm256_adds_epi16(x4[4], x4[6]); + x5[5] = _mm256_subs_epi16(x4[4], x4[6]); + x5[6] = _mm256_adds_epi16(x4[5], x4[7]); + x5[7] = _mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + __m256i x6[8]; + x6[0] = x5[0]; + x6[1] = x5[2]; + x6[2] = x5[1]; + x6[3] = x5[3]; + in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20); + in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20); + in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31); + btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 7 + __m256i x7[8]; + x7[0] = _mm256_adds_epi16(x6[0], x6[4]); + x7[1] = _mm256_subs_epi16(x6[0], x6[4]); + x7[2] = _mm256_adds_epi16(x6[1], x6[5]); + x7[3] = _mm256_subs_epi16(x6[1], x6[5]); + x7[4] = _mm256_adds_epi16(x6[2], x6[6]); + x7[5] = _mm256_subs_epi16(x6[2], x6[6]); + x7[6] = _mm256_adds_epi16(x6[3], x6[7]); + x7[7] = _mm256_subs_epi16(x6[3], x6[7]); + + // stage 8 + in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20); + in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31); + btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15], + &output[0], &output[13], &output[2], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20); + in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31); + btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11], + &output[4], &output[9], &output[6], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20); + in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31); + btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7], + &output[8], &output[5], &output[10], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20); + in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31); + btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3], + &output[12], &output[1], &output[14], &__rounding_256, &cos_bit); +} + +static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + 
(void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + __m256i temp; + for (int i = 0; i < 16; i += 2) { + temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]), + input[i + 1], 0x1); + const __m256i a_lo = _mm256_unpacklo_epi16(temp, one); + const __m256i a_hi = _mm256_unpackhi_epi16(temp, one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + temp = _mm256_packs_epi32(b_lo, b_hi); + output[i] = _mm256_castsi256_si128(temp); + output[i + 1] = _mm256_extractf128_si256(temp, 0x1); + } +} + +static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fdct8x8_new_avx2, // ADST_DCT + fadst8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fdct8x8_new_avx2, // FLIPADST_DCT + fadst8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fidentity8x8_new_avx2, // V_DCT + fdct8x8_new_avx2, // H_DCT + fidentity8x8_new_avx2, // V_ADST + fadst8x8_new_avx2, // H_ADST + fidentity8x8_new_avx2, // V_FLIPADST + fadst8x8_new_avx2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fadst8x16_new_avx2, // ADST_DCT + fdct8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fadst8x16_new_avx2, // FLIPADST_DCT + fdct8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fdct8x16_new_avx2, // V_DCT + fidentity8x16_new_avx2, // H_DCT + fadst8x16_new_avx2, // V_ADST + fidentity8x16_new_avx2, // H_ADST + fadst8x16_new_avx2, // V_FLIPADST + fidentity8x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fadst8x8_new_avx2, // ADST_DCT + fdct8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fadst8x8_new_avx2, // FLIPADST_DCT + fdct8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fdct8x8_new_avx2, // V_DCT + fidentity8x8_new_avx2, // H_DCT + fadst8x8_new_avx2, // V_ADST + fidentity8x8_new_avx2, // H_ADST + fadst8x8_new_avx2, // V_FLIPADST + fidentity8x8_new_avx2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fdct8x16_new_avx2, // ADST_DCT + fadst8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fdct8x16_new_avx2, // FLIPADST_DCT + fadst8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fidentity8x16_new_avx2, // V_DCT + fdct8x16_new_avx2, // H_DCT + fidentity8x16_new_avx2, // V_ADST + fadst8x16_new_avx2, // H_ADST + fidentity8x16_new_avx2, // V_FLIPADST + fadst8x16_new_avx2 // H_FLIPADST +}; + +static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = 
av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + __m128i *bufl, *bufu; + if (lr_flip) { + bufl = buf0; + bufu = buf0 + 8; + flip_buf_sse2(buf1 + width * 0, bufl, width); + flip_buf_sse2(buf1 + width * 1, bufu, width); + } else { + bufl = buf1 + width * 0; + bufu = buf1 + width * 1; + } + pack_reg(bufl, bufu, buf2); + row_txfm(buf2, buf2, cos_bit_row); + round_shift_16bit_w16_avx2(buf2, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width); +} + +static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height); + } else { + load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height); + } + pack_reg(buf0, &buf0[8], buf2); + round_shift_16bit_w16_avx2(buf2, height, shift[0]); + col_txfm(buf2, buf2, cos_bit_col); + round_shift_16bit_w16_avx2(buf2, height, shift[1]); + transpose_16bit_16x8_avx2(buf2, buf2); + extract_reg(buf2, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width); +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_avx2, // 8x8 transform + lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform + lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform + lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform + lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform + lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform + lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + 
lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform + lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c new file mode 100644 index 0000000000..825da8d7b4 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +static INLINE void int16_array_with_stride_to_int32_array_without_stride( + const int16_t *input, int stride, int32_t *output, int txfm1d_size) { + int r, c; + for (r = 0; r < txfm1d_size; r++) { + for (c = 0; c < txfm1d_size; c++) { + output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; + } + } +} + +static INLINE void store_output_32bit_w8(int32_t *const out, + const __m128i *const in1, + const __m128i *const in2, + const int stride, const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm_store_si128((__m128i *)(out + stride * i), in1[i]); + _mm_store_si128((__m128i *)(out + stride * i + 4), in2[i]); + } +} + +typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit, + const int8_t *stage_range) { + const int txfm_size = 32; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + av1_fdct32_sse4_1((input + col), (output + col), cos_bit, col_num); + } +} + +static void fdct64_new_sse4_1(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 64; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + (void)stage_range; + for (int col = 0; col < col_num; col++) { + av1_fdct64_sse4_1((input + col), (output + col), cos_bit, col_num, col_num); + } +} +static void idtx32x32_sse4_1(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + (void)stage_range; + + for (int i = 0; i < 8; i++) { + av1_idtx32_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1); + } +} + +static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT32: return fdct32_sse4_1; + case 
TXFM_TYPE_DCT64: return fdct64_new_sse4_1; + case TXFM_TYPE_IDENTITY32: return idtx32x32_sse4_1; + default: assert(0); + } + return NULL; +} + +static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, + const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + // TODO(sarahparker) This does not currently support rectangular transforms + // and will break without splitting txfm_size out into row and col size. + // Rectangular transforms use c code only, so it should be ok for now. + // It will be corrected when there are sse implementations for rectangular + // transforms. + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t *stage_range_row = cfg->stage_range_row; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, + txfm_size); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); + av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]); +} + +static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input, + int32_t *output, const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + + const int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + int col_num = txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, output, + txfm_size); + /*col wise transform*/ + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + + /*row wise transform*/ + for (int col = 0; col < (col_num >> 1); col++) { + av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num, + (col_num >> 1)); + } + + txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); + av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]); +} + +void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + (void)bd; + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE 
tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + (void)bd; + fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m128i buf0[64], buf1[512]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m128i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); + + 
store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m128i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[32]; + __m128i bufB[32]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1); + av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); + + store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32); + } +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform + lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h new file mode 100644 index 0000000000..aaad76e5ae --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018, 
Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i ww0 = _mm256_set1_epi32(w0);
+  const __m256i ww1 = _mm256_set1_epi32(w1);
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i ww0 = _mm256_set1_epi32(w0);
+  const __m256i ww1 = _mm256_set1_epi32(w1);
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 =
_mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c new file mode 100644 index 0000000000..a4def754b0 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c @@ -0,0 +1,2673 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible). + +static void fdct4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i u[4], v[4]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[3], input[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0 + u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2 + u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1 + u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3 + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[3], __rounding); + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[1]); + output[1] = _mm_packs_epi32(u[2], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fdct8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + + // stage 1 + __m128i x1[4]; + x1[0] = _mm_adds_epi16(input[0], input[3]); + x1[3] = _mm_subs_epi16(input[0], input[3]); + x1[1] = _mm_adds_epi16(input[1], input[2]); + x1[2] = _mm_subs_epi16(input[1], input[2]); + + // stage 2 + __m128i x2[4]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]); + + // 
stage 3 + output[0] = x2[0]; + output[1] = x2[2]; + output[2] = x2[1]; + output[3] = x2[3]; +} + +static void fdct4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5], + &x1[6], &x2[5], &x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0], + &x2[1], &x3[0], &x3[1]); + btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2], + &x2[3], &x3[2], &x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4], + &x3[7], &x4[4], &x4[7]); + btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5], + &x3[6], &x4[5], &x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + 
__m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m128i x1[16]; + x1[0] = _mm_adds_epi16(input[0], input[15]); + x1[15] = _mm_subs_epi16(input[0], input[15]); + x1[1] = _mm_adds_epi16(input[1], input[14]); + x1[14] = _mm_subs_epi16(input[1], input[14]); + x1[2] = _mm_adds_epi16(input[2], input[13]); + x1[13] = _mm_subs_epi16(input[2], input[13]); + x1[3] = _mm_adds_epi16(input[3], input[12]); + x1[12] = _mm_subs_epi16(input[3], input[12]); + x1[4] = _mm_adds_epi16(input[4], input[11]); + x1[11] = _mm_subs_epi16(input[4], input[11]); + x1[5] = _mm_adds_epi16(input[5], input[10]); + x1[10] = _mm_subs_epi16(input[5], input[10]); + x1[6] = _mm_adds_epi16(input[6], input[9]); + x1[9] = _mm_subs_epi16(input[6], input[9]); + x1[7] = _mm_adds_epi16(input[7], input[8]); + x1[8] = _mm_subs_epi16(input[7], input[8]); + + // stage 2 + __m128i x2[16]; + x2[0] = _mm_adds_epi16(x1[0], x1[7]); + x2[7] = _mm_subs_epi16(x1[0], x1[7]); + x2[1] = _mm_adds_epi16(x1[1], x1[6]); + x2[6] = _mm_subs_epi16(x1[1], x1[6]); + x2[2] = _mm_adds_epi16(x1[2], x1[5]); + x2[5] = _mm_subs_epi16(x1[2], x1[5]); + x2[3] = _mm_adds_epi16(x1[3], x1[4]); + x2[4] = _mm_subs_epi16(x1[3], x1[4]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]); + x2[14] = x1[14]; + x2[15] = x1[15]; + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[3]); + x3[3] = _mm_subs_epi16(x2[0], x2[3]); + x3[1] = _mm_adds_epi16(x2[1], x2[2]); + x3[2] = _mm_subs_epi16(x2[1], x2[2]); + x3[4] = x2[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]); + x3[7] = x2[7]; + x3[8] = _mm_adds_epi16(x2[8], x2[11]); + x3[11] = _mm_subs_epi16(x2[8], x2[11]); + x3[9] = _mm_adds_epi16(x2[9], x2[10]); + x3[10] = _mm_subs_epi16(x2[9], x2[10]); + x3[12] = _mm_subs_epi16(x2[15], x2[12]); + x3[15] = _mm_adds_epi16(x2[15], x2[12]); + x3[13] = _mm_subs_epi16(x2[14], x2[13]); + x3[14] = _mm_adds_epi16(x2[14], x2[13]); + + // stage 4 + __m128i x4[16]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]); + x4[4] = _mm_adds_epi16(x3[4], x3[5]); + x4[5] = _mm_subs_epi16(x3[4], x3[5]); + x4[6] = _mm_subs_epi16(x3[7], x3[6]); + x4[7] = _mm_adds_epi16(x3[7], x3[6]); + x4[8] = x3[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]); + x4[11] = x3[11]; + x4[12] = x3[12]; + x4[15] = x3[15]; + + // stage 5 + __m128i x5[16]; + x5[0] = x4[0]; + x5[1] = x4[1]; + x5[2] = x4[2]; + x5[3] = x4[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]); + x5[8] = _mm_adds_epi16(x4[8], x4[9]); + x5[9] = _mm_subs_epi16(x4[8], x4[9]); + x5[10] = _mm_subs_epi16(x4[11], x4[10]); + x5[11] = _mm_adds_epi16(x4[11], x4[10]); + x5[12] = _mm_adds_epi16(x4[12], x4[13]); + x5[13] = _mm_subs_epi16(x4[12], x4[13]); + x5[14] = _mm_subs_epi16(x4[15], x4[14]); + x5[15] = _mm_adds_epi16(x4[15], x4[14]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = 
x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]); + + // stage 7 + output[0] = x6[0]; + output[1] = x6[8]; + output[2] = x6[4]; + output[3] = x6[12]; + output[4] = x6[2]; + output[5] = x6[10]; + output[6] = x6[6]; + output[7] = x6[14]; + output[8] = x6[1]; + output[9] = x6[9]; + output[10] = x6[5]; + output[11] = x6[13]; + output[12] = x6[3]; + output[13] = x6[11]; + output[14] = x6[7]; + output[15] = x6[15]; +} + +void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m128i x1[32]; + x1[0] = _mm_adds_epi16(input[0], input[31]); + x1[31] = _mm_subs_epi16(input[0], input[31]); + x1[1] = _mm_adds_epi16(input[1], input[30]); + x1[30] = _mm_subs_epi16(input[1], input[30]); + x1[2] = _mm_adds_epi16(input[2], input[29]); + 
x1[29] = _mm_subs_epi16(input[2], input[29]); + x1[3] = _mm_adds_epi16(input[3], input[28]); + x1[28] = _mm_subs_epi16(input[3], input[28]); + x1[4] = _mm_adds_epi16(input[4], input[27]); + x1[27] = _mm_subs_epi16(input[4], input[27]); + x1[5] = _mm_adds_epi16(input[5], input[26]); + x1[26] = _mm_subs_epi16(input[5], input[26]); + x1[6] = _mm_adds_epi16(input[6], input[25]); + x1[25] = _mm_subs_epi16(input[6], input[25]); + x1[7] = _mm_adds_epi16(input[7], input[24]); + x1[24] = _mm_subs_epi16(input[7], input[24]); + x1[8] = _mm_adds_epi16(input[8], input[23]); + x1[23] = _mm_subs_epi16(input[8], input[23]); + x1[9] = _mm_adds_epi16(input[9], input[22]); + x1[22] = _mm_subs_epi16(input[9], input[22]); + x1[10] = _mm_adds_epi16(input[10], input[21]); + x1[21] = _mm_subs_epi16(input[10], input[21]); + x1[11] = _mm_adds_epi16(input[11], input[20]); + x1[20] = _mm_subs_epi16(input[11], input[20]); + x1[12] = _mm_adds_epi16(input[12], input[19]); + x1[19] = _mm_subs_epi16(input[12], input[19]); + x1[13] = _mm_adds_epi16(input[13], input[18]); + x1[18] = _mm_subs_epi16(input[13], input[18]); + x1[14] = _mm_adds_epi16(input[14], input[17]); + x1[17] = _mm_subs_epi16(input[14], input[17]); + x1[15] = _mm_adds_epi16(input[15], input[16]); + x1[16] = _mm_subs_epi16(input[15], input[16]); + + // stage 2 + __m128i x2[32]; + x2[0] = _mm_adds_epi16(x1[0], x1[15]); + x2[15] = _mm_subs_epi16(x1[0], x1[15]); + x2[1] = _mm_adds_epi16(x1[1], x1[14]); + x2[14] = _mm_subs_epi16(x1[1], x1[14]); + x2[2] = _mm_adds_epi16(x1[2], x1[13]); + x2[13] = _mm_subs_epi16(x1[2], x1[13]); + x2[3] = _mm_adds_epi16(x1[3], x1[12]); + x2[12] = _mm_subs_epi16(x1[3], x1[12]); + x2[4] = _mm_adds_epi16(x1[4], x1[11]); + x2[11] = _mm_subs_epi16(x1[4], x1[11]); + x2[5] = _mm_adds_epi16(x1[5], x1[10]); + x2[10] = _mm_subs_epi16(x1[5], x1[10]); + x2[6] = _mm_adds_epi16(x1[6], x1[9]); + x2[9] = _mm_subs_epi16(x1[6], x1[9]); + x2[7] = _mm_adds_epi16(x1[7], x1[8]); + x2[8] = _mm_subs_epi16(x1[7], x1[8]); + x2[16] = x1[16]; + x2[17] = x1[17]; + x2[18] = x1[18]; + x2[19] = x1[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]); + x2[28] = x1[28]; + x2[29] = x1[29]; + x2[30] = x1[30]; + x2[31] = x1[31]; + + // stage 3 + __m128i x3[32]; + x3[0] = _mm_adds_epi16(x2[0], x2[7]); + x3[7] = _mm_subs_epi16(x2[0], x2[7]); + x3[1] = _mm_adds_epi16(x2[1], x2[6]); + x3[6] = _mm_subs_epi16(x2[1], x2[6]); + x3[2] = _mm_adds_epi16(x2[2], x2[5]); + x3[5] = _mm_subs_epi16(x2[2], x2[5]); + x3[3] = _mm_adds_epi16(x2[3], x2[4]); + x3[4] = _mm_subs_epi16(x2[3], x2[4]); + x3[8] = x2[8]; + x3[9] = x2[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]); + x3[14] = x2[14]; + x3[15] = x2[15]; + x3[16] = _mm_adds_epi16(x2[16], x2[23]); + x3[23] = _mm_subs_epi16(x2[16], x2[23]); + x3[17] = _mm_adds_epi16(x2[17], x2[22]); + x3[22] = _mm_subs_epi16(x2[17], x2[22]); + x3[18] = _mm_adds_epi16(x2[18], x2[21]); + x3[21] = _mm_subs_epi16(x2[18], x2[21]); + x3[19] = _mm_adds_epi16(x2[19], x2[20]); + x3[20] = _mm_subs_epi16(x2[19], x2[20]); + x3[24] = _mm_subs_epi16(x2[31], x2[24]); + x3[31] = _mm_adds_epi16(x2[31], x2[24]); + x3[25] = _mm_subs_epi16(x2[30], x2[25]); + x3[30] = _mm_adds_epi16(x2[30], x2[25]); + x3[26] 
= _mm_subs_epi16(x2[29], x2[26]); + x3[29] = _mm_adds_epi16(x2[29], x2[26]); + x3[27] = _mm_subs_epi16(x2[28], x2[27]); + x3[28] = _mm_adds_epi16(x2[28], x2[27]); + + // stage 4 + __m128i x4[32]; + x4[0] = _mm_adds_epi16(x3[0], x3[3]); + x4[3] = _mm_subs_epi16(x3[0], x3[3]); + x4[1] = _mm_adds_epi16(x3[1], x3[2]); + x4[2] = _mm_subs_epi16(x3[1], x3[2]); + x4[4] = x3[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]); + x4[7] = x3[7]; + x4[8] = _mm_adds_epi16(x3[8], x3[11]); + x4[11] = _mm_subs_epi16(x3[8], x3[11]); + x4[9] = _mm_adds_epi16(x3[9], x3[10]); + x4[10] = _mm_subs_epi16(x3[9], x3[10]); + x4[12] = _mm_subs_epi16(x3[15], x3[12]); + x4[15] = _mm_adds_epi16(x3[15], x3[12]); + x4[13] = _mm_subs_epi16(x3[14], x3[13]); + x4[14] = _mm_adds_epi16(x3[14], x3[13]); + x4[16] = x3[16]; + x4[17] = x3[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]); + x4[22] = x3[22]; + x4[23] = x3[23]; + x4[24] = x3[24]; + x4[25] = x3[25]; + x4[30] = x3[30]; + x4[31] = x3[31]; + + // stage 5 + __m128i x5[32]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]); + x5[4] = _mm_adds_epi16(x4[4], x4[5]); + x5[5] = _mm_subs_epi16(x4[4], x4[5]); + x5[6] = _mm_subs_epi16(x4[7], x4[6]); + x5[7] = _mm_adds_epi16(x4[7], x4[6]); + x5[8] = x4[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]); + x5[11] = x4[11]; + x5[12] = x4[12]; + x5[15] = x4[15]; + x5[16] = _mm_adds_epi16(x4[16], x4[19]); + x5[19] = _mm_subs_epi16(x4[16], x4[19]); + x5[17] = _mm_adds_epi16(x4[17], x4[18]); + x5[18] = _mm_subs_epi16(x4[17], x4[18]); + x5[20] = _mm_subs_epi16(x4[23], x4[20]); + x5[23] = _mm_adds_epi16(x4[23], x4[20]); + x5[21] = _mm_subs_epi16(x4[22], x4[21]); + x5[22] = _mm_adds_epi16(x4[22], x4[21]); + x5[24] = _mm_adds_epi16(x4[24], x4[27]); + x5[27] = _mm_subs_epi16(x4[24], x4[27]); + x5[25] = _mm_adds_epi16(x4[25], x4[26]); + x5[26] = _mm_subs_epi16(x4[25], x4[26]); + x5[28] = _mm_subs_epi16(x4[31], x4[28]); + x5[31] = _mm_adds_epi16(x4[31], x4[28]); + x5[29] = _mm_subs_epi16(x4[30], x4[29]); + x5[30] = _mm_adds_epi16(x4[30], x4[29]); + + // stage 6 + __m128i x6[32]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]); + x6[8] = _mm_adds_epi16(x5[8], x5[9]); + x6[9] = _mm_subs_epi16(x5[8], x5[9]); + x6[10] = _mm_subs_epi16(x5[11], x5[10]); + x6[11] = _mm_adds_epi16(x5[11], x5[10]); + x6[12] = _mm_adds_epi16(x5[12], x5[13]); + x6[13] = _mm_subs_epi16(x5[12], x5[13]); + x6[14] = _mm_subs_epi16(x5[15], x5[14]); + x6[15] = _mm_adds_epi16(x5[15], x5[14]); + x6[16] = x5[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]); + x6[19] = x5[19]; + x6[20] = x5[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]); + x6[23] = x5[23]; + x6[24] = x5[24]; + x6[27] = x5[27]; + x6[28] = x5[28]; + x6[31] = 
x5[31]; + + // stage 7 + __m128i x7[32]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + x7[4] = x6[4]; + x7[5] = x6[5]; + x7[6] = x6[6]; + x7[7] = x6[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]); + x7[16] = _mm_adds_epi16(x6[16], x6[17]); + x7[17] = _mm_subs_epi16(x6[16], x6[17]); + x7[18] = _mm_subs_epi16(x6[19], x6[18]); + x7[19] = _mm_adds_epi16(x6[19], x6[18]); + x7[20] = _mm_adds_epi16(x6[20], x6[21]); + x7[21] = _mm_subs_epi16(x6[20], x6[21]); + x7[22] = _mm_subs_epi16(x6[23], x6[22]); + x7[23] = _mm_adds_epi16(x6[23], x6[22]); + x7[24] = _mm_adds_epi16(x6[24], x6[25]); + x7[25] = _mm_subs_epi16(x6[24], x6[25]); + x7[26] = _mm_subs_epi16(x6[27], x6[26]); + x7[27] = _mm_adds_epi16(x6[27], x6[26]); + x7[28] = _mm_adds_epi16(x6[28], x6[29]); + x7[29] = _mm_subs_epi16(x6[28], x6[29]); + x7[30] = _mm_subs_epi16(x6[31], x6[30]); + x7[31] = _mm_adds_epi16(x6[31], x6[30]); + + // stage 8 + __m128i x8[32]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + x8[8] = x7[8]; + x8[9] = x7[9]; + x8[10] = x7[10]; + x8[11] = x7[11]; + x8[12] = x7[12]; + x8[13] = x7[13]; + x8[14] = x7[14]; + x8[15] = x7[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]); + + // stage 9 + output[0] = x8[0]; + output[1] = x8[16]; + output[2] = x8[8]; + output[3] = x8[24]; + output[4] = x8[4]; + output[5] = x8[20]; + output[6] = x8[12]; + output[7] = x8[28]; + output[8] = x8[2]; + output[9] = x8[18]; + output[10] = x8[10]; + output[11] = x8[26]; + output[12] = x8[6]; + output[13] = x8[22]; + output[14] = x8[14]; + output[15] = x8[30]; + output[16] = x8[1]; + output[17] = x8[17]; + output[18] = x8[9]; + output[19] = x8[25]; + output[20] = x8[5]; + output[21] = x8[21]; + output[22] = x8[13]; + output[23] = x8[29]; + output[24] = x8[3]; + output[25] = x8[19]; + output[26] = x8[11]; + output[27] = x8[27]; + output[28] = x8[7]; + output[29] = x8[23]; + output[30] = x8[15]; + output[31] = x8[31]; +} + +void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + 
__m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]); + __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]); + __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]); + __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]); + __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]); + __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]); + __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]); + __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]); + __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]); + __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]); + __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]); + __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]); + __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]); + __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]); + __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]); + __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]); + __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]); + __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]); + __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]); + __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]); + __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]); + __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]); + __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]); + __m128i 
cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]); + __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]); + __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]); + __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]); + __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]); + __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]); + __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]); + __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]); + __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_adds_epi16(input[0], input[63]); + x1[63] = _mm_subs_epi16(input[0], input[63]); + x1[1] = _mm_adds_epi16(input[1], input[62]); + x1[62] = _mm_subs_epi16(input[1], input[62]); + x1[2] = _mm_adds_epi16(input[2], input[61]); + x1[61] = _mm_subs_epi16(input[2], input[61]); + x1[3] = _mm_adds_epi16(input[3], input[60]); + x1[60] = _mm_subs_epi16(input[3], input[60]); + x1[4] = _mm_adds_epi16(input[4], input[59]); + x1[59] = _mm_subs_epi16(input[4], input[59]); + x1[5] = _mm_adds_epi16(input[5], input[58]); + x1[58] = _mm_subs_epi16(input[5], input[58]); + x1[6] = _mm_adds_epi16(input[6], input[57]); + x1[57] = _mm_subs_epi16(input[6], input[57]); + x1[7] = _mm_adds_epi16(input[7], input[56]); + x1[56] = _mm_subs_epi16(input[7], input[56]); + x1[8] = _mm_adds_epi16(input[8], input[55]); + x1[55] = _mm_subs_epi16(input[8], input[55]); + x1[9] = _mm_adds_epi16(input[9], input[54]); + x1[54] = _mm_subs_epi16(input[9], input[54]); + x1[10] = _mm_adds_epi16(input[10], input[53]); + x1[53] = _mm_subs_epi16(input[10], input[53]); + x1[11] = _mm_adds_epi16(input[11], input[52]); + x1[52] = _mm_subs_epi16(input[11], input[52]); + x1[12] = _mm_adds_epi16(input[12], input[51]); + x1[51] = _mm_subs_epi16(input[12], input[51]); + x1[13] = _mm_adds_epi16(input[13], input[50]); + x1[50] = _mm_subs_epi16(input[13], input[50]); + x1[14] = _mm_adds_epi16(input[14], input[49]); + x1[49] = _mm_subs_epi16(input[14], input[49]); + x1[15] = _mm_adds_epi16(input[15], input[48]); + x1[48] = _mm_subs_epi16(input[15], input[48]); + x1[16] = _mm_adds_epi16(input[16], input[47]); + x1[47] = _mm_subs_epi16(input[16], input[47]); + x1[17] = _mm_adds_epi16(input[17], input[46]); + x1[46] = _mm_subs_epi16(input[17], input[46]); + x1[18] = _mm_adds_epi16(input[18], input[45]); + x1[45] = _mm_subs_epi16(input[18], input[45]); + x1[19] = _mm_adds_epi16(input[19], input[44]); + x1[44] = _mm_subs_epi16(input[19], input[44]); + x1[20] = _mm_adds_epi16(input[20], input[43]); + x1[43] = _mm_subs_epi16(input[20], input[43]); + x1[21] = _mm_adds_epi16(input[21], input[42]); + x1[42] = _mm_subs_epi16(input[21], input[42]); + x1[22] = _mm_adds_epi16(input[22], input[41]); + x1[41] = _mm_subs_epi16(input[22], input[41]); + x1[23] = _mm_adds_epi16(input[23], input[40]); + x1[40] = _mm_subs_epi16(input[23], input[40]); + x1[24] = _mm_adds_epi16(input[24], input[39]); + x1[39] = _mm_subs_epi16(input[24], input[39]); + x1[25] = _mm_adds_epi16(input[25], input[38]); + x1[38] = _mm_subs_epi16(input[25], input[38]); + x1[26] = _mm_adds_epi16(input[26], input[37]); + x1[37] = _mm_subs_epi16(input[26], input[37]); + x1[27] = _mm_adds_epi16(input[27], input[36]); + x1[36] = _mm_subs_epi16(input[27], input[36]); + x1[28] = _mm_adds_epi16(input[28], input[35]); + x1[35] = _mm_subs_epi16(input[28], input[35]); + x1[29] = _mm_adds_epi16(input[29], input[34]); + x1[34] = _mm_subs_epi16(input[29], input[34]); + x1[30] = _mm_adds_epi16(input[30], 
input[33]); + x1[33] = _mm_subs_epi16(input[30], input[33]); + x1[31] = _mm_adds_epi16(input[31], input[32]); + x1[32] = _mm_subs_epi16(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_adds_epi16(x1[0], x1[31]); + x2[31] = _mm_subs_epi16(x1[0], x1[31]); + x2[1] = _mm_adds_epi16(x1[1], x1[30]); + x2[30] = _mm_subs_epi16(x1[1], x1[30]); + x2[2] = _mm_adds_epi16(x1[2], x1[29]); + x2[29] = _mm_subs_epi16(x1[2], x1[29]); + x2[3] = _mm_adds_epi16(x1[3], x1[28]); + x2[28] = _mm_subs_epi16(x1[3], x1[28]); + x2[4] = _mm_adds_epi16(x1[4], x1[27]); + x2[27] = _mm_subs_epi16(x1[4], x1[27]); + x2[5] = _mm_adds_epi16(x1[5], x1[26]); + x2[26] = _mm_subs_epi16(x1[5], x1[26]); + x2[6] = _mm_adds_epi16(x1[6], x1[25]); + x2[25] = _mm_subs_epi16(x1[6], x1[25]); + x2[7] = _mm_adds_epi16(x1[7], x1[24]); + x2[24] = _mm_subs_epi16(x1[7], x1[24]); + x2[8] = _mm_adds_epi16(x1[8], x1[23]); + x2[23] = _mm_subs_epi16(x1[8], x1[23]); + x2[9] = _mm_adds_epi16(x1[9], x1[22]); + x2[22] = _mm_subs_epi16(x1[9], x1[22]); + x2[10] = _mm_adds_epi16(x1[10], x1[21]); + x2[21] = _mm_subs_epi16(x1[10], x1[21]); + x2[11] = _mm_adds_epi16(x1[11], x1[20]); + x2[20] = _mm_subs_epi16(x1[11], x1[20]); + x2[12] = _mm_adds_epi16(x1[12], x1[19]); + x2[19] = _mm_subs_epi16(x1[12], x1[19]); + x2[13] = _mm_adds_epi16(x1[13], x1[18]); + x2[18] = _mm_subs_epi16(x1[13], x1[18]); + x2[14] = _mm_adds_epi16(x1[14], x1[17]); + x2[17] = _mm_subs_epi16(x1[14], x1[17]); + x2[15] = _mm_adds_epi16(x1[15], x1[16]); + x2[16] = _mm_subs_epi16(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_adds_epi16(x2[0], x2[15]); + x3[15] = _mm_subs_epi16(x2[0], x2[15]); + x3[1] = _mm_adds_epi16(x2[1], x2[14]); + x3[14] = _mm_subs_epi16(x2[1], x2[14]); + x3[2] = _mm_adds_epi16(x2[2], x2[13]); + x3[13] = _mm_subs_epi16(x2[2], x2[13]); + x3[3] = _mm_adds_epi16(x2[3], x2[12]); + x3[12] = _mm_subs_epi16(x2[3], x2[12]); + x3[4] = _mm_adds_epi16(x2[4], x2[11]); + x3[11] = _mm_subs_epi16(x2[4], x2[11]); + x3[5] = _mm_adds_epi16(x2[5], x2[10]); + x3[10] = _mm_subs_epi16(x2[5], x2[10]); + x3[6] = _mm_adds_epi16(x2[6], x2[9]); + x3[9] = _mm_subs_epi16(x2[6], x2[9]); + x3[7] = _mm_adds_epi16(x2[7], x2[8]); + x3[8] = _mm_subs_epi16(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]); + x3[28] = x2[28]; + 
x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_adds_epi16(x2[32], x2[47]); + x3[47] = _mm_subs_epi16(x2[32], x2[47]); + x3[33] = _mm_adds_epi16(x2[33], x2[46]); + x3[46] = _mm_subs_epi16(x2[33], x2[46]); + x3[34] = _mm_adds_epi16(x2[34], x2[45]); + x3[45] = _mm_subs_epi16(x2[34], x2[45]); + x3[35] = _mm_adds_epi16(x2[35], x2[44]); + x3[44] = _mm_subs_epi16(x2[35], x2[44]); + x3[36] = _mm_adds_epi16(x2[36], x2[43]); + x3[43] = _mm_subs_epi16(x2[36], x2[43]); + x3[37] = _mm_adds_epi16(x2[37], x2[42]); + x3[42] = _mm_subs_epi16(x2[37], x2[42]); + x3[38] = _mm_adds_epi16(x2[38], x2[41]); + x3[41] = _mm_subs_epi16(x2[38], x2[41]); + x3[39] = _mm_adds_epi16(x2[39], x2[40]); + x3[40] = _mm_subs_epi16(x2[39], x2[40]); + x3[48] = _mm_subs_epi16(x2[63], x2[48]); + x3[63] = _mm_adds_epi16(x2[63], x2[48]); + x3[49] = _mm_subs_epi16(x2[62], x2[49]); + x3[62] = _mm_adds_epi16(x2[62], x2[49]); + x3[50] = _mm_subs_epi16(x2[61], x2[50]); + x3[61] = _mm_adds_epi16(x2[61], x2[50]); + x3[51] = _mm_subs_epi16(x2[60], x2[51]); + x3[60] = _mm_adds_epi16(x2[60], x2[51]); + x3[52] = _mm_subs_epi16(x2[59], x2[52]); + x3[59] = _mm_adds_epi16(x2[59], x2[52]); + x3[53] = _mm_subs_epi16(x2[58], x2[53]); + x3[58] = _mm_adds_epi16(x2[58], x2[53]); + x3[54] = _mm_subs_epi16(x2[57], x2[54]); + x3[57] = _mm_adds_epi16(x2[57], x2[54]); + x3[55] = _mm_subs_epi16(x2[56], x2[55]); + x3[56] = _mm_adds_epi16(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_adds_epi16(x3[0], x3[7]); + x4[7] = _mm_subs_epi16(x3[0], x3[7]); + x4[1] = _mm_adds_epi16(x3[1], x3[6]); + x4[6] = _mm_subs_epi16(x3[1], x3[6]); + x4[2] = _mm_adds_epi16(x3[2], x3[5]); + x4[5] = _mm_subs_epi16(x3[2], x3[5]); + x4[3] = _mm_adds_epi16(x3[3], x3[4]); + x4[4] = _mm_subs_epi16(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_adds_epi16(x3[16], x3[23]); + x4[23] = _mm_subs_epi16(x3[16], x3[23]); + x4[17] = _mm_adds_epi16(x3[17], x3[22]); + x4[22] = _mm_subs_epi16(x3[17], x3[22]); + x4[18] = _mm_adds_epi16(x3[18], x3[21]); + x4[21] = _mm_subs_epi16(x3[18], x3[21]); + x4[19] = _mm_adds_epi16(x3[19], x3[20]); + x4[20] = _mm_subs_epi16(x3[19], x3[20]); + x4[24] = _mm_subs_epi16(x3[31], x3[24]); + x4[31] = _mm_adds_epi16(x3[31], x3[24]); + x4[25] = _mm_subs_epi16(x3[30], x3[25]); + x4[30] = _mm_adds_epi16(x3[30], x3[25]); + x4[26] = _mm_subs_epi16(x3[29], x3[26]); + x4[29] = _mm_adds_epi16(x3[29], x3[26]); + x4[27] = _mm_subs_epi16(x3[28], x3[27]); + x4[28] = _mm_adds_epi16(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = 
x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_adds_epi16(x4[0], x4[3]); + x5[3] = _mm_subs_epi16(x4[0], x4[3]); + x5[1] = _mm_adds_epi16(x4[1], x4[2]); + x5[2] = _mm_subs_epi16(x4[1], x4[2]); + x5[4] = x4[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]); + x5[7] = x4[7]; + x5[8] = _mm_adds_epi16(x4[8], x4[11]); + x5[11] = _mm_subs_epi16(x4[8], x4[11]); + x5[9] = _mm_adds_epi16(x4[9], x4[10]); + x5[10] = _mm_subs_epi16(x4[9], x4[10]); + x5[12] = _mm_subs_epi16(x4[15], x4[12]); + x5[15] = _mm_adds_epi16(x4[15], x4[12]); + x5[13] = _mm_subs_epi16(x4[14], x4[13]); + x5[14] = _mm_adds_epi16(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_adds_epi16(x4[32], x4[39]); + x5[39] = _mm_subs_epi16(x4[32], x4[39]); + x5[33] = _mm_adds_epi16(x4[33], x4[38]); + x5[38] = _mm_subs_epi16(x4[33], x4[38]); + x5[34] = _mm_adds_epi16(x4[34], x4[37]); + x5[37] = _mm_subs_epi16(x4[34], x4[37]); + x5[35] = _mm_adds_epi16(x4[35], x4[36]); + x5[36] = _mm_subs_epi16(x4[35], x4[36]); + x5[40] = _mm_subs_epi16(x4[47], x4[40]); + x5[47] = _mm_adds_epi16(x4[47], x4[40]); + x5[41] = _mm_subs_epi16(x4[46], x4[41]); + x5[46] = _mm_adds_epi16(x4[46], x4[41]); + x5[42] = _mm_subs_epi16(x4[45], x4[42]); + x5[45] = _mm_adds_epi16(x4[45], x4[42]); + x5[43] = _mm_subs_epi16(x4[44], x4[43]); + x5[44] = _mm_adds_epi16(x4[44], x4[43]); + x5[48] = _mm_adds_epi16(x4[48], x4[55]); + x5[55] = _mm_subs_epi16(x4[48], x4[55]); + x5[49] = _mm_adds_epi16(x4[49], x4[54]); + x5[54] = _mm_subs_epi16(x4[49], x4[54]); + x5[50] = _mm_adds_epi16(x4[50], x4[53]); + x5[53] = _mm_subs_epi16(x4[50], x4[53]); + x5[51] = _mm_adds_epi16(x4[51], x4[52]); + x5[52] = _mm_subs_epi16(x4[51], x4[52]); + x5[56] = _mm_subs_epi16(x4[63], x4[56]); + x5[63] = _mm_adds_epi16(x4[63], x4[56]); + x5[57] = _mm_subs_epi16(x4[62], x4[57]); + x5[62] = _mm_adds_epi16(x4[62], x4[57]); + x5[58] = _mm_subs_epi16(x4[61], x4[58]); + x5[61] = _mm_adds_epi16(x4[61], x4[58]); + x5[59] = _mm_subs_epi16(x4[60], x4[59]); + x5[60] = _mm_adds_epi16(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]); + x6[4] = _mm_adds_epi16(x5[4], x5[5]); + x6[5] = _mm_subs_epi16(x5[4], x5[5]); + x6[6] = _mm_subs_epi16(x5[7], x5[6]); + x6[7] = _mm_adds_epi16(x5[7], x5[6]); + x6[8] = x5[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_adds_epi16(x5[16], x5[19]); + x6[19] = _mm_subs_epi16(x5[16], x5[19]); + x6[17] = _mm_adds_epi16(x5[17], x5[18]); + x6[18] = _mm_subs_epi16(x5[17], x5[18]); + x6[20] = _mm_subs_epi16(x5[23], x5[20]); + x6[23] = _mm_adds_epi16(x5[23], x5[20]); + x6[21] = _mm_subs_epi16(x5[22], x5[21]); + x6[22] = _mm_adds_epi16(x5[22], x5[21]); + x6[24] = _mm_adds_epi16(x5[24], x5[27]); + x6[27] = _mm_subs_epi16(x5[24], x5[27]); + x6[25] = 
_mm_adds_epi16(x5[25], x5[26]); + x6[26] = _mm_subs_epi16(x5[25], x5[26]); + x6[28] = _mm_subs_epi16(x5[31], x5[28]); + x6[31] = _mm_adds_epi16(x5[31], x5[28]); + x6[29] = _mm_subs_epi16(x5[30], x5[29]); + x6[30] = _mm_adds_epi16(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]); + x7[8] = _mm_adds_epi16(x6[8], x6[9]); + x7[9] = _mm_subs_epi16(x6[8], x6[9]); + x7[10] = _mm_subs_epi16(x6[11], x6[10]); + x7[11] = _mm_adds_epi16(x6[11], x6[10]); + x7[12] = _mm_adds_epi16(x6[12], x6[13]); + x7[13] = _mm_subs_epi16(x6[12], x6[13]); + x7[14] = _mm_subs_epi16(x6[15], x6[14]); + x7[15] = _mm_adds_epi16(x6[15], x6[14]); + x7[16] = x6[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_adds_epi16(x6[32], x6[35]); + x7[35] = _mm_subs_epi16(x6[32], x6[35]); + x7[33] = _mm_adds_epi16(x6[33], x6[34]); + x7[34] = _mm_subs_epi16(x6[33], x6[34]); + x7[36] = _mm_subs_epi16(x6[39], x6[36]); + x7[39] = _mm_adds_epi16(x6[39], x6[36]); + x7[37] = _mm_subs_epi16(x6[38], x6[37]); + x7[38] = _mm_adds_epi16(x6[38], x6[37]); + x7[40] = _mm_adds_epi16(x6[40], x6[43]); + x7[43] = _mm_subs_epi16(x6[40], x6[43]); + x7[41] = _mm_adds_epi16(x6[41], x6[42]); + x7[42] = _mm_subs_epi16(x6[41], x6[42]); + x7[44] = _mm_subs_epi16(x6[47], x6[44]); + x7[47] = _mm_adds_epi16(x6[47], x6[44]); + x7[45] = _mm_subs_epi16(x6[46], x6[45]); + x7[46] = _mm_adds_epi16(x6[46], x6[45]); + x7[48] = _mm_adds_epi16(x6[48], x6[51]); + x7[51] = _mm_subs_epi16(x6[48], x6[51]); + x7[49] = _mm_adds_epi16(x6[49], x6[50]); + x7[50] = _mm_subs_epi16(x6[49], x6[50]); + x7[52] = _mm_subs_epi16(x6[55], x6[52]); + x7[55] = _mm_adds_epi16(x6[55], x6[52]); + x7[53] = _mm_subs_epi16(x6[54], x6[53]); + x7[54] = _mm_adds_epi16(x6[54], x6[53]); + x7[56] = _mm_adds_epi16(x6[56], x6[59]); + x7[59] = _mm_subs_epi16(x6[56], x6[59]); + x7[57] = _mm_adds_epi16(x6[57], x6[58]); + x7[58] = _mm_subs_epi16(x6[57], x6[58]); + x7[60] = _mm_subs_epi16(x6[63], x6[60]); + x7[63] = _mm_adds_epi16(x6[63], x6[60]); + x7[61] = _mm_subs_epi16(x6[62], x6[61]); + x7[62] 
= _mm_adds_epi16(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]); + x8[16] = _mm_adds_epi16(x7[16], x7[17]); + x8[17] = _mm_subs_epi16(x7[16], x7[17]); + x8[18] = _mm_subs_epi16(x7[19], x7[18]); + x8[19] = _mm_adds_epi16(x7[19], x7[18]); + x8[20] = _mm_adds_epi16(x7[20], x7[21]); + x8[21] = _mm_subs_epi16(x7[20], x7[21]); + x8[22] = _mm_subs_epi16(x7[23], x7[22]); + x8[23] = _mm_adds_epi16(x7[23], x7[22]); + x8[24] = _mm_adds_epi16(x7[24], x7[25]); + x8[25] = _mm_subs_epi16(x7[24], x7[25]); + x8[26] = _mm_subs_epi16(x7[27], x7[26]); + x8[27] = _mm_adds_epi16(x7[27], x7[26]); + x8[28] = _mm_adds_epi16(x7[28], x7[29]); + x8[29] = _mm_subs_epi16(x7[28], x7[29]); + x8[30] = _mm_subs_epi16(x7[31], x7[30]); + x8[31] = _mm_adds_epi16(x7[31], x7[30]); + x8[32] = x7[32]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]); + x9[32] = _mm_adds_epi16(x8[32], x8[33]); + x9[33] = _mm_subs_epi16(x8[32], x8[33]); + x9[34] = _mm_subs_epi16(x8[35], x8[34]); + x9[35] = _mm_adds_epi16(x8[35], x8[34]); + x9[36] = _mm_adds_epi16(x8[36], x8[37]); + x9[37] = _mm_subs_epi16(x8[36], x8[37]); + x9[38] = _mm_subs_epi16(x8[39], x8[38]); + x9[39] = _mm_adds_epi16(x8[39], x8[38]); + x9[40] = _mm_adds_epi16(x8[40], x8[41]); + x9[41] = _mm_subs_epi16(x8[40], x8[41]); + x9[42] = _mm_subs_epi16(x8[43], x8[42]); + x9[43] = _mm_adds_epi16(x8[43], 
x8[42]); + x9[44] = _mm_adds_epi16(x8[44], x8[45]); + x9[45] = _mm_subs_epi16(x8[44], x8[45]); + x9[46] = _mm_subs_epi16(x8[47], x8[46]); + x9[47] = _mm_adds_epi16(x8[47], x8[46]); + x9[48] = _mm_adds_epi16(x8[48], x8[49]); + x9[49] = _mm_subs_epi16(x8[48], x8[49]); + x9[50] = _mm_subs_epi16(x8[51], x8[50]); + x9[51] = _mm_adds_epi16(x8[51], x8[50]); + x9[52] = _mm_adds_epi16(x8[52], x8[53]); + x9[53] = _mm_subs_epi16(x8[52], x8[53]); + x9[54] = _mm_subs_epi16(x8[55], x8[54]); + x9[55] = _mm_adds_epi16(x8[55], x8[54]); + x9[56] = _mm_adds_epi16(x8[56], x8[57]); + x9[57] = _mm_subs_epi16(x8[56], x8[57]); + x9[58] = _mm_subs_epi16(x8[59], x8[58]); + x9[59] = _mm_adds_epi16(x8[59], x8[58]); + x9[60] = _mm_adds_epi16(x8[60], x8[61]); + x9[61] = _mm_subs_epi16(x8[60], x8[61]); + x9[62] = _mm_subs_epi16(x8[63], x8[62]); + x9[63] = _mm_adds_epi16(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]); + btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]); + btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]); + btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]); + btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]); + btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]); + btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]); + btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]); + btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]); + btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]); + btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]); + btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]); + btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]); + btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]); + btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]); + btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = 
x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} + +static void fadst4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u[8], v[8]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[2], input[3]); + u[2] = _mm_unpacklo_epi16(in7, __zero); + u[3] = _mm_unpacklo_epi16(input[2], __zero); + u[4] = _mm_unpacklo_epi16(input[3], __zero); + + v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03); + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[6], __rounding); + + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[2]); + output[1] = _mm_packs_epi32(u[1], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fadst4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = 
pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2], + &x1[3], &x2[2], &x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6], + &x1[7], &x2[6], &x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4], + &x3[5], &x4[4], &x4[5]); + btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6], + &x3[7], &x4[6], &x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0], + &x5[1], &x6[0], &x6[1]); + btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2], + &x5[3], &x6[2], &x6[3]); + btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4], + &x5[5], &x6[4], &x6[5]); + btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6], + &x5[7], &x6[6], &x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8]; + + u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]); + u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]); + u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]); + u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]); + u_lo[2] = _mm_unpacklo_epi16(in7, 
__zero); + u_hi[2] = _mm_unpackhi_epi16(in7, __zero); + u_lo[3] = _mm_unpacklo_epi16(input[2], __zero); + u_hi[3] = _mm_unpackhi_epi16(input[2], __zero); + u_lo[4] = _mm_unpacklo_epi16(input[3], __zero); + u_hi[4] = _mm_unpackhi_epi16(input[3], __zero); + + v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2 + v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2 + v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5 + v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5 + v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1 + v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1 + v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3 + v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3 + v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6 + v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6 + v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4 + v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4 + v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03); + v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03); + + u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]); + u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]); + u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]); + u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]); + u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]); + u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]); + u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]); + u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]); + u_lo[4] = _mm_slli_epi32(v_lo[5], 2); + u_hi[4] = _mm_slli_epi32(v_hi[5], 2); + u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]); + u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]); + u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]); + u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]); + + v_lo[0] = _mm_add_epi32(u_lo[0], __rounding); + v_hi[0] = _mm_add_epi32(u_hi[0], __rounding); + v_lo[1] = _mm_add_epi32(u_lo[1], __rounding); + v_hi[1] = _mm_add_epi32(u_hi[1], __rounding); + v_lo[2] = _mm_add_epi32(u_lo[2], __rounding); + v_hi[2] = _mm_add_epi32(u_hi[2], __rounding); + v_lo[3] = _mm_add_epi32(u_lo[6], __rounding); + v_hi[3] = _mm_add_epi32(u_hi[6], __rounding); + + u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit); + u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit); + u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit); + u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit); + u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit); + u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit); + u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit); + u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit); + + output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]); + output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]); + output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]); + output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]); +} + +static void fadst8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = 
pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x1[16]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[15]); + x1[2] = _mm_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm_subs_epi16(__zero, input[11]); + x1[8] = _mm_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm_subs_epi16(__zero, input[13]); + x1[14] = _mm_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + __m128i x2[16]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]); + x2[12] = x1[12]; + x2[13] = x1[13]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]); + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + x3[8] = _mm_adds_epi16(x2[8], x2[10]); + x3[10] = _mm_subs_epi16(x2[8], x2[10]); + x3[9] = _mm_adds_epi16(x2[9], x2[11]); + x3[11] = _mm_subs_epi16(x2[9], x2[11]); + x3[12] = _mm_adds_epi16(x2[12], x2[14]); + x3[14] = _mm_subs_epi16(x2[12], x2[14]); + x3[13] = _mm_adds_epi16(x2[13], x2[15]); + x3[15] = _mm_subs_epi16(x2[13], x2[15]); + + // stage 4 + __m128i x4[16]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + x4[8] = x3[8]; + x4[9] = x3[9]; + x4[10] = x3[10]; + x4[11] = x3[11]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]); + + // stage 5 + __m128i x5[16]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], 
x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + x5[8] = _mm_adds_epi16(x4[8], x4[12]); + x5[12] = _mm_subs_epi16(x4[8], x4[12]); + x5[9] = _mm_adds_epi16(x4[9], x4[13]); + x5[13] = _mm_subs_epi16(x4[9], x4[13]); + x5[10] = _mm_adds_epi16(x4[10], x4[14]); + x5[14] = _mm_subs_epi16(x4[10], x4[14]); + x5[11] = _mm_adds_epi16(x4[11], x4[15]); + x5[15] = _mm_subs_epi16(x4[11], x4[15]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]); + + // stage 7 + __m128i x7[16]; + x7[0] = _mm_adds_epi16(x6[0], x6[8]); + x7[8] = _mm_subs_epi16(x6[0], x6[8]); + x7[1] = _mm_adds_epi16(x6[1], x6[9]); + x7[9] = _mm_subs_epi16(x6[1], x6[9]); + x7[2] = _mm_adds_epi16(x6[2], x6[10]); + x7[10] = _mm_subs_epi16(x6[2], x6[10]); + x7[3] = _mm_adds_epi16(x6[3], x6[11]); + x7[11] = _mm_subs_epi16(x6[3], x6[11]); + x7[4] = _mm_adds_epi16(x6[4], x6[12]); + x7[12] = _mm_subs_epi16(x6[4], x6[12]); + x7[5] = _mm_adds_epi16(x6[5], x6[13]); + x7[13] = _mm_subs_epi16(x6[5], x6[13]); + x7[6] = _mm_adds_epi16(x6[6], x6[14]); + x7[14] = _mm_subs_epi16(x6[6], x6[14]); + x7[7] = _mm_adds_epi16(x6[7], x6[15]); + x7[15] = _mm_subs_epi16(x6[7], x6[15]); + + // stage 8 + __m128i x8[16]; + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]); + + // stage 9 + output[0] = x8[1]; + output[1] = x8[14]; + output[2] = x8[3]; + output[3] = x8[12]; + output[4] = x8[5]; + output[5] = x8[10]; + output[6] = x8[7]; + output[7] = x8[8]; + output[8] = x8[9]; + output[9] = x8[6]; + output[10] = x8[11]; + output[11] = x8[4]; + output[12] = x8[13]; + output[13] = x8[2]; + output[14] = x8[15]; + output[15] = x8[0]; +} + +static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fadst4x4_new_sse2, // ADST_DCT + fdct4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fadst4x4_new_sse2, // FLIPADST_DCT + fdct4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fdct4x4_new_sse2, // V_DCT + fidentity4x4_new_sse2, // H_DCT + fadst4x4_new_sse2, // V_ADST + fidentity4x4_new_sse2, // H_ADST + fadst4x4_new_sse2, // V_FLIPADST + fidentity4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fdct4x4_new_sse2, // ADST_DCT + fadst4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fdct4x4_new_sse2, // FLIPADST_DCT + fadst4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // 
FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fidentity4x4_new_sse2, // V_DCT + fdct4x4_new_sse2, // H_DCT + fidentity4x4_new_sse2, // V_ADST + fadst4x4_new_sse2, // H_ADST + fidentity4x4_new_sse2, // V_FLIPADST + fadst4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fadst4x8_new_sse2, // ADST_DCT + fdct4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fadst4x8_new_sse2, // FLIPADST_DCT + fdct4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct4x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst4x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst4x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fdct8x4_new_sse2, // ADST_DCT + fadst8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fdct8x4_new_sse2, // FLIPADST_DCT + fadst8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fidentity8x4_new_sse2, // V_DCT + fdct8x4_new_sse2, // H_DCT + fidentity8x4_new_sse2, // V_ADST + fadst8x4_new_sse2, // H_ADST + fidentity8x4_new_sse2, // V_FLIPADST + fadst8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fadst8x4_new_sse2, // ADST_DCT + fdct8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fadst8x4_new_sse2, // FLIPADST_DCT + fdct8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fdct8x4_new_sse2, // V_DCT + fidentity8x4_new_sse2, // H_DCT + fadst8x4_new_sse2, // V_ADST + fidentity8x4_new_sse2, // H_ADST + fadst8x4_new_sse2, // V_FLIPADST + fidentity8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fdct4x8_new_sse2, // ADST_DCT + fadst4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fdct4x8_new_sse2, // FLIPADST_DCT + fadst4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct4x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst4x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst4x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fadst8x8_new_sse2, // ADST_DCT + fdct8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fadst8x8_new_sse2, // FLIPADST_DCT + fdct8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct8x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst8x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst8x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + 
fdct8x8_new_sse2, // ADST_DCT + fadst8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fdct8x8_new_sse2, // FLIPADST_DCT + fadst8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct8x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst8x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fadst8x16_new_sse2, // ADST_DCT + fdct8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fadst8x16_new_sse2, // FLIPADST_DCT + fdct8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fdct8x16_new_sse2, // V_DCT + fidentity8x16_new_sse2, // H_DCT + fadst8x16_new_sse2, // V_ADST + fidentity8x16_new_sse2, // H_ADST + fadst8x16_new_sse2, // V_FLIPADST + fidentity8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fdct8x16_new_sse2, // ADST_DCT + fadst8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fdct8x16_new_sse2, // FLIPADST_DCT + fadst8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fidentity8x16_new_sse2, // V_DCT + fdct8x16_new_sse2, // H_DCT + fidentity8x16_new_sse2, // V_ADST + fadst8x16_new_sse2, // H_ADST + fidentity8x16_new_sse2, // V_FLIPADST + fadst8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = { + av1_fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + fidentity8x32_new_sse2, // V_DCT + av1_fdct8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[4], buf1[4], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x4(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w4(buf, output, height, width); +} + +void 
av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)stride; + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + transpose_16bit_4x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + 8 * i, buf, width); + } else { + buf = buf1 + 8 * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } +} + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, 
height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w4(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } +} + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, 
&ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + transpose_16bit_8x8(buf0 + 16, buf1 + 16); + transpose_16bit_8x8(buf0 + 24, buf1 + 24); + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } +} + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x4(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w4(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width); +} + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + 
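// Common pattern for the av1_lowbd_fwd_txfm2d_*_sse2 wrappers in this file,
// as read from the code below: the residual block is loaded in 8-column
// strips (vertically flipped first for FLIPADST column types), pre-scaled by
// shift[0], passed through the 1-D column transform at cos_bit_col, rescaled
// by shift[1], and transposed in 8x8 tiles into buf1. Each 8-row strip of
// buf1 is then optionally left/right flipped, passed through the 1-D row
// transform at cos_bit_row, rescaled by shift[2], and widened to the 32-bit
// coefficient output; the store_rect_* helpers presumably also fold in the
// extra 1/sqrt(2) normalization used for 2:1 rectangular sizes.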
(void)bd; + __m128i buf0[16], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } +} + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } + } else { + av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 8; + const 
transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + } + + for (int i = 0; i < 1; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } + } else { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } + } else { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32]; + const int txw_idx = get_txw_idx(TX_32X32); + const int txh_idx = get_txh_idx(TX_32X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + 
} + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width); + } + } else { + av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m128i buf0[64], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x16_new_sse2; + const transform_1d_sse2 row_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < height_div8; i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 32); + } + // Zero out the bottom 16x32 area. 
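// (Only the lowest 32 of the 64 horizontal DCT outputs are stored above:
// AV1 forces the upper half of 64-point transform coefficients to zero, so
// the remaining half of the 16x64 coefficient buffer is simply cleared.)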
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m128i buf0[64], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x16_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16); + } +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + NULL, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + else + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h new file mode 100644 index 0000000000..3cb869a8fe --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ +#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ + +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit); +void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit); + +static INLINE void fidentity4x4_new_sse2(const __m128i *const input, + __m128i *const output, + const int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i a = _mm_unpacklo_epi16(input[i], one); + const __m128i b = scale_round_sse2(a, NewSqrt2); + output[i] = _mm_packs_epi32(b, b); + } +} + +static INLINE void fidentity8x4_new_sse2(const __m128i *const input, + __m128i *const output, + const int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + const __m128i a_hi = _mm_unpackhi_epi16(input[i], one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); + output[i] = _mm_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + + output[0] = _mm_adds_epi16(input[0], input[0]); + output[1] = _mm_adds_epi16(input[1], input[1]); + output[2] = _mm_adds_epi16(input[2], input[2]); + output[3] = _mm_adds_epi16(input[3], input[3]); + output[4] = _mm_adds_epi16(input[4], input[4]); + output[5] = _mm_adds_epi16(input[5], input[5]); + output[6] = _mm_adds_epi16(input[6], input[6]); + output[7] = _mm_adds_epi16(input[7], input[7]); +} + +static INLINE void fdct8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_sse2(cospi_m32_p32, 
cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 and 5 + output[0] = x3[0]; + output[4] = x3[1]; + output[2] = x3[2]; + output[6] = x3[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], output[1], output[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], output[5], output[3]); +} + +static INLINE void fadst8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + + // stage 5, 6 and 7 + output[7] = _mm_adds_epi16(x4[0], x4[4]); + output[3] = _mm_subs_epi16(x4[0], x4[4]); + output[0] = _mm_adds_epi16(x4[1], x4[5]); + output[4] = _mm_subs_epi16(x4[1], x4[5]); + output[5] = _mm_adds_epi16(x4[2], x4[6]); + output[1] = _mm_subs_epi16(x4[2], x4[6]); + output[2] = _mm_adds_epi16(x4[3], x4[7]); + output[6] = _mm_subs_epi16(x4[3], x4[7]); + + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, output[7], output[0], output[7], + output[0]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, output[5], output[2], output[5], + output[2]); + 
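+  // A rough sketch of the butterfly helper used throughout this file (the
+  // actual macro is expected to live in av1/common/x86/av1_txfm_sse2.h): with
+  // w0 = pair_set_epi16(a, b) and w1 = pair_set_epi16(c, d),
+  // btf_16_sse2(w0, w1, in0, in1, out0, out1) computes, per 16-bit lane,
+  //   out0 = sat16((a * in0 + b * in1 + (1 << (cos_bit - 1))) >> cos_bit)
+  //   out1 = sat16((c * in0 + d * in1 + (1 << (cos_bit - 1))) >> cos_bit)
+  // picking up cos_bit and __rounding from the enclosing scope, which is why
+  // __rounding is declared above without being referenced by name.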
btf_16_sse2(cospi_p36_p28, cospi_p28_m36, output[3], output[4], output[3], + output[4]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, output[1], output[6], output[1], + output[6]); +} + +static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i a_lo = _mm_unpacklo_epi16(input[i], one); + const __m128i a_hi = _mm_unpackhi_epi16(input[i], one); + const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2); + output[i] = _mm_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm_slli_epi16(input[i], 2); + } +} + +static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { + av1_fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + av1_fdct8x32_new_sse2, // V_DCT + fidentity8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c new file mode 100644 index 0000000000..b58911fcb2 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i zero = _mm_setzero_si128(); + const __m128i dc = _mm_unpacklo_epi16(*p, zero); + const __m128i ac = _mm_unpackhi_epi16(*p, zero); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static INLINE void update_qp(__m256i *qp) { + qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); + qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); + qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11); +} + +static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *dequant_ptr, int log_scale, + __m256i *qp) { + __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + if (log_scale) { + const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale)); + round = _mm_mulhrs_epi16(round, round_scale); + } + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + + init_one_qp(&round, &qp[0]); + init_one_qp(&quant, &qp[1]); + init_one_qp(&dequant, &qp[2]); +} + +static INLINE void quantize(const __m256i *qp, __m256i *c, + const int16_t *iscan_ptr, int log_scale, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i abs_coeff = _mm256_abs_epi32(*c); + __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); + + __m256i q_lo = _mm256_mul_epi32(q, qp[1]); + __m256i q_hi = _mm256_srli_epi64(q, 32); + const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32); + q_hi = _mm256_mul_epi32(q_hi, qp_hi); + q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale); + q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); + q_hi = _mm256_slli_epi64(q_hi, 32); + q = _mm256_or_si256(q_lo, q_hi); + const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale); + const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); + q = _mm256_andnot_si256(mask, q); + + __m256i dq = _mm256_mullo_epi32(q, qp[2]); + dq = _mm256_srai_epi32(dq, log_scale); + q = _mm256_sign_epi32(q, *c); + dq = _mm256_sign_epi32(dq, *c); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); + const __m128i zr = _mm_setzero_si128(); + const __m128i lo = _mm_unpacklo_epi16(isc, zr); + const __m128i hi = _mm_unpackhi_epi16(isc, zr); + const __m256i iscan = + _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i zc = _mm256_cmpeq_epi32(dq, zero); + const __m256i nz = _mm256_cmpeq_epi32(zc, zero); + __m256i cur_eob = _mm256_sub_epi32(iscan, nz); + cur_eob = _mm256_and_si256(cur_eob, nz); + *eob = _mm256_max_epi32(cur_eob, *eob); +} + +void av1_highbd_quantize_fp_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + (void)scan; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 8; + __m256i qp[3], coeff; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += 
step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c new file mode 100644 index 0000000000..40b3b460b6 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/synonyms.h" + +// Coefficient quantization phase 1 +// param[0-2] : rounding/quan/dequan constants +static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param, + const int shift, const int scale, + __m128i *qcoeff, __m128i *dquan, + __m128i *sign) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + + *sign = _mm_cmplt_epi32(*coeff, zero); + *sign = _mm_or_si128(*sign, one); + *coeff = _mm_abs_epi32(*coeff); + + qcoeff[0] = _mm_add_epi32(*coeff, param[0]); + qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero); + qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero); + + qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]); + qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift); + dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]); + dquan[0] = _mm_srli_epi64(dquan[0], scale); + const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale); + qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]); +} + +// Coefficient quantization phase 2 +static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan, + const __m128i *sign, + const __m128i *param, const int shift, + const int scale, tran_low_t *qAddr, + tran_low_t *dqAddr) { + __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0); + __m128i mask0H = _mm_set_epi32(0, 0, -1, -1); + + qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]); + qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift); + dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]); + dquan[1] = _mm_srli_epi64(dquan[1], scale); + + // combine L&H + qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8); + qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d); + + qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H); + qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L); + + dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8); + dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d); + + dquan[0] = _mm_and_si128(dquan[0], mask0H); + 
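+  // Taken together, quantize_coeff_phase1() and quantize_coeff_phase2() amount
+  // to roughly the following per coefficient (a sketch; shift == 16 - log_scale
+  // and scale == log_scale as set up by the caller):
+  //   if ((abs(coeff) << (1 + scale)) < dequant) {
+  //     qcoeff = dqcoeff = 0;
+  //   } else {
+  //     q       = (uint32_t)(((int64_t)(abs(coeff) + round) * quant) >> shift);
+  //     qcoeff  = sign(coeff) * q;
+  //     dqcoeff = sign(coeff) * (uint32_t)(((int64_t)q * dequant) >> scale);
+  //   }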
dquan[1] = _mm_and_si128(dquan[1], mask0L); + + qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]); + dquan[0] = _mm_or_si128(dquan[0], dquan[1]); + + qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign); + dquan[0] = _mm_sign_epi32(dquan[0], *sign); + qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]); + dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]); + _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]); + _mm_storeu_si128((__m128i *)dqAddr, dquan[0]); +} + +static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan, + __m128i *eob) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, iscanIdx; + const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr); + const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4)); + __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero); + __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero); + + nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero); + nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero); + + mask = _mm_packs_epi32(nz_flag0, nz_flag1); + iscanIdx = _mm_loadu_si128((__m128i const *)iscan); + iscanIdx = _mm_sub_epi16(iscanIdx, mask); + iscanIdx = _mm_and_si128(iscanIdx, mask); + *eob = _mm_max_epi16(*eob, iscanIdx); +} + +static INLINE uint16_t get_accumulated_eob(__m128i *eob) { + __m128i eob_shuffled; + uint16_t eobValue; + eob_shuffled = _mm_shuffle_epi32(*eob, 0xe); + *eob = _mm_max_epi16(*eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe); + *eob = _mm_max_epi16(*eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1); + *eob = _mm_max_epi16(*eob, eob_shuffled); + eobValue = _mm_extract_epi16(*eob, 0); + return eobValue; +} + +void av1_highbd_quantize_fp_sse4_1( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign; + __m128i eob = _mm_setzero_si128(); + const tran_low_t *src = coeff_ptr; + tran_low_t *quanAddr = qcoeff_ptr; + tran_low_t *dquanAddr = dqcoeff_ptr; + const int shift = 16 - log_scale; + const int coeff_stride = 4; + const int quan_stride = coeff_stride; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + + memset(quanAddr, 0, count * sizeof(quanAddr[0])); + memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); + + coeff[0] = _mm_loadu_si128((__m128i const *)src); + const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); + const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + + qparam[0] = _mm_set_epi32(round1, round1, round1, round0); + qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]); + qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); + qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], + dequant_ptr[0]); + + // DC and first 3 AC + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + + // update round/quan/dquan for AC + qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); + qparam[1] = xx_set1_64_from_32i(quant_ptr[1]); + qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]); + qparam[3] = _mm_set1_epi32(dequant_ptr[1]); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr, dquanAddr); + + // next 4 AC + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, 
qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr + quan_stride, dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + + // loop for the rest of AC + while (count > 0) { + src += coeff_stride << 1; + quanAddr += quan_stride << 1; + dquanAddr += quan_stride << 1; + iscan += quan_stride << 1; + + coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr, dquanAddr); + + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr + quan_stride, + dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + } + *eob_ptr = get_accumulated_eob(&eob); +} diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c new file mode 100644 index 0000000000..52ddc66437 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include // AVX2 + +#include "config/av1_rtcd.h" +#include "aom_dsp/x86/synonyms.h" + +static int64_t k_means_horizontal_sum_avx2(__m256i a) { + const __m128i low = _mm256_castsi256_si128(a); + const __m128i high = _mm256_extracti128_si256(a, 1); + const __m128i sum = _mm_add_epi64(low, high); + const __m128i sum_high = _mm_unpackhi_epi64(sum, sum); + int64_t res; + _mm_storel_epi64((__m128i *)&res, _mm_add_epi64(sum, sum_high)); + return res; +} + +void av1_calc_indices_dim1_avx2(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *total_dist, int n, + int k) { + const __m256i v_zero = _mm256_setzero_si256(); + __m256i sum = _mm256_setzero_si256(); + __m256i cents[PALETTE_MAX_SIZE]; + for (int j = 0; j < k; ++j) { + cents[j] = _mm256_set1_epi16(centroids[j]); + } + + for (int i = 0; i < n; i += 16) { + const __m256i in = _mm256_loadu_si256((__m256i *)data); + __m256i ind = _mm256_setzero_si256(); + // Compute the distance to the first centroid. + __m256i d1 = _mm256_sub_epi16(in, cents[0]); + __m256i dist_min = _mm256_abs_epi16(d1); + + for (int j = 1; j < k; ++j) { + // Compute the distance to the centroid. + d1 = _mm256_sub_epi16(in, cents[j]); + const __m256i dist = _mm256_abs_epi16(d1); + // Compare to the minimal one. 
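+      // (cmpgt leaves all-ones in lanes where centroid j is strictly closer;
+      // those lanes take index j in the blend below, and dist_min is lowered
+      // to the new minimum.)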
+ const __m256i cmp = _mm256_cmpgt_epi16(dist_min, dist); + dist_min = _mm256_min_epi16(dist_min, dist); + const __m256i ind1 = _mm256_set1_epi16(j); + ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind), + _mm256_and_si256(cmp, ind1)); + } + + const __m256i p1 = _mm256_packus_epi16(ind, v_zero); + const __m256i px = _mm256_permute4x64_epi64(p1, 0x58); + const __m128i d2 = _mm256_extracti128_si256(px, 0); + + _mm_storeu_si128((__m128i *)indices, d2); + + if (total_dist) { + // Square, convert to 32 bit and add together. + dist_min = _mm256_madd_epi16(dist_min, dist_min); + // Convert to 64 bit and add to sum. + const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero); + const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero); + sum = _mm256_add_epi64(sum, dist1); + sum = _mm256_add_epi64(sum, dist2); + } + + indices += 16; + data += 16; + } + if (total_dist) { + *total_dist = k_means_horizontal_sum_avx2(sum); + } +} + +void av1_calc_indices_dim2_avx2(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *total_dist, int n, + int k) { + const __m256i v_zero = _mm256_setzero_si256(); + const __m256i permute = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0); + __m256i sum = _mm256_setzero_si256(); + __m256i ind[2]; + __m256i cents[PALETTE_MAX_SIZE]; + for (int j = 0; j < k; ++j) { + const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1]; + cents[j] = _mm256_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, + cy, cx, cy, cx); + } + + for (int i = 0; i < n; i += 16) { + for (int l = 0; l < 2; ++l) { + const __m256i in = _mm256_loadu_si256((__m256i *)data); + ind[l] = _mm256_setzero_si256(); + // Compute the distance to the first centroid. + __m256i d1 = _mm256_sub_epi16(in, cents[0]); + __m256i dist_min = _mm256_madd_epi16(d1, d1); + + for (int j = 1; j < k; ++j) { + // Compute the distance to the centroid. + d1 = _mm256_sub_epi16(in, cents[j]); + const __m256i dist = _mm256_madd_epi16(d1, d1); + // Compare to the minimal one. + const __m256i cmp = _mm256_cmpgt_epi32(dist_min, dist); + dist_min = _mm256_min_epi32(dist_min, dist); + const __m256i ind1 = _mm256_set1_epi32(j); + ind[l] = _mm256_or_si256(_mm256_andnot_si256(cmp, ind[l]), + _mm256_and_si256(cmp, ind1)); + } + if (total_dist) { + // Convert to 64 bit and add to sum. + const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero); + const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero); + sum = _mm256_add_epi64(sum, dist1); + sum = _mm256_add_epi64(sum, dist2); + } + data += 16; + } + // Cast to 8 bit and store. + const __m256i d2 = _mm256_packus_epi32(ind[0], ind[1]); + const __m256i d3 = _mm256_packus_epi16(d2, v_zero); + const __m256i d4 = _mm256_permutevar8x32_epi32(d3, permute); + const __m128i d5 = _mm256_extracti128_si256(d4, 0); + _mm_storeu_si128((__m128i *)indices, d5); + indices += 16; + } + if (total_dist) { + *total_dist = k_means_horizontal_sum_avx2(sum); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c new file mode 100644 index 0000000000..6c75822350 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "config/av1_rtcd.h" +#include "aom_dsp/x86/synonyms.h" + +static int64_t k_means_horizontal_sum_sse2(__m128i a) { + const __m128i sum1 = _mm_unpackhi_epi64(a, a); + const __m128i sum2 = _mm_add_epi64(a, sum1); + int64_t res; + _mm_storel_epi64((__m128i *)&res, sum2); + return res; +} + +void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *total_dist, int n, + int k) { + const __m128i v_zero = _mm_setzero_si128(); + __m128i sum = _mm_setzero_si128(); + __m128i cents[PALETTE_MAX_SIZE]; + for (int j = 0; j < k; ++j) { + cents[j] = _mm_set1_epi16(centroids[j]); + } + + for (int i = 0; i < n; i += 8) { + const __m128i in = _mm_loadu_si128((__m128i *)data); + __m128i ind = _mm_setzero_si128(); + // Compute the distance to the first centroid. + __m128i d1 = _mm_sub_epi16(in, cents[0]); + __m128i d2 = _mm_sub_epi16(cents[0], in); + __m128i dist_min = _mm_max_epi16(d1, d2); + + for (int j = 1; j < k; ++j) { + // Compute the distance to the centroid. + d1 = _mm_sub_epi16(in, cents[j]); + d2 = _mm_sub_epi16(cents[j], in); + const __m128i dist = _mm_max_epi16(d1, d2); + // Compare to the minimal one. + const __m128i cmp = _mm_cmpgt_epi16(dist_min, dist); + dist_min = _mm_min_epi16(dist_min, dist); + const __m128i ind1 = _mm_set1_epi16(j); + ind = _mm_or_si128(_mm_andnot_si128(cmp, ind), _mm_and_si128(cmp, ind1)); + } + if (total_dist) { + // Square, convert to 32 bit and add together. + dist_min = _mm_madd_epi16(dist_min, dist_min); + // Convert to 64 bit and add to sum. + const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero); + const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero); + sum = _mm_add_epi64(sum, dist1); + sum = _mm_add_epi64(sum, dist2); + } + __m128i p2 = _mm_packus_epi16(ind, v_zero); + _mm_storel_epi64((__m128i *)indices, p2); + indices += 8; + data += 8; + } + if (total_dist) { + *total_dist = k_means_horizontal_sum_sse2(sum); + } +} + +void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *total_dist, int n, + int k) { + const __m128i v_zero = _mm_setzero_si128(); + __m128i sum = _mm_setzero_si128(); + __m128i ind[2]; + __m128i cents[PALETTE_MAX_SIZE]; + for (int j = 0; j < k; ++j) { + const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1]; + cents[j] = _mm_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx); + } + + for (int i = 0; i < n; i += 8) { + for (int l = 0; l < 2; ++l) { + const __m128i in = _mm_loadu_si128((__m128i *)data); + ind[l] = _mm_setzero_si128(); + // Compute the distance to the first centroid. + __m128i d1 = _mm_sub_epi16(in, cents[0]); + __m128i dist_min = _mm_madd_epi16(d1, d1); + + for (int j = 1; j < k; ++j) { + // Compute the distance to the centroid. + d1 = _mm_sub_epi16(in, cents[j]); + const __m128i dist = _mm_madd_epi16(d1, d1); + // Compare to the minimal one. + const __m128i cmp = _mm_cmpgt_epi32(dist_min, dist); + const __m128i dist1 = _mm_andnot_si128(cmp, dist_min); + const __m128i dist2 = _mm_and_si128(cmp, dist); + dist_min = _mm_or_si128(dist1, dist2); + const __m128i ind1 = _mm_set1_epi32(j); + ind[l] = _mm_or_si128(_mm_andnot_si128(cmp, ind[l]), + _mm_and_si128(cmp, ind1)); + } + if (total_dist) { + // Convert to 64 bit and add to sum. 
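+        // (unpacking against zero widens the four 32-bit squared distances to
+        // 64-bit lanes so the running sum cannot overflow over a large block.)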
+ const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero); + const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero); + sum = _mm_add_epi64(sum, dist1); + sum = _mm_add_epi64(sum, dist2); + } + data += 8; + } + // Cast to 8 bit and store. + const __m128i d2 = _mm_packus_epi16(ind[0], ind[1]); + const __m128i d3 = _mm_packus_epi16(d2, v_zero); + _mm_storel_epi64((__m128i *)indices, d3); + indices += 8; + } + if (total_dist) { + *total_dist = k_means_horizontal_sum_sse2(sum); + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c new file mode 100644 index 0000000000..75c5172f85 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +static INLINE void write_zero(tran_low_t *qcoeff) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)qcoeff + 1, zero); +} + +static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i ac = _mm_unpackhi_epi64(*p, *p); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1); +} + +static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *dequant_ptr, int log_scale, + __m256i *thr, __m256i *qp) { + __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + + if (log_scale > 0) { + const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1)); + round = _mm_add_epi16(round, rnd); + round = _mm_srai_epi16(round, log_scale); + } + + init_one_qp(&round, &qp[0]); + init_one_qp(&quant, &qp[1]); + + if (log_scale == 1) { + qp[1] = _mm256_slli_epi16(qp[1], log_scale); + } + + init_one_qp(&dequant, &qp[2]); + *thr = _mm256_srai_epi16(qp[2], 1 + log_scale); + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. 
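+  // In other words, the later strict comparison abs_coeff > *thr is
+  // equivalent to abs_coeff >= dequant >> (1 + log_scale), so small lanes can
+  // be skipped with a single greater-than test.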
+ *thr = _mm256_sub_epi16(*thr, _mm256_set1_epi16(1)); +} + +static INLINE void update_qp(__m256i *thr, __m256i *qp) { + qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11); + qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11); + qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11); + *thr = _mm256_permute2x128_si256(*thr, *thr, 0x11); +} + +static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { + const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr); + const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +} + +static INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +} + +static INLINE uint16_t quant_gather_eob(__m256i eob) { + const __m128i eob_lo = _mm256_castsi256_si128(eob); + const __m128i eob_hi = _mm256_extractf128_si256(eob, 1); + __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi); + eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s); + eob_s = _mm_minpos_epu16(eob_s); + return INT16_MAX - _mm_extract_epi16(eob_s, 0); +} + +static INLINE int16_t accumulate_eob256(__m256i eob256) { + const __m128i eob_lo = _mm256_castsi256_si128(eob256); + const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); + __m128i eob = _mm_max_epi16(eob_lo, eob_hi); + __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +static AOM_FORCE_INLINE void quantize_lp_16_first( + const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256, + __m256i *dequant256, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, qcoeff); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dqcoeff); + + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask); + *eob = _mm256_max_epi16(*eob, nz_iscan); +} + +static AOM_FORCE_INLINE void quantize_lp_16( + const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256, + __m256i *quant256, __m256i *dequant256, __m256i *eob) { + const __m256i coeff = + _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs)); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256); + const __m256i 
qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), dqcoeff); + + const __m256i iscan = + _mm256_loadu_si256((const __m256i *)(iscan_ptr + n_coeffs)); + const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask); + const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask); + *eob = _mm256_max_epi16(*eob, nz_iscan); +} + +void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + __m256i eob256 = _mm256_setzero_si256(); + + // Setup global values. + __m256i round256 = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + __m256i quant256 = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + __m256i dequant256 = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + + // Populate upper AC values. + round256 = _mm256_permute4x64_epi64(round256, 0x54); + quant256 = _mm256_permute4x64_epi64(quant256, 0x54); + dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); + + // Process DC and the first 15 AC coeffs. + quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256, + &quant256, &dequant256, &eob256); + + if (n_coeffs > 16) { + // Overwrite the DC constants with AC constants + dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); + quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); + round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + + // AC only loop. 
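+    // Each iteration below quantizes 16 coefficients; per 16-bit lane this is
+    // roughly
+    //   q  = sign(coeff) * (((abs(coeff) + round) * quant) >> 16)
+    //   dq = q * dequant   (low 16 bits)
+    // while eob256 tracks the largest iscan index (plus one) of any lane with
+    // a nonzero q.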
+ for (int idx = 16; idx < n_coeffs; idx += 16) { + quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256, + &quant256, &dequant256, &eob256); + } + } + + *eob_ptr = accumulate_eob256(eob256); +} + +static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); + const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); + const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static AOM_FORCE_INLINE void quantize_fp_16( + const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + __m256i *eob) { + const __m256i coeff = load_coefficients_avx2(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]); + const __m256i abs_q = _mm256_mulhi_epi16(tmp_rnd, qp[1]); + const __m256i q = _mm256_sign_epi16(abs_q, coeff); + const __m256i dq = _mm256_mullo_epi16(q, qp[2]); + const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256()); + + store_coefficients_avx2(q, qcoeff_ptr); + store_coefficients_avx2(dq, dqcoeff_ptr); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } else { + write_zero(qcoeff_ptr); + write_zero(dqcoeff_ptr); + } +} + +void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + const int log_scale = 0; + const int step = 16; + __m256i qp[3], thr; + __m256i eob = _mm256_setzero_si256(); + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + + quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(&thr, qp); + + while (n_coeffs > 0) { + quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static AOM_FORCE_INLINE void quantize_fp_32x32( + const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + __m256i *eob) { + const __m256i coeff = load_coefficients_avx2(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]); + const __m256i abs_q = _mm256_mulhi_epu16(tmp_rnd, qp[1]); + const __m256i q = _mm256_sign_epi16(abs_q, coeff); + const __m256i abs_dq = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 1); + const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256()); + const __m256i dq = _mm256_sign_epi16(abs_dq, coeff); + + 
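+    // For this 32x32 path (log_scale == 1) the lanes above work out to
+    // roughly
+    //   q  = sign(coeff) * (((abs(coeff) + round) * (2 * quant)) >> 16)
+    //   dq = sign(coeff) * ((abs(q) * dequant) >> 1)
+    // with round and quant already rescaled for log_scale in init_qp().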
store_coefficients_avx2(q, qcoeff_ptr); + store_coefficients_avx2(dq, dqcoeff_ptr); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } else { + write_zero(qcoeff_ptr); + write_zero(dqcoeff_ptr); + } +} + +void av1_quantize_fp_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + const int log_scale = 1; + const unsigned int step = 16; + __m256i qp[3], thr; + __m256i eob = _mm256_setzero_si256(); + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + + quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(&thr, qp); + + while (n_coeffs > 0) { + quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static INLINE void quantize_fp_64x64(const __m256i *thr, const __m256i *qp, + const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = load_coefficients_avx2(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(abs_coeff, qp[0]), mask); + const __m256i qh = _mm256_slli_epi16(_mm256_mulhi_epi16(tmp_rnd, qp[1]), 2); + const __m256i ql = + _mm256_srli_epi16(_mm256_mullo_epi16(tmp_rnd, qp[1]), 14); + const __m256i abs_q = _mm256_or_si256(qh, ql); + const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(abs_q, qp[2]), 14); + const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 2); + const __m256i abs_dq = _mm256_or_si256(dqh, dql); + const __m256i q = _mm256_sign_epi16(abs_q, coeff); + const __m256i dq = _mm256_sign_epi16(abs_dq, coeff); + // Check the signed q/dq value here instead of the absolute value. When + // dequant equals 4, the dequant threshold (*thr) becomes 0 after being + // scaled down by (1 + log_scale). See init_qp(). When *thr is 0 and the + // abs_coeff is 0, the nzflag will be set. As a result, the eob will be + // incorrectly calculated. The psign instruction corrects the error by + // zeroing out q/dq if coeff is zero. 
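+    // For example, with dequant == 4 and log_scale == 2, init_qp() leaves
+    // *thr at (4 >> 3) - 1 == -1, so abs_coeff > *thr holds even in all-zero
+    // lanes; deriving nz_mask from the signed dq keeps those lanes out of the
+    // eob.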
+ const __m256i z_mask = _mm256_cmpeq_epi16(dq, _mm256_setzero_si256()); + const __m256i nz_mask = _mm256_cmpeq_epi16(z_mask, _mm256_setzero_si256()); + + store_coefficients_avx2(q, qcoeff_ptr); + store_coefficients_avx2(dq, dqcoeff_ptr); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } else { + write_zero(qcoeff_ptr); + write_zero(dqcoeff_ptr); + } +} + +void av1_quantize_fp_64x64_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + const int log_scale = 2; + const unsigned int step = 16; + __m256i qp[3], thr; + __m256i eob = _mm256_setzero_si256(); + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + + quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(&thr, qp); + + while (n_coeffs > 0) { + quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, + &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c new file mode 100644 index 0000000000..b533894015 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, + __m128i *c0, __m128i *c1) { + const tran_low_t *addr = coeff + offset; + if (sizeof(tran_low_t) == 4) { + const __m128i x0 = _mm_load_si128((const __m128i *)addr); + const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1); + const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2); + const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3); + *c0 = _mm_packs_epi32(x0, x1); + *c1 = _mm_packs_epi32(x2, x3); + } else { + *c0 = _mm_load_si128((const __m128i *)addr); + *c1 = _mm_load_si128((const __m128i *)addr + 1); + } +} + +static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1, + tran_low_t *qcoeff, intptr_t offset) { + tran_low_t *addr = qcoeff + offset; + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero); + __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits); + __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits); + _mm_store_si128((__m128i *)addr, y0); + _mm_store_si128((__m128i *)addr + 1, y1); + + sign_bits = _mm_cmplt_epi16(*qc1, zero); + y0 = _mm_unpacklo_epi16(*qc1, sign_bits); + y1 = _mm_unpackhi_epi16(*qc1, sign_bits); + _mm_store_si128((__m128i *)addr + 2, y0); + _mm_store_si128((__m128i *)addr + 3, y1); + } else { + _mm_store_si128((__m128i *)addr, *qc0); + _mm_store_si128((__m128i *)addr + 1, *qc1); + } +} + +static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { + const __m128i zero = _mm_setzero_si128(); + tran_low_t *addr = qcoeff + offset; + if (sizeof(tran_low_t) == 4) { + _mm_store_si128((__m128i *)addr, zero); + _mm_store_si128((__m128i *)addr + 1, zero); + _mm_store_si128((__m128i *)addr + 2, zero); + _mm_store_si128((__m128i *)addr + 3, zero); + } else { + _mm_store_si128((__m128i *)addr, zero); + _mm_store_si128((__m128i *)addr + 1, zero); + } +} + +static INLINE void quantize(const int16_t *iscan_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const __m128i *round0, const __m128i *round1, + const __m128i *quant0, const __m128i *quant1, + const __m128i *dequant0, const __m128i *dequant1, + const __m128i *thr0, const __m128i *thr1, + __m128i *eob) { + __m128i coeff0, coeff1; + // Do DC and first 15 AC + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); + + // Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0), + _mm_cmpeq_epi16(qcoeff0, *thr0)); + const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1), + _mm_cmpeq_epi16(qcoeff1, *thr1)); + const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = 
_mm_sub_epi16(qcoeff1, coeff1_sign); + + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + const __m128i iscan0 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); + } else { + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); + } +} + +void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + const __m128i thr0 = _mm_srai_epi16(dequant0, 1); + const __m128i thr1 = _mm_srai_epi16(dequant1, 1); + __m128i eob = _mm_setzero_si128(); + + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); + + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, + &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } +} + +static INLINE void quantize_lp(const int16_t *iscan_ptr, + const int16_t *coeff_ptr, intptr_t n_coeffs, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const __m128i *round0, const __m128i *round1, + const __m128i *quant0, const __m128i *quant1, + const __m128i *dequant0, const __m128i *dequant1, + __m128i *eob) { + const int16_t *read = coeff_ptr + n_coeffs; + __m128i coeff0 = _mm_load_si128((const __m128i *)read); + __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1); + + 
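+  // The "poor man's sign extract" below relies on the identity, for 16-bit x,
+  //   sign   = x >> 15            (arithmetic shift: 0 or -1)
+  //   abs(x) = (x ^ sign) - sign
+  // and the same xor/subtract with the saved sign restores the sign after
+  // quantization.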
// Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + int16_t *addr = qcoeff_ptr + n_coeffs; + _mm_store_si128((__m128i *)addr, qcoeff0); + _mm_store_si128((__m128i *)addr + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + addr = dqcoeff_ptr + n_coeffs; + _mm_store_si128((__m128i *)addr, coeff0); + _mm_store_si128((__m128i *)addr + 1, coeff1); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + + const __m128i iscan0 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); +} + +void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + __m128i eob = _mm_setzero_si128(); + + // DC and first 15 AC + quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, &quant1, &dequant0, &dequant1, &eob); + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + *eob_ptr = accumulate_eob(eob); +} diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm new file mode 100644 index 0000000000..ad4ae274e2 --- /dev/null +++ 
b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm @@ -0,0 +1,204 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FP 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. + movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, fp_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m1, m5 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [r2q] ; m3 = dequant + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, fp_32x32 + psllw m2, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + + lea coeffq, [ coeffq+ncoeffq*2] + lea r5q, [ r5q+ncoeffq*2] + lea r3q, [ r3q+ncoeffq*2] + lea r4q, [r4q+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m8 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; r4[i] = r3[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 + psrlw m0, m3, 2 +%else + psrlw m0, m3, 1 +%endif + mova [r4q+ncoeffq*2+ 0], m8 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + + pcmpgtw m7, m6, m0 + pcmpgtw m12, m11, m0 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + + or r6, r2 + jz .skip_iter + + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m14 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; r4[i] = 
r3[i] * q + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + mova [r4q+ncoeffq*2+ 0], m14 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + + jmp .accumulate_eob +.skip_iter: + mova [r3q+ncoeffq*2+ 0], m5 + mova [r3q+ncoeffq*2+16], m5 + mova [r4q+ncoeffq*2+ 0], m5 + mova [r4q+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET + + ; skip-block, i.e. just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + + lea r0q, [r0q+ncoeffq*2] + lea r2q, [r2q+ncoeffq*2] + neg ncoeffq + pxor m7, m7 +.blank_loop: + mova [r0q+ncoeffq*2+ 0], m7 + mova [r0q+ncoeffq*2+16], m7 + mova [r2q+ncoeffq*2+ 0], m7 + mova [r2q+ncoeffq*2+16], m7 + add ncoeffq, mmsize + jl .blank_loop + mov word [r3q], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FP fp, 7 +QUANTIZE_FP fp_32x32, 7 diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm new file mode 100644 index 0000000000..618758105a --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "aom_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro + +SECTION .text + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +globalsym(av1_ssim_parms_16x16_sse2) +sym(av1_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
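Both SSIM parameter kernels in this file (the 16x16 routine above and the 8x8 routine below) accumulate the same five sums that the SSIM/DSSIM code consumes. A scalar sketch of the equivalent computation follows; the helper name and signature are illustrative only and are not part of the patch:

  static void ssim_parms_c(const unsigned char *s, int sp,
                           const unsigned char *r, int rp, int size,
                           unsigned long *sum_s, unsigned long *sum_r,
                           unsigned long *sum_sq_s, unsigned long *sum_sq_r,
                           unsigned long *sum_sxr) {
    for (int i = 0; i < size; ++i, s += sp, r += rp) {
      for (int j = 0; j < size; ++j) {
        *sum_s += s[j];               /* sum of source pixels */
        *sum_r += r[j];               /* sum of reference pixels */
        *sum_sq_s += s[j] * s[j];     /* sum of squared source pixels */
        *sum_sq_r += r[j] * r[j];     /* sum of squared reference pixels */
        *sum_sxr += s[j] * r[j];      /* sum of source*reference products */
      }
    }
  }

TABULATE_SSIM performs these updates eight pixels at a time, and SUM_ACROSS_W / SUM_ACROSS_Q reduce the vector accumulators to the single values stored through arg(4)..arg(8).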
+globalsym(av1_ssim_parms_8x8_sse2) +sym(av1_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c new file mode 100644 index 0000000000..830f40ecb0 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" + +#include "av1/common/reconinter.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/av1_temporal_denoiser.h" + +// Compute the sum of all pixel differences of this MB. +static INLINE int sum_diff_16x1(__m128i acc_diff) { + const __m128i k_1 = _mm_set1_epi16(1); + const __m128i acc_diff_lo = + _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = + _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); + const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); + const __m128i hgfe_dcba = + _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = + _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); + return _mm_cvtsi128_si32(hgfedcba); +} + +// Denoise a 16x1 vector. +static INLINE __m128i av1_denoiser_16x1_sse2( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const __m128i *k_0, const __m128i *k_4, const __m128i *k_8, + const __m128i *k_16, const __m128i *l3, const __m128i *l32, + const __m128i *l21, __m128i acc_diff) { + // Calculate differences + const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); + __m128i v_running_avg_y; + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. 
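  // Note: pdiff and ndiff are unsigned saturating differences, so for each
  // byte at most one of them is nonzero and their OR below gives
  // |mc_running_avg_y - sig|. pdiff is zero exactly where mc_running_avg_y
  // <= sig, which is why a compare against zero is enough to build the 0xFF
  // sign mask without any signed arithmetic.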
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0); + // Clamp absolute difference to 16 to be used to get mask. Doing this + // allows us to use _mm_cmpgt_epi8, which operates on signed byte. + const __m128i clamped_absdiff = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16); + // Get masks for l2 l1 and l0 adjustments. + const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff); + // Get adjustments for l2, l1, and l0. + __m128i adj2 = _mm_and_si128(mask2, *l32); + const __m128i adj1 = _mm_and_si128(mask1, *l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + + // Combine the adjustments and get absolute adjustments. + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(*l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + // Restore the sign and get positive and negative adjustments. + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + // Calculate filtered value. + v_running_avg_y = _mm_adds_epu8(v_sig, padj); + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Adjustments <=7, and each element in acc_diff can fit in signed + // char. + acc_diff = _mm_adds_epi8(acc_diff, padj); + acc_diff = _mm_subs_epi8(acc_diff, nadj); + return acc_diff; +} + +// Denoise a 16x1 vector with a weaker filter. +static INLINE __m128i av1_denoiser_adj_16x1_sse2( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const __m128i k_0, const __m128i k_delta, __m128i acc_diff) { + __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); + // Calculate differences. + const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); + // Obtain the sign. FF if diff is negative. + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + // Clamp absolute difference to delta to get the adjustment. + const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + // Restore the sign and get positive and negative adjustments. + __m128i padj, nadj; + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + // Calculate filtered value. + v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj); + v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj); + _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); + + // Accumulate the adjustments. + acc_diff = _mm_subs_epi8(acc_diff, padj); + acc_diff = _mm_adds_epi8(acc_diff, nadj); + return acc_diff; +} + +// Denoise 8x8 and 8x16 blocks. +static int av1_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude, int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 
1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. + const __m128i l21 = _mm_set1_epi8(1); + const int b_height = block_size_high[bs] >> 1; + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + acc_diff = av1_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r], + running_buffer[r], &k_0, &k_4, &k_8, + &k_16, &l3, &l32, &l21, acc_diff); + memcpy(running_avg_y, running_buffer[r], width); + memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width); + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = sum_diff_16x1(acc_diff); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + acc_diff = av1_denoiser_adj_16x1_sse2( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0, + k_delta, acc_diff); + memcpy(running_avg_y, running_buffer[r], width); + memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, + width); + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = sum_diff_16x1(acc_diff); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +// Denoise 16x16 to 128x128 blocks. +static int av1_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + int sum_diff_thresh, r, c, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 
1 + : 0; + __m128i acc_diff[8][8]; + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + // Modify each level's adjustment according to motion_magnitude. + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); + // Difference between level 3 and level 2 is 2. + const __m128i l32 = _mm_set1_epi8(2); + // Difference between level 2 and level 1 is 1. + const __m128i l21 = _mm_set1_epi8(1); + const int b_width = block_size_wide[bs]; + const int b_height = block_size_high[bs]; + const int b_width_shift4 = b_width >> 4; + + for (r = 0; r < 8; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r] = _mm_setzero_si128(); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r >> 4] = av1_denoiser_16x1_sse2( + sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3, + &l32, &l21, acc_diff[c][r >> 4]); + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const __m128i k_delta = _mm_set1_epi8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + acc_diff[c][r >> 4] = + av1_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y, + k_0, k_delta, acc_diff[c][r >> 4]); + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]); + } + } + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int av1_denoiser_filter_sse2(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. 
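  // Only the sizes tested below are denoised: the sizes in the first test
  // take the _big path, which consumes whole 16-pixel rows per vector; 8x8
  // and 8x16 take the _small path, which packs two 8-pixel rows into one
  // 16-byte vector; any other size is returned as COPY_BLOCK unfiltered.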
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return av1_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride, + avg, avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return av1_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride, + avg, avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } else { + return COPY_BLOCK; + } +} diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h new file mode 100644 index 0000000000..7a0f32898b --- /dev/null +++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ +#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ + +#include +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse4.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct4_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fdct8_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fdct16_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int stride); +void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit, + const int instride, const int outstride); +void av1_fadst4_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fadst8_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_fadst16_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +void av1_idct4_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct8_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct16_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct32_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_idct64_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +void av1_iadst4_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_iadst8_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); +void av1_iadst16_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int 
col_num); + +static INLINE void transpose_32_4x4(int stride, const __m128i *input, + __m128i *output) { + __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); + __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); + __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]); + __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +// the entire input block can be represent by a grid of 4x4 blocks +// each 4x4 blocks can be represent by 4 vertical __m128i +// we first transpose each 4x4 block internally +// then transpose the grid +static INLINE void transpose_32(int txfm_size, const __m128i *input, + __m128i *output) { + const int num_per_128 = 4; + const int row_size = txfm_size; + const int col_size = txfm_size / num_per_128; + int r, c; + + // transpose each 4x4 block internally + for (r = 0; r < row_size; r += 4) { + for (c = 0; c < col_size; c++) { + transpose_32_4x4(col_size, &input[r * col_size + c], + &output[c * 4 * col_size + r / 4]); + } + } +} + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m128i ww0 = _mm_set1_epi32(w0); \ + const __m128i ww1 = _mm_set1_epi32(w1); \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = av1_round_shift_32_sse4_1(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = av1_round_shift_32_sse4_1(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \ + const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = _mm_add_epi32(out0, r); \ + out0 = _mm_srai_epi32(out0, bit); \ + const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \ + const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm_add_epi32(out1, r); \ + out1 = _mm_srai_epi32(out1, bit); \ + } while (0) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \ + } while (0) + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_ diff --git a/third_party/aom/av1/encoder/x86/cnn_avx2.c b/third_party/aom/av1/encoder/x86/cnn_avx2.c new file mode 100644 index 0000000000..ee93b3d5a0 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/cnn_avx2.c @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "av1/common/av1_common_int.h" +#include "av1/encoder/cnn.h" + +// This mask rearranges source pixels in the order shown below. +// shuffle_src_layer0[0][8]: applied on source pixels 0 to 7. +// shuffle_src_layer0[1][8]: applied on source pixels 7 to 14. +// This shuffling is needed to process 3 5x5 blocks which need +// source pixels in the following order. +// 1st 5x5 block: source pixels needed are 0 to 4, +// 2nd 5x5 block: source pixels needed are 4 to 8, +// 3rd 5x5 block: source pixels needed are 8 to 12. +// Source pixels are loaded like mentioned below. +// load_src0 : 0, 1, 2, 3, 4, 5, 6, 7 +// load_src1 : 7, 8, 9, 10, 11, 12, 13, 14 +// After applying masks, source bytes will be in the order: +// load_src0 : 0, 1, 2, 3, 4, 4, 5, 6 +// consists 5 pixels needed for 1st 5x5 block and +// first 3 pixels needed for 2nd 5x5 block. +// load_src1 : 7, 8, 8, 9, 10, 11, 12, x +// consists last 2 pixels needed for 2nd 5x5 block and +// 5 pixels needed for 3rd 5x5 block. +DECLARE_ALIGNED(32, static const uint32_t, + shuffle_src_layer0[2][8]) = { { 0, 1, 2, 3, 4, 4, 5, 6 }, + { 0, 1, 1, 2, 3, 4, 5, 0 } }; + +// This mask rearrange the weights to match shuffled source pixels order. +DECLARE_ALIGNED(32, static const uint32_t, + shuffle_weight_layer0[2][8]) = { { 0, 1, 2, 3, 4, 0, 1, 2 }, + { 3, 4, 0, 1, 2, 3, 4, 0 } }; + +// Shuffle mask used to rearrange weights corresponding to layer 1 and layer 2. +// For layer 1 and layer 2, convolution happens at 2x2 as filter_width and +// filter_height are equal to 2. So rearranging the weights in the +// order shown below to match source pixels. Basically this mask replicates +// the weights across the width of 2. +DECLARE_ALIGNED(32, static const uint32_t, + shuffle_weight_layer_1_and_2[2][8]) = { + { 0, 1, 0, 1, 0, 1, 0, 1 }, { 2, 3, 2, 3, 2, 3, 2, 3 } +}; + +// After the stages of multiplication and accumulation, the output values +// in the register will be jumbled. In order to store register into +// output buffer in a proper way, the following mask is applied on output +// register. +DECLARE_ALIGNED(32, static const uint32_t, + shuffle_output_layer_1_and_2[8]) = { 0, 1, 4, 5, 2, 3, 6, 7 }; + +// Load weights needed for layer 0 (for 5x5 block processing), +// and fill the registers appropriately to match source pixel mapping. 
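// The gather below is equivalent to
//   weight[row][col] = layer_config_weights[off + (row * 5 + col) * cstep];
// i.e. the 25 taps of one (in-channel, out-channel) pair sit cstep elements
// apart in layer_config->weights. Each 5-tap row is then loaded and
// duplicated into two permuted registers so that it lines up with the
// shuffled source pixels of the three 5x5 blocks handled per iteration.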
+static INLINE void prepare_weights_for_5x5_convolve( + const float *layer_config_weights, int off, float weight[5][8], + const int cstep, __m256 *shuffle_weight, const __m256i weight_mask_0, + const __m256i weight_mask_1) { + for (int row = 0; row < 5; ++row) { + for (int col = 0; col < 5; ++col) { + weight[row][col] = layer_config_weights[off]; + off += cstep; + } + } + shuffle_weight[0] = _mm256_loadu_ps(weight[0]); + shuffle_weight[1] = _mm256_loadu_ps(weight[1]); + shuffle_weight[2] = _mm256_loadu_ps(weight[2]); + shuffle_weight[3] = _mm256_loadu_ps(weight[3]); + shuffle_weight[4] = _mm256_loadu_ps(weight[4]); + + shuffle_weight[0] = + _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_0); + shuffle_weight[1] = + _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_0); + shuffle_weight[2] = + _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_0); + shuffle_weight[3] = + _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_0); + shuffle_weight[4] = + _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_0); + shuffle_weight[5] = + _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_1); + shuffle_weight[6] = + _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_1); + shuffle_weight[7] = + _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_1); + shuffle_weight[8] = + _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_1); + shuffle_weight[9] = + _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_1); +} + +// For each row, loads source pixels 0 to 7(load_src_0), 7 to 14(load_src_1) and +// arranges them appropriately to process 3 blocks. +#define PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS() \ + do { \ + for (int row = 0; row < 5; row++) { \ + load_src_0 = _mm256_loadu_ps(input_ptr); \ + load_src_1 = _mm256_loadu_ps(input_ptr + 7); \ + load_src_0 = _mm256_permutevar8x32_ps(load_src_0, block0_1); \ + load_src_1 = _mm256_permutevar8x32_ps(load_src_1, block1_2); \ + load_src_0 = _mm256_mul_ps(load_src_0, shuffle_weight[0 + row]); \ + load_src_1 = _mm256_mul_ps(load_src_1, shuffle_weight[5 + row]); \ + accum_src_0 = _mm256_add_ps(load_src_0, accum_src_0); \ + accum_src_1 = _mm256_add_ps(load_src_1, accum_src_1); \ + input_ptr += in_stride; \ + } \ + } while (0) + +// Load masks needed for shuffling of output and weights. +static INLINE void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask, + __m256i *weight_mask) { + // Load shuffle buffer needed to sort the output. + *output_mask = + _mm256_load_si256((const __m256i *)shuffle_output_layer_1_and_2); + + // Load shuffle buffers needed for weight. + weight_mask[0] = + _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[0]); + weight_mask[1] = + _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[1]); +} + +// Load weights needed for layer 1 and 2 (for 2x2 block processing), +// and fill the registers appropriately to match source pixel mapping. +static INLINE void prepare_weights_for_2x2_convolve( + const float *layer_config_weights, int off, const int cstep, + __m256 *shuffle_weight, __m256i *weight_mask) { + // Weights needed for 2x2 block. + float weight[4] = { 0 }; + for (int i = 0; i < 4; ++i) { + weight[i] = layer_config_weights[off]; + off += cstep; + } + + const __m256 weight_vec = _mm256_castps128_ps256(_mm_loadu_ps(weight)); + shuffle_weight[0] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[0]); + shuffle_weight[1] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[1]); +} + +// Do convolution of one 5x5 block. 
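// Only the first four columns of each row fit in the 128-bit loads used
// here; the fifth column is accumulated separately into last_column_sum,
// which the invoking loop declares and adds back when the single-block
// output is written.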
+#define PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(w, accum0, in_stride) \ + do { \ + __m128 load_src[5]; \ + load_src[0] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[0][4]; \ + input_ptr += in_stride; \ + load_src[1] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[1][4]; \ + input_ptr += in_stride; \ + load_src[2] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[2][4]; \ + input_ptr += in_stride; \ + load_src[3] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[3][4]; \ + input_ptr += in_stride; \ + load_src[4] = _mm_loadu_ps(input_ptr); \ + last_column_sum += input_ptr[4] * weight[4][4]; \ + \ + load_src[0] = _mm_mul_ps(load_src[0], _mm256_castps256_ps128(w[0])); \ + load_src[1] = _mm_mul_ps(load_src[1], _mm256_castps256_ps128(w[1])); \ + load_src[2] = _mm_mul_ps(load_src[2], _mm256_castps256_ps128(w[2])); \ + load_src[3] = _mm_mul_ps(load_src[3], _mm256_castps256_ps128(w[3])); \ + load_src[4] = _mm_mul_ps(load_src[4], _mm256_castps256_ps128(w[4])); \ + \ + accum0 = _mm_add_ps(load_src[0], accum0); \ + load_src[1] = _mm_add_ps(load_src[1], load_src[2]); \ + load_src[3] = _mm_add_ps(load_src[3], load_src[4]); \ + load_src[1] = _mm_add_ps(load_src[1], load_src[3]); \ + accum0 = _mm_add_ps(accum0, load_src[1]); \ + } while (0) + +// Do convolution on 8 horizontal 2x2 blocks. +static INLINE void perform_convolve_for_8h_2x2_blocks( + const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum, + __m256i shuffle_output_mask) { + __m256 load_src[4]; + // Load input into source registers. + load_src[0] = _mm256_loadu_ps(input_ptr); + load_src[1] = _mm256_loadu_ps(input_ptr + 8); + load_src[2] = _mm256_loadu_ps(input_ptr + in_stride); + load_src[3] = _mm256_loadu_ps(input_ptr + in_stride + 8); + + // Multiply the loaded input with corresponding weights. + load_src[0] = _mm256_mul_ps(load_src[0], weight[0]); + load_src[1] = _mm256_mul_ps(load_src[1], weight[0]); + load_src[2] = _mm256_mul_ps(load_src[2], weight[1]); + load_src[3] = _mm256_mul_ps(load_src[3], weight[1]); + + // Accumulate across 2x2 blocks. + load_src[0] = _mm256_add_ps(load_src[0], load_src[2]); + load_src[1] = _mm256_add_ps(load_src[1], load_src[3]); + load_src[0] = _mm256_hadd_ps(load_src[0], load_src[1]); + + // Sort the output in order to store into output buffer. + load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask); + *out_accum = _mm256_add_ps(*out_accum, load_src[0]); +} + +// Do convolution on 8 (4 horizontal x 2 vertical) 2x2 blocks. +static INLINE void perform_convolve_for_4hx2v_2x2_blocks( + const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum, + __m256i shuffle_output_mask) { + __m256 load_src[4]; + // Load input into source registers. + load_src[0] = _mm256_loadu_ps(input_ptr); + load_src[1] = _mm256_loadu_ps(input_ptr + in_stride); + load_src[2] = _mm256_loadu_ps(input_ptr + (in_stride * 2)); + load_src[3] = _mm256_loadu_ps(input_ptr + (in_stride * 3)); + + // Multiply the loaded input with corresponding weights. + load_src[0] = _mm256_mul_ps(load_src[0], weight[0]); + load_src[1] = _mm256_mul_ps(load_src[1], weight[1]); + load_src[2] = _mm256_mul_ps(load_src[2], weight[0]); + load_src[3] = _mm256_mul_ps(load_src[3], weight[1]); + + // Accumulate across 2x2 blocks. 
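  // The first two rows feed the four upper 2x2 outputs and the last two rows
  // the four lower ones. _mm256_hadd_ps sums adjacent horizontal pairs within
  // each 128-bit lane, so the permute with shuffle_output_mask below is what
  // puts the eight sums back into raster order before they are accumulated
  // into *out_accum.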
+ load_src[0] = _mm256_add_ps(load_src[0], load_src[1]); + load_src[2] = _mm256_add_ps(load_src[2], load_src[3]); + load_src[0] = _mm256_hadd_ps(load_src[0], load_src[2]); + + // Sort the output in order to store into output buffer. + load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask); + *out_accum = _mm256_add_ps(*out_accum, load_src[0]); +} + +// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when +// filter_width and filter_height are equal to 5. +// CNN convolve parsing is based on av1_intra_mode_cnn_partition_cnn_config. +// Based on the configuration set for each layer, the current encoder +// always chooses the case of no_maxpool_padding_valid. +// And also for layer 0 convolution happens at 5x5 level as the +// filter_width and filter_height are set as 5. +static void cnn_convolve_no_maxpool_padding_valid_5x5_avx2( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int channel_step) { + const int kFilterWidth = 5; + const int kFilterHeight = 5; + const int kSkipWidth = 4; + const int kSkipHeight = 4; + assert(layer_config->filter_width == kFilterWidth && + layer_config->filter_height == kFilterHeight); + assert(layer_config->skip_width == kSkipWidth && + layer_config->skip_height == kSkipHeight); + + // Load shuffle buffers needed for source. + const __m256i block0_1 = + _mm256_load_si256((const __m256i *)shuffle_src_layer0[0]); + const __m256i block1_2 = + _mm256_load_si256((const __m256i *)shuffle_src_layer0[1]); + + // Load shuffle buffers needed for weight. + const __m256i weight_mask_0 = + _mm256_load_si256((const __m256i *)shuffle_weight_layer0[0]); + const __m256i weight_mask_1 = + _mm256_load_si256((const __m256i *)shuffle_weight_layer0[1]); + + // Width needs to be moved to go to next iteration of processing 3 5x5 blocks. + const int kSkipWidthForNextIter = kSkipWidth * 3; + + // Minimum width required to process 3 5x5 blocks at a time. + // min width (for processing 3 5x5 block) = 2*skip_width + filter_width + // Here, skip_width specifies how much width we should move while processing + // next block convolution and filter_width specifies for how many pixels + // filter needs to be applied. + const int kMinWidthFor3_5x5Blocks = (kSkipWidth * 2) + kFilterWidth; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + const float out_ch_bias = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + __m256 shuffle_weight[10]; + + // Weights needed are 5x5, for SIMD purpose made this array as 5x8. + float weight[5][8] = { { 0 } }; + int off = k * layer_config->out_channels + i; + + // In layer 0, the convolution process happens at 5x5. + // The weights needed for 5x5 block are same across the in-channels, + // which is why the load of weights happens once for each in-channel. + prepare_weights_for_5x5_convolve(layer_config->weights, off, weight, + cstep, shuffle_weight, weight_mask_0, + weight_mask_1); + + for (int h = 0, u = 0; h < in_height - kFilterHeight + 1; + h += kSkipHeight, ++u) { + const int out_h = u * out_stride; + int v = 0; + int w = 0; + int rem_width = in_width; + // Processing 3 5x5 blocks at a time, if sufficient width is present. 
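        // Each pass of this loop emits three outputs: with skip_width 4 and
        // filter_width 5 the blocks start at w, w + 4 and w + 8 and together
        // span 13 input columns (kMinWidthFor3_5x5Blocks).
        // PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS fetches 15 source floats per row
        // with two overlapping 8-wide loads (offsets 0 and 7) and reshuffles
        // them so each vector holds the taps of adjacent blocks.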
+ while (rem_width >= kMinWidthFor3_5x5Blocks) { + __m256 load_src_0, load_src_1; + __m256 accum_src_0 = _mm256_setzero_ps(); + __m256 accum_src_1 = _mm256_setzero_ps(); + const float *input_ptr = &input[k][h * in_stride + w]; + PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS(); + + // Accumulate across column. + __m256 accum = _mm256_hadd_ps(accum_src_0, accum_src_1); + __m128 tmp_reg_0 = _mm256_extractf128_ps(accum_src_0, 1); + __m128 tmp_reg_1 = _mm256_extractf128_ps(accum_src_1, 1); + + __m128 accum_l = _mm256_castps256_ps128(accum); + __m128 accum_h = _mm256_extractf128_ps(accum, 1); + + __m128 tmp_reg_2 = _mm_add_ps(accum_l, tmp_reg_0); + __m128 tmp_reg_3 = _mm_add_ps(tmp_reg_0, accum_h); + __m128 tmp_reg_4 = _mm_add_ps(tmp_reg_1, accum_h); + + // 1st 5x5 block output. + output[i][out_h + v] = + out_ch_bias + _mm_cvtss_f32(tmp_reg_2) + + _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 1)); + + // 2nd 5x5 block output. + output[i][out_h + v + 1] = + out_ch_bias + + _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_3, tmp_reg_3, 1)) + + _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 2)); + + // 3rd 5x5 block output. + output[i][out_h + v + 2] = + out_ch_bias + + _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_4, tmp_reg_4, 2)) + + _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 3)); + + v += 3; + w += kSkipWidthForNextIter; + rem_width -= kSkipWidthForNextIter; + } + + // Process remaining blocks as single 5x5 block at a time. + while (rem_width >= kFilterWidth) { + float last_column_sum = 0; + __m128 accum = _mm_setzero_ps(); + const float *input_ptr = &input[k][h * in_stride + w]; + PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(shuffle_weight, accum, in_stride); + + // Accumulate across column. + accum = _mm_hadd_ps(accum, accum); + output[i][out_h + v] = out_ch_bias + last_column_sum + + _mm_cvtss_f32(accum) + + _mm_cvtss_f32(_mm_shuffle_ps(accum, accum, 1)); + + v += 1; + w += kSkipWidth; + rem_width -= kSkipWidth; + } + } + } + } +} + +// AVX2 implementation for layer 1. +static INLINE void cnn_convolve_no_maxpool_padding_valid_layer1_avx2( + const float **input, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int channel_step) { + __m256i weight_mask[2]; + __m256i shuffle_output_mask; + load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask); + + const int kInHeight = 16; + const int kFilterHeight = 2; + const int kSkipHeight = 2; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]); + // out_accum registers are used to store the 2x2 convolve outputs + // (calculated over input block size), which are accumulated across the + // in_channels. As per the design, each iteration of for loop processes 8 + // (horizontal) 2x2 blocks and stores in corresponding out_accum register + // (as input size is 16x16, a total of 64 2x2 blocks are present and 8 + // out_accum registers are enough to store the outputs). + // Hence for loops corresponding to 'j' and 'h', below, run over the number + // of out_accum registers. + __m256 out_accum[8]; + for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg; + for (int k = 0; k < layer_config->in_channels; ++k) { + __m256 shuffle_weight[2]; + int off = k * layer_config->out_channels + i; + // In layer 1, the convolution process happens at 2x2. + // The weights needed for 2x2 block are same across the in-channels, + // which is why the load of weights happens once for each in-channel. 
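      // off = k * out_channels + i addresses the first tap of the
      // (in-channel k, out-channel i) kernel; the remaining three taps sit
      // cstep elements apart in layer_config->weights, which is the layout
      // prepare_weights_for_2x2_convolve reads.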
+ prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep, + shuffle_weight, weight_mask); + + for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1; + h += kSkipHeight, ++u) { + const float *input_ptr = &input[k][h * in_stride]; + perform_convolve_for_8h_2x2_blocks(input_ptr, in_stride, shuffle_weight, + &out_accum[u], shuffle_output_mask); + } + } + // Store output of layer 1. + for (int j = 0; j < 8; ++j) { + _mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]); + } + } +} + +// AVX2 implementation for layer 2. +static INLINE void cnn_convolve_no_maxpool_padding_valid_layer2_avx2( + const float **input, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int channel_step) { + __m256i weight_mask[2]; + __m256i shuffle_output_mask; + load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask); + + const int kInHeight = 8; + const int kFilterHeight = 2; + const int kSkipHeight = 2; + for (int i = start_idx; i < layer_config->out_channels; i += channel_step) { + __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]); + // out_accum registers are used to store the 2x2 convolve outputs + // (calculated over input block size), which are accumulated across the + // in_channels. As per the design, each iteration of for loop processes 8 + // (4 horizontal x 2 vertical) 2x2 blocks and stores in corresponding + // out_accum register (as input size is 8x8, a total of 16 2x2 blocks are + // present and 2 out_accum registers are enough to store the outputs). + // Hence for loops corresponding to 'j' and 'h', below, run over the number + // of out_accum registers. + __m256 out_accum[2]; + + // Height needs to be moved to go to next iteration of processing + // while processing 2 2x2 blocks vertically. + const int kSkipHeightForNextIter = kSkipHeight * 2; + for (int j = 0; j < 2; ++j) out_accum[j] = bias_reg; + for (int k = 0; k < layer_config->in_channels; ++k) { + __m256 shuffle_weight[2]; + int off = k * layer_config->out_channels + i; + // In layer 2, the convolution process happens at 2x2. + // The weights needed for 2x2 block are same across the in-channels, + // which is why the load of weights happens once for each in-channel. + prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep, + shuffle_weight, weight_mask); + + for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1; + h += kSkipHeightForNextIter, ++u) { + const float *input_ptr = &input[k][h * in_stride]; + perform_convolve_for_4hx2v_2x2_blocks(input_ptr, in_stride, + shuffle_weight, &out_accum[u], + shuffle_output_mask); + } + } + // Store output of layer 2. + for (int j = 0; j < 2; ++j) { + _mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]); + } + } +} + +// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(), when +// filter_width and filter_height are equal to 2. +// As per the layer config set by av1_intra_mode_cnn_partition_cnn_config, +// the filter_width and filter_height are equal to 2 for layer >= 1. So +// convolution happens at 2x2 for layer >= 1. 
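// The 16x16 and 8x8 input cases handled below correspond to layers 1 and 2
// of that config; the 4x4 and 2x2 inputs of layers 3 and 4 fall back to the
// C implementation, where a row holds too little data for the 256-bit paths
// to pay off.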
+void cnn_convolve_no_maxpool_padding_valid_2x2_avx2( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride, + int start_idx, const int cstep, const int channel_step) { + assert(layer_config->filter_width == 2 && layer_config->filter_height == 2); + assert(layer_config->skip_width == 2 && layer_config->skip_height == 2); + + if (in_width == 16 && in_height == 16) { + // This case of in_width and in_height equal to 16 corresponds to layer 1. + // The output size of this layer is 8x8. + cnn_convolve_no_maxpool_padding_valid_layer1_avx2( + input, in_stride, layer_config, output, out_stride, start_idx, cstep, + channel_step); + } else if (in_width == 8 && in_height == 8) { + // This case of in_width and in_height equal to 8 corresponds to layer 2. + // The output size of this layer is 4x4. + cnn_convolve_no_maxpool_padding_valid_layer2_avx2( + input, in_stride, layer_config, output, out_stride, start_idx, cstep, + channel_step); + } else { + // For layer equal to 3 and 4, the input is of size 4x4 and 2x2 + // respectively. Implementing SIMD for these cases might not be optimal, + // which is why we call C path for layer >= 3. + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } +} + +// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c(). +// As per the current encoder, av1_cnn_convolve function gets called for +// block size equal to 64x64. av1_cnn_convolve() uses layer config values +// set by av1_intra_mode_cnn_partition_cnn_config. The following are a few +// details related to each layer's config parameters. +// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht +// 0 64x64 16x16 5 5 4 4 +// 1 16x16 8x8 2 2 2 2 +// 2 8x8 4x4 2 2 2 2 +// 3 4x4 2x2 2 2 2 2 +// 4 2x2 1x1 2 2 2 2 +// Here, +// filter_wd = filter_width and filter_ht = filter_height, +// skip_wd = skip_width and skip_ht = skip_height. +void av1_cnn_convolve_no_maxpool_padding_valid_avx2( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, + int start_idx, int cstep, int channel_step) { + if (layer_config->filter_width == 5 && layer_config->filter_height == 5 && + layer_config->skip_width == 4 && layer_config->skip_height == 4) { + cnn_convolve_no_maxpool_padding_valid_5x5_avx2( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } else if (layer_config->filter_width == 2 && + layer_config->filter_height == 2 && + layer_config->skip_width == 2 && layer_config->skip_height == 2) { + cnn_convolve_no_maxpool_padding_valid_2x2_avx2( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } else { + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } +} diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm new file mode 100644 index 0000000000..b185548184 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm @@ -0,0 +1,82 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + + ; sign extension + mova m2, m0 + mova m3, m1 + punpcklwd m0, m0 + punpcklwd m1, m1 + punpckhwd m2, m2 + punpckhwd m3, m3 + psrad m0, 16 + psrad m1, 16 + psrad m2, 16 + psrad m3, 16 + mova [outputq], m0 + mova [outputq + 16], m2 + mova [outputq + 32], m1 + mova [outputq + 48], m3 + + RET diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c new file mode 100644 index 0000000000..9627f75930 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include // SSE2 +#include /* SSE4.1 */ +#include /* AVX2 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/txb_common.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = height + TX_PAD_HOR; + const __m256i y_zeros = _mm256_setzero_si256(); + + const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); + uint8_t *bottom_buf_end = levels + (width + TX_PAD_BOTTOM) * stride; + uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31)); + + do { + yy_storeu_256(bottom_buf, y_zeros); + bottom_buf += 32; + } while (bottom_buf < bottom_buf_end); + + int i = 0; + uint8_t *ls = levels; + const tran_low_t *cf = coeff; + if (height == 4) { + do { + const __m256i c0 = yy_loadu_256(cf); + const __m256i c1 = yy_loadu_256(cf + 8); + const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1)); + const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros); + const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8); + const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8); + yy_storeu_256(ls, res); + ls += 32; + cf += 16; + i += 4; + } while (i < width); + } else if (height == 8) { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + const __m128i res0 = _mm256_castsi256_si128(res); + const __m128i res1 = _mm256_extracti128_si256(res, 1); + xx_storel_64(ls, res0); + *(int32_t *)(ls + height) = 0; + xx_storel_64(ls + stride, _mm_srli_si128(res0, 8)); + *(int32_t *)(ls + height + stride) = 0; + xx_storel_64(ls + stride * 2, res1); + *(int32_t *)(ls + height + stride * 2) = 0; + xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8)); + *(int32_t *)(ls + height + stride * 3) = 0; + cf += 32; + ls += stride << 2; + i += 4; + } while (i < width); + } else if (height == 16) { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + xx_storeu_128(ls, _mm256_castsi256_si128(res)); + xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1)); + cf += 32; + *(int32_t *)(ls + height) = 0; + *(int32_t *)(ls + stride + height) = 0; + ls += stride << 1; + i += 2; + } while (i < width); + } else { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const 
__m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + yy_storeu_256(ls, res); + cf += 32; + *(int32_t *)(ls + height) = 0; + ls += stride; + i += 1; + } while (i < width); + } +} diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c new file mode 100644 index 0000000000..d23a688747 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c @@ -0,0 +1,505 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/txb_common.h" + +static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride); + level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride); + level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride); + level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride); + level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride); +} + +static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride); + level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride); + level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride); + level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride); + level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride); +} + +static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src, + const int stride, + const ptrdiff_t *const offsets, + __m128i *const level) { + level[0] = _mm_loadu_si128((__m128i *)(src + 1)); + level[1] = _mm_loadu_si128((__m128i *)(src + stride)); + level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0])); + level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1])); + level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2])); +} + +static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) { + const __m128i const_3 = _mm_set1_epi8(3); + const __m128i const_4 = _mm_set1_epi8(4); + __m128i count; + + count = _mm_min_epu8(level[0], const_3); + level[1] = _mm_min_epu8(level[1], const_3); + level[2] = _mm_min_epu8(level[2], const_3); + level[3] = _mm_min_epu8(level[3], const_3); + level[4] = _mm_min_epu8(level[4], const_3); + count = _mm_add_epi8(count, level[1]); + count = _mm_add_epi8(count, level[2]); + count = _mm_add_epi8(count, level[3]); + count = _mm_add_epi8(count, level[4]); + count = _mm_avg_epu8(count, 
_mm_setzero_si128()); + count = _mm_min_epu8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(21); + __m128i pos_to_offset = + (width == 4) + ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21) + : _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, + 21, 21); + __m128i count; + __m128i level[5]; + int8_t *cc = coeff_contexts; + int col = width; + + assert(!(width % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + col -= 4; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int col = width; + + assert(!(width % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int col = width; + + assert(!(width % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + col -= 4; + } while (col); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + int8_t *cc = coeff_contexts; + int col = width; + __m128i count; + __m128i level[5]; + __m128i pos_to_offset[3]; + + assert(!(width % 2)); + + if (width == 8) { + 
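+ // Per-position base context offsets, added to the clamped neighbour counts
+ // computed by get_coeff_contexts_kernel_sse2; positions past the first few
+ // rows and columns all share the common offset 21. This branch handles the
+ // square case (width == 8); the branches below handle taller-than-wide and
+ // wider-than-tall blocks.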
pos_to_offset[0] = + _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } else if (width < 8) { + pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, + 21, 21, 21, 21, 21); + } else { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } + pos_to_offset[2] = _mm_set1_epi8(21); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + col -= 2; + } while (col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int col = width; + __m128i count; + __m128i level[5]; + + assert(!(width % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, + const int width, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5); + int col = width; + __m128i count; + __m128i level[5]; + + assert(!(width % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 2 * stride; + coeff_contexts += 16; + col -= 2; + } while (col); +} + +static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, + const int real_width, + const int real_height, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + int8_t *cc = 
coeff_contexts; + int col = width; + __m128i pos_to_offset[5]; + __m128i pos_to_offset_large[3]; + __m128i count; + __m128i level[5]; + + assert(!(height % 16)); + + pos_to_offset_large[2] = _mm_set1_epi8(21); + if (real_width == real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = + pos_to_offset_large[2]; + } else if (real_width < real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8( + 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); + pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; + } else { // real_width > real_height + pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8( + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[2]; + pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(16); + } + + do { + int h = height; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + levels += 16; + cc += 16; + h -= 16; + pos_to_offset[0] = pos_to_offset_large[0]; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + pos_to_offset[2] = pos_to_offset[3]; + pos_to_offset[3] = pos_to_offset[4]; + pos_to_offset_large[0] = pos_to_offset_large[1]; + pos_to_offset_large[1] = pos_to_offset_large[2]; + levels += TX_PAD_HOR; + } while (--col); + + coeff_contexts[0] = 0; +} + +static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + const __m128i pos_to_offset_large = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int col = width; + + assert(!(height % 16)); + + do { + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 
10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int h = height; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 16; + coeff_contexts += 16; + h -= 16; + } while (h); + + levels += TX_PAD_HOR; + } while (--col); +} + +static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = height + TX_PAD_HOR; + __m128i pos_to_offset[3]; + __m128i count; + __m128i level[5]; + int col = width; + + assert(!(height % 16)); + + pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0); + pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5); + pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + + do { + int h = height; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 16; + coeff_contexts += 16; + h -= 16; + } while (h); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += TX_PAD_HOR; + } while (--col); +} + +// Note: levels[] must be in the range [0, 127], inclusive. +void av1_get_nz_map_contexts_sse2(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int last_idx = eob - 1; + if (!last_idx) { + coeff_contexts[0] = 0; + return; + } + + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int stride = height + TX_PAD_HOR; + ptrdiff_t offsets[3]; + + /* coeff_contexts must be 16 byte aligned. 
*/ + assert(!((intptr_t)coeff_contexts & 0xf)); + + if (tx_class == TX_CLASS_2D) { + offsets[0] = 0 * stride + 2; + offsets[1] = 1 * stride + 1; + offsets[2] = 2 * stride + 0; + + if (height == 4) { + get_4_nz_map_contexts_2d(levels, width, offsets, coeff_contexts); + } else if (height == 8) { + get_8_coeff_contexts_2d(levels, width, offsets, coeff_contexts); + } else if (height == 16) { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } + } else if (tx_class == TX_CLASS_HORIZ) { + offsets[0] = 2 * stride; + offsets[1] = 3 * stride; + offsets[2] = 4 * stride; + if (height == 4) { + get_4_nz_map_contexts_hor(levels, width, offsets, coeff_contexts); + } else if (height == 8) { + get_8_coeff_contexts_hor(levels, width, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_hor(levels, width, height, offsets, + coeff_contexts); + } + } else { // TX_CLASS_VERT + offsets[0] = 2; + offsets[1] = 3; + offsets[2] = 4; + if (height == 4) { + get_4_nz_map_contexts_ver(levels, width, offsets, coeff_contexts); + } else if (height == 8) { + get_8_coeff_contexts_ver(levels, width, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_ver(levels, width, height, offsets, + coeff_contexts); + } + } + + const int bhl = get_txb_bhl(tx_size); + const int pos = scan[last_idx]; + if (last_idx <= (width << bhl) / 8) + coeff_contexts[pos] = 1; + else if (last_idx <= (width << bhl) / 4) + coeff_contexts[pos] = 2; + else + coeff_contexts[pos] = 3; +} diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c new file mode 100644 index 0000000000..72bd8e3411 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include // SSE2 +#include /* SSE4.1 */ + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/txb_common.h" +#include "aom_dsp/x86/synonyms.h" + +void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = height + TX_PAD_HOR; + const __m128i zeros = _mm_setzero_si128(); + + const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); + uint8_t *bottom_buf = levels + stride * width; + uint8_t *bottom_buf_end = bottom_buf + bottom_len; + do { + _mm_storeu_si128((__m128i *)(bottom_buf), zeros); + bottom_buf += 16; + } while (bottom_buf < bottom_buf_end); + + int i = 0; + uint8_t *ls = levels; + const tran_low_t *cf = coeff; + if (height == 4) { + do { + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); + const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); + const __m128i absAB = _mm_abs_epi16(coeffAB); + const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); + const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros); + xx_storeu_128(ls, lsAB); + ls += (stride << 1); + cf += (height << 1); + i += 2; + } while (i < width); + } else if (height == 8) { + do { + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); + const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); + const __m128i absAB = _mm_abs_epi16(coeffAB); + const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); + xx_storeu_128(ls, absAB8); + ls += stride; + cf += height; + i += 1; + } while (i < width); + } else { + do { + int j = 0; + do { + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); + const __m128i coeffC = xx_loadu_128(cf + 8); + const __m128i coeffD = xx_loadu_128(cf + 12); + const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); + const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD); + const __m128i absAB = _mm_abs_epi16(coeffAB); + const __m128i absCD = _mm_abs_epi16(coeffCD); + const __m128i absABCD = _mm_packs_epi16(absAB, absCD); + xx_storeu_128(ls + j, absABCD); + j += 16; + cf += 16; + } while (j < height); + *(int32_t *)(ls + height) = 0; + ls += stride; + i += 1; + } while (i < width); + } +} diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c new file mode 100644 index 0000000000..57725d1795 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include // AVX2 + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" + +static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, + __m256i *c) { + const tran_low_t *addr = coeff + offset; + + if (sizeof(tran_low_t) == 4) { + const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1); + const __m256i y = _mm256_packs_epi32(x0, x1); + *c = _mm256_permute4x64_epi64(y, 0xD8); + } else { + *c = _mm256_loadu_si256((const __m256i *)addr); + } +} + +static INLINE void av1_block_error_num_coeff16_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + __m256i *sse_256) { + const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff); + // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + // r0 r1 r2 r3 r4 r5 r6 r7 + const __m256i error = _mm256_madd_epi16(diff, diff); + // r0+r1 r2+r3 | r0+r1 r2+r3 | r4+r5 r6+r7 | r4+r5 r6+r7 + const __m256i error_hi = _mm256_hadd_epi32(error, error); + // r0+r1 | r2+r3 | r4+r5 | r6+r7 + *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256()); +} + +static INLINE void av1_block_error_num_coeff32_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + __m256i *sse_256) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff); + const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16)); + const __m256i _dqcoeff_1 = + _mm256_loadu_si256((const __m256i *)(dqcoeff + 16)); + + // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 + const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0); + const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1); + + // r0 r1 r2 r3 r4 r5 r6 r7 + const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0); + const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1); + const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1); + + // For extreme input values, the accumulation needs to happen in 64 bit + // precision to avoid any overflow. + const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero); + const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero); + const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo); + *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0); +} + +static INLINE void av1_block_error_num_coeff64_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + __m256i *sse_256, + intptr_t num_coeff) { + const __m256i zero = _mm256_setzero_si256(); + for (int i = 0; i < num_coeff; i += 64) { + // Load 64 elements for coeff and dqcoeff. 
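+ // Each iteration consumes 64 int16 values from coeff and dqcoeff (four
+ // 256-bit loads per buffer). Squared differences are paired into 32-bit
+ // sums by _mm256_madd_epi16 and then widened to 64-bit lanes before being
+ // accumulated into *sse_256.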
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff); + const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16)); + const __m256i _dqcoeff_1 = + _mm256_loadu_si256((const __m256i *)(dqcoeff + 16)); + const __m256i _coeff_2 = _mm256_loadu_si256((const __m256i *)(coeff + 32)); + const __m256i _dqcoeff_2 = + _mm256_loadu_si256((const __m256i *)(dqcoeff + 32)); + const __m256i _coeff_3 = _mm256_loadu_si256((const __m256i *)(coeff + 48)); + const __m256i _dqcoeff_3 = + _mm256_loadu_si256((const __m256i *)(dqcoeff + 48)); + + // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 + const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0); + const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1); + const __m256i diff_2 = _mm256_sub_epi16(_dqcoeff_2, _coeff_2); + const __m256i diff_3 = _mm256_sub_epi16(_dqcoeff_3, _coeff_3); + + // r0 r1 r2 r3 r4 r5 r6 r7 + const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0); + const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1); + const __m256i error_2 = _mm256_madd_epi16(diff_2, diff_2); + const __m256i error_3 = _mm256_madd_epi16(diff_3, diff_3); + // r00 r01 r02 r03 r04 r05 r06 r07 + const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1); + // r10 r11 r12 r13 r14 r15 r16 r17 + const __m256i err_final_1 = _mm256_add_epi32(error_2, error_3); + + // For extreme input values, the accumulation needs to happen in 64 bit + // precision to avoid any overflow. r00 r01 r04 r05 + const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero); + // r02 r03 r06 r07 + const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero); + // r10 r11 r14 r15 + const __m256i exp1_error_lo = _mm256_unpacklo_epi32(err_final_1, zero); + // r12 r13 r16 r17 + const __m256i exp1_error_hi = _mm256_unpackhi_epi32(err_final_1, zero); + + const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo); + const __m256i sum_temp_1 = _mm256_add_epi64(exp1_error_hi, exp1_error_lo); + const __m256i sse_256_temp = _mm256_add_epi64(sum_temp_1, sum_temp_0); + *sse_256 = _mm256_add_epi64(*sse_256, sse_256_temp); + coeff += 64; + dqcoeff += 64; + } +} + +int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t num_coeff) { + assert(num_coeff % 16 == 0); + __m256i sse_256 = _mm256_setzero_si256(); + int64_t sse; + + if (num_coeff == 16) + av1_block_error_num_coeff16_avx2(coeff, dqcoeff, &sse_256); + else if (num_coeff == 32) + av1_block_error_num_coeff32_avx2(coeff, dqcoeff, &sse_256); + else + av1_block_error_num_coeff64_avx2(coeff, dqcoeff, &sse_256, num_coeff); + + // Save the higher 64 bit of each 128 bit lane. + const __m256i sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + // Accumulate the sse_256 register to get final sse + const __m128i sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. 
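+ // After folding the two 64-bit halves of each 128-bit lane and adding the
+ // two lanes together, the low 64 bits of sse_128 hold the total sum of
+ // squared errors.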
+ _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} + +int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_reg_64hi, ssz_reg_64hi; + __m128i sse_reg128, ssz_reg128; + int64_t sse; + int i; + const __m256i zero_reg = _mm256_setzero_si256(); + + // init sse and ssz registerd to zero + sse_reg = _mm256_setzero_si256(); + ssz_reg = _mm256_setzero_si256(); + + for (i = 0; i < block_size; i += 16) { + // load 32 bytes from coeff and dqcoeff + read_coeff(coeff, i, &coeff_reg); + read_coeff(dqcoeff, i, &dqcoeff_reg); + // dqcoeff - coeff + dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); + // madd (dqcoeff - coeff) + dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); + // madd coeff + coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); + // expand each double word of madd (dqcoeff - coeff) to quad word + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); + // add each quad word of madd (dqcoeff - coeff) and madd (coeff) + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); + } + // save the higher 64 bit of each 128 bit lane + sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); + ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); + // add the higher 64 bit to the low 64 bit + sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); + ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); + + // add each 64 bit from each of the 128 bit lane of the 256 bit + sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), + _mm256_extractf128_si256(sse_reg, 1)); + + ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), + _mm256_extractf128_si256(ssz_reg, 1)); + + // store the results + _mm_storel_epi64((__m128i *)(&sse), sse_reg128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); + _mm256_zeroupper(); + return sse; +} diff --git a/third_party/aom/av1/encoder/x86/error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c new file mode 100644 index 0000000000..61f65c623f --- /dev/null +++ b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" + +static AOM_INLINE __m128i reduce_sum_epi64(__m128i reg) { + __m128i reg_hi = _mm_srli_si128(reg, 8); + reg = _mm_add_epi64(reg, reg_hi); + + return reg; +} + +int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t block_size) { + assert(block_size % 16 == 0); + assert(block_size >= 16); + + const __m128i zero = _mm_setzero_si128(); + __m128i accum_0 = zero; + __m128i accum_1 = zero; + + for (int i = 0; i < block_size; i += 16) { + // Load 8 elements for coeff and dqcoeff. + const __m128i _coeff_0 = _mm_loadu_si128((const __m128i *)coeff); + const __m128i _coeff_1 = _mm_loadu_si128((const __m128i *)(coeff + 8)); + const __m128i _dqcoeff_0 = _mm_loadu_si128((const __m128i *)dqcoeff); + const __m128i _dqcoeff_1 = _mm_loadu_si128((const __m128i *)(dqcoeff + 8)); + // Compute the diff + const __m128i diff_0 = _mm_sub_epi16(_dqcoeff_0, _coeff_0); + const __m128i diff_1 = _mm_sub_epi16(_dqcoeff_1, _coeff_1); + // Compute the error + const __m128i error_0 = _mm_madd_epi16(diff_0, diff_0); + const __m128i error_1 = _mm_madd_epi16(diff_1, diff_1); + + const __m128i error_lo_0 = _mm_unpacklo_epi32(error_0, zero); + const __m128i error_lo_1 = _mm_unpacklo_epi32(error_1, zero); + const __m128i error_hi_0 = _mm_unpackhi_epi32(error_0, zero); + const __m128i error_hi_1 = _mm_unpackhi_epi32(error_1, zero); + + // Accumulate + accum_0 = _mm_add_epi64(accum_0, error_lo_0); + accum_1 = _mm_add_epi64(accum_1, error_lo_1); + accum_0 = _mm_add_epi64(accum_0, error_hi_0); + accum_1 = _mm_add_epi64(accum_1, error_hi_1); + + // Advance + coeff += 16; + dqcoeff += 16; + } + + __m128i accum = _mm_add_epi64(accum_0, accum_1); + // Reduce sum the register + accum = reduce_sum_epi64(accum); + + // Store the results. +#if AOM_ARCH_X86_64 + return _mm_cvtsi128_si64(accum); +#else + int64_t result; + _mm_storel_epi64((__m128i *)&result, accum); + return result; +#endif // AOM_ARCH_X86_64 +} diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm new file mode 100644 index 0000000000..6407c106ab --- /dev/null +++ b/third_party/aom/av1/encoder/x86/error_sse2.asm @@ -0,0 +1,88 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +; Increment %1 by sizeof() tran_low_t * %2. +%macro INCREMENT_ELEMENTS_TRAN_LOW 2 + lea %1, [%1 + %2 * 4] +%endmacro + +; Load %2 + %3 into m%1. +; %3 is the offset in elements, not bytes. +; If tran_low_t is 16 bits (low bit depth configuration) then load the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack +; the values down to 16 bits. 
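+; The macro below implements the 32-bit tran_low_t case: it loads four dwords
+; and then packssdw folds in the next four dwords, leaving eight saturated
+; 16-bit coefficients in m%1.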
+%macro LOAD_TRAN_LOW 3 + mova m%1, [%2 + (%3) * 4] + packssdw m%1, [%2 + (%3) * 4 + 16] +%endmacro + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, +; int64_t *ssz) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz + pxor m4, m4 ; sse accumulator + pxor m6, m6 ; ssz accumulator + pxor m5, m5 ; dedicated zero register +.loop: + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 + ; accumulate in 64bit + punpckldq m7, m0, m5 + punpckhdq m0, m5 + paddq m4, m7 + punpckldq m7, m2, m5 + paddq m4, m0 + punpckhdq m2, m5 + paddq m6, m7 + paddq m6, m2 + jg .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + movhlps m7, m6 + paddq m4, m5 + paddq m6, m7 +%if AOM_ARCH_X86_64 + movq rax, m4 + movq [sszq], m6 +%else + mov eax, sszm + pshufd m5, m4, 0x1 + movq [eax], m6 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c new file mode 100644 index 0000000000..ebe75310e9 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/hash_sse42.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/av1_rtcd.h" + +// Byte-boundary alignment issues +#define ALIGN_SIZE 8 +#define ALIGN_MASK (ALIGN_SIZE - 1) + +#define CALC_CRC(op, crc, type, buf, len) \ + while ((len) >= sizeof(type)) { \ + (crc) = op((crc), *(type *)(buf)); \ + (len) -= sizeof(type); \ + buf += sizeof(type); \ + } + +/** + * Calculates 32-bit CRC for the input buffer + * polynomial is 0x11EDC6F41 + * @return A 32-bit unsigned integer representing the CRC + */ +uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p, + size_t len) { + (void)crc_calculator; + const uint8_t *buf = p; + uint32_t crc = 0xFFFFFFFF; + + // Align the input to the word boundary + for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) { + crc = _mm_crc32_u8(crc, *buf); + } + +#ifdef __x86_64__ + uint64_t crc64 = crc; + CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len) + crc = (uint32_t)crc64; +#endif + CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len) + CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len) + CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len) + return (crc ^ 0xFFFFFFFF); +} diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c new file mode 100644 index 0000000000..340307cb3e --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include "aom/aom_integer.h" +#include "av1/common/common.h" +#include "config/av1_rtcd.h" + +int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, + int bps) { + int i; + int64_t temp1[8]; + int64_t error = 0, sqcoeff = 0; + const int shift = 2 * (bps - 8); + const int rounding = shift > 0 ? 
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i += 16) { + __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i)); + __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8)); + __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i)); + __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8)); + + __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff); + __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2); + __m256i diff1h = _mm256_srli_epi64(diff1, 32); + __m256i diff2h = _mm256_srli_epi64(diff2, 32); + __m256i res = _mm256_mul_epi32(diff1, diff1); + __m256i res1 = _mm256_mul_epi32(diff1h, diff1h); + __m256i res2 = _mm256_mul_epi32(diff2, diff2); + __m256i res3 = _mm256_mul_epi32(diff2h, diff2h); + __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1), + _mm256_add_epi64(res2, res3)); + __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32); + __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32); + res = _mm256_mul_epi32(mm256_coeff, mm256_coeff); + res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh); + res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2); + res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2); + __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1), + _mm256_add_epi64(res2, res3)); + _mm256_storeu_si256((__m256i *)temp1, res_diff); + _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff); + + error += temp1[0] + temp1[1] + temp1[2] + temp1[3]; + sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7]; + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c new file mode 100644 index 0000000000..b0b2757568 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "av1/common/common.h" +#include "config/av1_rtcd.h" + +int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, + int bps) { + int i, j, test; + uint32_t temp[4]; + __m128i max, min, cmp0, cmp1, cmp2, cmp3; + int64_t error = 0, sqcoeff = 0; + const int shift = 2 * (bps - 8); + const int rounding = shift > 0 ? 
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i += 8) { + // Load the data into xmm registers + __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); + // Check if any values require more than 15 bit + max = _mm_set1_epi32(0x3fff); + min = _mm_set1_epi32((int)0xffffc000); + cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), + _mm_cmplt_epi32(mm_coeff, min)); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), + _mm_cmplt_epi32(mm_coeff2, min)); + cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), + _mm_cmplt_epi32(mm_dqcoeff, min)); + cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), + _mm_cmplt_epi32(mm_dqcoeff2, min)); + test = _mm_movemask_epi8( + _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); + + if (!test) { + __m128i mm_diff, error_sse2, sqcoeff_sse2; + mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); + mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); + mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); + error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); + sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); + _mm_storeu_si128((__m128i *)temp, error_sse2); + error = error + temp[0] + temp[1] + temp[2] + temp[3]; + _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); + sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; + } else { + for (j = 0; j < 8; j++) { + const int64_t diff = coeff[i + j] - dqcoeff[i + j]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; + } + } + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c new file mode 100644 index 0000000000..9cdf21fc7c --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c @@ -0,0 +1,3132 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include +#include /*AVX2*/ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out, + int stride, int flipud, int fliplr, + int shift) { + __m128i out1[8]; + if (!flipud) { + out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + } else { + out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } + if (!fliplr) { + out[0] = _mm256_cvtepi16_epi32(out1[0]); + out[1] = _mm256_cvtepi16_epi32(out1[1]); + out[2] = _mm256_cvtepi16_epi32(out1[2]); + out[3] = _mm256_cvtepi16_epi32(out1[3]); + out[4] = _mm256_cvtepi16_epi32(out1[4]); + out[5] = _mm256_cvtepi16_epi32(out1[5]); + out[6] = _mm256_cvtepi16_epi32(out1[6]); + out[7] = _mm256_cvtepi16_epi32(out1[7]); + + } else { + out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0])); + out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1])); + out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2])); + out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3])); + out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4])); + out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5])); + out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6])); + out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7])); + } + out[0] = _mm256_slli_epi32(out[0], shift); + out[1] = _mm256_slli_epi32(out[1], shift); + out[2] = _mm256_slli_epi32(out[2], shift); + out[3] = _mm256_slli_epi32(out[3], shift); + out[4] = _mm256_slli_epi32(out[4], shift); + out[5] = _mm256_slli_epi32(out[5], shift); + out[6] = _mm256_slli_epi32(out[6], shift); + out[7] = _mm256_slli_epi32(out[7], shift); +} +static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) { + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + + in[0] = _mm256_add_epi32(in[0], rounding); + in[1] = _mm256_add_epi32(in[1], rounding); + in[2] = _mm256_add_epi32(in[2], rounding); + in[3] = _mm256_add_epi32(in[3], rounding); + in[4] = _mm256_add_epi32(in[4], rounding); + in[5] = _mm256_add_epi32(in[5], rounding); + in[6] = _mm256_add_epi32(in[6], rounding); + in[7] = _mm256_add_epi32(in[7], rounding); + + in[0] = _mm256_srai_epi32(in[0], shift); + in[1] = _mm256_srai_epi32(in[1], shift); + in[2] = _mm256_srai_epi32(in[2], shift); + in[3] = _mm256_srai_epi32(in[3], shift); + in[4] = _mm256_srai_epi32(in[4], shift); + in[5] = 
_mm256_srai_epi32(in[5], shift); + in[6] = _mm256_srai_epi32(in[6], shift); + in[7] = _mm256_srai_epi32(in[7], shift); +} +static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift); +} +static INLINE void load_buffer_16xn_avx2(const int16_t *input, __m256i *out, + int stride, int height, int outstride, + int flipud, int fliplr) { + __m256i out1[64]; + if (!flipud) { + for (int i = 0; i < height; i++) { + out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } else { + for (int i = 0; i < height; i++) { + out1[(height - 1) - i] = + _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } + if (!fliplr) { + for (int i = 0; i < height; i++) { + out[i * outstride] = + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i])); + out[i * outstride + 1] = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1)); + } + } else { + for (int i = 0; i < height; i++) { + out[i * outstride + 1] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_castsi256_si128(out1[i]))); + out[i * outstride + 0] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1))); + } + } +} + +static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out, + const int instride, + const int outstride) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]); + u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]); + + u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]); + u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]); + + u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]); + u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]); + + u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]); + u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); +} +static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit, + int stride) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi32(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_add_epi32(in[stride * i], round); + in[stride * i] = _mm256_srai_epi32(in[stride * i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_slli_epi32(in[stride * i], bit); + } + } +} +static INLINE void store_buffer_avx2(const 
__m256i *const in, int32_t *out, + const int stride, const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), in[i]); + out += stride; + } +} +static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in, + __m256i *out) { + fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2); +} + +static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} +#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m256i ww0 = _mm256_set1_epi32(w0); \ + const __m256i ww1 = _mm256_set1_epi32(w1); \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \ + } while (0) + +#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + out0 = _mm256_add_epi32(out0, r); \ + out0 = _mm256_srai_epi32(out0, bit); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm256_add_epi32(out1, r); \ + out1 = _mm256_srai_epi32(out1, bit); \ + } while (0) + +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, + const int8_t cos_bit, int instride, + int outstride); +static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[8], v[8]; + for (int col = 0; col < col_num; ++col) { + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]); + v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[0] = _mm256_add_epi32(u[0], u[3]); + v[3] = _mm256_sub_epi32(u[0], u[3]); + v[1] = 
_mm256_add_epi32(u[1], u[2]); + v[2] = _mm256_sub_epi32(u[1], u[2]); + + v[5] = _mm256_mullo_epi32(u[5], cospim32); + v[6] = _mm256_mullo_epi32(u[6], cospi32); + v[5] = _mm256_add_epi32(v[5], v[6]); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + u[0] = _mm256_mullo_epi32(u[5], cospi32); + v[6] = _mm256_mullo_epi32(u[6], cospim32); + v[6] = _mm256_sub_epi32(u[0], v[6]); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm256_mullo_epi32(v[0], cospi32); + v[1] = _mm256_mullo_epi32(v[1], cospi32); + u[0] = _mm256_add_epi32(v[0], v[1]); + u[0] = _mm256_add_epi32(u[0], rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + u[1] = _mm256_sub_epi32(v[0], v[1]); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm256_mullo_epi32(v[2], cospi48); + v[1] = _mm256_mullo_epi32(v[3], cospi16); + u[2] = _mm256_add_epi32(v[0], v[1]); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + v[0] = _mm256_mullo_epi32(v[2], cospi16); + v[1] = _mm256_mullo_epi32(v[3], cospi48); + u[3] = _mm256_sub_epi32(v[1], v[0]); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + u[4] = _mm256_add_epi32(v[4], v[5]); + u[5] = _mm256_sub_epi32(v[4], v[5]); + u[6] = _mm256_sub_epi32(v[7], v[6]); + u[7] = _mm256_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm256_mullo_epi32(u[4], cospi56); + v[1] = _mm256_mullo_epi32(u[7], cospi8); + v[0] = _mm256_add_epi32(v[0], v[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm256_mullo_epi32(u[4], cospi8); + v[1] = _mm256_mullo_epi32(u[7], cospi56); + v[0] = _mm256_sub_epi32(v[1], v[0]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm256_mullo_epi32(u[5], cospi24); + v[1] = _mm256_mullo_epi32(u[6], cospi40); + v[0] = _mm256_add_epi32(v[0], v[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm256_mullo_epi32(u[5], cospi40); + v[1] = _mm256_mullo_epi32(u[6], cospi24); + v[0] = _mm256_sub_epi32(v[1], v[0]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[6] + + out[0 * outstride + col] = u[0]; // buf0[0] + out[4 * outstride + col] = u[1]; // buf0[1] + out[2 * outstride + col] = u[2]; // buf0[2] + out[6 * outstride + col] = u[3]; // buf0[3] + } +} +static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstirde) { + (void)col_num; + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospim36 = 
_mm256_set1_epi32(-cospi[36]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + for (int col = 0; col < col_num; ++col) { + u0 = in[0 * col_num + col]; + u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]); + u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]); + u3 = in[4 * col_num + col]; + u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]); + u5 = in[6 * col_num + col]; + u6 = in[2 * col_num + col]; + u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm256_mullo_epi32(u2, cospi32); + y = _mm256_mullo_epi32(u3, cospi32); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + v3 = _mm256_sub_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm256_mullo_epi32(u6, cospi32); + y = _mm256_mullo_epi32(u7, cospi32); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + v7 = _mm256_sub_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm256_add_epi32(v0, v2); + u1 = _mm256_add_epi32(v1, v3); + u2 = _mm256_sub_epi32(v0, v2); + u3 = _mm256_sub_epi32(v1, v3); + u4 = _mm256_add_epi32(v4, v6); + u5 = _mm256_add_epi32(v5, v7); + u6 = _mm256_sub_epi32(v4, v6); + u7 = _mm256_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm256_mullo_epi32(u4, cospi16); + y = _mm256_mullo_epi32(u5, cospi48); + v4 = _mm256_add_epi32(x, y); + v4 = _mm256_add_epi32(v4, rnding); + v4 = _mm256_srai_epi32(v4, bit); + + x = _mm256_mullo_epi32(u4, cospi48); + y = _mm256_mullo_epi32(u5, cospim16); + v5 = _mm256_add_epi32(x, y); + v5 = _mm256_add_epi32(v5, rnding); + v5 = _mm256_srai_epi32(v5, bit); + + x = _mm256_mullo_epi32(u6, cospim48); + y = _mm256_mullo_epi32(u7, cospi16); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + x = _mm256_mullo_epi32(u6, cospi16); + y = _mm256_mullo_epi32(u7, cospi48); + v7 = _mm256_add_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm256_add_epi32(v0, v4); + u1 = _mm256_add_epi32(v1, v5); + u2 = _mm256_add_epi32(v2, v6); + u3 = _mm256_add_epi32(v3, v7); + u4 = _mm256_sub_epi32(v0, v4); + u5 = _mm256_sub_epi32(v1, v5); + u6 = _mm256_sub_epi32(v2, v6); + u7 = _mm256_sub_epi32(v3, v7); + + // stage 6 + x = _mm256_mullo_epi32(u0, cospi4); + y = _mm256_mullo_epi32(u1, cospi60); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + x = _mm256_mullo_epi32(u0, cospi60); + y = _mm256_mullo_epi32(u1, cospim4); + v1 = _mm256_add_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = _mm256_mullo_epi32(u2, cospi20); + y = _mm256_mullo_epi32(u3, cospi44); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi44); + y = _mm256_mullo_epi32(u3, cospim20); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + x = _mm256_mullo_epi32(u4, 
cospi36); + y = _mm256_mullo_epi32(u5, cospi28); + v4 = _mm256_add_epi32(x, y); + v4 = _mm256_add_epi32(v4, rnding); + v4 = _mm256_srai_epi32(v4, bit); + + x = _mm256_mullo_epi32(u4, cospi28); + y = _mm256_mullo_epi32(u5, cospim36); + v5 = _mm256_add_epi32(x, y); + v5 = _mm256_add_epi32(v5, rnding); + v5 = _mm256_srai_epi32(v5, bit); + + x = _mm256_mullo_epi32(u6, cospi52); + y = _mm256_mullo_epi32(u7, cospi12); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + x = _mm256_mullo_epi32(u6, cospi12); + y = _mm256_mullo_epi32(u7, cospim52); + v7 = _mm256_add_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 7 + out[0 * outstirde + col] = v1; + out[1 * outstirde + col] = v6; + out[2 * outstirde + col] = v3; + out[3 * outstirde + col] = v4; + out[4 * outstirde + col] = v5; + out[5 * outstirde + col] = v2; + out[6 * outstirde + col] = v7; + out[7 * outstirde + col] = v0; + } +} +static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num, + int outstride) { + (void)bit; + (void)outstride; + int num_iters = 8 * col_num; + for (int i = 0; i < num_iters; i += 8) { + out[i] = _mm256_add_epi32(in[i], in[i]); + out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]); + out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]); + out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]); + out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]); + out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]); + out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]); + out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]); + } +} +void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[8], out[8]; + const TX_SIZE tx_size = TX_8X8; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int width = tx_size_wide[tx_size]; + const int width_div8 = (width >> 3); + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case ADST_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case DCT_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case ADST_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, 
out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case FLIPADST_DCT: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case DCT_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case ADST_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case FLIPADST_ADST: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case IDTX: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case V_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case H_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case V_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + 
col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case H_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case V_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + case H_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 8); + break; + default: assert(0); + } + (void)bd; +} + +static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[16], v[16], x; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = 
_mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); + + // stage 2 + v[0] = _mm256_add_epi32(u[0], u[7]); + v[7] = _mm256_sub_epi32(u[0], u[7]); + v[1] = _mm256_add_epi32(u[1], u[6]); + v[6] = _mm256_sub_epi32(u[1], u[6]); + v[2] = _mm256_add_epi32(u[2], u[5]); + v[5] = _mm256_sub_epi32(u[2], u[5]); + v[3] = _mm256_add_epi32(u[3], u[4]); + v[4] = _mm256_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = _mm256_mullo_epi32(u[10], cospim32); + x = _mm256_mullo_epi32(u[13], cospi32); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[13], cospim32); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospim32); + x = _mm256_mullo_epi32(u[12], cospi32); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi32); + x = _mm256_mullo_epi32(u[12], cospim32); + v[12] = _mm256_sub_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[3]); + u[3] = _mm256_sub_epi32(v[0], v[3]); + u[1] = _mm256_add_epi32(v[1], v[2]); + u[2] = _mm256_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm256_mullo_epi32(v[5], cospim32); + x = _mm256_mullo_epi32(v[6], cospi32); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi32); + x = _mm256_mullo_epi32(v[6], cospim32); + u[6] = _mm256_sub_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm256_add_epi32(v[8], v[11]); + u[11] = _mm256_sub_epi32(v[8], v[11]); + u[9] = _mm256_add_epi32(v[9], v[10]); + u[10] = _mm256_sub_epi32(v[9], v[10]); + u[12] = _mm256_sub_epi32(v[15], v[12]); + u[15] = _mm256_add_epi32(v[15], v[12]); + u[13] = _mm256_sub_epi32(v[14], v[13]); + u[14] = _mm256_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm256_mullo_epi32(u[0], cospi32); + u[1] = _mm256_mullo_epi32(u[1], cospi32); + v[0] = _mm256_add_epi32(u[0], u[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_sub_epi32(u[0], u[1]); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = _mm256_mullo_epi32(u[2], cospi48); + x = _mm256_mullo_epi32(u[3], cospi16); + v[2] = _mm256_add_epi32(v[2], x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_mullo_epi32(u[2], cospi16); + x = _mm256_mullo_epi32(u[3], cospi48); + v[3] = _mm256_sub_epi32(x, v[3]); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = _mm256_add_epi32(u[4], u[5]); + v[5] = _mm256_sub_epi32(u[4], u[5]); + v[6] = _mm256_sub_epi32(u[7], 
u[6]); + v[7] = _mm256_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm256_mullo_epi32(u[9], cospim16); + x = _mm256_mullo_epi32(u[14], cospi48); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi48); + x = _mm256_mullo_epi32(u[14], cospim16); + v[14] = _mm256_sub_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospim48); + x = _mm256_mullo_epi32(u[13], cospim16); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospim16); + x = _mm256_mullo_epi32(u[13], cospim48); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi56); + x = _mm256_mullo_epi32(v[7], cospi8); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[7] = _mm256_mullo_epi32(v[4], cospi8); + x = _mm256_mullo_epi32(v[7], cospi56); + u[7] = _mm256_sub_epi32(x, u[7]); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + u[5] = _mm256_mullo_epi32(v[5], cospi24); + x = _mm256_mullo_epi32(v[6], cospi40); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi40); + x = _mm256_mullo_epi32(v[6], cospi24); + u[6] = _mm256_sub_epi32(x, u[6]); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[8] = _mm256_add_epi32(v[8], v[9]); + u[9] = _mm256_sub_epi32(v[8], v[9]); + u[10] = _mm256_sub_epi32(v[11], v[10]); + u[11] = _mm256_add_epi32(v[11], v[10]); + u[12] = _mm256_add_epi32(v[12], v[13]); + u[13] = _mm256_sub_epi32(v[12], v[13]); + u[14] = _mm256_sub_epi32(v[15], v[14]); + u[15] = _mm256_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm256_mullo_epi32(u[8], cospi60); + x = _mm256_mullo_epi32(u[15], cospi4); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[15] = _mm256_mullo_epi32(u[8], cospi4); + x = _mm256_mullo_epi32(u[15], cospi60); + v[15] = _mm256_sub_epi32(x, v[15]); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + v[9] = _mm256_mullo_epi32(u[9], cospi28); + x = _mm256_mullo_epi32(u[14], cospi36); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi36); + x = _mm256_mullo_epi32(u[14], cospi28); + v[14] = _mm256_sub_epi32(x, v[14]); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospi44); + x = _mm256_mullo_epi32(u[13], cospi20); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi20); + x = _mm256_mullo_epi32(u[13], cospi44); + v[13] = _mm256_sub_epi32(x, v[13]); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = 
_mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospi12); + x = _mm256_mullo_epi32(u[12], cospi52); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi52); + x = _mm256_mullo_epi32(u[12], cospi12); + v[12] = _mm256_sub_epi32(x, v[12]); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + out[0 * outstride + col] = v[0]; + out[1 * outstride + col] = v[8]; + out[2 * outstride + col] = v[4]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[2]; + out[5 * outstride + col] = v[10]; + out[6 * outstride + col] = v[6]; + out[7 * outstride + col] = v[14]; + out[8 * outstride + col] = v[1]; + out[9 * outstride + col] = v[9]; + out[10 * outstride + col] = v[5]; + out[11 * outstride + col] = v[13]; + out[12 * outstride + col] = v[3]; + out[13 * outstride + col] = v[11]; + out[14 * outstride + col] = v[7]; + out[15 * outstride + col] = v[15]; + } +} +static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int num_cols, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + + __m256i u[16], v[16], x, y; + int col; + + for (col = 0; col < num_cols; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * num_cols + col]; + u[1] = 
_mm256_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm256_mullo_epi32(u[2], cospi32); + y = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(x, y); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(x, y); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm256_mullo_epi32(u[6], cospi32); + y = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(x, y); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(x, y); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(x, y); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(x, y); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm256_mullo_epi32(u[14], cospi32); + y = _mm256_mullo_epi32(u[15], cospi32); + v[14] = _mm256_add_epi32(x, y); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(x, y); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[2]); + u[1] = _mm256_add_epi32(v[1], v[3]); + u[2] = _mm256_sub_epi32(v[0], v[2]); + u[3] = _mm256_sub_epi32(v[1], v[3]); + u[4] = _mm256_add_epi32(v[4], v[6]); + u[5] = _mm256_add_epi32(v[5], v[7]); + u[6] = _mm256_sub_epi32(v[4], v[6]); + u[7] = _mm256_sub_epi32(v[5], v[7]); + u[8] = _mm256_add_epi32(v[8], v[10]); + u[9] = _mm256_add_epi32(v[9], v[11]); + u[10] = _mm256_sub_epi32(v[8], v[10]); + u[11] = _mm256_sub_epi32(v[9], v[11]); + u[12] = _mm256_add_epi32(v[12], v[14]); + u[13] = _mm256_add_epi32(v[13], v[15]); + u[14] = _mm256_sub_epi32(v[12], v[14]); + u[15] = _mm256_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = + av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + + // stage 5 + u[0] = 
_mm256_add_epi32(v[0], v[4]); + u[1] = _mm256_add_epi32(v[1], v[5]); + u[2] = _mm256_add_epi32(v[2], v[6]); + u[3] = _mm256_add_epi32(v[3], v[7]); + u[4] = _mm256_sub_epi32(v[0], v[4]); + u[5] = _mm256_sub_epi32(v[1], v[5]); + u[6] = _mm256_sub_epi32(v[2], v[6]); + u[7] = _mm256_sub_epi32(v[3], v[7]); + u[8] = _mm256_add_epi32(v[8], v[12]); + u[9] = _mm256_add_epi32(v[9], v[13]); + u[10] = _mm256_add_epi32(v[10], v[14]); + u[11] = _mm256_add_epi32(v[11], v[15]); + u[12] = _mm256_sub_epi32(v[8], v[12]); + u[13] = _mm256_sub_epi32(v[9], v[13]); + u[14] = _mm256_sub_epi32(v[10], v[14]); + u[15] = _mm256_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + // stage 7 + u[0] = _mm256_add_epi32(v[0], v[8]); + u[1] = _mm256_add_epi32(v[1], v[9]); + u[2] = _mm256_add_epi32(v[2], v[10]); + u[3] = _mm256_add_epi32(v[3], v[11]); + u[4] = _mm256_add_epi32(v[4], v[12]); + u[5] = _mm256_add_epi32(v[5], v[13]); + u[6] = _mm256_add_epi32(v[6], v[14]); + u[7] = _mm256_add_epi32(v[7], v[15]); + u[8] = _mm256_sub_epi32(v[0], v[8]); + u[9] = _mm256_sub_epi32(v[1], v[9]); + u[10] = _mm256_sub_epi32(v[2], v[10]); + u[11] = _mm256_sub_epi32(v[3], v[11]); + u[12] = _mm256_sub_epi32(v[4], v[12]); + u[13] = _mm256_sub_epi32(v[5], v[13]); + u[14] = _mm256_sub_epi32(v[6], v[14]); + u[15] = _mm256_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = + av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + + // stage 9 + out[0 * outstride + col] = v[1]; + out[1 * outstride + col] = v[14]; + out[2 * outstride + col] = v[3]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[5]; + out[5 * outstride + col] = v[10]; + out[6 * 
outstride + col] = v[7]; + out[7 * outstride + col] = v[8]; + out[8 * outstride + col] = v[9]; + out[9 * outstride + col] = v[6]; + out[10 * outstride + col] = v[11]; + out[11 * outstride + col] = v[4]; + out[12 * outstride + col] = v[13]; + out[13 * outstride + col] = v[2]; + out[14 * outstride + col] = v[15]; + out[15 * outstride + col] = v[0]; + } +} +static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit, + int col_num, const int outstride) { + (void)bit; + (void)outstride; + __m256i fact = _mm256_set1_epi32(2 * NewSqrt2); + __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m256i a_low; + + int num_iters = 16 * col_num; + for (int i = 0; i < num_iters; i++) { + a_low = _mm256_mullo_epi32(in[i], fact); + a_low = _mm256_add_epi32(a_low, offset); + out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits); + } +} +static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16_avx2, // DCT_DCT + fadst16_avx2, // ADST_DCT + fdct16_avx2, // DCT_ADST + fadst16_avx2, // ADST_ADST + fadst16_avx2, // FLIPADST_DCT + fdct16_avx2, // DCT_FLIPADST + fadst16_avx2, // FLIPADST_FLIPADST + fadst16_avx2, // ADST_FLIPADST + fadst16_avx2, // FLIPADST_ADST + idtx16_avx2, // IDTX + fdct16_avx2, // V_DCT + idtx16_avx2, // H_DCT + fadst16_avx2, // V_ADST + idtx16_avx2, // H_ADST + fadst16_avx2, // V_FLIPADST + idtx16_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8_avx2, // DCT_DCT + fdct8_avx2, // ADST_DCT + fadst8_avx2, // DCT_ADST + fadst8_avx2, // ADST_ADST + fdct8_avx2, // FLIPADST_DCT + fadst8_avx2, // DCT_FLIPADST + fadst8_avx2, // FLIPADST_FLIPADST + fadst8_avx2, // ADST_FLIPADST + fadst8_avx2, // FLIPADST_ADST + idtx8_avx2, // IDTX + idtx8_avx2, // V_DCT + fdct8_avx2, // H_DCT + idtx8_avx2, // V_ADST + fadst8_avx2, // H_ADST + idtx8_avx2, // V_FLIPADST + fadst8_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type]; + const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, out, bit, 1, 1); + col_txfm_8x8_rounding(out, -shift[1]); + col_txfm_8x8_rounding(&out[8], -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, 1, 2); + fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2); + row_txfm(in, out, bit, 2, 2); + round_shift_rect_array_32_avx2(out, in, 16, -shift[2], NewSqrt2); + store_buffer_avx2(in, coeff, 8, 16); + (void)bd; +} +static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8_avx2, // DCT_DCT + fadst8_avx2, // ADST_DCT + fdct8_avx2, // DCT_ADST + fadst8_avx2, // ADST_ADST + fadst8_avx2, // FLIPADST_DCT + fdct8_avx2, // DCT_FLIPADST + fadst8_avx2, // FLIPADST_FLIPADST + fadst8_avx2, // ADST_FLIPADST + fadst8_avx2, // FLIPADST_ADST + idtx8_avx2, // IDTX + fdct8_avx2, // V_DCT + idtx8_avx2, // H_DCT + fadst8_avx2, // V_ADST + idtx8_avx2, // H_ADST + fadst8_avx2, // V_FLIPADST + idtx8_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16_avx2, // DCT_DCT + fdct16_avx2, // ADST_DCT + fadst16_avx2, // DCT_ADST + 
fadst16_avx2, // ADST_ADST + fdct16_avx2, // FLIPADST_DCT + fadst16_avx2, // DCT_FLIPADST + fadst16_avx2, // FLIPADST_FLIPADST + fadst16_avx2, // ADST_FLIPADST + fadst16_avx2, // FLIPADST_ADST + idtx16_avx2, // IDTX + idtx16_avx2, // V_DCT + fdct16_avx2, // H_DCT + idtx16_avx2, // V_ADST + fadst16_avx2, // H_ADST + idtx16_avx2, // V_FLIPADST + fadst16_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip); + round_shift_32_8xn_avx2(in, 16, shift[0], 1); + col_txfm(in, out, bit, 2, 2); + round_shift_32_8xn_avx2(out, 16, shift[1], 1); + fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); + fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); + row_txfm(in, out, bit, 1, 1); + round_shift_rect_array_32_avx2(out, out, 16, -shift[2], NewSqrt2); + store_buffer_avx2(out, coeff, 8, 16); + (void)bd; +} +void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[32], out[32]; + const TX_SIZE tx_size = TX_16X16; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const int width_div8 = (width >> 3); + const int width_div16 = (width >> 4); + const int size = (height << 1); + switch (tx_type) { + case DCT_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case ADST_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case DCT_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case ADST_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, 
av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case FLIPADST_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case DCT_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case FLIPADST_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case ADST_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case FLIPADST_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case IDTX: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case V_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + idtx16_avx2(in, 
out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case H_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case V_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case H_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case V_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + case H_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(out, coeff, 8, 32); + break; + default: assert(0); + } + (void)bd; +} +static INLINE void fdct32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, const int instride, + const int outstride) { + __m256i buf0[32]; + __m256i buf1[32]; + const int32_t *cospi; + int startidx = 0 * instride; + int endidx = 31 * instride; + // stage 0 + // stage 1 + buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[4] = 
_mm256_add_epi32(input[startidx], input[endidx]); + buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]); + buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]); + buf1[1] = 
_mm256_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); + btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm256_add_epi32(buf0[17], 
buf0[18]); + buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7], + cos_bit); + btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15], + cos_bit); + btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = 
buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31], + cos_bit); + btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24], + cos_bit); + + startidx = 0 * outstride; + endidx = 31 * outstride; + // stage 9 + output[startidx] = buf0[0]; + output[endidx] = buf0[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[16]; + output[endidx] = buf0[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[8]; + output[endidx] = buf0[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[24]; + output[endidx] = buf0[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[4]; + output[endidx] = buf0[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[20]; + output[endidx] = buf0[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[12]; + output[endidx] = buf0[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[28]; + output[endidx] = buf0[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[2]; + output[endidx] = buf0[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[18]; + output[endidx] = buf0[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[10]; + output[endidx] = buf0[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[26]; + output[endidx] = buf0[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[6]; + output[endidx] = buf0[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[22]; + output[endidx] = buf0[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[14]; + output[endidx] = buf0[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[30]; + output[endidx] = buf0[1]; +} +static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, int instride, + int outstride) { + (void)cos_bit; + for (int i = 0; i < 32; i += 8) { + output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2); + output[(i + 1) * outstride] = + _mm256_slli_epi32(input[(i + 1) * instride], 2); + output[(i + 2) * outstride] = + _mm256_slli_epi32(input[(i + 2) * instride], 2); + output[(i + 3) * outstride] = + _mm256_slli_epi32(input[(i + 3) * instride], 2); + output[(i + 4) * outstride] = + _mm256_slli_epi32(input[(i + 4) * instride], 2); + output[(i + 5) * outstride] = + _mm256_slli_epi32(input[(i + 5) * instride], 2); + output[(i + 6) * outstride] = + _mm256_slli_epi32(input[(i + 6) * instride], 2); + output[(i + 7) * outstride] = + _mm256_slli_epi32(input[(i + 7) * instride], 2); + } +} +static const transform_1d_avx2 
col_txfm8x32_arr[TX_TYPES] = { + fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = { + fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[128], buf1[128]; + const int tx_size = TX_32X32; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type]; + int r, c; + const int width_div16 = (width >> 4); + const int width_div8 = (width >> 3); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height, + width_div8, 0, 0); + round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); + col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8, + width_div8); + col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], + &buf1[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + for (int i = 0; i < width_div16; i++) { + row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8, + width_div8); + row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8); + round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8); + } + + store_buffer_avx2(buf1, output, 8, 128); +} +static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2, + __m256i *cospi_m32, __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x2[0] = _mm256_add_epi32(x1[0], x1[31]); + x2[31] = _mm256_sub_epi32(x1[0], x1[31]); + x2[1] = _mm256_add_epi32(x1[1], x1[30]); + x2[30] = _mm256_sub_epi32(x1[1], x1[30]); + x2[2] = _mm256_add_epi32(x1[2], x1[29]); + x2[29] = _mm256_sub_epi32(x1[2], x1[29]); + x2[3] = _mm256_add_epi32(x1[3], x1[28]); + x2[28] = _mm256_sub_epi32(x1[3], x1[28]); + x2[4] = _mm256_add_epi32(x1[4], x1[27]); + x2[27] = _mm256_sub_epi32(x1[4], x1[27]); + x2[5] = _mm256_add_epi32(x1[5], x1[26]); + x2[26] = _mm256_sub_epi32(x1[5], x1[26]); + x2[6] = _mm256_add_epi32(x1[6], 
x1[25]); + x2[25] = _mm256_sub_epi32(x1[6], x1[25]); + x2[7] = _mm256_add_epi32(x1[7], x1[24]); + x2[24] = _mm256_sub_epi32(x1[7], x1[24]); + x2[8] = _mm256_add_epi32(x1[8], x1[23]); + x2[23] = _mm256_sub_epi32(x1[8], x1[23]); + x2[9] = _mm256_add_epi32(x1[9], x1[22]); + x2[22] = _mm256_sub_epi32(x1[9], x1[22]); + x2[10] = _mm256_add_epi32(x1[10], x1[21]); + x2[21] = _mm256_sub_epi32(x1[10], x1[21]); + x2[11] = _mm256_add_epi32(x1[11], x1[20]); + x2[20] = _mm256_sub_epi32(x1[11], x1[20]); + x2[12] = _mm256_add_epi32(x1[12], x1[19]); + x2[19] = _mm256_sub_epi32(x1[12], x1[19]); + x2[13] = _mm256_add_epi32(x1[13], x1[18]); + x2[18] = _mm256_sub_epi32(x1[13], x1[18]); + x2[14] = _mm256_add_epi32(x1[14], x1[17]); + x2[17] = _mm256_sub_epi32(x1[14], x1[17]); + x2[15] = _mm256_add_epi32(x1[15], x1[16]); + x2[16] = _mm256_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48], + *__rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; +} +static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3, + __m256i *cospi_m32, __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x3[0] = _mm256_add_epi32(x2[0], x2[15]); + x3[15] = _mm256_sub_epi32(x2[0], x2[15]); + x3[1] = _mm256_add_epi32(x2[1], x2[14]); + x3[14] = _mm256_sub_epi32(x2[1], x2[14]); + x3[2] = _mm256_add_epi32(x2[2], x2[13]); + x3[13] = _mm256_sub_epi32(x2[2], x2[13]); + x3[3] = _mm256_add_epi32(x2[3], x2[12]); + x3[12] = _mm256_sub_epi32(x2[3], x2[12]); + x3[4] = _mm256_add_epi32(x2[4], x2[11]); + x3[11] = _mm256_sub_epi32(x2[4], x2[11]); + x3[5] = _mm256_add_epi32(x2[5], x2[10]); + x3[10] = _mm256_sub_epi32(x2[5], x2[10]); + x3[6] = _mm256_add_epi32(x2[6], x2[9]); + x3[9] = _mm256_sub_epi32(x2[6], x2[9]); + x3[7] = _mm256_add_epi32(x2[7], x2[8]); + x3[8] = _mm256_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24], + *__rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm256_add_epi32(x2[32], x2[47]); + x3[47] = _mm256_sub_epi32(x2[32], x2[47]); + x3[33] = _mm256_add_epi32(x2[33], x2[46]); 
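+  // Each btf_32_type0_avx2_new(w0, w1, in0, in1, out0, out1, r, cos_bit)
+  // call in these stage helpers is a two-point butterfly rotation of in0/in1
+  // by the given pair of cospi weights, rounded with r = 1 << (cos_bit - 1)
+  // and an arithmetic right shift by cos_bit.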
+ x3[46] = _mm256_sub_epi32(x2[33], x2[46]); + x3[34] = _mm256_add_epi32(x2[34], x2[45]); + x3[45] = _mm256_sub_epi32(x2[34], x2[45]); + x3[35] = _mm256_add_epi32(x2[35], x2[44]); + x3[44] = _mm256_sub_epi32(x2[35], x2[44]); + x3[36] = _mm256_add_epi32(x2[36], x2[43]); + x3[43] = _mm256_sub_epi32(x2[36], x2[43]); + x3[37] = _mm256_add_epi32(x2[37], x2[42]); + x3[42] = _mm256_sub_epi32(x2[37], x2[42]); + x3[38] = _mm256_add_epi32(x2[38], x2[41]); + x3[41] = _mm256_sub_epi32(x2[38], x2[41]); + x3[39] = _mm256_add_epi32(x2[39], x2[40]); + x3[40] = _mm256_sub_epi32(x2[39], x2[40]); + x3[48] = _mm256_sub_epi32(x2[63], x2[48]); + x3[63] = _mm256_add_epi32(x2[63], x2[48]); + x3[49] = _mm256_sub_epi32(x2[62], x2[49]); + x3[62] = _mm256_add_epi32(x2[62], x2[49]); + x3[50] = _mm256_sub_epi32(x2[61], x2[50]); + x3[61] = _mm256_add_epi32(x2[61], x2[50]); + x3[51] = _mm256_sub_epi32(x2[60], x2[51]); + x3[60] = _mm256_add_epi32(x2[60], x2[51]); + x3[52] = _mm256_sub_epi32(x2[59], x2[52]); + x3[59] = _mm256_add_epi32(x2[59], x2[52]); + x3[53] = _mm256_sub_epi32(x2[58], x2[53]); + x3[58] = _mm256_add_epi32(x2[58], x2[53]); + x3[54] = _mm256_sub_epi32(x2[57], x2[54]); + x3[57] = _mm256_add_epi32(x2[57], x2[54]); + x3[55] = _mm256_sub_epi32(x2[56], x2[55]); + x3[56] = _mm256_add_epi32(x2[56], x2[55]); +} +static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4, + __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, + __m256i *cospi_m48, + const __m256i *__rounding, + int8_t cos_bit) { + x4[0] = _mm256_add_epi32(x3[0], x3[7]); + x4[7] = _mm256_sub_epi32(x3[0], x3[7]); + x4[1] = _mm256_add_epi32(x3[1], x3[6]); + x4[6] = _mm256_sub_epi32(x3[1], x3[6]); + x4[2] = _mm256_add_epi32(x3[2], x3[5]); + x4[5] = _mm256_sub_epi32(x3[2], x3[5]); + x4[3] = _mm256_add_epi32(x3[3], x3[4]); + x4[4] = _mm256_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12], + *__rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm256_add_epi32(x3[16], x3[23]); + x4[23] = _mm256_sub_epi32(x3[16], x3[23]); + x4[17] = _mm256_add_epi32(x3[17], x3[22]); + x4[22] = _mm256_sub_epi32(x3[17], x3[22]); + x4[18] = _mm256_add_epi32(x3[18], x3[21]); + x4[21] = _mm256_sub_epi32(x3[18], x3[21]); + x4[19] = _mm256_add_epi32(x3[19], x3[20]); + x4[20] = _mm256_sub_epi32(x3[19], x3[20]); + x4[24] = _mm256_sub_epi32(x3[31], x3[24]); + x4[31] = _mm256_add_epi32(x3[31], x3[24]); + x4[25] = _mm256_sub_epi32(x3[30], x3[25]); + x4[30] = _mm256_add_epi32(x3[30], x3[25]); + x4[26] = _mm256_sub_epi32(x3[29], x3[26]); + x4[29] = _mm256_add_epi32(x3[29], x3[26]); + x4[27] = _mm256_sub_epi32(x3[28], x3[27]); + x4[28] = _mm256_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54], + 
*__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52], + *__rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; +} +static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5, + __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, + __m256i *cospi_m48, + const __m256i *__rounding, + int8_t cos_bit) { + x5[0] = _mm256_add_epi32(x4[0], x4[3]); + x5[3] = _mm256_sub_epi32(x4[0], x4[3]); + x5[1] = _mm256_add_epi32(x4[1], x4[2]); + x5[2] = _mm256_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6], + *__rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm256_add_epi32(x4[8], x4[11]); + x5[11] = _mm256_sub_epi32(x4[8], x4[11]); + x5[9] = _mm256_add_epi32(x4[9], x4[10]); + x5[10] = _mm256_sub_epi32(x4[9], x4[10]); + x5[12] = _mm256_sub_epi32(x4[15], x4[12]); + x5[15] = _mm256_add_epi32(x4[15], x4[12]); + x5[13] = _mm256_sub_epi32(x4[14], x4[13]); + x5[14] = _mm256_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26], + *__rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm256_add_epi32(x4[32], x4[39]); + x5[39] = _mm256_sub_epi32(x4[32], x4[39]); + x5[33] = _mm256_add_epi32(x4[33], x4[38]); + x5[38] = _mm256_sub_epi32(x4[33], x4[38]); + x5[34] = _mm256_add_epi32(x4[34], x4[37]); + x5[37] = _mm256_sub_epi32(x4[34], x4[37]); + x5[35] = _mm256_add_epi32(x4[35], x4[36]); + x5[36] = _mm256_sub_epi32(x4[35], x4[36]); + x5[40] = _mm256_sub_epi32(x4[47], x4[40]); + x5[47] = _mm256_add_epi32(x4[47], x4[40]); + x5[41] = _mm256_sub_epi32(x4[46], x4[41]); + x5[46] = _mm256_add_epi32(x4[46], x4[41]); + x5[42] = _mm256_sub_epi32(x4[45], x4[42]); + x5[45] = _mm256_add_epi32(x4[45], x4[42]); + x5[43] = _mm256_sub_epi32(x4[44], x4[43]); + x5[44] = _mm256_add_epi32(x4[44], x4[43]); + x5[48] = _mm256_add_epi32(x4[48], x4[55]); + x5[55] = _mm256_sub_epi32(x4[48], x4[55]); + x5[49] = _mm256_add_epi32(x4[49], x4[54]); + x5[54] = _mm256_sub_epi32(x4[49], x4[54]); + x5[50] = _mm256_add_epi32(x4[50], x4[53]); + x5[53] = _mm256_sub_epi32(x4[50], x4[53]); + x5[51] = _mm256_add_epi32(x4[51], x4[52]); + x5[52] = _mm256_sub_epi32(x4[51], x4[52]); + x5[56] = _mm256_sub_epi32(x4[63], x4[56]); + x5[63] = _mm256_add_epi32(x4[63], x4[56]); + x5[57] = _mm256_sub_epi32(x4[62], x4[57]); + x5[62] = _mm256_add_epi32(x4[62], x4[57]); + x5[58] = _mm256_sub_epi32(x4[61], x4[58]); + x5[61] = _mm256_add_epi32(x4[61], x4[58]); + x5[59] = _mm256_sub_epi32(x4[60], x4[59]); + x5[60] = _mm256_add_epi32(x4[60], x4[59]); +} +static INLINE void fdct64_stage6_avx2( + __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, + __m256i *cospi_m08, 
__m256i *cospi_p56, __m256i *cospi_m56, + __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24, + const __m256i *__rounding, int8_t cos_bit) { + btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3], + *__rounding, cos_bit); + x6[4] = _mm256_add_epi32(x5[4], x5[5]); + x6[5] = _mm256_sub_epi32(x5[4], x5[5]); + x6[6] = _mm256_sub_epi32(x5[7], x5[6]); + x6[7] = _mm256_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13], + *__rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm256_add_epi32(x5[16], x5[19]); + x6[19] = _mm256_sub_epi32(x5[16], x5[19]); + x6[17] = _mm256_add_epi32(x5[17], x5[18]); + x6[18] = _mm256_sub_epi32(x5[17], x5[18]); + x6[20] = _mm256_sub_epi32(x5[23], x5[20]); + x6[23] = _mm256_add_epi32(x5[23], x5[20]); + x6[21] = _mm256_sub_epi32(x5[22], x5[21]); + x6[22] = _mm256_add_epi32(x5[22], x5[21]); + x6[24] = _mm256_add_epi32(x5[24], x5[27]); + x6[27] = _mm256_sub_epi32(x5[24], x5[27]); + x6[25] = _mm256_add_epi32(x5[25], x5[26]); + x6[26] = _mm256_sub_epi32(x5[25], x5[26]); + x6[28] = _mm256_sub_epi32(x5[31], x5[28]); + x6[31] = _mm256_add_epi32(x5[31], x5[28]); + x6[29] = _mm256_sub_epi32(x5[30], x5[29]); + x6[30] = _mm256_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58], + *__rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50], + *__rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; +} +static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7, + __m256i *cospi_p08, __m256i *cospi_p56, + __m256i *cospi_p40, __m256i *cospi_p24, + __m256i *cospi_m08, __m256i *cospi_m56, + __m256i *cospi_m40, __m256i *cospi_m24, + const __m256i *__rounding, + int8_t cos_bit) { + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6], + *__rounding, cos_bit); + x7[8] = _mm256_add_epi32(x6[8], x6[9]); + x7[9] = _mm256_sub_epi32(x6[8], x6[9]); + x7[10] = _mm256_sub_epi32(x6[11], x6[10]); + x7[11] = _mm256_add_epi32(x6[11], x6[10]); + x7[12] = _mm256_add_epi32(x6[12], x6[13]); + x7[13] = _mm256_sub_epi32(x6[12], x6[13]); + x7[14] = _mm256_sub_epi32(x6[15], x6[14]); + x7[15] = 
_mm256_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29], + *__rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25], + *__rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm256_add_epi32(x6[32], x6[35]); + x7[35] = _mm256_sub_epi32(x6[32], x6[35]); + x7[33] = _mm256_add_epi32(x6[33], x6[34]); + x7[34] = _mm256_sub_epi32(x6[33], x6[34]); + x7[36] = _mm256_sub_epi32(x6[39], x6[36]); + x7[39] = _mm256_add_epi32(x6[39], x6[36]); + x7[37] = _mm256_sub_epi32(x6[38], x6[37]); + x7[38] = _mm256_add_epi32(x6[38], x6[37]); + x7[40] = _mm256_add_epi32(x6[40], x6[43]); + x7[43] = _mm256_sub_epi32(x6[40], x6[43]); + x7[41] = _mm256_add_epi32(x6[41], x6[42]); + x7[42] = _mm256_sub_epi32(x6[41], x6[42]); + x7[44] = _mm256_sub_epi32(x6[47], x6[44]); + x7[47] = _mm256_add_epi32(x6[47], x6[44]); + x7[45] = _mm256_sub_epi32(x6[46], x6[45]); + x7[46] = _mm256_add_epi32(x6[46], x6[45]); + x7[48] = _mm256_add_epi32(x6[48], x6[51]); + x7[51] = _mm256_sub_epi32(x6[48], x6[51]); + x7[49] = _mm256_add_epi32(x6[49], x6[50]); + x7[50] = _mm256_sub_epi32(x6[49], x6[50]); + x7[52] = _mm256_sub_epi32(x6[55], x6[52]); + x7[55] = _mm256_add_epi32(x6[55], x6[52]); + x7[53] = _mm256_sub_epi32(x6[54], x6[53]); + x7[54] = _mm256_add_epi32(x6[54], x6[53]); + x7[56] = _mm256_add_epi32(x6[56], x6[59]); + x7[59] = _mm256_sub_epi32(x6[56], x6[59]); + x7[57] = _mm256_add_epi32(x6[57], x6[58]); + x7[58] = _mm256_sub_epi32(x6[57], x6[58]); + x7[60] = _mm256_sub_epi32(x6[63], x6[60]); + x7[63] = _mm256_add_epi32(x6[63], x6[60]); + x7[61] = _mm256_sub_epi32(x6[62], x6[61]); + x7[62] = _mm256_add_epi32(x6[62], x6[61]); +} +static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + + btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12], + 
*__rounding, cos_bit); + x8[16] = _mm256_add_epi32(x7[16], x7[17]); + x8[17] = _mm256_sub_epi32(x7[16], x7[17]); + x8[18] = _mm256_sub_epi32(x7[19], x7[18]); + x8[19] = _mm256_add_epi32(x7[19], x7[18]); + x8[20] = _mm256_add_epi32(x7[20], x7[21]); + x8[21] = _mm256_sub_epi32(x7[20], x7[21]); + x8[22] = _mm256_sub_epi32(x7[23], x7[22]); + x8[23] = _mm256_add_epi32(x7[23], x7[22]); + x8[24] = _mm256_add_epi32(x7[24], x7[25]); + x8[25] = _mm256_sub_epi32(x7[24], x7[25]); + x8[26] = _mm256_sub_epi32(x7[27], x7[26]); + x8[27] = _mm256_add_epi32(x7[27], x7[26]); + x8[28] = _mm256_add_epi32(x7[28], x7[29]); + x8[29] = _mm256_sub_epi32(x7[28], x7[29]); + x8[30] = _mm256_sub_epi32(x7[31], x7[30]); + x8[31] = _mm256_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + *__rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + *__rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + *__rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + *__rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; +} +static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28], + *__rounding, cos_bit); + 
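+  // Stage 9: the rotations above and below pair x8[16..23] with their mirrors
+  // x8[31..24] using cospi[2k]/cospi[64 - 2k] weight pairs, while x9[32..63]
+  // further below come from plain add/sub butterflies of adjacent x8 terms.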
btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24], + *__rounding, cos_bit); + x9[32] = _mm256_add_epi32(x8[32], x8[33]); + x9[33] = _mm256_sub_epi32(x8[32], x8[33]); + x9[34] = _mm256_sub_epi32(x8[35], x8[34]); + x9[35] = _mm256_add_epi32(x8[35], x8[34]); + x9[36] = _mm256_add_epi32(x8[36], x8[37]); + x9[37] = _mm256_sub_epi32(x8[36], x8[37]); + x9[38] = _mm256_sub_epi32(x8[39], x8[38]); + x9[39] = _mm256_add_epi32(x8[39], x8[38]); + x9[40] = _mm256_add_epi32(x8[40], x8[41]); + x9[41] = _mm256_sub_epi32(x8[40], x8[41]); + x9[42] = _mm256_sub_epi32(x8[43], x8[42]); + x9[43] = _mm256_add_epi32(x8[43], x8[42]); + x9[44] = _mm256_add_epi32(x8[44], x8[45]); + x9[45] = _mm256_sub_epi32(x8[44], x8[45]); + x9[46] = _mm256_sub_epi32(x8[47], x8[46]); + x9[47] = _mm256_add_epi32(x8[47], x8[46]); + x9[48] = _mm256_add_epi32(x8[48], x8[49]); + x9[49] = _mm256_sub_epi32(x8[48], x8[49]); + x9[50] = _mm256_sub_epi32(x8[51], x8[50]); + x9[51] = _mm256_add_epi32(x8[51], x8[50]); + x9[52] = _mm256_add_epi32(x8[52], x8[53]); + x9[53] = _mm256_sub_epi32(x8[52], x8[53]); + x9[54] = _mm256_sub_epi32(x8[55], x8[54]); + x9[55] = _mm256_add_epi32(x8[55], x8[54]); + x9[56] = _mm256_add_epi32(x8[56], x8[57]); + x9[57] = _mm256_sub_epi32(x8[56], x8[57]); + x9[58] = _mm256_sub_epi32(x8[59], x8[58]); + x9[59] = _mm256_add_epi32(x8[59], x8[58]); + x9[60] = _mm256_add_epi32(x8[60], x8[61]); + x9[61] = _mm256_sub_epi32(x8[60], x8[61]); + x9[62] = _mm256_sub_epi32(x8[63], x8[62]); + x9[63] = _mm256_add_epi32(x8[63], x8[62]); +} +static INLINE void fdct64_stage10_avx2(__m256i *x9, __m256i *x10, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + 
__m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48], + *__rounding, cos_bit); +} +static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit, + const int instride, const int outstride) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + + int startidx = 0 * instride; + int endidx = 63 * instride; + // stage 1 + __m256i x1[64]; + x1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[63] = 
_mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[4] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[16] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[17] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[18] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[19] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[20] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[21] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[42] = _mm256_sub_epi32(input[startidx], 
input[endidx]); + startidx += instride; + endidx -= instride; + x1[22] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[23] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[24] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[25] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[26] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[27] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[28] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[29] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[30] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[31] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + __m256i x2[64]; + fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 3 + fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 4 + fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 5 + fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 6 + fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40, + &cospi_p24, &cospi_m24, &__rounding, cos_bit); + // stage 7 + fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24, + &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24, + &__rounding, cos_bit); + // stage 8 + fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit); + // stage 9 + fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit); + // stage 10 + fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit); + + startidx = 0 * outstride; + endidx = 63 * outstride; + + // stage 11 + output[startidx] = x2[0]; + output[endidx] = x2[63]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[32]; + output[endidx] = x2[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[16]; + output[endidx] = x2[47]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[48]; + output[endidx] = x2[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[8]; + output[endidx] = x2[55]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[40]; + output[endidx] = x2[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[24]; + output[endidx] = x2[39]; + startidx += outstride; + endidx -= outstride; + output[startidx] = 
x2[56]; + output[endidx] = x2[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[4]; + output[endidx] = x2[59]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[36]; + output[endidx] = x2[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[20]; + output[endidx] = x2[43]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[52]; + output[endidx] = x2[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[12]; + output[endidx] = x2[51]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[44]; + output[endidx] = x2[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[28]; + output[endidx] = x2[35]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[60]; + output[endidx] = x2[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[2]; + output[endidx] = x2[61]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[34]; + output[endidx] = x2[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[18]; + output[endidx] = x2[45]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[50]; + output[endidx] = x2[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[10]; + output[endidx] = x2[53]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[42]; + output[endidx] = x2[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[26]; + output[endidx] = x2[37]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[58]; + output[endidx] = x2[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[6]; + output[endidx] = x2[57]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[38]; + output[endidx] = x2[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[22]; + output[endidx] = x2[41]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[54]; + output[endidx] = x2[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[14]; + output[endidx] = x2[49]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[46]; + output[endidx] = x2[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[30]; + output[endidx] = x2[33]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[62]; + output[endidx] = x2[1]; +} +void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[512], buf1[512]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct64_avx2; + const transform_1d_avx2 row_txfm = fdct64_avx2; + const int width_div16 = (width >> 4); + const int width_div8 = (width >> 3); + int r, c; + for (int i = 0; i < width_div16; i++) { + load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height, + width_div8, 0, 0); + round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8); + 
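+    // Column pass: both 8-lane halves of this 16-wide strip are rounded by
+    // shift[0], run through the 64-point column transform, and rounded again
+    // by shift[1] before the 8x8 transposes below.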
round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
+    col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8);
+    col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
+             width_div8);
+    round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8);
+    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
+  }
+
+  for (r = 0; r < height; r += 8) {
+    for (c = 0; c < width_div8; c++) {
+      fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+                                  &buf1[c * 8 * width_div8 + (r >> 3)],
+                                  width_div8, width_div8);
+    }
+  }
+
+  for (int i = 0; i < 2; i++) {
+    row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8,
+             width_div16);
+    row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8,
+             width_div16);
+    round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2],
+                            width_div16);
+    round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2],
+                            width_div16);
+  }
+
+  store_buffer_avx2(buf0, output, 8, 128);
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 0000000000..158b4ae439
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,2629 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+static INLINE void store_output_w4(int32_t *const out, const __m128i *const in,
+                                   const int stride, const int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    _mm_store_si128((__m128i *)(out + i * stride), in[i]);
+  }
+}
+
+void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
+  __m128i in[4];
+  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+  // Convert to int32_t.
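+  // The loop below applies the 4-point Walsh-Hadamard butterfly to the
+  // columns, transposes after the first pass, then repeats it on the rows;
+  // the results are scaled by 1 << UNIT_QUANT_SHIFT before being stored.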
+ __m128i op[4]; + op[0] = _mm_cvtepi16_epi32(in[0]); + op[1] = _mm_cvtepi16_epi32(in[1]); + op[2] = _mm_cvtepi16_epi32(in[2]); + op[3] = _mm_cvtepi16_epi32(in[3]); + + for (int i = 0; i < 2; ++i) { + __m128i a1 = op[0]; + __m128i b1 = op[1]; + __m128i c1 = op[2]; + __m128i d1 = op[3]; + __m128i e1; + + a1 = _mm_add_epi32(a1, b1); // a1 += b1 + d1 = _mm_sub_epi32(d1, c1); // d1 = d1 - c1 + e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1 + e1 = _mm_srai_epi32(e1, 1); + b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1 + c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1 + a1 = _mm_sub_epi32(a1, c1); // a1 -= c1 + d1 = _mm_add_epi32(d1, b1); // d1 += b1 + + op[0] = a1; + op[1] = c1; + op[2] = d1; + op[3] = b1; + + if (i == 0) { + transpose_32bit_4x4(op, op); + } + } + + op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT); + op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT); + op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT); + op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT); + + _mm_storeu_si128((__m128i *)(output + 0), op[0]); + _mm_storeu_si128((__m128i *)(output + 4), op[1]); + _mm_storeu_si128((__m128i *)(output + 8), op[2]); + _mm_storeu_si128((__m128i *)(output + 12), op[3]); +} + +static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr, + int shift) { + if (!flipud) { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + } else { + in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = _mm_shufflelo_epi16(in[0], 0x1b); + in[1] = _mm_shufflelo_epi16(in[1], 0x1b); + in[2] = _mm_shufflelo_epi16(in[2], 0x1b); + in[3] = _mm_shufflelo_epi16(in[3], 0x1b); + } + + in[0] = _mm_cvtepi16_epi32(in[0]); + in[1] = _mm_cvtepi16_epi32(in[1]); + in[2] = _mm_cvtepi16_epi32(in[2]); + in[3] = _mm_cvtepi16_epi32(in[3]); + + in[0] = _mm_slli_epi32(in[0], shift); + in[1] = _mm_slli_epi32(in[1], shift); + in[2] = _mm_slli_epi32(in[2], shift); + in[3] = _mm_slli_epi32(in[3], shift); +} + +// We only use stage-2 bit; +// shift[0] is used in load_buffer_4x4() +// shift[1] is used in txfm_func_col() +// shift[2] is used in txfm_func_row() +static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_col) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i s0, s1, s2, s3; + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + int endidx = 3 * num_col; + s0 = _mm_add_epi32(in[0], in[endidx]); + s3 = _mm_sub_epi32(in[0], in[endidx]); + endidx -= num_col; + s1 = _mm_add_epi32(in[num_col], in[endidx]); + s2 = _mm_sub_epi32(in[num_col], in[endidx]); + + // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit); + u0 = _mm_mullo_epi32(s0, cospi32); + u1 = _mm_mullo_epi32(s1, cospi32); + u2 = _mm_add_epi32(u0, u1); + v0 = _mm_sub_epi32(u0, u1); + + u3 = _mm_add_epi32(u2, rnding); + v1 = _mm_add_epi32(v0, rnding); + + u0 = _mm_srai_epi32(u3, bit); + u2 = _mm_srai_epi32(v1, bit); + + // 
btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit); + v0 = _mm_mullo_epi32(s2, cospi48); + v1 = _mm_mullo_epi32(s3, cospi16); + v2 = _mm_add_epi32(v0, v1); + + v3 = _mm_add_epi32(v2, rnding); + u1 = _mm_srai_epi32(v3, bit); + + v0 = _mm_mullo_epi32(s2, cospi16); + v1 = _mm_mullo_epi32(s3, cospi48); + v2 = _mm_sub_epi32(v1, v0); + + v3 = _mm_add_epi32(v2, rnding); + u3 = _mm_srai_epi32(v3, bit); + + // Note: shift[1] and shift[2] are zeros + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; +} + +static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) { + _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); + _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); + _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); + _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); +} + +static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_col) { + const int32_t *sinpi = sinpi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, x2, x3; + __m128i u0, u1, u2, u3; + + int idx = 0 * num_col; + s0 = _mm_mullo_epi32(in[idx], sinpi1); + s1 = _mm_mullo_epi32(in[idx], sinpi4); + t = _mm_add_epi32(in[idx], in[idx + num_col]); + idx += num_col; + s2 = _mm_mullo_epi32(in[idx], sinpi2); + s3 = _mm_mullo_epi32(in[idx], sinpi1); + idx += num_col; + s4 = _mm_mullo_epi32(in[idx], sinpi3); + idx += num_col; + s5 = _mm_mullo_epi32(in[idx], sinpi4); + s6 = _mm_mullo_epi32(in[idx], sinpi2); + s7 = _mm_sub_epi32(t, in[idx]); + + t = _mm_add_epi32(s0, s2); + x0 = _mm_add_epi32(t, s5); + x1 = _mm_mullo_epi32(s7, sinpi3); + t = _mm_sub_epi32(s1, s3); + x2 = _mm_add_epi32(t, s6); + x3 = s4; + + s0 = _mm_add_epi32(x0, x3); + s1 = x1; + s2 = _mm_sub_epi32(x2, x3); + t = _mm_sub_epi32(x2, x0); + s3 = _mm_add_epi32(t, x3); + + u0 = _mm_add_epi32(s0, rnding); + u0 = _mm_srai_epi32(u0, bit); + + u1 = _mm_add_epi32(s1, rnding); + u1 = _mm_srai_epi32(u1, bit); + + u2 = _mm_add_epi32(s2, rnding); + u2 = _mm_srai_epi32(u2, bit); + + u3 = _mm_add_epi32(s3, rnding); + u3 = _mm_srai_epi32(u3, bit); + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; +} +static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + __m128i fact = _mm_set1_epi32(NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a_low; + + for (int i = 0; i < 4; i++) { + a_low = _mm_mullo_epi32(in[i * col_num], fact); + a_low = _mm_add_epi32(a_low, offset); + out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); + } +} +void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff, + int input_stride, TX_TYPE tx_type, int bd) { + __m128i in[4]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + 
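+      // Each case follows the same pattern: load the block (flipping rows or
+      // columns for the FLIPADST variants), apply the column transform,
+      // transpose, apply the row transform, then store the coefficients.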
transpose_32bit_4x4(in, in); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case DCT_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case IDTX: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + write_buffer_4x4(in, 
coeff); + break; + case V_FLIPADST: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + transpose_32bit_4x4(in, in); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + default: assert(0); + } + (void)bd; +} + +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr, + int shift) { + __m128i u; + if (!flipud) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } else { + in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); + } + + u = _mm_unpackhi_epi64(in[4], in[4]); + in[8] = _mm_cvtepi16_epi32(in[4]); + in[9] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[5], in[5]); + in[10] = _mm_cvtepi16_epi32(in[5]); + in[11] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[6], in[6]); + in[12] = _mm_cvtepi16_epi32(in[6]); + in[13] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[7], in[7]); + in[14] = _mm_cvtepi16_epi32(in[7]); + in[15] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[3], in[3]); + in[6] = _mm_cvtepi16_epi32(in[3]); + in[7] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[2], in[2]); + in[4] = _mm_cvtepi16_epi32(in[2]); + in[5] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[1], in[1]); + in[2] = _mm_cvtepi16_epi32(in[1]); + in[3] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[0], in[0]); + in[0] = _mm_cvtepi16_epi32(in[0]); + in[1] = _mm_cvtepi16_epi32(u); + + in[0] = _mm_slli_epi32(in[0], shift); + in[1] = _mm_slli_epi32(in[1], shift); + in[2] = _mm_slli_epi32(in[2], shift); + in[3] = _mm_slli_epi32(in[3], shift); + in[4] = _mm_slli_epi32(in[4], shift); + in[5] = _mm_slli_epi32(in[5], shift); + in[6] = _mm_slli_epi32(in[6], shift); + in[7] = _mm_slli_epi32(in[7], shift); + + in[8] = _mm_slli_epi32(in[8], shift); + in[9] = _mm_slli_epi32(in[9], shift); + in[10] = _mm_slli_epi32(in[10], shift); + in[11] = _mm_slli_epi32(in[11], shift); + in[12] = 
_mm_slli_epi32(in[12], shift); + in[13] = _mm_slli_epi32(in[13], shift); + in[14] = _mm_slli_epi32(in[14], shift); + in[15] = _mm_slli_epi32(in[15], shift); +} + +static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = _mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + in[8] = _mm_add_epi32(in[8], rounding); + in[9] = _mm_add_epi32(in[9], rounding); + in[10] = _mm_add_epi32(in[10], rounding); + in[11] = _mm_add_epi32(in[11], rounding); + in[12] = _mm_add_epi32(in[12], rounding); + in[13] = _mm_add_epi32(in[13], rounding); + in[14] = _mm_add_epi32(in[14], rounding); + in[15] = _mm_add_epi32(in[15], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); + in[8] = _mm_srai_epi32(in[8], shift); + in[9] = _mm_srai_epi32(in[9], shift); + in[10] = _mm_srai_epi32(in[10], shift); + in[11] = _mm_srai_epi32(in[11], shift); + in[12] = _mm_srai_epi32(in[12], shift); + in[13] = _mm_srai_epi32(in[13], shift); + in[14] = _mm_srai_epi32(in[14], shift); + in[15] = _mm_srai_epi32(in[15], shift); +} + +static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = _mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); +} + +static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) { + _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); + _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); + _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); + _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); + + _mm_store_si128((__m128i *)(output + 4 * 4), res[4]); + _mm_store_si128((__m128i *)(output + 5 * 4), res[5]); + _mm_store_si128((__m128i *)(output + 6 * 4), res[6]); + _mm_store_si128((__m128i *)(output + 7 * 4), res[7]); + + _mm_store_si128((__m128i *)(output + 8 * 4), res[8]); + _mm_store_si128((__m128i *)(output + 9 * 4), res[9]); + _mm_store_si128((__m128i *)(output + 10 * 4), res[10]); + _mm_store_si128((__m128i *)(output + 11 * 4), res[11]); + + _mm_store_si128((__m128i *)(output + 12 * 4), res[12]); + _mm_store_si128((__m128i *)(output + 13 * 4), res[13]); + _mm_store_si128((__m128i *)(output + 14 * 4), res[14]); + _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); +} + +static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output, + const int stride) { + _mm_storeu_si128((__m128i 
*)(output), res[0]); + _mm_storeu_si128((__m128i *)(output + 4), res[1]); + _mm_storeu_si128((__m128i *)(output + stride), res[2]); + _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]); + + _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]); + _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]); + _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]); + _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]); + + _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]); + _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]); + _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]); + _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]); + + _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]); + _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]); + _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]); + _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]); +} + +static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[8], v[8]; + + int startidx = 0 * col_num; + int endidx = 7 * col_num; + // Even 8 points 0, 2, ..., 14 + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[startidx], in[endidx]); + v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7] + startidx += col_num; + endidx -= col_num; + u[1] = _mm_add_epi32(in[startidx], in[endidx]); + u[6] = _mm_sub_epi32(in[startidx], in[endidx]); + startidx += col_num; + endidx -= col_num; + u[2] = _mm_add_epi32(in[startidx], in[endidx]); + u[5] = _mm_sub_epi32(in[startidx], in[endidx]); + startidx += col_num; + endidx -= col_num; + u[3] = _mm_add_epi32(in[startidx], in[endidx]); + v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4] + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[3]); + v[3] = _mm_sub_epi32(u[0], u[3]); + v[1] = _mm_add_epi32(u[1], u[2]); + v[2] = _mm_sub_epi32(u[1], u[2]); + + v[5] = _mm_mullo_epi32(u[5], cospim32); + v[6] = _mm_mullo_epi32(u[6], cospi32); + v[5] = _mm_add_epi32(v[5], v[6]); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + u[0] = _mm_mullo_epi32(u[5], cospi32); + v[6] = _mm_mullo_epi32(u[6], cospim32); + v[6] = _mm_sub_epi32(u[0], v[6]); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm_mullo_epi32(v[0], cospi32); + v[1] = _mm_mullo_epi32(v[1], cospi32); + u[0] = _mm_add_epi32(v[0], v[1]); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_sub_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm_mullo_epi32(v[2], cospi48); + v[1] = _mm_mullo_epi32(v[3], cospi16); + u[2] = _mm_add_epi32(v[0], v[1]); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + v[0] = _mm_mullo_epi32(v[2], cospi16); + v[1] = _mm_mullo_epi32(v[3], cospi48); + u[3] = _mm_sub_epi32(v[1], v[0]); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = 
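+ // Butterfly sketch: each rotation in this function follows the same
+ // fixed-point pattern,
+ //   t = (a * cospi[i] + b * cospi[j] + (1 << (bit - 1))) >> bit,
+ // with cospi_arr(bit) supplying scaled cosine constants (nominally
+ // cos(i * pi / 64) * 2^bit), so every stage stays in 32-bit precision after
+ // the rounded shift by `bit`.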
_mm_srai_epi32(u[3], bit); + + u[4] = _mm_add_epi32(v[4], v[5]); + u[5] = _mm_sub_epi32(v[4], v[5]); + u[6] = _mm_sub_epi32(v[7], v[6]); + u[7] = _mm_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm_mullo_epi32(u[4], cospi56); + v[1] = _mm_mullo_epi32(u[7], cospi8); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[1 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm_mullo_epi32(u[4], cospi8); + v[1] = _mm_mullo_epi32(u[7], cospi56); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm_mullo_epi32(u[5], cospi24); + v[1] = _mm_mullo_epi32(u[6], cospi40); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm_mullo_epi32(u[5], cospi40); + v[1] = _mm_mullo_epi32(u[6], cospi24); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6] + + out[0 * col_num] = u[0]; // buf0[0] + out[4 * col_num] = u[1]; // buf0[1] + out[2 * col_num] = u[2]; // buf0[2] + out[6 * col_num] = u[3]; // buf0[3] +} + +static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + fdct4x8_sse4_1(in, out, bit, col_num); + fdct4x8_sse4_1(in + 1, out + 1, bit, col_num); +} + +static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
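+ // In other words (sketch): with 4 lanes per __m128i, an 8x8 block of 32-bit
+ // coefficients occupies 16 registers; index 2*r is the left half (columns
+ // 0..3) and 2*r+1 the right half (columns 4..7) of row r, which is why the
+ // loop below walks col_num = 2 interleaved register "columns".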
+ for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u0 = in[col_num * 0 + col]; + u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]); + u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]); + u3 = in[col_num * 4 + col]; + u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]); + u5 = in[col_num * 6 + col]; + u6 = in[col_num * 2 + col]; + u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm_mullo_epi32(u2, cospi32); + y = _mm_mullo_epi32(u3, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + v3 = _mm_sub_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm_mullo_epi32(u6, cospi32); + y = _mm_mullo_epi32(u7, cospi32); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + v7 = _mm_sub_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + u4 = _mm_add_epi32(v4, v6); + u5 = _mm_add_epi32(v5, v7); + u6 = _mm_sub_epi32(v4, v6); + u7 = _mm_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm_mullo_epi32(u4, cospi16); + y = _mm_mullo_epi32(u5, cospi48); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi48); + y = _mm_mullo_epi32(u5, cospim16); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospim48); + y = _mm_mullo_epi32(u7, cospi16); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi16); + y = _mm_mullo_epi32(u7, cospi48); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm_add_epi32(v0, v4); + u1 = _mm_add_epi32(v1, v5); + u2 = _mm_add_epi32(v2, v6); + u3 = _mm_add_epi32(v3, v7); + u4 = _mm_sub_epi32(v0, v4); + u5 = _mm_sub_epi32(v1, v5); + u6 = _mm_sub_epi32(v2, v6); + u7 = _mm_sub_epi32(v3, v7); + + // stage 6 + x = _mm_mullo_epi32(u0, cospi4); + y = _mm_mullo_epi32(u1, cospi60); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + x = _mm_mullo_epi32(u0, cospi60); + y = _mm_mullo_epi32(u1, cospim4); + v1 = _mm_add_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi20); + y = _mm_mullo_epi32(u3, cospi44); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi44); + y = _mm_mullo_epi32(u3, cospim20); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + x = _mm_mullo_epi32(u4, cospi36); + y = _mm_mullo_epi32(u5, cospi28); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi28); + y = _mm_mullo_epi32(u5, cospim36); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospi52); + y = _mm_mullo_epi32(u7, cospi12); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi12); + y = _mm_mullo_epi32(u7, cospim52); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 
= _mm_srai_epi32(v7, bit); + + // stage 7 + out[col_num * 0 + col] = v1; + out[col_num * 1 + col] = v6; + out[col_num * 2 + col] = v3; + out[col_num * 3 + col] = v4; + out[col_num * 4 + col] = v5; + out[col_num * 5 + col] = v2; + out[col_num * 6 + col] = v7; + out[col_num * 7 + col] = v0; + } +} +static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + + for (int i = 0; i < col_num; i += 1) { + out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]); + out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]); + out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]); + out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]); + out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]); + out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]); + out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]); + out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]); + } +} +#if !CONFIG_REALTIME_ONLY +static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + (void)col_num; + for (int j = 0; j < 2; j++) { + out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]); + out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]); + out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]); + out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]); + out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]); + out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]); + out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]); + out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]); + } +} +#endif +void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case ADST_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case ADST_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case 
DCT_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 1, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case IDTX: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case V_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case H_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case V_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case H_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case V_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + case H_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + write_buffer_8x8(out, coeff); + break; + default: assert(0); + } + (void)bd; +} + +// 
Hybrid Transform 16x16 + +static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { + int row_index = 0; + int dst_index = 0; + int src_index = 0; + + // row 0, 1, .., 7 + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 8); + + // row 8, 9, ..., 15 + src_index += 16; + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 16); +} + +static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + __m128i in[64]; + // Load 4 8x8 blocks + const int16_t *topL = input; + const int16_t *topR = input + 8; + const int16_t *botL = input + 8 * stride; + const int16_t *botR = input + 8 * stride + 8; + + const int16_t *tmp; + + if (flipud) { + // Swap left columns + tmp = topL; + topL = botL; + botL = tmp; + // Swap right columns + tmp = topR; + topR = botR; + botR = tmp; + } + + if (fliplr) { + // Swap top rows + tmp = topL; + topL = topR; + topR = tmp; + // Swap bottom rows + tmp = botL; + botL = botR; + botR = tmp; + } + + // load first 8 columns + load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift); + load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift); + + // load second 8 columns + load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift); + load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift); + + convert_8x8_to_16x16(in, out); +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_8x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *topR = input + 4; + + const int16_t *tmp; + + if (fliplr) { + tmp = topL; + topL = topR; + topR = tmp; + } + + load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *topR = input + 8; + + const int16_t *tmp; + + if (fliplr) { + tmp = topL; + topL = topR; + topR = tmp; + } + + load_buffer_8x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 4 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift); +} + +#if !CONFIG_REALTIME_ONLY +static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out, + const int stride, const int 
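+ // Assembly sketch: load_buffer_16x16() above loads four 8x8 sub-blocks
+ // (top-left, top-right, bottom-left, bottom-right) and convert_8x8_to_16x16()
+ // interleaves them so that each 16-wide row occupies four consecutive
+ // __m128i, giving a plain row-major 16x16 layout for the 1-D kernels.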
flipud, + const int fliplr, const int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + load_buffer_4x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift); +} +#endif + +static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift, const int height) { + const int16_t *in = input; + __m128i *output = out; + for (int col = 0; col < height; col++) { + in = input + col * stride; + output = out + col * 8; + load_buffer_4x4(in, output, 4, flipud, fliplr, shift); + load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift); + } +} + +static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[16], v[16], x; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[7]); + v[7] = _mm_sub_epi32(u[0], u[7]); + v[1] = _mm_add_epi32(u[1], u[6]); + v[6] = _mm_sub_epi32(u[1], u[6]); + v[2] = _mm_add_epi32(u[2], u[5]); + v[5] = _mm_sub_epi32(u[2], u[5]); + v[3] = _mm_add_epi32(u[3], u[4]); + v[4] = _mm_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = 
_mm_mullo_epi32(u[10], cospim32); + x = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[13], cospim32); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospim32); + x = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi32); + x = _mm_mullo_epi32(u[12], cospim32); + v[12] = _mm_sub_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[3]); + u[3] = _mm_sub_epi32(v[0], v[3]); + u[1] = _mm_add_epi32(v[1], v[2]); + u[2] = _mm_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm_mullo_epi32(v[5], cospim32); + x = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi32); + x = _mm_mullo_epi32(v[6], cospim32); + u[6] = _mm_sub_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm_add_epi32(v[8], v[11]); + u[11] = _mm_sub_epi32(v[8], v[11]); + u[9] = _mm_add_epi32(v[9], v[10]); + u[10] = _mm_sub_epi32(v[9], v[10]); + u[12] = _mm_sub_epi32(v[15], v[12]); + u[15] = _mm_add_epi32(v[15], v[12]); + u[13] = _mm_sub_epi32(v[14], v[13]); + u[14] = _mm_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm_mullo_epi32(u[0], cospi32); + u[1] = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(u[0], u[1]); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(u[0], u[1]); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(u[2], cospi48); + x = _mm_mullo_epi32(u[3], cospi16); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(u[2], cospi16); + x = _mm_mullo_epi32(u[3], cospi48); + v[3] = _mm_sub_epi32(x, v[3]); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_add_epi32(u[4], u[5]); + v[5] = _mm_sub_epi32(u[4], u[5]); + v[6] = _mm_sub_epi32(u[7], u[6]); + v[7] = _mm_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm_mullo_epi32(u[9], cospim16); + x = _mm_mullo_epi32(u[14], cospi48); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi48); + x = _mm_mullo_epi32(u[14], cospim16); + v[14] = _mm_sub_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospim48); + x = _mm_mullo_epi32(u[13], cospim16); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospim16); + x = _mm_mullo_epi32(u[13], cospim48); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi56); + x = _mm_mullo_epi32(v[7], cospi8); + u[4] = 
_mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[7] = _mm_mullo_epi32(v[4], cospi8); + x = _mm_mullo_epi32(v[7], cospi56); + u[7] = _mm_sub_epi32(x, u[7]); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + u[5] = _mm_mullo_epi32(v[5], cospi24); + x = _mm_mullo_epi32(v[6], cospi40); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi40); + x = _mm_mullo_epi32(v[6], cospi24); + u[6] = _mm_sub_epi32(x, u[6]); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[8] = _mm_add_epi32(v[8], v[9]); + u[9] = _mm_sub_epi32(v[8], v[9]); + u[10] = _mm_sub_epi32(v[11], v[10]); + u[11] = _mm_add_epi32(v[11], v[10]); + u[12] = _mm_add_epi32(v[12], v[13]); + u[13] = _mm_sub_epi32(v[12], v[13]); + u[14] = _mm_sub_epi32(v[15], v[14]); + u[15] = _mm_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi60); + x = _mm_mullo_epi32(u[15], cospi4); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[15] = _mm_mullo_epi32(u[8], cospi4); + x = _mm_mullo_epi32(u[15], cospi60); + v[15] = _mm_sub_epi32(x, v[15]); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + v[9] = _mm_mullo_epi32(u[9], cospi28); + x = _mm_mullo_epi32(u[14], cospi36); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi36); + x = _mm_mullo_epi32(u[14], cospi28); + v[14] = _mm_sub_epi32(x, v[14]); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi44); + x = _mm_mullo_epi32(u[13], cospi20); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi20); + x = _mm_mullo_epi32(u[13], cospi44); + v[13] = _mm_sub_epi32(x, v[13]); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospi12); + x = _mm_mullo_epi32(u[12], cospi52); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi52); + x = _mm_mullo_epi32(u[12], cospi12); + v[12] = _mm_sub_epi32(x, v[12]); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + out[0 * col_num + col] = v[0]; + out[1 * col_num + col] = v[8]; + out[2 * col_num + col] = v[4]; + out[3 * col_num + col] = v[12]; + out[4 * col_num + col] = v[2]; + out[5 * col_num + col] = v[10]; + out[6 * col_num + col] = v[6]; + out[7 * col_num + col] = v[14]; + out[8 * col_num + col] = v[1]; + out[9 * col_num + col] = v[9]; + out[10 * col_num + col] = v[5]; + out[11 * col_num + col] = v[13]; + out[12 * col_num + col] = v[3]; + out[13 * col_num + col] = v[11]; + out[14 * col_num + col] = v[7]; + out[15 * col_num + col] = v[15]; + } +} + +static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_cols) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = 
_mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + + __m128i u[16], v[16], x, y; + int col; + + for (col = 0; col < num_cols; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * num_cols + col]; + u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm_mullo_epi32(u[2], cospi32); + y = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(x, y); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(x, y); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm_mullo_epi32(u[6], cospi32); + y = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(x, y); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(x, y); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(x, y); + v[10] = _mm_add_epi32(v[10], rnding); + 
v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(x, y); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm_mullo_epi32(u[14], cospi32); + y = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(x, y); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(x, y); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[2]); + u[1] = _mm_add_epi32(v[1], v[3]); + u[2] = _mm_sub_epi32(v[0], v[2]); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[4] = _mm_add_epi32(v[4], v[6]); + u[5] = _mm_add_epi32(v[5], v[7]); + u[6] = _mm_sub_epi32(v[4], v[6]); + u[7] = _mm_sub_epi32(v[5], v[7]); + u[8] = _mm_add_epi32(v[8], v[10]); + u[9] = _mm_add_epi32(v[9], v[11]); + u[10] = _mm_sub_epi32(v[8], v[10]); + u[11] = _mm_sub_epi32(v[9], v[11]); + u[12] = _mm_add_epi32(v[12], v[14]); + u[13] = _mm_add_epi32(v[13], v[15]); + u[14] = _mm_sub_epi32(v[12], v[14]); + u[15] = _mm_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + + // stage 5 + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + // stage 7 + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = 
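+ // Helper sketch: half_btf_sse4_1(&c0, &x, &c1, &y, &rnding, bit), defined
+ // elsewhere, is assumed here to compute (x * c0 + y * c1 + (1 << (bit - 1)))
+ // >> bit per lane, i.e. the same rounded butterfly that the earlier stages
+ // spell out with _mm_mullo_epi32 / _mm_add_epi32 / _mm_srai_epi32.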
_mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + + // stage 9 + out[0 * num_cols + col] = v[1]; + out[1 * num_cols + col] = v[14]; + out[2 * num_cols + col] = v[3]; + out[3 * num_cols + col] = v[12]; + out[4 * num_cols + col] = v[5]; + out[5 * num_cols + col] = v[10]; + out[6 * num_cols + col] = v[7]; + out[7 * num_cols + col] = v[8]; + out[8 * num_cols + col] = v[9]; + out[9 * num_cols + col] = v[6]; + out[10 * num_cols + col] = v[11]; + out[11 * num_cols + col] = v[4]; + out[12 * num_cols + col] = v[13]; + out[13 * num_cols + col] = v[2]; + out[14 * num_cols + col] = v[15]; + out[15 * num_cols + col] = v[0]; + } +} + +static void col_txfm_16x16_rounding(__m128i *in, int shift) { + // Note: + // We split 16x16 rounding into 4 sections of 8x8 rounding, + // instead of 4 columns + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); + col_txfm_8x8_rounding(&in[32], shift); + col_txfm_8x8_rounding(&in[48], shift); +} + +static void col_txfm_8x16_rounding(__m128i *in, int shift) { + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); +} + +static void write_buffer_16x16(const __m128i *in, int32_t *output) { + const int size_8x8 = 16 * 4; + write_buffer_8x8(&in[0], output); + output += size_8x8; + write_buffer_8x8(&in[16], output); + output += size_8x8; + write_buffer_8x8(&in[32], output); + output += size_8x8; + write_buffer_8x8(&in[48], output); +} +static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a_low; + + int num_iters = 16 * col_num; + for (int i = 0; i < num_iters; i++) { + a_low = _mm_mullo_epi32(in[i], fact); + a_low = _mm_add_epi32(a_low, offset); + out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); + } +} +void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64], out[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = 
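+ // Pipeline sketch for the 2-D transforms in this file: load (+ optional flip
+ // and shift[0]) -> 1-D column transform -> rounded shift by -shift[1] ->
+ // 32-bit transpose -> 1-D row transform -> write coefficients (plus a
+ // NewSqrt2 rectangular rescale where width != height).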
get_txh_idx(TX_16X16); + const int col_num = 4; + switch (tx_type) { + case DCT_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case ADST_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case DCT_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case ADST_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case FLIPADST_DCT: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case DCT_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_16x16(input, in, stride, 1, 1, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case ADST_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case FLIPADST_ADST: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case IDTX: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case V_DCT: + 
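+ // Sketch: the V_* / H_* cases apply the real 1-D kernel in only one
+ // direction and pair it with idtx16x16_sse4_1, an identity pass that merely
+ // rescales each coefficient by 2 * NewSqrt2 >> NewSqrt2Bits (roughly
+ // 2 * sqrt(2)).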
load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case H_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case V_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case H_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + case V_FLIPADST: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + write_buffer_16x16(out, coeff); + break; + case H_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + write_buffer_16x16(out, coeff); + break; + default: assert(0); + } + (void)bd; +} + +static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; i += 2) in[30 - i] = out[i]; + for (int i = 1; i < size; i += 2) in[size - i] = out[i]; +} + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // FLIPADST_DCT + fdct8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + fdct8x8_sse4_1, // V_DCT + idtx8x8_sse4_1, // H_DCT + fadst8x8_sse4_1, // V_ADST + idtx8x8_sse4_1, // H_ADST + fadst8x8_sse4_1, // V_FLIPADST + idtx8x8_sse4_1 // H_FLIPADST +}; +#if !CONFIG_REALTIME_ONLY +static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST-ADST + idtx32x8_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL, // H_FLIPADST +}; +#endif +static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = { + fdct4x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct4x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // 
FLIPADST_DCT + fdct4x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + fdct4x8_sse4_1, // V_DCT + idtx8x8_sse4_1, // H_DCT + fadst8x8_sse4_1, // V_ADST + idtx8x8_sse4_1, // H_ADST + fadst8x8_sse4_1, // V_FLIPADST + idtx8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fdct16x16_sse4_1, // ADST_DCT + fadst16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fdct16x16_sse4_1, // FLIPADST_DCT + fadst16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + idtx16x16_sse4_1, // V_DCT + fdct16x16_sse4_1, // H_DCT + idtx16x16_sse4_1, // V_ADST + fadst16x16_sse4_1, // H_ADST + idtx16x16_sse4_1, // V_FLIPADST + fadst16x16_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fadst16x16_sse4_1, // ADST_DCT + fdct16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fadst16x16_sse4_1, // FLIPADST_DCT + fdct16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + fdct16x16_sse4_1, // V_DCT + idtx16x16_sse4_1, // H_DCT + fadst16x16_sse4_1, // V_ADST + idtx16x16_sse4_1, // H_ADST + fadst16x16_sse4_1, // V_FLIPADST + idtx16x16_sse4_1 // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fdct8x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct8x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + idtx8x8_sse4_1, // V_DCT + fdct8x8_sse4_1, // H_DCT + idtx8x8_sse4_1, // V_ADST + fadst8x8_sse4_1, // H_ADST + idtx8x8_sse4_1, // V_FLIPADST + fadst8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = { + fdct4x8_sse4_1, // DCT_DCT + fdct4x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct4x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + idtx8x8_sse4_1, // V_DCT + fdct4x8_sse4_1, // H_DCT + idtx8x8_sse4_1, // V_ADST + fadst8x8_sse4_1, // H_ADST + idtx8x8_sse4_1, // V_FLIPADST + fadst8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = { + fdct4x4_sse4_1, // DCT_DCT + fdct4x4_sse4_1, // ADST_DCT + fadst4x4_sse4_1, // DCT_ADST + fadst4x4_sse4_1, // ADST_ADST + fdct4x4_sse4_1, // FLIPADST_DCT + fadst4x4_sse4_1, // DCT_FLIPADST + fadst4x4_sse4_1, // FLIPADST_FLIPADST + fadst4x4_sse4_1, // ADST_FLIPADST + fadst4x4_sse4_1, // FLIPADST_ADST + idtx4x4_sse4_1, // IDTX + idtx4x4_sse4_1, // V_DCT + fdct4x4_sse4_1, // H_DCT + idtx4x4_sse4_1, // V_ADST + fadst4x4_sse4_1, // H_ADST + idtx4x4_sse4_1, // V_FLIPADST + fadst4x4_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = { + fdct4x4_sse4_1, // DCT_DCT + fadst4x4_sse4_1, // ADST_DCT + fdct4x4_sse4_1, // DCT_ADST + fadst4x4_sse4_1, // ADST_ADST + 
fadst4x4_sse4_1, // FLIPADST_DCT + fdct4x4_sse4_1, // DCT_FLIPADST + fadst4x4_sse4_1, // FLIPADST_FLIPADST + fadst4x4_sse4_1, // ADST_FLIPADST + fadst4x4_sse4_1, // FLIPADST_ADST + idtx4x4_sse4_1, // IDTX + fdct4x4_sse4_1, // V_DCT + idtx4x4_sse4_1, // H_DCT + fadst4x4_sse4_1, // V_ADST + idtx4x4_sse4_1, // H_ADST + fadst4x4_sse4_1, // V_FLIPADST + idtx4x4_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = { + av1_fdct32_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + av1_idtx32_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x8_rounding(in, -shift[1]); + transpose_8x8(in, out + i * 16); + } + + if (lr_flip) { + flip_buf_sse4_1(in, out, 32); + row_txfm(in, out, bit, 2); + } else { + row_txfm(out, out, bit, 2); + } + + for (int i = 0; i < 2; i++) { + av1_round_shift_rect_array_32_sse4_1(out + i * 16, in, 16, -shift[2], + NewSqrt2); + write_buffer_8x8(in, coeff + i * 64); + } + (void)bd; +} + +void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x16_rounding(in, -shift[1]); + transpose_8x8(in, out); + transpose_8x8(in + 16, out + 16); + + for (int i = 0; i < 2; i++) { + row_txfm(out + i * 16, out, bit, 2); + av1_round_shift_rect_array_32_sse4_1(out, out, 16, -shift[2], NewSqrt2); + write_buffer_16x8(out, coeff + i * 8, 16); + } + (void)bd; +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = 
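+ // Rectangular-scaling sketch: for the 2:1 sizes above (16x8 and 8x16),
+ // av1_round_shift_rect_array_32_sse4_1(..., NewSqrt2) is assumed to apply a
+ // NewSqrt2 / 2^NewSqrt2Bits (roughly sqrt(2)) correction together with the
+ // final -shift[2] rounding, keeping the overall 2-D scaling consistent with
+ // the square sizes.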
av1_fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int txfm_size_col = tx_size_wide[TX_4X16]; + const int txfm_size_row = tx_size_high[TX_4X16]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col transform + load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, outcoeff128, bitcol, 1); + col_txfm_8x8_rounding(outcoeff128, -shift[1]); + transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < 4; i++) { + __m128i tmp[4]; + row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2); + store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col); + } + (void)bd; +} +#endif + +void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int txfm_size_col = tx_size_wide[TX_16X4]; + const int txfm_size_row = tx_size_high[TX_16X4]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // col transform + load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]); + + for (int i = 0; i < (txfm_size_col >> 2); i++) { + __m128i *cur_in = &in[i * txfm_size_row]; + col_txfm(cur_in, cur_in, bitcol, 1); + transpose_32bit_4x4(cur_in, cur_in); + } + col_txfm_8x8_rounding(in, -shift[1]); + + // row transform + row_txfm(in, outcoeff128, bitrow, 1); + (void)bd; +} + +void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[128]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + // column transform + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]); + + for (int i = 0; i < 4; i++) { + col_txfm((in + i), (in + i), bitcol, 4); + } + col_txfm_16x16_rounding(&in[0], -shift[1]); + col_txfm_16x16_rounding(&in[64], -shift[1]); + transpose_8nx8n(in, outcoef128, 16, 32); + + // row transform + row_txfm(outcoef128, in, bitrow, 8); + av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + (void)tx_type; + __m128i in[512]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64]; + const int txw_idx = 
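+ // Sketch: this path and the 64x32 one below ignore tx_type ((void)tx_type),
+ // presumably because only DCT_DCT reaches these sizes; the column pass runs
+ // av1_fdct64_sse4_1 / av1_fdct32_sse4_1 over 4-lane column groups, the row
+ // pass runs the other kernel, and a NewSqrt2 rectangular rescale follows.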
get_txw_idx(TX_32X64); + const int txh_idx = get_txh_idx(TX_32X64); + const int txfm_size_col = tx_size_wide[TX_32X64]; + const int txfm_size_row = tx_size_high[TX_32X64]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int num_row = txfm_size_row >> 2; + const int num_col = txfm_size_col >> 2; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row); + for (int i = 0; i < num_col; i++) { + av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col); + } + for (int i = 0; i < num_col; i++) { + col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]); + } + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_row; i++) { + av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row); + } + for (int i = 0; i < txfm_size_col; i++) { + av1_round_shift_rect_array_32_sse4_1(in + i * 16, outcoef128 + i * 8, 8, + -shift[2], NewSqrt2); + } + (void)bd; +} + +void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + (void)tx_type; + __m128i in[512]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32]; + const int txw_idx = get_txw_idx(TX_64X32); + const int txh_idx = get_txh_idx(TX_64X32); + const int txfm_size_col = tx_size_wide[TX_64X32]; + const int txfm_size_row = tx_size_high[TX_64X32]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int num_row = txfm_size_row >> 2; + const int num_col = txfm_size_col >> 2; + + // column transform + for (int i = 0; i < 32; i++) { + load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]); + load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0, + shift[0]); + load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0, + shift[0]); + load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0, + shift[0]); + } + + for (int i = 0; i < num_col; i++) { + av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col); + } + + for (int i = 0; i < num_row; i++) { + col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]); + } + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_row; i++) { + av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row); + } + av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 512, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[128]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16); + col_txfm(in, in, bitcol, 8); + col_txfm_16x16_rounding(&in[0], -shift[1]); + col_txfm_16x16_rounding(&in[64], -shift[1]); + transpose_8nx8n(in, outcoef128, 32, 16); + + // row transform + for (int i = 0; i < 4; i++) { + row_txfm((outcoef128 + i), (in + i), bitrow, 4); + } + av1_round_shift_rect_array_32_sse4_1(in, 
outcoef128, 128, -shift[2], + NewSqrt2); + (void)bd; +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + const int txfm_size_col = tx_size_wide[TX_8X32]; + const int txfm_size_row = tx_size_high[TX_8X32]; + const int num_col = txfm_size_col >> 2; + + // column transform + load_buffer_8x16(input, in, stride, 0, 0, shift[0]); + load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row, + stride, 0, 0, shift[0]); + + for (int i = 0; i < num_col; i++) { + col_txfm((in + i), (in + i), bitcol, num_col); + } + col_txfm_16x16_rounding(in, -shift[1]); + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < txfm_size_col; i += 2) { + row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col); + } + (void)bd; +} + +void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + const int txfm_size_col = tx_size_wide[TX_32X8]; + const int txfm_size_row = tx_size_high[TX_32X8]; + const int num_col = txfm_size_row >> 2; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8); + for (int i = 0; i < txfm_size_row; i += 2) { + col_txfm((in + i), (in + i), bitcol, txfm_size_row); + } + + col_txfm_16x16_rounding(&in[0], -shift[1]); + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_col; i++) { + row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col); + } + (void)bd; +} +#endif + +void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int txfm_size_col = tx_size_wide[TX_4X8]; + const int txfm_size_row = tx_size_high[TX_4X8]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bitcol, 1); + col_txfm_4x8_rounding(in, -shift[1]); + + for (int i = 0; i < 2; i++) { + __m128i *cur_in = &in[i * 4]; + transpose_32bit_4x4(cur_in, cur_in); + row_txfm(cur_in, cur_in, bitrow, 1); + 
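+    // The 4x8 block is rectangular, so the row output below is rescaled by a
+    // sqrt(2) factor in fixed point (NewSqrt2 is a Q12 approximation of
+    // sqrt(2)) to keep the overall 2-D transform gain consistent with the
+    // square block sizes.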
av1_round_shift_rect_array_32_sse4_1(cur_in, cur_in, txfm_size_col, + -shift[2], NewSqrt2); + store_output_w4(coeff + i * 4, cur_in, txfm_size_row, 4); + } + (void)bd; +} + +void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[8]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int txfm_size_col = tx_size_wide[TX_8X4]; + const int txfm_size_row = tx_size_high[TX_8X4]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col tranform + load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]); + for (int i = 0; i < 2; i++) { + __m128i *cur_in = &in[i * txfm_size_row]; + col_txfm(cur_in, cur_in, bitcol, 1); + transpose_32bit_4x4(cur_in, cur_in); + } + col_txfm_4x8_rounding(in, -shift[1]); + + // row tranform + row_txfm(in, outcoeff128, bitrow, 1); + av1_round_shift_rect_array_32_sse4_1(outcoeff128, outcoeff128, txfm_size_col, + -shift[2], NewSqrt2); + (void)bd; +} + +#if !CONFIG_REALTIME_ONLY +void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[256]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64]; + const int txw_idx = get_txw_idx(TX_16X64); + const int txh_idx = get_txh_idx(TX_16X64); + const int txfm_size_col = tx_size_wide[TX_16X64]; + const int txfm_size_row = tx_size_high[TX_16X64]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int num_col = txfm_size_col >> 2; + // col tranform + for (int i = 0; i < txfm_size_row; i += num_col) { + load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + } + + for (int i = 0; i < num_col; i++) { + av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col); + } + + col_txfm_16x16_rounding(outcoeff128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); + + transpose_8nx8n(outcoeff128, in, txfm_size_col, 32); + fdct16x16_sse4_1(in, outcoeff128, bitrow, 8); + (void)bd; +} + +void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[256]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16]; + const int txw_idx = get_txw_idx(TX_64X16); + const int txh_idx = get_txh_idx(TX_64X16); + const int txfm_size_col = tx_size_wide[TX_64X16]; + const int txfm_size_row = tx_size_high[TX_64X16]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + 
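+  // For 64-point transforms only the 32 lowest-frequency outputs are
+  // retained, so the transpose and row pass below operate on a 16x32 block
+  // of coefficients.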
int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col tranform + for (int i = 0; i < txfm_size_row; i++) { + load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + } + + fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row); + col_txfm_16x16_rounding(outcoeff128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); + + transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); + for (int i = 0; i < 4; i++) { + av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4); + } + memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff)); + (void)bd; +} +#endif diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c new file mode 100644 index 0000000000..ca448ca37b --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/av1_rtcd.h" +#include "aom_dsp/mathutils.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +#define SSE_STRIDE (BW + 4) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = { + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 }, + { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, + { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 }, + { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } +}; + +static AOM_FORCE_INLINE void get_squared_error_16x16_avx2( + const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint32_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint16_t *src1 = frame1; + const uint16_t *src2 = frame2; + uint32_t *dst = frame_sse + 2; + for (int i = 0; i < block_height; i++) { + __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1); + __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2); + __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2); + __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff); + __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff); + + __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi); + __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi); + __m256i diff_lo = + _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1); + __m256i diff_hi = + _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0); + + _mm256_storeu_si256((__m256i *)dst, diff_lo); + dst += 8; + _mm256_storeu_si256((__m256i *)dst, diff_hi); + + src1 += stride, src2 += stride2; + dst += sse_stride - 8; + } +} + +static AOM_FORCE_INLINE void get_squared_error_32x32_avx2( + const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint32_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint16_t *src1 = frame1; + const uint16_t *src2 = frame2; + uint32_t *dst = frame_sse + 2; + for (int i = 0; i < block_height; i++) { + __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1); + __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2); + __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2); + __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff); + __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff); + + __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi); + __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi); + __m256i diff_lo = + _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1); + __m256i diff_hi = + _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0); + + _mm256_storeu_si256((__m256i *)dst, diff_lo); + _mm256_storeu_si256((__m256i *)(dst + 8), diff_hi); + + v_src1 = _mm256_loadu_si256((__m256i *)(src1 + 16)); + v_src2 = _mm256_loadu_si256((__m256i *)(src2 + 16)); + v_diff = _mm256_sub_epi16(v_src1, v_src2); + v_mullo = _mm256_mullo_epi16(v_diff, v_diff); + v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff); + + v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi); + v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi); + diff_lo = + _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1); + diff_hi = + _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0); + + _mm256_storeu_si256((__m256i *)(dst + 16), diff_lo); + _mm256_storeu_si256((__m256i *)(dst + 24), diff_hi); + + src1 += stride; + src2 += stride2; +
dst += sse_stride; + } +} + +static AOM_FORCE_INLINE void xx_load_and_pad_left(uint32_t *src, + __m256i *v256tmp) { + *v256tmp = _mm256_loadu_si256((__m256i *)src); + // For the first column, replicate the first element twice to the left + __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0xEA); + *v256tmp = _mm256_inserti128_si256(*v256tmp, + _mm256_extracti128_si256(v256tmp1, 0), 0); +} + +static AOM_FORCE_INLINE void xx_load_and_pad_right(uint32_t *src, + __m256i *v256tmp) { + *v256tmp = _mm256_loadu_si256((__m256i *)src); + // For the last column, replicate the last element twice to the right + __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0x54); + *v256tmp = _mm256_inserti128_si256(*v256tmp, + _mm256_extracti128_si256(v256tmp1, 1), 1); +} + +static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) { + // Mask the required 5 values inside the vector + __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]); + __m128i v128a, v128b; + // Extract 256b as two 128b registers A and B + v128a = _mm256_castsi256_si128(vtmp); + v128b = _mm256_extracti128_si256(vtmp, 1); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A2+B2, A3+B3, 0, 0] + v128b = _mm_srli_si128(v128a, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A1+B1+A3+B3, 0, 0, 0] + v128b = _mm_srli_si128(v128a, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + v128a = _mm_add_epi32(v128a, v128b); + return _mm_extract_epi32(v128a, 0); +} + +static void highbd_apply_temporal_filter( + const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const int *subblock_mses, unsigned int *accumulator, uint16_t *count, + uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd, + const double inv_num_ref_pixels, const double decay_factor, + const double inv_factor, const double weight_factor, double *d_factor, + int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_sse[BH][BW]; + + if (block_width == 32) { + get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } else { + get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } + + __m256i vsrc[5]; + + // Traverse 4 columns at a time + // First and last columns will require padding + int col; + uint32_t *src = frame_sse; + for (int i = 2; i < 5; i++) { + xx_load_and_pad_left(src, &vsrc[i]); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height - 3; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + xx_load_and_pad_left(src, &vsrc[4]); + src += SSE_STRIDE; + + acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3); + } + for (int row = block_height - 3; row < block_height; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = 
_mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3); + } + for (col = 4; col < block_width - 4; col += 4) { + src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + vsrc[i] = _mm256_loadu_si256((__m256i *)src); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height - 3; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + vsrc[4] = _mm256_loadu_si256((__m256i *)src); + + src += SSE_STRIDE; + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); + } + for (int row = block_height - 3; row < block_height; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); + } + } + + src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + xx_load_and_pad_right(src, &vsrc[i]); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height - 3; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + xx_load_and_pad_right(src, &vsrc[4]); + src += SSE_STRIDE; + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); + } + for (int row = block_height - 3; row < block_height; row++) { + __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]); + __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]); + __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2); + __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]); + + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3); + } + + double subblock_mses_scaled[4]; + double d_factor_decayed[4]; + for (int idx = 0; idx < 4; idx++) { + subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; + d_factor_decayed[idx] = d_factor[idx] * 
decay_factor; + } + if (tf_wgt_calc_lvl == 0) { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + // Scale down the difference for high bit depth input. + diff_sse >>= ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } else { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + // Scale down the difference for high bit depth input. + diff_sse >>= ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } +} + +void av1_highbd_apply_temporal_filter_avx2( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. 
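+    // For example, at the maximum q_factor of 255 this branch gives
+    // 0.5 * (255 / 64)^2 ~= 7.9, which stays below that bound.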
+ q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint32_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred); + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint16_t *ref = + CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0, k = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++, k++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; + } + } + } + } + } + + highbd_apply_temporal_filter( + ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h, + subblock_mses, accum + plane_offset, count + plane_offset, frame_sse, + luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + plane_offset += plane_h * plane_w; + } +} diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c new file mode 100644 index 0000000000..2032847083 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> + +#include "config/av1_rtcd.h" +#include "aom_dsp/mathutils.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +// For the squared error buffer, keep a padding for 4 samples +#define SSE_STRIDE (BW + 4) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = { + { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } +}; + +static void get_squared_error(const uint16_t *frame1, const unsigned int stride, + const uint16_t *frame2, + const unsigned int stride2, const int block_width, + const int block_height, uint32_t *frame_sse, + const unsigned int dst_stride) { + const uint16_t *src1 = frame1; + const uint16_t *src2 = frame2; + uint32_t *dst = frame_sse; + + for (int i = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j += 8) { + __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j)); + __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j)); + + __m128i vdiff = _mm_sub_epi16(vsrc1, vsrc2); + __m128i vmullo = _mm_mullo_epi16(vdiff, vdiff); + __m128i vmullh = _mm_mulhi_epi16(vdiff, vdiff); + + __m128i vres1 = _mm_unpacklo_epi16(vmullo, vmullh); + __m128i vres2 = _mm_unpackhi_epi16(vmullo, vmullh); + + _mm_storeu_si128((__m128i *)(dst + j + 2), vres1); + _mm_storeu_si128((__m128i *)(dst + j + 6), vres2); + } + + src1 += stride; + src2 += stride2; + dst += dst_stride; + } +} + +static void xx_load_and_pad(uint32_t *src, __m128i *dstvec, int col, + int block_width) { + __m128i vtmp1 = _mm_loadu_si128((__m128i *)src); + __m128i vtmp2 = _mm_loadu_si128((__m128i *)(src + 4)); + // For the first column, replicate the first element twice to the left + dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA); + // For the last column, replicate the last element twice to the right + dstvec[1] = (col < block_width - 4) ?
vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54); +} + +static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) { + __m128i veca, vecb; + // Mask and obtain the required 5 values inside the vector + veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]); + vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + veca = _mm_add_epi32(veca, vecb); + // B = [A2+B2, A3+B3, 0, 0] + vecb = _mm_srli_si128(veca, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + veca = _mm_add_epi32(veca, vecb); + // B = [A1+B1+A3+B3, 0, 0, 0] + vecb = _mm_srli_si128(veca, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + veca = _mm_add_epi32(veca, vecb); + return _mm_cvtsi128_si32(veca); +} + +static void highbd_apply_temporal_filter( + const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const int *subblock_mses, unsigned int *accumulator, uint16_t *count, + uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd, + const double inv_num_ref_pixels, const double decay_factor, + const double inv_factor, const double weight_factor, double *d_factor, + int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_sse[BH][BW]; + + get_squared_error(frame1, stride, frame2, stride2, block_width, block_height, + frame_sse, SSE_STRIDE); + + __m128i vsrc[5][2]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint32_t *src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + xx_load_and_pad(src, vsrc[i], col, block_width); + src += SSE_STRIDE; + } + + // Padding for top 2 rows + vsrc[0][0] = vsrc[2][0]; + vsrc[0][1] = vsrc[2][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + + for (int row = 0; row < block_height - 3; row++) { + __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]); + __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]); + __m128i vsum13 = _mm_add_epi32(vsum11, vsum12); + __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]); + + __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]); + __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]); + __m128i vsum23 = _mm_add_epi32(vsum21, vsum22); + __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]); + + vsrc[0][0] = vsrc[1][0]; + vsrc[0][1] = vsrc[1][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + vsrc[2][0] = vsrc[3][0]; + vsrc[2][1] = vsrc[3][1]; + vsrc[3][0] = vsrc[4][0]; + vsrc[3][1] = vsrc[4][1]; + + // Load next row + xx_load_and_pad(src, vsrc[4], col, block_width); + src += SSE_STRIDE; + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3); + } + for (int row = block_height - 3; row < block_height; row++) { + __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]); + __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]); + __m128i vsum13 = _mm_add_epi32(vsum11, vsum12); + __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]); + + __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]); + __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]); + __m128i vsum23 = _mm_add_epi32(vsum21, vsum22); + __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]); + + 
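+      // vsum1 and vsum2 now hold the 5-row vertical sums of squared errors
+      // for this group of columns; the register rotation below slides the
+      // 5x5 window down by one row for the next iteration.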
vsrc[0][0] = vsrc[1][0]; + vsrc[0][1] = vsrc[1][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + vsrc[2][0] = vsrc[3][0]; + vsrc[2][1] = vsrc[3][1]; + vsrc[3][0] = vsrc[4][0]; + vsrc[3][1] = vsrc[4][1]; + + acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0); + acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1); + acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2); + acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3); + } + } + + double subblock_mses_scaled[4]; + double d_factor_decayed[4]; + for (int idx = 0; idx < 4; idx++) { + subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; + d_factor_decayed[idx] = d_factor[idx] * decay_factor; + } + if (tf_wgt_calc_lvl == 0) { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + // Scale down the difference for high bit depth input. + diff_sse >>= ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } else { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + // Scale down the difference for high bit depth input. 
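+        // e.g. for 10-bit input this shifts by (10 - 8) * 2 = 4, dividing the
+        // accumulated squared error by 16 to bring it back to an 8-bit scale.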
+ diff_sse >>= ((bd - 8) * 2); + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } +} + +void av1_highbd_apply_temporal_filter_sse2( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint32_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred); + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 
0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint16_t *ref = + CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0, k = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++, k++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; + } + } + } + } + } + + highbd_apply_temporal_filter( + ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h, + subblock_mses, accum + plane_offset, count + plane_offset, frame_sse, + luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + plane_offset += plane_h * plane_w; + } +} diff --git a/third_party/aom/av1/encoder/x86/ml_avx2.c b/third_party/aom/av1/encoder/x86/ml_avx2.c new file mode 100644 index 0000000000..6432708416 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/ml_avx2.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "config/av1_rtcd.h" +#include "av1/encoder/ml.h" +#include "av1/encoder/x86/ml_sse3.h" + +#define CALC_OUTPUT_FOR_2ROWS \ + const int index = weight_idx + (2 * i * tot_num_inputs); \ + const __m256 weight0 = _mm256_loadu_ps(&weights[index]); \ + const __m256 weight1 = _mm256_loadu_ps(&weights[index + tot_num_inputs]); \ + const __m256 mul0 = _mm256_mul_ps(inputs256, weight0); \ + const __m256 mul1 = _mm256_mul_ps(inputs256, weight1); \ + hadd[i] = _mm256_hadd_ps(mul0, mul1); + +static INLINE void nn_propagate_8to1( + const float *const inputs, const float *const weights, + const float *const bias, int num_inputs_to_process, int tot_num_inputs, + int num_outputs, float *const output_nodes, int is_clip_required) { + // Process one output row at a time. 
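+  // Scalar reference for what the vector loop below computes per output node:
+  //   float sum = bias[out];
+  //   for (int in = 0; in < num_inputs_to_process; ++in)
+  //     sum += inputs[in] * weights[out * tot_num_inputs + in];
+  //   output_nodes[out] = is_clip_required ? AOMMAX(sum, 0) : sum;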
+ for (int out = 0; out < num_outputs; out++) { + __m256 in_result = _mm256_setzero_ps(); + float bias_val = bias[out]; + for (int in = 0; in < num_inputs_to_process; in += 8) { + const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]); + const int weight_idx = in + (out * tot_num_inputs); + const __m256 weight0 = _mm256_loadu_ps(&weights[weight_idx]); + const __m256 mul0 = _mm256_mul_ps(inputs256, weight0); + in_result = _mm256_add_ps(in_result, mul0); + } + const __m128 low_128 = _mm256_castps256_ps128(in_result); + const __m128 high_128 = _mm256_extractf128_ps(in_result, 1); + const __m128 sum_par_0 = _mm_add_ps(low_128, high_128); + const __m128 sum_par_1 = _mm_hadd_ps(sum_par_0, sum_par_0); + const __m128 sum_tot = + _mm_add_ps(_mm_shuffle_ps(sum_par_1, sum_par_1, 0x99), sum_par_1); + + bias_val += _mm_cvtss_f32(sum_tot); + if (is_clip_required) bias_val = AOMMAX(bias_val, 0); + output_nodes[out] = bias_val; + } +} + +static INLINE void nn_propagate_8to4( + const float *const inputs, const float *const weights, + const float *const bias, int num_inputs_to_process, int tot_num_inputs, + int num_outputs, float *const output_nodes, int is_clip_required) { + __m256 hadd[2]; + for (int out = 0; out < num_outputs; out += 4) { + __m128 bias_reg = _mm_loadu_ps(&bias[out]); + __m128 in_result = _mm_setzero_ps(); + for (int in = 0; in < num_inputs_to_process; in += 8) { + const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]); + const int weight_idx = in + (out * tot_num_inputs); + // Process two output row at a time. + for (int i = 0; i < 2; i++) { + CALC_OUTPUT_FOR_2ROWS + } + + const __m256 sum_par = _mm256_hadd_ps(hadd[0], hadd[1]); + const __m128 low_128 = _mm256_castps256_ps128(sum_par); + const __m128 high_128 = _mm256_extractf128_ps(sum_par, 1); + const __m128 result = _mm_add_ps(low_128, high_128); + + in_result = _mm_add_ps(in_result, result); + } + + in_result = _mm_add_ps(in_result, bias_reg); + if (is_clip_required) in_result = _mm_max_ps(in_result, _mm_setzero_ps()); + _mm_storeu_ps(&output_nodes[out], in_result); + } +} + +static INLINE void nn_propagate_8to8( + const float *const inputs, const float *const weights, + const float *const bias, int num_inputs_to_process, int tot_num_inputs, + int num_outputs, float *const output_nodes, int is_clip_required) { + __m256 hadd[4]; + for (int out = 0; out < num_outputs; out += 8) { + __m256 bias_reg = _mm256_loadu_ps(&bias[out]); + __m256 in_result = _mm256_setzero_ps(); + for (int in = 0; in < num_inputs_to_process; in += 8) { + const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]); + const int weight_idx = in + (out * tot_num_inputs); + // Process two output rows at a time. 
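+      // Each CALC_OUTPUT_FOR_2ROWS step multiplies this 8-wide input chunk
+      // against two consecutive weight rows and pairs the products with a
+      // horizontal add, so four steps cover all eight output rows.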
+ for (int i = 0; i < 4; i++) { + CALC_OUTPUT_FOR_2ROWS + } + const __m256 hh0 = _mm256_hadd_ps(hadd[0], hadd[1]); + const __m256 hh1 = _mm256_hadd_ps(hadd[2], hadd[3]); + + __m256 ht_0 = _mm256_permute2f128_ps(hh0, hh1, 0x20); + __m256 ht_1 = _mm256_permute2f128_ps(hh0, hh1, 0x31); + + __m256 result = _mm256_add_ps(ht_0, ht_1); + in_result = _mm256_add_ps(in_result, result); + } + in_result = _mm256_add_ps(in_result, bias_reg); + if (is_clip_required) + in_result = _mm256_max_ps(in_result, _mm256_setzero_ps()); + _mm256_storeu_ps(&output_nodes[out], in_result); + } +} + +static INLINE void nn_propagate_input_multiple_of_8( + const float *const inputs, const float *const weights, + const float *const bias, int num_inputs_to_process, int tot_num_inputs, + bool is_output_layer, int num_outputs, float *const output_nodes) { + // The saturation of output is considered for hidden layer which is not equal + // to final hidden layer. + const int is_clip_required = + !is_output_layer && num_inputs_to_process == tot_num_inputs; + if (num_outputs % 8 == 0) { + nn_propagate_8to8(inputs, weights, bias, num_inputs_to_process, + tot_num_inputs, num_outputs, output_nodes, + is_clip_required); + } else if (num_outputs % 4 == 0) { + nn_propagate_8to4(inputs, weights, bias, num_inputs_to_process, + tot_num_inputs, num_outputs, output_nodes, + is_clip_required); + } else { + nn_propagate_8to1(inputs, weights, bias, num_inputs_to_process, + tot_num_inputs, num_outputs, output_nodes, + is_clip_required); + } +} + +void av1_nn_predict_avx2(const float *input_nodes, + const NN_CONFIG *const nn_config, int reduce_prec, + float *const output) { + float buf[2][NN_MAX_NODES_PER_LAYER]; + int buf_index = 0; + int num_inputs = nn_config->num_inputs; + assert(num_inputs > 0 && num_inputs <= NN_MAX_NODES_PER_LAYER); + + for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + bool is_output_layer = layer == nn_config->num_hidden_layers; + float *const output_nodes = is_output_layer ? output : &buf[buf_index][0]; + const int num_outputs = is_output_layer + ? nn_config->num_outputs + : nn_config->num_hidden_nodes[layer]; + assert(num_outputs > 0 && num_outputs <= NN_MAX_NODES_PER_LAYER); + + // Process input multiple of 8 using AVX2 intrinsic. + if (num_inputs % 8 == 0) { + nn_propagate_input_multiple_of_8(input_nodes, layer_weights, layer_bias, + num_inputs, num_inputs, is_output_layer, + num_outputs, output_nodes); + } else { + // When number of inputs is not multiple of 8, use hybrid approach of AVX2 + // and SSE3 based on the need. + const int in_mul_8 = num_inputs / 8; + const int num_inputs_to_process = in_mul_8 * 8; + int bias_is_considered = 0; + if (in_mul_8) { + nn_propagate_input_multiple_of_8( + input_nodes, layer_weights, layer_bias, num_inputs_to_process, + num_inputs, is_output_layer, num_outputs, output_nodes); + bias_is_considered = 1; + } + + const float *out_temp = bias_is_considered ? 
output_nodes : layer_bias; + const int input_remaining = num_inputs % 8; + if (input_remaining % 4 == 0 && num_outputs % 8 == 0) { + for (int out = 0; out < num_outputs; out += 8) { + __m128 out_h = _mm_loadu_ps(&out_temp[out + 4]); + __m128 out_l = _mm_loadu_ps(&out_temp[out]); + for (int in = in_mul_8 * 8; in < num_inputs; in += 4) { + av1_nn_propagate_4to8_sse3(&input_nodes[in], + &layer_weights[out * num_inputs + in], + &out_h, &out_l, num_inputs); + } + if (!is_output_layer) { + const __m128 zero = _mm_setzero_ps(); + out_h = _mm_max_ps(out_h, zero); + out_l = _mm_max_ps(out_l, zero); + } + _mm_storeu_ps(&output_nodes[out + 4], out_h); + _mm_storeu_ps(&output_nodes[out], out_l); + } + } else if (input_remaining % 4 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&out_temp[out]); + for (int in = in_mul_8 * 8; in < num_inputs; in += 4) { + av1_nn_propagate_4to4_sse3(&input_nodes[in], + &layer_weights[out * num_inputs + in], + &outputs, num_inputs); + } + if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps()); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (input_remaining % 4 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 outputs = _mm_load1_ps(&out_temp[out]); + for (int in = in_mul_8 * 8; in < num_inputs; in += 4) { + av1_nn_propagate_4to1_sse3(&input_nodes[in], + &layer_weights[out * num_inputs + in], + &outputs); + } + if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps()); + output_nodes[out] = _mm_cvtss_f32(outputs); + } + } else { + // Use SSE instructions for scalar operations to avoid the latency + // of swapping between SIMD and FPU modes. + for (int out = 0; out < num_outputs; out++) { + __m128 outputs = _mm_load1_ps(&out_temp[out]); + for (int in_node = in_mul_8 * 8; in_node < num_inputs; in_node++) { + __m128 input = _mm_load1_ps(&input_nodes[in_node]); + __m128 weight = + _mm_load1_ps(&layer_weights[num_inputs * out + in_node]); + outputs = _mm_add_ps(outputs, _mm_mul_ps(input, weight)); + } + if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps()); + output_nodes[out] = _mm_cvtss_f32(outputs); + } + } + } + // Before processing the next layer, treat the output of current layer as + // input to next layer. + input_nodes = output_nodes; + num_inputs = num_outputs; + buf_index = 1 - buf_index; + } + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); +} diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.c b/third_party/aom/av1/encoder/x86/ml_sse3.c new file mode 100644 index 0000000000..4748a68d38 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/ml_sse3.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/av1_rtcd.h" +#include "av1/encoder/ml.h" +#include "av1/encoder/x86/ml_sse3.h" + +// In order to avoid the high-latency of swapping between FPU and SIMD +// operations, we keep the result in a 128-bit register even though we only +// care about a single value. 
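+// Bracketed comments such as [3+2 1+0 3+2 1+0] below track which input and
+// weight indices each SIMD lane has accumulated after a horizontal-add step.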
+static void nn_propagate_8to1(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs_h = _mm_loadu_ps(&inputs[4]); + const __m128 inputs_l = _mm_loadu_ps(inputs); + + const __m128 weights_h = _mm_loadu_ps(&weights[4]); + const __m128 weights_l = _mm_loadu_ps(weights); + + const __m128 mul_h = _mm_mul_ps(inputs_h, weights_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weights_l); + // [7 6 5 4] [3 2 1 0] (weight and input indices) + + const __m128 vadd = _mm_add_ps(mul_l, mul_h); + // [7+3 6+2 5+1 4+0] + const __m128 hadd1 = _mm_hadd_ps(vadd, vadd); + // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +void av1_nn_propagate_4to1_sse3(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + const __m128 weights128 = _mm_loadu_ps(weights); + + const __m128 mul = _mm_mul_ps(inputs128, weights128); + // [3 2 1 0] (weight and input indices) + + const __m128 hadd1 = _mm_hadd_ps(mul, mul); + // [3+2 1+0 3+2 1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [3+2+1+0 3+2+1+0 3+2+1+0 3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +void av1_nn_propagate_4to4_sse3(const float *const inputs, + const float *const weights, + __m128 *const outputs, const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[2]; + for (int i = 0; i < 2; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 mul0 = _mm_mul_ps(weight0, inputs128); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul1 = _mm_mul_ps(weight1, inputs128); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + + const __m128 hh = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + + *outputs = _mm_add_ps(*outputs, hh); +} + +void av1_nn_propagate_4to8_sse3(const float *const inputs, + const float *const weights, __m128 *const out_h, + __m128 *const out_l, const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[4]; + for (int i = 0; i < 4; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul0 = _mm_mul_ps(inputs128, weight0); + const __m128 mul1 = _mm_mul_ps(inputs128, weight1); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + // hadd[2] = [23+22 21+20 19+18 17+16] + // hadd[3] = [31+30 29+28 27+26 25+24] + + const __m128 hh0 = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + const __m128 hh1 = _mm_hadd_ps(hadd[2], hadd[3]); + // [31+30+29+28 27+26+25+24 23+22+21+20 19+18+17+16] + + *out_h = _mm_add_ps(*out_h, hh1); + *out_l = _mm_add_ps(*out_l, hh0); +} + +static void nn_propagate_8to4(const float *const inputs, + const float *const weights, __m128 *const outputs, + const int num_inputs) { + const __m128 inputs_h = _mm_loadu_ps(inputs + 4); + const __m128 inputs_l = _mm_loadu_ps(inputs); + // [7 6 5 4] [3 2 1 0] (input indices) + + __m128 add[4]; + for (int i = 0; i < 4; i++) { // For each output: + const __m128 weight_h = 
_mm_loadu_ps(&weights[i * num_inputs + 4]); + const __m128 weight_l = _mm_loadu_ps(&weights[i * num_inputs]); + const __m128 mul_h = _mm_mul_ps(inputs_h, weight_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weight_l); + add[i] = _mm_add_ps(mul_l, mul_h); + } + // add[0] = [7+3 6+2 5+1 4+0] + // add[1] = [15+11 14+10 13+9 12+8] + // add[2] = [23+19 22+18 21+17 20+16] + // add[3] = [31+27 30+26 29+25 28+24] + + const __m128 hadd_h = _mm_hadd_ps(add[2], add[3]); + // [31+30+27+26 29+28+25+24 23+22+19+18 21+20+17+16] + const __m128 hadd_l = _mm_hadd_ps(add[0], add[1]); + // [15+14+11+10 13+12+9+8 7+6+3+2 5+4+1+0] + + const __m128 haddhadd = _mm_hadd_ps(hadd_l, hadd_h); + // [31+30+29+28+27+26+25+24 23+22+21+20+19+18+17+16 + // 15+14+13+12+11+10+9+8 7+6+5+4+3+2+1+0] + + *outputs = _mm_add_ps(*outputs, haddhadd); +} + +static void nn_activate8(__m128 *out_h, __m128 *out_l) { + const __m128 zero = _mm_setzero_ps(); + *out_h = _mm_max_ps(*out_h, zero); + *out_l = _mm_max_ps(*out_l, zero); +} + +static void nn_activate4(__m128 *x) { *x = _mm_max_ps(*x, _mm_setzero_ps()); } + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_sse3(const float *input_nodes, + const NN_CONFIG *const nn_config, int reduce_prec, + float *const output) { + float buf[2][NN_MAX_NODES_PER_LAYER]; + int buf_index = 0; + int num_inputs = nn_config->num_inputs; + + // Hidden layers, except the final iteration is the output layer. + for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + bool output_layer = (layer == nn_config->num_hidden_layers); + float *const output_nodes = output_layer ? output : &buf[buf_index][0]; + const int num_outputs = output_layer ? 
nn_config->num_outputs + : nn_config->num_hidden_nodes[layer]; + + if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { + for (int out = 0; out < num_outputs; out += 8) { + __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]); + __m128 out_l = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + av1_nn_propagate_4to8_sse3(&input_nodes[in], + &layer_weights[out * num_inputs + in], + &out_h, &out_l, num_inputs); + } + if (!output_layer) nn_activate8(&out_h, &out_l); + _mm_storeu_ps(&output_nodes[out + 4], out_h); + _mm_storeu_ps(&output_nodes[out], out_l); + } + } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to4(&input_nodes[in], + &layer_weights[out * num_inputs + in], &outputs, + num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + av1_nn_propagate_4to4_sse3(&input_nodes[in], + &layer_weights[out * num_inputs + in], + &outputs, num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 8 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to1(&input_nodes[in], + &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else if (num_inputs % 4 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + av1_nn_propagate_4to1_sse3( + &input_nodes[in], &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else { + // Use SSE instructions for scalar operations to avoid the latency of + // swapping between SIMD and FPU modes. + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in_node = 0; in_node < num_inputs; in_node++) { + __m128 input = _mm_load1_ps(&input_nodes[in_node]); + __m128 weight = + _mm_load1_ps(&layer_weights[num_inputs * out + in_node]); + total = _mm_add_ps(total, _mm_mul_ps(input, weight)); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } + input_nodes = output_nodes; + num_inputs = num_outputs; + buf_index = 1 - buf_index; + } + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); +} + +// Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential +// Function. Neural Computation, 11(4):853–862, 1999. +static AOM_INLINE __m128 approx_exp(__m128 y) { +#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) +#define B \ + 127 // Offset for the exponent according to IEEE floating point standard. 
+#define C 60801 // Magic number controls the accuracy of approximation + const __m128 multiplier = _mm_set1_ps(A); + const __m128i offset = _mm_set1_epi32(B * (1 << 23) - C); + + y = _mm_mul_ps(y, multiplier); + y = _mm_castsi128_ps(_mm_add_epi32(_mm_cvtps_epi32(y), offset)); + return y; +#undef A +#undef B +#undef C +} + +static AOM_INLINE __m128 reduce_max(__m128 reg) { + __m128 tmp_reg; + + tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10 + reg = _mm_max_ps(reg, tmp_reg); + + tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01 + reg = _mm_max_ps(reg, tmp_reg); + + return reg; +} + +static AOM_INLINE __m128 reduce_sum(__m128 reg) { + __m128 tmp_reg; + + tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10 + reg = _mm_add_ps(reg, tmp_reg); + + tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01 + reg = _mm_add_ps(reg, tmp_reg); + + return reg; +} + +void av1_nn_fast_softmax_16_sse3(const float *input, float *output) { + // Clips at -10 to avoid underflowing + const __m128 clipper = _mm_set1_ps(-10.0f); + + // Load in 16 values + __m128 in_0 = _mm_loadu_ps(&input[0]); + __m128 in_1 = _mm_loadu_ps(&input[4]); + __m128 in_2 = _mm_loadu_ps(&input[8]); + __m128 in_3 = _mm_loadu_ps(&input[12]); + + // Get the max + __m128 max_0 = _mm_max_ps(in_0, in_1); + __m128 max_1 = _mm_max_ps(in_2, in_3); + + max_0 = _mm_max_ps(max_0, max_1); + max_0 = reduce_max(max_0); + + // Subtract the max off and clip + in_0 = _mm_sub_ps(in_0, max_0); + in_1 = _mm_sub_ps(in_1, max_0); + in_2 = _mm_sub_ps(in_2, max_0); + in_3 = _mm_sub_ps(in_3, max_0); + + in_0 = _mm_max_ps(in_0, clipper); + in_1 = _mm_max_ps(in_1, clipper); + in_2 = _mm_max_ps(in_2, clipper); + in_3 = _mm_max_ps(in_3, clipper); + + // Exponentiate and compute the denominator + __m128 sum = in_0 = approx_exp(in_0); + in_1 = approx_exp(in_1); + sum = _mm_add_ps(sum, in_1); + in_2 = approx_exp(in_2); + sum = _mm_add_ps(sum, in_2); + in_3 = approx_exp(in_3); + sum = _mm_add_ps(sum, in_3); + sum = reduce_sum(sum); + + // Divide to get the probability + in_0 = _mm_div_ps(in_0, sum); + in_1 = _mm_div_ps(in_1, sum); + in_2 = _mm_div_ps(in_2, sum); + in_3 = _mm_div_ps(in_3, sum); + + _mm_storeu_ps(&output[0], in_0); + _mm_storeu_ps(&output[4], in_1); + _mm_storeu_ps(&output[8], in_2); + _mm_storeu_ps(&output[12], in_3); +} diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.h b/third_party/aom/av1/encoder/x86/ml_sse3.h new file mode 100644 index 0000000000..f41a2474af --- /dev/null +++ b/third_party/aom/av1/encoder/x86/ml_sse3.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_X86_ML_SSE3_H_ +#define AOM_AV1_ENCODER_X86_ML_SSE3_H_ + +#include + +void av1_nn_propagate_4to1_sse3(const float *const inputs, + const float *const weights, + __m128 *const output); + +void av1_nn_propagate_4to4_sse3(const float *const inputs, + const float *const weights, + __m128 *const outputs, const int num_inputs); + +void av1_nn_propagate_4to8_sse3(const float *const inputs, + const float *const weights, __m128 *const out_h, + __m128 *const out_l, const int num_inputs); + +#endif // AOM_AV1_ENCODER_X86_ML_SSE3_H_ diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c new file mode 100644 index 0000000000..6658ed39a8 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c @@ -0,0 +1,2348 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // AVX2 +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/transpose_sse2.h" + +#include "config/av1_rtcd.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd, + const __m256i *shuffle, + const __m256i *dgd_ijkl) { + // Load two 128-bit chunks from dgd + const __m256i s0 = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)dgd)), + _mm_loadu_si128((__m128i *)(dgd + 4)), 1); + // s0 = [11 10 9 8 7 6 5 4] [7 6 5 4 3 2 1 0] as u16 (values are dgd indices) + // The weird order is so the shuffle stays within 128-bit lanes + + // Shuffle 16x u16 values within lanes according to the mask: + // [0 1 1 2 2 3 3 4] [0 1 1 2 2 3 3 4] + // (Actually we shuffle u8 values as there's no 16-bit shuffle) + const __m256i s1 = _mm256_shuffle_epi8(s0, *shuffle); + // s1 = [8 7 7 6 6 5 5 4] [4 3 3 2 2 1 1 0] as u16 (values are dgd indices) + + // Multiply 16x 16-bit integers in dgd_ijkl and s1, resulting in 16x 32-bit + // integers then horizontally add pairs of these integers resulting in 8x + // 32-bit integers + const __m256i d0 = _mm256_madd_epi16(*dgd_ijkl, s1); + // d0 = [a b c d] [e f g h] as u32 + + // Take the lower-half of d0, extend to u64, add it on to dst (H) + const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0)); + // d0l = [a b] [c d] as u64 + const __m256i dst0 = yy_load_256(dst); + yy_store_256(dst, _mm256_add_epi64(d0l, dst0)); + + // Take the upper-half of d0, extend to u64, add it on to dst (H) + const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1)); + // d0h = [e f] [g h] as u64 + const __m256i dst1 = yy_load_256(dst + 4); + yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1)); +} + +static INLINE void acc_stat_highbd_win7_one_line_avx2( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m256i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int 
j, k, l; + const int wiener_win = WIENER_WIN; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. + assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd_ijkl combined as a u32, + // then broadcast to 8x u32 slots of a 256 + const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l)); + // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16 + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint16_t X1 = src[j]; + *sumX += X1; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_highbd_avx2` function wants its input to have + // interleaved copies of two pixels, but we only have one. However, the + // pixels are (effectively) used as inputs to a multiply-accumulate. So + // if we set the extra pixel slot to 0, then it is effectively ignored. 
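// Illustrative sketch (not from libaom): since D1 < 2^16, (int)D1 viewed as
// two u16 lanes is the pair (D1, 0), so each _mm256_madd_epi16 term inside
// acc_stat_highbd_avx2() reduces to dgd[m] * D1 + dgd[m + 1] * 0 and the
// missing second pixel contributes nothing. A scalar model of one madd lane
// (pixel values are at most 12-bit, so the product fits in 32 bits):
static int32_t madd_lane_with_zero_slot_ref(uint16_t dgd_m, uint16_t dgd_m1,
                                            uint16_t d1) {
  return (int32_t)dgd_m * d1 + (int32_t)dgd_m1 * 0;
}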
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1); + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win7_opt_avx2( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; + DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win7_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +static INLINE void acc_stat_highbd_win5_one_line_avx2( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m256i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. 
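// Illustrative scalar reference (not from libaom) for the per-pixel
// statistics this routine accumulates; X is a source pixel and D(k, l) the
// degraded pixel at window offset (k, l). The H cross terms D * D are
// accumulated separately through acc_stat_highbd_avx2().
static void win5_stats_one_pixel_ref(const uint16_t *dgd, const uint16_t *src,
                                     int j, int dgd_stride, int32_t *sumX,
                                     int32_t sumY[5][5], int64_t M[5][5]) {
  const uint16_t X = src[j];
  *sumX += X;
  for (int k = 0; k < 5; ++k) {
    for (int l = 0; l < 5; ++l) {
      const uint16_t D = dgd[j + k * dgd_stride + l];
      sumY[k][l] += D;
      M[k][l] += (int64_t)D * X;
    }
  }
}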
+ assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd_ijkl combined as a u32, + // then broadcast to 8x u32 slots of a 256 + const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l)); + // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16 + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint16_t X1 = src[j]; + *sumX += X1; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_highbd_avx2` function wants its input to have + // interleaved copies of two pixels, but we only have one. However, the + // pixels are (effectively) used as inputs to a multiply-accumulate. So + // if we set the extra pixel slot to 0, then it is effectively ignored. 
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1); + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win5_opt_avx2( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + DECLARE_ALIGNED( + 32, int64_t, + H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win5_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int64, H_int64); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int64[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + if (wiener_win == WIENER_WIN) { + compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else { + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void madd_and_accum_avx2(__m256i src, __m256i dgd, __m256i *sum) { + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(src, dgd)); +} + +static INLINE __m256i 
convert_and_add_avx2(__m256i src) { + const __m256i s0 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src)); + const __m256i s1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src, 1)); + return _mm256_add_epi64(s0, s1); +} + +static INLINE __m256i hadd_four_32_to_64_avx2(__m256i src0, __m256i src1, + __m256i *src2, __m256i *src3) { + // 00 01 10 11 02 03 12 13 + const __m256i s_0 = _mm256_hadd_epi32(src0, src1); + // 20 21 30 31 22 23 32 33 + const __m256i s_1 = _mm256_hadd_epi32(*src2, *src3); + // 00+01 10+11 20+21 30+31 02+03 12+13 22+23 32+33 + const __m256i s_2 = _mm256_hadd_epi32(s_0, s_1); + return convert_and_add_avx2(s_2); +} + +static INLINE __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) { + // 00 10 02 12 + const __m256i t0 = _mm256_unpacklo_epi64(src0, src1); + // 01 11 03 13 + const __m256i t1 = _mm256_unpackhi_epi64(src0, src1); + // 00+01 10+11 02+03 12+13 + const __m256i sum = _mm256_add_epi64(t0, t1); + // 00+01 10+11 + const __m128i sum0 = _mm256_castsi256_si128(sum); + // 02+03 12+13 + const __m128i sum1 = _mm256_extracti128_si256(sum, 1); + // 00+01+02+03 10+11+12+13 + return _mm_add_epi64(sum0, sum1); +} + +static INLINE __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) { + // 00 01 02 03 + const __m256i s0 = convert_and_add_avx2(src0); + // 10 11 12 13 + const __m256i s1 = convert_and_add_avx2(src1); + return add_64bit_lvl_avx2(s0, s1); +} + +static INLINE int32_t calc_sum_of_register(__m256i src) { + const __m128i src_l = _mm256_castsi256_si128(src); + const __m128i src_h = _mm256_extracti128_si256(src, 1); + const __m128i sum = _mm_add_epi32(src_l, src_h); + const __m128i dst0 = _mm_add_epi32(sum, _mm_srli_si128(sum, 8)); + const __m128i dst1 = _mm_add_epi32(dst0, _mm_srli_si128(dst0, 4)); + return _mm_cvtsi128_si32(dst1); +} + +static INLINE void transpose_64bit_4x4_avx2(const __m256i *const src, + __m256i *const dst) { + // Unpack 64 bit elements. Goes from: + // src[0]: 00 01 02 03 + // src[1]: 10 11 12 13 + // src[2]: 20 21 22 23 + // src[3]: 30 31 32 33 + // to: + // reg0: 00 10 02 12 + // reg1: 20 30 22 32 + // reg2: 01 11 03 13 + // reg3: 21 31 23 33 + const __m256i reg0 = _mm256_unpacklo_epi64(src[0], src[1]); + const __m256i reg1 = _mm256_unpacklo_epi64(src[2], src[3]); + const __m256i reg2 = _mm256_unpackhi_epi64(src[0], src[1]); + const __m256i reg3 = _mm256_unpackhi_epi64(src[2], src[3]); + + // Unpack 64 bit elements resulting in: + // dst[0]: 00 10 20 30 + // dst[1]: 01 11 21 31 + // dst[2]: 02 12 22 32 + // dst[3]: 03 13 23 33 + dst[0] = _mm256_inserti128_si256(reg0, _mm256_castsi256_si128(reg1), 1); + dst[1] = _mm256_inserti128_si256(reg2, _mm256_castsi256_si128(reg3), 1); + dst[2] = _mm256_inserti128_si256(reg1, _mm256_extracti128_si256(reg0, 1), 0); + dst[3] = _mm256_inserti128_si256(reg3, _mm256_extracti128_si256(reg2, 1), 0); +} + +// When we load 32 values of int8_t type and need less than 32 values for +// processing, the below mask is used to make the extra values zero. +static const int8_t mask_8bit[32] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 bytes + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 bytes +}; + +// When we load 16 values of int16_t type and need less than 16 values for +// processing, the below mask is used to make the extra values zero. 
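// Illustrative sketch (not from libaom) of how these masks are indexed: with
// n = width % 16 valid int16_t lanes left, loading 16 elements starting at
// &mask_16bit[16 - n] gives n leading all-ones words followed by zeros, so an
// AND keeps exactly the first n values (e.g. n = 5 loads five -1 words, then
// zeros). Assumes the mask_16bit table defined just below; like the code in
// this file, it still over-reads up to 16 elements from p before masking.
static __m256i load_first_n_s16_sketch(const int16_t *p, int n /* 1..15 */) {
  const __m256i mask =
      _mm256_loadu_si256((const __m256i *)&mask_16bit[16 - n]);
  return _mm256_and_si256(_mm256_loadu_si256((const __m256i *)p), mask);
}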
+static const int16_t mask_16bit[32] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 bytes + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 bytes +}; + +static INLINE uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start, + int32_t h_end, int32_t v_start, + int32_t v_end, int32_t stride) { + const uint8_t *src_temp = src + v_start * stride + h_start; + const __m256i zero = _mm256_setzero_si256(); + const int32_t width = h_end - h_start; + const int32_t height = v_end - v_start; + const int32_t wd_beyond_mul32 = width & 31; + const int32_t wd_mul32 = width - wd_beyond_mul32; + __m128i mask_low, mask_high; + __m256i ss = zero; + + // When width is not multiple of 32, it still loads 32 and to make the data + // which is extra (beyond required) as zero using the below mask. + if (wd_beyond_mul32 >= 16) { + mask_low = _mm_set1_epi8(-1); + mask_high = _mm_loadu_si128((__m128i *)(&mask_8bit[32 - wd_beyond_mul32])); + } else { + mask_low = _mm_loadu_si128((__m128i *)(&mask_8bit[16 - wd_beyond_mul32])); + mask_high = _mm_setzero_si128(); + } + const __m256i mask = + _mm256_inserti128_si256(_mm256_castsi128_si256(mask_low), mask_high, 1); + + int32_t proc_ht = 0; + do { + // Process width in multiple of 32. + int32_t proc_wd = 0; + while (proc_wd < wd_mul32) { + const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd)); + const __m256i sad_0 = _mm256_sad_epu8(s_0, zero); + ss = _mm256_add_epi32(ss, sad_0); + proc_wd += 32; + } + + // Process the remaining width. + if (wd_beyond_mul32) { + const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd)); + const __m256i s_m_0 = _mm256_and_si256(s_0, mask); + const __m256i sad_0 = _mm256_sad_epu8(s_m_0, zero); + ss = _mm256_add_epi32(ss, sad_0); + } + src_temp += stride; + proc_ht++; + } while (proc_ht < height); + + const uint32_t sum = calc_sum_of_register(ss); + const uint8_t avg = sum / (width * height); + return avg; +} + +// Fill (src-avg) or (dgd-avg) buffers. Note that when n = (width % 16) is not +// 0, it writes (16 - n) more data than required. +static INLINE void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride, + uint8_t avg, int32_t width, + int32_t height, int16_t *dst, + int32_t dst_stride, + int use_downsampled_wiener_stats) { + const __m256i avg_reg = _mm256_set1_epi16(avg); + + int32_t proc_ht = 0; + do { + int ds_factor = + use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + if (use_downsampled_wiener_stats && + (height - proc_ht < WIENER_STATS_DOWNSAMPLE_FACTOR)) { + ds_factor = height - proc_ht; + } + + int32_t proc_wd = 0; + while (proc_wd < width) { + const __m128i s = _mm_loadu_si128((__m128i *)(src + proc_wd)); + const __m256i ss = _mm256_cvtepu8_epi16(s); + const __m256i d = _mm256_sub_epi16(ss, avg_reg); + _mm256_storeu_si256((__m256i *)(dst + proc_wd), d); + proc_wd += 16; + } + + src += ds_factor * src_stride; + dst += ds_factor * dst_stride; + proc_ht += ds_factor; + } while (proc_ht < height); +} + +// Fills lower-triangular elements of H buffer from upper triangular elements of +// the same +static INLINE void fill_lower_triag_elements_avx2(const int32_t wiener_win2, + int64_t *const H) { + for (int32_t i = 0; i < wiener_win2 - 1; i += 4) { + __m256i in[4], out[4]; + + in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + i + 1)); + in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + i + 1)); + in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + i + 1)); + in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i + 1)); + + transpose_64bit_4x4_avx2(in, out); + + _mm_storel_epi64((__m128i *)(H + (i + 1) * wiener_win2 + i), + _mm256_castsi256_si128(out[0])); + _mm_storeu_si128((__m128i *)(H + (i + 2) * wiener_win2 + i), + _mm256_castsi256_si128(out[1])); + _mm256_storeu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i), out[2]); + _mm256_storeu_si256((__m256i *)(H + (i + 4) * wiener_win2 + i), out[3]); + + for (int32_t j = i + 5; j < wiener_win2; j += 4) { + in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + j)); + in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + j)); + in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + j)); + in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + j)); + + transpose_64bit_4x4_avx2(in, out); + + _mm256_storeu_si256((__m256i *)(H + (j + 0) * wiener_win2 + i), out[0]); + _mm256_storeu_si256((__m256i *)(H + (j + 1) * wiener_win2 + i), out[1]); + _mm256_storeu_si256((__m256i *)(H + (j + 2) * wiener_win2 + i), out[2]); + _mm256_storeu_si256((__m256i *)(H + (j + 3) * wiener_win2 + i), out[3]); + } + } +} + +// Fill H buffer based on loop_count. +#define INIT_H_VALUES(d, loop_count) \ + for (int g = 0; g < (loop_count); g++) { \ + const __m256i dgd0 = \ + _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \ + madd_and_accum_avx2(dgd_mul_df, dgd0, &sum_h[g]); \ + } + +// Fill M & H buffer. +#define INIT_MH_VALUES(d) \ + for (int g = 0; g < wiener_win; g++) { \ + const __m256i dgds_0 = \ + _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \ + madd_and_accum_avx2(src_mul_df, dgds_0, &sum_m[g]); \ + madd_and_accum_avx2(dgd_mul_df, dgds_0, &sum_h[g]); \ + } + +// Update the dgd pointers appropriately. +#define INITIALIZATION(wiener_window_sz) \ + j = i / (wiener_window_sz); \ + const int16_t *d_window = d + j; \ + const int16_t *d_current_row = \ + d + j + ((i % (wiener_window_sz)) * d_stride); \ + int proc_ht = v_start; \ + downsample_factor = \ + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \ + __m256i sum_h[wiener_window_sz]; \ + memset(sum_h, 0, sizeof(sum_h)); + +// Update the downsample factor appropriately. 
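// Illustrative sketch (not from libaom): each processed row is weighted by
// the number of source rows it stands in for. Assuming
// WIENER_STATS_DOWNSAMPLE_FACTOR is 4, v_start = 0 and v_end = 10, rows 0 and
// 4 are processed with factor 4 and row 8 with factor 10 - 8 = 2, so the
// weighted sums approximate the statistics over all 10 rows.
static int downsample_factor_sketch(int proc_ht, int v_end,
                                    int use_downsampled_wiener_stats) {
  int factor = use_downsampled_wiener_stats ? 4 : 1;
  if (use_downsampled_wiener_stats && (v_end - proc_ht) < factor)
    factor = v_end - proc_ht;
  return factor;
}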
+#define UPDATE_DOWNSAMPLE_FACTOR \ + int proc_wd = 0; \ + if (use_downsampled_wiener_stats && \ + ((v_end - proc_ht) < WIENER_STATS_DOWNSAMPLE_FACTOR)) { \ + downsample_factor = v_end - proc_ht; \ + } \ + const __m256i df_reg = _mm256_set1_epi16(downsample_factor); + +#define CALCULATE_REMAINING_H_WIN5 \ + while (j < wiener_win) { \ + d_window = d; \ + d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \ + const __m256i zero = _mm256_setzero_si256(); \ + sum_h[0] = zero; \ + sum_h[1] = zero; \ + sum_h[2] = zero; \ + sum_h[3] = zero; \ + sum_h[4] = zero; \ + \ + proc_ht = v_start; \ + downsample_factor = \ + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \ + do { \ + UPDATE_DOWNSAMPLE_FACTOR; \ + \ + /* Process the amount of width multiple of 16.*/ \ + while (proc_wd < wd_mul16) { \ + const __m256i dgd = \ + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \ + INIT_H_VALUES(d_window + j + proc_wd, 5) \ + \ + proc_wd += 16; \ + }; \ + \ + /* Process the remaining width here. */ \ + if (wd_beyond_mul16) { \ + const __m256i dgd = \ + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \ + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \ + INIT_H_VALUES(d_window + j + proc_wd, 5) \ + } \ + proc_ht += downsample_factor; \ + d_window += downsample_factor * d_stride; \ + d_current_row += downsample_factor * d_stride; \ + } while (proc_ht < v_end); \ + const __m256i s_h0 = \ + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \ + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \ + s_h0); \ + const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); \ + const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); \ + _mm_storel_epi64( \ + (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0); \ + j++; \ + } + +#define CALCULATE_REMAINING_H_WIN7 \ + while (j < wiener_win) { \ + d_window = d; \ + d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \ + const __m256i zero = _mm256_setzero_si256(); \ + sum_h[0] = zero; \ + sum_h[1] = zero; \ + sum_h[2] = zero; \ + sum_h[3] = zero; \ + sum_h[4] = zero; \ + sum_h[5] = zero; \ + sum_h[6] = zero; \ + \ + proc_ht = v_start; \ + downsample_factor = \ + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \ + do { \ + UPDATE_DOWNSAMPLE_FACTOR; \ + \ + /* Process the amount of width multiple of 16.*/ \ + while (proc_wd < wd_mul16) { \ + const __m256i dgd = \ + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \ + INIT_H_VALUES(d_window + j + proc_wd, 7) \ + \ + proc_wd += 16; \ + }; \ + \ + /* Process the remaining width here. 
*/ \ + if (wd_beyond_mul16) { \ + const __m256i dgd = \ + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \ + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \ + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \ + INIT_H_VALUES(d_window + j + proc_wd, 7) \ + } \ + proc_ht += downsample_factor; \ + d_window += downsample_factor * d_stride; \ + d_current_row += downsample_factor * d_stride; \ + } while (proc_ht < v_end); \ + const __m256i s_h1 = \ + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \ + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \ + s_h1); \ + const __m256i s_h2 = \ + hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); \ + _mm256_storeu_si256( \ + (__m256i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_h2); \ + j++; \ + } + +// The buffers H(auto-covariance) and M(cross-correlation) are used to estimate +// the filter tap values required for wiener filtering. Here, the buffer H is of +// size ((wiener_window_size^2)*(wiener_window_size^2)) and M is of size +// (wiener_window_size*wiener_window_size). H is a symmetric matrix where the +// value above the diagonal (upper triangle) are equal to the values below the +// diagonal (lower triangle). The calculation of elements/stats of H(upper +// triangle) and M is done in steps as described below where each step fills +// specific values of H and M. +// Once the upper triangular elements of H matrix are derived, the same will be +// copied to lower triangular using the function +// fill_lower_triag_elements_avx2(). +// Example: Wiener window size = +// WIENER_WIN_CHROMA (5) M buffer = [M0 M1 M2 ---- M23 M24] H buffer = Hxy +// (x-row, y-column) [H00 H01 H02 ---- H023 H024] [H10 H11 H12 ---- H123 H124] +// [H30 H31 H32 ---- H323 H324] +// [H40 H41 H42 ---- H423 H424] +// [H50 H51 H52 ---- H523 H524] +// [H60 H61 H62 ---- H623 H624] +// || +// || +// [H230 H231 H232 ---- H2323 H2324] +// [H240 H241 H242 ---- H2423 H2424] +// In Step 1, whole M buffers (i.e., M0 to M24) and the first row of H (i.e., +// H00 to H024) is filled. The remaining rows of H buffer are filled through +// steps 2 to 6. +static void compute_stats_win5_avx2(const int16_t *const d, int32_t d_stride, + const int16_t *const s, int32_t s_stride, + int32_t width, int v_start, int v_end, + int64_t *const M, int64_t *const H, + int use_downsampled_wiener_stats) { + const int32_t wiener_win = WIENER_WIN_CHROMA; + const int32_t wiener_win2 = wiener_win * wiener_win; + // Amount of width which is beyond multiple of 16. This case is handled + // appropriately to process only the required width towards the end. + const int32_t wd_mul16 = width & ~15; + const int32_t wd_beyond_mul16 = width - wd_mul16; + const __m256i mask = + _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16])); + int downsample_factor; + + // Step 1: Full M (i.e., M0 to M24) and first row H (i.e., H00 to H024) + // values are filled here. Here, the loop over 'j' is executed for values 0 + // to 4 (wiener_win-1). When the loop executed for a specific 'j', 5 values of + // M and H are filled as shown below. + // j=0: M0-M4 and H00-H04, j=1: M5-M9 and H05-H09 are filled etc,. + int j = 0; + do { + const int16_t *s_t = s; + const int16_t *d_t = d; + __m256i sum_m[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; + __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; + downsample_factor = + use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + int proc_ht = v_start; + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); + const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); + const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_MH_VALUES(d_t + j + proc_wd) + + proc_wd += 16; + } + + // Process the remaining width here. + if (wd_beyond_mul16) { + const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); + const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); + const __m256i src_mask = _mm256_and_si256(src, mask); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_MH_VALUES(d_t + j + proc_wd) + } + proc_ht += downsample_factor; + s_t += downsample_factor * s_stride; + d_t += downsample_factor * d_stride; + } while (proc_ht < v_end); + + const __m256i s_m = + hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]); + const __m128i s_m_h = convert_32_to_64_add_avx2(sum_m[4], sum_h[4]); + _mm256_storeu_si256((__m256i *)(M + wiener_win * j), s_m); + _mm_storel_epi64((__m128i *)&M[wiener_win * j + 4], s_m_h); + + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + wiener_win * j), s_h); + _mm_storeh_epi64((__m128i *)&H[wiener_win * j + 4], s_m_h); + } while (++j < wiener_win); + + // The below steps are designed to fill remaining rows of H buffer. Here, aim + // is to fill only upper triangle elements correspond to each row and lower + // triangle elements are copied from upper-triangle elements. Also, as + // mentioned in Step 1, the core function is designed to fill 5 + // elements/stats/values of H buffer. + // + // Step 2: Here, the rows 1, 6, 11, 16 and 21 are filled. As we need to fill + // only upper-triangle elements, H10 from row1, H60-H64 and H65 from row6,etc, + // are need not be filled. As the core function process 5 values, in first + // iteration of 'j' only 4 values to be filled i.e., H11-H14 from row1,H66-H69 + // from row6, etc. + for (int i = 1; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN_CHROMA) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4) + + proc_wd += 16; + } + + // Process the remaining width here. 
+ if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN5 + } + + // Step 3: Here, the rows 2, 7, 12, 17 and 22 are filled. As we need to fill + // only upper-triangle elements, H20-H21 from row2, H70-H74 and H75-H76 from + // row7, etc, are need not be filled. As the core function process 5 values, + // in first iteration of 'j' only 3 values to be filled i.e., H22-H24 from + // row2, H77-H79 from row7, etc. + for (int i = 2; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN_CHROMA) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3) + + proc_wd += 16; + } + + // Process the remaining width here. + if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN5 + } + + // Step 4: Here, the rows 3, 8, 13, 18 and 23 are filled. As we need to fill + // only upper-triangle elements, H30-H32 from row3, H80-H84 and H85-H87 from + // row8, etc, are need not be filled. As the core function process 5 values, + // in first iteration of 'j' only 2 values to be filled i.e., H33-H34 from + // row3, H88-89 from row8, etc. + for (int i = 3; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN_CHROMA) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2) + + proc_wd += 16; + } + + // Process the remaining width here. 
+ if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]); + _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN5 + } + + // Step 5: Here, the rows 4, 9, 14, 19 and 24 are filled. As we need to fill + // only upper-triangle elements, H40-H43 from row4, H90-H94 and H95-H98 from + // row9, etc, are need not be filled. As the core function process 5 values, + // in first iteration of 'j' only 1 values to be filled i.e., H44 from row4, + // H99 from row9, etc. + for (int i = 4; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN_CHROMA) + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1) + + proc_wd += 16; + } + + // Process the remaining width here. + if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]); + _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN5 + } + + // Step 6: Here, the rows 5, 10, 15 and 20 are filled. As we need to fill only + // upper-triangle elements, H50-H54 from row5, H100-H104 and H105-H109 from + // row10,etc, are need not be filled. The first iteration of 'j' fills H55-H59 + // from row5 and H1010-H1014 from row10, etc. + for (int i = 5; i < wiener_win2; i += wiener_win) { + // Derive j'th iteration from where the H buffer filling needs to be + // started. + j = i / wiener_win; + int shift = 0; + do { + // Update the dgd pointers appropriately. + int proc_ht = v_start; + const int16_t *d_window = d + (i / wiener_win); + const int16_t *d_current_row = + d + (i / wiener_win) + ((i % wiener_win) * d_stride); + downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() }; + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + shift + proc_wd, 5) + + proc_wd += 16; + } + + // Process the remaining width here. 
+ if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + shift + proc_wd, 5) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), + s_h); + const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); + const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); + _mm_storel_epi64( + (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0); + shift++; + } while (++j < wiener_win); + } + + fill_lower_triag_elements_avx2(wiener_win2, H); +} + +// The buffers H(auto-covariance) and M(cross-correlation) are used to estimate +// the filter tap values required for wiener filtering. Here, the buffer H is of +// size ((wiener_window_size^2)*(wiener_window_size^2)) and M is of size +// (wiener_window_size*wiener_window_size). H is a symmetric matrix where the +// value above the diagonal (upper triangle) are equal to the values below the +// diagonal (lower triangle). The calculation of elements/stats of H(upper +// triangle) and M is done in steps as described below where each step fills +// specific values of H and M. +// Example: +// Wiener window size = WIENER_WIN (7) +// M buffer = [M0 M1 M2 ---- M47 M48] +// H buffer = Hxy (x-row, y-column) +// [H00 H01 H02 ---- H047 H048] +// [H10 H11 H12 ---- H147 H148] +// [H30 H31 H32 ---- H347 H348] +// [H40 H41 H42 ---- H447 H448] +// [H50 H51 H52 ---- H547 H548] +// [H60 H61 H62 ---- H647 H648] +// || +// || +// [H470 H471 H472 ---- H4747 H4748] +// [H480 H481 H482 ---- H4847 H4848] +// In Step 1, whole M buffers (i.e., M0 to M48) and the first row of H (i.e., +// H00 to H048) is filled. The remaining rows of H buffer are filled through +// steps 2 to 8. +static void compute_stats_win7_avx2(const int16_t *const d, int32_t d_stride, + const int16_t *const s, int32_t s_stride, + int32_t width, int v_start, int v_end, + int64_t *const M, int64_t *const H, + int use_downsampled_wiener_stats) { + const int32_t wiener_win = WIENER_WIN; + const int32_t wiener_win2 = wiener_win * wiener_win; + // Amount of width which is beyond multiple of 16. This case is handled + // appropriately to process only the required width towards the end. + const int32_t wd_mul16 = width & ~15; + const int32_t wd_beyond_mul16 = width - wd_mul16; + const __m256i mask = + _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16])); + int downsample_factor; + + // Step 1: Full M (i.e., M0 to M48) and first row H (i.e., H00 to H048) + // values are filled here. Here, the loop over 'j' is executed for values 0 + // to 6. When the loop executed for a specific 'j', 7 values of M and H are + // filled as shown below. + // j=0: M0-M6 and H00-H06, j=1: M7-M13 and H07-H013 are filled etc,. + int j = 0; + do { + const int16_t *s_t = s; + const int16_t *d_t = d; + __m256i sum_m[WIENER_WIN] = { _mm256_setzero_si256() }; + __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() }; + downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + int proc_ht = v_start; + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. 
+ while (proc_wd < wd_mul16) { + const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); + const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); + const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_MH_VALUES(d_t + j + proc_wd) + + proc_wd += 16; + } + + if (wd_beyond_mul16) { + const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd)); + const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd)); + const __m256i src_mask = _mm256_and_si256(src, mask); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_MH_VALUES(d_t + j + proc_wd) + } + proc_ht += downsample_factor; + s_t += downsample_factor * s_stride; + d_t += downsample_factor * d_stride; + } while (proc_ht < v_end); + + const __m256i s_m0 = + hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]); + const __m256i s_m1 = + hadd_four_32_to_64_avx2(sum_m[4], sum_m[5], &sum_m[6], &sum_m[6]); + _mm256_storeu_si256((__m256i *)(M + wiener_win * j + 0), s_m0); + _mm_storeu_si128((__m128i *)(M + wiener_win * j + 4), + _mm256_castsi256_si128(s_m1)); + _mm_storel_epi64((__m128i *)&M[wiener_win * j + 6], + _mm256_extracti128_si256(s_m1, 1)); + + const __m256i sh_0 = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + const __m256i sh_1 = + hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); + _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 0), sh_0); + _mm_storeu_si128((__m128i *)(H + wiener_win * j + 4), + _mm256_castsi256_si128(sh_1)); + _mm_storel_epi64((__m128i *)&H[wiener_win * j + 6], + _mm256_extracti128_si256(sh_1, 1)); + } while (++j < wiener_win); + + // The below steps are designed to fill remaining rows of H buffer. Here, aim + // is to fill only upper triangle elements correspond to each row and lower + // triangle elements are copied from upper-triangle elements. Also, as + // mentioned in Step 1, the core function is designed to fill 7 + // elements/stats/values of H buffer. + // + // Step 2: Here, the rows 1, 8, 15, 22, 29, 36 and 43 are filled. As we need + // to fill only upper-triangle elements, H10 from row1, H80-H86 and H87 from + // row8, etc. are need not be filled. As the core function process 7 values, + // in first iteration of 'j' only 6 values to be filled i.e., H11-H16 from + // row1 and H88-H813 from row8, etc. + for (int i = 1; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6) + + proc_wd += 16; + } + + // Process the remaining width here. 
+ if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); + const __m128i s_h0 = convert_32_to_64_add_avx2(sum_h[4], sum_h[5]); + _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i + 4), s_h0); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN7 + } + + // Step 3: Here, the rows 2, 9, 16, 23, 30, 37 and 44 are filled. As we need + // to fill only upper-triangle elements, H20-H21 from row2, H90-H96 and + // H97-H98 from row9, etc. are need not be filled. As the core function + // process 7 values, in first iteration of 'j' only 5 values to be filled + // i.e., H22-H26 from row2 and H99-H913 from row9, etc. + for (int i = 2; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN) + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5) + + proc_wd += 16; + } + + // Process the remaining width here. + if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + const __m256i s_h = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h); + const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); + const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); + _mm_storel_epi64((__m128i *)(H + (i * wiener_win2) + i + 4), s_m_h0); + + // process the remaining 'j' iterations. + j++; + CALCULATE_REMAINING_H_WIN7 + } + + // Step 4: Here, the rows 3, 10, 17, 24, 31, 38 and 45 are filled. As we need + // to fill only upper-triangle elements, H30-H32 from row3, H100-H106 and + // H107-H109 from row10, etc. are need not be filled. As the core function + // process 7 values, in first iteration of 'j' only 4 values to be filled + // i.e., H33-H36 from row3 and H1010-H1013 from row10, etc. + for (int i = 3; i < wiener_win2; i += wiener_win) { + // Update the dgd pointers appropriately and also derive the 'j'th iteration + // from where the H buffer filling needs to be started. + INITIALIZATION(WIENER_WIN) + + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. 
+      while (proc_wd < wd_mul16) {
+        const __m256i dgd =
+            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+        const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+        INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+
+        proc_wd += 16;
+      }
+
+      // Process the remaining width here.
+      if (wd_beyond_mul16) {
+        const __m256i dgd =
+            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+        const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+        const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+        INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+      }
+      proc_ht += downsample_factor;
+      d_window += downsample_factor * d_stride;
+      d_current_row += downsample_factor * d_stride;
+    } while (proc_ht < v_end);
+    const __m256i s_h =
+        hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+    _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+    // Process the remaining 'j' iterations.
+    j++;
+    CALCULATE_REMAINING_H_WIN7
+  }
+
+  // Step 5: Here, the rows 4, 11, 18, 25, 32, 39 and 46 are filled. As we need
+  // to fill only upper-triangle elements, H40-H43 from row4, H110-H116 and
+  // H117-H1110 from row11, etc. need not be filled. As the core function
+  // processes 7 values, in the first iteration of 'j' only 3 values are
+  // filled, i.e., H44-H46 from row4 and H1111-H1113 from row11, etc.
+  for (int i = 4; i < wiener_win2; i += wiener_win) {
+    // Update the dgd pointers appropriately and also derive the 'j'th iteration
+    // from where the H buffer filling needs to be started.
+    INITIALIZATION(WIENER_WIN)
+
+    do {
+      UPDATE_DOWNSAMPLE_FACTOR
+
+      // Process the amount of width multiple of 16.
+      while (proc_wd < wd_mul16) {
+        const __m256i dgd =
+            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+        const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+        INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+
+        proc_wd += 16;
+      }
+
+      // Process the remaining width here.
+      if (wd_beyond_mul16) {
+        const __m256i dgd =
+            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+        const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+        const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+        INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+      }
+      proc_ht += downsample_factor;
+      d_window += downsample_factor * d_stride;
+      d_current_row += downsample_factor * d_stride;
+    } while (proc_ht < v_end);
+    const __m256i s_h =
+        hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+    _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+    // Process the remaining 'j' iterations.
+    j++;
+    CALCULATE_REMAINING_H_WIN7
+  }
+
+  // Step 6: Here, the rows 5, 12, 19, 26, 33, 40 and 47 are filled. As we need
+  // to fill only upper-triangle elements, H50-H54 from row5, H120-H126 and
+  // H127-H1211 from row12, etc. need not be filled. As the core function
+  // processes 7 values, in the first iteration of 'j' only 2 values are
+  // filled, i.e., H55-H56 from row5 and H1212-H1213 from row12, etc.
+  for (int i = 5; i < wiener_win2; i += wiener_win) {
+    // Update the dgd pointers appropriately and also derive the 'j'th iteration
+    // from where the H buffer filling needs to be started.
+    INITIALIZATION(WIENER_WIN)
+    do {
+      UPDATE_DOWNSAMPLE_FACTOR
+
+      // Process the amount of width multiple of 16.
+      while (proc_wd < wd_mul16) {
+        const __m256i dgd =
+            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+        const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+        INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+
+        proc_wd += 16;
+      }
+
+      // Process the remaining width here.
+      if (wd_beyond_mul16) {
+        const __m256i dgd =
+            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+        const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+        const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+        INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+      }
+      proc_ht += downsample_factor;
+      d_window += downsample_factor * d_stride;
+      d_current_row += downsample_factor * d_stride;
+    } while (proc_ht < v_end);
+    const __m256i s_h =
+        hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+    _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+    // Process the remaining 'j' iterations.
+    j++;
+    CALCULATE_REMAINING_H_WIN7
+  }
+
+  // Step 7: Here, the rows 6, 13, 20, 27, 34, 41 and 48 are filled. As we need
+  // to fill only upper-triangle elements, H60-H65 from row6, H130-H136 and
+  // H137-H1312 from row13, etc. need not be filled. As the core function
+  // processes 7 values, in the first iteration of 'j' only 1 value is filled,
+  // i.e., H66 from row6 and H1313 from row13, etc.
+  for (int i = 6; i < wiener_win2; i += wiener_win) {
+    // Update the dgd pointers appropriately and also derive the 'j'th iteration
+    // from where the H buffer filling needs to be started.
+    INITIALIZATION(WIENER_WIN)
+    do {
+      UPDATE_DOWNSAMPLE_FACTOR
+
+      // Process the amount of width multiple of 16.
+      while (proc_wd < wd_mul16) {
+        const __m256i dgd =
+            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+        const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+        INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+
+        proc_wd += 16;
+      }
+
+      // Process the remaining width here.
+      if (wd_beyond_mul16) {
+        const __m256i dgd =
+            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+        const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+        const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+        INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+      }
+      proc_ht += downsample_factor;
+      d_window += downsample_factor * d_stride;
+      d_current_row += downsample_factor * d_stride;
+    } while (proc_ht < v_end);
+    const __m256i s_h =
+        hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+    xx_storel_64(&H[(i * wiener_win2) + i], _mm256_castsi256_si128(s_h));
+
+    // Process the remaining 'j' iterations.
+    j++;
+    CALCULATE_REMAINING_H_WIN7
+  }
+
+  // Step 8: Here, the rows 7, 14, 21, 28, 35 and 42 are filled. As we need
+  // to fill only upper-triangle elements, H70-H76 from row7, H140-H146 and
+  // H147-H1413 from row14, etc. need not be filled. The first iteration of
+  // 'j' fills H77-H713 from row7 and H1414-H1420 from row14, etc.
+  for (int i = 7; i < wiener_win2; i += wiener_win) {
+    // Derive j'th iteration from where the H buffer filling needs to be
+    // started.
+    j = i / wiener_win;
+    int shift = 0;
+    do {
+      // Update the dgd pointers appropriately.
+      int proc_ht = v_start;
+      const int16_t *d_window = d + (i / WIENER_WIN);
+      const int16_t *d_current_row =
+          d + (i / WIENER_WIN) + ((i % WIENER_WIN) * d_stride);
+      downsample_factor =
+          use_downsampled_wiener_stats ?
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() }; + do { + UPDATE_DOWNSAMPLE_FACTOR + + // Process the amount of width multiple of 16. + while (proc_wd < wd_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); + INIT_H_VALUES(d_window + shift + proc_wd, 7) + + proc_wd += 16; + } + + // Process the remaining width here. + if (wd_beyond_mul16) { + const __m256i dgd = + _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); + const __m256i dgd_mask = _mm256_and_si256(dgd, mask); + const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); + INIT_H_VALUES(d_window + shift + proc_wd, 7) + } + proc_ht += downsample_factor; + d_window += downsample_factor * d_stride; + d_current_row += downsample_factor * d_stride; + } while (proc_ht < v_end); + + const __m256i sh_0 = + hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); + const __m256i sh_1 = + hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); + _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), + sh_0); + _mm_storeu_si128( + (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), + _mm256_castsi256_si128(sh_1)); + _mm_storel_epi64((__m128i *)&H[(i * wiener_win2) + (wiener_win * j) + 6], + _mm256_extracti128_si256(sh_1, 1)); + shift++; + } while (++j < wiener_win); + } + + fill_lower_triag_elements_avx2(wiener_win2, H); +} + +void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + if (wiener_win != WIENER_WIN && wiener_win != WIENER_WIN_CHROMA) { + // Currently, libaom supports Wiener filter processing with window sizes as + // WIENER_WIN_CHROMA(5) and WIENER_WIN(7). For any other window size, SIMD + // support is not facilitated. Hence, invoke C function for the same. + av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H, + use_downsampled_wiener_stats); + return; + } + + const int32_t wiener_halfwin = wiener_win >> 1; + const uint8_t avg = + calc_dgd_buf_avg_avx2(dgd, h_start, h_end, v_start, v_end, dgd_stride); + const int32_t width = h_end - h_start; + const int32_t height = v_end - v_start; + const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15; + const int32_t s_stride = (width + 15) & ~15; + + // Based on the sf 'use_downsampled_wiener_stats', process either once for + // UPDATE_DOWNSAMPLE_FACTOR or for each row. + sub_avg_block_avx2(src + v_start * src_stride + h_start, src_stride, avg, + width, height, src_avg, s_stride, + use_downsampled_wiener_stats); + + // Compute (dgd-avg) buffer here which is used to fill H buffer. 
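+  // In scalar terms, the statistics assembled from the two mean-removed
+  // buffers are (sketch; 'pos' is a pixel position and 'win_k'/'win_l' are
+  // hypothetical offsets of two Wiener window taps):
+  //   M[k]    += src_avg[pos] * dgd_avg[pos + win_k]
+  //   H[k][l] += dgd_avg[pos + win_k] * dgd_avg[pos + win_l]
+  // accumulated over all processed pixels, with rows optionally skipped and
+  // weighted according to the downsampling factor.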
+ sub_avg_block_avx2( + dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin, + dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, + dgd_avg, d_stride, 0); + if (wiener_win == WIENER_WIN) { + compute_stats_win7_avx2(dgd_avg, d_stride, src_avg, s_stride, width, + v_start, v_end, M, H, use_downsampled_wiener_stats); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_avx2(dgd_avg, d_stride, src_avg, s_stride, width, + v_start, v_end, M, H, use_downsampled_wiener_stats); + } +} + +static INLINE __m256i pair_set_epi16(int a, int b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt0_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt0 + j), + yy_loadu_256(flt0 + j + 8)), + 0xd8); + const __m256i flt1_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt1 + j), + yy_loadu_256(flt1 + j + 8)), + 0xd8); + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0); + const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0); + const __m256i v0 = _mm256_madd_epi16( + xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i v1 = _mm256_madd_epi16( + xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m256i xq_coeff = + pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt + j), + yy_loadu_256(flt + j + 8)), + 0xd8); + const __m256i v0 = + _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0)); + const __m256i v1 = + _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_active * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else { + __m256i sum32 = _mm256_setzero_si256(); + for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i diff0 = _mm256_sub_epi16(d0, s0); + const __m256i err0 = _mm256_madd_epi16(diff0, diff0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64_0, sum64_1); + } + int64_t sum[4]; + yy_storeu_256(sum, sum64); + err += sum[0] + sum[1] + sum[2] + sum[3]; + return err; +} + +// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of +// C and H need to be computed. 
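+// In scalar terms the accumulation below is (sketch, using the same names as
+// the vector code: f1 = flt0[k] - (dat[k] << SGRPROJ_RST_BITS),
+// f2 = flt1[k] - (dat[k] << SGRPROJ_RST_BITS) and
+// s = (src[k] - dat[k]) << SGRPROJ_RST_BITS):
+//   H[0][0] += f1 * f1;  H[0][1] += f1 * f2;  H[1][1] += f2 * f2;
+//   C[0]    += f1 * s;   C[1]    += f2 * s;
+// Each sum is then divided by size = width * height, and H[1][0] is mirrored
+// from H[0][1].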
+static AOM_INLINE void calc_proj_params_r0_r1_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h00, h01, h11, c0, c1; + const __m256i zero = _mm256_setzero_si256(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i h01_even = _mm256_mul_epi32(f1, f2); + const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f2, 32)); + h01 = _mm256_add_epi64(h01, h01_even); + h01 = _mm256_add_epi64(h01, h01_odd); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + __m256i c_low = _mm256_unpacklo_epi64(c0, c1); + const __m256i c_high = _mm256_unpackhi_epi64(c0, c1); + c_low = _mm256_add_epi64(c_low, c_high); + const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1), + _mm256_castsi256_si128(c_low)); + + __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01); + const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01); + h0x_low = _mm256_add_epi64(h0x_low, h0x_high); + const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1), + _mm256_castsi256_si128(h0x_low)); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11); + const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11); + h1x_low = _mm256_add_epi64(h1x_low, h1x_high); + const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1), + _mm256_castsi256_si128(h1x_low)); + + xx_storeu_128(C, c_128bit); + xx_storeu_128(H[0], h0x_128bit); + xx_storeu_128(H[1], h1x_128bit); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +// When only params->r[0] > 0. 
In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h00, c0; + const __m256i zero = _mm256_setzero_si256(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + } + } + const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1), + _mm256_castsi256_si128(h00)); + const __m128i h00_val = + _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8)); + + const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1), + _mm256_castsi256_si128(c0)); + const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero)); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero)); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +// When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
+static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt1, + int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h11, c1; + const __m256i zero = _mm256_setzero_si256(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1), + _mm256_castsi256_si128(h11)); + const __m128i h11_val = + _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8)); + + const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1), + _mm256_castsi256_si128(c1)); + const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val); + const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// AVX2 variant of av1_calc_proj_params_c. 
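+// The H (2x2) and C (2) values produced here are the normal-equation terms
+// from which the caller derives the SGR projection coefficients, i.e. it
+// solves H * x = C (only the terms implied by params->r are populated).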
+void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], + int64_t C[2], const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride, + flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m256i h00, h01, h11, c0, c1; + const __m256i zero = _mm256_setzero_si256(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i h01_even = _mm256_mul_epi32(f1, f2); + const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f2, 32)); + h01 = _mm256_add_epi64(h01, h01_even); + h01 = _mm256_add_epi64(h01, h01_odd); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + __m256i c_low = _mm256_unpacklo_epi64(c0, c1); + const __m256i c_high = _mm256_unpackhi_epi64(c0, c1); + c_low = _mm256_add_epi64(c_low, c_high); + const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1), + _mm256_castsi256_si128(c_low)); + + __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01); + const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01); + h0x_low = _mm256_add_epi64(h0x_low, h0x_high); + const __m128i h0x_128bit = 
_mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1), + _mm256_castsi256_si128(h0x_low)); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11); + const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11); + h1x_low = _mm256_add_epi64(h1x_low, h1x_high); + const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1), + _mm256_castsi256_si128(h1x_low)); + + xx_storeu_128(C, c_128bit); + xx_storeu_128(H[0], h0x_128bit); + xx_storeu_128(H[1], h1x_128bit); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_high_bd_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m256i h00, c0; + const __m256i zero = _mm256_setzero_si256(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + } + } + const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1), + _mm256_castsi256_si128(h00)); + const __m128i h00_val = + _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8)); + + const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1), + _mm256_castsi256_si128(c0)); + const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero)); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero)); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_high_bd_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m256i h11, c1; + const __m256i zero = _mm256_setzero_si256(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu16_epi32( + _mm_load_si128((__m128i *)(src + i * src_stride + 
j))); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1), + _mm256_castsi256_si128(h11)); + const __m128i h11_val = + _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8)); + + const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1), + _mm256_castsi256_si128(c1)); + const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val); + const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// AVX2 variant of av1_calc_proj_params_high_bd_c. +void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled + const __m256i xq0 = _mm256_set1_epi32(xq[0]); + const __m256i xq1 = _mm256_set1_epi32(xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time + // Load 16 pixels each from source image and corrupted image + const __m256i s0 = yy_loadu_256(src + j); + const __m256i d0 = yy_loadu_256(dat + j); + // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices) + + // Shift-up each pixel to match filtered image scaling + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + + // Split u0 into two halves and pad each from u16 to i32 + const __m256i u0l = 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0)); + const __m256i u0h = + _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1)); + // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 + + // Load 16 pixels from each filtered image + const __m256i flt0l = yy_loadu_256(flt0 + j); + const __m256i flt0h = yy_loadu_256(flt0 + j + 8); + const __m256i flt1l = yy_loadu_256(flt1 + j); + const __m256i flt1h = yy_loadu_256(flt1 + j + 8); + // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 + + // Subtract shifted corrupt image from each filtered image + const __m256i flt0l_subu = _mm256_sub_epi32(flt0l, u0l); + const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h); + const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l); + const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h); + + // Multiply basis vectors by appropriate coefficients + const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0); + const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0); + const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1); + const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1); + + // Add together the contributions from the two basis vectors + const __m256i vl = _mm256_add_epi32(v0l, v1l); + const __m256i vh = _mm256_add_epi32(v0h, v1h); + + // Right-shift v with appropriate rounding + const __m256i vrl = + _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); + const __m256i vrh = + _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); + // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] + + // Saturate each i32 to an i16 then combine both halves + // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes + const __m256i vr = + _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); + // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] + // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] + + // Add twin-subspace-sgr-filter to corrupt image then subtract source + const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m256i err0 = _mm256_madd_epi16(e0, e0); + + sum32 = _mm256_add_epi32(sum32, err0); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 16) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled + const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m256i xq_active = _mm256_set1_epi32(xq_on); + const __m256i xq_inactive = + _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + // Load 16 pixels from source image + const __m256i s0 = yy_loadu_256(src + j); + // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + + // Load 16 pixels from corrupted image and pad each u16 to i32 + const __m256i d0 = yy_loadu_256(dat + j); + const __m256i d0h = + _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1)); + const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0)); + // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + // Load 16 pixels from the filtered image + const __m256i flth = yy_loadu_256(flt + j + 8); + const __m256i fltl = yy_loadu_256(flt + j); + // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active); + const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active); + const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive); + const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive); + + const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq); + const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq); + + // Shift this down with appropriate rounding + const __m256i vrh = + _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); + const __m256i vrl = + _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); + // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + // Saturate each i32 to an i16 then combine both halves + // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes + const __m256i vr = + _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); + // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as u16 + // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + + // Subtract twin-subspace-sgr filtered from source image to get error + const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m256i err0 = _mm256_madd_epi16(e0, e0); + + sum32 = _mm256_add_epi32(sum32, err0); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 16) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_on * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + } + } else { // Neither filter is enabled + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 32; j += 32) { + // Load 2x16 u16 from source image + const __m256i s0l = yy_loadu_256(src + j); + const __m256i s0h = yy_loadu_256(src + j + 16); + + // Load 2x16 u16 from corrupted image + const __m256i d0l = yy_loadu_256(dat + j); + const __m256i d0h = yy_loadu_256(dat + j + 16); + + // Subtract corrupted image from source image + const __m256i diffl = _mm256_sub_epi16(d0l, s0l); + const __m256i diffh = _mm256_sub_epi16(d0h, s0h); + + // Square error and add adjacent values + const __m256i err0l = _mm256_madd_epi16(diffl, diffl); + const __m256i err0h = _mm256_madd_epi16(diffh, diffh); + + sum32 = 
            _mm256_add_epi32(sum32, err0l);
+        sum32 = _mm256_add_epi32(sum32, err0h);
+      }
+
+      const __m256i sum32l =
+          _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+      sum64 = _mm256_add_epi64(sum64, sum32l);
+      const __m256i sum32h =
+          _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+      sum64 = _mm256_add_epi64(sum64, sum32h);
+
+      // Process remaining pixels (modulo 16)
+      for (k = j; k < width; ++k) {
+        const int32_t e = (int32_t)(dat[k]) - src[k];
+        err += ((int64_t)e * e);
+      }
+      dat += dat_stride;
+      src += src_stride;
+    }
+  }
+
+  // Sum the four 64-bit lanes of sum64 into err
+  int64_t sum[4];
+  yy_storeu_256(sum, sum64);
+  err += sum[0] + sum[1] + sum[2] + sum[3];
+  return err;
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 0000000000..50db305802
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,1483 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+                                  const __m128i *shuffle, const __m128i *kl) {
+  const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+  const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
+  const __m128i d1 =
+      _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
+  const __m128i dst0 = xx_loadu_128(dst);
+  const __m128i dst1 = xx_loadu_128(dst + 4);
+  const __m128i r0 = _mm_add_epi32(dst0, d0);
+  const __m128i r1 = _mm_add_epi32(dst1, d1);
+  xx_storeu_128(dst, r0);
+  xx_storeu_128(dst + 4, r1);
+}
+
+static INLINE void acc_stat_win7_one_line_sse4_1(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+  const int wiener_win = 7;
+  int j, k, l;
+  // Main loop handles two pixels at a time
+  // We can assume that h_start is even, since it will always be aligned to
+  // a tile edge + some number of restoration units, and both of those will
+  // be 64-pixel aligned.
+  // However, at the edge of the image, h_end may be odd, so we need to handle
+  // that case correctly.
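+  // Sketch of what each acc_stat_sse41() call below accumulates for one
+  // window row m and eight consecutive column taps n (D1/D2 are the dgd
+  // samples at window tap (k, l) for the two centre pixels handled per pass):
+  //   H_int[l * wiener_win + k][m * 8 + n] +=
+  //       D1 * dgd_ij[m * dgd_stride + n] + D2 * dgd_ij[m * dgd_stride + n + 1]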
+ assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + *sumX += X1; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_sse41` function wants its input to have interleaved + // copies of two pixels, but we only have one. However, the pixels + // are (effectively) used as inputs to a multiply-accumulate. + // So if we set the extra pixel slot to 0, then it is effectively + // ignored. + const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1)); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win7_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint8_t avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t M_int32_row[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t H_int32_row[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + int downsample_factor = + use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + int32_t sumX_row = 0; + int32_t sumY_row[WIENER_WIN][WIENER_WIN] = { { 0 } }; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i = i + downsample_factor) { + if (use_downsampled_wiener_stats && + (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) { + downsample_factor = vert_end - i; + } + sumX_row = 0; + memset(sumY_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN); + memset(M_int32_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN); + memset(H_int32_row, 0, sizeof(int32_t) * WIENER_WIN2 * (WIENER_WIN * 8)); + acc_stat_win7_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row); + sumX += sumX_row * downsample_factor; + // Scale M matrix based on the downsampling factor + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + sumY[k][l] += (sumY_row[k][l] * downsample_factor); + M_int32[k][l] += (M_int32_row[k][l] * downsample_factor); + } + } + // Scale H matrix based on the downsampling factor + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int32[k][l] += (H_int32_row[k][l] * downsample_factor); + } + } + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd, + const __m128i *shuffle, + const __m128i *dgd_ijkl) { + // Load 256 bits from dgd in two chunks + const __m128i s0l = xx_loadu_128(dgd); + const __m128i s0h = xx_loadu_128(dgd + 4); + // s0l = [7 6 5 4 3 2 1 0] as u16 values (dgd indices) + // s0h = [11 10 9 8 7 6 5 4] as u16 values (dgd indices) + // (Slightly strange order so we can apply the same shuffle to both halves) + + // Shuffle the u16 values in each half (actually using 8-bit shuffle mask) + const __m128i s1l = _mm_shuffle_epi8(s0l, *shuffle); + const __m128i s1h = _mm_shuffle_epi8(s0h, *shuffle); + // s1l = [4 3 3 2 2 1 1 0] as u16 values (dgd indices) + // s1h = [8 7 7 6 6 5 5 4] as u16 values (dgd indices) + + // Multiply s1 by dgd_ijkl resulting in 8x u32 values + // Horizontally add pairs of u32 resulting in 4x u32 + const __m128i dl = _mm_madd_epi16(*dgd_ijkl, s1l); + const __m128i dh = _mm_madd_epi16(*dgd_ijkl, s1h); + // dl = [d c b a] as u32 values + // dh = [h g f e] as u32 values + + // Add these 8x u32 results on to dst in four parts + const __m128i dll = _mm_cvtepu32_epi64(dl); + const __m128i dlh = _mm_cvtepu32_epi64(_mm_srli_si128(dl, 8)); + const __m128i dhl = _mm_cvtepu32_epi64(dh); + const __m128i dhh = _mm_cvtepu32_epi64(_mm_srli_si128(dh, 8)); + // 
dll = [b a] as u64 values, etc. + + const __m128i rll = _mm_add_epi64(xx_loadu_128(dst), dll); + xx_storeu_128(dst, rll); + const __m128i rlh = _mm_add_epi64(xx_loadu_128(dst + 2), dlh); + xx_storeu_128(dst + 2, rlh); + const __m128i rhl = _mm_add_epi64(xx_loadu_128(dst + 4), dhl); + xx_storeu_128(dst + 4, rhl); + const __m128i rhh = _mm_add_epi64(xx_loadu_128(dst + 6), dhh); + xx_storeu_128(dst + 6, rhh); +} + +static INLINE void acc_stat_highbd_win7_one_line_sse4_1( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. + assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd as a single u32 + // Then broadcast to 4x u32 slots of a 128 + const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l)); + // dgd_ijkl = [y x y x y x y x] as u16 + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint16_t X1 = src[j]; + *sumX += X1; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_highbd_sse41` function wants its input to have + // interleaved copies of two pixels, but we only have one. However, the + // pixels are (effectively) used as inputs to a multiply-accumulate. So + // if we set the extra pixel slot to 0, then it is effectively ignored. 
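+        // Note: since D1 fits in 16 bits, _mm_set1_epi32((int)D1) broadcasts
+        // 32-bit words whose low half is D1 and whose high half is zero, so
+        // every u16 pair is [D1, 0] and the unused slot adds nothing to the
+        // multiply-accumulate.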
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1); + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win7_opt_sse4_1( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + // Load just half of the 256-bit shuffle control used for the AVX2 version + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win7_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +static INLINE void acc_stat_highbd_win5_one_line_sse4_1( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. 
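+  // Sketch of each acc_stat_highbd_sse41() call in the loops below: for one
+  // window row and eight consecutive column taps it adds
+  //   D1 * dgd[row][col] + D2 * dgd[row][col + 1]
+  // into the 64-bit H_int accumulators, where D1/D2 are the u16 window
+  // samples at tap (k, l) for the two centre pixels handled per pass.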
+ assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd as a single u32 + // then broadcast to 4x u32 slots of a 128 + const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l)); + // dgd_ijkl = [y x y x y x y x] as u16 + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint16_t X1 = src[j]; + *sumX += X1; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_highbd_sse41` function wants its input to have + // interleaved copies of two pixels, but we only have one. However, the + // pixels are (effectively) used as inputs to a multiply-accumulate. So + // if we set the extra pixel slot to 0, then it is effectively ignored. 
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1); + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win5_opt_sse4_1( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + // Load just half of the 256-bit shuffle control used for the AVX2 version + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win5_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, + int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + if (wiener_win == WIENER_WIN) { + compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else { + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void acc_stat_win5_one_line_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + 
int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + const int wiener_win = WIENER_WIN_CHROMA; + int j, k, l; + // Main loop handles two pixels at a time + // We can assume that h_start is even, since it will always be aligned to + // a tile edge + some number of restoration units, and both of those will + // be 64-pixel aligned. + // However, at the edge of the image, h_end may be odd, so we need to handle + // that case correctly. + assert(h_start % 2 == 0); + const int h_end_even = h_end & ~1; + const int has_odd_pixel = h_end & 1; + for (j = h_start; j < h_end_even; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } + // If the width is odd, add in the final pixel + if (has_odd_pixel) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + *sumX += X1; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + sumY[k][l] += D1; + M_int[k][l] += D1 * X1; + + // The `acc_stat_sse41` function wants its input to have interleaved + // copies of two pixels, but we only have one. However, the pixels + // are (effectively) used as inputs to a multiply-accumulate. + // So if we set the extra pixel slot to 0, then it is effectively + // ignored. 
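+        // _mm_set1_epi16(D1) puts D1 in every 16-bit lane; as D1 is an 8-bit
+        // sample its high byte is zero, so widening the low eight bytes with
+        // _mm_cvtepu8_epi16 yields u16 lanes [ 0 D1 0 D1 0 D1 0 D1 ], the
+        // same "pixel, zero" interleave described above.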
+ const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1)); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win5_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint8_t avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t M_int32_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t H_int32_row[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + int downsample_factor = + use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + int32_t sumX_row = 0; + int32_t sumY_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i = i + downsample_factor) { + if (use_downsampled_wiener_stats && + (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) { + downsample_factor = vert_end - i; + } + sumX_row = 0; + memset(sumY_row, 0, + sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA); + memset(M_int32_row, 0, + sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA); + memset(H_int32_row, 0, + sizeof(int32_t) * WIENER_WIN2_CHROMA * (WIENER_WIN_CHROMA * 8)); + acc_stat_win5_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row); + sumX += sumX_row * downsample_factor; + // Scale M matrix based on the downsampling factor + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + sumY[k][l] += (sumY_row[k][l] * downsample_factor); + M_int32[k][l] += (M_int32_row[k][l] * downsample_factor); + } + } + // Scale H matrix based on the downsampling factor + for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int32[k][l] += (H_int32_row[k][l] * downsample_factor); + } + } + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < 
wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} +void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int16_t *dgd_avg, + int16_t *src_avg, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + int use_downsampled_wiener_stats) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H, + use_downsampled_wiener_stats); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H, + use_downsampled_wiener_stats); + } else { + av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end, + v_start, v_end, dgd_stride, src_stride, M, H, + use_downsampled_wiener_stats); + } +} + +static INLINE __m128i pair_set_epi16(int a, int b) { + return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + __m128i sum64 = _mm_setzero_si128(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt0_16b = + _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4)); + const __m128i flt1_16b = + _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4)); + const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0); + const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0); + const __m128i v0 = _mm_madd_epi16( + xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i v1 = _mm_madd_epi16( + xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = 
_mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m128i xq_coeff = + pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt_16b = + _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4)); + const __m128i v0 = + _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0)); + const __m128i v1 = + _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_active * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else { + __m128i sum32 = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m128i d = xx_loadu_128(dat + j); + const __m128i s = xx_loadu_128(src + j); + const __m128i d0 = _mm_cvtepu8_epi16(d); + const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8)); + const __m128i s0 = _mm_cvtepu8_epi16(s); + const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)); + const __m128i diff0 = _mm_sub_epi16(d0, s0); + const __m128i diff1 = _mm_sub_epi16(d1, s1); + const __m128i err0 = _mm_madd_epi16(diff0, diff0); + const __m128i err1 = _mm_madd_epi16(diff1, diff1); + sum32 = _mm_add_epi32(sum32, err0); + sum32 = _mm_add_epi32(sum32, err1); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64_0, sum64_1); + } + int64_t sum[2]; + xx_storeu_128(sum, sum64); + err += sum[0] + sum[1]; + return err; +} + +// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of +// C and H need to be computed. 
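+// For reference, a scalar sketch of the statistics the SSE4.1 routine below
+// accumulates (illustrative only; this hypothetical helper is not called
+// anywhere and assumes the same 8-bit src/dat inputs and SGRPROJ_RST_BITS
+// upshift as the vector code).
+static AOM_INLINE void calc_proj_params_r0_r1_scalar_sketch(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  int64_t h00 = 0, h01 = 0, h11 = 0, c0 = 0, c1 = 0;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      // Basis signals f1, f2 and source residual s, all taken relative to the
+      // upshifted degraded pixel u.
+      const int32_t u = (int32_t)dat8[i * dat_stride + j] << SGRPROJ_RST_BITS;
+      const int32_t s =
+          ((int32_t)src8[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f1 = flt0[i * flt0_stride + j] - u;
+      const int32_t f2 = flt1[i * flt1_stride + j] - u;
+      h00 += (int64_t)f1 * f1;  // H is the Gram matrix of (f1, f2) ...
+      h01 += (int64_t)f1 * f2;
+      h11 += (int64_t)f2 * f2;
+      c0 += (int64_t)f1 * s;  // ... and C their correlation with s.
+      c1 += (int64_t)f2 * s;
+    }
+  }
+  H[0][0] = h00 / size;
+  H[0][1] = h01 / size;
+  H[1][0] = H[0][1];  // symmetric
+  H[1][1] = h11 / size;
+  C[0] = c0 / size;
+  C[1] = c1 / size;
+}
+// The vector implementation of the same accumulation follows.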
+static AOM_INLINE void calc_proj_params_r0_r1_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m128i h00, h01, h11, c0, c1; + const __m128i zero = _mm_setzero_si128(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j)))); + const __m128i s_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j)))); + __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); + __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f1 = _mm_sub_epi32(f1, d); + f2 = _mm_sub_epi32(f2, d); + + const __m128i h00_even = _mm_mul_epi32(f1, f1); + const __m128i h00_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); + h00 = _mm_add_epi64(h00, h00_even); + h00 = _mm_add_epi64(h00, h00_odd); + + const __m128i h01_even = _mm_mul_epi32(f1, f2); + const __m128i h01_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32)); + h01 = _mm_add_epi64(h01, h01_even); + h01 = _mm_add_epi64(h01, h01_odd); + + const __m128i h11_even = _mm_mul_epi32(f2, f2); + const __m128i h11_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); + h11 = _mm_add_epi64(h11, h11_even); + h11 = _mm_add_epi64(h11, h11_odd); + + const __m128i c0_even = _mm_mul_epi32(f1, s); + const __m128i c0_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); + c0 = _mm_add_epi64(c0, c0_even); + c0 = _mm_add_epi64(c0, c0_odd); + + const __m128i c1_even = _mm_mul_epi32(f2, s); + const __m128i c1_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); + c1 = _mm_add_epi64(c1, c1_even); + c1 = _mm_add_epi64(c1, c1_odd); + } + } + + __m128i c_low = _mm_unpacklo_epi64(c0, c1); + const __m128i c_high = _mm_unpackhi_epi64(c0, c1); + c_low = _mm_add_epi64(c_low, c_high); + + __m128i h0x_low = _mm_unpacklo_epi64(h00, h01); + const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01); + h0x_low = _mm_add_epi64(h0x_low, h0x_high); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m128i h1x_low = _mm_unpacklo_epi64(zero, h11); + const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11); + h1x_low = _mm_add_epi64(h1x_low, h1x_high); + + xx_storeu_128(C, c_low); + xx_storeu_128(H[0], h0x_low); + xx_storeu_128(H[1], h1x_low); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +// When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. 
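+// Concretely, with n = width * height, f0 = flt0 - (dat << SGRPROJ_RST_BITS)
+// and s the upshifted source minus the upshifted degraded pixel, the routine
+// below reduces to H[0][0] = (1/n) * sum(f0 * f0) and C[0] = (1/n) * sum(f0 * s).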
+static AOM_INLINE void calc_proj_params_r0_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m128i h00, c0; + const __m128i zero = _mm_setzero_si128(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j)))); + const __m128i s_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j)))); + __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f1 = _mm_sub_epi32(f1, d); + + const __m128i h00_even = _mm_mul_epi32(f1, f1); + const __m128i h00_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); + h00 = _mm_add_epi64(h00, h00_even); + h00 = _mm_add_epi64(h00, h00_odd); + + const __m128i c0_even = _mm_mul_epi32(f1, s); + const __m128i c0_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); + c0 = _mm_add_epi64(c0, c0_even); + c0 = _mm_add_epi64(c0, c0_odd); + } + } + const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8)); + + const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, zero); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +// When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
+static AOM_INLINE void calc_proj_params_r1_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m128i h11, c1; + const __m128i zero = _mm_setzero_si128(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j)))); + const __m128i s_load = _mm_cvtepu8_epi32( + _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j)))); + __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f2 = _mm_sub_epi32(f2, d); + + const __m128i h11_even = _mm_mul_epi32(f2, f2); + const __m128i h11_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); + h11 = _mm_add_epi64(h11, h11_even); + h11 = _mm_add_epi64(h11, h11_odd); + + const __m128i c1_even = _mm_mul_epi32(f2, s); + const __m128i c1_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); + c1 = _mm_add_epi64(c1, c1_even); + c1 = _mm_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8)); + + const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8)); + + const __m128i c = _mm_unpacklo_epi64(zero, c1_val); + const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// SSE4.1 variant of av1_calc_proj_params_c. +void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m128i h00, h01, h11, c0, c1; + const __m128i zero = _mm_setzero_si128(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m128i s_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); + __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s 
= _mm_sub_epi32(s, d); + f1 = _mm_sub_epi32(f1, d); + f2 = _mm_sub_epi32(f2, d); + + const __m128i h00_even = _mm_mul_epi32(f1, f1); + const __m128i h00_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); + h00 = _mm_add_epi64(h00, h00_even); + h00 = _mm_add_epi64(h00, h00_odd); + + const __m128i h01_even = _mm_mul_epi32(f1, f2); + const __m128i h01_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32)); + h01 = _mm_add_epi64(h01, h01_even); + h01 = _mm_add_epi64(h01, h01_odd); + + const __m128i h11_even = _mm_mul_epi32(f2, f2); + const __m128i h11_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); + h11 = _mm_add_epi64(h11, h11_even); + h11 = _mm_add_epi64(h11, h11_odd); + + const __m128i c0_even = _mm_mul_epi32(f1, s); + const __m128i c0_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); + c0 = _mm_add_epi64(c0, c0_even); + c0 = _mm_add_epi64(c0, c0_odd); + + const __m128i c1_even = _mm_mul_epi32(f2, s); + const __m128i c1_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); + c1 = _mm_add_epi64(c1, c1_even); + c1 = _mm_add_epi64(c1, c1_odd); + } + } + + __m128i c_low = _mm_unpacklo_epi64(c0, c1); + const __m128i c_high = _mm_unpackhi_epi64(c0, c1); + c_low = _mm_add_epi64(c_low, c_high); + + __m128i h0x_low = _mm_unpacklo_epi64(h00, h01); + const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01); + h0x_low = _mm_add_epi64(h0x_low, h0x_high); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m128i h1x_low = _mm_unpacklo_epi64(zero, h11); + const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11); + h1x_low = _mm_add_epi64(h1x_low, h1x_high); + + xx_storeu_128(C, c_low); + xx_storeu_128(H[0], h0x_low); + xx_storeu_128(H[1], h1x_low); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +// When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. 
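+// These high-bitdepth helpers mirror the 8-bit versions above; the only
+// differences are the CONVERT_TO_SHORTPTR() plane pointers and that pixels
+// are loaded as u16 (_mm_loadl_epi64 + _mm_cvtepu16_epi32) rather than
+// widened from u8.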
+static AOM_INLINE void calc_proj_params_r0_high_bd_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m128i h00, c0; + const __m128i zero = _mm_setzero_si128(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m128i s_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f1 = _mm_sub_epi32(f1, d); + + const __m128i h00_even = _mm_mul_epi32(f1, f1); + const __m128i h00_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32)); + h00 = _mm_add_epi64(h00, h00_even); + h00 = _mm_add_epi64(h00, h00_odd); + + const __m128i c0_even = _mm_mul_epi32(f1, s); + const __m128i c0_odd = + _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32)); + c0 = _mm_add_epi64(c0, c0_even); + c0 = _mm_add_epi64(c0, c0_odd); + } + } + const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8)); + + const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, zero); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +// When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
+static AOM_INLINE void calc_proj_params_r1_high_bd_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + __m128i h11, c1; + const __m128i zero = _mm_setzero_si128(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i u_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m128i s_load = _mm_cvtepu16_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j)); + __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm_sub_epi32(s, d); + f2 = _mm_sub_epi32(f2, d); + + const __m128i h11_even = _mm_mul_epi32(f2, f2); + const __m128i h11_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32)); + h11 = _mm_add_epi64(h11, h11_even); + h11 = _mm_add_epi64(h11, h11_odd); + + const __m128i c1_even = _mm_mul_epi32(f2, s); + const __m128i c1_odd = + _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32)); + c1 = _mm_add_epi64(c1, c1_even); + c1 = _mm_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8)); + + const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8)); + + const __m128i c = _mm_unpacklo_epi64(zero, c1_val); + const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// SSE4.1 variant of av1_calc_proj_params_high_bd_c. 
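+// The H and C statistics computed above feed a 2x2 linear solve for the two
+// sgrproj projection coefficients. A minimal sketch of that solve (names are
+// hypothetical, fixed-point scaling of the result is omitted, and the encoder
+// performs this step elsewhere):
+static AOM_INLINE int solve_proj_params_2x2_sketch(const int64_t H[2][2],
+                                                   const int64_t C[2],
+                                                   double xq[2]) {
+  // Cramer's rule on H * xq = C; report failure if H is singular.
+  const double det =
+      (double)H[0][0] * (double)H[1][1] - (double)H[0][1] * (double)H[1][0];
+  if (det == 0.0) return 0;
+  xq[0] =
+      ((double)C[0] * (double)H[1][1] - (double)C[1] * (double)H[0][1]) / det;
+  xq[1] =
+      ((double)C[1] * (double)H[0][0] - (double)C[0] * (double)H[1][0]) / det;
+  return 1;
+}
+// av1_calc_proj_params_high_bd_sse4_1() below dispatches to the helpers above.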
+void av1_calc_proj_params_high_bd_sse4_1(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_high_bd_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + __m128i sum64 = _mm_setzero_si128(); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled + const __m128i xq0 = _mm_set1_epi32(xq[0]); + const __m128i xq1 = _mm_set1_epi32(xq[1]); + + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + // Load 8x pixels from source image + const __m128i s0 = xx_loadu_128(src + j); + // s0 = [7 6 5 4 3 2 1 0] as i16 (indices of src[]) + + // Load 8x pixels from corrupted image + const __m128i d0 = xx_loadu_128(dat + j); + // d0 = [7 6 5 4 3 2 1 0] as i16 (indices of dat[]) + + // Shift each pixel value up by SGRPROJ_RST_BITS + const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); + + // Split u0 into two halves and pad each from u16 to i32 + const __m128i u0l = _mm_cvtepu16_epi32(u0); + const __m128i u0h = _mm_cvtepu16_epi32(_mm_srli_si128(u0, 8)); + // u0h = [7 6 5 4] as i32, u0l = [3 2 1 0] as i32, all dat[] indices + + // Load 8 pixels from first and second filtered images + const __m128i flt0l = xx_loadu_128(flt0 + j); + const __m128i flt0h = xx_loadu_128(flt0 + j + 4); + const __m128i flt1l = xx_loadu_128(flt1 + j); + const __m128i flt1h = xx_loadu_128(flt1 + j + 4); + // flt0 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt0+j) + // flt1 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt1+j) + + // Subtract shifted corrupt image from each filtered image + // This gives our two basis vectors for the projection + const __m128i flt0l_subu = _mm_sub_epi32(flt0l, u0l); + const __m128i flt0h_subu = _mm_sub_epi32(flt0h, u0h); + const __m128i flt1l_subu = _mm_sub_epi32(flt1l, u0l); + const __m128i flt1h_subu = _mm_sub_epi32(flt1h, u0h); + // flt?h_subu = [ f[7]-u[7] f[6]-u[6] f[5]-u[5] f[4]-u[4] ] as i32 + // flt?l_subu = [ f[3]-u[3] f[2]-u[2] f[1]-u[1] f[0]-u[0] ] as i32 + + // Multiply each basis vector by the corresponding coefficient + const __m128i v0l = _mm_mullo_epi32(flt0l_subu, xq0); + const __m128i v0h = _mm_mullo_epi32(flt0h_subu, xq0); + const __m128i v1l = _mm_mullo_epi32(flt1l_subu, xq1); + const __m128i v1h = _mm_mullo_epi32(flt1h_subu, xq1); + + // Add together the contribution from each scaled basis vector + const __m128i vl = _mm_add_epi32(v0l, v1l); + const __m128i vh = 
_mm_add_epi32(v0h, v1h); + + // Right-shift v with appropriate rounding + const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); + const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); + + // Saturate each i32 value to i16 and combine lower and upper halves + const __m128i vr = _mm_packs_epi32(vrl, vrh); + + // Add twin-subspace-sgr-filter to corrupt image then subtract source + const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m128i err0 = _mm_madd_epi16(e0, e0); + + sum32 = _mm_add_epi32(sum32, err0); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 8) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled + const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m128i xq_active = _mm_set1_epi32(xq_on); + const __m128i xq_inactive = + _mm_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + // Load 8x pixels from source image + const __m128i s0 = xx_loadu_128(src + j); + // s0 = [7 6 5 4 3 2 1 0] as u16 (indices of src[]) + + // Load 8x pixels from corrupted image and pad each u16 to i32 + const __m128i d0 = xx_loadu_128(dat + j); + const __m128i d0h = _mm_cvtepu16_epi32(_mm_srli_si128(d0, 8)); + const __m128i d0l = _mm_cvtepu16_epi32(d0); + // d0h, d0l = [7 6 5 4], [3 2 1 0] as u32 (indices of dat[]) + + // Load 8 pixels from the filtered image + const __m128i flth = xx_loadu_128(flt + j + 4); + const __m128i fltl = xx_loadu_128(flt + j); + // flth, fltl = [7 6 5 4], [3 2 1 0] as i32 (indices of flt+j) + + const __m128i flth_xq = _mm_mullo_epi32(flth, xq_active); + const __m128i fltl_xq = _mm_mullo_epi32(fltl, xq_active); + const __m128i d0h_xq = _mm_mullo_epi32(d0h, xq_inactive); + const __m128i d0l_xq = _mm_mullo_epi32(d0l, xq_inactive); + + const __m128i vh = _mm_add_epi32(flth_xq, d0h_xq); + const __m128i vl = _mm_add_epi32(fltl_xq, d0l_xq); + // vh = [ xq0(f[7]-d[7]) xq0(f[6]-d[6]) xq0(f[5]-d[5]) xq0(f[4]-d[4]) ] + // vl = [ xq0(f[3]-d[3]) xq0(f[2]-d[2]) xq0(f[1]-d[1]) xq0(f[0]-d[0]) ] + + // Shift this down with appropriate rounding + const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); + const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); + + // Saturate vr0 and vr1 from i32 to i16 then pack together + const __m128i vr = _mm_packs_epi32(vrl, vrh); + + // Subtract twin-subspace-sgr filtered from source image to get error + const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m128i err0 = _mm_madd_epi16(e0, e0); + + sum32 = _mm_add_epi32(sum32, err0); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + 
      const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum32h);
+
+      // Process remaining pixels in this row (modulo 8)
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq_on * (flt[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += ((int64_t)e * e);
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt += flt_stride;
+    }
+  } else {  // Neither filter is enabled
+    for (i = 0; i < height; ++i) {
+      __m128i sum32 = _mm_setzero_si128();
+      for (j = 0; j <= width - 16; j += 16) {
+        // Load 2x8 u16 from source image
+        const __m128i s0 = xx_loadu_128(src + j);
+        const __m128i s1 = xx_loadu_128(src + j + 8);
+        // Load 2x8 u16 from corrupted image
+        const __m128i d0 = xx_loadu_128(dat + j);
+        const __m128i d1 = xx_loadu_128(dat + j + 8);
+
+        // Subtract corrupted image from source image
+        const __m128i diff0 = _mm_sub_epi16(d0, s0);
+        const __m128i diff1 = _mm_sub_epi16(d1, s1);
+
+        // Square error and add adjacent values
+        const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+        const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+
+        sum32 = _mm_add_epi32(sum32, err0);
+        sum32 = _mm_add_epi32(sum32, err1);
+      }
+
+      const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+      sum64 = _mm_add_epi64(sum64, sum32l);
+      const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+      sum64 = _mm_add_epi64(sum64, sum32h);
+
+      // Process remaining pixels (modulo 8)
+      for (k = j; k < width; ++k) {
+        const int32_t e = (int32_t)(dat[k]) - src[k];
+        err += ((int64_t)e * e);
+      }
+      dat += dat_stride;
+      src += src_stride;
+    }
+  }
+
+  // Sum the two 64-bit lanes of sum64 into err
+  int64_t sum[2];
+  xx_storeu_128(sum, sum64);
+  err += sum[0] + sum[1];
+  return err;
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/rdopt_avx2.c b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
new file mode 100644
index 0000000000..a0ab3940c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include
+#include
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/rdopt.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
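+// For orientation, a scalar sketch of the per-window sums gathered by the
+// AVX2 kernel below (illustrative only; this hypothetical helper is never
+// called). Each 3x3 window contributes its pixels, their squares, and their
+// products with the right and lower neighbours.
+static INLINE void horver_correlation_3x3_sketch(const int16_t *diff,
+                                                 int stride, int64_t *xy_sum,
+                                                 int64_t *xz_sum,
+                                                 int64_t *x_sum,
+                                                 int64_t *x2_sum) {
+  for (int r = 0; r < 3; ++r) {
+    for (int c = 0; c < 3; ++c) {
+      const int x = diff[r * stride + c];
+      *xy_sum += x * diff[r * stride + c + 1];    // right neighbour
+      *xz_sum += x * diff[(r + 1) * stride + c];  // lower neighbour
+      *x_sum += x;
+      *x2_sum += x * x;
+    }
+  }
+}
+// The AVX2 version below derives the same sums from a single 4x4 load.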
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, + __m256i *xy_sum_32, + __m256i *xz_sum_32, __m256i *x_sum_32, + __m256i *x2_sum_32) { + // Pixels in this 4x4 [ a b c d ] + // are referred to as: [ e f g h ] + // [ i j k l ] + // [ m n o p ] + + const __m256i pixels = _mm256_set_epi64x( + loadu_int64(&diff[0 * stride]), loadu_int64(&diff[1 * stride]), + loadu_int64(&diff[2 * stride]), loadu_int64(&diff[3 * stride])); + // pixels = [d c b a h g f e] [l k j i p o n m] as i16 + + const __m256i slli = _mm256_slli_epi64(pixels, 16); + // slli = [c b a 0 g f e 0] [k j i 0 o n m 0] as i16 + + const __m256i madd_xy = _mm256_madd_epi16(pixels, slli); + // madd_xy = [bc+cd ab fg+gh ef] [jk+kl ij no+op mn] as i32 + *xy_sum_32 = _mm256_add_epi32(*xy_sum_32, madd_xy); + + // Permute control [3 2] [1 0] => [2 1] [0 0], 0b10010000 = 0x90 + const __m256i perm = _mm256_permute4x64_epi64(slli, 0x90); + // perm = [g f e 0 k j i 0] [o n m 0 o n m 0] as i16 + + const __m256i madd_xz = _mm256_madd_epi16(slli, perm); + // madd_xz = [cg+bf ae gk+fj ei] [ko+jn im oo+nn mm] as i32 + *xz_sum_32 = _mm256_add_epi32(*xz_sum_32, madd_xz); + + // Sum every element in slli (and then also their squares) + const __m256i madd1_slli = _mm256_madd_epi16(slli, _mm256_set1_epi16(1)); + // madd1_slli = [c+b a g+f e] [k+j i o+n m] as i32 + *x_sum_32 = _mm256_add_epi32(*x_sum_32, madd1_slli); + + const __m256i madd_slli = _mm256_madd_epi16(slli, slli); + // madd_slli = [cc+bb aa gg+ff ee] [kk+jj ii oo+nn mm] as i32 + *x2_sum_32 = _mm256_add_epi32(*x2_sum_32, madd_slli); +} + +void av1_get_horver_correlation_full_avx2(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - right neighbour pixel + // z - below neighbour pixel + // w - down-right neighbour pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t x_sum = 0, x2_sum = 0; + + // Process horizontal and vertical correlations through the body in 4x4 + // blocks. 
This excludes the final row and column and possibly one extra + // column depending how 3 divides into width and height + int32_t xy_xz_tmp[8] = { 0 }, x_x2_tmp[8] = { 0 }; + __m256i xy_sum_32 = _mm256_setzero_si256(); + __m256i xz_sum_32 = _mm256_setzero_si256(); + __m256i x_sum_32 = _mm256_setzero_si256(); + __m256i x2_sum_32 = _mm256_setzero_si256(); + for (int i = 0; i <= height - 4; i += 3) { + for (int j = 0; j <= width - 4; j += 3) { + horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, + &xz_sum_32, &x_sum_32, &x2_sum_32); + } + const __m256i hadd_xy_xz = _mm256_hadd_epi32(xy_sum_32, xz_sum_32); + // hadd_xy_xz = [ae+bf+cg ei+fj+gk ab+bc+cd ef+fg+gh] + // [im+jn+ko mm+nn+oo ij+jk+kl mn+no+op] as i32 + yy_storeu_256(xy_xz_tmp, hadd_xy_xz); + xy_sum += (int64_t)xy_xz_tmp[5] + xy_xz_tmp[4] + xy_xz_tmp[1]; + xz_sum += (int64_t)xy_xz_tmp[7] + xy_xz_tmp[6] + xy_xz_tmp[3]; + + const __m256i hadd_x_x2 = _mm256_hadd_epi32(x_sum_32, x2_sum_32); + // hadd_x_x2 = [aa+bb+cc ee+ff+gg a+b+c e+f+g] + // [ii+jj+kk mm+nn+oo i+j+k m+n+o] as i32 + yy_storeu_256(x_x2_tmp, hadd_x_x2); + x_sum += (int64_t)x_x2_tmp[5] + x_x2_tmp[4] + x_x2_tmp[1]; + x2_sum += (int64_t)x_x2_tmp[7] + x_x2_tmp[6] + x_x2_tmp[3]; + + xy_sum_32 = _mm256_setzero_si256(); + xz_sum_32 = _mm256_setzero_si256(); + x_sum_32 = _mm256_setzero_si256(); + x2_sum_32 = _mm256_setzero_si256(); + } + + // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols + int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; + + // Do we have 2 rows remaining or just the one? Note that width and height + // are powers of 2, so each modulo 3 must be 1 or 2. + if (height % 3 == 1) { // Just horiz corrs on the final row + const int16_t x0 = diff[(height - 1) * stride]; + x_sum += x0; + x_finalrow += x0; + x2_sum += x0 * x0; + x2_finalrow += x0 * x0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 1) * stride + j]; + const int16_t y = diff[(height - 1) * stride + j + 1]; + xy_sum += x * y; + x_sum += y; + x2_sum += y * y; + x_finalrow += y; + x2_finalrow += y * y; + } + } else { // Two rows remaining to do + const int16_t x0 = diff[(height - 2) * stride]; + const int16_t z0 = diff[(height - 1) * stride]; + x_sum += x0 + z0; + x2_sum += x0 * x0 + z0 * z0; + x_finalrow += z0; + x2_finalrow += z0 * z0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 2) * stride + j]; + const int16_t y = diff[(height - 2) * stride + j + 1]; + const int16_t z = diff[(height - 1) * stride + j]; + const int16_t w = diff[(height - 1) * stride + j + 1]; + + // Horizontal and vertical correlations for the penultimate row: + xy_sum += x * y; + xz_sum += x * z; + + // Now just horizontal correlations for the final row: + xy_sum += z * w; + + x_sum += y + w; + x2_sum += y * y + w * w; + x_finalrow += w; + x2_finalrow += w * w; + } + } + + // Do we have 2 columns remaining or just the one? + if (width % 3 == 1) { // Just vert corrs on the final col + const int16_t x0 = diff[width - 1]; + x_sum += x0; + x_finalcol += x0; + x2_sum += x0 * x0; + x2_finalcol += x0 * x0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 1]; + xz_sum += x * z; + x_finalcol += z; + x2_finalcol += z * z; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 
2 : 3)) { + x_sum += z; + x2_sum += z * z; + } + } + } else { // Two cols remaining + const int16_t x0 = diff[width - 2]; + const int16_t y0 = diff[width - 1]; + x_sum += x0 + y0; + x2_sum += x0 * x0 + y0 * y0; + x_finalcol += y0; + x2_finalcol += y0 * y0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 2]; + const int16_t y = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 2]; + const int16_t w = diff[(i + 1) * stride + width - 1]; + + // Horizontal and vertical correlations for the penultimate col: + // Skip these on the last iteration of this loop if we also had two + // rows remaining, otherwise the final horizontal and vertical correlation + // get erroneously processed twice + if (i < height - 2 || height % 3 == 1) { + xy_sum += x * y; + xz_sum += x * z; + } + + x_finalcol += w; + x2_finalcol += w * w; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 2 : 3)) { + x_sum += z + w; + x2_sum += z * z + w * w; + } + + // Now just vertical correlations for the final column: + xz_sum += y * w; + } + } + + // Calculate the simple sums and squared-sums + int64_t x_firstrow = 0, x_firstcol = 0; + int64_t x2_firstrow = 0, x2_firstcol = 0; + + for (int j = 0; j < width; ++j) { + x_firstrow += diff[j]; + x2_firstrow += diff[j] * diff[j]; + } + for (int i = 0; i < height; ++i) { + x_firstcol += diff[i * stride]; + x2_firstcol += diff[i * stride] * diff[i * stride]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} diff --git a/third_party/aom/av1/encoder/x86/rdopt_sse4.c b/third_party/aom/av1/encoder/x86/rdopt_sse4.c new file mode 100644 index 0000000000..12ac146195 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/rdopt_sse4.c @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include "aom_dsp/x86/synonyms.h" + +#include "config/av1_rtcd.h" +#include "av1/encoder/rdopt.h" + +// Process horizontal and vertical correlations in a 4x4 block of pixels. +// We actually use the 4x4 pixels to calculate correlations corresponding to +// the top-left 3x3 pixels, so this function must be called with 1x1 overlap, +// moving the window along/down by 3 pixels at a time. +INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, + __m128i *xy_sum_32, + __m128i *xz_sum_32, __m128i *x_sum_32, + __m128i *x2_sum_32) { + // Pixels in this 4x4 [ a b c d ] + // are referred to as: [ e f g h ] + // [ i j k l ] + // [ m n o p ] + + const __m128i pixelsa = _mm_set_epi64x(*(int64_t *)&diff[0 * stride], + *(int64_t *)&diff[2 * stride]); + const __m128i pixelsb = _mm_set_epi64x(*(int64_t *)&diff[1 * stride], + *(int64_t *)&diff[3 * stride]); + // pixelsa = [d c b a l k j i] as i16 + // pixelsb = [h g f e p o n m] as i16 + + const __m128i slli_a = _mm_slli_epi64(pixelsa, 16); + const __m128i slli_b = _mm_slli_epi64(pixelsb, 16); + // slli_a = [c b a 0 k j i 0] as i16 + // slli_b = [g f e 0 o n m 0] as i16 + + const __m128i xy_madd_a = _mm_madd_epi16(pixelsa, slli_a); + const __m128i xy_madd_b = _mm_madd_epi16(pixelsb, slli_b); + // xy_madd_a = [bc+cd ab jk+kl ij] as i32 + // xy_madd_b = [fg+gh ef no+op mn] as i32 + + const __m128i xy32 = _mm_hadd_epi32(xy_madd_b, xy_madd_a); + // xy32 = [ab+bc+cd ij+jk+kl ef+fg+gh mn+no+op] as i32 + *xy_sum_32 = _mm_add_epi32(*xy_sum_32, xy32); + + const __m128i xz_madd_a = _mm_madd_epi16(slli_a, slli_b); + // xz_madd_a = [bf+cg ae jn+ko im] i32 + + const __m128i swap_b = _mm_srli_si128(slli_b, 8); + // swap_b = [0 0 0 0 g f e 0] as i16 + const __m128i xz_madd_b = _mm_madd_epi16(slli_a, swap_b); + // xz_madd_b = [0 0 gk+fj ei] i32 + + const __m128i xz32 = _mm_hadd_epi32(xz_madd_b, xz_madd_a); + // xz32 = [ae+bf+cg im+jn+ko 0 ei+fj+gk] i32 + *xz_sum_32 = _mm_add_epi32(*xz_sum_32, xz32); + + // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k + // (sum up every element in slli_a and swap_b) + const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a); + const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a); + // sum_slli_a32 = [c+b a k+j i] as i32 + const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b); + // swap_b32 = [g f e 0] as i32 + *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32); + *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32); + // sum = [c+b+g a+f k+j+e i] as i32 + + // Also sum their squares + const __m128i slli_a_2 = _mm_madd_epi16(slli_a, slli_a); + const __m128i swap_b_2 = _mm_madd_epi16(swap_b, swap_b); + // slli_a_2 = [c2+b2 a2 k2+j2 i2] + // swap_b_2 = [0 0 g2+f2 e2] + const __m128i sum2 = _mm_hadd_epi32(slli_a_2, swap_b_2); + // sum2 = [0 g2+f2+e2 c2+b2+a2 k2+j2+i2] + *x2_sum_32 = _mm_add_epi32(*x2_sum_32, sum2); +} + +void av1_get_horver_correlation_full_sse4_1(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - right neighbour pixel + // z - below neighbour pixel + // w - down-right neighbour pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t x_sum = 0, x2_sum = 0; + + // Process horizontal and vertical correlations through the body in 4x4 + // blocks. 
This excludes the final row and column and possibly one extra + // column depending how 3 divides into width and height + int32_t xy_tmp[4] = { 0 }, xz_tmp[4] = { 0 }; + int32_t x_tmp[4] = { 0 }, x2_tmp[4] = { 0 }; + __m128i xy_sum_32 = _mm_setzero_si128(); + __m128i xz_sum_32 = _mm_setzero_si128(); + __m128i x_sum_32 = _mm_setzero_si128(); + __m128i x2_sum_32 = _mm_setzero_si128(); + for (int i = 0; i <= height - 4; i += 3) { + for (int j = 0; j <= width - 4; j += 3) { + horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, + &xz_sum_32, &x_sum_32, &x2_sum_32); + } + xx_storeu_128(xy_tmp, xy_sum_32); + xx_storeu_128(xz_tmp, xz_sum_32); + xx_storeu_128(x_tmp, x_sum_32); + xx_storeu_128(x2_tmp, x2_sum_32); + xy_sum += (int64_t)xy_tmp[3] + xy_tmp[2] + xy_tmp[1]; + xz_sum += (int64_t)xz_tmp[3] + xz_tmp[2] + xz_tmp[0]; + x_sum += (int64_t)x_tmp[3] + x_tmp[2] + x_tmp[1] + x_tmp[0]; + x2_sum += (int64_t)x2_tmp[2] + x2_tmp[1] + x2_tmp[0]; + xy_sum_32 = _mm_setzero_si128(); + xz_sum_32 = _mm_setzero_si128(); + x_sum_32 = _mm_setzero_si128(); + x2_sum_32 = _mm_setzero_si128(); + } + + // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols + int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; + + // Do we have 2 rows remaining or just the one? Note that width and height + // are powers of 2, so each modulo 3 must be 1 or 2. + if (height % 3 == 1) { // Just horiz corrs on the final row + const int16_t x0 = diff[(height - 1) * stride]; + x_sum += x0; + x_finalrow += x0; + x2_sum += x0 * x0; + x2_finalrow += x0 * x0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 1) * stride + j]; + const int16_t y = diff[(height - 1) * stride + j + 1]; + xy_sum += x * y; + x_sum += y; + x2_sum += y * y; + x_finalrow += y; + x2_finalrow += y * y; + } + } else { // Two rows remaining to do + const int16_t x0 = diff[(height - 2) * stride]; + const int16_t z0 = diff[(height - 1) * stride]; + x_sum += x0 + z0; + x2_sum += x0 * x0 + z0 * z0; + x_finalrow += z0; + x2_finalrow += z0 * z0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 2) * stride + j]; + const int16_t y = diff[(height - 2) * stride + j + 1]; + const int16_t z = diff[(height - 1) * stride + j]; + const int16_t w = diff[(height - 1) * stride + j + 1]; + + // Horizontal and vertical correlations for the penultimate row: + xy_sum += x * y; + xz_sum += x * z; + + // Now just horizontal correlations for the final row: + xy_sum += z * w; + + x_sum += y + w; + x2_sum += y * y + w * w; + x_finalrow += w; + x2_finalrow += w * w; + } + } + + // Do we have 2 columns remaining or just the one? + if (width % 3 == 1) { // Just vert corrs on the final col + const int16_t x0 = diff[width - 1]; + x_sum += x0; + x_finalcol += x0; + x2_sum += x0 * x0; + x2_finalcol += x0 * x0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 1]; + xz_sum += x * z; + x_finalcol += z; + x2_finalcol += z * z; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 
2 : 3)) { + x_sum += z; + x2_sum += z * z; + } + } + } else { // Two cols remaining + const int16_t x0 = diff[width - 2]; + const int16_t y0 = diff[width - 1]; + x_sum += x0 + y0; + x2_sum += x0 * x0 + y0 * y0; + x_finalcol += y0; + x2_finalcol += y0 * y0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 2]; + const int16_t y = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 2]; + const int16_t w = diff[(i + 1) * stride + width - 1]; + + // Horizontal and vertical correlations for the penultimate col: + // Skip these on the last iteration of this loop if we also had two + // rows remaining, otherwise the final horizontal and vertical correlation + // get erroneously processed twice + if (i < height - 2 || height % 3 == 1) { + xy_sum += x * y; + xz_sum += x * z; + } + + x_finalcol += w; + x2_finalcol += w * w; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 2 : 3)) { + x_sum += z + w; + x2_sum += z * z + w * w; + } + + // Now just vertical correlations for the final column: + xz_sum += y * w; + } + } + + // Calculate the simple sums and squared-sums + int64_t x_firstrow = 0, x_firstcol = 0; + int64_t x2_firstrow = 0, x2_firstcol = 0; + + for (int j = 0; j < width; ++j) { + x_firstrow += diff[j]; + x2_firstrow += diff[j] * diff[j]; + } + for (int i = 0; i < height; ++i) { + x_firstcol += diff[i * stride]; + x2_firstcol += diff[i * stride] * diff[i * stride]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c new file mode 100644 index 0000000000..a492483721 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include // SSE2 + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "aom_dsp/x86/synonyms.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/encoder/reconinter_enc.h" + +void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref, int ref_stride, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for + // 2-tap yet. + int filter_taps = (subpel_search <= USE_4_TAPS) ? 
4 : SUBPEL_TAPS; + + if (!subpel_x_q3 && !subpel_y_q3) { + if (width >= 16) { + int i; + assert(!(width & 15)); + /*Read 16 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 16) { + xx_storeu_128(comp_pred, xx_loadu_128(ref)); + comp_pred += 16; + ref += 16; + } + ref += ref_stride - width; + } + } else if (width >= 8) { + int i; + assert(!(width & 7)); + assert(!(height & 1)); + /*Read 8 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i s1 = xx_loadl_64(ref + 1 * ref_stride); + xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1)); + comp_pred += 16; + ref += 2 * ref_stride; + } + } else { + int i; + assert(!(width & 3)); + assert(!(height & 3)); + /*Read 4 pixels four rows at a time.*/ + for (i = 0; i < height; i++) { + const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride); + const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride); + const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride); + const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride); + const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1), + _mm_unpacklo_epi32(row2, row3)); + xx_storeu_128(comp_pred, reg); + comp_pred += 16; + ref += 4 * ref_stride; + } + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, + width, height); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, + width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); + uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? 
temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, + kernel_x, 16, NULL, -1, width, intermediate_height); + aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, + kernel_y, 16, width, height); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { + assert(DIST_PRECISION_BITS <= 4); + __m128i mult0 = _mm_mullo_epi16(*p0, *w0); + __m128i mult1 = _mm_mullo_epi16(*p1, *w1); + __m128i sum = _mm_adds_epu16(mult0, mult1); + __m128i round = _mm_adds_epu16(sum, *r); + __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS); + + xx_storeu_128(result, shift); +} + +void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, + const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + int filter_taps = (subpel_search <= USE_4_TAPS) ? 
4 : SUBPEL_TAPS; + if (!subpel_x_q3 && !subpel_y_q3) { + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + if (width >= 8) { + int i; + assert(!(width & 7)); + /*Read 8 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + _mm_storeu_si128((__m128i *)comp_pred, s0); + comp_pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + int i; + assert(!(width & 3)); + /*Read 4 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); + __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); + __m128i t0 = _mm_unpacklo_epi64(s0, s1); + _mm_storeu_si128((__m128i *)comp_pred, t0); + comp_pred += 8; + ref += 2 * ref_stride; + } + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1); + uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz( + ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); + aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE, + comp_pred8, width, NULL, -1, kernel_y, 16, width, + height, bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ + assert(!(width * height & 7)); + int n = width * height >> 3; + for (int i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); + comp_pred16 += 8; + pred += 8; + } +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + 
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search) { + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + int n; + int i; + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + assert(!(width * height & 7)); + n = width * height >> 3; + + const int16_t wt0 = (int16_t)jcp_param->fwd_offset; + const int16_t wt1 = (int16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set1_epi16(wt0); + const __m128i w1 = _mm_set1_epi16(wt1); + const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = _mm_set1_epi16(round); + + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred16); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); + + comp_pred16 += 8; + pred += 8; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + for (i = 0; i < n; i++) { + __m128i s0 = xx_loadu_128(comp_pred); + __m128i p0 = xx_loadu_128(pred); + xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); + comp_pred += 16; + pred += 16; + } +} diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c new file mode 100644 index 0000000000..df7aa95855 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include // SSE2 +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, + const __m128i *w, const __m128i *r, + void *const result) { + __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); + __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w); + __m128i round_lo = _mm_add_epi16(mult_lo, *r); + __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS); + + __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1); + __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w); + __m128i round_hi = _mm_add_epi16(mult_hi, *r); + __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS); + + xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); +} + +void aom_dist_wtd_comp_avg_upsampled_pred_ssse3( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + + const int8_t w0 = (int8_t)jcp_param->fwd_offset; + const int8_t w1 = (int8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = _mm_set1_epi16(round); + + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + } +} diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c new file mode 100644 index 0000000000..752d6f3f0b --- /dev/null +++ b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c @@ -0,0 +1,647 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/av1_rtcd.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +#define SSE_STRIDE (BW + 2) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = { + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 }, + { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, + { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 }, + { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } +}; + +DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = { + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 } +}; + +#define CALC_X_GRADIENT(AC, GI, DF, out) \ + out = _mm256_abs_epi16( \ + _mm256_add_epi16(_mm256_add_epi16(AC, GI), _mm256_slli_epi16(DF, 1))); + +#define CALC_Y_GRADIENT(AC, GI, BH, out) \ + out = _mm256_abs_epi16( \ + _mm256_add_epi16(_mm256_sub_epi16(AC, GI), _mm256_slli_epi16(BH, 1))); + +double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height, + int width, int stride, + int edge_thresh) { + int count = 0; + int64_t accum = 0; + // w32 stores width multiple of 32. + const int w32 = (width - 1) & ~0x1f; + const __m256i zero = _mm256_setzero_si256(); + const __m256i edge_threshold = _mm256_set1_epi16(edge_thresh); + __m256i num_accumulator = zero; + __m256i sum_accumulator = zero; + + // A | B | C + // D | E | F + // G | H | I + // g_x = (A - C) + (G - I) + 2*(D - F) + // g_y = (A + C) - (G + I) + 2*(B - H) + // v = 4*E - 2*(D+F+B+H) + (A+C+G+I) + + // Process the width multiple of 32 here. + for (int w = 1; w < w32; w += 32) { + int h = 1; + const int start_idx = h * stride + w; + const int stride_0 = start_idx - stride; + + __m256i num_accum_row_lvl = zero; + const __m256i A = _mm256_loadu_si256((__m256i *)(&src[stride_0 - 1])); + const __m256i C = _mm256_loadu_si256((__m256i *)(&src[stride_0 + 1])); + const __m256i D = _mm256_loadu_si256((__m256i *)(&src[start_idx - 1])); + const __m256i F = _mm256_loadu_si256((__m256i *)(&src[start_idx + 1])); + __m256i B = _mm256_loadu_si256((__m256i *)(&src[stride_0])); + __m256i E = _mm256_loadu_si256((__m256i *)(&src[start_idx])); + + const __m256i A_lo = _mm256_unpacklo_epi8(A, zero); + const __m256i A_hi = _mm256_unpackhi_epi8(A, zero); + const __m256i C_lo = _mm256_unpacklo_epi8(C, zero); + const __m256i C_hi = _mm256_unpackhi_epi8(C, zero); + const __m256i D_lo = _mm256_unpacklo_epi8(D, zero); + const __m256i D_hi = _mm256_unpackhi_epi8(D, zero); + const __m256i F_lo = _mm256_unpacklo_epi8(F, zero); + const __m256i F_hi = _mm256_unpackhi_epi8(F, zero); + + __m256i sub_AC_lo = _mm256_sub_epi16(A_lo, C_lo); + __m256i sub_AC_hi = _mm256_sub_epi16(A_hi, C_hi); + __m256i sum_AC_lo = _mm256_add_epi16(A_lo, C_lo); + __m256i sum_AC_hi = _mm256_add_epi16(A_hi, C_hi); + __m256i sub_DF_lo = _mm256_sub_epi16(D_lo, F_lo); + __m256i sub_DF_hi = _mm256_sub_epi16(D_hi, F_hi); + __m256i sum_DF_lo = _mm256_add_epi16(D_lo, F_lo); + __m256i sum_DF_hi = _mm256_add_epi16(D_hi, F_hi); + + for (; h < height - 1; h++) { + __m256i sum_GI_lo, sub_GI_lo, sum_GI_hi, sub_GI_hi, gx_lo, gy_lo, gx_hi, + gy_hi; + const int k = h * stride + w; + const __m256i G = _mm256_loadu_si256((__m256i *)(&src[k + stride - 1])); + const __m256i H = _mm256_loadu_si256((__m256i *)(&src[k + stride])); + const __m256i I = _mm256_loadu_si256((__m256i *)(&src[k + stride + 1])); + + const __m256i B_lo = _mm256_unpacklo_epi8(B, zero); + const __m256i 
B_hi = _mm256_unpackhi_epi8(B, zero); + const __m256i G_lo = _mm256_unpacklo_epi8(G, zero); + const __m256i G_hi = _mm256_unpackhi_epi8(G, zero); + const __m256i I_lo = _mm256_unpacklo_epi8(I, zero); + const __m256i I_hi = _mm256_unpackhi_epi8(I, zero); + const __m256i H_lo = _mm256_unpacklo_epi8(H, zero); + const __m256i H_hi = _mm256_unpackhi_epi8(H, zero); + + sub_GI_lo = _mm256_sub_epi16(G_lo, I_lo); + sub_GI_hi = _mm256_sub_epi16(G_hi, I_hi); + sum_GI_lo = _mm256_add_epi16(G_lo, I_lo); + sum_GI_hi = _mm256_add_epi16(G_hi, I_hi); + const __m256i sub_BH_lo = _mm256_sub_epi16(B_lo, H_lo); + const __m256i sub_BH_hi = _mm256_sub_epi16(B_hi, H_hi); + + CALC_X_GRADIENT(sub_AC_lo, sub_GI_lo, sub_DF_lo, gx_lo) + CALC_Y_GRADIENT(sum_AC_lo, sum_GI_lo, sub_BH_lo, gy_lo) + + const __m256i ga_lo = _mm256_add_epi16(gx_lo, gy_lo); + + CALC_X_GRADIENT(sub_AC_hi, sub_GI_hi, sub_DF_hi, gx_hi) + CALC_Y_GRADIENT(sum_AC_hi, sum_GI_hi, sub_BH_hi, gy_hi) + + const __m256i ga_hi = _mm256_add_epi16(gx_hi, gy_hi); + + __m256i cmp_lo = _mm256_cmpgt_epi16(edge_threshold, ga_lo); + __m256i cmp_hi = _mm256_cmpgt_epi16(edge_threshold, ga_hi); + const __m256i comp_reg = _mm256_add_epi16(cmp_lo, cmp_hi); + + // v = 4*E -2*(D+F+B+H) + (A+C+G+I) + if (_mm256_movemask_epi8(comp_reg) != 0) { + const __m256i sum_BH_lo = _mm256_add_epi16(B_lo, H_lo); + const __m256i sum_BH_hi = _mm256_add_epi16(B_hi, H_hi); + + // 2*(D+F+B+H) + const __m256i sum_DFBH_lo = + _mm256_slli_epi16(_mm256_add_epi16(sum_DF_lo, sum_BH_lo), 1); + // (A+C+G+I) + const __m256i sum_ACGI_lo = _mm256_add_epi16(sum_AC_lo, sum_GI_lo); + const __m256i sum_DFBH_hi = + _mm256_slli_epi16(_mm256_add_epi16(sum_DF_hi, sum_BH_hi), 1); + const __m256i sum_ACGI_hi = _mm256_add_epi16(sum_AC_hi, sum_GI_hi); + + // Convert E register values from 8bit to 16bit + const __m256i E_lo = _mm256_unpacklo_epi8(E, zero); + const __m256i E_hi = _mm256_unpackhi_epi8(E, zero); + + // 4*E - 2*(D+F+B+H)+ (A+C+G+I) + const __m256i var_lo_0 = _mm256_abs_epi16(_mm256_add_epi16( + _mm256_sub_epi16(_mm256_slli_epi16(E_lo, 2), sum_DFBH_lo), + sum_ACGI_lo)); + const __m256i var_hi_0 = _mm256_abs_epi16(_mm256_add_epi16( + _mm256_sub_epi16(_mm256_slli_epi16(E_hi, 2), sum_DFBH_hi), + sum_ACGI_hi)); + cmp_lo = _mm256_srli_epi16(cmp_lo, 15); + cmp_hi = _mm256_srli_epi16(cmp_hi, 15); + const __m256i var_lo = _mm256_mullo_epi16(var_lo_0, cmp_lo); + const __m256i var_hi = _mm256_mullo_epi16(var_hi_0, cmp_hi); + + num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_lo); + num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_hi); + + sum_accumulator = _mm256_add_epi32(sum_accumulator, + _mm256_unpacklo_epi16(var_lo, zero)); + sum_accumulator = _mm256_add_epi32(sum_accumulator, + _mm256_unpackhi_epi16(var_lo, zero)); + sum_accumulator = _mm256_add_epi32(sum_accumulator, + _mm256_unpacklo_epi16(var_hi, zero)); + sum_accumulator = _mm256_add_epi32(sum_accumulator, + _mm256_unpackhi_epi16(var_hi, zero)); + } + sub_AC_lo = sub_DF_lo; + sub_AC_hi = sub_DF_hi; + sub_DF_lo = sub_GI_lo; + sub_DF_hi = sub_GI_hi; + sum_AC_lo = sum_DF_lo; + sum_AC_hi = sum_DF_hi; + sum_DF_lo = sum_GI_lo; + sum_DF_hi = sum_GI_hi; + B = E; + E = H; + } + const __m256i num_0 = _mm256_unpacklo_epi16(num_accum_row_lvl, zero); + const __m256i num_1 = _mm256_unpackhi_epi16(num_accum_row_lvl, zero); + num_accumulator = + _mm256_add_epi32(num_accumulator, _mm256_add_epi32(num_0, num_1)); + } + + // Process the remaining width here. 
+ for (int h = 1; h < height - 1; ++h) { + for (int w = w32 + 1; w < width - 1; ++w) { + const int k = h * stride + w; + + // Compute sobel gradients + const int g_x = (src[k - stride - 1] - src[k - stride + 1]) + + (src[k + stride - 1] - src[k + stride + 1]) + + 2 * (src[k - 1] - src[k + 1]); + const int g_y = (src[k - stride - 1] - src[k + stride - 1]) + + (src[k - stride + 1] - src[k + stride + 1]) + + 2 * (src[k - stride] - src[k + stride]); + const int ga = abs(g_x) + abs(g_y); + + if (ga < edge_thresh) { + // Find Laplacian + const int v = + 4 * src[k] - + 2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) + + (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] + + src[k + stride + 1]); + accum += abs(v); + ++count; + } + } + } + + // s0 s1 n0 n1 s2 s3 n2 n3 + __m256i sum_avx = _mm256_hadd_epi32(sum_accumulator, num_accumulator); + __m128i sum_avx_lo = _mm256_castsi256_si128(sum_avx); + __m128i sum_avx_hi = _mm256_extractf128_si256(sum_avx, 1); + // s0+s2 s1+s3 n0+n2 n1+n3 + __m128i sum_avx_1 = _mm_add_epi32(sum_avx_lo, sum_avx_hi); + // s0+s2+s1+s3 n0+n2+n1+n3 + __m128i result = _mm_add_epi32(_mm_srli_si128(sum_avx_1, 4), sum_avx_1); + + accum += _mm_cvtsi128_si32(result); + count += _mm_extract_epi32(result, 2); + + // If very few smooth pels, return -1 since the estimate is unreliable. + return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2; +} + +static AOM_FORCE_INLINE void get_squared_error_16x16_avx2( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint16_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + for (int i = 0; i < block_height; i++) { + __m128i vf1_128, vf2_128; + __m256i vf1, vf2, vdiff1, vsqdiff1; + + vf1_128 = _mm_loadu_si128((__m128i *)(src1)); + vf2_128 = _mm_loadu_si128((__m128i *)(src2)); + vf1 = _mm256_cvtepu8_epi16(vf1_128); + vf2 = _mm256_cvtepu8_epi16(vf2_128); + vdiff1 = _mm256_sub_epi16(vf1, vf2); + vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1); + + _mm256_storeu_si256((__m256i *)(dst), vsqdiff1); + // Set zero to uninitialized memory to avoid uninitialized loads later + *(int *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride, src2 += stride2; + dst += sse_stride; + } +} + +static AOM_FORCE_INLINE void get_squared_error_32x32_avx2( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint16_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + for (int i = 0; i < block_height; i++) { + __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2; + + vsrc1 = _mm256_loadu_si256((__m256i *)src1); + vsrc2 = _mm256_loadu_si256((__m256i *)src2); + vmax = _mm256_max_epu8(vsrc1, vsrc2); + vmin = _mm256_min_epu8(vsrc1, vsrc2); + vdiff = _mm256_subs_epu8(vmax, vmin); + + __m128i vtmp1 = _mm256_castsi256_si128(vdiff); + __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1); + vdiff1 = _mm256_cvtepu8_epi16(vtmp1); + vdiff2 = _mm256_cvtepu8_epi16(vtmp2); + + vres1 = _mm256_mullo_epi16(vdiff1, vdiff1); + vres2 = _mm256_mullo_epi16(vdiff2, vdiff2); + _mm256_storeu_si256((__m256i *)(dst), vres1); + _mm256_storeu_si256((__m256i *)(dst + 16), vres2); + // Set zero to 
uninitialized memory to avoid uninitialized loads later + *(int *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride; + src2 += stride2; + dst += sse_stride; + } +} + +static AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col, + int block_width) { + __m128i v128tmp = _mm_loadu_si128((__m128i *)(src)); + if (col == 0) { + // For the first column, replicate the first element twice to the left + v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[0]); + } + if (col == block_width - 4) { + // For the last column, replicate the last element twice to the right + v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]); + } + return _mm256_cvtepu16_epi32(v128tmp); +} + +static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) { + // Mask the required 5 values inside the vector + __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]); + __m128i v128a, v128b; + // Extract 256b as two 128b registers A and B + v128a = _mm256_castsi256_si128(vtmp); + v128b = _mm256_extracti128_si256(vtmp, 1); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A2+B2, A3+B3, 0, 0] + v128b = _mm_srli_si128(v128a, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A1+B1+A3+B3, 0, 0, 0] + v128b = _mm_srli_si128(v128a, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + v128a = _mm_add_epi32(v128a, v128b); + return _mm_extract_epi32(v128a, 0); +} + +// AVX2 implementation of approx_exp() +static AOM_INLINE __m256 approx_exp_avx2(__m256 y) { +#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2) +#define B \ + 127 // Offset for the exponent according to IEEE floating point standard. +#define C 60801 // Magic number controls the accuracy of approximation + const __m256 multiplier = _mm256_set1_ps(A); + const __m256i offset = _mm256_set1_epi32(B * (1 << 23) - C); + + y = _mm256_mul_ps(y, multiplier); + y = _mm256_castsi256_ps(_mm256_add_epi32(_mm256_cvttps_epi32(y), offset)); + return y; +#undef A +#undef B +#undef C +} + +static void apply_temporal_filter( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const int *subblock_mses, unsigned int *accumulator, uint16_t *count, + uint16_t *frame_sse, uint32_t *luma_sse_sum, + const double inv_num_ref_pixels, const double decay_factor, + const double inv_factor, const double weight_factor, double *d_factor, + int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_sse[BH][BW]; + + if (block_width == 32) { + get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } else { + get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } + + __m256i vsrc[5]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint16_t *src = (col) ? 
frame_sse + col - 2 : frame_sse; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + vsrc[i] = xx_load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height; row++) { + __m256i vsum = _mm256_setzero_si256(); + + // Add 5 consecutive rows + for (int i = 0; i < 5; i++) { + vsum = _mm256_add_epi32(vsum, vsrc[i]); + } + + // Push all elements by one element to the top + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + // Load next row to the last element + if (row <= block_height - 4) { + vsrc[4] = xx_load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } else { + vsrc[4] = vsrc[3]; + } + + // Accumulate the sum horizontally + for (int i = 0; i < 4; i++) { + acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i); + } + } + } + + double subblock_mses_scaled[4]; + double d_factor_decayed[4]; + for (int idx = 0; idx < 4; idx++) { + subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; + d_factor_decayed[idx] = d_factor[idx] * decay_factor; + } + if (tf_wgt_calc_lvl == 0) { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } else { + __m256d subblock_mses_reg[4]; + __m256d d_factor_mul_n_decay_qr_invs[4]; + const __m256 zero = _mm256_set1_ps(0.0f); + const __m256 point_five = _mm256_set1_ps(0.5f); + const __m256 seven = _mm256_set1_ps(7.0f); + const __m256d inv_num_ref_pixel_256bit = _mm256_set1_pd(inv_num_ref_pixels); + const __m256d weight_factor_256bit = _mm256_set1_pd(weight_factor); + const __m256 tf_weight_scale = _mm256_set1_ps((float)TF_WEIGHT_SCALE); + // Maintain registers to hold mse and d_factor at subblock level. 
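+    // Each of the four subblocks has its own scaled MSE and decayed distance
+    // factor, so both are broadcast into 256-bit double registers once here
+    // and then selected per pixel via subblock_idx in the loops below.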
+ subblock_mses_reg[0] = _mm256_set1_pd(subblock_mses_scaled[0]); + subblock_mses_reg[1] = _mm256_set1_pd(subblock_mses_scaled[1]); + subblock_mses_reg[2] = _mm256_set1_pd(subblock_mses_scaled[2]); + subblock_mses_reg[3] = _mm256_set1_pd(subblock_mses_scaled[3]); + d_factor_mul_n_decay_qr_invs[0] = _mm256_set1_pd(d_factor_decayed[0]); + d_factor_mul_n_decay_qr_invs[1] = _mm256_set1_pd(d_factor_decayed[1]); + d_factor_mul_n_decay_qr_invs[2] = _mm256_set1_pd(d_factor_decayed[2]); + d_factor_mul_n_decay_qr_invs[3] = _mm256_set1_pd(d_factor_decayed[3]); + + for (int i = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + uint32_t *luma_sse_sum_temp = luma_sse_sum + i * BW; + for (int j = 0; j < block_width; j += 8) { + const __m256i acc_sse = + _mm256_lddqu_si256((__m256i *)(acc_5x5_sse[i] + j)); + const __m256i luma_sse = + _mm256_lddqu_si256((__m256i *)((luma_sse_sum_temp + j))); + + // uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + const __m256i diff_sse = _mm256_add_epi32(acc_sse, luma_sse); + + const __m256d diff_sse_pd_1 = + _mm256_cvtepi32_pd(_mm256_castsi256_si128(diff_sse)); + const __m256d diff_sse_pd_2 = + _mm256_cvtepi32_pd(_mm256_extracti128_si256(diff_sse, 1)); + + // const double window_error = diff_sse * inv_num_ref_pixels; + const __m256d window_error_1 = + _mm256_mul_pd(diff_sse_pd_1, inv_num_ref_pixel_256bit); + const __m256d window_error_2 = + _mm256_mul_pd(diff_sse_pd_2, inv_num_ref_pixel_256bit); + + // const int subblock_idx = y_blk_raster_offset + (j >= block_width / + // 2); + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + const __m256d blk_error = subblock_mses_reg[subblock_idx]; + + // const double combined_error = + // weight_factor *window_error + subblock_mses_scaled[subblock_idx]; + const __m256d combined_error_1 = _mm256_add_pd( + _mm256_mul_pd(window_error_1, weight_factor_256bit), blk_error); + + const __m256d combined_error_2 = _mm256_add_pd( + _mm256_mul_pd(window_error_2, weight_factor_256bit), blk_error); + + // d_factor_decayed[subblock_idx] + const __m256d d_fact_mul_n_decay = + d_factor_mul_n_decay_qr_invs[subblock_idx]; + + // double scaled_error = combined_error * + // d_factor_decayed[subblock_idx]; + const __m256d scaled_error_1 = + _mm256_mul_pd(combined_error_1, d_fact_mul_n_decay); + const __m256d scaled_error_2 = + _mm256_mul_pd(combined_error_2, d_fact_mul_n_decay); + + const __m128 scaled_error_ps_1 = _mm256_cvtpd_ps(scaled_error_1); + const __m128 scaled_error_ps_2 = _mm256_cvtpd_ps(scaled_error_2); + + const __m256 scaled_error_ps = _mm256_insertf128_ps( + _mm256_castps128_ps256(scaled_error_ps_1), scaled_error_ps_2, 0x1); + + // scaled_error = AOMMIN(scaled_error, 7); + const __m256 scaled_diff_ps = _mm256_min_ps(scaled_error_ps, seven); + const __m256 minus_scaled_diff_ps = _mm256_sub_ps(zero, scaled_diff_ps); + // const int weight = + //(int)(approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE + 0.5f); + const __m256 exp_result = approx_exp_avx2(minus_scaled_diff_ps); + const __m256 scale_weight_exp_result = + _mm256_mul_ps(exp_result, tf_weight_scale); + const __m256 round_result = + _mm256_add_ps(scale_weight_exp_result, point_five); + __m256i weights_in_32bit = _mm256_cvttps_epi32(round_result); + + __m128i weights_in_16bit = + _mm_packus_epi32(_mm256_castsi256_si128(weights_in_32bit), + _mm256_extractf128_si256(weights_in_32bit, 0x1)); + + // count[k] += weight; + // accumulator[k] += weight * pixel_value; + const int stride_idx = i * stride2 + j; + 
const __m128i count_array = + _mm_loadu_si128((__m128i *)(count + stride_idx)); + _mm_storeu_si128((__m128i *)(count + stride_idx), + _mm_add_epi16(count_array, weights_in_16bit)); + + const __m256i accumulator_array = + _mm256_loadu_si256((__m256i *)(accumulator + stride_idx)); + const __m128i pred_values = + _mm_loadl_epi64((__m128i *)(frame2 + stride_idx)); + + const __m256i pred_values_u32 = _mm256_cvtepu8_epi32(pred_values); + const __m256i mull_frame2_weight_u32 = + _mm256_mullo_epi32(pred_values_u32, weights_in_32bit); + _mm256_storeu_si256( + (__m256i *)(accumulator + stride_idx), + _mm256_add_epi32(accumulator_array, mull_frame2_weight_u32)); + } + } + } +} + +void av1_apply_temporal_filter_avx2( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!"); + assert(!is_high_bitdepth && "Only support low bit-depth with avx2!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint16_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. + const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 
0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0, k = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++, k++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx]; + } + } + } + } + } + + apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w, + plane_w, plane_h, subblock_mses, accum + plane_offset, + count + plane_offset, frame_sse, luma_sse_sum, + inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + plane_offset += plane_h * plane_w; + } +} diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c new file mode 100644 index 0000000000..842d3b13c8 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/av1_rtcd.h" +#include "aom_dsp/mathutils.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +// For the squared error buffer, keep a padding for 4 samples +#define SSE_STRIDE (BW + 4) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = { + { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } +}; + +static void get_squared_error(const uint8_t *frame1, const unsigned int stride, + const uint8_t *frame2, const unsigned int stride2, + const int block_width, const int block_height, + uint16_t *frame_sse, + const unsigned int dst_stride) { + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + + for (int i = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j += 16) { + // Set zero to uninitialized memory to avoid uninitialized loads later + *(int *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j)); + __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j)); + + __m128i vmax = _mm_max_epu8(vsrc1, vsrc2); + __m128i vmin = _mm_min_epu8(vsrc1, vsrc2); + __m128i vdiff = _mm_subs_epu8(vmax, vmin); + + __m128i vzero = _mm_setzero_si128(); + __m128i vdiff1 = _mm_unpacklo_epi8(vdiff, vzero); + __m128i vdiff2 = _mm_unpackhi_epi8(vdiff, vzero); + + __m128i vres1 = _mm_mullo_epi16(vdiff1, vdiff1); + __m128i vres2 = _mm_mullo_epi16(vdiff2, vdiff2); + + _mm_storeu_si128((__m128i *)(dst + j + 2), vres1); + _mm_storeu_si128((__m128i *)(dst + j + 10), vres2); + } + + // Set zero to uninitialized memory to avoid uninitialized loads later + *(int *)(dst + block_width + 2) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride; + src2 += stride2; + dst += dst_stride; + } +} + +static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col, + int block_width) { + __m128i vtmp = _mm_loadu_si128((__m128i *)src); + __m128i vzero = _mm_setzero_si128(); + __m128i vtmp1 = _mm_unpacklo_epi16(vtmp, vzero); + __m128i vtmp2 = _mm_unpackhi_epi16(vtmp, vzero); + // For the first column, replicate the first element twice to the left + dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA); + // For the last column, replicate the last element twice to the right + dstvec[1] = (col < block_width - 4) ? 
vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54); +} + +static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) { + __m128i veca, vecb; + // Mask and obtain the required 5 values inside the vector + veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]); + vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + veca = _mm_add_epi32(veca, vecb); + // B = [A2+B2, A3+B3, 0, 0] + vecb = _mm_srli_si128(veca, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + veca = _mm_add_epi32(veca, vecb); + // B = [A1+B1+A3+B3, 0, 0, 0] + vecb = _mm_srli_si128(veca, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + veca = _mm_add_epi32(veca, vecb); + return _mm_cvtsi128_si32(veca); +} + +static void apply_temporal_filter( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const int *subblock_mses, unsigned int *accumulator, uint16_t *count, + uint16_t *frame_sse, uint32_t *luma_sse_sum, + const double inv_num_ref_pixels, const double decay_factor, + const double inv_factor, const double weight_factor, double *d_factor, + int tf_wgt_calc_lvl) { + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + + uint32_t acc_5x5_sse[BH][BW]; + + get_squared_error(frame1, stride, frame2, stride2, block_width, block_height, + frame_sse, SSE_STRIDE); + + __m128i vsrc[5][2]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint16_t *src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + xx_load_and_pad(src, vsrc[i], col, block_width); + src += SSE_STRIDE; + } + + // Padding for top 2 rows + vsrc[0][0] = vsrc[2][0]; + vsrc[0][1] = vsrc[2][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + + for (int row = 0; row < block_height; row++) { + __m128i vsum1 = _mm_setzero_si128(); + __m128i vsum2 = _mm_setzero_si128(); + + // Add 5 consecutive rows + for (int i = 0; i < 5; i++) { + vsum1 = _mm_add_epi32(vsrc[i][0], vsum1); + vsum2 = _mm_add_epi32(vsrc[i][1], vsum2); + } + + // Push all elements by one element to the top + for (int i = 0; i < 4; i++) { + vsrc[i][0] = vsrc[i + 1][0]; + vsrc[i][1] = vsrc[i + 1][1]; + } + + if (row <= block_height - 4) { + // Load next row + xx_load_and_pad(src, vsrc[4], col, block_width); + src += SSE_STRIDE; + } else { + // Padding for bottom 2 rows + vsrc[4][0] = vsrc[3][0]; + vsrc[4][1] = vsrc[3][1]; + } + + // Accumulate the sum horizontally + for (int i = 0; i < 4; i++) { + acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum1, vsum2, i); + } + } + } + + double subblock_mses_scaled[4]; + double d_factor_decayed[4]; + for (int idx = 0; idx < 4; idx++) { + subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor; + d_factor_decayed[idx] = d_factor[idx] * decay_factor; + } + if (tf_wgt_calc_lvl == 0) { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double 
scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE); + + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } else { + for (int i = 0, k = 0; i < block_height; i++) { + const int y_blk_raster_offset = (i >= block_height / 2) * 2; + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j]; + + const double window_error = diff_sse * inv_num_ref_pixels; + const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2); + const double combined_error = + weight_factor * window_error + subblock_mses_scaled[subblock_idx]; + + double scaled_error = combined_error * d_factor_decayed[subblock_idx]; + scaled_error = AOMMIN(scaled_error, 7); + const float fweight = + approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE; + const int weight = iroundpf(fweight); + count[k] += weight; + accumulator[k] += weight * pixel_value; + } + } + } +} + +void av1_apply_temporal_filter_sse2( + const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const MV *subblock_mvs, + const int *subblock_mses, const int q_factor, const int filter_strength, + int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH; + assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!"); + assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!"); + assert(!is_high_bitdepth && "Only support low bit-depth with sse2!"); + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + (void)is_high_bitdepth; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int frame_height = frame_to_filter->y_crop_height; + const int frame_width = frame_to_filter->y_crop_width; + const int min_frame_size = AOMMIN(frame_height, frame_width); + // Variables to simplify combined error calculation. + const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) * + TF_SEARCH_ERROR_NORM_WEIGHT); + const double weight_factor = + (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. + double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); + q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } + // Smaller strength -> smaller filtering weight. + double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); + s_decay = CLIP(s_decay, 1e-5, 1); + double d_factor[4] = { 0 }; + uint16_t frame_sse[SSE_STRIDE * BH] = { 0 }; + uint32_t luma_sse_sum[BW * BH] = { 0 }; + + for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) { + // Larger motion vector -> smaller filtering weight. 
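+    // d_factor is the subblock motion-vector length normalised by a
+    // frame-size-dependent threshold and clamped to at least 1, so longer
+    // motion can only shrink the final per-pixel filtering weight.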
+ const MV mv = subblock_mvs[subblock_idx]; + const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2)); + double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD; + distance_threshold = AOMMAX(distance_threshold, 1); + d_factor[subblock_idx] = distance / distance_threshold; + d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1); + } + + // Handle planes in sequence. + int plane_offset = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y; + const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH + + ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0); + const double inv_num_ref_pixels = 1.0 / num_ref_pixels; + // Larger noise -> larger filtering weight. + const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. + const double decay_factor = 1 / (n_decay * q_decay * s_decay); + + // Filter U-plane and V-plane using Y-plane. This is because motion + // search is only done on Y-plane, so the information from Y-plane + // will be more accurate. The luma sse sum is reused in both chroma + // planes. + if (plane == AOM_PLANE_U) { + for (unsigned int i = 0, k = 0; i < plane_h; i++) { + for (unsigned int j = 0; j < plane_w; j++, k++) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2]; + } + } + } + } + } + + apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w, + plane_w, plane_h, subblock_mses, accum + plane_offset, + count + plane_offset, frame_sse, luma_sse_sum, + inv_num_ref_pixels, decay_factor, inv_factor, + weight_factor, d_factor, tf_wgt_calc_lvl); + plane_offset += plane_h * plane_w; + } +} diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c new file mode 100644 index 0000000000..9cde860534 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
+uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
+                                           const uint8_t *m, int N) {
+  int n = -N;
+
+  uint64_t csse;
+
+  const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
+  const __m256i v_zext_q = yy_set1_64_from_32i(~0);
+
+  __m256i v_acc0_q = _mm256_setzero_si256();
+
+  assert(N % 64 == 0);
+
+  r1 += N;
+  d += N;
+  m += N;
+
+  do {
+    const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n));
+    const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n));
+    const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n));
+
+    const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w);
+    const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w);
+    const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b);
+
+    const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w);
+    const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w);
+
+    const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);
+    const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);
+
+    const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d);
+
+    const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);
+
+    const __m256i v_sum0_q = _mm256_add_epi64(
+        _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32));
+
+    v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q);
+
+    n += 16;
+  } while (n);
+
+  v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8));
+  __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q);
+  __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1);
+  v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+#if AOM_ARCH_X86_64
+  csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+  xx_storel_64(&csse, v_acc_q_0);
+#endif
+
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
+int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+                                          int N, int64_t limit) {
+  int64_t acc;
+  __m256i v_acc0_d = _mm256_setzero_si256();
+
+  // Input size limited to 8192 by the use of 32 bit accumulators and m
+  // being between [0, 64]. Overflow might happen at larger sizes,
+  // though it is practically impossible on real video input.
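+  // For example, with N = 8192 the loop below runs 128 times, and each 32-bit
+  // lane of v_acc0_d accumulates at most 8 products per iteration, each
+  // bounded in magnitude by 64 * 32768, i.e. a lane total of at most
+  // 128 * 8 * 64 * 32768 = 2^31, right at the edge of the int32 range.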
+ assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m)); + const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32)); + + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds)); + const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16)); + const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32)); + const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48)); + + const __m256i v_m0_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b)); + const __m256i v_m1_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1)); + const __m256i v_m2_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b)); + const __m256i v_m3_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1)); + + const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w); + const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w); + const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w); + const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w); + + const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d); + const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d); + + const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d); + + v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31); + v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm256_unpackhi_epi32(v_acc0_d, v_sign_d)); + + __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8)); + + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); + +#if AOM_ARCH_X86_64 + acc = _mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&acc, v_acc_q_0); +#endif + + return acc > limit; +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m256i v_neg_w = _mm256_set1_epi32((int)0xffff0001); + + assert(N % 64 == 0); + + do { + const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a)); + const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b)); + const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16)); + const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16)); + const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32)); + const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32)); + const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48)); + const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48)); + + const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w); + const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w); + const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w); + const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w); + const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w); + const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w); + const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w); + const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w); + const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w); + const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w); + const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w); + const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w); + const 
__m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w); + const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w); + const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w); + + const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w); + const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w); + const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w); + const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w); + + _mm256_store_si256((__m256i *)(d), v_r0_w); + _mm256_store_si256((__m256i *)(d + 16), v_r1_w); + _mm256_store_si256((__m256i *)(d + 32), v_r2_w); + _mm256_store_si256((__m256i *)(d + 48), v_r3_w); + + a += 64; + b += 64; + d += 64; + N -= 64; + } while (N); +} diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c new file mode 100644 index 0000000000..d7ac2223f2 --- /dev/null +++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+                                           const uint8_t *m, int N) {
+  int n = -N;
+  int n8 = n + 8;
+
+  uint64_t csse;
+
+  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+  const __m128i v_zext_q = xx_set1_64_from_32i(~0);
+
+  __m128i v_acc0_q = _mm_setzero_si128();
+
+  assert(N % 64 == 0);
+
+  r1 += N;
+  d += N;
+  m += N;
+
+  do {
+    const __m128i v_r0_w = xx_load_128(r1 + n);
+    const __m128i v_r1_w = xx_load_128(r1 + n8);
+    const __m128i v_d0_w = xx_load_128(d + n);
+    const __m128i v_d1_w = xx_load_128(d + n8);
+    const __m128i v_m01_b = xx_load_128(m + n);
+
+    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+                                           _mm_srli_epi64(v_sq0_d, 32));
+    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+                                           _mm_srli_epi64(v_sq1_d, 32));
+
+    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+    n8 += 16;
+    n += 16;
+  } while (n);
+
+  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if AOM_ARCH_X86_64
+  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+  xx_storel_64(&csse, v_acc0_q);
+#endif
+
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
+int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+                                          int N, int64_t limit) {
+  int64_t acc;
+
+  __m128i v_sign_d;
+  __m128i v_acc0_d = _mm_setzero_si128();
+  __m128i v_acc1_d = _mm_setzero_si128();
+  __m128i v_acc_q;
+
+  // Input size limited to 8192 by the use of 32 bit accumulators and m
+  // being between [0, 64]. Overflow might happen at larger sizes,
+  // though it is practically impossible on real video input.
+ assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m128i v_m01_b = xx_load_128(m); + const __m128i v_m23_b = xx_load_128(m + 16); + const __m128i v_m45_b = xx_load_128(m + 32); + const __m128i v_m67_b = xx_load_128(m + 48); + + const __m128i v_d0_w = xx_load_128(ds); + const __m128i v_d1_w = xx_load_128(ds + 8); + const __m128i v_d2_w = xx_load_128(ds + 16); + const __m128i v_d3_w = xx_load_128(ds + 24); + const __m128i v_d4_w = xx_load_128(ds + 32); + const __m128i v_d5_w = xx_load_128(ds + 40); + const __m128i v_d6_w = xx_load_128(ds + 48); + const __m128i v_d7_w = xx_load_128(ds + 56); + + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128()); + const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128()); + + const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w); + const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w); + const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w); + const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w); + const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w); + const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w); + const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w); + const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w); + + const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d); + const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d); + const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d); + const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d); + + const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d); + const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d); + + v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d); + v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128()); + v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm_unpackhi_epi32(v_acc0_d, v_sign_d)); + + v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128()); + v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d), + _mm_unpackhi_epi32(v_acc1_d, v_sign_d)); + + v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if AOM_ARCH_X86_64 + acc = _mm_cvtsi128_si64(v_acc_q); +#else + xx_storel_64(&acc, v_acc_q); +#endif + + return acc > limit; +} + +// Negate under mask +static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) { + return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w); +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0, + (short)0xffff, 0, (short)0xffff, 0); + + assert(N % 64 == 0); + + do { + const __m128i v_a0_w = xx_load_128(a); + const __m128i v_b0_w = xx_load_128(b); + const __m128i v_a1_w = xx_load_128(a + 8); + const __m128i v_b1_w = xx_load_128(b + 8); + const __m128i v_a2_w = xx_load_128(a + 16); + const __m128i v_b2_w = xx_load_128(b + 16); + const __m128i v_a3_w = xx_load_128(a + 24); + const __m128i v_b3_w 
= xx_load_128(b + 24); + + const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w); + const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w); + const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w); + const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w); + const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w); + const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w); + const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w); + const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w); + const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w); + const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w); + const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w); + const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w); + const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w); + const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w); + const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w); + + const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w); + const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w); + const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w); + const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w); + + xx_store_128(d, v_r0_w); + xx_store_128(d + 8, v_r1_w); + xx_store_128(d + 16, v_r2_w); + xx_store_128(d + 24, v_r3_w); + + a += 32; + b += 32; + d += 32; + N -= 32; + } while (N); +} diff --git a/third_party/aom/av1/exports_com b/third_party/aom/av1/exports_com new file mode 100644 index 0000000000..5c8e0e09d1 --- /dev/null +++ b/third_party/aom/av1/exports_com @@ -0,0 +1,2 @@ +text aom_read_obu_header_and_size +text av1_resize_frame420 diff --git a/third_party/aom/av1/exports_dec b/third_party/aom/av1/exports_dec new file mode 100644 index 0000000000..daabf6766d --- /dev/null +++ b/third_party/aom/av1/exports_dec @@ -0,0 +1,3 @@ +data aom_codec_av1_dx_algo +text aom_codec_av1_dx +text av1_add_film_grain diff --git a/third_party/aom/av1/exports_enc b/third_party/aom/av1/exports_enc new file mode 100644 index 0000000000..dc4a9eae79 --- /dev/null +++ b/third_party/aom/av1/exports_enc @@ -0,0 +1,2 @@ +data aom_codec_av1_cx_algo +text aom_codec_av1_cx diff --git a/third_party/aom/av1/exports_ident b/third_party/aom/av1/exports_ident new file mode 100644 index 0000000000..b523a679d5 --- /dev/null +++ b/third_party/aom/av1/exports_ident @@ -0,0 +1,2 @@ +text ifd_init +text ifd_inspect diff --git a/third_party/aom/av1/exports_test b/third_party/aom/av1/exports_test new file mode 100644 index 0000000000..dab3775750 --- /dev/null +++ b/third_party/aom/av1/exports_test @@ -0,0 +1,2 @@ +text av1_get_fwd_txfm_cfg +text av1_rtcd diff --git a/third_party/aom/av1/ratectrl_rtc.cc b/third_party/aom/av1/ratectrl_rtc.cc new file mode 100644 index 0000000000..83e88ba480 --- /dev/null +++ b/third_party/aom/av1/ratectrl_rtc.cc @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/ratectrl_rtc.h"
+
+#include <memory>
+#include <new>
+
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/svc_layercontext.h"
+
+namespace aom {
+
+AV1RateControlRtcConfig::AV1RateControlRtcConfig() {
+  width = 1280;
+  height = 720;
+  max_quantizer = 63;
+  min_quantizer = 2;
+  target_bandwidth = 1000;
+  buf_initial_sz = 600;
+  buf_optimal_sz = 600;
+  buf_sz = 1000;
+  undershoot_pct = overshoot_pct = 50;
+  max_intra_bitrate_pct = 50;
+  max_inter_bitrate_pct = 0;
+  frame_drop_thresh = 0;
+  max_consec_drop = 0;
+  framerate = 30.0;
+  ss_number_layers = 1;
+  ts_number_layers = 1;
+  aq_mode = 0;
+  layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
+  ts_rate_decimator[0] = 1;
+  av1_zero(max_quantizers);
+  av1_zero(min_quantizers);
+  av1_zero(scaling_factor_den);
+  av1_zero(scaling_factor_num);
+  av1_zero(layer_target_bitrate);
+  av1_zero(ts_rate_decimator);
+  scaling_factor_num[0] = 1;
+  scaling_factor_den[0] = 1;
+  max_quantizers[0] = max_quantizer;
+  min_quantizers[0] = min_quantizer;
+}
+
+std::unique_ptr<AV1RateControlRTC> AV1RateControlRTC::Create(
+    const AV1RateControlRtcConfig &cfg) {
+  std::unique_ptr<AV1RateControlRTC> rc_api(new (std::nothrow)
+                                                AV1RateControlRTC());
+  if (!rc_api) return nullptr;
+  rc_api->cpi_ = static_cast<AV1_COMP *>(aom_memalign(32, sizeof(*cpi_)));
+  if (!rc_api->cpi_) return nullptr;
+  av1_zero(*rc_api->cpi_);
+  rc_api->cpi_->ppi =
+      static_cast<AV1_PRIMARY *>(aom_memalign(32, sizeof(AV1_PRIMARY)));
+  if (!rc_api->cpi_->ppi) return nullptr;
+  av1_zero(*rc_api->cpi_->ppi);
+  rc_api->cpi_->common.seq_params = &rc_api->cpi_->ppi->seq_params;
+  av1_zero(*rc_api->cpi_->common.seq_params);
+  if (!rc_api->InitRateControl(cfg)) return nullptr;
+  if (cfg.aq_mode) {
+    AV1_COMP *const cpi = rc_api->cpi_;
+    cpi->enc_seg.map = static_cast<uint8_t *>(aom_calloc(
+        cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols,
+        sizeof(*cpi->enc_seg.map)));
+    if (!cpi->enc_seg.map) return nullptr;
+    cpi->cyclic_refresh = av1_cyclic_refresh_alloc(
+        cpi->common.mi_params.mi_rows, cpi->common.mi_params.mi_cols);
+    if (!cpi->cyclic_refresh) return nullptr;
+  }
+  return rc_api;
+}
+
+AV1RateControlRTC::~AV1RateControlRTC() {
+  if (cpi_) {
+    if (cpi_->svc.number_spatial_layers > 1 ||
+        cpi_->svc.number_temporal_layers > 1) {
+      for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
+        for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
+          int layer =
+              LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers);
+          LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
+          aom_free(lc->map);
+        }
+      }
+    }
+    aom_free(cpi_->svc.layer_context);
+    cpi_->svc.layer_context = nullptr;
+
+    if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+      aom_free(cpi_->enc_seg.map);
+      cpi_->enc_seg.map = nullptr;
+      av1_cyclic_refresh_free(cpi_->cyclic_refresh);
+    }
+    aom_free(cpi_->ppi);
+    aom_free(cpi_);
+  }
+}
+ +bool AV1RateControlRTC::InitRateControl(const AV1RateControlRtcConfig &rc_cfg) { + AV1_COMMON *cm = &cpi_->common; + AV1EncoderConfig *oxcf = &cpi_->oxcf; + RATE_CONTROL *const rc = &cpi_->rc; + cm->seq_params->profile = PROFILE_0; + cm->seq_params->bit_depth = AOM_BITS_8; + cm->show_frame = 1; + oxcf->profile = cm->seq_params->profile; + oxcf->mode = REALTIME; + oxcf->rc_cfg.mode = AOM_CBR; + oxcf->pass = AOM_RC_ONE_PASS; + oxcf->q_cfg.aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ; + oxcf->tune_cfg.content = AOM_CONTENT_DEFAULT; + oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; + rc->max_consec_drop = rc_cfg.max_consec_drop; + cpi_->svc.framedrop_mode = AOM_FULL_SUPERFRAME_DROP; + oxcf->tool_cfg.bit_depth = AOM_BITS_8; + oxcf->tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC; + oxcf->algo_cfg.loopfilter_control = LOOPFILTER_ALL; + cm->current_frame.frame_number = 0; + cpi_->ppi->p_rc.kf_boost = DEFAULT_KF_BOOST_RT; + for (auto &lvl_idx : oxcf->target_seq_level_idx) lvl_idx = SEQ_LEVEL_MAX; + + memcpy(cpi_->ppi->level_params.target_seq_level_idx, + oxcf->target_seq_level_idx, sizeof(oxcf->target_seq_level_idx)); + if (!UpdateRateControl(rc_cfg)) return false; + set_sb_size(cm->seq_params, + av1_select_sb_size(oxcf, cm->width, cm->height, + cpi_->svc.number_spatial_layers)); + cpi_->ppi->use_svc = cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1; + av1_primary_rc_init(oxcf, &cpi_->ppi->p_rc); + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + av1_rc_init_minq_luts(); + av1_rc_init(oxcf, rc); + // Enable external rate control. + cpi_->rc.rtc_external_ratectrl = 1; + cpi_->sf.rt_sf.use_nonrd_pick_mode = 1; + return true; +} + +bool AV1RateControlRTC::UpdateRateControl( + const AV1RateControlRtcConfig &rc_cfg) { + if (rc_cfg.ss_number_layers < 1 || + rc_cfg.ss_number_layers > AOM_MAX_SS_LAYERS || + rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > AOM_MAX_TS_LAYERS) { + return false; + } + const int num_layers = rc_cfg.ss_number_layers * rc_cfg.ts_number_layers; + if (num_layers > 1 && !av1_alloc_layer_context(cpi_, num_layers)) { + return false; + } + AV1_COMMON *cm = &cpi_->common; + AV1EncoderConfig *oxcf = &cpi_->oxcf; + RATE_CONTROL *const rc = &cpi_->rc; + initial_width_ = rc_cfg.width; + initial_height_ = rc_cfg.height; + cm->width = rc_cfg.width; + cm->height = rc_cfg.height; + oxcf->frm_dim_cfg.width = rc_cfg.width; + oxcf->frm_dim_cfg.height = rc_cfg.height; + oxcf->rc_cfg.worst_allowed_q = av1_quantizer_to_qindex(rc_cfg.max_quantizer); + oxcf->rc_cfg.best_allowed_q = av1_quantizer_to_qindex(rc_cfg.min_quantizer); + rc->worst_quality = oxcf->rc_cfg.worst_allowed_q; + rc->best_quality = oxcf->rc_cfg.best_allowed_q; + oxcf->input_cfg.init_framerate = rc_cfg.framerate; + oxcf->rc_cfg.target_bandwidth = rc_cfg.target_bandwidth > INT64_MAX / 1000 + ? 
INT64_MAX + : 1000 * rc_cfg.target_bandwidth; + oxcf->rc_cfg.starting_buffer_level_ms = rc_cfg.buf_initial_sz; + oxcf->rc_cfg.optimal_buffer_level_ms = rc_cfg.buf_optimal_sz; + oxcf->rc_cfg.maximum_buffer_size_ms = rc_cfg.buf_sz; + oxcf->rc_cfg.under_shoot_pct = rc_cfg.undershoot_pct; + oxcf->rc_cfg.over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; + rc->max_consec_drop = rc_cfg.max_consec_drop; + oxcf->rc_cfg.max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; + oxcf->rc_cfg.max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct; + cpi_->framerate = rc_cfg.framerate; + if (rc_cfg.is_screen) { + cpi_->oxcf.tune_cfg.content = AOM_CONTENT_SCREEN; + cpi_->is_screen_content_type = 1; + } + cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; + cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; + set_primary_rc_buffer_sizes(oxcf, cpi_->ppi); + enc_set_mb_mi(&cm->mi_params, cm->width, cm->height, BLOCK_8X8); + av1_new_framerate(cpi_, cpi_->framerate); + if (cpi_->svc.number_temporal_layers > 1 || + cpi_->svc.number_spatial_layers > 1) { + int64_t target_bandwidth_svc = 0; + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lc->layer_target_bitrate = 1000 * rc_cfg.layer_target_bitrate[layer]; + lc->max_q = rc_cfg.max_quantizers[layer]; + lc->min_q = rc_cfg.min_quantizers[layer]; + lrc->worst_quality = + av1_quantizer_to_qindex(rc_cfg.max_quantizers[layer]); + lrc->best_quality = + av1_quantizer_to_qindex(rc_cfg.min_quantizers[layer]); + lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl]; + lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl]; + lc->framerate_factor = rc_cfg.ts_rate_decimator[tl]; + if (tl == cpi_->svc.number_temporal_layers - 1) + target_bandwidth_svc += lc->layer_target_bitrate; + } + } + + if (cm->current_frame.frame_number == 0) av1_init_layer_context(cpi_); + // This is needed to initialize external RC flag in layer context structure. + cpi_->rc.rtc_external_ratectrl = 1; + av1_update_layer_context_change_config(cpi_, target_bandwidth_svc); + } + check_reset_rc_flag(cpi_); + return true; +} + +FrameDropDecision AV1RateControlRTC::ComputeQP( + const AV1FrameParamsRTC &frame_params) { + AV1_COMMON *const cm = &cpi_->common; + int width, height; + GF_GROUP *const gf_group = &cpi_->ppi->gf_group; + cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; + cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; + if (cpi_->svc.number_spatial_layers > 1) { + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + av1_get_layer_resolution(initial_width_, initial_height_, + lc->scaling_factor_num, lc->scaling_factor_den, + &width, &height); + cm->width = width; + cm->height = height; + } + enc_set_mb_mi(&cm->mi_params, cm->width, cm->height, BLOCK_8X8); + cm->current_frame.frame_type = frame_params.frame_type; + cpi_->refresh_frame.golden_frame = + (cm->current_frame.frame_type == KEY_FRAME) ? 
1 : 0; + cpi_->sf.rt_sf.use_nonrd_pick_mode = 1; + + if (frame_params.frame_type == kKeyFrame) { + gf_group->update_type[cpi_->gf_frame_index] = KF_UPDATE; + gf_group->frame_type[cpi_->gf_frame_index] = KEY_FRAME; + gf_group->refbuf_state[cpi_->gf_frame_index] = REFBUF_RESET; + if (cpi_->ppi->use_svc) { + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + if (cm->current_frame.frame_number > 0) + av1_svc_reset_temporal_layers(cpi_, 1); + cpi_->svc.layer_context[layer].is_key_frame = 1; + } + } else { + gf_group->update_type[cpi_->gf_frame_index] = LF_UPDATE; + gf_group->frame_type[cpi_->gf_frame_index] = INTER_FRAME; + gf_group->refbuf_state[cpi_->gf_frame_index] = REFBUF_UPDATE; + if (cpi_->ppi->use_svc) { + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + cpi_->svc.layer_context[layer].is_key_frame = 0; + } + } + if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) + cpi_->rc.frames_since_key++; + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + av1_update_temporal_layer_framerate(cpi_); + av1_restore_layer_context(cpi_); + } + int target = 0; + if (cpi_->oxcf.rc_cfg.mode == AOM_CBR) { + if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) + av1_cyclic_refresh_update_parameters(cpi_); + if (frame_is_intra_only(cm)) { + target = av1_calc_iframe_target_size_one_pass_cbr(cpi_); + cpi_->common.current_frame.frame_number = 0; + } else { + target = av1_calc_pframe_target_size_one_pass_cbr( + cpi_, gf_group->update_type[cpi_->gf_frame_index]); + } + } + av1_rc_set_frame_target(cpi_, target, cm->width, cm->height); + // Always drop for spatial enhancement layer if layer bandwidth is 0. + // Otherwise check for frame-dropping based on buffer level in + // av1_rc_drop_frame(). 
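+  // Note that when a frame is dropped, the show-frame and frame-number
+  // counters below are still advanced before returning kDrop.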
+  if ((cpi_->svc.spatial_layer_id > 0 &&
+       cpi_->oxcf.rc_cfg.target_bandwidth == 0) ||
+      av1_rc_drop_frame(cpi_)) {
+    cpi_->is_dropped_frame = true;
+    av1_rc_postencode_update_drop_frame(cpi_);
+    cpi_->frame_index_set.show_frame_count++;
+    cpi_->common.current_frame.frame_number++;
+    return FrameDropDecision::kDrop;
+  }
+  int bottom_index = 0, top_index = 0;
+  cpi_->common.quant_params.base_qindex =
+      av1_rc_pick_q_and_bounds(cpi_, cm->width, cm->height,
+                               cpi_->gf_frame_index, &bottom_index, &top_index);
+  if (cpi_->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+    av1_cyclic_refresh_setup(cpi_);
+  return FrameDropDecision::kOk;
+}
+
+int AV1RateControlRTC::GetQP() const {
+  return cpi_->common.quant_params.base_qindex;
+}
+
+AV1LoopfilterLevel AV1RateControlRTC::GetLoopfilterLevel() const {
+  av1_pick_filter_level(nullptr, cpi_, LPF_PICK_FROM_Q);
+  AV1LoopfilterLevel lpf_level;
+  lpf_level.filter_level[0] = cpi_->common.lf.filter_level[0];
+  lpf_level.filter_level[1] = cpi_->common.lf.filter_level[1];
+  lpf_level.filter_level_u = cpi_->common.lf.filter_level_u;
+  lpf_level.filter_level_v = cpi_->common.lf.filter_level_v;
+
+  return lpf_level;
+}
+
+AV1CdefInfo AV1RateControlRTC::GetCdefInfo() const {
+  av1_pick_cdef_from_qp(&cpi_->common, 0, 0);
+  AV1CdefInfo cdef_level;
+  cdef_level.cdef_strength_y = cpi_->common.cdef_info.cdef_strengths[0];
+  cdef_level.cdef_strength_uv = cpi_->common.cdef_info.cdef_uv_strengths[0];
+  cdef_level.damping = cpi_->common.cdef_info.cdef_damping;
+
+  return cdef_level;
+}
+
+bool AV1RateControlRTC::GetSegmentationData(
+    AV1SegmentationData *segmentation_data) const {
+  if (cpi_->oxcf.q_cfg.aq_mode == 0) {
+    return false;
+  }
+  segmentation_data->segmentation_map = cpi_->enc_seg.map;
+  segmentation_data->segmentation_map_size =
+      cpi_->common.mi_params.mi_rows * cpi_->common.mi_params.mi_cols;
+  segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta;
+  segmentation_data->delta_q_size = 3u;
+  return true;
+}
+
+void AV1RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+  cpi_->common.current_frame.frame_number++;
+  if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1)
+    cpi_->svc.prev_number_spatial_layers = cpi_->svc.number_spatial_layers;
+  av1_rc_postencode_update(cpi_, encoded_frame_size);
+  if (cpi_->svc.number_spatial_layers > 1 ||
+      cpi_->svc.number_temporal_layers > 1)
+    av1_save_layer_context(cpi_);
+}
+
+}  // namespace aom
diff --git a/third_party/aom/av1/ratectrl_rtc.h b/third_party/aom/av1/ratectrl_rtc.h
new file mode 100644
index 0000000000..1894469dd1
--- /dev/null
+++ b/third_party/aom/av1/ratectrl_rtc.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_RATECTRL_RTC_H_
+#define AOM_AV1_RATECTRL_RTC_H_
+
+#include <cstdint>
+#include <memory>
+
+struct AV1_COMP;
+
+namespace aom {
+
+// These constants come from AV1 spec.
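+// kAV1MaxLayers is the product of the spatial and temporal maxima below
+// (4 spatial x 8 temporal = 32 scalability layers), and sizes the per-layer
+// arrays in AV1RateControlRtcConfig.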
+static constexpr size_t kAV1MaxLayers = 32;
+static constexpr size_t kAV1MaxTemporalLayers = 8;
+static constexpr size_t kAV1MaxSpatialLayers = 4;
+
+enum FrameType { kKeyFrame, kInterFrame };
+
+struct AV1RateControlRtcConfig {
+ public:
+  AV1RateControlRtcConfig();
+
+  int width;
+  int height;
+  // Flag indicating if the content is screen or not.
+  bool is_screen = false;
+  // 0-63
+  int max_quantizer;
+  int min_quantizer;
+  int64_t target_bandwidth;
+  int64_t buf_initial_sz;
+  int64_t buf_optimal_sz;
+  int64_t buf_sz;
+  int undershoot_pct;
+  int overshoot_pct;
+  int max_intra_bitrate_pct;
+  int max_inter_bitrate_pct;
+  int frame_drop_thresh;
+  int max_consec_drop;
+  double framerate;
+  int layer_target_bitrate[kAV1MaxLayers];
+  int ts_rate_decimator[kAV1MaxTemporalLayers];
+  int aq_mode;
+  // Number of spatial layers
+  int ss_number_layers;
+  // Number of temporal layers
+  int ts_number_layers;
+  int max_quantizers[kAV1MaxLayers];
+  int min_quantizers[kAV1MaxLayers];
+  int scaling_factor_num[kAV1MaxSpatialLayers];
+  int scaling_factor_den[kAV1MaxSpatialLayers];
+};
+
+struct AV1FrameParamsRTC {
+  FrameType frame_type;
+  int spatial_layer_id;
+  int temporal_layer_id;
+};
+
+struct AV1LoopfilterLevel {
+  int filter_level[2];
+  int filter_level_u;
+  int filter_level_v;
+};
+
+struct AV1CdefInfo {
+  int cdef_strength_y;
+  int cdef_strength_uv;
+  int damping;
+};
+
+struct AV1SegmentationData {
+  const uint8_t *segmentation_map;
+  size_t segmentation_map_size;
+  const int *delta_q;
+  size_t delta_q_size;
+};
+
+enum class FrameDropDecision {
+  kOk,    // Frame is encoded.
+  kDrop,  // Frame is dropped.
+};
+
+class AV1RateControlRTC {
+ public:
+  static std::unique_ptr<AV1RateControlRTC> Create(
+      const AV1RateControlRtcConfig &cfg);
+  ~AV1RateControlRTC();
+
+  bool UpdateRateControl(const AV1RateControlRtcConfig &rc_cfg);
+  // GetQP() needs to be called after ComputeQP() to get the latest QP
+  int GetQP() const;
+  // GetLoopfilterLevel() needs to be called after ComputeQP()
+  AV1LoopfilterLevel GetLoopfilterLevel() const;
+  // GetCdefInfo() needs to be called after ComputeQP()
+  AV1CdefInfo GetCdefInfo() const;
+  // Returns the segmentation map used for cyclic refresh, based on 4x4 blocks.
+  bool GetSegmentationData(AV1SegmentationData *segmentation_data) const;
+  // ComputeQP returns the QP if the frame is not dropped (kOk return),
+  // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate
+  // are not to be called (av1_rc_postencode_update_drop_frame is already
+  // called via ComputeQP if drop is decided).
+  FrameDropDecision ComputeQP(const AV1FrameParamsRTC &frame_params);
+  // Feedback to rate control with the size of current encoded frame
+  void PostEncodeUpdate(uint64_t encoded_frame_size);
+
+ private:
+  AV1RateControlRTC() = default;
+  bool InitRateControl(const AV1RateControlRtcConfig &cfg);
+  AV1_COMP *cpi_;
+  int initial_width_;
+  int initial_height_;
+};
+
+}  // namespace aom
+
+#endif  // AOM_AV1_RATECTRL_RTC_H_
diff --git a/third_party/aom/build/cmake/aom_config.c.template b/third_party/aom/build/cmake/aom_config.c.template
new file mode 100644
index 0000000000..93a6d8f1ad
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_config.c.template
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) @year@, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "aom/aom_codec.h" +static const char* const cfg = "${AOM_CMAKE_CONFIG}"; +const char *aom_codec_build_config(void) {return cfg;} diff --git a/third_party/aom/build/cmake/aom_config_defaults.cmake b/third_party/aom/build/cmake/aom_config_defaults.cmake new file mode 100644 index 0000000000..da7de4b0f4 --- /dev/null +++ b/third_party/aom/build/cmake/aom_config_defaults.cmake @@ -0,0 +1,235 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. + +include("${AOM_ROOT}/build/cmake/util.cmake") + +# This file sets default values for libaom configuration variables. All libaom +# config variables are added to the CMake variable cache via the macros provided +# in util.cmake. + +# +# The variables in this section of the file are detected at configuration time, +# but can be overridden via the use of CONFIG_* and ENABLE_* values also defined +# in this file. +# + +set_aom_detect_var(INLINE "" "Sets INLINE value for current target.") + +# CPUs. +set_aom_detect_var(AOM_ARCH_AARCH64 0 "Enables AArch64 architecture.") +set_aom_detect_var(AOM_ARCH_ARM 0 "Enables ARM architecture.") +set_aom_detect_var(AOM_ARCH_PPC 0 "Enables PPC architecture.") +set_aom_detect_var(AOM_ARCH_X86 0 "Enables X86 architecture.") +set_aom_detect_var(AOM_ARCH_X86_64 0 "Enables X86_64 architecture.") + +# Arm/AArch64 feature flags. +set_aom_detect_var(HAVE_NEON 0 "Enables Neon intrinsics optimizations.") +set_aom_detect_var(HAVE_ARM_CRC32 0 "Enables Arm CRC32 optimizations.") +set_aom_detect_var(HAVE_NEON_DOTPROD 0 + "Enables Armv8.2-A Neon dotprod intrinsics optimizations.") +set_aom_detect_var(HAVE_NEON_I8MM 0 + "Enables Armv8.2-A Neon i8mm intrinsics optimizations.") +set_aom_detect_var(HAVE_SVE 0 "Enables Armv8.2-A SVE intrinsics optimizations.") + +# PPC feature flags. +set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.") + +# x86/x86_64 feature flags. +set_aom_detect_var(HAVE_MMX 0 "Enables MMX optimizations. ") +set_aom_detect_var(HAVE_SSE 0 "Enables SSE optimizations.") +set_aom_detect_var(HAVE_SSE2 0 "Enables SSE2 optimizations.") +set_aom_detect_var(HAVE_SSE3 0 "Enables SSE3 optimizations.") +set_aom_detect_var(HAVE_SSSE3 0 "Enables SSSE3 optimizations.") +set_aom_detect_var(HAVE_SSE4_1 0 "Enables SSE 4.1 optimizations.") +set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.") +set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.") +set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.") + +# Flags describing the build environment. 
+set_aom_detect_var(HAVE_FEXCEPT 0 + "Internal flag, GNU fenv.h present for target.") +set_aom_detect_var(HAVE_PTHREAD_H 0 "Internal flag, target pthread support.") +set_aom_detect_var(HAVE_UNISTD_H 0 + "Internal flag, unistd.h present for target.") +set_aom_detect_var(HAVE_WXWIDGETS 0 "WxWidgets present.") + +# +# Variables in this section can be set from the CMake command line or from +# within the CMake GUI. The variables control libaom features. +# + +# Build configuration flags. +set_aom_config_var(AOM_RTCD_FLAGS "" + "Arguments to pass to rtcd.pl. Separate with ';'") +set_aom_config_var(CONFIG_AV1_DECODER 1 "Enable AV1 decoder.") +set_aom_config_var(CONFIG_AV1_ENCODER 1 "Enable AV1 encoder.") +set_aom_config_var(CONFIG_BIG_ENDIAN 0 "Internal flag.") +set_aom_config_var(CONFIG_FPMT_TEST 0 "Enable FPMT testing.") +set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).") +set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.") +set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.") +set_aom_config_var(CONFIG_LIBYUV 1 "Enables libyuv scaling/conversion support.") + +set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1 + "Build with high bitdepth support.") +set_aom_config_var(CONFIG_AV1_TEMPORAL_DENOISING 0 + "Build with temporal denoising support.") +set_aom_config_var(CONFIG_MULTITHREAD 1 "Multithread support.") +set_aom_config_var(CONFIG_OS_SUPPORT 0 "Internal flag.") +set_aom_config_var(CONFIG_PIC 0 "Build with PIC enabled.") +set_aom_config_var(CONFIG_REALTIME_ONLY 0 + "Build for RTC-only. See aomcx.h for all disabled features.") +set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 "Runtime CPU detection support.") +set_aom_config_var(CONFIG_SHARED 0 "Build shared libs.") +set_aom_config_var(CONFIG_WEBM_IO 1 "Enables WebM support.") + +# Debugging flags. +set_aom_config_var(CONFIG_DEBUG 0 "Enable debug-only code.") +set_aom_config_var(CONFIG_EXCLUDE_SIMD_MISMATCH 0 + "Exclude mismatch in SIMD functions for testing/debugging.") +set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 "Mismatch debugging flag.") + +# AV1 feature flags. +set_aom_config_var(CONFIG_ACCOUNTING 0 "Enables bit accounting.") +set_aom_config_var(CONFIG_ANALYZER 0 "Enables bit stream analyzer.") +set_aom_config_var(CONFIG_COEFFICIENT_RANGE_CHECKING 0 + "Coefficient range check.") +set_aom_config_var(CONFIG_DENOISE 1 + "Denoise/noise modeling support in encoder.") +set_aom_config_var(CONFIG_INSPECTION 0 "Enables bitstream inspection.") +set_aom_config_var(CONFIG_INTERNAL_STATS 0 "Enables internal encoder stats.") +set_aom_config_var(FORCE_HIGHBITDEPTH_DECODING 0 + "Force high bitdepth decoding pipeline on 8-bit input.") +mark_as_advanced(FORCE_HIGHBITDEPTH_DECODING) +set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 + "Max profile to support decoding.") +set_aom_config_var( + CONFIG_NORMAL_TILE_MODE 0 + "Only enables general decoding (disables large scale tile decoding).") +set_aom_config_var(CONFIG_SIZE_LIMIT 0 "Limit max decode width/height.") +set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 "Spatial resampling.") +set_aom_config_var(CONFIG_TUNE_BUTTERAUGLI 0 + "Enable encoding tuning for Butteraugli.") +set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.") +set_aom_config_var(DECODE_HEIGHT_LIMIT 0 "Set limit for decode height.") +set_aom_config_var(DECODE_WIDTH_LIMIT 0 "Set limit for decode width.") +set_aom_config_var(STATIC_LINK_JXL 0 "Statically link the JPEG-XL library.") + +# AV1 experiment flags. 
+set_aom_config_var(CONFIG_BITRATE_ACCURACY 0 + "AV1 experiment: Improve bitrate accuracy.") +set_aom_config_var( + CONFIG_BITRATE_ACCURACY_BL 0 + "AV1 experiment: Baseline of improve bitrate accuracy experiment.") +set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 + "AV1 experiment: Bitstream debugging.") +set_aom_config_var( + CONFIG_COLLECT_COMPONENT_TIMING 0 + "AV1 experiment: Collect encoding component timing information.") +set_aom_config_var( + CONFIG_COLLECT_PARTITION_STATS 0 + "AV1 experiment: Collect partition timing stats. Can be 1 or 2.") +set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment.") +set_aom_config_var( + CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1 + "AV1 experiment: Disable full_pixel_motion_search_based_split on BLOCK_8X8.") +set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment.") +set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment.") +set_aom_config_var(CONFIG_NN_V2 0 + "AV1 experiment: Fully-connected neural nets ver.2.") +set_aom_config_var(CONFIG_OPTICAL_FLOW_API 0 + "AV1 experiment: for optical flow API.") +set_aom_config_var(CONFIG_PARTITION_SEARCH_ORDER 0 + "AV1 experiment: Use alternative partition search order.") +set_aom_config_var(CONFIG_RATECTRL_LOG 0 + "AV1 experiment: Log rate control decision.") +set_aom_config_var(CONFIG_RD_COMMAND 0 + "AV1 experiment: Use external rdmult and q_index.") +set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment.") +set_aom_config_var( + CONFIG_RT_ML_PARTITIONING 0 + "AV1 experiment: Build with ML-based partitioning for Real Time.") +set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment.") +set_aom_config_var(CONFIG_TFLITE 0 + "AV1 experiment: Enable tensorflow lite library.") +set_aom_config_var(CONFIG_THREE_PASS 0 + "AV1 experiment: Enable three-pass encoding.") +set_aom_config_var(CONFIG_OUTPUT_FRAME_SIZE 0 + "AV1 experiment: Output frame size information.") +set_aom_config_var( + CONFIG_SALIENCY_MAP 0 + "AV1 experiment: Enable saliency map based encoding tuning for VMAF.") +set_aom_config_var(CONFIG_CWG_C013 0 + "AV1 experiment: Support for 7.x and 8.x levels.") +# Add this change to make aomenc reported PSNR consistent with libvmaf result. +set_aom_config_var(CONFIG_LIBVMAF_PSNR_PEAK 1 + "Use libvmaf PSNR peak for 10- and 12-bit") + +# +# Variables in this section control optional features of the build system. +# +set_aom_option_var(ENABLE_CCACHE "Enable ccache support." OFF) +set_aom_option_var(ENABLE_DECODE_PERF_TESTS "Enables decoder performance tests" + OFF) +set_aom_option_var(ENABLE_DISTCC "Enable distcc support." OFF) +set_aom_option_var(ENABLE_DOCS + "Enable documentation generation (doxygen required)." ON) +set_aom_option_var(ENABLE_ENCODE_PERF_TESTS "Enables encoder performance tests" + OFF) +set_aom_option_var(ENABLE_EXAMPLES "Enables build of example code." ON) +set_aom_option_var(ENABLE_GOMA "Enable goma support." OFF) +set_aom_option_var( + ENABLE_IDE_TEST_HOSTING + "Enables running tests within IDEs like Visual Studio and Xcode." OFF) +set_aom_option_var(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF) +set_aom_option_var(ENABLE_TESTDATA "Enables unit test data download targets." + ON) +set_aom_option_var(ENABLE_TESTS "Enables unit tests." ON) +set_aom_option_var(ENABLE_TOOLS "Enable applications in tools sub directory." + ON) +set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time." + OFF) + +# Arm/AArch64 assembly/intrinsics flags. +set_aom_option_var(ENABLE_NEON + "Enables Neon optimizations on Arm/AArch64 targets." 
ON) +set_aom_option_var(ENABLE_ARM_CRC32 "Enables Arm CRC32 optimizations." ON) +set_aom_option_var( + ENABLE_NEON_DOTPROD + "Enables Armv8.2-A Neon dotprod optimizations on AArch64 targets." ON) +set_aom_option_var( + ENABLE_NEON_I8MM + "Enables Armv8.2-A Neon i8mm optimizations on AArch64 targets." ON) +set_aom_option_var(ENABLE_SVE + "Enables Armv8.2-A SVE optimizations on AArch64 targets." ON) + +# VSX intrinsics flags. +set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets." + ON) + +# x86/x86_64 assembly/intrinsics flags. +set_aom_option_var(ENABLE_MMX "Enables MMX optimizations on x86/x86_64 targets." + ON) +set_aom_option_var(ENABLE_SSE "Enables SSE optimizations on x86/x86_64 targets." + ON) +set_aom_option_var(ENABLE_SSE2 + "Enables SSE2 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSE3 + "Enables SSE3 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSSE3 + "Enables SSSE3 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSE4_1 + "Enables SSE4_1 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSE4_2 + "Enables SSE4_2 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets." + ON) +set_aom_option_var(ENABLE_AVX2 + "Enables AVX2 optimizations on x86/x86_64 targets." ON) diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake new file mode 100644 index 0000000000..917e7cac5d --- /dev/null +++ b/third_party/aom/build/cmake/aom_configure.cmake @@ -0,0 +1,489 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ +set(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ 1) + +include(FindThreads) + +include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") +include("${AOM_ROOT}/build/cmake/aom_experiment_deps.cmake") +include("${AOM_ROOT}/build/cmake/aom_optimization.cmake") +include("${AOM_ROOT}/build/cmake/compiler_flags.cmake") +include("${AOM_ROOT}/build/cmake/compiler_tests.cmake") +include("${AOM_ROOT}/build/cmake/util.cmake") + +if(DEFINED CONFIG_LOWBITDEPTH) + message(WARNING "CONFIG_LOWBITDEPTH has been removed. \ + Use -DFORCE_HIGHBITDEPTH_DECODING=1 instead of -DCONFIG_LOWBITDEPTH=0 \ + and -DFORCE_HIGHBITDEPTH_DECODING=0 instead of -DCONFIG_LOWBITDEPTH=1.") + if(NOT CONFIG_LOWBITDEPTH) + set(FORCE_HIGHBITDEPTH_DECODING + 1 + CACHE STRING "${cmake_cmdline_helpstring}" FORCE) + endif() +endif() + +if(FORCE_HIGHBITDEPTH_DECODING AND NOT CONFIG_AV1_HIGHBITDEPTH) + change_config_and_warn(CONFIG_AV1_HIGHBITDEPTH 1 + "FORCE_HIGHBITDEPTH_DECODING") +endif() + +if(CONFIG_THREE_PASS AND NOT CONFIG_AV1_DECODER) + change_config_and_warn(CONFIG_THREE_PASS 0 "CONFIG_AV1_DECODER=0") +endif() + +# Generate the user config settings. 
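+# Only variables that were actually set on the cmake command line (recognized
+# via the help string CMake assigns to -D cache entries) are echoed into
+# AOM_CMAKE_CONFIG by the loop below.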
+list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS}) +foreach(cache_var ${aom_build_vars}) + get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING) + if(cache_var_helpstring STREQUAL cmake_cmdline_helpstring) + set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}") + endif() +endforeach() +string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG) + +# Detect target CPU. +if(NOT AOM_TARGET_CPU) + string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase) + if(cpu_lowercase STREQUAL "amd64" OR cpu_lowercase STREQUAL "x86_64") + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + set(AOM_TARGET_CPU "x86") + elseif(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(AOM_TARGET_CPU "x86_64") + else() + message( + FATAL_ERROR "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n" + " CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n" + " CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n" + " CMAKE_GENERATOR=${CMAKE_GENERATOR}\n") + endif() + elseif(cpu_lowercase STREQUAL "i386" OR cpu_lowercase STREQUAL "x86") + set(AOM_TARGET_CPU "x86") + elseif(cpu_lowercase MATCHES "^arm") + set(AOM_TARGET_CPU "${cpu_lowercase}") + elseif(cpu_lowercase MATCHES "aarch64") + set(AOM_TARGET_CPU "arm64") + elseif(cpu_lowercase MATCHES "^ppc") + set(AOM_TARGET_CPU "ppc") + else() + message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not " + "supported, falling back to the generic target") + set(AOM_TARGET_CPU "generic") + endif() +endif() + +if(CMAKE_TOOLCHAIN_FILE) # Add toolchain file to config string. + if(IS_ABSOLUTE "${CMAKE_TOOLCHAIN_FILE}") + file(RELATIVE_PATH toolchain_path "${AOM_CONFIG_DIR}" + "${CMAKE_TOOLCHAIN_FILE}") + else() + set(toolchain_path "${CMAKE_TOOLCHAIN_FILE}") + endif() + set(toolchain_string "-DCMAKE_TOOLCHAIN_FILE=\\\"${toolchain_path}\\\"") + set(AOM_CMAKE_CONFIG "${toolchain_string} ${AOM_CMAKE_CONFIG}") +else() + + # Add detected CPU to the config string. + set(AOM_CMAKE_CONFIG "-DAOM_TARGET_CPU=${AOM_TARGET_CPU} ${AOM_CMAKE_CONFIG}") +endif() +set(AOM_CMAKE_CONFIG "-G \\\"${CMAKE_GENERATOR}\\\" ${AOM_CMAKE_CONFIG}") +file(RELATIVE_PATH source_path "${AOM_CONFIG_DIR}" "${AOM_ROOT}") +set(AOM_CMAKE_CONFIG "cmake ${source_path} ${AOM_CMAKE_CONFIG}") +string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG) + +message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}") +set(AOM_TARGET_SYSTEM ${CMAKE_SYSTEM_NAME}) + +string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase) +if(build_type_lowercase STREQUAL "debug") + set(CONFIG_DEBUG 1) +endif() + +if(BUILD_SHARED_LIBS) + set(CONFIG_PIC 1) + set(CONFIG_SHARED 1) +elseif(NOT CONFIG_PIC) + # Update the variable only when it does not carry the CMake assigned help + # string for variables specified via the command line. This allows the user to + # force CONFIG_PIC=0. 
+ unset(cache_helpstring) + get_property(cache_helpstring CACHE CONFIG_PIC PROPERTY HELPSTRING) + if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}") + aom_check_c_compiles("pie_check" " + #if !(__pie__ || __PIE__) + #error Neither __pie__ or __PIE__ are set + #endif + extern void unused(void); + void unused(void) {}" HAVE_PIE) + + if(HAVE_PIE) + # If -fpie or -fPIE are used ensure the assembly code has PIC enabled to + # avoid DT_TEXTRELs: /usr/bin/ld: warning: creating DT_TEXTREL in a PIE + set(CONFIG_PIC 1) + message( + "CONFIG_PIC enabled for position independent executable (PIE) build") + endif() + endif() + unset(cache_helpstring) +endif() + +if(NOT MSVC) + if(CONFIG_PIC) + + # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to + # work. + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + if(AOM_TARGET_SYSTEM STREQUAL "Linux" + AND AOM_TARGET_CPU MATCHES "^armv[78]") + set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1) + else() + set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC) + endif() + endif() +endif() + +if(AOM_TARGET_CPU STREQUAL "x86" OR AOM_TARGET_CPU STREQUAL "x86_64") + find_program(CMAKE_ASM_NASM_COMPILER yasm $ENV{YASM_PATH}) + if(NOT CMAKE_ASM_NASM_COMPILER OR ENABLE_NASM) + unset(CMAKE_ASM_NASM_COMPILER CACHE) + find_program(CMAKE_ASM_NASM_COMPILER nasm $ENV{NASM_PATH}) + endif() + + include(CheckLanguage) + check_language(ASM_NASM) + if(CMAKE_ASM_NASM_COMPILER) + get_asm_obj_format("objformat") + unset(CMAKE_ASM_NASM_OBJECT_FORMAT) + set(CMAKE_ASM_NASM_OBJECT_FORMAT ${objformat}) + enable_language(ASM_NASM) + if(CMAKE_ASM_NASM_COMPILER_ID STREQUAL "NASM") + test_nasm() + endif() + # Xcode requires building the objects manually, so pass the object format + # flag. + if(XCODE) + set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS}) + endif() + else() + message( + FATAL_ERROR + "Unable to find assembler. Install 'yasm' or 'nasm.' " + "To build without optimizations, add -DAOM_TARGET_CPU=generic to " + "your cmake command line.") + endif() + string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS) +elseif(AOM_TARGET_CPU MATCHES "arm") + if(AOM_TARGET_SYSTEM STREQUAL "Darwin") + if(NOT CMAKE_ASM_COMPILER) + set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) + endif() + set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT}) + elseif(AOM_TARGET_SYSTEM STREQUAL "Windows") + if(NOT CMAKE_ASM_COMPILER) + set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER} -c -mimplicit-it=always) + endif() + else() + if(NOT CMAKE_ASM_COMPILER) + set(CMAKE_ASM_COMPILER as) + endif() + endif() + include(CheckLanguage) + check_language(ASM) + if(NOT CMAKE_ASM_COMPILER) + message( + FATAL_ERROR + "Unable to find assembler and optimizations are enabled." + "Searched for ${CMAKE_ASM_COMPILER}. Install it, add it to your path," + "or set the assembler directly by adding " + "-DCMAKE_ASM_COMPILER= to your CMake command line." 
+ "To build without optimizations, add -DAOM_TARGET_CPU=generic to your " + "cmake command line.") + endif() + enable_language(ASM) + string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS) +endif() + +if(CONFIG_ANALYZER) + find_package(wxWidgets REQUIRED adv base core) + include(${wxWidgets_USE_FILE}) +endif() + +if(NOT MSVC AND CMAKE_C_COMPILER_ID MATCHES "GNU\|Clang") + set(CONFIG_GCC 1) +endif() + +if(CONFIG_GCOV) + message("--- Testing for CONFIG_GCOV support.") + require_linker_flag("-fprofile-arcs -ftest-coverage") + require_compiler_flag("-fprofile-arcs -ftest-coverage" YES) +endif() + +if(CONFIG_GPROF) + message("--- Testing for CONFIG_GPROF support.") + require_compiler_flag("-pg" YES) +endif() + +if(AOM_TARGET_SYSTEM MATCHES "Darwin\|Linux\|Windows\|Android") + set(CONFIG_OS_SUPPORT 1) +endif() + +if(AOM_TARGET_SYSTEM STREQUAL "Windows") + # The default _WIN32_WINNT value in MinGW is 0x0502 (Windows XP with SP2). Set + # it to 0x0601 (Windows 7). + add_compiler_flag_if_supported("-D_WIN32_WINNT=0x0601") + # Quiet warnings related to fopen, printf, etc. + add_compiler_flag_if_supported("-D_CRT_SECURE_NO_WARNINGS") +endif() + +# +# Fix CONFIG_* dependencies. This must be done before including cpu.cmake to +# ensure RTCD_CONFIG_* are properly set. +fix_experiment_configs() + +# Test compiler support. +aom_get_inline("INLINE") + +# Don't just check for pthread.h, but use the result of the full pthreads +# including a linking check in FindThreads above. +set(HAVE_PTHREAD_H ${CMAKE_USE_PTHREADS_INIT}) +aom_check_source_compiles("unistd_check" "#include " HAVE_UNISTD_H) + +if(NOT WIN32) + aom_push_var(CMAKE_REQUIRED_LIBRARIES "m") + aom_check_c_compiles("fenv_check" "#define _GNU_SOURCE + #include + void unused(void) { + (void)unused; + (void)feenableexcept(FE_DIVBYZERO | FE_INVALID); + }" HAVE_FEXCEPT) + aom_pop_var(CMAKE_REQUIRED_LIBRARIES) +endif() + +include("${AOM_ROOT}/build/cmake/cpu.cmake") + +if(ENABLE_CCACHE) + set_compiler_launcher(ENABLE_CCACHE ccache) +endif() + +if(ENABLE_DISTCC) + set_compiler_launcher(ENABLE_DISTCC distcc) +endif() + +if(ENABLE_GOMA) + set_compiler_launcher(ENABLE_GOMA gomacc) +endif() + +if(NOT CONFIG_AV1_DECODER AND NOT CONFIG_AV1_ENCODER) + message(FATAL_ERROR "Decoder and encoder disabled, nothing to build.") +endif() + +if(DECODE_HEIGHT_LIMIT OR DECODE_WIDTH_LIMIT) + change_config_and_warn(CONFIG_SIZE_LIMIT 1 + "DECODE_HEIGHT_LIMIT and DECODE_WIDTH_LIMIT") +endif() + +if(CONFIG_SIZE_LIMIT) + if(NOT DECODE_HEIGHT_LIMIT OR NOT DECODE_WIDTH_LIMIT) + message(FATAL_ERROR "When setting CONFIG_SIZE_LIMIT, DECODE_HEIGHT_LIMIT " + "and DECODE_WIDTH_LIMIT must be set.") + endif() +endif() + +# Test compiler flags. +if(MSVC) + # It isn't possible to specify C99 conformance for MSVC. MSVC doesn't support + # C++ standards modes earlier than C++14. + add_cxx_flag_if_supported("/std:c++14") + add_compiler_flag_if_supported("/W3") + + # Disable MSVC warnings that suggest making code non-portable. + add_compiler_flag_if_supported("/wd4996") + if(ENABLE_WERROR) + add_compiler_flag_if_supported("/WX") + endif() +else() + require_c_flag("-std=c99" YES) + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "GNU" + AND CMAKE_CXX_SIMULATE_ID STREQUAL "MSVC") + # Microsoft's C++ Standard Library requires C++14 as it's MSVC's default and + # minimum supported C++ version. If Clang is using this Standard Library + # implementation, it cannot target C++11. 
+ require_cxx_flag_nomsvc("-std=c++14" YES) + else() + require_cxx_flag_nomsvc("-std=c++11" YES) + endif() + add_compiler_flag_if_supported("-Wall") + add_compiler_flag_if_supported("-Wdisabled-optimization") + add_compiler_flag_if_supported("-Wextra") + # Prior to version 3.19.0 cmake would fail to parse the warning emitted by gcc + # with this flag. Note the order of this check and -Wextra-semi-stmt is + # important due to is_flag_present() matching substrings with string(FIND + # ...). + if(CMAKE_VERSION VERSION_LESS "3.19" + AND CMAKE_C_COMPILER_ID STREQUAL "GNU" + AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 10) + add_cxx_flag_if_supported("-Wextra-semi") + else() + add_compiler_flag_if_supported("-Wextra-semi") + endif() + add_compiler_flag_if_supported("-Wextra-semi-stmt") + add_compiler_flag_if_supported("-Wfloat-conversion") + add_compiler_flag_if_supported("-Wformat=2") + add_c_flag_if_supported("-Wimplicit-function-declaration") + add_compiler_flag_if_supported("-Wlogical-op") + add_compiler_flag_if_supported("-Wpointer-arith") + add_compiler_flag_if_supported("-Wshadow") + add_compiler_flag_if_supported("-Wshorten-64-to-32") + add_compiler_flag_if_supported("-Wsign-compare") + add_compiler_flag_if_supported("-Wstring-conversion") + add_compiler_flag_if_supported("-Wtype-limits") + add_compiler_flag_if_supported("-Wuninitialized") + add_compiler_flag_if_supported("-Wunreachable-code-aggressive") + add_compiler_flag_if_supported("-Wunused") + add_compiler_flag_if_supported("-Wvla") + add_cxx_flag_if_supported("-Wc++14-extensions") + add_cxx_flag_if_supported("-Wc++17-extensions") + add_cxx_flag_if_supported("-Wc++20-extensions") + + if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address|undefined") + + # This combination has more stack overhead, so we account for it by + # providing higher stack limit than usual. + add_c_flag_if_supported("-Wstack-usage=285000") + add_cxx_flag_if_supported("-Wstack-usage=270000") + elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected. + add_c_flag_if_supported("-Wstack-usage=135000") + add_cxx_flag_if_supported("-Wstack-usage=240000") + else() + add_c_flag_if_supported("-Wstack-usage=100000") + add_cxx_flag_if_supported("-Wstack-usage=240000") + endif() + + if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND SANITIZE MATCHES "address") + # Disable no optimization warning when compiling with sanitizers + add_compiler_flag_if_supported("-Wno-disabled-optimization") + endif() + + # Add -Wundef only for C files to avoid massive gtest warning spam. + add_c_flag_if_supported("-Wundef") + + # Quiet gcc 6 vs 7 abi warnings: + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 + if(AOM_TARGET_CPU MATCHES "arm") + add_cxx_flag_if_supported("-Wno-psabi") + endif() + + if(ENABLE_WERROR) + add_compiler_flag_if_supported("-Werror") + endif() + + if(build_type_lowercase MATCHES "rel") + add_compiler_flag_if_supported("-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0") + endif() + add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE") + add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64") +endif() + +# Prior to r23, or with ANDROID_USE_LEGACY_TOOLCHAIN_FILE set, +# android.toolchain.cmake would set normal (non-cache) versions of variables +# like CMAKE_C_FLAGS_RELEASE which would mask the ones added to the cache +# variable in add_compiler_flag_if_supported(), etc. As a workaround we add +# everything accumulated in AOM_C/CXX_FLAGS to the normal versions. 
This could +# also be addressed by reworking the flag tests and adding the results directly +# to target_compile_options() as in e.g., libgav1, but that's a larger task. +# https://github.com/android/ndk/wiki/Changelog-r23#changes +if(ANDROID + AND ("${ANDROID_NDK_MAJOR}" LESS 23 OR ANDROID_USE_LEGACY_TOOLCHAIN_FILE)) + foreach(lang C;CXX) + string(STRIP "${AOM_${lang}_FLAGS}" AOM_${lang}_FLAGS) + if(AOM_${lang}_FLAGS) + foreach(config ${AOM_${lang}_CONFIGS}) + set(${config} "${${config}} ${AOM_${lang}_FLAGS}") + endforeach() + endif() + endforeach() +endif() + +set(AOM_LIB_LINK_TYPE PUBLIC) +if(EMSCRIPTEN) + + # Avoid CMake generation time errors resulting from collisions with the form + # of target_link_libraries() used by Emscripten.cmake. + unset(AOM_LIB_LINK_TYPE) +endif() + +# Generate aom_config templates. +set(aom_config_asm_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake") +set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake") +execute_process( + COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -P + "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake") + +# Generate aom_config.{asm,h}. +configure_file("${aom_config_asm_template}" + "${AOM_CONFIG_DIR}/config/aom_config.asm") +configure_file("${aom_config_h_template}" + "${AOM_CONFIG_DIR}/config/aom_config.h") + +# Read the current git hash. +find_package(Git) +if(NOT GIT_FOUND) + message("--- Git missing, version will be read from CHANGELOG.") +endif() + +string(TIMESTAMP year "%Y") +configure_file("${AOM_ROOT}/build/cmake/aom_config.c.template" + "${AOM_CONFIG_DIR}/config/aom_config.c") + +# Find Perl and generate the RTCD sources. +find_package(Perl) +if(NOT PERL_FOUND) + message(FATAL_ERROR "Perl is required to build libaom.") +endif() + +set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" + "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl") +set(AOM_RTCD_HEADER_FILE_LIST "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" + "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" + "${AOM_CONFIG_DIR}/config/av1_rtcd.h") +set(AOM_RTCD_SOURCE_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" + "${AOM_ROOT}/av1/common/av1_rtcd.c") +set(AOM_RTCD_SYMBOL_LIST aom_dsp_rtcd aom_scale_rtcd av1_rtcd) +list(LENGTH AOM_RTCD_SYMBOL_LIST AOM_RTCD_CUSTOM_COMMAND_COUNT) +math(EXPR AOM_RTCD_CUSTOM_COMMAND_COUNT "${AOM_RTCD_CUSTOM_COMMAND_COUNT} - 1") + +foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT}) + list(GET AOM_RTCD_CONFIG_FILE_LIST ${NUM} AOM_RTCD_CONFIG_FILE) + list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE) + list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE) + list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL) + execute_process( + COMMAND + ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl" + --arch=${AOM_TARGET_CPU} + --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS} + --config=${AOM_CONFIG_DIR}/config/aom_config.h ${AOM_RTCD_CONFIG_FILE} + OUTPUT_FILE ${AOM_RTCD_HEADER_FILE}) +endforeach() + +# Generate aom_version.h. 
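The RTCD foreach() loop above pairs up the Nth entry of each of the four lists; unrolled, its last iteration is equivalent to this single call (same files and symbol as in the lists):

  execute_process(
    COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl"
            --arch=${AOM_TARGET_CPU} --sym=av1_rtcd ${AOM_RTCD_FLAGS}
            --config=${AOM_CONFIG_DIR}/config/aom_config.h
            "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
    OUTPUT_FILE "${AOM_CONFIG_DIR}/config/av1_rtcd.h")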
+execute_process(COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake") diff --git a/third_party/aom/build/cmake/aom_experiment_deps.cmake b/third_party/aom/build/cmake/aom_experiment_deps.cmake new file mode 100644 index 0000000000..3bbeb0c874 --- /dev/null +++ b/third_party/aom/build/cmake/aom_experiment_deps.cmake @@ -0,0 +1,24 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ +set(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ 1) + +# Adjusts CONFIG_* CMake variables to address conflicts between active AV1 +# experiments. +macro(fix_experiment_configs) + + if(CONFIG_ANALYZER) + change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER) + endif() + +endmacro() diff --git a/third_party/aom/build/cmake/aom_install.cmake b/third_party/aom/build/cmake/aom_install.cmake new file mode 100644 index 0000000000..2c263e96b9 --- /dev/null +++ b/third_party/aom/build/cmake/aom_install.cmake @@ -0,0 +1,98 @@ +# +# Copyright (c) 2018, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom.h" + "${AOM_ROOT}/aom/aom_codec.h" "${AOM_ROOT}/aom/aom_frame_buffer.h" + "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h") + +if(CONFIG_AV1_DECODER) + list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom_decoder.h" + "${AOM_ROOT}/aom/aomdx.h") +endif() + +if(CONFIG_AV1_ENCODER) + list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aomcx.h" + "${AOM_ROOT}/aom/aom_encoder.h" + "${AOM_ROOT}/aom/aom_external_partition.h") +endif() + +# Generate aom.pc and setup dependencies to ensure it is created when necessary. +# Note: aom.pc generation uses GNUInstallDirs: +# https://cmake.org/cmake/help/latest/module/GNUInstallDirs.html +macro(setup_aom_install_targets) + if(NOT XCODE) + include("GNUInstallDirs") + set(AOM_PKG_CONFIG_FILE "${AOM_CONFIG_DIR}/aom.pc") + + # Create a library target for creating aom.pc. + create_no_op_source_file(aom_pc c AOM_PKG_CONFIG_SOURCES) + add_library(aom_pc ${AOM_PKG_CONFIG_SOURCES}) + + # Setup a rule to generate aom.pc. 
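fix_experiment_configs() and the earlier size-limit handling both lean on change_config_and_warn(), which lives in build/cmake/util.cmake and is not part of this hunk; as a rough, approximate sketch it behaves like:

  macro(change_config_and_warn feature value cause)
    if(NOT ${feature} EQUAL ${value})
      set(${feature} ${value})
      message(WARNING "--- ${cause}: forcing ${feature}=${value}")
    endif()
  endmacro()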
+ add_custom_command( + OUTPUT "${AOM_PKG_CONFIG_FILE}" + COMMAND ${CMAKE_COMMAND} ARGS + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR} + -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR} + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} + -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME} + -DCMAKE_THREAD_LIBS_INIT=${CMAKE_THREAD_LIBS_INIT} + -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD} + -DCONFIG_TUNE_VMAF=${CONFIG_TUNE_VMAF} + -DCONFIG_TUNE_BUTTERAUGLI=${CONFIG_TUNE_BUTTERAUGLI} + -DCONFIG_SALIENCY_MAP=${CONFIG_SALIENCY_MAP} + -DCONFIG_TFLITE=${CONFIG_TFLITE} + -P + "${AOM_ROOT}/build/cmake/pkg_config.cmake" + COMMENT "Writing aom.pc" + VERBATIM) + + # Explicitly add a dependency on the pkg-config file to ensure it's built. + get_property(aom_pc_sources TARGET aom_pc PROPERTY SOURCES) + set_source_files_properties(${aom_pc_sources} OBJECT_DEPENDS + "${AOM_PKG_CONFIG_FILE}") + + # Our pkg-config file carries version information: add a dependency on the + # version rule. + add_dependencies(aom_pc aom_version) + + if(CONFIG_AV1_DECODER) + if(ENABLE_EXAMPLES) + list(APPEND AOM_INSTALL_BINS aomdec) + endif() + endif() + + if(CONFIG_AV1_ENCODER) + if(ENABLE_EXAMPLES) + list(APPEND AOM_INSTALL_BINS aomenc) + endif() + endif() + + if(BUILD_SHARED_LIBS) + set(AOM_INSTALL_LIBS aom aom_static) + else() + set(AOM_INSTALL_LIBS aom) + endif() + + # Setup the install rules. install() will automatically prepend + # CMAKE_INSTALL_PREFIX to relative paths + install(FILES ${AOM_INSTALL_INCS} + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/aom") + install(FILES "${AOM_PKG_CONFIG_FILE}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") + install(TARGETS ${AOM_INSTALL_LIBS};${AOM_INSTALL_BINS} + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}") + endif() +endmacro() diff --git a/third_party/aom/build/cmake/aom_optimization.cmake b/third_party/aom/build/cmake/aom_optimization.cmake new file mode 100644 index 0000000000..0f93228eef --- /dev/null +++ b/third_party/aom/build/cmake/aom_optimization.cmake @@ -0,0 +1,279 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ +set(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ 1) + +include("${AOM_ROOT}/build/cmake/util.cmake") + +# Translate $flag to one which MSVC understands, and write the new flag to the +# variable named by $translated_flag (or unset it, when MSVC needs no flag). +function(get_msvc_intrinsic_flag flag translated_flag) + if("${flag}" STREQUAL "-mavx") + set(${translated_flag} "/arch:AVX" PARENT_SCOPE) + elseif("${flag}" STREQUAL "-mavx2") + set(${translated_flag} "/arch:AVX2" PARENT_SCOPE) + else() + + # MSVC does not need flags for intrinsics flavors other than AVX/AVX2. + unset(${translated_flag} PARENT_SCOPE) + endif() +endfunction() + +# Adds an object library target. 
Terminates generation if $flag is not supported +# by the current compiler. $flag is the intrinsics flag required by the current +# compiler, and is added to the compile flags for all sources in $sources. +# $opt_name is used to name the target. $target_to_update is made dependent upon +# the created target. +# +# Note: this function always updates the aom, and aom_static targets because +# OBJECT libraries have rules that disallow the direct addition of .o files to +# them as dependencies. Static and shared libraries do not have this limitation. +function(add_intrinsics_object_library flag opt_name target_to_update sources) + if("${${sources}}" STREQUAL "") + return() + endif() + set(target_name ${target_to_update}_${opt_name}_intrinsics) + add_library(${target_name} OBJECT ${${sources}}) + set_property(TARGET ${target_name} PROPERTY FOLDER ${AOM_TARGET_CPU}) + + # MSVC does not need flags for intrinsics flavors other than AVX/AVX2. + # However, for clang-cl, the default is SSE2, and the MSVC frontend does not + # provide any flags to enable SSE3 up to SSE4.1. So we need to restrict the + # usage of MSVC-style flags to only the real MSVC. + if(CMAKE_C_COMPILER_ID STREQUAL "MSVC") + get_msvc_intrinsic_flag("${flag}" "flag") + endif() + + if("${flag}" STREQUAL "-mavx2") + unset(FLAG_SUPPORTED) + check_c_compiler_flag("-mno-avx256-split-unaligned-load" FLAG_SUPPORTED) + if(${FLAG_SUPPORTED}) + set(flag "${flag} -mno-avx256-split-unaligned-load") + endif() + + unset(FLAG_SUPPORTED) + check_c_compiler_flag("-mno-avx256-split-unaligned-store" FLAG_SUPPORTED) + if(${FLAG_SUPPORTED}) + set(flag "${flag} -mno-avx256-split-unaligned-store") + endif() + endif() + + if(flag) + separate_arguments(flag) + target_compile_options(${target_name} PUBLIC ${flag}) + endif() + + target_sources(aom PRIVATE $<TARGET_OBJECTS:${target_name}>) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $<TARGET_OBJECTS:${target_name}>) + endif() + + # Add the new lib target to the global list of aom library targets. + list(APPEND AOM_LIB_TARGETS ${target_name}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() + +# Adds sources in list named by $sources to $target and adds $flag to the +# compile flags for each source file. +function(add_intrinsics_source_to_target flag target sources) + target_sources(${target} PRIVATE ${${sources}}) + if(MSVC) + get_msvc_intrinsic_flag("${flag}" "flag") + endif() + if(flag) + foreach(source ${${sources}}) + set_property(SOURCE ${source} APPEND PROPERTY COMPILE_FLAGS ${flag}) + endforeach() + endif() +endfunction() + +# Writes object format for the current target to the var named by $out_format, +# or terminates the build when the object format for the current target is +# unknown.
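A typical add_intrinsics_object_library() call site passes the compiler flag, a short flavor name, the target to update, and the name of a source-list variable (the function dereferences it via ${${sources}}); roughly as the x86 source lists elsewhere in the tree do, for example:

  add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
                                "AOM_DSP_COMMON_INTRIN_AVX2")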
+function(get_asm_obj_format out_format) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(objformat "macho64") + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "CYGWIN" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + set(objformat "win64") + else() + set(objformat "elf64") + endif() + elseif("${AOM_TARGET_CPU}" STREQUAL "x86") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(objformat "macho32") + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "CYGWIN" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + set(objformat "win32") + else() + set(objformat "elf32") + endif() + else() + message( + FATAL_ERROR "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}") + endif() + + set(${out_format} ${objformat} PARENT_SCOPE) +endfunction() + +# Adds library target named $lib_name for ASM files in variable named by +# $asm_sources. Builds an output directory path from $lib_name. Links $lib_name +# into the aom library target(s). Generates a C file with an unused no-op +# function to ensure that all cmake generators can determine the linker +# language, and that build tools don't complain that an object exposes no +# symbols. +# +# In Xcode-based builds every step described above happens twice, and +# directory/target/object names are updated to include _shared and _static +# suffixes. +function(add_asm_library lib_name asm_sources) + if("${${asm_sources}}" STREQUAL "") + return() + endif() + + if(XCODE) + # CMake's generator does not output a build rule for Nasm files. Moreover, + # it makes Xcode believe Nasm files are of type "sourcecode" instead of + # "sourcecode.nasm", which prevents even the default rule from applying. + # This default rule is broken, though, because it doesn't apply any of the + # flags specified for ASM_NASM. See https://discourse.cmake.org/t/building- + # nasm-files-with-xcode/7934 + list(APPEND asm_configs "static") + if(BUILD_SHARED_LIBS) + list(APPEND asm_configs "shared") + endif() + + set(as_executable "${CMAKE_ASM_NASM_COMPILER}") + if(NOT as_executable) + set(as_executable "${CMAKE_ASM_COMPILER}") + endif() + + foreach(asm_config ${asm_configs}) + set(asm_lib_name ${lib_name}_${asm_config}) + set(asm_lib_obj_dir "${AOM_CONFIG_DIR}/asm_objects/${asm_lib_name}") + if(NOT EXISTS "${asm_lib_obj_dir}") + file(MAKE_DIRECTORY "${asm_lib_obj_dir}") + endif() + + foreach(asm_source ${${asm_sources}}) + get_filename_component(asm_source_name "${asm_source}" NAME) + set(asm_object "${asm_lib_obj_dir}/${asm_source_name}.o") + add_custom_command(OUTPUT "${asm_object}" + COMMAND ${as_executable} ARGS ${AOM_AS_FLAGS} + -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/ -o + "${asm_object}" "${asm_source}" + DEPENDS "${asm_source}" + COMMENT "Building ASM object ${asm_object}" + WORKING_DIRECTORY "${AOM_CONFIG_DIR}" + VERBATIM) + if(BUILD_SHARED_LIBS AND "${asm_config}" STREQUAL "static") + target_sources(aom_static PRIVATE "${asm_object}") + else() + target_sources(aom PRIVATE "${asm_object}") + endif() + endforeach() + endforeach() + else() + # For non-Xcode generators, CMake does not need extra help. The language + # support takes care of it. 
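For reference, get_asm_obj_format() above is a pure table lookup on the target tuple:

  get_asm_obj_format("objformat")
  # x86_64: Darwin -> macho64, Windows/MSYS/CYGWIN -> win64, otherwise elf64
  # x86:    Darwin -> macho32, Windows/MSYS/CYGWIN -> win32, otherwise elf32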
+ set(asm_lib_name ${lib_name}) + + add_library(${asm_lib_name} OBJECT ${${asm_sources}}) + target_include_directories(${asm_lib_name} + PRIVATE ${AOM_ROOT} ${AOM_CONFIG_DIR}) + target_compile_options(${asm_lib_name} PRIVATE ${AOM_AS_FLAGS}) + set_property(TARGET ${asm_lib_name} PROPERTY FOLDER ${AOM_TARGET_CPU}) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE "$") + endif() + target_sources(aom PRIVATE "$") + + # Add the new lib target to the global list of aom library targets. + list(APPEND AOM_LIB_TARGETS ${asm_lib_name}) + endif() + + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() + +# Terminates generation if nasm found in PATH does not meet requirements. +# Currently checks only for presence of required object formats and support for +# the -Ox argument (multipass optimization). +function(test_nasm) + execute_process(COMMAND ${CMAKE_ASM_NASM_COMPILER} -hf + OUTPUT_VARIABLE nasm_helptext) + + if(NOT "${nasm_helptext}" MATCHES "-Ox") + message( + FATAL_ERROR "Unsupported nasm: multipass optimization not supported.") + endif() + + if("${AOM_TARGET_CPU}" STREQUAL "x86") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + if(NOT "${nasm_helptext}" MATCHES "macho32") + message( + FATAL_ERROR "Unsupported nasm: macho32 object format not supported.") + endif() + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + if(NOT "${nasm_helptext}" MATCHES "win32") + message( + FATAL_ERROR "Unsupported nasm: win32 object format not supported.") + endif() + else() + if(NOT "${nasm_helptext}" MATCHES "elf32") + message( + FATAL_ERROR "Unsupported nasm: elf32 object format not supported.") + endif() + endif() + else() + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + if(NOT "${nasm_helptext}" MATCHES "macho64") + message( + FATAL_ERROR "Unsupported nasm: macho64 object format not supported.") + endif() + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + if(NOT "${nasm_helptext}" MATCHES "win64") + message( + FATAL_ERROR "Unsupported nasm: win64 object format not supported.") + endif() + else() + if(NOT "${nasm_helptext}" MATCHES "elf64") + message( + FATAL_ERROR "Unsupported nasm: elf64 object format not supported.") + endif() + endif() + endif() +endfunction() + +# Adds build command for generation of rtcd C source files using +# build/cmake/rtcd.pl. $config is the input perl file, $output is the output C +# include file, $source is the C source file, and $symbol is used for the symbol +# argument passed to rtcd.pl. +function(add_rtcd_build_step config output source symbol) + add_custom_command( + OUTPUT ${output} + COMMAND ${PERL_EXECUTABLE} ARGS "${AOM_ROOT}/build/cmake/rtcd.pl" + --arch=${AOM_TARGET_CPU} + --sym=${symbol} ${AOM_RTCD_FLAGS} + --config=${AOM_CONFIG_DIR}/config/aom_config.h ${config} > ${output} + DEPENDS "${AOM_ROOT}/build/cmake/rtcd.pl" ${config} + COMMENT "Generating ${output}" + WORKING_DIRECTORY ${AOM_CONFIG_DIR} + VERBATIM) + set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output}) + set_property(SOURCE ${output} PROPERTY GENERATED TRUE) +endfunction() diff --git a/third_party/aom/build/cmake/compiler_flags.cmake b/third_party/aom/build/cmake/compiler_flags.cmake new file mode 100644 index 0000000000..f008b964f5 --- /dev/null +++ b/third_party/aom/build/cmake/compiler_flags.cmake @@ -0,0 +1,385 @@ +# +# Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ +set(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ 1) + +include(CheckCCompilerFlag) +include(CheckCXXCompilerFlag) +include("${AOM_ROOT}/build/cmake/compiler_tests.cmake") + +# Strings used to cache flags. +set(AOM_C_FLAGS) +set(AOM_CXX_FLAGS) +set(AOM_EXE_LINKER_FLAGS) +set(AOM_FAILED_C_FLAGS) +set(AOM_FAILED_CXX_FLAGS) + +# Sets variable named by $out_is_present to YES in the caller's scope when $flag +# is found in the string variable named by $flag_cache. Sets the var to NO +# otherwise. +function(is_flag_present flag_cache flag out_is_present) + string(FIND "${${flag_cache}}" "${flag}" flag_pos) + if(${flag_pos} EQUAL -1) + set(${out_is_present} NO PARENT_SCOPE) + else() + set(${out_is_present} YES PARENT_SCOPE) + endif() +endfunction() + +# Appends $flag to $flags. Ignores scope via use of FORCE with set() call. +function(append_flag flags flag) + string(FIND "${${flags}}" "${flag}" found) + if(${found} EQUAL -1) + set(${flags} "${${flags}} ${flag}" CACHE STRING "" FORCE) + endif() +endfunction() + +# Checks C compiler for support of $c_flag. Adds $c_flag to all +# $CMAKE_C_FLAGS_s stored in AOM_C_CONFIGS when the compile test passes. +# Caches $c_flag in $AOM_C_FLAGS or $AOM_FAILED_C_FLAGS depending on test +# outcome. +function(add_c_flag_if_supported c_flag) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok) + is_flag_present(AOM_FAILED_C_FLAGS "${c_flag}" flag_failed) + if(${flag_ok} OR ${flag_failed}) + return() + endif() + + # Between 3.17.0 and 3.18.2 check_c_compiler_flag() sets a normal variable at + # parent scope while check_cxx_source_compiles() continues to set an internal + # cache variable, so we unset both to avoid the failure / success state + # persisting between checks. See + # https://gitlab.kitware.com/cmake/cmake/-/issues/21207. + unset(C_FLAG_SUPPORTED) + unset(C_FLAG_SUPPORTED CACHE) + message("Checking C compiler flag support for: " ${c_flag}) + check_c_compiler_flag("${c_flag}" C_FLAG_SUPPORTED) + + if(${C_FLAG_SUPPORTED}) + append_flag(AOM_C_FLAGS "${c_flag}") + foreach(config ${AOM_C_CONFIGS}) + unset(C_FLAG_FOUND) + append_flag("${config}" "${c_flag}") + endforeach() + else() + append_flag(AOM_FAILED_C_FLAGS "${c_flag}") + endif() +endfunction() + +# Checks C++ compiler for support of $cxx_flag. Adds $cxx_flag to all +# $CMAKE_CXX_FLAGS_s stored in AOM_CXX_CONFIGS when the compile test +# passes. Caches $cxx_flag in $AOM_CXX_FLAGS or $AOM_FAILED_CXX_FLAGS depending +# on test outcome. 
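add_rtcd_build_step(), defined at the end of aom_optimization.cmake above, is driven with the same config/header/source/symbol tuples listed earlier; an illustrative call for the av1 component looks like:

  add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
                      "${AOM_CONFIG_DIR}/config/av1_rtcd.h"
                      "${AOM_ROOT}/av1/common/av1_rtcd.c" "av1_rtcd")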
+function(add_cxx_flag_if_supported cxx_flag) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok) + is_flag_present(AOM_FAILED_CXX_FLAGS "${cxx_flag}" flag_failed) + if(${flag_ok} OR ${flag_failed}) + return() + endif() + + # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal variable + # at parent scope while check_cxx_source_compiles() continues to set an + # internal cache variable, so we unset both to avoid the failure / success + # state persisting between checks. See + # https://gitlab.kitware.com/cmake/cmake/-/issues/21207. + unset(CXX_FLAG_SUPPORTED) + unset(CXX_FLAG_SUPPORTED CACHE) + message("Checking C++ compiler flag support for: " ${cxx_flag}) + check_cxx_compiler_flag("${cxx_flag}" CXX_FLAG_SUPPORTED) + + if(${CXX_FLAG_SUPPORTED}) + append_flag(AOM_CXX_FLAGS "${cxx_flag}") + foreach(config ${AOM_CXX_CONFIGS}) + unset(CXX_FLAG_FOUND) + append_flag("${config}" "${cxx_flag}") + endforeach() + else() + append_flag(AOM_FAILED_CXX_FLAGS "${cxx_flag}") + endif() +endfunction() + +# Convenience method for adding a flag to both the C and C++ compiler command +# lines. +function(add_compiler_flag_if_supported flag) + add_c_flag_if_supported(${flag}) + add_cxx_flag_if_supported(${flag}) +endfunction() + +# Checks C compiler for support of $c_flag and terminates generation when +# support is not present. +function(require_c_flag c_flag update_c_flags) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok) + if(${flag_ok}) + return() + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}") + endif() + + unset(HAVE_C_FLAG CACHE) + message("Checking C compiler flag support for: " ${c_flag}) + check_c_compiler_flag("${c_flag}" HAVE_C_FLAG) + if(NOT HAVE_C_FLAG) + message( + FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${c_flag}.") + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_pop_var(CMAKE_EXE_LINKER_FLAGS) + endif() + + append_flag(AOM_C_FLAGS "${c_flag}") + if(update_c_flags) + foreach(config ${AOM_C_CONFIGS}) + set(${config} "${${config}} ${c_flag}" CACHE STRING "" FORCE) + endforeach() + endif() +endfunction() + +# Checks CXX compiler for support of $cxx_flag and terminates generation when +# support is not present. +function(require_cxx_flag cxx_flag update_cxx_flags) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok) + if(${flag_ok}) + return() + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}") + endif() + + unset(HAVE_CXX_FLAG CACHE) + message("Checking C compiler flag support for: " ${cxx_flag}) + check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG) + if(NOT HAVE_CXX_FLAG) + message( + FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${cxx_flag}.") + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_pop_var(CMAKE_EXE_LINKER_FLAGS) + endif() + + append_flag(AOM_CXX_FLAGS "${cxx_flag}") + if(update_cxx_flags) + foreach(config ${AOM_CXX_CONFIGS}) + set(${config} "${${config}} ${cxx_flag}" CACHE STRING "" FORCE) + endforeach() + endif() +endfunction() + +# Checks for support of $flag by both the C and CXX compilers. Terminates +# generation when support is not present in both compilers. 
+function(require_compiler_flag flag update_cmake_flags) + require_c_flag(${flag} ${update_cmake_flags}) + require_cxx_flag(${flag} ${update_cmake_flags}) +endfunction() + +# Checks only non-MSVC targets for support of $c_flag and terminates generation +# when support is not present. +function(require_c_flag_nomsvc c_flag update_c_flags) + if(NOT MSVC) + require_c_flag(${c_flag} ${update_c_flags}) + endif() +endfunction() + +# Checks only non-MSVC targets for support of $cxx_flag and terminates +# generation when support is not present. +function(require_cxx_flag_nomsvc cxx_flag update_cxx_flags) + if(NOT MSVC) + require_cxx_flag(${cxx_flag} ${update_cxx_flags}) + endif() +endfunction() + +# Checks only non-MSVC targets for support of $flag by both the C and CXX +# compilers. Terminates generation when support is not present in both +# compilers. +function(require_compiler_flag_nomsvc flag update_cmake_flags) + require_c_flag_nomsvc(${flag} ${update_cmake_flags}) + require_cxx_flag_nomsvc(${flag} ${update_cmake_flags}) +endfunction() + +# Adds $preproc_def to C compiler command line (as -D$preproc_def) if not +# already present. +function(add_c_preproc_definition preproc_def) + set(preproc_def "-D${preproc_def}") + is_flag_present(AOM_C_FLAGS "${preproc_def}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_C_CONFIGS}) + set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE) + endforeach() +endfunction() + +# Adds $preproc_def to CXX compiler command line (as -D$preproc_def) if not +# already present. +function(add_cxx_preproc_definition preproc_def) + set(preproc_def "-D${preproc_def}") + is_flag_present(AOM_CXX_FLAGS "${preproc_def}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_CXX_CONFIGS}) + set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE) + endforeach() +endfunction() + +# Adds $preproc_def to C and CXX compiler command line (as -D$preproc_def) if +# not already present. +function(add_preproc_definition preproc_def) + add_c_preproc_definition(${preproc_def}) + add_cxx_preproc_definition(${preproc_def}) +endfunction() + +# Adds $flag to assembler command line. +function(append_as_flag flag) + is_flag_present(AOM_AS_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + append_flag(AOM_AS_FLAGS "${flag}") +endfunction() + +# Adds $flag to the C compiler command line. +function(append_c_flag flag) + is_flag_present(AOM_C_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_C_CONFIGS}) + append_flag(${config} "${flag}") + endforeach() +endfunction() + +# Adds $flag to the CXX compiler command line. +function(append_cxx_flag flag) + is_flag_present(AOM_CXX_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_CXX_CONFIGS}) + append_flag(${config} "${flag}") + endforeach() +endfunction() + +# Adds $flag to the C and CXX compiler command lines. +function(append_compiler_flag flag) + append_c_flag(${flag}) + append_cxx_flag(${flag}) +endfunction() + +# Adds $flag to the executable linker command line when not present. +function(append_exe_linker_flag flag) + is_flag_present(AOM_EXE_LINKER_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + + append_flag(AOM_EXE_LINKER_FLAGS "${flag}") + foreach(config ${AOM_EXE_LINKER_CONFIGS}) + append_flag(${config} "${flag}") + endforeach() +endfunction() + +# Adds $flag to the link flags for $target. 
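The append_*/add_* helpers above are the primitives the top-level CMakeLists.txt calls; a few illustrative invocations (the particular flags are examples only):

  add_preproc_definition(_LARGEFILE_SOURCE)   # -D_LARGEFILE_SOURCE for C and C++
  append_as_flag("-DPIC")                     # assembler flag, deduplicated first
  append_exe_linker_flag("-Wl,--as-needed")   # added to every active linker config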
+function(append_link_flag_to_target target flag) + unset(target_link_flags) + get_target_property(target_link_flags ${target} LINK_FLAGS) + + if(target_link_flags) + is_flag_present(target_link_flags "${flag}" flag_found) + if(${flag_found}) + return() + endif() + set(target_link_flags "${target_link_flags} ${flag}") + else() + set(target_link_flags "${flag}") + endif() + + set_target_properties(${target} PROPERTIES LINK_FLAGS ${target_link_flags}) +endfunction() + +# Adds $flag to executable linker flags, and makes sure C/CXX builds still work. +function(require_linker_flag flag) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + append_exe_linker_flag(${flag}) + + unset(c_passed) + aom_check_c_compiles("LINKER_FLAG_C_TEST(${flag})" "" c_passed) + unset(cxx_passed) + aom_check_cxx_compiles("LINKER_FLAG_CXX_TEST(${flag})" "" cxx_passed) + + if(NOT c_passed OR NOT cxx_passed) + message(FATAL_ERROR "Linker flag test for ${flag} failed.") + endif() +endfunction() + +# Appends flags in $AOM_EXTRA__FLAGS variables to the flags used at build +# time. +function(set_user_flags) + + # Linker flags are handled first because some C/CXX flags require that a + # linker flag is present at link time. + if(AOM_EXTRA_EXE_LINKER_FLAGS) + is_flag_present(AOM_EXE_LINKER_FLAGS "${AOM_EXTRA_EXE_LINKER_FLAGS}" + extra_present) + if(NOT ${extra_present}) + require_linker_flag("${AOM_EXTRA_EXE_LINKER_FLAGS}") + endif() + endif() + if(AOM_EXTRA_AS_FLAGS) + + # TODO(tomfinegan): assembler flag testing would be a good thing to have. + is_flag_present(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}" extra_present) + if(NOT ${extra_present}) + append_flag(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}") + endif() + endif() + if(AOM_EXTRA_C_FLAGS) + is_flag_present(AOM_C_FLAGS "${AOM_EXTRA_C_FLAGS}" extra_present) + if(NOT ${extra_present}) + require_c_flag("${AOM_EXTRA_C_FLAGS}" YES) + endif() + endif() + if(AOM_EXTRA_CXX_FLAGS) + is_flag_present(AOM_CXX_FLAGS "${AOM_EXTRA_CXX_FLAGS}" extra_present) + if(NOT ${extra_present}) + require_cxx_flag("${AOM_EXTRA_CXX_FLAGS}" YES) + endif() + endif() +endfunction() diff --git a/third_party/aom/build/cmake/compiler_tests.cmake b/third_party/aom/build/cmake/compiler_tests.cmake new file mode 100644 index 0000000000..0402832253 --- /dev/null +++ b/third_party/aom/build/cmake/compiler_tests.cmake @@ -0,0 +1,179 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ +set(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ 1) + +include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) + +# CMake passes command line flags like this: +# +# * $compiler $lang_flags $lang_flags_config ... +# +# To ensure the flags tested here and elsewhere are obeyed a list of active +# build configuration types is built, and flags are applied to the flag strings +# for each configuration currently active for C and CXX builds as determined by +# reading $CMAKE_CONFIGURATION_TYPES and $CMAKE_BUILD_TYPE. 
When +# $CMAKE_CONFIGURATION_TYPES is non-empty a multi- configuration generator is in +# use: currently this includes MSVC and Xcode. For other generators +# $CMAKE_BUILD_TYPE is used. For both cases AOM__CONFIGS is populated with +# CMake string variable names that contain flags for the currently available +# configuration(s). +unset(AOM_C_CONFIGS) +unset(AOM_CXX_CONFIGS) +list(LENGTH CMAKE_CONFIGURATION_TYPES num_configs) +if(${num_configs} GREATER 0) + foreach(config ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER ${config} config) + list(APPEND AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}") + list(APPEND AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}") + list(APPEND AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}") + endforeach() +else() + string(TOUPPER ${CMAKE_BUILD_TYPE} config) + set(AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}") + set(AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}") + set(AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}") +endif() + +# The basic main() function used in all compile tests. +set(AOM_C_MAIN "\nint main(void) { return 0; }") +set(AOM_CXX_MAIN "\nint main() { return 0; }") + +# Strings containing the names of passed and failed tests. +set(AOM_C_PASSED_TESTS) +set(AOM_C_FAILED_TESTS) +set(AOM_CXX_PASSED_TESTS) +set(AOM_CXX_FAILED_TESTS) + +function(aom_push_var var new_value) + set(SAVED_${var} ${${var}} PARENT_SCOPE) + set(${var} "${${var}} ${new_value}" PARENT_SCOPE) +endfunction() + +function(aom_pop_var var) + set(var ${SAVED_${var}} PARENT_SCOPE) + unset(SAVED_${var} PARENT_SCOPE) +endfunction() + +# Confirms $test_source compiles and stores $test_name in one of +# $AOM_C_PASSED_TESTS or $AOM_C_FAILED_TESTS depending on out come. When the +# test passes $result_var is set to 1. When it fails $result_var is unset. The +# test is not run if the test name is found in either of the passed or failed +# test variables. +function(aom_check_c_compiles test_name test_source result_var) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + unset(C_TEST_PASSED CACHE) + unset(C_TEST_FAILED CACHE) + string(FIND "${AOM_C_PASSED_TESTS}" "${test_name}" C_TEST_PASSED) + string(FIND "${AOM_C_FAILED_TESTS}" "${test_name}" C_TEST_FAILED) + if(${C_TEST_PASSED} EQUAL -1 AND ${C_TEST_FAILED} EQUAL -1) + unset(C_TEST_COMPILED CACHE) + message("Running C compiler test: ${test_name}") + check_c_source_compiles("${test_source} ${AOM_C_MAIN}" C_TEST_COMPILED) + set(${result_var} ${C_TEST_COMPILED} PARENT_SCOPE) + + if(C_TEST_COMPILED) + set(AOM_C_PASSED_TESTS + "${AOM_C_PASSED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + else() + set(AOM_C_FAILED_TESTS + "${AOM_C_FAILED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + message("C Compiler test ${test_name} failed.") + endif() + elseif(NOT ${C_TEST_PASSED} EQUAL -1) + set(${result_var} 1 PARENT_SCOPE) + else() # ${C_TEST_FAILED} NOT EQUAL -1 + unset(${result_var} PARENT_SCOPE) + endif() +endfunction() + +# Confirms $test_source compiles and stores $test_name in one of +# $AOM_CXX_PASSED_TESTS or $AOM_CXX_FAILED_TESTS depending on out come. When the +# test passes $result_var is set to 1. When it fails $result_var is unset. The +# test is not run if the test name is found in either of the passed or failed +# test variables. 
+function(aom_check_cxx_compiles test_name test_source result_var) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + unset(CXX_TEST_PASSED CACHE) + unset(CXX_TEST_FAILED CACHE) + string(FIND "${AOM_CXX_PASSED_TESTS}" "${test_name}" CXX_TEST_PASSED) + string(FIND "${AOM_CXX_FAILED_TESTS}" "${test_name}" CXX_TEST_FAILED) + if(${CXX_TEST_PASSED} EQUAL -1 AND ${CXX_TEST_FAILED} EQUAL -1) + unset(CXX_TEST_COMPILED CACHE) + message("Running CXX compiler test: ${test_name}") + check_cxx_source_compiles("${test_source} ${AOM_CXX_MAIN}" + CXX_TEST_COMPILED) + set(${result_var} ${CXX_TEST_COMPILED} PARENT_SCOPE) + + if(CXX_TEST_COMPILED) + set(AOM_CXX_PASSED_TESTS + "${AOM_CXX_PASSED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + else() + set(AOM_CXX_FAILED_TESTS + "${AOM_CXX_FAILED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + message("CXX Compiler test ${test_name} failed.") + endif() + elseif(NOT ${CXX_TEST_PASSED} EQUAL -1) + set(${result_var} 1 PARENT_SCOPE) + else() # ${CXX_TEST_FAILED} NOT EQUAL -1 + unset(${result_var} PARENT_SCOPE) + endif() +endfunction() + +# Convenience function that confirms $test_source compiles as C and C++. +# $result_var is set to 1 when both tests are successful, and 0 when one or both +# tests fail. Note: This function is intended to be used to write to result +# variables that are expanded via configure_file(). $result_var is set to 1 or 0 +# to allow direct usage of the value in generated source files. +function(aom_check_source_compiles test_name test_source result_var) + unset(C_PASSED) + unset(CXX_PASSED) + aom_check_c_compiles(${test_name} ${test_source} C_PASSED) + aom_check_cxx_compiles(${test_name} ${test_source} CXX_PASSED) + if(C_PASSED AND CXX_PASSED) + set(${result_var} 1 PARENT_SCOPE) + else() + set(${result_var} 0 PARENT_SCOPE) + endif() +endfunction() + +# When inline support is detected for the current compiler the supported +# inlining keyword is written to $result in caller scope. +function(aom_get_inline result) + aom_check_source_compiles("inline_check_1" + "static inline void function(void) {}" + HAVE_INLINE_1) + if(HAVE_INLINE_1 EQUAL 1) + set(${result} "inline" PARENT_SCOPE) + return() + endif() + + # Check __inline. + aom_check_source_compiles("inline_check_2" + "static __inline void function(void) {}" + HAVE_INLINE_2) + if(HAVE_INLINE_2 EQUAL 1) + set(${result} "__inline" PARENT_SCOPE) + endif() +endfunction() diff --git a/third_party/aom/build/cmake/cpu.cmake b/third_party/aom/build/cmake/cpu.cmake new file mode 100644 index 0000000000..a9b7a67070 --- /dev/null +++ b/third_party/aom/build/cmake/cpu.cmake @@ -0,0 +1,108 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
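The keyword selected by aom_get_inline() above is stored by the earlier aom_get_inline("INLINE") call and substituted into the generated aom_config.h, typically as:

  #define INLINE inline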
+# + +if("${AOM_TARGET_CPU}" STREQUAL "arm64") + set(AOM_ARCH_ARM 1) + set(AOM_ARCH_AARCH64 1) + set(RTCD_ARCH_ARM "yes") + + set(ARM64_FLAVORS "NEON;ARM_CRC32;NEON_DOTPROD;NEON_I8MM;SVE") + set(AOM_ARM_CRC32_DEFAULT_FLAG "-march=armv8-a+crc") + set(AOM_NEON_DOTPROD_DEFAULT_FLAG "-march=armv8.2-a+dotprod") + set(AOM_NEON_I8MM_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm") + set(AOM_SVE_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm+sve") + + # Check that the compiler flag to enable each flavor is supported by the + # compiler. This may not be the case for new architecture features on old + # compiler versions. + foreach(flavor ${ARM64_FLAVORS}) + if(ENABLE_${flavor} AND NOT DEFINED AOM_${flavor}_FLAG) + set(AOM_${flavor}_FLAG "${AOM_${flavor}_DEFAULT_FLAG}") + unset(FLAG_SUPPORTED) + check_c_compiler_flag("${AOM_${flavor}_FLAG}" FLAG_SUPPORTED) + if(NOT ${FLAG_SUPPORTED}) + set(ENABLE_${flavor} 0) + endif() + endif() + endforeach() + + # SVE requires that the Neon-SVE bridge header is also available. + if(ENABLE_SVE) + set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_SVE_FLAG}") + aom_check_source_compiles("arm_neon_sve_bridge_available" " +#ifndef __ARM_NEON_SVE_BRIDGE +#error 1 +#endif +#include +#include " HAVE_SVE_HEADERS) + set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) + if(HAVE_SVE_HEADERS EQUAL 0) + set(ENABLE_SVE 0) + endif() + endif() + + foreach(flavor ${ARM64_FLAVORS}) + if(ENABLE_${flavor}) + set(HAVE_${flavor} 1) + set(RTCD_HAVE_${flavor} "yes") + else() + set(HAVE_${flavor} 0) + string(TOLOWER ${flavor} flavor) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor}) + endif() + endforeach() + +elseif("${AOM_TARGET_CPU}" MATCHES "^arm") + set(AOM_ARCH_ARM 1) + set(RTCD_ARCH_ARM "yes") + + if(ENABLE_NEON) + set(HAVE_NEON 1) + set(RTCD_HAVE_NEON "yes") + else() + set(HAVE_NEON 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon) + endif() + +elseif("${AOM_TARGET_CPU}" MATCHES "ppc") + set(AOM_ARCH_PPC 1) + set(RTCD_ARCH_PPC "yes") + + if(ENABLE_VSX) + set(HAVE_VSX 1) + set(RTCD_HAVE_VSX "yes") + else() + set(HAVE_VSX 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-vsx) + endif() +elseif("${AOM_TARGET_CPU}" MATCHES "^x86") + if("${AOM_TARGET_CPU}" STREQUAL "x86") + set(AOM_ARCH_X86 1) + set(RTCD_ARCH_X86 "yes") + elseif("${AOM_TARGET_CPU}" STREQUAL "x86_64") + set(AOM_ARCH_X86_64 1) + set(RTCD_ARCH_X86_64 "yes") + endif() + + set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2") + foreach(flavor ${X86_FLAVORS}) + if(ENABLE_${flavor} AND NOT disable_remaining_flavors) + set(HAVE_${flavor} 1) + set(RTCD_HAVE_${flavor} "yes") + else() + set(disable_remaining_flavors 1) + set(HAVE_${flavor} 0) + string(TOLOWER ${flavor} flavor) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor}) + endif() + endforeach() +endif() diff --git a/third_party/aom/build/cmake/dist.cmake b/third_party/aom/build/cmake/dist.cmake new file mode 100644 index 0000000000..5b9fc95d41 --- /dev/null +++ b/third_party/aom/build/cmake/dist.cmake @@ -0,0 +1,64 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. 
If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) + +# Converts spaces in $in_string to semicolons and writes the output to +# $out_string. In CMake's eyes this converts the input string to a list. +function(listify_string in_string out_string) + string(REPLACE " " ";" ${out_string} ${in_string}) + set(${out_string} "${${out_string}}" PARENT_SCOPE) +endfunction() + +set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR" "AOM_DIST_INCLUDES" + "AOM_DIST_LIBS" "ENABLE_DOCS") + +foreach(arg ${REQUIRED_ARGS}) + if("${${arg}}" STREQUAL "") + message(FATAL_ERROR "${arg} must not be empty.") + endif() +endforeach() + +if(ENABLE_DOCS) + file(INSTALL "${AOM_CONFIG_DIR}/docs" DESTINATION "${AOM_DIST_DIR}") +endif() + +if(AOM_DIST_EXAMPLES) + listify_string("${AOM_DIST_EXAMPLES}" "AOM_DIST_EXAMPLES") + foreach(example ${AOM_DIST_EXAMPLES}) + if(NOT "${example}" MATCHES "aomdec\|aomenc") + file(INSTALL "${example}" DESTINATION "${AOM_DIST_DIR}/bin/examples") + endif() + endforeach() +endif() + +if(AOM_DIST_TOOLS) + listify_string("${AOM_DIST_TOOLS}" "AOM_DIST_TOOLS") + foreach(tool ${AOM_DIST_TOOLS}) + file(INSTALL "${tool}" DESTINATION "${AOM_DIST_DIR}/bin/tools") + endforeach() +endif() + +if(AOM_DIST_APPS) + listify_string("${AOM_DIST_APPS}" "AOM_DIST_APPS") + foreach(app ${AOM_DIST_APPS}) + file(INSTALL "${app}" DESTINATION "${AOM_DIST_DIR}/bin") + endforeach() +endif() + +listify_string("${AOM_DIST_INCLUDES}" "AOM_DIST_INCLUDES") +foreach(inc ${AOM_DIST_INCLUDES}) + file(INSTALL "${inc}" DESTINATION "${AOM_DIST_DIR}/include/aom") +endforeach() + +listify_string("${AOM_DIST_LIBS}" "AOM_DIST_LIBS") +foreach(lib ${AOM_DIST_LIBS}) + file(INSTALL "${lib}" DESTINATION "${AOM_DIST_DIR}/lib") +endforeach() diff --git a/third_party/aom/build/cmake/exports.cmake b/third_party/aom/build/cmake/exports.cmake new file mode 100644 index 0000000000..1cea2b52ab --- /dev/null +++ b/third_party/aom/build/cmake/exports.cmake @@ -0,0 +1,76 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_EXPORTS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_EXPORTS_CMAKE_ +set(AOM_BUILD_CMAKE_EXPORTS_CMAKE_ 1) + +include("${AOM_ROOT}/build/cmake/exports_sources.cmake") + +# Creates the custom target which handles generation of the symbol export lists. 
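One consequence of the x86 flavor loop in cpu.cmake above is that the SIMD levels are ordered: disabling one also disables everything after it via disable_remaining_flavors. For example, with an illustrative configure line:

  cmake path/to/aom -DENABLE_SSE4_1=0
  # HAVE_SSE4_1, HAVE_SSE4_2, HAVE_AVX and HAVE_AVX2 all become 0, and rtcd.pl
  # is passed --disable-sse4_1 --disable-sse4_2 --disable-avx --disable-avx2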
+function(setup_exports_target) + if(APPLE) + set(symbol_file_ext "syms") + elseif(WIN32) + set(symbol_file_ext "def") + else() + set(symbol_file_ext "ver") + endif() + + set(aom_sym_file "${AOM_CONFIG_DIR}/libaom.${symbol_file_ext}") + + add_custom_target( + generate_exports + COMMAND ${CMAKE_COMMAND} + -DAOM_ROOT="${AOM_ROOT}" + -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}" + -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM} + -DAOM_SYM_FILE="${aom_sym_file}" + -DAOM_MSVC=${MSVC} + -DAOM_XCODE=${XCODE} + -DCMAKE_SHARED_LIBRARY_PREFIX="${CMAKE_SHARED_LIBRARY_PREFIX}" + -DCONFIG_NAME=$ + -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER} + -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER} + -DCONFIG_INSPECTION=${CONFIG_INSPECTION} + -DENABLE_TESTS=${ENABLE_TESTS} + -P + "${AOM_ROOT}/build/cmake/generate_exports.cmake" + SOURCES ${AOM_EXPORTS_SOURCES} + DEPENDS ${AOM_EXPORTS_SOURCES} BYPRODUCTS ${aom_sym_file}) + + # Make libaom depend on the exports file, and set flags to pick it up when + # creating the dylib. + add_dependencies(aom generate_exports) + + if(APPLE) + set_property(TARGET aom + APPEND_STRING + PROPERTY LINK_FLAGS "-exported_symbols_list ${aom_sym_file}") + elseif(WIN32) + if(MSVC) + set_property(TARGET aom + APPEND_STRING + PROPERTY LINK_FLAGS "/DEF:${aom_sym_file}") + else() + # For MinGW and MSYS compilers, you can use either version scripts or + # module definition files. If the latter, it must be supplied as an + # "object". + set_property(TARGET aom + APPEND_STRING + PROPERTY LINK_FLAGS "${aom_sym_file}") + endif() + else() + set_property(TARGET aom + APPEND_STRING + PROPERTY LINK_FLAGS "-Wl,--version-script,${aom_sym_file}") + endif() +endfunction() diff --git a/third_party/aom/build/cmake/exports_sources.cmake b/third_party/aom/build/cmake/exports_sources.cmake new file mode 100644 index 0000000000..46bf001d86 --- /dev/null +++ b/third_party/aom/build/cmake/exports_sources.cmake @@ -0,0 +1,35 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ +set(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ 1) + +list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_com" + "${AOM_ROOT}/av1/exports_com") + +if(CONFIG_AV1_DECODER) + list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_dec" + "${AOM_ROOT}/av1/exports_dec") + if(CONFIG_INSPECTION) + list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/av1/exports_ident") + endif() +endif() + +if(CONFIG_AV1_ENCODER) + list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_enc" + "${AOM_ROOT}/av1/exports_enc") +endif() + +if(ENABLE_TESTS) + list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_test" + "${AOM_ROOT}/av1/exports_test") +endif() diff --git a/third_party/aom/build/cmake/generate_aom_config_templates.cmake b/third_party/aom/build/cmake/generate_aom_config_templates.cmake new file mode 100644 index 0000000000..529daaf02a --- /dev/null +++ b/third_party/aom/build/cmake/generate_aom_config_templates.cmake @@ -0,0 +1,92 @@ +# +# Copyright (c) 2017, Alliance for Open Media. 
All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) + +string(TIMESTAMP year "%Y") +set(asm_file_header_block "\; +\; Copyright (c) ${year}, Alliance for Open Media. All rights reserved +\; +\; This source code is subject to the terms of the BSD 2 Clause License and +\; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +\; was not distributed with this source code in the LICENSE file, you can +\; obtain it at www.aomedia.org/license/software. If the Alliance for Open +\; Media Patent License 1.0 was not distributed with this source code in the +\; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +\; +") +set(h_file_header_block "/* + * Copyright (c) ${year}, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +\#ifndef AOM_CONFIG_H_ +\#define AOM_CONFIG_H_ +") +set(cmake_file_header_block "## +## Copyright (c) ${year}, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +") + +# Terminates cmake execution when $var_name is an empty string, or the variable +# name it contains does not expand to an existing directory. 
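The exports_com/exports_dec/exports_enc/exports_test files collected above use a one-symbol-per-line format that generate_exports.cmake (below) parses, a 'text' or 'data' tag followed by the symbol name; an illustrative excerpt:

  text aom_codec_version
  text aom_codec_error
  data aom_codec_av1_dx_algo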
+function(check_directory_var var_name) + if("${var_name}" STREQUAL "") + message(FATAL_ERROR "The CMake variable ${var_name} must be defined.") + endif() + + if(NOT EXISTS "${${var_name}}") + message(FATAL_ERROR "${${var_name}} (${var_name}) missing.") + endif() +endfunction() + +check_directory_var(AOM_CONFIG_DIR) +check_directory_var(AOM_ROOT) + +set(AOM_DEFAULTS "${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") +if(NOT EXISTS "${AOM_DEFAULTS}") + message( + FATAL_ERROR "Configuration default values file (${AOM_DEFAULTS}) missing.") +endif() + +include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") +list(APPEND aom_build_vars ${AOM_DETECT_VARS} ${AOM_CONFIG_VARS}) +list(SORT aom_build_vars) + +set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake") +file(WRITE "${aom_config_h_template}" ${h_file_header_block}) +foreach(aom_var ${aom_build_vars}) + if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") + file(APPEND "${aom_config_h_template}" + "\#define ${aom_var} \${${aom_var}}\n") + endif() +endforeach() +file(APPEND "${aom_config_h_template}" "\#endif // AOM_CONFIG_H_") + +set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake") +file(WRITE "${aom_asm_config_template}" ${asm_file_header_block}) +foreach(aom_var ${aom_build_vars}) + if(NOT "${aom_var}" STREQUAL "INLINE" + AND NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") + file(APPEND "${aom_asm_config_template}" "${aom_var} equ \${${aom_var}}\n") + endif() +endforeach() diff --git a/third_party/aom/build/cmake/generate_exports.cmake b/third_party/aom/build/cmake/generate_exports.cmake new file mode 100644 index 0000000000..3a5f67cea6 --- /dev/null +++ b/third_party/aom/build/cmake/generate_exports.cmake @@ -0,0 +1,69 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
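After configure_file() runs in the main CMakeLists.txt, each variable written by the two template loops above becomes a concrete line in the generated files, e.g. (values depend on the build):

  aom_config.h:    #define CONFIG_AV1_ENCODER 1
  aom_config.asm:  CONFIG_AV1_ENCODER equ 1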
+# +cmake_minimum_required(VERSION 3.5) + +# CMAKE_SHARED_LIBRARY_PREFIX can be empty +set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM" "AOM_SYM_FILE" + "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER") + +foreach(arg ${REQUIRED_ARGS}) + if("${${arg}}" STREQUAL "") + message(FATAL_ERROR "${arg} must not be empty.") + endif() +endforeach() + +include("${AOM_ROOT}/build/cmake/exports_sources.cmake") + +if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(symbol_prefix "_") +elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS") + file(WRITE "${AOM_SYM_FILE}" "LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}aom\n" + "EXPORTS\n") +else() + set(symbol_suffix ";") +endif() + +set(aom_sym_file "${AOM_SYM_FILE}") + +if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + file(REMOVE "${aom_sym_file}") +elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS") + file(WRITE "${aom_sym_file}" "LIBRARY ${CMAKE_SHARED_LIBRARY_PREFIX}aom\n" + "EXPORTS\n") +else() + file(WRITE "${aom_sym_file}" "{\nglobal:\n") +endif() + +foreach(export_file ${AOM_EXPORTS_SOURCES}) + file(STRINGS "${export_file}" exported_file_data) + set(exported_symbols "${exported_symbols} ${exported_file_data};") + string(STRIP "${exported_symbols}" exported_symbols) +endforeach() + +foreach(exported_symbol ${exported_symbols}) + string(STRIP "${exported_symbol}" exported_symbol) + if("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS") + string(SUBSTRING ${exported_symbol} 0 4 export_type) + string(COMPARE EQUAL "${export_type}" "data" is_data) + if(is_data) + set(symbol_suffix " DATA") + else() + set(symbol_suffix "") + endif() + endif() + string(REGEX REPLACE "text \|data " "" "exported_symbol" "${exported_symbol}") + set(exported_symbol " ${symbol_prefix}${exported_symbol}${symbol_suffix}") + file(APPEND "${aom_sym_file}" "${exported_symbol}\n") +endforeach() + +if("${aom_sym_file}" MATCHES "ver$") + file(APPEND "${aom_sym_file}" " \nlocal:\n *;\n};") +endif() diff --git a/third_party/aom/build/cmake/pkg_config.cmake b/third_party/aom/build/cmake/pkg_config.cmake new file mode 100644 index 0000000000..c4f94808a5 --- /dev/null +++ b/third_party/aom/build/cmake/pkg_config.cmake @@ -0,0 +1,69 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) + +set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX" + "CMAKE_INSTALL_BINDIR" "CMAKE_INSTALL_INCLUDEDIR" + "CMAKE_INSTALL_LIBDIR" "CMAKE_PROJECT_NAME" + "CONFIG_MULTITHREAD") + +foreach(arg ${REQUIRED_ARGS}) + if("${${arg}}" STREQUAL "") + message(FATAL_ERROR "${arg} must not be empty.") + endif() +endforeach() + +include("${AOM_ROOT}/build/cmake/util.cmake") + +extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" aom_version) + +# Create a version string suitable for comparison using the RPM version compare +# algorithm: strip out everything after the number. 
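+# For example, an aom_version of "3.8.1-281-g1234abc" (a git description,
+# shown here only as an illustration) is trimmed to a package_version of
+# "3.8.1" below; a plain "3.8.1" is kept unchanged.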
+string(FIND "${aom_version}" "-" dash_pos) +if(${dash_pos} EQUAL -1) + set(package_version "${aom_version}") +else() + string(SUBSTRING "${aom_version}" 0 ${dash_pos} package_version) +endif() + +# Write pkg-config info. +set(prefix "${CMAKE_INSTALL_PREFIX}") +set(bindir "${CMAKE_INSTALL_BINDIR}") +set(includedir "${CMAKE_INSTALL_INCLUDEDIR}") +set(libdir "${CMAKE_INSTALL_LIBDIR}") +set(pkgconfig_file "${AOM_CONFIG_DIR}/aom.pc") +string(TOLOWER ${CMAKE_PROJECT_NAME} pkg_name) +file(WRITE "${pkgconfig_file}" "# libaom pkg-config.\n") +file(APPEND "${pkgconfig_file}" "prefix=${prefix}\n") +file(APPEND "${pkgconfig_file}" "exec_prefix=\${prefix}\n") +file(APPEND "${pkgconfig_file}" "includedir=\${prefix}/${includedir}\n") +file(APPEND "${pkgconfig_file}" "libdir=\${exec_prefix}/${libdir}\n\n") +file(APPEND "${pkgconfig_file}" "Name: ${pkg_name}\n") +file( + APPEND "${pkgconfig_file}" + "Description: Alliance for Open Media AV1 codec library v${aom_version}.\n") +file(APPEND "${pkgconfig_file}" "Version: ${package_version}\n") +file(APPEND "${pkgconfig_file}" "Requires:") +if(CONFIG_TUNE_VMAF) + file(APPEND "${pkgconfig_file}" " libvmaf") +endif() +if(CONFIG_TUNE_BUTTERAUGLI) + file(APPEND "${pkgconfig_file}" " libjxl") +endif() +file(APPEND "${pkgconfig_file}" "\nConflicts:\n") +file(APPEND "${pkgconfig_file}" "Libs: -L\${libdir} -l${pkg_name}\n") +if(CONFIG_MULTITHREAD AND CMAKE_THREAD_LIBS_INIT) + file(APPEND "${pkgconfig_file}" + "Libs.private: -lm ${CMAKE_THREAD_LIBS_INIT}\n") +else() + file(APPEND "${pkgconfig_file}" "Libs.private: -lm\n") +endif() +file(APPEND "${pkgconfig_file}" "Cflags: -I\${includedir}\n") diff --git a/third_party/aom/build/cmake/rtcd.pl b/third_party/aom/build/cmake/rtcd.pl new file mode 100755 index 0000000000..1cf52f076c --- /dev/null +++ b/third_party/aom/build/cmake/rtcd.pl @@ -0,0 +1,430 @@ +#!/usr/bin/env perl +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +no strict 'refs'; +use warnings; +use Getopt::Long; +Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32; + +my %ALL_FUNCS = (); +my @ALL_ARCHS; +my @ALL_FORWARD_DECLS; +my @REQUIRES; + +my %opts = (); +my %disabled = (); +my %required = (); + +my @argv; +foreach (@ARGV) { + $disabled{$1} = 1, next if /--disable-(.*)/; + $required{$1} = 1, next if /--require-(.*)/; + push @argv, $_; +} + +# NB: use GetOptions() instead of GetOptionsFromArray() for compatibility. 
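+# An illustrative invocation (the exact paths and flags are supplied by the
+# CMake build rather than hard-coded here):
+#   rtcd.pl --arch=x86_64 --sym=av1_rtcd --config=config/aom_config.h \
+#     av1/common/av1_rtcd_defs.pl > config/av1_rtcd.h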
+@ARGV = @argv;
+GetOptions(
+  \%opts,
+  'arch=s',
+  'sym=s',
+  'config=s',
+);
+
+foreach my $opt (qw/arch config/) {
+  if (!defined($opts{$opt})) {
+    warn "--$opt is required!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+foreach my $defs_file (@ARGV) {
+  if (!-f $defs_file) {
+    warn "$defs_file: $!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+open CONFIG_FILE, $opts{config} or
+  die "Error opening config file '$opts{config}': $!\n";
+
+my %config = ();
+while (<CONFIG_FILE>) {
+  next if !/^#define\s+(?:CONFIG_|HAVE_)/;
+  chomp;
+  my @line_components = split /\s/;
+  scalar @line_components > 2 or
+    die "Invalid input passed to rtcd.pl via $opts{config}.";
+  # $line_components[0] = #define
+  # $line_components[1] = flag name (CONFIG_SOMETHING or HAVE_SOMETHING)
+  # $line_components[2] = flag value (0 or 1)
+  $config{$line_components[1]} = "$line_components[2]" eq "1" ? "yes" : "";
+}
+close CONFIG_FILE;
+
+#
+# Routines for the RTCD DSL to call
+#
+sub aom_config($) {
+  return (defined $config{$_[0]}) ? $config{$_[0]} : "";
+}
+
+sub specialize {
+  if (@_ <= 1) {
+    die "'specialize' must be called with a function name and at least one ",
+        "architecture ('C' is implied): \n@_\n";
+  }
+  my $fn=$_[0];
+  shift;
+  foreach my $opt (@_) {
+    eval "\$${fn}_${opt}=${fn}_${opt}";
+  }
+}
+
+sub add_proto {
+  my $fn = splice(@_, -2, 1);
+  my @proto = @_;
+  foreach (@proto) { tr/\t/ / }
+  $ALL_FUNCS{$fn} = \@proto;
+  specialize $fn, "c";
+}
+
+sub require {
+  foreach my $fn (keys %ALL_FUNCS) {
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+
+      # if we already have a default, then we can disable it, as we know
+      # we can do better.
+      my $best = eval "\$${fn}_default";
+      if ($best) {
+        my $best_ofn = eval "\$${best}";
+        if ($best_ofn && "$best_ofn" ne "$ofn") {
+          eval "\$${best}_link = 'false'";
+        }
+      }
+      eval "\$${fn}_default=${fn}_${opt}";
+      eval "\$${fn}_${opt}_link='true'";
+    }
+  }
+}
+
+sub forward_decls {
+  push @ALL_FORWARD_DECLS, @_;
+}
+
+#
+# Include the user's directives
+#
+foreach my $f (@ARGV) {
+  open FILE, "<", $f or die "cannot open $f: $!\n";
+  my $contents = join('', <FILE>);
+  close FILE;
+  eval $contents or warn "eval failed: $@\n";
+}
+
+#
+# Process the directives according to the command line
+#
+sub process_forward_decls() {
+  foreach (@ALL_FORWARD_DECLS) {
+    $_->();
+  }
+}
+
+sub determine_indirection {
+  aom_config("CONFIG_RUNTIME_CPU_DETECT") eq "yes" or &require(@ALL_ARCHS);
+  foreach my $fn (keys %ALL_FUNCS) {
+    my $n = "";
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      my $link = eval "\$${fn}_${opt}_link";
+      next if $link && $link eq "false";
+      $n .= "x";
+    }
+    if ($n eq "x") {
+      eval "\$${fn}_indirect = 'false'";
+    } else {
+      eval "\$${fn}_indirect = 'true'";
+    }
+  }
+}
+
+sub declare_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      print "$rtyp ${ofn}($args);\n";
+    }
+    if (eval "\$${fn}_indirect" eq "false") {
+      print "#define ${fn} ${dfn}\n";
+    } else {
+      print "RTCD_EXTERN $rtyp (*${fn})($args);\n";
+    }
+    print "\n";
+  }
+}
+
+sub set_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args =
pop @val; + my $rtyp = "@val"; + my $dfn = eval "\$${fn}_default"; + $dfn = eval "\$${dfn}"; + if (eval "\$${fn}_indirect" eq "true") { + print " $fn = $dfn;\n"; + foreach my $opt (@_) { + my $ofn = eval "\$${fn}_${opt}"; + next if !$ofn; + next if "$ofn" eq "$dfn"; + my $link = eval "\$${fn}_${opt}_link"; + next if $link && $link eq "false"; + my $cond = eval "\$have_${opt}"; + print " if (${cond}) $fn = $ofn;\n" + } + } + } +} + +sub filter { + my @filtered; + foreach (@_) { push @filtered, $_ unless $disabled{$_}; } + return @filtered; +} + +# +# Helper functions for generating the arch specific RTCD files +# +sub common_top() { + my $include_guard = uc($opts{sym})."_H_"; + print < \$version_data, + 'version_filename=s' => \$version_filename) or + die("Invalid arg(s): $!"); + +if (!defined $version_data || length($version_data) == 0 || + !defined $version_filename || length($version_filename) == 0) { + die("--version_data and --version_filename are required."); +} + +# Determine if $version_data is a filename or a git tag/description. +my $version_string; +chomp($version_data); +if (-r $version_data) { + # $version_data is the path to the CHANGELOG. Parse the most recent version. + my $changelog_filename = $version_data; + open(my $changelog_file, '<', $changelog_filename) or + die("Unable to open CHANGELOG @ $changelog_filename: $!."); + + while (my $line = <$changelog_file>) { + my @split_line = split(" ", $line, 3); + next if @split_line < 2; + $version_string = $split_line[1]; + last if substr($version_string, 0, 1) eq "v"; + } + close($changelog_file); +} else { + # $version_data is either a tag name or a full git description, one of: + # tagName OR tagName-commitsSinceTag-shortCommitHash + # In either case we want the first element of the array returned by split. + $version_string = (split("-", $version_data))[0]; + $git_desc = $version_data; +} + +if (substr($version_string, 0, 1) eq "v") { + $version_string = substr($version_string, 1); +} + +my @version_components = split('\.', $version_string, 4); +my $version_major = $version_components[0]; +my $version_minor = $version_components[1]; +my $version_patch = $version_components[2]; + +my $version_extra = ""; +if (length($git_desc) > 0) { + my @git_desc_components = split('-', $git_desc, 2); + if (@git_desc_components > 1) { + $version_extra = $git_desc_components[1]; + } +} + +open(my $version_file, '>', $version_filename) or + die("Cannot open $version_filename: $!"); + +my $version_packed = "((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))"; +my $year = (localtime)[5] + 1900; +my $lic_block = << "EOF"; +/* + * Copyright (c) $year, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +EOF + +select $version_file; +if (length($git_desc)) { + print << "EOF"; +$lic_block +#define VERSION_MAJOR $version_major +#define VERSION_MINOR $version_minor +#define VERSION_PATCH $version_patch +#define VERSION_EXTRA \"$version_extra\" +#define VERSION_PACKED \\ + $version_packed +#define VERSION_STRING_NOSP \"$git_desc\" +#define VERSION_STRING \" $git_desc\" +EOF +} else { + print << "EOF"; +$lic_block +#define VERSION_MAJOR $version_major +#define VERSION_MINOR $version_minor +#define VERSION_PATCH $version_patch +#define VERSION_EXTRA \"$version_extra\" +#define VERSION_PACKED \\ + $version_packed +#define VERSION_STRING_NOSP \"v$version_string\" +#define VERSION_STRING \" v$version_string\" +EOF +} +close($version_file); diff --git a/third_party/aom/codereview.settings b/third_party/aom/codereview.settings new file mode 100644 index 0000000000..185e9344cf --- /dev/null +++ b/third_party/aom/codereview.settings @@ -0,0 +1,4 @@ +# This file is used by git cl to get repository specific information. +GERRIT_HOST: True +CODE_REVIEW_SERVER: aomedia-review.googlesource.com +GERRIT_SQUASH_UPLOADS: False diff --git a/third_party/aom/common/args.c b/third_party/aom/common/args.c new file mode 100644 index 0000000000..b5ede193b5 --- /dev/null +++ b/third_party/aom/common/args.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "common/args.h" + +#include +#include +#include +#include + +#include "aom/aom_integer.h" +#include "aom_ports/msvc.h" +#include "aom/aom_codec.h" +#include "common/tools_common.h" + +static const char kSbSizeWarningString[] = + "super_block_size has to be 64 or 128."; +static const char kMinpartWarningString[] = + "min_partition_size has to be smaller or equal to max_partition_size."; +static const char kMaxpartWarningString[] = + "max_partition_size has to be smaller or equal to super_block_size."; + +static char *ignore_front_spaces(const char *str) { + while (str[0] == ' ' || str[0] == '\t') ++str; + return (char *)str; +} + +static void ignore_end_spaces(char *str) { + char *end = str + strlen(str); + while (end > str && (end[0] == ' ' || end[0] == '\t' || end[0] == '\n' || + end[0] == '\r' || end[0] == '\0')) + --end; + if (end >= str) end[1] = '\0'; +} + +int parse_cfg(const char *file, cfg_options_t *config) { + char line[1024 * 10]; + FILE *f = fopen(file, "r"); + if (!f) return 1; + +#define GET_PARAMS(field) \ + if (strcmp(left, #field) == 0) { \ + config->field = atoi(right); \ + continue; \ + } + + while (fgets(line, sizeof(line) - 1, f)) { + char *actual_line = ignore_front_spaces(line); + char *left, *right, *comment; + size_t length = strlen(actual_line); + + if (length == 0 || actual_line[0] == '#') continue; + right = strchr(actual_line, '='); + if (right == NULL) continue; + right[0] = '\0'; + + left = ignore_front_spaces(actual_line); + right = ignore_front_spaces(right + 1); + + comment = strchr(right, '#'); + if (comment != NULL) comment[0] = '\0'; + + ignore_end_spaces(left); + ignore_end_spaces(right); + + GET_PARAMS(super_block_size) + GET_PARAMS(max_partition_size) + GET_PARAMS(min_partition_size) + GET_PARAMS(disable_ab_partition_type) + GET_PARAMS(disable_rect_partition_type) + GET_PARAMS(disable_1to4_partition_type) + GET_PARAMS(disable_flip_idtx) + GET_PARAMS(disable_cdef) + GET_PARAMS(disable_lr) + GET_PARAMS(disable_obmc) + GET_PARAMS(disable_warp_motion) + GET_PARAMS(disable_global_motion) + GET_PARAMS(disable_dist_wtd_comp) + GET_PARAMS(disable_diff_wtd_comp) + GET_PARAMS(disable_inter_intra_comp) + GET_PARAMS(disable_masked_comp) + GET_PARAMS(disable_one_sided_comp) + GET_PARAMS(disable_palette) + GET_PARAMS(disable_intrabc) + GET_PARAMS(disable_cfl) + GET_PARAMS(disable_smooth_intra) + GET_PARAMS(disable_filter_intra) + GET_PARAMS(disable_dual_filter) + GET_PARAMS(disable_intra_angle_delta) + GET_PARAMS(disable_intra_edge_filter) + GET_PARAMS(disable_tx_64x64) + GET_PARAMS(disable_smooth_inter_intra) + GET_PARAMS(disable_inter_inter_wedge) + GET_PARAMS(disable_inter_intra_wedge) + GET_PARAMS(disable_paeth_intra) + GET_PARAMS(disable_trellis_quant) + GET_PARAMS(disable_ref_frame_mv) + GET_PARAMS(reduced_reference_set) + GET_PARAMS(reduced_tx_type_set) + + fprintf(stderr, "\nInvalid parameter: %s", left); + exit(-1); + } + + if (config->super_block_size != 128 && config->super_block_size != 64) { + fprintf(stderr, "\n%s", kSbSizeWarningString); + exit(-1); + } + if (config->min_partition_size > config->max_partition_size) { + fprintf(stderr, "\n%s", kMinpartWarningString); + exit(-1); + } + if (config->max_partition_size > config->super_block_size) { + fprintf(stderr, "\n%s", kMaxpartWarningString); + exit(-1); + } + + fclose(f); + config->init_by_cfg_file = 1; + + return 0; +} + +int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { + char err_msg[ARG_ERR_MSG_MAX_LEN]; + int ret = arg_match_helper(arg_, def, argv, 
err_msg); + if (err_msg[0] != '\0') { + die("%s", err_msg); + } + return ret; +} + +const char *arg_next(struct arg *arg) { + if (arg->argv[0]) arg->argv += arg->argv_step; + + return *arg->argv; +} + +char **argv_dup(int argc, const char **argv) { + char **new_argv = malloc((argc + 1) * sizeof(*argv)); + if (!new_argv) return NULL; + + memcpy(new_argv, argv, argc * sizeof(*argv)); + new_argv[argc] = NULL; + return new_argv; +} + +void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { + for (; *defs; defs++) { + const struct arg_def *def = *defs; + char *short_val = def->has_val ? " " : ""; + char *long_val = def->has_val ? "=" : ""; + int n = 0; + + // Short options are indented with two spaces. Long options are indented + // with 12 spaces. + if (def->short_name && def->long_name) { + char *comma = def->has_val ? "," : ", "; + + n = fprintf(fp, " -%s%s%s --%s%s", def->short_name, short_val, comma, + def->long_name, long_val); + } else if (def->short_name) + n = fprintf(fp, " -%s%s", def->short_name, short_val); + else if (def->long_name) + n = fprintf(fp, " --%s%s", def->long_name, long_val); + + // Descriptions are indented with 40 spaces. If an option is 40 characters + // or longer, its description starts on the next line. + if (n < 40) + for (int i = 0; i < 40 - n; i++) fputc(' ', fp); + else + fputs("\n ", fp); + fprintf(fp, "%s\n", def->desc); + + if (def->enums) { + const struct arg_enum_list *listptr; + + fprintf(fp, " %-37s\t ", ""); + + for (listptr = def->enums; listptr->name; listptr++) + fprintf(fp, "%s%s", listptr->name, listptr[1].name ? ", " : "\n"); + } + } +} + +unsigned int arg_parse_uint(const struct arg *arg) { + char err_msg[ARG_ERR_MSG_MAX_LEN]; + unsigned int ret = arg_parse_uint_helper(arg, err_msg); + if (err_msg[0] != '\0') { + die("%s", err_msg); + } + return ret; +} + +int arg_parse_int(const struct arg *arg) { + char err_msg[ARG_ERR_MSG_MAX_LEN]; + int ret = arg_parse_int_helper(arg, err_msg); + if (err_msg[0] != '\0') { + die("%s", err_msg); + } + return ret; +} + +struct aom_rational arg_parse_rational(const struct arg *arg) { + char err_msg[ARG_ERR_MSG_MAX_LEN]; + struct aom_rational ret = arg_parse_rational_helper(arg, err_msg); + if (err_msg[0] != '\0') { + die("%s", err_msg); + } + return ret; +} + +int arg_parse_enum(const struct arg *arg) { + char err_msg[ARG_ERR_MSG_MAX_LEN]; + int ret = arg_parse_enum_helper(arg, err_msg); + if (err_msg[0] != '\0') { + die("%s", err_msg); + } + return ret; +} + +int arg_parse_enum_or_int(const struct arg *arg) { + char err_msg[ARG_ERR_MSG_MAX_LEN]; + int ret = arg_parse_enum_or_int_helper(arg, err_msg); + if (err_msg[0] != '\0') { + die("%s", err_msg); + } + return ret; +} + +// parse a comma separated list of at most n integers +// return the number of elements in the list +int arg_parse_list(const struct arg *arg, int *list, int n) { + char err_msg[ARG_ERR_MSG_MAX_LEN]; + int ret = arg_parse_list_helper(arg, list, n, err_msg); + if (err_msg[0] != '\0') { + die("%s", err_msg); + } + return ret; +} diff --git a/third_party/aom/common/args.h b/third_party/aom/common/args.h new file mode 100644 index 0000000000..1c5c437632 --- /dev/null +++ b/third_party/aom/common/args.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_ARGS_H_
+#define AOM_COMMON_ARGS_H_
+#include <stdio.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+#include "common/args_helper.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
+int parse_cfg(const char *file, cfg_options_t *config);
+const char *arg_next(struct arg *arg);
+void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
+char **argv_dup(int argc, const char **argv);
+
+unsigned int arg_parse_uint(const struct arg *arg);
+int arg_parse_int(const struct arg *arg);
+struct aom_rational arg_parse_rational(const struct arg *arg);
+int arg_parse_enum(const struct arg *arg);
+int arg_parse_enum_or_int(const struct arg *arg);
+int arg_parse_list(const struct arg *arg, int *list, int n);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_COMMON_ARGS_H_
diff --git a/third_party/aom/common/args_helper.c b/third_party/aom/common/args_helper.c
new file mode 100644
index 0000000000..2201868335
--- /dev/null
+++ b/third_party/aom/common/args_helper.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "common/args_helper.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define SET_ERR_STRING(...) \
+  if (err_msg) snprintf(err_msg, ARG_ERR_MSG_MAX_LEN, __VA_ARGS__)
+
+struct arg arg_init(char **argv) {
+  struct arg a;
+
+  a.argv = argv;
+  a.argv_step = 1;
+  a.name = NULL;
+  a.val = NULL;
+  a.def = NULL;
+  return a;
+}
+
+int arg_match_helper(struct arg *arg_, const struct arg_def *def, char **argv,
+                     char *err_msg) {
+  struct arg arg;
+
+  if (err_msg) err_msg[0] = '\0';
+
+  assert(def->has_val == 0 || def->has_val == 1 || def->has_val == -1);
+
+  if (!argv[0] || argv[0][0] != '-') return 0;
+
+  arg = arg_init(argv);
+
+  if (def->short_name && !strcmp(arg.argv[0] + 1, def->short_name)) {
+    arg.name = arg.argv[0] + 1;
+    arg.val = def->has_val ? arg.argv[1] : NULL;
+    arg.argv_step = def->has_val ? 2 : 1;
+  } else if (def->long_name) {
+    const size_t name_len = strlen(def->long_name);
+
+    if (arg.argv[0][1] == '-' &&
+        !strncmp(arg.argv[0] + 2, def->long_name, name_len) &&
+        (arg.argv[0][name_len + 2] == '=' ||
+         arg.argv[0][name_len + 2] == '\0')) {
+      arg.name = arg.argv[0] + 2;
+      arg.val = arg.name[name_len] == '=' ?
arg.name + name_len + 1 : NULL; + arg.argv_step = 1; + } + } + + if (arg.name) { + if (def->has_val == -1) { + arg.def = def; + *arg_ = arg; + return 1; + } + + if (!arg.val && def->has_val) { + SET_ERR_STRING("Error: option %s requires argument.\n", arg.name); + return 0; + } + + if (arg.val && !def->has_val) { + SET_ERR_STRING("Error: option %s requires no argument.\n", arg.name); + return 0; + } + + arg.def = def; + *arg_ = arg; + return 1; + } + + return 0; +} + +unsigned int arg_parse_uint_helper(const struct arg *arg, char *err_msg) { + char *endptr; + const unsigned long rawval = strtoul(arg->val, &endptr, 10); // NOLINT + + if (err_msg) err_msg[0] = '\0'; + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval <= UINT_MAX) return (unsigned int)rawval; + SET_ERR_STRING("Option %s: Value %lu out of range for unsigned int\n", + arg->name, rawval); + return 0; + } + SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr); + return 0; +} + +int arg_parse_int_helper(const struct arg *arg, char *err_msg) { + char *endptr; + const long rawval = strtol(arg->val, &endptr, 10); // NOLINT + + if (err_msg) err_msg[0] = '\0'; + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval; + SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n", + arg->name, rawval); + return 0; + } + SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr); + return 0; +} + +struct aom_rational arg_parse_rational_helper(const struct arg *arg, + char *err_msg) { + long rawval; // NOLINT + char *endptr; + struct aom_rational rat = { 0, 1 }; + + if (err_msg) err_msg[0] = '\0'; + + /* parse numerator */ + rawval = strtol(arg->val, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '/') { + if (rawval >= INT_MIN && rawval <= INT_MAX) { + rat.num = (int)rawval; + } else { + SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n", + arg->name, rawval); + return rat; + } + } else { + SET_ERR_STRING("Option %s: Expected / at '%c'\n", arg->name, *endptr); + return rat; + } + + /* parse denominator */ + rawval = strtol(endptr + 1, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval >= INT_MIN && rawval <= INT_MAX) { + rat.den = (int)rawval; + } else { + SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n", + arg->name, rawval); + return rat; + } + } else { + SET_ERR_STRING("Option %s: Invalid character '%c'\n", arg->name, *endptr); + return rat; + } + + return rat; +} + +int arg_parse_enum_helper(const struct arg *arg, char *err_msg) { + const struct arg_enum_list *listptr; + long rawval; // NOLINT + char *endptr; + + if (err_msg) err_msg[0] = '\0'; + + /* First see if the value can be parsed as a raw value */ + rawval = strtol(arg->val, &endptr, 10); + if (arg->val[0] != '\0' && endptr[0] == '\0') { + /* Got a raw value, make sure it's valid */ + for (listptr = arg->def->enums; listptr->name; listptr++) + if (listptr->val == rawval) return (int)rawval; + } + + /* Next see if it can be parsed as a string */ + for (listptr = arg->def->enums; listptr->name; listptr++) + if (!strcmp(arg->val, listptr->name)) return listptr->val; + + SET_ERR_STRING("Option %s: Invalid value '%s'\n", arg->name, arg->val); + return 0; +} + +int arg_parse_enum_or_int_helper(const struct arg *arg, char *err_msg) { + if (arg->def->enums) return arg_parse_enum_helper(arg, err_msg); + return arg_parse_int_helper(arg, err_msg); +} + +// parse a comma separated list of at most n 
integers +// return the number of elements in the list +int arg_parse_list_helper(const struct arg *arg, int *list, int n, + char *err_msg) { + const char *ptr = arg->val; + char *endptr; + int i = 0; + + if (err_msg) err_msg[0] = '\0'; + + while (ptr[0] != '\0') { + long rawval = strtol(ptr, &endptr, 10); // NOLINT + if (rawval < INT_MIN || rawval > INT_MAX) { + SET_ERR_STRING("Option %s: Value %ld out of range for signed int\n", + arg->name, rawval); + return 0; + } else if (i >= n) { + SET_ERR_STRING("Option %s: List has more than %d entries\n", arg->name, + n); + return 0; + } else if (*endptr == ',') { + endptr++; + } else if (*endptr != '\0') { + SET_ERR_STRING("Option %s: Bad list separator '%c'\n", arg->name, + *endptr); + return 0; + } + list[i++] = (int)rawval; + ptr = endptr; + } + return i; +} diff --git a/third_party/aom/common/args_helper.h b/third_party/aom/common/args_helper.h new file mode 100644 index 0000000000..c86a6128d3 --- /dev/null +++ b/third_party/aom/common/args_helper.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_ARGS_HELPER_H_ +#define AOM_COMMON_ARGS_HELPER_H_ + +#include "aom/aom_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Maximum length of the error messages for the helper functions. +#define ARG_ERR_MSG_MAX_LEN 200 + +struct arg { + char **argv; + const char *name; + const char *val; + unsigned int argv_step; + const struct arg_def *def; +}; + +struct arg_enum_list { + const char *name; + int val; +}; +#define ARG_ENUM_LIST_END \ + { 0 } + +typedef struct arg_def { + const char *short_name; + const char *long_name; + int has_val; // 0: The argument must not have a value. + // 1: The argument must have a value. + // -1: The argument may or may not have a value. + const char *desc; + const struct arg_enum_list *enums; +} arg_def_t; +#define ARG_DEF(s, l, v, d) \ + { s, l, v, d, NULL } +#define ARG_DEF_ENUM(s, l, v, d, e) \ + { s, l, v, d, e } +#define ARG_DEF_LIST_END \ + { 0 } + +struct arg arg_init(char **argv); + +/* + * The helper functions below all take an optional parameter err_msg for + * error reporting. When err_msg is not NULL (must point to a buffer + * which is at least ARG_ERR_MSG_MAX_LEN bytes long), a related error message is + * stored in it if an error occurs. It will be set to an empty string if no + * error occurs. 
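+ *
+ * A minimal usage sketch (illustrative; common/args.c wraps the helpers in
+ * essentially this pattern, and `my_arg` and `die()` are assumed to come from
+ * the caller and tools_common.h respectively):
+ *
+ *   char err[ARG_ERR_MSG_MAX_LEN];
+ *   int val = arg_parse_int_helper(&my_arg, err);
+ *   if (err[0] != '\0') die("%s", err);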
+ */
+int arg_match_helper(struct arg *arg_, const struct arg_def *def, char **argv,
+                     char *err_msg);
+unsigned int arg_parse_uint_helper(const struct arg *arg, char *err_msg);
+int arg_parse_int_helper(const struct arg *arg, char *err_msg);
+struct aom_rational arg_parse_rational_helper(const struct arg *arg,
+                                              char *err_msg);
+int arg_parse_enum_helper(const struct arg *arg, char *err_msg);
+int arg_parse_enum_or_int_helper(const struct arg *arg, char *err_msg);
+int arg_parse_list_helper(const struct arg *arg, int *list, int n,
+                          char *err_msg);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_COMMON_ARGS_HELPER_H_
diff --git a/third_party/aom/common/av1_config.c b/third_party/aom/common/av1_config.c
new file mode 100644
index 0000000000..9f5b02015b
--- /dev/null
+++ b/third_party/aom/common/av1_config.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "av1/common/obu_util.h"
+#include "common/av1_config.h"
+#include "config/aom_config.h"
+
+// Helper macros to reduce verbosity required to check for read errors.
+//
+// Note that when using these macros, even single line if statements should use
+// curly braces to avoid unexpected behavior because all but the
+// AV1C_POP_ERROR_HANDLER_DATA() macro consist of multiple statements.
+#define AV1C_READ_BIT_OR_RETURN_ERROR(field)                                   \
+  int field = 0;                                                               \
+  do {                                                                         \
+    field = aom_rb_read_bit(reader);                                           \
+    if (result == -1) {                                                        \
+      fprintf(stderr,                                                          \
+              "av1c: Error reading bit for " #field ", value=%d result=%d.\n", \
+              field, result);                                                  \
+      return -1;                                                               \
+    }                                                                          \
+  } while (0)
+
+#define AV1C_READ_BITS_OR_RETURN_ERROR(field, length) \
+  int field = 0;                                      \
+  do {                                                \
+    field = aom_rb_read_literal(reader, (length));    \
+    if (result == -1) {                               \
+      fprintf(stderr,                                 \
+              "av1c: Could not read bits for " #field \
+              ", value=%d result=%d.\n",              \
+              field, result);                         \
+      return -1;                                      \
+    }                                                 \
+  } while (0)
+
+// Helper macros for setting/restoring the error handler data in
+// aom_read_bit_buffer.
+#define AV1C_PUSH_ERROR_HANDLER_DATA(new_data) \ + void *original_error_handler_data = NULL; \ + do { \ + original_error_handler_data = reader->error_handler_data; \ + reader->error_handler_data = &new_data; \ + } while (0) + +#define AV1C_POP_ERROR_HANDLER_DATA() \ + do { \ + reader->error_handler_data = original_error_handler_data; \ + } while (0) + +static const size_t kAv1cSize = 4; + +static void bitreader_error_handler(void *data) { + int *error_val = (int *)data; + *error_val = -1; +} + +// Parse the AV1 timing_info() structure: +// timing_info( ) { +// num_units_in_display_tick f(32) +// time_scale f(32) +// equal_picture_interval f(1) +// if (equal_picture_interval) +// num_ticks_per_picture_minus_1 uvlc() +// } +static int parse_timing_info(struct aom_read_bit_buffer *reader) { + int result = 0; + AV1C_PUSH_ERROR_HANDLER_DATA(result); + + AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_display_tick, 32); + AV1C_READ_BITS_OR_RETURN_ERROR(time_scale, 32); + + AV1C_READ_BIT_OR_RETURN_ERROR(equal_picture_interval); + if (equal_picture_interval) { + uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(reader); + if (result == -1) { + fprintf(stderr, + "av1c: Could not read bits for " + "num_ticks_per_picture_minus_1, value=%u.\n", + num_ticks_per_picture_minus_1); + return result; + } + } + + AV1C_POP_ERROR_HANDLER_DATA(); + return result; +} + +// Parse the AV1 decoder_model_info() structure: +// decoder_model_info( ) { +// buffer_delay_length_minus_1 f(5) +// num_units_in_decoding_tick f(32) +// buffer_removal_time_length_minus_1 f(5) +// frame_presentation_time_length_minus_1 f(5) +// } +// +// Returns -1 upon failure, or the value of buffer_delay_length_minus_1 + 1. +static int parse_decoder_model_info(struct aom_read_bit_buffer *reader) { + int result = 0; + AV1C_PUSH_ERROR_HANDLER_DATA(result); + + AV1C_READ_BITS_OR_RETURN_ERROR(buffer_delay_length_minus_1, 5); + AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_decoding_tick, 32); + AV1C_READ_BITS_OR_RETURN_ERROR(buffer_removal_time_length_minus_1, 5); + AV1C_READ_BITS_OR_RETURN_ERROR(frame_presentation_time_length_minus_1, 5); + + AV1C_POP_ERROR_HANDLER_DATA(); + return buffer_delay_length_minus_1 + 1; +} + +// Parse the AV1 operating_parameters_info() structure: +// operating_parameters_info( op ) { +// n = buffer_delay_length_minus_1 + 1 +// decoder_buffer_delay[ op ] f(n) +// encoder_buffer_delay[ op ] f(n) +// low_delay_mode_flag[ op ] f(1) +// } +static int parse_operating_parameters_info(struct aom_read_bit_buffer *reader, + int buffer_delay_length_minus_1) { + int result = 0; + AV1C_PUSH_ERROR_HANDLER_DATA(result); + + const int buffer_delay_length = buffer_delay_length_minus_1 + 1; + AV1C_READ_BITS_OR_RETURN_ERROR(decoder_buffer_delay, buffer_delay_length); + AV1C_READ_BITS_OR_RETURN_ERROR(encoder_buffer_delay, buffer_delay_length); + AV1C_READ_BIT_OR_RETURN_ERROR(low_delay_mode_flag); + + AV1C_POP_ERROR_HANDLER_DATA(); + return result; +} + +// Parse the AV1 color_config() structure..See: +// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=44 +static int parse_color_config(struct aom_read_bit_buffer *reader, + Av1Config *config) { + int result = 0; + AV1C_PUSH_ERROR_HANDLER_DATA(result); + + AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth); + config->high_bitdepth = high_bitdepth; + + int bit_depth = 0; + if (config->seq_profile == 2 && config->high_bitdepth) { + AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit); + config->twelve_bit = twelve_bit; + bit_depth = config->twelve_bit ? 
12 : 10; + } else { + bit_depth = config->high_bitdepth ? 10 : 8; + } + + if (config->seq_profile != 1) { + AV1C_READ_BIT_OR_RETURN_ERROR(mono_chrome); + config->monochrome = mono_chrome; + } + + int color_primaries = AOM_CICP_CP_UNSPECIFIED; + int transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; + int matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; + + AV1C_READ_BIT_OR_RETURN_ERROR(color_description_present_flag); + if (color_description_present_flag) { + AV1C_READ_BITS_OR_RETURN_ERROR(color_primaries_val, 8); + color_primaries = color_primaries_val; + AV1C_READ_BITS_OR_RETURN_ERROR(transfer_characteristics_val, 8); + transfer_characteristics = transfer_characteristics_val; + AV1C_READ_BITS_OR_RETURN_ERROR(matrix_coefficients_val, 8); + matrix_coefficients = matrix_coefficients_val; + } + + if (config->monochrome) { + AV1C_READ_BIT_OR_RETURN_ERROR(color_range); + config->chroma_subsampling_x = 1; + config->chroma_subsampling_y = 1; + } else if (color_primaries == AOM_CICP_CP_BT_709 && + transfer_characteristics == AOM_CICP_TC_SRGB && + matrix_coefficients == AOM_CICP_MC_IDENTITY) { + config->chroma_subsampling_x = 0; + config->chroma_subsampling_y = 0; + } else { + AV1C_READ_BIT_OR_RETURN_ERROR(color_range); + if (config->seq_profile == 0) { + config->chroma_subsampling_x = 1; + config->chroma_subsampling_y = 1; + } else if (config->seq_profile == 1) { + config->chroma_subsampling_x = 0; + config->chroma_subsampling_y = 0; + } else { + if (bit_depth == 12) { + AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_x); + config->chroma_subsampling_x = subsampling_x; + if (subsampling_x) { + AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_y); + config->chroma_subsampling_y = subsampling_y; + } else { + config->chroma_subsampling_y = 0; + } + } else { + config->chroma_subsampling_x = 1; + config->chroma_subsampling_y = 0; + } + } + + if (config->chroma_subsampling_x && config->chroma_subsampling_y) { + AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2); + config->chroma_sample_position = chroma_sample_position; + } + } + + if (!config->monochrome) { + AV1C_READ_BIT_OR_RETURN_ERROR(separate_uv_delta_q); + } + + AV1C_POP_ERROR_HANDLER_DATA(); + return result; +} + +// Parse AV1 Sequence Header OBU. See: +// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=41 +static int parse_sequence_header(const uint8_t *const buffer, size_t length, + Av1Config *config) { + int result = 0; + // The reader instance is local to this function, but a pointer to the + // reader instance is used within this function and throughout this file to + // allow use of the helper macros that reduce parse error checking verbosity. 
+ struct aom_read_bit_buffer reader_instance = { buffer, buffer + length, 0, + &result, + bitreader_error_handler }; + struct aom_read_bit_buffer *reader = &reader_instance; + + AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3); + config->seq_profile = seq_profile; + AV1C_READ_BIT_OR_RETURN_ERROR(still_picture); + AV1C_READ_BIT_OR_RETURN_ERROR(reduced_still_picture_header); + if (reduced_still_picture_header) { + config->initial_presentation_delay_present = 0; + AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5); + config->seq_level_idx_0 = seq_level_idx_0; + config->seq_tier_0 = 0; + } else { + int has_decoder_model = 0; + int buffer_delay_length = 0; + + AV1C_READ_BIT_OR_RETURN_ERROR(timing_info_present_flag); + if (timing_info_present_flag) { + if (parse_timing_info(reader) != 0) return -1; + + AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_info_present_flag); + if (decoder_model_info_present_flag && + (buffer_delay_length = parse_decoder_model_info(reader)) == -1) { + return -1; + } + has_decoder_model = 1; + } + + AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present); + config->initial_presentation_delay_present = + initial_presentation_delay_present; + + AV1C_READ_BITS_OR_RETURN_ERROR(operating_points_cnt_minus_1, 5); + const int num_operating_points = operating_points_cnt_minus_1 + 1; + + for (int op_index = 0; op_index < num_operating_points; ++op_index) { + AV1C_READ_BITS_OR_RETURN_ERROR(operating_point_idc, 12); + AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx, 5); + + int seq_tier = 0; + if (seq_level_idx > 7) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_this_op); + seq_tier = seq_tier_this_op; + } + + if (has_decoder_model) { + AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_present_for_op); + if (decoder_model_present_for_op) { + if (parse_operating_parameters_info(reader, buffer_delay_length) == + -1) { + return -1; + } + } + } + + if (config->initial_presentation_delay_present) { + // Skip the initial presentation delay bits if present since this + // function has no access to the data required to properly set the + // field. + AV1C_READ_BIT_OR_RETURN_ERROR( + initial_presentation_delay_present_for_this_op); + if (initial_presentation_delay_present_for_this_op) { + AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_1, 4); + } + } + + if (op_index == 0) { + // Av1Config needs only the values from the first operating point. 
+ config->seq_level_idx_0 = seq_level_idx; + config->seq_tier_0 = seq_tier; + config->initial_presentation_delay_present = 0; + config->initial_presentation_delay_minus_one = 0; + } + } + } + + AV1C_READ_BITS_OR_RETURN_ERROR(frame_width_bits_minus_1, 4); + AV1C_READ_BITS_OR_RETURN_ERROR(frame_height_bits_minus_1, 4); + AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_width_minus_1, + frame_width_bits_minus_1 + 1); + AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1, + frame_height_bits_minus_1 + 1); + + uint8_t frame_id_numbers_present = 0; + if (!reduced_still_picture_header) { + AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag); + frame_id_numbers_present = frame_id_numbers_present_flag; + } + + if (frame_id_numbers_present) { + AV1C_READ_BITS_OR_RETURN_ERROR(delta_frame_id_length_minus_2, 4); + AV1C_READ_BITS_OR_RETURN_ERROR(additional_frame_id_length_minus_1, 3); + } + + AV1C_READ_BIT_OR_RETURN_ERROR(use_128x128_superblock); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_filter_intra); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_intra_edge_filter); + + if (!reduced_still_picture_header) { + AV1C_READ_BIT_OR_RETURN_ERROR(enable_interintra_compound); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_masked_compound); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_warped_motion); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_dual_filter); + + AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint); + if (enable_order_hint) { + AV1C_READ_BIT_OR_RETURN_ERROR(enable_dist_wtd_comp); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs); + } + + const int SELECT_SCREEN_CONTENT_TOOLS = 2; + int seq_force_screen_content_tools = SELECT_SCREEN_CONTENT_TOOLS; + AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_screen_content_tools); + if (!seq_choose_screen_content_tools) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_screen_content_tools_val); + seq_force_screen_content_tools = seq_force_screen_content_tools_val; + } + + if (seq_force_screen_content_tools > 0) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_integer_mv); + + if (!seq_choose_integer_mv) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_integer_mv); + } + } + + if (enable_order_hint) { + AV1C_READ_BITS_OR_RETURN_ERROR(order_hint_bits_minus_1, 3); + } + } + + AV1C_READ_BIT_OR_RETURN_ERROR(enable_superres); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_cdef); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_restoration); + + if (parse_color_config(reader, config) != 0) { + fprintf(stderr, "av1c: color_config() parse failed.\n"); + return -1; + } + + AV1C_READ_BIT_OR_RETURN_ERROR(film_grain_params_present); + return 0; +} + +int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb, + Av1Config *config) { + if (!buffer || length == 0 || !config) { + return -1; + } + + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + + size_t sequence_header_length = 0; + size_t obu_header_length = 0; + if (aom_read_obu_header_and_size(buffer, length, is_annexb, &obu_header, + &sequence_header_length, + &obu_header_length) != AOM_CODEC_OK || + obu_header.type != OBU_SEQUENCE_HEADER || + sequence_header_length + obu_header_length > length) { + return -1; + } + + memset(config, 0, sizeof(*config)); + config->marker = 1; + config->version = 1; + return parse_sequence_header(buffer + obu_header_length, + sequence_header_length, config); +} + +int read_av1config(const uint8_t *buffer, size_t buffer_length, + size_t *bytes_read, Av1Config *config) { + if (!buffer || buffer_length < kAv1cSize || !bytes_read || !config) return -1; + + *bytes_read = 0; + + int result = 0; + struct 
aom_read_bit_buffer reader_instance = { buffer, buffer + buffer_length, + 0, &result, + bitreader_error_handler }; + struct aom_read_bit_buffer *reader = &reader_instance; + + memset(config, 0, sizeof(*config)); + + AV1C_READ_BIT_OR_RETURN_ERROR(marker); + config->marker = marker; + + AV1C_READ_BITS_OR_RETURN_ERROR(version, 7); + config->version = version; + + AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3); + config->seq_profile = seq_profile; + + AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5); + config->seq_level_idx_0 = seq_level_idx_0; + + AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_0); + config->seq_tier_0 = seq_tier_0; + + AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth); + config->high_bitdepth = high_bitdepth; + + AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit); + config->twelve_bit = twelve_bit; + + AV1C_READ_BIT_OR_RETURN_ERROR(monochrome); + config->monochrome = monochrome; + + AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_x); + config->chroma_subsampling_x = chroma_subsampling_x; + + AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_y); + config->chroma_subsampling_y = chroma_subsampling_y; + + AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2); + config->chroma_sample_position = chroma_sample_position; + + AV1C_READ_BITS_OR_RETURN_ERROR(reserved, 3); + + AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present); + config->initial_presentation_delay_present = + initial_presentation_delay_present; + + AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_one, 4); + config->initial_presentation_delay_minus_one = + initial_presentation_delay_minus_one; + + *bytes_read = aom_rb_bytes_read(reader); + + return 0; +} + +int write_av1config(const Av1Config *config, size_t capacity, + size_t *bytes_written, uint8_t *buffer) { + if (!config || !buffer || capacity < kAv1cSize || !bytes_written) return -1; + + *bytes_written = 0; + memset(buffer, 0, kAv1cSize); + + struct aom_write_bit_buffer writer = { buffer, 0 }; + + aom_wb_write_bit(&writer, config->marker); + aom_wb_write_literal(&writer, config->version, 7); + aom_wb_write_literal(&writer, config->seq_profile, 3); + aom_wb_write_literal(&writer, config->seq_level_idx_0, 5); + aom_wb_write_bit(&writer, config->seq_tier_0); + aom_wb_write_bit(&writer, config->high_bitdepth); + aom_wb_write_bit(&writer, config->twelve_bit); + aom_wb_write_bit(&writer, config->monochrome); + aom_wb_write_bit(&writer, config->chroma_subsampling_x); + aom_wb_write_bit(&writer, config->chroma_subsampling_y); + aom_wb_write_literal(&writer, config->chroma_sample_position, 2); + aom_wb_write_literal(&writer, 0, 3); // reserved + aom_wb_write_bit(&writer, config->initial_presentation_delay_present); + + if (config->initial_presentation_delay_present) { + aom_wb_write_literal(&writer, config->initial_presentation_delay_minus_one, + 4); + } else { + aom_wb_write_literal(&writer, 0, 4); // reserved + } + + *bytes_written = aom_wb_bytes_written(&writer); + return 0; +} + +#undef AV1C_READ_BIT_OR_RETURN_ERROR +#undef AV1C_READ_BITS_OR_RETURN_ERROR +#undef AV1C_PUSH_ERROR_HANDLER_DATA +#undef AV1C_POP_ERROR_HANDLER_DATA diff --git a/third_party/aom/common/av1_config.h b/third_party/aom/common/av1_config.h new file mode 100644 index 0000000000..a15bedb305 --- /dev/null +++ b/third_party/aom/common/av1_config.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_AV1_CONFIG_H_ +#define AOM_COMMON_AV1_CONFIG_H_ + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Struct representing ISOBMFF/Matroska AV1 config. See: +// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-syntax +// +// The AV1 config has the following format: +// +// unsigned int (1) marker = 1; +// unsigned int (7) version = 1; +// unsigned int (3) seq_profile; +// unsigned int (5) seq_level_idx_0; +// unsigned int (1) seq_tier_0; +// unsigned int (1) high_bitdepth; +// unsigned int (1) twelve_bit; +// unsigned int (1) monochrome; +// unsigned int (1) chroma_subsampling_x; +// unsigned int (1) chroma_subsampling_y; +// unsigned int (2) chroma_sample_position; +// unsigned int (3) reserved = 0; +// +// unsigned int (1) initial_presentation_delay_present; +// if (initial_presentation_delay_present) { +// unsigned int (4) initial_presentation_delay_minus_one; +// } else { +// unsigned int (4) reserved = 0; +// } +// +// unsigned int (8)[] configOBUs; +// +// Note: get_av1config_from_obu() does not currently store 'configOBUs' data, so +// the field is omitted. +typedef struct _Av1Config { + uint8_t marker; + uint8_t version; + uint8_t seq_profile; + uint8_t seq_level_idx_0; + uint8_t seq_tier_0; + uint8_t high_bitdepth; + uint8_t twelve_bit; + uint8_t monochrome; + uint8_t chroma_subsampling_x; + uint8_t chroma_subsampling_y; + uint8_t chroma_sample_position; + uint8_t initial_presentation_delay_present; + uint8_t initial_presentation_delay_minus_one; +} Av1Config; + +// Attempts to parse a Sequence Header OBU and set the paramenters of 'config'. +// Returns 0 upon success, and -1 upon failure. 'buffer' can contain multiple +// OBUs, but the Sequence Header OBU must be the first OBU within the buffer. +int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb, + Av1Config *config); + +// Attempts to parse an AV1 config from 'buffer'. Returns 0 upon success. +// Returns -1 when 'buffer_length' is less than 4, when passed NULL pointers, or +// when parsing of 'buffer' fails. +int read_av1config(const uint8_t *buffer, size_t buffer_length, + size_t *bytes_read, Av1Config *config); + +// Writes 'config' to 'buffer'. Returns 0 upon successful write to 'buffer'. +// Returns -1 when passed NULL pointers or when 'capacity' insufficient. +int write_av1config(const Av1Config *config, size_t capacity, + size_t *bytes_written, uint8_t *buffer); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_AV1_CONFIG_H_ diff --git a/third_party/aom/common/ivf_dec.cmake b/third_party/aom/common/ivf_dec.cmake new file mode 100644 index 0000000000..fedeea7940 --- /dev/null +++ b/third_party/aom/common/ivf_dec.cmake @@ -0,0 +1,28 @@ +# +# Copyright (c) 2021, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. 
If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_COMMON_IVF_DEC_CMAKE_) + return() +endif() # AOM_COMMON_AOM_COMMON_CMAKE_ +set(AOM_COMMON_IVF_DEC_CMAKE_ 1) + +list(APPEND IVF_DEC_SOURCES "${AOM_ROOT}/common/ivfdec.c" + "${AOM_ROOT}/common/ivfdec.h") + +# Creates the aom_common build target and makes libaom depend on it. The libaom +# target must exist before this function is called. +function(setup_ivf_dec_targets) + add_library(ivf_dec OBJECT ${IVF_DEC_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} ivf_dec PARENT_SCOPE) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() +endfunction() diff --git a/third_party/aom/common/ivfdec.c b/third_party/aom/common/ivfdec.c new file mode 100644 index 0000000000..6e714d1cfe --- /dev/null +++ b/third_party/aom/common/ivfdec.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/ivfdec.h" + +#include +#include +#include + +#include "aom_ports/mem_ops.h" +#include "aom_ports/sanitizer.h" +#include "tools_common.h" + +static const char *IVF_SIGNATURE = "DKIF"; + +static void fix_framerate(int *num, int *den) { + if (*den <= 0 || *den >= 1000000000 || *num <= 0 || *num >= 1000) { + // framerate seems to be invalid, just default to 30fps. + *num = 30; + *den = 1; + } +} + +int file_is_ivf(struct AvxInputContext *input_ctx) { + unsigned char raw_hdr[32]; + int is_ivf = 0; + + if (buffer_input(input_ctx, 32, raw_hdr, /*buffered=*/true) == 32) { + if (memcmp(IVF_SIGNATURE, raw_hdr, 4) == 0) { + is_ivf = 1; + + if (mem_get_le16(raw_hdr + 4) != 0) { + fprintf(stderr, + "Error: Unrecognized IVF version! 
This file may not" + " decode properly.\n"); + } + + input_ctx->fourcc = mem_get_le32(raw_hdr + 8); + input_ctx->width = mem_get_le16(raw_hdr + 12); + input_ctx->height = mem_get_le16(raw_hdr + 14); + input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16); + input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20); + fix_framerate(&input_ctx->framerate.numerator, + &input_ctx->framerate.denominator); + } + } + + if (!is_ivf) { + rewind_detect(input_ctx); + } + return is_ivf; +} + +int ivf_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer, + size_t *bytes_read, size_t *buffer_size, + aom_codec_pts_t *pts) { + unsigned char raw_header[IVF_FRAME_HDR_SZ] = { 0 }; + size_t frame_size = 0; + + if (read_from_input(input_ctx, IVF_FRAME_HDR_SZ, raw_header) != + IVF_FRAME_HDR_SZ) { + if (!input_eof(input_ctx)) + fprintf(stderr, "Warning: Failed to read frame size\n"); + } else { + frame_size = mem_get_le32(raw_header); + + if (frame_size > 256 * 1024 * 1024) { + fprintf(stderr, "Warning: Read invalid frame size (%u)\n", + (unsigned int)frame_size); + frame_size = 0; + } + + if (frame_size > *buffer_size) { + uint8_t *new_buffer = (uint8_t *)realloc(*buffer, 2 * frame_size); + + if (new_buffer) { + *buffer = new_buffer; + *buffer_size = 2 * frame_size; + } else { + fprintf(stderr, "Warning: Failed to allocate compressed data buffer\n"); + frame_size = 0; + } + } + + if (pts) { + *pts = mem_get_le32(&raw_header[4]); + *pts += ((aom_codec_pts_t)mem_get_le32(&raw_header[8]) << 32); + } + } + + if (!input_eof(input_ctx)) { + ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size); + if (read_from_input(input_ctx, frame_size, *buffer) != frame_size) { + fprintf(stderr, "Warning: Failed to read full frame\n"); + return 1; + } + + ASAN_POISON_MEMORY_REGION(*buffer + frame_size, *buffer_size - frame_size); + *bytes_read = frame_size; + return 0; + } + + return 1; +} diff --git a/third_party/aom/common/ivfdec.h b/third_party/aom/common/ivfdec.h new file mode 100644 index 0000000000..e8fe8d0c53 --- /dev/null +++ b/third_party/aom/common/ivfdec.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_IVFDEC_H_ +#define AOM_COMMON_IVFDEC_H_ + +#include "aom/aom_codec.h" +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int file_is_ivf(struct AvxInputContext *input); +int ivf_read_frame(struct AvxInputContext *input_ctx, uint8_t **buffer, + size_t *bytes_read, size_t *buffer_size, + aom_codec_pts_t *pts); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_IVFDEC_H_ diff --git a/third_party/aom/common/ivfenc.c b/third_party/aom/common/ivfenc.c new file mode 100644 index 0000000000..64715f4d74 --- /dev/null +++ b/third_party/aom/common/ivfenc.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/ivfenc.h" + +#include "aom/aom_encoder.h" +#include "aom_ports/mem_ops.h" + +void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg, + unsigned int fourcc, int frame_cnt) { + char header[32]; + + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); // version + mem_put_le16(header + 6, 32); // header size + mem_put_le32(header + 8, fourcc); // fourcc + mem_put_le16(header + 12, cfg->g_w); // width + mem_put_le16(header + 14, cfg->g_h); // height + mem_put_le32(header + 16, cfg->g_timebase.den); // rate + mem_put_le32(header + 20, cfg->g_timebase.num); // scale + mem_put_le32(header + 24, frame_cnt); // length + mem_put_le32(header + 28, 0); // unused + + fwrite(header, 1, 32, outfile); +} + +void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) { + char header[12]; + + mem_put_le32(header, (int)frame_size); + mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF)); + mem_put_le32(header + 8, (int)(pts >> 32)); + fwrite(header, 1, 12, outfile); +} + +void ivf_write_frame_size(FILE *outfile, size_t frame_size) { + char header[4]; + + mem_put_le32(header, (int)frame_size); + fwrite(header, 1, 4, outfile); +} diff --git a/third_party/aom/common/ivfenc.h b/third_party/aom/common/ivfenc.h new file mode 100644 index 0000000000..8f6d947d47 --- /dev/null +++ b/third_party/aom/common/ivfenc.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_IVFENC_H_ +#define AOM_COMMON_IVFENC_H_ + +#include "common/tools_common.h" + +struct aom_codec_enc_cfg; +struct aom_codec_cx_pkt; + +#ifdef __cplusplus +extern "C" { +#endif + +void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg, + uint32_t fourcc, int frame_cnt); + +void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size); + +void ivf_write_frame_size(FILE *outfile, size_t frame_size); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_IVFENC_H_ diff --git a/third_party/aom/common/md5_utils.c b/third_party/aom/common/md5_utils.c new file mode 100644 index 0000000000..c69aa57a3b --- /dev/null +++ b/third_party/aom/common/md5_utils.c @@ -0,0 +1,257 @@ +/* + * This code implements the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. + * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. 
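+ *
+ * A minimal sketch of the call sequence described in the next paragraph
+ * (illustrative only; data and data_len are placeholder names, the other
+ * identifiers are the ones declared in common/md5_utils.h):
+ *
+ *   struct MD5Context ctx;
+ *   md5byte digest[16];
+ *   MD5Init(&ctx);
+ *   MD5Update(&ctx, data, (unsigned)data_len);   repeated per input buffer
+ *   MD5Final(digest, &ctx);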
+ * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. + * + * Changed so as no longer to depend on Colin Plumb's `usual.h' header + * definitions + * - Ian Jackson . + * Still in the public domain. + */ + +#include /* for memcpy() */ + +#include "common/md5_utils.h" + +static void byteSwap(UWORD32 *buf, unsigned words) { + md5byte *p; + + /* Only swap bytes for big endian machines */ + int i = 1; + + if (*(char *)&i == 1) return; + + p = (md5byte *)buf; + + do { + *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 | + ((unsigned)p[1] << 8 | p[0]); + p += 4; + } while (--words); +} + +/* + * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious + * initialization constants. + */ +void MD5Init(struct MD5Context *ctx) { + ctx->buf[0] = 0x67452301; + ctx->buf[1] = 0xefcdab89; + ctx->buf[2] = 0x98badcfe; + ctx->buf[3] = 0x10325476; + + ctx->bytes[0] = 0; + ctx->bytes[1] = 0; +} + +/* + * Update context to reflect the concatenation of another buffer full + * of bytes. + */ +void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) { + UWORD32 t; + + /* Update byte count */ + + t = ctx->bytes[0]; + + if ((ctx->bytes[0] = t + len) < t) + ctx->bytes[1]++; /* Carry from low to high */ + + t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */ + + if (t > len) { + memcpy((md5byte *)ctx->in + 64 - t, buf, len); + return; + } + + /* First chunk is an odd size */ + memcpy((md5byte *)ctx->in + 64 - t, buf, t); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += t; + len -= t; + + /* Process data in 64-byte chunks */ + while (len >= 64) { + memcpy(ctx->in, buf, 64); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + buf += 64; + len -= 64; + } + + /* Handle any remaining bytes of data. */ + memcpy(ctx->in, buf, len); +} + +/* + * Final wrapup - pad to 64-byte boundary with the bit pattern + * 1 0* (64-bit count of bits processed, MSB-first) + */ +void MD5Final(md5byte digest[16], struct MD5Context *ctx) { + int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */ + md5byte *p = (md5byte *)ctx->in + count; + + /* Set the first char of padding to 0x80. There is always room. */ + *p++ = 0x80; + + /* Bytes of padding needed to make 56 bytes (-8..55) */ + count = 56 - 1 - count; + + if (count < 0) { /* Padding forces an extra block */ + memset(p, 0, count + 8); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + p = (md5byte *)ctx->in; + count = 56; + } + + memset(p, 0, count); + byteSwap(ctx->in, 14); + + /* Append length in bits and transform */ + ctx->in[14] = ctx->bytes[0] << 3; + ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29; + MD5Transform(ctx->buf, ctx->in); + + byteSwap(ctx->buf, 4); + memcpy(digest, ctx->buf, 16); + memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ +} + +#ifndef ASM_MD5 + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. 
*/ +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +#if defined(__clang__) && defined(__has_attribute) +#if __has_attribute(no_sanitize) +#define AOM_NO_UNSIGNED_OVERFLOW_CHECK \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#if __clang_major__ >= 12 +#define VPX_NO_UNSIGNED_SHIFT_CHECK \ + __attribute__((no_sanitize("unsigned-shift-base"))) +#endif // __clang__ >= 12 +#endif // __clang__ + +#ifndef AOM_NO_UNSIGNED_OVERFLOW_CHECK +#define AOM_NO_UNSIGNED_OVERFLOW_CHECK +#endif +#ifndef AOM_NO_UNSIGNED_SHIFT_CHECK +#define AOM_NO_UNSIGNED_SHIFT_CHECK +#endif + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. + */ +AOM_NO_UNSIGNED_OVERFLOW_CHECK AOM_NO_UNSIGNED_SHIFT_CHECK void MD5Transform( + UWORD32 buf[4], UWORD32 const in[16]) { + register UWORD32 a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, 
in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +#undef AOM_NO_UNSIGNED_OVERFLOW_CHECK +#undef AOM_NO_UNSIGNED_SHIFT_CHECK + +#endif diff --git a/third_party/aom/common/md5_utils.h b/third_party/aom/common/md5_utils.h new file mode 100644 index 0000000000..144fa3ad28 --- /dev/null +++ b/third_party/aom/common/md5_utils.h @@ -0,0 +1,49 @@ +/* + * This is the header file for the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. + * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. + * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. + * + * Changed so as no longer to depend on Colin Plumb's `usual.h' + * header definitions + * - Ian Jackson . + * Still in the public domain. + */ + +#ifndef AOM_COMMON_MD5_UTILS_H_ +#define AOM_COMMON_MD5_UTILS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define md5byte unsigned char +#define UWORD32 unsigned int + +typedef struct MD5Context MD5Context; +struct MD5Context { + UWORD32 buf[4]; + UWORD32 bytes[2]; + UWORD32 in[16]; +}; + +void MD5Init(struct MD5Context *context); +void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len); +void MD5Final(unsigned char digest[16], struct MD5Context *context); +void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_MD5_UTILS_H_ diff --git a/third_party/aom/common/obudec.c b/third_party/aom/common/obudec.c new file mode 100644 index 0000000000..8b7bd39a60 --- /dev/null +++ b/third_party/aom/common/obudec.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include + +#include "common/obudec.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem_ops.h" +#include "av1/common/common.h" +#include "av1/common/obu_util.h" +#include "tools_common.h" + +#define OBU_BUFFER_SIZE (500 * 1024) + +#define OBU_HEADER_SIZE 1 +#define OBU_EXTENSION_SIZE 1 +#define OBU_MAX_LENGTH_FIELD_SIZE 8 + +#define OBU_MAX_HEADER_SIZE \ + (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 2 * OBU_MAX_LENGTH_FIELD_SIZE) + +#define OBU_DETECTION_SIZE \ + (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 4 * OBU_MAX_LENGTH_FIELD_SIZE) + +// Reads unsigned LEB128 integer and returns 0 upon successful read and decode. +// Stores raw bytes in 'value_buffer', length of the number in 'value_length', +// and decoded value in 'value'. If 'buffered' is true, it is buffered in the +// detect buffer first. +static int obudec_read_leb128(struct AvxInputContext *input_ctx, + uint8_t *value_buffer, size_t *value_length, + uint64_t *value, bool buffered) { + if (!input_ctx || !value_buffer || !value_length || !value) return -1; + size_t len; + for (len = 0; len < OBU_MAX_LENGTH_FIELD_SIZE; ++len) { + const size_t num_read = + buffer_input(input_ctx, 1, &value_buffer[len], buffered); + if (num_read == 0) { + if (len == 0 && input_eof(input_ctx)) { + *value_length = 0; + return 0; + } + // Ran out of data before completing read of value. + return -1; + } + if ((value_buffer[len] >> 7) == 0) { + ++len; + *value_length = len; + break; + } + } + + return aom_uleb_decode(value_buffer, len, value, NULL); +} + +// Reads OBU header from 'input_ctx'. The 'buffer_capacity' passed in must be +// large enough to store an OBU header with extension (2 bytes). Raw OBU data is +// written to 'obu_data', parsed OBU header values are written to 'obu_header', +// and total bytes read from file are written to 'bytes_read'. Returns 0 for +// success, and non-zero on failure. When end of file is reached, the return +// value is 0 and the 'bytes_read' value is set to 0. If 'buffered' is true, it +// is buffered in the detect buffer first. +static int obudec_read_obu_header(struct AvxInputContext *input_ctx, + size_t buffer_capacity, int is_annexb, + uint8_t *obu_data, ObuHeader *obu_header, + size_t *bytes_read, bool buffered) { + if (!input_ctx || buffer_capacity < (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE) || + !obu_data || !obu_header || !bytes_read) { + return -1; + } + *bytes_read = buffer_input(input_ctx, 1, obu_data, buffered); + + if (input_eof(input_ctx) && *bytes_read == 0) { + return 0; + } else if (*bytes_read != 1) { + fprintf(stderr, "obudec: Failure reading OBU header.\n"); + return -1; + } + + const int has_extension = (obu_data[0] >> 2) & 0x1; + if (has_extension) { + if (buffer_input(input_ctx, 1, &obu_data[1], buffered) != 1) { + fprintf(stderr, "obudec: Failure reading OBU extension."); + return -1; + } + ++*bytes_read; + } + + size_t obu_bytes_parsed = 0; + const aom_codec_err_t parse_result = aom_read_obu_header( + obu_data, *bytes_read, &obu_bytes_parsed, obu_header, is_annexb); + if (parse_result != AOM_CODEC_OK || *bytes_read != obu_bytes_parsed) { + fprintf(stderr, "obudec: Error parsing OBU header.\n"); + return -1; + } + + return 0; +} + +// Reads OBU payload from 'input_ctx' and returns 0 for success when all payload +// bytes are read from the file. Payload data is written to 'obu_data', and +// actual bytes read added to 'bytes_read'. If 'buffered' is true, it is +// buffered in the detect buffer first. 
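+//
+// For reference, a hedged illustration of the unsigned LEB128 size fields
+// parsed by obudec_read_leb128() above: each byte carries 7 value bits,
+// least-significant group first, and the top bit flags a continuation byte.
+// For example, the two bytes 0x96 0x01 decode to 0x16 + (0x01 << 7) = 150,
+// i.e. a size field of 150 bytes.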
+static int obudec_read_obu_payload(struct AvxInputContext *input_ctx, + size_t payload_length, uint8_t *obu_data, + size_t *bytes_read, bool buffered) { + if (!input_ctx || payload_length == 0 || !obu_data || !bytes_read) return -1; + + if (buffer_input(input_ctx, payload_length, obu_data, buffered) != + payload_length) { + fprintf(stderr, "obudec: Failure reading OBU payload.\n"); + return -1; + } + + *bytes_read += payload_length; + return 0; +} + +static int obudec_read_obu_header_and_size( + struct AvxInputContext *input_ctx, size_t buffer_capacity, int is_annexb, + uint8_t *buffer, size_t *bytes_read, size_t *payload_length, + ObuHeader *obu_header, bool buffered) { + const size_t kMinimumBufferSize = OBU_MAX_HEADER_SIZE; + if (!input_ctx || !buffer || !bytes_read || !payload_length || !obu_header || + buffer_capacity < kMinimumBufferSize) { + return -1; + } + + size_t leb128_length_obu = 0; + size_t leb128_length_payload = 0; + uint64_t obu_size = 0; + if (is_annexb) { + if (obudec_read_leb128(input_ctx, &buffer[0], &leb128_length_obu, &obu_size, + buffered) != 0) { + fprintf(stderr, "obudec: Failure reading OBU size length.\n"); + return -1; + } else if (leb128_length_obu == 0) { + *payload_length = 0; + return 0; + } + if (obu_size > UINT32_MAX) { + fprintf(stderr, "obudec: OBU payload length too large.\n"); + return -1; + } + } + + size_t header_size = 0; + if (obudec_read_obu_header(input_ctx, buffer_capacity - leb128_length_obu, + is_annexb, buffer + leb128_length_obu, obu_header, + &header_size, buffered) != 0) { + return -1; + } else if (header_size == 0) { + *payload_length = 0; + return 0; + } + + if (!obu_header->has_size_field) { + assert(is_annexb); + if (obu_size < header_size) { + fprintf(stderr, "obudec: OBU size is too small.\n"); + return -1; + } + *payload_length = (size_t)obu_size - header_size; + } else { + uint64_t u64_payload_length = 0; + if (obudec_read_leb128(input_ctx, &buffer[leb128_length_obu + header_size], + &leb128_length_payload, &u64_payload_length, + buffered) != 0) { + fprintf(stderr, "obudec: Failure reading OBU payload length.\n"); + return -1; + } + if (u64_payload_length > UINT32_MAX) { + fprintf(stderr, "obudec: OBU payload length too large.\n"); + return -1; + } + + *payload_length = (size_t)u64_payload_length; + } + + *bytes_read = leb128_length_obu + header_size + leb128_length_payload; + return 0; +} + +static int obudec_grow_buffer(size_t growth_amount, uint8_t **obu_buffer, + size_t *obu_buffer_capacity) { + if (!*obu_buffer || !obu_buffer_capacity || growth_amount == 0) { + return -1; + } + + const size_t capacity = *obu_buffer_capacity; + if (SIZE_MAX - growth_amount < capacity) { + fprintf(stderr, "obudec: cannot grow buffer, capacity will roll over.\n"); + return -1; + } + + const size_t new_capacity = capacity + growth_amount; + +#if defined AOM_MAX_ALLOCABLE_MEMORY + if (new_capacity > AOM_MAX_ALLOCABLE_MEMORY) { + fprintf(stderr, "obudec: OBU size exceeds max alloc size.\n"); + return -1; + } +#endif + + uint8_t *new_buffer = (uint8_t *)realloc(*obu_buffer, new_capacity); + if (!new_buffer) { + fprintf(stderr, "obudec: Failed to allocate compressed data buffer.\n"); + return -1; + } + + *obu_buffer = new_buffer; + *obu_buffer_capacity = new_capacity; + return 0; +} + +static int obudec_read_one_obu(struct AvxInputContext *input_ctx, + uint8_t **obu_buffer, size_t obu_bytes_buffered, + size_t *obu_buffer_capacity, size_t *obu_length, + ObuHeader *obu_header, int is_annexb, + bool buffered) { + if (!input_ctx || !(*obu_buffer) || 
!obu_buffer_capacity || !obu_length || + !obu_header) { + return -1; + } + + size_t bytes_read = 0; + size_t obu_payload_length = 0; + size_t available_buffer_capacity = *obu_buffer_capacity - obu_bytes_buffered; + + if (available_buffer_capacity < OBU_MAX_HEADER_SIZE) { + if (obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, OBU_MAX_HEADER_SIZE), + obu_buffer, obu_buffer_capacity) != 0) { + *obu_length = bytes_read; + return -1; + } + available_buffer_capacity += + AOMMAX(*obu_buffer_capacity, OBU_MAX_HEADER_SIZE); + } + + const int status = obudec_read_obu_header_and_size( + input_ctx, available_buffer_capacity, is_annexb, + *obu_buffer + obu_bytes_buffered, &bytes_read, &obu_payload_length, + obu_header, buffered); + if (status < 0) return status; + + if (obu_payload_length > SIZE_MAX - bytes_read) return -1; + + if (obu_payload_length > 256 * 1024 * 1024) { + fprintf(stderr, "obudec: Read invalid OBU size (%u)\n", + (unsigned int)obu_payload_length); + *obu_length = bytes_read + obu_payload_length; + return -1; + } + + if (bytes_read + obu_payload_length > available_buffer_capacity && + obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, obu_payload_length), + obu_buffer, obu_buffer_capacity) != 0) { + *obu_length = bytes_read + obu_payload_length; + return -1; + } + + if (obu_payload_length > 0 && + obudec_read_obu_payload(input_ctx, obu_payload_length, + *obu_buffer + obu_bytes_buffered + bytes_read, + &bytes_read, buffered) != 0) { + return -1; + } + + *obu_length = bytes_read; + return 0; +} + +int file_is_obu(struct ObuDecInputContext *obu_ctx) { + if (!obu_ctx || !obu_ctx->avx_ctx) return 0; + + struct AvxInputContext *avx_ctx = obu_ctx->avx_ctx; + uint8_t detect_buf[OBU_DETECTION_SIZE] = { 0 }; + const int is_annexb = obu_ctx->is_annexb; + size_t payload_length = 0; + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + size_t length_of_unit_size = 0; + size_t annexb_header_length = 0; + uint64_t unit_size = 0; + + if (is_annexb) { + // read the size of first temporal unit + if (obudec_read_leb128(avx_ctx, &detect_buf[0], &length_of_unit_size, + &unit_size, /*buffered=*/true) != 0) { + fprintf(stderr, "obudec: Failure reading temporal unit header\n"); + rewind_detect(avx_ctx); + return 0; + } + + // read the size of first frame unit + if (obudec_read_leb128(avx_ctx, &detect_buf[length_of_unit_size], + &annexb_header_length, &unit_size, + /*buffered=*/true) != 0) { + fprintf(stderr, "obudec: Failure reading frame unit header\n"); + rewind_detect(avx_ctx); + return 0; + } + annexb_header_length += length_of_unit_size; + } + + size_t bytes_read = 0; + if (obudec_read_obu_header_and_size( + avx_ctx, OBU_DETECTION_SIZE - annexb_header_length, is_annexb, + &detect_buf[annexb_header_length], &bytes_read, &payload_length, + &obu_header, /*buffered=*/true) != 0) { + fprintf(stderr, "obudec: Failure reading first OBU.\n"); + rewind_detect(avx_ctx); + return 0; + } + + if (is_annexb) { + bytes_read += annexb_header_length; + } + + if (obu_header.type != OBU_TEMPORAL_DELIMITER && + obu_header.type != OBU_SEQUENCE_HEADER) { + rewind_detect(avx_ctx); + return 0; + } + + if (obu_header.has_size_field) { + if (obu_header.type == OBU_TEMPORAL_DELIMITER && payload_length != 0) { + fprintf( + stderr, + "obudec: Invalid OBU_TEMPORAL_DELIMITER payload length (non-zero)."); + rewind_detect(avx_ctx); + return 0; + } + } else if (!is_annexb) { + fprintf(stderr, "obudec: OBU size fields required, cannot decode input.\n"); + rewind_detect(avx_ctx); + return 0; + } + + // Appears that input is 
valid Section 5 AV1 stream. + obu_ctx->buffer = (uint8_t *)malloc(OBU_BUFFER_SIZE); + if (!obu_ctx->buffer) { + fprintf(stderr, "Out of memory.\n"); + rewind_detect(avx_ctx); + return 0; + } + obu_ctx->buffer_capacity = OBU_BUFFER_SIZE; + + memcpy(obu_ctx->buffer, &detect_buf[0], bytes_read); + obu_ctx->bytes_buffered = bytes_read; + // If the first OBU is a SEQUENCE_HEADER, then it will have a payload. + // We need to read this in so that our buffer only contains complete OBUs. + if (payload_length > 0) { + if (payload_length > (obu_ctx->buffer_capacity - bytes_read)) { + fprintf(stderr, "obudec: First OBU's payload is too large\n"); + rewind_detect(avx_ctx); + obudec_free(obu_ctx); + return 0; + } + + size_t payload_bytes = 0; + const int status = obudec_read_obu_payload( + avx_ctx, payload_length, &obu_ctx->buffer[bytes_read], &payload_bytes, + /*buffered=*/false); + if (status < 0) { + rewind_detect(avx_ctx); + obudec_free(obu_ctx); + return 0; + } + obu_ctx->bytes_buffered += payload_bytes; + } + return 1; +} + +int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx, + uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { + FILE *f = obu_ctx->avx_ctx->file; + if (!f) return -1; + + *buffer_size = 0; + *bytes_read = 0; + + if (input_eof(obu_ctx->avx_ctx)) { + return 1; + } + + size_t tu_size; + size_t obu_size = 0; + size_t length_of_temporal_unit_size = 0; + uint8_t tuheader[OBU_MAX_LENGTH_FIELD_SIZE] = { 0 }; + + if (obu_ctx->is_annexb) { + uint64_t size = 0; + + if (obu_ctx->bytes_buffered == 0) { + if (obudec_read_leb128(obu_ctx->avx_ctx, &tuheader[0], + &length_of_temporal_unit_size, &size, + /*buffered=*/false) != 0) { + fprintf(stderr, "obudec: Failure reading temporal unit header\n"); + return -1; + } + if (size == 0 && input_eof(obu_ctx->avx_ctx)) { + return 1; + } + } else { + // temporal unit size was already stored in buffer + if (aom_uleb_decode(obu_ctx->buffer, obu_ctx->bytes_buffered, &size, + &length_of_temporal_unit_size) != 0) { + fprintf(stderr, "obudec: Failure reading temporal unit header\n"); + return -1; + } + } + + if (size > UINT32_MAX || size + length_of_temporal_unit_size > UINT32_MAX) { + fprintf(stderr, "obudec: TU too large.\n"); + return -1; + } + + size += length_of_temporal_unit_size; + tu_size = (size_t)size; + } else { + while (1) { + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + + if (obudec_read_one_obu(obu_ctx->avx_ctx, &obu_ctx->buffer, + obu_ctx->bytes_buffered, + &obu_ctx->buffer_capacity, &obu_size, &obu_header, + 0, /*buffered=*/false) != 0) { + fprintf(stderr, "obudec: read_one_obu failed in TU loop\n"); + return -1; + } + + if (obu_header.type == OBU_TEMPORAL_DELIMITER || obu_size == 0) { + tu_size = obu_ctx->bytes_buffered; + break; + } else { + obu_ctx->bytes_buffered += obu_size; + } + } + } + +#if defined AOM_MAX_ALLOCABLE_MEMORY + if (tu_size > AOM_MAX_ALLOCABLE_MEMORY) { + fprintf(stderr, "obudec: Temporal Unit size exceeds max alloc size.\n"); + return -1; + } +#endif + if (tu_size > 0) { + uint8_t *new_buffer = (uint8_t *)realloc(*buffer, tu_size); + if (!new_buffer) { + free(*buffer); + fprintf(stderr, "obudec: Out of memory.\n"); + return -1; + } + *buffer = new_buffer; + } + *bytes_read = tu_size; + *buffer_size = tu_size; + + if (!obu_ctx->is_annexb) { + memcpy(*buffer, obu_ctx->buffer, tu_size); + + // At this point, (obu_ctx->buffer + obu_ctx->bytes_buffered + obu_size) + // points to the end of the buffer. 
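+    // The memmove below keeps only the last OBU that was read (obu_size
+    // bytes, normally the temporal delimiter that begins the next temporal
+    // unit): it is moved from the tail of the buffer to the front, and the
+    // temporal unit data that was just copied out above is discarded.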
+ memmove(obu_ctx->buffer, obu_ctx->buffer + obu_ctx->bytes_buffered, + obu_size); + obu_ctx->bytes_buffered = obu_size; + } else { + if (!input_eof(obu_ctx->avx_ctx)) { + size_t data_size; + size_t offset; + if (!obu_ctx->bytes_buffered) { + data_size = tu_size - length_of_temporal_unit_size; + memcpy(*buffer, &tuheader[0], length_of_temporal_unit_size); + offset = length_of_temporal_unit_size; + } else { + const size_t copy_size = AOMMIN(obu_ctx->bytes_buffered, tu_size); + memcpy(*buffer, obu_ctx->buffer, copy_size); + offset = copy_size; + data_size = tu_size - copy_size; + obu_ctx->bytes_buffered -= copy_size; + } + + if (read_from_input(obu_ctx->avx_ctx, data_size, *buffer + offset) != + data_size) { + fprintf(stderr, "obudec: Failed to read full temporal unit\n"); + return -1; + } + } + } + return 0; +} + +void obudec_free(struct ObuDecInputContext *obu_ctx) { + free(obu_ctx->buffer); + obu_ctx->buffer = NULL; + obu_ctx->buffer_capacity = 0; + obu_ctx->bytes_buffered = 0; +} diff --git a/third_party/aom/common/obudec.h b/third_party/aom/common/obudec.h new file mode 100644 index 0000000000..b2adb1e3d7 --- /dev/null +++ b/third_party/aom/common/obudec.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_OBUDEC_H_ +#define AOM_COMMON_OBUDEC_H_ + +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ObuDecInputContext { + struct AvxInputContext *avx_ctx; + uint8_t *buffer; + size_t buffer_capacity; + size_t bytes_buffered; + int is_annexb; +}; + +// Returns 1 when file data starts (if Annex B stream, after reading the +// size of the OBU) with what appears to be a Temporal Delimiter +// OBU as defined by Section 5 of the AV1 bitstream specification. +int file_is_obu(struct ObuDecInputContext *obu_ctx); + +// Reads one Temporal Unit from the input file. Returns 0 when a TU is +// successfully read, 1 when end of file is reached, and less than 0 when an +// error occurs. Stores TU data in 'buffer'. Reallocs buffer to match TU size, +// returns buffer capacity via 'buffer_size', and returns size of buffered data +// via 'bytes_read'. +int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx, + uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size); + +void obudec_free(struct ObuDecInputContext *obu_ctx); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_OBUDEC_H_ diff --git a/third_party/aom/common/rawenc.c b/third_party/aom/common/rawenc.c new file mode 100644 index 0000000000..aa80d2cae3 --- /dev/null +++ b/third_party/aom/common/rawenc.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "common/rawenc.h" + +// Number of bytes to write per batch in write_greyscale. +#define BATCH_SIZE 8 + +// Interface to writing to either a file or MD5Context. Takes a pointer to +// either the file or MD5Context, the buffer, the size of each element, and +// number of elements to write. Note that size and nmemb (last two args) must +// be unsigned int, as the interface to MD5Update requires that. +typedef void (*WRITER)(void *, const uint8_t *, unsigned int, unsigned int); + +static void write_file(void *fp, const uint8_t *buffer, unsigned int size, + unsigned int nmemb) { + fwrite(buffer, size, nmemb, (FILE *)fp); +} + +static void write_md5(void *md5, const uint8_t *buffer, unsigned int size, + unsigned int nmemb) { + MD5Update((MD5Context *)md5, buffer, size * nmemb); +} + +// Writes out n neutral chroma samples (for greyscale). +static void write_greyscale(const aom_image_t *img, int n, WRITER writer_func, + void *file_or_md5) { + // Batch 8 writes for low bit-depth, 4 writes for high bit-depth. + int bytes_per_sample; + union { + uint8_t u8[BATCH_SIZE]; + uint16_t u16[BATCH_SIZE / 2]; + } batched; + if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + bytes_per_sample = 2; + for (int i = 0; i < BATCH_SIZE / 2; ++i) { + batched.u16[i] = 1 << (img->bit_depth - 1); + } + } else { + bytes_per_sample = 1; + for (int i = 0; i < BATCH_SIZE; ++i) { + batched.u8[i] = 0x80; + } + } + const int samples_per_batch = BATCH_SIZE / bytes_per_sample; + const int num_batched_writes = n / samples_per_batch; + for (int i = 0; i < num_batched_writes; ++i) { + writer_func(file_or_md5, batched.u8, sizeof(uint8_t), BATCH_SIZE); + } + const int remaining = n % samples_per_batch; + for (int i = 0; i < remaining; ++i) { + writer_func(file_or_md5, batched.u8, sizeof(uint8_t), bytes_per_sample); + } +} + +// Encapsulates the logic for writing raw data to either an image file or +// to an MD5 context. +static void raw_write_image_file_or_md5(const aom_image_t *img, + const int *planes, const int num_planes, + void *file_or_md5, WRITER writer_func) { + const bool high_bitdepth = img->fmt & AOM_IMG_FMT_HIGHBITDEPTH; + const int bytes_per_sample = high_bitdepth ? 2 : 1; + for (int i = 0; i < num_planes; ++i) { + const int plane = planes[i]; + const int w = aom_img_plane_width(img, plane); + const int h = aom_img_plane_height(img, plane); + // If we're on a color plane and the output is monochrome, write a greyscale + // value. Since there are only YUV planes, compare against Y. 
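+    // write_greyscale() above fills these samples with the neutral chroma
+    // value: 0x80 for 8-bit data, or 1 << (bit_depth - 1) for high bit
+    // depth (e.g. 512 for 10-bit input).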
+ if (img->monochrome && plane != AOM_PLANE_Y) { + write_greyscale(img, w * h, writer_func, file_or_md5); + continue; + } + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + for (int y = 0; y < h; ++y) { + writer_func(file_or_md5, buf, bytes_per_sample, w); + buf += stride; + } + } +} + +void raw_write_image_file(const aom_image_t *img, const int *planes, + const int num_planes, FILE *file) { + raw_write_image_file_or_md5(img, planes, num_planes, file, write_file); +} + +void raw_update_image_md5(const aom_image_t *img, const int *planes, + const int num_planes, MD5Context *md5) { + raw_write_image_file_or_md5(img, planes, num_planes, md5, write_md5); +} diff --git a/third_party/aom/common/rawenc.h b/third_party/aom/common/rawenc.h new file mode 100644 index 0000000000..cf5e00e6fd --- /dev/null +++ b/third_party/aom/common/rawenc.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_RAWENC_H_ +#define AOM_COMMON_RAWENC_H_ + +#include "aom/aom_decoder.h" +#include "common/md5_utils.h" +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void raw_write_image_file(const aom_image_t *img, const int *planes, + const int num_planes, FILE *file); +void raw_update_image_md5(const aom_image_t *img, const int *planes, + const int num_planes, MD5Context *md5); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_RAWENC_H_ diff --git a/third_party/aom/common/tools_common.c b/third_party/aom/common/tools_common.c new file mode 100644 index 0000000000..4d77a1b427 --- /dev/null +++ b/third_party/aom/common/tools_common.c @@ -0,0 +1,636 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include +#include + +#include "common/tools_common.h" + +#if CONFIG_AV1_ENCODER +#include "aom/aomcx.h" +#endif + +#if CONFIG_AV1_DECODER +#include "aom/aomdx.h" +#endif + +#if defined(_WIN32) +#include +#include +#endif + +#define LOG_ERROR(label) \ + do { \ + const char *l = label; \ + va_list ap; \ + va_start(ap, fmt); \ + if (l) fprintf(stderr, "%s: ", l); \ + vfprintf(stderr, fmt, ap); \ + fprintf(stderr, "\n"); \ + va_end(ap); \ + } while (0) + +FILE *set_binary_mode(FILE *stream) { + (void)stream; +#if defined(_WIN32) + _setmode(_fileno(stream), _O_BINARY); +#endif + return stream; +} + +void die(const char *fmt, ...) { + LOG_ERROR(NULL); + usage_exit(); +} + +void fatal(const char *fmt, ...) { + LOG_ERROR("Fatal"); + exit(EXIT_FAILURE); +} + +void aom_tools_warn(const char *fmt, ...) 
{ LOG_ERROR("Warning"); } + +void die_codec(aom_codec_ctx_t *ctx, const char *s) { + const char *detail = aom_codec_error_detail(ctx); + + fprintf(stderr, "%s: %s\n", s, aom_codec_error(ctx)); + if (detail) fprintf(stderr, " %s\n", detail); + exit(EXIT_FAILURE); +} + +const char *image_format_to_string(aom_img_fmt_t fmt) { + switch (fmt) { + case AOM_IMG_FMT_I420: return "I420"; + case AOM_IMG_FMT_I422: return "I422"; + case AOM_IMG_FMT_I444: return "I444"; + case AOM_IMG_FMT_YV12: return "YV12"; + case AOM_IMG_FMT_NV12: return "NV12"; + case AOM_IMG_FMT_YV1216: return "YV1216"; + case AOM_IMG_FMT_I42016: return "I42016"; + case AOM_IMG_FMT_I42216: return "I42216"; + case AOM_IMG_FMT_I44416: return "I44416"; + default: return "Other"; + } +} + +int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) { + FILE *f = input_ctx->file; + struct FileTypeDetectionBuffer *detect = &input_ctx->detect; + int plane = 0; + int shortread = 0; + const int bytespp = (yuv_frame->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + + for (plane = 0; plane < 3; ++plane) { + uint8_t *ptr; + int w = aom_img_plane_width(yuv_frame, plane); + const int h = aom_img_plane_height(yuv_frame, plane); + int r; + // Assuming that for nv12 we read all chroma data at one time + if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; + if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; + /* Determine the correct plane based on the image format. The for-loop + * always counts in Y,U,V order, but this may not match the order of + * the data on disk. + */ + switch (plane) { + case 1: + ptr = + yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? AOM_PLANE_V + : AOM_PLANE_U]; + break; + case 2: + ptr = + yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? AOM_PLANE_U + : AOM_PLANE_V]; + break; + default: ptr = yuv_frame->planes[plane]; + } + + for (r = 0; r < h; ++r) { + size_t needed = w * bytespp; + size_t buf_position = 0; + const size_t left = detect->buf_read - detect->position; + if (left > 0) { + const size_t more = (left < needed) ? left : needed; + memcpy(ptr, detect->buf + detect->position, more); + buf_position = more; + needed -= more; + detect->position += more; + } + if (needed > 0) { + shortread |= (fread(ptr + buf_position, 1, needed, f) < needed); + } + + ptr += yuv_frame->stride[plane]; + } + } + + return shortread; +} + +struct CodecInfo { + // Pointer to a function of zero arguments that returns an aom_codec_iface_t. 
+ aom_codec_iface_t *(*interface)(void); + const char *short_name; + uint32_t fourcc; +}; + +#if CONFIG_AV1_ENCODER +static const struct CodecInfo aom_encoders[] = { + { &aom_codec_av1_cx, "av1", AV1_FOURCC }, +}; + +int get_aom_encoder_count(void) { + return sizeof(aom_encoders) / sizeof(aom_encoders[0]); +} + +aom_codec_iface_t *get_aom_encoder_by_index(int i) { + assert(i >= 0 && i < get_aom_encoder_count()); + return aom_encoders[i].interface(); +} + +aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name) { + for (int i = 0; i < get_aom_encoder_count(); ++i) { + const struct CodecInfo *info = &aom_encoders[i]; + if (strcmp(info->short_name, name) == 0) return info->interface(); + } + return NULL; +} + +uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface) { + for (int i = 0; i < get_aom_encoder_count(); ++i) { + const struct CodecInfo *info = &aom_encoders[i]; + if (info->interface() == iface) { + return info->fourcc; + } + } + return 0; +} + +const char *get_short_name_by_aom_encoder(aom_codec_iface_t *iface) { + for (int i = 0; i < get_aom_encoder_count(); ++i) { + const struct CodecInfo *info = &aom_encoders[i]; + if (info->interface() == iface) { + return info->short_name; + } + } + return NULL; +} + +#endif // CONFIG_AV1_ENCODER + +#if CONFIG_AV1_DECODER +static const struct CodecInfo aom_decoders[] = { + { &aom_codec_av1_dx, "av1", AV1_FOURCC }, +}; + +int get_aom_decoder_count(void) { + return sizeof(aom_decoders) / sizeof(aom_decoders[0]); +} + +aom_codec_iface_t *get_aom_decoder_by_index(int i) { + assert(i >= 0 && i < get_aom_decoder_count()); + return aom_decoders[i].interface(); +} + +aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name) { + for (int i = 0; i < get_aom_decoder_count(); ++i) { + const struct CodecInfo *info = &aom_decoders[i]; + if (strcmp(info->short_name, name) == 0) return info->interface(); + } + return NULL; +} + +aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc) { + for (int i = 0; i < get_aom_decoder_count(); ++i) { + const struct CodecInfo *info = &aom_decoders[i]; + if (info->fourcc == fourcc) return info->interface(); + } + return NULL; +} + +const char *get_short_name_by_aom_decoder(aom_codec_iface_t *iface) { + for (int i = 0; i < get_aom_decoder_count(); ++i) { + const struct CodecInfo *info = &aom_decoders[i]; + if (info->interface() == iface) { + return info->short_name; + } + } + return NULL; +} + +uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface) { + for (int i = 0; i < get_aom_decoder_count(); ++i) { + const struct CodecInfo *info = &aom_decoders[i]; + if (info->interface() == iface) { + return info->fourcc; + } + } + return 0; +} + +#endif // CONFIG_AV1_DECODER + +void aom_img_write(const aom_image_t *img, FILE *file) { + int plane; + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = aom_img_plane_width(img, plane) * + ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + const int h = aom_img_plane_height(img, plane); + int y; + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + } +} + +bool aom_img_read(aom_image_t *img, FILE *file) { + int plane; + const int bytespp = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; + + for (plane = 0; plane < 3; ++plane) { + unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = aom_img_plane_width(img, plane) * bytespp; + const int h = aom_img_plane_height(img, plane); + int y; + + for (y = 0; y < h; ++y) { + if (fread(buf, 1, w, file) != (size_t)w) return false; + buf += stride; + } + } + + return true; +} + +// TODO(dkovalev) change sse_to_psnr signature: double -> int64_t +double sse_to_psnr(double samples, double peak, double sse) { + static const double kMaxPSNR = 100.0; + + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > kMaxPSNR ? kMaxPSNR : psnr; + } else { + return kMaxPSNR; + } +} + +// TODO(debargha): Consolidate the functions below into a separate file. +static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { + // Note the offset is 1 less than half. + const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case AOM_IMG_FMT_I42016: + case AOM_IMG_FMT_I42216: + case AOM_IMG_FMT_I44416: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset; + } + } +} + +static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { + // Note the offset is 1 less than half. + const int offset = input_shift > 0 ? 
(1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + dst->fmt != src->fmt + AOM_IMG_FMT_HIGHBITDEPTH || input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I444: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint8_t *p_src = src->planes[plane] + y * src->stride[plane]; + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) { + *p_dst++ = (*p_src++ << input_shift) + offset; + } + } + } +} + +void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { + if (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + highbd_img_upshift(dst, src, input_shift); + } else { + lowbd_img_upshift(dst, src, input_shift); + } +} + +void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src) { + int plane; + if (dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w || + dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift) { + fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I444: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; x < w; x++) { + *p_dst++ = (uint8_t)(*p_src++); + } + } + } +} + +static void highbd_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + down_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case AOM_IMG_FMT_I42016: + case AOM_IMG_FMT_I42216: + case AOM_IMG_FMT_I44416: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift; + } + } +} + +static void lowbd_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + src->fmt != dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH || down_shift < 0) { + 
fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I444: break; + default: fatal("Unsupported image conversion"); + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; x < w; x++) { + *p_dst++ = *p_src++ >> down_shift; + } + } + } +} + +void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { + if (dst->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + highbd_img_downshift(dst, src, down_shift); + } else { + lowbd_img_downshift(dst, src, down_shift); + } +} + +static int img_shifted_realloc_required(const aom_image_t *img, + const aom_image_t *shifted, + aom_img_fmt_t required_fmt) { + return img->d_w != shifted->d_w || img->d_h != shifted->d_h || + required_fmt != shifted->fmt; +} + +bool aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr, + aom_image_t **img_shifted_ptr) { + aom_image_t *img = *img_ptr; + aom_image_t *img_shifted = *img_shifted_ptr; + + const aom_img_fmt_t shifted_fmt = output_bit_depth == 8 + ? img->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH + : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH; + + if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) { + if (img_shifted && + img_shifted_realloc_required(img, img_shifted, shifted_fmt)) { + aom_img_free(img_shifted); + img_shifted = NULL; + } + if (img_shifted) { + img_shifted->monochrome = img->monochrome; + } + if (!img_shifted) { + img_shifted = aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); + if (!img_shifted) { + *img_shifted_ptr = NULL; + return false; + } + img_shifted->bit_depth = output_bit_depth; + img_shifted->monochrome = img->monochrome; + img_shifted->csp = img->csp; + } + if (output_bit_depth > img->bit_depth) { + aom_img_upshift(img_shifted, img, output_bit_depth - img->bit_depth); + } else { + aom_img_downshift(img_shifted, img, img->bit_depth - output_bit_depth); + } + *img_shifted_ptr = img_shifted; + *img_ptr = img_shifted; + } + + return true; +} + +// Related to I420, NV12 format has one luma "luminance" plane Y and one plane +// with U and V values interleaved. +void aom_img_write_nv12(const aom_image_t *img, FILE *file) { + // Y plane + const unsigned char *buf = img->planes[0]; + int stride = img->stride[0]; + int w = aom_img_plane_width(img, 0) * + ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int h = aom_img_plane_height(img, 0); + int x, y; + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + + // Interleaved U and V plane + const unsigned char *ubuf = img->planes[1]; + const unsigned char *vbuf = img->planes[2]; + const size_t size = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; + stride = img->stride[1]; + w = aom_img_plane_width(img, 1); + h = aom_img_plane_height(img, 1); + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + fwrite(ubuf, size, 1, file); + fwrite(vbuf, size, 1, file); + ubuf += size; + vbuf += size; + } + ubuf += (stride - w * size); + vbuf += (stride - w * size); + } +} + +size_t read_from_input(struct AvxInputContext *input_ctx, size_t n, + unsigned char *buf) { + const size_t buffered_bytes = + input_ctx->detect.buf_read - input_ctx->detect.position; + size_t read_n; + if (buffered_bytes == 0) { + read_n = fread(buf, 1, n, input_ctx->file); + } else if (n <= buffered_bytes) { + memcpy(buf, input_ctx->detect.buf + input_ctx->detect.position, n); + input_ctx->detect.position += n; + read_n = n; + } else { + memcpy(buf, input_ctx->detect.buf + input_ctx->detect.position, + buffered_bytes); + input_ctx->detect.position += buffered_bytes; + read_n = buffered_bytes; + read_n += + fread(buf + buffered_bytes, 1, n - buffered_bytes, input_ctx->file); + } + return read_n; +} + +size_t input_to_detect_buf(struct AvxInputContext *input_ctx, size_t n) { + if (n + input_ctx->detect.position > DETECT_BUF_SZ) { + die("Failed to store in the detect buffer, maximum size exceeded."); + } + const size_t buffered_bytes = + input_ctx->detect.buf_read - input_ctx->detect.position; + size_t read_n; + if (buffered_bytes == 0) { + read_n = fread(input_ctx->detect.buf + input_ctx->detect.buf_read, 1, n, + input_ctx->file); + input_ctx->detect.buf_read += read_n; + } else if (n <= buffered_bytes) { + // In this case, don't need to do anything as the data is already in + // the detect buffer + read_n = n; + } else { + read_n = fread(input_ctx->detect.buf + input_ctx->detect.buf_read, 1, + n - buffered_bytes, input_ctx->file); + input_ctx->detect.buf_read += read_n; + read_n += buffered_bytes; + } + return read_n; +} + +// Read from detect buffer to a buffer. If not enough, read from input and also +// buffer them first. +size_t buffer_input(struct AvxInputContext *input_ctx, size_t n, + unsigned char *buf, bool buffered) { + if (!buffered) { + return read_from_input(input_ctx, n, buf); + } + const size_t buf_n = input_to_detect_buf(input_ctx, n); + if (buf_n < n) { + return buf_n; + } + return read_from_input(input_ctx, n, buf); +} + +void rewind_detect(struct AvxInputContext *input_ctx) { + input_ctx->detect.position = 0; +} + +bool input_eof(struct AvxInputContext *input_ctx) { + return feof(input_ctx->file) && + input_ctx->detect.position == input_ctx->detect.buf_read; +} diff --git a/third_party/aom/common/tools_common.h b/third_party/aom/common/tools_common.h new file mode 100644 index 0000000000..b31371c670 --- /dev/null +++ b/third_party/aom/common/tools_common.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_COMMON_TOOLS_COMMON_H_ +#define AOM_COMMON_TOOLS_COMMON_H_ + +#include +#include + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom/aom_image.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_ports/msvc.h" + +#if CONFIG_AV1_ENCODER +#include "common/y4minput.h" +#endif + +#if defined(_MSC_VER) +/* MSVS uses _f{seek,tell}i64. */ +#define fseeko _fseeki64 +#define ftello _ftelli64 +typedef int64_t FileOffset; +#elif defined(_WIN32) +#include /* NOLINT*/ +/* MinGW uses f{seek,tell}o64 for large files. */ +#define fseeko fseeko64 +#define ftello ftello64 +typedef off64_t FileOffset; +#elif CONFIG_OS_SUPPORT +#include /* NOLINT*/ +typedef off_t FileOffset; +/* Use 32-bit file operations in WebM file format when building ARM + * executables (.axf) with RVCT. */ +#else +#define fseeko fseek +#define ftello ftell +typedef long FileOffset; /* NOLINT */ +#endif /* CONFIG_OS_SUPPORT */ + +#if CONFIG_OS_SUPPORT +#if defined(_MSC_VER) +#include /* NOLINT */ +#define isatty _isatty +#define fileno _fileno +#else +#include /* NOLINT */ +#endif /* _MSC_VER */ +#endif /* CONFIG_OS_SUPPORT */ + +#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) + +#ifndef PATH_MAX +#define PATH_MAX 512 +#endif + +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FILE_HDR_SZ 32 + +#define RAW_FRAME_HDR_SZ sizeof(uint32_t) +#define OBU_DETECTION_SZ 34 // See common/obudec.c + +#define DETECT_BUF_SZ 34 // Max of the above header sizes + +#define AV1_FOURCC 0x31305641 + +enum VideoFileType { + FILE_TYPE_OBU, + FILE_TYPE_RAW, + FILE_TYPE_IVF, + FILE_TYPE_Y4M, + FILE_TYPE_WEBM +}; + +// The fourcc for large_scale_tile encoding is "LSTC". +#define LST_FOURCC 0x4354534c + +struct FileTypeDetectionBuffer { + char buf[DETECT_BUF_SZ]; + size_t buf_read; + size_t position; +}; + +struct AvxRational { + int numerator; + int denominator; +}; + +struct AvxInputContext { + const char *filename; + FILE *file; + int64_t length; + struct FileTypeDetectionBuffer detect; + enum VideoFileType file_type; + uint32_t width; + uint32_t height; + struct AvxRational pixel_aspect_ratio; + aom_img_fmt_t fmt; + aom_bit_depth_t bit_depth; + int only_i420; + uint32_t fourcc; + struct AvxRational framerate; +#if CONFIG_AV1_ENCODER + y4m_input y4m; +#endif + aom_color_range_t color_range; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__GNUC__) +#define AOM_NO_RETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define AOM_NO_RETURN __declspec(noreturn) +#else +#define AOM_NO_RETURN +#endif + +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define AOM_TOOLS_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef AOM_TOOLS_FORMAT_PRINTF +#define AOM_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + +/* Sets a stdio stream into binary mode */ +FILE *set_binary_mode(FILE *stream); + +AOM_NO_RETURN void die(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2); +AOM_NO_RETURN void fatal(const char *fmt, ...) AOM_TOOLS_FORMAT_PRINTF(1, 2); +void aom_tools_warn(const char *fmt, ...) 
AOM_TOOLS_FORMAT_PRINTF(1, 2); + +AOM_NO_RETURN void die_codec(aom_codec_ctx_t *ctx, const char *s); + +/* The tool including this file must define usage_exit() */ +AOM_NO_RETURN void usage_exit(void); + +#undef AOM_NO_RETURN + +// The AOM library can support different encoders / decoders. These +// functions provide different ways to lookup / iterate through them. +// The return result may be NULL to indicate no codec was found. +int get_aom_encoder_count(void); +aom_codec_iface_t *get_aom_encoder_by_index(int i); +aom_codec_iface_t *get_aom_encoder_by_short_name(const char *name); +// If the interface is unknown, returns NULL. +const char *get_short_name_by_aom_encoder(aom_codec_iface_t *encoder); +// If the interface is unknown, returns 0. +uint32_t get_fourcc_by_aom_encoder(aom_codec_iface_t *iface); + +int get_aom_decoder_count(void); +aom_codec_iface_t *get_aom_decoder_by_index(int i); +aom_codec_iface_t *get_aom_decoder_by_short_name(const char *name); +aom_codec_iface_t *get_aom_decoder_by_fourcc(uint32_t fourcc); +const char *get_short_name_by_aom_decoder(aom_codec_iface_t *decoder); +// If the interface is unknown, returns 0. +uint32_t get_fourcc_by_aom_decoder(aom_codec_iface_t *iface); + +const char *image_format_to_string(aom_img_fmt_t fmt); + +int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame); + +void aom_img_write(const aom_image_t *img, FILE *file); +// Returns true on success, false on failure. +bool aom_img_read(aom_image_t *img, FILE *file); + +double sse_to_psnr(double samples, double peak, double mse); +void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift); +void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift); +// Returns true on success, false on failure. +bool aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr, + aom_image_t **img_shifted_ptr); +void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src); + +// Output in NV12 format. +void aom_img_write_nv12(const aom_image_t *img, FILE *file); + +size_t read_from_input(struct AvxInputContext *input_ctx, size_t n, + unsigned char *buf); +size_t input_to_detect_buf(struct AvxInputContext *input_ctx, size_t n); +size_t buffer_input(struct AvxInputContext *input_ctx, size_t n, + unsigned char *buf, bool buffered); +void rewind_detect(struct AvxInputContext *input_ctx); +bool input_eof(struct AvxInputContext *input_ctx); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_TOOLS_COMMON_H_ diff --git a/third_party/aom/common/video_common.h b/third_party/aom/common/video_common.h new file mode 100644 index 0000000000..bf95031be6 --- /dev/null +++ b/third_party/aom/common/video_common.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_COMMON_VIDEO_COMMON_H_ +#define AOM_COMMON_VIDEO_COMMON_H_ + +#include "common/tools_common.h" + +typedef struct { + uint32_t codec_fourcc; + int frame_width; + int frame_height; + struct AvxRational time_base; + unsigned int is_annexb; +} AvxVideoInfo; + +#endif // AOM_COMMON_VIDEO_COMMON_H_ diff --git a/third_party/aom/common/video_reader.c b/third_party/aom/common/video_reader.c new file mode 100644 index 0000000000..27f69a9672 --- /dev/null +++ b/third_party/aom/common/video_reader.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include +#include + +#include "aom_ports/mem_ops.h" +#include "common/ivfdec.h" +#include "common/obudec.h" +#include "common/tools_common.h" +#include "common/video_reader.h" +#include "common/webmdec.h" + +struct AvxVideoReaderStruct { + AvxVideoInfo info; + struct AvxInputContext input_ctx; + struct ObuDecInputContext obu_ctx; + struct WebmInputContext webm_ctx; + uint8_t *buffer; + size_t buffer_size; + size_t frame_size; + aom_codec_pts_t pts; +}; + +AvxVideoReader *aom_video_reader_open(const char *filename) { + AvxVideoReader *reader = NULL; + const bool using_file = strcmp(filename, "-") != 0; + FILE *const file = + using_file ? fopen(filename, "rb") : set_binary_mode(stdin); + if (!file) return NULL; // Can't open file + + reader = (AvxVideoReader *)calloc(1, sizeof(*reader)); + if (!reader) { + fclose(file); + return NULL; // Can't allocate AvxVideoReader + } + + reader->input_ctx.filename = filename; + reader->input_ctx.file = file; + reader->obu_ctx.avx_ctx = &reader->input_ctx; + reader->obu_ctx.is_annexb = 1; + + // TODO(https://crbug.com/aomedia/1706): webm type does not support reading + // from stdin yet, and file_is_webm is not using the detect buffer when + // determining the type. Therefore it should only be checked when using a file + // and needs to be checked prior to other types. 
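For orientation, the reader functions declared in video_reader.h further below are enough to write a minimal decode-side loop. The sketch that follows is illustrative only and is not part of the upstream patch; it assumes, as the IVF branch of aom_video_reader_read_frame() suggests, that the call returns nonzero while frames remain and 0 at end of stream or on error.

#include <stdint.h>
#include <stdio.h>

#include "common/video_reader.h"

static void dump_frame_sizes(const char *path) {
  AvxVideoReader *reader = aom_video_reader_open(path);
  if (!reader) return;  // unreadable file or unknown container
  const AvxVideoInfo *info = aom_video_reader_get_info(reader);
  printf("%dx%d, fourcc 0x%08x\n", info->frame_width, info->frame_height,
         (unsigned)info->codec_fourcc);
  while (aom_video_reader_read_frame(reader)) {
    size_t frame_size = 0;
    const uint8_t *frame = aom_video_reader_get_frame(reader, &frame_size);
    (void)frame;  // a real caller would hand this buffer to the decoder
    printf("frame: %zu bytes, pts %lld\n", frame_size,
           (long long)aom_video_reader_get_frame_pts(reader));
  }
  aom_video_reader_close(reader);
}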
+ if (false) { +#if CONFIG_WEBM_IO + } else if (using_file && + file_is_webm(&reader->webm_ctx, &reader->input_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_WEBM; + reader->info.codec_fourcc = reader->input_ctx.fourcc; + reader->info.frame_width = reader->input_ctx.width; + reader->info.frame_height = reader->input_ctx.height; +#endif + } else if (file_is_ivf(&reader->input_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_IVF; + reader->info.codec_fourcc = reader->input_ctx.fourcc; + reader->info.frame_width = reader->input_ctx.width; + reader->info.frame_height = reader->input_ctx.height; + } else if (file_is_obu(&reader->obu_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_OBU; + // assume AV1 + reader->info.codec_fourcc = AV1_FOURCC; + reader->info.is_annexb = reader->obu_ctx.is_annexb; + } else { + fclose(file); + free(reader); + return NULL; // Unknown file type + } + + return reader; +} + +void aom_video_reader_close(AvxVideoReader *reader) { + if (reader) { + fclose(reader->input_ctx.file); + if (reader->input_ctx.file_type == FILE_TYPE_OBU) { + obudec_free(&reader->obu_ctx); + } + free(reader->buffer); + free(reader); + } +} + +int aom_video_reader_read_frame(AvxVideoReader *reader) { + if (reader->input_ctx.file_type == FILE_TYPE_IVF) { + return !ivf_read_frame(&reader->input_ctx, &reader->buffer, + &reader->frame_size, &reader->buffer_size, + &reader->pts); + } else if (reader->input_ctx.file_type == FILE_TYPE_OBU) { + return !obudec_read_temporal_unit(&reader->obu_ctx, &reader->buffer, + &reader->frame_size, + &reader->buffer_size); +#if CONFIG_WEBM_IO + } else if (reader->input_ctx.file_type == FILE_TYPE_WEBM) { + return !webm_read_frame(&reader->webm_ctx, &reader->buffer, + &reader->frame_size, &reader->buffer_size); +#endif + } else { + assert(0); + return 0; + } +} + +const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, + size_t *size) { + if (size) *size = reader->frame_size; + + return reader->buffer; +} + +int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader) { + return (int64_t)reader->pts; +} + +FILE *aom_video_reader_get_file(AvxVideoReader *reader) { + return reader->input_ctx.file; +} + +const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) { + return &reader->info; +} + +void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc) { + reader->info.codec_fourcc = fourcc; +} diff --git a/third_party/aom/common/video_reader.h b/third_party/aom/common/video_reader.h new file mode 100644 index 0000000000..9ab439e8af --- /dev/null +++ b/third_party/aom/common/video_reader.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_VIDEO_READER_H_ +#define AOM_COMMON_VIDEO_READER_H_ + +#include "common/video_common.h" + +// The following code is work in progress. It is going to support transparent +// reading of input files. Right now only IVF format is supported for +// simplicity. The main goal the API is to be simple and easy to use in example +// code and in aomenc/aomdec later. 
All low-level details like memory +// buffer management are hidden from API users. +struct AvxVideoReaderStruct; +typedef struct AvxVideoReaderStruct AvxVideoReader; + +#ifdef __cplusplus +extern "C" { +#endif + +// Opens the input file for reading and inspects it to determine file type. +// Returns an opaque AvxVideoReader* upon success, or NULL upon failure. +// Right now only IVF format is supported. +AvxVideoReader *aom_video_reader_open(const char *filename); + +// Frees all resources associated with AvxVideoReader* returned from +// aom_video_reader_open() call. +void aom_video_reader_close(AvxVideoReader *reader); + +// Reads frame from the file and stores it in internal buffer. +int aom_video_reader_read_frame(AvxVideoReader *reader); + +// Returns the pointer to memory buffer with frame data read by last call to +// aom_video_reader_read_frame(). +const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, size_t *size); + +// Returns the pts of the frame. +int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader); +// Return the reader file. +FILE *aom_video_reader_get_file(AvxVideoReader *reader); + +// Fills AvxVideoInfo with information from opened video file. +const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader); + +// Set fourcc. +void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_VIDEO_READER_H_ diff --git a/third_party/aom/common/video_writer.c b/third_party/aom/common/video_writer.c new file mode 100644 index 0000000000..1d4328ae1e --- /dev/null +++ b/third_party/aom/common/video_writer.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include "common/video_writer.h" + +#include + +#include "aom/aom_encoder.h" +#include "common/ivfenc.h" + +struct AvxVideoWriterStruct { + AvxVideoInfo info; + FILE *file; + int frame_count; +}; + +static void write_header(FILE *file, const AvxVideoInfo *info, + int frame_count) { + struct aom_codec_enc_cfg cfg; + cfg.g_w = info->frame_width; + cfg.g_h = info->frame_height; + cfg.g_timebase.num = info->time_base.numerator; + cfg.g_timebase.den = info->time_base.denominator; + + ivf_write_file_header(file, &cfg, info->codec_fourcc, frame_count); +} + +AvxVideoWriter *aom_video_writer_open(const char *filename, + AvxContainer container, + const AvxVideoInfo *info) { + if (container == kContainerIVF) { + AvxVideoWriter *writer = NULL; + FILE *const file = fopen(filename, "wb"); + if (!file) return NULL; + + writer = malloc(sizeof(*writer)); + if (!writer) { + fclose(file); + return NULL; + } + writer->frame_count = 0; + writer->info = *info; + writer->file = file; + + write_header(writer->file, info, 0); + + return writer; + } + + return NULL; +} + +void aom_video_writer_close(AvxVideoWriter *writer) { + if (writer) { + // Rewriting frame header with real frame count + rewind(writer->file); + write_header(writer->file, &writer->info, writer->frame_count); + + fclose(writer->file); + free(writer); + } +} + +int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts) { + ivf_write_frame_header(writer->file, pts, size); + if (fwrite(buffer, 1, size, writer->file) != size) return 0; + + ++writer->frame_count; + + return 1; +} + +void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc) { + writer->info.codec_fourcc = fourcc; +} diff --git a/third_party/aom/common/video_writer.h b/third_party/aom/common/video_writer.h new file mode 100644 index 0000000000..8712d47a58 --- /dev/null +++ b/third_party/aom/common/video_writer.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_VIDEO_WRITER_H_ +#define AOM_COMMON_VIDEO_WRITER_H_ + +#include "common/video_common.h" + +enum { kContainerIVF } UENUM1BYTE(AvxContainer); + +struct AvxVideoWriterStruct; +typedef struct AvxVideoWriterStruct AvxVideoWriter; + +#ifdef __cplusplus +extern "C" { +#endif + +// Finds and opens writer for specified container format. +// Returns an opaque AvxVideoWriter* upon success, or NULL upon failure. +// Right now only IVF format is supported. +AvxVideoWriter *aom_video_writer_open(const char *filename, + AvxContainer container, + const AvxVideoInfo *info); + +// Frees all resources associated with AvxVideoWriter* returned from +// aom_video_writer_open() call. +void aom_video_writer_close(AvxVideoWriter *writer); + +// Writes frame bytes to the file. +int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts); +// Set fourcc. 
+void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_VIDEO_WRITER_H_ diff --git a/third_party/aom/common/warnings.c b/third_party/aom/common/warnings.c new file mode 100644 index 0000000000..a20531cb8b --- /dev/null +++ b/third_party/aom/common/warnings.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/warnings.h" + +#include +#include +#include +#include + +#include "aom/aom_encoder.h" +#include "apps/aomenc.h" +#include "common/tools_common.h" + +static const char quantizer_warning_string[] = + "Bad quantizer values. Quantizer values should not be equal, and should " + "differ by at least 8."; + +struct WarningListNode { + const char *warning_string; + struct WarningListNode *next_warning; +}; + +struct WarningList { + struct WarningListNode *warning_node; +}; + +static void add_warning(const char *warning_string, + struct WarningList *warning_list) { + struct WarningListNode **node = &warning_list->warning_node; + + struct WarningListNode *new_node = malloc(sizeof(*new_node)); + if (new_node == NULL) { + fatal("Unable to allocate warning node."); + } + + new_node->warning_string = warning_string; + new_node->next_warning = NULL; + + while (*node != NULL) node = &(*node)->next_warning; + + *node = new_node; +} + +static void free_warning_list(struct WarningList *warning_list) { + while (warning_list->warning_node != NULL) { + struct WarningListNode *const node = warning_list->warning_node; + warning_list->warning_node = node->next_warning; + free(node); + } +} + +static int continue_prompt(int num_warnings) { + int c; + fprintf(stderr, + "%d encoder configuration warning(s). Continue? (y to continue) ", + num_warnings); + c = getchar(); + return c == 'y'; +} + +static void check_quantizer(int min_q, int max_q, + struct WarningList *warning_list) { + const int lossless = min_q == 0 && max_q == 0; + if (!lossless && (min_q == max_q || abs(max_q - min_q) < 8)) + add_warning(quantizer_warning_string, warning_list); +} + +void check_encoder_config(int disable_prompt, + const struct AvxEncoderConfig *global_config, + const struct aom_codec_enc_cfg *stream_config) { + int num_warnings = 0; + struct WarningListNode *warning = NULL; + struct WarningList warning_list = { 0 }; + (void)global_config; + check_quantizer(stream_config->rc_min_quantizer, + stream_config->rc_max_quantizer, &warning_list); + /* Count and print warnings. 
*/ + for (warning = warning_list.warning_node; warning != NULL; + warning = warning->next_warning, ++num_warnings) { + aom_tools_warn("%s", warning->warning_string); + } + + free_warning_list(&warning_list); + + if (num_warnings) { + if (!disable_prompt && !continue_prompt(num_warnings)) exit(EXIT_FAILURE); + } +} diff --git a/third_party/aom/common/warnings.h b/third_party/aom/common/warnings.h new file mode 100644 index 0000000000..36f1fe0706 --- /dev/null +++ b/third_party/aom/common/warnings.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_WARNINGS_H_ +#define AOM_COMMON_WARNINGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_codec_enc_cfg; +struct AvxEncoderConfig; + +/* + * Checks config for improperly used settings. Warns user upon encountering + * settings that will lead to poor output quality. Prompts user to continue + * when warnings are issued. + */ +void check_encoder_config(int disable_prompt, + const struct AvxEncoderConfig *global_config, + const struct aom_codec_enc_cfg *stream_config); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_WARNINGS_H_ diff --git a/third_party/aom/common/webmdec.cc b/third_party/aom/common/webmdec.cc new file mode 100644 index 0000000000..33bda59021 --- /dev/null +++ b/third_party/aom/common/webmdec.cc @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "common/webmdec.h" + +#include +#include +#include + +#include "third_party/libwebm/mkvparser/mkvparser.h" +#include "third_party/libwebm/mkvparser/mkvreader.h" + +namespace { + +void reset(struct WebmInputContext *const webm_ctx) { + if (webm_ctx->reader != NULL) { + mkvparser::MkvReader *const reader = + reinterpret_cast(webm_ctx->reader); + delete reader; + } + if (webm_ctx->segment != NULL) { + mkvparser::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + delete segment; + } + if (webm_ctx->buffer != NULL) { + delete[] webm_ctx->buffer; + } + webm_ctx->reader = NULL; + webm_ctx->segment = NULL; + webm_ctx->buffer = NULL; + webm_ctx->cluster = NULL; + webm_ctx->block_entry = NULL; + webm_ctx->block = NULL; + webm_ctx->block_frame_index = 0; + webm_ctx->video_track_index = 0; + webm_ctx->timestamp_ns = 0; + webm_ctx->is_key_frame = false; +} + +void get_first_cluster(struct WebmInputContext *const webm_ctx) { + mkvparser::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + const mkvparser::Cluster *const cluster = segment->GetFirst(); + webm_ctx->cluster = cluster; +} + +void rewind_and_reset(struct WebmInputContext *const webm_ctx, + struct AvxInputContext *const aom_ctx) { + rewind(aom_ctx->file); + reset(webm_ctx); +} + +} // namespace + +int file_is_webm(struct WebmInputContext *webm_ctx, + struct AvxInputContext *aom_ctx) { + mkvparser::MkvReader *const reader = new mkvparser::MkvReader(aom_ctx->file); + webm_ctx->reader = reader; + webm_ctx->reached_eos = 0; + + mkvparser::EBMLHeader header; + long long pos = 0; + if (header.Parse(reader, pos) < 0) { + rewind_and_reset(webm_ctx, aom_ctx); + return 0; + } + + mkvparser::Segment *segment; + if (mkvparser::Segment::CreateInstance(reader, pos, segment)) { + rewind_and_reset(webm_ctx, aom_ctx); + return 0; + } + webm_ctx->segment = segment; + if (segment->Load() < 0) { + rewind_and_reset(webm_ctx, aom_ctx); + return 0; + } + + const mkvparser::Tracks *const tracks = segment->GetTracks(); + const mkvparser::VideoTrack *video_track = NULL; + for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) { + const mkvparser::Track *const track = tracks->GetTrackByIndex(i); + if (track->GetType() == mkvparser::Track::kVideo) { + video_track = static_cast(track); + webm_ctx->video_track_index = static_cast(track->GetNumber()); + break; + } + } + + if (video_track == NULL || video_track->GetCodecId() == NULL) { + rewind_and_reset(webm_ctx, aom_ctx); + return 0; + } + + if (!strncmp(video_track->GetCodecId(), "V_AV1", 5)) { + aom_ctx->fourcc = AV1_FOURCC; + } else { + rewind_and_reset(webm_ctx, aom_ctx); + return 0; + } + + aom_ctx->framerate.denominator = 0; + aom_ctx->framerate.numerator = 0; + aom_ctx->width = static_cast(video_track->GetWidth()); + aom_ctx->height = static_cast(video_track->GetHeight()); + + get_first_cluster(webm_ctx); + + return 1; +} + +int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, + size_t *bytes_read, size_t *buffer_size) { + assert(webm_ctx->buffer == *buffer); + // This check is needed for frame parallel decoding, in which case this + // function could be called even after it has reached end of input stream. 
+ if (webm_ctx->reached_eos) { + return 1; + } + mkvparser::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + const mkvparser::Cluster *cluster = + reinterpret_cast(webm_ctx->cluster); + const mkvparser::Block *block = + reinterpret_cast(webm_ctx->block); + const mkvparser::BlockEntry *block_entry = + reinterpret_cast(webm_ctx->block_entry); + bool block_entry_eos = false; + do { + long status = 0; + bool get_new_block = false; + if (block_entry == NULL && !block_entry_eos) { + status = cluster->GetFirst(block_entry); + get_new_block = true; + } else if (block_entry_eos || block_entry->EOS()) { + cluster = segment->GetNext(cluster); + if (cluster == NULL || cluster->EOS()) { + *bytes_read = 0; + webm_ctx->reached_eos = 1; + return 1; + } + status = cluster->GetFirst(block_entry); + block_entry_eos = false; + get_new_block = true; + } else if (block == NULL || + webm_ctx->block_frame_index == block->GetFrameCount() || + block->GetTrackNumber() != webm_ctx->video_track_index) { + status = cluster->GetNext(block_entry, block_entry); + if (block_entry == NULL || block_entry->EOS()) { + block_entry_eos = true; + continue; + } + get_new_block = true; + } + if (status || block_entry == NULL) { + return -1; + } + if (get_new_block) { + block = block_entry->GetBlock(); + if (block == NULL) return -1; + webm_ctx->block_frame_index = 0; + } + } while (block_entry_eos || + block->GetTrackNumber() != webm_ctx->video_track_index); + + webm_ctx->cluster = cluster; + webm_ctx->block_entry = block_entry; + webm_ctx->block = block; + + const mkvparser::Block::Frame &frame = + block->GetFrame(webm_ctx->block_frame_index); + ++webm_ctx->block_frame_index; + if (frame.len > static_cast(*buffer_size)) { + delete[] * buffer; + *buffer = new uint8_t[frame.len]; + webm_ctx->buffer = *buffer; + if (*buffer == NULL) { + return -1; + } + *buffer_size = frame.len; + } + *bytes_read = frame.len; + webm_ctx->timestamp_ns = block->GetTime(cluster); + webm_ctx->is_key_frame = block->IsKey(); + + mkvparser::MkvReader *const reader = + reinterpret_cast(webm_ctx->reader); + return frame.Read(reader, *buffer) ? -1 : 0; +} + +// Calculate the greatest common divisor between two numbers. +static int gcd(int a, int b) { + int remainder; + while (b > 0) { + remainder = a % b; + a = b; + b = remainder; + } + return a; +} + +int webm_guess_framerate(struct WebmInputContext *webm_ctx, + struct AvxInputContext *aom_ctx) { + uint32_t i = 0; + uint8_t *buffer = NULL; + size_t buffer_size = 0; + size_t bytes_read = 0; + assert(webm_ctx->buffer == NULL); + while (webm_ctx->timestamp_ns < 1000000000 && i < 50) { + if (webm_read_frame(webm_ctx, &buffer, &bytes_read, &buffer_size)) { + break; + } + ++i; + } + aom_ctx->framerate.numerator = (i - 1) * 1000000; + aom_ctx->framerate.denominator = + static_cast(webm_ctx->timestamp_ns / 1000); + // Fraction might be represented in large numbers, like 49000000/980000 + // for 50fps. Simplify as much as possible. 
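  // Worked example: gcd(49000000, 980000) == 980000, so the pair above
  // reduces to 50/1.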
+ int g = gcd(aom_ctx->framerate.numerator, aom_ctx->framerate.denominator); + if (g != 0) { + aom_ctx->framerate.numerator /= g; + aom_ctx->framerate.denominator /= g; + } + + delete[] buffer; + webm_ctx->buffer = NULL; + + get_first_cluster(webm_ctx); + webm_ctx->block = NULL; + webm_ctx->block_entry = NULL; + webm_ctx->block_frame_index = 0; + webm_ctx->timestamp_ns = 0; + webm_ctx->reached_eos = 0; + + return 0; +} + +void webm_free(struct WebmInputContext *webm_ctx) { reset(webm_ctx); } diff --git a/third_party/aom/common/webmdec.h b/third_party/aom/common/webmdec.h new file mode 100644 index 0000000000..fcbdeffe4d --- /dev/null +++ b/third_party/aom/common/webmdec.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_WEBMDEC_H_ +#define AOM_COMMON_WEBMDEC_H_ + +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AvxInputContext; + +struct WebmInputContext { + void *reader; + void *segment; + uint8_t *buffer; + const void *cluster; + const void *block_entry; + const void *block; + int block_frame_index; + int video_track_index; + int64_t timestamp_ns; + int is_key_frame; + int reached_eos; +}; + +// Checks if the input is a WebM file. If so, initializes WebMInputContext so +// that webm_read_frame can be called to retrieve a video frame. +// Returns 1 on success and 0 on failure or input is not WebM file. +// TODO(vigneshv): Refactor this function into two smaller functions specific +// to their task. +int file_is_webm(struct WebmInputContext *webm_ctx, + struct AvxInputContext *aom_ctx); + +// Reads a WebM Video Frame. Memory for the buffer is created, owned and managed +// by this function. For the first call, |buffer| should be NULL and +// |*buffer_size| should be 0. Once all the frames are read and used, +// webm_free() should be called, otherwise there will be a leak. +// Parameters: +// webm_ctx - WebmInputContext object +// buffer - pointer where the frame data will be filled. +// bytes_read - pointer to bytes read. +// buffer_size - pointer to buffer size. +// Return values: +// 0 - Success +// 1 - End of Stream +// -1 - Error +int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, + size_t *bytes_read, size_t *buffer_size); + +// Guesses the frame rate of the input file based on the container timestamps. +int webm_guess_framerate(struct WebmInputContext *webm_ctx, + struct AvxInputContext *aom_ctx); + +// Resets the WebMInputContext. +void webm_free(struct WebmInputContext *webm_ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_WEBMDEC_H_ diff --git a/third_party/aom/common/webmenc.cc b/third_party/aom/common/webmenc.cc new file mode 100644 index 0000000000..bb754e8119 --- /dev/null +++ b/third_party/aom/common/webmenc.cc @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/webmenc.h" + +#include +#include + +#include +#include +#include + +#include "common/av1_config.h" +#include "third_party/libwebm/mkvmuxer/mkvmuxer.h" +#include "third_party/libwebm/mkvmuxer/mkvmuxerutil.h" +#include "third_party/libwebm/mkvmuxer/mkvwriter.h" + +namespace { +const uint64_t kDebugTrackUid = 0xDEADBEEF; +const int kVideoTrackNumber = 1; + +// Simplistic mechanism to detect if an argv parameter refers to +// an input or output file. Returns the total number of arguments that +// should be skipped. +int skip_input_output_arg(const char *arg, const char *input_fname) { + if (strcmp(arg, input_fname) == 0) { + return 1; + } + if (strcmp(arg, "-o") == 0 || strcmp(arg, "--output") == 0) { + return 2; + } + if (strncmp(arg, "--output=", strlen("--output=")) == 0) { + return 1; + } + return 0; +} + +} // namespace + +char *extract_encoder_settings(const char *version, const char **argv, int argc, + const char *input_fname) { + // + 9 for "version:" prefix and for null terminator. + size_t total_size = strlen(version) + 9; + int i = 1; + while (i < argc) { + int num_skip = skip_input_output_arg(argv[i], input_fname); + i += num_skip; + if (num_skip == 0) { + total_size += strlen(argv[i]) + 1; // + 1 is for space separator. + ++i; + } + } + char *result = static_cast(malloc(total_size)); + if (result == nullptr) { + return nullptr; + } + char *cur = result; + cur += snprintf(cur, total_size, "version:%s", version); + i = 1; + while (i < argc) { + int num_skip = skip_input_output_arg(argv[i], input_fname); + i += num_skip; + if (num_skip == 0) { + cur += snprintf(cur, total_size, " %s", argv[i]); + ++i; + } + } + *cur = '\0'; + return result; +} + +int write_webm_file_header(struct WebmOutputContext *webm_ctx, + aom_codec_ctx_t *encoder_ctx, + const aom_codec_enc_cfg_t *cfg, + stereo_format_t stereo_fmt, unsigned int fourcc, + const struct AvxRational *par, + const char *encoder_settings) { + std::unique_ptr writer( + new (std::nothrow) mkvmuxer::MkvWriter(webm_ctx->stream)); + std::unique_ptr segment(new (std::nothrow) + mkvmuxer::Segment()); + if (writer == nullptr || segment == nullptr) { + fprintf(stderr, "webmenc> mkvmuxer objects alloc failed, out of memory?\n"); + return -1; + } + + bool ok = segment->Init(writer.get()); + if (!ok) { + fprintf(stderr, "webmenc> mkvmuxer Init failed.\n"); + return -1; + } + + segment->set_mode(mkvmuxer::Segment::kFile); + segment->OutputCues(true); + + mkvmuxer::SegmentInfo *const info = segment->GetSegmentInfo(); + if (!info) { + fprintf(stderr, "webmenc> Cannot retrieve Segment Info.\n"); + return -1; + } + + const uint64_t kTimecodeScale = 1000000; + info->set_timecode_scale(kTimecodeScale); + std::string version = "aomenc"; + if (!webm_ctx->debug) { + version.append(std::string(" ") + aom_codec_version_str()); + } + info->set_writing_app(version.c_str()); + + const uint64_t video_track_id = + segment->AddVideoTrack(static_cast(cfg->g_w), + static_cast(cfg->g_h), kVideoTrackNumber); + mkvmuxer::VideoTrack *const video_track = static_cast( + segment->GetTrackByNumber(video_track_id)); + + if (!video_track) { + fprintf(stderr, "webmenc> Video track creation failed.\n"); + return -1; + } + + ok = 
false; + aom_fixed_buf_t *obu_sequence_header = + aom_codec_get_global_headers(encoder_ctx); + if (obu_sequence_header) { + Av1Config av1_config; + if (get_av1config_from_obu( + reinterpret_cast(obu_sequence_header->buf), + obu_sequence_header->sz, false, &av1_config) == 0) { + uint8_t av1_config_buffer[4] = { 0 }; + size_t bytes_written = 0; + if (write_av1config(&av1_config, sizeof(av1_config_buffer), + &bytes_written, av1_config_buffer) == 0) { + ok = video_track->SetCodecPrivate(av1_config_buffer, + sizeof(av1_config_buffer)); + } + } + free(obu_sequence_header->buf); + free(obu_sequence_header); + } + if (!ok) { + fprintf(stderr, "webmenc> Unable to set AV1 config.\n"); + return -1; + } + + ok = video_track->SetStereoMode(stereo_fmt); + if (!ok) { + fprintf(stderr, "webmenc> Unable to set stereo mode.\n"); + return -1; + } + + if (fourcc != AV1_FOURCC) { + fprintf(stderr, "webmenc> Unsupported codec (unknown 4 CC).\n"); + return -1; + } + video_track->set_codec_id("V_AV1"); + + if (par->numerator > 1 || par->denominator > 1) { + // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type + // to WebM format. + const uint64_t display_width = static_cast( + ((cfg->g_w * par->numerator * 1.0) / par->denominator) + .5); + video_track->set_display_width(display_width); + video_track->set_display_height(cfg->g_h); + } + + if (encoder_settings != nullptr) { + mkvmuxer::Tag *tag = segment->AddTag(); + if (tag == nullptr) { + fprintf(stderr, + "webmenc> Unable to allocate memory for encoder settings tag.\n"); + return -1; + } + ok = tag->add_simple_tag("ENCODER_SETTINGS", encoder_settings); + if (!ok) { + fprintf(stderr, + "webmenc> Unable to allocate memory for encoder settings tag.\n"); + return -1; + } + } + + if (webm_ctx->debug) { + video_track->set_uid(kDebugTrackUid); + } + + webm_ctx->writer = writer.release(); + webm_ctx->segment = segment.release(); + return 0; +} + +int write_webm_block(struct WebmOutputContext *webm_ctx, + const aom_codec_enc_cfg_t *cfg, + const aom_codec_cx_pkt_t *pkt) { + if (!webm_ctx->segment) { + fprintf(stderr, "webmenc> segment is NULL.\n"); + return -1; + } + mkvmuxer::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + int64_t pts_ns = pkt->data.frame.pts * 1000000000ll * cfg->g_timebase.num / + cfg->g_timebase.den; + if (pts_ns <= webm_ctx->last_pts_ns) pts_ns = webm_ctx->last_pts_ns + 1000000; + webm_ctx->last_pts_ns = pts_ns; + + if (!segment->AddFrame(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz, kVideoTrackNumber, pts_ns, + pkt->data.frame.flags & AOM_FRAME_IS_KEY)) { + fprintf(stderr, "webmenc> AddFrame failed.\n"); + return -1; + } + return 0; +} + +int write_webm_file_footer(struct WebmOutputContext *webm_ctx) { + if (!webm_ctx->writer || !webm_ctx->segment) { + fprintf(stderr, "webmenc> segment or writer NULL.\n"); + return -1; + } + mkvmuxer::MkvWriter *const writer = + reinterpret_cast(webm_ctx->writer); + mkvmuxer::Segment *const segment = + reinterpret_cast(webm_ctx->segment); + const bool ok = segment->Finalize(); + delete segment; + delete writer; + webm_ctx->writer = NULL; + webm_ctx->segment = NULL; + + if (!ok) { + fprintf(stderr, "webmenc> Segment::Finalize failed.\n"); + return -1; + } + + return 0; +} diff --git a/third_party/aom/common/webmenc.h b/third_party/aom/common/webmenc.h new file mode 100644 index 0000000000..c912208b45 --- /dev/null +++ b/third_party/aom/common/webmenc.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_WEBMENC_H_ +#define AOM_COMMON_WEBMENC_H_ + +#include +#include + +#include "tools_common.h" +#include "aom/aom_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct WebmOutputContext { + int debug; + FILE *stream; + int64_t last_pts_ns; + void *writer; + void *segment; +}; + +/* Stereo 3D packed frame format */ +enum { + STEREO_FORMAT_MONO = 0, + STEREO_FORMAT_LEFT_RIGHT = 1, + STEREO_FORMAT_BOTTOM_TOP = 2, + STEREO_FORMAT_TOP_BOTTOM = 3, + STEREO_FORMAT_RIGHT_LEFT = 11 +} UENUM1BYTE(stereo_format_t); + +// Simplistic mechanism to extract encoder settings, without having +// to re-invoke the entire flag-parsing logic. It lists the codec version +// and then copies the arguments as-is from argv, but skips the binary name, +// any arguments that match the input filename, and the output flags "-o" +// and "--output" (and the following argument for those flags). The caller +// is responsible for free-ing the returned string. If there is insufficient +// memory, it returns nullptr. +char *extract_encoder_settings(const char *version, const char **argv, int argc, + const char *input_fname); + +// The following functions wrap libwebm's mkvmuxer. All functions return 0 upon +// success, or -1 upon failure. + +int write_webm_file_header(struct WebmOutputContext *webm_ctx, + aom_codec_ctx_t *encoder_ctx, + const aom_codec_enc_cfg_t *cfg, + stereo_format_t stereo_fmt, unsigned int fourcc, + const struct AvxRational *par, + const char *encoder_settings); + +int write_webm_block(struct WebmOutputContext *webm_ctx, + const aom_codec_enc_cfg_t *cfg, + const aom_codec_cx_pkt_t *pkt); + +int write_webm_file_footer(struct WebmOutputContext *webm_ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_WEBMENC_H_ diff --git a/third_party/aom/common/y4menc.c b/third_party/aom/common/y4menc.c new file mode 100644 index 0000000000..25086a91d0 --- /dev/null +++ b/third_party/aom/common/y4menc.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "common/rawenc.h" +#include "common/y4menc.h" + +// Returns the Y4M name associated with the monochrome colorspace. +static const char *monochrome_colorspace(unsigned int bit_depth) { + switch (bit_depth) { + case 8: return "Cmono"; + case 9: return "Cmono9"; + case 10: return "Cmono10"; + case 12: return "Cmono12"; + case 16: return "Cmono16"; + default: assert(0); return NULL; + } +} + +// Return the Y4M name of the 8-bit colorspace, given the chroma position and +// image format. 
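// For example, AOM_IMG_FMT_I444 maps to "C444" and AOM_IMG_FMT_I422 to
// "C422"; 4:2:0 input maps to "C420mpeg2 XYSCSS=420MPEG2" (vertical chroma
// siting), "C420" (colocated) or "C420jpeg" (the default).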
+static const char *colorspace8(aom_chroma_sample_position_t csp, + aom_img_fmt_t fmt) { + switch (fmt) { + case AOM_IMG_FMT_I444: return "C444"; + case AOM_IMG_FMT_I422: return "C422"; + default: + if (csp == AOM_CSP_VERTICAL) { + return "C420mpeg2 XYSCSS=420MPEG2"; + } else if (csp == AOM_CSP_COLOCATED) { + // Note that Y4M does not have a dedicated header for colocated chroma, + // and that FFMPEG interprets C420 as C420jpeg. + return "C420"; + } else { + return "C420jpeg"; + } + } +} + +// Return the Y4M name of the colorspace, given the bit depth and image format. +static const char *colorspace(unsigned int bit_depth, + aom_chroma_sample_position_t csp, + aom_img_fmt_t fmt) { + switch (bit_depth) { + case 8: return colorspace8(csp, fmt); + case 9: + return fmt == AOM_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9" + : fmt == AOM_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9" + : "C420p9 XYSCSS=420P9"; + case 10: + return fmt == AOM_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10" + : fmt == AOM_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10" + : "C420p10 XYSCSS=420P10"; + case 12: + return fmt == AOM_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12" + : fmt == AOM_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12" + : "C420p12 XYSCSS=420P12"; + case 14: + return fmt == AOM_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14" + : fmt == AOM_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14" + : "C420p14 XYSCSS=420P14"; + case 16: + return fmt == AOM_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16" + : fmt == AOM_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16" + : "C420p16 XYSCSS=420P16"; + default: assert(0); return NULL; + } +} + +int y4m_write_file_header(char *buf, size_t len, int width, int height, + const struct AvxRational *framerate, int monochrome, + aom_chroma_sample_position_t csp, aom_img_fmt_t fmt, + unsigned int bit_depth, aom_color_range_t range) { + const char *color = monochrome ? monochrome_colorspace(bit_depth) + : colorspace(bit_depth, csp, fmt); + const char *color_range = ""; // Default assumption is studio range. + if (range == AOM_CR_FULL_RANGE) { + color_range = " XCOLORRANGE=FULL"; + } + return snprintf(buf, len, "YUV4MPEG2 W%d H%d F%d:%d Ip %s%s\n", width, height, + framerate->numerator, framerate->denominator, color, + color_range); +} + +int y4m_write_frame_header(char *buf, size_t len) { + return snprintf(buf, len, "FRAME\n"); +} + +void y4m_write_image_file(const aom_image_t *img, const int *planes, + FILE *file) { + int num_planes = img->monochrome ? 1 : 3; + raw_write_image_file(img, planes, num_planes, file); +} + +void y4m_update_image_md5(const aom_image_t *img, const int *planes, + MD5Context *md5) { + int num_planes = img->monochrome ? 1 : 3; + raw_update_image_md5(img, planes, num_planes, md5); +} diff --git a/third_party/aom/common/y4menc.h b/third_party/aom/common/y4menc.h new file mode 100644 index 0000000000..6484efcc50 --- /dev/null +++ b/third_party/aom/common/y4menc.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_COMMON_Y4MENC_H_ +#define AOM_COMMON_Y4MENC_H_ + +#include "aom/aom_decoder.h" +#include "common/md5_utils.h" +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define Y4M_BUFFER_SIZE 256 + +int y4m_write_file_header(char *buf, size_t len, int width, int height, + const struct AvxRational *framerate, int monochrome, + aom_chroma_sample_position_t csp, aom_img_fmt_t fmt, + unsigned int bit_depth, aom_color_range_t range); +int y4m_write_frame_header(char *buf, size_t len); +void y4m_write_image_file(const aom_image_t *img, const int *planes, + FILE *file); +void y4m_update_image_md5(const aom_image_t *img, const int *planes, + MD5Context *md5); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_Y4MENC_H_ diff --git a/third_party/aom/common/y4minput.c b/third_party/aom/common/y4minput.c new file mode 100644 index 0000000000..1974d76f1f --- /dev/null +++ b/third_party/aom/common/y4minput.c @@ -0,0 +1,1222 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * Based on code from the OggTheora software codec source code, + * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. + */ +#include +#include +#include +#include + +#include "aom/aom_integer.h" +#include "aom_ports/msvc.h" +#include "y4minput.h" + +// Reads 'size' bytes from 'file' into 'buf' with some fault tolerance. +// Returns true on success. +static int file_read(void *buf, size_t size, FILE *file) { + const int kMaxTries = 5; + int try_count = 0; + int file_error = 0; + size_t len = 0; + while (!feof(file) && len < size && try_count < kMaxTries) { + const size_t n = fread((uint8_t *)buf + len, 1, size - len, file); + ++try_count; + len += n; + file_error = ferror(file); + if (file_error) { + if (errno == EINTR || errno == EAGAIN) { + clearerr(file); + continue; + } else { + fprintf(stderr, "Error reading file: %u of %u bytes read, %d: %s\n", + (uint32_t)len, (uint32_t)size, errno, strerror(errno)); + return 0; + } + } + } + + if (!feof(file) && len != size) { + fprintf(stderr, + "Error reading file: %u of %u bytes read," + " error: %d, tries: %d, %d: %s\n", + (uint32_t)len, (uint32_t)size, file_error, try_count, errno, + strerror(errno)); + } + return len == size; +} + +// Stores the color range in 'y4m_ctx', returning 1 if successfully parsed, +// 0 otherwise. +static int parse_color_range(y4m_input *y4m_ctx, const char *buf) { + // Note that default is studio range. + if (strcmp(buf, "LIMITED") == 0) { + return 1; + } + if (strcmp(buf, "FULL") == 0) { + y4m_ctx->color_range = AOM_CR_FULL_RANGE; + return 1; + } + fprintf(stderr, "Unknown color range value: %s\n", buf); + return 0; +} + +static int parse_metadata(y4m_input *y4m_ctx, const char *buf) { + if (strncmp(buf, "COLORRANGE=", 11) == 0) { + return parse_color_range(y4m_ctx, buf + 11); + } + return 1; // No support for other metadata, just ignore them. 
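  // For example, an "XCOLORRANGE=FULL" header tag reaches this function as
  // "COLORRANGE=FULL" (the leading 'X' is stripped by the tag parser below)
  // and switches y4m_ctx->color_range to AOM_CR_FULL_RANGE.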
+} + +static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { + char *p; + char *q; + for (p = _tags;; p = q) { + /*Skip any leading spaces.*/ + while (*p == ' ') p++; + /*If that's all we have, stop.*/ + if (p[0] == '\0') break; + /*Find the end of this tag.*/ + for (q = p + 1; *q != '\0' && *q != ' '; q++) { + } + /*Process the tag.*/ + switch (p[0]) { + case 'W': { + if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1; + } break; + case 'H': { + if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1; + } break; + case 'F': { + if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) { + return -1; + } + } break; + case 'I': { + _y4m->interlace = p[1]; + } break; + case 'A': { + if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) { + return -1; + } + } break; + case 'C': { + if (q - p > 16) return -1; + memcpy(_y4m->chroma_type, p + 1, q - p - 1); + _y4m->chroma_type[q - p - 1] = '\0'; + } break; + case 'X': { + if (!parse_metadata(_y4m, p + 1)) return -1; + } break; + default: break; /*Ignore unknown tags.*/ + } + } + return 0; +} + +// Copy a single tag into the buffer, along with a null character. +// Returns 0 if any file IO errors occur. +static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) { + size_t i; + assert(buf_len >= 1); + // Skip leading space characters. + do { + if (!file_read(buf, 1, file)) { + return 0; + } + } while (buf[0] == ' '); + + // If we hit the newline, treat this as the "empty" tag. + if (buf[0] == '\n') { + buf[0] = '\0'; + *end_tag = '\n'; + return 1; + } + + // Copy over characters until a space is hit, or the buffer is exhausted. + for (i = 1; i < buf_len; ++i) { + if (!file_read(buf + i, 1, file)) { + return 0; + } + if (buf[i] == ' ' || buf[i] == '\n') { + break; + } + } + if (i == buf_len) { + fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n", + (unsigned long)i); + return 0; + } + *end_tag = buf[i]; + buf[i] = '\0'; + return 1; +} + +// Returns 1 if tags were parsed successfully, 0 otherwise. +static int parse_tags(y4m_input *y4m_ctx, FILE *file) { + char tag[256]; + char end; // Character denoting the end of the tag, ' ' or '\n'. + // Set Y4M tags to defaults, updating them as processing occurs. Mandatory + // fields are marked with -1 and will be checked after the tags are parsed. + y4m_ctx->pic_w = -1; + y4m_ctx->pic_h = -1; + y4m_ctx->fps_n = -1; // Also serves as marker for fps_d + y4m_ctx->par_n = 0; + y4m_ctx->par_d = 0; + y4m_ctx->interlace = '?'; + y4m_ctx->color_range = AOM_CR_STUDIO_RANGE; + snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420"); + + // Find one tag at a time. + do { + if (!copy_tag(tag, sizeof(tag), &end, file)) { + return 0; + } + // y4m_parse_tags returns 0 on success. + if (y4m_parse_tags(y4m_ctx, tag)) { + return 0; + } + } while (end != '\n'); + + // Check the mandatory fields. 
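  // As a concrete example, the tags following the YUV4MPEG2 magic in a header
  // such as "W352 H288 F30000:1001 Ip A1:1 C420jpeg" leave the loop above with
  // pic_w = 352, pic_h = 288, fps 30000/1001, par 1:1, interlace 'p' and
  // chroma_type "420jpeg", so none of the checks below fire.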
+ if (y4m_ctx->pic_w == -1) { + fprintf(stderr, "Width field missing\n"); + return 0; + } + if (y4m_ctx->pic_h == -1) { + fprintf(stderr, "Height field missing\n"); + return 0; + } + if (y4m_ctx->fps_n == -1) { + fprintf(stderr, "FPS field missing\n"); + return 0; + } + return 1; +} + +/*All anti-aliasing filters in the following conversion functions are based on + one of two window functions: + The 6-tap Lanczos window (for down-sampling and shifts): + sinc(\pi*t)*sinc(\pi*t/3), |t|<3 (sinc(t)==sin(t)/t) + 0, |t|>=3 + The 4-tap Mitchell window (for up-sampling): + 7|t|^3-12|t|^2+16/3, |t|<1 + -(7/3)|x|^3+12|x|^2-20|x|+32/3, |t|<2 + 0, |t|>=2 + The number of taps is intentionally kept small to reduce computational + overhead and limit ringing. + + The taps from these filters are scaled so that their sum is 1, and the + result is scaled by 128 and rounded to integers to create a filter whose + intermediate values fit inside 16 bits. + Coefficients are rounded in such a way as to ensure their sum is still 128, + which is usually equivalent to normal rounding. + + Conversions which require both horizontal and vertical filtering could + have these steps pipelined, for less memory consumption and better cache + performance, but we do them separately for simplicity.*/ +#define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) +#define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) +#define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c))) + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 420mpeg2 chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + BR | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + BR | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the site locations one quarter pixel (at + the chroma plane's resolution) to the right. + The 4:2:2 modes look exactly the same, except there are twice as many chroma + lines, and they are vertically co-sited with the luma samples in both the + mpeg2 and jpeg cases (thus requiring no vertical resampling).*/ +static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, + const unsigned char *_src, int _c_w, + int _c_h) { + int y; + int x; + for (y = 0; y < _c_h; y++) { + /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos + window.*/ + for (x = 0; x < OC_MINI(_c_w, 2); x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] + + 64) >> + 7, + 255); + } + for (; x < _c_w - 3; x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> + 7, + 255); + } + for (; x < _c_w; x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> + 7, + 255); + } + _dst += _c_w; + _src += _c_w; + } +} + +/*This format is only used for interlaced content, but is included for + completeness. 
+ + 420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 420paldv chroma samples are sited like: + YR------Y-------YR------Y------- + | | | | + | | | | + | | | | + YB------Y-------YB------Y------- + | | | | + | | | | + | | | | + YR------Y-------YR------Y------- + | | | | + | | | | + | | | | + YB------Y-------YB------Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the site locations one quarter pixel (at + the chroma plane's resolution) to the right. + Then we use another filter to move the C_r location down one quarter pixel, + and the C_b location up one quarter pixel.*/ +static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + 1) / 2; + c_h = (_y4m->pic_h + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + c_sz = c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*First do the horizontal re-sampling. + This is the same as the mpeg2 case, except that after the horizontal + case, we need to apply a second vertical filter.*/ + y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); + _aux += c_sz; + switch (pli) { + case 1: { + /*Slide C_b up a quarter-pel. + This is the same filter used above, but in the other order.*/ + for (x = 0; x < c_w; x++) { + for (y = 0; y < OC_MINI(c_h, 3); y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h - 2; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[(c_h - 1) * c_w] + 64) >> + 7, + 255); + } + _dst++; + tmp++; + } + _dst += c_sz - c_w; + tmp -= c_w; + } break; + case 2: { + /*Slide C_r down a quarter-pel. 
+ This is the same as the horizontal filter.*/ + for (x = 0; x < c_w; x++) { + for (y = 0; y < OC_MINI(c_h, 2); y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[0] - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h - 3; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - + 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + + 64) >> + 7, + 255); + } + _dst++; + tmp++; + } + } break; + } + /*For actual interlaced material, this would have to be done separately on + each field, and the shift amounts would be different. + C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8, + C_b up 1/8 in the bottom field. + The corresponding filters would be: + Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128 + Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/ + } +} + +/*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0. + This is used as a helper by several conversion routines.*/ +static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, + const unsigned char *_src, int _c_w, + int _c_h) { + int y; + int x; + /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ + for (x = 0; x < _c_w; x++) { + for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, + (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - + 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> + 7, + 255); + } + for (; y < _c_h - 3; y += 2) { + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, + (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> + 7, + 255); + } + for (; y < _c_h; y += 2) { + _dst[(y >> 1) * _c_w] = OC_CLAMPI( + 0, + (3 * (_src[(y - 2) * _c_w] + _src[(_c_h - 1) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) + + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + + 64) >> + 7, + 255); + } + _src++; + _dst++; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 422jpeg chroma samples are sited like: + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to decimate the chroma planes by two in the + vertical direction.*/ +static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * 
_y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + for (pli = 1; pli < 3; pli++) { + y4m_422jpeg_420jpeg_helper(_dst, _aux, c_w, c_h); + _aux += c_sz; + _dst += dst_c_sz; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 422 chroma samples are sited like: + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the original site locations one quarter + pixel (at the original chroma resolution) to the right. + Then we use a second resampling filter to decimate the chroma planes by two + in the vertical direction.*/ +static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_h; + int dst_c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = c_w * dst_c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*In reality, the horizontal and vertical steps could be pipelined, for + less memory consumption and better cache performance, but we do them + separately for simplicity.*/ + /*First do horizontal filtering (convert to 422jpeg)*/ + y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, c_w, c_h); + _aux += c_sz; + _dst += dst_c_sz; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 411 chroma samples are sited like: + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + + We use a filter to resample at site locations one eighth pixel (at the source + chroma plane's horizontal resolution) and five eighths of a pixel to the + right. 
+ Then we use another filter to decimate the planes by 2 in the vertical + direction.*/ +static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + tmp_sz = dst_c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*In reality, the horizontal and vertical steps could be pipelined, for + less memory consumption and better cache performance, but we do them + separately for simplicity.*/ + /*First do horizontal filtering (convert to 422jpeg)*/ + for (y = 0; y < c_h; y++) { + /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a + 4-tap Mitchell window.*/ + for (x = 0; x < OC_MINI(c_w, 1); x++) { + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, + (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - + _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - + 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + } + for (; x < c_w - 2; x++) { + tmp[x << 1] = + (unsigned char)OC_CLAMPI(0, + (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - + 5 * _aux[x + 2] + 64) >> + 7, + 255); + } + for (; x < c_w; x++) { + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, + (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - + _aux[c_w - 1] + 64) >> + 7, + 255); + if ((x << 1 | 1) < dst_c_w) { + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> + 7, + 255); + } + } + tmp += dst_c_w; + _aux += c_w; + } + tmp -= tmp_sz; + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); + _dst += dst_c_sz; + } +} + +/*Convert 444 to 420jpeg.*/ +static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + tmp_sz = dst_c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ + for (y = 0; y < c_h; y++) { + for (x = 0; x < OC_MINI(c_w, 2); x += 2) { + tmp[x >> 1] = OC_CLAMPI(0, + (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - + 17 * _aux[OC_MINI(2, c_w - 1)] + + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> + 7, + 255); + } + for (; x < c_w - 3; x += 2) { + tmp[x >> 1] = OC_CLAMPI(0, + (3 * 
(_aux[x - 2] + _aux[x + 3]) - + 17 * (_aux[x - 1] + _aux[x + 2]) + + 78 * (_aux[x] + _aux[x + 1]) + 64) >> + 7, + 255); + } + for (; x < c_w; x += 2) { + tmp[x >> 1] = + OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[c_w - 1]) - + 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> + 7, + 255); + } + tmp += dst_c_w; + _aux += c_w; + } + tmp -= tmp_sz; + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); + _dst += dst_c_sz; + } +} + +/*The image is padded with empty chroma components at 4:2:0.*/ +static void y4m_convert_mono_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_sz; + (void)_aux; + _dst += _y4m->pic_w * _y4m->pic_h; + c_sz = ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * + ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); + memset(_dst, 128, c_sz * 2); +} + +/*No conversion function needed.*/ +static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + (void)_y4m; + (void)_dst; + (void)_aux; +} + +static const char TAG[] = "YUV4MPEG2"; + +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, aom_chroma_sample_position_t csp, + int only_420) { + // File must start with |TAG|. + char tag_buffer[9]; // 9 == strlen(TAG) + // Read as much as possible from |skip_buffer|, which were characters + // that were previously read from the file to do input-type detection. + assert(num_skip >= 0 && num_skip <= 8); + if (num_skip > 0) { + memcpy(tag_buffer, skip_buffer, num_skip); + } + // Start reading from the file now that the |skip_buffer| is depleted. + if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) { + return -1; + } + if (memcmp(TAG, tag_buffer, 9) != 0) { + fprintf(stderr, "Error parsing header: must start with %s\n", TAG); + return -1; + } + // Next character must be a space. + if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') { + fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); + return -1; + } + if (!parse_tags(y4m_ctx, file)) { + fprintf(stderr, "Error parsing %s header.\n", TAG); + return -1; + } + if (y4m_ctx->interlace == '?') { + fprintf(stderr, + "Warning: Input video interlacing format unknown; " + "assuming progressive scan.\n"); + } else if (y4m_ctx->interlace != 'p') { + fprintf(stderr, + "Input video is interlaced; " + "Only progressive scan handled.\n"); + return -1; + } + /* Only support vertical chroma sample position if the input format is + * already 420mpeg2. Colocated is not supported in Y4M. 
+ */ + if (csp == AOM_CSP_VERTICAL && + strcmp(y4m_ctx->chroma_type, "420mpeg2") != 0) { + fprintf(stderr, + "Vertical chroma sample position only supported " + "for 420mpeg2 input\n"); + return -1; + } + if (csp == AOM_CSP_COLOCATED) { + // TODO(any): check the right way to handle this in y4m + fprintf(stderr, + "Ignoring colocated chroma sample position for reading in Y4M\n"); + } + y4m_ctx->aom_fmt = AOM_IMG_FMT_I420; + y4m_ctx->bps = 12; + y4m_ctx->bit_depth = 8; + y4m_ctx->aux_buf = NULL; + y4m_ctx->dst_buf = NULL; + if (strcmp(y4m_ctx->chroma_type, "420") == 0 || + strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 || + strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + /* Natively supported: no conversion required. */ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); + /* Natively supported: no conversion required. */ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 10; + y4m_ctx->bps = 15; + y4m_ctx->aom_fmt = AOM_IMG_FMT_I42016; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); + /* Natively supported: no conversion required. */ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 12; + y4m_ctx->bps = 18; + y4m_ctx->aom_fmt = AOM_IMG_FMT_I42016; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first. 
+ We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + y4m_ctx->aux_buf_sz = + 3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first.*/ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422jpeg_420jpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + if (only_420) { + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422_420jpeg; + } else { + y4m_ctx->aom_fmt = AOM_IMG_FMT_I422; + y4m_ctx->bps = 16; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + /*Natively supported: no conversion required.*/ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + } + } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->aom_fmt = AOM_IMG_FMT_I42216; + y4m_ctx->bps = 20; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->aom_fmt = AOM_IMG_FMT_I42216; + y4m_ctx->bps = 24; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) { + y4m_ctx->src_c_dec_h = 4; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first. 
+ We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = + y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_411_420jpeg; + } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + if (only_420) { + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_444_420jpeg; + } else { + y4m_ctx->aom_fmt = AOM_IMG_FMT_I444; + y4m_ctx->bps = 24; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Natively supported: no conversion required.*/ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + } + } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->aom_fmt = AOM_IMG_FMT_I44416; + y4m_ctx->bps = 30; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->aom_fmt = AOM_IMG_FMT_I44416; + y4m_ctx->bps = 36; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "444alpha") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + if (only_420) { + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer. + The extra plane also gets read into the aux buf. 
+ It will be discarded.*/ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = + 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_444_420jpeg; + } else { + fprintf(stderr, "Unsupported format: 444A\n"); + return -1; + } + } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0; + y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; + /*No extra space required, but we need to clear the chroma planes.*/ + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_mono_420jpeg; + } else { + fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type); + return -1; + } + /*The size of the final frame buffers is always computed from the + destination chroma decimation type.*/ + y4m_ctx->dst_buf_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) * + ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v); + if (y4m_ctx->bit_depth == 8) + y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz); + else + y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz); + if (!y4m_ctx->dst_buf) return -1; + + if (y4m_ctx->aux_buf_sz > 0) { + y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz); + if (!y4m_ctx->aux_buf) { + free(y4m_ctx->dst_buf); + return -1; + } + } + return 0; +} + +void y4m_input_close(y4m_input *_y4m) { + free(_y4m->dst_buf); + free(_y4m->aux_buf); +} + +int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *_img) { + char frame[6]; + int pic_sz; + int c_w; + int c_h; + int c_sz; + int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1; + /*Read and skip the frame header.*/ + if (!file_read(frame, 6, _fin)) return 0; + if (memcmp(frame, "FRAME", 5)) { + fprintf(stderr, "Loss of framing in Y4M input data\n"); + return -1; + } + if (frame[5] != '\n') { + char c; + int j; + for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) { + } + if (j == 79) { + fprintf(stderr, "Error parsing Y4M frame header\n"); + return -1; + } + } + /*Read the frame data that needs no conversion.*/ + if (!file_read(_y4m->dst_buf, _y4m->dst_buf_read_sz, _fin)) { + fprintf(stderr, "Error reading Y4M frame data.\n"); + return -1; + } + /*Read the frame data that does need conversion.*/ + if (!file_read(_y4m->aux_buf, _y4m->aux_buf_read_sz, _fin)) { + fprintf(stderr, "Error reading Y4M frame data.\n"); + return -1; + } + /*Now convert the just read frame.*/ + (*_y4m->convert)(_y4m, _y4m->dst_buf, _y4m->aux_buf); + /*Fill in the frame buffer pointers. 
+    We don't use aom_img_wrap() because it forces padding for odd picture
+    sizes, which would require a separate fread call for every row.*/
+  memset(_img, 0, sizeof(*_img));
+  /*Y4M has the planes in Y'CbCr order, which libaom calls Y, U, and V.*/
+  _img->fmt = _y4m->aom_fmt;
+  _img->w = _img->d_w = _y4m->pic_w;
+  _img->h = _img->d_h = _y4m->pic_h;
+  _img->bit_depth = _y4m->bit_depth;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  _img->bps = _y4m->bps;
+
+  /*Set up the buffer pointers.*/
+  pic_sz = _y4m->pic_w * _y4m->pic_h * bytes_per_sample;
+  c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  c_w *= bytes_per_sample;
+  c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  c_sz = c_w * c_h;
+  _img->stride[AOM_PLANE_Y] = _y4m->pic_w * bytes_per_sample;
+  _img->stride[AOM_PLANE_U] = _img->stride[AOM_PLANE_V] = c_w;
+  _img->planes[AOM_PLANE_Y] = _y4m->dst_buf;
+  _img->planes[AOM_PLANE_U] = _y4m->dst_buf + pic_sz;
+  _img->planes[AOM_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  return 1;
+}
diff --git a/third_party/aom/common/y4minput.h b/third_party/aom/common/y4minput.h
new file mode 100644
index 0000000000..2472007b67
--- /dev/null
+++ b/third_party/aom/common/y4minput.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * Based on code from the OggTheora software codec source code,
+ * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
+ */
+
+#ifndef AOM_COMMON_Y4MINPUT_H_
+#define AOM_COMMON_Y4MINPUT_H_
+
+#include <stdio.h>
+#include "aom/aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct y4m_input y4m_input;
+
+/*The function used to perform chroma conversion.*/
+typedef void (*y4m_convert_func)(y4m_input *_y4m, unsigned char *_dst,
+                                 unsigned char *_src);
+
+struct y4m_input {
+  int pic_w;
+  int pic_h;
+  int fps_n;
+  int fps_d;
+  int par_n;
+  int par_d;
+  char interlace;
+  int src_c_dec_h;
+  int src_c_dec_v;
+  int dst_c_dec_h;
+  int dst_c_dec_v;
+  char chroma_type[16];
+  /*The size of each converted frame buffer.*/
+  size_t dst_buf_sz;
+  /*The amount to read directly into the converted frame buffer.*/
+  size_t dst_buf_read_sz;
+  /*The size of the auxiliary buffer.*/
+  size_t aux_buf_sz;
+  /*The amount to read into the auxiliary buffer.*/
+  size_t aux_buf_read_sz;
+  y4m_convert_func convert;
+  unsigned char *dst_buf;
+  unsigned char *aux_buf;
+  enum aom_img_fmt aom_fmt;
+  int bps;
+  unsigned int bit_depth;
+  aom_color_range_t color_range;
+};
+
+/**
+ * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after
+ * reading it. Note that |csp| should only be set for 420 input, and the input
+ * chroma is shifted if necessary. The code does not support the conversion
+ * from co-located to vertical. The |skip_buffer| indicates bytes that were
+ * previously read from |file|, to do input-type detection; this buffer will
+ * be read before the |file| is read. It is of size |num_skip|, which *must*
+ * be 8 or less.
+ *
+ * Returns 0 on success, -1 on failure.
+ */ +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, aom_chroma_sample_position_t csp, + int only_420); +void y4m_input_close(y4m_input *_y4m); +int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *img); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_Y4MINPUT_H_ diff --git a/third_party/aom/doc/AlgorithmDescription.md b/third_party/aom/doc/AlgorithmDescription.md new file mode 100644 index 0000000000..bfd64dad67 --- /dev/null +++ b/third_party/aom/doc/AlgorithmDescription.md @@ -0,0 +1,799 @@ +
Algorithm Description
+ +# Abstract +This document describes technical aspects of coding tools included in +the associated codec. This document is not a specification of the associated +codec. Instead, it summarizes the highlighted features of coding tools for new +developers. This document should be updated when significant new normative +changes have been integrated into the associated codec. + +# Table of Contents + +[Abbreviations](#Abbreviations) + +[Algorithm description](#Algorithm-Description) + +- [Block Partitioning](#Block-Partitioning) + - [Coding block partition](#Coding-block-partition) + - [Transform block partition](#Transform-block-partition) +- [Intra Prediction](#Intra-Prediction) + - [Directional intra prediction modes](#Directional-intra-prediction-modes) + - [Non-directional intra prediction modes](#Non-directional-intra-prediction-modes) + - [Recursive filtering modes](#Recursive-filtering-modes) + - [Chroma from Luma mode](#Chroma-from-Luma-mode) +- [Inter Prediction](#Inter-Prediction) + - [Motion vector prediction](#Motion-vector-prediction) + - [Motion vector coding](#Motion-vector-coding) + - [Interpolation filter for motion compensation](#Interpolation-filter-for-motion-compensation) + - [Warped motion compensation](#Warped-motion-compensation) + - [Overlapped block motion compensation](#Overlapped-block-motion-compensation) + - [Reference frames](#Reference-frames) + - [Compound Prediction](#Compound-Prediction) +- [Transform](#Transform) +- [Quantization](#Quantization) +- [Entropy Coding](#Entropy-Coding) +- [Loop filtering and post-processing](#Loop-filtering-and-post-processing) + - [Deblocking](#Deblocking) + - [Constrained directional enhancement](#Constrained-directional-enhancement) + - [Loop Restoration filter](#Loop-Restoration-filter) + - [Frame super-resolution](#Frame-super-resolution) + - [Film grain synthesis](#Film-grain-synthesis) +- [Screen content coding](#Screen-content-coding) + - [Intra block copy](#Intra-block-copy) + - [Palette mode](#Palette-mode) + +[References](#References) + +# Abbreviations + +CfL: Chroma from Luma\ +IntraBC: Intra block copy\ +LCU: Largest coding unit\ +OBMC: Overlapped Block Motion Compensation\ +CDEF: Constrained Directional Enhancement Filter + +# Algorithm Description + +## Block Partitioning + +### Coding block partition + +The largest coding block unit (LCU) applied in this codec is 128×128. In +addition to no split mode `PARTITION_NONE`, the partition tree supports 9 +different partitioning patterns, as shown in below figure. + +
Figure 1: Supported coding block +partitions
+
+According to the number of sub-partitions, the 9 partition modes are summarized
+as follows: 1. Four partitions: `PARTITION_SPLIT`, `PARTITION_VERT_4`,
+`PARTITION_HORZ_4` 2. Three partitions (T-Shape): `PARTITION_HORZ_A`,
+`PARTITION_HORZ_B`, `PARTITION_VERT_A`, `PARTITION_VERT_B` 3. Two partitions:
+`PARTITION_HORZ`, `PARTITION_VERT`
+
+Among all 9 partitioning patterns, only `PARTITION_SPLIT` supports recursive
+partitioning, i.e., its sub-partitions can be further split; the other
+partitioning modes cannot be split further. In particular, for 8x8 and 128x128
+blocks, `PARTITION_VERT_4` and `PARTITION_HORZ_4` are not used, and for 8x8
+blocks the T-Shape partitions are not used either.
+
+### Transform block partition
+
+For both intra and inter coded blocks, the coding block can be further
+partitioned into multiple transform units with a partitioning depth of up to 2
+levels. The mapping from the transform size of the current depth to the
+transform size of the next depth is shown in the following Table 1.
+
+Table 1: Transform partition size setting
+ +Furthermore, for intra coded blocks, the transform partition is done in a way +that all the transform blocks have the same size, and the transform blocks are +coded in a raster scan order. An example of the transform block partitioning for +intra coded block is shown in the Figure 2. + +
Figure 2: Example of transform +partitioning for intra coded block
+ +For inter coded blocks, the transform unit partitioning can be done in a +recursive manner with the partitioning depth up to 2 levels. The transform +partitioning supports 1:1 (square), 1:2/2:1, and 1:4/4:1 transform unit sizes +ranging from 4×4 to 64×64. If the coding block is smaller than or equal to +64x64, the transform block partitioning can only apply to luma component, for +chroma blocks, the transform block size is identical to the coding block size. +Otherwise, if the coding block width or height is greater than 64, then both the +luma and chroma coding blocks will implicitly split into multiples of min(W, +64)x min(H, 64) and min(W, 32)x min(H, 32) transform blocks, respectively. + +
Figure 3: Example of transform +partitioning for inter coded block
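+
+As a rough illustration of the implicit split described above, the following
+sketch computes the transform block size for a plane of a coding block whose
+width or height exceeds 64. The function and its names are hypothetical and
+are not part of libaom; they simply restate the min(W, 64) / min(W, 32) rule.
+
+```c
+/* Hypothetical sketch: implicit transform block size for an oversized coding
+ * block, per the rule above (64 caps the luma plane, 32 the chroma planes).
+ * block_w/block_h are the dimensions of the plane's coding block. */
+static void implicit_tx_size(int block_w, int block_h, int is_chroma,
+                             int *tx_w, int *tx_h) {
+  const int max_tx = is_chroma ? 32 : 64;
+  *tx_w = block_w < max_tx ? block_w : max_tx;
+  *tx_h = block_h < max_tx ? block_h : max_tx;
+}
+```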
+
+## Intra Prediction
+
+### Directional intra prediction modes
+
+Directional intra prediction modes are applied in intra prediction to model
+local textures that follow a given direction pattern. A directional intra
+prediction mode is represented by a nominal mode and an angle delta. The
+nominal modes are a set of intra prediction angles similar to the 8 angles
+used in VP9. The index value of the angle delta ranges from -3 to +3, and a
+zero angle delta indicates a nominal mode. The prediction angle is a nominal
+intra angle plus an angle delta. In total, there are 56 directional intra
+prediction modes, as shown in the following figure, where solid arrows
+indicate directional intra prediction modes and dotted arrows represent
+non-zero angle deltas.
+
Figure 4: Directional intra +prediction modes
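+
+As an illustration of the nominal-angle-plus-delta construction, the sketch
+below derives a prediction angle in degrees. The per-unit step of 3 degrees
+and the listed nominal angles are assumptions of this sketch rather than
+values stated above, and the names are not libaom identifiers.
+
+```c
+/* Hypothetical sketch: prediction angle = nominal angle + angle delta * step,
+ * with angle_delta restricted to [-3, +3] as described above. */
+static const int kNominalAngleDeg[8] = { 45, 67, 90, 113, 135, 157, 180, 203 };
+enum { kAngleDeltaStepDeg = 3 }; /* assumed step size per delta unit */
+
+static int prediction_angle_deg(int nominal_mode, int angle_delta) {
+  return kNominalAngleDeg[nominal_mode] + angle_delta * kAngleDeltaStepDeg;
+}
+```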
+
+The nominal mode index and the angle delta index are signalled separately,
+with the nominal mode index signalled before the associated angle delta index.
+Note that for small block sizes, where the coding gain from extending intra
+prediction angles may saturate, only the nominal modes are used and the angle
+delta index is not signalled.
+
+### Non-directional intra prediction modes
+
+In addition to directional intra prediction modes, four non-directional intra
+modes which simulate smooth textures are also included: `SMOOTH_V`,
+`SMOOTH_H`, `SMOOTH` and the `PAETH predictor`.
+
+In the `SMOOTH_V`, `SMOOTH_H` and `SMOOTH` modes, the prediction values are
+generated using quadratic interpolation along the vertical direction, the
+horizontal direction, or the average of the two. The samples used in the
+quadratic interpolation include reconstructed samples from the top and left
+neighboring blocks, as well as samples on the right and bottom boundaries,
+which are approximated by the top and left reconstructed samples.
+
+In `PAETH predictor` mode, the prediction for each sample is assigned
+whichever of the top (T), left (L) and top-left (TL) reference samples has the
+value closest to the Paeth predictor value, i.e., T + L - TL. The samples used
+in the `PAETH predictor` are illustrated in the figure below.
+
+Figure 5: Paeth predictor
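+
+A minimal sketch of the selection rule described above (not the libaom
+implementation): the predictor returns whichever of T, L and TL is closest to
+the value T + L - TL.
+
+```c
+#include <stdlib.h>
+
+/* Paeth predictor sketch: pick the reference sample nearest to
+ * top + left - topleft, preferring left, then top, then top-left on ties. */
+static unsigned char paeth_predict(unsigned char top, unsigned char left,
+                                   unsigned char topleft) {
+  const int base = (int)top + (int)left - (int)topleft;
+  const int d_top = abs(base - (int)top);
+  const int d_left = abs(base - (int)left);
+  const int d_topleft = abs(base - (int)topleft);
+  if (d_left <= d_top && d_left <= d_topleft) return left;
+  if (d_top <= d_topleft) return top;
+  return topleft;
+}
+```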
+
+### Recursive filtering modes
+
+Five filtering intra modes are defined, and each mode specifies a set of eight
+7-tap filters. Given the selected filtering mode index (0~4), the current block
+is divided into 4x2 sub-blocks. For one 4x2 sub-block, each sample is predicted
+by 7-tap interpolation using the 7 top and left neighboring samples as inputs.
+Different filters are applied to samples located at different coordinates
+within a 4x2 sub-block. The prediction process can be performed recursively in
+units of 4x2 sub-blocks, which means that prediction samples generated for one
+4x2 sub-block can be used to predict another 4x2 sub-block.
+
Figure 6: Recursive filtering +modes
+
+### Chroma from Luma mode
+
+Chroma from Luma (CfL) is a chroma intra prediction mode which models chroma
+samples as a linear function of co-located reconstructed luma samples. To
+align the resolution between luma and chroma samples for different chroma
+sampling formats, e.g., 4:2:0 and 4:2:2, the reconstructed luma pixels may
+need to be sub-sampled before being used in CfL mode. In addition, the DC
+component is removed to form the AC contribution. In CfL mode, the model
+parameters which specify the linear function between the two color components
+are optimized by the encoder and signalled in the bitstream.
+
+Figure 7: CfL prediction
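+
+The linear model above can be sketched as follows. Here `alpha` stands for the
+signalled model parameter, `luma_ac` for the sub-sampled luma samples with
+their average (DC) removed, and `dc_pred` for the conventional DC prediction
+of the chroma block; the function is illustrative, not the libaom API.
+
+```c
+/* Hypothetical CfL sketch: chroma prediction = DC prediction +
+ * alpha * (zero-mean, sub-sampled luma). */
+static void cfl_predict(const int *luma_ac, int n, int dc_pred, double alpha,
+                        int *chroma_pred) {
+  for (int i = 0; i < n; i++) {
+    chroma_pred[i] = dc_pred + (int)(alpha * luma_ac[i]);
+  }
+}
+```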
+ +## Inter Prediction + +### Motion vector prediction + +Motion vectors are predicted by neighboring blocks which can be either spatial +neighboring blocks, or temporal neighboring blocks located in a reference frame. +A set of MV predictors will be identified by checking all these blocks and +utilized to encode the motion vector information. + +**Spatial motion vector prediction** + +There are two sets of spatial neighboring blocks that can be utilized for +finding spatial MV predictors, including the adjacent spatial neighbors which +are direct top and left neighbors of the current block, and second outer spatial +neighbors which are close but not directly adjacent to the current block. The +two sets of spatial neighboring blocks are illustrated in an example shown in +Figure 8. + +
Figure 8: Motion field +estimation by linear projection
+
+For each set of spatial neighbors, the top row is checked from left to right
+and then the left column is checked from top to bottom. For the adjacent
+spatial neighbors, an additional top-right block is also checked after the
+left column neighboring blocks. For the non-adjacent spatial neighbors, the
+top-left block located at position (-1, -1) is checked first, then the top row
+and left column in a similar manner as the adjacent neighbors. The adjacent
+neighbors are checked first, then the temporal MV predictor described in the
+next subsection is checked second, and after that the non-adjacent spatial
+neighboring blocks are checked.
+
+For compound prediction, which utilizes a pair of reference frames, the
+non-adjacent spatial neighbors are not used for deriving the MV predictor.
+
+**Temporal motion vector prediction**
+
+In addition to spatial neighboring blocks, the MV predictor can also be
+derived using co-located blocks of reference pictures, namely the temporal MV
+predictor. To generate the temporal MV predictor, the MVs of reference frames
+are first stored together with the reference indices associated with each
+reference frame. Then, for each 8x8 block of the current frame, the MVs of a
+reference frame that pass through the 8x8 block are identified and stored
+together with the reference frame index in a temporal MV buffer. In the
+example shown in Figure 9, the MV of reference frame 1 (R1) pointing from R1
+to a reference frame of R1 is identified, i.e., MVref, which passes through an
+8x8 block (shaded in blue dots) of the current frame. This MVref is stored in
+the temporal MV buffer associated with that 8x8 block.
+
+Figure 9: Motion field estimation by linear projection
+
+Finally, given a couple of pre-defined block coordinates, the associated MVs
+stored in the temporal MV buffer are identified and projected accordingly to
+derive a temporal MV predictor which points from the current block to its
+reference frame, e.g., MV0 in Figure 9. In Figure 10, the pre-defined block
+positions for deriving temporal MV predictors of a 16x16 block are shown; up
+to 7 blocks are checked to find valid temporal MV predictors.
+
+Figure 10: Block positions for deriving temporal MV predictors
+
+The temporal MV predictors are checked after the nearest spatial MV predictors
+but before the non-adjacent spatial MV predictors.
+
+All the spatial and temporal MV candidates are put together in a pool, with
+each predictor associated with a weighting determined during the scanning of
+the spatial and temporal neighboring blocks. Based on the associated
+weightings, the candidates are sorted and ranked, and up to four candidates
+are used to form the MV predictor list.
+
+### Motion vector coding
+
+### Interpolation filter for motion compensation
+
+[Ed.: to be added]
+
+### Warped motion compensation
+
+**Global warped motion**
+
+The global motion information, comprising the global motion type and motion
+parameters, is signalled for each inter frame. The global motion types and the
+number of associated parameters are listed in the following table.
+
+| Global motion type     | Number of parameters |
+|:----------------------:|:--------------------:|
+| Identity (zero motion) | 0                    |
+| Translation            | 2                    |
+| Rotzoom                | 4                    |
+| General affine         | 6                    |
+
+For an inter coded block, after the reference frame index is transmitted, if
+the motion of the current block is indicated as global motion, the global
+motion type and the associated parameters of the given reference are used for
+the current block.
+
+**Local warped motion**
+
+For an inter coded block, local warped motion is allowed when the following
+conditions are all satisfied:
+
+* The current block uses single prediction
+* The width or height is greater than or equal to 8 samples
+* At least one of the immediate neighbors uses the same reference frame as the
+  current block
+
+If local warped motion is used for the current block, the affine parameters
+are not signalled; instead, they are estimated by mean square minimization of
+the distance between the reference projection and the modeled projection,
+based on the motion vectors of the current block and its immediate neighbors.
+To estimate the parameters of local warped motion, the projection sample pair
+of the center pixel in a neighboring block and its corresponding pixel in the
+reference frame is collected if the neighboring block uses the same reference
+frame as the current block. After that, 3 extra samples are created by
+shifting the center position by a quarter sample in one or two dimensions, and
+these samples are also considered as projection sample pairs to ensure the
+stability of the model parameter estimation process.
+
+### Overlapped block motion compensation
+
+For an inter-coded block, overlapped block motion compensation (OBMC) is
+allowed when the following conditions are all satisfied:
+
+* The current block uses single prediction
+* The width or height is greater than or equal to 8 samples
+* At least one of the neighboring blocks is an inter-coded block
+
+When OBMC is applied to the current block, the initial inter prediction
+samples are first generated using the assigned motion vector of the current
+block; then the inter predicted samples for the current block and the inter
+predicted samples based on motion vectors from the above and left blocks are
+blended to generate the final prediction samples. The maximum number of
+neighboring motion vectors is limited based on the size of the current block,
+and up to 4 motion vectors from each of the upper and left blocks can be
+involved in the OBMC process of the current block.
+
+One example of the processing order of neighboring blocks is shown in the
+following picture, where the values marked in each block indicate the
+processing order of the motion vectors of the current block and its
+neighboring blocks. Specifically, the motion vector of the current block is
+first applied to generate the inter prediction samples p0(x,y). Then the
+motion vector of block 1 is applied to generate the prediction samples
+p1(x,y). After that, the prediction samples in the overlapping area between
+block 0 and block 1 are a weighted average of p0(x,y) and p1(x,y). The
+overlapping area of block 1 and block 0 is marked in grey in the following
+picture. The motion vectors of blocks 2, 3 and 4 are further applied and
+blended in the same way.
+
+Figure 11: Neighboring blocks for the OBMC process
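+
+The blending step can be sketched as below. The per-row weights are purely
+illustrative (the codec derives its own smoothing masks), and the function is
+not the libaom API; it only shows how a current prediction p0 is combined with
+a neighbor-based prediction p1 in the overlapping area.
+
+```c
+/* Hypothetical OBMC-style blend with a prediction obtained from an above
+ * neighbor's motion vector.  Rows nearer the shared boundary take more weight
+ * from the neighbor prediction p1; weights are in 1/64 units. */
+static void obmc_blend_from_above(unsigned char *p0, const unsigned char *p1,
+                                  int width, int overlap_rows, int stride,
+                                  const unsigned char *w1 /* weight of p1 */) {
+  for (int r = 0; r < overlap_rows; r++) {
+    for (int c = 0; c < width; c++) {
+      const int a = w1[r];
+      const int b = 64 - a;
+      p0[r * stride + c] = (unsigned char)(
+          (a * p1[r * stride + c] + b * p0[r * stride + c] + 32) >> 6);
+    }
+  }
+}
+```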
+ +### Reference frames + +[Ed.: to be added] + +### Compound Prediction + +[Ed.: to be added] + +**Compound wedge prediction** + +[Ed.: to be added] + +**Difference-modulated masked prediction** + +[Ed.: to be added] + +**Frame distance-based compound prediction** + +[Ed.: to be added] + +**Compound inter-intra prediction** + +[Ed.: to be added] + +## Transform + +The separable 2D transform process is applied on prediction residuals. For the +forward transform, a 1-D vertical transform is performed first on each column of +the input residual block, then a horizontal transform is performed on each row +of the vertical transform output. For the backward transform, a 1-D horizontal +transform is performed first on each row of the input de-quantized coefficient +block, then a vertical transform is performed on each column of the horizontal +transform output. The primary 1-D transforms include four different types of +transform: a) 4-point, 8-point, 16-point, 32-point, 64-point DCT-2; b) 4-point, +8-point, 16-point asymmetric DST’s (DST-4, DST-7) and c) their flipped +versions; d) 4-point, 8-point, 16-point, 32-point identity transforms. When +transform size is 4-point, ADST refers to DST-7, otherwise, when transform size +is greater than 4-point, ADST refers to DST-4. + +
+Table 2: Transform basis functions (DCT-2, DST-4 and DST-7) for N-point input.
+
+For the luma component, each transform block can select one pair of horizontal
+and vertical transforms from a pre-defined set of transform type candidates,
+and the selection is explicitly signalled in the bitstream. However, the
+selection is not signalled when max(width, height) is 64. When the maximum of
+the transform block width and height is greater than or equal to 32, the set
+of transform type candidates depends on the prediction mode, as described in
+Table 3. Otherwise, when the maximum of the transform block width and height
+is smaller than 32, the set of transform type candidates depends on the
+prediction mode, as described in Table 4.
+
+Table 3: Transform type candidates for luma component when max(width, height) is greater than or equal to 32.
+ +
+Table 4: Transform type candidates for luma component when max(width, height) is smaller than 32.
+ +The set of transform type candidates (namely transform set) is defined in Table +5. + +
+Table 5: Definition of transform set.
+
+For the chroma component, the transform type selection is done implicitly. For
+intra prediction residuals, the transform type is selected according to the
+intra prediction mode, as specified in Table 6. For inter prediction
+residuals, the transform type is selected according to the transform type
+selection of the co-located luma block. Therefore, for the chroma component,
+there is no transform type signalling in the bitstream.
+
+Table 6: Transform type selection for chroma component intra prediction residuals.
+
+The computational cost of large (e.g., 64-point) transforms is further reduced
+by zeroing out all the coefficients except in the following two cases:
+
+1. The top-left 32×32 quadrant for 64×64/64×32/32×64 DCT_DCT hybrid transforms
+2. The left 32×16 area for 64×16 and the top 16×32 area for 16×64 DCT_DCT
+   hybrid transforms.
+
+Both the DCT-2 and the ADST (DST-4, DST-7) are implemented using a butterfly
+structure [1], which includes multiple stages of butterfly operations. The
+butterfly operations within each stage can be calculated in parallel, and the
+different stages are cascaded in sequential order.
+
+## Quantization
+
+Quantization of transform coefficients may apply different quantization step
+sizes for DC and AC transform coefficients, and different quantization step
+sizes for luma and chroma transform coefficients. To specify the quantization
+step size, a _**base_q_idx**_ syntax element is first signalled in the frame
+header; it is an 8-bit fixed length code specifying the quantization step size
+for luma AC coefficients. The valid range of _**base_q_idx**_ is [0, 255].
+
+After that, the delta value relative to base_q_idx for luma DC coefficients,
+indicated as DeltaQYDc, is further signalled. Furthermore, if there is more
+than one color plane, a flag _**diff_uv_delta**_ is signalled to indicate
+whether the Cb and Cr color components apply different quantization index
+values. If _**diff_uv_delta**_ is signalled as 0, then only the delta values
+relative to base_q_idx for the chroma DC coefficients (indicated as DeltaQUDc)
+and AC coefficients (indicated as DeltaQUAc) are signalled. Otherwise, the
+delta values relative to base_q_idx for both the Cb and Cr DC coefficients
+(indicated as DeltaQUDc and DeltaQVDc) and AC coefficients (indicated as
+DeltaQUAc and DeltaQVAc) are signalled.
+
+The decoded DeltaQYDc, DeltaQUAc, DeltaQUDc, DeltaQVAc and DeltaQVDc values
+are added to _base_q_idx_ to derive the quantization indices. These
+quantization indices are then mapped to quantization step sizes according to
+two tables. For DC coefficients, the mapping from quantization index to
+quantization step size for 8-bit, 10-bit and 12-bit internal bit depth is
+specified by the lookup table Dc_Qlookup[3][256], and for AC coefficients the
+mapping for 8-bit, 10-bit and 12-bit is specified by the lookup table
+Ac_Qlookup[3][256].
+
+Figure 11: Quantization step size of DC coefficients for different internal bit-depth
+ +
+Figure 12: Quantization step size of AC coefficients for different internal bit-depth
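+
+To make the index derivation described above concrete, the sketch below
+combines base_q_idx with the signalled deltas. Clamping the result to [0, 255]
+is an assumption of this sketch, and the function is not part of the bitstream
+syntax or of libaom.
+
+```c
+static int clamp_q_idx(int q) { return q < 0 ? 0 : (q > 255 ? 255 : q); }
+
+/* Hypothetical sketch: per-plane quantization indices derived from
+ * base_q_idx plus the DC/AC deltas described above. */
+static void derive_q_indices(int base_q_idx, int delta_y_dc, int delta_u_dc,
+                             int delta_u_ac, int delta_v_dc, int delta_v_ac,
+                             int q_idx_dc[3], int q_idx_ac[3]) {
+  q_idx_dc[0] = clamp_q_idx(base_q_idx + delta_y_dc); /* luma DC */
+  q_idx_ac[0] = clamp_q_idx(base_q_idx);              /* luma AC */
+  q_idx_dc[1] = clamp_q_idx(base_q_idx + delta_u_dc); /* Cb DC   */
+  q_idx_ac[1] = clamp_q_idx(base_q_idx + delta_u_ac); /* Cb AC   */
+  q_idx_dc[2] = clamp_q_idx(base_q_idx + delta_v_dc); /* Cr DC   */
+  q_idx_ac[2] = clamp_q_idx(base_q_idx + delta_v_ac); /* Cr AC   */
+}
+```
+
+The resulting indices are then mapped to step sizes through the
+Dc_Qlookup / Ac_Qlookup tables mentioned above.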
+
+Given the quantization step size, indicated as _Qstep_, the input quantized
+coefficients are further de-quantized using the following formula:
+
+_F_ = sign * ( (_f_ * _Qstep_) % 0xFFFFFF ) / _deNorm_
+
+where _f_ is the input quantized coefficient, _F_ is the output dequantized
+coefficient, and _deNorm_ is a constant value derived from the transform block
+area size, as indicated by the following table:
+
+| _deNorm_ | Tx block area size        |
+|----------|:--------------------------|
+| 1        | Less than 512 samples     |
+| 2        | 512 or 1024 samples       |
+| 4        | Greater than 1024 samples |
+
+When the quantization index is 0, the quantization is performed with a
+quantization step size equal to 1, which is the lossless coding mode.
+
+## Entropy Coding
+
+**Entropy coding engine**
+
+[Ed.: to be added]
+
+**Coefficient coding**
+
+For each transform unit, the coefficient coding starts with coding a skip
+sign, which is followed by the signaling of the primary transform kernel type
+and the end-of-block (EOB) position in case the transform coding is not
+skipped. After that, the coefficient values are coded in a multiple level map
+manner plus sign values. The level maps are coded as three level planes,
+namely the lower-level, middle-level and higher-level planes, and the sign is
+coded as another separate plane. The lower-level, middle-level and
+higher-level planes correspond to different ranges of coefficient magnitudes:
+the lower-level plane corresponds to the range 0–2, the middle-level plane
+covers the range 3–14, and the higher-level plane covers the range of 15 and
+above.
+
+The three level planes are coded as follows. After the EOB position is coded,
+the lower-level and middle-level planes are coded together in backward scan
+order, where the scan order refers to a zig-zag scan applied on the entire
+transform unit. Then the sign plane and the higher-level plane are coded
+together in forward scan order. After that, the remainder (coefficient level
+minus 14) is entropy coded using an Exp-Golomb code.
+
+The context model applied to the lower-level plane depends on the primary
+transform directions (bi-directional, horizontal, and vertical) as well as the
+transform size, and up to five neighboring coefficients (in the frequency
+domain) are used to derive the context. The middle-level plane uses a similar
+context model, but the number of context neighbor coefficients is reduced from
+5 to 2. The higher-level plane is coded by an Exp-Golomb code without using a
+context model. For the sign plane, except for the DC sign, which is coded
+using the DC signs from its neighboring transform units, the sign values of
+other coefficients are coded directly without using a context model.
+
+## Loop filtering and post-processing
+
+### Deblocking
+
+There are four methods for picking the deblocking filter level, which are
+listed below:
+
+* LPF_PICK_FROM_FULL_IMAGE: search the full image with different values
+* LPF_PICK_FROM_Q: estimate the filter level based on quantizer and frame type
+* LPF_PICK_FROM_SUBIMAGE: estimate the level from a portion of the image
+* LPF_PICK_MINIMAL_LPF: set the filter level to 0 and disable the deblocking
+
+When estimating the filter level from the full image or a sub-image, the
+search starts from the previous frame's filter level and ends when the filter
+step is less than or equal to zero. In addition to the filter level, there are
+some other parameters which control the deblocking filter, such as the
+sharpness level, mode deltas, and reference deltas.
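+
+The search described above can be sketched as a simple coarse-to-fine loop.
+Everything here is illustrative rather than the libaom algorithm: the initial
+step size and the cost callback eval_cost are assumptions of the sketch.
+
+```c
+/* Hypothetical sketch of a filter-level search that starts from the previous
+ * frame's level and stops once the step reaches zero.  eval_cost() is an
+ * assumed callback returning the distortion for a candidate level. */
+static int pick_filter_level(int prev_level, int max_level,
+                             long long (*eval_cost)(int level)) {
+  int best = prev_level;
+  long long best_cost = eval_cost(best);
+  int step = best > 8 ? best / 4 : 2; /* initial step: an assumption */
+  while (step > 0) {
+    const int candidates[2] = { best - step, best + step };
+    for (int i = 0; i < 2; i++) {
+      const int level = candidates[i];
+      if (level < 0 || level > max_level) continue;
+      const long long cost = eval_cost(level);
+      if (cost < best_cost) {
+        best_cost = cost;
+        best = level;
+      }
+    }
+    step >>= 1;
+  }
+  return best;
+}
+```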
+
+Deblocking is performed at the 128x128 super block level, and the vertical and
+horizontal edges are filtered separately. For a 128x128 super block, the
+vertical/horizontal edges aligned with each 8x8 block are filtered first. If
+the 4x4 transform is used, the internal edges aligned with the 4x4 blocks are
+further filtered. The filter length is switchable among 4-tap, 6-tap, 8-tap,
+14-tap, and 0-tap (no filtering). The locations of the filter taps are
+identified based on the number of filter taps in order to compute the filter
+mask. When finally performing the filtering, outer taps are added if there is
+high edge variance.
+
+### Constrained directional enhancement filter
+
+**Edge Direction Estimation**\
+In CDEF, the edge direction search is performed at the 8x8 block level. There
+are eight edge directions in total, as illustrated in Figure 13.
Figure 13: Line number +k for pixels following direction d=0:7 in an 8x8 block.
+ +The optimal edge direction d_opt is found by maximizing the following +term [3]: + +
+
+E_d = sum_k ( sum_{p in P_{d,k}} x_p )^2 / N_{d,k},  with  d_opt = argmax_d E_d
+
+where x_p is the value of pixel p, P_{d,k} is the set of pixels in line k
+following direction d, and N_{d,k} is the cardinality of P_{d,k}.
+
+**Directional filter**\
+CDEF consists of two filter taps: the primary tap and the secondary tap. The
+primary tap works along the edge direction (as shown in Figure 14), while the
+secondary tap is oriented 45 degrees off the edge direction (as shown in
+Figure 15).
+
Figure 14: Primary filter +taps following edge direction. For even strengths a = 2 and b = 4, for +odd strengths a = 3 and b = 3. The filtered pixel is shown in the +highlighted center.
+ +
Figure 15: Secondary +filter taps. The filtered pixel is shown in the highlighted center. +
+ +CDEF can be described by the following equation: + +
+
+y(i,j) = x(i,j) + round( sum_{m,n} w^p_{m,n} f( x(m,n) - x(i,j), S^p, D )
+                       + sum_{m,n} w^s_{m,n} f( x(m,n) - x(i,j), S^s, D ) )
+
+where x(i,j) and y(i,j) are the input and output reconstructed values of CDEF,
+p denotes the primary tap, s denotes the secondary tap, and w is the weight
+between the primary and secondary taps. f(d,S,D) is a non-linear filtering
+function, S denotes the filter strength, and D is a damping parameter. For
+8-bit content, S^p ranges from 0 to 15, and S^s can be 0, 1, 2, or 4. D ranges
+from 3 to 6 for luma, and from 2 to 4 for chroma.
+
+**Non-linear filter**\
+CDEF uses a non-linear filtering function to prevent excessive blurring when
+applied across an edge. This is achieved by ignoring pixels that are too
+different from the current pixel to be filtered. When the difference between
+the current pixel and its neighboring pixel, d, is within a threshold,
+f(d,S,D) = d; otherwise f(d,S,D) = 0. Specifically, the strength S determines
+the maximum difference allowed and the damping D determines the point at which
+the filter tap is ignored.
+
+### Loop Restoration filter
+
+**Separable symmetric Wiener filter**
+
+Let F be the w x w 2D filter around the pixel to be filtered, denoted as a
+w^2 x 1 column vector. Compared with a traditional Wiener filter, the
+separable symmetric Wiener filter has the following three constraints in order
+to save signaling bits and reduce complexity [4]:
+
+1) The w x w filter window is separated into horizontal and vertical w-tap
+convolutions.
+
+2) The horizontal and vertical filters are constrained to be symmetric.
+
+3) It is assumed that the summation of the horizontal/vertical filter
+coefficients is 1.
+
+As a result, F can be written as F = column_vectorize[ab^T], subject to
+a(i) = a(w - 1 - i), b(i) = b(w - 1 - i), for i = [0, r - 1], and
+sum(a(i)) = sum(b(i)) = 1, where a is the vertical filter and b is the
+horizontal filter. The derivation of the filters a and b starts from an
+initial guess of the horizontal and vertical filters, optimizing one of the
+two while holding the other fixed. In the implementation w = 7, thus 3 taps
+need to be sent for each of the filters a and b. When signaling the filter
+coefficients, 4, 5 and 6 bits are used for the first three filter taps, and
+the remaining ones are obtained from the normalization and symmetry
+constraints. In total, 30 bits are transmitted for both the vertical and
+horizontal filters.
+
+**Dual self-guided filter**
+
+The dual self-guided filter is designed to first obtain two coarse
+restorations X1 and X2 of the degraded frame X; the final restoration Xr is
+then obtained as a combination of the degraded samples and the differences
+between the degraded samples and the coarse restorations [4]:
+
+
+Xr = X + alpha * (X1 - X) + beta * (X2 - X)
+
+At the encoder side, alpha and beta are computed using:
+
+
+(alpha, beta)^T = (A^T A)^(-1) A^T b
+
+where A = {X1 - X, X2 - X}, b = Y - X, and Y is the original source.
+
+X1 and X2 are obtained using guided filtering, and the filtering is controlled
+by a radius r and a noise parameter e, where a higher r implies a higher
+spatial variance and a higher e implies a higher range variance [4]. X1 and X2
+can be described by {r1, e1} and {r2, e2}, respectively.
+
+The encoder sends a 6-tuple {r1, e1, r2, e2, alpha, beta} to the decoder. In
+the implementation, {r1, e1, r2, e2} uses a 3-bit codebook, and alpha and beta
+use 7 bits each due to their much higher precision, resulting in a total of
+17 bits. r is always less than or equal to 3 [4].
+
+Guided filtering can be described by a local linear model:
+
+
+y = F x + G
+
+where x and y are the input and output samples, and F and G are determined by
+the statistics in the neighborhood of the pixel to be filtered. It is called
+self-guided filtering when the guidance image is the same as the degraded
+image [4].
+
+The following are the three steps for deriving F and G of the self-guided
+filtering:
+
+1) Compute the mean u and variance d of the pixels in a (2r + 1) x (2r + 1)
+window around the pixel to be filtered.
+
+2) For each pixel, compute f = d / (d + e); g = (1 - f)u.
+
+3) Compute F and G for each pixel as the averages of the f and g values in a
+3 x 3 window around the pixel.
+
+### Frame super-resolution
+
+In order to improve the perceptual quality of decoded pictures, a
+super-resolution process is applied at low bit-rates [5]. First, at the
+encoder side, the source video is downscaled as a non-normative procedure.
+Second, the downscaled video is encoded, followed by the deblocking and CDEF
+processes. Third, a linear upscaling process is applied as a normative
+procedure to bring the encoded video back to its original spatial resolution.
+Lastly, loop restoration is applied to recover part of the lost high
+frequencies. The last two steps together are called the super-resolving
+process [5]. Similarly, the decoding, deblocking and CDEF processes are
+applied at the lower spatial resolution at the decoder side, and the frames
+then go through the super-resolving process. In order to reduce the
+line-buffer overhead in hardware implementations, the upscaling and
+downscaling processes are applied in the horizontal dimension only.
+
+### Film grain synthesis
+
+At the encoder side, film grain is removed from the input video as a denoising
+process. Then, the structure and intensity of the input video are analyzed by
+a Canny edge detector, and smooth areas are used to estimate the strength of
+the film grain. Once the strength is estimated, the denoised video and the
+film grain parameters are sent to the decoder side. Those parameters are used
+to synthesize the grain and add it back to the decoded video, producing the
+final output video.
+
+In order to reconstruct the film grain, the following parameters are sent to
+the decoder side: the lag value, the autoregressive coefficients, values for
+the precomputed look-up table index of the chroma components, and a set of
+points for a piece-wise linear scaling function [6]. Those parameters are
+signaled as quantized integers, including 64 bytes for the scaling function
+and 74 bytes for the autoregressive coefficients. Once the parameters are
+received, an autoregressive process is applied in raster scan order to
+generate one 64x64 luma and two 32x32 chroma film grain templates [6]. Those
+templates are used to generate the grain for the remaining part of a picture.
+
+## Screen content coding
+
+To improve the coding performance of screen content coding, the associated
+video codec incorporates several coding tools; for example, intra block copy
+(IntraBC) is employed to handle the repeated patterns in a screen picture, and
+palette mode is used to handle screen blocks with a limited number of
+different colors.
+
+### Intra block copy
+
+Intra Block Copy (IntraBC) [2] is a coding tool similar to inter-picture
+prediction. The main difference is that in IntraBC, a predictor block is
+formed from the reconstructed samples (before application of in-loop
+filtering) of the current picture. Therefore, IntraBC can be considered as
+"motion compensation" within the current picture.
+
+A block vector (BV) is coded to specify the location of the predictor block.
+The BV has integer precision. The BV is signalled in the bitstream since the
+decoder needs it to locate the predictor. For the current block, a flag
+(use IntraBC) indicating whether the current block uses IntraBC mode is first
+transmitted in the bitstream. Then, if the current block is in IntraBC mode,
+the BV difference diff is obtained by subtracting the reference BV from the
+current BV, and diff is classified into four types according to the values of
+its horizontal and vertical components. The type information is transmitted in
+the bitstream; after that, the diff values of the two components may be
+signalled based on the type information.
+
+IntraBC is very effective for screen content coding, but it also complicates
+hardware design. To facilitate hardware design, the following modifications
+are adopted.
+
+1) When IntraBC is allowed, the loop filters are disabled, namely the
+de-blocking filter, the CDEF (Constrained Directional Enhancement Filter), and
+loop restoration. By doing this, the picture buffer of reconstructed samples
+can be shared between IntraBC and inter prediction.
+
+2) To facilitate parallel decoding, the prediction cannot exceed the
+restricted areas. For one super block, if the coordinate of its top-left
+position is (x0, y0), the prediction at position (x, y) can be accessed by
+IntraBC if y < y0 and x < x0 + 2 * (y0 - y).
+
+3) To allow for hardware write-back delay, immediately reconstructed areas
+cannot be accessed by IntraBC prediction. The restricted immediately
+reconstructed area can be 1 ∼ n super blocks. So, on top of modification 2,
+if the coordinate of one super block's top-left position is (x0, y0), the
+prediction at position (x, y) can be accessed by IntraBC if y < y0 and
+x < x0 + 2 * (y0 - y) - D, where D denotes the restricted immediately
+reconstructed area. When D is one super block, the prediction area is shown in
+the figure below.
+
+Figure 13: the prediction area for IntraBC mode in one super block prediction
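+
+The combined constraint from modifications 2) and 3) can be summarised in a
+small helper. The following C sketch is purely illustrative (the function
+name, and the expression of D in whole super blocks, are assumptions made for
+the example and do not correspond to a specific libaom function):
+
+```c
+#include <stdbool.h>
+
+/* Illustrative only: returns true if the sample at (x, y) may be referenced
+ * by an IntraBC block whose super block has its top-left corner at (x0, y0).
+ * sb_size is the super block size in pixels and delay_sb is the restricted
+ * immediately reconstructed area D expressed in super blocks. */
+static bool intrabc_sample_allowed(int x, int y, int x0, int y0,
+                                   int sb_size, int delay_sb) {
+  const int d = delay_sb * sb_size; /* D in pixels */
+  /* Wavefront constraint plus write-back delay, as described above. */
+  return y < y0 && x < x0 + 2 * (y0 - y) - d;
+}
+```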
+ +### Palette mode + +# References + +[1] J. Han, Y. Xu and D. Mukherjee, "A butterfly structured design of the hybrid +transform coding scheme," 2013 Picture Coding Symposium (PCS), San Jose, CA, +2013, pp. 17-20.\ +[2] J. Li, H. Su, A. Converse, B. Li, R. Zhou, B. Lin, J. Xu, Y. Lu, and R. +Xiong, "Intra Block Copy for Screen Content in the Emerging AV1 Video Codec," +2018 Data Compression Conference, Snowbird, Utah, USA.\ +[3] S. Midtskogen and J.M. Valin. "The AV1 constrained directional enhancement + filter (CDEF)." In 2018 IEEE International Conference on Acoustics, Speech + and Signal Processing (ICASSP), pp. 1193-1197. IEEE, 2018.\ +[4] D. Mukherjee, S. Li, Y. Chen, A. Anis, S. Parker, and +J. Bankoski. "A switchable loop-restoration with side-information framework +for the emerging AV1 video codec." In 2017 IEEE International Conference on +Image Processing (ICIP), pp. 265-269. IEEE, 2017.\ +[5] Y. Chen, D. Murherjee, J. Han, A. Grange, Y. Xu, Z. Liu,... & C.H.Chiang, +(2018, June). "An overview of core coding tools in the AV1 video codec."" +In 2018 Picture Coding Symposium (PCS) (pp. 41-45). IEEE.\ +[6] A. Norkin, & N. Birkbeck, (2018, March). "Film grain synthesis for AV1 +video codec." In 2018 Data Compression Conference (pp. 3-12). IEEE. diff --git a/third_party/aom/doc/dev_guide/av1_decoder.dox b/third_party/aom/doc/dev_guide/av1_decoder.dox new file mode 100644 index 0000000000..f65ddb51ca --- /dev/null +++ b/third_party/aom/doc/dev_guide/av1_decoder.dox @@ -0,0 +1,11 @@ +/*!\page decoder_guide AV1 DECODER GUIDE + + Describe AV1 decoding techniques here. + + \cond + \if av1_md_support + [AV1 Algorithm Description](\ref LALGORITHMDESCRIPTION) + \endif + \endcond + +*/ diff --git a/third_party/aom/doc/dev_guide/av1_encoder.dox b/third_party/aom/doc/dev_guide/av1_encoder.dox new file mode 100644 index 0000000000..0f7e8f87e2 --- /dev/null +++ b/third_party/aom/doc/dev_guide/av1_encoder.dox @@ -0,0 +1,1617 @@ +/*!\page encoder_guide AV1 ENCODER GUIDE + +\tableofcontents + +\section architecture_introduction Introduction + +This document provides an architectural overview of the libaom AV1 encoder. + +It is intended as a high level starting point for anyone wishing to contribute +to the project, that will help them to more quickly understand the structure +of the encoder and find their way around the codebase. + +It stands above and will where necessary link to more detailed function +level documents. + +\subsection architecture_gencodecs Generic Block Transform Based Codecs + +Most modern video encoders including VP8, H.264, VP9, HEVC and AV1 +(in increasing order of complexity) share a common basic paradigm. This +comprises separating a stream of raw video frames into a series of discrete +blocks (of one or more sizes), then computing a prediction signal and a +quantized, transform coded, residual error signal. The prediction and residual +error signal, along with any side information needed by the decoder, are then +entropy coded and packed to form the encoded bitstream. See Figure 1: below, +where the blue blocks are, to all intents and purposes, the lossless parts of +the encoder and the red block is the lossy part. + +This is of course a gross oversimplification, even in regard to the simplest +of the above codecs. For example, all of them allow for block based +prediction at multiple different scales (i.e. different block sizes) and may +use previously coded pixels in the current frame for prediction or pixels from +one or more previously encoded frames. 
Further, they may support multiple +different transforms and transform sizes and quality optimization tools like +loop filtering. + +\image html genericcodecflow.png "" width=70% + +\subsection architecture_av1_structure AV1 Structure and Complexity + +As previously stated, AV1 adopts the same underlying paradigm as other block +transform based codecs. However, it is much more complicated than previous +generation codecs and supports many more block partitioning, prediction and +transform options. + +AV1 supports block partitions of various sizes from 128x128 pixels down to 4x4 +pixels using a multi-layer recursive tree structure as illustrated in figure 2 +below. + +\image html av1partitions.png "" width=70% + +AV1 also provides 71 basic intra prediction modes, 56 single frame inter prediction +modes (7 reference frames x 4 modes x 2 for OBMC (overlapped block motion +compensation)), 12768 compound inter prediction modes (that combine inter +predictors from two reference frames) and 36708 compound inter / intra +prediction modes. Furthermore, in addition to simple inter motion estimation, +AV1 also supports warped motion prediction using affine transforms. + +In terms of transform coding, it has 16 separable 2-D transform kernels +\f$(DCT, ADST, fADST, IDTX)^2\f$ that can be applied at up to 19 different +scales from 64x64 down to 4x4 pixels. + +When combined together, this means that for any one 8x8 pixel block in a +source frame, there are approximately 45,000,000 different ways that it can +be encoded. + +Consequently, AV1 requires complex control processes. While not necessarily +a normative part of the bitstream, these are the algorithms that turn a set +of compression tools and a bitstream format specification, into a coherent +and useful codec implementation. These may include but are not limited to +things like :- + +- Rate distortion optimization (The process of trying to choose the most + efficient combination of block size, prediction mode, transform type + etc.) +- Rate control (regulation of the output bitrate) +- Encoder speed vs quality trade offs. +- Features such as two pass encoding or optimization for low delay + encoding. + +For a more detailed overview of AV1's encoding tools and a discussion of some +of the design considerations and hardware constraints that had to be +accommodated, please refer to +A Technical Overview of AV1. + +Figure 3 provides a slightly expanded but still simplistic view of the +AV1 encoder architecture with blocks that relate to some of the subsequent +sections of this document. In this diagram, the raw uncompressed frame buffers +are shown in dark green and the reconstructed frame buffers used for +prediction in light green. Red indicates those parts of the codec that are +(or may be) lossy, where fidelity can be traded off against compression +efficiency, whilst light blue shows algorithms or coding tools that are +lossless. The yellow blocks represent non-bitstream normative configuration +and control algorithms. + +\image html av1encoderflow.png "" width=70% + +\section architecture_command_line The Libaom Command Line Interface + + Add details or links here: TODO ? 
elliotk@ + +\section architecture_enc_data_structures Main Encoder Data Structures + +The following are the main high level data structures used by the libaom AV1 +encoder and referenced elsewhere in this overview document: + +- \ref AV1_PRIMARY + - \ref AV1_PRIMARY.gf_group (\ref GF_GROUP) + - \ref AV1_PRIMARY.lap_enabled + - \ref AV1_PRIMARY.twopass (\ref TWO_PASS) + - \ref AV1_PRIMARY.p_rc (\ref PRIMARY_RATE_CONTROL) + - \ref AV1_PRIMARY.tf_info (\ref TEMPORAL_FILTER_INFO) + +- \ref AV1_COMP + - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig) + - \ref AV1_COMP.rc (\ref RATE_CONTROL) + - \ref AV1_COMP.speed + - \ref AV1_COMP.sf (\ref SPEED_FEATURES) + +- \ref AV1EncoderConfig (Encoder configuration parameters) + - \ref AV1EncoderConfig.pass + - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg) + - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg) + - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg) + +- \ref AlgoCfg (Algorithm related configuration parameters) + - \ref AlgoCfg.arnr_max_frames + - \ref AlgoCfg.arnr_strength + +- \ref KeyFrameCfg (Keyframe coding configuration parameters) + - \ref KeyFrameCfg.enable_keyframe_filtering + +- \ref RateControlCfg (Rate control configuration) + - \ref RateControlCfg.mode + - \ref RateControlCfg.target_bandwidth + - \ref RateControlCfg.best_allowed_q + - \ref RateControlCfg.worst_allowed_q + - \ref RateControlCfg.cq_level + - \ref RateControlCfg.under_shoot_pct + - \ref RateControlCfg.over_shoot_pct + - \ref RateControlCfg.maximum_buffer_size_ms + - \ref RateControlCfg.starting_buffer_level_ms + - \ref RateControlCfg.optimal_buffer_level_ms + - \ref RateControlCfg.vbrbias + - \ref RateControlCfg.vbrmin_section + - \ref RateControlCfg.vbrmax_section + +- \ref PRIMARY_RATE_CONTROL (Primary Rate control status) + - \ref PRIMARY_RATE_CONTROL.gf_intervals[] + - \ref PRIMARY_RATE_CONTROL.cur_gf_index + +- \ref RATE_CONTROL (Rate control status) + - \ref RATE_CONTROL.intervals_till_gf_calculate_due + - \ref RATE_CONTROL.frames_till_gf_update_due + - \ref RATE_CONTROL.frames_to_key + +- \ref TWO_PASS (Two pass status and control data) + +- \ref GF_GROUP (Data related to the current GF/ARF group) + +- \ref FIRSTPASS_STATS (Defines entries in the first pass stats buffer) + - \ref FIRSTPASS_STATS.coded_error + +- \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters) + - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES) + +- \ref HIGH_LEVEL_SPEED_FEATURES + - \ref HIGH_LEVEL_SPEED_FEATURES.recode_loop + - \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance + +- \ref TplParams + +\section architecture_enc_use_cases Encoder Use Cases + +The libaom AV1 encoder is configurable to support a number of different use +cases and rate control strategies. + +The principle use cases for which it is optimised are as follows: + + - Video on Demand / Streaming + - Low Delay or Live Streaming + - Video Conferencing / Real Time Coding (RTC) + - Fixed Quality / Testing + +Other examples of use cases for which the encoder could be configured but for +which there is less by way of specific optimizations include: + + - Download and Play + - Disk Playback> + - Storage + - Editing + - Broadcast video + +Specific use cases may have particular requirements or constraints. For +example: + +Video Conferencing: In a video conference we need to encode the video +in real time and to avoid any coding tools that could increase latency, such +as frame look ahead. 
+ +Live Streams: In cases such as live streaming of games or events, it +may be possible to allow some limited buffering of the video and use of +lookahead coding tools to improve encoding quality. However, whilst a lag of +a second or two may be fine given the one way nature of this type of video, +it is clearly not possible to use tools such as two pass coding. + +Broadcast: Broadcast video (e.g. digital TV over satellite) may have +specific requirements such as frequent and regular key frames (e.g. once per +second or more) as these are important as entry points to users when switching +channels. There may also be strict upper limits on bandwidth over a short +window of time. + +Download and Play: Download and play applications may have less strict +requirements in terms of local frame by frame rate control but there may be a +requirement to accurately hit a file size target for the video clip as a +whole. Similar considerations may apply to playback from mass storage devices +such as DVD or disk drives. + +Editing: In certain special use cases such as offline editing, it may +be desirable to have very high quality and data rate but also very frequent +key frames or indeed to encode the video exclusively as key frames. Lossless +video encoding may also be required in this use case. + +VOD / Streaming: One of the most important and common use cases for AV1 +is video on demand or streaming, for services such as YouTube and Netflix. In +this use case it is possible to do two or even multi-pass encoding to improve +compression efficiency. Streaming services will often store many encoded +copies of a video at different resolutions and data rates to support users +with different types of playback device and bandwidth limitations. +Furthermore, these services support dynamic switching between multiple +streams, so that they can respond to changing network conditions. + +Exact rate control when encoding for a specific format (e.g 360P or 1080P on +YouTube) may not be critical, provided that the video bandwidth remains within +allowed limits. Whilst a format may have a nominal target data rate, this can +be considered more as the desired average egress rate over the video corpus +rather than a strict requirement for any individual clip. Indeed, in order +to maintain optimal quality of experience for the end user, it may be +desirable to encode some easier videos or sections of video at a lower data +rate and harder videos or sections at a higher rate. + +VOD / streaming does not usually require very frequent key frames (as in the +broadcast case) but key frames are important in trick play (scanning back and +forth to different points in a video) and for adaptive stream switching. As +such, in a use case like YouTube, there is normally an upper limit on the +maximum time between key frames of a few seconds, but within certain limits +the encoder can try to align key frames with real scene cuts. + +Whilst encoder speed may not seem to be as critical in this use case, for +services such as YouTube, where millions of new videos have to be encoded +every day, encoder speed is still important, so libaom allows command line +control of the encode speed vs quality trade off. + +Fixed Quality / Testing Mode: Libaom also has a fixed quality encoder +pathway designed for testing under highly constrained conditions. 
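+
+By way of illustration, the use cases above map onto different settings of the
+public libaom encoder configuration. The sketch below shows roughly how a VOD /
+streaming encode and an RTC encode might be set up; the numeric values are
+examples only and not recommended settings:
+
+```c
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+
+/* Illustrative configuration for a VOD / streaming style encode. */
+static void configure_vod(aom_codec_enc_cfg_t *cfg) {
+  aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, AOM_USAGE_GOOD_QUALITY);
+  cfg->rc_end_usage = AOM_VBR;    /* variable bitrate */
+  cfg->rc_target_bitrate = 1000;  /* kbit/s, example value */
+  cfg->g_lag_in_frames = 19;      /* allow look-ahead / ARF frames */
+}
+
+/* Illustrative configuration for a low latency RTC style encode. */
+static void configure_rtc(aom_codec_enc_cfg_t *cfg) {
+  aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, AOM_USAGE_REALTIME);
+  cfg->rc_end_usage = AOM_CBR;    /* constant bitrate */
+  cfg->rc_target_bitrate = 500;   /* kbit/s, example value */
+  cfg->g_lag_in_frames = 0;       /* no look-ahead: minimise latency */
+}
+```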
+
+\section architecture_enc_speed_quality Speed vs Quality Trade Off
+
+In any modern video encoder there are trade offs that can be made in regard to
+the amount of time spent encoding a video or video frame vs the quality of the
+final encode.
+
+These trade offs typically limit the scope of the search for an optimal
+prediction / transform combination with faster encode modes doing fewer
+partition, reference frame, prediction mode and transform searches at the cost
+of some reduction in coding efficiency.
+
+The pruning of the size of the search tree is typically based on assumptions
+about the likelihood of different search modes being selected based on what
+has gone before and features such as the dimensions of the video frames and
+the Q value selected for encoding the frame. For example, certain intra modes
+are less likely to be chosen at high Q but may be more likely if similar
+modes were used for the previously coded blocks above and to the left of the
+current block.
+
+The speed settings depend both on the use case (e.g. Real Time encoding) and
+an explicit speed control passed in on the command line as --cpu-used
+and stored in the \ref AV1_COMP.speed field of the main compressor instance
+data structure (cpi).
+
+The control flags for the speed trade off are stored in the \ref AV1_COMP.sf
+field of the compressor instance and are set in the following functions:-
+
+- \ref av1_set_speed_features_framesize_independent()
+- \ref av1_set_speed_features_framesize_dependent()
+- \ref av1_set_speed_features_qindex_dependent()
+
+A second factor impacting the speed of encode is rate distortion optimisation
+(rd vs non-rd encoding).
+
+When rate distortion optimization is enabled, each candidate combination of
+a prediction mode and transform coding strategy is fully encoded and the
+resulting error (or distortion) as compared to the original source and the
+number of bits used, are passed to a rate distortion function. This function
+converts the distortion and cost in bits to a single RD value (where
+lower is better). This RD value is used to decide between different
+encoding strategies for the current block where, for example, one may
+result in a lower distortion but a larger number of bits.
+
+The calculation of this RD value is broadly speaking as follows:
+
+\f[
+    RD = (λ * Rate) + Distortion
+\f]
+
+This assumes a linear relationship between the number of bits used and
+distortion (represented by the rate multiplier value λ) which is
+not actually valid across a broad range of rate and distortion values.
+Typically, where distortion is high, expending a small number of extra bits
+will result in a large change in distortion. However, at lower values of
+distortion the cost in bits of each incremental improvement is large.
+
+To deal with this we scale the value of λ based on the quantizer
+value chosen for the frame. This is assumed to be a proxy for our approximate
+position on the true rate distortion curve and it is further assumed that over
+a limited range of distortion values, a linear relationship between distortion
+and rate is a valid approximation.
+
+Doing a rate distortion test on each candidate prediction / transform
+combination is expensive in terms of cpu cycles. Hence, for cases where encode
+speed is critical, libaom implements a non-rd pathway where the RD
+value is estimated based on the prediction error and quantizer setting.
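+
+As a concrete illustration, the following minimal C sketch (purely
+illustrative, not the libaom implementation) evaluates this metric for a
+candidate coding choice:
+
+```c
+/* Illustrative RD cost: lambda scales the rate term so that rate (in bits)
+ * and distortion (for example sum of squared error) become comparable. */
+static double rd_cost(double lambda, int rate_bits, double distortion) {
+  return lambda * rate_bits + distortion;
+}
+```
+
+At small values of λ (low Q, high quality) candidates with lower distortion
+win even if they cost many bits; as λ grows, cheaper candidates are preferred.
+Scaling λ with the frame quantizer, as described above, moves this decision
+point along the rate distortion curve.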
+ +\section architecture_enc_src_proc Source Frame Processing + +\subsection architecture_enc_frame_proc_data Main Data Structures + +The following are the main data structures referenced in this section +(see also \ref architecture_enc_data_structures): + +- \ref AV1_PRIMARY ppi (the primary compressor instance data structure) + - \ref AV1_PRIMARY.tf_info (\ref TEMPORAL_FILTER_INFO) + +- \ref AV1_COMP cpi (the main compressor instance data structure) + - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig) + +- \ref AV1EncoderConfig (Encoder configuration parameters) + - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg) + - \ref AV1EncoderConfig.kf_cfg (\ref KeyFrameCfg) + +- \ref AlgoCfg (Algorithm related configuration parameters) + - \ref AlgoCfg.arnr_max_frames + - \ref AlgoCfg.arnr_strength + +- \ref KeyFrameCfg (Keyframe coding configuration parameters) + - \ref KeyFrameCfg.enable_keyframe_filtering + +\subsection architecture_enc_frame_proc_ingest Frame Ingest / Coding Pipeline + + To encode a frame, first call \ref av1_receive_raw_frame() to obtain the raw + frame data. Then call \ref av1_get_compressed_data() to encode raw frame data + into compressed frame data. The main body of \ref av1_get_compressed_data() + is \ref av1_encode_strategy(), which determines high-level encode strategy + (frame type, frame placement, etc.) and then encodes the frame by calling + \ref av1_encode(). In \ref av1_encode(), \ref av1_first_pass() will execute + the first_pass of two-pass encoding, while \ref encode_frame_to_data_rate() + will perform the final pass for either one-pass or two-pass encoding. + + The main body of \ref encode_frame_to_data_rate() is + \ref encode_with_recode_loop_and_filter(), which handles encoding before + in-loop filters (with recode loops \ref encode_with_recode_loop(), or + without any recode loop \ref encode_without_recode()), followed by in-loop + filters (deblocking filters \ref loopfilter_frame(), CDEF filters and + restoration filters \ref cdef_restoration_frame()). + + Except for rate/quality control, both \ref encode_with_recode_loop() and + \ref encode_without_recode() call \ref av1_encode_frame() to manage the + reference frame buffers and \ref encode_frame_internal() to perform the + rest of encoding that does not require access to external frames. + \ref encode_frame_internal() is the starting point for the partition search + (see \ref architecture_enc_partitions). + +\subsection architecture_enc_frame_proc_tf Temporal Filtering + +\subsubsection architecture_enc_frame_proc_tf_overview Overview + +Video codecs exploit the spatial and temporal correlations in video signals to +achieve compression efficiency. The noise factor in the source signal +attenuates such correlation and impedes the codec performance. Denoising the +video signal is potentially a promising solution. + +One strategy for denoising a source is motion compensated temporal filtering. +Unlike image denoising, where only the spatial information is available, +video denoising can leverage a combination of the spatial and temporal +information. Specifically, in the temporal domain, similar pixels can often be +tracked along the motion trajectory of moving objects. Motion estimation is +applied to neighboring frames to find similar patches or blocks of pixels that +can be combined to create a temporally filtered output. + +AV1, in common with VP8 and VP9, uses an in-loop motion compensated temporal +filter to generate what are referred to as alternate reference frames (or ARF +frames). 
These can be encoded in the bitstream and stored as frame buffers for +use in the prediction of subsequent frames, but are not usually directly +displayed (hence they are sometimes referred to as non-display frames). + +The following command line parameters set the strength of the filter, the +number of frames used and determine whether filtering is allowed for key +frames. + +- --arnr-strength (\ref AlgoCfg.arnr_strength) +- --arnr-maxframes (\ref AlgoCfg.arnr_max_frames) +- --enable-keyframe-filtering + (\ref KeyFrameCfg.enable_keyframe_filtering) + +Note that in AV1, the temporal filtering scheme is designed around the +hierarchical ARF based pyramid coding structure. We typically apply denoising +only on key frame and ARF frames at the highest (and sometimes the second +highest) layer in the hierarchical coding structure. + +\subsubsection architecture_enc_frame_proc_tf_algo Temporal Filtering Algorithm + +Our method divides the current frame into "MxM" blocks. For each block, a +motion search is applied on frames before and after the current frame. Only +the best matching patch with the smallest mean square error (MSE) is kept as a +candidate patch for a neighbour frame. The current block is also a candidate +patch. A total of N candidate patches are combined to generate the filtered +output. + +Let f(i) represent the filtered sample value and \f$p_{j}(i)\f$ the sample +value of the j-th patch. The filtering process is: + +\f[ + f(i) = \frac{p_{0}(i) + \sum_{j=1}^{N} ω_{j}(i).p_{j}(i)} + {1 + \sum_{j=1}^{N} ω_{j}(i)} +\f] + +where \f$ ω_{j}(i) \f$ is the weight of the j-th patch from a total of +N patches. The weight is determined by the patch difference as: + +\f[ + ω_{j}(i) = exp(-\frac{D_{j}(i)}{h^2}) +\f] + +where \f$ D_{j}(i) \f$ is the sum of squared difference between the current +block and the j-th candidate patch: + +\f[ + D_{j}(i) = \sum_{k\inΩ_{i}}||p_{0}(k) - p_{j}(k)||_{2} +\f] + +where: +- \f$p_{0}\f$ refers to the current frame. +- \f$Ω_{i}\f$ is the patch window, an "LxL" pixel square. +- h is a critical parameter that controls the decay of the weights measured by + the Euclidean distance. It is derived from an estimate of noise amplitude in + the source. This allows the filter coefficients to adapt for videos with + different noise characteristics. +- Usually, M = 32, N = 7, and L = 5, but they can be adjusted. + +It is recommended that the reader refers to the code for more details. + +\subsubsection architecture_enc_frame_proc_tf_funcs Temporal Filter Functions + +The main entry point for temporal filtering is \ref av1_temporal_filter(). +This function returns 1 if temporal filtering is successful, otherwise 0. +When temporal filtering is applied, the filtered frame will be held in +the output_frame, which is the frame to be +encoded in the following encoding process. + +Almost all temporal filter related code is in av1/encoder/temporal_filter.c +and av1/encoder/temporal_filter.h. + +Inside \ref av1_temporal_filter(), the reader's attention is directed to +\ref tf_setup_filtering_buffer() and \ref tf_do_filtering(). + +- \ref tf_setup_filtering_buffer(): sets up the frame buffer for + temporal filtering, determines the number of frames to be used, and + calculates the noise level of each frame. + +- \ref tf_do_filtering(): the main function for the temporal + filtering algorithm. It breaks each frame into "MxM" blocks. For each + block a motion search \ref tf_motion_search() is applied to find + the motion vector from one neighboring frame. 
tf_build_predictor() is then + called to build the matching patch and \ref av1_apply_temporal_filter_c() (see + also optimised SIMD versions) to apply temporal filtering. The weighted + average over each pixel is accumulated and finally normalized in + \ref tf_normalize_filtered_frame() to generate the final filtered frame. + +- \ref av1_apply_temporal_filter_c(): the core function of our temporal + filtering algorithm (see also optimised SIMD versions). + +\subsection architecture_enc_frame_proc_film Film Grain Modelling + + Add details here. + +\section architecture_enc_rate_ctrl Rate Control + +\subsection architecture_enc_rate_ctrl_data Main Data Structures + +The following are the main data structures referenced in this section +(see also \ref architecture_enc_data_structures): + + - \ref AV1_PRIMARY ppi (the primary compressor instance data structure) + - \ref AV1_PRIMARY.twopass (\ref TWO_PASS) + + - \ref AV1_COMP cpi (the main compressor instance data structure) + - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig) + - \ref AV1_COMP.rc (\ref RATE_CONTROL) + - \ref AV1_COMP.sf (\ref SPEED_FEATURES) + + - \ref AV1EncoderConfig (Encoder configuration parameters) + - \ref AV1EncoderConfig.rc_cfg (\ref RateControlCfg) + + - \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first + pass stats) + + - \ref SPEED_FEATURES (Encode speed vs quality tradeoff parameters) + - \ref SPEED_FEATURES.hl_sf (\ref HIGH_LEVEL_SPEED_FEATURES) + +\subsection architecture_enc_rate_ctrl_options Supported Rate Control Options + +Different use cases (\ref architecture_enc_use_cases) may have different +requirements in terms of data rate control. + +The broad rate control strategy is selected using the --end-usage +parameter on the command line, which maps onto the field +\ref aom_codec_enc_cfg_t.rc_end_usage in \ref aom_encoder.h. + +The four supported options are:- + +- VBR (Variable Bitrate) +- CBR (Constant Bitrate) +- CQ (Constrained Quality mode ; A constrained variant of VBR) +- Fixed Q (Constant quality of Q mode) + +The value of \ref aom_codec_enc_cfg_t.rc_end_usage is in turn copied over +into the encoder rate control configuration data structure as +\ref RateControlCfg.mode. + +In regards to the most important use cases above, Video on demand uses either +VBR or CQ mode. CBR is the preferred rate control model for RTC and Live +streaming and Fixed Q is only used in testing. + +The behaviour of each of these modes is regulated by a series of secondary +command line rate control options but also depends somewhat on the selected +use case, whether 2-pass coding is enabled and the selected encode speed vs +quality trade offs (\ref AV1_COMP.speed and \ref AV1_COMP.sf). + +The list below gives the names of the main rate control command line +options together with the names of the corresponding fields in the rate +control configuration data structures. 
+ +- --target-bitrate (\ref RateControlCfg.target_bandwidth) +- --min-q (\ref RateControlCfg.best_allowed_q) +- --max-q (\ref RateControlCfg.worst_allowed_q) +- --cq-level (\ref RateControlCfg.cq_level) +- --undershoot-pct (\ref RateControlCfg.under_shoot_pct) +- --overshoot-pct (\ref RateControlCfg.over_shoot_pct) + +The following control aspects of vbr encoding + +- --bias-pct (\ref RateControlCfg.vbrbias) +- --minsection-pct ((\ref RateControlCfg.vbrmin_section) +- --maxsection-pct ((\ref RateControlCfg.vbrmax_section) + +The following relate to buffer and delay management in one pass low delay and +real time coding + +- --buf-sz (\ref RateControlCfg.maximum_buffer_size_ms) +- --buf-initial-sz (\ref RateControlCfg.starting_buffer_level_ms) +- --buf-optimal-sz (\ref RateControlCfg.optimal_buffer_level_ms) + +\subsection architecture_enc_vbr Variable Bitrate (VBR) Encoding + +For streamed VOD content the most common rate control strategy is Variable +Bitrate (VBR) encoding. The CQ mode mentioned above is a variant of this +where additional quantizer and quality constraints are applied. VBR +encoding may in theory be used in conjunction with either 1-pass or 2-pass +encoding. + +VBR encoding varies the number of bits given to each frame or group of frames +according to the difficulty of that frame or group of frames, such that easier +frames are allocated fewer bits and harder frames are allocated more bits. The +intent here is to even out the quality between frames. This contrasts with +Constant Bitrate (CBR) encoding where each frame is allocated the same number +of bits. + +Whilst for any given frame or group of frames the data rate may vary, the VBR +algorithm attempts to deliver a given average bitrate over a wider time +interval. In standard VBR encoding, the time interval over which the data rate +is averaged is usually the duration of the video clip. An alternative +approach is to target an average VBR bitrate over the entire video corpus for +a particular video format (corpus VBR). + +\subsubsection architecture_enc_1pass_vbr 1 Pass VBR Encoding + +The command line for libaom does allow 1 Pass VBR, but this has not been +properly optimised and behaves much like 1 pass CBR in most regards, with bits +allocated to frames by the following functions: + +- \ref av1_calc_iframe_target_size_one_pass_vbr() +- \ref av1_calc_pframe_target_size_one_pass_vbr() + +\subsubsection architecture_enc_2pass_vbr 2 Pass VBR Encoding + +The main focus here will be on 2-pass VBR encoding (and the related CQ mode) +as these are the modes most commonly used for VOD content. + +2-pass encoding is selected on the command line by setting --passes=2 +(or -p 2). + +Generally speaking, in 2-pass encoding, an encoder will first encode a video +using a default set of parameters and assumptions. Depending on the outcome +of that first encode, the baseline assumptions and parameters will be adjusted +to optimize the output during the second pass. In essence the first pass is a +fact finding mission to establish the complexity and variability of the video, +in order to allow a better allocation of bits in the second pass. + +The libaom 2-pass algorithm is unusual in that the first pass is not a full +encode of the video. Rather it uses a limited set of prediction and transform +options and a fixed quantizer, to generate statistics about each frame. No +output bitstream is created and the per frame first pass statistics are stored +entirely in volatile memory. 
This has some disadvantages when compared to a +full first pass encode, but avoids the need for file I/O and improves speed. + +For two pass encoding, the function \ref av1_encode() will first be called +for each frame in the video with the value \ref AV1EncoderConfig.pass = 1. +This will result in calls to \ref av1_first_pass(). + +Statistics for each frame are stored in \ref FIRSTPASS_STATS frame_stats_buf. + +After completion of the first pass, \ref av1_encode() will be called again for +each frame with \ref AV1EncoderConfig.pass = 2. The frames are then encoded in +accordance with the statistics gathered during the first pass by calls to +\ref encode_frame_to_data_rate() which in turn calls + \ref av1_get_second_pass_params(). + +In summary the second pass code :- + +- Searches for scene cuts (if auto key frame detection is enabled). +- Defines the length of and hierarchical structure to be used in each + ARF/GF group. +- Allocates bits based on the relative complexity of each frame, the quality + of frame to frame prediction and the type of frame (e.g. key frame, ARF + frame, golden frame or normal leaf frame). +- Suggests a maximum Q (quantizer value) for each ARF/GF group, based on + estimated complexity and recent rate control compliance + (\ref RATE_CONTROL.active_worst_quality) +- Tracks adherence to the overall rate control objectives and adjusts + heuristics. + +The main two pass functions in regard to the above include:- + +- \ref find_next_key_frame() +- \ref define_gf_group() +- \ref calculate_total_gf_group_bits() +- \ref get_twopass_worst_quality() +- \ref av1_gop_setup_structure() +- \ref av1_gop_bit_allocation() +- \ref av1_twopass_postencode_update() + +For each frame, the two pass algorithm defines a target number of bits +\ref RATE_CONTROL.base_frame_target, which is then adjusted if necessary to +reflect any undershoot or overshoot on previous frames to give +\ref RATE_CONTROL.this_frame_target. + +As well as \ref RATE_CONTROL.active_worst_quality, the two pass code also +maintains a record of the actual Q value used to encode previous frames +at each level in the current pyramid hierarchy +(\ref PRIMARY_RATE_CONTROL.active_best_quality). The function +\ref rc_pick_q_and_bounds(), uses these values to set a permitted Q range +for each frame. + +\subsubsection architecture_enc_1pass_lagged 1 Pass Lagged VBR Encoding + +1 pass lagged encode falls between simple 1 pass encoding and full two pass +encoding and is used for cases where it is not possible to do a full first +pass through the entire video clip, but where some delay is permissible. For +example near live streaming where there is a delay of up to a few seconds. In +this case the first pass and second pass are in effect combined such that the +first pass starts encoding the clip and the second pass lags behind it by a +few frames. When using this method, full sequence level statistics are not +available, but it is possible to collect and use frame or group of frame level +data to help in the allocation of bits and in defining ARF/GF coding +hierarchies. The reader is referred to the \ref AV1_PRIMARY.lap_enabled field +in the main compressor instance (where lap stands for +look ahead processing). This encoding mode for the most part uses the +same rate control pathways as two pass VBR encoding. 
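+
+Whether the statistics come from a full first pass or from the look-ahead
+buffer, the underlying idea is the same: spend more bits on frames that the
+first pass found hard to predict. The following C sketch is a deliberately
+crude illustration of that idea (loosely modelled on the
+\ref FIRSTPASS_STATS.coded_error statistic); the real allocation also accounts
+for frame type, the GF/ARF group structure and ongoing rate control feedback:
+
+```c
+/* Illustrative only: share out a total bit budget in proportion to each
+ * frame's contribution to the total first pass error. */
+static void allocate_bits_by_error(const double *coded_error, int num_frames,
+                                   double total_bits, double *target_bits) {
+  double total_error = 0.0;
+  for (int i = 0; i < num_frames; ++i) total_error += coded_error[i];
+  for (int i = 0; i < num_frames; ++i) {
+    target_bits[i] = (total_error > 0.0)
+                         ? total_bits * (coded_error[i] / total_error)
+                         : total_bits / num_frames;
+  }
+}
+```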
+ +\subsection architecture_enc_rc_loop The Main Rate Control Loop + +Having established a target rate for a given frame and an allowed range of Q +values, the encoder then tries to encode the frame at a rate that is as close +as possible to the target value, given the Q range constraints. + +There are two main mechanisms by which this is achieved. + +The first selects a frame level Q, using an adaptive estimate of the number of +bits that will be generated when the frame is encoded at any given Q. +Fundamentally this mechanism is common to VBR, CBR and to use cases such as +RTC with small adjustments. + +As the Q value mainly adjusts the precision of the residual signal, it is not +actually a reliable basis for accurately predicting the number of bits that +will be generated across all clips. A well predicted clip, for example, may +have a much smaller error residual after prediction. The algorithm copes with +this by adapting its predictions on the fly using a feedback loop based on how +well it did the previous time around. + +The main functions responsible for the prediction of Q and the adaptation over +time, for the two pass encoding pipeline are: + +- \ref rc_pick_q_and_bounds() + - \ref get_q() + - \ref av1_rc_regulate_q() + - \ref get_rate_correction_factor() + - \ref set_rate_correction_factor() + - \ref find_closest_qindex_by_rate() +- \ref av1_twopass_postencode_update() + - \ref av1_rc_update_rate_correction_factors() + +A second mechanism for control comes into play if there is a large rate miss +for the current frame (much too big or too small). This is a recode mechanism +which allows the current frame to be re-encoded one or more times with a +revised Q value. This obviously has significant implications for encode speed +and in the case of RTC latency (hence it is not used for the RTC pathway). + +Whether or not a recode is allowed for a given frame depends on the selected +encode speed vs quality trade off. This is set on the command line using the +--cpu-used parameter which maps onto the \ref AV1_COMP.speed field in the main +compressor instance data structure. + +The value of \ref AV1_COMP.speed, combined with the use case, is used to +populate the speed features data structure AV1_COMP.sf. In particular +\ref HIGH_LEVEL_SPEED_FEATURES.recode_loop determines the types of frames that +may be recoded and \ref HIGH_LEVEL_SPEED_FEATURES.recode_tolerance is a rate +error trigger threshold. + +For more information the reader is directed to the following functions: + +- \ref encode_with_recode_loop() +- \ref encode_without_recode() +- \ref recode_loop_update_q() +- \ref recode_loop_test() +- \ref av1_set_speed_features_framesize_independent() +- \ref av1_set_speed_features_framesize_dependent() + +\subsection architecture_enc_fixed_q Fixed Q Mode + +There are two main fixed Q cases: +-# Fixed Q with adaptive qp offsets: same qp offset for each pyramid level + in a given video, but these offsets are adaptive based on video content. +-# Fixed Q with fixed qp offsets: content-independent fixed qp offsets for + each pyramid level. 
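+
+As a simple illustration of the second case, the sketch below derives a
+per-level q index from a base q index using fixed offsets per pyramid level.
+The offset values are hypothetical and chosen only for illustration; they are
+not the offsets used by libaom:
+
+```c
+/* Illustrative only: shallower pyramid levels (referenced more often) get a
+ * larger quality boost (more negative offset). Offset values are made up. */
+static int fixed_q_for_level(int base_q_index, int pyramid_level) {
+  static const int kLevelOffset[] = { -12, -8, -4, -2, -1, 0 };
+  const int max_level =
+      (int)(sizeof(kLevelOffset) / sizeof(kLevelOffset[0])) - 1;
+  if (pyramid_level > max_level) pyramid_level = max_level;
+  int q = base_q_index + kLevelOffset[pyramid_level];
+  if (q < 0) q = 0;
+  if (q > 255) q = 255; /* AV1 q index range is 0 to 255 */
+  return q;
+}
+```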
+
+The reader is also referred to the following functions:
+- \ref av1_rc_pick_q_and_bounds()
+- \ref rc_pick_q_and_bounds_no_stats_cbr()
+- \ref rc_pick_q_and_bounds_no_stats()
+- \ref rc_pick_q_and_bounds()
+
+\section architecture_enc_frame_groups GF / ARF Frame Groups & Hierarchical Coding
+
+\subsection architecture_enc_frame_groups_data Main Data Structures
+
+The following are the main data structures referenced in this section
+(see also \ref architecture_enc_data_structures):
+
+- \ref AV1_COMP cpi (the main compressor instance data structure)
+  - \ref AV1_COMP.rc (\ref RATE_CONTROL)
+
+- \ref FIRSTPASS_STATS *frame_stats_buf (used to store per frame first pass
+  stats)
+
+\subsection architecture_enc_frame_groups_groups Frame Groups
+
+To process a sequence/stream of video frames, the encoder divides the frames
+into groups and encodes them sequentially (possibly dependent on previous
+groups). In AV1 such a group is usually referred to as a golden frame group
+(GF group) or sometimes an Alt-Ref (ARF) group or a group of pictures (GOP).
+A GF group determines and stores the coding structure of the frames (for
+example, frame type, usage of the hierarchical structure, usage of overlay
+frames, etc.) and can be considered as the base unit to process the frames,
+therefore playing an important role in the encoder.
+
+The length of a specific GF group is arguably the most important aspect when
+determining a GF group. This is because most GF group level decisions are
+based on the frame characteristics, if not on the length itself directly.
+Note that the GF group is always a group of consecutive frames, which means
+the start and end of the group (and hence its length) determine which
+frames are included in it and therefore determine the characteristics of the
+GF group. Therefore, in this document we will first discuss the GF group
+length decision in Libaom, followed by frame structure decisions when defining
+a GF group with a certain length.
+
+\subsection architecture_enc_gf_length GF / ARF Group Length Determination
+
+The basic intuition of determining the GF group length is that it is usually
+desirable to group together frames that are similar. Hence, we may choose
+longer groups when consecutive frames are very alike and shorter ones when
+they are very different.
+
+The determination of the GF group length is done in the function \ref
+calculate_gf_length(). The following encoder use cases are supported:
+
+- Single pass with look-ahead disabled (\ref has_no_stats_stage()):
+  in this case there is no information available on the following stream
+  of frames, therefore the function will set the GF group length for the
+  current and the following GF groups (a total number of MAX_NUM_GF_INTERVALS
+  groups) to be the maximum value allowed.
+
+- Single pass with look-ahead enabled (\ref AV1_PRIMARY.lap_enabled):
+  look-ahead processing is enabled for single pass, therefore there is a
+  limited amount of information available regarding future frames. In this
+  case the function will determine the length based on \ref FIRSTPASS_STATS
+  (which is generated when processing the look-ahead buffer) for only the
+  current GF group.
+
+- Two pass: the first pass in two-pass encoding collects the stats
+  and will not call the function. In the second pass, the function tries to
+  determine the GF group length of the current and the following GF groups (a
+  total number of MAX_NUM_GF_INTERVALS groups) based on the first-pass
+  statistics. Note that, as discussed later, such decisions may not be
+  accurate and can be changed later.
+
+Except for the first trivial case where there is no prior knowledge of the
+following frames, the function \ref calculate_gf_length() tries to determine
+the GF group length based on the first pass statistics. The determination is
+divided into two parts:
+
+-# Baseline decision based on accumulated statistics: this part of the
+   function iterates through the firstpass statistics of the following frames
+   and accumulates the statistics with the function
+   accumulate_next_frame_stats(). The accumulated statistics are then used to
+   determine whether the correlation in the GF group has dropped too much in
+   the function detect_gf_cut(). If detect_gf_cut() returns non-zero, or if we
+   have reached the end of the first-pass statistics, the baseline decision is
+   set at the current point.
+
+-# If we are not at the end of the first-pass statistics, the next part will
+   try to refine the baseline decision. This algorithm is based on the
+   analysis of the firstpass stats. It tries to cut the groups in stable
+   regions or at relatively stable points, and it tries to avoid cutting in a
+   blending region.
+
+As mentioned, for two-pass encoding, the function \ref
+calculate_gf_length() tries to determine the length of as many as
+MAX_NUM_GF_INTERVALS groups. The decisions are stored in
+\ref PRIMARY_RATE_CONTROL.gf_intervals[]. The variables
+\ref RATE_CONTROL.intervals_till_gf_calculate_due and
+\ref PRIMARY_RATE_CONTROL.gf_intervals[] help with managing and updating the
+stored decisions. In the function \ref define_gf_group(), the corresponding
+stored length decision will be used to define the current GF group.
+
+When the maximum GF group length is larger than or equal to 32, the encoder
+will enforce an extra layer to determine whether to use a maximum GF length of
+32 or 16 for every GF group. In such a case, \ref calculate_gf_length() is
+first called with the original maximum length (>=32). Afterwards,
+\ref av1_tpl_setup_stats() is called to analyze the determined GF group
+and compare the reference to the last frame and the middle frame. If it is
+decided that we should use a maximum GF length of 16, the function
+\ref calculate_gf_length() is called again with the updated maximum
+length, and it only sets the length for a single GF group
+(\ref RATE_CONTROL.intervals_till_gf_calculate_due is set to 1). This process
+is shown below.
+
+\image html tplgfgroupdiagram.png "" width=40%
+
+Before encoding each frame, the encoder checks
+\ref RATE_CONTROL.frames_till_gf_update_due. If it is zero, indicating that
+processing of the current GF group is done, the encoder will check whether
+\ref RATE_CONTROL.intervals_till_gf_calculate_due is zero. If it is, as
+discussed above, \ref calculate_gf_length() is called with the original
+maximum length. If it is not zero, then the GF group length value stored
+in \ref PRIMARY_RATE_CONTROL.gf_intervals[\ref PRIMARY_RATE_CONTROL.cur_gf_index]
+is used (subject to change as discussed above).
+
+\subsection architecture_enc_gf_structure Defining a GF Group's Structure
+
+The function \ref define_gf_group() defines the frame structure as well
+as other GF group level parameters (e.g. bit allocation) once the length of
+the current GF group is determined.
+
+The function first iterates through the first pass statistics in the GF group
+to accumulate various stats, using accumulate_this_frame_stats() and
+accumulate_next_frame_stats(). The accumulated statistics are then used to
+determine the use of ALTREF frames along with other properties of the
+GF group. The values of \ref PRIMARY_RATE_CONTROL.cur_gf_index, \ref
+RATE_CONTROL.intervals_till_gf_calculate_due and \ref
+RATE_CONTROL.frames_till_gf_update_due are also updated accordingly.
+
+The function \ref av1_gop_setup_structure() is called at the end to determine
+the frame layers and reference maps in the GF group, where the
+construct_multi_layer_gf_structure() function sets the frame update types for
+each frame and the group structure.
+
+- If ALTREF frames are allowed for the GF group: the first frame is set to
+  KF_UPDATE, GF_UPDATE or ARF_UPDATE. The last frame of the GF group is set to
+  OVERLAY_UPDATE. Then, in set_multi_layer_params(), frame update
+  types are determined recursively in a binary tree fashion, and assigned to
+  give the final IBBB structure for the group.
+  - If the current branch has more than 2 frames and we have not reached the
+    maximum layer depth, then the middle frame is set as INTNL_ARF_UPDATE, and
+    the left and right branches are processed recursively.
+  - If the current branch has fewer than 3 frames, or we have reached the
+    maximum layer depth, then every frame in the branch is set to LF_UPDATE.
+
+- If ALTREF frames are not allowed for the GF group: the frames are set
+  as LF_UPDATE. This basically forms an IPPP GF group structure.
+
+As mentioned, the encoder may use temporal dependency modelling (TPL - see
+\ref architecture_enc_tpl) to determine whether we should use a maximum length
+of 32 or 16 for the current GF group. This requires calls to
+\ref define_gf_group() but should not change other settings (since it is in
+essence a trial). This special case is indicated by setting the parameter
+is_final_pass to zero.
+
+For single pass encodes where look-ahead processing is disabled
+(\ref AV1_PRIMARY.lap_enabled = 0), \ref define_gf_group_pass0() is used
+instead of \ref define_gf_group().
+
+\subsection architecture_enc_kf_groups Key Frame Groups
+
+A special constraint for GF group length is the location of the next keyframe
+(KF). The frames between two KFs are referred to as a KF group. Each KF group
+can be encoded and decoded independently. Because of this, a GF group cannot
+span beyond a KF and the location of the next KF is set as a hard boundary
+for GF group length.
+
+- For two-pass encoding, \ref RATE_CONTROL.frames_to_key controls when to
+  encode a key frame. When it is zero, the current frame is a keyframe and
+  the function \ref find_next_key_frame() is called. This in turn calls
+  \ref define_kf_interval() to work out where the next key frame should
+  be placed.
+
+- For single-pass with look-ahead enabled, \ref define_kf_interval()
+  is called whenever a GF group update is needed (when
+  \ref RATE_CONTROL.frames_till_gf_update_due is zero). This is because
+  generally KFs are more widely spaced and the look-ahead buffer is usually
+  not long enough.
+
+- For single-pass with look-ahead disabled, the KFs are placed according
+  to the command line parameter --kf-max-dist (the above two cases are
+  also subject to this constraint).
+
+The function \ref define_kf_interval() tries to detect a scenecut.
+If a scenecut within kf-max-dist is detected, then it is set as the next
+keyframe. Otherwise the given maximum value is used.
+
+\section architecture_enc_tpl Temporal Dependency Modelling
+
+The temporal dependency model runs at the beginning of each GOP. It builds the
+motion trajectory within the GOP in units of 16x16 blocks. The temporal
+dependency of a 16x16 block is evaluated as the predictive coding gains it
+contributes to its trailing motion trajectory. This temporal dependency model
+reflects how important a coding block is for the coding efficiency of the
+overall GOP. It is hence used to scale the Lagrangian multiplier used in the
+rate-distortion optimization framework.
+
+\subsection architecture_enc_tpl_config Configurations
+
+The temporal dependency model and its applications are by default turned on in
+the libaom encoder for the VoD use case. To disable it, use --tpl-model=0 in
+the aomenc configuration.
+
+\subsection architecture_enc_tpl_algoritms Algorithms
+
+The scheme works in the reverse frame processing order over the source frames,
+propagating information from future frames back to the current frame. For each
+frame, a propagation step is run for each MB. It operates as follows:
+
+- Estimate the intra prediction cost in terms of the sum of absolute Hadamard
+  transform difference (SATD), noted as intra_cost. It also loads the motion
+  information available from the first-pass encode and estimates the inter
+  prediction cost as inter_cost. Due to the use of hybrid inter/intra
+  prediction mode, the inter_cost value is further upper bounded by
+  intra_cost. A propagation cost variable is used to collect all the
+  information flowed back from future processing frames. It is initialized as
+  0 for all the blocks in the last processing frame in a group of pictures
+  (GOP).
+
+- The fraction of information from a current block to be propagated towards
+  its reference block is estimated as:
+\f[
+    propagation\_fraction = (1 - inter\_cost/intra\_cost)
+\f]
+  It reflects how much the motion compensated reference would reduce the
+  prediction error in percentage.
+
+- The total amount of information the current block contributes to the GOP
+  is estimated as intra_cost + propagation_cost. The information that it
+  propagates towards its reference block is captured by:
+\f[
+    propagation\_amount =
+        (intra\_cost + propagation\_cost) * propagation\_fraction
+\f]
+
+- Note that the reference block may not necessarily sit on the grid of
+  16x16 blocks. The propagation amount is hence dispensed to all the blocks
+  that overlap with the reference block. The corresponding block in the
+  reference frame accumulates its own propagation cost as it receives back
+  propagation.
+\f[
+    propagation\_cost = propagation\_cost +
+        (\frac{overlap\_area}{(16*16)} * propagation\_amount)
+\f]
+
+- In the final encoding stage, the distortion propagation factor of a block
+  is evaluated as \f$(1 + \frac{propagation\_cost}{intra\_cost})\f$, where the
+  second term captures its impact on later frames in a GOP.
+
+- The Lagrangian multiplier is adapted at the 64x64 block level. For every
+  64x64 block in a frame, we have a distortion propagation factor:
+\f[
+    dist\_prop[i] = 1 + \frac{propagation\_cost[i]}{intra\_cost[i]}
+\f]
+  where i denotes the block index in the frame. We also have the frame level
+  distortion propagation factor:
+\f[
+    dist\_prop = 1 +
+        \frac{\sum_{i}propagation\_cost[i]}{\sum_{i}intra\_cost[i]}
+\f]
+  which is used to normalize the propagation factor at the 64x64 block level.
+  The Lagrangian multiplier is hence adapted as:
+\f[
+    λ[i] = λ[0] * \frac{dist\_prop}{dist\_prop[i]}
+\f]
+  where λ[0] is the multiplier associated with the frame level QP. The
+  64x64 block level QP is scaled according to the Lagrangian multiplier.
+
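+
+The back propagation step described above can be summarised by the following
+minimal C sketch. It is illustrative only; the real TPL code operates on a
+16x16 block grid with explicit overlap enumeration and different data
+structures:
+
+```c
+/* Illustrative TPL back propagation for one block, following the formulas
+ * above. intra_cost and inter_cost are the SATD based costs of the current
+ * block, propagation_cost is what the block has received from later frames,
+ * and ref_propagation_cost accumulates on an overlapped block in the
+ * reference frame. */
+static void tpl_propagate(double intra_cost, double inter_cost,
+                          double propagation_cost, double overlap_area,
+                          double *ref_propagation_cost) {
+  if (inter_cost > intra_cost) inter_cost = intra_cost; /* upper bound */
+  const double propagation_fraction = 1.0 - inter_cost / intra_cost;
+  const double propagation_amount =
+      (intra_cost + propagation_cost) * propagation_fraction;
+  /* Dispense in proportion to the overlap with the 16x16 reference grid. */
+  *ref_propagation_cost += (overlap_area / (16.0 * 16.0)) * propagation_amount;
+}
+```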
+
+\subsection architecture_enc_tpl_keyfun Key Functions and data structures
+
+The reader is also referred to the following functions and data structures:
+
+- \ref TplParams
+- \ref av1_tpl_setup_stats() builds the TPL model.
+- \ref setup_delta_q() assigns different quantization parameters to each super
+  block based on its TPL weight.
+
+\section architecture_enc_partitions Block Partition Search
+
+ A frame is first split into tiles in \ref encode_tiles(), with each tile
+ compressed by av1_encode_tile(). Then a tile is processed in superblock rows
+ via \ref av1_encode_sb_row() and then \ref encode_sb_row().
+
+ The partition search processes superblocks sequentially in \ref
+ encode_sb_row(). Two search modes are supported, depending upon the encoding
+ configuration: \ref encode_nonrd_sb() is for 1-pass and real-time modes,
+ while \ref encode_rd_sb() performs more exhaustive rate distortion based
+ searches.
+
+ Partition search over the recursive quad-tree space is implemented by
+ recursive calls to \ref av1_nonrd_use_partition(),
+ \ref av1_rd_use_partition(), or av1_rd_pick_partition() and returning best
+ options for sub-trees to their parent partitions.
+
+ In libaom, the partition search sits on top of the mode search (predictor,
+ transform, etc.), instead of being a separate module. The interface of the
+ mode search is \ref pick_sb_modes(), which connects the partition search with
+ \ref architecture_enc_inter_modes and \ref architecture_enc_intra_modes. To
+ make good decisions, reconstruction is also required in order to build
+ references and contexts. This is implemented by \ref encode_sb() at the
+ sub-tree level and \ref encode_b() at the coding block level.
+
+ See also \ref partition_search
+
+\section architecture_enc_intra_modes Intra Mode Search
+
+AV1 also provides 71 different intra prediction modes, i.e. modes that predict
+only based upon information in the current frame with no dependency on
+previous or future frames. For key frames, where this independence from any
+other frame is a defining requirement, and for other cases where intra only
+frames are required, the encoder need only consider these modes in the rate
+distortion loop.
+
+Even so, in most use cases, searching all possible intra prediction modes for
+every block and partition size is not practical and some pruning of the search
+tree is necessary.
+
+For the rate distortion optimized case, the main top level function
+responsible for selecting the intra prediction mode for a given block is
+\ref av1_rd_pick_intra_mode_sb(). The reader's attention is also drawn to the
+functions \ref hybrid_intra_mode_search() and \ref av1_nonrd_pick_intra_mode()
+which may be used where encode speed is critical. The choice between the
+rd path and the non rd or hybrid paths depends on the encoder use case and the
+\ref AV1_COMP.speed parameter. Further fine control of the speed vs quality
+trade off is provided by means of fields in \ref AV1_COMP.sf (which has type
+\ref SPEED_FEATURES).
+
+Note that some intra modes are only considered for specific use cases or
+types of video. For example, the palette based prediction modes are often
+valuable for graphics or screen share content but not for natural video.
+(See \ref av1_search_palette_mode())
+
+See also \ref intra_mode_search for more details.
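+
+Schematically, both the intra mode search and the inter mode search described
+in the next section reduce to a pruned loop over candidate modes. The sketch
+below is illustrative only; it is not the libaom control flow, and the pruning
+rule is deliberately simplistic:
+
+```c
+#include <float.h>
+
+typedef struct {
+  int rate_bits;      /* bits needed to signal the mode and residual */
+  double distortion;  /* e.g. SSE of the reconstruction vs the source */
+} mode_stats_t;
+
+/* Keep the candidate with the lowest RD value, skipping candidates whose
+ * signalling cost alone already exceeds the best RD value found so far. */
+static int pick_best_mode(const mode_stats_t *candidates, int num_modes,
+                          double lambda) {
+  double best_rd = DBL_MAX;
+  int best_mode = -1;
+  for (int m = 0; m < num_modes; ++m) {
+    if (lambda * candidates[m].rate_bits >= best_rd) continue; /* prune */
+    const double rd =
+        lambda * candidates[m].rate_bits + candidates[m].distortion;
+    if (rd < best_rd) {
+      best_rd = rd;
+      best_mode = m;
+    }
+  }
+  return best_mode;
+}
+```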
+
+\section architecture_enc_inter_modes Inter Prediction Mode Search
+
+For inter frames, where we also allow prediction using one or more previously
+coded frames (which may chronologically speaking be past or future frames or
+non-display reference buffers such as ARF frames), the size of the search tree
+that needs to be traversed to select a prediction mode is considerably larger.
+
+In addition to the 71 possible intra modes we also need to consider 56 single
+frame inter prediction modes (7 reference frames x 4 modes x 2 for OBMC
+(overlapped block motion compensation)), 12768 compound inter prediction modes
+(these are modes that combine inter predictors from two reference frames) and
+36708 compound inter / intra prediction modes.
+
+As with the intra mode search, libaom supports an RD based pathway and a non
+rd pathway for speed critical use cases. The entry points for these two cases
+are \ref av1_rd_pick_inter_mode() and \ref av1_nonrd_pick_inter_mode_sb()
+respectively.
+
+Various heuristics and predictive strategies are used to prune the search tree,
+with fine control provided through the speed features parameter in the main
+compressor instance data structure \ref AV1_COMP.sf.
+
+It is worth noting that some prediction modes incur a much larger rate cost
+than others (ignoring for now the cost of coding the error residual). For
+example, a compound mode that requires the encoder to specify two reference
+frames and two new motion vectors will almost inevitably have a higher rate
+cost than a simple inter prediction mode that uses a predicted or 0,0 motion
+vector. As such, if we have already found a mode for the current block that
+has a low RD cost, we can skip a large number of the possible modes on the
+basis that even if the error residual is 0 the inherent rate cost of the
+mode itself will guarantee that it is not chosen.
+
+See also \ref inter_mode_search for more details.
+
+\section architecture_enc_tx_search Transform Search
+
+AV1 implements the transform stage using 4 separable 1-d transforms (DCT,
+ADST, FLIPADST and IDTX, where FLIPADST is the reversed version of ADST
+and IDTX is the identity transform) which can be combined to give 16 2-d
+combinations.
+
+These combinations can be applied at 19 different scales from 64x64 pixels
+down to 4x4 pixels.
+
+This gives rise to a large number of possible candidate transform options
+for coding the residual error after prediction. An exhaustive rate-distortion
+based evaluation of all candidates would not be practical from a speed
+perspective in a production encoder implementation. Hence libaom adopts a
+number of strategies to prune the selection of both the transform size and
+transform type.
+
+There are a number of strategies that have been tested and implemented in
+libaom including:
+
+- A statistics based approach that looks at the frequency with which certain
+  combinations are used in a given context and prunes out very unlikely
+  candidates. It is worth noting here that some size candidates can be pruned
+  out immediately based on the size of the prediction partition. For example,
+  it does not make sense to use a transform size that is larger than the
+  prediction partition size, but also a very large prediction partition size
+  is unlikely to be optimally paired with small transforms.
+
+See also \ref transform_search for more details.
+
+\section architecture_post_enc_filt Post Encode Loop Filtering
+
+AV1 supports three types of post encode in-loop filtering to improve
+the quality of the reconstructed video.
+
+- Deblocking Filter: The first of these is a fairly traditional boundary
+  deblocking filter that attempts to smooth discontinuities that may occur at
+  the boundaries between blocks. See also \ref in_loop_filter.
+
+- CDEF Filter: The constrained directional enhancement filter (CDEF)
+  allows the codec to apply a non-linear deringing filter along certain
+  (potentially oblique) directions. A primary filter is applied along the
+  selected direction, whilst a secondary filter is applied at 45 degrees to
+  the primary direction. (See also \ref in_loop_cdef and
+  A Technical Overview of AV1.)
+
+- Loop Restoration Filter: The loop restoration filter is applied after
+  any prior post filtering stages. It acts on units of either 64 x 64,
+  128 x 128, or 256 x 256 pixel blocks, referred to as loop restoration
+  units. Each unit can independently select either to bypass filtering, use
+  a Wiener filter, or use a self-guided filter. (See also
+  \ref in_loop_restoration and A Technical Overview of AV1.)
+
+\section architecture_entropy Entropy Coding
+
+\subsection architecture_entropy_aritmetic Arithmetic Coder
+
+VP9 used a binary arithmetic coder to encode symbols, where the probability
+of a 1 or 0 at each decision node was based on a context model that took
+into account recently coded values (for example previously coded coefficients
+in the current block). A mechanism existed to update the context model each
+frame, either explicitly in the bitstream, or implicitly at both the encoder
+and decoder based on the observed frequency of different outcomes in the
+previous frame. VP9 also supported separate context models for different
+types of frame (e.g. inter coded frames and key frames).
+
+In contrast, AV1 uses an M-ary symbol arithmetic coder to compress the syntax
+elements, where integer \f$M\in[2, 14]\f$. This approach is based upon the
+entropy coding strategy used in the Daala video codec and allows for some
+bit-level parallelism in its implementation. AV1 also has an extended context
+model and allows for updates to the probabilities on a per symbol basis as
+opposed to the per frame strategy in VP9.
+
+To improve the performance / throughput of the arithmetic encoder, especially
+in hardware implementations, the probability model is updated and maintained
+at 15-bit precision, but the arithmetic encoder only uses the most significant
+9 bits when encoding a symbol. A more detailed discussion of the algorithm
+and design constraints can be found in
+ A Technical Overview of AV1.
+
+TODO add references to key functions / files.
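+
+To make the per symbol probability adaptation described above a little more
+concrete, the sketch below shows one simple way in which a cumulative
+distribution function (CDF), held at 15-bit precision, can be nudged towards
+the most recently coded symbol. The function name, the fixed adaptation rate
+and the exact update rule are illustrative only; they are not the scheme
+implemented in libaom.
+
+~~~~~~~~~~~~~~~{.c}
+#include <stdint.h>
+
+#define PROB_TOP (1 << 15)  // 15-bit probability precision, as noted above
+
+// Illustrative only: after coding `symbol`, decay each CDF entry towards a
+// distribution concentrated on that symbol. Entries below the symbol move
+// towards 0, entries at or above it move towards PROB_TOP.
+static void update_cdf_sketch(uint16_t *cdf, int symbol, int nsymbols) {
+  const int rate = 5;  // larger value means slower adaptation
+  for (int i = 0; i < nsymbols - 1; ++i) {
+    if (i < symbol) {
+      cdf[i] -= (uint16_t)(cdf[i] >> rate);
+    } else {
+      cdf[i] += (uint16_t)((PROB_TOP - cdf[i]) >> rate);
+    }
+  }
+}
+~~~~~~~~~~~~~~~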
+
+As with VP9, a mechanism exists in AV1 to encode some elements into the
+bitstream as uncompressed bits or literal values, without using the
+arithmetic coder. For example, some frame and sequence header values are
+coded this way, where it is beneficial to be able to read the values
+directly.
+
+TODO add references to key functions / files.
+
+\subsection architecture_entropy_coef Transform Coefficient Coding and Optimization
+\image html coeff_coding.png "" width=70%
+
+\subsubsection architecture_entropy_coef_what Transform coefficient coding
+Transform coefficient coding is where the encoder compresses a quantized
+version of the prediction residue into the bitstream.
+
+\paragraph architecture_entropy_coef_prepare Preparation - transform and quantize
+Before the entropy coding stage, the encoder decouples the pixel-to-pixel
+correlation of the prediction residue by transforming the residue from the
+spatial domain to the frequency domain. Then the encoder quantizes the
+transform coefficients to make the coefficients ready for entropy coding.
+
+\paragraph architecture_entropy_coef_coding The coding process
+The encoder uses \ref av1_write_coeffs_txb() to write the coefficients of
+a transform block into the bitstream.
+The coding process has three stages.
+1. First, the encoder will code the transform block skip flag (txb_skip). If
+the skip flag is off, then the encoder will code the end of block position
+(eob), which is the scan index of the last non-zero coefficient plus one.
+2. Second, the encoder will code the lower magnitude levels of each
+coefficient in reverse scan order.
+3. Finally, the encoder will code the sign and higher magnitude levels for
+each coefficient if they are available.
+
+Related functions:
+- \ref av1_write_coeffs_txb()
+- write_inter_txb_coeff()
+- \ref av1_write_intra_coeffs_mb()
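+
+The following pseudocode sketches these three stages for a single transform
+block. The write_symbol() helper, the fixed magnitude threshold and the
+absence of any context modelling are simplifications introduced here for
+illustration; the actual syntax and contexts are implemented in
+\ref av1_write_coeffs_txb().
+
+~~~~~~~~~~~~~~~{.c}
+#include <stdbool.h>
+#include <stdlib.h>
+
+// Placeholder for a call into the arithmetic coder.
+static void write_symbol(int value) { (void)value; }
+
+// Illustrative only: code `n` quantized coefficients, already in scan order.
+static void code_txb_sketch(const int *qcoeff, int n) {
+  int eob = 0;
+  for (int i = 0; i < n; ++i)
+    if (qcoeff[i] != 0) eob = i + 1;
+
+  const bool txb_skip = (eob == 0);
+  write_symbol(txb_skip);  // stage 1: transform block skip flag
+  if (txb_skip) return;
+  write_symbol(eob);       // stage 1: end of block position
+
+  for (int i = eob - 1; i >= 0; --i) {
+    const int level = abs(qcoeff[i]);
+    write_symbol(level < 3 ? level : 3);  // stage 2: lower magnitude levels
+  }
+  for (int i = 0; i < eob; ++i) {
+    const int level = abs(qcoeff[i]);
+    if (level == 0) continue;
+    write_symbol(qcoeff[i] < 0);              // stage 3: sign
+    if (level >= 3) write_symbol(level - 3);  // stage 3: higher magnitude
+  }
+}
+~~~~~~~~~~~~~~~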
+
+\paragraph architecture_entropy_coef_context Context information
+To improve the compression efficiency, the encoder uses several context
+models tailored for transform coefficients to capture the correlations
+between coding symbols. Most of the context models are built to capture the
+correlations between the coefficients within the same transform block.
+However, the transform block skip flag (txb_skip) and the sign of the dc
+coefficient (dc_sign) require context info from neighboring transform blocks.
+
+Here is how context info spreads between transform blocks. Before coding a
+transform block, the encoder will use get_txb_ctx() to collect the context
+information from neighboring transform blocks. Then the context information
+will be used for coding the transform block skip flag (txb_skip) and the sign
+of the dc coefficient (dc_sign). After the transform block is coded, the
+encoder will extract the context info from the current block using
+\ref av1_get_txb_entropy_context(). Then the encoder will store the context
+info into a byte (uint8_t) using av1_set_entropy_contexts(). The encoder will
+use the context info to code other transform blocks.
+
+Related functions:
+- \ref av1_get_txb_entropy_context()
+- av1_set_entropy_contexts()
+- get_txb_ctx()
+- \ref av1_update_intra_mb_txb_context()
+
+\subsubsection architecture_entropy_coef_rd RD optimization
+Besides the actual entropy coding, the encoder uses several utility functions
+to make optimal RD decisions.
+
+\paragraph architecture_entropy_coef_cost Entropy cost
+The encoder uses \ref av1_cost_coeffs_txb() or \ref av1_cost_coeffs_txb_laplacian()
+to estimate the entropy cost of a transform block. Note that
+\ref av1_cost_coeffs_txb() is slower but more accurate, whereas
+\ref av1_cost_coeffs_txb_laplacian() is faster but less accurate.
+
+Related functions:
+- \ref av1_cost_coeffs_txb()
+- \ref av1_cost_coeffs_txb_laplacian()
+- \ref av1_cost_coeffs_txb_estimate()
+
+\paragraph architecture_entropy_coef_opt Quantized level optimization
+Besides computing the entropy cost, the encoder also uses \ref av1_optimize_txb()
+to adjust the quantized coefficient levels to achieve an optimal RD trade-off.
+In \ref av1_optimize_txb(), the encoder goes through each quantized
+coefficient and lowers the quantized coefficient level by one if the action
+yields a better RD score.
+
+Related functions:
+- \ref av1_optimize_txb()
+
+All the related functions are listed in \ref coefficient_coding.
+
+*/
+
+/*!\defgroup encoder_algo Encoder Algorithm
+ *
+ * The encoder algorithm describes how a sequence is encoded, including high
+ * level decisions as well as the algorithms used at every encoding stage.
+ */
+
+/*!\defgroup high_level_algo High-level Algorithm
+ * \ingroup encoder_algo
+ * This module describes the sequence level and frame level algorithms in AV1.
+ * More details will be added.
+ * @{
+ */
+
+/*!\defgroup speed_features Speed vs Quality Trade Off
+ * \ingroup high_level_algo
+ * This module describes the encode speed vs quality trade off.
+ * @{
+ */
+/*! @} - end defgroup speed_features */
+
+/*!\defgroup src_frame_proc Source Frame Processing
+ * \ingroup high_level_algo
+ * This module describes algorithms in AV1 associated with the
+ * pre-processing of source frames. See also \ref architecture_enc_src_proc
+ *
+ * @{
+ */
+/*! @} - end defgroup src_frame_proc */
+
+/*!\defgroup rate_control Rate Control
+ * \ingroup high_level_algo
+ * This module describes the rate control algorithm in AV1.
+ * See also \ref architecture_enc_rate_ctrl
+ * @{
+ */
+/*! @} - end defgroup rate_control */
+
+/*!\defgroup tpl_modelling Temporal Dependency Modelling
+ * \ingroup high_level_algo
+ * This module includes algorithms to implement temporal dependency modelling.
+ * See also \ref architecture_enc_tpl
+ * @{
+ */
+/*! @} - end defgroup tpl_modelling */
+
+/*!\defgroup two_pass_algo Two Pass Mode
+ \ingroup high_level_algo
+
+ In two pass mode, the input file is passed into the encoder for a quick
+ first pass, where statistics are gathered. These statistics and the input
+ file are then passed back into the encoder for a second pass. The statistics
+ help the encoder reach the desired bitrate without as much overshooting or
+ undershooting.
+
+ During the first pass, the codec will return "stats" packets that contain
+ information useful for the second pass. The caller should concatenate these
+ packets as they are received. In the second pass, the concatenated packets
+ are passed in, along with the frames to encode. During the second pass,
+ "frame" packets are returned that represent the compressed video.
+
+ A complete example can be found in `examples/twopass_encoder.c`. Pseudocode
+ is provided below to illustrate the core parts.
+
+ During the first pass, the uncompressed frames are passed in and stats
+ information is appended to a byte array.
+
+~~~~~~~~~~~~~~~{.c}
+// For simplicity, assume that there is enough memory in the stats buffer.
+// Actual code will want to use a resizable array. stats_len represents
+// the length of data already present in the buffer.
+void get_stats_data(aom_codec_ctx_t *encoder, char *stats,
+                    size_t *stats_len, bool *got_data) {
+  const aom_codec_cx_pkt_t *pkt;
+  aom_codec_iter_t iter = NULL;
+  while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
+    *got_data = true;
+    if (pkt->kind != AOM_CODEC_STATS_PKT) continue;
+    memcpy(stats + *stats_len, pkt->data.twopass_stats.buf,
+           pkt->data.twopass_stats.sz);
+    *stats_len += pkt->data.twopass_stats.sz;
+  }
+}
+
+void first_pass(char *stats, size_t *stats_len) {
+  struct aom_codec_enc_cfg first_pass_cfg;
+  ... // Initialize the config as needed.
+  first_pass_cfg.g_pass = AOM_RC_FIRST_PASS;
+  aom_codec_ctx_t first_pass_encoder;
+  ... // Initialize the encoder.
+
+  bool got_data = false;
+  while (frame_available) {
+    // Read in the uncompressed frame, update frame_available
+    aom_image_t *frame_to_encode = ...;
+    aom_codec_encode(&first_pass_encoder, frame_to_encode, pts, duration,
+                     flags);
+    get_stats_data(&first_pass_encoder, stats, stats_len, &got_data);
+  }
+  // After all frames have been processed, call aom_codec_encode with
+  // a NULL ptr repeatedly, until no more data is returned. The NULL
+  // ptr tells the encoder that no more frames are available.
+  do {
+    got_data = false;
+    aom_codec_encode(&first_pass_encoder, NULL, pts, duration, flags);
+    get_stats_data(&first_pass_encoder, stats, stats_len, &got_data);
+  } while (got_data);
+
+  aom_codec_destroy(&first_pass_encoder);
+}
+~~~~~~~~~~~~~~~
+
+ During the second pass, the uncompressed frames and the stats are
+ passed into the encoder.
+
+~~~~~~~~~~~~~~~{.c}
+// Write out each encoded frame to the file.
+void get_cx_data(aom_codec_ctx_t *encoder, FILE *file,
+                 bool *got_data) {
+  const aom_codec_cx_pkt_t *pkt;
+  aom_codec_iter_t iter = NULL;
+  while ((pkt = aom_codec_get_cx_data(encoder, &iter))) {
+    *got_data = true;
+    if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) continue;
+    fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, file);
+  }
+}
+
+void second_pass(char *stats, size_t stats_len) {
+  struct aom_codec_enc_cfg second_pass_cfg;
+  ... // Initialize the config as needed.
+  second_pass_cfg.g_pass = AOM_RC_LAST_PASS;
+  second_pass_cfg.rc_twopass_stats_in.buf = stats;
+  second_pass_cfg.rc_twopass_stats_in.sz = stats_len;
+  aom_codec_ctx_t second_pass_encoder;
+  ... // Initialize the encoder from the config.
+
+  FILE *output = fopen("output.obu", "wb");
+  bool got_data = false;
+  while (frame_available) {
+    // Read in the uncompressed frame, update frame_available
+    aom_image_t *frame_to_encode = ...;
+    aom_codec_encode(&second_pass_encoder, frame_to_encode, pts, duration,
+                     flags);
+    get_cx_data(&second_pass_encoder, output, &got_data);
+  }
+  // Pass in NULL to flush the encoder.
+  do {
+    got_data = false;
+    aom_codec_encode(&second_pass_encoder, NULL, pts, duration, flags);
+    get_cx_data(&second_pass_encoder, output, &got_data);
+  } while (got_data);
+
+  fclose(output);
+  aom_codec_destroy(&second_pass_encoder);
+}
+~~~~~~~~~~~~~~~
+ */
+
+ /*!\defgroup look_ahead_buffer The Look-Ahead Buffer
+ \ingroup high_level_algo
+
+ A program should call \ref aom_codec_encode() for each frame that needs
+ processing. These frames are internally copied and stored in a fixed-size
+ circular buffer, known as the look-ahead buffer. Other parts of the code
+ will use future frame information to inform current frame decisions;
+ examples include the first-pass algorithm, TPL model, and temporal filter.
+ Note that this buffer also keeps a reference to the last source frame.
+
+ The look-ahead buffer is defined in \ref av1/encoder/lookahead.h. It acts
+ as an opaque structure, with an interface to create and free memory
+ associated with it. It supports pushing frames onto and popping frames from
+ the structure in a FIFO fashion. It also allows look-ahead when using the
+ \ref av1_lookahead_peek() function with a non-negative number, and
+ look-behind when -1 is passed in (for the last source frame; e.g., firstpass
+ will use this for motion estimation). The \ref av1_lookahead_depth()
+ function returns the current number of frames stored in it. Note that
+ \ref av1_lookahead_pop() is a bit of a misnomer - it only pops if either the
+ "flush" variable is set, or the buffer is at maximum capacity.
+
+ The buffer is stored in the \ref AV1_PRIMARY::lookahead field.
+ It is initialized in the first call to \ref aom_codec_encode(), in the
+ \ref av1_receive_raw_frame() sub-routine. The buffer size is defined by the
+ \ref aom_codec_enc_cfg_t::g_lag_in_frames field of the encoder
+ configuration. This can be modified manually but should only be set once.
+ On the command line, the flag "--lag-in-frames" controls it. The default
+ size is 19 for non-realtime usage and 1 for realtime. Note that a maximum
+ value of 35 is enforced.
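+
+ As a minimal sketch of how this parameter is set through the public API
+ (the helper function name is arbitrary, all other configuration is omitted,
+ and the value 19 simply mirrors the default mentioned above):
+
+~~~~~~~~~~~~~~~{.c}
+#include <aom/aom_encoder.h>
+#include <aom/aomcx.h>
+
+// Configure the look-ahead depth before initializing the encoder.
+static aom_codec_err_t init_with_lag(aom_codec_ctx_t *encoder) {
+  aom_codec_iface_t *iface = aom_codec_av1_cx();
+  aom_codec_enc_cfg_t cfg;
+  const aom_codec_err_t res =
+      aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY);
+  if (res != AOM_CODEC_OK) return res;
+  cfg.g_lag_in_frames = 19;  // look-ahead depth; maximum 35, 1 for realtime
+  return aom_codec_enc_init(encoder, iface, &cfg, 0);
+}
+~~~~~~~~~~~~~~~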
+
+ A frame will stay in the buffer as long as possible. As mentioned above,
+ \ref av1_lookahead_pop() only removes a frame when either flush is set,
+ or the buffer is full. Note that each call to \ref aom_codec_encode() inserts
+ another frame into the buffer, and pop is called by the sub-function
+ \ref av1_encode_strategy(). The buffer is told to flush when
+ \ref aom_codec_encode() is passed a NULL image pointer. Note that the caller
+ must repeatedly call \ref aom_codec_encode() with a NULL image pointer, until
+ no more packets are available, in order to fully flush the buffer.
+
+ */
+
+/*! @} - end defgroup high_level_algo */
+
+/*!\defgroup partition_search Partition Search
+ * \ingroup encoder_algo
+ * For an overview of the partition search see \ref architecture_enc_partitions
+ * @{
+ */
+
+/*! @} - end defgroup partition_search */
+
+/*!\defgroup intra_mode_search Intra Mode Search
+ * \ingroup encoder_algo
+ * This module describes the intra mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup intra_mode_search */
+
+/*!\defgroup inter_mode_search Inter Mode Search
+ * \ingroup encoder_algo
+ * This module describes the inter mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup inter_mode_search */
+
+/*!\defgroup palette_mode_search Palette Mode Search
+ * \ingroup intra_mode_search
+ * This module describes the palette mode search algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup palette_mode_search */
+
+/*!\defgroup transform_search Transform Search
+ * \ingroup encoder_algo
+ * This module describes the transform search algorithm in AV1.
+ * @{
+ */
+/*! @} - end defgroup transform_search */
+
+/*!\defgroup coefficient_coding Transform Coefficient Coding and Optimization
+ * \ingroup encoder_algo
+ * This module describes the algorithms of transform coefficient coding and
+ * optimization in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup coefficient_coding */
+
+/*!\defgroup in_loop_filter In-loop Filter
+ * \ingroup encoder_algo
+ * This module describes the in-loop filter algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_filter */
+
+/*!\defgroup in_loop_cdef CDEF
+ * \ingroup encoder_algo
+ * This module describes the CDEF parameter search algorithm
+ * in AV1. More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_cdef */
+
+/*!\defgroup in_loop_restoration Loop Restoration
+ * \ingroup encoder_algo
+ * This module describes the loop restoration search
+ * and estimation algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup in_loop_restoration */
+
+/*!\defgroup cyclic_refresh Cyclic Refresh
+ * \ingroup encoder_algo
+ * This module describes the cyclic refresh (aq-mode=3) in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup cyclic_refresh */
+
+/*!\defgroup SVC Scalable Video Coding
+ * \ingroup encoder_algo
+ * This module describes the scalable video coding algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup SVC */
+
+/*!\defgroup variance_partition Variance Partition
+ * \ingroup encoder_algo
+ * This module describes the variance partition algorithm in AV1.
+ * More details will be added.
+ * @{
+ */
+/*! @} - end defgroup variance_partition */
+
+/*!\defgroup nonrd_mode_search NonRD Optimized Mode Search
+ * \ingroup encoder_algo
+ * This module describes the NonRD Optimized Mode Search used in Real-Time
+ * mode. More details will be added.
+ * @{
+ */
+/*! @} - end defgroup nonrd_mode_search */
diff --git a/third_party/aom/doc/dev_guide/av1encoderflow.png b/third_party/aom/doc/dev_guide/av1encoderflow.png
new file mode 100644
index 0000000000..5e69fce39c
Binary files /dev/null and b/third_party/aom/doc/dev_guide/av1encoderflow.png differ
diff --git a/third_party/aom/doc/dev_guide/av1partitions.png b/third_party/aom/doc/dev_guide/av1partitions.png
new file mode 100644
index 0000000000..125439f5cb
Binary files /dev/null and b/third_party/aom/doc/dev_guide/av1partitions.png differ
diff --git a/third_party/aom/doc/dev_guide/coeff_coding.png b/third_party/aom/doc/dev_guide/coeff_coding.png
new file mode 100644
index 0000000000..cba97dd712
Binary files /dev/null and b/third_party/aom/doc/dev_guide/coeff_coding.png differ
diff --git a/third_party/aom/doc/dev_guide/filter_flow.png b/third_party/aom/doc/dev_guide/filter_flow.png
new file mode 100644
index 0000000000..82849a0666
Binary files /dev/null and b/third_party/aom/doc/dev_guide/filter_flow.png differ
diff --git a/third_party/aom/doc/dev_guide/filter_thr.png b/third_party/aom/doc/dev_guide/filter_thr.png
new file mode 100644
index 0000000000..b833e941f6
Binary files /dev/null and b/third_party/aom/doc/dev_guide/filter_thr.png differ
diff --git a/third_party/aom/doc/dev_guide/genericcodecflow.png b/third_party/aom/doc/dev_guide/genericcodecflow.png
new file mode 100644
index 0000000000..65a6b2f19e
Binary files /dev/null and b/third_party/aom/doc/dev_guide/genericcodecflow.png differ
diff --git a/third_party/aom/doc/dev_guide/gf_group.png b/third_party/aom/doc/dev_guide/gf_group.png
new file mode 100644
index 0000000000..1cd47d2490
Binary files /dev/null and b/third_party/aom/doc/dev_guide/gf_group.png differ
diff --git a/third_party/aom/doc/dev_guide/partition.png b/third_party/aom/doc/dev_guide/partition.png
new file mode 100644
index 0000000000..914d6c2fd0
Binary files /dev/null and b/third_party/aom/doc/dev_guide/partition.png differ
diff --git a/third_party/aom/doc/dev_guide/tplgfgroupdiagram.png b/third_party/aom/doc/dev_guide/tplgfgroupdiagram.png
new file mode 100644
index 0000000000..fa5b0671c2
Binary files /dev/null and b/third_party/aom/doc/dev_guide/tplgfgroupdiagram.png differ
diff --git a/third_party/aom/doc/img/edge_direction.svg b/third_party/aom/doc/img/edge_direction.svg
new file mode 100644
index 0000000000..343a2b9f60
[SVG markup not reproduced: edge direction classification grid, directions d = 0 to d = 7]
diff --git a/third_party/aom/doc/img/equ_dir_search.svg b/third_party/aom/doc/img/equ_dir_search.svg
new file mode 100644
index 0000000000..3f14e3d95c
[SVG markup not reproduced: equation figure]
diff --git a/third_party/aom/doc/img/equ_dual_self_guided.svg b/third_party/aom/doc/img/equ_dual_self_guided.svg
new file mode 100644
index 0000000000..c936f46f46
[SVG markup not reproduced: equation figure]
diff --git a/third_party/aom/doc/img/equ_dual_self_para.svg b/third_party/aom/doc/img/equ_dual_self_para.svg
new file mode 100644
index 0000000000..d294bcae25
[SVG markup not reproduced: equation figure]
diff --git a/third_party/aom/doc/img/equ_edge_direction.svg b/third_party/aom/doc/img/equ_edge_direction.svg
new file mode 100644
index 0000000000..d36634db1b
[SVG markup not reproduced: equation figure]
diff --git a/third_party/aom/doc/img/equ_guided_filter.svg b/third_party/aom/doc/img/equ_guided_filter.svg
new file mode 100644
index 0000000000..021c194d7a
[SVG markup not reproduced: equation figure]
diff --git a/third_party/aom/doc/img/equ_wiener_filter.svg b/third_party/aom/doc/img/equ_wiener_filter.svg
new file mode 100644
index 0000000000..fcea1c8391
[SVG markup not reproduced: equation figure]
diff --git a/third_party/aom/doc/img/inter_motion_field.svg b/third_party/aom/doc/img/inter_motion_field.svg
new file mode 100644
index 0000000000..091ae11f35
[SVG markup not reproduced: motion field diagram with labels MVref, MV0, Current frame, Reference frame 0, Reference frame 1 (R1) and Reference frame of R1]
diff --git a/third_party/aom/doc/img/inter_obmc.svg b/third_party/aom/doc/img/inter_obmc.svg
new file mode 100644
index 0000000000..a69084b08e
[SVG markup not reproduced: OBMC diagram with blocks labelled 0 to 4]
diff --git a/third_party/aom/doc/img/inter_spatial_mvp.svg b/third_party/aom/doc/img/inter_spatial_mvp.svg
new file mode 100644
index 0000000000..aa2e88afe8
[SVG markup not reproduced]
diff --git a/third_party/aom/doc/img/inter_tmvp_positions.svg b/third_party/aom/doc/img/inter_tmvp_positions.svg
new file mode 100644
index 0000000000..87f8dfa80f
[SVG markup not reproduced: positions labelled B0 to B6]
diff --git a/third_party/aom/doc/img/inter_tx_partition.svg b/third_party/aom/doc/img/inter_tx_partition.svg
new file mode 100644
index 0000000000..6f853c65d3
[SVG markup not reproduced]
diff --git a/third_party/aom/doc/img/intra_cfl.svg b/third_party/aom/doc/img/intra_cfl.svg
new file mode 100644
index 0000000000..1153a2845e
[SVG markup not reproduced: CfL prediction flow with labels Sub-Sample, Average, Chroma DC Prediction, Scaling parameter alpha, Luma reconstructed samples, CfL Prediction and "AC" contribution]
diff --git a/third_party/aom/doc/img/intra_directional.svg b/third_party/aom/doc/img/intra_directional.svg
new file mode 100644
index 0000000000..3a08007a95
[SVG markup not reproduced: directional intra modes V_PRED, H_PRED, D45_PRED, D67_PRED, D113_PRED, D135_PRED, D157_PRED, D203_PRED with offsets -3 to +3]
diff --git a/third_party/aom/doc/img/intra_paeth.svg b/third_party/aom/doc/img/intra_paeth.svg
new file mode 100644
index 0000000000..f7a831febb
[SVG markup not reproduced: Paeth prediction with labels L, T, TL and Current Pixel]
diff --git a/third_party/aom/doc/img/intra_recursive.svg b/third_party/aom/doc/img/intra_recursive.svg
new file mode 100644
index 0000000000..adc4193169
[SVG markup not reproduced]
diff --git a/third_party/aom/doc/img/intra_tx_partition.svg b/third_party/aom/doc/img/intra_tx_partition.svg
new file mode 100644
index 0000000000..69575d4cd7
[SVG markup not reproduced]
diff --git a/third_party/aom/doc/img/loop_restoration.svg b/third_party/aom/doc/img/loop_restoration.svg
new file mode 100644
index 0000000000..cdeb76a871
[SVG markup not reproduced: loop restoration diagram with labels X, X1, X2, Y and Xr = X + alpha(X1 - X) + beta(X2 - X)]
diff --git a/third_party/aom/doc/img/partition_codingblock.svg b/third_party/aom/doc/img/partition_codingblock.svg
new file mode 100644
index 0000000000..872692dbd7
[SVG markup not reproduced: partition types PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT, PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_VERT_A, PARTITION_VERT_B, PARTITION_HORZ_4 and PARTITION_VERT_4]
diff --git a/third_party/aom/doc/img/primary_tap.svg b/third_party/aom/doc/img/primary_tap.svg
new file mode 100644
index 0000000000..8cd2a18134
[SVG markup not reproduced: primary filter tap weights a/16 and b/16 for directions d = 0 to d = 7]
diff --git a/third_party/aom/doc/img/quant_ac.svg b/third_party/aom/doc/img/quant_ac.svg
new file mode 100644
index 0000000000..3f589c8be6
[SVG markup not reproduced: plot of Qstep against Q_index for 8-bit AC, 10-bit AC and 12-bit AC]
diff --git a/third_party/aom/doc/img/quant_dc.svg b/third_party/aom/doc/img/quant_dc.svg
new file mode 100644
index 0000000000..4fda1084e1
[SVG markup not reproduced: plot of Qstep against Q_index for 8-bit DC, 10-bit DC and 12-bit DC]
diff --git a/third_party/aom/doc/img/scc_intrabc.svg b/third_party/aom/doc/img/scc_intrabc.svg
new file mode 100644
index 0000000000..dfe4948861
[SVG markup not reproduced: IntraBC diagram with labels Current processing block, Allowed prediction block and Restricted immediate blocks]
diff --git a/third_party/aom/doc/img/secondary_tap.svg b/third_party/aom/doc/img/secondary_tap.svg
new file mode 100644
index 0000000000..4c6283de36
--- /dev/null
+++ b/third_party/aom/doc/img/secondary_tap.svg
@@ -0,0 +1,857 @@
+ + + + + + + Square.49 + 1/16 + + + + + + + 1/16 + + Square.50 + + + + + + + Square.51 + 1/16 + + + + + + + 1/16 + + Square.52 + + + + + + + Square.53 + + + + + + + Square.54 + + + + + + + Square.55 + 1/16 + + + + + + + 1/16 + + Square.56 + + + + + + + Square.57 + + + + + + + Square.58 + + + + + + + Square.59 + 1/16 + + + + + + + 1/16 + + Square.60 + + + + + + + Square.61 + 2/16 + + + + + + + 2/16 + + Square.62 + + + + + + + Square.63 + 2/16 + + + + + + + 2/16 + + Square.64 + + + + + + + Square.65 + + + + + + + Square.66 + + + + + + + Square.67 + + + + + + + Square.68 + + + + + + + Square.69 + + + + + + + Square.70 + + + + + + + Square.71 + 2/16 + + + + + + + 2/16 + + Square.72 + + + + + + + Square.73 + 2/16 + + + + + + + 2/16 + + Square.74 + + + + + + + Square.75 + 1/16 + + + + + + + 1/16 + + Square.76 + + + + + + + Square.77 + + + + + + + Square.78 + + + + + + + Square.79 + 1/16 + + + + + + + 1/16 + + Square.80 + + + + + + + Square.81 + 1/16 + + + + + + + 1/16 + + Square.82 + + + + + + + Square.83 + + + + + + + Square.84 + + + + + + + Square.85 + + + + + + + Square.86 + + + + + + + Square.87 + 2/16 + + + + + + + 2/16 + + Square.88 + + + + + + + Square.89 + 1/16 + + + + + + + 1/16 + + Square.90 + + + + + + + Square.91 + 2/16 + + + + + + + 2/16 + + Square.92 + + + + + + + Square.93 + 2/16 + + + + + + + 2/16 + + Square.94 + + + + + + + Square.95 + 1/16 + + + + + + + 1/16 + + Square.96 + + + + + + + Square.97 + 2/16 + + + + + + + 2/16 + + Square.98 + + + + + + + Square.99 + + + + + + + Square.100 + + + + + + + Square.101 + + + + + + + Square.102 + + + + + + + Square.103 + 1/16 + + + + + + + 1/16 + + Square.104 + + + + + + + Sheet.236 + d = 0, 4 + + + + d = 0, 4 + + Sheet.237 + d = 1, 5 + + + + d = 1, 5 + + Sheet.238 + d = 2, 6 + + + + d = 2, 6 + + Sheet.239 + d = 3, 7 + + + + d = 3, 7 + + diff --git a/third_party/aom/doc/img/tx_basis.svg b/third_party/aom/doc/img/tx_basis.svg new file mode 100644 index 0000000000..eb27b0314b --- /dev/null +++ b/third_party/aom/doc/img/tx_basis.svg @@ -0,0 +1 @@ +tables2Asset 1Transform TypeBasis function Ti(j), i, j = 0, 1, , N-1DCT-2DST-4DST-7IDT()=02cos+1where 0==0?2:1()=2sin(2+1)+14()=42+1sin(2+1)+12===?1:0 \ No newline at end of file diff --git a/third_party/aom/doc/img/tx_cands_large.svg b/third_party/aom/doc/img/tx_cands_large.svg new file mode 100644 index 0000000000..fb4f5f49bf --- /dev/null +++ b/third_party/aom/doc/img/tx_cands_large.svg @@ -0,0 +1 @@ +tables2Asset 1Max(width, height)IntraInter32DCTOnlyDCTOnly, IDTX64DCTOnlyDCTOnly \ No newline at end of file diff --git a/third_party/aom/doc/img/tx_cands_small.svg b/third_party/aom/doc/img/tx_cands_small.svg new file mode 100644 index 0000000000..ddd9a87e53 --- /dev/null +++ b/third_party/aom/doc/img/tx_cands_small.svg @@ -0,0 +1 @@ +tx_cands_smallAsset 1Min(width, height)IntraInter4DTT4, IDTX, 1DDCTALL168DTT4, IDTX, 1DDCTALL1616DTT4, IDTXDTT9, IDTX, 1DDCT \ No newline at end of file diff --git a/third_party/aom/doc/img/tx_chroma.svg b/third_party/aom/doc/img/tx_chroma.svg new file mode 100644 index 0000000000..a0915e0031 --- /dev/null +++ b/third_party/aom/doc/img/tx_chroma.svg @@ -0,0 +1 @@ +tx_chromaAsset 1Intra Prediction ModeVertical transformHorizontal transformDC_PREDDCTDCTV_PREDADSTDCTH_PREDDCTADSTD45_PREDDCTDCTD135_PREDADSTADSTD113_PREDADSTDCTD157_PREDDCTADSTD203_PREDDCTADSTD67_PREDADSTDCTSMOOTH_PREDADSTADSTSMOOTH_V_PREDADSTDCTSMOOTH_H_PREDDCTADSTPAETH_PREDADSTADST \ No newline at end of file diff --git a/third_party/aom/doc/img/tx_partition.svg b/third_party/aom/doc/img/tx_partition.svg 
new file mode 100644 index 0000000000..e0ce50c507 --- /dev/null +++ b/third_party/aom/doc/img/tx_partition.svg @@ -0,0 +1 @@ +tables2Asset 1Transform size of current depthTransform size of next depthTX_4X4TX_4X4TX_8X8TX_4X4TX_16X16TX_8X8TX_32X32TX_16X16TX_64X64TX_32X32TX_4X8TX_4X4TX_8X4TX_4X4TX_8X16TX_8X8TX_16X8TX_8X8TX_16X32TX_16X16TX_32X16TX_16X16TX_32X64TX_32X32TX_64X32TX_32X32TX_4X16TX_4X8TX_16X4TX_8X4TX_8X32TX_8X16TX_32X8TX_16X8TX_16X64TX_16X32TX_64X16TX_32X16 \ No newline at end of file diff --git a/third_party/aom/doc/img/tx_set.svg b/third_party/aom/doc/img/tx_set.svg new file mode 100644 index 0000000000..dee10d4d93 --- /dev/null +++ b/third_party/aom/doc/img/tx_set.svg @@ -0,0 +1 @@ +tx_setAsset 1Transform setVertical transformHorizontal transformDCTOnlyDCTDCTIDTXIDTIDTDCT IDT IDTDCTADST ADST ADST DCT DCT ADST DCTDCTDCT DCT DCT ADST DCT Flipped ADST ADSTDCTADST ADST ADST Flipped ADST Flipped ADST DCT Flipped ADST ADST Flipped ADSTFlipped ADST DCT DCT DCT ADST DCT Flipped ADST DCT IDT ADST DCT ADST ADST ADST Flipped ADST ADST IDT Flipped ADST DCT Flipped ADST ADST Flipped ADST Flipped ADST Flipped ADST IDT IDT DCT IDT ADST IDT Flipped ADST IDT IDT 1DDCTDTT4DTT9ALL16 \ No newline at end of file diff --git a/third_party/aom/docs.cmake b/third_party/aom/docs.cmake new file mode 100644 index 0000000000..0d7b4cfde3 --- /dev/null +++ b/third_party/aom/docs.cmake @@ -0,0 +1,345 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_DOCS_CMAKE_) + return() +endif() # AOM_DOCS_CMAKE_ +set(AOM_DOCS_CMAKE_ 1) + +cmake_minimum_required(VERSION 3.5) + +set(AOM_DOXYFILE "${AOM_CONFIG_DIR}/doxyfile") +set(AOM_DOXYGEN_CONFIG_TEMPLATE "libs.doxy_template") +set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox") +set(AOM_DOXYGEN_SECTIONS "av1") + +set(AOM_DOXYGEN_SOURCES + "${AOM_ROOT}/aom/aom.h" + "${AOM_ROOT}/aom/aom_codec.h" + "${AOM_ROOT}/aom/aom_decoder.h" + "${AOM_ROOT}/aom/aom_encoder.h" + "${AOM_ROOT}/aom/aom_external_partition.h" + "${AOM_ROOT}/aom/aom_frame_buffer.h" + "${AOM_ROOT}/aom/aom_image.h" + "${AOM_ROOT}/aom/aom_integer.h" + "${AOM_ROOT}/av1/common/av1_common_int.h" + "${AOM_ROOT}/av1/common/av1_loopfilter.h" + "${AOM_ROOT}/av1/common/blockd.h" + "${AOM_ROOT}/av1/common/cdef.h" + "${AOM_ROOT}/av1/common/enums.h" + "${AOM_ROOT}/av1/common/restoration.h" + "${AOM_ROOT}/keywords.dox" + "${AOM_ROOT}/mainpage.dox" + "${AOM_ROOT}/usage.dox") + +if(CONFIG_AV1_DECODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/apps/aomdec.c" + "${AOM_ROOT}/examples/decode_to_md5.c" + "${AOM_ROOT}/examples/decode_with_drops.c" + "${AOM_ROOT}/examples/simple_decoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Full featured decoder." + "Frame by frame MD5 checksum." + "Drops frames while decoding." 
+ "Simplified decoder loop.") + + set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder") + + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomdx.h" + "${AOM_ROOT}/usage_dx.dox" + "${AOM_ROOT}/av1/decoder/decoder.h") + + if(CONFIG_ANALYZER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/analyzer.cc") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Bitstream analyzer.") + endif() + + if(CONFIG_INSPECTION) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/inspect.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Bitstream inspector.") + endif() + + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} + "${AOM_ROOT}/doc/dev_guide/av1_decoder.dox") +endif() + +if(CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/apps/aomenc.c" + "${AOM_ROOT}/examples/lossless_encoder.c" + "${AOM_ROOT}/examples/set_maps.c" + "${AOM_ROOT}/examples/simple_encoder.c" + "${AOM_ROOT}/examples/twopass_encoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Full featured encoder." + "Simplified lossless encoder." + "Set active and ROI maps." + "Simplified encoder loop." + "Two-pass encoder loop.") + + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/scalable_encoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Scalable encoder loop.") + + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/svc_encoder_rtc.cc") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Layered encoder for RTC.") + + set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_encoder encoder") + + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h" + "${AOM_ROOT}/usage_cx.dox") + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} + "${AOM_ROOT}/doc/dev_guide/av1_encoder.dox") + set(AOM_DOXYGEN_SOURCES + ${AOM_DOXYGEN_SOURCES} + "${AOM_ROOT}/aom_scale/yv12config.h" + "${AOM_ROOT}/av1/encoder/bitstream.h" + "${AOM_ROOT}/av1/encoder/block.h" + "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h" + "${AOM_ROOT}/av1/encoder/encode_strategy.c" + "${AOM_ROOT}/av1/encoder/encode_strategy.h" + "${AOM_ROOT}/av1/encoder/encodeframe.c" + "${AOM_ROOT}/av1/encoder/encoder.c" + "${AOM_ROOT}/av1/encoder/encoder.h" + "${AOM_ROOT}/av1/encoder/encodetxb.h" + "${AOM_ROOT}/av1/encoder/firstpass.h" + "${AOM_ROOT}/av1/encoder/gop_structure.h" + "${AOM_ROOT}/av1/encoder/interp_search.c" + "${AOM_ROOT}/av1/encoder/intra_mode_search.h" + "${AOM_ROOT}/av1/encoder/intra_mode_search.c" + "${AOM_ROOT}/av1/encoder/intra_mode_search_utils.h" + "${AOM_ROOT}/av1/encoder/lookahead.h" + "${AOM_ROOT}/av1/encoder/palette.h" + "${AOM_ROOT}/av1/encoder/palette.c" + "${AOM_ROOT}/av1/encoder/partition_search.h" + "${AOM_ROOT}/av1/encoder/partition_search.c" + "${AOM_ROOT}/av1/encoder/pass2_strategy.h" + "${AOM_ROOT}/av1/encoder/pass2_strategy.c" + "${AOM_ROOT}/av1/encoder/pickcdef.h" + "${AOM_ROOT}/av1/encoder/picklpf.h" + "${AOM_ROOT}/av1/encoder/pickrst.h" + "${AOM_ROOT}/av1/encoder/ratectrl.c" + "${AOM_ROOT}/av1/encoder/ratectrl.h" + "${AOM_ROOT}/av1/encoder/rc_utils.h" + "${AOM_ROOT}/av1/encoder/rdopt.h" + "${AOM_ROOT}/av1/encoder/rdopt.c" + "${AOM_ROOT}/av1/encoder/speed_features.h" + "${AOM_ROOT}/av1/encoder/svc_layercontext.c" + "${AOM_ROOT}/av1/encoder/svc_layercontext.h" + 
"${AOM_ROOT}/av1/encoder/temporal_filter.h" + "${AOM_ROOT}/av1/encoder/temporal_filter.c" + "${AOM_ROOT}/av1/encoder/tpl_model.h" + "${AOM_ROOT}/av1/encoder/tx_search.h" + "${AOM_ROOT}/av1/encoder/txb_rdopt.h" + "${AOM_ROOT}/av1/encoder/var_based_part.h" + "${AOM_ROOT}/av1/encoder/nonrd_opt.h" + "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c") +endif() + +if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/aom_cx_set_ref.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Set encoder reference frame.") +endif() + +if(CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_encoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield encoder example.") +endif() + +if(CONFIG_AV1_DECODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES + ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield tile list decoder example.") +endif() + +if(CONFIG_AV1_DECODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_decoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield decoder example.") +endif() + +if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES + ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield bitstream parsing example.") +endif() + +# Iterates over list named by $list_name and appends each item to $AOM_DOXYFILE +# as values assigned to $var_name with no line breaks between list items. +# Appends a new line after the entire config variable is expanded. +function(write_cmake_list_to_doxygen_config_var var_name list_name) + unset(output_string) + foreach(list_item ${${list_name}}) + set(output_string "${output_string} ${list_item} ") + endforeach() + string(STRIP "${output_string}" output_string) + file(APPEND "${AOM_DOXYFILE}" "${var_name} += ${output_string}\n") +endfunction() + +function(get_name file_path name_var) + get_filename_component(file_basename ${file_path} NAME) + get_filename_component(${name_var} ${file_basename} NAME_WE) + set(${name_var} ${${name_var}} PARENT_SCOPE) +endfunction() + +function(setup_documentation_targets) + + # Sanity check: the lengths of these lists must match. + list(LENGTH AOM_DOXYGEN_EXAMPLE_SOURCES num_sources) + list(LENGTH AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS num_descs) + if(NOT ${num_sources} EQUAL ${num_descs}) + message(FATAL_ERROR "Unequal example and description totals.") + endif() + + # Take the list of examples and produce example_basename.dox for each file in + # the list. 
+ file(MAKE_DIRECTORY "${AOM_DOXYGEN_OUTPUT_DIR}") + foreach(example_file ${AOM_DOXYGEN_EXAMPLE_SOURCES}) + unset(example_basename) + get_name("${example_file}" "example_name") + set(example_dox "${AOM_DOXYGEN_OUTPUT_DIR}/${example_name}.dox") + set(dox_string "/*!\\page example_${example_name} ${example_name}\n") + set(dox_string "${dox_string} \\includelineno ${example_file}\n*/\n") + file(WRITE "${example_dox}" ${dox_string}) + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${example_dox}") + endforeach() + + # Generate samples.dox, an index page that refers to the example_basename.dox + # files that were just created. + set(samples_header " +/*!\\page samples Sample Code +This SDK includes a number of sample applications. Each sample documents a +feature of the SDK in both prose and the associated C code. The following +samples are included: +") + + set(utils_desc " +In addition, the SDK contains a number of utilities. Since these utilities are +built upon the concepts described in the sample code listed above, they are not +documented in pieces like the samples are. Their source is included here for +reference. The following utilities are included: +") + + # Write the description for the samples section. + set(samples_dox "${AOM_CONFIG_DIR}/samples.dox") + file(WRITE "${samples_dox}" "${samples_header}\n") + + # Iterate over $AOM_DOXYGEN_EXAMPLE_SOURCES and + # $AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS and massage example names as required by + # AV1's doxygen setup. + math(EXPR max_example_index "${num_sources} - 1") + foreach(NUM RANGE ${max_example_index}) + list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${NUM} ex_name) + get_name("${ex_name}" "ex_name") + + # AV1's doxygen lists aomdec and aomenc as utils apart from the examples. + # Save the indexes for another pass. + if("${ex_name}" MATCHES "aomdec\|aomenc") + set(util_indexes "${util_indexes}" "${NUM}") + continue() + endif() + list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${NUM} ex_desc) + file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n") + endforeach() + + # Write the description and index for the utils. + file(APPEND "${samples_dox}" "${utils_desc}\n") + foreach(util_index ${util_indexes}) + list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${util_index} ex_name) + get_name("${ex_name}" "ex_name") + list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${util_index} ex_desc) + file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n") + endforeach() + file(APPEND "${samples_dox}" "*/") + + # Add $samples_dox to the doxygen inputs. + get_filename_component(samples_dox ${samples_dox} NAME) + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} ${samples_dox}) + + # There are issues to show Markdown file for old Doxygen version. Here, only + # enable Markdown support for 1.8.16 or newer. + if(${DOXYGEN_VERSION_VALUE} GREATER_EQUAL 1008016) + set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_md_support") + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/README.md") + # Uncomment and add AlgorithmDescription.md in result page when it is done. + # set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} + # "${AOM_ROOT}/doc/AlgorithmDescription.md") + endif() + + # Generate libaom's doxyfile. + file(WRITE "${AOM_DOXYFILE}" "##\n## GENERATED FILE. 
DO NOT EDIT\n##\n") + file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data) + file(APPEND "${AOM_DOXYFILE}" ${doxygen_template_data}) + file(APPEND "${AOM_DOXYFILE}" + "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n") + file(APPEND "${AOM_DOXYFILE}" + "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n") + file(APPEND "${AOM_DOXYFILE}" + "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n") + write_cmake_list_to_doxygen_config_var("INPUT" "AOM_DOXYGEN_SOURCES") + write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS" + "AOM_DOXYGEN_SECTIONS") + + # Add AOMedia logo. + set(aom_logo "aomedia_logo_200.png") + configure_file(${AOM_ROOT}/${aom_logo} ${AOM_CONFIG_DIR}/${aom_logo} COPYONLY) + file(APPEND "${AOM_DOXYFILE}" + "PROJECT_LOGO = ${AOM_CONFIG_DIR}/${aom_logo}\n") + + # Only set HAVE_DOT to YES if dot tool is found. + if(DOXYGEN_DOT_FOUND) + file(APPEND "${AOM_DOXYFILE}" "HAVE_DOT = YES\n") + file(APPEND "${AOM_DOXYFILE}" "DOT_GRAPH_MAX_NODES = 10000\n") + endif() + + # Add image path. + file(APPEND "${AOM_DOXYFILE}" "IMAGE_PATH += ${AOM_ROOT}/doc/dev_guide\n") + + # Allow banner style comments + file(APPEND "${AOM_DOXYFILE}" "JAVADOC_BANNER = YES") + + # Add the doxygen generation rule. + add_custom_target(docs ALL + COMMAND "${DOXYGEN_EXECUTABLE}" "${AOM_DOXYFILE}" + DEPENDS "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES} + ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_DOXYGEN_CONFIG_TEMPLATE}" + SOURCES "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES} + ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_DOXYGEN_CONFIG_TEMPLATE}") +endfunction() diff --git a/third_party/aom/examples/analyzer.cc b/third_party/aom/examples/analyzer.cc new file mode 100644 index 0000000000..501f5024db --- /dev/null +++ b/third_party/aom/examples/analyzer.cc @@ -0,0 +1,722 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "av1/common/av1_common_int.h" +#include "av1/decoder/accounting.h" +#include "av1/decoder/inspection.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +#define OD_SIGNMASK(a) (-((a) < 0)) +#define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b)) +#define OD_DIV_ROUND(x, y) (((x) + OD_FLIPSIGNI((y) >> 1, x)) / (y)) + +enum { + OD_LUMA_MASK = 1 << 0, + OD_CB_MASK = 1 << 1, + OD_CR_MASK = 1 << 2, + OD_ALL_MASK = OD_LUMA_MASK | OD_CB_MASK | OD_CR_MASK +}; + +class AV1Decoder { + private: + FILE *input; + wxString path; + + AvxVideoReader *reader; + const AvxVideoInfo *info; + + insp_frame_data frame_data; + + aom_codec_ctx_t codec; + bool show_padding; + + public: + aom_image_t *image; + int frame; + + int plane_mask; + + AV1Decoder(); + ~AV1Decoder(); + + bool open(const wxString &path); + void close(); + bool step(); + + int getWidthPadding() const; + int getHeightPadding() const; + void togglePadding(); + int getWidth() const; + int getHeight() const; + + bool getAccountingStruct(Accounting **acct); + bool setInspectionCallback(); + + static void inspect(void *decoder, void *data); +}; + +AV1Decoder::AV1Decoder() + : reader(NULL), info(NULL), decoder(NULL), show_padding(false), image(NULL), + frame(0) {} + +AV1Decoder::~AV1Decoder() {} + +void AV1Decoder::togglePadding() { show_padding = !show_padding; } + +bool AV1Decoder::open(const wxString &path) { + reader = aom_video_reader_open(path.mb_str()); + if (!reader) { + fprintf(stderr, "Failed to open %s for reading.", path.mb_str().data()); + return false; + } + this->path = path; + info = aom_video_reader_get_info(reader); + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) { + fprintf(stderr, "Unknown input codec."); + return false; + } + printf("Using %s\n", aom_codec_iface_name(decoder)); + if (aom_codec_dec_init(&codec, decoder, NULL, 0)) { + fprintf(stderr, "Failed to initialize decoder."); + return false; + } + ifd_init(&frame_data, info->frame_width, info->frame_height); + setInspectionCallback(); + return true; +} + +void AV1Decoder::close() {} + +bool AV1Decoder::step() { + if (aom_video_reader_read_frame(reader)) { + size_t frame_size; + const unsigned char *frame_data; + frame_data = aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame_data, frame_size, NULL)) { + fprintf(stderr, "Failed to decode frame."); + return false; + } else { + aom_codec_iter_t iter = NULL; + image = aom_codec_get_frame(&codec, &iter); + if (image != NULL) { + frame++; + return true; + } + return false; + } + } + return false; +} + +int AV1Decoder::getWidth() const { + return info->frame_width + 2 * getWidthPadding(); +} + +int AV1Decoder::getWidthPadding() const { + return show_padding ? AOMMAX(info->frame_width + 16, + ALIGN_POWER_OF_TWO(info->frame_width, 6)) - + info->frame_width + : 0; +} + +int AV1Decoder::getHeight() const { + return info->frame_height + 2 * getHeightPadding(); +} + +int AV1Decoder::getHeightPadding() const { + return show_padding ? 
AOMMAX(info->frame_height + 16, + ALIGN_POWER_OF_TWO(info->frame_height, 6)) - + info->frame_height + : 0; +} + +bool AV1Decoder::getAccountingStruct(Accounting **accounting) { + return aom_codec_control(&codec, AV1_GET_ACCOUNTING, accounting) == + AOM_CODEC_OK; +} + +bool AV1Decoder::setInspectionCallback() { + aom_inspect_init ii; + ii.inspect_cb = AV1Decoder::inspect; + ii.inspect_ctx = (void *)this; + return aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii) == + AOM_CODEC_OK; +} + +void AV1Decoder::inspect(void *pbi, void *data) { + AV1Decoder *decoder = (AV1Decoder *)data; + ifd_inspect(&decoder->frame_data, pbi, 0); +} + +#define MIN_ZOOM (1) +#define MAX_ZOOM (4) + +class AnalyzerPanel : public wxPanel { + DECLARE_EVENT_TABLE() + + private: + AV1Decoder decoder; + const wxString path; + + int zoom; + unsigned char *pixels; + + const bool bit_accounting; + double *bpp_q3; + + int plane_mask; + + // The display size is the decode size, scaled by the zoom. + int getDisplayWidth() const; + int getDisplayHeight() const; + + bool updateDisplaySize(); + + void computeBitsPerPixel(); + + public: + AnalyzerPanel(wxWindow *parent, const wxString &path, + const bool bit_accounting); + ~AnalyzerPanel(); + + bool open(const wxString &path); + void close(); + void render(); + void togglePadding(); + bool nextFrame(); + void refresh(); + + int getZoom() const; + bool setZoom(int zoom); + + void setShowPlane(bool show_plane, int mask); + + void onPaint(wxPaintEvent &event); // NOLINT +}; + +BEGIN_EVENT_TABLE(AnalyzerPanel, wxPanel) +EVT_PAINT(AnalyzerPanel::onPaint) +END_EVENT_TABLE() + +AnalyzerPanel::AnalyzerPanel(wxWindow *parent, const wxString &path, + const bool bit_accounting) + : wxPanel(parent), path(path), zoom(0), pixels(NULL), + bit_accounting(bit_accounting), bpp_q3(NULL), plane_mask(OD_ALL_MASK) {} + +AnalyzerPanel::~AnalyzerPanel() { close(); } + +void AnalyzerPanel::setShowPlane(bool show_plane, int mask) { + if (show_plane) { + plane_mask |= mask; + } else { + plane_mask &= ~mask; + } +} + +void AnalyzerPanel::render() { + aom_image_t *img = decoder.image; + const int hbd = !!(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH); + int y_stride = img->stride[0] >> hbd; + int cb_stride = img->stride[1] >> hbd; + int cr_stride = img->stride[2] >> hbd; + int p_stride = 3 * getDisplayWidth(); + unsigned char *y_row = img->planes[0]; + unsigned char *cb_row = img->planes[1]; + unsigned char *cr_row = img->planes[2]; + uint16_t *y_row16 = reinterpret_cast(y_row); + uint16_t *cb_row16 = reinterpret_cast(cb_row); + uint16_t *cr_row16 = reinterpret_cast(cr_row); + unsigned char *p_row = pixels; + int y_width_padding = decoder.getWidthPadding(); + int cb_width_padding = y_width_padding >> 1; + int cr_width_padding = y_width_padding >> 1; + int y_height_padding = decoder.getHeightPadding(); + int cb_height_padding = y_height_padding >> 1; + int cr_height_padding = y_height_padding >> 1; + for (int j = 0; j < decoder.getHeight(); j++) { + unsigned char *y = y_row - y_stride * y_height_padding; + unsigned char *cb = cb_row - cb_stride * cb_height_padding; + unsigned char *cr = cr_row - cr_stride * cr_height_padding; + uint16_t *y16 = y_row16 - y_stride * y_height_padding; + uint16_t *cb16 = cb_row16 - cb_stride * cb_height_padding; + uint16_t *cr16 = cr_row16 - cr_stride * cr_height_padding; + unsigned char *p = p_row; + for (int i = 0; i < decoder.getWidth(); i++) { + int64_t yval; + int64_t cbval; + int64_t crval; + int pmask; + unsigned rval; + unsigned gval; + unsigned bval; + if (hbd) { + yval = 
*(y16 - y_width_padding); + cbval = *(cb16 - cb_width_padding); + crval = *(cr16 - cr_width_padding); + } else { + yval = *(y - y_width_padding); + cbval = *(cb - cb_width_padding); + crval = *(cr - cr_width_padding); + } + pmask = plane_mask; + if (pmask & OD_LUMA_MASK) { + yval -= 16; + } else { + yval = 128; + } + cbval = ((pmask & OD_CB_MASK) >> 1) * (cbval - 128); + crval = ((pmask & OD_CR_MASK) >> 2) * (crval - 128); + /*This is intentionally slow and very accurate.*/ + rval = OD_CLAMPI( + 0, + (int32_t)OD_DIV_ROUND( + 2916394880000LL * yval + 4490222169144LL * crval, 9745792000LL), + 65535); + gval = OD_CLAMPI(0, + (int32_t)OD_DIV_ROUND(2916394880000LL * yval - + 534117096223LL * cbval - + 1334761232047LL * crval, + 9745792000LL), + 65535); + bval = OD_CLAMPI( + 0, + (int32_t)OD_DIV_ROUND( + 2916394880000LL * yval + 5290866304968LL * cbval, 9745792000LL), + 65535); + unsigned char *px_row = p; + for (int v = 0; v < zoom; v++) { + unsigned char *px = px_row; + for (int u = 0; u < zoom; u++) { + *(px + 0) = (unsigned char)(rval >> 8); + *(px + 1) = (unsigned char)(gval >> 8); + *(px + 2) = (unsigned char)(bval >> 8); + px += 3; + } + px_row += p_stride; + } + if (hbd) { + int dc = ((y16 - y_row16) & 1) | (1 - img->x_chroma_shift); + y16++; + cb16 += dc; + cr16 += dc; + } else { + int dc = ((y - y_row) & 1) | (1 - img->x_chroma_shift); + y++; + cb += dc; + cr += dc; + } + p += zoom * 3; + } + int dc = -((j & 1) | (1 - img->y_chroma_shift)); + if (hbd) { + y_row16 += y_stride; + cb_row16 += dc & cb_stride; + cr_row16 += dc & cr_stride; + } else { + y_row += y_stride; + cb_row += dc & cb_stride; + cr_row += dc & cr_stride; + } + p_row += zoom * p_stride; + } +} + +void AnalyzerPanel::computeBitsPerPixel() { + Accounting *acct; + double bpp_total; + int totals_q3[MAX_SYMBOL_TYPES] = { 0 }; + int sym_count[MAX_SYMBOL_TYPES] = { 0 }; + decoder.getAccountingStruct(&acct); + for (int j = 0; j < decoder.getHeight(); j++) { + for (int i = 0; i < decoder.getWidth(); i++) { + bpp_q3[j * decoder.getWidth() + i] = 0.0; + } + } + bpp_total = 0; + for (int i = 0; i < acct->syms.num_syms; i++) { + AccountingSymbol *s; + s = &acct->syms.syms[i]; + totals_q3[s->id] += s->bits; + sym_count[s->id] += s->samples; + } + printf("=== Frame: %-3i ===\n", decoder.frame - 1); + for (int i = 0; i < acct->syms.dictionary.num_strs; i++) { + if (totals_q3[i]) { + printf("%30s = %10.3f (%f bit/symbol)\n", acct->syms.dictionary.strs[i], + (float)totals_q3[i] / 8, (float)totals_q3[i] / 8 / sym_count[i]); + } + } + printf("\n"); +} + +void AnalyzerPanel::togglePadding() { + decoder.togglePadding(); + updateDisplaySize(); +} + +bool AnalyzerPanel::nextFrame() { + if (decoder.step()) { + refresh(); + return true; + } + return false; +} + +void AnalyzerPanel::refresh() { + if (bit_accounting) { + computeBitsPerPixel(); + } + render(); +} + +int AnalyzerPanel::getDisplayWidth() const { return zoom * decoder.getWidth(); } + +int AnalyzerPanel::getDisplayHeight() const { + return zoom * decoder.getHeight(); +} + +bool AnalyzerPanel::updateDisplaySize() { + unsigned char *p = (unsigned char *)malloc( + sizeof(*p) * 3 * getDisplayWidth() * getDisplayHeight()); + if (p == NULL) { + return false; + } + free(pixels); + pixels = p; + SetSize(getDisplayWidth(), getDisplayHeight()); + return true; +} + +bool AnalyzerPanel::open(const wxString &path) { + if (!decoder.open(path)) { + return false; + } + if (!setZoom(MIN_ZOOM)) { + return false; + } + if (bit_accounting) { + bpp_q3 = (double *)malloc(sizeof(*bpp_q3) * decoder.getWidth() 
* + decoder.getHeight()); + if (bpp_q3 == NULL) { + fprintf(stderr, "Could not allocate memory for bit accounting\n"); + close(); + return false; + } + } + if (!nextFrame()) { + close(); + return false; + } + SetFocus(); + return true; +} + +void AnalyzerPanel::close() { + decoder.close(); + free(pixels); + pixels = NULL; + free(bpp_q3); + bpp_q3 = NULL; +} + +int AnalyzerPanel::getZoom() const { return zoom; } + +bool AnalyzerPanel::setZoom(int z) { + if (z <= MAX_ZOOM && z >= MIN_ZOOM && zoom != z) { + int old_zoom = zoom; + zoom = z; + if (!updateDisplaySize()) { + zoom = old_zoom; + return false; + } + return true; + } + return false; +} + +void AnalyzerPanel::onPaint(wxPaintEvent &) { + wxBitmap bmp(wxImage(getDisplayWidth(), getDisplayHeight(), pixels, true)); + wxBufferedPaintDC dc(this, bmp); +} + +class AnalyzerFrame : public wxFrame { + DECLARE_EVENT_TABLE() + + private: + AnalyzerPanel *panel; + const bool bit_accounting; + + wxMenu *fileMenu; + wxMenu *viewMenu; + wxMenu *playbackMenu; + + public: + AnalyzerFrame(const bool bit_accounting); // NOLINT + + void onOpen(wxCommandEvent &event); // NOLINT + void onClose(wxCommandEvent &event); // NOLINT + void onQuit(wxCommandEvent &event); // NOLINT + + void onTogglePadding(wxCommandEvent &event); // NOLINT + void onZoomIn(wxCommandEvent &event); // NOLINT + void onZoomOut(wxCommandEvent &event); // NOLINT + void onActualSize(wxCommandEvent &event); // NOLINT + + void onToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT + void onResetAndToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT + + void onNextFrame(wxCommandEvent &event); // NOLINT + void onGotoFrame(wxCommandEvent &event); // NOLINT + void onRestart(wxCommandEvent &event); // NOLINT + + void onAbout(wxCommandEvent &event); // NOLINT + + bool open(const wxString &path); + bool setZoom(int zoom); + void updateViewMenu(); +}; + +enum { + wxID_NEXT_FRAME = 6000, + wxID_SHOW_Y, + wxID_SHOW_U, + wxID_SHOW_V, + wxID_GOTO_FRAME, + wxID_RESTART, + wxID_ACTUAL_SIZE, + wxID_PADDING +}; + +BEGIN_EVENT_TABLE(AnalyzerFrame, wxFrame) +EVT_MENU(wxID_OPEN, AnalyzerFrame::onOpen) +EVT_MENU(wxID_CLOSE, AnalyzerFrame::onClose) +EVT_MENU(wxID_EXIT, AnalyzerFrame::onQuit) +EVT_MENU(wxID_PADDING, AnalyzerFrame::onTogglePadding) +EVT_MENU(wxID_ZOOM_IN, AnalyzerFrame::onZoomIn) +EVT_MENU(wxID_ZOOM_OUT, AnalyzerFrame::onZoomOut) +EVT_MENU(wxID_ACTUAL_SIZE, AnalyzerFrame::onActualSize) +EVT_MENU(wxID_SHOW_Y, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) +EVT_MENU(wxID_SHOW_U, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) +EVT_MENU(wxID_SHOW_V, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) +EVT_MENU(wxID_NEXT_FRAME, AnalyzerFrame::onNextFrame) +EVT_MENU(wxID_GOTO_FRAME, AnalyzerFrame::onGotoFrame) +EVT_MENU(wxID_RESTART, AnalyzerFrame::onRestart) +EVT_MENU(wxID_ABOUT, AnalyzerFrame::onAbout) +END_EVENT_TABLE() + +AnalyzerFrame::AnalyzerFrame(const bool bit_accounting) + : wxFrame(NULL, wxID_ANY, _("AV1 Stream Analyzer"), wxDefaultPosition, + wxDefaultSize, wxDEFAULT_FRAME_STYLE), + panel(NULL), bit_accounting(bit_accounting) { + wxMenuBar *mb = new wxMenuBar(); + + fileMenu = new wxMenu(); + fileMenu->Append(wxID_OPEN, _("&Open...\tCtrl-O"), _("Open AV1 file")); + fileMenu->Append(wxID_CLOSE, _("&Close\tCtrl-W"), _("Close AV1 file")); + fileMenu->Enable(wxID_CLOSE, false); + fileMenu->Append(wxID_EXIT, _("E&xit\tCtrl-Q"), _("Quit this program")); + mb->Append(fileMenu, _("&File")); + + wxAcceleratorEntry entries[2]; + entries[0].Set(wxACCEL_CTRL, (int)'=', wxID_ZOOM_IN); + 
entries[1].Set(wxACCEL_CTRL | wxACCEL_SHIFT, (int)'-', wxID_ZOOM_OUT); + wxAcceleratorTable accel(2, entries); + this->SetAcceleratorTable(accel); + + viewMenu = new wxMenu(); + +viewMenu->Append(wxID_PADDING, _("Toggle padding\tCtrl-p"), + _("Show padding")); + viewMenu->Append(wxID_ZOOM_IN, _("Zoom-In\tCtrl-+"), _("Double image size")); + viewMenu->Append(wxID_ZOOM_OUT, _("Zoom-Out\tCtrl--"), _("Half image size")); + viewMenu->Append(wxID_ACTUAL_SIZE, _("Actual size\tCtrl-0"), + _("Actual size of the frame")); + viewMenu->AppendSeparator(); + viewMenu->AppendCheckItem(wxID_SHOW_Y, _("&Y plane\tCtrl-Y"), + _("Show Y plane")); + viewMenu->AppendCheckItem(wxID_SHOW_U, _("&U plane\tCtrl-U"), + _("Show U plane")); + viewMenu->AppendCheckItem(wxID_SHOW_V, _("&V plane\tCtrl-V"), + _("Show V plane")); + mb->Append(viewMenu, _("&View")); + + playbackMenu = new wxMenu(); + playbackMenu->Append(wxID_NEXT_FRAME, _("Next frame\tCtrl-."), + _("Go to next frame")); + /*playbackMenu->Append(wxID_RESTART, _("&Restart\tCtrl-R"), + _("Set video to frame 0")); + playbackMenu->Append(wxID_GOTO_FRAME, _("Jump to Frame\tCtrl-J"), + _("Go to frame number"));*/ + mb->Append(playbackMenu, _("&Playback")); + + wxMenu *helpMenu = new wxMenu(); + helpMenu->Append(wxID_ABOUT, _("&About...\tF1"), _("Show about dialog")); + mb->Append(helpMenu, _("&Help")); + + SetMenuBar(mb); + + CreateStatusBar(1); +} + +void AnalyzerFrame::onOpen(wxCommandEvent &WXUNUSED(event)) { + wxFileDialog openFileDialog(this, _("Open file"), wxEmptyString, + wxEmptyString, _("AV1 files (*.ivf)|*.ivf"), + wxFD_OPEN | wxFD_FILE_MUST_EXIST); + if (openFileDialog.ShowModal() != wxID_CANCEL) { + open(openFileDialog.GetPath()); + } +} + +void AnalyzerFrame::onClose(wxCommandEvent &WXUNUSED(event)) {} + +void AnalyzerFrame::onQuit(wxCommandEvent &WXUNUSED(event)) { Close(true); } + +void AnalyzerFrame::onTogglePadding(wxCommandEvent &WXUNUSED(event)) { + panel->togglePadding(); + SetClientSize(panel->GetSize()); + panel->render(); + panel->Refresh(); +} + +void AnalyzerFrame::onZoomIn(wxCommandEvent &WXUNUSED(event)) { + setZoom(panel->getZoom() + 1); +} + +void AnalyzerFrame::onZoomOut(wxCommandEvent &WXUNUSED(event)) { + setZoom(panel->getZoom() - 1); +} + +void AnalyzerFrame::onActualSize(wxCommandEvent &WXUNUSED(event)) { + setZoom(MIN_ZOOM); +} + +void AnalyzerFrame::onToggleViewMenuCheckBox(wxCommandEvent &event) { // NOLINT + GetMenuBar()->Check(event.GetId(), event.IsChecked()); + updateViewMenu(); +} + +void AnalyzerFrame::onResetAndToggleViewMenuCheckBox( + wxCommandEvent &event) { // NOLINT + int id = event.GetId(); + if (id != wxID_SHOW_Y && id != wxID_SHOW_U && id != wxID_SHOW_V) { + GetMenuBar()->Check(wxID_SHOW_Y, true); + GetMenuBar()->Check(wxID_SHOW_U, true); + GetMenuBar()->Check(wxID_SHOW_V, true); + } + onToggleViewMenuCheckBox(event); +} + +void AnalyzerFrame::onNextFrame(wxCommandEvent &WXUNUSED(event)) { + panel->nextFrame(); + panel->Refresh(false); +} + +void AnalyzerFrame::onGotoFrame(wxCommandEvent &WXUNUSED(event)) {} + +void AnalyzerFrame::onRestart(wxCommandEvent &WXUNUSED(event)) {} + +void AnalyzerFrame::onAbout(wxCommandEvent &WXUNUSED(event)) { + wxAboutDialogInfo info; + info.SetName(_("AV1 Bitstream Analyzer")); + info.SetVersion(_("0.1-beta")); + info.SetDescription( + _("This program implements a bitstream analyzer for AV1")); + info.SetCopyright( + wxT("(C) 2017 Alliance for Open Media ")); + wxAboutBox(info); +} + +bool AnalyzerFrame::open(const wxString &path) { + panel = new AnalyzerPanel(this, path, 
bit_accounting); + if (panel->open(path)) { + SetClientSize(panel->GetSize()); + return true; + } else { + delete panel; + return false; + } +} + +bool AnalyzerFrame::setZoom(int zoom) { + if (panel->setZoom(zoom)) { + GetMenuBar()->Enable(wxID_ACTUAL_SIZE, zoom != MIN_ZOOM); + GetMenuBar()->Enable(wxID_ZOOM_IN, zoom != MAX_ZOOM); + GetMenuBar()->Enable(wxID_ZOOM_OUT, zoom != MIN_ZOOM); + SetClientSize(panel->GetSize()); + panel->render(); + panel->Refresh(); + return true; + } + return false; +} + +void AnalyzerFrame::updateViewMenu() { + panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_Y), OD_LUMA_MASK); + panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_U), OD_CB_MASK); + panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_V), OD_CR_MASK); + SetClientSize(panel->GetSize()); + panel->render(); + panel->Refresh(false); +} + +class Analyzer : public wxApp { + private: + AnalyzerFrame *frame; + + public: + void OnInitCmdLine(wxCmdLineParser &parser); // NOLINT + bool OnCmdLineParsed(wxCmdLineParser &parser); // NOLINT +}; + +static const wxCmdLineEntryDesc CMD_LINE_DESC[] = { + { wxCMD_LINE_SWITCH, _("h"), _("help"), _("Display this help and exit."), + wxCMD_LINE_VAL_NONE, wxCMD_LINE_OPTION_HELP }, + { wxCMD_LINE_SWITCH, _("a"), _("bit-accounting"), _("Enable bit accounting"), + wxCMD_LINE_VAL_NONE, wxCMD_LINE_PARAM_OPTIONAL }, + { wxCMD_LINE_PARAM, NULL, NULL, _("input.ivf"), wxCMD_LINE_VAL_STRING, + wxCMD_LINE_PARAM_OPTIONAL }, + { wxCMD_LINE_NONE } +}; + +void Analyzer::OnInitCmdLine(wxCmdLineParser &parser) { // NOLINT + parser.SetDesc(CMD_LINE_DESC); + parser.SetSwitchChars(_("-")); +} + +bool Analyzer::OnCmdLineParsed(wxCmdLineParser &parser) { // NOLINT + bool bit_accounting = parser.Found(_("a")); + if (bit_accounting && !CONFIG_ACCOUNTING) { + fprintf(stderr, + "Bit accounting support not found. " + "Recompile with:\n./cmake -DCONFIG_ACCOUNTING=1\n"); + return false; + } + frame = new AnalyzerFrame(parser.Found(_("a"))); + frame->Show(); + if (parser.GetParamCount() > 0) { + return frame->open(parser.GetParam(0)); + } + return true; +} + +void usage_exit(void) { + fprintf(stderr, "uhh\n"); + exit(EXIT_FAILURE); +} + +IMPLEMENT_APP(Analyzer) diff --git a/third_party/aom/examples/aom_cx_set_ref.c b/third_party/aom/examples/aom_cx_set_ref.c new file mode 100644 index 0000000000..b7fb7bce45 --- /dev/null +++ b/third_party/aom/examples/aom_cx_set_ref.c @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// AV1 Set Reference Frame +// ============================ +// +// This is an example demonstrating how to overwrite the AV1 encoder's +// internal reference frame. In the sample we set the last frame to the +// current frame. This technique could be used to bounce between two cameras. +// +// The decoder would also have to set the reference frame to the same value +// on the same frame, or the video will become corrupt. The 'test_decode' +// variable is set to 1 in this example that tests if the encoder and decoder +// results are matching. 
+// +// Usage +// ----- +// This example encodes a raw video. And the last argument passed in specifies +// the frame number to update the reference frame on. For example, run +// examples/aom_cx_set_ref av1 352 288 in.yuv out.ivf 4 30 +// The parameter is parsed as follows: +// +// +// Extra Variables +// --------------- +// This example maintains the frame number passed on the command line +// in the `update_frame_num` variable. +// +// +// Configuration +// ------------- +// +// The reference frame is updated on the frame specified on the command +// line. +// +// Observing The Effects +// --------------------- +// The encoder and decoder results should be matching when the same reference +// frame setting operation is done in both encoder and decoder. Otherwise, +// the encoder/decoder mismatch would be seen. + +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "aom_scale/yv12config.h" +#include "common/tools_common.h" +#include "common/video_writer.h" +#include "examples/encoder_util.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s " + " \n", + exec_name); + exit(EXIT_FAILURE); +} + +static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder, + unsigned int frame_out, int *mismatch_seen) { + aom_image_t enc_img, dec_img; + + if (*mismatch_seen) return; + + /* Get the internal reference frame */ + if (aom_codec_control(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img)) + die_codec(encoder, "Failed to get encoder reference frame"); + if (aom_codec_control(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img)) + die_codec(decoder, "Failed to get decoder reference frame"); + + if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t enc_hbd_img; + aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); + enc_img = enc_hbd_img; + } + if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t dec_hbd_img; + aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); + dec_img = dec_hbd_img; + } + } + + if (!aom_compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + aom_find_mismatch(&enc_img, &dec_img, y, u, v); + } + + printf( + "Encode/decode mismatch on frame %u at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); + *mismatch_seen = 1; + } + + aom_img_free(&enc_img); + aom_img_free(&dec_img); +} + +static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img, + unsigned int frame_in, AvxVideoWriter *writer, + int test_decode, aom_codec_ctx_t *dcodec, + unsigned int *frame_out, int *mismatch_seen, + aom_image_t *ext_ref) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + int got_data; + const aom_codec_err_t res = aom_codec_encode(ecodec, img, frame_in, 1, 0); + if (res != AOM_CODEC_OK) die_codec(ecodec, "Failed to encode frame"); + + got_data = 0; + + while ((pkt = aom_codec_get_cx_data(ecodec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = 
(pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + + ++*frame_out; + + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(ecodec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + got_data = 1; + + // Decode 1 frame. + if (test_decode) { + if (aom_codec_decode(dcodec, pkt->data.frame.buf, + (unsigned int)pkt->data.frame.sz, NULL)) + die_codec(dcodec, "Failed to decode frame."); + + // Copy out first decoded frame, and use it as reference later. + if (*frame_out == 1 && ext_ref != NULL) + if (aom_codec_control(dcodec, AV1_COPY_NEW_FRAME_IMAGE, ext_ref)) + die_codec(dcodec, "Failed to get decoder new frame"); + } + } + } + + // Mismatch checking + if (got_data && test_decode) { + testing_decode(ecodec, dcodec, *frame_out, mismatch_seen); + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + // Encoder + aom_codec_ctx_t ecodec; + aom_codec_enc_cfg_t cfg; + unsigned int frame_in = 0; + aom_image_t raw; + aom_image_t raw_shift; + aom_image_t ext_ref; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + int flags = 0; + int allocated_raw_shift = 0; + aom_img_fmt_t raw_fmt = AOM_IMG_FMT_I420; + aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; + + // Test encoder/decoder mismatch. + int test_decode = 1; + // Decoder + aom_codec_ctx_t dcodec; + unsigned int frame_out = 0; + + // The frame number to set reference frame on + unsigned int update_frame_num = 0; + int mismatch_seen = 0; + + const int fps = 30; + const int bitrate = 500; + + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile_arg = NULL; + const char *outfile_arg = NULL; + const char *update_frame_num_arg = NULL; + unsigned int limit = 0; + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers. + memset(&ecodec, 0, sizeof(ecodec)); + memset(&cfg, 0, sizeof(cfg)); + memset(&info, 0, sizeof(info)); + + if (argc < 7) die("Invalid number of arguments"); + + codec_arg = argv[1]; + width_arg = argv[2]; + height_arg = argv[3]; + infile_arg = argv[4]; + outfile_arg = argv[5]; + update_frame_num_arg = argv[6]; + + aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0); + // In AV1, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are + // allocated while calling aom_codec_encode(), thus, setting reference for + // 1st frame isn't supported. + if (update_frame_num <= 1) { + die("Couldn't parse frame number '%s'\n", update_frame_num_arg); + } + + if (argc > 7) { + limit = (unsigned int)strtoul(argv[7], NULL, 0); + if (update_frame_num > limit) + die("Update frame number couldn't larger than limit\n"); + } + + info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + // In this test, the bit depth of input video is 8-bit, and the input format + // is AOM_IMG_FMT_I420. 
+ if (!aom_img_alloc(&raw, raw_fmt, info.frame_width, info.frame_height, 32)) { + die("Failed to allocate image."); + } + + if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + // Allocate memory with the border so that it can be used as a reference. + if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width, + info.frame_height, 32, 8, + AOM_DEC_BORDER_IN_PIXELS)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", aom_codec_iface_name(encoder)); + +#if CONFIG_REALTIME_ONLY + res = aom_codec_enc_config_default(encoder, &cfg, 1); +#else + res = aom_codec_enc_config_default(encoder, &cfg, 0); +#endif + if (res) die_codec(&ecodec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + cfg.g_lag_in_frames = 3; + cfg.g_bit_depth = AOM_BITS_8; + + flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING) + ? AOM_CODEC_USE_HIGHBITDEPTH + : 0; + + writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", outfile_arg); + + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading.", infile_arg); + + if (aom_codec_enc_init(&ecodec, encoder, &cfg, flags)) + die("Failed to initialize encoder"); + + // Disable alt_ref. + if (aom_codec_control(&ecodec, AOME_SET_ENABLEAUTOALTREF, 0)) + die_codec(&ecodec, "Failed to set enable auto alt ref"); + + if (test_decode) { + aom_codec_iface_t *decoder = get_aom_decoder_by_short_name(codec_arg); + if (aom_codec_dec_init(&dcodec, decoder, NULL, 0)) + die("Failed to initialize decoder."); + } + + // Encode frames. + while (aom_img_read(&raw, infile)) { + if (limit && frame_in >= limit) break; + aom_image_t *frame_to_encode; + + if (FORCE_HIGHBITDEPTH_DECODING) { + // Need to allocate larger buffer to use hbd internal. + int input_shift = 0; + if (!allocated_raw_shift) { + aom_img_alloc(&raw_shift, raw_fmt | AOM_IMG_FMT_HIGHBITDEPTH, + info.frame_width, info.frame_height, 32); + allocated_raw_shift = 1; + } + aom_img_upshift(&raw_shift, &raw, input_shift); + frame_to_encode = &raw_shift; + } else { + frame_to_encode = &raw; + } + + if (update_frame_num > 1 && frame_out + 1 == update_frame_num) { + av1_ref_frame_t ref; + ref.idx = 0; + ref.use_external_ref = 0; + ref.img = ext_ref; + // Set reference frame in encoder. + if (aom_codec_control(&ecodec, AV1_SET_REFERENCE, &ref)) + die_codec(&ecodec, "Failed to set encoder reference frame"); + printf(" "); + +#if CONFIG_REALTIME_ONLY + // Set cpu speed in encoder. + if (aom_codec_control(&ecodec, AOME_SET_CPUUSED, 7)) + die_codec(&ecodec, "Failed to set cpu speed"); +#endif + + // If set_reference in decoder is commented out, the enc/dec mismatch + // would be seen. + if (test_decode) { + ref.use_external_ref = 1; + if (aom_codec_control(&dcodec, AV1_SET_REFERENCE, &ref)) + die_codec(&dcodec, "Failed to set decoder reference frame"); + } + } + + encode_frame(&ecodec, frame_to_encode, frame_in, writer, test_decode, + &dcodec, &frame_out, &mismatch_seen, &ext_ref); + frame_in++; + if (mismatch_seen) break; + } + + // Flush encoder. 
+ if (!mismatch_seen) + while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec, + &frame_out, &mismatch_seen, NULL)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %u frames.\n", frame_out); + + if (test_decode) { + if (!mismatch_seen) + printf("Encoder/decoder results are matching.\n"); + else + printf("Encoder/decoder results are NOT matching.\n"); + } + + if (test_decode) + if (aom_codec_destroy(&dcodec)) + die_codec(&dcodec, "Failed to destroy decoder"); + + if (allocated_raw_shift) aom_img_free(&raw_shift); + aom_img_free(&ext_ref); + aom_img_free(&raw); + if (aom_codec_destroy(&ecodec)) + die_codec(&ecodec, "Failed to destroy encoder."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/av1_dec_fuzzer.cc b/third_party/aom/examples/av1_dec_fuzzer.cc new file mode 100644 index 0000000000..9b9a0b9cb6 --- /dev/null +++ b/third_party/aom/examples/av1_dec_fuzzer.cc @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* + * See build_av1_dec_fuzzer.sh for building instructions. + */ + +#include +#include +#include +#include +#include +#include +#include "config/aom_config.h" +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "aom_ports/mem_ops.h" + +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FILE_HDR_SZ 32 + +extern "C" void usage_exit(void) { exit(EXIT_FAILURE); } + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size <= IVF_FILE_HDR_SZ) { + return 0; + } + + aom_codec_iface_t *codec_interface = aom_codec_av1_dx(); + aom_codec_ctx_t codec; + // Set thread count in the range [1, 64]. 
+ const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1; + aom_codec_dec_cfg_t cfg = { threads, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; + if (aom_codec_dec_init(&codec, codec_interface, &cfg, 0)) { + return 0; + } + + data += IVF_FILE_HDR_SZ; + size -= IVF_FILE_HDR_SZ; + + while (size > IVF_FRAME_HDR_SZ) { + size_t frame_size = mem_get_le32(data); + size -= IVF_FRAME_HDR_SZ; + data += IVF_FRAME_HDR_SZ; + frame_size = std::min(size, frame_size); + + const aom_codec_err_t err = + aom_codec_decode(&codec, data, frame_size, nullptr); + static_cast(err); + aom_codec_iter_t iter = nullptr; + aom_image_t *img = nullptr; + while ((img = aom_codec_get_frame(&codec, &iter)) != nullptr) { + } + data += frame_size; + size -= frame_size; + } + aom_codec_destroy(&codec); + return 0; +} diff --git a/third_party/aom/examples/av1_dec_fuzzer.dict b/third_party/aom/examples/av1_dec_fuzzer.dict new file mode 100644 index 0000000000..fb1638864c --- /dev/null +++ b/third_party/aom/examples/av1_dec_fuzzer.dict @@ -0,0 +1,5 @@ +# IVF Signature + version (bytes 0-5) +kw1="DKIF\x00\x00" + +# AV1 codec fourCC (bytes 8-11) +kw2="AV01" diff --git a/third_party/aom/examples/build_av1_dec_fuzzer.sh b/third_party/aom/examples/build_av1_dec_fuzzer.sh new file mode 100755 index 0000000000..40355ea133 --- /dev/null +++ b/third_party/aom/examples/build_av1_dec_fuzzer.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Copyright (c) 2019, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# +############################################################################### +# Fuzzer for libaom decoder. +# ========================== +# Requirements +# --------------------- +# Clang6.0 or above (must support -fsanitize=fuzzer -fsanitize=fuzzer-no-link) +# +# References: +# --------------------- +# http://llvm.org/docs/LibFuzzer.html +# https://github.com/google/oss-fuzz +# +# Steps to build / run +# --------------------- + +set -eu + +# Have a copy of AOM and a build directory ready. +if [[ $# -ne 2 ]]; then + echo "Pass in the AOM source tree as first argument, and a build directory " + echo "as the second argument. The AOM source tree can be obtained via: " + echo " git clone https://aomedia.googlesource.com/aom" + exit 2 +fi +if [[ -z "${CC:-}" ]]; then + echo "Set the CC environment variable to point to your C compiler." + exit 2 +fi +if [[ -z "${CXX:-}" ]]; then + echo "Set the CXX environment variable to point to your C++ compiler." + exit 2 +fi + +AOM_DIR=$1 +BUILD_DIR=$2 +# Run CMake with address sanitizer enabled and build the codec. +# Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows +# in the transform functions. Also set memory limits. 
+EXTRA_C_FLAGS='-UNDEBUG -DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824' +cd "${BUILD_DIR}" +cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \ + -DFORCE_HIGHBITDEPTH_DECODING=0 \ + -DCONFIG_AV1_ENCODER=0 -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 \ + -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \ + -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \ + -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=fuzzer-no-link,address + +# Build the codec. +make -j$(nproc) + +# Build the av1 fuzzer +$CXX -std=c++11 -I${AOM_DIR} -I${BUILD_DIR} \ + -g -fsanitize=fuzzer,address \ + ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \ + ${BUILD_DIR}/libaom.a + +echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer." +echo "Create a corpus directory, copy IVF files in there, and run:" +echo " av1_dec_fuzzer CORPUS_DIR" diff --git a/third_party/aom/examples/decode_to_md5.c b/third_party/aom/examples/decode_to_md5.c new file mode 100644 index 0000000000..07f788ff97 --- /dev/null +++ b/third_party/aom/examples/decode_to_md5.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Frame-by-frame MD5 Checksum +// =========================== +// +// This example builds upon the simple decoder loop to show how checksums +// of the decoded output can be generated. These are used for validating +// decoder implementations against the reference implementation, for example. +// +// MD5 algorithm +// ------------- +// The Message-Digest 5 (MD5) is a well known hash function. We have provided +// an implementation derived from the RSA Data Security, Inc. MD5 Message-Digest +// Algorithm for your use. Our implmentation only changes the interface of this +// reference code. You must include the `md5_utils.h` header for access to these +// functions. +// +// Processing The Decoded Data +// --------------------------- +// Each row of the image is passed to the MD5 accumulator. First the Y plane +// is processed, then U, then V. It is important to honor the image's `stride` +// values. + +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "common/md5_utils.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static void get_image_md5(const aom_image_t *img, unsigned char digest[16]) { + int plane, y; + MD5Context md5; + + MD5Init(&md5); + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = plane ? (img->d_w + 1) >> 1 : img->d_w; + const int h = plane ? 
(img->d_h + 1) >> 1 : img->d_h; + + for (y = 0; y < h; ++y) { + MD5Update(&md5, buf, w); + buf += stride; + } + } + + MD5Final(digest, &md5); +} + +static void print_md5(FILE *stream, unsigned char digest[16]) { + int i; + + for (i = 0; i < 16; ++i) fprintf(stream, "%02x", digest[i]); +} + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + AvxVideoReader *reader = NULL; + const AvxVideoInfo *info = NULL; + + exec_name = argv[0]; + + if (argc != 3) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + info = aom_video_reader_get_info(reader); + + aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", aom_codec_iface_name(decoder)); + + aom_codec_ctx_t codec; + if (aom_codec_dec_init(&codec, decoder, NULL, 0)) + die("Failed to initialize decoder"); + + while (aom_video_reader_read_frame(reader)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame"); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + unsigned char digest[16]; + + get_image_md5(img, digest); + print_md5(outfile, digest); + fprintf(outfile, " img-%ux%u-%04d.i420\n", img->d_w, img->d_h, + ++frame_cnt); + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_reader_close(reader); + + fclose(outfile); + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/decode_with_drops.c b/third_party/aom/examples/decode_with_drops.c new file mode 100644 index 0000000000..9bec6ee2df --- /dev/null +++ b/third_party/aom/examples/decode_with_drops.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Decode With Drops Example +// ========================= +// +// This is an example utility which drops a series of frames, as specified +// on the command line. This is useful for observing the error recovery +// features of the codec. +// +// Usage +// ----- +// This example adds a single argument to the `simple_decoder` example, +// which specifies the range or pattern of frames to drop. The parameter is +// parsed as follows: +// +// Dropping A Range Of Frames +// -------------------------- +// To drop a range of frames, specify the starting frame and the ending +// frame to drop, separated by a dash. The following command will drop +// frames 5 through 10 (base 1). 
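+// Both endpoints are inclusive, so six frames are dropped in this example.
+// In the progress output below, each decoded frame prints '.' and each
+// dropped frame prints 'X'.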
+// +// $ ./decode_with_drops in.ivf out.i420 5-10 +// +// +// Dropping A Pattern Of Frames +// ---------------------------- +// To drop a pattern of frames, specify the number of frames to drop and +// the number of frames after which to repeat the pattern, separated by +// a forward-slash. The following command will drop 3 of 7 frames. +// Specifically, it will decode 4 frames, then drop 3 frames, and then +// repeat. +// +// $ ./decode_with_drops in.ivf out.i420 3/7 +// +// +// Extra Variables +// --------------- +// This example maintains the pattern passed on the command line in the +// `n`, `m`, and `is_range` variables: +// +// +// Making The Drop Decision +// ------------------------ +// The example decides whether to drop the frame based on the current +// frame number, immediately before decoding the frame. + +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + AvxVideoReader *reader = NULL; + const AvxVideoInfo *info = NULL; + int n = 0; + int m = 0; + int is_range = 0; + char *nptr = NULL; + + exec_name = argv[0]; + + if (argc != 4) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + n = (int)strtol(argv[3], &nptr, 0); + m = (int)strtol(nptr + 1, NULL, 0); + is_range = (*nptr == '-'); + if (!n || !m || (*nptr != '-' && *nptr != '/')) + die("Couldn't parse pattern %s.\n", argv[3]); + + info = aom_video_reader_get_info(reader); + + aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", aom_codec_iface_name(decoder)); + aom_codec_ctx_t codec; + if (aom_codec_dec_init(&codec, decoder, NULL, 0)) + die("Failed to initialize decoder."); + + while (aom_video_reader_read_frame(reader)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + size_t frame_size = 0; + int skip; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + ++frame_cnt; + + skip = (is_range && frame_cnt >= n && frame_cnt <= m) || + (!is_range && m - (frame_cnt - 1) % m <= n); + + if (!skip) { + putc('.', stdout); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) + aom_img_write(img, outfile); + } else { + putc('X', stdout); + } + + fflush(stdout); + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", + info->frame_width, info->frame_height, argv[2]); + + aom_video_reader_close(reader); + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/encoder_util.c b/third_party/aom/examples/encoder_util.c new file mode 100644 index 0000000000..e43b372506 --- /dev/null +++ b/third_party/aom/examples/encoder_util.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Utility functions used by encoder binaries. + +#include "examples/encoder_util.h" + +#include +#include + +#include "aom/aom_integer.h" + +#define mmin(a, b) ((a) < (b) ? (a) : (b)) + +static void find_mismatch_plane(const aom_image_t *const img1, + const aom_image_t *const img2, int plane, + int use_highbitdepth, int loc[4]) { + const unsigned char *const p1 = img1->planes[plane]; + const int p1_stride = img1->stride[plane] >> use_highbitdepth; + const unsigned char *const p2 = img2->planes[plane]; + const int p2_stride = img2->stride[plane] >> use_highbitdepth; + const uint32_t bsize = 64; + const int is_y_plane = (plane == AOM_PLANE_Y); + const uint32_t bsizex = is_y_plane ? bsize : bsize >> img1->x_chroma_shift; + const uint32_t bsizey = is_y_plane ? bsize : bsize >> img1->y_chroma_shift; + const uint32_t c_w = + is_y_plane ? img1->d_w + : (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + is_y_plane ? img1->d_h + : (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + assert(img1->d_w == img2->d_w && img1->d_h == img2->d_h); + assert(img1->x_chroma_shift == img2->x_chroma_shift && + img1->y_chroma_shift == img2->y_chroma_shift); + loc[0] = loc[1] = loc[2] = loc[3] = -1; + if (img1->monochrome && img2->monochrome && plane) return; + int match = 1; + uint32_t i, j; + for (i = 0; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + const int si = + is_y_plane ? mmin(i + bsizey, c_h) - i : mmin(i + bsizey, c_h - i); + const int sj = + is_y_plane ? mmin(j + bsizex, c_w) - j : mmin(j + bsizex, c_w - j); + int k, l; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + const int row = i + k; + const int col = j + l; + const int offset1 = row * p1_stride + col; + const int offset2 = row * p2_stride + col; + const int val1 = use_highbitdepth + ? p1[2 * offset1] | (p1[2 * offset1 + 1] << 8) + : p1[offset1]; + const int val2 = use_highbitdepth + ? 
p2[2 * offset2] | (p2[2 * offset2 + 1] << 8) + : p2[offset2]; + if (val1 != val2) { + loc[0] = row; + loc[1] = col; + loc[2] = val1; + loc[3] = val2; + match = 0; + break; + } + } + } + } + } +} + +static void find_mismatch_helper(const aom_image_t *const img1, + const aom_image_t *const img2, + int use_highbitdepth, int yloc[4], int uloc[4], + int vloc[4]) { + find_mismatch_plane(img1, img2, AOM_PLANE_Y, use_highbitdepth, yloc); + find_mismatch_plane(img1, img2, AOM_PLANE_U, use_highbitdepth, uloc); + find_mismatch_plane(img1, img2, AOM_PLANE_V, use_highbitdepth, vloc); +} + +void aom_find_mismatch_high(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]) { + find_mismatch_helper(img1, img2, 1, yloc, uloc, vloc); +} + +void aom_find_mismatch(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]) { + find_mismatch_helper(img1, img2, 0, yloc, uloc, vloc); +} + +int aom_compare_img(const aom_image_t *const img1, + const aom_image_t *const img2) { + assert(img1->cp == img2->cp); + assert(img1->tc == img2->tc); + assert(img1->mc == img2->mc); + assert(img1->monochrome == img2->monochrome); + + int num_planes = img1->monochrome ? 1 : 3; + + uint32_t l_w = img1->d_w; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + + match &= (img1->fmt == img2->fmt); + match &= (img1->d_w == img2->d_w); + match &= (img1->d_h == img2->d_h); + if (img1->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + l_w *= 2; + c_w *= 2; + } + + for (int plane = 0; plane < num_planes; ++plane) { + uint32_t height = plane ? c_h : img1->d_h; + uint32_t width = plane ? c_w : l_w; + + for (uint32_t i = 0; i < height; ++i) { + match &= + (memcmp(img1->planes[plane] + i * img1->stride[plane], + img2->planes[plane] + i * img2->stride[plane], width) == 0); + } + } + + return match; +} diff --git a/third_party/aom/examples/encoder_util.h b/third_party/aom/examples/encoder_util.h new file mode 100644 index 0000000000..fa0e7d1880 --- /dev/null +++ b/third_party/aom/examples/encoder_util.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Utility functions used by encoder binaries. + +#ifndef AOM_EXAMPLES_ENCODER_UTIL_H_ +#define AOM_EXAMPLES_ENCODER_UTIL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_image.h" + +// Returns mismatch location (?loc[0],?loc[1]) and the values at that location +// in img1 (?loc[2]) and img2 (?loc[3]). +void aom_find_mismatch_high(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]); + +void aom_find_mismatch(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]); + +// Returns 1 if the two images match. 
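+// The comparison covers pixel format and display dimensions, then memcmp()s
+// each row of every plane (only the Y plane for monochrome images); row
+// widths are doubled for high-bit-depth formats to cover both bytes per
+// sample.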
+int aom_compare_img(const aom_image_t *const img1, + const aom_image_t *const img2); + +#ifdef __cplusplus +} +#endif +#endif // AOM_EXAMPLES_ENCODER_UTIL_H_ diff --git a/third_party/aom/examples/inspect.c b/third_party/aom/examples/inspect.c new file mode 100644 index 0000000000..e285be0209 --- /dev/null +++ b/third_party/aom/examples/inspect.c @@ -0,0 +1,963 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Inspect Decoder +// ================ +// +// This is a simple decoder loop that writes JSON stats to stdout. This tool +// can also be compiled with Emscripten and used as a library. + +#include +#include +#include + +#ifdef __EMSCRIPTEN__ +#include +#else +#define EMSCRIPTEN_KEEPALIVE +#endif + +#include "config/aom_config.h" + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "av1/common/av1_common_int.h" + +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#include "av1/decoder/inspection.h" +#include "common/args.h" +#include "common/tools_common.h" +#include "common/video_common.h" +#include "common/video_reader.h" + +// Max JSON buffer size. +const int MAX_BUFFER = 1024 * 1024 * 256; + +typedef enum { + ACCOUNTING_LAYER = 1, + BLOCK_SIZE_LAYER = 1 << 1, + TRANSFORM_SIZE_LAYER = 1 << 2, + TRANSFORM_TYPE_LAYER = 1 << 3, + MODE_LAYER = 1 << 4, + SKIP_LAYER = 1 << 5, + FILTER_LAYER = 1 << 6, + CDEF_LAYER = 1 << 7, + REFERENCE_FRAME_LAYER = 1 << 8, + MOTION_VECTORS_LAYER = 1 << 9, + UV_MODE_LAYER = 1 << 10, + CFL_LAYER = 1 << 11, + DUAL_FILTER_LAYER = 1 << 12, + Q_INDEX_LAYER = 1 << 13, + SEGMENT_ID_LAYER = 1 << 14, + MOTION_MODE_LAYER = 1 << 15, + COMPOUND_TYPE_LAYER = 1 << 16, + INTRABC_LAYER = 1 << 17, + PALETTE_LAYER = 1 << 18, + UV_PALETTE_LAYER = 1 << 19, + ALL_LAYERS = (1 << 20) - 1 +} LayerType; + +static LayerType layers = 0; + +static int stop_after = 0; +static int compress = 0; + +static const arg_def_t limit_arg = + ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames"); +static const arg_def_t dump_all_arg = ARG_DEF("A", "all", 0, "Dump All"); +static const arg_def_t compress_arg = + ARG_DEF("x", "compress", 0, "Compress JSON using RLE"); +static const arg_def_t dump_accounting_arg = + ARG_DEF("a", "accounting", 0, "Dump Accounting"); +static const arg_def_t dump_block_size_arg = + ARG_DEF("bs", "blockSize", 0, "Dump Block Size"); +static const arg_def_t dump_motion_vectors_arg = + ARG_DEF("mv", "motionVectors", 0, "Dump Motion Vectors"); +static const arg_def_t dump_transform_size_arg = + ARG_DEF("ts", "transformSize", 0, "Dump Transform Size"); +static const arg_def_t dump_transform_type_arg = + ARG_DEF("tt", "transformType", 0, "Dump Transform Type"); +static const arg_def_t dump_mode_arg = ARG_DEF("m", "mode", 0, "Dump Mode"); +static const arg_def_t dump_motion_mode_arg = + ARG_DEF("mm", "motion_mode", 0, "Dump Motion Modes"); +static const arg_def_t dump_compound_type_arg = + ARG_DEF("ct", "compound_type", 0, "Dump Compound Types"); +static const arg_def_t dump_uv_mode_arg = + ARG_DEF("uvm", "uv_mode", 0, "Dump UV Intra Prediction 
Modes"); +static const arg_def_t dump_skip_arg = ARG_DEF("s", "skip", 0, "Dump Skip"); +static const arg_def_t dump_filter_arg = + ARG_DEF("f", "filter", 0, "Dump Filter"); +static const arg_def_t dump_cdef_arg = ARG_DEF("c", "cdef", 0, "Dump CDEF"); +static const arg_def_t dump_cfl_arg = + ARG_DEF("cfl", "chroma_from_luma", 0, "Dump Chroma from Luma Alphas"); +static const arg_def_t dump_dual_filter_type_arg = + ARG_DEF("df", "dualFilterType", 0, "Dump Dual Filter Type"); +static const arg_def_t dump_reference_frame_arg = + ARG_DEF("r", "referenceFrame", 0, "Dump Reference Frame"); +static const arg_def_t dump_delta_q_arg = + ARG_DEF("dq", "delta_q", 0, "Dump QIndex"); +static const arg_def_t dump_seg_id_arg = + ARG_DEF("si", "seg_id", 0, "Dump Segment ID"); +static const arg_def_t dump_intrabc_arg = + ARG_DEF("ibc", "intrabc", 0, "Dump If IntraBC Is Used"); +static const arg_def_t dump_palette_arg = + ARG_DEF("plt", "palette", 0, "Dump Palette Size"); +static const arg_def_t dump_uv_palette_arg = + ARG_DEF("uvp", "uv_palette", 0, "Dump UV Palette Size"); +static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help"); +static const arg_def_t skip_non_transform_arg = ARG_DEF( + "snt", "skip_non_transform", 1, "Skip is counted as a non transform."); +static const arg_def_t combined_arg = + ARG_DEF("comb", "combined", 1, "combinining parameters into one output."); + +int combined_parm_list[15]; +int combined_parm_count = 0; + +static const arg_def_t *main_args[] = { &limit_arg, + &dump_all_arg, + &compress_arg, +#if CONFIG_ACCOUNTING + &dump_accounting_arg, +#endif + &dump_block_size_arg, + &dump_transform_size_arg, + &dump_transform_type_arg, + &dump_mode_arg, + &dump_uv_mode_arg, + &dump_motion_mode_arg, + &dump_compound_type_arg, + &dump_skip_arg, + &dump_filter_arg, + &dump_cdef_arg, + &dump_dual_filter_type_arg, + &dump_cfl_arg, + &dump_reference_frame_arg, + &dump_motion_vectors_arg, + &dump_delta_q_arg, + &dump_seg_id_arg, + &dump_intrabc_arg, + &dump_palette_arg, + &dump_uv_palette_arg, + &usage_arg, + &skip_non_transform_arg, + &combined_arg, + NULL }; +#define ENUM(name) \ + { #name, name } +#define LAST_ENUM \ + { NULL, 0 } +typedef struct map_entry { + const char *name; + int value; +} map_entry; + +const map_entry refs_map[] = { + ENUM(INTRA_FRAME), ENUM(LAST_FRAME), ENUM(LAST2_FRAME), + ENUM(LAST3_FRAME), ENUM(GOLDEN_FRAME), ENUM(BWDREF_FRAME), + ENUM(ALTREF2_FRAME), ENUM(ALTREF_FRAME), LAST_ENUM +}; + +const map_entry block_size_map[] = { + ENUM(BLOCK_4X4), ENUM(BLOCK_4X8), ENUM(BLOCK_8X4), + ENUM(BLOCK_8X8), ENUM(BLOCK_8X16), ENUM(BLOCK_16X8), + ENUM(BLOCK_16X16), ENUM(BLOCK_16X32), ENUM(BLOCK_32X16), + ENUM(BLOCK_32X32), ENUM(BLOCK_32X64), ENUM(BLOCK_64X32), + ENUM(BLOCK_64X64), ENUM(BLOCK_64X128), ENUM(BLOCK_128X64), + ENUM(BLOCK_128X128), ENUM(BLOCK_4X16), ENUM(BLOCK_16X4), + ENUM(BLOCK_8X32), ENUM(BLOCK_32X8), ENUM(BLOCK_16X64), + ENUM(BLOCK_64X16), LAST_ENUM +}; + +#define TX_SKIP -1 + +const map_entry tx_size_map[] = { + ENUM(TX_4X4), ENUM(TX_8X8), ENUM(TX_16X16), ENUM(TX_32X32), + ENUM(TX_64X64), ENUM(TX_4X8), ENUM(TX_8X4), ENUM(TX_8X16), + ENUM(TX_16X8), ENUM(TX_16X32), ENUM(TX_32X16), ENUM(TX_32X64), + ENUM(TX_64X32), ENUM(TX_4X16), ENUM(TX_16X4), ENUM(TX_8X32), + ENUM(TX_32X8), ENUM(TX_16X64), ENUM(TX_64X16), LAST_ENUM +}; + +const map_entry tx_type_map[] = { ENUM(DCT_DCT), + ENUM(ADST_DCT), + ENUM(DCT_ADST), + ENUM(ADST_ADST), + ENUM(FLIPADST_DCT), + ENUM(DCT_FLIPADST), + ENUM(FLIPADST_FLIPADST), + ENUM(ADST_FLIPADST), + ENUM(FLIPADST_ADST), + ENUM(IDTX), + 
ENUM(V_DCT), + ENUM(H_DCT), + ENUM(V_ADST), + ENUM(H_ADST), + ENUM(V_FLIPADST), + ENUM(H_FLIPADST), + LAST_ENUM }; +const map_entry dual_filter_map[] = { ENUM(REG_REG), ENUM(REG_SMOOTH), + ENUM(REG_SHARP), ENUM(SMOOTH_REG), + ENUM(SMOOTH_SMOOTH), ENUM(SMOOTH_SHARP), + ENUM(SHARP_REG), ENUM(SHARP_SMOOTH), + ENUM(SHARP_SHARP), LAST_ENUM }; + +const map_entry prediction_mode_map[] = { + ENUM(DC_PRED), ENUM(V_PRED), ENUM(H_PRED), + ENUM(D45_PRED), ENUM(D135_PRED), ENUM(D113_PRED), + ENUM(D157_PRED), ENUM(D203_PRED), ENUM(D67_PRED), + ENUM(SMOOTH_PRED), ENUM(SMOOTH_V_PRED), ENUM(SMOOTH_H_PRED), + ENUM(PAETH_PRED), ENUM(NEARESTMV), ENUM(NEARMV), + ENUM(GLOBALMV), ENUM(NEWMV), ENUM(NEAREST_NEARESTMV), + ENUM(NEAR_NEARMV), ENUM(NEAREST_NEWMV), ENUM(NEW_NEARESTMV), + ENUM(NEAR_NEWMV), ENUM(NEW_NEARMV), ENUM(GLOBAL_GLOBALMV), + ENUM(NEW_NEWMV), ENUM(INTRA_INVALID), LAST_ENUM +}; + +const map_entry motion_mode_map[] = { ENUM(SIMPLE_TRANSLATION), + ENUM(OBMC_CAUSAL), // 2-sided OBMC + ENUM(WARPED_CAUSAL), // 2-sided WARPED + LAST_ENUM }; + +const map_entry compound_type_map[] = { ENUM(COMPOUND_AVERAGE), + ENUM(COMPOUND_WEDGE), + ENUM(COMPOUND_DIFFWTD), LAST_ENUM }; + +const map_entry uv_prediction_mode_map[] = { + ENUM(UV_DC_PRED), ENUM(UV_V_PRED), + ENUM(UV_H_PRED), ENUM(UV_D45_PRED), + ENUM(UV_D135_PRED), ENUM(UV_D113_PRED), + ENUM(UV_D157_PRED), ENUM(UV_D203_PRED), + ENUM(UV_D67_PRED), ENUM(UV_SMOOTH_PRED), + ENUM(UV_SMOOTH_V_PRED), ENUM(UV_SMOOTH_H_PRED), + ENUM(UV_PAETH_PRED), ENUM(UV_CFL_PRED), + ENUM(UV_MODE_INVALID), LAST_ENUM +}; +#define NO_SKIP 0 +#define SKIP 1 + +const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM }; + +const map_entry intrabc_map[] = { { "INTRABC", 1 }, + { "NO_INTRABC", 0 }, + LAST_ENUM }; + +const map_entry palette_map[] = { + { "ZERO_COLORS", 0 }, { "TWO_COLORS", 2 }, { "THREE_COLORS", 3 }, + { "FOUR_COLORS", 4 }, { "FIVE_COLORS", 5 }, { "SIX_COLORS", 6 }, + { "SEVEN_COLORS", 7 }, { "EIGHT_COLORS", 8 }, LAST_ENUM +}; + +const map_entry config_map[] = { ENUM(MI_SIZE), LAST_ENUM }; + +static const char *exec_name; + +struct parm_offset { + char parm[60]; + char offset; +}; +struct parm_offset parm_offsets[] = { + { "blockSize", offsetof(insp_mi_data, bsize) }, + { "transformSize", offsetof(insp_mi_data, tx_size) }, + { "transformType", offsetof(insp_mi_data, tx_type) }, + { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) }, + { "mode", offsetof(insp_mi_data, mode) }, + { "uv_mode", offsetof(insp_mi_data, uv_mode) }, + { "motion_mode", offsetof(insp_mi_data, motion_mode) }, + { "compound_type", offsetof(insp_mi_data, compound_type) }, + { "referenceFrame", offsetof(insp_mi_data, ref_frame) }, + { "skip", offsetof(insp_mi_data, skip) }, +}; +int parm_count = sizeof(parm_offsets) / sizeof(parm_offsets[0]); + +int convert_to_indices(char *str, int *indices, int maxCount, int *count) { + *count = 0; + do { + char *comma = strchr(str, ','); + int length = (comma ? 
(int)(comma - str) : (int)strlen(str)); + int i; + for (i = 0; i < parm_count; ++i) { + if (!strncmp(str, parm_offsets[i].parm, length)) { + break; + } + } + if (i == parm_count) return 0; + indices[(*count)++] = i; + if (*count > maxCount) return 0; + str += length + 1; + } while (strlen(str) > 0); + return 1; +} + +insp_frame_data frame_data; +int frame_count = 0; +int decoded_frame_count = 0; +aom_codec_ctx_t codec; +AvxVideoReader *reader = NULL; +const AvxVideoInfo *info = NULL; +aom_image_t *img = NULL; + +void on_frame_decoded_dump(char *json) { +#ifdef __EMSCRIPTEN__ + EM_ASM_({ Module.on_frame_decoded_json($0); }, json); +#else + printf("%s", json); +#endif +} + +// Writing out the JSON buffer using snprintf is very slow, especially when +// compiled with emscripten, these functions speed things up quite a bit. +int put_str(char *buffer, const char *str) { + int i; + for (i = 0; str[i] != '\0'; i++) { + buffer[i] = str[i]; + } + return i; +} + +int put_str_with_escape(char *buffer, const char *str) { + int i; + int j = 0; + for (i = 0; str[i] != '\0'; i++) { + if (str[i] < ' ') { + continue; + } else if (str[i] == '"' || str[i] == '\\') { + buffer[j++] = '\\'; + } + buffer[j++] = str[i]; + } + return j; +} + +int put_num(char *buffer, char prefix, int num, char suffix) { + int i = 0; + char *buf = buffer; + int is_neg = 0; + if (prefix) { + buf[i++] = prefix; + } + if (num == 0) { + buf[i++] = '0'; + } else { + if (num < 0) { + num = -num; + is_neg = 1; + } + int s = i; + while (num != 0) { + buf[i++] = '0' + (num % 10); + num = num / 10; + } + if (is_neg) { + buf[i++] = '-'; + } + int e = i - 1; + while (s < e) { + int t = buf[s]; + buf[s] = buf[e]; + buf[e] = t; + s++; + e--; + } + } + if (suffix) { + buf[i++] = suffix; + } + return i; +} + +int put_map(char *buffer, const map_entry *map) { + char *buf = buffer; + const map_entry *entry = map; + while (entry->name != NULL) { + *(buf++) = '"'; + buf += put_str(buf, entry->name); + *(buf++) = '"'; + buf += put_num(buf, ':', entry->value, 0); + entry++; + if (entry->name != NULL) { + *(buf++) = ','; + } + } + return (int)(buf - buffer); +} + +int put_reference_frame(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, t; + buf += put_str(buf, " \"referenceFrameMap\": {"); + buf += put_map(buf, refs_map); + buf += put_str(buf, "},\n"); + buf += put_str(buf, " \"referenceFrame\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + buf += put_num(buf, '[', mi->ref_frame[0], 0); + buf += put_num(buf, ',', mi->ref_frame[1], ']'); + if (compress) { // RLE + for (t = c + 1; t < mi_cols; ++t) { + insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; + if (mi->ref_frame[0] != next_mi->ref_frame[0] || + mi->ref_frame[1] != next_mi->ref_frame[1]) { + break; + } + } + if (t - c > 1) { + *(buf++) = ','; + buf += put_num(buf, '[', t - c - 1, ']'); + c = t - 1; + } + } + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +int put_motion_vectors(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, t; + buf += put_str(buf, " \"motionVectors\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = 
&frame_data.mi_grid[r * mi_cols + c]; + buf += put_num(buf, '[', mi->mv[0].col, 0); + buf += put_num(buf, ',', mi->mv[0].row, 0); + buf += put_num(buf, ',', mi->mv[1].col, 0); + buf += put_num(buf, ',', mi->mv[1].row, ']'); + if (compress) { // RLE + for (t = c + 1; t < mi_cols; ++t) { + insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; + if (mi->mv[0].col != next_mi->mv[0].col || + mi->mv[0].row != next_mi->mv[0].row || + mi->mv[1].col != next_mi->mv[1].col || + mi->mv[1].row != next_mi->mv[1].row) { + break; + } + } + if (t - c > 1) { + *(buf++) = ','; + buf += put_num(buf, '[', t - c - 1, ']'); + c = t - 1; + } + } + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +int put_combined(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, p; + buf += put_str(buf, " \""); + for (p = 0; p < combined_parm_count; ++p) { + if (p) buf += put_str(buf, "&"); + buf += put_str(buf, parm_offsets[combined_parm_list[p]].parm); + } + buf += put_str(buf, "\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + *(buf++) = '['; + for (p = 0; p < combined_parm_count; ++p) { + if (p) *(buf++) = ','; + int16_t *v = (int16_t *)(((int8_t *)mi) + + parm_offsets[combined_parm_list[p]].offset); + buf += put_num(buf, 0, v[0], 0); + } + *(buf++) = ']'; + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +int put_block_info(char *buffer, const map_entry *map, const char *name, + size_t offset, int len) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, t, i; + if (compress && len == 1) { + die("Can't encode scalars as arrays when RLE compression is enabled."); + } + if (map) { + buf += snprintf(buf, MAX_BUFFER, " \"%sMap\": {", name); + buf += put_map(buf, map); + buf += put_str(buf, "},\n"); + } + buf += snprintf(buf, MAX_BUFFER, " \"%s\": [", name); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + int16_t *v = (int16_t *)(((int8_t *)mi) + offset); + if (len == 0) { + buf += put_num(buf, 0, v[0], 0); + } else { + buf += put_str(buf, "["); + for (i = 0; i < len; i++) { + buf += put_num(buf, 0, v[i], 0); + if (i < len - 1) { + buf += put_str(buf, ","); + } + } + buf += put_str(buf, "]"); + } + if (compress) { // RLE + for (t = c + 1; t < mi_cols; ++t) { + insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; + int16_t *nv = (int16_t *)(((int8_t *)next_mi) + offset); + int same = 0; + if (len == 0) { + same = v[0] == nv[0]; + } else { + for (i = 0; i < len; i++) { + same = v[i] == nv[i]; + if (!same) { + break; + } + } + } + if (!same) { + break; + } + } + if (t - c > 1) { + *(buf++) = ','; + buf += put_num(buf, '[', t - c - 1, ']'); + c = t - 1; + } + } + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +#if CONFIG_ACCOUNTING +int put_accounting(char *buffer) { + char *buf = buffer; + int i; + const Accounting *accounting = frame_data.accounting; + if (accounting == NULL) { + printf("XXX\n"); + return 0; + } 
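+  // "symbolsMap" lists the accounting dictionary strings once. The "symbols"
+  // array below then mixes two entry shapes: a two-element [x, y] pair is
+  // emitted whenever the block context changes, and a three-element
+  // [id, bits, samples] record is emitted for symbols coded within the
+  // current context.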
+ const int num_syms = accounting->syms.num_syms; + const int num_strs = accounting->syms.dictionary.num_strs; + buf += put_str(buf, " \"symbolsMap\": ["); + for (i = 0; i < num_strs; i++) { + buf += snprintf(buf, MAX_BUFFER, "\"%s\"", + accounting->syms.dictionary.strs[i]); + if (i < num_strs - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + buf += put_str(buf, " \"symbols\": [\n "); + AccountingSymbolContext context; + context.x = -2; + context.y = -2; + AccountingSymbol *sym; + for (i = 0; i < num_syms; i++) { + sym = &accounting->syms.syms[i]; + if (memcmp(&context, &sym->context, sizeof(AccountingSymbolContext)) != 0) { + buf += put_num(buf, '[', sym->context.x, 0); + buf += put_num(buf, ',', sym->context.y, ']'); + } else { + buf += put_num(buf, '[', sym->id, 0); + buf += put_num(buf, ',', sym->bits, 0); + buf += put_num(buf, ',', sym->samples, ']'); + } + context = sym->context; + if (i < num_syms - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} +#endif + +int skip_non_transform = 0; + +void inspect(void *pbi, void *data) { + /* Fetch frame data. */ + ifd_inspect(&frame_data, pbi, skip_non_transform); + + // Show existing frames just show a reference buffer we've already decoded. + // There's no information to show. + if (frame_data.show_existing_frame) return; + + (void)data; + // We allocate enough space and hope we don't write out of bounds. Totally + // unsafe but this speeds things up, especially when compiled to Javascript. + char *buffer = aom_malloc(MAX_BUFFER); + if (!buffer) { + fprintf(stderr, "Error allocating inspect info buffer\n"); + abort(); + } + char *buf = buffer; + buf += put_str(buf, "{\n"); + if (layers & BLOCK_SIZE_LAYER) { + buf += put_block_info(buf, block_size_map, "blockSize", + offsetof(insp_mi_data, bsize), 0); + } + if (layers & TRANSFORM_SIZE_LAYER) { + buf += put_block_info(buf, tx_size_map, "transformSize", + offsetof(insp_mi_data, tx_size), 0); + } + if (layers & TRANSFORM_TYPE_LAYER) { + buf += put_block_info(buf, tx_type_map, "transformType", + offsetof(insp_mi_data, tx_type), 0); + } + if (layers & DUAL_FILTER_LAYER) { + buf += put_block_info(buf, dual_filter_map, "dualFilterType", + offsetof(insp_mi_data, dual_filter_type), 0); + } + if (layers & MODE_LAYER) { + buf += put_block_info(buf, prediction_mode_map, "mode", + offsetof(insp_mi_data, mode), 0); + } + if (layers & UV_MODE_LAYER) { + buf += put_block_info(buf, uv_prediction_mode_map, "uv_mode", + offsetof(insp_mi_data, uv_mode), 0); + } + if (layers & MOTION_MODE_LAYER) { + buf += put_block_info(buf, motion_mode_map, "motion_mode", + offsetof(insp_mi_data, motion_mode), 0); + } + if (layers & COMPOUND_TYPE_LAYER) { + buf += put_block_info(buf, compound_type_map, "compound_type", + offsetof(insp_mi_data, compound_type), 0); + } + if (layers & SKIP_LAYER) { + buf += + put_block_info(buf, skip_map, "skip", offsetof(insp_mi_data, skip), 0); + } + if (layers & FILTER_LAYER) { + buf += + put_block_info(buf, NULL, "filter", offsetof(insp_mi_data, filter), 2); + } + if (layers & CDEF_LAYER) { + buf += put_block_info(buf, NULL, "cdef_level", + offsetof(insp_mi_data, cdef_level), 0); + buf += put_block_info(buf, NULL, "cdef_strength", + offsetof(insp_mi_data, cdef_strength), 0); + } + if (layers & CFL_LAYER) { + buf += put_block_info(buf, NULL, "cfl_alpha_idx", + offsetof(insp_mi_data, cfl_alpha_idx), 0); + buf += put_block_info(buf, NULL, "cfl_alpha_sign", + offsetof(insp_mi_data, cfl_alpha_sign), 0); + } + if (layers & Q_INDEX_LAYER) { + buf += 
put_block_info(buf, NULL, "delta_q", + offsetof(insp_mi_data, current_qindex), 0); + } + if (layers & SEGMENT_ID_LAYER) { + buf += put_block_info(buf, NULL, "seg_id", + offsetof(insp_mi_data, segment_id), 0); + } + if (layers & MOTION_VECTORS_LAYER) { + buf += put_motion_vectors(buf); + } + if (layers & INTRABC_LAYER) { + buf += put_block_info(buf, intrabc_map, "intrabc", + offsetof(insp_mi_data, intrabc), 0); + } + if (layers & PALETTE_LAYER) { + buf += put_block_info(buf, palette_map, "palette", + offsetof(insp_mi_data, palette), 0); + } + if (layers & UV_PALETTE_LAYER) { + buf += put_block_info(buf, palette_map, "uv_palette", + offsetof(insp_mi_data, uv_palette), 0); + } + if (combined_parm_count > 0) buf += put_combined(buf); + if (layers & REFERENCE_FRAME_LAYER) { + buf += put_block_info(buf, refs_map, "referenceFrame", + offsetof(insp_mi_data, ref_frame), 2); + } +#if CONFIG_ACCOUNTING + if (layers & ACCOUNTING_LAYER) { + buf += put_accounting(buf); + } +#endif + buf += + snprintf(buf, MAX_BUFFER, " \"frame\": %d,\n", frame_data.frame_number); + buf += snprintf(buf, MAX_BUFFER, " \"showFrame\": %d,\n", + frame_data.show_frame); + buf += snprintf(buf, MAX_BUFFER, " \"frameType\": %d,\n", + frame_data.frame_type); + buf += snprintf(buf, MAX_BUFFER, " \"baseQIndex\": %d,\n", + frame_data.base_qindex); + buf += snprintf(buf, MAX_BUFFER, " \"tileCols\": %d,\n", + frame_data.tile_mi_cols); + buf += snprintf(buf, MAX_BUFFER, " \"tileRows\": %d,\n", + frame_data.tile_mi_rows); + buf += snprintf(buf, MAX_BUFFER, " \"deltaQPresentFlag\": %d,\n", + frame_data.delta_q_present_flag); + buf += snprintf(buf, MAX_BUFFER, " \"deltaQRes\": %d,\n", + frame_data.delta_q_res); + buf += put_str(buf, " \"config\": {"); + buf += put_map(buf, config_map); + buf += put_str(buf, "},\n"); + buf += put_str(buf, " \"configString\": \""); + buf += put_str_with_escape(buf, aom_codec_build_config()); + buf += put_str(buf, "\"\n"); + decoded_frame_count++; + buf += put_str(buf, "},\n"); + *(buf++) = 0; + on_frame_decoded_dump(buffer); + aom_free(buffer); +} + +void ifd_init_cb(void) { + aom_inspect_init ii; + ii.inspect_cb = inspect; + ii.inspect_ctx = NULL; + aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii); +} + +EMSCRIPTEN_KEEPALIVE +int open_file(char *file) { + if (file == NULL) { + // The JS analyzer puts the .ivf file at this location. + file = "/tmp/input.ivf"; + } + reader = aom_video_reader_open(file); + if (!reader) die("Failed to open %s for reading.", file); + info = aom_video_reader_get_info(reader); + aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + fprintf(stderr, "Using %s\n", aom_codec_iface_name(decoder)); + if (aom_codec_dec_init(&codec, decoder, NULL, 0)) + die("Failed to initialize decoder."); + ifd_init(&frame_data, info->frame_width, info->frame_height); + ifd_init_cb(); + return EXIT_SUCCESS; +} + +Av1DecodeReturn adr; +int have_frame = 0; +const unsigned char *frame; +const unsigned char *end_frame; +size_t frame_size = 0; + +EMSCRIPTEN_KEEPALIVE +int read_frame(void) { + img = NULL; + + // This loop skips over any frames that are show_existing_frames, as + // there is nothing to analyze. 
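+  // A single IVF frame can carry several AV1 frame OBUs. Av1DecodeReturn.buf
+  // points just past the data consumed by aom_codec_decode(), so `frame` is
+  // advanced within the current buffer and the next IVF frame is only read
+  // once the buffer is exhausted (have_frame == 0).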
+ do { + if (!have_frame) { + if (!aom_video_reader_read_frame(reader)) return EXIT_FAILURE; + frame = aom_video_reader_get_frame(reader, &frame_size); + + have_frame = 1; + end_frame = frame + frame_size; + } + + if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, &adr) != + AOM_CODEC_OK) { + die_codec(&codec, "Failed to decode frame."); + } + + frame = adr.buf; + frame_size = end_frame - frame; + if (frame == end_frame) have_frame = 0; + } while (adr.show_existing); + + int got_any_frames = 0; + aom_image_t *frame_img; + struct av1_ref_frame ref_dec; + ref_dec.idx = adr.idx; + + // ref_dec.idx is the index to the reference buffer idx to AV1_GET_REFERENCE + // if its -1 the decoder didn't update any reference buffer and the only + // way to see the frame is aom_codec_get_frame. + if (ref_dec.idx == -1) { + aom_codec_iter_t iter = NULL; + img = frame_img = aom_codec_get_frame(&codec, &iter); + ++frame_count; + got_any_frames = 1; + } else if (!aom_codec_control(&codec, AV1_GET_REFERENCE, &ref_dec)) { + img = frame_img = &ref_dec.img; + ++frame_count; + got_any_frames = 1; + } + if (!got_any_frames) { + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +EMSCRIPTEN_KEEPALIVE +const char *get_aom_codec_build_config(void) { + return aom_codec_build_config(); +} + +EMSCRIPTEN_KEEPALIVE +int get_bit_depth(void) { return img->bit_depth; } + +EMSCRIPTEN_KEEPALIVE +int get_bits_per_sample(void) { return img->bps; } + +EMSCRIPTEN_KEEPALIVE +int get_image_format(void) { return img->fmt; } + +EMSCRIPTEN_KEEPALIVE +unsigned char *get_plane(int plane) { return img->planes[plane]; } + +EMSCRIPTEN_KEEPALIVE +int get_plane_stride(int plane) { return img->stride[plane]; } + +EMSCRIPTEN_KEEPALIVE +int get_plane_width(int plane) { return aom_img_plane_width(img, plane); } + +EMSCRIPTEN_KEEPALIVE +int get_plane_height(int plane) { return aom_img_plane_height(img, plane); } + +EMSCRIPTEN_KEEPALIVE +int get_frame_width(void) { return info->frame_width; } + +EMSCRIPTEN_KEEPALIVE +int get_frame_height(void) { return info->frame_height; } + +static void parse_args(char **argv) { + char **argi, **argj; + struct arg arg; + (void)dump_accounting_arg; + (void)dump_cdef_arg; + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + if (arg_match(&arg, &dump_block_size_arg, argi)) layers |= BLOCK_SIZE_LAYER; +#if CONFIG_ACCOUNTING + else if (arg_match(&arg, &dump_accounting_arg, argi)) + layers |= ACCOUNTING_LAYER; +#endif + else if (arg_match(&arg, &dump_transform_size_arg, argi)) + layers |= TRANSFORM_SIZE_LAYER; + else if (arg_match(&arg, &dump_transform_type_arg, argi)) + layers |= TRANSFORM_TYPE_LAYER; + else if (arg_match(&arg, &dump_mode_arg, argi)) + layers |= MODE_LAYER; + else if (arg_match(&arg, &dump_uv_mode_arg, argi)) + layers |= UV_MODE_LAYER; + else if (arg_match(&arg, &dump_motion_mode_arg, argi)) + layers |= MOTION_MODE_LAYER; + else if (arg_match(&arg, &dump_compound_type_arg, argi)) + layers |= COMPOUND_TYPE_LAYER; + else if (arg_match(&arg, &dump_skip_arg, argi)) + layers |= SKIP_LAYER; + else if (arg_match(&arg, &dump_filter_arg, argi)) + layers |= FILTER_LAYER; + else if (arg_match(&arg, &dump_cdef_arg, argi)) + layers |= CDEF_LAYER; + else if (arg_match(&arg, &dump_cfl_arg, argi)) + layers |= CFL_LAYER; + else if (arg_match(&arg, &dump_reference_frame_arg, argi)) + layers |= REFERENCE_FRAME_LAYER; + else if (arg_match(&arg, &dump_motion_vectors_arg, argi)) + layers |= MOTION_VECTORS_LAYER; + else if (arg_match(&arg, &dump_dual_filter_type_arg, argi)) 
+ layers |= DUAL_FILTER_LAYER; + else if (arg_match(&arg, &dump_delta_q_arg, argi)) + layers |= Q_INDEX_LAYER; + else if (arg_match(&arg, &dump_seg_id_arg, argi)) + layers |= SEGMENT_ID_LAYER; + else if (arg_match(&arg, &dump_intrabc_arg, argi)) + layers |= INTRABC_LAYER; + else if (arg_match(&arg, &dump_palette_arg, argi)) + layers |= PALETTE_LAYER; + else if (arg_match(&arg, &dump_uv_palette_arg, argi)) + layers |= UV_PALETTE_LAYER; + else if (arg_match(&arg, &dump_all_arg, argi)) + layers |= ALL_LAYERS; + else if (arg_match(&arg, &compress_arg, argi)) + compress = 1; + else if (arg_match(&arg, &usage_arg, argi)) + usage_exit(); + else if (arg_match(&arg, &limit_arg, argi)) + stop_after = arg_parse_uint(&arg); + else if (arg_match(&arg, &skip_non_transform_arg, argi)) + skip_non_transform = arg_parse_uint(&arg); + else if (arg_match(&arg, &combined_arg, argi)) + convert_to_indices( + (char *)arg.val, combined_parm_list, + sizeof(combined_parm_list) / sizeof(combined_parm_list[0]), + &combined_parm_count); + else + argj++; + } +} + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s src_filename \n", exec_name); + fprintf(stderr, "\nOptions:\n"); + arg_show_usage(stderr, main_args); + exit(EXIT_FAILURE); +} + +EMSCRIPTEN_KEEPALIVE +int main(int argc, char **argv) { + exec_name = argv[0]; + parse_args(argv); + if (argc >= 2) { + open_file(argv[1]); + printf("[\n"); + while (1) { + if (stop_after && (decoded_frame_count >= stop_after)) break; + if (read_frame()) break; + } + printf("null\n"); + printf("]"); + } else { + usage_exit(); + } +} + +EMSCRIPTEN_KEEPALIVE +void quit(void) { + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_reader_close(reader); +} + +EMSCRIPTEN_KEEPALIVE +void set_layers(LayerType v) { layers = v; } + +EMSCRIPTEN_KEEPALIVE +void set_compress(int v) { compress = v; } diff --git a/third_party/aom/examples/lightfield_bitstream_parsing.c b/third_party/aom/examples/lightfield_bitstream_parsing.c new file mode 100644 index 0000000000..05272bafa3 --- /dev/null +++ b/third_party/aom/examples/lightfield_bitstream_parsing.c @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Lightfield Bitstream Parsing +// ============================ +// +// This is a lightfield bitstream parsing example. It takes an input file +// containing the whole compressed lightfield bitstream(ivf file) and a text +// file containing a stream of tiles to decode and then constructs and outputs +// a new bitstream that can be decoded by an AV1 decoder. The output bitstream +// contains reference frames(i.e. anchor frames), camera frame header, and +// tile list OBUs. num_references is the number of anchor frames coded at the +// beginning of the light field file. After running the lightfield encoder, +// run lightfield bitstream parsing: +// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 4 +// tile_list.txt +// +// The tile_list.txt is expected to be of the form: +// Frame +// +// +// ... 
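+// Each "Frame" line starts a new output tile list; every other line holds
+// four integers that are read as: image_idx anchor_ref_idx tile_col tile_row.
+// A purely illustrative example (values are hypothetical):
+//   Frame 0
+//   16 0 4 5
+//   17 0 4 6
+//   Frame 1
+//   ...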
+// Frame +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aom_encoder.h" +#include "aom/aom_integer.h" +#include "aom/aomdx.h" +#include "aom_dsp/bitwriter_buffer.h" +#include "common/tools_common.h" +#include "common/video_reader.h" +#include "common/video_writer.h" + +#define MAX_TILES 512 + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", + exec_name); + exit(EXIT_FAILURE); +} + +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) + +const int output_frame_width = 512; +const int output_frame_height = 512; + +// Spec: +// typedef struct { +// uint8_t anchor_frame_idx; +// uint8_t tile_row; +// uint8_t tile_col; +// uint16_t coded_tile_data_size_minus_1; +// uint8_t *coded_tile_data; +// } TILE_LIST_ENTRY; + +// Tile list entry provided by the application +typedef struct { + int image_idx; + int reference_idx; + int tile_col; + int tile_row; +} TILE_LIST_INFO; + +static int get_image_bps(aom_img_fmt_t fmt) { + switch (fmt) { + case AOM_IMG_FMT_I420: return 12; + case AOM_IMG_FMT_I422: return 16; + case AOM_IMG_FMT_I444: return 24; + case AOM_IMG_FMT_I42016: return 24; + case AOM_IMG_FMT_I42216: return 32; + case AOM_IMG_FMT_I44416: return 48; + default: die("Invalid image format"); + } +} + +static void process_tile_list(const TILE_LIST_INFO *tiles, int num_tiles, + aom_codec_pts_t tl_pts, unsigned char **frames, + const size_t *frame_sizes, aom_codec_ctx_t *codec, + unsigned char *tl_buf, AvxVideoWriter *writer, + uint8_t output_frame_width_in_tiles_minus_1, + uint8_t output_frame_height_in_tiles_minus_1) { + unsigned char *tl = tl_buf; + struct aom_write_bit_buffer wb = { tl, 0 }; + unsigned char *saved_obu_size_loc = NULL; + uint32_t tile_list_obu_header_size = 0; + uint32_t tile_list_obu_size = 0; + int num_tiles_minus_1 = num_tiles - 1; + int i; + + // Write the tile list OBU header that is 1 byte long. + aom_wb_write_literal(&wb, 0, 1); // forbidden bit. + aom_wb_write_literal(&wb, 8, 4); // tile list OBU: "1000" + aom_wb_write_literal(&wb, 0, 1); // obu_extension = 0 + aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field + aom_wb_write_literal(&wb, 0, 1); // reserved + tl++; + tile_list_obu_header_size++; + + // Write the OBU size using a fixed length_field_size of 4 bytes. + saved_obu_size_loc = tl; + // aom_wb_write_unsigned_literal(&wb, data, bits) requires that bits <= 32. + aom_wb_write_unsigned_literal(&wb, 0, 32); + tl += 4; + tile_list_obu_header_size += 4; + + // write_tile_list_obu() + aom_wb_write_literal(&wb, output_frame_width_in_tiles_minus_1, 8); + aom_wb_write_literal(&wb, output_frame_height_in_tiles_minus_1, 8); + aom_wb_write_literal(&wb, num_tiles_minus_1, 16); + tl += 4; + tile_list_obu_size += 4; + + // Write each tile's data + for (i = 0; i <= num_tiles_minus_1; i++) { + aom_tile_data tile_data = { 0, NULL, 0 }; + + int image_idx = tiles[i].image_idx; + int ref_idx = tiles[i].reference_idx; + int tc = tiles[i].tile_col; + int tr = tiles[i].tile_row; + + // Reset bit writer to the right location. 
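+    // Re-point the writer at tl so this tile's fixed 5-byte entry header is
+    // written at the current end of the output buffer rather than at the
+    // offset left over from the earlier OBU header writes.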
+ wb.bit_buffer = tl; + wb.bit_offset = 0; + + size_t frame_size = frame_sizes[image_idx]; + const unsigned char *frame = frames[image_idx]; + + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc); + + aom_codec_err_t aom_status = + aom_codec_decode(codec, frame, frame_size, NULL); + if (aom_status) die_codec(codec, "Failed to decode tile."); + + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_DATA, &tile_data); + + // Copy over tile info. + // uint8_t anchor_frame_idx; + // uint8_t tile_row; + // uint8_t tile_col; + // uint16_t coded_tile_data_size_minus_1; + // uint8_t *coded_tile_data; + uint32_t tile_info_bytes = 5; + aom_wb_write_literal(&wb, ref_idx, 8); + aom_wb_write_literal(&wb, tr, 8); + aom_wb_write_literal(&wb, tc, 8); + aom_wb_write_literal(&wb, (int)tile_data.coded_tile_data_size - 1, 16); + tl += tile_info_bytes; + + memcpy(tl, (uint8_t *)tile_data.coded_tile_data, + tile_data.coded_tile_data_size); + tl += tile_data.coded_tile_data_size; + + tile_list_obu_size += + tile_info_bytes + (uint32_t)tile_data.coded_tile_data_size; + } + + // Write tile list OBU size. + size_t bytes_written = 0; + if (aom_uleb_encode_fixed_size(tile_list_obu_size, 4, 4, saved_obu_size_loc, + &bytes_written)) + die_codec(codec, "Failed to encode the tile list obu size."); + + // Copy the tile list. + if (!aom_video_writer_write_frame( + writer, tl_buf, tile_list_obu_header_size + tile_list_obu_size, + tl_pts)) + die_codec(codec, "Failed to copy compressed tile list."); +} + +int main(int argc, char **argv) { + AvxVideoReader *reader = NULL; + AvxVideoWriter *writer = NULL; + const AvxVideoInfo *info = NULL; + int num_references; + int i; + aom_codec_pts_t pts; + const char *tile_list_file = NULL; + + exec_name = argv[0]; + if (argc != 5) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + num_references = (int)strtol(argv[3], NULL, 0); + info = aom_video_reader_get_info(reader); + + aom_video_reader_set_fourcc(reader, AV1_FOURCC); + + // The writer to write out ivf file in tile list OBU, which can be decoded by + // AV1 decoder. + writer = aom_video_writer_open(argv[2], kContainerIVF, info); + if (!writer) die("Failed to open %s for writing", argv[2]); + + tile_list_file = argv[4]; + + aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + printf("Using %s\n", aom_codec_iface_name(decoder)); + + aom_codec_ctx_t codec; + if (aom_codec_dec_init(&codec, decoder, NULL, 0)) + die("Failed to initialize decoder."); + + // Decode anchor frames. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); + + printf("Reading %d reference images.\n", num_references); + for (i = 0; i < num_references; ++i) { + aom_video_reader_read_frame(reader); + + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + pts = (aom_codec_pts_t)aom_video_reader_get_frame_pts(reader); + + // Copy references bitstream directly. + if (!aom_video_writer_write_frame(writer, frame, frame_size, pts)) + die_codec(&codec, "Failed to copy compressed anchor frame."); + + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + } + + // Decode camera frames. 
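+  // The anchor frames above were decoded with normal tile mode (0); camera
+  // frames are coded as large-scale tiles, so switch to tile mode 1 and
+  // enable the ext-tile debug path, which lets the decoder process one tile
+  // at a time via AV1_SET_DECODE_TILE_ROW/COL below.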
+ AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, 1); + + FILE *infile = aom_video_reader_get_file(reader); + // Record the offset of the first camera image. + const FileOffset camera_frame_pos = ftello(infile); + + printf("Loading compressed frames into memory.\n"); + + // Count the frames in the lightfield. + int num_frames = 0; + while (aom_video_reader_read_frame(reader)) { + ++num_frames; + } + if (num_frames < 1) die("Input light field has no frames."); + + // Read all of the lightfield frames into memory. + unsigned char **frames = + (unsigned char **)malloc(num_frames * sizeof(unsigned char *)); + size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t)); + if (!(frames && frame_sizes)) die("Failed to allocate frame data."); + + // Seek to the first camera image. + fseeko(infile, camera_frame_pos, SEEK_SET); + for (int f = 0; f < num_frames; ++f) { + aom_video_reader_read_frame(reader); + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char)); + if (!frames[f]) die("Failed to allocate frame data."); + memcpy(frames[f], frame, frame_size); + frame_sizes[f] = frame_size; + } + printf("Read %d frames.\n", num_frames); + + // Copy first camera frame for getting camera frame header. This is done + // only once. + { + size_t frame_size = frame_sizes[0]; + const unsigned char *frame = frames[0]; + pts = num_references; + aom_tile_data frame_header_info = { 0, NULL, 0 }; + + // Need to decode frame header to get camera frame header info. So, here + // decoding 1 tile is enough. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_ROW, 0); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_COL, 0); + + aom_codec_err_t aom_status = + aom_codec_decode(&codec, frame, frame_size, NULL); + if (aom_status) die_codec(&codec, "Failed to decode tile."); + + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_HEADER_INFO, + &frame_header_info); + + size_t obu_size_offset = + (uint8_t *)frame_header_info.coded_tile_data - frame; + size_t length_field_size = frame_header_info.coded_tile_data_size; + // Remove ext-tile tile info. + uint32_t frame_header_size = (uint32_t)frame_header_info.extra_size - 1; + size_t bytes_to_copy = + obu_size_offset + length_field_size + frame_header_size; + + unsigned char *frame_hdr_buf = (unsigned char *)malloc(bytes_to_copy); + if (frame_hdr_buf == NULL) + die_codec(&codec, "Failed to allocate frame header buffer."); + + memcpy(frame_hdr_buf, frame, bytes_to_copy); + + // Update frame header OBU size. + size_t bytes_written = 0; + if (aom_uleb_encode_fixed_size( + frame_header_size, length_field_size, length_field_size, + frame_hdr_buf + obu_size_offset, &bytes_written)) + die_codec(&codec, "Failed to encode the tile list obu size."); + + // Copy camera frame header bitstream. + if (!aom_video_writer_write_frame(writer, frame_hdr_buf, bytes_to_copy, + pts)) + die_codec(&codec, "Failed to copy compressed camera frame header."); + free(frame_hdr_buf); + } + + // Read out the image format. + aom_img_fmt_t ref_fmt = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + const int bps = get_image_bps(ref_fmt); + if (!bps) die_codec(&codec, "Invalid image format."); + // read out the tile size. 
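+  // AV1D_GET_TILE_SIZE packs both dimensions into one 32-bit value: tile
+  // width in the upper 16 bits, tile height in the lower 16 bits. The buffer
+  // below is sized for the worst case; as a hypothetical example, a 128x128
+  // 8-bit 4:2:0 tile (bps = 12) needs 128 * 128 * 12 / 8 = 24576 bytes, or
+  // 12 MiB in total for MAX_TILES = 512.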
+ unsigned int tile_size = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_TILE_SIZE, &tile_size)) + die_codec(&codec, "Failed to get the tile size"); + const unsigned int tile_width = tile_size >> 16; + const unsigned int tile_height = tile_size & 65535; + // Allocate a buffer to store tile list bitstream. + const size_t data_sz = MAX_TILES * ALIGN_POWER_OF_TWO(tile_width, 5) * + ALIGN_POWER_OF_TWO(tile_height, 5) * bps / 8; + + unsigned char *tl_buf = (unsigned char *)malloc(data_sz); + if (tl_buf == NULL) die_codec(&codec, "Failed to allocate tile list buffer."); + + aom_codec_pts_t tl_pts = num_references; + const uint8_t output_frame_width_in_tiles_minus_1 = + output_frame_width / tile_width - 1; + const uint8_t output_frame_height_in_tiles_minus_1 = + output_frame_height / tile_height - 1; + + printf("Reading tile list from file.\n"); + char line[1024]; + FILE *tile_list_fptr = fopen(tile_list_file, "r"); + if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file."); + int num_tiles = 0; + TILE_LIST_INFO tiles[MAX_TILES]; + while ((fgets(line, 1024, tile_list_fptr)) != NULL) { + if (line[0] == 'F' || num_tiles >= MAX_TILES) { + // Flush existing tile list and start another, either because we hit a + // new render frame or because we've hit our max number of tiles per list. + if (num_tiles > 0) { + process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec, + tl_buf, writer, output_frame_width_in_tiles_minus_1, + output_frame_height_in_tiles_minus_1); + ++tl_pts; + } + num_tiles = 0; + } + if (line[0] == 'F') { + continue; + } + if (sscanf(line, "%d %d %d %d", &tiles[num_tiles].image_idx, + &tiles[num_tiles].reference_idx, &tiles[num_tiles].tile_col, + &tiles[num_tiles].tile_row) == 4) { + if (tiles[num_tiles].image_idx >= num_frames) { + die("Tile list image_idx out of bounds: %d >= %d.", + tiles[num_tiles].image_idx, num_frames); + } + if (tiles[num_tiles].reference_idx >= num_references) { + die("Tile list reference_idx out of bounds: %d >= %d.", + tiles[num_tiles].reference_idx, num_references); + } + ++num_tiles; + } + } + if (num_tiles > 0) { + // Flush out the last tile list. + process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec, + tl_buf, writer, output_frame_width_in_tiles_minus_1, + output_frame_height_in_tiles_minus_1); + ++tl_pts; + } + + const int num_tile_lists = (int)(tl_pts - pts); + printf("Finished processing tile lists. Num tile lists: %d.\n", + num_tile_lists); + free(tl_buf); + for (int f = 0; f < num_frames; ++f) { + free(frames[f]); + } + free(frame_sizes); + free(frames); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_writer_close(writer); + aom_video_reader_close(reader); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/lightfield_decoder.c b/third_party/aom/examples/lightfield_decoder.c new file mode 100644 index 0000000000..65b13efa1a --- /dev/null +++ b/third_party/aom/examples/lightfield_decoder.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +// Lightfield Decoder +// ================== +// +// This is an example of a simple lightfield decoder. It builds upon the +// simple_decoder.c example. It takes an input file containing the compressed +// data (in ivf format), treating it as a lightfield instead of a video; and a +// text file with a list of tiles to decode. There is an optional parameter +// allowing to choose the output format, and the supported formats are +// YUV1D(default), YUV, and NV12. +// After running the lightfield encoder, run lightfield decoder to decode a +// batch of tiles: +// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 4 tile_list.txt +// 0(optional) +// The tile_list.txt is expected to be of the form: +// Frame +// +// +// ... +// Frame +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "aom_scale/yv12config.h" +#include "av1/common/enums.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +enum { + YUV1D, // 1D tile output for conformance test. + YUV, // Tile output in YUV format. + NV12, // Tile output in NV12 format. +} UENUM1BYTE(OUTPUT_FORMAT); + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s \n", + exec_name); + exit(EXIT_FAILURE); +} + +// Output frame size +static const int output_frame_width = 512; +static const int output_frame_height = 512; + +static void aom_img_copy_tile(const aom_image_t *src, const aom_image_t *dst, + int dst_row_offset, int dst_col_offset) { + const int shift = (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0; + int plane; + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *src_buf = src->planes[plane]; + const int src_stride = src->stride[plane]; + unsigned char *dst_buf = dst->planes[plane]; + const int dst_stride = dst->stride[plane]; + const int roffset = + (plane > 0) ? dst_row_offset >> dst->y_chroma_shift : dst_row_offset; + const int coffset = + (plane > 0) ? dst_col_offset >> dst->x_chroma_shift : dst_col_offset; + + // col offset needs to be adjusted for HBD. + dst_buf += roffset * dst_stride + (coffset << shift); + + const int w = (aom_img_plane_width(src, plane) << shift); + const int h = aom_img_plane_height(src, plane); + int y; + + for (y = 0; y < h; ++y) { + memcpy(dst_buf, src_buf, w); + src_buf += src_stride; + dst_buf += dst_stride; + } + } +} + +static void decode_tile(aom_codec_ctx_t *codec, const unsigned char *frame, + size_t frame_size, int tr, int tc, int ref_idx, + aom_image_t *reference_images, aom_image_t *output, + int *tile_idx, unsigned int *output_bit_depth, + aom_image_t **img_ptr, int output_format) { + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_TILE_MODE, 1); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_EXT_TILE_DEBUG, 1); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc); + + av1_ref_frame_t ref; + ref.idx = 0; + ref.use_external_ref = 1; + ref.img = reference_images[ref_idx]; + if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_REFERENCE, &ref)) { + die_codec(codec, "Failed to set reference frame."); + } + + aom_codec_err_t aom_status = aom_codec_decode(codec, frame, frame_size, NULL); + if (aom_status) die_codec(codec, "Failed to decode tile."); + + aom_codec_iter_t iter = NULL; + aom_image_t *img = aom_codec_get_frame(codec, &iter); + if (!img) die_codec(codec, "Failed to get frame."); + *img_ptr = img; + + // aom_img_alloc() sets bit_depth as follows: + // output->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
16 : 8; + // Use img->bit_depth(read from bitstream), so that aom_shift_img() + // works as expected. + output->bit_depth = img->bit_depth; + *output_bit_depth = img->bit_depth; + + if (output_format != YUV1D) { + // read out the tile size. + unsigned int tile_size = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size)) + die_codec(codec, "Failed to get the tile size"); + const unsigned int tile_width = tile_size >> 16; + const unsigned int tile_height = tile_size & 65535; + const uint32_t output_frame_width_in_tiles = + output_frame_width / tile_width; + + // Copy the tile to the output frame. + const int row_offset = + (*tile_idx / output_frame_width_in_tiles) * tile_height; + const int col_offset = + (*tile_idx % output_frame_width_in_tiles) * tile_width; + + aom_img_copy_tile(img, output, row_offset, col_offset); + (*tile_idx)++; + } +} + +static void img_write_to_file(const aom_image_t *img, FILE *file, + int output_format) { + if (output_format == YUV) + aom_img_write(img, file); + else if (output_format == NV12) + aom_img_write_nv12(img, file); + else + die("Invalid output format"); +} + +int main(int argc, char **argv) { + FILE *outfile = NULL; + AvxVideoReader *reader = NULL; + const AvxVideoInfo *info = NULL; + int num_references; + aom_img_fmt_t ref_fmt = 0; + aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; + aom_image_t output; + aom_image_t *output_shifted = NULL; + size_t frame_size = 0; + const unsigned char *frame = NULL; + int i, j; + const char *tile_list_file = NULL; + int output_format = YUV1D; + exec_name = argv[0]; + + if (argc < 5) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + num_references = (int)strtol(argv[3], NULL, 0); + tile_list_file = argv[4]; + + if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0); + if (output_format < YUV1D || output_format > NV12) + die("Output format out of range [0, 2]"); + + info = aom_video_reader_get_info(reader); + + aom_codec_iface_t *decoder; + if (info->codec_fourcc == LST_FOURCC) + decoder = get_aom_decoder_by_fourcc(AV1_FOURCC); + else + die("Unknown input codec."); + printf("Using %s\n", aom_codec_iface_name(decoder)); + + aom_codec_ctx_t codec; + if (aom_codec_dec_init(&codec, decoder, NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, + info->is_annexb)) { + die("Failed to set annex b status"); + } + + // Decode anchor frames. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); + for (i = 0; i < num_references; ++i) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + if (i == 0) { + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + + int frame_res[2]; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res)) + die_codec(&codec, "Failed to get the image frame size"); + + // Allocate memory to store decoded references. Allocate memory with the + // border so that it can be used as a reference. 
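// (Editorial aside, not part of the upstream file.) The border matters because
// these images are later handed back to the decoder as external references
// (via AV1_SET_REFERENCE with use_external_ref), and inter prediction may read
// samples just outside the visible frame. A rough sketch of how much memory a
// single padded 8-bit luma plane needs, using an illustrative helper name:
//
//   static size_t padded_luma_bytes(int w, int h, unsigned int border) {
//     const size_t stride = (size_t)w + 2 * (size_t)border;  // left + right
//     const size_t rows = (size_t)h + 2 * (size_t)border;    // top + bottom
//     return stride * rows;                                  // no alignment
//   }
//
// aom_img_alloc_with_border() additionally rounds sizes up for alignment, so
// treat this only as an order-of-magnitude estimate.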
+ for (j = 0; j < num_references; j++) { + unsigned int border = AOM_DEC_BORDER_IN_PIXELS; + if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, + frame_res[0], frame_res[1], 32, 8, + border)) { + die("Failed to allocate references."); + } + } + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE, + &reference_images[i])) + die_codec(&codec, "Failed to copy decoded reference frame"); + + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + char name[1024]; + snprintf(name, sizeof(name), "ref_%d.yuv", i); + printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h); + FILE *ref_file = fopen(name, "wb"); + aom_img_write(img, ref_file); + fclose(ref_file); + } + } + + FILE *infile = aom_video_reader_get_file(reader); + // Record the offset of the first camera image. + const FileOffset camera_frame_pos = ftello(infile); + + printf("Loading compressed frames into memory.\n"); + + // Count the frames in the lightfield. + int num_frames = 0; + while (aom_video_reader_read_frame(reader)) { + ++num_frames; + } + if (num_frames < 1) die("Input light field has no frames."); + + // Read all of the lightfield frames into memory. + unsigned char **frames = + (unsigned char **)malloc(num_frames * sizeof(unsigned char *)); + size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t)); + if (!(frames && frame_sizes)) die("Failed to allocate frame data."); + // Seek to the first camera image. + fseeko(infile, camera_frame_pos, SEEK_SET); + for (int f = 0; f < num_frames; ++f) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char)); + if (!frames[f]) die("Failed to allocate frame data."); + memcpy(frames[f], frame, frame_size); + frame_sizes[f] = frame_size; + } + printf("Read %d frames.\n", num_frames); + + if (output_format != YUV1D) { + // Allocate the output frame. + aom_img_fmt_t out_fmt = ref_fmt; + if (FORCE_HIGHBITDEPTH_DECODING) out_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + if (!aom_img_alloc(&output, out_fmt, output_frame_width, + output_frame_height, 32)) + die("Failed to allocate output image."); + } + + printf("Decoding tile list from file.\n"); + char line[1024]; + FILE *tile_list_fptr = fopen(tile_list_file, "r"); + if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file."); + int tile_list_cnt = 0; + int tile_list_writes = 0; + int tile_idx = 0; + aom_image_t *out = NULL; + unsigned int output_bit_depth = 0; + + while ((fgets(line, 1024, tile_list_fptr)) != NULL) { + if (line[0] == 'F') { + if (output_format != YUV1D) { + // Write out the tile list. + if (tile_list_cnt) { + out = &output; + if (output_bit_depth != 0) { + if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) { + die("Error allocating image"); + } + } + img_write_to_file(out, outfile, output_format); + tile_list_writes++; + } + + tile_list_cnt++; + tile_idx = 0; + // Then memset the frame. 
+ memset(output.img_data, 0, output.sz); + } + continue; + } + + int image_idx, ref_idx, tc, tr; + sscanf(line, "%d %d %d %d", &image_idx, &ref_idx, &tc, &tr); + if (image_idx >= num_frames) { + die("Tile list image_idx out of bounds: %d >= %d.", image_idx, + num_frames); + } + if (ref_idx >= num_references) { + die("Tile list ref_idx out of bounds: %d >= %d.", ref_idx, + num_references); + } + frame = frames[image_idx]; + frame_size = frame_sizes[image_idx]; + + aom_image_t *img = NULL; + decode_tile(&codec, frame, frame_size, tr, tc, ref_idx, reference_images, + &output, &tile_idx, &output_bit_depth, &img, output_format); + if (output_format == YUV1D) { + out = img; + if (output_bit_depth != 0) { + if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) { + die("Error allocating image"); + } + } + aom_img_write(out, outfile); + } + } + + if (output_format != YUV1D) { + // Write out the last tile list. + if (tile_list_writes < tile_list_cnt) { + out = &output; + if (output_bit_depth != 0) { + if (!aom_shift_img(output_bit_depth, &out, &output_shifted)) { + die("Error allocating image"); + } + } + img_write_to_file(out, outfile, output_format); + } + } + + if (output_shifted) aom_img_free(output_shifted); + if (output_format != YUV1D) aom_img_free(&output); + for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]); + for (int f = 0; f < num_frames; ++f) { + free(frames[f]); + } + free(frame_sizes); + free(frames); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_reader_close(reader); + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/lightfield_encoder.c b/third_party/aom/examples/lightfield_encoder.c new file mode 100644 index 0000000000..9aef836ac2 --- /dev/null +++ b/third_party/aom/examples/lightfield_encoder.c @@ -0,0 +1,525 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Lightfield Encoder +// ================== +// +// This is an example of a simple lightfield encoder. It builds upon the +// twopass_encoder.c example. It takes an input file in YV12 format, +// treating it as a planar lightfield instead of a video. The img_width +// and img_height arguments are the dimensions of the lightfield images, +// while the lf_width and lf_height arguments are the number of +// lightfield images in each dimension. The lf_blocksize determines the +// number of reference images used for MCP. For example, 5 means that there +// is a reference image for every 5x5 lightfield image block. All images +// within a block will use the center image in that block as the reference +// image for MCP. +// Run "make test" to download lightfield test data: vase10x10.yuv. 
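// (Editorial aside, not part of the upstream file.) To make the block layout
// concrete: for each lf_blocksize x lf_blocksize block of images, pass0() and
// pass1() below use the image at the center of that block as the reference.
// A sketch of the same index arithmetic, mirroring the block_ref_u/block_ref_v
// computation used later in this file (illustrative helper name):
//
//   // Returns the raster index (u + v * lf_width) of the reference image for
//   // the block containing image (u, v).
//   static int reference_index(int u, int v, int lf_width, int lf_height,
//                              int lf_blocksize) {
//     const int bu = u / lf_blocksize, bv = v / lf_blocksize;
//     const int u_min = bu * lf_blocksize, v_min = bv * lf_blocksize;
//     const int u_end =
//         (u_min + lf_blocksize < lf_width) ? u_min + lf_blocksize : lf_width;
//     const int v_end =
//         (v_min + lf_blocksize < lf_height) ? v_min + lf_blocksize : lf_height;
//     const int ref_u = u_min + (u_end - u_min) / 2;
//     const int ref_v = v_min + (v_end - v_min) / 2;
//     return ref_u + ref_v * lf_width;
//   }
//
// With the 10x10 vase lightfield and lf_blocksize = 5, image (0, 0) uses the
// image at (2, 2), i.e. raster index 22, as its MCP reference.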
+// Run lightfield encoder to encode whole lightfield: +// examples/lightfield_encoder 1024 1024 vase10x10.yuv vase10x10.ivf 10 10 5 + +// Note: In bitstream.c and encoder.c, define EXT_TILE_DEBUG as 1 will print +// out the uncompressed header and the frame contexts, which can be used to +// test the bit exactness of the headers and the frame contexts for large scale +// tile coded frames. + +#include +#include +#include + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "aom_scale/yv12config.h" +#include "av1/common/enums.h" +#include "av1/encoder/encoder_utils.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s " + " \n", + exec_name); + exit(EXIT_FAILURE); +} + +static int img_size_bytes(aom_image_t *img) { + int image_size_bytes = 0; + int plane; + for (plane = 0; plane < 3; ++plane) { + const int w = aom_img_plane_width(img, plane) * + ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + const int h = aom_img_plane_height(img, plane); + image_size_bytes += w * h; + } + return image_size_bytes; +} + +static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img, + aom_codec_pts_t pts, unsigned int duration, + aom_enc_frame_flags_t flags, + aom_fixed_buf_t *stats) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags); + if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats."); + + while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_STATS_PKT) { + const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf; + const size_t pkt_size = pkt->data.twopass_stats.sz; + stats->buf = realloc(stats->buf, stats->sz + pkt_size); + if (!stats->buf) die("Failed to allocate frame stats buffer."); + memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size); + stats->sz += pkt_size; + } + } + + return got_pkts; +} + +static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img, + aom_codec_pts_t pts, unsigned int duration, + aom_enc_frame_flags_t flags, AvxVideoWriter *writer) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags); + if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame."); + + while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) { + got_pkts = 1; + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) + die_codec(ctx, "Failed to write compressed frame."); + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +static void get_raw_image(aom_image_t **frame_to_encode, aom_image_t *raw, + aom_image_t *raw_shift) { + if (FORCE_HIGHBITDEPTH_DECODING) { + // Need to allocate larger buffer to use hbd internal. 
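// (Editorial aside, not part of the upstream file.) Conceptually,
// aom_img_upshift() with an input_shift of 0, as used just below, only widens
// each 8-bit sample into the 16-bit storage of the AOM_IMG_FMT_HIGHBITDEPTH
// buffer; a non-zero shift would additionally scale the values. Roughly, per
// sample (sketch only; the library routine also walks planes and strides):
//
//   static uint16_t upshift_sample(uint8_t sample, int shift) {
//     return (uint16_t)(sample << shift);  // shift == 0 leaves values as-is
//   }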
+ int input_shift = 0; + aom_img_upshift(raw_shift, raw, input_shift); + *frame_to_encode = raw_shift; + } else { + *frame_to_encode = raw; + } +} + +static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile, + aom_codec_iface_t *encoder, + const aom_codec_enc_cfg_t *cfg, int lf_width, + int lf_height, int lf_blocksize, int flags, + aom_image_t *raw_shift) { + aom_codec_ctx_t codec; + int frame_count = 0; + int image_size_bytes = img_size_bytes(raw); + int u_blocks, v_blocks; + int bu, bv; + aom_fixed_buf_t stats = { NULL, 0 }; + aom_image_t *frame_to_encode; + + if (aom_codec_enc_init(&codec, encoder, cfg, flags)) + die("Failed to initialize encoder"); + if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0)) + die_codec(&codec, "Failed to turn off auto altref"); + if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0)) + die_codec(&codec, "Failed to set frame parallel decoding"); + + // How many reference images we need to encode. + u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize; + v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize; + + printf("\n First pass: "); + + for (bv = 0; bv < v_blocks; ++bv) { + for (bu = 0; bu < u_blocks; ++bu) { + const int block_u_min = bu * lf_blocksize; + const int block_v_min = bv * lf_blocksize; + int block_u_end = (bu + 1) * lf_blocksize; + int block_v_end = (bv + 1) * lf_blocksize; + int u_block_size, v_block_size; + int block_ref_u, block_ref_v; + + block_u_end = block_u_end < lf_width ? block_u_end : lf_width; + block_v_end = block_v_end < lf_height ? block_v_end : lf_height; + u_block_size = block_u_end - block_u_min; + v_block_size = block_v_end - block_v_min; + block_ref_u = block_u_min + u_block_size / 2; + block_ref_v = block_v_min + v_block_size / 2; + + printf("A%d, ", (block_ref_u + block_ref_v * lf_width)); + fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes, + SEEK_SET); + aom_img_read(raw, infile); + get_raw_image(&frame_to_encode, raw, raw_shift); + + // Reference frames can be encoded encoded without tiles. + ++frame_count; + get_frame_stats(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF, + &stats); + } + } + + if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1)) + die_codec(&codec, "Failed to set frame parallel decoding"); + + for (bv = 0; bv < v_blocks; ++bv) { + for (bu = 0; bu < u_blocks; ++bu) { + const int block_u_min = bu * lf_blocksize; + const int block_v_min = bv * lf_blocksize; + int block_u_end = (bu + 1) * lf_blocksize; + int block_v_end = (bv + 1) * lf_blocksize; + int u, v; + block_u_end = block_u_end < lf_width ? block_u_end : lf_width; + block_v_end = block_v_end < lf_height ? block_v_end : lf_height; + for (v = block_v_min; v < block_v_end; ++v) { + for (u = block_u_min; u < block_u_end; ++u) { + printf("C%d, ", (u + v * lf_width)); + fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET); + aom_img_read(raw, infile); + get_raw_image(&frame_to_encode, raw, raw_shift); + + ++frame_count; + get_frame_stats(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, + &stats); + } + } + } + } + // Flush encoder. 
+ // No ARF, this should not be needed. + while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) { + } + + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + printf("\nFirst pass complete. Processed %d frames.\n", frame_count); + + return stats; +} + +static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, + aom_codec_iface_t *encoder, aom_codec_enc_cfg_t *cfg, + int lf_width, int lf_height, int lf_blocksize, int flags, + aom_image_t *raw_shift) { + AvxVideoInfo info = { get_fourcc_by_aom_encoder(encoder), + cfg->g_w, + cfg->g_h, + { cfg->g_timebase.num, cfg->g_timebase.den }, + 0 }; + AvxVideoWriter *writer = NULL; + aom_codec_ctx_t codec; + int frame_count = 0; + int image_size_bytes = img_size_bytes(raw); + int bu, bv; + int u_blocks, v_blocks; + aom_image_t *frame_to_encode; + aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; + int reference_image_num = 0; + int i; + + writer = aom_video_writer_open(outfile_name, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing", outfile_name); + + if (aom_codec_enc_init(&codec, encoder, cfg, flags)) + die("Failed to initialize encoder"); + if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0)) + die_codec(&codec, "Failed to turn off auto altref"); + if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0)) + die_codec(&codec, "Failed to set frame parallel decoding"); + if (aom_codec_control(&codec, AV1E_ENABLE_EXT_TILE_DEBUG, 1)) + die_codec(&codec, "Failed to enable encoder ext_tile debug"); + if (aom_codec_control(&codec, AOME_SET_CPUUSED, 3)) + die_codec(&codec, "Failed to set cpu-used"); + + // Note: The superblock is a sequence parameter and has to be the same for 1 + // sequence. In lightfield application, must choose the superblock size(either + // 64x64 or 128x128) before the encoding starts. Otherwise, the default is + // AOM_SUPERBLOCK_SIZE_DYNAMIC, and the superblock size will be set to 64x64 + // internally. + if (aom_codec_control(&codec, AV1E_SET_SUPERBLOCK_SIZE, + AOM_SUPERBLOCK_SIZE_64X64)) + die_codec(&codec, "Failed to set SB size"); + + u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize; + v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize; + + reference_image_num = u_blocks * v_blocks; + // Set the max gf group length so the references are guaranteed to be in + // a different gf group than any of the regular frames. This avoids using + // both vbr and constant quality mode in a single group. The number of + // references now cannot surpass 17 because of the enforced MAX_GF_INTERVAL of + // 16. If it is necessary to exceed this reference frame limit, one will have + // to do some additional handling to ensure references are in separate gf + // groups from the regular frames. + if (aom_codec_control(&codec, AV1E_SET_MAX_GF_INTERVAL, + reference_image_num - 1)) + die_codec(&codec, "Failed to set max gf interval"); + aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; + if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + // Allocate memory with the border so that it can be used as a reference. 
+ const bool resize = + codec.config.enc->rc_resize_mode || codec.config.enc->rc_superres_mode; + const bool all_intra = reference_image_num - 1 == 0; + int border_in_pixels = + av1_get_enc_border_size(resize, all_intra, BLOCK_64X64); + + for (i = 0; i < reference_image_num; i++) { + if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w, + cfg->g_h, 32, 8, border_in_pixels)) { + die("Failed to allocate image."); + } + } + + printf("\n Second pass: "); + + // Encode reference images first. + printf("Encoding Reference Images\n"); + for (bv = 0; bv < v_blocks; ++bv) { + for (bu = 0; bu < u_blocks; ++bu) { + const int block_u_min = bu * lf_blocksize; + const int block_v_min = bv * lf_blocksize; + int block_u_end = (bu + 1) * lf_blocksize; + int block_v_end = (bv + 1) * lf_blocksize; + int u_block_size, v_block_size; + int block_ref_u, block_ref_v; + + block_u_end = block_u_end < lf_width ? block_u_end : lf_width; + block_v_end = block_v_end < lf_height ? block_v_end : lf_height; + u_block_size = block_u_end - block_u_min; + v_block_size = block_v_end - block_v_min; + block_ref_u = block_u_min + u_block_size / 2; + block_ref_v = block_v_min + v_block_size / 2; + + printf("A%d, ", (block_ref_u + block_ref_v * lf_width)); + fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes, + SEEK_SET); + aom_img_read(raw, infile); + + get_raw_image(&frame_to_encode, raw, raw_shift); + + // Reference frames may be encoded without tiles. + ++frame_count; + printf("Encoding reference image %d of %d\n", bv * u_blocks + bu, + u_blocks * v_blocks); + encode_frame(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, + writer); + + if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE, + &reference_images[frame_count - 1])) + die_codec(&codec, "Failed to copy decoder reference frame"); + } + } + + cfg->large_scale_tile = 1; + // Fixed q encoding for camera frames. + cfg->rc_end_usage = AOM_Q; + if (aom_codec_enc_config_set(&codec, cfg)) + die_codec(&codec, "Failed to configure encoder"); + + // The fixed q value used in encoding. + if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 36)) + die_codec(&codec, "Failed to set cq level"); + if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1)) + die_codec(&codec, "Failed to set frame parallel decoding"); + if (aom_codec_control(&codec, AV1E_SET_SINGLE_TILE_DECODING, 1)) + die_codec(&codec, "Failed to turn on single tile decoding"); + // Set tile_columns and tile_rows to MAX values, which guarantees the tile + // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution. + if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 6)) + die_codec(&codec, "Failed to set tile width"); + if (aom_codec_control(&codec, AV1E_SET_TILE_ROWS, 6)) + die_codec(&codec, "Failed to set tile height"); + + for (bv = 0; bv < v_blocks; ++bv) { + for (bu = 0; bu < u_blocks; ++bu) { + const int block_u_min = bu * lf_blocksize; + const int block_v_min = bv * lf_blocksize; + int block_u_end = (bu + 1) * lf_blocksize; + int block_v_end = (bv + 1) * lf_blocksize; + int u, v; + block_u_end = block_u_end < lf_width ? block_u_end : lf_width; + block_v_end = block_v_end < lf_height ? 
block_v_end : lf_height; + for (v = block_v_min; v < block_v_end; ++v) { + for (u = block_u_min; u < block_u_end; ++u) { + av1_ref_frame_t ref; + ref.idx = 0; + ref.use_external_ref = 1; + ref.img = reference_images[bv * u_blocks + bu]; + if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref)) + die_codec(&codec, "Failed to set reference frame"); + + printf("C%d, ", (u + v * lf_width)); + fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET); + aom_img_read(raw, infile); + get_raw_image(&frame_to_encode, raw, raw_shift); + + ++frame_count; + printf("Encoding image %d of %d\n", + frame_count - (u_blocks * v_blocks), lf_width * lf_height); + encode_frame(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, + writer); + } + } + } + } + + // Flush encoder. + // No ARF, this should not be needed. + while (encode_frame(&codec, NULL, -1, 1, 0, writer)) { + } + + for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]); + + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + // Modify large_scale_file fourcc. + if (cfg->large_scale_tile == 1) + aom_video_writer_set_fourcc(writer, LST_FOURCC); + aom_video_writer_close(writer); + + printf("\nSecond pass complete. Processed %d frames.\n", frame_count); +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + int w, h; + // The number of lightfield images in the u and v dimensions. + int lf_width, lf_height; + // Defines how many images refer to the same reference image for MCP. + // lf_blocksize X lf_blocksize images will all use the reference image + // in the middle of the block of images. + int lf_blocksize; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + aom_image_t raw; + aom_image_t raw_shift; + aom_codec_err_t res; + aom_fixed_buf_t stats; + int flags = 0; + + const int fps = 30; + const int bitrate = 200; // kbit/s + const char *const width_arg = argv[1]; + const char *const height_arg = argv[2]; + const char *const infile_arg = argv[3]; + const char *const outfile_arg = argv[4]; + const char *const lf_width_arg = argv[5]; + const char *const lf_height_arg = argv[6]; + const char *lf_blocksize_arg = argv[7]; + exec_name = argv[0]; + + if (argc < 8) die("Invalid number of arguments"); + + aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1"); + if (!encoder) die("Unsupported codec."); + + w = (int)strtol(width_arg, NULL, 0); + h = (int)strtol(height_arg, NULL, 0); + lf_width = (int)strtol(lf_width_arg, NULL, 0); + lf_height = (int)strtol(lf_height_arg, NULL, 0); + lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0); + lf_blocksize = lf_blocksize < lf_width ? lf_blocksize : lf_width; + lf_blocksize = lf_blocksize < lf_height ? lf_blocksize : lf_height; + + if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) + die("Invalid frame size: %dx%d", w, h); + if (lf_width <= 0 || lf_height <= 0) + die("Invalid lf_width and/or lf_height: %dx%d", lf_width, lf_height); + if (lf_blocksize <= 0) die("Invalid lf_blocksize: %d", lf_blocksize); + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 32)) { + die("Failed to allocate image."); + } + if (FORCE_HIGHBITDEPTH_DECODING) { + // Need to allocate larger buffer to use hbd internal. 
+ aom_img_alloc(&raw_shift, AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH, w, h, + 32); + } + + printf("Using %s\n", aom_codec_iface_name(encoder)); + + // Configuration + res = aom_codec_enc_config_default(encoder, &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = w; + cfg.g_h = h; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = fps; + cfg.rc_target_bitrate = bitrate; + cfg.g_error_resilient = 0; // This is required. + cfg.g_lag_in_frames = 0; // need to set this since default is 19. + cfg.kf_mode = AOM_KF_DISABLED; + cfg.large_scale_tile = 0; // Only set it to 1 for camera frame encoding. + cfg.g_bit_depth = AOM_BITS_8; + flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING) + ? AOM_CODEC_USE_HIGHBITDEPTH + : 0; + + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading", infile_arg); + + // Pass 0 + cfg.g_pass = AOM_RC_FIRST_PASS; + stats = pass0(&raw, infile, encoder, &cfg, lf_width, lf_height, lf_blocksize, + flags, &raw_shift); + + // Pass 1 + rewind(infile); + cfg.g_pass = AOM_RC_LAST_PASS; + cfg.rc_twopass_stats_in = stats; + pass1(&raw, infile, outfile_arg, encoder, &cfg, lf_width, lf_height, + lf_blocksize, flags, &raw_shift); + free(stats.buf); + + if (FORCE_HIGHBITDEPTH_DECODING) aom_img_free(&raw_shift); + aom_img_free(&raw); + fclose(infile); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/lightfield_tile_list_decoder.c b/third_party/aom/examples/lightfield_tile_list_decoder.c new file mode 100644 index 0000000000..d71ff5b387 --- /dev/null +++ b/third_party/aom/examples/lightfield_tile_list_decoder.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Lightfield Tile List Decoder +// ============================ +// +// This is a lightfield tile list decoder example. It takes an input file that +// contains the anchor frames that are references of the coded tiles, the camera +// frame header, and tile list OBUs that include the tile information and the +// compressed tile data. This input file is reconstructed from the encoded +// lightfield ivf file, and is decodable by AV1 decoder. num_references is +// the number of anchor frames coded at the beginning of the light field file. +// num_tile_lists is the number of tile lists need to be decoded. There is an +// optional parameter allowing to choose the output format, and the supported +// formats are YUV1D(default), YUV, and NV12. +// Run lightfield tile list decoder to decode an AV1 tile list file: +// examples/lightfield_tile_list_decoder vase_tile_list.ivf vase_tile_list.yuv +// 4 2 0(optional) + +#include +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "aom_scale/yv12config.h" +#include "av1/common/enums.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +enum { + YUV1D, // 1D tile output for conformance test. + YUV, // Tile output in YUV format. + NV12, // Tile output in NV12 format. 
+} UENUM1BYTE(OUTPUT_FORMAT); + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s " + "\n", + exec_name); + exit(EXIT_FAILURE); +} + +static void write_tile_yuv1d(aom_codec_ctx_t *codec, const aom_image_t *img, + FILE *file) { + // read out the tile size. + unsigned int tile_size = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size)) + die_codec(codec, "Failed to get the tile size"); + const unsigned int tile_width = tile_size >> 16; + const unsigned int tile_height = tile_size & 65535; + const uint32_t output_frame_width_in_tiles = img->d_w / tile_width; + + unsigned int tile_count = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_COUNT, &tile_count)) + die_codec(codec, "Failed to get the tile size"); + + // Write tile to file. + const int shift = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0; + unsigned int tile_idx; + + for (tile_idx = 0; tile_idx < tile_count; ++tile_idx) { + const int row_offset = + (tile_idx / output_frame_width_in_tiles) * tile_height; + const int col_offset = + (tile_idx % output_frame_width_in_tiles) * tile_width; + int plane; + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int roffset = + (plane > 0) ? row_offset >> img->y_chroma_shift : row_offset; + const int coffset = + (plane > 0) ? col_offset >> img->x_chroma_shift : col_offset; + const int w = (plane > 0) ? ((tile_width >> img->x_chroma_shift) << shift) + : (tile_width << shift); + const int h = + (plane > 0) ? (tile_height >> img->y_chroma_shift) : tile_height; + int y; + + // col offset needs to be adjusted for HBD. + buf += roffset * stride + (coffset << shift); + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + } + } +} + +int main(int argc, char **argv) { + FILE *outfile = NULL; + AvxVideoReader *reader = NULL; + const AvxVideoInfo *info = NULL; + int num_references; + int num_tile_lists; + aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; + size_t frame_size = 0; + const unsigned char *frame = NULL; + int output_format = YUV1D; + int i, j, n; + + exec_name = argv[0]; + + if (argc < 5) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + num_references = (int)strtol(argv[3], NULL, 0); + num_tile_lists = (int)strtol(argv[4], NULL, 0); + + if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0); + if (output_format < YUV1D || output_format > NV12) + die("Output format out of range [0, 2]"); + + info = aom_video_reader_get_info(reader); + + aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + printf("Using %s\n", aom_codec_iface_name(decoder)); + + aom_codec_ctx_t codec; + if (aom_codec_dec_init(&codec, decoder, NULL, 0)) + die("Failed to initialize decoder."); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, + info->is_annexb)) { + die_codec(&codec, "Failed to set annex b status"); + } + + // Decode anchor frames. 
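// (Editorial aside, not part of the upstream file.) The anchor frames are
// ordinary AV1 frames, so they are decoded with AV1_SET_TILE_MODE set to 0
// (normal tile coding mode); the control is switched to 1 (large-scale tile
// mode) further down, just before the camera frame header and the tile lists
// are decoded. A condensed view of that sequencing, matching the code below:
//
//   AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0);
//   /* ... decode the num_references anchor frames ... */
//   AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1);
//   /* ... decode the camera frame header, then each tile list OBU ... */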
+ AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); + for (i = 0; i < num_references; ++i) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + if (i == 0) { + aom_img_fmt_t ref_fmt = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + + int frame_res[2]; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res)) + die_codec(&codec, "Failed to get the image frame size"); + + // Allocate memory to store decoded references. Allocate memory with the + // border so that it can be used as a reference. + for (j = 0; j < num_references; j++) { + unsigned int border = AOM_DEC_BORDER_IN_PIXELS; + if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, + frame_res[0], frame_res[1], 32, 8, + border)) { + fatal("Failed to allocate references."); + } + } + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE, + &reference_images[i])) + die_codec(&codec, "Failed to copy decoded reference frame"); + + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + char name[1024]; + snprintf(name, sizeof(name), "ref_%d.yuv", i); + printf("writing ref image to %s, %u, %u\n", name, img->d_w, img->d_h); + FILE *ref_file = fopen(name, "wb"); + aom_img_write(img, ref_file); + fclose(ref_file); + } + } + + // Decode the lightfield. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1); + + // Set external references. + av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references }; + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref); + // Must decode the camera frame header first. + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode the frame."); + // Decode tile lists one by one. + for (n = 0; n < num_tile_lists; n++) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode the tile list."); + aom_codec_iter_t iter = NULL; + aom_image_t *img = aom_codec_get_frame(&codec, &iter); + if (!img) die_codec(&codec, "Failed to get frame."); + + if (output_format == YUV1D) + // write the tile to the output file in 1D format. + write_tile_yuv1d(&codec, img, outfile); + else if (output_format == YUV) + aom_img_write(img, outfile); + else + // NV12 output format + aom_img_write_nv12(img, outfile); + } + + for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_reader_close(reader); + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/lossless_encoder.c b/third_party/aom/examples/lossless_encoder.c new file mode 100644 index 0000000000..1971b9c9df --- /dev/null +++ b/third_party/aom/examples/lossless_encoder.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "lossless_encoder: Example demonstrating lossless " + "encoding feature. Supports raw input only.\n"); + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, int flags, AvxVideoWriter *writer) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = + aom_codec_encode(codec, img, frame_index, 1, flags); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + aom_image_t raw; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const int fps = 30; + + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers. + memset(&info, 0, sizeof(info)); + + if (argc < 5) die("Invalid number of arguments"); + + aom_codec_iface_t *encoder = get_aom_encoder_by_short_name("av1"); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); + info.frame_width = (int)strtol(argv[1], NULL, 0); + info.frame_height = (int)strtol(argv[2], NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", aom_codec_iface_name(encoder)); + + aom_codec_ctx_t codec; + res = aom_codec_enc_config_default(encoder, &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + + writer = aom_video_writer_open(argv[4], kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", argv[4]); + + if (!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading.", argv[3]); + + if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) + die("Failed to initialize encoder"); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1E_SET_LOSSLESS, 1)) + die_codec(&codec, "Failed to use lossless mode"); + + // Encode frames. 
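// (Editorial aside, not part of the upstream file.) A single call to
// encode_frame() may emit zero or more packets, since the encoder is allowed
// to buffer input; that is why encode_frame() returns whether any packet was
// produced, and why the flush loop below keeps calling it with a NULL image
// until the encoder has been fully drained. The pattern, stripped to its
// essentials:
//
//   while (aom_img_read(&raw, infile))                       // feed frames
//     encode_frame(&codec, &raw, frame_count++, 0, writer);
//   while (encode_frame(&codec, NULL, -1, 0, writer)) {}     // drain encoder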
+ while (aom_img_read(&raw, infile)) { + encode_frame(&codec, &raw, frame_count++, 0, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + aom_img_free(&raw); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/noise_model.c b/third_party/aom/examples/noise_model.c new file mode 100644 index 0000000000..1de13267fc --- /dev/null +++ b/third_party/aom/examples/noise_model.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief This is an sample binary to create noise params from input video. + * + * To allow for external denoising applications, this sample binary illustrates + * how to create a film grain table (film grain params as a function of time) + * from an input video and its corresponding denoised source. + * + * The --output-grain-table file can be passed as input to the encoder (in + * aomenc this is done through the "--film-grain-table" parameter). + * + * As an example, where the input source is an 854x480 yuv420p 8-bit video + * named "input.854_480.yuv" you would use steps similar to the following: + * + * # Run your denoiser (e.g, using hqdn3d filter): + * ffmpeg -vcodec rawvideo -video_size 854x480 -i input.854_480.yuv \ + * -vf hqdn3d=5:5:5:5 -vcodec rawvideo -an -f rawvideo \ + * denoised.854_480.yuv + * + * # Model the noise between the denoised version and original source: + * ./examples/noise_model --fps=25/1 --width=854 --height=480 --i420 \ + * --input-denoised=denoised.854_480.yuv --input=original.854_480.yuv \ + * --output-grain-table=film_grain.tbl + * + * # Encode with your favorite settings (including the grain table): + * aomenc --limit=100 --cpu-used=4 --input-bit-depth=8 \ + * --i420 -w 854 -h 480 --end-usage=q --cq-level=25 --lag-in-frames=25 \ + * --auto-alt-ref=2 --bit-depth=8 --film-grain-table=film_grain.tbl \ + * -o denoised_with_grain_params.ivf denoised.854_480.yuv + */ +#include +#include +#include +#include + +#include "aom/aom_encoder.h" +#include "aom_dsp/aom_dsp_common.h" + +#if CONFIG_AV1_DECODER +#include "av1/decoder/grain_synthesis.h" +#endif + +#include "aom_dsp/grain_table.h" +#include "aom_dsp/noise_model.h" +#include "aom_dsp/noise_util.h" +#include "aom_mem/aom_mem.h" +#include "common/args.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s --input= --input-denoised= " + "--output-grain-table= " + "See comments in noise_model.c for more information.\n", + exec_name); + exit(EXIT_FAILURE); +} + +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); +static const arg_def_t width_arg = + ARG_DEF("w", "width", 1, "Input width (if rawvideo)"); +static const arg_def_t height_arg = + ARG_DEF("h", "height", 1, "Input height (if 
rawvideo)"); +static const arg_def_t skip_frames_arg = + ARG_DEF("s", "skip-frames", 1, "Number of frames to skip (default = 1)"); +static const arg_def_t fps_arg = ARG_DEF(NULL, "fps", 1, "Frame rate"); +static const arg_def_t input_arg = ARG_DEF("-i", "input", 1, "Input filename"); +static const arg_def_t output_grain_table_arg = + ARG_DEF("n", "output-grain-table", 1, "Output noise file"); +static const arg_def_t input_denoised_arg = + ARG_DEF("d", "input-denoised", 1, "Input denoised filename (YUV) only"); +static const arg_def_t flat_block_finder_arg = + ARG_DEF("b", "flat-block-finder", 1, "Run the flat block finder"); +static const arg_def_t block_size_arg = + ARG_DEF("b", "block-size", 1, "Block size"); +static const arg_def_t bit_depth_arg = + ARG_DEF(NULL, "bit-depth", 1, "Bit depth of input"); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Input file (and denoised) is I420 (default)"); +static const arg_def_t use_i422 = + ARG_DEF(NULL, "i422", 0, "Input file (and denoised) is I422"); +static const arg_def_t use_i444 = + ARG_DEF(NULL, "i444", 0, "Input file (and denoised) is I444"); +static const arg_def_t debug_file_arg = + ARG_DEF(NULL, "debug-file", 1, "File to output debug info"); + +typedef struct { + int width; + int height; + struct aom_rational fps; + const char *input; + const char *input_denoised; + const char *output_grain_table; + int img_fmt; + int block_size; + int bit_depth; + int run_flat_block_finder; + int force_flat_psd; + int skip_frames; + const char *debug_file; +} noise_model_args_t; + +static void parse_args(noise_model_args_t *noise_args, char **argv) { + struct arg arg; + static const arg_def_t *main_args[] = { &help, + &input_arg, + &fps_arg, + &width_arg, + &height_arg, + &block_size_arg, + &output_grain_table_arg, + &input_denoised_arg, + &use_i420, + &use_i422, + &use_i444, + &debug_file_arg, + NULL }; + for (; *argv; argv++) { + if (arg_match(&arg, &help, argv)) { + fprintf(stdout, "\nOptions:\n"); + arg_show_usage(stdout, main_args); + exit(0); + } else if (arg_match(&arg, &width_arg, argv)) { + noise_args->width = atoi(arg.val); + } else if (arg_match(&arg, &height_arg, argv)) { + noise_args->height = atoi(arg.val); + } else if (arg_match(&arg, &input_arg, argv)) { + noise_args->input = arg.val; + } else if (arg_match(&arg, &input_denoised_arg, argv)) { + noise_args->input_denoised = arg.val; + } else if (arg_match(&arg, &output_grain_table_arg, argv)) { + noise_args->output_grain_table = arg.val; + } else if (arg_match(&arg, &block_size_arg, argv)) { + noise_args->block_size = atoi(arg.val); + } else if (arg_match(&arg, &bit_depth_arg, argv)) { + noise_args->bit_depth = atoi(arg.val); + } else if (arg_match(&arg, &flat_block_finder_arg, argv)) { + noise_args->run_flat_block_finder = atoi(arg.val); + } else if (arg_match(&arg, &fps_arg, argv)) { + noise_args->fps = arg_parse_rational(&arg); + } else if (arg_match(&arg, &use_i420, argv)) { + noise_args->img_fmt = AOM_IMG_FMT_I420; + } else if (arg_match(&arg, &use_i422, argv)) { + noise_args->img_fmt = AOM_IMG_FMT_I422; + } else if (arg_match(&arg, &use_i444, argv)) { + noise_args->img_fmt = AOM_IMG_FMT_I444; + } else if (arg_match(&arg, &skip_frames_arg, argv)) { + noise_args->skip_frames = atoi(arg.val); + } else if (arg_match(&arg, &debug_file_arg, argv)) { + noise_args->debug_file = arg.val; + } else { + fprintf(stdout, "Unknown arg: %s\n\nUsage:\n", *argv); + arg_show_usage(stdout, main_args); + exit(0); + } + } + if (noise_args->bit_depth > 8) { + noise_args->img_fmt |= 
AOM_IMG_FMT_HIGHBITDEPTH; + } +} + +#if CONFIG_AV1_DECODER +static void print_variance_y(FILE *debug_file, aom_image_t *raw, + aom_image_t *denoised, const uint8_t *flat_blocks, + int block_size, aom_film_grain_t *grain) { + aom_image_t renoised; + grain->apply_grain = 1; + grain->random_seed = 7391; + grain->bit_depth = raw->bit_depth; + aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1); + + if (av1_add_film_grain(grain, denoised, &renoised)) { + fprintf(stderr, "Internal failure in av1_add_film_grain().\n"); + aom_img_free(&renoised); + return; + } + + const int num_blocks_w = (raw->w + block_size - 1) / block_size; + const int num_blocks_h = (raw->h + block_size - 1) / block_size; + fprintf(debug_file, "x = ["); + for (int by = 0; by < num_blocks_h; by++) { + for (int bx = 0; bx < num_blocks_w; bx++) { + double block_mean = 0; + double noise_std = 0, noise_mean = 0; + double renoise_std = 0, renoise_mean = 0; + for (int yi = 0; yi < block_size; ++yi) { + const int y = by * block_size + yi; + for (int xi = 0; xi < block_size; ++xi) { + const int x = bx * block_size + xi; + const double noise_v = (raw->planes[0][y * raw->stride[0] + x] - + denoised->planes[0][y * raw->stride[0] + x]); + noise_mean += noise_v; + noise_std += noise_v * noise_v; + + block_mean += raw->planes[0][y * raw->stride[0] + x]; + + const double renoise_v = + (renoised.planes[0][y * raw->stride[0] + x] - + denoised->planes[0][y * raw->stride[0] + x]); + renoise_mean += renoise_v; + renoise_std += renoise_v * renoise_v; + } + } + int n = (block_size * block_size); + block_mean /= n; + noise_mean /= n; + renoise_mean /= n; + noise_std = sqrt(noise_std / n - noise_mean * noise_mean); + renoise_std = sqrt(renoise_std / n - renoise_mean * renoise_mean); + fprintf(debug_file, "%d %3.2lf %3.2lf %3.2lf ", + flat_blocks[by * num_blocks_w + bx], block_mean, noise_std, + renoise_std); + } + fprintf(debug_file, "\n"); + } + fprintf(debug_file, "];\n"); + + if (raw->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + fprintf(stderr, + "Detailed debug info not supported for high bit" + "depth formats\n"); + } else { + fprintf(debug_file, "figure(2); clf;\n"); + fprintf(debug_file, + "scatter(x(:, 2:4:end), x(:, 3:4:end), 'r'); hold on;\n"); + fprintf(debug_file, "scatter(x(:, 2:4:end), x(:, 4:4:end), 'b');\n"); + fprintf(debug_file, + "plot(linspace(0, 255, length(noise_strength_0)), " + "noise_strength_0, 'b');\n"); + fprintf(debug_file, + "title('Scatter plot of intensity vs noise strength');\n"); + fprintf(debug_file, + "legend('Actual', 'Estimated', 'Estimated strength');\n"); + fprintf(debug_file, "figure(3); clf;\n"); + fprintf(debug_file, "scatter(x(:, 3:4:end), x(:, 4:4:end), 'k');\n"); + fprintf(debug_file, "title('Actual vs Estimated');\n"); + fprintf(debug_file, "pause(3);\n"); + } + aom_img_free(&renoised); +} +#endif + +static void print_debug_info(FILE *debug_file, aom_image_t *raw, + aom_image_t *denoised, uint8_t *flat_blocks, + int block_size, aom_noise_model_t *noise_model) { + (void)raw; + (void)denoised; + (void)flat_blocks; + (void)block_size; + fprintf(debug_file, "figure(3); clf;\n"); + fprintf(debug_file, "figure(2); clf;\n"); + fprintf(debug_file, "figure(1); clf;\n"); + for (int c = 0; c < 3; ++c) { + fprintf(debug_file, "noise_strength_%d = [\n", c); + const aom_equation_system_t *eqns = + &noise_model->combined_state[c].strength_solver.eqns; + for (int k = 0; k < eqns->n; ++k) { + fprintf(debug_file, "%lf ", eqns->x[k]); + } + fprintf(debug_file, "];\n"); + fprintf(debug_file, "plot(noise_strength_%d); hold on;\n", 
c); + } + fprintf(debug_file, "legend('Y', 'cb', 'cr');\n"); + fprintf(debug_file, "title('Noise strength function');\n"); + +#if CONFIG_AV1_DECODER + aom_film_grain_t grain; + aom_noise_model_get_grain_parameters(noise_model, &grain); + print_variance_y(debug_file, raw, denoised, flat_blocks, block_size, &grain); +#endif + fflush(debug_file); +} + +int main(int argc, char *argv[]) { + noise_model_args_t args = { 0, 0, { 25, 1 }, 0, 0, 0, AOM_IMG_FMT_I420, + 32, 8, 1, 0, 1, NULL }; + aom_image_t raw, denoised; + FILE *infile = NULL; + AvxVideoInfo info; + + memset(&info, 0, sizeof(info)); + + (void)argc; + exec_name = argv[0]; + parse_args(&args, argv + 1); + + info.frame_width = args.width; + info.frame_height = args.height; + info.time_base.numerator = args.fps.den; + info.time_base.denominator = args.fps.num; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + if (!aom_img_alloc(&raw, args.img_fmt, info.frame_width, info.frame_height, + 1)) { + die("Failed to allocate image."); + } + if (!aom_img_alloc(&denoised, args.img_fmt, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + infile = fopen(args.input, "rb"); + if (!infile) { + die("Failed to open input file: %s", args.input); + } + fprintf(stderr, "Bit depth: %d stride:%d\n", args.bit_depth, raw.stride[0]); + + const int high_bd = args.bit_depth > 8; + const int block_size = args.block_size; + aom_flat_block_finder_t block_finder; + aom_flat_block_finder_init(&block_finder, block_size, args.bit_depth, + high_bd); + + const int num_blocks_w = (info.frame_width + block_size - 1) / block_size; + const int num_blocks_h = (info.frame_height + block_size - 1) / block_size; + uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h); + if (!flat_blocks) die("Failed to allocate block data."); + // Sets the random seed on the first entry in the output table + int16_t random_seed = 7391; + aom_noise_model_t noise_model; + aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, args.bit_depth, + high_bd }; + aom_noise_model_init(&noise_model, params); + + FILE *denoised_file = 0; + if (args.input_denoised) { + denoised_file = fopen(args.input_denoised, "rb"); + if (!denoised_file) + die("Unable to open input_denoised: %s", args.input_denoised); + } else { + die("--input-denoised file must be specified"); + } + FILE *debug_file = 0; + if (args.debug_file) { + debug_file = fopen(args.debug_file, "w"); + } + aom_film_grain_table_t grain_table = { 0, 0 }; + + int64_t prev_timestamp = 0; + int frame_count = 0; + while (aom_img_read(&raw, infile)) { + if (args.input_denoised) { + if (!aom_img_read(&denoised, denoised_file)) { + die("Unable to read input denoised file"); + } + } + if (frame_count % args.skip_frames == 0) { + int num_flat_blocks = num_blocks_w * num_blocks_h; + memset(flat_blocks, 1, num_flat_blocks); + if (args.run_flat_block_finder) { + memset(flat_blocks, 0, num_flat_blocks); + num_flat_blocks = aom_flat_block_finder_run( + &block_finder, raw.planes[0], info.frame_width, info.frame_height, + info.frame_width, flat_blocks); + fprintf(stdout, "Num flat blocks %d\n", num_flat_blocks); + } + + const uint8_t *planes[3] = { raw.planes[0], raw.planes[1], + raw.planes[2] }; + uint8_t *denoised_planes[3] = { denoised.planes[0], denoised.planes[1], + denoised.planes[2] }; + int strides[3] = { raw.stride[0] >> high_bd, raw.stride[1] >> 
high_bd, + raw.stride[2] >> high_bd }; + int chroma_sub[3] = { raw.x_chroma_shift, raw.y_chroma_shift, 0 }; + + fprintf(stdout, "Updating noise model...\n"); + aom_noise_status_t status = aom_noise_model_update( + &noise_model, (const uint8_t *const *)planes, + (const uint8_t *const *)denoised_planes, info.frame_width, + info.frame_height, strides, chroma_sub, flat_blocks, block_size); + + int64_t cur_timestamp = + frame_count * 10000000ULL * args.fps.den / args.fps.num; + if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) { + fprintf(stdout, + "Noise type is different, updating parameters for time " + "[ %" PRId64 ", %" PRId64 ")\n", + prev_timestamp, cur_timestamp); + aom_film_grain_t grain; + aom_noise_model_get_grain_parameters(&noise_model, &grain); + grain.random_seed = random_seed; + random_seed = 0; + aom_film_grain_table_append(&grain_table, prev_timestamp, cur_timestamp, + &grain); + aom_noise_model_save_latest(&noise_model); + prev_timestamp = cur_timestamp; + } + if (debug_file) { + print_debug_info(debug_file, &raw, &denoised, flat_blocks, block_size, + &noise_model); + } + fprintf(stdout, "Done noise model update, status = %d\n", status); + } + frame_count++; + } + + aom_film_grain_t grain; + aom_noise_model_get_grain_parameters(&noise_model, &grain); + grain.random_seed = random_seed; + aom_film_grain_table_append(&grain_table, prev_timestamp, INT64_MAX, &grain); + if (args.output_grain_table) { + struct aom_internal_error_info error_info; + if (AOM_CODEC_OK != aom_film_grain_table_write(&grain_table, + args.output_grain_table, + &error_info)) { + die("Unable to write output film grain table"); + } + } + aom_film_grain_table_free(&grain_table); + + if (infile) fclose(infile); + if (denoised_file) fclose(denoised_file); + if (debug_file) fclose(debug_file); + aom_img_free(&raw); + aom_img_free(&denoised); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/photon_noise_table.c b/third_party/aom/examples/photon_noise_table.c new file mode 100644 index 0000000000..d3a21a48ee --- /dev/null +++ b/third_party/aom/examples/photon_noise_table.c @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// This tool creates a film grain table, for use in stills and videos, +// representing the noise that one would get by shooting with a digital camera +// at a given light level. Much of the noise in digital images is photon shot +// noise, which is due to the characteristics of photon arrival and grows in +// standard deviation as the square root of the expected number of photons +// captured. +// https://www.photonstophotos.net/Emil%20Martinec/noise.html#shotnoise +// +// The proxy used by this tool for the amount of light captured is the ISO value +// such that the focal plane exposure at the time of capture would have been +// mapped by a 35mm camera to the output lightness observed in the image. 
That +// is, if one were to shoot on a 35mm camera (36×24mm sensor) at the nominal +// exposure for that ISO setting, the resulting image should contain noise of +// the same order of magnitude as generated by this tool. +// +// Example usage: +// +// ./photon_noise_table --width=3840 --height=2160 --iso=25600 -o noise.tbl +// # Then, for example: +// aomenc --film-grain-table=noise.tbl ... +// # Or: +// avifenc -c aom -a film-grain-table=noise.tbl ... +// +// The (mostly) square-root relationship between light intensity and noise +// amplitude holds in linear light, but AV1 streams are most often encoded +// non-linearly, and the film grain is applied to those non-linear values. +// Therefore, this tool must account for the non-linearity, and this is +// controlled by the optional `--transfer-function` (or `-t`) parameter, which +// specifies the tone response curve that will be used when encoding the actual +// image. The default for this tool is sRGB, which is approximately similar to +// an encoding gamma of 1/2.2 (i.e. a decoding gamma of 2.2) though not quite +// identical. +// +// As alluded to above, the tool assumes that the image is taken from the +// entirety of a 36×24mm (“35mm format”) sensor. If that assumption does not +// hold, then a “35mm-equivalent ISO value” that can be passed to the tool can +// be obtained by multiplying the true ISO value by the ratio of 36×24mm to the +// area that was actually used. For formats that approximately share the same +// aspect ratio, this is often expressed as the square of the “equivalence +// ratio” which is the ratio of their diagonals. For example, APS-C (often +// ~24×16mm) is said to have an equivalence ratio of 1.5 relative to the 35mm +// format, and therefore ISO 1000 on APS-C and ISO 1000×1.5² = 2250 on 35mm +// produce an image of the same lightness from the same amount of light spread +// onto their respective surface areas (resulting in different focal plane +// exposures), and those images will thus have similar amounts of noise if the +// cameras are of similar technology. https://doi.org/10.1117/1.OE.57.11.110801 +// +// The tool needs to know the resolution of the images to which its grain tables +// will be applied so that it can know how the light on the sensor was shared +// between its pixels. As a general rule, while a higher pixel count will lead +// to more noise per pixel, when the final image is viewed at the same physical +// size, that noise will tend to “average out” to the same amount over a given +// area, since there will be more pixels in it which, in aggregate, will have +// received essentially as much light. Put differently, the amount of noise +// depends on the scale at which it is measured, and the decision for this tool +// was to make that scale relative to the image instead of its constituent +// samples. 
For more on this, see: +// +// https://www.photonstophotos.net/Emil%20Martinec/noise-p3.html#pixelsize +// https://www.dpreview.com/articles/5365920428/the-effect-of-pixel-and-sensor-sizes-on-noise/2 +// https://www.dpreview.com/videos/7940373140/dpreview-tv-why-lower-resolution-sensors-are-not-better-in-low-light + +#include +#include +#include +#include + +#include "aom_dsp/grain_table.h" +#include "common/args.h" +#include "common/tools_common.h" + +static const char *exec_name; + +static const struct arg_enum_list transfer_functions[] = { + { "bt470m", AOM_CICP_TC_BT_470_M }, { "bt470bg", AOM_CICP_TC_BT_470_B_G }, + { "srgb", AOM_CICP_TC_SRGB }, { "smpte2084", AOM_CICP_TC_SMPTE_2084 }, + { "hlg", AOM_CICP_TC_HLG }, ARG_ENUM_LIST_END +}; + +static arg_def_t help_arg = + ARG_DEF("h", "help", 0, "Show the available options"); +static arg_def_t width_arg = + ARG_DEF("w", "width", 1, "Width of the image in pixels (required)"); +static arg_def_t height_arg = + ARG_DEF("l", "height", 1, "Height of the image in pixels (required)"); +static arg_def_t iso_arg = ARG_DEF( + "i", "iso", 1, "ISO setting indicative of the light level (required)"); +static arg_def_t output_arg = + ARG_DEF("o", "output", 1, + "Output file to which to write the film grain table (required)"); +static arg_def_t transfer_function_arg = + ARG_DEF_ENUM("t", "transfer-function", 1, + "Transfer function used by the encoded image (default = sRGB)", + transfer_functions); + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s [--transfer-function=] --width= " + "--height= --iso= --output=\n", + exec_name); + exit(EXIT_FAILURE); +} + +typedef struct { + float (*to_linear)(float); + float (*from_linear)(float); + // In linear output light. This would typically be 0.18 for SDR (this matches + // the definition of Standard Output Sensitivity from ISO 12232:2019), but in + // HDR, we certainly do not want to consider 18% of the maximum output a + // “mid-tone”, as it would be e.g. 1800 cd/m² for SMPTE ST 2084 (PQ). 
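+ // For PQ, the value used below is 26.f / 10000, i.e. 26 cd/m² out of the
+ // nominal 10000 cd/m² PQ peak; this appears to match the 18% grey-card
+ // level given in the BT.2408 report cited next to that initializer.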
+ float mid_tone; +} transfer_function_t; + +static const transfer_function_t *find_transfer_function( + aom_transfer_characteristics_t tc); + +typedef struct { + int width; + int height; + int iso_setting; + + const transfer_function_t *transfer_function; + + const char *output_filename; +} photon_noise_args_t; + +static void parse_args(int argc, char **argv, + photon_noise_args_t *photon_noise_args) { + static const arg_def_t *args[] = { &help_arg, &width_arg, + &height_arg, &iso_arg, + &output_arg, &transfer_function_arg, + NULL }; + struct arg arg; + int width_set = 0, height_set = 0, iso_set = 0, output_set = 0, i; + + photon_noise_args->transfer_function = + find_transfer_function(AOM_CICP_TC_SRGB); + + for (i = 1; i < argc; i += arg.argv_step) { + arg.argv_step = 1; + if (arg_match(&arg, &help_arg, argv + i)) { + arg_show_usage(stdout, args); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &width_arg, argv + i)) { + photon_noise_args->width = arg_parse_int(&arg); + width_set = 1; + } else if (arg_match(&arg, &height_arg, argv + i)) { + photon_noise_args->height = arg_parse_int(&arg); + height_set = 1; + } else if (arg_match(&arg, &iso_arg, argv + i)) { + photon_noise_args->iso_setting = arg_parse_int(&arg); + iso_set = 1; + } else if (arg_match(&arg, &output_arg, argv + i)) { + photon_noise_args->output_filename = arg.val; + output_set = 1; + } else if (arg_match(&arg, &transfer_function_arg, argv + i)) { + const aom_transfer_characteristics_t tc = arg_parse_enum(&arg); + photon_noise_args->transfer_function = find_transfer_function(tc); + } else { + fatal("unrecognized argument \"%s\", see --help for available options", + argv[i]); + } + } + + if (!width_set) { + fprintf(stderr, "Missing required parameter --width\n"); + exit(EXIT_FAILURE); + } + + if (!height_set) { + fprintf(stderr, "Missing required parameter --height\n"); + exit(EXIT_FAILURE); + } + + if (!iso_set) { + fprintf(stderr, "Missing required parameter --iso\n"); + exit(EXIT_FAILURE); + } + + if (!output_set) { + fprintf(stderr, "Missing required parameter --output\n"); + exit(EXIT_FAILURE); + } +} + +static float maxf(float a, float b) { return a > b ? a : b; } +static float minf(float a, float b) { return a < b ? a : b; } + +static float gamma22_to_linear(float g) { return powf(g, 2.2f); } +static float gamma22_from_linear(float l) { return powf(l, 1 / 2.2f); } +static float gamma28_to_linear(float g) { return powf(g, 2.8f); } +static float gamma28_from_linear(float l) { return powf(l, 1 / 2.8f); } + +static float srgb_to_linear(float srgb) { + return srgb <= 0.04045f ? srgb / 12.92f + : powf((srgb + 0.055f) / 1.055f, 2.4f); +} +static float srgb_from_linear(float linear) { + return linear <= 0.0031308f ? 12.92f * linear + : 1.055f * powf(linear, 1 / 2.4f) - 0.055f; +} + +static const float kPqM1 = 2610.f / 16384; +static const float kPqM2 = 128 * 2523.f / 4096; +static const float kPqC1 = 3424.f / 4096; +static const float kPqC2 = 32 * 2413.f / 4096; +static const float kPqC3 = 32 * 2392.f / 4096; +static float pq_to_linear(float pq) { + const float pq_pow_inv_m2 = powf(pq, 1.f / kPqM2); + return powf(maxf(0, pq_pow_inv_m2 - kPqC1) / (kPqC2 - kPqC3 * pq_pow_inv_m2), + 1.f / kPqM1); +} +static float pq_from_linear(float linear) { + const float linear_pow_m1 = powf(linear, kPqM1); + return powf((kPqC1 + kPqC2 * linear_pow_m1) / (1 + kPqC3 * linear_pow_m1), + kPqM2); +} + +// Note: it is perhaps debatable whether “linear” for HLG should be scene light +// or display light. 
Here, it is implemented in terms of display light assuming +// a nominal peak display luminance of 1000 cd/m², hence the system γ of 1.2. To +// make it scene light instead, the OOTF (powf(x, 1.2f)) and its inverse should +// be removed from the functions below, and the .mid_tone should be replaced +// with powf(26.f / 1000, 1 / 1.2f). +static const float kHlgA = 0.17883277f; +static const float kHlgB = 0.28466892f; +static const float kHlgC = 0.55991073f; +static float hlg_to_linear(float hlg) { + // EOTF = OOTF ∘ OETF⁻¹ + const float linear = + hlg <= 0.5f ? hlg * hlg / 3 : (expf((hlg - kHlgC) / kHlgA) + kHlgB) / 12; + return powf(linear, 1.2f); +} +static float hlg_from_linear(float linear) { + // EOTF⁻¹ = OETF ∘ OOTF⁻¹ + linear = powf(linear, 1.f / 1.2f); + return linear <= 1.f / 12 ? sqrtf(3 * linear) + : kHlgA * logf(12 * linear - kHlgB) + kHlgC; +} + +static const transfer_function_t *find_transfer_function( + aom_transfer_characteristics_t tc) { + static const transfer_function_t + kGamma22TransferFunction = { .to_linear = &gamma22_to_linear, + .from_linear = &gamma22_from_linear, + .mid_tone = 0.18f }, + kGamma28TransferFunction = { .to_linear = &gamma28_to_linear, + .from_linear = &gamma28_from_linear, + .mid_tone = 0.18f }, + kSRgbTransferFunction = { .to_linear = &srgb_to_linear, + .from_linear = &srgb_from_linear, + .mid_tone = 0.18f }, + kPqTransferFunction = { .to_linear = &pq_to_linear, + .from_linear = &pq_from_linear, + // https://www.itu.int/pub/R-REP-BT.2408-4-2021 + // page 6 (PDF page 8) + .mid_tone = 26.f / 10000 }, + kHlgTransferFunction = { .to_linear = &hlg_to_linear, + .from_linear = &hlg_from_linear, + .mid_tone = 26.f / 1000 }; + + switch (tc) { + case AOM_CICP_TC_BT_470_M: return &kGamma22TransferFunction; + case AOM_CICP_TC_BT_470_B_G: return &kGamma28TransferFunction; + case AOM_CICP_TC_SRGB: return &kSRgbTransferFunction; + case AOM_CICP_TC_SMPTE_2084: return &kPqTransferFunction; + case AOM_CICP_TC_HLG: return &kHlgTransferFunction; + + default: fatal("unimplemented transfer function %d", tc); + } +} + +static void generate_photon_noise(const photon_noise_args_t *photon_noise_args, + aom_film_grain_t *film_grain) { + // Assumes a daylight-like spectrum. + // https://www.strollswithmydog.com/effective-quantum-efficiency-of-sensor/#:~:text=11%2C260%20photons/um%5E2/lx-s + static const float kPhotonsPerLxSPerUm2 = 11260; + + // Order of magnitude for cameras in the 2010-2020 decade, taking the CFA into + // account. + static const float kEffectiveQuantumEfficiency = 0.20f; + + // Also reasonable values for current cameras. The read noise is typically + // higher than this at low ISO settings but it matters less there. + static const float kPhotoResponseNonUniformity = 0.005f; + static const float kInputReferredReadNoise = 1.5f; + + // Focal plane exposure for a mid-tone (typically a 18% reflectance card), in + // lx·s. + const float mid_tone_exposure = 10.f / photon_noise_args->iso_setting; + + // In microns. Assumes a 35mm sensor (36mm × 24mm). 
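+ // As a rough worked example (illustrative numbers only, derived from the
+ // constants above): with --width=3840 --height=2160 this is
+ // 36000 * 24000 / (3840 * 2160), i.e. about 104 µm² per pixel, and with
+ // --iso=25600 the mid-tone exposure of 10 / 25600 lx·s gives roughly
+ // 0.20 * 11260 * (10 / 25600) * 104, i.e. about 92 electrons per pixel at
+ // the mid-tone, before read noise and PRNU are combined in quadrature below.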
+ const float pixel_area_um2 = (36000 * 24000.f) / (photon_noise_args->width * + photon_noise_args->height); + + const float mid_tone_electrons_per_pixel = kEffectiveQuantumEfficiency * + kPhotonsPerLxSPerUm2 * + mid_tone_exposure * pixel_area_um2; + const float max_electrons_per_pixel = + mid_tone_electrons_per_pixel / + photon_noise_args->transfer_function->mid_tone; + + int i; + + film_grain->num_y_points = 14; + for (i = 0; i < film_grain->num_y_points; ++i) { + float x = i / (film_grain->num_y_points - 1.f); + const float linear = photon_noise_args->transfer_function->to_linear(x); + const float electrons_per_pixel = max_electrons_per_pixel * linear; + // Quadrature sum of the relevant sources of noise, in electrons rms. Photon + // shot noise is sqrt(electrons) so we can skip the square root and the + // squaring. + // https://en.wikipedia.org/wiki/Addition_in_quadrature + // https://doi.org/10.1117/3.725073 + const float noise_in_electrons = + sqrtf(kInputReferredReadNoise * kInputReferredReadNoise + + electrons_per_pixel + + (kPhotoResponseNonUniformity * kPhotoResponseNonUniformity * + electrons_per_pixel * electrons_per_pixel)); + const float linear_noise = noise_in_electrons / max_electrons_per_pixel; + const float linear_range_start = maxf(0.f, linear - 2 * linear_noise); + const float linear_range_end = minf(1.f, linear + 2 * linear_noise); + const float tf_slope = + (photon_noise_args->transfer_function->from_linear(linear_range_end) - + photon_noise_args->transfer_function->from_linear( + linear_range_start)) / + (linear_range_end - linear_range_start); + float encoded_noise = linear_noise * tf_slope; + + x = roundf(255 * x); + encoded_noise = minf(255.f, roundf(255 * 7.88f * encoded_noise)); + + film_grain->scaling_points_y[i][0] = (int)x; + film_grain->scaling_points_y[i][1] = (int)encoded_noise; + } + + film_grain->apply_grain = 1; + film_grain->update_parameters = 1; + film_grain->num_cb_points = 0; + film_grain->num_cr_points = 0; + film_grain->scaling_shift = 8; + film_grain->ar_coeff_lag = 0; + film_grain->ar_coeffs_cb[0] = 0; + film_grain->ar_coeffs_cr[0] = 0; + film_grain->ar_coeff_shift = 6; + film_grain->cb_mult = 0; + film_grain->cb_luma_mult = 0; + film_grain->cb_offset = 0; + film_grain->cr_mult = 0; + film_grain->cr_luma_mult = 0; + film_grain->cr_offset = 0; + film_grain->overlap_flag = 1; + film_grain->random_seed = 7391; + film_grain->chroma_scaling_from_luma = 0; +} + +int main(int argc, char **argv) { + photon_noise_args_t photon_noise_args; + aom_film_grain_table_t film_grain_table; + aom_film_grain_t film_grain; + struct aom_internal_error_info error_info; + memset(&photon_noise_args, 0, sizeof(photon_noise_args)); + memset(&film_grain_table, 0, sizeof(film_grain_table)); + memset(&film_grain, 0, sizeof(film_grain)); + memset(&error_info, 0, sizeof(error_info)); + + exec_name = argv[0]; + parse_args(argc, argv, &photon_noise_args); + + generate_photon_noise(&photon_noise_args, &film_grain); + aom_film_grain_table_append(&film_grain_table, 0, 9223372036854775807ull, + &film_grain); + if (aom_film_grain_table_write(&film_grain_table, + photon_noise_args.output_filename, + &error_info) != AOM_CODEC_OK) { + aom_film_grain_table_free(&film_grain_table); + fprintf(stderr, "Failed to write film grain table"); + if (error_info.has_detail) { + fprintf(stderr, ": %s", error_info.detail); + } + fprintf(stderr, "\n"); + return EXIT_FAILURE; + } + aom_film_grain_table_free(&film_grain_table); + + return EXIT_SUCCESS; +} diff --git 
a/third_party/aom/examples/scalable_decoder.c b/third_party/aom/examples/scalable_decoder.c new file mode 100644 index 0000000000..00fe820fd5 --- /dev/null +++ b/third_party/aom/examples/scalable_decoder.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Scalable Decoder +// ============== +// +// This is an example of a scalable decoder loop. It takes a 2-spatial-layer +// input file +// containing the compressed data (in OBU format), passes it through the +// decoder, and writes the decompressed frames to disk. The base layer and +// enhancement layers are stored as separate files, out_lyr0.yuv and +// out_lyr1.yuv, respectively. +// +// Standard Includes +// ----------------- +// For decoders, you only have to include `aom_decoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// av1. +// +// Initializing The Codec +// ---------------------- +// The libaom decoder is initialized by the call to aom_codec_dec_init(). +// Determining the codec interface to use is handled by AvxVideoReader and the +// functions prefixed with aom_video_reader_. Discussion of those functions is +// beyond the scope of this example, but the main gist is to open the input file +// and parse just enough of it to determine if it's a AVx file and which AVx +// codec is contained within the file. +// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this +// example because we want the algorithm to determine the stream configuration +// (width/height) and allocate memory automatically. +// +// Decoding A Frame +// ---------------- +// Once the frame has been read into memory, it is decoded using the +// `aom_codec_decode` function. The call takes a pointer to the data +// (`frame`) and the length of the data (`frame_size`). No application data +// is associated with the frame in this example, so the `user_priv` +// parameter is NULL. The `deadline` parameter is left at zero for this +// example. This parameter is generally only used when doing adaptive post +// processing. +// +// Codecs may produce a variable number of output frames for every call to +// `aom_codec_decode`. These frames are retrieved by the +// `aom_codec_get_frame` iterator function. The iterator variable `iter` is +// initialized to NULL each time `aom_codec_decode` is called. +// `aom_codec_get_frame` is called in a loop, returning a pointer to a +// decoded image or NULL to indicate the end of list. +// +// Processing The Decoded Data +// --------------------------- +// In this example, we simply write the encoded data to disk. It is +// important to honor the image's `stride` values. +// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. 
With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. + +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "common/obudec.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +#define MAX_LAYERS 5 + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile[MAX_LAYERS]; + char filename[80]; + FILE *inputfile = NULL; + uint8_t *buf = NULL; + size_t bytes_in_buffer = 0; + size_t buffer_size = 0; + struct AvxInputContext aom_input_ctx; + struct ObuDecInputContext obu_ctx = { &aom_input_ctx, NULL, 0, 0, 0 }; + aom_codec_stream_info_t si; + uint8_t tmpbuf[32]; + unsigned int i; + + exec_name = argv[0]; + + if (argc != 2) die("Invalid number of arguments."); + + if (!(inputfile = fopen(argv[1], "rb"))) + die("Failed to open %s for reading.", argv[1]); + obu_ctx.avx_ctx->file = inputfile; + obu_ctx.avx_ctx->filename = argv[1]; + + aom_codec_iface_t *decoder = get_aom_decoder_by_index(0); + printf("Using %s\n", aom_codec_iface_name(decoder)); + + aom_codec_ctx_t codec; + if (aom_codec_dec_init(&codec, decoder, NULL, 0)) + die("Failed to initialize decoder."); + + if (aom_codec_control(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, 1)) { + die_codec(&codec, "Failed to set output_all_layers control."); + } + + // peek sequence header OBU to get number of spatial layers + const size_t ret = fread(tmpbuf, 1, 32, inputfile); + if (ret != 32) die_codec(&codec, "Input is not a valid obu file"); + si.is_annexb = 0; + if (aom_codec_peek_stream_info(decoder, tmpbuf, 32, &si)) { + die_codec(&codec, "Input is not a valid obu file"); + } + fseek(inputfile, -32, SEEK_CUR); + + if (!file_is_obu(&obu_ctx)) + die_codec(&codec, "Input is not a valid obu file"); + + // open base layer output yuv file + snprintf(filename, sizeof(filename), "out_lyr%d.yuv", 0); + if (!(outfile[0] = fopen(filename, "wb"))) + die("Failed to open output for writing."); + + // open any enhancement layer output yuv files + for (i = 1; i < si.number_spatial_layers; i++) { + snprintf(filename, sizeof(filename), "out_lyr%u.yuv", i); + if (!(outfile[i] = fopen(filename, "wb"))) + die("Failed to open output for writing."); + } + + while (!obudec_read_temporal_unit(&obu_ctx, &buf, &bytes_in_buffer, + &buffer_size)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + if (aom_codec_decode(&codec, buf, bytes_in_buffer, NULL)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + aom_image_t *img_shifted = + aom_img_alloc(NULL, AOM_IMG_FMT_I420, img->d_w, img->d_h, 16); + img_shifted->bit_depth = 8; + aom_img_downshift(img_shifted, img, + img->bit_depth - img_shifted->bit_depth); + if (img->spatial_id == 0) { + printf("Writing base layer 0 %d\n", frame_cnt); + aom_img_write(img_shifted, outfile[0]); + } else if (img->spatial_id <= (int)(si.number_spatial_layers - 1)) { + printf("Writing enhancement layer %d %d\n", img->spatial_id, frame_cnt); + aom_img_write(img_shifted, outfile[img->spatial_id]); + } else { + die_codec(&codec, "Invalid bitstream.
Layer id exceeds layer count"); + } + if (img->spatial_id == (int)(si.number_spatial_layers - 1)) ++frame_cnt; + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + + for (i = 0; i < si.number_spatial_layers; i++) fclose(outfile[i]); + + fclose(inputfile); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/scalable_encoder.c b/third_party/aom/examples/scalable_encoder.c new file mode 100644 index 0000000000..5bfd1840b2 --- /dev/null +++ b/third_party/aom/examples/scalable_encoder.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Scalable Encoder +// ============== +// +// This is an example of a scalable encoder loop. It takes two input files in +// YV12 format, passes it through the encoder, and writes the compressed +// frames to disk in OBU format. +// +// Getting The Default Configuration +// --------------------------------- +// Encoders have the notion of "usage profiles." For example, an encoder +// may want to publish default configurations for both a video +// conferencing application and a best quality offline encoder. These +// obviously have very different default settings. Consult the +// documentation for your codec to see if it provides any default +// configurations. All codecs provide a default configuration, number 0, +// which is valid for material in the vacinity of QCIF/QVGA. +// +// Updating The Configuration +// --------------------------------- +// Almost all applications will want to update the default configuration +// with settings specific to their usage. Here we set the width and height +// of the video file to that specified on the command line. We also scale +// the default bitrate based on the ratio between the default resolution +// and the resolution specified on the command line. +// +// Encoding A Frame +// ---------------- +// The frame is read as a continuous block (size = width * height * 3 / 2) +// from the input file. If a frame was read (the input file has not hit +// EOF) then the frame is passed to the encoder. Otherwise, a NULL +// is passed, indicating the End-Of-Stream condition to the encoder. The +// `frame_cnt` is reused as the presentation time stamp (PTS) and each +// frame is shown for one frame-time in duration. The flags parameter is +// unused in this example. + +// Forced Keyframes +// ---------------- +// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the +// flags passed to `aom_codec_control()`. In this example, we force a +// keyframe every frames. Note, the output stream can +// contain additional keyframes beyond those that have been forced using the +// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the +// encoder. +// +// Processing The Encoded Data +// --------------------------- +// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data +// for this frame. We write a IVF frame header, followed by the raw data. 
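+//
+// Layer Reference Structure
+// -------------------------
+// As a summary of the flag pattern used in the encode loop below (this only
+// restates that code, it adds no behaviour): except on forced keyframes, the
+// base layer predicts only from LAST and is the only layer that refreshes
+// LAST, while the enhancement layer reuses the same reference restrictions
+// and additionally suppresses every update, i.e. roughly
+//
+//   base_flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+//                AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+//                AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+//                AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+//                AOM_EFLAG_NO_UPD_ENTROPY;
+//   enhancement_flags = base_flags | AOM_EFLAG_NO_UPD_LAST;
+//
+// (base_flags/enhancement_flags are illustrative names; the loop below builds
+// the same combinations inline.)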
+// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exeptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. + +#include +#include +#include + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "av1/common/enums.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s " + " \n" + "See comments in scalable_encoder.c for more information.\n", + exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, int flags, FILE *outfile) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = + aom_codec_encode(codec, img, frame_index, 1, flags); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile) != + pkt->data.frame.sz) { + die_codec(codec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + printf(" %6d\n", (int)pkt->data.frame.sz); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile0 = NULL; + FILE *infile1 = NULL; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + aom_image_t raw0, raw1; + aom_codec_err_t res; + AvxVideoInfo info; + const int fps = 30; + const int bitrate = 200; + int keyframe_interval = 0; + int max_frames = 0; + int frames_encoded = 0; + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile0_arg = NULL; + const char *infile1_arg = NULL; + const char *outfile_arg = NULL; + // const char *keyframe_interval_arg = NULL; + FILE *outfile = NULL; + + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers. 
+ memset(&info, 0, sizeof(info)); + + if (argc != 8) die("Invalid number of arguments"); + + codec_arg = argv[1]; + width_arg = argv[2]; + height_arg = argv[3]; + infile0_arg = argv[4]; + infile1_arg = argv[5]; + outfile_arg = argv[6]; + max_frames = (int)strtol(argv[7], NULL, 0); + + aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw0, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image for layer 0."); + } + if (!aom_img_alloc(&raw1, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image for layer 1."); + } + + // keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); + keyframe_interval = 100; + if (keyframe_interval < 0) die("Invalid keyframe interval value."); + + printf("Using %s\n", aom_codec_iface_name(encoder)); + + aom_codec_ctx_t codec; + res = aom_codec_enc_config_default(encoder, &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + cfg.g_error_resilient = 0; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = AOM_Q; + cfg.save_as_annexb = 0; + + outfile = fopen(outfile_arg, "wb"); + if (!outfile) die("Failed to open %s for writing.", outfile_arg); + + if (!(infile0 = fopen(infile0_arg, "rb"))) + die("Failed to open %s for reading.", infile0_arg); + if (!(infile1 = fopen(infile1_arg, "rb"))) + die("Failed to open %s for reading.", infile0_arg); + + if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) + die("Failed to initialize encoder"); + if (aom_codec_control(&codec, AOME_SET_CPUUSED, 8)) + die_codec(&codec, "Failed to set cpu to 8"); + + if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 2)) + die_codec(&codec, "Failed to set tile columns to 2"); + if (aom_codec_control(&codec, AV1E_SET_NUM_TG, 3)) + die_codec(&codec, "Failed to set num of tile groups to 3"); + + if (aom_codec_control(&codec, AOME_SET_NUMBER_SPATIAL_LAYERS, 2)) + die_codec(&codec, "Failed to set number of spatial layers to 2"); + + // Encode frames. 
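+ // Each pass of the loop below reads one frame from each input file and
+ // produces two encoded frames: the base layer first, then the enhancement
+ // layer, each with its own spatial layer id, CQ level and reference flags.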
+ while (aom_img_read(&raw0, infile0)) { + int flags = 0; + + // configure and encode base layer + + if (keyframe_interval > 0 && frames_encoded % keyframe_interval == 0) + flags |= AOM_EFLAG_FORCE_KF; + else + // use previous base layer (LAST) as sole reference + // save this frame as LAST to be used as reference by enhanmcent layer + // and next base layer + flags |= AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | + AOM_EFLAG_NO_UPD_ENTROPY; + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + if (aom_codec_enc_config_set(&codec, &cfg)) + die_codec(&codec, "Failed to set enc cfg for layer 0"); + if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 0)) + die_codec(&codec, "Failed to set layer id to 0"); + if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 62)) + die_codec(&codec, "Failed to set cq level"); + encode_frame(&codec, &raw0, frame_count++, flags, outfile); + + // configure and encode enhancement layer + + // use LAST (base layer) as sole reference + flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST | + AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | + AOM_EFLAG_NO_UPD_ENTROPY; + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + aom_img_read(&raw1, infile1); + if (aom_codec_enc_config_set(&codec, &cfg)) + die_codec(&codec, "Failed to set enc cfg for layer 1"); + if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 1)) + die_codec(&codec, "Failed to set layer id to 1"); + if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 10)) + die_codec(&codec, "Failed to set cq level"); + encode_frame(&codec, &raw1, frame_count++, flags, outfile); + + frames_encoded++; + + if (max_frames > 0 && frames_encoded >= max_frames) break; + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, outfile)) continue; + + printf("\n"); + fclose(infile0); + fclose(infile1); + printf("Processed %d frames.\n", frame_count / 2); + + aom_img_free(&raw0); + aom_img_free(&raw1); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/set_maps.c b/third_party/aom/examples/set_maps.c new file mode 100644 index 0000000000..2593faba34 --- /dev/null +++ b/third_party/aom/examples/set_maps.c @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// AOM Set Active and ROI Maps +// =========================== +// +// This is an example demonstrating how to control the AOM encoder's +// ROI and Active maps. +// +// ROI (Region of Interest) maps are a way for the application to assign +// each macroblock in the image to a region, and then set quantizer and +// filtering parameters on that image. 
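+// (The ROI map structure itself is aom_roi_map_t, declared in aom/aomcx.h;
+// note that the encode loop further below only exercises the active map
+// control, AOME_SET_ACTIVEMAP.)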
+// +// Active maps are a way for the application to specify on a +// macroblock-by-macroblock basis whether there is any activity in that +// macroblock. +// +// +// Configuration +// ------------- +// An ROI map is set on frame 22. If the width of the image in macroblocks +// is evenly divisible by 4, then the output will appear to have distinct +// columns, where the quantizer, loopfilter, and static threshold differ +// from column to column. +// +// An active map is set on frame 33. If the width of the image in macroblocks +// is evenly divisible by 4, then the output will appear to have distinct +// columns, where one column will have motion and the next will not. +// +// The active map is cleared on frame 44. +// +// Observing The Effects +// --------------------- +// Use the `simple_decoder` example to decode this sample, and observe +// the change in the image at frames 22, 33, and 44. + +#include +#include +#include +#include + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", + exec_name); + exit(EXIT_FAILURE); +} + +static void set_active_map(const aom_codec_enc_cfg_t *cfg, + aom_codec_ctx_t *codec) { + unsigned int i; + aom_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + + map.active_map = (uint8_t *)malloc(map.rows * map.cols); + if (!map.active_map) die("Failed to allocate active map"); + for (i = 0; i < map.rows * map.cols; ++i) map.active_map[i] = i % 2; + + if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); + + free(map.active_map); +} + +static void unset_active_map(const aom_codec_enc_cfg_t *cfg, + aom_codec_ctx_t *codec) { + aom_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + map.active_map = NULL; + + if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, AvxVideoWriter *writer) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = aom_codec_encode(codec, img, frame_index, 1, 0); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + + printf(keyframe ? 
"K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + const int limit = 10; + aom_image_t raw; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const int fps = 2; // TODO(dkovalev) add command line argument + const double bits_per_pixel_per_frame = 0.067; + +#if CONFIG_REALTIME_ONLY + const int usage = 1; + const int speed = 7; +#else + const int usage = 0; + const int speed = 2; +#endif + + exec_name = argv[0]; + if (argc != 6) die("Invalid number of arguments"); + + memset(&info, 0, sizeof(info)); + + aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(argv[1]); + if (encoder == NULL) { + die("Unsupported codec."); + } + assert(encoder != NULL); + info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); + info.frame_width = (int)strtol(argv[2], NULL, 0); + info.frame_height = (int)strtol(argv[3], NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", aom_codec_iface_name(encoder)); + + res = aom_codec_enc_config_default(encoder, &cfg, usage); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = + (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * cfg.g_h * fps / 1000); + cfg.g_lag_in_frames = 0; + + writer = aom_video_writer_open(argv[5], kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", argv[5]); + + if (!(infile = fopen(argv[4], "rb"))) + die("Failed to open %s for reading.", argv[4]); + + if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) + die("Failed to initialize encoder"); + + if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed)) + die_codec(&codec, "Failed to set cpu-used"); + + // Encode frames. + while (aom_img_read(&raw, infile) && frame_count < limit) { + ++frame_count; + + if (frame_count == 5) { + set_active_map(&cfg, &codec); + } else if (frame_count == 9) { + unset_active_map(&cfg, &codec); + } + + encode_frame(&codec, &raw, frame_count, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + aom_img_free(&raw); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/simple_decoder.c b/third_party/aom/examples/simple_decoder.c new file mode 100644 index 0000000000..b6891dcbba --- /dev/null +++ b/third_party/aom/examples/simple_decoder.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Simple Decoder +// ============== +// +// This is an example of a simple decoder loop. It takes an input file +// containing the compressed data (in IVF format), passes it through the +// decoder, and writes the decompressed frames to disk. Other decoder +// examples build upon this one. +// +// The details of the IVF format have been elided from this example for +// simplicity of presentation, as IVF files will not generally be used by +// your application. In general, an IVF file consists of a file header, +// followed by a variable number of frames. Each frame consists of a frame +// header followed by a variable length payload. The length of the payload +// is specified in the first four bytes of the frame header. The payload is +// the raw compressed data. +// +// Standard Includes +// ----------------- +// For decoders, you only have to include `aom_decoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// aom. +// +// Initializing The Codec +// ---------------------- +// The libaom decoder is initialized by the call to aom_codec_dec_init(). +// Determining the codec interface to use is handled by AvxVideoReader and the +// functions prefixed with aom_video_reader_. Discussion of those functions is +// beyond the scope of this example, but the main gist is to open the input file +// and parse just enough of it to determine if it's a AVx file and which AVx +// codec is contained within the file. +// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this +// example because we want the algorithm to determine the stream configuration +// (width/height) and allocate memory automatically. +// +// Decoding A Frame +// ---------------- +// Once the frame has been read into memory, it is decoded using the +// `aom_codec_decode` function. The call takes a pointer to the data +// (`frame`) and the length of the data (`frame_size`). No application data +// is associated with the frame in this example, so the `user_priv` +// parameter is NULL. +// +// Codecs may produce a variable number of output frames for every call to +// `aom_codec_decode`. These frames are retrieved by the +// `aom_codec_get_frame` iterator function. The iterator variable `iter` is +// initialized to NULL each time `aom_codec_decode` is called. +// `aom_codec_get_frame` is called in a loop, returning a pointer to a +// decoded image or NULL to indicate the end of list. +// +// Processing The Decoded Data +// --------------------------- +// In this example, we simply write the encoded data to disk. It is +// important to honor the image's `stride` values. +// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. 
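+//
+// As a minimal sketch of what "honoring the stride" in the Processing The
+// Decoded Data section above means for an 8-bit image (aom_img_write() below
+// already does this for every plane; the loop here is only illustrative):
+//
+//   for (int plane = 0; plane < 3; ++plane) {
+//     const unsigned char *buf = img->planes[plane];
+//     const int stride = img->stride[plane];
+//     const int w = aom_img_plane_width(img, plane);
+//     const int h = aom_img_plane_height(img, plane);
+//     for (int y = 0; y < h; ++y) {
+//       fwrite(buf, 1, w, outfile);
+//       buf += stride;  // rows are stride bytes apart; only w bytes are pixels
+//     }
+//   }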
+ +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + AvxVideoReader *reader = NULL; + const AvxVideoInfo *info = NULL; + + exec_name = argv[0]; + + if (argc != 3) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + info = aom_video_reader_get_info(reader); + + aom_codec_iface_t *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", aom_codec_iface_name(decoder)); + + aom_codec_ctx_t codec; + if (aom_codec_dec_init(&codec, decoder, NULL, 0)) + die("Failed to initialize decoder."); + + while (aom_video_reader_read_frame(reader)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + aom_img_write(img, outfile); + ++frame_cnt; + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + + printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", + info->frame_width, info->frame_height, argv[2]); + + aom_video_reader_close(reader); + + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/simple_encoder.c b/third_party/aom/examples/simple_encoder.c new file mode 100644 index 0000000000..c026706555 --- /dev/null +++ b/third_party/aom/examples/simple_encoder.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Simple Encoder +// ============== +// +// This is an example of a simple encoder loop. It takes an input file in +// YV12 format, passes it through the encoder, and writes the compressed +// frames to disk in IVF format. Other decoder examples build upon this +// one. +// +// The details of the IVF format have been elided from this example for +// simplicity of presentation, as IVF files will not generally be used by +// your application. In general, an IVF file consists of a file header, +// followed by a variable number of frames. Each frame consists of a frame +// header followed by a variable length payload. The length of the payload +// is specified in the first four bytes of the frame header. The payload is +// the raw compressed data. +// +// Standard Includes +// ----------------- +// For encoders, you only have to include `aom_encoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// aom. 
+// +// Getting The Default Configuration +// --------------------------------- +// Encoders have the notion of "usage profiles." For example, an encoder +// may want to publish default configurations for both a video +// conferencing application and a best quality offline encoder. These +// obviously have very different default settings. Consult the +// documentation for your codec to see if it provides any default +// configurations. All codecs provide a default configuration, number 0, +// which is valid for material in the vicinity of QCIF/QVGA. +// +// Updating The Configuration +// --------------------------------- +// Almost all applications will want to update the default configuration +// with settings specific to their usage. Here we set the width and height +// of the video file to that specified on the command line. We also scale +// the default bitrate based on the ratio between the default resolution +// and the resolution specified on the command line. +// +// Initializing The Codec +// ---------------------- +// The encoder is initialized by the following code. +// +// Encoding A Frame +// ---------------- +// The frame is read as a continuous block (size = width * height * 3 / 2) +// from the input file. If a frame was read (the input file has not hit +// EOF) then the frame is passed to the encoder. Otherwise, a NULL +// is passed, indicating the End-Of-Stream condition to the encoder. The +// `frame_cnt` is reused as the presentation time stamp (PTS) and each +// frame is shown for one frame-time in duration. The flags parameter is +// unused in this example. + +// Forced Keyframes +// ---------------- +// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the +// flags passed to `aom_codec_encode()`. In this example, we force a +// keyframe every `keyframe_interval` frames. Note, the output stream can +// contain additional keyframes beyond those that have been forced using the +// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the +// encoder. +// +// Processing The Encoded Data +// --------------------------- +// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data +// for this frame. We write an IVF frame header, followed by the raw data. +// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. +// +// Error Resiliency Features +// ------------------------- +// Error resiliency is controlled by the g_error_resilient member of the +// configuration structure. Use the `decode_with_drops` example to decode with +// frames 5-10 dropped. Compare the output for a file encoded with this example +// versus one encoded with the `simple_encoder` example.
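+//
+// As a concrete illustration of the Forced Keyframes section above (this
+// mirrors the encode loop below rather than adding anything new), the flag is
+// passed through the flags argument of aom_codec_encode():
+//
+//   if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
+//     flags |= AOM_EFLAG_FORCE_KF;
+//   aom_codec_encode(&codec, &raw, frame_count, 1, flags);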
+ +#include +#include +#include + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s " + " \n" + "See comments in simple_encoder.c for more information.\n", + exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, int flags, AvxVideoWriter *writer) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = + aom_codec_encode(codec, img, frame_index, 1, flags); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps. +int main(int argc, char **argv) { + FILE *infile = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + aom_image_t raw; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const int fps = 30; + const int bitrate = 200; + int keyframe_interval = 0; + int max_frames = 0; + int frames_encoded = 0; + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile_arg = NULL; + const char *outfile_arg = NULL; + const char *keyframe_interval_arg = NULL; +#if CONFIG_REALTIME_ONLY + const int usage = 1; + const int speed = 7; +#else + const int usage = 0; + const int speed = 2; +#endif + + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers. 
+ memset(&info, 0, sizeof(info)); + + if (argc != 9) die("Invalid number of arguments"); + + codec_arg = argv[1]; + width_arg = argv[2]; + height_arg = argv[3]; + infile_arg = argv[4]; + outfile_arg = argv[5]; + keyframe_interval_arg = argv[6]; + max_frames = (int)strtol(argv[8], NULL, 0); + + aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); + if (keyframe_interval < 0) die("Invalid keyframe interval value."); + + printf("Using %s\n", aom_codec_iface_name(encoder)); + + res = aom_codec_enc_config_default(encoder, &cfg, usage); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + cfg.g_error_resilient = (aom_codec_er_flags_t)strtoul(argv[7], NULL, 0); + + writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", outfile_arg); + + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading.", infile_arg); + + if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) + die("Failed to initialize encoder"); + + if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed)) + die_codec(&codec, "Failed to set cpu-used"); + + // Encode frames. + while (aom_img_read(&raw, infile)) { + int flags = 0; + if (keyframe_interval > 0 && frame_count % keyframe_interval == 0) + flags |= AOM_EFLAG_FORCE_KF; + encode_frame(&codec, &raw, frame_count++, flags, writer); + frames_encoded++; + if (max_frames > 0 && frames_encoded >= max_frames) break; + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, writer)) continue; + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + aom_img_free(&raw); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/svc_encoder_rtc.cc b/third_party/aom/examples/svc_encoder_rtc.cc new file mode 100644 index 0000000000..2c041081e5 --- /dev/null +++ b/third_party/aom/examples/svc_encoder_rtc.cc @@ -0,0 +1,2062 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This is an example demonstrating how to implement a multi-layer AOM +// encoding scheme for RTC video applications. 
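+//
+// At a high level, each frame selects its spatial/temporal layer before being
+// encoded. The sketch below is only illustrative (variable names and the
+// layer-selection logic are placeholders, not code taken verbatim from this
+// file); the example's own per-frame logic appears later in this file:
+//
+//   aom_svc_layer_id_t layer_id = { 0, 0 };
+//   layer_id.spatial_layer_id = sl;   // placeholder spatial layer index
+//   layer_id.temporal_layer_id = tl;  // placeholder temporal layer index
+//   aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
+//   aom_codec_encode(&codec, &raw, pts, 1, flags);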
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <memory>
+
+#include "config/aom_config.h"
+
+#if CONFIG_AV1_DECODER
+#include "aom/aom_decoder.h"
+#endif
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+#include "examples/encoder_util.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/ratectrl_rtc.h"
+
+#define OPTION_BUFFER_SIZE 1024
+
+typedef struct {
+  const char *output_filename;
+  char options[OPTION_BUFFER_SIZE];
+  struct AvxInputContext input_ctx;
+  int speed;
+  int aq_mode;
+  int layering_mode;
+  int output_obu;
+  int decode;
+  int tune_content;
+  int show_psnr;
+  bool use_external_rc;
+} AppInput;
+
+typedef enum {
+  QUANTIZER = 0,
+  BITRATE,
+  SCALE_FACTOR,
+  AUTO_ALT_REF,
+  ALL_OPTION_TYPES
+} LAYER_OPTION_TYPE;
+
+static const arg_def_t outputfile =
+    ARG_DEF("o", "output", 1, "Output filename");
+static const arg_def_t frames_arg =
+    ARG_DEF("f", "frames", 1, "Number of frames to encode");
+static const arg_def_t threads_arg =
+    ARG_DEF("th", "threads", 1, "Number of threads to use");
+static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "Source width");
+static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "Source height");
+static const arg_def_t timebase_arg =
+    ARG_DEF("t", "timebase", 1, "Timebase (num/den)");
+static const arg_def_t bitrate_arg = ARG_DEF(
+    "b", "target-bitrate", 1, "Encoding bitrate, in kilobits per second");
+static const arg_def_t spatial_layers_arg =
+    ARG_DEF("sl", "spatial-layers", 1, "Number of spatial SVC layers");
+static const arg_def_t temporal_layers_arg =
+    ARG_DEF("tl", "temporal-layers", 1, "Number of temporal SVC layers");
+static const arg_def_t layering_mode_arg =
+    ARG_DEF("lm", "layering-mode", 1, "Temporal layering scheme.");
+static const arg_def_t kf_dist_arg =
+    ARG_DEF("k", "kf-dist", 1, "Number of frames between keyframes");
+static const arg_def_t scale_factors_arg =
+    ARG_DEF("r", "scale-factors", 1, "Scale factors (lowest to highest layer)");
+static const arg_def_t min_q_arg =
+    ARG_DEF(NULL, "min-q", 1, "Minimum quantizer");
+static const arg_def_t max_q_arg =
+    ARG_DEF(NULL, "max-q", 1, "Maximum quantizer");
+static const arg_def_t speed_arg =
+    ARG_DEF("sp", "speed", 1, "Speed configuration");
+static const arg_def_t aqmode_arg =
+    ARG_DEF("aq", "aqmode", 1, "AQ mode off/on");
+static const arg_def_t bitrates_arg =
+    ARG_DEF("bl", "bitrates", 1,
+            "Bitrates[spatial_layer * num_temporal_layer + temporal_layer]");
+static const arg_def_t dropframe_thresh_arg =
+    ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
+static const arg_def_t error_resilient_arg =
+    ARG_DEF(NULL, "error-resilient", 1, "Error resilient flag");
+static const arg_def_t output_obu_arg =
+    ARG_DEF(NULL, "output-obu", 1,
+            "Write OBUs when set to 1. Otherwise write IVF files.");
+static const arg_def_t test_decode_arg =
+    ARG_DEF(NULL, "test-decode", 1,
+            "Attempt to test decoding the output when set to 1. 
Default is 1."); +static const arg_def_t psnr_arg = + ARG_DEF(NULL, "psnr", -1, "Show PSNR in status line."); +static const arg_def_t ext_rc_arg = + ARG_DEF(NULL, "use-ext-rc", 0, "Use external rate control."); +static const struct arg_enum_list tune_content_enum[] = { + { "default", AOM_CONTENT_DEFAULT }, + { "screen", AOM_CONTENT_SCREEN }, + { "film", AOM_CONTENT_FILM }, + { NULL, 0 } +}; +static const arg_def_t tune_content_arg = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); + +#if CONFIG_AV1_HIGHBITDEPTH +static const struct arg_enum_list bitdepth_enum[] = { { "8", AOM_BITS_8 }, + { "10", AOM_BITS_10 }, + { NULL, 0 } }; + +static const arg_def_t bitdepth_arg = ARG_DEF_ENUM( + "d", "bit-depth", 1, "Bit depth for codec 8 or 10. ", bitdepth_enum); +#endif // CONFIG_AV1_HIGHBITDEPTH + +static const arg_def_t *svc_args[] = { + &frames_arg, &outputfile, &width_arg, + &height_arg, &timebase_arg, &bitrate_arg, + &spatial_layers_arg, &kf_dist_arg, &scale_factors_arg, + &min_q_arg, &max_q_arg, &temporal_layers_arg, + &layering_mode_arg, &threads_arg, &aqmode_arg, +#if CONFIG_AV1_HIGHBITDEPTH + &bitdepth_arg, +#endif + &speed_arg, &bitrates_arg, &dropframe_thresh_arg, + &error_resilient_arg, &output_obu_arg, &test_decode_arg, + &tune_content_arg, &psnr_arg, NULL, +}; + +#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)) + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s input_filename -o output_filename\n", + exec_name); + fprintf(stderr, "Options:\n"); + arg_show_usage(stderr, svc_args); + exit(EXIT_FAILURE); +} + +static int file_is_y4m(const char detect[4]) { + return memcmp(detect, "YUV4", 4) == 0; +} + +static int fourcc_is_ivf(const char detect[4]) { + if (memcmp(detect, "DKIF", 4) == 0) { + return 1; + } + return 0; +} + +static const int option_max_values[ALL_OPTION_TYPES] = { 63, INT_MAX, INT_MAX, + 1 }; + +static const int option_min_values[ALL_OPTION_TYPES] = { 0, 0, 1, 0 }; + +static void open_input_file(struct AvxInputContext *input, + aom_chroma_sample_position_t csp) { + /* Parse certain options from the input file, if possible */ + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); + + if (!input->file) fatal("Failed to open input file"); + + if (!fseeko(input->file, 0, SEEK_END)) { + /* Input file is seekable. Figure out how long it is, so we can get + * progress info. + */ + input->length = ftello(input->file); + rewind(input->file); + } + + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + + /* For RAW input sources, these bytes will applied on the first frame + * in read_frame(). 
+ */ + input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); + input->detect.position = 0; + + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { + if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp, + input->only_i420) >= 0) { + input->file_type = FILE_TYPE_Y4M; + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.aom_fmt; + input->bit_depth = static_cast(input->y4m.bit_depth); + } else { + fatal("Unsupported Y4M stream."); + } + } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { + fatal("IVF is not supported as input."); + } else { + input->file_type = FILE_TYPE_RAW; + } +} + +static aom_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input, + int *value0, int *value1) { + if (type == SCALE_FACTOR) { + *value0 = (int)strtol(input, &input, 10); + if (*input++ != '/') return AOM_CODEC_INVALID_PARAM; + *value1 = (int)strtol(input, &input, 10); + + if (*value0 < option_min_values[SCALE_FACTOR] || + *value1 < option_min_values[SCALE_FACTOR] || + *value0 > option_max_values[SCALE_FACTOR] || + *value1 > option_max_values[SCALE_FACTOR] || + *value0 > *value1) // num shouldn't be greater than den + return AOM_CODEC_INVALID_PARAM; + } else { + *value0 = atoi(input); + if (*value0 < option_min_values[type] || *value0 > option_max_values[type]) + return AOM_CODEC_INVALID_PARAM; + } + return AOM_CODEC_OK; +} + +static aom_codec_err_t parse_layer_options_from_string( + aom_svc_params_t *svc_params, LAYER_OPTION_TYPE type, const char *input, + int *option0, int *option1) { + aom_codec_err_t res = AOM_CODEC_OK; + char *input_string; + char *token; + const char *delim = ","; + int num_layers = svc_params->number_spatial_layers; + int i = 0; + + if (type == BITRATE) + num_layers = + svc_params->number_spatial_layers * svc_params->number_temporal_layers; + + if (input == NULL || option0 == NULL || + (option1 == NULL && type == SCALE_FACTOR)) + return AOM_CODEC_INVALID_PARAM; + + const size_t input_length = strlen(input); + input_string = reinterpret_cast(malloc(input_length + 1)); + if (input_string == NULL) return AOM_CODEC_MEM_ERROR; + memcpy(input_string, input, input_length + 1); + token = strtok(input_string, delim); // NOLINT + for (i = 0; i < num_layers; ++i) { + if (token != NULL) { + res = extract_option(type, token, option0 + i, option1 + i); + if (res != AOM_CODEC_OK) break; + token = strtok(NULL, delim); // NOLINT + } else { + res = AOM_CODEC_INVALID_PARAM; + break; + } + } + free(input_string); + return res; +} + +static void parse_command_line(int argc, const char **argv_, + AppInput *app_input, + aom_svc_params_t *svc_params, + aom_codec_enc_cfg_t *enc_cfg) { + struct arg arg; + char **argv = NULL; + char **argi = NULL; + char **argj = NULL; + char string_options[1024] = { 0 }; + + // Default settings + svc_params->number_spatial_layers = 1; + svc_params->number_temporal_layers = 1; + app_input->layering_mode = 0; + app_input->output_obu = 0; + app_input->decode = 1; + enc_cfg->g_threads = 1; + enc_cfg->rc_end_usage = AOM_CBR; + + // process command line options + argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + exit(EXIT_FAILURE); + } + for (argi = argj = argv; (*argj = *argi); argi += 
arg.argv_step) { + arg.argv_step = 1; + + if (arg_match(&arg, &outputfile, argi)) { + app_input->output_filename = arg.val; + } else if (arg_match(&arg, &width_arg, argi)) { + enc_cfg->g_w = arg_parse_uint(&arg); + } else if (arg_match(&arg, &height_arg, argi)) { + enc_cfg->g_h = arg_parse_uint(&arg); + } else if (arg_match(&arg, &timebase_arg, argi)) { + enc_cfg->g_timebase = arg_parse_rational(&arg); + } else if (arg_match(&arg, &bitrate_arg, argi)) { + enc_cfg->rc_target_bitrate = arg_parse_uint(&arg); + } else if (arg_match(&arg, &spatial_layers_arg, argi)) { + svc_params->number_spatial_layers = arg_parse_uint(&arg); + } else if (arg_match(&arg, &temporal_layers_arg, argi)) { + svc_params->number_temporal_layers = arg_parse_uint(&arg); + } else if (arg_match(&arg, &speed_arg, argi)) { + app_input->speed = arg_parse_uint(&arg); + if (app_input->speed > 11) { + aom_tools_warn("Mapping speed %d to speed 11.\n", app_input->speed); + } + } else if (arg_match(&arg, &aqmode_arg, argi)) { + app_input->aq_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &threads_arg, argi)) { + enc_cfg->g_threads = arg_parse_uint(&arg); + } else if (arg_match(&arg, &layering_mode_arg, argi)) { + app_input->layering_mode = arg_parse_int(&arg); + } else if (arg_match(&arg, &kf_dist_arg, argi)) { + enc_cfg->kf_min_dist = arg_parse_uint(&arg); + enc_cfg->kf_max_dist = enc_cfg->kf_min_dist; + } else if (arg_match(&arg, &scale_factors_arg, argi)) { + aom_codec_err_t res = parse_layer_options_from_string( + svc_params, SCALE_FACTOR, arg.val, svc_params->scaling_factor_num, + svc_params->scaling_factor_den); + if (res != AOM_CODEC_OK) { + die("Failed to parse scale factors: %s\n", + aom_codec_err_to_string(res)); + } + } else if (arg_match(&arg, &min_q_arg, argi)) { + enc_cfg->rc_min_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &max_q_arg, argi)) { + enc_cfg->rc_max_quantizer = arg_parse_uint(&arg); +#if CONFIG_AV1_HIGHBITDEPTH + } else if (arg_match(&arg, &bitdepth_arg, argi)) { + enc_cfg->g_bit_depth = + static_cast(arg_parse_enum_or_int(&arg)); + switch (enc_cfg->g_bit_depth) { + case AOM_BITS_8: + enc_cfg->g_input_bit_depth = 8; + enc_cfg->g_profile = 0; + break; + case AOM_BITS_10: + enc_cfg->g_input_bit_depth = 10; + enc_cfg->g_profile = 0; + break; + default: + die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) { + enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &error_resilient_arg, argi)) { + enc_cfg->g_error_resilient = arg_parse_uint(&arg); + if (enc_cfg->g_error_resilient != 0 && enc_cfg->g_error_resilient != 1) + die("Invalid value for error resilient (0, 1): %d.", + enc_cfg->g_error_resilient); + } else if (arg_match(&arg, &output_obu_arg, argi)) { + app_input->output_obu = arg_parse_uint(&arg); + if (app_input->output_obu != 0 && app_input->output_obu != 1) + die("Invalid value for obu output flag (0, 1): %d.", + app_input->output_obu); + } else if (arg_match(&arg, &test_decode_arg, argi)) { + app_input->decode = arg_parse_uint(&arg); + if (app_input->decode != 0 && app_input->decode != 1) + die("Invalid value for test decode flag (0, 1): %d.", + app_input->decode); + } else if (arg_match(&arg, &tune_content_arg, argi)) { + app_input->tune_content = arg_parse_enum_or_int(&arg); + printf("tune content %d\n", app_input->tune_content); + } else if (arg_match(&arg, &psnr_arg, argi)) { + app_input->show_psnr = 1; + } else if 
(arg_match(&arg, &ext_rc_arg, argi)) { + app_input->use_external_rc = true; + } else { + ++argj; + } + } + + // Total bitrate needs to be parsed after the number of layers. + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + if (arg_match(&arg, &bitrates_arg, argi)) { + aom_codec_err_t res = parse_layer_options_from_string( + svc_params, BITRATE, arg.val, svc_params->layer_target_bitrate, NULL); + if (res != AOM_CODEC_OK) { + die("Failed to parse bitrates: %s\n", aom_codec_err_to_string(res)); + } + } else { + ++argj; + } + } + + // There will be a space in front of the string options + if (strlen(string_options) > 0) + strncpy(app_input->options, string_options, OPTION_BUFFER_SIZE); + + // Check for unrecognized options + for (argi = argv; *argi; ++argi) + if (argi[0][0] == '-' && strlen(argi[0]) > 1) + die("Error: Unrecognized option %s\n", *argi); + + if (argv[0] == NULL) { + usage_exit(); + } + + app_input->input_ctx.filename = argv[0]; + free(argv); + + open_input_file(&app_input->input_ctx, AOM_CSP_UNKNOWN); + if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) { + enc_cfg->g_w = app_input->input_ctx.width; + enc_cfg->g_h = app_input->input_ctx.height; + } + + if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 || + enc_cfg->g_h % 2) + die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h); + + printf( + "Codec %s\n" + "layers: %d\n" + "width %u, height: %u\n" + "num: %d, den: %d, bitrate: %u\n" + "gop size: %u\n", + aom_codec_iface_name(aom_codec_av1_cx()), + svc_params->number_spatial_layers, enc_cfg->g_w, enc_cfg->g_h, + enc_cfg->g_timebase.num, enc_cfg->g_timebase.den, + enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist); +} + +static int mode_to_num_temporal_layers[12] = { + 1, 2, 3, 3, 2, 1, 1, 3, 3, 3, 3, 3, +}; +static int mode_to_num_spatial_layers[12] = { + 1, 1, 1, 1, 1, 2, 3, 2, 3, 3, 3, 3, +}; + +// For rate control encoding stats. +struct RateControlMetrics { + // Number of input frames per layer. + int layer_input_frames[AOM_MAX_TS_LAYERS]; + // Number of encoded non-key frames per layer. + int layer_enc_frames[AOM_MAX_TS_LAYERS]; + // Framerate per layer layer (cumulative). + double layer_framerate[AOM_MAX_TS_LAYERS]; + // Target average frame size per layer (per-frame-bandwidth per layer). + double layer_pfb[AOM_MAX_LAYERS]; + // Actual average frame size per layer. + double layer_avg_frame_size[AOM_MAX_LAYERS]; + // Average rate mismatch per layer (|target - actual| / target). + double layer_avg_rate_mismatch[AOM_MAX_LAYERS]; + // Actual encoding bitrate per layer (cumulative across temporal layers). + double layer_encoding_bitrate[AOM_MAX_LAYERS]; + // Average of the short-time encoder actual bitrate. + // TODO(marpan): Should we add these short-time stats for each layer? + double avg_st_encoding_bitrate; + // Variance of the short-time encoder actual bitrate. + double variance_st_encoding_bitrate; + // Window (number of frames) for computing short-timee encoding bitrate. + int window_size; + // Number of window measurements. + int window_count; + int layer_target_bitrate[AOM_MAX_LAYERS]; +}; + +static const int REF_FRAMES = 8; + +static const int INTER_REFS_PER_FRAME = 7; + +// Reference frames used in this example encoder. 
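// A note on the constants and the enum that follows: SVC_LAST_FRAME ..
// SVC_ALTREF_FRAME index the INTER_REFS_PER_FRAME (7) entries of
// aom_svc_ref_frame_config_t's reference[] and ref_idx[] arrays (AV1's LAST,
// LAST2, LAST3, GOLDEN, BWDREF, ALTREF2 and ALTREF references), while
// refresh[] is indexed by the REF_FRAMES (8) physical buffer slots that
// ref_idx[] maps each reference onto.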
+enum { + SVC_LAST_FRAME = 0, + SVC_LAST2_FRAME, + SVC_LAST3_FRAME, + SVC_GOLDEN_FRAME, + SVC_BWDREF_FRAME, + SVC_ALTREF2_FRAME, + SVC_ALTREF_FRAME +}; + +static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) { + FILE *f = input_ctx->file; + y4m_input *y4m = &input_ctx->y4m; + int shortread = 0; + + if (input_ctx->file_type == FILE_TYPE_Y4M) { + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; + } else { + shortread = read_yuv_frame(input_ctx, img); + } + + return !shortread; +} + +static void close_input_file(struct AvxInputContext *input) { + fclose(input->file); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); +} + +// Note: these rate control metrics assume only 1 key frame in the +// sequence (i.e., first frame only). So for temporal pattern# 7 +// (which has key frame for every frame on base layer), the metrics +// computation will be off/wrong. +// TODO(marpan): Update these metrics to account for multiple key frames +// in the stream. +static void set_rate_control_metrics(struct RateControlMetrics *rc, + double framerate, int ss_number_layers, + int ts_number_layers) { + int ts_rate_decimator[AOM_MAX_TS_LAYERS] = { 1 }; + ts_rate_decimator[0] = 1; + if (ts_number_layers == 2) { + ts_rate_decimator[0] = 2; + ts_rate_decimator[1] = 1; + } + if (ts_number_layers == 3) { + ts_rate_decimator[0] = 4; + ts_rate_decimator[1] = 2; + ts_rate_decimator[2] = 1; + } + // Set the layer (cumulative) framerate and the target layer (non-cumulative) + // per-frame-bandwidth, for the rate control encoding stats below. + for (int sl = 0; sl < ss_number_layers; ++sl) { + int i = sl * ts_number_layers; + rc->layer_framerate[0] = framerate / ts_rate_decimator[0]; + rc->layer_pfb[i] = + 1000.0 * rc->layer_target_bitrate[i] / rc->layer_framerate[0]; + for (int tl = 0; tl < ts_number_layers; ++tl) { + i = sl * ts_number_layers + tl; + if (tl > 0) { + rc->layer_framerate[tl] = framerate / ts_rate_decimator[tl]; + rc->layer_pfb[i] = + 1000.0 * + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / + (rc->layer_framerate[tl] - rc->layer_framerate[tl - 1]); + } + rc->layer_input_frames[tl] = 0; + rc->layer_enc_frames[tl] = 0; + rc->layer_encoding_bitrate[i] = 0.0; + rc->layer_avg_frame_size[i] = 0.0; + rc->layer_avg_rate_mismatch[i] = 0.0; + } + } + rc->window_count = 0; + rc->window_size = 15; + rc->avg_st_encoding_bitrate = 0.0; + rc->variance_st_encoding_bitrate = 0.0; +} + +static void printout_rate_control_summary(struct RateControlMetrics *rc, + int frame_cnt, int ss_number_layers, + int ts_number_layers) { + int tot_num_frames = 0; + double perc_fluctuation = 0.0; + printf("Total number of processed frames: %d\n\n", frame_cnt - 1); + printf("Rate control layer stats for %d layer(s):\n\n", ts_number_layers); + for (int sl = 0; sl < ss_number_layers; ++sl) { + tot_num_frames = 0; + for (int tl = 0; tl < ts_number_layers; ++tl) { + int i = sl * ts_number_layers + tl; + const int num_dropped = + tl > 0 ? 
rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] + : rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] - 1; + tot_num_frames += rc->layer_input_frames[tl]; + rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[tl] * + rc->layer_encoding_bitrate[i] / + tot_num_frames; + rc->layer_avg_frame_size[i] = + rc->layer_avg_frame_size[i] / rc->layer_enc_frames[tl]; + rc->layer_avg_rate_mismatch[i] = + 100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[tl]; + printf("For layer#: %d %d \n", sl, tl); + printf("Bitrate (target vs actual): %d %f\n", rc->layer_target_bitrate[i], + rc->layer_encoding_bitrate[i]); + printf("Average frame size (target vs actual): %f %f\n", rc->layer_pfb[i], + rc->layer_avg_frame_size[i]); + printf("Average rate_mismatch: %f\n", rc->layer_avg_rate_mismatch[i]); + printf( + "Number of input frames, encoded (non-key) frames, " + "and perc dropped frames: %d %d %f\n", + rc->layer_input_frames[tl], rc->layer_enc_frames[tl], + 100.0 * num_dropped / rc->layer_input_frames[tl]); + printf("\n"); + } + } + rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count; + rc->variance_st_encoding_bitrate = + rc->variance_st_encoding_bitrate / rc->window_count - + (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate); + perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) / + rc->avg_st_encoding_bitrate; + printf("Short-time stats, for window of %d frames:\n", rc->window_size); + printf("Average, rms-variance, and percent-fluct: %f %f %f\n", + rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate), + perc_fluctuation); + if (frame_cnt - 1 != tot_num_frames) + die("Error: Number of input frames not equal to output!\n"); +} + +// Layer pattern configuration. +static void set_layer_pattern( + int layering_mode, int superframe_cnt, aom_svc_layer_id_t *layer_id, + aom_svc_ref_frame_config_t *ref_frame_config, + aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int *use_svc_control, + int spatial_layer_id, int is_key_frame, int ksvc_mode, int speed) { + // Setting this flag to 1 enables simplex example of + // RPS (Reference Picture Selection) for 1 layer. + int use_rps_example = 0; + int i; + int enable_longterm_temporal_ref = 1; + int shift = (layering_mode == 8) ? 2 : 0; + int simulcast_mode = (layering_mode == 11); + *use_svc_control = 1; + layer_id->spatial_layer_id = spatial_layer_id; + int lag_index = 0; + int base_count = superframe_cnt >> 2; + ref_frame_comp_pred->use_comp_pred[0] = 0; // GOLDEN_LAST + ref_frame_comp_pred->use_comp_pred[1] = 0; // LAST2_LAST + ref_frame_comp_pred->use_comp_pred[2] = 0; // ALTREF_LAST + // Set the reference map buffer idx for the 7 references: + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->reference[i] = 0; + for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0; + + if (ksvc_mode) { + // Same pattern as case 9, but the reference strucutre will be constrained + // below. + layering_mode = 9; + } + switch (layering_mode) { + case 0: + if (use_rps_example == 0) { + // 1-layer: update LAST on every frame, reference LAST. 
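// Spelled out: every frame sets refresh[0] = 1 and
// reference[SVC_LAST_FRAME] = 1 with ref_idx[] left at the identity mapping,
// so each frame writes buffer slot 0 and predicts only from the immediately
// previous frame (a plain single-reference IPPP pattern).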
+ layer_id->temporal_layer_id = 0; + layer_id->spatial_layer_id = 0; + ref_frame_config->refresh[0] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else { + // Pattern of 2 references (ALTREF and GOLDEN) trailing + // LAST by 4 and 8 frames, with some switching logic to + // sometimes only predict from the longer-term reference + //(golden here). This is simple example to test RPS + // (reference picture selection). + int last_idx = 0; + int last_idx_refresh = 0; + int gld_idx = 0; + int alt_ref_idx = 0; + int lag_alt = 4; + int lag_gld = 8; + layer_id->temporal_layer_id = 0; + layer_id->spatial_layer_id = 0; + int sh = 8; // slots 0 - 7. + // Moving index slot for last: 0 - (sh - 1) + if (superframe_cnt > 1) last_idx = (superframe_cnt - 1) % sh; + // Moving index for refresh of last: one ahead for next frame. + last_idx_refresh = superframe_cnt % sh; + // Moving index for gld_ref, lag behind current by lag_gld + if (superframe_cnt > lag_gld) gld_idx = (superframe_cnt - lag_gld) % sh; + // Moving index for alt_ref, lag behind LAST by lag_alt frames. + if (superframe_cnt > lag_alt) + alt_ref_idx = (superframe_cnt - lag_alt) % sh; + // Set the ref_idx. + // Default all references to slot for last. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = last_idx; + // Set the ref_idx for the relevant references. + ref_frame_config->ref_idx[SVC_LAST_FRAME] = last_idx; + ref_frame_config->ref_idx[SVC_LAST2_FRAME] = last_idx_refresh; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = gld_idx; + ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = alt_ref_idx; + // Refresh this slot, which will become LAST on next frame. + ref_frame_config->refresh[last_idx_refresh] = 1; + // Reference LAST, ALTREF, and GOLDEN + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; + ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; + // Switch to only GOLDEN every 300 frames. + if (superframe_cnt % 200 == 0 && superframe_cnt > 0) { + ref_frame_config->reference[SVC_LAST_FRAME] = 0; + ref_frame_config->reference[SVC_ALTREF_FRAME] = 0; + ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; + // Test if the long-term is LAST instead, this is just a renaming + // but its tests if encoder behaves the same, whether its + // LAST or GOLDEN. + if (superframe_cnt % 400 == 0 && superframe_cnt > 0) { + ref_frame_config->ref_idx[SVC_LAST_FRAME] = gld_idx; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + ref_frame_config->reference[SVC_ALTREF_FRAME] = 0; + ref_frame_config->reference[SVC_GOLDEN_FRAME] = 0; + } + } + } + break; + case 1: + // 2-temporal layer. + // 1 3 5 + // 0 2 4 + // Keep golden fixed at slot 3. + base_count = superframe_cnt >> 1; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; + // Cyclically refresh slots 5, 6, 7, for lag alt ref. + lag_index = 5; + if (base_count > 0) { + lag_index = 5 + (base_count % 3); + if (superframe_cnt % 2 != 0) lag_index = 5 + ((base_count + 1) % 3); + } + // Set the altref slot to lag_index. + ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index; + if (superframe_cnt % 2 == 0) { + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST. + ref_frame_config->refresh[0] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + // Refresh lag_index slot, needed for lagging golen. + ref_frame_config->refresh[lag_index] = 1; + // Refresh GOLDEN every x base layer frames. 
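// Worked out for this 2-temporal-layer case: the refresh below runs only on
// TL0 frames (superframe_cnt even) and base_count = superframe_cnt >> 1, so
// base_count % 32 == 0 holds when superframe_cnt % 64 == 0, i.e. GOLDEN
// (slot 3) is refreshed every 32 TL0 frames / every 64 input frames.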
+ if (base_count % 32 == 0) ref_frame_config->refresh[3] = 1; + } else { + layer_id->temporal_layer_id = 1; + // No updates on layer 1, reference LAST (TL0). + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } + // Always reference golden and altref on TL0. + if (layer_id->temporal_layer_id == 0) { + ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; + ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; + } + break; + case 2: + // 3-temporal layer: + // 1 3 5 7 + // 2 6 + // 0 4 8 + if (superframe_cnt % 4 == 0) { + // Base layer. + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST. + ref_frame_config->refresh[0] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if ((superframe_cnt - 1) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // First top layer: no updates, only reference LAST (TL0). + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if ((superframe_cnt - 2) % 4 == 0) { + layer_id->temporal_layer_id = 1; + // Middle layer (TL1): update LAST2, only reference LAST (TL0). + ref_frame_config->refresh[1] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if ((superframe_cnt - 3) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // Second top layer: no updates, only reference LAST. + // Set buffer idx for LAST to slot 1, since that was the slot + // updated in previous frame. So LAST is TL1 frame. + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } + break; + case 3: + // 3 TL, same as above, except allow for predicting + // off 2 more references (GOLDEN and ALTREF), with + // GOLDEN updated periodically, and ALTREF lagging from + // LAST from ~4 frames. Both GOLDEN and ALTREF + // can only be updated on base temporal layer. + + // Keep golden fixed at slot 3. + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; + // Cyclically refresh slots 5, 6, 7, for lag altref. + lag_index = 5; + if (base_count > 0) { + lag_index = 5 + (base_count % 3); + if (superframe_cnt % 4 != 0) lag_index = 5 + ((base_count + 1) % 3); + } + // Set the altref slot to lag_index. + ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index; + if (superframe_cnt % 4 == 0) { + // Base layer. + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST. + ref_frame_config->refresh[0] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + // Refresh GOLDEN every x ~10 base layer frames. + if (base_count % 10 == 0) ref_frame_config->refresh[3] = 1; + // Refresh lag_index slot, needed for lagging altref. + ref_frame_config->refresh[lag_index] = 1; + } else if ((superframe_cnt - 1) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // First top layer: no updates, only reference LAST (TL0). + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if ((superframe_cnt - 2) % 4 == 0) { + layer_id->temporal_layer_id = 1; + // Middle layer (TL1): update LAST2, only reference LAST (TL0). + ref_frame_config->refresh[1] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if ((superframe_cnt - 3) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // Second top layer: no updates, only reference LAST. + // Set buffer idx for LAST to slot 1, since that was the slot + // updated in previous frame. So LAST is TL1 frame. + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 0; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } + // Every frame can reference GOLDEN AND ALTREF. 
+ ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; + ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; + // Allow for compound prediction for LAST-ALTREF and LAST-GOLDEN. + if (speed >= 7) { + ref_frame_comp_pred->use_comp_pred[2] = 1; + ref_frame_comp_pred->use_comp_pred[0] = 1; + } + break; + case 4: + // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will + // only reference GF (not LAST). Other frames only reference LAST. + // 1 3 5 7 + // 2 6 + // 0 4 8 + if (superframe_cnt % 4 == 0) { + // Base layer. + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, only reference LAST. + ref_frame_config->refresh[0] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if ((superframe_cnt - 1) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // First top layer: no updates, only reference LAST (TL0). + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if ((superframe_cnt - 2) % 4 == 0) { + layer_id->temporal_layer_id = 1; + // Middle layer (TL1): update GF, only reference LAST (TL0). + ref_frame_config->refresh[3] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if ((superframe_cnt - 3) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // Second top layer: no updates, only reference GF. + ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; + } + break; + case 5: + // 2 spatial layers, 1 temporal. + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. + ref_frame_config->refresh[0] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 + // and GOLDEN to slot 0. Update slot 1 (LAST). + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 0; + ref_frame_config->refresh[1] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; + } + break; + case 6: + // 3 spatial layers, 1 temporal. + // Note for this case, we set the buffer idx for all references to be + // either LAST or GOLDEN, which are always valid references, since decoder + // will check if any of the 7 references is valid scale in + // valid_ref_frame_size(). + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. Set all buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 + // and GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + ref_frame_config->refresh[1] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2 + // and GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; + ref_frame_config->refresh[2] = 1; + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; + // For 3 spatial layer case: allow for top spatial layer to use + // additional temporal reference. Update every 10 frames. + if (enable_longterm_temporal_ref) { + ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1; + ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; + if (base_count % 10 == 0) + ref_frame_config->refresh[REF_FRAMES - 1] = 1; + } + } + break; + case 7: + // 2 spatial and 3 temporal layer. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + if (superframe_cnt % 4 == 0) { + // Base temporal layer + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST + // Set all buffer_idx to 0 + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + ref_frame_config->refresh[1] = 1; + } + } else if ((superframe_cnt - 1) % 4 == 0) { + // First top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 3; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + } + } else if ((superframe_cnt - 2) % 4 == 0) { + // Middle temporal enhancement layer. + layer_id->temporal_layer_id = 1; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5 - shift; + ref_frame_config->refresh[5 - shift] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST3 to slot 6 and update slot 6. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 5 - shift; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 6 - shift; + ref_frame_config->refresh[6 - shift] = 1; + } + } else if ((superframe_cnt - 3) % 4 == 0) { + // Second top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5 - shift; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. No update. 
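// For layering mode 7 (shift == 0) the slots named above were written
// earlier in the pattern: slot 6 by SL1's TL1 frame of the previous
// superframe and slot 3 by SL0's frame of this same superframe, so this TL2
// frame predicts temporally within SL1 (LAST) and inter-layer from SL0
// (GOLDEN).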
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; + } + } + break; + case 8: + // 3 spatial and 3 temporal layer. + // Same as case 9 but overalap in the buffer slot updates. + // (shift = 2). The slots 3 and 4 updated by first TL2 are + // reused for update in TL1 superframe. + // Note for this case, frame order hint must be disabled for + // lower resolutios (operating points > 0) to be decoedable. + case 9: + // 3 spatial and 3 temporal layer. + // No overlap in buffer updates between TL2 and TL1. + // TL2 updates slot 3 and 4, TL1 updates 5, 6, 7. + // Set the references via the svc_ref_frame_config control. + // Always reference LAST. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + if (superframe_cnt % 4 == 0) { + // Base temporal layer. + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. + // Set all buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; + ref_frame_config->refresh[2] = 1; + } + } else if ((superframe_cnt - 1) % 4 == 0) { + // First top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST (slot 0). + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to slot 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // Set LAST2 to slot 4 and Update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 3; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 4. + // No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 4; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; + } + } else if ((superframe_cnt - 2) % 4 == 0) { + // Middle temporal enhancement layer. + layer_id->temporal_layer_id = 1; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5 - shift; + ref_frame_config->refresh[5 - shift] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST3 to slot 6 and update slot 6. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 5 - shift; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 6 - shift; + ref_frame_config->refresh[6 - shift] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 6. + // Set LAST3 to slot 7 and update slot 7. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 6 - shift; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; + ref_frame_config->ref_idx[SVC_LAST3_FRAME] = 7 - shift; + ref_frame_config->refresh[7 - shift] = 1; + } + } else if ((superframe_cnt - 3) % 4 == 0) { + // Second top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5 - shift; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; + ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, + // GOLDEN to slot 4. No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 7 - shift; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 4; + } + } + break; + case 11: + // Simulcast mode for 3 spatial and 3 temporal layers. + // No inter-layer predicton, only prediction is temporal and single + // reference (LAST). + // No overlap in buffer slots between spatial layers. So for example, + // SL0 only uses slots 0 and 1. + // SL1 only uses slots 2 and 3. + // SL2 only uses slots 4 and 5. + // All 7 references for each inter-frame must only access buffer slots + // for that spatial layer. + // On key (super)frames: SL1 and SL2 must have no references set + // and must refresh all the slots for that layer only (so 2 and 3 + // for SL1, 4 and 5 for SL2). The base SL0 will be labelled internally + // as a Key frame (refresh all slots). SL1/SL2 will be labelled + // internally as Intra-only frames that allow that stream to be decoded. + // These conditions will allow for each spatial stream to be + // independently decodeable. + + // Initialize all references to 0 (don't use reference). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->reference[i] = 0; + // Initialize as no refresh/update for all slots. 
+ for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + + if (is_key_frame) { + if (layer_id->spatial_layer_id == 0) { + // Assign LAST/GOLDEN to slot 0/1. + // Refesh slots 0 and 1 for SL0. + // SL0: this will get set to KEY frame internally. + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 1; + ref_frame_config->refresh[0] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Assign LAST/GOLDEN to slot 2/3. + // Refesh slots 2 and 3 for SL1. + // This will get set to Intra-only frame internally. + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; + ref_frame_config->refresh[2] = 1; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Assign LAST/GOLDEN to slot 4/5. + // Refresh slots 4 and 5 for SL2. + // This will get set to Intra-only frame internally. + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4; + ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 5; + ref_frame_config->refresh[4] = 1; + ref_frame_config->refresh[5] = 1; + } + } else if (superframe_cnt % 4 == 0) { + // Base temporal layer: TL0 + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { // SL0 + // Reference LAST. Assign all references to either slot + // 0 or 1. Here we assign LAST to slot 0, all others to 1. + // Update slot 0 (LAST). + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { // SL1 + // Reference LAST. Assign all references to either slot + // 2 or 3. Here we assign LAST to slot 2, all others to 3. + // Update slot 2 (LAST). + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 3; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; + ref_frame_config->refresh[2] = 1; + } else if (layer_id->spatial_layer_id == 2) { // SL2 + // Reference LAST. Assign all references to either slot + // 4 or 5. Here we assign LAST to slot 4, all others to 5. + // Update slot 4 (LAST). + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 5; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4; + ref_frame_config->refresh[4] = 1; + } + } else if ((superframe_cnt - 1) % 4 == 0) { + // First top temporal enhancement layer: TL2 + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { // SL0 + // Reference LAST (slot 0). Assign other references to slot 1. + // No update/refresh on any slots. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0; + } else if (layer_id->spatial_layer_id == 1) { // SL1 + // Reference LAST (slot 2). Assign other references to slot 3. + // No update/refresh on any slots. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 3; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; + } else if (layer_id->spatial_layer_id == 2) { // SL2 + // Reference LAST (slot 4). Assign other references to slot 4. + // No update/refresh on any slots. 
+ ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 5; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4; + } + } else if ((superframe_cnt - 2) % 4 == 0) { + // Middle temporal enhancement layer: TL1 + layer_id->temporal_layer_id = 1; + if (layer_id->spatial_layer_id == 0) { // SL0 + // Reference LAST (slot 0). + // Set GOLDEN to slot 1 and update slot 1. + // This will be used as reference for next TL2. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 0; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 1) { // SL1 + // Reference LAST (slot 2). + // Set GOLDEN to slot 3 and update slot 3. + // This will be used as reference for next TL2. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 3; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 2; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 2) { // SL2 + // Reference LAST (slot 4). + // Set GOLDEN to slot 5 and update slot 5. + // This will be used as reference for next TL2. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 5; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 4; + ref_frame_config->refresh[5] = 1; + } + } else if ((superframe_cnt - 3) % 4 == 0) { + // Second top temporal enhancement layer: TL2 + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { // SL0 + // Reference LAST (slot 1). Assign other references to slot 0. + // No update/refresh on any slots. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; + } else if (layer_id->spatial_layer_id == 1) { // SL1 + // Reference LAST (slot 3). Assign other references to slot 2. + // No update/refresh on any slots. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 2; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 3; + } else if (layer_id->spatial_layer_id == 2) { // SL2 + // Reference LAST (slot 5). Assign other references to slot 4. + // No update/refresh on any slots. + ref_frame_config->reference[SVC_LAST_FRAME] = 1; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 4; + ref_frame_config->ref_idx[SVC_LAST_FRAME] = 5; + } + } + if (!simulcast_mode && layer_id->spatial_layer_id > 0) { + // Always reference GOLDEN (inter-layer prediction). + ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1; + if (ksvc_mode) { + // KSVC: only keep the inter-layer reference (GOLDEN) for + // superframes whose base is key. + if (!is_key_frame) ref_frame_config->reference[SVC_GOLDEN_FRAME] = 0; + } + if (is_key_frame && layer_id->spatial_layer_id > 1) { + // On superframes whose base is key: remove LAST to avoid prediction + // off layer two levels below. + ref_frame_config->reference[SVC_LAST_FRAME] = 0; + } + } + // For 3 spatial layer case 8 (where there is free buffer slot): + // allow for top spatial layer to use additional temporal reference. + // Additional reference is only updated on base temporal layer, every + // 10 TL0 frames here. 
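// Concretely: with shift == 2 the TL1/TL2 refreshes land in slots 3-5 rather
// than 3-7, so slot REF_FRAMES - 1 (slot 7) is never touched by the regular
// pattern and is free to hold this long-term reference, which the condition
// below refreshes only every 10 TL0 frames.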
+ if (!simulcast_mode && enable_longterm_temporal_ref && + layer_id->spatial_layer_id == 2 && layering_mode == 8) { + ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = REF_FRAMES - 1; + if (!is_key_frame) ref_frame_config->reference[SVC_ALTREF_FRAME] = 1; + if (base_count % 10 == 0 && layer_id->temporal_layer_id == 0) + ref_frame_config->refresh[REF_FRAMES - 1] = 1; + } + break; + default: assert(0); die("Error: Unsupported temporal layering mode!\n"); + } +} + +#if CONFIG_AV1_DECODER +// Returns whether there is a mismatch between the encoder's new frame and the +// decoder's new frame. +static int test_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder, + const int frames_out) { + aom_image_t enc_img, dec_img; + int mismatch = 0; + + /* Get the internal new frame */ + AOM_CODEC_CONTROL_TYPECHECKED(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img); + AOM_CODEC_CONTROL_TYPECHECKED(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img); + +#if CONFIG_AV1_HIGHBITDEPTH + if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t enc_hbd_img; + aom_img_alloc( + &enc_hbd_img, + static_cast(enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH), + enc_img.d_w, enc_img.d_h, 16); + aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); + enc_img = enc_hbd_img; + } + if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t dec_hbd_img; + aom_img_alloc( + &dec_hbd_img, + static_cast(dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH), + dec_img.d_w, dec_img.d_h, 16); + aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); + dec_img = dec_hbd_img; + } + } +#endif + + if (!aom_compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; +#if CONFIG_AV1_HIGHBITDEPTH + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + aom_find_mismatch(&enc_img, &dec_img, y, u, v); + } +#else + aom_find_mismatch(&enc_img, &dec_img, y, u, v); +#endif + fprintf(stderr, + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}\n", + frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], + v[1], v[2], v[3]); + mismatch = 1; + } + + aom_img_free(&enc_img); + aom_img_free(&dec_img); + return mismatch; +} +#endif // CONFIG_AV1_DECODER + +struct psnr_stats { + // The second element of these arrays is reserved for high bitdepth. 
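// As consumed by show_psnr() below: index [0] is the stream accumulated in
// this example; the overall value is recomputed from psnr_sse_total /
// psnr_samples_total, and psnr_totals[0][0..3] hold the per-frame average,
// Y, U and V PSNR sums that are divided by psnr_count when printed.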
+ uint64_t psnr_sse_total[2]; + uint64_t psnr_samples_total[2]; + double psnr_totals[2][4]; + int psnr_count[2]; +}; + +static void show_psnr(struct psnr_stats *psnr_stream, double peak) { + double ovpsnr; + + if (!psnr_stream->psnr_count[0]) return; + + fprintf(stderr, "\nPSNR (Overall/Avg/Y/U/V)"); + ovpsnr = sse_to_psnr((double)psnr_stream->psnr_samples_total[0], peak, + (double)psnr_stream->psnr_sse_total[0]); + fprintf(stderr, " %.3f", ovpsnr); + + for (int i = 0; i < 4; i++) { + fprintf(stderr, " %.3f", + psnr_stream->psnr_totals[0][i] / psnr_stream->psnr_count[0]); + } + fprintf(stderr, "\n"); +} + +static aom::AV1RateControlRtcConfig create_rtc_rc_config( + const aom_codec_enc_cfg_t &cfg, const AppInput &app_input) { + aom::AV1RateControlRtcConfig rc_cfg; + rc_cfg.width = cfg.g_w; + rc_cfg.height = cfg.g_h; + rc_cfg.max_quantizer = cfg.rc_max_quantizer; + rc_cfg.min_quantizer = cfg.rc_min_quantizer; + rc_cfg.target_bandwidth = cfg.rc_target_bitrate; + rc_cfg.buf_initial_sz = cfg.rc_buf_initial_sz; + rc_cfg.buf_optimal_sz = cfg.rc_buf_optimal_sz; + rc_cfg.buf_sz = cfg.rc_buf_sz; + rc_cfg.overshoot_pct = cfg.rc_overshoot_pct; + rc_cfg.undershoot_pct = cfg.rc_undershoot_pct; + // This is hardcoded as AOME_SET_MAX_INTRA_BITRATE_PCT + rc_cfg.max_intra_bitrate_pct = 300; + rc_cfg.framerate = cfg.g_timebase.den; + // TODO(jianj): Add suppor for SVC. + rc_cfg.ss_number_layers = 1; + rc_cfg.ts_number_layers = 1; + rc_cfg.scaling_factor_num[0] = 1; + rc_cfg.scaling_factor_den[0] = 1; + rc_cfg.layer_target_bitrate[0] = static_cast(rc_cfg.target_bandwidth); + rc_cfg.max_quantizers[0] = rc_cfg.max_quantizer; + rc_cfg.min_quantizers[0] = rc_cfg.min_quantizer; + rc_cfg.aq_mode = app_input.aq_mode; + + return rc_cfg; +} + +static int qindex_to_quantizer(int qindex) { + // Table that converts 0-63 range Q values passed in outside to the 0-255 + // range Qindex used internally. + static const int quantizer_to_qindex[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, + 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, + 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, + 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, + 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255, + }; + for (int quantizer = 0; quantizer < 64; ++quantizer) + if (quantizer_to_qindex[quantizer] >= qindex) return quantizer; + + return 63; +} + +int main(int argc, const char **argv) { + AppInput app_input; + AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL }; + FILE *obu_files[AOM_MAX_LAYERS] = { NULL }; + AvxVideoWriter *total_layer_file = NULL; + FILE *total_layer_obu_file = NULL; + aom_codec_enc_cfg_t cfg; + int frame_cnt = 0; + aom_image_t raw; + int frame_avail; + int got_data = 0; + int flags = 0; + int i; + int pts = 0; // PTS starts at 0. + int frame_duration = 1; // 1 timebase tick per frame. + aom_svc_layer_id_t layer_id; + aom_svc_params_t svc_params; + aom_svc_ref_frame_config_t ref_frame_config; + aom_svc_ref_frame_comp_pred_t ref_frame_comp_pred; + +#if CONFIG_INTERNAL_STATS + FILE *stats_file = fopen("opsnr.stt", "a"); + if (stats_file == NULL) { + die("Cannot open opsnr.stt\n"); + } +#endif +#if CONFIG_AV1_DECODER + aom_codec_ctx_t decoder; +#endif + + struct RateControlMetrics rc; + int64_t cx_time = 0; + int64_t cx_time_layer[AOM_MAX_LAYERS]; // max number of layers. 
+ int frame_cnt_layer[AOM_MAX_LAYERS]; + double sum_bitrate = 0.0; + double sum_bitrate2 = 0.0; + double framerate = 30.0; + int use_svc_control = 1; + int set_err_resil_frame = 0; + int test_changing_bitrate = 0; + zero(rc.layer_target_bitrate); + memset(&layer_id, 0, sizeof(aom_svc_layer_id_t)); + memset(&app_input, 0, sizeof(AppInput)); + memset(&svc_params, 0, sizeof(svc_params)); + + // Flag to test dynamic scaling of source frames for single + // spatial stream, using the scaling_mode control. + const int test_dynamic_scaling_single_layer = 0; + + // Flag to test setting speed per layer. + const int test_speed_per_layer = 0; + + /* Setup default input stream settings */ + app_input.input_ctx.framerate.numerator = 30; + app_input.input_ctx.framerate.denominator = 1; + app_input.input_ctx.only_i420 = 0; + app_input.input_ctx.bit_depth = AOM_BITS_8; + app_input.speed = 7; + exec_name = argv[0]; + + // start with default encoder configuration + aom_codec_err_t res = aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, + AOM_USAGE_REALTIME); + if (res != AOM_CODEC_OK) { + die("Failed to get config: %s\n", aom_codec_err_to_string(res)); + } + + // Real time parameters. + cfg.g_usage = AOM_USAGE_REALTIME; + + cfg.rc_end_usage = AOM_CBR; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 52; + cfg.rc_undershoot_pct = 50; + cfg.rc_overshoot_pct = 50; + cfg.rc_buf_initial_sz = 600; + cfg.rc_buf_optimal_sz = 600; + cfg.rc_buf_sz = 1000; + cfg.rc_resize_mode = 0; // Set to RESIZE_DYNAMIC for dynamic resize. + cfg.g_lag_in_frames = 0; + cfg.kf_mode = AOM_KF_AUTO; + + parse_command_line(argc, argv, &app_input, &svc_params, &cfg); + + int ts_number_layers = svc_params.number_temporal_layers; + int ss_number_layers = svc_params.number_spatial_layers; + + unsigned int width = cfg.g_w; + unsigned int height = cfg.g_h; + + if (app_input.layering_mode >= 0) { + if (ts_number_layers != + mode_to_num_temporal_layers[app_input.layering_mode] || + ss_number_layers != + mode_to_num_spatial_layers[app_input.layering_mode]) { + die("Number of layers doesn't match layering mode."); + } + } + + // Y4M reader has its own allocation. + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, width, height, 32)) { + die("Failed to allocate image (%dx%d)", width, height); + } + } + + aom_codec_iface_t *encoder = aom_codec_av1_cx(); + + memcpy(&rc.layer_target_bitrate[0], &svc_params.layer_target_bitrate[0], + sizeof(svc_params.layer_target_bitrate)); + + unsigned int total_rate = 0; + for (i = 0; i < ss_number_layers; i++) { + total_rate += + svc_params + .layer_target_bitrate[i * ts_number_layers + ts_number_layers - 1]; + } + if (total_rate != cfg.rc_target_bitrate) { + die("Incorrect total target bitrate"); + } + + svc_params.framerate_factor[0] = 1; + if (ts_number_layers == 2) { + svc_params.framerate_factor[0] = 2; + svc_params.framerate_factor[1] = 1; + } else if (ts_number_layers == 3) { + svc_params.framerate_factor[0] = 4; + svc_params.framerate_factor[1] = 2; + svc_params.framerate_factor[2] = 1; + } + + if (app_input.input_ctx.file_type == FILE_TYPE_Y4M) { + // Override these settings with the info from Y4M file. + cfg.g_w = app_input.input_ctx.width; + cfg.g_h = app_input.input_ctx.height; + // g_timebase is the reciprocal of frame rate. 
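// Worked example of the conversion done just below: a Y4M header of F30:1
// yields framerate 30/1, so g_timebase becomes num = 1, den = 30 and each
// encoded frame advances pts by one timebase tick (frame_duration = 1).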
+ cfg.g_timebase.num = app_input.input_ctx.framerate.denominator; + cfg.g_timebase.den = app_input.input_ctx.framerate.numerator; + } + framerate = cfg.g_timebase.den / cfg.g_timebase.num; + set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers); + + AvxVideoInfo info; + info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); + info.frame_width = cfg.g_w; + info.frame_height = cfg.g_h; + info.time_base.numerator = cfg.g_timebase.num; + info.time_base.denominator = cfg.g_timebase.den; + // Open an output file for each stream. + for (int sl = 0; sl < ss_number_layers; ++sl) { + for (int tl = 0; tl < ts_number_layers; ++tl) { + i = sl * ts_number_layers + tl; + char file_name[PATH_MAX]; + snprintf(file_name, sizeof(file_name), "%s_%d.av1", + app_input.output_filename, i); + if (app_input.output_obu) { + obu_files[i] = fopen(file_name, "wb"); + if (!obu_files[i]) die("Failed to open %s for writing", file_name); + } else { + outfile[i] = aom_video_writer_open(file_name, kContainerIVF, &info); + if (!outfile[i]) die("Failed to open %s for writing", file_name); + } + } + } + if (app_input.output_obu) { + total_layer_obu_file = fopen(app_input.output_filename, "wb"); + if (!total_layer_obu_file) + die("Failed to open %s for writing", app_input.output_filename); + } else { + total_layer_file = + aom_video_writer_open(app_input.output_filename, kContainerIVF, &info); + if (!total_layer_file) + die("Failed to open %s for writing", app_input.output_filename); + } + + // Initialize codec. + aom_codec_ctx_t codec; + aom_codec_flags_t flag = 0; + flag |= cfg.g_input_bit_depth == AOM_BITS_8 ? 0 : AOM_CODEC_USE_HIGHBITDEPTH; + flag |= app_input.show_psnr ? AOM_CODEC_USE_PSNR : 0; + if (aom_codec_enc_init(&codec, encoder, &cfg, flag)) + die_codec(&codec, "Failed to initialize encoder"); + +#if CONFIG_AV1_DECODER + if (app_input.decode) { + if (aom_codec_dec_init(&decoder, get_aom_decoder_by_index(0), NULL, 0)) + die_codec(&decoder, "Failed to initialize decoder"); + } +#endif + + aom_codec_control(&codec, AOME_SET_CPUUSED, app_input.speed); + aom_codec_control(&codec, AV1E_SET_AQ_MODE, app_input.aq_mode ? 3 : 0); + aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0); + aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1); + aom_codec_control(&codec, AV1E_SET_LOOPFILTER_CONTROL, 1); + aom_codec_control(&codec, AV1E_SET_ENABLE_WARPED_MOTION, 0); + aom_codec_control(&codec, AV1E_SET_ENABLE_OBMC, 0); + aom_codec_control(&codec, AV1E_SET_ENABLE_GLOBAL_MOTION, 0); + aom_codec_control(&codec, AV1E_SET_ENABLE_ORDER_HINT, 0); + aom_codec_control(&codec, AV1E_SET_ENABLE_TPL_MODEL, 0); + aom_codec_control(&codec, AV1E_SET_DELTAQ_MODE, 0); + aom_codec_control(&codec, AV1E_SET_COEFF_COST_UPD_FREQ, 3); + aom_codec_control(&codec, AV1E_SET_MODE_COST_UPD_FREQ, 3); + aom_codec_control(&codec, AV1E_SET_MV_COST_UPD_FREQ, 3); + aom_codec_control(&codec, AV1E_SET_DV_COST_UPD_FREQ, 3); + aom_codec_control(&codec, AV1E_SET_CDF_UPDATE_MODE, 1); + + // Settings to reduce key frame encoding time. 
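+  // The controls below disable several intra prediction tools (CFL, smooth
+  // intra, angle delta, filter intra) and restrict intra blocks to the
+  // default transform type, trading some key frame quality for speed.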
+  aom_codec_control(&codec, AV1E_SET_ENABLE_CFL_INTRA, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_SMOOTH_INTRA, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_ANGLE_DELTA, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_FILTER_INTRA, 0);
+  aom_codec_control(&codec, AV1E_SET_INTRA_DEFAULT_TX_ONLY, 1);
+
+  if (cfg.g_threads > 1) {
+    aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS,
+                      (unsigned int)log2(cfg.g_threads));
+  }
+
+  aom_codec_control(&codec, AV1E_SET_TUNE_CONTENT, app_input.tune_content);
+  if (app_input.tune_content == AOM_CONTENT_SCREEN) {
+    aom_codec_control(&codec, AV1E_SET_ENABLE_PALETTE, 1);
+    aom_codec_control(&codec, AV1E_SET_ENABLE_CFL_INTRA, 1);
+    // INTRABC is currently disabled for rt mode, as it's too slow.
+    aom_codec_control(&codec, AV1E_SET_ENABLE_INTRABC, 0);
+  }
+
+  if (app_input.use_external_rc) {
+    aom_codec_control(&codec, AV1E_SET_RTC_EXTERNAL_RC, 1);
+  }
+
+  aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, INT_MAX);
+
+  aom_codec_control(&codec, AV1E_SET_SVC_FRAME_DROP_MODE,
+                    AOM_FULL_SUPERFRAME_DROP);
+
+  svc_params.number_spatial_layers = ss_number_layers;
+  svc_params.number_temporal_layers = ts_number_layers;
+  for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
+    svc_params.max_quantizers[i] = cfg.rc_max_quantizer;
+    svc_params.min_quantizers[i] = cfg.rc_min_quantizer;
+  }
+  for (i = 0; i < ss_number_layers; ++i) {
+    svc_params.scaling_factor_num[i] = 1;
+    svc_params.scaling_factor_den[i] = 1;
+  }
+  if (ss_number_layers == 2) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 2;
+  } else if (ss_number_layers == 3) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 4;
+    svc_params.scaling_factor_num[1] = 1;
+    svc_params.scaling_factor_den[1] = 2;
+  }
+  aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc_params);
+  // TODO(aomedia:3032): Configure KSVC in fixed mode.
+
+  // This controls the maximum target size of the key frame.
+  // For generating smaller key frames, use a smaller max_intra_size_pct
+  // value, like 100 or 200.
+  {
+    const int max_intra_size_pct = 300;
+    aom_codec_control(&codec, AOME_SET_MAX_INTRA_BITRATE_PCT,
+                      max_intra_size_pct);
+  }
+
+  for (int lx = 0; lx < ts_number_layers * ss_number_layers; lx++) {
+    cx_time_layer[lx] = 0;
+    frame_cnt_layer[lx] = 0;
+  }
+
+  std::unique_ptr<aom::AV1RateControlRTC> rc_api;
+  if (app_input.use_external_rc) {
+    const aom::AV1RateControlRtcConfig rc_cfg =
+        create_rtc_rc_config(cfg, app_input);
+    rc_api = aom::AV1RateControlRTC::Create(rc_cfg);
+  }
+
+  frame_avail = 1;
+  struct psnr_stats psnr_stream;
+  memset(&psnr_stream, 0, sizeof(psnr_stream));
+  while (frame_avail || got_data) {
+    struct aom_usec_timer timer;
+    frame_avail = read_frame(&(app_input.input_ctx), &raw);
+    // Loop over spatial layers.
+    for (int slx = 0; slx < ss_number_layers; slx++) {
+      aom_codec_iter_t iter = NULL;
+      const aom_codec_cx_pkt_t *pkt;
+      int layer = 0;
+      // Flag for superframe whose base is key.
+      int is_key_frame = (frame_cnt % cfg.kf_max_dist) == 0;
+      // For flexible mode:
+      if (app_input.layering_mode >= 0) {
+        // Set the reference/update flags, layer_id, and reference_map
+        // buffer index.
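+        // set_layer_pattern() implements the prediction structure for the
+        // chosen layering mode; the resulting ref_frame_config is then sent
+        // to the encoder via AV1E_SET_SVC_REF_FRAME_CONFIG below.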
+        set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id,
+                          &ref_frame_config, &ref_frame_comp_pred,
+                          &use_svc_control, slx, is_key_frame,
+                          (app_input.layering_mode == 10), app_input.speed);
+        aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
+        if (use_svc_control) {
+          aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
+                            &ref_frame_config);
+          aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+                            &ref_frame_comp_pred);
+        }
+        // Set the speed per layer.
+        if (test_speed_per_layer) {
+          int speed_per_layer = 10;
+          if (layer_id.spatial_layer_id == 0) {
+            if (layer_id.temporal_layer_id == 0) speed_per_layer = 6;
+            if (layer_id.temporal_layer_id == 1) speed_per_layer = 7;
+            if (layer_id.temporal_layer_id == 2) speed_per_layer = 8;
+          } else if (layer_id.spatial_layer_id == 1) {
+            if (layer_id.temporal_layer_id == 0) speed_per_layer = 7;
+            if (layer_id.temporal_layer_id == 1) speed_per_layer = 8;
+            if (layer_id.temporal_layer_id == 2) speed_per_layer = 9;
+          } else if (layer_id.spatial_layer_id == 2) {
+            if (layer_id.temporal_layer_id == 0) speed_per_layer = 8;
+            if (layer_id.temporal_layer_id == 1) speed_per_layer = 9;
+            if (layer_id.temporal_layer_id == 2) speed_per_layer = 10;
+          }
+          aom_codec_control(&codec, AOME_SET_CPUUSED, speed_per_layer);
+        }
+      } else {
+        // Only up to 3 temporal layers supported in fixed mode.
+        // Only need to set spatial and temporal layer_id: reference
+        // prediction, refresh, and buffer_idx are set internally.
+        layer_id.spatial_layer_id = slx;
+        layer_id.temporal_layer_id = 0;
+        if (ts_number_layers == 2) {
+          layer_id.temporal_layer_id = (frame_cnt % 2) != 0;
+        } else if (ts_number_layers == 3) {
+          if (frame_cnt % 2 != 0)
+            layer_id.temporal_layer_id = 2;
+          else if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0))
+            layer_id.temporal_layer_id = 1;
+        }
+        aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
+      }
+
+      if (set_err_resil_frame && cfg.g_error_resilient == 0) {
+        // Set error_resilient per frame: off/0 for base layer and
+        // on/1 for enhancement layer frames.
+        // Note that this can only be done on the fly/per-frame/layer
+        // if the config error_resilience is off/0. See the logic for updating
+        // in set_encoder_config():
+        // tool_cfg->error_resilient_mode =
+        //     cfg->g_error_resilient | extra_cfg->error_resilient_mode;
+        const int err_resil_mode =
+            layer_id.spatial_layer_id > 0 || layer_id.temporal_layer_id > 0;
+        aom_codec_control(&codec, AV1E_SET_ERROR_RESILIENT_MODE,
+                          err_resil_mode);
+      }
+
+      layer = slx * ts_number_layers + layer_id.temporal_layer_id;
+      if (frame_avail && slx == 0) ++rc.layer_input_frames[layer];
+
+      if (test_dynamic_scaling_single_layer) {
+        // Example to scale source down by 2x2, then 4x4, and then back up to
+        // 2x2, and then back to original.
+        int frame_2x2 = 200;
+        int frame_4x4 = 400;
+        int frame_2x2up = 600;
+        int frame_orig = 800;
+        if (frame_cnt >= frame_2x2 && frame_cnt < frame_4x4) {
+          // Scale source down by 2x2.
+          struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        } else if (frame_cnt >= frame_4x4 && frame_cnt < frame_2x2up) {
+          // Scale source down by 4x4.
+          struct aom_scaling_mode mode = { AOME_ONEFOUR, AOME_ONEFOUR };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        } else if (frame_cnt >= frame_2x2up && frame_cnt < frame_orig) {
+          // Source back up to 2x2.
+          struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        } else if (frame_cnt >= frame_orig) {
+          // Source back up to original resolution (no scaling).
+          struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL };
+          aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode);
+        }
+        if (frame_cnt == frame_2x2 || frame_cnt == frame_4x4 ||
+            frame_cnt == frame_2x2up || frame_cnt == frame_orig) {
+          // For dynamic resize testing on a single layer, refresh all
+          // references on the resized frame. This avoids a decode error:
+          // if the resize goes down by >= 4x4, the libaom decoder will throw
+          // an error that some reference (even though not used) is beyond
+          // the limit size (must be smaller than 4x4).
+          for (i = 0; i < REF_FRAMES; i++) ref_frame_config.refresh[i] = 1;
+          if (use_svc_control) {
+            aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
+                              &ref_frame_config);
+            aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+                              &ref_frame_comp_pred);
+          }
+        }
+      }
+
+      // Change target_bitrate every other frame.
+      if (test_changing_bitrate && frame_cnt % 2 == 0) {
+        if (frame_cnt < 500)
+          cfg.rc_target_bitrate += 10;
+        else
+          cfg.rc_target_bitrate -= 10;
+        // Do big increase and decrease.
+        if (frame_cnt == 100) cfg.rc_target_bitrate <<= 1;
+        if (frame_cnt == 600) cfg.rc_target_bitrate >>= 1;
+        if (cfg.rc_target_bitrate < 100) cfg.rc_target_bitrate = 100;
+        // Call change_config, or bypass with new control.
+        // res = aom_codec_enc_config_set(&codec, &cfg);
+        if (aom_codec_control(&codec, AV1E_SET_BITRATE_ONE_PASS_CBR,
+                              cfg.rc_target_bitrate))
+          die_codec(&codec, "Failed to SET_BITRATE_ONE_PASS_CBR");
+      }
+
+      if (rc_api) {
+        aom::AV1FrameParamsRTC frame_params;
+        // TODO(jianj): Add support for SVC.
+        frame_params.spatial_layer_id = 0;
+        frame_params.temporal_layer_id = 0;
+        frame_params.frame_type =
+            is_key_frame ? aom::kKeyFrame : aom::kInterFrame;
+        rc_api->ComputeQP(frame_params);
+        const int current_qp = rc_api->GetQP();
+        if (aom_codec_control(&codec, AV1E_SET_QUANTIZER_ONE_PASS,
+                              qindex_to_quantizer(current_qp))) {
+          die_codec(&codec, "Failed to SET_QUANTIZER_ONE_PASS");
+        }
+      }
+
+      // Do the layer encode.
+      aom_usec_timer_start(&timer);
+      if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags))
+        die_codec(&codec, "Failed to encode frame");
+      aom_usec_timer_mark(&timer);
+      cx_time += aom_usec_timer_elapsed(&timer);
+      cx_time_layer[layer] += aom_usec_timer_elapsed(&timer);
+      frame_cnt_layer[layer] += 1;
+
+      got_data = 0;
+      // For simulcast (mode 11): write out each spatial layer to the file.
+      int ss_layers_write = (app_input.layering_mode == 11)
+                                ? layer_id.spatial_layer_id + 1
+                                : ss_number_layers;
+      while ((pkt = aom_codec_get_cx_data(&codec, &iter))) {
+        switch (pkt->kind) {
+          case AOM_CODEC_CX_FRAME_PKT:
+            for (int sl = layer_id.spatial_layer_id; sl < ss_layers_write;
+                 ++sl) {
+              for (int tl = layer_id.temporal_layer_id; tl < ts_number_layers;
+                   ++tl) {
+                int j = sl * ts_number_layers + tl;
+                if (app_input.output_obu) {
+                  fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                         obu_files[j]);
+                } else {
+                  aom_video_writer_write_frame(
+                      outfile[j],
+                      reinterpret_cast<const uint8_t *>(pkt->data.frame.buf),
+                      pkt->data.frame.sz, pts);
+                }
+                if (sl == layer_id.spatial_layer_id)
+                  rc.layer_encoding_bitrate[j] += 8.0 * pkt->data.frame.sz;
+              }
+            }
+            got_data = 1;
+            // Write everything into the top layer.
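+            // (The "top layer" output is the single combined file that
+            // carries every spatial/temporal layer of the superframe.)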
+            if (app_input.output_obu) {
+              fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                     total_layer_obu_file);
+            } else {
+              aom_video_writer_write_frame(
+                  total_layer_file,
+                  reinterpret_cast<const uint8_t *>(pkt->data.frame.buf),
+                  pkt->data.frame.sz, pts);
+            }
+            // Keep count of rate control stats per layer (for non-key).
+            if (!(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) {
+              int j = layer_id.spatial_layer_id * ts_number_layers +
+                      layer_id.temporal_layer_id;
+              assert(j >= 0);
+              rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz;
+              rc.layer_avg_rate_mismatch[j] +=
+                  fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) /
+                  rc.layer_pfb[j];
+              if (slx == 0) ++rc.layer_enc_frames[layer_id.temporal_layer_id];
+            }
+
+            if (rc_api) {
+              rc_api->PostEncodeUpdate(pkt->data.frame.sz);
+            }
+            // Update for short-time encoding bitrate states, for moving window
+            // of size rc->window, shifted by rc->window / 2.
+            // Ignore first window segment, due to key frame.
+            // For spatial layers: only do this for top/highest SL.
+            if (frame_cnt > rc.window_size && slx == ss_number_layers - 1) {
+              sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+              rc.window_size = (rc.window_size <= 0) ? 1 : rc.window_size;
+              if (frame_cnt % rc.window_size == 0) {
+                rc.window_count += 1;
+                rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+                rc.variance_st_encoding_bitrate +=
+                    (sum_bitrate / rc.window_size) *
+                    (sum_bitrate / rc.window_size);
+                sum_bitrate = 0.0;
+              }
+            }
+            // Second shifted window.
+            if (frame_cnt > rc.window_size + rc.window_size / 2 &&
+                slx == ss_number_layers - 1) {
+              sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+              if (frame_cnt > 2 * rc.window_size &&
+                  frame_cnt % rc.window_size == 0) {
+                rc.window_count += 1;
+                rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+                rc.variance_st_encoding_bitrate +=
+                    (sum_bitrate2 / rc.window_size) *
+                    (sum_bitrate2 / rc.window_size);
+                sum_bitrate2 = 0.0;
+              }
+            }
+
+#if CONFIG_AV1_DECODER
+            if (app_input.decode) {
+              if (aom_codec_decode(
+                      &decoder,
+                      reinterpret_cast<const uint8_t *>(pkt->data.frame.buf),
+                      pkt->data.frame.sz, NULL))
+                die_codec(&decoder, "Failed to decode frame");
+            }
+#endif
+
+            break;
+          case AOM_CODEC_PSNR_PKT:
+            if (app_input.show_psnr) {
+              psnr_stream.psnr_sse_total[0] += pkt->data.psnr.sse[0];
+              psnr_stream.psnr_samples_total[0] += pkt->data.psnr.samples[0];
+              for (int plane = 0; plane < 4; plane++) {
+                psnr_stream.psnr_totals[0][plane] += pkt->data.psnr.psnr[plane];
+              }
+              psnr_stream.psnr_count[0]++;
+            }
+            break;
+          default: break;
+        }
+      }
+#if CONFIG_AV1_DECODER
+      if (got_data && app_input.decode) {
+        // Don't look for mismatch on top spatial and top temporal layers as
+        // they are non-reference frames.
+ if ((ss_number_layers > 1 || ts_number_layers > 1) && + !(layer_id.temporal_layer_id > 0 && + layer_id.temporal_layer_id == ts_number_layers - 1)) { + if (test_decode(&codec, &decoder, frame_cnt)) { +#if CONFIG_INTERNAL_STATS + fprintf(stats_file, "First mismatch occurred in frame %d\n", + frame_cnt); + fclose(stats_file); +#endif + fatal("Mismatch seen"); + } + } + } +#endif + } // loop over spatial layers + ++frame_cnt; + pts += frame_duration; + } + + close_input_file(&(app_input.input_ctx)); + printout_rate_control_summary(&rc, frame_cnt, ss_number_layers, + ts_number_layers); + + printf("\n"); + for (int slx = 0; slx < ss_number_layers; slx++) + for (int tlx = 0; tlx < ts_number_layers; tlx++) { + int lx = slx * ts_number_layers + tlx; + printf("Per layer encoding time/FPS stats for encoder: %d %d %d %f %f \n", + slx, tlx, frame_cnt_layer[lx], + (float)cx_time_layer[lx] / (double)(frame_cnt_layer[lx] * 1000), + 1000000 * (double)frame_cnt_layer[lx] / (double)cx_time_layer[lx]); + } + + printf("\n"); + printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f\n", + frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000), + 1000000 * (double)frame_cnt / (double)cx_time); + + if (app_input.show_psnr) { + show_psnr(&psnr_stream, 255.0); + } + + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy encoder"); + +#if CONFIG_AV1_DECODER + if (app_input.decode) { + if (aom_codec_destroy(&decoder)) + die_codec(&decoder, "Failed to destroy decoder"); + } +#endif + +#if CONFIG_INTERNAL_STATS + fprintf(stats_file, "No mismatch detected in recon buffers\n"); + fclose(stats_file); +#endif + + // Try to rewrite the output file headers with the actual frame count. + for (i = 0; i < ss_number_layers * ts_number_layers; ++i) + aom_video_writer_close(outfile[i]); + aom_video_writer_close(total_layer_file); + + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { + aom_img_free(&raw); + } + return EXIT_SUCCESS; +} diff --git a/third_party/aom/examples/twopass_encoder.c b/third_party/aom/examples/twopass_encoder.c new file mode 100644 index 0000000000..388f68bd4d --- /dev/null +++ b/third_party/aom/examples/twopass_encoder.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Two Pass Encoder +// ================ +// +// This is an example of a two pass encoder loop. It takes an input file in +// YV12 format, passes it through the encoder twice, and writes the compressed +// frames to disk in IVF format. It builds upon the simple_encoder example. +// +// Twopass Variables +// ----------------- +// Twopass mode needs to track the current pass number and the buffer of +// statistics packets. +// +// Updating The Configuration +// --------------------------------- +// In two pass mode, the configuration has to be updated on each pass. The +// statistics buffer is passed on the last pass. +// +// Encoding A Frame +// ---------------- +// Encoding a frame in two pass mode is identical to the simple encoder +// example. 
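+// The pass itself is selected by setting cfg.g_pass to AOM_RC_FIRST_PASS or
+// AOM_RC_LAST_PASS before initializing the encoder, as shown in main() below.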
+//
+// Processing Statistics Packets
+// -----------------------------
+// In the first pass, each packet of type `AOM_CODEC_STATS_PKT` contains
+// frame statistics, which are appended to the statistics buffer. In the
+// second pass, each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the
+// encoded data for a frame; we write an IVF frame header, followed by the
+// raw data.
+//
+//
+// Pass Progress Reporting
+// -----------------------------
+// It's sometimes helpful to see when each pass completes.
+//
+//
+// Clean-up
+// -----------------------------
+// Destruction of the encoder instance must be done on each pass. The
+// raw image should be destroyed at the end as usual.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile> <outfile> "
+          "[<frame limit>]\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                           aom_codec_pts_t pts, unsigned int duration,
+                           aom_enc_frame_flags_t flags,
+                           aom_fixed_buf_t *stats) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == AOM_CODEC_STATS_PKT) {
+      const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
+      const size_t pkt_size = pkt->data.twopass_stats.sz;
+      stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      if (!stats->buf) die("Failed to allocate frame stats buffer.");
+      memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
+      stats->sz += pkt_size;
+    }
+  }
+
+  return got_pkts;
+}
+
+static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                        aom_codec_pts_t pts, unsigned int duration,
+                        aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+
+      if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts))
+        die_codec(ctx, "Failed to write compressed frame.");
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+
+static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
+                             aom_codec_iface_t *encoder,
+                             const aom_codec_enc_cfg_t *cfg, int limit) {
+  aom_codec_ctx_t codec;
+  int frame_count = 0;
+  aom_fixed_buf_t stats = { NULL, 0 };
+
+  if (aom_codec_enc_init(&codec, encoder, cfg, 0))
+    die("Failed to initialize encoder");
+
+  // Calculate frame statistics.
+  while (aom_img_read(raw, infile) && frame_count < limit) {
+    ++frame_count;
+    get_frame_stats(&codec, raw, frame_count, 1, 0, &stats);
+  }
+
+  // Flush encoder.
+  while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) {
+  }
+
+  printf("Pass 0 complete.
Processed %d frames.\n", frame_count); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + return stats; +} + +static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, + aom_codec_iface_t *encoder, const aom_codec_enc_cfg_t *cfg, + int limit) { + AvxVideoInfo info = { get_fourcc_by_aom_encoder(encoder), + cfg->g_w, + cfg->g_h, + { cfg->g_timebase.num, cfg->g_timebase.den }, + 0 }; + AvxVideoWriter *writer = NULL; + aom_codec_ctx_t codec; + int frame_count = 0; + + writer = aom_video_writer_open(outfile_name, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing", outfile_name); + + if (aom_codec_enc_init(&codec, encoder, cfg, 0)) + die("Failed to initialize encoder"); + + if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2)) + die_codec(&codec, "Failed to set cpu-used"); + + // Encode frames. + while (aom_img_read(raw, infile) && frame_count < limit) { + ++frame_count; + encode_frame(&codec, raw, frame_count, 1, 0, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 1, 0, writer)) { + } + + printf("\n"); + + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + printf("Pass 1 complete. Processed %d frames.\n", frame_count); +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + int w, h; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + aom_image_t raw; + aom_codec_err_t res; + aom_fixed_buf_t stats; + + const int fps = 30; // TODO(dkovalev) add command line argument + const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument + const char *const codec_arg = argv[1]; + const char *const width_arg = argv[2]; + const char *const height_arg = argv[3]; + const char *const infile_arg = argv[4]; + const char *const outfile_arg = argv[5]; + int limit = 0; + exec_name = argv[0]; + + if (argc < 6) die("Invalid number of arguments"); + + if (argc > 6) limit = (int)strtol(argv[6], NULL, 0); + + if (limit == 0) limit = 100; + + aom_codec_iface_t *encoder = get_aom_encoder_by_short_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + w = (int)strtol(width_arg, NULL, 0); + h = (int)strtol(height_arg, NULL, 0); + + if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0) + die("Invalid frame size: %dx%d", w, h); + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 1)) + die("Failed to allocate image (%dx%d)", w, h); + + printf("Using %s\n", aom_codec_iface_name(encoder)); + + // Configuration + res = aom_codec_enc_config_default(encoder, &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = w; + cfg.g_h = h; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = fps; + cfg.rc_target_bitrate = bitrate; + + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading", infile_arg); + + // Pass 0 + cfg.g_pass = AOM_RC_FIRST_PASS; + stats = pass0(&raw, infile, encoder, &cfg, limit); + + // Pass 1 + rewind(infile); + cfg.g_pass = AOM_RC_LAST_PASS; + cfg.rc_twopass_stats_in = stats; + pass1(&raw, infile, outfile_arg, encoder, &cfg, limit); + free(stats.buf); + + aom_img_free(&raw); + fclose(infile); + + return EXIT_SUCCESS; +} diff --git a/third_party/aom/keywords.dox b/third_party/aom/keywords.dox new file mode 100644 index 0000000000..56f5368900 --- /dev/null +++ b/third_party/aom/keywords.dox @@ -0,0 +1,51 @@ +/*!\page rfc2119 RFC2119 Keywords + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL + NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and + 
"OPTIONAL" in this document are to be interpreted as described in + RFC 2119. + +Specifically, the following definitions are used: + +\section MUST +\anchor REQUIRED +\anchor SHALL + This word, or the terms "REQUIRED" or "SHALL", mean that the + definition is an absolute requirement of the specification. + +\section MUSTNOT MUST NOT +\anchor SHALLNOT + This phrase, or the phrase "SHALL NOT", mean that the + definition is an absolute prohibition of the specification. + +\section SHOULD +\anchor RECOMMENDED + This word, or the adjective "RECOMMENDED", mean that there + may exist valid reasons in particular circumstances to ignore a + particular item, but the full implications must be understood and + carefully weighed before choosing a different course. + +\section SHOULDNOT SHOULD NOT +\anchor NOTRECOMMENDED + This phrase, or the phrase "NOT RECOMMENDED" mean that + there may exist valid reasons in particular circumstances when the + particular behavior is acceptable or even useful, but the full + implications should be understood and the case carefully weighed + before implementing any behavior described with this label. + +\section MAY +\anchor OPTIONAL + This word, or the adjective "OPTIONAL", mean that an item is + truly optional. One vendor may choose to include the item because a + particular marketplace requires it or because the vendor feels that + it enhances the product while another vendor may omit the same item. + An implementation which does not include a particular option \ref MUST be + prepared to interoperate with another implementation which does + include the option, though perhaps with reduced functionality. In the + same vein an implementation which does include a particular option + \ref MUST be prepared to interoperate with another implementation which + does not include the option (except, of course, for the feature the + option provides.) + + +*/ diff --git a/third_party/aom/libs.doxy_template b/third_party/aom/libs.doxy_template new file mode 100644 index 0000000000..ba77751a50 --- /dev/null +++ b/third_party/aom/libs.doxy_template @@ -0,0 +1,2447 @@ +## Copyright (c) 2020, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## + +# Doxyfile 1.8.16 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the configuration +# file that follow. 
The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "AOMedia AV1 Codec" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docs + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. 
+ +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. 
+ +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. 
+ +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is +# Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. 
std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. 
+ +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# (including Cygwin) ands Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. 
+# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = NO + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. 
+ +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if <section_label> ... \endif and \cond <section_label> +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references.
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = YES + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). 
+ +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES.
+ +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = YES + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+ +CLANG_OPTIONS = + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = NO + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. 
See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. 
The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/xcode/), introduced with OSX +# 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. 
This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. 
Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = YES + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. 
+# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = YES + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /